From 535101ec884533a288f7ea35ad93cf28c89cb764 Mon Sep 17 00:00:00 2001
From: Arvind Yadav <arvind.yadav.cs@gmail.com>
Date: Sun, 30 Jul 2017 12:29:25 +0530
Subject: netfilter: ipvs: Fix space before '[' error.

Fix checkpatch.pl error:
ERROR: space prohibited before open square bracket '['.

Signed-off-by: Arvind Yadav <arvind.yadav.cs@gmail.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_proto_tcp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index bcd9b7bde4ee..569631d2b2a1 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -436,7 +436,7 @@ static bool tcp_state_active(int state)
 	return tcp_state_active_table[state];
 }
 
-static struct tcp_states_t tcp_states [] = {
+static struct tcp_states_t tcp_states[] = {
 /*	INPUT */
 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
@@ -459,7 +459,7 @@ static struct tcp_states_t tcp_states [] = {
 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
 };
 
-static struct tcp_states_t tcp_states_dos [] = {
+static struct tcp_states_t tcp_states_dos[] = {
 /*	INPUT */
 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
-- 
cgit v1.2.3-59-g8ed1b


From a2c09ac0fb6756d7085c359b6c020ef8b4205e0f Mon Sep 17 00:00:00 2001
From: Inju Song <inju.song@navercorp.com>
Date: Tue, 27 Mar 2018 23:14:40 +0900
Subject: netfilter: ipvs: Keep latest weight of destination

The hashing table in scheduler such as source hash or maglev hash
should ignore the changed weight to 0 and allow changing the weight
from/to non-0 values. So, struct ip_vs_dest needs to keep weight
with latest non-0 weight.

Signed-off-by: Inju Song <inju.song@navercorp.com>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h            | 1 +
 net/netfilter/ipvs/ip_vs_ctl.c | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index eb0bec043c96..0ac795b41ab8 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -668,6 +668,7 @@ struct ip_vs_dest {
 	volatile unsigned int	flags;		/* dest status flags */
 	atomic_t		conn_flags;	/* flags to copy to conn */
 	atomic_t		weight;		/* server weight */
+	atomic_t		last_weight;	/* server latest weight */
 
 	refcount_t		refcnt;		/* reference counter */
 	struct ip_vs_stats      stats;          /* statistics */
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 5ebde4b15810..b91bb70ece92 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -821,6 +821,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
 	if (add && udest->af != svc->af)
 		ipvs->mixed_address_family_dests++;
 
+	/* keep the last_weight with latest non-0 weight */
+	if (add || udest->weight != 0)
+		atomic_set(&dest->last_weight, udest->weight);
+
 	/* set the weight and the flags */
 	atomic_set(&dest->weight, udest->weight);
 	conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
-- 
cgit v1.2.3-59-g8ed1b


From 039f32e8cdea29b4d0680df7a83817b5ec4166e1 Mon Sep 17 00:00:00 2001
From: Inju Song <inju.song@navercorp.com>
Date: Tue, 27 Mar 2018 23:15:18 +0900
Subject: netfilter: ipvs: Add Maglev hashing scheduler
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implements the Google's Maglev hashing algorithm as a IPVS scheduler.

Basically it provides consistent hashing but offers some special
features about disruption and load balancing.

 1) minimal disruption: when the set of destinations changes,
    a connection will likely be sent to the same destination
    as it was before.

 2) load balancing: each destination will receive an almost
    equal number of connections.

Seel also for detail: [3.4 Consistent Hasing] in
https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf

Signed-off-by: Inju Song <inju.song@navercorp.com>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_mh.c | 540 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 540 insertions(+)
 create mode 100644 net/netfilter/ipvs/ip_vs_mh.c

diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c
new file mode 100644
index 000000000000..0f795b186eb3
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_mh.c
@@ -0,0 +1,540 @@
+// SPDX-License-Identifier: GPL-2.0
+/* IPVS:	Maglev Hashing scheduling module
+ *
+ * Authors:	Inju Song <inju.song@navercorp.com>
+ *
+ */
+
+/* The mh algorithm is to assign�a preference list of all the lookup
+ * table positions to each destination and populate the table with
+ * the most-preferred position of destinations. Then it is to select
+ * destination with the hash key of source IP address�through looking
+ * up a the lookup table.
+ *
+ * The algorithm is detailed in:
+ * [3.4 Consistent Hasing]
+https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include <net/ip_vs.h>
+
+#include <linux/siphash.h>
+#include <linux/bitops.h>
+#include <linux/gcd.h>
+
+#define IP_VS_SVC_F_SCHED_MH_FALLBACK	IP_VS_SVC_F_SCHED1 /* MH fallback */
+#define IP_VS_SVC_F_SCHED_MH_PORT	IP_VS_SVC_F_SCHED2 /* MH use port */
+
+struct ip_vs_mh_lookup {
+	struct ip_vs_dest __rcu	*dest;	/* real server (cache) */
+};
+
+struct ip_vs_mh_dest_setup {
+	unsigned int	offset; /* starting offset */
+	unsigned int	skip;	/* skip */
+	unsigned int	perm;	/* next_offset */
+	int		turns;	/* weight / gcd() and rshift */
+};
+
+/* Available prime numbers for MH table */
+static int primes[] = {251, 509, 1021, 2039, 4093,
+		       8191, 16381, 32749, 65521, 131071};
+
+/* For IPVS MH entry hash table */
+#ifndef CONFIG_IP_VS_MH_TAB_INDEX
+#define CONFIG_IP_VS_MH_TAB_INDEX	12
+#endif
+#define IP_VS_MH_TAB_BITS		(CONFIG_IP_VS_MH_TAB_INDEX / 2)
+#define IP_VS_MH_TAB_INDEX		(CONFIG_IP_VS_MH_TAB_INDEX - 8)
+#define IP_VS_MH_TAB_SIZE               primes[IP_VS_MH_TAB_INDEX]
+
+struct ip_vs_mh_state {
+	struct rcu_head			rcu_head;
+	struct ip_vs_mh_lookup		*lookup;
+	struct ip_vs_mh_dest_setup	*dest_setup;
+	hsiphash_key_t			hash1, hash2;
+	int				gcd;
+	int				rshift;
+};
+
+static inline void generate_hash_secret(hsiphash_key_t *hash1,
+					hsiphash_key_t *hash2)
+{
+	hash1->key[0] = 2654435761UL;
+	hash1->key[1] = 2654435761UL;
+
+	hash2->key[0] = 2654446892UL;
+	hash2->key[1] = 2654446892UL;
+}
+
+/* Helper function to determine if server is unavailable */
+static inline bool is_unavailable(struct ip_vs_dest *dest)
+{
+	return atomic_read(&dest->weight) <= 0 ||
+	       dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
+/* Returns hash value for IPVS MH entry */
+static inline unsigned int
+ip_vs_mh_hashkey(int af, const union nf_inet_addr *addr,
+		 __be16 port, hsiphash_key_t *key, unsigned int offset)
+{
+	unsigned int v;
+	__be32 addr_fold = addr->ip;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		addr_fold = addr->ip6[0] ^ addr->ip6[1] ^
+			    addr->ip6[2] ^ addr->ip6[3];
+#endif
+	v = (offset + ntohs(port) + ntohl(addr_fold));
+	return hsiphash(&v, sizeof(v), key);
+}
+
+/* Reset all the hash buckets of the specified table. */
+static void ip_vs_mh_reset(struct ip_vs_mh_state *s)
+{
+	int i;
+	struct ip_vs_mh_lookup *l;
+	struct ip_vs_dest *dest;
+
+	l = &s->lookup[0];
+	for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) {
+		dest = rcu_dereference_protected(l->dest, 1);
+		if (dest) {
+			ip_vs_dest_put(dest);
+			RCU_INIT_POINTER(l->dest, NULL);
+		}
+		l++;
+	}
+}
+
+static int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
+			      struct ip_vs_service *svc)
+{
+	struct list_head *p;
+	struct ip_vs_mh_dest_setup *ds;
+	struct ip_vs_dest *dest;
+	int lw;
+
+	/* If gcd is smaller then 1, number of dests or
+	 * all last_weight of dests are zero. So, skip
+	 * permutation for the dests.
+	 */
+	if (s->gcd < 1)
+		return 0;
+
+	/* Set dest_setup for the dests permutation */
+	p = &svc->destinations;
+	ds = &s->dest_setup[0];
+	while ((p = p->next) != &svc->destinations) {
+		dest = list_entry(p, struct ip_vs_dest, n_list);
+
+		ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr,
+					      dest->port, &s->hash1, 0) %
+					      IP_VS_MH_TAB_SIZE;
+		ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr,
+					    dest->port, &s->hash2, 0) %
+					    (IP_VS_MH_TAB_SIZE - 1) + 1;
+		ds->perm = ds->offset;
+
+		lw = atomic_read(&dest->last_weight);
+		ds->turns = ((lw / s->gcd) >> s->rshift) ? : (lw != 0);
+		ds++;
+	}
+
+	return 0;
+}
+
+static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
+			     struct ip_vs_service *svc)
+{
+	int n, c, dt_count;
+	unsigned long *table;
+	struct list_head *p;
+	struct ip_vs_mh_dest_setup *ds;
+	struct ip_vs_dest *dest, *new_dest;
+
+	/* If gcd is smaller then 1, number of dests or
+	 * all last_weight of dests are zero. So, skip
+	 * the population for the dests and reset lookup table.
+	 */
+	if (s->gcd < 1) {
+		ip_vs_mh_reset(s);
+		return 0;
+	}
+
+	table =  kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE),
+			 sizeof(unsigned long), GFP_KERNEL);
+	if (!table)
+		return -ENOMEM;
+
+	p = &svc->destinations;
+	n = 0;
+	dt_count = 0;
+	while (n < IP_VS_MH_TAB_SIZE) {
+		if (p == &svc->destinations)
+			p = p->next;
+
+		ds = &s->dest_setup[0];
+		while (p != &svc->destinations) {
+			/* Ignore added server with zero weight */
+			if (ds->turns < 1) {
+				p = p->next;
+				ds++;
+				continue;
+			}
+
+			c = ds->perm;
+			while (test_bit(c, table)) {
+				/* Add skip, mod IP_VS_MH_TAB_SIZE */
+				ds->perm += ds->skip;
+				if (ds->perm >= IP_VS_MH_TAB_SIZE)
+					ds->perm -= IP_VS_MH_TAB_SIZE;
+				c = ds->perm;
+			}
+
+			__set_bit(c, table);
+
+			dest = rcu_dereference_protected(s->lookup[c].dest, 1);
+			new_dest = list_entry(p, struct ip_vs_dest, n_list);
+			if (dest != new_dest) {
+				if (dest)
+					ip_vs_dest_put(dest);
+				ip_vs_dest_hold(new_dest);
+				RCU_INIT_POINTER(s->lookup[c].dest, new_dest);
+			}
+
+			if (++n == IP_VS_MH_TAB_SIZE)
+				goto out;
+
+			if (++dt_count >= ds->turns) {
+				dt_count = 0;
+				p = p->next;
+				ds++;
+			}
+		}
+	}
+
+out:
+	kfree(table);
+	return 0;
+}
+
+/* Get ip_vs_dest associated with supplied parameters. */
+static inline struct ip_vs_dest *
+ip_vs_mh_get(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
+	     const union nf_inet_addr *addr, __be16 port)
+{
+	unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0)
+					     % IP_VS_MH_TAB_SIZE;
+	struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest);
+
+	return (!dest || is_unavailable(dest)) ? NULL : dest;
+}
+
+/* As ip_vs_mh_get, but with fallback if selected server is unavailable */
+static inline struct ip_vs_dest *
+ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
+		      const union nf_inet_addr *addr, __be16 port)
+{
+	unsigned int offset, roffset;
+	unsigned int hash, ihash;
+	struct ip_vs_dest *dest;
+
+	/* First try the dest it's supposed to go to */
+	ihash = ip_vs_mh_hashkey(svc->af, addr, port,
+				 &s->hash1, 0) % IP_VS_MH_TAB_SIZE;
+	dest = rcu_dereference(s->lookup[ihash].dest);
+	if (!dest)
+		return NULL;
+	if (!is_unavailable(dest))
+		return dest;
+
+	IP_VS_DBG_BUF(6, "MH: selected unavailable server %s:%u, reselecting",
+		      IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));
+
+	/* If the original dest is unavailable, loop around the table
+	 * starting from ihash to find a new dest
+	 */
+	for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) {
+		roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE;
+		hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1,
+					roffset) % IP_VS_MH_TAB_SIZE;
+		dest = rcu_dereference(s->lookup[hash].dest);
+		if (!dest)
+			break;
+		if (!is_unavailable(dest))
+			return dest;
+		IP_VS_DBG_BUF(6,
+			      "MH: selected unavailable server %s:%u (offset %u), reselecting",
+			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
+			      ntohs(dest->port), roffset);
+	}
+
+	return NULL;
+}
+
+/* Assign all the hash buckets of the specified table with the service. */
+static int ip_vs_mh_reassign(struct ip_vs_mh_state *s,
+			     struct ip_vs_service *svc)
+{
+	int ret;
+
+	if (svc->num_dests > IP_VS_MH_TAB_SIZE)
+		return -EINVAL;
+
+	if (svc->num_dests >= 1) {
+		s->dest_setup = kcalloc(svc->num_dests,
+					sizeof(struct ip_vs_mh_dest_setup),
+					GFP_KERNEL);
+		if (!s->dest_setup)
+			return -ENOMEM;
+	}
+
+	ip_vs_mh_permutate(s, svc);
+
+	ret = ip_vs_mh_populate(s, svc);
+	if (ret < 0)
+		goto out;
+
+	IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%u\n",
+		      IP_VS_DBG_ADDR(svc->af, &svc->addr),
+		      ntohs(svc->port));
+
+out:
+	if (svc->num_dests >= 1) {
+		kfree(s->dest_setup);
+		s->dest_setup = NULL;
+	}
+	return ret;
+}
+
+static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc)
+{
+	struct ip_vs_dest *dest;
+	int weight;
+	int g = 0;
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		weight = atomic_read(&dest->last_weight);
+		if (weight > 0) {
+			if (g > 0)
+				g = gcd(weight, g);
+			else
+				g = weight;
+		}
+	}
+	return g;
+}
+
+/* To avoid assigning huge weight for the MH table,
+ * calculate shift value with gcd.
+ */
+static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd)
+{
+	struct ip_vs_dest *dest;
+	int new_weight, weight = 0;
+	int mw, shift;
+
+	/* If gcd is smaller then 1, number of dests or
+	 * all last_weight of dests are zero. So, return
+	 * shift value as zero.
+	 */
+	if (gcd < 1)
+		return 0;
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		new_weight = atomic_read(&dest->last_weight);
+		if (new_weight > weight)
+			weight = new_weight;
+	}
+
+	/* Because gcd is greater than zero,
+	 * the maximum weight and gcd are always greater than zero
+	 */
+	mw = weight / gcd;
+
+	/* shift = occupied bits of weight/gcd - MH highest bits */
+	shift = fls(mw) - IP_VS_MH_TAB_BITS;
+	return (shift >= 0) ? shift : 0;
+}
+
+static void ip_vs_mh_state_free(struct rcu_head *head)
+{
+	struct ip_vs_mh_state *s;
+
+	s = container_of(head, struct ip_vs_mh_state, rcu_head);
+	kfree(s->lookup);
+	kfree(s);
+}
+
+static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
+{
+	int ret;
+	struct ip_vs_mh_state *s;
+
+	/* Allocate the MH table for this service */
+	s = kzalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup),
+			    GFP_KERNEL);
+	if (!s->lookup) {
+		kfree(s);
+		return -ENOMEM;
+	}
+
+	generate_hash_secret(&s->hash1, &s->hash2);
+	s->gcd = ip_vs_mh_gcd_weight(svc);
+	s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
+
+	IP_VS_DBG(6,
+		  "MH lookup table (memory=%zdbytes) allocated for current service\n",
+		  sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
+
+	/* Assign the lookup table with current dests */
+	ret = ip_vs_mh_reassign(s, svc);
+	if (ret < 0) {
+		ip_vs_mh_reset(s);
+		ip_vs_mh_state_free(&s->rcu_head);
+		return ret;
+	}
+
+	/* No more failures, attach state */
+	svc->sched_data = s;
+	return 0;
+}
+
+static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_mh_state *s = svc->sched_data;
+
+	/* Got to clean up lookup entry here */
+	ip_vs_mh_reset(s);
+
+	call_rcu(&s->rcu_head, ip_vs_mh_state_free);
+	IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n",
+		  sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
+}
+
+static int ip_vs_mh_dest_changed(struct ip_vs_service *svc,
+				 struct ip_vs_dest *dest)
+{
+	struct ip_vs_mh_state *s = svc->sched_data;
+
+	s->gcd = ip_vs_mh_gcd_weight(svc);
+	s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
+
+	/* Assign the lookup table with the updated service */
+	return ip_vs_mh_reassign(s, svc);
+}
+
+/* Helper function to get port number */
+static inline __be16
+ip_vs_mh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
+{
+	__be16 _ports[2], *ports;
+
+	/* At this point we know that we have a valid packet of some kind.
+	 * Because ICMP packets are only guaranteed to have the first 8
+	 * bytes, let's just grab the ports.  Fortunately they're in the
+	 * same position for all three of the protocols we care about.
+	 */
+	switch (iph->protocol) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_SCTP:
+		ports = skb_header_pointer(skb, iph->len, sizeof(_ports),
+					   &_ports);
+		if (unlikely(!ports))
+			return 0;
+
+		if (likely(!ip_vs_iph_inverse(iph)))
+			return ports[0];
+		else
+			return ports[1];
+	default:
+		return 0;
+	}
+}
+
+/* Maglev Hashing scheduling */
+static struct ip_vs_dest *
+ip_vs_mh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+		  struct ip_vs_iphdr *iph)
+{
+	struct ip_vs_dest *dest;
+	struct ip_vs_mh_state *s;
+	__be16 port = 0;
+	const union nf_inet_addr *hash_addr;
+
+	hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr;
+
+	IP_VS_DBG(6, "%s : Scheduling...\n", __func__);
+
+	if (svc->flags & IP_VS_SVC_F_SCHED_MH_PORT)
+		port = ip_vs_mh_get_port(skb, iph);
+
+	s = (struct ip_vs_mh_state *)svc->sched_data;
+
+	if (svc->flags & IP_VS_SVC_F_SCHED_MH_FALLBACK)
+		dest = ip_vs_mh_get_fallback(svc, s, hash_addr, port);
+	else
+		dest = ip_vs_mh_get(svc, s, hash_addr, port);
+
+	if (!dest) {
+		ip_vs_scheduler_err(svc, "no destination available");
+		return NULL;
+	}
+
+	IP_VS_DBG_BUF(6, "MH: source IP address %s:%u --> server %s:%u\n",
+		      IP_VS_DBG_ADDR(svc->af, hash_addr),
+		      ntohs(port),
+		      IP_VS_DBG_ADDR(dest->af, &dest->addr),
+		      ntohs(dest->port));
+
+	return dest;
+}
+
+/* IPVS MH Scheduler structure */
+static struct ip_vs_scheduler ip_vs_mh_scheduler = {
+	.name =			"mh",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list	 =		LIST_HEAD_INIT(ip_vs_mh_scheduler.n_list),
+	.init_service =		ip_vs_mh_init_svc,
+	.done_service =		ip_vs_mh_done_svc,
+	.add_dest =		ip_vs_mh_dest_changed,
+	.del_dest =		ip_vs_mh_dest_changed,
+	.upd_dest =		ip_vs_mh_dest_changed,
+	.schedule =		ip_vs_mh_schedule,
+};
+
+static int __init ip_vs_mh_init(void)
+{
+	return register_ip_vs_scheduler(&ip_vs_mh_scheduler);
+}
+
+static void __exit ip_vs_mh_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_mh_scheduler);
+	rcu_barrier();
+}
+
+module_init(ip_vs_mh_init);
+module_exit(ip_vs_mh_cleanup);
+MODULE_DESCRIPTION("Maglev hashing ipvs scheduler");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Inju Song <inju.song@navercorp.com>");
-- 
cgit v1.2.3-59-g8ed1b


From 30edf801d7ed4b5a59601265792d3256836dc127 Mon Sep 17 00:00:00 2001
From: Inju Song <inju.song@navercorp.com>
Date: Tue, 27 Mar 2018 23:15:54 +0900
Subject: netfilter: ipvs: Add configurations of Maglev hashing

To build the maglev hashing scheduler, add some configuration
to Kconfig and Makefile.

 - The compile configurations of MH are added to the Kconfig.

 - The MH build rule is added to the Makefile.

Signed-off-by: Inju Song <inju.song@navercorp.com>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/Kconfig  | 37 +++++++++++++++++++++++++++++++++++++
 net/netfilter/ipvs/Makefile |  1 +
 2 files changed, 38 insertions(+)

diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index b32fb0dbe237..05dc1b77e466 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -225,6 +225,25 @@ config	IP_VS_SH
 	  If you want to compile it in kernel, say Y. To compile it as a
 	  module, choose M here. If unsure, say N.
 
+config	IP_VS_MH
+	tristate "maglev hashing scheduling"
+	---help---
+	  The maglev consistent hashing scheduling algorithm provides the
+	  Google's Maglev hashing algorithm as a IPVS scheduler. It assigns
+	  network connections to the servers through looking up a statically
+	  assigned special hash table called the lookup table. Maglev hashing
+	  is to assign a preference list of all the lookup table positions
+	  to each destination.
+
+	  Through this operation, The maglev hashing gives an almost equal
+	  share of the lookup table to each of the destinations and provides
+	  minimal disruption by using the lookup table. When the set of
+	  destinations changes, a connection will likely be sent to the same
+	  destination as it was before.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
 config	IP_VS_SED
 	tristate "shortest expected delay scheduling"
 	---help---
@@ -266,6 +285,24 @@ config IP_VS_SH_TAB_BITS
 	  needs to be large enough to effectively fit all the destinations
 	  multiplied by their respective weights.
 
+comment 'IPVS MH scheduler'
+
+config IP_VS_MH_TAB_INDEX
+	int "IPVS maglev hashing table index of size (the prime numbers)"
+	range 8 17
+	default 12
+	---help---
+	  The maglev hashing scheduler maps source IPs to destinations
+	  stored in a hash table. This table is assigned by a preference
+	  list of the positions to each destination until all slots in
+	  the table are filled. The index determines the prime for size of
+	  the table as�251, 509, 1021, 2039, 4093, 8191, 16381, 32749,
+	  65521 or 131071.�When using weights to allow destinations to
+	  receive more connections,�the table is assigned an amount
+	  proportional to the weights specified.�The table needs to be large
+	  enough to effectively fit all the destinations multiplied by their
+	  respective weights.
+
 comment 'IPVS application helper'
 
 config	IP_VS_FTP
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile
index c552993fa4b9..bfce2677fda2 100644
--- a/net/netfilter/ipvs/Makefile
+++ b/net/netfilter/ipvs/Makefile
@@ -33,6 +33,7 @@ obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
 obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
 obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
 obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
+obj-$(CONFIG_IP_VS_MH) += ip_vs_mh.o
 obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
 obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
 
-- 
cgit v1.2.3-59-g8ed1b


From 9a17740e0ea1c9b1edd89836bb27c76272f54641 Mon Sep 17 00:00:00 2001
From: Vincent Bernat <vincent@bernat.im>
Date: Sun, 1 Apr 2018 12:27:11 +0200
Subject: ipvs: fix multiplicative hashing in sh/dh/lblc/lblcr algorithms

The sh/dh/lblc/lblcr algorithms are using Knuth's multiplicative
hashing incorrectly. Replace its use by the hash_32() macro, which
correctly implements this algorithm. It doesn't use the same constant,
but it shouldn't matter.

Signed-off-by: Vincent Bernat <vincent@bernat.im>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_dh.c    | 3 ++-
 net/netfilter/ipvs/ip_vs_lblc.c  | 3 ++-
 net/netfilter/ipvs/ip_vs_lblcr.c | 3 ++-
 net/netfilter/ipvs/ip_vs_sh.c    | 3 ++-
 4 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
index 75f798f8e83b..07459e71d907 100644
--- a/net/netfilter/ipvs/ip_vs_dh.c
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -43,6 +43,7 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/skbuff.h>
+#include <linux/hash.h>
 
 #include <net/ip_vs.h>
 
@@ -81,7 +82,7 @@ static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *ad
 		addr_fold = addr->ip6[0]^addr->ip6[1]^
 			    addr->ip6[2]^addr->ip6[3];
 #endif
-	return (ntohl(addr_fold)*2654435761UL) & IP_VS_DH_TAB_MASK;
+	return hash_32(ntohl(addr_fold), IP_VS_DH_TAB_BITS);
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 3057e453bf31..08147fc6400c 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -48,6 +48,7 @@
 #include <linux/kernel.h>
 #include <linux/skbuff.h>
 #include <linux/jiffies.h>
+#include <linux/hash.h>
 
 /* for sysctl */
 #include <linux/fs.h>
@@ -160,7 +161,7 @@ ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr)
 		addr_fold = addr->ip6[0]^addr->ip6[1]^
 			    addr->ip6[2]^addr->ip6[3];
 #endif
-	return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
+	return hash_32(ntohl(addr_fold), IP_VS_LBLC_TAB_BITS);
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 92adc04557ed..9b6a6c9e9cfa 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -47,6 +47,7 @@
 #include <linux/jiffies.h>
 #include <linux/list.h>
 #include <linux/slab.h>
+#include <linux/hash.h>
 
 /* for sysctl */
 #include <linux/fs.h>
@@ -323,7 +324,7 @@ ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr)
 		addr_fold = addr->ip6[0]^addr->ip6[1]^
 			    addr->ip6[2]^addr->ip6[3];
 #endif
-	return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
+	return hash_32(ntohl(addr_fold), IP_VS_LBLCR_TAB_BITS);
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index 16aaac6eedc9..1e01c782583a 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -96,7 +96,8 @@ ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr,
 		addr_fold = addr->ip6[0]^addr->ip6[1]^
 			    addr->ip6[2]^addr->ip6[3];
 #endif
-	return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) &
+	return (offset + hash_32(ntohs(port) + ntohl(addr_fold),
+				 IP_VS_SH_TAB_BITS)) &
 		IP_VS_SH_TAB_MASK;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 6707ba0105a2d350710bc0a537a98f49eb4b895d Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 29 Mar 2018 00:06:10 +0200
Subject: ath10k: avoid possible string overflow

The way that 'strncat' is used here raised a warning in gcc-8:

drivers/net/wireless/ath/ath10k/wmi.c: In function 'ath10k_wmi_tpc_stats_final_disp_tables':
drivers/net/wireless/ath/ath10k/wmi.c:4649:4: error: 'strncat' output truncated before terminating nul copying as many bytes from a string as its length [-Werror=stringop-truncation]

Effectively, this is simply a strcat() but the use of strncat() suggests
some form of overflow check. Regardless of whether this might actually
overflow, using strlcat() instead of strncat() avoids the warning and
makes the code more robust.

Fixes: bc64d05220f3 ("ath10k: debugfs support to get final TPC stats for 10.4 variants")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/wmi.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c
index c5e1ca5945db..80a80830b363 100644
--- a/drivers/net/wireless/ath/ath10k/wmi.c
+++ b/drivers/net/wireless/ath/ath10k/wmi.c
@@ -4357,7 +4357,7 @@ static void ath10k_tpc_config_disp_tables(struct ath10k *ar,
 							    rate_code[i],
 							    type);
 			snprintf(buff, sizeof(buff), "%8d ", tpc[j]);
-			strncat(tpc_value, buff, strlen(buff));
+			strlcat(tpc_value, buff, sizeof(tpc_value));
 		}
 		tpc_stats->tpc_table[type].pream_idx[i] = pream_idx;
 		tpc_stats->tpc_table[type].rate_code[i] = rate_code[i];
@@ -4694,7 +4694,7 @@ ath10k_wmi_tpc_stats_final_disp_tables(struct ath10k *ar,
 							       rate_code[i],
 							       type, pream_idx);
 			snprintf(buff, sizeof(buff), "%8d ", tpc[j]);
-			strncat(tpc_value, buff, strlen(buff));
+			strlcat(tpc_value, buff, sizeof(tpc_value));
 		}
 		tpc_stats->tpc_table_final[type].pream_idx[i] = pream_idx;
 		tpc_stats->tpc_table_final[type].rate_code[i] = rate_code[i];
-- 
cgit v1.2.3-59-g8ed1b


From db5a4d5e1073be452645d7021eeaf807d70325a2 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Wed, 28 Mar 2018 18:40:27 +0100
Subject: wil6210: fix potential null dereference of ndev before null check

The pointer ndev is being dereferenced before it is being null checked,
hence there is a potential null pointer deference. Fix this by only
dereferencing ndev after it has been null checked

Detected by CoverityScan, CID#1467010 ("Dereference before null check")

Fixes: e00243fab84b ("wil6210: infrastructure for multiple virtual interfaces")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Reviewed-by: Maya Erez <merez@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wil6210/main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/wil6210/main.c b/drivers/net/wireless/ath/wil6210/main.c
index a4b413e8d55a..82aec6b06d09 100644
--- a/drivers/net/wireless/ath/wil6210/main.c
+++ b/drivers/net/wireless/ath/wil6210/main.c
@@ -391,7 +391,7 @@ static void wil_fw_error_worker(struct work_struct *work)
 	struct wil6210_priv *wil = container_of(work, struct wil6210_priv,
 						fw_error_worker);
 	struct net_device *ndev = wil->main_ndev;
-	struct wireless_dev *wdev = ndev->ieee80211_ptr;
+	struct wireless_dev *wdev;
 
 	wil_dbg_misc(wil, "fw error worker\n");
 
@@ -399,6 +399,7 @@ static void wil_fw_error_worker(struct work_struct *work)
 		wil_info(wil, "No recovery - interface is down\n");
 		return;
 	}
+	wdev = ndev->ieee80211_ptr;
 
 	/* increment @recovery_count if less then WIL6210_FW_RECOVERY_TO
 	 * passed since last recovery attempt
-- 
cgit v1.2.3-59-g8ed1b


From ec2c64e202576295bdef8edd18c67428a378cc16 Mon Sep 17 00:00:00 2001
From: Marcus Folkesson <marcus.folkesson@gmail.com>
Date: Tue, 27 Mar 2018 22:31:44 +0200
Subject: ath10k: sdio: fix memory leak for probe allocations

These allocations are not freed upon release.
When on it; go for managed resources instead.

Signed-off-by: Marcus Folkesson <marcus.folkesson@gmail.com>
[kvalo: fix two checkpatch warnings]
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/sdio.c | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/sdio.c b/drivers/net/wireless/ath/ath10k/sdio.c
index 03a69e5b1116..2d04c54a4153 100644
--- a/drivers/net/wireless/ath/ath10k/sdio.c
+++ b/drivers/net/wireless/ath/ath10k/sdio.c
@@ -1957,25 +1957,25 @@ static int ath10k_sdio_probe(struct sdio_func *func,
 	ar_sdio = ath10k_sdio_priv(ar);
 
 	ar_sdio->irq_data.irq_proc_reg =
-		kzalloc(sizeof(struct ath10k_sdio_irq_proc_regs),
-			GFP_KERNEL);
+		devm_kzalloc(ar->dev, sizeof(struct ath10k_sdio_irq_proc_regs),
+			     GFP_KERNEL);
 	if (!ar_sdio->irq_data.irq_proc_reg) {
 		ret = -ENOMEM;
 		goto err_core_destroy;
 	}
 
 	ar_sdio->irq_data.irq_en_reg =
-		kzalloc(sizeof(struct ath10k_sdio_irq_enable_regs),
-			GFP_KERNEL);
+		devm_kzalloc(ar->dev, sizeof(struct ath10k_sdio_irq_enable_regs),
+			     GFP_KERNEL);
 	if (!ar_sdio->irq_data.irq_en_reg) {
 		ret = -ENOMEM;
-		goto err_free_proc_reg;
+		goto err_core_destroy;
 	}
 
-	ar_sdio->bmi_buf = kzalloc(BMI_MAX_CMDBUF_SIZE, GFP_KERNEL);
+	ar_sdio->bmi_buf = devm_kzalloc(ar->dev, BMI_MAX_CMDBUF_SIZE, GFP_KERNEL);
 	if (!ar_sdio->bmi_buf) {
 		ret = -ENOMEM;
-		goto err_free_en_reg;
+		goto err_core_destroy;
 	}
 
 	ar_sdio->func = func;
@@ -1995,7 +1995,7 @@ static int ath10k_sdio_probe(struct sdio_func *func,
 	ar_sdio->workqueue = create_singlethread_workqueue("ath10k_sdio_wq");
 	if (!ar_sdio->workqueue) {
 		ret = -ENOMEM;
-		goto err_free_bmi_buf;
+		goto err_core_destroy;
 	}
 
 	for (i = 0; i < ATH10K_SDIO_BUS_REQUEST_MAX_NUM; i++)
@@ -2011,7 +2011,7 @@ static int ath10k_sdio_probe(struct sdio_func *func,
 		ret = -ENODEV;
 		ath10k_err(ar, "unsupported device id %u (0x%x)\n",
 			   dev_id_base, id->device);
-		goto err_free_bmi_buf;
+		goto err_core_destroy;
 	}
 
 	ar->id.vendor = id->vendor;
@@ -2040,12 +2040,6 @@ static int ath10k_sdio_probe(struct sdio_func *func,
 
 err_free_wq:
 	destroy_workqueue(ar_sdio->workqueue);
-err_free_bmi_buf:
-	kfree(ar_sdio->bmi_buf);
-err_free_en_reg:
-	kfree(ar_sdio->irq_data.irq_en_reg);
-err_free_proc_reg:
-	kfree(ar_sdio->irq_data.irq_proc_reg);
 err_core_destroy:
 	ath10k_core_destroy(ar);
 
-- 
cgit v1.2.3-59-g8ed1b


From 6ce36faeac353813b0bfd3b9bc867f3e0d6a35b2 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Thu, 29 Mar 2018 17:59:49 +0100
Subject: ath10k: fix spelling mistake: "tiggers" -> "triggers"

Trivial fix to spelling mistake in message text

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/wmi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c
index 80a80830b363..eb1c8a1a1c88 100644
--- a/drivers/net/wireless/ath/ath10k/wmi.c
+++ b/drivers/net/wireless/ath/ath10k/wmi.c
@@ -7768,7 +7768,7 @@ ath10k_wmi_fw_pdev_tx_stats_fill(const struct ath10k_fw_stats_pdev *pdev,
 	len += scnprintf(buf + len, buf_len - len, "%30s %10d\n",
 			 "HW rate", pdev->data_rc);
 	len += scnprintf(buf + len, buf_len - len, "%30s %10d\n",
-			 "Sched self tiggers", pdev->self_triggers);
+			 "Sched self triggers", pdev->self_triggers);
 	len += scnprintf(buf + len, buf_len - len, "%30s %10d\n",
 			 "Dropped due to SW retries",
 			 pdev->sw_retry_failure);
-- 
cgit v1.2.3-59-g8ed1b


From 9c27489a34548913baaaf3b2776e05d4a9389e3e Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Fri, 30 Mar 2018 13:34:01 -0500
Subject: ath9k: dfs: remove accidental use of stack VLA

In preparation to enabling -Wvla, remove accidental use of stack VLA.

This avoids an accidental stack VLA (since the compiler thinks
the value of FFT_NUM_SAMPLES can change, even when marked
"const"). This just replaces it with a #define.

Also, fixed as part of the directive to remove all VLAs from
the kernel: https://lkml.org/lkml/2018/3/7/621

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath9k/dfs.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wireless/ath/ath9k/dfs.c b/drivers/net/wireless/ath/ath9k/dfs.c
index 6fee9a464cce..c8844f55574c 100644
--- a/drivers/net/wireless/ath/ath9k/dfs.c
+++ b/drivers/net/wireless/ath/ath9k/dfs.c
@@ -41,7 +41,7 @@ static const int BIN_DELTA_MAX		= 10;
 
 /* we need at least 3 deltas / 4 samples for a reliable chirp detection */
 #define NUM_DIFFS 3
-static const int FFT_NUM_SAMPLES	= (NUM_DIFFS + 1);
+#define FFT_NUM_SAMPLES		(NUM_DIFFS + 1)
 
 /* Threshold for difference of delta peaks */
 static const int MAX_DIFF		= 2;
@@ -114,7 +114,7 @@ static bool ath9k_check_chirping(struct ath_softc *sc, u8 *data,
 
 		ath_dbg(common, DFS, "HT40: datalen=%d, num_fft_packets=%d\n",
 			datalen, num_fft_packets);
-		if (num_fft_packets < (FFT_NUM_SAMPLES)) {
+		if (num_fft_packets < FFT_NUM_SAMPLES) {
 			ath_dbg(common, DFS, "not enough packets for chirp\n");
 			return false;
 		}
@@ -136,7 +136,7 @@ static bool ath9k_check_chirping(struct ath_softc *sc, u8 *data,
 			return false;
 		ath_dbg(common, DFS, "HT20: datalen=%d, num_fft_packets=%d\n",
 			datalen, num_fft_packets);
-		if (num_fft_packets < (FFT_NUM_SAMPLES)) {
+		if (num_fft_packets < FFT_NUM_SAMPLES) {
 			ath_dbg(common, DFS, "not enough packets for chirp\n");
 			return false;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From 7cae35199bee1cd661926f2d68ac46d07682fc54 Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Tue, 3 Apr 2018 18:51:52 +0200
Subject: wcn36xx: check for DMA mapping errors in wcn36xx_dxe_tx_frame()

Bail out if the mapping fails. Even though this hasn't occured during
tests, this unlikely case should still be handled.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Acked-by: Ramon Fried <rfried@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wcn36xx/dxe.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/wireless/ath/wcn36xx/dxe.c b/drivers/net/wireless/ath/wcn36xx/dxe.c
index 2c3b899a88fa..405d350f8708 100644
--- a/drivers/net/wireless/ath/wcn36xx/dxe.c
+++ b/drivers/net/wireless/ath/wcn36xx/dxe.c
@@ -705,6 +705,11 @@ int wcn36xx_dxe_tx_frame(struct wcn36xx *wcn,
 					  ctl->skb->data,
 					  ctl->skb->len,
 					  DMA_TO_DEVICE);
+	if (dma_mapping_error(wcn->dev, desc->src_addr_l)) {
+		dev_err(wcn->dev, "unable to DMA map src_addr_l\n");
+		ret = -ENOMEM;
+		goto unlock;
+	}
 
 	desc->dst_addr_l = ch->dxe_wq;
 	desc->fr_len = ctl->skb->len;
-- 
cgit v1.2.3-59-g8ed1b


From 271f1e65ff38f18aa440fec17c759e70a6adfa4e Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Tue, 3 Apr 2018 18:51:53 +0200
Subject: wcn36xx: don't keep reference to skb if transmission failed

When wcn36xx_dxe_tx_frame() fails to transmit the TX frame, the driver
will call into ieee80211_free_txskb() for the skb in flight, so it'll no
longer be valid. Hence, we shouldn't keep a reference to it in ctl->skb.
Also, if the skb has IEEE80211_TX_CTL_REQ_TX_STATUS set, a pointer to
it will currently remain in wcn->tx_ack_skb, which will potentially lead
to a crash if accessed later.

Fix this by checking the return value of wcn36xx_dxe_tx_frame(), and
nullify wcn->tx_ack_skb again in case of errors. Move the assignment
of ctl->skb in wcn36xx_dxe_tx_frame() so it only happens when the
transmission is successful.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wcn36xx/dxe.c  |  6 +++---
 drivers/net/wireless/ath/wcn36xx/txrx.c | 15 ++++++++++++++-
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wireless/ath/wcn36xx/dxe.c b/drivers/net/wireless/ath/wcn36xx/dxe.c
index 405d350f8708..2cedb5a4f9e3 100644
--- a/drivers/net/wireless/ath/wcn36xx/dxe.c
+++ b/drivers/net/wireless/ath/wcn36xx/dxe.c
@@ -693,7 +693,6 @@ int wcn36xx_dxe_tx_frame(struct wcn36xx *wcn,
 
 	/* Set source address of the SKB we send */
 	ctl = ctl->next;
-	ctl->skb = skb;
 	desc = ctl->desc;
 	if (ctl->bd_cpu_addr) {
 		wcn36xx_err("bd_cpu_addr cannot be NULL for skb DXE\n");
@@ -702,8 +701,8 @@ int wcn36xx_dxe_tx_frame(struct wcn36xx *wcn,
 	}
 
 	desc->src_addr_l = dma_map_single(wcn->dev,
-					  ctl->skb->data,
-					  ctl->skb->len,
+					  skb->data,
+					  skb->len,
 					  DMA_TO_DEVICE);
 	if (dma_mapping_error(wcn->dev, desc->src_addr_l)) {
 		dev_err(wcn->dev, "unable to DMA map src_addr_l\n");
@@ -711,6 +710,7 @@ int wcn36xx_dxe_tx_frame(struct wcn36xx *wcn,
 		goto unlock;
 	}
 
+	ctl->skb = skb;
 	desc->dst_addr_l = ch->dxe_wq;
 	desc->fr_len = ctl->skb->len;
 
diff --git a/drivers/net/wireless/ath/wcn36xx/txrx.c b/drivers/net/wireless/ath/wcn36xx/txrx.c
index b1768ed6b0be..a6902371e89c 100644
--- a/drivers/net/wireless/ath/wcn36xx/txrx.c
+++ b/drivers/net/wireless/ath/wcn36xx/txrx.c
@@ -273,6 +273,7 @@ int wcn36xx_start_tx(struct wcn36xx *wcn,
 	bool bcast = is_broadcast_ether_addr(hdr->addr1) ||
 		is_multicast_ether_addr(hdr->addr1);
 	struct wcn36xx_tx_bd bd;
+	int ret;
 
 	memset(&bd, 0, sizeof(bd));
 
@@ -317,5 +318,17 @@ int wcn36xx_start_tx(struct wcn36xx *wcn,
 	buff_to_be((u32 *)&bd, sizeof(bd)/sizeof(u32));
 	bd.tx_bd_sign = 0xbdbdbdbd;
 
-	return wcn36xx_dxe_tx_frame(wcn, vif_priv, &bd, skb, is_low);
+	ret = wcn36xx_dxe_tx_frame(wcn, vif_priv, &bd, skb, is_low);
+	if (ret && bd.tx_comp) {
+		/* If the skb has not been transmitted,
+		 * don't keep a reference to it.
+		 */
+		spin_lock_irqsave(&wcn->dxe_lock, flags);
+		wcn->tx_ack_skb = NULL;
+		spin_unlock_irqrestore(&wcn->dxe_lock, flags);
+
+		ieee80211_wake_queues(wcn->hw);
+	}
+
+	return ret;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 2edfcf2b303cdd63fe0176c6771b47af2d655f8e Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Tue, 3 Apr 2018 18:51:54 +0200
Subject: wcn36xx: don't delete invalid bss indices

The firmware code cannot cope with requests to remove BSS indices that have
not previously been added. This primarily happens when the device is
suspended and then resumed. ieee80211_reconfig() then calls into
wcn36xx_bss_info_changed() with an empty bssid and BSS_CHANGED_BSSID set,
which subsequently leads to a firmware crash:

[   43.647928] qcom-wcnss-pil a204000.wcnss: fatal error received: halMsg.c:4964:halMsg_DelBss: Invalid BSSIndex 0
[   43.647959] remoteproc remoteproc0: crash detected in a204000.wcnss: type fatal error

To fix this, set bss_index to WCN36XX_HAL_BSS_INVALID_IDX for all bss
that have not been configured in the firmware, and don't call into the
firmware with invalid indices.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wcn36xx/main.c | 1 +
 drivers/net/wireless/ath/wcn36xx/smd.c  | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/drivers/net/wireless/ath/wcn36xx/main.c b/drivers/net/wireless/ath/wcn36xx/main.c
index 69d6be59d97f..32bbd6e2fd09 100644
--- a/drivers/net/wireless/ath/wcn36xx/main.c
+++ b/drivers/net/wireless/ath/wcn36xx/main.c
@@ -953,6 +953,7 @@ static int wcn36xx_add_interface(struct ieee80211_hw *hw,
 
 	mutex_lock(&wcn->conf_mutex);
 
+	vif_priv->bss_index = WCN36XX_HAL_BSS_INVALID_IDX;
 	list_add(&vif_priv->list, &wcn->vif_list);
 	wcn36xx_smd_add_sta_self(wcn, vif);
 
diff --git a/drivers/net/wireless/ath/wcn36xx/smd.c b/drivers/net/wireless/ath/wcn36xx/smd.c
index 8932af5e4d8d..5be07e40a86d 100644
--- a/drivers/net/wireless/ath/wcn36xx/smd.c
+++ b/drivers/net/wireless/ath/wcn36xx/smd.c
@@ -1446,6 +1446,10 @@ int wcn36xx_smd_delete_bss(struct wcn36xx *wcn, struct ieee80211_vif *vif)
 	int ret = 0;
 
 	mutex_lock(&wcn->hal_mutex);
+
+	if (vif_priv->bss_index == WCN36XX_HAL_BSS_INVALID_IDX)
+		goto out;
+
 	INIT_HAL_MSG(msg_body, WCN36XX_HAL_DELETE_BSS_REQ);
 
 	msg_body.bss_index = vif_priv->bss_index;
@@ -1464,6 +1468,8 @@ int wcn36xx_smd_delete_bss(struct wcn36xx *wcn, struct ieee80211_vif *vif)
 		wcn36xx_err("hal_delete_bss response failed err=%d\n", ret);
 		goto out;
 	}
+
+	vif_priv->bss_index = WCN36XX_HAL_BSS_INVALID_IDX;
 out:
 	mutex_unlock(&wcn->hal_mutex);
 	return ret;
-- 
cgit v1.2.3-59-g8ed1b


From 1391cce7daf65dce586aba78df13b0cc4e737416 Mon Sep 17 00:00:00 2001
From: Loic Poulain <loic.poulain@linaro.org>
Date: Thu, 5 Apr 2018 13:51:49 +0200
Subject: wcn36xx: Add missing fall through comment in smd.c

This prevents GCC warning.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Loic Poulain <loic.poulain@linaro.org>
Acked-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wcn36xx/smd.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wireless/ath/wcn36xx/smd.c b/drivers/net/wireless/ath/wcn36xx/smd.c
index 5be07e40a86d..9827a1e1124b 100644
--- a/drivers/net/wireless/ath/wcn36xx/smd.c
+++ b/drivers/net/wireless/ath/wcn36xx/smd.c
@@ -2143,6 +2143,7 @@ static int wcn36xx_smd_hw_scan_ind(struct wcn36xx *wcn, void *buf, size_t len)
 	switch (rsp->type) {
 	case WCN36XX_HAL_SCAN_IND_FAILED:
 		scan_info.aborted = true;
+		/* fall through */
 	case WCN36XX_HAL_SCAN_IND_COMPLETED:
 		mutex_lock(&wcn->scan_lock);
 		wcn->scan_req = NULL;
-- 
cgit v1.2.3-59-g8ed1b


From 6062546d9b7fecd93b9ad5ed6a34d4741cf3c51a Mon Sep 17 00:00:00 2001
From: Loic Poulain <loic.poulain@linaro.org>
Date: Fri, 6 Apr 2018 12:11:28 +0200
Subject: wcn36xx: Remove useless skb spinlock

Each DXE control block is associated to a specific channel.
The channel lock is always taken before accessing a control block.
There is no need to have an extra (useless) spinlock for the control
block skb.

Signed-off-by: Loic Poulain <loic.poulain@linaro.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wcn36xx/dxe.c | 8 +-------
 drivers/net/wireless/ath/wcn36xx/dxe.h | 1 -
 2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/drivers/net/wireless/ath/wcn36xx/dxe.c b/drivers/net/wireless/ath/wcn36xx/dxe.c
index 2cedb5a4f9e3..a1541a8f5212 100644
--- a/drivers/net/wireless/ath/wcn36xx/dxe.c
+++ b/drivers/net/wireless/ath/wcn36xx/dxe.c
@@ -78,7 +78,6 @@ static int wcn36xx_dxe_allocate_ctl_block(struct wcn36xx_dxe_ch *ch)
 		if (!cur_ctl)
 			goto out_fail;
 
-		spin_lock_init(&cur_ctl->skb_lock);
 		cur_ctl->ctl_blk_order = i;
 		if (i == 0) {
 			ch->head_blk_ctl = cur_ctl;
@@ -377,12 +376,11 @@ static void reap_tx_dxes(struct wcn36xx *wcn, struct wcn36xx_dxe_ch *ch)
 				/* Keep frame until TX status comes */
 				ieee80211_free_txskb(wcn->hw, ctl->skb);
 			}
-			spin_lock(&ctl->skb_lock);
+
 			if (wcn->queues_stopped) {
 				wcn->queues_stopped = false;
 				ieee80211_wake_queues(wcn->hw);
 			}
-			spin_unlock(&ctl->skb_lock);
 
 			ctl->skb = NULL;
 		}
@@ -654,8 +652,6 @@ int wcn36xx_dxe_tx_frame(struct wcn36xx *wcn,
 	spin_lock_irqsave(&ch->lock, flags);
 	ctl = ch->head_blk_ctl;
 
-	spin_lock(&ctl->next->skb_lock);
-
 	/*
 	 * If skb is not null that means that we reached the tail of the ring
 	 * hence ring is full. Stop queues to let mac80211 back off until ring
@@ -664,11 +660,9 @@ int wcn36xx_dxe_tx_frame(struct wcn36xx *wcn,
 	if (NULL != ctl->next->skb) {
 		ieee80211_stop_queues(wcn->hw);
 		wcn->queues_stopped = true;
-		spin_unlock(&ctl->next->skb_lock);
 		spin_unlock_irqrestore(&ch->lock, flags);
 		return -EBUSY;
 	}
-	spin_unlock(&ctl->next->skb_lock);
 
 	ctl->skb = NULL;
 	desc = ctl->desc;
diff --git a/drivers/net/wireless/ath/wcn36xx/dxe.h b/drivers/net/wireless/ath/wcn36xx/dxe.h
index ce580960d109..31b81b7547a3 100644
--- a/drivers/net/wireless/ath/wcn36xx/dxe.h
+++ b/drivers/net/wireless/ath/wcn36xx/dxe.h
@@ -422,7 +422,6 @@ struct wcn36xx_dxe_ctl {
 	unsigned int		desc_phy_addr;
 	int			ctl_blk_order;
 	struct sk_buff		*skb;
-	spinlock_t              skb_lock;
 	void			*bd_cpu_addr;
 	dma_addr_t		bd_phy_addr;
 };
-- 
cgit v1.2.3-59-g8ed1b


From 5151a673da43fbaeb62fcbd20814a3cdd4dd3afd Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Tue, 10 Apr 2018 17:52:45 +0300
Subject: wcn36xx: allocate skbs with GFP_KERNEL during init

GFP_ATOMIC should only be used when the allocation is done from atomic
context. Introduce a new flag to wcn36xx_dxe_fill_skb() and use GFP_KERNEL
when pre-allocating buffers during init.

This doesn't fix an issue that was observed in the wild, but it reduces
the chance of failed allocations under memory pressure.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wcn36xx/dxe.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wireless/ath/wcn36xx/dxe.c b/drivers/net/wireless/ath/wcn36xx/dxe.c
index a1541a8f5212..a6702ada13d3 100644
--- a/drivers/net/wireless/ath/wcn36xx/dxe.c
+++ b/drivers/net/wireless/ath/wcn36xx/dxe.c
@@ -274,12 +274,14 @@ static int wcn36xx_dxe_enable_ch_int(struct wcn36xx *wcn, u16 wcn_ch)
 	return 0;
 }
 
-static int wcn36xx_dxe_fill_skb(struct device *dev, struct wcn36xx_dxe_ctl *ctl)
+static int wcn36xx_dxe_fill_skb(struct device *dev,
+				struct wcn36xx_dxe_ctl *ctl,
+				gfp_t gfp)
 {
 	struct wcn36xx_dxe_desc *dxe = ctl->desc;
 	struct sk_buff *skb;
 
-	skb = alloc_skb(WCN36XX_PKT_SIZE, GFP_ATOMIC);
+	skb = alloc_skb(WCN36XX_PKT_SIZE, gfp);
 	if (skb == NULL)
 		return -ENOMEM;
 
@@ -306,7 +308,7 @@ static int wcn36xx_dxe_ch_alloc_skb(struct wcn36xx *wcn,
 	cur_ctl = wcn_ch->head_blk_ctl;
 
 	for (i = 0; i < wcn_ch->desc_num; i++) {
-		wcn36xx_dxe_fill_skb(wcn->dev, cur_ctl);
+		wcn36xx_dxe_fill_skb(wcn->dev, cur_ctl, GFP_KERNEL);
 		cur_ctl = cur_ctl->next;
 	}
 
@@ -531,7 +533,7 @@ static int wcn36xx_rx_handle_packets(struct wcn36xx *wcn,
 	while (!(dxe->ctrl & WCN36xx_DXE_CTRL_VLD)) {
 		skb = ctl->skb;
 		dma_addr = dxe->dst_addr_l;
-		ret = wcn36xx_dxe_fill_skb(wcn->dev, ctl);
+		ret = wcn36xx_dxe_fill_skb(wcn->dev, ctl, GFP_ATOMIC);
 		if (0 == ret) {
 			/* new skb allocation ok. Use the new one and queue
 			 * the old one to network system.
-- 
cgit v1.2.3-59-g8ed1b


From eda7d46da443e154cbc8f7c1d68acc5088d38181 Mon Sep 17 00:00:00 2001
From: Ryder Lee <ryder.lee@mediatek.com>
Date: Mon, 16 Apr 2018 10:33:41 +0800
Subject: net: mediatek: use of_device_get_match_data()

The usage of of_device_get_match_data() reduce the code size a bit.

Also, the only way to call mtk_probe() is to match an entry in
of_mtk_match[], so match cannot be NULL.

Signed-off-by: Ryder Lee <ryder.lee@mediatek.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mediatek/mtk_eth_soc.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index e0b72bf428f7..d8ebf0a05e0c 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -2503,7 +2503,6 @@ static int mtk_probe(struct platform_device *pdev)
 {
 	struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	struct device_node *mac_np;
-	const struct of_device_id *match;
 	struct mtk_eth *eth;
 	int err;
 	int i;
@@ -2512,8 +2511,7 @@ static int mtk_probe(struct platform_device *pdev)
 	if (!eth)
 		return -ENOMEM;
 
-	match = of_match_device(of_mtk_match, &pdev->dev);
-	eth->soc = (struct mtk_soc_data *)match->data;
+	eth->soc = of_device_get_match_data(&pdev->dev);
 
 	eth->dev = &pdev->dev;
 	eth->base = devm_ioremap_resource(&pdev->dev, res);
-- 
cgit v1.2.3-59-g8ed1b


From c009f413b79de526a355b6eefa4f900b6c45d5f4 Mon Sep 17 00:00:00 2001
From: Jassi Brar <jaswinder.singh@linaro.org>
Date: Mon, 16 Apr 2018 12:52:16 +0530
Subject: net: netsec: enable tx-irq during open callback

Enable TX-irq as well during ndo_open() as we can not count upon
RX to arrive early enough to trigger the napi. This patch is critical
for installation over network.

Fixes: 533dd11a12f6 ("net: socionext: Add Synquacer NetSec driver")
Signed-off-by: Jassi Brar <jaswinder.singh@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/socionext/netsec.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c
index f4c0b02ddad8..f6fe70edbbfe 100644
--- a/drivers/net/ethernet/socionext/netsec.c
+++ b/drivers/net/ethernet/socionext/netsec.c
@@ -1313,8 +1313,8 @@ static int netsec_netdev_open(struct net_device *ndev)
 	napi_enable(&priv->napi);
 	netif_start_queue(ndev);
 
-	/* Enable RX intr. */
-	netsec_write(priv, NETSEC_REG_INTEN_SET, NETSEC_IRQ_RX);
+	/* Enable TX+RX intr. */
+	netsec_write(priv, NETSEC_REG_INTEN_SET, NETSEC_IRQ_RX | NETSEC_IRQ_TX);
 
 	return 0;
 err3:
-- 
cgit v1.2.3-59-g8ed1b


From 9a00b697ce31e38c670a3042cf9f1e9cf28dabb5 Mon Sep 17 00:00:00 2001
From: Masahisa KOJIMA <masahisa.kojima@linaro.org>
Date: Mon, 16 Apr 2018 13:09:59 +0530
Subject: net: socionext: reset hardware in ndo_stop

When the interface is down, head/tail of the descriptor
ring address is set to 0 in netsec_netdev_stop().
But netsec hardware still keeps the previous descriptor
ring address, so there is inconsistency between driver
and hardware after interface is up at a later time.
To address this inconsistency, add netsec_reset_hardware()
when the interface is down.

In addition, to minimize the reset process,
add flag to decide whether driver loads the netsec microcode.
Even if driver resets the netsec hardware, netsec microcode
keeps resident on RAM, so it is ok we only load the microcode
at initialization.

This patch is critical for installation over network.

Signed-off-by: Masahisa KOJIMA <masahisa.kojima@linaro.org>
Fixes: 533dd11a12f6 ("net: socionext: Add Synquacer NetSec driver")
Signed-off-by: Jassi Brar <jaswinder.singh@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/socionext/netsec.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c
index f6fe70edbbfe..aa50331b7607 100644
--- a/drivers/net/ethernet/socionext/netsec.c
+++ b/drivers/net/ethernet/socionext/netsec.c
@@ -1057,7 +1057,8 @@ static int netsec_netdev_load_microcode(struct netsec_priv *priv)
 	return 0;
 }
 
-static int netsec_reset_hardware(struct netsec_priv *priv)
+static int netsec_reset_hardware(struct netsec_priv *priv,
+				 bool load_ucode)
 {
 	u32 value;
 	int err;
@@ -1102,11 +1103,14 @@ static int netsec_reset_hardware(struct netsec_priv *priv)
 	netsec_write(priv, NETSEC_REG_NRM_RX_CONFIG,
 		     1 << NETSEC_REG_DESC_ENDIAN);
 
-	err = netsec_netdev_load_microcode(priv);
-	if (err) {
-		netif_err(priv, probe, priv->ndev,
-			  "%s: failed to load microcode (%d)\n", __func__, err);
-		return err;
+	if (load_ucode) {
+		err = netsec_netdev_load_microcode(priv);
+		if (err) {
+			netif_err(priv, probe, priv->ndev,
+				  "%s: failed to load microcode (%d)\n",
+				  __func__, err);
+			return err;
+		}
 	}
 
 	/* start DMA engines */
@@ -1328,6 +1332,7 @@ err1:
 
 static int netsec_netdev_stop(struct net_device *ndev)
 {
+	int ret;
 	struct netsec_priv *priv = netdev_priv(ndev);
 
 	netif_stop_queue(priv->ndev);
@@ -1343,12 +1348,14 @@ static int netsec_netdev_stop(struct net_device *ndev)
 	netsec_uninit_pkt_dring(priv, NETSEC_RING_TX);
 	netsec_uninit_pkt_dring(priv, NETSEC_RING_RX);
 
+	ret = netsec_reset_hardware(priv, false);
+
 	phy_stop(ndev->phydev);
 	phy_disconnect(ndev->phydev);
 
 	pm_runtime_put_sync(priv->dev);
 
-	return 0;
+	return ret;
 }
 
 static int netsec_netdev_init(struct net_device *ndev)
@@ -1364,7 +1371,7 @@ static int netsec_netdev_init(struct net_device *ndev)
 	if (ret)
 		goto err1;
 
-	ret = netsec_reset_hardware(priv);
+	ret = netsec_reset_hardware(priv, true);
 	if (ret)
 		goto err2;
 
-- 
cgit v1.2.3-59-g8ed1b


From 9f463608b98331f58e37d175830b53cec01859a4 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Mon, 9 Apr 2018 13:43:36 +0100
Subject: net/mlx5: remove some extraneous spaces in indentations

There are several lines where there is an extraneous space causing
indentation misalignment. Remove them.

Cleans up Cocconelle warnings:

./drivers/net/ethernet/mellanox/mlx5/core/qp.c:409:3-18: code aligned
with following code on line 410
./drivers/net/ethernet/mellanox/mlx5/core/qp.c:415:3-18: code aligned
with following code on line 416
./drivers/net/ethernet/mellanox/mlx5/core/qp.c:421:3-18: code aligned
with following code on line 422

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/qp.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
index 02d6c5b5d502..4ca07bfb6b14 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
@@ -407,21 +407,21 @@ static int modify_qp_mbox_alloc(struct mlx5_core_dev *dev, u16 opcode, int qpn,
 	case MLX5_CMD_OP_RST2INIT_QP:
 		if (MBOX_ALLOC(mbox, rst2init_qp))
 			return -ENOMEM;
-		 MOD_QP_IN_SET_QPC(rst2init_qp, mbox->in, opcode, qpn,
-				   opt_param_mask, qpc);
-		 break;
+		MOD_QP_IN_SET_QPC(rst2init_qp, mbox->in, opcode, qpn,
+				  opt_param_mask, qpc);
+		break;
 	case MLX5_CMD_OP_INIT2RTR_QP:
 		if (MBOX_ALLOC(mbox, init2rtr_qp))
 			return -ENOMEM;
-		 MOD_QP_IN_SET_QPC(init2rtr_qp, mbox->in, opcode, qpn,
-				   opt_param_mask, qpc);
-		 break;
+		MOD_QP_IN_SET_QPC(init2rtr_qp, mbox->in, opcode, qpn,
+				  opt_param_mask, qpc);
+		break;
 	case MLX5_CMD_OP_RTR2RTS_QP:
 		if (MBOX_ALLOC(mbox, rtr2rts_qp))
 			return -ENOMEM;
-		 MOD_QP_IN_SET_QPC(rtr2rts_qp, mbox->in, opcode, qpn,
-				   opt_param_mask, qpc);
-		 break;
+		MOD_QP_IN_SET_QPC(rtr2rts_qp, mbox->in, opcode, qpn,
+				  opt_param_mask, qpc);
+		break;
 	case MLX5_CMD_OP_RTS2RTS_QP:
 		if (MBOX_ALLOC(mbox, rts2rts_qp))
 			return -ENOMEM;
-- 
cgit v1.2.3-59-g8ed1b


From f85f94b87164e77e476218b83a5f67875d6b161c Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Mon, 16 Apr 2018 17:52:59 +0200
Subject: ipv6: remove unnecessary check in addrconf_prefix_rcv_add_addr()

Remove unnecessary check on update_lft variable in
addrconf_prefix_rcv_add_addr routine since it is always set to 0.
Moreover remove update_lft re-initialization to 0

Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 78cef00c9596..62b97722722c 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2529,7 +2529,6 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
 		if (IS_ERR_OR_NULL(ifp))
 			return -1;
 
-		update_lft = 0;
 		create = 1;
 		spin_lock_bh(&ifp->lock);
 		ifp->flags |= IFA_F_MANAGETEMPADDR;
@@ -2551,7 +2550,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
 			stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ;
 		else
 			stored_lft = 0;
-		if (!update_lft && !create && stored_lft) {
+		if (!create && stored_lft) {
 			const u32 minimum_lft = min_t(u32,
 				stored_lft, MIN_VALID_LIFETIME);
 			valid_lft = max(valid_lft, minimum_lft);
-- 
cgit v1.2.3-59-g8ed1b


From 10b19aeac1700c3ba94fb50583a766d9cdaf1e9e Mon Sep 17 00:00:00 2001
From: Roman Mashak <mrv@mojatatu.com>
Date: Mon, 16 Apr 2018 12:06:04 -0400
Subject: tc-testing: add sample action tests

Signed-off-by: Roman Mashak <mrv@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../tc-testing/tc-tests/actions/sample.json        | 588 +++++++++++++++++++++
 1 file changed, 588 insertions(+)
 create mode 100644 tools/testing/selftests/tc-testing/tc-tests/actions/sample.json

diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/sample.json b/tools/testing/selftests/tc-testing/tc-tests/actions/sample.json
new file mode 100644
index 000000000000..3aca33c00039
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/sample.json
@@ -0,0 +1,588 @@
+[
+    {
+        "id": "9784",
+        "name": "Add valid sample action with mandatory arguments",
+        "category": [
+            "actions",
+            "sample"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action sample",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action sample rate 10 group 1 index 2",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action sample index 2",
+        "matchPattern": "action order [0-9]+: sample rate 1/10 group 1.*index 2 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action sample"
+        ]
+    },
+    {
+        "id": "5c91",
+        "name": "Add valid sample action with mandatory arguments and continue control action",
+        "category": [
+            "actions",
+            "sample"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action sample",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action sample rate 700 group 2 continue index 2",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action sample index 2",
+        "matchPattern": "action order [0-9]+: sample rate 1/700 group 2 continue.*index 2 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action sample"
+        ]
+    },
+    {
+        "id": "334b",
+        "name": "Add valid sample action with mandatory arguments and drop control action",
+        "category": [
+            "actions",
+            "sample"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action sample",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action sample rate 10000 group 11 drop index 22",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions list action sample",
+        "matchPattern": "action order [0-9]+: sample rate 1/10000 group 11 drop.*index 22 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action sample"
+        ]
+    },
+    {
+        "id": "da69",
+        "name": "Add valid sample action with mandatory arguments and reclassify control action",
+        "category": [
+            "actions",
+            "sample"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action sample",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action sample rate 20000 group 72 reclassify index 100",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions list action sample",
+        "matchPattern": "action order [0-9]+: sample rate 1/20000 group 72 reclassify.*index 100 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action sample"
+        ]
+    },
+    {
+        "id": "13ce",
+        "name": "Add valid sample action with mandatory arguments and pipe control action",
+        "category": [
+            "actions",
+            "sample"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action sample",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action sample rate 20 group 2 pipe index 100",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions list action sample",
+        "matchPattern": "action order [0-9]+: sample rate 1/20 group 2 pipe.*index 100 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action sample"
+        ]
+    },
+    {
+        "id": "1886",
+        "name": "Add valid sample action with mandatory arguments and jump control action",
+        "category": [
+            "actions",
+            "sample"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action sample",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action sample rate 700 group 25 jump 4 index 200",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action sample index 200",
+        "matchPattern": "action order [0-9]+: sample rate 1/700 group 25 jump 4.*index 200 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action sample"
+        ]
+    },
+    {
+        "id": "b6d4",
+        "name": "Add sample action with mandatory arguments and invalid control action",
+        "category": [
+            "actions",
+            "sample"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action sample",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action sample rate 200000 group 52 foo index 1",
+        "expExitCode": "255",
+        "verifyCmd": "$TC actions list action sample",
+        "matchPattern": "action order [0-9]+: sample rate 1/200000 group 52 foo.*index 1 ref",
+        "matchCount": "0",
+        "teardown": []
+    },
+    {
+        "id": "a874",
+        "name": "Add invalid sample action without mandatory arguments",
+        "category": [
+            "actions",
+            "sample"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action sample",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action sample index 1",
+        "expExitCode": "255",
+        "verifyCmd": "$TC actions list action sample",
+        "matchPattern": "action order [0-9]+: sample.*index 1 ref",
+        "matchCount": "0",
+        "teardown": []
+    },
+    {
+        "id": "ac01",
+        "name": "Add invalid sample action without mandatory argument rate",
+        "category": [
+            "actions",
+            "sample"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action sample",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action sample group 10 index 1",
+        "expExitCode": "255",
+        "verifyCmd": "$TC actions list action sample",
+        "matchPattern": "action order [0-9]+: sample.*group 10.*index 1 ref",
+        "matchCount": "0",
+        "teardown": []
+    },
+    {
+        "id": "4203",
+        "name": "Add invalid sample action without mandatory argument group",
+        "category": [
+            "actions",
+            "sample"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action sample",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action sample rate 100 index 10",
+        "expExitCode": "255",
+        "verifyCmd": "$TC actions get action sample index 10",
+        "matchPattern": "action order [0-9]+: sample rate 1/100.*index 10 ref",
+        "matchCount": "0",
+        "teardown": []
+    },
+    {
+        "id": "14a7",
+        "name": "Add invalid sample action without mandatory argument group",
+        "category": [
+            "actions",
+            "sample"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action sample",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action sample rate 100 index 10",
+        "expExitCode": "255",
+        "verifyCmd": "$TC actions get action sample index 10",
+        "matchPattern": "action order [0-9]+: sample rate 1/100.*index 10 ref",
+        "matchCount": "0",
+        "teardown": []
+    },
+    {
+        "id": "8f2e",
+        "name": "Add valid sample action with trunc argument",
+        "category": [
+            "actions",
+            "sample"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action sample",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action sample rate 1024 group 4 trunc 1024 index 10",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action sample index 10",
+        "matchPattern": "action order [0-9]+: sample rate 1/1024 group 4 trunc_size 1024 pipe.*index 10 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action sample"
+        ]
+    },
+    {
+        "id": "45f8",
+        "name": "Add sample action with maximum rate argument",
+        "category": [
+            "actions",
+            "sample"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action sample",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action sample rate 4294967295 group 4 index 10",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action sample index 10",
+        "matchPattern": "action order [0-9]+: sample rate 1/4294967295 group 4 pipe.*index 10 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action sample"
+        ]
+    },
+    {
+        "id": "ad0c",
+        "name": "Add sample action with maximum trunc argument",
+        "category": [
+            "actions",
+            "sample"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action sample",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action sample rate 16000 group 4 trunc 4294967295 index 10",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action sample index 10",
+        "matchPattern": "action order [0-9]+: sample rate 1/16000 group 4 trunc_size 4294967295 pipe.*index 10 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action sample"
+        ]
+    },
+    {
+        "id": "83a9",
+        "name": "Add sample action with maximum group argument",
+        "category": [
+            "actions",
+            "sample"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action sample",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action sample rate 4294 group 4294967295 index 1",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action sample index 1",
+        "matchPattern": "action order [0-9]+: sample rate 1/4294 group 4294967295 pipe.*index 1 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action sample"
+        ]
+    },
+    {
+        "id": "ed27",
+        "name": "Add sample action with invalid rate argument",
+        "category": [
+            "actions",
+            "sample"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action sample",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action sample rate 4294967296 group 4 index 10",
+        "expExitCode": "255",
+        "verifyCmd": "$TC actions get action sample index 10",
+        "matchPattern": "action order [0-9]+: sample rate 1/4294967296 group 4 pipe.*index 10 ref",
+        "matchCount": "0",
+        "teardown": []
+    },
+    {
+        "id": "2eae",
+        "name": "Add sample action with invalid group argument",
+        "category": [
+            "actions",
+            "sample"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action sample",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action sample rate 4098 group 5294967299 continue index 1",
+        "expExitCode": "255",
+        "verifyCmd": "$TC actions get action sample index 1",
+        "matchPattern": "action order [0-9]+: sample rate 1/4098 group 5294967299 continue.*index 1 ref",
+        "matchCount": "0",
+        "teardown": []
+    },
+    {
+        "id": "6ff3",
+        "name": "Add sample action with invalid trunc size",
+        "category": [
+            "actions",
+            "sample"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action sample",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action sample rate 1024 group 4 trunc 112233445566 index 11",
+        "expExitCode": "255",
+        "verifyCmd": "$TC actions get action sample index 11",
+        "matchPattern": "action order [0-9]+: sample rate 1/1024 group 4 trunc_size 112233445566.*index 11 ref",
+        "matchCount": "0",
+        "teardown": []
+    },
+    {
+        "id": "2b2a",
+        "name": "Add sample action with invalid index",
+        "category": [
+            "actions",
+            "sample"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action sample",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action sample rate 1024 group 4 index 5294967299",
+        "expExitCode": "255",
+        "verifyCmd": "$TC actions get action sample index 5294967299",
+        "matchPattern": "action order [0-9]+: sample rate 1/1024 group 4 pipe.*index 5294967299 ref",
+        "matchCount": "0",
+        "teardown": []
+    },
+    {
+        "id": "dee2",
+        "name": "Add sample action with maximum allowed index",
+        "category": [
+            "actions",
+            "sample"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action sample",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action sample rate 1024 group 4 index 4294967295",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action sample index 4294967295",
+        "matchPattern": "action order [0-9]+: sample rate 1/1024 group 4 pipe.*index 4294967295 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action sample"
+        ]
+    },
+    {
+        "id": "560e",
+        "name": "Add sample action with cookie",
+        "category": [
+            "actions",
+            "sample"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action sample",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action sample rate 1024 group 4 index 45 cookie aabbccdd",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action sample index 45",
+        "matchPattern": "action order [0-9]+: sample rate 1/1024 group 4 pipe.*index 45.*cookie aabbccdd",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action sample"
+        ]
+    },
+    {
+        "id": "704a",
+        "name": "Replace existing sample action with new rate argument",
+        "category": [
+            "actions",
+            "sample"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action sample",
+                0,
+                1,
+                255
+            ],
+            "$TC actions add action sample rate 1024 group 4 index 4"
+        ],
+        "cmdUnderTest": "$TC actions replace action sample rate 2048 group 4 index 4",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions list action sample",
+        "matchPattern": "action order [0-9]+: sample rate 1/2048 group 4 pipe.*index 4",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action sample"
+        ]
+    },
+    {
+        "id": "60eb",
+        "name": "Replace existing sample action with new group argument",
+        "category": [
+            "actions",
+            "sample"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action sample",
+                0,
+                1,
+                255
+            ],
+            "$TC actions add action sample rate 1024 group 4 index 4"
+        ],
+        "cmdUnderTest": "$TC actions replace action sample rate 1024 group 7 index 4",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions list action sample",
+        "matchPattern": "action order [0-9]+: sample rate 1/1024 group 7 pipe.*index 4",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action sample"
+        ]
+    },
+    {
+        "id": "2cce",
+        "name": "Replace existing sample action with new trunc argument",
+        "category": [
+            "actions",
+            "sample"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action sample",
+                0,
+                1,
+                255
+            ],
+            "$TC actions add action sample rate 1024 group 4 trunc 48 index 4"
+        ],
+        "cmdUnderTest": "$TC actions replace action sample rate 1024 group 7 trunc 64 index 4",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions list action sample",
+        "matchPattern": "action order [0-9]+: sample rate 1/1024 group 7 trunc_size 64 pipe.*index 4",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action sample"
+        ]
+    },
+    {
+        "id": "59d1",
+        "name": "Replace existing sample action with new control argument",
+        "category": [
+            "actions",
+            "sample"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action sample",
+                0,
+                1,
+                255
+            ],
+            "$TC actions add action sample rate 1024 group 4 reclassify index 4"
+        ],
+        "cmdUnderTest": "$TC actions replace action sample rate 1024 group 7 pipe index 4",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions list action sample",
+        "matchPattern": "action order [0-9]+: sample rate 1/1024 group 7 pipe.*index 4",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action sample"
+        ]
+    }
+]
-- 
cgit v1.2.3-59-g8ed1b


From d1361840f8c519eaee9a78ffe09e4f0a1b586846 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 16 Apr 2018 10:33:35 -0700
Subject: tcp: fix SO_RCVLOWAT and RCVBUF autotuning

Applications might use SO_RCVLOWAT on TCP socket hoping to receive
one [E]POLLIN event only when a given amount of bytes are ready in socket
receive queue.

Problem is that receive autotuning is not aware of this constraint,
meaning sk_rcvbuf might be too small to allow all bytes to be stored.

Add a new (struct proto_ops)->set_rcvlowat method so that a protocol
can override the default setsockopt(SO_RCVLOWAT) behavior.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net.h |  1 +
 include/net/tcp.h   |  1 +
 net/core/sock.c     |  5 ++++-
 net/ipv4/af_inet.c  |  1 +
 net/ipv4/tcp.c      | 21 +++++++++++++++++++++
 net/ipv6/af_inet6.c |  1 +
 6 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/include/linux/net.h b/include/linux/net.h
index 2248a052061d..6554d3ba4396 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -197,6 +197,7 @@ struct proto_ops {
 					   int offset, size_t size, int flags);
 	int		(*sendmsg_locked)(struct sock *sk, struct msghdr *msg,
 					  size_t size);
+	int		(*set_rcvlowat)(struct sock *sk, int val);
 };
 
 #define DECLARE_SOCKADDR(type, dst, src)	\
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 9c9b3768b350..b2318242cad8 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -402,6 +402,7 @@ void tcp_set_keepalive(struct sock *sk, int val);
 void tcp_syn_ack_timeout(const struct request_sock *req);
 int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 		int flags, int *addr_len);
+int tcp_set_rcvlowat(struct sock *sk, int val);
 void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
 		       struct tcp_options_received *opt_rx,
 		       int estab, struct tcp_fastopen_cookie *foc);
diff --git a/net/core/sock.c b/net/core/sock.c
index 6444525f610c..b2c3db169ca1 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -905,7 +905,10 @@ set_rcvbuf:
 	case SO_RCVLOWAT:
 		if (val < 0)
 			val = INT_MAX;
-		sk->sk_rcvlowat = val ? : 1;
+		if (sock->ops->set_rcvlowat)
+			ret = sock->ops->set_rcvlowat(sk, val);
+		else
+			sk->sk_rcvlowat = val ? : 1;
 		break;
 
 	case SO_RCVTIMEO:
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index eaed0367e669..f5c562aaef35 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1006,6 +1006,7 @@ const struct proto_ops inet_stream_ops = {
 	.compat_getsockopt = compat_sock_common_getsockopt,
 	.compat_ioctl	   = inet_compat_ioctl,
 #endif
+	.set_rcvlowat	   = tcp_set_rcvlowat,
 };
 EXPORT_SYMBOL(inet_stream_ops);
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index bccc4c270087..0abd8d1d3d1d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1701,6 +1701,27 @@ int tcp_peek_len(struct socket *sock)
 }
 EXPORT_SYMBOL(tcp_peek_len);
 
+/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */
+int tcp_set_rcvlowat(struct sock *sk, int val)
+{
+	sk->sk_rcvlowat = val ? : 1;
+	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
+		return 0;
+
+	/* val comes from user space and might be close to INT_MAX */
+	val <<= 1;
+	if (val < 0)
+		val = INT_MAX;
+
+	val = min(val, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
+	if (val > sk->sk_rcvbuf) {
+		sk->sk_rcvbuf = val;
+		tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(tcp_set_rcvlowat);
+
 static void tcp_update_recv_tstamps(struct sk_buff *skb,
 				    struct scm_timestamping *tss)
 {
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 8da0b513f188..e70d59fb26e1 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -590,6 +590,7 @@ const struct proto_ops inet6_stream_ops = {
 	.compat_setsockopt = compat_sock_common_setsockopt,
 	.compat_getsockopt = compat_sock_common_getsockopt,
 #endif
+	.set_rcvlowat	   = tcp_set_rcvlowat,
 };
 
 const struct proto_ops inet6_dgram_ops = {
-- 
cgit v1.2.3-59-g8ed1b


From 796f82eafcd96629c2f9a0332dbb4f474854aaf8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 16 Apr 2018 10:33:36 -0700
Subject: tcp: fix delayed acks behavior for SO_RCVLOWAT

We should not delay acks if there are not enough bytes
in receive queue to satisfy SO_RCVLOWAT.

Since [E]POLLIN event is not going to be generated, there is little
hope for a delayed ack to be useful.

In fact, delaying ACK prevents sender from completing
the transfer.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 367def6ddeda..d854363a4387 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5026,9 +5026,12 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 	    /* More than one full frame received... */
 	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
 	     /* ... and right edge of window advances far enough.
-	      * (tcp_recvmsg() will send ACK otherwise). Or...
+	      * (tcp_recvmsg() will send ACK otherwise).
+	      * If application uses SO_RCVLOWAT, we want send ack now if
+	      * we have not received enough bytes to satisfy the condition.
 	      */
-	     __tcp_select_window(sk) >= tp->rcv_wnd) ||
+	    (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
+	     __tcp_select_window(sk) >= tp->rcv_wnd)) ||
 	    /* We ACK each frame or... */
 	    tcp_in_quickack_mode(sk) ||
 	    /* We have out of order data. */
-- 
cgit v1.2.3-59-g8ed1b


From 03f45c883c6f391ed4fff8292415b35bd1107519 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 16 Apr 2018 10:33:37 -0700
Subject: tcp: avoid extra wakeups for SO_RCVLOWAT users

SO_RCVLOWAT is properly handled in tcp_poll(), so that POLLIN is only
generated when enough bytes are available in receive queue, after
David change (commit c7004482e8dc "tcp: Respect SO_RCVLOWAT in tcp_poll().")

But TCP still calls sk->sk_data_ready() for each chunk added in receive
queue, meaning thread is awaken, and goes back to sleep shortly after.

Tested:

tcp_mmap test program, receiving 32768 MB of data with SO_RCVLOWAT set to 512KB

-> Should get ~2 wakeups (c-switches) per MB, regardless of how many
(tiny or big) packets were received.

High speed (mostly full size GRO packets)

received 32768 MB (100 % mmap'ed) in 8.03112 s, 34.2266 Gbit,
  cpu usage user:0.037 sys:1.404, 43.9758 usec per MB, 65497 c-switches

received 32768 MB (99.9954 % mmap'ed) in 7.98453 s, 34.4263 Gbit,
  cpu usage user:0.03 sys:1.422, 44.3115 usec per MB, 65485 c-switches

Low speed (sender is ratelimited and sends 1-MSS at a time, so GRO is not helping)

received 22474.5 MB (100 % mmap'ed) in 6015.35 s, 0.0313414 Gbit,
  cpu usage user:0.05 sys:1.586, 72.7952 usec per MB, 44950 c-switches

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h    |  1 +
 net/ipv4/tcp.c       |  4 ++++
 net/ipv4/tcp_input.c | 15 +++++++++++++--
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index b2318242cad8..0ee85c47c185 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -403,6 +403,7 @@ void tcp_syn_ack_timeout(const struct request_sock *req);
 int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 		int flags, int *addr_len);
 int tcp_set_rcvlowat(struct sock *sk, int val);
+void tcp_data_ready(struct sock *sk);
 void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
 		       struct tcp_options_received *opt_rx,
 		       int estab, struct tcp_fastopen_cookie *foc);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0abd8d1d3d1d..c768d306b657 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1705,6 +1705,10 @@ EXPORT_SYMBOL(tcp_peek_len);
 int tcp_set_rcvlowat(struct sock *sk, int val)
 {
 	sk->sk_rcvlowat = val ? : 1;
+
+	/* Check if we need to signal EPOLLIN right now */
+	tcp_data_ready(sk);
+
 	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
 		return 0;
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d854363a4387..f93687f97d80 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4576,6 +4576,17 @@ err:
 
 }
 
+void tcp_data_ready(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	int avail = tp->rcv_nxt - tp->copied_seq;
+
+	if (avail < sk->sk_rcvlowat && !sock_flag(sk, SOCK_DONE))
+		return;
+
+	sk->sk_data_ready(sk);
+}
+
 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -4633,7 +4644,7 @@ queue_and_out:
 		if (eaten > 0)
 			kfree_skb_partial(skb, fragstolen);
 		if (!sock_flag(sk, SOCK_DEAD))
-			sk->sk_data_ready(sk);
+			tcp_data_ready(sk);
 		return;
 	}
 
@@ -5434,7 +5445,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 no_ack:
 			if (eaten)
 				kfree_skb_partial(skb, fragstolen);
-			sk->sk_data_ready(sk);
+			tcp_data_ready(sk);
 			return;
 		}
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 93ab6cc69162775201587cc9da00d5016dc890e2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 16 Apr 2018 10:33:38 -0700
Subject: tcp: implement mmap() for zero copy receive

Some networks can make sure TCP payload can exactly fit 4KB pages,
with well chosen MSS/MTU and architectures.

Implement mmap() system call so that applications can avoid
copying data without complex splice() games.

Note that a successful mmap( X bytes) on TCP socket is consuming
bytes, as if recvmsg() has been done. (tp->copied += X)

Only PROT_READ mappings are accepted, as skb page frags
are fundamentally shared and read only.

If tcp_mmap() finds data that is not a full page, or a patch of
urgent data, -EINVAL is returned, no bytes are consumed.

Application must fallback to recvmsg() to read the problematic sequence.

mmap() wont block,  regardless of socket being in blocking or
non-blocking mode. If not enough bytes are in receive queue,
mmap() would return -EAGAIN, or -EIO if socket is in a state
where no other bytes can be added into receive queue.

An application might use SO_RCVLOWAT, poll() and/or ioctl( FIONREAD)
to efficiently use mmap()

On the sender side, MSG_EOR might help to clearly separate unaligned
headers and 4K-aligned chunks if necessary.

Tested:

mlx4 (cx-3) 40Gbit NIC, with tcp_mmap program provided in following patch.
MTU set to 4168  (4096 TCP payload, 40 bytes IPv6 header, 32 bytes TCP header)

Without mmap() (tcp_mmap -s)

received 32768 MB (0 % mmap'ed) in 8.13342 s, 33.7961 Gbit,
  cpu usage user:0.034 sys:3.778, 116.333 usec per MB, 63062 c-switches
received 32768 MB (0 % mmap'ed) in 8.14501 s, 33.748 Gbit,
  cpu usage user:0.029 sys:3.997, 122.864 usec per MB, 61903 c-switches
received 32768 MB (0 % mmap'ed) in 8.11723 s, 33.8635 Gbit,
  cpu usage user:0.048 sys:3.964, 122.437 usec per MB, 62983 c-switches
received 32768 MB (0 % mmap'ed) in 8.39189 s, 32.7552 Gbit,
  cpu usage user:0.038 sys:4.181, 128.754 usec per MB, 55834 c-switches

With mmap() on receiver (tcp_mmap -s -z)

received 32768 MB (100 % mmap'ed) in 8.03083 s, 34.2278 Gbit,
  cpu usage user:0.024 sys:1.466, 45.4712 usec per MB, 65479 c-switches
received 32768 MB (100 % mmap'ed) in 7.98805 s, 34.4111 Gbit,
  cpu usage user:0.026 sys:1.401, 43.5486 usec per MB, 65447 c-switches
received 32768 MB (100 % mmap'ed) in 7.98377 s, 34.4296 Gbit,
  cpu usage user:0.028 sys:1.452, 45.166 usec per MB, 65496 c-switches
received 32768 MB (99.9969 % mmap'ed) in 8.01838 s, 34.281 Gbit,
  cpu usage user:0.02 sys:1.446, 44.7388 usec per MB, 65505 c-switches

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h   |   2 +
 net/ipv4/af_inet.c  |   2 +-
 net/ipv4/tcp.c      | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/af_inet6.c |   2 +-
 4 files changed, 117 insertions(+), 2 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0ee85c47c185..833154e3df17 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -404,6 +404,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 		int flags, int *addr_len);
 int tcp_set_rcvlowat(struct sock *sk, int val);
 void tcp_data_ready(struct sock *sk);
+int tcp_mmap(struct file *file, struct socket *sock,
+	     struct vm_area_struct *vma);
 void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
 		       struct tcp_options_received *opt_rx,
 		       int estab, struct tcp_fastopen_cookie *foc);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f5c562aaef35..3ebf599cebae 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -994,7 +994,7 @@ const struct proto_ops inet_stream_ops = {
 	.getsockopt	   = sock_common_getsockopt,
 	.sendmsg	   = inet_sendmsg,
 	.recvmsg	   = inet_recvmsg,
-	.mmap		   = sock_no_mmap,
+	.mmap		   = tcp_mmap,
 	.sendpage	   = inet_sendpage,
 	.splice_read	   = tcp_splice_read,
 	.read_sock	   = tcp_read_sock,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c768d306b657..438fbca96cd3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1726,6 +1726,119 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
 }
 EXPORT_SYMBOL(tcp_set_rcvlowat);
 
+/* When user wants to mmap X pages, we first need to perform the mapping
+ * before freeing any skbs in receive queue, otherwise user would be unable
+ * to fallback to standard recvmsg(). This happens if some data in the
+ * requested block is not exactly fitting in a page.
+ *
+ * We only support order-0 pages for the moment.
+ * mmap() on TCP is very strict, there is no point
+ * trying to accommodate with pathological layouts.
+ */
+int tcp_mmap(struct file *file, struct socket *sock,
+	     struct vm_area_struct *vma)
+{
+	unsigned long size = vma->vm_end - vma->vm_start;
+	unsigned int nr_pages = size >> PAGE_SHIFT;
+	struct page **pages_array = NULL;
+	u32 seq, len, offset, nr = 0;
+	struct sock *sk = sock->sk;
+	const skb_frag_t *frags;
+	struct tcp_sock *tp;
+	struct sk_buff *skb;
+	int ret;
+
+	if (vma->vm_pgoff || !nr_pages)
+		return -EINVAL;
+
+	if (vma->vm_flags & VM_WRITE)
+		return -EPERM;
+	/* TODO: Maybe the following is not needed if pages are COW */
+	vma->vm_flags &= ~VM_MAYWRITE;
+
+	lock_sock(sk);
+
+	ret = -ENOTCONN;
+	if (sk->sk_state == TCP_LISTEN)
+		goto out;
+
+	sock_rps_record_flow(sk);
+
+	if (tcp_inq(sk) < size) {
+		ret = sock_flag(sk, SOCK_DONE) ? -EIO : -EAGAIN;
+		goto out;
+	}
+	tp = tcp_sk(sk);
+	seq = tp->copied_seq;
+	/* Abort if urgent data is in the area */
+	if (unlikely(tp->urg_data)) {
+		u32 urg_offset = tp->urg_seq - seq;
+
+		ret = -EINVAL;
+		if (urg_offset < size)
+			goto out;
+	}
+	ret = -ENOMEM;
+	pages_array = kvmalloc_array(nr_pages, sizeof(struct page *),
+				     GFP_KERNEL);
+	if (!pages_array)
+		goto out;
+	skb = tcp_recv_skb(sk, seq, &offset);
+	ret = -EINVAL;
+skb_start:
+	/* We do not support anything not in page frags */
+	offset -= skb_headlen(skb);
+	if ((int)offset < 0)
+		goto out;
+	if (skb_has_frag_list(skb))
+		goto out;
+	len = skb->data_len - offset;
+	frags = skb_shinfo(skb)->frags;
+	while (offset) {
+		if (frags->size > offset)
+			goto out;
+		offset -= frags->size;
+		frags++;
+	}
+	while (nr < nr_pages) {
+		if (len) {
+			if (len < PAGE_SIZE)
+				goto out;
+			if (frags->size != PAGE_SIZE || frags->page_offset)
+				goto out;
+			pages_array[nr++] = skb_frag_page(frags);
+			frags++;
+			len -= PAGE_SIZE;
+			seq += PAGE_SIZE;
+			continue;
+		}
+		skb = skb->next;
+		offset = seq - TCP_SKB_CB(skb)->seq;
+		goto skb_start;
+	}
+	/* OK, we have a full set of pages ready to be inserted into vma */
+	for (nr = 0; nr < nr_pages; nr++) {
+		ret = vm_insert_page(vma, vma->vm_start + (nr << PAGE_SHIFT),
+				     pages_array[nr]);
+		if (ret)
+			goto out;
+	}
+	/* operation is complete, we can 'consume' all skbs */
+	tp->copied_seq = seq;
+	tcp_rcv_space_adjust(sk);
+
+	/* Clean up data we have read: This will do ACK frames. */
+	tcp_recv_skb(sk, seq, &offset);
+	tcp_cleanup_rbuf(sk, size);
+
+	ret = 0;
+out:
+	release_sock(sk);
+	kvfree(pages_array);
+	return ret;
+}
+EXPORT_SYMBOL(tcp_mmap);
+
 static void tcp_update_recv_tstamps(struct sk_buff *skb,
 				    struct scm_timestamping *tss)
 {
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index e70d59fb26e1..2c694912df2e 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -579,7 +579,7 @@ const struct proto_ops inet6_stream_ops = {
 	.getsockopt	   = sock_common_getsockopt,	/* ok		*/
 	.sendmsg	   = inet_sendmsg,		/* ok		*/
 	.recvmsg	   = inet_recvmsg,		/* ok		*/
-	.mmap		   = sock_no_mmap,
+	.mmap		   = tcp_mmap,
 	.sendpage	   = inet_sendpage,
 	.sendmsg_locked    = tcp_sendmsg_locked,
 	.sendpage_locked   = tcp_sendpage_locked,
-- 
cgit v1.2.3-59-g8ed1b


From 192dc405f3080f1f7d8e90f638dc08c0e5a803a2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 16 Apr 2018 10:33:39 -0700
Subject: selftests: net: add tcp_mmap program

This is a reference program showing how mmap() can be used
on TCP flows to implement receive zero copy.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/Makefile   |   2 +
 tools/testing/selftests/net/tcp_mmap.c | 437 +++++++++++++++++++++++++++++++++
 2 files changed, 439 insertions(+)
 create mode 100644 tools/testing/selftests/net/tcp_mmap.c

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 785fc18a16b4..23e725f4305e 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -8,9 +8,11 @@ TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetl
 TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh
 TEST_GEN_FILES =  socket
 TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy
+TEST_GEN_FILES += tcp_mmap
 TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
 TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict
 
 include ../lib.mk
 
 $(OUTPUT)/reuseport_bpf_numa: LDFLAGS += -lnuma
+$(OUTPUT)/tcp_mmap: LDFLAGS += -lpthread
diff --git a/tools/testing/selftests/net/tcp_mmap.c b/tools/testing/selftests/net/tcp_mmap.c
new file mode 100644
index 000000000000..dea342fe6f4e
--- /dev/null
+++ b/tools/testing/selftests/net/tcp_mmap.c
@@ -0,0 +1,437 @@
+/*
+ * Copyright 2018 Google Inc.
+ * Author: Eric Dumazet (edumazet@google.com)
+ *
+ * Reference program demonstrating tcp mmap() usage,
+ * and SO_RCVLOWAT hints for receiver.
+ *
+ * Note : NIC with header split is needed to use mmap() on TCP :
+ * Each incoming frame must be a multiple of PAGE_SIZE bytes of TCP payload.
+ *
+ * How to use on loopback interface :
+ *
+ *  ifconfig lo mtu 61512  # 15*4096 + 40 (ipv6 header) + 32 (TCP with TS option header)
+ *  tcp_mmap -s -z &
+ *  tcp_mmap -H ::1 -z
+ *
+ *  Or leave default lo mtu, but use -M option to set TCP_MAXSEG option to (4096 + 12)
+ *      (4096 : page size on x86, 12: TCP TS option length)
+ *  tcp_mmap -s -z -M $((4096+12)) &
+ *  tcp_mmap -H ::1 -z -M $((4096+12))
+ *
+ * Note: -z option on sender uses MSG_ZEROCOPY, which forces a copy when packets go through loopback interface.
+ *       We might use sendfile() instead, but really this test program is about mmap(), for receivers ;)
+ *
+ * $ ./tcp_mmap -s &                                 # Without mmap()
+ * $ for i in {1..4}; do ./tcp_mmap -H ::1 -z ; done
+ * received 32768 MB (0 % mmap'ed) in 14.1157 s, 19.4732 Gbit
+ *   cpu usage user:0.057 sys:7.815, 240.234 usec per MB, 65531 c-switches
+ * received 32768 MB (0 % mmap'ed) in 14.6833 s, 18.7204 Gbit
+ *  cpu usage user:0.043 sys:8.103, 248.596 usec per MB, 65524 c-switches
+ * received 32768 MB (0 % mmap'ed) in 11.143 s, 24.6682 Gbit
+ *   cpu usage user:0.044 sys:6.576, 202.026 usec per MB, 65519 c-switches
+ * received 32768 MB (0 % mmap'ed) in 14.9056 s, 18.4413 Gbit
+ *   cpu usage user:0.036 sys:8.193, 251.129 usec per MB, 65530 c-switches
+ * $ kill %1   # kill tcp_mmap server
+ *
+ * $ ./tcp_mmap -s -z &                              # With mmap()
+ * $ for i in {1..4}; do ./tcp_mmap -H ::1 -z ; done
+ * received 32768 MB (99.9939 % mmap'ed) in 6.73792 s, 40.7956 Gbit
+ *   cpu usage user:0.045 sys:2.827, 87.6465 usec per MB, 65532 c-switches
+ * received 32768 MB (99.9939 % mmap'ed) in 7.26732 s, 37.8238 Gbit
+ *   cpu usage user:0.037 sys:3.087, 95.3369 usec per MB, 65532 c-switches
+ * received 32768 MB (99.9939 % mmap'ed) in 7.61661 s, 36.0893 Gbit
+ *   cpu usage user:0.046 sys:3.559, 110.016 usec per MB, 65529 c-switches
+ * received 32768 MB (99.9939 % mmap'ed) in 7.43764 s, 36.9577 Gbit
+ *   cpu usage user:0.035 sys:3.467, 106.873 usec per MB, 65530 c-switches
+ *
+ * License (GPLv2):
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#define _GNU_SOURCE
+#include <pthread.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <error.h>
+#include <sys/socket.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include <time.h>
+#include <sys/time.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+#include <poll.h>
+
+#ifndef MSG_ZEROCOPY
+#define MSG_ZEROCOPY    0x4000000
+#endif
+
+#define FILE_SZ (1UL << 35)
+static int cfg_family = AF_INET6;
+static socklen_t cfg_alen = sizeof(struct sockaddr_in6);
+static int cfg_port = 8787;
+
+static int rcvbuf; /* Default: autotuning.  Can be set with -r <integer> option */
+static int sndbuf; /* Default: autotuning.  Can be set with -w <integer> option */
+static int zflg; /* zero copy option. (MSG_ZEROCOPY for sender, mmap() for receiver */
+static int xflg; /* hash received data (simple xor) (-h option) */
+static int keepflag; /* -k option: receiver shall keep all received file in memory (no munmap() calls) */
+
+static int chunk_size  = 512*1024;
+
+unsigned long htotal;
+
+static inline void prefetch(const void *x)
+{
+#if defined(__x86_64__)
+	asm volatile("prefetcht0 %P0" : : "m" (*(const char *)x));
+#endif
+}
+
+void hash_zone(void *zone, unsigned int length)
+{
+	unsigned long temp = htotal;
+
+	while (length >= 8*sizeof(long)) {
+		prefetch(zone + 384);
+		temp ^= *(unsigned long *)zone;
+		temp ^= *(unsigned long *)(zone + sizeof(long));
+		temp ^= *(unsigned long *)(zone + 2*sizeof(long));
+		temp ^= *(unsigned long *)(zone + 3*sizeof(long));
+		temp ^= *(unsigned long *)(zone + 4*sizeof(long));
+		temp ^= *(unsigned long *)(zone + 5*sizeof(long));
+		temp ^= *(unsigned long *)(zone + 6*sizeof(long));
+		temp ^= *(unsigned long *)(zone + 7*sizeof(long));
+		zone += 8*sizeof(long);
+		length -= 8*sizeof(long);
+	}
+	while (length >= 1) {
+		temp ^= *(unsigned char *)zone;
+		zone += 1;
+		length--;
+	}
+	htotal = temp;
+}
+
+void *child_thread(void *arg)
+{
+	unsigned long total_mmap = 0, total = 0;
+	unsigned long delta_usec;
+	int flags = MAP_SHARED;
+	struct timeval t0, t1;
+	char *buffer = NULL;
+	void *oaddr = NULL;
+	double throughput;
+	struct rusage ru;
+	int lu, fd;
+
+	fd = (int)(unsigned long)arg;
+
+	gettimeofday(&t0, NULL);
+
+	fcntl(fd, F_SETFL, O_NDELAY);
+	buffer = malloc(chunk_size);
+	if (!buffer) {
+		perror("malloc");
+		goto error;
+	}
+	while (1) {
+		struct pollfd pfd = { .fd = fd, .events = POLLIN, };
+		int sub;
+
+		poll(&pfd, 1, 10000);
+		if (zflg) {
+			void *naddr;
+
+			naddr = mmap(oaddr, chunk_size, PROT_READ, flags, fd, 0);
+			if (naddr == (void *)-1) {
+				if (errno == EAGAIN) {
+					/* That is if SO_RCVLOWAT is buggy */
+					usleep(1000);
+					continue;
+				}
+				if (errno == EINVAL) {
+					flags = MAP_SHARED;
+					oaddr = NULL;
+					goto fallback;
+				}
+				if (errno != EIO)
+					perror("mmap()");
+				break;
+			}
+			total_mmap += chunk_size;
+			if (xflg)
+				hash_zone(naddr, chunk_size);
+			total += chunk_size;
+			if (!keepflag) {
+				flags |= MAP_FIXED;
+				oaddr = naddr;
+			}
+			continue;
+		}
+fallback:
+		sub = 0;
+		while (sub < chunk_size) {
+			lu = read(fd, buffer + sub, chunk_size - sub);
+			if (lu == 0)
+				goto end;
+			if (lu < 0)
+				break;
+			if (xflg)
+				hash_zone(buffer + sub, lu);
+			total += lu;
+			sub += lu;
+		}
+	}
+end:
+	gettimeofday(&t1, NULL);
+	delta_usec = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
+
+	throughput = 0;
+	if (delta_usec)
+		throughput = total * 8.0 / (double)delta_usec / 1000.0;
+	getrusage(RUSAGE_THREAD, &ru);
+	if (total > 1024*1024) {
+		unsigned long total_usec;
+		unsigned long mb = total >> 20;
+		total_usec = 1000000*ru.ru_utime.tv_sec + ru.ru_utime.tv_usec +
+			     1000000*ru.ru_stime.tv_sec + ru.ru_stime.tv_usec;
+		printf("received %lg MB (%lg %% mmap'ed) in %lg s, %lg Gbit\n"
+		       "  cpu usage user:%lg sys:%lg, %lg usec per MB, %lu c-switches\n",
+				total / (1024.0 * 1024.0),
+				100.0*total_mmap/total,
+				(double)delta_usec / 1000000.0,
+				throughput,
+				(double)ru.ru_utime.tv_sec + (double)ru.ru_utime.tv_usec / 1000000.0,
+				(double)ru.ru_stime.tv_sec + (double)ru.ru_stime.tv_usec / 1000000.0,
+				(double)total_usec/mb,
+				ru.ru_nvcsw);
+	}
+error:
+	free(buffer);
+	close(fd);
+	pthread_exit(0);
+}
+
+static void apply_rcvsnd_buf(int fd)
+{
+	if (rcvbuf && setsockopt(fd, SOL_SOCKET,
+				 SO_RCVBUF, &rcvbuf, sizeof(rcvbuf)) == -1) {
+		perror("setsockopt SO_RCVBUF");
+	}
+
+	if (sndbuf && setsockopt(fd, SOL_SOCKET,
+				 SO_SNDBUF, &sndbuf, sizeof(sndbuf)) == -1) {
+		perror("setsockopt SO_SNDBUF");
+	}
+}
+
+
+static void setup_sockaddr(int domain, const char *str_addr,
+			   struct sockaddr_storage *sockaddr)
+{
+	struct sockaddr_in6 *addr6 = (void *) sockaddr;
+	struct sockaddr_in *addr4 = (void *) sockaddr;
+
+	switch (domain) {
+	case PF_INET:
+		memset(addr4, 0, sizeof(*addr4));
+		addr4->sin_family = AF_INET;
+		addr4->sin_port = htons(cfg_port);
+		if (str_addr &&
+		    inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1)
+			error(1, 0, "ipv4 parse error: %s", str_addr);
+		break;
+	case PF_INET6:
+		memset(addr6, 0, sizeof(*addr6));
+		addr6->sin6_family = AF_INET6;
+		addr6->sin6_port = htons(cfg_port);
+		if (str_addr &&
+		    inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1)
+			error(1, 0, "ipv6 parse error: %s", str_addr);
+		break;
+	default:
+		error(1, 0, "illegal domain");
+	}
+}
+
+static void do_accept(int fdlisten)
+{
+	if (setsockopt(fdlisten, SOL_SOCKET, SO_RCVLOWAT,
+		       &chunk_size, sizeof(chunk_size)) == -1) {
+		perror("setsockopt SO_RCVLOWAT");
+	}
+
+	apply_rcvsnd_buf(fdlisten);
+
+	while (1) {
+		struct sockaddr_in addr;
+		socklen_t addrlen = sizeof(addr);
+		pthread_t th;
+		int fd, res;
+
+		fd = accept(fdlisten, (struct sockaddr *)&addr, &addrlen);
+		if (fd == -1) {
+			perror("accept");
+			continue;
+		}
+		res = pthread_create(&th, NULL, child_thread,
+				     (void *)(unsigned long)fd);
+		if (res) {
+			errno = res;
+			perror("pthread_create");
+			close(fd);
+		}
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	struct sockaddr_storage listenaddr, addr;
+	unsigned int max_pacing_rate = 0;
+	unsigned long total = 0;
+	char *host = NULL;
+	int fd, c, on = 1;
+	char *buffer;
+	int sflg = 0;
+	int mss = 0;
+
+	while ((c = getopt(argc, argv, "46p:svr:w:H:zxkP:M:")) != -1) {
+		switch (c) {
+		case '4':
+			cfg_family = PF_INET;
+			cfg_alen = sizeof(struct sockaddr_in);
+			break;
+		case '6':
+			cfg_family = PF_INET6;
+			cfg_alen = sizeof(struct sockaddr_in6);
+			break;
+		case 'p':
+			cfg_port = atoi(optarg);
+			break;
+		case 'H':
+			host = optarg;
+			break;
+		case 's': /* server : listen for incoming connections */
+			sflg++;
+			break;
+		case 'r':
+			rcvbuf = atoi(optarg);
+			break;
+		case 'w':
+			sndbuf = atoi(optarg);
+			break;
+		case 'z':
+			zflg = 1;
+			break;
+		case 'M':
+			mss = atoi(optarg);
+			break;
+		case 'x':
+			xflg = 1;
+			break;
+		case 'k':
+			keepflag = 1;
+			break;
+		case 'P':
+			max_pacing_rate = atoi(optarg) ;
+			break;
+		default:
+			exit(1);
+		}
+	}
+	if (sflg) {
+		int fdlisten = socket(cfg_family, SOCK_STREAM, 0);
+
+		if (fdlisten == -1) {
+			perror("socket");
+			exit(1);
+		}
+		apply_rcvsnd_buf(fdlisten);
+		setsockopt(fdlisten, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
+
+		setup_sockaddr(cfg_family, host, &listenaddr);
+
+		if (mss &&
+		    setsockopt(fdlisten, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) {
+			perror("setsockopt TCP_MAXSEG");
+			exit(1);
+		}
+		if (bind(fdlisten, (const struct sockaddr *)&listenaddr, cfg_alen) == -1) {
+			perror("bind");
+			exit(1);
+		}
+		if (listen(fdlisten, 128) == -1) {
+			perror("listen");
+			exit(1);
+		}
+		do_accept(fdlisten);
+	}
+	buffer = mmap(NULL, chunk_size, PROT_READ | PROT_WRITE,
+			      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (buffer == (char *)-1) {
+		perror("mmap");
+		exit(1);
+	}
+
+	fd = socket(AF_INET6, SOCK_STREAM, 0);
+	if (fd == -1) {
+		perror("socket");
+		exit(1);
+	}
+	apply_rcvsnd_buf(fd);
+
+	setup_sockaddr(cfg_family, host, &addr);
+
+	if (mss &&
+	    setsockopt(fd, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) {
+		perror("setsockopt TCP_MAXSEG");
+		exit(1);
+	}
+	if (connect(fd, (const struct sockaddr *)&addr, cfg_alen) == -1) {
+		perror("connect");
+		exit(1);
+	}
+	if (max_pacing_rate &&
+	    setsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE,
+		       &max_pacing_rate, sizeof(max_pacing_rate)) == -1)
+		perror("setsockopt SO_MAX_PACING_RATE");
+
+	if (zflg && setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY,
+			       &on, sizeof(on)) == -1) {
+		perror("setsockopt SO_ZEROCOPY, (-z option disabled)");
+		zflg = 0;
+	}
+	while (total < FILE_SZ) {
+		long wr = FILE_SZ - total;
+
+		if (wr > chunk_size)
+			wr = chunk_size;
+		/* Note : we just want to fill the pipe with 0 bytes */
+		wr = send(fd, buffer, wr, zflg ? MSG_ZEROCOPY : 0);
+		if (wr <= 0)
+			break;
+		total += wr;
+	}
+	close(fd);
+	munmap(buffer, chunk_size);
+	return 0;
+}
-- 
cgit v1.2.3-59-g8ed1b


From 42de047d60bc5d87e369c36115058b9dacc5683c Mon Sep 17 00:00:00 2001
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Mon, 16 Apr 2018 16:08:12 +0100
Subject: net: stmmac: Switch stmmac_desc_ops to generic HW Interface Helpers

Switch stmmac_desc_ops to generic Hardware Interface Helpers instead of
using hard-coded callbacks. This makes the code more readable and more
flexible.

No functional change.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/chain_mode.c   |  14 +--
 drivers/net/ethernet/stmicro/stmmac/common.h       |  55 +--------
 drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c |   4 +-
 drivers/net/ethernet/stmicro/stmmac/enh_desc.c     |   4 +-
 drivers/net/ethernet/stmicro/stmmac/hwif.h         | 125 +++++++++++++++++++++
 drivers/net/ethernet/stmicro/stmmac/norm_desc.c    |   4 +-
 drivers/net/ethernet/stmicro/stmmac/ring_mode.c    |  15 +--
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 110 +++++++++---------
 8 files changed, 195 insertions(+), 136 deletions(-)
 create mode 100644 drivers/net/ethernet/stmicro/stmmac/hwif.h

diff --git a/drivers/net/ethernet/stmicro/stmmac/chain_mode.c b/drivers/net/ethernet/stmicro/stmmac/chain_mode.c
index e93c40b4631e..ca0f9c205f9d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/chain_mode.c
+++ b/drivers/net/ethernet/stmicro/stmmac/chain_mode.c
@@ -51,8 +51,8 @@ static int stmmac_jumbo_frm(void *p, struct sk_buff *skb, int csum)
 	tx_q->tx_skbuff_dma[entry].buf = des2;
 	tx_q->tx_skbuff_dma[entry].len = bmax;
 	/* do not close the descriptor and do not set own bit */
-	priv->hw->desc->prepare_tx_desc(desc, 1, bmax, csum, STMMAC_CHAIN_MODE,
-					0, false, skb->len);
+	stmmac_prepare_tx_desc(priv, desc, 1, bmax, csum, STMMAC_CHAIN_MODE,
+			0, false, skb->len);
 
 	while (len != 0) {
 		tx_q->tx_skbuff[entry] = NULL;
@@ -68,9 +68,8 @@ static int stmmac_jumbo_frm(void *p, struct sk_buff *skb, int csum)
 				return -1;
 			tx_q->tx_skbuff_dma[entry].buf = des2;
 			tx_q->tx_skbuff_dma[entry].len = bmax;
-			priv->hw->desc->prepare_tx_desc(desc, 0, bmax, csum,
-							STMMAC_CHAIN_MODE, 1,
-							false, skb->len);
+			stmmac_prepare_tx_desc(priv, desc, 0, bmax, csum,
+					STMMAC_CHAIN_MODE, 1, false, skb->len);
 			len -= bmax;
 			i++;
 		} else {
@@ -83,9 +82,8 @@ static int stmmac_jumbo_frm(void *p, struct sk_buff *skb, int csum)
 			tx_q->tx_skbuff_dma[entry].buf = des2;
 			tx_q->tx_skbuff_dma[entry].len = len;
 			/* last descriptor can be set now */
-			priv->hw->desc->prepare_tx_desc(desc, 0, len, csum,
-							STMMAC_CHAIN_MODE, 1,
-							true, skb->len);
+			stmmac_prepare_tx_desc(priv, desc, 0, len, csum,
+					STMMAC_CHAIN_MODE, 1, true, skb->len);
 			len = 0;
 		}
 	}
diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index ad2388aee463..2c50d8c0a557 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -32,6 +32,7 @@
 #endif
 
 #include "descs.h"
+#include "hwif.h"
 #include "mmc.h"
 
 /* Synopsys Core versions */
@@ -377,60 +378,6 @@ struct dma_features {
 
 #define JUMBO_LEN		9000
 
-/* Descriptors helpers */
-struct stmmac_desc_ops {
-	/* DMA RX descriptor ring initialization */
-	void (*init_rx_desc) (struct dma_desc *p, int disable_rx_ic, int mode,
-			      int end);
-	/* DMA TX descriptor ring initialization */
-	void (*init_tx_desc) (struct dma_desc *p, int mode, int end);
-
-	/* Invoked by the xmit function to prepare the tx descriptor */
-	void (*prepare_tx_desc) (struct dma_desc *p, int is_fs, int len,
-				 bool csum_flag, int mode, bool tx_own,
-				 bool ls, unsigned int tot_pkt_len);
-	void (*prepare_tso_tx_desc)(struct dma_desc *p, int is_fs, int len1,
-				    int len2, bool tx_own, bool ls,
-				    unsigned int tcphdrlen,
-				    unsigned int tcppayloadlen);
-	/* Set/get the owner of the descriptor */
-	void (*set_tx_owner) (struct dma_desc *p);
-	int (*get_tx_owner) (struct dma_desc *p);
-	/* Clean the tx descriptor as soon as the tx irq is received */
-	void (*release_tx_desc) (struct dma_desc *p, int mode);
-	/* Clear interrupt on tx frame completion. When this bit is
-	 * set an interrupt happens as soon as the frame is transmitted */
-	void (*set_tx_ic)(struct dma_desc *p);
-	/* Last tx segment reports the transmit status */
-	int (*get_tx_ls) (struct dma_desc *p);
-	/* Return the transmit status looking at the TDES1 */
-	int (*tx_status) (void *data, struct stmmac_extra_stats *x,
-			  struct dma_desc *p, void __iomem *ioaddr);
-	/* Get the buffer size from the descriptor */
-	int (*get_tx_len) (struct dma_desc *p);
-	/* Handle extra events on specific interrupts hw dependent */
-	void (*set_rx_owner) (struct dma_desc *p);
-	/* Get the receive frame size */
-	int (*get_rx_frame_len) (struct dma_desc *p, int rx_coe_type);
-	/* Return the reception status looking at the RDES1 */
-	int (*rx_status) (void *data, struct stmmac_extra_stats *x,
-			  struct dma_desc *p);
-	void (*rx_extended_status) (void *data, struct stmmac_extra_stats *x,
-				    struct dma_extended_desc *p);
-	/* Set tx timestamp enable bit */
-	void (*enable_tx_timestamp) (struct dma_desc *p);
-	/* get tx timestamp status */
-	int (*get_tx_timestamp_status) (struct dma_desc *p);
-	/* get timestamp value */
-	 u64(*get_timestamp) (void *desc, u32 ats);
-	/* get rx timestamp status */
-	int (*get_rx_timestamp_status)(void *desc, void *next_desc, u32 ats);
-	/* Display ring */
-	void (*display_ring)(void *head, unsigned int size, bool rx);
-	/* set MSS via context descriptor */
-	void (*set_mss)(struct dma_desc *p, unsigned int mss);
-};
-
 extern const struct stmmac_desc_ops enh_desc_ops;
 extern const struct stmmac_desc_ops ndesc_ops;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
index 2a6521d33e43..65ed896c13cb 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
@@ -223,7 +223,7 @@ static int dwmac4_wrback_get_tx_timestamp_status(struct dma_desc *p)
 	return 0;
 }
 
-static inline u64 dwmac4_get_timestamp(void *desc, u32 ats)
+static inline void dwmac4_get_timestamp(void *desc, u32 ats, u64 *ts)
 {
 	struct dma_desc *p = (struct dma_desc *)desc;
 	u64 ns;
@@ -232,7 +232,7 @@ static inline u64 dwmac4_get_timestamp(void *desc, u32 ats)
 	/* convert high/sec time stamp value to nanosecond */
 	ns += le32_to_cpu(p->des1) * 1000000000ULL;
 
-	return ns;
+	*ts = ns;
 }
 
 static int dwmac4_rx_check_timestamp(void *desc)
diff --git a/drivers/net/ethernet/stmicro/stmmac/enh_desc.c b/drivers/net/ethernet/stmicro/stmmac/enh_desc.c
index 6768a25b6aa0..3bfb3f584be2 100644
--- a/drivers/net/ethernet/stmicro/stmmac/enh_desc.c
+++ b/drivers/net/ethernet/stmicro/stmmac/enh_desc.c
@@ -382,7 +382,7 @@ static int enh_desc_get_tx_timestamp_status(struct dma_desc *p)
 	return (le32_to_cpu(p->des0) & ETDES0_TIME_STAMP_STATUS) >> 17;
 }
 
-static u64 enh_desc_get_timestamp(void *desc, u32 ats)
+static void enh_desc_get_timestamp(void *desc, u32 ats, u64 *ts)
 {
 	u64 ns;
 
@@ -397,7 +397,7 @@ static u64 enh_desc_get_timestamp(void *desc, u32 ats)
 		ns += le32_to_cpu(p->des3) * 1000000000ULL;
 	}
 
-	return ns;
+	*ts = ns;
 }
 
 static int enh_desc_get_rx_timestamp_status(void *desc, void *next_desc,
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
new file mode 100644
index 000000000000..4994677f66cd
--- /dev/null
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: (GPL-2.0 OR MIT)
+// Copyright (c) 2018 Synopsys, Inc. and/or its affiliates.
+// stmmac HW Interface Callbacks
+
+#ifndef __STMMAC_HWIF_H__
+#define __STMMAC_HWIF_H__
+
+#define stmmac_do_void_callback(__priv, __module, __cname,  __arg0, __args...) \
+({ \
+	int __result = -EINVAL; \
+	if ((__priv)->hw->__module->__cname) { \
+		(__priv)->hw->__module->__cname((__arg0), ##__args); \
+		__result = 0; \
+	} \
+	__result; \
+})
+#define stmmac_do_callback(__priv, __module, __cname,  __arg0, __args...) \
+({ \
+	int __result = -EINVAL; \
+	if ((__priv)->hw->__module->__cname) \
+		__result = (__priv)->hw->__module->__cname((__arg0), ##__args); \
+	__result; \
+})
+
+struct stmmac_extra_stats;
+struct stmmac_safety_stats;
+struct dma_desc;
+struct dma_extended_desc;
+
+/* Descriptors helpers */
+struct stmmac_desc_ops {
+	/* DMA RX descriptor ring initialization */
+	void (*init_rx_desc)(struct dma_desc *p, int disable_rx_ic, int mode,
+			int end);
+	/* DMA TX descriptor ring initialization */
+	void (*init_tx_desc)(struct dma_desc *p, int mode, int end);
+	/* Invoked by the xmit function to prepare the tx descriptor */
+	void (*prepare_tx_desc)(struct dma_desc *p, int is_fs, int len,
+			bool csum_flag, int mode, bool tx_own, bool ls,
+			unsigned int tot_pkt_len);
+	void (*prepare_tso_tx_desc)(struct dma_desc *p, int is_fs, int len1,
+			int len2, bool tx_own, bool ls, unsigned int tcphdrlen,
+			unsigned int tcppayloadlen);
+	/* Set/get the owner of the descriptor */
+	void (*set_tx_owner)(struct dma_desc *p);
+	int (*get_tx_owner)(struct dma_desc *p);
+	/* Clean the tx descriptor as soon as the tx irq is received */
+	void (*release_tx_desc)(struct dma_desc *p, int mode);
+	/* Clear interrupt on tx frame completion. When this bit is
+	 * set an interrupt happens as soon as the frame is transmitted */
+	void (*set_tx_ic)(struct dma_desc *p);
+	/* Last tx segment reports the transmit status */
+	int (*get_tx_ls)(struct dma_desc *p);
+	/* Return the transmit status looking at the TDES1 */
+	int (*tx_status)(void *data, struct stmmac_extra_stats *x,
+			struct dma_desc *p, void __iomem *ioaddr);
+	/* Get the buffer size from the descriptor */
+	int (*get_tx_len)(struct dma_desc *p);
+	/* Handle extra events on specific interrupts hw dependent */
+	void (*set_rx_owner)(struct dma_desc *p);
+	/* Get the receive frame size */
+	int (*get_rx_frame_len)(struct dma_desc *p, int rx_coe_type);
+	/* Return the reception status looking at the RDES1 */
+	int (*rx_status)(void *data, struct stmmac_extra_stats *x,
+			struct dma_desc *p);
+	void (*rx_extended_status)(void *data, struct stmmac_extra_stats *x,
+			struct dma_extended_desc *p);
+	/* Set tx timestamp enable bit */
+	void (*enable_tx_timestamp) (struct dma_desc *p);
+	/* get tx timestamp status */
+	int (*get_tx_timestamp_status) (struct dma_desc *p);
+	/* get timestamp value */
+	void (*get_timestamp)(void *desc, u32 ats, u64 *ts);
+	/* get rx timestamp status */
+	int (*get_rx_timestamp_status)(void *desc, void *next_desc, u32 ats);
+	/* Display ring */
+	void (*display_ring)(void *head, unsigned int size, bool rx);
+	/* set MSS via context descriptor */
+	void (*set_mss)(struct dma_desc *p, unsigned int mss);
+};
+
+#define stmmac_init_rx_desc(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, init_rx_desc, __args)
+#define stmmac_init_tx_desc(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, init_tx_desc, __args)
+#define stmmac_prepare_tx_desc(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, prepare_tx_desc, __args)
+#define stmmac_prepare_tso_tx_desc(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, prepare_tso_tx_desc, __args)
+#define stmmac_set_tx_owner(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, set_tx_owner, __args)
+#define stmmac_get_tx_owner(__priv, __args...) \
+	stmmac_do_callback(__priv, desc, get_tx_owner, __args)
+#define stmmac_release_tx_desc(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, release_tx_desc, __args)
+#define stmmac_set_tx_ic(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, set_tx_ic, __args)
+#define stmmac_get_tx_ls(__priv, __args...) \
+	stmmac_do_callback(__priv, desc, get_tx_ls, __args)
+#define stmmac_tx_status(__priv, __args...) \
+	stmmac_do_callback(__priv, desc, tx_status, __args)
+#define stmmac_get_tx_len(__priv, __args...) \
+	stmmac_do_callback(__priv, desc, get_tx_len, __args)
+#define stmmac_set_rx_owner(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, set_rx_owner, __args)
+#define stmmac_get_rx_frame_len(__priv, __args...) \
+	stmmac_do_callback(__priv, desc, get_rx_frame_len, __args)
+#define stmmac_rx_status(__priv, __args...) \
+	stmmac_do_callback(__priv, desc, rx_status, __args)
+#define stmmac_rx_extended_status(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, rx_extended_status, __args)
+#define stmmac_enable_tx_timestamp(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, enable_tx_timestamp, __args)
+#define stmmac_get_tx_timestamp_status(__priv, __args...) \
+	stmmac_do_callback(__priv, desc, get_tx_timestamp_status, __args)
+#define stmmac_get_timestamp(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, get_timestamp, __args)
+#define stmmac_get_rx_timestamp_status(__priv, __args...) \
+	stmmac_do_callback(__priv, desc, get_rx_timestamp_status, __args)
+#define stmmac_display_ring(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, display_ring, __args)
+#define stmmac_set_mss(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, set_mss, __args)
+
+#endif /* __STMMAC_HWIF_H__ */
diff --git a/drivers/net/ethernet/stmicro/stmmac/norm_desc.c b/drivers/net/ethernet/stmicro/stmmac/norm_desc.c
index ebd9e5e00f16..7b1d901bf5bc 100644
--- a/drivers/net/ethernet/stmicro/stmmac/norm_desc.c
+++ b/drivers/net/ethernet/stmicro/stmmac/norm_desc.c
@@ -253,7 +253,7 @@ static int ndesc_get_tx_timestamp_status(struct dma_desc *p)
 	return (le32_to_cpu(p->des0) & TDES0_TIME_STAMP_STATUS) >> 17;
 }
 
-static u64 ndesc_get_timestamp(void *desc, u32 ats)
+static void ndesc_get_timestamp(void *desc, u32 ats, u64 *ts)
 {
 	struct dma_desc *p = (struct dma_desc *)desc;
 	u64 ns;
@@ -262,7 +262,7 @@ static u64 ndesc_get_timestamp(void *desc, u32 ats)
 	/* convert high/sec time stamp value to nanosecond */
 	ns += le32_to_cpu(p->des3) * 1000000000ULL;
 
-	return ns;
+	*ts = ns;
 }
 
 static int ndesc_get_rx_timestamp_status(void *desc, void *next_desc, u32 ats)
diff --git a/drivers/net/ethernet/stmicro/stmmac/ring_mode.c b/drivers/net/ethernet/stmicro/stmmac/ring_mode.c
index 28e4b5d50ce6..808ca758d0ae 100644
--- a/drivers/net/ethernet/stmicro/stmmac/ring_mode.c
+++ b/drivers/net/ethernet/stmicro/stmmac/ring_mode.c
@@ -58,9 +58,8 @@ static int stmmac_jumbo_frm(void *p, struct sk_buff *skb, int csum)
 		tx_q->tx_skbuff_dma[entry].is_jumbo = true;
 
 		desc->des3 = cpu_to_le32(des2 + BUF_SIZE_4KiB);
-		priv->hw->desc->prepare_tx_desc(desc, 1, bmax, csum,
-						STMMAC_RING_MODE, 0,
-						false, skb->len);
+		stmmac_prepare_tx_desc(priv, desc, 1, bmax, csum,
+				STMMAC_RING_MODE, 0, false, skb->len);
 		tx_q->tx_skbuff[entry] = NULL;
 		entry = STMMAC_GET_ENTRY(entry, DMA_TX_SIZE);
 
@@ -79,9 +78,8 @@ static int stmmac_jumbo_frm(void *p, struct sk_buff *skb, int csum)
 		tx_q->tx_skbuff_dma[entry].is_jumbo = true;
 
 		desc->des3 = cpu_to_le32(des2 + BUF_SIZE_4KiB);
-		priv->hw->desc->prepare_tx_desc(desc, 0, len, csum,
-						STMMAC_RING_MODE, 1,
-						true, skb->len);
+		stmmac_prepare_tx_desc(priv, desc, 0, len, csum,
+				STMMAC_RING_MODE, 1, true, skb->len);
 	} else {
 		des2 = dma_map_single(priv->device, skb->data,
 				      nopaged_len, DMA_TO_DEVICE);
@@ -92,9 +90,8 @@ static int stmmac_jumbo_frm(void *p, struct sk_buff *skb, int csum)
 		tx_q->tx_skbuff_dma[entry].len = nopaged_len;
 		tx_q->tx_skbuff_dma[entry].is_jumbo = true;
 		desc->des3 = cpu_to_le32(des2 + BUF_SIZE_4KiB);
-		priv->hw->desc->prepare_tx_desc(desc, 1, nopaged_len, csum,
-						STMMAC_RING_MODE, 0,
-						true, skb->len);
+		stmmac_prepare_tx_desc(priv, desc, 1, nopaged_len, csum,
+				STMMAC_RING_MODE, 0, true, skb->len);
 	}
 
 	tx_q->cur_tx = entry;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 9a16931ce39d..c44e19d5dbf2 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -50,6 +50,7 @@
 #include <linux/reset.h>
 #include <linux/of_mdio.h>
 #include "dwmac1000.h"
+#include "hwif.h"
 
 #define STMMAC_ALIGN(x)	L1_CACHE_ALIGN(x)
 #define	TSO_MAX_BUFF_SIZE	(SZ_16K - 1)
@@ -464,9 +465,9 @@ static void stmmac_get_tx_hwtstamp(struct stmmac_priv *priv,
 		return;
 
 	/* check tx tstamp status */
-	if (priv->hw->desc->get_tx_timestamp_status(p)) {
+	if (stmmac_get_tx_timestamp_status(priv, p)) {
 		/* get the valid tstamp */
-		ns = priv->hw->desc->get_timestamp(p, priv->adv_ts);
+		stmmac_get_timestamp(priv, p, priv->adv_ts, &ns);
 
 		memset(&shhwtstamp, 0, sizeof(struct skb_shared_hwtstamps));
 		shhwtstamp.hwtstamp = ns_to_ktime(ns);
@@ -502,8 +503,8 @@ static void stmmac_get_rx_hwtstamp(struct stmmac_priv *priv, struct dma_desc *p,
 		desc = np;
 
 	/* Check if timestamp is available */
-	if (priv->hw->desc->get_rx_timestamp_status(p, np, priv->adv_ts)) {
-		ns = priv->hw->desc->get_timestamp(desc, priv->adv_ts);
+	if (stmmac_get_rx_timestamp_status(priv, p, np, priv->adv_ts)) {
+		stmmac_get_timestamp(priv, desc, priv->adv_ts, &ns);
 		netdev_dbg(priv->dev, "get valid RX hw timestamp %llu\n", ns);
 		shhwtstamp = skb_hwtstamps(skb);
 		memset(shhwtstamp, 0, sizeof(struct skb_shared_hwtstamps));
@@ -1008,7 +1009,7 @@ static void stmmac_display_rx_rings(struct stmmac_priv *priv)
 			head_rx = (void *)rx_q->dma_rx;
 
 		/* Display RX ring */
-		priv->hw->desc->display_ring(head_rx, DMA_RX_SIZE, true);
+		stmmac_display_ring(priv, head_rx, DMA_RX_SIZE, true);
 	}
 }
 
@@ -1029,7 +1030,7 @@ static void stmmac_display_tx_rings(struct stmmac_priv *priv)
 		else
 			head_tx = (void *)tx_q->dma_tx;
 
-		priv->hw->desc->display_ring(head_tx, DMA_TX_SIZE, false);
+		stmmac_display_ring(priv, head_tx, DMA_TX_SIZE, false);
 	}
 }
 
@@ -1073,13 +1074,13 @@ static void stmmac_clear_rx_descriptors(struct stmmac_priv *priv, u32 queue)
 	/* Clear the RX descriptors */
 	for (i = 0; i < DMA_RX_SIZE; i++)
 		if (priv->extend_desc)
-			priv->hw->desc->init_rx_desc(&rx_q->dma_erx[i].basic,
-						     priv->use_riwt, priv->mode,
-						     (i == DMA_RX_SIZE - 1));
+			stmmac_init_rx_desc(priv, &rx_q->dma_erx[i].basic,
+					priv->use_riwt, priv->mode,
+					(i == DMA_RX_SIZE - 1));
 		else
-			priv->hw->desc->init_rx_desc(&rx_q->dma_rx[i],
-						     priv->use_riwt, priv->mode,
-						     (i == DMA_RX_SIZE - 1));
+			stmmac_init_rx_desc(priv, &rx_q->dma_rx[i],
+					priv->use_riwt, priv->mode,
+					(i == DMA_RX_SIZE - 1));
 }
 
 /**
@@ -1097,13 +1098,11 @@ static void stmmac_clear_tx_descriptors(struct stmmac_priv *priv, u32 queue)
 	/* Clear the TX descriptors */
 	for (i = 0; i < DMA_TX_SIZE; i++)
 		if (priv->extend_desc)
-			priv->hw->desc->init_tx_desc(&tx_q->dma_etx[i].basic,
-						     priv->mode,
-						     (i == DMA_TX_SIZE - 1));
+			stmmac_init_tx_desc(priv, &tx_q->dma_etx[i].basic,
+					priv->mode, (i == DMA_TX_SIZE - 1));
 		else
-			priv->hw->desc->init_tx_desc(&tx_q->dma_tx[i],
-						     priv->mode,
-						     (i == DMA_TX_SIZE - 1));
+			stmmac_init_tx_desc(priv, &tx_q->dma_tx[i],
+					priv->mode, (i == DMA_TX_SIZE - 1));
 }
 
 /**
@@ -1851,9 +1850,8 @@ static void stmmac_tx_clean(struct stmmac_priv *priv, u32 queue)
 		else
 			p = tx_q->dma_tx + entry;
 
-		status = priv->hw->desc->tx_status(&priv->dev->stats,
-						      &priv->xstats, p,
-						      priv->ioaddr);
+		status = stmmac_tx_status(priv, &priv->dev->stats,
+				&priv->xstats, p, priv->ioaddr);
 		/* Check if the descriptor is owned by the DMA */
 		if (unlikely(status & tx_dma_own))
 			break;
@@ -1904,7 +1902,7 @@ static void stmmac_tx_clean(struct stmmac_priv *priv, u32 queue)
 			tx_q->tx_skbuff[entry] = NULL;
 		}
 
-		priv->hw->desc->release_tx_desc(p, priv->mode);
+		stmmac_release_tx_desc(priv, p, priv->mode);
 
 		entry = STMMAC_GET_ENTRY(entry, DMA_TX_SIZE);
 	}
@@ -1957,13 +1955,11 @@ static void stmmac_tx_err(struct stmmac_priv *priv, u32 chan)
 	dma_free_tx_skbufs(priv, chan);
 	for (i = 0; i < DMA_TX_SIZE; i++)
 		if (priv->extend_desc)
-			priv->hw->desc->init_tx_desc(&tx_q->dma_etx[i].basic,
-						     priv->mode,
-						     (i == DMA_TX_SIZE - 1));
+			stmmac_init_tx_desc(priv, &tx_q->dma_etx[i].basic,
+					priv->mode, (i == DMA_TX_SIZE - 1));
 		else
-			priv->hw->desc->init_tx_desc(&tx_q->dma_tx[i],
-						     priv->mode,
-						     (i == DMA_TX_SIZE - 1));
+			stmmac_init_tx_desc(priv, &tx_q->dma_tx[i],
+					priv->mode, (i == DMA_TX_SIZE - 1));
 	tx_q->dirty_tx = 0;
 	tx_q->cur_tx = 0;
 	tx_q->mss = 0;
@@ -2851,10 +2847,10 @@ static void stmmac_tso_allocator(struct stmmac_priv *priv, unsigned int des,
 		buff_size = tmp_len >= TSO_MAX_BUFF_SIZE ?
 			    TSO_MAX_BUFF_SIZE : tmp_len;
 
-		priv->hw->desc->prepare_tso_tx_desc(desc, 0, buff_size,
-			0, 1,
-			(last_segment) && (tmp_len <= TSO_MAX_BUFF_SIZE),
-			0, 0);
+		stmmac_prepare_tso_tx_desc(priv, desc, 0, buff_size,
+				0, 1,
+				(last_segment) && (tmp_len <= TSO_MAX_BUFF_SIZE),
+				0, 0);
 
 		tmp_len -= TSO_MAX_BUFF_SIZE;
 	}
@@ -2926,7 +2922,7 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
 	/* set new MSS value if needed */
 	if (mss != tx_q->mss) {
 		mss_desc = tx_q->dma_tx + tx_q->cur_tx;
-		priv->hw->desc->set_mss(mss_desc, mss);
+		stmmac_set_mss(priv, mss_desc, mss);
 		tx_q->mss = mss;
 		tx_q->cur_tx = STMMAC_GET_ENTRY(tx_q->cur_tx, DMA_TX_SIZE);
 		WARN_ON(tx_q->tx_skbuff[tx_q->cur_tx]);
@@ -3012,7 +3008,7 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
 			  STMMAC_COAL_TIMER(priv->tx_coal_timer));
 	} else {
 		priv->tx_count_frames = 0;
-		priv->hw->desc->set_tx_ic(desc);
+		stmmac_set_tx_ic(priv, desc);
 		priv->xstats.tx_set_ic_bit++;
 	}
 
@@ -3022,11 +3018,11 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
 		     priv->hwts_tx_en)) {
 		/* declare that device is doing timestamping */
 		skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
-		priv->hw->desc->enable_tx_timestamp(first);
+		stmmac_enable_tx_timestamp(priv, first);
 	}
 
 	/* Complete the first descriptor before granting the DMA */
-	priv->hw->desc->prepare_tso_tx_desc(first, 1,
+	stmmac_prepare_tso_tx_desc(priv, first, 1,
 			proto_hdr_len,
 			pay_len,
 			1, tx_q->tx_skbuff_dma[first_entry].last_segment,
@@ -3040,7 +3036,7 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
 		 * sure that MSS's own bit is the last thing written.
 		 */
 		dma_wmb();
-		priv->hw->desc->set_tx_owner(mss_desc);
+		stmmac_set_tx_owner(priv, mss_desc);
 	}
 
 	/* The own bit must be the latest setting done when prepare the
@@ -3054,8 +3050,7 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
 			__func__, tx_q->cur_tx, tx_q->dirty_tx, first_entry,
 			tx_q->cur_tx, first, nfrags);
 
-		priv->hw->desc->display_ring((void *)tx_q->dma_tx, DMA_TX_SIZE,
-					     0);
+		stmmac_display_ring(priv, (void *)tx_q->dma_tx, DMA_TX_SIZE, 0);
 
 		pr_info(">>> frame to be transmitted: ");
 		print_pkt(skb->data, skb_headlen(skb));
@@ -3174,9 +3169,8 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 		tx_q->tx_skbuff_dma[entry].last_segment = last_segment;
 
 		/* Prepare the descriptor and set the own bit too */
-		priv->hw->desc->prepare_tx_desc(desc, 0, len, csum_insertion,
-						priv->mode, 1, last_segment,
-						skb->len);
+		stmmac_prepare_tx_desc(priv, desc, 0, len, csum_insertion,
+				priv->mode, 1, last_segment, skb->len);
 	}
 
 	/* Only the last descriptor gets to point to the skb. */
@@ -3203,7 +3197,7 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 		else
 			tx_head = (void *)tx_q->dma_tx;
 
-		priv->hw->desc->display_ring(tx_head, DMA_TX_SIZE, false);
+		stmmac_display_ring(priv, tx_head, DMA_TX_SIZE, false);
 
 		netdev_dbg(priv->dev, ">>> frame to be transmitted: ");
 		print_pkt(skb->data, skb->len);
@@ -3228,7 +3222,7 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 			  STMMAC_COAL_TIMER(priv->tx_coal_timer));
 	} else {
 		priv->tx_count_frames = 0;
-		priv->hw->desc->set_tx_ic(desc);
+		stmmac_set_tx_ic(priv, desc);
 		priv->xstats.tx_set_ic_bit++;
 	}
 
@@ -3259,13 +3253,13 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 			     priv->hwts_tx_en)) {
 			/* declare that device is doing timestamping */
 			skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
-			priv->hw->desc->enable_tx_timestamp(first);
+			stmmac_enable_tx_timestamp(priv, first);
 		}
 
 		/* Prepare the first descriptor setting the OWN bit too */
-		priv->hw->desc->prepare_tx_desc(first, 1, nopaged_len,
-						csum_insertion, priv->mode, 1,
-						last_segment, skb->len);
+		stmmac_prepare_tx_desc(priv, first, 1, nopaged_len,
+				csum_insertion, priv->mode, 1, last_segment,
+				skb->len);
 
 		/* The own bit must be the latest setting done when prepare the
 		 * descriptor and then barrier is needed to make sure that
@@ -3382,9 +3376,9 @@ static inline void stmmac_rx_refill(struct stmmac_priv *priv, u32 queue)
 		dma_wmb();
 
 		if (unlikely(priv->synopsys_id >= DWMAC_CORE_4_00))
-			priv->hw->desc->init_rx_desc(p, priv->use_riwt, 0, 0);
+			stmmac_init_rx_desc(priv, p, priv->use_riwt, 0, 0);
 		else
-			priv->hw->desc->set_rx_owner(p);
+			stmmac_set_rx_owner(priv, p);
 
 		dma_wmb();
 
@@ -3418,7 +3412,7 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue)
 		else
 			rx_head = (void *)rx_q->dma_rx;
 
-		priv->hw->desc->display_ring(rx_head, DMA_RX_SIZE, true);
+		stmmac_display_ring(priv, rx_head, DMA_RX_SIZE, true);
 	}
 	while (count < limit) {
 		int status;
@@ -3431,8 +3425,8 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue)
 			p = rx_q->dma_rx + entry;
 
 		/* read the status of the incoming frame */
-		status = priv->hw->desc->rx_status(&priv->dev->stats,
-						   &priv->xstats, p);
+		status = stmmac_rx_status(priv, &priv->dev->stats,
+				&priv->xstats, p);
 		/* check if managed by the DMA otherwise go ahead */
 		if (unlikely(status & dma_own))
 			break;
@@ -3449,11 +3443,9 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue)
 
 		prefetch(np);
 
-		if ((priv->extend_desc) && (priv->hw->desc->rx_extended_status))
-			priv->hw->desc->rx_extended_status(&priv->dev->stats,
-							   &priv->xstats,
-							   rx_q->dma_erx +
-							   entry);
+		if (priv->extend_desc)
+			stmmac_rx_extended_status(priv, &priv->dev->stats,
+					&priv->xstats, rx_q->dma_erx + entry);
 		if (unlikely(status == discard_frame)) {
 			priv->dev->stats.rx_errors++;
 			if (priv->hwts_rx_en && !priv->extend_desc) {
@@ -3479,7 +3471,7 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue)
 			else
 				des = le32_to_cpu(p->des2);
 
-			frame_len = priv->hw->desc->get_rx_frame_len(p, coe);
+			frame_len = stmmac_get_rx_frame_len(priv, p, coe);
 
 			/*  If frame length is greater than skb buffer size
 			 *  (preallocated during init) then the packet is
-- 
cgit v1.2.3-59-g8ed1b


From a4e887fa6dcc4e8ca4ba93bca3f40b5ab366edf2 Mon Sep 17 00:00:00 2001
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Mon, 16 Apr 2018 16:08:13 +0100
Subject: net: stmmac: Switch stmmac_dma_ops to generic HW Interface Helpers

Switch stmmac_dma_ops to generic Hardware Interface Helpers instead of
using hard-coded callbacks. This makes the code more readable and more
flexible.

No functional change.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/common.h       |  50 -------
 drivers/net/ethernet/stmicro/stmmac/hwif.h         | 106 +++++++++++++++
 .../net/ethernet/stmicro/stmmac/stmmac_ethtool.c   |  14 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 147 +++++++++------------
 4 files changed, 172 insertions(+), 145 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index 2c50d8c0a557..b27221bd358c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -381,56 +381,6 @@ struct dma_features {
 extern const struct stmmac_desc_ops enh_desc_ops;
 extern const struct stmmac_desc_ops ndesc_ops;
 
-/* Specific DMA helpers */
-struct stmmac_dma_ops {
-	/* DMA core initialization */
-	int (*reset)(void __iomem *ioaddr);
-	void (*init)(void __iomem *ioaddr, struct stmmac_dma_cfg *dma_cfg,
-		     u32 dma_tx, u32 dma_rx, int atds);
-	void (*init_chan)(void __iomem *ioaddr,
-			  struct stmmac_dma_cfg *dma_cfg, u32 chan);
-	void (*init_rx_chan)(void __iomem *ioaddr,
-			     struct stmmac_dma_cfg *dma_cfg,
-			     u32 dma_rx_phy, u32 chan);
-	void (*init_tx_chan)(void __iomem *ioaddr,
-			     struct stmmac_dma_cfg *dma_cfg,
-			     u32 dma_tx_phy, u32 chan);
-	/* Configure the AXI Bus Mode Register */
-	void (*axi)(void __iomem *ioaddr, struct stmmac_axi *axi);
-	/* Dump DMA registers */
-	void (*dump_regs)(void __iomem *ioaddr, u32 *reg_space);
-	/* Set tx/rx threshold in the csr6 register
-	 * An invalid value enables the store-and-forward mode */
-	void (*dma_mode)(void __iomem *ioaddr, int txmode, int rxmode,
-			 int rxfifosz);
-	void (*dma_rx_mode)(void __iomem *ioaddr, int mode, u32 channel,
-			    int fifosz, u8 qmode);
-	void (*dma_tx_mode)(void __iomem *ioaddr, int mode, u32 channel,
-			    int fifosz, u8 qmode);
-	/* To track extra statistic (if supported) */
-	void (*dma_diagnostic_fr) (void *data, struct stmmac_extra_stats *x,
-				   void __iomem *ioaddr);
-	void (*enable_dma_transmission) (void __iomem *ioaddr);
-	void (*enable_dma_irq)(void __iomem *ioaddr, u32 chan);
-	void (*disable_dma_irq)(void __iomem *ioaddr, u32 chan);
-	void (*start_tx)(void __iomem *ioaddr, u32 chan);
-	void (*stop_tx)(void __iomem *ioaddr, u32 chan);
-	void (*start_rx)(void __iomem *ioaddr, u32 chan);
-	void (*stop_rx)(void __iomem *ioaddr, u32 chan);
-	int (*dma_interrupt) (void __iomem *ioaddr,
-			      struct stmmac_extra_stats *x, u32 chan);
-	/* If supported then get the optional core features */
-	void (*get_hw_feature)(void __iomem *ioaddr,
-			       struct dma_features *dma_cap);
-	/* Program the HW RX Watchdog */
-	void (*rx_watchdog)(void __iomem *ioaddr, u32 riwt, u32 number_chan);
-	void (*set_tx_ring_len)(void __iomem *ioaddr, u32 len, u32 chan);
-	void (*set_rx_ring_len)(void __iomem *ioaddr, u32 len, u32 chan);
-	void (*set_rx_tail_ptr)(void __iomem *ioaddr, u32 tail_ptr, u32 chan);
-	void (*set_tx_tail_ptr)(void __iomem *ioaddr, u32 tail_ptr, u32 chan);
-	void (*enable_tso)(void __iomem *ioaddr, bool en, u32 chan);
-};
-
 struct mac_device_info;
 
 /* Helpers to program the MAC core */
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index 4994677f66cd..e1a9ae63f859 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -122,4 +122,110 @@ struct stmmac_desc_ops {
 #define stmmac_set_mss(__priv, __args...) \
 	stmmac_do_void_callback(__priv, desc, set_mss, __args)
 
+struct stmmac_dma_cfg;
+struct dma_features;
+
+/* Specific DMA helpers */
+struct stmmac_dma_ops {
+	/* DMA core initialization */
+	int (*reset)(void __iomem *ioaddr);
+	void (*init)(void __iomem *ioaddr, struct stmmac_dma_cfg *dma_cfg,
+		     u32 dma_tx, u32 dma_rx, int atds);
+	void (*init_chan)(void __iomem *ioaddr,
+			  struct stmmac_dma_cfg *dma_cfg, u32 chan);
+	void (*init_rx_chan)(void __iomem *ioaddr,
+			     struct stmmac_dma_cfg *dma_cfg,
+			     u32 dma_rx_phy, u32 chan);
+	void (*init_tx_chan)(void __iomem *ioaddr,
+			     struct stmmac_dma_cfg *dma_cfg,
+			     u32 dma_tx_phy, u32 chan);
+	/* Configure the AXI Bus Mode Register */
+	void (*axi)(void __iomem *ioaddr, struct stmmac_axi *axi);
+	/* Dump DMA registers */
+	void (*dump_regs)(void __iomem *ioaddr, u32 *reg_space);
+	/* Set tx/rx threshold in the csr6 register
+	 * An invalid value enables the store-and-forward mode */
+	void (*dma_mode)(void __iomem *ioaddr, int txmode, int rxmode,
+			 int rxfifosz);
+	void (*dma_rx_mode)(void __iomem *ioaddr, int mode, u32 channel,
+			    int fifosz, u8 qmode);
+	void (*dma_tx_mode)(void __iomem *ioaddr, int mode, u32 channel,
+			    int fifosz, u8 qmode);
+	/* To track extra statistic (if supported) */
+	void (*dma_diagnostic_fr) (void *data, struct stmmac_extra_stats *x,
+				   void __iomem *ioaddr);
+	void (*enable_dma_transmission) (void __iomem *ioaddr);
+	void (*enable_dma_irq)(void __iomem *ioaddr, u32 chan);
+	void (*disable_dma_irq)(void __iomem *ioaddr, u32 chan);
+	void (*start_tx)(void __iomem *ioaddr, u32 chan);
+	void (*stop_tx)(void __iomem *ioaddr, u32 chan);
+	void (*start_rx)(void __iomem *ioaddr, u32 chan);
+	void (*stop_rx)(void __iomem *ioaddr, u32 chan);
+	int (*dma_interrupt) (void __iomem *ioaddr,
+			      struct stmmac_extra_stats *x, u32 chan);
+	/* If supported then get the optional core features */
+	void (*get_hw_feature)(void __iomem *ioaddr,
+			       struct dma_features *dma_cap);
+	/* Program the HW RX Watchdog */
+	void (*rx_watchdog)(void __iomem *ioaddr, u32 riwt, u32 number_chan);
+	void (*set_tx_ring_len)(void __iomem *ioaddr, u32 len, u32 chan);
+	void (*set_rx_ring_len)(void __iomem *ioaddr, u32 len, u32 chan);
+	void (*set_rx_tail_ptr)(void __iomem *ioaddr, u32 tail_ptr, u32 chan);
+	void (*set_tx_tail_ptr)(void __iomem *ioaddr, u32 tail_ptr, u32 chan);
+	void (*enable_tso)(void __iomem *ioaddr, bool en, u32 chan);
+};
+
+#define stmmac_reset(__priv, __args...) \
+	stmmac_do_callback(__priv, dma, reset, __args)
+#define stmmac_dma_init(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, init, __args)
+#define stmmac_init_chan(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, init_chan, __args)
+#define stmmac_init_rx_chan(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, init_rx_chan, __args)
+#define stmmac_init_tx_chan(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, init_tx_chan, __args)
+#define stmmac_axi(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, axi, __args)
+#define stmmac_dump_dma_regs(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, dump_regs, __args)
+#define stmmac_dma_mode(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, dma_mode, __args)
+#define stmmac_dma_rx_mode(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, dma_rx_mode, __args)
+#define stmmac_dma_tx_mode(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, dma_tx_mode, __args)
+#define stmmac_dma_diagnostic_fr(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, dma_diagnostic_fr, __args)
+#define stmmac_enable_dma_transmission(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, enable_dma_transmission, __args)
+#define stmmac_enable_dma_irq(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, enable_dma_irq, __args)
+#define stmmac_disable_dma_irq(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, disable_dma_irq, __args)
+#define stmmac_start_tx(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, start_tx, __args)
+#define stmmac_stop_tx(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, stop_tx, __args)
+#define stmmac_start_rx(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, start_rx, __args)
+#define stmmac_stop_rx(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, stop_rx, __args)
+#define stmmac_dma_interrupt_status(__priv, __args...) \
+	stmmac_do_callback(__priv, dma, dma_interrupt, __args)
+#define stmmac_get_hw_feature(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, get_hw_feature, __args)
+#define stmmac_rx_watchdog(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, rx_watchdog, __args)
+#define stmmac_set_tx_ring_len(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, set_tx_ring_len, __args)
+#define stmmac_set_rx_ring_len(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, set_rx_ring_len, __args)
+#define stmmac_set_rx_tail_ptr(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, set_rx_tail_ptr, __args)
+#define stmmac_set_tx_tail_ptr(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, set_tx_tail_ptr, __args)
+#define stmmac_enable_tso(__priv, __args...) \
+	stmmac_do_void_callback(__priv, dma, enable_tso, __args)
+
 #endif /* __STMMAC_HWIF_H__ */
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
index 2c6ed47704fc..7009718dc7a6 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
@@ -443,7 +443,7 @@ static void stmmac_ethtool_gregs(struct net_device *dev,
 	memset(reg_space, 0x0, REG_SPACE_SIZE);
 
 	priv->hw->mac->dump_regs(priv->hw, reg_space);
-	priv->hw->dma->dump_regs(priv->ioaddr, reg_space);
+	stmmac_dump_dma_regs(priv, priv->ioaddr, reg_space);
 	/* Copy DMA registers to where ethtool expects them */
 	memcpy(&reg_space[ETHTOOL_DMA_OFFSET], &reg_space[DMA_BUS_MODE / 4],
 	       NUM_DWMAC1000_DMA_REGS * 4);
@@ -529,7 +529,7 @@ static void stmmac_get_ethtool_stats(struct net_device *dev,
 	u32 rx_queues_count = priv->plat->rx_queues_to_use;
 	u32 tx_queues_count = priv->plat->tx_queues_to_use;
 	unsigned long count;
-	int i, j = 0;
+	int i, j = 0, ret;
 
 	if (priv->dma_cap.asp && priv->hw->mac->safety_feat_dump) {
 		dump = priv->hw->mac->safety_feat_dump;
@@ -541,11 +541,9 @@ static void stmmac_get_ethtool_stats(struct net_device *dev,
 	}
 
 	/* Update the DMA HW counters for dwmac10/100 */
-	if (priv->hw->dma->dma_diagnostic_fr)
-		priv->hw->dma->dma_diagnostic_fr(&dev->stats,
-						 (void *) &priv->xstats,
-						 priv->ioaddr);
-	else {
+	ret = stmmac_dma_diagnostic_fr(priv, &dev->stats, (void *) &priv->xstats,
+			priv->ioaddr);
+	if (ret) {
 		/* If supported, for new GMAC chips expose the MMC counters */
 		if (priv->dma_cap.rmon) {
 			dwmac_mmc_read(priv->mmcaddr, &priv->mmc);
@@ -810,7 +808,7 @@ static int stmmac_set_coalesce(struct net_device *dev,
 	priv->tx_coal_frames = ec->tx_max_coalesced_frames;
 	priv->tx_coal_timer = ec->tx_coalesce_usecs;
 	priv->rx_riwt = rx_riwt;
-	priv->hw->dma->rx_watchdog(priv->ioaddr, priv->rx_riwt, rx_cnt);
+	stmmac_rx_watchdog(priv, priv->ioaddr, priv->rx_riwt, rx_cnt);
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index c44e19d5dbf2..7f2aeaede682 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -1677,7 +1677,7 @@ static void stmmac_mac_enable_rx_queues(struct stmmac_priv *priv)
 static void stmmac_start_rx_dma(struct stmmac_priv *priv, u32 chan)
 {
 	netdev_dbg(priv->dev, "DMA RX processes started in channel %d\n", chan);
-	priv->hw->dma->start_rx(priv->ioaddr, chan);
+	stmmac_start_rx(priv, priv->ioaddr, chan);
 }
 
 /**
@@ -1690,7 +1690,7 @@ static void stmmac_start_rx_dma(struct stmmac_priv *priv, u32 chan)
 static void stmmac_start_tx_dma(struct stmmac_priv *priv, u32 chan)
 {
 	netdev_dbg(priv->dev, "DMA TX processes started in channel %d\n", chan);
-	priv->hw->dma->start_tx(priv->ioaddr, chan);
+	stmmac_start_tx(priv, priv->ioaddr, chan);
 }
 
 /**
@@ -1703,7 +1703,7 @@ static void stmmac_start_tx_dma(struct stmmac_priv *priv, u32 chan)
 static void stmmac_stop_rx_dma(struct stmmac_priv *priv, u32 chan)
 {
 	netdev_dbg(priv->dev, "DMA RX processes stopped in channel %d\n", chan);
-	priv->hw->dma->stop_rx(priv->ioaddr, chan);
+	stmmac_stop_rx(priv, priv->ioaddr, chan);
 }
 
 /**
@@ -1716,7 +1716,7 @@ static void stmmac_stop_rx_dma(struct stmmac_priv *priv, u32 chan)
 static void stmmac_stop_tx_dma(struct stmmac_priv *priv, u32 chan)
 {
 	netdev_dbg(priv->dev, "DMA TX processes stopped in channel %d\n", chan);
-	priv->hw->dma->stop_tx(priv->ioaddr, chan);
+	stmmac_stop_tx(priv, priv->ioaddr, chan);
 }
 
 /**
@@ -1807,19 +1807,18 @@ static void stmmac_dma_operation_mode(struct stmmac_priv *priv)
 		for (chan = 0; chan < rx_channels_count; chan++) {
 			qmode = priv->plat->rx_queues_cfg[chan].mode_to_use;
 
-			priv->hw->dma->dma_rx_mode(priv->ioaddr, rxmode, chan,
-						   rxfifosz, qmode);
+			stmmac_dma_rx_mode(priv, priv->ioaddr, rxmode, chan,
+					rxfifosz, qmode);
 		}
 
 		for (chan = 0; chan < tx_channels_count; chan++) {
 			qmode = priv->plat->tx_queues_cfg[chan].mode_to_use;
 
-			priv->hw->dma->dma_tx_mode(priv->ioaddr, txmode, chan,
-						   txfifosz, qmode);
+			stmmac_dma_tx_mode(priv, priv->ioaddr, txmode, chan,
+					txfifosz, qmode);
 		}
 	} else {
-		priv->hw->dma->dma_mode(priv->ioaddr, txmode, rxmode,
-					rxfifosz);
+		stmmac_dma_mode(priv, priv->ioaddr, txmode, rxmode, rxfifosz);
 	}
 }
 
@@ -1927,16 +1926,6 @@ static void stmmac_tx_clean(struct stmmac_priv *priv, u32 queue)
 	netif_tx_unlock(priv->dev);
 }
 
-static inline void stmmac_enable_dma_irq(struct stmmac_priv *priv, u32 chan)
-{
-	priv->hw->dma->enable_dma_irq(priv->ioaddr, chan);
-}
-
-static inline void stmmac_disable_dma_irq(struct stmmac_priv *priv, u32 chan)
-{
-	priv->hw->dma->disable_dma_irq(priv->ioaddr, chan);
-}
-
 /**
  * stmmac_tx_err - to manage the tx error
  * @priv: driver private structure
@@ -2000,13 +1989,12 @@ static void stmmac_set_dma_operation_mode(struct stmmac_priv *priv, u32 txmode,
 	txfifosz /= tx_channels_count;
 
 	if (priv->synopsys_id >= DWMAC_CORE_4_00) {
-		priv->hw->dma->dma_rx_mode(priv->ioaddr, rxmode, chan,
-					   rxfifosz, rxqmode);
-		priv->hw->dma->dma_tx_mode(priv->ioaddr, txmode, chan,
-					   txfifosz, txqmode);
+		stmmac_dma_rx_mode(priv, priv->ioaddr, rxmode, chan, rxfifosz,
+				rxqmode);
+		stmmac_dma_tx_mode(priv, priv->ioaddr, txmode, chan, txfifosz,
+				txqmode);
 	} else {
-		priv->hw->dma->dma_mode(priv->ioaddr, txmode, rxmode,
-					rxfifosz);
+		stmmac_dma_mode(priv, priv->ioaddr, txmode, rxmode, rxfifosz);
 	}
 }
 
@@ -2050,16 +2038,15 @@ static void stmmac_dma_interrupt(struct stmmac_priv *priv)
 	 * all tx queues rather than just a single tx queue.
 	 */
 	for (chan = 0; chan < channels_to_check; chan++)
-		status[chan] = priv->hw->dma->dma_interrupt(priv->ioaddr,
-							    &priv->xstats,
-							    chan);
+		status[chan] = stmmac_dma_interrupt_status(priv, priv->ioaddr,
+				&priv->xstats, chan);
 
 	for (chan = 0; chan < rx_channel_count; chan++) {
 		if (likely(status[chan] & handle_rx)) {
 			struct stmmac_rx_queue *rx_q = &priv->rx_queue[chan];
 
 			if (likely(napi_schedule_prep(&rx_q->napi))) {
-				stmmac_disable_dma_irq(priv, chan);
+				stmmac_disable_dma_irq(priv, priv->ioaddr, chan);
 				__napi_schedule(&rx_q->napi);
 				poll_scheduled = true;
 			}
@@ -2080,7 +2067,8 @@ static void stmmac_dma_interrupt(struct stmmac_priv *priv)
 					&priv->rx_queue[0];
 
 				if (likely(napi_schedule_prep(&rx_q->napi))) {
-					stmmac_disable_dma_irq(priv, chan);
+					stmmac_disable_dma_irq(priv,
+							priv->ioaddr, chan);
 					__napi_schedule(&rx_q->napi);
 				}
 				break;
@@ -2176,15 +2164,7 @@ static void stmmac_selec_desc_mode(struct stmmac_priv *priv)
  */
 static int stmmac_get_hw_features(struct stmmac_priv *priv)
 {
-	u32 ret = 0;
-
-	if (priv->hw->dma->get_hw_feature) {
-		priv->hw->dma->get_hw_feature(priv->ioaddr,
-					      &priv->dma_cap);
-		ret = 1;
-	}
-
-	return ret;
+	return stmmac_get_hw_feature(priv, priv->ioaddr, &priv->dma_cap) == 0;
 }
 
 /**
@@ -2234,7 +2214,7 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv)
 	if (priv->extend_desc && (priv->mode == STMMAC_RING_MODE))
 		atds = 1;
 
-	ret = priv->hw->dma->reset(priv->ioaddr);
+	ret = stmmac_reset(priv, priv->ioaddr);
 	if (ret) {
 		dev_err(priv->device, "Failed to reset the dma\n");
 		return ret;
@@ -2242,51 +2222,48 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv)
 
 	if (priv->synopsys_id >= DWMAC_CORE_4_00) {
 		/* DMA Configuration */
-		priv->hw->dma->init(priv->ioaddr, priv->plat->dma_cfg,
-				    dummy_dma_tx_phy, dummy_dma_rx_phy, atds);
+		stmmac_dma_init(priv, priv->ioaddr, priv->plat->dma_cfg,
+				dummy_dma_tx_phy, dummy_dma_rx_phy, atds);
 
 		/* DMA RX Channel Configuration */
 		for (chan = 0; chan < rx_channels_count; chan++) {
 			rx_q = &priv->rx_queue[chan];
 
-			priv->hw->dma->init_rx_chan(priv->ioaddr,
-						    priv->plat->dma_cfg,
-						    rx_q->dma_rx_phy, chan);
+			stmmac_init_rx_chan(priv, priv->ioaddr,
+					priv->plat->dma_cfg, rx_q->dma_rx_phy,
+					chan);
 
 			rx_q->rx_tail_addr = rx_q->dma_rx_phy +
 				    (DMA_RX_SIZE * sizeof(struct dma_desc));
-			priv->hw->dma->set_rx_tail_ptr(priv->ioaddr,
-						       rx_q->rx_tail_addr,
-						       chan);
+			stmmac_set_rx_tail_ptr(priv, priv->ioaddr,
+					rx_q->rx_tail_addr, chan);
 		}
 
 		/* DMA TX Channel Configuration */
 		for (chan = 0; chan < tx_channels_count; chan++) {
 			tx_q = &priv->tx_queue[chan];
 
-			priv->hw->dma->init_chan(priv->ioaddr,
-						 priv->plat->dma_cfg,
-						 chan);
+			stmmac_init_chan(priv, priv->ioaddr,
+					priv->plat->dma_cfg, chan);
 
-			priv->hw->dma->init_tx_chan(priv->ioaddr,
-						    priv->plat->dma_cfg,
-						    tx_q->dma_tx_phy, chan);
+			stmmac_init_tx_chan(priv, priv->ioaddr,
+					priv->plat->dma_cfg, tx_q->dma_tx_phy,
+					chan);
 
 			tx_q->tx_tail_addr = tx_q->dma_tx_phy +
 				    (DMA_TX_SIZE * sizeof(struct dma_desc));
-			priv->hw->dma->set_tx_tail_ptr(priv->ioaddr,
-						       tx_q->tx_tail_addr,
-						       chan);
+			stmmac_set_tx_tail_ptr(priv, priv->ioaddr,
+					tx_q->tx_tail_addr, chan);
 		}
 	} else {
 		rx_q = &priv->rx_queue[chan];
 		tx_q = &priv->tx_queue[chan];
-		priv->hw->dma->init(priv->ioaddr, priv->plat->dma_cfg,
-				    tx_q->dma_tx_phy, rx_q->dma_rx_phy, atds);
+		stmmac_dma_init(priv, priv->ioaddr, priv->plat->dma_cfg,
+				tx_q->dma_tx_phy, rx_q->dma_rx_phy, atds);
 	}
 
-	if (priv->plat->axi && priv->hw->dma->axi)
-		priv->hw->dma->axi(priv->ioaddr, priv->plat->axi);
+	if (priv->plat->axi)
+		stmmac_axi(priv, priv->ioaddr, priv->plat->axi);
 
 	return ret;
 }
@@ -2332,18 +2309,14 @@ static void stmmac_set_rings_length(struct stmmac_priv *priv)
 	u32 chan;
 
 	/* set TX ring length */
-	if (priv->hw->dma->set_tx_ring_len) {
-		for (chan = 0; chan < tx_channels_count; chan++)
-			priv->hw->dma->set_tx_ring_len(priv->ioaddr,
-						       (DMA_TX_SIZE - 1), chan);
-	}
+	for (chan = 0; chan < tx_channels_count; chan++)
+		stmmac_set_tx_ring_len(priv, priv->ioaddr,
+				(DMA_TX_SIZE - 1), chan);
 
 	/* set RX ring length */
-	if (priv->hw->dma->set_rx_ring_len) {
-		for (chan = 0; chan < rx_channels_count; chan++)
-			priv->hw->dma->set_rx_ring_len(priv->ioaddr,
-						       (DMA_RX_SIZE - 1), chan);
-	}
+	for (chan = 0; chan < rx_channels_count; chan++)
+		stmmac_set_rx_ring_len(priv, priv->ioaddr,
+				(DMA_RX_SIZE - 1), chan);
 }
 
 /**
@@ -2619,9 +2592,10 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp)
 
 	priv->tx_lpi_timer = STMMAC_DEFAULT_TWT_LS;
 
-	if ((priv->use_riwt) && (priv->hw->dma->rx_watchdog)) {
-		priv->rx_riwt = MAX_DMA_RIWT;
-		priv->hw->dma->rx_watchdog(priv->ioaddr, MAX_DMA_RIWT, rx_cnt);
+	if (priv->use_riwt) {
+		ret = stmmac_rx_watchdog(priv, priv->ioaddr, MAX_DMA_RIWT, rx_cnt);
+		if (!ret)
+			priv->rx_riwt = MAX_DMA_RIWT;
 	}
 
 	if (priv->hw->pcs && priv->hw->mac->pcs_ctrl_ane)
@@ -2633,7 +2607,7 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp)
 	/* Enable TSO */
 	if (priv->tso) {
 		for (chan = 0; chan < tx_cnt; chan++)
-			priv->hw->dma->enable_tso(priv->ioaddr, 1, chan);
+			stmmac_enable_tso(priv, priv->ioaddr, 1, chan);
 	}
 
 	return 0;
@@ -3058,8 +3032,7 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	netdev_tx_sent_queue(netdev_get_tx_queue(dev, queue), skb->len);
 
-	priv->hw->dma->set_tx_tail_ptr(priv->ioaddr, tx_q->tx_tail_addr,
-				       queue);
+	stmmac_set_tx_tail_ptr(priv, priv->ioaddr, tx_q->tx_tail_addr, queue);
 
 	return NETDEV_TX_OK;
 
@@ -3271,10 +3244,10 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 	netdev_tx_sent_queue(netdev_get_tx_queue(dev, queue), skb->len);
 
 	if (priv->synopsys_id < DWMAC_CORE_4_00)
-		priv->hw->dma->enable_dma_transmission(priv->ioaddr);
+		stmmac_enable_dma_transmission(priv, priv->ioaddr);
 	else
-		priv->hw->dma->set_tx_tail_ptr(priv->ioaddr, tx_q->tx_tail_addr,
-					       queue);
+		stmmac_set_tx_tail_ptr(priv, priv->ioaddr, tx_q->tx_tail_addr,
+				queue);
 
 	return NETDEV_TX_OK;
 
@@ -3608,7 +3581,7 @@ static int stmmac_poll(struct napi_struct *napi, int budget)
 	work_done = stmmac_rx(priv, budget, rx_q->queue_index);
 	if (work_done < budget) {
 		napi_complete_done(napi, work_done);
-		stmmac_enable_dma_irq(priv, chan);
+		stmmac_enable_dma_irq(priv, priv->ioaddr, chan);
 	}
 	return work_done;
 }
@@ -3778,11 +3751,11 @@ static irqreturn_t stmmac_interrupt(int irq, void *dev_id)
 				priv->hw->mac->host_mtl_irq_status(priv->hw,
 								   queue);
 
-				if (status & CORE_IRQ_MTL_RX_OVERFLOW &&
-				    priv->hw->dma->set_rx_tail_ptr)
-					priv->hw->dma->set_rx_tail_ptr(priv->ioaddr,
-								rx_q->rx_tail_addr,
-								queue);
+				if (status & CORE_IRQ_MTL_RX_OVERFLOW)
+					stmmac_set_rx_tail_ptr(priv,
+							priv->ioaddr,
+							rx_q->rx_tail_addr,
+							queue);
 			}
 		}
 
-- 
cgit v1.2.3-59-g8ed1b


From c10d4c82a5c84c22e1bd6be75478853c28a69d71 Mon Sep 17 00:00:00 2001
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Mon, 16 Apr 2018 16:08:14 +0100
Subject: net: stmmac: Switch stmmac_ops to generic HW Interface Helpers

Switch stmmac_ops to generic Hardware Interface Helpers instead of using
hard-coded callbacks. This makes the code more readable and more
flexible.

No functional change.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/common.h       |  70 -----------
 drivers/net/ethernet/stmicro/stmmac/dwmac5.c       |  19 +--
 drivers/net/ethernet/stmicro/stmmac/dwmac5.h       |   6 +-
 drivers/net/ethernet/stmicro/stmmac/hwif.h         | 138 +++++++++++++++++++++
 .../net/ethernet/stmicro/stmmac/stmmac_ethtool.c   |  68 ++++------
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 122 +++++++++---------
 6 files changed, 235 insertions(+), 188 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index b27221bd358c..0e0b6f19104d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -383,76 +383,6 @@ extern const struct stmmac_desc_ops ndesc_ops;
 
 struct mac_device_info;
 
-/* Helpers to program the MAC core */
-struct stmmac_ops {
-	/* MAC core initialization */
-	void (*core_init)(struct mac_device_info *hw, struct net_device *dev);
-	/* Enable the MAC RX/TX */
-	void (*set_mac)(void __iomem *ioaddr, bool enable);
-	/* Enable and verify that the IPC module is supported */
-	int (*rx_ipc)(struct mac_device_info *hw);
-	/* Enable RX Queues */
-	void (*rx_queue_enable)(struct mac_device_info *hw, u8 mode, u32 queue);
-	/* RX Queues Priority */
-	void (*rx_queue_prio)(struct mac_device_info *hw, u32 prio, u32 queue);
-	/* TX Queues Priority */
-	void (*tx_queue_prio)(struct mac_device_info *hw, u32 prio, u32 queue);
-	/* RX Queues Routing */
-	void (*rx_queue_routing)(struct mac_device_info *hw, u8 packet,
-				 u32 queue);
-	/* Program RX Algorithms */
-	void (*prog_mtl_rx_algorithms)(struct mac_device_info *hw, u32 rx_alg);
-	/* Program TX Algorithms */
-	void (*prog_mtl_tx_algorithms)(struct mac_device_info *hw, u32 tx_alg);
-	/* Set MTL TX queues weight */
-	void (*set_mtl_tx_queue_weight)(struct mac_device_info *hw,
-					u32 weight, u32 queue);
-	/* RX MTL queue to RX dma mapping */
-	void (*map_mtl_to_dma)(struct mac_device_info *hw, u32 queue, u32 chan);
-	/* Configure AV Algorithm */
-	void (*config_cbs)(struct mac_device_info *hw, u32 send_slope,
-			   u32 idle_slope, u32 high_credit, u32 low_credit,
-			   u32 queue);
-	/* Dump MAC registers */
-	void (*dump_regs)(struct mac_device_info *hw, u32 *reg_space);
-	/* Handle extra events on specific interrupts hw dependent */
-	int (*host_irq_status)(struct mac_device_info *hw,
-			       struct stmmac_extra_stats *x);
-	/* Handle MTL interrupts */
-	int (*host_mtl_irq_status)(struct mac_device_info *hw, u32 chan);
-	/* Multicast filter setting */
-	void (*set_filter)(struct mac_device_info *hw, struct net_device *dev);
-	/* Flow control setting */
-	void (*flow_ctrl)(struct mac_device_info *hw, unsigned int duplex,
-			  unsigned int fc, unsigned int pause_time, u32 tx_cnt);
-	/* Set power management mode (e.g. magic frame) */
-	void (*pmt)(struct mac_device_info *hw, unsigned long mode);
-	/* Set/Get Unicast MAC addresses */
-	void (*set_umac_addr)(struct mac_device_info *hw, unsigned char *addr,
-			      unsigned int reg_n);
-	void (*get_umac_addr)(struct mac_device_info *hw, unsigned char *addr,
-			      unsigned int reg_n);
-	void (*set_eee_mode)(struct mac_device_info *hw,
-			     bool en_tx_lpi_clockgating);
-	void (*reset_eee_mode)(struct mac_device_info *hw);
-	void (*set_eee_timer)(struct mac_device_info *hw, int ls, int tw);
-	void (*set_eee_pls)(struct mac_device_info *hw, int link);
-	void (*debug)(void __iomem *ioaddr, struct stmmac_extra_stats *x,
-		      u32 rx_queues, u32 tx_queues);
-	/* PCS calls */
-	void (*pcs_ctrl_ane)(void __iomem *ioaddr, bool ane, bool srgmi_ral,
-			     bool loopback);
-	void (*pcs_rane)(void __iomem *ioaddr, bool restart);
-	void (*pcs_get_adv_lp)(void __iomem *ioaddr, struct rgmii_adv *adv);
-	/* Safety Features */
-	int (*safety_feat_config)(void __iomem *ioaddr, unsigned int asp);
-	bool (*safety_feat_irq_status)(struct net_device *ndev,
-			void __iomem *ioaddr, unsigned int asp,
-			struct stmmac_safety_stats *stats);
-	const char *(*safety_feat_dump)(struct stmmac_safety_stats *stats,
-			int index, unsigned long *count);
-};
-
 /* PTP and HW Timer helpers */
 struct stmmac_hwtimestamp {
 	void (*config_hw_tstamping) (void __iomem *ioaddr, u32 data);
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c
index 860de39999c7..2978550bb7f6 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c
@@ -237,15 +237,16 @@ int dwmac5_safety_feat_config(void __iomem *ioaddr, unsigned int asp)
 	return 0;
 }
 
-bool dwmac5_safety_feat_irq_status(struct net_device *ndev,
+int dwmac5_safety_feat_irq_status(struct net_device *ndev,
 		void __iomem *ioaddr, unsigned int asp,
 		struct stmmac_safety_stats *stats)
 {
-	bool ret = false, err, corr;
+	bool err, corr;
 	u32 mtl, dma;
+	int ret = 0;
 
 	if (!asp)
-		return false;
+		return -EINVAL;
 
 	mtl = readl(ioaddr + MTL_SAFETY_INT_STATUS);
 	dma = readl(ioaddr + DMA_SAFETY_INT_STATUS);
@@ -282,17 +283,19 @@ static const struct dwmac5_error {
 	{ dwmac5_dma_errors },
 };
 
-const char *dwmac5_safety_feat_dump(struct stmmac_safety_stats *stats,
-			int index, unsigned long *count)
+int dwmac5_safety_feat_dump(struct stmmac_safety_stats *stats,
+			int index, unsigned long *count, const char **desc)
 {
 	int module = index / 32, offset = index % 32;
 	unsigned long *ptr = (unsigned long *)stats;
 
 	if (module >= ARRAY_SIZE(dwmac5_all_errors))
-		return NULL;
+		return -EINVAL;
 	if (!dwmac5_all_errors[module].desc[offset].valid)
-		return NULL;
+		return -EINVAL;
 	if (count)
 		*count = *(ptr + index);
-	return dwmac5_all_errors[module].desc[offset].desc;
+	if (desc)
+		*desc = dwmac5_all_errors[module].desc[offset].desc;
+	return 0;
 }
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac5.h b/drivers/net/ethernet/stmicro/stmmac/dwmac5.h
index a0d2c44711b9..bd4c466d303e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac5.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac5.h
@@ -43,10 +43,10 @@
 #define DMA_ECC_INT_STATUS		0x00001088
 
 int dwmac5_safety_feat_config(void __iomem *ioaddr, unsigned int asp);
-bool dwmac5_safety_feat_irq_status(struct net_device *ndev,
+int dwmac5_safety_feat_irq_status(struct net_device *ndev,
 		void __iomem *ioaddr, unsigned int asp,
 		struct stmmac_safety_stats *stats);
-const char *dwmac5_safety_feat_dump(struct stmmac_safety_stats *stats,
-			int index, unsigned long *count);
+int dwmac5_safety_feat_dump(struct stmmac_safety_stats *stats,
+			int index, unsigned long *count, const char **desc);
 
 #endif /* __DWMAC5_H__ */
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index e1a9ae63f859..9575135c0699 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -228,4 +228,142 @@ struct stmmac_dma_ops {
 #define stmmac_enable_tso(__priv, __args...) \
 	stmmac_do_void_callback(__priv, dma, enable_tso, __args)
 
+struct mac_device_info;
+struct net_device;
+struct rgmii_adv;
+struct stmmac_safety_stats;
+
+/* Helpers to program the MAC core */
+struct stmmac_ops {
+	/* MAC core initialization */
+	void (*core_init)(struct mac_device_info *hw, struct net_device *dev);
+	/* Enable the MAC RX/TX */
+	void (*set_mac)(void __iomem *ioaddr, bool enable);
+	/* Enable and verify that the IPC module is supported */
+	int (*rx_ipc)(struct mac_device_info *hw);
+	/* Enable RX Queues */
+	void (*rx_queue_enable)(struct mac_device_info *hw, u8 mode, u32 queue);
+	/* RX Queues Priority */
+	void (*rx_queue_prio)(struct mac_device_info *hw, u32 prio, u32 queue);
+	/* TX Queues Priority */
+	void (*tx_queue_prio)(struct mac_device_info *hw, u32 prio, u32 queue);
+	/* RX Queues Routing */
+	void (*rx_queue_routing)(struct mac_device_info *hw, u8 packet,
+				 u32 queue);
+	/* Program RX Algorithms */
+	void (*prog_mtl_rx_algorithms)(struct mac_device_info *hw, u32 rx_alg);
+	/* Program TX Algorithms */
+	void (*prog_mtl_tx_algorithms)(struct mac_device_info *hw, u32 tx_alg);
+	/* Set MTL TX queues weight */
+	void (*set_mtl_tx_queue_weight)(struct mac_device_info *hw,
+					u32 weight, u32 queue);
+	/* RX MTL queue to RX dma mapping */
+	void (*map_mtl_to_dma)(struct mac_device_info *hw, u32 queue, u32 chan);
+	/* Configure AV Algorithm */
+	void (*config_cbs)(struct mac_device_info *hw, u32 send_slope,
+			   u32 idle_slope, u32 high_credit, u32 low_credit,
+			   u32 queue);
+	/* Dump MAC registers */
+	void (*dump_regs)(struct mac_device_info *hw, u32 *reg_space);
+	/* Handle extra events on specific interrupts hw dependent */
+	int (*host_irq_status)(struct mac_device_info *hw,
+			       struct stmmac_extra_stats *x);
+	/* Handle MTL interrupts */
+	int (*host_mtl_irq_status)(struct mac_device_info *hw, u32 chan);
+	/* Multicast filter setting */
+	void (*set_filter)(struct mac_device_info *hw, struct net_device *dev);
+	/* Flow control setting */
+	void (*flow_ctrl)(struct mac_device_info *hw, unsigned int duplex,
+			  unsigned int fc, unsigned int pause_time, u32 tx_cnt);
+	/* Set power management mode (e.g. magic frame) */
+	void (*pmt)(struct mac_device_info *hw, unsigned long mode);
+	/* Set/Get Unicast MAC addresses */
+	void (*set_umac_addr)(struct mac_device_info *hw, unsigned char *addr,
+			      unsigned int reg_n);
+	void (*get_umac_addr)(struct mac_device_info *hw, unsigned char *addr,
+			      unsigned int reg_n);
+	void (*set_eee_mode)(struct mac_device_info *hw,
+			     bool en_tx_lpi_clockgating);
+	void (*reset_eee_mode)(struct mac_device_info *hw);
+	void (*set_eee_timer)(struct mac_device_info *hw, int ls, int tw);
+	void (*set_eee_pls)(struct mac_device_info *hw, int link);
+	void (*debug)(void __iomem *ioaddr, struct stmmac_extra_stats *x,
+		      u32 rx_queues, u32 tx_queues);
+	/* PCS calls */
+	void (*pcs_ctrl_ane)(void __iomem *ioaddr, bool ane, bool srgmi_ral,
+			     bool loopback);
+	void (*pcs_rane)(void __iomem *ioaddr, bool restart);
+	void (*pcs_get_adv_lp)(void __iomem *ioaddr, struct rgmii_adv *adv);
+	/* Safety Features */
+	int (*safety_feat_config)(void __iomem *ioaddr, unsigned int asp);
+	int (*safety_feat_irq_status)(struct net_device *ndev,
+			void __iomem *ioaddr, unsigned int asp,
+			struct stmmac_safety_stats *stats);
+	int (*safety_feat_dump)(struct stmmac_safety_stats *stats,
+			int index, unsigned long *count, const char **desc);
+};
+
+#define stmmac_core_init(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, core_init, __args)
+#define stmmac_mac_set(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, set_mac, __args)
+#define stmmac_rx_ipc(__priv, __args...) \
+	stmmac_do_callback(__priv, mac, rx_ipc, __args)
+#define stmmac_rx_queue_enable(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, rx_queue_enable, __args)
+#define stmmac_rx_queue_prio(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, rx_queue_prio, __args)
+#define stmmac_tx_queue_prio(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, tx_queue_prio, __args)
+#define stmmac_rx_queue_routing(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, rx_queue_routing, __args)
+#define stmmac_prog_mtl_rx_algorithms(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, prog_mtl_rx_algorithms, __args)
+#define stmmac_prog_mtl_tx_algorithms(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, prog_mtl_tx_algorithms, __args)
+#define stmmac_set_mtl_tx_queue_weight(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, set_mtl_tx_queue_weight, __args)
+#define stmmac_map_mtl_to_dma(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, map_mtl_to_dma, __args)
+#define stmmac_config_cbs(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, config_cbs, __args)
+#define stmmac_dump_mac_regs(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, dump_regs, __args)
+#define stmmac_host_irq_status(__priv, __args...) \
+	stmmac_do_callback(__priv, mac, host_irq_status, __args)
+#define stmmac_host_mtl_irq_status(__priv, __args...) \
+	stmmac_do_callback(__priv, mac, host_mtl_irq_status, __args)
+#define stmmac_set_filter(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, set_filter, __args)
+#define stmmac_flow_ctrl(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, flow_ctrl, __args)
+#define stmmac_pmt(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, pmt, __args)
+#define stmmac_set_umac_addr(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, set_umac_addr, __args)
+#define stmmac_get_umac_addr(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, get_umac_addr, __args)
+#define stmmac_set_eee_mode(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, set_eee_mode, __args)
+#define stmmac_reset_eee_mode(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, reset_eee_mode, __args)
+#define stmmac_set_eee_timer(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, set_eee_timer, __args)
+#define stmmac_set_eee_pls(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, set_eee_pls, __args)
+#define stmmac_mac_debug(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, debug, __args)
+#define stmmac_pcs_ctrl_ane(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, pcs_ctrl_ane, __args)
+#define stmmac_pcs_rane(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, pcs_rane, __args)
+#define stmmac_pcs_get_adv_lp(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mac, pcs_get_adv_lp, __args)
+#define stmmac_safety_feat_config(__priv, __args...) \
+	stmmac_do_callback(__priv, mac, safety_feat_config, __args)
+#define stmmac_safety_feat_irq_status(__priv, __args...) \
+	stmmac_do_callback(__priv, mac, safety_feat_irq_status, __args)
+#define stmmac_safety_feat_dump(__priv, __args...) \
+	stmmac_do_callback(__priv, mac, safety_feat_dump, __args)
+
 #endif /* __STMMAC_HWIF_H__ */
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
index 7009718dc7a6..6d82b3ef5c3b 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
@@ -291,11 +291,9 @@ static int stmmac_ethtool_get_link_ksettings(struct net_device *dev,
 		cmd->base.speed = priv->xstats.pcs_speed;
 
 		/* Get and convert ADV/LP_ADV from the HW AN registers */
-		if (!priv->hw->mac->pcs_get_adv_lp)
+		if (stmmac_pcs_get_adv_lp(priv, priv->ioaddr, &adv))
 			return -EOPNOTSUPP;	/* should never happen indeed */
 
-		priv->hw->mac->pcs_get_adv_lp(priv->ioaddr, &adv);
-
 		/* Encoding of PSE bits is defined in 802.3z, 37.2.1.4 */
 
 		ethtool_convert_link_mode_to_legacy_u32(
@@ -393,11 +391,7 @@ stmmac_ethtool_set_link_ksettings(struct net_device *dev,
 			ADVERTISED_10baseT_Full);
 
 		spin_lock(&priv->lock);
-
-		if (priv->hw->mac->pcs_ctrl_ane)
-			priv->hw->mac->pcs_ctrl_ane(priv->ioaddr, 1,
-						    priv->hw->ps, 0);
-
+		stmmac_pcs_ctrl_ane(priv, priv->ioaddr, 1, priv->hw->ps, 0);
 		spin_unlock(&priv->lock);
 
 		return 0;
@@ -442,7 +436,7 @@ static void stmmac_ethtool_gregs(struct net_device *dev,
 
 	memset(reg_space, 0x0, REG_SPACE_SIZE);
 
-	priv->hw->mac->dump_regs(priv->hw, reg_space);
+	stmmac_dump_mac_regs(priv, priv->hw, reg_space);
 	stmmac_dump_dma_regs(priv, priv->ioaddr, reg_space);
 	/* Copy DMA registers to where ethtool expects them */
 	memcpy(&reg_space[ETHTOOL_DMA_OFFSET], &reg_space[DMA_BUS_MODE / 4],
@@ -454,15 +448,13 @@ stmmac_get_pauseparam(struct net_device *netdev,
 		      struct ethtool_pauseparam *pause)
 {
 	struct stmmac_priv *priv = netdev_priv(netdev);
+	struct rgmii_adv adv_lp;
 
 	pause->rx_pause = 0;
 	pause->tx_pause = 0;
 
-	if (priv->hw->pcs && priv->hw->mac->pcs_get_adv_lp) {
-		struct rgmii_adv adv_lp;
-
+	if (priv->hw->pcs && !stmmac_pcs_get_adv_lp(priv, priv->ioaddr, &adv_lp)) {
 		pause->autoneg = 1;
-		priv->hw->mac->pcs_get_adv_lp(priv->ioaddr, &adv_lp);
 		if (!adv_lp.pause)
 			return;
 	} else {
@@ -488,12 +480,10 @@ stmmac_set_pauseparam(struct net_device *netdev,
 	u32 tx_cnt = priv->plat->tx_queues_to_use;
 	struct phy_device *phy = netdev->phydev;
 	int new_pause = FLOW_OFF;
+	struct rgmii_adv adv_lp;
 
-	if (priv->hw->pcs && priv->hw->mac->pcs_get_adv_lp) {
-		struct rgmii_adv adv_lp;
-
+	if (priv->hw->pcs && !stmmac_pcs_get_adv_lp(priv, priv->ioaddr, &adv_lp)) {
 		pause->autoneg = 1;
-		priv->hw->mac->pcs_get_adv_lp(priv->ioaddr, &adv_lp);
 		if (!adv_lp.pause)
 			return -EOPNOTSUPP;
 	} else {
@@ -515,27 +505,24 @@ stmmac_set_pauseparam(struct net_device *netdev,
 			return phy_start_aneg(phy);
 	}
 
-	priv->hw->mac->flow_ctrl(priv->hw, phy->duplex, priv->flow_ctrl,
-				 priv->pause, tx_cnt);
+	stmmac_flow_ctrl(priv, priv->hw, phy->duplex, priv->flow_ctrl,
+			priv->pause, tx_cnt);
 	return 0;
 }
 
 static void stmmac_get_ethtool_stats(struct net_device *dev,
 				 struct ethtool_stats *dummy, u64 *data)
 {
-	const char *(*dump)(struct stmmac_safety_stats *stats, int index,
-			unsigned long *count);
 	struct stmmac_priv *priv = netdev_priv(dev);
 	u32 rx_queues_count = priv->plat->rx_queues_to_use;
 	u32 tx_queues_count = priv->plat->tx_queues_to_use;
 	unsigned long count;
 	int i, j = 0, ret;
 
-	if (priv->dma_cap.asp && priv->hw->mac->safety_feat_dump) {
-		dump = priv->hw->mac->safety_feat_dump;
-
+	if (priv->dma_cap.asp) {
 		for (i = 0; i < STMMAC_SAFETY_FEAT_SIZE; i++) {
-			if (dump(&priv->sstats, i, &count))
+			if (!stmmac_safety_feat_dump(priv, &priv->sstats, i,
+						&count, NULL))
 				data[j++] = count;
 		}
 	}
@@ -563,11 +550,10 @@ static void stmmac_get_ethtool_stats(struct net_device *dev,
 				priv->xstats.phy_eee_wakeup_error_n = val;
 		}
 
-		if ((priv->hw->mac->debug) &&
-		    (priv->synopsys_id >= DWMAC_CORE_3_50))
-			priv->hw->mac->debug(priv->ioaddr,
-					     (void *)&priv->xstats,
-					     rx_queues_count, tx_queues_count);
+		if (priv->synopsys_id >= DWMAC_CORE_3_50)
+			stmmac_mac_debug(priv, priv->ioaddr,
+					(void *)&priv->xstats,
+					rx_queues_count, tx_queues_count);
 	}
 	for (i = 0; i < STMMAC_STATS_LEN; i++) {
 		char *p = (char *)priv + stmmac_gstrings_stats[i].stat_offset;
@@ -579,8 +565,6 @@ static void stmmac_get_ethtool_stats(struct net_device *dev,
 static int stmmac_get_sset_count(struct net_device *netdev, int sset)
 {
 	struct stmmac_priv *priv = netdev_priv(netdev);
-	const char *(*dump)(struct stmmac_safety_stats *stats, int index,
-			unsigned long *count);
 	int i, len, safety_len = 0;
 
 	switch (sset) {
@@ -589,11 +573,11 @@ static int stmmac_get_sset_count(struct net_device *netdev, int sset)
 
 		if (priv->dma_cap.rmon)
 			len += STMMAC_MMC_STATS_LEN;
-		if (priv->dma_cap.asp && priv->hw->mac->safety_feat_dump) {
-			dump = priv->hw->mac->safety_feat_dump;
-
+		if (priv->dma_cap.asp) {
 			for (i = 0; i < STMMAC_SAFETY_FEAT_SIZE; i++) {
-				if (dump(&priv->sstats, i, NULL))
+				if (!stmmac_safety_feat_dump(priv,
+							&priv->sstats, i,
+							NULL, NULL))
 					safety_len++;
 			}
 
@@ -611,17 +595,15 @@ static void stmmac_get_strings(struct net_device *dev, u32 stringset, u8 *data)
 	int i;
 	u8 *p = data;
 	struct stmmac_priv *priv = netdev_priv(dev);
-	const char *(*dump)(struct stmmac_safety_stats *stats, int index,
-			unsigned long *count);
 
 	switch (stringset) {
 	case ETH_SS_STATS:
-		if (priv->dma_cap.asp && priv->hw->mac->safety_feat_dump) {
-			dump = priv->hw->mac->safety_feat_dump;
+		if (priv->dma_cap.asp) {
 			for (i = 0; i < STMMAC_SAFETY_FEAT_SIZE; i++) {
-				const char *desc = dump(&priv->sstats, i, NULL);
-
-				if (desc) {
+				const char *desc;
+				if (!stmmac_safety_feat_dump(priv,
+							&priv->sstats, i,
+							NULL, &desc)) {
 					memcpy(p, desc, ETH_GSTRING_LEN);
 					p += ETH_GSTRING_LEN;
 				}
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 7f2aeaede682..2b521035185d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -336,8 +336,8 @@ static void stmmac_enable_eee_mode(struct stmmac_priv *priv)
 
 	/* Check and enter in LPI mode */
 	if (!priv->tx_path_in_lpi_mode)
-		priv->hw->mac->set_eee_mode(priv->hw,
-					    priv->plat->en_tx_lpi_clockgating);
+		stmmac_set_eee_mode(priv, priv->hw,
+				priv->plat->en_tx_lpi_clockgating);
 }
 
 /**
@@ -348,7 +348,7 @@ static void stmmac_enable_eee_mode(struct stmmac_priv *priv)
  */
 void stmmac_disable_eee_mode(struct stmmac_priv *priv)
 {
-	priv->hw->mac->reset_eee_mode(priv->hw);
+	stmmac_reset_eee_mode(priv, priv->hw);
 	del_timer_sync(&priv->eee_ctrl_timer);
 	priv->tx_path_in_lpi_mode = false;
 }
@@ -411,8 +411,8 @@ bool stmmac_eee_init(struct stmmac_priv *priv)
 			if (priv->eee_active) {
 				netdev_dbg(priv->dev, "disable EEE\n");
 				del_timer_sync(&priv->eee_ctrl_timer);
-				priv->hw->mac->set_eee_timer(priv->hw, 0,
-							     tx_lpi_timer);
+				stmmac_set_eee_timer(priv, priv->hw, 0,
+						tx_lpi_timer);
 			}
 			priv->eee_active = 0;
 			spin_unlock_irqrestore(&priv->lock, flags);
@@ -427,12 +427,11 @@ bool stmmac_eee_init(struct stmmac_priv *priv)
 			mod_timer(&priv->eee_ctrl_timer,
 				  STMMAC_LPI_T(eee_timer));
 
-			priv->hw->mac->set_eee_timer(priv->hw,
-						     STMMAC_DEFAULT_LIT_LS,
-						     tx_lpi_timer);
+			stmmac_set_eee_timer(priv, priv->hw,
+					STMMAC_DEFAULT_LIT_LS, tx_lpi_timer);
 		}
 		/* Set HW EEE according to the speed */
-		priv->hw->mac->set_eee_pls(priv->hw, ndev->phydev->link);
+		stmmac_set_eee_pls(priv, priv->hw, ndev->phydev->link);
 
 		ret = true;
 		spin_unlock_irqrestore(&priv->lock, flags);
@@ -796,8 +795,8 @@ static void stmmac_mac_flow_ctrl(struct stmmac_priv *priv, u32 duplex)
 {
 	u32 tx_cnt = priv->plat->tx_queues_to_use;
 
-	priv->hw->mac->flow_ctrl(priv->hw, duplex, priv->flow_ctrl,
-				 priv->pause, tx_cnt);
+	stmmac_flow_ctrl(priv, priv->hw, duplex, priv->flow_ctrl,
+			priv->pause, tx_cnt);
 }
 
 /**
@@ -1663,7 +1662,7 @@ static void stmmac_mac_enable_rx_queues(struct stmmac_priv *priv)
 
 	for (queue = 0; queue < rx_queues_count; queue++) {
 		mode = priv->plat->rx_queues_cfg[queue].mode_to_use;
-		priv->hw->mac->rx_queue_enable(priv->hw, mode, queue);
+		stmmac_rx_queue_enable(priv, priv->hw, mode, queue);
 	}
 }
 
@@ -2000,18 +1999,19 @@ static void stmmac_set_dma_operation_mode(struct stmmac_priv *priv, u32 txmode,
 
 static bool stmmac_safety_feat_interrupt(struct stmmac_priv *priv)
 {
-	bool ret = false;
+	int ret = false;
 
 	/* Safety features are only available in cores >= 5.10 */
 	if (priv->synopsys_id < DWMAC_CORE_5_10)
 		return ret;
-	if (priv->hw->mac->safety_feat_irq_status)
-		ret = priv->hw->mac->safety_feat_irq_status(priv->dev,
-				priv->ioaddr, priv->dma_cap.asp, &priv->sstats);
-
-	if (ret)
+	ret = stmmac_safety_feat_irq_status(priv, priv->dev,
+			priv->ioaddr, priv->dma_cap.asp, &priv->sstats);
+	if (ret && (ret != -EINVAL)) {
 		stmmac_global_err(priv);
-	return ret;
+		return true;
+	}
+
+	return false;
 }
 
 /**
@@ -2177,8 +2177,7 @@ static int stmmac_get_hw_features(struct stmmac_priv *priv)
 static void stmmac_check_ether_addr(struct stmmac_priv *priv)
 {
 	if (!is_valid_ether_addr(priv->dev->dev_addr)) {
-		priv->hw->mac->get_umac_addr(priv->hw,
-					     priv->dev->dev_addr, 0);
+		stmmac_get_umac_addr(priv, priv->hw, priv->dev->dev_addr, 0);
 		if (!is_valid_ether_addr(priv->dev->dev_addr))
 			eth_hw_addr_random(priv->dev);
 		netdev_info(priv->dev, "device MAC address %pM\n",
@@ -2332,7 +2331,7 @@ static void stmmac_set_tx_queue_weight(struct stmmac_priv *priv)
 
 	for (queue = 0; queue < tx_queues_count; queue++) {
 		weight = priv->plat->tx_queues_cfg[queue].weight;
-		priv->hw->mac->set_mtl_tx_queue_weight(priv->hw, weight, queue);
+		stmmac_set_mtl_tx_queue_weight(priv, priv->hw, weight, queue);
 	}
 }
 
@@ -2353,7 +2352,7 @@ static void stmmac_configure_cbs(struct stmmac_priv *priv)
 		if (mode_to_use == MTL_QUEUE_DCB)
 			continue;
 
-		priv->hw->mac->config_cbs(priv->hw,
+		stmmac_config_cbs(priv, priv->hw,
 				priv->plat->tx_queues_cfg[queue].send_slope,
 				priv->plat->tx_queues_cfg[queue].idle_slope,
 				priv->plat->tx_queues_cfg[queue].high_credit,
@@ -2375,7 +2374,7 @@ static void stmmac_rx_queue_dma_chan_map(struct stmmac_priv *priv)
 
 	for (queue = 0; queue < rx_queues_count; queue++) {
 		chan = priv->plat->rx_queues_cfg[queue].chan;
-		priv->hw->mac->map_mtl_to_dma(priv->hw, queue, chan);
+		stmmac_map_mtl_to_dma(priv, priv->hw, queue, chan);
 	}
 }
 
@@ -2395,7 +2394,7 @@ static void stmmac_mac_config_rx_queues_prio(struct stmmac_priv *priv)
 			continue;
 
 		prio = priv->plat->rx_queues_cfg[queue].prio;
-		priv->hw->mac->rx_queue_prio(priv->hw, prio, queue);
+		stmmac_rx_queue_prio(priv, priv->hw, prio, queue);
 	}
 }
 
@@ -2415,7 +2414,7 @@ static void stmmac_mac_config_tx_queues_prio(struct stmmac_priv *priv)
 			continue;
 
 		prio = priv->plat->tx_queues_cfg[queue].prio;
-		priv->hw->mac->tx_queue_prio(priv->hw, prio, queue);
+		stmmac_tx_queue_prio(priv, priv->hw, prio, queue);
 	}
 }
 
@@ -2436,7 +2435,7 @@ static void stmmac_mac_config_rx_queues_routing(struct stmmac_priv *priv)
 			continue;
 
 		packet = priv->plat->rx_queues_cfg[queue].pkt_route;
-		priv->hw->mac->rx_queue_routing(priv->hw, packet, queue);
+		stmmac_rx_queue_routing(priv, priv->hw, packet, queue);
 	}
 }
 
@@ -2450,50 +2449,47 @@ static void stmmac_mtl_configuration(struct stmmac_priv *priv)
 	u32 rx_queues_count = priv->plat->rx_queues_to_use;
 	u32 tx_queues_count = priv->plat->tx_queues_to_use;
 
-	if (tx_queues_count > 1 && priv->hw->mac->set_mtl_tx_queue_weight)
+	if (tx_queues_count > 1)
 		stmmac_set_tx_queue_weight(priv);
 
 	/* Configure MTL RX algorithms */
-	if (rx_queues_count > 1 && priv->hw->mac->prog_mtl_rx_algorithms)
-		priv->hw->mac->prog_mtl_rx_algorithms(priv->hw,
-						priv->plat->rx_sched_algorithm);
+	if (rx_queues_count > 1)
+		stmmac_prog_mtl_rx_algorithms(priv, priv->hw,
+				priv->plat->rx_sched_algorithm);
 
 	/* Configure MTL TX algorithms */
-	if (tx_queues_count > 1 && priv->hw->mac->prog_mtl_tx_algorithms)
-		priv->hw->mac->prog_mtl_tx_algorithms(priv->hw,
-						priv->plat->tx_sched_algorithm);
+	if (tx_queues_count > 1)
+		stmmac_prog_mtl_tx_algorithms(priv, priv->hw,
+				priv->plat->tx_sched_algorithm);
 
 	/* Configure CBS in AVB TX queues */
-	if (tx_queues_count > 1 && priv->hw->mac->config_cbs)
+	if (tx_queues_count > 1)
 		stmmac_configure_cbs(priv);
 
 	/* Map RX MTL to DMA channels */
-	if (priv->hw->mac->map_mtl_to_dma)
-		stmmac_rx_queue_dma_chan_map(priv);
+	stmmac_rx_queue_dma_chan_map(priv);
 
 	/* Enable MAC RX Queues */
-	if (priv->hw->mac->rx_queue_enable)
-		stmmac_mac_enable_rx_queues(priv);
+	stmmac_mac_enable_rx_queues(priv);
 
 	/* Set RX priorities */
-	if (rx_queues_count > 1 && priv->hw->mac->rx_queue_prio)
+	if (rx_queues_count > 1)
 		stmmac_mac_config_rx_queues_prio(priv);
 
 	/* Set TX priorities */
-	if (tx_queues_count > 1 && priv->hw->mac->tx_queue_prio)
+	if (tx_queues_count > 1)
 		stmmac_mac_config_tx_queues_prio(priv);
 
 	/* Set RX routing */
-	if (rx_queues_count > 1 && priv->hw->mac->rx_queue_routing)
+	if (rx_queues_count > 1)
 		stmmac_mac_config_rx_queues_routing(priv);
 }
 
 static void stmmac_safety_feat_configuration(struct stmmac_priv *priv)
 {
-	if (priv->hw->mac->safety_feat_config && priv->dma_cap.asp) {
+	if (priv->dma_cap.asp) {
 		netdev_info(priv->dev, "Enabling Safety Features\n");
-		priv->hw->mac->safety_feat_config(priv->ioaddr,
-				priv->dma_cap.asp);
+		stmmac_safety_feat_config(priv, priv->ioaddr, priv->dma_cap.asp);
 	} else {
 		netdev_info(priv->dev, "No Safety Features support found\n");
 	}
@@ -2528,7 +2524,7 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp)
 	}
 
 	/* Copy the MAC addr into the HW  */
-	priv->hw->mac->set_umac_addr(priv->hw, dev->dev_addr, 0);
+	stmmac_set_umac_addr(priv, priv->hw, dev->dev_addr, 0);
 
 	/* PS and related bits will be programmed according to the speed */
 	if (priv->hw->pcs) {
@@ -2544,7 +2540,7 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp)
 	}
 
 	/* Initialize the MAC Core */
-	priv->hw->mac->core_init(priv->hw, dev);
+	stmmac_core_init(priv, priv->hw, dev);
 
 	/* Initialize MTL*/
 	if (priv->synopsys_id >= DWMAC_CORE_4_00)
@@ -2554,7 +2550,7 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp)
 	if (priv->synopsys_id >= DWMAC_CORE_5_10)
 		stmmac_safety_feat_configuration(priv);
 
-	ret = priv->hw->mac->rx_ipc(priv->hw);
+	ret = stmmac_rx_ipc(priv, priv->hw);
 	if (!ret) {
 		netdev_warn(priv->dev, "RX IPC Checksum Offload disabled\n");
 		priv->plat->rx_coe = STMMAC_RX_COE_NONE;
@@ -2562,7 +2558,7 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp)
 	}
 
 	/* Enable the MAC Rx/Tx */
-	priv->hw->mac->set_mac(priv->ioaddr, true);
+	stmmac_mac_set(priv, priv->ioaddr, true);
 
 	/* Set the HW DMA mode and the COE */
 	stmmac_dma_operation_mode(priv);
@@ -2598,8 +2594,8 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp)
 			priv->rx_riwt = MAX_DMA_RIWT;
 	}
 
-	if (priv->hw->pcs && priv->hw->mac->pcs_ctrl_ane)
-		priv->hw->mac->pcs_ctrl_ane(priv->hw, 1, priv->hw->ps, 0);
+	if (priv->hw->pcs)
+		stmmac_pcs_ctrl_ane(priv, priv->hw, 1, priv->hw->ps, 0);
 
 	/* set TX and RX rings length */
 	stmmac_set_rings_length(priv);
@@ -2778,7 +2774,7 @@ static int stmmac_release(struct net_device *dev)
 	free_dma_desc_resources(priv);
 
 	/* Disable the MAC Rx/Tx */
-	priv->hw->mac->set_mac(priv->ioaddr, false);
+	stmmac_mac_set(priv, priv->ioaddr, false);
 
 	netif_carrier_off(dev);
 
@@ -3614,7 +3610,7 @@ static void stmmac_set_rx_mode(struct net_device *dev)
 {
 	struct stmmac_priv *priv = netdev_priv(dev);
 
-	priv->hw->mac->set_filter(priv->hw, dev);
+	stmmac_set_filter(priv, priv->hw, dev);
 }
 
 /**
@@ -3687,7 +3683,7 @@ static int stmmac_set_features(struct net_device *netdev,
 	/* No check needed because rx_coe has been set before and it will be
 	 * fixed in case of issue.
 	 */
-	priv->hw->mac->rx_ipc(priv->hw);
+	stmmac_rx_ipc(priv, priv->hw);
 
 	return 0;
 }
@@ -3731,8 +3727,7 @@ static irqreturn_t stmmac_interrupt(int irq, void *dev_id)
 
 	/* To handle GMAC own interrupts */
 	if ((priv->plat->has_gmac) || (priv->plat->has_gmac4)) {
-		int status = priv->hw->mac->host_irq_status(priv->hw,
-							    &priv->xstats);
+		int status = stmmac_host_irq_status(priv, priv->hw, &priv->xstats);
 
 		if (unlikely(status)) {
 			/* For LPI we need to save the tx status */
@@ -3747,9 +3742,8 @@ static irqreturn_t stmmac_interrupt(int irq, void *dev_id)
 				struct stmmac_rx_queue *rx_q =
 				&priv->rx_queue[queue];
 
-				status |=
-				priv->hw->mac->host_mtl_irq_status(priv->hw,
-								   queue);
+				status |= stmmac_host_mtl_irq_status(priv,
+						priv->hw, queue);
 
 				if (status & CORE_IRQ_MTL_RX_OVERFLOW)
 					stmmac_set_rx_tail_ptr(priv,
@@ -3829,7 +3823,7 @@ static int stmmac_set_mac_address(struct net_device *ndev, void *addr)
 	if (ret)
 		return ret;
 
-	priv->hw->mac->set_umac_addr(priv->hw, ndev->dev_addr, 0);
+	stmmac_set_umac_addr(priv, priv->hw, ndev->dev_addr, 0);
 
 	return ret;
 }
@@ -4418,7 +4412,7 @@ int stmmac_dvr_remove(struct device *dev)
 
 	stmmac_stop_all_dma(priv);
 
-	priv->hw->mac->set_mac(priv->ioaddr, false);
+	stmmac_mac_set(priv, priv->ioaddr, false);
 	netif_carrier_off(ndev);
 	unregister_netdev(ndev);
 	if (priv->plat->stmmac_rst)
@@ -4467,10 +4461,10 @@ int stmmac_suspend(struct device *dev)
 
 	/* Enable Power down mode by programming the PMT regs */
 	if (device_may_wakeup(priv->device)) {
-		priv->hw->mac->pmt(priv->hw, priv->wolopts);
+		stmmac_pmt(priv, priv->hw, priv->wolopts);
 		priv->irq_wake = 1;
 	} else {
-		priv->hw->mac->set_mac(priv->ioaddr, false);
+		stmmac_mac_set(priv, priv->ioaddr, false);
 		pinctrl_pm_select_sleep_state(priv->device);
 		/* Disable clock in case of PWM is off */
 		clk_disable(priv->plat->pclk);
@@ -4534,7 +4528,7 @@ int stmmac_resume(struct device *dev)
 	 */
 	if (device_may_wakeup(priv->device)) {
 		spin_lock_irqsave(&priv->lock, flags);
-		priv->hw->mac->pmt(priv->hw, 0);
+		stmmac_pmt(priv, priv->hw, 0);
 		spin_unlock_irqrestore(&priv->lock, flags);
 		priv->irq_wake = 0;
 	} else {
-- 
cgit v1.2.3-59-g8ed1b


From cc4c9001ce31e0c3933fc67ac8c72899d6584b12 Mon Sep 17 00:00:00 2001
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Mon, 16 Apr 2018 16:08:15 +0100
Subject: net: stmmac: Switch stmmac_hwtimestamp to generic HW Interface
 Helpers

Switch stmmac_hwtimestamp to generic Hardware Interface Helpers instead
of using hard-coded callbacks. This makes the code more readable and
more flexible.

No functional change.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/common.h       | 12 --------
 drivers/net/ethernet/stmicro/stmmac/hwif.h         | 25 ++++++++++++++++
 .../net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c  | 34 ++++++++++++----------
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 17 +++++------
 drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c   | 18 ++++--------
 5 files changed, 56 insertions(+), 50 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index 0e0b6f19104d..7291561ad217 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -383,18 +383,6 @@ extern const struct stmmac_desc_ops ndesc_ops;
 
 struct mac_device_info;
 
-/* PTP and HW Timer helpers */
-struct stmmac_hwtimestamp {
-	void (*config_hw_tstamping) (void __iomem *ioaddr, u32 data);
-	u32 (*config_sub_second_increment)(void __iomem *ioaddr, u32 ptp_clock,
-					   int gmac4);
-	int (*init_systime) (void __iomem *ioaddr, u32 sec, u32 nsec);
-	int (*config_addend) (void __iomem *ioaddr, u32 addend);
-	int (*adjust_systime) (void __iomem *ioaddr, u32 sec, u32 nsec,
-			       int add_sub, int gmac4);
-	 u64(*get_systime) (void __iomem *ioaddr);
-};
-
 extern const struct stmmac_hwtimestamp stmmac_ptp;
 extern const struct stmmac_mode_ops dwmac4_ring_mode_ops;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index 9575135c0699..e23c0a3448bb 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -366,4 +366,29 @@ struct stmmac_ops {
 #define stmmac_safety_feat_dump(__priv, __args...) \
 	stmmac_do_callback(__priv, mac, safety_feat_dump, __args)
 
+/* PTP and HW Timer helpers */
+struct stmmac_hwtimestamp {
+	void (*config_hw_tstamping) (void __iomem *ioaddr, u32 data);
+	void (*config_sub_second_increment)(void __iomem *ioaddr, u32 ptp_clock,
+					   int gmac4, u32 *ssinc);
+	int (*init_systime) (void __iomem *ioaddr, u32 sec, u32 nsec);
+	int (*config_addend) (void __iomem *ioaddr, u32 addend);
+	int (*adjust_systime) (void __iomem *ioaddr, u32 sec, u32 nsec,
+			       int add_sub, int gmac4);
+	void (*get_systime) (void __iomem *ioaddr, u64 *systime);
+};
+
+#define stmmac_config_hw_tstamping(__priv, __args...) \
+	stmmac_do_void_callback(__priv, ptp, config_hw_tstamping, __args)
+#define stmmac_config_sub_second_increment(__priv, __args...) \
+	stmmac_do_void_callback(__priv, ptp, config_sub_second_increment, __args)
+#define stmmac_init_systime(__priv, __args...) \
+	stmmac_do_callback(__priv, ptp, init_systime, __args)
+#define stmmac_config_addend(__priv, __args...) \
+	stmmac_do_callback(__priv, ptp, config_addend, __args)
+#define stmmac_adjust_systime(__priv, __args...) \
+	stmmac_do_callback(__priv, ptp, adjust_systime, __args)
+#define stmmac_get_systime(__priv, __args...) \
+	stmmac_do_void_callback(__priv, ptp, get_systime, __args)
+
 #endif /* __STMMAC_HWIF_H__ */
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c
index 08c19ebd5306..8d9cc2157afd 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c
@@ -24,13 +24,13 @@
 #include "common.h"
 #include "stmmac_ptp.h"
 
-static void stmmac_config_hw_tstamping(void __iomem *ioaddr, u32 data)
+static void config_hw_tstamping(void __iomem *ioaddr, u32 data)
 {
 	writel(data, ioaddr + PTP_TCR);
 }
 
-static u32 stmmac_config_sub_second_increment(void __iomem *ioaddr,
-					      u32 ptp_clock, int gmac4)
+static void config_sub_second_increment(void __iomem *ioaddr,
+		u32 ptp_clock, int gmac4, u32 *ssinc)
 {
 	u32 value = readl(ioaddr + PTP_TCR);
 	unsigned long data;
@@ -57,10 +57,11 @@ static u32 stmmac_config_sub_second_increment(void __iomem *ioaddr,
 
 	writel(reg_value, ioaddr + PTP_SSIR);
 
-	return data;
+	if (ssinc)
+		*ssinc = data;
 }
 
-static int stmmac_init_systime(void __iomem *ioaddr, u32 sec, u32 nsec)
+static int init_systime(void __iomem *ioaddr, u32 sec, u32 nsec)
 {
 	int limit;
 	u32 value;
@@ -85,7 +86,7 @@ static int stmmac_init_systime(void __iomem *ioaddr, u32 sec, u32 nsec)
 	return 0;
 }
 
-static int stmmac_config_addend(void __iomem *ioaddr, u32 addend)
+static int config_addend(void __iomem *ioaddr, u32 addend)
 {
 	u32 value;
 	int limit;
@@ -109,8 +110,8 @@ static int stmmac_config_addend(void __iomem *ioaddr, u32 addend)
 	return 0;
 }
 
-static int stmmac_adjust_systime(void __iomem *ioaddr, u32 sec, u32 nsec,
-				 int add_sub, int gmac4)
+static int adjust_systime(void __iomem *ioaddr, u32 sec, u32 nsec,
+		int add_sub, int gmac4)
 {
 	u32 value;
 	int limit;
@@ -152,7 +153,7 @@ static int stmmac_adjust_systime(void __iomem *ioaddr, u32 sec, u32 nsec,
 	return 0;
 }
 
-static u64 stmmac_get_systime(void __iomem *ioaddr)
+static void get_systime(void __iomem *ioaddr, u64 *systime)
 {
 	u64 ns;
 
@@ -161,14 +162,15 @@ static u64 stmmac_get_systime(void __iomem *ioaddr)
 	/* Get the TSS and convert sec time value to nanosecond */
 	ns += readl(ioaddr + PTP_STSR) * 1000000000ULL;
 
-	return ns;
+	if (systime)
+		*systime = ns;
 }
 
 const struct stmmac_hwtimestamp stmmac_ptp = {
-	.config_hw_tstamping = stmmac_config_hw_tstamping,
-	.init_systime = stmmac_init_systime,
-	.config_sub_second_increment = stmmac_config_sub_second_increment,
-	.config_addend = stmmac_config_addend,
-	.adjust_systime = stmmac_adjust_systime,
-	.get_systime = stmmac_get_systime,
+	.config_hw_tstamping = config_hw_tstamping,
+	.init_systime = init_systime,
+	.config_sub_second_increment = config_sub_second_increment,
+	.config_addend = config_addend,
+	.adjust_systime = adjust_systime,
+	.get_systime = get_systime,
 };
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 2b521035185d..896a1e76f2ec 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -707,18 +707,18 @@ static int stmmac_hwtstamp_ioctl(struct net_device *dev, struct ifreq *ifr)
 	priv->hwts_tx_en = config.tx_type == HWTSTAMP_TX_ON;
 
 	if (!priv->hwts_tx_en && !priv->hwts_rx_en)
-		priv->hw->ptp->config_hw_tstamping(priv->ptpaddr, 0);
+		stmmac_config_hw_tstamping(priv, priv->ptpaddr, 0);
 	else {
 		value = (PTP_TCR_TSENA | PTP_TCR_TSCFUPDT | PTP_TCR_TSCTRLSSR |
 			 tstamp_all | ptp_v2 | ptp_over_ethernet |
 			 ptp_over_ipv6_udp | ptp_over_ipv4_udp | ts_event_en |
 			 ts_master_en | snap_type_sel);
-		priv->hw->ptp->config_hw_tstamping(priv->ptpaddr, value);
+		stmmac_config_hw_tstamping(priv, priv->ptpaddr, value);
 
 		/* program Sub Second Increment reg */
-		sec_inc = priv->hw->ptp->config_sub_second_increment(
-			priv->ptpaddr, priv->plat->clk_ptp_rate,
-			priv->plat->has_gmac4);
+		stmmac_config_sub_second_increment(priv,
+				priv->ptpaddr, priv->plat->clk_ptp_rate,
+				priv->plat->has_gmac4, &sec_inc);
 		temp = div_u64(1000000000ULL, sec_inc);
 
 		/* calculate default added value:
@@ -728,15 +728,14 @@ static int stmmac_hwtstamp_ioctl(struct net_device *dev, struct ifreq *ifr)
 		 */
 		temp = (u64)(temp << 32);
 		priv->default_addend = div_u64(temp, priv->plat->clk_ptp_rate);
-		priv->hw->ptp->config_addend(priv->ptpaddr,
-					     priv->default_addend);
+		stmmac_config_addend(priv, priv->ptpaddr, priv->default_addend);
 
 		/* initialize system time */
 		ktime_get_real_ts64(&now);
 
 		/* lower 32 bits of tv_sec are safe until y2106 */
-		priv->hw->ptp->init_systime(priv->ptpaddr, (u32)now.tv_sec,
-					    now.tv_nsec);
+		stmmac_init_systime(priv, priv->ptpaddr,
+				(u32)now.tv_sec, now.tv_nsec);
 	}
 
 	return copy_to_user(ifr->ifr_data, &config,
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
index e471a903c654..7d3a5c7f5db6 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
@@ -49,9 +49,7 @@ static int stmmac_adjust_freq(struct ptp_clock_info *ptp, s32 ppb)
 	addend = neg_adj ? (addend - diff) : (addend + diff);
 
 	spin_lock_irqsave(&priv->ptp_lock, flags);
-
-	priv->hw->ptp->config_addend(priv->ptpaddr, addend);
-
+	stmmac_config_addend(priv, priv->ptpaddr, addend);
 	spin_unlock_irqrestore(&priv->ptp_lock, flags);
 
 	return 0;
@@ -84,10 +82,8 @@ static int stmmac_adjust_time(struct ptp_clock_info *ptp, s64 delta)
 	nsec = reminder;
 
 	spin_lock_irqsave(&priv->ptp_lock, flags);
-
-	priv->hw->ptp->adjust_systime(priv->ptpaddr, sec, nsec, neg_adj,
-				      priv->plat->has_gmac4);
-
+	stmmac_adjust_systime(priv, priv->ptpaddr, sec, nsec, neg_adj,
+			priv->plat->has_gmac4);
 	spin_unlock_irqrestore(&priv->ptp_lock, flags);
 
 	return 0;
@@ -110,9 +106,7 @@ static int stmmac_get_time(struct ptp_clock_info *ptp, struct timespec64 *ts)
 	u64 ns;
 
 	spin_lock_irqsave(&priv->ptp_lock, flags);
-
-	ns = priv->hw->ptp->get_systime(priv->ptpaddr);
-
+	stmmac_get_systime(priv, priv->ptpaddr, &ns);
 	spin_unlock_irqrestore(&priv->ptp_lock, flags);
 
 	*ts = ns_to_timespec64(ns);
@@ -137,9 +131,7 @@ static int stmmac_set_time(struct ptp_clock_info *ptp,
 	unsigned long flags;
 
 	spin_lock_irqsave(&priv->ptp_lock, flags);
-
-	priv->hw->ptp->init_systime(priv->ptpaddr, ts->tv_sec, ts->tv_nsec);
-
+	stmmac_init_systime(priv, priv->ptpaddr, ts->tv_sec, ts->tv_nsec);
 	spin_unlock_irqrestore(&priv->ptp_lock, flags);
 
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From 2c520b1c9cfa7d43fd251fa47bac6bd953ec8239 Mon Sep 17 00:00:00 2001
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Mon, 16 Apr 2018 16:08:16 +0100
Subject: net: stmmac: Switch stmmac_mode_ops to generic HW Interface Helpers

Switch stmmac_mode_ops to generic Hardware Interface Helpers instead of
using hard-coded callbacks. This makes the code more readable and more
flexible.

No functional change.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/chain_mode.c  | 20 +++++------
 drivers/net/ethernet/stmicro/stmmac/common.h      | 12 -------
 drivers/net/ethernet/stmicro/stmmac/hwif.h        | 27 ++++++++++++++
 drivers/net/ethernet/stmicro/stmmac/ring_mode.c   | 24 ++++++-------
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 43 ++++++++++-------------
 5 files changed, 68 insertions(+), 58 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/chain_mode.c b/drivers/net/ethernet/stmicro/stmmac/chain_mode.c
index ca0f9c205f9d..b9c9003060c5 100644
--- a/drivers/net/ethernet/stmicro/stmmac/chain_mode.c
+++ b/drivers/net/ethernet/stmicro/stmmac/chain_mode.c
@@ -24,7 +24,7 @@
 
 #include "stmmac.h"
 
-static int stmmac_jumbo_frm(void *p, struct sk_buff *skb, int csum)
+static int jumbo_frm(void *p, struct sk_buff *skb, int csum)
 {
 	struct stmmac_tx_queue *tx_q = (struct stmmac_tx_queue *)p;
 	unsigned int nopaged_len = skb_headlen(skb);
@@ -93,7 +93,7 @@ static int stmmac_jumbo_frm(void *p, struct sk_buff *skb, int csum)
 	return entry;
 }
 
-static unsigned int stmmac_is_jumbo_frm(int len, int enh_desc)
+static unsigned int is_jumbo_frm(int len, int enh_desc)
 {
 	unsigned int ret = 0;
 
@@ -105,7 +105,7 @@ static unsigned int stmmac_is_jumbo_frm(int len, int enh_desc)
 	return ret;
 }
 
-static void stmmac_init_dma_chain(void *des, dma_addr_t phy_addr,
+static void init_dma_chain(void *des, dma_addr_t phy_addr,
 				  unsigned int size, unsigned int extend_desc)
 {
 	/*
@@ -135,7 +135,7 @@ static void stmmac_init_dma_chain(void *des, dma_addr_t phy_addr,
 	}
 }
 
-static void stmmac_refill_desc3(void *priv_ptr, struct dma_desc *p)
+static void refill_desc3(void *priv_ptr, struct dma_desc *p)
 {
 	struct stmmac_rx_queue *rx_q = (struct stmmac_rx_queue *)priv_ptr;
 	struct stmmac_priv *priv = rx_q->priv_data;
@@ -151,7 +151,7 @@ static void stmmac_refill_desc3(void *priv_ptr, struct dma_desc *p)
 				      sizeof(struct dma_desc)));
 }
 
-static void stmmac_clean_desc3(void *priv_ptr, struct dma_desc *p)
+static void clean_desc3(void *priv_ptr, struct dma_desc *p)
 {
 	struct stmmac_tx_queue *tx_q = (struct stmmac_tx_queue *)priv_ptr;
 	struct stmmac_priv *priv = tx_q->priv_data;
@@ -169,9 +169,9 @@ static void stmmac_clean_desc3(void *priv_ptr, struct dma_desc *p)
 }
 
 const struct stmmac_mode_ops chain_mode_ops = {
-	.init = stmmac_init_dma_chain,
-	.is_jumbo_frm = stmmac_is_jumbo_frm,
-	.jumbo_frm = stmmac_jumbo_frm,
-	.refill_desc3 = stmmac_refill_desc3,
-	.clean_desc3 = stmmac_clean_desc3,
+	.init = init_dma_chain,
+	.is_jumbo_frm = is_jumbo_frm,
+	.jumbo_frm = jumbo_frm,
+	.refill_desc3 = refill_desc3,
+	.clean_desc3 = clean_desc3,
 };
diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index 7291561ad217..59673c6d2e52 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -405,18 +405,6 @@ struct mii_regs {
 	unsigned int clk_csr_mask;
 };
 
-/* Helpers to manage the descriptors for chain and ring modes */
-struct stmmac_mode_ops {
-	void (*init) (void *des, dma_addr_t phy_addr, unsigned int size,
-		      unsigned int extend_desc);
-	unsigned int (*is_jumbo_frm) (int len, int ehn_desc);
-	int (*jumbo_frm)(void *priv, struct sk_buff *skb, int csum);
-	int (*set_16kib_bfsize)(int mtu);
-	void (*init_desc3)(struct dma_desc *p);
-	void (*refill_desc3) (void *priv, struct dma_desc *p);
-	void (*clean_desc3) (void *priv, struct dma_desc *p);
-};
-
 struct mac_device_info {
 	const struct stmmac_ops *mac;
 	const struct stmmac_desc_ops *desc;
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index e23c0a3448bb..f81ded4a9946 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -391,4 +391,31 @@ struct stmmac_hwtimestamp {
 #define stmmac_get_systime(__priv, __args...) \
 	stmmac_do_void_callback(__priv, ptp, get_systime, __args)
 
+/* Helpers to manage the descriptors for chain and ring modes */
+struct stmmac_mode_ops {
+	void (*init) (void *des, dma_addr_t phy_addr, unsigned int size,
+		      unsigned int extend_desc);
+	unsigned int (*is_jumbo_frm) (int len, int ehn_desc);
+	int (*jumbo_frm)(void *priv, struct sk_buff *skb, int csum);
+	int (*set_16kib_bfsize)(int mtu);
+	void (*init_desc3)(struct dma_desc *p);
+	void (*refill_desc3) (void *priv, struct dma_desc *p);
+	void (*clean_desc3) (void *priv, struct dma_desc *p);
+};
+
+#define stmmac_mode_init(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mode, init, __args)
+#define stmmac_is_jumbo_frm(__priv, __args...) \
+	stmmac_do_callback(__priv, mode, is_jumbo_frm, __args)
+#define stmmac_jumbo_frm(__priv, __args...) \
+	stmmac_do_callback(__priv, mode, jumbo_frm, __args)
+#define stmmac_set_16kib_bfsize(__priv, __args...) \
+	stmmac_do_callback(__priv, mode, set_16kib_bfsize, __args)
+#define stmmac_init_desc3(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mode, init_desc3, __args)
+#define stmmac_refill_desc3(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mode, refill_desc3, __args)
+#define stmmac_clean_desc3(__priv, __args...) \
+	stmmac_do_void_callback(__priv, mode, clean_desc3, __args)
+
 #endif /* __STMMAC_HWIF_H__ */
diff --git a/drivers/net/ethernet/stmicro/stmmac/ring_mode.c b/drivers/net/ethernet/stmicro/stmmac/ring_mode.c
index 808ca758d0ae..a7ffc73fffe8 100644
--- a/drivers/net/ethernet/stmicro/stmmac/ring_mode.c
+++ b/drivers/net/ethernet/stmicro/stmmac/ring_mode.c
@@ -24,7 +24,7 @@
 
 #include "stmmac.h"
 
-static int stmmac_jumbo_frm(void *p, struct sk_buff *skb, int csum)
+static int jumbo_frm(void *p, struct sk_buff *skb, int csum)
 {
 	struct stmmac_tx_queue *tx_q = (struct stmmac_tx_queue *)p;
 	unsigned int nopaged_len = skb_headlen(skb);
@@ -99,7 +99,7 @@ static int stmmac_jumbo_frm(void *p, struct sk_buff *skb, int csum)
 	return entry;
 }
 
-static unsigned int stmmac_is_jumbo_frm(int len, int enh_desc)
+static unsigned int is_jumbo_frm(int len, int enh_desc)
 {
 	unsigned int ret = 0;
 
@@ -109,7 +109,7 @@ static unsigned int stmmac_is_jumbo_frm(int len, int enh_desc)
 	return ret;
 }
 
-static void stmmac_refill_desc3(void *priv_ptr, struct dma_desc *p)
+static void refill_desc3(void *priv_ptr, struct dma_desc *p)
 {
 	struct stmmac_priv *priv = (struct stmmac_priv *)priv_ptr;
 
@@ -119,12 +119,12 @@ static void stmmac_refill_desc3(void *priv_ptr, struct dma_desc *p)
 }
 
 /* In ring mode we need to fill the desc3 because it is used as buffer */
-static void stmmac_init_desc3(struct dma_desc *p)
+static void init_desc3(struct dma_desc *p)
 {
 	p->des3 = cpu_to_le32(le32_to_cpu(p->des2) + BUF_SIZE_8KiB);
 }
 
-static void stmmac_clean_desc3(void *priv_ptr, struct dma_desc *p)
+static void clean_desc3(void *priv_ptr, struct dma_desc *p)
 {
 	struct stmmac_tx_queue *tx_q = (struct stmmac_tx_queue *)priv_ptr;
 	struct stmmac_priv *priv = tx_q->priv_data;
@@ -137,7 +137,7 @@ static void stmmac_clean_desc3(void *priv_ptr, struct dma_desc *p)
 		p->des3 = 0;
 }
 
-static int stmmac_set_16kib_bfsize(int mtu)
+static int set_16kib_bfsize(int mtu)
 {
 	int ret = 0;
 	if (unlikely(mtu >= BUF_SIZE_8KiB))
@@ -146,10 +146,10 @@ static int stmmac_set_16kib_bfsize(int mtu)
 }
 
 const struct stmmac_mode_ops ring_mode_ops = {
-	.is_jumbo_frm = stmmac_is_jumbo_frm,
-	.jumbo_frm = stmmac_jumbo_frm,
-	.refill_desc3 = stmmac_refill_desc3,
-	.init_desc3 = stmmac_init_desc3,
-	.clean_desc3 = stmmac_clean_desc3,
-	.set_16kib_bfsize = stmmac_set_16kib_bfsize,
+	.is_jumbo_frm = is_jumbo_frm,
+	.jumbo_frm = jumbo_frm,
+	.refill_desc3 = refill_desc3,
+	.init_desc3 = init_desc3,
+	.clean_desc3 = clean_desc3,
+	.set_16kib_bfsize = set_16kib_bfsize,
 };
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 896a1e76f2ec..90363a8b15db 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -1161,9 +1161,8 @@ static int stmmac_init_rx_buffers(struct stmmac_priv *priv, struct dma_desc *p,
 	else
 		p->des2 = cpu_to_le32(rx_q->rx_skbuff_dma[i]);
 
-	if ((priv->hw->mode->init_desc3) &&
-	    (priv->dma_buf_sz == BUF_SIZE_16KiB))
-		priv->hw->mode->init_desc3(p);
+	if (priv->dma_buf_sz == BUF_SIZE_16KiB)
+		stmmac_init_desc3(priv, p);
 
 	return 0;
 }
@@ -1229,13 +1228,14 @@ static int init_dma_rx_desc_rings(struct net_device *dev, gfp_t flags)
 {
 	struct stmmac_priv *priv = netdev_priv(dev);
 	u32 rx_count = priv->plat->rx_queues_to_use;
-	unsigned int bfsize = 0;
 	int ret = -ENOMEM;
+	int bfsize = 0;
 	int queue;
 	int i;
 
-	if (priv->hw->mode->set_16kib_bfsize)
-		bfsize = priv->hw->mode->set_16kib_bfsize(dev->mtu);
+	bfsize = stmmac_set_16kib_bfsize(priv, dev->mtu);
+	if (bfsize < 0)
+		bfsize = 0;
 
 	if (bfsize < BUF_SIZE_16KiB)
 		bfsize = stmmac_set_bfsize(dev->mtu, priv->dma_buf_sz);
@@ -1279,13 +1279,11 @@ static int init_dma_rx_desc_rings(struct net_device *dev, gfp_t flags)
 		/* Setup the chained descriptor addresses */
 		if (priv->mode == STMMAC_CHAIN_MODE) {
 			if (priv->extend_desc)
-				priv->hw->mode->init(rx_q->dma_erx,
-						     rx_q->dma_rx_phy,
-						     DMA_RX_SIZE, 1);
+				stmmac_mode_init(priv, rx_q->dma_erx,
+						rx_q->dma_rx_phy, DMA_RX_SIZE, 1);
 			else
-				priv->hw->mode->init(rx_q->dma_rx,
-						     rx_q->dma_rx_phy,
-						     DMA_RX_SIZE, 0);
+				stmmac_mode_init(priv, rx_q->dma_rx,
+						rx_q->dma_rx_phy, DMA_RX_SIZE, 0);
 		}
 	}
 
@@ -1332,13 +1330,11 @@ static int init_dma_tx_desc_rings(struct net_device *dev)
 		/* Setup the chained descriptor addresses */
 		if (priv->mode == STMMAC_CHAIN_MODE) {
 			if (priv->extend_desc)
-				priv->hw->mode->init(tx_q->dma_etx,
-						     tx_q->dma_tx_phy,
-						     DMA_TX_SIZE, 1);
+				stmmac_mode_init(priv, tx_q->dma_etx,
+						tx_q->dma_tx_phy, DMA_TX_SIZE, 1);
 			else
-				priv->hw->mode->init(tx_q->dma_tx,
-						     tx_q->dma_tx_phy,
-						     DMA_TX_SIZE, 0);
+				stmmac_mode_init(priv, tx_q->dma_tx,
+						tx_q->dma_tx_phy, DMA_TX_SIZE, 0);
 		}
 
 		for (i = 0; i < DMA_TX_SIZE; i++) {
@@ -1886,8 +1882,7 @@ static void stmmac_tx_clean(struct stmmac_priv *priv, u32 queue)
 			tx_q->tx_skbuff_dma[entry].map_as_page = false;
 		}
 
-		if (priv->hw->mode->clean_desc3)
-			priv->hw->mode->clean_desc3(tx_q, p);
+		stmmac_clean_desc3(priv, tx_q, p);
 
 		tx_q->tx_skbuff_dma[entry].last_segment = false;
 		tx_q->tx_skbuff_dma[entry].is_jumbo = false;
@@ -3099,11 +3094,11 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 	enh_desc = priv->plat->enh_desc;
 	/* To program the descriptors according to the size of the frame */
 	if (enh_desc)
-		is_jumbo = priv->hw->mode->is_jumbo_frm(skb->len, enh_desc);
+		is_jumbo = stmmac_is_jumbo_frm(priv, skb->len, enh_desc);
 
 	if (unlikely(is_jumbo) && likely(priv->synopsys_id <
 					 DWMAC_CORE_4_00)) {
-		entry = priv->hw->mode->jumbo_frm(tx_q, skb, csum_insertion);
+		entry = stmmac_jumbo_frm(priv, tx_q, skb, csum_insertion);
 		if (unlikely(entry < 0))
 			goto dma_map_err;
 	}
@@ -3332,8 +3327,8 @@ static inline void stmmac_rx_refill(struct stmmac_priv *priv, u32 queue)
 			} else {
 				p->des2 = cpu_to_le32(rx_q->rx_skbuff_dma[entry]);
 			}
-			if (priv->hw->mode->refill_desc3)
-				priv->hw->mode->refill_desc3(rx_q, p);
+
+			stmmac_refill_desc3(priv, rx_q, p);
 
 			if (rx_q->rx_zeroc_thresh > 0)
 				rx_q->rx_zeroc_thresh--;
-- 
cgit v1.2.3-59-g8ed1b


From a5724fc3834643a975bd0db71f001ca65f4a8382 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Mon, 16 Apr 2018 21:37:13 +0200
Subject: PCI: Add two more values for PCIe Max_Read_Request_Size

This patch adds missing values for the max read request size.
E.g. network driver r8169 uses a value of 4K.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pci_regs.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 103ba797a8f3..83ade9b5cf95 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -506,6 +506,8 @@
 #define  PCI_EXP_DEVCTL_READRQ_256B  0x1000 /* 256 Bytes */
 #define  PCI_EXP_DEVCTL_READRQ_512B  0x2000 /* 512 Bytes */
 #define  PCI_EXP_DEVCTL_READRQ_1024B 0x3000 /* 1024 Bytes */
+#define  PCI_EXP_DEVCTL_READRQ_2048B 0x4000 /* 2048 Bytes */
+#define  PCI_EXP_DEVCTL_READRQ_4096B 0x5000 /* 4096 Bytes */
 #define  PCI_EXP_DEVCTL_BCR_FLR 0x8000  /* Bridge Configuration Retry / FLR */
 #define PCI_EXP_DEVSTA		10	/* Device Status */
 #define  PCI_EXP_DEVSTA_CED	0x0001	/* Correctable Error Detected */
-- 
cgit v1.2.3-59-g8ed1b


From 8d98aa39b8d497e4c3da8c3973456ff710b68693 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Mon, 16 Apr 2018 21:38:27 +0200
Subject: r8169: replace magic numbers with PCI MRRS constant

Replace magic number "0x5 << MAX_READ_REQUEST_SHIFT" with the
appropriate constant as defined in PCI core.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 39 ++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 604ae78381ae..7d4e68907055 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -84,7 +84,6 @@
    The RTL chips use a 64 element hash table based on the Ethernet CRC. */
 static const int multicast_filter_limit = 32;
 
-#define MAX_READ_REQUEST_SHIFT	12
 #define TX_DMA_BURST	7	/* Maximum PCI burst, '7' is unlimited */
 #define InterFrameGap	0x03	/* 3 means InterFrameGap = the shortest one */
 
@@ -5125,7 +5124,7 @@ static void r8168c_hw_jumbo_disable(struct rtl8169_private *tp)
 {
 	RTL_W8(tp, Config3, RTL_R8(tp, Config3) & ~Jumbo_En0);
 	RTL_W8(tp, Config4, RTL_R8(tp, Config4) & ~Jumbo_En1);
-	rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+	rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 }
 
 static void r8168dp_hw_jumbo_enable(struct rtl8169_private *tp)
@@ -5151,7 +5150,7 @@ static void r8168e_hw_jumbo_disable(struct rtl8169_private *tp)
 	RTL_W8(tp, MaxTxPacketSize, 0x0c);
 	RTL_W8(tp, Config3, RTL_R8(tp, Config3) & ~Jumbo_En0);
 	RTL_W8(tp, Config4, RTL_R8(tp, Config4) & ~0x01);
-	rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+	rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 }
 
 static void r8168b_0_hw_jumbo_enable(struct rtl8169_private *tp)
@@ -5163,7 +5162,7 @@ static void r8168b_0_hw_jumbo_enable(struct rtl8169_private *tp)
 static void r8168b_0_hw_jumbo_disable(struct rtl8169_private *tp)
 {
 	rtl_tx_performance_tweak(tp,
-		(0x5 << MAX_READ_REQUEST_SHIFT) | PCI_EXP_DEVCTL_NOSNOOP_EN);
+		PCI_EXP_DEVCTL_READRQ_4096B | PCI_EXP_DEVCTL_NOSNOOP_EN);
 }
 
 static void r8168b_1_hw_jumbo_enable(struct rtl8169_private *tp)
@@ -5736,7 +5735,7 @@ static void rtl_hw_start_8168bb(struct rtl8169_private *tp)
 	RTL_W16(tp, CPlusCmd, RTL_R16(tp, CPlusCmd) & ~R8168_CPCMD_QUIRK_MASK);
 
 	if (tp->dev->mtu <= ETH_DATA_LEN) {
-		rtl_tx_performance_tweak(tp, (0x5 << MAX_READ_REQUEST_SHIFT) |
+		rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B |
 					 PCI_EXP_DEVCTL_NOSNOOP_EN);
 	}
 }
@@ -5757,7 +5756,7 @@ static void __rtl_hw_start_8168cp(struct rtl8169_private *tp)
 	RTL_W8(tp, Config3, RTL_R8(tp, Config3) & ~Beacon_en);
 
 	if (tp->dev->mtu <= ETH_DATA_LEN)
-		rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+		rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	rtl_disable_clock_request(tp);
 
@@ -5788,7 +5787,7 @@ static void rtl_hw_start_8168cp_2(struct rtl8169_private *tp)
 	RTL_W8(tp, Config3, RTL_R8(tp, Config3) & ~Beacon_en);
 
 	if (tp->dev->mtu <= ETH_DATA_LEN)
-		rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+		rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	RTL_W16(tp, CPlusCmd, RTL_R16(tp, CPlusCmd) & ~R8168_CPCMD_QUIRK_MASK);
 }
@@ -5805,7 +5804,7 @@ static void rtl_hw_start_8168cp_3(struct rtl8169_private *tp)
 	RTL_W8(tp, MaxTxPacketSize, TxPacketMax);
 
 	if (tp->dev->mtu <= ETH_DATA_LEN)
-		rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+		rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	RTL_W16(tp, CPlusCmd, RTL_R16(tp, CPlusCmd) & ~R8168_CPCMD_QUIRK_MASK);
 }
@@ -5862,7 +5861,7 @@ static void rtl_hw_start_8168d(struct rtl8169_private *tp)
 	RTL_W8(tp, MaxTxPacketSize, TxPacketMax);
 
 	if (tp->dev->mtu <= ETH_DATA_LEN)
-		rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+		rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	RTL_W16(tp, CPlusCmd, RTL_R16(tp, CPlusCmd) & ~R8168_CPCMD_QUIRK_MASK);
 }
@@ -5872,7 +5871,7 @@ static void rtl_hw_start_8168dp(struct rtl8169_private *tp)
 	rtl_csi_access_enable_1(tp);
 
 	if (tp->dev->mtu <= ETH_DATA_LEN)
-		rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+		rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	RTL_W8(tp, MaxTxPacketSize, TxPacketMax);
 
@@ -5889,7 +5888,7 @@ static void rtl_hw_start_8168d_4(struct rtl8169_private *tp)
 
 	rtl_csi_access_enable_1(tp);
 
-	rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+	rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	RTL_W8(tp, MaxTxPacketSize, TxPacketMax);
 
@@ -5921,7 +5920,7 @@ static void rtl_hw_start_8168e_1(struct rtl8169_private *tp)
 	rtl_ephy_init(tp, e_info_8168e_1, ARRAY_SIZE(e_info_8168e_1));
 
 	if (tp->dev->mtu <= ETH_DATA_LEN)
-		rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+		rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	RTL_W8(tp, MaxTxPacketSize, TxPacketMax);
 
@@ -5946,7 +5945,7 @@ static void rtl_hw_start_8168e_2(struct rtl8169_private *tp)
 	rtl_ephy_init(tp, e_info_8168e_2, ARRAY_SIZE(e_info_8168e_2));
 
 	if (tp->dev->mtu <= ETH_DATA_LEN)
-		rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+		rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	rtl_eri_write(tp, 0xc0, ERIAR_MASK_0011, 0x0000, ERIAR_EXGMAC);
 	rtl_eri_write(tp, 0xb8, ERIAR_MASK_0011, 0x0000, ERIAR_EXGMAC);
@@ -5976,7 +5975,7 @@ static void rtl_hw_start_8168f(struct rtl8169_private *tp)
 {
 	rtl_csi_access_enable_2(tp);
 
-	rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+	rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	rtl_eri_write(tp, 0xc0, ERIAR_MASK_0011, 0x0000, ERIAR_EXGMAC);
 	rtl_eri_write(tp, 0xb8, ERIAR_MASK_0011, 0x0000, ERIAR_EXGMAC);
@@ -6047,7 +6046,7 @@ static void rtl_hw_start_8168g(struct rtl8169_private *tp)
 
 	rtl_csi_access_enable_1(tp);
 
-	rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+	rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	rtl_w0w1_eri(tp, 0xdc, ERIAR_MASK_0001, 0x00, 0x01, ERIAR_EXGMAC);
 	rtl_w0w1_eri(tp, 0xdc, ERIAR_MASK_0001, 0x01, 0x00, ERIAR_EXGMAC);
@@ -6147,7 +6146,7 @@ static void rtl_hw_start_8168h_1(struct rtl8169_private *tp)
 
 	rtl_csi_access_enable_1(tp);
 
-	rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+	rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	rtl_w0w1_eri(tp, 0xdc, ERIAR_MASK_0001, 0x00, 0x01, ERIAR_EXGMAC);
 	rtl_w0w1_eri(tp, 0xdc, ERIAR_MASK_0001, 0x01, 0x00, ERIAR_EXGMAC);
@@ -6229,7 +6228,7 @@ static void rtl_hw_start_8168ep(struct rtl8169_private *tp)
 
 	rtl_csi_access_enable_1(tp);
 
-	rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+	rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	rtl_w0w1_eri(tp, 0xdc, ERIAR_MASK_0001, 0x00, 0x01, ERIAR_EXGMAC);
 	rtl_w0w1_eri(tp, 0xdc, ERIAR_MASK_0001, 0x01, 0x00, ERIAR_EXGMAC);
@@ -6495,7 +6494,7 @@ static void rtl_hw_start_8102e_1(struct rtl8169_private *tp)
 
 	RTL_W8(tp, DBG_REG, FIX_NAK_1);
 
-	rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+	rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	RTL_W8(tp, Config1,
 	       LEDS1 | LEDS0 | Speed_down | MEMMAP | IOMAP | VPD | PMEnable);
@@ -6512,7 +6511,7 @@ static void rtl_hw_start_8102e_2(struct rtl8169_private *tp)
 {
 	rtl_csi_access_enable_2(tp);
 
-	rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+	rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	RTL_W8(tp, Config1, MEMMAP | IOMAP | VPD | PMEnable);
 	RTL_W8(tp, Config3, RTL_R8(tp, Config3) & ~Beacon_en);
@@ -6575,7 +6574,7 @@ static void rtl_hw_start_8402(struct rtl8169_private *tp)
 
 	rtl_ephy_init(tp, e_info_8402, ARRAY_SIZE(e_info_8402));
 
-	rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+	rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	rtl_eri_write(tp, 0xc8, ERIAR_MASK_1111, 0x00000002, ERIAR_EXGMAC);
 	rtl_eri_write(tp, 0xe8, ERIAR_MASK_1111, 0x00000006, ERIAR_EXGMAC);
-- 
cgit v1.2.3-59-g8ed1b


From ef53e9e14714de2ce26eaae0244c07c426064d69 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Mon, 16 Apr 2018 15:07:13 -0700
Subject: net: Remove unused tcp_set_state tracepoint

This tracepoint was replaced by inet_sock_set_state in 563e0bb and not
used anywhere in the kernel anymore. Remove it.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/tcp.h | 47 ----------------------------------------------
 1 file changed, 47 deletions(-)

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 878b2be7ce77..3dd68029d77a 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -166,53 +166,6 @@ DEFINE_EVENT(tcp_event_sk, tcp_destroy_sock,
 	TP_ARGS(sk)
 );
 
-TRACE_EVENT(tcp_set_state,
-
-	TP_PROTO(const struct sock *sk, const int oldstate, const int newstate),
-
-	TP_ARGS(sk, oldstate, newstate),
-
-	TP_STRUCT__entry(
-		__field(const void *, skaddr)
-		__field(int, oldstate)
-		__field(int, newstate)
-		__field(__u16, sport)
-		__field(__u16, dport)
-		__array(__u8, saddr, 4)
-		__array(__u8, daddr, 4)
-		__array(__u8, saddr_v6, 16)
-		__array(__u8, daddr_v6, 16)
-	),
-
-	TP_fast_assign(
-		struct inet_sock *inet = inet_sk(sk);
-		__be32 *p32;
-
-		__entry->skaddr = sk;
-		__entry->oldstate = oldstate;
-		__entry->newstate = newstate;
-
-		__entry->sport = ntohs(inet->inet_sport);
-		__entry->dport = ntohs(inet->inet_dport);
-
-		p32 = (__be32 *) __entry->saddr;
-		*p32 = inet->inet_saddr;
-
-		p32 = (__be32 *) __entry->daddr;
-		*p32 =  inet->inet_daddr;
-
-		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
-			       sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
-	),
-
-	TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s",
-		  __entry->sport, __entry->dport,
-		  __entry->saddr, __entry->daddr,
-		  __entry->saddr_v6, __entry->daddr_v6,
-		  show_tcp_state_name(__entry->oldstate),
-		  show_tcp_state_name(__entry->newstate))
-);
-
 TRACE_EVENT(tcp_retransmit_synack,
 
 	TP_PROTO(const struct sock *sk, const struct request_sock *req),
-- 
cgit v1.2.3-59-g8ed1b


From 5e596ee171ba99f0cfb885f256c6275390e63271 Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@oracle.com>
Date: Thu, 12 Apr 2018 15:59:59 -0700
Subject: selftests: add xfrm state-policy-monitor to rtnetlink.sh

Add a simple set of tests for the IPsec xfrm commands.

Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 tools/testing/selftests/net/rtnetlink.sh | 103 +++++++++++++++++++++++++++++++
 1 file changed, 103 insertions(+)

diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh
index e6f485235435..760faef2e12e 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -502,6 +502,108 @@ kci_test_macsec()
 	echo "PASS: macsec"
 }
 
+#-------------------------------------------------------------------
+# Example commands
+#   ip x s add proto esp src 14.0.0.52 dst 14.0.0.70 \
+#            spi 0x07 mode transport reqid 0x07 replay-window 32 \
+#            aead 'rfc4106(gcm(aes))' 1234567890123456dcba 128 \
+#            sel src 14.0.0.52/24 dst 14.0.0.70/24
+#   ip x p add dir out src 14.0.0.52/24 dst 14.0.0.70/24 \
+#            tmpl proto esp src 14.0.0.52 dst 14.0.0.70 \
+#            spi 0x07 mode transport reqid 0x07
+#
+# Subcommands not tested
+#    ip x s update
+#    ip x s allocspi
+#    ip x s deleteall
+#    ip x p update
+#    ip x p deleteall
+#    ip x p set
+#-------------------------------------------------------------------
+kci_test_ipsec()
+{
+	srcip="14.0.0.52"
+	dstip="14.0.0.70"
+	algo="aead rfc4106(gcm(aes)) 0x3132333435363738393031323334353664636261 128"
+
+	# flush to be sure there's nothing configured
+	ip x s flush ; ip x p flush
+	check_err $?
+
+	# start the monitor in the background
+	tmpfile=`mktemp ipsectestXXX`
+	ip x m > $tmpfile &
+	mpid=$!
+	sleep 0.2
+
+	ipsecid="proto esp src $srcip dst $dstip spi 0x07"
+	ip x s add $ipsecid \
+            mode transport reqid 0x07 replay-window 32 \
+            $algo sel src $srcip/24 dst $dstip/24
+	check_err $?
+
+	lines=`ip x s list | grep $srcip | grep $dstip | wc -l`
+	test $lines -eq 2
+	check_err $?
+
+	ip x s count | grep -q "SAD count 1"
+	check_err $?
+
+	lines=`ip x s get $ipsecid | grep $srcip | grep $dstip | wc -l`
+	test $lines -eq 2
+	check_err $?
+
+	ip x s delete $ipsecid
+	check_err $?
+
+	lines=`ip x s list | wc -l`
+	test $lines -eq 0
+	check_err $?
+
+	ipsecsel="dir out src $srcip/24 dst $dstip/24"
+	ip x p add $ipsecsel \
+		    tmpl proto esp src $srcip dst $dstip \
+		    spi 0x07 mode transport reqid 0x07
+	check_err $?
+
+	lines=`ip x p list | grep $srcip | grep $dstip | wc -l`
+	test $lines -eq 2
+	check_err $?
+
+	ip x p count | grep -q "SPD IN  0 OUT 1 FWD 0"
+	check_err $?
+
+	lines=`ip x p get $ipsecsel | grep $srcip | grep $dstip | wc -l`
+	test $lines -eq 2
+	check_err $?
+
+	ip x p delete $ipsecsel
+	check_err $?
+
+	lines=`ip x p list | wc -l`
+	test $lines -eq 0
+	check_err $?
+
+	# check the monitor results
+	kill $mpid
+	lines=`wc -l $tmpfile | cut "-d " -f1`
+	test $lines -eq 20
+	check_err $?
+	rm -rf $tmpfile
+
+	# clean up any leftovers
+	ip x s flush
+	check_err $?
+	ip x p flush
+	check_err $?
+
+	if [ $ret -ne 0 ]; then
+		echo "FAIL: ipsec"
+		return 1
+	fi
+	echo "PASS: ipsec"
+}
+
 kci_test_gretap()
 {
 	testns="testns"
@@ -755,6 +857,7 @@ kci_test_rtnl()
 	kci_test_vrf
 	kci_test_encap
 	kci_test_macsec
+	kci_test_ipsec
 
 	kci_del_dummy
 }
-- 
cgit v1.2.3-59-g8ed1b


From 897ddc24835ac9e267d70f1a77e75d30a0a636e9 Mon Sep 17 00:00:00 2001
From: Intiyaz Basha <intiyaz.basha@cavium.com>
Date: Mon, 16 Apr 2018 23:30:53 -0700
Subject: liquidio: Enhanced ethtool stats

1. Added red_drops stats. Inbound packets dropped by RED, buffer exhaustion
2. Included fcs_err, jabber_err, l2_err and frame_err errors under
   rx_errors
3. Included fifo_err, dmac_drop, red_drops, fw_err_pko, fw_err_link and
   fw_err_drop under rx_dropped
4. Included max_collision_fail, max_deferral_fail, total_collisions,
   fw_err_pko, fw_err_link, fw_err_drop and fw_err_pki under tx_dropped
5. Counting dma mapping errors
6. Added some firmware stats description and removed for some

Signed-off-by: Intiyaz Basha <intiyaz.basha@cavium.com>
Acked-by: Derek Chickles <derek.chickles@cavium.com>
Acked-by: Satanand Burla <satananda.burla@cavium.com>
Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/cavium/liquidio/lio_ethtool.c | 54 ++++++++++------
 drivers/net/ethernet/cavium/liquidio/lio_main.c    |  2 +
 .../net/ethernet/cavium/liquidio/liquidio_common.h | 75 ++++++++++++++--------
 drivers/net/ethernet/cavium/liquidio/octeon_iq.h   |  4 +-
 4 files changed, 86 insertions(+), 49 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
index 550ac29682a5..9926a12dd805 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
@@ -96,11 +96,9 @@ static const char oct_stats_strings[][ETH_GSTRING_LEN] = {
 	"tx_packets",
 	"rx_bytes",
 	"tx_bytes",
-	"rx_errors",	/*jabber_err+l2_err+frame_err */
-	"tx_errors",	/*fw_err_pko+fw_err_link+fw_err_drop */
-	"rx_dropped",   /*st->fromwire.total_rcvd - st->fromwire.fw_total_rcvd +
-			 *st->fromwire.dmac_drop + st->fromwire.fw_err_drop
-			 */
+	"rx_errors",
+	"tx_errors",
+	"rx_dropped",
 	"tx_dropped",
 
 	"tx_total_sent",
@@ -119,7 +117,7 @@ static const char oct_stats_strings[][ETH_GSTRING_LEN] = {
 	"mac_tx_total_bytes",
 	"mac_tx_mcast_pkts",
 	"mac_tx_bcast_pkts",
-	"mac_tx_ctl_packets",	/*oct->link_stats.fromhost.ctl_sent */
+	"mac_tx_ctl_packets",
 	"mac_tx_total_collisions",
 	"mac_tx_one_collision",
 	"mac_tx_multi_collison",
@@ -170,17 +168,17 @@ static const char oct_vf_stats_strings[][ETH_GSTRING_LEN] = {
 	"tx_packets",
 	"rx_bytes",
 	"tx_bytes",
-	"rx_errors", /* jabber_err + l2_err+frame_err */
-	"tx_errors", /* fw_err_pko + fw_err_link+fw_err_drop */
-	"rx_dropped", /* total_rcvd - fw_total_rcvd + dmac_drop + fw_err_drop */
+	"rx_errors",
+	"tx_errors",
+	"rx_dropped",
 	"tx_dropped",
 	"link_state_changes",
 };
 
 /* statistics of host tx queue */
 static const char oct_iq_stats_strings[][ETH_GSTRING_LEN] = {
-	"packets",		/*oct->instr_queue[iq_no]->stats.tx_done*/
-	"bytes",		/*oct->instr_queue[iq_no]->stats.tx_tot_bytes*/
+	"packets",
+	"bytes",
 	"dropped",
 	"iq_busy",
 	"sgentry_sent",
@@ -197,13 +195,9 @@ static const char oct_iq_stats_strings[][ETH_GSTRING_LEN] = {
 
 /* statistics of host rx queue */
 static const char oct_droq_stats_strings[][ETH_GSTRING_LEN] = {
-	"packets",		/*oct->droq[oq_no]->stats.rx_pkts_received */
-	"bytes",		/*oct->droq[oq_no]->stats.rx_bytes_received */
-	"dropped",		/*oct->droq[oq_no]->stats.rx_dropped+
-				 *oct->droq[oq_no]->stats.dropped_nodispatch+
-				 *oct->droq[oq_no]->stats.dropped_toomany+
-				 *oct->droq[oq_no]->stats.dropped_nomem
-				 */
+	"packets",
+	"bytes",
+	"dropped",
 	"dropped_nomem",
 	"dropped_toomany",
 	"fw_dropped",
@@ -1080,16 +1074,33 @@ lio_get_ethtool_stats(struct net_device *netdev,
 	data[i++] = CVM_CAST64(netstats->rx_bytes);
 	/*sum of oct->instr_queue[iq_no]->stats.tx_tot_bytes */
 	data[i++] = CVM_CAST64(netstats->tx_bytes);
-	data[i++] = CVM_CAST64(netstats->rx_errors);
+	data[i++] = CVM_CAST64(netstats->rx_errors +
+			       oct_dev->link_stats.fromwire.fcs_err +
+			       oct_dev->link_stats.fromwire.jabber_err +
+			       oct_dev->link_stats.fromwire.l2_err +
+			       oct_dev->link_stats.fromwire.frame_err);
 	data[i++] = CVM_CAST64(netstats->tx_errors);
 	/*sum of oct->droq[oq_no]->stats->rx_dropped +
 	 *oct->droq[oq_no]->stats->dropped_nodispatch +
 	 *oct->droq[oq_no]->stats->dropped_toomany +
 	 *oct->droq[oq_no]->stats->dropped_nomem
 	 */
-	data[i++] = CVM_CAST64(netstats->rx_dropped);
+	data[i++] = CVM_CAST64(netstats->rx_dropped +
+			       oct_dev->link_stats.fromwire.fifo_err +
+			       oct_dev->link_stats.fromwire.dmac_drop +
+			       oct_dev->link_stats.fromwire.red_drops +
+			       oct_dev->link_stats.fromwire.fw_err_pko +
+			       oct_dev->link_stats.fromwire.fw_err_link +
+			       oct_dev->link_stats.fromwire.fw_err_drop);
 	/*sum of oct->instr_queue[iq_no]->stats.tx_dropped */
-	data[i++] = CVM_CAST64(netstats->tx_dropped);
+	data[i++] = CVM_CAST64(netstats->tx_dropped +
+			       oct_dev->link_stats.fromhost.max_collision_fail +
+			       oct_dev->link_stats.fromhost.max_deferral_fail +
+			       oct_dev->link_stats.fromhost.total_collisions +
+			       oct_dev->link_stats.fromhost.fw_err_pko +
+			       oct_dev->link_stats.fromhost.fw_err_link +
+			       oct_dev->link_stats.fromhost.fw_err_drop +
+			       oct_dev->link_stats.fromhost.fw_err_pki);
 
 	/* firmware tx stats */
 	/*per_core_stats[cvmx_get_core_num()].link_stats[mdata->from_ifidx].
@@ -1798,6 +1809,7 @@ octnet_nic_stats_callback(struct octeon_device *oct_dev,
 		rstats->jabber_err = rsp_rstats->jabber_err;
 		rstats->l2_err    = rsp_rstats->l2_err;
 		rstats->frame_err = rsp_rstats->frame_err;
+		rstats->red_drops = rsp_rstats->red_drops;
 
 		/* RX firmware stats */
 		rstats->fw_total_rcvd = rsp_rstats->fw_total_rcvd;
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index 603a144d3d9c..44c2a0c1bbdd 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -2585,6 +2585,7 @@ static int liquidio_xmit(struct sk_buff *skb, struct net_device *netdev)
 		if (dma_mapping_error(&oct->pci_dev->dev, dptr)) {
 			dev_err(&oct->pci_dev->dev, "%s DMA mapping error 1\n",
 				__func__);
+			stats->tx_dmamap_fail++;
 			return NETDEV_TX_BUSY;
 		}
 
@@ -2624,6 +2625,7 @@ static int liquidio_xmit(struct sk_buff *skb, struct net_device *netdev)
 		if (dma_mapping_error(&oct->pci_dev->dev, g->sg[0].ptr[0])) {
 			dev_err(&oct->pci_dev->dev, "%s DMA mapping error 2\n",
 				__func__);
+			stats->tx_dmamap_fail++;
 			return NETDEV_TX_BUSY;
 		}
 		add_sg_size(&g->sg[0], (skb->len - skb->data_len), 0);
diff --git a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
index 75eea83c7cc6..34a94daca590 100644
--- a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
+++ b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
@@ -779,18 +779,24 @@ struct liquidio_if_cfg_info {
 /** Stats for each NIC port in RX direction. */
 struct nic_rx_stats {
 	/* link-level stats */
-	u64 total_rcvd;
-	u64 bytes_rcvd;
-	u64 total_bcst;
-	u64 total_mcst;
-	u64 runts;
-	u64 ctl_rcvd;
-	u64 fifo_err;      /* Accounts for over/under-run of buffers */
-	u64 dmac_drop;
-	u64 fcs_err;
-	u64 jabber_err;
-	u64 l2_err;
-	u64 frame_err;
+	u64 total_rcvd;		/* Received packets */
+	u64 bytes_rcvd;		/* Octets of received packets */
+	u64 total_bcst;		/* Number of non-dropped L2 broadcast packets */
+	u64 total_mcst;		/* Number of non-dropped L2 multicast packets */
+	u64 runts;		/* Packets shorter than allowed */
+	u64 ctl_rcvd;		/* Received PAUSE packets */
+	u64 fifo_err;		/* Packets dropped due to RX FIFO full */
+	u64 dmac_drop;		/* Packets dropped by the DMAC filter */
+	u64 fcs_err;		/* Sum of fragment, overrun, and FCS errors */
+	u64 jabber_err;		/* Packets larger than allowed */
+	u64 l2_err;		/* Sum of DMA, parity, PCAM access, no memory,
+				 * buffer overflow, malformed L2 header or
+				 * length, oversize errors
+				 **/
+	u64 frame_err;		/* Sum of IPv4 and L4 checksum errors */
+	u64 red_drops;		/* Packets dropped by RED due to buffer
+				 * exhaustion
+				 **/
 
 	/* firmware stats */
 	u64 fw_total_rcvd;
@@ -806,11 +812,11 @@ struct nic_rx_stats {
 	u64 fw_lro_pkts;   /* Number of packets that are LROed      */
 	u64 fw_lro_octs;   /* Number of octets that are LROed       */
 	u64 fw_total_lro;  /* Number of LRO packets formed          */
-	u64 fw_lro_aborts; /* Number of times lRO of packet aborted */
+	u64 fw_lro_aborts; /* Number of times LRO of packet aborted */
 	u64 fw_lro_aborts_port;
 	u64 fw_lro_aborts_seq;
 	u64 fw_lro_aborts_tsval;
-	u64 fw_lro_aborts_timer;
+	u64 fw_lro_aborts_timer;	/* Timer setting error */
 	/* intrmod: packet forward rate */
 	u64 fwd_rate;
 };
@@ -818,18 +824,35 @@ struct nic_rx_stats {
 /** Stats for each NIC port in RX direction. */
 struct nic_tx_stats {
 	/* link-level stats */
-	u64 total_pkts_sent;
-	u64 total_bytes_sent;
-	u64 mcast_pkts_sent;
-	u64 bcast_pkts_sent;
-	u64 ctl_sent;
-	u64 one_collision_sent;   /* Packets sent after one collision*/
-	u64 multi_collision_sent; /* Packets sent after multiple collision*/
-	u64 max_collision_fail;   /* Packets not sent due to max collisions */
-	u64 max_deferral_fail;   /* Packets not sent due to max deferrals */
-	u64 fifo_err;       /* Accounts for over/under-run of buffers */
-	u64 runts;
-	u64 total_collisions; /* Total number of collisions detected */
+	u64 total_pkts_sent;		/* Total frames sent on the interface */
+	u64 total_bytes_sent;		/* Total octets sent on the interface */
+	u64 mcast_pkts_sent;		/* Packets sent to the multicast DMAC */
+	u64 bcast_pkts_sent;		/* Packets sent to a broadcast DMAC */
+	u64 ctl_sent;			/* Control/PAUSE packets sent */
+	u64 one_collision_sent;		/* Packets sent that experienced a
+					 * single collision before successful
+					 * transmission
+					 **/
+	u64 multi_collision_sent;	/* Packets sent that experienced
+					 * multiple collisions before successful
+					 * transmission
+					 **/
+	u64 max_collision_fail;		/* Packets dropped due to excessive
+					 * collisions
+					 **/
+	u64 max_deferral_fail;		/* Packets not sent due to max
+					 * deferrals
+					 **/
+	u64 fifo_err;			/* Packets sent that experienced a
+					 * transmit underflow and were
+					 * truncated
+					 **/
+	u64 runts;			/* Packets sent with an octet count
+					 * lessthan 64
+					 **/
+	u64 total_collisions;		/* Packets dropped due to excessive
+					 * collisions
+					 **/
 
 	/* firmware stats */
 	u64 fw_total_sent;
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_iq.h b/drivers/net/ethernet/cavium/liquidio/octeon_iq.h
index 81c987682941..5fed7b63223e 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_iq.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_iq.h
@@ -62,8 +62,8 @@ struct oct_iq_stats {
 	u64 tx_tot_bytes;/**< Total count of bytes sento to network. */
 	u64 tx_gso;  /* count of tso */
 	u64 tx_vxlan; /* tunnel */
-	u64 tx_dmamap_fail;
-	u64 tx_restart;
+	u64 tx_dmamap_fail; /* Number of times dma mapping failed */
+	u64 tx_restart; /* Number of times this queue restarted */
 };
 
 #define OCT_IQ_STATS_SIZE   (sizeof(struct oct_iq_stats))
-- 
cgit v1.2.3-59-g8ed1b


From 5168d73201898c581d0503434ee9743cfc8f964b Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 17 Apr 2018 16:45:21 +0200
Subject: mlx5: basic XDP_REDIRECT forward support

This implements basic XDP redirect support in mlx5 driver.

Notice that the ndo_xdp_xmit() is NOT implemented, because that API
need some changes that this patchset is working towards.

The main purpose of this patch is have different drivers doing
XDP_REDIRECT to show how different memory models behave in a cross
driver world.

Update(pre-RFCv2 Tariq): Need to DMA unmap page before xdp_do_redirect,
as the return API does not exist yet to to keep this mapped.

Update(pre-RFCv3 Saeed): Don't mix XDP_TX and XDP_REDIRECT flushing,
introduce xdpsq.db.redirect_flush boolian.

V9: Adjust for commit 121e89275471 ("net/mlx5e: Refactor RQ XDP_TX indication")

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h    |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 27 ++++++++++++++++++++++---
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 30cad07be2b5..1a05d1072c5e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -392,6 +392,7 @@ struct mlx5e_xdpsq {
 	struct {
 		struct mlx5e_dma_info     *di;
 		bool                       doorbell;
+		bool                       redirect_flush;
 	} db;
 
 	/* read only */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 176645762e49..0e24be05907f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -236,14 +236,20 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
 	return 0;
 }
 
+static void mlx5e_page_dma_unmap(struct mlx5e_rq *rq,
+					struct mlx5e_dma_info *dma_info)
+{
+	dma_unmap_page(rq->pdev, dma_info->addr, RQ_PAGE_SIZE(rq),
+		       rq->buff.map_dir);
+}
+
 void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
 			bool recycle)
 {
 	if (likely(recycle) && mlx5e_rx_cache_put(rq, dma_info))
 		return;
 
-	dma_unmap_page(rq->pdev, dma_info->addr, RQ_PAGE_SIZE(rq),
-		       rq->buff.map_dir);
+	mlx5e_page_dma_unmap(rq, dma_info);
 	put_page(dma_info->page);
 }
 
@@ -800,9 +806,10 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq,
 				   struct mlx5e_dma_info *di,
 				   void *va, u16 *rx_headroom, u32 *len)
 {
-	const struct bpf_prog *prog = READ_ONCE(rq->xdp_prog);
+	struct bpf_prog *prog = READ_ONCE(rq->xdp_prog);
 	struct xdp_buff xdp;
 	u32 act;
+	int err;
 
 	if (!prog)
 		return false;
@@ -823,6 +830,15 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq,
 		if (unlikely(!mlx5e_xmit_xdp_frame(rq, di, &xdp)))
 			trace_xdp_exception(rq->netdev, prog, act);
 		return true;
+	case XDP_REDIRECT:
+		/* When XDP enabled then page-refcnt==1 here */
+		err = xdp_do_redirect(rq->netdev, &xdp, prog);
+		if (!err) {
+			__set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags);
+			rq->xdpsq.db.redirect_flush = true;
+			mlx5e_page_dma_unmap(rq, di);
+		}
+		return true;
 	default:
 		bpf_warn_invalid_xdp_action(act);
 	case XDP_ABORTED:
@@ -1140,6 +1156,11 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
 		xdpsq->db.doorbell = false;
 	}
 
+	if (xdpsq->db.redirect_flush) {
+		xdp_do_flush_map();
+		xdpsq->db.redirect_flush = false;
+	}
+
 	mlx5_cqwq_update_db_record(&cq->wq);
 
 	/* ensure cq space is freed before enabling more cqes */
-- 
cgit v1.2.3-59-g8ed1b


From 5ab073ffd326480a6185d096e9703f62ef92b86c Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 17 Apr 2018 16:45:26 +0200
Subject: xdp: introduce xdp_return_frame API and use in cpumap

Introduce an xdp_return_frame API, and convert over cpumap as
the first user, given it have queued XDP frame structure to leverage.

V3: Cleanup and remove C99 style comments, pointed out by Alex Duyck.
V6: Remove comment that id will be added later (Req by Alex Duyck)
V8: Rename enum mem_type to xdp_mem_type (found by kbuild test robot)

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xdp.h   | 27 ++++++++++++++++++++++++
 kernel/bpf/cpumap.c | 60 ++++++++++++++++++++++++++++++++---------------------
 net/core/xdp.c      | 18 ++++++++++++++++
 3 files changed, 81 insertions(+), 24 deletions(-)

diff --git a/include/net/xdp.h b/include/net/xdp.h
index b2362ddfa694..e4207699c410 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -33,16 +33,43 @@
  * also mandatory during RX-ring setup.
  */
 
+enum xdp_mem_type {
+	MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */
+	MEM_TYPE_PAGE_ORDER0,     /* Orig XDP full page model */
+	MEM_TYPE_MAX,
+};
+
+struct xdp_mem_info {
+	u32 type; /* enum xdp_mem_type, but known size type */
+};
+
 struct xdp_rxq_info {
 	struct net_device *dev;
 	u32 queue_index;
 	u32 reg_state;
+	struct xdp_mem_info mem;
 } ____cacheline_aligned; /* perf critical, avoid false-sharing */
 
+
+static inline
+void xdp_return_frame(void *data, struct xdp_mem_info *mem)
+{
+	if (mem->type == MEM_TYPE_PAGE_SHARED)
+		page_frag_free(data);
+
+	if (mem->type == MEM_TYPE_PAGE_ORDER0) {
+		struct page *page = virt_to_page(data); /* Assumes order0 page*/
+
+		put_page(page);
+	}
+}
+
 int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
 		     struct net_device *dev, u32 queue_index);
 void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq);
 void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq);
 bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq);
+int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
+			       enum xdp_mem_type type, void *allocator);
 
 #endif /* __LINUX_NET_XDP_H__ */
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index a4bb0b34375a..3e4bbcbe3e86 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -19,6 +19,7 @@
 #include <linux/bpf.h>
 #include <linux/filter.h>
 #include <linux/ptr_ring.h>
+#include <net/xdp.h>
 
 #include <linux/sched.h>
 #include <linux/workqueue.h>
@@ -137,27 +138,6 @@ free_cmap:
 	return ERR_PTR(err);
 }
 
-static void __cpu_map_queue_destructor(void *ptr)
-{
-	/* The tear-down procedure should have made sure that queue is
-	 * empty.  See __cpu_map_entry_replace() and work-queue
-	 * invoked cpu_map_kthread_stop(). Catch any broken behaviour
-	 * gracefully and warn once.
-	 */
-	if (WARN_ON_ONCE(ptr))
-		page_frag_free(ptr);
-}
-
-static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
-{
-	if (atomic_dec_and_test(&rcpu->refcnt)) {
-		/* The queue should be empty at this point */
-		ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor);
-		kfree(rcpu->queue);
-		kfree(rcpu);
-	}
-}
-
 static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
 {
 	atomic_inc(&rcpu->refcnt);
@@ -188,6 +168,10 @@ struct xdp_pkt {
 	u16 len;
 	u16 headroom;
 	u16 metasize;
+	/* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time,
+	 * while mem info is valid on remote CPU.
+	 */
+	struct xdp_mem_info mem;
 	struct net_device *dev_rx;
 };
 
@@ -213,6 +197,9 @@ static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
 	xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);
 	xdp_pkt->metasize = metasize;
 
+	/* rxq only valid until napi_schedule ends, convert to xdp_mem_info */
+	xdp_pkt->mem = xdp->rxq->mem;
+
 	return xdp_pkt;
 }
 
@@ -265,6 +252,31 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
 	return skb;
 }
 
+static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
+{
+	/* The tear-down procedure should have made sure that queue is
+	 * empty.  See __cpu_map_entry_replace() and work-queue
+	 * invoked cpu_map_kthread_stop(). Catch any broken behaviour
+	 * gracefully and warn once.
+	 */
+	struct xdp_pkt *xdp_pkt;
+
+	while ((xdp_pkt = ptr_ring_consume(ring)))
+		if (WARN_ON_ONCE(xdp_pkt))
+			xdp_return_frame(xdp_pkt, &xdp_pkt->mem);
+}
+
+static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+	if (atomic_dec_and_test(&rcpu->refcnt)) {
+		/* The queue should be empty at this point */
+		__cpu_map_ring_cleanup(rcpu->queue);
+		ptr_ring_cleanup(rcpu->queue, NULL);
+		kfree(rcpu->queue);
+		kfree(rcpu);
+	}
+}
+
 static int cpu_map_kthread_run(void *data)
 {
 	struct bpf_cpu_map_entry *rcpu = data;
@@ -307,7 +319,7 @@ static int cpu_map_kthread_run(void *data)
 
 			skb = cpu_map_build_skb(rcpu, xdp_pkt);
 			if (!skb) {
-				page_frag_free(xdp_pkt);
+				xdp_return_frame(xdp_pkt, &xdp_pkt->mem);
 				continue;
 			}
 
@@ -604,13 +616,13 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 	spin_lock(&q->producer_lock);
 
 	for (i = 0; i < bq->count; i++) {
-		void *xdp_pkt = bq->q[i];
+		struct xdp_pkt *xdp_pkt = bq->q[i];
 		int err;
 
 		err = __ptr_ring_produce(q, xdp_pkt);
 		if (err) {
 			drops++;
-			page_frag_free(xdp_pkt); /* Free xdp_pkt */
+			xdp_return_frame(xdp_pkt->data, &xdp_pkt->mem);
 		}
 		processed++;
 	}
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 097a0f74e004..7e6b3545277d 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -71,3 +71,21 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq)
 	return (xdp_rxq->reg_state == REG_STATE_REGISTERED);
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg);
+
+int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
+			       enum xdp_mem_type type, void *allocator)
+{
+	if (type >= MEM_TYPE_MAX)
+		return -EINVAL;
+
+	xdp_rxq->mem.type = type;
+
+	if (allocator)
+		return -EOPNOTSUPP;
+
+	/* TODO: Allocate an ID that maps to allocator pointer
+	 * See: https://www.kernel.org/doc/html/latest/core-api/idr.html
+	 */
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
-- 
cgit v1.2.3-59-g8ed1b


From 189ead81a83eba5f5c5ce56c45620e51abcb5cb8 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 17 Apr 2018 16:45:32 +0200
Subject: ixgbe: use xdp_return_frame API

Extend struct ixgbe_tx_buffer to store the xdp_mem_info.

Notice that this could be optimized further by putting this into
a union in the struct ixgbe_tx_buffer, but this patchset
works towards removing this again.  Thus, this is not done.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h      | 1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 6 ++++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 4f08c712e58e..abb5248e917e 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -250,6 +250,7 @@ struct ixgbe_tx_buffer {
 	DEFINE_DMA_UNMAP_ADDR(dma);
 	DEFINE_DMA_UNMAP_LEN(len);
 	u32 tx_flags;
+	struct xdp_mem_info xdp_mem;
 };
 
 struct ixgbe_rx_buffer {
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index afadba99f7b8..0bfe6cf2bf8b 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1216,7 +1216,7 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
 
 		/* free the skb */
 		if (ring_is_xdp(tx_ring))
-			page_frag_free(tx_buffer->data);
+			xdp_return_frame(tx_buffer->data, &tx_buffer->xdp_mem);
 		else
 			napi_consume_skb(tx_buffer->skb, napi_budget);
 
@@ -5797,7 +5797,7 @@ static void ixgbe_clean_tx_ring(struct ixgbe_ring *tx_ring)
 
 		/* Free all the Tx ring sk_buffs */
 		if (ring_is_xdp(tx_ring))
-			page_frag_free(tx_buffer->data);
+			xdp_return_frame(tx_buffer->data, &tx_buffer->xdp_mem);
 		else
 			dev_kfree_skb_any(tx_buffer->skb);
 
@@ -8366,6 +8366,8 @@ static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
 	dma_unmap_len_set(tx_buffer, len, len);
 	dma_unmap_addr_set(tx_buffer, dma, dma);
 	tx_buffer->data = xdp->data;
+	tx_buffer->xdp_mem = xdp->rxq->mem;
+
 	tx_desc->read.buffer_addr = cpu_to_le64(dma);
 
 	/* put descriptor type bits */
-- 
cgit v1.2.3-59-g8ed1b


From 106ca27f2922e8de820d1bd3d79b1cbdf2d78eea Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 17 Apr 2018 16:45:37 +0200
Subject: xdp: move struct xdp_buff from filter.h to xdp.h

This is done to prepare for the next patch, and it is also
nice to move this XDP related struct out of filter.h.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h | 24 +-----------------------
 include/net/xdp.h      | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index fc4e8f91b03d..4da8b2308174 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -30,6 +30,7 @@ struct sock;
 struct seccomp_data;
 struct bpf_prog_aux;
 struct xdp_rxq_info;
+struct xdp_buff;
 
 /* ArgX, context and stack frame pointer register positions. Note,
  * Arg1, Arg2, Arg3, etc are used as argument mappings of function
@@ -500,14 +501,6 @@ struct bpf_skb_data_end {
 	void *data_end;
 };
 
-struct xdp_buff {
-	void *data;
-	void *data_end;
-	void *data_meta;
-	void *data_hard_start;
-	struct xdp_rxq_info *rxq;
-};
-
 struct sk_msg_buff {
 	void *data;
 	void *data_end;
@@ -772,21 +765,6 @@ int xdp_do_redirect(struct net_device *dev,
 		    struct bpf_prog *prog);
 void xdp_do_flush_map(void);
 
-/* Drivers not supporting XDP metadata can use this helper, which
- * rejects any room expansion for metadata as a result.
- */
-static __always_inline void
-xdp_set_data_meta_invalid(struct xdp_buff *xdp)
-{
-	xdp->data_meta = xdp->data + 1;
-}
-
-static __always_inline bool
-xdp_data_meta_unsupported(const struct xdp_buff *xdp)
-{
-	return unlikely(xdp->data_meta > xdp->data);
-}
-
 void bpf_warn_invalid_xdp_action(u32 act);
 
 struct sock *do_sk_redirect_map(struct sk_buff *skb);
diff --git a/include/net/xdp.h b/include/net/xdp.h
index e4207699c410..15f8ade008b5 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -50,6 +50,13 @@ struct xdp_rxq_info {
 	struct xdp_mem_info mem;
 } ____cacheline_aligned; /* perf critical, avoid false-sharing */
 
+struct xdp_buff {
+	void *data;
+	void *data_end;
+	void *data_meta;
+	void *data_hard_start;
+	struct xdp_rxq_info *rxq;
+};
 
 static inline
 void xdp_return_frame(void *data, struct xdp_mem_info *mem)
@@ -72,4 +79,19 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq);
 int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
 			       enum xdp_mem_type type, void *allocator);
 
+/* Drivers not supporting XDP metadata can use this helper, which
+ * rejects any room expansion for metadata as a result.
+ */
+static __always_inline void
+xdp_set_data_meta_invalid(struct xdp_buff *xdp)
+{
+	xdp->data_meta = xdp->data + 1;
+}
+
+static __always_inline bool
+xdp_data_meta_unsupported(const struct xdp_buff *xdp)
+{
+	return unlikely(xdp->data_meta > xdp->data);
+}
+
 #endif /* __LINUX_NET_XDP_H__ */
-- 
cgit v1.2.3-59-g8ed1b


From c0048cff8abb69c956ce1277d17a3f7a14e41522 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 17 Apr 2018 16:45:42 +0200
Subject: xdp: introduce a new xdp_frame type

This is needed to convert drivers tuntap and virtio_net.

This is a generalization of what is done inside cpumap, which will be
converted later.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xdp.h | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 15f8ade008b5..756c42811e78 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -58,6 +58,46 @@ struct xdp_buff {
 	struct xdp_rxq_info *rxq;
 };
 
+struct xdp_frame {
+	void *data;
+	u16 len;
+	u16 headroom;
+	u16 metasize;
+	/* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time,
+	 * while mem info is valid on remote CPU.
+	 */
+	struct xdp_mem_info mem;
+};
+
+/* Convert xdp_buff to xdp_frame */
+static inline
+struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
+{
+	struct xdp_frame *xdp_frame;
+	int metasize;
+	int headroom;
+
+	/* Assure headroom is available for storing info */
+	headroom = xdp->data - xdp->data_hard_start;
+	metasize = xdp->data - xdp->data_meta;
+	metasize = metasize > 0 ? metasize : 0;
+	if (unlikely((headroom - metasize) < sizeof(*xdp_frame)))
+		return NULL;
+
+	/* Store info in top of packet */
+	xdp_frame = xdp->data_hard_start;
+
+	xdp_frame->data = xdp->data;
+	xdp_frame->len  = xdp->data_end - xdp->data;
+	xdp_frame->headroom = headroom - sizeof(*xdp_frame);
+	xdp_frame->metasize = metasize;
+
+	/* rxq only valid until napi_schedule ends, convert to xdp_mem_info */
+	xdp_frame->mem = xdp->rxq->mem;
+
+	return xdp_frame;
+}
+
 static inline
 void xdp_return_frame(void *data, struct xdp_mem_info *mem)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 1ffcbc8537d0bc32aaca7000cb9c904ec4b6300f Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 17 Apr 2018 16:45:47 +0200
Subject: tun: convert to use generic xdp_frame and xdp_return_frame API

The tuntap driver invented it's own driver specific way of queuing
XDP packets, by storing the xdp_buff information in the top of
the XDP frame data.

Convert it over to use the more generic xdp_frame structure.  The
main problem with the in-driver method is that the xdp_rxq_info pointer
cannot be trused/used when dequeueing the frame.

V3: Remove check based on feedback from Jason

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c      | 43 ++++++++++++++++++++-----------------------
 drivers/vhost/net.c    |  7 ++++---
 include/linux/if_tun.h |  4 ++--
 3 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 28583aa0c17d..2c85e5cac2a9 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -248,11 +248,11 @@ struct veth {
 	__be16 h_vlan_TCI;
 };
 
-bool tun_is_xdp_buff(void *ptr)
+bool tun_is_xdp_frame(void *ptr)
 {
 	return (unsigned long)ptr & TUN_XDP_FLAG;
 }
-EXPORT_SYMBOL(tun_is_xdp_buff);
+EXPORT_SYMBOL(tun_is_xdp_frame);
 
 void *tun_xdp_to_ptr(void *ptr)
 {
@@ -660,10 +660,10 @@ void tun_ptr_free(void *ptr)
 {
 	if (!ptr)
 		return;
-	if (tun_is_xdp_buff(ptr)) {
-		struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
+	if (tun_is_xdp_frame(ptr)) {
+		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
-		put_page(virt_to_head_page(xdp->data));
+		xdp_return_frame(xdpf->data, &xdpf->mem);
 	} else {
 		__skb_array_destroy_skb(ptr);
 	}
@@ -1298,17 +1298,14 @@ static const struct net_device_ops tun_netdev_ops = {
 static int tun_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
 {
 	struct tun_struct *tun = netdev_priv(dev);
-	struct xdp_buff *buff = xdp->data_hard_start;
-	int headroom = xdp->data - xdp->data_hard_start;
+	struct xdp_frame *frame;
 	struct tun_file *tfile;
 	u32 numqueues;
 	int ret = 0;
 
-	/* Assure headroom is available and buff is properly aligned */
-	if (unlikely(headroom < sizeof(*xdp) || tun_is_xdp_buff(xdp)))
-		return -ENOSPC;
-
-	*buff = *xdp;
+	frame = convert_to_xdp_frame(xdp);
+	if (unlikely(!frame))
+		return -EOVERFLOW;
 
 	rcu_read_lock();
 
@@ -1323,7 +1320,7 @@ static int tun_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
 	/* Encode the XDP flag into lowest bit for consumer to differ
 	 * XDP buffer from sk_buff.
 	 */
-	if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(buff))) {
+	if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(frame))) {
 		this_cpu_inc(tun->pcpu_stats->tx_dropped);
 		ret = -ENOSPC;
 	}
@@ -2001,11 +1998,11 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 static ssize_t tun_put_user_xdp(struct tun_struct *tun,
 				struct tun_file *tfile,
-				struct xdp_buff *xdp,
+				struct xdp_frame *xdp_frame,
 				struct iov_iter *iter)
 {
 	int vnet_hdr_sz = 0;
-	size_t size = xdp->data_end - xdp->data;
+	size_t size = xdp_frame->len;
 	struct tun_pcpu_stats *stats;
 	size_t ret;
 
@@ -2021,7 +2018,7 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun,
 		iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
 	}
 
-	ret = copy_to_iter(xdp->data, size, iter) + vnet_hdr_sz;
+	ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz;
 
 	stats = get_cpu_ptr(tun->pcpu_stats);
 	u64_stats_update_begin(&stats->syncp);
@@ -2189,11 +2186,11 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 			return err;
 	}
 
-	if (tun_is_xdp_buff(ptr)) {
-		struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
+	if (tun_is_xdp_frame(ptr)) {
+		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
-		ret = tun_put_user_xdp(tun, tfile, xdp, to);
-		put_page(virt_to_head_page(xdp->data));
+		ret = tun_put_user_xdp(tun, tfile, xdpf, to);
+		xdp_return_frame(xdpf->data, &xdpf->mem);
 	} else {
 		struct sk_buff *skb = ptr;
 
@@ -2432,10 +2429,10 @@ out_free:
 static int tun_ptr_peek_len(void *ptr)
 {
 	if (likely(ptr)) {
-		if (tun_is_xdp_buff(ptr)) {
-			struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
+		if (tun_is_xdp_frame(ptr)) {
+			struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
-			return xdp->data_end - xdp->data;
+			return xdpf->len;
 		}
 		return __skb_array_len_with_tag(ptr);
 	} else {
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 986058a57917..bbf38befefb2 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -32,6 +32,7 @@
 #include <linux/skbuff.h>
 
 #include <net/sock.h>
+#include <net/xdp.h>
 
 #include "vhost.h"
 
@@ -181,10 +182,10 @@ static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq)
 
 static int vhost_net_buf_peek_len(void *ptr)
 {
-	if (tun_is_xdp_buff(ptr)) {
-		struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
+	if (tun_is_xdp_frame(ptr)) {
+		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
-		return xdp->data_end - xdp->data;
+		return xdpf->len;
 	}
 
 	return __skb_array_len_with_tag(ptr);
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index fd00170b494f..3d2996dc7d85 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -22,7 +22,7 @@
 #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
 struct socket *tun_get_socket(struct file *);
 struct ptr_ring *tun_get_tx_ring(struct file *file);
-bool tun_is_xdp_buff(void *ptr);
+bool tun_is_xdp_frame(void *ptr);
 void *tun_xdp_to_ptr(void *ptr);
 void *tun_ptr_to_xdp(void *ptr);
 void tun_ptr_free(void *ptr);
@@ -39,7 +39,7 @@ static inline struct ptr_ring *tun_get_tx_ring(struct file *f)
 {
 	return ERR_PTR(-EINVAL);
 }
-static inline bool tun_is_xdp_buff(void *ptr)
+static inline bool tun_is_xdp_frame(void *ptr)
 {
 	return false;
 }
-- 
cgit v1.2.3-59-g8ed1b


From cac320c850efb25480cd0f71383b84ec61c0e138 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 17 Apr 2018 16:45:52 +0200
Subject: virtio_net: convert to use generic xdp_frame and xdp_return_frame API

The virtio_net driver assumes XDP frames are always released based on
page refcnt (via put_page).  Thus, is only queues the XDP data pointer
address and uses virt_to_head_page() to retrieve struct page.

Use the XDP return API to get away from such assumptions. Instead
queue an xdp_frame, which allow us to use the xdp_return_frame API,
when releasing the frame.

V8: Avoid endianness issues (found by kbuild test robot)
V9: Change __virtnet_xdp_xmit from bool to int return value (found by Dan Carpenter)

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c | 54 ++++++++++++++++++++++++++----------------------
 1 file changed, 29 insertions(+), 25 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 7b187ec7411e..f50e1ad81ad4 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -415,38 +415,48 @@ static void virtnet_xdp_flush(struct net_device *dev)
 	virtqueue_kick(sq->vq);
 }
 
-static bool __virtnet_xdp_xmit(struct virtnet_info *vi,
-			       struct xdp_buff *xdp)
+static int __virtnet_xdp_xmit(struct virtnet_info *vi,
+			      struct xdp_buff *xdp)
 {
 	struct virtio_net_hdr_mrg_rxbuf *hdr;
-	unsigned int len;
+	struct xdp_frame *xdpf, *xdpf_sent;
 	struct send_queue *sq;
+	unsigned int len;
 	unsigned int qp;
-	void *xdp_sent;
 	int err;
 
 	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
 	sq = &vi->sq[qp];
 
 	/* Free up any pending old buffers before queueing new ones. */
-	while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
-		struct page *sent_page = virt_to_head_page(xdp_sent);
+	while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
+		xdp_return_frame(xdpf_sent->data, &xdpf_sent->mem);
 
-		put_page(sent_page);
-	}
+	xdpf = convert_to_xdp_frame(xdp);
+	if (unlikely(!xdpf))
+		return -EOVERFLOW;
+
+	/* virtqueue want to use data area in-front of packet */
+	if (unlikely(xdpf->metasize > 0))
+		return -EOPNOTSUPP;
 
-	xdp->data -= vi->hdr_len;
+	if (unlikely(xdpf->headroom < vi->hdr_len))
+		return -EOVERFLOW;
+
+	/* Make room for virtqueue hdr (also change xdpf->headroom?) */
+	xdpf->data -= vi->hdr_len;
 	/* Zero header and leave csum up to XDP layers */
-	hdr = xdp->data;
+	hdr = xdpf->data;
 	memset(hdr, 0, vi->hdr_len);
+	xdpf->len   += vi->hdr_len;
 
-	sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
+	sg_init_one(sq->sg, xdpf->data, xdpf->len);
 
-	err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdp->data, GFP_ATOMIC);
+	err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdpf, GFP_ATOMIC);
 	if (unlikely(err))
-		return false; /* Caller handle free/refcnt */
+		return -ENOSPC; /* Caller handle free/refcnt */
 
-	return true;
+	return 0;
 }
 
 static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
@@ -454,7 +464,6 @@ static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
 	struct virtnet_info *vi = netdev_priv(dev);
 	struct receive_queue *rq = vi->rq;
 	struct bpf_prog *xdp_prog;
-	bool sent;
 
 	/* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
 	 * indicate XDP resources have been successfully allocated.
@@ -463,10 +472,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
 	if (!xdp_prog)
 		return -ENXIO;
 
-	sent = __virtnet_xdp_xmit(vi, xdp);
-	if (!sent)
-		return -ENOSPC;
-	return 0;
+	return __virtnet_xdp_xmit(vi, xdp);
 }
 
 static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
@@ -555,7 +561,6 @@ static struct sk_buff *receive_small(struct net_device *dev,
 	struct page *page = virt_to_head_page(buf);
 	unsigned int delta = 0;
 	struct page *xdp_page;
-	bool sent;
 	int err;
 
 	len -= vi->hdr_len;
@@ -606,8 +611,8 @@ static struct sk_buff *receive_small(struct net_device *dev,
 			delta = orig_data - xdp.data;
 			break;
 		case XDP_TX:
-			sent = __virtnet_xdp_xmit(vi, &xdp);
-			if (unlikely(!sent)) {
+			err = __virtnet_xdp_xmit(vi, &xdp);
+			if (unlikely(err)) {
 				trace_xdp_exception(vi->dev, xdp_prog, act);
 				goto err_xdp;
 			}
@@ -690,7 +695,6 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 	struct bpf_prog *xdp_prog;
 	unsigned int truesize;
 	unsigned int headroom = mergeable_ctx_to_headroom(ctx);
-	bool sent;
 	int err;
 
 	head_skb = NULL;
@@ -762,8 +766,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			}
 			break;
 		case XDP_TX:
-			sent = __virtnet_xdp_xmit(vi, &xdp);
-			if (unlikely(!sent)) {
+			err = __virtnet_xdp_xmit(vi, &xdp);
+			if (unlikely(err)) {
 				trace_xdp_exception(vi->dev, xdp_prog, act);
 				if (unlikely(xdp_page != page))
 					put_page(xdp_page);
-- 
cgit v1.2.3-59-g8ed1b


From 70280ed91cb8acb43e8fd7a8094840846c172ac5 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 17 Apr 2018 16:45:57 +0200
Subject: bpf: cpumap convert to use generic xdp_frame

The generic xdp_frame format, was inspired by the cpumap own internal
xdp_pkt format.  It is now time to convert it over to the generic
xdp_frame format.  The cpumap needs one extra field dev_rx.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xdp.h   |   1 +
 kernel/bpf/cpumap.c | 100 +++++++++++++++-------------------------------------
 2 files changed, 29 insertions(+), 72 deletions(-)

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 756c42811e78..ea3773f94f65 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -67,6 +67,7 @@ struct xdp_frame {
 	 * while mem info is valid on remote CPU.
 	 */
 	struct xdp_mem_info mem;
+	struct net_device *dev_rx; /* used by cpumap */
 };
 
 /* Convert xdp_buff to xdp_frame */
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 3e4bbcbe3e86..bcdc4dea5ce7 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -159,52 +159,8 @@ static void cpu_map_kthread_stop(struct work_struct *work)
 	kthread_stop(rcpu->kthread);
 }
 
-/* For now, xdp_pkt is a cpumap internal data structure, with info
- * carried between enqueue to dequeue. It is mapped into the top
- * headroom of the packet, to avoid allocating separate mem.
- */
-struct xdp_pkt {
-	void *data;
-	u16 len;
-	u16 headroom;
-	u16 metasize;
-	/* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time,
-	 * while mem info is valid on remote CPU.
-	 */
-	struct xdp_mem_info mem;
-	struct net_device *dev_rx;
-};
-
-/* Convert xdp_buff to xdp_pkt */
-static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
-{
-	struct xdp_pkt *xdp_pkt;
-	int metasize;
-	int headroom;
-
-	/* Assure headroom is available for storing info */
-	headroom = xdp->data - xdp->data_hard_start;
-	metasize = xdp->data - xdp->data_meta;
-	metasize = metasize > 0 ? metasize : 0;
-	if (unlikely((headroom - metasize) < sizeof(*xdp_pkt)))
-		return NULL;
-
-	/* Store info in top of packet */
-	xdp_pkt = xdp->data_hard_start;
-
-	xdp_pkt->data = xdp->data;
-	xdp_pkt->len  = xdp->data_end - xdp->data;
-	xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);
-	xdp_pkt->metasize = metasize;
-
-	/* rxq only valid until napi_schedule ends, convert to xdp_mem_info */
-	xdp_pkt->mem = xdp->rxq->mem;
-
-	return xdp_pkt;
-}
-
 static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
-					 struct xdp_pkt *xdp_pkt)
+					 struct xdp_frame *xdpf)
 {
 	unsigned int frame_size;
 	void *pkt_data_start;
@@ -219,7 +175,7 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
 	 * would be preferred to set frame_size to 2048 or 4096
 	 * depending on the driver.
 	 *   frame_size = 2048;
-	 *   frame_len  = frame_size - sizeof(*xdp_pkt);
+	 *   frame_len  = frame_size - sizeof(*xdp_frame);
 	 *
 	 * Instead, with info avail, skb_shared_info in placed after
 	 * packet len.  This, unfortunately fakes the truesize.
@@ -227,21 +183,21 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
 	 * is not at a fixed memory location, with mixed length
 	 * packets, which is bad for cache-line hotness.
 	 */
-	frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom +
+	frame_size = SKB_DATA_ALIGN(xdpf->len) + xdpf->headroom +
 		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 
-	pkt_data_start = xdp_pkt->data - xdp_pkt->headroom;
+	pkt_data_start = xdpf->data - xdpf->headroom;
 	skb = build_skb(pkt_data_start, frame_size);
 	if (!skb)
 		return NULL;
 
-	skb_reserve(skb, xdp_pkt->headroom);
-	__skb_put(skb, xdp_pkt->len);
-	if (xdp_pkt->metasize)
-		skb_metadata_set(skb, xdp_pkt->metasize);
+	skb_reserve(skb, xdpf->headroom);
+	__skb_put(skb, xdpf->len);
+	if (xdpf->metasize)
+		skb_metadata_set(skb, xdpf->metasize);
 
 	/* Essential SKB info: protocol and skb->dev */
-	skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx);
+	skb->protocol = eth_type_trans(skb, xdpf->dev_rx);
 
 	/* Optional SKB info, currently missing:
 	 * - HW checksum info		(skb->ip_summed)
@@ -259,11 +215,11 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
 	 * invoked cpu_map_kthread_stop(). Catch any broken behaviour
 	 * gracefully and warn once.
 	 */
-	struct xdp_pkt *xdp_pkt;
+	struct xdp_frame *xdpf;
 
-	while ((xdp_pkt = ptr_ring_consume(ring)))
-		if (WARN_ON_ONCE(xdp_pkt))
-			xdp_return_frame(xdp_pkt, &xdp_pkt->mem);
+	while ((xdpf = ptr_ring_consume(ring)))
+		if (WARN_ON_ONCE(xdpf))
+			xdp_return_frame(xdpf->data, &xdpf->mem);
 }
 
 static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
@@ -290,7 +246,7 @@ static int cpu_map_kthread_run(void *data)
 	 */
 	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
 		unsigned int processed = 0, drops = 0, sched = 0;
-		struct xdp_pkt *xdp_pkt;
+		struct xdp_frame *xdpf;
 
 		/* Release CPU reschedule checks */
 		if (__ptr_ring_empty(rcpu->queue)) {
@@ -313,13 +269,13 @@ static int cpu_map_kthread_run(void *data)
 		 * kthread CPU pinned. Lockless access to ptr_ring
 		 * consume side valid as no-resize allowed of queue.
 		 */
-		while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) {
+		while ((xdpf = __ptr_ring_consume(rcpu->queue))) {
 			struct sk_buff *skb;
 			int ret;
 
-			skb = cpu_map_build_skb(rcpu, xdp_pkt);
+			skb = cpu_map_build_skb(rcpu, xdpf);
 			if (!skb) {
-				xdp_return_frame(xdp_pkt, &xdp_pkt->mem);
+				xdp_return_frame(xdpf->data, &xdpf->mem);
 				continue;
 			}
 
@@ -616,13 +572,13 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 	spin_lock(&q->producer_lock);
 
 	for (i = 0; i < bq->count; i++) {
-		struct xdp_pkt *xdp_pkt = bq->q[i];
+		struct xdp_frame *xdpf = bq->q[i];
 		int err;
 
-		err = __ptr_ring_produce(q, xdp_pkt);
+		err = __ptr_ring_produce(q, xdpf);
 		if (err) {
 			drops++;
-			xdp_return_frame(xdp_pkt->data, &xdp_pkt->mem);
+			xdp_return_frame(xdpf->data, &xdpf->mem);
 		}
 		processed++;
 	}
@@ -637,7 +593,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 /* Runs under RCU-read-side, plus in softirq under NAPI protection.
  * Thus, safe percpu variable access.
  */
-static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
+static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
 {
 	struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
 
@@ -648,28 +604,28 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
 	 * driver to code invoking us to finished, due to driver
 	 * (e.g. ixgbe) recycle tricks based on page-refcnt.
 	 *
-	 * Thus, incoming xdp_pkt is always queued here (else we race
+	 * Thus, incoming xdp_frame is always queued here (else we race
 	 * with another CPU on page-refcnt and remaining driver code).
 	 * Queue time is very short, as driver will invoke flush
 	 * operation, when completing napi->poll call.
 	 */
-	bq->q[bq->count++] = xdp_pkt;
+	bq->q[bq->count++] = xdpf;
 	return 0;
 }
 
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
 		    struct net_device *dev_rx)
 {
-	struct xdp_pkt *xdp_pkt;
+	struct xdp_frame *xdpf;
 
-	xdp_pkt = convert_to_xdp_pkt(xdp);
-	if (unlikely(!xdp_pkt))
+	xdpf = convert_to_xdp_frame(xdp);
+	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
 	/* Info needed when constructing SKB on remote CPU */
-	xdp_pkt->dev_rx = dev_rx;
+	xdpf->dev_rx = dev_rx;
 
-	bq_enqueue(rcpu, xdp_pkt);
+	bq_enqueue(rcpu, xdpf);
 	return 0;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From b411ef11020d012790839a5414040283d9335386 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 17 Apr 2018 16:46:02 +0200
Subject: i40e: convert to use generic xdp_frame and xdp_return_frame API

Also convert driver i40e, which very recently got XDP_REDIRECT support
in commit d9314c474d4f ("i40e: add support for XDP_REDIRECT").

V7: This patch got added in V7 of this patchset.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c | 20 +++++++++++++++-----
 drivers/net/ethernet/intel/i40e/i40e_txrx.h |  1 +
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index f174c72480ab..96c54cbfb1f9 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -638,7 +638,8 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
 		if (tx_buffer->tx_flags & I40E_TX_FLAGS_FD_SB)
 			kfree(tx_buffer->raw_buf);
 		else if (ring_is_xdp(ring))
-			page_frag_free(tx_buffer->raw_buf);
+			xdp_return_frame(tx_buffer->xdpf->data,
+					 &tx_buffer->xdpf->mem);
 		else
 			dev_kfree_skb_any(tx_buffer->skb);
 		if (dma_unmap_len(tx_buffer, len))
@@ -841,7 +842,7 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
 
 		/* free the skb/XDP data */
 		if (ring_is_xdp(tx_ring))
-			page_frag_free(tx_buf->raw_buf);
+			xdp_return_frame(tx_buf->xdpf->data, &tx_buf->xdpf->mem);
 		else
 			napi_consume_skb(tx_buf->skb, napi_budget);
 
@@ -2225,6 +2226,8 @@ static struct sk_buff *i40e_run_xdp(struct i40e_ring *rx_ring,
 	if (!xdp_prog)
 		goto xdp_out;
 
+	prefetchw(xdp->data_hard_start); /* xdp_frame write */
+
 	act = bpf_prog_run_xdp(xdp_prog, xdp);
 	switch (act) {
 	case XDP_PASS:
@@ -3481,25 +3484,32 @@ dma_error:
 static int i40e_xmit_xdp_ring(struct xdp_buff *xdp,
 			      struct i40e_ring *xdp_ring)
 {
-	u32 size = xdp->data_end - xdp->data;
 	u16 i = xdp_ring->next_to_use;
 	struct i40e_tx_buffer *tx_bi;
 	struct i40e_tx_desc *tx_desc;
+	struct xdp_frame *xdpf;
 	dma_addr_t dma;
+	u32 size;
+
+	xdpf = convert_to_xdp_frame(xdp);
+	if (unlikely(!xdpf))
+		return I40E_XDP_CONSUMED;
+
+	size = xdpf->len;
 
 	if (!unlikely(I40E_DESC_UNUSED(xdp_ring))) {
 		xdp_ring->tx_stats.tx_busy++;
 		return I40E_XDP_CONSUMED;
 	}
 
-	dma = dma_map_single(xdp_ring->dev, xdp->data, size, DMA_TO_DEVICE);
+	dma = dma_map_single(xdp_ring->dev, xdpf->data, size, DMA_TO_DEVICE);
 	if (dma_mapping_error(xdp_ring->dev, dma))
 		return I40E_XDP_CONSUMED;
 
 	tx_bi = &xdp_ring->tx_bi[i];
 	tx_bi->bytecount = size;
 	tx_bi->gso_segs = 1;
-	tx_bi->raw_buf = xdp->data;
+	tx_bi->xdpf = xdpf;
 
 	/* record length, and DMA address */
 	dma_unmap_len_set(tx_bi, len, size);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index 3043483ec426..857b1d743c8d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -306,6 +306,7 @@ static inline unsigned int i40e_txd_use_count(unsigned int size)
 struct i40e_tx_buffer {
 	struct i40e_tx_desc *next_to_watch;
 	union {
+		struct xdp_frame *xdpf;
 		struct sk_buff *skb;
 		void *raw_buf;
 	};
-- 
cgit v1.2.3-59-g8ed1b


From 84f5e3fb790547860fbbe7e0ef87104d4922dd1a Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 17 Apr 2018 16:46:07 +0200
Subject: mlx5: register a memory model when XDP is enabled

Now all the users of ndo_xdp_xmit have been converted to use xdp_return_frame.
This enable a different memory model, thus activating another code path
in the xdp_return_frame API.

V2: Fixed issues pointed out by Tariq.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index b29c1d93f058..2dca0933dfd3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -512,6 +512,14 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 		rq->mkey_be = c->mkey_be;
 	}
 
+	/* This must only be activate for order-0 pages */
+	if (rq->xdp_prog) {
+		err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
+						 MEM_TYPE_PAGE_ORDER0, NULL);
+		if (err)
+			goto err_rq_wq_destroy;
+	}
+
 	for (i = 0; i < wq_sz; i++) {
 		struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
 
-- 
cgit v1.2.3-59-g8ed1b


From 8d5d88527587516bd58ff0f3810f07c38e65e2be Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 17 Apr 2018 16:46:12 +0200
Subject: xdp: rhashtable with allocator ID to pointer mapping

Use the IDA infrastructure for getting a cyclic increasing ID number,
that is used for keeping track of each registered allocator per
RX-queue xdp_rxq_info.  Instead of using the IDR infrastructure, which
uses a radix tree, use a dynamic rhashtable, for creating ID to
pointer lookup table, because this is faster.

The problem that is being solved here is that, the xdp_rxq_info
pointer (stored in xdp_buff) cannot be used directly, as the
guaranteed lifetime is too short.  The info is needed on a
(potentially) remote CPU during DMA-TX completion time . In an
xdp_frame the xdp_mem_info is stored, when it got converted from an
xdp_buff, which is sufficient for the simple page refcnt based recycle
schemes.

For more advanced allocators there is a need to store a pointer to the
registered allocator.  Thus, there is a need to guard the lifetime or
validity of the allocator pointer, which is done through this
rhashtable ID map to pointer. The removal and validity of of the
allocator and helper struct xdp_mem_allocator is guarded by RCU.  The
allocator will be created by the driver, and registered with
xdp_rxq_info_reg_mem_model().

It is up-to debate who is responsible for freeing the allocator
pointer or invoking the allocator destructor function.  In any case,
this must happen via RCU freeing.

Use the IDA infrastructure for getting a cyclic increasing ID number,
that is used for keeping track of each registered allocator per
RX-queue xdp_rxq_info.

V4: Per req of Jason Wang
- Use xdp_rxq_info_reg_mem_model() in all drivers implementing
  XDP_REDIRECT, even-though it's not strictly necessary when
  allocator==NULL for type MEM_TYPE_PAGE_SHARED (given it's zero).

V6: Per req of Alex Duyck
- Introduce rhashtable_lookup() call in later patch

V8: Address sparse should be static warnings (from kbuild test robot)

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   9 +-
 drivers/net/tun.c                             |   6 +
 drivers/net/virtio_net.c                      |   7 +
 include/net/xdp.h                             |  14 +-
 net/core/xdp.c                                | 223 +++++++++++++++++++++++++-
 5 files changed, 241 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 0bfe6cf2bf8b..f10904ec2172 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -6370,7 +6370,7 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter,
 	struct device *dev = rx_ring->dev;
 	int orig_node = dev_to_node(dev);
 	int ring_node = -1;
-	int size;
+	int size, err;
 
 	size = sizeof(struct ixgbe_rx_buffer) * rx_ring->count;
 
@@ -6407,6 +6407,13 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter,
 			     rx_ring->queue_index) < 0)
 		goto err;
 
+	err = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq,
+					 MEM_TYPE_PAGE_SHARED, NULL);
+	if (err) {
+		xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
+		goto err;
+	}
+
 	rx_ring->xdp_prog = adapter->xdp_prog;
 
 	return 0;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 2c85e5cac2a9..283bde85c455 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -854,6 +854,12 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
 				       tun->dev, tfile->queue_index);
 		if (err < 0)
 			goto out;
+		err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq,
+						 MEM_TYPE_PAGE_SHARED, NULL);
+		if (err < 0) {
+			xdp_rxq_info_unreg(&tfile->xdp_rxq);
+			goto out;
+		}
 		err = 0;
 	}
 
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index f50e1ad81ad4..42d338fe9a8d 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1305,6 +1305,13 @@ static int virtnet_open(struct net_device *dev)
 		if (err < 0)
 			return err;
 
+		err = xdp_rxq_info_reg_mem_model(&vi->rq[i].xdp_rxq,
+						 MEM_TYPE_PAGE_SHARED, NULL);
+		if (err < 0) {
+			xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
+			return err;
+		}
+
 		virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
 		virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
 	}
diff --git a/include/net/xdp.h b/include/net/xdp.h
index ea3773f94f65..5f67c62540aa 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -41,6 +41,7 @@ enum xdp_mem_type {
 
 struct xdp_mem_info {
 	u32 type; /* enum xdp_mem_type, but known size type */
+	u32 id;
 };
 
 struct xdp_rxq_info {
@@ -99,18 +100,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
 	return xdp_frame;
 }
 
-static inline
-void xdp_return_frame(void *data, struct xdp_mem_info *mem)
-{
-	if (mem->type == MEM_TYPE_PAGE_SHARED)
-		page_frag_free(data);
-
-	if (mem->type == MEM_TYPE_PAGE_ORDER0) {
-		struct page *page = virt_to_page(data); /* Assumes order0 page*/
-
-		put_page(page);
-	}
-}
+void xdp_return_frame(void *data, struct xdp_mem_info *mem);
 
 int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
 		     struct net_device *dev, u32 queue_index);
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 7e6b3545277d..8b2cb79b5de0 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -5,6 +5,9 @@
  */
 #include <linux/types.h>
 #include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/rhashtable.h>
 
 #include <net/xdp.h>
 
@@ -13,6 +16,99 @@
 #define REG_STATE_UNREGISTERED	0x2
 #define REG_STATE_UNUSED	0x3
 
+static DEFINE_IDA(mem_id_pool);
+static DEFINE_MUTEX(mem_id_lock);
+#define MEM_ID_MAX 0xFFFE
+#define MEM_ID_MIN 1
+static int mem_id_next = MEM_ID_MIN;
+
+static bool mem_id_init; /* false */
+static struct rhashtable *mem_id_ht;
+
+struct xdp_mem_allocator {
+	struct xdp_mem_info mem;
+	void *allocator;
+	struct rhash_head node;
+	struct rcu_head rcu;
+};
+
+static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed)
+{
+	const u32 *k = data;
+	const u32 key = *k;
+
+	BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_mem_allocator, mem.id)
+		     != sizeof(u32));
+
+	/* Use cyclic increasing ID as direct hash key, see rht_bucket_index */
+	return key << RHT_HASH_RESERVED_SPACE;
+}
+
+static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg,
+			  const void *ptr)
+{
+	const struct xdp_mem_allocator *xa = ptr;
+	u32 mem_id = *(u32 *)arg->key;
+
+	return xa->mem.id != mem_id;
+}
+
+static const struct rhashtable_params mem_id_rht_params = {
+	.nelem_hint = 64,
+	.head_offset = offsetof(struct xdp_mem_allocator, node),
+	.key_offset  = offsetof(struct xdp_mem_allocator, mem.id),
+	.key_len = FIELD_SIZEOF(struct xdp_mem_allocator, mem.id),
+	.max_size = MEM_ID_MAX,
+	.min_size = 8,
+	.automatic_shrinking = true,
+	.hashfn    = xdp_mem_id_hashfn,
+	.obj_cmpfn = xdp_mem_id_cmp,
+};
+
+static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
+{
+	struct xdp_mem_allocator *xa;
+
+	xa = container_of(rcu, struct xdp_mem_allocator, rcu);
+
+	/* Allow this ID to be reused */
+	ida_simple_remove(&mem_id_pool, xa->mem.id);
+
+	/* TODO: Depending on allocator type/pointer free resources */
+
+	/* Poison memory */
+	xa->mem.id = 0xFFFF;
+	xa->mem.type = 0xF0F0;
+	xa->allocator = (void *)0xDEAD9001;
+
+	kfree(xa);
+}
+
+static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
+{
+	struct xdp_mem_allocator *xa;
+	int id = xdp_rxq->mem.id;
+	int err;
+
+	if (id == 0)
+		return;
+
+	mutex_lock(&mem_id_lock);
+
+	xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params);
+	if (!xa) {
+		mutex_unlock(&mem_id_lock);
+		return;
+	}
+
+	err = rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params);
+	WARN_ON(err);
+
+	call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+
+	mutex_unlock(&mem_id_lock);
+}
+
 void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
 {
 	/* Simplify driver cleanup code paths, allow unreg "unused" */
@@ -21,8 +117,14 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
 
 	WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG");
 
+	__xdp_rxq_info_unreg_mem_model(xdp_rxq);
+
 	xdp_rxq->reg_state = REG_STATE_UNREGISTERED;
 	xdp_rxq->dev = NULL;
+
+	/* Reset mem info to defaults */
+	xdp_rxq->mem.id = 0;
+	xdp_rxq->mem.type = 0;
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg);
 
@@ -72,20 +174,131 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq)
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg);
 
+static int __mem_id_init_hash_table(void)
+{
+	struct rhashtable *rht;
+	int ret;
+
+	if (unlikely(mem_id_init))
+		return 0;
+
+	rht = kzalloc(sizeof(*rht), GFP_KERNEL);
+	if (!rht)
+		return -ENOMEM;
+
+	ret = rhashtable_init(rht, &mem_id_rht_params);
+	if (ret < 0) {
+		kfree(rht);
+		return ret;
+	}
+	mem_id_ht = rht;
+	smp_mb(); /* mutex lock should provide enough pairing */
+	mem_id_init = true;
+
+	return 0;
+}
+
+/* Allocate a cyclic ID that maps to allocator pointer.
+ * See: https://www.kernel.org/doc/html/latest/core-api/idr.html
+ *
+ * Caller must lock mem_id_lock.
+ */
+static int __mem_id_cyclic_get(gfp_t gfp)
+{
+	int retries = 1;
+	int id;
+
+again:
+	id = ida_simple_get(&mem_id_pool, mem_id_next, MEM_ID_MAX, gfp);
+	if (id < 0) {
+		if (id == -ENOSPC) {
+			/* Cyclic allocator, reset next id */
+			if (retries--) {
+				mem_id_next = MEM_ID_MIN;
+				goto again;
+			}
+		}
+		return id; /* errno */
+	}
+	mem_id_next = id + 1;
+
+	return id;
+}
+
 int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
 			       enum xdp_mem_type type, void *allocator)
 {
+	struct xdp_mem_allocator *xdp_alloc;
+	gfp_t gfp = GFP_KERNEL;
+	int id, errno, ret;
+	void *ptr;
+
+	if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
+		WARN(1, "Missing register, driver bug");
+		return -EFAULT;
+	}
+
 	if (type >= MEM_TYPE_MAX)
 		return -EINVAL;
 
 	xdp_rxq->mem.type = type;
 
-	if (allocator)
-		return -EOPNOTSUPP;
+	if (!allocator)
+		return 0;
+
+	/* Delay init of rhashtable to save memory if feature isn't used */
+	if (!mem_id_init) {
+		mutex_lock(&mem_id_lock);
+		ret = __mem_id_init_hash_table();
+		mutex_unlock(&mem_id_lock);
+		if (ret < 0) {
+			WARN_ON(1);
+			return ret;
+		}
+	}
+
+	xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp);
+	if (!xdp_alloc)
+		return -ENOMEM;
+
+	mutex_lock(&mem_id_lock);
+	id = __mem_id_cyclic_get(gfp);
+	if (id < 0) {
+		errno = id;
+		goto err;
+	}
+	xdp_rxq->mem.id = id;
+	xdp_alloc->mem  = xdp_rxq->mem;
+	xdp_alloc->allocator = allocator;
+
+	/* Insert allocator into ID lookup table */
+	ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node);
+	if (IS_ERR(ptr)) {
+		errno = PTR_ERR(ptr);
+		goto err;
+	}
+
+	mutex_unlock(&mem_id_lock);
 
-	/* TODO: Allocate an ID that maps to allocator pointer
-	 * See: https://www.kernel.org/doc/html/latest/core-api/idr.html
-	 */
 	return 0;
+err:
+	mutex_unlock(&mem_id_lock);
+	kfree(xdp_alloc);
+	return errno;
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
+
+void xdp_return_frame(void *data, struct xdp_mem_info *mem)
+{
+	if (mem->type == MEM_TYPE_PAGE_SHARED) {
+		page_frag_free(data);
+		return;
+	}
+
+	if (mem->type == MEM_TYPE_PAGE_ORDER0) {
+		struct page *page = virt_to_page(data); /* Assumes order0 page*/
+
+		put_page(page);
+	}
+}
+EXPORT_SYMBOL_GPL(xdp_return_frame);
-- 
cgit v1.2.3-59-g8ed1b


From ff7d6b27f894f1469dc51ccb828b7363ccd9799f Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 17 Apr 2018 16:46:17 +0200
Subject: page_pool: refurbish version of page_pool code

Need a fast page recycle mechanism for ndo_xdp_xmit API for returning
pages on DMA-TX completion time, which have good cross CPU
performance, given DMA-TX completion time can happen on a remote CPU.

Refurbish my page_pool code, that was presented[1] at MM-summit 2016.
Adapted page_pool code to not depend the page allocator and
integration into struct page.  The DMA mapping feature is kept,
even-though it will not be activated/used in this patchset.

[1] http://people.netfilter.org/hawk/presentations/MM-summit2016/generic_page_pool_mm_summit2016.pdf

V2: Adjustments requested by Tariq
 - Changed page_pool_create return codes, don't return NULL, only
   ERR_PTR, as this simplifies err handling in drivers.

V4: many small improvements and cleanups
- Add DOC comment section, that can be used by kernel-doc
- Improve fallback mode, to work better with refcnt based recycling
  e.g. remove a WARN as pointed out by Tariq
  e.g. quicker fallback if ptr_ring is empty.

V5: Fixed SPDX license as pointed out by Alexei

V6: Adjustments requested by Eric Dumazet
 - Adjust ____cacheline_aligned_in_smp usage/placement
 - Move rcu_head in struct page_pool
 - Free pages quicker on destroy, minimize resources delayed an RCU period
 - Remove code for forward/backward compat ABI interface

V8: Issues found by kbuild test robot
 - Address sparse should be static warnings
 - Only compile+link when a driver use/select page_pool,
   mlx5 selects CONFIG_PAGE_POOL, although its first used in two patches

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/Kconfig |   1 +
 include/net/page_pool.h                         | 129 ++++++++++
 net/Kconfig                                     |   3 +
 net/core/Makefile                               |   1 +
 net/core/page_pool.c                            | 317 ++++++++++++++++++++++++
 5 files changed, 451 insertions(+)
 create mode 100644 include/net/page_pool.h
 create mode 100644 net/core/page_pool.c

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
index c032319f1cb9..12257034131e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -30,6 +30,7 @@ config MLX5_CORE_EN
 	bool "Mellanox Technologies ConnectX-4 Ethernet support"
 	depends on NETDEVICES && ETHERNET && INET && PCI && MLX5_CORE
 	depends on IPV6=y || IPV6=n || MLX5_CORE=m
+	select PAGE_POOL
 	default n
 	---help---
 	  Ethernet support in Mellanox Technologies ConnectX-4 NIC.
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
new file mode 100644
index 000000000000..1fe77db59518
--- /dev/null
+++ b/include/net/page_pool.h
@@ -0,0 +1,129 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * page_pool.h
+ *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
+ *	Copyright (C) 2016 Red Hat, Inc.
+ */
+
+/**
+ * DOC: page_pool allocator
+ *
+ * This page_pool allocator is optimized for the XDP mode that
+ * uses one-frame-per-page, but have fallbacks that act like the
+ * regular page allocator APIs.
+ *
+ * Basic use involve replacing alloc_pages() calls with the
+ * page_pool_alloc_pages() call.  Drivers should likely use
+ * page_pool_dev_alloc_pages() replacing dev_alloc_pages().
+ *
+ * If page_pool handles DMA mapping (use page->private), then API user
+ * is responsible for invoking page_pool_put_page() once.  In-case of
+ * elevated refcnt, the DMA state is released, assuming other users of
+ * the page will eventually call put_page().
+ *
+ * If no DMA mapping is done, then it can act as shim-layer that
+ * fall-through to alloc_page.  As no state is kept on the page, the
+ * regular put_page() call is sufficient.
+ */
+#ifndef _NET_PAGE_POOL_H
+#define _NET_PAGE_POOL_H
+
+#include <linux/mm.h> /* Needed by ptr_ring */
+#include <linux/ptr_ring.h>
+#include <linux/dma-direction.h>
+
+#define PP_FLAG_DMA_MAP 1 /* Should page_pool do the DMA map/unmap */
+#define PP_FLAG_ALL	PP_FLAG_DMA_MAP
+
+/*
+ * Fast allocation side cache array/stack
+ *
+ * The cache size and refill watermark is related to the network
+ * use-case.  The NAPI budget is 64 packets.  After a NAPI poll the RX
+ * ring is usually refilled and the max consumed elements will be 64,
+ * thus a natural max size of objects needed in the cache.
+ *
+ * Keeping room for more objects, is due to XDP_DROP use-case.  As
+ * XDP_DROP allows the opportunity to recycle objects directly into
+ * this array, as it shares the same softirq/NAPI protection.  If
+ * cache is already full (or partly full) then the XDP_DROP recycles
+ * would have to take a slower code path.
+ */
+#define PP_ALLOC_CACHE_SIZE	128
+#define PP_ALLOC_CACHE_REFILL	64
+struct pp_alloc_cache {
+	u32 count;
+	void *cache[PP_ALLOC_CACHE_SIZE];
+};
+
+struct page_pool_params {
+	unsigned int	flags;
+	unsigned int	order;
+	unsigned int	pool_size;
+	int		nid;  /* Numa node id to allocate from pages from */
+	struct device	*dev; /* device, for DMA pre-mapping purposes */
+	enum dma_data_direction dma_dir; /* DMA mapping direction */
+};
+
+struct page_pool {
+	struct rcu_head rcu;
+	struct page_pool_params p;
+
+	/*
+	 * Data structure for allocation side
+	 *
+	 * Drivers allocation side usually already perform some kind
+	 * of resource protection.  Piggyback on this protection, and
+	 * require driver to protect allocation side.
+	 *
+	 * For NIC drivers this means, allocate a page_pool per
+	 * RX-queue. As the RX-queue is already protected by
+	 * Softirq/BH scheduling and napi_schedule. NAPI schedule
+	 * guarantee that a single napi_struct will only be scheduled
+	 * on a single CPU (see napi_schedule).
+	 */
+	struct pp_alloc_cache alloc ____cacheline_aligned_in_smp;
+
+	/* Data structure for storing recycled pages.
+	 *
+	 * Returning/freeing pages is more complicated synchronization
+	 * wise, because free's can happen on remote CPUs, with no
+	 * association with allocation resource.
+	 *
+	 * Use ptr_ring, as it separates consumer and producer
+	 * effeciently, it a way that doesn't bounce cache-lines.
+	 *
+	 * TODO: Implement bulk return pages into this structure.
+	 */
+	struct ptr_ring ring;
+};
+
+struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
+
+static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
+{
+	gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);
+
+	return page_pool_alloc_pages(pool, gfp);
+}
+
+struct page_pool *page_pool_create(const struct page_pool_params *params);
+
+void page_pool_destroy(struct page_pool *pool);
+
+/* Never call this directly, use helpers below */
+void __page_pool_put_page(struct page_pool *pool,
+			  struct page *page, bool allow_direct);
+
+static inline void page_pool_put_page(struct page_pool *pool, struct page *page)
+{
+	__page_pool_put_page(pool, page, false);
+}
+/* Very limited use-cases allow recycle direct */
+static inline void page_pool_recycle_direct(struct page_pool *pool,
+					    struct page *page)
+{
+	__page_pool_put_page(pool, page, true);
+}
+
+#endif /* _NET_PAGE_POOL_H */
diff --git a/net/Kconfig b/net/Kconfig
index 0428f12c25c2..6fa1a4493b8c 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -423,6 +423,9 @@ config MAY_USE_DEVLINK
 	  on MAY_USE_DEVLINK to ensure they do not cause link errors when
 	  devlink is a loadable module and the driver using it is built-in.
 
+config PAGE_POOL
+       bool
+
 endif   # if NET
 
 # Used by archs to tell that they support BPF JIT compiler plus which flavour.
diff --git a/net/core/Makefile b/net/core/Makefile
index 6dbbba8c57ae..7080417f8bc8 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -14,6 +14,7 @@ obj-y		     += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
 			fib_notifier.o xdp.o
 
 obj-y += net-sysfs.o
+obj-$(CONFIG_PAGE_POOL) += page_pool.o
 obj-$(CONFIG_PROC_FS) += net-procfs.o
 obj-$(CONFIG_NET_PKTGEN) += pktgen.o
 obj-$(CONFIG_NETPOLL) += netpoll.o
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
new file mode 100644
index 000000000000..68bf07206744
--- /dev/null
+++ b/net/core/page_pool.c
@@ -0,0 +1,317 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * page_pool.c
+ *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
+ *	Copyright (C) 2016 Red Hat, Inc.
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include <net/page_pool.h>
+#include <linux/dma-direction.h>
+#include <linux/dma-mapping.h>
+#include <linux/page-flags.h>
+#include <linux/mm.h> /* for __put_page() */
+
+static int page_pool_init(struct page_pool *pool,
+			  const struct page_pool_params *params)
+{
+	unsigned int ring_qsize = 1024; /* Default */
+
+	memcpy(&pool->p, params, sizeof(pool->p));
+
+	/* Validate only known flags were used */
+	if (pool->p.flags & ~(PP_FLAG_ALL))
+		return -EINVAL;
+
+	if (pool->p.pool_size)
+		ring_qsize = pool->p.pool_size;
+
+	/* Sanity limit mem that can be pinned down */
+	if (ring_qsize > 32768)
+		return -E2BIG;
+
+	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
+	 * DMA_BIDIRECTIONAL is for allowing page used for DMA sending,
+	 * which is the XDP_TX use-case.
+	 */
+	if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
+	    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
+		return -EINVAL;
+
+	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
+		return -ENOMEM;
+
+	return 0;
+}
+
+struct page_pool *page_pool_create(const struct page_pool_params *params)
+{
+	struct page_pool *pool;
+	int err = 0;
+
+	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
+	if (!pool)
+		return ERR_PTR(-ENOMEM);
+
+	err = page_pool_init(pool, params);
+	if (err < 0) {
+		pr_warn("%s() gave up with errno %d\n", __func__, err);
+		kfree(pool);
+		return ERR_PTR(err);
+	}
+	return pool;
+}
+EXPORT_SYMBOL(page_pool_create);
+
+/* fast path */
+static struct page *__page_pool_get_cached(struct page_pool *pool)
+{
+	struct ptr_ring *r = &pool->ring;
+	struct page *page;
+
+	/* Quicker fallback, avoid locks when ring is empty */
+	if (__ptr_ring_empty(r))
+		return NULL;
+
+	/* Test for safe-context, caller should provide this guarantee */
+	if (likely(in_serving_softirq())) {
+		if (likely(pool->alloc.count)) {
+			/* Fast-path */
+			page = pool->alloc.cache[--pool->alloc.count];
+			return page;
+		}
+		/* Slower-path: Alloc array empty, time to refill
+		 *
+		 * Open-coded bulk ptr_ring consumer.
+		 *
+		 * Discussion: the ring consumer lock is not really
+		 * needed due to the softirq/NAPI protection, but
+		 * later need the ability to reclaim pages on the
+		 * ring. Thus, keeping the locks.
+		 */
+		spin_lock(&r->consumer_lock);
+		while ((page = __ptr_ring_consume(r))) {
+			if (pool->alloc.count == PP_ALLOC_CACHE_REFILL)
+				break;
+			pool->alloc.cache[pool->alloc.count++] = page;
+		}
+		spin_unlock(&r->consumer_lock);
+		return page;
+	}
+
+	/* Slow-path: Get page from locked ring queue */
+	page = ptr_ring_consume(&pool->ring);
+	return page;
+}
+
+/* slow path */
+noinline
+static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
+						 gfp_t _gfp)
+{
+	struct page *page;
+	gfp_t gfp = _gfp;
+	dma_addr_t dma;
+
+	/* We could always set __GFP_COMP, and avoid this branch, as
+	 * prep_new_page() can handle order-0 with __GFP_COMP.
+	 */
+	if (pool->p.order)
+		gfp |= __GFP_COMP;
+
+	/* FUTURE development:
+	 *
+	 * Current slow-path essentially falls back to single page
+	 * allocations, which doesn't improve performance.  This code
+	 * need bulk allocation support from the page allocator code.
+	 */
+
+	/* Cache was empty, do real allocation */
+	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
+	if (!page)
+		return NULL;
+
+	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
+		goto skip_dma_map;
+
+	/* Setup DMA mapping: use page->private for DMA-addr
+	 * This mapping is kept for lifetime of page, until leaving pool.
+	 */
+	dma = dma_map_page(pool->p.dev, page, 0,
+			   (PAGE_SIZE << pool->p.order),
+			   pool->p.dma_dir);
+	if (dma_mapping_error(pool->p.dev, dma)) {
+		put_page(page);
+		return NULL;
+	}
+	set_page_private(page, dma); /* page->private = dma; */
+
+skip_dma_map:
+	/* When page just alloc'ed is should/must have refcnt 1. */
+	return page;
+}
+
+/* For using page_pool replace: alloc_pages() API calls, but provide
+ * synchronization guarantee for allocation side.
+ */
+struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
+{
+	struct page *page;
+
+	/* Fast-path: Get a page from cache */
+	page = __page_pool_get_cached(pool);
+	if (page)
+		return page;
+
+	/* Slow-path: cache empty, do real allocation */
+	page = __page_pool_alloc_pages_slow(pool, gfp);
+	return page;
+}
+EXPORT_SYMBOL(page_pool_alloc_pages);
+
+/* Cleanup page_pool state from page */
+static void __page_pool_clean_page(struct page_pool *pool,
+				   struct page *page)
+{
+	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
+		return;
+
+	/* DMA unmap */
+	dma_unmap_page(pool->p.dev, page_private(page),
+		       PAGE_SIZE << pool->p.order, pool->p.dma_dir);
+	set_page_private(page, 0);
+}
+
+/* Return a page to the page allocator, cleaning up our state */
+static void __page_pool_return_page(struct page_pool *pool, struct page *page)
+{
+	__page_pool_clean_page(pool, page);
+	put_page(page);
+	/* An optimization would be to call __free_pages(page, pool->p.order)
+	 * knowing page is not part of page-cache (thus avoiding a
+	 * __page_cache_release() call).
+	 */
+}
+
+static bool __page_pool_recycle_into_ring(struct page_pool *pool,
+				   struct page *page)
+{
+	int ret;
+	/* BH protection not needed if current is serving softirq */
+	if (in_serving_softirq())
+		ret = ptr_ring_produce(&pool->ring, page);
+	else
+		ret = ptr_ring_produce_bh(&pool->ring, page);
+
+	return (ret == 0) ? true : false;
+}
+
+/* Only allow direct recycling in special circumstances, into the
+ * alloc side cache.  E.g. during RX-NAPI processing for XDP_DROP use-case.
+ *
+ * Caller must provide appropriate safe context.
+ */
+static bool __page_pool_recycle_direct(struct page *page,
+				       struct page_pool *pool)
+{
+	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
+		return false;
+
+	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
+	pool->alloc.cache[pool->alloc.count++] = page;
+	return true;
+}
+
+void __page_pool_put_page(struct page_pool *pool,
+			  struct page *page, bool allow_direct)
+{
+	/* This allocator is optimized for the XDP mode that uses
+	 * one-frame-per-page, but have fallbacks that act like the
+	 * regular page allocator APIs.
+	 *
+	 * refcnt == 1 means page_pool owns page, and can recycle it.
+	 */
+	if (likely(page_ref_count(page) == 1)) {
+		/* Read barrier done in page_ref_count / READ_ONCE */
+
+		if (allow_direct && in_serving_softirq())
+			if (__page_pool_recycle_direct(page, pool))
+				return;
+
+		if (!__page_pool_recycle_into_ring(pool, page)) {
+			/* Cache full, fallback to free pages */
+			__page_pool_return_page(pool, page);
+		}
+		return;
+	}
+	/* Fallback/non-XDP mode: API user have elevated refcnt.
+	 *
+	 * Many drivers split up the page into fragments, and some
+	 * want to keep doing this to save memory and do refcnt based
+	 * recycling. Support this use case too, to ease drivers
+	 * switching between XDP/non-XDP.
+	 *
+	 * In-case page_pool maintains the DMA mapping, API user must
+	 * call page_pool_put_page once.  In this elevated refcnt
+	 * case, the DMA is unmapped/released, as driver is likely
+	 * doing refcnt based recycle tricks, meaning another process
+	 * will be invoking put_page.
+	 */
+	__page_pool_clean_page(pool, page);
+	put_page(page);
+}
+EXPORT_SYMBOL(__page_pool_put_page);
+
+static void __page_pool_empty_ring(struct page_pool *pool)
+{
+	struct page *page;
+
+	/* Empty recycle ring */
+	while ((page = ptr_ring_consume(&pool->ring))) {
+		/* Verify the refcnt invariant of cached pages */
+		if (!(page_ref_count(page) == 1))
+			pr_crit("%s() page_pool refcnt %d violation\n",
+				__func__, page_ref_count(page));
+
+		__page_pool_return_page(pool, page);
+	}
+}
+
+static void __page_pool_destroy_rcu(struct rcu_head *rcu)
+{
+	struct page_pool *pool;
+
+	pool = container_of(rcu, struct page_pool, rcu);
+
+	WARN(pool->alloc.count, "API usage violation");
+
+	__page_pool_empty_ring(pool);
+	ptr_ring_cleanup(&pool->ring, NULL);
+	kfree(pool);
+}
+
+/* Cleanup and release resources */
+void page_pool_destroy(struct page_pool *pool)
+{
+	struct page *page;
+
+	/* Empty alloc cache, assume caller made sure this is
+	 * no-longer in use, and page_pool_alloc_pages() cannot be
+	 * call concurrently.
+	 */
+	while (pool->alloc.count) {
+		page = pool->alloc.cache[--pool->alloc.count];
+		__page_pool_return_page(pool, page);
+	}
+
+	/* No more consumers should exist, but producers could still
+	 * be in-flight.
+	 */
+	__page_pool_empty_ring(pool);
+
+	/* An xdp_mem_allocator can still ref page_pool pointer */
+	call_rcu(&pool->rcu, __page_pool_destroy_rcu);
+}
+EXPORT_SYMBOL(page_pool_destroy);
-- 
cgit v1.2.3-59-g8ed1b


From 57d0a1c1ac9e6a836bbab4698ba2a2e03f64bf1b Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 17 Apr 2018 16:46:22 +0200
Subject: xdp: allow page_pool as an allocator type in xdp_return_frame

New allocator type MEM_TYPE_PAGE_POOL for page_pool usage.

The registered allocator page_pool pointer is not available directly
from xdp_rxq_info, but it could be (if needed).  For now, the driver
should keep separate track of the page_pool pointer, which it should
use for RX-ring page allocation.

As suggested by Saeed, to maintain a symmetric API it is the drivers
responsibility to allocate/create and free/destroy the page_pool.
Thus, after the driver have called xdp_rxq_info_unreg(), it is drivers
responsibility to free the page_pool, but with a RCU free call.  This
is done easily via the page_pool helper page_pool_destroy() (which
avoids touching any driver code during the RCU callback, which could
happen after the driver have been unloaded).

V8: address issues found by kbuild test robot
 - Address sparse should be static warnings
 - Allow xdp.o to be compiled without page_pool.o

V9: Remove inline from .c file, compiler knows best

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/page_pool.h | 14 ++++++++++++
 include/net/xdp.h       |  3 +++
 net/core/xdp.c          | 60 +++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 65 insertions(+), 12 deletions(-)

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 1fe77db59518..c79087153148 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -117,7 +117,12 @@ void __page_pool_put_page(struct page_pool *pool,
 
 static inline void page_pool_put_page(struct page_pool *pool, struct page *page)
 {
+	/* When page_pool isn't compiled-in, net/core/xdp.c doesn't
+	 * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
+	 */
+#ifdef CONFIG_PAGE_POOL
 	__page_pool_put_page(pool, page, false);
+#endif
 }
 /* Very limited use-cases allow recycle direct */
 static inline void page_pool_recycle_direct(struct page_pool *pool,
@@ -126,4 +131,13 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
 	__page_pool_put_page(pool, page, true);
 }
 
+static inline bool is_page_pool_compiled_in(void)
+{
+#ifdef CONFIG_PAGE_POOL
+	return true;
+#else
+	return false;
+#endif
+}
+
 #endif /* _NET_PAGE_POOL_H */
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 5f67c62540aa..d0ee437753dc 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -36,6 +36,7 @@
 enum xdp_mem_type {
 	MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */
 	MEM_TYPE_PAGE_ORDER0,     /* Orig XDP full page model */
+	MEM_TYPE_PAGE_POOL,
 	MEM_TYPE_MAX,
 };
 
@@ -44,6 +45,8 @@ struct xdp_mem_info {
 	u32 id;
 };
 
+struct page_pool;
+
 struct xdp_rxq_info {
 	struct net_device *dev;
 	u32 queue_index;
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 8b2cb79b5de0..33e382afbd95 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -8,6 +8,7 @@
 #include <linux/slab.h>
 #include <linux/idr.h>
 #include <linux/rhashtable.h>
+#include <net/page_pool.h>
 
 #include <net/xdp.h>
 
@@ -27,7 +28,10 @@ static struct rhashtable *mem_id_ht;
 
 struct xdp_mem_allocator {
 	struct xdp_mem_info mem;
-	void *allocator;
+	union {
+		void *allocator;
+		struct page_pool *page_pool;
+	};
 	struct rhash_head node;
 	struct rcu_head rcu;
 };
@@ -74,7 +78,9 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
 	/* Allow this ID to be reused */
 	ida_simple_remove(&mem_id_pool, xa->mem.id);
 
-	/* TODO: Depending on allocator type/pointer free resources */
+	/* Notice, driver is expected to free the *allocator,
+	 * e.g. page_pool, and MUST also use RCU free.
+	 */
 
 	/* Poison memory */
 	xa->mem.id = 0xFFFF;
@@ -225,6 +231,17 @@ again:
 	return id;
 }
 
+static bool __is_supported_mem_type(enum xdp_mem_type type)
+{
+	if (type == MEM_TYPE_PAGE_POOL)
+		return is_page_pool_compiled_in();
+
+	if (type >= MEM_TYPE_MAX)
+		return false;
+
+	return true;
+}
+
 int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
 			       enum xdp_mem_type type, void *allocator)
 {
@@ -238,13 +255,16 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
 		return -EFAULT;
 	}
 
-	if (type >= MEM_TYPE_MAX)
-		return -EINVAL;
+	if (!__is_supported_mem_type(type))
+		return -EOPNOTSUPP;
 
 	xdp_rxq->mem.type = type;
 
-	if (!allocator)
+	if (!allocator) {
+		if (type == MEM_TYPE_PAGE_POOL)
+			return -EINVAL; /* Setup time check page_pool req */
 		return 0;
+	}
 
 	/* Delay init of rhashtable to save memory if feature isn't used */
 	if (!mem_id_init) {
@@ -290,15 +310,31 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
 
 void xdp_return_frame(void *data, struct xdp_mem_info *mem)
 {
-	if (mem->type == MEM_TYPE_PAGE_SHARED) {
+	struct xdp_mem_allocator *xa;
+	struct page *page;
+
+	switch (mem->type) {
+	case MEM_TYPE_PAGE_POOL:
+		rcu_read_lock();
+		/* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
+		xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
+		page = virt_to_head_page(data);
+		if (xa)
+			page_pool_put_page(xa->page_pool, page);
+		else
+			put_page(page);
+		rcu_read_unlock();
+		break;
+	case MEM_TYPE_PAGE_SHARED:
 		page_frag_free(data);
-		return;
-	}
-
-	if (mem->type == MEM_TYPE_PAGE_ORDER0) {
-		struct page *page = virt_to_page(data); /* Assumes order0 page*/
-
+		break;
+	case MEM_TYPE_PAGE_ORDER0:
+		page = virt_to_page(data); /* Assumes order0 page*/
 		put_page(page);
+		break;
+	default:
+		/* Not possible, checked in xdp_rxq_info_reg_mem_model() */
+		break;
 	}
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame);
-- 
cgit v1.2.3-59-g8ed1b


From 60bbf7eeef10dc647430646d7fe5e3d8d132dbec Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 17 Apr 2018 16:46:27 +0200
Subject: mlx5: use page_pool for xdp_return_frame call

This patch shows how it is possible to have both the driver local page
cache, which uses elevated refcnt for "catching"/avoiding SKB
put_page returns the page through the page allocator.  And at the
same time, have pages getting returned to the page_pool from
ndp_xdp_xmit DMA completion.

The performance improvement for XDP_REDIRECT in this patch is really
good.  Especially considering that (currently) the xdp_return_frame
API and page_pool_put_page() does per frame operations of both
rhashtable ID-lookup and locked return into (page_pool) ptr_ring.
(It is the plan to remove these per frame operation in a followup
patchset).

The benchmark performed was RX on mlx5 and XDP_REDIRECT out ixgbe,
with xdp_redirect_map (using devmap) . And the target/maximum
capability of ixgbe is 13Mpps (on this HW setup).

Before this patch for mlx5, XDP redirected frames were returned via
the page allocator.  The single flow performance was 6Mpps, and if I
started two flows the collective performance drop to 4Mpps, because we
hit the page allocator lock (further negative scaling occurs).

Two test scenarios need to be covered, for xdp_return_frame API, which
is DMA-TX completion running on same-CPU or cross-CPU free/return.
Results were same-CPU=10Mpps, and cross-CPU=12Mpps.  This is very
close to our 13Mpps max target.

The reason max target isn't reached in cross-CPU test, is likely due
to RX-ring DMA unmap/map overhead (which doesn't occur in ixgbe to
ixgbe testing).  It is also planned to remove this unnecessary DMA
unmap in a later patchset

V2: Adjustments requested by Tariq
 - Changed page_pool_create return codes not return NULL, only
   ERR_PTR, as this simplifies err handling in drivers.
 - Save a branch in mlx5e_page_release
 - Correct page_pool size calc for MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ

V5: Updated patch desc

V8: Adjust for b0cedc844c00 ("net/mlx5e: Remove rq_headroom field from params")
V9:
 - Adjust for 121e89275471 ("net/mlx5e: Refactor RQ XDP_TX indication")
 - Adjust for 73281b78a37a ("net/mlx5e: Derive Striding RQ size from MTU")
 - Correct handling if page_pool_create fail for MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ

V10: Req from Tariq
 - Change pool_size calc for MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |  3 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 41 +++++++++++++++++++----
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   | 16 ++++++---
 3 files changed, 48 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 1a05d1072c5e..3317a4da87cb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -53,6 +53,8 @@
 #include "mlx5_core.h"
 #include "en_stats.h"
 
+struct page_pool;
+
 #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v)
 
 #define MLX5E_ETH_HARD_MTU (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN)
@@ -534,6 +536,7 @@ struct mlx5e_rq {
 	unsigned int           hw_mtu;
 	struct mlx5e_xdpsq     xdpsq;
 	DECLARE_BITMAP(flags, 8);
+	struct page_pool      *page_pool;
 
 	/* control */
 	struct mlx5_wq_ctrl    wq_ctrl;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 2dca0933dfd3..f10037419978 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -35,6 +35,7 @@
 #include <linux/mlx5/fs.h>
 #include <net/vxlan.h>
 #include <linux/bpf.h>
+#include <net/page_pool.h>
 #include "eswitch.h"
 #include "en.h"
 #include "en_tc.h"
@@ -389,10 +390,11 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 			  struct mlx5e_rq_param *rqp,
 			  struct mlx5e_rq *rq)
 {
+	struct page_pool_params pp_params = { 0 };
 	struct mlx5_core_dev *mdev = c->mdev;
 	void *rqc = rqp->rqc;
 	void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
-	u32 byte_count;
+	u32 byte_count, pool_size;
 	int npages;
 	int wq_sz;
 	int err;
@@ -432,9 +434,12 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
 	rq->buff.map_dir = rq->xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
 	rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params);
+	pool_size = 1 << params->log_rq_mtu_frames;
 
 	switch (rq->wq_type) {
 	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+
+		pool_size = MLX5_MPWRQ_PAGES_PER_WQE << mlx5e_mpwqe_get_log_rq_size(params);
 		rq->post_wqes = mlx5e_post_rx_mpwqes;
 		rq->dealloc_wqe = mlx5e_dealloc_rx_mpwqe;
 
@@ -512,13 +517,31 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 		rq->mkey_be = c->mkey_be;
 	}
 
-	/* This must only be activate for order-0 pages */
-	if (rq->xdp_prog) {
-		err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
-						 MEM_TYPE_PAGE_ORDER0, NULL);
-		if (err)
-			goto err_rq_wq_destroy;
+	/* Create a page_pool and register it with rxq */
+	pp_params.order     = rq->buff.page_order;
+	pp_params.flags     = 0; /* No-internal DMA mapping in page_pool */
+	pp_params.pool_size = pool_size;
+	pp_params.nid       = cpu_to_node(c->cpu);
+	pp_params.dev       = c->pdev;
+	pp_params.dma_dir   = rq->buff.map_dir;
+
+	/* page_pool can be used even when there is no rq->xdp_prog,
+	 * given page_pool does not handle DMA mapping there is no
+	 * required state to clear. And page_pool gracefully handle
+	 * elevated refcnt.
+	 */
+	rq->page_pool = page_pool_create(&pp_params);
+	if (IS_ERR(rq->page_pool)) {
+		if (rq->wq_type != MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ)
+			kfree(rq->wqe.frag_info);
+		err = PTR_ERR(rq->page_pool);
+		rq->page_pool = NULL;
+		goto err_rq_wq_destroy;
 	}
+	err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
+					 MEM_TYPE_PAGE_POOL, rq->page_pool);
+	if (err)
+		goto err_rq_wq_destroy;
 
 	for (i = 0; i < wq_sz; i++) {
 		struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
@@ -556,6 +579,8 @@ err_rq_wq_destroy:
 	if (rq->xdp_prog)
 		bpf_prog_put(rq->xdp_prog);
 	xdp_rxq_info_unreg(&rq->xdp_rxq);
+	if (rq->page_pool)
+		page_pool_destroy(rq->page_pool);
 	mlx5_wq_destroy(&rq->wq_ctrl);
 
 	return err;
@@ -569,6 +594,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
 		bpf_prog_put(rq->xdp_prog);
 
 	xdp_rxq_info_unreg(&rq->xdp_rxq);
+	if (rq->page_pool)
+		page_pool_destroy(rq->page_pool);
 
 	switch (rq->wq_type) {
 	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 0e24be05907f..f42436d7f2d9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -37,6 +37,7 @@
 #include <linux/bpf_trace.h>
 #include <net/busy_poll.h>
 #include <net/ip6_checksum.h>
+#include <net/page_pool.h>
 #include "en.h"
 #include "en_tc.h"
 #include "eswitch.h"
@@ -221,7 +222,7 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
 	if (mlx5e_rx_cache_get(rq, dma_info))
 		return 0;
 
-	dma_info->page = dev_alloc_pages(rq->buff.page_order);
+	dma_info->page = page_pool_dev_alloc_pages(rq->page_pool);
 	if (unlikely(!dma_info->page))
 		return -ENOMEM;
 
@@ -246,11 +247,16 @@ static void mlx5e_page_dma_unmap(struct mlx5e_rq *rq,
 void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
 			bool recycle)
 {
-	if (likely(recycle) && mlx5e_rx_cache_put(rq, dma_info))
-		return;
+	if (likely(recycle)) {
+		if (mlx5e_rx_cache_put(rq, dma_info))
+			return;
 
-	mlx5e_page_dma_unmap(rq, dma_info);
-	put_page(dma_info->page);
+		mlx5e_page_dma_unmap(rq, dma_info);
+		page_pool_recycle_direct(rq->page_pool, dma_info->page);
+	} else {
+		mlx5e_page_dma_unmap(rq, dma_info);
+		put_page(dma_info->page);
+	}
 }
 
 static inline bool mlx5e_page_reuse(struct mlx5e_rq *rq,
-- 
cgit v1.2.3-59-g8ed1b


From 039930945a72d9af5ff04ae9b9e60658a52e0770 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 17 Apr 2018 16:46:32 +0200
Subject: xdp: transition into using xdp_frame for return API

Changing API xdp_return_frame() to take struct xdp_frame as argument,
seems like a natural choice. But there are some subtle performance
details here that needs extra care, which is a deliberate choice.

When de-referencing xdp_frame on a remote CPU during DMA-TX
completion, result in the cache-line is change to "Shared"
state. Later when the page is reused for RX, then this xdp_frame
cache-line is written, which change the state to "Modified".

This situation already happens (naturally) for, virtio_net, tun and
cpumap as the xdp_frame pointer is the queued object.  In tun and
cpumap, the ptr_ring is used for efficiently transferring cache-lines
(with pointers) between CPUs. Thus, the only option is to
de-referencing xdp_frame.

It is only the ixgbe driver that had an optimization, in which it can
avoid doing the de-reference of xdp_frame.  The driver already have
TX-ring queue, which (in case of remote DMA-TX completion) have to be
transferred between CPUs anyhow.  In this data area, we stored a
struct xdp_mem_info and a data pointer, which allowed us to avoid
de-referencing xdp_frame.

To compensate for this, a prefetchw is used for telling the cache
coherency protocol about our access pattern.  My benchmarks show that
this prefetchw is enough to compensate the ixgbe driver.

V7: Adjust for commit d9314c474d4f ("i40e: add support for XDP_REDIRECT")
V8: Adjust for commit bd658dda4237 ("net/mlx5e: Separate dma base address
and offset in dma_sync call")

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c     |  5 ++---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h        |  4 +---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c   | 17 +++++++++++------
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c |  1 +
 drivers/net/tun.c                               |  4 ++--
 drivers/net/virtio_net.c                        |  2 +-
 include/net/xdp.h                               |  2 +-
 kernel/bpf/cpumap.c                             |  6 +++---
 net/core/xdp.c                                  |  4 +++-
 9 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 96c54cbfb1f9..c8bf4d35fdea 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -638,8 +638,7 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
 		if (tx_buffer->tx_flags & I40E_TX_FLAGS_FD_SB)
 			kfree(tx_buffer->raw_buf);
 		else if (ring_is_xdp(ring))
-			xdp_return_frame(tx_buffer->xdpf->data,
-					 &tx_buffer->xdpf->mem);
+			xdp_return_frame(tx_buffer->xdpf);
 		else
 			dev_kfree_skb_any(tx_buffer->skb);
 		if (dma_unmap_len(tx_buffer, len))
@@ -842,7 +841,7 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
 
 		/* free the skb/XDP data */
 		if (ring_is_xdp(tx_ring))
-			xdp_return_frame(tx_buf->xdpf->data, &tx_buf->xdpf->mem);
+			xdp_return_frame(tx_buf->xdpf);
 		else
 			napi_consume_skb(tx_buf->skb, napi_budget);
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index abb5248e917e..7dd5038cfcc4 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -241,8 +241,7 @@ struct ixgbe_tx_buffer {
 	unsigned long time_stamp;
 	union {
 		struct sk_buff *skb;
-		/* XDP uses address ptr on irq_clean */
-		void *data;
+		struct xdp_frame *xdpf;
 	};
 	unsigned int bytecount;
 	unsigned short gso_segs;
@@ -250,7 +249,6 @@ struct ixgbe_tx_buffer {
 	DEFINE_DMA_UNMAP_ADDR(dma);
 	DEFINE_DMA_UNMAP_LEN(len);
 	u32 tx_flags;
-	struct xdp_mem_info xdp_mem;
 };
 
 struct ixgbe_rx_buffer {
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index f10904ec2172..4f2864165723 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1216,7 +1216,7 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
 
 		/* free the skb */
 		if (ring_is_xdp(tx_ring))
-			xdp_return_frame(tx_buffer->data, &tx_buffer->xdp_mem);
+			xdp_return_frame(tx_buffer->xdpf);
 		else
 			napi_consume_skb(tx_buffer->skb, napi_budget);
 
@@ -2386,6 +2386,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 			xdp.data_hard_start = xdp.data -
 					      ixgbe_rx_offset(rx_ring);
 			xdp.data_end = xdp.data + size;
+			prefetchw(xdp.data_hard_start); /* xdp_frame write */
 
 			skb = ixgbe_run_xdp(adapter, rx_ring, &xdp);
 		}
@@ -5797,7 +5798,7 @@ static void ixgbe_clean_tx_ring(struct ixgbe_ring *tx_ring)
 
 		/* Free all the Tx ring sk_buffs */
 		if (ring_is_xdp(tx_ring))
-			xdp_return_frame(tx_buffer->data, &tx_buffer->xdp_mem);
+			xdp_return_frame(tx_buffer->xdpf);
 		else
 			dev_kfree_skb_any(tx_buffer->skb);
 
@@ -8348,16 +8349,21 @@ static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
 	struct ixgbe_ring *ring = adapter->xdp_ring[smp_processor_id()];
 	struct ixgbe_tx_buffer *tx_buffer;
 	union ixgbe_adv_tx_desc *tx_desc;
+	struct xdp_frame *xdpf;
 	u32 len, cmd_type;
 	dma_addr_t dma;
 	u16 i;
 
-	len = xdp->data_end - xdp->data;
+	xdpf = convert_to_xdp_frame(xdp);
+	if (unlikely(!xdpf))
+		return -EOVERFLOW;
+
+	len = xdpf->len;
 
 	if (unlikely(!ixgbe_desc_unused(ring)))
 		return IXGBE_XDP_CONSUMED;
 
-	dma = dma_map_single(ring->dev, xdp->data, len, DMA_TO_DEVICE);
+	dma = dma_map_single(ring->dev, xdpf->data, len, DMA_TO_DEVICE);
 	if (dma_mapping_error(ring->dev, dma))
 		return IXGBE_XDP_CONSUMED;
 
@@ -8372,8 +8378,7 @@ static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
 
 	dma_unmap_len_set(tx_buffer, len, len);
 	dma_unmap_addr_set(tx_buffer, dma, dma);
-	tx_buffer->data = xdp->data;
-	tx_buffer->xdp_mem = xdp->rxq->mem;
+	tx_buffer->xdpf = xdpf;
 
 	tx_desc->read.buffer_addr = cpu_to_le64(dma);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index f42436d7f2d9..7bbf0db27a01 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -890,6 +890,7 @@ struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
 
 	dma_sync_single_range_for_cpu(rq->pdev, di->addr, wi->offset,
 				      frag_size, DMA_FROM_DEVICE);
+	prefetchw(va); /* xdp_frame data area */
 	prefetch(data);
 	wi->offset += frag_size;
 
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 283bde85c455..bec130cdbd9d 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -663,7 +663,7 @@ void tun_ptr_free(void *ptr)
 	if (tun_is_xdp_frame(ptr)) {
 		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
-		xdp_return_frame(xdpf->data, &xdpf->mem);
+		xdp_return_frame(xdpf);
 	} else {
 		__skb_array_destroy_skb(ptr);
 	}
@@ -2196,7 +2196,7 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
 		ret = tun_put_user_xdp(tun, tfile, xdpf, to);
-		xdp_return_frame(xdpf->data, &xdpf->mem);
+		xdp_return_frame(xdpf);
 	} else {
 		struct sk_buff *skb = ptr;
 
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 42d338fe9a8d..ab3d7cbc4c49 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -430,7 +430,7 @@ static int __virtnet_xdp_xmit(struct virtnet_info *vi,
 
 	/* Free up any pending old buffers before queueing new ones. */
 	while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
-		xdp_return_frame(xdpf_sent->data, &xdpf_sent->mem);
+		xdp_return_frame(xdpf_sent);
 
 	xdpf = convert_to_xdp_frame(xdp);
 	if (unlikely(!xdpf))
diff --git a/include/net/xdp.h b/include/net/xdp.h
index d0ee437753dc..137ad5f9f40f 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -103,7 +103,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
 	return xdp_frame;
 }
 
-void xdp_return_frame(void *data, struct xdp_mem_info *mem);
+void xdp_return_frame(struct xdp_frame *xdpf);
 
 int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
 		     struct net_device *dev, u32 queue_index);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index bcdc4dea5ce7..c95b04ec103e 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -219,7 +219,7 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
 
 	while ((xdpf = ptr_ring_consume(ring)))
 		if (WARN_ON_ONCE(xdpf))
-			xdp_return_frame(xdpf->data, &xdpf->mem);
+			xdp_return_frame(xdpf);
 }
 
 static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
@@ -275,7 +275,7 @@ static int cpu_map_kthread_run(void *data)
 
 			skb = cpu_map_build_skb(rcpu, xdpf);
 			if (!skb) {
-				xdp_return_frame(xdpf->data, &xdpf->mem);
+				xdp_return_frame(xdpf);
 				continue;
 			}
 
@@ -578,7 +578,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 		err = __ptr_ring_produce(q, xdpf);
 		if (err) {
 			drops++;
-			xdp_return_frame(xdpf->data, &xdpf->mem);
+			xdp_return_frame(xdpf);
 		}
 		processed++;
 	}
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 33e382afbd95..0c86b53a3a63 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -308,9 +308,11 @@ err:
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
 
-void xdp_return_frame(void *data, struct xdp_mem_info *mem)
+void xdp_return_frame(struct xdp_frame *xdpf)
 {
+	struct xdp_mem_info *mem = &xdpf->mem;
 	struct xdp_mem_allocator *xa;
+	void *data = xdpf->data;
 	struct page *page;
 
 	switch (mem->type) {
-- 
cgit v1.2.3-59-g8ed1b


From 44fa2dbd475996ddc8f3a0e6113dee983e0ee3aa Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 17 Apr 2018 16:46:37 +0200
Subject: xdp: transition into using xdp_frame for ndo_xdp_xmit

Changing API ndo_xdp_xmit to take a struct xdp_frame instead of struct
xdp_buff.  This brings xdp_return_frame and ndp_xdp_xmit in sync.

This builds towards changing the API further to become a bulk API,
because xdp_buff is not a queue-able object while xdp_frame is.

V4: Adjust for commit 59655a5b6c83 ("tuntap: XDP_TX can use native XDP")
V7: Adjust for commit d9314c474d4f ("i40e: add support for XDP_REDIRECT")

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   | 30 +++++++++++++++------------
 drivers/net/ethernet/intel/i40e/i40e_txrx.h   |  2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 24 +++++++++++----------
 drivers/net/tun.c                             | 19 ++++++++++-------
 drivers/net/virtio_net.c                      | 24 ++++++++++++---------
 include/linux/netdevice.h                     |  4 ++--
 net/core/filter.c                             | 17 +++++++++++++--
 7 files changed, 74 insertions(+), 46 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index c8bf4d35fdea..87fb27ab9c24 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2203,9 +2203,20 @@ static bool i40e_is_non_eop(struct i40e_ring *rx_ring,
 #define I40E_XDP_CONSUMED 1
 #define I40E_XDP_TX 2
 
-static int i40e_xmit_xdp_ring(struct xdp_buff *xdp,
+static int i40e_xmit_xdp_ring(struct xdp_frame *xdpf,
 			      struct i40e_ring *xdp_ring);
 
+static int i40e_xmit_xdp_tx_ring(struct xdp_buff *xdp,
+				 struct i40e_ring *xdp_ring)
+{
+	struct xdp_frame *xdpf = convert_to_xdp_frame(xdp);
+
+	if (unlikely(!xdpf))
+		return I40E_XDP_CONSUMED;
+
+	return i40e_xmit_xdp_ring(xdpf, xdp_ring);
+}
+
 /**
  * i40e_run_xdp - run an XDP program
  * @rx_ring: Rx ring being processed
@@ -2233,7 +2244,7 @@ static struct sk_buff *i40e_run_xdp(struct i40e_ring *rx_ring,
 		break;
 	case XDP_TX:
 		xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index];
-		result = i40e_xmit_xdp_ring(xdp, xdp_ring);
+		result = i40e_xmit_xdp_tx_ring(xdp, xdp_ring);
 		break;
 	case XDP_REDIRECT:
 		err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
@@ -3480,21 +3491,14 @@ dma_error:
  * @xdp: data to transmit
  * @xdp_ring: XDP Tx ring
  **/
-static int i40e_xmit_xdp_ring(struct xdp_buff *xdp,
+static int i40e_xmit_xdp_ring(struct xdp_frame *xdpf,
 			      struct i40e_ring *xdp_ring)
 {
 	u16 i = xdp_ring->next_to_use;
 	struct i40e_tx_buffer *tx_bi;
 	struct i40e_tx_desc *tx_desc;
-	struct xdp_frame *xdpf;
+	u32 size = xdpf->len;
 	dma_addr_t dma;
-	u32 size;
-
-	xdpf = convert_to_xdp_frame(xdp);
-	if (unlikely(!xdpf))
-		return I40E_XDP_CONSUMED;
-
-	size = xdpf->len;
 
 	if (!unlikely(I40E_DESC_UNUSED(xdp_ring))) {
 		xdp_ring->tx_stats.tx_busy++;
@@ -3684,7 +3688,7 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
  *
  * Returns Zero if sent, else an error code
  **/
-int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
+int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
 {
 	struct i40e_netdev_priv *np = netdev_priv(dev);
 	unsigned int queue_index = smp_processor_id();
@@ -3697,7 +3701,7 @@ int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
 	if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
 		return -ENXIO;
 
-	err = i40e_xmit_xdp_ring(xdp, vsi->xdp_rings[queue_index]);
+	err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
 	if (err != I40E_XDP_TX)
 		return -ENOSPC;
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index 857b1d743c8d..4bf318b8be85 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -511,7 +511,7 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw);
 void i40e_detect_recover_hung(struct i40e_vsi *vsi);
 int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
 bool __i40e_chk_linearize(struct sk_buff *skb);
-int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp);
+int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf);
 void i40e_xdp_flush(struct net_device *dev);
 
 /**
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 4f2864165723..51e7d82a5860 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -2262,7 +2262,7 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring,
 #define IXGBE_XDP_TX 2
 
 static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
-			       struct xdp_buff *xdp);
+			       struct xdp_frame *xdpf);
 
 static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter,
 				     struct ixgbe_ring *rx_ring,
@@ -2270,6 +2270,7 @@ static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter,
 {
 	int err, result = IXGBE_XDP_PASS;
 	struct bpf_prog *xdp_prog;
+	struct xdp_frame *xdpf;
 	u32 act;
 
 	rcu_read_lock();
@@ -2278,12 +2279,19 @@ static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter,
 	if (!xdp_prog)
 		goto xdp_out;
 
+	prefetchw(xdp->data_hard_start); /* xdp_frame write */
+
 	act = bpf_prog_run_xdp(xdp_prog, xdp);
 	switch (act) {
 	case XDP_PASS:
 		break;
 	case XDP_TX:
-		result = ixgbe_xmit_xdp_ring(adapter, xdp);
+		xdpf = convert_to_xdp_frame(xdp);
+		if (unlikely(!xdpf)) {
+			result = IXGBE_XDP_CONSUMED;
+			break;
+		}
+		result = ixgbe_xmit_xdp_ring(adapter, xdpf);
 		break;
 	case XDP_REDIRECT:
 		err = xdp_do_redirect(adapter->netdev, xdp, xdp_prog);
@@ -2386,7 +2394,6 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 			xdp.data_hard_start = xdp.data -
 					      ixgbe_rx_offset(rx_ring);
 			xdp.data_end = xdp.data + size;
-			prefetchw(xdp.data_hard_start); /* xdp_frame write */
 
 			skb = ixgbe_run_xdp(adapter, rx_ring, &xdp);
 		}
@@ -8344,20 +8351,15 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb,
 }
 
 static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
-			       struct xdp_buff *xdp)
+			       struct xdp_frame *xdpf)
 {
 	struct ixgbe_ring *ring = adapter->xdp_ring[smp_processor_id()];
 	struct ixgbe_tx_buffer *tx_buffer;
 	union ixgbe_adv_tx_desc *tx_desc;
-	struct xdp_frame *xdpf;
 	u32 len, cmd_type;
 	dma_addr_t dma;
 	u16 i;
 
-	xdpf = convert_to_xdp_frame(xdp);
-	if (unlikely(!xdpf))
-		return -EOVERFLOW;
-
 	len = xdpf->len;
 
 	if (unlikely(!ixgbe_desc_unused(ring)))
@@ -10010,7 +10012,7 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 	}
 }
 
-static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
+static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
 {
 	struct ixgbe_adapter *adapter = netdev_priv(dev);
 	struct ixgbe_ring *ring;
@@ -10026,7 +10028,7 @@ static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
 	if (unlikely(!ring))
 		return -ENXIO;
 
-	err = ixgbe_xmit_xdp_ring(adapter, xdp);
+	err = ixgbe_xmit_xdp_ring(adapter, xdpf);
 	if (err != IXGBE_XDP_TX)
 		return -ENOSPC;
 
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index bec130cdbd9d..1e58be152d5c 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1301,18 +1301,13 @@ static const struct net_device_ops tun_netdev_ops = {
 	.ndo_get_stats64	= tun_net_get_stats64,
 };
 
-static int tun_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
+static int tun_xdp_xmit(struct net_device *dev, struct xdp_frame *frame)
 {
 	struct tun_struct *tun = netdev_priv(dev);
-	struct xdp_frame *frame;
 	struct tun_file *tfile;
 	u32 numqueues;
 	int ret = 0;
 
-	frame = convert_to_xdp_frame(xdp);
-	if (unlikely(!frame))
-		return -EOVERFLOW;
-
 	rcu_read_lock();
 
 	numqueues = READ_ONCE(tun->numqueues);
@@ -1336,6 +1331,16 @@ out:
 	return ret;
 }
 
+static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
+{
+	struct xdp_frame *frame = convert_to_xdp_frame(xdp);
+
+	if (unlikely(!frame))
+		return -EOVERFLOW;
+
+	return tun_xdp_xmit(dev, frame);
+}
+
 static void tun_xdp_flush(struct net_device *dev)
 {
 	struct tun_struct *tun = netdev_priv(dev);
@@ -1683,7 +1688,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 		case XDP_TX:
 			get_page(alloc_frag->page);
 			alloc_frag->offset += buflen;
-			if (tun_xdp_xmit(tun->dev, &xdp))
+			if (tun_xdp_tx(tun->dev, &xdp))
 				goto err_redirect;
 			tun_xdp_flush(tun->dev);
 			rcu_read_unlock();
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index ab3d7cbc4c49..01694e26f03e 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -416,10 +416,10 @@ static void virtnet_xdp_flush(struct net_device *dev)
 }
 
 static int __virtnet_xdp_xmit(struct virtnet_info *vi,
-			      struct xdp_buff *xdp)
+			       struct xdp_frame *xdpf)
 {
 	struct virtio_net_hdr_mrg_rxbuf *hdr;
-	struct xdp_frame *xdpf, *xdpf_sent;
+	struct xdp_frame *xdpf_sent;
 	struct send_queue *sq;
 	unsigned int len;
 	unsigned int qp;
@@ -432,10 +432,6 @@ static int __virtnet_xdp_xmit(struct virtnet_info *vi,
 	while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
 		xdp_return_frame(xdpf_sent);
 
-	xdpf = convert_to_xdp_frame(xdp);
-	if (unlikely(!xdpf))
-		return -EOVERFLOW;
-
 	/* virtqueue want to use data area in-front of packet */
 	if (unlikely(xdpf->metasize > 0))
 		return -EOPNOTSUPP;
@@ -459,7 +455,7 @@ static int __virtnet_xdp_xmit(struct virtnet_info *vi,
 	return 0;
 }
 
-static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
+static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 	struct receive_queue *rq = vi->rq;
@@ -472,7 +468,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
 	if (!xdp_prog)
 		return -ENXIO;
 
-	return __virtnet_xdp_xmit(vi, xdp);
+	return __virtnet_xdp_xmit(vi, xdpf);
 }
 
 static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
@@ -569,6 +565,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
 	xdp_prog = rcu_dereference(rq->xdp_prog);
 	if (xdp_prog) {
 		struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
+		struct xdp_frame *xdpf;
 		struct xdp_buff xdp;
 		void *orig_data;
 		u32 act;
@@ -611,7 +608,10 @@ static struct sk_buff *receive_small(struct net_device *dev,
 			delta = orig_data - xdp.data;
 			break;
 		case XDP_TX:
-			err = __virtnet_xdp_xmit(vi, &xdp);
+			xdpf = convert_to_xdp_frame(&xdp);
+			if (unlikely(!xdpf))
+				goto err_xdp;
+			err = __virtnet_xdp_xmit(vi, xdpf);
 			if (unlikely(err)) {
 				trace_xdp_exception(vi->dev, xdp_prog, act);
 				goto err_xdp;
@@ -702,6 +702,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 	rcu_read_lock();
 	xdp_prog = rcu_dereference(rq->xdp_prog);
 	if (xdp_prog) {
+		struct xdp_frame *xdpf;
 		struct page *xdp_page;
 		struct xdp_buff xdp;
 		void *data;
@@ -766,7 +767,10 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			}
 			break;
 		case XDP_TX:
-			err = __virtnet_xdp_xmit(vi, &xdp);
+			xdpf = convert_to_xdp_frame(&xdp);
+			if (unlikely(!xdpf))
+				goto err_xdp;
+			err = __virtnet_xdp_xmit(vi, xdpf);
 			if (unlikely(err)) {
 				trace_xdp_exception(vi->dev, xdp_prog, act);
 				if (unlikely(xdp_page != page))
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cf44503ea81a..14e0777ffcfb 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1165,7 +1165,7 @@ struct dev_ifalias {
  *	This function is used to set or query state related to XDP on the
  *	netdevice and manage BPF offload. See definition of
  *	enum bpf_netdev_command for details.
- * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp);
+ * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_frame *xdp);
  *	This function is used to submit a XDP packet for transmit on a
  *	netdevice.
  * void (*ndo_xdp_flush)(struct net_device *dev);
@@ -1356,7 +1356,7 @@ struct net_device_ops {
 	int			(*ndo_bpf)(struct net_device *dev,
 					   struct netdev_bpf *bpf);
 	int			(*ndo_xdp_xmit)(struct net_device *dev,
-						struct xdp_buff *xdp);
+						struct xdp_frame *xdp);
 	void			(*ndo_xdp_flush)(struct net_device *dev);
 };
 
diff --git a/net/core/filter.c b/net/core/filter.c
index d31aff93270d..3bb0cb98a9be 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2749,13 +2749,18 @@ static int __bpf_tx_xdp(struct net_device *dev,
 			struct xdp_buff *xdp,
 			u32 index)
 {
+	struct xdp_frame *xdpf;
 	int err;
 
 	if (!dev->netdev_ops->ndo_xdp_xmit) {
 		return -EOPNOTSUPP;
 	}
 
-	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
+	xdpf = convert_to_xdp_frame(xdp);
+	if (unlikely(!xdpf))
+		return -EOVERFLOW;
+
+	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
 	if (err)
 		return err;
 	dev->netdev_ops->ndo_xdp_flush(dev);
@@ -2771,11 +2776,19 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
 
 	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
 		struct net_device *dev = fwd;
+		struct xdp_frame *xdpf;
 
 		if (!dev->netdev_ops->ndo_xdp_xmit)
 			return -EOPNOTSUPP;
 
-		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
+		xdpf = convert_to_xdp_frame(xdp);
+		if (unlikely(!xdpf))
+			return -EOVERFLOW;
+
+		/* TODO: move to inside map code instead, for bulk support
+		 * err = dev_map_enqueue(dev, xdp);
+		 */
+		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
 		if (err)
 			return err;
 		__dev_map_insert_ctx(map, index);
-- 
cgit v1.2.3-59-g8ed1b


From 6dfb970d3dbd6bf274c4bf003ca8ab4a51dc66c7 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 17 Apr 2018 16:46:43 +0200
Subject: xdp: avoid leaking info stored in frame data on page reuse

The bpf infrastructure and verifier goes to great length to avoid
bpf progs leaking kernel (pointer) info.

For queueing an xdp_buff via XDP_REDIRECT, xdp_frame info stores
kernel info (incl pointers) in top part of frame data (xdp->data_hard_start).
Checks are in place to assure enough headroom is available for this.

This info is not cleared, and if the frame is reused, then a
malicious user could use bpf_xdp_adjust_head helper to move
xdp->data into this area.  Thus, making this area readable.

This is not super critical as XDP progs requires root or
CAP_SYS_ADMIN, which are privileged enough for such info.  An
effort (is underway) towards moving networking bpf hooks to the
lesser privileged mode CAP_NET_ADMIN, where leaking such info
should be avoided.  Thus, this patch to clear the info when
needed.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/filter.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/net/core/filter.c b/net/core/filter.c
index 3bb0cb98a9be..a374b8560bc4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2692,6 +2692,7 @@ static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
 
 BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
 {
+	void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
 	unsigned long metalen = xdp_get_metalen(xdp);
 	void *data_start = xdp->data_hard_start + metalen;
 	void *data = xdp->data + offset;
@@ -2700,6 +2701,13 @@ BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
 		     data > xdp->data_end - ETH_HLEN))
 		return -EINVAL;
 
+	/* Avoid info leak, when reusing area prev used by xdp_frame */
+	if (data < xdp_frame_end) {
+		unsigned long clearlen = xdp_frame_end - data;
+
+		memset(data, 0, clearlen);
+	}
+
 	if (metalen)
 		memmove(xdp->data_meta + offset,
 			xdp->data_meta, metalen);
-- 
cgit v1.2.3-59-g8ed1b


From 032234d8231909ac049f22ea3b408487e1c103eb Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 17 Apr 2018 10:00:39 -0700
Subject: net/ipv6: Make __inet6_bind static

BPF core gets access to __inet6_bind via ipv6_bpf_stub_impl, so it is
not invoked directly outside of af_inet6.c. Make it static and move
inet6_bind after to avoid forward declaration.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h  |  2 --
 net/ipv6/af_inet6.c | 53 ++++++++++++++++++++++++++---------------------------
 2 files changed, 26 insertions(+), 29 deletions(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 836f31af1369..68b167d98879 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -1044,8 +1044,6 @@ void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info);
 void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu);
 
 int inet6_release(struct socket *sock);
-int __inet6_bind(struct sock *sock, struct sockaddr *uaddr, int addr_len,
-		 bool force_bind_address_no_port, bool with_lock);
 int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
 int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
 		  int peer);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 2c694912df2e..36d622c477b1 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -273,33 +273,8 @@ out_rcu_unlock:
 	goto out;
 }
 
-
-/* bind for INET6 API */
-int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
-{
-	struct sock *sk = sock->sk;
-	int err = 0;
-
-	/* If the socket has its own bind function then use it. */
-	if (sk->sk_prot->bind)
-		return sk->sk_prot->bind(sk, uaddr, addr_len);
-
-	if (addr_len < SIN6_LEN_RFC2133)
-		return -EINVAL;
-
-	/* BPF prog is run before any checks are done so that if the prog
-	 * changes context in a wrong way it will be caught.
-	 */
-	err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr);
-	if (err)
-		return err;
-
-	return __inet6_bind(sk, uaddr, addr_len, false, true);
-}
-EXPORT_SYMBOL(inet6_bind);
-
-int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
-		 bool force_bind_address_no_port, bool with_lock)
+static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
+			bool force_bind_address_no_port, bool with_lock)
 {
 	struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr;
 	struct inet_sock *inet = inet_sk(sk);
@@ -444,6 +419,30 @@ out_unlock:
 	goto out;
 }
 
+/* bind for INET6 API */
+int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+
+	/* If the socket has its own bind function then use it. */
+	if (sk->sk_prot->bind)
+		return sk->sk_prot->bind(sk, uaddr, addr_len);
+
+	if (addr_len < SIN6_LEN_RFC2133)
+		return -EINVAL;
+
+	/* BPF prog is run before any checks are done so that if the prog
+	 * changes context in a wrong way it will be caught.
+	 */
+	err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr);
+	if (err)
+		return err;
+
+	return __inet6_bind(sk, uaddr, addr_len, false, true);
+}
+EXPORT_SYMBOL(inet6_bind);
+
 int inet6_release(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
-- 
cgit v1.2.3-59-g8ed1b


From bdb7cc643fc9db8d6ed9a2b9e524e27ac5882029 Mon Sep 17 00:00:00 2001
From: Stephen Suryaputra <ssuryaextr@gmail.com>
Date: Mon, 16 Apr 2018 13:42:16 -0400
Subject: ipv6: Count interface receive statistics on the ingress netdev

The statistics such as InHdrErrors should be counted on the ingress
netdev rather than on the dev from the dst, which is the egress.

Signed-off-by: Stephen Suryaputra <ssuryaextr@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/addrconf.h          | 14 +++++++++++
 net/ipv6/exthdrs.c              | 55 ++++++++++++++++-------------------------
 net/ipv6/ip6_input.c            |  2 +-
 net/ipv6/ip6_output.c           | 18 ++++++--------
 net/ipv6/reassembly.c           |  6 ++---
 net/ipv6/route.c                |  3 ++-
 net/netfilter/ipvs/ip_vs_xmit.c |  5 ++--
 7 files changed, 51 insertions(+), 52 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 378d601258be..8312cc25a3af 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -307,6 +307,20 @@ static inline struct inet6_dev *__in6_dev_get(const struct net_device *dev)
 	return rcu_dereference_rtnl(dev->ip6_ptr);
 }
 
+/**
+ * __in6_dev_get_safely - get inet6_dev pointer from netdevice
+ * @dev: network device
+ *
+ * This is a safer version of __in6_dev_get
+ */
+static inline struct inet6_dev *__in6_dev_get_safely(const struct net_device *dev)
+{
+	if (likely(dev))
+		return rcu_dereference_rtnl(dev->ip6_ptr);
+	else
+		return NULL;
+}
+
 /**
  * in6_dev_get - get inet6_dev pointer from netdevice
  * @dev: network device
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index bc68eb661970..5bc2bf3733ab 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -280,6 +280,7 @@ static const struct tlvtype_proc tlvprocdestopt_lst[] = {
 
 static int ipv6_destopt_rcv(struct sk_buff *skb)
 {
+	struct inet6_dev *idev = __in6_dev_get(skb->dev);
 	struct inet6_skb_parm *opt = IP6CB(skb);
 #if IS_ENABLED(CONFIG_IPV6_MIP6)
 	__u16 dstbuf;
@@ -291,7 +292,7 @@ static int ipv6_destopt_rcv(struct sk_buff *skb)
 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) ||
 	    !pskb_may_pull(skb, (skb_transport_offset(skb) +
 				 ((skb_transport_header(skb)[1] + 1) << 3)))) {
-		__IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
+		__IP6_INC_STATS(dev_net(dst->dev), idev,
 				IPSTATS_MIB_INHDRERRORS);
 fail_and_free:
 		kfree_skb(skb);
@@ -319,8 +320,7 @@ fail_and_free:
 		return 1;
 	}
 
-	__IP6_INC_STATS(dev_net(dst->dev),
-			ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
+	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 	return -1;
 }
 
@@ -416,8 +416,7 @@ looped_back:
 	}
 
 	if (hdr->segments_left >= (hdr->hdrlen >> 1)) {
-		__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
-				IPSTATS_MIB_INHDRERRORS);
+		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 		icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
 				  ((&hdr->segments_left) -
 				   skb_network_header(skb)));
@@ -456,8 +455,7 @@ looped_back:
 
 	if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) {
 		if (ipv6_hdr(skb)->hop_limit <= 1) {
-			__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
-					IPSTATS_MIB_INHDRERRORS);
+			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 			icmpv6_send(skb, ICMPV6_TIME_EXCEED,
 				    ICMPV6_EXC_HOPLIMIT, 0);
 			kfree_skb(skb);
@@ -481,10 +479,10 @@ looped_back:
 /* called with rcu_read_lock() */
 static int ipv6_rthdr_rcv(struct sk_buff *skb)
 {
+	struct inet6_dev *idev = __in6_dev_get(skb->dev);
 	struct inet6_skb_parm *opt = IP6CB(skb);
 	struct in6_addr *addr = NULL;
 	struct in6_addr daddr;
-	struct inet6_dev *idev;
 	int n, i;
 	struct ipv6_rt_hdr *hdr;
 	struct rt0_hdr *rthdr;
@@ -498,8 +496,7 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb)
 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) ||
 	    !pskb_may_pull(skb, (skb_transport_offset(skb) +
 				 ((skb_transport_header(skb)[1] + 1) << 3)))) {
-		__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
-				IPSTATS_MIB_INHDRERRORS);
+		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 		kfree_skb(skb);
 		return -1;
 	}
@@ -508,8 +505,7 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb)
 
 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) ||
 	    skb->pkt_type != PACKET_HOST) {
-		__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
-				IPSTATS_MIB_INADDRERRORS);
+		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
 		kfree_skb(skb);
 		return -1;
 	}
@@ -527,7 +523,7 @@ looped_back:
 			 * processed by own
 			 */
 			if (!addr) {
-				__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+				__IP6_INC_STATS(net, idev,
 						IPSTATS_MIB_INADDRERRORS);
 				kfree_skb(skb);
 				return -1;
@@ -553,8 +549,7 @@ looped_back:
 			goto unknown_rh;
 		/* Silently discard invalid RTH type 2 */
 		if (hdr->hdrlen != 2 || hdr->segments_left != 1) {
-			__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
-					IPSTATS_MIB_INHDRERRORS);
+			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 			kfree_skb(skb);
 			return -1;
 		}
@@ -572,8 +567,7 @@ looped_back:
 	n = hdr->hdrlen >> 1;
 
 	if (hdr->segments_left > n) {
-		__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
-				IPSTATS_MIB_INHDRERRORS);
+		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 		icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
 				  ((&hdr->segments_left) -
 				   skb_network_header(skb)));
@@ -609,14 +603,12 @@ looped_back:
 		if (xfrm6_input_addr(skb, (xfrm_address_t *)addr,
 				     (xfrm_address_t *)&ipv6_hdr(skb)->saddr,
 				     IPPROTO_ROUTING) < 0) {
-			__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
-					IPSTATS_MIB_INADDRERRORS);
+			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
 			kfree_skb(skb);
 			return -1;
 		}
 		if (!ipv6_chk_home_addr(dev_net(skb_dst(skb)->dev), addr)) {
-			__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
-					IPSTATS_MIB_INADDRERRORS);
+			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
 			kfree_skb(skb);
 			return -1;
 		}
@@ -627,8 +619,7 @@ looped_back:
 	}
 
 	if (ipv6_addr_is_multicast(addr)) {
-		__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
-				IPSTATS_MIB_INADDRERRORS);
+		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
 		kfree_skb(skb);
 		return -1;
 	}
@@ -647,8 +638,7 @@ looped_back:
 
 	if (skb_dst(skb)->dev->flags&IFF_LOOPBACK) {
 		if (ipv6_hdr(skb)->hop_limit <= 1) {
-			__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
-					IPSTATS_MIB_INHDRERRORS);
+			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 			icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
 				    0);
 			kfree_skb(skb);
@@ -663,7 +653,7 @@ looped_back:
 	return -1;
 
 unknown_rh:
-	__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS);
+	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 	icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
 			  (&hdr->type) - skb_network_header(skb));
 	return -1;
@@ -755,34 +745,31 @@ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff)
 static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff)
 {
 	const unsigned char *nh = skb_network_header(skb);
+	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
 	struct net *net = ipv6_skb_net(skb);
 	u32 pkt_len;
 
 	if (nh[optoff + 1] != 4 || (optoff & 3) != 2) {
 		net_dbg_ratelimited("ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n",
 				    nh[optoff+1]);
-		__IP6_INC_STATS(net, ipv6_skb_idev(skb),
-				IPSTATS_MIB_INHDRERRORS);
+		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 		goto drop;
 	}
 
 	pkt_len = ntohl(*(__be32 *)(nh + optoff + 2));
 	if (pkt_len <= IPV6_MAXPLEN) {
-		__IP6_INC_STATS(net, ipv6_skb_idev(skb),
-				IPSTATS_MIB_INHDRERRORS);
+		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 		icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2);
 		return false;
 	}
 	if (ipv6_hdr(skb)->payload_len) {
-		__IP6_INC_STATS(net, ipv6_skb_idev(skb),
-				IPSTATS_MIB_INHDRERRORS);
+		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 		icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff);
 		return false;
 	}
 
 	if (pkt_len > skb->len - sizeof(struct ipv6hdr)) {
-		__IP6_INC_STATS(net, ipv6_skb_idev(skb),
-				IPSTATS_MIB_INTRUNCATEDPKTS);
+		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTRUNCATEDPKTS);
 		goto drop;
 	}
 
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 9ee208a348f5..f08d34491ece 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -336,7 +336,7 @@ int ip6_mc_input(struct sk_buff *skb)
 	bool deliver;
 
 	__IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev),
-			 ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INMCAST,
+			 __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST,
 			 skb->len);
 
 	hdr = ipv6_hdr(skb);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 2e891d2c30ef..a39b04f9fa6e 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -425,6 +425,7 @@ static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 
 int ip6_forward(struct sk_buff *skb)
 {
+	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
 	struct dst_entry *dst = skb_dst(skb);
 	struct ipv6hdr *hdr = ipv6_hdr(skb);
 	struct inet6_skb_parm *opt = IP6CB(skb);
@@ -444,8 +445,7 @@ int ip6_forward(struct sk_buff *skb)
 		goto drop;
 
 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
-		__IP6_INC_STATS(net, ip6_dst_idev(dst),
-				IPSTATS_MIB_INDISCARDS);
+		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 		goto drop;
 	}
 
@@ -476,8 +476,7 @@ int ip6_forward(struct sk_buff *skb)
 		/* Force OUTPUT device used as source address */
 		skb->dev = dst->dev;
 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
-		__IP6_INC_STATS(net, ip6_dst_idev(dst),
-				IPSTATS_MIB_INHDRERRORS);
+		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 
 		kfree_skb(skb);
 		return -ETIMEDOUT;
@@ -490,15 +489,13 @@ int ip6_forward(struct sk_buff *skb)
 		if (proxied > 0)
 			return ip6_input(skb);
 		else if (proxied < 0) {
-			__IP6_INC_STATS(net, ip6_dst_idev(dst),
-					IPSTATS_MIB_INDISCARDS);
+			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 			goto drop;
 		}
 	}
 
 	if (!xfrm6_route_forward(skb)) {
-		__IP6_INC_STATS(net, ip6_dst_idev(dst),
-				IPSTATS_MIB_INDISCARDS);
+		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 		goto drop;
 	}
 	dst = skb_dst(skb);
@@ -554,8 +551,7 @@ int ip6_forward(struct sk_buff *skb)
 		/* Again, force OUTPUT device used as source address */
 		skb->dev = dst->dev;
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
-		__IP6_INC_STATS(net, ip6_dst_idev(dst),
-				IPSTATS_MIB_INTOOBIGERRORS);
+		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
 				IPSTATS_MIB_FRAGFAILS);
 		kfree_skb(skb);
@@ -579,7 +575,7 @@ int ip6_forward(struct sk_buff *skb)
 		       ip6_forward_finish);
 
 error:
-	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
+	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
 drop:
 	kfree_skb(skb);
 	return -EINVAL;
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 4979610287e2..2cdf3dcf8c2c 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -179,7 +179,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
 			((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1)));
 
 	if ((unsigned int)end > IPV6_MAXPLEN) {
-		__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+		__IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev),
 				IPSTATS_MIB_INHDRERRORS);
 		icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
 				  ((u8 *)&fhdr->frag_off -
@@ -214,7 +214,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
 			/* RFC2460 says always send parameter problem in
 			 * this case. -DaveM
 			 */
-			__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+			__IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev),
 					IPSTATS_MIB_INHDRERRORS);
 			icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
 					  offsetof(struct ipv6hdr, payload_len));
@@ -536,7 +536,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 	return -1;
 
 fail_hdr:
-	__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+	__IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev),
 			IPSTATS_MIB_INHDRERRORS);
 	icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb_network_header_len(skb));
 	return -1;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 49b954d6d0fa..1d738bfe893b 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3541,7 +3541,8 @@ static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
 	case IPSTATS_MIB_INNOROUTES:
 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
 		if (type == IPV6_ADDR_ANY) {
-			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
+			IP6_INC_STATS(dev_net(dst->dev),
+				      __in6_dev_get_safely(skb->dev),
 				      IPSTATS_MIB_INADDRERRORS);
 			break;
 		}
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 4527921b1c3a..ba0a0fd045c8 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -266,12 +266,13 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs,
 
 		/* check and decrement ttl */
 		if (ipv6_hdr(skb)->hop_limit <= 1) {
+			struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
+
 			/* Force OUTPUT device used as source address */
 			skb->dev = dst->dev;
 			icmpv6_send(skb, ICMPV6_TIME_EXCEED,
 				    ICMPV6_EXC_HOPLIMIT, 0);
-			__IP6_INC_STATS(net, ip6_dst_idev(dst),
-					IPSTATS_MIB_INHDRERRORS);
+			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 
 			return false;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From e3c1917e45d030411888285f56c9d42e7472cb1f Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Mon, 16 Apr 2018 22:59:26 +0200
Subject: selftest: tc_flower: add testcase for 'ip_flags'

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../testing/selftests/net/forwarding/tc_flower.sh  | 70 ++++++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/tools/testing/selftests/net/forwarding/tc_flower.sh b/tools/testing/selftests/net/forwarding/tc_flower.sh
index 032b882adfc0..0c54059f1875 100755
--- a/tools/testing/selftests/net/forwarding/tc_flower.sh
+++ b/tools/testing/selftests/net/forwarding/tc_flower.sh
@@ -149,6 +149,74 @@ match_src_ip_test()
 	log_test "src_ip match ($tcflags)"
 }
 
+match_ip_flags_test()
+{
+	RET=0
+
+	tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+		$tcflags ip_flags frag action continue
+	tc filter add dev $h2 ingress protocol ip pref 2 handle 102 flower \
+		$tcflags ip_flags firstfrag action continue
+	tc filter add dev $h2 ingress protocol ip pref 3 handle 103 flower \
+		$tcflags ip_flags nofirstfrag action continue
+	tc filter add dev $h2 ingress protocol ip pref 4 handle 104 flower \
+		$tcflags ip_flags nofrag action drop
+
+	$MZ $h1 -c 1 -p 1000 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+		-t ip "frag=0" -q
+
+	tc_check_packets "dev $h2 ingress" 101 1
+	check_fail $? "Matched on wrong frag filter (nofrag)"
+
+	tc_check_packets "dev $h2 ingress" 102 1
+	check_fail $? "Matched on wrong firstfrag filter (nofrag)"
+
+	tc_check_packets "dev $h2 ingress" 103 1
+	check_err $? "Did not match on nofirstfrag filter (nofrag) "
+
+	tc_check_packets "dev $h2 ingress" 104 1
+	check_err $? "Did not match on nofrag filter (nofrag)"
+
+	$MZ $h1 -c 1 -p 1000 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+		-t ip "frag=0,mf" -q
+
+	tc_check_packets "dev $h2 ingress" 101 1
+	check_err $? "Did not match on frag filter (1stfrag)"
+
+	tc_check_packets "dev $h2 ingress" 102 1
+	check_err $? "Did not match fistfrag filter (1stfrag)"
+
+	tc_check_packets "dev $h2 ingress" 103 1
+	check_err $? "Matched on wrong nofirstfrag filter (1stfrag)"
+
+	tc_check_packets "dev $h2 ingress" 104 1
+	check_err $? "Match on wrong nofrag filter (1stfrag)"
+
+	$MZ $h1 -c 1 -p 1000 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+		-t ip "frag=256,mf" -q
+	$MZ $h1 -c 1 -p 1000 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+		-t ip "frag=256" -q
+
+	tc_check_packets "dev $h2 ingress" 101 3
+	check_err $? "Did not match on frag filter (no1stfrag)"
+
+	tc_check_packets "dev $h2 ingress" 102 1
+	check_err $? "Matched on wrong firstfrag filter (no1stfrag)"
+
+	tc_check_packets "dev $h2 ingress" 103 3
+	check_err $? "Did not match on nofirstfrag filter (no1stfrag)"
+
+	tc_check_packets "dev $h2 ingress" 104 1
+	check_err $? "Matched on nofrag filter (no1stfrag)"
+
+	tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+	tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower
+	tc filter del dev $h2 ingress protocol ip pref 3 handle 103 flower
+	tc filter del dev $h2 ingress protocol ip pref 4 handle 104 flower
+
+	log_test "ip_flags match ($tcflags)"
+}
+
 setup_prepare()
 {
 	h1=${NETIFS[p1]}
@@ -181,6 +249,7 @@ match_dst_mac_test
 match_src_mac_test
 match_dst_ip_test
 match_src_ip_test
+match_ip_flags_test
 
 tc_offload_check
 if [[ $? -ne 0 ]]; then
@@ -191,6 +260,7 @@ else
 	match_src_mac_test
 	match_dst_ip_test
 	match_src_ip_test
+	match_ip_flags_test
 fi
 
 exit $EXIT_STATUS
-- 
cgit v1.2.3-59-g8ed1b


From c210f7b411790ee0fc4252547d645c5d8648adc5 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 16 Apr 2018 14:29:22 -0700
Subject: KEYS: DNS: limit the length of option strings

Adding a dns_resolver key whose payload contains a very long option name
resulted in that string being printed in full.  This hit the WARN_ONCE()
in set_precision() during the printk(), because printk() only supports a
precision of up to 32767 bytes:

    precision 1000000 too large
    WARNING: CPU: 0 PID: 752 at lib/vsprintf.c:2189 vsnprintf+0x4bc/0x5b0

Fix it by limiting option strings (combined name + value) to a much more
reasonable 128 bytes.  The exact limit is arbitrary, but currently the
only recognized option is formatted as "dnserror=%lu" which fits well
within this limit.

Also ratelimit the printks.

Reproducer:

    perl -e 'print "#", "A" x 1000000, "\x00"' | keyctl padd dns_resolver desc @s

This bug was found using syzkaller.

Reported-by: Mark Rutland <mark.rutland@arm.com>
Fixes: 4a2d789267e0 ("DNS: If the DNS server returns an error, allow that to be cached [ver #2]")
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dns_resolver/dns_key.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index 8396705deffc..40c851693f77 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -91,9 +91,9 @@ dns_resolver_preparse(struct key_preparsed_payload *prep)
 
 			next_opt = memchr(opt, '#', end - opt) ?: end;
 			opt_len = next_opt - opt;
-			if (!opt_len) {
-				printk(KERN_WARNING
-				       "Empty option to dns_resolver key\n");
+			if (opt_len <= 0 || opt_len > 128) {
+				pr_warn_ratelimited("Invalid option length (%d) for dns_resolver key\n",
+						    opt_len);
 				return -EINVAL;
 			}
 
@@ -127,10 +127,8 @@ dns_resolver_preparse(struct key_preparsed_payload *prep)
 			}
 
 		bad_option_value:
-			printk(KERN_WARNING
-			       "Option '%*.*s' to dns_resolver key:"
-			       " bad/missing value\n",
-			       opt_nlen, opt_nlen, opt);
+			pr_warn_ratelimited("Option '%*.*s' to dns_resolver key: bad/missing value\n",
+					    opt_nlen, opt_nlen, opt);
 			return -EINVAL;
 		} while (opt = next_opt + 1, opt < end);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 062b3e1b6d4f2a33c1d0fd7ae9b4550da5cf7e4b Mon Sep 17 00:00:00 2001
From: Samuel Mendoza-Jonas <sam@mendozajonas.com>
Date: Tue, 17 Apr 2018 14:23:23 +1000
Subject: net/ncsi: Refactor MAC, VLAN filters

The NCSI driver defines a generic ncsi_channel_filter struct that can be
used to store arbitrarily formatted filters, and several generic methods
of accessing data stored in such a filter.
However in both the driver and as defined in the NCSI specification
there are only two actual filters: VLAN ID filters and MAC address
filters. The splitting of the MAC filter into unicast, multicast, and
mixed is also technically not necessary as these are stored in the same
location in hardware.

To save complexity, particularly in the set up and accessing of these
generic filters, remove them in favour of two specific structs. These
can be acted on directly and do not need several generic helper
functions to use.

This also fixes a memory error found by KASAN on ARM32 (which is not
upstream yet), where response handlers accessing a filter's data field
could write past allocated memory.

[  114.926512] ==================================================================
[  114.933861] BUG: KASAN: slab-out-of-bounds in ncsi_configure_channel+0x4b8/0xc58
[  114.941304] Read of size 2 at addr 94888558 by task kworker/0:2/546
[  114.947593]
[  114.949146] CPU: 0 PID: 546 Comm: kworker/0:2 Not tainted 4.16.0-rc6-00119-ge156398bfcad #13
...
[  115.170233] The buggy address belongs to the object at 94888540
[  115.170233]  which belongs to the cache kmalloc-32 of size 32
[  115.181917] The buggy address is located 24 bytes inside of
[  115.181917]  32-byte region [94888540, 94888560)
[  115.192115] The buggy address belongs to the page:
[  115.196943] page:9eeac100 count:1 mapcount:0 mapping:94888000 index:0x94888fc1
[  115.204200] flags: 0x100(slab)
[  115.207330] raw: 00000100 94888000 94888fc1 0000003f 00000001 9eea2014 9eecaa74 96c003e0
[  115.215444] page dumped because: kasan: bad access detected
[  115.221036]
[  115.222544] Memory state around the buggy address:
[  115.227384]  94888400: fb fb fb fb fc fc fc fc 04 fc fc fc fc fc fc fc
[  115.233959]  94888480: 00 00 00 fc fc fc fc fc 00 04 fc fc fc fc fc fc
[  115.240529] >94888500: 00 00 04 fc fc fc fc fc 00 00 04 fc fc fc fc fc
[  115.247077]                                             ^
[  115.252523]  94888580: 00 04 fc fc fc fc fc fc 06 fc fc fc fc fc fc fc
[  115.259093]  94888600: 00 00 06 fc fc fc fc fc 00 00 04 fc fc fc fc fc
[  115.265639] ==================================================================

Reported-by: Joel Stanley <joel@jms.id.au>
Signed-off-by: Samuel Mendoza-Jonas <sam@mendozajonas.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ncsi/internal.h     |  34 ++++----
 net/ncsi/ncsi-manage.c  | 226 +++++++++++-------------------------------------
 net/ncsi/ncsi-netlink.c |  20 ++---
 net/ncsi/ncsi-rsp.c     | 178 ++++++++++++++++----------------------
 4 files changed, 147 insertions(+), 311 deletions(-)

diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
index 8da84312cd3b..8055e3965cef 100644
--- a/net/ncsi/internal.h
+++ b/net/ncsi/internal.h
@@ -68,15 +68,6 @@ enum {
 	NCSI_MODE_MAX
 };
 
-enum {
-	NCSI_FILTER_BASE	= 0,
-	NCSI_FILTER_VLAN	= 0,
-	NCSI_FILTER_UC,
-	NCSI_FILTER_MC,
-	NCSI_FILTER_MIXED,
-	NCSI_FILTER_MAX
-};
-
 struct ncsi_channel_version {
 	u32 version;		/* Supported BCD encoded NCSI version */
 	u32 alpha2;		/* Supported BCD encoded NCSI version */
@@ -98,11 +89,18 @@ struct ncsi_channel_mode {
 	u32 data[8];	/* Data entries                */
 };
 
-struct ncsi_channel_filter {
-	u32 index;	/* Index of channel filters          */
-	u32 total;	/* Total entries in the filter table */
-	u64 bitmap;	/* Bitmap of valid entries           */
-	u32 data[];	/* Data for the valid entries        */
+struct ncsi_channel_mac_filter {
+	u8	n_uc;
+	u8	n_mc;
+	u8	n_mixed;
+	u64	bitmap;
+	unsigned char	*addrs;
+};
+
+struct ncsi_channel_vlan_filter {
+	u8	n_vids;
+	u64	bitmap;
+	u16	*vids;
 };
 
 struct ncsi_channel_stats {
@@ -186,7 +184,9 @@ struct ncsi_channel {
 	struct ncsi_channel_version version;
 	struct ncsi_channel_cap	    caps[NCSI_CAP_MAX];
 	struct ncsi_channel_mode    modes[NCSI_MODE_MAX];
-	struct ncsi_channel_filter  *filters[NCSI_FILTER_MAX];
+	/* Filtering Settings */
+	struct ncsi_channel_mac_filter	mac_filter;
+	struct ncsi_channel_vlan_filter	vlan_filter;
 	struct ncsi_channel_stats   stats;
 	struct {
 		struct timer_list   timer;
@@ -320,10 +320,6 @@ extern spinlock_t ncsi_dev_lock;
 	list_for_each_entry_rcu(nc, &np->channels, node)
 
 /* Resources */
-u32 *ncsi_get_filter(struct ncsi_channel *nc, int table, int index);
-int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data);
-int ncsi_add_filter(struct ncsi_channel *nc, int table, void *data);
-int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index);
 void ncsi_start_channel_monitor(struct ncsi_channel *nc);
 void ncsi_stop_channel_monitor(struct ncsi_channel *nc);
 struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np,
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index c3695ba0cf94..5561e221b71f 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -27,125 +27,6 @@
 LIST_HEAD(ncsi_dev_list);
 DEFINE_SPINLOCK(ncsi_dev_lock);
 
-static inline int ncsi_filter_size(int table)
-{
-	int sizes[] = { 2, 6, 6, 6 };
-
-	BUILD_BUG_ON(ARRAY_SIZE(sizes) != NCSI_FILTER_MAX);
-	if (table < NCSI_FILTER_BASE || table >= NCSI_FILTER_MAX)
-		return -EINVAL;
-
-	return sizes[table];
-}
-
-u32 *ncsi_get_filter(struct ncsi_channel *nc, int table, int index)
-{
-	struct ncsi_channel_filter *ncf;
-	int size;
-
-	ncf = nc->filters[table];
-	if (!ncf)
-		return NULL;
-
-	size = ncsi_filter_size(table);
-	if (size < 0)
-		return NULL;
-
-	return ncf->data + size * index;
-}
-
-/* Find the first active filter in a filter table that matches the given
- * data parameter. If data is NULL, this returns the first active filter.
- */
-int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data)
-{
-	struct ncsi_channel_filter *ncf;
-	void *bitmap;
-	int index, size;
-	unsigned long flags;
-
-	ncf = nc->filters[table];
-	if (!ncf)
-		return -ENXIO;
-
-	size = ncsi_filter_size(table);
-	if (size < 0)
-		return size;
-
-	spin_lock_irqsave(&nc->lock, flags);
-	bitmap = (void *)&ncf->bitmap;
-	index = -1;
-	while ((index = find_next_bit(bitmap, ncf->total, index + 1))
-	       < ncf->total) {
-		if (!data || !memcmp(ncf->data + size * index, data, size)) {
-			spin_unlock_irqrestore(&nc->lock, flags);
-			return index;
-		}
-	}
-	spin_unlock_irqrestore(&nc->lock, flags);
-
-	return -ENOENT;
-}
-
-int ncsi_add_filter(struct ncsi_channel *nc, int table, void *data)
-{
-	struct ncsi_channel_filter *ncf;
-	int index, size;
-	void *bitmap;
-	unsigned long flags;
-
-	size = ncsi_filter_size(table);
-	if (size < 0)
-		return size;
-
-	index = ncsi_find_filter(nc, table, data);
-	if (index >= 0)
-		return index;
-
-	ncf = nc->filters[table];
-	if (!ncf)
-		return -ENODEV;
-
-	spin_lock_irqsave(&nc->lock, flags);
-	bitmap = (void *)&ncf->bitmap;
-	do {
-		index = find_next_zero_bit(bitmap, ncf->total, 0);
-		if (index >= ncf->total) {
-			spin_unlock_irqrestore(&nc->lock, flags);
-			return -ENOSPC;
-		}
-	} while (test_and_set_bit(index, bitmap));
-
-	memcpy(ncf->data + size * index, data, size);
-	spin_unlock_irqrestore(&nc->lock, flags);
-
-	return index;
-}
-
-int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index)
-{
-	struct ncsi_channel_filter *ncf;
-	int size;
-	void *bitmap;
-	unsigned long flags;
-
-	size = ncsi_filter_size(table);
-	if (size < 0)
-		return size;
-
-	ncf = nc->filters[table];
-	if (!ncf || index >= ncf->total)
-		return -ENODEV;
-
-	spin_lock_irqsave(&nc->lock, flags);
-	bitmap = (void *)&ncf->bitmap;
-	if (test_and_clear_bit(index, bitmap))
-		memset(ncf->data + size * index, 0, size);
-	spin_unlock_irqrestore(&nc->lock, flags);
-
-	return 0;
-}
-
 static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down)
 {
 	struct ncsi_dev *nd = &ndp->ndev;
@@ -339,20 +220,13 @@ struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np, unsigned char id)
 static void ncsi_remove_channel(struct ncsi_channel *nc)
 {
 	struct ncsi_package *np = nc->package;
-	struct ncsi_channel_filter *ncf;
 	unsigned long flags;
-	int i;
 
-	/* Release filters */
 	spin_lock_irqsave(&nc->lock, flags);
-	for (i = 0; i < NCSI_FILTER_MAX; i++) {
-		ncf = nc->filters[i];
-		if (!ncf)
-			continue;
 
-		nc->filters[i] = NULL;
-		kfree(ncf);
-	}
+	/* Release filters */
+	kfree(nc->mac_filter.addrs);
+	kfree(nc->vlan_filter.vids);
 
 	nc->state = NCSI_CHANNEL_INACTIVE;
 	spin_unlock_irqrestore(&nc->lock, flags);
@@ -670,32 +544,26 @@ error:
 static int clear_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc,
 			 struct ncsi_cmd_arg *nca)
 {
+	struct ncsi_channel_vlan_filter *ncf;
+	unsigned long flags;
+	void *bitmap;
 	int index;
-	u32 *data;
 	u16 vid;
 
-	index = ncsi_find_filter(nc, NCSI_FILTER_VLAN, NULL);
-	if (index < 0) {
-		/* Filter table empty */
-		return -1;
-	}
+	ncf = &nc->vlan_filter;
+	bitmap = &ncf->bitmap;
 
-	data = ncsi_get_filter(nc, NCSI_FILTER_VLAN, index);
-	if (!data) {
-		netdev_err(ndp->ndev.dev,
-			   "NCSI: failed to retrieve filter %d\n", index);
-		/* Set the VLAN id to 0 - this will still disable the entry in
-		 * the filter table, but we won't know what it was.
-		 */
-		vid = 0;
-	} else {
-		vid = *(u16 *)data;
+	spin_lock_irqsave(&nc->lock, flags);
+	index = find_next_bit(bitmap, ncf->n_vids, 0);
+	if (index >= ncf->n_vids) {
+		spin_unlock_irqrestore(&nc->lock, flags);
+		return -1;
 	}
+	vid = ncf->vids[index];
 
-	netdev_printk(KERN_DEBUG, ndp->ndev.dev,
-		      "NCSI: removed vlan tag %u at index %d\n",
-		      vid, index + 1);
-	ncsi_remove_filter(nc, NCSI_FILTER_VLAN, index);
+	clear_bit(index, bitmap);
+	ncf->vids[index] = 0;
+	spin_unlock_irqrestore(&nc->lock, flags);
 
 	nca->type = NCSI_PKT_CMD_SVF;
 	nca->words[1] = vid;
@@ -711,45 +579,55 @@ static int clear_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc,
 static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc,
 		       struct ncsi_cmd_arg *nca)
 {
+	struct ncsi_channel_vlan_filter *ncf;
 	struct vlan_vid *vlan = NULL;
-	int index = 0;
+	unsigned long flags;
+	int i, index;
+	void *bitmap;
+	u16 vid;
 
+	if (list_empty(&ndp->vlan_vids))
+		return -1;
+
+	ncf = &nc->vlan_filter;
+	bitmap = &ncf->bitmap;
+
+	spin_lock_irqsave(&nc->lock, flags);
+
+	rcu_read_lock();
 	list_for_each_entry_rcu(vlan, &ndp->vlan_vids, list) {
-		index = ncsi_find_filter(nc, NCSI_FILTER_VLAN, &vlan->vid);
-		if (index < 0) {
-			/* New tag to add */
-			netdev_printk(KERN_DEBUG, ndp->ndev.dev,
-				      "NCSI: new vlan id to set: %u\n",
-				      vlan->vid);
+		vid = vlan->vid;
+		for (i = 0; i < ncf->n_vids; i++)
+			if (ncf->vids[i] == vid) {
+				vid = 0;
+				break;
+			}
+		if (vid)
 			break;
-		}
-		netdev_printk(KERN_DEBUG, ndp->ndev.dev,
-			      "vid %u already at filter pos %d\n",
-			      vlan->vid, index);
 	}
+	rcu_read_unlock();
 
-	if (!vlan || index >= 0) {
-		netdev_printk(KERN_DEBUG, ndp->ndev.dev,
-			      "no vlan ids left to set\n");
+	if (!vid) {
+		/* No VLAN ID is not set */
+		spin_unlock_irqrestore(&nc->lock, flags);
 		return -1;
 	}
 
-	index = ncsi_add_filter(nc, NCSI_FILTER_VLAN, &vlan->vid);
-	if (index < 0) {
+	index = find_next_zero_bit(bitmap, ncf->n_vids, 0);
+	if (index < 0 || index >= ncf->n_vids) {
 		netdev_err(ndp->ndev.dev,
-			   "Failed to add new VLAN tag, error %d\n", index);
-		if (index == -ENOSPC)
-			netdev_err(ndp->ndev.dev,
-				   "Channel %u already has all VLAN filters set\n",
-				   nc->id);
+			   "Channel %u already has all VLAN filters set\n",
+			   nc->id);
+		spin_unlock_irqrestore(&nc->lock, flags);
 		return -1;
 	}
 
-	netdev_printk(KERN_DEBUG, ndp->ndev.dev,
-		      "NCSI: set vid %u in packet, index %u\n",
-		      vlan->vid, index + 1);
+	ncf->vids[index] = vid;
+	set_bit(index, bitmap);
+	spin_unlock_irqrestore(&nc->lock, flags);
+
 	nca->type = NCSI_PKT_CMD_SVF;
-	nca->words[1] = vlan->vid;
+	nca->words[1] = vid;
 	/* HW filter index starts at 1 */
 	nca->bytes[6] = index + 1;
 	nca->bytes[7] = 0x01;
diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c
index 8d7e849d4825..b09ef77bf4cd 100644
--- a/net/ncsi/ncsi-netlink.c
+++ b/net/ncsi/ncsi-netlink.c
@@ -58,10 +58,9 @@ static int ncsi_write_channel_info(struct sk_buff *skb,
 				   struct ncsi_dev_priv *ndp,
 				   struct ncsi_channel *nc)
 {
-	struct nlattr *vid_nest;
-	struct ncsi_channel_filter *ncf;
+	struct ncsi_channel_vlan_filter *ncf;
 	struct ncsi_channel_mode *m;
-	u32 *data;
+	struct nlattr *vid_nest;
 	int i;
 
 	nla_put_u32(skb, NCSI_CHANNEL_ATTR_ID, nc->id);
@@ -79,18 +78,13 @@ static int ncsi_write_channel_info(struct sk_buff *skb,
 	vid_nest = nla_nest_start(skb, NCSI_CHANNEL_ATTR_VLAN_LIST);
 	if (!vid_nest)
 		return -ENOMEM;
-	ncf = nc->filters[NCSI_FILTER_VLAN];
+	ncf = &nc->vlan_filter;
 	i = -1;
-	if (ncf) {
-		while ((i = find_next_bit((void *)&ncf->bitmap, ncf->total,
-					  i + 1)) < ncf->total) {
-			data = ncsi_get_filter(nc, NCSI_FILTER_VLAN, i);
-			/* Uninitialised channels will have 'zero' vlan ids */
-			if (!data || !*data)
-				continue;
+	while ((i = find_next_bit((void *)&ncf->bitmap, ncf->n_vids,
+				  i + 1)) < ncf->n_vids) {
+		if (ncf->vids[i])
 			nla_put_u16(skb, NCSI_CHANNEL_ATTR_VLAN_ID,
-				    *(u16 *)data);
-		}
+				    ncf->vids[i]);
 	}
 	nla_nest_end(skb, vid_nest);
 
diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
index efd933ff5570..ce9497966ebe 100644
--- a/net/ncsi/ncsi-rsp.c
+++ b/net/ncsi/ncsi-rsp.c
@@ -334,9 +334,9 @@ static int ncsi_rsp_handler_svf(struct ncsi_request *nr)
 	struct ncsi_rsp_pkt *rsp;
 	struct ncsi_dev_priv *ndp = nr->ndp;
 	struct ncsi_channel *nc;
-	struct ncsi_channel_filter *ncf;
-	unsigned short vlan;
-	int ret;
+	struct ncsi_channel_vlan_filter *ncf;
+	unsigned long flags;
+	void *bitmap;
 
 	/* Find the package and channel */
 	rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
@@ -346,22 +346,23 @@ static int ncsi_rsp_handler_svf(struct ncsi_request *nr)
 		return -ENODEV;
 
 	cmd = (struct ncsi_cmd_svf_pkt *)skb_network_header(nr->cmd);
-	ncf = nc->filters[NCSI_FILTER_VLAN];
-	if (!ncf)
-		return -ENOENT;
-	if (cmd->index >= ncf->total)
+	ncf = &nc->vlan_filter;
+	if (cmd->index > ncf->n_vids)
 		return -ERANGE;
 
-	/* Add or remove the VLAN filter */
+	/* Add or remove the VLAN filter. Remember HW indexes from 1 */
+	spin_lock_irqsave(&nc->lock, flags);
+	bitmap = &ncf->bitmap;
 	if (!(cmd->enable & 0x1)) {
-		/* HW indexes from 1 */
-		ret = ncsi_remove_filter(nc, NCSI_FILTER_VLAN, cmd->index - 1);
+		if (test_and_clear_bit(cmd->index - 1, bitmap))
+			ncf->vids[cmd->index - 1] = 0;
 	} else {
-		vlan = ntohs(cmd->vlan);
-		ret = ncsi_add_filter(nc, NCSI_FILTER_VLAN, &vlan);
+		set_bit(cmd->index - 1, bitmap);
+		ncf->vids[cmd->index - 1] = ntohs(cmd->vlan);
 	}
+	spin_unlock_irqrestore(&nc->lock, flags);
 
-	return ret;
+	return 0;
 }
 
 static int ncsi_rsp_handler_ev(struct ncsi_request *nr)
@@ -422,8 +423,12 @@ static int ncsi_rsp_handler_sma(struct ncsi_request *nr)
 	struct ncsi_rsp_pkt *rsp;
 	struct ncsi_dev_priv *ndp = nr->ndp;
 	struct ncsi_channel *nc;
-	struct ncsi_channel_filter *ncf;
+	struct ncsi_channel_mac_filter *ncf;
+	unsigned long flags;
 	void *bitmap;
+	bool enabled;
+	int index;
+
 
 	/* Find the package and channel */
 	rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp);
@@ -436,31 +441,23 @@ static int ncsi_rsp_handler_sma(struct ncsi_request *nr)
 	 * isn't supported yet.
 	 */
 	cmd = (struct ncsi_cmd_sma_pkt *)skb_network_header(nr->cmd);
-	switch (cmd->at_e >> 5) {
-	case 0x0:	/* UC address */
-		ncf = nc->filters[NCSI_FILTER_UC];
-		break;
-	case 0x1:	/* MC address */
-		ncf = nc->filters[NCSI_FILTER_MC];
-		break;
-	default:
-		return -EINVAL;
-	}
+	enabled = cmd->at_e & 0x1;
+	ncf = &nc->mac_filter;
+	bitmap = &ncf->bitmap;
 
-	/* Sanity check on the filter */
-	if (!ncf)
-		return -ENOENT;
-	else if (cmd->index >= ncf->total)
+	if (cmd->index > ncf->n_uc + ncf->n_mc + ncf->n_mixed)
 		return -ERANGE;
 
-	bitmap = &ncf->bitmap;
-	if (cmd->at_e & 0x1) {
-		set_bit(cmd->index, bitmap);
-		memcpy(ncf->data + 6 * cmd->index, cmd->mac, 6);
+	index = (cmd->index - 1) * ETH_ALEN;
+	spin_lock_irqsave(&nc->lock, flags);
+	if (enabled) {
+		set_bit(cmd->index - 1, bitmap);
+		memcpy(&ncf->addrs[index], cmd->mac, ETH_ALEN);
 	} else {
-		clear_bit(cmd->index, bitmap);
-		memset(ncf->data + 6 * cmd->index, 0, 6);
+		clear_bit(cmd->index - 1, bitmap);
+		memset(&ncf->addrs[index], 0, ETH_ALEN);
 	}
+	spin_unlock_irqrestore(&nc->lock, flags);
 
 	return 0;
 }
@@ -631,9 +628,7 @@ static int ncsi_rsp_handler_gc(struct ncsi_request *nr)
 	struct ncsi_rsp_gc_pkt *rsp;
 	struct ncsi_dev_priv *ndp = nr->ndp;
 	struct ncsi_channel *nc;
-	struct ncsi_channel_filter *ncf;
-	size_t size, entry_size;
-	int cnt, i;
+	size_t size;
 
 	/* Find the channel */
 	rsp = (struct ncsi_rsp_gc_pkt *)skb_network_header(nr->rsp);
@@ -655,64 +650,40 @@ static int ncsi_rsp_handler_gc(struct ncsi_request *nr)
 	nc->caps[NCSI_CAP_VLAN].cap = rsp->vlan_mode &
 				      NCSI_CAP_VLAN_MASK;
 
-	/* Build filters */
-	for (i = 0; i < NCSI_FILTER_MAX; i++) {
-		switch (i) {
-		case NCSI_FILTER_VLAN:
-			cnt = rsp->vlan_cnt;
-			entry_size = 2;
-			break;
-		case NCSI_FILTER_MIXED:
-			cnt = rsp->mixed_cnt;
-			entry_size = 6;
-			break;
-		case NCSI_FILTER_MC:
-			cnt = rsp->mc_cnt;
-			entry_size = 6;
-			break;
-		case NCSI_FILTER_UC:
-			cnt = rsp->uc_cnt;
-			entry_size = 6;
-			break;
-		default:
-			continue;
-		}
-
-		if (!cnt || nc->filters[i])
-			continue;
-
-		size = sizeof(*ncf) + cnt * entry_size;
-		ncf = kzalloc(size, GFP_ATOMIC);
-		if (!ncf) {
-			pr_warn("%s: Cannot alloc filter table (%d)\n",
-				__func__, i);
-			return -ENOMEM;
-		}
-
-		ncf->index = i;
-		ncf->total = cnt;
-		if (i == NCSI_FILTER_VLAN) {
-			/* Set VLAN filters active so they are cleared in
-			 * first configuration state
-			 */
-			ncf->bitmap = U64_MAX;
-		} else {
-			ncf->bitmap = 0x0ul;
-		}
-		nc->filters[i] = ncf;
-	}
+	size = (rsp->uc_cnt + rsp->mc_cnt + rsp->mixed_cnt) * ETH_ALEN;
+	nc->mac_filter.addrs = kzalloc(size, GFP_KERNEL);
+	if (!nc->mac_filter.addrs)
+		return -ENOMEM;
+	nc->mac_filter.n_uc = rsp->uc_cnt;
+	nc->mac_filter.n_mc = rsp->mc_cnt;
+	nc->mac_filter.n_mixed = rsp->mixed_cnt;
+
+	nc->vlan_filter.vids = kcalloc(rsp->vlan_cnt,
+				       sizeof(*nc->vlan_filter.vids),
+				       GFP_KERNEL);
+	if (!nc->vlan_filter.vids)
+		return -ENOMEM;
+	/* Set VLAN filters active so they are cleared in the first
+	 * configuration state
+	 */
+	nc->vlan_filter.bitmap = U64_MAX;
+	nc->vlan_filter.n_vids = rsp->vlan_cnt;
 
 	return 0;
 }
 
 static int ncsi_rsp_handler_gp(struct ncsi_request *nr)
 {
-	struct ncsi_rsp_gp_pkt *rsp;
+	struct ncsi_channel_vlan_filter *ncvf;
+	struct ncsi_channel_mac_filter *ncmf;
 	struct ncsi_dev_priv *ndp = nr->ndp;
+	struct ncsi_rsp_gp_pkt *rsp;
 	struct ncsi_channel *nc;
-	unsigned short enable, vlan;
+	unsigned short enable;
 	unsigned char *pdata;
-	int table, i;
+	unsigned long flags;
+	void *bitmap;
+	int i;
 
 	/* Find the channel */
 	rsp = (struct ncsi_rsp_gp_pkt *)skb_network_header(nr->rsp);
@@ -746,36 +717,33 @@ static int ncsi_rsp_handler_gp(struct ncsi_request *nr)
 	/* MAC addresses filter table */
 	pdata = (unsigned char *)rsp + 48;
 	enable = rsp->mac_enable;
+	ncmf = &nc->mac_filter;
+	spin_lock_irqsave(&nc->lock, flags);
+	bitmap = &ncmf->bitmap;
 	for (i = 0; i < rsp->mac_cnt; i++, pdata += 6) {
-		if (i >= (nc->filters[NCSI_FILTER_UC]->total +
-			  nc->filters[NCSI_FILTER_MC]->total))
-			table = NCSI_FILTER_MIXED;
-		else if (i >= nc->filters[NCSI_FILTER_UC]->total)
-			table = NCSI_FILTER_MC;
-		else
-			table = NCSI_FILTER_UC;
-
 		if (!(enable & (0x1 << i)))
-			continue;
-
-		if (ncsi_find_filter(nc, table, pdata) >= 0)
-			continue;
+			clear_bit(i, bitmap);
+		else
+			set_bit(i, bitmap);
 
-		ncsi_add_filter(nc, table, pdata);
+		memcpy(&ncmf->addrs[i * ETH_ALEN], pdata, ETH_ALEN);
 	}
+	spin_unlock_irqrestore(&nc->lock, flags);
 
 	/* VLAN filter table */
 	enable = ntohs(rsp->vlan_enable);
+	ncvf = &nc->vlan_filter;
+	bitmap = &ncvf->bitmap;
+	spin_lock_irqsave(&nc->lock, flags);
 	for (i = 0; i < rsp->vlan_cnt; i++, pdata += 2) {
 		if (!(enable & (0x1 << i)))
-			continue;
-
-		vlan = ntohs(*(__be16 *)pdata);
-		if (ncsi_find_filter(nc, NCSI_FILTER_VLAN, &vlan) >= 0)
-			continue;
+			clear_bit(i, bitmap);
+		else
+			set_bit(i, bitmap);
 
-		ncsi_add_filter(nc, NCSI_FILTER_VLAN, &vlan);
+		ncvf->vids[i] = ntohs(*(__be16 *)pdata);
 	}
+	spin_unlock_irqrestore(&nc->lock, flags);
 
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 72f6d71e491e6ce269b564865b21fab0a4402dd3 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Tue, 17 Apr 2018 14:11:28 +0800
Subject: vxlan: add ttl inherit support

Like tos inherit, ttl inherit should also means inherit the inner protocol's
ttl values, which actually not implemented in vxlan yet.

But we could not treat ttl == 0 as "use the inner TTL", because that would be
used also when the "ttl" option is not specified and that would be a behavior
change, and breaking real use cases.

So add a different attribute IFLA_VXLAN_TTL_INHERIT when "ttl inherit" is
specified with ip cmd.

Reported-by: Jianlin Shi <jishi@redhat.com>
Suggested-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c          | 17 ++++++++++++++---
 include/net/ip_tunnels.h     | 11 +++++++++++
 include/net/vxlan.h          |  1 +
 include/uapi/linux/if_link.h |  1 +
 4 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index fab7a4db249e..aee0e60471f1 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2085,9 +2085,13 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		local_ip = vxlan->cfg.saddr;
 		dst_cache = &rdst->dst_cache;
 		md->gbp = skb->mark;
-		ttl = vxlan->cfg.ttl;
-		if (!ttl && vxlan_addr_multicast(dst))
-			ttl = 1;
+		if (flags & VXLAN_F_TTL_INHERIT) {
+			ttl = ip_tunnel_get_ttl(old_iph, skb);
+		} else {
+			ttl = vxlan->cfg.ttl;
+			if (!ttl && vxlan_addr_multicast(dst))
+				ttl = 1;
+		}
 
 		tos = vxlan->cfg.tos;
 		if (tos == 1)
@@ -2709,6 +2713,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
 	[IFLA_VXLAN_GBP]	= { .type = NLA_FLAG, },
 	[IFLA_VXLAN_GPE]	= { .type = NLA_FLAG, },
 	[IFLA_VXLAN_REMCSUM_NOPARTIAL]	= { .type = NLA_FLAG },
+	[IFLA_VXLAN_TTL_INHERIT]	= { .type = NLA_FLAG },
 };
 
 static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -3254,6 +3259,12 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
 	if (data[IFLA_VXLAN_TTL])
 		conf->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);
 
+	if (data[IFLA_VXLAN_TTL_INHERIT]) {
+		if (changelink)
+			return -EOPNOTSUPP;
+		conf->flags |= VXLAN_F_TTL_INHERIT;
+	}
+
 	if (data[IFLA_VXLAN_LABEL])
 		conf->label = nla_get_be32(data[IFLA_VXLAN_LABEL]) &
 			     IPV6_FLOWLABEL_MASK;
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 540a4b4417bf..751646adc769 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -379,6 +379,17 @@ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph,
 		return 0;
 }
 
+static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph,
+				       const struct sk_buff *skb)
+{
+	if (skb->protocol == htons(ETH_P_IP))
+		return iph->ttl;
+	else if (skb->protocol == htons(ETH_P_IPV6))
+		return ((const struct ipv6hdr *)iph)->hop_limit;
+	else
+		return 0;
+}
+
 /* Propogate ECN bits out */
 static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph,
 				     const struct sk_buff *skb)
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index ad73d8b3fcc2..b99a02ae3934 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -262,6 +262,7 @@ struct vxlan_dev {
 #define VXLAN_F_COLLECT_METADATA	0x2000
 #define VXLAN_F_GPE			0x4000
 #define VXLAN_F_IPV6_LINKLOCAL		0x8000
+#define VXLAN_F_TTL_INHERIT		0x10000
 
 /* Flags that are used in the receive path. These flags must match in
  * order for a socket to be shareable
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 68699f654118..b85266420bfb 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -516,6 +516,7 @@ enum {
 	IFLA_VXLAN_COLLECT_METADATA,
 	IFLA_VXLAN_LABEL,
 	IFLA_VXLAN_GPE,
+	IFLA_VXLAN_TTL_INHERIT,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
-- 
cgit v1.2.3-59-g8ed1b


From a64dcddc5cdfbf1d4d0af519c30cd6c2e4828282 Mon Sep 17 00:00:00 2001
From: Ganesh Goudar <ganeshgr@chelsio.com>
Date: Tue, 17 Apr 2018 15:17:11 +0530
Subject: cxgb4vf: display pause settings

Add support to display pause settings

Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
index 9a81b52307a9..71f13bd2b5e4 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
@@ -1419,6 +1419,22 @@ static int cxgb4vf_get_link_ksettings(struct net_device *dev,
 		base->duplex = DUPLEX_UNKNOWN;
 	}
 
+	if (pi->link_cfg.fc & PAUSE_RX) {
+		if (pi->link_cfg.fc & PAUSE_TX) {
+			ethtool_link_ksettings_add_link_mode(link_ksettings,
+							     advertising,
+							     Pause);
+		} else {
+			ethtool_link_ksettings_add_link_mode(link_ksettings,
+							     advertising,
+							     Asym_Pause);
+		}
+	} else if (pi->link_cfg.fc & PAUSE_TX) {
+		ethtool_link_ksettings_add_link_mode(link_ksettings,
+						     advertising,
+						     Asym_Pause);
+	}
+
 	base->autoneg = pi->link_cfg.autoneg;
 	if (pi->link_cfg.pcaps & FW_PORT_CAP32_ANEG)
 		ethtool_link_ksettings_add_link_mode(link_ksettings,
-- 
cgit v1.2.3-59-g8ed1b


From a2d481b326c98b6b67eea8a378c858d57ca5ff3d Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Tue, 17 Apr 2018 11:54:39 +0200
Subject: ipv6: send netlink notifications for manually configured addresses

Send a netlink notification when userspace adds a manually configured
address if DAD is enabled and optimistic flag isn't set.
Moreover send RTM_DELADDR notifications for tentative addresses.

Some userspace applications (e.g. NetworkManager) are interested in
addr netlink events albeit the address is still in tentative state,
however events are not sent if DAD process is not completed.
If the address is added and immediately removed userspace listeners
are not notified. This behaviour can be easily reproduced by using
veth interfaces:

$ ip -b - <<EOF
> link add dev vm1 type veth peer name vm2
> link set dev vm1 up
> link set dev vm2 up
> addr add 2001:db8:a:b:1:2:3:4/64 dev vm1
> addr del 2001:db8:a:b:1:2:3:4/64 dev vm1
EOF

This patch reverts the behaviour introduced by the commit f784ad3d79e5
("ipv6: do not send RTM_DELADDR for tentative addresses")

Suggested-by: Thomas Haller <thaller@redhat.com>
Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 62b97722722c..b2c0175125db 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2901,6 +2901,11 @@ static int inet6_addr_add(struct net *net, int ifindex,
 					      expires, flags);
 		}
 
+		/* Send a netlink notification if DAD is enabled and
+		 * optimistic flag is not set
+		 */
+		if (!(ifp->flags & (IFA_F_OPTIMISTIC | IFA_F_NODAD)))
+			ipv6_ifa_notify(0, ifp);
 		/*
 		 * Note that section 3.1 of RFC 4429 indicates
 		 * that the Optimistic flag should not be set for
@@ -5028,14 +5033,6 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
 	struct net *net = dev_net(ifa->idev->dev);
 	int err = -ENOBUFS;
 
-	/* Don't send DELADDR notification for TENTATIVE address,
-	 * since NEWADDR notification is sent only after removing
-	 * TENTATIVE flag, if DAD has not failed.
-	 */
-	if (ifa->flags & IFA_F_TENTATIVE && !(ifa->flags & IFA_F_DADFAILED) &&
-	    event == RTM_DELADDR)
-		return;
-
 	skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC);
 	if (!skb)
 		goto errout;
-- 
cgit v1.2.3-59-g8ed1b


From a919525ad832d2bb1388b2303832a2307b30aeff Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 17 Apr 2018 17:33:07 -0700
Subject: net: Move fib_convert_metrics to metrics file

Move logic of fib_convert_metrics into ip_metrics_convert. This allows
the code that converts netlink attributes into metrics struct to be
re-used in a later patch by IPv6.

This is mostly a code move with the following changes to variable names:
  - fi->fib_net becomes net
  - fc_mx and fc_mx_len are passed as inputs pulled from fib_config
  - metrics array is passed as an input from fi->fib_metrics->metrics

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h         |  3 +++
 net/ipv4/Makefile        |  3 ++-
 net/ipv4/fib_semantics.c | 43 ++-------------------------------------
 net/ipv4/metrics.c       | 53 ++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 60 insertions(+), 42 deletions(-)
 create mode 100644 net/ipv4/metrics.c

diff --git a/include/net/ip.h b/include/net/ip.h
index ecffd843e7b8..dc4a2d6e58a5 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -396,6 +396,9 @@ static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
 	return min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU);
 }
 
+int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len,
+		       u32 *metrics);
+
 u32 ip_idents_reserve(u32 hash, int segs);
 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs);
 
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index a07b7dd06def..b379520f9133 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -13,7 +13,8 @@ obj-y     := route.o inetpeer.o protocol.o \
 	     tcp_offload.o datagram.o raw.o udp.o udplite.o \
 	     udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
 	     fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
-	     inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o
+	     inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
+	     metrics.o
 
 obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
 obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c27122f01b87..6608db23f54b 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1019,47 +1019,8 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
 static int
 fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
 {
-	bool ecn_ca = false;
-	struct nlattr *nla;
-	int remaining;
-
-	if (!cfg->fc_mx)
-		return 0;
-
-	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
-		int type = nla_type(nla);
-		u32 val;
-
-		if (!type)
-			continue;
-		if (type > RTAX_MAX)
-			return -EINVAL;
-
-		if (type == RTAX_CC_ALGO) {
-			char tmp[TCP_CA_NAME_MAX];
-
-			nla_strlcpy(tmp, nla, sizeof(tmp));
-			val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca);
-			if (val == TCP_CA_UNSPEC)
-				return -EINVAL;
-		} else {
-			val = nla_get_u32(nla);
-		}
-		if (type == RTAX_ADVMSS && val > 65535 - 40)
-			val = 65535 - 40;
-		if (type == RTAX_MTU && val > 65535 - 15)
-			val = 65535 - 15;
-		if (type == RTAX_HOPLIMIT && val > 255)
-			val = 255;
-		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
-			return -EINVAL;
-		fi->fib_metrics->metrics[type - 1] = val;
-	}
-
-	if (ecn_ca)
-		fi->fib_metrics->metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
-
-	return 0;
+	return ip_metrics_convert(fi->fib_net, cfg->fc_mx, cfg->fc_mx_len,
+				  fi->fib_metrics->metrics);
 }
 
 struct fib_info *fib_create_info(struct fib_config *cfg,
diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c
new file mode 100644
index 000000000000..5121c6475e6b
--- /dev/null
+++ b/net/ipv4/metrics.c
@@ -0,0 +1,53 @@
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/types.h>
+#include <net/ip.h>
+#include <net/net_namespace.h>
+#include <net/tcp.h>
+
+int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len,
+		       u32 *metrics)
+{
+	bool ecn_ca = false;
+	struct nlattr *nla;
+	int remaining;
+
+	if (!fc_mx)
+		return 0;
+
+	nla_for_each_attr(nla, fc_mx, fc_mx_len, remaining) {
+		int type = nla_type(nla);
+		u32 val;
+
+		if (!type)
+			continue;
+		if (type > RTAX_MAX)
+			return -EINVAL;
+
+		if (type == RTAX_CC_ALGO) {
+			char tmp[TCP_CA_NAME_MAX];
+
+			nla_strlcpy(tmp, nla, sizeof(tmp));
+			val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
+			if (val == TCP_CA_UNSPEC)
+				return -EINVAL;
+		} else {
+			val = nla_get_u32(nla);
+		}
+		if (type == RTAX_ADVMSS && val > 65535 - 40)
+			val = 65535 - 40;
+		if (type == RTAX_MTU && val > 65535 - 15)
+			val = 65535 - 15;
+		if (type == RTAX_HOPLIMIT && val > 255)
+			val = 255;
+		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
+			return -EINVAL;
+		metrics[type - 1] = val;
+	}
+
+	if (ecn_ca)
+		metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ip_metrics_convert);
-- 
cgit v1.2.3-59-g8ed1b


From 3940746d867f2f0390342dcb7ac06f1ed0e68027 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 17 Apr 2018 17:33:08 -0700
Subject: net: Handle null dst in rtnl_put_cacheinfo

Need to keep expires time for IPv6 routes in a dump of FIB entries.
Update rtnl_put_cacheinfo to allow dst to be NULL in which case
rta_cacheinfo will only contain non-dst data.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 45936922d7e2..80802546c279 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -785,13 +785,15 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
 		       long expires, u32 error)
 {
 	struct rta_cacheinfo ci = {
-		.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse),
-		.rta_used = dst->__use,
-		.rta_clntref = atomic_read(&(dst->__refcnt)),
 		.rta_error = error,
 		.rta_id =  id,
 	};
 
+	if (dst) {
+		ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse);
+		ci.rta_used = dst->__use;
+		ci.rta_clntref = atomic_read(&dst->__refcnt);
+	}
 	if (expires) {
 		unsigned long clock;
 
-- 
cgit v1.2.3-59-g8ed1b


From 43b059a312f35799ef7e3a49aba4f1e0128e30ca Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 17 Apr 2018 17:33:09 -0700
Subject: vrf: Move fib6_table into net_vrf

A later patch removes rt6i_table from rt6_info. Save the ipv6
table for a VRF in net_vrf. fib tables can not be deleted so
no reference counting or locking is required.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vrf.c | 25 ++++++-------------------
 1 file changed, 6 insertions(+), 19 deletions(-)

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 0a2b180d138a..90b5f3900c22 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -48,6 +48,9 @@ static unsigned int vrf_net_id;
 struct net_vrf {
 	struct rtable __rcu	*rth;
 	struct rt6_info	__rcu	*rt6;
+#if IS_ENABLED(CONFIG_IPV6)
+	struct fib6_table	*fib6_table;
+#endif
 	u32                     tb_id;
 };
 
@@ -496,7 +499,6 @@ static int vrf_rt6_create(struct net_device *dev)
 	int flags = DST_HOST | DST_NOPOLICY | DST_NOXFRM;
 	struct net_vrf *vrf = netdev_priv(dev);
 	struct net *net = dev_net(dev);
-	struct fib6_table *rt6i_table;
 	struct rt6_info *rt6;
 	int rc = -ENOMEM;
 
@@ -504,8 +506,8 @@ static int vrf_rt6_create(struct net_device *dev)
 	if (!ipv6_mod_enabled())
 		return 0;
 
-	rt6i_table = fib6_new_table(net, vrf->tb_id);
-	if (!rt6i_table)
+	vrf->fib6_table = fib6_new_table(net, vrf->tb_id);
+	if (!vrf->fib6_table)
 		goto out;
 
 	/* create a dst for routing packets out a VRF device */
@@ -513,7 +515,6 @@ static int vrf_rt6_create(struct net_device *dev)
 	if (!rt6)
 		goto out;
 
-	rt6->rt6i_table = rt6i_table;
 	rt6->dst.output	= vrf_output6;
 
 	rcu_assign_pointer(vrf->rt6, rt6);
@@ -946,22 +947,8 @@ static struct rt6_info *vrf_ip6_route_lookup(struct net *net,
 					     int flags)
 {
 	struct net_vrf *vrf = netdev_priv(dev);
-	struct fib6_table *table = NULL;
-	struct rt6_info *rt6;
-
-	rcu_read_lock();
-
-	/* fib6_table does not have a refcnt and can not be freed */
-	rt6 = rcu_dereference(vrf->rt6);
-	if (likely(rt6))
-		table = rt6->rt6i_table;
-
-	rcu_read_unlock();
-
-	if (!table)
-		return NULL;
 
-	return ip6_pol_route(net, table, ifindex, fl6, skb, flags);
+	return ip6_pol_route(net, vrf->fib6_table, ifindex, fl6, skb, flags);
 }
 
 static void vrf_ip6_input_dst(struct sk_buff *skb, struct net_device *vrf_dev,
-- 
cgit v1.2.3-59-g8ed1b


From 7aef6859ee91ea867a3dff9ba47bca9b2de382f6 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 17 Apr 2018 17:33:10 -0700
Subject: net/ipv6: Pass net to fib6_update_sernum

Pass net namespace to fib6_update_sernum. It can not be marked const
as fib6_new_sernum will change ipv6.fib6_sernum.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  2 +-
 net/ipv6/ip6_fib.c    |  3 +--
 net/ipv6/route.c      | 10 +++++-----
 3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 5e86fd9dc857..f0aaf1c8f1a8 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -408,7 +408,7 @@ void __net_exit fib6_notifier_exit(struct net *net);
 unsigned int fib6_tables_seq_read(struct net *net);
 int fib6_tables_dump(struct net *net, struct notifier_block *nb);
 
-void fib6_update_sernum(struct rt6_info *rt);
+void fib6_update_sernum(struct net *net, struct rt6_info *rt);
 void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt);
 
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index deab2db6692e..74d2a3748e2f 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -105,9 +105,8 @@ enum {
 	FIB6_NO_SERNUM_CHANGE = 0,
 };
 
-void fib6_update_sernum(struct rt6_info *rt)
+void fib6_update_sernum(struct net *net, struct rt6_info *rt)
 {
-	struct net *net = dev_net(rt->dst.dev);
 	struct fib6_node *fn;
 
 	fn = rcu_dereference_protected(rt->rt6i_node,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1d738bfe893b..0a99cda9fd7b 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1352,7 +1352,7 @@ out:
 	/* Update fn->fn_sernum to invalidate all cached dst */
 	if (!err) {
 		spin_lock_bh(&ort->rt6i_table->tb6_lock);
-		fib6_update_sernum(ort);
+		fib6_update_sernum(net, ort);
 		spin_unlock_bh(&ort->rt6i_table->tb6_lock);
 		fib6_force_start_gc(net);
 	}
@@ -3786,11 +3786,11 @@ void rt6_multipath_rebalance(struct rt6_info *rt)
 static int fib6_ifup(struct rt6_info *rt, void *p_arg)
 {
 	const struct arg_netdev_event *arg = p_arg;
-	const struct net *net = dev_net(arg->dev);
+	struct net *net = dev_net(arg->dev);
 
 	if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
 		rt->rt6i_nh_flags &= ~arg->nh_flags;
-		fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
+		fib6_update_sernum_upto_root(net, rt);
 		rt6_multipath_rebalance(rt);
 	}
 
@@ -3869,7 +3869,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
 {
 	const struct arg_netdev_event *arg = p_arg;
 	const struct net_device *dev = arg->dev;
-	const struct net *net = dev_net(dev);
+	struct net *net = dev_net(dev);
 
 	if (rt == net->ipv6.ip6_null_entry)
 		return 0;
@@ -3892,7 +3892,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
 			}
 			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
 						   RTNH_F_LINKDOWN);
-			fib6_update_sernum(rt);
+			fib6_update_sernum(net, rt);
 			rt6_multipath_rebalance(rt);
 		}
 		return -2;
-- 
cgit v1.2.3-59-g8ed1b


From afb1d4b59311a8252f67c214b37ec69d8100cb55 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 17 Apr 2018 17:33:11 -0700
Subject: net/ipv6: Pass net namespace to route functions

Pass network namespace reference into route add, delete and get
functions.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h | 12 ++++++-----
 net/ipv6/addrconf.c     | 33 ++++++++++++++++--------------
 net/ipv6/anycast.c      | 10 +++++----
 net/ipv6/ndisc.c        | 12 ++++++-----
 net/ipv6/route.c        | 54 +++++++++++++++++++++++++------------------------
 5 files changed, 66 insertions(+), 55 deletions(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 08b132381984..1130a1144dfd 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -101,8 +101,8 @@ void ip6_route_cleanup(void);
 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg);
 
 int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack);
-int ip6_ins_rt(struct rt6_info *);
-int ip6_del_rt(struct rt6_info *);
+int ip6_ins_rt(struct net *net, struct rt6_info *rt);
+int ip6_del_rt(struct net *net, struct rt6_info *rt);
 
 void rt6_flush_exceptions(struct rt6_info *rt);
 int rt6_remove_exception_rt(struct rt6_info *rt);
@@ -137,7 +137,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6);
 
 void fib6_force_start_gc(struct net *net);
 
-struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
+struct rt6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev,
 				    const struct in6_addr *addr, bool anycast);
 
 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
@@ -147,9 +147,11 @@ struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
  *	support functions for ND
  *
  */
-struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr,
+struct rt6_info *rt6_get_dflt_router(struct net *net,
+				     const struct in6_addr *addr,
 				     struct net_device *dev);
-struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
+struct rt6_info *rt6_add_dflt_router(struct net *net,
+				     const struct in6_addr *gwaddr,
 				     struct net_device *dev, unsigned int pref);
 
 void rt6_purge_dflt_routers(struct net *net);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index b2c0175125db..7ff7466c52e5 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1037,7 +1037,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
 		goto out;
 	}
 
-	rt = addrconf_dst_alloc(idev, addr, false);
+	rt = addrconf_dst_alloc(net, idev, addr, false);
 	if (IS_ERR(rt)) {
 		err = PTR_ERR(rt);
 		rt = NULL;
@@ -1187,7 +1187,7 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_r
 				       0, RTF_GATEWAY | RTF_DEFAULT);
 	if (rt) {
 		if (del_rt)
-			ip6_del_rt(rt);
+			ip6_del_rt(dev_net(ifp->idev->dev), rt);
 		else {
 			if (!(rt->rt6i_flags & RTF_EXPIRES))
 				rt6_set_expires(rt, expires);
@@ -2666,7 +2666,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
 		if (rt) {
 			/* Autoconf prefix route */
 			if (valid_lft == 0) {
-				ip6_del_rt(rt);
+				ip6_del_rt(net, rt);
 				rt = NULL;
 			} else if (addrconf_finite_timeout(rt_expires)) {
 				/* not infinity */
@@ -3333,7 +3333,8 @@ static void addrconf_gre_config(struct net_device *dev)
 }
 #endif
 
-static int fixup_permanent_addr(struct inet6_dev *idev,
+static int fixup_permanent_addr(struct net *net,
+				struct inet6_dev *idev,
 				struct inet6_ifaddr *ifp)
 {
 	/* !rt6i_node means the host route was removed from the
@@ -3343,7 +3344,7 @@ static int fixup_permanent_addr(struct inet6_dev *idev,
 	if (!ifp->rt || !ifp->rt->rt6i_node) {
 		struct rt6_info *rt, *prev;
 
-		rt = addrconf_dst_alloc(idev, &ifp->addr, false);
+		rt = addrconf_dst_alloc(net, idev, &ifp->addr, false);
 		if (IS_ERR(rt))
 			return PTR_ERR(rt);
 
@@ -3367,7 +3368,7 @@ static int fixup_permanent_addr(struct inet6_dev *idev,
 	return 0;
 }
 
-static void addrconf_permanent_addr(struct net_device *dev)
+static void addrconf_permanent_addr(struct net *net, struct net_device *dev)
 {
 	struct inet6_ifaddr *ifp, *tmp;
 	struct inet6_dev *idev;
@@ -3380,7 +3381,7 @@ static void addrconf_permanent_addr(struct net_device *dev)
 
 	list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) {
 		if ((ifp->flags & IFA_F_PERMANENT) &&
-		    fixup_permanent_addr(idev, ifp) < 0) {
+		    fixup_permanent_addr(net, idev, ifp) < 0) {
 			write_unlock_bh(&idev->lock);
 			in6_ifa_hold(ifp);
 			ipv6_del_addr(ifp);
@@ -3449,7 +3450,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
 
 		if (event == NETDEV_UP) {
 			/* restore routes for permanent addresses */
-			addrconf_permanent_addr(dev);
+			addrconf_permanent_addr(net, dev);
 
 			if (!addrconf_link_ready(dev)) {
 				/* device is not ready yet. */
@@ -3735,7 +3736,7 @@ restart:
 		spin_unlock_bh(&ifa->lock);
 
 		if (rt)
-			ip6_del_rt(rt);
+			ip6_del_rt(net, rt);
 
 		if (state != INET6_IFADDR_STATE_DEAD) {
 			__ipv6_ifa_notify(RTM_DELADDR, ifa);
@@ -3853,6 +3854,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
 	struct inet6_dev *idev = ifp->idev;
 	struct net_device *dev = idev->dev;
 	bool bump_id, notify = false;
+	struct net *net;
 
 	addrconf_join_solict(dev, &ifp->addr);
 
@@ -3863,8 +3865,9 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
 	if (ifp->state == INET6_IFADDR_STATE_DEAD)
 		goto out;
 
+	net = dev_net(dev);
 	if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
-	    (dev_net(dev)->ipv6.devconf_all->accept_dad < 1 &&
+	    (net->ipv6.devconf_all->accept_dad < 1 &&
 	     idev->cnf.accept_dad < 1) ||
 	    !(ifp->flags&IFA_F_TENTATIVE) ||
 	    ifp->flags & IFA_F_NODAD) {
@@ -3900,8 +3903,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
 	 * Frames right away
 	 */
 	if (ifp->flags & IFA_F_OPTIMISTIC) {
-		ip6_ins_rt(ifp->rt);
-		if (ipv6_use_optimistic_addr(dev_net(dev), idev)) {
+		ip6_ins_rt(net, ifp->rt);
+		if (ipv6_use_optimistic_addr(net, idev)) {
 			/* Because optimistic nodes can use this address,
 			 * notify listeners. If DAD fails, RTM_DELADDR is sent.
 			 */
@@ -5604,7 +5607,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 		 * to do it again
 		 */
 		if (!rcu_access_pointer(ifp->rt->rt6i_node))
-			ip6_ins_rt(ifp->rt);
+			ip6_ins_rt(net, ifp->rt);
 		if (ifp->idev->cnf.forwarding)
 			addrconf_join_anycast(ifp);
 		if (!ipv6_addr_any(&ifp->peer_addr))
@@ -5621,11 +5624,11 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 			rt = addrconf_get_prefix_route(&ifp->peer_addr, 128,
 						       ifp->idev->dev, 0, 0);
 			if (rt)
-				ip6_del_rt(rt);
+				ip6_del_rt(net, rt);
 		}
 		if (ifp->rt) {
 			if (dst_hold_safe(&ifp->rt->dst))
-				ip6_del_rt(ifp->rt);
+				ip6_del_rt(net, ifp->rt);
 		}
 		rt_genid_bump_ipv6(net);
 		break;
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index bbcabbba9bd8..1122fb299b86 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -247,6 +247,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
 {
 	struct ifacaddr6 *aca;
 	struct rt6_info *rt;
+	struct net *net;
 	int err;
 
 	ASSERT_RTNL();
@@ -265,7 +266,8 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
 		}
 	}
 
-	rt = addrconf_dst_alloc(idev, addr, true);
+	net = dev_net(idev->dev);
+	rt = addrconf_dst_alloc(net, idev, addr, true);
 	if (IS_ERR(rt)) {
 		err = PTR_ERR(rt);
 		goto out;
@@ -286,7 +288,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
 	aca_get(aca);
 	write_unlock_bh(&idev->lock);
 
-	ip6_ins_rt(rt);
+	ip6_ins_rt(net, rt);
 
 	addrconf_join_solict(idev->dev, &aca->aca_addr);
 
@@ -329,7 +331,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr)
 	addrconf_leave_solict(idev, &aca->aca_addr);
 
 	dst_hold(&aca->aca_rt->dst);
-	ip6_del_rt(aca->aca_rt);
+	ip6_del_rt(dev_net(idev->dev), aca->aca_rt);
 
 	aca_put(aca);
 	return 0;
@@ -357,7 +359,7 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev)
 		addrconf_leave_solict(idev, &aca->aca_addr);
 
 		dst_hold(&aca->aca_rt->dst);
-		ip6_del_rt(aca->aca_rt);
+		ip6_del_rt(dev_net(idev->dev), aca->aca_rt);
 
 		aca_put(aca);
 
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 9de4dfb126ba..3fbc3805e69b 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1156,6 +1156,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 	struct neighbour *neigh = NULL;
 	struct inet6_dev *in6_dev;
 	struct rt6_info *rt = NULL;
+	struct net *net;
 	int lifetime;
 	struct ndisc_options ndopts;
 	int optlen;
@@ -1253,9 +1254,9 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 	/* Do not accept RA with source-addr found on local machine unless
 	 * accept_ra_from_local is set to true.
 	 */
+	net = dev_net(in6_dev->dev);
 	if (!in6_dev->cnf.accept_ra_from_local &&
-	    ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr,
-			  in6_dev->dev, 0)) {
+	    ipv6_chk_addr(net, &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) {
 		ND_PRINTK(2, info,
 			  "RA from local address detected on dev: %s: default router ignored\n",
 			  skb->dev->name);
@@ -1272,7 +1273,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 		pref = ICMPV6_ROUTER_PREF_MEDIUM;
 #endif
 
-	rt = rt6_get_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev);
+	rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev);
 
 	if (rt) {
 		neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr);
@@ -1285,7 +1286,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 		}
 	}
 	if (rt && lifetime == 0) {
-		ip6_del_rt(rt);
+		ip6_del_rt(net, rt);
 		rt = NULL;
 	}
 
@@ -1294,7 +1295,8 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 	if (!rt && lifetime) {
 		ND_PRINTK(3, info, "RA: adding default router\n");
 
-		rt = rt6_add_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev, pref);
+		rt = rt6_add_dflt_router(net, &ipv6_hdr(skb)->saddr,
+					 skb->dev, pref);
 		if (!rt) {
 			ND_PRINTK(0, err,
 				  "RA: %s failed to add default route\n",
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 0a99cda9fd7b..045811a3da76 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -850,13 +850,13 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 	}
 
 	if (rinfo->prefix_len == 0)
-		rt = rt6_get_dflt_router(gwaddr, dev);
+		rt = rt6_get_dflt_router(net, gwaddr, dev);
 	else
 		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
 					gwaddr, dev);
 
 	if (rt && !lifetime) {
-		ip6_del_rt(rt);
+		ip6_del_rt(net, rt);
 		rt = NULL;
 	}
 
@@ -1014,9 +1014,9 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
 	return err;
 }
 
-int ip6_ins_rt(struct rt6_info *rt)
+int ip6_ins_rt(struct net *net, struct rt6_info *rt)
 {
-	struct nl_info info = {	.nl_net = dev_net(rt->dst.dev), };
+	struct nl_info info = {	.nl_net = net, };
 	struct mx6_config mxc = { .mx = NULL, };
 
 	/* Hold dst to account for the reference from the fib6 tree */
@@ -1121,14 +1121,13 @@ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
 	return pcpu_rt;
 }
 
-static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
+static struct rt6_info *rt6_make_pcpu_route(struct net *net,
+					    struct rt6_info *rt)
 {
 	struct rt6_info *pcpu_rt, *prev, **p;
 
 	pcpu_rt = ip6_rt_pcpu_alloc(rt);
 	if (!pcpu_rt) {
-		struct net *net = dev_net(rt->dst.dev);
-
 		dst_hold(&net->ipv6.ip6_null_entry->dst);
 		return net->ipv6.ip6_null_entry;
 	}
@@ -1787,7 +1786,7 @@ uncached_rt_out:
 				/* No dst_hold() on rt is needed because grabbing
 				 * rt->rt6i_ref makes sure rt can't be released.
 				 */
-				pcpu_rt = rt6_make_pcpu_route(rt);
+				pcpu_rt = rt6_make_pcpu_route(net, rt);
 				rt6_release(rt);
 			} else {
 				/* rt is already removed from tree */
@@ -2088,7 +2087,7 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
 	if (rt) {
 		if (rt->rt6i_flags & RTF_CACHE) {
 			if (rt6_check_expired(rt)) {
-				ip6_del_rt(rt);
+				ip6_del_rt(dev_net(dst->dev), rt);
 				dst = NULL;
 			}
 		} else {
@@ -2109,7 +2108,7 @@ static void ip6_link_failure(struct sk_buff *skb)
 	if (rt) {
 		if (rt->rt6i_flags & RTF_CACHE) {
 			if (dst_hold_safe(&rt->dst))
-				ip6_del_rt(rt);
+				ip6_del_rt(dev_net(rt->dst.dev), rt);
 		} else {
 			struct fib6_node *fn;
 
@@ -3018,9 +3017,9 @@ out:
 
 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
 {
-	int err;
+	struct net *net = info->nl_net;
 	struct fib6_table *table;
-	struct net *net = dev_net(rt->dst.dev);
+	int err;
 
 	if (rt == net->ipv6.ip6_null_entry) {
 		err = -ENOENT;
@@ -3037,11 +3036,10 @@ out:
 	return err;
 }
 
-int ip6_del_rt(struct rt6_info *rt)
+int ip6_del_rt(struct net *net, struct rt6_info *rt)
 {
-	struct nl_info info = {
-		.nl_net = dev_net(rt->dst.dev),
-	};
+	struct nl_info info = { .nl_net = net };
+
 	return __ip6_del_rt(rt, &info);
 }
 
@@ -3376,13 +3374,15 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
 }
 #endif
 
-struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
+struct rt6_info *rt6_get_dflt_router(struct net *net,
+				     const struct in6_addr *addr,
+				     struct net_device *dev)
 {
 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
 	struct rt6_info *rt;
 	struct fib6_table *table;
 
-	table = fib6_get_table(dev_net(dev), tb_id);
+	table = fib6_get_table(net, tb_id);
 	if (!table)
 		return NULL;
 
@@ -3399,7 +3399,8 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev
 	return rt;
 }
 
-struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
+struct rt6_info *rt6_add_dflt_router(struct net *net,
+				     const struct in6_addr *gwaddr,
 				     struct net_device *dev,
 				     unsigned int pref)
 {
@@ -3412,7 +3413,7 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
 		.fc_protocol = RTPROT_RA,
 		.fc_nlinfo.portid = 0,
 		.fc_nlinfo.nlh = NULL,
-		.fc_nlinfo.nl_net = dev_net(dev),
+		.fc_nlinfo.nl_net = net,
 	};
 
 	cfg.fc_gateway = *gwaddr;
@@ -3425,10 +3426,11 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
 	}
 
-	return rt6_get_dflt_router(gwaddr, dev);
+	return rt6_get_dflt_router(net, gwaddr, dev);
 }
 
-static void __rt6_purge_dflt_routers(struct fib6_table *table)
+static void __rt6_purge_dflt_routers(struct net *net,
+				     struct fib6_table *table)
 {
 	struct rt6_info *rt;
 
@@ -3439,7 +3441,7 @@ restart:
 		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
 			if (dst_hold_safe(&rt->dst)) {
 				rcu_read_unlock();
-				ip6_del_rt(rt);
+				ip6_del_rt(net, rt);
 			} else {
 				rcu_read_unlock();
 			}
@@ -3463,7 +3465,7 @@ void rt6_purge_dflt_routers(struct net *net)
 		head = &net->ipv6.fib_table_hash[h];
 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
-				__rt6_purge_dflt_routers(table);
+				__rt6_purge_dflt_routers(net, table);
 		}
 	}
 
@@ -3583,12 +3585,12 @@ static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff
  *	Allocate a dst for local (unicast / anycast) address.
  */
 
-struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
+struct rt6_info *addrconf_dst_alloc(struct net *net,
+				    struct inet6_dev *idev,
 				    const struct in6_addr *addr,
 				    bool anycast)
 {
 	u32 tb_id;
-	struct net *net = dev_net(idev->dev);
 	struct net_device *dev = idev->dev;
 	struct rt6_info *rt;
 
-- 
cgit v1.2.3-59-g8ed1b


From ae90d867f99b86152a7c2a979f8a466d8e4ac2ad Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 17 Apr 2018 17:33:12 -0700
Subject: net/ipv6: Move support functions up in route.c

Code move only.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 119 +++++++++++++++++++++++++++----------------------------
 1 file changed, 59 insertions(+), 60 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 045811a3da76..0daf4c9c9f2b 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -78,7 +78,6 @@ enum rt6_nud_state {
 	RT6_NUD_SUCCEED = 1
 };
 
-static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
@@ -879,6 +878,65 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 }
 #endif
 
+/*
+ *	Misc support functions
+ */
+
+/* called with rcu_lock held */
+static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
+{
+	struct net_device *dev = rt->dst.dev;
+
+	if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
+		/* for copies of local routes, dst->dev needs to be the
+		 * device if it is a master device, the master device if
+		 * device is enslaved, and the loopback as the default
+		 */
+		if (netif_is_l3_slave(dev) &&
+		    !rt6_need_strict(&rt->rt6i_dst.addr))
+			dev = l3mdev_master_dev_rcu(dev);
+		else if (!netif_is_l3_master(dev))
+			dev = dev_net(dev)->loopback_dev;
+		/* last case is netif_is_l3_master(dev) is true in which
+		 * case we want dev returned to be dev
+		 */
+	}
+
+	return dev;
+}
+
+static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
+{
+	BUG_ON(from->from);
+
+	rt->rt6i_flags &= ~RTF_EXPIRES;
+	dst_hold(&from->dst);
+	rt->from = from;
+	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
+}
+
+static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
+{
+	rt->dst.input = ort->dst.input;
+	rt->dst.output = ort->dst.output;
+	rt->rt6i_dst = ort->rt6i_dst;
+	rt->dst.error = ort->dst.error;
+	rt->rt6i_idev = ort->rt6i_idev;
+	if (rt->rt6i_idev)
+		in6_dev_hold(rt->rt6i_idev);
+	rt->dst.lastuse = jiffies;
+	rt->rt6i_gateway = ort->rt6i_gateway;
+	rt->rt6i_flags = ort->rt6i_flags;
+	rt6_set_from(rt, ort);
+	rt->rt6i_metric = ort->rt6i_metric;
+#ifdef CONFIG_IPV6_SUBTREES
+	rt->rt6i_src = ort->rt6i_src;
+#endif
+	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
+	rt->rt6i_table = ort->rt6i_table;
+	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
+}
+
 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
 					struct in6_addr *saddr)
 {
@@ -1024,29 +1082,6 @@ int ip6_ins_rt(struct net *net, struct rt6_info *rt)
 	return __ip6_ins_rt(rt, &info, &mxc, NULL);
 }
 
-/* called with rcu_lock held */
-static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
-{
-	struct net_device *dev = rt->dst.dev;
-
-	if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
-		/* for copies of local routes, dst->dev needs to be the
-		 * device if it is a master device, the master device if
-		 * device is enslaved, and the loopback as the default
-		 */
-		if (netif_is_l3_slave(dev) &&
-		    !rt6_need_strict(&rt->rt6i_dst.addr))
-			dev = l3mdev_master_dev_rcu(dev);
-		else if (!netif_is_l3_master(dev))
-			dev = dev_net(dev)->loopback_dev;
-		/* last case is netif_is_l3_master(dev) is true in which
-		 * case we want dev returned to be dev
-		 */
-	}
-
-	return dev;
-}
-
 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
 					   const struct in6_addr *daddr,
 					   const struct in6_addr *saddr)
@@ -3270,42 +3305,6 @@ out:
 	neigh_release(neigh);
 }
 
-/*
- *	Misc support functions
- */
-
-static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
-{
-	BUG_ON(from->from);
-
-	rt->rt6i_flags &= ~RTF_EXPIRES;
-	dst_hold(&from->dst);
-	rt->from = from;
-	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
-}
-
-static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
-{
-	rt->dst.input = ort->dst.input;
-	rt->dst.output = ort->dst.output;
-	rt->rt6i_dst = ort->rt6i_dst;
-	rt->dst.error = ort->dst.error;
-	rt->rt6i_idev = ort->rt6i_idev;
-	if (rt->rt6i_idev)
-		in6_dev_hold(rt->rt6i_idev);
-	rt->dst.lastuse = jiffies;
-	rt->rt6i_gateway = ort->rt6i_gateway;
-	rt->rt6i_flags = ort->rt6i_flags;
-	rt6_set_from(rt, ort);
-	rt->rt6i_metric = ort->rt6i_metric;
-#ifdef CONFIG_IPV6_SUBTREES
-	rt->rt6i_src = ort->rt6i_src;
-#endif
-	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
-	rt->rt6i_table = ort->rt6i_table;
-	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
-}
-
 #ifdef CONFIG_IPV6_ROUTE_INFO
 static struct rt6_info *rt6_get_route_info(struct net *net,
 					   const struct in6_addr *prefix, int prefixlen,
-- 
cgit v1.2.3-59-g8ed1b


From e8478e80e5a74f4ce47b043735f0066588fb64c7 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 17 Apr 2018 17:33:13 -0700
Subject: net/ipv6: Save route type in rt6_info

The RTN_ type for IPv6 FIB entries is currently embedded in rt6i_flags
and dst.error. Since dst is going to be removed, it can no longer be
relied on for FIB dumps so save the route type as fib6_type.

fc_type is set in current users based on the algorithm in rt6_fill_node:
  - rt6i_flags contains RTF_LOCAL: fc_type = RTN_LOCAL
  - rt6i_flags contains RTF_ANYCAST: fc_type = RTN_ANYCAST
  - else fc_type = RTN_UNICAST

Similarly, fib6_type is set in the rt6_info templates based on the
RTF_REJECT section of rt6_fill_node converting dst.error to RTN type.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  1 +
 net/ipv6/addrconf.c   |  2 ++
 net/ipv6/route.c      | 46 ++++++++++++++++++++--------------------------
 3 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index f0aaf1c8f1a8..0165820bbafb 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -174,6 +174,7 @@ struct rt6_info {
 	int				rt6i_nh_weight;
 	unsigned short			rt6i_nfheader_len;
 	u8				rt6i_protocol;
+	u8				fib6_type;
 	u8				exception_bucket_flushed:1,
 					should_flush:1,
 					unused:6;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 7ff7466c52e5..f71fcf2635d5 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2331,6 +2331,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
 		.fc_flags = RTF_UP | flags,
 		.fc_nlinfo.nl_net = dev_net(dev),
 		.fc_protocol = RTPROT_KERNEL,
+		.fc_type = RTN_UNICAST,
 	};
 
 	cfg.fc_dst = *pfx;
@@ -2394,6 +2395,7 @@ static void addrconf_add_mroute(struct net_device *dev)
 		.fc_ifindex = dev->ifindex,
 		.fc_dst_len = 8,
 		.fc_flags = RTF_UP,
+		.fc_type = RTN_UNICAST,
 		.fc_nlinfo.nl_net = dev_net(dev),
 	};
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 0daf4c9c9f2b..1cc01f5bb773 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -307,6 +307,7 @@ static const struct rt6_info ip6_null_entry_template = {
 	.rt6i_protocol  = RTPROT_KERNEL,
 	.rt6i_metric	= ~(u32) 0,
 	.rt6i_ref	= ATOMIC_INIT(1),
+	.fib6_type	= RTN_UNREACHABLE,
 };
 
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
@@ -324,6 +325,7 @@ static const struct rt6_info ip6_prohibit_entry_template = {
 	.rt6i_protocol  = RTPROT_KERNEL,
 	.rt6i_metric	= ~(u32) 0,
 	.rt6i_ref	= ATOMIC_INIT(1),
+	.fib6_type	= RTN_PROHIBIT,
 };
 
 static const struct rt6_info ip6_blk_hole_entry_template = {
@@ -339,6 +341,7 @@ static const struct rt6_info ip6_blk_hole_entry_template = {
 	.rt6i_protocol  = RTPROT_KERNEL,
 	.rt6i_metric	= ~(u32) 0,
 	.rt6i_ref	= ATOMIC_INIT(1),
+	.fib6_type	= RTN_BLACKHOLE,
 };
 
 #endif
@@ -2802,6 +2805,11 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 		goto out;
 	}
 
+	if (cfg->fc_type > RTN_MAX) {
+		NL_SET_ERR_MSG(extack, "Invalid route type");
+		goto out;
+	}
+
 	if (cfg->fc_dst_len > 128) {
 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
 		goto out;
@@ -2914,6 +2922,8 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 	rt->rt6i_metric = cfg->fc_metric;
 	rt->rt6i_nh_weight = 1;
 
+	rt->fib6_type = cfg->fc_type;
+
 	/* We cannot add true routes via loopback here,
 	   they would result in kernel looping; promote them to reject routes
 	 */
@@ -3354,6 +3364,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
 				  RTF_UP | RTF_PREF(pref),
 		.fc_protocol = RTPROT_RA,
+		.fc_type = RTN_UNICAST,
 		.fc_nlinfo.portid = 0,
 		.fc_nlinfo.nlh = NULL,
 		.fc_nlinfo.nl_net = net,
@@ -3410,6 +3421,7 @@ struct rt6_info *rt6_add_dflt_router(struct net *net,
 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
 		.fc_protocol = RTPROT_RA,
+		.fc_type = RTN_UNICAST,
 		.fc_nlinfo.portid = 0,
 		.fc_nlinfo.nlh = NULL,
 		.fc_nlinfo.nl_net = net,
@@ -3485,6 +3497,7 @@ static void rtmsg_to_fib6_config(struct net *net,
 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
 	cfg->fc_flags = rtmsg->rtmsg_flags;
+	cfg->fc_type = rtmsg->rtmsg_type;
 
 	cfg->fc_nlinfo.nl_net = net;
 
@@ -3606,10 +3619,13 @@ struct rt6_info *addrconf_dst_alloc(struct net *net,
 
 	rt->rt6i_protocol = RTPROT_KERNEL;
 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
-	if (anycast)
+	if (anycast) {
+		rt->fib6_type = RTN_ANYCAST;
 		rt->rt6i_flags |= RTF_ANYCAST;
-	else
+	} else {
+		rt->fib6_type = RTN_LOCAL;
 		rt->rt6i_flags |= RTF_LOCAL;
+	}
 
 	rt->rt6i_gateway  = *addr;
 	rt->rt6i_dst.addr = *addr;
@@ -4509,30 +4525,8 @@ static int rt6_fill_node(struct net *net,
 	rtm->rtm_table = table;
 	if (nla_put_u32(skb, RTA_TABLE, table))
 		goto nla_put_failure;
-	if (rt->rt6i_flags & RTF_REJECT) {
-		switch (rt->dst.error) {
-		case -EINVAL:
-			rtm->rtm_type = RTN_BLACKHOLE;
-			break;
-		case -EACCES:
-			rtm->rtm_type = RTN_PROHIBIT;
-			break;
-		case -EAGAIN:
-			rtm->rtm_type = RTN_THROW;
-			break;
-		default:
-			rtm->rtm_type = RTN_UNREACHABLE;
-			break;
-		}
-	}
-	else if (rt->rt6i_flags & RTF_LOCAL)
-		rtm->rtm_type = RTN_LOCAL;
-	else if (rt->rt6i_flags & RTF_ANYCAST)
-		rtm->rtm_type = RTN_ANYCAST;
-	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
-		rtm->rtm_type = RTN_LOCAL;
-	else
-		rtm->rtm_type = RTN_UNICAST;
+
+	rtm->rtm_type = rt->fib6_type;
 	rtm->rtm_flags = 0;
 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
 	rtm->rtm_protocol = rt->rt6i_protocol;
-- 
cgit v1.2.3-59-g8ed1b


From 5e670d844b2a4e47d1b9b9aceb14dd3c12a6d4bf Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 17 Apr 2018 17:33:14 -0700
Subject: net/ipv6: Move nexthop data to fib6_nh

Introduce fib6_nh structure and move nexthop related data from
rt6_info and rt6_info.dst to fib6_nh. References to dev, gateway or
lwtstate from a FIB lookup perspective are converted to use fib6_nh;
datapath references to dst version are left as is.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_router.c  |  32 ++--
 include/net/ip6_fib.h                              |  16 +-
 include/net/ip6_route.h                            |   6 +-
 net/ipv6/addrconf.c                                |   2 +-
 net/ipv6/ip6_fib.c                                 |   6 +-
 net/ipv6/route.c                                   | 162 +++++++++++----------
 6 files changed, 125 insertions(+), 99 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 1904c0323d39..d995a0b52d7c 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -2770,9 +2770,9 @@ mlxsw_sp_nexthop6_group_cmp(const struct mlxsw_sp_nexthop_group *nh_grp,
 		struct in6_addr *gw;
 		int ifindex, weight;
 
-		ifindex = mlxsw_sp_rt6->rt->dst.dev->ifindex;
-		weight = mlxsw_sp_rt6->rt->rt6i_nh_weight;
-		gw = &mlxsw_sp_rt6->rt->rt6i_gateway;
+		ifindex = mlxsw_sp_rt6->rt->fib6_nh.nh_dev->ifindex;
+		weight = mlxsw_sp_rt6->rt->fib6_nh.nh_weight;
+		gw = &mlxsw_sp_rt6->rt->fib6_nh.nh_gw;
 		if (!mlxsw_sp_nexthop6_group_has_nexthop(nh_grp, gw, ifindex,
 							 weight))
 			return false;
@@ -2838,7 +2838,7 @@ mlxsw_sp_nexthop6_group_hash(struct mlxsw_sp_fib6_entry *fib6_entry, u32 seed)
 	struct net_device *dev;
 
 	list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) {
-		dev = mlxsw_sp_rt6->rt->dst.dev;
+		dev = mlxsw_sp_rt6->rt->fib6_nh.nh_dev;
 		val ^= dev->ifindex;
 	}
 
@@ -3836,9 +3836,9 @@ mlxsw_sp_rt6_nexthop(struct mlxsw_sp_nexthop_group *nh_grp,
 		struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i];
 		struct rt6_info *rt = mlxsw_sp_rt6->rt;
 
-		if (nh->rif && nh->rif->dev == rt->dst.dev &&
+		if (nh->rif && nh->rif->dev == rt->fib6_nh.nh_dev &&
 		    ipv6_addr_equal((const struct in6_addr *) &nh->gw_addr,
-				    &rt->rt6i_gateway))
+				    &rt->fib6_nh.nh_gw))
 			return nh;
 		continue;
 	}
@@ -3895,7 +3895,7 @@ mlxsw_sp_fib6_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry)
 
 	if (fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_LOCAL) {
 		list_first_entry(&fib6_entry->rt6_list, struct mlxsw_sp_rt6,
-				 list)->rt->rt6i_nh_flags |= RTNH_F_OFFLOAD;
+				 list)->rt->fib6_nh.nh_flags |= RTNH_F_OFFLOAD;
 		return;
 	}
 
@@ -3905,9 +3905,9 @@ mlxsw_sp_fib6_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry)
 
 		nh = mlxsw_sp_rt6_nexthop(nh_grp, mlxsw_sp_rt6);
 		if (nh && nh->offloaded)
-			mlxsw_sp_rt6->rt->rt6i_nh_flags |= RTNH_F_OFFLOAD;
+			mlxsw_sp_rt6->rt->fib6_nh.nh_flags |= RTNH_F_OFFLOAD;
 		else
-			mlxsw_sp_rt6->rt->rt6i_nh_flags &= ~RTNH_F_OFFLOAD;
+			mlxsw_sp_rt6->rt->fib6_nh.nh_flags &= ~RTNH_F_OFFLOAD;
 	}
 }
 
@@ -3922,7 +3922,7 @@ mlxsw_sp_fib6_entry_offload_unset(struct mlxsw_sp_fib_entry *fib_entry)
 	list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) {
 		struct rt6_info *rt = mlxsw_sp_rt6->rt;
 
-		rt->rt6i_nh_flags &= ~RTNH_F_OFFLOAD;
+		rt->fib6_nh.nh_flags &= ~RTNH_F_OFFLOAD;
 	}
 }
 
@@ -4818,8 +4818,8 @@ static bool mlxsw_sp_nexthop6_ipip_type(const struct mlxsw_sp *mlxsw_sp,
 					const struct rt6_info *rt,
 					enum mlxsw_sp_ipip_type *ret)
 {
-	return rt->dst.dev &&
-	       mlxsw_sp_netdev_ipip_type(mlxsw_sp, rt->dst.dev, ret);
+	return rt->fib6_nh.nh_dev &&
+	       mlxsw_sp_netdev_ipip_type(mlxsw_sp, rt->fib6_nh.nh_dev, ret);
 }
 
 static int mlxsw_sp_nexthop6_type_init(struct mlxsw_sp *mlxsw_sp,
@@ -4829,7 +4829,7 @@ static int mlxsw_sp_nexthop6_type_init(struct mlxsw_sp *mlxsw_sp,
 {
 	const struct mlxsw_sp_ipip_ops *ipip_ops;
 	struct mlxsw_sp_ipip_entry *ipip_entry;
-	struct net_device *dev = rt->dst.dev;
+	struct net_device *dev = rt->fib6_nh.nh_dev;
 	struct mlxsw_sp_rif *rif;
 	int err;
 
@@ -4872,11 +4872,11 @@ static int mlxsw_sp_nexthop6_init(struct mlxsw_sp *mlxsw_sp,
 				  struct mlxsw_sp_nexthop *nh,
 				  const struct rt6_info *rt)
 {
-	struct net_device *dev = rt->dst.dev;
+	struct net_device *dev = rt->fib6_nh.nh_dev;
 
 	nh->nh_grp = nh_grp;
-	nh->nh_weight = rt->rt6i_nh_weight;
-	memcpy(&nh->gw_addr, &rt->rt6i_gateway, sizeof(nh->gw_addr));
+	nh->nh_weight = rt->fib6_nh.nh_weight;
+	memcpy(&nh->gw_addr, &rt->fib6_nh.nh_gw, sizeof(nh->gw_addr));
 	mlxsw_sp_nexthop_counter_alloc(mlxsw_sp, nh);
 
 	list_add_tail(&nh->router_list_node, &mlxsw_sp->router->nexthop_list);
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 0165820bbafb..f0a88370ba95 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -127,6 +127,16 @@ struct rt6_exception {
 #define FIB6_EXCEPTION_BUCKET_SIZE (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT)
 #define FIB6_MAX_DEPTH 5
 
+struct fib6_nh {
+	struct in6_addr		nh_gw;
+	struct net_device	*nh_dev;
+	struct lwtunnel_state	*nh_lwtstate;
+
+	unsigned int		nh_flags;
+	atomic_t		nh_upper_bound;
+	int			nh_weight;
+};
+
 struct rt6_info {
 	struct dst_entry		dst;
 	struct rt6_info __rcu		*rt6_next;
@@ -149,12 +159,9 @@ struct rt6_info {
 	 */
 	struct list_head		rt6i_siblings;
 	unsigned int			rt6i_nsiblings;
-	atomic_t			rt6i_nh_upper_bound;
 
 	atomic_t			rt6i_ref;
 
-	unsigned int			rt6i_nh_flags;
-
 	/* These are in a separate cache line. */
 	struct rt6key			rt6i_dst ____cacheline_aligned_in_smp;
 	u32				rt6i_flags;
@@ -171,13 +178,14 @@ struct rt6_info {
 	u32				rt6i_metric;
 	u32				rt6i_pmtu;
 	/* more non-fragment space at head required */
-	int				rt6i_nh_weight;
 	unsigned short			rt6i_nfheader_len;
 	u8				rt6i_protocol;
 	u8				fib6_type;
 	u8				exception_bucket_flushed:1,
 					should_flush:1,
 					unused:6;
+
+	struct fib6_nh			fib6_nh;
 };
 
 #define for_each_fib6_node_rt_rcu(fn)					\
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 1130a1144dfd..655e13017a45 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -273,10 +273,10 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt,
 
 static inline bool rt6_duplicate_nexthop(struct rt6_info *a, struct rt6_info *b)
 {
-	return a->dst.dev == b->dst.dev &&
+	return a->fib6_nh.nh_dev == b->fib6_nh.nh_dev &&
 	       a->rt6i_idev == b->rt6i_idev &&
-	       ipv6_addr_equal(&a->rt6i_gateway, &b->rt6i_gateway) &&
-	       !lwtunnel_cmp_encap(a->dst.lwtstate, b->dst.lwtstate);
+	       ipv6_addr_equal(&a->fib6_nh.nh_gw, &b->fib6_nh.nh_gw) &&
+	       !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate);
 }
 
 #endif
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index f71fcf2635d5..483c8772e856 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2369,7 +2369,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
 		goto out;
 
 	for_each_fib6_node_rt_rcu(fn) {
-		if (rt->dst.dev->ifindex != dev->ifindex)
+		if (rt->fib6_nh.nh_dev->ifindex != dev->ifindex)
 			continue;
 		if ((rt->rt6i_flags & flags) != flags)
 			continue;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 74d2a3748e2f..64b73e65f114 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -2221,6 +2221,7 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v)
 {
 	struct rt6_info *rt = v;
 	struct ipv6_route_iter *iter = seq->private;
+	const struct net_device *dev;
 
 	seq_printf(seq, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
 
@@ -2230,14 +2231,15 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v)
 	seq_puts(seq, "00000000000000000000000000000000 00 ");
 #endif
 	if (rt->rt6i_flags & RTF_GATEWAY)
-		seq_printf(seq, "%pi6", &rt->rt6i_gateway);
+		seq_printf(seq, "%pi6", &rt->fib6_nh.nh_gw);
 	else
 		seq_puts(seq, "00000000000000000000000000000000");
 
+	dev = rt->fib6_nh.nh_dev;
 	seq_printf(seq, " %08x %08x %08x %08x %8s\n",
 		   rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
 		   rt->dst.__use, rt->rt6i_flags,
-		   rt->dst.dev ? rt->dst.dev->name : "");
+		   dev ? dev->name : "");
 	iter->w.leaf = NULL;
 	return 0;
 }
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1cc01f5bb773..222af19d3403 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -466,12 +466,15 @@ static struct rt6_info *rt6_multipath_select(const struct net *net,
 	if (!fl6->mp_hash)
 		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
 
-	if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
+	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
 		return match;
 
 	list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
 				 rt6i_siblings) {
-		if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
+		int nh_upper_bound;
+
+		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
+		if (fl6->mp_hash > nh_upper_bound)
 			continue;
 		if (rt6_score_route(sibling, oif, strict) < 0)
 			break;
@@ -495,13 +498,14 @@ static inline struct rt6_info *rt6_device_match(struct net *net,
 	struct rt6_info *local = NULL;
 	struct rt6_info *sprt;
 
-	if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
+	if (!oif && ipv6_addr_any(saddr) &&
+	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
 		return rt;
 
 	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
-		struct net_device *dev = sprt->dst.dev;
+		const struct net_device *dev = sprt->fib6_nh.nh_dev;
 
-		if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
+		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
 			continue;
 
 		if (oif) {
@@ -533,7 +537,7 @@ static inline struct rt6_info *rt6_device_match(struct net *net,
 			return net->ipv6.ip6_null_entry;
 	}
 
-	return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
+	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
 }
 
 #ifdef CONFIG_IPV6_ROUTER_PREF
@@ -558,7 +562,10 @@ static void rt6_probe_deferred(struct work_struct *w)
 static void rt6_probe(struct rt6_info *rt)
 {
 	struct __rt6_probe_work *work;
+	const struct in6_addr *nh_gw;
 	struct neighbour *neigh;
+	struct net_device *dev;
+
 	/*
 	 * Okay, this does not seem to be appropriate
 	 * for now, however, we need to check if it
@@ -569,8 +576,11 @@ static void rt6_probe(struct rt6_info *rt)
 	 */
 	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
 		return;
+
+	nh_gw = &rt->fib6_nh.nh_gw;
+	dev = rt->fib6_nh.nh_dev;
 	rcu_read_lock_bh();
-	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
+	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
 	if (neigh) {
 		if (neigh->nud_state & NUD_VALID)
 			goto out;
@@ -592,9 +602,9 @@ static void rt6_probe(struct rt6_info *rt)
 
 	if (work) {
 		INIT_WORK(&work->work, rt6_probe_deferred);
-		work->target = rt->rt6i_gateway;
-		dev_hold(rt->dst.dev);
-		work->dev = rt->dst.dev;
+		work->target = *nh_gw;
+		dev_hold(dev);
+		work->dev = dev;
 		schedule_work(&work->work);
 	}
 
@@ -612,7 +622,8 @@ static inline void rt6_probe(struct rt6_info *rt)
  */
 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
 {
-	struct net_device *dev = rt->dst.dev;
+	const struct net_device *dev = rt->fib6_nh.nh_dev;
+
 	if (!oif || dev->ifindex == oif)
 		return 2;
 	if ((dev->flags & IFF_LOOPBACK) &&
@@ -623,15 +634,16 @@ static inline int rt6_check_dev(struct rt6_info *rt, int oif)
 
 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
 {
-	struct neighbour *neigh;
 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
+	struct neighbour *neigh;
 
 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
 	    !(rt->rt6i_flags & RTF_GATEWAY))
 		return RT6_NUD_SUCCEED;
 
 	rcu_read_lock_bh();
-	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
+	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
+					  &rt->fib6_nh.nh_gw);
 	if (neigh) {
 		read_lock(&neigh->lock);
 		if (neigh->nud_state & NUD_VALID)
@@ -679,11 +691,11 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
 	bool match_do_rr = false;
 	struct inet6_dev *idev = rt->rt6i_idev;
 
-	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
+	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
 		goto out;
 
 	if (idev->cnf.ignore_routes_with_linkdown &&
-	    rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
+	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
 		goto out;
 
@@ -888,7 +900,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 /* called with rcu_lock held */
 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
 {
-	struct net_device *dev = rt->dst.dev;
+	struct net_device *dev = rt->fib6_nh.nh_dev;
 
 	if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
 		/* for copies of local routes, dst->dev needs to be the
@@ -928,7 +940,7 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
 	if (rt->rt6i_idev)
 		in6_dev_hold(rt->rt6i_idev);
 	rt->dst.lastuse = jiffies;
-	rt->rt6i_gateway = ort->rt6i_gateway;
+	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
 	rt->rt6i_flags = ort->rt6i_flags;
 	rt6_set_from(rt, ort);
 	rt->rt6i_metric = ort->rt6i_metric;
@@ -937,7 +949,7 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
 #endif
 	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
 	rt->rt6i_table = ort->rt6i_table;
-	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
+	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
 }
 
 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
@@ -1308,7 +1320,7 @@ __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
 static int rt6_insert_exception(struct rt6_info *nrt,
 				struct rt6_info *ort)
 {
-	struct net *net = dev_net(ort->dst.dev);
+	struct net *net = dev_net(nrt->dst.dev);
 	struct rt6_exception_bucket *bucket;
 	struct in6_addr *src_key = NULL;
 	struct rt6_exception *rt6_ex;
@@ -2313,7 +2325,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 restart:
 	for_each_fib6_node_rt_rcu(fn) {
-		if (rt->rt6i_nh_flags & RTNH_F_DEAD)
+		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
 			continue;
 		if (rt6_check_expired(rt))
 			continue;
@@ -2321,14 +2333,14 @@ restart:
 			break;
 		if (!(rt->rt6i_flags & RTF_GATEWAY))
 			continue;
-		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
+		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
 			continue;
 		/* rt_cache's gateway might be different from its 'parent'
 		 * in the case of an ip redirect.
 		 * So we keep searching in the exception table if the gateway
 		 * is different.
 		 */
-		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
+		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
 			rt_cache = rt6_find_cached_rt(rt,
 						      &fl6->daddr,
 						      &fl6->saddr);
@@ -2905,7 +2917,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 					   &lwtstate, extack);
 		if (err)
 			goto out;
-		rt->dst.lwtstate = lwtstate_get(lwtstate);
+		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
 		lwtunnel_set_redirect(&rt->dst);
 	}
 
@@ -2920,7 +2932,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 #endif
 
 	rt->rt6i_metric = cfg->fc_metric;
-	rt->rt6i_nh_weight = 1;
+	rt->fib6_nh.nh_weight = 1;
 
 	rt->fib6_type = cfg->fc_type;
 
@@ -2975,7 +2987,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 		if (err)
 			goto out;
 
-		rt->rt6i_gateway = cfg->fc_gateway;
+		rt->fib6_nh.nh_gw = rt->rt6i_gateway = cfg->fc_gateway;
 	}
 
 	err = -ENODEV;
@@ -3010,9 +3022,9 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 install_route:
 	if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
 	    !netif_carrier_ok(dev))
-		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
-	rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
-	rt->dst.dev = dev;
+		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
+	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
+	rt->fib6_nh.nh_dev = rt->dst.dev = dev;
 	rt->rt6i_idev = idev;
 	rt->rt6i_table = table;
 
@@ -3171,11 +3183,11 @@ static int ip6_route_del(struct fib6_config *cfg,
 				rt = rt_cache;
 			}
 			if (cfg->fc_ifindex &&
-			    (!rt->dst.dev ||
-			     rt->dst.dev->ifindex != cfg->fc_ifindex))
+			    (!rt->fib6_nh.nh_dev ||
+			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
 				continue;
 			if (cfg->fc_flags & RTF_GATEWAY &&
-			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
+			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
 				continue;
 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
 				continue;
@@ -3337,11 +3349,11 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
 		goto out;
 
 	for_each_fib6_node_rt_rcu(fn) {
-		if (rt->dst.dev->ifindex != ifindex)
+		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
 			continue;
 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
 			continue;
-		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
+		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
 			continue;
 		ip6_hold_safe(NULL, &rt, false);
 		break;
@@ -3398,9 +3410,9 @@ struct rt6_info *rt6_get_dflt_router(struct net *net,
 
 	rcu_read_lock();
 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
-		if (dev == rt->dst.dev &&
+		if (dev == rt->fib6_nh.nh_dev &&
 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
-		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
+		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
 			break;
 	}
 	if (rt)
@@ -3627,6 +3639,8 @@ struct rt6_info *addrconf_dst_alloc(struct net *net,
 		rt->rt6i_flags |= RTF_LOCAL;
 	}
 
+	rt->fib6_nh.nh_gw = *addr;
+	rt->fib6_nh.nh_dev = dev;
 	rt->rt6i_gateway  = *addr;
 	rt->rt6i_dst.addr = *addr;
 	rt->rt6i_dst.plen = 128;
@@ -3649,7 +3663,7 @@ static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
 
-	if (((void *)rt->dst.dev == dev || !dev) &&
+	if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
 	    rt != net->ipv6.ip6_null_entry &&
 	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
 		spin_lock_bh(&rt6_exception_lock);
@@ -3681,7 +3695,7 @@ static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
 	struct in6_addr *gateway = (struct in6_addr *)arg;
 
 	if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
-	    ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
+	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
 		return -1;
 	}
 
@@ -3729,8 +3743,8 @@ static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
 
 static bool rt6_is_dead(const struct rt6_info *rt)
 {
-	if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
-	    (rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
+	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
+	    (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
 	     rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
 		return true;
 
@@ -3743,11 +3757,11 @@ static int rt6_multipath_total_weight(const struct rt6_info *rt)
 	int total = 0;
 
 	if (!rt6_is_dead(rt))
-		total += rt->rt6i_nh_weight;
+		total += rt->fib6_nh.nh_weight;
 
 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
 		if (!rt6_is_dead(iter))
-			total += iter->rt6i_nh_weight;
+			total += iter->fib6_nh.nh_weight;
 	}
 
 	return total;
@@ -3758,11 +3772,11 @@ static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
 	int upper_bound = -1;
 
 	if (!rt6_is_dead(rt)) {
-		*weight += rt->rt6i_nh_weight;
+		*weight += rt->fib6_nh.nh_weight;
 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
 						    total) - 1;
 	}
-	atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
+	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
 }
 
 static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
@@ -3805,8 +3819,8 @@ static int fib6_ifup(struct rt6_info *rt, void *p_arg)
 	const struct arg_netdev_event *arg = p_arg;
 	struct net *net = dev_net(arg->dev);
 
-	if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
-		rt->rt6i_nh_flags &= ~arg->nh_flags;
+	if (rt != net->ipv6.ip6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
+		rt->fib6_nh.nh_flags &= ~arg->nh_flags;
 		fib6_update_sernum_upto_root(net, rt);
 		rt6_multipath_rebalance(rt);
 	}
@@ -3834,10 +3848,10 @@ static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
 {
 	struct rt6_info *iter;
 
-	if (rt->dst.dev == dev)
+	if (rt->fib6_nh.nh_dev == dev)
 		return true;
 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
-		if (iter->dst.dev == dev)
+		if (iter->fib6_nh.nh_dev == dev)
 			return true;
 
 	return false;
@@ -3858,11 +3872,12 @@ static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
 	struct rt6_info *iter;
 	unsigned int dead = 0;
 
-	if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD)
+	if (rt->fib6_nh.nh_dev == down_dev ||
+	    rt->fib6_nh.nh_flags & RTNH_F_DEAD)
 		dead++;
 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
-		if (iter->dst.dev == down_dev ||
-		    iter->rt6i_nh_flags & RTNH_F_DEAD)
+		if (iter->fib6_nh.nh_dev == down_dev ||
+		    iter->fib6_nh.nh_flags & RTNH_F_DEAD)
 			dead++;
 
 	return dead;
@@ -3874,11 +3889,11 @@ static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
 {
 	struct rt6_info *iter;
 
-	if (rt->dst.dev == dev)
-		rt->rt6i_nh_flags |= nh_flags;
+	if (rt->fib6_nh.nh_dev == dev)
+		rt->fib6_nh.nh_flags |= nh_flags;
 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
-		if (iter->dst.dev == dev)
-			iter->rt6i_nh_flags |= nh_flags;
+		if (iter->fib6_nh.nh_dev == dev)
+			iter->fib6_nh.nh_flags |= nh_flags;
 }
 
 /* called with write lock held for table with rt */
@@ -3893,12 +3908,12 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
 
 	switch (arg->event) {
 	case NETDEV_UNREGISTER:
-		return rt->dst.dev == dev ? -1 : 0;
+		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
 	case NETDEV_DOWN:
 		if (rt->should_flush)
 			return -1;
 		if (!rt->rt6i_nsiblings)
-			return rt->dst.dev == dev ? -1 : 0;
+			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
 		if (rt6_multipath_uses_dev(rt, dev)) {
 			unsigned int count;
 
@@ -3914,10 +3929,10 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
 		}
 		return -2;
 	case NETDEV_CHANGE:
-		if (rt->dst.dev != dev ||
+		if (rt->fib6_nh.nh_dev != dev ||
 		    rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
 			break;
-		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
+		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
 		rt6_multipath_rebalance(rt);
 		break;
 	}
@@ -3969,7 +3984,7 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
 	   Since RFC 1981 doesn't include administrative MTU increase
 	   update PMTU increase is a MUST. (i.e. jumbo frame)
 	 */
-	if (rt->dst.dev == arg->dev &&
+	if (rt->fib6_nh.nh_dev == arg->dev &&
 	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
 		spin_lock_bh(&rt6_exception_lock);
 		if (dst_metric_raw(&rt->dst, RTAX_MTU) &&
@@ -4255,7 +4270,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
 			goto cleanup;
 		}
 
-		rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;
+		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
 
 		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
 		if (err) {
@@ -4412,7 +4427,7 @@ static size_t rt6_nlmsg_size(struct rt6_info *rt)
 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
 			    + NLA_ALIGN(sizeof(struct rtnexthop))
 			    + nla_total_size(16) /* RTA_GATEWAY */
-			    + lwtunnel_get_encap_size(rt->dst.lwtstate);
+			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
 
 		nexthop_len *= rt->rt6i_nsiblings;
 	}
@@ -4430,38 +4445,38 @@ static size_t rt6_nlmsg_size(struct rt6_info *rt)
 	       + nla_total_size(sizeof(struct rta_cacheinfo))
 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
 	       + nla_total_size(1) /* RTA_PREF */
-	       + lwtunnel_get_encap_size(rt->dst.lwtstate)
+	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
 	       + nexthop_len;
 }
 
 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
 			    unsigned int *flags, bool skip_oif)
 {
-	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
+	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
 		*flags |= RTNH_F_DEAD;
 
-	if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
+	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
 		*flags |= RTNH_F_LINKDOWN;
 		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
 			*flags |= RTNH_F_DEAD;
 	}
 
 	if (rt->rt6i_flags & RTF_GATEWAY) {
-		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
+		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
 			goto nla_put_failure;
 	}
 
-	*flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK);
-	if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
+	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
+	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
 		*flags |= RTNH_F_OFFLOAD;
 
 	/* not needed for multipath encoding b/c it has a rtnexthop struct */
-	if (!skip_oif && rt->dst.dev &&
-	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
+	if (!skip_oif && rt->fib6_nh.nh_dev &&
+	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
 		goto nla_put_failure;
 
-	if (rt->dst.lwtstate &&
-	    lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
+	if (rt->fib6_nh.nh_lwtstate &&
+	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
 		goto nla_put_failure;
 
 	return 0;
@@ -4473,6 +4488,7 @@ nla_put_failure:
 /* add multipath next hop */
 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
 {
+	const struct net_device *dev = rt->fib6_nh.nh_dev;
 	struct rtnexthop *rtnh;
 	unsigned int flags = 0;
 
@@ -4480,8 +4496,8 @@ static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
 	if (!rtnh)
 		goto nla_put_failure;
 
-	rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
-	rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
+	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
+	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
 
 	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
 		goto nla_put_failure;
-- 
cgit v1.2.3-59-g8ed1b


From 6edb3c96a5f028e0cee5fe80ad09b866eb2f6179 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 17 Apr 2018 17:33:15 -0700
Subject: net/ipv6: Defer initialization of dst to data path

Defer setting dst input, output and error until fib entry is copied.

The reject path from ip6_route_info_create is moved to a new function
ip6_rt_init_dst_reject with a helper doing the conversion from fib6_type
to dst error.

The remainder of the new ip6_rt_init_dst is an amalgamtion of dst code
from addrconf_dst_alloc and the non-reject path of ip6_route_info_create.
The dst output function is always ip6_output and the input function is
either ip6_input (local routes), ip6_mc_input (multicast routes) or
ip6_forward (anything else).

A couple of places using dst.error are updated to look at rt6i_flags.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 115 +++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 74 insertions(+), 41 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 222af19d3403..3b301aafd2ed 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -920,6 +920,75 @@ static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
 	return dev;
 }
 
+static const int fib6_prop[RTN_MAX + 1] = {
+	[RTN_UNSPEC]	= 0,
+	[RTN_UNICAST]	= 0,
+	[RTN_LOCAL]	= 0,
+	[RTN_BROADCAST]	= 0,
+	[RTN_ANYCAST]	= 0,
+	[RTN_MULTICAST]	= 0,
+	[RTN_BLACKHOLE]	= -EINVAL,
+	[RTN_UNREACHABLE] = -EHOSTUNREACH,
+	[RTN_PROHIBIT]	= -EACCES,
+	[RTN_THROW]	= -EAGAIN,
+	[RTN_NAT]	= -EINVAL,
+	[RTN_XRESOLVE]	= -EINVAL,
+};
+
+static int ip6_rt_type_to_error(u8 fib6_type)
+{
+	return fib6_prop[fib6_type];
+}
+
+static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct rt6_info *ort)
+{
+	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
+
+	switch (ort->fib6_type) {
+	case RTN_BLACKHOLE:
+		rt->dst.output = dst_discard_out;
+		rt->dst.input = dst_discard;
+		break;
+	case RTN_PROHIBIT:
+		rt->dst.output = ip6_pkt_prohibit_out;
+		rt->dst.input = ip6_pkt_prohibit;
+		break;
+	case RTN_THROW:
+	case RTN_UNREACHABLE:
+	default:
+		rt->dst.output = ip6_pkt_discard_out;
+		rt->dst.input = ip6_pkt_discard;
+		break;
+	}
+}
+
+static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort)
+{
+	if (ort->rt6i_flags & RTF_REJECT) {
+		ip6_rt_init_dst_reject(rt, ort);
+		return;
+	}
+
+	rt->dst.error = 0;
+	rt->dst.output = ip6_output;
+
+	if (ort->fib6_type == RTN_LOCAL) {
+		rt->dst.flags |= DST_HOST;
+		rt->dst.input = ip6_input;
+	} else if (ipv6_addr_type(&ort->rt6i_dst.addr) & IPV6_ADDR_MULTICAST) {
+		rt->dst.input = ip6_mc_input;
+	} else {
+		rt->dst.input = ip6_forward;
+	}
+
+	if (ort->fib6_nh.nh_lwtstate) {
+		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
+		lwtunnel_set_redirect(&rt->dst);
+	}
+
+	rt->dst.lastuse = jiffies;
+}
+
 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
 {
 	BUG_ON(from->from);
@@ -932,14 +1001,12 @@ static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
 
 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
 {
-	rt->dst.input = ort->dst.input;
-	rt->dst.output = ort->dst.output;
+	ip6_rt_init_dst(rt, ort);
+
 	rt->rt6i_dst = ort->rt6i_dst;
-	rt->dst.error = ort->dst.error;
 	rt->rt6i_idev = ort->rt6i_idev;
 	if (rt->rt6i_idev)
 		in6_dev_hold(rt->rt6i_idev);
-	rt->dst.lastuse = jiffies;
 	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
 	rt->rt6i_flags = ort->rt6i_flags;
 	rt6_set_from(rt, ort);
@@ -2329,7 +2396,7 @@ restart:
 			continue;
 		if (rt6_check_expired(rt))
 			continue;
-		if (rt->dst.error)
+		if (rt->rt6i_flags & RTF_REJECT)
 			break;
 		if (!(rt->rt6i_flags & RTF_GATEWAY))
 			continue;
@@ -2357,7 +2424,7 @@ restart:
 
 	if (!rt)
 		rt = net->ipv6.ip6_null_entry;
-	else if (rt->dst.error) {
+	else if (rt->rt6i_flags & RTF_REJECT) {
 		rt = net->ipv6.ip6_null_entry;
 		goto out;
 	}
@@ -2900,15 +2967,6 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 
 	addr_type = ipv6_addr_type(&cfg->fc_dst);
 
-	if (addr_type & IPV6_ADDR_MULTICAST)
-		rt->dst.input = ip6_mc_input;
-	else if (cfg->fc_flags & RTF_LOCAL)
-		rt->dst.input = ip6_input;
-	else
-		rt->dst.input = ip6_forward;
-
-	rt->dst.output = ip6_output;
-
 	if (cfg->fc_encap) {
 		struct lwtunnel_state *lwtstate;
 
@@ -2918,7 +2976,6 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 		if (err)
 			goto out;
 		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
-		lwtunnel_set_redirect(&rt->dst);
 	}
 
 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
@@ -2958,27 +3015,6 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 			}
 		}
 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
-		switch (cfg->fc_type) {
-		case RTN_BLACKHOLE:
-			rt->dst.error = -EINVAL;
-			rt->dst.output = dst_discard_out;
-			rt->dst.input = dst_discard;
-			break;
-		case RTN_PROHIBIT:
-			rt->dst.error = -EACCES;
-			rt->dst.output = ip6_pkt_prohibit_out;
-			rt->dst.input = ip6_pkt_prohibit;
-			break;
-		case RTN_THROW:
-		case RTN_UNREACHABLE:
-		default:
-			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
-					: (cfg->fc_type == RTN_UNREACHABLE)
-					? -EHOSTUNREACH : -ENETUNREACH;
-			rt->dst.output = ip6_pkt_discard_out;
-			rt->dst.input = ip6_pkt_discard;
-			break;
-		}
 		goto install_route;
 	}
 
@@ -3623,12 +3659,9 @@ struct rt6_info *addrconf_dst_alloc(struct net *net,
 		return ERR_PTR(-ENOMEM);
 
 	in6_dev_hold(idev);
-
-	rt->dst.flags |= DST_HOST;
-	rt->dst.input = ip6_input;
-	rt->dst.output = ip6_output;
 	rt->rt6i_idev = idev;
 
+	rt->dst.flags |= DST_HOST;
 	rt->rt6i_protocol = RTPROT_KERNEL;
 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
 	if (anycast) {
-- 
cgit v1.2.3-59-g8ed1b


From d4ead6b34b67fd711639324b6465a050bcb197d4 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 17 Apr 2018 17:33:16 -0700
Subject: net/ipv6: move metrics from dst to rt6_info

Similar to IPv4, add fib metrics to the fib struct, which at the moment
is rt6_info. Will be moved to fib6_info in a later patch. Copy metrics
into dst by reference using refcount.

To make the transition:
- add dst_metrics to rt6_info. Default to dst_default_metrics if no
  metrics are passed during route add. No need for a separate pmtu
  entry; it can reference the MTU slot in fib6_metrics

- ip6_convert_metrics allocates memory in the FIB entry and uses
  ip_metrics_convert to copy from netlink attribute to metrics entry

- the convert metrics call is done in ip6_route_info_create simplifying
  the route add path
  + fib6_commit_metrics and fib6_copy_metrics and the temporary
    mx6_config are no longer needed

- add fib6_metric_set helper to change the value of a metric in the
  fib entry since dst_metric_set can no longer be used

- cow_metrics for IPv6 can drop to dst_cow_metrics_generic

- rt6_dst_from_metrics_check is no longer needed

- rt6_fill_node needs the FIB entry and dst as separate arguments to
  keep compatibility with existing output. Current dst address is
  renamed to dest.
  (to be consistent with IPv4 rt6_fill_node really should be split
  into 2 functions similar to fib_dump_info and rt_fill_info)

- rt6_fill_node no longer needs the temporary metrics variable

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  17 ++--
 net/core/dst.c        |   1 +
 net/ipv6/ip6_fib.c    |  66 +++++--------
 net/ipv6/ndisc.c      |  10 +-
 net/ipv6/route.c      | 257 +++++++++++++++++++-------------------------------
 5 files changed, 133 insertions(+), 218 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index f0a88370ba95..1f8dc9d12abb 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -94,11 +94,6 @@ struct fib6_gc_args {
 #define FIB6_SUBTREE(fn)	(rcu_dereference_protected((fn)->subtree, 1))
 #endif
 
-struct mx6_config {
-	const u32 *mx;
-	DECLARE_BITMAP(mx_valid, RTAX_MAX);
-};
-
 /*
  *	routing information
  *
@@ -176,7 +171,6 @@ struct rt6_info {
 	struct rt6_exception_bucket __rcu *rt6i_exception_bucket;
 
 	u32				rt6i_metric;
-	u32				rt6i_pmtu;
 	/* more non-fragment space at head required */
 	unsigned short			rt6i_nfheader_len;
 	u8				rt6i_protocol;
@@ -185,6 +179,8 @@ struct rt6_info {
 					should_flush:1,
 					unused:6;
 
+	struct dst_metrics		*fib6_metrics;
+#define fib6_pmtu		fib6_metrics->metrics[RTAX_MTU-1]
 	struct fib6_nh			fib6_nh;
 };
 
@@ -390,8 +386,7 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg),
 		    void *arg);
 
 int fib6_add(struct fib6_node *root, struct rt6_info *rt,
-	     struct nl_info *info, struct mx6_config *mxc,
-	     struct netlink_ext_ack *extack);
+	     struct nl_info *info, struct netlink_ext_ack *extack);
 int fib6_del(struct rt6_info *rt, struct nl_info *info);
 
 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
@@ -420,6 +415,12 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb);
 void fib6_update_sernum(struct net *net, struct rt6_info *rt);
 void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt);
 
+void fib6_metric_set(struct rt6_info *f6i, int metric, u32 val);
+static inline bool fib6_metric_locked(struct rt6_info *f6i, int metric)
+{
+	return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric));
+}
+
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
 int fib6_rules_init(void);
 void fib6_rules_cleanup(void);
diff --git a/net/core/dst.c b/net/core/dst.c
index 007aa0b08291..2d9b37f8944a 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -58,6 +58,7 @@ const struct dst_metrics dst_default_metrics = {
 	 */
 	.refcnt = REFCOUNT_INIT(1),
 };
+EXPORT_SYMBOL(dst_default_metrics);
 
 void dst_init(struct dst_entry *dst, struct dst_ops *ops,
 	      struct net_device *dev, int initial_ref, int initial_obsolete,
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 64b73e65f114..0d94c56c3e41 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -578,6 +578,24 @@ out:
 	return res;
 }
 
+void fib6_metric_set(struct rt6_info *f6i, int metric, u32 val)
+{
+	if (!f6i)
+		return;
+
+	if (f6i->fib6_metrics == &dst_default_metrics) {
+		struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC);
+
+		if (!p)
+			return;
+
+		refcount_set(&p->refcnt, 1);
+		f6i->fib6_metrics = p;
+	}
+
+	f6i->fib6_metrics->metrics[metric - 1] = val;
+}
+
 /*
  *	Routing Table
  *
@@ -801,38 +819,6 @@ insert_above:
 	return ln;
 }
 
-static void fib6_copy_metrics(u32 *mp, const struct mx6_config *mxc)
-{
-	int i;
-
-	for (i = 0; i < RTAX_MAX; i++) {
-		if (test_bit(i, mxc->mx_valid))
-			mp[i] = mxc->mx[i];
-	}
-}
-
-static int fib6_commit_metrics(struct dst_entry *dst, struct mx6_config *mxc)
-{
-	if (!mxc->mx)
-		return 0;
-
-	if (dst->flags & DST_HOST) {
-		u32 *mp = dst_metrics_write_ptr(dst);
-
-		if (unlikely(!mp))
-			return -ENOMEM;
-
-		fib6_copy_metrics(mp, mxc);
-	} else {
-		dst_init_metrics(dst, mxc->mx, false);
-
-		/* We've stolen mx now. */
-		mxc->mx = NULL;
-	}
-
-	return 0;
-}
-
 static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
 			  struct net *net)
 {
@@ -866,7 +852,7 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
  */
 
 static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
-			    struct nl_info *info, struct mx6_config *mxc,
+			    struct nl_info *info,
 			    struct netlink_ext_ack *extack)
 {
 	struct rt6_info *leaf = rcu_dereference_protected(fn->leaf,
@@ -923,7 +909,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 					rt6_clean_expires(iter);
 				else
 					rt6_set_expires(iter, rt->dst.expires);
-				iter->rt6i_pmtu = rt->rt6i_pmtu;
+				fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu);
 				return -EEXIST;
 			}
 			/* If we have the same destination and the same metric,
@@ -1002,9 +988,6 @@ next_iter:
 
 add:
 		nlflags |= NLM_F_CREATE;
-		err = fib6_commit_metrics(&rt->dst, mxc);
-		if (err)
-			return err;
 
 		err = call_fib6_entry_notifiers(info->nl_net,
 						FIB_EVENT_ENTRY_ADD,
@@ -1035,10 +1018,6 @@ add:
 			return -ENOENT;
 		}
 
-		err = fib6_commit_metrics(&rt->dst, mxc);
-		if (err)
-			return err;
-
 		err = call_fib6_entry_notifiers(info->nl_net,
 						FIB_EVENT_ENTRY_REPLACE,
 						rt, extack);
@@ -1135,8 +1114,7 @@ void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt)
  */
 
 int fib6_add(struct fib6_node *root, struct rt6_info *rt,
-	     struct nl_info *info, struct mx6_config *mxc,
-	     struct netlink_ext_ack *extack)
+	     struct nl_info *info, struct netlink_ext_ack *extack)
 {
 	struct fib6_table *table = rt->rt6i_table;
 	struct fib6_node *fn, *pn = NULL;
@@ -1244,7 +1222,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 	}
 #endif
 
-	err = fib6_add_rt2node(fn, rt, info, mxc, extack);
+	err = fib6_add_rt2node(fn, rt, info, extack);
 	if (!err) {
 		__fib6_update_sernum_upto_root(rt, sernum);
 		fib6_start_gc(info->nl_net, rt);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 3fbc3805e69b..b058ea9ecec0 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1323,9 +1323,8 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 	    ra_msg->icmph.icmp6_hop_limit) {
 		if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) {
 			in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit;
-			if (rt)
-				dst_metric_set(&rt->dst, RTAX_HOPLIMIT,
-					       ra_msg->icmph.icmp6_hop_limit);
+			fib6_metric_set(rt, RTAX_HOPLIMIT,
+					ra_msg->icmph.icmp6_hop_limit);
 		} else {
 			ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n");
 		}
@@ -1477,10 +1476,7 @@ skip_routeinfo:
 			ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu);
 		} else if (in6_dev->cnf.mtu6 != mtu) {
 			in6_dev->cnf.mtu6 = mtu;
-
-			if (rt)
-				dst_metric_set(&rt->dst, RTAX_MTU, mtu);
-
+			fib6_metric_set(rt, RTAX_MTU, mtu);
 			rt6_mtu_change(skb->dev, mtu);
 		}
 	}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 3b301aafd2ed..62aafd06c35f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -96,12 +96,11 @@ static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 					   struct sk_buff *skb, u32 mtu);
 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
 					struct sk_buff *skb);
-static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
 static size_t rt6_nlmsg_size(struct rt6_info *rt);
-static int rt6_fill_node(struct net *net,
-			 struct sk_buff *skb, struct rt6_info *rt,
-			 struct in6_addr *dst, struct in6_addr *src,
+static int rt6_fill_node(struct net *net, struct sk_buff *skb,
+			 struct rt6_info *rt, struct dst_entry *dst,
+			 struct in6_addr *dest, struct in6_addr *src,
 			 int iif, int type, u32 portid, u32 seq,
 			 unsigned int flags);
 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
@@ -183,23 +182,6 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
 	}
 }
 
-static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
-{
-	return dst_metrics_write_ptr(&rt->from->dst);
-}
-
-static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
-{
-	struct rt6_info *rt = (struct rt6_info *)dst;
-
-	if (rt->rt6i_flags & RTF_PCPU)
-		return rt6_pcpu_cow_metrics(rt);
-	else if (rt->rt6i_flags & RTF_CACHE)
-		return NULL;
-	else
-		return dst_cow_metrics_generic(dst, old);
-}
-
 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
 					     struct sk_buff *skb,
 					     const void *daddr)
@@ -249,7 +231,7 @@ static struct dst_ops ip6_dst_ops_template = {
 	.check			=	ip6_dst_check,
 	.default_advmss		=	ip6_default_advmss,
 	.mtu			=	ip6_mtu,
-	.cow_metrics		=	ipv6_cow_metrics,
+	.cow_metrics		=	dst_cow_metrics_generic,
 	.destroy		=	ip6_dst_destroy,
 	.ifdown			=	ip6_dst_ifdown,
 	.negative_advice	=	ip6_negative_advice,
@@ -353,6 +335,7 @@ static void rt6_info_init(struct rt6_info *rt)
 	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
 	INIT_LIST_HEAD(&rt->rt6i_siblings);
 	INIT_LIST_HEAD(&rt->rt6i_uncached);
+	rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
 }
 
 /* allocate dst with ip6_dst_ops */
@@ -395,6 +378,7 @@ static void ip6_dst_destroy(struct dst_entry *dst)
 	struct rt6_exception_bucket *bucket;
 	struct rt6_info *from = rt->from;
 	struct inet6_dev *idev;
+	struct dst_metrics *m;
 
 	dst_destroy_metrics_generic(dst);
 	free_percpu(rt->rt6i_pcpu);
@@ -411,6 +395,10 @@ static void ip6_dst_destroy(struct dst_entry *dst)
 		kfree(bucket);
 	}
 
+	m = rt->fib6_metrics;
+	if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt))
+		kfree(m);
+
 	rt->from = NULL;
 	dst_release(&from->dst);
 }
@@ -996,7 +984,11 @@ static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
 	rt->rt6i_flags &= ~RTF_EXPIRES;
 	dst_hold(&from->dst);
 	rt->from = from;
-	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
+	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
+	if (from->fib6_metrics != &dst_default_metrics) {
+		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
+		refcount_inc(&from->fib6_metrics->refcnt);
+	}
 }
 
 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
@@ -1140,7 +1132,6 @@ EXPORT_SYMBOL(rt6_lookup);
  */
 
 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
-			struct mx6_config *mxc,
 			struct netlink_ext_ack *extack)
 {
 	int err;
@@ -1148,7 +1139,7 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
 
 	table = rt->rt6i_table;
 	spin_lock_bh(&table->tb6_lock);
-	err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
+	err = fib6_add(&table->tb6_root, rt, info, extack);
 	spin_unlock_bh(&table->tb6_lock);
 
 	return err;
@@ -1157,11 +1148,10 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
 int ip6_ins_rt(struct net *net, struct rt6_info *rt)
 {
 	struct nl_info info = {	.nl_net = net, };
-	struct mx6_config mxc = { .mx = NULL, };
 
 	/* Hold dst to account for the reference from the fib6 tree */
 	dst_hold(&rt->dst);
-	return __ip6_ins_rt(rt, &info, &mxc, NULL);
+	return __ip6_ins_rt(rt, &info, NULL);
 }
 
 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
@@ -1232,8 +1222,8 @@ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
 	p = this_cpu_ptr(rt->rt6i_pcpu);
 	pcpu_rt = *p;
 
-	if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
-		rt6_dst_from_metrics_check(pcpu_rt);
+	if (pcpu_rt)
+		ip6_hold_safe(NULL, &pcpu_rt, false);
 
 	return pcpu_rt;
 }
@@ -1254,7 +1244,6 @@ static struct rt6_info *rt6_make_pcpu_route(struct net *net,
 	prev = cmpxchg(p, NULL, pcpu_rt);
 	BUG_ON(prev);
 
-	rt6_dst_from_metrics_check(pcpu_rt);
 	return pcpu_rt;
 }
 
@@ -1384,6 +1373,16 @@ __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
 	return NULL;
 }
 
+static unsigned int fib6_mtu(const struct rt6_info *rt)
+{
+	unsigned int mtu;
+
+	mtu = rt->fib6_pmtu ? : rt->rt6i_idev->cnf.mtu6;
+	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
+
+	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
+}
+
 static int rt6_insert_exception(struct rt6_info *nrt,
 				struct rt6_info *ort)
 {
@@ -1436,7 +1435,7 @@ static int rt6_insert_exception(struct rt6_info *nrt,
 	 * Only insert this exception route if its mtu
 	 * is less than ort's mtu value.
 	 */
-	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
+	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
 		err = -EINVAL;
 		goto out;
 	}
@@ -1673,12 +1672,12 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
 			struct rt6_info *entry = rt6_ex->rt6i;
 
 			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
-			 * route), the metrics of its rt->dst.from have already
+			 * route), the metrics of its rt->from have already
 			 * been updated.
 			 */
-			if (entry->rt6i_pmtu &&
+			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
 			    rt6_mtu_change_route_allowed(idev, entry, mtu))
-				entry->rt6i_pmtu = mtu;
+				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
 		}
 		bucket++;
 	}
@@ -1844,10 +1843,9 @@ redo_rt6_select:
 		trace_fib6_table_lookup(net, rt, table, fl6);
 		return rt;
 	} else if (rt->rt6i_flags & RTF_CACHE) {
-		if (ip6_hold_safe(net, &rt, true)) {
+		if (ip6_hold_safe(net, &rt, true))
 			dst_use_noref(&rt->dst, jiffies);
-			rt6_dst_from_metrics_check(rt);
-		}
+
 		rcu_read_unlock();
 		trace_fib6_table_lookup(net, rt, table, fl6);
 		return rt;
@@ -2147,13 +2145,6 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
  *	Destination cache support functions
  */
 
-static void rt6_dst_from_metrics_check(struct rt6_info *rt)
-{
-	if (rt->from &&
-	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
-		dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
-}
-
 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
 {
 	u32 rt_cookie = 0;
@@ -2188,8 +2179,6 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
 	 * into this function always.
 	 */
 
-	rt6_dst_from_metrics_check(rt);
-
 	if (rt->rt6i_flags & RTF_PCPU ||
 	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
 		return rt6_dst_from_check(rt, cookie);
@@ -2242,8 +2231,8 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
 {
 	struct net *net = dev_net(rt->dst.dev);
 
+	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
 	rt->rt6i_flags |= RTF_MODIFIED;
-	rt->rt6i_pmtu = mtu;
 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
 }
 
@@ -2289,10 +2278,10 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
 	} else if (daddr) {
 		struct rt6_info *nrt6;
 
-		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
+		nrt6 = ip6_rt_cache_alloc(rt6->from, daddr, saddr);
 		if (nrt6) {
 			rt6_do_update_pmtu(nrt6, mtu);
-			if (rt6_insert_exception(nrt6, rt6))
+			if (rt6_insert_exception(nrt6, rt6->from))
 				dst_release_immediate(&nrt6->dst);
 		}
 	}
@@ -2533,12 +2522,8 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst)
 
 static unsigned int ip6_mtu(const struct dst_entry *dst)
 {
-	const struct rt6_info *rt = (const struct rt6_info *)dst;
-	unsigned int mtu = rt->rt6i_pmtu;
 	struct inet6_dev *idev;
-
-	if (mtu)
-		goto out;
+	unsigned int mtu;
 
 	mtu = dst_metric_raw(dst, RTAX_MTU);
 	if (mtu)
@@ -2622,60 +2607,24 @@ out:
 	return entries > rt_max_size;
 }
 
-static int ip6_convert_metrics(struct mx6_config *mxc,
-			       const struct fib6_config *cfg)
+static int ip6_convert_metrics(struct net *net, struct rt6_info *rt,
+			       struct fib6_config *cfg)
 {
-	struct net *net = cfg->fc_nlinfo.nl_net;
-	bool ecn_ca = false;
-	struct nlattr *nla;
-	int remaining;
-	u32 *mp;
-
-	if (!cfg->fc_mx)
-		return 0;
-
-	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
-	if (unlikely(!mp))
-		return -ENOMEM;
-
-	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
-		int type = nla_type(nla);
-		u32 val;
-
-		if (!type)
-			continue;
-		if (unlikely(type > RTAX_MAX))
-			goto err;
-
-		if (type == RTAX_CC_ALGO) {
-			char tmp[TCP_CA_NAME_MAX];
+	int err = 0;
 
-			nla_strlcpy(tmp, nla, sizeof(tmp));
-			val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
-			if (val == TCP_CA_UNSPEC)
-				goto err;
-		} else {
-			val = nla_get_u32(nla);
-		}
-		if (type == RTAX_HOPLIMIT && val > 255)
-			val = 255;
-		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
-			goto err;
+	if (cfg->fc_mx) {
+		rt->fib6_metrics = kzalloc(sizeof(*rt->fib6_metrics),
+					   GFP_KERNEL);
+		if (unlikely(!rt->fib6_metrics))
+			return -ENOMEM;
 
-		mp[type - 1] = val;
-		__set_bit(type - 1, mxc->mx_valid);
-	}
+		refcount_set(&rt->fib6_metrics->refcnt, 1);
 
-	if (ecn_ca) {
-		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
-		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
+		err = ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len,
+					 rt->fib6_metrics->metrics);
 	}
 
-	mxc->mx = mp;
-	return 0;
- err:
-	kfree(mp);
-	return -EINVAL;
+	return err;
 }
 
 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
@@ -2955,6 +2904,10 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 		goto out;
 	}
 
+	err = ip6_convert_metrics(net, rt, cfg);
+	if (err < 0)
+		goto out;
+
 	if (cfg->fc_flags & RTF_EXPIRES)
 		rt6_set_expires(rt, jiffies +
 				clock_t_to_jiffies(cfg->fc_expires));
@@ -3078,32 +3031,16 @@ out:
 	return ERR_PTR(err);
 }
 
-int ip6_route_add(struct fib6_config *cfg,
-		  struct netlink_ext_ack *extack)
+int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack)
 {
-	struct mx6_config mxc = { .mx = NULL, };
 	struct rt6_info *rt;
 	int err;
 
 	rt = ip6_route_info_create(cfg, extack);
-	if (IS_ERR(rt)) {
-		err = PTR_ERR(rt);
-		rt = NULL;
-		goto out;
-	}
-
-	err = ip6_convert_metrics(&mxc, cfg);
-	if (err)
-		goto out;
-
-	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
-
-	kfree(mxc.mx);
+	if (IS_ERR(rt))
+		return PTR_ERR(rt);
 
-	return err;
-out:
-	if (rt)
-		dst_release_immediate(&rt->dst);
+	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
 
 	return err;
 }
@@ -3157,7 +3094,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
 		if (skb) {
 			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
 
-			if (rt6_fill_node(net, skb, rt,
+			if (rt6_fill_node(net, skb, rt, NULL,
 					  NULL, NULL, 0, RTM_DELROUTE,
 					  info->portid, seq, 0) < 0) {
 				kfree_skb(skb);
@@ -3348,7 +3285,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 	 * a cached route because rt6_insert_exception() will
 	 * takes care of it
 	 */
-	if (rt6_insert_exception(nrt, rt)) {
+	if (rt6_insert_exception(nrt, rt->from)) {
 		dst_release_immediate(&nrt->dst);
 		goto out;
 	}
@@ -4018,11 +3955,14 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
 	   update PMTU increase is a MUST. (i.e. jumbo frame)
 	 */
 	if (rt->fib6_nh.nh_dev == arg->dev &&
-	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
+	    !fib6_metric_locked(rt, RTAX_MTU)) {
+		u32 mtu = rt->fib6_pmtu;
+
+		if (mtu >= arg->mtu ||
+		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
+			fib6_metric_set(rt, RTAX_MTU, arg->mtu);
+
 		spin_lock_bh(&rt6_exception_lock);
-		if (dst_metric_raw(&rt->dst, RTAX_MTU) &&
-		    rt6_mtu_change_route_allowed(idev, rt, arg->mtu))
-			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
 		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
 		spin_unlock_bh(&rt6_exception_lock);
 	}
@@ -4183,7 +4123,6 @@ errout:
 struct rt6_nh {
 	struct rt6_info *rt6_info;
 	struct fib6_config r_cfg;
-	struct mx6_config mxc;
 	struct list_head next;
 };
 
@@ -4198,7 +4137,8 @@ static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
 	}
 }
 
-static int ip6_route_info_append(struct list_head *rt6_nh_list,
+static int ip6_route_info_append(struct net *net,
+				 struct list_head *rt6_nh_list,
 				 struct rt6_info *rt, struct fib6_config *r_cfg)
 {
 	struct rt6_nh *nh;
@@ -4214,7 +4154,7 @@ static int ip6_route_info_append(struct list_head *rt6_nh_list,
 	if (!nh)
 		return -ENOMEM;
 	nh->rt6_info = rt;
-	err = ip6_convert_metrics(&nh->mxc, r_cfg);
+	err = ip6_convert_metrics(net, rt, r_cfg);
 	if (err) {
 		kfree(nh);
 		return err;
@@ -4305,7 +4245,8 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
 
 		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
 
-		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
+		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
+					    rt, &r_cfg);
 		if (err) {
 			dst_release_immediate(&rt->dst);
 			goto cleanup;
@@ -4323,7 +4264,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
 	err_nh = NULL;
 	list_for_each_entry(nh, &rt6_nh_list, next) {
 		rt_last = nh->rt6_info;
-		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
+		err = __ip6_ins_rt(nh->rt6_info, info, extack);
 		/* save reference to first route for notification */
 		if (!rt_notif && !err)
 			rt_notif = nh->rt6_info;
@@ -4372,7 +4313,6 @@ cleanup:
 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
 		if (nh->rt6_info)
 			dst_release_immediate(&nh->rt6_info->dst);
-		kfree(nh->mxc.mx);
 		list_del(&nh->next);
 		kfree(nh);
 	}
@@ -4546,16 +4486,16 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
-static int rt6_fill_node(struct net *net,
-			 struct sk_buff *skb, struct rt6_info *rt,
-			 struct in6_addr *dst, struct in6_addr *src,
+static int rt6_fill_node(struct net *net, struct sk_buff *skb,
+			 struct rt6_info *rt, struct dst_entry *dst,
+			 struct in6_addr *dest, struct in6_addr *src,
 			 int iif, int type, u32 portid, u32 seq,
 			 unsigned int flags)
 {
-	u32 metrics[RTAX_MAX];
 	struct rtmsg *rtm;
 	struct nlmsghdr *nlh;
-	long expires;
+	long expires = 0;
+	u32 *pmetrics;
 	u32 table;
 
 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
@@ -4583,8 +4523,8 @@ static int rt6_fill_node(struct net *net,
 	if (rt->rt6i_flags & RTF_CACHE)
 		rtm->rtm_flags |= RTM_F_CLONED;
 
-	if (dst) {
-		if (nla_put_in6_addr(skb, RTA_DST, dst))
+	if (dest) {
+		if (nla_put_in6_addr(skb, RTA_DST, dest))
 			goto nla_put_failure;
 		rtm->rtm_dst_len = 128;
 	} else if (rtm->rtm_dst_len)
@@ -4612,9 +4552,9 @@ static int rt6_fill_node(struct net *net,
 #endif
 			if (nla_put_u32(skb, RTA_IIF, iif))
 				goto nla_put_failure;
-	} else if (dst) {
+	} else if (dest) {
 		struct in6_addr saddr_buf;
-		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
+		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
 		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
 			goto nla_put_failure;
 	}
@@ -4626,10 +4566,8 @@ static int rt6_fill_node(struct net *net,
 			goto nla_put_failure;
 	}
 
-	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
-	if (rt->rt6i_pmtu)
-		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
-	if (rtnetlink_put_metrics(skb, metrics) < 0)
+	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
+	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
 		goto nla_put_failure;
 
 	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
@@ -4661,9 +4599,10 @@ static int rt6_fill_node(struct net *net,
 			goto nla_put_failure;
 	}
 
-	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
+	if (rt->rt6i_flags & RTF_EXPIRES && dst)
+		expires = dst->expires - jiffies;
 
-	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
+	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
 		goto nla_put_failure;
 
 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
@@ -4697,10 +4636,9 @@ int rt6_dump_route(struct rt6_info *rt, void *p_arg)
 		}
 	}
 
-	return rt6_fill_node(net,
-		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
-		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
-		     NLM_F_MULTI);
+	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
+			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
+			     arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
 }
 
 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
@@ -4814,13 +4752,14 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 
 	skb_dst_set(skb, &rt->dst);
 	if (fibmatch)
-		err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
+		err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, iif,
 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
 				    nlh->nlmsg_seq, 0);
 	else
-		err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
-				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
-				    nlh->nlmsg_seq, 0);
+		err = rt6_fill_node(net, skb, rt, dst, &fl6.daddr, &fl6.saddr,
+				    iif, RTM_NEWROUTE,
+				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
+				    0);
 	if (err < 0) {
 		kfree_skb(skb);
 		goto errout;
@@ -4846,8 +4785,8 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
 	if (!skb)
 		goto errout;
 
-	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
-				event, info->portid, seq, nlm_flags);
+	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
+			    event, info->portid, seq, nlm_flags);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
 		WARN_ON(err == -EMSGSIZE);
-- 
cgit v1.2.3-59-g8ed1b


From 14895687d36805f051bb54014c32e48e5937f7e1 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 17 Apr 2018 17:33:17 -0700
Subject: net/ipv6: move expires into rt6_info

Add expires to rt6_info for FIB entries, and add fib6 helpers to
manage it. Data path use of dst.expires remains.

The transition is fairly straightforward: when working with fib entries,
rt->dst.expires is just rt->expires, rt6_clean_expires is replaced with
fib6_clean_expires, rt6_set_expires becomes fib6_set_expires, and
rt6_check_expired becomes fib6_check_expired, where the fib6 versions
are added by this patch.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 27 +++++++++++++++++++++++----
 net/ipv6/addrconf.c   |  6 +++---
 net/ipv6/ip6_fib.c    |  8 ++++----
 net/ipv6/ndisc.c      |  2 +-
 net/ipv6/route.c      | 20 +++++++++++---------
 5 files changed, 42 insertions(+), 21 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 1f8dc9d12abb..c73b985734f5 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -179,6 +179,7 @@ struct rt6_info {
 					should_flush:1,
 					unused:6;
 
+	unsigned long			expires;
 	struct dst_metrics		*fib6_metrics;
 #define fib6_pmtu		fib6_metrics->metrics[RTAX_MTU-1]
 	struct fib6_nh			fib6_nh;
@@ -197,6 +198,26 @@ static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst)
 	return ((struct rt6_info *)dst)->rt6i_idev;
 }
 
+static inline void fib6_clean_expires(struct rt6_info *f6i)
+{
+	f6i->rt6i_flags &= ~RTF_EXPIRES;
+	f6i->expires = 0;
+}
+
+static inline void fib6_set_expires(struct rt6_info *f6i,
+				    unsigned long expires)
+{
+	f6i->expires = expires;
+	f6i->rt6i_flags |= RTF_EXPIRES;
+}
+
+static inline bool fib6_check_expired(const struct rt6_info *f6i)
+{
+	if (f6i->rt6i_flags & RTF_EXPIRES)
+		return time_after(jiffies, f6i->expires);
+	return false;
+}
+
 static inline void rt6_clean_expires(struct rt6_info *rt)
 {
 	rt->rt6i_flags &= ~RTF_EXPIRES;
@@ -211,11 +232,9 @@ static inline void rt6_set_expires(struct rt6_info *rt, unsigned long expires)
 
 static inline void rt6_update_expires(struct rt6_info *rt0, int timeout)
 {
-	struct rt6_info *rt;
+	if (!(rt0->rt6i_flags & RTF_EXPIRES) && rt0->from)
+		rt0->dst.expires = rt0->from->expires;
 
-	for (rt = rt0; rt && !(rt->rt6i_flags & RTF_EXPIRES); rt = rt->from);
-	if (rt && rt != rt0)
-		rt0->dst.expires = rt->dst.expires;
 	dst_set_expires(&rt0->dst, timeout);
 	rt0->rt6i_flags |= RTF_EXPIRES;
 }
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 483c8772e856..a156f1a0b1a7 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1190,7 +1190,7 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_r
 			ip6_del_rt(dev_net(ifp->idev->dev), rt);
 		else {
 			if (!(rt->rt6i_flags & RTF_EXPIRES))
-				rt6_set_expires(rt, expires);
+				fib6_set_expires(rt, expires);
 			ip6_rt_put(rt);
 		}
 	}
@@ -2672,9 +2672,9 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
 				rt = NULL;
 			} else if (addrconf_finite_timeout(rt_expires)) {
 				/* not infinity */
-				rt6_set_expires(rt, jiffies + rt_expires);
+				fib6_set_expires(rt, jiffies + rt_expires);
 			} else {
-				rt6_clean_expires(rt);
+				fib6_clean_expires(rt);
 			}
 		} else if (valid_lft) {
 			clock_t expires = 0;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 0d94c56c3e41..f25f4d9831e8 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -906,9 +906,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 				if (!(iter->rt6i_flags & RTF_EXPIRES))
 					return -EEXIST;
 				if (!(rt->rt6i_flags & RTF_EXPIRES))
-					rt6_clean_expires(iter);
+					fib6_clean_expires(iter);
 				else
-					rt6_set_expires(iter, rt->dst.expires);
+					fib6_set_expires(iter, rt->expires);
 				fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu);
 				return -EEXIST;
 			}
@@ -2003,8 +2003,8 @@ static int fib6_age(struct rt6_info *rt, void *arg)
 	 *	Routes are expired even if they are in use.
 	 */
 
-	if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) {
-		if (time_after(now, rt->dst.expires)) {
+	if (rt->rt6i_flags & RTF_EXPIRES && rt->expires) {
+		if (time_after(now, rt->expires)) {
 			RT6_TRACE("expiring %p\n", rt);
 			return -1;
 		}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index b058ea9ecec0..e4d9eea92139 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1318,7 +1318,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 	}
 
 	if (rt)
-		rt6_set_expires(rt, jiffies + (HZ * lifetime));
+		fib6_set_expires(rt, jiffies + (HZ * lifetime));
 	if (in6_dev->cnf.accept_ra_min_hop_limit < 256 &&
 	    ra_msg->icmph.icmp6_hop_limit) {
 		if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 62aafd06c35f..f6ca55d21fac 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -435,7 +435,7 @@ static bool rt6_check_expired(const struct rt6_info *rt)
 			return true;
 	} else if (rt->from) {
 		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
-			rt6_check_expired(rt->from);
+			fib6_check_expired(rt->from);
 	}
 	return false;
 }
@@ -687,7 +687,7 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
 		goto out;
 
-	if (rt6_check_expired(rt))
+	if (fib6_check_expired(rt))
 		goto out;
 
 	m = rt6_score_route(rt, oif, strict);
@@ -871,9 +871,9 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 
 	if (rt) {
 		if (!addrconf_finite_timeout(lifetime))
-			rt6_clean_expires(rt);
+			fib6_clean_expires(rt);
 		else
-			rt6_set_expires(rt, jiffies + HZ * lifetime);
+			fib6_set_expires(rt, jiffies + HZ * lifetime);
 
 		ip6_rt_put(rt);
 	}
@@ -2383,7 +2383,7 @@ restart:
 	for_each_fib6_node_rt_rcu(fn) {
 		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
 			continue;
-		if (rt6_check_expired(rt))
+		if (fib6_check_expired(rt))
 			continue;
 		if (rt->rt6i_flags & RTF_REJECT)
 			break;
@@ -2909,10 +2909,10 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 		goto out;
 
 	if (cfg->fc_flags & RTF_EXPIRES)
-		rt6_set_expires(rt, jiffies +
+		fib6_set_expires(rt, jiffies +
 				clock_t_to_jiffies(cfg->fc_expires));
 	else
-		rt6_clean_expires(rt);
+		fib6_clean_expires(rt);
 
 	if (cfg->fc_protocol == RTPROT_UNSPEC)
 		cfg->fc_protocol = RTPROT_BOOT;
@@ -4599,8 +4599,10 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 			goto nla_put_failure;
 	}
 
-	if (rt->rt6i_flags & RTF_EXPIRES && dst)
-		expires = dst->expires - jiffies;
+	if (rt->rt6i_flags & RTF_EXPIRES) {
+		expires = dst ? dst->expires : rt->expires;
+		expires -= jiffies;
+	}
 
 	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
 		goto nla_put_failure;
-- 
cgit v1.2.3-59-g8ed1b


From 421842edeaf62c4e180b687f5a4efca8c19c49ad Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 17 Apr 2018 17:33:18 -0700
Subject: net/ipv6: Add fib6_null_entry

ip6_null_entry will stay a dst based return for lookups that fail to
match an entry.

Add a new fib6_null_entry which constitutes the root node and leafs
for fibs. Replace existing references to ip6_null_entry with the
new fib6_null_entry when dealing with FIBs.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv6.h |  3 ++-
 net/ipv6/ip6_fib.c       | 26 ++++++++++----------
 net/ipv6/route.c         | 62 +++++++++++++++++++++++++++++++++---------------
 3 files changed, 58 insertions(+), 33 deletions(-)

diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index c29f09cfc9d7..74e4e1e449d5 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -60,7 +60,8 @@ struct netns_ipv6 {
 #endif
 	struct xt_table		*ip6table_nat;
 #endif
-	struct rt6_info         *ip6_null_entry;
+	struct rt6_info         *fib6_null_entry;
+	struct rt6_info		*ip6_null_entry;
 	struct rt6_statistics   *rt6_stats;
 	struct timer_list       ip6_fib_timer;
 	struct hlist_head       *fib_table_hash;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index f25f4d9831e8..280b69497ad0 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -231,7 +231,7 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
 	if (table) {
 		table->tb6_id = id;
 		rcu_assign_pointer(table->tb6_root.leaf,
-				   net->ipv6.ip6_null_entry);
+				   net->ipv6.fib6_null_entry);
 		table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
 		inet_peer_base_init(&table->tb6_peers);
 	}
@@ -369,7 +369,7 @@ struct fib6_dump_arg {
 
 static void fib6_rt_dump(struct rt6_info *rt, struct fib6_dump_arg *arg)
 {
-	if (rt == arg->net->ipv6.ip6_null_entry)
+	if (rt == arg->net->ipv6.fib6_null_entry)
 		return;
 	call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt);
 }
@@ -658,7 +658,7 @@ static struct fib6_node *fib6_add_1(struct net *net,
 			/* remove null_entry in the root node */
 			} else if (fn->fn_flags & RTN_TL_ROOT &&
 				   rcu_access_pointer(fn->leaf) ==
-				   net->ipv6.ip6_null_entry) {
+				   net->ipv6.fib6_null_entry) {
 				RCU_INIT_POINTER(fn->leaf, NULL);
 			}
 
@@ -1171,9 +1171,9 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 			if (!sfn)
 				goto failure;
 
-			atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref);
+			atomic_inc(&info->nl_net->ipv6.fib6_null_entry->rt6i_ref);
 			rcu_assign_pointer(sfn->leaf,
-					   info->nl_net->ipv6.ip6_null_entry);
+					   info->nl_net->ipv6.fib6_null_entry);
 			sfn->fn_flags = RTN_ROOT;
 
 			/* Now add the first leaf node to new subtree */
@@ -1212,7 +1212,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 			if (fn->fn_flags & RTN_TL_ROOT) {
 				/* put back null_entry for root node */
 				rcu_assign_pointer(fn->leaf,
-					    info->nl_net->ipv6.ip6_null_entry);
+					    info->nl_net->ipv6.fib6_null_entry);
 			} else {
 				atomic_inc(&rt->rt6i_ref);
 				rcu_assign_pointer(fn->leaf, rt);
@@ -1251,7 +1251,7 @@ out:
 				if (!pn_leaf) {
 					WARN_ON(!pn_leaf);
 					pn_leaf =
-					    info->nl_net->ipv6.ip6_null_entry;
+					    info->nl_net->ipv6.fib6_null_entry;
 				}
 #endif
 				atomic_inc(&pn_leaf->rt6i_ref);
@@ -1494,7 +1494,7 @@ static struct rt6_info *fib6_find_prefix(struct net *net,
 	struct fib6_node *child_left, *child_right;
 
 	if (fn->fn_flags & RTN_ROOT)
-		return net->ipv6.ip6_null_entry;
+		return net->ipv6.fib6_null_entry;
 
 	while (fn) {
 		child_left = rcu_dereference_protected(fn->left,
@@ -1531,7 +1531,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
 
 	/* Set fn->leaf to null_entry for root node. */
 	if (fn->fn_flags & RTN_TL_ROOT) {
-		rcu_assign_pointer(fn->leaf, net->ipv6.ip6_null_entry);
+		rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry);
 		return fn;
 	}
 
@@ -1576,7 +1576,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
 #if RT6_DEBUG >= 2
 			if (!new_fn_leaf) {
 				WARN_ON(!new_fn_leaf);
-				new_fn_leaf = net->ipv6.ip6_null_entry;
+				new_fn_leaf = net->ipv6.fib6_null_entry;
 			}
 #endif
 			atomic_inc(&new_fn_leaf->rt6i_ref);
@@ -1726,7 +1726,7 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info)
 		return -ENOENT;
 	}
 #endif
-	if (!fn || rt == net->ipv6.ip6_null_entry)
+	if (!fn || rt == net->ipv6.fib6_null_entry)
 		return -ENOENT;
 
 	WARN_ON(!(fn->fn_flags & RTN_RTINFO));
@@ -2087,7 +2087,7 @@ static int __net_init fib6_net_init(struct net *net)
 
 	net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
 	rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf,
-			   net->ipv6.ip6_null_entry);
+			   net->ipv6.fib6_null_entry);
 	net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
 		RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
 	inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers);
@@ -2099,7 +2099,7 @@ static int __net_init fib6_net_init(struct net *net)
 		goto out_fib6_main_tbl;
 	net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
 	rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf,
-			   net->ipv6.ip6_null_entry);
+			   net->ipv6.fib6_null_entry);
 	net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
 		RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
 	inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f6ca55d21fac..7c141394d4f1 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -276,6 +276,15 @@ static const u32 ip6_template_metrics[RTAX_MAX] = {
 	[RTAX_HOPLIMIT - 1] = 0,
 };
 
+static const struct rt6_info fib6_null_entry_template = {
+	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
+	.rt6i_protocol  = RTPROT_KERNEL,
+	.rt6i_metric	= ~(u32)0,
+	.rt6i_ref	= ATOMIC_INIT(1),
+	.fib6_type	= RTN_UNREACHABLE,
+	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
+};
+
 static const struct rt6_info ip6_null_entry_template = {
 	.dst = {
 		.__refcnt	= ATOMIC_INIT(1),
@@ -522,10 +531,10 @@ static inline struct rt6_info *rt6_device_match(struct net *net,
 			return local;
 
 		if (flags & RT6_LOOKUP_F_IFACE)
-			return net->ipv6.ip6_null_entry;
+			return net->ipv6.fib6_null_entry;
 	}
 
-	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
+	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
 }
 
 #ifdef CONFIG_IPV6_ROUTER_PREF
@@ -758,8 +767,8 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
 	bool do_rr = false;
 	int key_plen;
 
-	if (!leaf || leaf == net->ipv6.ip6_null_entry)
-		return net->ipv6.ip6_null_entry;
+	if (!leaf || leaf == net->ipv6.fib6_null_entry)
+		return net->ipv6.fib6_null_entry;
 
 	rt0 = rcu_dereference(fn->rr_ptr);
 	if (!rt0)
@@ -776,7 +785,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
 		key_plen = rt0->rt6i_src.plen;
 #endif
 	if (fn->fn_bit != key_plen)
-		return net->ipv6.ip6_null_entry;
+		return net->ipv6.fib6_null_entry;
 
 	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
 			     &do_rr);
@@ -797,7 +806,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
 		}
 	}
 
-	return match ? match : net->ipv6.ip6_null_entry;
+	return match ? match : net->ipv6.fib6_null_entry;
 }
 
 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
@@ -1063,7 +1072,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 restart:
 	rt = rcu_dereference(fn->leaf);
 	if (!rt) {
-		rt = net->ipv6.ip6_null_entry;
+		rt = net->ipv6.fib6_null_entry;
 	} else {
 		rt = rt6_device_match(net, rt, &fl6->saddr,
 				      fl6->flowi6_oif, flags);
@@ -1071,7 +1080,7 @@ restart:
 			rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif,
 						  skb, flags);
 	}
-	if (rt == net->ipv6.ip6_null_entry) {
+	if (rt == net->ipv6.fib6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
 		if (fn)
 			goto restart;
@@ -1820,7 +1829,7 @@ redo_rt6_select:
 	rt = rt6_select(net, fn, oif, strict);
 	if (rt->rt6i_nsiblings)
 		rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict);
-	if (rt == net->ipv6.ip6_null_entry) {
+	if (rt == net->ipv6.fib6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
 		if (fn)
 			goto redo_rt6_select;
@@ -1837,7 +1846,8 @@ redo_rt6_select:
 	if (rt_cache)
 		rt = rt_cache;
 
-	if (rt == net->ipv6.ip6_null_entry) {
+	if (rt == net->ipv6.fib6_null_entry) {
+		rt = net->ipv6.ip6_null_entry;
 		rcu_read_unlock();
 		dst_hold(&rt->dst);
 		trace_fib6_table_lookup(net, rt, table, fl6);
@@ -2412,13 +2422,13 @@ restart:
 	}
 
 	if (!rt)
-		rt = net->ipv6.ip6_null_entry;
+		rt = net->ipv6.fib6_null_entry;
 	else if (rt->rt6i_flags & RTF_REJECT) {
 		rt = net->ipv6.ip6_null_entry;
 		goto out;
 	}
 
-	if (rt == net->ipv6.ip6_null_entry) {
+	if (rt == net->ipv6.fib6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
 		if (fn)
 			goto restart;
@@ -3051,7 +3061,7 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
 	struct fib6_table *table;
 	int err;
 
-	if (rt == net->ipv6.ip6_null_entry) {
+	if (rt == net->ipv6.fib6_null_entry) {
 		err = -ENOENT;
 		goto out;
 	}
@@ -3081,7 +3091,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
 	struct fib6_table *table;
 	int err = -ENOENT;
 
-	if (rt == net->ipv6.ip6_null_entry)
+	if (rt == net->ipv6.fib6_null_entry)
 		goto out_put;
 	table = rt->rt6i_table;
 	spin_lock_bh(&table->tb6_lock);
@@ -3634,7 +3644,7 @@ static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
 
 	if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
-	    rt != net->ipv6.ip6_null_entry &&
+	    rt != net->ipv6.fib6_null_entry &&
 	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
 		spin_lock_bh(&rt6_exception_lock);
 		/* remove prefsrc entry */
@@ -3789,7 +3799,7 @@ static int fib6_ifup(struct rt6_info *rt, void *p_arg)
 	const struct arg_netdev_event *arg = p_arg;
 	struct net *net = dev_net(arg->dev);
 
-	if (rt != net->ipv6.ip6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
+	if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
 		rt->fib6_nh.nh_flags &= ~arg->nh_flags;
 		fib6_update_sernum_upto_root(net, rt);
 		rt6_multipath_rebalance(rt);
@@ -3873,7 +3883,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
 	const struct net_device *dev = arg->dev;
 	struct net *net = dev_net(dev);
 
-	if (rt == net->ipv6.ip6_null_entry)
+	if (rt == net->ipv6.fib6_null_entry)
 		return 0;
 
 	switch (arg->event) {
@@ -4624,7 +4634,7 @@ int rt6_dump_route(struct rt6_info *rt, void *p_arg)
 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
 	struct net *net = arg->net;
 
-	if (rt == net->ipv6.ip6_null_entry)
+	if (rt == net->ipv6.fib6_null_entry)
 		return 0;
 
 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
@@ -4813,6 +4823,8 @@ static int ip6_route_dev_notify(struct notifier_block *this,
 		return NOTIFY_OK;
 
 	if (event == NETDEV_REGISTER) {
+		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
+		net->ipv6.fib6_null_entry->rt6i_idev = in6_dev_get(dev);
 		net->ipv6.ip6_null_entry->dst.dev = dev;
 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
@@ -4826,6 +4838,7 @@ static int ip6_route_dev_notify(struct notifier_block *this,
 		/* NETDEV_UNREGISTER could be fired for multiple times by
 		 * netdev_wait_allrefs(). Make sure we only call this once.
 		 */
+		in6_dev_put_clear(&net->ipv6.fib6_null_entry->rt6i_idev);
 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
@@ -5009,11 +5022,17 @@ static int __net_init ip6_route_net_init(struct net *net)
 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
 		goto out_ip6_dst_ops;
 
+	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
+					    sizeof(*net->ipv6.fib6_null_entry),
+					    GFP_KERNEL);
+	if (!net->ipv6.fib6_null_entry)
+		goto out_ip6_dst_entries;
+
 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
 					   sizeof(*net->ipv6.ip6_null_entry),
 					   GFP_KERNEL);
 	if (!net->ipv6.ip6_null_entry)
-		goto out_ip6_dst_entries;
+		goto out_fib6_null_entry;
 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
 			 ip6_template_metrics, true);
@@ -5060,6 +5079,8 @@ out_ip6_prohibit_entry:
 out_ip6_null_entry:
 	kfree(net->ipv6.ip6_null_entry);
 #endif
+out_fib6_null_entry:
+	kfree(net->ipv6.fib6_null_entry);
 out_ip6_dst_entries:
 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
 out_ip6_dst_ops:
@@ -5068,6 +5089,7 @@ out_ip6_dst_ops:
 
 static void __net_exit ip6_route_net_exit(struct net *net)
 {
+	kfree(net->ipv6.fib6_null_entry);
 	kfree(net->ipv6.ip6_null_entry);
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
 	kfree(net->ipv6.ip6_prohibit_entry);
@@ -5138,6 +5160,8 @@ void __init ip6_route_init_special_entries(void)
 	/* Registering of the loopback is done before this portion of code,
 	 * the loopback reference in rt6_info will not be taken, do it
 	 * manually for init_net */
+	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
+	init_net.ipv6.fib6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
-- 
cgit v1.2.3-59-g8ed1b


From dec9b0e295f6b34b89041cf19001ba86298d0687 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 17 Apr 2018 17:33:19 -0700
Subject: net/ipv6: Add rt6_info create function for ip6_pol_route_lookup

ip6_pol_route_lookup is the lookup function for ip6_route_lookup and
rt6_lookup. At the moment it returns either a reference to a FIB entry
or a cached exception. To move FIB entries to a separate struct, this
lookup function needs to convert FIB entries to an rt6_info that is
returned to the caller.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 7c141394d4f1..e293692174ba 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1055,6 +1055,19 @@ static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
 	return false;
 }
 
+/* called with rcu_lock held */
+static struct rt6_info *ip6_create_rt_rcu(struct rt6_info *rt)
+{
+	struct net_device *dev = rt->fib6_nh.nh_dev;
+	struct rt6_info *nrt;
+
+	nrt = __ip6_dst_alloc(dev_net(dev), dev, 0);
+	if (nrt)
+		ip6_rt_copy_init(nrt, rt);
+
+	return nrt;
+}
+
 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 					     struct fib6_table *table,
 					     struct flowi6 *fl6,
@@ -1087,18 +1100,26 @@ restart:
 	}
 	/* Search through exception table */
 	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
-	if (rt_cache)
+	if (rt_cache) {
 		rt = rt_cache;
+		if (ip6_hold_safe(net, &rt, true))
+			dst_use_noref(&rt->dst, jiffies);
+	} else if (dst_hold_safe(&rt->dst)) {
+		struct rt6_info *nrt;
 
-	if (ip6_hold_safe(net, &rt, true))
-		dst_use_noref(&rt->dst, jiffies);
+		nrt = ip6_create_rt_rcu(rt);
+		dst_release(&rt->dst);
+		rt = nrt;
+	} else {
+		rt = net->ipv6.ip6_null_entry;
+		dst_hold(&rt->dst);
+	}
 
 	rcu_read_unlock();
 
 	trace_fib6_table_lookup(net, rt, table, fl6);
 
 	return rt;
-
 }
 
 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
-- 
cgit v1.2.3-59-g8ed1b


From 3b6761d18bc11f2af2a6fc494e9026d39593f22c Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 17 Apr 2018 17:33:20 -0700
Subject: net/ipv6: Move dst flags to booleans in fib entries

Continuing to wean FIB paths off of dst_entry, use a bool to hold
requests for certain dst settings. Add a helper to convert the
flags to DST flags when a FIB entry is converted to a dst_entry.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  5 ++++-
 net/ipv6/addrconf.c   |  4 ++--
 net/ipv6/route.c      | 29 ++++++++++++++++++++++++-----
 3 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index c73b985734f5..159f651dee55 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -177,7 +177,10 @@ struct rt6_info {
 	u8				fib6_type;
 	u8				exception_bucket_flushed:1,
 					should_flush:1,
-					unused:6;
+					dst_nocount:1,
+					dst_nopolicy:1,
+					dst_host:1,
+					unused:3;
 
 	unsigned long			expires;
 	struct dst_metrics		*fib6_metrics;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a156f1a0b1a7..c8df5c6499db 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1046,7 +1046,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
 
 	if (net->ipv6.devconf_all->disable_policy ||
 	    idev->cnf.disable_policy)
-		rt->dst.flags |= DST_NOPOLICY;
+		rt->dst_nopolicy = true;
 
 	neigh_parms_data_state_setall(idev->nd_parms);
 
@@ -5981,7 +5981,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
 			int cpu;
 
 			rcu_read_lock();
-			addrconf_set_nopolicy(ifa->rt, val);
+			ifa->rt->dst_nopolicy = val ? true : false;
 			if (rt->rt6i_pcpu) {
 				for_each_possible_cpu(cpu) {
 					struct rt6_info **rtp;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index e293692174ba..1a3e0db31b34 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -937,6 +937,20 @@ static int ip6_rt_type_to_error(u8 fib6_type)
 	return fib6_prop[fib6_type];
 }
 
+static unsigned short fib6_info_dst_flags(struct rt6_info *rt)
+{
+	unsigned short flags = 0;
+
+	if (rt->dst_nocount)
+		flags |= DST_NOCOUNT;
+	if (rt->dst_nopolicy)
+		flags |= DST_NOPOLICY;
+	if (rt->dst_host)
+		flags |= DST_HOST;
+
+	return flags;
+}
+
 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct rt6_info *ort)
 {
 	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
@@ -961,6 +975,8 @@ static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct rt6_info *ort)
 
 static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort)
 {
+	rt->dst.flags |= fib6_info_dst_flags(ort);
+
 	if (ort->rt6i_flags & RTF_REJECT) {
 		ip6_rt_init_dst_reject(rt, ort);
 		return;
@@ -970,7 +986,6 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort)
 	rt->dst.output = ip6_output;
 
 	if (ort->fib6_type == RTN_LOCAL) {
-		rt->dst.flags |= DST_HOST;
 		rt->dst.input = ip6_input;
 	} else if (ipv6_addr_type(&ort->rt6i_dst.addr) & IPV6_ADDR_MULTICAST) {
 		rt->dst.input = ip6_mc_input;
@@ -1058,10 +1073,11 @@ static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
 /* called with rcu_lock held */
 static struct rt6_info *ip6_create_rt_rcu(struct rt6_info *rt)
 {
+	unsigned short flags = fib6_info_dst_flags(rt);
 	struct net_device *dev = rt->fib6_nh.nh_dev;
 	struct rt6_info *nrt;
 
-	nrt = __ip6_dst_alloc(dev_net(dev), dev, 0);
+	nrt = __ip6_dst_alloc(dev_net(dev), dev, flags);
 	if (nrt)
 		ip6_rt_copy_init(nrt, rt);
 
@@ -1229,12 +1245,13 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
 
 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
 {
+	unsigned short flags = fib6_info_dst_flags(rt);
 	struct net_device *dev;
 	struct rt6_info *pcpu_rt;
 
 	rcu_read_lock();
 	dev = ip6_rt_get_dev_rcu(rt);
-	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
+	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, flags);
 	rcu_read_unlock();
 	if (!pcpu_rt)
 		return NULL;
@@ -2965,7 +2982,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
 	rt->rt6i_dst.plen = cfg->fc_dst_len;
 	if (rt->rt6i_dst.plen == 128)
-		rt->dst.flags |= DST_HOST;
+		rt->dst_host = true;
 
 #ifdef CONFIG_IPV6_SUBTREES
 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
@@ -3626,10 +3643,12 @@ struct rt6_info *addrconf_dst_alloc(struct net *net,
 	if (!rt)
 		return ERR_PTR(-ENOMEM);
 
+	rt->dst_nocount = true;
+
 	in6_dev_hold(idev);
 	rt->rt6i_idev = idev;
 
-	rt->dst.flags |= DST_HOST;
+	rt->dst_host = true;
 	rt->rt6i_protocol = RTPROT_KERNEL;
 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
 	if (anycast) {
-- 
cgit v1.2.3-59-g8ed1b


From f8a1b43b709d8ef33a8de2f8f35095b4a4413713 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 17 Apr 2018 17:33:21 -0700
Subject: net/ipv6: Create a neigh_lookup for FIB entries

The router discovery code has a FIB entry and wants to validate the
gateway has a neighbor entry. Refactor the existing dst_neigh_lookup
for IPv6 and create a new function that takes the gateway and device
and returns a neighbor entry. Use the new function in
ndisc_router_discovery to validate the gateway.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h |  3 +++
 net/ipv6/ndisc.c        |  8 ++++++--
 net/ipv6/route.c        | 33 ++++++++++++++++++++-------------
 3 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 655e13017a45..cb6fb7e16a28 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -279,4 +279,7 @@ static inline bool rt6_duplicate_nexthop(struct rt6_info *a, struct rt6_info *b)
 	       !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate);
 }
 
+struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
+				   struct net_device *dev, struct sk_buff *skb,
+				   const void *daddr);
 #endif
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index e4d9eea92139..556717154fa3 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1276,7 +1276,9 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 	rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev);
 
 	if (rt) {
-		neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr);
+		neigh = ip6_neigh_lookup(&rt->fib6_nh.nh_gw,
+					 rt->fib6_nh.nh_dev, NULL,
+					  &ipv6_hdr(skb)->saddr);
 		if (!neigh) {
 			ND_PRINTK(0, err,
 				  "RA: %s got default router without neighbour\n",
@@ -1304,7 +1306,9 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 			return;
 		}
 
-		neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr);
+		neigh = ip6_neigh_lookup(&rt->fib6_nh.nh_gw,
+					 rt->fib6_nh.nh_dev, NULL,
+					  &ipv6_hdr(skb)->saddr);
 		if (!neigh) {
 			ND_PRINTK(0, err,
 				  "RA: %s got default router without neighbour\n",
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1a3e0db31b34..d635d71f7d51 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -182,12 +182,10 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
 	}
 }
 
-static inline const void *choose_neigh_daddr(struct rt6_info *rt,
+static inline const void *choose_neigh_daddr(const struct in6_addr *p,
 					     struct sk_buff *skb,
 					     const void *daddr)
 {
-	struct in6_addr *p = &rt->rt6i_gateway;
-
 	if (!ipv6_addr_any(p))
 		return (const void *) p;
 	else if (skb)
@@ -195,18 +193,27 @@ static inline const void *choose_neigh_daddr(struct rt6_info *rt,
 	return daddr;
 }
 
-static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
-					  struct sk_buff *skb,
-					  const void *daddr)
+struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
+				   struct net_device *dev,
+				   struct sk_buff *skb,
+				   const void *daddr)
 {
-	struct rt6_info *rt = (struct rt6_info *) dst;
 	struct neighbour *n;
 
-	daddr = choose_neigh_daddr(rt, skb, daddr);
-	n = __ipv6_neigh_lookup(dst->dev, daddr);
+	daddr = choose_neigh_daddr(gw, skb, daddr);
+	n = __ipv6_neigh_lookup(dev, daddr);
 	if (n)
 		return n;
-	return neigh_create(&nd_tbl, daddr, dst->dev);
+	return neigh_create(&nd_tbl, daddr, dev);
+}
+
+static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
+					      struct sk_buff *skb,
+					      const void *daddr)
+{
+	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
+
+	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
 }
 
 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
@@ -214,7 +221,7 @@ static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 	struct net_device *dev = dst->dev;
 	struct rt6_info *rt = (struct rt6_info *)dst;
 
-	daddr = choose_neigh_daddr(rt, NULL, daddr);
+	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
 	if (!daddr)
 		return;
 	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
@@ -239,7 +246,7 @@ static struct dst_ops ip6_dst_ops_template = {
 	.update_pmtu		=	ip6_rt_update_pmtu,
 	.redirect		=	rt6_do_redirect,
 	.local_out		=	__ip6_local_out,
-	.neigh_lookup		=	ip6_neigh_lookup,
+	.neigh_lookup		=	ip6_dst_neigh_lookup,
 	.confirm_neigh		=	ip6_confirm_neigh,
 };
 
@@ -269,7 +276,7 @@ static struct dst_ops ip6_dst_blackhole_ops = {
 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
 	.redirect		=	ip6_rt_blackhole_redirect,
 	.cow_metrics		=	dst_cow_metrics_generic,
-	.neigh_lookup		=	ip6_neigh_lookup,
+	.neigh_lookup		=	ip6_dst_neigh_lookup,
 };
 
 static const u32 ip6_template_metrics[RTAX_MAX] = {
-- 
cgit v1.2.3-59-g8ed1b


From acb54e3cba404c20f07733f3222c0418a7724a5b Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 17 Apr 2018 17:33:22 -0700
Subject: net/ipv6: Add gfp_flags to route add functions

Most FIB entries can be added using memory allocated with GFP_KERNEL.
Add gfp_flags to ip6_route_add and addrconf_dst_alloc. Code paths that
can be reached from the packet path (e.g., ndisc and autoconfig) or
atomic notifiers use GFP_ATOMIC; paths from user context (adding
addresses and routes) use GFP_KERNEL.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h |  6 ++++--
 net/ipv6/addrconf.c     | 39 +++++++++++++++++++++++----------------
 net/ipv6/anycast.c      |  2 +-
 net/ipv6/route.c        | 18 ++++++++++--------
 4 files changed, 38 insertions(+), 27 deletions(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index cb6fb7e16a28..ff70266e30d7 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -100,7 +100,8 @@ void ip6_route_cleanup(void);
 
 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg);
 
-int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack);
+int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
+		  struct netlink_ext_ack *extack);
 int ip6_ins_rt(struct net *net, struct rt6_info *rt);
 int ip6_del_rt(struct net *net, struct rt6_info *rt);
 
@@ -138,7 +139,8 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6);
 void fib6_force_start_gc(struct net *net);
 
 struct rt6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev,
-				    const struct in6_addr *addr, bool anycast);
+				    const struct in6_addr *addr, bool anycast,
+				    gfp_t gfp_flags);
 
 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
 			       int flags);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index c8df5c6499db..915cd0734b27 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1037,7 +1037,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
 		goto out;
 	}
 
-	rt = addrconf_dst_alloc(net, idev, addr, false);
+	rt = addrconf_dst_alloc(net, idev, addr, false, gfp_flags);
 	if (IS_ERR(rt)) {
 		err = PTR_ERR(rt);
 		rt = NULL;
@@ -2320,7 +2320,7 @@ static void  ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpad
 
 static void
 addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
-		      unsigned long expires, u32 flags)
+		      unsigned long expires, u32 flags, gfp_t gfp_flags)
 {
 	struct fib6_config cfg = {
 		.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX,
@@ -2345,7 +2345,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
 		cfg.fc_flags |= RTF_NONEXTHOP;
 #endif
 
-	ip6_route_add(&cfg, NULL);
+	ip6_route_add(&cfg, gfp_flags, NULL);
 }
 
 
@@ -2401,7 +2401,7 @@ static void addrconf_add_mroute(struct net_device *dev)
 
 	ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0);
 
-	ip6_route_add(&cfg, NULL);
+	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
 }
 
 static struct inet6_dev *addrconf_add_dev(struct net_device *dev)
@@ -2685,7 +2685,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
 				expires = jiffies_to_clock_t(rt_expires);
 			}
 			addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len,
-					      dev, expires, flags);
+					      dev, expires, flags, GFP_ATOMIC);
 		}
 		ip6_rt_put(rt);
 	}
@@ -2900,7 +2900,7 @@ static int inet6_addr_add(struct net *net, int ifindex,
 	if (!IS_ERR(ifp)) {
 		if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) {
 			addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev,
-					      expires, flags);
+					      expires, flags, GFP_KERNEL);
 		}
 
 		/* Send a netlink notification if DAD is enabled and
@@ -3053,7 +3053,8 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
 
 	if (addr.s6_addr32[3]) {
 		add_addr(idev, &addr, plen, scope);
-		addrconf_prefix_route(&addr, plen, idev->dev, 0, pflags);
+		addrconf_prefix_route(&addr, plen, idev->dev, 0, pflags,
+				      GFP_ATOMIC);
 		return;
 	}
 
@@ -3078,7 +3079,7 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
 
 				add_addr(idev, &addr, plen, flag);
 				addrconf_prefix_route(&addr, plen, idev->dev, 0,
-						      pflags);
+						      pflags, GFP_ATOMIC);
 			}
 		}
 	}
@@ -3118,7 +3119,8 @@ void addrconf_add_linklocal(struct inet6_dev *idev,
 	ifp = ipv6_add_addr(idev, addr, NULL, 64, IFA_LINK, addr_flags,
 			    INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, true, NULL);
 	if (!IS_ERR(ifp)) {
-		addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, 0, 0);
+		addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev,
+				      0, 0, GFP_ATOMIC);
 		addrconf_dad_start(ifp);
 		in6_ifa_put(ifp);
 	}
@@ -3233,7 +3235,8 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
 			addrconf_add_linklocal(idev, &addr,
 					       IFA_F_STABLE_PRIVACY);
 		else if (prefix_route)
-			addrconf_prefix_route(&addr, 64, idev->dev, 0, 0);
+			addrconf_prefix_route(&addr, 64, idev->dev,
+					      0, 0, GFP_KERNEL);
 		break;
 	case IN6_ADDR_GEN_MODE_EUI64:
 		/* addrconf_add_linklocal also adds a prefix_route and we
@@ -3243,7 +3246,8 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
 		if (ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) == 0)
 			addrconf_add_linklocal(idev, &addr, 0);
 		else if (prefix_route)
-			addrconf_prefix_route(&addr, 64, idev->dev, 0, 0);
+			addrconf_prefix_route(&addr, 64, idev->dev,
+					      0, 0, GFP_ATOMIC);
 		break;
 	case IN6_ADDR_GEN_MODE_NONE:
 	default:
@@ -3346,7 +3350,8 @@ static int fixup_permanent_addr(struct net *net,
 	if (!ifp->rt || !ifp->rt->rt6i_node) {
 		struct rt6_info *rt, *prev;
 
-		rt = addrconf_dst_alloc(net, idev, &ifp->addr, false);
+		rt = addrconf_dst_alloc(net, idev, &ifp->addr, false,
+					GFP_ATOMIC);
 		if (IS_ERR(rt))
 			return PTR_ERR(rt);
 
@@ -3361,7 +3366,7 @@ static int fixup_permanent_addr(struct net *net,
 
 	if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) {
 		addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
-				      idev->dev, 0, 0);
+				      idev->dev, 0, 0, GFP_ATOMIC);
 	}
 
 	if (ifp->state == INET6_IFADDR_STATE_PREDAD)
@@ -4572,8 +4577,9 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags,
 		ipv6_ifa_notify(0, ifp);
 
 	if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) {
-		addrconf_prefix_route(&ifp->addr, ifp->prefix_len, ifp->idev->dev,
-				      expires, flags);
+		addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
+				      ifp->idev->dev, expires, flags,
+				      GFP_KERNEL);
 	} else if (had_prefixroute) {
 		enum cleanup_prefix_rt_t action;
 		unsigned long rt_expires;
@@ -5614,7 +5620,8 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 			addrconf_join_anycast(ifp);
 		if (!ipv6_addr_any(&ifp->peer_addr))
 			addrconf_prefix_route(&ifp->peer_addr, 128,
-					      ifp->idev->dev, 0, 0);
+					      ifp->idev->dev, 0, 0,
+					      GFP_KERNEL);
 		break;
 	case RTM_DELADDR:
 		if (ifp->idev->cnf.forwarding)
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 1122fb299b86..e456386fe4d5 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -267,7 +267,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
 	}
 
 	net = dev_net(idev->dev);
-	rt = addrconf_dst_alloc(net, idev, addr, true);
+	rt = addrconf_dst_alloc(net, idev, addr, true, GFP_ATOMIC);
 	if (IS_ERR(rt)) {
 		err = PTR_ERR(rt);
 		goto out;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index d635d71f7d51..56a854b426a6 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2866,6 +2866,7 @@ out:
 }
 
 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
+					      gfp_t gfp_flags,
 					      struct netlink_ext_ack *extack)
 {
 	struct net *net = cfg->fc_nlinfo.nl_net;
@@ -3086,12 +3087,13 @@ out:
 	return ERR_PTR(err);
 }
 
-int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack)
+int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
+		  struct netlink_ext_ack *extack)
 {
 	struct rt6_info *rt;
 	int err;
 
-	rt = ip6_route_info_create(cfg, extack);
+	rt = ip6_route_info_create(cfg, gfp_flags, extack);
 	if (IS_ERR(rt))
 		return PTR_ERR(rt);
 
@@ -3418,7 +3420,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
 	if (!prefixlen)
 		cfg.fc_flags |= RTF_DEFAULT;
 
-	ip6_route_add(&cfg, NULL);
+	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
 
 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
 }
@@ -3469,7 +3471,7 @@ struct rt6_info *rt6_add_dflt_router(struct net *net,
 
 	cfg.fc_gateway = *gwaddr;
 
-	if (!ip6_route_add(&cfg, NULL)) {
+	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
 		struct fib6_table *table;
 
 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
@@ -3567,7 +3569,7 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 		rtnl_lock();
 		switch (cmd) {
 		case SIOCADDRT:
-			err = ip6_route_add(&cfg, NULL);
+			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
 			break;
 		case SIOCDELRT:
 			err = ip6_route_del(&cfg, NULL);
@@ -3640,7 +3642,7 @@ static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff
 struct rt6_info *addrconf_dst_alloc(struct net *net,
 				    struct inet6_dev *idev,
 				    const struct in6_addr *addr,
-				    bool anycast)
+				    bool anycast, gfp_t gfp_flags)
 {
 	u32 tb_id;
 	struct net_device *dev = idev->dev;
@@ -4293,7 +4295,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
 		}
 
 		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
-		rt = ip6_route_info_create(&r_cfg, extack);
+		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
 		if (IS_ERR(rt)) {
 			err = PTR_ERR(rt);
 			rt = NULL;
@@ -4446,7 +4448,7 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (cfg.fc_mp)
 		return ip6_route_multipath_add(&cfg, extack);
 	else
-		return ip6_route_add(&cfg, extack);
+		return ip6_route_add(&cfg, GFP_KERNEL, extack);
 }
 
 static size_t rt6_nlmsg_size(struct rt6_info *rt)
-- 
cgit v1.2.3-59-g8ed1b


From 23fb93a4d3f118a900790066d03368a296dce0d6 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 17 Apr 2018 17:33:23 -0700
Subject: net/ipv6: Cleanup exception and cache route handling

IPv6 FIB will only contain FIB entries with exception routes added to
the FIB entry. Once this transformation is complete, FIB lookups will
return a fib6_info with the lookup functions still returning a dst
based rt6_info. The current code uses rt6_info for both paths and
overloads the rt6_info variable usually called 'rt'.

This patch introduces a new 'f6i' variable name for the result of the FIB
lookup and keeps 'rt' as the dst based return variable. 'f6i' becomes a
fib6_info in a later patch which is why it is introduced as f6i now;
avoids the additional churn in the later patch.

In addition, remove RTF_CACHE and dst checks from fib6 add and delete
since they can not happen now and will never happen after the data
type flip.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h |   1 -
 net/ipv6/ip6_fib.c      |  16 +-----
 net/ipv6/route.c        | 142 +++++++++++++++++++++++++++---------------------
 3 files changed, 81 insertions(+), 78 deletions(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index ff70266e30d7..686cdc7f356a 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -106,7 +106,6 @@ int ip6_ins_rt(struct net *net, struct rt6_info *rt);
 int ip6_del_rt(struct net *net, struct rt6_info *rt);
 
 void rt6_flush_exceptions(struct rt6_info *rt);
-int rt6_remove_exception_rt(struct rt6_info *rt);
 void rt6_age_exceptions(struct rt6_info *rt, struct fib6_gc_args *gc_args,
 			unsigned long now);
 
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 280b69497ad0..53df4a98a7f7 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1074,7 +1074,7 @@ add:
 static void fib6_start_gc(struct net *net, struct rt6_info *rt)
 {
 	if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
-	    (rt->rt6i_flags & (RTF_EXPIRES | RTF_CACHE)))
+	    (rt->rt6i_flags & RTF_EXPIRES))
 		mod_timer(&net->ipv6.ip6_fib_timer,
 			  jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
 }
@@ -1125,8 +1125,6 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 
 	if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt)))
 		return -EINVAL;
-	if (WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE))
-		return -EINVAL;
 
 	if (info->nlh) {
 		if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
@@ -1650,8 +1648,6 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
 
 	RT6_TRACE("fib6_del_route\n");
 
-	WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE);
-
 	/* Unlink it */
 	*rtp = rt->rt6_next;
 	rt->rt6i_node = NULL;
@@ -1720,21 +1716,11 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info)
 	struct rt6_info __rcu **rtp;
 	struct rt6_info __rcu **rtp_next;
 
-#if RT6_DEBUG >= 2
-	if (rt->dst.obsolete > 0) {
-		WARN_ON(fn);
-		return -ENOENT;
-	}
-#endif
 	if (!fn || rt == net->ipv6.fib6_null_entry)
 		return -ENOENT;
 
 	WARN_ON(!(fn->fn_flags & RTN_RTINFO));
 
-	/* remove cached dst from exception table */
-	if (rt->rt6i_flags & RTF_CACHE)
-		return rt6_remove_exception_rt(rt);
-
 	/*
 	 *	Walk the leaf entries looking for ourself
 	 */
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 56a854b426a6..ad9eaecf539c 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1013,8 +1013,8 @@ static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
 	BUG_ON(from->from);
 
 	rt->rt6i_flags &= ~RTF_EXPIRES;
-	dst_hold(&from->dst);
-	rt->from = from;
+	if (dst_hold_safe(&from->dst))
+		rt->from = from;
 	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
 	if (from->fib6_metrics != &dst_default_metrics) {
 		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
@@ -1097,8 +1097,9 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 					     const struct sk_buff *skb,
 					     int flags)
 {
-	struct rt6_info *rt, *rt_cache;
+	struct rt6_info *f6i;
 	struct fib6_node *fn;
+	struct rt6_info *rt;
 
 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
 		flags &= ~RT6_LOOKUP_F_IFACE;
@@ -1106,36 +1107,36 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 	rcu_read_lock();
 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 restart:
-	rt = rcu_dereference(fn->leaf);
-	if (!rt) {
-		rt = net->ipv6.fib6_null_entry;
+	f6i = rcu_dereference(fn->leaf);
+	if (!f6i) {
+		f6i = net->ipv6.fib6_null_entry;
 	} else {
-		rt = rt6_device_match(net, rt, &fl6->saddr,
+		f6i = rt6_device_match(net, f6i, &fl6->saddr,
 				      fl6->flowi6_oif, flags);
-		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
-			rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif,
-						  skb, flags);
+		if (f6i->rt6i_nsiblings && fl6->flowi6_oif == 0)
+			f6i = rt6_multipath_select(net, f6i, fl6,
+						   fl6->flowi6_oif, skb, flags);
 	}
-	if (rt == net->ipv6.fib6_null_entry) {
+	if (f6i == net->ipv6.fib6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
 		if (fn)
 			goto restart;
 	}
+
 	/* Search through exception table */
-	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
-	if (rt_cache) {
-		rt = rt_cache;
+	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
+	if (rt) {
 		if (ip6_hold_safe(net, &rt, true))
 			dst_use_noref(&rt->dst, jiffies);
-	} else if (dst_hold_safe(&rt->dst)) {
-		struct rt6_info *nrt;
-
-		nrt = ip6_create_rt_rcu(rt);
-		dst_release(&rt->dst);
-		rt = nrt;
-	} else {
+	} else if (f6i == net->ipv6.fib6_null_entry) {
 		rt = net->ipv6.ip6_null_entry;
 		dst_hold(&rt->dst);
+	} else {
+		rt = ip6_create_rt_rcu(f6i);
+		if (!rt) {
+			rt = net->ipv6.ip6_null_entry;
+			dst_hold(&rt->dst);
+		}
 	}
 
 	rcu_read_unlock();
@@ -1218,9 +1219,6 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
 	 *	Clone the route.
 	 */
 
-	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
-		ort = ort->from;
-
 	rcu_read_lock();
 	dev = ip6_rt_get_dev_rcu(ort);
 	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
@@ -1446,11 +1444,6 @@ static int rt6_insert_exception(struct rt6_info *nrt,
 	struct rt6_exception *rt6_ex;
 	int err = 0;
 
-	/* ort can't be a cache or pcpu route */
-	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
-		ort = ort->from;
-	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
-
 	spin_lock_bh(&rt6_exception_lock);
 
 	if (ort->exception_bucket_flushed) {
@@ -1589,7 +1582,7 @@ static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
 }
 
 /* Remove the passed in cached rt from the hash table that contains it */
-int rt6_remove_exception_rt(struct rt6_info *rt)
+static int rt6_remove_exception_rt(struct rt6_info *rt)
 {
 	struct rt6_exception_bucket *bucket;
 	struct rt6_info *from = rt->from;
@@ -1854,7 +1847,8 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 			       const struct sk_buff *skb, int flags)
 {
 	struct fib6_node *fn, *saved_fn;
-	struct rt6_info *rt, *rt_cache;
+	struct rt6_info *f6i;
+	struct rt6_info *rt;
 	int strict = 0;
 
 	strict |= flags & RT6_LOOKUP_F_IFACE;
@@ -1871,10 +1865,10 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 		oif = 0;
 
 redo_rt6_select:
-	rt = rt6_select(net, fn, oif, strict);
-	if (rt->rt6i_nsiblings)
-		rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict);
-	if (rt == net->ipv6.fib6_null_entry) {
+	f6i = rt6_select(net, fn, oif, strict);
+	if (f6i->rt6i_nsiblings)
+		f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict);
+	if (f6i == net->ipv6.fib6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
 		if (fn)
 			goto redo_rt6_select;
@@ -1886,18 +1880,17 @@ redo_rt6_select:
 		}
 	}
 
-	/*Search through exception table */
-	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
-	if (rt_cache)
-		rt = rt_cache;
-
-	if (rt == net->ipv6.fib6_null_entry) {
+	if (f6i == net->ipv6.fib6_null_entry) {
 		rt = net->ipv6.ip6_null_entry;
 		rcu_read_unlock();
 		dst_hold(&rt->dst);
 		trace_fib6_table_lookup(net, rt, table, fl6);
 		return rt;
-	} else if (rt->rt6i_flags & RTF_CACHE) {
+	}
+
+	/*Search through exception table */
+	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
+	if (rt) {
 		if (ip6_hold_safe(net, &rt, true))
 			dst_use_noref(&rt->dst, jiffies);
 
@@ -1905,7 +1898,7 @@ redo_rt6_select:
 		trace_fib6_table_lookup(net, rt, table, fl6);
 		return rt;
 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
-			    !(rt->rt6i_flags & RTF_GATEWAY))) {
+			    !(f6i->rt6i_flags & RTF_GATEWAY))) {
 		/* Create a RTF_CACHE clone which will not be
 		 * owned by the fib6 tree.  It is for the special case where
 		 * the daddr in the skb during the neighbor look-up is different
@@ -1914,16 +1907,16 @@ redo_rt6_select:
 
 		struct rt6_info *uncached_rt;
 
-		if (ip6_hold_safe(net, &rt, true)) {
-			dst_use_noref(&rt->dst, jiffies);
+		if (ip6_hold_safe(net, &f6i, true)) {
+			dst_use_noref(&f6i->dst, jiffies);
 		} else {
 			rcu_read_unlock();
-			uncached_rt = rt;
+			uncached_rt = f6i;
 			goto uncached_rt_out;
 		}
 		rcu_read_unlock();
 
-		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
+		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
 		dst_release(&rt->dst);
 
 		if (uncached_rt) {
@@ -1946,18 +1939,18 @@ uncached_rt_out:
 
 		struct rt6_info *pcpu_rt;
 
-		dst_use_noref(&rt->dst, jiffies);
+		dst_use_noref(&f6i->dst, jiffies);
 		local_bh_disable();
-		pcpu_rt = rt6_get_pcpu_route(rt);
+		pcpu_rt = rt6_get_pcpu_route(f6i);
 
 		if (!pcpu_rt) {
 			/* atomic_inc_not_zero() is needed when using rcu */
-			if (atomic_inc_not_zero(&rt->rt6i_ref)) {
+			if (atomic_inc_not_zero(&f6i->rt6i_ref)) {
 				/* No dst_hold() on rt is needed because grabbing
 				 * rt->rt6i_ref makes sure rt can't be released.
 				 */
-				pcpu_rt = rt6_make_pcpu_route(net, rt);
-				rt6_release(rt);
+				pcpu_rt = rt6_make_pcpu_route(net, f6i);
+				rt6_release(f6i);
 			} else {
 				/* rt is already removed from tree */
 				pcpu_rt = net->ipv6.ip6_null_entry;
@@ -2419,7 +2412,8 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
 					     int flags)
 {
 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
-	struct rt6_info *rt, *rt_cache;
+	struct rt6_info *ret = NULL, *rt_cache;
+	struct rt6_info *rt;
 	struct fib6_node *fn;
 
 	/* Get the "current" route for this destination and
@@ -2458,7 +2452,7 @@ restart:
 			if (rt_cache &&
 			    ipv6_addr_equal(&rdfl->gateway,
 					    &rt_cache->rt6i_gateway)) {
-				rt = rt_cache;
+				ret = rt_cache;
 				break;
 			}
 			continue;
@@ -2469,7 +2463,7 @@ restart:
 	if (!rt)
 		rt = net->ipv6.fib6_null_entry;
 	else if (rt->rt6i_flags & RTF_REJECT) {
-		rt = net->ipv6.ip6_null_entry;
+		ret = net->ipv6.ip6_null_entry;
 		goto out;
 	}
 
@@ -2480,12 +2474,15 @@ restart:
 	}
 
 out:
-	ip6_hold_safe(net, &rt, true);
+	if (ret)
+		dst_hold(&ret->dst);
+	else
+		ret = ip6_create_rt_rcu(rt);
 
 	rcu_read_unlock();
 
-	trace_fib6_table_lookup(net, rt, table, fl6);
-	return rt;
+	trace_fib6_table_lookup(net, ret, table, fl6);
+	return ret;
 };
 
 static struct dst_entry *ip6_route_redirect(struct net *net,
@@ -3182,6 +3179,22 @@ out_put:
 	return err;
 }
 
+static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
+{
+	int rc = -ESRCH;
+
+	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
+		goto out;
+
+	if (cfg->fc_flags & RTF_GATEWAY &&
+	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
+		goto out;
+	if (dst_hold_safe(&rt->dst))
+		rc = rt6_remove_exception_rt(rt);
+out:
+	return rc;
+}
+
 static int ip6_route_del(struct fib6_config *cfg,
 			 struct netlink_ext_ack *extack)
 {
@@ -3206,11 +3219,16 @@ static int ip6_route_del(struct fib6_config *cfg,
 	if (fn) {
 		for_each_fib6_node_rt_rcu(fn) {
 			if (cfg->fc_flags & RTF_CACHE) {
+				int rc;
+
 				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
 							      &cfg->fc_src);
-				if (!rt_cache)
-					continue;
-				rt = rt_cache;
+				if (rt_cache) {
+					rc = ip6_del_cached_rt(rt_cache, cfg);
+					if (rc != -ESRCH)
+						return rc;
+				}
+				continue;
 			}
 			if (cfg->fc_ifindex &&
 			    (!rt->fib6_nh.nh_dev ||
@@ -3327,7 +3345,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 				     NEIGH_UPDATE_F_ISROUTER)),
 		     NDISC_REDIRECT, &ndopts);
 
-	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
+	nrt = ip6_rt_cache_alloc(rt->from, &msg->dest, NULL);
 	if (!nrt)
 		goto out;
 
-- 
cgit v1.2.3-59-g8ed1b


From a64efe142f5e70b7e39276a414bbb3b96691c608 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 17 Apr 2018 17:33:24 -0700
Subject: net/ipv6: introduce fib6_info struct and helpers

Add fib6_info struct and alloc, destroy, hold and release helpers.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/ip6_fib.c    | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 115 insertions(+)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 159f651dee55..630392ae12d8 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -38,6 +38,7 @@
 #endif
 
 struct rt6_info;
+struct fib6_info;
 
 struct fib6_config {
 	u32		fc_table;
@@ -132,6 +133,46 @@ struct fib6_nh {
 	int			nh_weight;
 };
 
+struct fib6_info {
+	struct fib6_table		*rt6i_table;
+	struct fib6_info __rcu		*rt6_next;
+	struct fib6_node __rcu		*rt6i_node;
+
+	/* Multipath routes:
+	 * siblings is a list of fib6_info that have the the same metric/weight,
+	 * destination, but not the same gateway. nsiblings is just a cache
+	 * to speed up lookup.
+	 */
+	struct list_head		rt6i_siblings;
+	unsigned int			rt6i_nsiblings;
+
+	atomic_t			rt6i_ref;
+	struct inet6_dev		*rt6i_idev;
+	unsigned long			expires;
+	struct dst_metrics		*fib6_metrics;
+#define fib6_pmtu		fib6_metrics->metrics[RTAX_MTU-1]
+
+	struct rt6key			rt6i_dst;
+	u32				rt6i_flags;
+	struct rt6key			rt6i_src;
+	struct rt6key			rt6i_prefsrc;
+
+	struct rt6_info * __percpu	*rt6i_pcpu;
+	struct rt6_exception_bucket __rcu *rt6i_exception_bucket;
+
+	u32				rt6i_metric;
+	u8				rt6i_protocol;
+	u8				fib6_type;
+	u8				exception_bucket_flushed:1,
+					should_flush:1,
+					dst_nocount:1,
+					dst_nopolicy:1,
+					dst_host:1,
+					unused:3;
+
+	struct fib6_nh			fib6_nh;
+};
+
 struct rt6_info {
 	struct dst_entry		dst;
 	struct rt6_info __rcu		*rt6_next;
@@ -291,6 +332,20 @@ static inline void ip6_rt_put(struct rt6_info *rt)
 
 void rt6_free_pcpu(struct rt6_info *non_pcpu_rt);
 
+struct rt6_info *fib6_info_alloc(gfp_t gfp_flags);
+void fib6_info_destroy(struct rt6_info *f6i);
+
+static inline void fib6_info_hold(struct rt6_info *f6i)
+{
+	atomic_inc(&f6i->rt6i_ref);
+}
+
+static inline void fib6_info_release(struct rt6_info *f6i)
+{
+	if (f6i && atomic_dec_and_test(&f6i->rt6i_ref))
+		fib6_info_destroy(f6i);
+}
+
 static inline void rt6_hold(struct rt6_info *rt)
 {
 	atomic_inc(&rt->rt6i_ref);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 53df4a98a7f7..d07578d84db0 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -145,6 +145,66 @@ static __be32 addr_bit_set(const void *token, int fn_bit)
 	       addr[fn_bit >> 5];
 }
 
+struct rt6_info *fib6_info_alloc(gfp_t gfp_flags)
+{
+	struct rt6_info *f6i;
+
+	f6i = kzalloc(sizeof(*f6i), gfp_flags);
+	if (!f6i)
+		return NULL;
+
+	f6i->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
+	if (!f6i->rt6i_pcpu) {
+		kfree(f6i);
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&f6i->rt6i_siblings);
+	f6i->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
+
+	atomic_inc(&f6i->rt6i_ref);
+
+	return f6i;
+}
+
+void fib6_info_destroy(struct rt6_info *f6i)
+{
+	struct rt6_exception_bucket *bucket;
+
+	WARN_ON(f6i->rt6i_node);
+
+	bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1);
+	if (bucket) {
+		f6i->rt6i_exception_bucket = NULL;
+		kfree(bucket);
+	}
+
+	if (f6i->rt6i_pcpu) {
+		int cpu;
+
+		for_each_possible_cpu(cpu) {
+			struct rt6_info **ppcpu_rt;
+			struct rt6_info *pcpu_rt;
+
+			ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
+			pcpu_rt = *ppcpu_rt;
+			if (pcpu_rt) {
+				dst_dev_put(&pcpu_rt->dst);
+				dst_release(&pcpu_rt->dst);
+				*ppcpu_rt = NULL;
+			}
+		}
+	}
+
+	if (f6i->rt6i_idev)
+		in6_dev_put(f6i->rt6i_idev);
+	if (f6i->fib6_nh.nh_dev)
+		dev_put(f6i->fib6_nh.nh_dev);
+
+	kfree(f6i);
+}
+EXPORT_SYMBOL_GPL(fib6_info_destroy);
+
 static struct fib6_node *node_alloc(struct net *net)
 {
 	struct fib6_node *fn;
-- 
cgit v1.2.3-59-g8ed1b


From 93531c6743157d7e8c5792f8ed1a57641149d62c Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 17 Apr 2018 17:33:25 -0700
Subject: net/ipv6: separate handling of FIB entries from dst based routes

Last step before flipping the data type for FIB entries:
- use fib6_info_alloc to create FIB entries in ip6_route_info_create
  and addrconf_dst_alloc
- use fib6_info_release in place of dst_release, ip6_rt_put and
  rt6_release
- remove the dst_hold before calling __ip6_ins_rt or ip6_del_rt
- when purging routes, drop per-cpu routes
- replace inc and dec of rt6i_ref with fib6_info_hold and fib6_info_release
- use rt->from since it points to the FIB entry
- drop references to exception bucket, fib6_metrics and per-cpu from
  dst entries (those are relevant for fib entries only)

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h   |   4 +-
 include/net/ip6_route.h |   3 +-
 net/ipv6/addrconf.c     |  18 +++--
 net/ipv6/anycast.c      |   7 +-
 net/ipv6/ip6_fib.c      |  55 ++++++++++------
 net/ipv6/ip6_output.c   |   3 +-
 net/ipv6/ndisc.c        |   6 +-
 net/ipv6/route.c        | 171 +++++++++++++++++-------------------------------
 8 files changed, 115 insertions(+), 152 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 630392ae12d8..6c3d92bb3459 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -314,9 +314,7 @@ static inline u32 rt6_get_cookie(const struct rt6_info *rt)
 
 	if (rt->rt6i_flags & RTF_PCPU ||
 	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
-		rt = rt->from;
-
-	rt6_get_cookie_safe(rt, &cookie);
+		rt6_get_cookie_safe(rt->from, &cookie);
 
 	return cookie;
 }
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 686cdc7f356a..57d0d45667f1 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -114,8 +114,7 @@ static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt,
 				      unsigned int prefs,
 				      struct in6_addr *saddr)
 {
-	struct inet6_dev *idev =
-			rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
+	struct inet6_dev *idev = rt ? rt->rt6i_idev : NULL;
 	int err = 0;
 
 	if (rt && rt->rt6i_prefsrc.plen)
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 915cd0734b27..e533a447f68c 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -916,7 +916,6 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp)
 		pr_warn("Freeing alive inet6 address %p\n", ifp);
 		return;
 	}
-	ip6_rt_put(ifp->rt);
 
 	kfree_rcu(ifp, rcu);
 }
@@ -1102,8 +1101,8 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
 	inet6addr_notifier_call_chain(NETDEV_UP, ifa);
 out:
 	if (unlikely(err < 0)) {
-		if (rt)
-			ip6_rt_put(rt);
+		fib6_info_release(rt);
+
 		if (ifa) {
 			if (ifa->idev)
 				in6_dev_put(ifa->idev);
@@ -1191,7 +1190,7 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_r
 		else {
 			if (!(rt->rt6i_flags & RTF_EXPIRES))
 				fib6_set_expires(rt, expires);
-			ip6_rt_put(rt);
+			fib6_info_release(rt);
 		}
 	}
 }
@@ -2375,8 +2374,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
 			continue;
 		if ((rt->rt6i_flags & noflags) != 0)
 			continue;
-		if (!dst_hold_safe(&rt->dst))
-			rt = NULL;
+		fib6_info_hold(rt);
 		break;
 	}
 out:
@@ -2687,7 +2685,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
 			addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len,
 					      dev, expires, flags, GFP_ATOMIC);
 		}
-		ip6_rt_put(rt);
+		fib6_info_release(rt);
 	}
 
 	/* Try to figure out our local address for this prefix */
@@ -3361,7 +3359,7 @@ static int fixup_permanent_addr(struct net *net,
 		ifp->rt = rt;
 		spin_unlock(&ifp->lock);
 
-		ip6_rt_put(prev);
+		fib6_info_release(prev);
 	}
 
 	if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) {
@@ -5636,8 +5634,8 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 				ip6_del_rt(net, rt);
 		}
 		if (ifp->rt) {
-			if (dst_hold_safe(&ifp->rt->dst))
-				ip6_del_rt(net, ifp->rt);
+			ip6_del_rt(net, ifp->rt);
+			ifp->rt = NULL;
 		}
 		rt_genid_bump_ipv6(net);
 		break;
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index e456386fe4d5..3db8fe10322b 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -213,7 +213,7 @@ static void aca_put(struct ifacaddr6 *ac)
 {
 	if (refcount_dec_and_test(&ac->aca_refcnt)) {
 		in6_dev_put(ac->aca_idev);
-		dst_release(&ac->aca_rt->dst);
+		fib6_info_release(ac->aca_rt);
 		kfree(ac);
 	}
 }
@@ -231,6 +231,7 @@ static struct ifacaddr6 *aca_alloc(struct rt6_info *rt,
 	aca->aca_addr = *addr;
 	in6_dev_hold(idev);
 	aca->aca_idev = idev;
+	fib6_info_hold(rt);
 	aca->aca_rt = rt;
 	aca->aca_users = 1;
 	/* aca_tstamp should be updated upon changes */
@@ -274,7 +275,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
 	}
 	aca = aca_alloc(rt, addr);
 	if (!aca) {
-		ip6_rt_put(rt);
+		fib6_info_release(rt);
 		err = -ENOMEM;
 		goto out;
 	}
@@ -330,7 +331,6 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr)
 	write_unlock_bh(&idev->lock);
 	addrconf_leave_solict(idev, &aca->aca_addr);
 
-	dst_hold(&aca->aca_rt->dst);
 	ip6_del_rt(dev_net(idev->dev), aca->aca_rt);
 
 	aca_put(aca);
@@ -358,7 +358,6 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev)
 
 		addrconf_leave_solict(idev, &aca->aca_addr);
 
-		dst_hold(&aca->aca_rt->dst);
 		ip6_del_rt(dev_net(idev->dev), aca->aca_rt);
 
 		aca_put(aca);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index d07578d84db0..4d6bd033dccd 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -170,6 +170,7 @@ struct rt6_info *fib6_info_alloc(gfp_t gfp_flags)
 void fib6_info_destroy(struct rt6_info *f6i)
 {
 	struct rt6_exception_bucket *bucket;
+	struct dst_metrics *m;
 
 	WARN_ON(f6i->rt6i_node);
 
@@ -201,6 +202,10 @@ void fib6_info_destroy(struct rt6_info *f6i)
 	if (f6i->fib6_nh.nh_dev)
 		dev_put(f6i->fib6_nh.nh_dev);
 
+	m = f6i->fib6_metrics;
+	if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt))
+		kfree(m);
+
 	kfree(f6i);
 }
 EXPORT_SYMBOL_GPL(fib6_info_destroy);
@@ -714,7 +719,7 @@ static struct fib6_node *fib6_add_1(struct net *net,
 			/* clean up an intermediate node */
 			if (!(fn->fn_flags & RTN_RTINFO)) {
 				RCU_INIT_POINTER(fn->leaf, NULL);
-				rt6_release(leaf);
+				fib6_info_release(leaf);
 			/* remove null_entry in the root node */
 			} else if (fn->fn_flags & RTN_TL_ROOT &&
 				   rcu_access_pointer(fn->leaf) ==
@@ -898,12 +903,32 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
 			if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) {
 				new_leaf = fib6_find_prefix(net, table, fn);
 				atomic_inc(&new_leaf->rt6i_ref);
+
 				rcu_assign_pointer(fn->leaf, new_leaf);
-				rt6_release(rt);
+				fib6_info_release(rt);
 			}
 			fn = rcu_dereference_protected(fn->parent,
 				    lockdep_is_held(&table->tb6_lock));
 		}
+
+		if (rt->rt6i_pcpu) {
+			int cpu;
+
+			/* release the reference to this fib entry from
+			 * all of its cached pcpu routes
+			 */
+			for_each_possible_cpu(cpu) {
+				struct rt6_info **ppcpu_rt;
+				struct rt6_info *pcpu_rt;
+
+				ppcpu_rt = per_cpu_ptr(rt->rt6i_pcpu, cpu);
+				pcpu_rt = *ppcpu_rt;
+				if (pcpu_rt) {
+					fib6_info_release(pcpu_rt->from);
+					pcpu_rt->from = NULL;
+				}
+			}
+		}
 	}
 }
 
@@ -1099,7 +1124,7 @@ add:
 		fib6_purge_rt(iter, fn, info->nl_net);
 		if (rcu_access_pointer(fn->rr_ptr) == iter)
 			fn->rr_ptr = NULL;
-		rt6_release(iter);
+		fib6_info_release(iter);
 
 		if (nsiblings) {
 			/* Replacing an ECMP route, remove all siblings */
@@ -1115,7 +1140,7 @@ add:
 					fib6_purge_rt(iter, fn, info->nl_net);
 					if (rcu_access_pointer(fn->rr_ptr) == iter)
 						fn->rr_ptr = NULL;
-					rt6_release(iter);
+					fib6_info_release(iter);
 					nsiblings--;
 					info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
 				} else {
@@ -1183,9 +1208,6 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 	int replace_required = 0;
 	int sernum = fib6_new_sernum(info->nl_net);
 
-	if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt)))
-		return -EINVAL;
-
 	if (info->nlh) {
 		if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
 			allow_create = 0;
@@ -1300,7 +1322,7 @@ out:
 			if (pn_leaf == rt) {
 				pn_leaf = NULL;
 				RCU_INIT_POINTER(pn->leaf, NULL);
-				atomic_dec(&rt->rt6i_ref);
+				fib6_info_release(rt);
 			}
 			if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) {
 				pn_leaf = fib6_find_prefix(info->nl_net, table,
@@ -1312,7 +1334,7 @@ out:
 					    info->nl_net->ipv6.fib6_null_entry;
 				}
 #endif
-				atomic_inc(&pn_leaf->rt6i_ref);
+				fib6_info_hold(pn_leaf);
 				rcu_assign_pointer(pn->leaf, pn_leaf);
 			}
 		}
@@ -1334,10 +1356,6 @@ failure:
 	     (fn->fn_flags & RTN_TL_ROOT &&
 	      !rcu_access_pointer(fn->leaf))))
 		fib6_repair_tree(info->nl_net, table, fn);
-	/* Always release dst as dst->__refcnt is guaranteed
-	 * to be taken before entering this function
-	 */
-	dst_release_immediate(&rt->dst);
 	return err;
 }
 
@@ -1637,7 +1655,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
 				new_fn_leaf = net->ipv6.fib6_null_entry;
 			}
 #endif
-			atomic_inc(&new_fn_leaf->rt6i_ref);
+			fib6_info_hold(new_fn_leaf);
 			rcu_assign_pointer(fn->leaf, new_fn_leaf);
 			return pn;
 		}
@@ -1693,7 +1711,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
 			return pn;
 
 		RCU_INIT_POINTER(pn->leaf, NULL);
-		rt6_release(pn_leaf);
+		fib6_info_release(pn_leaf);
 		fn = pn;
 	}
 }
@@ -1763,7 +1781,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
 	call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL);
 	if (!info->skip_notify)
 		inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
-	rt6_release(rt);
+	fib6_info_release(rt);
 }
 
 /* Need to own table->tb6_lock */
@@ -2261,9 +2279,8 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v)
 
 	dev = rt->fib6_nh.nh_dev;
 	seq_printf(seq, " %08x %08x %08x %08x %8s\n",
-		   rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
-		   rt->dst.__use, rt->rt6i_flags,
-		   dev ? dev->name : "");
+		   rt->rt6i_metric, atomic_read(&rt->rt6i_ref), 0,
+		   rt->rt6i_flags, dev ? dev->name : "");
 	iter->w.leaf = NULL;
 	return 0;
 }
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index a39b04f9fa6e..3db47986ef38 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -968,7 +968,8 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
 		if (!had_dst)
 			*dst = ip6_route_output(net, sk, fl6);
 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
-		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
+		err = ip6_route_get_saddr(net, rt ? rt->from : NULL,
+					  &fl6->daddr,
 					  sk ? inet6_sk(sk)->srcprefs : 0,
 					  &fl6->saddr);
 		if (err)
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 556717154fa3..a28857088bff 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1283,7 +1283,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 			ND_PRINTK(0, err,
 				  "RA: %s got default router without neighbour\n",
 				  __func__);
-			ip6_rt_put(rt);
+			fib6_info_release(rt);
 			return;
 		}
 	}
@@ -1313,7 +1313,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 			ND_PRINTK(0, err,
 				  "RA: %s got default router without neighbour\n",
 				  __func__);
-			ip6_rt_put(rt);
+			fib6_info_release(rt);
 			return;
 		}
 		neigh->flags |= NTF_ROUTER;
@@ -1499,7 +1499,7 @@ skip_routeinfo:
 		ND_PRINTK(2, warn, "RA: invalid RA options\n");
 	}
 out:
-	ip6_rt_put(rt);
+	fib6_info_release(rt);
 	if (neigh)
 		neigh_release(neigh);
 }
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ad9eaecf539c..0d861bd07673 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -351,13 +351,11 @@ static void rt6_info_init(struct rt6_info *rt)
 	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
 	INIT_LIST_HEAD(&rt->rt6i_siblings);
 	INIT_LIST_HEAD(&rt->rt6i_uncached);
-	rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
 }
 
 /* allocate dst with ip6_dst_ops */
-static struct rt6_info *__ip6_dst_alloc(struct net *net,
-					struct net_device *dev,
-					int flags)
+struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
+			       int flags)
 {
 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
 					1, DST_OBSOLETE_FORCE_CHK, flags);
@@ -369,35 +367,15 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net,
 
 	return rt;
 }
-
-struct rt6_info *ip6_dst_alloc(struct net *net,
-			       struct net_device *dev,
-			       int flags)
-{
-	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
-
-	if (rt) {
-		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
-		if (!rt->rt6i_pcpu) {
-			dst_release_immediate(&rt->dst);
-			return NULL;
-		}
-	}
-
-	return rt;
-}
 EXPORT_SYMBOL(ip6_dst_alloc);
 
 static void ip6_dst_destroy(struct dst_entry *dst)
 {
 	struct rt6_info *rt = (struct rt6_info *)dst;
-	struct rt6_exception_bucket *bucket;
 	struct rt6_info *from = rt->from;
 	struct inet6_dev *idev;
-	struct dst_metrics *m;
 
 	dst_destroy_metrics_generic(dst);
-	free_percpu(rt->rt6i_pcpu);
 	rt6_uncached_list_del(rt);
 
 	idev = rt->rt6i_idev;
@@ -405,18 +383,9 @@ static void ip6_dst_destroy(struct dst_entry *dst)
 		rt->rt6i_idev = NULL;
 		in6_dev_put(idev);
 	}
-	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
-	if (bucket) {
-		rt->rt6i_exception_bucket = NULL;
-		kfree(bucket);
-	}
-
-	m = rt->fib6_metrics;
-	if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt))
-		kfree(m);
 
 	rt->from = NULL;
-	dst_release(&from->dst);
+	fib6_info_release(from);
 }
 
 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
@@ -891,7 +860,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 		else
 			fib6_set_expires(rt, jiffies + HZ * lifetime);
 
-		ip6_rt_put(rt);
+		fib6_info_release(rt);
 	}
 	return 0;
 }
@@ -1010,11 +979,9 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort)
 
 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
 {
-	BUG_ON(from->from);
-
 	rt->rt6i_flags &= ~RTF_EXPIRES;
-	if (dst_hold_safe(&from->dst))
-		rt->from = from;
+	fib6_info_hold(from);
+	rt->from = from;
 	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
 	if (from->fib6_metrics != &dst_default_metrics) {
 		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
@@ -1084,7 +1051,7 @@ static struct rt6_info *ip6_create_rt_rcu(struct rt6_info *rt)
 	struct net_device *dev = rt->fib6_nh.nh_dev;
 	struct rt6_info *nrt;
 
-	nrt = __ip6_dst_alloc(dev_net(dev), dev, flags);
+	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
 	if (nrt)
 		ip6_rt_copy_init(nrt, rt);
 
@@ -1203,8 +1170,6 @@ int ip6_ins_rt(struct net *net, struct rt6_info *rt)
 {
 	struct nl_info info = {	.nl_net = net, };
 
-	/* Hold dst to account for the reference from the fib6 tree */
-	dst_hold(&rt->dst);
 	return __ip6_ins_rt(rt, &info, NULL);
 }
 
@@ -1221,7 +1186,7 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
 
 	rcu_read_lock();
 	dev = ip6_rt_get_dev_rcu(ort);
-	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
+	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
 	rcu_read_unlock();
 	if (!rt)
 		return NULL;
@@ -1256,7 +1221,7 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
 
 	rcu_read_lock();
 	dev = ip6_rt_get_dev_rcu(rt);
-	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, flags);
+	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
 	rcu_read_unlock();
 	if (!pcpu_rt)
 		return NULL;
@@ -1317,7 +1282,7 @@ static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
 	net = dev_net(rt6_ex->rt6i->dst.dev);
 	rt6_ex->rt6i->rt6i_node = NULL;
 	hlist_del_rcu(&rt6_ex->hlist);
-	rt6_release(rt6_ex->rt6i);
+	ip6_rt_put(rt6_ex->rt6i);
 	kfree_rcu(rt6_ex, rcu);
 	WARN_ON_ONCE(!bucket->depth);
 	bucket->depth--;
@@ -1907,17 +1872,11 @@ redo_rt6_select:
 
 		struct rt6_info *uncached_rt;
 
-		if (ip6_hold_safe(net, &f6i, true)) {
-			dst_use_noref(&f6i->dst, jiffies);
-		} else {
-			rcu_read_unlock();
-			uncached_rt = f6i;
-			goto uncached_rt_out;
-		}
+		fib6_info_hold(f6i);
 		rcu_read_unlock();
 
 		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
-		dst_release(&rt->dst);
+		fib6_info_release(f6i);
 
 		if (uncached_rt) {
 			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
@@ -1930,7 +1889,6 @@ redo_rt6_select:
 			dst_hold(&uncached_rt->dst);
 		}
 
-uncached_rt_out:
 		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
 		return uncached_rt;
 
@@ -1939,24 +1897,12 @@ uncached_rt_out:
 
 		struct rt6_info *pcpu_rt;
 
-		dst_use_noref(&f6i->dst, jiffies);
 		local_bh_disable();
 		pcpu_rt = rt6_get_pcpu_route(f6i);
 
-		if (!pcpu_rt) {
-			/* atomic_inc_not_zero() is needed when using rcu */
-			if (atomic_inc_not_zero(&f6i->rt6i_ref)) {
-				/* No dst_hold() on rt is needed because grabbing
-				 * rt->rt6i_ref makes sure rt can't be released.
-				 */
-				pcpu_rt = rt6_make_pcpu_route(net, f6i);
-				rt6_release(f6i);
-			} else {
-				/* rt is already removed from tree */
-				pcpu_rt = net->ipv6.ip6_null_entry;
-				dst_hold(&pcpu_rt->dst);
-			}
-		}
+		if (!pcpu_rt)
+			pcpu_rt = rt6_make_pcpu_route(net, f6i);
+
 		local_bh_enable();
 		rcu_read_unlock();
 		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
@@ -2193,11 +2139,26 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
  *	Destination cache support functions
  */
 
+static bool fib6_check(struct rt6_info *f6i, u32 cookie)
+{
+	u32 rt_cookie = 0;
+
+	if ((f6i && !rt6_get_cookie_safe(f6i, &rt_cookie)) ||
+	     rt_cookie != cookie)
+		return false;
+
+	if (fib6_check_expired(f6i))
+		return false;
+
+	return true;
+}
+
 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
 {
 	u32 rt_cookie = 0;
 
-	if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
+	if ((rt->from && !rt6_get_cookie_safe(rt->from, &rt_cookie)) ||
+	    rt_cookie != cookie)
 		return NULL;
 
 	if (rt6_check_expired(rt))
@@ -2210,7 +2171,7 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
 {
 	if (!__rt6_check_expired(rt) &&
 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
-	    rt6_check(rt->from, cookie))
+	    fib6_check(rt->from, cookie))
 		return &rt->dst;
 	else
 		return NULL;
@@ -2241,7 +2202,7 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
 	if (rt) {
 		if (rt->rt6i_flags & RTF_CACHE) {
 			if (rt6_check_expired(rt)) {
-				ip6_del_rt(dev_net(dst->dev), rt);
+				rt6_remove_exception_rt(rt);
 				dst = NULL;
 			}
 		} else {
@@ -2262,12 +2223,12 @@ static void ip6_link_failure(struct sk_buff *skb)
 	if (rt) {
 		if (rt->rt6i_flags & RTF_CACHE) {
 			if (dst_hold_safe(&rt->dst))
-				ip6_del_rt(dev_net(rt->dst.dev), rt);
-		} else {
+				rt6_remove_exception_rt(rt);
+		} else if (rt->from) {
 			struct fib6_node *fn;
 
 			rcu_read_lock();
-			fn = rcu_dereference(rt->rt6i_node);
+			fn = rcu_dereference(rt->from->rt6i_node);
 			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
 				fn->fn_sernum = -1;
 			rcu_read_unlock();
@@ -2949,13 +2910,13 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 	if (!table)
 		goto out;
 
-	rt = ip6_dst_alloc(net, NULL,
-			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
-
-	if (!rt) {
-		err = -ENOMEM;
+	err = -ENOMEM;
+	rt = fib6_info_alloc(gfp_flags);
+	if (!rt)
 		goto out;
-	}
+
+	if (cfg->fc_flags & RTF_ADDRCONF)
+		rt->dst_nocount = true;
 
 	err = ip6_convert_metrics(net, rt, cfg);
 	if (err < 0)
@@ -3029,7 +2990,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 		if (err)
 			goto out;
 
-		rt->fib6_nh.nh_gw = rt->rt6i_gateway = cfg->fc_gateway;
+		rt->fib6_nh.nh_gw = cfg->fc_gateway;
 	}
 
 	err = -ENODEV;
@@ -3066,7 +3027,7 @@ install_route:
 	    !netif_carrier_ok(dev))
 		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
 	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
-	rt->fib6_nh.nh_dev = rt->dst.dev = dev;
+	rt->fib6_nh.nh_dev = dev;
 	rt->rt6i_idev = idev;
 	rt->rt6i_table = table;
 
@@ -3078,9 +3039,8 @@ out:
 		dev_put(dev);
 	if (idev)
 		in6_dev_put(idev);
-	if (rt)
-		dst_release_immediate(&rt->dst);
 
+	fib6_info_release(rt);
 	return ERR_PTR(err);
 }
 
@@ -3095,6 +3055,7 @@ int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
 		return PTR_ERR(rt);
 
 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
+	fib6_info_release(rt);
 
 	return err;
 }
@@ -3116,7 +3077,7 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
 	spin_unlock_bh(&table->tb6_lock);
 
 out:
-	ip6_rt_put(rt);
+	fib6_info_release(rt);
 	return err;
 }
 
@@ -3170,7 +3131,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
 out_unlock:
 	spin_unlock_bh(&table->tb6_lock);
 out_put:
-	ip6_rt_put(rt);
+	fib6_info_release(rt);
 
 	if (skb) {
 		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
@@ -3241,8 +3202,7 @@ static int ip6_route_del(struct fib6_config *cfg,
 				continue;
 			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
 				continue;
-			if (!dst_hold_safe(&rt->dst))
-				break;
+			fib6_info_hold(rt);
 			rcu_read_unlock();
 
 			/* if gateway was specified only delete the one hop */
@@ -3510,12 +3470,9 @@ restart:
 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
 		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
-			if (dst_hold_safe(&rt->dst)) {
-				rcu_read_unlock();
-				ip6_del_rt(net, rt);
-			} else {
-				rcu_read_unlock();
-			}
+			fib6_info_hold(rt);
+			rcu_read_unlock();
+			ip6_del_rt(net, rt);
 			goto restart;
 		}
 	}
@@ -3666,7 +3623,7 @@ struct rt6_info *addrconf_dst_alloc(struct net *net,
 	struct net_device *dev = idev->dev;
 	struct rt6_info *rt;
 
-	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
+	rt = fib6_info_alloc(gfp_flags);
 	if (!rt)
 		return ERR_PTR(-ENOMEM);
 
@@ -3687,8 +3644,8 @@ struct rt6_info *addrconf_dst_alloc(struct net *net,
 	}
 
 	rt->fib6_nh.nh_gw = *addr;
+	dev_hold(dev);
 	rt->fib6_nh.nh_dev = dev;
-	rt->rt6i_gateway  = *addr;
 	rt->rt6i_dst.addr = *addr;
 	rt->rt6i_dst.plen = 128;
 	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
@@ -4325,7 +4282,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
 		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
 					    rt, &r_cfg);
 		if (err) {
-			dst_release_immediate(&rt->dst);
+			fib6_info_release(rt);
 			goto cleanup;
 		}
 
@@ -4342,6 +4299,8 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
 	list_for_each_entry(nh, &rt6_nh_list, next) {
 		rt_last = nh->rt6_info;
 		err = __ip6_ins_rt(nh->rt6_info, info, extack);
+		fib6_info_release(nh->rt6_info);
+
 		/* save reference to first route for notification */
 		if (!rt_notif && !err)
 			rt_notif = nh->rt6_info;
@@ -4389,7 +4348,7 @@ add_errout:
 cleanup:
 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
 		if (nh->rt6_info)
-			dst_release_immediate(&nh->rt6_info->dst);
+			fib6_info_release(nh->rt6_info);
 		list_del(&nh->next);
 		kfree(nh);
 	}
@@ -4814,14 +4773,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 		goto errout;
 	}
 
-	if (fibmatch && rt->from) {
-		struct rt6_info *ort = rt->from;
-
-		dst_hold(&ort->dst);
-		ip6_rt_put(rt);
-		rt = ort;
-	}
-
 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 	if (!skb) {
 		ip6_rt_put(rt);
@@ -4831,12 +4782,12 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 
 	skb_dst_set(skb, &rt->dst);
 	if (fibmatch)
-		err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, iif,
+		err = rt6_fill_node(net, skb, rt->from, NULL, NULL, NULL, iif,
 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
 				    nlh->nlmsg_seq, 0);
 	else
-		err = rt6_fill_node(net, skb, rt, dst, &fl6.daddr, &fl6.saddr,
-				    iif, RTM_NEWROUTE,
+		err = rt6_fill_node(net, skb, rt->from, dst,
+				    &fl6.daddr, &fl6.saddr, iif, RTM_NEWROUTE,
 				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
 				    0);
 	if (err < 0) {
-- 
cgit v1.2.3-59-g8ed1b


From 8d1c802b2815edc97af8a58c5045ebaf3848621a Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 17 Apr 2018 17:33:26 -0700
Subject: net/ipv6: Flip FIB entries to fib6_info

Convert all code paths referencing a FIB entry from
rt6_info to fib6_info.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_router.c  |  64 ++---
 include/net/if_inet6.h                             |   4 +-
 include/net/ip6_fib.h                              |  42 ++--
 include/net/ip6_route.h                            |  28 +--
 include/net/netns/ipv6.h                           |   2 +-
 net/ipv6/addrconf.c                                |  20 +-
 net/ipv6/anycast.c                                 |   4 +-
 net/ipv6/ip6_fib.c                                 | 116 ++++-----
 net/ipv6/ndisc.c                                   |   2 +-
 net/ipv6/route.c                                   | 259 +++++++++++----------
 10 files changed, 271 insertions(+), 270 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index d995a0b52d7c..b85b15851841 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -442,7 +442,7 @@ struct mlxsw_sp_fib6_entry {
 
 struct mlxsw_sp_rt6 {
 	struct list_head list;
-	struct rt6_info *rt;
+	struct fib6_info *rt;
 };
 
 struct mlxsw_sp_lpm_tree {
@@ -3834,7 +3834,7 @@ mlxsw_sp_rt6_nexthop(struct mlxsw_sp_nexthop_group *nh_grp,
 
 	for (i = 0; i < nh_grp->count; i++) {
 		struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i];
-		struct rt6_info *rt = mlxsw_sp_rt6->rt;
+		struct fib6_info *rt = mlxsw_sp_rt6->rt;
 
 		if (nh->rif && nh->rif->dev == rt->fib6_nh.nh_dev &&
 		    ipv6_addr_equal((const struct in6_addr *) &nh->gw_addr,
@@ -3920,7 +3920,7 @@ mlxsw_sp_fib6_entry_offload_unset(struct mlxsw_sp_fib_entry *fib_entry)
 	fib6_entry = container_of(fib_entry, struct mlxsw_sp_fib6_entry,
 				  common);
 	list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) {
-		struct rt6_info *rt = mlxsw_sp_rt6->rt;
+		struct fib6_info *rt = mlxsw_sp_rt6->rt;
 
 		rt->fib6_nh.nh_flags &= ~RTNH_F_OFFLOAD;
 	}
@@ -4699,7 +4699,7 @@ static void mlxsw_sp_router_fib4_del(struct mlxsw_sp *mlxsw_sp,
 	mlxsw_sp_fib_node_put(mlxsw_sp, fib_node);
 }
 
-static bool mlxsw_sp_fib6_rt_should_ignore(const struct rt6_info *rt)
+static bool mlxsw_sp_fib6_rt_should_ignore(const struct fib6_info *rt)
 {
 	/* Packets with link-local destination IP arriving to the router
 	 * are trapped to the CPU, so no need to program specific routes
@@ -4721,7 +4721,7 @@ static bool mlxsw_sp_fib6_rt_should_ignore(const struct rt6_info *rt)
 	return false;
 }
 
-static struct mlxsw_sp_rt6 *mlxsw_sp_rt6_create(struct rt6_info *rt)
+static struct mlxsw_sp_rt6 *mlxsw_sp_rt6_create(struct fib6_info *rt)
 {
 	struct mlxsw_sp_rt6 *mlxsw_sp_rt6;
 
@@ -4734,18 +4734,18 @@ static struct mlxsw_sp_rt6 *mlxsw_sp_rt6_create(struct rt6_info *rt)
 	 * memory.
 	 */
 	mlxsw_sp_rt6->rt = rt;
-	rt6_hold(rt);
+	fib6_info_hold(rt);
 
 	return mlxsw_sp_rt6;
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
-static void mlxsw_sp_rt6_release(struct rt6_info *rt)
+static void mlxsw_sp_rt6_release(struct fib6_info *rt)
 {
-	rt6_release(rt);
+	fib6_info_release(rt);
 }
 #else
-static void mlxsw_sp_rt6_release(struct rt6_info *rt)
+static void mlxsw_sp_rt6_release(struct fib6_info *rt)
 {
 }
 #endif
@@ -4756,13 +4756,13 @@ static void mlxsw_sp_rt6_destroy(struct mlxsw_sp_rt6 *mlxsw_sp_rt6)
 	kfree(mlxsw_sp_rt6);
 }
 
-static bool mlxsw_sp_fib6_rt_can_mp(const struct rt6_info *rt)
+static bool mlxsw_sp_fib6_rt_can_mp(const struct fib6_info *rt)
 {
 	/* RTF_CACHE routes are ignored */
 	return (rt->rt6i_flags & (RTF_GATEWAY | RTF_ADDRCONF)) == RTF_GATEWAY;
 }
 
-static struct rt6_info *
+static struct fib6_info *
 mlxsw_sp_fib6_entry_rt(const struct mlxsw_sp_fib6_entry *fib6_entry)
 {
 	return list_first_entry(&fib6_entry->rt6_list, struct mlxsw_sp_rt6,
@@ -4771,7 +4771,7 @@ mlxsw_sp_fib6_entry_rt(const struct mlxsw_sp_fib6_entry *fib6_entry)
 
 static struct mlxsw_sp_fib6_entry *
 mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node,
-				 const struct rt6_info *nrt, bool replace)
+				 const struct fib6_info *nrt, bool replace)
 {
 	struct mlxsw_sp_fib6_entry *fib6_entry;
 
@@ -4779,7 +4779,7 @@ mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node,
 		return NULL;
 
 	list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) {
-		struct rt6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry);
+		struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry);
 
 		/* RT6_TABLE_LOCAL and RT6_TABLE_MAIN share the same
 		 * virtual router.
@@ -4802,7 +4802,7 @@ mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node,
 
 static struct mlxsw_sp_rt6 *
 mlxsw_sp_fib6_entry_rt_find(const struct mlxsw_sp_fib6_entry *fib6_entry,
-			    const struct rt6_info *rt)
+			    const struct fib6_info *rt)
 {
 	struct mlxsw_sp_rt6 *mlxsw_sp_rt6;
 
@@ -4815,7 +4815,7 @@ mlxsw_sp_fib6_entry_rt_find(const struct mlxsw_sp_fib6_entry *fib6_entry,
 }
 
 static bool mlxsw_sp_nexthop6_ipip_type(const struct mlxsw_sp *mlxsw_sp,
-					const struct rt6_info *rt,
+					const struct fib6_info *rt,
 					enum mlxsw_sp_ipip_type *ret)
 {
 	return rt->fib6_nh.nh_dev &&
@@ -4825,7 +4825,7 @@ static bool mlxsw_sp_nexthop6_ipip_type(const struct mlxsw_sp *mlxsw_sp,
 static int mlxsw_sp_nexthop6_type_init(struct mlxsw_sp *mlxsw_sp,
 				       struct mlxsw_sp_nexthop_group *nh_grp,
 				       struct mlxsw_sp_nexthop *nh,
-				       const struct rt6_info *rt)
+				       const struct fib6_info *rt)
 {
 	const struct mlxsw_sp_ipip_ops *ipip_ops;
 	struct mlxsw_sp_ipip_entry *ipip_entry;
@@ -4870,7 +4870,7 @@ static void mlxsw_sp_nexthop6_type_fini(struct mlxsw_sp *mlxsw_sp,
 static int mlxsw_sp_nexthop6_init(struct mlxsw_sp *mlxsw_sp,
 				  struct mlxsw_sp_nexthop_group *nh_grp,
 				  struct mlxsw_sp_nexthop *nh,
-				  const struct rt6_info *rt)
+				  const struct fib6_info *rt)
 {
 	struct net_device *dev = rt->fib6_nh.nh_dev;
 
@@ -4897,7 +4897,7 @@ static void mlxsw_sp_nexthop6_fini(struct mlxsw_sp *mlxsw_sp,
 }
 
 static bool mlxsw_sp_rt6_is_gateway(const struct mlxsw_sp *mlxsw_sp,
-				    const struct rt6_info *rt)
+				    const struct fib6_info *rt)
 {
 	return rt->rt6i_flags & RTF_GATEWAY ||
 	       mlxsw_sp_nexthop6_ipip_type(mlxsw_sp, rt, NULL);
@@ -4928,7 +4928,7 @@ mlxsw_sp_nexthop6_group_create(struct mlxsw_sp *mlxsw_sp,
 	nh_grp->gateway = mlxsw_sp_rt6_is_gateway(mlxsw_sp, mlxsw_sp_rt6->rt);
 	nh_grp->count = fib6_entry->nrt6;
 	for (i = 0; i < nh_grp->count; i++) {
-		struct rt6_info *rt = mlxsw_sp_rt6->rt;
+		struct fib6_info *rt = mlxsw_sp_rt6->rt;
 
 		nh = &nh_grp->nexthops[i];
 		err = mlxsw_sp_nexthop6_init(mlxsw_sp, nh_grp, nh, rt);
@@ -5040,7 +5040,7 @@ err_nexthop6_group_get:
 static int
 mlxsw_sp_fib6_entry_nexthop_add(struct mlxsw_sp *mlxsw_sp,
 				struct mlxsw_sp_fib6_entry *fib6_entry,
-				struct rt6_info *rt)
+				struct fib6_info *rt)
 {
 	struct mlxsw_sp_rt6 *mlxsw_sp_rt6;
 	int err;
@@ -5068,7 +5068,7 @@ err_nexthop6_group_update:
 static void
 mlxsw_sp_fib6_entry_nexthop_del(struct mlxsw_sp *mlxsw_sp,
 				struct mlxsw_sp_fib6_entry *fib6_entry,
-				struct rt6_info *rt)
+				struct fib6_info *rt)
 {
 	struct mlxsw_sp_rt6 *mlxsw_sp_rt6;
 
@@ -5084,7 +5084,7 @@ mlxsw_sp_fib6_entry_nexthop_del(struct mlxsw_sp *mlxsw_sp,
 
 static void mlxsw_sp_fib6_entry_type_set(struct mlxsw_sp *mlxsw_sp,
 					 struct mlxsw_sp_fib_entry *fib_entry,
-					 const struct rt6_info *rt)
+					 const struct fib6_info *rt)
 {
 	/* Packets hitting RTF_REJECT routes need to be discarded by the
 	 * stack. We can rely on their destination device not having a
@@ -5118,7 +5118,7 @@ mlxsw_sp_fib6_entry_rt_destroy_all(struct mlxsw_sp_fib6_entry *fib6_entry)
 static struct mlxsw_sp_fib6_entry *
 mlxsw_sp_fib6_entry_create(struct mlxsw_sp *mlxsw_sp,
 			   struct mlxsw_sp_fib_node *fib_node,
-			   struct rt6_info *rt)
+			   struct fib6_info *rt)
 {
 	struct mlxsw_sp_fib6_entry *fib6_entry;
 	struct mlxsw_sp_fib_entry *fib_entry;
@@ -5168,12 +5168,12 @@ static void mlxsw_sp_fib6_entry_destroy(struct mlxsw_sp *mlxsw_sp,
 
 static struct mlxsw_sp_fib6_entry *
 mlxsw_sp_fib6_node_entry_find(const struct mlxsw_sp_fib_node *fib_node,
-			      const struct rt6_info *nrt, bool replace)
+			      const struct fib6_info *nrt, bool replace)
 {
 	struct mlxsw_sp_fib6_entry *fib6_entry, *fallback = NULL;
 
 	list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) {
-		struct rt6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry);
+		struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry);
 
 		if (rt->rt6i_table->tb6_id > nrt->rt6i_table->tb6_id)
 			continue;
@@ -5198,7 +5198,7 @@ mlxsw_sp_fib6_node_list_insert(struct mlxsw_sp_fib6_entry *new6_entry,
 			       bool replace)
 {
 	struct mlxsw_sp_fib_node *fib_node = new6_entry->common.fib_node;
-	struct rt6_info *nrt = mlxsw_sp_fib6_entry_rt(new6_entry);
+	struct fib6_info *nrt = mlxsw_sp_fib6_entry_rt(new6_entry);
 	struct mlxsw_sp_fib6_entry *fib6_entry;
 
 	fib6_entry = mlxsw_sp_fib6_node_entry_find(fib_node, nrt, replace);
@@ -5213,7 +5213,7 @@ mlxsw_sp_fib6_node_list_insert(struct mlxsw_sp_fib6_entry *new6_entry,
 		struct mlxsw_sp_fib6_entry *last;
 
 		list_for_each_entry(last, &fib_node->entry_list, common.list) {
-			struct rt6_info *rt = mlxsw_sp_fib6_entry_rt(last);
+			struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(last);
 
 			if (nrt->rt6i_table->tb6_id > rt->rt6i_table->tb6_id)
 				break;
@@ -5268,7 +5268,7 @@ mlxsw_sp_fib6_node_entry_unlink(struct mlxsw_sp *mlxsw_sp,
 
 static struct mlxsw_sp_fib6_entry *
 mlxsw_sp_fib6_entry_lookup(struct mlxsw_sp *mlxsw_sp,
-			   const struct rt6_info *rt)
+			   const struct fib6_info *rt)
 {
 	struct mlxsw_sp_fib6_entry *fib6_entry;
 	struct mlxsw_sp_fib_node *fib_node;
@@ -5287,7 +5287,7 @@ mlxsw_sp_fib6_entry_lookup(struct mlxsw_sp *mlxsw_sp,
 		return NULL;
 
 	list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) {
-		struct rt6_info *iter_rt = mlxsw_sp_fib6_entry_rt(fib6_entry);
+		struct fib6_info *iter_rt = mlxsw_sp_fib6_entry_rt(fib6_entry);
 
 		if (rt->rt6i_table->tb6_id == iter_rt->rt6i_table->tb6_id &&
 		    rt->rt6i_metric == iter_rt->rt6i_metric &&
@@ -5316,7 +5316,7 @@ static void mlxsw_sp_fib6_entry_replace(struct mlxsw_sp *mlxsw_sp,
 }
 
 static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp,
-				    struct rt6_info *rt, bool replace)
+				    struct fib6_info *rt, bool replace)
 {
 	struct mlxsw_sp_fib6_entry *fib6_entry;
 	struct mlxsw_sp_fib_node *fib_node;
@@ -5373,7 +5373,7 @@ err_fib6_entry_nexthop_add:
 }
 
 static void mlxsw_sp_router_fib6_del(struct mlxsw_sp *mlxsw_sp,
-				     struct rt6_info *rt)
+				     struct fib6_info *rt)
 {
 	struct mlxsw_sp_fib6_entry *fib6_entry;
 	struct mlxsw_sp_fib_node *fib_node;
@@ -5836,7 +5836,7 @@ static void mlxsw_sp_router_fib6_event(struct mlxsw_sp_fib_event_work *fib_work,
 		fen6_info = container_of(info, struct fib6_entry_notifier_info,
 					 info);
 		fib_work->fen6_info = *fen6_info;
-		rt6_hold(fib_work->fen6_info.rt);
+		fib6_info_hold(fib_work->fen6_info.rt);
 		break;
 	}
 }
diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index d4088d1a688d..d6089b2e64fe 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -64,7 +64,7 @@ struct inet6_ifaddr {
 	struct delayed_work	dad_work;
 
 	struct inet6_dev	*idev;
-	struct rt6_info		*rt;
+	struct fib6_info	*rt;
 
 	struct hlist_node	addr_lst;
 	struct list_head	if_list;
@@ -144,7 +144,7 @@ struct ipv6_ac_socklist {
 struct ifacaddr6 {
 	struct in6_addr		aca_addr;
 	struct inet6_dev	*aca_idev;
-	struct rt6_info		*aca_rt;
+	struct fib6_info	*aca_rt;
 	struct ifacaddr6	*aca_next;
 	int			aca_users;
 	refcount_t		aca_refcnt;
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 6c3d92bb3459..d41b7bd69fb3 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -75,12 +75,12 @@ struct fib6_node {
 #ifdef CONFIG_IPV6_SUBTREES
 	struct fib6_node __rcu	*subtree;
 #endif
-	struct rt6_info __rcu	*leaf;
+	struct fib6_info __rcu	*leaf;
 
 	__u16			fn_bit;		/* bit key */
 	__u16			fn_flags;
 	int			fn_sernum;
-	struct rt6_info __rcu	*rr_ptr;
+	struct fib6_info __rcu	*rr_ptr;
 	struct rcu_head		rcu;
 };
 
@@ -176,7 +176,7 @@ struct fib6_info {
 struct rt6_info {
 	struct dst_entry		dst;
 	struct rt6_info __rcu		*rt6_next;
-	struct rt6_info			*from;
+	struct fib6_info		*from;
 
 	/*
 	 * Tail elements of dst_entry (__refcnt etc.)
@@ -242,20 +242,20 @@ static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst)
 	return ((struct rt6_info *)dst)->rt6i_idev;
 }
 
-static inline void fib6_clean_expires(struct rt6_info *f6i)
+static inline void fib6_clean_expires(struct fib6_info *f6i)
 {
 	f6i->rt6i_flags &= ~RTF_EXPIRES;
 	f6i->expires = 0;
 }
 
-static inline void fib6_set_expires(struct rt6_info *f6i,
+static inline void fib6_set_expires(struct fib6_info *f6i,
 				    unsigned long expires)
 {
 	f6i->expires = expires;
 	f6i->rt6i_flags |= RTF_EXPIRES;
 }
 
-static inline bool fib6_check_expired(const struct rt6_info *f6i)
+static inline bool fib6_check_expired(const struct fib6_info *f6i)
 {
 	if (f6i->rt6i_flags & RTF_EXPIRES)
 		return time_after(jiffies, f6i->expires);
@@ -288,7 +288,7 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout)
  * Return true if we can get cookie safely
  * Return false if not
  */
-static inline bool rt6_get_cookie_safe(const struct rt6_info *rt,
+static inline bool rt6_get_cookie_safe(const struct fib6_info *rt,
 				       u32 *cookie)
 {
 	struct fib6_node *fn;
@@ -330,15 +330,15 @@ static inline void ip6_rt_put(struct rt6_info *rt)
 
 void rt6_free_pcpu(struct rt6_info *non_pcpu_rt);
 
-struct rt6_info *fib6_info_alloc(gfp_t gfp_flags);
-void fib6_info_destroy(struct rt6_info *f6i);
+struct fib6_info *fib6_info_alloc(gfp_t gfp_flags);
+void fib6_info_destroy(struct fib6_info *f6i);
 
-static inline void fib6_info_hold(struct rt6_info *f6i)
+static inline void fib6_info_hold(struct fib6_info *f6i)
 {
 	atomic_inc(&f6i->rt6i_ref);
 }
 
-static inline void fib6_info_release(struct rt6_info *f6i)
+static inline void fib6_info_release(struct fib6_info *f6i)
 {
 	if (f6i && atomic_dec_and_test(&f6i->rt6i_ref))
 		fib6_info_destroy(f6i);
@@ -371,7 +371,7 @@ enum fib6_walk_state {
 struct fib6_walker {
 	struct list_head lh;
 	struct fib6_node *root, *node;
-	struct rt6_info *leaf;
+	struct fib6_info *leaf;
 	enum fib6_walk_state state;
 	unsigned int skip;
 	unsigned int count;
@@ -435,7 +435,7 @@ typedef struct rt6_info *(*pol_lookup_t)(struct net *,
 
 struct fib6_entry_notifier_info {
 	struct fib_notifier_info info; /* must be first */
-	struct rt6_info *rt;
+	struct fib6_info *rt;
 };
 
 /*
@@ -457,14 +457,14 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
 			      const struct in6_addr *saddr, int src_len,
 			      bool exact_match);
 
-void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg),
+void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *arg),
 		    void *arg);
 
-int fib6_add(struct fib6_node *root, struct rt6_info *rt,
+int fib6_add(struct fib6_node *root, struct fib6_info *rt,
 	     struct nl_info *info, struct netlink_ext_ack *extack);
-int fib6_del(struct rt6_info *rt, struct nl_info *info);
+int fib6_del(struct fib6_info *rt, struct nl_info *info);
 
-void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
+void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
 		     unsigned int flags);
 
 void fib6_run_gc(unsigned long expires, struct net *net, bool force);
@@ -487,11 +487,11 @@ void __net_exit fib6_notifier_exit(struct net *net);
 unsigned int fib6_tables_seq_read(struct net *net);
 int fib6_tables_dump(struct net *net, struct notifier_block *nb);
 
-void fib6_update_sernum(struct net *net, struct rt6_info *rt);
-void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt);
+void fib6_update_sernum(struct net *net, struct fib6_info *rt);
+void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt);
 
-void fib6_metric_set(struct rt6_info *f6i, int metric, u32 val);
-static inline bool fib6_metric_locked(struct rt6_info *f6i, int metric)
+void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val);
+static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric)
 {
 	return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric));
 }
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 57d0d45667f1..d5fb1e4ae7ac 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -66,7 +66,7 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
 }
 
-static inline bool rt6_qualify_for_ecmp(const struct rt6_info *rt)
+static inline bool rt6_qualify_for_ecmp(const struct fib6_info *rt)
 {
 	return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
 	       RTF_GATEWAY;
@@ -102,14 +102,14 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg);
 
 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
 		  struct netlink_ext_ack *extack);
-int ip6_ins_rt(struct net *net, struct rt6_info *rt);
-int ip6_del_rt(struct net *net, struct rt6_info *rt);
+int ip6_ins_rt(struct net *net, struct fib6_info *rt);
+int ip6_del_rt(struct net *net, struct fib6_info *rt);
 
-void rt6_flush_exceptions(struct rt6_info *rt);
-void rt6_age_exceptions(struct rt6_info *rt, struct fib6_gc_args *gc_args,
+void rt6_flush_exceptions(struct fib6_info *rt);
+void rt6_age_exceptions(struct fib6_info *rt, struct fib6_gc_args *gc_args,
 			unsigned long now);
 
-static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt,
+static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *rt,
 				      const struct in6_addr *daddr,
 				      unsigned int prefs,
 				      struct in6_addr *saddr)
@@ -136,9 +136,9 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6);
 
 void fib6_force_start_gc(struct net *net);
 
-struct rt6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev,
-				    const struct in6_addr *addr, bool anycast,
-				    gfp_t gfp_flags);
+struct fib6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev,
+				     const struct in6_addr *addr, bool anycast,
+				     gfp_t gfp_flags);
 
 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
 			       int flags);
@@ -147,10 +147,10 @@ struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
  *	support functions for ND
  *
  */
-struct rt6_info *rt6_get_dflt_router(struct net *net,
+struct fib6_info *rt6_get_dflt_router(struct net *net,
 				     const struct in6_addr *addr,
 				     struct net_device *dev);
-struct rt6_info *rt6_add_dflt_router(struct net *net,
+struct fib6_info *rt6_add_dflt_router(struct net *net,
 				     const struct in6_addr *gwaddr,
 				     struct net_device *dev, unsigned int pref);
 
@@ -176,14 +176,14 @@ struct rt6_rtnl_dump_arg {
 	struct net *net;
 };
 
-int rt6_dump_route(struct rt6_info *rt, void *p_arg);
+int rt6_dump_route(struct fib6_info *rt, void *p_arg);
 void rt6_mtu_change(struct net_device *dev, unsigned int mtu);
 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp);
 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway);
 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags);
 void rt6_disable_ip(struct net_device *dev, unsigned long event);
 void rt6_sync_down_dev(struct net_device *dev, unsigned long event);
-void rt6_multipath_rebalance(struct rt6_info *rt);
+void rt6_multipath_rebalance(struct fib6_info *rt);
 
 void rt6_uncached_list_add(struct rt6_info *rt);
 void rt6_uncached_list_del(struct rt6_info *rt);
@@ -271,7 +271,7 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt,
 		return daddr;
 }
 
-static inline bool rt6_duplicate_nexthop(struct rt6_info *a, struct rt6_info *b)
+static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b)
 {
 	return a->fib6_nh.nh_dev == b->fib6_nh.nh_dev &&
 	       a->rt6i_idev == b->rt6i_idev &&
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 74e4e1e449d5..97b3a54579c8 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -60,7 +60,7 @@ struct netns_ipv6 {
 #endif
 	struct xt_table		*ip6table_nat;
 #endif
-	struct rt6_info         *fib6_null_entry;
+	struct fib6_info	*fib6_null_entry;
 	struct rt6_info		*ip6_null_entry;
 	struct rt6_statistics   *rt6_stats;
 	struct timer_list       ip6_fib_timer;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index e533a447f68c..a294e86a9b25 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -170,7 +170,7 @@ static void addrconf_type_change(struct net_device *dev,
 				 unsigned long event);
 static int addrconf_ifdown(struct net_device *dev, int how);
 
-static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
+static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
 						  int plen,
 						  const struct net_device *dev,
 						  u32 flags, u32 noflags);
@@ -994,7 +994,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
 	gfp_t gfp_flags = can_block ? GFP_KERNEL : GFP_ATOMIC;
 	struct net *net = dev_net(idev->dev);
 	struct inet6_ifaddr *ifa = NULL;
-	struct rt6_info *rt = NULL;
+	struct fib6_info *rt = NULL;
 	int err = 0;
 	int addr_type = ipv6_addr_type(addr);
 
@@ -1178,7 +1178,7 @@ check_cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long *expires)
 static void
 cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_rt)
 {
-	struct rt6_info *rt;
+	struct fib6_info *rt;
 
 	rt = addrconf_get_prefix_route(&ifp->addr,
 				       ifp->prefix_len,
@@ -2348,13 +2348,13 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
 }
 
 
-static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
+static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
 						  int plen,
 						  const struct net_device *dev,
 						  u32 flags, u32 noflags)
 {
 	struct fib6_node *fn;
-	struct rt6_info *rt = NULL;
+	struct fib6_info *rt = NULL;
 	struct fib6_table *table;
 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX;
 
@@ -2641,7 +2641,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
 	 */
 
 	if (pinfo->onlink) {
-		struct rt6_info *rt;
+		struct fib6_info *rt;
 		unsigned long rt_expires;
 
 		/* Avoid arithmetic overflow. Really, we could
@@ -3346,7 +3346,7 @@ static int fixup_permanent_addr(struct net *net,
 	 * case regenerate the host route.
 	 */
 	if (!ifp->rt || !ifp->rt->rt6i_node) {
-		struct rt6_info *rt, *prev;
+		struct fib6_info *rt, *prev;
 
 		rt = addrconf_dst_alloc(net, idev, &ifp->addr, false,
 					GFP_ATOMIC);
@@ -3713,7 +3713,7 @@ restart:
 	keep_addr = (!how && _keep_addr > 0 && !idev->cnf.disable_ipv6);
 
 	list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) {
-		struct rt6_info *rt = NULL;
+		struct fib6_info *rt = NULL;
 		bool keep;
 
 		addrconf_del_dad_work(ifa);
@@ -5626,7 +5626,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 			addrconf_leave_anycast(ifp);
 		addrconf_leave_solict(ifp->idev, &ifp->addr);
 		if (!ipv6_addr_any(&ifp->peer_addr)) {
-			struct rt6_info *rt;
+			struct fib6_info *rt;
 
 			rt = addrconf_get_prefix_route(&ifp->peer_addr, 128,
 						       ifp->idev->dev, 0, 0);
@@ -5982,7 +5982,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
 	list_for_each_entry(ifa, &idev->addr_list, if_list) {
 		spin_lock(&ifa->lock);
 		if (ifa->rt) {
-			struct rt6_info *rt = ifa->rt;
+			struct fib6_info *rt = ifa->rt;
 			int cpu;
 
 			rcu_read_lock();
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 3db8fe10322b..0e35657501a7 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -218,7 +218,7 @@ static void aca_put(struct ifacaddr6 *ac)
 	}
 }
 
-static struct ifacaddr6 *aca_alloc(struct rt6_info *rt,
+static struct ifacaddr6 *aca_alloc(struct fib6_info *rt,
 				   const struct in6_addr *addr)
 {
 	struct inet6_dev *idev = rt->rt6i_idev;
@@ -247,7 +247,7 @@ static struct ifacaddr6 *aca_alloc(struct rt6_info *rt,
 int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
 {
 	struct ifacaddr6 *aca;
-	struct rt6_info *rt;
+	struct fib6_info *rt;
 	struct net *net;
 	int err;
 
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 4d6bd033dccd..77cf43b2d858 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -43,7 +43,7 @@ static struct kmem_cache *fib6_node_kmem __read_mostly;
 struct fib6_cleaner {
 	struct fib6_walker w;
 	struct net *net;
-	int (*func)(struct rt6_info *, void *arg);
+	int (*func)(struct fib6_info *, void *arg);
 	int sernum;
 	void *arg;
 };
@@ -54,7 +54,7 @@ struct fib6_cleaner {
 #define FWS_INIT FWS_L
 #endif
 
-static struct rt6_info *fib6_find_prefix(struct net *net,
+static struct fib6_info *fib6_find_prefix(struct net *net,
 					 struct fib6_table *table,
 					 struct fib6_node *fn);
 static struct fib6_node *fib6_repair_tree(struct net *net,
@@ -105,7 +105,7 @@ enum {
 	FIB6_NO_SERNUM_CHANGE = 0,
 };
 
-void fib6_update_sernum(struct net *net, struct rt6_info *rt)
+void fib6_update_sernum(struct net *net, struct fib6_info *rt)
 {
 	struct fib6_node *fn;
 
@@ -145,9 +145,9 @@ static __be32 addr_bit_set(const void *token, int fn_bit)
 	       addr[fn_bit >> 5];
 }
 
-struct rt6_info *fib6_info_alloc(gfp_t gfp_flags)
+struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
 {
-	struct rt6_info *f6i;
+	struct fib6_info *f6i;
 
 	f6i = kzalloc(sizeof(*f6i), gfp_flags);
 	if (!f6i)
@@ -167,7 +167,7 @@ struct rt6_info *fib6_info_alloc(gfp_t gfp_flags)
 	return f6i;
 }
 
-void fib6_info_destroy(struct rt6_info *f6i)
+void fib6_info_destroy(struct fib6_info *f6i)
 {
 	struct rt6_exception_bucket *bucket;
 	struct dst_metrics *m;
@@ -404,7 +404,7 @@ unsigned int fib6_tables_seq_read(struct net *net)
 
 static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net,
 				    enum fib_event_type event_type,
-				    struct rt6_info *rt)
+				    struct fib6_info *rt)
 {
 	struct fib6_entry_notifier_info info = {
 		.rt = rt,
@@ -415,7 +415,7 @@ static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net,
 
 static int call_fib6_entry_notifiers(struct net *net,
 				     enum fib_event_type event_type,
-				     struct rt6_info *rt,
+				     struct fib6_info *rt,
 				     struct netlink_ext_ack *extack)
 {
 	struct fib6_entry_notifier_info info = {
@@ -432,7 +432,7 @@ struct fib6_dump_arg {
 	struct notifier_block *nb;
 };
 
-static void fib6_rt_dump(struct rt6_info *rt, struct fib6_dump_arg *arg)
+static void fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg)
 {
 	if (rt == arg->net->ipv6.fib6_null_entry)
 		return;
@@ -441,7 +441,7 @@ static void fib6_rt_dump(struct rt6_info *rt, struct fib6_dump_arg *arg)
 
 static int fib6_node_dump(struct fib6_walker *w)
 {
-	struct rt6_info *rt;
+	struct fib6_info *rt;
 
 	for_each_fib6_walker_rt(w)
 		fib6_rt_dump(rt, w->args);
@@ -490,7 +490,7 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb)
 static int fib6_dump_node(struct fib6_walker *w)
 {
 	int res;
-	struct rt6_info *rt;
+	struct fib6_info *rt;
 
 	for_each_fib6_walker_rt(w) {
 		res = rt6_dump_route(rt, w->args);
@@ -507,7 +507,7 @@ static int fib6_dump_node(struct fib6_walker *w)
 		 */
 		if (rt->rt6i_nsiblings)
 			rt = list_last_entry(&rt->rt6i_siblings,
-					     struct rt6_info,
+					     struct fib6_info,
 					     rt6i_siblings);
 	}
 	w->leaf = NULL;
@@ -643,7 +643,7 @@ out:
 	return res;
 }
 
-void fib6_metric_set(struct rt6_info *f6i, int metric, u32 val)
+void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val)
 {
 	if (!f6i)
 		return;
@@ -690,7 +690,7 @@ static struct fib6_node *fib6_add_1(struct net *net,
 	fn = root;
 
 	do {
-		struct rt6_info *leaf = rcu_dereference_protected(fn->leaf,
+		struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
 					    lockdep_is_held(&table->tb6_lock));
 		key = (struct rt6key *)((u8 *)leaf + offset);
 
@@ -884,7 +884,7 @@ insert_above:
 	return ln;
 }
 
-static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
+static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
 			  struct net *net)
 {
 	struct fib6_table *table = rt->rt6i_table;
@@ -897,9 +897,9 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
 		 * to still alive ones.
 		 */
 		while (fn) {
-			struct rt6_info *leaf = rcu_dereference_protected(fn->leaf,
+			struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
 					    lockdep_is_held(&table->tb6_lock));
-			struct rt6_info *new_leaf;
+			struct fib6_info *new_leaf;
 			if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) {
 				new_leaf = fib6_find_prefix(net, table, fn);
 				atomic_inc(&new_leaf->rt6i_ref);
@@ -936,15 +936,15 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
  *	Insert routing information in a node.
  */
 
-static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
+static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
 			    struct nl_info *info,
 			    struct netlink_ext_ack *extack)
 {
-	struct rt6_info *leaf = rcu_dereference_protected(fn->leaf,
+	struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
 				    lockdep_is_held(&rt->rt6i_table->tb6_lock));
-	struct rt6_info *iter = NULL;
-	struct rt6_info __rcu **ins;
-	struct rt6_info __rcu **fallback_ins = NULL;
+	struct fib6_info *iter = NULL;
+	struct fib6_info __rcu **ins;
+	struct fib6_info __rcu **fallback_ins = NULL;
 	int replace = (info->nlh &&
 		       (info->nlh->nlmsg_flags & NLM_F_REPLACE));
 	int add = (!info->nlh ||
@@ -1035,7 +1035,7 @@ next_iter:
 	/* Link this route to others same route. */
 	if (rt->rt6i_nsiblings) {
 		unsigned int rt6i_nsiblings;
-		struct rt6_info *sibling, *temp_sibling;
+		struct fib6_info *sibling, *temp_sibling;
 
 		/* Find the first route that have the same metric */
 		sibling = leaf;
@@ -1156,7 +1156,7 @@ add:
 	return 0;
 }
 
-static void fib6_start_gc(struct net *net, struct rt6_info *rt)
+static void fib6_start_gc(struct net *net, struct fib6_info *rt)
 {
 	if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
 	    (rt->rt6i_flags & RTF_EXPIRES))
@@ -1171,7 +1171,7 @@ void fib6_force_start_gc(struct net *net)
 			  jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
 }
 
-static void __fib6_update_sernum_upto_root(struct rt6_info *rt,
+static void __fib6_update_sernum_upto_root(struct fib6_info *rt,
 					   int sernum)
 {
 	struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node,
@@ -1186,7 +1186,7 @@ static void __fib6_update_sernum_upto_root(struct rt6_info *rt,
 	}
 }
 
-void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt)
+void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt)
 {
 	__fib6_update_sernum_upto_root(rt, fib6_new_sernum(net));
 }
@@ -1198,7 +1198,7 @@ void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt)
  *	Need to own table->tb6_lock
  */
 
-int fib6_add(struct fib6_node *root, struct rt6_info *rt,
+int fib6_add(struct fib6_node *root, struct fib6_info *rt,
 	     struct nl_info *info, struct netlink_ext_ack *extack)
 {
 	struct fib6_table *table = rt->rt6i_table;
@@ -1219,7 +1219,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 
 	fn = fib6_add_1(info->nl_net, table, root,
 			&rt->rt6i_dst.addr, rt->rt6i_dst.plen,
-			offsetof(struct rt6_info, rt6i_dst), allow_create,
+			offsetof(struct fib6_info, rt6i_dst), allow_create,
 			replace_required, extack);
 	if (IS_ERR(fn)) {
 		err = PTR_ERR(fn);
@@ -1260,7 +1260,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 
 			sn = fib6_add_1(info->nl_net, table, sfn,
 					&rt->rt6i_src.addr, rt->rt6i_src.plen,
-					offsetof(struct rt6_info, rt6i_src),
+					offsetof(struct fib6_info, rt6i_src),
 					allow_create, replace_required, extack);
 
 			if (IS_ERR(sn)) {
@@ -1279,7 +1279,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 		} else {
 			sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn),
 					&rt->rt6i_src.addr, rt->rt6i_src.plen,
-					offsetof(struct rt6_info, rt6i_src),
+					offsetof(struct fib6_info, rt6i_src),
 					allow_create, replace_required, extack);
 
 			if (IS_ERR(sn)) {
@@ -1316,7 +1316,7 @@ out:
 		 * super-tree leaf node we have to find a new one for it.
 		 */
 		if (pn != fn) {
-			struct rt6_info *pn_leaf =
+			struct fib6_info *pn_leaf =
 				rcu_dereference_protected(pn->leaf,
 				    lockdep_is_held(&table->tb6_lock));
 			if (pn_leaf == rt) {
@@ -1365,7 +1365,7 @@ failure:
  */
 
 struct lookup_args {
-	int			offset;		/* key offset on rt6_info	*/
+	int			offset;		/* key offset on fib6_info */
 	const struct in6_addr	*addr;		/* search key			*/
 };
 
@@ -1403,7 +1403,7 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
 		struct fib6_node *subtree = FIB6_SUBTREE(fn);
 
 		if (subtree || fn->fn_flags & RTN_RTINFO) {
-			struct rt6_info *leaf = rcu_dereference(fn->leaf);
+			struct fib6_info *leaf = rcu_dereference(fn->leaf);
 			struct rt6key *key;
 
 			if (!leaf)
@@ -1443,12 +1443,12 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad
 	struct fib6_node *fn;
 	struct lookup_args args[] = {
 		{
-			.offset = offsetof(struct rt6_info, rt6i_dst),
+			.offset = offsetof(struct fib6_info, rt6i_dst),
 			.addr = daddr,
 		},
 #ifdef CONFIG_IPV6_SUBTREES
 		{
-			.offset = offsetof(struct rt6_info, rt6i_src),
+			.offset = offsetof(struct fib6_info, rt6i_src),
 			.addr = saddr,
 		},
 #endif
@@ -1484,7 +1484,7 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root,
 	struct fib6_node *fn, *prev = NULL;
 
 	for (fn = root; fn ; ) {
-		struct rt6_info *leaf = rcu_dereference(fn->leaf);
+		struct fib6_info *leaf = rcu_dereference(fn->leaf);
 		struct rt6key *key;
 
 		/* This node is being deleted */
@@ -1533,7 +1533,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
 	struct fib6_node *fn;
 
 	fn = fib6_locate_1(root, daddr, dst_len,
-			   offsetof(struct rt6_info, rt6i_dst),
+			   offsetof(struct fib6_info, rt6i_dst),
 			   exact_match);
 
 #ifdef CONFIG_IPV6_SUBTREES
@@ -1544,7 +1544,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
 
 			if (subtree) {
 				fn = fib6_locate_1(subtree, saddr, src_len,
-					   offsetof(struct rt6_info, rt6i_src),
+					   offsetof(struct fib6_info, rt6i_src),
 					   exact_match);
 			}
 		}
@@ -1563,7 +1563,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
  *
  */
 
-static struct rt6_info *fib6_find_prefix(struct net *net,
+static struct fib6_info *fib6_find_prefix(struct net *net,
 					 struct fib6_table *table,
 					 struct fib6_node *fn)
 {
@@ -1622,11 +1622,11 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
 					    lockdep_is_held(&table->tb6_lock));
 		struct fib6_node *pn_l = rcu_dereference_protected(pn->left,
 					    lockdep_is_held(&table->tb6_lock));
-		struct rt6_info *fn_leaf = rcu_dereference_protected(fn->leaf,
+		struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf,
 					    lockdep_is_held(&table->tb6_lock));
-		struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf,
+		struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf,
 					    lockdep_is_held(&table->tb6_lock));
-		struct rt6_info *new_fn_leaf;
+		struct fib6_info *new_fn_leaf;
 
 		RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
 		iter++;
@@ -1717,10 +1717,10 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
 }
 
 static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
-			   struct rt6_info __rcu **rtp, struct nl_info *info)
+			   struct fib6_info __rcu **rtp, struct nl_info *info)
 {
 	struct fib6_walker *w;
-	struct rt6_info *rt = rcu_dereference_protected(*rtp,
+	struct fib6_info *rt = rcu_dereference_protected(*rtp,
 				    lockdep_is_held(&table->tb6_lock));
 	struct net *net = info->nl_net;
 
@@ -1741,7 +1741,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
 
 	/* Remove this entry from other siblings */
 	if (rt->rt6i_nsiblings) {
-		struct rt6_info *sibling, *next_sibling;
+		struct fib6_info *sibling, *next_sibling;
 
 		list_for_each_entry_safe(sibling, next_sibling,
 					 &rt->rt6i_siblings, rt6i_siblings)
@@ -1785,14 +1785,14 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
 }
 
 /* Need to own table->tb6_lock */
-int fib6_del(struct rt6_info *rt, struct nl_info *info)
+int fib6_del(struct fib6_info *rt, struct nl_info *info)
 {
 	struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node,
 				    lockdep_is_held(&rt->rt6i_table->tb6_lock));
 	struct fib6_table *table = rt->rt6i_table;
 	struct net *net = info->nl_net;
-	struct rt6_info __rcu **rtp;
-	struct rt6_info __rcu **rtp_next;
+	struct fib6_info __rcu **rtp;
+	struct fib6_info __rcu **rtp_next;
 
 	if (!fn || rt == net->ipv6.fib6_null_entry)
 		return -ENOENT;
@@ -1804,7 +1804,7 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info)
 	 */
 
 	for (rtp = &fn->leaf; *rtp; rtp = rtp_next) {
-		struct rt6_info *cur = rcu_dereference_protected(*rtp,
+		struct fib6_info *cur = rcu_dereference_protected(*rtp,
 					lockdep_is_held(&table->tb6_lock));
 		if (rt == cur) {
 			fib6_del_route(table, fn, rtp, info);
@@ -1948,7 +1948,7 @@ static int fib6_walk(struct net *net, struct fib6_walker *w)
 static int fib6_clean_node(struct fib6_walker *w)
 {
 	int res;
-	struct rt6_info *rt;
+	struct fib6_info *rt;
 	struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
 	struct nl_info info = {
 		.nl_net = c->net,
@@ -1983,7 +1983,7 @@ static int fib6_clean_node(struct fib6_walker *w)
 			if (WARN_ON(!rt->rt6i_nsiblings))
 				continue;
 			rt = list_last_entry(&rt->rt6i_siblings,
-					     struct rt6_info, rt6i_siblings);
+					     struct fib6_info, rt6i_siblings);
 			continue;
 		}
 		WARN_ON(res != 0);
@@ -2002,7 +2002,7 @@ static int fib6_clean_node(struct fib6_walker *w)
  */
 
 static void fib6_clean_tree(struct net *net, struct fib6_node *root,
-			    int (*func)(struct rt6_info *, void *arg),
+			    int (*func)(struct fib6_info *, void *arg),
 			    int sernum, void *arg)
 {
 	struct fib6_cleaner c;
@@ -2020,7 +2020,7 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root,
 }
 
 static void __fib6_clean_all(struct net *net,
-			     int (*func)(struct rt6_info *, void *),
+			     int (*func)(struct fib6_info *, void *),
 			     int sernum, void *arg)
 {
 	struct fib6_table *table;
@@ -2040,7 +2040,7 @@ static void __fib6_clean_all(struct net *net,
 	rcu_read_unlock();
 }
 
-void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *),
+void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *),
 		    void *arg)
 {
 	__fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg);
@@ -2057,7 +2057,7 @@ static void fib6_flush_trees(struct net *net)
  *	Garbage collection
  */
 
-static int fib6_age(struct rt6_info *rt, void *arg)
+static int fib6_age(struct fib6_info *rt, void *arg)
 {
 	struct fib6_gc_args *gc_args = arg;
 	unsigned long now = jiffies;
@@ -2261,7 +2261,7 @@ struct ipv6_route_iter {
 
 static int ipv6_route_seq_show(struct seq_file *seq, void *v)
 {
-	struct rt6_info *rt = v;
+	struct fib6_info *rt = v;
 	struct ipv6_route_iter *iter = seq->private;
 	const struct net_device *dev;
 
@@ -2353,14 +2353,14 @@ static void ipv6_route_check_sernum(struct ipv6_route_iter *iter)
 static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	int r;
-	struct rt6_info *n;
+	struct fib6_info *n;
 	struct net *net = seq_file_net(seq);
 	struct ipv6_route_iter *iter = seq->private;
 
 	if (!v)
 		goto iter_table;
 
-	n = rcu_dereference_bh(((struct rt6_info *)v)->rt6_next);
+	n = rcu_dereference_bh(((struct fib6_info *)v)->rt6_next);
 	if (n) {
 		++*pos;
 		return n;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index a28857088bff..102645298692 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1155,7 +1155,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 	struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb);
 	struct neighbour *neigh = NULL;
 	struct inet6_dev *in6_dev;
-	struct rt6_info *rt = NULL;
+	struct fib6_info *rt = NULL;
 	struct net *net;
 	int lifetime;
 	struct ndisc_options ndopts;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 0d861bd07673..2ccf939e1a20 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -96,24 +96,24 @@ static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 					   struct sk_buff *skb, u32 mtu);
 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
 					struct sk_buff *skb);
-static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
-static size_t rt6_nlmsg_size(struct rt6_info *rt);
+static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
+static size_t rt6_nlmsg_size(struct fib6_info *rt);
 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
-			 struct rt6_info *rt, struct dst_entry *dst,
+			 struct fib6_info *rt, struct dst_entry *dst,
 			 struct in6_addr *dest, struct in6_addr *src,
 			 int iif, int type, u32 portid, u32 seq,
 			 unsigned int flags);
-static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
+static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
 					   struct in6_addr *daddr,
 					   struct in6_addr *saddr);
 
 #ifdef CONFIG_IPV6_ROUTE_INFO
-static struct rt6_info *rt6_add_route_info(struct net *net,
+static struct fib6_info *rt6_add_route_info(struct net *net,
 					   const struct in6_addr *prefix, int prefixlen,
 					   const struct in6_addr *gwaddr,
 					   struct net_device *dev,
 					   unsigned int pref);
-static struct rt6_info *rt6_get_route_info(struct net *net,
+static struct fib6_info *rt6_get_route_info(struct net *net,
 					   const struct in6_addr *prefix, int prefixlen,
 					   const struct in6_addr *gwaddr,
 					   struct net_device *dev);
@@ -283,7 +283,7 @@ static const u32 ip6_template_metrics[RTAX_MAX] = {
 	[RTAX_HOPLIMIT - 1] = 0,
 };
 
-static const struct rt6_info fib6_null_entry_template = {
+static const struct fib6_info fib6_null_entry_template = {
 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
 	.rt6i_protocol  = RTPROT_KERNEL,
 	.rt6i_metric	= ~(u32)0,
@@ -372,7 +372,7 @@ EXPORT_SYMBOL(ip6_dst_alloc);
 static void ip6_dst_destroy(struct dst_entry *dst)
 {
 	struct rt6_info *rt = (struct rt6_info *)dst;
-	struct rt6_info *from = rt->from;
+	struct fib6_info *from = rt->from;
 	struct inet6_dev *idev;
 
 	dst_destroy_metrics_generic(dst);
@@ -425,13 +425,13 @@ static bool rt6_check_expired(const struct rt6_info *rt)
 	return false;
 }
 
-static struct rt6_info *rt6_multipath_select(const struct net *net,
-					     struct rt6_info *match,
+static struct fib6_info *rt6_multipath_select(const struct net *net,
+					      struct fib6_info *match,
 					     struct flowi6 *fl6, int oif,
 					     const struct sk_buff *skb,
 					     int strict)
 {
-	struct rt6_info *sibling, *next_sibling;
+	struct fib6_info *sibling, *next_sibling;
 
 	/* We might have already computed the hash for ICMPv6 errors. In such
 	 * case it will always be non-zero. Otherwise now is the time to do it.
@@ -462,14 +462,14 @@ static struct rt6_info *rt6_multipath_select(const struct net *net,
  *	Route lookup. rcu_read_lock() should be held.
  */
 
-static inline struct rt6_info *rt6_device_match(struct net *net,
-						    struct rt6_info *rt,
+static inline struct fib6_info *rt6_device_match(struct net *net,
+						 struct fib6_info *rt,
 						    const struct in6_addr *saddr,
 						    int oif,
 						    int flags)
 {
-	struct rt6_info *local = NULL;
-	struct rt6_info *sprt;
+	struct fib6_info *local = NULL;
+	struct fib6_info *sprt;
 
 	if (!oif && ipv6_addr_any(saddr) &&
 	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
@@ -532,7 +532,7 @@ static void rt6_probe_deferred(struct work_struct *w)
 	kfree(work);
 }
 
-static void rt6_probe(struct rt6_info *rt)
+static void rt6_probe(struct fib6_info *rt)
 {
 	struct __rt6_probe_work *work;
 	const struct in6_addr *nh_gw;
@@ -585,7 +585,7 @@ out:
 	rcu_read_unlock_bh();
 }
 #else
-static inline void rt6_probe(struct rt6_info *rt)
+static inline void rt6_probe(struct fib6_info *rt)
 {
 }
 #endif
@@ -593,7 +593,7 @@ static inline void rt6_probe(struct rt6_info *rt)
 /*
  * Default Router Selection (RFC 2461 6.3.6)
  */
-static inline int rt6_check_dev(struct rt6_info *rt, int oif)
+static inline int rt6_check_dev(struct fib6_info *rt, int oif)
 {
 	const struct net_device *dev = rt->fib6_nh.nh_dev;
 
@@ -605,7 +605,7 @@ static inline int rt6_check_dev(struct rt6_info *rt, int oif)
 	return 0;
 }
 
-static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
+static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
 {
 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
 	struct neighbour *neigh;
@@ -637,8 +637,7 @@ static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
 	return ret;
 }
 
-static int rt6_score_route(struct rt6_info *rt, int oif,
-			   int strict)
+static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
 {
 	int m;
 
@@ -656,8 +655,8 @@ static int rt6_score_route(struct rt6_info *rt, int oif,
 	return m;
 }
 
-static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
-				   int *mpri, struct rt6_info *match,
+static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
+				   int *mpri, struct fib6_info *match,
 				   bool *do_rr)
 {
 	int m;
@@ -696,13 +695,13 @@ out:
 	return match;
 }
 
-static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
-				     struct rt6_info *leaf,
-				     struct rt6_info *rr_head,
+static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
+				     struct fib6_info *leaf,
+				     struct fib6_info *rr_head,
 				     u32 metric, int oif, int strict,
 				     bool *do_rr)
 {
-	struct rt6_info *rt, *match, *cont;
+	struct fib6_info *rt, *match, *cont;
 	int mpri = -1;
 
 	match = NULL;
@@ -735,11 +734,11 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
 	return match;
 }
 
-static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
+static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
 				   int oif, int strict)
 {
-	struct rt6_info *leaf = rcu_dereference(fn->leaf);
-	struct rt6_info *match, *rt0;
+	struct fib6_info *leaf = rcu_dereference(fn->leaf);
+	struct fib6_info *match, *rt0;
 	bool do_rr = false;
 	int key_plen;
 
@@ -767,7 +766,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
 			     &do_rr);
 
 	if (do_rr) {
-		struct rt6_info *next = rcu_dereference(rt0->rt6_next);
+		struct fib6_info *next = rcu_dereference(rt0->rt6_next);
 
 		/* no entries matched; do round-robin */
 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
@@ -785,7 +784,7 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
 	return match ? match : net->ipv6.fib6_null_entry;
 }
 
-static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
+static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
 {
 	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
 }
@@ -799,7 +798,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 	struct in6_addr prefix_buf, *prefix;
 	unsigned int pref;
 	unsigned long lifetime;
-	struct rt6_info *rt;
+	struct fib6_info *rt;
 
 	if (len < sizeof(struct route_info)) {
 		return -EINVAL;
@@ -871,7 +870,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
  */
 
 /* called with rcu_lock held */
-static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
+static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
 {
 	struct net_device *dev = rt->fib6_nh.nh_dev;
 
@@ -913,7 +912,7 @@ static int ip6_rt_type_to_error(u8 fib6_type)
 	return fib6_prop[fib6_type];
 }
 
-static unsigned short fib6_info_dst_flags(struct rt6_info *rt)
+static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
 {
 	unsigned short flags = 0;
 
@@ -927,7 +926,7 @@ static unsigned short fib6_info_dst_flags(struct rt6_info *rt)
 	return flags;
 }
 
-static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct rt6_info *ort)
+static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
 {
 	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
 
@@ -949,7 +948,7 @@ static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct rt6_info *ort)
 	}
 }
 
-static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort)
+static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
 {
 	rt->dst.flags |= fib6_info_dst_flags(ort);
 
@@ -977,7 +976,7 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort)
 	rt->dst.lastuse = jiffies;
 }
 
-static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
+static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
 {
 	rt->rt6i_flags &= ~RTF_EXPIRES;
 	fib6_info_hold(from);
@@ -989,7 +988,7 @@ static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
 	}
 }
 
-static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
+static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
 {
 	ip6_rt_init_dst(rt, ort);
 
@@ -1045,7 +1044,7 @@ static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
 }
 
 /* called with rcu_lock held */
-static struct rt6_info *ip6_create_rt_rcu(struct rt6_info *rt)
+static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
 {
 	unsigned short flags = fib6_info_dst_flags(rt);
 	struct net_device *dev = rt->fib6_nh.nh_dev;
@@ -1064,7 +1063,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 					     const struct sk_buff *skb,
 					     int flags)
 {
-	struct rt6_info *f6i;
+	struct fib6_info *f6i;
 	struct fib6_node *fn;
 	struct rt6_info *rt;
 
@@ -1152,7 +1151,7 @@ EXPORT_SYMBOL(rt6_lookup);
  * Caller must hold dst before calling it.
  */
 
-static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
+static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
 			struct netlink_ext_ack *extack)
 {
 	int err;
@@ -1166,14 +1165,14 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
 	return err;
 }
 
-int ip6_ins_rt(struct net *net, struct rt6_info *rt)
+int ip6_ins_rt(struct net *net, struct fib6_info *rt)
 {
 	struct nl_info info = {	.nl_net = net, };
 
 	return __ip6_ins_rt(rt, &info, NULL);
 }
 
-static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
+static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
 					   const struct in6_addr *daddr,
 					   const struct in6_addr *saddr)
 {
@@ -1213,7 +1212,7 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
 	return rt;
 }
 
-static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
+static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
 {
 	unsigned short flags = fib6_info_dst_flags(rt);
 	struct net_device *dev;
@@ -1232,7 +1231,7 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
 }
 
 /* It should be called with rcu_read_lock() acquired */
-static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
+static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
 {
 	struct rt6_info *pcpu_rt, **p;
 
@@ -1246,7 +1245,7 @@ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
 }
 
 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
-					    struct rt6_info *rt)
+					    struct fib6_info *rt)
 {
 	struct rt6_info *pcpu_rt, *prev, **p;
 
@@ -1390,7 +1389,7 @@ __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
 	return NULL;
 }
 
-static unsigned int fib6_mtu(const struct rt6_info *rt)
+static unsigned int fib6_mtu(const struct fib6_info *rt)
 {
 	unsigned int mtu;
 
@@ -1401,7 +1400,7 @@ static unsigned int fib6_mtu(const struct rt6_info *rt)
 }
 
 static int rt6_insert_exception(struct rt6_info *nrt,
-				struct rt6_info *ort)
+				struct fib6_info *ort)
 {
 	struct net *net = dev_net(nrt->dst.dev);
 	struct rt6_exception_bucket *bucket;
@@ -1487,7 +1486,7 @@ out:
 	return err;
 }
 
-void rt6_flush_exceptions(struct rt6_info *rt)
+void rt6_flush_exceptions(struct fib6_info *rt)
 {
 	struct rt6_exception_bucket *bucket;
 	struct rt6_exception *rt6_ex;
@@ -1517,7 +1516,7 @@ out:
 /* Find cached rt in the hash table inside passed in rt
  * Caller has to hold rcu_read_lock()
  */
-static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
+static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
 					   struct in6_addr *daddr,
 					   struct in6_addr *saddr)
 {
@@ -1550,7 +1549,7 @@ static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
 static int rt6_remove_exception_rt(struct rt6_info *rt)
 {
 	struct rt6_exception_bucket *bucket;
-	struct rt6_info *from = rt->from;
+	struct fib6_info *from = rt->from;
 	struct in6_addr *src_key = NULL;
 	struct rt6_exception *rt6_ex;
 	int err;
@@ -1595,7 +1594,7 @@ static int rt6_remove_exception_rt(struct rt6_info *rt)
 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
 {
 	struct rt6_exception_bucket *bucket;
-	struct rt6_info *from = rt->from;
+	struct fib6_info *from = rt->from;
 	struct in6_addr *src_key = NULL;
 	struct rt6_exception *rt6_ex;
 
@@ -1625,7 +1624,7 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
 	rcu_read_unlock();
 }
 
-static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
+static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
 {
 	struct rt6_exception_bucket *bucket;
 	struct rt6_exception *rt6_ex;
@@ -1667,7 +1666,7 @@ static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
 }
 
 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
-				       struct rt6_info *rt, int mtu)
+				       struct fib6_info *rt, int mtu)
 {
 	struct rt6_exception_bucket *bucket;
 	struct rt6_exception *rt6_ex;
@@ -1697,7 +1696,7 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
 
 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
 
-static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
+static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
 					struct in6_addr *gateway)
 {
 	struct rt6_exception_bucket *bucket;
@@ -1776,7 +1775,7 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
 	gc_args->more++;
 }
 
-void rt6_age_exceptions(struct rt6_info *rt,
+void rt6_age_exceptions(struct fib6_info *rt,
 			struct fib6_gc_args *gc_args,
 			unsigned long now)
 {
@@ -1812,7 +1811,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 			       const struct sk_buff *skb, int flags)
 {
 	struct fib6_node *fn, *saved_fn;
-	struct rt6_info *f6i;
+	struct fib6_info *f6i;
 	struct rt6_info *rt;
 	int strict = 0;
 
@@ -2139,7 +2138,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
  *	Destination cache support functions
  */
 
-static bool fib6_check(struct rt6_info *f6i, u32 cookie)
+static bool fib6_check(struct fib6_info *f6i, u32 cookie)
 {
 	u32 rt_cookie = 0;
 
@@ -2374,7 +2373,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
 {
 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
 	struct rt6_info *ret = NULL, *rt_cache;
-	struct rt6_info *rt;
+	struct fib6_info *rt;
 	struct fib6_node *fn;
 
 	/* Get the "current" route for this destination and
@@ -2620,7 +2619,7 @@ out:
 	return entries > rt_max_size;
 }
 
-static int ip6_convert_metrics(struct net *net, struct rt6_info *rt,
+static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
 			       struct fib6_config *cfg)
 {
 	int err = 0;
@@ -2823,12 +2822,12 @@ out:
 	return err;
 }
 
-static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
+static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 					      gfp_t gfp_flags,
 					      struct netlink_ext_ack *extack)
 {
 	struct net *net = cfg->fc_nlinfo.nl_net;
-	struct rt6_info *rt = NULL;
+	struct fib6_info *rt = NULL;
 	struct net_device *dev = NULL;
 	struct inet6_dev *idev = NULL;
 	struct fib6_table *table;
@@ -3047,7 +3046,7 @@ out:
 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
 		  struct netlink_ext_ack *extack)
 {
-	struct rt6_info *rt;
+	struct fib6_info *rt;
 	int err;
 
 	rt = ip6_route_info_create(cfg, gfp_flags, extack);
@@ -3060,7 +3059,7 @@ int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
 	return err;
 }
 
-static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
+static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
 {
 	struct net *net = info->nl_net;
 	struct fib6_table *table;
@@ -3081,14 +3080,14 @@ out:
 	return err;
 }
 
-int ip6_del_rt(struct net *net, struct rt6_info *rt)
+int ip6_del_rt(struct net *net, struct fib6_info *rt)
 {
 	struct nl_info info = { .nl_net = net };
 
 	return __ip6_del_rt(rt, &info);
 }
 
-static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
+static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
 {
 	struct nl_info *info = &cfg->fc_nlinfo;
 	struct net *net = info->nl_net;
@@ -3102,7 +3101,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
 	spin_lock_bh(&table->tb6_lock);
 
 	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
-		struct rt6_info *sibling, *next_sibling;
+		struct fib6_info *sibling, *next_sibling;
 
 		/* prefer to send a single notification with all hops */
 		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
@@ -3159,8 +3158,9 @@ out:
 static int ip6_route_del(struct fib6_config *cfg,
 			 struct netlink_ext_ack *extack)
 {
-	struct rt6_info *rt, *rt_cache;
+	struct rt6_info *rt_cache;
 	struct fib6_table *table;
+	struct fib6_info *rt;
 	struct fib6_node *fn;
 	int err = -ESRCH;
 
@@ -3336,7 +3336,7 @@ out:
 }
 
 #ifdef CONFIG_IPV6_ROUTE_INFO
-static struct rt6_info *rt6_get_route_info(struct net *net,
+static struct fib6_info *rt6_get_route_info(struct net *net,
 					   const struct in6_addr *prefix, int prefixlen,
 					   const struct in6_addr *gwaddr,
 					   struct net_device *dev)
@@ -3344,7 +3344,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
 	int ifindex = dev->ifindex;
 	struct fib6_node *fn;
-	struct rt6_info *rt = NULL;
+	struct fib6_info *rt = NULL;
 	struct fib6_table *table;
 
 	table = fib6_get_table(net, tb_id);
@@ -3363,7 +3363,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
 			continue;
 		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
 			continue;
-		ip6_hold_safe(NULL, &rt, false);
+		fib6_info_hold(rt);
 		break;
 	}
 out:
@@ -3371,7 +3371,7 @@ out:
 	return rt;
 }
 
-static struct rt6_info *rt6_add_route_info(struct net *net,
+static struct fib6_info *rt6_add_route_info(struct net *net,
 					   const struct in6_addr *prefix, int prefixlen,
 					   const struct in6_addr *gwaddr,
 					   struct net_device *dev,
@@ -3404,12 +3404,12 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
 }
 #endif
 
-struct rt6_info *rt6_get_dflt_router(struct net *net,
+struct fib6_info *rt6_get_dflt_router(struct net *net,
 				     const struct in6_addr *addr,
 				     struct net_device *dev)
 {
 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
-	struct rt6_info *rt;
+	struct fib6_info *rt;
 	struct fib6_table *table;
 
 	table = fib6_get_table(net, tb_id);
@@ -3424,12 +3424,12 @@ struct rt6_info *rt6_get_dflt_router(struct net *net,
 			break;
 	}
 	if (rt)
-		ip6_hold_safe(NULL, &rt, false);
+		fib6_info_hold(rt);
 	rcu_read_unlock();
 	return rt;
 }
 
-struct rt6_info *rt6_add_dflt_router(struct net *net,
+struct fib6_info *rt6_add_dflt_router(struct net *net,
 				     const struct in6_addr *gwaddr,
 				     struct net_device *dev,
 				     unsigned int pref)
@@ -3463,7 +3463,7 @@ struct rt6_info *rt6_add_dflt_router(struct net *net,
 static void __rt6_purge_dflt_routers(struct net *net,
 				     struct fib6_table *table)
 {
-	struct rt6_info *rt;
+	struct fib6_info *rt;
 
 restart:
 	rcu_read_lock();
@@ -3614,14 +3614,14 @@ static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff
  *	Allocate a dst for local (unicast / anycast) address.
  */
 
-struct rt6_info *addrconf_dst_alloc(struct net *net,
+struct fib6_info *addrconf_dst_alloc(struct net *net,
 				    struct inet6_dev *idev,
 				    const struct in6_addr *addr,
 				    bool anycast, gfp_t gfp_flags)
 {
 	u32 tb_id;
 	struct net_device *dev = idev->dev;
-	struct rt6_info *rt;
+	struct fib6_info *rt;
 
 	rt = fib6_info_alloc(gfp_flags);
 	if (!rt)
@@ -3661,7 +3661,7 @@ struct arg_dev_net_ip {
 	struct in6_addr *addr;
 };
 
-static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
+static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
 {
 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
@@ -3694,7 +3694,7 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
 
 /* Remove routers and update dst entries when gateway turn into host. */
-static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
+static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
 {
 	struct in6_addr *gateway = (struct in6_addr *)arg;
 
@@ -3725,9 +3725,9 @@ struct arg_netdev_event {
 	};
 };
 
-static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
+static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
 {
-	struct rt6_info *iter;
+	struct fib6_info *iter;
 	struct fib6_node *fn;
 
 	fn = rcu_dereference_protected(rt->rt6i_node,
@@ -3745,7 +3745,7 @@ static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
 	return NULL;
 }
 
-static bool rt6_is_dead(const struct rt6_info *rt)
+static bool rt6_is_dead(const struct fib6_info *rt)
 {
 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
 	    (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
@@ -3755,9 +3755,9 @@ static bool rt6_is_dead(const struct rt6_info *rt)
 	return false;
 }
 
-static int rt6_multipath_total_weight(const struct rt6_info *rt)
+static int rt6_multipath_total_weight(const struct fib6_info *rt)
 {
-	struct rt6_info *iter;
+	struct fib6_info *iter;
 	int total = 0;
 
 	if (!rt6_is_dead(rt))
@@ -3771,7 +3771,7 @@ static int rt6_multipath_total_weight(const struct rt6_info *rt)
 	return total;
 }
 
-static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
+static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
 {
 	int upper_bound = -1;
 
@@ -3783,9 +3783,9 @@ static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
 	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
 }
 
-static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
+static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
 {
-	struct rt6_info *iter;
+	struct fib6_info *iter;
 	int weight = 0;
 
 	rt6_upper_bound_set(rt, &weight, total);
@@ -3794,9 +3794,9 @@ static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
 		rt6_upper_bound_set(iter, &weight, total);
 }
 
-void rt6_multipath_rebalance(struct rt6_info *rt)
+void rt6_multipath_rebalance(struct fib6_info *rt)
 {
-	struct rt6_info *first;
+	struct fib6_info *first;
 	int total;
 
 	/* In case the entire multipath route was marked for flushing,
@@ -3818,7 +3818,7 @@ void rt6_multipath_rebalance(struct rt6_info *rt)
 	rt6_multipath_upper_bound_set(first, total);
 }
 
-static int fib6_ifup(struct rt6_info *rt, void *p_arg)
+static int fib6_ifup(struct fib6_info *rt, void *p_arg)
 {
 	const struct arg_netdev_event *arg = p_arg;
 	struct net *net = dev_net(arg->dev);
@@ -3847,10 +3847,10 @@ void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
 	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
 }
 
-static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
+static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
 				   const struct net_device *dev)
 {
-	struct rt6_info *iter;
+	struct fib6_info *iter;
 
 	if (rt->fib6_nh.nh_dev == dev)
 		return true;
@@ -3861,19 +3861,19 @@ static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
 	return false;
 }
 
-static void rt6_multipath_flush(struct rt6_info *rt)
+static void rt6_multipath_flush(struct fib6_info *rt)
 {
-	struct rt6_info *iter;
+	struct fib6_info *iter;
 
 	rt->should_flush = 1;
 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
 		iter->should_flush = 1;
 }
 
-static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
+static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
 					     const struct net_device *down_dev)
 {
-	struct rt6_info *iter;
+	struct fib6_info *iter;
 	unsigned int dead = 0;
 
 	if (rt->fib6_nh.nh_dev == down_dev ||
@@ -3887,11 +3887,11 @@ static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
 	return dead;
 }
 
-static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
+static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
 				       const struct net_device *dev,
 				       unsigned int nh_flags)
 {
-	struct rt6_info *iter;
+	struct fib6_info *iter;
 
 	if (rt->fib6_nh.nh_dev == dev)
 		rt->fib6_nh.nh_flags |= nh_flags;
@@ -3901,7 +3901,7 @@ static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
 }
 
 /* called with write lock held for table with rt */
-static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
+static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
 {
 	const struct arg_netdev_event *arg = p_arg;
 	const struct net_device *dev = arg->dev;
@@ -3968,7 +3968,7 @@ struct rt6_mtu_change_arg {
 	unsigned int mtu;
 };
 
-static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
+static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
 {
 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
 	struct inet6_dev *idev;
@@ -4155,7 +4155,7 @@ errout:
 }
 
 struct rt6_nh {
-	struct rt6_info *rt6_info;
+	struct fib6_info *fib6_info;
 	struct fib6_config r_cfg;
 	struct list_head next;
 };
@@ -4173,21 +4173,22 @@ static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
 
 static int ip6_route_info_append(struct net *net,
 				 struct list_head *rt6_nh_list,
-				 struct rt6_info *rt, struct fib6_config *r_cfg)
+				 struct fib6_info *rt,
+				 struct fib6_config *r_cfg)
 {
 	struct rt6_nh *nh;
 	int err = -EEXIST;
 
 	list_for_each_entry(nh, rt6_nh_list, next) {
-		/* check if rt6_info already exists */
-		if (rt6_duplicate_nexthop(nh->rt6_info, rt))
+		/* check if fib6_info already exists */
+		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
 			return err;
 	}
 
 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
 	if (!nh)
 		return -ENOMEM;
-	nh->rt6_info = rt;
+	nh->fib6_info = rt;
 	err = ip6_convert_metrics(net, rt, r_cfg);
 	if (err) {
 		kfree(nh);
@@ -4199,8 +4200,8 @@ static int ip6_route_info_append(struct net *net,
 	return 0;
 }
 
-static void ip6_route_mpath_notify(struct rt6_info *rt,
-				   struct rt6_info *rt_last,
+static void ip6_route_mpath_notify(struct fib6_info *rt,
+				   struct fib6_info *rt_last,
 				   struct nl_info *info,
 				   __u16 nlflags)
 {
@@ -4212,7 +4213,7 @@ static void ip6_route_mpath_notify(struct rt6_info *rt,
 	 */
 	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
 		rt = list_first_entry(&rt_last->rt6i_siblings,
-				      struct rt6_info,
+				      struct fib6_info,
 				      rt6i_siblings);
 	}
 
@@ -4223,11 +4224,11 @@ static void ip6_route_mpath_notify(struct rt6_info *rt,
 static int ip6_route_multipath_add(struct fib6_config *cfg,
 				   struct netlink_ext_ack *extack)
 {
-	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
+	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
 	struct nl_info *info = &cfg->fc_nlinfo;
 	struct fib6_config r_cfg;
 	struct rtnexthop *rtnh;
-	struct rt6_info *rt;
+	struct fib6_info *rt;
 	struct rt6_nh *err_nh;
 	struct rt6_nh *nh, *nh_safe;
 	__u16 nlflags;
@@ -4247,7 +4248,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
 	rtnh = (struct rtnexthop *)cfg->fc_mp;
 
 	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
-	 * rt6_info structs per nexthop
+	 * fib6_info structs per nexthop
 	 */
 	while (rtnh_ok(rtnh, remaining)) {
 		memcpy(&r_cfg, cfg, sizeof(*cfg));
@@ -4297,16 +4298,16 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
 
 	err_nh = NULL;
 	list_for_each_entry(nh, &rt6_nh_list, next) {
-		rt_last = nh->rt6_info;
-		err = __ip6_ins_rt(nh->rt6_info, info, extack);
-		fib6_info_release(nh->rt6_info);
+		rt_last = nh->fib6_info;
+		err = __ip6_ins_rt(nh->fib6_info, info, extack);
+		fib6_info_release(nh->fib6_info);
 
 		/* save reference to first route for notification */
 		if (!rt_notif && !err)
-			rt_notif = nh->rt6_info;
+			rt_notif = nh->fib6_info;
 
-		/* nh->rt6_info is used or freed at this point, reset to NULL*/
-		nh->rt6_info = NULL;
+		/* nh->fib6_info is used or freed at this point, reset to NULL*/
+		nh->fib6_info = NULL;
 		if (err) {
 			if (replace && nhn)
 				ip6_print_replace_route_err(&rt6_nh_list);
@@ -4347,8 +4348,8 @@ add_errout:
 
 cleanup:
 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
-		if (nh->rt6_info)
-			fib6_info_release(nh->rt6_info);
+		if (nh->fib6_info)
+			fib6_info_release(nh->fib6_info);
 		list_del(&nh->next);
 		kfree(nh);
 	}
@@ -4428,7 +4429,7 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 		return ip6_route_add(&cfg, GFP_KERNEL, extack);
 }
 
-static size_t rt6_nlmsg_size(struct rt6_info *rt)
+static size_t rt6_nlmsg_size(struct fib6_info *rt)
 {
 	int nexthop_len = 0;
 
@@ -4458,7 +4459,7 @@ static size_t rt6_nlmsg_size(struct rt6_info *rt)
 	       + nexthop_len;
 }
 
-static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
+static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
 			    unsigned int *flags, bool skip_oif)
 {
 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
@@ -4495,7 +4496,7 @@ nla_put_failure:
 }
 
 /* add multipath next hop */
-static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
+static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
 {
 	const struct net_device *dev = rt->fib6_nh.nh_dev;
 	struct rtnexthop *rtnh;
@@ -4523,7 +4524,7 @@ nla_put_failure:
 }
 
 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
-			 struct rt6_info *rt, struct dst_entry *dst,
+			 struct fib6_info *rt, struct dst_entry *dst,
 			 struct in6_addr *dest, struct in6_addr *src,
 			 int iif, int type, u32 portid, u32 seq,
 			 unsigned int flags)
@@ -4613,7 +4614,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 	 * each as a nexthop within RTA_MULTIPATH.
 	 */
 	if (rt->rt6i_nsiblings) {
-		struct rt6_info *sibling, *next_sibling;
+		struct fib6_info *sibling, *next_sibling;
 		struct nlattr *mp;
 
 		mp = nla_nest_start(skb, RTA_MULTIPATH);
@@ -4655,7 +4656,7 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
-int rt6_dump_route(struct rt6_info *rt, void *p_arg)
+int rt6_dump_route(struct fib6_info *rt, void *p_arg)
 {
 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
 	struct net *net = arg->net;
@@ -4800,7 +4801,7 @@ errout:
 	return err;
 }
 
-void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
+void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
 		     unsigned int nlm_flags)
 {
 	struct sk_buff *skb;
-- 
cgit v1.2.3-59-g8ed1b


From 77634cc67dc1ffc8ae6d869af6dee4b2ea6025ee Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 17 Apr 2018 17:33:27 -0700
Subject: net/ipv6: Remove unused code and variables for rt6_info

Drop unneeded elements from rt6_info struct and rearrange layout to
something more relevant for the data path.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h   | 60 +++----------------------------------------------
 net/ipv6/ip6_fib.c      | 22 ------------------
 net/ipv6/route.c        | 27 ++--------------------
 net/ipv6/xfrm6_policy.c |  2 --
 4 files changed, 5 insertions(+), 106 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index d41b7bd69fb3..a36116b92100 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -175,58 +175,20 @@ struct fib6_info {
 
 struct rt6_info {
 	struct dst_entry		dst;
-	struct rt6_info __rcu		*rt6_next;
 	struct fib6_info		*from;
 
-	/*
-	 * Tail elements of dst_entry (__refcnt etc.)
-	 * and these elements (rarely used in hot path) are in
-	 * the same cache line.
-	 */
-	struct fib6_table		*rt6i_table;
-	struct fib6_node __rcu		*rt6i_node;
-
+	struct rt6key			rt6i_dst;
+	struct rt6key			rt6i_src;
 	struct in6_addr			rt6i_gateway;
-
-	/* Multipath routes:
-	 * siblings is a list of rt6_info that have the the same metric/weight,
-	 * destination, but not the same gateway. nsiblings is just a cache
-	 * to speed up lookup.
-	 */
-	struct list_head		rt6i_siblings;
-	unsigned int			rt6i_nsiblings;
-
-	atomic_t			rt6i_ref;
-
-	/* These are in a separate cache line. */
-	struct rt6key			rt6i_dst ____cacheline_aligned_in_smp;
+	struct inet6_dev		*rt6i_idev;
 	u32				rt6i_flags;
-	struct rt6key			rt6i_src;
 	struct rt6key			rt6i_prefsrc;
 
 	struct list_head		rt6i_uncached;
 	struct uncached_list		*rt6i_uncached_list;
 
-	struct inet6_dev		*rt6i_idev;
-	struct rt6_info * __percpu	*rt6i_pcpu;
-	struct rt6_exception_bucket __rcu *rt6i_exception_bucket;
-
-	u32				rt6i_metric;
 	/* more non-fragment space at head required */
 	unsigned short			rt6i_nfheader_len;
-	u8				rt6i_protocol;
-	u8				fib6_type;
-	u8				exception_bucket_flushed:1,
-					should_flush:1,
-					dst_nocount:1,
-					dst_nopolicy:1,
-					dst_host:1,
-					unused:3;
-
-	unsigned long			expires;
-	struct dst_metrics		*fib6_metrics;
-#define fib6_pmtu		fib6_metrics->metrics[RTAX_MTU-1]
-	struct fib6_nh			fib6_nh;
 };
 
 #define for_each_fib6_node_rt_rcu(fn)					\
@@ -328,8 +290,6 @@ static inline void ip6_rt_put(struct rt6_info *rt)
 	dst_release(&rt->dst);
 }
 
-void rt6_free_pcpu(struct rt6_info *non_pcpu_rt);
-
 struct fib6_info *fib6_info_alloc(gfp_t gfp_flags);
 void fib6_info_destroy(struct fib6_info *f6i);
 
@@ -344,20 +304,6 @@ static inline void fib6_info_release(struct fib6_info *f6i)
 		fib6_info_destroy(f6i);
 }
 
-static inline void rt6_hold(struct rt6_info *rt)
-{
-	atomic_inc(&rt->rt6i_ref);
-}
-
-static inline void rt6_release(struct rt6_info *rt)
-{
-	if (atomic_dec_and_test(&rt->rt6i_ref)) {
-		rt6_free_pcpu(rt);
-		dst_dev_put(&rt->dst);
-		dst_release(&rt->dst);
-	}
-}
-
 enum fib6_walk_state {
 #ifdef CONFIG_IPV6_SUBTREES
 	FWS_S,
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 77cf43b2d858..2ab49b7cac22 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -240,28 +240,6 @@ static void node_free(struct net *net, struct fib6_node *fn)
 	net->ipv6.rt6_stats->fib_nodes--;
 }
 
-void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
-{
-	int cpu;
-
-	if (!non_pcpu_rt->rt6i_pcpu)
-		return;
-
-	for_each_possible_cpu(cpu) {
-		struct rt6_info **ppcpu_rt;
-		struct rt6_info *pcpu_rt;
-
-		ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu);
-		pcpu_rt = *ppcpu_rt;
-		if (pcpu_rt) {
-			dst_dev_put(&pcpu_rt->dst);
-			dst_release(&pcpu_rt->dst);
-			*ppcpu_rt = NULL;
-		}
-	}
-}
-EXPORT_SYMBOL_GPL(rt6_free_pcpu);
-
 static void fib6_free_table(struct fib6_table *table)
 {
 	inetpeer_invalidate_tree(&table->tb6_peers);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 2ccf939e1a20..f9c363327d62 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -302,10 +302,6 @@ static const struct rt6_info ip6_null_entry_template = {
 		.output		= ip6_pkt_discard_out,
 	},
 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
-	.rt6i_protocol  = RTPROT_KERNEL,
-	.rt6i_metric	= ~(u32) 0,
-	.rt6i_ref	= ATOMIC_INIT(1),
-	.fib6_type	= RTN_UNREACHABLE,
 };
 
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
@@ -320,10 +316,6 @@ static const struct rt6_info ip6_prohibit_entry_template = {
 		.output		= ip6_pkt_prohibit_out,
 	},
 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
-	.rt6i_protocol  = RTPROT_KERNEL,
-	.rt6i_metric	= ~(u32) 0,
-	.rt6i_ref	= ATOMIC_INIT(1),
-	.fib6_type	= RTN_PROHIBIT,
 };
 
 static const struct rt6_info ip6_blk_hole_entry_template = {
@@ -336,10 +328,6 @@ static const struct rt6_info ip6_blk_hole_entry_template = {
 		.output		= dst_discard_out,
 	},
 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
-	.rt6i_protocol  = RTPROT_KERNEL,
-	.rt6i_metric	= ~(u32) 0,
-	.rt6i_ref	= ATOMIC_INIT(1),
-	.fib6_type	= RTN_BLACKHOLE,
 };
 
 #endif
@@ -349,7 +337,6 @@ static void rt6_info_init(struct rt6_info *rt)
 	struct dst_entry *dst = &rt->dst;
 
 	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
-	INIT_LIST_HEAD(&rt->rt6i_siblings);
 	INIT_LIST_HEAD(&rt->rt6i_uncached);
 }
 
@@ -999,12 +986,10 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
 	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
 	rt->rt6i_flags = ort->rt6i_flags;
 	rt6_set_from(rt, ort);
-	rt->rt6i_metric = ort->rt6i_metric;
 #ifdef CONFIG_IPV6_SUBTREES
 	rt->rt6i_src = ort->rt6i_src;
 #endif
 	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
-	rt->rt6i_table = ort->rt6i_table;
 	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
 }
 
@@ -1192,7 +1177,6 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
 
 	ip6_rt_copy_init(rt, ort);
 	rt->rt6i_flags |= RTF_CACHE;
-	rt->rt6i_metric = 0;
 	rt->dst.flags |= DST_HOST;
 	rt->rt6i_dst.addr = *daddr;
 	rt->rt6i_dst.plen = 128;
@@ -1225,7 +1209,6 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
 	if (!pcpu_rt)
 		return NULL;
 	ip6_rt_copy_init(pcpu_rt, rt);
-	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
 	pcpu_rt->rt6i_flags |= RTF_PCPU;
 	return pcpu_rt;
 }
@@ -1279,9 +1262,8 @@ static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
 		return;
 
 	net = dev_net(rt6_ex->rt6i->dst.dev);
-	rt6_ex->rt6i->rt6i_node = NULL;
 	hlist_del_rcu(&rt6_ex->hlist);
-	ip6_rt_put(rt6_ex->rt6i);
+	dst_release(&rt6_ex->rt6i->dst);
 	kfree_rcu(rt6_ex, rcu);
 	WARN_ON_ONCE(!bucket->depth);
 	bucket->depth--;
@@ -1463,8 +1445,6 @@ static int rt6_insert_exception(struct rt6_info *nrt,
 	}
 	rt6_ex->rt6i = nrt;
 	rt6_ex->stamp = jiffies;
-	atomic_inc(&nrt->rt6i_ref);
-	nrt->rt6i_node = ort->rt6i_node;
 	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
 	bucket->depth++;
 	net->ipv6.rt6_stats->fib_rt_cache++;
@@ -2122,7 +2102,6 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
 		rt->rt6i_idev = in6_dev_get(loopback_dev);
 		rt->rt6i_gateway = ort->rt6i_gateway;
 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
-		rt->rt6i_metric = 0;
 
 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
 #ifdef CONFIG_IPV6_SUBTREES
@@ -2247,8 +2226,7 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
 {
 	return !(rt->rt6i_flags & RTF_CACHE) &&
-		(rt->rt6i_flags & RTF_PCPU ||
-		 rcu_access_pointer(rt->rt6i_node));
+		(rt->rt6i_flags & RTF_PCPU || rt->from);
 }
 
 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
@@ -3313,7 +3291,6 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 	if (on_link)
 		nrt->rt6i_flags &= ~RTF_GATEWAY;
 
-	nrt->rt6i_protocol = RTPROT_REDIRECT;
 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
 
 	/* No need to remove rt from the exception table if rt is
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 416fe67271a9..2cff209d0fc1 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -107,8 +107,6 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	 * it was magically lost, so this code needs audit */
 	xdst->u.rt6.rt6i_flags = rt->rt6i_flags & (RTF_ANYCAST |
 						   RTF_LOCAL);
-	xdst->u.rt6.rt6i_metric = rt->rt6i_metric;
-	xdst->u.rt6.rt6i_node = rt->rt6i_node;
 	xdst->route_cookie = rt6_get_cookie(rt);
 	xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway;
 	xdst->u.rt6.rt6i_dst = rt->rt6i_dst;
-- 
cgit v1.2.3-59-g8ed1b


From cd027a5433d66734097adbd9d262c203471102a3 Mon Sep 17 00:00:00 2001
From: Jacek Kalwas <jacek.kalwas@intel.com>
Date: Thu, 12 Apr 2018 12:03:13 -0700
Subject: udp: enable UDP checksum offload for ESP

In case NIC has support for ESP TX CSUM offload skb->ip_summed is not
set to CHECKSUM_PARTIAL which results in checksum calculated by SW.

Fix enables ESP TX CSUM for UDP by extending condition with check for
NETIF_F_HW_ESP_TX_CSUM.

Signed-off-by: Jacek Kalwas <jacek.kalwas@intel.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/ip_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 4c11b810a447..a2dfb5a9ba76 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -907,7 +907,7 @@ static int __ip_append_data(struct sock *sk,
 	    length + fragheaderlen <= mtu &&
 	    rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
 	    !(flags & MSG_MORE) &&
-	    !exthdrlen)
+	    (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
 		csummode = CHECKSUM_PARTIAL;
 
 	cork->length += length;
-- 
cgit v1.2.3-59-g8ed1b


From 393de512e719a5fbd6712fc392a571ab287eb8ab Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Tue, 17 Apr 2018 10:28:44 -0700
Subject: bpftool: Support new prog types and attach types

Add recently added prog types to `bpftool prog` and attach types to
`bpftool cgroup`.

Update bpftool documentation and bash completion appropriately.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/Documentation/bpftool-cgroup.rst | 11 +++++++++--
 tools/bpf/bpftool/bash-completion/bpftool          |  6 ++++--
 tools/bpf/bpftool/cgroup.c                         | 15 ++++++++++++---
 tools/bpf/bpftool/prog.c                           |  3 +++
 4 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
index 0e4e923235b6..d004f6382dab 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
@@ -26,7 +26,8 @@ MAP COMMANDS
 |	**bpftool** **cgroup help**
 |
 |	*PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
-|	*ATTACH_TYPE* := { **ingress** | **egress** | **sock_create** | **sock_ops** | **device** }
+|	*ATTACH_TYPE* := { **ingress** | **egress** | **sock_create** | **sock_ops** | **device** |
+|		**bind4** | **bind6** | **post_bind4** | **post_bind6** | **connect4** | **connect6** }
 |	*ATTACH_FLAGS* := { **multi** | **override** }
 
 DESCRIPTION
@@ -63,7 +64,13 @@ DESCRIPTION
 		  **egress** egress path of the inet socket (since 4.10);
 		  **sock_create** opening of an inet socket (since 4.10);
 		  **sock_ops** various socket operations (since 4.12);
-		  **device** device access (since 4.15).
+		  **device** device access (since 4.15);
+		  **bind4** call to bind(2) for an inet4 socket (since 4.17);
+		  **bind6** call to bind(2) for an inet6 socket (since 4.17);
+		  **post_bind4** return from bind(2) for an inet4 socket (since 4.17);
+		  **post_bind6** return from bind(2) for an inet6 socket (since 4.17);
+		  **connect4** call to connect(2) for an inet4 socket (since 4.17);
+		  **connect6** call to connect(2) for an inet6 socket (since 4.17).
 
 	**bpftool cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG*
 		  Detach *PROG* from the cgroup *CGROUP* and attach type
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index 490811b45fa7..c01b2deb1c77 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -372,7 +372,8 @@ _bpftool()
                     ;;
                 attach|detach)
                     local ATTACH_TYPES='ingress egress sock_create sock_ops \
-                        device'
+                        device bind4 bind6 post_bind4 post_bind6 connect4 \
+                        connect6'
                     local ATTACH_FLAGS='multi override'
                     local PROG_TYPE='id pinned tag'
                     case $prev in
@@ -380,7 +381,8 @@ _bpftool()
                             _filedir
                             return 0
                             ;;
-                        ingress|egress|sock_create|sock_ops|device)
+                        ingress|egress|sock_create|sock_ops|device|bind4|bind6|\
+                        post_bind4|post_bind6|connect4|connect6)
                             COMPREPLY=( $( compgen -W "$PROG_TYPE" -- \
                                 "$cur" ) )
                             return 0
diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c
index cae32a61cb18..1351bd6b5aa7 100644
--- a/tools/bpf/bpftool/cgroup.c
+++ b/tools/bpf/bpftool/cgroup.c
@@ -16,8 +16,11 @@
 #define HELP_SPEC_ATTACH_FLAGS						\
 	"ATTACH_FLAGS := { multi | override }"
 
-#define HELP_SPEC_ATTACH_TYPES						\
-	"ATTACH_TYPE := { ingress | egress | sock_create | sock_ops | device }"
+#define HELP_SPEC_ATTACH_TYPES						       \
+	"       ATTACH_TYPE := { ingress | egress | sock_create |\n"	       \
+	"                        sock_ops | device | bind4 | bind6 |\n"	       \
+	"                        post_bind4 | post_bind6 | connect4 |\n"       \
+	"                        connect6 }"
 
 static const char * const attach_type_strings[] = {
 	[BPF_CGROUP_INET_INGRESS] = "ingress",
@@ -25,6 +28,12 @@ static const char * const attach_type_strings[] = {
 	[BPF_CGROUP_INET_SOCK_CREATE] = "sock_create",
 	[BPF_CGROUP_SOCK_OPS] = "sock_ops",
 	[BPF_CGROUP_DEVICE] = "device",
+	[BPF_CGROUP_INET4_BIND] = "bind4",
+	[BPF_CGROUP_INET6_BIND] = "bind6",
+	[BPF_CGROUP_INET4_CONNECT] = "connect4",
+	[BPF_CGROUP_INET6_CONNECT] = "connect6",
+	[BPF_CGROUP_INET4_POST_BIND] = "post_bind4",
+	[BPF_CGROUP_INET6_POST_BIND] = "post_bind6",
 	[__MAX_BPF_ATTACH_TYPE] = NULL,
 };
 
@@ -282,7 +291,7 @@ static int do_help(int argc, char **argv)
 		"       %s %s detach CGROUP ATTACH_TYPE PROG\n"
 		"       %s %s help\n"
 		"\n"
-		"       " HELP_SPEC_ATTACH_TYPES "\n"
+		HELP_SPEC_ATTACH_TYPES "\n"
 		"       " HELP_SPEC_ATTACH_FLAGS "\n"
 		"       " HELP_SPEC_PROGRAM "\n"
 		"       " HELP_SPEC_OPTIONS "\n"
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index f7a810897eac..548adb9b7317 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -68,6 +68,9 @@ static const char * const prog_type_name[] = {
 	[BPF_PROG_TYPE_SOCK_OPS]	= "sock_ops",
 	[BPF_PROG_TYPE_SK_SKB]		= "sk_skb",
 	[BPF_PROG_TYPE_CGROUP_DEVICE]	= "cgroup_device",
+	[BPF_PROG_TYPE_SK_MSG]		= "sk_msg",
+	[BPF_PROG_TYPE_RAW_TRACEPOINT]	= "raw_tracepoint",
+	[BPF_PROG_TYPE_CGROUP_SOCK_ADDR] = "cgroup_sock_addr",
 };
 
 static void print_boot_time(__u64 nsecs, char *buf, unsigned int size)
-- 
cgit v1.2.3-59-g8ed1b


From 81efee75c4a8210cac6eb791892f974dfcfc24b0 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Tue, 17 Apr 2018 10:28:45 -0700
Subject: libbpf: Support guessing post_bind{4,6} progs

libbpf can guess prog type and expected attach type based on section
name. Add hints for "cgroup/post_bind4" and "cgroup/post_bind6" section
names.

Existing "cgroup/sock" is not changed, i.e. expected_attach_type for it
is not set to `BPF_CGROUP_INET_SOCK_CREATE`, for backward compatibility.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/libbpf.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 5922443063f0..0fcc4474e1cc 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1859,6 +1859,9 @@ static void bpf_program__set_expected_attach_type(struct bpf_program *prog,
 
 #define BPF_PROG_SEC(string, ptype) BPF_PROG_SEC_FULL(string, ptype, 0)
 
+#define BPF_S_PROG_SEC(string, ptype) \
+	BPF_PROG_SEC_FULL(string, BPF_PROG_TYPE_CGROUP_SOCK, ptype)
+
 #define BPF_SA_PROG_SEC(string, ptype) \
 	BPF_PROG_SEC_FULL(string, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, ptype)
 
@@ -1889,10 +1892,13 @@ static const struct {
 	BPF_SA_PROG_SEC("cgroup/bind6",	BPF_CGROUP_INET6_BIND),
 	BPF_SA_PROG_SEC("cgroup/connect4", BPF_CGROUP_INET4_CONNECT),
 	BPF_SA_PROG_SEC("cgroup/connect6", BPF_CGROUP_INET6_CONNECT),
+	BPF_S_PROG_SEC("cgroup/post_bind4", BPF_CGROUP_INET4_POST_BIND),
+	BPF_S_PROG_SEC("cgroup/post_bind6", BPF_CGROUP_INET6_POST_BIND),
 };
 
 #undef BPF_PROG_SEC
 #undef BPF_PROG_SEC_FULL
+#undef BPF_S_PROG_SEC
 #undef BPF_SA_PROG_SEC
 
 static int bpf_program__identify_section(struct bpf_program *prog)
-- 
cgit v1.2.3-59-g8ed1b


From e14c93fd5be14369d7c0190f4c908a81bfeae922 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Tue, 17 Apr 2018 10:28:46 -0700
Subject: libbpf: Type functions for raw tracepoints

Add missing pieces for BPF_PROG_TYPE_RAW_TRACEPOINT in libbpf:
* is- and set- functions;
* support guessing prog type.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/libbpf.c | 2 ++
 tools/lib/bpf/libbpf.h | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 0fcc4474e1cc..3d35bacf656f 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1845,6 +1845,7 @@ BPF_PROG_TYPE_FNS(kprobe, BPF_PROG_TYPE_KPROBE);
 BPF_PROG_TYPE_FNS(sched_cls, BPF_PROG_TYPE_SCHED_CLS);
 BPF_PROG_TYPE_FNS(sched_act, BPF_PROG_TYPE_SCHED_ACT);
 BPF_PROG_TYPE_FNS(tracepoint, BPF_PROG_TYPE_TRACEPOINT);
+BPF_PROG_TYPE_FNS(raw_tracepoint, BPF_PROG_TYPE_RAW_TRACEPOINT);
 BPF_PROG_TYPE_FNS(xdp, BPF_PROG_TYPE_XDP);
 BPF_PROG_TYPE_FNS(perf_event, BPF_PROG_TYPE_PERF_EVENT);
 
@@ -1877,6 +1878,7 @@ static const struct {
 	BPF_PROG_SEC("classifier",	BPF_PROG_TYPE_SCHED_CLS),
 	BPF_PROG_SEC("action",		BPF_PROG_TYPE_SCHED_ACT),
 	BPF_PROG_SEC("tracepoint/",	BPF_PROG_TYPE_TRACEPOINT),
+	BPF_PROG_SEC("raw_tracepoint/",	BPF_PROG_TYPE_RAW_TRACEPOINT),
 	BPF_PROG_SEC("xdp",		BPF_PROG_TYPE_XDP),
 	BPF_PROG_SEC("perf_event",	BPF_PROG_TYPE_PERF_EVENT),
 	BPF_PROG_SEC("cgroup/skb",	BPF_PROG_TYPE_CGROUP_SKB),
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index a3a62a583f27..8b242486b464 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -185,6 +185,7 @@ int bpf_program__nth_fd(struct bpf_program *prog, int n);
  */
 int bpf_program__set_socket_filter(struct bpf_program *prog);
 int bpf_program__set_tracepoint(struct bpf_program *prog);
+int bpf_program__set_raw_tracepoint(struct bpf_program *prog);
 int bpf_program__set_kprobe(struct bpf_program *prog);
 int bpf_program__set_sched_cls(struct bpf_program *prog);
 int bpf_program__set_sched_act(struct bpf_program *prog);
@@ -194,6 +195,7 @@ void bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type);
 
 bool bpf_program__is_socket_filter(struct bpf_program *prog);
 bool bpf_program__is_tracepoint(struct bpf_program *prog);
+bool bpf_program__is_raw_tracepoint(struct bpf_program *prog);
 bool bpf_program__is_kprobe(struct bpf_program *prog);
 bool bpf_program__is_sched_cls(struct bpf_program *prog);
 bool bpf_program__is_sched_act(struct bpf_program *prog);
-- 
cgit v1.2.3-59-g8ed1b


From 97c33610acc596f318f3ec8d644ca70aeac30a7f Mon Sep 17 00:00:00 2001
From: Wang Sheng-Hui <shhuiw@foxmail.com>
Date: Tue, 17 Apr 2018 10:25:20 +0800
Subject: samples/bpf: correct comment in sock_example.c

The program run against loopback interace "lo", not "eth0".
Correct the comment.

Signed-off-by: Wang Sheng-Hui <shhuiw@foxmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/bpf/sock_example.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c
index 6fc6e193ef1b..33a637507c00 100644
--- a/samples/bpf/sock_example.c
+++ b/samples/bpf/sock_example.c
@@ -9,10 +9,10 @@
  *   if (value)
  *        (*(u64*)value) += 1;
  *
- * - attaches this program to eth0 raw socket
+ * - attaches this program to loopback interface "lo" raw socket
  *
  * - every second user space reads map[tcp], map[udp], map[icmp] to see
- *   how many packets of given protocol were seen on eth0
+ *   how many packets of given protocol were seen on "lo"
  */
 #include <stdio.h>
 #include <unistd.h>
-- 
cgit v1.2.3-59-g8ed1b


From 8de0e8ba973f710346f61e52b86df199b20d23b8 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 17 Apr 2018 16:08:06 +0200
Subject: samples/bpf: fix xdp_monitor user output for tracepoint exception

The variable rec_i contains an XDP action code not an error.
Thus, using err2str() was wrong, it should have been action2str().

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/bpf/xdp_monitor_user.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c
index eec14520d513..894bc64c2cac 100644
--- a/samples/bpf/xdp_monitor_user.c
+++ b/samples/bpf/xdp_monitor_user.c
@@ -330,7 +330,7 @@ static void stats_print(struct stats_record *stats_rec,
 			pps = calc_pps_u64(r, p, t);
 			if (pps > 0)
 				printf(fmt1, "Exception", i,
-				       0.0, pps, err2str(rec_i));
+				       0.0, pps, action2str(rec_i));
 		}
 		pps = calc_pps_u64(&rec->total, &prev->total, t);
 		if (pps > 0)
-- 
cgit v1.2.3-59-g8ed1b


From 0c90f2243ec67eeacf9624ae52ab43c734fe0e93 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Tue, 17 Apr 2018 19:46:34 -0700
Subject: tools: bpftool: make it easier to feed hex bytes to bpftool

bpftool uses hexadecimal values when it dumps map contents:

    # bpftool map dump id 1337
    key: ff 13 37 ff  value: a1 b2 c3 d4 ff ff ff ff
    Found 1 element

In order to lookup or update values with bpftool, the natural reflex is
then to copy and paste the values to the command line, and to try to run
something like:

    # bpftool map update id 1337 key ff 13 37 ff \
            value 00 00 00 00 00 00 1a 2b
    Error: error parsing byte: ff

bpftool complains, because it uses strtoul() with a 0 base to parse the
bytes, and that without a "0x" prefix, the bytes are considered as
decimal values (or even octal if they start with "0").

To feed hexadecimal values instead, one needs to add "0x" prefixes
everywhere necessary:

    # bpftool map update id 1337 key 0xff 0x13 0x37 0xff \
            value 0 0 0 0 0 0 0x1a 0x2b

To make it easier to use hexadecimal values, add an optional "hex"
keyword to put after "key" or "value" to tell bpftool to consider the
digits as hexadecimal. We can now do:

    # bpftool map update id 1337 key hex ff 13 37 ff \
            value hex 0 0 0 0 0 0 1a 2b

Without the "hex" keyword, the bytes are still parsed according to
normal integer notation (decimal if no prefix, or hexadecimal or octal
if "0x" or "0" prefix is used, respectively).

The patch also add related documentation and bash completion for the
"hex" keyword.

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Suggested-by: David Beckett <david.beckett@netronome.com>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/Documentation/bpftool-map.rst | 29 +++++++++++++++++--------
 tools/bpf/bpftool/bash-completion/bpftool       |  8 ++++---
 tools/bpf/bpftool/map.c                         | 17 ++++++++++-----
 3 files changed, 36 insertions(+), 18 deletions(-)

diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst
index 457e868bd32f..5f512b14bff9 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
@@ -23,10 +23,10 @@ MAP COMMANDS
 
 |	**bpftool** **map { show | list }**   [*MAP*]
 |	**bpftool** **map dump**    *MAP*
-|	**bpftool** **map update**  *MAP*  **key** *BYTES*   **value** *VALUE* [*UPDATE_FLAGS*]
-|	**bpftool** **map lookup**  *MAP*  **key** *BYTES*
-|	**bpftool** **map getnext** *MAP* [**key** *BYTES*]
-|	**bpftool** **map delete**  *MAP*  **key** *BYTES*
+|	**bpftool** **map update**  *MAP*  **key** [**hex**] *BYTES*   **value** [**hex**] *VALUE* [*UPDATE_FLAGS*]
+|	**bpftool** **map lookup**  *MAP*  **key** [**hex**] *BYTES*
+|	**bpftool** **map getnext** *MAP* [**key** [**hex**] *BYTES*]
+|	**bpftool** **map delete**  *MAP*  **key** [**hex**] *BYTES*
 |	**bpftool** **map pin**     *MAP*  *FILE*
 |	**bpftool** **map help**
 |
@@ -48,20 +48,26 @@ DESCRIPTION
 	**bpftool map dump**    *MAP*
 		  Dump all entries in a given *MAP*.
 
-	**bpftool map update**  *MAP*  **key** *BYTES*   **value** *VALUE* [*UPDATE_FLAGS*]
+	**bpftool map update**  *MAP*  **key** [**hex**] *BYTES*   **value** [**hex**] *VALUE* [*UPDATE_FLAGS*]
 		  Update map entry for a given *KEY*.
 
 		  *UPDATE_FLAGS* can be one of: **any** update existing entry
 		  or add if doesn't exit; **exist** update only if entry already
 		  exists; **noexist** update only if entry doesn't exist.
 
-	**bpftool map lookup**  *MAP*  **key** *BYTES*
+		  If the **hex** keyword is provided in front of the bytes
+		  sequence, the bytes are parsed as hexadeximal values, even if
+		  no "0x" prefix is added. If the keyword is not provided, then
+		  the bytes are parsed as decimal values, unless a "0x" prefix
+		  (for hexadecimal) or a "0" prefix (for octal) is provided.
+
+	**bpftool map lookup**  *MAP*  **key** [**hex**] *BYTES*
 		  Lookup **key** in the map.
 
-	**bpftool map getnext** *MAP* [**key** *BYTES*]
+	**bpftool map getnext** *MAP* [**key** [**hex**] *BYTES*]
 		  Get next key.  If *key* is not specified, get first key.
 
-	**bpftool map delete**  *MAP*  **key** *BYTES*
+	**bpftool map delete**  *MAP*  **key** [**hex**] *BYTES*
 		  Remove entry from the map.
 
 	**bpftool map pin**     *MAP*  *FILE*
@@ -98,7 +104,12 @@ EXAMPLES
   10: hash  name some_map  flags 0x0
 	key 4B  value 8B  max_entries 2048  memlock 167936B
 
-**# bpftool map update id 10 key 13 00 07 00 value 02 00 00 00 01 02 03 04**
+The following three commands are equivalent:
+
+|
+| **# bpftool map update id 10 key hex   20   c4   b7   00 value hex   0f   ff   ff   ab   01   02   03   4c**
+| **# bpftool map update id 10 key     0x20 0xc4 0xb7 0x00 value     0x0f 0xff 0xff 0xab 0x01 0x02 0x03 0x4c**
+| **# bpftool map update id 10 key       32  196  183    0 value       15  255  255  171    1    2    3   76**
 
 **# bpftool map lookup id 10 key 0 1 2 3**
 
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index c01b2deb1c77..852d84a98acd 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -147,7 +147,7 @@ _bpftool()
 
     # Deal with simplest keywords
     case $prev in
-        help|key|opcodes|visual)
+        help|hex|opcodes|visual)
             return 0
             ;;
         tag)
@@ -283,7 +283,7 @@ _bpftool()
                             return 0
                             ;;
                         key)
-                            return 0
+                            COMPREPLY+=( $( compgen -W 'hex' -- "$cur" ) )
                             ;;
                         *)
                             _bpftool_once_attr 'key'
@@ -302,7 +302,7 @@ _bpftool()
                             return 0
                             ;;
                         key)
-                            return 0
+                            COMPREPLY+=( $( compgen -W 'hex' -- "$cur" ) )
                             ;;
                         value)
                             # We can have bytes, or references to a prog or a
@@ -321,6 +321,8 @@ _bpftool()
                                     return 0
                                     ;;
                                 *)
+                                    COMPREPLY+=( $( compgen -W 'hex' \
+                                        -- "$cur" ) )
                                     return 0
                                     ;;
                             esac
diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index f509c86faede..a6cdb640a0d7 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -283,11 +283,16 @@ static void print_entry_plain(struct bpf_map_info *info, unsigned char *key,
 static char **parse_bytes(char **argv, const char *name, unsigned char *val,
 			  unsigned int n)
 {
-	unsigned int i = 0;
+	unsigned int i = 0, base = 0;
 	char *endptr;
 
+	if (is_prefix(*argv, "hex")) {
+		base = 16;
+		argv++;
+	}
+
 	while (i < n && argv[i]) {
-		val[i] = strtoul(argv[i], &endptr, 0);
+		val[i] = strtoul(argv[i], &endptr, base);
 		if (*endptr) {
 			p_err("error parsing byte: %s", argv[i]);
 			return NULL;
@@ -869,10 +874,10 @@ static int do_help(int argc, char **argv)
 	fprintf(stderr,
 		"Usage: %s %s { show | list }   [MAP]\n"
 		"       %s %s dump    MAP\n"
-		"       %s %s update  MAP  key BYTES value VALUE [UPDATE_FLAGS]\n"
-		"       %s %s lookup  MAP  key BYTES\n"
-		"       %s %s getnext MAP [key BYTES]\n"
-		"       %s %s delete  MAP  key BYTES\n"
+		"       %s %s update  MAP  key [hex] BYTES value [hex] VALUE [UPDATE_FLAGS]\n"
+		"       %s %s lookup  MAP  key [hex] BYTES\n"
+		"       %s %s getnext MAP [key [hex] BYTES]\n"
+		"       %s %s delete  MAP  key [hex] BYTES\n"
 		"       %s %s pin     MAP  FILE\n"
 		"       %s %s help\n"
 		"\n"
-- 
cgit v1.2.3-59-g8ed1b


From b32cc5b9a346319c171e3ad905e0cddda032b5eb Mon Sep 17 00:00:00 2001
From: "Nikita V. Shirokov" <tehnerd@tehnerd.com>
Date: Tue, 17 Apr 2018 21:42:13 -0700
Subject: bpf: adding bpf_xdp_adjust_tail helper

Adding new bpf helper which would allow us to manipulate
xdp's data_end pointer, and allow us to reduce packet's size
indended use case: to generate ICMP messages from XDP context,
where such message would contain truncated original packet.

Signed-off-by: Nikita V. Shirokov <tehnerd@tehnerd.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 10 +++++++++-
 net/core/filter.c        | 29 ++++++++++++++++++++++++++++-
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c5ec89732a8d..9a2d1a04eb24 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -755,6 +755,13 @@ union bpf_attr {
  *     @addr: pointer to struct sockaddr to bind socket to
  *     @addr_len: length of sockaddr structure
  *     Return: 0 on success or negative error code
+ *
+ * int bpf_xdp_adjust_tail(xdp_md, delta)
+ *     Adjust the xdp_md.data_end by delta. Only shrinking of packet's
+ *     size is supported.
+ *     @xdp_md: pointer to xdp_md
+ *     @delta: A negative integer to be added to xdp_md.data_end
+ *     Return: 0 on success or negative on error
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -821,7 +828,8 @@ union bpf_attr {
 	FN(msg_apply_bytes),		\
 	FN(msg_cork_bytes),		\
 	FN(msg_pull_data),		\
-	FN(bind),
+	FN(bind),			\
+	FN(xdp_adjust_tail),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index a374b8560bc4..29318598fd60 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2725,6 +2725,30 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
+{
+	void *data_end = xdp->data_end + offset;
+
+	/* only shrinking is allowed for now. */
+	if (unlikely(offset >= 0))
+		return -EINVAL;
+
+	if (unlikely(data_end < xdp->data + ETH_HLEN))
+		return -EINVAL;
+
+	xdp->data_end = data_end;
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = {
+	.func		= bpf_xdp_adjust_tail,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
 {
 	void *meta = xdp->data_meta + offset;
@@ -3074,7 +3098,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 	    func == bpf_l4_csum_replace ||
 	    func == bpf_xdp_adjust_head ||
 	    func == bpf_xdp_adjust_meta ||
-	    func == bpf_msg_pull_data)
+	    func == bpf_msg_pull_data ||
+	    func == bpf_xdp_adjust_tail)
 		return true;
 
 	return false;
@@ -3888,6 +3913,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_xdp_redirect_proto;
 	case BPF_FUNC_redirect_map:
 		return &bpf_xdp_redirect_map_proto;
+	case BPF_FUNC_xdp_adjust_tail:
+		return &bpf_xdp_adjust_tail_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 198d83bb3becf2e9c6c4fa744f35296c20da795a Mon Sep 17 00:00:00 2001
From: "Nikita V. Shirokov" <tehnerd@tehnerd.com>
Date: Tue, 17 Apr 2018 21:42:14 -0700
Subject: bpf: make generic xdp compatible w/ bpf_xdp_adjust_tail

w/ bpf_xdp_adjust_tail helper xdp's data_end pointer could be changed as
well (only "decrease" of pointer's location is going to be supported).
changing of this pointer will change packet's size.
for generic XDP we need to reflect this packet's length change by
adjusting skb's tail pointer

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Nikita V. Shirokov <tehnerd@tehnerd.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/dev.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 969462ebb296..11c789231a03 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3996,9 +3996,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 				     struct bpf_prog *xdp_prog)
 {
 	struct netdev_rx_queue *rxqueue;
+	void *orig_data, *orig_data_end;
 	u32 metalen, act = XDP_DROP;
 	struct xdp_buff xdp;
-	void *orig_data;
 	int hlen, off;
 	u32 mac_len;
 
@@ -4037,6 +4037,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 	xdp.data_meta = xdp.data;
 	xdp.data_end = xdp.data + hlen;
 	xdp.data_hard_start = skb->data - skb_headroom(skb);
+	orig_data_end = xdp.data_end;
 	orig_data = xdp.data;
 
 	rxqueue = netif_get_rxqueue(skb);
@@ -4051,6 +4052,13 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 		__skb_push(skb, -off);
 	skb->mac_header += off;
 
+	/* check if bpf_xdp_adjust_tail was used. it can only "shrink"
+	 * pckt.
+	 */
+	off = orig_data_end - xdp.data_end;
+	if (off != 0)
+		skb_set_tail_pointer(skb, xdp.data_end - xdp.data);
+
 	switch (act) {
 	case XDP_REDIRECT:
 	case XDP_TX:
-- 
cgit v1.2.3-59-g8ed1b


From e5e0a59b7cd17244cb0c1d87112b0c9e5e2bfb39 Mon Sep 17 00:00:00 2001
From: "Nikita V. Shirokov" <tehnerd@tehnerd.com>
Date: Tue, 17 Apr 2018 21:42:15 -0700
Subject: bpf: make mlx4 compatible w/ bpf_xdp_adjust_tail

w/ bpf_xdp_adjust_tail helper xdp's data_end pointer could be changed as
well (only "decrease" of pointer's location is going to be supported).
changing of this pointer will change packet's size.
for mlx4 driver we will just calculate packet's length unconditionally
(the same way as it's already being done in mlx5)

Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Nikita V. Shirokov <tehnerd@tehnerd.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 5c613c6663da..efc55feddc5c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -775,8 +775,8 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 
 			act = bpf_prog_run_xdp(xdp_prog, &xdp);
 
+			length = xdp.data_end - xdp.data;
 			if (xdp.data != orig_data) {
-				length = xdp.data_end - xdp.data;
 				frags[0].page_offset = xdp.data -
 					xdp.data_hard_start;
 				va = xdp.data;
-- 
cgit v1.2.3-59-g8ed1b


From b968e735c79767a3c91217fbae691581aa557d8d Mon Sep 17 00:00:00 2001
From: "Nikita V. Shirokov" <tehnerd@tehnerd.com>
Date: Tue, 17 Apr 2018 21:42:16 -0700
Subject: bpf: make bnxt compatible w/ bpf_xdp_adjust_tail

w/ bpf_xdp_adjust_tail helper xdp's data_end pointer could be changed as
well (only "decrease" of pointer's location is going to be supported).
changing of this pointer will change packet's size.
for bnxt driver we will just calculate packet's length unconditionally

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Nikita V. Shirokov <tehnerd@tehnerd.com>
Acked-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
index 1389ab5e05df..1f0e872d0667 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
@@ -113,10 +113,10 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons,
 	if (tx_avail != bp->tx_ring_size)
 		*event &= ~BNXT_RX_EVENT;
 
+	*len = xdp.data_end - xdp.data;
 	if (orig_data != xdp.data) {
 		offset = xdp.data - xdp.data_hard_start;
 		*data_ptr = xdp.data_hard_start + offset;
-		*len = xdp.data_end - xdp.data;
 	}
 	switch (act) {
 	case XDP_PASS:
-- 
cgit v1.2.3-59-g8ed1b


From a48ce00f9aeb6ce1227e291c36a0fa4995273144 Mon Sep 17 00:00:00 2001
From: "Nikita V. Shirokov" <tehnerd@tehnerd.com>
Date: Tue, 17 Apr 2018 21:42:17 -0700
Subject: bpf: make cavium thunder compatible w/ bpf_xdp_adjust_tail

w/ bpf_xdp_adjust_tail helper xdp's data_end pointer could be changed as
well (only "decrease" of pointer's location is going to be supported).
changing of this pointer will change packet's size.
for cavium's thunder driver we will just calculate packet's length
unconditionally

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Nikita V. Shirokov <tehnerd@tehnerd.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/cavium/thunder/nicvf_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index 707db3304396..7135db45927e 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -538,9 +538,9 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog,
 	action = bpf_prog_run_xdp(prog, &xdp);
 	rcu_read_unlock();
 
+	len = xdp.data_end - xdp.data;
 	/* Check if XDP program has changed headers */
 	if (orig_data != xdp.data) {
-		len = xdp.data_end - xdp.data;
 		offset = orig_data - xdp.data;
 		dma_addr -= offset;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 5a6a22e378244cb3aa0473abc31a5d1c2cb2327b Mon Sep 17 00:00:00 2001
From: "Nikita V. Shirokov" <tehnerd@tehnerd.com>
Date: Tue, 17 Apr 2018 21:42:18 -0700
Subject: bpf: make netronome nfp compatible w/ bpf_xdp_adjust_tail

w/ bpf_xdp_adjust_tail helper xdp's data_end pointer could be changed as
well (only "decrease" of pointer's location is going to be supported).
changing of this pointer will change packet's size.
for nfp driver we will just calculate packet's length unconditionally

Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Nikita V. Shirokov <tehnerd@tehnerd.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 1eb6549f2a54..d9111c077699 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1722,7 +1722,7 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget)
 
 			act = bpf_prog_run_xdp(xdp_prog, &xdp);
 
-			pkt_len -= xdp.data - orig_data;
+			pkt_len = xdp.data_end - xdp.data;
 			pkt_off += xdp.data - orig_data;
 
 			switch (act) {
-- 
cgit v1.2.3-59-g8ed1b


From 8fb58f1ecf5cbcc7805dd339243b7d9837cbabbe Mon Sep 17 00:00:00 2001
From: "Nikita V. Shirokov" <tehnerd@tehnerd.com>
Date: Tue, 17 Apr 2018 21:42:19 -0700
Subject: bpf: make tun compatible w/ bpf_xdp_adjust_tail

w/ bpf_xdp_adjust_tail helper xdp's data_end pointer could be changed as
well (only "decrease" of pointer's location is going to be supported).
changing of this pointer will change packet's size.
for tun driver we need to adjust XDP_PASS handling by recalculating
length of the packet if it was passed to the TCP/IP stack
(in case if after xdp's prog run data_end pointer was adjusted)

Reviewed-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Nikita V. Shirokov <tehnerd@tehnerd.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/tun.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 1e58be152d5c..901351a6ed21 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1696,6 +1696,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 			return NULL;
 		case XDP_PASS:
 			delta = orig_data - xdp.data;
+			len = xdp.data_end - xdp.data;
 			break;
 		default:
 			bpf_warn_invalid_xdp_action(act);
@@ -1716,7 +1717,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 	}
 
 	skb_reserve(skb, pad - delta);
-	skb_put(skb, len + delta);
+	skb_put(skb, len);
 	get_page(alloc_frag->page);
 	alloc_frag->offset += buflen;
 
-- 
cgit v1.2.3-59-g8ed1b


From 6870de435b90c083ae0f3f7f341287976ef56f03 Mon Sep 17 00:00:00 2001
From: "Nikita V. Shirokov" <tehnerd@tehnerd.com>
Date: Tue, 17 Apr 2018 21:42:20 -0700
Subject: bpf: make virtio compatible w/ bpf_xdp_adjust_tail

w/ bpf_xdp_adjust_tail helper xdp's data_end pointer could be changed as
well (only "decrease" of pointer's location is going to be supported).
changing of this pointer will change packet's size.
for virtio driver we need to adjust XDP_PASS handling by recalculating
length of the packet if it was passed to the TCP/IP stack

Reviewed-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Nikita V. Shirokov <tehnerd@tehnerd.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/virtio_net.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 01694e26f03e..779a4f798522 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -606,6 +606,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
 		case XDP_PASS:
 			/* Recalculate length in case bpf program changed it */
 			delta = orig_data - xdp.data;
+			len = xdp.data_end - xdp.data;
 			break;
 		case XDP_TX:
 			xdpf = convert_to_xdp_frame(&xdp);
@@ -642,7 +643,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
 		goto err;
 	}
 	skb_reserve(skb, headroom - delta);
-	skb_put(skb, len + delta);
+	skb_put(skb, len);
 	if (!delta) {
 		buf += header_offset;
 		memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
@@ -757,6 +758,10 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			offset = xdp.data -
 					page_address(xdp_page) - vi->hdr_len;
 
+			/* recalculate len if xdp.data or xdp.data_end were
+			 * adjusted
+			 */
+			len = xdp.data_end - xdp.data;
 			/* We can only create skb based on xdp_page. */
 			if (unlikely(xdp_page != page)) {
 				rcu_read_unlock();
-- 
cgit v1.2.3-59-g8ed1b


From 587b80cce95b8aab89e5e35033953c005dc47f01 Mon Sep 17 00:00:00 2001
From: "Nikita V. Shirokov" <tehnerd@tehnerd.com>
Date: Tue, 17 Apr 2018 21:42:21 -0700
Subject: bpf: making bpf_prog_test run aware of possible data_end ptr change

after introduction of bpf_xdp_adjust_tail helper packet length
could be changed not only if xdp->data pointer has been changed
but xdp->data_end as well. making bpf_prog_test_run aware of this
possibility

Signed-off-by: Nikita V. Shirokov <tehnerd@tehnerd.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/bpf/test_run.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 2ced48662c1f..68c3578343b4 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -170,7 +170,8 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
 	xdp.rxq = &rxqueue->xdp_rxq;
 
 	retval = bpf_test_run(prog, &xdp, repeat, &duration);
-	if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN)
+	if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN ||
+	    xdp.data_end != xdp.data + size)
 		size = xdp.data_end - xdp.data;
 	ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration);
 	kfree(data);
-- 
cgit v1.2.3-59-g8ed1b


From 0367d0a29427d5916b98cf31dfc85a8293540614 Mon Sep 17 00:00:00 2001
From: "Nikita V. Shirokov" <tehnerd@tehnerd.com>
Date: Tue, 17 Apr 2018 21:42:22 -0700
Subject: bpf: adding tests for bpf_xdp_adjust_tail

adding selftests for bpf_xdp_adjust_tail helper. in this synthetic test
we are testing that 1) if data_end < data helper will return EINVAL
2) for normal use case packet's length would be reduced.

Signed-off-by: Nikita V. Shirokov <tehnerd@tehnerd.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/include/uapi/linux/bpf.h                 | 10 +++++++-
 tools/testing/selftests/bpf/Makefile           |  2 +-
 tools/testing/selftests/bpf/bpf_helpers.h      |  3 +++
 tools/testing/selftests/bpf/test_adjust_tail.c | 30 ++++++++++++++++++++++++
 tools/testing/selftests/bpf/test_progs.c       | 32 ++++++++++++++++++++++++++
 5 files changed, 75 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_adjust_tail.c

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 9d07465023a2..56bf493ba7ed 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -755,6 +755,13 @@ union bpf_attr {
  *     @addr: pointer to struct sockaddr to bind socket to
  *     @addr_len: length of sockaddr structure
  *     Return: 0 on success or negative error code
+ *
+ * int bpf_xdp_adjust_tail(xdp_md, delta)
+ *     Adjust the xdp_md.data_end by delta. Only shrinking of packet's
+ *     size is supported.
+ *     @xdp_md: pointer to xdp_md
+ *     @delta: A negative integer to be added to xdp_md.data_end
+ *     Return: 0 on success or negative on error
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -821,7 +828,8 @@ union bpf_attr {
 	FN(msg_apply_bytes),		\
 	FN(msg_cork_bytes),		\
 	FN(msg_pull_data),		\
-	FN(bind),
+	FN(bind),			\
+	FN(xdp_adjust_tail),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 0a315ddabbf4..3e819dc70bee 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -31,7 +31,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \
 	test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \
 	sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \
-	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o
+	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index d8223d99f96d..50c607014b22 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -96,6 +96,9 @@ static int (*bpf_msg_pull_data)(void *ctx, int start, int end, int flags) =
 	(void *) BPF_FUNC_msg_pull_data;
 static int (*bpf_bind)(void *ctx, void *addr, int addr_len) =
 	(void *) BPF_FUNC_bind;
+static int (*bpf_xdp_adjust_tail)(void *ctx, int offset) =
+	(void *) BPF_FUNC_xdp_adjust_tail;
+
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/tools/testing/selftests/bpf/test_adjust_tail.c b/tools/testing/selftests/bpf/test_adjust_tail.c
new file mode 100644
index 000000000000..4cd5e860c903
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_adjust_tail.c
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2018 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include "bpf_helpers.h"
+
+int _version SEC("version") = 1;
+
+SEC("xdp_adjust_tail")
+int _xdp_adjust_tail(struct xdp_md *xdp)
+{
+	void *data_end = (void *)(long)xdp->data_end;
+	void *data = (void *)(long)xdp->data;
+	int offset = 0;
+
+	if (data_end - data == 54)
+		offset = 256;
+	else
+		offset = 20;
+	if (bpf_xdp_adjust_tail(xdp, 0 - offset))
+		return XDP_DROP;
+	return XDP_TX;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index faadbe233966..eedda98d7bb1 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -166,6 +166,37 @@ out:
 	bpf_object__close(obj);
 }
 
+static void test_xdp_adjust_tail(void)
+{
+	const char *file = "./test_adjust_tail.o";
+	struct bpf_object *obj;
+	char buf[128];
+	__u32 duration, retval, size;
+	int err, prog_fd;
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
+	if (err) {
+		error_cnt++;
+		return;
+	}
+
+	err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
+				buf, &size, &retval, &duration);
+
+	CHECK(err || errno || retval != XDP_DROP,
+	      "ipv4", "err %d errno %d retval %d size %d\n",
+	      err, errno, retval, size);
+
+	err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6),
+				buf, &size, &retval, &duration);
+	CHECK(err || errno || retval != XDP_TX || size != 54,
+	      "ipv6", "err %d errno %d retval %d size %d\n",
+	      err, errno, retval, size);
+	bpf_object__close(obj);
+}
+
+
+
 #define MAGIC_VAL 0x1234
 #define NUM_ITER 100000
 #define VIP_NUM 5
@@ -1177,6 +1208,7 @@ int main(void)
 {
 	test_pkt_access();
 	test_xdp();
+	test_xdp_adjust_tail();
 	test_l4lb_all();
 	test_xdp_noinline();
 	test_tcp_estats();
-- 
cgit v1.2.3-59-g8ed1b


From c6ffd1ff785675c4a572c79f0e55ba5735edbaa0 Mon Sep 17 00:00:00 2001
From: "Nikita V. Shirokov" <tehnerd@tehnerd.com>
Date: Tue, 17 Apr 2018 21:42:23 -0700
Subject: bpf: add bpf_xdp_adjust_tail sample prog

adding bpf's sample program which is using bpf_xdp_adjust_tail helper
by generating ICMPv4 "packet to big" message if ingress packet's size is
bigger then 600 bytes

Signed-off-by: Nikita V. Shirokov <tehnerd@tehnerd.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/bpf/Makefile                      |   4 +
 samples/bpf/xdp_adjust_tail_kern.c        | 152 ++++++++++++++++++++++++++++++
 samples/bpf/xdp_adjust_tail_user.c        | 142 ++++++++++++++++++++++++++++
 tools/testing/selftests/bpf/bpf_helpers.h |   2 +
 4 files changed, 300 insertions(+)
 create mode 100644 samples/bpf/xdp_adjust_tail_kern.c
 create mode 100644 samples/bpf/xdp_adjust_tail_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 4d6a6edd4bf6..aa8c392e2e52 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -44,6 +44,7 @@ hostprogs-y += xdp_monitor
 hostprogs-y += xdp_rxq_info
 hostprogs-y += syscall_tp
 hostprogs-y += cpustat
+hostprogs-y += xdp_adjust_tail
 
 # Libbpf dependencies
 LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
@@ -95,6 +96,7 @@ xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o
 xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o
 syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
 cpustat-objs := bpf_load.o $(LIBBPF) cpustat_user.o
+xdp_adjust_tail-objs := bpf_load.o $(LIBBPF) xdp_adjust_tail_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -148,6 +150,7 @@ always += xdp_rxq_info_kern.o
 always += xdp2skb_meta_kern.o
 always += syscall_tp_kern.o
 always += cpustat_kern.o
+always += xdp_adjust_tail_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -193,6 +196,7 @@ HOSTLOADLIBES_xdp_monitor += -lelf
 HOSTLOADLIBES_xdp_rxq_info += -lelf
 HOSTLOADLIBES_syscall_tp += -lelf
 HOSTLOADLIBES_cpustat += -lelf
+HOSTLOADLIBES_xdp_adjust_tail += -lelf
 
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
 #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/xdp_adjust_tail_kern.c b/samples/bpf/xdp_adjust_tail_kern.c
new file mode 100644
index 000000000000..411fdb21f8bc
--- /dev/null
+++ b/samples/bpf/xdp_adjust_tail_kern.c
@@ -0,0 +1,152 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2018 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program shows how to use bpf_xdp_adjust_tail() by
+ * generating ICMPv4 "packet to big" (unreachable/ df bit set frag needed
+ * to be more preice in case of v4)" where receiving packets bigger then
+ * 600 bytes.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include "bpf_helpers.h"
+
+#define DEFAULT_TTL 64
+#define MAX_PCKT_SIZE 600
+#define ICMP_TOOBIG_SIZE 98
+#define ICMP_TOOBIG_PAYLOAD_SIZE 92
+
+struct bpf_map_def SEC("maps") icmpcnt = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(__u64),
+	.max_entries = 1,
+};
+
+static __always_inline void count_icmp(void)
+{
+	u64 key = 0;
+	u64 *icmp_count;
+
+	icmp_count = bpf_map_lookup_elem(&icmpcnt, &key);
+	if (icmp_count)
+		*icmp_count += 1;
+}
+
+static __always_inline void swap_mac(void *data, struct ethhdr *orig_eth)
+{
+	struct ethhdr *eth;
+
+	eth = data;
+	memcpy(eth->h_source, orig_eth->h_dest, ETH_ALEN);
+	memcpy(eth->h_dest, orig_eth->h_source, ETH_ALEN);
+	eth->h_proto = orig_eth->h_proto;
+}
+
+static __always_inline __u16 csum_fold_helper(__u32 csum)
+{
+	return ~((csum & 0xffff) + (csum >> 16));
+}
+
+static __always_inline void ipv4_csum(void *data_start, int data_size,
+				      __u32 *csum)
+{
+	*csum = bpf_csum_diff(0, 0, data_start, data_size, *csum);
+	*csum = csum_fold_helper(*csum);
+}
+
+static __always_inline int send_icmp4_too_big(struct xdp_md *xdp)
+{
+	int headroom = (int)sizeof(struct iphdr) + (int)sizeof(struct icmphdr);
+
+	if (bpf_xdp_adjust_head(xdp, 0 - headroom))
+		return XDP_DROP;
+	void *data = (void *)(long)xdp->data;
+	void *data_end = (void *)(long)xdp->data_end;
+
+	if (data + (ICMP_TOOBIG_SIZE + headroom) > data_end)
+		return XDP_DROP;
+
+	struct iphdr *iph, *orig_iph;
+	struct icmphdr *icmp_hdr;
+	struct ethhdr *orig_eth;
+	__u32 csum = 0;
+	__u64 off = 0;
+
+	orig_eth = data + headroom;
+	swap_mac(data, orig_eth);
+	off += sizeof(struct ethhdr);
+	iph = data + off;
+	off += sizeof(struct iphdr);
+	icmp_hdr = data + off;
+	off += sizeof(struct icmphdr);
+	orig_iph = data + off;
+	icmp_hdr->type = ICMP_DEST_UNREACH;
+	icmp_hdr->code = ICMP_FRAG_NEEDED;
+	icmp_hdr->un.frag.mtu = htons(MAX_PCKT_SIZE-sizeof(struct ethhdr));
+	icmp_hdr->checksum = 0;
+	ipv4_csum(icmp_hdr, ICMP_TOOBIG_PAYLOAD_SIZE, &csum);
+	icmp_hdr->checksum = csum;
+	iph->ttl = DEFAULT_TTL;
+	iph->daddr = orig_iph->saddr;
+	iph->saddr = orig_iph->daddr;
+	iph->version = 4;
+	iph->ihl = 5;
+	iph->protocol = IPPROTO_ICMP;
+	iph->tos = 0;
+	iph->tot_len = htons(
+		ICMP_TOOBIG_SIZE + headroom - sizeof(struct ethhdr));
+	iph->check = 0;
+	csum = 0;
+	ipv4_csum(iph, sizeof(struct iphdr), &csum);
+	iph->check = csum;
+	count_icmp();
+	return XDP_TX;
+}
+
+
+static __always_inline int handle_ipv4(struct xdp_md *xdp)
+{
+	void *data_end = (void *)(long)xdp->data_end;
+	void *data = (void *)(long)xdp->data;
+	int pckt_size = data_end - data;
+	int offset;
+
+	if (pckt_size > MAX_PCKT_SIZE) {
+		offset = pckt_size - ICMP_TOOBIG_SIZE;
+		if (bpf_xdp_adjust_tail(xdp, 0 - offset))
+			return XDP_PASS;
+		return send_icmp4_too_big(xdp);
+	}
+	return XDP_PASS;
+}
+
+SEC("xdp_icmp")
+int _xdp_icmp(struct xdp_md *xdp)
+{
+	void *data_end = (void *)(long)xdp->data_end;
+	void *data = (void *)(long)xdp->data;
+	struct ethhdr *eth = data;
+	__u16 h_proto;
+
+	if (eth + 1 > data_end)
+		return XDP_DROP;
+
+	h_proto = eth->h_proto;
+
+	if (h_proto == htons(ETH_P_IP))
+		return handle_ipv4(xdp);
+	else
+		return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_adjust_tail_user.c b/samples/bpf/xdp_adjust_tail_user.c
new file mode 100644
index 000000000000..f621a541b574
--- /dev/null
+++ b/samples/bpf/xdp_adjust_tail_user.c
@@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2018 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <arpa/inet.h>
+#include <netinet/ether.h>
+#include <unistd.h>
+#include <time.h>
+#include "bpf_load.h"
+#include "libbpf.h"
+#include "bpf_util.h"
+
+#define STATS_INTERVAL_S 2U
+
+static int ifindex = -1;
+static __u32 xdp_flags;
+
+static void int_exit(int sig)
+{
+	if (ifindex > -1)
+		bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
+	exit(0);
+}
+
+/* simple "icmp packet too big sent" counter
+ */
+static void poll_stats(unsigned int kill_after_s)
+{
+	time_t started_at = time(NULL);
+	__u64 value = 0;
+	int key = 0;
+
+
+	while (!kill_after_s || time(NULL) - started_at <= kill_after_s) {
+		sleep(STATS_INTERVAL_S);
+
+		assert(bpf_map_lookup_elem(map_fd[0], &key, &value) == 0);
+
+		printf("icmp \"packet too big\" sent: %10llu pkts\n", value);
+	}
+}
+
+static void usage(const char *cmd)
+{
+	printf("Start a XDP prog which send ICMP \"packet too big\" \n"
+		"messages if ingress packet is bigger then MAX_SIZE bytes\n");
+	printf("Usage: %s [...]\n", cmd);
+	printf("    -i <ifindex> Interface Index\n");
+	printf("    -T <stop-after-X-seconds> Default: 0 (forever)\n");
+	printf("    -S use skb-mode\n");
+	printf("    -N enforce native mode\n");
+	printf("    -h Display this help\n");
+}
+
+int main(int argc, char **argv)
+{
+	unsigned char opt_flags[256] = {};
+	unsigned int kill_after_s = 0;
+	const char *optstr = "i:T:SNh";
+	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+	char filename[256];
+	int opt;
+	int i;
+
+
+	for (i = 0; i < strlen(optstr); i++)
+		if (optstr[i] != 'h' && 'a' <= optstr[i] && optstr[i] <= 'z')
+			opt_flags[(unsigned char)optstr[i]] = 1;
+
+	while ((opt = getopt(argc, argv, optstr)) != -1) {
+
+		switch (opt) {
+		case 'i':
+			ifindex = atoi(optarg);
+			break;
+		case 'T':
+			kill_after_s = atoi(optarg);
+			break;
+		case 'S':
+			xdp_flags |= XDP_FLAGS_SKB_MODE;
+			break;
+		case 'N':
+			xdp_flags |= XDP_FLAGS_DRV_MODE;
+			break;
+		default:
+			usage(argv[0]);
+			return 1;
+		}
+		opt_flags[opt] = 0;
+	}
+
+	for (i = 0; i < strlen(optstr); i++) {
+		if (opt_flags[(unsigned int)optstr[i]]) {
+			fprintf(stderr, "Missing argument -%c\n", optstr[i]);
+			usage(argv[0]);
+			return 1;
+		}
+	}
+
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)");
+		return 1;
+	}
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	if (!prog_fd[0]) {
+		printf("load_bpf_file: %s\n", strerror(errno));
+		return 1;
+	}
+
+	signal(SIGINT, int_exit);
+	signal(SIGTERM, int_exit);
+
+	if (bpf_set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) {
+		printf("link set xdp fd failed\n");
+		return 1;
+	}
+
+	poll_stats(kill_after_s);
+
+	bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 50c607014b22..9271576bdc8f 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -132,6 +132,8 @@ static int (*bpf_l3_csum_replace)(void *ctx, int off, int from, int to, int flag
 	(void *) BPF_FUNC_l3_csum_replace;
 static int (*bpf_l4_csum_replace)(void *ctx, int off, int from, int to, int flags) =
 	(void *) BPF_FUNC_l4_csum_replace;
+static int (*bpf_csum_diff)(void *from, int from_size, void *to, int to_size, int seed) =
+	(void *) BPF_FUNC_csum_diff;
 static int (*bpf_skb_under_cgroup)(void *ctx, void *map, int index) =
 	(void *) BPF_FUNC_skb_under_cgroup;
 static int (*bpf_skb_change_head)(void *, int len, int flags) =
-- 
cgit v1.2.3-59-g8ed1b


From 350601b4f7ab45a3ef39575acc21d6b7a69f724b Mon Sep 17 00:00:00 2001
From: Murali Karicheri <m-karicheri2@ti.com>
Date: Tue, 17 Apr 2018 17:30:30 -0400
Subject: soc: ti: K2G: enhancement to support QMSS in K2G NAVSS

Navigator Subsystem (NAVSS) available on K2G SoC has a cut down
version of QMSS with less number of queues, internal linking ram
with lesser number of buffers etc.  It doesn't have status and
explicit push register space as in QMSS available on other K2 SoCs.
So define reg indices specific to QMSS on K2G. This patch introduces
"ti,66ak2g-navss-qm" compatibility to identify QMSS on K2G NAVSS
and to customize the dts handling code. Per Device manual,
descriptors with index less than or equal to regions0_size is in region 0
in the case of K2 QMSS where as for QMSS on K2G, descriptors with index
less than regions0_size is in region 0. So update the size accordingly in
the regions0_size bits of the linking ram size 0 register.

Signed-off-by: Murali Karicheri <m-karicheri2@ti.com>
Signed-off-by: WingMan Kwok <w-kwok2@ti.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../bindings/soc/ti/keystone-navigator-qmss.txt    |  9 ++-
 drivers/soc/ti/knav_qmss.h                         |  6 ++
 drivers/soc/ti/knav_qmss_queue.c                   | 90 ++++++++++++++++------
 3 files changed, 82 insertions(+), 23 deletions(-)

diff --git a/Documentation/devicetree/bindings/soc/ti/keystone-navigator-qmss.txt b/Documentation/devicetree/bindings/soc/ti/keystone-navigator-qmss.txt
index 77cd42cc5f54..b025770eeb92 100644
--- a/Documentation/devicetree/bindings/soc/ti/keystone-navigator-qmss.txt
+++ b/Documentation/devicetree/bindings/soc/ti/keystone-navigator-qmss.txt
@@ -17,7 +17,8 @@ pool management.
 
 
 Required properties:
-- compatible	: Must be "ti,keystone-navigator-qmss";
+- compatible	: Must be "ti,keystone-navigator-qmss".
+		: Must be "ti,66ak2g-navss-qm" for QMSS on K2G SoC.
 - clocks	: phandle to the reference clock for this device.
 - queue-range	: <start number> total range of queue numbers for the device.
 - linkram0	: <address size> for internal link ram, where size is the total
@@ -39,6 +40,12 @@ Required properties:
 			  - Descriptor memory setup region.
 			  - Queue Management/Queue Proxy region for queue Push.
 			  - Queue Management/Queue Proxy region for queue Pop.
+
+For QMSS on K2G SoC, following QM reg indexes are used in that order
+			  - Queue Peek region.
+			  - Queue configuration region.
+			  - Queue Management/Queue Proxy region for queue Push/Pop.
+
 - queue-pools	: child node classifying the queue ranges into pools.
 		  Queue ranges are grouped into 3 type of pools:
 		  - qpend	    : pool of qpend(interruptible) queues
diff --git a/drivers/soc/ti/knav_qmss.h b/drivers/soc/ti/knav_qmss.h
index 905b974d1bdc..56866ba4cfc4 100644
--- a/drivers/soc/ti/knav_qmss.h
+++ b/drivers/soc/ti/knav_qmss.h
@@ -292,6 +292,11 @@ struct knav_queue {
 	struct list_head		list;
 };
 
+enum qmss_version {
+	QMSS,
+	QMSS_66AK2G,
+};
+
 struct knav_device {
 	struct device				*dev;
 	unsigned				base_id;
@@ -305,6 +310,7 @@ struct knav_device {
 	struct list_head			pools;
 	struct list_head			pdsps;
 	struct list_head			qmgrs;
+	enum qmss_version			version;
 };
 
 struct knav_range_ops {
diff --git a/drivers/soc/ti/knav_qmss_queue.c b/drivers/soc/ti/knav_qmss_queue.c
index 77d6b5c03aae..8526c8ed3af2 100644
--- a/drivers/soc/ti/knav_qmss_queue.c
+++ b/drivers/soc/ti/knav_qmss_queue.c
@@ -42,6 +42,15 @@ static DEFINE_MUTEX(knav_dev_lock);
 #define KNAV_QUEUE_PUSH_REG_INDEX	4
 #define KNAV_QUEUE_POP_REG_INDEX	5
 
+/* Queue manager register indices in DTS for QMSS in K2G NAVSS.
+ * There are no status and vbusm push registers on this version
+ * of QMSS. Push registers are same as pop, So all indices above 1
+ * are to be re-defined
+ */
+#define KNAV_L_QUEUE_CONFIG_REG_INDEX	1
+#define KNAV_L_QUEUE_REGION_REG_INDEX	2
+#define KNAV_L_QUEUE_PUSH_REG_INDEX	3
+
 /* PDSP register indices in DTS */
 #define KNAV_QUEUE_PDSP_IRAM_REG_INDEX	0
 #define KNAV_QUEUE_PDSP_REGS_REG_INDEX	1
@@ -1169,8 +1178,12 @@ static int knav_queue_setup_link_ram(struct knav_device *kdev)
 		dev_dbg(kdev->dev, "linkram0: dma:%pad, virt:%p, size:%x\n",
 			&block->dma, block->virt, block->size);
 		writel_relaxed((u32)block->dma, &qmgr->reg_config->link_ram_base0);
-		writel_relaxed(block->size, &qmgr->reg_config->link_ram_size0);
-
+		if (kdev->version == QMSS_66AK2G)
+			writel_relaxed(block->size,
+				       &qmgr->reg_config->link_ram_size0);
+		else
+			writel_relaxed(block->size - 1,
+				       &qmgr->reg_config->link_ram_size0);
 		block++;
 		if (!block->size)
 			continue;
@@ -1387,42 +1400,64 @@ static int knav_queue_init_qmgrs(struct knav_device *kdev,
 		qmgr->reg_peek =
 			knav_queue_map_reg(kdev, child,
 					   KNAV_QUEUE_PEEK_REG_INDEX);
-		qmgr->reg_status =
-			knav_queue_map_reg(kdev, child,
-					   KNAV_QUEUE_STATUS_REG_INDEX);
+
+		if (kdev->version == QMSS) {
+			qmgr->reg_status =
+				knav_queue_map_reg(kdev, child,
+						   KNAV_QUEUE_STATUS_REG_INDEX);
+		}
+
 		qmgr->reg_config =
 			knav_queue_map_reg(kdev, child,
+					   (kdev->version == QMSS_66AK2G) ?
+					   KNAV_L_QUEUE_CONFIG_REG_INDEX :
 					   KNAV_QUEUE_CONFIG_REG_INDEX);
 		qmgr->reg_region =
 			knav_queue_map_reg(kdev, child,
+					   (kdev->version == QMSS_66AK2G) ?
+					   KNAV_L_QUEUE_REGION_REG_INDEX :
 					   KNAV_QUEUE_REGION_REG_INDEX);
+
 		qmgr->reg_push =
 			knav_queue_map_reg(kdev, child,
-					   KNAV_QUEUE_PUSH_REG_INDEX);
-		qmgr->reg_pop =
-			knav_queue_map_reg(kdev, child,
-					   KNAV_QUEUE_POP_REG_INDEX);
+					   (kdev->version == QMSS_66AK2G) ?
+					    KNAV_L_QUEUE_PUSH_REG_INDEX :
+					    KNAV_QUEUE_PUSH_REG_INDEX);
+
+		if (kdev->version == QMSS) {
+			qmgr->reg_pop =
+				knav_queue_map_reg(kdev, child,
+						   KNAV_QUEUE_POP_REG_INDEX);
+		}
 
-		if (IS_ERR(qmgr->reg_peek) || IS_ERR(qmgr->reg_status) ||
+		if (IS_ERR(qmgr->reg_peek) ||
+		    ((kdev->version == QMSS) &&
+		    (IS_ERR(qmgr->reg_status) || IS_ERR(qmgr->reg_pop))) ||
 		    IS_ERR(qmgr->reg_config) || IS_ERR(qmgr->reg_region) ||
-		    IS_ERR(qmgr->reg_push) || IS_ERR(qmgr->reg_pop)) {
+		    IS_ERR(qmgr->reg_push)) {
 			dev_err(dev, "failed to map qmgr regs\n");
+			if (kdev->version == QMSS) {
+				if (!IS_ERR(qmgr->reg_status))
+					devm_iounmap(dev, qmgr->reg_status);
+				if (!IS_ERR(qmgr->reg_pop))
+					devm_iounmap(dev, qmgr->reg_pop);
+			}
 			if (!IS_ERR(qmgr->reg_peek))
 				devm_iounmap(dev, qmgr->reg_peek);
-			if (!IS_ERR(qmgr->reg_status))
-				devm_iounmap(dev, qmgr->reg_status);
 			if (!IS_ERR(qmgr->reg_config))
 				devm_iounmap(dev, qmgr->reg_config);
 			if (!IS_ERR(qmgr->reg_region))
 				devm_iounmap(dev, qmgr->reg_region);
 			if (!IS_ERR(qmgr->reg_push))
 				devm_iounmap(dev, qmgr->reg_push);
-			if (!IS_ERR(qmgr->reg_pop))
-				devm_iounmap(dev, qmgr->reg_pop);
 			devm_kfree(dev, qmgr);
 			continue;
 		}
 
+		/* Use same push register for pop as well */
+		if (kdev->version == QMSS_66AK2G)
+			qmgr->reg_pop = qmgr->reg_push;
+
 		list_add_tail(&qmgr->list, &kdev->qmgrs);
 		dev_info(dev, "added qmgr start queue %d, num of queues %d, reg_peek %p, reg_status %p, reg_config %p, reg_region %p, reg_push %p, reg_pop %p\n",
 			 qmgr->start_queue, qmgr->num_queues,
@@ -1681,10 +1716,24 @@ static int knav_queue_init_queues(struct knav_device *kdev)
 	return 0;
 }
 
+/* Match table for of_platform binding */
+static const struct of_device_id keystone_qmss_of_match[] = {
+	{
+		.compatible = "ti,keystone-navigator-qmss",
+	},
+	{
+		.compatible = "ti,66ak2g-navss-qm",
+		.data	= (void *)QMSS_66AK2G,
+	},
+	{},
+};
+MODULE_DEVICE_TABLE(of, keystone_qmss_of_match);
+
 static int knav_queue_probe(struct platform_device *pdev)
 {
 	struct device_node *node = pdev->dev.of_node;
 	struct device_node *qmgrs, *queue_pools, *regions, *pdsps;
+	const struct of_device_id *match;
 	struct device *dev = &pdev->dev;
 	u32 temp[2];
 	int ret;
@@ -1700,6 +1749,10 @@ static int knav_queue_probe(struct platform_device *pdev)
 		return -ENOMEM;
 	}
 
+	match = of_match_device(of_match_ptr(keystone_qmss_of_match), dev);
+	if (match && match->data)
+		kdev->version = QMSS_66AK2G;
+
 	platform_set_drvdata(pdev, kdev);
 	kdev->dev = dev;
 	INIT_LIST_HEAD(&kdev->queue_ranges);
@@ -1815,13 +1868,6 @@ static int knav_queue_remove(struct platform_device *pdev)
 	return 0;
 }
 
-/* Match table for of_platform binding */
-static struct of_device_id keystone_qmss_of_match[] = {
-	{ .compatible = "ti,keystone-navigator-qmss", },
-	{},
-};
-MODULE_DEVICE_TABLE(of, keystone_qmss_of_match);
-
 static struct platform_driver keystone_qmss_driver = {
 	.probe		= knav_queue_probe,
 	.remove		= knav_queue_remove,
-- 
cgit v1.2.3-59-g8ed1b


From a2dd6877b43ef14129f258910d60b2e81b32100b Mon Sep 17 00:00:00 2001
From: Murali Karicheri <m-karicheri2@ti.com>
Date: Tue, 17 Apr 2018 17:30:31 -0400
Subject: soc: ti: K2G: provide APIs to support driver probe deferral

This patch provide APIs to allow client drivers to support
probe deferral. On K2G SoC, devices can be probed only
after the ti_sci_pm_domains driver is probed and ready.
As drivers may get probed at different order, any driver
that depends on knav dma and qmss drivers, for example
netcp network driver, needs to defer probe until
knav devices are probed and ready to service. To do this,
add an API to query the device ready status from the knav
dma and qmss devices.

Signed-off-by: Murali Karicheri <m-karicheri2@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/soc/ti/knav_dma.c        |  8 ++++++++
 drivers/soc/ti/knav_qmss_queue.c |  8 ++++++++
 include/linux/soc/ti/knav_dma.h  | 12 ++++++++++++
 include/linux/soc/ti/knav_qmss.h |  1 +
 4 files changed, 29 insertions(+)

diff --git a/drivers/soc/ti/knav_dma.c b/drivers/soc/ti/knav_dma.c
index 026182d3b27c..224d7ddeeb76 100644
--- a/drivers/soc/ti/knav_dma.c
+++ b/drivers/soc/ti/knav_dma.c
@@ -134,6 +134,13 @@ struct knav_dma_chan {
 
 static struct knav_dma_pool_device *kdev;
 
+static bool device_ready;
+bool knav_dma_device_ready(void)
+{
+	return device_ready;
+}
+EXPORT_SYMBOL_GPL(knav_dma_device_ready);
+
 static bool check_config(struct knav_dma_chan *chan, struct knav_dma_cfg *cfg)
 {
 	if (!memcmp(&chan->cfg, cfg, sizeof(*cfg)))
@@ -773,6 +780,7 @@ static int knav_dma_probe(struct platform_device *pdev)
 	debugfs_create_file("knav_dma", S_IFREG | S_IRUGO, NULL, NULL,
 			    &knav_dma_debug_ops);
 
+	device_ready = true;
 	return ret;
 }
 
diff --git a/drivers/soc/ti/knav_qmss_queue.c b/drivers/soc/ti/knav_qmss_queue.c
index 8526c8ed3af2..419365a8d1c2 100644
--- a/drivers/soc/ti/knav_qmss_queue.c
+++ b/drivers/soc/ti/knav_qmss_queue.c
@@ -74,6 +74,13 @@ static DEFINE_MUTEX(knav_dev_lock);
  */
 const char *knav_acc_firmwares[] = {"ks2_qmss_pdsp_acc48.bin"};
 
+static bool device_ready;
+bool knav_qmss_device_ready(void)
+{
+	return device_ready;
+}
+EXPORT_SYMBOL_GPL(knav_qmss_device_ready);
+
 /**
  * knav_queue_notify: qmss queue notfier call
  *
@@ -1849,6 +1856,7 @@ static int knav_queue_probe(struct platform_device *pdev)
 
 	debugfs_create_file("qmss", S_IFREG | S_IRUGO, NULL, NULL,
 			    &knav_queue_debug_ops);
+	device_ready = true;
 	return 0;
 
 err:
diff --git a/include/linux/soc/ti/knav_dma.h b/include/linux/soc/ti/knav_dma.h
index 66693bc4c6ad..7127ec301537 100644
--- a/include/linux/soc/ti/knav_dma.h
+++ b/include/linux/soc/ti/knav_dma.h
@@ -167,6 +167,8 @@ struct knav_dma_desc {
 void *knav_dma_open_channel(struct device *dev, const char *name,
 				struct knav_dma_cfg *config);
 void knav_dma_close_channel(void *channel);
+int knav_dma_get_flow(void *channel);
+bool knav_dma_device_ready(void);
 #else
 static inline void *knav_dma_open_channel(struct device *dev, const char *name,
 				struct knav_dma_cfg *config)
@@ -176,6 +178,16 @@ static inline void *knav_dma_open_channel(struct device *dev, const char *name,
 static inline void knav_dma_close_channel(void *channel)
 {}
 
+static inline int knav_dma_get_flow(void *channel)
+{
+	return -EINVAL;
+}
+
+static inline bool knav_dma_device_ready(void)
+{
+	return false;
+}
+
 #endif
 
 #endif /* __SOC_TI_KEYSTONE_NAVIGATOR_DMA_H__ */
diff --git a/include/linux/soc/ti/knav_qmss.h b/include/linux/soc/ti/knav_qmss.h
index 9f0ebb3bad27..9745df6ed9d3 100644
--- a/include/linux/soc/ti/knav_qmss.h
+++ b/include/linux/soc/ti/knav_qmss.h
@@ -86,5 +86,6 @@ int knav_pool_desc_map(void *ph, void *desc, unsigned size,
 void *knav_pool_desc_unmap(void *ph, dma_addr_t dma, unsigned dma_sz);
 dma_addr_t knav_pool_desc_virt_to_dma(void *ph, void *virt);
 void *knav_pool_desc_dma_to_virt(void *ph, dma_addr_t dma);
+bool knav_qmss_device_ready(void);
 
 #endif /* __SOC_TI_KNAV_QMSS_H__ */
-- 
cgit v1.2.3-59-g8ed1b


From 2953586df76e76ec2e62ed05897c1d242c06874d Mon Sep 17 00:00:00 2001
From: Murali Karicheri <m-karicheri2@ti.com>
Date: Tue, 17 Apr 2018 17:30:32 -0400
Subject: net: netcp: ethss: use macro for checking ss_version consistently

Driver currently uses macro for NU and XBE hardwrae, while other
places for older hardware such as that on K2H/K SoC (version 1.4
of the cpsw hardware, it explicitly check for the ss_version
inline. Add a new macro for version 1.4 and use it to customize
code in the driver. While at it also fix similar issue with
checking XBE version by re-using existing macro IS_SS_ID_XGBE().

Signed-off-by: Murali Karicheri <m-karicheri2@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/netcp_ethss.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/ti/netcp_ethss.c b/drivers/net/ethernet/ti/netcp_ethss.c
index 56dbc0b9fedc..1b79fe51c747 100644
--- a/drivers/net/ethernet/ti/netcp_ethss.c
+++ b/drivers/net/ethernet/ti/netcp_ethss.c
@@ -42,7 +42,7 @@
 
 /* 1G Ethernet SS defines */
 #define GBE_MODULE_NAME			"netcp-gbe"
-#define GBE_SS_VERSION_14		0x4ed21104
+#define GBE_SS_VERSION_14		0x4ed2
 
 #define GBE_SS_REG_INDEX		0
 #define GBE_SGMII34_REG_INDEX		1
@@ -72,6 +72,9 @@
 #define IS_SS_ID_NU(d) \
 	(GBE_IDENT((d)->ss_version) == GBE_SS_ID_NU)
 
+#define IS_SS_ID_VER_14(d) \
+	(GBE_IDENT((d)->ss_version) == GBE_SS_VERSION_14)
+
 #define GBENU_SS_REG_INDEX		0
 #define GBENU_SM_REG_INDEX		1
 #define GBENU_SGMII_MODULE_OFFSET	0x100
@@ -86,7 +89,7 @@
 
 /* 10G Ethernet SS defines */
 #define XGBE_MODULE_NAME		"netcp-xgbe"
-#define XGBE_SS_VERSION_10		0x4ee42100
+#define XGBE_SS_VERSION_10		0x4ee4
 
 #define XGBE_SS_REG_INDEX		0
 #define XGBE_SM_REG_INDEX		1
@@ -1915,7 +1918,7 @@ static void keystone_get_ethtool_stats(struct net_device *ndev,
 
 	gbe_dev = gbe_intf->gbe_dev;
 	spin_lock_bh(&gbe_dev->hw_stats_lock);
-	if (gbe_dev->ss_version == GBE_SS_VERSION_14)
+	if (IS_SS_ID_VER_14(gbe_dev))
 		gbe_update_stats_ver14(gbe_dev, data);
 	else
 		gbe_update_stats(gbe_dev, data);
@@ -2205,7 +2208,7 @@ static void gbe_port_config(struct gbe_priv *gbe_dev, struct gbe_slave *slave,
 		max_rx_len = NETCP_MAX_FRAME_SIZE;
 
 	/* Enable correct MII mode at SS level */
-	if ((gbe_dev->ss_version == XGBE_SS_VERSION_10) &&
+	if (IS_SS_ID_XGBE(gbe_dev) &&
 	    (slave->link_interface >= XGMII_LINK_MAC_PHY)) {
 		xgmii_mode = readl(GBE_REG_ADDR(gbe_dev, ss_regs, control));
 		xgmii_mode |= (1 << slave->slave_num);
@@ -2293,7 +2296,7 @@ static int gbe_slave_open(struct gbe_intf *gbe_intf)
 	}
 
 	if (has_phy) {
-		if (priv->ss_version == XGBE_SS_VERSION_10)
+		if (IS_SS_ID_XGBE(priv))
 			hndlr = xgbe_adjust_link;
 
 		slave->phy = of_phy_connect(gbe_intf->ndev,
@@ -2764,7 +2767,7 @@ static void netcp_ethss_timer(struct timer_list *t)
 	/* A timer runs as a BH, no need to block them */
 	spin_lock(&gbe_dev->hw_stats_lock);
 
-	if (gbe_dev->ss_version == GBE_SS_VERSION_14)
+	if (IS_SS_ID_VER_14(gbe_dev))
 		gbe_update_stats_ver14(gbe_dev, NULL);
 	else
 		gbe_update_stats(gbe_dev, NULL);
@@ -2807,7 +2810,7 @@ static int gbe_open(void *intf_priv, struct net_device *ndev)
 		GBE_RTL_VERSION(reg), GBE_IDENT(reg));
 
 	/* For 10G and on NetCP 1.5, use directed to port */
-	if ((gbe_dev->ss_version == XGBE_SS_VERSION_10) || IS_SS_ID_MU(gbe_dev))
+	if (IS_SS_ID_XGBE(gbe_dev) || IS_SS_ID_MU(gbe_dev))
 		gbe_intf->tx_pipe.flags = SWITCH_TO_PORT_IN_TAGINFO;
 
 	if (gbe_dev->enable_ale)
@@ -2924,7 +2927,7 @@ static int init_slave(struct gbe_priv *gbe_dev, struct gbe_slave *slave,
 
 	/* Emac regs memmap are contiguous but port regs are not */
 	port_reg_num = slave->slave_num;
-	if (gbe_dev->ss_version == GBE_SS_VERSION_14) {
+	if (IS_SS_ID_VER_14(gbe_dev)) {
 		if (slave->slave_num > 1) {
 			port_reg_ofs = GBE13_SLAVE_PORT2_OFFSET;
 			port_reg_num -= 2;
@@ -2939,7 +2942,7 @@ static int init_slave(struct gbe_priv *gbe_dev, struct gbe_slave *slave,
 		emac_reg_ofs = GBENU_EMAC_OFFSET;
 		port_reg_blk_sz = 0x1000;
 		emac_reg_blk_sz = 0x1000;
-	} else if (gbe_dev->ss_version == XGBE_SS_VERSION_10) {
+	} else if (IS_SS_ID_XGBE(gbe_dev)) {
 		port_reg_ofs = XGBE10_SLAVE_PORT_OFFSET;
 		emac_reg_ofs = XGBE10_EMAC_OFFSET;
 		port_reg_blk_sz = 0x30;
@@ -2955,7 +2958,7 @@ static int init_slave(struct gbe_priv *gbe_dev, struct gbe_slave *slave,
 	slave->emac_regs = gbe_dev->switch_regs + emac_reg_ofs +
 				(emac_reg_blk_sz * slave->slave_num);
 
-	if (gbe_dev->ss_version == GBE_SS_VERSION_14) {
+	if (IS_SS_ID_VER_14(gbe_dev)) {
 		/* Initialize  slave port register offsets */
 		GBE_SET_REG_OFS(slave, port_regs, port_vlan);
 		GBE_SET_REG_OFS(slave, port_regs, tx_pri_map);
@@ -2989,7 +2992,7 @@ static int init_slave(struct gbe_priv *gbe_dev, struct gbe_slave *slave,
 		GBENU_SET_REG_OFS(slave, emac_regs, mac_control);
 		GBENU_SET_REG_OFS(slave, emac_regs, soft_reset);
 
-	} else if (gbe_dev->ss_version == XGBE_SS_VERSION_10) {
+	} else if (IS_SS_ID_XGBE(gbe_dev)) {
 		/* Initialize  slave port register offsets */
 		XGBE_SET_REG_OFS(slave, port_regs, port_vlan);
 		XGBE_SET_REG_OFS(slave, port_regs, tx_pri_map);
@@ -3508,7 +3511,7 @@ static int gbe_probe(struct netcp_device *netcp_device, struct device *dev,
 
 		dev_dbg(dev, "ss_version: 0x%08x\n", gbe_dev->ss_version);
 
-		if (gbe_dev->ss_version == GBE_SS_VERSION_14)
+		if (IS_SS_ID_VER_14(gbe_dev))
 			ret = set_gbe_ethss14_priv(gbe_dev, node);
 		else if (IS_SS_ID_MU(gbe_dev))
 			ret = set_gbenu_ethss_priv(gbe_dev, node);
@@ -3606,7 +3609,7 @@ static int gbe_probe(struct netcp_device *netcp_device, struct device *dev,
 
 	spin_lock_bh(&gbe_dev->hw_stats_lock);
 	for (i = 0; i < gbe_dev->num_stats_mods; i++) {
-		if (gbe_dev->ss_version == GBE_SS_VERSION_14)
+		if (IS_SS_ID_VER_14(gbe_dev))
 			gbe_reset_mod_stats_ver14(gbe_dev, i);
 		else
 			gbe_reset_mod_stats(gbe_dev, i);
-- 
cgit v1.2.3-59-g8ed1b


From 775f9535dc5e53ef57d0ebd17e1d7a94c599ecce Mon Sep 17 00:00:00 2001
From: Murali Karicheri <m-karicheri2@ti.com>
Date: Tue, 17 Apr 2018 17:30:33 -0400
Subject: net: netcp: ethss: make sgmii configuration conditional

As a preparatory patch to add support for 2u cpsw hardware found on
K2G SoC, make sgmii configuration conditional. This is required
since 2u uses RGMII interface instead of SGMII and to allow for driver
re-use.

Signed-off-by: Murali Karicheri <m-karicheri2@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/netcp_ethss.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/ti/netcp_ethss.c b/drivers/net/ethernet/ti/netcp_ethss.c
index 1b79fe51c747..e1e67ff9eb86 100644
--- a/drivers/net/ethernet/ti/netcp_ethss.c
+++ b/drivers/net/ethernet/ti/netcp_ethss.c
@@ -74,6 +74,8 @@
 
 #define IS_SS_ID_VER_14(d) \
 	(GBE_IDENT((d)->ss_version) == GBE_SS_VERSION_14)
+#define IS_SS_ID_2U(d) \
+	(GBE_IDENT((d)->ss_version) == GBE_SS_ID_2U)
 
 #define GBENU_SS_REG_INDEX		0
 #define GBENU_SM_REG_INDEX		1
@@ -2239,7 +2241,8 @@ static void gbe_slave_stop(struct gbe_intf *intf)
 	struct gbe_priv *gbe_dev = intf->gbe_dev;
 	struct gbe_slave *slave = intf->slave;
 
-	gbe_sgmii_rtreset(gbe_dev, slave, true);
+	if (!IS_SS_ID_2U(gbe_dev))
+		gbe_sgmii_rtreset(gbe_dev, slave, true);
 	gbe_port_reset(slave);
 	/* Disable forwarding */
 	cpsw_ale_control_set(gbe_dev->ale, slave->port_num,
@@ -2274,9 +2277,11 @@ static int gbe_slave_open(struct gbe_intf *gbe_intf)
 
 	void (*hndlr)(struct net_device *) = gbe_adjust_link;
 
-	gbe_sgmii_config(priv, slave);
+	if (!IS_SS_ID_2U(priv))
+		gbe_sgmii_config(priv, slave);
 	gbe_port_reset(slave);
-	gbe_sgmii_rtreset(priv, slave, false);
+	if (!IS_SS_ID_2U(priv))
+		gbe_sgmii_rtreset(priv, slave, false);
 	gbe_port_config(priv, slave, priv->rx_packet_max);
 	gbe_set_slave_mac(slave, gbe_intf);
 	/* enable forwarding */
@@ -3042,7 +3047,8 @@ static void init_secondary_ports(struct gbe_priv *gbe_dev,
 			continue;
 		}
 
-		gbe_sgmii_config(gbe_dev, slave);
+		if (!IS_SS_ID_2U(gbe_dev))
+			gbe_sgmii_config(gbe_dev, slave);
 		gbe_port_reset(slave);
 		gbe_port_config(gbe_dev, slave, gbe_dev->rx_packet_max);
 		list_add_tail(&slave->slave_list, &gbe_dev->secondary_slaves);
@@ -3399,7 +3405,9 @@ static int set_gbenu_ethss_priv(struct gbe_priv *gbe_dev,
 	}
 	gbe_dev->switch_regs = regs;
 
-	gbe_dev->sgmii_port_regs = gbe_dev->ss_regs + GBENU_SGMII_MODULE_OFFSET;
+	if (!IS_SS_ID_2U(gbe_dev))
+		gbe_dev->sgmii_port_regs =
+		       gbe_dev->ss_regs + GBENU_SGMII_MODULE_OFFSET;
 
 	/* Although sgmii modules are mem mapped to one contiguous
 	 * region on GBENU devices, setting sgmii_port34_regs allows
-- 
cgit v1.2.3-59-g8ed1b


From 478e9a5fcf3fda621f3d660979c42d24f3435d73 Mon Sep 17 00:00:00 2001
From: Murali Karicheri <m-karicheri2@ti.com>
Date: Tue, 17 Apr 2018 17:30:34 -0400
Subject: net: netcp: ethss: add support for handling rgmii link interface

2u cpsw hardware on K2G uses rgmii link to interface with Phy. So add
support for this interface in the code so that driver can be re-used
for 2u hardware.

Signed-off-by: Murali Karicheri <m-karicheri2@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/netcp.h       |  2 ++
 drivers/net/ethernet/ti/netcp_ethss.c | 15 +++++++++++----
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/ti/netcp.h b/drivers/net/ethernet/ti/netcp.h
index 8900a6fad318..416f732a00f7 100644
--- a/drivers/net/ethernet/ti/netcp.h
+++ b/drivers/net/ethernet/ti/netcp.h
@@ -33,6 +33,8 @@
 #define SGMII_LINK_MAC_MAC_FORCED	2
 #define SGMII_LINK_MAC_FIBER		3
 #define SGMII_LINK_MAC_PHY_NO_MDIO	4
+#define RGMII_LINK_MAC_PHY		5
+#define RGMII_LINK_MAC_PHY_NO_MDIO	7
 #define XGMII_LINK_MAC_PHY		10
 #define XGMII_LINK_MAC_MAC_FORCED	11
 
diff --git a/drivers/net/ethernet/ti/netcp_ethss.c b/drivers/net/ethernet/ti/netcp_ethss.c
index e1e67ff9eb86..68b10b5f9a51 100644
--- a/drivers/net/ethernet/ti/netcp_ethss.c
+++ b/drivers/net/ethernet/ti/netcp_ethss.c
@@ -2096,8 +2096,9 @@ static void netcp_ethss_link_state_action(struct gbe_priv *gbe_dev,
 				     ALE_PORT_STATE_FORWARD);
 
 		if (ndev && slave->open &&
-		    slave->link_interface != SGMII_LINK_MAC_PHY &&
-		    slave->link_interface != XGMII_LINK_MAC_PHY)
+		    ((slave->link_interface != SGMII_LINK_MAC_PHY) &&
+		    (slave->link_interface != RGMII_LINK_MAC_PHY) &&
+		    (slave->link_interface != XGMII_LINK_MAC_PHY)))
 			netif_carrier_on(ndev);
 	} else {
 		writel(mac_control, GBE_REG_ADDR(slave, emac_regs,
@@ -2106,8 +2107,9 @@ static void netcp_ethss_link_state_action(struct gbe_priv *gbe_dev,
 				     ALE_PORT_STATE,
 				     ALE_PORT_STATE_DISABLE);
 		if (ndev &&
-		    slave->link_interface != SGMII_LINK_MAC_PHY &&
-		    slave->link_interface != XGMII_LINK_MAC_PHY)
+		    ((slave->link_interface != SGMII_LINK_MAC_PHY) &&
+		    (slave->link_interface != RGMII_LINK_MAC_PHY) &&
+		    (slave->link_interface != XGMII_LINK_MAC_PHY)))
 			netif_carrier_off(ndev);
 	}
 
@@ -2921,6 +2923,7 @@ static int init_slave(struct gbe_priv *gbe_dev, struct gbe_slave *slave,
 
 	slave->open = false;
 	if ((slave->link_interface == SGMII_LINK_MAC_PHY) ||
+	    (slave->link_interface == RGMII_LINK_MAC_PHY) ||
 	    (slave->link_interface == XGMII_LINK_MAC_PHY))
 		slave->phy_node = of_parse_phandle(node, "phy-handle", 0);
 	slave->port_num = gbe_get_slave_port(gbe_dev, slave->slave_num);
@@ -3082,6 +3085,9 @@ static void init_secondary_ports(struct gbe_priv *gbe_dev,
 	if (slave->link_interface == SGMII_LINK_MAC_PHY) {
 		phy_mode = PHY_INTERFACE_MODE_SGMII;
 		slave->phy_port_t = PORT_MII;
+	} else if (slave->link_interface == RGMII_LINK_MAC_PHY) {
+		phy_mode = PHY_INTERFACE_MODE_RGMII;
+		slave->phy_port_t = PORT_MII;
 	} else {
 		phy_mode = PHY_INTERFACE_MODE_NA;
 		slave->phy_port_t = PORT_FIBRE;
@@ -3089,6 +3095,7 @@ static void init_secondary_ports(struct gbe_priv *gbe_dev,
 
 	for_each_sec_slave(slave, gbe_dev) {
 		if ((slave->link_interface != SGMII_LINK_MAC_PHY) &&
+		    (slave->link_interface != RGMII_LINK_MAC_PHY) &&
 		    (slave->link_interface != XGMII_LINK_MAC_PHY))
 			continue;
 		slave->phy =
-- 
cgit v1.2.3-59-g8ed1b


From 7771f2b4d0f774de752346d6340cd79c1d0b0477 Mon Sep 17 00:00:00 2001
From: Murali Karicheri <m-karicheri2@ti.com>
Date: Tue, 17 Apr 2018 17:30:35 -0400
Subject: net: netcp: ethss: use rgmii link status for 2u cpsw hardware

Introduce rgmii link status to handle link state events for 2u
cpsw hardware on K2G.

Signed-off-by: Murali Karicheri <m-karicheri2@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/netcp_ethss.c | 34 +++++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/ti/netcp_ethss.c b/drivers/net/ethernet/ti/netcp_ethss.c
index 68b10b5f9a51..83acb728a8f7 100644
--- a/drivers/net/ethernet/ti/netcp_ethss.c
+++ b/drivers/net/ethernet/ti/netcp_ethss.c
@@ -171,6 +171,11 @@
 #define	GBE_RXHOOK_ORDER			0
 #define GBE_DEFAULT_ALE_AGEOUT			30
 #define SLAVE_LINK_IS_XGMII(s) ((s)->link_interface >= XGMII_LINK_MAC_PHY)
+#define SLAVE_LINK_IS_RGMII(s) \
+	(((s)->link_interface >= RGMII_LINK_MAC_PHY) && \
+	 ((s)->link_interface <= RGMII_LINK_MAC_PHY_NO_MDIO))
+#define SLAVE_LINK_IS_SGMII(s) \
+	((s)->link_interface <= SGMII_LINK_MAC_PHY_NO_MDIO)
 #define NETCP_LINK_STATE_INVALID		-1
 
 #define GBE_SET_REG_OFS(p, rb, rn) p->rb##_ofs.rn = \
@@ -554,6 +559,7 @@ struct gbe_ss_regs {
 struct gbe_ss_regs_ofs {
 	u16	id_ver;
 	u16	control;
+	u16	rgmii_status; /* 2U */
 };
 
 struct gbe_switch_regs {
@@ -2122,23 +2128,35 @@ static bool gbe_phy_link_status(struct gbe_slave *slave)
 	 return !slave->phy || slave->phy->link;
 }
 
+#define RGMII_REG_STATUS_LINK	BIT(0)
+
+static void netcp_2u_rgmii_get_port_link(struct gbe_priv *gbe_dev, bool *status)
+{
+	u32 val = 0;
+
+	val = readl(GBE_REG_ADDR(gbe_dev, ss_regs, rgmii_status));
+	*status = !!(val & RGMII_REG_STATUS_LINK);
+}
+
 static void netcp_ethss_update_link_state(struct gbe_priv *gbe_dev,
 					  struct gbe_slave *slave,
 					  struct net_device *ndev)
 {
-	int sp = slave->slave_num;
-	int phy_link_state, sgmii_link_state = 1, link_state;
+	bool sw_link_state = true, phy_link_state;
+	int sp = slave->slave_num, link_state;
 
 	if (!slave->open)
 		return;
 
-	if (!SLAVE_LINK_IS_XGMII(slave)) {
-		sgmii_link_state =
-			netcp_sgmii_get_port_link(SGMII_BASE(gbe_dev, sp), sp);
-	}
+	if (SLAVE_LINK_IS_RGMII(slave))
+		netcp_2u_rgmii_get_port_link(gbe_dev,
+					     &sw_link_state);
+	if (SLAVE_LINK_IS_SGMII(slave))
+		sw_link_state =
+		netcp_sgmii_get_port_link(SGMII_BASE(gbe_dev, sp), sp);
 
 	phy_link_state = gbe_phy_link_status(slave);
-	link_state = phy_link_state & sgmii_link_state;
+	link_state = phy_link_state & sw_link_state;
 
 	if (atomic_xchg(&slave->link_state, link_state) != link_state)
 		netcp_ethss_link_state_action(gbe_dev, ndev, slave,
@@ -3437,6 +3455,8 @@ static int set_gbenu_ethss_priv(struct gbe_priv *gbe_dev,
 
 	/* Subsystem registers */
 	GBENU_SET_REG_OFS(gbe_dev, ss_regs, id_ver);
+	/* ok to set for MU, but used by 2U only */
+	GBENU_SET_REG_OFS(gbe_dev, ss_regs, rgmii_status);
 
 	/* Switch module registers */
 	GBENU_SET_REG_OFS(gbe_dev, switch_regs, id_ver);
-- 
cgit v1.2.3-59-g8ed1b


From 65c450642c4991ab456ebd157e14d52c81106695 Mon Sep 17 00:00:00 2001
From: Murali Karicheri <m-karicheri2@ti.com>
Date: Tue, 17 Apr 2018 17:30:36 -0400
Subject: net: netcp: ethss: map vlan priorities to zero flow

The driver currently support only vlan priority zero. So map the
vlan priorities to zero flow in hardware.

Signed-off-by: Murali Karicheri <m-karicheri2@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/netcp_ethss.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/net/ethernet/ti/netcp_ethss.c b/drivers/net/ethernet/ti/netcp_ethss.c
index 83acb728a8f7..91935e589b08 100644
--- a/drivers/net/ethernet/ti/netcp_ethss.c
+++ b/drivers/net/ethernet/ti/netcp_ethss.c
@@ -602,6 +602,7 @@ struct gbe_port_regs {
 struct gbe_port_regs_ofs {
 	u16	port_vlan;
 	u16	tx_pri_map;
+	u16     rx_pri_map;
 	u16	sa_lo;
 	u16	sa_hi;
 	u16	ts_ctl;
@@ -2304,6 +2305,13 @@ static int gbe_slave_open(struct gbe_intf *gbe_intf)
 		gbe_sgmii_rtreset(priv, slave, false);
 	gbe_port_config(priv, slave, priv->rx_packet_max);
 	gbe_set_slave_mac(slave, gbe_intf);
+	/* For NU & 2U switch, map the vlan priorities to zero
+	 * as we only configure to use priority 0
+	 */
+	if (IS_SS_ID_MU(priv))
+		writel(HOST_TX_PRI_MAP_DEFAULT,
+		       GBE_REG_ADDR(slave, port_regs, rx_pri_map));
+
 	/* enable forwarding */
 	cpsw_ale_control_set(priv->ale, slave->port_num,
 			     ALE_PORT_STATE, ALE_PORT_STATE_FORWARD);
@@ -3005,6 +3013,7 @@ static int init_slave(struct gbe_priv *gbe_dev, struct gbe_slave *slave,
 		/* Initialize  slave port register offsets */
 		GBENU_SET_REG_OFS(slave, port_regs, port_vlan);
 		GBENU_SET_REG_OFS(slave, port_regs, tx_pri_map);
+		GBENU_SET_REG_OFS(slave, port_regs, rx_pri_map);
 		GBENU_SET_REG_OFS(slave, port_regs, sa_lo);
 		GBENU_SET_REG_OFS(slave, port_regs, sa_hi);
 		GBENU_SET_REG_OFS(slave, port_regs, ts_ctl);
-- 
cgit v1.2.3-59-g8ed1b


From dc07ec97368f064102a78bddbfba4574ed7cb73d Mon Sep 17 00:00:00 2001
From: Murali Karicheri <m-karicheri2@ti.com>
Date: Tue, 17 Apr 2018 17:30:37 -0400
Subject: net: netcp: ethss: re-use stats handling code for 2u hardware

The stats block in 2u cpsw hardware is similar to the one on nu
and hence handle it in a similar way by using a macro that includes
2u hardware as well.

Signed-off-by: Murali Karicheri <m-karicheri2@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/netcp_ethss.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ti/netcp_ethss.c b/drivers/net/ethernet/ti/netcp_ethss.c
index 91935e589b08..d982dcbe17b5 100644
--- a/drivers/net/ethernet/ti/netcp_ethss.c
+++ b/drivers/net/ethernet/ti/netcp_ethss.c
@@ -3398,7 +3398,7 @@ static int set_gbenu_ethss_priv(struct gbe_priv *gbe_dev,
 	gbe_dev->num_stats_mods = gbe_dev->max_num_ports;
 	gbe_dev->et_stats = gbenu_et_stats;
 
-	if (IS_SS_ID_NU(gbe_dev))
+	if (IS_SS_ID_MU(gbe_dev))
 		gbe_dev->num_et_stats = GBENU_ET_STATS_HOST_SIZE +
 			(gbe_dev->max_num_slaves * GBENU_ET_STATS_PORT_SIZE);
 	else
-- 
cgit v1.2.3-59-g8ed1b


From 7b647b93b359d590094e0787fc86a6a9d57b25f7 Mon Sep 17 00:00:00 2001
From: Murali Karicheri <m-karicheri2@ti.com>
Date: Tue, 17 Apr 2018 17:30:38 -0400
Subject: net: netcp: ethss: use of_get_phy_mode() to support different RGMII
 modes

The phy used for K2G allows for internal delays to be added optionally
to the clock circuitry based on board desing. To add this support,
enhance the driver to use of_get_phy_mode() to read the phy-mode from
the phy device and pass the same to phy through of_phy_connect().

Signed-off-by: Murali Karicheri <m-karicheri2@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/netcp_ethss.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/drivers/net/ethernet/ti/netcp_ethss.c b/drivers/net/ethernet/ti/netcp_ethss.c
index d982dcbe17b5..5bc792175e4e 100644
--- a/drivers/net/ethernet/ti/netcp_ethss.c
+++ b/drivers/net/ethernet/ti/netcp_ethss.c
@@ -21,6 +21,7 @@
 #include <linux/io.h>
 #include <linux/module.h>
 #include <linux/of_mdio.h>
+#include <linux/of_net.h>
 #include <linux/of_address.h>
 #include <linux/if_vlan.h>
 #include <linux/ptp_classify.h>
@@ -707,6 +708,7 @@ struct gbe_slave {
 	u32				link_interface;
 	u32				mac_control;
 	u8				phy_port_t;
+	struct device_node		*node;
 	struct device_node		*phy_node;
 	struct ts_ctl                   ts_ctl;
 	struct list_head		slave_list;
@@ -2322,6 +2324,21 @@ static int gbe_slave_open(struct gbe_intf *gbe_intf)
 		has_phy = true;
 		phy_mode = PHY_INTERFACE_MODE_SGMII;
 		slave->phy_port_t = PORT_MII;
+	} else if (slave->link_interface == RGMII_LINK_MAC_PHY) {
+		has_phy = true;
+		phy_mode = of_get_phy_mode(slave->node);
+		/* if phy-mode is not present, default to
+		 * PHY_INTERFACE_MODE_RGMII
+		 */
+		if (phy_mode < 0)
+			phy_mode = PHY_INTERFACE_MODE_RGMII;
+
+		if (!phy_interface_mode_is_rgmii(phy_mode)) {
+			dev_err(priv->dev,
+				"Unsupported phy mode %d\n", phy_mode);
+			return -EINVAL;
+		}
+		slave->phy_port_t = PORT_MII;
 	} else if (slave->link_interface == XGMII_LINK_MAC_PHY) {
 		has_phy = true;
 		phy_mode = PHY_INTERFACE_MODE_NA;
@@ -2947,6 +2964,7 @@ static int init_slave(struct gbe_priv *gbe_dev, struct gbe_slave *slave,
 		slave->link_interface = SGMII_LINK_MAC_PHY;
 	}
 
+	slave->node = node;
 	slave->open = false;
 	if ((slave->link_interface == SGMII_LINK_MAC_PHY) ||
 	    (slave->link_interface == RGMII_LINK_MAC_PHY) ||
-- 
cgit v1.2.3-59-g8ed1b


From c52b6782d00e4de299647dc6448aa844a9f4941b Mon Sep 17 00:00:00 2001
From: Murali Karicheri <m-karicheri2@ti.com>
Date: Tue, 17 Apr 2018 17:30:39 -0400
Subject: Revert "net: netcp: remove dead code from the driver"

As the probe sequence is not guaranteed contrary to the assumption
of the commit 2d8e276a9030, same has to be reverted.

commit 2d8e276a9030 ("net: netcp: remove dead code from the driver")

Signed-off-by: Murali Karicheri <m-karicheri2@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/netcp_core.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c
index f5a7eb22d0f5..9c51b25d7fd0 100644
--- a/drivers/net/ethernet/ti/netcp_core.c
+++ b/drivers/net/ethernet/ti/netcp_core.c
@@ -2155,6 +2155,7 @@ static int netcp_probe(struct platform_device *pdev)
 	struct device_node *child, *interfaces;
 	struct netcp_device *netcp_device;
 	struct device *dev = &pdev->dev;
+	struct netcp_module *module;
 	int ret;
 
 	if (!node) {
@@ -2203,6 +2204,14 @@ static int netcp_probe(struct platform_device *pdev)
 	/* Add the device instance to the list */
 	list_add_tail(&netcp_device->device_list, &netcp_devices);
 
+	/* Probe & attach any modules already registered */
+	mutex_lock(&netcp_modules_lock);
+	for_each_netcp_module(module) {
+		ret = netcp_module_probe(netcp_device, module);
+		if (ret < 0)
+			dev_err(dev, "module(%s) probe failed\n", module->name);
+	}
+	mutex_unlock(&netcp_modules_lock);
 	return 0;
 
 probe_quit_interface:
-- 
cgit v1.2.3-59-g8ed1b


From 21f706bb10cf7d905d89ceb3f689b8b844520127 Mon Sep 17 00:00:00 2001
From: Murali Karicheri <m-karicheri2@ti.com>
Date: Tue, 17 Apr 2018 17:30:40 -0400
Subject: net: netcp: support probe deferral

The netcp driver shouldn't proceed until the knav qmss and dma
devices are ready. So return -EPROBE_DEFER if these devices are not
ready.

Signed-off-by: Murali Karicheri <m-karicheri2@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/netcp_core.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c
index 9c51b25d7fd0..736f6f713c09 100644
--- a/drivers/net/ethernet/ti/netcp_core.c
+++ b/drivers/net/ethernet/ti/netcp_core.c
@@ -2158,6 +2158,10 @@ static int netcp_probe(struct platform_device *pdev)
 	struct netcp_module *module;
 	int ret;
 
+	if (!knav_dma_device_ready() ||
+	    !knav_qmss_device_ready())
+		return -EPROBE_DEFER;
+
 	if (!node) {
 		dev_err(dev, "could not find device info\n");
 		return -ENODEV;
-- 
cgit v1.2.3-59-g8ed1b


From 0542a87c546726510e7a572b16c5493ca42024c4 Mon Sep 17 00:00:00 2001
From: WingMan Kwok <w-kwok2@ti.com>
Date: Tue, 17 Apr 2018 17:30:41 -0400
Subject: net: netcp: add api to support set rx mode in netcp modules

This patch adds an API to support setting rx mode in
netcp modules.  If a netcp module needs to be notified
when upper layer transitions from one rx mode to
another and react accordingly, such a module will implement
the new API set_rx_mode added in this patch.  Currently
rx modes supported are PROMISCUOUS and NON_PROMISCUOUS
modes.

Signed-off-by: WingMan Kwok <w-kwok2@ti.com>
Signed-off-by: Murali Karicheri <m-karicheri2@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/netcp.h      |  1 +
 drivers/net/ethernet/ti/netcp_core.c | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/drivers/net/ethernet/ti/netcp.h b/drivers/net/ethernet/ti/netcp.h
index 416f732a00f7..c4ffdf47bad5 100644
--- a/drivers/net/ethernet/ti/netcp.h
+++ b/drivers/net/ethernet/ti/netcp.h
@@ -214,6 +214,7 @@ struct netcp_module {
 	int	(*add_vid)(void *intf_priv, int vid);
 	int	(*del_vid)(void *intf_priv, int vid);
 	int	(*ioctl)(void *intf_priv, struct ifreq *req, int cmd);
+	int	(*set_rx_mode)(void *intf_priv, bool promisc);
 
 	/* used internally */
 	struct list_head	module_list;
diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c
index 736f6f713c09..e40aa3e31af2 100644
--- a/drivers/net/ethernet/ti/netcp_core.c
+++ b/drivers/net/ethernet/ti/netcp_core.c
@@ -1509,6 +1509,24 @@ static void netcp_addr_sweep_add(struct netcp_intf *netcp)
 	}
 }
 
+static int netcp_set_promiscuous(struct netcp_intf *netcp, bool promisc)
+{
+	struct netcp_intf_modpriv *priv;
+	struct netcp_module *module;
+	int error;
+
+	for_each_module(netcp, priv) {
+		module = priv->netcp_module;
+		if (!module->set_rx_mode)
+			continue;
+
+		error = module->set_rx_mode(priv->module_priv, promisc);
+		if (error)
+			return error;
+	}
+	return 0;
+}
+
 static void netcp_set_rx_mode(struct net_device *ndev)
 {
 	struct netcp_intf *netcp = netdev_priv(ndev);
@@ -1538,6 +1556,7 @@ static void netcp_set_rx_mode(struct net_device *ndev)
 	/* finally sweep and callout into modules */
 	netcp_addr_sweep_del(netcp);
 	netcp_addr_sweep_add(netcp);
+	netcp_set_promiscuous(netcp, promisc);
 	spin_unlock(&netcp->lock);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 8585661b1a4c6924c4c7e7abe9beec0bcf53eab7 Mon Sep 17 00:00:00 2001
From: WingMan Kwok <w-kwok2@ti.com>
Date: Tue, 17 Apr 2018 17:30:42 -0400
Subject: net: netcp: ethss: k2g: add promiscuous mode support

This patch adds support for promiscuous mode in k2g's network
driver. When upper layer instructs to transition from
non-promiscuous mode to promiscuous mode or vice versa
K2G network driver needs to configure ALE accordingly
so that in case of non-promiscuous mode, ALE will not flood
all unicast packets to host port, while in promiscuous
mode, it will pass all received unicast packets to
host port.

Signed-off-by: WingMan Kwok <w-kwok2@ti.com>
Signed-off-by: Murali Karicheri <m-karicheri2@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/netcp_ethss.c | 56 +++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/drivers/net/ethernet/ti/netcp_ethss.c b/drivers/net/ethernet/ti/netcp_ethss.c
index 5bc792175e4e..6a728d35e776 100644
--- a/drivers/net/ethernet/ti/netcp_ethss.c
+++ b/drivers/net/ethernet/ti/netcp_ethss.c
@@ -2775,6 +2775,61 @@ static inline int gbe_hwtstamp_set(struct gbe_intf *gbe_intf, struct ifreq *req)
 }
 #endif /* CONFIG_TI_CPTS */
 
+static int gbe_set_rx_mode(void *intf_priv, bool promisc)
+{
+	struct gbe_intf *gbe_intf = intf_priv;
+	struct gbe_priv *gbe_dev = gbe_intf->gbe_dev;
+	struct cpsw_ale *ale = gbe_dev->ale;
+	unsigned long timeout;
+	int i, ret = -ETIMEDOUT;
+
+	/* Disable(1)/Enable(0) Learn for all ports (host is port 0 and
+	 * slaves are port 1 and up
+	 */
+	for (i = 0; i <= gbe_dev->num_slaves; i++) {
+		cpsw_ale_control_set(ale, i,
+				     ALE_PORT_NOLEARN, !!promisc);
+		cpsw_ale_control_set(ale, i,
+				     ALE_PORT_NO_SA_UPDATE, !!promisc);
+	}
+
+	if (!promisc) {
+		/* Don't Flood All Unicast Packets to Host port */
+		cpsw_ale_control_set(ale, 0, ALE_P0_UNI_FLOOD, 0);
+		dev_vdbg(gbe_dev->dev, "promiscuous mode disabled\n");
+		return 0;
+	}
+
+	timeout = jiffies + HZ;
+
+	/* Clear All Untouched entries */
+	cpsw_ale_control_set(ale, 0, ALE_AGEOUT, 1);
+	do {
+		cpu_relax();
+		if (cpsw_ale_control_get(ale, 0, ALE_AGEOUT)) {
+			ret = 0;
+			break;
+		}
+
+	} while (time_after(timeout, jiffies));
+
+	/* Make sure it is not a false timeout */
+	if (ret && !cpsw_ale_control_get(ale, 0, ALE_AGEOUT))
+		return ret;
+
+	cpsw_ale_control_set(ale, 0, ALE_AGEOUT, 1);
+
+	/* Clear all mcast from ALE */
+	cpsw_ale_flush_multicast(ale,
+				 GBE_PORT_MASK(gbe_dev->ale_ports),
+				 -1);
+
+	/* Flood All Unicast Packets to Host port */
+	cpsw_ale_control_set(ale, 0, ALE_P0_UNI_FLOOD, 1);
+	dev_vdbg(gbe_dev->dev, "promiscuous mode enabled\n");
+	return ret;
+}
+
 static int gbe_ioctl(void *intf_priv, struct ifreq *req, int cmd)
 {
 	struct gbe_intf *gbe_intf = intf_priv;
@@ -3529,6 +3584,7 @@ static int gbe_probe(struct netcp_device *netcp_device, struct device *dev,
 		gbe_dev->max_num_slaves = 8;
 	} else if (of_device_is_compatible(node, "ti,netcp-gbe-2")) {
 		gbe_dev->max_num_slaves = 1;
+		gbe_module.set_rx_mode = gbe_set_rx_mode;
 	} else if (of_device_is_compatible(node, "ti,netcp-xgbe")) {
 		gbe_dev->max_num_slaves = 2;
 	} else {
-- 
cgit v1.2.3-59-g8ed1b


From 008b4a938d785159dc060da7aec06f66449a4ef8 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 17 Apr 2018 23:17:13 +0200
Subject: r8169: remove unused member features from struct

Member features of struct rtl8169_private isn't used any longer since
commit 6c6aa15fdea5 "r8169: improve interrupt handling", so remove it.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 7d4e68907055..5cd10ee62f20 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -832,8 +832,6 @@ struct rtl8169_private {
 		struct work_struct work;
 	} wk;
 
-	unsigned features;
-
 	struct mii_if_info mii;
 	dma_addr_t counters_phys_addr;
 	struct rtl8169_counters *counters;
-- 
cgit v1.2.3-59-g8ed1b


From 1cbbf01cfe251447d232ccd6b031252bc5fa4b80 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 17 Apr 2018 23:18:05 +0200
Subject: r8169: remove member align from struct rtl_cfg_info

Since commit 6f0333b8fde4 "r8169: use 50% less ram for RX ring" member
align isn't used any longer, so remove it.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 5cd10ee62f20..14f494a9318a 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -8016,7 +8016,6 @@ static const struct net_device_ops rtl_netdev_ops = {
 static const struct rtl_cfg_info {
 	void (*hw_start)(struct net_device *);
 	unsigned int region;
-	unsigned int align;
 	u16 event_slow;
 	unsigned int has_gmii:1;
 	const struct rtl_coalesce_info *coalesce_info;
@@ -8025,7 +8024,6 @@ static const struct rtl_cfg_info {
 	[RTL_CFG_0] = {
 		.hw_start	= rtl_hw_start_8169,
 		.region		= 1,
-		.align		= 0,
 		.event_slow	= SYSErr | LinkChg | RxOverflow | RxFIFOOver,
 		.has_gmii	= 1,
 		.coalesce_info	= rtl_coalesce_info_8169,
@@ -8034,7 +8032,6 @@ static const struct rtl_cfg_info {
 	[RTL_CFG_1] = {
 		.hw_start	= rtl_hw_start_8168,
 		.region		= 2,
-		.align		= 8,
 		.event_slow	= SYSErr | LinkChg | RxOverflow,
 		.has_gmii	= 1,
 		.coalesce_info	= rtl_coalesce_info_8168_8136,
@@ -8043,7 +8040,6 @@ static const struct rtl_cfg_info {
 	[RTL_CFG_2] = {
 		.hw_start	= rtl_hw_start_8101,
 		.region		= 2,
-		.align		= 8,
 		.event_slow	= SYSErr | LinkChg | RxOverflow | RxFIFOOver |
 				  PCSTimeout,
 		.coalesce_info	= rtl_coalesce_info_8168_8136,
-- 
cgit v1.2.3-59-g8ed1b


From 8a67aa868c7a3e6ef92b4e355abd7a2836573aa9 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 17 Apr 2018 23:19:07 +0200
Subject: r8169: use skb_copy_to_linear_data in rtl8169_try_rx_copy

Not a giant leap for mankind, but let's avoid the open-coded memcpy
and use standard helper skb_copy_to_linear_data instead.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 14f494a9318a..8c303617cc37 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -7359,7 +7359,7 @@ static struct sk_buff *rtl8169_try_rx_copy(void *data,
 	prefetch(data);
 	skb = napi_alloc_skb(&tp->napi, pkt_size);
 	if (skb)
-		memcpy(skb->data, data, pkt_size);
+		skb_copy_to_linear_data(skb, data, pkt_size);
 	dma_sync_single_for_device(d, addr, pkt_size, DMA_FROM_DEVICE);
 
 	return skb;
-- 
cgit v1.2.3-59-g8ed1b


From 37621493b873b3701129333117da683b8e7a9da6 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 17 Apr 2018 23:20:03 +0200
Subject: r8169: use constant NAPI_POLL_WAIT

We can use generic constant NAPI_POLL_WAIT instead of defining an own
constant for the same value.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 8c303617cc37..cc88e8ee45c7 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -88,7 +88,6 @@ static const int multicast_filter_limit = 32;
 #define InterFrameGap	0x03	/* 3 means InterFrameGap = the shortest one */
 
 #define R8169_REGS_SIZE		256
-#define R8169_NAPI_WEIGHT	64
 #define NUM_TX_DESC	64	/* Number of Tx descriptor registers */
 #define NUM_RX_DESC	256U	/* Number of Rx descriptor registers */
 #define R8169_TX_RING_BYTES	(NUM_TX_DESC * sizeof(struct TxDesc))
@@ -8316,7 +8315,7 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	dev->ethtool_ops = &rtl8169_ethtool_ops;
 	dev->watchdog_timeo = RTL8169_TX_TIMEOUT;
 
-	netif_napi_add(dev, &tp->napi, rtl8169_poll, R8169_NAPI_WEIGHT);
+	netif_napi_add(dev, &tp->napi, rtl8169_poll, NAPI_POLL_WEIGHT);
 
 	/* don't enable SG, IP_CSUM and TSO by default - it might not work
 	 * properly for all devices */
-- 
cgit v1.2.3-59-g8ed1b


From 9a899a35b0d6b28b31cb04e042a349ef2fe18557 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 17 Apr 2018 23:21:01 +0200
Subject: r8169: switch to napi_schedule_irqoff

napi_schedule() is called from hard irq context, so we can switch to
napi_schedule_irqoff() and avoid some overhead.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index cc88e8ee45c7..34447a2e3763 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -7473,7 +7473,7 @@ static irqreturn_t rtl8169_interrupt(int irq, void *dev_instance)
 			handled = 1;
 
 			rtl_irq_disable(tp);
-			napi_schedule(&tp->napi);
+			napi_schedule_irqoff(&tp->napi);
 		}
 	}
 	return IRQ_RETVAL(handled);
-- 
cgit v1.2.3-59-g8ed1b


From d3b404c29c97ecf56001dc94f94a5b8e5efcd2e3 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 17 Apr 2018 23:22:14 +0200
Subject: r8169: simplify rtl8169_alloc_rx_data

dev->dev.parent has the same value as tp_to_dev(tp)
(set by SET_NETDEV_DEV() in rtl_init_one()) and we know it can't be NULL.
This allows us to simplify the code.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 34447a2e3763..0216ca71d1aa 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -6731,8 +6731,7 @@ static struct sk_buff *rtl8169_alloc_rx_data(struct rtl8169_private *tp,
 	void *data;
 	dma_addr_t mapping;
 	struct device *d = tp_to_dev(tp);
-	struct net_device *dev = tp->dev;
-	int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
+	int node = dev_to_node(d);
 
 	data = kmalloc_node(rx_buf_sz, GFP_KERNEL, node);
 	if (!data)
-- 
cgit v1.2.3-59-g8ed1b


From b1127e641e8986212620111ee5f78a24cbe8ae27 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 17 Apr 2018 23:23:35 +0200
Subject: r8169: improve rtl8169_init_ring

This function doesn't use the net_device, therefore change the
parameter to type struct rtl8169_private * to simplify the code.
In addition we don't need the calculations in the memset
statements, we can use the size of the arrays directly.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 0216ca71d1aa..3971d089ee89 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -6803,14 +6803,12 @@ err_out:
 	return -ENOMEM;
 }
 
-static int rtl8169_init_ring(struct net_device *dev)
+static int rtl8169_init_ring(struct rtl8169_private *tp)
 {
-	struct rtl8169_private *tp = netdev_priv(dev);
-
 	rtl8169_init_ring_indexes(tp);
 
-	memset(tp->tx_skb, 0x0, NUM_TX_DESC * sizeof(struct ring_info));
-	memset(tp->Rx_databuff, 0x0, NUM_RX_DESC * sizeof(void *));
+	memset(tp->tx_skb, 0, sizeof(tp->tx_skb));
+	memset(tp->Rx_databuff, 0, sizeof(tp->Rx_databuff));
 
 	return rtl8169_rx_fill(tp);
 }
@@ -7678,7 +7676,7 @@ static int rtl_open(struct net_device *dev)
 	if (!tp->RxDescArray)
 		goto err_free_tx_0;
 
-	retval = rtl8169_init_ring(dev);
+	retval = rtl8169_init_ring(tp);
 	if (retval < 0)
 		goto err_free_rx_1;
 
-- 
cgit v1.2.3-59-g8ed1b


From 1528192f8975dbe741de9f9d05c492f81ab7fa71 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 17 Apr 2018 23:24:43 +0200
Subject: r8169: remove unneeded check in rtl8169_rx_fill

rtl8169_rx_fill() is called only once and directly before the call
array tp->Rx_databuff[] is filled with zero's. Therefore we don't
need this check.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 3971d089ee89..d2d0940e7253 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -6784,9 +6784,6 @@ static int rtl8169_rx_fill(struct rtl8169_private *tp)
 	for (i = 0; i < NUM_RX_DESC; i++) {
 		void *data;
 
-		if (tp->Rx_databuff[i])
-			continue;
-
 		data = rtl8169_alloc_rx_data(tp, tp->RxDescArray + i);
 		if (!data) {
 			rtl8169_make_unusable_by_asic(tp->RxDescArray + i);
-- 
cgit v1.2.3-59-g8ed1b


From 1d0254dd9b44dd187cbbdfe42f9b2cb6b4aa36a4 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 17 Apr 2018 23:25:46 +0200
Subject: r8169: replace rx_buf_sz with a constant

rx_buf_sz is constant, so we don't have to pass it as parameter and
in general can replace it with a constant.

When working on this I noticed that also before in
rtl_set_rx_max_size() a value of 0x4000 is set, what is not in line
with the chip spec. According to the spec only bits 0..13 are used
and we set an effective value of zero therefore.
However, the driver still seems to work and due to potential side
effects I'm reluctant to make a change.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 37 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index d2d0940e7253..c68771d0e89b 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -88,6 +88,7 @@ static const int multicast_filter_limit = 32;
 #define InterFrameGap	0x03	/* 3 means InterFrameGap = the shortest one */
 
 #define R8169_REGS_SIZE		256
+#define R8169_RX_BUF_SIZE	(SZ_16K - 1)
 #define NUM_TX_DESC	64	/* Number of Tx descriptor registers */
 #define NUM_RX_DESC	256U	/* Number of Rx descriptor registers */
 #define R8169_TX_RING_BYTES	(NUM_TX_DESC * sizeof(struct TxDesc))
@@ -343,7 +344,6 @@ static const struct pci_device_id rtl8169_pci_tbl[] = {
 
 MODULE_DEVICE_TABLE(pci, rtl8169_pci_tbl);
 
-static int rx_buf_sz = 16383;
 static int use_dac = -1;
 static struct {
 	u32 msg_enable;
@@ -5385,10 +5385,10 @@ static u16 rtl_rw_cpluscmd(struct rtl8169_private *tp)
 	return cmd;
 }
 
-static void rtl_set_rx_max_size(struct rtl8169_private *tp, unsigned int rx_buf_sz)
+static void rtl_set_rx_max_size(struct rtl8169_private *tp)
 {
 	/* Low hurts. Let's disable the filtering. */
-	RTL_W16(tp, RxMaxSize, rx_buf_sz + 1);
+	RTL_W16(tp, RxMaxSize, R8169_RX_BUF_SIZE + 1);
 }
 
 static void rtl8169_set_magic_reg(struct rtl8169_private *tp, unsigned mac_version)
@@ -5489,7 +5489,7 @@ static void rtl_hw_start_8169(struct net_device *dev)
 
 	RTL_W8(tp, EarlyTxThres, NoEarlyTx);
 
-	rtl_set_rx_max_size(tp, rx_buf_sz);
+	rtl_set_rx_max_size(tp);
 
 	if (tp->mac_version == RTL_GIGA_MAC_VER_01 ||
 	    tp->mac_version == RTL_GIGA_MAC_VER_02 ||
@@ -6329,7 +6329,7 @@ static void rtl_hw_start_8168(struct net_device *dev)
 
 	RTL_W8(tp, MaxTxPacketSize, TxPacketMax);
 
-	rtl_set_rx_max_size(tp, rx_buf_sz);
+	rtl_set_rx_max_size(tp);
 
 	tp->cp_cmd |= RTL_R16(tp, CPlusCmd) | PktCntrDisable | INTT_1;
 
@@ -6613,7 +6613,7 @@ static void rtl_hw_start_8101(struct net_device *dev)
 
 	RTL_W8(tp, MaxTxPacketSize, TxPacketMax);
 
-	rtl_set_rx_max_size(tp, rx_buf_sz);
+	rtl_set_rx_max_size(tp);
 
 	tp->cp_cmd &= ~R810X_CPCMD_QUIRK_MASK;
 	RTL_W16(tp, CPlusCmd, tp->cp_cmd);
@@ -6695,29 +6695,28 @@ static inline void rtl8169_make_unusable_by_asic(struct RxDesc *desc)
 static void rtl8169_free_rx_databuff(struct rtl8169_private *tp,
 				     void **data_buff, struct RxDesc *desc)
 {
-	dma_unmap_single(tp_to_dev(tp), le64_to_cpu(desc->addr), rx_buf_sz,
-			 DMA_FROM_DEVICE);
+	dma_unmap_single(tp_to_dev(tp), le64_to_cpu(desc->addr),
+			 R8169_RX_BUF_SIZE, DMA_FROM_DEVICE);
 
 	kfree(*data_buff);
 	*data_buff = NULL;
 	rtl8169_make_unusable_by_asic(desc);
 }
 
-static inline void rtl8169_mark_to_asic(struct RxDesc *desc, u32 rx_buf_sz)
+static inline void rtl8169_mark_to_asic(struct RxDesc *desc)
 {
 	u32 eor = le32_to_cpu(desc->opts1) & RingEnd;
 
 	/* Force memory writes to complete before releasing descriptor */
 	dma_wmb();
 
-	desc->opts1 = cpu_to_le32(DescOwn | eor | rx_buf_sz);
+	desc->opts1 = cpu_to_le32(DescOwn | eor | R8169_RX_BUF_SIZE);
 }
 
-static inline void rtl8169_map_to_asic(struct RxDesc *desc, dma_addr_t mapping,
-				       u32 rx_buf_sz)
+static inline void rtl8169_map_to_asic(struct RxDesc *desc, dma_addr_t mapping)
 {
 	desc->addr = cpu_to_le64(mapping);
-	rtl8169_mark_to_asic(desc, rx_buf_sz);
+	rtl8169_mark_to_asic(desc);
 }
 
 static inline void *rtl8169_align(void *data)
@@ -6733,18 +6732,18 @@ static struct sk_buff *rtl8169_alloc_rx_data(struct rtl8169_private *tp,
 	struct device *d = tp_to_dev(tp);
 	int node = dev_to_node(d);
 
-	data = kmalloc_node(rx_buf_sz, GFP_KERNEL, node);
+	data = kmalloc_node(R8169_RX_BUF_SIZE, GFP_KERNEL, node);
 	if (!data)
 		return NULL;
 
 	if (rtl8169_align(data) != data) {
 		kfree(data);
-		data = kmalloc_node(rx_buf_sz + 15, GFP_KERNEL, node);
+		data = kmalloc_node(R8169_RX_BUF_SIZE + 15, GFP_KERNEL, node);
 		if (!data)
 			return NULL;
 	}
 
-	mapping = dma_map_single(d, rtl8169_align(data), rx_buf_sz,
+	mapping = dma_map_single(d, rtl8169_align(data), R8169_RX_BUF_SIZE,
 				 DMA_FROM_DEVICE);
 	if (unlikely(dma_mapping_error(d, mapping))) {
 		if (net_ratelimit())
@@ -6752,7 +6751,7 @@ static struct sk_buff *rtl8169_alloc_rx_data(struct rtl8169_private *tp,
 		goto err_out;
 	}
 
-	rtl8169_map_to_asic(desc, mapping, rx_buf_sz);
+	rtl8169_map_to_asic(desc, mapping);
 	return data;
 
 err_out:
@@ -6864,7 +6863,7 @@ static void rtl_reset_work(struct rtl8169_private *tp)
 	rtl8169_hw_reset(tp);
 
 	for (i = 0; i < NUM_RX_DESC; i++)
-		rtl8169_mark_to_asic(tp->RxDescArray + i, rx_buf_sz);
+		rtl8169_mark_to_asic(tp->RxDescArray + i);
 
 	rtl8169_tx_clear(tp);
 	rtl8169_init_ring_indexes(tp);
@@ -7444,7 +7443,7 @@ process_pkt:
 		}
 release_descriptor:
 		desc->opts2 = 0;
-		rtl8169_mark_to_asic(desc, rx_buf_sz);
+		rtl8169_mark_to_asic(desc);
 	}
 
 	count = cur_rx - tp->cur_rx;
-- 
cgit v1.2.3-59-g8ed1b


From d731af7822cc80ee2735ac2188023c95339c136b Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 17 Apr 2018 23:26:41 +0200
Subject: r8169: remove rtl8169_map_to_asic

This function is very simple and used only once, so we can inline
the two statements.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index c68771d0e89b..347c61521e56 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -6713,12 +6713,6 @@ static inline void rtl8169_mark_to_asic(struct RxDesc *desc)
 	desc->opts1 = cpu_to_le32(DescOwn | eor | R8169_RX_BUF_SIZE);
 }
 
-static inline void rtl8169_map_to_asic(struct RxDesc *desc, dma_addr_t mapping)
-{
-	desc->addr = cpu_to_le64(mapping);
-	rtl8169_mark_to_asic(desc);
-}
-
 static inline void *rtl8169_align(void *data)
 {
 	return (void *)ALIGN((long)data, 16);
@@ -6751,7 +6745,8 @@ static struct sk_buff *rtl8169_alloc_rx_data(struct rtl8169_private *tp,
 		goto err_out;
 	}
 
-	rtl8169_map_to_asic(desc, mapping);
+	desc->addr = cpu_to_le64(mapping);
+	rtl8169_mark_to_asic(desc);
 	return data;
 
 err_out:
-- 
cgit v1.2.3-59-g8ed1b


From 61cb532d5224b388145582dbe40cb0ae7a57c7e6 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 17 Apr 2018 23:27:38 +0200
Subject: r8169: change hw_start argument type

Code can be simplified by changing the argument type of hw_start
callbacks from struct net_device * to struct rtl8169_private *.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 41 +++++++++++++-----------------------
 1 file changed, 15 insertions(+), 26 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 347c61521e56..7a513b97e1e5 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -819,7 +819,7 @@ struct rtl8169_private {
 	int (*get_link_ksettings)(struct net_device *,
 				  struct ethtool_link_ksettings *);
 	void (*phy_reset_enable)(struct rtl8169_private *tp);
-	void (*hw_start)(struct net_device *);
+	void (*hw_start)(struct rtl8169_private *tp);
 	unsigned int (*phy_reset_pending)(struct rtl8169_private *tp);
 	unsigned int (*link_ok)(struct rtl8169_private *tp);
 	int (*do_ioctl)(struct rtl8169_private *tp, struct mii_ioctl_data *data, int cmd);
@@ -5354,12 +5354,9 @@ static void rtl_set_rx_tx_config_registers(struct rtl8169_private *tp)
 		(InterFrameGap << TxInterFrameGapShift));
 }
 
-static void rtl_hw_start(struct net_device *dev)
+static void rtl_hw_start(struct  rtl8169_private *tp)
 {
-	struct rtl8169_private *tp = netdev_priv(dev);
-
-	tp->hw_start(dev);
-
+	tp->hw_start(tp);
 	rtl_irq_enable_all(tp);
 }
 
@@ -5468,14 +5465,11 @@ static void rtl_set_rx_mode(struct net_device *dev)
 	RTL_W32(tp, RxConfig, tmp);
 }
 
-static void rtl_hw_start_8169(struct net_device *dev)
+static void rtl_hw_start_8169(struct rtl8169_private *tp)
 {
-	struct rtl8169_private *tp = netdev_priv(dev);
-	struct pci_dev *pdev = tp->pci_dev;
-
 	if (tp->mac_version == RTL_GIGA_MAC_VER_05) {
 		RTL_W16(tp, CPlusCmd, RTL_R16(tp, CPlusCmd) | PCIMulRW);
-		pci_write_config_byte(pdev, PCI_CACHE_LINE_SIZE, 0x08);
+		pci_write_config_byte(tp->pci_dev, PCI_CACHE_LINE_SIZE, 0x08);
 	}
 
 	RTL_W8(tp, Cfg9346, Cfg9346_Unlock);
@@ -5533,7 +5527,7 @@ static void rtl_hw_start_8169(struct net_device *dev)
 
 	RTL_W32(tp, RxMissed, 0);
 
-	rtl_set_rx_mode(dev);
+	rtl_set_rx_mode(tp->dev);
 
 	/* no early-rx interrupts */
 	RTL_W16(tp, MultiIntr, RTL_R16(tp, MultiIntr) & 0xf000);
@@ -6321,10 +6315,8 @@ static void rtl_hw_start_8168ep_3(struct rtl8169_private *tp)
 	r8168_mac_ocp_write(tp, 0xe860, data);
 }
 
-static void rtl_hw_start_8168(struct net_device *dev)
+static void rtl_hw_start_8168(struct rtl8169_private *tp)
 {
-	struct rtl8169_private *tp = netdev_priv(dev);
-
 	RTL_W8(tp, Cfg9346, Cfg9346_Unlock);
 
 	RTL_W8(tp, MaxTxPacketSize, TxPacketMax);
@@ -6449,7 +6441,7 @@ static void rtl_hw_start_8168(struct net_device *dev)
 
 	default:
 		printk(KERN_ERR PFX "%s: unknown chipset (mac_version = %d).\n",
-			dev->name, tp->mac_version);
+		       tp->dev->name, tp->mac_version);
 		break;
 	}
 
@@ -6457,7 +6449,7 @@ static void rtl_hw_start_8168(struct net_device *dev)
 
 	RTL_W8(tp, ChipCmd, CmdTxEnb | CmdRxEnb);
 
-	rtl_set_rx_mode(dev);
+	rtl_set_rx_mode(tp->dev);
 
 	RTL_W16(tp, MultiIntr, RTL_R16(tp, MultiIntr) & 0xf000);
 }
@@ -6596,17 +6588,14 @@ static void rtl_hw_start_8106(struct rtl8169_private *tp)
 	rtl_pcie_state_l2l3_enable(tp, false);
 }
 
-static void rtl_hw_start_8101(struct net_device *dev)
+static void rtl_hw_start_8101(struct rtl8169_private *tp)
 {
-	struct rtl8169_private *tp = netdev_priv(dev);
-	struct pci_dev *pdev = tp->pci_dev;
-
 	if (tp->mac_version >= RTL_GIGA_MAC_VER_30)
 		tp->event_slow &= ~RxFIFOOver;
 
 	if (tp->mac_version == RTL_GIGA_MAC_VER_13 ||
 	    tp->mac_version == RTL_GIGA_MAC_VER_16)
-		pcie_capability_set_word(pdev, PCI_EXP_DEVCTL,
+		pcie_capability_set_word(tp->pci_dev, PCI_EXP_DEVCTL,
 					 PCI_EXP_DEVCTL_NOSNOOP_EN);
 
 	RTL_W8(tp, Cfg9346, Cfg9346_Unlock);
@@ -6664,7 +6653,7 @@ static void rtl_hw_start_8101(struct net_device *dev)
 
 	RTL_W8(tp, ChipCmd, CmdTxEnb | CmdRxEnb);
 
-	rtl_set_rx_mode(dev);
+	rtl_set_rx_mode(tp->dev);
 
 	RTL_R8(tp, IntrMask);
 
@@ -6864,7 +6853,7 @@ static void rtl_reset_work(struct rtl8169_private *tp)
 	rtl8169_init_ring_indexes(tp);
 
 	napi_enable(&tp->napi);
-	rtl_hw_start(dev);
+	rtl_hw_start(tp);
 	netif_wake_queue(dev);
 	rtl8169_check_link_status(dev, tp);
 }
@@ -7694,7 +7683,7 @@ static int rtl_open(struct net_device *dev)
 
 	rtl_pll_power_up(tp);
 
-	rtl_hw_start(dev);
+	rtl_hw_start(tp);
 
 	if (!rtl8169_init_counter_offsets(dev))
 		netif_warn(tp, hw, dev, "counter reset/update failed\n");
@@ -8001,7 +7990,7 @@ static const struct net_device_ops rtl_netdev_ops = {
 };
 
 static const struct rtl_cfg_info {
-	void (*hw_start)(struct net_device *);
+	void (*hw_start)(struct rtl8169_private *tp);
 	unsigned int region;
 	u16 event_slow;
 	unsigned int has_gmii:1;
-- 
cgit v1.2.3-59-g8ed1b


From e71c9ce262269ed12d042ea56da63a51c97fd880 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 17 Apr 2018 23:28:28 +0200
Subject: r8169: change argument type of counters handling functions

The counter handling functions don't deal with the net_device, so code
can be simplified by changing the argument type to
struct rtl8169_private *.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 32 +++++++++++++-------------------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 7a513b97e1e5..7ee7b67b7a08 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -2151,9 +2151,8 @@ DECLARE_RTL_COND(rtl_counters_cond)
 	return RTL_R32(tp, CounterAddrLow) & (CounterReset | CounterDump);
 }
 
-static bool rtl8169_do_counters(struct net_device *dev, u32 counter_cmd)
+static bool rtl8169_do_counters(struct rtl8169_private *tp, u32 counter_cmd)
 {
-	struct rtl8169_private *tp = netdev_priv(dev);
 	dma_addr_t paddr = tp->counters_phys_addr;
 	u32 cmd;
 
@@ -2166,10 +2165,8 @@ static bool rtl8169_do_counters(struct net_device *dev, u32 counter_cmd)
 	return rtl_udelay_loop_wait_low(tp, &rtl_counters_cond, 10, 1000);
 }
 
-static bool rtl8169_reset_counters(struct net_device *dev)
+static bool rtl8169_reset_counters(struct rtl8169_private *tp)
 {
-	struct rtl8169_private *tp = netdev_priv(dev);
-
 	/*
 	 * Versions prior to RTL_GIGA_MAC_VER_19 don't support resetting the
 	 * tally counters.
@@ -2177,13 +2174,11 @@ static bool rtl8169_reset_counters(struct net_device *dev)
 	if (tp->mac_version < RTL_GIGA_MAC_VER_19)
 		return true;
 
-	return rtl8169_do_counters(dev, CounterReset);
+	return rtl8169_do_counters(tp, CounterReset);
 }
 
-static bool rtl8169_update_counters(struct net_device *dev)
+static bool rtl8169_update_counters(struct rtl8169_private *tp)
 {
-	struct rtl8169_private *tp = netdev_priv(dev);
-
 	/*
 	 * Some chips are unable to dump tally counters when the receiver
 	 * is disabled.
@@ -2191,12 +2186,11 @@ static bool rtl8169_update_counters(struct net_device *dev)
 	if ((RTL_R8(tp, ChipCmd) & CmdRxEnb) == 0)
 		return true;
 
-	return rtl8169_do_counters(dev, CounterDump);
+	return rtl8169_do_counters(tp, CounterDump);
 }
 
-static bool rtl8169_init_counter_offsets(struct net_device *dev)
+static bool rtl8169_init_counter_offsets(struct rtl8169_private *tp)
 {
-	struct rtl8169_private *tp = netdev_priv(dev);
 	struct rtl8169_counters *counters = tp->counters;
 	bool ret = false;
 
@@ -2219,10 +2213,10 @@ static bool rtl8169_init_counter_offsets(struct net_device *dev)
 		return true;
 
 	/* If both, reset and update fail, propagate to caller. */
-	if (rtl8169_reset_counters(dev))
+	if (rtl8169_reset_counters(tp))
 		ret = true;
 
-	if (rtl8169_update_counters(dev))
+	if (rtl8169_update_counters(tp))
 		ret = true;
 
 	tp->tc_offset.tx_errors = counters->tx_errors;
@@ -2245,7 +2239,7 @@ static void rtl8169_get_ethtool_stats(struct net_device *dev,
 	pm_runtime_get_noresume(d);
 
 	if (pm_runtime_active(d))
-		rtl8169_update_counters(dev);
+		rtl8169_update_counters(tp);
 
 	pm_runtime_put_noidle(d);
 
@@ -7601,7 +7595,7 @@ static int rtl8169_close(struct net_device *dev)
 	pm_runtime_get_sync(&pdev->dev);
 
 	/* Update counters before going down */
-	rtl8169_update_counters(dev);
+	rtl8169_update_counters(tp);
 
 	rtl_lock_work(tp);
 	clear_bit(RTL_FLAG_TASK_ENABLED, tp->wk.flags);
@@ -7685,7 +7679,7 @@ static int rtl_open(struct net_device *dev)
 
 	rtl_hw_start(tp);
 
-	if (!rtl8169_init_counter_offsets(dev))
+	if (!rtl8169_init_counter_offsets(tp))
 		netif_warn(tp, hw, dev, "counter reset/update failed\n");
 
 	netif_start_queue(dev);
@@ -7754,7 +7748,7 @@ rtl8169_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	 * from tally counters.
 	 */
 	if (pm_runtime_active(&pdev->dev))
-		rtl8169_update_counters(dev);
+		rtl8169_update_counters(tp);
 
 	/*
 	 * Subtract values fetched during initalization.
@@ -7850,7 +7844,7 @@ static int rtl8169_runtime_suspend(struct device *device)
 
 	/* Update counters before going runtime suspend */
 	rtl8169_rx_missed(dev);
-	rtl8169_update_counters(dev);
+	rtl8169_update_counters(tp);
 
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From ebcd5daa7ffd6e8be53401bfce5143a26543afe5 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 17 Apr 2018 23:29:20 +0200
Subject: r8169: change interrupt handler argument type

Code can be a little simplified by switching the interrupt handler
argument type to struct rtl8169_private *.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 7ee7b67b7a08..ead853ac63de 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -7432,8 +7432,7 @@ release_descriptor:
 
 static irqreturn_t rtl8169_interrupt(int irq, void *dev_instance)
 {
-	struct net_device *dev = dev_instance;
-	struct rtl8169_private *tp = netdev_priv(dev);
+	struct rtl8169_private *tp = dev_instance;
 	int handled = 0;
 	u16 status;
 
@@ -7605,7 +7604,7 @@ static int rtl8169_close(struct net_device *dev)
 
 	cancel_work_sync(&tp->wk.work);
 
-	pci_free_irq(pdev, 0, dev);
+	pci_free_irq(pdev, 0, tp);
 
 	dma_free_coherent(&pdev->dev, R8169_RX_RING_BYTES, tp->RxDescArray,
 			  tp->RxPhyAddr);
@@ -7660,7 +7659,7 @@ static int rtl_open(struct net_device *dev)
 
 	rtl_request_firmware(tp);
 
-	retval = pci_request_irq(pdev, 0, rtl8169_interrupt, NULL, dev,
+	retval = pci_request_irq(pdev, 0, rtl8169_interrupt, NULL, tp,
 				 dev->name);
 	if (retval < 0)
 		goto err_release_fw_2;
-- 
cgit v1.2.3-59-g8ed1b


From 6202806e7c03a116a973f1a5ed770584768129f4 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 17 Apr 2018 23:30:29 +0200
Subject: r8169: drop member opts1_mask from struct rtl8169_private

We can get rid of member opts1_mask and in addition save a few cpu
cycles in the hot path of rtl_rx().

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index ead853ac63de..fe838112143a 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -836,7 +836,6 @@ struct rtl8169_private {
 	struct rtl8169_counters *counters;
 	struct rtl8169_tc_offsets tc_offset;
 	u32 saved_wolopts;
-	u32 opts1_mask;
 
 	struct rtl_fw {
 		const struct firmware *fw;
@@ -7347,7 +7346,7 @@ static int rtl_rx(struct net_device *dev, struct rtl8169_private *tp, u32 budget
 		struct RxDesc *desc = tp->RxDescArray + entry;
 		u32 status;
 
-		status = le32_to_cpu(desc->opts1) & tp->opts1_mask;
+		status = le32_to_cpu(desc->opts1);
 		if (status & DescOwn)
 			break;
 
@@ -7365,14 +7364,16 @@ static int rtl_rx(struct net_device *dev, struct rtl8169_private *tp, u32 budget
 				dev->stats.rx_length_errors++;
 			if (status & RxCRC)
 				dev->stats.rx_crc_errors++;
-			if (status & RxFOVF) {
+			/* RxFOVF is a reserved bit on later chip versions */
+			if (tp->mac_version == RTL_GIGA_MAC_VER_01 &&
+			    status & RxFOVF) {
 				rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_PENDING);
 				dev->stats.rx_fifo_errors++;
-			}
-			if ((status & (RxRUNT | RxCRC)) &&
-			    !(status & (RxRWT | RxFOVF)) &&
-			    (dev->features & NETIF_F_RXALL))
+			} else if (status & (RxRUNT | RxCRC) &&
+				   !(status & RxRWT) &&
+				   dev->features & NETIF_F_RXALL) {
 				goto process_pkt;
+			}
 		} else {
 			struct sk_buff *skb;
 			dma_addr_t addr;
@@ -8327,9 +8328,6 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	tp->event_slow = cfg->event_slow;
 	tp->coalesce_info = cfg->coalesce_info;
 
-	tp->opts1_mask = (tp->mac_version != RTL_GIGA_MAC_VER_01) ?
-		~(RxBOVF | RxFOVF) : ~0;
-
 	timer_setup(&tp->timer, rtl8169_phy_timer, 0);
 
 	tp->rtl_fw = RTL_FIRMWARE_UNKNOWN;
-- 
cgit v1.2.3-59-g8ed1b


From 2d6c5a61fad6c0f0092a333188ded27416471c5e Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 17 Apr 2018 23:31:21 +0200
Subject: r8169: don't display tp->mmio_addr address

For security reasons since commit ad67b74d2469 "printk: hash addresses
printed with %p" %p doesn't display the full address any longer.
We could switch to %px, but I think the pointer address doesn't
provide a real benefit, so remove printing the hashed address.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index fe838112143a..f81abbe2858d 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -8344,8 +8344,8 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (rc < 0)
 		return rc;
 
-	netif_info(tp, probe, dev, "%s at 0x%p, %pM, XID %08x IRQ %d\n",
-		   rtl_chip_infos[chipset].name, tp->mmio_addr, dev->dev_addr,
+	netif_info(tp, probe, dev, "%s, %pM, XID %08x, IRQ %d\n",
+		   rtl_chip_infos[chipset].name, dev->dev_addr,
 		   (u32)(RTL_R32(tp, TxConfig) & 0x9cf0f8ff),
 		   pci_irq_vector(pdev, 0));
 	if (rtl_chip_infos[chipset].jumbo_max != JUMBO_1K) {
-- 
cgit v1.2.3-59-g8ed1b


From 90b989c5532190c6b30a0a2c6dc38beae6578b9c Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 17 Apr 2018 23:32:15 +0200
Subject: r8169: improve rtl8169_get_mac_version

Certain entries in array mac_info[] are redundant, so remove them:
0x7cf, 0x2c200000 (VER 33): matched by entry 0x7c8, 0x2c000000
0x7cf, 0x28300000 (VER 26): matched by entry 0x7c8, 0x28000000
0x7cf, 0x3cb00000 (VER 24): matched by entry 0x7c8, 0x3c800000
0x7cf, 0x3c400000 (VER 22): matched by entry 0x7c8, 0x3c000000
0x7cf, 0x38500000 (VER 17): matched by entry 0x7c8, 0x38000000
0x7cf, 0x44900000 (VER 39): matched by entry 0x7c8, 0x44800000
0x7cf, 0x40b00000 (VER 30): matched by entry 0x7c8, 0x40800000
0x7cf, 0x40a00000 (VER 30): matched by entry 0x7c8, 0x40800000
0x7cf, 0x34a00000 (VER 09): matched by entry 0x7c8, 0x34800000
0x7cf, 0x24a00000 (VER 09): matched by entry 0x7c8, 0x24800000

In addition don't mask out bits 30 and 29 when printing the XID.
Most likely this is a relict from the times when the driver covered
RTL8169 chip version only.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index f81abbe2858d..dbb7ba2c357d 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -2549,12 +2549,10 @@ static void rtl8169_get_mac_version(struct rtl8169_private *tp,
 
 		/* 8168E family. */
 		{ 0x7c800000, 0x2c800000,	RTL_GIGA_MAC_VER_34 },
-		{ 0x7cf00000, 0x2c200000,	RTL_GIGA_MAC_VER_33 },
 		{ 0x7cf00000, 0x2c100000,	RTL_GIGA_MAC_VER_32 },
 		{ 0x7c800000, 0x2c000000,	RTL_GIGA_MAC_VER_33 },
 
 		/* 8168D family. */
-		{ 0x7cf00000, 0x28300000,	RTL_GIGA_MAC_VER_26 },
 		{ 0x7cf00000, 0x28100000,	RTL_GIGA_MAC_VER_25 },
 		{ 0x7c800000, 0x28000000,	RTL_GIGA_MAC_VER_26 },
 
@@ -2564,32 +2562,24 @@ static void rtl8169_get_mac_version(struct rtl8169_private *tp,
 		{ 0x7cf00000, 0x28b00000,	RTL_GIGA_MAC_VER_31 },
 
 		/* 8168C family. */
-		{ 0x7cf00000, 0x3cb00000,	RTL_GIGA_MAC_VER_24 },
 		{ 0x7cf00000, 0x3c900000,	RTL_GIGA_MAC_VER_23 },
 		{ 0x7cf00000, 0x3c800000,	RTL_GIGA_MAC_VER_18 },
 		{ 0x7c800000, 0x3c800000,	RTL_GIGA_MAC_VER_24 },
 		{ 0x7cf00000, 0x3c000000,	RTL_GIGA_MAC_VER_19 },
 		{ 0x7cf00000, 0x3c200000,	RTL_GIGA_MAC_VER_20 },
 		{ 0x7cf00000, 0x3c300000,	RTL_GIGA_MAC_VER_21 },
-		{ 0x7cf00000, 0x3c400000,	RTL_GIGA_MAC_VER_22 },
 		{ 0x7c800000, 0x3c000000,	RTL_GIGA_MAC_VER_22 },
 
 		/* 8168B family. */
 		{ 0x7cf00000, 0x38000000,	RTL_GIGA_MAC_VER_12 },
-		{ 0x7cf00000, 0x38500000,	RTL_GIGA_MAC_VER_17 },
 		{ 0x7c800000, 0x38000000,	RTL_GIGA_MAC_VER_17 },
 		{ 0x7c800000, 0x30000000,	RTL_GIGA_MAC_VER_11 },
 
 		/* 8101 family. */
-		{ 0x7cf00000, 0x44900000,	RTL_GIGA_MAC_VER_39 },
 		{ 0x7c800000, 0x44800000,	RTL_GIGA_MAC_VER_39 },
 		{ 0x7c800000, 0x44000000,	RTL_GIGA_MAC_VER_37 },
-		{ 0x7cf00000, 0x40b00000,	RTL_GIGA_MAC_VER_30 },
-		{ 0x7cf00000, 0x40a00000,	RTL_GIGA_MAC_VER_30 },
 		{ 0x7cf00000, 0x40900000,	RTL_GIGA_MAC_VER_29 },
 		{ 0x7c800000, 0x40800000,	RTL_GIGA_MAC_VER_30 },
-		{ 0x7cf00000, 0x34a00000,	RTL_GIGA_MAC_VER_09 },
-		{ 0x7cf00000, 0x24a00000,	RTL_GIGA_MAC_VER_09 },
 		{ 0x7cf00000, 0x34900000,	RTL_GIGA_MAC_VER_08 },
 		{ 0x7cf00000, 0x24900000,	RTL_GIGA_MAC_VER_08 },
 		{ 0x7cf00000, 0x34800000,	RTL_GIGA_MAC_VER_07 },
@@ -8346,7 +8336,7 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	netif_info(tp, probe, dev, "%s, %pM, XID %08x, IRQ %d\n",
 		   rtl_chip_infos[chipset].name, dev->dev_addr,
-		   (u32)(RTL_R32(tp, TxConfig) & 0x9cf0f8ff),
+		   (u32)(RTL_R32(tp, TxConfig) & 0xfcf0f8ff),
 		   pci_irq_vector(pdev, 0));
 	if (rtl_chip_infos[chipset].jumbo_max != JUMBO_1K) {
 		netif_info(tp, probe, dev, "jumbo features [frames: %d bytes, "
-- 
cgit v1.2.3-59-g8ed1b


From a4328ddb55f27b7ed0ec0aa7e9509869bbbbf097 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 17 Apr 2018 23:33:03 +0200
Subject: r8169: drop member txd_version from struct rtl8169_private

txd_version is used in rtl_init_one() only, so we can drop member
txd_version from struct rtl8169_private.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index dbb7ba2c357d..ccc57683ed1b 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -776,7 +776,6 @@ struct rtl8169_private {
 	struct net_device *dev;
 	struct napi_struct napi;
 	u32 msg_enable;
-	u16 txd_version;
 	u16 mac_version;
 	u32 cur_rx; /* Index into the Rx descriptor buffer of next Rx pkt. */
 	u32 cur_tx; /* Index into the Tx descriptor buffer of next Rx pkt. */
@@ -8214,7 +8213,6 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	rtl8169_print_mac_version(tp);
 
 	chipset = tp->mac_version;
-	tp->txd_version = rtl_chip_infos[chipset].txd_version;
 
 	rc = rtl_alloc_irq(tp);
 	if (rc < 0) {
@@ -8299,13 +8297,17 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 		/* Disallow toggling */
 		dev->hw_features &= ~NETIF_F_HW_VLAN_CTAG_RX;
 
-	if (tp->txd_version == RTL_TD_0)
+	switch (rtl_chip_infos[chipset].txd_version) {
+	case RTL_TD_0:
 		tp->tso_csum = rtl8169_tso_csum_v1;
-	else if (tp->txd_version == RTL_TD_1) {
+		break;
+	case RTL_TD_1:
 		tp->tso_csum = rtl8169_tso_csum_v2;
 		dev->hw_features |= NETIF_F_IPV6_CSUM | NETIF_F_TSO6;
-	} else
+		break;
+	default:
 		WARN_ON_ONCE(1);
+	}
 
 	dev->hw_features |= NETIF_F_RXALL;
 	dev->hw_features |= NETIF_F_RXFCS;
-- 
cgit v1.2.3-59-g8ed1b


From c8d48d9c5bf7a0e884c17867eb8e0b30b7eae139 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 17 Apr 2018 23:34:22 +0200
Subject: r8169: improve pci region handling

The region to be used is always the first of type IORESOURCE_MEM.
We can implement this rule directly w/o having to specify which
region is the first one per configuration entry.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index ccc57683ed1b..94e91d3c2233 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -7974,7 +7974,6 @@ static const struct net_device_ops rtl_netdev_ops = {
 
 static const struct rtl_cfg_info {
 	void (*hw_start)(struct rtl8169_private *tp);
-	unsigned int region;
 	u16 event_slow;
 	unsigned int has_gmii:1;
 	const struct rtl_coalesce_info *coalesce_info;
@@ -7982,7 +7981,6 @@ static const struct rtl_cfg_info {
 } rtl_cfg_infos [] = {
 	[RTL_CFG_0] = {
 		.hw_start	= rtl_hw_start_8169,
-		.region		= 1,
 		.event_slow	= SYSErr | LinkChg | RxOverflow | RxFIFOOver,
 		.has_gmii	= 1,
 		.coalesce_info	= rtl_coalesce_info_8169,
@@ -7990,7 +7988,6 @@ static const struct rtl_cfg_info {
 	},
 	[RTL_CFG_1] = {
 		.hw_start	= rtl_hw_start_8168,
-		.region		= 2,
 		.event_slow	= SYSErr | LinkChg | RxOverflow,
 		.has_gmii	= 1,
 		.coalesce_info	= rtl_coalesce_info_8168_8136,
@@ -7998,7 +7995,6 @@ static const struct rtl_cfg_info {
 	},
 	[RTL_CFG_2] = {
 		.hw_start	= rtl_hw_start_8101,
-		.region		= 2,
 		.event_slow	= SYSErr | LinkChg | RxOverflow | RxFIFOOver |
 				  PCSTimeout,
 		.coalesce_info	= rtl_coalesce_info_8168_8136,
@@ -8098,11 +8094,10 @@ static void rtl_hw_initialize(struct rtl8169_private *tp)
 static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
 	const struct rtl_cfg_info *cfg = rtl_cfg_infos + ent->driver_data;
-	const unsigned int region = cfg->region;
 	struct rtl8169_private *tp;
 	struct mii_if_info *mii;
 	struct net_device *dev;
-	int chipset, i;
+	int chipset, region, i;
 	int rc;
 
 	if (netif_msg_drv(&debug)) {
@@ -8144,11 +8139,10 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (pcim_set_mwi(pdev) < 0)
 		netif_info(tp, probe, dev, "Mem-Wr-Inval unavailable\n");
 
-	/* make sure PCI base addr 1 is MMIO */
-	if (!(pci_resource_flags(pdev, region) & IORESOURCE_MEM)) {
-		netif_err(tp, probe, dev,
-			  "region #%d not an MMIO resource, aborting\n",
-			  region);
+	/* use first MMIO region */
+	region = ffs(pci_select_bars(pdev, IORESOURCE_MEM)) - 1;
+	if (region < 0) {
+		netif_err(tp, probe, dev, "no MMIO resource found\n");
 		return -ENODEV;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 6ed0e08f9e2a90ab455d897e4679b903c82dac67 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 17 Apr 2018 23:36:12 +0200
Subject: r8169: remove jumbo_tx_csum from chip config struct

According to the chip configuration entries only RTL8169 (ver <= 06)
supports tx checksumming for jumbo packets.
By the way: constant JUMBO_1K is a little misleading because it refers
to the standard packet size and not to a jumbo packet size.

By implementing this rule we can get rid of configuring tx checksumming
support per chip type.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 133 ++++++++++++++---------------------
 1 file changed, 54 insertions(+), 79 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 94e91d3c2233..fcd42d0387b9 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -171,12 +171,11 @@ enum rtl_tx_desc_version {
 #define JUMBO_7K	(7*1024 - ETH_HLEN - 2)
 #define JUMBO_9K	(9*1024 - ETH_HLEN - 2)
 
-#define _R(NAME,TD,FW,SZ,B) {	\
+#define _R(NAME,TD,FW,SZ) {	\
 	.name = NAME,		\
 	.txd_version = TD,	\
 	.fw_name = FW,		\
 	.jumbo_max = SZ,	\
-	.jumbo_tx_csum = B	\
 }
 
 static const struct {
@@ -184,135 +183,111 @@ static const struct {
 	enum rtl_tx_desc_version txd_version;
 	const char *fw_name;
 	u16 jumbo_max;
-	bool jumbo_tx_csum;
 } rtl_chip_infos[] = {
 	/* PCI devices. */
 	[RTL_GIGA_MAC_VER_01] =
-		_R("RTL8169",		RTL_TD_0, NULL, JUMBO_7K, true),
+		_R("RTL8169",		RTL_TD_0, NULL, JUMBO_7K),
 	[RTL_GIGA_MAC_VER_02] =
-		_R("RTL8169s",		RTL_TD_0, NULL, JUMBO_7K, true),
+		_R("RTL8169s",		RTL_TD_0, NULL, JUMBO_7K),
 	[RTL_GIGA_MAC_VER_03] =
-		_R("RTL8110s",		RTL_TD_0, NULL, JUMBO_7K, true),
+		_R("RTL8110s",		RTL_TD_0, NULL, JUMBO_7K),
 	[RTL_GIGA_MAC_VER_04] =
-		_R("RTL8169sb/8110sb",	RTL_TD_0, NULL, JUMBO_7K, true),
+		_R("RTL8169sb/8110sb",	RTL_TD_0, NULL, JUMBO_7K),
 	[RTL_GIGA_MAC_VER_05] =
-		_R("RTL8169sc/8110sc",	RTL_TD_0, NULL, JUMBO_7K, true),
+		_R("RTL8169sc/8110sc",	RTL_TD_0, NULL, JUMBO_7K),
 	[RTL_GIGA_MAC_VER_06] =
-		_R("RTL8169sc/8110sc",	RTL_TD_0, NULL, JUMBO_7K, true),
+		_R("RTL8169sc/8110sc",	RTL_TD_0, NULL, JUMBO_7K),
 	/* PCI-E devices. */
 	[RTL_GIGA_MAC_VER_07] =
-		_R("RTL8102e",		RTL_TD_1, NULL, JUMBO_1K, true),
+		_R("RTL8102e",		RTL_TD_1, NULL, JUMBO_1K),
 	[RTL_GIGA_MAC_VER_08] =
-		_R("RTL8102e",		RTL_TD_1, NULL, JUMBO_1K, true),
+		_R("RTL8102e",		RTL_TD_1, NULL, JUMBO_1K),
 	[RTL_GIGA_MAC_VER_09] =
-		_R("RTL8102e",		RTL_TD_1, NULL, JUMBO_1K, true),
+		_R("RTL8102e",		RTL_TD_1, NULL, JUMBO_1K),
 	[RTL_GIGA_MAC_VER_10] =
-		_R("RTL8101e",		RTL_TD_0, NULL, JUMBO_1K, true),
+		_R("RTL8101e",		RTL_TD_0, NULL, JUMBO_1K),
 	[RTL_GIGA_MAC_VER_11] =
-		_R("RTL8168b/8111b",	RTL_TD_0, NULL, JUMBO_4K, false),
+		_R("RTL8168b/8111b",	RTL_TD_0, NULL, JUMBO_4K),
 	[RTL_GIGA_MAC_VER_12] =
-		_R("RTL8168b/8111b",	RTL_TD_0, NULL, JUMBO_4K, false),
+		_R("RTL8168b/8111b",	RTL_TD_0, NULL, JUMBO_4K),
 	[RTL_GIGA_MAC_VER_13] =
-		_R("RTL8101e",		RTL_TD_0, NULL, JUMBO_1K, true),
+		_R("RTL8101e",		RTL_TD_0, NULL, JUMBO_1K),
 	[RTL_GIGA_MAC_VER_14] =
-		_R("RTL8100e",		RTL_TD_0, NULL, JUMBO_1K, true),
+		_R("RTL8100e",		RTL_TD_0, NULL, JUMBO_1K),
 	[RTL_GIGA_MAC_VER_15] =
-		_R("RTL8100e",		RTL_TD_0, NULL, JUMBO_1K, true),
+		_R("RTL8100e",		RTL_TD_0, NULL, JUMBO_1K),
 	[RTL_GIGA_MAC_VER_16] =
-		_R("RTL8101e",		RTL_TD_0, NULL, JUMBO_1K, true),
+		_R("RTL8101e",		RTL_TD_0, NULL, JUMBO_1K),
 	[RTL_GIGA_MAC_VER_17] =
-		_R("RTL8168b/8111b",	RTL_TD_0, NULL, JUMBO_4K, false),
+		_R("RTL8168b/8111b",	RTL_TD_0, NULL, JUMBO_4K),
 	[RTL_GIGA_MAC_VER_18] =
-		_R("RTL8168cp/8111cp",	RTL_TD_1, NULL, JUMBO_6K, false),
+		_R("RTL8168cp/8111cp",	RTL_TD_1, NULL, JUMBO_6K),
 	[RTL_GIGA_MAC_VER_19] =
-		_R("RTL8168c/8111c",	RTL_TD_1, NULL, JUMBO_6K, false),
+		_R("RTL8168c/8111c",	RTL_TD_1, NULL, JUMBO_6K),
 	[RTL_GIGA_MAC_VER_20] =
-		_R("RTL8168c/8111c",	RTL_TD_1, NULL, JUMBO_6K, false),
+		_R("RTL8168c/8111c",	RTL_TD_1, NULL, JUMBO_6K),
 	[RTL_GIGA_MAC_VER_21] =
-		_R("RTL8168c/8111c",	RTL_TD_1, NULL, JUMBO_6K, false),
+		_R("RTL8168c/8111c",	RTL_TD_1, NULL, JUMBO_6K),
 	[RTL_GIGA_MAC_VER_22] =
-		_R("RTL8168c/8111c",	RTL_TD_1, NULL, JUMBO_6K, false),
+		_R("RTL8168c/8111c",	RTL_TD_1, NULL, JUMBO_6K),
 	[RTL_GIGA_MAC_VER_23] =
-		_R("RTL8168cp/8111cp",	RTL_TD_1, NULL, JUMBO_6K, false),
+		_R("RTL8168cp/8111cp",	RTL_TD_1, NULL, JUMBO_6K),
 	[RTL_GIGA_MAC_VER_24] =
-		_R("RTL8168cp/8111cp",	RTL_TD_1, NULL, JUMBO_6K, false),
+		_R("RTL8168cp/8111cp",	RTL_TD_1, NULL, JUMBO_6K),
 	[RTL_GIGA_MAC_VER_25] =
-		_R("RTL8168d/8111d",	RTL_TD_1, FIRMWARE_8168D_1,
-							JUMBO_9K, false),
+		_R("RTL8168d/8111d",	RTL_TD_1, FIRMWARE_8168D_1, JUMBO_9K),
 	[RTL_GIGA_MAC_VER_26] =
-		_R("RTL8168d/8111d",	RTL_TD_1, FIRMWARE_8168D_2,
-							JUMBO_9K, false),
+		_R("RTL8168d/8111d",	RTL_TD_1, FIRMWARE_8168D_2, JUMBO_9K),
 	[RTL_GIGA_MAC_VER_27] =
-		_R("RTL8168dp/8111dp",	RTL_TD_1, NULL, JUMBO_9K, false),
+		_R("RTL8168dp/8111dp",	RTL_TD_1, NULL, JUMBO_9K),
 	[RTL_GIGA_MAC_VER_28] =
-		_R("RTL8168dp/8111dp",	RTL_TD_1, NULL, JUMBO_9K, false),
+		_R("RTL8168dp/8111dp",	RTL_TD_1, NULL, JUMBO_9K),
 	[RTL_GIGA_MAC_VER_29] =
-		_R("RTL8105e",		RTL_TD_1, FIRMWARE_8105E_1,
-							JUMBO_1K, true),
+		_R("RTL8105e",		RTL_TD_1, FIRMWARE_8105E_1, JUMBO_1K),
 	[RTL_GIGA_MAC_VER_30] =
-		_R("RTL8105e",		RTL_TD_1, FIRMWARE_8105E_1,
-							JUMBO_1K, true),
+		_R("RTL8105e",		RTL_TD_1, FIRMWARE_8105E_1, JUMBO_1K),
 	[RTL_GIGA_MAC_VER_31] =
-		_R("RTL8168dp/8111dp",	RTL_TD_1, NULL, JUMBO_9K, false),
+		_R("RTL8168dp/8111dp",	RTL_TD_1, NULL, JUMBO_9K),
 	[RTL_GIGA_MAC_VER_32] =
-		_R("RTL8168e/8111e",	RTL_TD_1, FIRMWARE_8168E_1,
-							JUMBO_9K, false),
+		_R("RTL8168e/8111e",	RTL_TD_1, FIRMWARE_8168E_1, JUMBO_9K),
 	[RTL_GIGA_MAC_VER_33] =
-		_R("RTL8168e/8111e",	RTL_TD_1, FIRMWARE_8168E_2,
-							JUMBO_9K, false),
+		_R("RTL8168e/8111e",	RTL_TD_1, FIRMWARE_8168E_2, JUMBO_9K),
 	[RTL_GIGA_MAC_VER_34] =
-		_R("RTL8168evl/8111evl",RTL_TD_1, FIRMWARE_8168E_3,
-							JUMBO_9K, false),
+		_R("RTL8168evl/8111evl",RTL_TD_1, FIRMWARE_8168E_3, JUMBO_9K),
 	[RTL_GIGA_MAC_VER_35] =
-		_R("RTL8168f/8111f",	RTL_TD_1, FIRMWARE_8168F_1,
-							JUMBO_9K, false),
+		_R("RTL8168f/8111f",	RTL_TD_1, FIRMWARE_8168F_1, JUMBO_9K),
 	[RTL_GIGA_MAC_VER_36] =
-		_R("RTL8168f/8111f",	RTL_TD_1, FIRMWARE_8168F_2,
-							JUMBO_9K, false),
+		_R("RTL8168f/8111f",	RTL_TD_1, FIRMWARE_8168F_2, JUMBO_9K),
 	[RTL_GIGA_MAC_VER_37] =
-		_R("RTL8402",		RTL_TD_1, FIRMWARE_8402_1,
-							JUMBO_1K, true),
+		_R("RTL8402",		RTL_TD_1, FIRMWARE_8402_1,  JUMBO_1K),
 	[RTL_GIGA_MAC_VER_38] =
-		_R("RTL8411",		RTL_TD_1, FIRMWARE_8411_1,
-							JUMBO_9K, false),
+		_R("RTL8411",		RTL_TD_1, FIRMWARE_8411_1,  JUMBO_9K),
 	[RTL_GIGA_MAC_VER_39] =
-		_R("RTL8106e",		RTL_TD_1, FIRMWARE_8106E_1,
-							JUMBO_1K, true),
+		_R("RTL8106e",		RTL_TD_1, FIRMWARE_8106E_1, JUMBO_1K),
 	[RTL_GIGA_MAC_VER_40] =
-		_R("RTL8168g/8111g",	RTL_TD_1, FIRMWARE_8168G_2,
-							JUMBO_9K, false),
+		_R("RTL8168g/8111g",	RTL_TD_1, FIRMWARE_8168G_2, JUMBO_9K),
 	[RTL_GIGA_MAC_VER_41] =
-		_R("RTL8168g/8111g",	RTL_TD_1, NULL, JUMBO_9K, false),
+		_R("RTL8168g/8111g",	RTL_TD_1, NULL, JUMBO_9K),
 	[RTL_GIGA_MAC_VER_42] =
-		_R("RTL8168g/8111g",	RTL_TD_1, FIRMWARE_8168G_3,
-							JUMBO_9K, false),
+		_R("RTL8168g/8111g",	RTL_TD_1, FIRMWARE_8168G_3, JUMBO_9K),
 	[RTL_GIGA_MAC_VER_43] =
-		_R("RTL8106e",		RTL_TD_1, FIRMWARE_8106E_2,
-							JUMBO_1K, true),
+		_R("RTL8106e",		RTL_TD_1, FIRMWARE_8106E_2, JUMBO_1K),
 	[RTL_GIGA_MAC_VER_44] =
-		_R("RTL8411",		RTL_TD_1, FIRMWARE_8411_2,
-							JUMBO_9K, false),
+		_R("RTL8411",		RTL_TD_1, FIRMWARE_8411_2,  JUMBO_9K),
 	[RTL_GIGA_MAC_VER_45] =
-		_R("RTL8168h/8111h",	RTL_TD_1, FIRMWARE_8168H_1,
-							JUMBO_9K, false),
+		_R("RTL8168h/8111h",	RTL_TD_1, FIRMWARE_8168H_1, JUMBO_9K),
 	[RTL_GIGA_MAC_VER_46] =
-		_R("RTL8168h/8111h",	RTL_TD_1, FIRMWARE_8168H_2,
-							JUMBO_9K, false),
+		_R("RTL8168h/8111h",	RTL_TD_1, FIRMWARE_8168H_2, JUMBO_9K),
 	[RTL_GIGA_MAC_VER_47] =
-		_R("RTL8107e",		RTL_TD_1, FIRMWARE_8107E_1,
-							JUMBO_1K, false),
+		_R("RTL8107e",		RTL_TD_1, FIRMWARE_8107E_1, JUMBO_1K),
 	[RTL_GIGA_MAC_VER_48] =
-		_R("RTL8107e",		RTL_TD_1, FIRMWARE_8107E_2,
-							JUMBO_1K, false),
+		_R("RTL8107e",		RTL_TD_1, FIRMWARE_8107E_2, JUMBO_1K),
 	[RTL_GIGA_MAC_VER_49] =
-		_R("RTL8168ep/8111ep",	RTL_TD_1, NULL,
-							JUMBO_9K, false),
+		_R("RTL8168ep/8111ep",	RTL_TD_1, NULL, JUMBO_9K),
 	[RTL_GIGA_MAC_VER_50] =
-		_R("RTL8168ep/8111ep",	RTL_TD_1, NULL,
-							JUMBO_9K, false),
+		_R("RTL8168ep/8111ep",	RTL_TD_1, NULL, JUMBO_9K),
 	[RTL_GIGA_MAC_VER_51] =
-		_R("RTL8168ep/8111ep",	RTL_TD_1, NULL,
-							JUMBO_9K, false),
+		_R("RTL8168ep/8111ep",	RTL_TD_1, NULL, JUMBO_9K),
 };
 #undef _R
 
@@ -1954,7 +1929,7 @@ static netdev_features_t rtl8169_fix_features(struct net_device *dev,
 		features &= ~NETIF_F_ALL_TSO;
 
 	if (dev->mtu > JUMBO_1K &&
-	    !rtl_chip_infos[tp->mac_version].jumbo_tx_csum)
+	    tp->mac_version > RTL_GIGA_MAC_VER_06)
 		features &= ~NETIF_F_IP_CSUM;
 
 	return features;
@@ -8338,7 +8313,7 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 		netif_info(tp, probe, dev, "jumbo features [frames: %d bytes, "
 			   "tx checksumming: %s]\n",
 			   rtl_chip_infos[chipset].jumbo_max,
-			   rtl_chip_infos[chipset].jumbo_tx_csum ? "ok" : "ko");
+			  tp->mac_version <= RTL_GIGA_MAC_VER_06 ? "ok" : "ko");
 	}
 
 	if (r8168_check_dash(tp))
-- 
cgit v1.2.3-59-g8ed1b


From 0fe554a46a0ff855376053c7e4204673b7879f05 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Tue, 17 Apr 2018 14:25:30 -0700
Subject: hv_netvsc: propogate Hyper-V friendly name into interface alias

This patch implement the 'Device Naming' feature of the Hyper-V
network device API. In Hyper-V on the host through the GUI or PowerShell
it is possible to enable the device naming feature which causes
the host to make available to the guest the name of the device.
This shows up in the RNDIS protocol as the friendly name.

The name has no particular meaning and is limited to 256 characters.
The value can only be set via PowerShell on the host, but could
be scripted for mass deployments. The default value is the
string 'Network Adapter' and since that is the same for all devices
and useless, the driver ignores it.

In Windows, the value goes into a registry key for use in SNMP
ifAlias. For Linux, this patch puts the value in the network
device alias property; where it is visible in ip tools and SNMP.

The host provided ifAlias is just a suggestion, and can be
overridden by later ip commands.

Also requires exporting dev_set_alias in netdev core.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/rndis_filter.c | 28 ++++++++++++++++++++++++++++
 net/core/dev.c                    |  1 +
 2 files changed, 29 insertions(+)

diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index 6b127be781d9..3b6dbacaf77d 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -29,6 +29,7 @@
 #include <linux/nls.h>
 #include <linux/vmalloc.h>
 #include <linux/rtnetlink.h>
+#include <linux/ucs2_string.h>
 
 #include "hyperv_net.h"
 #include "netvsc_trace.h"
@@ -1223,6 +1224,29 @@ static int rndis_netdev_set_hwcaps(struct rndis_device *rndis_device,
 	return ret;
 }
 
+static void rndis_get_friendly_name(struct net_device *net,
+				    struct rndis_device *rndis_device,
+				    struct netvsc_device *net_device)
+{
+	ucs2_char_t wname[256];
+	unsigned long len;
+	u8 ifalias[256];
+	u32 size;
+
+	size = sizeof(wname);
+	if (rndis_filter_query_device(rndis_device, net_device,
+				      RNDIS_OID_GEN_FRIENDLY_NAME,
+				      wname, &size) != 0)
+		return;
+
+	/* Convert Windows Unicode string to UTF-8 */
+	len = ucs2_as_utf8(ifalias, wname, sizeof(ifalias));
+
+	/* ignore the default value from host */
+	if (strcmp(ifalias, "Network Adapter") != 0)
+		dev_set_alias(net, ifalias, len);
+}
+
 struct netvsc_device *rndis_filter_device_add(struct hv_device *dev,
 				      struct netvsc_device_info *device_info)
 {
@@ -1276,6 +1300,10 @@ struct netvsc_device *rndis_filter_device_add(struct hv_device *dev,
 
 	memcpy(device_info->mac_adr, rndis_device->hw_mac_adr, ETH_ALEN);
 
+	/* Get friendly name as ifalias*/
+	if (!net->ifalias)
+		rndis_get_friendly_name(net, rndis_device, net_device);
+
 	/* Query and set hardware capabilities */
 	ret = rndis_netdev_set_hwcaps(rndis_device, net_device);
 	if (ret != 0)
diff --git a/net/core/dev.c b/net/core/dev.c
index 969462ebb296..a490ef643586 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1285,6 +1285,7 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 
 	return len;
 }
+EXPORT_SYMBOL(dev_set_alias);
 
 /**
  *	dev_get_alias - get ifalias of a device
-- 
cgit v1.2.3-59-g8ed1b


From 0dcec221dd39d4d48fe6b22b4cacf2f49b95ffa0 Mon Sep 17 00:00:00 2001
From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Tue, 17 Apr 2018 15:31:47 -0700
Subject: hv_netvsc: Add NetVSP v6 and v6.1 into version negotiation

This patch adds the NetVSP v6 and 6.1 message structures, and includes
these versions into NetVSC/NetVSP version negotiation process.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/hyperv_net.h | 164 ++++++++++++++++++++++++++++++++++++++++
 drivers/net/hyperv/netvsc.c     |   3 +-
 2 files changed, 166 insertions(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 960f06141472..6ebe39a3dde6 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -237,6 +237,8 @@ void netvsc_switch_datapath(struct net_device *nv_dev, bool vf);
 #define NVSP_PROTOCOL_VERSION_2		0x30002
 #define NVSP_PROTOCOL_VERSION_4		0x40000
 #define NVSP_PROTOCOL_VERSION_5		0x50000
+#define NVSP_PROTOCOL_VERSION_6		0x60000
+#define NVSP_PROTOCOL_VERSION_61	0x60001
 
 enum {
 	NVSP_MSG_TYPE_NONE = 0,
@@ -308,6 +310,12 @@ enum {
 	NVSP_MSG5_TYPE_SEND_INDIRECTION_TABLE,
 
 	NVSP_MSG5_MAX = NVSP_MSG5_TYPE_SEND_INDIRECTION_TABLE,
+
+	/* Version 6 messages */
+	NVSP_MSG6_TYPE_PD_API,
+	NVSP_MSG6_TYPE_PD_POST_BATCH,
+
+	NVSP_MSG6_MAX = NVSP_MSG6_TYPE_PD_POST_BATCH
 };
 
 enum {
@@ -619,12 +627,168 @@ union nvsp_5_message_uber {
 	struct nvsp_5_send_indirect_table send_table;
 } __packed;
 
+enum nvsp_6_pd_api_op {
+	PD_API_OP_CONFIG = 1,
+	PD_API_OP_SW_DATAPATH, /* Switch Datapath */
+	PD_API_OP_OPEN_PROVIDER,
+	PD_API_OP_CLOSE_PROVIDER,
+	PD_API_OP_CREATE_QUEUE,
+	PD_API_OP_FLUSH_QUEUE,
+	PD_API_OP_FREE_QUEUE,
+	PD_API_OP_ALLOC_COM_BUF, /* Allocate Common Buffer */
+	PD_API_OP_FREE_COM_BUF, /* Free Common Buffer */
+	PD_API_OP_MAX
+};
+
+struct grp_affinity {
+	u64 mask;
+	u16 grp;
+	u16 reserved[3];
+} __packed;
+
+struct nvsp_6_pd_api_req {
+	u32 op;
+
+	union {
+		/* MMIO information is sent from the VM to VSP */
+		struct __packed {
+			u64 mmio_pa; /* MMIO Physical Address */
+			u32 mmio_len;
+
+			/* Number of PD queues a VM can support */
+			u16 num_subchn;
+		} config;
+
+		/* Switch Datapath */
+		struct __packed {
+			/* Host Datapath Is PacketDirect */
+			u8 host_dpath_is_pd;
+
+			/* Guest PacketDirect Is Enabled */
+			u8 guest_pd_enabled;
+		} sw_dpath;
+
+		/* Open Provider*/
+		struct __packed {
+			u32 prov_id; /* Provider id */
+			u32 flag;
+		} open_prov;
+
+		/* Close Provider */
+		struct __packed {
+			u32 prov_id;
+		} cls_prov;
+
+		/* Create Queue*/
+		struct __packed {
+			u32 prov_id;
+			u16 q_id;
+			u16 q_size;
+			u8 is_recv_q;
+			u8 is_rss_q;
+			u32 recv_data_len;
+			struct grp_affinity affy;
+		} cr_q;
+
+		/* Delete Queue*/
+		struct __packed {
+			u32 prov_id;
+			u16 q_id;
+		} del_q;
+
+		/* Flush Queue */
+		struct __packed {
+			u32 prov_id;
+			u16 q_id;
+		} flush_q;
+
+		/* Allocate Common Buffer */
+		struct __packed {
+			u32 len;
+			u32 pf_node; /* Preferred Node */
+			u16 region_id;
+		} alloc_com_buf;
+
+		/* Free Common Buffer */
+		struct __packed {
+			u32 len;
+			u64 pa; /* Physical Address */
+			u32 pf_node; /* Preferred Node */
+			u16 region_id;
+			u8 cache_type;
+		} free_com_buf;
+	} __packed;
+} __packed;
+
+struct nvsp_6_pd_api_comp {
+	u32 op;
+	u32 status;
+
+	union {
+		struct __packed {
+			/* actual number of PD queues allocated to the VM */
+			u16 num_pd_q;
+
+			/* Num Receive Rss PD Queues */
+			u8 num_rss_q;
+
+			u8 is_supported; /* Is supported by VSP */
+			u8 is_enabled; /* Is enabled by VSP */
+		} config;
+
+		/* Open Provider */
+		struct __packed {
+			u32 prov_id;
+		} open_prov;
+
+		/* Create Queue */
+		struct __packed {
+			u32 prov_id;
+			u16 q_id;
+			u16 q_size;
+			u32 recv_data_len;
+			struct grp_affinity affy;
+		} cr_q;
+
+		/* Allocate Common Buffer */
+		struct __packed {
+			u64 pa; /* Physical Address */
+			u32 len;
+			u32 pf_node; /* Preferred Node */
+			u16 region_id;
+			u8 cache_type;
+		} alloc_com_buf;
+	} __packed;
+} __packed;
+
+struct nvsp_6_pd_buf {
+	u32 region_offset;
+	u16 region_id;
+	u16 is_partial:1;
+	u16 reserved:15;
+} __packed;
+
+struct nvsp_6_pd_batch_msg {
+	struct nvsp_message_header hdr;
+	u16 count;
+	u16 guest2host:1;
+	u16 is_recv:1;
+	u16 reserved:14;
+	struct nvsp_6_pd_buf pd_buf[0];
+} __packed;
+
+union nvsp_6_message_uber {
+	struct nvsp_6_pd_api_req pd_req;
+	struct nvsp_6_pd_api_comp pd_comp;
+} __packed;
+
 union nvsp_all_messages {
 	union nvsp_message_init_uber init_msg;
 	union nvsp_1_message_uber v1_msg;
 	union nvsp_2_message_uber v2_msg;
 	union nvsp_4_message_uber v4_msg;
 	union nvsp_5_message_uber v5_msg;
+	union nvsp_6_message_uber v6_msg;
 } __packed;
 
 /* ALL Messages */
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 04f611e6f678..e7308958b7a9 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -525,7 +525,8 @@ static int netvsc_connect_vsp(struct hv_device *device,
 	struct net_device *ndev = hv_get_drvdata(device);
 	static const u32 ver_list[] = {
 		NVSP_PROTOCOL_VERSION_1, NVSP_PROTOCOL_VERSION_2,
-		NVSP_PROTOCOL_VERSION_4, NVSP_PROTOCOL_VERSION_5
+		NVSP_PROTOCOL_VERSION_4, NVSP_PROTOCOL_VERSION_5,
+		NVSP_PROTOCOL_VERSION_6, NVSP_PROTOCOL_VERSION_61
 	};
 	struct nvsp_message *init_packet;
 	int ndis_version, i, ret;
-- 
cgit v1.2.3-59-g8ed1b


From 415787d7799f4fccbe8d49cb0b8e5811be6b0389 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 17 Apr 2018 18:11:44 -0700
Subject: ipv6: frags: fix a lockdep false positive

lockdep does not know that the locks used by IPv4 defrag
and IPv6 reassembly units are of different classes.

It complains because of following chains :

1) sch_direct_xmit()        (lock txq->_xmit_lock)
    dev_hard_start_xmit()
     xmit_one()
      dev_queue_xmit_nit()
       packet_rcv_fanout()
        ip_check_defrag()
         ip_defrag()
          spin_lock()     (lock frag queue spinlock)

2) ip6_input_finish()
    ipv6_frag_rcv()       (lock frag queue spinlock)
     ip6_frag_queue()
      icmpv6_param_prob() (lock txq->_xmit_lock at some point)

We could add lockdep annotations, but we also can make sure IPv6
calls icmpv6_param_prob() only after the release of the frag queue spinlock,
since this naturally makes frag queue spinlock a leaf in lock hierarchy.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/reassembly.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 2cdf3dcf8c2c..b939b94e7e91 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -163,7 +163,8 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif)
 }
 
 static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
-			   struct frag_hdr *fhdr, int nhoff)
+			  struct frag_hdr *fhdr, int nhoff,
+			  u32 *prob_offset)
 {
 	struct sk_buff *prev, *next;
 	struct net_device *dev;
@@ -179,11 +180,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
 			((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1)));
 
 	if ((unsigned int)end > IPV6_MAXPLEN) {
-		__IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev),
-				IPSTATS_MIB_INHDRERRORS);
-		icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
-				  ((u8 *)&fhdr->frag_off -
-				   skb_network_header(skb)));
+		*prob_offset = (u8 *)&fhdr->frag_off - skb_network_header(skb);
 		return -1;
 	}
 
@@ -214,10 +211,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
 			/* RFC2460 says always send parameter problem in
 			 * this case. -DaveM
 			 */
-			__IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev),
-					IPSTATS_MIB_INHDRERRORS);
-			icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
-					  offsetof(struct ipv6hdr, payload_len));
+			*prob_offset = offsetof(struct ipv6hdr, payload_len);
 			return -1;
 		}
 		if (end > fq->q.len) {
@@ -519,15 +513,22 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 	iif = skb->dev ? skb->dev->ifindex : 0;
 	fq = fq_find(net, fhdr->identification, hdr, iif);
 	if (fq) {
+		u32 prob_offset = 0;
 		int ret;
 
 		spin_lock(&fq->q.lock);
 
 		fq->iif = iif;
-		ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff);
+		ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff,
+				     &prob_offset);
 
 		spin_unlock(&fq->q.lock);
 		inet_frag_put(&fq->q);
+		if (prob_offset) {
+			__IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev),
+					IPSTATS_MIB_INHDRERRORS);
+			icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, prob_offset);
+		}
 		return ret;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From ce20cdf498bdfa6c6130be75cf4359dad305ebcd Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Mon, 9 Apr 2018 16:40:34 +0200
Subject: netfilter: xt_NFLOG: use nf_log_packet instead of nfulnl_log_packet.

The nfulnl_log_packet() is added to make sure that the NFLOG target
works as only user-space logger. but now, nf_log_packet() can find proper
log function using NF_LOG_TYPE_ULOG and NF_LOG_TYPE_LOG.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nfnetlink_log.h | 17 -----------------
 net/netfilter/nfnetlink_log.c         |  8 +++-----
 net/netfilter/xt_NFLOG.c              | 15 +++++++++++----
 3 files changed, 14 insertions(+), 26 deletions(-)

diff --git a/include/net/netfilter/nfnetlink_log.h b/include/net/netfilter/nfnetlink_log.h
index 612cfb63ac68..ea32a7d3cf1b 100644
--- a/include/net/netfilter/nfnetlink_log.h
+++ b/include/net/netfilter/nfnetlink_log.h
@@ -1,18 +1 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _KER_NFNETLINK_LOG_H
-#define _KER_NFNETLINK_LOG_H
-
-void
-nfulnl_log_packet(struct net *net,
-		  u_int8_t pf,
-		  unsigned int hooknum,
-		  const struct sk_buff *skb,
-		  const struct net_device *in,
-		  const struct net_device *out,
-		  const struct nf_loginfo *li_user,
-		  const char *prefix);
-
-#define NFULNL_COPY_DISABLED    0xff
-
-#endif /* _KER_NFNETLINK_LOG_H */
-
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 7b46aa4c478d..e5cc4d9b9ce7 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -37,7 +37,6 @@
 #include <net/sock.h>
 #include <net/netfilter/nf_log.h>
 #include <net/netns/generic.h>
-#include <net/netfilter/nfnetlink_log.h>
 
 #include <linux/atomic.h>
 #include <linux/refcount.h>
@@ -47,6 +46,7 @@
 #include "../bridge/br_private.h"
 #endif
 
+#define NFULNL_COPY_DISABLED	0xff
 #define NFULNL_NLBUFSIZ_DEFAULT	NLMSG_GOODSIZE
 #define NFULNL_TIMEOUT_DEFAULT 	100	/* every second */
 #define NFULNL_QTHRESH_DEFAULT 	100	/* 100 packets */
@@ -618,7 +618,7 @@ static const struct nf_loginfo default_loginfo = {
 };
 
 /* log handler for internal netfilter logging api */
-void
+static void
 nfulnl_log_packet(struct net *net,
 		  u_int8_t pf,
 		  unsigned int hooknum,
@@ -633,7 +633,7 @@ nfulnl_log_packet(struct net *net,
 	struct nfulnl_instance *inst;
 	const struct nf_loginfo *li;
 	unsigned int qthreshold;
-	unsigned int plen;
+	unsigned int plen = 0;
 	struct nfnl_log_net *log = nfnl_log_pernet(net);
 	const struct nfnl_ct_hook *nfnl_ct = NULL;
 	struct nf_conn *ct = NULL;
@@ -648,7 +648,6 @@ nfulnl_log_packet(struct net *net,
 	if (!inst)
 		return;
 
-	plen = 0;
 	if (prefix)
 		plen = strlen(prefix) + 1;
 
@@ -760,7 +759,6 @@ alloc_failure:
 	/* FIXME: statistics */
 	goto unlock_and_release;
 }
-EXPORT_SYMBOL_GPL(nfulnl_log_packet);
 
 static int
 nfulnl_rcv_nl_event(struct notifier_block *this,
diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c
index c7f8958cea4a..1ed0cac585c4 100644
--- a/net/netfilter/xt_NFLOG.c
+++ b/net/netfilter/xt_NFLOG.c
@@ -13,7 +13,6 @@
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_NFLOG.h>
 #include <net/netfilter/nf_log.h>
-#include <net/netfilter/nfnetlink_log.h>
 
 MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
 MODULE_DESCRIPTION("Xtables: packet logging to netlink using NFLOG");
@@ -37,8 +36,9 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	if (info->flags & XT_NFLOG_F_COPY_LEN)
 		li.u.ulog.flags |= NF_LOG_F_COPY_LEN;
 
-	nfulnl_log_packet(net, xt_family(par), xt_hooknum(par), skb,
-			  xt_in(par), xt_out(par), &li, info->prefix);
+	nf_log_packet(net, xt_family(par), xt_hooknum(par), skb, xt_in(par),
+		      xt_out(par), &li, "%s", info->prefix);
+
 	return XT_CONTINUE;
 }
 
@@ -50,7 +50,13 @@ static int nflog_tg_check(const struct xt_tgchk_param *par)
 		return -EINVAL;
 	if (info->prefix[sizeof(info->prefix) - 1] != '\0')
 		return -EINVAL;
-	return 0;
+
+	return nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG);
+}
+
+static void nflog_tg_destroy(const struct xt_tgdtor_param *par)
+{
+	nf_logger_put(par->family, NF_LOG_TYPE_ULOG);
 }
 
 static struct xt_target nflog_tg_reg __read_mostly = {
@@ -58,6 +64,7 @@ static struct xt_target nflog_tg_reg __read_mostly = {
 	.revision   = 0,
 	.family     = NFPROTO_UNSPEC,
 	.checkentry = nflog_tg_check,
+	.destroy    = nflog_tg_destroy,
 	.target     = nflog_tg,
 	.targetsize = sizeof(struct xt_nflog_info),
 	.me         = THIS_MODULE,
-- 
cgit v1.2.3-59-g8ed1b


From e4a2eb8995365858f0dad6985b119313f0a4624f Mon Sep 17 00:00:00 2001
From: Bjoern Johansson <bjoernj@google.com>
Date: Wed, 11 Apr 2018 11:41:56 -0700
Subject: mac80211_hwsim: indicate support for powersave.

Without this, higher layers in the kernel will return an error
code when trying to set the power state because the driver
doesn't indicate power state support. This in turn causes VTS
(Android Vendor Test Suite) failures because the WiFi HAL can't
enable power saving mode.

Signed-off-by: Bjoern Johansson <bjoernj@google.com>
Signed-off-by: Lingfeng Yang <lfy@google.com>
Signed-off-by: Roman Kiryanov <rkir@google.com>
[johannes: remove remaining code, it was useless even as a skeleton
 since it didn't even have the right function arguments]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/mac80211_hwsim.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 96d26cfae90b..7bfbc5916e9a 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -2650,6 +2650,7 @@ static int mac80211_hwsim_new_radio(struct genl_info *info,
 	ieee80211_hw_set(hw, AMPDU_AGGREGATION);
 	ieee80211_hw_set(hw, MFP_CAPABLE);
 	ieee80211_hw_set(hw, SIGNAL_DBM);
+	ieee80211_hw_set(hw, SUPPORTS_PS);
 	ieee80211_hw_set(hw, TDLS_WIDER_BW);
 	if (rctbl)
 		ieee80211_hw_set(hw, SUPPORTS_RC_TABLE);
-- 
cgit v1.2.3-59-g8ed1b


From 8db0c433692eda3a0358d72643bd0268b736767c Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen <toke@toke.dk>
Date: Thu, 19 Apr 2018 11:17:38 +0200
Subject: regulatory: Rename confusing 'country IE' in log output
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The 'country IE' messages in the log can be confusing and make people think
that the country code has been set to Ireland. Fix this by changing the
log messages to use 'country element' instead (as they are no longer called
'information element' in the spec anyway).

Reported-by: Bernhard Gabler <Bernhard_Gabler@web.de>
Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/reg.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 16c7e4ef5820..ecfee5f06c76 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -1652,7 +1652,7 @@ const char *reg_initiator_name(enum nl80211_reg_initiator initiator)
 	case NL80211_REGDOM_SET_BY_DRIVER:
 		return "driver";
 	case NL80211_REGDOM_SET_BY_COUNTRY_IE:
-		return "country IE";
+		return "country element";
 	default:
 		WARN_ON(1);
 		return "bug";
@@ -2618,7 +2618,7 @@ reg_process_hint_country_ie(struct wiphy *wiphy,
 		 * This doesn't happen yet, not sure we
 		 * ever want to support it for this case.
 		 */
-		WARN_ONCE(1, "Unexpected intersection for country IEs");
+		WARN_ONCE(1, "Unexpected intersection for country elements");
 		return REG_REQ_IGNORE;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 235b9c42766129b2534d1dd4021e04d39128901a Mon Sep 17 00:00:00 2001
From: Venkateswara Naralasetty <vnaralas@codeaurora.org>
Date: Tue, 10 Apr 2018 18:01:05 +0300
Subject: ath10k: Add tx ack signal support for management frames

This patch add support to get RSSI from acknowledgment
frames for transmitted management frames.

hardware_used: QCA4019, QCA9984.
firmware version: 10.4-3.5.3-00052.

Signed-off-by: Venkateswara Naralasetty <vnaralas@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/core.h   |  2 ++
 drivers/net/wireless/ath/ath10k/htt.h    | 10 +++++++++-
 drivers/net/wireless/ath/ath10k/htt_rx.c | 10 ++++++++++
 drivers/net/wireless/ath/ath10k/txrx.c   |  8 ++++++++
 drivers/net/wireless/ath/ath10k/wmi.h    |  7 +++++++
 5 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath10k/core.h b/drivers/net/wireless/ath/ath10k/core.h
index c17d805d68cc..e4ac8f2831fd 100644
--- a/drivers/net/wireless/ath/ath10k/core.h
+++ b/drivers/net/wireless/ath/ath10k/core.h
@@ -52,6 +52,8 @@
 /* Antenna noise floor */
 #define ATH10K_DEFAULT_NOISE_FLOOR -95
 
+#define ATH10K_INVALID_RSSI 128
+
 #define ATH10K_MAX_NUM_MGMT_PENDING 128
 
 /* number of failed packets (20 packets with 16 sw reties each) */
diff --git a/drivers/net/wireless/ath/ath10k/htt.h b/drivers/net/wireless/ath/ath10k/htt.h
index 8cc2a8b278e4..e5dbb0bdd787 100644
--- a/drivers/net/wireless/ath/ath10k/htt.h
+++ b/drivers/net/wireless/ath/ath10k/htt.h
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2005-2011 Atheros Communications Inc.
  * Copyright (c) 2011-2017 Qualcomm Atheros, Inc.
+ * Copyright (c) 2018, The Linux Foundation. All rights reserved.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -533,12 +534,18 @@ struct htt_ver_resp {
 	u8 rsvd0;
 } __packed;
 
+#define HTT_MGMT_TX_CMPL_FLAG_ACK_RSSI BIT(0)
+
+#define HTT_MGMT_TX_CMPL_INFO_ACK_RSSI_MASK	GENMASK(7, 0)
+
 struct htt_mgmt_tx_completion {
 	u8 rsvd0;
 	u8 rsvd1;
-	u8 rsvd2;
+	u8 flags;
 	__le32 desc_id;
 	__le32 status;
+	__le32 ppdu_id;
+	__le32 info;
 } __packed;
 
 #define HTT_RX_INDICATION_INFO0_EXT_TID_MASK  (0x1F)
@@ -1648,6 +1655,7 @@ struct htt_resp {
 struct htt_tx_done {
 	u16 msdu_id;
 	u16 status;
+	u8 ack_rssi;
 };
 
 enum htt_tx_compl_state {
diff --git a/drivers/net/wireless/ath/ath10k/htt_rx.c b/drivers/net/wireless/ath/ath10k/htt_rx.c
index 5e02e26158f6..0315bdad7f85 100644
--- a/drivers/net/wireless/ath/ath10k/htt_rx.c
+++ b/drivers/net/wireless/ath/ath10k/htt_rx.c
@@ -25,6 +25,7 @@
 #include "mac.h"
 
 #include <linux/log2.h>
+#include <linux/bitfield.h>
 
 /* when under memory pressure rx ring refill may fail and needs a retry */
 #define HTT_RX_RING_REFILL_RETRY_MS 50
@@ -2719,12 +2720,21 @@ bool ath10k_htt_t2h_msg_handler(struct ath10k *ar, struct sk_buff *skb)
 	case HTT_T2H_MSG_TYPE_MGMT_TX_COMPLETION: {
 		struct htt_tx_done tx_done = {};
 		int status = __le32_to_cpu(resp->mgmt_tx_completion.status);
+		int info = __le32_to_cpu(resp->mgmt_tx_completion.info);
 
 		tx_done.msdu_id = __le32_to_cpu(resp->mgmt_tx_completion.desc_id);
 
 		switch (status) {
 		case HTT_MGMT_TX_STATUS_OK:
 			tx_done.status = HTT_TX_COMPL_STATE_ACK;
+			if (test_bit(WMI_SERVICE_HTT_MGMT_TX_COMP_VALID_FLAGS,
+				     ar->wmi.svc_map) &&
+			    (resp->mgmt_tx_completion.flags &
+			     HTT_MGMT_TX_CMPL_FLAG_ACK_RSSI)) {
+				tx_done.ack_rssi =
+				FIELD_GET(HTT_MGMT_TX_CMPL_INFO_ACK_RSSI_MASK,
+					  info);
+			}
 			break;
 		case HTT_MGMT_TX_STATUS_RETRY:
 			tx_done.status = HTT_TX_COMPL_STATE_NOACK;
diff --git a/drivers/net/wireless/ath/ath10k/txrx.c b/drivers/net/wireless/ath/ath10k/txrx.c
index 70e23bbf7171..cda164f6e9f6 100644
--- a/drivers/net/wireless/ath/ath10k/txrx.c
+++ b/drivers/net/wireless/ath/ath10k/txrx.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2005-2011 Atheros Communications Inc.
  * Copyright (c) 2011-2016 Qualcomm Atheros, Inc.
+ * Copyright (c) 2018, The Linux Foundation. All rights reserved.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -119,6 +120,13 @@ int ath10k_txrx_tx_unref(struct ath10k_htt *htt,
 			info->flags &= ~IEEE80211_TX_STAT_ACK;
 	}
 
+	if (tx_done->status == HTT_TX_COMPL_STATE_ACK &&
+	    tx_done->ack_rssi != ATH10K_INVALID_RSSI) {
+		info->status.ack_signal = ATH10K_DEFAULT_NOISE_FLOOR +
+						tx_done->ack_rssi;
+		info->status.is_valid_ack_signal = true;
+	}
+
 	ieee80211_tx_status(htt->ar->hw, msdu);
 	/* we do not own the msdu anymore */
 
diff --git a/drivers/net/wireless/ath/ath10k/wmi.h b/drivers/net/wireless/ath/ath10k/wmi.h
index 6fbc84c29521..3cc129d812a4 100644
--- a/drivers/net/wireless/ath/ath10k/wmi.h
+++ b/drivers/net/wireless/ath/ath10k/wmi.h
@@ -201,6 +201,7 @@ enum wmi_service {
 	WMI_SERVICE_HTT_MGMT_TX_COMP_VALID_FLAGS,
 	WMI_SERVICE_HOST_DFS_CHECK_SUPPORT,
 	WMI_SERVICE_TPC_STATS_FINAL,
+	WMI_SERVICE_RESET_CHIP,
 
 	/* keep last */
 	WMI_SERVICE_MAX,
@@ -238,6 +239,8 @@ enum wmi_10x_service {
 	WMI_10X_SERVICE_MESH,
 	WMI_10X_SERVICE_EXT_RES_CFG_SUPPORT,
 	WMI_10X_SERVICE_PEER_STATS,
+	WMI_10X_SERVICE_RESET_CHIP,
+	WMI_10X_SERVICE_HTT_MGMT_TX_COMP_VALID_FLAGS,
 };
 
 enum wmi_main_service {
@@ -548,6 +551,10 @@ static inline void wmi_10x_svc_map(const __le32 *in, unsigned long *out,
 	       WMI_SERVICE_EXT_RES_CFG_SUPPORT, len);
 	SVCMAP(WMI_10X_SERVICE_PEER_STATS,
 	       WMI_SERVICE_PEER_STATS, len);
+	SVCMAP(WMI_10X_SERVICE_RESET_CHIP,
+	       WMI_SERVICE_RESET_CHIP, len);
+	SVCMAP(WMI_10X_SERVICE_HTT_MGMT_TX_COMP_VALID_FLAGS,
+	       WMI_SERVICE_HTT_MGMT_TX_COMP_VALID_FLAGS, len);
 }
 
 static inline void wmi_main_svc_map(const __le32 *in, unsigned long *out,
-- 
cgit v1.2.3-59-g8ed1b


From a0aedd6e0b5755a4c1cf288744a0fbef667f9f8f Mon Sep 17 00:00:00 2001
From: Govind Singh <govinds@codeaurora.org>
Date: Tue, 10 Apr 2018 18:01:11 +0300
Subject: ath10k: build ce layer in ath10k core module

CE layer is shared between pci and snoc target and results
in duplicate object inclusion if both modules are compiled
together statically and undefined KBUILD_MODNAME if
compiled as module.

Fix this by building ce layer in ath10k core module by
adding ce object inclusion with ATH10K_CE boolean CONFIG.

Signed-off-by: Govind Singh <govinds@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/Kconfig  |  4 ++++
 drivers/net/wireless/ath/ath10k/Makefile |  4 ++--
 drivers/net/wireless/ath/ath10k/ce.c     | 22 ++++++++++++++++++++++
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/Kconfig b/drivers/net/wireless/ath/ath10k/Kconfig
index deb5ae21a559..d266f2792dae 100644
--- a/drivers/net/wireless/ath/ath10k/Kconfig
+++ b/drivers/net/wireless/ath/ath10k/Kconfig
@@ -4,12 +4,16 @@ config ATH10K
 	select ATH_COMMON
 	select CRC32
 	select WANT_DEV_COREDUMP
+	select ATH10K_CE
         ---help---
           This module adds support for wireless adapters based on
           Atheros IEEE 802.11ac family of chipsets.
 
           If you choose to build a module, it'll be called ath10k.
 
+config ATH10K_CE
+	bool
+
 config ATH10K_PCI
 	tristate "Atheros ath10k PCI support"
 	depends on ATH10K && PCI
diff --git a/drivers/net/wireless/ath/ath10k/Makefile b/drivers/net/wireless/ath/ath10k/Makefile
index 6739ac26fd29..536f3dfb9364 100644
--- a/drivers/net/wireless/ath/ath10k/Makefile
+++ b/drivers/net/wireless/ath/ath10k/Makefile
@@ -22,10 +22,10 @@ ath10k_core-$(CONFIG_THERMAL) += thermal.o
 ath10k_core-$(CONFIG_MAC80211_DEBUGFS) += debugfs_sta.o
 ath10k_core-$(CONFIG_PM) += wow.o
 ath10k_core-$(CONFIG_DEV_COREDUMP) += coredump.o
+ath10k_core-$(CONFIG_ATH10K_CE) += ce.o
 
 obj-$(CONFIG_ATH10K_PCI) += ath10k_pci.o
-ath10k_pci-y += pci.o \
-		ce.o
+ath10k_pci-y += pci.o
 
 ath10k_pci-$(CONFIG_ATH10K_AHB) += ahb.o
 
diff --git a/drivers/net/wireless/ath/ath10k/ce.c b/drivers/net/wireless/ath/ath10k/ce.c
index b9def7bace2f..e7e7b342e5b8 100644
--- a/drivers/net/wireless/ath/ath10k/ce.c
+++ b/drivers/net/wireless/ath/ath10k/ce.c
@@ -464,6 +464,7 @@ int ath10k_ce_send_nolock(struct ath10k_ce_pipe *ce_state,
 	return ce_state->ops->ce_send_nolock(ce_state, per_transfer_context,
 				    buffer, nbytes, transfer_id, flags);
 }
+EXPORT_SYMBOL(ath10k_ce_send_nolock);
 
 void __ath10k_ce_send_revert(struct ath10k_ce_pipe *pipe)
 {
@@ -491,6 +492,7 @@ void __ath10k_ce_send_revert(struct ath10k_ce_pipe *pipe)
 
 	src_ring->per_transfer_context[src_ring->write_index] = NULL;
 }
+EXPORT_SYMBOL(__ath10k_ce_send_revert);
 
 int ath10k_ce_send(struct ath10k_ce_pipe *ce_state,
 		   void *per_transfer_context,
@@ -510,6 +512,7 @@ int ath10k_ce_send(struct ath10k_ce_pipe *ce_state,
 
 	return ret;
 }
+EXPORT_SYMBOL(ath10k_ce_send);
 
 int ath10k_ce_num_free_src_entries(struct ath10k_ce_pipe *pipe)
 {
@@ -525,6 +528,7 @@ int ath10k_ce_num_free_src_entries(struct ath10k_ce_pipe *pipe)
 
 	return delta;
 }
+EXPORT_SYMBOL(ath10k_ce_num_free_src_entries);
 
 int __ath10k_ce_rx_num_free_bufs(struct ath10k_ce_pipe *pipe)
 {
@@ -539,6 +543,7 @@ int __ath10k_ce_rx_num_free_bufs(struct ath10k_ce_pipe *pipe)
 
 	return CE_RING_DELTA(nentries_mask, write_index, sw_index - 1);
 }
+EXPORT_SYMBOL(__ath10k_ce_rx_num_free_bufs);
 
 static int __ath10k_ce_rx_post_buf(struct ath10k_ce_pipe *pipe, void *ctx,
 				   dma_addr_t paddr)
@@ -622,6 +627,7 @@ void ath10k_ce_rx_update_write_idx(struct ath10k_ce_pipe *pipe, u32 nentries)
 	ath10k_ce_dest_ring_write_index_set(ar, ctrl_addr, write_index);
 	dest_ring->write_index = write_index;
 }
+EXPORT_SYMBOL(ath10k_ce_rx_update_write_idx);
 
 int ath10k_ce_rx_post_buf(struct ath10k_ce_pipe *pipe, void *ctx,
 			  dma_addr_t paddr)
@@ -636,6 +642,7 @@ int ath10k_ce_rx_post_buf(struct ath10k_ce_pipe *pipe, void *ctx,
 
 	return ret;
 }
+EXPORT_SYMBOL(ath10k_ce_rx_post_buf);
 
 /*
  * Guts of ath10k_ce_completed_recv_next.
@@ -748,6 +755,7 @@ int ath10k_ce_completed_recv_next_nolock(struct ath10k_ce_pipe *ce_state,
 							    per_transfer_ctx,
 							    nbytesp);
 }
+EXPORT_SYMBOL(ath10k_ce_completed_recv_next_nolock);
 
 int ath10k_ce_completed_recv_next(struct ath10k_ce_pipe *ce_state,
 				  void **per_transfer_contextp,
@@ -766,6 +774,7 @@ int ath10k_ce_completed_recv_next(struct ath10k_ce_pipe *ce_state,
 
 	return ret;
 }
+EXPORT_SYMBOL(ath10k_ce_completed_recv_next);
 
 static int _ath10k_ce_revoke_recv_next(struct ath10k_ce_pipe *ce_state,
 				       void **per_transfer_contextp,
@@ -882,6 +891,7 @@ int ath10k_ce_revoke_recv_next(struct ath10k_ce_pipe *ce_state,
 						  per_transfer_contextp,
 						  bufferp);
 }
+EXPORT_SYMBOL(ath10k_ce_revoke_recv_next);
 
 /*
  * Guts of ath10k_ce_completed_send_next.
@@ -936,6 +946,7 @@ int ath10k_ce_completed_send_next_nolock(struct ath10k_ce_pipe *ce_state,
 
 	return 0;
 }
+EXPORT_SYMBOL(ath10k_ce_completed_send_next_nolock);
 
 static void ath10k_ce_extract_desc_data(struct ath10k *ar,
 					struct ath10k_ce_ring *src_ring,
@@ -1025,6 +1036,7 @@ int ath10k_ce_cancel_send_next(struct ath10k_ce_pipe *ce_state,
 
 	return ret;
 }
+EXPORT_SYMBOL(ath10k_ce_cancel_send_next);
 
 int ath10k_ce_completed_send_next(struct ath10k_ce_pipe *ce_state,
 				  void **per_transfer_contextp)
@@ -1040,6 +1052,7 @@ int ath10k_ce_completed_send_next(struct ath10k_ce_pipe *ce_state,
 
 	return ret;
 }
+EXPORT_SYMBOL(ath10k_ce_completed_send_next);
 
 /*
  * Guts of interrupt handler for per-engine interrupts on a particular CE.
@@ -1078,6 +1091,7 @@ void ath10k_ce_per_engine_service(struct ath10k *ar, unsigned int ce_id)
 
 	spin_unlock_bh(&ce->ce_lock);
 }
+EXPORT_SYMBOL(ath10k_ce_per_engine_service);
 
 /*
  * Handler for per-engine interrupts on ALL active CEs.
@@ -1102,6 +1116,7 @@ void ath10k_ce_per_engine_service_any(struct ath10k *ar)
 		ath10k_ce_per_engine_service(ar, ce_id);
 	}
 }
+EXPORT_SYMBOL(ath10k_ce_per_engine_service_any);
 
 /*
  * Adjust interrupts for the copy complete handler.
@@ -1139,6 +1154,7 @@ int ath10k_ce_disable_interrupts(struct ath10k *ar)
 
 	return 0;
 }
+EXPORT_SYMBOL(ath10k_ce_disable_interrupts);
 
 void ath10k_ce_enable_interrupts(struct ath10k *ar)
 {
@@ -1154,6 +1170,7 @@ void ath10k_ce_enable_interrupts(struct ath10k *ar)
 		ath10k_ce_per_engine_handler_adjust(ce_state);
 	}
 }
+EXPORT_SYMBOL(ath10k_ce_enable_interrupts);
 
 static int ath10k_ce_init_src_ring(struct ath10k *ar,
 				   unsigned int ce_id,
@@ -1454,6 +1471,7 @@ int ath10k_ce_init_pipe(struct ath10k *ar, unsigned int ce_id,
 
 	return 0;
 }
+EXPORT_SYMBOL(ath10k_ce_init_pipe);
 
 static void ath10k_ce_deinit_src_ring(struct ath10k *ar, unsigned int ce_id)
 {
@@ -1479,6 +1497,7 @@ void ath10k_ce_deinit_pipe(struct ath10k *ar, unsigned int ce_id)
 	ath10k_ce_deinit_src_ring(ar, ce_id);
 	ath10k_ce_deinit_dest_ring(ar, ce_id);
 }
+EXPORT_SYMBOL(ath10k_ce_deinit_pipe);
 
 static void _ath10k_ce_free_pipe(struct ath10k *ar, int ce_id)
 {
@@ -1545,6 +1564,7 @@ void ath10k_ce_free_pipe(struct ath10k *ar, int ce_id)
 
 	ce_state->ops->ce_free_pipe(ar, ce_id);
 }
+EXPORT_SYMBOL(ath10k_ce_free_pipe);
 
 void ath10k_ce_dump_registers(struct ath10k *ar,
 			      struct ath10k_fw_crash_data *crash_data)
@@ -1584,6 +1604,7 @@ void ath10k_ce_dump_registers(struct ath10k *ar,
 
 	spin_unlock_bh(&ce->ce_lock);
 }
+EXPORT_SYMBOL(ath10k_ce_dump_registers);
 
 static const struct ath10k_ce_ops ce_ops = {
 	.ce_alloc_src_ring = ath10k_ce_alloc_src_ring,
@@ -1680,3 +1701,4 @@ int ath10k_ce_alloc_pipe(struct ath10k *ar, int ce_id,
 
 	return 0;
 }
+EXPORT_SYMBOL(ath10k_ce_alloc_pipe);
-- 
cgit v1.2.3-59-g8ed1b


From 17f5559e0da51a8780cd93206689f05feca46615 Mon Sep 17 00:00:00 2001
From: Govind Singh <govinds@codeaurora.org>
Date: Tue, 10 Apr 2018 18:01:14 +0300
Subject: ath10k: platform driver for WCN3990 SNOC WLAN module

WCN3990 is integrated 802.11ac chipset with SNOC
bus interface. Add snoc layer driver registration
and associated ops.

WCN3990 support is not yet complete as cold-boot
handshake is done using qmi(Qualcomm-MSM-Interface)
and qmi client support will be added once qmi framework
is available.

Signed-off-by: Govind Singh <govinds@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/Kconfig  |   8 ++
 drivers/net/wireless/ath/ath10k/Makefile |   3 +
 drivers/net/wireless/ath/ath10k/snoc.c   | 153 +++++++++++++++++++++++++++++++
 drivers/net/wireless/ath/ath10k/snoc.h   |  76 +++++++++++++++
 4 files changed, 240 insertions(+)
 create mode 100644 drivers/net/wireless/ath/ath10k/snoc.c
 create mode 100644 drivers/net/wireless/ath/ath10k/snoc.h

diff --git a/drivers/net/wireless/ath/ath10k/Kconfig b/drivers/net/wireless/ath/ath10k/Kconfig
index d266f2792dae..84f071ac0d84 100644
--- a/drivers/net/wireless/ath/ath10k/Kconfig
+++ b/drivers/net/wireless/ath/ath10k/Kconfig
@@ -40,6 +40,14 @@ config ATH10K_USB
 	  This module adds experimental support for USB bus. Currently
 	  work in progress and will not fully work.
 
+config ATH10K_SNOC
+        tristate "Qualcomm ath10k SNOC support (EXPERIMENTAL)"
+        depends on ATH10K && ARCH_QCOM
+        ---help---
+          This module adds support for integrated WCN3990 chip connected
+          to system NOC(SNOC). Currently work in progress and will not
+          fully work.
+
 config ATH10K_DEBUG
 	bool "Atheros ath10k debugging"
 	depends on ATH10K
diff --git a/drivers/net/wireless/ath/ath10k/Makefile b/drivers/net/wireless/ath/ath10k/Makefile
index 536f3dfb9364..44d60a61b242 100644
--- a/drivers/net/wireless/ath/ath10k/Makefile
+++ b/drivers/net/wireless/ath/ath10k/Makefile
@@ -35,5 +35,8 @@ ath10k_sdio-y += sdio.o
 obj-$(CONFIG_ATH10K_USB) += ath10k_usb.o
 ath10k_usb-y += usb.o
 
+obj-$(CONFIG_ATH10K_SNOC) += ath10k_snoc.o
+ath10k_snoc-y += snoc.o
+
 # for tracing framework to find trace.h
 CFLAGS_trace.o := -I$(src)
diff --git a/drivers/net/wireless/ath/ath10k/snoc.c b/drivers/net/wireless/ath/ath10k/snoc.c
new file mode 100644
index 000000000000..30354a653b62
--- /dev/null
+++ b/drivers/net/wireless/ath/ath10k/snoc.c
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2018 The Linux Foundation. All rights reserved.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include "debug.h"
+#include "hif.h"
+#include "htc.h"
+#include "ce.h"
+#include "snoc.h"
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+
+static const struct ath10k_snoc_drv_priv drv_priv = {
+	.hw_rev = ATH10K_HW_WCN3990,
+	.dma_mask = DMA_BIT_MASK(37),
+};
+
+void ath10k_snoc_write32(struct ath10k *ar, u32 offset, u32 value)
+{
+	struct ath10k_snoc *ar_snoc = ath10k_snoc_priv(ar);
+
+	iowrite32(value, ar_snoc->mem + offset);
+}
+
+u32 ath10k_snoc_read32(struct ath10k *ar, u32 offset)
+{
+	struct ath10k_snoc *ar_snoc = ath10k_snoc_priv(ar);
+	u32 val;
+
+	val = ioread32(ar_snoc->mem + offset);
+
+	return val;
+}
+
+static const struct ath10k_hif_ops ath10k_snoc_hif_ops = {
+	.read32			= ath10k_snoc_read32,
+	.write32		= ath10k_snoc_write32,
+};
+
+static const struct ath10k_bus_ops ath10k_snoc_bus_ops = {
+	.read32		= ath10k_snoc_read32,
+	.write32	= ath10k_snoc_write32,
+};
+
+static const struct of_device_id ath10k_snoc_dt_match[] = {
+	{ .compatible = "qcom,wcn3990-wifi",
+	 .data = &drv_priv,
+	},
+	{ }
+};
+MODULE_DEVICE_TABLE(of, ath10k_snoc_dt_match);
+
+static int ath10k_snoc_probe(struct platform_device *pdev)
+{
+	const struct ath10k_snoc_drv_priv *drv_data;
+	const struct of_device_id *of_id;
+	struct ath10k_snoc *ar_snoc;
+	struct device *dev;
+	struct ath10k *ar;
+	int ret;
+
+	of_id = of_match_device(ath10k_snoc_dt_match, &pdev->dev);
+	if (!of_id) {
+		dev_err(&pdev->dev, "failed to find matching device tree id\n");
+		return -EINVAL;
+	}
+
+	drv_data = of_id->data;
+	dev = &pdev->dev;
+
+	ret = dma_set_mask_and_coherent(dev, drv_data->dma_mask);
+	if (ret) {
+		dev_err(dev, "failed to set dma mask: %d", ret);
+		return ret;
+	}
+
+	ar = ath10k_core_create(sizeof(*ar_snoc), dev, ATH10K_BUS_SNOC,
+				drv_data->hw_rev, &ath10k_snoc_hif_ops);
+	if (!ar) {
+		dev_err(dev, "failed to allocate core\n");
+		return -ENOMEM;
+	}
+
+	ar_snoc = ath10k_snoc_priv(ar);
+	ar_snoc->dev = pdev;
+	platform_set_drvdata(pdev, ar);
+	ar_snoc->ar = ar;
+	ar_snoc->ce.bus_ops = &ath10k_snoc_bus_ops;
+	ar->ce_priv = &ar_snoc->ce;
+
+	ath10k_dbg(ar, ATH10K_DBG_SNOC, "snoc probe\n");
+	ath10k_warn(ar, "Warning: SNOC support is still work-in-progress, it will not work properly!");
+
+	return ret;
+}
+
+static int ath10k_snoc_remove(struct platform_device *pdev)
+{
+	struct ath10k *ar = platform_get_drvdata(pdev);
+
+	ath10k_dbg(ar, ATH10K_DBG_SNOC, "snoc remove\n");
+	ath10k_core_destroy(ar);
+
+	return 0;
+}
+
+static struct platform_driver ath10k_snoc_driver = {
+		.probe  = ath10k_snoc_probe,
+		.remove = ath10k_snoc_remove,
+		.driver = {
+			.name   = "ath10k_snoc",
+			.owner = THIS_MODULE,
+			.of_match_table = ath10k_snoc_dt_match,
+		},
+};
+
+static int __init ath10k_snoc_init(void)
+{
+	int ret;
+
+	ret = platform_driver_register(&ath10k_snoc_driver);
+	if (ret)
+		pr_err("failed to register ath10k snoc driver: %d\n",
+		       ret);
+
+	return ret;
+}
+module_init(ath10k_snoc_init);
+
+static void __exit ath10k_snoc_exit(void)
+{
+	platform_driver_unregister(&ath10k_snoc_driver);
+}
+module_exit(ath10k_snoc_exit);
+
+MODULE_AUTHOR("Qualcomm");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("Driver support for Atheros WCN3990 SNOC devices");
diff --git a/drivers/net/wireless/ath/ath10k/snoc.h b/drivers/net/wireless/ath/ath10k/snoc.h
new file mode 100644
index 000000000000..cf65b01e0085
--- /dev/null
+++ b/drivers/net/wireless/ath/ath10k/snoc.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2018 The Linux Foundation. All rights reserved.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef _SNOC_H_
+#define _SNOC_H_
+
+#include "hw.h"
+#include "ce.h"
+#include "pci.h"
+
+struct ath10k_snoc_drv_priv {
+	enum ath10k_hw_rev hw_rev;
+	u64 dma_mask;
+};
+
+struct snoc_state {
+	u32 pipe_cfg_addr;
+	u32 svc_to_pipe_map;
+};
+
+struct ath10k_snoc_pipe {
+	struct ath10k_ce_pipe *ce_hdl;
+	u8 pipe_num;
+	struct ath10k *hif_ce_state;
+	size_t buf_sz;
+	/* protect ce info */
+	spinlock_t pipe_lock;
+	struct ath10k_snoc *ar_snoc;
+};
+
+struct ath10k_snoc_target_info {
+	u32 target_version;
+	u32 target_type;
+	u32 target_revision;
+	u32 soc_version;
+};
+
+struct ath10k_snoc_ce_irq {
+	u32 irq_line;
+};
+
+struct ath10k_snoc {
+	struct platform_device *dev;
+	struct ath10k *ar;
+	void __iomem *mem;
+	dma_addr_t mem_pa;
+	struct ath10k_snoc_target_info target_info;
+	size_t mem_len;
+	struct ath10k_snoc_pipe pipe_info[CE_COUNT_MAX];
+	struct ath10k_snoc_ce_irq ce_irqs[CE_COUNT_MAX];
+	struct ath10k_ce ce;
+	struct timer_list rx_post_retry;
+};
+
+static inline struct ath10k_snoc *ath10k_snoc_priv(struct ath10k *ar)
+{
+	return (struct ath10k_snoc *)ar->drv_priv;
+}
+
+void ath10k_snoc_write32(struct ath10k *ar, u32 offset, u32 value);
+u32 ath10k_snoc_read32(struct ath10k *ar, u32 offset);
+
+#endif /* _SNOC_H_ */
-- 
cgit v1.2.3-59-g8ed1b


From c963a683e70151dc458e9a85ed4b366b09f65e57 Mon Sep 17 00:00:00 2001
From: Govind Singh <govinds@codeaurora.org>
Date: Tue, 10 Apr 2018 18:01:15 +0300
Subject: ath10k: add resource init and deinit for WCN3990

Add methods for resource(memory, irq, HOST CE config)
initialization and de-initialization for WCN3990 target.

WCN3990 target uses 12 copy engine and following CE config.

[CE0] :host->target control and raw streams
[CE1] :target->host HTT
[CE2] :target->host WMI
[CE3] :host->target WMI
[CE4] :host->target HTT
[CE5] :reserved
[CE6] :Target autonomous HIF_memcpy
[CE7] :reserved
[CE8] :reserved
[CE9] :target->host HTT
[CE10] :target->host HTT
[CE11] :target -> host PKTLOG

Signed-off-by: Govind Singh <govinds@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/snoc.c | 271 +++++++++++++++++++++++++++++++++
 1 file changed, 271 insertions(+)

diff --git a/drivers/net/wireless/ath/ath10k/snoc.c b/drivers/net/wireless/ath/ath10k/snoc.c
index 30354a653b62..575355ce675f 100644
--- a/drivers/net/wireless/ath/ath10k/snoc.c
+++ b/drivers/net/wireless/ath/ath10k/snoc.c
@@ -24,12 +24,135 @@
 #include <linux/of.h>
 #include <linux/of_device.h>
 #include <linux/platform_device.h>
+#define  WCN3990_CE_ATTR_FLAGS 0
+
+static char *const ce_name[] = {
+	"WLAN_CE_0",
+	"WLAN_CE_1",
+	"WLAN_CE_2",
+	"WLAN_CE_3",
+	"WLAN_CE_4",
+	"WLAN_CE_5",
+	"WLAN_CE_6",
+	"WLAN_CE_7",
+	"WLAN_CE_8",
+	"WLAN_CE_9",
+	"WLAN_CE_10",
+	"WLAN_CE_11",
+};
 
 static const struct ath10k_snoc_drv_priv drv_priv = {
 	.hw_rev = ATH10K_HW_WCN3990,
 	.dma_mask = DMA_BIT_MASK(37),
 };
 
+static struct ce_attr host_ce_config_wlan[] = {
+	/* CE0: host->target HTC control streams */
+	{
+		.flags = WCN3990_CE_ATTR_FLAGS,
+		.src_nentries = 16,
+		.src_sz_max = 2048,
+		.dest_nentries = 0,
+		.send_cb = NULL,
+	},
+
+	/* CE1: target->host HTT + HTC control */
+	{
+		.flags = WCN3990_CE_ATTR_FLAGS,
+		.src_nentries = 0,
+		.src_sz_max = 2048,
+		.dest_nentries = 512,
+		.recv_cb = NULL,
+	},
+
+	/* CE2: target->host WMI */
+	{
+		.flags = WCN3990_CE_ATTR_FLAGS,
+		.src_nentries = 0,
+		.src_sz_max = 2048,
+		.dest_nentries = 64,
+		.recv_cb = NULL,
+	},
+
+	/* CE3: host->target WMI */
+	{
+		.flags = WCN3990_CE_ATTR_FLAGS,
+		.src_nentries = 32,
+		.src_sz_max = 2048,
+		.dest_nentries = 0,
+		.send_cb = NULL,
+	},
+
+	/* CE4: host->target HTT */
+	{
+		.flags = WCN3990_CE_ATTR_FLAGS | CE_ATTR_DIS_INTR,
+		.src_nentries = 256,
+		.src_sz_max = 256,
+		.dest_nentries = 0,
+		.send_cb = NULL,
+	},
+
+	/* CE5: target->host HTT (ipa_uc->target ) */
+	{
+		.flags = WCN3990_CE_ATTR_FLAGS,
+		.src_nentries = 0,
+		.src_sz_max = 512,
+		.dest_nentries = 512,
+		.recv_cb = NULL,
+	},
+
+	/* CE6: target autonomous hif_memcpy */
+	{
+		.flags = WCN3990_CE_ATTR_FLAGS,
+		.src_nentries = 0,
+		.src_sz_max = 0,
+		.dest_nentries = 0,
+	},
+
+	/* CE7: ce_diag, the Diagnostic Window */
+	{
+		.flags = WCN3990_CE_ATTR_FLAGS,
+		.src_nentries = 2,
+		.src_sz_max = 2048,
+		.dest_nentries = 2,
+	},
+
+	/* CE8: Target to uMC */
+	{
+		.flags = WCN3990_CE_ATTR_FLAGS,
+		.src_nentries = 0,
+		.src_sz_max = 2048,
+		.dest_nentries = 128,
+	},
+
+	/* CE9 target->host HTT */
+	{
+		.flags = WCN3990_CE_ATTR_FLAGS,
+		.src_nentries = 0,
+		.src_sz_max = 2048,
+		.dest_nentries = 512,
+		.recv_cb = NULL,
+	},
+
+	/* CE10: target->host HTT */
+	{
+		.flags = WCN3990_CE_ATTR_FLAGS,
+		.src_nentries = 0,
+		.src_sz_max = 2048,
+		.dest_nentries = 512,
+		.recv_cb = NULL,
+	},
+
+	/* CE11: target -> host PKTLOG */
+	{
+		.flags = WCN3990_CE_ATTR_FLAGS,
+		.src_nentries = 0,
+		.src_sz_max = 2048,
+		.dest_nentries = 512,
+		.recv_cb = NULL,
+	},
+};
+
 void ath10k_snoc_write32(struct ath10k *ar, u32 offset, u32 value)
 {
 	struct ath10k_snoc *ar_snoc = ath10k_snoc_priv(ar);
@@ -57,6 +180,119 @@ static const struct ath10k_bus_ops ath10k_snoc_bus_ops = {
 	.write32	= ath10k_snoc_write32,
 };
 
+static irqreturn_t ath10k_snoc_per_engine_handler(int irq, void *arg)
+{
+	return IRQ_HANDLED;
+}
+
+static int ath10k_snoc_request_irq(struct ath10k *ar)
+{
+	struct ath10k_snoc *ar_snoc = ath10k_snoc_priv(ar);
+	int irqflags = IRQF_TRIGGER_RISING;
+	int ret, id;
+
+	for (id = 0; id < CE_COUNT_MAX; id++) {
+		ret = request_irq(ar_snoc->ce_irqs[id].irq_line,
+				  ath10k_snoc_per_engine_handler,
+				  irqflags, ce_name[id], ar);
+		if (ret) {
+			ath10k_err(ar,
+				   "failed to register IRQ handler for CE %d: %d",
+				   id, ret);
+			goto err_irq;
+		}
+	}
+
+	return 0;
+
+err_irq:
+	for (id -= 1; id >= 0; id--)
+		free_irq(ar_snoc->ce_irqs[id].irq_line, ar);
+
+	return ret;
+}
+
+static void ath10k_snoc_free_irq(struct ath10k *ar)
+{
+	struct ath10k_snoc *ar_snoc = ath10k_snoc_priv(ar);
+	int id;
+
+	for (id = 0; id < CE_COUNT_MAX; id++)
+		free_irq(ar_snoc->ce_irqs[id].irq_line, ar);
+}
+
+static int ath10k_snoc_resource_init(struct ath10k *ar)
+{
+	struct ath10k_snoc *ar_snoc = ath10k_snoc_priv(ar);
+	struct platform_device *pdev;
+	struct resource *res;
+	int i, ret = 0;
+
+	pdev = ar_snoc->dev;
+	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "membase");
+	if (!res) {
+		ath10k_err(ar, "Memory base not found in DT\n");
+		return -EINVAL;
+	}
+
+	ar_snoc->mem_pa = res->start;
+	ar_snoc->mem = devm_ioremap(&pdev->dev, ar_snoc->mem_pa,
+				    resource_size(res));
+	if (!ar_snoc->mem) {
+		ath10k_err(ar, "Memory base ioremap failed with physical address %pa\n",
+			   &ar_snoc->mem_pa);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < CE_COUNT; i++) {
+		res = platform_get_resource(ar_snoc->dev, IORESOURCE_IRQ, i);
+		if (!res) {
+			ath10k_err(ar, "failed to get IRQ%d\n", i);
+			ret = -ENODEV;
+			goto out;
+		}
+		ar_snoc->ce_irqs[i].irq_line = res->start;
+	}
+
+out:
+	return ret;
+}
+
+static int ath10k_snoc_setup_resource(struct ath10k *ar)
+{
+	struct ath10k_snoc *ar_snoc = ath10k_snoc_priv(ar);
+	struct ath10k_ce *ce = ath10k_ce_priv(ar);
+	struct ath10k_snoc_pipe *pipe;
+	int i, ret;
+
+	spin_lock_init(&ce->ce_lock);
+	for (i = 0; i < CE_COUNT; i++) {
+		pipe = &ar_snoc->pipe_info[i];
+		pipe->ce_hdl = &ce->ce_states[i];
+		pipe->pipe_num = i;
+		pipe->hif_ce_state = ar;
+
+		ret = ath10k_ce_alloc_pipe(ar, i, &host_ce_config_wlan[i]);
+		if (ret) {
+			ath10k_err(ar, "failed to allocate copy engine pipe %d: %d\n",
+				   i, ret);
+			return ret;
+		}
+
+		pipe->buf_sz = host_ce_config_wlan[i].src_sz_max;
+	}
+
+	return 0;
+}
+
+static void ath10k_snoc_release_resource(struct ath10k *ar)
+{
+	int i;
+
+	for (i = 0; i < CE_COUNT; i++)
+		ath10k_ce_free_pipe(ar, i);
+}
+
 static const struct of_device_id ath10k_snoc_dt_match[] = {
 	{ .compatible = "qcom,wcn3990-wifi",
 	 .data = &drv_priv,
@@ -103,9 +339,41 @@ static int ath10k_snoc_probe(struct platform_device *pdev)
 	ar_snoc->ce.bus_ops = &ath10k_snoc_bus_ops;
 	ar->ce_priv = &ar_snoc->ce;
 
+	ath10k_snoc_resource_init(ar);
+	if (ret) {
+		ath10k_warn(ar, "failed to initialize resource: %d\n", ret);
+		goto err_core_destroy;
+	}
+
+	ath10k_snoc_setup_resource(ar);
+	if (ret) {
+		ath10k_warn(ar, "failed to setup resource: %d\n", ret);
+		goto err_core_destroy;
+	}
+	ret = ath10k_snoc_request_irq(ar);
+	if (ret) {
+		ath10k_warn(ar, "failed to request irqs: %d\n", ret);
+		goto err_release_resource;
+	}
+	ret = ath10k_core_register(ar, drv_data->hw_rev);
+	if (ret) {
+		ath10k_err(ar, "failed to register driver core: %d\n", ret);
+		goto err_free_irq;
+	}
 	ath10k_dbg(ar, ATH10K_DBG_SNOC, "snoc probe\n");
 	ath10k_warn(ar, "Warning: SNOC support is still work-in-progress, it will not work properly!");
 
+	return 0;
+
+err_free_irq:
+	ath10k_snoc_free_irq(ar);
+
+err_release_resource:
+	ath10k_snoc_release_resource(ar);
+
+err_core_destroy:
+	ath10k_core_destroy(ar);
+
 	return ret;
 }
 
@@ -114,6 +382,9 @@ static int ath10k_snoc_remove(struct platform_device *pdev)
 	struct ath10k *ar = platform_get_drvdata(pdev);
 
 	ath10k_dbg(ar, ATH10K_DBG_SNOC, "snoc remove\n");
+	ath10k_core_unregister(ar);
+	ath10k_snoc_free_irq(ar);
+	ath10k_snoc_release_resource(ar);
 	ath10k_core_destroy(ar);
 
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From a6e149a9ff03e00b0a4293c0cc70b37db48fdf80 Mon Sep 17 00:00:00 2001
From: Govind Singh <govinds@codeaurora.org>
Date: Tue, 10 Apr 2018 18:01:17 +0300
Subject: ath10k: add hif start/stop methods for wcn3990 snoc layer

Add hif start/stop callback for allocating/freeing buffers
on tx/rx pipe and enabling/disabling the tx/rx pipe
interrupts.

Signed-off-by: Govind Singh <govinds@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/snoc.c | 189 ++++++++++++++++++++++++++++++++-
 1 file changed, 187 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/snoc.c b/drivers/net/wireless/ath/ath10k/snoc.c
index 575355ce675f..dcd8bb7b71b4 100644
--- a/drivers/net/wireless/ath/ath10k/snoc.c
+++ b/drivers/net/wireless/ath/ath10k/snoc.c
@@ -25,6 +25,7 @@
 #include <linux/of_device.h>
 #include <linux/platform_device.h>
 #define  WCN3990_CE_ATTR_FLAGS 0
+#define ATH10K_SNOC_RX_POST_RETRY_MS 50
 
 static char *const ce_name[] = {
 	"WLAN_CE_0",
@@ -170,9 +171,193 @@ u32 ath10k_snoc_read32(struct ath10k *ar, u32 offset)
 	return val;
 }
 
+static int __ath10k_snoc_rx_post_buf(struct ath10k_snoc_pipe *pipe)
+{
+	struct ath10k_ce_pipe *ce_pipe = pipe->ce_hdl;
+	struct ath10k *ar = pipe->hif_ce_state;
+	struct ath10k_ce *ce = ath10k_ce_priv(ar);
+	struct sk_buff *skb;
+	dma_addr_t paddr;
+	int ret;
+
+	skb = dev_alloc_skb(pipe->buf_sz);
+	if (!skb)
+		return -ENOMEM;
+
+	WARN_ONCE((unsigned long)skb->data & 3, "unaligned skb");
+
+	paddr = dma_map_single(ar->dev, skb->data,
+			       skb->len + skb_tailroom(skb),
+			       DMA_FROM_DEVICE);
+	if (unlikely(dma_mapping_error(ar->dev, paddr))) {
+		ath10k_warn(ar, "failed to dma map snoc rx buf\n");
+		dev_kfree_skb_any(skb);
+		return -EIO;
+	}
+
+	ATH10K_SKB_RXCB(skb)->paddr = paddr;
+
+	spin_lock_bh(&ce->ce_lock);
+	ret = ce_pipe->ops->ce_rx_post_buf(ce_pipe, skb, paddr);
+	spin_unlock_bh(&ce->ce_lock);
+	if (ret) {
+		dma_unmap_single(ar->dev, paddr, skb->len + skb_tailroom(skb),
+				 DMA_FROM_DEVICE);
+		dev_kfree_skb_any(skb);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void ath10k_snoc_rx_post_pipe(struct ath10k_snoc_pipe *pipe)
+{
+	struct ath10k *ar = pipe->hif_ce_state;
+	struct ath10k_ce *ce = ath10k_ce_priv(ar);
+	struct ath10k_snoc *ar_snoc = ath10k_snoc_priv(ar);
+	struct ath10k_ce_pipe *ce_pipe = pipe->ce_hdl;
+	int ret, num;
+
+	if (pipe->buf_sz == 0)
+		return;
+
+	if (!ce_pipe->dest_ring)
+		return;
+
+	spin_lock_bh(&ce->ce_lock);
+	num = __ath10k_ce_rx_num_free_bufs(ce_pipe);
+	spin_unlock_bh(&ce->ce_lock);
+	while (num--) {
+		ret = __ath10k_snoc_rx_post_buf(pipe);
+		if (ret) {
+			if (ret == -ENOSPC)
+				break;
+			ath10k_warn(ar, "failed to post rx buf: %d\n", ret);
+			mod_timer(&ar_snoc->rx_post_retry, jiffies +
+				  ATH10K_SNOC_RX_POST_RETRY_MS);
+			break;
+		}
+	}
+}
+
+static void ath10k_snoc_rx_post(struct ath10k *ar)
+{
+	struct ath10k_snoc *ar_snoc = ath10k_snoc_priv(ar);
+	int i;
+
+	for (i = 0; i < CE_COUNT; i++)
+		ath10k_snoc_rx_post_pipe(&ar_snoc->pipe_info[i]);
+}
+
+static inline void ath10k_snoc_irq_disable(struct ath10k *ar)
+{
+	ath10k_ce_disable_interrupts(ar);
+}
+
+static inline void ath10k_snoc_irq_enable(struct ath10k *ar)
+{
+	ath10k_ce_enable_interrupts(ar);
+}
+
+static void ath10k_snoc_rx_pipe_cleanup(struct ath10k_snoc_pipe *snoc_pipe)
+{
+	struct ath10k_ce_pipe *ce_pipe;
+	struct ath10k_ce_ring *ce_ring;
+	struct sk_buff *skb;
+	struct ath10k *ar;
+	int i;
+
+	ar = snoc_pipe->hif_ce_state;
+	ce_pipe = snoc_pipe->ce_hdl;
+	ce_ring = ce_pipe->dest_ring;
+
+	if (!ce_ring)
+		return;
+
+	if (!snoc_pipe->buf_sz)
+		return;
+
+	for (i = 0; i < ce_ring->nentries; i++) {
+		skb = ce_ring->per_transfer_context[i];
+		if (!skb)
+			continue;
+
+		ce_ring->per_transfer_context[i] = NULL;
+
+		dma_unmap_single(ar->dev, ATH10K_SKB_RXCB(skb)->paddr,
+				 skb->len + skb_tailroom(skb),
+				 DMA_FROM_DEVICE);
+		dev_kfree_skb_any(skb);
+	}
+}
+
+static void ath10k_snoc_tx_pipe_cleanup(struct ath10k_snoc_pipe *snoc_pipe)
+{
+	struct ath10k_ce_pipe *ce_pipe;
+	struct ath10k_ce_ring *ce_ring;
+	struct ath10k_snoc *ar_snoc;
+	struct sk_buff *skb;
+	struct ath10k *ar;
+	int i;
+
+	ar = snoc_pipe->hif_ce_state;
+	ar_snoc = ath10k_snoc_priv(ar);
+	ce_pipe = snoc_pipe->ce_hdl;
+	ce_ring = ce_pipe->src_ring;
+
+	if (!ce_ring)
+		return;
+
+	if (!snoc_pipe->buf_sz)
+		return;
+
+	for (i = 0; i < ce_ring->nentries; i++) {
+		skb = ce_ring->per_transfer_context[i];
+		if (!skb)
+			continue;
+
+		ce_ring->per_transfer_context[i] = NULL;
+
+		ath10k_htc_tx_completion_handler(ar, skb);
+	}
+}
+
+static void ath10k_snoc_buffer_cleanup(struct ath10k *ar)
+{
+	struct ath10k_snoc *ar_snoc = ath10k_snoc_priv(ar);
+	struct ath10k_snoc_pipe *pipe_info;
+	int pipe_num;
+
+	del_timer_sync(&ar_snoc->rx_post_retry);
+	for (pipe_num = 0; pipe_num < CE_COUNT; pipe_num++) {
+		pipe_info = &ar_snoc->pipe_info[pipe_num];
+		ath10k_snoc_rx_pipe_cleanup(pipe_info);
+		ath10k_snoc_tx_pipe_cleanup(pipe_info);
+	}
+}
+
+static void ath10k_snoc_hif_stop(struct ath10k *ar)
+{
+	ath10k_snoc_irq_disable(ar);
+	ath10k_snoc_buffer_cleanup(ar);
+	ath10k_dbg(ar, ATH10K_DBG_BOOT, "boot hif stop\n");
+}
+
+static int ath10k_snoc_hif_start(struct ath10k *ar)
+{
+	ath10k_snoc_irq_enable(ar);
+	ath10k_snoc_rx_post(ar);
+
+	ath10k_dbg(ar, ATH10K_DBG_BOOT, "boot hif start\n");
+
+	return 0;
+}
+
 static const struct ath10k_hif_ops ath10k_snoc_hif_ops = {
-	.read32			= ath10k_snoc_read32,
-	.write32		= ath10k_snoc_write32,
+	.read32		= ath10k_snoc_read32,
+	.write32	= ath10k_snoc_write32,
+	.start		= ath10k_snoc_hif_start,
+	.stop		= ath10k_snoc_hif_stop,
 };
 
 static const struct ath10k_bus_ops ath10k_snoc_bus_ops = {
-- 
cgit v1.2.3-59-g8ed1b


From b8c27e86211832b06667f11be3c24acc78828ee5 Mon Sep 17 00:00:00 2001
From: Govind Singh <govinds@codeaurora.org>
Date: Tue, 10 Apr 2018 18:01:18 +0300
Subject: ath10k: add HTC services for WCN3990

WCN3990 target uses 3 Copy engine(CE1/CE9/CE10) in RX path
and CE 11 for pktlog.
Add data path HTC ep services and PKTLOG services for WCN3990.

Signed-off-by: Govind Singh <govinds@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/htc.c | 6 ++++++
 drivers/net/wireless/ath/ath10k/htc.h | 4 ++++
 2 files changed, 10 insertions(+)

diff --git a/drivers/net/wireless/ath/ath10k/htc.c b/drivers/net/wireless/ath/ath10k/htc.c
index 492dc5b4bbf2..8902720b4e49 100644
--- a/drivers/net/wireless/ath/ath10k/htc.c
+++ b/drivers/net/wireless/ath/ath10k/htc.c
@@ -542,8 +542,14 @@ static const char *htc_service_name(enum ath10k_htc_svc_id id)
 		return "NMI Data";
 	case ATH10K_HTC_SVC_ID_HTT_DATA_MSG:
 		return "HTT Data";
+	case ATH10K_HTC_SVC_ID_HTT_DATA2_MSG:
+		return "HTT Data";
+	case ATH10K_HTC_SVC_ID_HTT_DATA3_MSG:
+		return "HTT Data";
 	case ATH10K_HTC_SVC_ID_TEST_RAW_STREAMS:
 		return "RAW";
+	case ATH10K_HTC_SVC_ID_HTT_LOG_MSG:
+		return "PKTLOG";
 	}
 
 	return "Unknown";
diff --git a/drivers/net/wireless/ath/ath10k/htc.h b/drivers/net/wireless/ath/ath10k/htc.h
index a2f8814b3e53..34877597dd6a 100644
--- a/drivers/net/wireless/ath/ath10k/htc.h
+++ b/drivers/net/wireless/ath/ath10k/htc.h
@@ -248,6 +248,7 @@ enum ath10k_htc_svc_gid {
 	ATH10K_HTC_SVC_GRP_WMI = 1,
 	ATH10K_HTC_SVC_GRP_NMI = 2,
 	ATH10K_HTC_SVC_GRP_HTT = 3,
+	ATH10K_LOG_SERVICE_GROUP = 6,
 
 	ATH10K_HTC_SVC_GRP_TEST = 254,
 	ATH10K_HTC_SVC_GRP_LAST = 255,
@@ -273,6 +274,9 @@ enum ath10k_htc_svc_id {
 
 	ATH10K_HTC_SVC_ID_HTT_DATA_MSG	= SVC(ATH10K_HTC_SVC_GRP_HTT, 0),
 
+	ATH10K_HTC_SVC_ID_HTT_DATA2_MSG = SVC(ATH10K_HTC_SVC_GRP_HTT, 1),
+	ATH10K_HTC_SVC_ID_HTT_DATA3_MSG = SVC(ATH10K_HTC_SVC_GRP_HTT, 2),
+	ATH10K_HTC_SVC_ID_HTT_LOG_MSG = SVC(ATH10K_LOG_SERVICE_GROUP, 0),
 	/* raw stream service (i.e. flash, tcmd, calibration apps) */
 	ATH10K_HTC_SVC_ID_TEST_RAW_STREAMS = SVC(ATH10K_HTC_SVC_GRP_TEST, 0),
 };
-- 
cgit v1.2.3-59-g8ed1b


From 97e19cce05e50055dafd1df71bdcbcdc3a7894c5 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 19 Apr 2018 16:17:12 +0200
Subject: bpf: reserve xdp_frame size in xdp headroom

Commit 6dfb970d3dbd ("xdp: avoid leaking info stored in frame data on
page reuse") tried to allow user/bpf_prog to (re)use area used by
xdp_frame (stored in frame headroom), by memset clearing area when
bpf_xdp_adjust_head give bpf_prog access to headroom area.

The mentioned commit had two bugs. (1) Didn't take bpf_xdp_adjust_meta
into account. (2) a combination of bpf_xdp_adjust_head calls, where
xdp->data is moved into xdp_frame section, can cause clearing
xdp_frame area again for area previously granted to bpf_prog.

After discussions with Daniel, we choose to implement a simpler
solution to the problem, which is to reserve the headroom used by
xdp_frame info.

This also avoids the situation where bpf_prog is allowed to adjust/add
headers, and then XDP_REDIRECT later drops the packet due to lack of
headroom for the xdp_frame.  This would likely confuse the end-user.

Fixes: 6dfb970d3dbd ("xdp: avoid leaking info stored in frame data on page reuse")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 29318598fd60..e25bc4a3aa1a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2694,20 +2694,13 @@ BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
 {
 	void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
 	unsigned long metalen = xdp_get_metalen(xdp);
-	void *data_start = xdp->data_hard_start + metalen;
+	void *data_start = xdp_frame_end + metalen;
 	void *data = xdp->data + offset;
 
 	if (unlikely(data < data_start ||
 		     data > xdp->data_end - ETH_HLEN))
 		return -EINVAL;
 
-	/* Avoid info leak, when reusing area prev used by xdp_frame */
-	if (data < xdp_frame_end) {
-		unsigned long clearlen = xdp_frame_end - data;
-
-		memset(data, 0, clearlen);
-	}
-
 	if (metalen)
 		memmove(xdp->data_meta + offset,
 			xdp->data_meta, metalen);
@@ -2751,12 +2744,13 @@ static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = {
 
 BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
 {
+	void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
 	void *meta = xdp->data_meta + offset;
 	unsigned long metalen = xdp->data - meta;
 
 	if (xdp_data_meta_unsupported(xdp))
 		return -ENOTSUPP;
-	if (unlikely(meta < xdp->data_hard_start ||
+	if (unlikely(meta < xdp_frame_end ||
 		     meta > xdp->data))
 		return -EINVAL;
 	if (unlikely((metalen & (sizeof(__u32) - 1)) ||
-- 
cgit v1.2.3-59-g8ed1b


From 84efe7f6ebc56dbeb18c3448a487f2265c647d91 Mon Sep 17 00:00:00 2001
From: Govind Singh <govinds@codeaurora.org>
Date: Tue, 10 Apr 2018 18:01:20 +0300
Subject: ath10k: map HTC services to tx/rx pipes for wcn3990

Add mapping of HTC endpoint services supported
by wcn3990 target to tx/rx pipe.

Signed-off-by: Govind Singh <govinds@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/snoc.c | 168 +++++++++++++++++++++++++++++++++
 1 file changed, 168 insertions(+)

diff --git a/drivers/net/wireless/ath/ath10k/snoc.c b/drivers/net/wireless/ath/ath10k/snoc.c
index dcd8bb7b71b4..a39eee7ed56c 100644
--- a/drivers/net/wireless/ath/ath10k/snoc.c
+++ b/drivers/net/wireless/ath/ath10k/snoc.c
@@ -154,6 +154,116 @@ static struct ce_attr host_ce_config_wlan[] = {
 	},
 };
 
+static struct service_to_pipe target_service_to_ce_map_wlan[] = {
+	{
+		__cpu_to_le32(ATH10K_HTC_SVC_ID_WMI_DATA_VO),
+		__cpu_to_le32(PIPEDIR_OUT),	/* out = UL = host -> target */
+		__cpu_to_le32(3),
+	},
+	{
+		__cpu_to_le32(ATH10K_HTC_SVC_ID_WMI_DATA_VO),
+		__cpu_to_le32(PIPEDIR_IN),	/* in = DL = target -> host */
+		__cpu_to_le32(2),
+	},
+	{
+		__cpu_to_le32(ATH10K_HTC_SVC_ID_WMI_DATA_BK),
+		__cpu_to_le32(PIPEDIR_OUT),	/* out = UL = host -> target */
+		__cpu_to_le32(3),
+	},
+	{
+		__cpu_to_le32(ATH10K_HTC_SVC_ID_WMI_DATA_BK),
+		__cpu_to_le32(PIPEDIR_IN),	/* in = DL = target -> host */
+		__cpu_to_le32(2),
+	},
+	{
+		__cpu_to_le32(ATH10K_HTC_SVC_ID_WMI_DATA_BE),
+		__cpu_to_le32(PIPEDIR_OUT),	/* out = UL = host -> target */
+		__cpu_to_le32(3),
+	},
+	{
+		__cpu_to_le32(ATH10K_HTC_SVC_ID_WMI_DATA_BE),
+		__cpu_to_le32(PIPEDIR_IN),	/* in = DL = target -> host */
+		__cpu_to_le32(2),
+	},
+	{
+		__cpu_to_le32(ATH10K_HTC_SVC_ID_WMI_DATA_VI),
+		__cpu_to_le32(PIPEDIR_OUT),	/* out = UL = host -> target */
+		__cpu_to_le32(3),
+	},
+	{
+		__cpu_to_le32(ATH10K_HTC_SVC_ID_WMI_DATA_VI),
+		__cpu_to_le32(PIPEDIR_IN),	/* in = DL = target -> host */
+		__cpu_to_le32(2),
+	},
+	{
+		__cpu_to_le32(ATH10K_HTC_SVC_ID_WMI_CONTROL),
+		__cpu_to_le32(PIPEDIR_OUT),	/* out = UL = host -> target */
+		__cpu_to_le32(3),
+	},
+	{
+		__cpu_to_le32(ATH10K_HTC_SVC_ID_WMI_CONTROL),
+		__cpu_to_le32(PIPEDIR_IN),	/* in = DL = target -> host */
+		__cpu_to_le32(2),
+	},
+	{
+		__cpu_to_le32(ATH10K_HTC_SVC_ID_RSVD_CTRL),
+		__cpu_to_le32(PIPEDIR_OUT),	/* out = UL = host -> target */
+		__cpu_to_le32(0),
+	},
+	{
+		__cpu_to_le32(ATH10K_HTC_SVC_ID_RSVD_CTRL),
+		__cpu_to_le32(PIPEDIR_IN),	/* in = DL = target -> host */
+		__cpu_to_le32(2),
+	},
+	{ /* not used */
+		__cpu_to_le32(ATH10K_HTC_SVC_ID_TEST_RAW_STREAMS),
+		__cpu_to_le32(PIPEDIR_OUT),	/* out = UL = host -> target */
+		__cpu_to_le32(0),
+	},
+	{ /* not used */
+		__cpu_to_le32(ATH10K_HTC_SVC_ID_TEST_RAW_STREAMS),
+		__cpu_to_le32(PIPEDIR_IN),	/* in = DL = target -> host */
+		__cpu_to_le32(2),
+	},
+	{
+		__cpu_to_le32(ATH10K_HTC_SVC_ID_HTT_DATA_MSG),
+		__cpu_to_le32(PIPEDIR_OUT),	/* out = UL = host -> target */
+		__cpu_to_le32(4),
+	},
+	{
+		__cpu_to_le32(ATH10K_HTC_SVC_ID_HTT_DATA_MSG),
+		__cpu_to_le32(PIPEDIR_IN),	/* in = DL = target -> host */
+		__cpu_to_le32(1),
+	},
+	{ /* not used */
+		__cpu_to_le32(ATH10K_HTC_SVC_ID_TEST_RAW_STREAMS),
+		__cpu_to_le32(PIPEDIR_OUT),
+		__cpu_to_le32(5),
+	},
+	{ /* in = DL = target -> host */
+		__cpu_to_le32(ATH10K_HTC_SVC_ID_HTT_DATA2_MSG),
+		__cpu_to_le32(PIPEDIR_IN),	/* in = DL = target -> host */
+		__cpu_to_le32(9),
+	},
+	{ /* in = DL = target -> host */
+		__cpu_to_le32(ATH10K_HTC_SVC_ID_HTT_DATA3_MSG),
+		__cpu_to_le32(PIPEDIR_IN),	/* in = DL = target -> host */
+		__cpu_to_le32(10),
+	},
+	{ /* in = DL = target -> host pktlog */
+		__cpu_to_le32(ATH10K_HTC_SVC_ID_HTT_LOG_MSG),
+		__cpu_to_le32(PIPEDIR_IN),	/* in = DL = target -> host */
+		__cpu_to_le32(11),
+	},
+	/* (Additions here) */
+
+	{ /* must be last */
+		__cpu_to_le32(0),
+		__cpu_to_le32(0),
+		__cpu_to_le32(0),
+	},
+};
+
 void ath10k_snoc_write32(struct ath10k *ar, u32 offset, u32 value)
 {
 	struct ath10k_snoc *ar_snoc = ath10k_snoc_priv(ar);
@@ -249,6 +359,62 @@ static void ath10k_snoc_rx_post(struct ath10k *ar)
 		ath10k_snoc_rx_post_pipe(&ar_snoc->pipe_info[i]);
 }
 
+static int ath10k_snoc_hif_map_service_to_pipe(struct ath10k *ar,
+					       u16 service_id,
+					       u8 *ul_pipe, u8 *dl_pipe)
+{
+	const struct service_to_pipe *entry;
+	bool ul_set = false, dl_set = false;
+	int i;
+
+	ath10k_dbg(ar, ATH10K_DBG_SNOC, "snoc hif map service\n");
+
+	for (i = 0; i < ARRAY_SIZE(target_service_to_ce_map_wlan); i++) {
+		entry = &target_service_to_ce_map_wlan[i];
+
+		if (__le32_to_cpu(entry->service_id) != service_id)
+			continue;
+
+		switch (__le32_to_cpu(entry->pipedir)) {
+		case PIPEDIR_NONE:
+			break;
+		case PIPEDIR_IN:
+			WARN_ON(dl_set);
+			*dl_pipe = __le32_to_cpu(entry->pipenum);
+			dl_set = true;
+			break;
+		case PIPEDIR_OUT:
+			WARN_ON(ul_set);
+			*ul_pipe = __le32_to_cpu(entry->pipenum);
+			ul_set = true;
+			break;
+		case PIPEDIR_INOUT:
+			WARN_ON(dl_set);
+			WARN_ON(ul_set);
+			*dl_pipe = __le32_to_cpu(entry->pipenum);
+			*ul_pipe = __le32_to_cpu(entry->pipenum);
+			dl_set = true;
+			ul_set = true;
+			break;
+		}
+	}
+
+	if (WARN_ON(!ul_set || !dl_set))
+		return -ENOENT;
+
+	return 0;
+}
+
+static void ath10k_snoc_hif_get_default_pipe(struct ath10k *ar,
+					     u8 *ul_pipe, u8 *dl_pipe)
+{
+	ath10k_dbg(ar, ATH10K_DBG_SNOC, "snoc hif get default pipe\n");
+
+	(void)ath10k_snoc_hif_map_service_to_pipe(ar,
+						 ATH10K_HTC_SVC_ID_RSVD_CTRL,
+						 ul_pipe, dl_pipe);
+}
+
 static inline void ath10k_snoc_irq_disable(struct ath10k *ar)
 {
 	ath10k_ce_disable_interrupts(ar);
@@ -358,6 +524,8 @@ static const struct ath10k_hif_ops ath10k_snoc_hif_ops = {
 	.write32	= ath10k_snoc_write32,
 	.start		= ath10k_snoc_hif_start,
 	.stop		= ath10k_snoc_hif_stop,
+	.map_service_to_pipe	= ath10k_snoc_hif_map_service_to_pipe,
+	.get_default_pipe	= ath10k_snoc_hif_get_default_pipe,
 };
 
 static const struct ath10k_bus_ops ath10k_snoc_bus_ops = {
-- 
cgit v1.2.3-59-g8ed1b


From 0fa4fbe9b8cf7656813382602a6a5d330f01d3ae Mon Sep 17 00:00:00 2001
From: Govind Singh <govinds@codeaurora.org>
Date: Tue, 10 Apr 2018 18:01:22 +0300
Subject: ath10k: add hif power-up/power-down methods

Add hif power-up/power-down methods for wcn3990
target.

Signed-off-by: Govind Singh <govinds@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/snoc.c | 61 ++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/drivers/net/wireless/ath/ath10k/snoc.c b/drivers/net/wireless/ath/ath10k/snoc.c
index a39eee7ed56c..9d402e4afaf9 100644
--- a/drivers/net/wireless/ath/ath10k/snoc.c
+++ b/drivers/net/wireless/ath/ath10k/snoc.c
@@ -519,6 +519,65 @@ static int ath10k_snoc_hif_start(struct ath10k *ar)
 	return 0;
 }
 
+static int ath10k_snoc_init_pipes(struct ath10k *ar)
+{
+	int i, ret;
+
+	for (i = 0; i < CE_COUNT; i++) {
+		ret = ath10k_ce_init_pipe(ar, i, &host_ce_config_wlan[i]);
+		if (ret) {
+			ath10k_err(ar, "failed to initialize copy engine pipe %d: %d\n",
+				   i, ret);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static int ath10k_snoc_wlan_enable(struct ath10k *ar)
+{
+	return 0;
+}
+
+static void ath10k_snoc_wlan_disable(struct ath10k *ar)
+{
+}
+
+static void ath10k_snoc_hif_power_down(struct ath10k *ar)
+{
+	ath10k_dbg(ar, ATH10K_DBG_BOOT, "boot hif power down\n");
+
+	ath10k_snoc_wlan_disable(ar);
+}
+
+static int ath10k_snoc_hif_power_up(struct ath10k *ar)
+{
+	int ret;
+
+	ath10k_dbg(ar, ATH10K_DBG_SNOC, "%s:WCN3990 driver state = %d\n",
+		   __func__, ar->state);
+
+	ret = ath10k_snoc_wlan_enable(ar);
+	if (ret) {
+		ath10k_err(ar, "failed to enable wcn3990: %d\n", ret);
+		return ret;
+	}
+
+	ret = ath10k_snoc_init_pipes(ar);
+	if (ret) {
+		ath10k_err(ar, "failed to initialize CE: %d\n", ret);
+		goto err_wlan_enable;
+	}
+
+	return 0;
+
+err_wlan_enable:
+	ath10k_snoc_wlan_disable(ar);
+
+	return ret;
+}
+
 static const struct ath10k_hif_ops ath10k_snoc_hif_ops = {
 	.read32		= ath10k_snoc_read32,
 	.write32	= ath10k_snoc_write32,
@@ -526,6 +585,8 @@ static const struct ath10k_hif_ops ath10k_snoc_hif_ops = {
 	.stop		= ath10k_snoc_hif_stop,
 	.map_service_to_pipe	= ath10k_snoc_hif_map_service_to_pipe,
 	.get_default_pipe	= ath10k_snoc_hif_get_default_pipe,
+	.power_up		= ath10k_snoc_hif_power_up,
+	.power_down		= ath10k_snoc_hif_power_down,
 };
 
 static const struct ath10k_bus_ops ath10k_snoc_bus_ops = {
-- 
cgit v1.2.3-59-g8ed1b


From d390509bdf501c9c8c6e61248e4bc9314c86d854 Mon Sep 17 00:00:00 2001
From: Govind Singh <govinds@codeaurora.org>
Date: Tue, 10 Apr 2018 18:01:24 +0300
Subject: ath10k: add hif tx methods for wcn3990

Add hif tx/tx-complete methods for wcn3990
target.

Signed-off-by: Govind Singh <govinds@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/snoc.c | 123 ++++++++++++++++++++++++++++++++-
 1 file changed, 120 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/snoc.c b/drivers/net/wireless/ath/ath10k/snoc.c
index 9d402e4afaf9..899f0a327050 100644
--- a/drivers/net/wireless/ath/ath10k/snoc.c
+++ b/drivers/net/wireless/ath/ath10k/snoc.c
@@ -42,6 +42,9 @@ static char *const ce_name[] = {
 	"WLAN_CE_11",
 };
 
+static void ath10k_snoc_htc_tx_cb(struct ath10k_ce_pipe *ce_state);
+static void ath10k_snoc_htt_tx_cb(struct ath10k_ce_pipe *ce_state);
+
 static const struct ath10k_snoc_drv_priv drv_priv = {
 	.hw_rev = ATH10K_HW_WCN3990,
 	.dma_mask = DMA_BIT_MASK(37),
@@ -54,7 +57,7 @@ static struct ce_attr host_ce_config_wlan[] = {
 		.src_nentries = 16,
 		.src_sz_max = 2048,
 		.dest_nentries = 0,
-		.send_cb = NULL,
+		.send_cb = ath10k_snoc_htc_tx_cb,
 	},
 
 	/* CE1: target->host HTT + HTC control */
@@ -81,7 +84,7 @@ static struct ce_attr host_ce_config_wlan[] = {
 		.src_nentries = 32,
 		.src_sz_max = 2048,
 		.dest_nentries = 0,
-		.send_cb = NULL,
+		.send_cb = ath10k_snoc_htc_tx_cb,
 	},
 
 	/* CE4: host->target HTT */
@@ -90,7 +93,7 @@ static struct ce_attr host_ce_config_wlan[] = {
 		.src_nentries = 256,
 		.src_sz_max = 256,
 		.dest_nentries = 0,
-		.send_cb = NULL,
+		.send_cb = ath10k_snoc_htt_tx_cb,
 	},
 
 	/* CE5: target->host HTT (ipa_uc->target ) */
@@ -359,6 +362,117 @@ static void ath10k_snoc_rx_post(struct ath10k *ar)
 		ath10k_snoc_rx_post_pipe(&ar_snoc->pipe_info[i]);
 }
 
+static void ath10k_snoc_htc_tx_cb(struct ath10k_ce_pipe *ce_state)
+{
+	struct ath10k *ar = ce_state->ar;
+	struct sk_buff_head list;
+	struct sk_buff *skb;
+
+	__skb_queue_head_init(&list);
+	while (ath10k_ce_completed_send_next(ce_state, (void **)&skb) == 0) {
+		if (!skb)
+			continue;
+
+		__skb_queue_tail(&list, skb);
+	}
+
+	while ((skb = __skb_dequeue(&list)))
+		ath10k_htc_tx_completion_handler(ar, skb);
+}
+
+static void ath10k_snoc_htt_tx_cb(struct ath10k_ce_pipe *ce_state)
+{
+	struct ath10k *ar = ce_state->ar;
+	struct sk_buff *skb;
+
+	while (ath10k_ce_completed_send_next(ce_state, (void **)&skb) == 0) {
+		if (!skb)
+			continue;
+
+		dma_unmap_single(ar->dev, ATH10K_SKB_CB(skb)->paddr,
+				 skb->len, DMA_TO_DEVICE);
+		ath10k_htt_hif_tx_complete(ar, skb);
+	}
+}
+
+static int ath10k_snoc_hif_tx_sg(struct ath10k *ar, u8 pipe_id,
+				 struct ath10k_hif_sg_item *items, int n_items)
+{
+	struct ath10k_snoc *ar_snoc = ath10k_snoc_priv(ar);
+	struct ath10k_ce *ce = ath10k_ce_priv(ar);
+	struct ath10k_snoc_pipe *snoc_pipe;
+	struct ath10k_ce_pipe *ce_pipe;
+	int err, i = 0;
+
+	snoc_pipe = &ar_snoc->pipe_info[pipe_id];
+	ce_pipe = snoc_pipe->ce_hdl;
+	spin_lock_bh(&ce->ce_lock);
+
+	for (i = 0; i < n_items - 1; i++) {
+		ath10k_dbg(ar, ATH10K_DBG_SNOC,
+			   "snoc tx item %d paddr %pad len %d n_items %d\n",
+			   i, &items[i].paddr, items[i].len, n_items);
+
+		err = ath10k_ce_send_nolock(ce_pipe,
+					    items[i].transfer_context,
+					    items[i].paddr,
+					    items[i].len,
+					    items[i].transfer_id,
+					    CE_SEND_FLAG_GATHER);
+		if (err)
+			goto err;
+	}
+
+	ath10k_dbg(ar, ATH10K_DBG_SNOC,
+		   "snoc tx item %d paddr %pad len %d n_items %d\n",
+		   i, &items[i].paddr, items[i].len, n_items);
+
+	err = ath10k_ce_send_nolock(ce_pipe,
+				    items[i].transfer_context,
+				    items[i].paddr,
+				    items[i].len,
+				    items[i].transfer_id,
+				    0);
+	if (err)
+		goto err;
+
+	spin_unlock_bh(&ce->ce_lock);
+
+	return 0;
+
+err:
+	for (; i > 0; i--)
+		__ath10k_ce_send_revert(ce_pipe);
+
+	spin_unlock_bh(&ce->ce_lock);
+	return err;
+}
+
+static u16 ath10k_snoc_hif_get_free_queue_number(struct ath10k *ar, u8 pipe)
+{
+	struct ath10k_snoc *ar_snoc = ath10k_snoc_priv(ar);
+
+	ath10k_dbg(ar, ATH10K_DBG_SNOC, "hif get free queue number\n");
+
+	return ath10k_ce_num_free_src_entries(ar_snoc->pipe_info[pipe].ce_hdl);
+}
+
+static void ath10k_snoc_hif_send_complete_check(struct ath10k *ar, u8 pipe,
+						int force)
+{
+	int resources;
+
+	ath10k_dbg(ar, ATH10K_DBG_SNOC, "snoc hif send complete check\n");
+
+	if (!force) {
+		resources = ath10k_snoc_hif_get_free_queue_number(ar, pipe);
+
+		if (resources > (host_ce_config_wlan[pipe].src_nentries >> 1))
+			return;
+	}
+	ath10k_ce_per_engine_service(ar, pipe);
+}
+
 static int ath10k_snoc_hif_map_service_to_pipe(struct ath10k *ar,
 					       u16 service_id,
 					       u8 *ul_pipe, u8 *dl_pipe)
@@ -587,6 +701,9 @@ static const struct ath10k_hif_ops ath10k_snoc_hif_ops = {
 	.get_default_pipe	= ath10k_snoc_hif_get_default_pipe,
 	.power_up		= ath10k_snoc_hif_power_up,
 	.power_down		= ath10k_snoc_hif_power_down,
+	.tx_sg			= ath10k_snoc_hif_tx_sg,
+	.send_complete_check	= ath10k_snoc_hif_send_complete_check,
+	.get_free_queue_number	= ath10k_snoc_hif_get_free_queue_number,
 };
 
 static const struct ath10k_bus_ops ath10k_snoc_bus_ops = {
-- 
cgit v1.2.3-59-g8ed1b


From d915105231ca0581a9f87e59ed00bc17a54e254f Mon Sep 17 00:00:00 2001
From: Govind Singh <govinds@codeaurora.org>
Date: Tue, 10 Apr 2018 18:01:25 +0300
Subject: ath10k: add hif rx methods for wcn3990

Add hif rx methods in rx path for wcn3990
target.

Signed-off-by: Govind Singh <govinds@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/snoc.c | 171 +++++++++++++++++++++++++++++----
 1 file changed, 153 insertions(+), 18 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/snoc.c b/drivers/net/wireless/ath/ath10k/snoc.c
index 899f0a327050..6d9cccee9bbe 100644
--- a/drivers/net/wireless/ath/ath10k/snoc.c
+++ b/drivers/net/wireless/ath/ath10k/snoc.c
@@ -26,6 +26,7 @@
 #include <linux/platform_device.h>
 #define  WCN3990_CE_ATTR_FLAGS 0
 #define ATH10K_SNOC_RX_POST_RETRY_MS 50
+#define CE_POLL_PIPE 4
 
 static char *const ce_name[] = {
 	"WLAN_CE_0",
@@ -44,6 +45,9 @@ static char *const ce_name[] = {
 
 static void ath10k_snoc_htc_tx_cb(struct ath10k_ce_pipe *ce_state);
 static void ath10k_snoc_htt_tx_cb(struct ath10k_ce_pipe *ce_state);
+static void ath10k_snoc_htc_rx_cb(struct ath10k_ce_pipe *ce_state);
+static void ath10k_snoc_htt_rx_cb(struct ath10k_ce_pipe *ce_state);
+static void ath10k_snoc_htt_htc_rx_cb(struct ath10k_ce_pipe *ce_state);
 
 static const struct ath10k_snoc_drv_priv drv_priv = {
 	.hw_rev = ATH10K_HW_WCN3990,
@@ -53,7 +57,7 @@ static const struct ath10k_snoc_drv_priv drv_priv = {
 static struct ce_attr host_ce_config_wlan[] = {
 	/* CE0: host->target HTC control streams */
 	{
-		.flags = WCN3990_CE_ATTR_FLAGS,
+		.flags = CE_ATTR_FLAGS,
 		.src_nentries = 16,
 		.src_sz_max = 2048,
 		.dest_nentries = 0,
@@ -62,25 +66,25 @@ static struct ce_attr host_ce_config_wlan[] = {
 
 	/* CE1: target->host HTT + HTC control */
 	{
-		.flags = WCN3990_CE_ATTR_FLAGS,
+		.flags = CE_ATTR_FLAGS,
 		.src_nentries = 0,
 		.src_sz_max = 2048,
 		.dest_nentries = 512,
-		.recv_cb = NULL,
+		.recv_cb = ath10k_snoc_htt_htc_rx_cb,
 	},
 
 	/* CE2: target->host WMI */
 	{
-		.flags = WCN3990_CE_ATTR_FLAGS,
+		.flags = CE_ATTR_FLAGS,
 		.src_nentries = 0,
 		.src_sz_max = 2048,
 		.dest_nentries = 64,
-		.recv_cb = NULL,
+		.recv_cb = ath10k_snoc_htc_rx_cb,
 	},
 
 	/* CE3: host->target WMI */
 	{
-		.flags = WCN3990_CE_ATTR_FLAGS,
+		.flags = CE_ATTR_FLAGS,
 		.src_nentries = 32,
 		.src_sz_max = 2048,
 		.dest_nentries = 0,
@@ -89,7 +93,7 @@ static struct ce_attr host_ce_config_wlan[] = {
 
 	/* CE4: host->target HTT */
 	{
-		.flags = WCN3990_CE_ATTR_FLAGS | CE_ATTR_DIS_INTR,
+		.flags = CE_ATTR_FLAGS | CE_ATTR_DIS_INTR,
 		.src_nentries = 256,
 		.src_sz_max = 256,
 		.dest_nentries = 0,
@@ -98,16 +102,16 @@ static struct ce_attr host_ce_config_wlan[] = {
 
 	/* CE5: target->host HTT (ipa_uc->target ) */
 	{
-		.flags = WCN3990_CE_ATTR_FLAGS,
+		.flags = CE_ATTR_FLAGS,
 		.src_nentries = 0,
 		.src_sz_max = 512,
 		.dest_nentries = 512,
-		.recv_cb = NULL,
+		.recv_cb = ath10k_snoc_htt_rx_cb,
 	},
 
 	/* CE6: target autonomous hif_memcpy */
 	{
-		.flags = WCN3990_CE_ATTR_FLAGS,
+		.flags = CE_ATTR_FLAGS,
 		.src_nentries = 0,
 		.src_sz_max = 0,
 		.dest_nentries = 0,
@@ -115,7 +119,7 @@ static struct ce_attr host_ce_config_wlan[] = {
 
 	/* CE7: ce_diag, the Diagnostic Window */
 	{
-		.flags = WCN3990_CE_ATTR_FLAGS,
+		.flags = CE_ATTR_FLAGS,
 		.src_nentries = 2,
 		.src_sz_max = 2048,
 		.dest_nentries = 2,
@@ -123,7 +127,7 @@ static struct ce_attr host_ce_config_wlan[] = {
 
 	/* CE8: Target to uMC */
 	{
-		.flags = WCN3990_CE_ATTR_FLAGS,
+		.flags = CE_ATTR_FLAGS,
 		.src_nentries = 0,
 		.src_sz_max = 2048,
 		.dest_nentries = 128,
@@ -131,29 +135,29 @@ static struct ce_attr host_ce_config_wlan[] = {
 
 	/* CE9 target->host HTT */
 	{
-		.flags = WCN3990_CE_ATTR_FLAGS,
+		.flags = CE_ATTR_FLAGS,
 		.src_nentries = 0,
 		.src_sz_max = 2048,
 		.dest_nentries = 512,
-		.recv_cb = NULL,
+		.recv_cb = ath10k_snoc_htt_htc_rx_cb,
 	},
 
 	/* CE10: target->host HTT */
 	{
-		.flags = WCN3990_CE_ATTR_FLAGS,
+		.flags = CE_ATTR_FLAGS,
 		.src_nentries = 0,
 		.src_sz_max = 2048,
 		.dest_nentries = 512,
-		.recv_cb = NULL,
+		.recv_cb = ath10k_snoc_htt_htc_rx_cb,
 	},
 
 	/* CE11: target -> host PKTLOG */
 	{
-		.flags = WCN3990_CE_ATTR_FLAGS,
+		.flags = CE_ATTR_FLAGS,
 		.src_nentries = 0,
 		.src_sz_max = 2048,
 		.dest_nentries = 512,
-		.recv_cb = NULL,
+		.recv_cb = ath10k_snoc_htt_htc_rx_cb,
 	},
 };
 
@@ -362,6 +366,82 @@ static void ath10k_snoc_rx_post(struct ath10k *ar)
 		ath10k_snoc_rx_post_pipe(&ar_snoc->pipe_info[i]);
 }
 
+static void ath10k_snoc_process_rx_cb(struct ath10k_ce_pipe *ce_state,
+				      void (*callback)(struct ath10k *ar,
+						       struct sk_buff *skb))
+{
+	struct ath10k *ar = ce_state->ar;
+	struct ath10k_snoc *ar_snoc = ath10k_snoc_priv(ar);
+	struct ath10k_snoc_pipe *pipe_info =  &ar_snoc->pipe_info[ce_state->id];
+	struct sk_buff *skb;
+	struct sk_buff_head list;
+	void *transfer_context;
+	unsigned int nbytes, max_nbytes;
+
+	__skb_queue_head_init(&list);
+	while (ath10k_ce_completed_recv_next(ce_state, &transfer_context,
+					     &nbytes) == 0) {
+		skb = transfer_context;
+		max_nbytes = skb->len + skb_tailroom(skb);
+		dma_unmap_single(ar->dev, ATH10K_SKB_RXCB(skb)->paddr,
+				 max_nbytes, DMA_FROM_DEVICE);
+
+		if (unlikely(max_nbytes < nbytes)) {
+			ath10k_warn(ar, "rxed more than expected (nbytes %d, max %d)",
+				    nbytes, max_nbytes);
+			dev_kfree_skb_any(skb);
+			continue;
+		}
+
+		skb_put(skb, nbytes);
+		__skb_queue_tail(&list, skb);
+	}
+
+	while ((skb = __skb_dequeue(&list))) {
+		ath10k_dbg(ar, ATH10K_DBG_SNOC, "snoc rx ce pipe %d len %d\n",
+			   ce_state->id, skb->len);
+
+		callback(ar, skb);
+	}
+
+	ath10k_snoc_rx_post_pipe(pipe_info);
+}
+
+static void ath10k_snoc_htc_rx_cb(struct ath10k_ce_pipe *ce_state)
+{
+	ath10k_snoc_process_rx_cb(ce_state, ath10k_htc_rx_completion_handler);
+}
+
+static void ath10k_snoc_htt_htc_rx_cb(struct ath10k_ce_pipe *ce_state)
+{
+	/* CE4 polling needs to be done whenever CE pipe which transports
+	 * HTT Rx (target->host) is processed.
+	 */
+	ath10k_ce_per_engine_service(ce_state->ar, CE_POLL_PIPE);
+
+	ath10k_snoc_process_rx_cb(ce_state, ath10k_htc_rx_completion_handler);
+}
+
+static void ath10k_snoc_htt_rx_deliver(struct ath10k *ar, struct sk_buff *skb)
+{
+	skb_pull(skb, sizeof(struct ath10k_htc_hdr));
+	ath10k_htt_t2h_msg_handler(ar, skb);
+}
+
+static void ath10k_snoc_htt_rx_cb(struct ath10k_ce_pipe *ce_state)
+{
+	ath10k_ce_per_engine_service(ce_state->ar, CE_POLL_PIPE);
+	ath10k_snoc_process_rx_cb(ce_state, ath10k_snoc_htt_rx_deliver);
+}
+
+static void ath10k_snoc_rx_replenish_retry(struct timer_list *t)
+{
+	struct ath10k_pci *ar_snoc = from_timer(ar_snoc, t, rx_post_retry);
+	struct ath10k *ar = ar_snoc->ar;
+
+	ath10k_snoc_rx_post(ar);
+}
+
 static void ath10k_snoc_htc_tx_cb(struct ath10k_ce_pipe *ce_state)
 {
 	struct ath10k *ar = ce_state->ar;
@@ -620,6 +700,8 @@ static void ath10k_snoc_hif_stop(struct ath10k *ar)
 {
 	ath10k_snoc_irq_disable(ar);
 	ath10k_snoc_buffer_cleanup(ar);
+	napi_synchronize(&ar->napi);
+	napi_disable(&ar->napi);
 	ath10k_dbg(ar, ATH10K_DBG_BOOT, "boot hif stop\n");
 }
 
@@ -684,6 +766,7 @@ static int ath10k_snoc_hif_power_up(struct ath10k *ar)
 		goto err_wlan_enable;
 	}
 
+	napi_enable(&ar->napi);
 	return 0;
 
 err_wlan_enable:
@@ -711,11 +794,60 @@ static const struct ath10k_bus_ops ath10k_snoc_bus_ops = {
 	.write32	= ath10k_snoc_write32,
 };
 
+int ath10k_snoc_get_ce_id_from_irq(struct ath10k *ar, int irq)
+{
+	struct ath10k_snoc *ar_snoc = ath10k_snoc_priv(ar);
+	int i;
+
+	for (i = 0; i < CE_COUNT_MAX; i++) {
+		if (ar_snoc->ce_irqs[i].irq_line == irq)
+			return i;
+	}
+	ath10k_err(ar, "No matching CE id for irq %d\n", irq);
+
+	return -EINVAL;
+}
+
 static irqreturn_t ath10k_snoc_per_engine_handler(int irq, void *arg)
 {
+	struct ath10k *ar = arg;
+	struct ath10k_snoc *ar_snoc = ath10k_snoc_priv(ar);
+	int ce_id = ath10k_snoc_get_ce_id_from_irq(ar, irq);
+
+	if (ce_id < 0 || ce_id >= ARRAY_SIZE(ar_snoc->pipe_info)) {
+		ath10k_warn(ar, "unexpected/invalid irq %d ce_id %d\n", irq,
+			    ce_id);
+		return IRQ_HANDLED;
+	}
+
+	ath10k_snoc_irq_disable(ar);
+	napi_schedule(&ar->napi);
+
 	return IRQ_HANDLED;
 }
 
+static int ath10k_snoc_napi_poll(struct napi_struct *ctx, int budget)
+{
+	struct ath10k *ar = container_of(ctx, struct ath10k, napi);
+	int done = 0;
+
+	ath10k_ce_per_engine_service_any(ar);
+	done = ath10k_htt_txrx_compl_task(ar, budget);
+
+	if (done < budget) {
+		napi_complete(ctx);
+		ath10k_snoc_irq_enable(ar);
+	}
+
+	return done;
+}
+
+void ath10k_snoc_init_napi(struct ath10k *ar)
+{
+	netif_napi_add(&ar->napi_dev, &ar->napi, ath10k_snoc_napi_poll,
+		       ATH10K_NAPI_BUDGET);
+}
+
 static int ath10k_snoc_request_irq(struct ath10k *ar)
 {
 	struct ath10k_snoc *ar_snoc = ath10k_snoc_priv(ar);
@@ -796,6 +928,7 @@ static int ath10k_snoc_setup_resource(struct ath10k *ar)
 	struct ath10k_snoc_pipe *pipe;
 	int i, ret;
 
+	timer_setup(&ar_snoc->rx_post_retry, ath10k_snoc_rx_replenish_retry, 0);
 	spin_lock_init(&ce->ce_lock);
 	for (i = 0; i < CE_COUNT; i++) {
 		pipe = &ar_snoc->pipe_info[i];
@@ -812,6 +945,7 @@ static int ath10k_snoc_setup_resource(struct ath10k *ar)
 
 		pipe->buf_sz = host_ce_config_wlan[i].src_sz_max;
 	}
+	ath10k_snoc_init_napi(ar);
 
 	return 0;
 }
@@ -820,6 +954,7 @@ static void ath10k_snoc_release_resource(struct ath10k *ar)
 {
 	int i;
 
+	netif_napi_del(&ar->napi);
 	for (i = 0; i < CE_COUNT; i++)
 		ath10k_ce_free_pipe(ar, i);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 546d407c905bc893886edae52f2323db8eded9b9 Mon Sep 17 00:00:00 2001
From: Govind Singh <govinds@codeaurora.org>
Date: Tue, 10 Apr 2018 18:01:27 +0300
Subject: ath10k: modify hif tx paddr to dma_addr_t type

Change type of hif sg tx paddr to dma_addr_t for
supporting target having addressing mode greater than
32 bit.

Signed-off-by: Govind Singh <govinds@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/hif.h | 2 +-
 drivers/net/wireless/ath/ath10k/pci.c | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/hif.h b/drivers/net/wireless/ath/ath10k/hif.h
index 6da4e3369c5a..7abb13c1f10b 100644
--- a/drivers/net/wireless/ath/ath10k/hif.h
+++ b/drivers/net/wireless/ath/ath10k/hif.h
@@ -26,7 +26,7 @@ struct ath10k_hif_sg_item {
 	u16 transfer_id;
 	void *transfer_context; /* NULL = tx completion callback not called */
 	void *vaddr; /* for debugging mostly */
-	u32 paddr;
+	dma_addr_t paddr;
 	u16 len;
 };
 
diff --git a/drivers/net/wireless/ath/ath10k/pci.c b/drivers/net/wireless/ath/ath10k/pci.c
index fd1566cd7d2b..af2cf55c4c1e 100644
--- a/drivers/net/wireless/ath/ath10k/pci.c
+++ b/drivers/net/wireless/ath/ath10k/pci.c
@@ -1383,8 +1383,8 @@ int ath10k_pci_hif_tx_sg(struct ath10k *ar, u8 pipe_id,
 
 	for (i = 0; i < n_items - 1; i++) {
 		ath10k_dbg(ar, ATH10K_DBG_PCI,
-			   "pci tx item %d paddr 0x%08x len %d n_items %d\n",
-			   i, items[i].paddr, items[i].len, n_items);
+			   "pci tx item %d paddr %pad len %d n_items %d\n",
+			   i, &items[i].paddr, items[i].len, n_items);
 		ath10k_dbg_dump(ar, ATH10K_DBG_PCI_DUMP, NULL, "pci tx data: ",
 				items[i].vaddr, items[i].len);
 
@@ -1401,8 +1401,8 @@ int ath10k_pci_hif_tx_sg(struct ath10k *ar, u8 pipe_id,
 	/* `i` is equal to `n_items -1` after for() */
 
 	ath10k_dbg(ar, ATH10K_DBG_PCI,
-		   "pci tx item %d paddr 0x%08x len %d n_items %d\n",
-		   i, items[i].paddr, items[i].len, n_items);
+		   "pci tx item %d paddr %pad len %d n_items %d\n",
+		   i, &items[i].paddr, items[i].len, n_items);
 	ath10k_dbg_dump(ar, ATH10K_DBG_PCI_DUMP, NULL, "pci tx data: ",
 			items[i].vaddr, items[i].len);
 
-- 
cgit v1.2.3-59-g8ed1b


From 140d1214ef555bcb14c7720e91d8a9594e4ab506 Mon Sep 17 00:00:00 2001
From: Rakesh Pillai <pillair@codeaurora.org>
Date: Tue, 10 Apr 2018 18:01:29 +0300
Subject: ath10k: add support to get target info from hif ops

wcn3990 does not use bmi.
Add support to get target info from hif ops.

Signed-off-by: Rakesh Pillai <pillair@codeaurora.org>
Signed-off-by: Govind Singh <govinds@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/core.c |  8 ++++++++
 drivers/net/wireless/ath/ath10k/hif.h  | 13 +++++++++++++
 drivers/net/wireless/ath/ath10k/snoc.c | 10 ++++++++++
 3 files changed, 31 insertions(+)

diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c
index 8a3020dbd4cf..6a9ad4ab8e4c 100644
--- a/drivers/net/wireless/ath/ath10k/core.c
+++ b/drivers/net/wireless/ath/ath10k/core.c
@@ -2472,6 +2472,14 @@ static int ath10k_core_probe_fw(struct ath10k *ar)
 		ar->hw->wiphy->hw_version = target_info.version;
 		break;
 	case ATH10K_BUS_SNOC:
+		memset(&target_info, 0, sizeof(target_info));
+		ret = ath10k_hif_get_target_info(ar, &target_info);
+		if (ret) {
+			ath10k_err(ar, "could not get target info (%d)\n", ret);
+			goto err_power_down;
+		}
+		ar->target_version = target_info.version;
+		ar->hw->wiphy->hw_version = target_info.version;
 		break;
 	default:
 		ath10k_err(ar, "incorrect hif bus type: %d\n", ar->hif.bus);
diff --git a/drivers/net/wireless/ath/ath10k/hif.h b/drivers/net/wireless/ath/ath10k/hif.h
index 7abb13c1f10b..1a59ea0068c2 100644
--- a/drivers/net/wireless/ath/ath10k/hif.h
+++ b/drivers/net/wireless/ath/ath10k/hif.h
@@ -20,6 +20,7 @@
 
 #include <linux/kernel.h>
 #include "core.h"
+#include "bmi.h"
 #include "debug.h"
 
 struct ath10k_hif_sg_item {
@@ -93,6 +94,9 @@ struct ath10k_hif_ops {
 	/* fetch calibration data from target eeprom */
 	int (*fetch_cal_eeprom)(struct ath10k *ar, void **data,
 				size_t *data_len);
+
+	int (*get_target_info)(struct ath10k *ar,
+			       struct bmi_target_info *target_info);
 };
 
 static inline int ath10k_hif_tx_sg(struct ath10k *ar, u8 pipe_id,
@@ -218,4 +222,13 @@ static inline int ath10k_hif_fetch_cal_eeprom(struct ath10k *ar,
 	return ar->hif.ops->fetch_cal_eeprom(ar, data, data_len);
 }
 
+static inline int ath10k_hif_get_target_info(struct ath10k *ar,
+					     struct bmi_target_info *tgt_info)
+{
+	if (!ar->hif.ops->get_target_info)
+		return -EOPNOTSUPP;
+
+	return ar->hif.ops->get_target_info(ar, tgt_info);
+}
+
 #endif /* _HIF_H_ */
diff --git a/drivers/net/wireless/ath/ath10k/snoc.c b/drivers/net/wireless/ath/ath10k/snoc.c
index 6d9cccee9bbe..1ef0d3b51b8f 100644
--- a/drivers/net/wireless/ath/ath10k/snoc.c
+++ b/drivers/net/wireless/ath/ath10k/snoc.c
@@ -528,6 +528,15 @@ err:
 	return err;
 }
 
+static int ath10k_snoc_hif_get_target_info(struct ath10k *ar,
+					   struct bmi_target_info *target_info)
+{
+	target_info->version = ATH10K_HW_WCN3990;
+	target_info->type = ATH10K_HW_WCN3990;
+
+	return 0;
+}
+
 static u16 ath10k_snoc_hif_get_free_queue_number(struct ath10k *ar, u8 pipe)
 {
 	struct ath10k_snoc *ar_snoc = ath10k_snoc_priv(ar);
@@ -787,6 +796,7 @@ static const struct ath10k_hif_ops ath10k_snoc_hif_ops = {
 	.tx_sg			= ath10k_snoc_hif_tx_sg,
 	.send_complete_check	= ath10k_snoc_hif_send_complete_check,
 	.get_free_queue_number	= ath10k_snoc_hif_get_free_queue_number,
+	.get_target_info	= ath10k_snoc_hif_get_target_info,
 };
 
 static const struct ath10k_bus_ops ath10k_snoc_bus_ops = {
-- 
cgit v1.2.3-59-g8ed1b


From ea66b12e63ac62de19685c6900903d22e2680be6 Mon Sep 17 00:00:00 2001
From: Rakesh Pillai <pillair@codeaurora.org>
Date: Tue, 10 Apr 2018 18:01:32 +0300
Subject: ath10k: check all CE for data if irq summary is not retained

WCN3990 has interrupts per CE and the interrupt summary
is not retained after the interrupt handler has finished
execution. We need to check if we received any
ce in rx and tx completion path.

Generate a interrupt summary with all CE interrupts if
the target does not retain interrupt summary after the
execution of interrupt handler.

Signed-off-by: Rakesh Pillai <pillair@codeaurora.org>
Signed-off-by: Govind Singh <govinds@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/ce.h   | 10 +++++++---
 drivers/net/wireless/ath/ath10k/core.c | 13 +++++++++++++
 drivers/net/wireless/ath/ath10k/hw.h   |  3 +++
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/ce.h b/drivers/net/wireless/ath/ath10k/ce.h
index 2c3c8f5e90ea..d8f9da334529 100644
--- a/drivers/net/wireless/ath/ath10k/ce.h
+++ b/drivers/net/wireless/ath/ath10k/ce.h
@@ -355,14 +355,18 @@ static inline u32 ath10k_ce_base_address(struct ath10k *ar, unsigned int ce_id)
 	(((x) & CE_WRAPPER_INTERRUPT_SUMMARY_HOST_MSI_MASK) >> \
 		CE_WRAPPER_INTERRUPT_SUMMARY_HOST_MSI_LSB)
 #define CE_WRAPPER_INTERRUPT_SUMMARY_ADDRESS			0x0000
+#define CE_INTERRUPT_SUMMARY		(GENMASK(CE_COUNT_MAX - 1, 0))
 
 static inline u32 ath10k_ce_interrupt_summary(struct ath10k *ar)
 {
 	struct ath10k_ce *ce = ath10k_ce_priv(ar);
 
-	return CE_WRAPPER_INTERRUPT_SUMMARY_HOST_MSI_GET(
-		ce->bus_ops->read32((ar), CE_WRAPPER_BASE_ADDRESS +
-		CE_WRAPPER_INTERRUPT_SUMMARY_ADDRESS));
+	if (!ar->hw_params.per_ce_irq)
+		return CE_WRAPPER_INTERRUPT_SUMMARY_HOST_MSI_GET(
+			ce->bus_ops->read32((ar), CE_WRAPPER_BASE_ADDRESS +
+			CE_WRAPPER_INTERRUPT_SUMMARY_ADDRESS));
+	else
+		return CE_INTERRUPT_SUMMARY;
 }
 
 #endif /* _CE_H_ */
diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c
index 6a9ad4ab8e4c..a21530dacc35 100644
--- a/drivers/net/wireless/ath/ath10k/core.c
+++ b/drivers/net/wireless/ath/ath10k/core.c
@@ -119,6 +119,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.num_wds_entries = 0x20,
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
+		.per_ce_irq = false,
 	},
 	{
 		.id = QCA9887_HW_1_0_VERSION,
@@ -148,6 +149,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.num_wds_entries = 0x20,
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
+		.per_ce_irq = false,
 	},
 	{
 		.id = QCA6174_HW_2_1_VERSION,
@@ -176,6 +178,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.num_wds_entries = 0x20,
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
+		.per_ce_irq = false,
 	},
 	{
 		.id = QCA6174_HW_2_1_VERSION,
@@ -204,6 +207,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.num_wds_entries = 0x20,
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
+		.per_ce_irq = false,
 	},
 	{
 		.id = QCA6174_HW_3_0_VERSION,
@@ -232,6 +236,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.num_wds_entries = 0x20,
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
+		.per_ce_irq = false,
 	},
 	{
 		.id = QCA6174_HW_3_2_VERSION,
@@ -263,6 +268,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.num_wds_entries = 0x20,
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
+		.per_ce_irq = false,
 	},
 	{
 		.id = QCA99X0_HW_2_0_DEV_VERSION,
@@ -297,6 +303,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.num_wds_entries = 0x20,
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
+		.per_ce_irq = false,
 	},
 	{
 		.id = QCA9984_HW_1_0_DEV_VERSION,
@@ -336,6 +343,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.num_wds_entries = 0x20,
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
+		.per_ce_irq = false,
 	},
 	{
 		.id = QCA9888_HW_2_0_DEV_VERSION,
@@ -374,6 +382,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.num_wds_entries = 0x20,
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
+		.per_ce_irq = false,
 	},
 	{
 		.id = QCA9377_HW_1_0_DEV_VERSION,
@@ -402,6 +411,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.num_wds_entries = 0x20,
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
+		.per_ce_irq = false,
 	},
 	{
 		.id = QCA9377_HW_1_1_DEV_VERSION,
@@ -432,6 +442,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.num_wds_entries = 0x20,
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
+		.per_ce_irq = false,
 	},
 	{
 		.id = QCA4019_HW_1_0_DEV_VERSION,
@@ -467,6 +478,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.num_wds_entries = 0x20,
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
+		.per_ce_irq = false,
 	},
 	{
 		.id = WCN3990_HW_1_0_DEV_VERSION,
@@ -487,6 +499,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.num_wds_entries = TARGET_HL_10_TLV_NUM_WDS_ENTRIES,
 		.target_64bit = true,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL_DUAL_MAC,
+		.per_ce_irq = true,
 	},
 };
 
diff --git a/drivers/net/wireless/ath/ath10k/hw.h b/drivers/net/wireless/ath/ath10k/hw.h
index 413b1b4321f7..3041eba61e54 100644
--- a/drivers/net/wireless/ath/ath10k/hw.h
+++ b/drivers/net/wireless/ath/ath10k/hw.h
@@ -568,6 +568,9 @@ struct ath10k_hw_params {
 
 	/* Target rx ring fill level */
 	u32 rx_ring_fill_level;
+
+	/* target supporting per ce IRQ */
+	bool per_ce_irq;
 };
 
 struct htt_rx_desc;
-- 
cgit v1.2.3-59-g8ed1b


From a6a793f98786fe146f8926b02b320f0d9b48a61c Mon Sep 17 00:00:00 2001
From: Govind Singh <govinds@codeaurora.org>
Date: Tue, 10 Apr 2018 18:01:34 +0300
Subject: ath10k: vote for hardware resources for WCN3990

Add clock and regulator votes for WCN3990 WLAN
chipset.

Signed-off-by: Govind Singh <govinds@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/snoc.c | 313 ++++++++++++++++++++++++++++++++-
 drivers/net/wireless/ath/ath10k/snoc.h |  19 ++
 2 files changed, 331 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath10k/snoc.c b/drivers/net/wireless/ath/ath10k/snoc.c
index 1ef0d3b51b8f..2e490ff124f1 100644
--- a/drivers/net/wireless/ath/ath10k/snoc.c
+++ b/drivers/net/wireless/ath/ath10k/snoc.c
@@ -24,6 +24,8 @@
 #include <linux/of.h>
 #include <linux/of_device.h>
 #include <linux/platform_device.h>
+#include <linux/regulator/consumer.h>
+#include <linux/clk.h>
 #define  WCN3990_CE_ATTR_FLAGS 0
 #define ATH10K_SNOC_RX_POST_RETRY_MS 50
 #define CE_POLL_PIPE 4
@@ -43,6 +45,17 @@ static char *const ce_name[] = {
 	"WLAN_CE_11",
 };
 
+static struct ath10k_wcn3990_vreg_info vreg_cfg[] = {
+	{NULL, "vdd-0.8-cx-mx", 800000, 800000, 0, 0, false},
+	{NULL, "vdd-1.8-xo", 1800000, 1800000, 0, 0, false},
+	{NULL, "vdd-1.3-rfa", 1304000, 1304000, 0, 0, false},
+	{NULL, "vdd-3.3-ch0", 3312000, 3312000, 0, 0, false},
+};
+
+static struct ath10k_wcn3990_clk_info clk_cfg[] = {
+	{NULL, "cxo_ref_clk_pin", 0, false},
+};
+
 static void ath10k_snoc_htc_tx_cb(struct ath10k_ce_pipe *ce_state);
 static void ath10k_snoc_htt_tx_cb(struct ath10k_ce_pipe *ce_state);
 static void ath10k_snoc_htc_rx_cb(struct ath10k_ce_pipe *ce_state);
@@ -969,6 +982,277 @@ static void ath10k_snoc_release_resource(struct ath10k *ar)
 		ath10k_ce_free_pipe(ar, i);
 }
 
+static int ath10k_get_vreg_info(struct ath10k *ar, struct device *dev,
+				struct ath10k_wcn3990_vreg_info *vreg_info)
+{
+	struct regulator *reg;
+	int ret = 0;
+
+	reg = devm_regulator_get_optional(dev, vreg_info->name);
+
+	if (IS_ERR(reg)) {
+		ret = PTR_ERR(reg);
+
+		if (ret  == -EPROBE_DEFER) {
+			ath10k_err(ar, "EPROBE_DEFER for regulator: %s\n",
+				   vreg_info->name);
+			return ret;
+		}
+		if (vreg_info->required) {
+			ath10k_err(ar, "Regulator %s doesn't exist: %d\n",
+				   vreg_info->name, ret);
+			return ret;
+		}
+		ath10k_dbg(ar, ATH10K_DBG_SNOC,
+			   "Optional regulator %s doesn't exist: %d\n",
+			   vreg_info->name, ret);
+		goto done;
+	}
+
+	vreg_info->reg = reg;
+
+done:
+	ath10k_dbg(ar, ATH10K_DBG_SNOC,
+		   "snog vreg %s min_v %u max_v %u load_ua %u settle_delay %lu\n",
+		   vreg_info->name, vreg_info->min_v, vreg_info->max_v,
+		   vreg_info->load_ua, vreg_info->settle_delay);
+
+	return 0;
+}
+
+static int ath10k_get_clk_info(struct ath10k *ar, struct device *dev,
+			       struct ath10k_wcn3990_clk_info *clk_info)
+{
+	struct clk *handle;
+	int ret = 0;
+
+	handle = devm_clk_get(dev, clk_info->name);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		if (clk_info->required) {
+			ath10k_err(ar, "snoc clock %s isn't available: %d\n",
+				   clk_info->name, ret);
+			return ret;
+		}
+		ath10k_dbg(ar, ATH10K_DBG_SNOC, "snoc ignoring clock %s: %d\n",
+			   clk_info->name,
+			   ret);
+		return 0;
+	}
+
+	ath10k_dbg(ar, ATH10K_DBG_SNOC, "snoc clock %s freq %u\n",
+		   clk_info->name, clk_info->freq);
+
+	clk_info->handle = handle;
+
+	return ret;
+}
+
+static int ath10k_wcn3990_vreg_on(struct ath10k *ar)
+{
+	struct ath10k_snoc *ar_snoc = ath10k_snoc_priv(ar);
+	struct ath10k_wcn3990_vreg_info *vreg_info;
+	int ret = 0;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(vreg_cfg); i++) {
+		vreg_info = &ar_snoc->vreg[i];
+
+		if (!vreg_info->reg)
+			continue;
+
+		ath10k_dbg(ar, ATH10K_DBG_SNOC, "snoc regulator %s being enabled\n",
+			   vreg_info->name);
+
+		ret = regulator_set_voltage(vreg_info->reg, vreg_info->min_v,
+					    vreg_info->max_v);
+		if (ret) {
+			ath10k_err(ar,
+				   "failed to set regulator %s voltage-min: %d voltage-max: %d\n",
+				   vreg_info->name, vreg_info->min_v, vreg_info->max_v);
+			goto err_reg_config;
+		}
+
+		if (vreg_info->load_ua) {
+			ret = regulator_set_load(vreg_info->reg,
+						 vreg_info->load_ua);
+			if (ret < 0) {
+				ath10k_err(ar,
+					   "failed to set regulator %s load: %d\n",
+					   vreg_info->name,
+					   vreg_info->load_ua);
+				goto err_reg_config;
+			}
+		}
+
+		ret = regulator_enable(vreg_info->reg);
+		if (ret) {
+			ath10k_err(ar, "failed to enable regulator %s\n",
+				   vreg_info->name);
+			goto err_reg_config;
+		}
+
+		if (vreg_info->settle_delay)
+			udelay(vreg_info->settle_delay);
+	}
+
+	return 0;
+
+err_reg_config:
+	for (; i >= 0; i--) {
+		vreg_info = &ar_snoc->vreg[i];
+
+		if (!vreg_info->reg)
+			continue;
+
+		regulator_disable(vreg_info->reg);
+		regulator_set_load(vreg_info->reg, 0);
+		regulator_set_voltage(vreg_info->reg, 0, vreg_info->max_v);
+	}
+
+	return ret;
+}
+
+static int ath10k_wcn3990_vreg_off(struct ath10k *ar)
+{
+	struct ath10k_snoc *ar_snoc = ath10k_snoc_priv(ar);
+	struct ath10k_wcn3990_vreg_info *vreg_info;
+	int ret = 0;
+	int i;
+
+	for (i = ARRAY_SIZE(vreg_cfg) - 1; i >= 0; i--) {
+		vreg_info = &ar_snoc->vreg[i];
+
+		if (!vreg_info->reg)
+			continue;
+
+		ath10k_dbg(ar, ATH10K_DBG_SNOC, "snoc regulator %s being disabled\n",
+			   vreg_info->name);
+
+		ret = regulator_disable(vreg_info->reg);
+		if (ret)
+			ath10k_err(ar, "failed to disable regulator %s\n",
+				   vreg_info->name);
+
+		ret = regulator_set_load(vreg_info->reg, 0);
+		if (ret < 0)
+			ath10k_err(ar, "failed to set load %s\n",
+				   vreg_info->name);
+
+		ret = regulator_set_voltage(vreg_info->reg, 0,
+					    vreg_info->max_v);
+		if (ret)
+			ath10k_err(ar, "failed to set voltage %s\n",
+				   vreg_info->name);
+	}
+
+	return ret;
+}
+
+static int ath10k_wcn3990_clk_init(struct ath10k *ar)
+{
+	struct ath10k_snoc *ar_snoc = ath10k_snoc_priv(ar);
+	struct ath10k_wcn3990_clk_info *clk_info;
+	int ret = 0;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(clk_cfg); i++) {
+		clk_info = &ar_snoc->clk[i];
+
+		if (!clk_info->handle)
+			continue;
+
+		ath10k_dbg(ar, ATH10K_DBG_SNOC, "snoc clock %s being enabled\n",
+			   clk_info->name);
+
+		if (clk_info->freq) {
+			ret = clk_set_rate(clk_info->handle, clk_info->freq);
+
+			if (ret) {
+				ath10k_err(ar, "failed to set clock %s freq %u\n",
+					   clk_info->name, clk_info->freq);
+				goto err_clock_config;
+			}
+		}
+
+		ret = clk_prepare_enable(clk_info->handle);
+		if (ret) {
+			ath10k_err(ar, "failed to enable clock %s\n",
+				   clk_info->name);
+			goto err_clock_config;
+		}
+	}
+
+	return 0;
+
+err_clock_config:
+	for (; i >= 0; i--) {
+		clk_info = &ar_snoc->clk[i];
+
+		if (!clk_info->handle)
+			continue;
+
+		clk_disable_unprepare(clk_info->handle);
+	}
+
+	return ret;
+}
+
+static int ath10k_wcn3990_clk_deinit(struct ath10k *ar)
+{
+	struct ath10k_snoc *ar_snoc = ath10k_snoc_priv(ar);
+	struct ath10k_wcn3990_clk_info *clk_info;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(clk_cfg); i++) {
+		clk_info = &ar_snoc->clk[i];
+
+		if (!clk_info->handle)
+			continue;
+
+		ath10k_dbg(ar, ATH10K_DBG_SNOC, "snoc clock %s being disabled\n",
+			   clk_info->name);
+
+		clk_disable_unprepare(clk_info->handle);
+	}
+
+	return 0;
+}
+
+static int ath10k_hw_power_on(struct ath10k *ar)
+{
+	int ret;
+
+	ath10k_dbg(ar, ATH10K_DBG_SNOC, "soc power on\n");
+
+	ret = ath10k_wcn3990_vreg_on(ar);
+	if (ret)
+		return ret;
+
+	ret = ath10k_wcn3990_clk_init(ar);
+	if (ret)
+		goto vreg_off;
+
+	return ret;
+
+vreg_off:
+	ath10k_wcn3990_vreg_off(ar);
+	return ret;
+}
+
+static int ath10k_hw_power_off(struct ath10k *ar)
+{
+	int ret;
+
+	ath10k_dbg(ar, ATH10K_DBG_SNOC, "soc power off\n");
+
+	ath10k_wcn3990_clk_deinit(ar);
+
+	ret = ath10k_wcn3990_vreg_off(ar);
+
+	return ret;
+}
+
 static const struct of_device_id ath10k_snoc_dt_match[] = {
 	{ .compatible = "qcom,wcn3990-wifi",
 	 .data = &drv_priv,
@@ -985,6 +1269,7 @@ static int ath10k_snoc_probe(struct platform_device *pdev)
 	struct device *dev;
 	struct ath10k *ar;
 	int ret;
+	u32 i;
 
 	of_id = of_match_device(ath10k_snoc_dt_match, &pdev->dev);
 	if (!of_id) {
@@ -1031,16 +1316,41 @@ static int ath10k_snoc_probe(struct platform_device *pdev)
 		ath10k_warn(ar, "failed to request irqs: %d\n", ret);
 		goto err_release_resource;
 	}
+
+	ar_snoc->vreg = vreg_cfg;
+	for (i = 0; i < ARRAY_SIZE(vreg_cfg); i++) {
+		ret = ath10k_get_vreg_info(ar, dev, &ar_snoc->vreg[i]);
+		if (ret)
+			goto err_free_irq;
+	}
+
+	ar_snoc->clk = clk_cfg;
+	for (i = 0; i < ARRAY_SIZE(clk_cfg); i++) {
+		ret = ath10k_get_clk_info(ar, dev, &ar_snoc->clk[i]);
+		if (ret)
+			goto err_free_irq;
+	}
+
+	ret = ath10k_hw_power_on(ar);
+	if (ret) {
+		ath10k_err(ar, "failed to power on device: %d\n", ret);
+		goto err_free_irq;
+	}
+
 	ret = ath10k_core_register(ar, drv_data->hw_rev);
 	if (ret) {
 		ath10k_err(ar, "failed to register driver core: %d\n", ret);
-		goto err_free_irq;
+		goto err_hw_power_off;
 	}
+
 	ath10k_dbg(ar, ATH10K_DBG_SNOC, "snoc probe\n");
 	ath10k_warn(ar, "Warning: SNOC support is still work-in-progress, it will not work properly!");
 
 	return 0;
 
+err_hw_power_off:
+	ath10k_hw_power_off(ar);
+
 err_free_irq:
 	ath10k_snoc_free_irq(ar);
 
@@ -1059,6 +1369,7 @@ static int ath10k_snoc_remove(struct platform_device *pdev)
 
 	ath10k_dbg(ar, ATH10K_DBG_SNOC, "snoc remove\n");
 	ath10k_core_unregister(ar);
+	ath10k_hw_power_off(ar);
 	ath10k_snoc_free_irq(ar);
 	ath10k_snoc_release_resource(ar);
 	ath10k_core_destroy(ar);
diff --git a/drivers/net/wireless/ath/ath10k/snoc.h b/drivers/net/wireless/ath/ath10k/snoc.h
index cf65b01e0085..05dc98f46ccd 100644
--- a/drivers/net/wireless/ath/ath10k/snoc.h
+++ b/drivers/net/wireless/ath/ath10k/snoc.h
@@ -52,6 +52,23 @@ struct ath10k_snoc_ce_irq {
 	u32 irq_line;
 };
 
+struct ath10k_wcn3990_vreg_info {
+	struct regulator *reg;
+	const char *name;
+	u32 min_v;
+	u32 max_v;
+	u32 load_ua;
+	unsigned long settle_delay;
+	bool required;
+};
+
+struct ath10k_wcn3990_clk_info {
+	struct clk *handle;
+	const char *name;
+	u32 freq;
+	bool required;
+};
+
 struct ath10k_snoc {
 	struct platform_device *dev;
 	struct ath10k *ar;
@@ -63,6 +80,8 @@ struct ath10k_snoc {
 	struct ath10k_snoc_ce_irq ce_irqs[CE_COUNT_MAX];
 	struct ath10k_ce ce;
 	struct timer_list rx_post_retry;
+	struct ath10k_wcn3990_vreg_info *vreg;
+	struct ath10k_wcn3990_clk_info *clk;
 };
 
 static inline struct ath10k_snoc *ath10k_snoc_priv(struct ath10k *ar)
-- 
cgit v1.2.3-59-g8ed1b


From ae316c4cbba2ee8f92bc3c5b040b275371ea052c Mon Sep 17 00:00:00 2001
From: Govind Singh <govinds@codeaurora.org>
Date: Tue, 10 Apr 2018 18:01:35 +0300
Subject: dt: bindings: add bindings for wcn3990 wifi block

Add device tree binding documentation details for wcn3990
wifi block present in Qualcomm SDM845/APQ8098 SoC into
"qcom,ath10k.txt".

Signed-off-by: Govind Singh <govinds@codeaurora.org>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 .../bindings/net/wireless/qcom,ath10k.txt          | 31 ++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/Documentation/devicetree/bindings/net/wireless/qcom,ath10k.txt b/Documentation/devicetree/bindings/net/wireless/qcom,ath10k.txt
index 3d2a031217da..7fd4e8ce4149 100644
--- a/Documentation/devicetree/bindings/net/wireless/qcom,ath10k.txt
+++ b/Documentation/devicetree/bindings/net/wireless/qcom,ath10k.txt
@@ -4,6 +4,7 @@ Required properties:
 - compatible: Should be one of the following:
 	* "qcom,ath10k"
 	* "qcom,ipq4019-wifi"
+	* "qcom,wcn3990-wifi"
 
 PCI based devices uses compatible string "qcom,ath10k" and takes calibration
 data along with board specific data via "qcom,ath10k-calibration-data".
@@ -18,8 +19,12 @@ In general, entry "qcom,ath10k-pre-calibration-data" and
 "qcom,ath10k-calibration-data" conflict with each other and only one
 can be provided per device.
 
+SNOC based devices (i.e. wcn3990) uses compatible string "qcom,wcn3990-wifi".
+
 Optional properties:
 - reg: Address and length of the register set for the device.
+- reg-names: Must include the list of following reg names,
+	     "membase"
 - resets: Must contain an entry for each entry in reset-names.
           See ../reset/reseti.txt for details.
 - reset-names: Must include the list of following reset names,
@@ -49,6 +54,8 @@ Optional properties:
 				 hw versions.
 - qcom,ath10k-pre-calibration-data : pre calibration data as an array,
 				     the length can vary between hw versions.
+- <supply-name>-supply: handle to the regulator device tree node
+			   optional "supply-name" is "vdd-0.8-cx-mx".
 
 Example (to supply the calibration data alone):
 
@@ -119,3 +126,27 @@ wifi0: wifi@a000000 {
 	qcom,msi_base = <0x40>;
 	qcom,ath10k-pre-calibration-data = [ 01 02 03 ... ];
 };
+
+Example (to supply wcn3990 SoC wifi block details):
+
+wifi@18000000 {
+		compatible = "qcom,wcn3990-wifi";
+		reg = <0x18800000 0x800000>;
+		reg-names = "membase";
+		clocks = <&clock_gcc clk_aggre2_noc_clk>;
+		clock-names = "smmu_aggre2_noc_clk"
+		interrupts =
+			   <0 130 0 /* CE0 */ >,
+			   <0 131 0 /* CE1 */ >,
+			   <0 132 0 /* CE2 */ >,
+			   <0 133 0 /* CE3 */ >,
+			   <0 134 0 /* CE4 */ >,
+			   <0 135 0 /* CE5 */ >,
+			   <0 136 0 /* CE6 */ >,
+			   <0 137 0 /* CE7 */ >,
+			   <0 138 0 /* CE8 */ >,
+			   <0 139 0 /* CE9 */ >,
+			   <0 140 0 /* CE10 */ >,
+			   <0 141 0 /* CE11 */ >;
+		vdd-0.8-cx-mx-supply = <&pm8998_l5>;
+};
-- 
cgit v1.2.3-59-g8ed1b


From 5072d87426bb6ed685d3ad9b694f5571859eca06 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Thu, 29 Mar 2018 17:53:04 +0100
Subject: ath6kl: fix spelling mistake: "chache" -> "cache"

Trivial fix to spelling mistake in message text

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath6kl/debug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath6kl/debug.c b/drivers/net/wireless/ath/ath6kl/debug.c
index 0f965e9f38a4..4e94b22eaada 100644
--- a/drivers/net/wireless/ath/ath6kl/debug.c
+++ b/drivers/net/wireless/ath/ath6kl/debug.c
@@ -645,7 +645,7 @@ static ssize_t read_file_tgt_stats(struct file *file, char __user *user_buf,
 	len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n",
 			 "CRC Err", tgt_stats->rx_crc_err);
 	len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n",
-			 "Key chache miss", tgt_stats->rx_key_cache_miss);
+			 "Key cache miss", tgt_stats->rx_key_cache_miss);
 	len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n",
 			 "Decrypt Err", tgt_stats->rx_decrypt_err);
 	len += scnprintf(buf + len, buf_len - len, "%20s %10llu\n",
-- 
cgit v1.2.3-59-g8ed1b


From 1e8f77502341035f079039c9aa9a647ebc9d881a Mon Sep 17 00:00:00 2001
From: Manikanta Pubbisetty <mpubbise@codeaurora.org>
Date: Thu, 12 Apr 2018 17:46:50 +0530
Subject: ath10k: correct target assert problem due to CE5 stuck

Correct a minor bug in the commit 0628467f97b5 ("ath10k: fix
copy engine 5 destination ring stuck") which introduced a change to fix
firmware assert that happens when ring indices of copy engine 5 are stuck
for a specific duration, problem with this fix is that it did not use
ring arithmatic. As a result,firmware asserts did not go away entirely
athough the frequency of occurrence has reduced. Using ring arithmatic
to fix the issue.

Tested on QCA9984(fw version-10.4-3.4-00082).

Fixes: 0628467f97b5 ("ath10k: fix copy engine 5 destination ring stuck)
Signed-off-by: Manikanta Pubbisetty <mpubbise@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/ce.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath10k/ce.c b/drivers/net/wireless/ath/ath10k/ce.c
index e7e7b342e5b8..3a15923bdd26 100644
--- a/drivers/net/wireless/ath/ath10k/ce.c
+++ b/drivers/net/wireless/ath/ath10k/ce.c
@@ -620,7 +620,7 @@ void ath10k_ce_rx_update_write_idx(struct ath10k_ce_pipe *pipe, u32 nentries)
 	/* Prevent CE ring stuck issue that will occur when ring is full.
 	 * Make sure that write index is 1 less than read index.
 	 */
-	if ((cur_write_idx + nentries)  == dest_ring->sw_index)
+	if (((cur_write_idx + nentries) & nentries_mask) == dest_ring->sw_index)
 		nentries -= 1;
 
 	write_index = CE_RING_IDX_ADD(nentries_mask, write_index, nentries);
-- 
cgit v1.2.3-59-g8ed1b


From b022ca257ca6eabf38fba0c700956bdd0babf3e1 Mon Sep 17 00:00:00 2001
From: Maharaja Kennadyrajan <mkenna@codeaurora.org>
Date: Fri, 13 Apr 2018 13:18:12 +0530
Subject: ath10k: fix a typo in ath10k_wmi_set_wmm_param()

Fix a typo in the function ath10k_wmi_set_wmm_param().

Signed-off-by: Maharaja Kennadyrajan <mkenna@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/wmi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c
index eb1c8a1a1c88..e12dd28fcf5d 100644
--- a/drivers/net/wireless/ath/ath10k/wmi.c
+++ b/drivers/net/wireless/ath/ath10k/wmi.c
@@ -7648,7 +7648,7 @@ ath10k_wmi_10_2_4_op_gen_pdev_get_tpc_config(struct ath10k *ar, u32 param)
 	cmd->param = __cpu_to_le32(param);
 
 	ath10k_dbg(ar, ATH10K_DBG_WMI,
-		   "wmi pdev get tcp config param:%d\n", param);
+		   "wmi pdev get tpc config param %d\n", param);
 	return skb;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From c8489668065a283d3027e86e877b103a87f99d22 Mon Sep 17 00:00:00 2001
From: Thomas Hebb <tommyhebb@gmail.com>
Date: Fri, 13 Apr 2018 17:40:26 +0300
Subject: ath10k: search all IEs for variant before falling back

commit f2593cb1b291 ("ath10k: Search SMBIOS for OEM board file
extension") added a feature to ath10k that allows Board Data File
(BDF) conflicts between multiple devices that use the same device IDs
but have different calibration requirements to be resolved by allowing
a "variant" string to be stored in SMBIOS [and later device tree, added
by commit d06f26c5c8a4 ("ath10k: search DT for qcom,ath10k-calibration-
variant")] that gets appended to the ID stored in board-2.bin.

This original patch had a regression, however. Namely that devices with
a variant present in SMBIOS that didn't need custom BDFs could no longer
find the default BDF, which has no variant appended. The patch was
reverted and re-applied with a fix for this issue in commit 1657b8f84ed9
("search SMBIOS for OEM board file extension").

But the fix to fall back to a default BDF introduced another issue: the
driver currently parses IEs in board-2.bin one by one, and for each one
it first checks to see if it matches the ID with the variant appended.
If it doesn't, it checks to see if it matches the "fallback" ID with no
variant. If a matching BDF is found at any point during this search, the
search is terminated and that BDF is used. The issue is that it's very
possible (and is currently the case for board-2.bin files present in the
ath10k-firmware repository) for the default BDF to occur in an earlier
IE than the variant-specific BDF. In this case, the current code will
happily choose the default BDF even though a better-matching BDF is
present later in the file.

This patch fixes the issue by first searching the entire file for the ID
with variant, and searching for the fallback ID only if that search
fails. It also includes some code cleanup in the area, as
ath10k_core_fetch_board_data_api_n() no longer does its own string
mangling to remove the variant from an ID, instead leaving that job to a
new flag passed to ath10k_core_create_board_name().

I've tested this patch on a QCA4019 and verified that the driver behaves
correctly for 1) both fallback and variant BDFs present, 2) only fallback
BDF present, and 3) no matching BDFs present.

Fixes: 1657b8f84ed9 ("ath10k: search SMBIOS for OEM board file extension")
Signed-off-by: Thomas Hebb <tommyhebb@gmail.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/core.c | 134 ++++++++++++++++++---------------
 1 file changed, 72 insertions(+), 62 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c
index a21530dacc35..64674d8ce457 100644
--- a/drivers/net/wireless/ath/ath10k/core.c
+++ b/drivers/net/wireless/ath/ath10k/core.c
@@ -1266,14 +1266,61 @@ out:
 	return ret;
 }
 
+static int ath10k_core_search_bd(struct ath10k *ar,
+				 const char *boardname,
+				 const u8 *data,
+				 size_t len)
+{
+	size_t ie_len;
+	struct ath10k_fw_ie *hdr;
+	int ret = -ENOENT, ie_id;
+
+	while (len > sizeof(struct ath10k_fw_ie)) {
+		hdr = (struct ath10k_fw_ie *)data;
+		ie_id = le32_to_cpu(hdr->id);
+		ie_len = le32_to_cpu(hdr->len);
+
+		len -= sizeof(*hdr);
+		data = hdr->data;
+
+		if (len < ALIGN(ie_len, 4)) {
+			ath10k_err(ar, "invalid length for board ie_id %d ie_len %zu len %zu\n",
+				   ie_id, ie_len, len);
+			return -EINVAL;
+		}
+
+		switch (ie_id) {
+		case ATH10K_BD_IE_BOARD:
+			ret = ath10k_core_parse_bd_ie_board(ar, data, ie_len,
+							    boardname);
+			if (ret == -ENOENT)
+				/* no match found, continue */
+				break;
+
+			/* either found or error, so stop searching */
+			goto out;
+		}
+
+		/* jump over the padding */
+		ie_len = ALIGN(ie_len, 4);
+
+		len -= ie_len;
+		data += ie_len;
+	}
+
+out:
+	/* return result of parse_bd_ie_board() or -ENOENT */
+	return ret;
+}
+
 static int ath10k_core_fetch_board_data_api_n(struct ath10k *ar,
 					      const char *boardname,
+					      const char *fallback_boardname,
 					      const char *filename)
 {
-	size_t len, magic_len, ie_len;
-	struct ath10k_fw_ie *hdr;
+	size_t len, magic_len;
 	const u8 *data;
-	int ret, ie_id;
+	int ret;
 
 	ar->normal_mode_fw.board = ath10k_fetch_fw_file(ar,
 							ar->hw_params.fw.dir,
@@ -1311,69 +1358,23 @@ static int ath10k_core_fetch_board_data_api_n(struct ath10k *ar,
 	data += magic_len;
 	len -= magic_len;
 
-	while (len > sizeof(struct ath10k_fw_ie)) {
-		hdr = (struct ath10k_fw_ie *)data;
-		ie_id = le32_to_cpu(hdr->id);
-		ie_len = le32_to_cpu(hdr->len);
-
-		len -= sizeof(*hdr);
-		data = hdr->data;
-
-		if (len < ALIGN(ie_len, 4)) {
-			ath10k_err(ar, "invalid length for board ie_id %d ie_len %zu len %zu\n",
-				   ie_id, ie_len, len);
-			ret = -EINVAL;
-			goto err;
-		}
+	/* attempt to find boardname in the IE list */
+	ret = ath10k_core_search_bd(ar, boardname, data, len);
 
-		switch (ie_id) {
-		case ATH10K_BD_IE_BOARD:
-			ret = ath10k_core_parse_bd_ie_board(ar, data, ie_len,
-							    boardname);
-			if (ret == -ENOENT && ar->id.bdf_ext[0] != '\0') {
-				/* try default bdf if variant was not found */
-				char *s, *v = ",variant=";
-				char boardname2[100];
-
-				strlcpy(boardname2, boardname,
-					sizeof(boardname2));
-
-				s = strstr(boardname2, v);
-				if (s)
-					*s = '\0';  /* strip ",variant=%s" */
+	/* if we didn't find it and have a fallback name, try that */
+	if (ret == -ENOENT && fallback_boardname)
+		ret = ath10k_core_search_bd(ar, fallback_boardname, data, len);
 
-				ret = ath10k_core_parse_bd_ie_board(ar, data,
-								    ie_len,
-								    boardname2);
-			}
-
-			if (ret == -ENOENT)
-				/* no match found, continue */
-				break;
-			else if (ret)
-				/* there was an error, bail out */
-				goto err;
-
-			/* board data found */
-			goto out;
-		}
-
-		/* jump over the padding */
-		ie_len = ALIGN(ie_len, 4);
-
-		len -= ie_len;
-		data += ie_len;
-	}
-
-out:
-	if (!ar->normal_mode_fw.board_data || !ar->normal_mode_fw.board_len) {
+	if (ret == -ENOENT) {
 		ath10k_err(ar,
 			   "failed to fetch board data for %s from %s/%s\n",
 			   boardname, ar->hw_params.fw.dir, filename);
 		ret = -ENODATA;
-		goto err;
 	}
 
+	if (ret)
+		goto err;
+
 	return 0;
 
 err:
@@ -1382,12 +1383,12 @@ err:
 }
 
 static int ath10k_core_create_board_name(struct ath10k *ar, char *name,
-					 size_t name_len)
+					 size_t name_len, bool with_variant)
 {
 	/* strlen(',variant=') + strlen(ar->id.bdf_ext) */
 	char variant[9 + ATH10K_SMBIOS_BDF_EXT_STR_LENGTH] = { 0 };
 
-	if (ar->id.bdf_ext[0] != '\0')
+	if (with_variant && ar->id.bdf_ext[0] != '\0')
 		scnprintf(variant, sizeof(variant), ",variant=%s",
 			  ar->id.bdf_ext);
 
@@ -1413,17 +1414,26 @@ out:
 
 static int ath10k_core_fetch_board_file(struct ath10k *ar)
 {
-	char boardname[100];
+	char boardname[100], fallback_boardname[100];
 	int ret;
 
-	ret = ath10k_core_create_board_name(ar, boardname, sizeof(boardname));
+	ret = ath10k_core_create_board_name(ar, boardname,
+					    sizeof(boardname), true);
 	if (ret) {
 		ath10k_err(ar, "failed to create board name: %d", ret);
 		return ret;
 	}
 
+	ret = ath10k_core_create_board_name(ar, fallback_boardname,
+					    sizeof(boardname), false);
+	if (ret) {
+		ath10k_err(ar, "failed to create fallback board name: %d", ret);
+		return ret;
+	}
+
 	ar->bd_api = 2;
 	ret = ath10k_core_fetch_board_data_api_n(ar, boardname,
+						 fallback_boardname,
 						 ATH10K_BOARD_API2_FILE);
 	if (!ret)
 		goto success;
-- 
cgit v1.2.3-59-g8ed1b


From 5df6e1313044bf0b5326fa54db8a3904d43ec1dc Mon Sep 17 00:00:00 2001
From: Erik Stromdahl <erik.stromdahl@gmail.com>
Date: Sun, 15 Apr 2018 14:22:27 +0200
Subject: ath10k: add inlined wrappers for htt tx ops

These wrappers makes the HTT ops align better with the HIF ops
(where similar wrappers are used).

It also makes it easier for a target to have unsupported ops
(by letting the corresponding function pointer be NULL).

Signed-off-by: Erik Stromdahl <erik.stromdahl@gmail.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/htt.c    |  4 +--
 drivers/net/wireless/ath/ath10k/htt.h    | 51 ++++++++++++++++++++++++++++++++
 drivers/net/wireless/ath/ath10k/htt_tx.c | 12 ++++----
 drivers/net/wireless/ath/ath10k/mac.c    |  2 +-
 4 files changed, 60 insertions(+), 9 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/htt.c b/drivers/net/wireless/ath/ath10k/htt.c
index 625198dea18b..21a67f82f037 100644
--- a/drivers/net/wireless/ath/ath10k/htt.c
+++ b/drivers/net/wireless/ath/ath10k/htt.c
@@ -257,11 +257,11 @@ int ath10k_htt_setup(struct ath10k_htt *htt)
 		return status;
 	}
 
-	status = htt->tx_ops->htt_send_frag_desc_bank_cfg(htt);
+	status = ath10k_htt_send_frag_desc_bank_cfg(htt);
 	if (status)
 		return status;
 
-	status = htt->tx_ops->htt_send_rx_ring_cfg(htt);
+	status = ath10k_htt_send_rx_ring_cfg(htt);
 	if (status) {
 		ath10k_warn(ar, "failed to setup rx ring: %d\n",
 			    status);
diff --git a/drivers/net/wireless/ath/ath10k/htt.h b/drivers/net/wireless/ath/ath10k/htt.h
index e5dbb0bdd787..68f3bc98fae7 100644
--- a/drivers/net/wireless/ath/ath10k/htt.h
+++ b/drivers/net/wireless/ath/ath10k/htt.h
@@ -1856,6 +1856,57 @@ struct ath10k_htt_tx_ops {
 	void (*htt_free_txbuff)(struct ath10k_htt *htt);
 };
 
+static inline int ath10k_htt_send_rx_ring_cfg(struct ath10k_htt *htt)
+{
+	if (!htt->tx_ops->htt_send_rx_ring_cfg)
+		return -EOPNOTSUPP;
+
+	return htt->tx_ops->htt_send_rx_ring_cfg(htt);
+}
+
+static inline int ath10k_htt_send_frag_desc_bank_cfg(struct ath10k_htt *htt)
+{
+	if (!htt->tx_ops->htt_send_frag_desc_bank_cfg)
+		return -EOPNOTSUPP;
+
+	return htt->tx_ops->htt_send_frag_desc_bank_cfg(htt);
+}
+
+static inline int ath10k_htt_alloc_frag_desc(struct ath10k_htt *htt)
+{
+	if (!htt->tx_ops->htt_alloc_frag_desc)
+		return -EOPNOTSUPP;
+
+	return htt->tx_ops->htt_alloc_frag_desc(htt);
+}
+
+static inline void ath10k_htt_free_frag_desc(struct ath10k_htt *htt)
+{
+	if (htt->tx_ops->htt_free_frag_desc)
+		htt->tx_ops->htt_free_frag_desc(htt);
+}
+
+static inline int ath10k_htt_tx(struct ath10k_htt *htt,
+				enum ath10k_hw_txrx_mode txmode,
+				struct sk_buff *msdu)
+{
+	return htt->tx_ops->htt_tx(htt, txmode, msdu);
+}
+
+static inline int ath10k_htt_alloc_txbuff(struct ath10k_htt *htt)
+{
+	if (!htt->tx_ops->htt_alloc_txbuff)
+		return -EOPNOTSUPP;
+
+	return htt->tx_ops->htt_alloc_txbuff(htt);
+}
+
+static inline void ath10k_htt_free_txbuff(struct ath10k_htt *htt)
+{
+	if (htt->tx_ops->htt_free_txbuff)
+		htt->tx_ops->htt_free_txbuff(htt);
+}
+
 struct ath10k_htt_rx_ops {
 	size_t (*htt_get_rx_ring_size)(struct ath10k_htt *htt);
 	void (*htt_config_paddrs_ring)(struct ath10k_htt *htt, void *vaddr);
diff --git a/drivers/net/wireless/ath/ath10k/htt_tx.c b/drivers/net/wireless/ath/ath10k/htt_tx.c
index d334b7be1fea..a086958c39b6 100644
--- a/drivers/net/wireless/ath/ath10k/htt_tx.c
+++ b/drivers/net/wireless/ath/ath10k/htt_tx.c
@@ -443,13 +443,13 @@ static int ath10k_htt_tx_alloc_buf(struct ath10k_htt *htt)
 	struct ath10k *ar = htt->ar;
 	int ret;
 
-	ret = htt->tx_ops->htt_alloc_txbuff(htt);
+	ret = ath10k_htt_alloc_txbuff(htt);
 	if (ret) {
 		ath10k_err(ar, "failed to alloc cont tx buffer: %d\n", ret);
 		return ret;
 	}
 
-	ret = htt->tx_ops->htt_alloc_frag_desc(htt);
+	ret = ath10k_htt_alloc_frag_desc(htt);
 	if (ret) {
 		ath10k_err(ar, "failed to alloc cont frag desc: %d\n", ret);
 		goto free_txbuf;
@@ -473,10 +473,10 @@ free_txq:
 	ath10k_htt_tx_free_txq(htt);
 
 free_frag_desc:
-	htt->tx_ops->htt_free_frag_desc(htt);
+	ath10k_htt_free_frag_desc(htt);
 
 free_txbuf:
-	htt->tx_ops->htt_free_txbuff(htt);
+	ath10k_htt_free_txbuff(htt);
 
 	return ret;
 }
@@ -530,9 +530,9 @@ void ath10k_htt_tx_destroy(struct ath10k_htt *htt)
 	if (!htt->tx_mem_allocated)
 		return;
 
-	htt->tx_ops->htt_free_txbuff(htt);
+	ath10k_htt_free_txbuff(htt);
 	ath10k_htt_tx_free_txq(htt);
-	htt->tx_ops->htt_free_frag_desc(htt);
+	ath10k_htt_free_frag_desc(htt);
 	ath10k_htt_tx_free_txdone_fifo(htt);
 	htt->tx_mem_allocated = false;
 }
diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index bf05a3689558..fc3320f94f45 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -3598,7 +3598,7 @@ static int ath10k_mac_tx_submit(struct ath10k *ar,
 
 	switch (txpath) {
 	case ATH10K_MAC_TX_HTT:
-		ret = htt->tx_ops->htt_tx(htt, txmode, skb);
+		ret = ath10k_htt_tx(htt, txmode, skb);
 		break;
 	case ATH10K_MAC_TX_HTT_MGMT:
 		ret = ath10k_htt_mgmt_tx(htt, skb);
-- 
cgit v1.2.3-59-g8ed1b


From 9a5511d5f24f331c1709f398b1cff5d538fbae1d Mon Sep 17 00:00:00 2001
From: Erik Stromdahl <erik.stromdahl@gmail.com>
Date: Sun, 15 Apr 2018 14:22:28 +0200
Subject: ath10k: add inlined wrappers for htt rx ops

Added for the same reason as the TX wrappers.

Signed-off-by: Erik Stromdahl <erik.stromdahl@gmail.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/htt.h    | 37 ++++++++++++++++++++++++++++++++
 drivers/net/wireless/ath/ath10k/htt_rx.c | 14 ++++++------
 2 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/htt.h b/drivers/net/wireless/ath/ath10k/htt.h
index 68f3bc98fae7..4e5fe539eb77 100644
--- a/drivers/net/wireless/ath/ath10k/htt.h
+++ b/drivers/net/wireless/ath/ath10k/htt.h
@@ -1916,6 +1916,43 @@ struct ath10k_htt_rx_ops {
 	void (*htt_reset_paddrs_ring)(struct ath10k_htt *htt, int idx);
 };
 
+static inline size_t ath10k_htt_get_rx_ring_size(struct ath10k_htt *htt)
+{
+	if (!htt->rx_ops->htt_get_rx_ring_size)
+		return 0;
+
+	return htt->rx_ops->htt_get_rx_ring_size(htt);
+}
+
+static inline void ath10k_htt_config_paddrs_ring(struct ath10k_htt *htt,
+						 void *vaddr)
+{
+	if (htt->rx_ops->htt_config_paddrs_ring)
+		htt->rx_ops->htt_config_paddrs_ring(htt, vaddr);
+}
+
+static inline void ath10k_htt_set_paddrs_ring(struct ath10k_htt *htt,
+					      dma_addr_t paddr,
+					      int idx)
+{
+	if (htt->rx_ops->htt_set_paddrs_ring)
+		htt->rx_ops->htt_set_paddrs_ring(htt, paddr, idx);
+}
+
+static inline void *ath10k_htt_get_vaddr_ring(struct ath10k_htt *htt)
+{
+	if (!htt->rx_ops->htt_get_vaddr_ring)
+		return NULL;
+
+	return htt->rx_ops->htt_get_vaddr_ring(htt);
+}
+
+static inline void ath10k_htt_reset_paddrs_ring(struct ath10k_htt *htt, int idx)
+{
+	if (htt->rx_ops->htt_reset_paddrs_ring)
+		htt->rx_ops->htt_reset_paddrs_ring(htt, idx);
+}
+
 #define RX_HTT_HDR_STATUS_LEN 64
 
 /* This structure layout is programmed via rx ring setup
diff --git a/drivers/net/wireless/ath/ath10k/htt_rx.c b/drivers/net/wireless/ath/ath10k/htt_rx.c
index 0315bdad7f85..bd23f6940488 100644
--- a/drivers/net/wireless/ath/ath10k/htt_rx.c
+++ b/drivers/net/wireless/ath/ath10k/htt_rx.c
@@ -182,7 +182,7 @@ static int __ath10k_htt_rx_ring_fill_n(struct ath10k_htt *htt, int num)
 		rxcb = ATH10K_SKB_RXCB(skb);
 		rxcb->paddr = paddr;
 		htt->rx_ring.netbufs_ring[idx] = skb;
-		htt->rx_ops->htt_set_paddrs_ring(htt, paddr, idx);
+		ath10k_htt_set_paddrs_ring(htt, paddr, idx);
 		htt->rx_ring.fill_cnt++;
 
 		if (htt->rx_ring.in_ord_rx) {
@@ -287,8 +287,8 @@ void ath10k_htt_rx_free(struct ath10k_htt *htt)
 	ath10k_htt_rx_ring_free(htt);
 
 	dma_free_coherent(htt->ar->dev,
-			  htt->rx_ops->htt_get_rx_ring_size(htt),
-			  htt->rx_ops->htt_get_vaddr_ring(htt),
+			  ath10k_htt_get_rx_ring_size(htt),
+			  ath10k_htt_get_vaddr_ring(htt),
 			  htt->rx_ring.base_paddr);
 
 	dma_free_coherent(htt->ar->dev,
@@ -315,7 +315,7 @@ static inline struct sk_buff *ath10k_htt_rx_netbuf_pop(struct ath10k_htt *htt)
 	idx = htt->rx_ring.sw_rd_idx.msdu_payld;
 	msdu = htt->rx_ring.netbufs_ring[idx];
 	htt->rx_ring.netbufs_ring[idx] = NULL;
-	htt->rx_ops->htt_reset_paddrs_ring(htt, idx);
+	ath10k_htt_reset_paddrs_ring(htt, idx);
 
 	idx++;
 	idx &= htt->rx_ring.size_mask;
@@ -587,13 +587,13 @@ int ath10k_htt_rx_alloc(struct ath10k_htt *htt)
 	if (!htt->rx_ring.netbufs_ring)
 		goto err_netbuf;
 
-	size = htt->rx_ops->htt_get_rx_ring_size(htt);
+	size = ath10k_htt_get_rx_ring_size(htt);
 
 	vaddr_ring = dma_alloc_coherent(htt->ar->dev, size, &paddr, GFP_KERNEL);
 	if (!vaddr_ring)
 		goto err_dma_ring;
 
-	htt->rx_ops->htt_config_paddrs_ring(htt, vaddr_ring);
+	ath10k_htt_config_paddrs_ring(htt, vaddr_ring);
 	htt->rx_ring.base_paddr = paddr;
 
 	vaddr = dma_alloc_coherent(htt->ar->dev,
@@ -627,7 +627,7 @@ int ath10k_htt_rx_alloc(struct ath10k_htt *htt)
 
 err_dma_idx:
 	dma_free_coherent(htt->ar->dev,
-			  htt->rx_ops->htt_get_rx_ring_size(htt),
+			  ath10k_htt_get_rx_ring_size(htt),
 			  vaddr_ring,
 			  htt->rx_ring.base_paddr);
 err_dma_ring:
-- 
cgit v1.2.3-59-g8ed1b


From 6da2b2d46ff937edc3ecea0a36d92252981d033a Mon Sep 17 00:00:00 2001
From: Govind Singh <govinds@codeaurora.org>
Date: Mon, 16 Apr 2018 09:56:45 +0530
Subject: ath10k: fix fw path name for WCN3990 target

FW path is mapped incorrectly for the WCN3990
hw version. Fix fw path with correct hw1.0 name.

Signed-off-by: Govind Singh <govinds@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/hw.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath10k/hw.h b/drivers/net/wireless/ath/ath10k/hw.h
index 3041eba61e54..b025a1bf2fde 100644
--- a/drivers/net/wireless/ath/ath10k/hw.h
+++ b/drivers/net/wireless/ath/ath10k/hw.h
@@ -131,7 +131,7 @@ enum qca9377_chip_id_rev {
 
 /* WCN3990 1.0 definitions */
 #define WCN3990_HW_1_0_DEV_VERSION	ATH10K_HW_WCN3990
-#define WCN3990_HW_1_0_FW_DIR		ATH10K_FW_DIR "/WCN3990/hw3.0"
+#define WCN3990_HW_1_0_FW_DIR		ATH10K_FW_DIR "/WCN3990/hw1.0"
 
 #define ATH10K_FW_FILE_BASE		"firmware"
 #define ATH10K_FW_API_MAX		6
-- 
cgit v1.2.3-59-g8ed1b


From 9ff6fde86fd27a0c733c01114eb0d46bd3a33fb7 Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Tue, 10 Apr 2018 19:35:58 +0200
Subject: wcn36xx: use READ_ONCE() to access desc->ctrl

When accessing shared memory to check for the stat of submitted
descriptors, make sure to use READ_ONCE(). This will guarantee the
compiler treats these memory locations as volatile and doesn't apply
any caching.

While this doesn't fix any particular problem I ran into, it's best
practice to do it this way.

Note that this patch also removes the superflous extra condition check
in the do-while loop in reap_tx_dxes(), as the loop will break
instantly anyway in that case.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wcn36xx/dxe.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wireless/ath/wcn36xx/dxe.c b/drivers/net/wireless/ath/wcn36xx/dxe.c
index a6702ada13d3..bd2b946a65c9 100644
--- a/drivers/net/wireless/ath/wcn36xx/dxe.c
+++ b/drivers/net/wireless/ath/wcn36xx/dxe.c
@@ -368,7 +368,7 @@ static void reap_tx_dxes(struct wcn36xx *wcn, struct wcn36xx_dxe_ch *ch)
 	spin_lock_irqsave(&ch->lock, flags);
 	ctl = ch->tail_blk_ctl;
 	do {
-		if (ctl->desc->ctrl & WCN36xx_DXE_CTRL_VLD)
+		if (READ_ONCE(ctl->desc->ctrl) & WCN36xx_DXE_CTRL_VLD)
 			break;
 		if (ctl->skb) {
 			dma_unmap_single(wcn->dev, ctl->desc->src_addr_l,
@@ -387,8 +387,7 @@ static void reap_tx_dxes(struct wcn36xx *wcn, struct wcn36xx_dxe_ch *ch)
 			ctl->skb = NULL;
 		}
 		ctl = ctl->next;
-	} while (ctl != ch->head_blk_ctl &&
-	       !(ctl->desc->ctrl & WCN36xx_DXE_CTRL_VLD));
+	} while (ctl != ch->head_blk_ctl);
 
 	ch->tail_blk_ctl = ctl;
 	spin_unlock_irqrestore(&ch->lock, flags);
@@ -530,7 +529,7 @@ static int wcn36xx_rx_handle_packets(struct wcn36xx *wcn,
 		int_mask = WCN36XX_DXE_INT_CH3_MASK;
 	}
 
-	while (!(dxe->ctrl & WCN36xx_DXE_CTRL_VLD)) {
+	while (!(READ_ONCE(dxe->ctrl) & WCN36xx_DXE_CTRL_VLD)) {
 		skb = ctl->skb;
 		dma_addr = dxe->dst_addr_l;
 		ret = wcn36xx_dxe_fill_skb(wcn->dev, ctl, GFP_ATOMIC);
-- 
cgit v1.2.3-59-g8ed1b


From bef5767f37cfe42b1cf1ae60f01cfb4e16a7a2b8 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Tue, 17 Apr 2018 23:18:46 -0700
Subject: tcp: better delivery accounting for SYN-ACK and SYN-data

the tcp_sock:delivered has inconsistent accounting for SYN and FIN.
1. it counts pure FIN
2. it counts pure SYN
3. it counts SYN-data twice
4. it does not count SYN-ACK

For congestion control perspective it does not matter much as C.C. only
cares about the difference not the aboslute value. But the next patch
would export this field to user-space so it's better to report the absolute
value w/o these caveats.

This patch counts SYN, SYN-ACK, or SYN-data delivery once always in
the "delivered" field.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f93687f97d80..2499248d4a67 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5567,9 +5567,12 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 		return true;
 	}
 	tp->syn_data_acked = tp->syn_data;
-	if (tp->syn_data_acked)
-		NET_INC_STATS(sock_net(sk),
-				LINUX_MIB_TCPFASTOPENACTIVE);
+	if (tp->syn_data_acked) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
+		/* SYN-data is counted as two separate packets in tcp_ack() */
+		if (tp->delivered > 1)
+			--tp->delivered;
+	}
 
 	tcp_fastopen_add_skb(sk, synack);
 
@@ -5901,6 +5904,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 	}
 	switch (sk->sk_state) {
 	case TCP_SYN_RECV:
+		tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */
 		if (!tp->srtt_us)
 			tcp_synack_rtt_meas(sk, req);
 
-- 
cgit v1.2.3-59-g8ed1b


From a77fa0104abd4348b9ae06ab0afe218ceb2455ea Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Tue, 17 Apr 2018 23:18:47 -0700
Subject: tcp: new helper to calculate newly delivered

Add new helper tcp_newly_delivered() to prepare the ECN accounting change.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2499248d4a67..01cce28f90ca 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3496,6 +3496,16 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit)
 	tcp_xmit_retransmit_queue(sk);
 }
 
+/* Returns the number of packets newly acked or sacked by the current ACK */
+static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 delivered;
+
+	delivered = tp->delivered - prior_delivered;
+	return delivered;
+}
+
 /* This routine deals with incoming acks, but not outgoing ones. */
 static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 {
@@ -3619,7 +3629,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
 		sk_dst_confirm(sk);
 
-	delivered = tp->delivered - delivered;	/* freshly ACKed or SACKed */
+	delivered = tcp_newly_delivered(sk, delivered, flag);
 	lost = tp->lost - lost;			/* freshly marked lost */
 	rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
 	tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
@@ -3629,9 +3639,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 no_queue:
 	/* If data was DSACKed, see if we can undo a cwnd reduction. */
-	if (flag & FLAG_DSACKING_ACK)
+	if (flag & FLAG_DSACKING_ACK) {
 		tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
 				      &rexmit);
+		tcp_newly_delivered(sk, delivered, flag);
+	}
 	/* If this ack opens up a zero window, clear backoff.  It was
 	 * being used to time the probes, and is probably far higher than
 	 * it needs to be for normal retransmission.
@@ -3655,6 +3667,7 @@ old_ack:
 						&sack_state);
 		tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
 				      &rexmit);
+		tcp_newly_delivered(sk, delivered, flag);
 		tcp_xmit_recovery(sk, rexmit);
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From e21db6f69a95b846ff04e31fe0a86004cbd000d7 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Tue, 17 Apr 2018 23:18:48 -0700
Subject: tcp: track total bytes delivered with ECN CE marks

Introduce a new delivered_ce stat in tcp socket to estimate
number of packets being marked with CE bits. The estimation is
done via ACKs with ECE bit. Depending on the actual receiver
behavior, the estimation could have biases.

Since the TCP sender can't really see the CE bit in the data path,
so the sender is technically counting packets marked delivered with
the "ECE / ECN-Echo" flag set.

With RFC3168 ECN, because the ECE bit is sticky, this count can
drastically overestimate the nummber of CE-marked data packets

With DCTCP-style ECN this should be reasonably precise unless there
is loss in the ACK path, in which case it's not precise.

With AccECN proposal this can be made still more precise, even in
the case some degree of ACK loss.

However this is sender's best estimate of CE information.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h  | 1 +
 net/ipv4/tcp.c       | 1 +
 net/ipv4/tcp_input.c | 2 ++
 3 files changed, 4 insertions(+)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 8f4c54986f97..20585d5c4e1c 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -281,6 +281,7 @@ struct tcp_sock {
 				 * receiver in Recovery. */
 	u32	prr_out;	/* Total number of pkts sent during Recovery. */
 	u32	delivered;	/* Total data packets delivered incl. rexmits */
+	u32	delivered_ce;	/* Like the above but only ECE marked packets */
 	u32	lost;		/* Total data packets lost incl. rexmits */
 	u32	app_limited;	/* limited until "delivered" reaches this val */
 	u64	first_tx_mstamp;  /* start of window send phase */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 438fbca96cd3..5a5ce6da4792 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2559,6 +2559,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
 	tp->snd_cwnd_cnt = 0;
 	tp->window_clamp = 0;
+	tp->delivered_ce = 0;
 	tcp_set_ca_state(sk, TCP_CA_Open);
 	tp->is_sack_reneg = 0;
 	tcp_clear_retrans(tp);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 01cce28f90ca..b3bff9c20606 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3503,6 +3503,8 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
 	u32 delivered;
 
 	delivered = tp->delivered - prior_delivered;
+	if (flag & FLAG_ECE)
+		tp->delivered_ce += delivered;
 	return delivered;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From feb5f2ec646483fb66f9ad7218b1aad2a93a2a5c Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Tue, 17 Apr 2018 23:18:49 -0700
Subject: tcp: export packets delivery info

Export data delivered and delivered with CE marks to
1) SNMP TCPDelivered and TCPDeliveredCE
2) getsockopt(TCP_INFO)
3) Timestamping API SOF_TIMESTAMPING_OPT_STATS

Note that for SCM_TSTAMP_ACK, the delivery info in
SOF_TIMESTAMPING_OPT_STATS is reported before the info
was fully updated on the ACK.

These stats help application monitor TCP delivery and ECN status
on per host, per connection, even per message level.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/snmp.h | 2 ++
 include/uapi/linux/tcp.h  | 5 +++++
 net/ipv4/proc.c           | 2 ++
 net/ipv4/tcp.c            | 7 ++++++-
 net/ipv4/tcp_input.c      | 6 +++++-
 5 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 33a70ece462f..d02e859301ff 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -276,6 +276,8 @@ enum
 	LINUX_MIB_TCPKEEPALIVE,			/* TCPKeepAlive */
 	LINUX_MIB_TCPMTUPFAIL,			/* TCPMTUPFail */
 	LINUX_MIB_TCPMTUPSUCCESS,		/* TCPMTUPSuccess */
+	LINUX_MIB_TCPDELIVERED,			/* TCPDelivered */
+	LINUX_MIB_TCPDELIVEREDCE,		/* TCPDeliveredCE */
 	__LINUX_MIB_MAX
 };
 
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 560374c978f9..379b08700a54 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -224,6 +224,9 @@ struct tcp_info {
 	__u64	tcpi_busy_time;      /* Time (usec) busy sending data */
 	__u64	tcpi_rwnd_limited;   /* Time (usec) limited by receive window */
 	__u64	tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */
+
+	__u32	tcpi_delivered;
+	__u32	tcpi_delivered_ce;
 };
 
 /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */
@@ -244,6 +247,8 @@ enum {
 	TCP_NLA_SNDQ_SIZE,	/* Data (bytes) pending in send queue */
 	TCP_NLA_CA_STATE,	/* ca_state of socket */
 	TCP_NLA_SND_SSTHRESH,	/* Slow start size threshold */
+	TCP_NLA_DELIVERED,	/* Data pkts delivered incl. out-of-order */
+	TCP_NLA_DELIVERED_CE,	/* Like above but only ones w/ CE marks */
 
 };
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index a058de677e94..261b71d0ccc5 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -296,6 +296,8 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE),
 	SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL),
 	SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS),
+	SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED),
+	SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5a5ce6da4792..4022073b0aee 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3167,6 +3167,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	rate64 = tcp_compute_delivery_rate(tp);
 	if (rate64)
 		info->tcpi_delivery_rate = rate64;
+	info->tcpi_delivered = tp->delivered;
+	info->tcpi_delivered_ce = tp->delivered_ce;
 	unlock_sock_fast(sk, slow);
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -3180,7 +3182,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 	u32 rate;
 
 	stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) +
-			  5 * nla_total_size(sizeof(u32)) +
+			  7 * nla_total_size(sizeof(u32)) +
 			  3 * nla_total_size(sizeof(u8)), GFP_ATOMIC);
 	if (!stats)
 		return NULL;
@@ -3211,9 +3213,12 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 	nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
 	nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
 	nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
+	nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
+	nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);
 
 	nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
 	nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
+
 	return stats;
 }
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b3bff9c20606..0396fb919b5d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3499,12 +3499,16 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit)
 /* Returns the number of packets newly acked or sacked by the current ACK */
 static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
 {
+	const struct net *net = sock_net(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 delivered;
 
 	delivered = tp->delivered - prior_delivered;
-	if (flag & FLAG_ECE)
+	NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
+	if (flag & FLAG_ECE) {
 		tp->delivered_ce += delivered;
+		NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);
+	}
 	return delivered;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From af81f9e75e546de43f4f0bf532d880ddc16a4dc8 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Mon, 26 Feb 2018 10:15:08 +0100
Subject: netfilter: nf_flow_table: use IP_CT_DIR_* values for
 FLOW_OFFLOAD_DIR_*

Simplifies further code cleanups

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index 833752dd0c58..09ba67598991 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -6,6 +6,7 @@
 #include <linux/netdevice.h>
 #include <linux/rhashtable.h>
 #include <linux/rcupdate.h>
+#include <linux/netfilter/nf_conntrack_tuple_common.h>
 #include <net/dst.h>
 
 struct nf_flowtable;
@@ -27,11 +28,10 @@ struct nf_flowtable {
 };
 
 enum flow_offload_tuple_dir {
-	FLOW_OFFLOAD_DIR_ORIGINAL,
-	FLOW_OFFLOAD_DIR_REPLY,
-	__FLOW_OFFLOAD_DIR_MAX		= FLOW_OFFLOAD_DIR_REPLY,
+	FLOW_OFFLOAD_DIR_ORIGINAL = IP_CT_DIR_ORIGINAL,
+	FLOW_OFFLOAD_DIR_REPLY = IP_CT_DIR_REPLY,
+	FLOW_OFFLOAD_DIR_MAX = IP_CT_DIR_MAX
 };
-#define FLOW_OFFLOAD_DIR_MAX	(__FLOW_OFFLOAD_DIR_MAX + 1)
 
 struct flow_offload_tuple {
 	union {
-- 
cgit v1.2.3-59-g8ed1b


From 047b300e6e72fc8d1a9259c2ed567e71a736683d Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Mon, 26 Feb 2018 10:15:09 +0100
Subject: netfilter: nf_flow_table: clean up flow_offload_alloc

Reduce code duplication and make it much easier to read

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_flow_table.c | 93 ++++++++++++++++---------------------------
 1 file changed, 34 insertions(+), 59 deletions(-)

diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table.c
index ec410cae9307..db0673a40b97 100644
--- a/net/netfilter/nf_flow_table.c
+++ b/net/netfilter/nf_flow_table.c
@@ -16,6 +16,38 @@ struct flow_offload_entry {
 	struct rcu_head		rcu_head;
 };
 
+static void
+flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
+		      struct nf_flow_route *route,
+		      enum flow_offload_tuple_dir dir)
+{
+	struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
+	struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple;
+
+	ft->dir = dir;
+
+	switch (ctt->src.l3num) {
+	case NFPROTO_IPV4:
+		ft->src_v4 = ctt->src.u3.in;
+		ft->dst_v4 = ctt->dst.u3.in;
+		break;
+	case NFPROTO_IPV6:
+		ft->src_v6 = ctt->src.u3.in6;
+		ft->dst_v6 = ctt->dst.u3.in6;
+		break;
+	}
+
+	ft->l3proto = ctt->src.l3num;
+	ft->l4proto = ctt->dst.protonum;
+	ft->src_port = ctt->src.u.tcp.port;
+	ft->dst_port = ctt->dst.u.tcp.port;
+
+	ft->iifidx = route->tuple[dir].ifindex;
+	ft->oifidx = route->tuple[!dir].ifindex;
+
+	ft->dst_cache = route->tuple[dir].dst;
+}
+
 struct flow_offload *
 flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
 {
@@ -40,65 +72,8 @@ flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
 
 	entry->ct = ct;
 
-	switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num) {
-	case NFPROTO_IPV4:
-		flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4 =
-			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in;
-		flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4 =
-			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in;
-		flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4 =
-			ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in;
-		flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4 =
-			ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in;
-		break;
-	case NFPROTO_IPV6:
-		flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6 =
-			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in6;
-		flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6 =
-			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6;
-		flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6 =
-			ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in6;
-		flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6 =
-			ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in6;
-		break;
-	}
-
-	flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l3proto =
-		ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
-	flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto =
-		ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
-	flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l3proto =
-		ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
-	flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l4proto =
-		ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
-
-	flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache =
-		  route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst;
-	flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache =
-		  route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst;
-
-	flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port =
-		ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port;
-	flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port =
-		ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port;
-	flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port =
-		ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u.tcp.port;
-	flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port =
-		ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port;
-
-	flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dir =
-						FLOW_OFFLOAD_DIR_ORIGINAL;
-	flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dir =
-						FLOW_OFFLOAD_DIR_REPLY;
-
-	flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.iifidx =
-		route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].ifindex;
-	flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.oifidx =
-		route->tuple[FLOW_OFFLOAD_DIR_REPLY].ifindex;
-	flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.iifidx =
-		route->tuple[FLOW_OFFLOAD_DIR_REPLY].ifindex;
-	flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.oifidx =
-		route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].ifindex;
+	flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_ORIGINAL);
+	flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_REPLY);
 
 	if (ct->status & IPS_SRC_NAT)
 		flow->flags |= FLOW_OFFLOAD_SNAT;
-- 
cgit v1.2.3-59-g8ed1b


From 292eba02dbb4c41da840e462e51ee97d80d873d1 Mon Sep 17 00:00:00 2001
From: Zhao Chen <zhaochen6@huawei.com>
Date: Wed, 18 Apr 2018 06:07:39 -0400
Subject: net-next/hinic: add arm64 support

This patch enables arm64 platform support for the HINIC driver.

Signed-off-by: Zhao Chen <zhaochen6@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/huawei/hinic/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/huawei/hinic/Kconfig b/drivers/net/ethernet/huawei/hinic/Kconfig
index 08db24954f7e..e4e8b24c1a5d 100644
--- a/drivers/net/ethernet/huawei/hinic/Kconfig
+++ b/drivers/net/ethernet/huawei/hinic/Kconfig
@@ -4,7 +4,7 @@
 
 config HINIC
 	tristate "Huawei Intelligent PCIE Network Interface Card"
-	depends on (PCI_MSI && X86)
+	depends on (PCI_MSI && (X86 || ARM64))
 	---help---
 	  This driver supports HiNIC PCIE Ethernet cards.
 	  To compile this driver as part of the kernel, choose Y here.
-- 
cgit v1.2.3-59-g8ed1b


From 88078d98d1bb085d72af8437707279e203524fa5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 18 Apr 2018 11:43:15 -0700
Subject: net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends

After working on IP defragmentation lately, I found that some large
packets defeat CHECKSUM_COMPLETE optimization because of NIC adding
zero paddings on the last (small) fragment.

While removing the padding with pskb_trim_rcsum(), we set skb->ip_summed
to CHECKSUM_NONE, forcing a full csum validation, even if all prior
fragments had CHECKSUM_COMPLETE set.

We can instead compute the checksum of the part we are trimming,
usually smaller than the part we keep.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  5 ++---
 net/core/skbuff.c      | 14 ++++++++++++++
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9065477ed255..d274059529eb 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3131,6 +3131,7 @@ static inline void *skb_push_rcsum(struct sk_buff *skb, unsigned int len)
 	return skb->data;
 }
 
+int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len);
 /**
  *	pskb_trim_rcsum - trim received skb and update checksum
  *	@skb: buffer to trim
@@ -3144,9 +3145,7 @@ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len)
 {
 	if (likely(len >= skb->len))
 		return 0;
-	if (skb->ip_summed == CHECKSUM_COMPLETE)
-		skb->ip_summed = CHECKSUM_NONE;
-	return __pskb_trim(skb, len);
+	return pskb_trim_rcsum_slow(skb, len);
 }
 
 static inline int __skb_trim_rcsum(struct sk_buff *skb, unsigned int len)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 345b51837ca8..ff49e352deea 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1839,6 +1839,20 @@ done:
 }
 EXPORT_SYMBOL(___pskb_trim);
 
+/* Note : use pskb_trim_rcsum() instead of calling this directly
+ */
+int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
+{
+	if (skb->ip_summed == CHECKSUM_COMPLETE) {
+		int delta = skb->len - len;
+
+		skb->csum = csum_sub(skb->csum,
+				     skb_checksum(skb, len, delta, 0));
+	}
+	return __pskb_trim(skb, len);
+}
+EXPORT_SYMBOL(pskb_trim_rcsum_slow);
+
 /**
  *	__pskb_pull_tail - advance tail of skb header
  *	@skb: buffer to reallocate
-- 
cgit v1.2.3-59-g8ed1b


From 93c2fb253d177a0b8f4f93592441f88c9b7d6245 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 18 Apr 2018 15:38:59 -0700
Subject: net/ipv6: Rename fib6_info struct elements

Change the prefix for fib6_info struct elements from rt6i_ to fib6_.
rt6i_pcpu and rt6i_exception_bucket are left as is given that they
point to rt6_info entries.

Rename only; not functional change intended.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_router.c  |  56 ++---
 include/net/ip6_fib.h                              |  38 +--
 include/net/ip6_route.h                            |  26 +-
 net/ipv6/addrconf.c                                |  24 +-
 net/ipv6/anycast.c                                 |   8 +-
 net/ipv6/ip6_fib.c                                 | 170 ++++++-------
 net/ipv6/ndisc.c                                   |   2 +-
 net/ipv6/route.c                                   | 272 ++++++++++-----------
 8 files changed, 298 insertions(+), 298 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index b85b15851841..8e4edb634b11 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -4705,17 +4705,17 @@ static bool mlxsw_sp_fib6_rt_should_ignore(const struct fib6_info *rt)
 	 * are trapped to the CPU, so no need to program specific routes
 	 * for them.
 	 */
-	if (ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LINKLOCAL)
+	if (ipv6_addr_type(&rt->fib6_dst.addr) & IPV6_ADDR_LINKLOCAL)
 		return true;
 
 	/* Multicast routes aren't supported, so ignore them. Neighbour
 	 * Discovery packets are specifically trapped.
 	 */
-	if (ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_MULTICAST)
+	if (ipv6_addr_type(&rt->fib6_dst.addr) & IPV6_ADDR_MULTICAST)
 		return true;
 
 	/* Cloned routes are irrelevant in the forwarding path. */
-	if (rt->rt6i_flags & RTF_CACHE)
+	if (rt->fib6_flags & RTF_CACHE)
 		return true;
 
 	return false;
@@ -4759,7 +4759,7 @@ static void mlxsw_sp_rt6_destroy(struct mlxsw_sp_rt6 *mlxsw_sp_rt6)
 static bool mlxsw_sp_fib6_rt_can_mp(const struct fib6_info *rt)
 {
 	/* RTF_CACHE routes are ignored */
-	return (rt->rt6i_flags & (RTF_GATEWAY | RTF_ADDRCONF)) == RTF_GATEWAY;
+	return (rt->fib6_flags & (RTF_GATEWAY | RTF_ADDRCONF)) == RTF_GATEWAY;
 }
 
 static struct fib6_info *
@@ -4784,16 +4784,16 @@ mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node,
 		/* RT6_TABLE_LOCAL and RT6_TABLE_MAIN share the same
 		 * virtual router.
 		 */
-		if (rt->rt6i_table->tb6_id > nrt->rt6i_table->tb6_id)
+		if (rt->fib6_table->tb6_id > nrt->fib6_table->tb6_id)
 			continue;
-		if (rt->rt6i_table->tb6_id != nrt->rt6i_table->tb6_id)
+		if (rt->fib6_table->tb6_id != nrt->fib6_table->tb6_id)
 			break;
-		if (rt->rt6i_metric < nrt->rt6i_metric)
+		if (rt->fib6_metric < nrt->fib6_metric)
 			continue;
-		if (rt->rt6i_metric == nrt->rt6i_metric &&
+		if (rt->fib6_metric == nrt->fib6_metric &&
 		    mlxsw_sp_fib6_rt_can_mp(rt))
 			return fib6_entry;
-		if (rt->rt6i_metric > nrt->rt6i_metric)
+		if (rt->fib6_metric > nrt->fib6_metric)
 			break;
 	}
 
@@ -4899,7 +4899,7 @@ static void mlxsw_sp_nexthop6_fini(struct mlxsw_sp *mlxsw_sp,
 static bool mlxsw_sp_rt6_is_gateway(const struct mlxsw_sp *mlxsw_sp,
 				    const struct fib6_info *rt)
 {
-	return rt->rt6i_flags & RTF_GATEWAY ||
+	return rt->fib6_flags & RTF_GATEWAY ||
 	       mlxsw_sp_nexthop6_ipip_type(mlxsw_sp, rt, NULL);
 }
 
@@ -5092,9 +5092,9 @@ static void mlxsw_sp_fib6_entry_type_set(struct mlxsw_sp *mlxsw_sp,
 	 * local, which will cause them to be trapped with a lower
 	 * priority than packets that need to be locally received.
 	 */
-	if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
+	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
 		fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_TRAP;
-	else if (rt->rt6i_flags & RTF_REJECT)
+	else if (rt->fib6_flags & RTF_REJECT)
 		fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_LOCAL;
 	else if (mlxsw_sp_rt6_is_gateway(mlxsw_sp, rt))
 		fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_REMOTE;
@@ -5175,18 +5175,18 @@ mlxsw_sp_fib6_node_entry_find(const struct mlxsw_sp_fib_node *fib_node,
 	list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) {
 		struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry);
 
-		if (rt->rt6i_table->tb6_id > nrt->rt6i_table->tb6_id)
+		if (rt->fib6_table->tb6_id > nrt->fib6_table->tb6_id)
 			continue;
-		if (rt->rt6i_table->tb6_id != nrt->rt6i_table->tb6_id)
+		if (rt->fib6_table->tb6_id != nrt->fib6_table->tb6_id)
 			break;
-		if (replace && rt->rt6i_metric == nrt->rt6i_metric) {
+		if (replace && rt->fib6_metric == nrt->fib6_metric) {
 			if (mlxsw_sp_fib6_rt_can_mp(rt) ==
 			    mlxsw_sp_fib6_rt_can_mp(nrt))
 				return fib6_entry;
 			if (mlxsw_sp_fib6_rt_can_mp(nrt))
 				fallback = fallback ?: fib6_entry;
 		}
-		if (rt->rt6i_metric > nrt->rt6i_metric)
+		if (rt->fib6_metric > nrt->fib6_metric)
 			return fallback ?: fib6_entry;
 	}
 
@@ -5215,7 +5215,7 @@ mlxsw_sp_fib6_node_list_insert(struct mlxsw_sp_fib6_entry *new6_entry,
 		list_for_each_entry(last, &fib_node->entry_list, common.list) {
 			struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(last);
 
-			if (nrt->rt6i_table->tb6_id > rt->rt6i_table->tb6_id)
+			if (nrt->fib6_table->tb6_id > rt->fib6_table->tb6_id)
 				break;
 			fib6_entry = last;
 		}
@@ -5275,22 +5275,22 @@ mlxsw_sp_fib6_entry_lookup(struct mlxsw_sp *mlxsw_sp,
 	struct mlxsw_sp_fib *fib;
 	struct mlxsw_sp_vr *vr;
 
-	vr = mlxsw_sp_vr_find(mlxsw_sp, rt->rt6i_table->tb6_id);
+	vr = mlxsw_sp_vr_find(mlxsw_sp, rt->fib6_table->tb6_id);
 	if (!vr)
 		return NULL;
 	fib = mlxsw_sp_vr_fib(vr, MLXSW_SP_L3_PROTO_IPV6);
 
-	fib_node = mlxsw_sp_fib_node_lookup(fib, &rt->rt6i_dst.addr,
-					    sizeof(rt->rt6i_dst.addr),
-					    rt->rt6i_dst.plen);
+	fib_node = mlxsw_sp_fib_node_lookup(fib, &rt->fib6_dst.addr,
+					    sizeof(rt->fib6_dst.addr),
+					    rt->fib6_dst.plen);
 	if (!fib_node)
 		return NULL;
 
 	list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) {
 		struct fib6_info *iter_rt = mlxsw_sp_fib6_entry_rt(fib6_entry);
 
-		if (rt->rt6i_table->tb6_id == iter_rt->rt6i_table->tb6_id &&
-		    rt->rt6i_metric == iter_rt->rt6i_metric &&
+		if (rt->fib6_table->tb6_id == iter_rt->fib6_table->tb6_id &&
+		    rt->fib6_metric == iter_rt->fib6_metric &&
 		    mlxsw_sp_fib6_entry_rt_find(fib6_entry, rt))
 			return fib6_entry;
 	}
@@ -5325,16 +5325,16 @@ static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp,
 	if (mlxsw_sp->router->aborted)
 		return 0;
 
-	if (rt->rt6i_src.plen)
+	if (rt->fib6_src.plen)
 		return -EINVAL;
 
 	if (mlxsw_sp_fib6_rt_should_ignore(rt))
 		return 0;
 
-	fib_node = mlxsw_sp_fib_node_get(mlxsw_sp, rt->rt6i_table->tb6_id,
-					 &rt->rt6i_dst.addr,
-					 sizeof(rt->rt6i_dst.addr),
-					 rt->rt6i_dst.plen,
+	fib_node = mlxsw_sp_fib_node_get(mlxsw_sp, rt->fib6_table->tb6_id,
+					 &rt->fib6_dst.addr,
+					 sizeof(rt->fib6_dst.addr),
+					 rt->fib6_dst.plen,
 					 MLXSW_SP_L3_PROTO_IPV6);
 	if (IS_ERR(fib_node))
 		return PTR_ERR(fib_node);
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index a36116b92100..994dd67207fb 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -134,34 +134,34 @@ struct fib6_nh {
 };
 
 struct fib6_info {
-	struct fib6_table		*rt6i_table;
+	struct fib6_table		*fib6_table;
 	struct fib6_info __rcu		*rt6_next;
-	struct fib6_node __rcu		*rt6i_node;
+	struct fib6_node __rcu		*fib6_node;
 
 	/* Multipath routes:
 	 * siblings is a list of fib6_info that have the the same metric/weight,
 	 * destination, but not the same gateway. nsiblings is just a cache
 	 * to speed up lookup.
 	 */
-	struct list_head		rt6i_siblings;
-	unsigned int			rt6i_nsiblings;
+	struct list_head		fib6_siblings;
+	unsigned int			fib6_nsiblings;
 
-	atomic_t			rt6i_ref;
-	struct inet6_dev		*rt6i_idev;
+	atomic_t			fib6_ref;
+	struct inet6_dev		*fib6_idev;
 	unsigned long			expires;
 	struct dst_metrics		*fib6_metrics;
 #define fib6_pmtu		fib6_metrics->metrics[RTAX_MTU-1]
 
-	struct rt6key			rt6i_dst;
-	u32				rt6i_flags;
-	struct rt6key			rt6i_src;
-	struct rt6key			rt6i_prefsrc;
+	struct rt6key			fib6_dst;
+	u32				fib6_flags;
+	struct rt6key			fib6_src;
+	struct rt6key			fib6_prefsrc;
 
 	struct rt6_info * __percpu	*rt6i_pcpu;
 	struct rt6_exception_bucket __rcu *rt6i_exception_bucket;
 
-	u32				rt6i_metric;
-	u8				rt6i_protocol;
+	u32				fib6_metric;
+	u8				fib6_protocol;
 	u8				fib6_type;
 	u8				exception_bucket_flushed:1,
 					should_flush:1,
@@ -206,7 +206,7 @@ static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst)
 
 static inline void fib6_clean_expires(struct fib6_info *f6i)
 {
-	f6i->rt6i_flags &= ~RTF_EXPIRES;
+	f6i->fib6_flags &= ~RTF_EXPIRES;
 	f6i->expires = 0;
 }
 
@@ -214,12 +214,12 @@ static inline void fib6_set_expires(struct fib6_info *f6i,
 				    unsigned long expires)
 {
 	f6i->expires = expires;
-	f6i->rt6i_flags |= RTF_EXPIRES;
+	f6i->fib6_flags |= RTF_EXPIRES;
 }
 
 static inline bool fib6_check_expired(const struct fib6_info *f6i)
 {
-	if (f6i->rt6i_flags & RTF_EXPIRES)
+	if (f6i->fib6_flags & RTF_EXPIRES)
 		return time_after(jiffies, f6i->expires);
 	return false;
 }
@@ -250,14 +250,14 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout)
  * Return true if we can get cookie safely
  * Return false if not
  */
-static inline bool rt6_get_cookie_safe(const struct fib6_info *rt,
+static inline bool rt6_get_cookie_safe(const struct fib6_info *f6i,
 				       u32 *cookie)
 {
 	struct fib6_node *fn;
 	bool status = false;
 
 	rcu_read_lock();
-	fn = rcu_dereference(rt->rt6i_node);
+	fn = rcu_dereference(f6i->fib6_node);
 
 	if (fn) {
 		*cookie = fn->fn_sernum;
@@ -295,12 +295,12 @@ void fib6_info_destroy(struct fib6_info *f6i);
 
 static inline void fib6_info_hold(struct fib6_info *f6i)
 {
-	atomic_inc(&f6i->rt6i_ref);
+	atomic_inc(&f6i->fib6_ref);
 }
 
 static inline void fib6_info_release(struct fib6_info *f6i)
 {
-	if (f6i && atomic_dec_and_test(&f6i->rt6i_ref))
+	if (f6i && atomic_dec_and_test(&f6i->fib6_ref))
 		fib6_info_destroy(f6i);
 }
 
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index d5fb1e4ae7ac..b4b85109984f 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -66,9 +66,9 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
 }
 
-static inline bool rt6_qualify_for_ecmp(const struct fib6_info *rt)
+static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
 {
-	return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
+	return (f6i->fib6_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
 	       RTF_GATEWAY;
 }
 
@@ -102,23 +102,23 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg);
 
 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
 		  struct netlink_ext_ack *extack);
-int ip6_ins_rt(struct net *net, struct fib6_info *rt);
-int ip6_del_rt(struct net *net, struct fib6_info *rt);
+int ip6_ins_rt(struct net *net, struct fib6_info *f6i);
+int ip6_del_rt(struct net *net, struct fib6_info *f6i);
 
-void rt6_flush_exceptions(struct fib6_info *rt);
-void rt6_age_exceptions(struct fib6_info *rt, struct fib6_gc_args *gc_args,
+void rt6_flush_exceptions(struct fib6_info *f6i);
+void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args,
 			unsigned long now);
 
-static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *rt,
+static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *f6i,
 				      const struct in6_addr *daddr,
 				      unsigned int prefs,
 				      struct in6_addr *saddr)
 {
-	struct inet6_dev *idev = rt ? rt->rt6i_idev : NULL;
+	struct inet6_dev *idev = f6i ? f6i->fib6_idev : NULL;
 	int err = 0;
 
-	if (rt && rt->rt6i_prefsrc.plen)
-		*saddr = rt->rt6i_prefsrc.addr;
+	if (f6i && f6i->fib6_prefsrc.plen)
+		*saddr = f6i->fib6_prefsrc.addr;
 	else
 		err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
 					 daddr, prefs, saddr);
@@ -176,14 +176,14 @@ struct rt6_rtnl_dump_arg {
 	struct net *net;
 };
 
-int rt6_dump_route(struct fib6_info *rt, void *p_arg);
+int rt6_dump_route(struct fib6_info *f6i, void *p_arg);
 void rt6_mtu_change(struct net_device *dev, unsigned int mtu);
 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp);
 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway);
 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags);
 void rt6_disable_ip(struct net_device *dev, unsigned long event);
 void rt6_sync_down_dev(struct net_device *dev, unsigned long event);
-void rt6_multipath_rebalance(struct fib6_info *rt);
+void rt6_multipath_rebalance(struct fib6_info *f6i);
 
 void rt6_uncached_list_add(struct rt6_info *rt);
 void rt6_uncached_list_del(struct rt6_info *rt);
@@ -274,7 +274,7 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt,
 static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b)
 {
 	return a->fib6_nh.nh_dev == b->fib6_nh.nh_dev &&
-	       a->rt6i_idev == b->rt6i_idev &&
+	       a->fib6_idev == b->fib6_idev &&
 	       ipv6_addr_equal(&a->fib6_nh.nh_gw, &b->fib6_nh.nh_gw) &&
 	       !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate);
 }
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a294e86a9b25..f38ea829c28b 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1178,19 +1178,19 @@ check_cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long *expires)
 static void
 cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_rt)
 {
-	struct fib6_info *rt;
+	struct fib6_info *f6i;
 
-	rt = addrconf_get_prefix_route(&ifp->addr,
+	f6i = addrconf_get_prefix_route(&ifp->addr,
 				       ifp->prefix_len,
 				       ifp->idev->dev,
 				       0, RTF_GATEWAY | RTF_DEFAULT);
-	if (rt) {
+	if (f6i) {
 		if (del_rt)
-			ip6_del_rt(dev_net(ifp->idev->dev), rt);
+			ip6_del_rt(dev_net(ifp->idev->dev), f6i);
 		else {
-			if (!(rt->rt6i_flags & RTF_EXPIRES))
-				fib6_set_expires(rt, expires);
-			fib6_info_release(rt);
+			if (!(f6i->fib6_flags & RTF_EXPIRES))
+				fib6_set_expires(f6i, expires);
+			fib6_info_release(f6i);
 		}
 	}
 }
@@ -2370,9 +2370,9 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
 	for_each_fib6_node_rt_rcu(fn) {
 		if (rt->fib6_nh.nh_dev->ifindex != dev->ifindex)
 			continue;
-		if ((rt->rt6i_flags & flags) != flags)
+		if ((rt->fib6_flags & flags) != flags)
 			continue;
-		if ((rt->rt6i_flags & noflags) != 0)
+		if ((rt->fib6_flags & noflags) != 0)
 			continue;
 		fib6_info_hold(rt);
 		break;
@@ -3341,11 +3341,11 @@ static int fixup_permanent_addr(struct net *net,
 				struct inet6_dev *idev,
 				struct inet6_ifaddr *ifp)
 {
-	/* !rt6i_node means the host route was removed from the
+	/* !fib6_node means the host route was removed from the
 	 * FIB, for example, if 'lo' device is taken down. In that
 	 * case regenerate the host route.
 	 */
-	if (!ifp->rt || !ifp->rt->rt6i_node) {
+	if (!ifp->rt || !ifp->rt->fib6_node) {
 		struct fib6_info *rt, *prev;
 
 		rt = addrconf_dst_alloc(net, idev, &ifp->addr, false,
@@ -5612,7 +5612,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 		 * our DAD process, so we don't need
 		 * to do it again
 		 */
-		if (!rcu_access_pointer(ifp->rt->rt6i_node))
+		if (!rcu_access_pointer(ifp->rt->fib6_node))
 			ip6_ins_rt(net, ifp->rt);
 		if (ifp->idev->cnf.forwarding)
 			addrconf_join_anycast(ifp);
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 0e35657501a7..eed3bf63bd05 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -218,10 +218,10 @@ static void aca_put(struct ifacaddr6 *ac)
 	}
 }
 
-static struct ifacaddr6 *aca_alloc(struct fib6_info *rt,
+static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i,
 				   const struct in6_addr *addr)
 {
-	struct inet6_dev *idev = rt->rt6i_idev;
+	struct inet6_dev *idev = f6i->fib6_idev;
 	struct ifacaddr6 *aca;
 
 	aca = kzalloc(sizeof(*aca), GFP_ATOMIC);
@@ -231,8 +231,8 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *rt,
 	aca->aca_addr = *addr;
 	in6_dev_hold(idev);
 	aca->aca_idev = idev;
-	fib6_info_hold(rt);
-	aca->aca_rt = rt;
+	fib6_info_hold(f6i);
+	aca->aca_rt = f6i;
 	aca->aca_users = 1;
 	/* aca_tstamp should be updated upon changes */
 	aca->aca_cstamp = aca->aca_tstamp = jiffies;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 2ab49b7cac22..353f0b3e7b0d 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -105,12 +105,12 @@ enum {
 	FIB6_NO_SERNUM_CHANGE = 0,
 };
 
-void fib6_update_sernum(struct net *net, struct fib6_info *rt)
+void fib6_update_sernum(struct net *net, struct fib6_info *f6i)
 {
 	struct fib6_node *fn;
 
-	fn = rcu_dereference_protected(rt->rt6i_node,
-			lockdep_is_held(&rt->rt6i_table->tb6_lock));
+	fn = rcu_dereference_protected(f6i->fib6_node,
+			lockdep_is_held(&f6i->fib6_table->tb6_lock));
 	if (fn)
 		fn->fn_sernum = fib6_new_sernum(net);
 }
@@ -159,10 +159,10 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
 		return NULL;
 	}
 
-	INIT_LIST_HEAD(&f6i->rt6i_siblings);
+	INIT_LIST_HEAD(&f6i->fib6_siblings);
 	f6i->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
 
-	atomic_inc(&f6i->rt6i_ref);
+	atomic_inc(&f6i->fib6_ref);
 
 	return f6i;
 }
@@ -172,7 +172,7 @@ void fib6_info_destroy(struct fib6_info *f6i)
 	struct rt6_exception_bucket *bucket;
 	struct dst_metrics *m;
 
-	WARN_ON(f6i->rt6i_node);
+	WARN_ON(f6i->fib6_node);
 
 	bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1);
 	if (bucket) {
@@ -197,8 +197,8 @@ void fib6_info_destroy(struct fib6_info *f6i)
 		}
 	}
 
-	if (f6i->rt6i_idev)
-		in6_dev_put(f6i->rt6i_idev);
+	if (f6i->fib6_idev)
+		in6_dev_put(f6i->fib6_idev);
 	if (f6i->fib6_nh.nh_dev)
 		dev_put(f6i->fib6_nh.nh_dev);
 
@@ -401,7 +401,7 @@ static int call_fib6_entry_notifiers(struct net *net,
 		.rt = rt,
 	};
 
-	rt->rt6i_table->fib_seq++;
+	rt->fib6_table->fib_seq++;
 	return call_fib6_notifiers(net, event_type, &info.info);
 }
 
@@ -483,10 +483,10 @@ static int fib6_dump_node(struct fib6_walker *w)
 		 * last sibling of this route (no need to dump the
 		 * sibling routes again)
 		 */
-		if (rt->rt6i_nsiblings)
-			rt = list_last_entry(&rt->rt6i_siblings,
+		if (rt->fib6_nsiblings)
+			rt = list_last_entry(&rt->fib6_siblings,
 					     struct fib6_info,
-					     rt6i_siblings);
+					     fib6_siblings);
 	}
 	w->leaf = NULL;
 	return 0;
@@ -810,7 +810,7 @@ insert_above:
 		RCU_INIT_POINTER(in->parent, pn);
 		in->leaf = fn->leaf;
 		atomic_inc(&rcu_dereference_protected(in->leaf,
-				lockdep_is_held(&table->tb6_lock))->rt6i_ref);
+				lockdep_is_held(&table->tb6_lock))->fib6_ref);
 
 		/* update parent pointer */
 		if (dir)
@@ -865,9 +865,9 @@ insert_above:
 static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
 			  struct net *net)
 {
-	struct fib6_table *table = rt->rt6i_table;
+	struct fib6_table *table = rt->fib6_table;
 
-	if (atomic_read(&rt->rt6i_ref) != 1) {
+	if (atomic_read(&rt->fib6_ref) != 1) {
 		/* This route is used as dummy address holder in some split
 		 * nodes. It is not leaked, but it still holds other resources,
 		 * which must be released in time. So, scan ascendant nodes
@@ -880,7 +880,7 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
 			struct fib6_info *new_leaf;
 			if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) {
 				new_leaf = fib6_find_prefix(net, table, fn);
-				atomic_inc(&new_leaf->rt6i_ref);
+				atomic_inc(&new_leaf->fib6_ref);
 
 				rcu_assign_pointer(fn->leaf, new_leaf);
 				fib6_info_release(rt);
@@ -919,7 +919,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
 			    struct netlink_ext_ack *extack)
 {
 	struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
-				    lockdep_is_held(&rt->rt6i_table->tb6_lock));
+				    lockdep_is_held(&rt->fib6_table->tb6_lock));
 	struct fib6_info *iter = NULL;
 	struct fib6_info __rcu **ins;
 	struct fib6_info __rcu **fallback_ins = NULL;
@@ -939,12 +939,12 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
 
 	for (iter = leaf; iter;
 	     iter = rcu_dereference_protected(iter->rt6_next,
-				lockdep_is_held(&rt->rt6i_table->tb6_lock))) {
+				lockdep_is_held(&rt->fib6_table->tb6_lock))) {
 		/*
 		 *	Search for duplicates
 		 */
 
-		if (iter->rt6i_metric == rt->rt6i_metric) {
+		if (iter->fib6_metric == rt->fib6_metric) {
 			/*
 			 *	Same priority level
 			 */
@@ -964,11 +964,11 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
 			}
 
 			if (rt6_duplicate_nexthop(iter, rt)) {
-				if (rt->rt6i_nsiblings)
-					rt->rt6i_nsiblings = 0;
-				if (!(iter->rt6i_flags & RTF_EXPIRES))
+				if (rt->fib6_nsiblings)
+					rt->fib6_nsiblings = 0;
+				if (!(iter->fib6_flags & RTF_EXPIRES))
 					return -EEXIST;
-				if (!(rt->rt6i_flags & RTF_EXPIRES))
+				if (!(rt->fib6_flags & RTF_EXPIRES))
 					fib6_clean_expires(iter);
 				else
 					fib6_set_expires(iter, rt->expires);
@@ -988,10 +988,10 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
 			 */
 			if (rt_can_ecmp &&
 			    rt6_qualify_for_ecmp(iter))
-				rt->rt6i_nsiblings++;
+				rt->fib6_nsiblings++;
 		}
 
-		if (iter->rt6i_metric > rt->rt6i_metric)
+		if (iter->fib6_metric > rt->fib6_metric)
 			break;
 
 next_iter:
@@ -1002,7 +1002,7 @@ next_iter:
 		/* No ECMP-able route found, replace first non-ECMP one */
 		ins = fallback_ins;
 		iter = rcu_dereference_protected(*ins,
-				    lockdep_is_held(&rt->rt6i_table->tb6_lock));
+				    lockdep_is_held(&rt->fib6_table->tb6_lock));
 		found++;
 	}
 
@@ -1011,34 +1011,34 @@ next_iter:
 		fn->rr_ptr = NULL;
 
 	/* Link this route to others same route. */
-	if (rt->rt6i_nsiblings) {
-		unsigned int rt6i_nsiblings;
+	if (rt->fib6_nsiblings) {
+		unsigned int fib6_nsiblings;
 		struct fib6_info *sibling, *temp_sibling;
 
 		/* Find the first route that have the same metric */
 		sibling = leaf;
 		while (sibling) {
-			if (sibling->rt6i_metric == rt->rt6i_metric &&
+			if (sibling->fib6_metric == rt->fib6_metric &&
 			    rt6_qualify_for_ecmp(sibling)) {
-				list_add_tail(&rt->rt6i_siblings,
-					      &sibling->rt6i_siblings);
+				list_add_tail(&rt->fib6_siblings,
+					      &sibling->fib6_siblings);
 				break;
 			}
 			sibling = rcu_dereference_protected(sibling->rt6_next,
-				    lockdep_is_held(&rt->rt6i_table->tb6_lock));
+				    lockdep_is_held(&rt->fib6_table->tb6_lock));
 		}
 		/* For each sibling in the list, increment the counter of
 		 * siblings. BUG() if counters does not match, list of siblings
 		 * is broken!
 		 */
-		rt6i_nsiblings = 0;
+		fib6_nsiblings = 0;
 		list_for_each_entry_safe(sibling, temp_sibling,
-					 &rt->rt6i_siblings, rt6i_siblings) {
-			sibling->rt6i_nsiblings++;
-			BUG_ON(sibling->rt6i_nsiblings != rt->rt6i_nsiblings);
-			rt6i_nsiblings++;
+					 &rt->fib6_siblings, fib6_siblings) {
+			sibling->fib6_nsiblings++;
+			BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings);
+			fib6_nsiblings++;
 		}
-		BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings);
+		BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
 		rt6_multipath_rebalance(temp_sibling);
 	}
 
@@ -1059,8 +1059,8 @@ add:
 			return err;
 
 		rcu_assign_pointer(rt->rt6_next, iter);
-		atomic_inc(&rt->rt6i_ref);
-		rcu_assign_pointer(rt->rt6i_node, fn);
+		atomic_inc(&rt->fib6_ref);
+		rcu_assign_pointer(rt->fib6_node, fn);
 		rcu_assign_pointer(*ins, rt);
 		if (!info->skip_notify)
 			inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
@@ -1087,8 +1087,8 @@ add:
 		if (err)
 			return err;
 
-		atomic_inc(&rt->rt6i_ref);
-		rcu_assign_pointer(rt->rt6i_node, fn);
+		atomic_inc(&rt->fib6_ref);
+		rcu_assign_pointer(rt->fib6_node, fn);
 		rt->rt6_next = iter->rt6_next;
 		rcu_assign_pointer(*ins, rt);
 		if (!info->skip_notify)
@@ -1097,8 +1097,8 @@ add:
 			info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
 			fn->fn_flags |= RTN_RTINFO;
 		}
-		nsiblings = iter->rt6i_nsiblings;
-		iter->rt6i_node = NULL;
+		nsiblings = iter->fib6_nsiblings;
+		iter->fib6_node = NULL;
 		fib6_purge_rt(iter, fn, info->nl_net);
 		if (rcu_access_pointer(fn->rr_ptr) == iter)
 			fn->rr_ptr = NULL;
@@ -1108,13 +1108,13 @@ add:
 			/* Replacing an ECMP route, remove all siblings */
 			ins = &rt->rt6_next;
 			iter = rcu_dereference_protected(*ins,
-				    lockdep_is_held(&rt->rt6i_table->tb6_lock));
+				    lockdep_is_held(&rt->fib6_table->tb6_lock));
 			while (iter) {
-				if (iter->rt6i_metric > rt->rt6i_metric)
+				if (iter->fib6_metric > rt->fib6_metric)
 					break;
 				if (rt6_qualify_for_ecmp(iter)) {
 					*ins = iter->rt6_next;
-					iter->rt6i_node = NULL;
+					iter->fib6_node = NULL;
 					fib6_purge_rt(iter, fn, info->nl_net);
 					if (rcu_access_pointer(fn->rr_ptr) == iter)
 						fn->rr_ptr = NULL;
@@ -1125,7 +1125,7 @@ add:
 					ins = &iter->rt6_next;
 				}
 				iter = rcu_dereference_protected(*ins,
-					lockdep_is_held(&rt->rt6i_table->tb6_lock));
+					lockdep_is_held(&rt->fib6_table->tb6_lock));
 			}
 			WARN_ON(nsiblings != 0);
 		}
@@ -1137,7 +1137,7 @@ add:
 static void fib6_start_gc(struct net *net, struct fib6_info *rt)
 {
 	if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
-	    (rt->rt6i_flags & RTF_EXPIRES))
+	    (rt->fib6_flags & RTF_EXPIRES))
 		mod_timer(&net->ipv6.ip6_fib_timer,
 			  jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
 }
@@ -1152,15 +1152,15 @@ void fib6_force_start_gc(struct net *net)
 static void __fib6_update_sernum_upto_root(struct fib6_info *rt,
 					   int sernum)
 {
-	struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node,
-				lockdep_is_held(&rt->rt6i_table->tb6_lock));
+	struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
+				lockdep_is_held(&rt->fib6_table->tb6_lock));
 
 	/* paired with smp_rmb() in rt6_get_cookie_safe() */
 	smp_wmb();
 	while (fn) {
 		fn->fn_sernum = sernum;
 		fn = rcu_dereference_protected(fn->parent,
-				lockdep_is_held(&rt->rt6i_table->tb6_lock));
+				lockdep_is_held(&rt->fib6_table->tb6_lock));
 	}
 }
 
@@ -1179,7 +1179,7 @@ void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt)
 int fib6_add(struct fib6_node *root, struct fib6_info *rt,
 	     struct nl_info *info, struct netlink_ext_ack *extack)
 {
-	struct fib6_table *table = rt->rt6i_table;
+	struct fib6_table *table = rt->fib6_table;
 	struct fib6_node *fn, *pn = NULL;
 	int err = -ENOMEM;
 	int allow_create = 1;
@@ -1196,8 +1196,8 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
 		pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");
 
 	fn = fib6_add_1(info->nl_net, table, root,
-			&rt->rt6i_dst.addr, rt->rt6i_dst.plen,
-			offsetof(struct fib6_info, rt6i_dst), allow_create,
+			&rt->fib6_dst.addr, rt->fib6_dst.plen,
+			offsetof(struct fib6_info, fib6_dst), allow_create,
 			replace_required, extack);
 	if (IS_ERR(fn)) {
 		err = PTR_ERR(fn);
@@ -1208,7 +1208,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
 	pn = fn;
 
 #ifdef CONFIG_IPV6_SUBTREES
-	if (rt->rt6i_src.plen) {
+	if (rt->fib6_src.plen) {
 		struct fib6_node *sn;
 
 		if (!rcu_access_pointer(fn->subtree)) {
@@ -1229,7 +1229,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
 			if (!sfn)
 				goto failure;
 
-			atomic_inc(&info->nl_net->ipv6.fib6_null_entry->rt6i_ref);
+			atomic_inc(&info->nl_net->ipv6.fib6_null_entry->fib6_ref);
 			rcu_assign_pointer(sfn->leaf,
 					   info->nl_net->ipv6.fib6_null_entry);
 			sfn->fn_flags = RTN_ROOT;
@@ -1237,8 +1237,8 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
 			/* Now add the first leaf node to new subtree */
 
 			sn = fib6_add_1(info->nl_net, table, sfn,
-					&rt->rt6i_src.addr, rt->rt6i_src.plen,
-					offsetof(struct fib6_info, rt6i_src),
+					&rt->fib6_src.addr, rt->fib6_src.plen,
+					offsetof(struct fib6_info, fib6_src),
 					allow_create, replace_required, extack);
 
 			if (IS_ERR(sn)) {
@@ -1256,8 +1256,8 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
 			rcu_assign_pointer(fn->subtree, sfn);
 		} else {
 			sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn),
-					&rt->rt6i_src.addr, rt->rt6i_src.plen,
-					offsetof(struct fib6_info, rt6i_src),
+					&rt->fib6_src.addr, rt->fib6_src.plen,
+					offsetof(struct fib6_info, fib6_src),
 					allow_create, replace_required, extack);
 
 			if (IS_ERR(sn)) {
@@ -1272,7 +1272,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
 				rcu_assign_pointer(fn->leaf,
 					    info->nl_net->ipv6.fib6_null_entry);
 			} else {
-				atomic_inc(&rt->rt6i_ref);
+				atomic_inc(&rt->fib6_ref);
 				rcu_assign_pointer(fn->leaf, rt);
 			}
 		}
@@ -1421,12 +1421,12 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad
 	struct fib6_node *fn;
 	struct lookup_args args[] = {
 		{
-			.offset = offsetof(struct fib6_info, rt6i_dst),
+			.offset = offsetof(struct fib6_info, fib6_dst),
 			.addr = daddr,
 		},
 #ifdef CONFIG_IPV6_SUBTREES
 		{
-			.offset = offsetof(struct fib6_info, rt6i_src),
+			.offset = offsetof(struct fib6_info, fib6_src),
 			.addr = saddr,
 		},
 #endif
@@ -1511,7 +1511,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
 	struct fib6_node *fn;
 
 	fn = fib6_locate_1(root, daddr, dst_len,
-			   offsetof(struct fib6_info, rt6i_dst),
+			   offsetof(struct fib6_info, fib6_dst),
 			   exact_match);
 
 #ifdef CONFIG_IPV6_SUBTREES
@@ -1522,7 +1522,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
 
 			if (subtree) {
 				fn = fib6_locate_1(subtree, saddr, src_len,
-					   offsetof(struct fib6_info, rt6i_src),
+					   offsetof(struct fib6_info, fib6_src),
 					   exact_match);
 			}
 		}
@@ -1706,7 +1706,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
 
 	/* Unlink it */
 	*rtp = rt->rt6_next;
-	rt->rt6i_node = NULL;
+	rt->fib6_node = NULL;
 	net->ipv6.rt6_stats->fib_rt_entries--;
 	net->ipv6.rt6_stats->fib_discarded_routes++;
 
@@ -1718,14 +1718,14 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
 		fn->rr_ptr = NULL;
 
 	/* Remove this entry from other siblings */
-	if (rt->rt6i_nsiblings) {
+	if (rt->fib6_nsiblings) {
 		struct fib6_info *sibling, *next_sibling;
 
 		list_for_each_entry_safe(sibling, next_sibling,
-					 &rt->rt6i_siblings, rt6i_siblings)
-			sibling->rt6i_nsiblings--;
-		rt->rt6i_nsiblings = 0;
-		list_del_init(&rt->rt6i_siblings);
+					 &rt->fib6_siblings, fib6_siblings)
+			sibling->fib6_nsiblings--;
+		rt->fib6_nsiblings = 0;
+		list_del_init(&rt->fib6_siblings);
 		rt6_multipath_rebalance(next_sibling);
 	}
 
@@ -1765,9 +1765,9 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
 /* Need to own table->tb6_lock */
 int fib6_del(struct fib6_info *rt, struct nl_info *info)
 {
-	struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node,
-				    lockdep_is_held(&rt->rt6i_table->tb6_lock));
-	struct fib6_table *table = rt->rt6i_table;
+	struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
+				    lockdep_is_held(&rt->fib6_table->tb6_lock));
+	struct fib6_table *table = rt->fib6_table;
 	struct net *net = info->nl_net;
 	struct fib6_info __rcu **rtp;
 	struct fib6_info __rcu **rtp_next;
@@ -1951,17 +1951,17 @@ static int fib6_clean_node(struct fib6_walker *w)
 #if RT6_DEBUG >= 2
 				pr_debug("%s: del failed: rt=%p@%p err=%d\n",
 					 __func__, rt,
-					 rcu_access_pointer(rt->rt6i_node),
+					 rcu_access_pointer(rt->fib6_node),
 					 res);
 #endif
 				continue;
 			}
 			return 0;
 		} else if (res == -2) {
-			if (WARN_ON(!rt->rt6i_nsiblings))
+			if (WARN_ON(!rt->fib6_nsiblings))
 				continue;
-			rt = list_last_entry(&rt->rt6i_siblings,
-					     struct fib6_info, rt6i_siblings);
+			rt = list_last_entry(&rt->fib6_siblings,
+					     struct fib6_info, fib6_siblings);
 			continue;
 		}
 		WARN_ON(res != 0);
@@ -2045,7 +2045,7 @@ static int fib6_age(struct fib6_info *rt, void *arg)
 	 *	Routes are expired even if they are in use.
 	 */
 
-	if (rt->rt6i_flags & RTF_EXPIRES && rt->expires) {
+	if (rt->fib6_flags & RTF_EXPIRES && rt->expires) {
 		if (time_after(now, rt->expires)) {
 			RT6_TRACE("expiring %p\n", rt);
 			return -1;
@@ -2243,22 +2243,22 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v)
 	struct ipv6_route_iter *iter = seq->private;
 	const struct net_device *dev;
 
-	seq_printf(seq, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
+	seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
 
 #ifdef CONFIG_IPV6_SUBTREES
-	seq_printf(seq, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
+	seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen);
 #else
 	seq_puts(seq, "00000000000000000000000000000000 00 ");
 #endif
-	if (rt->rt6i_flags & RTF_GATEWAY)
+	if (rt->fib6_flags & RTF_GATEWAY)
 		seq_printf(seq, "%pi6", &rt->fib6_nh.nh_gw);
 	else
 		seq_puts(seq, "00000000000000000000000000000000");
 
 	dev = rt->fib6_nh.nh_dev;
 	seq_printf(seq, " %08x %08x %08x %08x %8s\n",
-		   rt->rt6i_metric, atomic_read(&rt->rt6i_ref), 0,
-		   rt->rt6i_flags, dev ? dev->name : "");
+		   rt->fib6_metric, atomic_read(&rt->fib6_ref), 0,
+		   rt->fib6_flags, dev ? dev->name : "");
 	iter->w.leaf = NULL;
 	return 0;
 }
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 102645298692..9ac5366064e3 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1318,7 +1318,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 		}
 		neigh->flags |= NTF_ROUTER;
 	} else if (rt) {
-		rt->rt6i_flags = (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
+		rt->fib6_flags = (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
 	}
 
 	if (rt)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f9c363327d62..e23ab0784e65 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -284,10 +284,10 @@ static const u32 ip6_template_metrics[RTAX_MAX] = {
 };
 
 static const struct fib6_info fib6_null_entry_template = {
-	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
-	.rt6i_protocol  = RTPROT_KERNEL,
-	.rt6i_metric	= ~(u32)0,
-	.rt6i_ref	= ATOMIC_INIT(1),
+	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
+	.fib6_protocol  = RTPROT_KERNEL,
+	.fib6_metric	= ~(u32)0,
+	.fib6_ref	= ATOMIC_INIT(1),
 	.fib6_type	= RTN_UNREACHABLE,
 	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
 };
@@ -429,8 +429,8 @@ static struct fib6_info *rt6_multipath_select(const struct net *net,
 	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
 		return match;
 
-	list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
-				 rt6i_siblings) {
+	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
+				 fib6_siblings) {
 		int nh_upper_bound;
 
 		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
@@ -472,12 +472,12 @@ static inline struct fib6_info *rt6_device_match(struct net *net,
 			if (dev->ifindex == oif)
 				return sprt;
 			if (dev->flags & IFF_LOOPBACK) {
-				if (!sprt->rt6i_idev ||
-				    sprt->rt6i_idev->dev->ifindex != oif) {
+				if (!sprt->fib6_idev ||
+				    sprt->fib6_idev->dev->ifindex != oif) {
 					if (flags & RT6_LOOKUP_F_IFACE)
 						continue;
 					if (local &&
-					    local->rt6i_idev->dev->ifindex == oif)
+					    local->fib6_idev->dev->ifindex == oif)
 						continue;
 				}
 				local = sprt;
@@ -534,7 +534,7 @@ static void rt6_probe(struct fib6_info *rt)
 	 * Router Reachability Probe MUST be rate-limited
 	 * to no more than one per minute.
 	 */
-	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
+	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
 		return;
 
 	nh_gw = &rt->fib6_nh.nh_gw;
@@ -550,7 +550,7 @@ static void rt6_probe(struct fib6_info *rt)
 		if (!(neigh->nud_state & NUD_VALID) &&
 		    time_after(jiffies,
 			       neigh->updated +
-			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
+			       rt->fib6_idev->cnf.rtr_probe_interval)) {
 			work = kmalloc(sizeof(*work), GFP_ATOMIC);
 			if (work)
 				__neigh_set_probe_once(neigh);
@@ -587,7 +587,7 @@ static inline int rt6_check_dev(struct fib6_info *rt, int oif)
 	if (!oif || dev->ifindex == oif)
 		return 2;
 	if ((dev->flags & IFF_LOOPBACK) &&
-	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
+	    rt->fib6_idev && rt->fib6_idev->dev->ifindex == oif)
 		return 1;
 	return 0;
 }
@@ -597,8 +597,8 @@ static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
 	struct neighbour *neigh;
 
-	if (rt->rt6i_flags & RTF_NONEXTHOP ||
-	    !(rt->rt6i_flags & RTF_GATEWAY))
+	if (rt->fib6_flags & RTF_NONEXTHOP ||
+	    !(rt->fib6_flags & RTF_GATEWAY))
 		return RT6_NUD_SUCCEED;
 
 	rcu_read_lock_bh();
@@ -632,7 +632,7 @@ static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
 		return RT6_NUD_FAIL_HARD;
 #ifdef CONFIG_IPV6_ROUTER_PREF
-	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
+	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
 #endif
 	if (strict & RT6_LOOKUP_F_REACHABLE) {
 		int n = rt6_check_neigh(rt);
@@ -648,7 +648,7 @@ static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
 {
 	int m;
 	bool match_do_rr = false;
-	struct inet6_dev *idev = rt->rt6i_idev;
+	struct inet6_dev *idev = rt->fib6_idev;
 
 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
 		goto out;
@@ -694,7 +694,7 @@ static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
 	match = NULL;
 	cont = NULL;
 	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
-		if (rt->rt6i_metric != metric) {
+		if (rt->fib6_metric != metric) {
 			cont = rt;
 			break;
 		}
@@ -704,7 +704,7 @@ static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
 
 	for (rt = leaf; rt && rt != rr_head;
 	     rt = rcu_dereference(rt->rt6_next)) {
-		if (rt->rt6i_metric != metric) {
+		if (rt->fib6_metric != metric) {
 			cont = rt;
 			break;
 		}
@@ -741,30 +741,30 @@ static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
 	 * (This might happen if all routes under fn are deleted from
 	 * the tree and fib6_repair_tree() is called on the node.)
 	 */
-	key_plen = rt0->rt6i_dst.plen;
+	key_plen = rt0->fib6_dst.plen;
 #ifdef CONFIG_IPV6_SUBTREES
-	if (rt0->rt6i_src.plen)
-		key_plen = rt0->rt6i_src.plen;
+	if (rt0->fib6_src.plen)
+		key_plen = rt0->fib6_src.plen;
 #endif
 	if (fn->fn_bit != key_plen)
 		return net->ipv6.fib6_null_entry;
 
-	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
+	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
 			     &do_rr);
 
 	if (do_rr) {
 		struct fib6_info *next = rcu_dereference(rt0->rt6_next);
 
 		/* no entries matched; do round-robin */
-		if (!next || next->rt6i_metric != rt0->rt6i_metric)
+		if (!next || next->fib6_metric != rt0->fib6_metric)
 			next = leaf;
 
 		if (next != rt0) {
-			spin_lock_bh(&leaf->rt6i_table->tb6_lock);
+			spin_lock_bh(&leaf->fib6_table->tb6_lock);
 			/* make sure next is not being deleted from the tree */
-			if (next->rt6i_node)
+			if (next->fib6_node)
 				rcu_assign_pointer(fn->rr_ptr, next);
-			spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
+			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
 		}
 	}
 
@@ -773,7 +773,7 @@ static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
 
 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
 {
-	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
+	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
 }
 
 #ifdef CONFIG_IPV6_ROUTE_INFO
@@ -837,8 +837,8 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
 					dev, pref);
 	else if (rt)
-		rt->rt6i_flags = RTF_ROUTEINFO |
-				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
+		rt->fib6_flags = RTF_ROUTEINFO |
+				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
 
 	if (rt) {
 		if (!addrconf_finite_timeout(lifetime))
@@ -861,13 +861,13 @@ static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
 {
 	struct net_device *dev = rt->fib6_nh.nh_dev;
 
-	if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
+	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
 		/* for copies of local routes, dst->dev needs to be the
 		 * device if it is a master device, the master device if
 		 * device is enslaved, and the loopback as the default
 		 */
 		if (netif_is_l3_slave(dev) &&
-		    !rt6_need_strict(&rt->rt6i_dst.addr))
+		    !rt6_need_strict(&rt->fib6_dst.addr))
 			dev = l3mdev_master_dev_rcu(dev);
 		else if (!netif_is_l3_master(dev))
 			dev = dev_net(dev)->loopback_dev;
@@ -939,7 +939,7 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
 {
 	rt->dst.flags |= fib6_info_dst_flags(ort);
 
-	if (ort->rt6i_flags & RTF_REJECT) {
+	if (ort->fib6_flags & RTF_REJECT) {
 		ip6_rt_init_dst_reject(rt, ort);
 		return;
 	}
@@ -949,7 +949,7 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
 
 	if (ort->fib6_type == RTN_LOCAL) {
 		rt->dst.input = ip6_input;
-	} else if (ipv6_addr_type(&ort->rt6i_dst.addr) & IPV6_ADDR_MULTICAST) {
+	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
 		rt->dst.input = ip6_mc_input;
 	} else {
 		rt->dst.input = ip6_forward;
@@ -979,17 +979,17 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
 {
 	ip6_rt_init_dst(rt, ort);
 
-	rt->rt6i_dst = ort->rt6i_dst;
-	rt->rt6i_idev = ort->rt6i_idev;
+	rt->rt6i_dst = ort->fib6_dst;
+	rt->rt6i_idev = ort->fib6_idev;
 	if (rt->rt6i_idev)
 		in6_dev_hold(rt->rt6i_idev);
 	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
-	rt->rt6i_flags = ort->rt6i_flags;
+	rt->rt6i_flags = ort->fib6_flags;
 	rt6_set_from(rt, ort);
 #ifdef CONFIG_IPV6_SUBTREES
-	rt->rt6i_src = ort->rt6i_src;
+	rt->rt6i_src = ort->fib6_src;
 #endif
-	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
+	rt->rt6i_prefsrc = ort->fib6_prefsrc;
 	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
 }
 
@@ -1064,7 +1064,7 @@ restart:
 	} else {
 		f6i = rt6_device_match(net, f6i, &fl6->saddr,
 				      fl6->flowi6_oif, flags);
-		if (f6i->rt6i_nsiblings && fl6->flowi6_oif == 0)
+		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
 			f6i = rt6_multipath_select(net, f6i, fl6,
 						   fl6->flowi6_oif, skb, flags);
 	}
@@ -1142,7 +1142,7 @@ static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
 	int err;
 	struct fib6_table *table;
 
-	table = rt->rt6i_table;
+	table = rt->fib6_table;
 	spin_lock_bh(&table->tb6_lock);
 	err = fib6_add(&table->tb6_root, rt, info, extack);
 	spin_unlock_bh(&table->tb6_lock);
@@ -1182,8 +1182,8 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
 	rt->rt6i_dst.plen = 128;
 
 	if (!rt6_is_gw_or_nonexthop(ort)) {
-		if (ort->rt6i_dst.plen != 128 &&
-		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
+		if (ort->fib6_dst.plen != 128 &&
+		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
 			rt->rt6i_flags |= RTF_ANYCAST;
 #ifdef CONFIG_IPV6_SUBTREES
 		if (rt->rt6i_src.plen && saddr) {
@@ -1375,7 +1375,7 @@ static unsigned int fib6_mtu(const struct fib6_info *rt)
 {
 	unsigned int mtu;
 
-	mtu = rt->fib6_pmtu ? : rt->rt6i_idev->cnf.mtu6;
+	mtu = rt->fib6_pmtu ? : rt->fib6_idev->cnf.mtu6;
 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
 
 	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
@@ -1416,14 +1416,14 @@ static int rt6_insert_exception(struct rt6_info *nrt,
 	 * Otherwise, the exception table is indexed by
 	 * a hash of only rt6i_dst.
 	 */
-	if (ort->rt6i_src.plen)
+	if (ort->fib6_src.plen)
 		src_key = &nrt->rt6i_src.addr;
 #endif
 
 	/* Update rt6i_prefsrc as it could be changed
 	 * in rt6_remove_prefsrc()
 	 */
-	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
+	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
 	/* rt6_mtu_change() might lower mtu on ort.
 	 * Only insert this exception route if its mtu
 	 * is less than ort's mtu value.
@@ -1457,9 +1457,9 @@ out:
 
 	/* Update fn->fn_sernum to invalidate all cached dst */
 	if (!err) {
-		spin_lock_bh(&ort->rt6i_table->tb6_lock);
+		spin_lock_bh(&ort->fib6_table->tb6_lock);
 		fib6_update_sernum(net, ort);
-		spin_unlock_bh(&ort->rt6i_table->tb6_lock);
+		spin_unlock_bh(&ort->fib6_table->tb6_lock);
 		fib6_force_start_gc(net);
 	}
 
@@ -1514,7 +1514,7 @@ static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
 	 * Otherwise, the exception table is indexed by
 	 * a hash of only rt6i_dst.
 	 */
-	if (rt->rt6i_src.plen)
+	if (rt->fib6_src.plen)
 		src_key = saddr;
 #endif
 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
@@ -1551,7 +1551,7 @@ static int rt6_remove_exception_rt(struct rt6_info *rt)
 	 * Otherwise, the exception table is indexed by
 	 * a hash of only rt6i_dst.
 	 */
-	if (from->rt6i_src.plen)
+	if (from->fib6_src.plen)
 		src_key = &rt->rt6i_src.addr;
 #endif
 	rt6_ex = __rt6_find_exception_spinlock(&bucket,
@@ -1592,7 +1592,7 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
 	 * Otherwise, the exception table is indexed by
 	 * a hash of only rt6i_dst.
 	 */
-	if (from->rt6i_src.plen)
+	if (from->fib6_src.plen)
 		src_key = &rt->rt6i_src.addr;
 #endif
 	rt6_ex = __rt6_find_exception_rcu(&bucket,
@@ -1810,7 +1810,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 
 redo_rt6_select:
 	f6i = rt6_select(net, fn, oif, strict);
-	if (f6i->rt6i_nsiblings)
+	if (f6i->fib6_nsiblings)
 		f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict);
 	if (f6i == net->ipv6.fib6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
@@ -1842,7 +1842,7 @@ redo_rt6_select:
 		trace_fib6_table_lookup(net, rt, table, fl6);
 		return rt;
 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
-			    !(f6i->rt6i_flags & RTF_GATEWAY))) {
+			    !(f6i->fib6_flags & RTF_GATEWAY))) {
 		/* Create a RTF_CACHE clone which will not be
 		 * owned by the fib6 tree.  It is for the special case where
 		 * the daddr in the skb during the neighbor look-up is different
@@ -2206,7 +2206,7 @@ static void ip6_link_failure(struct sk_buff *skb)
 			struct fib6_node *fn;
 
 			rcu_read_lock();
-			fn = rcu_dereference(rt->from->rt6i_node);
+			fn = rcu_dereference(rt->from->fib6_node);
 			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
 				fn->fn_sernum = -1;
 			rcu_read_unlock();
@@ -2372,9 +2372,9 @@ restart:
 			continue;
 		if (fib6_check_expired(rt))
 			continue;
-		if (rt->rt6i_flags & RTF_REJECT)
+		if (rt->fib6_flags & RTF_REJECT)
 			break;
-		if (!(rt->rt6i_flags & RTF_GATEWAY))
+		if (!(rt->fib6_flags & RTF_GATEWAY))
 			continue;
 		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
 			continue;
@@ -2400,7 +2400,7 @@ restart:
 
 	if (!rt)
 		rt = net->ipv6.fib6_null_entry;
-	else if (rt->rt6i_flags & RTF_REJECT) {
+	else if (rt->fib6_flags & RTF_REJECT) {
 		ret = net->ipv6.ip6_null_entry;
 		goto out;
 	}
@@ -2907,7 +2907,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 
 	if (cfg->fc_protocol == RTPROT_UNSPEC)
 		cfg->fc_protocol = RTPROT_BOOT;
-	rt->rt6i_protocol = cfg->fc_protocol;
+	rt->fib6_protocol = cfg->fc_protocol;
 
 	addr_type = ipv6_addr_type(&cfg->fc_dst);
 
@@ -2922,17 +2922,17 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
 	}
 
-	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
-	rt->rt6i_dst.plen = cfg->fc_dst_len;
-	if (rt->rt6i_dst.plen == 128)
+	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
+	rt->fib6_dst.plen = cfg->fc_dst_len;
+	if (rt->fib6_dst.plen == 128)
 		rt->dst_host = true;
 
 #ifdef CONFIG_IPV6_SUBTREES
-	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
-	rt->rt6i_src.plen = cfg->fc_src_len;
+	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
+	rt->fib6_src.plen = cfg->fc_src_len;
 #endif
 
-	rt->rt6i_metric = cfg->fc_metric;
+	rt->fib6_metric = cfg->fc_metric;
 	rt->fib6_nh.nh_weight = 1;
 
 	rt->fib6_type = cfg->fc_type;
@@ -2958,7 +2958,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 				goto out;
 			}
 		}
-		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
+		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
 		goto install_route;
 	}
 
@@ -2992,21 +2992,21 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 			err = -EINVAL;
 			goto out;
 		}
-		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
-		rt->rt6i_prefsrc.plen = 128;
+		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
+		rt->fib6_prefsrc.plen = 128;
 	} else
-		rt->rt6i_prefsrc.plen = 0;
+		rt->fib6_prefsrc.plen = 0;
 
-	rt->rt6i_flags = cfg->fc_flags;
+	rt->fib6_flags = cfg->fc_flags;
 
 install_route:
-	if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
+	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
 	    !netif_carrier_ok(dev))
 		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
 	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
 	rt->fib6_nh.nh_dev = dev;
-	rt->rt6i_idev = idev;
-	rt->rt6i_table = table;
+	rt->fib6_idev = idev;
+	rt->fib6_table = table;
 
 	cfg->fc_nlinfo.nl_net = dev_net(dev);
 
@@ -3048,7 +3048,7 @@ static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
 		goto out;
 	}
 
-	table = rt->rt6i_table;
+	table = rt->fib6_table;
 	spin_lock_bh(&table->tb6_lock);
 	err = fib6_del(rt, info);
 	spin_unlock_bh(&table->tb6_lock);
@@ -3075,10 +3075,10 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
 
 	if (rt == net->ipv6.fib6_null_entry)
 		goto out_put;
-	table = rt->rt6i_table;
+	table = rt->fib6_table;
 	spin_lock_bh(&table->tb6_lock);
 
-	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
+	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
 		struct fib6_info *sibling, *next_sibling;
 
 		/* prefer to send a single notification with all hops */
@@ -3096,8 +3096,8 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
 		}
 
 		list_for_each_entry_safe(sibling, next_sibling,
-					 &rt->rt6i_siblings,
-					 rt6i_siblings) {
+					 &rt->fib6_siblings,
+					 fib6_siblings) {
 			err = fib6_del(sibling, info);
 			if (err)
 				goto out_unlock;
@@ -3176,9 +3176,9 @@ static int ip6_route_del(struct fib6_config *cfg,
 			if (cfg->fc_flags & RTF_GATEWAY &&
 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
 				continue;
-			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
+			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
 				continue;
-			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
+			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
 				continue;
 			fib6_info_hold(rt);
 			rcu_read_unlock();
@@ -3336,7 +3336,7 @@ static struct fib6_info *rt6_get_route_info(struct net *net,
 	for_each_fib6_node_rt_rcu(fn) {
 		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
 			continue;
-		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
+		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
 			continue;
 		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
 			continue;
@@ -3396,7 +3396,7 @@ struct fib6_info *rt6_get_dflt_router(struct net *net,
 	rcu_read_lock();
 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
 		if (dev == rt->fib6_nh.nh_dev &&
-		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
+		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
 		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
 			break;
 	}
@@ -3445,8 +3445,8 @@ static void __rt6_purge_dflt_routers(struct net *net,
 restart:
 	rcu_read_lock();
 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
-		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
-		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
+		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
+		    (!rt->fib6_idev || rt->fib6_idev->cnf.accept_ra != 2)) {
 			fib6_info_hold(rt);
 			rcu_read_unlock();
 			ip6_del_rt(net, rt);
@@ -3607,26 +3607,26 @@ struct fib6_info *addrconf_dst_alloc(struct net *net,
 	rt->dst_nocount = true;
 
 	in6_dev_hold(idev);
-	rt->rt6i_idev = idev;
+	rt->fib6_idev = idev;
 
 	rt->dst_host = true;
-	rt->rt6i_protocol = RTPROT_KERNEL;
-	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
+	rt->fib6_protocol = RTPROT_KERNEL;
+	rt->fib6_flags = RTF_UP | RTF_NONEXTHOP;
 	if (anycast) {
 		rt->fib6_type = RTN_ANYCAST;
-		rt->rt6i_flags |= RTF_ANYCAST;
+		rt->fib6_flags |= RTF_ANYCAST;
 	} else {
 		rt->fib6_type = RTN_LOCAL;
-		rt->rt6i_flags |= RTF_LOCAL;
+		rt->fib6_flags |= RTF_LOCAL;
 	}
 
 	rt->fib6_nh.nh_gw = *addr;
 	dev_hold(dev);
 	rt->fib6_nh.nh_dev = dev;
-	rt->rt6i_dst.addr = *addr;
-	rt->rt6i_dst.plen = 128;
+	rt->fib6_dst.addr = *addr;
+	rt->fib6_dst.plen = 128;
 	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
-	rt->rt6i_table = fib6_get_table(net, tb_id);
+	rt->fib6_table = fib6_get_table(net, tb_id);
 
 	return rt;
 }
@@ -3646,10 +3646,10 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
 
 	if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
 	    rt != net->ipv6.fib6_null_entry &&
-	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
+	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
 		spin_lock_bh(&rt6_exception_lock);
 		/* remove prefsrc entry */
-		rt->rt6i_prefsrc.plen = 0;
+		rt->fib6_prefsrc.plen = 0;
 		/* need to update cache as well */
 		rt6_exceptions_remove_prefsrc(rt);
 		spin_unlock_bh(&rt6_exception_lock);
@@ -3675,7 +3675,7 @@ static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
 {
 	struct in6_addr *gateway = (struct in6_addr *)arg;
 
-	if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
+	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
 	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
 		return -1;
 	}
@@ -3707,16 +3707,16 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
 	struct fib6_info *iter;
 	struct fib6_node *fn;
 
-	fn = rcu_dereference_protected(rt->rt6i_node,
-			lockdep_is_held(&rt->rt6i_table->tb6_lock));
+	fn = rcu_dereference_protected(rt->fib6_node,
+			lockdep_is_held(&rt->fib6_table->tb6_lock));
 	iter = rcu_dereference_protected(fn->leaf,
-			lockdep_is_held(&rt->rt6i_table->tb6_lock));
+			lockdep_is_held(&rt->fib6_table->tb6_lock));
 	while (iter) {
-		if (iter->rt6i_metric == rt->rt6i_metric &&
+		if (iter->fib6_metric == rt->fib6_metric &&
 		    rt6_qualify_for_ecmp(iter))
 			return iter;
 		iter = rcu_dereference_protected(iter->rt6_next,
-				lockdep_is_held(&rt->rt6i_table->tb6_lock));
+				lockdep_is_held(&rt->fib6_table->tb6_lock));
 	}
 
 	return NULL;
@@ -3726,7 +3726,7 @@ static bool rt6_is_dead(const struct fib6_info *rt)
 {
 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
 	    (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
-	     rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
+	     rt->fib6_idev->cnf.ignore_routes_with_linkdown))
 		return true;
 
 	return false;
@@ -3740,7 +3740,7 @@ static int rt6_multipath_total_weight(const struct fib6_info *rt)
 	if (!rt6_is_dead(rt))
 		total += rt->fib6_nh.nh_weight;
 
-	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
+	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
 		if (!rt6_is_dead(iter))
 			total += iter->fib6_nh.nh_weight;
 	}
@@ -3767,7 +3767,7 @@ static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
 
 	rt6_upper_bound_set(rt, &weight, total);
 
-	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
+	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
 		rt6_upper_bound_set(iter, &weight, total);
 }
 
@@ -3780,7 +3780,7 @@ void rt6_multipath_rebalance(struct fib6_info *rt)
 	 * then there is no need to rebalance upon the removal of every
 	 * sibling route.
 	 */
-	if (!rt->rt6i_nsiblings || rt->should_flush)
+	if (!rt->fib6_nsiblings || rt->should_flush)
 		return;
 
 	/* During lookup routes are evaluated in order, so we need to
@@ -3831,7 +3831,7 @@ static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
 
 	if (rt->fib6_nh.nh_dev == dev)
 		return true;
-	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
+	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
 		if (iter->fib6_nh.nh_dev == dev)
 			return true;
 
@@ -3843,7 +3843,7 @@ static void rt6_multipath_flush(struct fib6_info *rt)
 	struct fib6_info *iter;
 
 	rt->should_flush = 1;
-	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
+	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
 		iter->should_flush = 1;
 }
 
@@ -3856,7 +3856,7 @@ static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
 	if (rt->fib6_nh.nh_dev == down_dev ||
 	    rt->fib6_nh.nh_flags & RTNH_F_DEAD)
 		dead++;
-	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
+	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
 		if (iter->fib6_nh.nh_dev == down_dev ||
 		    iter->fib6_nh.nh_flags & RTNH_F_DEAD)
 			dead++;
@@ -3872,7 +3872,7 @@ static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
 
 	if (rt->fib6_nh.nh_dev == dev)
 		rt->fib6_nh.nh_flags |= nh_flags;
-	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
+	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
 		if (iter->fib6_nh.nh_dev == dev)
 			iter->fib6_nh.nh_flags |= nh_flags;
 }
@@ -3893,13 +3893,13 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
 	case NETDEV_DOWN:
 		if (rt->should_flush)
 			return -1;
-		if (!rt->rt6i_nsiblings)
+		if (!rt->fib6_nsiblings)
 			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
 		if (rt6_multipath_uses_dev(rt, dev)) {
 			unsigned int count;
 
 			count = rt6_multipath_dead_count(rt, dev);
-			if (rt->rt6i_nsiblings + 1 == count) {
+			if (rt->fib6_nsiblings + 1 == count) {
 				rt6_multipath_flush(rt);
 				return -1;
 			}
@@ -3911,7 +3911,7 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
 		return -2;
 	case NETDEV_CHANGE:
 		if (rt->fib6_nh.nh_dev != dev ||
-		    rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
+		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
 			break;
 		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
 		rt6_multipath_rebalance(rt);
@@ -4188,10 +4188,10 @@ static void ip6_route_mpath_notify(struct fib6_info *rt,
 	 * nexthop. Since sibling routes are always added at the end of
 	 * the list, find the first sibling of the last route appended
 	 */
-	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
-		rt = list_first_entry(&rt_last->rt6i_siblings,
+	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
+		rt = list_first_entry(&rt_last->fib6_siblings,
 				      struct fib6_info,
-				      rt6i_siblings);
+				      fib6_siblings);
 	}
 
 	if (rt)
@@ -4410,13 +4410,13 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt)
 {
 	int nexthop_len = 0;
 
-	if (rt->rt6i_nsiblings) {
+	if (rt->fib6_nsiblings) {
 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
 			    + NLA_ALIGN(sizeof(struct rtnexthop))
 			    + nla_total_size(16) /* RTA_GATEWAY */
 			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
 
-		nexthop_len *= rt->rt6i_nsiblings;
+		nexthop_len *= rt->fib6_nsiblings;
 	}
 
 	return NLMSG_ALIGN(sizeof(struct rtmsg))
@@ -4444,11 +4444,11 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
 
 	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
 		*flags |= RTNH_F_LINKDOWN;
-		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
+		if (rt->fib6_idev->cnf.ignore_routes_with_linkdown)
 			*flags |= RTNH_F_DEAD;
 	}
 
-	if (rt->rt6i_flags & RTF_GATEWAY) {
+	if (rt->fib6_flags & RTF_GATEWAY) {
 		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
 			goto nla_put_failure;
 	}
@@ -4518,11 +4518,11 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 
 	rtm = nlmsg_data(nlh);
 	rtm->rtm_family = AF_INET6;
-	rtm->rtm_dst_len = rt->rt6i_dst.plen;
-	rtm->rtm_src_len = rt->rt6i_src.plen;
+	rtm->rtm_dst_len = rt->fib6_dst.plen;
+	rtm->rtm_src_len = rt->fib6_src.plen;
 	rtm->rtm_tos = 0;
-	if (rt->rt6i_table)
-		table = rt->rt6i_table->tb6_id;
+	if (rt->fib6_table)
+		table = rt->fib6_table->tb6_id;
 	else
 		table = RT6_TABLE_UNSPEC;
 	rtm->rtm_table = table;
@@ -4532,9 +4532,9 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 	rtm->rtm_type = rt->fib6_type;
 	rtm->rtm_flags = 0;
 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
-	rtm->rtm_protocol = rt->rt6i_protocol;
+	rtm->rtm_protocol = rt->fib6_protocol;
 
-	if (rt->rt6i_flags & RTF_CACHE)
+	if (rt->fib6_flags & RTF_CACHE)
 		rtm->rtm_flags |= RTM_F_CLONED;
 
 	if (dest) {
@@ -4542,7 +4542,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 			goto nla_put_failure;
 		rtm->rtm_dst_len = 128;
 	} else if (rtm->rtm_dst_len)
-		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
+		if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
 			goto nla_put_failure;
 #ifdef CONFIG_IPV6_SUBTREES
 	if (src) {
@@ -4550,12 +4550,12 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 			goto nla_put_failure;
 		rtm->rtm_src_len = 128;
 	} else if (rtm->rtm_src_len &&
-		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
+		   nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
 		goto nla_put_failure;
 #endif
 	if (iif) {
 #ifdef CONFIG_IPV6_MROUTE
-		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
+		if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
 			int err = ip6mr_get_route(net, skb, rtm, portid);
 
 			if (err == 0)
@@ -4573,9 +4573,9 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 			goto nla_put_failure;
 	}
 
-	if (rt->rt6i_prefsrc.plen) {
+	if (rt->fib6_prefsrc.plen) {
 		struct in6_addr saddr_buf;
-		saddr_buf = rt->rt6i_prefsrc.addr;
+		saddr_buf = rt->fib6_prefsrc.addr;
 		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
 			goto nla_put_failure;
 	}
@@ -4584,13 +4584,13 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
 		goto nla_put_failure;
 
-	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
+	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
 		goto nla_put_failure;
 
 	/* For multipath routes, walk the siblings list and add
 	 * each as a nexthop within RTA_MULTIPATH.
 	 */
-	if (rt->rt6i_nsiblings) {
+	if (rt->fib6_nsiblings) {
 		struct fib6_info *sibling, *next_sibling;
 		struct nlattr *mp;
 
@@ -4602,7 +4602,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 			goto nla_put_failure;
 
 		list_for_each_entry_safe(sibling, next_sibling,
-					 &rt->rt6i_siblings, rt6i_siblings) {
+					 &rt->fib6_siblings, fib6_siblings) {
 			if (rt6_add_nexthop(skb, sibling) < 0)
 				goto nla_put_failure;
 		}
@@ -4613,7 +4613,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 			goto nla_put_failure;
 	}
 
-	if (rt->rt6i_flags & RTF_EXPIRES) {
+	if (rt->fib6_flags & RTF_EXPIRES) {
 		expires = dst ? dst->expires : rt->expires;
 		expires -= jiffies;
 	}
@@ -4621,7 +4621,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
 		goto nla_put_failure;
 
-	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
+	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
 		goto nla_put_failure;
 
 
@@ -4646,7 +4646,7 @@ int rt6_dump_route(struct fib6_info *rt, void *p_arg)
 
 		/* user wants prefix routes only */
 		if (rtm->rtm_flags & RTM_F_PREFIX &&
-		    !(rt->rt6i_flags & RTF_PREFIX_RT)) {
+		    !(rt->fib6_flags & RTF_PREFIX_RT)) {
 			/* success since this is not a prefix route */
 			return 1;
 		}
@@ -4820,7 +4820,7 @@ static int ip6_route_dev_notify(struct notifier_block *this,
 
 	if (event == NETDEV_REGISTER) {
 		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
-		net->ipv6.fib6_null_entry->rt6i_idev = in6_dev_get(dev);
+		net->ipv6.fib6_null_entry->fib6_idev = in6_dev_get(dev);
 		net->ipv6.ip6_null_entry->dst.dev = dev;
 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
@@ -4834,7 +4834,7 @@ static int ip6_route_dev_notify(struct notifier_block *this,
 		/* NETDEV_UNREGISTER could be fired for multiple times by
 		 * netdev_wait_allrefs(). Make sure we only call this once.
 		 */
-		in6_dev_put_clear(&net->ipv6.fib6_null_entry->rt6i_idev);
+		in6_dev_put_clear(&net->ipv6.fib6_null_entry->fib6_idev);
 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
@@ -5157,7 +5157,7 @@ void __init ip6_route_init_special_entries(void)
 	 * the loopback reference in rt6_info will not be taken, do it
 	 * manually for init_net */
 	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
-	init_net.ipv6.fib6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
+	init_net.ipv6.fib6_null_entry->fib6_idev = in6_dev_get(init_net.loopback_dev);
 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
-- 
cgit v1.2.3-59-g8ed1b


From 360a9887c8c01a715b2b4b131f7c7462f7cce576 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 18 Apr 2018 15:39:00 -0700
Subject: net/ipv6: Rename addrconf_dst_alloc

addrconf_dst_alloc now returns a fib6_info. Update the name
and its users to reflect the change.

Rename only; no functional change intended.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h |  2 +-
 net/ipv6/addrconf.c     | 28 ++++++++++++++--------------
 net/ipv6/anycast.c      | 14 +++++++-------
 net/ipv6/route.c        | 44 ++++++++++++++++++++++----------------------
 4 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index b4b85109984f..31e24f821ef6 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -136,7 +136,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6);
 
 void fib6_force_start_gc(struct net *net);
 
-struct fib6_info *addrconf_dst_alloc(struct net *net, struct inet6_dev *idev,
+struct fib6_info *addrconf_f6i_alloc(struct net *net, struct inet6_dev *idev,
 				     const struct in6_addr *addr, bool anycast,
 				     gfp_t gfp_flags);
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index f38ea829c28b..e6dd886f8f1f 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -994,7 +994,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
 	gfp_t gfp_flags = can_block ? GFP_KERNEL : GFP_ATOMIC;
 	struct net *net = dev_net(idev->dev);
 	struct inet6_ifaddr *ifa = NULL;
-	struct fib6_info *rt = NULL;
+	struct fib6_info *f6i = NULL;
 	int err = 0;
 	int addr_type = ipv6_addr_type(addr);
 
@@ -1036,16 +1036,16 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
 		goto out;
 	}
 
-	rt = addrconf_dst_alloc(net, idev, addr, false, gfp_flags);
-	if (IS_ERR(rt)) {
-		err = PTR_ERR(rt);
-		rt = NULL;
+	f6i = addrconf_f6i_alloc(net, idev, addr, false, gfp_flags);
+	if (IS_ERR(f6i)) {
+		err = PTR_ERR(f6i);
+		f6i = NULL;
 		goto out;
 	}
 
 	if (net->ipv6.devconf_all->disable_policy ||
 	    idev->cnf.disable_policy)
-		rt->dst_nopolicy = true;
+		f6i->dst_nopolicy = true;
 
 	neigh_parms_data_state_setall(idev->nd_parms);
 
@@ -1067,7 +1067,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
 	ifa->cstamp = ifa->tstamp = jiffies;
 	ifa->tokenized = false;
 
-	ifa->rt = rt;
+	ifa->rt = f6i;
 
 	ifa->idev = idev;
 	in6_dev_hold(idev);
@@ -1101,7 +1101,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
 	inet6addr_notifier_call_chain(NETDEV_UP, ifa);
 out:
 	if (unlikely(err < 0)) {
-		fib6_info_release(rt);
+		fib6_info_release(f6i);
 
 		if (ifa) {
 			if (ifa->idev)
@@ -3346,17 +3346,17 @@ static int fixup_permanent_addr(struct net *net,
 	 * case regenerate the host route.
 	 */
 	if (!ifp->rt || !ifp->rt->fib6_node) {
-		struct fib6_info *rt, *prev;
+		struct fib6_info *f6i, *prev;
 
-		rt = addrconf_dst_alloc(net, idev, &ifp->addr, false,
-					GFP_ATOMIC);
-		if (IS_ERR(rt))
-			return PTR_ERR(rt);
+		f6i = addrconf_f6i_alloc(net, idev, &ifp->addr, false,
+					 GFP_ATOMIC);
+		if (IS_ERR(f6i))
+			return PTR_ERR(f6i);
 
 		/* ifp->rt can be accessed outside of rtnl */
 		spin_lock(&ifp->lock);
 		prev = ifp->rt;
-		ifp->rt = rt;
+		ifp->rt = f6i;
 		spin_unlock(&ifp->lock);
 
 		fib6_info_release(prev);
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index eed3bf63bd05..5c3e74d05018 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -247,7 +247,7 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i,
 int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
 {
 	struct ifacaddr6 *aca;
-	struct fib6_info *rt;
+	struct fib6_info *f6i;
 	struct net *net;
 	int err;
 
@@ -268,14 +268,14 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
 	}
 
 	net = dev_net(idev->dev);
-	rt = addrconf_dst_alloc(net, idev, addr, true, GFP_ATOMIC);
-	if (IS_ERR(rt)) {
-		err = PTR_ERR(rt);
+	f6i = addrconf_f6i_alloc(net, idev, addr, true, GFP_ATOMIC);
+	if (IS_ERR(f6i)) {
+		err = PTR_ERR(f6i);
 		goto out;
 	}
-	aca = aca_alloc(rt, addr);
+	aca = aca_alloc(f6i, addr);
 	if (!aca) {
-		fib6_info_release(rt);
+		fib6_info_release(f6i);
 		err = -ENOMEM;
 		goto out;
 	}
@@ -289,7 +289,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
 	aca_get(aca);
 	write_unlock_bh(&idev->lock);
 
-	ip6_ins_rt(net, rt);
+	ip6_ins_rt(net, f6i);
 
 	addrconf_join_solict(idev->dev, &aca->aca_addr);
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index e23ab0784e65..e44f82848143 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3591,44 +3591,44 @@ static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff
  *	Allocate a dst for local (unicast / anycast) address.
  */
 
-struct fib6_info *addrconf_dst_alloc(struct net *net,
-				    struct inet6_dev *idev,
-				    const struct in6_addr *addr,
-				    bool anycast, gfp_t gfp_flags)
+struct fib6_info *addrconf_f6i_alloc(struct net *net,
+				     struct inet6_dev *idev,
+				     const struct in6_addr *addr,
+				     bool anycast, gfp_t gfp_flags)
 {
 	u32 tb_id;
 	struct net_device *dev = idev->dev;
-	struct fib6_info *rt;
+	struct fib6_info *f6i;
 
-	rt = fib6_info_alloc(gfp_flags);
-	if (!rt)
+	f6i = fib6_info_alloc(gfp_flags);
+	if (!f6i)
 		return ERR_PTR(-ENOMEM);
 
-	rt->dst_nocount = true;
+	f6i->dst_nocount = true;
 
 	in6_dev_hold(idev);
-	rt->fib6_idev = idev;
+	f6i->fib6_idev = idev;
 
-	rt->dst_host = true;
-	rt->fib6_protocol = RTPROT_KERNEL;
-	rt->fib6_flags = RTF_UP | RTF_NONEXTHOP;
+	f6i->dst_host = true;
+	f6i->fib6_protocol = RTPROT_KERNEL;
+	f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
 	if (anycast) {
-		rt->fib6_type = RTN_ANYCAST;
-		rt->fib6_flags |= RTF_ANYCAST;
+		f6i->fib6_type = RTN_ANYCAST;
+		f6i->fib6_flags |= RTF_ANYCAST;
 	} else {
-		rt->fib6_type = RTN_LOCAL;
-		rt->fib6_flags |= RTF_LOCAL;
+		f6i->fib6_type = RTN_LOCAL;
+		f6i->fib6_flags |= RTF_LOCAL;
 	}
 
-	rt->fib6_nh.nh_gw = *addr;
+	f6i->fib6_nh.nh_gw = *addr;
 	dev_hold(dev);
-	rt->fib6_nh.nh_dev = dev;
-	rt->fib6_dst.addr = *addr;
-	rt->fib6_dst.plen = 128;
+	f6i->fib6_nh.nh_dev = dev;
+	f6i->fib6_dst.addr = *addr;
+	f6i->fib6_dst.plen = 128;
 	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
-	rt->fib6_table = fib6_get_table(net, tb_id);
+	f6i->fib6_table = fib6_get_table(net, tb_id);
 
-	return rt;
+	return f6i;
 }
 
 /* remove deleted ip from prefsrc entries */
-- 
cgit v1.2.3-59-g8ed1b


From 9ee8cbb2fd4a7d6f483a20c4b8e82d8b1cf685fa Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 18 Apr 2018 15:39:01 -0700
Subject: net/ipv6: Remove aca_idev

aca_idev has only 1 user - inet6_fill_ifacaddr - and it only
wants the device index which can be extracted from the fib6_info
nexthop.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/if_inet6.h | 1 -
 include/net/ip6_fib.h  | 5 +++++
 net/ipv6/addrconf.c    | 3 ++-
 net/ipv6/anycast.c     | 4 ----
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index d6089b2e64fe..db389253dc2a 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -143,7 +143,6 @@ struct ipv6_ac_socklist {
 
 struct ifacaddr6 {
 	struct in6_addr		aca_addr;
-	struct inet6_dev	*aca_idev;
 	struct fib6_info	*aca_rt;
 	struct ifacaddr6	*aca_next;
 	int			aca_users;
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 994dd67207fb..bd11c990c353 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -410,6 +410,11 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
 	     struct nl_info *info, struct netlink_ext_ack *extack);
 int fib6_del(struct fib6_info *rt, struct nl_info *info);
 
+static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i)
+{
+	return f6i->fib6_nh.nh_dev;
+}
+
 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
 		     unsigned int flags);
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index e6dd886f8f1f..6c42c5d5fafa 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4817,9 +4817,10 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
 static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
 				u32 portid, u32 seq, int event, unsigned int flags)
 {
+	struct net_device *dev = fib6_info_nh_dev(ifaca->aca_rt);
+	int ifindex = dev ? dev->ifindex : 1;
 	struct nlmsghdr  *nlh;
 	u8 scope = RT_SCOPE_UNIVERSE;
-	int ifindex = ifaca->aca_idev->dev->ifindex;
 
 	if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE)
 		scope = RT_SCOPE_SITE;
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 5c3e74d05018..0250d199e527 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -212,7 +212,6 @@ static void aca_get(struct ifacaddr6 *aca)
 static void aca_put(struct ifacaddr6 *ac)
 {
 	if (refcount_dec_and_test(&ac->aca_refcnt)) {
-		in6_dev_put(ac->aca_idev);
 		fib6_info_release(ac->aca_rt);
 		kfree(ac);
 	}
@@ -221,7 +220,6 @@ static void aca_put(struct ifacaddr6 *ac)
 static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i,
 				   const struct in6_addr *addr)
 {
-	struct inet6_dev *idev = f6i->fib6_idev;
 	struct ifacaddr6 *aca;
 
 	aca = kzalloc(sizeof(*aca), GFP_ATOMIC);
@@ -229,8 +227,6 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i,
 		return NULL;
 
 	aca->aca_addr = *addr;
-	in6_dev_hold(idev);
-	aca->aca_idev = idev;
 	fib6_info_hold(f6i);
 	aca->aca_rt = f6i;
 	aca->aca_users = 1;
-- 
cgit v1.2.3-59-g8ed1b


From eea68cd371a8fa7d8dbfcf75bc076e8379526119 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 18 Apr 2018 15:39:02 -0700
Subject: net/ipv6: Remove unnecessary checks on fib6_idev

Prior to 4832c30d5458 ("net: ipv6: put host and anycast routes on device
with address") host routes and anycast routes were installed with the
device set to loopback (or VRF device once that feature was added). In the
older code dst.dev was set to loopback (needed for packet tx) and rt6i_idev
was used to denote the actual interface.

Commit 4832c30d5458 changed the code to have dst.dev pointing to the real
device with the switch to lo or vrf device done on dst clones. As a
consequence of this change a couple of device checks during route lookups
are no longer needed. Remove them.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 24 ++----------------------
 1 file changed, 2 insertions(+), 22 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index e44f82848143..8cf4f0623229 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -455,7 +455,6 @@ static inline struct fib6_info *rt6_device_match(struct net *net,
 						    int oif,
 						    int flags)
 {
-	struct fib6_info *local = NULL;
 	struct fib6_info *sprt;
 
 	if (!oif && ipv6_addr_any(saddr) &&
@@ -471,17 +470,6 @@ static inline struct fib6_info *rt6_device_match(struct net *net,
 		if (oif) {
 			if (dev->ifindex == oif)
 				return sprt;
-			if (dev->flags & IFF_LOOPBACK) {
-				if (!sprt->fib6_idev ||
-				    sprt->fib6_idev->dev->ifindex != oif) {
-					if (flags & RT6_LOOKUP_F_IFACE)
-						continue;
-					if (local &&
-					    local->fib6_idev->dev->ifindex == oif)
-						continue;
-				}
-				local = sprt;
-			}
 		} else {
 			if (ipv6_chk_addr(net, saddr, dev,
 					  flags & RT6_LOOKUP_F_IFACE))
@@ -489,13 +477,8 @@ static inline struct fib6_info *rt6_device_match(struct net *net,
 		}
 	}
 
-	if (oif) {
-		if (local)
-			return local;
-
-		if (flags & RT6_LOOKUP_F_IFACE)
-			return net->ipv6.fib6_null_entry;
-	}
+	if (oif && flags & RT6_LOOKUP_F_IFACE)
+		return net->ipv6.fib6_null_entry;
 
 	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
 }
@@ -586,9 +569,6 @@ static inline int rt6_check_dev(struct fib6_info *rt, int oif)
 
 	if (!oif || dev->ifindex == oif)
 		return 2;
-	if ((dev->flags & IFF_LOOPBACK) &&
-	    rt->fib6_idev && rt->fib6_idev->dev->ifindex == oif)
-		return 1;
 	return 0;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 6fe749414446d86cfaae1c84bfdd556fe4fa8fca Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 18 Apr 2018 15:39:03 -0700
Subject: net/ipv6: Change ip6_route_get_saddr to get dev from route

Prior to 4832c30d5458 ("net: ipv6: put host and anycast routes on device
with address") host routes and anycast routes were installed with the
device set to loopback (or VRF device once that feature was added). In the
older code dst.dev was set to loopback (needed for packet tx) and rt6i_idev
was used to denote the actual interface.

Commit 4832c30d5458 changed the code to have dst.dev pointing to the real
device with the switch to lo or vrf device done on dst clones. As a
consequence of this change ip6_route_get_saddr can just pass the nexthop
device to ipv6_dev_get_saddr.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 31e24f821ef6..c0620330035c 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -114,14 +114,15 @@ static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *f6i,
 				      unsigned int prefs,
 				      struct in6_addr *saddr)
 {
-	struct inet6_dev *idev = f6i ? f6i->fib6_idev : NULL;
 	int err = 0;
 
-	if (f6i && f6i->fib6_prefsrc.plen)
+	if (f6i && f6i->fib6_prefsrc.plen) {
 		*saddr = f6i->fib6_prefsrc.addr;
-	else
-		err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
-					 daddr, prefs, saddr);
+	} else {
+		struct net_device *dev = f6i ? fib6_info_nh_dev(f6i) : NULL;
+
+		err = ipv6_dev_get_saddr(net, dev, daddr, prefs, saddr);
+	}
 
 	return err;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 647d4c1363a85bec63ecf929d4ab4aae78b2a960 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 18 Apr 2018 15:39:04 -0700
Subject: net/ipv6: Remove compare of fib6_idev from rt6_duplicate_nexthop

After 4832c30d5458 ("net: ipv6: put host and anycast routes on device
with address") the comparison of idev does not add value since it
correlates to the nexthop device which is already compared. Remove
the idev comparison.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index c0620330035c..8df4ff798b04 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -275,7 +275,6 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt,
 static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b)
 {
 	return a->fib6_nh.nh_dev == b->fib6_nh.nh_dev &&
-	       a->fib6_idev == b->fib6_idev &&
 	       ipv6_addr_equal(&a->fib6_nh.nh_gw, &b->fib6_nh.nh_gw) &&
 	       !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate);
 }
-- 
cgit v1.2.3-59-g8ed1b


From dcd1f572954f9d66d7b4a65df894ed5b4c467368 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 18 Apr 2018 15:39:05 -0700
Subject: net/ipv6: Remove fib6_idev

fib6_idev can be obtained from __in6_dev_get on the nexthop device
rather than caching it in the fib6_info. Remove it.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  1 -
 net/ipv6/ip6_fib.c    |  2 --
 net/ipv6/route.c      | 66 ++++++++++++++++++++++++++++++++++++---------------
 3 files changed, 47 insertions(+), 22 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index bd11c990c353..642211174692 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -147,7 +147,6 @@ struct fib6_info {
 	unsigned int			fib6_nsiblings;
 
 	atomic_t			fib6_ref;
-	struct inet6_dev		*fib6_idev;
 	unsigned long			expires;
 	struct dst_metrics		*fib6_metrics;
 #define fib6_pmtu		fib6_metrics->metrics[RTAX_MTU-1]
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 353f0b3e7b0d..f936e91a8c65 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -197,8 +197,6 @@ void fib6_info_destroy(struct fib6_info *f6i)
 		}
 	}
 
-	if (f6i->fib6_idev)
-		in6_dev_put(f6i->fib6_idev);
 	if (f6i->fib6_nh.nh_dev)
 		dev_put(f6i->fib6_nh.nh_dev);
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 8cf4f0623229..08cae7323776 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -525,15 +525,17 @@ static void rt6_probe(struct fib6_info *rt)
 	rcu_read_lock_bh();
 	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
 	if (neigh) {
+		struct inet6_dev *idev;
+
 		if (neigh->nud_state & NUD_VALID)
 			goto out;
 
+		idev = __in6_dev_get(dev);
 		work = NULL;
 		write_lock(&neigh->lock);
 		if (!(neigh->nud_state & NUD_VALID) &&
 		    time_after(jiffies,
-			       neigh->updated +
-			       rt->fib6_idev->cnf.rtr_probe_interval)) {
+			       neigh->updated + idev->cnf.rtr_probe_interval)) {
 			work = kmalloc(sizeof(*work), GFP_ATOMIC);
 			if (work)
 				__neigh_set_probe_once(neigh);
@@ -622,18 +624,32 @@ static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
 	return m;
 }
 
+/* called with rc_read_lock held */
+static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
+{
+	const struct net_device *dev = fib6_info_nh_dev(f6i);
+	bool rc = false;
+
+	if (dev) {
+		const struct inet6_dev *idev = __in6_dev_get(dev);
+
+		rc = !!idev->cnf.ignore_routes_with_linkdown;
+	}
+
+	return rc;
+}
+
 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
 				   int *mpri, struct fib6_info *match,
 				   bool *do_rr)
 {
 	int m;
 	bool match_do_rr = false;
-	struct inet6_dev *idev = rt->fib6_idev;
 
 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
 		goto out;
 
-	if (idev->cnf.ignore_routes_with_linkdown &&
+	if (fib6_ignore_linkdown(rt) &&
 	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
 		goto out;
@@ -957,12 +973,12 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
 
 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
 {
+	struct net_device *dev = fib6_info_nh_dev(ort);
+
 	ip6_rt_init_dst(rt, ort);
 
 	rt->rt6i_dst = ort->fib6_dst;
-	rt->rt6i_idev = ort->fib6_idev;
-	if (rt->rt6i_idev)
-		in6_dev_hold(rt->rt6i_idev);
+	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
 	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
 	rt->rt6i_flags = ort->fib6_flags;
 	rt6_set_from(rt, ort);
@@ -1355,7 +1371,18 @@ static unsigned int fib6_mtu(const struct fib6_info *rt)
 {
 	unsigned int mtu;
 
-	mtu = rt->fib6_pmtu ? : rt->fib6_idev->cnf.mtu6;
+	if (rt->fib6_pmtu) {
+		mtu = rt->fib6_pmtu;
+	} else {
+		struct net_device *dev = fib6_info_nh_dev(rt);
+		struct inet6_dev *idev;
+
+		rcu_read_lock();
+		idev = __in6_dev_get(dev);
+		mtu = idev->cnf.mtu6;
+		rcu_read_unlock();
+	}
+
 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
 
 	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
@@ -2985,11 +3012,13 @@ install_route:
 		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
 	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
 	rt->fib6_nh.nh_dev = dev;
-	rt->fib6_idev = idev;
 	rt->fib6_table = table;
 
 	cfg->fc_nlinfo.nl_net = dev_net(dev);
 
+	if (idev)
+		in6_dev_put(idev);
+
 	return rt;
 out:
 	if (dev)
@@ -3425,8 +3454,11 @@ static void __rt6_purge_dflt_routers(struct net *net,
 restart:
 	rcu_read_lock();
 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
+		struct net_device *dev = fib6_info_nh_dev(rt);
+		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
+
 		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
-		    (!rt->fib6_idev || rt->fib6_idev->cnf.accept_ra != 2)) {
+		    (!idev || idev->cnf.accept_ra != 2)) {
 			fib6_info_hold(rt);
 			rcu_read_unlock();
 			ip6_del_rt(net, rt);
@@ -3585,10 +3617,6 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net,
 		return ERR_PTR(-ENOMEM);
 
 	f6i->dst_nocount = true;
-
-	in6_dev_hold(idev);
-	f6i->fib6_idev = idev;
-
 	f6i->dst_host = true;
 	f6i->fib6_protocol = RTPROT_KERNEL;
 	f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
@@ -3706,7 +3734,7 @@ static bool rt6_is_dead(const struct fib6_info *rt)
 {
 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
 	    (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
-	     rt->fib6_idev->cnf.ignore_routes_with_linkdown))
+	     fib6_ignore_linkdown(rt)))
 		return true;
 
 	return false;
@@ -4424,8 +4452,11 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
 
 	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
 		*flags |= RTNH_F_LINKDOWN;
-		if (rt->fib6_idev->cnf.ignore_routes_with_linkdown)
+
+		rcu_read_lock();
+		if (fib6_ignore_linkdown(rt))
 			*flags |= RTNH_F_DEAD;
+		rcu_read_unlock();
 	}
 
 	if (rt->fib6_flags & RTF_GATEWAY) {
@@ -4800,7 +4831,6 @@ static int ip6_route_dev_notify(struct notifier_block *this,
 
 	if (event == NETDEV_REGISTER) {
 		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
-		net->ipv6.fib6_null_entry->fib6_idev = in6_dev_get(dev);
 		net->ipv6.ip6_null_entry->dst.dev = dev;
 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
@@ -4814,7 +4844,6 @@ static int ip6_route_dev_notify(struct notifier_block *this,
 		/* NETDEV_UNREGISTER could be fired for multiple times by
 		 * netdev_wait_allrefs(). Make sure we only call this once.
 		 */
-		in6_dev_put_clear(&net->ipv6.fib6_null_entry->fib6_idev);
 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
@@ -5137,7 +5166,6 @@ void __init ip6_route_init_special_entries(void)
 	 * the loopback reference in rt6_info will not be taken, do it
 	 * manually for init_net */
 	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
-	init_net.ipv6.fib6_null_entry->fib6_idev = in6_dev_get(init_net.loopback_dev);
 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
-- 
cgit v1.2.3-59-g8ed1b


From 27b10608a2fe3dd91c9ae1f8f8489bf547c12f08 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 18 Apr 2018 15:39:06 -0700
Subject: net/ipv6: Fix gfp_flags arg to addrconf_prefix_route

Eric noticed that __ipv6_ifa_notify is called under rcu_read_lock, so
the gfp argument to addrconf_prefix_route can not be GFP_KERNEL.

While scrubbing other calls I noticed addrconf_addr_gen has one
place with GFP_ATOMIC that can be GFP_KERNEL.

Fixes: acb54e3cba404 ("net/ipv6: Add gfp_flags to route add functions")
Reported-by: syzbot+2add39b05179b31f912f@syzkaller.appspotmail.com
Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 6c42c5d5fafa..7b4d7bbf2c17 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3245,7 +3245,7 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
 			addrconf_add_linklocal(idev, &addr, 0);
 		else if (prefix_route)
 			addrconf_prefix_route(&addr, 64, idev->dev,
-					      0, 0, GFP_ATOMIC);
+					      0, 0, GFP_KERNEL);
 		break;
 	case IN6_ADDR_GEN_MODE_NONE:
 	default:
@@ -5620,7 +5620,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 		if (!ipv6_addr_any(&ifp->peer_addr))
 			addrconf_prefix_route(&ifp->peer_addr, 128,
 					      ifp->idev->dev, 0, 0,
-					      GFP_KERNEL);
+					      GFP_ATOMIC);
 		break;
 	case RTM_DELADDR:
 		if (ifp->idev->cnf.forwarding)
-- 
cgit v1.2.3-59-g8ed1b


From 69b693f0aefa0ed521e8bd02260523b5ae446ad7 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 18 Apr 2018 15:55:57 -0700
Subject: bpf: btf: Introduce BPF Type Format (BTF)

This patch introduces BPF type Format (BTF).

BTF (BPF Type Format) is the meta data format which describes
the data types of BPF program/map.  Hence, it basically focus
on the C programming language which the modern BPF is primary
using.  The first use case is to provide a generic pretty print
capability for a BPF map.

BTF has its root from CTF (Compact C-Type format).  To simplify
the handling of BTF data, BTF removes the differences between
small and big type/struct-member.  Hence, BTF consistently uses u32
instead of supporting both "one u16" and "two u32 (+padding)" in
describing type and struct-member.

It also raises the number of types (and functions) limit
from 0x7fff to 0x7fffffff.

Due to the above changes,  the format is not compatible to CTF.
Hence, BTF starts with a new BTF_MAGIC and version number.

This patch does the first verification pass to the BTF.  The first
pass checks:
1. meta-data size (e.g. It does not go beyond the total btf's size)
2. name_offset is valid
3. Each BTF_KIND (e.g. int, enum, struct....) does its
   own check of its meta-data.

Some other checks, like checking a struct's member is referring
to a valid type, can only be done in the second pass.  The second
verification pass will be implemented in the next patch.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/btf.h | 130 +++++++
 kernel/bpf/Makefile      |   1 +
 kernel/bpf/btf.c         | 915 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 1046 insertions(+)
 create mode 100644 include/uapi/linux/btf.h
 create mode 100644 kernel/bpf/btf.c

diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h
new file mode 100644
index 000000000000..74a30b1090df
--- /dev/null
+++ b/include/uapi/linux/btf.h
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* Copyright (c) 2018 Facebook */
+#ifndef _UAPI__LINUX_BTF_H__
+#define _UAPI__LINUX_BTF_H__
+
+#include <linux/types.h>
+
+#define BTF_MAGIC	0xeB9F
+#define BTF_MAGIC_SWAP	0x9FeB
+#define BTF_VERSION	1
+#define BTF_FLAGS_COMPR	0x01
+
+struct btf_header {
+	__u16	magic;
+	__u8	version;
+	__u8	flags;
+
+	__u32	parent_label;
+	__u32	parent_name;
+
+	/* All offsets are in bytes relative to the end of this header */
+	__u32	label_off;	/* offset of label section	*/
+	__u32	object_off;	/* offset of data object section*/
+	__u32	func_off;	/* offset of function section	*/
+	__u32	type_off;	/* offset of type section	*/
+	__u32	str_off;	/* offset of string section	*/
+	__u32	str_len;	/* length of string section	*/
+};
+
+/* Max # of type identifier */
+#define BTF_MAX_TYPE	0x7fffffff
+/* Max offset into the string section */
+#define BTF_MAX_NAME_OFFSET	0x7fffffff
+/* Max # of struct/union/enum members or func args */
+#define BTF_MAX_VLEN	0xffff
+
+/* The type id is referring to a parent BTF */
+#define BTF_TYPE_PARENT(id)	(((id) >> 31) & 0x1)
+#define BTF_TYPE_ID(id)		((id) & BTF_MAX_TYPE)
+
+/* String is in the ELF string section */
+#define BTF_STR_TBL_ELF_ID(ref)	(((ref) >> 31) & 0x1)
+#define BTF_STR_OFFSET(ref)	((ref) & BTF_MAX_NAME_OFFSET)
+
+struct btf_type {
+	__u32 name;
+	/* "info" bits arrangement
+	 * bits  0-15: vlen (e.g. # of struct's members)
+	 * bits 16-23: unused
+	 * bits 24-28: kind (e.g. int, ptr, array...etc)
+	 * bits 29-30: unused
+	 * bits    31: root
+	 */
+	__u32 info;
+	/* "size" is used by INT, ENUM, STRUCT and UNION.
+	 * "size" tells the size of the type it is describing.
+	 *
+	 * "type" is used by PTR, TYPEDEF, VOLATILE, CONST and RESTRICT.
+	 * "type" is a type_id referring to another type.
+	 */
+	union {
+		__u32 size;
+		__u32 type;
+	};
+};
+
+#define BTF_INFO_KIND(info)	(((info) >> 24) & 0x1f)
+#define BTF_INFO_ISROOT(info)	(!!(((info) >> 24) & 0x80))
+#define BTF_INFO_VLEN(info)	((info) & 0xffff)
+
+#define BTF_KIND_UNKN		0	/* Unknown	*/
+#define BTF_KIND_INT		1	/* Integer	*/
+#define BTF_KIND_PTR		2	/* Pointer	*/
+#define BTF_KIND_ARRAY		3	/* Array	*/
+#define BTF_KIND_STRUCT		4	/* Struct	*/
+#define BTF_KIND_UNION		5	/* Union	*/
+#define BTF_KIND_ENUM		6	/* Enumeration	*/
+#define BTF_KIND_FWD		7	/* Forward	*/
+#define BTF_KIND_TYPEDEF	8	/* Typedef	*/
+#define BTF_KIND_VOLATILE	9	/* Volatile	*/
+#define BTF_KIND_CONST		10	/* Const	*/
+#define BTF_KIND_RESTRICT	11	/* Restrict	*/
+#define BTF_KIND_MAX		11
+#define NR_BTF_KINDS		12
+
+/* For some specific BTF_KIND, "struct btf_type" is immediately
+ * followed by extra data.
+ */
+
+/* BTF_KIND_INT is followed by a u32 and the following
+ * is the 32 bits arrangement:
+ */
+#define BTF_INT_ENCODING(VAL)	(((VAL) & 0xff000000) >> 24)
+#define BTF_INT_OFFSET(VAL)	(((VAL  & 0x00ff0000)) >> 16)
+#define BTF_INT_BITS(VAL)	((VAL)  & 0x0000ffff)
+
+/* Attributes stored in the BTF_INT_ENCODING */
+#define BTF_INT_SIGNED	0x1
+#define BTF_INT_CHAR	0x2
+#define BTF_INT_BOOL	0x4
+#define BTF_INT_VARARGS	0x8
+
+/* BTF_KIND_ENUM is followed by multiple "struct btf_enum".
+ * The exact number of btf_enum is stored in the vlen (of the
+ * info in "struct btf_type").
+ */
+struct btf_enum {
+	__u32	name;
+	__s32	val;
+};
+
+/* BTF_KIND_ARRAY is followed by one "struct btf_array" */
+struct btf_array {
+	__u32	type;
+	__u32	index_type;
+	__u32	nelems;
+};
+
+/* BTF_KIND_STRUCT and BTF_KIND_UNION are followed
+ * by multiple "struct btf_member".  The exact number
+ * of btf_member is stored in the vlen (of the info in
+ * "struct btf_type").
+ */
+struct btf_member {
+	__u32	name;
+	__u32	type;
+	__u32	offset;	/* offset in bits */
+};
+
+#endif /* _UAPI__LINUX_BTF_H__ */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index a713fd23ec88..35c485fa9ea3 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -4,6 +4,7 @@ obj-y := core.o
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
+obj-$(CONFIG_BPF_SYSCALL) += btf.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
new file mode 100644
index 000000000000..26e9ed7cea5f
--- /dev/null
+++ b/kernel/bpf/btf.c
@@ -0,0 +1,915 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Facebook */
+
+#include <uapi/linux/btf.h>
+#include <uapi/linux/types.h>
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/bpf_verifier.h>
+#include <linux/btf.h>
+
+/* BTF (BPF Type Format) is the meta data format which describes
+ * the data types of BPF program/map.  Hence, it basically focus
+ * on the C programming language which the modern BPF is primary
+ * using.
+ *
+ * ELF Section:
+ * ~~~~~~~~~~~
+ * The BTF data is stored under the ".BTF" ELF section
+ *
+ * struct btf_type:
+ * ~~~~~~~~~~~~~~~
+ * Each 'struct btf_type' object describes a C data type.
+ * Depending on the type it is describing, a 'struct btf_type'
+ * object may be followed by more data.  F.e.
+ * To describe an array, 'struct btf_type' is followed by
+ * 'struct btf_array'.
+ *
+ * 'struct btf_type' and any extra data following it are
+ * 4 bytes aligned.
+ *
+ * Type section:
+ * ~~~~~~~~~~~~~
+ * The BTF type section contains a list of 'struct btf_type' objects.
+ * Each one describes a C type.  Recall from the above section
+ * that a 'struct btf_type' object could be immediately followed by extra
+ * data in order to desribe some particular C types.
+ *
+ * type_id:
+ * ~~~~~~~
+ * Each btf_type object is identified by a type_id.  The type_id
+ * is implicitly implied by the location of the btf_type object in
+ * the BTF type section.  The first one has type_id 1.  The second
+ * one has type_id 2...etc.  Hence, an earlier btf_type has
+ * a smaller type_id.
+ *
+ * A btf_type object may refer to another btf_type object by using
+ * type_id (i.e. the "type" in the "struct btf_type").
+ *
+ * NOTE that we cannot assume any reference-order.
+ * A btf_type object can refer to an earlier btf_type object
+ * but it can also refer to a later btf_type object.
+ *
+ * For example, to describe "const void *".  A btf_type
+ * object describing "const" may refer to another btf_type
+ * object describing "void *".  This type-reference is done
+ * by specifying type_id:
+ *
+ * [1] CONST (anon) type_id=2
+ * [2] PTR (anon) type_id=0
+ *
+ * The above is the btf_verifier debug log:
+ *   - Each line started with "[?]" is a btf_type object
+ *   - [?] is the type_id of the btf_type object.
+ *   - CONST/PTR is the BTF_KIND_XXX
+ *   - "(anon)" is the name of the type.  It just
+ *     happens that CONST and PTR has no name.
+ *   - type_id=XXX is the 'u32 type' in btf_type
+ *
+ * NOTE: "void" has type_id 0
+ *
+ * String section:
+ * ~~~~~~~~~~~~~~
+ * The BTF string section contains the names used by the type section.
+ * Each string is referred by an "offset" from the beginning of the
+ * string section.
+ *
+ * Each string is '\0' terminated.
+ *
+ * The first character in the string section must be '\0'
+ * which is used to mean 'anonymous'. Some btf_type may not
+ * have a name.
+ */
+
+/* BTF verification:
+ *
+ * To verify BTF data, two passes are needed.
+ *
+ * Pass #1
+ * ~~~~~~~
+ * The first pass is to collect all btf_type objects to
+ * an array: "btf->types".
+ *
+ * Depending on the C type that a btf_type is describing,
+ * a btf_type may be followed by extra data.  We don't know
+ * how many btf_type is there, and more importantly we don't
+ * know where each btf_type is located in the type section.
+ *
+ * Without knowing the location of each type_id, most verifications
+ * cannot be done.  e.g. an earlier btf_type may refer to a later
+ * btf_type (recall the "const void *" above), so we cannot
+ * check this type-reference in the first pass.
+ *
+ * In the first pass, it still does some verifications (e.g.
+ * checking the name is a valid offset to the string section).
+ */
+
+#define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE)
+#define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1)
+#define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK)
+#define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3)
+#define BITS_ROUNDUP_BYTES(bits) \
+	(BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits))
+
+/* 16MB for 64k structs and each has 16 members and
+ * a few MB spaces for the string section.
+ * The hard limit is S32_MAX.
+ */
+#define BTF_MAX_SIZE (16 * 1024 * 1024)
+/* 64k. We can raise it later. The hard limit is S32_MAX. */
+#define BTF_MAX_NR_TYPES 65535
+
+#define for_each_member(i, struct_type, member)			\
+	for (i = 0, member = btf_type_member(struct_type);	\
+	     i < btf_type_vlen(struct_type);			\
+	     i++, member++)
+
+struct btf {
+	union {
+		struct btf_header *hdr;
+		void *data;
+	};
+	struct btf_type **types;
+	const char *strings;
+	void *nohdr_data;
+	u32 nr_types;
+	u32 types_size;
+	u32 data_size;
+};
+
+struct btf_verifier_env {
+	struct btf *btf;
+	struct bpf_verifier_log log;
+	u32 log_type_id;
+};
+
+static const char * const btf_kind_str[NR_BTF_KINDS] = {
+	[BTF_KIND_UNKN]		= "UNKNOWN",
+	[BTF_KIND_INT]		= "INT",
+	[BTF_KIND_PTR]		= "PTR",
+	[BTF_KIND_ARRAY]	= "ARRAY",
+	[BTF_KIND_STRUCT]	= "STRUCT",
+	[BTF_KIND_UNION]	= "UNION",
+	[BTF_KIND_ENUM]		= "ENUM",
+	[BTF_KIND_FWD]		= "FWD",
+	[BTF_KIND_TYPEDEF]	= "TYPEDEF",
+	[BTF_KIND_VOLATILE]	= "VOLATILE",
+	[BTF_KIND_CONST]	= "CONST",
+	[BTF_KIND_RESTRICT]	= "RESTRICT",
+};
+
+struct btf_kind_operations {
+	s32 (*check_meta)(struct btf_verifier_env *env,
+			  const struct btf_type *t,
+			  u32 meta_left);
+	void (*log_details)(struct btf_verifier_env *env,
+			    const struct btf_type *t);
+};
+
+static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS];
+static struct btf_type btf_void;
+
+static const char *btf_int_encoding_str(u8 encoding)
+{
+	if (encoding == 0)
+		return "(none)";
+	else if (encoding == BTF_INT_SIGNED)
+		return "SIGNED";
+	else if (encoding == BTF_INT_CHAR)
+		return "CHAR";
+	else if (encoding == BTF_INT_BOOL)
+		return "BOOL";
+	else if (encoding == BTF_INT_VARARGS)
+		return "VARARGS";
+	else
+		return "UNKN";
+}
+
+static u16 btf_type_vlen(const struct btf_type *t)
+{
+	return BTF_INFO_VLEN(t->info);
+}
+
+static u32 btf_type_int(const struct btf_type *t)
+{
+	return *(u32 *)(t + 1);
+}
+
+static const struct btf_array *btf_type_array(const struct btf_type *t)
+{
+	return (const struct btf_array *)(t + 1);
+}
+
+static const struct btf_member *btf_type_member(const struct btf_type *t)
+{
+	return (const struct btf_member *)(t + 1);
+}
+
+static const struct btf_enum *btf_type_enum(const struct btf_type *t)
+{
+	return (const struct btf_enum *)(t + 1);
+}
+
+static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
+{
+	return kind_ops[BTF_INFO_KIND(t->info)];
+}
+
+static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
+{
+	return !BTF_STR_TBL_ELF_ID(offset) &&
+		BTF_STR_OFFSET(offset) < btf->hdr->str_len;
+}
+
+static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
+{
+	if (!BTF_STR_OFFSET(offset))
+		return "(anon)";
+	else if (BTF_STR_OFFSET(offset) < btf->hdr->str_len)
+		return &btf->strings[BTF_STR_OFFSET(offset)];
+	else
+		return "(invalid-name-offset)";
+}
+
+__printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log,
+					      const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	bpf_verifier_vlog(log, fmt, args);
+	va_end(args);
+}
+
+__printf(2, 3) static void btf_verifier_log(struct btf_verifier_env *env,
+					    const char *fmt, ...)
+{
+	struct bpf_verifier_log *log = &env->log;
+	va_list args;
+
+	if (!bpf_verifier_log_needed(log))
+		return;
+
+	va_start(args, fmt);
+	bpf_verifier_vlog(log, fmt, args);
+	va_end(args);
+}
+
+__printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
+						   const struct btf_type *t,
+						   bool log_details,
+						   const char *fmt, ...)
+{
+	struct bpf_verifier_log *log = &env->log;
+	u8 kind = BTF_INFO_KIND(t->info);
+	struct btf *btf = env->btf;
+	va_list args;
+
+	if (!bpf_verifier_log_needed(log))
+		return;
+
+	__btf_verifier_log(log, "[%u] %s %s%s",
+			   env->log_type_id,
+			   btf_kind_str[kind],
+			   btf_name_by_offset(btf, t->name),
+			   log_details ? " " : "");
+
+	if (log_details)
+		btf_type_ops(t)->log_details(env, t);
+
+	if (fmt && *fmt) {
+		__btf_verifier_log(log, " ");
+		va_start(args, fmt);
+		bpf_verifier_vlog(log, fmt, args);
+		va_end(args);
+	}
+
+	__btf_verifier_log(log, "\n");
+}
+
+#define btf_verifier_log_type(env, t, ...) \
+	__btf_verifier_log_type((env), (t), true, __VA_ARGS__)
+#define btf_verifier_log_basic(env, t, ...) \
+	__btf_verifier_log_type((env), (t), false, __VA_ARGS__)
+
+__printf(4, 5)
+static void btf_verifier_log_member(struct btf_verifier_env *env,
+				    const struct btf_type *struct_type,
+				    const struct btf_member *member,
+				    const char *fmt, ...)
+{
+	struct bpf_verifier_log *log = &env->log;
+	struct btf *btf = env->btf;
+	va_list args;
+
+	if (!bpf_verifier_log_needed(log))
+		return;
+
+	__btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u",
+			   btf_name_by_offset(btf, member->name),
+			   member->type, member->offset);
+
+	if (fmt && *fmt) {
+		__btf_verifier_log(log, " ");
+		va_start(args, fmt);
+		bpf_verifier_vlog(log, fmt, args);
+		va_end(args);
+	}
+
+	__btf_verifier_log(log, "\n");
+}
+
+static void btf_verifier_log_hdr(struct btf_verifier_env *env)
+{
+	struct bpf_verifier_log *log = &env->log;
+	const struct btf *btf = env->btf;
+	const struct btf_header *hdr;
+
+	if (!bpf_verifier_log_needed(log))
+		return;
+
+	hdr = btf->hdr;
+	__btf_verifier_log(log, "magic: 0x%x\n", hdr->magic);
+	__btf_verifier_log(log, "version: %u\n", hdr->version);
+	__btf_verifier_log(log, "flags: 0x%x\n", hdr->flags);
+	__btf_verifier_log(log, "parent_label: %u\n", hdr->parent_label);
+	__btf_verifier_log(log, "parent_name: %u\n", hdr->parent_name);
+	__btf_verifier_log(log, "label_off: %u\n", hdr->label_off);
+	__btf_verifier_log(log, "object_off: %u\n", hdr->object_off);
+	__btf_verifier_log(log, "func_off: %u\n", hdr->func_off);
+	__btf_verifier_log(log, "type_off: %u\n", hdr->type_off);
+	__btf_verifier_log(log, "str_off: %u\n", hdr->str_off);
+	__btf_verifier_log(log, "str_len: %u\n", hdr->str_len);
+	__btf_verifier_log(log, "btf_total_size: %u\n", btf->data_size);
+}
+
+static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
+{
+	struct btf *btf = env->btf;
+
+	/* < 2 because +1 for btf_void which is always in btf->types[0].
+	 * btf_void is not accounted in btf->nr_types because btf_void
+	 * does not come from the BTF file.
+	 */
+	if (btf->types_size - btf->nr_types < 2) {
+		/* Expand 'types' array */
+
+		struct btf_type **new_types;
+		u32 expand_by, new_size;
+
+		if (btf->types_size == BTF_MAX_NR_TYPES) {
+			btf_verifier_log(env, "Exceeded max num of types");
+			return -E2BIG;
+		}
+
+		expand_by = max_t(u32, btf->types_size >> 2, 16);
+		new_size = min_t(u32, BTF_MAX_NR_TYPES,
+				 btf->types_size + expand_by);
+
+		new_types = kvzalloc(new_size * sizeof(*new_types),
+				     GFP_KERNEL | __GFP_NOWARN);
+		if (!new_types)
+			return -ENOMEM;
+
+		if (btf->nr_types == 0)
+			new_types[0] = &btf_void;
+		else
+			memcpy(new_types, btf->types,
+			       sizeof(*btf->types) * (btf->nr_types + 1));
+
+		kvfree(btf->types);
+		btf->types = new_types;
+		btf->types_size = new_size;
+	}
+
+	btf->types[++(btf->nr_types)] = t;
+
+	return 0;
+}
+
+static void btf_free(struct btf *btf)
+{
+	kvfree(btf->types);
+	kvfree(btf->data);
+	kfree(btf);
+}
+
+static void btf_verifier_env_free(struct btf_verifier_env *env)
+{
+	kfree(env);
+}
+
+static s32 btf_int_check_meta(struct btf_verifier_env *env,
+			      const struct btf_type *t,
+			      u32 meta_left)
+{
+	u32 int_data, nr_bits, meta_needed = sizeof(int_data);
+	u16 encoding;
+
+	if (meta_left < meta_needed) {
+		btf_verifier_log_basic(env, t,
+				       "meta_left:%u meta_needed:%u",
+				       meta_left, meta_needed);
+		return -EINVAL;
+	}
+
+	if (btf_type_vlen(t)) {
+		btf_verifier_log_type(env, t, "vlen != 0");
+		return -EINVAL;
+	}
+
+	int_data = btf_type_int(t);
+	nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data);
+
+	if (nr_bits > BITS_PER_U64) {
+		btf_verifier_log_type(env, t, "nr_bits exceeds %zu",
+				      BITS_PER_U64);
+		return -EINVAL;
+	}
+
+	if (BITS_ROUNDUP_BYTES(nr_bits) > t->size) {
+		btf_verifier_log_type(env, t, "nr_bits exceeds type_size");
+		return -EINVAL;
+	}
+
+	encoding = BTF_INT_ENCODING(int_data);
+	if (encoding &&
+	    encoding != BTF_INT_SIGNED &&
+	    encoding != BTF_INT_CHAR &&
+	    encoding != BTF_INT_BOOL &&
+	    encoding != BTF_INT_VARARGS) {
+		btf_verifier_log_type(env, t, "Unsupported encoding");
+		return -ENOTSUPP;
+	}
+
+	btf_verifier_log_type(env, t, NULL);
+
+	return meta_needed;
+}
+
+static void btf_int_log(struct btf_verifier_env *env,
+			const struct btf_type *t)
+{
+	int int_data = btf_type_int(t);
+
+	btf_verifier_log(env,
+			 "size=%u bits_offset=%u nr_bits=%u encoding=%s",
+			 t->size, BTF_INT_OFFSET(int_data),
+			 BTF_INT_BITS(int_data),
+			 btf_int_encoding_str(BTF_INT_ENCODING(int_data)));
+}
+
+static const struct btf_kind_operations int_ops = {
+	.check_meta = btf_int_check_meta,
+	.log_details = btf_int_log,
+};
+
+static int btf_ref_type_check_meta(struct btf_verifier_env *env,
+				   const struct btf_type *t,
+				   u32 meta_left)
+{
+	if (btf_type_vlen(t)) {
+		btf_verifier_log_type(env, t, "vlen != 0");
+		return -EINVAL;
+	}
+
+	if (BTF_TYPE_PARENT(t->type)) {
+		btf_verifier_log_type(env, t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+	btf_verifier_log_type(env, t, NULL);
+
+	return 0;
+}
+
+static void btf_ref_type_log(struct btf_verifier_env *env,
+			     const struct btf_type *t)
+{
+	btf_verifier_log(env, "type_id=%u", t->type);
+}
+
+static struct btf_kind_operations modifier_ops = {
+	.check_meta = btf_ref_type_check_meta,
+	.log_details = btf_ref_type_log,
+};
+
+static struct btf_kind_operations ptr_ops = {
+	.check_meta = btf_ref_type_check_meta,
+	.log_details = btf_ref_type_log,
+};
+
+static struct btf_kind_operations fwd_ops = {
+	.check_meta = btf_ref_type_check_meta,
+	.log_details = btf_ref_type_log,
+};
+
+static s32 btf_array_check_meta(struct btf_verifier_env *env,
+				const struct btf_type *t,
+				u32 meta_left)
+{
+	const struct btf_array *array = btf_type_array(t);
+	u32 meta_needed = sizeof(*array);
+
+	if (meta_left < meta_needed) {
+		btf_verifier_log_basic(env, t,
+				       "meta_left:%u meta_needed:%u",
+				       meta_left, meta_needed);
+		return -EINVAL;
+	}
+
+	if (btf_type_vlen(t)) {
+		btf_verifier_log_type(env, t, "vlen != 0");
+		return -EINVAL;
+	}
+
+	/* We are a little forgiving on array->index_type since
+	 * the kernel is not using it.
+	 */
+	/* Array elem cannot be in type void,
+	 * so !array->type is not allowed.
+	 */
+	if (!array->type || BTF_TYPE_PARENT(array->type)) {
+		btf_verifier_log_type(env, t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+	btf_verifier_log_type(env, t, NULL);
+
+	return meta_needed;
+}
+
+static void btf_array_log(struct btf_verifier_env *env,
+			  const struct btf_type *t)
+{
+	const struct btf_array *array = btf_type_array(t);
+
+	btf_verifier_log(env, "type_id=%u index_type_id=%u nr_elems=%u",
+			 array->type, array->index_type, array->nelems);
+}
+
+static struct btf_kind_operations array_ops = {
+	.check_meta = btf_array_check_meta,
+	.log_details = btf_array_log,
+};
+
+static s32 btf_struct_check_meta(struct btf_verifier_env *env,
+				 const struct btf_type *t,
+				 u32 meta_left)
+{
+	bool is_union = BTF_INFO_KIND(t->info) == BTF_KIND_UNION;
+	const struct btf_member *member;
+	struct btf *btf = env->btf;
+	u32 struct_size = t->size;
+	u32 meta_needed;
+	u16 i;
+
+	meta_needed = btf_type_vlen(t) * sizeof(*member);
+	if (meta_left < meta_needed) {
+		btf_verifier_log_basic(env, t,
+				       "meta_left:%u meta_needed:%u",
+				       meta_left, meta_needed);
+		return -EINVAL;
+	}
+
+	btf_verifier_log_type(env, t, NULL);
+
+	for_each_member(i, t, member) {
+		if (!btf_name_offset_valid(btf, member->name)) {
+			btf_verifier_log_member(env, t, member,
+						"Invalid member name_offset:%u",
+						member->name);
+			return -EINVAL;
+		}
+
+		/* A member cannot be in type void */
+		if (!member->type || BTF_TYPE_PARENT(member->type)) {
+			btf_verifier_log_member(env, t, member,
+						"Invalid type_id");
+			return -EINVAL;
+		}
+
+		if (is_union && member->offset) {
+			btf_verifier_log_member(env, t, member,
+						"Invalid member bits_offset");
+			return -EINVAL;
+		}
+
+		if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) {
+			btf_verifier_log_member(env, t, member,
+						"Memmber bits_offset exceeds its struct size");
+			return -EINVAL;
+		}
+
+		btf_verifier_log_member(env, t, member, NULL);
+	}
+
+	return meta_needed;
+}
+
+static void btf_struct_log(struct btf_verifier_env *env,
+			   const struct btf_type *t)
+{
+	btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
+}
+
+static struct btf_kind_operations struct_ops = {
+	.check_meta = btf_struct_check_meta,
+	.log_details = btf_struct_log,
+};
+
+static s32 btf_enum_check_meta(struct btf_verifier_env *env,
+			       const struct btf_type *t,
+			       u32 meta_left)
+{
+	const struct btf_enum *enums = btf_type_enum(t);
+	struct btf *btf = env->btf;
+	u16 i, nr_enums;
+	u32 meta_needed;
+
+	nr_enums = btf_type_vlen(t);
+	meta_needed = nr_enums * sizeof(*enums);
+
+	if (meta_left < meta_needed) {
+		btf_verifier_log_basic(env, t,
+				       "meta_left:%u meta_needed:%u",
+				       meta_left, meta_needed);
+		return -EINVAL;
+	}
+
+	if (t->size != sizeof(int)) {
+		btf_verifier_log_type(env, t, "Expected size:%zu",
+				      sizeof(int));
+		return -EINVAL;
+	}
+
+	btf_verifier_log_type(env, t, NULL);
+
+	for (i = 0; i < nr_enums; i++) {
+		if (!btf_name_offset_valid(btf, enums[i].name)) {
+			btf_verifier_log(env, "\tInvalid name_offset:%u",
+					 enums[i].name);
+			return -EINVAL;
+		}
+
+		btf_verifier_log(env, "\t%s val=%d\n",
+				 btf_name_by_offset(btf, enums[i].name),
+				 enums[i].val);
+	}
+
+	return meta_needed;
+}
+
+static void btf_enum_log(struct btf_verifier_env *env,
+			 const struct btf_type *t)
+{
+	btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
+}
+
+static struct btf_kind_operations enum_ops = {
+	.check_meta = btf_enum_check_meta,
+	.log_details = btf_enum_log,
+};
+
+static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
+	[BTF_KIND_INT] = &int_ops,
+	[BTF_KIND_PTR] = &ptr_ops,
+	[BTF_KIND_ARRAY] = &array_ops,
+	[BTF_KIND_STRUCT] = &struct_ops,
+	[BTF_KIND_UNION] = &struct_ops,
+	[BTF_KIND_ENUM] = &enum_ops,
+	[BTF_KIND_FWD] = &fwd_ops,
+	[BTF_KIND_TYPEDEF] = &modifier_ops,
+	[BTF_KIND_VOLATILE] = &modifier_ops,
+	[BTF_KIND_CONST] = &modifier_ops,
+	[BTF_KIND_RESTRICT] = &modifier_ops,
+};
+
+static s32 btf_check_meta(struct btf_verifier_env *env,
+			  const struct btf_type *t,
+			  u32 meta_left)
+{
+	u32 saved_meta_left = meta_left;
+	s32 var_meta_size;
+
+	if (meta_left < sizeof(*t)) {
+		btf_verifier_log(env, "[%u] meta_left:%u meta_needed:%zu",
+				 env->log_type_id, meta_left, sizeof(*t));
+		return -EINVAL;
+	}
+	meta_left -= sizeof(*t);
+
+	if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX ||
+	    BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) {
+		btf_verifier_log(env, "[%u] Invalid kind:%u",
+				 env->log_type_id, BTF_INFO_KIND(t->info));
+		return -EINVAL;
+	}
+
+	if (!btf_name_offset_valid(env->btf, t->name)) {
+		btf_verifier_log(env, "[%u] Invalid name_offset:%u",
+				 env->log_type_id, t->name);
+		return -EINVAL;
+	}
+
+	var_meta_size = btf_type_ops(t)->check_meta(env, t, meta_left);
+	if (var_meta_size < 0)
+		return var_meta_size;
+
+	meta_left -= var_meta_size;
+
+	return saved_meta_left - meta_left;
+}
+
+static int btf_check_all_metas(struct btf_verifier_env *env)
+{
+	struct btf *btf = env->btf;
+	struct btf_header *hdr;
+	void *cur, *end;
+
+	hdr = btf->hdr;
+	cur = btf->nohdr_data + hdr->type_off;
+	end = btf->nohdr_data + hdr->str_off;
+
+	env->log_type_id = 1;
+	while (cur < end) {
+		struct btf_type *t = cur;
+		s32 meta_size;
+
+		meta_size = btf_check_meta(env, t, end - cur);
+		if (meta_size < 0)
+			return meta_size;
+
+		btf_add_type(env, t);
+		cur += meta_size;
+		env->log_type_id++;
+	}
+
+	return 0;
+}
+
+static int btf_parse_type_sec(struct btf_verifier_env *env)
+{
+	return btf_check_all_metas(env);
+}
+
+static int btf_parse_str_sec(struct btf_verifier_env *env)
+{
+	const struct btf_header *hdr;
+	struct btf *btf = env->btf;
+	const char *start, *end;
+
+	hdr = btf->hdr;
+	start = btf->nohdr_data + hdr->str_off;
+	end = start + hdr->str_len;
+
+	if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET ||
+	    start[0] || end[-1]) {
+		btf_verifier_log(env, "Invalid string section");
+		return -EINVAL;
+	}
+
+	btf->strings = start;
+
+	return 0;
+}
+
+static int btf_parse_hdr(struct btf_verifier_env *env)
+{
+	const struct btf_header *hdr;
+	struct btf *btf = env->btf;
+	u32 meta_left;
+
+	if (btf->data_size < sizeof(*hdr)) {
+		btf_verifier_log(env, "btf_header not found");
+		return -EINVAL;
+	}
+
+	btf_verifier_log_hdr(env);
+
+	hdr = btf->hdr;
+	if (hdr->magic != BTF_MAGIC) {
+		btf_verifier_log(env, "Invalid magic");
+		return -EINVAL;
+	}
+
+	if (hdr->version != BTF_VERSION) {
+		btf_verifier_log(env, "Unsupported version");
+		return -ENOTSUPP;
+	}
+
+	if (hdr->flags) {
+		btf_verifier_log(env, "Unsupported flags");
+		return -ENOTSUPP;
+	}
+
+	meta_left = btf->data_size - sizeof(*hdr);
+	if (!meta_left) {
+		btf_verifier_log(env, "No data");
+		return -EINVAL;
+	}
+
+	if (meta_left < hdr->type_off || hdr->str_off <= hdr->type_off ||
+	    /* Type section must align to 4 bytes */
+	    hdr->type_off & (sizeof(u32) - 1)) {
+		btf_verifier_log(env, "Invalid type_off");
+		return -EINVAL;
+	}
+
+	if (meta_left < hdr->str_off ||
+	    meta_left - hdr->str_off < hdr->str_len) {
+		btf_verifier_log(env, "Invalid str_off or str_len");
+		return -EINVAL;
+	}
+
+	btf->nohdr_data = btf->hdr + 1;
+
+	return 0;
+}
+
+static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
+			     u32 log_level, char __user *log_ubuf, u32 log_size)
+{
+	struct btf_verifier_env *env = NULL;
+	struct bpf_verifier_log *log;
+	struct btf *btf = NULL;
+	u8 *data;
+	int err;
+
+	if (btf_data_size > BTF_MAX_SIZE)
+		return ERR_PTR(-E2BIG);
+
+	env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
+	if (!env)
+		return ERR_PTR(-ENOMEM);
+
+	log = &env->log;
+	if (log_level || log_ubuf || log_size) {
+		/* user requested verbose verifier output
+		 * and supplied buffer to store the verification trace
+		 */
+		log->level = log_level;
+		log->ubuf = log_ubuf;
+		log->len_total = log_size;
+
+		/* log attributes have to be sane */
+		if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
+		    !log->level || !log->ubuf) {
+			err = -EINVAL;
+			goto errout;
+		}
+	}
+
+	btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
+	if (!btf) {
+		err = -ENOMEM;
+		goto errout;
+	}
+
+	data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN);
+	if (!data) {
+		err = -ENOMEM;
+		goto errout;
+	}
+
+	btf->data = data;
+	btf->data_size = btf_data_size;
+
+	if (copy_from_user(data, btf_data, btf_data_size)) {
+		err = -EFAULT;
+		goto errout;
+	}
+
+	env->btf = btf;
+
+	err = btf_parse_hdr(env);
+	if (err)
+		goto errout;
+
+	err = btf_parse_str_sec(env);
+	if (err)
+		goto errout;
+
+	err = btf_parse_type_sec(env);
+	if (err)
+		goto errout;
+
+	if (!err && log->level && bpf_verifier_log_full(log)) {
+		err = -ENOSPC;
+		goto errout;
+	}
+
+	if (!err) {
+		btf_verifier_env_free(env);
+		return btf;
+	}
+
+errout:
+	btf_verifier_env_free(env);
+	if (btf)
+		btf_free(btf);
+	return ERR_PTR(err);
+}
-- 
cgit v1.2.3-59-g8ed1b


From eb3f595dab40b61011c5f123507a7db2df6f0e65 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 18 Apr 2018 15:55:58 -0700
Subject: bpf: btf: Validate type reference

After collecting all btf_type in the first pass in an earlier patch,
the second pass (in this patch) can validate the reference types
(e.g. the referring type does exist and it does not refer to itself).

While checking the reference type, it also gathers other information (e.g.
the size of an array).  This info will be useful in checking the
struct's members in a later patch.  They will also be useful in doing
pretty print later.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/btf.h |  37 +++
 kernel/bpf/btf.c    | 666 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 702 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/btf.h

diff --git a/include/linux/btf.h b/include/linux/btf.h
new file mode 100644
index 000000000000..f14b60368753
--- /dev/null
+++ b/include/linux/btf.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Facebook */
+
+#ifndef _LINUX_BTF_H
+#define _LINUX_BTF_H 1
+
+#include <linux/types.h>
+
+struct btf;
+struct btf_type;
+
+/* Figure out the size of a type_id.  If type_id is a modifier
+ * (e.g. const), it will be resolved to find out the type with size.
+ *
+ * For example:
+ * In describing "const void *",  type_id is "const" and "const"
+ * refers to "void *".  The return type will be "void *".
+ *
+ * If type_id is a simple "int", then return type will be "int".
+ *
+ * @btf: struct btf object
+ * @type_id: Find out the size of type_id. The type_id of the return
+ *           type is set to *type_id.
+ * @ret_size: It can be NULL.  If not NULL, the size of the return
+ *            type is set to *ret_size.
+ * Return: The btf_type (resolved to another type with size info if needed).
+ *         NULL is returned if type_id itself does not have size info
+ *         (e.g. void) or it cannot be resolved to another type that
+ *         has size info.
+ *         *type_id and *ret_size will not be changed in the
+ *         NULL return case.
+ */
+const struct btf_type *btf_type_id_size(const struct btf *btf,
+					u32 *type_id,
+					u32 *ret_size);
+
+#endif
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 26e9ed7cea5f..18bf266ceeda 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -105,6 +105,50 @@
  *
  * In the first pass, it still does some verifications (e.g.
  * checking the name is a valid offset to the string section).
+ *
+ * Pass #2
+ * ~~~~~~~
+ * The main focus is to resolve a btf_type that is referring
+ * to another type.
+ *
+ * We have to ensure the referring type:
+ * 1) does exist in the BTF (i.e. in btf->types[])
+ * 2) does not cause a loop:
+ *	struct A {
+ *		struct B b;
+ *	};
+ *
+ *	struct B {
+ *		struct A a;
+ *	};
+ *
+ * btf_type_needs_resolve() decides if a btf_type needs
+ * to be resolved.
+ *
+ * The needs_resolve type implements the "resolve()" ops which
+ * essentially does a DFS and detects backedge.
+ *
+ * During resolve (or DFS), different C types have different
+ * "RESOLVED" conditions.
+ *
+ * When resolving a BTF_KIND_STRUCT, we need to resolve all its
+ * members because a member is always referring to another
+ * type.  A struct's member can be treated as "RESOLVED" if
+ * it is referring to a BTF_KIND_PTR.  Otherwise, the
+ * following valid C struct would be rejected:
+ *
+ *	struct A {
+ *		int m;
+ *		struct A *a;
+ *	};
+ *
+ * When resolving a BTF_KIND_PTR, it needs to keep resolving if
+ * it is referring to another BTF_KIND_PTR.  Otherwise, we cannot
+ * detect a pointer loop, e.g.:
+ * BTF_KIND_CONST -> BTF_KIND_PTR -> BTF_KIND_CONST -> BTF_KIND_PTR +
+ *                        ^                                         |
+ *                        +-----------------------------------------+
+ *
  */
 
 #define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE)
@@ -127,12 +171,19 @@
 	     i < btf_type_vlen(struct_type);			\
 	     i++, member++)
 
+#define for_each_member_from(i, from, struct_type, member)		\
+	for (i = from, member = btf_type_member(struct_type) + from;	\
+	     i < btf_type_vlen(struct_type);				\
+	     i++, member++)
+
 struct btf {
 	union {
 		struct btf_header *hdr;
 		void *data;
 	};
 	struct btf_type **types;
+	u32 *resolved_ids;
+	u32 *resolved_sizes;
 	const char *strings;
 	void *nohdr_data;
 	u32 nr_types;
@@ -140,10 +191,42 @@ struct btf {
 	u32 data_size;
 };
 
+enum verifier_phase {
+	CHECK_META,
+	CHECK_TYPE,
+};
+
+struct resolve_vertex {
+	const struct btf_type *t;
+	u32 type_id;
+	u16 next_member;
+};
+
+enum visit_state {
+	NOT_VISITED,
+	VISITED,
+	RESOLVED,
+};
+
+enum resolve_mode {
+	RESOLVE_TBD,	/* To Be Determined */
+	RESOLVE_PTR,	/* Resolving for Pointer */
+	RESOLVE_STRUCT_OR_ARRAY,	/* Resolving for struct/union
+					 * or array
+					 */
+};
+
+#define MAX_RESOLVE_DEPTH 32
+
 struct btf_verifier_env {
 	struct btf *btf;
+	u8 *visit_states;
+	struct resolve_vertex stack[MAX_RESOLVE_DEPTH];
 	struct bpf_verifier_log log;
 	u32 log_type_id;
+	u32 top_stack;
+	enum verifier_phase phase;
+	enum resolve_mode resolve_mode;
 };
 
 static const char * const btf_kind_str[NR_BTF_KINDS] = {
@@ -165,6 +248,8 @@ struct btf_kind_operations {
 	s32 (*check_meta)(struct btf_verifier_env *env,
 			  const struct btf_type *t,
 			  u32 meta_left);
+	int (*resolve)(struct btf_verifier_env *env,
+		       const struct resolve_vertex *v);
 	void (*log_details)(struct btf_verifier_env *env,
 			    const struct btf_type *t);
 };
@@ -172,6 +257,101 @@ struct btf_kind_operations {
 static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS];
 static struct btf_type btf_void;
 
+static bool btf_type_is_modifier(const struct btf_type *t)
+{
+	/* Some of them is not strictly a C modifier
+	 * but they are grouped into the same bucket
+	 * for BTF concern:
+	 *   A type (t) that refers to another
+	 *   type through t->type AND its size cannot
+	 *   be determined without following the t->type.
+	 *
+	 * ptr does not fall into this bucket
+	 * because its size is always sizeof(void *).
+	 */
+	switch (BTF_INFO_KIND(t->info)) {
+	case BTF_KIND_TYPEDEF:
+	case BTF_KIND_VOLATILE:
+	case BTF_KIND_CONST:
+	case BTF_KIND_RESTRICT:
+		return true;
+	}
+
+	return false;
+}
+
+static bool btf_type_is_void(const struct btf_type *t)
+{
+	/* void => no type and size info.
+	 * Hence, FWD is also treated as void.
+	 */
+	return t == &btf_void || BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
+}
+
+static bool btf_type_is_void_or_null(const struct btf_type *t)
+{
+	return !t || btf_type_is_void(t);
+}
+
+/* union is only a special case of struct:
+ * all its offsetof(member) == 0
+ */
+static bool btf_type_is_struct(const struct btf_type *t)
+{
+	u8 kind = BTF_INFO_KIND(t->info);
+
+	return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
+}
+
+static bool btf_type_is_array(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
+}
+
+static bool btf_type_is_ptr(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info) == BTF_KIND_PTR;
+}
+
+static bool btf_type_is_int(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info) == BTF_KIND_INT;
+}
+
+/* What types need to be resolved?
+ *
+ * btf_type_is_modifier() is an obvious one.
+ *
+ * btf_type_is_struct() because its member refers to
+ * another type (through member->type).
+
+ * btf_type_is_array() because its element (array->type)
+ * refers to another type.  Array can be thought of a
+ * special case of struct while array just has the same
+ * member-type repeated by array->nelems of times.
+ */
+static bool btf_type_needs_resolve(const struct btf_type *t)
+{
+	return btf_type_is_modifier(t) ||
+		btf_type_is_ptr(t) ||
+		btf_type_is_struct(t) ||
+		btf_type_is_array(t);
+}
+
+/* t->size can be used */
+static bool btf_type_has_size(const struct btf_type *t)
+{
+	switch (BTF_INFO_KIND(t->info)) {
+	case BTF_KIND_INT:
+	case BTF_KIND_STRUCT:
+	case BTF_KIND_UNION:
+	case BTF_KIND_ENUM:
+		return true;
+	}
+
+	return false;
+}
+
 static const char *btf_int_encoding_str(u8 encoding)
 {
 	if (encoding == 0)
@@ -234,6 +414,14 @@ static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
 		return "(invalid-name-offset)";
 }
 
+static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
+{
+	if (type_id > btf->nr_types)
+		return NULL;
+
+	return btf->types[type_id];
+}
+
 __printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log,
 					      const char *fmt, ...)
 {
@@ -308,6 +496,15 @@ static void btf_verifier_log_member(struct btf_verifier_env *env,
 	if (!bpf_verifier_log_needed(log))
 		return;
 
+	/* The CHECK_META phase already did a btf dump.
+	 *
+	 * If member is logged again, it must hit an error in
+	 * parsing this member.  It is useful to print out which
+	 * struct this member belongs to.
+	 */
+	if (env->phase != CHECK_META)
+		btf_verifier_log_type(env, struct_type, NULL);
+
 	__btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u",
 			   btf_name_by_offset(btf, member->name),
 			   member->type, member->offset);
@@ -393,15 +590,183 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
 static void btf_free(struct btf *btf)
 {
 	kvfree(btf->types);
+	kvfree(btf->resolved_sizes);
+	kvfree(btf->resolved_ids);
 	kvfree(btf->data);
 	kfree(btf);
 }
 
+static int env_resolve_init(struct btf_verifier_env *env)
+{
+	struct btf *btf = env->btf;
+	u32 nr_types = btf->nr_types;
+	u32 *resolved_sizes = NULL;
+	u32 *resolved_ids = NULL;
+	u8 *visit_states = NULL;
+
+	/* +1 for btf_void */
+	resolved_sizes = kvzalloc((nr_types + 1) * sizeof(*resolved_sizes),
+				  GFP_KERNEL | __GFP_NOWARN);
+	if (!resolved_sizes)
+		goto nomem;
+
+	resolved_ids = kvzalloc((nr_types + 1) * sizeof(*resolved_ids),
+				GFP_KERNEL | __GFP_NOWARN);
+	if (!resolved_ids)
+		goto nomem;
+
+	visit_states = kvzalloc((nr_types + 1) * sizeof(*visit_states),
+				GFP_KERNEL | __GFP_NOWARN);
+	if (!visit_states)
+		goto nomem;
+
+	btf->resolved_sizes = resolved_sizes;
+	btf->resolved_ids = resolved_ids;
+	env->visit_states = visit_states;
+
+	return 0;
+
+nomem:
+	kvfree(resolved_sizes);
+	kvfree(resolved_ids);
+	kvfree(visit_states);
+	return -ENOMEM;
+}
+
 static void btf_verifier_env_free(struct btf_verifier_env *env)
 {
+	kvfree(env->visit_states);
 	kfree(env);
 }
 
+static bool env_type_is_resolve_sink(const struct btf_verifier_env *env,
+				     const struct btf_type *next_type)
+{
+	switch (env->resolve_mode) {
+	case RESOLVE_TBD:
+		/* int, enum or void is a sink */
+		return !btf_type_needs_resolve(next_type);
+	case RESOLVE_PTR:
+		/* int, enum, void, struct or array is a sink for ptr */
+		return !btf_type_is_modifier(next_type) &&
+			!btf_type_is_ptr(next_type);
+	case RESOLVE_STRUCT_OR_ARRAY:
+		/* int, enum, void or ptr is a sink for struct and array */
+		return !btf_type_is_modifier(next_type) &&
+			!btf_type_is_array(next_type) &&
+			!btf_type_is_struct(next_type);
+	default:
+		BUG_ON(1);
+	}
+}
+
+static bool env_type_is_resolved(const struct btf_verifier_env *env,
+				 u32 type_id)
+{
+	return env->visit_states[type_id] == RESOLVED;
+}
+
+static int env_stack_push(struct btf_verifier_env *env,
+			  const struct btf_type *t, u32 type_id)
+{
+	struct resolve_vertex *v;
+
+	if (env->top_stack == MAX_RESOLVE_DEPTH)
+		return -E2BIG;
+
+	if (env->visit_states[type_id] != NOT_VISITED)
+		return -EEXIST;
+
+	env->visit_states[type_id] = VISITED;
+
+	v = &env->stack[env->top_stack++];
+	v->t = t;
+	v->type_id = type_id;
+	v->next_member = 0;
+
+	if (env->resolve_mode == RESOLVE_TBD) {
+		if (btf_type_is_ptr(t))
+			env->resolve_mode = RESOLVE_PTR;
+		else if (btf_type_is_struct(t) || btf_type_is_array(t))
+			env->resolve_mode = RESOLVE_STRUCT_OR_ARRAY;
+	}
+
+	return 0;
+}
+
+static void env_stack_set_next_member(struct btf_verifier_env *env,
+				      u16 next_member)
+{
+	env->stack[env->top_stack - 1].next_member = next_member;
+}
+
+static void env_stack_pop_resolved(struct btf_verifier_env *env,
+				   u32 resolved_type_id,
+				   u32 resolved_size)
+{
+	u32 type_id = env->stack[--(env->top_stack)].type_id;
+	struct btf *btf = env->btf;
+
+	btf->resolved_sizes[type_id] = resolved_size;
+	btf->resolved_ids[type_id] = resolved_type_id;
+	env->visit_states[type_id] = RESOLVED;
+}
+
+static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env)
+{
+	return env->top_stack ? &env->stack[env->top_stack - 1] : NULL;
+}
+
+/* The input param "type_id" must point to a needs_resolve type */
+static const struct btf_type *btf_type_id_resolve(const struct btf *btf,
+						  u32 *type_id)
+{
+	*type_id = btf->resolved_ids[*type_id];
+	return btf_type_by_id(btf, *type_id);
+}
+
+const struct btf_type *btf_type_id_size(const struct btf *btf,
+					u32 *type_id, u32 *ret_size)
+{
+	const struct btf_type *size_type;
+	u32 size_type_id = *type_id;
+	u32 size = 0;
+
+	size_type = btf_type_by_id(btf, size_type_id);
+	if (btf_type_is_void_or_null(size_type))
+		return NULL;
+
+	if (btf_type_has_size(size_type)) {
+		size = size_type->size;
+	} else if (btf_type_is_array(size_type)) {
+		size = btf->resolved_sizes[size_type_id];
+	} else if (btf_type_is_ptr(size_type)) {
+		size = sizeof(void *);
+	} else {
+		if (WARN_ON_ONCE(!btf_type_is_modifier(size_type)))
+			return NULL;
+
+		size = btf->resolved_sizes[size_type_id];
+		size_type_id = btf->resolved_ids[size_type_id];
+		size_type = btf_type_by_id(btf, size_type_id);
+		if (btf_type_is_void(size_type))
+			return NULL;
+	}
+
+	*type_id = size_type_id;
+	if (ret_size)
+		*ret_size = size;
+
+	return size_type;
+}
+
+static int btf_df_resolve(struct btf_verifier_env *env,
+			  const struct resolve_vertex *v)
+{
+	btf_verifier_log_basic(env, v->t, "Unsupported resolve");
+	return -EINVAL;
+}
+
 static s32 btf_int_check_meta(struct btf_verifier_env *env,
 			      const struct btf_type *t,
 			      u32 meta_left)
@@ -464,6 +829,7 @@ static void btf_int_log(struct btf_verifier_env *env,
 
 static const struct btf_kind_operations int_ops = {
 	.check_meta = btf_int_check_meta,
+	.resolve = btf_df_resolve,
 	.log_details = btf_int_log,
 };
 
@@ -486,6 +852,104 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
 	return 0;
 }
 
+static int btf_modifier_resolve(struct btf_verifier_env *env,
+				const struct resolve_vertex *v)
+{
+	const struct btf_type *t = v->t;
+	const struct btf_type *next_type;
+	u32 next_type_id = t->type;
+	struct btf *btf = env->btf;
+	u32 next_type_size = 0;
+
+	next_type = btf_type_by_id(btf, next_type_id);
+	if (!next_type) {
+		btf_verifier_log_type(env, v->t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+	/* "typedef void new_void", "const void"...etc */
+	if (btf_type_is_void(next_type))
+		goto resolved;
+
+	if (!env_type_is_resolve_sink(env, next_type) &&
+	    !env_type_is_resolved(env, next_type_id))
+		return env_stack_push(env, next_type, next_type_id);
+
+	/* Figure out the resolved next_type_id with size.
+	 * They will be stored in the current modifier's
+	 * resolved_ids and resolved_sizes such that it can
+	 * save us a few type-following when we use it later (e.g. in
+	 * pretty print).
+	 */
+	if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
+	    !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) {
+		btf_verifier_log_type(env, v->t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+resolved:
+	env_stack_pop_resolved(env, next_type_id, next_type_size);
+
+	return 0;
+}
+
+static int btf_ptr_resolve(struct btf_verifier_env *env,
+			   const struct resolve_vertex *v)
+{
+	const struct btf_type *next_type;
+	const struct btf_type *t = v->t;
+	u32 next_type_id = t->type;
+	struct btf *btf = env->btf;
+	u32 next_type_size = 0;
+
+	next_type = btf_type_by_id(btf, next_type_id);
+	if (!next_type) {
+		btf_verifier_log_type(env, v->t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+	/* "void *" */
+	if (btf_type_is_void(next_type))
+		goto resolved;
+
+	if (!env_type_is_resolve_sink(env, next_type) &&
+	    !env_type_is_resolved(env, next_type_id))
+		return env_stack_push(env, next_type, next_type_id);
+
+	/* If the modifier was RESOLVED during RESOLVE_STRUCT_OR_ARRAY,
+	 * the modifier may have stopped resolving when it was resolved
+	 * to a ptr (last-resolved-ptr).
+	 *
+	 * We now need to continue from the last-resolved-ptr to
+	 * ensure the last-resolved-ptr will not referring back to
+	 * the currenct ptr (t).
+	 */
+	if (btf_type_is_modifier(next_type)) {
+		const struct btf_type *resolved_type;
+		u32 resolved_type_id;
+
+		resolved_type_id = next_type_id;
+		resolved_type = btf_type_id_resolve(btf, &resolved_type_id);
+
+		if (btf_type_is_ptr(resolved_type) &&
+		    !env_type_is_resolve_sink(env, resolved_type) &&
+		    !env_type_is_resolved(env, resolved_type_id))
+			return env_stack_push(env, resolved_type,
+					      resolved_type_id);
+	}
+
+	if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
+	    !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) {
+		btf_verifier_log_type(env, v->t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+resolved:
+	env_stack_pop_resolved(env, next_type_id, 0);
+
+	return 0;
+}
+
 static void btf_ref_type_log(struct btf_verifier_env *env,
 			     const struct btf_type *t)
 {
@@ -494,16 +958,19 @@ static void btf_ref_type_log(struct btf_verifier_env *env,
 
 static struct btf_kind_operations modifier_ops = {
 	.check_meta = btf_ref_type_check_meta,
+	.resolve = btf_modifier_resolve,
 	.log_details = btf_ref_type_log,
 };
 
 static struct btf_kind_operations ptr_ops = {
 	.check_meta = btf_ref_type_check_meta,
+	.resolve = btf_ptr_resolve,
 	.log_details = btf_ref_type_log,
 };
 
 static struct btf_kind_operations fwd_ops = {
 	.check_meta = btf_ref_type_check_meta,
+	.resolve = btf_df_resolve,
 	.log_details = btf_ref_type_log,
 };
 
@@ -542,6 +1009,61 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env,
 	return meta_needed;
 }
 
+static int btf_array_resolve(struct btf_verifier_env *env,
+			     const struct resolve_vertex *v)
+{
+	const struct btf_array *array = btf_type_array(v->t);
+	const struct btf_type *elem_type;
+	u32 elem_type_id = array->type;
+	struct btf *btf = env->btf;
+	u32 elem_size;
+
+	elem_type = btf_type_by_id(btf, elem_type_id);
+	if (btf_type_is_void_or_null(elem_type)) {
+		btf_verifier_log_type(env, v->t,
+				      "Invalid elem");
+		return -EINVAL;
+	}
+
+	if (!env_type_is_resolve_sink(env, elem_type) &&
+	    !env_type_is_resolved(env, elem_type_id))
+		return env_stack_push(env, elem_type, elem_type_id);
+
+	elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
+	if (!elem_type) {
+		btf_verifier_log_type(env, v->t, "Invalid elem");
+		return -EINVAL;
+	}
+
+	if (btf_type_is_int(elem_type)) {
+		int int_type_data = btf_type_int(elem_type);
+		u16 nr_bits = BTF_INT_BITS(int_type_data);
+		u16 nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
+
+		/* Put more restriction on array of int.  The int cannot
+		 * be a bit field and it must be either u8/u16/u32/u64.
+		 */
+		if (BITS_PER_BYTE_MASKED(nr_bits) ||
+		    BTF_INT_OFFSET(int_type_data) ||
+		    (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) &&
+		     nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) {
+			btf_verifier_log_type(env, v->t,
+					      "Invalid array of int");
+			return -EINVAL;
+		}
+	}
+
+	if (array->nelems && elem_size > U32_MAX / array->nelems) {
+		btf_verifier_log_type(env, v->t,
+				      "Array size overflows U32_MAX");
+		return -EINVAL;
+	}
+
+	env_stack_pop_resolved(env, elem_type_id, elem_size * array->nelems);
+
+	return 0;
+}
+
 static void btf_array_log(struct btf_verifier_env *env,
 			  const struct btf_type *t)
 {
@@ -553,6 +1075,7 @@ static void btf_array_log(struct btf_verifier_env *env,
 
 static struct btf_kind_operations array_ops = {
 	.check_meta = btf_array_check_meta,
+	.resolve = btf_array_resolve,
 	.log_details = btf_array_log,
 };
 
@@ -610,6 +1133,50 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
 	return meta_needed;
 }
 
+static int btf_struct_resolve(struct btf_verifier_env *env,
+			      const struct resolve_vertex *v)
+{
+	const struct btf_member *member;
+	u16 i;
+
+	/* Before continue resolving the next_member,
+	 * ensure the last member is indeed resolved to a
+	 * type with size info.
+	 */
+	if (v->next_member) {
+		const struct btf_member *last_member;
+		u16 last_member_type_id;
+
+		last_member = btf_type_member(v->t) + v->next_member - 1;
+		last_member_type_id = last_member->type;
+		if (WARN_ON_ONCE(!env_type_is_resolved(env,
+						       last_member_type_id)))
+			return -EINVAL;
+	}
+
+	for_each_member_from(i, v->next_member, v->t, member) {
+		u32 member_type_id = member->type;
+		const struct btf_type *member_type = btf_type_by_id(env->btf,
+								member_type_id);
+
+		if (btf_type_is_void_or_null(member_type)) {
+			btf_verifier_log_member(env, v->t, member,
+						"Invalid member");
+			return -EINVAL;
+		}
+
+		if (!env_type_is_resolve_sink(env, member_type) &&
+		    !env_type_is_resolved(env, member_type_id)) {
+			env_stack_set_next_member(env, i + 1);
+			return env_stack_push(env, member_type, member_type_id);
+		}
+	}
+
+	env_stack_pop_resolved(env, 0, 0);
+
+	return 0;
+}
+
 static void btf_struct_log(struct btf_verifier_env *env,
 			   const struct btf_type *t)
 {
@@ -618,6 +1185,7 @@ static void btf_struct_log(struct btf_verifier_env *env,
 
 static struct btf_kind_operations struct_ops = {
 	.check_meta = btf_struct_check_meta,
+	.resolve = btf_struct_resolve,
 	.log_details = btf_struct_log,
 };
 
@@ -671,6 +1239,7 @@ static void btf_enum_log(struct btf_verifier_env *env,
 
 static struct btf_kind_operations enum_ops = {
 	.check_meta = btf_enum_check_meta,
+	.resolve = btf_df_resolve,
 	.log_details = btf_enum_log,
 };
 
@@ -751,9 +1320,104 @@ static int btf_check_all_metas(struct btf_verifier_env *env)
 	return 0;
 }
 
+static int btf_resolve(struct btf_verifier_env *env,
+		       const struct btf_type *t, u32 type_id)
+{
+	const struct resolve_vertex *v;
+	int err = 0;
+
+	env->resolve_mode = RESOLVE_TBD;
+	env_stack_push(env, t, type_id);
+	while (!err && (v = env_stack_peak(env))) {
+		env->log_type_id = v->type_id;
+		err = btf_type_ops(v->t)->resolve(env, v);
+	}
+
+	env->log_type_id = type_id;
+	if (err == -E2BIG)
+		btf_verifier_log_type(env, t,
+				      "Exceeded max resolving depth:%u",
+				      MAX_RESOLVE_DEPTH);
+	else if (err == -EEXIST)
+		btf_verifier_log_type(env, t, "Loop detected");
+
+	return err;
+}
+
+static bool btf_resolve_valid(struct btf_verifier_env *env,
+			      const struct btf_type *t,
+			      u32 type_id)
+{
+	struct btf *btf = env->btf;
+
+	if (!env_type_is_resolved(env, type_id))
+		return false;
+
+	if (btf_type_is_struct(t))
+		return !btf->resolved_ids[type_id] &&
+			!btf->resolved_sizes[type_id];
+
+	if (btf_type_is_modifier(t) || btf_type_is_ptr(t)) {
+		t = btf_type_id_resolve(btf, &type_id);
+		return t && !btf_type_is_modifier(t);
+	}
+
+	if (btf_type_is_array(t)) {
+		const struct btf_array *array = btf_type_array(t);
+		const struct btf_type *elem_type;
+		u32 elem_type_id = array->type;
+		u32 elem_size;
+
+		elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
+		return elem_type && !btf_type_is_modifier(elem_type) &&
+			(array->nelems * elem_size ==
+			 btf->resolved_sizes[type_id]);
+	}
+
+	return false;
+}
+
+static int btf_check_all_types(struct btf_verifier_env *env)
+{
+	struct btf *btf = env->btf;
+	u32 type_id;
+	int err;
+
+	err = env_resolve_init(env);
+	if (err)
+		return err;
+
+	env->phase++;
+	for (type_id = 1; type_id <= btf->nr_types; type_id++) {
+		const struct btf_type *t = btf_type_by_id(btf, type_id);
+
+		env->log_type_id = type_id;
+		if (btf_type_needs_resolve(t) &&
+		    !env_type_is_resolved(env, type_id)) {
+			err = btf_resolve(env, t, type_id);
+			if (err)
+				return err;
+		}
+
+		if (btf_type_needs_resolve(t) &&
+		    !btf_resolve_valid(env, t, type_id)) {
+			btf_verifier_log_type(env, t, "Invalid resolve state");
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 static int btf_parse_type_sec(struct btf_verifier_env *env)
 {
-	return btf_check_all_metas(env);
+	int err;
+
+	err = btf_check_all_metas(env);
+	if (err)
+		return err;
+
+	return btf_check_all_types(env);
 }
 
 static int btf_parse_str_sec(struct btf_verifier_env *env)
-- 
cgit v1.2.3-59-g8ed1b


From 179cde8cef7e88fc7c0f00c3f23482720d8c4a21 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 18 Apr 2018 15:55:59 -0700
Subject: bpf: btf: Check members of struct/union

This patch checks a few things of struct's members:

1) It has a valid size (e.g. a "const void" is invalid)
2) A member's size (+ its member's offset) does not exceed
   the containing struct's size.
3) The member's offset satisfies the alignment requirement

The above can only be done after the needs_resolve member's type
is resolved.  Hence, the above is done together in
btf_struct_resolve().

Each possible member's type (e.g. int, enum, modifier...) implements
the check_member() ops which will be called from btf_struct_resolve().

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/btf.c | 205 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 205 insertions(+)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 18bf266ceeda..4e31249f6c61 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -250,6 +250,10 @@ struct btf_kind_operations {
 			  u32 meta_left);
 	int (*resolve)(struct btf_verifier_env *env,
 		       const struct resolve_vertex *v);
+	int (*check_member)(struct btf_verifier_env *env,
+			    const struct btf_type *struct_type,
+			    const struct btf_member *member,
+			    const struct btf_type *member_type);
 	void (*log_details)(struct btf_verifier_env *env,
 			    const struct btf_type *t);
 };
@@ -760,6 +764,16 @@ const struct btf_type *btf_type_id_size(const struct btf *btf,
 	return size_type;
 }
 
+static int btf_df_check_member(struct btf_verifier_env *env,
+			       const struct btf_type *struct_type,
+			       const struct btf_member *member,
+			       const struct btf_type *member_type)
+{
+	btf_verifier_log_basic(env, struct_type,
+			       "Unsupported check_member");
+	return -EINVAL;
+}
+
 static int btf_df_resolve(struct btf_verifier_env *env,
 			  const struct resolve_vertex *v)
 {
@@ -767,6 +781,44 @@ static int btf_df_resolve(struct btf_verifier_env *env,
 	return -EINVAL;
 }
 
+static int btf_int_check_member(struct btf_verifier_env *env,
+				const struct btf_type *struct_type,
+				const struct btf_member *member,
+				const struct btf_type *member_type)
+{
+	u32 int_data = btf_type_int(member_type);
+	u32 struct_bits_off = member->offset;
+	u32 struct_size = struct_type->size;
+	u32 nr_copy_bits;
+	u32 bytes_offset;
+
+	if (U32_MAX - struct_bits_off < BTF_INT_OFFSET(int_data)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"bits_offset exceeds U32_MAX");
+		return -EINVAL;
+	}
+
+	struct_bits_off += BTF_INT_OFFSET(int_data);
+	bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+	nr_copy_bits = BTF_INT_BITS(int_data) +
+		BITS_PER_BYTE_MASKED(struct_bits_off);
+
+	if (nr_copy_bits > BITS_PER_U64) {
+		btf_verifier_log_member(env, struct_type, member,
+					"nr_copy_bits exceeds 64");
+		return -EINVAL;
+	}
+
+	if (struct_size < bytes_offset ||
+	    struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member exceeds struct_size");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static s32 btf_int_check_meta(struct btf_verifier_env *env,
 			      const struct btf_type *t,
 			      u32 meta_left)
@@ -830,9 +882,61 @@ static void btf_int_log(struct btf_verifier_env *env,
 static const struct btf_kind_operations int_ops = {
 	.check_meta = btf_int_check_meta,
 	.resolve = btf_df_resolve,
+	.check_member = btf_int_check_member,
 	.log_details = btf_int_log,
 };
 
+static int btf_modifier_check_member(struct btf_verifier_env *env,
+				     const struct btf_type *struct_type,
+				     const struct btf_member *member,
+				     const struct btf_type *member_type)
+{
+	const struct btf_type *resolved_type;
+	u32 resolved_type_id = member->type;
+	struct btf_member resolved_member;
+	struct btf *btf = env->btf;
+
+	resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL);
+	if (!resolved_type) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Invalid member");
+		return -EINVAL;
+	}
+
+	resolved_member = *member;
+	resolved_member.type = resolved_type_id;
+
+	return btf_type_ops(resolved_type)->check_member(env, struct_type,
+							 &resolved_member,
+							 resolved_type);
+}
+
+static int btf_ptr_check_member(struct btf_verifier_env *env,
+				const struct btf_type *struct_type,
+				const struct btf_member *member,
+				const struct btf_type *member_type)
+{
+	u32 struct_size, struct_bits_off, bytes_offset;
+
+	struct_size = struct_type->size;
+	struct_bits_off = member->offset;
+	bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+
+	if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member is not byte aligned");
+		return -EINVAL;
+	}
+
+	if (struct_size - bytes_offset < sizeof(void *)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member exceeds struct_size");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int btf_ref_type_check_meta(struct btf_verifier_env *env,
 				   const struct btf_type *t,
 				   u32 meta_left)
@@ -959,21 +1063,53 @@ static void btf_ref_type_log(struct btf_verifier_env *env,
 static struct btf_kind_operations modifier_ops = {
 	.check_meta = btf_ref_type_check_meta,
 	.resolve = btf_modifier_resolve,
+	.check_member = btf_modifier_check_member,
 	.log_details = btf_ref_type_log,
 };
 
 static struct btf_kind_operations ptr_ops = {
 	.check_meta = btf_ref_type_check_meta,
 	.resolve = btf_ptr_resolve,
+	.check_member = btf_ptr_check_member,
 	.log_details = btf_ref_type_log,
 };
 
 static struct btf_kind_operations fwd_ops = {
 	.check_meta = btf_ref_type_check_meta,
 	.resolve = btf_df_resolve,
+	.check_member = btf_df_check_member,
 	.log_details = btf_ref_type_log,
 };
 
+static int btf_array_check_member(struct btf_verifier_env *env,
+				  const struct btf_type *struct_type,
+				  const struct btf_member *member,
+				  const struct btf_type *member_type)
+{
+	u32 struct_bits_off = member->offset;
+	u32 struct_size, bytes_offset;
+	u32 array_type_id, array_size;
+	struct btf *btf = env->btf;
+
+	if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member is not byte aligned");
+		return -EINVAL;
+	}
+
+	array_type_id = member->type;
+	btf_type_id_size(btf, &array_type_id, &array_size);
+	struct_size = struct_type->size;
+	bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+	if (struct_size - bytes_offset < array_size) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member exceeds struct_size");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static s32 btf_array_check_meta(struct btf_verifier_env *env,
 				const struct btf_type *t,
 				u32 meta_left)
@@ -1076,9 +1212,35 @@ static void btf_array_log(struct btf_verifier_env *env,
 static struct btf_kind_operations array_ops = {
 	.check_meta = btf_array_check_meta,
 	.resolve = btf_array_resolve,
+	.check_member = btf_array_check_member,
 	.log_details = btf_array_log,
 };
 
+static int btf_struct_check_member(struct btf_verifier_env *env,
+				   const struct btf_type *struct_type,
+				   const struct btf_member *member,
+				   const struct btf_type *member_type)
+{
+	u32 struct_bits_off = member->offset;
+	u32 struct_size, bytes_offset;
+
+	if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member is not byte aligned");
+		return -EINVAL;
+	}
+
+	struct_size = struct_type->size;
+	bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+	if (struct_size - bytes_offset < member_type->size) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member exceeds struct_size");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static s32 btf_struct_check_meta(struct btf_verifier_env *env,
 				 const struct btf_type *t,
 				 u32 meta_left)
@@ -1137,6 +1299,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
 			      const struct resolve_vertex *v)
 {
 	const struct btf_member *member;
+	int err;
 	u16 i;
 
 	/* Before continue resolving the next_member,
@@ -1144,6 +1307,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
 	 * type with size info.
 	 */
 	if (v->next_member) {
+		const struct btf_type *last_member_type;
 		const struct btf_member *last_member;
 		u16 last_member_type_id;
 
@@ -1152,6 +1316,14 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
 		if (WARN_ON_ONCE(!env_type_is_resolved(env,
 						       last_member_type_id)))
 			return -EINVAL;
+
+		last_member_type = btf_type_by_id(env->btf,
+						  last_member_type_id);
+		err = btf_type_ops(last_member_type)->check_member(env, v->t,
+							last_member,
+							last_member_type);
+		if (err)
+			return err;
 	}
 
 	for_each_member_from(i, v->next_member, v->t, member) {
@@ -1170,6 +1342,12 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
 			env_stack_set_next_member(env, i + 1);
 			return env_stack_push(env, member_type, member_type_id);
 		}
+
+		err = btf_type_ops(member_type)->check_member(env, v->t,
+							      member,
+							      member_type);
+		if (err)
+			return err;
 	}
 
 	env_stack_pop_resolved(env, 0, 0);
@@ -1186,9 +1364,35 @@ static void btf_struct_log(struct btf_verifier_env *env,
 static struct btf_kind_operations struct_ops = {
 	.check_meta = btf_struct_check_meta,
 	.resolve = btf_struct_resolve,
+	.check_member = btf_struct_check_member,
 	.log_details = btf_struct_log,
 };
 
+static int btf_enum_check_member(struct btf_verifier_env *env,
+				 const struct btf_type *struct_type,
+				 const struct btf_member *member,
+				 const struct btf_type *member_type)
+{
+	u32 struct_bits_off = member->offset;
+	u32 struct_size, bytes_offset;
+
+	if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member is not byte aligned");
+		return -EINVAL;
+	}
+
+	struct_size = struct_type->size;
+	bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+	if (struct_size - bytes_offset < sizeof(int)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member exceeds struct_size");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static s32 btf_enum_check_meta(struct btf_verifier_env *env,
 			       const struct btf_type *t,
 			       u32 meta_left)
@@ -1240,6 +1444,7 @@ static void btf_enum_log(struct btf_verifier_env *env,
 static struct btf_kind_operations enum_ops = {
 	.check_meta = btf_enum_check_meta,
 	.resolve = btf_df_resolve,
+	.check_member = btf_enum_check_member,
 	.log_details = btf_enum_log,
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From b00b8daec828dd59af7d1f7a42acd6e5867f80c6 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 18 Apr 2018 15:56:00 -0700
Subject: bpf: btf: Add pretty print capability for data with BTF type info

This patch adds pretty print capability for data with BTF type info.
The current usage is to allow pretty print for a BPF map.

The next few patches will allow a read() on a pinned map with BTF
type info for its key and value.

This patch uses the seq_printf() infra.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/btf.h |   2 +
 kernel/bpf/btf.c    | 198 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 200 insertions(+)

diff --git a/include/linux/btf.h b/include/linux/btf.h
index f14b60368753..d8bdab0280ba 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -33,5 +33,7 @@ struct btf_type;
 const struct btf_type *btf_type_id_size(const struct btf *btf,
 					u32 *type_id,
 					u32 *ret_size);
+void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
+		       struct seq_file *m);
 
 #endif
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 4e31249f6c61..10ee41589da2 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3,6 +3,7 @@
 
 #include <uapi/linux/btf.h>
 #include <uapi/linux/types.h>
+#include <linux/seq_file.h>
 #include <linux/compiler.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
@@ -256,6 +257,9 @@ struct btf_kind_operations {
 			    const struct btf_type *member_type);
 	void (*log_details)(struct btf_verifier_env *env,
 			    const struct btf_type *t);
+	void (*seq_show)(const struct btf *btf, const struct btf_type *t,
+			 u32 type_id, void *data, u8 bits_offsets,
+			 struct seq_file *m);
 };
 
 static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS];
@@ -781,6 +785,13 @@ static int btf_df_resolve(struct btf_verifier_env *env,
 	return -EINVAL;
 }
 
+static void btf_df_seq_show(const struct btf *btf, const struct btf_type *t,
+			    u32 type_id, void *data, u8 bits_offsets,
+			    struct seq_file *m)
+{
+	seq_printf(m, "<unsupported kind:%u>", BTF_INFO_KIND(t->info));
+}
+
 static int btf_int_check_member(struct btf_verifier_env *env,
 				const struct btf_type *struct_type,
 				const struct btf_member *member,
@@ -879,11 +890,96 @@ static void btf_int_log(struct btf_verifier_env *env,
 			 btf_int_encoding_str(BTF_INT_ENCODING(int_data)));
 }
 
+static void btf_int_bits_seq_show(const struct btf *btf,
+				  const struct btf_type *t,
+				  void *data, u8 bits_offset,
+				  struct seq_file *m)
+{
+	u32 int_data = btf_type_int(t);
+	u16 nr_bits = BTF_INT_BITS(int_data);
+	u16 total_bits_offset;
+	u16 nr_copy_bytes;
+	u16 nr_copy_bits;
+	u8 nr_upper_bits;
+	union {
+		u64 u64_num;
+		u8  u8_nums[8];
+	} print_num;
+
+	total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
+	data += BITS_ROUNDDOWN_BYTES(total_bits_offset);
+	bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset);
+	nr_copy_bits = nr_bits + bits_offset;
+	nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits);
+
+	print_num.u64_num = 0;
+	memcpy(&print_num.u64_num, data, nr_copy_bytes);
+
+	/* Ditch the higher order bits */
+	nr_upper_bits = BITS_PER_BYTE_MASKED(nr_copy_bits);
+	if (nr_upper_bits) {
+		/* We need to mask out some bits of the upper byte. */
+		u8 mask = (1 << nr_upper_bits) - 1;
+
+		print_num.u8_nums[nr_copy_bytes - 1] &= mask;
+	}
+
+	print_num.u64_num >>= bits_offset;
+
+	seq_printf(m, "0x%llx", print_num.u64_num);
+}
+
+static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t,
+			     u32 type_id, void *data, u8 bits_offset,
+			     struct seq_file *m)
+{
+	u32 int_data = btf_type_int(t);
+	u8 encoding = BTF_INT_ENCODING(int_data);
+	bool sign = encoding & BTF_INT_SIGNED;
+	u32 nr_bits = BTF_INT_BITS(int_data);
+
+	if (bits_offset || BTF_INT_OFFSET(int_data) ||
+	    BITS_PER_BYTE_MASKED(nr_bits)) {
+		btf_int_bits_seq_show(btf, t, data, bits_offset, m);
+		return;
+	}
+
+	switch (nr_bits) {
+	case 64:
+		if (sign)
+			seq_printf(m, "%lld", *(s64 *)data);
+		else
+			seq_printf(m, "%llu", *(u64 *)data);
+		break;
+	case 32:
+		if (sign)
+			seq_printf(m, "%d", *(s32 *)data);
+		else
+			seq_printf(m, "%u", *(u32 *)data);
+		break;
+	case 16:
+		if (sign)
+			seq_printf(m, "%d", *(s16 *)data);
+		else
+			seq_printf(m, "%u", *(u16 *)data);
+		break;
+	case 8:
+		if (sign)
+			seq_printf(m, "%d", *(s8 *)data);
+		else
+			seq_printf(m, "%u", *(u8 *)data);
+		break;
+	default:
+		btf_int_bits_seq_show(btf, t, data, bits_offset, m);
+	}
+}
+
 static const struct btf_kind_operations int_ops = {
 	.check_meta = btf_int_check_meta,
 	.resolve = btf_df_resolve,
 	.check_member = btf_int_check_member,
 	.log_details = btf_int_log,
+	.seq_show = btf_int_seq_show,
 };
 
 static int btf_modifier_check_member(struct btf_verifier_env *env,
@@ -1054,6 +1150,24 @@ resolved:
 	return 0;
 }
 
+static void btf_modifier_seq_show(const struct btf *btf,
+				  const struct btf_type *t,
+				  u32 type_id, void *data,
+				  u8 bits_offset, struct seq_file *m)
+{
+	t = btf_type_id_resolve(btf, &type_id);
+
+	btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m);
+}
+
+static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t,
+			     u32 type_id, void *data, u8 bits_offset,
+			     struct seq_file *m)
+{
+	/* It is a hashed value */
+	seq_printf(m, "%p", *(void **)data);
+}
+
 static void btf_ref_type_log(struct btf_verifier_env *env,
 			     const struct btf_type *t)
 {
@@ -1065,6 +1179,7 @@ static struct btf_kind_operations modifier_ops = {
 	.resolve = btf_modifier_resolve,
 	.check_member = btf_modifier_check_member,
 	.log_details = btf_ref_type_log,
+	.seq_show = btf_modifier_seq_show,
 };
 
 static struct btf_kind_operations ptr_ops = {
@@ -1072,6 +1187,7 @@ static struct btf_kind_operations ptr_ops = {
 	.resolve = btf_ptr_resolve,
 	.check_member = btf_ptr_check_member,
 	.log_details = btf_ref_type_log,
+	.seq_show = btf_ptr_seq_show,
 };
 
 static struct btf_kind_operations fwd_ops = {
@@ -1079,6 +1195,7 @@ static struct btf_kind_operations fwd_ops = {
 	.resolve = btf_df_resolve,
 	.check_member = btf_df_check_member,
 	.log_details = btf_ref_type_log,
+	.seq_show = btf_df_seq_show,
 };
 
 static int btf_array_check_member(struct btf_verifier_env *env,
@@ -1209,11 +1326,36 @@ static void btf_array_log(struct btf_verifier_env *env,
 			 array->type, array->index_type, array->nelems);
 }
 
+static void btf_array_seq_show(const struct btf *btf, const struct btf_type *t,
+			       u32 type_id, void *data, u8 bits_offset,
+			       struct seq_file *m)
+{
+	const struct btf_array *array = btf_type_array(t);
+	const struct btf_kind_operations *elem_ops;
+	const struct btf_type *elem_type;
+	u32 i, elem_size, elem_type_id;
+
+	elem_type_id = array->type;
+	elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
+	elem_ops = btf_type_ops(elem_type);
+	seq_puts(m, "[");
+	for (i = 0; i < array->nelems; i++) {
+		if (i)
+			seq_puts(m, ",");
+
+		elem_ops->seq_show(btf, elem_type, elem_type_id, data,
+				   bits_offset, m);
+		data += elem_size;
+	}
+	seq_puts(m, "]");
+}
+
 static struct btf_kind_operations array_ops = {
 	.check_meta = btf_array_check_meta,
 	.resolve = btf_array_resolve,
 	.check_member = btf_array_check_member,
 	.log_details = btf_array_log,
+	.seq_show = btf_array_seq_show,
 };
 
 static int btf_struct_check_member(struct btf_verifier_env *env,
@@ -1361,11 +1503,39 @@ static void btf_struct_log(struct btf_verifier_env *env,
 	btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
 }
 
+static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t,
+				u32 type_id, void *data, u8 bits_offset,
+				struct seq_file *m)
+{
+	const char *seq = BTF_INFO_KIND(t->info) == BTF_KIND_UNION ? "|" : ",";
+	const struct btf_member *member;
+	u32 i;
+
+	seq_puts(m, "{");
+	for_each_member(i, t, member) {
+		const struct btf_type *member_type = btf_type_by_id(btf,
+								member->type);
+		u32 member_offset = member->offset;
+		u32 bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset);
+		u8 bits8_offset = BITS_PER_BYTE_MASKED(member_offset);
+		const struct btf_kind_operations *ops;
+
+		if (i)
+			seq_puts(m, seq);
+
+		ops = btf_type_ops(member_type);
+		ops->seq_show(btf, member_type, member->type,
+			      data + bytes_offset, bits8_offset, m);
+	}
+	seq_puts(m, "}");
+}
+
 static struct btf_kind_operations struct_ops = {
 	.check_meta = btf_struct_check_meta,
 	.resolve = btf_struct_resolve,
 	.check_member = btf_struct_check_member,
 	.log_details = btf_struct_log,
+	.seq_show = btf_struct_seq_show,
 };
 
 static int btf_enum_check_member(struct btf_verifier_env *env,
@@ -1441,11 +1611,31 @@ static void btf_enum_log(struct btf_verifier_env *env,
 	btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
 }
 
+static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t,
+			      u32 type_id, void *data, u8 bits_offset,
+			      struct seq_file *m)
+{
+	const struct btf_enum *enums = btf_type_enum(t);
+	u32 i, nr_enums = btf_type_vlen(t);
+	int v = *(int *)data;
+
+	for (i = 0; i < nr_enums; i++) {
+		if (v == enums[i].val) {
+			seq_printf(m, "%s",
+				   btf_name_by_offset(btf, enums[i].name));
+			return;
+		}
+	}
+
+	seq_printf(m, "%d", v);
+}
+
 static struct btf_kind_operations enum_ops = {
 	.check_meta = btf_enum_check_meta,
 	.resolve = btf_df_resolve,
 	.check_member = btf_enum_check_member,
 	.log_details = btf_enum_log,
+	.seq_show = btf_enum_seq_show,
 };
 
 static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
@@ -1782,3 +1972,11 @@ errout:
 		btf_free(btf);
 	return ERR_PTR(err);
 }
+
+void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
+		       struct seq_file *m)
+{
+	const struct btf_type *t = btf_type_by_id(btf, type_id);
+
+	btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m);
+}
-- 
cgit v1.2.3-59-g8ed1b


From f56a653c1fd13a197076dec4461c656fd2adec73 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 18 Apr 2018 15:56:01 -0700
Subject: bpf: btf: Add BPF_BTF_LOAD command

This patch adds a BPF_BTF_LOAD command which
1) loads and verifies the BTF (implemented in earlier patches)
2) returns a BTF fd to userspace.  In the next patch, the
   BTF fd can be specified during BPF_MAP_CREATE.

It currently limits to CAP_SYS_ADMIN.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/btf.h      |  4 +++
 include/uapi/linux/bpf.h |  9 +++++++
 kernel/bpf/btf.c         | 67 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c     | 17 ++++++++++++
 4 files changed, 97 insertions(+)

diff --git a/include/linux/btf.h b/include/linux/btf.h
index d8bdab0280ba..a7c7072535ea 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -8,7 +8,11 @@
 
 struct btf;
 struct btf_type;
+union bpf_attr;
 
+void btf_put(struct btf *btf);
+int btf_new_fd(const union bpf_attr *attr);
+struct btf *btf_get_by_fd(int fd);
 /* Figure out the size of a type_id.  If type_id is a modifier
  * (e.g. const), it will be resolved to find out the type with size.
  *
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 9a2d1a04eb24..795bcd577750 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -95,6 +95,7 @@ enum bpf_cmd {
 	BPF_OBJ_GET_INFO_BY_FD,
 	BPF_PROG_QUERY,
 	BPF_RAW_TRACEPOINT_OPEN,
+	BPF_BTF_LOAD,
 };
 
 enum bpf_map_type {
@@ -363,6 +364,14 @@ union bpf_attr {
 		__u64 name;
 		__u32 prog_fd;
 	} raw_tracepoint;
+
+	struct { /* anonymous struct for BPF_BTF_LOAD */
+		__aligned_u64	btf;
+		__aligned_u64	btf_log_buf;
+		__u32		btf_size;
+		__u32		btf_log_size;
+		__u32		btf_log_level;
+	};
 } __attribute__((aligned(8)));
 
 /* BPF helper function descriptions:
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 10ee41589da2..2322340694cf 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -7,6 +7,8 @@
 #include <linux/compiler.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
 #include <linux/uaccess.h>
 #include <linux/kernel.h>
 #include <linux/bpf_verifier.h>
@@ -190,6 +192,7 @@ struct btf {
 	u32 nr_types;
 	u32 types_size;
 	u32 data_size;
+	refcount_t refcnt;
 };
 
 enum verifier_phase {
@@ -604,6 +607,17 @@ static void btf_free(struct btf *btf)
 	kfree(btf);
 }
 
+static void btf_get(struct btf *btf)
+{
+	refcount_inc(&btf->refcnt);
+}
+
+void btf_put(struct btf *btf)
+{
+	if (btf && refcount_dec_and_test(&btf->refcnt))
+		btf_free(btf);
+}
+
 static int env_resolve_init(struct btf_verifier_env *env)
 {
 	struct btf *btf = env->btf;
@@ -1963,6 +1977,7 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
 
 	if (!err) {
 		btf_verifier_env_free(env);
+		btf_get(btf);
 		return btf;
 	}
 
@@ -1980,3 +1995,55 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
 
 	btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m);
 }
+
+static int btf_release(struct inode *inode, struct file *filp)
+{
+	btf_put(filp->private_data);
+	return 0;
+}
+
+static const struct file_operations btf_fops = {
+	.release	= btf_release,
+};
+
+int btf_new_fd(const union bpf_attr *attr)
+{
+	struct btf *btf;
+	int fd;
+
+	btf = btf_parse(u64_to_user_ptr(attr->btf),
+			attr->btf_size, attr->btf_log_level,
+			u64_to_user_ptr(attr->btf_log_buf),
+			attr->btf_log_size);
+	if (IS_ERR(btf))
+		return PTR_ERR(btf);
+
+	fd = anon_inode_getfd("btf", &btf_fops, btf,
+			      O_RDONLY | O_CLOEXEC);
+	if (fd < 0)
+		btf_put(btf);
+
+	return fd;
+}
+
+struct btf *btf_get_by_fd(int fd)
+{
+	struct btf *btf;
+	struct fd f;
+
+	f = fdget(fd);
+
+	if (!f.file)
+		return ERR_PTR(-EBADF);
+
+	if (f.file->f_op != &btf_fops) {
+		fdput(f);
+		return ERR_PTR(-EINVAL);
+	}
+
+	btf = f.file->private_data;
+	btf_get(btf);
+	fdput(f);
+
+	return btf;
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4ca46df19c9a..cd8ebadc66eb 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -11,6 +11,7 @@
  */
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
+#include <linux/btf.h>
 #include <linux/syscalls.h>
 #include <linux/slab.h>
 #include <linux/sched/signal.h>
@@ -2023,6 +2024,19 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
 	return err;
 }
 
+#define BPF_BTF_LOAD_LAST_FIELD btf_log_level
+
+static int bpf_btf_load(const union bpf_attr *attr)
+{
+	if (CHECK_ATTR(BPF_BTF_LOAD))
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return btf_new_fd(attr);
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -2103,6 +2117,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_RAW_TRACEPOINT_OPEN:
 		err = bpf_raw_tracepoint_open(&attr);
 		break;
+	case BPF_BTF_LOAD:
+		err = bpf_btf_load(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From 60197cfb6e11ffc03aa0ed23765b2f7e70b2e2d4 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 18 Apr 2018 15:56:02 -0700
Subject: bpf: btf: Add BPF_OBJ_GET_INFO_BY_FD support to BTF fd

This patch adds BPF_OBJ_GET_INFO_BY_FD support to BTF fd.
The original BTF data, which was used to create the BTF fd during
the earlier BPF_BTF_LOAD call, will be returned.

The userspace is expected to allocate buffer
to info.info and the buffer size is set to info.info_len before
calling BPF_OBJ_GET_INFO_BY_FD.

The original BTF data is copied to the userspace buffer (info.info).
Only upto the user's specified info.info_len will be copied.

The original BTF data size is set to info.info_len.  The userspace
needs to check if it is bigger than its allocated buffer size.
If it is, the userspace should realloc with the kernel-returned
info.info_len and call the BPF_OBJ_GET_INFO_BY_FD again.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/btf.h  |  5 +++++
 kernel/bpf/btf.c     | 17 ++++++++++++++++-
 kernel/bpf/syscall.c |  2 ++
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/include/linux/btf.h b/include/linux/btf.h
index a7c7072535ea..a966dc6d61ee 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -10,9 +10,14 @@ struct btf;
 struct btf_type;
 union bpf_attr;
 
+extern const struct file_operations btf_fops;
+
 void btf_put(struct btf *btf);
 int btf_new_fd(const union bpf_attr *attr);
 struct btf *btf_get_by_fd(int fd);
+int btf_get_info_by_fd(const struct btf *btf,
+		       const union bpf_attr *attr,
+		       union bpf_attr __user *uattr);
 /* Figure out the size of a type_id.  If type_id is a modifier
  * (e.g. const), it will be resolved to find out the type with size.
  *
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2322340694cf..eb56ac760547 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -2002,7 +2002,7 @@ static int btf_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-static const struct file_operations btf_fops = {
+const struct file_operations btf_fops = {
 	.release	= btf_release,
 };
 
@@ -2047,3 +2047,18 @@ struct btf *btf_get_by_fd(int fd)
 
 	return btf;
 }
+
+int btf_get_info_by_fd(const struct btf *btf,
+		       const union bpf_attr *attr,
+		       union bpf_attr __user *uattr)
+{
+	void __user *udata = u64_to_user_ptr(attr->info.info);
+	u32 copy_len = min_t(u32, btf->data_size,
+			     attr->info.info_len);
+
+	if (copy_to_user(udata, btf->data, copy_len) ||
+	    put_user(btf->data_size, &uattr->info.info_len))
+		return -EFAULT;
+
+	return 0;
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cd8ebadc66eb..0a4924a0a8da 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2017,6 +2017,8 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
 	else if (f.file->f_op == &bpf_map_fops)
 		err = bpf_map_get_info_by_fd(f.file->private_data, attr,
 					     uattr);
+	else if (f.file->f_op == &btf_fops)
+		err = btf_get_info_by_fd(f.file->private_data, attr, uattr);
 	else
 		err = -EINVAL;
 
-- 
cgit v1.2.3-59-g8ed1b


From a26ca7c982cb576749cbdd01e8ecde4bf010d60a Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 18 Apr 2018 15:56:03 -0700
Subject: bpf: btf: Add pretty print support to the basic arraymap

This patch adds pretty print support to the basic arraymap.
Support for other bpf maps can be added later.

This patch adds new attrs to the BPF_MAP_CREATE command to allow
specifying the btf_fd, btf_key_id and btf_value_id.  The
BPF_MAP_CREATE can then associate the btf to the map if
the creating map supports BTF.

A BTF supported map needs to implement two new map ops,
map_seq_show_elem() and map_check_btf().  This patch has
implemented these new map ops for the basic arraymap.

It also adds file_operations, bpffs_map_fops, to the pinned
map such that the pinned map can be opened and read.
After that, the user has an intuitive way to do
"cat bpffs/pathto/a-pinned-map" instead of getting
an error.

bpffs_map_fops should not be extended further to support
other operations.  Other operations (e.g. write/key-lookup...)
should be realized by the userspace tools (e.g. bpftool) through
the BPF_OBJ_GET_INFO_BY_FD, map's lookup/update interface...etc.
Follow up patches will allow the userspace to obtain
the BTF from a map-fd.

Here is a sample output when reading a pinned arraymap
with the following map's value:

struct map_value {
	int count_a;
	int count_b;
};

cat /sys/fs/bpf/pinned_array_map:

0: {1,2}
1: {3,4}
2: {5,6}
...

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h      |  20 +++++-
 include/uapi/linux/bpf.h |   3 +
 kernel/bpf/arraymap.c    |  50 +++++++++++++++
 kernel/bpf/inode.c       | 156 ++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/bpf/syscall.c     |  32 +++++++++-
 5 files changed, 254 insertions(+), 7 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 95a7abd0ee92..ee5275e7d4df 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -22,6 +22,8 @@ struct perf_event;
 struct bpf_prog;
 struct bpf_map;
 struct sock;
+struct seq_file;
+struct btf;
 
 /* map is generic key/value storage optionally accesible by eBPF programs */
 struct bpf_map_ops {
@@ -43,10 +45,14 @@ struct bpf_map_ops {
 	void (*map_fd_put_ptr)(void *ptr);
 	u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf);
 	u32 (*map_fd_sys_lookup_elem)(void *ptr);
+	void (*map_seq_show_elem)(struct bpf_map *map, void *key,
+				  struct seq_file *m);
+	int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf,
+			     u32 key_type_id, u32 value_type_id);
 };
 
 struct bpf_map {
-	/* 1st cacheline with read-mostly members of which some
+	/* The first two cachelines with read-mostly members of which some
 	 * are also accessed in fast-path (e.g. ops, max_entries).
 	 */
 	const struct bpf_map_ops *ops ____cacheline_aligned;
@@ -62,10 +68,13 @@ struct bpf_map {
 	u32 pages;
 	u32 id;
 	int numa_node;
+	u32 btf_key_id;
+	u32 btf_value_id;
+	struct btf *btf;
 	bool unpriv_array;
-	/* 7 bytes hole */
+	/* 55 bytes hole */
 
-	/* 2nd cacheline with misc members to avoid false sharing
+	/* The 3rd and 4th cacheline with misc members to avoid false sharing
 	 * particularly with refcounting.
 	 */
 	struct user_struct *user ____cacheline_aligned;
@@ -100,6 +109,11 @@ static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map)
 	return container_of(map, struct bpf_offloaded_map, map);
 }
 
+static inline bool bpf_map_support_seq_show(const struct bpf_map *map)
+{
+	return map->ops->map_seq_show_elem && map->ops->map_check_btf;
+}
+
 extern const struct bpf_map_ops bpf_map_offload_ops;
 
 /* function argument constraints */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 795bcd577750..c8383a289f7b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -280,6 +280,9 @@ union bpf_attr {
 					 */
 		char	map_name[BPF_OBJ_NAME_LEN];
 		__u32	map_ifindex;	/* ifindex of netdev to create on */
+		__u32	btf_fd;		/* fd pointing to a BTF type data */
+		__u32	btf_key_id;	/* BTF type_id of the key */
+		__u32	btf_value_id;	/* BTF type_id of the value */
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 14750e7c5ee4..02a189339381 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -11,11 +11,13 @@
  * General Public License for more details.
  */
 #include <linux/bpf.h>
+#include <linux/btf.h>
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/filter.h>
 #include <linux/perf_event.h>
+#include <uapi/linux/btf.h>
 
 #include "map_in_map.h"
 
@@ -336,6 +338,52 @@ static void array_map_free(struct bpf_map *map)
 	bpf_map_area_free(array);
 }
 
+static void array_map_seq_show_elem(struct bpf_map *map, void *key,
+				    struct seq_file *m)
+{
+	void *value;
+
+	rcu_read_lock();
+
+	value = array_map_lookup_elem(map, key);
+	if (!value) {
+		rcu_read_unlock();
+		return;
+	}
+
+	seq_printf(m, "%u: ", *(u32 *)key);
+	btf_type_seq_show(map->btf, map->btf_value_id, value, m);
+	seq_puts(m, "\n");
+
+	rcu_read_unlock();
+}
+
+static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf,
+			       u32 btf_key_id, u32 btf_value_id)
+{
+	const struct btf_type *key_type, *value_type;
+	u32 key_size, value_size;
+	u32 int_data;
+
+	key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
+	if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
+		return -EINVAL;
+
+	int_data = *(u32 *)(key_type + 1);
+	/* bpf array can only take a u32 key.  This check makes
+	 * sure that the btf matches the attr used during map_create.
+	 */
+	if (BTF_INT_BITS(int_data) != 32 || key_size != 4 ||
+	    BTF_INT_OFFSET(int_data))
+		return -EINVAL;
+
+	value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
+	if (!value_type || value_size > map->value_size)
+		return -EINVAL;
+
+	return 0;
+}
+
 const struct bpf_map_ops array_map_ops = {
 	.map_alloc_check = array_map_alloc_check,
 	.map_alloc = array_map_alloc,
@@ -345,6 +393,8 @@ const struct bpf_map_ops array_map_ops = {
 	.map_update_elem = array_map_update_elem,
 	.map_delete_elem = array_map_delete_elem,
 	.map_gen_lookup = array_map_gen_lookup,
+	.map_seq_show_elem = array_map_seq_show_elem,
+	.map_check_btf = array_map_check_btf,
 };
 
 const struct bpf_map_ops percpu_array_map_ops = {
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index bf6da59ae0d0..a41343009ccc 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -150,8 +150,154 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	return 0;
 }
 
+struct map_iter {
+	void *key;
+	bool done;
+};
+
+static struct map_iter *map_iter(struct seq_file *m)
+{
+	return m->private;
+}
+
+static struct bpf_map *seq_file_to_map(struct seq_file *m)
+{
+	return file_inode(m->file)->i_private;
+}
+
+static void map_iter_free(struct map_iter *iter)
+{
+	if (iter) {
+		kfree(iter->key);
+		kfree(iter);
+	}
+}
+
+static struct map_iter *map_iter_alloc(struct bpf_map *map)
+{
+	struct map_iter *iter;
+
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL | __GFP_NOWARN);
+	if (!iter)
+		goto error;
+
+	iter->key = kzalloc(map->key_size, GFP_KERNEL | __GFP_NOWARN);
+	if (!iter->key)
+		goto error;
+
+	return iter;
+
+error:
+	map_iter_free(iter);
+	return NULL;
+}
+
+static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct bpf_map *map = seq_file_to_map(m);
+	void *key = map_iter(m)->key;
+
+	if (map_iter(m)->done)
+		return NULL;
+
+	if (unlikely(v == SEQ_START_TOKEN))
+		goto done;
+
+	if (map->ops->map_get_next_key(map, key, key)) {
+		map_iter(m)->done = true;
+		return NULL;
+	}
+
+done:
+	++(*pos);
+	return key;
+}
+
+static void *map_seq_start(struct seq_file *m, loff_t *pos)
+{
+	if (map_iter(m)->done)
+		return NULL;
+
+	return *pos ? map_iter(m)->key : SEQ_START_TOKEN;
+}
+
+static void map_seq_stop(struct seq_file *m, void *v)
+{
+}
+
+static int map_seq_show(struct seq_file *m, void *v)
+{
+	struct bpf_map *map = seq_file_to_map(m);
+	void *key = map_iter(m)->key;
+
+	if (unlikely(v == SEQ_START_TOKEN)) {
+		seq_puts(m, "# WARNING!! The output is for debug purpose only\n");
+		seq_puts(m, "# WARNING!! The output format will change\n");
+	} else {
+		map->ops->map_seq_show_elem(map, key, m);
+	}
+
+	return 0;
+}
+
+static const struct seq_operations bpffs_map_seq_ops = {
+	.start	= map_seq_start,
+	.next	= map_seq_next,
+	.show	= map_seq_show,
+	.stop	= map_seq_stop,
+};
+
+static int bpffs_map_open(struct inode *inode, struct file *file)
+{
+	struct bpf_map *map = inode->i_private;
+	struct map_iter *iter;
+	struct seq_file *m;
+	int err;
+
+	iter = map_iter_alloc(map);
+	if (!iter)
+		return -ENOMEM;
+
+	err = seq_open(file, &bpffs_map_seq_ops);
+	if (err) {
+		map_iter_free(iter);
+		return err;
+	}
+
+	m = file->private_data;
+	m->private = iter;
+
+	return 0;
+}
+
+static int bpffs_map_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = file->private_data;
+
+	map_iter_free(map_iter(m));
+
+	return seq_release(inode, file);
+}
+
+/* bpffs_map_fops should only implement the basic
+ * read operation for a BPF map.  The purpose is to
+ * provide a simple user intuitive way to do
+ * "cat bpffs/pathto/a-pinned-map".
+ *
+ * Other operations (e.g. write, lookup...) should be realized by
+ * the userspace tools (e.g. bpftool) through the
+ * BPF_OBJ_GET_INFO_BY_FD and the map's lookup/update
+ * interface.
+ */
+static const struct file_operations bpffs_map_fops = {
+	.open		= bpffs_map_open,
+	.read		= seq_read,
+	.release	= bpffs_map_release,
+};
+
 static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
-			 const struct inode_operations *iops)
+			 const struct inode_operations *iops,
+			 const struct file_operations *fops)
 {
 	struct inode *dir = dentry->d_parent->d_inode;
 	struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode);
@@ -159,6 +305,7 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
 		return PTR_ERR(inode);
 
 	inode->i_op = iops;
+	inode->i_fop = fops;
 	inode->i_private = raw;
 
 	bpf_dentry_finalize(dentry, inode, dir);
@@ -167,12 +314,15 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
 
 static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg)
 {
-	return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops);
+	return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops, NULL);
 }
 
 static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
 {
-	return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops);
+	struct bpf_map *map = arg;
+
+	return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops,
+			     map->btf ? &bpffs_map_fops : NULL);
 }
 
 static struct dentry *
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0a4924a0a8da..fe23dc5a3ec4 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -27,6 +27,7 @@
 #include <linux/cred.h>
 #include <linux/timekeeping.h>
 #include <linux/ctype.h>
+#include <linux/btf.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
 			   (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -251,6 +252,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
 
 	bpf_map_uncharge_memlock(map);
 	security_bpf_map_free(map);
+	btf_put(map->btf);
 	/* implementation dependent freeing */
 	map->ops->map_free(map);
 }
@@ -416,7 +418,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
 	return 0;
 }
 
-#define BPF_MAP_CREATE_LAST_FIELD map_ifindex
+#define BPF_MAP_CREATE_LAST_FIELD btf_value_id
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
@@ -450,6 +452,33 @@ static int map_create(union bpf_attr *attr)
 	atomic_set(&map->refcnt, 1);
 	atomic_set(&map->usercnt, 1);
 
+	if (bpf_map_support_seq_show(map) &&
+	    (attr->btf_key_id || attr->btf_value_id)) {
+		struct btf *btf;
+
+		if (!attr->btf_key_id || !attr->btf_value_id) {
+			err = -EINVAL;
+			goto free_map_nouncharge;
+		}
+
+		btf = btf_get_by_fd(attr->btf_fd);
+		if (IS_ERR(btf)) {
+			err = PTR_ERR(btf);
+			goto free_map_nouncharge;
+		}
+
+		err = map->ops->map_check_btf(map, btf, attr->btf_key_id,
+					      attr->btf_value_id);
+		if (err) {
+			btf_put(btf);
+			goto free_map_nouncharge;
+		}
+
+		map->btf = btf;
+		map->btf_key_id = attr->btf_key_id;
+		map->btf_value_id = attr->btf_value_id;
+	}
+
 	err = security_bpf_map_alloc(map);
 	if (err)
 		goto free_map_nouncharge;
@@ -482,6 +511,7 @@ free_map:
 free_map_sec:
 	security_bpf_map_free(map);
 free_map_nouncharge:
+	btf_put(map->btf);
 	map->ops->map_free(map);
 	return err;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 3bd86a8409fc5a87ea415087b0c30ab42818e7c8 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 18 Apr 2018 15:56:04 -0700
Subject: bpf: btf: Sync bpf.h and btf.h to tools/

This patch sync up the bpf.h and btf.h to tools/

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/include/uapi/linux/bpf.h |  12 ++++
 tools/include/uapi/linux/btf.h | 130 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 142 insertions(+)
 create mode 100644 tools/include/uapi/linux/btf.h

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 56bf493ba7ed..7f7fbb9d0253 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -95,6 +95,7 @@ enum bpf_cmd {
 	BPF_OBJ_GET_INFO_BY_FD,
 	BPF_PROG_QUERY,
 	BPF_RAW_TRACEPOINT_OPEN,
+	BPF_BTF_LOAD,
 };
 
 enum bpf_map_type {
@@ -279,6 +280,9 @@ union bpf_attr {
 					 */
 		char	map_name[BPF_OBJ_NAME_LEN];
 		__u32	map_ifindex;	/* ifindex of netdev to create on */
+		__u32	btf_fd;		/* fd pointing to a BTF type data */
+		__u32	btf_key_id;	/* BTF type_id of the key */
+		__u32	btf_value_id;	/* BTF type_id of the value */
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -363,6 +367,14 @@ union bpf_attr {
 		__u64 name;
 		__u32 prog_fd;
 	} raw_tracepoint;
+
+	struct { /* anonymous struct for BPF_BTF_LOAD */
+		__aligned_u64	btf;
+		__aligned_u64	btf_log_buf;
+		__u32		btf_size;
+		__u32		btf_log_size;
+		__u32		btf_log_level;
+	};
 } __attribute__((aligned(8)));
 
 /* BPF helper function descriptions:
diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h
new file mode 100644
index 000000000000..74a30b1090df
--- /dev/null
+++ b/tools/include/uapi/linux/btf.h
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* Copyright (c) 2018 Facebook */
+#ifndef _UAPI__LINUX_BTF_H__
+#define _UAPI__LINUX_BTF_H__
+
+#include <linux/types.h>
+
+#define BTF_MAGIC	0xeB9F
+#define BTF_MAGIC_SWAP	0x9FeB
+#define BTF_VERSION	1
+#define BTF_FLAGS_COMPR	0x01
+
+struct btf_header {
+	__u16	magic;
+	__u8	version;
+	__u8	flags;
+
+	__u32	parent_label;
+	__u32	parent_name;
+
+	/* All offsets are in bytes relative to the end of this header */
+	__u32	label_off;	/* offset of label section	*/
+	__u32	object_off;	/* offset of data object section*/
+	__u32	func_off;	/* offset of function section	*/
+	__u32	type_off;	/* offset of type section	*/
+	__u32	str_off;	/* offset of string section	*/
+	__u32	str_len;	/* length of string section	*/
+};
+
+/* Max # of type identifier */
+#define BTF_MAX_TYPE	0x7fffffff
+/* Max offset into the string section */
+#define BTF_MAX_NAME_OFFSET	0x7fffffff
+/* Max # of struct/union/enum members or func args */
+#define BTF_MAX_VLEN	0xffff
+
+/* The type id is referring to a parent BTF */
+#define BTF_TYPE_PARENT(id)	(((id) >> 31) & 0x1)
+#define BTF_TYPE_ID(id)		((id) & BTF_MAX_TYPE)
+
+/* String is in the ELF string section */
+#define BTF_STR_TBL_ELF_ID(ref)	(((ref) >> 31) & 0x1)
+#define BTF_STR_OFFSET(ref)	((ref) & BTF_MAX_NAME_OFFSET)
+
+struct btf_type {
+	__u32 name;
+	/* "info" bits arrangement
+	 * bits  0-15: vlen (e.g. # of struct's members)
+	 * bits 16-23: unused
+	 * bits 24-28: kind (e.g. int, ptr, array...etc)
+	 * bits 29-30: unused
+	 * bits    31: root
+	 */
+	__u32 info;
+	/* "size" is used by INT, ENUM, STRUCT and UNION.
+	 * "size" tells the size of the type it is describing.
+	 *
+	 * "type" is used by PTR, TYPEDEF, VOLATILE, CONST and RESTRICT.
+	 * "type" is a type_id referring to another type.
+	 */
+	union {
+		__u32 size;
+		__u32 type;
+	};
+};
+
+#define BTF_INFO_KIND(info)	(((info) >> 24) & 0x1f)
+#define BTF_INFO_ISROOT(info)	(!!(((info) >> 24) & 0x80))
+#define BTF_INFO_VLEN(info)	((info) & 0xffff)
+
+#define BTF_KIND_UNKN		0	/* Unknown	*/
+#define BTF_KIND_INT		1	/* Integer	*/
+#define BTF_KIND_PTR		2	/* Pointer	*/
+#define BTF_KIND_ARRAY		3	/* Array	*/
+#define BTF_KIND_STRUCT		4	/* Struct	*/
+#define BTF_KIND_UNION		5	/* Union	*/
+#define BTF_KIND_ENUM		6	/* Enumeration	*/
+#define BTF_KIND_FWD		7	/* Forward	*/
+#define BTF_KIND_TYPEDEF	8	/* Typedef	*/
+#define BTF_KIND_VOLATILE	9	/* Volatile	*/
+#define BTF_KIND_CONST		10	/* Const	*/
+#define BTF_KIND_RESTRICT	11	/* Restrict	*/
+#define BTF_KIND_MAX		11
+#define NR_BTF_KINDS		12
+
+/* For some specific BTF_KIND, "struct btf_type" is immediately
+ * followed by extra data.
+ */
+
+/* BTF_KIND_INT is followed by a u32 and the following
+ * is the 32 bits arrangement:
+ */
+#define BTF_INT_ENCODING(VAL)	(((VAL) & 0xff000000) >> 24)
+#define BTF_INT_OFFSET(VAL)	(((VAL  & 0x00ff0000)) >> 16)
+#define BTF_INT_BITS(VAL)	((VAL)  & 0x0000ffff)
+
+/* Attributes stored in the BTF_INT_ENCODING */
+#define BTF_INT_SIGNED	0x1
+#define BTF_INT_CHAR	0x2
+#define BTF_INT_BOOL	0x4
+#define BTF_INT_VARARGS	0x8
+
+/* BTF_KIND_ENUM is followed by multiple "struct btf_enum".
+ * The exact number of btf_enum is stored in the vlen (of the
+ * info in "struct btf_type").
+ */
+struct btf_enum {
+	__u32	name;
+	__s32	val;
+};
+
+/* BTF_KIND_ARRAY is followed by one "struct btf_array" */
+struct btf_array {
+	__u32	type;
+	__u32	index_type;
+	__u32	nelems;
+};
+
+/* BTF_KIND_STRUCT and BTF_KIND_UNION are followed
+ * by multiple "struct btf_member".  The exact number
+ * of btf_member is stored in the vlen (of the info in
+ * "struct btf_type").
+ */
+struct btf_member {
+	__u32	name;
+	__u32	type;
+	__u32	offset;	/* offset in bits */
+};
+
+#endif /* _UAPI__LINUX_BTF_H__ */
-- 
cgit v1.2.3-59-g8ed1b


From 8a138aed4a807ceb143882fb23a423d524dcdb35 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 18 Apr 2018 15:56:05 -0700
Subject: bpf: btf: Add BTF support to libbpf

If the ".BTF" elf section exists, libbpf will try to create
a btf_fd (through BPF_BTF_LOAD).  If that fails, it will still
continue loading the bpf prog/map without the BTF.

If the bpf_object has a BTF loaded, it will create a map with the btf_fd.
libbpf will try to figure out the btf_key_id and btf_value_id of a map by
finding the BTF type with name "<map_name>_key" and "<map_name>_value".
If they cannot be found, it will continue without using the BTF.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/Build    |   2 +-
 tools/lib/bpf/bpf.c    |  92 +++++++++---
 tools/lib/bpf/bpf.h    |  16 +++
 tools/lib/bpf/btf.c    | 374 +++++++++++++++++++++++++++++++++++++++++++++++++
 tools/lib/bpf/btf.h    |  22 +++
 tools/lib/bpf/libbpf.c | 148 +++++++++++++++++--
 tools/lib/bpf/libbpf.h |   3 +
 7 files changed, 626 insertions(+), 31 deletions(-)
 create mode 100644 tools/lib/bpf/btf.c
 create mode 100644 tools/lib/bpf/btf.h

diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build
index 64c679d67109..6070e655042d 100644
--- a/tools/lib/bpf/Build
+++ b/tools/lib/bpf/Build
@@ -1 +1 @@
-libbpf-y := libbpf.o bpf.o nlattr.o
+libbpf-y := libbpf.o bpf.o nlattr.o btf.o
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index acbb3f8b3bec..76b36cc16e7f 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -73,43 +73,76 @@ static inline int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr,
 	return syscall(__NR_bpf, cmd, attr, size);
 }
 
-int bpf_create_map_node(enum bpf_map_type map_type, const char *name,
-			int key_size, int value_size, int max_entries,
-			__u32 map_flags, int node)
+int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr)
 {
-	__u32 name_len = name ? strlen(name) : 0;
+	__u32 name_len = create_attr->name ? strlen(create_attr->name) : 0;
 	union bpf_attr attr;
 
 	memset(&attr, '\0', sizeof(attr));
 
-	attr.map_type = map_type;
-	attr.key_size = key_size;
-	attr.value_size = value_size;
-	attr.max_entries = max_entries;
-	attr.map_flags = map_flags;
-	memcpy(attr.map_name, name, min(name_len, BPF_OBJ_NAME_LEN - 1));
+	attr.map_type = create_attr->map_type;
+	attr.key_size = create_attr->key_size;
+	attr.value_size = create_attr->value_size;
+	attr.max_entries = create_attr->max_entries;
+	attr.map_flags = create_attr->map_flags;
+	memcpy(attr.map_name, create_attr->name,
+	       min(name_len, BPF_OBJ_NAME_LEN - 1));
+	attr.numa_node = create_attr->numa_node;
+	attr.btf_fd = create_attr->btf_fd;
+	attr.btf_key_id = create_attr->btf_key_id;
+	attr.btf_value_id = create_attr->btf_value_id;
+
+	return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
+}
 
+int bpf_create_map_node(enum bpf_map_type map_type, const char *name,
+			int key_size, int value_size, int max_entries,
+			__u32 map_flags, int node)
+{
+	struct bpf_create_map_attr map_attr = {};
+
+	map_attr.name = name;
+	map_attr.map_type = map_type;
+	map_attr.map_flags = map_flags;
+	map_attr.key_size = key_size;
+	map_attr.value_size = value_size;
+	map_attr.max_entries = max_entries;
 	if (node >= 0) {
-		attr.map_flags |= BPF_F_NUMA_NODE;
-		attr.numa_node = node;
+		map_attr.numa_node = node;
+		map_attr.map_flags |= BPF_F_NUMA_NODE;
 	}
 
-	return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
+	return bpf_create_map_xattr(&map_attr);
 }
 
 int bpf_create_map(enum bpf_map_type map_type, int key_size,
 		   int value_size, int max_entries, __u32 map_flags)
 {
-	return bpf_create_map_node(map_type, NULL, key_size, value_size,
-				   max_entries, map_flags, -1);
+	struct bpf_create_map_attr map_attr = {};
+
+	map_attr.map_type = map_type;
+	map_attr.map_flags = map_flags;
+	map_attr.key_size = key_size;
+	map_attr.value_size = value_size;
+	map_attr.max_entries = max_entries;
+
+	return bpf_create_map_xattr(&map_attr);
 }
 
 int bpf_create_map_name(enum bpf_map_type map_type, const char *name,
 			int key_size, int value_size, int max_entries,
 			__u32 map_flags)
 {
-	return bpf_create_map_node(map_type, name, key_size, value_size,
-				   max_entries, map_flags, -1);
+	struct bpf_create_map_attr map_attr = {};
+
+	map_attr.name = name;
+	map_attr.map_type = map_type;
+	map_attr.map_flags = map_flags;
+	map_attr.key_size = key_size;
+	map_attr.value_size = value_size;
+	map_attr.max_entries = max_entries;
+
+	return bpf_create_map_xattr(&map_attr);
 }
 
 int bpf_create_map_in_map_node(enum bpf_map_type map_type, const char *name,
@@ -573,3 +606,28 @@ cleanup:
 	close(sock);
 	return ret;
 }
+
+int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
+		 bool do_log)
+{
+	union bpf_attr attr = {};
+	int fd;
+
+	attr.btf = ptr_to_u64(btf);
+	attr.btf_size = btf_size;
+
+retry:
+	if (do_log && log_buf && log_buf_size) {
+		attr.btf_log_level = 1;
+		attr.btf_log_size = log_buf_size;
+		attr.btf_log_buf = ptr_to_u64(log_buf);
+	}
+
+	fd = sys_bpf(BPF_BTF_LOAD, &attr, sizeof(attr));
+	if (fd == -1 && !do_log && log_buf && log_buf_size) {
+		do_log = true;
+		goto retry;
+	}
+
+	return fd;
+}
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 39f6a0d64a3b..01bda076310f 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -26,6 +26,20 @@
 #include <linux/bpf.h>
 #include <stddef.h>
 
+struct bpf_create_map_attr {
+	const char *name;
+	enum bpf_map_type map_type;
+	__u32 map_flags;
+	__u32 key_size;
+	__u32 value_size;
+	__u32 max_entries;
+	__u32 numa_node;
+	__u32 btf_fd;
+	__u32 btf_key_id;
+	__u32 btf_value_id;
+};
+
+int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr);
 int bpf_create_map_node(enum bpf_map_type map_type, const char *name,
 			int key_size, int value_size, int max_entries,
 			__u32 map_flags, int node);
@@ -87,4 +101,6 @@ int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len);
 int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags,
 		   __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt);
 int bpf_raw_tracepoint_open(const char *name, int prog_fd);
+int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
+		 bool do_log);
 #endif
diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
new file mode 100644
index 000000000000..58b6255abc7a
--- /dev/null
+++ b/tools/lib/bpf/btf.c
@@ -0,0 +1,374 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Facebook */
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <linux/err.h>
+#include <linux/btf.h>
+#include "btf.h"
+#include "bpf.h"
+
+#define elog(fmt, ...) { if (err_log) err_log(fmt, ##__VA_ARGS__); }
+#define max(a, b) ((a) > (b) ? (a) : (b))
+#define min(a, b) ((a) < (b) ? (a) : (b))
+
+#define BTF_MAX_NR_TYPES 65535
+
+static struct btf_type btf_void;
+
+struct btf {
+	union {
+		struct btf_header *hdr;
+		void *data;
+	};
+	struct btf_type **types;
+	const char *strings;
+	void *nohdr_data;
+	uint32_t nr_types;
+	uint32_t types_size;
+	uint32_t data_size;
+	int fd;
+};
+
+static const char *btf_name_by_offset(const struct btf *btf, uint32_t offset)
+{
+	if (!BTF_STR_TBL_ELF_ID(offset) &&
+	    BTF_STR_OFFSET(offset) < btf->hdr->str_len)
+		return &btf->strings[BTF_STR_OFFSET(offset)];
+	else
+		return NULL;
+}
+
+static int btf_add_type(struct btf *btf, struct btf_type *t)
+{
+	if (btf->types_size - btf->nr_types < 2) {
+		struct btf_type **new_types;
+		u32 expand_by, new_size;
+
+		if (btf->types_size == BTF_MAX_NR_TYPES)
+			return -E2BIG;
+
+		expand_by = max(btf->types_size >> 2, 16);
+		new_size = min(BTF_MAX_NR_TYPES, btf->types_size + expand_by);
+
+		new_types = realloc(btf->types, sizeof(*new_types) * new_size);
+		if (!new_types)
+			return -ENOMEM;
+
+		if (btf->nr_types == 0)
+			new_types[0] = &btf_void;
+
+		btf->types = new_types;
+		btf->types_size = new_size;
+	}
+
+	btf->types[++(btf->nr_types)] = t;
+
+	return 0;
+}
+
+static int btf_parse_hdr(struct btf *btf, btf_print_fn_t err_log)
+{
+	const struct btf_header *hdr = btf->hdr;
+	u32 meta_left;
+
+	if (btf->data_size < sizeof(struct btf_header)) {
+		elog("BTF header not found\n");
+		return -EINVAL;
+	}
+
+	if (hdr->magic != BTF_MAGIC) {
+		elog("Invalid BTF magic:%x\n", hdr->magic);
+		return -EINVAL;
+	}
+
+	if (hdr->version != BTF_VERSION) {
+		elog("Unsupported BTF version:%u\n", hdr->version);
+		return -ENOTSUP;
+	}
+
+	if (hdr->flags) {
+		elog("Unsupported BTF flags:%x\n", hdr->flags);
+		return -ENOTSUP;
+	}
+
+	meta_left = btf->data_size - sizeof(*hdr);
+	if (!meta_left) {
+		elog("BTF has no data\n");
+		return -EINVAL;
+	}
+
+	if (meta_left < hdr->type_off) {
+		elog("Invalid BTF type section offset:%u\n", hdr->type_off);
+		return -EINVAL;
+	}
+
+	if (meta_left < hdr->str_off) {
+		elog("Invalid BTF string section offset:%u\n", hdr->str_off);
+		return -EINVAL;
+	}
+
+	if (hdr->type_off >= hdr->str_off) {
+		elog("BTF type section offset >= string section offset. No type?\n");
+		return -EINVAL;
+	}
+
+	if (hdr->type_off & 0x02) {
+		elog("BTF type section is not aligned to 4 bytes\n");
+		return -EINVAL;
+	}
+
+	btf->nohdr_data = btf->hdr + 1;
+
+	return 0;
+}
+
+static int btf_parse_str_sec(struct btf *btf, btf_print_fn_t err_log)
+{
+	const struct btf_header *hdr = btf->hdr;
+	const char *start = btf->nohdr_data + hdr->str_off;
+	const char *end = start + btf->hdr->str_len;
+
+	if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET ||
+	    start[0] || end[-1]) {
+		elog("Invalid BTF string section\n");
+		return -EINVAL;
+	}
+
+	btf->strings = start;
+
+	return 0;
+}
+
+static int btf_parse_type_sec(struct btf *btf, btf_print_fn_t err_log)
+{
+	struct btf_header *hdr = btf->hdr;
+	void *nohdr_data = btf->nohdr_data;
+	void *next_type = nohdr_data + hdr->type_off;
+	void *end_type = nohdr_data + hdr->str_off;
+
+	while (next_type < end_type) {
+		struct btf_type *t = next_type;
+		uint16_t vlen = BTF_INFO_VLEN(t->info);
+		int err;
+
+		next_type += sizeof(*t);
+		switch (BTF_INFO_KIND(t->info)) {
+		case BTF_KIND_INT:
+			next_type += sizeof(int);
+			break;
+		case BTF_KIND_ARRAY:
+			next_type += sizeof(struct btf_array);
+			break;
+		case BTF_KIND_STRUCT:
+		case BTF_KIND_UNION:
+			next_type += vlen * sizeof(struct btf_member);
+			break;
+		case BTF_KIND_ENUM:
+			next_type += vlen * sizeof(struct btf_enum);
+			break;
+		case BTF_KIND_TYPEDEF:
+		case BTF_KIND_PTR:
+		case BTF_KIND_FWD:
+		case BTF_KIND_VOLATILE:
+		case BTF_KIND_CONST:
+		case BTF_KIND_RESTRICT:
+			break;
+		default:
+			elog("Unsupported BTF_KIND:%u\n",
+			     BTF_INFO_KIND(t->info));
+			return -EINVAL;
+		}
+
+		err = btf_add_type(btf, t);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static const struct btf_type *btf_type_by_id(const struct btf *btf,
+					     uint32_t type_id)
+{
+	if (type_id > btf->nr_types)
+		return NULL;
+
+	return btf->types[type_id];
+}
+
+static bool btf_type_is_void(const struct btf_type *t)
+{
+	return t == &btf_void || BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
+}
+
+static bool btf_type_is_void_or_null(const struct btf_type *t)
+{
+	return !t || btf_type_is_void(t);
+}
+
+static int64_t btf_type_size(const struct btf_type *t)
+{
+	switch (BTF_INFO_KIND(t->info)) {
+	case BTF_KIND_INT:
+	case BTF_KIND_STRUCT:
+	case BTF_KIND_UNION:
+	case BTF_KIND_ENUM:
+		return t->size;
+	case BTF_KIND_PTR:
+		return sizeof(void *);
+	default:
+		return -EINVAL;
+	}
+}
+
+#define MAX_RESOLVE_DEPTH 32
+
+int64_t btf__resolve_size(const struct btf *btf, uint32_t type_id)
+{
+	const struct btf_array *array;
+	const struct btf_type *t;
+	uint32_t nelems = 1;
+	int64_t size = -1;
+	int i;
+
+	t = btf_type_by_id(btf, type_id);
+	for (i = 0; i < MAX_RESOLVE_DEPTH && !btf_type_is_void_or_null(t);
+	     i++) {
+		size = btf_type_size(t);
+		if (size >= 0)
+			break;
+
+		switch (BTF_INFO_KIND(t->info)) {
+		case BTF_KIND_TYPEDEF:
+		case BTF_KIND_VOLATILE:
+		case BTF_KIND_CONST:
+		case BTF_KIND_RESTRICT:
+			type_id = t->type;
+			break;
+		case BTF_KIND_ARRAY:
+			array = (const struct btf_array *)(t + 1);
+			if (nelems && array->nelems > UINT32_MAX / nelems)
+				return -E2BIG;
+			nelems *= array->nelems;
+			type_id = array->type;
+			break;
+		default:
+			return -EINVAL;
+		}
+
+		t = btf_type_by_id(btf, type_id);
+	}
+
+	if (size < 0)
+		return -EINVAL;
+
+	if (nelems && size > UINT32_MAX / nelems)
+		return -E2BIG;
+
+	return nelems * size;
+}
+
+int32_t btf__find_by_name(const struct btf *btf, const char *type_name)
+{
+	uint32_t i;
+
+	if (!strcmp(type_name, "void"))
+		return 0;
+
+	for (i = 1; i <= btf->nr_types; i++) {
+		const struct btf_type *t = btf->types[i];
+		const char *name = btf_name_by_offset(btf, t->name);
+
+		if (name && !strcmp(type_name, name))
+			return i;
+	}
+
+	return -ENOENT;
+}
+
+void btf__free(struct btf *btf)
+{
+	if (!btf)
+		return;
+
+	if (btf->fd != -1)
+		close(btf->fd);
+
+	free(btf->data);
+	free(btf->types);
+	free(btf);
+}
+
+struct btf *btf__new(uint8_t *data, uint32_t size,
+		     btf_print_fn_t err_log)
+{
+	uint32_t log_buf_size = 0;
+	char *log_buf = NULL;
+	struct btf *btf;
+	int err;
+
+	btf = calloc(1, sizeof(struct btf));
+	if (!btf)
+		return ERR_PTR(-ENOMEM);
+
+	btf->fd = -1;
+
+	if (err_log) {
+		log_buf = malloc(BPF_LOG_BUF_SIZE);
+		if (!log_buf) {
+			err = -ENOMEM;
+			goto done;
+		}
+		*log_buf = 0;
+		log_buf_size = BPF_LOG_BUF_SIZE;
+	}
+
+	btf->data = malloc(size);
+	if (!btf->data) {
+		err = -ENOMEM;
+		goto done;
+	}
+
+	memcpy(btf->data, data, size);
+	btf->data_size = size;
+
+	btf->fd = bpf_load_btf(btf->data, btf->data_size,
+			       log_buf, log_buf_size, false);
+
+	if (btf->fd == -1) {
+		err = -errno;
+		elog("Error loading BTF: %s(%d)\n", strerror(errno), errno);
+		if (log_buf && *log_buf)
+			elog("%s\n", log_buf);
+		goto done;
+	}
+
+	err = btf_parse_hdr(btf, err_log);
+	if (err)
+		goto done;
+
+	err = btf_parse_str_sec(btf, err_log);
+	if (err)
+		goto done;
+
+	err = btf_parse_type_sec(btf, err_log);
+
+done:
+	free(log_buf);
+
+	if (err) {
+		btf__free(btf);
+		return ERR_PTR(err);
+	}
+
+	return btf;
+}
+
+int btf__fd(const struct btf *btf)
+{
+	return btf->fd;
+}
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
new file mode 100644
index 000000000000..74bb344035bb
--- /dev/null
+++ b/tools/lib/bpf/btf.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Facebook */
+
+#ifndef __BPF_BTF_H
+#define __BPF_BTF_H
+
+#include <stdint.h>
+
+#define BTF_ELF_SEC ".BTF"
+
+struct btf;
+
+typedef int (*btf_print_fn_t)(const char *, ...)
+	__attribute__((format(printf, 1, 2)));
+
+void btf__free(struct btf *btf);
+struct btf *btf__new(uint8_t *data, uint32_t size, btf_print_fn_t err_log);
+int32_t btf__find_by_name(const struct btf *btf, const char *type_name);
+int64_t btf__resolve_size(const struct btf *btf, uint32_t type_id);
+int btf__fd(const struct btf *btf);
+
+#endif
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 3d35bacf656f..6513e0b08795 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -45,6 +45,7 @@
 
 #include "libbpf.h"
 #include "bpf.h"
+#include "btf.h"
 
 #ifndef EM_BPF
 #define EM_BPF 247
@@ -212,6 +213,8 @@ struct bpf_map {
 	char *name;
 	size_t offset;
 	struct bpf_map_def def;
+	uint32_t btf_key_id;
+	uint32_t btf_value_id;
 	void *priv;
 	bpf_map_clear_priv_t clear_priv;
 };
@@ -256,6 +259,8 @@ struct bpf_object {
 	 */
 	struct list_head list;
 
+	struct btf *btf;
+
 	void *priv;
 	bpf_object_clear_priv_t clear_priv;
 
@@ -819,7 +824,15 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
 							data->d_size);
 		else if (strcmp(name, "maps") == 0)
 			obj->efile.maps_shndx = idx;
-		else if (sh.sh_type == SHT_SYMTAB) {
+		else if (strcmp(name, BTF_ELF_SEC) == 0) {
+			obj->btf = btf__new(data->d_buf, data->d_size,
+					    __pr_debug);
+			if (IS_ERR(obj->btf)) {
+				pr_warning("Error loading ELF section %s: %ld. Ignored and continue.\n",
+					   BTF_ELF_SEC, PTR_ERR(obj->btf));
+				obj->btf = NULL;
+			}
+		} else if (sh.sh_type == SHT_SYMTAB) {
 			if (obj->efile.symbols) {
 				pr_warning("bpf: multiple SYMTAB in %s\n",
 					   obj->path);
@@ -996,33 +1009,126 @@ bpf_program__collect_reloc(struct bpf_program *prog, GElf_Shdr *shdr,
 	return 0;
 }
 
+static int bpf_map_find_btf_info(struct bpf_map *map, const struct btf *btf)
+{
+	struct bpf_map_def *def = &map->def;
+	const size_t max_name = 256;
+	int64_t key_size, value_size;
+	int32_t key_id, value_id;
+	char name[max_name];
+
+	/* Find key type by name from BTF */
+	if (snprintf(name, max_name, "%s_key", map->name) == max_name) {
+		pr_warning("map:%s length of BTF key_type:%s_key is too long\n",
+			   map->name, map->name);
+		return -EINVAL;
+	}
+
+	key_id = btf__find_by_name(btf, name);
+	if (key_id < 0) {
+		pr_debug("map:%s key_type:%s cannot be found in BTF\n",
+			 map->name, name);
+		return key_id;
+	}
+
+	key_size = btf__resolve_size(btf, key_id);
+	if (key_size < 0) {
+		pr_warning("map:%s key_type:%s cannot get the BTF type_size\n",
+			   map->name, name);
+		return key_size;
+	}
+
+	if (def->key_size != key_size) {
+		pr_warning("map:%s key_type:%s has BTF type_size:%ld != key_size:%u\n",
+			   map->name, name, key_size, def->key_size);
+		return -EINVAL;
+	}
+
+	/* Find value type from BTF */
+	if (snprintf(name, max_name, "%s_value", map->name) == max_name) {
+		pr_warning("map:%s length of BTF value_type:%s_value is too long\n",
+			  map->name, map->name);
+		return -EINVAL;
+	}
+
+	value_id = btf__find_by_name(btf, name);
+	if (value_id < 0) {
+		pr_debug("map:%s value_type:%s cannot be found in BTF\n",
+			 map->name, name);
+		return value_id;
+	}
+
+	value_size = btf__resolve_size(btf, value_id);
+	if (value_size < 0) {
+		pr_warning("map:%s value_type:%s cannot get the BTF type_size\n",
+			   map->name, name);
+		return value_size;
+	}
+
+	if (def->value_size != value_size) {
+		pr_warning("map:%s value_type:%s has BTF type_size:%ld != value_size:%u\n",
+			   map->name, name, value_size, def->value_size);
+		return -EINVAL;
+	}
+
+	map->btf_key_id = key_id;
+	map->btf_value_id = value_id;
+
+	return 0;
+}
+
 static int
 bpf_object__create_maps(struct bpf_object *obj)
 {
+	struct bpf_create_map_attr create_attr = {};
 	unsigned int i;
+	int err;
 
 	for (i = 0; i < obj->nr_maps; i++) {
-		struct bpf_map_def *def = &obj->maps[i].def;
-		int *pfd = &obj->maps[i].fd;
-
-		*pfd = bpf_create_map_name(def->type,
-					   obj->maps[i].name,
-					   def->key_size,
-					   def->value_size,
-					   def->max_entries,
-					   def->map_flags);
+		struct bpf_map *map = &obj->maps[i];
+		struct bpf_map_def *def = &map->def;
+		int *pfd = &map->fd;
+
+		create_attr.name = map->name;
+		create_attr.map_type = def->type;
+		create_attr.map_flags = def->map_flags;
+		create_attr.key_size = def->key_size;
+		create_attr.value_size = def->value_size;
+		create_attr.max_entries = def->max_entries;
+		create_attr.btf_fd = 0;
+		create_attr.btf_key_id = 0;
+		create_attr.btf_value_id = 0;
+
+		if (obj->btf && !bpf_map_find_btf_info(map, obj->btf)) {
+			create_attr.btf_fd = btf__fd(obj->btf);
+			create_attr.btf_key_id = map->btf_key_id;
+			create_attr.btf_value_id = map->btf_value_id;
+		}
+
+		*pfd = bpf_create_map_xattr(&create_attr);
+		if (*pfd < 0 && create_attr.btf_key_id) {
+			pr_warning("Error in bpf_create_map_xattr(%s):%s(%d). Retrying without BTF.\n",
+				   map->name, strerror(errno), errno);
+			create_attr.btf_fd = 0;
+			create_attr.btf_key_id = 0;
+			create_attr.btf_value_id = 0;
+			map->btf_key_id = 0;
+			map->btf_value_id = 0;
+			*pfd = bpf_create_map_xattr(&create_attr);
+		}
+
 		if (*pfd < 0) {
 			size_t j;
-			int err = *pfd;
 
+			err = *pfd;
 			pr_warning("failed to create map (name: '%s'): %s\n",
-				   obj->maps[i].name,
+				   map->name,
 				   strerror(errno));
 			for (j = 0; j < i; j++)
 				zclose(obj->maps[j].fd);
 			return err;
 		}
-		pr_debug("create map %s: fd=%d\n", obj->maps[i].name, *pfd);
+		pr_debug("create map %s: fd=%d\n", map->name, *pfd);
 	}
 
 	return 0;
@@ -1641,6 +1747,7 @@ void bpf_object__close(struct bpf_object *obj)
 
 	bpf_object__elf_finish(obj);
 	bpf_object__unload(obj);
+	btf__free(obj->btf);
 
 	for (i = 0; i < obj->nr_maps; i++) {
 		zfree(&obj->maps[i].name);
@@ -1692,6 +1799,11 @@ unsigned int bpf_object__kversion(struct bpf_object *obj)
 	return obj ? obj->kern_version : 0;
 }
 
+int bpf_object__btf_fd(const struct bpf_object *obj)
+{
+	return obj->btf ? btf__fd(obj->btf) : -1;
+}
+
 int bpf_object__set_priv(struct bpf_object *obj, void *priv,
 			 bpf_object_clear_priv_t clear_priv)
 {
@@ -1937,6 +2049,16 @@ const char *bpf_map__name(struct bpf_map *map)
 	return map ? map->name : NULL;
 }
 
+uint32_t bpf_map__btf_key_id(const struct bpf_map *map)
+{
+	return map ? map->btf_key_id : 0;
+}
+
+uint32_t bpf_map__btf_value_id(const struct bpf_map *map)
+{
+	return map ? map->btf_value_id : 0;
+}
+
 int bpf_map__set_priv(struct bpf_map *map, void *priv,
 		     bpf_map_clear_priv_t clear_priv)
 {
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 8b242486b464..d6ac4fa6f472 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -78,6 +78,7 @@ int bpf_object__load(struct bpf_object *obj);
 int bpf_object__unload(struct bpf_object *obj);
 const char *bpf_object__name(struct bpf_object *obj);
 unsigned int bpf_object__kversion(struct bpf_object *obj);
+int bpf_object__btf_fd(const struct bpf_object *obj);
 
 struct bpf_object *bpf_object__next(struct bpf_object *prev);
 #define bpf_object__for_each_safe(pos, tmp)			\
@@ -241,6 +242,8 @@ bpf_map__next(struct bpf_map *map, struct bpf_object *obj);
 int bpf_map__fd(struct bpf_map *map);
 const struct bpf_map_def *bpf_map__def(struct bpf_map *map);
 const char *bpf_map__name(struct bpf_map *map);
+uint32_t bpf_map__btf_key_id(const struct bpf_map *map);
+uint32_t bpf_map__btf_value_id(const struct bpf_map *map);
 
 typedef void (*bpf_map_clear_priv_t)(struct bpf_map *, void *);
 int bpf_map__set_priv(struct bpf_map *map, void *priv,
-- 
cgit v1.2.3-59-g8ed1b


From c0fa1b6c3efc64b4ecfec658f95475fda650742e Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 18 Apr 2018 15:56:06 -0700
Subject: bpf: btf: Add BTF tests

This patch tests the BTF loading, map_create with BTF
and the changes in libbpf.

-r: Raw tests that test raw crafted BTF data
-f: Test LLVM compiled bpf prog with BTF data
-g: Test BPF_OBJ_GET_INFO_BY_FD for btf_fd
-p: Test pretty print

The tools/testing/selftests/bpf/Makefile will probe
for BTF support in llc and pahole before generating
debug info (-g) and convert them to BTF.  You can supply
the BTF supported binary through the following make variables:
LLC, BTF_PAHOLE and LLVM_OBJCOPY.

LLC: The lastest llc with -mattr=dwarfris support for the bpf target.
     It is only in the master of the llvm repo for now.
BTF_PAHOLE: The modified pahole with BTF support:
	    https://github.com/iamkafai/pahole/tree/btf
	    To add a BTF section: "pahole -J bpf_prog.o"
LLVM_OBJCOPY: Any llvm-objcopy should do

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/Makefile         |   26 +-
 tools/testing/selftests/bpf/test_btf.c       | 1669 ++++++++++++++++++++++++++
 tools/testing/selftests/bpf/test_btf_haskv.c |   48 +
 tools/testing/selftests/bpf/test_btf_nokv.c  |   43 +
 4 files changed, 1783 insertions(+), 3 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_btf.c
 create mode 100644 tools/testing/selftests/bpf/test_btf_haskv.c
 create mode 100644 tools/testing/selftests/bpf/test_btf_nokv.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 3e819dc70bee..0b72cc7596f1 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -24,14 +24,15 @@ urandom_read: urandom_read.c
 # Order correspond to 'make run_tests' order
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
 	test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \
-	test_sock test_sock_addr
+	test_sock test_sock_addr test_btf
 
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
 	test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
 	sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \
 	test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \
 	sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \
-	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o
+	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \
+	test_btf_haskv.o test_btf_nokv.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
@@ -66,6 +67,8 @@ $(BPFOBJ): force
 
 CLANG ?= clang
 LLC   ?= llc
+LLVM_OBJCOPY ?= llvm-objcopy
+BTF_PAHOLE ?= pahole
 
 PROBE := $(shell $(LLC) -march=bpf -mcpu=probe -filetype=null /dev/null 2>&1)
 
@@ -83,9 +86,26 @@ CLANG_FLAGS = -I. -I./include/uapi -I../../../include/uapi \
 $(OUTPUT)/test_l4lb_noinline.o: CLANG_FLAGS += -fno-inline
 $(OUTPUT)/test_xdp_noinline.o: CLANG_FLAGS += -fno-inline
 
+BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help |& grep dwarfris)
+BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help |& grep BTF)
+BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --version |& grep LLVM)
+
+ifneq ($(BTF_LLC_PROBE),)
+ifneq ($(BTF_PAHOLE_PROBE),)
+ifneq ($(BTF_OBJCOPY_PROBE),)
+	CLANG_FLAGS += -g
+	LLC_FLAGS += -mattr=dwarfris
+	DWARF2BTF = y
+endif
+endif
+endif
+
 $(OUTPUT)/%.o: %.c
 	$(CLANG) $(CLANG_FLAGS) \
 		 -O2 -target bpf -emit-llvm -c $< -o - |      \
-	$(LLC) -march=bpf -mcpu=$(CPU) -filetype=obj -o $@
+	$(LLC) -march=bpf -mcpu=$(CPU) $(LLC_FLAGS) -filetype=obj -o $@
+ifeq ($(DWARF2BTF),y)
+	$(BTF_PAHOLE) -J $@
+endif
 
 EXTRA_CLEAN := $(TEST_CUSTOM_PROGS)
diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c
new file mode 100644
index 000000000000..7b39b1f712a1
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_btf.c
@@ -0,0 +1,1669 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Facebook */
+
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/err.h>
+#include <bpf/bpf.h>
+#include <sys/resource.h>
+#include <libelf.h>
+#include <gelf.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <bpf/libbpf.h>
+#include <bpf/btf.h>
+
+#include "bpf_rlimit.h"
+
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#define __printf(a, b)	__attribute__((format(printf, a, b)))
+
+__printf(1, 2)
+static int __base_pr(const char *format, ...)
+{
+	va_list args;
+	int err;
+
+	va_start(args, format);
+	err = vfprintf(stderr, format, args);
+	va_end(args);
+	return err;
+}
+
+#define BTF_INFO_ENC(kind, root, vlen)			\
+	((!!(root) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN))
+
+#define BTF_TYPE_ENC(name, info, size_or_type)	\
+	(name), (info), (size_or_type)
+
+#define BTF_INT_ENC(encoding, bits_offset, nr_bits)	\
+	((encoding) << 24 | (bits_offset) << 16 | (nr_bits))
+#define BTF_TYPE_INT_ENC(name, encoding, bits_offset, bits, sz)	\
+	BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_INT, 0, 0), sz),	\
+	BTF_INT_ENC(encoding, bits_offset, bits)
+
+#define BTF_ARRAY_ENC(type, index_type, nr_elems)	\
+	(type), (index_type), (nr_elems)
+#define BTF_TYPE_ARRAY_ENC(type, index_type, nr_elems) \
+	BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_ARRAY, 0, 0), 0), \
+	BTF_ARRAY_ENC(type, index_type, nr_elems)
+
+#define BTF_MEMBER_ENC(name, type, bits_offset)	\
+	(name), (type), (bits_offset)
+#define BTF_ENUM_ENC(name, val) (name), (val)
+
+#define BTF_TYPEDEF_ENC(name, type) \
+	BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_TYPEDEF, 0, 0), type)
+
+#define BTF_PTR_ENC(name, type) \
+	BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), type)
+
+#define BTF_END_RAW 0xdeadbeef
+#define NAME_TBD 0xdeadb33f
+
+#define MAX_NR_RAW_TYPES 1024
+#define BTF_LOG_BUF_SIZE 65535
+
+#ifndef ARRAY_SIZE
+# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+static struct args {
+	unsigned int raw_test_num;
+	unsigned int file_test_num;
+	unsigned int get_info_test_num;
+	bool raw_test;
+	bool file_test;
+	bool get_info_test;
+	bool pprint_test;
+	bool always_log;
+} args;
+
+static char btf_log_buf[BTF_LOG_BUF_SIZE];
+
+static struct btf_header hdr_tmpl = {
+	.magic = BTF_MAGIC,
+	.version = BTF_VERSION,
+};
+
+struct btf_raw_test {
+	const char *descr;
+	const char *str_sec;
+	const char *map_name;
+	__u32 raw_types[MAX_NR_RAW_TYPES];
+	__u32 str_sec_size;
+	enum bpf_map_type map_type;
+	__u32 key_size;
+	__u32 value_size;
+	__u32 key_id;
+	__u32 value_id;
+	__u32 max_entries;
+	bool btf_load_err;
+	bool map_create_err;
+	int type_off_delta;
+	int str_off_delta;
+	int str_len_delta;
+};
+
+static struct btf_raw_test raw_tests[] = {
+/* enum E {
+ *     E0,
+ *     E1,
+ * };
+ *
+ * struct A {
+ *	int m;
+ *	unsigned long long n;
+ *	char o;
+ *	[3 bytes hole]
+ *	int p[8];
+ *	int q[4][8];
+ *	enum E r;
+ * };
+ */
+{
+	.descr = "struct test #1",
+	.raw_types = {
+		/* int */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),	/* [1] */
+		/* unsigned long long */
+		BTF_TYPE_INT_ENC(0, 0, 0, 64, 8),		/* [2] */
+		/* char */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 8, 1),	/* [3] */
+		/* int[8] */
+		BTF_TYPE_ARRAY_ENC(1, 1, 8),			/* [4] */
+		/* struct A { */				/* [5] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 6), 180),
+		BTF_MEMBER_ENC(NAME_TBD, 1, 0),	/* int m;		*/
+		BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* unsigned long long n;*/
+		BTF_MEMBER_ENC(NAME_TBD, 3, 96),/* char o;		*/
+		BTF_MEMBER_ENC(NAME_TBD, 4, 128),/* int p[8]		*/
+		BTF_MEMBER_ENC(NAME_TBD, 6, 384),/* int q[4][8]		*/
+		BTF_MEMBER_ENC(NAME_TBD, 7, 1408), /* enum E r		*/
+		/* } */
+		/* int[4][8] */
+		BTF_TYPE_ARRAY_ENC(4, 1, 4),			/* [6] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_ENUM, 0, 2), sizeof(int)),
+		BTF_ENUM_ENC(NAME_TBD, 0),
+		BTF_ENUM_ENC(NAME_TBD, 1),
+		BTF_END_RAW,
+	},
+	.str_sec = "\0A\0m\0n\0o\0p\0q\0r\0E\0E0\0E1",
+	.str_sec_size = sizeof("\0A\0m\0n\0o\0p\0q\0r\0E\0E0\0E1"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "struct_test1_map",
+	.key_size = sizeof(int),
+	.value_size = 180,
+	.key_id = 1,
+	.value_id = 5,
+	.max_entries = 4,
+},
+
+/* typedef struct b Struct_B;
+ *
+ * struct A {
+ *     int m;
+ *     struct b n[4];
+ *     const Struct_B o[4];
+ * };
+ *
+ * struct B {
+ *     int m;
+ *     int n;
+ * };
+ */
+{
+	.descr = "struct test #2",
+	.raw_types = {
+		/* int */					/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* struct b [4] */				/* [2] */
+		BTF_TYPE_ARRAY_ENC(4, 1, 4),
+
+		/* struct A { */				/* [3] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 3), 68),
+		BTF_MEMBER_ENC(NAME_TBD, 1, 0),	/* int m;		*/
+		BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* struct B n[4]	*/
+		BTF_MEMBER_ENC(NAME_TBD, 8, 288),/* const Struct_B o[4];*/
+		/* } */
+
+		/* struct B { */				/* [4] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), 8),
+		BTF_MEMBER_ENC(NAME_TBD, 1, 0),	/* int m; */
+		BTF_MEMBER_ENC(NAME_TBD, 1, 32),/* int n; */
+		/* } */
+
+		/* const int */					/* [5] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 1),
+		/* typedef struct b Struct_B */	/* [6] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_TYPEDEF, 0, 0), 4),
+		/* const Struct_B */				/* [7] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 6),
+		/* const Struct_B [4] */			/* [8] */
+		BTF_TYPE_ARRAY_ENC(7, 1, 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "\0A\0m\0n\0o\0B\0m\0n\0Struct_B",
+	.str_sec_size = sizeof("\0A\0m\0n\0o\0B\0m\0n\0Struct_B"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "struct_test2_map",
+	.key_size = sizeof(int),
+	.value_size = 68,
+	.key_id = 1,
+	.value_id = 3,
+	.max_entries = 4,
+},
+
+/* Test member exceeds the size of struct.
+ *
+ * struct A {
+ *     int m;
+ *     int n;
+ * };
+ */
+{
+	.descr = "size check test #1",
+	.raw_types = {
+		/* int */					/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* struct A { */				/* [2] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), sizeof(int) * 2 -  1),
+		BTF_MEMBER_ENC(NAME_TBD, 1, 0),	/* int m; */
+		BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* int n; */
+		/* } */
+		BTF_END_RAW,
+	},
+	.str_sec = "\0A\0m\0n",
+	.str_sec_size = sizeof("\0A\0m\0n"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "size_check1_map",
+	.key_size = sizeof(int),
+	.value_size = 1,
+	.key_id = 1,
+	.value_id = 2,
+	.max_entries = 4,
+	.btf_load_err = true,
+},
+
+/* Test member exeeds the size of struct
+ *
+ * struct A {
+ *     int m;
+ *     int n[2];
+ * };
+ */
+{
+	.descr = "size check test #2",
+	.raw_types = {
+		/* int */					/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, sizeof(int)),
+		/* int[2] */					/* [2] */
+		BTF_TYPE_ARRAY_ENC(1, 1, 2),
+		/* struct A { */				/* [3] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), sizeof(int) * 3 - 1),
+		BTF_MEMBER_ENC(NAME_TBD, 1, 0),	/* int m; */
+		BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* int n[2]; */
+		/* } */
+		BTF_END_RAW,
+	},
+	.str_sec = "\0A\0m\0n",
+	.str_sec_size = sizeof("\0A\0m\0n"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "size_check2_map",
+	.key_size = sizeof(int),
+	.value_size = 1,
+	.key_id = 1,
+	.value_id = 3,
+	.max_entries = 4,
+	.btf_load_err = true,
+
+},
+
+/* Test member exeeds the size of struct
+ *
+ * struct A {
+ *     int m;
+ *     void *n;
+ * };
+ */
+{
+	.descr = "size check test #3",
+	.raw_types = {
+		/* int */					/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, sizeof(int)),
+		/* void* */					/* [2] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 0),
+		/* struct A { */				/* [3] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), sizeof(int) + sizeof(void *) - 1),
+		BTF_MEMBER_ENC(NAME_TBD, 1, 0),	/* int m; */
+		BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* void *n; */
+		/* } */
+		BTF_END_RAW,
+	},
+	.str_sec = "\0A\0m\0n",
+	.str_sec_size = sizeof("\0A\0m\0n"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "size_check3_map",
+	.key_size = sizeof(int),
+	.value_size = 1,
+	.key_id = 1,
+	.value_id = 3,
+	.max_entries = 4,
+	.btf_load_err = true,
+},
+
+/* Test member exceeds the size of struct
+ *
+ * enum E {
+ *     E0,
+ *     E1,
+ * };
+ *
+ * struct A {
+ *     int m;
+ *     enum E n;
+ * };
+ */
+{
+	.descr = "size check test #4",
+	.raw_types = {
+		/* int */			/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, sizeof(int)),
+		/* enum E { */			/* [2] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_ENUM, 0, 2), sizeof(int)),
+		BTF_ENUM_ENC(NAME_TBD, 0),
+		BTF_ENUM_ENC(NAME_TBD, 1),
+		/* } */
+		/* struct A { */		/* [3] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), sizeof(int) * 2 - 1),
+		BTF_MEMBER_ENC(NAME_TBD, 1, 0),	/* int m; */
+		BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* enum E n; */
+		/* } */
+		BTF_END_RAW,
+	},
+	.str_sec = "\0E\0E0\0E1\0A\0m\0n",
+	.str_sec_size = sizeof("\0E\0E0\0E1\0A\0m\0n"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "size_check4_map",
+	.key_size = sizeof(int),
+	.value_size = 1,
+	.key_id = 1,
+	.value_id = 3,
+	.max_entries = 4,
+	.btf_load_err = true,
+},
+
+/* typedef const void * const_void_ptr;
+ * struct A {
+ *	const_void_ptr m;
+ * };
+ */
+{
+	.descr = "void test #1",
+	.raw_types = {
+		/* int */		/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* const void */	/* [2] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 0),
+		/* const void* */	/* [3] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2),
+		/* typedef const void * const_void_ptr */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 3),
+		/* struct A { */	/* [4] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), sizeof(void *)),
+		/* const_void_ptr m; */
+		BTF_MEMBER_ENC(NAME_TBD, 3, 0),
+		/* } */
+		BTF_END_RAW,
+	},
+	.str_sec = "\0const_void_ptr\0A\0m",
+	.str_sec_size = sizeof("\0const_void_ptr\0A\0m"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "void_test1_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(void *),
+	.key_id = 1,
+	.value_id = 4,
+	.max_entries = 4,
+},
+
+/* struct A {
+ *     const void m;
+ * };
+ */
+{
+	.descr = "void test #2",
+	.raw_types = {
+		/* int */		/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* const void */	/* [2] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 0),
+		/* struct A { */	/* [3] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), 8),
+		/* const void m; */
+		BTF_MEMBER_ENC(NAME_TBD, 2, 0),
+		/* } */
+		BTF_END_RAW,
+	},
+	.str_sec = "\0A\0m",
+	.str_sec_size = sizeof("\0A\0m"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "void_test2_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(void *),
+	.key_id = 1,
+	.value_id = 3,
+	.max_entries = 4,
+	.btf_load_err = true,
+},
+
+/* typedef const void * const_void_ptr;
+ * const_void_ptr[4]
+ */
+{
+	.descr = "void test #3",
+	.raw_types = {
+		/* int */		/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* const void */	/* [2] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 0),
+		/* const void* */	/* [3] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2),
+		/* typedef const void * const_void_ptr */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 3),
+		/* const_void_ptr[4] */	/* [4] */
+		BTF_TYPE_ARRAY_ENC(3, 1, 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "\0const_void_ptr",
+	.str_sec_size = sizeof("\0const_void_ptr"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "void_test3_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(void *) * 4,
+	.key_id = 1,
+	.value_id = 4,
+	.max_entries = 4,
+},
+
+/* const void[4]  */
+{
+	.descr = "void test #4",
+	.raw_types = {
+		/* int */		/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* const void */	/* [2] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 0),
+		/* const void[4] */	/* [3] */
+		BTF_TYPE_ARRAY_ENC(2, 1, 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "\0A\0m",
+	.str_sec_size = sizeof("\0A\0m"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "void_test4_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(void *) * 4,
+	.key_id = 1,
+	.value_id = 3,
+	.max_entries = 4,
+	.btf_load_err = true,
+},
+
+/* Array_A  <------------------+
+ *     elem_type == Array_B    |
+ *                    |        |
+ *                    |        |
+ * Array_B  <-------- +        |
+ *      elem_type == Array A --+
+ */
+{
+	.descr = "loop test #1",
+	.raw_types = {
+		/* int */			/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* Array_A */			/* [2] */
+		BTF_TYPE_ARRAY_ENC(3, 1, 8),
+		/* Array_B */			/* [3] */
+		BTF_TYPE_ARRAY_ENC(2, 1, 8),
+		BTF_END_RAW,
+	},
+	.str_sec = "",
+	.str_sec_size = sizeof(""),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "loop_test1_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(sizeof(int) * 8),
+	.key_id = 1,
+	.value_id = 2,
+	.max_entries = 4,
+	.btf_load_err = true,
+},
+
+/* typedef is _before_ the BTF type of Array_A and Array_B
+ *
+ * typedef Array_B int_array;
+ *
+ * Array_A  <------------------+
+ *     elem_type == int_array  |
+ *                    |        |
+ *                    |        |
+ * Array_B  <-------- +        |
+ *      elem_type == Array_A --+
+ */
+{
+	.descr = "loop test #2",
+	.raw_types = {
+		/* int */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),	/* [1] */
+		/* typedef Array_B int_array */
+		BTF_TYPEDEF_ENC(1, 4),				/* [2] */
+		/* Array_A */
+		BTF_TYPE_ARRAY_ENC(2, 1, 8),			/* [3] */
+		/* Array_B */
+		BTF_TYPE_ARRAY_ENC(3, 1, 8),			/* [4] */
+
+		BTF_END_RAW,
+	},
+	.str_sec = "\0int_array\0",
+	.str_sec_size = sizeof("\0int_array"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "loop_test2_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(sizeof(int) * 8),
+	.key_id = 1,
+	.value_id = 2,
+	.max_entries = 4,
+	.btf_load_err = true,
+},
+
+/* Array_A  <------------------+
+ *     elem_type == Array_B    |
+ *                    |        |
+ *                    |        |
+ * Array_B  <-------- +        |
+ *      elem_type == Array_A --+
+ */
+{
+	.descr = "loop test #3",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* Array_A */				/* [2] */
+		BTF_TYPE_ARRAY_ENC(3, 1, 8),
+		/* Array_B */				/* [3] */
+		BTF_TYPE_ARRAY_ENC(2, 1, 8),
+
+		BTF_END_RAW,
+	},
+	.str_sec = "",
+	.str_sec_size = sizeof(""),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "loop_test3_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(sizeof(int) * 8),
+	.key_id = 1,
+	.value_id = 2,
+	.max_entries = 4,
+	.btf_load_err = true,
+},
+
+/* typedef is _between_ the BTF type of Array_A and Array_B
+ *
+ * typedef Array_B int_array;
+ *
+ * Array_A  <------------------+
+ *     elem_type == int_array  |
+ *                    |        |
+ *                    |        |
+ * Array_B  <-------- +        |
+ *      elem_type == Array_A --+
+ */
+{
+	.descr = "loop test #4",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* Array_A */				/* [2] */
+		BTF_TYPE_ARRAY_ENC(3, 1, 8),
+		/* typedef Array_B int_array */		/* [3] */
+		BTF_TYPEDEF_ENC(NAME_TBD, 4),
+		/* Array_B */				/* [4] */
+		BTF_TYPE_ARRAY_ENC(2, 1, 8),
+		BTF_END_RAW,
+	},
+	.str_sec = "\0int_array\0",
+	.str_sec_size = sizeof("\0int_array"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "loop_test4_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(sizeof(int) * 8),
+	.key_id = 1,
+	.value_id = 2,
+	.max_entries = 4,
+	.btf_load_err = true,
+},
+
+/* typedef struct B Struct_B
+ *
+ * struct A {
+ *     int x;
+ *     Struct_B y;
+ * };
+ *
+ * struct B {
+ *     int x;
+ *     struct A y;
+ * };
+ */
+{
+	.descr = "loop test #5",
+	.raw_types = {
+		/* int */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),	/* [1] */
+		/* struct A */					/* [2] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), 8),
+		BTF_MEMBER_ENC(NAME_TBD, 1, 0),	/* int x;	*/
+		BTF_MEMBER_ENC(NAME_TBD, 3, 32),/* Struct_B y;	*/
+		/* typedef struct B Struct_B */
+		BTF_TYPEDEF_ENC(NAME_TBD, 4),			/* [3] */
+		/* struct B */					/* [4] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), 8),
+		BTF_MEMBER_ENC(NAME_TBD, 1, 0),	/* int x;	*/
+		BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* struct A y;	*/
+		BTF_END_RAW,
+	},
+	.str_sec = "\0A\0x\0y\0Struct_B\0B\0x\0y",
+	.str_sec_size = sizeof("\0A\0x\0y\0Struct_B\0B\0x\0y"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "loop_test5_map",
+	.key_size = sizeof(int),
+	.value_size = 8,
+	.key_id = 1,
+	.value_id = 2,
+	.max_entries = 4,
+	.btf_load_err = true,
+},
+
+/* struct A {
+ *     int x;
+ *     struct A array_a[4];
+ * };
+ */
+{
+	.descr = "loop test #6",
+	.raw_types = {
+		/* int */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),	/* [1] */
+		BTF_TYPE_ARRAY_ENC(3, 1, 4),			/* [2] */
+		/* struct A */					/* [3] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), 8),
+		BTF_MEMBER_ENC(NAME_TBD, 1, 0),	/* int x;		*/
+		BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* struct A array_a[4];	*/
+		BTF_END_RAW,
+	},
+	.str_sec = "\0A\0x\0y",
+	.str_sec_size = sizeof("\0A\0x\0y"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "loop_test6_map",
+	.key_size = sizeof(int),
+	.value_size = 8,
+	.key_id = 1,
+	.value_id = 2,
+	.max_entries = 4,
+	.btf_load_err = true,
+},
+
+{
+	.descr = "loop test #7",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* struct A { */			/* [2] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), sizeof(void *)),
+		/*     const void *m;	*/
+		BTF_MEMBER_ENC(NAME_TBD, 3, 0),
+		/* CONST type_id=3	*/		/* [3] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 4),
+		/* PTR type_id=2	*/		/* [4] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 3),
+		BTF_END_RAW,
+	},
+	.str_sec = "\0A\0m",
+	.str_sec_size = sizeof("\0A\0m"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "loop_test7_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(void *),
+	.key_id = 1,
+	.value_id = 2,
+	.max_entries = 4,
+	.btf_load_err = true,
+},
+
+{
+	.descr = "loop test #8",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* struct A { */			/* [2] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), sizeof(void *)),
+		/*     const void *m;	*/
+		BTF_MEMBER_ENC(NAME_TBD, 4, 0),
+		/* struct B { */			/* [3] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), sizeof(void *)),
+		/*     const void *n;	*/
+		BTF_MEMBER_ENC(NAME_TBD, 6, 0),
+		/* CONST type_id=5	*/		/* [4] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 5),
+		/* PTR type_id=6	*/		/* [5] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 6),
+		/* CONST type_id=7	*/		/* [6] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 7),
+		/* PTR type_id=4	*/		/* [7] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "\0A\0m\0B\0n",
+	.str_sec_size = sizeof("\0A\0m\0B\0n"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "loop_test8_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(void *),
+	.key_id = 1,
+	.value_id = 2,
+	.max_entries = 4,
+	.btf_load_err = true,
+},
+
+{
+	.descr = "type_off == str_off",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "\0int",
+	.str_sec_size = sizeof("\0int"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "hdr_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_id = 1,
+	.value_id = 1,
+	.max_entries = 4,
+	.btf_load_err = true,
+	.type_off_delta = sizeof(struct btf_type) + sizeof(int) + sizeof("\0int"),
+},
+
+{
+	.descr = "Unaligned type_off",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "\0int",
+	.str_sec_size = sizeof("\0int"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "hdr_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_id = 1,
+	.value_id = 1,
+	.max_entries = 4,
+	.btf_load_err = true,
+	.type_off_delta = 1,
+},
+
+{
+	.descr = "str_off beyonds btf size",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "\0int",
+	.str_sec_size = sizeof("\0int"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "hdr_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_id = 1,
+	.value_id = 1,
+	.max_entries = 4,
+	.btf_load_err = true,
+	.str_off_delta = sizeof("\0int") + 1,
+},
+
+{
+	.descr = "str_len beyonds btf size",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "\0int",
+	.str_sec_size = sizeof("\0int"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "hdr_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_id = 1,
+	.value_id = 1,
+	.max_entries = 4,
+	.btf_load_err = true,
+	.str_len_delta = 1,
+},
+
+{
+	.descr = "String section does not end with null",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "\0int",
+	.str_sec_size = sizeof("\0int"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "hdr_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_id = 1,
+	.value_id = 1,
+	.max_entries = 4,
+	.btf_load_err = true,
+	.str_len_delta = -1,
+},
+
+{
+	.descr = "Empty string section",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "\0int",
+	.str_sec_size = sizeof("\0int"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "hdr_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_id = 1,
+	.value_id = 1,
+	.max_entries = 4,
+	.btf_load_err = true,
+	.str_len_delta = 0 - (int)sizeof("\0int"),
+},
+
+}; /* struct btf_raw_test raw_tests[] */
+
+static const char *get_next_str(const char *start, const char *end)
+{
+	return start < end - 1 ? start + 1 : NULL;
+}
+
+static int get_type_sec_size(const __u32 *raw_types)
+{
+	int i;
+
+	for (i = MAX_NR_RAW_TYPES - 1;
+	     i >= 0 && raw_types[i] != BTF_END_RAW;
+	     i--)
+		;
+
+	return i < 0 ? i : i * sizeof(raw_types[0]);
+}
+
+static void *btf_raw_create(const struct btf_header *hdr,
+			    const __u32 *raw_types,
+			    const char *str,
+			    unsigned int str_sec_size,
+			    unsigned int *btf_size)
+{
+	const char *next_str = str, *end_str = str + str_sec_size;
+	unsigned int size_needed, offset;
+	struct btf_header *ret_hdr;
+	int i, type_sec_size;
+	uint32_t *ret_types;
+	void *raw_btf;
+
+	type_sec_size = get_type_sec_size(raw_types);
+	if (type_sec_size < 0) {
+		fprintf(stderr, "Cannot get nr_raw_types\n");
+		return NULL;
+	}
+
+	size_needed = sizeof(*hdr) + type_sec_size + str_sec_size;
+	raw_btf = malloc(size_needed);
+	if (!raw_btf) {
+		fprintf(stderr, "Cannot allocate memory for raw_btf\n");
+		return NULL;
+	}
+
+	/* Copy header */
+	memcpy(raw_btf, hdr, sizeof(*hdr));
+	offset = sizeof(*hdr);
+
+	/* Copy type section */
+	ret_types = raw_btf + offset;
+	for (i = 0; i < type_sec_size / sizeof(raw_types[0]); i++) {
+		if (raw_types[i] == NAME_TBD) {
+			next_str = get_next_str(next_str, end_str);
+			if (!next_str) {
+				fprintf(stderr, "Error in getting next_str\n");
+				free(raw_btf);
+				return NULL;
+			}
+			ret_types[i] = next_str - str;
+			next_str += strlen(next_str);
+		} else {
+			ret_types[i] = raw_types[i];
+		}
+	}
+	offset += type_sec_size;
+
+	/* Copy string section */
+	memcpy(raw_btf + offset, str, str_sec_size);
+
+	ret_hdr = (struct btf_header *)raw_btf;
+	ret_hdr->str_off = type_sec_size;
+	ret_hdr->str_len = str_sec_size;
+
+	*btf_size = size_needed;
+
+	return raw_btf;
+}
+
+static int do_test_raw(unsigned int test_num)
+{
+	struct btf_raw_test *test = &raw_tests[test_num - 1];
+	struct bpf_create_map_attr create_attr = {};
+	int map_fd = -1, btf_fd = -1;
+	unsigned int raw_btf_size;
+	struct btf_header *hdr;
+	void *raw_btf;
+	int err;
+
+	fprintf(stderr, "BTF raw test[%u] (%s): ", test_num, test->descr);
+	raw_btf = btf_raw_create(&hdr_tmpl,
+				 test->raw_types,
+				 test->str_sec,
+				 test->str_sec_size,
+				 &raw_btf_size);
+
+	if (!raw_btf)
+		return -1;
+
+	hdr = raw_btf;
+
+	hdr->type_off = (int)hdr->type_off + test->type_off_delta;
+	hdr->str_off = (int)hdr->str_off + test->str_off_delta;
+	hdr->str_len = (int)hdr->str_len + test->str_len_delta;
+
+	*btf_log_buf = '\0';
+	btf_fd = bpf_load_btf(raw_btf, raw_btf_size,
+			      btf_log_buf, BTF_LOG_BUF_SIZE,
+			      args.always_log);
+	free(raw_btf);
+
+	err = ((btf_fd == -1) != test->btf_load_err);
+	if (err)
+		fprintf(stderr, "btf_load_err:%d btf_fd:%d\n",
+			test->btf_load_err, btf_fd);
+
+	if (err || btf_fd == -1)
+		goto done;
+
+	create_attr.name = test->map_name;
+	create_attr.map_type = test->map_type;
+	create_attr.key_size = test->key_size;
+	create_attr.value_size = test->value_size;
+	create_attr.max_entries = test->max_entries;
+	create_attr.btf_fd = btf_fd;
+	create_attr.btf_key_id = test->key_id;
+	create_attr.btf_value_id = test->value_id;
+
+	map_fd = bpf_create_map_xattr(&create_attr);
+
+	err = ((map_fd == -1) != test->map_create_err);
+	if (err)
+		fprintf(stderr, "map_create_err:%d map_fd:%d\n",
+			test->map_create_err, map_fd);
+
+done:
+	if (!err)
+		fprintf(stderr, "OK\n");
+
+	if (*btf_log_buf && (err || args.always_log))
+		fprintf(stderr, "%s\n", btf_log_buf);
+
+	if (btf_fd != -1)
+		close(btf_fd);
+	if (map_fd != -1)
+		close(map_fd);
+
+	return err;
+}
+
+static int test_raw(void)
+{
+	unsigned int i;
+	int err = 0;
+
+	if (args.raw_test_num)
+		return do_test_raw(args.raw_test_num);
+
+	for (i = 1; i <= ARRAY_SIZE(raw_tests); i++)
+		err |= do_test_raw(i);
+
+	return err;
+}
+
+struct btf_get_info_test {
+	const char *descr;
+	const char *str_sec;
+	__u32 raw_types[MAX_NR_RAW_TYPES];
+	__u32 str_sec_size;
+	int info_size_delta;
+};
+
+const struct btf_get_info_test get_info_tests[] = {
+{
+	.descr = "== raw_btf_size+1",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "",
+	.str_sec_size = sizeof(""),
+	.info_size_delta = 1,
+},
+{
+	.descr = "== raw_btf_size-3",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "",
+	.str_sec_size = sizeof(""),
+	.info_size_delta = -3,
+},
+};
+
+static int do_test_get_info(unsigned int test_num)
+{
+	const struct btf_get_info_test *test = &get_info_tests[test_num - 1];
+	unsigned int raw_btf_size, user_btf_size, expected_nbytes;
+	uint8_t *raw_btf = NULL, *user_btf = NULL;
+	int btf_fd = -1, err;
+
+	fprintf(stderr, "BTF GET_INFO_BY_ID test[%u] (%s): ",
+		test_num, test->descr);
+
+	raw_btf = btf_raw_create(&hdr_tmpl,
+				 test->raw_types,
+				 test->str_sec,
+				 test->str_sec_size,
+				 &raw_btf_size);
+
+	if (!raw_btf)
+		return -1;
+
+	*btf_log_buf = '\0';
+
+	user_btf = malloc(raw_btf_size);
+	if (!user_btf) {
+		fprintf(stderr, "Cannot allocate memory for user_btf\n");
+		err = -1;
+		goto done;
+	}
+
+	btf_fd = bpf_load_btf(raw_btf, raw_btf_size,
+			      btf_log_buf, BTF_LOG_BUF_SIZE,
+			      args.always_log);
+	if (btf_fd == -1) {
+		fprintf(stderr, "bpf_load_btf:%s(%d)\n",
+			strerror(errno), errno);
+		err = -1;
+		goto done;
+	}
+
+	user_btf_size = (int)raw_btf_size + test->info_size_delta;
+	expected_nbytes = min(raw_btf_size, user_btf_size);
+	if (raw_btf_size > expected_nbytes)
+		memset(user_btf + expected_nbytes, 0xff,
+		       raw_btf_size - expected_nbytes);
+
+	err = bpf_obj_get_info_by_fd(btf_fd, user_btf, &user_btf_size);
+	if (err || user_btf_size != raw_btf_size ||
+	    memcmp(raw_btf, user_btf, expected_nbytes)) {
+		fprintf(stderr,
+			"err:%d(errno:%d) raw_btf_size:%u user_btf_size:%u expected_nbytes:%u memcmp:%d\n",
+			err, errno,
+			raw_btf_size, user_btf_size, expected_nbytes,
+			memcmp(raw_btf, user_btf, expected_nbytes));
+		err = -1;
+		goto done;
+	}
+
+	while (expected_nbytes < raw_btf_size) {
+		fprintf(stderr, "%u...", expected_nbytes);
+		if (user_btf[expected_nbytes++] != 0xff) {
+			fprintf(stderr, "!= 0xff\n");
+			err = -1;
+			goto done;
+		}
+	}
+
+	fprintf(stderr, "OK\n");
+
+done:
+	if (*btf_log_buf && (err || args.always_log))
+		fprintf(stderr, "%s\n", btf_log_buf);
+
+	free(raw_btf);
+	free(user_btf);
+
+	if (btf_fd != -1)
+		close(btf_fd);
+
+	return err;
+}
+
+static int test_get_info(void)
+{
+	unsigned int i;
+	int err = 0;
+
+	if (args.get_info_test_num)
+		return do_test_get_info(args.get_info_test_num);
+
+	for (i = 1; i <= ARRAY_SIZE(get_info_tests); i++)
+		err |= do_test_get_info(i);
+
+	return err;
+}
+
+struct btf_file_test {
+	const char *file;
+	bool btf_kv_notfound;
+};
+
+static struct btf_file_test file_tests[] = {
+{
+	.file = "test_btf_haskv.o",
+},
+{
+	.file = "test_btf_nokv.o",
+	.btf_kv_notfound = true,
+},
+};
+
+static int file_has_btf_elf(const char *fn)
+{
+	Elf_Scn *scn = NULL;
+	GElf_Ehdr ehdr;
+	int elf_fd;
+	Elf *elf;
+	int ret;
+
+	if (elf_version(EV_CURRENT) == EV_NONE) {
+		fprintf(stderr, "Failed to init libelf\n");
+		return -1;
+	}
+
+	elf_fd = open(fn, O_RDONLY);
+	if (elf_fd == -1) {
+		fprintf(stderr, "Cannot open file %s: %s(%d)\n",
+			fn, strerror(errno), errno);
+		return -1;
+	}
+
+	elf = elf_begin(elf_fd, ELF_C_READ, NULL);
+	if (!elf) {
+		fprintf(stderr, "Failed to read ELF from %s. %s\n", fn,
+			elf_errmsg(elf_errno()));
+		ret = -1;
+		goto done;
+	}
+
+	if (!gelf_getehdr(elf, &ehdr)) {
+		fprintf(stderr, "Failed to get EHDR from %s\n", fn);
+		ret = -1;
+		goto done;
+	}
+
+	while ((scn = elf_nextscn(elf, scn))) {
+		const char *sh_name;
+		GElf_Shdr sh;
+
+		if (gelf_getshdr(scn, &sh) != &sh) {
+			fprintf(stderr,
+				"Failed to get section header from %s\n", fn);
+			ret = -1;
+			goto done;
+		}
+
+		sh_name = elf_strptr(elf, ehdr.e_shstrndx, sh.sh_name);
+		if (!strcmp(sh_name, BTF_ELF_SEC)) {
+			ret = 1;
+			goto done;
+		}
+	}
+
+	ret = 0;
+
+done:
+	close(elf_fd);
+	elf_end(elf);
+	return ret;
+}
+
+static int do_test_file(unsigned int test_num)
+{
+	const struct btf_file_test *test = &file_tests[test_num - 1];
+	struct bpf_object *obj = NULL;
+	struct bpf_program *prog;
+	struct bpf_map *map;
+	int err;
+
+	fprintf(stderr, "BTF libbpf test[%u] (%s): ", test_num,
+		test->file);
+
+	err = file_has_btf_elf(test->file);
+	if (err == -1)
+		return err;
+
+	if (err == 0) {
+		fprintf(stderr, "SKIP. No ELF %s found\n", BTF_ELF_SEC);
+		return 0;
+	}
+
+	obj = bpf_object__open(test->file);
+	if (IS_ERR(obj))
+		return PTR_ERR(obj);
+
+	err = bpf_object__btf_fd(obj);
+	if (err == -1) {
+		fprintf(stderr, "bpf_object__btf_fd: -1\n");
+		goto done;
+	}
+
+	prog = bpf_program__next(NULL, obj);
+	if (!prog) {
+		fprintf(stderr, "Cannot find bpf_prog\n");
+		err = -1;
+		goto done;
+	}
+
+	bpf_program__set_type(prog, BPF_PROG_TYPE_TRACEPOINT);
+	err = bpf_object__load(obj);
+	if (err < 0) {
+		fprintf(stderr, "bpf_object__load: %d\n", err);
+		goto done;
+	}
+
+	map = bpf_object__find_map_by_name(obj, "btf_map");
+	if (!map) {
+		fprintf(stderr, "btf_map not found\n");
+		err = -1;
+		goto done;
+	}
+
+	err = (bpf_map__btf_key_id(map) == 0 || bpf_map__btf_value_id(map) == 0)
+		!= test->btf_kv_notfound;
+	if (err) {
+		fprintf(stderr,
+			"btf_kv_notfound:%u btf_key_id:%u btf_value_id:%u\n",
+			test->btf_kv_notfound,
+			bpf_map__btf_key_id(map),
+			bpf_map__btf_value_id(map));
+		goto done;
+	}
+
+	fprintf(stderr, "OK\n");
+
+done:
+	bpf_object__close(obj);
+	return err;
+}
+
+static int test_file(void)
+{
+	unsigned int i;
+	int err = 0;
+
+	if (args.file_test_num)
+		return do_test_file(args.file_test_num);
+
+	for (i = 1; i <= ARRAY_SIZE(file_tests); i++)
+		err |= do_test_file(i);
+
+	return err;
+}
+
+const char *pprint_enum_str[] = {
+	"ENUM_ZERO",
+	"ENUM_ONE",
+	"ENUM_TWO",
+	"ENUM_THREE",
+};
+
+struct pprint_mapv {
+	uint32_t ui32;
+	uint16_t ui16;
+	/* 2 bytes hole */
+	int32_t si32;
+	uint32_t unused_bits2a:2,
+		bits28:28,
+		unused_bits2b:2;
+	union {
+		uint64_t ui64;
+		uint8_t ui8a[8];
+	};
+	enum {
+		ENUM_ZERO,
+		ENUM_ONE,
+		ENUM_TWO,
+		ENUM_THREE,
+	} aenum;
+};
+
+static struct btf_raw_test pprint_test = {
+	.descr = "BTF pretty print test #1",
+	.raw_types = {
+		/* unsighed char */			/* [1] */
+		BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 8, 1),
+		/* unsigned short */			/* [2] */
+		BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 16, 2),
+		/* unsigned int */			/* [3] */
+		BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 32, 4),
+		/* int */				/* [4] */
+		BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+		/* unsigned long long */		/* [5] */
+		BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 64, 8),
+		/* 2 bits */				/* [6] */
+		BTF_TYPE_INT_ENC(0, 0, 0, 2, 2),
+		/* 28 bits */				/* [7] */
+		BTF_TYPE_INT_ENC(0, 0, 0, 28, 4),
+		/* uint8_t[8] */			/* [8] */
+		BTF_TYPE_ARRAY_ENC(9, 3, 8),
+		/* typedef unsigned char uint8_t */	/* [9] */
+		BTF_TYPEDEF_ENC(NAME_TBD, 1),
+		/* typedef unsigned short uint16_t */	/* [10] */
+		BTF_TYPEDEF_ENC(NAME_TBD, 2),
+		/* typedef unsigned int uint32_t */	/* [11] */
+		BTF_TYPEDEF_ENC(NAME_TBD, 3),
+		/* typedef int int32_t */		/* [12] */
+		BTF_TYPEDEF_ENC(NAME_TBD, 4),
+		/* typedef unsigned long long uint64_t *//* [13] */
+		BTF_TYPEDEF_ENC(NAME_TBD, 5),
+		/* union (anon) */			/* [14] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_UNION, 0, 2), 8),
+		BTF_MEMBER_ENC(NAME_TBD, 13, 0),/* uint64_t ui64; */
+		BTF_MEMBER_ENC(NAME_TBD, 8, 0),	/* uint8_t ui8a[8]; */
+		/* enum (anon) */			/* [15] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_ENUM, 0, 4), 4),
+		BTF_ENUM_ENC(NAME_TBD, 0),
+		BTF_ENUM_ENC(NAME_TBD, 1),
+		BTF_ENUM_ENC(NAME_TBD, 2),
+		BTF_ENUM_ENC(NAME_TBD, 3),
+		/* struct pprint_mapv */		/* [16] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 8), 28),
+		BTF_MEMBER_ENC(NAME_TBD, 11, 0),	/* uint32_t ui32 */
+		BTF_MEMBER_ENC(NAME_TBD, 10, 32),	/* uint16_t ui16 */
+		BTF_MEMBER_ENC(NAME_TBD, 12, 64),	/* int32_t si32 */
+		BTF_MEMBER_ENC(NAME_TBD, 6, 96),	/* unused_bits2a */
+		BTF_MEMBER_ENC(NAME_TBD, 7, 98),	/* bits28 */
+		BTF_MEMBER_ENC(NAME_TBD, 6, 126),	/* unused_bits2b */
+		BTF_MEMBER_ENC(0, 14, 128),		/* union (anon) */
+		BTF_MEMBER_ENC(NAME_TBD, 15, 192),	/* aenum */
+		BTF_END_RAW,
+	},
+	.str_sec = "\0unsigned char\0unsigned short\0unsigned int\0int\0unsigned long long\0uint8_t\0uint16_t\0uint32_t\0int32_t\0uint64_t\0ui64\0ui8a\0ENUM_ZERO\0ENUM_ONE\0ENUM_TWO\0ENUM_THREE\0pprint_mapv\0ui32\0ui16\0si32\0unused_bits2a\0bits28\0unused_bits2b\0aenum",
+	.str_sec_size = sizeof("\0unsigned char\0unsigned short\0unsigned int\0int\0unsigned long long\0uint8_t\0uint16_t\0uint32_t\0int32_t\0uint64_t\0ui64\0ui8a\0ENUM_ZERO\0ENUM_ONE\0ENUM_TWO\0ENUM_THREE\0pprint_mapv\0ui32\0ui16\0si32\0unused_bits2a\0bits28\0unused_bits2b\0aenum"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "pprint_test",
+	.key_size = sizeof(unsigned int),
+	.value_size = sizeof(struct pprint_mapv),
+	.key_id = 3,	/* unsigned int */
+	.value_id = 16,	/* struct pprint_mapv */
+	.max_entries = 128 * 1024,
+};
+
+static void set_pprint_mapv(struct pprint_mapv *v, uint32_t i)
+{
+	v->ui32 = i;
+	v->si32 = -i;
+	v->unused_bits2a = 3;
+	v->bits28 = i;
+	v->unused_bits2b = 3;
+	v->ui64 = i;
+	v->aenum = i & 0x03;
+}
+
+static int test_pprint(void)
+{
+	const struct btf_raw_test *test = &pprint_test;
+	struct bpf_create_map_attr create_attr = {};
+	int map_fd = -1, btf_fd = -1;
+	struct pprint_mapv mapv = {};
+	unsigned int raw_btf_size;
+	char expected_line[255];
+	FILE *pin_file = NULL;
+	char pin_path[255];
+	size_t line_len = 0;
+	char *line = NULL;
+	unsigned int key;
+	uint8_t *raw_btf;
+	ssize_t nread;
+	int err;
+
+	fprintf(stderr, "%s......", test->descr);
+	raw_btf = btf_raw_create(&hdr_tmpl, test->raw_types,
+				 test->str_sec, test->str_sec_size,
+				 &raw_btf_size);
+
+	if (!raw_btf)
+		return -1;
+
+	*btf_log_buf = '\0';
+	btf_fd = bpf_load_btf(raw_btf, raw_btf_size,
+			      btf_log_buf, BTF_LOG_BUF_SIZE,
+			      args.always_log);
+	free(raw_btf);
+
+	if (btf_fd == -1) {
+		err = -1;
+		fprintf(stderr, "bpf_load_btf: %s(%d)\n",
+			strerror(errno), errno);
+		goto done;
+	}
+
+	create_attr.name = test->map_name;
+	create_attr.map_type = test->map_type;
+	create_attr.key_size = test->key_size;
+	create_attr.value_size = test->value_size;
+	create_attr.max_entries = test->max_entries;
+	create_attr.btf_fd = btf_fd;
+	create_attr.btf_key_id = test->key_id;
+	create_attr.btf_value_id = test->value_id;
+
+	map_fd = bpf_create_map_xattr(&create_attr);
+	if (map_fd == -1) {
+		err = -1;
+		fprintf(stderr, "bpf_creat_map_btf: %s(%d)\n",
+			strerror(errno), errno);
+		goto done;
+	}
+
+	if (snprintf(pin_path, sizeof(pin_path), "%s/%s",
+		     "/sys/fs/bpf", test->map_name) == sizeof(pin_path)) {
+		err = -1;
+		fprintf(stderr, "pin_path is too long\n");
+		goto done;
+	}
+
+	err = bpf_obj_pin(map_fd, pin_path);
+	if (err) {
+		fprintf(stderr, "Cannot pin to %s. %s(%d).\n", pin_path,
+			strerror(errno), errno);
+		goto done;
+	}
+
+	for (key = 0; key < test->max_entries; key++) {
+		set_pprint_mapv(&mapv, key);
+		bpf_map_update_elem(map_fd, &key, &mapv, 0);
+	}
+
+	pin_file = fopen(pin_path, "r");
+	if (!pin_file) {
+		err = -1;
+		fprintf(stderr, "fopen(%s): %s(%d)\n", pin_path,
+			strerror(errno), errno);
+		goto done;
+	}
+
+	/* Skip lines start with '#' */
+	while ((nread = getline(&line, &line_len, pin_file)) > 0 &&
+	       *line == '#')
+		;
+
+	if (nread <= 0) {
+		err = -1;
+		fprintf(stderr, "Unexpected EOF\n");
+		goto done;
+	}
+
+	key = 0;
+	do {
+		ssize_t nexpected_line;
+
+		set_pprint_mapv(&mapv, key);
+		nexpected_line = snprintf(expected_line, sizeof(expected_line),
+					  "%u: {%u,0,%d,0x%x,0x%x,0x%x,{%lu|[%u,%u,%u,%u,%u,%u,%u,%u]},%s}\n",
+					  key,
+					  mapv.ui32, mapv.si32,
+					  mapv.unused_bits2a, mapv.bits28, mapv.unused_bits2b,
+					  mapv.ui64,
+					  mapv.ui8a[0], mapv.ui8a[1], mapv.ui8a[2], mapv.ui8a[3],
+					  mapv.ui8a[4], mapv.ui8a[5], mapv.ui8a[6], mapv.ui8a[7],
+					  pprint_enum_str[mapv.aenum]);
+
+		if (nexpected_line == sizeof(expected_line)) {
+			err = -1;
+			fprintf(stderr, "expected_line is too long\n");
+			goto done;
+		}
+
+		if (strcmp(expected_line, line)) {
+			err = -1;
+			fprintf(stderr, "unexpected pprint output\n");
+			fprintf(stderr, "expected: %s", expected_line);
+			fprintf(stderr, "    read: %s", line);
+			goto done;
+		}
+
+		nread = getline(&line, &line_len, pin_file);
+	} while (++key < test->max_entries && nread > 0);
+
+	if (key < test->max_entries) {
+		err = -1;
+		fprintf(stderr, "Unexpected EOF\n");
+		goto done;
+	}
+
+	if (nread > 0) {
+		err = -1;
+		fprintf(stderr, "Unexpected extra pprint output: %s\n", line);
+		goto done;
+	}
+
+	err = 0;
+
+done:
+	if (!err)
+		fprintf(stderr, "OK\n");
+	if (*btf_log_buf && (err || args.always_log))
+		fprintf(stderr, "%s\n", btf_log_buf);
+	if (btf_fd != -1)
+		close(btf_fd);
+	if (map_fd != -1)
+		close(map_fd);
+	if (pin_file)
+		fclose(pin_file);
+	unlink(pin_path);
+	free(line);
+
+	return err;
+}
+
+static void usage(const char *cmd)
+{
+	fprintf(stderr, "Usage: %s [-l] [[-r test_num (1 - %zu)] | [-g test_num (1 - %zu)] | [-f test_num (1 - %zu)] | [-p]]\n",
+		cmd, ARRAY_SIZE(raw_tests), ARRAY_SIZE(get_info_tests),
+		ARRAY_SIZE(file_tests));
+}
+
+static int parse_args(int argc, char **argv)
+{
+	const char *optstr = "lpf:r:g:";
+	int opt;
+
+	while ((opt = getopt(argc, argv, optstr)) != -1) {
+		switch (opt) {
+		case 'l':
+			args.always_log = true;
+			break;
+		case 'f':
+			args.file_test_num = atoi(optarg);
+			args.file_test = true;
+			break;
+		case 'r':
+			args.raw_test_num = atoi(optarg);
+			args.raw_test = true;
+			break;
+		case 'g':
+			args.get_info_test_num = atoi(optarg);
+			args.get_info_test = true;
+			break;
+		case 'p':
+			args.pprint_test = true;
+			break;
+		case 'h':
+			usage(argv[0]);
+			exit(0);
+		default:
+				usage(argv[0]);
+				return -1;
+		}
+	}
+
+	if (args.raw_test_num &&
+	    (args.raw_test_num < 1 ||
+	     args.raw_test_num > ARRAY_SIZE(raw_tests))) {
+		fprintf(stderr, "BTF raw test number must be [1 - %zu]\n",
+			ARRAY_SIZE(raw_tests));
+		return -1;
+	}
+
+	if (args.file_test_num &&
+	    (args.file_test_num < 1 ||
+	     args.file_test_num > ARRAY_SIZE(file_tests))) {
+		fprintf(stderr, "BTF file test number must be [1 - %zu]\n",
+			ARRAY_SIZE(file_tests));
+		return -1;
+	}
+
+	if (args.get_info_test_num &&
+	    (args.get_info_test_num < 1 ||
+	     args.get_info_test_num > ARRAY_SIZE(get_info_tests))) {
+		fprintf(stderr, "BTF get info test number must be [1 - %zu]\n",
+			ARRAY_SIZE(get_info_tests));
+		return -1;
+	}
+
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	int err = 0;
+
+	err = parse_args(argc, argv);
+	if (err)
+		return err;
+
+	if (args.always_log)
+		libbpf_set_print(__base_pr, __base_pr, __base_pr);
+
+	if (args.raw_test)
+		err |= test_raw();
+
+	if (args.get_info_test)
+		err |= test_get_info();
+
+	if (args.file_test)
+		err |= test_file();
+
+	if (args.pprint_test)
+		err |= test_pprint();
+
+	if (args.raw_test || args.get_info_test || args.file_test ||
+	    args.pprint_test)
+		return err;
+
+	err |= test_raw();
+	err |= test_get_info();
+	err |= test_file();
+
+	return err;
+}
diff --git a/tools/testing/selftests/bpf/test_btf_haskv.c b/tools/testing/selftests/bpf/test_btf_haskv.c
new file mode 100644
index 000000000000..8c7ca096ecf2
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_btf_haskv.c
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Facebook */
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+int _version SEC("version") = 1;
+
+struct ipv_counts {
+	unsigned int v4;
+	unsigned int v6;
+};
+
+typedef int btf_map_key;
+typedef struct ipv_counts btf_map_value;
+btf_map_key dumm_key;
+btf_map_value dummy_value;
+
+struct bpf_map_def SEC("maps") btf_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(struct ipv_counts),
+	.max_entries = 4,
+};
+
+struct dummy_tracepoint_args {
+	unsigned long long pad;
+	struct sock *sock;
+};
+
+SEC("dummy_tracepoint")
+int _dummy_tracepoint(struct dummy_tracepoint_args *arg)
+{
+	struct ipv_counts *counts;
+	int key = 0;
+
+	if (!arg->sock)
+		return 0;
+
+	counts = bpf_map_lookup_elem(&btf_map, &key);
+	if (!counts)
+		return 0;
+
+	counts->v6++;
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_btf_nokv.c b/tools/testing/selftests/bpf/test_btf_nokv.c
new file mode 100644
index 000000000000..0ed8e088eebf
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_btf_nokv.c
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Facebook */
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+int _version SEC("version") = 1;
+
+struct ipv_counts {
+	unsigned int v4;
+	unsigned int v6;
+};
+
+struct bpf_map_def SEC("maps") btf_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(struct ipv_counts),
+	.max_entries = 4,
+};
+
+struct dummy_tracepoint_args {
+	unsigned long long pad;
+	struct sock *sock;
+};
+
+SEC("dummy_tracepoint")
+int _dummy_tracepoint(struct dummy_tracepoint_args *arg)
+{
+	struct ipv_counts *counts;
+	int key = 0;
+
+	if (!arg->sock)
+		return 0;
+
+	counts = bpf_map_lookup_elem(&btf_map, &key);
+	if (!counts)
+		return 0;
+
+	counts->v6++;
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3-59-g8ed1b


From 712e5a5ce774a1542a09687441be7be456fe8b62 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Thu, 19 Apr 2018 01:02:49 +0200
Subject: net: phy_ mdio-gpio: Fixup , which should be ;

Seems like an old typ0.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/mdio-gpio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c
index 4333c6e14742..369f5f35d6fd 100644
--- a/drivers/net/phy/mdio-gpio.c
+++ b/drivers/net/phy/mdio-gpio.c
@@ -161,7 +161,7 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev,
 	if (!new_bus)
 		goto out;
 
-	new_bus->name = "GPIO Bitbanged MDIO",
+	new_bus->name = "GPIO Bitbanged MDIO";
 
 	new_bus->phy_mask = pdata->phy_mask;
 	new_bus->phy_ignore_ta_mask = pdata->phy_ignore_ta_mask;
-- 
cgit v1.2.3-59-g8ed1b


From 9e4d60938a2b2aae3c2c213c923d9eb8d0a87ba2 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Thu, 19 Apr 2018 01:02:50 +0200
Subject: net: phy: mdio-gpio: Remove reset function

The platform data can contain a function to call to reset
the bit banging interface. It is not used, so remove it.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/mdio-gpio.c             | 1 -
 include/linux/platform_data/mdio-gpio.h | 2 --
 2 files changed, 3 deletions(-)

diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c
index 369f5f35d6fd..570b87b82abc 100644
--- a/drivers/net/phy/mdio-gpio.c
+++ b/drivers/net/phy/mdio-gpio.c
@@ -141,7 +141,6 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev,
 		goto out;
 
 	bitbang->ctrl.ops = &mdio_gpio_ops;
-	bitbang->ctrl.reset = pdata->reset;
 	mdc = pdata->mdc;
 	bitbang->mdc = gpio_to_desc(mdc);
 	if (pdata->mdc_active_low)
diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h
index 11f00cdabe3d..6e8f01a570f2 100644
--- a/include/linux/platform_data/mdio-gpio.h
+++ b/include/linux/platform_data/mdio-gpio.h
@@ -26,8 +26,6 @@ struct mdio_gpio_platform_data {
 	u32 phy_mask;
 	u32 phy_ignore_ta_mask;
 	int irqs[PHY_MAX_ADDR];
-	/* reset callback */
-	int (*reset)(struct mii_bus *bus);
 };
 
 #endif /* __LINUX_MDIO_GPIO_H */
-- 
cgit v1.2.3-59-g8ed1b


From a3283e2576736463e07ddc684efb2e587a3bbda2 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Thu, 19 Apr 2018 01:02:51 +0200
Subject: net: phy: mdio-bitbang: Remove reset support

The mdio-gpio driver was the only user of the interface reset option.
Since it no longer uses it, remove it from the bit banging code.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/mdio-bitbang.c | 9 ---------
 include/linux/mdio-bitbang.h   | 2 --
 2 files changed, 11 deletions(-)

diff --git a/drivers/net/phy/mdio-bitbang.c b/drivers/net/phy/mdio-bitbang.c
index 403b085f0a89..15352f987bdf 100644
--- a/drivers/net/phy/mdio-bitbang.c
+++ b/drivers/net/phy/mdio-bitbang.c
@@ -205,14 +205,6 @@ static int mdiobb_write(struct mii_bus *bus, int phy, int reg, u16 val)
 	return 0;
 }
 
-static int mdiobb_reset(struct mii_bus *bus)
-{
-	struct mdiobb_ctrl *ctrl = bus->priv;
-	if (ctrl->reset)
-		ctrl->reset(bus);
-	return 0;
-}
-
 struct mii_bus *alloc_mdio_bitbang(struct mdiobb_ctrl *ctrl)
 {
 	struct mii_bus *bus;
@@ -225,7 +217,6 @@ struct mii_bus *alloc_mdio_bitbang(struct mdiobb_ctrl *ctrl)
 
 	bus->read = mdiobb_read;
 	bus->write = mdiobb_write;
-	bus->reset = mdiobb_reset;
 	bus->priv = ctrl;
 
 	return bus;
diff --git a/include/linux/mdio-bitbang.h b/include/linux/mdio-bitbang.h
index a8ac9cfa014c..5d71e8a8500f 100644
--- a/include/linux/mdio-bitbang.h
+++ b/include/linux/mdio-bitbang.h
@@ -33,8 +33,6 @@ struct mdiobb_ops {
 
 struct mdiobb_ctrl {
 	const struct mdiobb_ops *ops;
-	/* reset callback */
-	int (*reset)(struct mii_bus *bus);
 };
 
 /* The returned bus is not yet registered with the phy layer. */
-- 
cgit v1.2.3-59-g8ed1b


From c1b3eb04682f80af74572aa25601dbb555c6bc6e Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Thu, 19 Apr 2018 01:02:52 +0200
Subject: net: phy: mdio-gpio: remove support for ignoring turn around

This is not needed any more by devices using platform data, so remove
it.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/mdio-gpio.c             | 1 -
 include/linux/platform_data/mdio-gpio.h | 1 -
 2 files changed, 2 deletions(-)

diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c
index 570b87b82abc..28ad9ca7b9e7 100644
--- a/drivers/net/phy/mdio-gpio.c
+++ b/drivers/net/phy/mdio-gpio.c
@@ -163,7 +163,6 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev,
 	new_bus->name = "GPIO Bitbanged MDIO";
 
 	new_bus->phy_mask = pdata->phy_mask;
-	new_bus->phy_ignore_ta_mask = pdata->phy_ignore_ta_mask;
 	memcpy(new_bus->irq, pdata->irqs, sizeof(new_bus->irq));
 	new_bus->parent = dev;
 
diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h
index 6e8f01a570f2..b8f914a30126 100644
--- a/include/linux/platform_data/mdio-gpio.h
+++ b/include/linux/platform_data/mdio-gpio.h
@@ -24,7 +24,6 @@ struct mdio_gpio_platform_data {
 	bool mdo_active_low;
 
 	u32 phy_mask;
-	u32 phy_ignore_ta_mask;
 	int irqs[PHY_MAX_ADDR];
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 185a16b60a239f9db3fe8b8ce931a2fd61330853 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Thu, 19 Apr 2018 01:02:53 +0200
Subject: net: phy: mdio-gpio: remove support for phy mask

This is not needed any more by devices using platform data, so remove
it.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/mdio-gpio.c             | 4 ----
 include/linux/platform_data/mdio-gpio.h | 1 -
 2 files changed, 5 deletions(-)

diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c
index 28ad9ca7b9e7..676ba0dd04be 100644
--- a/drivers/net/phy/mdio-gpio.c
+++ b/drivers/net/phy/mdio-gpio.c
@@ -162,13 +162,9 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev,
 
 	new_bus->name = "GPIO Bitbanged MDIO";
 
-	new_bus->phy_mask = pdata->phy_mask;
 	memcpy(new_bus->irq, pdata->irqs, sizeof(new_bus->irq));
 	new_bus->parent = dev;
 
-	if (new_bus->phy_mask == ~0)
-		goto out_free_bus;
-
 	for (i = 0; i < PHY_MAX_ADDR; i++)
 		if (!new_bus->irq[i])
 			new_bus->irq[i] = PHY_POLL;
diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h
index b8f914a30126..7d55dfef56dc 100644
--- a/include/linux/platform_data/mdio-gpio.h
+++ b/include/linux/platform_data/mdio-gpio.h
@@ -23,7 +23,6 @@ struct mdio_gpio_platform_data {
 	bool mdio_active_low;
 	bool mdo_active_low;
 
-	u32 phy_mask;
 	int irqs[PHY_MAX_ADDR];
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 68abb4f25d7eaf4d5f40108aa748621ddab68209 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Thu, 19 Apr 2018 01:02:54 +0200
Subject: net: phy: mdio-gpio: Remove support for IRQs in platform data

No current devices use IRQs in platform data, so remove support for
it. The MDIO core will also initialise the new bus such that all
addresses are polled, so remove the unneeded re-initialisation.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/mdio-gpio.c             | 7 -------
 include/linux/platform_data/mdio-gpio.h | 2 --
 2 files changed, 9 deletions(-)

diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c
index 676ba0dd04be..0f8c748c8edd 100644
--- a/drivers/net/phy/mdio-gpio.c
+++ b/drivers/net/phy/mdio-gpio.c
@@ -130,7 +130,6 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev,
 {
 	struct mii_bus *new_bus;
 	struct mdio_gpio_info *bitbang;
-	int i;
 	int mdc, mdio, mdo;
 	unsigned long mdc_flags = GPIOF_OUT_INIT_LOW;
 	unsigned long mdio_flags = GPIOF_DIR_IN;
@@ -161,14 +160,8 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev,
 		goto out;
 
 	new_bus->name = "GPIO Bitbanged MDIO";
-
-	memcpy(new_bus->irq, pdata->irqs, sizeof(new_bus->irq));
 	new_bus->parent = dev;
 
-	for (i = 0; i < PHY_MAX_ADDR; i++)
-		if (!new_bus->irq[i])
-			new_bus->irq[i] = PHY_POLL;
-
 	if (bus_id != -1)
 		snprintf(new_bus->id, MII_BUS_ID_SIZE, "gpio-%x", bus_id);
 	else
diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h
index 7d55dfef56dc..af3be0c4ff9b 100644
--- a/include/linux/platform_data/mdio-gpio.h
+++ b/include/linux/platform_data/mdio-gpio.h
@@ -22,8 +22,6 @@ struct mdio_gpio_platform_data {
 	bool mdc_active_low;
 	bool mdio_active_low;
 	bool mdo_active_low;
-
-	int irqs[PHY_MAX_ADDR];
 };
 
 #endif /* __LINUX_MDIO_GPIO_H */
-- 
cgit v1.2.3-59-g8ed1b


From c82fc4814a93f36c9b536b5af8dd617e4ee1380f Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Thu, 19 Apr 2018 01:02:55 +0200
Subject: net: phy: mdio-gpio: Swap to using gpio descriptors

This simplifies the code, removing the need to handle active low
flags, etc.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/mdio-gpio.c             | 73 ++++++++-------------------------
 include/linux/platform_data/mdio-gpio.h | 10 ++---
 2 files changed, 20 insertions(+), 63 deletions(-)

diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c
index 0f8c748c8edd..b999f32374e2 100644
--- a/drivers/net/phy/mdio-gpio.c
+++ b/drivers/net/phy/mdio-gpio.c
@@ -35,35 +35,25 @@ struct mdio_gpio_info {
 	struct gpio_desc *mdc, *mdio, *mdo;
 };
 
-static void *mdio_gpio_of_get_data(struct platform_device *pdev)
+static void *mdio_gpio_of_get_data(struct device *dev)
 {
-	struct device_node *np = pdev->dev.of_node;
 	struct mdio_gpio_platform_data *pdata;
-	enum of_gpio_flags flags;
-	int ret;
 
-	pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
+	pdata = devm_kzalloc(dev, sizeof(*pdata), GFP_KERNEL);
 	if (!pdata)
 		return NULL;
 
-	ret = of_get_gpio_flags(np, 0, &flags);
-	if (ret < 0)
-		return NULL;
-
-	pdata->mdc = ret;
-	pdata->mdc_active_low = flags & OF_GPIO_ACTIVE_LOW;
+	pdata->mdc = devm_gpiod_get_index(dev, NULL, 0, GPIOD_OUT_LOW);
+	if (IS_ERR(pdata->mdc))
+		return ERR_CAST(pdata->mdc);
 
-	ret = of_get_gpio_flags(np, 1, &flags);
-	if (ret < 0)
-		return NULL;
-	pdata->mdio = ret;
-	pdata->mdio_active_low = flags & OF_GPIO_ACTIVE_LOW;
+	pdata->mdio = devm_gpiod_get_index(dev, NULL, 1, GPIOD_IN);
+	if (IS_ERR(pdata->mdio))
+		return ERR_CAST(pdata->mdio);
 
-	ret = of_get_gpio_flags(np, 2, &flags);
-	if (ret > 0) {
-		pdata->mdo = ret;
-		pdata->mdo_active_low = flags & OF_GPIO_ACTIVE_LOW;
-	}
+	pdata->mdo = devm_gpiod_get_index_optional(dev, NULL, 2, GPIOD_OUT_LOW);
+	if (IS_ERR(pdata->mdo))
+		return ERR_CAST(pdata->mdo);
 
 	return pdata;
 }
@@ -130,34 +120,19 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev,
 {
 	struct mii_bus *new_bus;
 	struct mdio_gpio_info *bitbang;
-	int mdc, mdio, mdo;
-	unsigned long mdc_flags = GPIOF_OUT_INIT_LOW;
-	unsigned long mdio_flags = GPIOF_DIR_IN;
-	unsigned long mdo_flags = GPIOF_OUT_INIT_HIGH;
 
 	bitbang = devm_kzalloc(dev, sizeof(*bitbang), GFP_KERNEL);
 	if (!bitbang)
-		goto out;
+		return NULL;
 
 	bitbang->ctrl.ops = &mdio_gpio_ops;
-	mdc = pdata->mdc;
-	bitbang->mdc = gpio_to_desc(mdc);
-	if (pdata->mdc_active_low)
-		mdc_flags = GPIOF_OUT_INIT_HIGH | GPIOF_ACTIVE_LOW;
-	mdio = pdata->mdio;
-	bitbang->mdio = gpio_to_desc(mdio);
-	if (pdata->mdio_active_low)
-		mdio_flags |= GPIOF_ACTIVE_LOW;
-	mdo = pdata->mdo;
-	if (mdo) {
-		bitbang->mdo = gpio_to_desc(mdo);
-		if (pdata->mdo_active_low)
-			mdo_flags = GPIOF_OUT_INIT_LOW | GPIOF_ACTIVE_LOW;
-	}
+	bitbang->mdc = pdata->mdc;
+	bitbang->mdio = pdata->mdio;
+	bitbang->mdo = pdata->mdo;
 
 	new_bus = alloc_mdio_bitbang(&bitbang->ctrl);
 	if (!new_bus)
-		goto out;
+		return NULL;
 
 	new_bus->name = "GPIO Bitbanged MDIO";
 	new_bus->parent = dev;
@@ -167,23 +142,9 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev,
 	else
 		strncpy(new_bus->id, "gpio", MII_BUS_ID_SIZE);
 
-	if (devm_gpio_request_one(dev, mdc, mdc_flags, "mdc"))
-		goto out_free_bus;
-
-	if (devm_gpio_request_one(dev, mdio, mdio_flags, "mdio"))
-		goto out_free_bus;
-
-	if (mdo && devm_gpio_request_one(dev, mdo, mdo_flags, "mdo"))
-		goto out_free_bus;
-
 	dev_set_drvdata(dev, new_bus);
 
 	return new_bus;
-
-out_free_bus:
-	free_mdio_bitbang(new_bus);
-out:
-	return NULL;
 }
 
 static void mdio_gpio_bus_deinit(struct device *dev)
@@ -208,7 +169,7 @@ static int mdio_gpio_probe(struct platform_device *pdev)
 	int ret, bus_id;
 
 	if (pdev->dev.of_node) {
-		pdata = mdio_gpio_of_get_data(pdev);
+		pdata = mdio_gpio_of_get_data(&pdev->dev);
 		bus_id = of_alias_get_id(pdev->dev.of_node, "mdio-gpio");
 		if (bus_id < 0) {
 			dev_warn(&pdev->dev, "failed to get alias id\n");
diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h
index af3be0c4ff9b..bd91fa98a3aa 100644
--- a/include/linux/platform_data/mdio-gpio.h
+++ b/include/linux/platform_data/mdio-gpio.h
@@ -15,13 +15,9 @@
 
 struct mdio_gpio_platform_data {
 	/* GPIO numbers for bus pins */
-	unsigned int mdc;
-	unsigned int mdio;
-	unsigned int mdo;
-
-	bool mdc_active_low;
-	bool mdio_active_low;
-	bool mdo_active_low;
+	struct gpio_desc *mdc;
+	struct gpio_desc *mdio;
+	struct gpio_desc *mdo;
 };
 
 #endif /* __LINUX_MDIO_GPIO_H */
-- 
cgit v1.2.3-59-g8ed1b


From fb766421d881a849c80c7cb80e8ccbbddeb55d9d Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Thu, 19 Apr 2018 01:02:56 +0200
Subject: net: phy: mdio-gpio: Move allocation for bitbanging data

Moving the allocation of this structure to the probe function is a
step towards making it the core data structure of the driver.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/mdio-gpio.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c
index b999f32374e2..bb57a9e89a18 100644
--- a/drivers/net/phy/mdio-gpio.c
+++ b/drivers/net/phy/mdio-gpio.c
@@ -115,15 +115,11 @@ static const struct mdiobb_ops mdio_gpio_ops = {
 };
 
 static struct mii_bus *mdio_gpio_bus_init(struct device *dev,
+					  struct mdio_gpio_info *bitbang,
 					  struct mdio_gpio_platform_data *pdata,
 					  int bus_id)
 {
 	struct mii_bus *new_bus;
-	struct mdio_gpio_info *bitbang;
-
-	bitbang = devm_kzalloc(dev, sizeof(*bitbang), GFP_KERNEL);
-	if (!bitbang)
-		return NULL;
 
 	bitbang->ctrl.ops = &mdio_gpio_ops;
 	bitbang->mdc = pdata->mdc;
@@ -165,9 +161,14 @@ static void mdio_gpio_bus_destroy(struct device *dev)
 static int mdio_gpio_probe(struct platform_device *pdev)
 {
 	struct mdio_gpio_platform_data *pdata;
+	struct mdio_gpio_info *bitbang;
 	struct mii_bus *new_bus;
 	int ret, bus_id;
 
+	bitbang = devm_kzalloc(&pdev->dev, sizeof(*bitbang), GFP_KERNEL);
+	if (!bitbang)
+		return -ENOMEM;
+
 	if (pdev->dev.of_node) {
 		pdata = mdio_gpio_of_get_data(&pdev->dev);
 		bus_id = of_alias_get_id(pdev->dev.of_node, "mdio-gpio");
@@ -183,7 +184,7 @@ static int mdio_gpio_probe(struct platform_device *pdev)
 	if (!pdata)
 		return -ENODEV;
 
-	new_bus = mdio_gpio_bus_init(&pdev->dev, pdata, bus_id);
+	new_bus = mdio_gpio_bus_init(&pdev->dev, bitbang, pdata, bus_id);
 	if (!new_bus)
 		return -ENODEV;
 
-- 
cgit v1.2.3-59-g8ed1b


From 4029ea3a8625bc699210623a106e887c2761fead Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Thu, 19 Apr 2018 01:02:57 +0200
Subject: net: phy: mdio-gpio: Parse properties directly into bitbang structure

The same parsing code can be used for both OF and platform devices, if
the platform device uses a gpiod_lookup_table. Parse these properties
directly into the bitbang structure, rather than use an intermediate
platform data structure.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/mdio-gpio.c | 45 ++++++++++++++++-----------------------------
 1 file changed, 16 insertions(+), 29 deletions(-)

diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c
index bb57a9e89a18..74b982835ac1 100644
--- a/drivers/net/phy/mdio-gpio.c
+++ b/drivers/net/phy/mdio-gpio.c
@@ -35,27 +35,20 @@ struct mdio_gpio_info {
 	struct gpio_desc *mdc, *mdio, *mdo;
 };
 
-static void *mdio_gpio_of_get_data(struct device *dev)
+static int mdio_gpio_get_data(struct device *dev,
+			      struct mdio_gpio_info *bitbang)
 {
-	struct mdio_gpio_platform_data *pdata;
+	bitbang->mdc = devm_gpiod_get_index(dev, NULL, 0, GPIOD_OUT_LOW);
+	if (IS_ERR(bitbang->mdc))
+		return PTR_ERR(bitbang->mdc);
 
-	pdata = devm_kzalloc(dev, sizeof(*pdata), GFP_KERNEL);
-	if (!pdata)
-		return NULL;
-
-	pdata->mdc = devm_gpiod_get_index(dev, NULL, 0, GPIOD_OUT_LOW);
-	if (IS_ERR(pdata->mdc))
-		return ERR_CAST(pdata->mdc);
-
-	pdata->mdio = devm_gpiod_get_index(dev, NULL, 1, GPIOD_IN);
-	if (IS_ERR(pdata->mdio))
-		return ERR_CAST(pdata->mdio);
-
-	pdata->mdo = devm_gpiod_get_index_optional(dev, NULL, 2, GPIOD_OUT_LOW);
-	if (IS_ERR(pdata->mdo))
-		return ERR_CAST(pdata->mdo);
+	bitbang->mdio = devm_gpiod_get_index(dev, NULL, 1, GPIOD_IN);
+	if (IS_ERR(bitbang->mdio))
+		return PTR_ERR(bitbang->mdio);
 
-	return pdata;
+	bitbang->mdo = devm_gpiod_get_index_optional(dev, NULL, 2,
+						     GPIOD_OUT_LOW);
+	return PTR_ERR_OR_ZERO(bitbang->mdo);
 }
 
 static void mdio_dir(struct mdiobb_ctrl *ctrl, int dir)
@@ -116,15 +109,11 @@ static const struct mdiobb_ops mdio_gpio_ops = {
 
 static struct mii_bus *mdio_gpio_bus_init(struct device *dev,
 					  struct mdio_gpio_info *bitbang,
-					  struct mdio_gpio_platform_data *pdata,
 					  int bus_id)
 {
 	struct mii_bus *new_bus;
 
 	bitbang->ctrl.ops = &mdio_gpio_ops;
-	bitbang->mdc = pdata->mdc;
-	bitbang->mdio = pdata->mdio;
-	bitbang->mdo = pdata->mdo;
 
 	new_bus = alloc_mdio_bitbang(&bitbang->ctrl);
 	if (!new_bus)
@@ -160,7 +149,6 @@ static void mdio_gpio_bus_destroy(struct device *dev)
 
 static int mdio_gpio_probe(struct platform_device *pdev)
 {
-	struct mdio_gpio_platform_data *pdata;
 	struct mdio_gpio_info *bitbang;
 	struct mii_bus *new_bus;
 	int ret, bus_id;
@@ -169,22 +157,21 @@ static int mdio_gpio_probe(struct platform_device *pdev)
 	if (!bitbang)
 		return -ENOMEM;
 
+	ret = mdio_gpio_get_data(&pdev->dev, bitbang);
+	if (ret)
+		return ret;
+
 	if (pdev->dev.of_node) {
-		pdata = mdio_gpio_of_get_data(&pdev->dev);
 		bus_id = of_alias_get_id(pdev->dev.of_node, "mdio-gpio");
 		if (bus_id < 0) {
 			dev_warn(&pdev->dev, "failed to get alias id\n");
 			bus_id = 0;
 		}
 	} else {
-		pdata = dev_get_platdata(&pdev->dev);
 		bus_id = pdev->id;
 	}
 
-	if (!pdata)
-		return -ENODEV;
-
-	new_bus = mdio_gpio_bus_init(&pdev->dev, bitbang, pdata, bus_id);
+	new_bus = mdio_gpio_bus_init(&pdev->dev, bitbang, bus_id);
 	if (!new_bus)
 		return -ENODEV;
 
-- 
cgit v1.2.3-59-g8ed1b


From fb78a95e22123da57cb79b98bdc62eb33965ca8a Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Thu, 19 Apr 2018 01:02:58 +0200
Subject: net: phy: mdio-gpio: Add #defines for the GPIO index's

The GPIOs are described in device tree using a list, without names.
Add defines to indicate what each index in the list means. These
defines should also be used by platform devices passing GPIOs via a
GPIO lookup table.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/mdio-gpio.c | 10 +++++++---
 include/linux/mdio-gpio.h   |  9 +++++++++
 2 files changed, 16 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/mdio-gpio.h

diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c
index 74b982835ac1..281c905ef9fd 100644
--- a/drivers/net/phy/mdio-gpio.c
+++ b/drivers/net/phy/mdio-gpio.c
@@ -24,6 +24,8 @@
 #include <linux/slab.h>
 #include <linux/interrupt.h>
 #include <linux/platform_device.h>
+#include <linux/mdio-bitbang.h>
+#include <linux/mdio-gpio.h>
 #include <linux/gpio.h>
 #include <linux/platform_data/mdio-gpio.h>
 
@@ -38,15 +40,17 @@ struct mdio_gpio_info {
 static int mdio_gpio_get_data(struct device *dev,
 			      struct mdio_gpio_info *bitbang)
 {
-	bitbang->mdc = devm_gpiod_get_index(dev, NULL, 0, GPIOD_OUT_LOW);
+	bitbang->mdc = devm_gpiod_get_index(dev, NULL, MDIO_GPIO_MDC,
+					    GPIOD_OUT_LOW);
 	if (IS_ERR(bitbang->mdc))
 		return PTR_ERR(bitbang->mdc);
 
-	bitbang->mdio = devm_gpiod_get_index(dev, NULL, 1, GPIOD_IN);
+	bitbang->mdio = devm_gpiod_get_index(dev, NULL, MDIO_GPIO_MDIO,
+					     GPIOD_IN);
 	if (IS_ERR(bitbang->mdio))
 		return PTR_ERR(bitbang->mdio);
 
-	bitbang->mdo = devm_gpiod_get_index_optional(dev, NULL, 2,
+	bitbang->mdo = devm_gpiod_get_index_optional(dev, NULL, MDIO_GPIO_MDO,
 						     GPIOD_OUT_LOW);
 	return PTR_ERR_OR_ZERO(bitbang->mdo);
 }
diff --git a/include/linux/mdio-gpio.h b/include/linux/mdio-gpio.h
new file mode 100644
index 000000000000..cea443a672cb
--- /dev/null
+++ b/include/linux/mdio-gpio.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_MDIO_GPIO_H
+#define __LINUX_MDIO_GPIO_H
+
+#define MDIO_GPIO_MDC	0
+#define MDIO_GPIO_MDIO	1
+#define MDIO_GPIO_MDO	2
+
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From 0207dd1173fe31c153ffd439c4bb33d1341829b1 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Thu, 19 Apr 2018 01:02:59 +0200
Subject: net: phy: mdio-gpio: Remove redundant platform data header

The platform data header file is now unused. Remove it, but add
an extra include which it brought in.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS                             |  1 -
 drivers/net/phy/mdio-gpio.c             |  2 +-
 include/linux/platform_data/mdio-gpio.h | 23 -----------------------
 3 files changed, 1 insertion(+), 25 deletions(-)
 delete mode 100644 include/linux/platform_data/mdio-gpio.h

diff --git a/MAINTAINERS b/MAINTAINERS
index b60179d948bb..a7321687cae9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5321,7 +5321,6 @@ F:	include/linux/*mdio*.h
 F:	include/linux/of_net.h
 F:	include/linux/phy.h
 F:	include/linux/phy_fixed.h
-F:	include/linux/platform_data/mdio-gpio.h
 F:	include/linux/platform_data/mdio-bcm-unimac.h
 F:	include/trace/events/mdio.h
 F:	include/uapi/linux/mdio.h
diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c
index 281c905ef9fd..b501221819e1 100644
--- a/drivers/net/phy/mdio-gpio.c
+++ b/drivers/net/phy/mdio-gpio.c
@@ -27,7 +27,7 @@
 #include <linux/mdio-bitbang.h>
 #include <linux/mdio-gpio.h>
 #include <linux/gpio.h>
-#include <linux/platform_data/mdio-gpio.h>
+#include <linux/gpio/consumer.h>
 
 #include <linux/of_gpio.h>
 #include <linux/of_mdio.h>
diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h
deleted file mode 100644
index bd91fa98a3aa..000000000000
--- a/include/linux/platform_data/mdio-gpio.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * MDIO-GPIO bus platform data structures
- *
- * Copyright (C) 2008, Paulius Zaleckas <paulius.zaleckas@teltonika.lt>
- *
- * This file is licensed under the terms of the GNU General Public License
- * version 2. This program is licensed "as is" without any warranty of any
- * kind, whether express or implied.
- */
-
-#ifndef __LINUX_MDIO_GPIO_H
-#define __LINUX_MDIO_GPIO_H
-
-#include <linux/mdio-bitbang.h>
-
-struct mdio_gpio_platform_data {
-	/* GPIO numbers for bus pins */
-	struct gpio_desc *mdc;
-	struct gpio_desc *mdio;
-	struct gpio_desc *mdo;
-};
-
-#endif /* __LINUX_MDIO_GPIO_H */
-- 
cgit v1.2.3-59-g8ed1b


From 31dd83b966410915a56358a8a3fc13c54d016fb3 Mon Sep 17 00:00:00 2001
From: Michael Schmitz <schmitzmic@gmail.com>
Date: Thu, 19 Apr 2018 14:05:18 +1200
Subject: net-next: phy: new Asix Electronics PHY driver

The Asix Electronics PHY found on the X-Surf 100 Amiga Zorro network
card by Individual Computers is buggy, and needs the reset bit toggled
as workaround to make a PHY soft reset succeed.

Add workaround driver just for this special case.

Suggested in xsurf100 patch series review by Andrew Lunn <andrew@lunn.ch>

Signed-off-by: Michael Schmitz <schmitzmic@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/Kconfig  |  6 +++++
 drivers/net/phy/Makefile |  1 +
 drivers/net/phy/asix.c   | 63 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 70 insertions(+)
 create mode 100644 drivers/net/phy/asix.c

diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index bdfbabb86ee0..edb8b9ab827f 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -218,6 +218,12 @@ config AQUANTIA_PHY
 	---help---
 	  Currently supports the Aquantia AQ1202, AQ2104, AQR105, AQR405
 
+config ASIX_PHY
+	tristate "Asix PHYs"
+	help
+	  Currently supports the Asix Electronics PHY found in the X-Surf 100
+	  AX88796B package.
+
 config AT803X_PHY
 	tristate "AT803X PHYs"
 	---help---
diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile
index 01acbcb2c798..701ca0b8717e 100644
--- a/drivers/net/phy/Makefile
+++ b/drivers/net/phy/Makefile
@@ -45,6 +45,7 @@ obj-y				+= $(sfp-obj-y) $(sfp-obj-m)
 
 obj-$(CONFIG_AMD_PHY)		+= amd.o
 obj-$(CONFIG_AQUANTIA_PHY)	+= aquantia.o
+obj-$(CONFIG_ASIX_PHY)		+= asix.o
 obj-$(CONFIG_AT803X_PHY)	+= at803x.o
 obj-$(CONFIG_BCM63XX_PHY)	+= bcm63xx.o
 obj-$(CONFIG_BCM7XXX_PHY)	+= bcm7xxx.o
diff --git a/drivers/net/phy/asix.c b/drivers/net/phy/asix.c
new file mode 100644
index 000000000000..8ebe7f5484ae
--- /dev/null
+++ b/drivers/net/phy/asix.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Driver for Asix PHYs
+ *
+ * Author: Michael Schmitz <schmitzmic@gmail.com>
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mii.h>
+#include <linux/phy.h>
+
+#define PHY_ID_ASIX_AX88796B		0x003b1841
+
+MODULE_DESCRIPTION("Asix PHY driver");
+MODULE_AUTHOR("Michael Schmitz <schmitzmic@gmail.com>");
+MODULE_LICENSE("GPL");
+
+/**
+ * asix_soft_reset - software reset the PHY via BMCR_RESET bit
+ * @phydev: target phy_device struct
+ *
+ * Description: Perform a software PHY reset using the standard
+ * BMCR_RESET bit and poll for the reset bit to be cleared.
+ * Toggle BMCR_RESET bit off to accommodate broken AX8796B PHY implementation
+ * such as used on the Individual Computers' X-Surf 100 Zorro card.
+ *
+ * Returns: 0 on success, < 0 on failure
+ */
+static int asix_soft_reset(struct phy_device *phydev)
+{
+	int ret;
+
+	/* Asix PHY won't reset unless reset bit toggles */
+	ret = phy_write(phydev, MII_BMCR, 0);
+	if (ret < 0)
+		return ret;
+
+	return genphy_soft_reset(phydev);
+}
+
+static struct phy_driver asix_driver[] = { {
+	.phy_id		= PHY_ID_ASIX_AX88796B,
+	.name		= "Asix Electronics AX88796B",
+	.phy_id_mask	= 0xfffffff0,
+	.features	= PHY_BASIC_FEATURES,
+	.soft_reset	= asix_soft_reset,
+} };
+
+module_phy_driver(asix_driver);
+
+static struct mdio_device_id __maybe_unused asix_tbl[] = {
+	{ PHY_ID_ASIX_AX88796B, 0xfffffff0 },
+	{ }
+};
+
+MODULE_DEVICE_TABLE(mdio, asix_tbl);
-- 
cgit v1.2.3-59-g8ed1b


From 7a0907bd399f6f2d40906c6d38ed1814d314ef74 Mon Sep 17 00:00:00 2001
From: Michael Karcher <kernel@mkarcher.dialup.fu-berlin.de>
Date: Thu, 19 Apr 2018 14:05:19 +1200
Subject: net-next: ax88796: Fix MAC address reading

To read the MAC address from the (virtual) SAprom, the remote DMA
unit needs to be set up like for every other process access to card-local
memory.

Signed-off-by: Michael Karcher <kernel@mkarcher.dialup.fu-berlin.de>
Signed-off-by: Michael Schmitz <schmitzmic@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/8390/ax88796.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/8390/ax88796.c b/drivers/net/ethernet/8390/ax88796.c
index da61cf3cb3a9..ae3937535838 100644
--- a/drivers/net/ethernet/8390/ax88796.c
+++ b/drivers/net/ethernet/8390/ax88796.c
@@ -669,10 +669,16 @@ static int ax_init_dev(struct net_device *dev)
 	if (ax->plat->flags & AXFLG_HAS_EEPROM) {
 		unsigned char SA_prom[32];
 
+		ei_outb(6, ioaddr + EN0_RCNTLO);
+		ei_outb(0, ioaddr + EN0_RCNTHI);
+		ei_outb(0, ioaddr + EN0_RSARLO);
+		ei_outb(0, ioaddr + EN0_RSARHI);
+		ei_outb(E8390_RREAD + E8390_START, ioaddr + NE_CMD);
 		for (i = 0; i < sizeof(SA_prom); i += 2) {
 			SA_prom[i] = ei_inb(ioaddr + NE_DATAPORT);
 			SA_prom[i + 1] = ei_inb(ioaddr + NE_DATAPORT);
 		}
+		ei_outb(ENISR_RDC, ioaddr + EN0_ISR);	/* Ack intr. */
 
 		if (ax->plat->wordlength == 2)
 			for (i = 0; i < 16; i++)
-- 
cgit v1.2.3-59-g8ed1b


From fd5f375c162807b4e39676c540e26400c091cf4f Mon Sep 17 00:00:00 2001
From: Michael Karcher <kernel@mkarcher.dialup.fu-berlin.de>
Date: Thu, 19 Apr 2018 14:05:20 +1200
Subject: net-next: ax88796: Attach MII bus only when open

Call ax_mii_init in ax_open(), and unregister/remove mdiobus resources
in ax_close().

This is needed to be able to unload the module, as the module is busy
while the MII bus is attached.

Signed-off-by: Michael Karcher <kernel@mkarcher.dialup.fu-berlin.de>
Signed-off-by: Michael Schmitz <schmitzmic@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/8390/ax88796.c | 183 +++++++++++++++++++-----------------
 1 file changed, 95 insertions(+), 88 deletions(-)

diff --git a/drivers/net/ethernet/8390/ax88796.c b/drivers/net/ethernet/8390/ax88796.c
index ae3937535838..ab020e601ec9 100644
--- a/drivers/net/ethernet/8390/ax88796.c
+++ b/drivers/net/ethernet/8390/ax88796.c
@@ -387,6 +387,90 @@ static void ax_phy_switch(struct net_device *dev, int on)
 	ei_outb(reg_gpoc, ei_local->mem + EI_SHIFT(0x17));
 }
 
+static void ax_bb_mdc(struct mdiobb_ctrl *ctrl, int level)
+{
+	struct ax_device *ax = container_of(ctrl, struct ax_device, bb_ctrl);
+
+	if (level)
+		ax->reg_memr |= AX_MEMR_MDC;
+	else
+		ax->reg_memr &= ~AX_MEMR_MDC;
+
+	ei_outb(ax->reg_memr, ax->addr_memr);
+}
+
+static void ax_bb_dir(struct mdiobb_ctrl *ctrl, int output)
+{
+	struct ax_device *ax = container_of(ctrl, struct ax_device, bb_ctrl);
+
+	if (output)
+		ax->reg_memr &= ~AX_MEMR_MDIR;
+	else
+		ax->reg_memr |= AX_MEMR_MDIR;
+
+	ei_outb(ax->reg_memr, ax->addr_memr);
+}
+
+static void ax_bb_set_data(struct mdiobb_ctrl *ctrl, int value)
+{
+	struct ax_device *ax = container_of(ctrl, struct ax_device, bb_ctrl);
+
+	if (value)
+		ax->reg_memr |= AX_MEMR_MDO;
+	else
+		ax->reg_memr &= ~AX_MEMR_MDO;
+
+	ei_outb(ax->reg_memr, ax->addr_memr);
+}
+
+static int ax_bb_get_data(struct mdiobb_ctrl *ctrl)
+{
+	struct ax_device *ax = container_of(ctrl, struct ax_device, bb_ctrl);
+	int reg_memr = ei_inb(ax->addr_memr);
+
+	return reg_memr & AX_MEMR_MDI ? 1 : 0;
+}
+
+static const struct mdiobb_ops bb_ops = {
+	.owner = THIS_MODULE,
+	.set_mdc = ax_bb_mdc,
+	.set_mdio_dir = ax_bb_dir,
+	.set_mdio_data = ax_bb_set_data,
+	.get_mdio_data = ax_bb_get_data,
+};
+
+static int ax_mii_init(struct net_device *dev)
+{
+	struct platform_device *pdev = to_platform_device(dev->dev.parent);
+	struct ei_device *ei_local = netdev_priv(dev);
+	struct ax_device *ax = to_ax_dev(dev);
+	int err;
+
+	ax->bb_ctrl.ops = &bb_ops;
+	ax->addr_memr = ei_local->mem + AX_MEMR;
+	ax->mii_bus = alloc_mdio_bitbang(&ax->bb_ctrl);
+	if (!ax->mii_bus) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	ax->mii_bus->name = "ax88796_mii_bus";
+	ax->mii_bus->parent = dev->dev.parent;
+	snprintf(ax->mii_bus->id, MII_BUS_ID_SIZE, "%s-%x",
+		 pdev->name, pdev->id);
+
+	err = mdiobus_register(ax->mii_bus);
+	if (err)
+		goto out_free_mdio_bitbang;
+
+	return 0;
+
+ out_free_mdio_bitbang:
+	free_mdio_bitbang(ax->mii_bus);
+ out:
+	return err;
+}
+
 static int ax_open(struct net_device *dev)
 {
 	struct ax_device *ax = to_ax_dev(dev);
@@ -394,6 +478,10 @@ static int ax_open(struct net_device *dev)
 
 	netdev_dbg(dev, "open\n");
 
+	ret = ax_mii_init(dev);
+	if (ret)
+		goto failed_mii;
+
 	ret = request_irq(dev->irq, ax_ei_interrupt, ax->irqflags,
 			  dev->name, dev);
 	if (ret)
@@ -421,6 +509,10 @@ static int ax_open(struct net_device *dev)
 	ax_phy_switch(dev, 0);
 	free_irq(dev->irq, dev);
  failed_request_irq:
+	/* unregister mdiobus */
+	mdiobus_unregister(ax->mii_bus);
+	free_mdio_bitbang(ax->mii_bus);
+ failed_mii:
 	return ret;
 }
 
@@ -440,6 +532,9 @@ static int ax_close(struct net_device *dev)
 	phy_disconnect(dev->phydev);
 
 	free_irq(dev->irq, dev);
+
+	mdiobus_unregister(ax->mii_bus);
+	free_mdio_bitbang(ax->mii_bus);
 	return 0;
 }
 
@@ -539,92 +634,8 @@ static const struct net_device_ops ax_netdev_ops = {
 #endif
 };
 
-static void ax_bb_mdc(struct mdiobb_ctrl *ctrl, int level)
-{
-	struct ax_device *ax = container_of(ctrl, struct ax_device, bb_ctrl);
-
-	if (level)
-		ax->reg_memr |= AX_MEMR_MDC;
-	else
-		ax->reg_memr &= ~AX_MEMR_MDC;
-
-	ei_outb(ax->reg_memr, ax->addr_memr);
-}
-
-static void ax_bb_dir(struct mdiobb_ctrl *ctrl, int output)
-{
-	struct ax_device *ax = container_of(ctrl, struct ax_device, bb_ctrl);
-
-	if (output)
-		ax->reg_memr &= ~AX_MEMR_MDIR;
-	else
-		ax->reg_memr |= AX_MEMR_MDIR;
-
-	ei_outb(ax->reg_memr, ax->addr_memr);
-}
-
-static void ax_bb_set_data(struct mdiobb_ctrl *ctrl, int value)
-{
-	struct ax_device *ax = container_of(ctrl, struct ax_device, bb_ctrl);
-
-	if (value)
-		ax->reg_memr |= AX_MEMR_MDO;
-	else
-		ax->reg_memr &= ~AX_MEMR_MDO;
-
-	ei_outb(ax->reg_memr, ax->addr_memr);
-}
-
-static int ax_bb_get_data(struct mdiobb_ctrl *ctrl)
-{
-	struct ax_device *ax = container_of(ctrl, struct ax_device, bb_ctrl);
-	int reg_memr = ei_inb(ax->addr_memr);
-
-	return reg_memr & AX_MEMR_MDI ? 1 : 0;
-}
-
-static const struct mdiobb_ops bb_ops = {
-	.owner = THIS_MODULE,
-	.set_mdc = ax_bb_mdc,
-	.set_mdio_dir = ax_bb_dir,
-	.set_mdio_data = ax_bb_set_data,
-	.get_mdio_data = ax_bb_get_data,
-};
-
 /* setup code */
 
-static int ax_mii_init(struct net_device *dev)
-{
-	struct platform_device *pdev = to_platform_device(dev->dev.parent);
-	struct ei_device *ei_local = netdev_priv(dev);
-	struct ax_device *ax = to_ax_dev(dev);
-	int err;
-
-	ax->bb_ctrl.ops = &bb_ops;
-	ax->addr_memr = ei_local->mem + AX_MEMR;
-	ax->mii_bus = alloc_mdio_bitbang(&ax->bb_ctrl);
-	if (!ax->mii_bus) {
-		err = -ENOMEM;
-		goto out;
-	}
-
-	ax->mii_bus->name = "ax88796_mii_bus";
-	ax->mii_bus->parent = dev->dev.parent;
-	snprintf(ax->mii_bus->id, MII_BUS_ID_SIZE, "%s-%x",
-		pdev->name, pdev->id);
-
-	err = mdiobus_register(ax->mii_bus);
-	if (err)
-		goto out_free_mdio_bitbang;
-
-	return 0;
-
- out_free_mdio_bitbang:
-	free_mdio_bitbang(ax->mii_bus);
- out:
-	return err;
-}
-
 static void ax_initial_setup(struct net_device *dev, struct ei_device *ei_local)
 {
 	void __iomem *ioaddr = ei_local->mem;
@@ -755,10 +766,6 @@ static int ax_init_dev(struct net_device *dev)
 	dev->netdev_ops = &ax_netdev_ops;
 	dev->ethtool_ops = &ax_ethtool_ops;
 
-	ret = ax_mii_init(dev);
-	if (ret)
-		goto err_out;
-
 	ax_NS8390_init(dev, 0);
 
 	ret = register_netdev(dev);
-- 
cgit v1.2.3-59-g8ed1b


From 9144c3795c2636351d553e4d0fc5297201182de2 Mon Sep 17 00:00:00 2001
From: Michael Karcher <kernel@mkarcher.dialup.fu-berlin.de>
Date: Thu, 19 Apr 2018 14:05:21 +1200
Subject: net-next: ax88796: Do not free IRQ in ax_remove() (already freed in
 ax_close()).

This complements the fix in 82533ad9a1c ("net: ethernet: ax88796:
don't call free_irq without request_irq first") that removed the
free_irq call in the error path of probe, to also not call free_irq
when remove is called to revert the effects of probe.

Fixes: 82533ad9a1c (net: ethernet: ax88796: don't call free_irq without request_irq first)
Signed-off-by: Michael Karcher <kernel@mkarcher.dialup.fu-berlin.de>
Signed-off-by: Michael Schmitz <schmitzmic@gmail.com>
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/8390/ax88796.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/ethernet/8390/ax88796.c b/drivers/net/ethernet/8390/ax88796.c
index ab020e601ec9..d3f30f1c40b2 100644
--- a/drivers/net/ethernet/8390/ax88796.c
+++ b/drivers/net/ethernet/8390/ax88796.c
@@ -790,7 +790,6 @@ static int ax_remove(struct platform_device *pdev)
 	struct resource *mem;
 
 	unregister_netdev(dev);
-	free_irq(dev->irq, dev);
 
 	iounmap(ei_local->mem);
 	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-- 
cgit v1.2.3-59-g8ed1b


From 27cced20192d25ae528db8fe694c95c7656f3d56 Mon Sep 17 00:00:00 2001
From: Michael Karcher <kernel@mkarcher.dialup.fu-berlin.de>
Date: Thu, 19 Apr 2018 14:05:22 +1200
Subject: net-next: ax88796: Add block_input/output hooks to ax_plat_data

Add platform specific hooks for block transfer reads/writes of packet
buffer data, superseding the default provided ax_block_input/output.
Currently used for m68k Amiga XSurf100.

Signed-off-by: Michael Karcher <kernel@mkarcher.dialup.fu-berlin.de>
Signed-off-by: Michael Schmitz <schmitzmic@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/8390/ax88796.c | 10 ++++++++--
 include/net/ax88796.h               |  9 +++++++++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/8390/ax88796.c b/drivers/net/ethernet/8390/ax88796.c
index d3f30f1c40b2..939a572a19ff 100644
--- a/drivers/net/ethernet/8390/ax88796.c
+++ b/drivers/net/ethernet/8390/ax88796.c
@@ -758,8 +758,14 @@ static int ax_init_dev(struct net_device *dev)
 #endif
 
 	ei_local->reset_8390 = &ax_reset_8390;
-	ei_local->block_input = &ax_block_input;
-	ei_local->block_output = &ax_block_output;
+	if (ax->plat->block_input)
+		ei_local->block_input = ax->plat->block_input;
+	else
+		ei_local->block_input = &ax_block_input;
+	if (ax->plat->block_output)
+		ei_local->block_output = ax->plat->block_output;
+	else
+		ei_local->block_output = &ax_block_output;
 	ei_local->get_8390_hdr = &ax_get_8390_hdr;
 	ei_local->priv = 0;
 
diff --git a/include/net/ax88796.h b/include/net/ax88796.h
index b9a3beca0ce4..363b0ca5f7e8 100644
--- a/include/net/ax88796.h
+++ b/include/net/ax88796.h
@@ -12,6 +12,9 @@
 #ifndef __NET_AX88796_PLAT_H
 #define __NET_AX88796_PLAT_H
 
+struct sk_buff;
+struct net_device;
+
 #define AXFLG_HAS_EEPROM		(1<<0)
 #define AXFLG_MAC_FROMDEV		(1<<1)	/* device already has MAC */
 #define AXFLG_HAS_93CX6			(1<<2)	/* use eeprom_93cx6 driver */
@@ -26,6 +29,12 @@ struct ax_plat_data {
 	u32		*reg_offsets;	/* register offsets */
 	u8		*mac_addr;	/* MAC addr (only used when
 					   AXFLG_MAC_FROMPLATFORM is used */
+
+	/* uses default ax88796 buffer if set to NULL */
+	void (*block_output)(struct net_device *dev, int count,
+			const unsigned char *buf, int star_page);
+	void (*block_input)(struct net_device *dev, int count,
+			struct sk_buff *skb, int ring_offset);
 };
 
 #endif /* __NET_AX88796_PLAT_H */
-- 
cgit v1.2.3-59-g8ed1b


From cec4c1c54a643608c262bd9bb72cf9bbec64f44a Mon Sep 17 00:00:00 2001
From: Michael Karcher <kernel@mkarcher.dialup.fu-berlin.de>
Date: Thu, 19 Apr 2018 14:05:23 +1200
Subject: net-next: ax88796: add interrupt status callback to platform data

To be able to tell the ax88796 driver whether it is sensible to enter
the 8390 interrupt handler, an "is this interrupt caused by the 88796"
callback has been added to the ax_plat_data structure (with NULL being
compatible to the previous behaviour).

Signed-off-by: Michael Karcher <kernel@mkarcher.dialup.fu-berlin.de>
Signed-off-by: Michael Schmitz <schmitzmic@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/8390/ax88796.c | 23 +++++++++++++++++++++--
 include/net/ax88796.h               |  5 +++++
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/8390/ax88796.c b/drivers/net/ethernet/8390/ax88796.c
index 939a572a19ff..d283ed092519 100644
--- a/drivers/net/ethernet/8390/ax88796.c
+++ b/drivers/net/ethernet/8390/ax88796.c
@@ -163,6 +163,21 @@ static void ax_reset_8390(struct net_device *dev)
 	ei_outb(ENISR_RESET, addr + EN0_ISR);	/* Ack intr. */
 }
 
+/* Wrapper for __ei_interrupt for platforms that have a platform-specific
+ * way to find out whether the interrupt request might be caused by
+ * the ax88796 chip.
+ */
+static irqreturn_t ax_ei_interrupt_filtered(int irq, void *dev_id)
+{
+	struct net_device *dev = dev_id;
+	struct ax_device *ax = to_ax_dev(dev);
+	struct platform_device *pdev = to_platform_device(dev->dev.parent);
+
+	if (!ax->plat->check_irq(pdev))
+		return IRQ_NONE;
+
+	return ax_ei_interrupt(irq, dev_id);
+}
 
 static void ax_get_8390_hdr(struct net_device *dev, struct e8390_pkt_hdr *hdr,
 			    int ring_page)
@@ -482,8 +497,12 @@ static int ax_open(struct net_device *dev)
 	if (ret)
 		goto failed_mii;
 
-	ret = request_irq(dev->irq, ax_ei_interrupt, ax->irqflags,
-			  dev->name, dev);
+	if (ax->plat->check_irq)
+		ret = request_irq(dev->irq, ax_ei_interrupt_filtered,
+				  ax->irqflags, dev->name, dev);
+	else
+		ret = request_irq(dev->irq, ax_ei_interrupt, ax->irqflags,
+				  dev->name, dev);
 	if (ret)
 		goto failed_request_irq;
 
diff --git a/include/net/ax88796.h b/include/net/ax88796.h
index 363b0ca5f7e8..84b3785d0e66 100644
--- a/include/net/ax88796.h
+++ b/include/net/ax88796.h
@@ -14,6 +14,7 @@
 
 struct sk_buff;
 struct net_device;
+struct platform_device;
 
 #define AXFLG_HAS_EEPROM		(1<<0)
 #define AXFLG_MAC_FROMDEV		(1<<1)	/* device already has MAC */
@@ -35,6 +36,10 @@ struct ax_plat_data {
 			const unsigned char *buf, int star_page);
 	void (*block_input)(struct net_device *dev, int count,
 			struct sk_buff *skb, int ring_offset);
+	/* returns nonzero if a pending interrupt request might by caused by
+	 * the ax88786. Handles all interrupts if set to NULL
+	 */
+	int (*check_irq)(struct platform_device *pdev);
 };
 
 #endif /* __NET_AX88796_PLAT_H */
-- 
cgit v1.2.3-59-g8ed1b


From caaf45a6ba064103fabc2cad46bb5caeec41859f Mon Sep 17 00:00:00 2001
From: Michael Karcher <kernel@mkarcher.dialup.fu-berlin.de>
Date: Thu, 19 Apr 2018 14:05:24 +1200
Subject: net-next: ax88796: set IRQF_SHARED flag when IRQ resource is marked
 as shareable

On the Amiga X-Surf100, the network card interrupt is shared with many
other interrupt sources, so requires the IRQF_SHARED flag to register.

Signed-off-by: Michael Karcher <kernel@mkarcher.dialup.fu-berlin.de>
Signed-off-by: Michael Schmitz <schmitzmic@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/8390/ax88796.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/8390/ax88796.c b/drivers/net/ethernet/8390/ax88796.c
index d283ed092519..229279f325a2 100644
--- a/drivers/net/ethernet/8390/ax88796.c
+++ b/drivers/net/ethernet/8390/ax88796.c
@@ -872,6 +872,9 @@ static int ax_probe(struct platform_device *pdev)
 	dev->irq = irq->start;
 	ax->irqflags = irq->flags & IRQF_TRIGGER_MASK;
 
+	if (irq->flags &  IORESOURCE_IRQ_SHAREABLE)
+		ax->irqflags |= IRQF_SHARED;
+
 	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (!mem) {
 		dev_err(&pdev->dev, "no MEM specified\n");
-- 
cgit v1.2.3-59-g8ed1b


From 453da988785fab2658b88ab57b992cacc9dd3b7d Mon Sep 17 00:00:00 2001
From: Michael Schmitz <schmitzmic@gmail.com>
Date: Thu, 19 Apr 2018 14:05:25 +1200
Subject: net-next: ax88796: release platform device drvdata on probe error and
 module remove

The net device struct pointer is stored as platform device drvdata on
module probe - clear the drvdata entry on probe fail there, as well as
when unloading the module.

Signed-off-by: Michael Schmitz <schmitzmic@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/8390/ax88796.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/8390/ax88796.c b/drivers/net/ethernet/8390/ax88796.c
index 229279f325a2..2a0ddec1dd56 100644
--- a/drivers/net/ethernet/8390/ax88796.c
+++ b/drivers/net/ethernet/8390/ax88796.c
@@ -826,6 +826,7 @@ static int ax_remove(struct platform_device *pdev)
 		release_mem_region(mem->start, resource_size(mem));
 	}
 
+	platform_set_drvdata(pdev, NULL);
 	free_netdev(dev);
 
 	return 0;
@@ -959,6 +960,7 @@ static int ax_probe(struct platform_device *pdev)
 	release_mem_region(mem->start, mem_size);
 
  exit_mem:
+	platform_set_drvdata(pdev, NULL);
 	free_netdev(dev);
 
 	return ret;
-- 
cgit v1.2.3-59-g8ed1b


From 861928f4e60e826cd8871c0c37f4b3d825b8d81d Mon Sep 17 00:00:00 2001
From: Michael Karcher <kernel@mkarcher.dialup.fu-berlin.de>
Date: Thu, 19 Apr 2018 14:05:26 +1200
Subject: net-next: New ax88796 platform driver for Amiga X-Surf 100 Zorro
 board (m68k)

Add platform device driver to populate the ax88796 platform data from
information provided by the XSurf100 zorro device driver. The ax88796
module will be loaded through this module's probe function.

Signed-off-by: Michael Karcher <kernel@mkarcher.dialup.fu-berlin.de>
Signed-off-by: Michael Schmitz <schmitzmic@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/8390/Kconfig    |  17 +-
 drivers/net/ethernet/8390/Makefile   |   1 +
 drivers/net/ethernet/8390/xsurf100.c | 382 +++++++++++++++++++++++++++++++++++
 3 files changed, 398 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/8390/xsurf100.c

diff --git a/drivers/net/ethernet/8390/Kconfig b/drivers/net/ethernet/8390/Kconfig
index 9fee7c83ef9f..f2f0264c58ba 100644
--- a/drivers/net/ethernet/8390/Kconfig
+++ b/drivers/net/ethernet/8390/Kconfig
@@ -29,8 +29,8 @@ config PCMCIA_AXNET
 	  called axnet_cs.  If unsure, say N.
 
 config AX88796
-	tristate "ASIX AX88796 NE2000 clone support"
-	depends on (ARM || MIPS || SUPERH)
+	tristate "ASIX AX88796 NE2000 clone support" if !ZORRO
+	depends on (ARM || MIPS || SUPERH || ZORRO || COMPILE_TEST)
 	select CRC32
 	select PHYLIB
 	select MDIO_BITBANG
@@ -45,6 +45,19 @@ config AX88796_93CX6
 	---help---
 	  Select this if your platform comes with an external 93CX6 eeprom.
 
+config XSURF100
+	tristate "Amiga XSurf 100 AX88796/NE2000 clone support"
+	depends on ZORRO
+	select AX88796
+	select ASIX_PHY
+	help
+	  This driver is for the Individual Computers X-Surf 100 Ethernet
+	  card (based on the Asix AX88796 chip). If you have such a card,
+	  say Y. Otherwise, say N.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called xsurf100.
+
 config HYDRA
 	tristate "Hydra support"
 	depends on ZORRO
diff --git a/drivers/net/ethernet/8390/Makefile b/drivers/net/ethernet/8390/Makefile
index 1d650e66cc6e..85c83c566ec6 100644
--- a/drivers/net/ethernet/8390/Makefile
+++ b/drivers/net/ethernet/8390/Makefile
@@ -16,4 +16,5 @@ obj-$(CONFIG_PCMCIA_PCNET) += pcnet_cs.o 8390.o
 obj-$(CONFIG_STNIC) += stnic.o 8390.o
 obj-$(CONFIG_ULTRA) += smc-ultra.o 8390.o
 obj-$(CONFIG_WD80x3) += wd.o 8390.o
+obj-$(CONFIG_XSURF100) += xsurf100.o
 obj-$(CONFIG_ZORRO8390) += zorro8390.o
diff --git a/drivers/net/ethernet/8390/xsurf100.c b/drivers/net/ethernet/8390/xsurf100.c
new file mode 100644
index 000000000000..e2c963821ffe
--- /dev/null
+++ b/drivers/net/ethernet/8390/xsurf100.c
@@ -0,0 +1,382 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/platform_device.h>
+#include <linux/zorro.h>
+#include <net/ax88796.h>
+#include <asm/amigaints.h>
+
+#define ZORRO_PROD_INDIVIDUAL_COMPUTERS_X_SURF100 \
+		ZORRO_ID(INDIVIDUAL_COMPUTERS, 0x64, 0)
+
+#define XS100_IRQSTATUS_BASE 0x40
+#define XS100_8390_BASE 0x800
+
+/* Longword-access area. Translated to 2 16-bit access cycles by the
+ * X-Surf 100 FPGA
+ */
+#define XS100_8390_DATA32_BASE 0x8000
+#define XS100_8390_DATA32_SIZE 0x2000
+/* Sub-Areas for fast data register access; addresses relative to area begin */
+#define XS100_8390_DATA_READ32_BASE 0x0880
+#define XS100_8390_DATA_WRITE32_BASE 0x0C80
+#define XS100_8390_DATA_AREA_SIZE 0x80
+
+#define __NS8390_init ax_NS8390_init
+
+/* force unsigned long back to 'void __iomem *' */
+#define ax_convert_addr(_a) ((void __force __iomem *)(_a))
+
+#define ei_inb(_a) z_readb(ax_convert_addr(_a))
+#define ei_outb(_v, _a) z_writeb(_v, ax_convert_addr(_a))
+
+#define ei_inw(_a) z_readw(ax_convert_addr(_a))
+#define ei_outw(_v, _a) z_writew(_v, ax_convert_addr(_a))
+
+#define ei_inb_p(_a) ei_inb(_a)
+#define ei_outb_p(_v, _a) ei_outb(_v, _a)
+
+/* define EI_SHIFT() to take into account our register offsets */
+#define EI_SHIFT(x) (ei_local->reg_offset[(x)])
+
+/* Ensure we have our RCR base value */
+#define AX88796_PLATFORM
+
+static unsigned char version[] =
+		"ax88796.c: Copyright 2005,2007 Simtec Electronics\n";
+
+#include "lib8390.c"
+
+/* from ne.c */
+#define NE_CMD		EI_SHIFT(0x00)
+#define NE_RESET	EI_SHIFT(0x1f)
+#define NE_DATAPORT	EI_SHIFT(0x10)
+
+struct xsurf100_ax_plat_data {
+	struct ax_plat_data ax;
+	void __iomem *base_regs;
+	void __iomem *data_area;
+};
+
+static int is_xsurf100_network_irq(struct platform_device *pdev)
+{
+	struct xsurf100_ax_plat_data *xs100 = dev_get_platdata(&pdev->dev);
+
+	return (readw(xs100->base_regs + XS100_IRQSTATUS_BASE) & 0xaaaa) != 0;
+}
+
+/* These functions guarantee that the iomem is accessed with 32 bit
+ * cycles only. z_memcpy_fromio / z_memcpy_toio don't
+ */
+static void z_memcpy_fromio32(void *dst, const void __iomem *src, size_t bytes)
+{
+	while (bytes > 32) {
+		asm __volatile__
+		   ("movem.l (%0)+,%%d0-%%d7\n"
+		    "movem.l %%d0-%%d7,(%1)\n"
+		    "adda.l #32,%1" : "=a"(src), "=a"(dst)
+		    : "0"(src), "1"(dst) : "d0", "d1", "d2", "d3", "d4",
+					   "d5", "d6", "d7", "memory");
+		bytes -= 32;
+	}
+	while (bytes) {
+		*(uint32_t *)dst = z_readl(src);
+		src += 4;
+		dst += 4;
+		bytes -= 4;
+	}
+}
+
+static void z_memcpy_toio32(void __iomem *dst, const void *src, size_t bytes)
+{
+	while (bytes) {
+		z_writel(*(const uint32_t *)src, dst);
+		src += 4;
+		dst += 4;
+		bytes -= 4;
+	}
+}
+
+static void xs100_write(struct net_device *dev, const void *src,
+			unsigned int count)
+{
+	struct ei_device *ei_local = netdev_priv(dev);
+	struct platform_device *pdev = to_platform_device(dev->dev.parent);
+	struct xsurf100_ax_plat_data *xs100 = dev_get_platdata(&pdev->dev);
+
+	/* copy whole blocks */
+	while (count > XS100_8390_DATA_AREA_SIZE) {
+		z_memcpy_toio32(xs100->data_area +
+				XS100_8390_DATA_WRITE32_BASE, src,
+				XS100_8390_DATA_AREA_SIZE);
+		src += XS100_8390_DATA_AREA_SIZE;
+		count -= XS100_8390_DATA_AREA_SIZE;
+	}
+	/* copy whole dwords */
+	z_memcpy_toio32(xs100->data_area + XS100_8390_DATA_WRITE32_BASE,
+			src, count & ~3);
+	src += count & ~3;
+	if (count & 2) {
+		ei_outw(*(uint16_t *)src, ei_local->mem + NE_DATAPORT);
+		src += 2;
+	}
+	if (count & 1)
+		ei_outb(*(uint8_t *)src, ei_local->mem + NE_DATAPORT);
+}
+
+static void xs100_read(struct net_device *dev, void *dst, unsigned int count)
+{
+	struct ei_device *ei_local = netdev_priv(dev);
+	struct platform_device *pdev = to_platform_device(dev->dev.parent);
+	struct xsurf100_ax_plat_data *xs100 = dev_get_platdata(&pdev->dev);
+
+	/* copy whole blocks */
+	while (count > XS100_8390_DATA_AREA_SIZE) {
+		z_memcpy_fromio32(dst, xs100->data_area +
+				  XS100_8390_DATA_READ32_BASE,
+				  XS100_8390_DATA_AREA_SIZE);
+		dst += XS100_8390_DATA_AREA_SIZE;
+		count -= XS100_8390_DATA_AREA_SIZE;
+	}
+	/* copy whole dwords */
+	z_memcpy_fromio32(dst, xs100->data_area + XS100_8390_DATA_READ32_BASE,
+			  count & ~3);
+	dst += count & ~3;
+	if (count & 2) {
+		*(uint16_t *)dst = ei_inw(ei_local->mem + NE_DATAPORT);
+		dst += 2;
+	}
+	if (count & 1)
+		*(uint8_t *)dst = ei_inb(ei_local->mem + NE_DATAPORT);
+}
+
+/* Block input and output, similar to the Crynwr packet driver. If
+ * you are porting to a new ethercard, look at the packet driver
+ * source for hints. The NEx000 doesn't share the on-board packet
+ * memory -- you have to put the packet out through the "remote DMA"
+ * dataport using ei_outb.
+ */
+static void xs100_block_input(struct net_device *dev, int count,
+			      struct sk_buff *skb, int ring_offset)
+{
+	struct ei_device *ei_local = netdev_priv(dev);
+	void __iomem *nic_base = ei_local->mem;
+	char *buf = skb->data;
+
+	if (ei_local->dmaing) {
+		netdev_err(dev,
+			   "DMAing conflict in %s [DMAstat:%d][irqlock:%d]\n",
+			   __func__,
+			   ei_local->dmaing, ei_local->irqlock);
+		return;
+	}
+
+	ei_local->dmaing |= 0x01;
+
+	ei_outb(E8390_NODMA + E8390_PAGE0 + E8390_START, nic_base + NE_CMD);
+	ei_outb(count & 0xff, nic_base + EN0_RCNTLO);
+	ei_outb(count >> 8, nic_base + EN0_RCNTHI);
+	ei_outb(ring_offset & 0xff, nic_base + EN0_RSARLO);
+	ei_outb(ring_offset >> 8, nic_base + EN0_RSARHI);
+	ei_outb(E8390_RREAD + E8390_START, nic_base + NE_CMD);
+
+	xs100_read(dev, buf, count);
+
+	ei_local->dmaing &= ~1;
+}
+
+static void xs100_block_output(struct net_device *dev, int count,
+			       const unsigned char *buf, const int start_page)
+{
+	struct ei_device *ei_local = netdev_priv(dev);
+	void __iomem *nic_base = ei_local->mem;
+	unsigned long dma_start;
+
+	/* Round the count up for word writes. Do we need to do this?
+	 * What effect will an odd byte count have on the 8390?  I
+	 * should check someday.
+	 */
+	if (ei_local->word16 && (count & 0x01))
+		count++;
+
+	/* This *shouldn't* happen. If it does, it's the last thing
+	 * you'll see
+	 */
+	if (ei_local->dmaing) {
+		netdev_err(dev,
+			   "DMAing conflict in %s [DMAstat:%d][irqlock:%d]\n",
+			   __func__,
+			   ei_local->dmaing, ei_local->irqlock);
+		return;
+	}
+
+	ei_local->dmaing |= 0x01;
+	/* We should already be in page 0, but to be safe... */
+	ei_outb(E8390_PAGE0 + E8390_START + E8390_NODMA, nic_base + NE_CMD);
+
+	ei_outb(ENISR_RDC, nic_base + EN0_ISR);
+
+	/* Now the normal output. */
+	ei_outb(count & 0xff, nic_base + EN0_RCNTLO);
+	ei_outb(count >> 8, nic_base + EN0_RCNTHI);
+	ei_outb(0x00, nic_base + EN0_RSARLO);
+	ei_outb(start_page, nic_base + EN0_RSARHI);
+
+	ei_outb(E8390_RWRITE + E8390_START, nic_base + NE_CMD);
+
+	xs100_write(dev, buf, count);
+
+	dma_start = jiffies;
+
+	while ((ei_inb(nic_base + EN0_ISR) & ENISR_RDC) == 0) {
+		if (jiffies - dma_start > 2 * HZ / 100) {	/* 20ms */
+			netdev_warn(dev, "timeout waiting for Tx RDC.\n");
+			ei_local->reset_8390(dev);
+			ax_NS8390_init(dev, 1);
+			break;
+		}
+	}
+
+	ei_outb(ENISR_RDC, nic_base + EN0_ISR);	/* Ack intr. */
+	ei_local->dmaing &= ~0x01;
+}
+
+static int xsurf100_probe(struct zorro_dev *zdev,
+			  const struct zorro_device_id *ent)
+{
+	struct platform_device *pdev;
+	struct xsurf100_ax_plat_data ax88796_data;
+	struct resource res[2] = {
+		DEFINE_RES_NAMED(IRQ_AMIGA_PORTS, 1, NULL,
+				 IORESOURCE_IRQ | IORESOURCE_IRQ_SHAREABLE),
+		DEFINE_RES_MEM(zdev->resource.start + XS100_8390_BASE,
+			       4 * 0x20)
+	};
+	int reg;
+	/* This table is referenced in the device structure, so it must
+	 * outlive the scope of xsurf100_probe.
+	 */
+	static u32 reg_offsets[32];
+	int ret = 0;
+
+	/* X-Surf 100 control and 32 bit ring buffer data access areas.
+	 * These resources are not used by the ax88796 driver, so must
+	 * be requested here and passed via platform data.
+	 */
+
+	if (!request_mem_region(zdev->resource.start, 0x100, zdev->name)) {
+		dev_err(&zdev->dev, "cannot reserve X-Surf 100 control registers\n");
+		return -ENXIO;
+	}
+
+	if (!request_mem_region(zdev->resource.start +
+				XS100_8390_DATA32_BASE,
+				XS100_8390_DATA32_SIZE,
+				"X-Surf 100 32-bit data access")) {
+		dev_err(&zdev->dev, "cannot reserve 32-bit area\n");
+		ret = -ENXIO;
+		goto exit_req;
+	}
+
+	for (reg = 0; reg < 0x20; reg++)
+		reg_offsets[reg] = 4 * reg;
+
+	memset(&ax88796_data, 0, sizeof(ax88796_data));
+	ax88796_data.ax.flags = AXFLG_HAS_EEPROM;
+	ax88796_data.ax.wordlength = 2;
+	ax88796_data.ax.dcr_val = 0x48;
+	ax88796_data.ax.rcr_val = 0x40;
+	ax88796_data.ax.reg_offsets = reg_offsets;
+	ax88796_data.ax.check_irq = is_xsurf100_network_irq;
+	ax88796_data.base_regs = ioremap(zdev->resource.start, 0x100);
+
+	/* error handling for ioremap regs */
+	if (!ax88796_data.base_regs) {
+		dev_err(&zdev->dev, "Cannot ioremap area %pR (registers)\n",
+			&zdev->resource);
+
+		ret = -ENXIO;
+		goto exit_req2;
+	}
+
+	ax88796_data.data_area = ioremap(zdev->resource.start +
+			XS100_8390_DATA32_BASE, XS100_8390_DATA32_SIZE);
+
+	/* error handling for ioremap data */
+	if (!ax88796_data.data_area) {
+		dev_err(&zdev->dev,
+			"Cannot ioremap area %pR offset %x (32-bit access)\n",
+			&zdev->resource,  XS100_8390_DATA32_BASE);
+
+		ret = -ENXIO;
+		goto exit_mem;
+	}
+
+	ax88796_data.ax.block_output = xs100_block_output;
+	ax88796_data.ax.block_input = xs100_block_input;
+
+	pdev = platform_device_register_resndata(&zdev->dev, "ax88796",
+						 zdev->slotaddr, res, 2,
+						 &ax88796_data,
+						 sizeof(ax88796_data));
+
+	if (IS_ERR(pdev)) {
+		dev_err(&zdev->dev, "cannot register platform device\n");
+		ret = -ENXIO;
+		goto exit_mem2;
+	}
+
+	zorro_set_drvdata(zdev, pdev);
+
+	if (!ret)
+		return 0;
+
+ exit_mem2:
+	iounmap(ax88796_data.data_area);
+
+ exit_mem:
+	iounmap(ax88796_data.base_regs);
+
+ exit_req2:
+	release_mem_region(zdev->resource.start + XS100_8390_DATA32_BASE,
+			   XS100_8390_DATA32_SIZE);
+
+ exit_req:
+	release_mem_region(zdev->resource.start, 0x100);
+
+	return ret;
+}
+
+static void xsurf100_remove(struct zorro_dev *zdev)
+{
+	struct platform_device *pdev = zorro_get_drvdata(zdev);
+	struct xsurf100_ax_plat_data *xs100 = dev_get_platdata(&pdev->dev);
+
+	platform_device_unregister(pdev);
+
+	iounmap(xs100->base_regs);
+	release_mem_region(zdev->resource.start, 0x100);
+	iounmap(xs100->data_area);
+	release_mem_region(zdev->resource.start + XS100_8390_DATA32_BASE,
+			   XS100_8390_DATA32_SIZE);
+}
+
+static const struct zorro_device_id xsurf100_zorro_tbl[] = {
+	{ ZORRO_PROD_INDIVIDUAL_COMPUTERS_X_SURF100, },
+	{ 0 }
+};
+
+MODULE_DEVICE_TABLE(zorro, xsurf100_zorro_tbl);
+
+static struct zorro_driver xsurf100_driver = {
+	.name           = "xsurf100",
+	.id_table       = xsurf100_zorro_tbl,
+	.probe          = xsurf100_probe,
+	.remove         = xsurf100_remove,
+};
+
+module_driver(xsurf100_driver, zorro_register_driver, zorro_unregister_driver);
+
+MODULE_DESCRIPTION("X-Surf 100 driver");
+MODULE_AUTHOR("Michael Karcher <kernel@mkarcher.dialup.fu-berlin.de>");
+MODULE_LICENSE("GPL v2");
-- 
cgit v1.2.3-59-g8ed1b


From 7d9d0d562b54d2953304693f76bc2bbfbe318c27 Mon Sep 17 00:00:00 2001
From: Luca Coelho <luciano.coelho@intel.com>
Date: Thu, 12 Apr 2018 16:15:07 +0300
Subject: iwlwifi: mvm: add traffic condition monitoring (TCM)

Traffic condition monitor gathers data about the traffic load and
other conditions and can be used to make decisions regarding latency,
throughput etc.  This patch introduces the code and data structures to
collect this data for future use.

Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/constants.h |   3 +
 drivers/net/wireless/intel/iwlwifi/mvm/d3.c        |   7 +
 .../net/wireless/intel/iwlwifi/mvm/debugfs-vif.c   |   2 +
 drivers/net/wireless/intel/iwlwifi/mvm/mvm.h       |  59 ++++++
 drivers/net/wireless/intel/iwlwifi/mvm/ops.c       |  10 +
 drivers/net/wireless/intel/iwlwifi/mvm/rx.c        |  60 +++++-
 drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c      |   6 +
 drivers/net/wireless/intel/iwlwifi/mvm/scan.c      |  11 +-
 drivers/net/wireless/intel/iwlwifi/mvm/tx.c        |  35 ++++
 drivers/net/wireless/intel/iwlwifi/mvm/utils.c     | 231 +++++++++++++++++++++
 10 files changed, 415 insertions(+), 9 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/constants.h b/drivers/net/wireless/intel/iwlwifi/mvm/constants.h
index 96b52a275ee3..2763fc69f04b 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/constants.h
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/constants.h
@@ -112,6 +112,9 @@
 #define IWL_MVM_PARSE_NVM			0
 #define IWL_MVM_ADWELL_ENABLE			1
 #define IWL_MVM_ADWELL_MAX_BUDGET		0
+#define IWL_MVM_TCM_LOAD_MEDIUM_THRESH		10 /* percentage */
+#define IWL_MVM_TCM_LOAD_HIGH_THRESH		50 /* percentage */
+#define IWL_MVM_TCM_LOWLAT_ENABLE_THRESH	100 /* packets/10 seconds */
 #define IWL_MVM_RS_NUM_TRY_BEFORE_ANT_TOGGLE    1
 #define IWL_MVM_RS_HT_VHT_RETRIES_PER_RATE      2
 #define IWL_MVM_RS_HT_VHT_RETRIES_PER_RATE_TW   1
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c
index 2efe9b099556..80a9a7cb83be 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c
@@ -1097,6 +1097,7 @@ int iwl_mvm_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan)
 
 	/* make sure the d0i3 exit work is not pending */
 	flush_work(&mvm->d0i3_exit_work);
+	iwl_mvm_pause_tcm(mvm, true);
 
 	iwl_fw_runtime_suspend(&mvm->fwrt);
 
@@ -2014,6 +2015,8 @@ int iwl_mvm_resume(struct ieee80211_hw *hw)
 
 	mvm->trans->system_pm_mode = IWL_PLAT_PM_MODE_DISABLED;
 
+	iwl_mvm_resume_tcm(mvm);
+
 	iwl_fw_runtime_resume(&mvm->fwrt);
 
 	return ret;
@@ -2042,6 +2045,8 @@ static int iwl_mvm_d3_test_open(struct inode *inode, struct file *file)
 
 	mvm->trans->system_pm_mode = IWL_PLAT_PM_MODE_D3;
 
+	iwl_mvm_pause_tcm(mvm, true);
+
 	iwl_fw_runtime_suspend(&mvm->fwrt);
 
 	/* start pseudo D3 */
@@ -2104,6 +2109,8 @@ static int iwl_mvm_d3_test_release(struct inode *inode, struct file *file)
 	__iwl_mvm_resume(mvm, true);
 	rtnl_unlock();
 
+	iwl_mvm_resume_tcm(mvm);
+
 	iwl_fw_runtime_resume(&mvm->fwrt);
 
 	mvm->trans->system_pm_mode = IWL_PLAT_PM_MODE_DISABLED;
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/debugfs-vif.c b/drivers/net/wireless/intel/iwlwifi/mvm/debugfs-vif.c
index f7fcf700196b..798605c4f122 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/debugfs-vif.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/debugfs-vif.c
@@ -269,6 +269,8 @@ static ssize_t iwl_dbgfs_mac_params_read(struct file *file,
 			 mvmvif->id, mvmvif->color);
 	pos += scnprintf(buf+pos, bufsz-pos, "bssid: %pM\n",
 			 vif->bss_conf.bssid);
+	pos += scnprintf(buf+pos, bufsz-pos, "Load: %d\n",
+			 mvm->tcm.result.load[mvmvif->id]);
 	pos += scnprintf(buf+pos, bufsz-pos, "QoS:\n");
 	for (i = 0; i < ARRAY_SIZE(mvmvif->queue_params); i++)
 		pos += scnprintf(buf+pos, bufsz-pos,
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
index d2cf751db68d..72bab44082ea 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
@@ -92,6 +92,8 @@
 #include "fw/acpi.h"
 #include "fw/debugfs.h"
 
+#include <linux/average.h>
+
 #define IWL_MVM_MAX_ADDRESSES		5
 /* RSSI offset for WkP */
 #define IWL_RSSI_OFFSET 50
@@ -595,6 +597,51 @@ enum iwl_mvm_tdls_cs_state {
 	IWL_MVM_TDLS_SW_ACTIVE,
 };
 
+enum iwl_mvm_traffic_load {
+	IWL_MVM_TRAFFIC_LOW,
+	IWL_MVM_TRAFFIC_MEDIUM,
+	IWL_MVM_TRAFFIC_HIGH,
+};
+
+DECLARE_EWMA(rate, 16, 16)
+
+struct iwl_mvm_tcm_mac {
+	struct {
+		u32 pkts[IEEE80211_NUM_ACS];
+		u32 airtime;
+	} tx;
+	struct {
+		u32 pkts[IEEE80211_NUM_ACS];
+		u32 airtime;
+		u32 last_ampdu_ref;
+	} rx;
+	struct {
+		/* track AP's transfer in client mode */
+		u64 rx_bytes;
+		struct ewma_rate rate;
+		bool detected;
+	} uapsd_nonagg_detect;
+};
+
+struct iwl_mvm_tcm {
+	struct delayed_work work;
+	spinlock_t lock; /* used when time elapsed */
+	unsigned long ts; /* timestamp when period ends */
+	unsigned long ll_ts;
+	unsigned long uapsd_nonagg_ts;
+	bool paused;
+	struct iwl_mvm_tcm_mac data[NUM_MAC_INDEX_DRIVER];
+	struct {
+		u32 elapsed; /* milliseconds for this TCM period */
+		u32 airtime[NUM_MAC_INDEX_DRIVER];
+		enum iwl_mvm_traffic_load load[NUM_MAC_INDEX_DRIVER];
+		enum iwl_mvm_traffic_load global_load;
+		bool low_latency[NUM_MAC_INDEX_DRIVER];
+		bool change[NUM_MAC_INDEX_DRIVER];
+		bool global_change;
+	} result;
+};
+
 /**
  * struct iwl_mvm_reorder_buffer - per ra/tid/queue reorder buffer
  * @head_sn: reorder window head sn
@@ -978,6 +1025,9 @@ struct iwl_mvm {
 	 */
 	bool temperature_test;  /* Debug test temperature is enabled */
 
+	unsigned long bt_coex_last_tcm_ts;
+	struct iwl_mvm_tcm tcm;
+
 	struct iwl_time_quota_cmd last_quota_cmd;
 
 #ifdef CONFIG_NL80211_TESTMODE
@@ -1906,6 +1956,15 @@ bool iwl_mvm_is_vif_assoc(struct iwl_mvm *mvm);
 
 void iwl_mvm_inactivity_check(struct iwl_mvm *mvm);
 
+#define MVM_TCM_PERIOD_MSEC 500
+#define MVM_TCM_PERIOD (HZ * MVM_TCM_PERIOD_MSEC / 1000)
+#define MVM_LL_PERIOD (10 * HZ)
+void iwl_mvm_tcm_work(struct work_struct *work);
+void iwl_mvm_recalc_tcm(struct iwl_mvm *mvm);
+void iwl_mvm_pause_tcm(struct iwl_mvm *mvm, bool with_cancel);
+void iwl_mvm_resume_tcm(struct iwl_mvm *mvm);
+u8 iwl_mvm_tcm_load_percentage(u32 airtime, u32 elapsed);
+
 void iwl_mvm_nic_restart(struct iwl_mvm *mvm, bool fw_error);
 unsigned int iwl_mvm_get_wd_timeout(struct iwl_mvm *mvm,
 				    struct ieee80211_vif *vif,
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c
index 224bfa1bcf53..6a3f557543e4 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c
@@ -667,6 +667,12 @@ iwl_op_mode_mvm_start(struct iwl_trans *trans, const struct iwl_cfg *cfg,
 
 	SET_IEEE80211_DEV(mvm->hw, mvm->trans->dev);
 
+	spin_lock_init(&mvm->tcm.lock);
+	INIT_DELAYED_WORK(&mvm->tcm.work, iwl_mvm_tcm_work);
+	mvm->tcm.ts = jiffies;
+	mvm->tcm.ll_ts = jiffies;
+	mvm->tcm.uapsd_nonagg_ts = jiffies;
+
 	INIT_DELAYED_WORK(&mvm->cs_tx_unblock_dwork, iwl_mvm_tx_unblock_dwork);
 
 	/*
@@ -859,6 +865,8 @@ static void iwl_op_mode_mvm_stop(struct iwl_op_mode *op_mode)
 	for (i = 0; i < NVM_MAX_NUM_SECTIONS; i++)
 		kfree(mvm->nvm_sections[i].data);
 
+	cancel_delayed_work_sync(&mvm->tcm.work);
+
 	iwl_mvm_tof_clean(mvm);
 
 	mutex_destroy(&mvm->mutex);
@@ -1432,6 +1440,7 @@ int iwl_mvm_enter_d0i3(struct iwl_op_mode *op_mode)
 		mvm->d0i3_offloading = false;
 	}
 
+	iwl_mvm_pause_tcm(mvm, true);
 	/* make sure we have no running tx while configuring the seqno */
 	synchronize_net();
 
@@ -1615,6 +1624,7 @@ out:
 	/* the FW might have updated the regdomain */
 	iwl_mvm_update_changed_regdom(mvm);
 
+	iwl_mvm_resume_tcm(mvm);
 	iwl_mvm_unref(mvm, IWL_MVM_REF_EXIT_WORK);
 	mutex_unlock(&mvm->mutex);
 }
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rx.c b/drivers/net/wireless/intel/iwlwifi/mvm/rx.c
index d26833c5ce1f..be80294349c3 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rx.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rx.c
@@ -254,6 +254,39 @@ static u32 iwl_mvm_set_mac80211_rx_flag(struct iwl_mvm *mvm,
 	return 0;
 }
 
+static void iwl_mvm_rx_handle_tcm(struct iwl_mvm *mvm,
+				  struct ieee80211_sta *sta,
+				  struct ieee80211_hdr *hdr, u32 len,
+				  struct iwl_rx_phy_info *phy_info,
+				  u32 rate_n_flags)
+{
+	struct iwl_mvm_sta *mvmsta;
+	struct iwl_mvm_tcm_mac *mdata;
+	int mac;
+	int ac = IEEE80211_AC_BE; /* treat non-QoS as BE */
+
+	if (ieee80211_is_data_qos(hdr->frame_control))
+		ac = tid_to_mac80211_ac[ieee80211_get_tid(hdr)];
+
+	mvmsta = iwl_mvm_sta_from_mac80211(sta);
+	mac = mvmsta->mac_id_n_color & FW_CTXT_ID_MSK;
+
+	if (time_after(jiffies, mvm->tcm.ts + MVM_TCM_PERIOD))
+		schedule_delayed_work(&mvm->tcm.work, 0);
+	mdata = &mvm->tcm.data[mac];
+	mdata->rx.pkts[ac]++;
+
+	/* count the airtime only once for each ampdu */
+	if (mdata->rx.last_ampdu_ref != mvm->ampdu_ref) {
+		mdata->rx.last_ampdu_ref = mvm->ampdu_ref;
+		mdata->rx.airtime += le16_to_cpu(phy_info->frame_time);
+	}
+
+	if (!(rate_n_flags & (RATE_MCS_HT_MSK | RATE_MCS_VHT_MSK)))
+		return;
+
+}
+
 static void iwl_mvm_rx_csum(struct ieee80211_sta *sta,
 			    struct sk_buff *skb,
 			    u32 status)
@@ -408,6 +441,12 @@ void iwl_mvm_rx_rx_mpdu(struct iwl_mvm *mvm, struct napi_struct *napi,
 							NULL);
 		}
 
+		if (!mvm->tcm.paused && len >= sizeof(*hdr) &&
+		    !is_multicast_ether_addr(hdr->addr1) &&
+		    ieee80211_is_data(hdr->frame_control))
+			iwl_mvm_rx_handle_tcm(mvm, sta, hdr, len, phy_info,
+					      rate_n_flags);
+
 		if (ieee80211_is_data(hdr->frame_control))
 			iwl_mvm_rx_csum(sta, skb, rx_pkt_status);
 	}
@@ -654,7 +693,7 @@ void iwl_mvm_handle_rx_statistics(struct iwl_mvm *mvm,
 	int expected_size;
 	int i;
 	u8 *energy;
-	__le32 *bytes, *air_time;
+	__le32 *air_time;
 	__le32 flags;
 
 	if (!iwl_mvm_has_new_rx_stats_api(mvm)) {
@@ -729,13 +768,11 @@ void iwl_mvm_handle_rx_statistics(struct iwl_mvm *mvm,
 		struct iwl_notif_statistics_v11 *v11 = (void *)&pkt->data;
 
 		energy = (void *)&v11->load_stats.avg_energy;
-		bytes = (void *)&v11->load_stats.byte_count;
 		air_time = (void *)&v11->load_stats.air_time;
 	} else {
 		struct iwl_notif_statistics_cdb *stats = (void *)&pkt->data;
 
 		energy = (void *)&stats->load_stats.avg_energy;
-		bytes = (void *)&stats->load_stats.byte_count;
 		air_time = (void *)&stats->load_stats.air_time;
 	}
 
@@ -752,6 +789,23 @@ void iwl_mvm_handle_rx_statistics(struct iwl_mvm *mvm,
 		sta->avg_energy = energy[i];
 	}
 	rcu_read_unlock();
+
+	/*
+	 * Don't update in case the statistics are not cleared, since
+	 * we will end up counting twice the same airtime, once in TCM
+	 * request and once in statistics notification.
+	 */
+	if (!(le32_to_cpu(flags) & IWL_STATISTICS_REPLY_FLG_CLEAR))
+		return;
+
+	spin_lock(&mvm->tcm.lock);
+	for (i = 0; i < NUM_MAC_INDEX_DRIVER; i++) {
+		struct iwl_mvm_tcm_mac *mdata = &mvm->tcm.data[i];
+		u32 airtime = le32_to_cpu(air_time[i]);
+
+		mdata->rx.airtime += airtime;
+	}
+	spin_unlock(&mvm->tcm.lock);
 }
 
 void iwl_mvm_rx_statistics(struct iwl_mvm *mvm, struct iwl_rx_cmd_buffer *rxb)
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
index 4a4ccfd11e5b..f9cd7575b422 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
@@ -941,6 +941,12 @@ void iwl_mvm_rx_mpdu_mq(struct iwl_mvm *mvm, struct napi_struct *napi,
 			       IWL_RX_MPDU_REORDER_BAID_MASK) >>
 			       IWL_RX_MPDU_REORDER_BAID_SHIFT);
 
+		if (!mvm->tcm.paused && len >= sizeof(*hdr) &&
+		    !is_multicast_ether_addr(hdr->addr1) &&
+		    ieee80211_is_data(hdr->frame_control) &&
+		    time_after(jiffies, mvm->tcm.ts + MVM_TCM_PERIOD))
+			schedule_delayed_work(&mvm->tcm.work, 0);
+
 		/*
 		 * We have tx blocked stations (with CS bit). If we heard
 		 * frames from a blocked station on a new channel we can
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
index b31f0ffbbbf0..cd7ae6976935 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
@@ -76,12 +76,6 @@
 #define IWL_DENSE_EBS_SCAN_RATIO 5
 #define IWL_SPARSE_EBS_SCAN_RATIO 1
 
-enum iwl_mvm_traffic_load {
-	IWL_MVM_TRAFFIC_LOW,
-	IWL_MVM_TRAFFIC_MEDIUM,
-	IWL_MVM_TRAFFIC_HIGH,
-};
-
 #define IWL_SCAN_DWELL_ACTIVE		10
 #define IWL_SCAN_DWELL_PASSIVE		110
 #define IWL_SCAN_DWELL_FRAGMENTED	44
@@ -437,6 +431,7 @@ void iwl_mvm_rx_lmac_scan_complete_notif(struct iwl_mvm *mvm,
 		ieee80211_scan_completed(mvm->hw, &info);
 		iwl_mvm_unref(mvm, IWL_MVM_REF_SCAN);
 		cancel_delayed_work(&mvm->scan_timeout_dwork);
+		iwl_mvm_resume_tcm(mvm);
 	} else {
 		IWL_ERR(mvm,
 			"got scan complete notification but no scan is running\n");
@@ -1568,6 +1563,8 @@ int iwl_mvm_reg_scan_start(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 	if (ret)
 		return ret;
 
+	iwl_mvm_pause_tcm(mvm, false);
+
 	ret = iwl_mvm_send_cmd(mvm, &hcmd);
 	if (ret) {
 		/* If the scan failed, it usually means that the FW was unable
@@ -1575,6 +1572,7 @@ int iwl_mvm_reg_scan_start(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 		 * should try to send the command again with different params.
 		 */
 		IWL_ERR(mvm, "Scan failed! ret %d\n", ret);
+		iwl_mvm_resume_tcm(mvm);
 		return ret;
 	}
 
@@ -1711,6 +1709,7 @@ void iwl_mvm_rx_umac_scan_complete_notif(struct iwl_mvm *mvm,
 		mvm->scan_vif = NULL;
 		iwl_mvm_unref(mvm, IWL_MVM_REF_SCAN);
 		cancel_delayed_work(&mvm->scan_timeout_dwork);
+		iwl_mvm_resume_tcm(mvm);
 	} else if (mvm->scan_uid_status[uid] == IWL_MVM_SCAN_SCHED) {
 		ieee80211_sched_scan_stopped(mvm->hw);
 		mvm->sched_scan_pass_all = SCHED_SCAN_PASS_ALL_DISABLED;
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
index 795065974d78..f06a0ee7be28 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
@@ -930,6 +930,32 @@ static bool iwl_mvm_txq_should_update(struct iwl_mvm *mvm, int txq_id)
 	return false;
 }
 
+static void iwl_mvm_tx_airtime(struct iwl_mvm *mvm,
+			       struct iwl_mvm_sta *mvmsta,
+			       int airtime)
+{
+	int mac = mvmsta->mac_id_n_color & FW_CTXT_ID_MSK;
+	struct iwl_mvm_tcm_mac *mdata = &mvm->tcm.data[mac];
+
+	if (mvm->tcm.paused)
+		return;
+
+	if (time_after(jiffies, mvm->tcm.ts + MVM_TCM_PERIOD))
+		schedule_delayed_work(&mvm->tcm.work, 0);
+
+	mdata->tx.airtime += airtime;
+}
+
+static void iwl_mvm_tx_pkt_queued(struct iwl_mvm *mvm,
+				  struct iwl_mvm_sta *mvmsta, int tid)
+{
+	u32 ac = tid_to_mac80211_ac[tid];
+	int mac = mvmsta->mac_id_n_color & FW_CTXT_ID_MSK;
+	struct iwl_mvm_tcm_mac *mdata = &mvm->tcm.data[mac];
+
+	mdata->tx.pkts[ac]++;
+}
+
 /*
  * Sets the fields in the Tx cmd that are crypto related
  */
@@ -1067,6 +1093,8 @@ static int iwl_mvm_tx_mpdu(struct iwl_mvm *mvm, struct sk_buff *skb,
 
 	spin_unlock(&mvmsta->lock);
 
+	iwl_mvm_tx_pkt_queued(mvm, mvmsta, tid == IWL_MAX_TID_COUNT ? 0 : tid);
+
 	return 0;
 
 drop_unlock_sta:
@@ -1469,6 +1497,9 @@ static void iwl_mvm_rx_tx_cmd_single(struct iwl_mvm *mvm,
 	if (!IS_ERR(sta)) {
 		mvmsta = iwl_mvm_sta_from_mac80211(sta);
 
+		iwl_mvm_tx_airtime(mvm, mvmsta,
+				   le16_to_cpu(tx_resp->wireless_media_time));
+
 		if (tid != IWL_TID_NON_QOS && tid != IWL_MGMT_TID) {
 			struct iwl_mvm_tid_data *tid_data =
 				&mvmsta->tid_data[tid];
@@ -1610,6 +1641,8 @@ static void iwl_mvm_rx_tx_cmd_agg(struct iwl_mvm *mvm,
 			le16_to_cpu(tx_resp->wireless_media_time);
 		mvmsta->tid_data[tid].lq_color =
 			TX_RES_RATE_TABLE_COL_GET(tx_resp->tlc_info);
+		iwl_mvm_tx_airtime(mvm, mvmsta,
+				   le16_to_cpu(tx_resp->wireless_media_time));
 	}
 
 	rcu_read_unlock();
@@ -1800,6 +1833,8 @@ void iwl_mvm_rx_ba_notif(struct iwl_mvm *mvm, struct iwl_rx_cmd_buffer *rxb)
 					   le32_to_cpu(ba_res->tx_rate));
 		}
 
+		iwl_mvm_tx_airtime(mvm, mvmsta,
+				   le32_to_cpu(ba_res->wireless_time));
 out_unlock:
 		rcu_read_unlock();
 out:
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
index d99d9ea78e4c..27905f3fe3ba 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
@@ -1429,6 +1429,237 @@ void iwl_mvm_event_frame_timeout_callback(struct iwl_mvm *mvm,
 				sta->addr, tid);
 }
 
+u8 iwl_mvm_tcm_load_percentage(u32 airtime, u32 elapsed)
+{
+	if (!elapsed)
+		return 0;
+
+	return (100 * airtime / elapsed) / USEC_PER_MSEC;
+}
+
+static enum iwl_mvm_traffic_load
+iwl_mvm_tcm_load(struct iwl_mvm *mvm, u32 airtime, unsigned long elapsed)
+{
+	u8 load = iwl_mvm_tcm_load_percentage(airtime, elapsed);
+
+	if (load > IWL_MVM_TCM_LOAD_HIGH_THRESH)
+		return IWL_MVM_TRAFFIC_HIGH;
+	if (load > IWL_MVM_TCM_LOAD_MEDIUM_THRESH)
+		return IWL_MVM_TRAFFIC_MEDIUM;
+
+	return IWL_MVM_TRAFFIC_LOW;
+}
+
+struct iwl_mvm_tcm_iter_data {
+	struct iwl_mvm *mvm;
+	bool any_sent;
+};
+
+static void iwl_mvm_tcm_iter(void *_data, u8 *mac, struct ieee80211_vif *vif)
+{
+	struct iwl_mvm_tcm_iter_data *data = _data;
+	struct iwl_mvm *mvm = data->mvm;
+	struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif);
+	bool low_latency, prev = mvmvif->low_latency & LOW_LATENCY_TRAFFIC;
+
+	if (mvmvif->id >= NUM_MAC_INDEX_DRIVER)
+		return;
+
+	low_latency = mvm->tcm.result.low_latency[mvmvif->id];
+
+	if (!mvm->tcm.result.change[mvmvif->id] &&
+	    prev == low_latency) {
+		iwl_mvm_update_quotas(mvm, false, NULL);
+		return;
+	}
+
+	if (prev != low_latency) {
+		/* this sends traffic load and updates quota as well */
+		iwl_mvm_update_low_latency(mvm, vif, low_latency,
+					   LOW_LATENCY_TRAFFIC);
+	} else {
+		iwl_mvm_update_quotas(mvm, false, NULL);
+	}
+
+	data->any_sent = true;
+}
+
+static void iwl_mvm_tcm_results(struct iwl_mvm *mvm)
+{
+	struct iwl_mvm_tcm_iter_data data = {
+		.mvm = mvm,
+		.any_sent = false,
+	};
+
+	mutex_lock(&mvm->mutex);
+
+	ieee80211_iterate_active_interfaces(
+		mvm->hw, IEEE80211_IFACE_ITER_NORMAL,
+		iwl_mvm_tcm_iter, &data);
+
+	if (fw_has_capa(&mvm->fw->ucode_capa, IWL_UCODE_TLV_CAPA_UMAC_SCAN))
+		iwl_mvm_config_scan(mvm);
+
+	mutex_unlock(&mvm->mutex);
+}
+
+
+static unsigned long iwl_mvm_calc_tcm_stats(struct iwl_mvm *mvm,
+					    unsigned long ts,
+					    bool handle_uapsd)
+{
+	unsigned int elapsed = jiffies_to_msecs(ts - mvm->tcm.ts);
+	u32 total_airtime = 0;
+	int ac, mac;
+	bool low_latency = false;
+	enum iwl_mvm_traffic_load load;
+	bool handle_ll = time_after(ts, mvm->tcm.ll_ts + MVM_LL_PERIOD);
+
+	if (handle_ll)
+		mvm->tcm.ll_ts = ts;
+	if (handle_uapsd)
+		mvm->tcm.uapsd_nonagg_ts = ts;
+
+	mvm->tcm.result.elapsed = elapsed;
+
+	for (mac = 0; mac < NUM_MAC_INDEX_DRIVER; mac++) {
+		struct iwl_mvm_tcm_mac *mdata = &mvm->tcm.data[mac];
+		u32 vo_vi_pkts = 0;
+		u32 airtime = mdata->rx.airtime + mdata->tx.airtime;
+
+		total_airtime += airtime;
+
+		load = iwl_mvm_tcm_load(mvm, airtime, elapsed);
+		mvm->tcm.result.change[mac] = load != mvm->tcm.result.load[mac];
+		mvm->tcm.result.load[mac] = load;
+		mvm->tcm.result.airtime[mac] = airtime;
+
+		for (ac = IEEE80211_AC_VO; ac <= IEEE80211_AC_VI; ac++)
+			vo_vi_pkts += mdata->rx.pkts[ac] +
+				      mdata->tx.pkts[ac];
+
+		/* enable immediately with enough packets but defer disabling */
+		if (vo_vi_pkts > IWL_MVM_TCM_LOWLAT_ENABLE_THRESH)
+			mvm->tcm.result.low_latency[mac] = true;
+		else if (handle_ll)
+			mvm->tcm.result.low_latency[mac] = false;
+
+		if (handle_ll) {
+			/* clear old data */
+			memset(&mdata->rx.pkts, 0, sizeof(mdata->rx.pkts));
+			memset(&mdata->tx.pkts, 0, sizeof(mdata->tx.pkts));
+		}
+		low_latency |= mvm->tcm.result.low_latency[mac];
+
+		memset(&mdata->rx.airtime, 0, sizeof(mdata->rx.airtime));
+		memset(&mdata->tx.airtime, 0, sizeof(mdata->tx.airtime));
+	}
+
+	load = iwl_mvm_tcm_load(mvm, total_airtime, elapsed);
+	mvm->tcm.result.global_change = load != mvm->tcm.result.global_load;
+	mvm->tcm.result.global_load = load;
+
+	/*
+	 * If the current load isn't low we need to force re-evaluation
+	 * in the TCM period, so that we can return to low load if there
+	 * was no traffic at all (and thus iwl_mvm_recalc_tcm didn't get
+	 * triggered by traffic).
+	 */
+	if (load != IWL_MVM_TRAFFIC_LOW)
+		return MVM_TCM_PERIOD;
+	/*
+	 * If low-latency is active we need to force re-evaluation after
+	 * (the longer) MVM_LL_PERIOD, so that we can disable low-latency
+	 * when there's no traffic at all.
+	 */
+	if (low_latency)
+		return MVM_LL_PERIOD;
+	/*
+	 * Otherwise, we don't need to run the work struct because we're
+	 * in the default "idle" state - traffic indication is low (which
+	 * also covers the "no traffic" case) and low-latency is disabled
+	 * so there's no state that may need to be disabled when there's
+	 * no traffic at all.
+	 *
+	 * Note that this has no impact on the regular scheduling of the
+	 * updates triggered by traffic - those happen whenever one of the
+	 * two timeouts expire (if there's traffic at all.)
+	 */
+	return 0;
+}
+
+void iwl_mvm_recalc_tcm(struct iwl_mvm *mvm)
+{
+	unsigned long ts = jiffies;
+	bool handle_uapsd =
+		false;
+
+	spin_lock(&mvm->tcm.lock);
+	if (mvm->tcm.paused || !time_after(ts, mvm->tcm.ts + MVM_TCM_PERIOD)) {
+		spin_unlock(&mvm->tcm.lock);
+		return;
+	}
+	spin_unlock(&mvm->tcm.lock);
+
+
+	spin_lock(&mvm->tcm.lock);
+	/* re-check if somebody else won the recheck race */
+	if (!mvm->tcm.paused && time_after(ts, mvm->tcm.ts + MVM_TCM_PERIOD)) {
+		/* calculate statistics */
+		unsigned long work_delay = iwl_mvm_calc_tcm_stats(mvm, ts,
+								  handle_uapsd);
+
+		/* the memset needs to be visible before the timestamp */
+		smp_mb();
+		mvm->tcm.ts = ts;
+		if (work_delay)
+			schedule_delayed_work(&mvm->tcm.work, work_delay);
+	}
+	spin_unlock(&mvm->tcm.lock);
+
+	iwl_mvm_tcm_results(mvm);
+}
+
+void iwl_mvm_tcm_work(struct work_struct *work)
+{
+	struct delayed_work *delayed_work = to_delayed_work(work);
+	struct iwl_mvm *mvm = container_of(delayed_work, struct iwl_mvm,
+					   tcm.work);
+
+	iwl_mvm_recalc_tcm(mvm);
+}
+
+void iwl_mvm_pause_tcm(struct iwl_mvm *mvm, bool with_cancel)
+{
+	spin_lock_bh(&mvm->tcm.lock);
+	mvm->tcm.paused = true;
+	spin_unlock_bh(&mvm->tcm.lock);
+	if (with_cancel)
+		cancel_delayed_work_sync(&mvm->tcm.work);
+}
+
+void iwl_mvm_resume_tcm(struct iwl_mvm *mvm)
+{
+	int mac;
+
+	spin_lock_bh(&mvm->tcm.lock);
+	mvm->tcm.ts = jiffies;
+	mvm->tcm.ll_ts = jiffies;
+	for (mac = 0; mac < NUM_MAC_INDEX_DRIVER; mac++) {
+		struct iwl_mvm_tcm_mac *mdata = &mvm->tcm.data[mac];
+
+		memset(&mdata->rx.pkts, 0, sizeof(mdata->rx.pkts));
+		memset(&mdata->tx.pkts, 0, sizeof(mdata->tx.pkts));
+		memset(&mdata->rx.airtime, 0, sizeof(mdata->rx.airtime));
+		memset(&mdata->tx.airtime, 0, sizeof(mdata->tx.airtime));
+	}
+	/* The TCM data needs to be reset before "paused" flag changes */
+	smp_mb();
+	mvm->tcm.paused = false;
+	spin_unlock_bh(&mvm->tcm.lock);
+}
+
+
 void iwl_mvm_get_sync_time(struct iwl_mvm *mvm, u32 *gp2, u64 *boottime)
 {
 	bool ps_disabled;
-- 
cgit v1.2.3-59-g8ed1b


From 85207c66a2f091c005b98fc4976c8daf297b56e7 Mon Sep 17 00:00:00 2001
From: Luca Coelho <luciano.coelho@intel.com>
Date: Fri, 13 Apr 2018 08:58:35 +0300
Subject: iwlwifi: mvm: use TCM data to decide scan priority

The code for changing the scan priority is already implemented, but
isn't yet in use.  Now that TCM data is available, we can base the
scan priority decision on the traffic load.

Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/scan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
index cd7ae6976935..8f3185b9d365 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
@@ -228,7 +228,7 @@ static void iwl_mvm_scan_condition_iterator(void *data, u8 *mac,
 
 static enum iwl_mvm_traffic_load iwl_mvm_get_traffic_load(struct iwl_mvm *mvm)
 {
-	return IWL_MVM_TRAFFIC_LOW;
+	return mvm->tcm.result.global_load;
 }
 
 static enum
-- 
cgit v1.2.3-59-g8ed1b


From bde1492d4a15ae37e1388a6f5a7972afb7ca32e3 Mon Sep 17 00:00:00 2001
From: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Date: Tue, 9 Dec 2014 17:08:41 +0200
Subject: iwlwifi: mvm: BT Coex - make the primary / secondary pick traffic
 aware

The primary channel is the channel that will be untouched by BT. The
secondary channel might be touched by BT.  Hence, we want the primary
to be the most active channel.  To do so, use the TCM infrastructure.

Since the BT keeps sending notifications, we can rely on them to
trigger the check. Every 10 seconds, we will check what is the most
active context and chose the right primary.

We need to wait 10 seconds before we modify the settings because
frequent changes in these settings can confuse BT.

Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/coex.c | 37 +++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/coex.c b/drivers/net/wireless/intel/iwlwifi/mvm/coex.c
index 890dbfff3a06..016e03a5034f 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/coex.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/coex.c
@@ -279,6 +279,8 @@ struct iwl_bt_iterator_data {
 	struct ieee80211_chanctx_conf *primary;
 	struct ieee80211_chanctx_conf *secondary;
 	bool primary_ll;
+	u8 primary_load;
+	u8 secondary_load;
 };
 
 static inline
@@ -295,6 +297,30 @@ void iwl_mvm_bt_coex_enable_rssi_event(struct iwl_mvm *mvm,
 		enable ? -IWL_MVM_BT_COEX_DIS_RED_TXP_THRESH : 0;
 }
 
+#define MVM_COEX_TCM_PERIOD (HZ * 10)
+
+static void iwl_mvm_bt_coex_tcm_based_ci(struct iwl_mvm *mvm,
+					 struct iwl_bt_iterator_data *data)
+{
+	unsigned long now = jiffies;
+
+	if (!time_after(now, mvm->bt_coex_last_tcm_ts + MVM_COEX_TCM_PERIOD))
+		return;
+
+	mvm->bt_coex_last_tcm_ts = now;
+
+	/* We assume here that we don't have more than 2 vifs on 2.4GHz */
+
+	/* if the primary is low latency, it will stay primary */
+	if (data->primary_ll)
+		return;
+
+	if (data->primary_load >= data->secondary_load)
+		return;
+
+	swap(data->primary, data->secondary);
+}
+
 /* must be called under rcu_read_lock */
 static void iwl_mvm_bt_notif_iterator(void *_data, u8 *mac,
 				      struct ieee80211_vif *vif)
@@ -385,6 +411,11 @@ static void iwl_mvm_bt_notif_iterator(void *_data, u8 *mac,
 			/* there is low latency vif - we will be secondary */
 			data->secondary = chanctx_conf;
 		}
+
+		if (data->primary == chanctx_conf)
+			data->primary_load = mvm->tcm.result.load[mvmvif->id];
+		else if (data->secondary == chanctx_conf)
+			data->secondary_load = mvm->tcm.result.load[mvmvif->id];
 		return;
 	}
 
@@ -398,6 +429,10 @@ static void iwl_mvm_bt_notif_iterator(void *_data, u8 *mac,
 		/* if secondary is not NULL, it might be a GO */
 		data->secondary = chanctx_conf;
 
+	if (data->primary == chanctx_conf)
+		data->primary_load = mvm->tcm.result.load[mvmvif->id];
+	else if (data->secondary == chanctx_conf)
+		data->secondary_load = mvm->tcm.result.load[mvmvif->id];
 	/*
 	 * don't reduce the Tx power if one of these is true:
 	 *  we are in LOOSE
@@ -449,6 +484,8 @@ static void iwl_mvm_bt_coex_notif_handle(struct iwl_mvm *mvm)
 					mvm->hw, IEEE80211_IFACE_ITER_NORMAL,
 					iwl_mvm_bt_notif_iterator, &data);
 
+	iwl_mvm_bt_coex_tcm_based_ci(mvm, &data);
+
 	if (data.primary) {
 		struct ieee80211_chanctx_conf *chan = data.primary;
 		if (WARN_ON(!chan->def.chan)) {
-- 
cgit v1.2.3-59-g8ed1b


From b0ffe455bc5bbdbcf7837274d2476f5597767237 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 11 Nov 2014 12:57:03 +0100
Subject: iwlwifi: mvm: detect U-APSD breaking aggregation

Try to detect that the AP is not using aggregation even when there's
enough traffic to make it worthwhile; if this is the case and U-APSD
is enabled then assume the AP is broken (like so many) and doesn't
enable aggregation when U-APSD is used. In this case, disconnect from
the AP and blacklist U-APSD for a potential new connection to it.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/constants.h |   4 +
 drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c   |  25 ++++
 drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c  |  33 ++++++
 drivers/net/wireless/intel/iwlwifi/mvm/mvm.h       |   9 ++
 drivers/net/wireless/intel/iwlwifi/mvm/rx.c        |  47 ++++++++
 drivers/net/wireless/intel/iwlwifi/mvm/utils.c     | 127 ++++++++++++++++++++-
 6 files changed, 244 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/constants.h b/drivers/net/wireless/intel/iwlwifi/mvm/constants.h
index 2763fc69f04b..d61ff66ce07b 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/constants.h
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/constants.h
@@ -69,6 +69,8 @@
 
 #include <linux/ieee80211.h>
 
+#define IWL_MVM_UAPSD_NOAGG_BSSIDS_NUM		20
+
 #define IWL_MVM_DEFAULT_PS_TX_DATA_TIMEOUT	(100 * USEC_PER_MSEC)
 #define IWL_MVM_DEFAULT_PS_RX_DATA_TIMEOUT	(100 * USEC_PER_MSEC)
 #define IWL_MVM_WOWLAN_PS_TX_DATA_TIMEOUT	(10 * USEC_PER_MSEC)
@@ -115,6 +117,8 @@
 #define IWL_MVM_TCM_LOAD_MEDIUM_THRESH		10 /* percentage */
 #define IWL_MVM_TCM_LOAD_HIGH_THRESH		50 /* percentage */
 #define IWL_MVM_TCM_LOWLAT_ENABLE_THRESH	100 /* packets/10 seconds */
+#define IWL_MVM_UAPSD_NONAGG_PERIOD		5000 /* msecs */
+#define IWL_MVM_UAPSD_NOAGG_LIST_LEN		IWL_MVM_UAPSD_NOAGG_BSSIDS_NUM
 #define IWL_MVM_RS_NUM_TRY_BEFORE_ANT_TOGGLE    1
 #define IWL_MVM_RS_HT_VHT_RETRIES_PER_RATE      2
 #define IWL_MVM_RS_HT_VHT_RETRIES_PER_RATE_TW   1
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c b/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c
index 0e6401cd7ccc..1c4178f20441 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c
@@ -1728,6 +1728,27 @@ iwl_dbgfs_send_echo_cmd_write(struct iwl_mvm *mvm, char *buf,
 	return ret ?: count;
 }
 
+static ssize_t
+iwl_dbgfs_uapsd_noagg_bssids_read(struct file *file, char __user *user_buf,
+				  size_t count, loff_t *ppos)
+{
+	struct iwl_mvm *mvm = file->private_data;
+	u8 buf[IWL_MVM_UAPSD_NOAGG_BSSIDS_NUM * ETH_ALEN * 3 + 1];
+	unsigned int pos = 0;
+	size_t bufsz = sizeof(buf);
+	int i;
+
+	mutex_lock(&mvm->mutex);
+
+	for (i = 0; i < IWL_MVM_UAPSD_NOAGG_LIST_LEN; i++)
+		pos += scnprintf(buf + pos, bufsz - pos, "%pM\n",
+				 mvm->uapsd_noagg_bssids[i].addr);
+
+	mutex_unlock(&mvm->mutex);
+
+	return simple_read_from_buffer(user_buf, count, ppos, buf, pos);
+}
+
 MVM_DEBUGFS_READ_WRITE_FILE_OPS(prph_reg, 64);
 
 /* Device wide debugfs entries */
@@ -1762,6 +1783,8 @@ MVM_DEBUGFS_WRITE_FILE_OPS(indirection_tbl,
 			   (IWL_RSS_INDIRECTION_TABLE_SIZE * 2));
 MVM_DEBUGFS_WRITE_FILE_OPS(inject_packet, 512);
 
+MVM_DEBUGFS_READ_FILE_OPS(uapsd_noagg_bssids);
+
 #ifdef CONFIG_IWLWIFI_BCAST_FILTERING
 MVM_DEBUGFS_READ_WRITE_FILE_OPS(bcast_filters, 256);
 MVM_DEBUGFS_READ_WRITE_FILE_OPS(bcast_filters_macs, 256);
@@ -1972,6 +1995,8 @@ int iwl_mvm_dbgfs_register(struct iwl_mvm *mvm, struct dentry *dbgfs_dir)
 				 mvm->debugfs_dir, &mvm->drop_bcn_ap_mode))
 		goto err;
 
+	MVM_DEBUGFS_ADD_FILE(uapsd_noagg_bssids, mvm->debugfs_dir, S_IRUSR);
+
 #ifdef CONFIG_IWLWIFI_BCAST_FILTERING
 	if (mvm->fw->ucode_capa.flags & IWL_UCODE_TLV_FLAGS_BCAST_FILTERING) {
 		bcast_dir = debugfs_create_dir("bcast_filtering",
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
index 51b30424575b..08318bdaaf2e 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
@@ -952,6 +952,16 @@ static int iwl_mvm_mac_ampdu_action(struct ieee80211_hw *hw,
 
 	switch (action) {
 	case IEEE80211_AMPDU_RX_START:
+		if (iwl_mvm_vif_from_mac80211(vif)->ap_sta_id ==
+				iwl_mvm_sta_from_mac80211(sta)->sta_id) {
+			struct iwl_mvm_vif *mvmvif;
+			u16 macid = iwl_mvm_vif_from_mac80211(vif)->id;
+			struct iwl_mvm_tcm_mac *mdata = &mvm->tcm.data[macid];
+
+			mdata->opened_rx_ba_sessions = true;
+			mvmvif = iwl_mvm_vif_from_mac80211(vif);
+			cancel_delayed_work(&mvmvif->uapsd_nonagg_detected_wk);
+		}
 		if (!iwl_enable_rx_ampdu(mvm->cfg)) {
 			ret = -EINVAL;
 			break;
@@ -1435,6 +1445,8 @@ static int iwl_mvm_mac_add_interface(struct ieee80211_hw *hw,
 		mvm->p2p_device_vif = vif;
 	}
 
+	iwl_mvm_tcm_add_vif(mvm, vif);
+
 	if (vif->type == NL80211_IFTYPE_MONITOR)
 		mvm->monitor_on = true;
 
@@ -1486,6 +1498,10 @@ static void iwl_mvm_mac_remove_interface(struct ieee80211_hw *hw,
 
 	iwl_mvm_prepare_mac_removal(mvm, vif);
 
+	if (!(vif->type == NL80211_IFTYPE_AP ||
+	      vif->type == NL80211_IFTYPE_ADHOC))
+		iwl_mvm_tcm_rm_vif(mvm, vif);
+
 	mutex_lock(&mvm->mutex);
 
 	if (mvm->bf_allowed_vif == mvmvif) {
@@ -2535,6 +2551,16 @@ static void iwl_mvm_sta_pre_rcu_remove(struct ieee80211_hw *hw,
 static void iwl_mvm_check_uapsd(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 				const u8 *bssid)
 {
+	int i;
+
+	if (!test_bit(IWL_MVM_STATUS_IN_HW_RESTART, &mvm->status)) {
+		struct iwl_mvm_tcm_mac *mdata;
+
+		mdata = &mvm->tcm.data[iwl_mvm_vif_from_mac80211(vif)->id];
+		ewma_rate_init(&mdata->uapsd_nonagg_detect.rate);
+		mdata->opened_rx_ba_sessions = false;
+	}
+
 	if (!(mvm->fw->ucode_capa.flags & IWL_UCODE_TLV_FLAGS_UAPSD_SUPPORT))
 		return;
 
@@ -2549,6 +2575,13 @@ static void iwl_mvm_check_uapsd(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 		return;
 	}
 
+	for (i = 0; i < IWL_MVM_UAPSD_NOAGG_LIST_LEN; i++) {
+		if (ether_addr_equal(mvm->uapsd_noagg_bssids[i].addr, bssid)) {
+			vif->driver_flags &= ~IEEE80211_VIF_SUPPORTS_UAPSD;
+			return;
+		}
+	}
+
 	vif->driver_flags |= IEEE80211_VIF_SUPPORTS_UAPSD;
 }
 
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
index 72bab44082ea..acc36eb1ef39 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
@@ -446,6 +446,8 @@ struct iwl_mvm_vif {
 	/* FW identified misbehaving AP */
 	u8 uapsd_misbehaving_bssid[ETH_ALEN];
 
+	struct delayed_work uapsd_nonagg_detected_wk;
+
 	/* Indicates that CSA countdown may be started */
 	bool csa_countdown;
 	bool csa_failed;
@@ -621,6 +623,7 @@ struct iwl_mvm_tcm_mac {
 		struct ewma_rate rate;
 		bool detected;
 	} uapsd_nonagg_detect;
+	bool opened_rx_ba_sessions;
 };
 
 struct iwl_mvm_tcm {
@@ -1028,6 +1031,10 @@ struct iwl_mvm {
 	unsigned long bt_coex_last_tcm_ts;
 	struct iwl_mvm_tcm tcm;
 
+	u8 uapsd_noagg_bssid_write_idx;
+	struct mac_address uapsd_noagg_bssids[IWL_MVM_UAPSD_NOAGG_BSSIDS_NUM]
+		__aligned(2);
+
 	struct iwl_time_quota_cmd last_quota_cmd;
 
 #ifdef CONFIG_NL80211_TESTMODE
@@ -1963,6 +1970,8 @@ void iwl_mvm_tcm_work(struct work_struct *work);
 void iwl_mvm_recalc_tcm(struct iwl_mvm *mvm);
 void iwl_mvm_pause_tcm(struct iwl_mvm *mvm, bool with_cancel);
 void iwl_mvm_resume_tcm(struct iwl_mvm *mvm);
+void iwl_mvm_tcm_add_vif(struct iwl_mvm *mvm, struct ieee80211_vif *vif);
+void iwl_mvm_tcm_rm_vif(struct iwl_mvm *mvm, struct ieee80211_vif *vif);
 u8 iwl_mvm_tcm_load_percentage(u32 airtime, u32 elapsed);
 
 void iwl_mvm_nic_restart(struct iwl_mvm *mvm, bool fw_error);
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rx.c b/drivers/net/wireless/intel/iwlwifi/mvm/rx.c
index be80294349c3..bfb163419c67 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rx.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rx.c
@@ -264,6 +264,12 @@ static void iwl_mvm_rx_handle_tcm(struct iwl_mvm *mvm,
 	struct iwl_mvm_tcm_mac *mdata;
 	int mac;
 	int ac = IEEE80211_AC_BE; /* treat non-QoS as BE */
+	struct iwl_mvm_vif *mvmvif;
+	/* expected throughput in 100Kbps, single stream, 20 MHz */
+	static const u8 thresh_tpt[] = {
+		9, 18, 30, 42, 60, 78, 90, 96, 120, 135,
+	};
+	u16 thr;
 
 	if (ieee80211_is_data_qos(hdr->frame_control))
 		ac = tid_to_mac80211_ac[ieee80211_get_tid(hdr)];
@@ -285,6 +291,35 @@ static void iwl_mvm_rx_handle_tcm(struct iwl_mvm *mvm,
 	if (!(rate_n_flags & (RATE_MCS_HT_MSK | RATE_MCS_VHT_MSK)))
 		return;
 
+	mvmvif = iwl_mvm_vif_from_mac80211(mvmsta->vif);
+
+	if (mdata->opened_rx_ba_sessions ||
+	    mdata->uapsd_nonagg_detect.detected ||
+	    (!mvmvif->queue_params[IEEE80211_AC_VO].uapsd &&
+	     !mvmvif->queue_params[IEEE80211_AC_VI].uapsd &&
+	     !mvmvif->queue_params[IEEE80211_AC_BE].uapsd &&
+	     !mvmvif->queue_params[IEEE80211_AC_BK].uapsd) ||
+	    mvmsta->sta_id != mvmvif->ap_sta_id)
+		return;
+
+	if (rate_n_flags & RATE_MCS_HT_MSK) {
+		thr = thresh_tpt[rate_n_flags & RATE_HT_MCS_RATE_CODE_MSK];
+		thr *= 1 + ((rate_n_flags & RATE_HT_MCS_NSS_MSK) >>
+					RATE_HT_MCS_NSS_POS);
+	} else {
+		if (WARN_ON((rate_n_flags & RATE_VHT_MCS_RATE_CODE_MSK) >=
+				ARRAY_SIZE(thresh_tpt)))
+			return;
+		thr = thresh_tpt[rate_n_flags & RATE_VHT_MCS_RATE_CODE_MSK];
+		thr *= 1 + ((rate_n_flags & RATE_VHT_MCS_NSS_MSK) >>
+					RATE_VHT_MCS_NSS_POS);
+	}
+
+	thr <<= ((rate_n_flags & RATE_MCS_CHAN_WIDTH_MSK) >>
+				RATE_MCS_CHAN_WIDTH_POS);
+
+	mdata->uapsd_nonagg_detect.rx_bytes += len;
+	ewma_rate_add(&mdata->uapsd_nonagg_detect.rate, thr);
 }
 
 static void iwl_mvm_rx_csum(struct ieee80211_sta *sta,
@@ -693,6 +728,7 @@ void iwl_mvm_handle_rx_statistics(struct iwl_mvm *mvm,
 	int expected_size;
 	int i;
 	u8 *energy;
+	__le32 *bytes;
 	__le32 *air_time;
 	__le32 flags;
 
@@ -768,11 +804,13 @@ void iwl_mvm_handle_rx_statistics(struct iwl_mvm *mvm,
 		struct iwl_notif_statistics_v11 *v11 = (void *)&pkt->data;
 
 		energy = (void *)&v11->load_stats.avg_energy;
+		bytes = (void *)&v11->load_stats.byte_count;
 		air_time = (void *)&v11->load_stats.air_time;
 	} else {
 		struct iwl_notif_statistics_cdb *stats = (void *)&pkt->data;
 
 		energy = (void *)&stats->load_stats.avg_energy;
+		bytes = (void *)&stats->load_stats.byte_count;
 		air_time = (void *)&stats->load_stats.air_time;
 	}
 
@@ -802,6 +840,15 @@ void iwl_mvm_handle_rx_statistics(struct iwl_mvm *mvm,
 	for (i = 0; i < NUM_MAC_INDEX_DRIVER; i++) {
 		struct iwl_mvm_tcm_mac *mdata = &mvm->tcm.data[i];
 		u32 airtime = le32_to_cpu(air_time[i]);
+		u32 rx_bytes = le32_to_cpu(bytes[i]);
+
+		mdata->uapsd_nonagg_detect.rx_bytes += rx_bytes;
+		if (airtime) {
+			/* re-init every time to store rate from FW */
+			ewma_rate_init(&mdata->uapsd_nonagg_detect.rate);
+			ewma_rate_add(&mdata->uapsd_nonagg_detect.rate,
+				      rx_bytes * 8 / airtime);
+		}
 
 		mdata->rx.airtime += airtime;
 	}
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
index 27905f3fe3ba..ea0332a2389e 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
@@ -1503,12 +1503,109 @@ static void iwl_mvm_tcm_results(struct iwl_mvm *mvm)
 	mutex_unlock(&mvm->mutex);
 }
 
+static void iwl_mvm_tcm_uapsd_nonagg_detected_wk(struct work_struct *wk)
+{
+	struct iwl_mvm *mvm;
+	struct iwl_mvm_vif *mvmvif;
+	struct ieee80211_vif *vif;
+
+	mvmvif = container_of(wk, struct iwl_mvm_vif,
+			      uapsd_nonagg_detected_wk.work);
+	vif = container_of((void *)mvmvif, struct ieee80211_vif, drv_priv);
+	mvm = mvmvif->mvm;
+
+	if (mvm->tcm.data[mvmvif->id].opened_rx_ba_sessions)
+		return;
+
+	/* remember that this AP is broken */
+	memcpy(mvm->uapsd_noagg_bssids[mvm->uapsd_noagg_bssid_write_idx].addr,
+	       vif->bss_conf.bssid, ETH_ALEN);
+	mvm->uapsd_noagg_bssid_write_idx++;
+	if (mvm->uapsd_noagg_bssid_write_idx >= IWL_MVM_UAPSD_NOAGG_LIST_LEN)
+		mvm->uapsd_noagg_bssid_write_idx = 0;
+
+	iwl_mvm_connection_loss(mvm, vif,
+				"AP isn't using AMPDU with uAPSD enabled");
+}
+
+static void iwl_mvm_uapsd_agg_disconnect_iter(void *data, u8 *mac,
+					      struct ieee80211_vif *vif)
+{
+	struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif);
+	struct iwl_mvm *mvm = mvmvif->mvm;
+	int *mac_id = data;
+
+	if (vif->type != NL80211_IFTYPE_STATION)
+		return;
+
+	if (mvmvif->id != *mac_id)
+		return;
+
+	if (!vif->bss_conf.assoc)
+		return;
+
+	if (!mvmvif->queue_params[IEEE80211_AC_VO].uapsd &&
+	    !mvmvif->queue_params[IEEE80211_AC_VI].uapsd &&
+	    !mvmvif->queue_params[IEEE80211_AC_BE].uapsd &&
+	    !mvmvif->queue_params[IEEE80211_AC_BK].uapsd)
+		return;
+
+	if (mvm->tcm.data[*mac_id].uapsd_nonagg_detect.detected)
+		return;
+
+	mvm->tcm.data[*mac_id].uapsd_nonagg_detect.detected = true;
+	IWL_INFO(mvm,
+		 "detected AP should do aggregation but isn't, likely due to U-APSD\n");
+	schedule_delayed_work(&mvmvif->uapsd_nonagg_detected_wk, 15 * HZ);
+}
+
+static void iwl_mvm_check_uapsd_agg_expected_tpt(struct iwl_mvm *mvm,
+						 unsigned int elapsed,
+						 int mac)
+{
+	u64 bytes = mvm->tcm.data[mac].uapsd_nonagg_detect.rx_bytes;
+	u64 tpt;
+	unsigned long rate;
+
+	rate = ewma_rate_read(&mvm->tcm.data[mac].uapsd_nonagg_detect.rate);
+
+	if (!rate || mvm->tcm.data[mac].opened_rx_ba_sessions ||
+	    mvm->tcm.data[mac].uapsd_nonagg_detect.detected)
+		return;
+
+	if (iwl_mvm_has_new_rx_api(mvm)) {
+		tpt = 8 * bytes; /* kbps */
+		do_div(tpt, elapsed);
+		rate *= 1000; /* kbps */
+		if (tpt < 22 * rate / 100)
+			return;
+	} else {
+		/*
+		 * the rate here is actually the threshold, in 100Kbps units,
+		 * so do the needed conversion from bytes to 100Kbps:
+		 * 100kb = bits / (100 * 1000),
+		 * 100kbps = 100kb / (msecs / 1000) ==
+		 *           (bits / (100 * 1000)) / (msecs / 1000) ==
+		 *           bits / (100 * msecs)
+		 */
+		tpt = (8 * bytes);
+		do_div(tpt, elapsed * 100);
+		if (tpt < rate)
+			return;
+	}
+
+	ieee80211_iterate_active_interfaces_atomic(
+		mvm->hw, IEEE80211_IFACE_ITER_NORMAL,
+		iwl_mvm_uapsd_agg_disconnect_iter, &mac);
+}
 
 static unsigned long iwl_mvm_calc_tcm_stats(struct iwl_mvm *mvm,
 					    unsigned long ts,
 					    bool handle_uapsd)
 {
 	unsigned int elapsed = jiffies_to_msecs(ts - mvm->tcm.ts);
+	unsigned int uapsd_elapsed =
+		jiffies_to_msecs(ts - mvm->tcm.uapsd_nonagg_ts);
 	u32 total_airtime = 0;
 	int ac, mac;
 	bool low_latency = false;
@@ -1551,6 +1648,12 @@ static unsigned long iwl_mvm_calc_tcm_stats(struct iwl_mvm *mvm,
 		}
 		low_latency |= mvm->tcm.result.low_latency[mac];
 
+		if (!mvm->tcm.result.low_latency[mac] && handle_uapsd)
+			iwl_mvm_check_uapsd_agg_expected_tpt(mvm, uapsd_elapsed,
+							     mac);
+		/* clear old data */
+		if (handle_uapsd)
+			mdata->uapsd_nonagg_detect.rx_bytes = 0;
 		memset(&mdata->rx.airtime, 0, sizeof(mdata->rx.airtime));
 		memset(&mdata->tx.airtime, 0, sizeof(mdata->tx.airtime));
 	}
@@ -1592,7 +1695,8 @@ void iwl_mvm_recalc_tcm(struct iwl_mvm *mvm)
 {
 	unsigned long ts = jiffies;
 	bool handle_uapsd =
-		false;
+		time_after(ts, mvm->tcm.uapsd_nonagg_ts +
+			       msecs_to_jiffies(IWL_MVM_UAPSD_NONAGG_PERIOD));
 
 	spin_lock(&mvm->tcm.lock);
 	if (mvm->tcm.paused || !time_after(ts, mvm->tcm.ts + MVM_TCM_PERIOD)) {
@@ -1601,6 +1705,12 @@ void iwl_mvm_recalc_tcm(struct iwl_mvm *mvm)
 	}
 	spin_unlock(&mvm->tcm.lock);
 
+	if (handle_uapsd && iwl_mvm_has_new_rx_api(mvm)) {
+		mutex_lock(&mvm->mutex);
+		if (iwl_mvm_request_statistics(mvm, true))
+			handle_uapsd = false;
+		mutex_unlock(&mvm->mutex);
+	}
 
 	spin_lock(&mvm->tcm.lock);
 	/* re-check if somebody else won the recheck race */
@@ -1659,6 +1769,21 @@ void iwl_mvm_resume_tcm(struct iwl_mvm *mvm)
 	spin_unlock_bh(&mvm->tcm.lock);
 }
 
+void iwl_mvm_tcm_add_vif(struct iwl_mvm *mvm, struct ieee80211_vif *vif)
+{
+	struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif);
+
+	INIT_DELAYED_WORK(&mvmvif->uapsd_nonagg_detected_wk,
+			  iwl_mvm_tcm_uapsd_nonagg_detected_wk);
+}
+
+void iwl_mvm_tcm_rm_vif(struct iwl_mvm *mvm, struct ieee80211_vif *vif)
+{
+	struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif);
+
+	cancel_delayed_work_sync(&mvmvif->uapsd_nonagg_detected_wk);
+}
+
 
 void iwl_mvm_get_sync_time(struct iwl_mvm *mvm, u32 *gp2, u64 *boottime)
 {
-- 
cgit v1.2.3-59-g8ed1b


From b66b5817a0ce0e5653f4f60ab583c19cb5dab546 Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Tue, 31 Jan 2017 10:39:44 +0200
Subject: iwlwifi: mvm: detect low latency and traffic load per band

Detect low latency and traffic load per band.  Add support for
deciding on scan type and timings per band.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/fw.c    |   1 +
 drivers/net/wireless/intel/iwlwifi/mvm/mvm.h   |   8 ++
 drivers/net/wireless/intel/iwlwifi/mvm/scan.c  | 167 ++++++++++++++++++-------
 drivers/net/wireless/intel/iwlwifi/mvm/utils.c |  66 ++++++++--
 4 files changed, 192 insertions(+), 50 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c
index 3c59109bea20..4cf3e32e3dba 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c
@@ -1093,6 +1093,7 @@ int iwl_mvm_up(struct iwl_mvm *mvm)
 
 	if (fw_has_capa(&mvm->fw->ucode_capa, IWL_UCODE_TLV_CAPA_UMAC_SCAN)) {
 		mvm->scan_type = IWL_SCAN_TYPE_NOT_SET;
+		mvm->hb_scan_type = IWL_SCAN_TYPE_NOT_SET;
 		ret = iwl_mvm_config_scan(mvm);
 		if (ret)
 			goto error;
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
index acc36eb1ef39..2150b785387f 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
@@ -8,6 +8,7 @@
  * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH
  * Copyright(c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018        Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -35,6 +36,7 @@
  * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH
  * Copyright(c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018        Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -638,6 +640,7 @@ struct iwl_mvm_tcm {
 		u32 elapsed; /* milliseconds for this TCM period */
 		u32 airtime[NUM_MAC_INDEX_DRIVER];
 		enum iwl_mvm_traffic_load load[NUM_MAC_INDEX_DRIVER];
+		enum iwl_mvm_traffic_load band_load[NUM_NL80211_BANDS];
 		enum iwl_mvm_traffic_load global_load;
 		bool low_latency[NUM_MAC_INDEX_DRIVER];
 		bool change[NUM_MAC_INDEX_DRIVER];
@@ -879,7 +882,10 @@ struct iwl_mvm {
 	unsigned int scan_status;
 	void *scan_cmd;
 	struct iwl_mcast_filter_cmd *mcast_filter_cmd;
+	/* For CDB this is low band scan type, for non-CDB - type. */
 	enum iwl_mvm_scan_type scan_type;
+	enum iwl_mvm_scan_type hb_scan_type;
+
 	enum iwl_mvm_sched_scan_pass_all_states sched_scan_pass_all;
 	struct delayed_work scan_timeout_dwork;
 
@@ -1828,6 +1834,8 @@ int iwl_mvm_update_low_latency(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 			      enum iwl_mvm_low_latency_cause cause);
 /* get SystemLowLatencyMode - only needed for beacon threshold? */
 bool iwl_mvm_low_latency(struct iwl_mvm *mvm);
+bool iwl_mvm_low_latency_band(struct iwl_mvm *mvm, enum nl80211_band band);
+
 /* get VMACLowLatencyMode */
 static inline bool iwl_mvm_vif_low_latency(struct iwl_mvm_vif *mvmvif)
 {
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
index 8f3185b9d365..8fcd92dcaaa7 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
@@ -8,6 +8,7 @@
  * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH
  * Copyright(c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -19,9 +20,7 @@
  * General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110,
- * USA
+ * along with this program
  *
  * The full GNU General Public License is included in this distribution
  * in the file called COPYING.
@@ -117,7 +116,9 @@ static struct iwl_mvm_scan_timing_params scan_timing[] = {
 };
 
 struct iwl_mvm_scan_params {
+	/* For CDB this is low band scan type, for non-CDB - type. */
 	enum iwl_mvm_scan_type type;
+	enum iwl_mvm_scan_type hb_type;
 	u32 n_channels;
 	u16 delay;
 	int n_ssids;
@@ -231,12 +232,18 @@ static enum iwl_mvm_traffic_load iwl_mvm_get_traffic_load(struct iwl_mvm *mvm)
 	return mvm->tcm.result.global_load;
 }
 
+static enum iwl_mvm_traffic_load
+iwl_mvm_get_traffic_load_band(struct iwl_mvm *mvm, enum nl80211_band band)
+{
+	return mvm->tcm.result.band_load[band];
+}
+
 static enum
-iwl_mvm_scan_type iwl_mvm_get_scan_type(struct iwl_mvm *mvm, bool p2p_device)
+iwl_mvm_scan_type _iwl_mvm_get_scan_type(struct iwl_mvm *mvm, bool p2p_device,
+					 enum iwl_mvm_traffic_load load,
+					 bool low_latency)
 {
 	int global_cnt = 0;
-	enum iwl_mvm_traffic_load load;
-	bool low_latency;
 
 	ieee80211_iterate_active_interfaces_atomic(mvm->hw,
 					    IEEE80211_IFACE_ITER_NORMAL,
@@ -245,9 +252,6 @@ iwl_mvm_scan_type iwl_mvm_get_scan_type(struct iwl_mvm *mvm, bool p2p_device)
 	if (!global_cnt)
 		return IWL_SCAN_TYPE_UNASSOC;
 
-	load = iwl_mvm_get_traffic_load(mvm);
-	low_latency = iwl_mvm_low_latency(mvm);
-
 	if ((load == IWL_MVM_TRAFFIC_HIGH || low_latency) && !p2p_device &&
 	    fw_has_api(&mvm->fw->ucode_capa, IWL_UCODE_TLV_API_FRAGMENTED_SCAN))
 		return IWL_SCAN_TYPE_FRAGMENTED;
@@ -258,25 +262,57 @@ iwl_mvm_scan_type iwl_mvm_get_scan_type(struct iwl_mvm *mvm, bool p2p_device)
 	return IWL_SCAN_TYPE_WILD;
 }
 
+static enum
+iwl_mvm_scan_type iwl_mvm_get_scan_type(struct iwl_mvm *mvm, bool p2p_device)
+{
+	enum iwl_mvm_traffic_load load;
+	bool low_latency;
+
+	load = iwl_mvm_get_traffic_load(mvm);
+	low_latency = iwl_mvm_low_latency(mvm);
+
+	return _iwl_mvm_get_scan_type(mvm, p2p_device, load, low_latency);
+}
+
+static enum
+iwl_mvm_scan_type iwl_mvm_get_scan_type_band(struct iwl_mvm *mvm,
+					     bool p2p_device,
+					     enum nl80211_band band)
+{
+	enum iwl_mvm_traffic_load load;
+	bool low_latency;
+
+	load = iwl_mvm_get_traffic_load_band(mvm, band);
+	low_latency = iwl_mvm_low_latency_band(mvm, band);
+
+	return _iwl_mvm_get_scan_type(mvm, p2p_device, load, low_latency);
+}
+
 static int
 iwl_mvm_get_measurement_dwell(struct iwl_mvm *mvm,
 			      struct cfg80211_scan_request *req,
 			      struct iwl_mvm_scan_params *params)
 {
+	u32 duration = scan_timing[params->type].max_out_time;
+
 	if (!req->duration)
 		return 0;
 
-	if (req->duration_mandatory &&
-	    req->duration > scan_timing[params->type].max_out_time) {
+	if (iwl_mvm_is_cdb_supported(mvm)) {
+		u32 hb_time = scan_timing[params->hb_type].max_out_time;
+
+		duration = min_t(u32, duration, hb_time);
+	}
+
+	if (req->duration_mandatory && req->duration > duration) {
 		IWL_DEBUG_SCAN(mvm,
 			       "Measurement scan - too long dwell %hu (max out time %u)\n",
 			       req->duration,
-			       scan_timing[params->type].max_out_time);
+			       duration);
 		return -EOPNOTSUPP;
 	}
 
-	return min_t(u32, (u32)req->duration,
-		     scan_timing[params->type].max_out_time);
+	return min_t(u32, (u32)req->duration, duration);
 }
 
 static inline bool iwl_mvm_rrm_scan_needed(struct iwl_mvm *mvm)
@@ -1025,22 +1061,38 @@ static void iwl_mvm_fill_scan_config_v1(struct iwl_mvm *mvm, void *config,
 static void iwl_mvm_fill_scan_config(struct iwl_mvm *mvm, void *config,
 				     u32 flags, u8 channel_flags)
 {
-	enum iwl_mvm_scan_type type = iwl_mvm_get_scan_type(mvm, false);
 	struct iwl_scan_config *cfg = config;
 
 	cfg->flags = cpu_to_le32(flags);
 	cfg->tx_chains = cpu_to_le32(iwl_mvm_get_valid_tx_ant(mvm));
 	cfg->rx_chains = cpu_to_le32(iwl_mvm_scan_rx_ant(mvm));
 	cfg->legacy_rates = iwl_mvm_scan_config_rates(mvm);
-	cfg->out_of_channel_time[0] =
-		cpu_to_le32(scan_timing[type].max_out_time);
-	cfg->suspend_time[0] = cpu_to_le32(scan_timing[type].suspend_time);
 
 	if (iwl_mvm_is_cdb_supported(mvm)) {
-		cfg->suspend_time[1] =
-			cpu_to_le32(scan_timing[type].suspend_time);
-		cfg->out_of_channel_time[1] =
+		enum iwl_mvm_scan_type lb_type, hb_type;
+
+		lb_type = iwl_mvm_get_scan_type_band(mvm, false,
+						     NL80211_BAND_2GHZ);
+		hb_type = iwl_mvm_get_scan_type_band(mvm, false,
+						     NL80211_BAND_5GHZ);
+
+		cfg->out_of_channel_time[SCAN_LB_LMAC_IDX] =
+			cpu_to_le32(scan_timing[lb_type].max_out_time);
+		cfg->suspend_time[SCAN_LB_LMAC_IDX] =
+			cpu_to_le32(scan_timing[lb_type].suspend_time);
+
+		cfg->out_of_channel_time[SCAN_HB_LMAC_IDX] =
+			cpu_to_le32(scan_timing[hb_type].max_out_time);
+		cfg->suspend_time[SCAN_HB_LMAC_IDX] =
+			cpu_to_le32(scan_timing[hb_type].suspend_time);
+	} else {
+		enum iwl_mvm_scan_type type =
+			iwl_mvm_get_scan_type(mvm, false);
+
+		cfg->out_of_channel_time[SCAN_LB_LMAC_IDX] =
 			cpu_to_le32(scan_timing[type].max_out_time);
+		cfg->suspend_time[SCAN_LB_LMAC_IDX] =
+			cpu_to_le32(scan_timing[type].suspend_time);
 	}
 
 	iwl_mvm_fill_scan_dwell(mvm, &cfg->dwell);
@@ -1060,7 +1112,8 @@ int iwl_mvm_config_scan(struct iwl_mvm *mvm)
 	struct iwl_host_cmd cmd = {
 		.id = iwl_cmd_id(SCAN_CFG_CMD, IWL_ALWAYS_LONG_GROUP, 0),
 	};
-	enum iwl_mvm_scan_type type = iwl_mvm_get_scan_type(mvm, false);
+	enum iwl_mvm_scan_type type;
+	enum iwl_mvm_scan_type hb_type = IWL_SCAN_TYPE_NOT_SET;
 	int num_channels =
 		mvm->nvm_data->bands[NL80211_BAND_2GHZ].n_channels +
 		mvm->nvm_data->bands[NL80211_BAND_5GHZ].n_channels;
@@ -1070,8 +1123,18 @@ int iwl_mvm_config_scan(struct iwl_mvm *mvm)
 	if (WARN_ON(num_channels > mvm->fw->ucode_capa.n_scan_channels))
 		return -ENOBUFS;
 
-	if (type == mvm->scan_type)
-		return 0;
+	if (iwl_mvm_is_cdb_supported(mvm)) {
+		type = iwl_mvm_get_scan_type_band(mvm, false,
+						  NL80211_BAND_2GHZ);
+		hb_type = iwl_mvm_get_scan_type_band(mvm, false,
+						     NL80211_BAND_5GHZ);
+		if (type == mvm->scan_type && hb_type == mvm->hb_scan_type)
+			return 0;
+	} else {
+		type = iwl_mvm_get_scan_type(mvm, false);
+		if (type == mvm->scan_type)
+			return 0;
+	}
 
 	if (iwl_mvm_has_new_tx_api(mvm))
 		cmd_size = sizeof(struct iwl_scan_config);
@@ -1102,10 +1165,15 @@ int iwl_mvm_config_scan(struct iwl_mvm *mvm)
 			IWL_CHANNEL_FLAG_EBS_ADD |
 			IWL_CHANNEL_FLAG_PRE_SCAN_PASSIVE2ACTIVE;
 
+	/*
+	 * Check for fragmented scan on LMAC2 - high band.
+	 * LMAC1 - low band is checked above.
+	 */
 	if (iwl_mvm_has_new_tx_api(mvm)) {
-		flags |= (type == IWL_SCAN_TYPE_FRAGMENTED) ?
-			 SCAN_CONFIG_FLAG_SET_LMAC2_FRAGMENTED :
-			 SCAN_CONFIG_FLAG_CLEAR_LMAC2_FRAGMENTED;
+		if (iwl_mvm_is_cdb_supported(mvm))
+			flags |= (hb_type == IWL_SCAN_TYPE_FRAGMENTED) ?
+				 SCAN_CONFIG_FLAG_SET_LMAC2_FRAGMENTED :
+				 SCAN_CONFIG_FLAG_CLEAR_LMAC2_FRAGMENTED;
 		iwl_mvm_fill_scan_config(mvm, cfg, flags, channel_flags);
 	} else {
 		iwl_mvm_fill_scan_config_v1(mvm, cfg, flags, channel_flags);
@@ -1118,8 +1186,10 @@ int iwl_mvm_config_scan(struct iwl_mvm *mvm)
 	IWL_DEBUG_SCAN(mvm, "Sending UMAC scan config\n");
 
 	ret = iwl_mvm_send_cmd(mvm, &cmd);
-	if (!ret)
+	if (!ret) {
 		mvm->scan_type = type;
+		mvm->hb_scan_type = hb_type;
+	}
 
 	kfree(cfg);
 	return ret;
@@ -1173,7 +1243,7 @@ static void iwl_mvm_scan_umac_dwell(struct iwl_mvm *mvm,
 			cpu_to_le32(timing->suspend_time);
 
 		if (iwl_mvm_is_cdb_supported(mvm)) {
-			hb_timing = &scan_timing[params->type];
+			hb_timing = &scan_timing[params->hb_type];
 
 			cmd->v7.max_out_time[SCAN_HB_LMAC_IDX] =
 				cpu_to_le32(hb_timing->max_out_time);
@@ -1203,7 +1273,7 @@ static void iwl_mvm_scan_umac_dwell(struct iwl_mvm *mvm,
 		cmd->v1.fragmented_dwell = IWL_SCAN_DWELL_FRAGMENTED;
 
 		if (iwl_mvm_is_cdb_supported(mvm)) {
-			hb_timing = &scan_timing[params->type];
+			hb_timing = &scan_timing[params->hb_type];
 
 			cmd->v6.max_out_time[SCAN_HB_LMAC_IDX] =
 					cpu_to_le32(hb_timing->max_out_time);
@@ -1212,8 +1282,7 @@ static void iwl_mvm_scan_umac_dwell(struct iwl_mvm *mvm,
 		}
 
 		if (iwl_mvm_has_new_tx_api(mvm)) {
-			cmd->v6.scan_priority =
-				cpu_to_le32(IWL_SCAN_PRIORITY_EXT_6);
+			cmd->v6.scan_priority = cpu_to_le32(IWL_SCAN_PRIORITY_EXT_6);
 			cmd->v6.max_out_time[SCAN_LB_LMAC_IDX] =
 				cpu_to_le32(timing->max_out_time);
 			cmd->v6.suspend_time[SCAN_LB_LMAC_IDX] =
@@ -1257,11 +1326,12 @@ static u16 iwl_mvm_scan_umac_flags(struct iwl_mvm *mvm,
 	if (params->n_ssids == 1 && params->ssids[0].ssid_len != 0)
 		flags |= IWL_UMAC_SCAN_GEN_FLAGS_PRE_CONNECT;
 
-	if (params->type == IWL_SCAN_TYPE_FRAGMENTED) {
+	if (params->type == IWL_SCAN_TYPE_FRAGMENTED)
 		flags |= IWL_UMAC_SCAN_GEN_FLAGS_FRAGMENTED;
-		if (iwl_mvm_is_cdb_supported(mvm))
-			flags |= IWL_UMAC_SCAN_GEN_FLAGS_LMAC2_FRAGMENTED;
-	}
+
+	if (iwl_mvm_is_cdb_supported(mvm) &&
+	    params->hb_type == IWL_SCAN_TYPE_FRAGMENTED)
+		flags |= IWL_UMAC_SCAN_GEN_FLAGS_LMAC2_FRAGMENTED;
 
 	if (iwl_mvm_rrm_scan_needed(mvm) &&
 	    fw_has_capa(&mvm->fw->ucode_capa,
@@ -1492,6 +1562,21 @@ void iwl_mvm_scan_timeout_wk(struct work_struct *work)
 	iwl_force_nmi(mvm->trans);
 }
 
+static void iwl_mvm_fill_scan_type(struct iwl_mvm *mvm,
+				   struct iwl_mvm_scan_params *params,
+				   bool p2p)
+{
+	if (iwl_mvm_is_cdb_supported(mvm)) {
+		params->type =
+			iwl_mvm_get_scan_type_band(mvm, p2p,
+						   NL80211_BAND_2GHZ);
+		params->hb_type =
+			iwl_mvm_get_scan_type_band(mvm, p2p,
+						   NL80211_BAND_5GHZ);
+	} else {
+		params->type = iwl_mvm_get_scan_type(mvm, p2p);
+	}
+}
 int iwl_mvm_reg_scan_start(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 			   struct cfg80211_scan_request *req,
 			   struct ieee80211_scan_ies *ies)
@@ -1539,9 +1624,8 @@ int iwl_mvm_reg_scan_start(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 	params.scan_plans = &scan_plan;
 	params.n_scan_plans = 1;
 
-	params.type =
-		iwl_mvm_get_scan_type(mvm,
-				      vif->type == NL80211_IFTYPE_P2P_DEVICE);
+	iwl_mvm_fill_scan_type(mvm, &params,
+			       vif->type == NL80211_IFTYPE_P2P_DEVICE);
 
 	ret = iwl_mvm_get_measurement_dwell(mvm, req, &params);
 	if (ret < 0)
@@ -1636,9 +1720,8 @@ int iwl_mvm_sched_scan_start(struct iwl_mvm *mvm,
 	params.n_scan_plans = req->n_scan_plans;
 	params.scan_plans = req->scan_plans;
 
-	params.type =
-		iwl_mvm_get_scan_type(mvm,
-				      vif->type == NL80211_IFTYPE_P2P_DEVICE);
+	iwl_mvm_fill_scan_type(mvm, &params,
+			       vif->type == NL80211_IFTYPE_P2P_DEVICE);
 
 	/* In theory, LMAC scans can handle a 32-bit delay, but since
 	 * waiting for over 18 hours to start the scan is a bit silly
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
index ea0332a2389e..815d2d141fd8 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
@@ -1074,23 +1074,48 @@ int iwl_mvm_update_low_latency(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 	return iwl_mvm_power_update_mac(mvm);
 }
 
+struct iwl_mvm_low_latency_iter {
+	bool result;
+	bool result_per_band[NUM_NL80211_BANDS];
+};
+
 static void iwl_mvm_ll_iter(void *_data, u8 *mac, struct ieee80211_vif *vif)
 {
-	bool *result = _data;
+	struct iwl_mvm_low_latency_iter *result = _data;
+	struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif);
+	enum nl80211_band band;
+
+	if (iwl_mvm_vif_low_latency(mvmvif)) {
+		result->result = true;
+
+		if (!mvmvif->phy_ctxt)
+			return;
 
-	if (iwl_mvm_vif_low_latency(iwl_mvm_vif_from_mac80211(vif)))
-		*result = true;
+		band = mvmvif->phy_ctxt->channel->band;
+		result->result_per_band[band] = true;
+	}
 }
 
 bool iwl_mvm_low_latency(struct iwl_mvm *mvm)
 {
-	bool result = false;
+	struct iwl_mvm_low_latency_iter data = {};
 
 	ieee80211_iterate_active_interfaces_atomic(
 			mvm->hw, IEEE80211_IFACE_ITER_NORMAL,
-			iwl_mvm_ll_iter, &result);
+			iwl_mvm_ll_iter, &data);
 
-	return result;
+	return data.result;
+}
+
+bool iwl_mvm_low_latency_band(struct iwl_mvm *mvm, enum nl80211_band band)
+{
+	struct iwl_mvm_low_latency_iter data = {};
+
+	ieee80211_iterate_active_interfaces_atomic(
+			mvm->hw, IEEE80211_IFACE_ITER_NORMAL,
+			iwl_mvm_ll_iter, &data);
+
+	return data.result_per_band[band];
 }
 
 struct iwl_bss_iter_data {
@@ -1599,6 +1624,18 @@ static void iwl_mvm_check_uapsd_agg_expected_tpt(struct iwl_mvm *mvm,
 		iwl_mvm_uapsd_agg_disconnect_iter, &mac);
 }
 
+static void iwl_mvm_tcm_iterator(void *_data, u8 *mac,
+				 struct ieee80211_vif *vif)
+{
+	struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif);
+	u32 *band = _data;
+
+	if (!mvmvif->phy_ctxt)
+		return;
+
+	band[mvmvif->id] = mvmvif->phy_ctxt->channel->band;
+}
+
 static unsigned long iwl_mvm_calc_tcm_stats(struct iwl_mvm *mvm,
 					    unsigned long ts,
 					    bool handle_uapsd)
@@ -1607,9 +1644,11 @@ static unsigned long iwl_mvm_calc_tcm_stats(struct iwl_mvm *mvm,
 	unsigned int uapsd_elapsed =
 		jiffies_to_msecs(ts - mvm->tcm.uapsd_nonagg_ts);
 	u32 total_airtime = 0;
-	int ac, mac;
+	u32 band_airtime[NUM_NL80211_BANDS] = {0};
+	u32 band[NUM_MAC_INDEX_DRIVER] = {0};
+	int ac, mac, i;
 	bool low_latency = false;
-	enum iwl_mvm_traffic_load load;
+	enum iwl_mvm_traffic_load load, band_load;
 	bool handle_ll = time_after(ts, mvm->tcm.ll_ts + MVM_LL_PERIOD);
 
 	if (handle_ll)
@@ -1619,12 +1658,18 @@ static unsigned long iwl_mvm_calc_tcm_stats(struct iwl_mvm *mvm,
 
 	mvm->tcm.result.elapsed = elapsed;
 
+	ieee80211_iterate_active_interfaces_atomic(mvm->hw,
+						   IEEE80211_IFACE_ITER_NORMAL,
+						   iwl_mvm_tcm_iterator,
+						   &band);
+
 	for (mac = 0; mac < NUM_MAC_INDEX_DRIVER; mac++) {
 		struct iwl_mvm_tcm_mac *mdata = &mvm->tcm.data[mac];
 		u32 vo_vi_pkts = 0;
 		u32 airtime = mdata->rx.airtime + mdata->tx.airtime;
 
 		total_airtime += airtime;
+		band_airtime[band[mac]] += airtime;
 
 		load = iwl_mvm_tcm_load(mvm, airtime, elapsed);
 		mvm->tcm.result.change[mac] = load != mvm->tcm.result.load[mac];
@@ -1662,6 +1707,11 @@ static unsigned long iwl_mvm_calc_tcm_stats(struct iwl_mvm *mvm,
 	mvm->tcm.result.global_change = load != mvm->tcm.result.global_load;
 	mvm->tcm.result.global_load = load;
 
+	for (i = 0; i < NUM_NL80211_BANDS; i++) {
+		band_load = iwl_mvm_tcm_load(mvm, band_airtime[i], elapsed);
+		mvm->tcm.result.band_load[i] = band_load;
+	}
+
 	/*
 	 * If the current load isn't low we need to force re-evaluation
 	 * in the TCM period, so that we can return to low load if there
-- 
cgit v1.2.3-59-g8ed1b


From 622111a2d21baca39bb8d5cf56c2d509e50fe278 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 8 Jun 2017 09:07:11 +0200
Subject: iwlwifi: mvm: clean up scan capability checks

Introduce and use iwl_mvm_cdb_scan_api(), which checks the family.
Most of this will go away once the 22000 firmware supports adaptive
dwell, after which the V6 scan API won't be used, but the V3 scan
*config* API will still need to be distinguished.

In any case, this gets rid of the completely bogus has_new_tx_api()
checks.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/mvm.h  | 10 ++++++++++
 drivers/net/wireless/intel/iwlwifi/mvm/scan.c | 15 ++++++++-------
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
index 2150b785387f..2411cc91f833 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
@@ -1356,6 +1356,16 @@ static inline bool iwl_mvm_is_cdb_supported(struct iwl_mvm *mvm)
 			   IWL_UCODE_TLV_CAPA_CDB_SUPPORT);
 }
 
+static inline bool iwl_mvm_cdb_scan_api(struct iwl_mvm *mvm)
+{
+	/*
+	 * TODO: should this be the same as iwl_mvm_is_cdb_supported()?
+	 * but then there's a little bit of code in scan that won't make
+	 * any sense...
+	 */
+	return mvm->trans->cfg->device_family >= IWL_DEVICE_FAMILY_22000;
+}
+
 static inline bool iwl_mvm_has_new_rx_stats_api(struct iwl_mvm *mvm)
 {
 	return fw_has_api(&mvm->fw->ucode_capa,
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
index 8fcd92dcaaa7..8c5f4f96d6d4 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
@@ -147,7 +147,7 @@ static inline void *iwl_mvm_get_scan_req_umac_data(struct iwl_mvm *mvm)
 	if (iwl_mvm_is_adaptive_dwell_supported(mvm))
 		return (void *)&cmd->v7.data;
 
-	if (iwl_mvm_has_new_tx_api(mvm))
+	if (iwl_mvm_cdb_scan_api(mvm))
 		return (void *)&cmd->v6.data;
 
 	return (void *)&cmd->v1.data;
@@ -164,7 +164,7 @@ iwl_mvm_get_scan_req_umac_channel(struct iwl_mvm *mvm)
 	if (iwl_mvm_is_adaptive_dwell_supported(mvm))
 		return &cmd->v7.channel;
 
-	if (iwl_mvm_has_new_tx_api(mvm))
+	if (iwl_mvm_cdb_scan_api(mvm))
 		return &cmd->v6.channel;
 
 	return &cmd->v1.channel;
@@ -1136,7 +1136,7 @@ int iwl_mvm_config_scan(struct iwl_mvm *mvm)
 			return 0;
 	}
 
-	if (iwl_mvm_has_new_tx_api(mvm))
+	if (iwl_mvm_cdb_scan_api(mvm))
 		cmd_size = sizeof(struct iwl_scan_config);
 	else
 		cmd_size = sizeof(struct iwl_scan_config_v1);
@@ -1169,7 +1169,7 @@ int iwl_mvm_config_scan(struct iwl_mvm *mvm)
 	 * Check for fragmented scan on LMAC2 - high band.
 	 * LMAC1 - low band is checked above.
 	 */
-	if (iwl_mvm_has_new_tx_api(mvm)) {
+	if (iwl_mvm_cdb_scan_api(mvm)) {
 		if (iwl_mvm_is_cdb_supported(mvm))
 			flags |= (hb_type == IWL_SCAN_TYPE_FRAGMENTED) ?
 				 SCAN_CONFIG_FLAG_SET_LMAC2_FRAGMENTED :
@@ -1281,8 +1281,9 @@ static void iwl_mvm_scan_umac_dwell(struct iwl_mvm *mvm,
 					cpu_to_le32(hb_timing->suspend_time);
 		}
 
-		if (iwl_mvm_has_new_tx_api(mvm)) {
-			cmd->v6.scan_priority = cpu_to_le32(IWL_SCAN_PRIORITY_EXT_6);
+		if (iwl_mvm_cdb_scan_api(mvm)) {
+			cmd->v6.scan_priority =
+				cpu_to_le32(IWL_SCAN_PRIORITY_EXT_6);
 			cmd->v6.max_out_time[SCAN_LB_LMAC_IDX] =
 				cpu_to_le32(timing->max_out_time);
 			cmd->v6.suspend_time[SCAN_LB_LMAC_IDX] =
@@ -1909,7 +1910,7 @@ int iwl_mvm_scan_size(struct iwl_mvm *mvm)
 		base_size = IWL_SCAN_REQ_UMAC_SIZE_V8;
 	else if (iwl_mvm_is_adaptive_dwell_supported(mvm))
 		base_size = IWL_SCAN_REQ_UMAC_SIZE_V7;
-	else if (iwl_mvm_has_new_tx_api(mvm))
+	else if (iwl_mvm_cdb_scan_api(mvm))
 		base_size = IWL_SCAN_REQ_UMAC_SIZE_V6;
 
 	if (fw_has_capa(&mvm->fw->ucode_capa, IWL_UCODE_TLV_CAPA_UMAC_SCAN))
-- 
cgit v1.2.3-59-g8ed1b


From cf58c9e0915aad684aa8e9b82afd260c67ee7b22 Mon Sep 17 00:00:00 2001
From: Luca Coelho <luciano.coelho@intel.com>
Date: Fri, 13 Apr 2018 13:17:16 +0300
Subject: iwlwifi: mvm: fix OOC priority in scans

The code that sets the correct out-of-channel priority depending on
the scan type was accidentally removed during a rebase.  Add it back.

Fixes: c1a7515393e4 ("iwlwifi: mvm: add adaptive dwell support")
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/scan.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
index 8c5f4f96d6d4..4b3753d78d03 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
@@ -1297,6 +1297,11 @@ static void iwl_mvm_scan_umac_dwell(struct iwl_mvm *mvm,
 				cpu_to_le32(timing->suspend_time);
 		}
 	}
+
+	if (iwl_mvm_is_regular_scan(params))
+		cmd->ooc_priority = cpu_to_le32(IWL_SCAN_PRIORITY_EXT_6);
+	else
+		cmd->ooc_priority = cpu_to_le32(IWL_SCAN_PRIORITY_EXT_2);
 }
 
 static void
-- 
cgit v1.2.3-59-g8ed1b


From 84226ca1c5d34e6a8492d12848e9cdf7752b834b Mon Sep 17 00:00:00 2001
From: Gregory Greenman <gregory.greenman@intel.com>
Date: Thu, 2 Nov 2017 04:07:52 +0200
Subject: iwlwifi: mvm: support offload of AMSDU rate control

Support the new APIs and activate AMSDU based on the
offloaded TLC decisions.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 .../net/wireless/intel/iwlwifi/fw/api/datapath.h   |  5 +++
 drivers/net/wireless/intel/iwlwifi/fw/api/rs.h     | 18 ++++++++--
 drivers/net/wireless/intel/iwlwifi/mvm/ops.c       |  8 +++--
 drivers/net/wireless/intel/iwlwifi/mvm/rs-fw.c     | 42 +++++++++++++++++++++-
 drivers/net/wireless/intel/iwlwifi/mvm/rs.c        | 15 +++++---
 drivers/net/wireless/intel/iwlwifi/mvm/rs.h        |  5 ++-
 drivers/net/wireless/intel/iwlwifi/mvm/sta.h       |  7 ++--
 drivers/net/wireless/intel/iwlwifi/mvm/tx.c        | 11 ++++--
 8 files changed, 96 insertions(+), 15 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/fw/api/datapath.h b/drivers/net/wireless/intel/iwlwifi/fw/api/datapath.h
index a57c7223df0f..184cee98c359 100644
--- a/drivers/net/wireless/intel/iwlwifi/fw/api/datapath.h
+++ b/drivers/net/wireless/intel/iwlwifi/fw/api/datapath.h
@@ -92,6 +92,11 @@ enum iwl_data_path_subcmd_ids {
 	 */
 	TLC_MNG_NOTIF_REQ_CMD = 0x10,
 
+	/**
+	 * @TLC_MNG_AMSDU_ENABLE_NOTIF: &struct iwl_tlc_amsdu_notif
+	 */
+	TLC_MNG_AMSDU_ENABLE_NOTIF = 0xF6,
+
 	/**
 	 * @TLC_MNG_UPDATE_NOTIF: &struct iwl_tlc_update_notif
 	 */
diff --git a/drivers/net/wireless/intel/iwlwifi/fw/api/rs.h b/drivers/net/wireless/intel/iwlwifi/fw/api/rs.h
index e49a6f7be613..1a8f3154a1fe 100644
--- a/drivers/net/wireless/intel/iwlwifi/fw/api/rs.h
+++ b/drivers/net/wireless/intel/iwlwifi/fw/api/rs.h
@@ -205,7 +205,7 @@ enum iwl_tlc_mng_ht_rates {
  * @non_ht_supp_rates: bitmap of supported legacy rates
  * @ht_supp_rates: bitmap of supported HT/VHT rates, valid bits are 0-9
  * @mode: &enum iwl_tlc_mng_cfg_mode
- * @reserved2: reserved
+ * @amsdu: TX amsdu is supported
  * @he_supp_rates: bitmap of supported HE rates
  * @sgi_ch_width_supp: bitmap of SGI support per channel width
  * @he_gi_support: 11ax HE guard interval
@@ -222,13 +222,27 @@ struct iwl_tlc_config_cmd {
 	__le16 non_ht_supp_rates;
 	__le16 ht_supp_rates[MAX_RS_ANT_NUM];
 	u8 mode;
-	u8 reserved2;
+	u8 amsdu;
 	__le16 he_supp_rates;
 	u8 sgi_ch_width_supp;
 	u8 he_gi_support;
 	__le32 max_ampdu_cnt;
 } __packed; /* TLC_MNG_CONFIG_CMD_API_S_VER_1 */
 
+/**
+ * struct iwl_tlc_amsdu_notif - TLC AMSDU configuration
+ * @sta_id: station id
+ * @reserved: reserved
+ * @amsdu_size: Max AMSDU size, in bytes
+ * @amsdu_enabled: bitmap for per-TID AMSDU enablement
+ */
+struct iwl_tlc_amsdu_notif {
+	u8 sta_id;
+	u8 reserved[3];
+	__le16 amsdu_size;
+	__le16 amsdu_enabled;
+} __packed; /* TLC_MNG_AMSDU_ENABLE_NTFY_API_S_VER_1 */
+
 #define IWL_TLC_NOTIF_INIT_RATE_POS 0
 #define IWL_TLC_NOTIF_INIT_RATE_MSK BIT(IWL_TLC_NOTIF_INIT_RATE_POS)
 #define IWL_TLC_NOTIF_REQ_INTERVAL (500)
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c
index 6a3f557543e4..1ef4ac21f32a 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c
@@ -250,6 +250,9 @@ static const struct iwl_rx_handlers iwl_mvm_rx_handlers[] = {
 	RX_HANDLER(TX_CMD, iwl_mvm_rx_tx_cmd, RX_HANDLER_SYNC),
 	RX_HANDLER(BA_NOTIF, iwl_mvm_rx_ba_notif, RX_HANDLER_SYNC),
 
+	RX_HANDLER_GRP(DATA_PATH_GROUP, TLC_MNG_UPDATE_NOTIF,
+		       iwl_mvm_tlc_update_notif, RX_HANDLER_SYNC),
+
 	RX_HANDLER(BT_PROFILE_NOTIFICATION, iwl_mvm_rx_bt_coex_notif,
 		   RX_HANDLER_ASYNC_LOCKED),
 	RX_HANDLER(BEACON_NOTIFICATION, iwl_mvm_rx_beacon_notif,
@@ -309,6 +312,8 @@ static const struct iwl_rx_handlers iwl_mvm_rx_handlers[] = {
 		       iwl_mvm_mu_mimo_grp_notif, RX_HANDLER_SYNC),
 	RX_HANDLER_GRP(DATA_PATH_GROUP, STA_PM_NOTIF,
 		       iwl_mvm_sta_pm_notif, RX_HANDLER_SYNC),
+	RX_HANDLER_GRP(DATA_PATH_GROUP, TLC_MNG_AMSDU_ENABLE_NOTIF,
+		       iwl_mvm_tlc_amsdu_notif, RX_HANDLER_SYNC),
 };
 #undef RX_HANDLER
 #undef RX_HANDLER_GRP
@@ -445,6 +450,7 @@ static const struct iwl_hcmd_names iwl_mvm_data_path_names[] = {
 	HCMD_NAME(DQA_ENABLE_CMD),
 	HCMD_NAME(UPDATE_MU_GROUPS_CMD),
 	HCMD_NAME(TRIGGER_RX_QUEUES_NOTIF_CMD),
+	HCMD_NAME(TLC_MNG_AMSDU_ENABLE_NOTIF),
 	HCMD_NAME(STA_PM_NOTIF),
 	HCMD_NAME(MU_GROUP_MGMT_NOTIF),
 	HCMD_NAME(RX_QUEUES_NOTIFICATION),
@@ -1034,8 +1040,6 @@ static void iwl_mvm_rx_mq(struct iwl_op_mode *op_mode,
 		iwl_mvm_rx_queue_notif(mvm, rxb, 0);
 	else if (cmd == WIDE_ID(LEGACY_GROUP, FRAME_RELEASE))
 		iwl_mvm_rx_frame_release(mvm, napi, rxb, 0);
-	else if (cmd == WIDE_ID(DATA_PATH_GROUP, TLC_MNG_UPDATE_NOTIF))
-		iwl_mvm_tlc_update_notif(mvm, pkt);
 	else
 		iwl_mvm_rx_common(mvm, rxb, pkt);
 }
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs-fw.c b/drivers/net/wireless/intel/iwlwifi/mvm/rs-fw.c
index fb5745660509..4e818bce469b 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rs-fw.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs-fw.c
@@ -228,8 +228,47 @@ static void rs_fw_tlc_mng_notif_req_config(struct iwl_mvm *mvm, u8 sta_id)
 		IWL_ERR(mvm, "Failed to send TLC notif request (%d)\n", ret);
 }
 
-void iwl_mvm_tlc_update_notif(struct iwl_mvm *mvm, struct iwl_rx_packet *pkt)
+void iwl_mvm_tlc_amsdu_notif(struct iwl_mvm *mvm, struct iwl_rx_cmd_buffer *rxb)
 {
+	struct iwl_rx_packet *pkt = rxb_addr(rxb);
+	struct iwl_tlc_amsdu_notif *notif;
+	struct ieee80211_sta *sta;
+	struct iwl_mvm_sta *mvmsta;
+	u16 size;
+
+	notif = (void *)pkt->data;
+
+	if (WARN_ON(notif->sta_id >= ARRAY_SIZE(mvm->fw_id_to_mac_id)))
+		return;
+
+	rcu_read_lock();
+
+	sta = rcu_dereference(mvm->fw_id_to_mac_id[notif->sta_id]);
+	if (IS_ERR_OR_NULL(sta)) {
+		rcu_read_unlock();
+		IWL_ERR(mvm, "Invalid sta id (%d) in FW TLC notification\n",
+			notif->sta_id);
+		return;
+	}
+
+	mvmsta = iwl_mvm_sta_from_mac80211(sta);
+
+	size = min(le16_to_cpu(notif->amsdu_size), sta->max_amsdu_len);
+	mvmsta->amsdu_enabled = le16_to_cpu(notif->amsdu_enabled);
+	mvmsta->max_amsdu_len = size;
+
+	IWL_DEBUG_RATE(mvm,
+		       "AMSDU notification. AMSDU size: %d, AMSDU selected size: %d, AMSDU TID bitmap 0x%X\n",
+		       le16_to_cpu(notif->amsdu_size), size,
+		       mvmsta->amsdu_enabled);
+
+	rcu_read_unlock();
+};
+
+void iwl_mvm_tlc_update_notif(struct iwl_mvm *mvm,
+			      struct iwl_rx_cmd_buffer *rxb)
+{
+	struct iwl_rx_packet *pkt = rxb_addr(rxb);
 	struct iwl_tlc_update_notif *notif;
 	struct iwl_mvm_sta *mvmsta;
 	struct iwl_lq_sta_rs_fw *lq_sta;
@@ -273,6 +312,7 @@ void rs_fw_rate_init(struct iwl_mvm *mvm, struct ieee80211_sta *sta,
 		.max_supp_ss = sta->rx_nss,
 		.max_ampdu_cnt = cpu_to_le32(mvmsta->max_agg_bufsize),
 		.sgi_ch_width_supp = rs_fw_sgi_cw_support(sta),
+		.amsdu = iwl_mvm_is_csum_supported(mvm),
 	};
 	int ret;
 
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
index 5d776ec1840f..af7bc3b7b0c3 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
@@ -1715,12 +1715,18 @@ static void rs_set_amsdu_len(struct iwl_mvm *mvm, struct ieee80211_sta *sta,
 {
 	struct iwl_mvm_sta *mvmsta = iwl_mvm_sta_from_mac80211(sta);
 
+	/*
+	 * In case TLC offload is not active amsdu_enabled is either 0xFFFF
+	 * or 0, since there is no per-TID alg.
+	 */
 	if ((!is_vht(&tbl->rate) && !is_ht(&tbl->rate)) ||
 	    tbl->rate.index < IWL_RATE_MCS_5_INDEX ||
 	    scale_action == RS_ACTION_DOWNSCALE)
-		mvmsta->tlc_amsdu = false;
+		mvmsta->amsdu_enabled = 0;
 	else
-		mvmsta->tlc_amsdu = true;
+		mvmsta->amsdu_enabled = 0xFFFF;
+
+	mvmsta->max_amsdu_len = sta->max_amsdu_len;
 }
 
 /*
@@ -3134,7 +3140,8 @@ static void rs_drv_rate_init(struct iwl_mvm *mvm, struct ieee80211_sta *sta,
 	sband = hw->wiphy->bands[band];
 
 	lq_sta->lq.sta_id = mvmsta->sta_id;
-	mvmsta->tlc_amsdu = false;
+	mvmsta->amsdu_enabled = 0;
+	mvmsta->max_amsdu_len = sta->max_amsdu_len;
 
 	for (j = 0; j < LQ_SIZE; j++)
 		rs_rate_scale_clear_tbl_windows(mvm, &lq_sta->lq_info[j]);
@@ -3744,7 +3751,7 @@ static ssize_t rs_sta_dbgfs_scale_table_read(struct file *file,
 				(rate->sgi) ? "SGI" : "NGI",
 				(rate->ldpc) ? "LDPC" : "BCC",
 				(lq_sta->is_agg) ? "AGG on" : "",
-				(mvmsta->tlc_amsdu) ? "AMSDU on" : "");
+				(mvmsta->amsdu_enabled) ? "AMSDU on" : "");
 	}
 	desc += scnprintf(buff + desc, bufsz - desc, "last tx rate=0x%X\n",
 			lq_sta->last_rate_n_flags);
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs.h b/drivers/net/wireless/intel/iwlwifi/mvm/rs.h
index fb18cb8c233d..736ac9642921 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rs.h
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs.h
@@ -454,5 +454,8 @@ void rs_fw_rate_init(struct iwl_mvm *mvm, struct ieee80211_sta *sta,
 		     enum nl80211_band band);
 int rs_fw_tx_protection(struct iwl_mvm *mvm, struct iwl_mvm_sta *mvmsta,
 			bool enable);
-void iwl_mvm_tlc_update_notif(struct iwl_mvm *mvm, struct iwl_rx_packet *pkt);
+void iwl_mvm_tlc_update_notif(struct iwl_mvm *mvm,
+			      struct iwl_rx_cmd_buffer *rxb);
+void iwl_mvm_tlc_amsdu_notif(struct iwl_mvm *mvm,
+			     struct iwl_rx_cmd_buffer *rxb);
 #endif /* __rs__ */
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.h b/drivers/net/wireless/intel/iwlwifi/mvm/sta.h
index 5ffd6adbc383..60502c81dfce 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.h
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.h
@@ -391,7 +391,9 @@ struct iwl_mvm_rxq_dup_data {
  * @tx_protection: reference counter for controlling the Tx protection.
  * @tt_tx_protection: is thermal throttling enable Tx protection?
  * @disable_tx: is tx to this STA disabled?
- * @tlc_amsdu: true if A-MSDU is allowed
+ * @amsdu_enabled: bitmap of TX AMSDU allowed TIDs.
+ *	In case TLC offload is not active it is either 0xFFFF or 0.
+ * @max_amsdu_len: max AMSDU length
  * @agg_tids: bitmap of tids whose status is operational aggregated (IWL_AGG_ON)
  * @sleep_tx_count: the number of frames that we told the firmware to let out
  *	even when that station is asleep. This is useful in case the queue
@@ -436,7 +438,8 @@ struct iwl_mvm_sta {
 	bool tt_tx_protection;
 
 	bool disable_tx;
-	bool tlc_amsdu;
+	u16 amsdu_enabled;
+	u16 max_amsdu_len;
 	bool sleeping;
 	bool associated;
 	u8 agg_tids;
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
index f06a0ee7be28..90381348697c 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
@@ -774,9 +774,9 @@ static int iwl_mvm_tx_tso(struct iwl_mvm *mvm, struct sk_buff *skb,
 
 	dbg_max_amsdu_len = READ_ONCE(mvm->max_amsdu_len);
 
-	if (!sta->max_amsdu_len ||
+	if (!mvmsta->max_amsdu_len ||
 	    !ieee80211_is_data_qos(hdr->frame_control) ||
-	    (!mvmsta->tlc_amsdu && !dbg_max_amsdu_len))
+	    (!mvmsta->amsdu_enabled && !dbg_max_amsdu_len))
 		return iwl_mvm_tx_tso_segment(skb, 1, netdev_flags, mpdus_skb);
 
 	/*
@@ -803,7 +803,12 @@ static int iwl_mvm_tx_tso(struct iwl_mvm *mvm, struct sk_buff *skb,
 	    !mvmsta->tid_data[tid].amsdu_in_ampdu_allowed)
 		return iwl_mvm_tx_tso_segment(skb, 1, netdev_flags, mpdus_skb);
 
-	max_amsdu_len = sta->max_amsdu_len;
+	if (iwl_mvm_vif_low_latency(iwl_mvm_vif_from_mac80211(mvmsta->vif)) ||
+	    tid_to_mac80211_ac[tid] < IEEE80211_AC_BE ||
+	    !(mvmsta->amsdu_enabled & BIT(tid)))
+		return iwl_mvm_tx_tso_segment(skb, 1, netdev_flags, mpdus_skb);
+
+	max_amsdu_len = mvmsta->max_amsdu_len;
 
 	/* the Tx FIFO to which this A-MSDU will be routed */
 	txf = iwl_mvm_mac_ac_to_tx_fifo(mvm, tid_to_mac80211_ac[tid]);
-- 
cgit v1.2.3-59-g8ed1b


From f79b8f9dc77238f03d7fffd8f34976cfc7b076b5 Mon Sep 17 00:00:00 2001
From: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Date: Mon, 25 Dec 2017 15:15:06 +0200
Subject: iwlwifi: pcie: implement the overlow queue for Gen2 devices

When we enable TSO, we can have a lot of packets in the
operation mode that will be pushed to the transport
no matter what is the queue's fullness state.

To cope with that the transport can buffer those packets
and add them to the ring later when there is more room.
This implementation was missing in the Gen2 devices'
code.

Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c | 27 ++++++++++++++++++++---
 drivers/net/wireless/intel/iwlwifi/pcie/tx.c      |  2 +-
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c
index fabae0f60683..b605b364ed3f 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c
@@ -488,6 +488,23 @@ int iwl_trans_pcie_gen2_tx(struct iwl_trans *trans, struct sk_buff *skb,
 
 	spin_lock(&txq->lock);
 
+	if (iwl_queue_space(txq) < txq->high_mark) {
+		iwl_stop_queue(trans, txq);
+
+		/* don't put the packet on the ring, if there is no room */
+		if (unlikely(iwl_queue_space(txq) < 3)) {
+			struct iwl_device_cmd **dev_cmd_ptr;
+
+			dev_cmd_ptr = (void *)((u8 *)skb->cb +
+					       trans_pcie->dev_cmd_offs);
+
+			*dev_cmd_ptr = dev_cmd;
+			__skb_queue_tail(&txq->overflow_q, skb);
+			spin_unlock(&txq->lock);
+			return 0;
+		}
+	}
+
 	idx = iwl_pcie_get_cmd_index(txq, txq->write_ptr);
 
 	/* Set up driver data for this TFD */
@@ -523,9 +540,6 @@ int iwl_trans_pcie_gen2_tx(struct iwl_trans *trans, struct sk_buff *skb,
 	/* Tell device the write index *just past* this latest filled TFD */
 	txq->write_ptr = iwl_queue_inc_wrap(txq->write_ptr);
 	iwl_pcie_gen2_txq_inc_wr_ptr(trans, txq);
-	if (iwl_queue_space(txq) < txq->high_mark)
-		iwl_stop_queue(trans, txq);
-
 	/*
 	 * At this point the frame is "transmitted" successfully
 	 * and we will get a TX status notification eventually.
@@ -957,6 +971,13 @@ void iwl_pcie_gen2_txq_unmap(struct iwl_trans *trans, int txq_id)
 			spin_unlock_irqrestore(&trans_pcie->reg_lock, flags);
 		}
 	}
+
+	while (!skb_queue_empty(&txq->overflow_q)) {
+		struct sk_buff *skb = __skb_dequeue(&txq->overflow_q);
+
+		iwl_op_mode_free_skb(trans->op_mode, skb);
+	}
+
 	spin_unlock_bh(&txq->lock);
 
 	/* just in case - this queue may have been stopped */
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
index 1a566287993d..86873b99e5d7 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
@@ -1166,7 +1166,7 @@ void iwl_trans_pcie_reclaim(struct iwl_trans *trans, int txq_id, int ssn,
 			 * In that case, iwl_queue_space will be small again
 			 * and we won't wake mac80211's queue.
 			 */
-			iwl_trans_pcie_tx(trans, skb, dev_cmd_ptr, txq_id);
+			iwl_trans_tx(trans, skb, dev_cmd_ptr, txq_id);
 		}
 		spin_lock_bh(&txq->lock);
 
-- 
cgit v1.2.3-59-g8ed1b


From 422b5dd4294c3c5a3d88c77041f1fceff1d2e4ce Mon Sep 17 00:00:00 2001
From: Haim Dreyfuss <haim.dreyfuss@intel.com>
Date: Wed, 27 Dec 2017 22:10:28 +0200
Subject: iwlwifi: move timestamp functions from debugfs.h to dbg.h

These functions are not debugfs functions so they should be in dbg.h
instad in debugfs.h.

Signed-off-by: Haim Dreyfuss <haim.dreyfuss@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/fw/dbg.h     | 36 +++++++++++++++++++++++++
 drivers/net/wireless/intel/iwlwifi/fw/debugfs.c |  1 +
 drivers/net/wireless/intel/iwlwifi/fw/debugfs.h | 31 ---------------------
 3 files changed, 37 insertions(+), 31 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/fw/dbg.h b/drivers/net/wireless/intel/iwlwifi/fw/dbg.h
index 72259bff9922..507d9a49fa97 100644
--- a/drivers/net/wireless/intel/iwlwifi/fw/dbg.h
+++ b/drivers/net/wireless/intel/iwlwifi/fw/dbg.h
@@ -227,4 +227,40 @@ static inline void iwl_fw_cancel_dump(struct iwl_fw_runtime *fwrt)
 	cancel_delayed_work_sync(&fwrt->dump.wk);
 }
 
+#ifdef CONFIG_IWLWIFI_DEBUGFS
+static inline void iwl_fw_cancel_timestamp(struct iwl_fw_runtime *fwrt)
+{
+	fwrt->timestamp.delay = 0;
+	cancel_delayed_work_sync(&fwrt->timestamp.wk);
+}
+
+void iwl_fw_trigger_timestamp(struct iwl_fw_runtime *fwrt, u32 delay);
+
+static inline void iwl_fw_suspend_timestamp(struct iwl_fw_runtime *fwrt)
+{
+	cancel_delayed_work_sync(&fwrt->timestamp.wk);
+}
+
+static inline void iwl_fw_resume_timestamp(struct iwl_fw_runtime *fwrt)
+{
+	if (!fwrt->timestamp.delay)
+		return;
+
+	schedule_delayed_work(&fwrt->timestamp.wk,
+			      round_jiffies_relative(fwrt->timestamp.delay));
+}
+
+#else
+
+static inline void iwl_fw_cancel_timestamp(struct iwl_fw_runtime *fwrt) {}
+
+static inline void iwl_fw_trigger_timestamp(struct iwl_fw_runtime *fwrt,
+					    u32 delay) {}
+
+static inline void iwl_fw_suspend_timestamp(struct iwl_fw_runtime *fwrt) {}
+
+static inline void iwl_fw_resume_timestamp(struct iwl_fw_runtime *fwrt) {}
+
+#endif /* CONFIG_IWLWIFI_DEBUGFS */
+
 #endif  /* __iwl_fw_dbg_h__ */
diff --git a/drivers/net/wireless/intel/iwlwifi/fw/debugfs.c b/drivers/net/wireless/intel/iwlwifi/fw/debugfs.c
index 8f005cd69559..8ba5a60ec9ed 100644
--- a/drivers/net/wireless/intel/iwlwifi/fw/debugfs.c
+++ b/drivers/net/wireless/intel/iwlwifi/fw/debugfs.c
@@ -64,6 +64,7 @@
  *****************************************************************************/
 #include "api/commands.h"
 #include "debugfs.h"
+#include "dbg.h"
 
 #define FWRT_DEBUGFS_READ_FILE_OPS(name)				\
 static ssize_t iwl_dbgfs_##name##_read(struct iwl_fw_runtime *fwrt,	\
diff --git a/drivers/net/wireless/intel/iwlwifi/fw/debugfs.h b/drivers/net/wireless/intel/iwlwifi/fw/debugfs.h
index d93f6a4bb22d..cbbfa8e9e66d 100644
--- a/drivers/net/wireless/intel/iwlwifi/fw/debugfs.h
+++ b/drivers/net/wireless/intel/iwlwifi/fw/debugfs.h
@@ -69,28 +69,6 @@
 int iwl_fwrt_dbgfs_register(struct iwl_fw_runtime *fwrt,
 			    struct dentry *dbgfs_dir);
 
-static inline void iwl_fw_cancel_timestamp(struct iwl_fw_runtime *fwrt)
-{
-	fwrt->timestamp.delay = 0;
-	cancel_delayed_work_sync(&fwrt->timestamp.wk);
-}
-
-static inline void iwl_fw_suspend_timestamp(struct iwl_fw_runtime *fwrt)
-{
-	cancel_delayed_work_sync(&fwrt->timestamp.wk);
-}
-
-static inline void iwl_fw_resume_timestamp(struct iwl_fw_runtime *fwrt)
-{
-	if (!fwrt->timestamp.delay)
-		return;
-
-	schedule_delayed_work(&fwrt->timestamp.wk,
-			      round_jiffies_relative(fwrt->timestamp.delay));
-}
-
-void iwl_fw_trigger_timestamp(struct iwl_fw_runtime *fwrt, u32 delay);
-
 #else
 static inline int iwl_fwrt_dbgfs_register(struct iwl_fw_runtime *fwrt,
 					  struct dentry *dbgfs_dir)
@@ -98,13 +76,4 @@ static inline int iwl_fwrt_dbgfs_register(struct iwl_fw_runtime *fwrt,
 	return 0;
 }
 
-static inline void iwl_fw_cancel_timestamp(struct iwl_fw_runtime *fwrt) {}
-
-static inline void iwl_fw_suspend_timestamp(struct iwl_fw_runtime *fwrt) {}
-
-static inline void iwl_fw_resume_timestamp(struct iwl_fw_runtime *fwrt) {}
-
-static inline void iwl_fw_trigger_timestamp(struct iwl_fw_runtime *fwrt,
-					    u32 delay) {}
-
 #endif /* CONFIG_IWLWIFI_DEBUGFS */
-- 
cgit v1.2.3-59-g8ed1b


From e0498146c8792b1ebc321c0e29e633b6a0918a6e Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Thu, 4 Jan 2018 13:53:55 +0200
Subject: iwlwifi: pcie: allocate shorter TX queues for 22000 devices

When support for shorter TX queues was introduced, it
didn't include the actual allocation of shorter queue,
which is the main motive for the change.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c | 2 +-
 drivers/net/wireless/intel/iwlwifi/pcie/tx.c      | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c
index b605b364ed3f..734a5fc3584d 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c
@@ -993,7 +993,7 @@ static void iwl_pcie_gen2_txq_free_memory(struct iwl_trans *trans,
 	/* De-alloc circular buffer of TFDs */
 	if (txq->tfds) {
 		dma_free_coherent(dev,
-				  trans_pcie->tfd_size * TFD_QUEUE_SIZE_MAX,
+				  trans_pcie->tfd_size * txq->n_window,
 				  txq->tfds, txq->dma_addr);
 		dma_free_coherent(dev,
 				  sizeof(*txq->first_tb_bufs) * txq->n_window,
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
index 86873b99e5d7..4493520aae1e 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
@@ -495,6 +495,9 @@ int iwl_pcie_txq_alloc(struct iwl_trans *trans, struct iwl_txq *txq,
 	if (WARN_ON(txq->entries || txq->tfds))
 		return -EINVAL;
 
+	if (trans->cfg->use_tfh)
+		tfd_sz = trans_pcie->tfd_size * slots_num;
+
 	timer_setup(&txq->stuck_timer, iwl_pcie_txq_stuck_timer, 0);
 	txq->trans_pcie = trans_pcie;
 
-- 
cgit v1.2.3-59-g8ed1b


From 01302f5b4a5e5791b3938ab8d3216232b1830b90 Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Thu, 4 Jan 2018 13:47:09 +0200
Subject: iwlwifi: Revert "iwlwifi: pcie: dynamic Tx command queue size"

This reverts commit dd05f9aab4426ff178b12d601e50d19d336eba30.

Shorter TX queues support was added eventually without the
need for the parameters this patch added.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/cfg/22000.c     |  1 -
 drivers/net/wireless/intel/iwlwifi/iwl-config.h    |  3 ---
 .../net/wireless/intel/iwlwifi/pcie/ctxt-info.c    |  2 +-
 drivers/net/wireless/intel/iwlwifi/pcie/internal.h |  3 ---
 drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c  |  8 ++------
 drivers/net/wireless/intel/iwlwifi/pcie/tx.c       | 23 ++--------------------
 6 files changed, 5 insertions(+), 35 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/22000.c b/drivers/net/wireless/intel/iwlwifi/cfg/22000.c
index dffd9df782b0..f2b6e0e8d787 100644
--- a/drivers/net/wireless/intel/iwlwifi/cfg/22000.c
+++ b/drivers/net/wireless/intel/iwlwifi/cfg/22000.c
@@ -137,7 +137,6 @@ static const struct iwl_ht_params iwl_22000_ht_params = {
 	.gen2 = true,							\
 	.nvm_type = IWL_NVM_EXT,					\
 	.dbgc_supported = true,						\
-	.tx_cmd_queue_size = 32,					\
 	.min_umac_error_event_table = 0x400000
 
 const struct iwl_cfg iwl22000_2ac_cfg_hr = {
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-config.h b/drivers/net/wireless/intel/iwlwifi/iwl-config.h
index f0f5636dd3ea..687bed30c851 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-config.h
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-config.h
@@ -333,8 +333,6 @@ struct iwl_pwr_tx_backoff {
  * @gen2: 22000 and on transport operation
  * @cdb: CDB support
  * @nvm_type: see &enum iwl_nvm_type
- * @tx_cmd_queue_size: size of the cmd queue. If zero, use the same value as
- *	the regular queues
  *
  * We enable the driver to be backward compatible wrt. hardware features.
  * API differences in uCode shouldn't be handled here but through TLVs
@@ -386,7 +384,6 @@ struct iwl_cfg {
 	    gen2:1,
 	    cdb:1,
 	    dbgc_supported:1;
-	u16 tx_cmd_queue_size;
 	u8 valid_tx_ant;
 	u8 valid_rx_ant;
 	u8 non_shared_ant;
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info.c b/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info.c
index 5ef216f3a60b..3fc4343581ee 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/ctxt-info.c
@@ -244,7 +244,7 @@ int iwl_pcie_ctxt_info_init(struct iwl_trans *trans,
 	ctxt_info->hcmd_cfg.cmd_queue_addr =
 		cpu_to_le64(trans_pcie->txq[trans_pcie->cmd_queue]->dma_addr);
 	ctxt_info->hcmd_cfg.cmd_queue_size =
-		TFD_QUEUE_CB_SIZE(trans_pcie->tx_cmd_queue_size);
+		TFD_QUEUE_CB_SIZE(TFD_CMD_SLOTS);
 
 	/* allocate ucode sections in dram and set addresses */
 	ret = iwl_pcie_ctxt_info_init_fw_sec(trans, fw, ctxt_info);
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h
index ca3b64ff4dad..a8833028e1cf 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h
@@ -383,7 +383,6 @@ struct iwl_self_init_dram {
  * @hw_init_mask: initial unmasked hw causes
  * @fh_mask: current unmasked fh causes
  * @hw_mask: current unmasked hw causes
- * @tx_cmd_queue_size: the size of the tx command queue
  */
 struct iwl_trans_pcie {
 	struct iwl_rxq *rxq;
@@ -465,7 +464,6 @@ struct iwl_trans_pcie {
 	u32 fh_mask;
 	u32 hw_mask;
 	cpumask_t affinity_mask[IWL_MAX_RX_HW_QUEUES];
-	u16 tx_cmd_queue_size;
 };
 
 static inline struct iwl_trans_pcie *
@@ -537,7 +535,6 @@ void iwl_pcie_hcmd_complete(struct iwl_trans *trans,
 void iwl_trans_pcie_reclaim(struct iwl_trans *trans, int txq_id, int ssn,
 			    struct sk_buff_head *skbs);
 void iwl_trans_pcie_tx_reset(struct iwl_trans *trans);
-void iwl_pcie_set_tx_cmd_queue_size(struct iwl_trans *trans);
 
 static inline u16 iwl_pcie_tfd_tb_get_len(struct iwl_trans *trans, void *_tfd,
 					  u8 idx)
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c
index 734a5fc3584d..7bdb9c9e04b1 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c
@@ -1173,8 +1173,6 @@ int iwl_pcie_gen2_tx_init(struct iwl_trans *trans)
 	struct iwl_txq *cmd_queue;
 	int txq_id = trans_pcie->cmd_queue, ret;
 
-	iwl_pcie_set_tx_cmd_queue_size(trans);
-
 	/* alloc and init the command queue */
 	if (!trans_pcie->txq[txq_id]) {
 		cmd_queue = kzalloc(sizeof(*cmd_queue), GFP_KERNEL);
@@ -1183,8 +1181,7 @@ int iwl_pcie_gen2_tx_init(struct iwl_trans *trans)
 			return -ENOMEM;
 		}
 		trans_pcie->txq[txq_id] = cmd_queue;
-		ret = iwl_pcie_txq_alloc(trans, cmd_queue,
-					 trans_pcie->tx_cmd_queue_size, true);
+		ret = iwl_pcie_txq_alloc(trans, cmd_queue, TFD_CMD_SLOTS, true);
 		if (ret) {
 			IWL_ERR(trans, "Tx %d queue init failed\n", txq_id);
 			goto error;
@@ -1193,8 +1190,7 @@ int iwl_pcie_gen2_tx_init(struct iwl_trans *trans)
 		cmd_queue = trans_pcie->txq[txq_id];
 	}
 
-	ret = iwl_pcie_txq_init(trans, cmd_queue,
-				trans_pcie->tx_cmd_queue_size, true);
+	ret = iwl_pcie_txq_init(trans, cmd_queue, TFD_CMD_SLOTS, true);
 	if (ret) {
 		IWL_ERR(trans, "Tx %d queue alloc failed\n", txq_id);
 		goto error;
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
index 4493520aae1e..cf468b9f5a82 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
@@ -953,8 +953,7 @@ static int iwl_pcie_tx_alloc(struct iwl_trans *trans)
 	     txq_id++) {
 		bool cmd_queue = (txq_id == trans_pcie->cmd_queue);
 
-		slots_num = cmd_queue ? trans_pcie->tx_cmd_queue_size :
-			TFD_TX_CMD_SLOTS;
+		slots_num = cmd_queue ? TFD_CMD_SLOTS : TFD_TX_CMD_SLOTS;
 		trans_pcie->txq[txq_id] = &trans_pcie->txq_memory[txq_id];
 		ret = iwl_pcie_txq_alloc(trans, trans_pcie->txq[txq_id],
 					 slots_num, cmd_queue);
@@ -973,21 +972,6 @@ error:
 	return ret;
 }
 
-void iwl_pcie_set_tx_cmd_queue_size(struct iwl_trans *trans)
-{
-	struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans);
-	int queue_size = TFD_CMD_SLOTS;
-
-	if (trans->cfg->tx_cmd_queue_size)
-		queue_size = trans->cfg->tx_cmd_queue_size;
-
-	if (WARN_ON(!(is_power_of_2(queue_size) &&
-		      TFD_QUEUE_CB_SIZE(queue_size) > 0)))
-		trans_pcie->tx_cmd_queue_size = TFD_CMD_SLOTS;
-	else
-		trans_pcie->tx_cmd_queue_size = queue_size;
-}
-
 int iwl_pcie_tx_init(struct iwl_trans *trans)
 {
 	struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans);
@@ -995,8 +979,6 @@ int iwl_pcie_tx_init(struct iwl_trans *trans)
 	int txq_id, slots_num;
 	bool alloc = false;
 
-	iwl_pcie_set_tx_cmd_queue_size(trans);
-
 	if (!trans_pcie->txq_memory) {
 		ret = iwl_pcie_tx_alloc(trans);
 		if (ret)
@@ -1020,8 +1002,7 @@ int iwl_pcie_tx_init(struct iwl_trans *trans)
 	     txq_id++) {
 		bool cmd_queue = (txq_id == trans_pcie->cmd_queue);
 
-		slots_num = cmd_queue ? trans_pcie->tx_cmd_queue_size :
-			TFD_TX_CMD_SLOTS;
+		slots_num = cmd_queue ? TFD_CMD_SLOTS : TFD_TX_CMD_SLOTS;
 		ret = iwl_pcie_txq_init(trans, trans_pcie->txq[txq_id],
 					slots_num, cmd_queue);
 		if (ret) {
-- 
cgit v1.2.3-59-g8ed1b


From 5369774c84eac674af1741c4322ad7728694d80b Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Thu, 4 Jan 2018 14:17:06 +0200
Subject: iwlwifi: add TX queue size parameter to TX queue allocation

As preparation for dynamic queue sizing, add a parameter
of the TX queue size to the dynamic queue allocation op
mode API.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/fw/api/txq.h    | 1 +
 drivers/net/wireless/intel/iwlwifi/iwl-trans.h     | 8 ++++----
 drivers/net/wireless/intel/iwlwifi/mvm/utils.c     | 3 ++-
 drivers/net/wireless/intel/iwlwifi/pcie/internal.h | 2 +-
 drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c  | 2 +-
 5 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/fw/api/txq.h b/drivers/net/wireless/intel/iwlwifi/fw/api/txq.h
index dfa111bb411e..1f45b2ce5295 100644
--- a/drivers/net/wireless/intel/iwlwifi/fw/api/txq.h
+++ b/drivers/net/wireless/intel/iwlwifi/fw/api/txq.h
@@ -131,6 +131,7 @@ enum iwl_tx_queue_cfg_actions {
 	TX_QUEUE_CFG_TFD_SHORT_FORMAT		= BIT(1),
 };
 
+#define IWL_DEFAULT_QUEUE_SIZE 256
 /**
  * struct iwl_tx_queue_cfg_cmd - txq hw scheduler config command
  * @sta_id: station id
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-trans.h b/drivers/net/wireless/intel/iwlwifi/iwl-trans.h
index c25ed1a0bbb0..a92832711683 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-trans.h
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-trans.h
@@ -554,7 +554,7 @@ struct iwl_trans_ops {
 	/* 22000 functions */
 	int (*txq_alloc)(struct iwl_trans *trans,
 			 struct iwl_tx_queue_cfg_cmd *cmd,
-			 int cmd_id,
+			 int cmd_id, int size,
 			 unsigned int queue_wdg_timeout);
 	void (*txq_free)(struct iwl_trans *trans, int queue);
 
@@ -952,8 +952,8 @@ iwl_trans_txq_free(struct iwl_trans *trans, int queue)
 static inline int
 iwl_trans_txq_alloc(struct iwl_trans *trans,
 		    struct iwl_tx_queue_cfg_cmd *cmd,
-		    int cmd_id,
-		    unsigned int queue_wdg_timeout)
+		    int cmd_id, int size,
+		    unsigned int wdg_timeout)
 {
 	might_sleep();
 
@@ -965,7 +965,7 @@ iwl_trans_txq_alloc(struct iwl_trans *trans,
 		return -EIO;
 	}
 
-	return trans->ops->txq_alloc(trans, cmd, cmd_id, queue_wdg_timeout);
+	return trans->ops->txq_alloc(trans, cmd, cmd_id, size, wdg_timeout);
 }
 
 static inline void iwl_trans_txq_set_shared_mode(struct iwl_trans *trans,
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
index 815d2d141fd8..81c02d6e3d82 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
@@ -733,7 +733,8 @@ int iwl_mvm_tvqm_enable_txq(struct iwl_mvm *mvm, int mac80211_queue,
 	if (cmd.tid == IWL_MAX_TID_COUNT)
 		cmd.tid = IWL_MGMT_TID;
 	queue = iwl_trans_txq_alloc(mvm->trans, (void *)&cmd,
-				    SCD_QUEUE_CFG, timeout);
+				    SCD_QUEUE_CFG, IWL_DEFAULT_QUEUE_SIZE,
+				    timeout);
 
 	if (queue < 0) {
 		IWL_DEBUG_TX_QUEUES(mvm,
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h
index a8833028e1cf..cda66340d357 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h
@@ -819,7 +819,7 @@ int iwl_trans_pcie_gen2_start_fw(struct iwl_trans *trans,
 void iwl_trans_pcie_gen2_fw_alive(struct iwl_trans *trans, u32 scd_addr);
 int iwl_trans_pcie_dyn_txq_alloc(struct iwl_trans *trans,
 				 struct iwl_tx_queue_cfg_cmd *cmd,
-				 int cmd_id,
+				 int cmd_id, int size,
 				 unsigned int timeout);
 void iwl_trans_pcie_dyn_txq_free(struct iwl_trans *trans, int queue);
 int iwl_trans_pcie_gen2_tx(struct iwl_trans *trans, struct sk_buff *skb,
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c
index 7bdb9c9e04b1..b3d151ec8fe4 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c
@@ -1041,7 +1041,7 @@ static void iwl_pcie_gen2_txq_free(struct iwl_trans *trans, int txq_id)
 
 int iwl_trans_pcie_dyn_txq_alloc(struct iwl_trans *trans,
 				 struct iwl_tx_queue_cfg_cmd *cmd,
-				 int cmd_id,
+				 int cmd_id, int size,
 				 unsigned int timeout)
 {
 	struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans);
-- 
cgit v1.2.3-59-g8ed1b


From 3cfb6de73df3bceaf1942297659177ac50267852 Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Thu, 4 Jan 2018 14:02:51 +0200
Subject: iwlwifi: pcie: use the queue size as sent by opmode

Op mode will begin tp use varying size of TX queue.
All the infra is in place, allow it.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c
index b3d151ec8fe4..a235dda1f70b 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c
@@ -1067,12 +1067,12 @@ int iwl_trans_pcie_dyn_txq_alloc(struct iwl_trans *trans,
 		return -ENOMEM;
 	}
 
-	ret = iwl_pcie_txq_alloc(trans, txq, TFD_TX_CMD_SLOTS, false);
+	ret = iwl_pcie_txq_alloc(trans, txq, size, false);
 	if (ret) {
 		IWL_ERR(trans, "Tx queue alloc failed\n");
 		goto error;
 	}
-	ret = iwl_pcie_txq_init(trans, txq, TFD_TX_CMD_SLOTS, false);
+	ret = iwl_pcie_txq_init(trans, txq, size, false);
 	if (ret) {
 		IWL_ERR(trans, "Tx queue init failed\n");
 		goto error;
@@ -1082,7 +1082,7 @@ int iwl_trans_pcie_dyn_txq_alloc(struct iwl_trans *trans,
 
 	cmd->tfdq_addr = cpu_to_le64(txq->dma_addr);
 	cmd->byte_cnt_addr = cpu_to_le64(txq->bc_tbl.dma);
-	cmd->cb_size = cpu_to_le32(TFD_QUEUE_CB_SIZE(TFD_TX_CMD_SLOTS));
+	cmd->cb_size = cpu_to_le32(TFD_QUEUE_CB_SIZE(size));
 
 	ret = iwl_trans_send_cmd(trans, &hcmd);
 	if (ret)
-- 
cgit v1.2.3-59-g8ed1b


From 251985c92865bc68a0b17a6409a640914f3ab1ba Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Thu, 4 Jan 2018 14:06:07 +0200
Subject: iwlwifi: mvm: use shorter queues for mgmt and auxilary queues

In 22000 devices, aka gen2, the TFS is 256 bytes.
In order to save memory, use shorter TX queue for aux and
mgmt queues, since there isn't much traffic on them.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/fw/api/txq.h | 1 +
 drivers/net/wireless/intel/iwlwifi/mvm/utils.c  | 9 +++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/fw/api/txq.h b/drivers/net/wireless/intel/iwlwifi/fw/api/txq.h
index 1f45b2ce5295..6ac240b6eace 100644
--- a/drivers/net/wireless/intel/iwlwifi/fw/api/txq.h
+++ b/drivers/net/wireless/intel/iwlwifi/fw/api/txq.h
@@ -132,6 +132,7 @@ enum iwl_tx_queue_cfg_actions {
 };
 
 #define IWL_DEFAULT_QUEUE_SIZE 256
+#define IWL_MGMT_QUEUE_SIZE 16
 /**
  * struct iwl_tx_queue_cfg_cmd - txq hw scheduler config command
  * @sta_id: station id
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
index 81c02d6e3d82..e597bc193e5c 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
@@ -728,13 +728,14 @@ int iwl_mvm_tvqm_enable_txq(struct iwl_mvm *mvm, int mac80211_queue,
 		.sta_id = sta_id,
 		.tid = tid,
 	};
-	int queue;
+	int queue, size = IWL_DEFAULT_QUEUE_SIZE;
 
-	if (cmd.tid == IWL_MAX_TID_COUNT)
+	if (cmd.tid == IWL_MAX_TID_COUNT) {
 		cmd.tid = IWL_MGMT_TID;
+		size = IWL_MGMT_QUEUE_SIZE;
+	}
 	queue = iwl_trans_txq_alloc(mvm->trans, (void *)&cmd,
-				    SCD_QUEUE_CFG, IWL_DEFAULT_QUEUE_SIZE,
-				    timeout);
+				    SCD_QUEUE_CFG, size, timeout);
 
 	if (queue < 0) {
 		IWL_DEBUG_TX_QUEUES(mvm,
-- 
cgit v1.2.3-59-g8ed1b


From 9c4f7d5127402294b50c9ff18fad66639f3b81d0 Mon Sep 17 00:00:00 2001
From: Shaul Triebitz <shaul.triebitz@intel.com>
Date: Sun, 14 Jan 2018 19:06:09 +0200
Subject: iwlwifi: move all NVM parsing code to the common files

Move all the NVM file handling code to iwl-nvm-parse.c where all this
stuff belongs.  This cleans up the MVM specific code and allows easier
reuse by other opmodes if needed.

Signed-off-by: Shaul Triebitz <shaul.triebitz@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c | 214 ++++++++++++++++++++-
 drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.h |  23 +++
 drivers/net/wireless/intel/iwlwifi/mvm/fw.c        |   7 +-
 drivers/net/wireless/intel/iwlwifi/mvm/mvm.h       |  15 +-
 drivers/net/wireless/intel/iwlwifi/mvm/nvm.c       | 208 +-------------------
 5 files changed, 247 insertions(+), 220 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
index 8928613e033e..3437ed480e31 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
@@ -8,6 +8,7 @@
  * Copyright(c) 2008 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH
  * Copyright(c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018        Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -35,6 +36,7 @@
  * Copyright(c) 2005 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH
  * Copyright(c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018        Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -68,6 +70,7 @@
 #include <linux/export.h>
 #include <linux/etherdevice.h>
 #include <linux/pci.h>
+#include <linux/firmware.h>
 
 #include "iwl-drv.h"
 #include "iwl-modparams.h"
@@ -76,6 +79,7 @@
 #include "iwl-io.h"
 #include "iwl-csr.h"
 #include "fw/acpi.h"
+#include "fw/api/nvm-reg.h"
 
 /* NVM offsets (in words) definitions */
 enum nvm_offsets {
@@ -146,8 +150,8 @@ static const u8 iwl_ext_nvm_channels[] = {
 	149, 153, 157, 161, 165, 169, 173, 177, 181
 };
 
-#define IWL_NUM_CHANNELS		ARRAY_SIZE(iwl_nvm_channels)
-#define IWL_NUM_CHANNELS_EXT	ARRAY_SIZE(iwl_ext_nvm_channels)
+#define IWL_NVM_NUM_CHANNELS		ARRAY_SIZE(iwl_nvm_channels)
+#define IWL_NVM_NUM_CHANNELS_EXT	ARRAY_SIZE(iwl_ext_nvm_channels)
 #define NUM_2GHZ_CHANNELS		14
 #define NUM_2GHZ_CHANNELS_EXT	14
 #define FIRST_2GHZ_HT_MINUS		5
@@ -301,11 +305,11 @@ static int iwl_init_channel_map(struct device *dev, const struct iwl_cfg *cfg,
 	const u8 *nvm_chan;
 
 	if (cfg->nvm_type != IWL_NVM_EXT) {
-		num_of_ch = IWL_NUM_CHANNELS;
+		num_of_ch = IWL_NVM_NUM_CHANNELS;
 		nvm_chan = &iwl_nvm_channels[0];
 		num_2ghz_channels = NUM_2GHZ_CHANNELS;
 	} else {
-		num_of_ch = IWL_NUM_CHANNELS_EXT;
+		num_of_ch = IWL_NVM_NUM_CHANNELS_EXT;
 		nvm_chan = &iwl_ext_nvm_channels[0];
 		num_2ghz_channels = NUM_2GHZ_CHANNELS_EXT;
 	}
@@ -720,12 +724,12 @@ iwl_parse_nvm_data(struct iwl_trans *trans, const struct iwl_cfg *cfg,
 	if (cfg->nvm_type != IWL_NVM_EXT)
 		data = kzalloc(sizeof(*data) +
 			       sizeof(struct ieee80211_channel) *
-			       IWL_NUM_CHANNELS,
+			       IWL_NVM_NUM_CHANNELS,
 			       GFP_KERNEL);
 	else
 		data = kzalloc(sizeof(*data) +
 			       sizeof(struct ieee80211_channel) *
-			       IWL_NUM_CHANNELS_EXT,
+			       IWL_NVM_NUM_CHANNELS_EXT,
 			       GFP_KERNEL);
 	if (!data)
 		return NULL;
@@ -859,7 +863,7 @@ iwl_parse_nvm_mcc_info(struct device *dev, const struct iwl_cfg *cfg,
 	int valid_rules = 0;
 	bool new_rule;
 	int max_num_ch = cfg->nvm_type == IWL_NVM_EXT ?
-			 IWL_NUM_CHANNELS_EXT : IWL_NUM_CHANNELS;
+			 IWL_NVM_NUM_CHANNELS_EXT : IWL_NVM_NUM_CHANNELS;
 
 	if (WARN_ON_ONCE(num_of_ch > NL80211_MAX_SUPP_REG_RULES))
 		return ERR_PTR(-EINVAL);
@@ -938,3 +942,199 @@ iwl_parse_nvm_mcc_info(struct device *dev, const struct iwl_cfg *cfg,
 	return regd;
 }
 IWL_EXPORT_SYMBOL(iwl_parse_nvm_mcc_info);
+
+#define IWL_MAX_NVM_SECTION_SIZE	0x1b58
+#define IWL_MAX_EXT_NVM_SECTION_SIZE	0x1ffc
+#define MAX_NVM_FILE_LEN	16384
+
+void iwl_nvm_fixups(u32 hw_id, unsigned int section, u8 *data,
+		    unsigned int len)
+{
+#define IWL_4165_DEVICE_ID	0x5501
+#define NVM_SKU_CAP_MIMO_DISABLE BIT(5)
+
+	if (section == NVM_SECTION_TYPE_PHY_SKU &&
+	    hw_id == IWL_4165_DEVICE_ID && data && len >= 5 &&
+	    (data[4] & NVM_SKU_CAP_MIMO_DISABLE))
+		/* OTP 0x52 bug work around: it's a 1x1 device */
+		data[3] = ANT_B | (ANT_B << 4);
+}
+IWL_EXPORT_SYMBOL(iwl_nvm_fixups);
+
+/*
+ * Reads external NVM from a file into mvm->nvm_sections
+ *
+ * HOW TO CREATE THE NVM FILE FORMAT:
+ * ------------------------------
+ * 1. create hex file, format:
+ *      3800 -> header
+ *      0000 -> header
+ *      5a40 -> data
+ *
+ *   rev - 6 bit (word1)
+ *   len - 10 bit (word1)
+ *   id - 4 bit (word2)
+ *   rsv - 12 bit (word2)
+ *
+ * 2. flip 8bits with 8 bits per line to get the right NVM file format
+ *
+ * 3. create binary file from the hex file
+ *
+ * 4. save as "iNVM_xxx.bin" under /lib/firmware
+ */
+int iwl_read_external_nvm(struct iwl_trans *trans,
+			  const char *nvm_file_name,
+			  struct iwl_nvm_section *nvm_sections)
+{
+	int ret, section_size;
+	u16 section_id;
+	const struct firmware *fw_entry;
+	const struct {
+		__le16 word1;
+		__le16 word2;
+		u8 data[];
+	} *file_sec;
+	const u8 *eof;
+	u8 *temp;
+	int max_section_size;
+	const __le32 *dword_buff;
+
+#define NVM_WORD1_LEN(x) (8 * (x & 0x03FF))
+#define NVM_WORD2_ID(x) (x >> 12)
+#define EXT_NVM_WORD2_LEN(x) (2 * (((x) & 0xFF) << 8 | (x) >> 8))
+#define EXT_NVM_WORD1_ID(x) ((x) >> 4)
+#define NVM_HEADER_0	(0x2A504C54)
+#define NVM_HEADER_1	(0x4E564D2A)
+#define NVM_HEADER_SIZE	(4 * sizeof(u32))
+
+	IWL_DEBUG_EEPROM(trans->dev, "Read from external NVM\n");
+
+	/* Maximal size depends on NVM version */
+	if (trans->cfg->nvm_type != IWL_NVM_EXT)
+		max_section_size = IWL_MAX_NVM_SECTION_SIZE;
+	else
+		max_section_size = IWL_MAX_EXT_NVM_SECTION_SIZE;
+
+	/*
+	 * Obtain NVM image via request_firmware. Since we already used
+	 * request_firmware_nowait() for the firmware binary load and only
+	 * get here after that we assume the NVM request can be satisfied
+	 * synchronously.
+	 */
+	ret = request_firmware(&fw_entry, nvm_file_name, trans->dev);
+	if (ret) {
+		IWL_ERR(trans, "ERROR: %s isn't available %d\n",
+			nvm_file_name, ret);
+		return ret;
+	}
+
+	IWL_INFO(trans, "Loaded NVM file %s (%zu bytes)\n",
+		 nvm_file_name, fw_entry->size);
+
+	if (fw_entry->size > MAX_NVM_FILE_LEN) {
+		IWL_ERR(trans, "NVM file too large\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	eof = fw_entry->data + fw_entry->size;
+	dword_buff = (__le32 *)fw_entry->data;
+
+	/* some NVM file will contain a header.
+	 * The header is identified by 2 dwords header as follow:
+	 * dword[0] = 0x2A504C54
+	 * dword[1] = 0x4E564D2A
+	 *
+	 * This header must be skipped when providing the NVM data to the FW.
+	 */
+	if (fw_entry->size > NVM_HEADER_SIZE &&
+	    dword_buff[0] == cpu_to_le32(NVM_HEADER_0) &&
+	    dword_buff[1] == cpu_to_le32(NVM_HEADER_1)) {
+		file_sec = (void *)(fw_entry->data + NVM_HEADER_SIZE);
+		IWL_INFO(trans, "NVM Version %08X\n", le32_to_cpu(dword_buff[2]));
+		IWL_INFO(trans, "NVM Manufacturing date %08X\n",
+			 le32_to_cpu(dword_buff[3]));
+
+		/* nvm file validation, dword_buff[2] holds the file version */
+		if (trans->cfg->device_family == IWL_DEVICE_FAMILY_8000 &&
+		    CSR_HW_REV_STEP(trans->hw_rev) == SILICON_C_STEP &&
+		    le32_to_cpu(dword_buff[2]) < 0xE4A) {
+			ret = -EFAULT;
+			goto out;
+		}
+	} else {
+		file_sec = (void *)fw_entry->data;
+	}
+
+	while (true) {
+		if (file_sec->data > eof) {
+			IWL_ERR(trans,
+				"ERROR - NVM file too short for section header\n");
+			ret = -EINVAL;
+			break;
+		}
+
+		/* check for EOF marker */
+		if (!file_sec->word1 && !file_sec->word2) {
+			ret = 0;
+			break;
+		}
+
+		if (trans->cfg->nvm_type != IWL_NVM_EXT) {
+			section_size =
+				2 * NVM_WORD1_LEN(le16_to_cpu(file_sec->word1));
+			section_id = NVM_WORD2_ID(le16_to_cpu(file_sec->word2));
+		} else {
+			section_size = 2 * EXT_NVM_WORD2_LEN(
+						le16_to_cpu(file_sec->word2));
+			section_id = EXT_NVM_WORD1_ID(
+						le16_to_cpu(file_sec->word1));
+		}
+
+		if (section_size > max_section_size) {
+			IWL_ERR(trans, "ERROR - section too large (%d)\n",
+				section_size);
+			ret = -EINVAL;
+			break;
+		}
+
+		if (!section_size) {
+			IWL_ERR(trans, "ERROR - section empty\n");
+			ret = -EINVAL;
+			break;
+		}
+
+		if (file_sec->data + section_size > eof) {
+			IWL_ERR(trans,
+				"ERROR - NVM file too short for section (%d bytes)\n",
+				section_size);
+			ret = -EINVAL;
+			break;
+		}
+
+		if (WARN(section_id >= NVM_MAX_NUM_SECTIONS,
+			 "Invalid NVM section ID %d\n", section_id)) {
+			ret = -EINVAL;
+			break;
+		}
+
+		temp = kmemdup(file_sec->data, section_size, GFP_KERNEL);
+		if (!temp) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		iwl_nvm_fixups(trans->hw_id, section_id, temp, section_size);
+
+		kfree(nvm_sections[section_id].data);
+		nvm_sections[section_id].data = temp;
+		nvm_sections[section_id].length = section_size;
+
+		/* advance to the next section */
+		file_sec = (void *)(file_sec->data + section_size);
+	}
+out:
+	release_firmware(fw_entry);
+	return ret;
+}
+IWL_EXPORT_SYMBOL(iwl_read_external_nvm);
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.h b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.h
index 306736c7a042..6b108cae61b8 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.h
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.h
@@ -7,6 +7,7 @@
  *
  * Copyright(c) 2008 - 2015 Intel Corporation. All rights reserved.
  * Copyright(c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018        Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -33,6 +34,7 @@
  *
  * Copyright(c) 2005 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018        Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -109,4 +111,25 @@ struct ieee80211_regdomain *
 iwl_parse_nvm_mcc_info(struct device *dev, const struct iwl_cfg *cfg,
 		       int num_of_ch, __le32 *channels, u16 fw_mcc);
 
+/**
+ * struct iwl_nvm_section - describes an NVM section in memory.
+ *
+ * This struct holds an NVM section read from the NIC using NVM_ACCESS_CMD,
+ * and saved for later use by the driver. Not all NVM sections are saved
+ * this way, only the needed ones.
+ */
+struct iwl_nvm_section {
+	u16 length;
+	const u8 *data;
+};
+
+/**
+ * iwl_read_external_nvm - Reads external NVM from a file into nvm_sections
+ */
+int iwl_read_external_nvm(struct iwl_trans *trans,
+			  const char *nvm_file_name,
+			  struct iwl_nvm_section *nvm_sections);
+void iwl_nvm_fixups(u32 hw_id, unsigned int section, u8 *data,
+		    unsigned int len);
+
 #endif /* __iwl_nvm_parse_h__ */
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c
index 4cf3e32e3dba..d5a612add59f 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c
@@ -8,6 +8,7 @@
  * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH
  * Copyright(c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018        Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -35,6 +36,7 @@
  * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH
  * Copyright(c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018        Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -79,6 +81,8 @@
 #include "mvm.h"
 #include "fw/dbg.h"
 #include "iwl-phy-db.h"
+#include "iwl-modparams.h"
+#include "iwl-nvm-parse.h"
 
 #define MVM_UCODE_ALIVE_TIMEOUT	HZ
 #define MVM_UCODE_CALIB_TIMEOUT	(2*HZ)
@@ -381,7 +385,8 @@ static int iwl_run_unified_mvm_ucode(struct iwl_mvm *mvm, bool read_nvm)
 
 	/* Load NVM to NIC if needed */
 	if (mvm->nvm_file_name) {
-		iwl_mvm_read_external_nvm(mvm);
+		iwl_read_external_nvm(mvm->trans, mvm->nvm_file_name,
+				      mvm->nvm_sections);
 		iwl_mvm_load_nvm_to_nic(mvm);
 	}
 
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
index 2411cc91f833..6a4ba160c59e 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
@@ -92,7 +92,7 @@
 #include "fw/runtime.h"
 #include "fw/dbg.h"
 #include "fw/acpi.h"
-#include "fw/debugfs.h"
+#include "iwl-nvm-parse.h"
 
 #include <linux/average.h>
 
@@ -508,18 +508,6 @@ enum iwl_mvm_sched_scan_pass_all_states {
 	SCHED_SCAN_PASS_ALL_FOUND,
 };
 
-/**
- * struct iwl_nvm_section - describes an NVM section in memory.
- *
- * This struct holds an NVM section read from the NIC using NVM_ACCESS_CMD,
- * and saved for later use by the driver. Not all NVM sections are saved
- * this way, only the needed ones.
- */
-struct iwl_nvm_section {
-	u16 length;
-	const u8 *data;
-};
-
 /**
  * struct iwl_mvm_tt_mgnt - Thermal Throttling Management structure
  * @ct_kill_exit: worker to exit thermal kill
@@ -1511,7 +1499,6 @@ void iwl_mvm_accu_radio_stats(struct iwl_mvm *mvm);
 /* NVM */
 int iwl_nvm_init(struct iwl_mvm *mvm);
 int iwl_mvm_load_nvm_to_nic(struct iwl_mvm *mvm);
-int iwl_mvm_read_external_nvm(struct iwl_mvm *mvm);
 
 static inline u8 iwl_mvm_get_valid_tx_ant(struct iwl_mvm *mvm)
 {
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/nvm.c b/drivers/net/wireless/intel/iwlwifi/mvm/nvm.c
index 5bfe5306524c..cf48517944ec 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/nvm.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/nvm.c
@@ -8,6 +8,7 @@
  * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH
  * Copyright(c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018        Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -35,6 +36,7 @@
  * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH
  * Copyright(c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018        Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -76,9 +78,7 @@
 #include "fw/acpi.h"
 
 /* Default NVM size to read */
-#define IWL_NVM_DEFAULT_CHUNK_SIZE (2*1024)
-#define IWL_MAX_NVM_SECTION_SIZE	0x1b58
-#define IWL_MAX_EXT_NVM_SECTION_SIZE	0x1ffc
+#define IWL_NVM_DEFAULT_CHUNK_SIZE (2 * 1024)
 
 #define NVM_WRITE_OPCODE 1
 #define NVM_READ_OPCODE 0
@@ -229,19 +229,6 @@ static int iwl_nvm_write_section(struct iwl_mvm *mvm, u16 section,
 	return 0;
 }
 
-static void iwl_mvm_nvm_fixups(struct iwl_mvm *mvm, unsigned int section,
-			       u8 *data, unsigned int len)
-{
-#define IWL_4165_DEVICE_ID	0x5501
-#define NVM_SKU_CAP_MIMO_DISABLE BIT(5)
-
-	if (section == NVM_SECTION_TYPE_PHY_SKU &&
-	    mvm->trans->hw_id == IWL_4165_DEVICE_ID && data && len >= 5 &&
-	    (data[4] & NVM_SKU_CAP_MIMO_DISABLE))
-		/* OTP 0x52 bug work around: it's a 1x1 device */
-		data[3] = ANT_B | (ANT_B << 4);
-}
-
 /*
  * Reads an NVM section completely.
  * NICs prior to 7000 family doesn't have a real NVM, but just read
@@ -282,7 +269,7 @@ static int iwl_nvm_read_section(struct iwl_mvm *mvm, u16 section,
 		offset += ret;
 	}
 
-	iwl_mvm_nvm_fixups(mvm, section, data, offset);
+	iwl_nvm_fixups(mvm->trans->hw_id, section, data, offset);
 
 	IWL_DEBUG_EEPROM(mvm->trans->dev,
 			 "NVM section %d read completed\n", section);
@@ -355,184 +342,6 @@ iwl_parse_nvm_sections(struct iwl_mvm *mvm)
 				  lar_enabled);
 }
 
-#define MAX_NVM_FILE_LEN	16384
-
-/*
- * Reads external NVM from a file into mvm->nvm_sections
- *
- * HOW TO CREATE THE NVM FILE FORMAT:
- * ------------------------------
- * 1. create hex file, format:
- *      3800 -> header
- *      0000 -> header
- *      5a40 -> data
- *
- *   rev - 6 bit (word1)
- *   len - 10 bit (word1)
- *   id - 4 bit (word2)
- *   rsv - 12 bit (word2)
- *
- * 2. flip 8bits with 8 bits per line to get the right NVM file format
- *
- * 3. create binary file from the hex file
- *
- * 4. save as "iNVM_xxx.bin" under /lib/firmware
- */
-int iwl_mvm_read_external_nvm(struct iwl_mvm *mvm)
-{
-	int ret, section_size;
-	u16 section_id;
-	const struct firmware *fw_entry;
-	const struct {
-		__le16 word1;
-		__le16 word2;
-		u8 data[];
-	} *file_sec;
-	const u8 *eof;
-	u8 *temp;
-	int max_section_size;
-	const __le32 *dword_buff;
-
-#define NVM_WORD1_LEN(x) (8 * (x & 0x03FF))
-#define NVM_WORD2_ID(x) (x >> 12)
-#define EXT_NVM_WORD2_LEN(x) (2 * (((x) & 0xFF) << 8 | (x) >> 8))
-#define EXT_NVM_WORD1_ID(x) ((x) >> 4)
-#define NVM_HEADER_0	(0x2A504C54)
-#define NVM_HEADER_1	(0x4E564D2A)
-#define NVM_HEADER_SIZE	(4 * sizeof(u32))
-
-	IWL_DEBUG_EEPROM(mvm->trans->dev, "Read from external NVM\n");
-
-	/* Maximal size depends on NVM version */
-	if (mvm->trans->cfg->nvm_type != IWL_NVM_EXT)
-		max_section_size = IWL_MAX_NVM_SECTION_SIZE;
-	else
-		max_section_size = IWL_MAX_EXT_NVM_SECTION_SIZE;
-
-	/*
-	 * Obtain NVM image via request_firmware. Since we already used
-	 * request_firmware_nowait() for the firmware binary load and only
-	 * get here after that we assume the NVM request can be satisfied
-	 * synchronously.
-	 */
-	ret = request_firmware(&fw_entry, mvm->nvm_file_name,
-			       mvm->trans->dev);
-	if (ret) {
-		IWL_ERR(mvm, "ERROR: %s isn't available %d\n",
-			mvm->nvm_file_name, ret);
-		return ret;
-	}
-
-	IWL_INFO(mvm, "Loaded NVM file %s (%zu bytes)\n",
-		 mvm->nvm_file_name, fw_entry->size);
-
-	if (fw_entry->size > MAX_NVM_FILE_LEN) {
-		IWL_ERR(mvm, "NVM file too large\n");
-		ret = -EINVAL;
-		goto out;
-	}
-
-	eof = fw_entry->data + fw_entry->size;
-	dword_buff = (__le32 *)fw_entry->data;
-
-	/* some NVM file will contain a header.
-	 * The header is identified by 2 dwords header as follow:
-	 * dword[0] = 0x2A504C54
-	 * dword[1] = 0x4E564D2A
-	 *
-	 * This header must be skipped when providing the NVM data to the FW.
-	 */
-	if (fw_entry->size > NVM_HEADER_SIZE &&
-	    dword_buff[0] == cpu_to_le32(NVM_HEADER_0) &&
-	    dword_buff[1] == cpu_to_le32(NVM_HEADER_1)) {
-		file_sec = (void *)(fw_entry->data + NVM_HEADER_SIZE);
-		IWL_INFO(mvm, "NVM Version %08X\n", le32_to_cpu(dword_buff[2]));
-		IWL_INFO(mvm, "NVM Manufacturing date %08X\n",
-			 le32_to_cpu(dword_buff[3]));
-
-		/* nvm file validation, dword_buff[2] holds the file version */
-		if (mvm->trans->cfg->device_family == IWL_DEVICE_FAMILY_8000 &&
-		    CSR_HW_REV_STEP(mvm->trans->hw_rev) == SILICON_C_STEP &&
-		    le32_to_cpu(dword_buff[2]) < 0xE4A) {
-			ret = -EFAULT;
-			goto out;
-		}
-	} else {
-		file_sec = (void *)fw_entry->data;
-	}
-
-	while (true) {
-		if (file_sec->data > eof) {
-			IWL_ERR(mvm,
-				"ERROR - NVM file too short for section header\n");
-			ret = -EINVAL;
-			break;
-		}
-
-		/* check for EOF marker */
-		if (!file_sec->word1 && !file_sec->word2) {
-			ret = 0;
-			break;
-		}
-
-		if (mvm->trans->cfg->nvm_type != IWL_NVM_EXT) {
-			section_size =
-				2 * NVM_WORD1_LEN(le16_to_cpu(file_sec->word1));
-			section_id = NVM_WORD2_ID(le16_to_cpu(file_sec->word2));
-		} else {
-			section_size = 2 * EXT_NVM_WORD2_LEN(
-						le16_to_cpu(file_sec->word2));
-			section_id = EXT_NVM_WORD1_ID(
-						le16_to_cpu(file_sec->word1));
-		}
-
-		if (section_size > max_section_size) {
-			IWL_ERR(mvm, "ERROR - section too large (%d)\n",
-				section_size);
-			ret = -EINVAL;
-			break;
-		}
-
-		if (!section_size) {
-			IWL_ERR(mvm, "ERROR - section empty\n");
-			ret = -EINVAL;
-			break;
-		}
-
-		if (file_sec->data + section_size > eof) {
-			IWL_ERR(mvm,
-				"ERROR - NVM file too short for section (%d bytes)\n",
-				section_size);
-			ret = -EINVAL;
-			break;
-		}
-
-		if (WARN(section_id >= NVM_MAX_NUM_SECTIONS,
-			 "Invalid NVM section ID %d\n", section_id)) {
-			ret = -EINVAL;
-			break;
-		}
-
-		temp = kmemdup(file_sec->data, section_size, GFP_KERNEL);
-		if (!temp) {
-			ret = -ENOMEM;
-			break;
-		}
-
-		iwl_mvm_nvm_fixups(mvm, section_id, temp, section_size);
-
-		kfree(mvm->nvm_sections[section_id].data);
-		mvm->nvm_sections[section_id].data = temp;
-		mvm->nvm_sections[section_id].length = section_size;
-
-		/* advance to the next section */
-		file_sec = (void *)(file_sec->data + section_size);
-	}
-out:
-	release_firmware(fw_entry);
-	return ret;
-}
-
 /* Loads the NVM data stored in mvm->nvm_sections into the NIC */
 int iwl_mvm_load_nvm_to_nic(struct iwl_mvm *mvm)
 {
@@ -585,7 +394,7 @@ int iwl_nvm_init(struct iwl_mvm *mvm)
 			break;
 		}
 
-		iwl_mvm_nvm_fixups(mvm, section, temp, ret);
+		iwl_nvm_fixups(mvm->trans->hw_id, section, temp, ret);
 
 		mvm->nvm_sections[section].data = temp;
 		mvm->nvm_sections[section].length = ret;
@@ -624,14 +433,17 @@ int iwl_nvm_init(struct iwl_mvm *mvm)
 	/* Only if PNVM selected in the mod param - load external NVM  */
 	if (mvm->nvm_file_name) {
 		/* read External NVM file from the mod param */
-		ret = iwl_mvm_read_external_nvm(mvm);
+		ret = iwl_read_external_nvm(mvm->trans, mvm->nvm_file_name,
+					    mvm->nvm_sections);
 		if (ret) {
 			mvm->nvm_file_name = nvm_file_C;
 
 			if ((ret == -EFAULT || ret == -ENOENT) &&
 			    mvm->nvm_file_name) {
 				/* in case nvm file was failed try again */
-				ret = iwl_mvm_read_external_nvm(mvm);
+				ret = iwl_read_external_nvm(mvm->trans,
+							    mvm->nvm_file_name,
+							    mvm->nvm_sections);
 				if (ret)
 					return ret;
 			} else {
-- 
cgit v1.2.3-59-g8ed1b


From 8f66e064c9db418c6c3037aa2c217dc7b18f3276 Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Wed, 17 Jan 2018 16:35:39 +0200
Subject: iwlwifi: mvm: use the new get_tid function

This saves some typing and is overall more readable.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c |  7 +++----
 drivers/net/wireless/intel/iwlwifi/mvm/tx.c   | 11 ++++-------
 2 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
index f9cd7575b422..34791628cfb3 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
@@ -112,7 +112,7 @@ static inline int iwl_mvm_check_pn(struct iwl_mvm *mvm, struct sk_buff *skb,
 		return -1;
 
 	if (ieee80211_is_data_qos(hdr->frame_control))
-		tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
+		tid = ieee80211_get_tid(hdr);
 	else
 		tid = 0;
 
@@ -347,8 +347,7 @@ static bool iwl_mvm_is_dup(struct ieee80211_sta *sta, int queue,
 
 	if (ieee80211_is_data_qos(hdr->frame_control))
 		/* frame has qos control */
-		tid = *ieee80211_get_qos_ctl(hdr) &
-			IEEE80211_QOS_CTL_TID_MASK;
+		tid = ieee80211_get_tid(hdr);
 	else
 		tid = IWL_MAX_TID_COUNT;
 
@@ -628,7 +627,7 @@ static bool iwl_mvm_reorder(struct iwl_mvm *mvm,
 	bool amsdu = desc->mac_flags2 & IWL_RX_MPDU_MFLG2_AMSDU;
 	bool last_subframe =
 		desc->amsdu_info & IWL_RX_MPDU_AMSDU_LAST_SUBFRAME;
-	u8 tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
+	u8 tid = ieee80211_get_tid(hdr);
 	u8 sub_frame_idx = desc->amsdu_info &
 			   IWL_RX_MPDU_AMSDU_SUBFRAME_IDX_MASK;
 	struct iwl_mvm_reorder_buf_entry *entries;
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
index 90381348697c..d0916f2552e2 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
@@ -767,7 +767,7 @@ static int iwl_mvm_tx_tso(struct iwl_mvm *mvm, struct sk_buff *skb,
 	u16 snap_ip_tcp, pad;
 	unsigned int dbg_max_amsdu_len;
 	netdev_features_t netdev_flags = NETIF_F_CSUM_MASK | NETIF_F_SG;
-	u8 *qc, tid, txf;
+	u8 tid, txf;
 
 	snap_ip_tcp = 8 + skb_transport_header(skb) - skb_network_header(skb) +
 		tcp_hdrlen(skb);
@@ -790,8 +790,7 @@ static int iwl_mvm_tx_tso(struct iwl_mvm *mvm, struct sk_buff *skb,
 		return iwl_mvm_tx_tso_segment(skb, 1, netdev_flags, mpdus_skb);
 	}
 
-	qc = ieee80211_get_qos_ctl(hdr);
-	tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
+	tid = ieee80211_get_tid(hdr);
 	if (WARN_ON_ONCE(tid >= IWL_MAX_TID_COUNT))
 		return -EINVAL;
 
@@ -846,7 +845,7 @@ static int iwl_mvm_tx_tso(struct iwl_mvm *mvm, struct sk_buff *skb,
 	 */
 	num_subframes = (max_amsdu_len + pad) / (subf_len + pad);
 	if (num_subframes > 1)
-		*qc |= IEEE80211_QOS_CTL_A_MSDU_PRESENT;
+		*ieee80211_get_qos_ctl(hdr) |= IEEE80211_QOS_CTL_A_MSDU_PRESENT;
 
 	tcp_payload_len = skb_tail_pointer(skb) - skb_transport_header(skb) -
 		tcp_hdrlen(skb) + skb->data_len;
@@ -1007,9 +1006,7 @@ static int iwl_mvm_tx_mpdu(struct iwl_mvm *mvm, struct sk_buff *skb,
 	 * assignment of MGMT TID
 	 */
 	if (ieee80211_is_data_qos(fc) && !ieee80211_is_qos_nullfunc(fc)) {
-		u8 *qc = NULL;
-		qc = ieee80211_get_qos_ctl(hdr);
-		tid = qc[0] & IEEE80211_QOS_CTL_TID_MASK;
+		tid = ieee80211_get_tid(hdr);
 		if (WARN_ON_ONCE(tid >= IWL_MAX_TID_COUNT))
 			goto drop_unlock_sta;
 
-- 
cgit v1.2.3-59-g8ed1b


From 3506832b098850a8aafee19512e79c33debab380 Mon Sep 17 00:00:00 2001
From: Shaul Triebitz <shaul.triebitz@intel.com>
Date: Sun, 28 Jan 2018 18:27:52 +0200
Subject: iwlwifi: pcie: gen2: fix race in cmd fifo write ptr

Avoid a race where two (or more) commands get the
same index:

1. T1 calls enqueue_hcmd and the local TFD index is assigned to
   txq->write_ptr;
2. Context switch 'before incrementing txq->write_ptr';
3. T2 calls enqueue_hcmd and the local TFD index is assigned to
   txq->write_ptr;
4. Now the index is set to the same value for both commands of T1 and
   T2.

To prevent this from happening, set the local TFD index inside the
critical section (the index is set by global txq write pointer).

Signed-off-by: Shaul Triebitz <shaul.triebitz@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c
index a235dda1f70b..48890a1c825f 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c
@@ -569,15 +569,13 @@ static int iwl_pcie_gen2_enqueue_hcmd(struct iwl_trans *trans,
 	unsigned long flags;
 	void *dup_buf = NULL;
 	dma_addr_t phys_addr;
-	int i, cmd_pos, idx = iwl_pcie_get_cmd_index(txq, txq->write_ptr);
+	int i, cmd_pos, idx;
 	u16 copy_size, cmd_size, tb0_size;
 	bool had_nocopy = false;
 	u8 group_id = iwl_cmd_groupid(cmd->id);
 	const u8 *cmddata[IWL_MAX_CMD_TBS_PER_TFD];
 	u16 cmdlen[IWL_MAX_CMD_TBS_PER_TFD];
-	struct iwl_tfh_tfd *tfd = iwl_pcie_get_tfd(trans, txq, txq->write_ptr);
-
-	memset(tfd, 0, sizeof(*tfd));
+	struct iwl_tfh_tfd *tfd;
 
 	copy_size = sizeof(struct iwl_cmd_header_wide);
 	cmd_size = sizeof(struct iwl_cmd_header_wide);
@@ -648,6 +646,10 @@ static int iwl_pcie_gen2_enqueue_hcmd(struct iwl_trans *trans,
 
 	spin_lock_bh(&txq->lock);
 
+	idx = iwl_pcie_get_cmd_index(txq, txq->write_ptr);
+	tfd = iwl_pcie_get_tfd(trans, txq, txq->write_ptr);
+	memset(tfd, 0, sizeof(*tfd));
+
 	if (iwl_queue_space(txq) < ((cmd->flags & CMD_ASYNC) ? 2 : 1)) {
 		spin_unlock_bh(&txq->lock);
 
-- 
cgit v1.2.3-59-g8ed1b


From 4883145a8e17fd0c232a80dbc212eaebf57b061d Mon Sep 17 00:00:00 2001
From: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Date: Mon, 29 Jan 2018 10:00:05 +0200
Subject: iwlwifi: mvm: set the MFP flag for keys that are used by MFP stations

22000 devices rely on this flag to install the key to the right
queues.  For earlier devices we didn't have a key / queue mapping and
the key was sent along with the Tx command for each Tx hence the
problem didn't arise.

Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/sta.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c
index 80067eb9ea05..4f9d3e13d7e5 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c
@@ -8,6 +8,7 @@
  * Copyright(c) 2012 - 2015 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH
  * Copyright(c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -2887,7 +2888,7 @@ static int iwl_mvm_send_sta_key(struct iwl_mvm *mvm,
 				u32 sta_id,
 				struct ieee80211_key_conf *key, bool mcast,
 				u32 tkip_iv32, u16 *tkip_p1k, u32 cmd_flags,
-				u8 key_offset)
+				u8 key_offset, bool mfp)
 {
 	union {
 		struct iwl_mvm_add_sta_key_cmd_v1 cmd_v1;
@@ -2960,6 +2961,8 @@ static int iwl_mvm_send_sta_key(struct iwl_mvm *mvm,
 
 	if (mcast)
 		key_flags |= cpu_to_le16(STA_KEY_MULTICAST);
+	if (mfp)
+		key_flags |= cpu_to_le16(STA_KEY_MFP);
 
 	u.cmd.common.key_offset = key_offset;
 	u.cmd.common.key_flags = key_flags;
@@ -3101,11 +3104,13 @@ static int __iwl_mvm_set_sta_key(struct iwl_mvm *mvm,
 	struct ieee80211_key_seq seq;
 	u16 p1k[5];
 	u32 sta_id;
+	bool mfp = false;
 
 	if (sta) {
 		struct iwl_mvm_sta *mvm_sta = iwl_mvm_sta_from_mac80211(sta);
 
 		sta_id = mvm_sta->sta_id;
+		mfp = sta->mfp;
 	} else if (vif->type == NL80211_IFTYPE_AP &&
 		   !(keyconf->flags & IEEE80211_KEY_FLAG_PAIRWISE)) {
 		struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif);
@@ -3127,7 +3132,8 @@ static int __iwl_mvm_set_sta_key(struct iwl_mvm *mvm,
 		ieee80211_get_key_rx_seq(keyconf, 0, &seq);
 		ieee80211_get_tkip_rx_p1k(keyconf, addr, seq.tkip.iv32, p1k);
 		ret = iwl_mvm_send_sta_key(mvm, sta_id, keyconf, mcast,
-					   seq.tkip.iv32, p1k, 0, key_offset);
+					   seq.tkip.iv32, p1k, 0, key_offset,
+					   mfp);
 		break;
 	case WLAN_CIPHER_SUITE_CCMP:
 	case WLAN_CIPHER_SUITE_WEP40:
@@ -3135,11 +3141,11 @@ static int __iwl_mvm_set_sta_key(struct iwl_mvm *mvm,
 	case WLAN_CIPHER_SUITE_GCMP:
 	case WLAN_CIPHER_SUITE_GCMP_256:
 		ret = iwl_mvm_send_sta_key(mvm, sta_id, keyconf, mcast,
-					   0, NULL, 0, key_offset);
+					   0, NULL, 0, key_offset, mfp);
 		break;
 	default:
 		ret = iwl_mvm_send_sta_key(mvm, sta_id, keyconf, mcast,
-					   0, NULL, 0, key_offset);
+					   0, NULL, 0, key_offset, mfp);
 	}
 
 	return ret;
@@ -3366,6 +3372,7 @@ void iwl_mvm_update_tkip_key(struct iwl_mvm *mvm,
 {
 	struct iwl_mvm_sta *mvm_sta;
 	bool mcast = !(keyconf->flags & IEEE80211_KEY_FLAG_PAIRWISE);
+	bool mfp = sta ? sta->mfp : false;
 
 	rcu_read_lock();
 
@@ -3373,7 +3380,8 @@ void iwl_mvm_update_tkip_key(struct iwl_mvm *mvm,
 	if (WARN_ON_ONCE(!mvm_sta))
 		goto unlock;
 	iwl_mvm_send_sta_key(mvm, mvm_sta->sta_id, keyconf, mcast,
-			     iv32, phase1key, CMD_ASYNC, keyconf->hw_key_idx);
+			     iv32, phase1key, CMD_ASYNC, keyconf->hw_key_idx,
+			     mfp);
 
  unlock:
 	rcu_read_unlock();
-- 
cgit v1.2.3-59-g8ed1b


From 2c2b4bbc5d1f5c37e16d108f7a0c4e2a36c4f423 Mon Sep 17 00:00:00 2001
From: Naftali Goldstein <naftali.goldstein@intel.com>
Date: Mon, 15 Jan 2018 12:32:30 +0200
Subject: iwlwifi: mvm: update rs-fw API

Update rs-fw API to match changes in FW.  Specifically, the
TLC_MNG_NOTIF_REQ_CMD command and TLC_MNG_AMSDU_ENABLE_NOTIF
notification are removed, the A-MSDU related info is received from FW
via the TLC_MNG_UPDATE_NOTIF, and the TLC_MNG_CONFIG_CMD uses version
2 of its data structure.

Additionally, constify some arguments in a couple of functions.

Signed-off-by: Naftali Goldstein <naftali.goldstein@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 .../net/wireless/intel/iwlwifi/fw/api/datapath.h   |  10 --
 drivers/net/wireless/intel/iwlwifi/fw/api/rs.h     | 168 +++++++--------------
 drivers/net/wireless/intel/iwlwifi/iwl-config.h    |   2 +
 drivers/net/wireless/intel/iwlwifi/mvm/ops.c       |   3 -
 drivers/net/wireless/intel/iwlwifi/mvm/rs-fw.c     | 143 +++++++-----------
 drivers/net/wireless/intel/iwlwifi/mvm/rs.h        |   2 -
 drivers/net/wireless/intel/iwlwifi/mvm/utils.c     |   4 +-
 7 files changed, 114 insertions(+), 218 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/fw/api/datapath.h b/drivers/net/wireless/intel/iwlwifi/fw/api/datapath.h
index 184cee98c359..5f6e855006dd 100644
--- a/drivers/net/wireless/intel/iwlwifi/fw/api/datapath.h
+++ b/drivers/net/wireless/intel/iwlwifi/fw/api/datapath.h
@@ -87,16 +87,6 @@ enum iwl_data_path_subcmd_ids {
 	 */
 	TLC_MNG_CONFIG_CMD = 0xF,
 
-	/**
-	 * @TLC_MNG_NOTIF_REQ_CMD: &struct iwl_tlc_notif_req_config_cmd
-	 */
-	TLC_MNG_NOTIF_REQ_CMD = 0x10,
-
-	/**
-	 * @TLC_MNG_AMSDU_ENABLE_NOTIF: &struct iwl_tlc_amsdu_notif
-	 */
-	TLC_MNG_AMSDU_ENABLE_NOTIF = 0xF6,
-
 	/**
 	 * @TLC_MNG_UPDATE_NOTIF: &struct iwl_tlc_update_notif
 	 */
diff --git a/drivers/net/wireless/intel/iwlwifi/fw/api/rs.h b/drivers/net/wireless/intel/iwlwifi/fw/api/rs.h
index 1a8f3154a1fe..21e13a315421 100644
--- a/drivers/net/wireless/intel/iwlwifi/fw/api/rs.h
+++ b/drivers/net/wireless/intel/iwlwifi/fw/api/rs.h
@@ -7,6 +7,7 @@
  *
  * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -28,6 +29,7 @@
  *
  * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -64,62 +66,38 @@
 
 /**
  * enum iwl_tlc_mng_cfg_flags_enum - options for TLC config flags
- * @IWL_TLC_MNG_CFG_FLAGS_CCK_MSK: CCK support
- * @IWL_TLC_MNG_CFG_FLAGS_DD_MSK: enable DD
  * @IWL_TLC_MNG_CFG_FLAGS_STBC_MSK: enable STBC
  * @IWL_TLC_MNG_CFG_FLAGS_LDPC_MSK: enable LDPC
- * @IWL_TLC_MNG_CFG_FLAGS_BF_MSK: enable BFER
- * @IWL_TLC_MNG_CFG_FLAGS_DCM_MSK: enable DCM
  */
 enum iwl_tlc_mng_cfg_flags {
-	IWL_TLC_MNG_CFG_FLAGS_CCK_MSK	= BIT(0),
-	IWL_TLC_MNG_CFG_FLAGS_DD_MSK	= BIT(1),
-	IWL_TLC_MNG_CFG_FLAGS_STBC_MSK	= BIT(2),
-	IWL_TLC_MNG_CFG_FLAGS_LDPC_MSK	= BIT(3),
-	IWL_TLC_MNG_CFG_FLAGS_BF_MSK	= BIT(4),
-	IWL_TLC_MNG_CFG_FLAGS_DCM_MSK	= BIT(5),
+	IWL_TLC_MNG_CFG_FLAGS_STBC_MSK		= BIT(0),
+	IWL_TLC_MNG_CFG_FLAGS_LDPC_MSK		= BIT(1),
 };
 
 /**
  * enum iwl_tlc_mng_cfg_cw - channel width options
- * @IWL_TLC_MNG_MAX_CH_WIDTH_20MHZ: 20MHZ channel
- * @IWL_TLC_MNG_MAX_CH_WIDTH_40MHZ: 40MHZ channel
- * @IWL_TLC_MNG_MAX_CH_WIDTH_80MHZ: 80MHZ channel
- * @IWL_TLC_MNG_MAX_CH_WIDTH_160MHZ: 160MHZ channel
- * @IWL_TLC_MNG_MAX_CH_WIDTH_LAST: maximum value
+ * @IWL_TLC_MNG_CH_WIDTH_20MHZ: 20MHZ channel
+ * @IWL_TLC_MNG_CH_WIDTH_40MHZ: 40MHZ channel
+ * @IWL_TLC_MNG_CH_WIDTH_80MHZ: 80MHZ channel
+ * @IWL_TLC_MNG_CH_WIDTH_160MHZ: 160MHZ channel
+ * @IWL_TLC_MNG_CH_WIDTH_LAST: maximum value
  */
 enum iwl_tlc_mng_cfg_cw {
-	IWL_TLC_MNG_MAX_CH_WIDTH_20MHZ,
-	IWL_TLC_MNG_MAX_CH_WIDTH_40MHZ,
-	IWL_TLC_MNG_MAX_CH_WIDTH_80MHZ,
-	IWL_TLC_MNG_MAX_CH_WIDTH_160MHZ,
-	IWL_TLC_MNG_MAX_CH_WIDTH_LAST = IWL_TLC_MNG_MAX_CH_WIDTH_160MHZ,
+	IWL_TLC_MNG_CH_WIDTH_20MHZ,
+	IWL_TLC_MNG_CH_WIDTH_40MHZ,
+	IWL_TLC_MNG_CH_WIDTH_80MHZ,
+	IWL_TLC_MNG_CH_WIDTH_160MHZ,
+	IWL_TLC_MNG_CH_WIDTH_LAST = IWL_TLC_MNG_CH_WIDTH_160MHZ,
 };
 
 /**
  * enum iwl_tlc_mng_cfg_chains - possible chains
  * @IWL_TLC_MNG_CHAIN_A_MSK: chain A
  * @IWL_TLC_MNG_CHAIN_B_MSK: chain B
- * @IWL_TLC_MNG_CHAIN_C_MSK: chain C
  */
 enum iwl_tlc_mng_cfg_chains {
 	IWL_TLC_MNG_CHAIN_A_MSK = BIT(0),
 	IWL_TLC_MNG_CHAIN_B_MSK = BIT(1),
-	IWL_TLC_MNG_CHAIN_C_MSK = BIT(2),
-};
-
-/**
- * enum iwl_tlc_mng_cfg_gi - guard interval options
- * @IWL_TLC_MNG_SGI_20MHZ_MSK: enable short GI for 20MHZ
- * @IWL_TLC_MNG_SGI_40MHZ_MSK: enable short GI for 40MHZ
- * @IWL_TLC_MNG_SGI_80MHZ_MSK: enable short GI for 80MHZ
- * @IWL_TLC_MNG_SGI_160MHZ_MSK: enable short GI for 160MHZ
- */
-enum iwl_tlc_mng_cfg_gi {
-	IWL_TLC_MNG_SGI_20MHZ_MSK  = BIT(0),
-	IWL_TLC_MNG_SGI_40MHZ_MSK  = BIT(1),
-	IWL_TLC_MNG_SGI_80MHZ_MSK  = BIT(2),
-	IWL_TLC_MNG_SGI_160MHZ_MSK = BIT(3),
 };
 
 /**
@@ -145,25 +123,7 @@ enum iwl_tlc_mng_cfg_mode {
 };
 
 /**
- * enum iwl_tlc_mng_vht_he_types - VHT HE types
- * @IWL_TLC_MNG_VALID_VHT_HE_TYPES_SU: VHT HT single user
- * @IWL_TLC_MNG_VALID_VHT_HE_TYPES_SU_EXT: VHT HT single user extended
- * @IWL_TLC_MNG_VALID_VHT_HE_TYPES_MU: VHT HT multiple users
- * @IWL_TLC_MNG_VALID_VHT_HE_TYPES_TRIG_BASED: trigger based
- * @IWL_TLC_MNG_VALID_VHT_HE_TYPES_NUM: a count of possible types
- */
-enum iwl_tlc_mng_vht_he_types {
-	IWL_TLC_MNG_VALID_VHT_HE_TYPES_SU = 0,
-	IWL_TLC_MNG_VALID_VHT_HE_TYPES_SU_EXT,
-	IWL_TLC_MNG_VALID_VHT_HE_TYPES_MU,
-	IWL_TLC_MNG_VALID_VHT_HE_TYPES_TRIG_BASED,
-	IWL_TLC_MNG_VALID_VHT_HE_TYPES_NUM =
-		IWL_TLC_MNG_VALID_VHT_HE_TYPES_TRIG_BASED,
-
-};
-
-/**
- * enum iwl_tlc_mng_ht_rates - HT/VHT rates
+ * enum iwl_tlc_mng_ht_rates - HT/VHT/HE rates
  * @IWL_TLC_MNG_HT_RATE_MCS0: index of MCS0
  * @IWL_TLC_MNG_HT_RATE_MCS1: index of MCS1
  * @IWL_TLC_MNG_HT_RATE_MCS2: index of MCS2
@@ -174,6 +134,8 @@ enum iwl_tlc_mng_vht_he_types {
  * @IWL_TLC_MNG_HT_RATE_MCS7: index of MCS7
  * @IWL_TLC_MNG_HT_RATE_MCS8: index of MCS8
  * @IWL_TLC_MNG_HT_RATE_MCS9: index of MCS9
+ * @IWL_TLC_MNG_HT_RATE_MCS10: index of MCS10
+ * @IWL_TLC_MNG_HT_RATE_MCS11: index of MCS11
  * @IWL_TLC_MNG_HT_RATE_MAX: maximal rate for HT/VHT
  */
 enum iwl_tlc_mng_ht_rates {
@@ -187,95 +149,73 @@ enum iwl_tlc_mng_ht_rates {
 	IWL_TLC_MNG_HT_RATE_MCS7,
 	IWL_TLC_MNG_HT_RATE_MCS8,
 	IWL_TLC_MNG_HT_RATE_MCS9,
-	IWL_TLC_MNG_HT_RATE_MAX = IWL_TLC_MNG_HT_RATE_MCS9,
+	IWL_TLC_MNG_HT_RATE_MCS10,
+	IWL_TLC_MNG_HT_RATE_MCS11,
+	IWL_TLC_MNG_HT_RATE_MAX = IWL_TLC_MNG_HT_RATE_MCS11,
 };
 
 /* Maximum supported tx antennas number */
-#define MAX_RS_ANT_NUM 3
+#define MAX_NSS 2
 
 /**
  * struct tlc_config_cmd - TLC configuration
  * @sta_id: station id
  * @reserved1: reserved
- * @max_supp_ch_width: channel width
- * @flags: bitmask of &enum iwl_tlc_mng_cfg_flags
- * @chains: bitmask of &enum iwl_tlc_mng_cfg_chains
- * @max_supp_ss: valid values are 0-3, 0 - spatial streams are not supported
- * @valid_vht_he_types: bitmap of &enum iwl_tlc_mng_vht_he_types
- * @non_ht_supp_rates: bitmap of supported legacy rates
- * @ht_supp_rates: bitmap of supported HT/VHT rates, valid bits are 0-9
+ * @max_ch_width: max supported channel width from @enum iwl_tlc_mng_cfg_cw
  * @mode: &enum iwl_tlc_mng_cfg_mode
+ * @chains: bitmask of &enum iwl_tlc_mng_cfg_chains
  * @amsdu: TX amsdu is supported
- * @he_supp_rates: bitmap of supported HE rates
+ * @flags: bitmask of &enum iwl_tlc_mng_cfg_flags
+ * @non_ht_rates: bitmap of supported legacy rates
+ * @ht_rates: bitmap of &enum iwl_tlc_mng_ht_rates, per <nss, channel-width>
+ *	      pair (0 - 80mhz width and below, 1 - 160mhz).
+ * @max_mpdu_len: max MPDU length, in bytes
  * @sgi_ch_width_supp: bitmap of SGI support per channel width
- * @he_gi_support: 11ax HE guard interval
- * @max_ampdu_cnt: max AMPDU size (frames count)
+ *		       use BIT(@enum iwl_tlc_mng_cfg_cw)
+ * @reserved2: reserved
  */
 struct iwl_tlc_config_cmd {
 	u8 sta_id;
 	u8 reserved1[3];
-	u8 max_supp_ch_width;
-	u8 chains;
-	u8 max_supp_ss;
-	u8 valid_vht_he_types;
-	__le16 flags;
-	__le16 non_ht_supp_rates;
-	__le16 ht_supp_rates[MAX_RS_ANT_NUM];
+	u8 max_ch_width;
 	u8 mode;
+	u8 chains;
 	u8 amsdu;
-	__le16 he_supp_rates;
+	__le16 flags;
+	__le16 non_ht_rates;
+	__le16 ht_rates[MAX_NSS][2];
+	__le16 max_mpdu_len;
 	u8 sgi_ch_width_supp;
-	u8 he_gi_support;
-	__le32 max_ampdu_cnt;
-} __packed; /* TLC_MNG_CONFIG_CMD_API_S_VER_1 */
-
-/**
- * struct iwl_tlc_amsdu_notif - TLC AMSDU configuration
- * @sta_id: station id
- * @reserved: reserved
- * @amsdu_size: Max AMSDU size, in bytes
- * @amsdu_enabled: bitmap for per-TID AMSDU enablement
- */
-struct iwl_tlc_amsdu_notif {
-	u8 sta_id;
-	u8 reserved[3];
-	__le16 amsdu_size;
-	__le16 amsdu_enabled;
-} __packed; /* TLC_MNG_AMSDU_ENABLE_NTFY_API_S_VER_1 */
-
-#define IWL_TLC_NOTIF_INIT_RATE_POS 0
-#define IWL_TLC_NOTIF_INIT_RATE_MSK BIT(IWL_TLC_NOTIF_INIT_RATE_POS)
-#define IWL_TLC_NOTIF_REQ_INTERVAL (500)
+	u8 reserved2[1];
+} __packed; /* TLC_MNG_CONFIG_CMD_API_S_VER_2 */
 
 /**
- * struct iwl_tlc_notif_req_config_cmd - request notif on specific changes
- * @sta_id: relevant station
- * @reserved1: reserved
- * @flags: bitmap of requested notifications %IWL_TLC_NOTIF_INIT_\*
- * @interval: minimum time between notifications from TLC to the driver (msec)
- * @reserved2: reserved
+ * enum iwl_tlc_update_flags - updated fields
+ * @IWL_TLC_NOTIF_FLAG_RATE: last initial rate update
+ * @IWL_TLC_NOTIF_FLAG_AMSDU: umsdu parameters update
  */
-struct iwl_tlc_notif_req_config_cmd {
-	u8 sta_id;
-	u8 reserved1;
-	__le16 flags;
-	__le16 interval;
-	__le16 reserved2;
-} __packed; /* TLC_MNG_NOTIF_REQ_CMD_API_S_VER_1 */
+enum iwl_tlc_update_flags {
+	IWL_TLC_NOTIF_FLAG_RATE  = BIT(0),
+	IWL_TLC_NOTIF_FLAG_AMSDU = BIT(1),
+};
 
 /**
  * struct iwl_tlc_update_notif - TLC notification from FW
  * @sta_id: station id
  * @reserved: reserved
  * @flags: bitmap of notifications reported
- * @values: field per flag in struct iwl_tlc_notif_req_config_cmd
+ * @rate: current initial rate
+ * @amsdu_size: Max AMSDU size, in bytes
+ * @amsdu_enabled: bitmap for per-TID AMSDU enablement
  */
 struct iwl_tlc_update_notif {
 	u8 sta_id;
-	u8 reserved;
-	__le16 flags;
-	__le32 values[16];
-} __packed; /* TLC_MNG_UPDATE_NTFY_API_S_VER_1 */
+	u8 reserved[3];
+	__le32 flags;
+	__le32 rate;
+	__le32 amsdu_size;
+	__le32 amsdu_enabled;
+} __packed; /* TLC_MNG_UPDATE_NTFY_API_S_VER_2 */
 
 /**
  * enum iwl_tlc_debug_flags - debug options
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-config.h b/drivers/net/wireless/intel/iwlwifi/iwl-config.h
index 687bed30c851..14e69e7a2287 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-config.h
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-config.h
@@ -151,6 +151,8 @@ enum iwl_nvm_type {
 #define	ANT_AC		(ANT_A | ANT_C)
 #define ANT_BC		(ANT_B | ANT_C)
 #define ANT_ABC		(ANT_A | ANT_B | ANT_C)
+#define MAX_ANT_NUM 3
+
 
 static inline u8 num_of_ant(u8 mask)
 {
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c
index 1ef4ac21f32a..de46e6258a5c 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c
@@ -312,8 +312,6 @@ static const struct iwl_rx_handlers iwl_mvm_rx_handlers[] = {
 		       iwl_mvm_mu_mimo_grp_notif, RX_HANDLER_SYNC),
 	RX_HANDLER_GRP(DATA_PATH_GROUP, STA_PM_NOTIF,
 		       iwl_mvm_sta_pm_notif, RX_HANDLER_SYNC),
-	RX_HANDLER_GRP(DATA_PATH_GROUP, TLC_MNG_AMSDU_ENABLE_NOTIF,
-		       iwl_mvm_tlc_amsdu_notif, RX_HANDLER_SYNC),
 };
 #undef RX_HANDLER
 #undef RX_HANDLER_GRP
@@ -450,7 +448,6 @@ static const struct iwl_hcmd_names iwl_mvm_data_path_names[] = {
 	HCMD_NAME(DQA_ENABLE_CMD),
 	HCMD_NAME(UPDATE_MU_GROUPS_CMD),
 	HCMD_NAME(TRIGGER_RX_QUEUES_NOTIF_CMD),
-	HCMD_NAME(TLC_MNG_AMSDU_ENABLE_NOTIF),
 	HCMD_NAME(STA_PM_NOTIF),
 	HCMD_NAME(MU_GROUP_MGMT_NOTIF),
 	HCMD_NAME(RX_QUEUES_NOTIFICATION),
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs-fw.c b/drivers/net/wireless/intel/iwlwifi/mvm/rs-fw.c
index 4e818bce469b..b8b2b819e8e7 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rs-fw.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs-fw.c
@@ -6,6 +6,7 @@
  * GPL LICENSE SUMMARY
  *
  * Copyright(c) 2017        Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -26,6 +27,7 @@
  * BSD LICENSE
  *
  * Copyright(c) 2017        Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -65,14 +67,14 @@ static u8 rs_fw_bw_from_sta_bw(struct ieee80211_sta *sta)
 {
 	switch (sta->bandwidth) {
 	case IEEE80211_STA_RX_BW_160:
-		return IWL_TLC_MNG_MAX_CH_WIDTH_160MHZ;
+		return IWL_TLC_MNG_CH_WIDTH_160MHZ;
 	case IEEE80211_STA_RX_BW_80:
-		return IWL_TLC_MNG_MAX_CH_WIDTH_80MHZ;
+		return IWL_TLC_MNG_CH_WIDTH_80MHZ;
 	case IEEE80211_STA_RX_BW_40:
-		return IWL_TLC_MNG_MAX_CH_WIDTH_40MHZ;
+		return IWL_TLC_MNG_CH_WIDTH_40MHZ;
 	case IEEE80211_STA_RX_BW_20:
 	default:
-		return IWL_TLC_MNG_MAX_CH_WIDTH_20MHZ;
+		return IWL_TLC_MNG_CH_WIDTH_20MHZ;
 	}
 }
 
@@ -85,7 +87,9 @@ static u8 rs_fw_set_active_chains(u8 chains)
 	if (chains & ANT_B)
 		fw_chains |= IWL_TLC_MNG_CHAIN_B_MSK;
 	if (chains & ANT_C)
-		fw_chains |= IWL_TLC_MNG_CHAIN_C_MSK;
+		WARN(false,
+		     "tlc offload doesn't support antenna C. chains: 0x%x\n",
+		     chains);
 
 	return fw_chains;
 }
@@ -97,13 +101,13 @@ static u8 rs_fw_sgi_cw_support(struct ieee80211_sta *sta)
 	u8 supp = 0;
 
 	if (ht_cap->cap & IEEE80211_HT_CAP_SGI_20)
-		supp |= IWL_TLC_MNG_SGI_20MHZ_MSK;
+		supp |= BIT(IWL_TLC_MNG_CH_WIDTH_20MHZ);
 	if (ht_cap->cap & IEEE80211_HT_CAP_SGI_40)
-		supp |= IWL_TLC_MNG_SGI_40MHZ_MSK;
+		supp |= BIT(IWL_TLC_MNG_CH_WIDTH_40MHZ);
 	if (vht_cap->cap & IEEE80211_VHT_CAP_SHORT_GI_80)
-		supp |= IWL_TLC_MNG_SGI_80MHZ_MSK;
+		supp |= BIT(IWL_TLC_MNG_CH_WIDTH_80MHZ);
 	if (vht_cap->cap & IEEE80211_VHT_CAP_SHORT_GI_160)
-		supp |= IWL_TLC_MNG_SGI_160MHZ_MSK;
+		supp |= BIT(IWL_TLC_MNG_CH_WIDTH_160MHZ);
 
 	return supp;
 }
@@ -114,9 +118,7 @@ static u16 rs_fw_set_config_flags(struct iwl_mvm *mvm,
 	struct ieee80211_sta_ht_cap *ht_cap = &sta->ht_cap;
 	struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap;
 	bool vht_ena = vht_cap && vht_cap->vht_supported;
-	u16 flags = IWL_TLC_MNG_CFG_FLAGS_CCK_MSK |
-		    IWL_TLC_MNG_CFG_FLAGS_DCM_MSK |
-		    IWL_TLC_MNG_CFG_FLAGS_DD_MSK;
+	u16 flags = 0;
 
 	if (mvm->cfg->ht_params->stbc &&
 	    (num_of_ant(iwl_mvm_get_valid_tx_ant(mvm)) > 1) &&
@@ -129,16 +131,11 @@ static u16 rs_fw_set_config_flags(struct iwl_mvm *mvm,
 	     (vht_ena && (vht_cap->cap & IEEE80211_VHT_CAP_RXLDPC))))
 		flags |= IWL_TLC_MNG_CFG_FLAGS_LDPC_MSK;
 
-	if (fw_has_capa(&mvm->fw->ucode_capa, IWL_UCODE_TLV_CAPA_BEAMFORMER) &&
-	    (num_of_ant(iwl_mvm_get_valid_tx_ant(mvm)) > 1) &&
-	    (vht_cap->cap & IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE))
-		flags |= IWL_TLC_MNG_CFG_FLAGS_BF_MSK;
-
 	return flags;
 }
 
 static
-int rs_fw_vht_highest_rx_mcs_index(struct ieee80211_sta_vht_cap *vht_cap,
+int rs_fw_vht_highest_rx_mcs_index(const struct ieee80211_sta_vht_cap *vht_cap,
 				   int nss)
 {
 	u16 rx_mcs = le16_to_cpu(vht_cap->vht_mcs.rx_mcs_map) &
@@ -160,15 +157,16 @@ int rs_fw_vht_highest_rx_mcs_index(struct ieee80211_sta_vht_cap *vht_cap,
 	return 0;
 }
 
-static void rs_fw_vht_set_enabled_rates(struct ieee80211_sta *sta,
-					struct ieee80211_sta_vht_cap *vht_cap,
-					struct iwl_tlc_config_cmd *cmd)
+static void
+rs_fw_vht_set_enabled_rates(const struct ieee80211_sta *sta,
+			    const struct ieee80211_sta_vht_cap *vht_cap,
+			    struct iwl_tlc_config_cmd *cmd)
 {
 	u16 supp;
 	int i, highest_mcs;
 
 	for (i = 0; i < sta->rx_nss; i++) {
-		if (i == MAX_RS_ANT_NUM)
+		if (i == MAX_NSS)
 			break;
 
 		highest_mcs = rs_fw_vht_highest_rx_mcs_index(vht_cap, i + 1);
@@ -179,7 +177,9 @@ static void rs_fw_vht_set_enabled_rates(struct ieee80211_sta *sta,
 		if (sta->bandwidth == IEEE80211_STA_RX_BW_20)
 			supp &= ~BIT(IWL_TLC_MNG_HT_RATE_MCS9);
 
-		cmd->ht_supp_rates[i] = cpu_to_le16(supp);
+		cmd->ht_rates[i][0] = cpu_to_le16(supp);
+		if (sta->bandwidth == IEEE80211_STA_RX_BW_160)
+			cmd->ht_rates[i][1] = cmd->ht_rates[i][0];
 	}
 }
 
@@ -190,8 +190,8 @@ static void rs_fw_set_supp_rates(struct ieee80211_sta *sta,
 	int i;
 	unsigned long tmp;
 	unsigned long supp; /* must be unsigned long for for_each_set_bit */
-	struct ieee80211_sta_ht_cap *ht_cap = &sta->ht_cap;
-	struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap;
+	const struct ieee80211_sta_ht_cap *ht_cap = &sta->ht_cap;
+	const struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap;
 
 	/* non HT rates */
 	supp = 0;
@@ -199,99 +199,71 @@ static void rs_fw_set_supp_rates(struct ieee80211_sta *sta,
 	for_each_set_bit(i, &tmp, BITS_PER_LONG)
 		supp |= BIT(sband->bitrates[i].hw_value);
 
-	cmd->non_ht_supp_rates = cpu_to_le16(supp);
+	cmd->non_ht_rates = cpu_to_le16(supp);
 	cmd->mode = IWL_TLC_MNG_MODE_NON_HT;
 
-	/* HT/VHT rates */
 	if (vht_cap && vht_cap->vht_supported) {
 		cmd->mode = IWL_TLC_MNG_MODE_VHT;
 		rs_fw_vht_set_enabled_rates(sta, vht_cap, cmd);
 	} else if (ht_cap && ht_cap->ht_supported) {
 		cmd->mode = IWL_TLC_MNG_MODE_HT;
-		cmd->ht_supp_rates[0] = cpu_to_le16(ht_cap->mcs.rx_mask[0]);
-		cmd->ht_supp_rates[1] = cpu_to_le16(ht_cap->mcs.rx_mask[1]);
+		cmd->ht_rates[0][0] = cpu_to_le16(ht_cap->mcs.rx_mask[0]);
+		cmd->ht_rates[1][0] = cpu_to_le16(ht_cap->mcs.rx_mask[1]);
 	}
 }
 
-static void rs_fw_tlc_mng_notif_req_config(struct iwl_mvm *mvm, u8 sta_id)
-{
-	u32 cmd_id = iwl_cmd_id(TLC_MNG_NOTIF_REQ_CMD, DATA_PATH_GROUP, 0);
-	struct iwl_tlc_notif_req_config_cmd cfg_cmd = {
-		.sta_id = sta_id,
-		.flags = cpu_to_le16(IWL_TLC_NOTIF_INIT_RATE_MSK),
-		.interval = cpu_to_le16(IWL_TLC_NOTIF_REQ_INTERVAL),
-	};
-	int ret;
-
-	ret = iwl_mvm_send_cmd_pdu(mvm, cmd_id, 0, sizeof(cfg_cmd), &cfg_cmd);
-	if (ret)
-		IWL_ERR(mvm, "Failed to send TLC notif request (%d)\n", ret);
-}
-
-void iwl_mvm_tlc_amsdu_notif(struct iwl_mvm *mvm, struct iwl_rx_cmd_buffer *rxb)
+void iwl_mvm_tlc_update_notif(struct iwl_mvm *mvm,
+			      struct iwl_rx_cmd_buffer *rxb)
 {
 	struct iwl_rx_packet *pkt = rxb_addr(rxb);
-	struct iwl_tlc_amsdu_notif *notif;
+	struct iwl_tlc_update_notif *notif;
 	struct ieee80211_sta *sta;
 	struct iwl_mvm_sta *mvmsta;
-	u16 size;
-
-	notif = (void *)pkt->data;
-
-	if (WARN_ON(notif->sta_id >= ARRAY_SIZE(mvm->fw_id_to_mac_id)))
-		return;
+	struct iwl_lq_sta_rs_fw *lq_sta;
+	u32 flags;
 
 	rcu_read_lock();
 
+	notif = (void *)pkt->data;
 	sta = rcu_dereference(mvm->fw_id_to_mac_id[notif->sta_id]);
 	if (IS_ERR_OR_NULL(sta)) {
-		rcu_read_unlock();
 		IWL_ERR(mvm, "Invalid sta id (%d) in FW TLC notification\n",
 			notif->sta_id);
-		return;
+		goto out;
 	}
 
 	mvmsta = iwl_mvm_sta_from_mac80211(sta);
 
-	size = min(le16_to_cpu(notif->amsdu_size), sta->max_amsdu_len);
-	mvmsta->amsdu_enabled = le16_to_cpu(notif->amsdu_enabled);
-	mvmsta->max_amsdu_len = size;
-
-	IWL_DEBUG_RATE(mvm,
-		       "AMSDU notification. AMSDU size: %d, AMSDU selected size: %d, AMSDU TID bitmap 0x%X\n",
-		       le16_to_cpu(notif->amsdu_size), size,
-		       mvmsta->amsdu_enabled);
-
-	rcu_read_unlock();
-};
-
-void iwl_mvm_tlc_update_notif(struct iwl_mvm *mvm,
-			      struct iwl_rx_cmd_buffer *rxb)
-{
-	struct iwl_rx_packet *pkt = rxb_addr(rxb);
-	struct iwl_tlc_update_notif *notif;
-	struct iwl_mvm_sta *mvmsta;
-	struct iwl_lq_sta_rs_fw *lq_sta;
-
-	rcu_read_lock();
-
-	notif = (void *)pkt->data;
-	mvmsta = iwl_mvm_sta_from_staid_rcu(mvm, notif->sta_id);
-
 	if (!mvmsta) {
 		IWL_ERR(mvm, "Invalid sta id (%d) in FW TLC notification\n",
 			notif->sta_id);
 		goto out;
 	}
 
+	flags = le32_to_cpu(notif->flags);
+
 	lq_sta = &mvmsta->lq_sta.rs_fw;
 
-	if (le16_to_cpu(notif->flags) & IWL_TLC_NOTIF_INIT_RATE_MSK) {
-		lq_sta->last_rate_n_flags =
-			le32_to_cpu(notif->values[IWL_TLC_NOTIF_INIT_RATE_POS]);
+	if (flags & IWL_TLC_NOTIF_FLAG_RATE) {
+		lq_sta->last_rate_n_flags = le32_to_cpu(notif->rate);
 		IWL_DEBUG_RATE(mvm, "new rate_n_flags: 0x%X\n",
 			       lq_sta->last_rate_n_flags);
 	}
+
+	if (flags & IWL_TLC_NOTIF_FLAG_AMSDU) {
+		u16 size = le32_to_cpu(notif->amsdu_size);
+
+		if (WARN_ON(sta->max_amsdu_len < size))
+			goto out;
+
+		mvmsta->amsdu_enabled = le32_to_cpu(notif->amsdu_enabled);
+		mvmsta->max_amsdu_len = size;
+
+		IWL_DEBUG_RATE(mvm,
+			       "AMSDU update. AMSDU size: %d, AMSDU selected size: %d, AMSDU TID bitmap 0x%X\n",
+			       le32_to_cpu(notif->amsdu_size), size,
+			       mvmsta->amsdu_enabled);
+	}
 out:
 	rcu_read_unlock();
 }
@@ -306,11 +278,10 @@ void rs_fw_rate_init(struct iwl_mvm *mvm, struct ieee80211_sta *sta,
 	struct ieee80211_supported_band *sband;
 	struct iwl_tlc_config_cmd cfg_cmd = {
 		.sta_id = mvmsta->sta_id,
-		.max_supp_ch_width = rs_fw_bw_from_sta_bw(sta),
+		.max_ch_width = rs_fw_bw_from_sta_bw(sta),
 		.flags = cpu_to_le16(rs_fw_set_config_flags(mvm, sta)),
 		.chains = rs_fw_set_active_chains(iwl_mvm_get_valid_tx_ant(mvm)),
-		.max_supp_ss = sta->rx_nss,
-		.max_ampdu_cnt = cpu_to_le32(mvmsta->max_agg_bufsize),
+		.max_mpdu_len = cpu_to_le16(sta->max_amsdu_len),
 		.sgi_ch_width_supp = rs_fw_sgi_cw_support(sta),
 		.amsdu = iwl_mvm_is_csum_supported(mvm),
 	};
@@ -327,8 +298,6 @@ void rs_fw_rate_init(struct iwl_mvm *mvm, struct ieee80211_sta *sta,
 	ret = iwl_mvm_send_cmd_pdu(mvm, cmd_id, 0, sizeof(cfg_cmd), &cfg_cmd);
 	if (ret)
 		IWL_ERR(mvm, "Failed to send rate scale config (%d)\n", ret);
-
-	rs_fw_tlc_mng_notif_req_config(mvm, cfg_cmd.sta_id);
 }
 
 int rs_fw_tx_protection(struct iwl_mvm *mvm, struct iwl_mvm_sta *mvmsta,
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs.h b/drivers/net/wireless/intel/iwlwifi/mvm/rs.h
index 736ac9642921..5e89141656c0 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rs.h
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs.h
@@ -456,6 +456,4 @@ int rs_fw_tx_protection(struct iwl_mvm *mvm, struct iwl_mvm_sta *mvmsta,
 			bool enable);
 void iwl_mvm_tlc_update_notif(struct iwl_mvm *mvm,
 			      struct iwl_rx_cmd_buffer *rxb);
-void iwl_mvm_tlc_amsdu_notif(struct iwl_mvm *mvm,
-			     struct iwl_rx_cmd_buffer *rxb);
 #endif /* __rs__ */
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
index e597bc193e5c..0497c7a44def 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
@@ -278,8 +278,8 @@ u8 iwl_mvm_next_antenna(struct iwl_mvm *mvm, u8 valid, u8 last_idx)
 	u8 ind = last_idx;
 	int i;
 
-	for (i = 0; i < MAX_RS_ANT_NUM; i++) {
-		ind = (ind + 1) % MAX_RS_ANT_NUM;
+	for (i = 0; i < MAX_ANT_NUM; i++) {
+		ind = (ind + 1) % MAX_ANT_NUM;
 		if (valid & BIT(ind))
 			return ind;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 4ae80f6c8d86c415527b8bfff7c85ebe6268fc9c Mon Sep 17 00:00:00 2001
From: Liad Kaufman <liad.kaufman@intel.com>
Date: Mon, 14 Aug 2017 13:06:48 +0300
Subject: iwlwifi: support api ver2 of NVM_GET_INFO resp

NVM_GET_INFO API has changed to support indication
of 11ax support.

Signed-off-by: Liad Kaufman <liad.kaufman@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 .../net/wireless/intel/iwlwifi/fw/api/nvm-reg.h    | 42 ++++++++++++++--------
 drivers/net/wireless/intel/iwlwifi/fw/nvm.c        | 14 +++++---
 .../net/wireless/intel/iwlwifi/iwl-eeprom-parse.h  |  1 +
 3 files changed, 38 insertions(+), 19 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/fw/api/nvm-reg.h b/drivers/net/wireless/intel/iwlwifi/fw/api/nvm-reg.h
index 37c57bcbfb4a..8d6dc9189985 100644
--- a/drivers/net/wireless/intel/iwlwifi/fw/api/nvm-reg.h
+++ b/drivers/net/wireless/intel/iwlwifi/fw/api/nvm-reg.h
@@ -189,23 +189,37 @@ struct iwl_nvm_get_info_general {
 	u8 reserved;
 } __packed; /* GRP_REGULATORY_NVM_GET_INFO_GENERAL_S_VER_1 */
 
+/**
+ * enum iwl_nvm_mac_sku_flags - flags in &iwl_nvm_get_info_sku
+ * @NVM_MAC_SKU_FLAGS_BAND_2_4_ENABLED: true if 2.4 band enabled
+ * @NVM_MAC_SKU_FLAGS_BAND_5_2_ENABLED: true if 5.2 band enabled
+ * @NVM_MAC_SKU_FLAGS_802_11N_ENABLED: true if 11n enabled
+ * @NVM_MAC_SKU_FLAGS_802_11AC_ENABLED: true if 11ac enabled
+ * @NVM_MAC_SKU_FLAGS_802_11AX_ENABLED: true if 11ax enabled
+ * @NVM_MAC_SKU_FLAGS_MIMO_DISABLED: true if MIMO disabled
+ * @NVM_MAC_SKU_FLAGS_WAPI_ENABLED: true if WAPI enabled
+ * @NVM_MAC_SKU_FLAGS_REG_CHECK_ENABLED: true if regulatory checker enabled
+ * @NVM_MAC_SKU_FLAGS_API_LOCK_ENABLED: true if API lock enabled
+ */
+enum iwl_nvm_mac_sku_flags {
+	NVM_MAC_SKU_FLAGS_BAND_2_4_ENABLED	= BIT(0),
+	NVM_MAC_SKU_FLAGS_BAND_5_2_ENABLED	= BIT(1),
+	NVM_MAC_SKU_FLAGS_802_11N_ENABLED	= BIT(2),
+	NVM_MAC_SKU_FLAGS_802_11AC_ENABLED	= BIT(3),
+	NVM_MAC_SKU_FLAGS_802_11AX_ENABLED	= BIT(4),
+	NVM_MAC_SKU_FLAGS_MIMO_DISABLED		= BIT(5),
+	NVM_MAC_SKU_FLAGS_WAPI_ENABLED		= BIT(8),
+	NVM_MAC_SKU_FLAGS_REG_CHECK_ENABLED	= BIT(14),
+	NVM_MAC_SKU_FLAGS_API_LOCK_ENABLED	= BIT(15),
+};
+
 /**
  * struct iwl_nvm_get_info_sku - mac information
- * @enable_24g: band 2.4G enabled
- * @enable_5g: band 5G enabled
- * @enable_11n: 11n enabled
- * @enable_11ac: 11ac enabled
- * @mimo_disable: MIMO enabled
- * @ext_crypto: Extended crypto enabled
+ * @mac_sku_flags: flags for SKU, see &enum iwl_nvm_mac_sku_flags
  */
 struct iwl_nvm_get_info_sku {
-	__le32 enable_24g;
-	__le32 enable_5g;
-	__le32 enable_11n;
-	__le32 enable_11ac;
-	__le32 mimo_disable;
-	__le32 ext_crypto;
-} __packed; /* GRP_REGULATORY_NVM_GET_INFO_MAC_SKU_SECTION_S_VER_1 */
+	__le32 mac_sku_flags;
+} __packed; /* REGULATORY_NVM_GET_INFO_MAC_SKU_SECTION_S_VER_2 */
 
 /**
  * struct iwl_nvm_get_info_phy - phy information
@@ -243,7 +257,7 @@ struct iwl_nvm_get_info_rsp {
 	struct iwl_nvm_get_info_sku mac_sku;
 	struct iwl_nvm_get_info_phy phy_sku;
 	struct iwl_nvm_get_info_regulatory regulatory;
-} __packed; /* GRP_REGULATORY_NVM_GET_INFO_CMD_RSP_S_VER_1 */
+} __packed; /* GRP_REGULATORY_NVM_GET_INFO_CMD_RSP_S_VER_2 */
 
 /**
  * struct iwl_nvm_access_complete_cmd - NVM_ACCESS commands are completed
diff --git a/drivers/net/wireless/intel/iwlwifi/fw/nvm.c b/drivers/net/wireless/intel/iwlwifi/fw/nvm.c
index bd2e1fb43f5a..ff44d9031e1a 100644
--- a/drivers/net/wireless/intel/iwlwifi/fw/nvm.c
+++ b/drivers/net/wireless/intel/iwlwifi/fw/nvm.c
@@ -86,6 +86,7 @@ struct iwl_nvm_data *iwl_fw_get_nvm(struct iwl_fw_runtime *fwrt)
 	bool lar_fw_supported = !iwlwifi_mod_params.lar_disable &&
 				fw_has_capa(&fwrt->fw->ucode_capa,
 					    IWL_UCODE_TLV_CAPA_LAR_SUPPORT);
+	u32 mac_flags;
 
 	ret = iwl_trans_send_cmd(trans, &hcmd);
 	if (ret)
@@ -125,16 +126,19 @@ struct iwl_nvm_data *iwl_fw_get_nvm(struct iwl_fw_runtime *fwrt)
 	nvm->nvm_version = le16_to_cpu(rsp->general.nvm_version);
 
 	/* Initialize MAC sku data */
+	mac_flags = le32_to_cpu(rsp->mac_sku.mac_sku_flags);
 	nvm->sku_cap_11ac_enable =
-		le32_to_cpu(rsp->mac_sku.enable_11ac);
+		!!(mac_flags & NVM_MAC_SKU_FLAGS_802_11AC_ENABLED);
 	nvm->sku_cap_11n_enable =
-		le32_to_cpu(rsp->mac_sku.enable_11n);
+		!!(mac_flags & NVM_MAC_SKU_FLAGS_802_11N_ENABLED);
+	nvm->sku_cap_11ax_enable =
+		!!(mac_flags & NVM_MAC_SKU_FLAGS_802_11AX_ENABLED);
 	nvm->sku_cap_band_24GHz_enable =
-		le32_to_cpu(rsp->mac_sku.enable_24g);
+		!!(mac_flags & NVM_MAC_SKU_FLAGS_BAND_2_4_ENABLED);
 	nvm->sku_cap_band_52GHz_enable =
-		le32_to_cpu(rsp->mac_sku.enable_5g);
+		!!(mac_flags & NVM_MAC_SKU_FLAGS_BAND_5_2_ENABLED);
 	nvm->sku_cap_mimo_disabled =
-		le32_to_cpu(rsp->mac_sku.mimo_disable);
+		!!(mac_flags & NVM_MAC_SKU_FLAGS_MIMO_DISABLED);
 
 	/* Initialize PHY sku data */
 	nvm->valid_tx_ant = (u8)le32_to_cpu(rsp->phy_sku.tx_chains);
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-eeprom-parse.h b/drivers/net/wireless/intel/iwlwifi/iwl-eeprom-parse.h
index b33888991b94..16bea75e82f1 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-eeprom-parse.h
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-eeprom-parse.h
@@ -85,6 +85,7 @@ struct iwl_nvm_data {
 	bool sku_cap_band_52GHz_enable;
 	bool sku_cap_11n_enable;
 	bool sku_cap_11ac_enable;
+	bool sku_cap_11ax_enable;
 	bool sku_cap_amt_enable;
 	bool sku_cap_ipan_enable;
 	bool sku_cap_mimo_disabled;
-- 
cgit v1.2.3-59-g8ed1b


From 4b82455ca51ed23cf0fd6a24ba72a40fd4794b82 Mon Sep 17 00:00:00 2001
From: Luca Coelho <luciano.coelho@intel.com>
Date: Tue, 15 Aug 2017 20:26:09 +0300
Subject: iwlwifi: use flags to denote modifiers for the channel maps

Instead of having a boolean for each modifier we need to handle in the
channel maps, create a bitmask with flags that denote each
modification.

Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/fw/nvm.c        | 10 +++++----
 drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c | 25 ++++++++++++++--------
 drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.h | 14 ++++++++++--
 3 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/fw/nvm.c b/drivers/net/wireless/intel/iwlwifi/fw/nvm.c
index ff44d9031e1a..beefb89194a9 100644
--- a/drivers/net/wireless/intel/iwlwifi/fw/nvm.c
+++ b/drivers/net/wireless/intel/iwlwifi/fw/nvm.c
@@ -87,6 +87,7 @@ struct iwl_nvm_data *iwl_fw_get_nvm(struct iwl_fw_runtime *fwrt)
 				fw_has_capa(&fwrt->fw->ucode_capa,
 					    IWL_UCODE_TLV_CAPA_LAR_SUPPORT);
 	u32 mac_flags;
+	u32 sbands_flags = 0;
 
 	ret = iwl_trans_send_cmd(trans, &hcmd);
 	if (ret)
@@ -144,15 +145,16 @@ struct iwl_nvm_data *iwl_fw_get_nvm(struct iwl_fw_runtime *fwrt)
 	nvm->valid_tx_ant = (u8)le32_to_cpu(rsp->phy_sku.tx_chains);
 	nvm->valid_rx_ant = (u8)le32_to_cpu(rsp->phy_sku.rx_chains);
 
-	/* Initialize regulatory data */
-	nvm->lar_enabled =
-		le32_to_cpu(rsp->regulatory.lar_enabled) && lar_fw_supported;
+	if (le32_to_cpu(rsp->regulatory.lar_enabled) && lar_fw_supported) {
+		nvm->lar_enabled = true;
+		sbands_flags |= IWL_NVM_SBANDS_FLAGS_LAR;
+	}
 
 	iwl_init_sbands(trans->dev, trans->cfg, nvm,
 			rsp->regulatory.channel_profile,
 			nvm->valid_tx_ant & fwrt->fw->valid_tx_ant,
 			nvm->valid_rx_ant & fwrt->fw->valid_rx_ant,
-			nvm->lar_enabled, false);
+			sbands_flags);
 
 	iwl_free_resp(&hcmd);
 	return nvm;
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
index 3437ed480e31..735b211364d2 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
@@ -295,7 +295,7 @@ static u32 iwl_get_channel_flags(u8 ch_num, int ch_idx, bool is_5ghz,
 static int iwl_init_channel_map(struct device *dev, const struct iwl_cfg *cfg,
 				struct iwl_nvm_data *data,
 				const __le16 * const nvm_ch_flags,
-				bool lar_supported, bool no_wide_in_5ghz)
+				u32 sbands_flags)
 {
 	int ch_idx;
 	int n_channels = 0;
@@ -323,7 +323,8 @@ static int iwl_init_channel_map(struct device *dev, const struct iwl_cfg *cfg,
 			continue;
 
 		/* workaround to disable wide channels in 5GHz */
-		if (no_wide_in_5ghz && is_5ghz) {
+		if ((sbands_flags & IWL_NVM_SBANDS_FLAGS_NO_WIDE_IN_5GHZ) &&
+		    is_5ghz) {
 			ch_flags &= ~(NVM_CHANNEL_40MHZ |
 				     NVM_CHANNEL_80MHZ |
 				     NVM_CHANNEL_160MHZ);
@@ -332,7 +333,8 @@ static int iwl_init_channel_map(struct device *dev, const struct iwl_cfg *cfg,
 		if (ch_flags & NVM_CHANNEL_160MHZ)
 			data->vht160_supported = true;
 
-		if (!lar_supported && !(ch_flags & NVM_CHANNEL_VALID)) {
+		if (!(sbands_flags & IWL_NVM_SBANDS_FLAGS_LAR) &&
+		    !(ch_flags & NVM_CHANNEL_VALID)) {
 			/*
 			 * Channels might become valid later if lar is
 			 * supported, hence we still want to add them to
@@ -362,7 +364,7 @@ static int iwl_init_channel_map(struct device *dev, const struct iwl_cfg *cfg,
 		channel->max_power = IWL_DEFAULT_MAX_TX_POWER;
 
 		/* don't put limitations in case we're using LAR */
-		if (!lar_supported)
+		if (!(sbands_flags & IWL_NVM_SBANDS_FLAGS_LAR))
 			channel->flags = iwl_get_channel_flags(nvm_chan[ch_idx],
 							       ch_idx, is_5ghz,
 							       ch_flags, cfg);
@@ -460,15 +462,14 @@ static void iwl_init_vht_hw_capab(const struct iwl_cfg *cfg,
 
 void iwl_init_sbands(struct device *dev, const struct iwl_cfg *cfg,
 		     struct iwl_nvm_data *data, const __le16 *nvm_ch_flags,
-		     u8 tx_chains, u8 rx_chains, bool lar_supported,
-		     bool no_wide_in_5ghz)
+		     u8 tx_chains, u8 rx_chains, u32 sbands_flags)
 {
 	int n_channels;
 	int n_used = 0;
 	struct ieee80211_supported_band *sband;
 
 	n_channels = iwl_init_channel_map(dev, cfg, data, nvm_ch_flags,
-					  lar_supported, no_wide_in_5ghz);
+					  sbands_flags);
 	sband = &data->bands[NL80211_BAND_2GHZ];
 	sband->band = NL80211_BAND_2GHZ;
 	sband->bitrates = &iwl_cfg80211_rates[RATES_24_OFFS];
@@ -716,8 +717,8 @@ iwl_parse_nvm_data(struct iwl_trans *trans, const struct iwl_cfg *cfg,
 	struct device *dev = trans->dev;
 	struct iwl_nvm_data *data;
 	bool lar_enabled;
-	bool no_wide_in_5ghz = iwl_nvm_no_wide_in_5ghz(dev, cfg, nvm_hw);
 	u32 sku, radio_cfg;
+	u32 sbands_flags = 0;
 	u16 lar_config;
 	const __le16 *ch_section;
 
@@ -790,8 +791,14 @@ iwl_parse_nvm_data(struct iwl_trans *trans, const struct iwl_cfg *cfg,
 		return NULL;
 	}
 
+	if (lar_fw_supported && lar_enabled)
+		sbands_flags |= IWL_NVM_SBANDS_FLAGS_LAR;
+
+	if (iwl_nvm_no_wide_in_5ghz(dev, cfg, nvm_hw))
+		sbands_flags |= IWL_NVM_SBANDS_FLAGS_NO_WIDE_IN_5GHZ;
+
 	iwl_init_sbands(dev, cfg, data, ch_section, tx_chains, rx_chains,
-			lar_fw_supported && lar_enabled, no_wide_in_5ghz);
+			sbands_flags);
 	data->calib_version = 255;
 
 	return data;
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.h b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.h
index 6b108cae61b8..868baa508a16 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.h
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.h
@@ -69,6 +69,17 @@
 #include <net/cfg80211.h>
 #include "iwl-eeprom-parse.h"
 
+/**
+ * enum iwl_nvm_sbands_flags - modification flags for the channel profiles
+ *
+ * @IWL_NVM_SBANDS_FLAGS_LAR: LAR is enabled
+ * @IWL_NVM_SBANDS_FLAGS_NO_WIDE_IN_5GHZ: disallow 40, 80 and 160MHz on 5GHz
+ */
+enum iwl_nvm_sbands_flags {
+	IWL_NVM_SBANDS_FLAGS_LAR		= BIT(0),
+	IWL_NVM_SBANDS_FLAGS_NO_WIDE_IN_5GHZ	= BIT(1),
+};
+
 /**
  * iwl_parse_nvm_data - parse NVM data and return values
  *
@@ -95,8 +106,7 @@ void iwl_set_hw_address_from_csr(struct iwl_trans *trans,
  */
 void iwl_init_sbands(struct device *dev, const struct iwl_cfg *cfg,
 		     struct iwl_nvm_data *data, const __le16 *nvm_ch_flags,
-		     u8 tx_chains, u8 rx_chains, bool lar_supported,
-		     bool no_wide_in_5ghz);
+		     u8 tx_chains, u8 rx_chains, u32 sbands_flags);
 
 /**
  * iwl_parse_mcc_info - parse MCC (mobile country code) info coming from FW
-- 
cgit v1.2.3-59-g8ed1b


From 28e9c00fe1f0b1ff811d23dcffbf632b85f33797 Mon Sep 17 00:00:00 2001
From: Luca Coelho <luciano.coelho@intel.com>
Date: Thu, 17 Aug 2017 19:02:04 +0300
Subject: iwlwifi: remove upper case letters in sku_capa_band_*_enable

The sku_capa_band_24GHz_enable and sku_capa_band_52GHz_enable symbols
cause checkpatch to complain whenever we use them.  To prevent this,
convert them to all lower case.

Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/dvm/main.c         | 8 ++++----
 drivers/net/wireless/intel/iwlwifi/fw/nvm.c           | 4 ++--
 drivers/net/wireless/intel/iwlwifi/iwl-eeprom-parse.c | 4 ++--
 drivers/net/wireless/intel/iwlwifi/iwl-eeprom-parse.h | 4 ++--
 drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c    | 6 +++---
 5 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/main.c b/drivers/net/wireless/intel/iwlwifi/dvm/main.c
index e68254e12764..030482b357a3 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/main.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/main.c
@@ -1200,16 +1200,16 @@ static int iwl_eeprom_init_hw_params(struct iwl_priv *priv)
 		return -EINVAL;
 	}
 
-	if (!data->sku_cap_11n_enable && !data->sku_cap_band_24GHz_enable &&
-	    !data->sku_cap_band_52GHz_enable) {
+	if (!data->sku_cap_11n_enable && !data->sku_cap_band_24ghz_enable &&
+	    !data->sku_cap_band_52ghz_enable) {
 		IWL_ERR(priv, "Invalid device sku\n");
 		return -EINVAL;
 	}
 
 	IWL_DEBUG_INFO(priv,
 		       "Device SKU: 24GHz %s %s, 52GHz %s %s, 11.n %s %s\n",
-		       data->sku_cap_band_24GHz_enable ? "" : "NOT", "enabled",
-		       data->sku_cap_band_52GHz_enable ? "" : "NOT", "enabled",
+		       data->sku_cap_band_24ghz_enable ? "" : "NOT", "enabled",
+		       data->sku_cap_band_52ghz_enable ? "" : "NOT", "enabled",
 		       data->sku_cap_11n_enable ? "" : "NOT", "enabled");
 
 	priv->hw_params.tx_chains_num =
diff --git a/drivers/net/wireless/intel/iwlwifi/fw/nvm.c b/drivers/net/wireless/intel/iwlwifi/fw/nvm.c
index beefb89194a9..b8d88df27b18 100644
--- a/drivers/net/wireless/intel/iwlwifi/fw/nvm.c
+++ b/drivers/net/wireless/intel/iwlwifi/fw/nvm.c
@@ -134,9 +134,9 @@ struct iwl_nvm_data *iwl_fw_get_nvm(struct iwl_fw_runtime *fwrt)
 		!!(mac_flags & NVM_MAC_SKU_FLAGS_802_11N_ENABLED);
 	nvm->sku_cap_11ax_enable =
 		!!(mac_flags & NVM_MAC_SKU_FLAGS_802_11AX_ENABLED);
-	nvm->sku_cap_band_24GHz_enable =
+	nvm->sku_cap_band_24ghz_enable =
 		!!(mac_flags & NVM_MAC_SKU_FLAGS_BAND_2_4_ENABLED);
-	nvm->sku_cap_band_52GHz_enable =
+	nvm->sku_cap_band_52ghz_enable =
 		!!(mac_flags & NVM_MAC_SKU_FLAGS_BAND_5_2_ENABLED);
 	nvm->sku_cap_mimo_disabled =
 		!!(mac_flags & NVM_MAC_SKU_FLAGS_MIMO_DISABLED);
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-eeprom-parse.c b/drivers/net/wireless/intel/iwlwifi/iwl-eeprom-parse.c
index 3199d345b427..777f5df8a0c6 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-eeprom-parse.c
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-eeprom-parse.c
@@ -899,8 +899,8 @@ iwl_parse_eeprom_data(struct device *dev, const struct iwl_cfg *cfg,
 				 EEPROM_SKU_CAP);
 	data->sku_cap_11n_enable = sku & EEPROM_SKU_CAP_11N_ENABLE;
 	data->sku_cap_amt_enable = sku & EEPROM_SKU_CAP_AMT_ENABLE;
-	data->sku_cap_band_24GHz_enable = sku & EEPROM_SKU_CAP_BAND_24GHZ;
-	data->sku_cap_band_52GHz_enable = sku & EEPROM_SKU_CAP_BAND_52GHZ;
+	data->sku_cap_band_24ghz_enable = sku & EEPROM_SKU_CAP_BAND_24GHZ;
+	data->sku_cap_band_52ghz_enable = sku & EEPROM_SKU_CAP_BAND_52GHZ;
 	data->sku_cap_ipan_enable = sku & EEPROM_SKU_CAP_IPAN_ENABLE;
 	if (iwlwifi_mod_params.disable_11n & IWL_DISABLE_HT_ALL)
 		data->sku_cap_11n_enable = false;
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-eeprom-parse.h b/drivers/net/wireless/intel/iwlwifi/iwl-eeprom-parse.h
index 16bea75e82f1..8be50ed12300 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-eeprom-parse.h
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-eeprom-parse.h
@@ -81,8 +81,8 @@ struct iwl_nvm_data {
 	__le16 kelvin_voltage;
 	__le16 xtal_calib[2];
 
-	bool sku_cap_band_24GHz_enable;
-	bool sku_cap_band_52GHz_enable;
+	bool sku_cap_band_24ghz_enable;
+	bool sku_cap_band_52ghz_enable;
 	bool sku_cap_11n_enable;
 	bool sku_cap_11ac_enable;
 	bool sku_cap_11ax_enable;
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
index 735b211364d2..a6246dc9f025 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
@@ -319,7 +319,7 @@ static int iwl_init_channel_map(struct device *dev, const struct iwl_cfg *cfg,
 
 		ch_flags = __le16_to_cpup(nvm_ch_flags + ch_idx);
 
-		if (is_5ghz && !data->sku_cap_band_52GHz_enable)
+		if (is_5ghz && !data->sku_cap_band_52ghz_enable)
 			continue;
 
 		/* workaround to disable wide channels in 5GHz */
@@ -745,8 +745,8 @@ iwl_parse_nvm_data(struct iwl_trans *trans, const struct iwl_cfg *cfg,
 		rx_chains &= data->valid_rx_ant;
 
 	sku = iwl_get_sku(cfg, nvm_sw, phy_sku);
-	data->sku_cap_band_24GHz_enable = sku & NVM_SKU_CAP_BAND_24GHZ;
-	data->sku_cap_band_52GHz_enable = sku & NVM_SKU_CAP_BAND_52GHZ;
+	data->sku_cap_band_24ghz_enable = sku & NVM_SKU_CAP_BAND_24GHZ;
+	data->sku_cap_band_52ghz_enable = sku & NVM_SKU_CAP_BAND_52GHZ;
 	data->sku_cap_11n_enable = sku & NVM_SKU_CAP_11N_ENABLE;
 	if (iwlwifi_mod_params.disable_11n & IWL_DISABLE_HT_ALL)
 		data->sku_cap_11n_enable = false;
-- 
cgit v1.2.3-59-g8ed1b


From 4c625c564ba2cf92fdd9cb7387d30be04139f77b Mon Sep 17 00:00:00 2001
From: Shaul Triebitz <shaul.triebitz@intel.com>
Date: Wed, 24 Jan 2018 16:12:49 +0200
Subject: iwlwifi: get rid of fw/nvm.c

There's already an opmode common file for nvm iwl-nvm-parse.c
Move the content of fw/nvm.c to iwl-nvm-parse.c and delete fw/nvm.c.

Signed-off-by: Shaul Triebitz <shaul.triebitz@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/Makefile        |   2 +-
 drivers/net/wireless/intel/iwlwifi/fw/nvm.c        | 168 ---------------------
 drivers/net/wireless/intel/iwlwifi/fw/runtime.h    |   1 -
 drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c | 111 +++++++++++++-
 drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.h |  21 +--
 drivers/net/wireless/intel/iwlwifi/mvm/fw.c        |   2 +-
 6 files changed, 114 insertions(+), 191 deletions(-)
 delete mode 100644 drivers/net/wireless/intel/iwlwifi/fw/nvm.c

diff --git a/drivers/net/wireless/intel/iwlwifi/Makefile b/drivers/net/wireless/intel/iwlwifi/Makefile
index e6205eae51fd..4d08d78c6b71 100644
--- a/drivers/net/wireless/intel/iwlwifi/Makefile
+++ b/drivers/net/wireless/intel/iwlwifi/Makefile
@@ -13,7 +13,7 @@ iwlwifi-$(CONFIG_IWLMVM) += cfg/7000.o cfg/8000.o cfg/9000.o cfg/22000.o
 iwlwifi-objs		+= iwl-trans.o
 iwlwifi-objs		+= fw/notif-wait.o
 iwlwifi-$(CONFIG_IWLMVM) += fw/paging.o fw/smem.o fw/init.o fw/dbg.o
-iwlwifi-$(CONFIG_IWLMVM) += fw/common_rx.o fw/nvm.o
+iwlwifi-$(CONFIG_IWLMVM) += fw/common_rx.o
 iwlwifi-$(CONFIG_ACPI) += fw/acpi.o
 iwlwifi-$(CONFIG_IWLWIFI_DEBUGFS) += fw/debugfs.o
 
diff --git a/drivers/net/wireless/intel/iwlwifi/fw/nvm.c b/drivers/net/wireless/intel/iwlwifi/fw/nvm.c
deleted file mode 100644
index b8d88df27b18..000000000000
--- a/drivers/net/wireless/intel/iwlwifi/fw/nvm.c
+++ /dev/null
@@ -1,168 +0,0 @@
-/******************************************************************************
- *
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved.
- * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH
- * Copyright(c) 2016 - 2017 Intel Deutschland GmbH
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110,
- * USA
- *
- * The full GNU General Public License is included in this distribution
- * in the file called COPYING.
- *
- * Contact Information:
- *  Intel Linux Wireless <linuxwifi@intel.com>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- * BSD LICENSE
- *
- * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved.
- * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH
- * Copyright(c) 2016 - 2017 Intel Deutschland GmbH
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- *  * Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  * Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *  * Neither the name Intel Corporation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- *****************************************************************************/
-#include "iwl-drv.h"
-#include "runtime.h"
-#include "fw/api/nvm-reg.h"
-#include "fw/api/commands.h"
-#include "iwl-nvm-parse.h"
-
-struct iwl_nvm_data *iwl_fw_get_nvm(struct iwl_fw_runtime *fwrt)
-{
-	struct iwl_nvm_get_info cmd = {};
-	struct iwl_nvm_get_info_rsp *rsp;
-	struct iwl_trans *trans = fwrt->trans;
-	struct iwl_nvm_data *nvm;
-	struct iwl_host_cmd hcmd = {
-		.flags = CMD_WANT_SKB | CMD_SEND_IN_RFKILL,
-		.data = { &cmd, },
-		.len = { sizeof(cmd) },
-		.id = WIDE_ID(REGULATORY_AND_NVM_GROUP, NVM_GET_INFO)
-	};
-	int  ret;
-	bool lar_fw_supported = !iwlwifi_mod_params.lar_disable &&
-				fw_has_capa(&fwrt->fw->ucode_capa,
-					    IWL_UCODE_TLV_CAPA_LAR_SUPPORT);
-	u32 mac_flags;
-	u32 sbands_flags = 0;
-
-	ret = iwl_trans_send_cmd(trans, &hcmd);
-	if (ret)
-		return ERR_PTR(ret);
-
-	if (WARN(iwl_rx_packet_payload_len(hcmd.resp_pkt) != sizeof(*rsp),
-		 "Invalid payload len in NVM response from FW %d",
-		 iwl_rx_packet_payload_len(hcmd.resp_pkt))) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	rsp = (void *)hcmd.resp_pkt->data;
-	if (le32_to_cpu(rsp->general.flags) & NVM_GENERAL_FLAGS_EMPTY_OTP)
-		IWL_INFO(fwrt, "OTP is empty\n");
-
-	nvm = kzalloc(sizeof(*nvm) +
-		      sizeof(struct ieee80211_channel) * IWL_NUM_CHANNELS,
-		      GFP_KERNEL);
-	if (!nvm) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	iwl_set_hw_address_from_csr(trans, nvm);
-	/* TODO: if platform NVM has MAC address - override it here */
-
-	if (!is_valid_ether_addr(nvm->hw_addr)) {
-		IWL_ERR(fwrt, "no valid mac address was found\n");
-		ret = -EINVAL;
-		goto err_free;
-	}
-
-	IWL_INFO(trans, "base HW address: %pM\n", nvm->hw_addr);
-
-	/* Initialize general data */
-	nvm->nvm_version = le16_to_cpu(rsp->general.nvm_version);
-
-	/* Initialize MAC sku data */
-	mac_flags = le32_to_cpu(rsp->mac_sku.mac_sku_flags);
-	nvm->sku_cap_11ac_enable =
-		!!(mac_flags & NVM_MAC_SKU_FLAGS_802_11AC_ENABLED);
-	nvm->sku_cap_11n_enable =
-		!!(mac_flags & NVM_MAC_SKU_FLAGS_802_11N_ENABLED);
-	nvm->sku_cap_11ax_enable =
-		!!(mac_flags & NVM_MAC_SKU_FLAGS_802_11AX_ENABLED);
-	nvm->sku_cap_band_24ghz_enable =
-		!!(mac_flags & NVM_MAC_SKU_FLAGS_BAND_2_4_ENABLED);
-	nvm->sku_cap_band_52ghz_enable =
-		!!(mac_flags & NVM_MAC_SKU_FLAGS_BAND_5_2_ENABLED);
-	nvm->sku_cap_mimo_disabled =
-		!!(mac_flags & NVM_MAC_SKU_FLAGS_MIMO_DISABLED);
-
-	/* Initialize PHY sku data */
-	nvm->valid_tx_ant = (u8)le32_to_cpu(rsp->phy_sku.tx_chains);
-	nvm->valid_rx_ant = (u8)le32_to_cpu(rsp->phy_sku.rx_chains);
-
-	if (le32_to_cpu(rsp->regulatory.lar_enabled) && lar_fw_supported) {
-		nvm->lar_enabled = true;
-		sbands_flags |= IWL_NVM_SBANDS_FLAGS_LAR;
-	}
-
-	iwl_init_sbands(trans->dev, trans->cfg, nvm,
-			rsp->regulatory.channel_profile,
-			nvm->valid_tx_ant & fwrt->fw->valid_tx_ant,
-			nvm->valid_rx_ant & fwrt->fw->valid_rx_ant,
-			sbands_flags);
-
-	iwl_free_resp(&hcmd);
-	return nvm;
-
-err_free:
-	kfree(nvm);
-out:
-	iwl_free_resp(&hcmd);
-	return ERR_PTR(ret);
-}
-IWL_EXPORT_SYMBOL(iwl_fw_get_nvm);
diff --git a/drivers/net/wireless/intel/iwlwifi/fw/runtime.h b/drivers/net/wireless/intel/iwlwifi/fw/runtime.h
index 3fb940ebd74a..d8db1dd100b0 100644
--- a/drivers/net/wireless/intel/iwlwifi/fw/runtime.h
+++ b/drivers/net/wireless/intel/iwlwifi/fw/runtime.h
@@ -170,6 +170,5 @@ void iwl_get_shared_mem_conf(struct iwl_fw_runtime *fwrt);
 
 void iwl_fwrt_handle_notification(struct iwl_fw_runtime *fwrt,
 				  struct iwl_rx_cmd_buffer *rxb);
-struct iwl_nvm_data *iwl_fw_get_nvm(struct iwl_fw_runtime *fwrt);
 
 #endif /* __iwl_fw_runtime_h__ */
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
index a6246dc9f025..6d33c14579d9 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
@@ -80,6 +80,9 @@
 #include "iwl-csr.h"
 #include "fw/acpi.h"
 #include "fw/api/nvm-reg.h"
+#include "fw/api/commands.h"
+#include "fw/api/cmdhdr.h"
+#include "fw/img.h"
 
 /* NVM offsets (in words) definitions */
 enum nvm_offsets {
@@ -460,9 +463,10 @@ static void iwl_init_vht_hw_capab(const struct iwl_cfg *cfg,
 	vht_cap->vht_mcs.tx_mcs_map = vht_cap->vht_mcs.rx_mcs_map;
 }
 
-void iwl_init_sbands(struct device *dev, const struct iwl_cfg *cfg,
-		     struct iwl_nvm_data *data, const __le16 *nvm_ch_flags,
-		     u8 tx_chains, u8 rx_chains, u32 sbands_flags)
+static void iwl_init_sbands(struct device *dev, const struct iwl_cfg *cfg,
+			    struct iwl_nvm_data *data,
+			    const __le16 *nvm_ch_flags, u8 tx_chains,
+			    u8 rx_chains, u32 sbands_flags)
 {
 	int n_channels;
 	int n_used = 0;
@@ -495,7 +499,6 @@ void iwl_init_sbands(struct device *dev, const struct iwl_cfg *cfg,
 		IWL_ERR_DEV(dev, "NVM: used only %d of %d channels\n",
 			    n_used, n_channels);
 }
-IWL_EXPORT_SYMBOL(iwl_init_sbands);
 
 static int iwl_get_sku(const struct iwl_cfg *cfg, const __le16 *nvm_sw,
 		       const __le16 *phy_sku)
@@ -573,8 +576,8 @@ static void iwl_flip_hw_address(__le32 mac_addr0, __le32 mac_addr1, u8 *dest)
 	dest[5] = hw_addr[0];
 }
 
-void iwl_set_hw_address_from_csr(struct iwl_trans *trans,
-				 struct iwl_nvm_data *data)
+static void iwl_set_hw_address_from_csr(struct iwl_trans *trans,
+					struct iwl_nvm_data *data)
 {
 	__le32 mac_addr0 = cpu_to_le32(iwl_read32(trans, CSR_MAC_ADDR0_STRAP));
 	__le32 mac_addr1 = cpu_to_le32(iwl_read32(trans, CSR_MAC_ADDR1_STRAP));
@@ -592,7 +595,6 @@ void iwl_set_hw_address_from_csr(struct iwl_trans *trans,
 
 	iwl_flip_hw_address(mac_addr0, mac_addr1, data->hw_addr);
 }
-IWL_EXPORT_SYMBOL(iwl_set_hw_address_from_csr);
 
 static void iwl_set_hw_address_family_8000(struct iwl_trans *trans,
 					   const struct iwl_cfg *cfg,
@@ -1145,3 +1147,98 @@ out:
 	return ret;
 }
 IWL_EXPORT_SYMBOL(iwl_read_external_nvm);
+
+struct iwl_nvm_data *iwl_get_nvm(struct iwl_trans *trans,
+				 const struct iwl_fw *fw)
+{
+	struct iwl_nvm_get_info cmd = {};
+	struct iwl_nvm_get_info_rsp *rsp;
+	struct iwl_nvm_data *nvm;
+	struct iwl_host_cmd hcmd = {
+		.flags = CMD_WANT_SKB | CMD_SEND_IN_RFKILL,
+		.data = { &cmd, },
+		.len = { sizeof(cmd) },
+		.id = WIDE_ID(REGULATORY_AND_NVM_GROUP, NVM_GET_INFO)
+	};
+	int  ret;
+	bool lar_fw_supported = !iwlwifi_mod_params.lar_disable &&
+				fw_has_capa(&fw->ucode_capa,
+					    IWL_UCODE_TLV_CAPA_LAR_SUPPORT);
+	u32 mac_flags;
+	u32 sbands_flags = 0;
+
+	ret = iwl_trans_send_cmd(trans, &hcmd);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (WARN(iwl_rx_packet_payload_len(hcmd.resp_pkt) != sizeof(*rsp),
+		 "Invalid payload len in NVM response from FW %d",
+		 iwl_rx_packet_payload_len(hcmd.resp_pkt))) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	rsp = (void *)hcmd.resp_pkt->data;
+	if (le32_to_cpu(rsp->general.flags) & NVM_GENERAL_FLAGS_EMPTY_OTP)
+		IWL_INFO(trans, "OTP is empty\n");
+
+	nvm = kzalloc(sizeof(*nvm) +
+		      sizeof(struct ieee80211_channel) * IWL_NUM_CHANNELS,
+		      GFP_KERNEL);
+	if (!nvm) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	iwl_set_hw_address_from_csr(trans, nvm);
+	/* TODO: if platform NVM has MAC address - override it here */
+
+	if (!is_valid_ether_addr(nvm->hw_addr)) {
+		IWL_ERR(trans, "no valid mac address was found\n");
+		ret = -EINVAL;
+		goto err_free;
+	}
+
+	IWL_INFO(trans, "base HW address: %pM\n", nvm->hw_addr);
+
+	/* Initialize general data */
+	nvm->nvm_version = le16_to_cpu(rsp->general.nvm_version);
+
+	/* Initialize MAC sku data */
+	mac_flags = le32_to_cpu(rsp->mac_sku.mac_sku_flags);
+	nvm->sku_cap_11ac_enable =
+		!!(mac_flags & NVM_MAC_SKU_FLAGS_802_11AC_ENABLED);
+	nvm->sku_cap_11n_enable =
+		!!(mac_flags & NVM_MAC_SKU_FLAGS_802_11N_ENABLED);
+	nvm->sku_cap_band_24ghz_enable =
+		!!(mac_flags & NVM_MAC_SKU_FLAGS_BAND_2_4_ENABLED);
+	nvm->sku_cap_band_52ghz_enable =
+		!!(mac_flags & NVM_MAC_SKU_FLAGS_BAND_5_2_ENABLED);
+	nvm->sku_cap_mimo_disabled =
+		!!(mac_flags & NVM_MAC_SKU_FLAGS_MIMO_DISABLED);
+
+	/* Initialize PHY sku data */
+	nvm->valid_tx_ant = (u8)le32_to_cpu(rsp->phy_sku.tx_chains);
+	nvm->valid_rx_ant = (u8)le32_to_cpu(rsp->phy_sku.rx_chains);
+
+	if (le32_to_cpu(rsp->regulatory.lar_enabled) && lar_fw_supported) {
+		nvm->lar_enabled = true;
+		sbands_flags |= IWL_NVM_SBANDS_FLAGS_LAR;
+	}
+
+	iwl_init_sbands(trans->dev, trans->cfg, nvm,
+			rsp->regulatory.channel_profile,
+			nvm->valid_tx_ant & fw->valid_tx_ant,
+			nvm->valid_rx_ant & fw->valid_rx_ant,
+			sbands_flags);
+
+	iwl_free_resp(&hcmd);
+	return nvm;
+
+err_free:
+	kfree(nvm);
+out:
+	iwl_free_resp(&hcmd);
+	return ERR_PTR(ret);
+}
+IWL_EXPORT_SYMBOL(iwl_get_nvm);
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.h b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.h
index 868baa508a16..60c7586f8342 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.h
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.h
@@ -95,19 +95,6 @@ iwl_parse_nvm_data(struct iwl_trans *trans, const struct iwl_cfg *cfg,
 		   const __le16 *mac_override, const __le16 *phy_sku,
 		   u8 tx_chains, u8 rx_chains, bool lar_fw_supported);
 
-/**
- * iwl_set_hw_address_from_csr - sets HW address for 9000 devices and on
- */
-void iwl_set_hw_address_from_csr(struct iwl_trans *trans,
-				 struct iwl_nvm_data *data);
-
-/**
- * iwl_init_sbands - parse and set all channel profiles
- */
-void iwl_init_sbands(struct device *dev, const struct iwl_cfg *cfg,
-		     struct iwl_nvm_data *data, const __le16 *nvm_ch_flags,
-		     u8 tx_chains, u8 rx_chains, u32 sbands_flags);
-
 /**
  * iwl_parse_mcc_info - parse MCC (mobile country code) info coming from FW
  *
@@ -142,4 +129,12 @@ int iwl_read_external_nvm(struct iwl_trans *trans,
 void iwl_nvm_fixups(u32 hw_id, unsigned int section, u8 *data,
 		    unsigned int len);
 
+/**
+ * iwl_get_nvm - retrieve NVM data from firmware
+ *
+ * Allocates a new iwl_nvm_data structure, fills it with
+ * NVM data, and returns it to caller.
+ */
+struct iwl_nvm_data *iwl_get_nvm(struct iwl_trans *trans,
+				 const struct iwl_fw *fw);
 #endif /* __iwl_nvm_parse_h__ */
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c
index d5a612add59f..866c91c923be 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c
@@ -415,7 +415,7 @@ static int iwl_run_unified_mvm_ucode(struct iwl_mvm *mvm, bool read_nvm)
 
 	/* Read the NVM only at driver load time, no need to do this twice */
 	if (!IWL_MVM_PARSE_NVM && read_nvm) {
-		mvm->nvm_data = iwl_fw_get_nvm(&mvm->fwrt);
+		mvm->nvm_data = iwl_get_nvm(mvm->trans, mvm->fw);
 		if (IS_ERR(mvm->nvm_data)) {
 			ret = PTR_ERR(mvm->nvm_data);
 			mvm->nvm_data = NULL;
-- 
cgit v1.2.3-59-g8ed1b


From bd8f3fc613919b50038c949f80b3f350a166293e Mon Sep 17 00:00:00 2001
From: Liad Kaufman <liad.kaufman@intel.com>
Date: Wed, 17 Jan 2018 15:25:28 +0200
Subject: iwlwifi: mvm: support 22000 HW opening agg before traffic

When trying to open aggregations on 22000 HW before
traffic had actually passed, the driver will discover
it is missing a queue to aggregate on. In such a case -
allocate a queue.

Signed-off-by: Liad Kaufman <liad.kaufman@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/sta.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c
index 4f9d3e13d7e5..4ddd2c427b83 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c
@@ -36,6 +36,7 @@
  * Copyright(c) 2012 - 2015 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH
  * Copyright(c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -2467,6 +2468,15 @@ int iwl_mvm_sta_tx_agg_start(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 
 	lockdep_assert_held(&mvm->mutex);
 
+	if (mvmsta->tid_data[tid].txq_id == IWL_MVM_INVALID_QUEUE &&
+	    iwl_mvm_has_new_tx_api(mvm)) {
+		u8 ac = tid_to_mac80211_ac[tid];
+
+		ret = iwl_mvm_sta_alloc_queue_tvqm(mvm, sta, ac, tid);
+		if (ret)
+			return ret;
+	}
+
 	spin_lock_bh(&mvmsta->lock);
 
 	/* possible race condition - we entered D0i3 while starting agg */
-- 
cgit v1.2.3-59-g8ed1b


From 878a4d328104fed86ea321c48a5245eed44fbe14 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Fri, 20 Apr 2018 10:05:16 +0200
Subject: libbpf: fixed build error for samples/bpf/
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 8a138aed4a80 ("bpf: btf: Add BTF support to libbpf") did not
include stdbool.h, so GCC complained when building samples/bpf/.

In file included from /home/btopel/src/ext/linux/samples/bpf/libbpf.h:6:0,
                 from /home/btopel/src/ext/linux/samples/bpf/test_lru_dist.c:24:
/home/btopel/src/ext/linux/tools/lib/bpf/bpf.h:105:4: error: unknown type name ‘bool’; did you mean ‘_Bool’?
    bool do_log);
    ^~~~
    _Bool

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/bpf.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 01bda076310f..553b11ad52b3 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -24,6 +24,7 @@
 #define __BPF_BPF_H
 
 #include <linux/bpf.h>
+#include <stdbool.h>
 #include <stddef.h>
 
 struct bpf_create_map_attr {
-- 
cgit v1.2.3-59-g8ed1b


From 0263ea5cddedd84c111b55d33a8ec94740d1c8d3 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Thu, 19 Apr 2018 02:00:47 +0200
Subject: net: phy: mdio-boardinfo: Allow recursive mdiobus_register()

mdiobus_register will search for any mdiobus board info registered for
the bus being registered. If found, it will probe devices on the bus.
That device, if for example it is an ethernet switch, may then try to
register an mdio bus. Thus we need to allow recursive calls to
mdiobus_register.

Holding the mdio_board_lock will cause a deadlock during this
recursion. Release the lock and use list_for_each_entry_safe.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/mdio-boardinfo.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/phy/mdio-boardinfo.c b/drivers/net/phy/mdio-boardinfo.c
index 1861f387820d..863496fa5d13 100644
--- a/drivers/net/phy/mdio-boardinfo.c
+++ b/drivers/net/phy/mdio-boardinfo.c
@@ -30,17 +30,20 @@ void mdiobus_setup_mdiodev_from_board_info(struct mii_bus *bus,
 					    struct mdio_board_info *bi))
 {
 	struct mdio_board_entry *be;
+	struct mdio_board_entry *tmp;
 	struct mdio_board_info *bi;
 	int ret;
 
 	mutex_lock(&mdio_board_lock);
-	list_for_each_entry(be, &mdio_board_list, list) {
+	list_for_each_entry_safe(be, tmp, &mdio_board_list, list) {
 		bi = &be->board_info;
 
 		if (strcmp(bus->id, bi->bus_id))
 			continue;
 
+		mutex_unlock(&mdio_board_lock);
 		ret = cb(bus, bi);
+		mutex_lock(&mdio_board_lock);
 		if (ret)
 			continue;
 
-- 
cgit v1.2.3-59-g8ed1b


From 6b9227d666f2efe0f8ed234827bb1abdf63f9501 Mon Sep 17 00:00:00 2001
From: Kunihiko Hayashi <hayashi.kunihiko@socionext.com>
Date: Thu, 19 Apr 2018 16:24:53 +0900
Subject: net: ethernet: ave: add multiple clocks and resets support as
 required property

When the link is becoming up for Pro4 SoC, the kernel is stalled
due to some missing clocks and resets.

The AVE block for Pro4 is connected to the GIO bus in the SoC.
Without its clock/reset, the access to the AVE register makes the
system stall.

In the same way, another MAC clock for Giga-bit Connection and
the PHY clock are also required for Pro4 to activate the Giga-bit feature
and to recognize the PHY.

To satisfy these requirements, this patch adds support for multiple clocks
and resets, and adds the clock-names and reset-names to the binding because
we need to distinguish clock/reset for the AVE main block and the others.

Also, make the resets a required property. Currently, "reset is
optional" relies on that the bootloader or firmware has deasserted
the reset before booting the kernel.  Drivers should work without
such expectation.

Fixes: 4c270b55a5af ("net: ethernet: socionext: add AVE ethernet driver")
Suggested-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Kunihiko Hayashi <hayashi.kunihiko@socionext.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../bindings/net/socionext,uniphier-ave4.txt       |  13 ++-
 drivers/net/ethernet/socionext/sni_ave.c           | 108 ++++++++++++++++-----
 2 files changed, 96 insertions(+), 25 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/socionext,uniphier-ave4.txt b/Documentation/devicetree/bindings/net/socionext,uniphier-ave4.txt
index 96398cc2982f..85e0c49548ed 100644
--- a/Documentation/devicetree/bindings/net/socionext,uniphier-ave4.txt
+++ b/Documentation/devicetree/bindings/net/socionext,uniphier-ave4.txt
@@ -17,9 +17,18 @@ Required properties:
  - phy-handle: Should point to the external phy device.
 	See ethernet.txt file in the same directory.
  - clocks: A phandle to the clock for the MAC.
+	For Pro4 SoC, that is "socionext,uniphier-pro4-ave4",
+	another MAC clock, GIO bus clock and PHY clock are also required.
+ - clock-names: Should contain
+	- "ether", "ether-gb", "gio", "ether-phy" for Pro4 SoC
+	- "ether" for others
+ - resets: A phandle to the reset control for the MAC. For Pro4 SoC,
+	GIO bus reset is also required.
+ - reset-names: Should contain
+	- "ether", "gio" for Pro4 SoC
+	- "ether" for others
 
 Optional properties:
- - resets: A phandle to the reset control for the MAC.
  - local-mac-address: See ethernet.txt in the same directory.
 
 Required subnode:
@@ -34,7 +43,9 @@ Example:
 		interrupts = <0 66 4>;
 		phy-mode = "rgmii";
 		phy-handle = <&ethphy>;
+		clock-names = "ether";
 		clocks = <&sys_clk 6>;
+		reset-names = "ether";
 		resets = <&sys_rst 6>;
 		local-mac-address = [00 00 00 00 00 00];
 
diff --git a/drivers/net/ethernet/socionext/sni_ave.c b/drivers/net/ethernet/socionext/sni_ave.c
index 0b3b7a460641..52940bdd4ad3 100644
--- a/drivers/net/ethernet/socionext/sni_ave.c
+++ b/drivers/net/ethernet/socionext/sni_ave.c
@@ -199,6 +199,9 @@
 
 #define IS_DESC_64BIT(p)	((p)->data->is_desc_64bit)
 
+#define AVE_MAX_CLKS		4
+#define AVE_MAX_RSTS		2
+
 enum desc_id {
 	AVE_DESCID_RX,
 	AVE_DESCID_TX,
@@ -227,6 +230,8 @@ struct ave_desc_info {
 
 struct ave_soc_data {
 	bool	is_desc_64bit;
+	const char	*clock_names[AVE_MAX_CLKS];
+	const char	*reset_names[AVE_MAX_RSTS];
 };
 
 struct ave_stats {
@@ -245,8 +250,10 @@ struct ave_private {
 	int			phy_id;
 	unsigned int		desc_size;
 	u32			msg_enable;
-	struct clk		*clk;
-	struct reset_control	*rst;
+	int			nclks;
+	struct clk		*clk[AVE_MAX_CLKS];
+	int			nrsts;
+	struct reset_control	*rst[AVE_MAX_RSTS];
 	phy_interface_t		phy_mode;
 	struct phy_device	*phydev;
 	struct mii_bus		*mdio;
@@ -1153,18 +1160,23 @@ static int ave_init(struct net_device *ndev)
 	struct device_node *np = dev->of_node;
 	struct device_node *mdio_np;
 	struct phy_device *phydev;
-	int ret;
+	int nc, nr, ret;
 
 	/* enable clk because of hw access until ndo_open */
-	ret = clk_prepare_enable(priv->clk);
-	if (ret) {
-		dev_err(dev, "can't enable clock\n");
-		return ret;
+	for (nc = 0; nc < priv->nclks; nc++) {
+		ret = clk_prepare_enable(priv->clk[nc]);
+		if (ret) {
+			dev_err(dev, "can't enable clock\n");
+			goto out_clk_disable;
+		}
 	}
-	ret = reset_control_deassert(priv->rst);
-	if (ret) {
-		dev_err(dev, "can't deassert reset\n");
-		goto out_clk_disable;
+
+	for (nr = 0; nr < priv->nrsts; nr++) {
+		ret = reset_control_deassert(priv->rst[nr]);
+		if (ret) {
+			dev_err(dev, "can't deassert reset\n");
+			goto out_reset_assert;
+		}
 	}
 
 	ave_global_reset(ndev);
@@ -1207,9 +1219,11 @@ static int ave_init(struct net_device *ndev)
 out_mdio_unregister:
 	mdiobus_unregister(priv->mdio);
 out_reset_assert:
-	reset_control_assert(priv->rst);
+	while (--nr >= 0)
+		reset_control_assert(priv->rst[nr]);
 out_clk_disable:
-	clk_disable_unprepare(priv->clk);
+	while (--nc >= 0)
+		clk_disable_unprepare(priv->clk[nc]);
 
 	return ret;
 }
@@ -1217,13 +1231,16 @@ out_clk_disable:
 static void ave_uninit(struct net_device *ndev)
 {
 	struct ave_private *priv = netdev_priv(ndev);
+	int i;
 
 	phy_disconnect(priv->phydev);
 	mdiobus_unregister(priv->mdio);
 
 	/* disable clk because of hw access after ndo_stop */
-	reset_control_assert(priv->rst);
-	clk_disable_unprepare(priv->clk);
+	for (i = 0; i < priv->nrsts; i++)
+		reset_control_assert(priv->rst[i]);
+	for (i = 0; i < priv->nclks; i++)
+		clk_disable_unprepare(priv->clk[i]);
 }
 
 static int ave_open(struct net_device *ndev)
@@ -1527,8 +1544,9 @@ static int ave_probe(struct platform_device *pdev)
 	struct resource	*res;
 	const void *mac_addr;
 	void __iomem *base;
+	const char *name;
+	int i, irq, ret;
 	u64 dma_mask;
-	int irq, ret;
 	u32 ave_id;
 
 	data = of_device_get_match_data(dev);
@@ -1614,16 +1632,28 @@ static int ave_probe(struct platform_device *pdev)
 	u64_stats_init(&priv->stats_tx.syncp);
 	u64_stats_init(&priv->stats_rx.syncp);
 
-	priv->clk = devm_clk_get(dev, NULL);
-	if (IS_ERR(priv->clk)) {
-		ret = PTR_ERR(priv->clk);
-		goto out_free_netdev;
+	for (i = 0; i < AVE_MAX_CLKS; i++) {
+		name = priv->data->clock_names[i];
+		if (!name)
+			break;
+		priv->clk[i] = devm_clk_get(dev, name);
+		if (IS_ERR(priv->clk[i])) {
+			ret = PTR_ERR(priv->clk[i]);
+			goto out_free_netdev;
+		}
+		priv->nclks++;
 	}
 
-	priv->rst = devm_reset_control_get_optional_shared(dev, NULL);
-	if (IS_ERR(priv->rst)) {
-		ret = PTR_ERR(priv->rst);
-		goto out_free_netdev;
+	for (i = 0; i < AVE_MAX_RSTS; i++) {
+		name = priv->data->reset_names[i];
+		if (!name)
+			break;
+		priv->rst[i] = devm_reset_control_get_shared(dev, name);
+		if (IS_ERR(priv->rst[i])) {
+			ret = PTR_ERR(priv->rst[i]);
+			goto out_free_netdev;
+		}
+		priv->nrsts++;
 	}
 
 	priv->mdio = devm_mdiobus_alloc(dev);
@@ -1687,22 +1717,52 @@ static int ave_remove(struct platform_device *pdev)
 
 static const struct ave_soc_data ave_pro4_data = {
 	.is_desc_64bit = false,
+	.clock_names = {
+		"gio", "ether", "ether-gb", "ether-phy",
+	},
+	.reset_names = {
+		"gio", "ether",
+	},
 };
 
 static const struct ave_soc_data ave_pxs2_data = {
 	.is_desc_64bit = false,
+	.clock_names = {
+		"ether",
+	},
+	.reset_names = {
+		"ether",
+	},
 };
 
 static const struct ave_soc_data ave_ld11_data = {
 	.is_desc_64bit = false,
+	.clock_names = {
+		"ether",
+	},
+	.reset_names = {
+		"ether",
+	},
 };
 
 static const struct ave_soc_data ave_ld20_data = {
 	.is_desc_64bit = true,
+	.clock_names = {
+		"ether",
+	},
+	.reset_names = {
+		"ether",
+	},
 };
 
 static const struct ave_soc_data ave_pxs3_data = {
 	.is_desc_64bit = false,
+	.clock_names = {
+		"ether",
+	},
+	.reset_names = {
+		"ether",
+	},
 };
 
 static const struct of_device_id of_ave_match[] = {
-- 
cgit v1.2.3-59-g8ed1b


From 74734306c20393910eae3379322c622e57351d98 Mon Sep 17 00:00:00 2001
From: Kunihiko Hayashi <hayashi.kunihiko@socionext.com>
Date: Thu, 19 Apr 2018 16:24:54 +0900
Subject: dt-bindings: net: ave: add syscon-phy-mode property to configure
 phy-mode setting

Add "socionext,syscon-phy-mode" property to specify system controller that
configures the settings about phy-mode.

Signed-off-by: Kunihiko Hayashi <hayashi.kunihiko@socionext.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/devicetree/bindings/net/socionext,uniphier-ave4.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/net/socionext,uniphier-ave4.txt b/Documentation/devicetree/bindings/net/socionext,uniphier-ave4.txt
index 85e0c49548ed..fc8f01718690 100644
--- a/Documentation/devicetree/bindings/net/socionext,uniphier-ave4.txt
+++ b/Documentation/devicetree/bindings/net/socionext,uniphier-ave4.txt
@@ -13,7 +13,8 @@ Required properties:
  - reg: Address where registers are mapped and size of region.
  - interrupts: Should contain the MAC interrupt.
  - phy-mode: See ethernet.txt in the same directory. Allow to choose
-	"rgmii", "rmii", or "mii" according to the PHY.
+	"rgmii", "rmii", "mii", or "internal" according to the PHY.
+	The acceptable mode is SoC-dependent.
  - phy-handle: Should point to the external phy device.
 	See ethernet.txt file in the same directory.
  - clocks: A phandle to the clock for the MAC.
@@ -27,6 +28,8 @@ Required properties:
  - reset-names: Should contain
 	- "ether", "gio" for Pro4 SoC
 	- "ether" for others
+ - socionext,syscon-phy-mode: A phandle to syscon with one argument
+	that configures phy mode. The argument is the ID of MAC instance.
 
 Optional properties:
  - local-mac-address: See ethernet.txt in the same directory.
@@ -47,6 +50,7 @@ Example:
 		clocks = <&sys_clk 6>;
 		reset-names = "ether";
 		resets = <&sys_rst 6>;
+		socionext,syscon-phy-mode = <&soc_glue 0>;
 		local-mac-address = [00 00 00 00 00 00];
 
 		mdio {
-- 
cgit v1.2.3-59-g8ed1b


From 57878f2f4697a4f4ae53586fa7565ed1a3275a2c Mon Sep 17 00:00:00 2001
From: Kunihiko Hayashi <hayashi.kunihiko@socionext.com>
Date: Thu, 19 Apr 2018 16:24:55 +0900
Subject: net: ethernet: ave: add support for phy-mode setting of system
 controller

This patch adds support for specifying system controller that configures
phy-mode setting.

According to the DT property "phy-mode", it's necessary to configure the
controller, which is used to choose the settings of the MAC suitable,
for example, mdio pin connections, internal clocks, and so on.

Supported phy-modes are SoC-dependent. The driver allows phy-mode to set
"internal" if the SoC has a built-in PHY, and {"mii", "rmii", "rgmii"}
if the SoC supports each mode. So we have to check whether the phy-mode
is valid or not.

This adds the following features for each SoC:
- check whether the SoC supports the specified phy-mode
- configure the controller accroding to phy-mode

The DT property accepts one argument to distinguish them for multiple MAC
instances.

ethernet@65000000 {
	...
	socionext,syscon-phy-mode = <&soc_glue 0>;
};

ethernet@65200000 {
	...
	socionext,syscon-phy-mode = <&soc_glue 1>;
};

Signed-off-by: Kunihiko Hayashi <hayashi.kunihiko@socionext.com>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/socionext/Kconfig   |   2 +
 drivers/net/ethernet/socionext/sni_ave.c | 150 ++++++++++++++++++++++++++++---
 2 files changed, 140 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/socionext/Kconfig b/drivers/net/ethernet/socionext/Kconfig
index 6bcfe27fc560..b80048ca82a0 100644
--- a/drivers/net/ethernet/socionext/Kconfig
+++ b/drivers/net/ethernet/socionext/Kconfig
@@ -14,6 +14,8 @@ if NET_VENDOR_SOCIONEXT
 config SNI_AVE
 	tristate "Socionext AVE ethernet support"
 	depends on (ARCH_UNIPHIER || COMPILE_TEST) && OF
+	depends on HAS_IOMEM
+	select MFD_SYSCON
 	select PHYLIB
 	---help---
 	  Driver for gigabit ethernet MACs, called AVE, in the
diff --git a/drivers/net/ethernet/socionext/sni_ave.c b/drivers/net/ethernet/socionext/sni_ave.c
index 52940bdd4ad3..f7ecceeb1e28 100644
--- a/drivers/net/ethernet/socionext/sni_ave.c
+++ b/drivers/net/ethernet/socionext/sni_ave.c
@@ -11,6 +11,7 @@
 #include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/iopoll.h>
+#include <linux/mfd/syscon.h>
 #include <linux/mii.h>
 #include <linux/module.h>
 #include <linux/netdevice.h>
@@ -18,6 +19,7 @@
 #include <linux/of_mdio.h>
 #include <linux/of_platform.h>
 #include <linux/phy.h>
+#include <linux/regmap.h>
 #include <linux/reset.h>
 #include <linux/types.h>
 #include <linux/u64_stats_sync.h>
@@ -197,6 +199,11 @@
 #define AVE_INTM_COUNT		20
 #define AVE_FORCE_TXINTCNT	1
 
+/* SG */
+#define SG_ETPINMODE		0x540
+#define SG_ETPINMODE_EXTPHY	BIT(1)	/* for LD11 */
+#define SG_ETPINMODE_RMII(ins)	BIT(ins)
+
 #define IS_DESC_64BIT(p)	((p)->data->is_desc_64bit)
 
 #define AVE_MAX_CLKS		4
@@ -228,12 +235,6 @@ struct ave_desc_info {
 	struct ave_desc *desc;	/* skb info related descriptor */
 };
 
-struct ave_soc_data {
-	bool	is_desc_64bit;
-	const char	*clock_names[AVE_MAX_CLKS];
-	const char	*reset_names[AVE_MAX_RSTS];
-};
-
 struct ave_stats {
 	struct	u64_stats_sync	syncp;
 	u64	packets;
@@ -257,6 +258,9 @@ struct ave_private {
 	phy_interface_t		phy_mode;
 	struct phy_device	*phydev;
 	struct mii_bus		*mdio;
+	struct regmap		*regmap;
+	unsigned int		pinmode_mask;
+	unsigned int		pinmode_val;
 
 	/* stats */
 	struct ave_stats	stats_rx;
@@ -279,6 +283,14 @@ struct ave_private {
 	const struct ave_soc_data *data;
 };
 
+struct ave_soc_data {
+	bool	is_desc_64bit;
+	const char	*clock_names[AVE_MAX_CLKS];
+	const char	*reset_names[AVE_MAX_RSTS];
+	int	(*get_pinmode)(struct ave_private *priv,
+			       phy_interface_t phy_mode, u32 arg);
+};
+
 static u32 ave_desc_read(struct net_device *ndev, enum desc_id id, int entry,
 			 int offset)
 {
@@ -1179,6 +1191,11 @@ static int ave_init(struct net_device *ndev)
 		}
 	}
 
+	ret = regmap_update_bits(priv->regmap, SG_ETPINMODE,
+				 priv->pinmode_mask, priv->pinmode_val);
+	if (ret)
+		return ret;
+
 	ave_global_reset(ndev);
 
 	mdio_np = of_get_child_by_name(np, "mdio");
@@ -1537,6 +1554,7 @@ static int ave_probe(struct platform_device *pdev)
 	const struct ave_soc_data *data;
 	struct device *dev = &pdev->dev;
 	char buf[ETHTOOL_FWVERS_LEN];
+	struct of_phandle_args args;
 	phy_interface_t phy_mode;
 	struct ave_private *priv;
 	struct net_device *ndev;
@@ -1559,12 +1577,6 @@ static int ave_probe(struct platform_device *pdev)
 		dev_err(dev, "phy-mode not found\n");
 		return -EINVAL;
 	}
-	if ((!phy_interface_mode_is_rgmii(phy_mode)) &&
-	    phy_mode != PHY_INTERFACE_MODE_RMII &&
-	    phy_mode != PHY_INTERFACE_MODE_MII) {
-		dev_err(dev, "phy-mode is invalid\n");
-		return -EINVAL;
-	}
 
 	irq = platform_get_irq(pdev, 0);
 	if (irq < 0) {
@@ -1656,6 +1668,26 @@ static int ave_probe(struct platform_device *pdev)
 		priv->nrsts++;
 	}
 
+	ret = of_parse_phandle_with_fixed_args(np,
+					       "socionext,syscon-phy-mode",
+					       1, 0, &args);
+	if (ret) {
+		netdev_err(ndev, "can't get syscon-phy-mode property\n");
+		goto out_free_netdev;
+	}
+	priv->regmap = syscon_node_to_regmap(args.np);
+	of_node_put(args.np);
+	if (IS_ERR(priv->regmap)) {
+		netdev_err(ndev, "can't map syscon-phy-mode\n");
+		ret = PTR_ERR(priv->regmap);
+		goto out_free_netdev;
+	}
+	ret = priv->data->get_pinmode(priv, phy_mode, args.args[0]);
+	if (ret) {
+		netdev_err(ndev, "invalid phy-mode setting\n");
+		goto out_free_netdev;
+	}
+
 	priv->mdio = devm_mdiobus_alloc(dev);
 	if (!priv->mdio) {
 		ret = -ENOMEM;
@@ -1715,6 +1747,95 @@ static int ave_remove(struct platform_device *pdev)
 	return 0;
 }
 
+static int ave_pro4_get_pinmode(struct ave_private *priv,
+				phy_interface_t phy_mode, u32 arg)
+{
+	if (arg > 0)
+		return -EINVAL;
+
+	priv->pinmode_mask = SG_ETPINMODE_RMII(0);
+
+	switch (phy_mode) {
+	case PHY_INTERFACE_MODE_RMII:
+		priv->pinmode_val = SG_ETPINMODE_RMII(0);
+		break;
+	case PHY_INTERFACE_MODE_MII:
+	case PHY_INTERFACE_MODE_RGMII:
+		priv->pinmode_val = 0;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int ave_ld11_get_pinmode(struct ave_private *priv,
+				phy_interface_t phy_mode, u32 arg)
+{
+	if (arg > 0)
+		return -EINVAL;
+
+	priv->pinmode_mask = SG_ETPINMODE_EXTPHY | SG_ETPINMODE_RMII(0);
+
+	switch (phy_mode) {
+	case PHY_INTERFACE_MODE_INTERNAL:
+		priv->pinmode_val = 0;
+		break;
+	case PHY_INTERFACE_MODE_RMII:
+		priv->pinmode_val = SG_ETPINMODE_EXTPHY | SG_ETPINMODE_RMII(0);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int ave_ld20_get_pinmode(struct ave_private *priv,
+				phy_interface_t phy_mode, u32 arg)
+{
+	if (arg > 0)
+		return -EINVAL;
+
+	priv->pinmode_mask = SG_ETPINMODE_RMII(0);
+
+	switch (phy_mode) {
+	case PHY_INTERFACE_MODE_RMII:
+		priv->pinmode_val = SG_ETPINMODE_RMII(0);
+		break;
+	case PHY_INTERFACE_MODE_RGMII:
+		priv->pinmode_val = 0;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int ave_pxs3_get_pinmode(struct ave_private *priv,
+				phy_interface_t phy_mode, u32 arg)
+{
+	if (arg > 1)
+		return -EINVAL;
+
+	priv->pinmode_mask = SG_ETPINMODE_RMII(arg);
+
+	switch (phy_mode) {
+	case PHY_INTERFACE_MODE_RMII:
+		priv->pinmode_val = SG_ETPINMODE_RMII(arg);
+		break;
+	case PHY_INTERFACE_MODE_RGMII:
+		priv->pinmode_val = 0;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static const struct ave_soc_data ave_pro4_data = {
 	.is_desc_64bit = false,
 	.clock_names = {
@@ -1723,6 +1844,7 @@ static const struct ave_soc_data ave_pro4_data = {
 	.reset_names = {
 		"gio", "ether",
 	},
+	.get_pinmode = ave_pro4_get_pinmode,
 };
 
 static const struct ave_soc_data ave_pxs2_data = {
@@ -1733,6 +1855,7 @@ static const struct ave_soc_data ave_pxs2_data = {
 	.reset_names = {
 		"ether",
 	},
+	.get_pinmode = ave_pro4_get_pinmode,
 };
 
 static const struct ave_soc_data ave_ld11_data = {
@@ -1743,6 +1866,7 @@ static const struct ave_soc_data ave_ld11_data = {
 	.reset_names = {
 		"ether",
 	},
+	.get_pinmode = ave_ld11_get_pinmode,
 };
 
 static const struct ave_soc_data ave_ld20_data = {
@@ -1753,6 +1877,7 @@ static const struct ave_soc_data ave_ld20_data = {
 	.reset_names = {
 		"ether",
 	},
+	.get_pinmode = ave_ld20_get_pinmode,
 };
 
 static const struct ave_soc_data ave_pxs3_data = {
@@ -1763,6 +1888,7 @@ static const struct ave_soc_data ave_pxs3_data = {
 	.reset_names = {
 		"ether",
 	},
+	.get_pinmode = ave_pxs3_get_pinmode,
 };
 
 static const struct of_device_id of_ave_match[] = {
-- 
cgit v1.2.3-59-g8ed1b


From cea395ac868dee9104aa4fff640486cf4b3c464c Mon Sep 17 00:00:00 2001
From: Intiyaz Basha <intiyaz.basha@cavium.com>
Date: Wed, 18 Apr 2018 23:18:28 -0700
Subject: liquidio: Added ndo_get_vf_stats support

Added the ndo to gather VF statistics through the PF.

Collect VF statistics via mailbox from VF.

Signed-off-by: Intiyaz Basha <intiyaz.basha@cavium.com>
Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../ethernet/cavium/liquidio/cn23xx_pf_device.c    | 54 ++++++++++++++++++++++
 .../ethernet/cavium/liquidio/cn23xx_pf_device.h    | 12 +++++
 drivers/net/ethernet/cavium/liquidio/lio_main.c    | 26 +++++++++++
 .../net/ethernet/cavium/liquidio/octeon_mailbox.c  | 52 +++++++++++++++++++++
 .../net/ethernet/cavium/liquidio/octeon_mailbox.h  |  7 +++
 5 files changed, 151 insertions(+)

diff --git a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c
index e8b290473ee2..bc9861c90ea3 100644
--- a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c
+++ b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c
@@ -1497,3 +1497,57 @@ void cn23xx_tell_vf_its_macaddr_changed(struct octeon_device *oct, int vfidx,
 		octeon_mbox_write(oct, &mbox_cmd);
 	}
 }
+
+static void
+cn23xx_get_vf_stats_callback(struct octeon_device *oct,
+			     struct octeon_mbox_cmd *cmd, void *arg)
+{
+	struct oct_vf_stats_ctx *ctx = arg;
+
+	memcpy(ctx->stats, cmd->data, sizeof(struct oct_vf_stats));
+	atomic_set(&ctx->status, 1);
+}
+
+int cn23xx_get_vf_stats(struct octeon_device *oct, int vfidx,
+			struct oct_vf_stats *stats)
+{
+	u32 timeout = HZ; // 1sec
+	struct octeon_mbox_cmd mbox_cmd;
+	struct oct_vf_stats_ctx ctx;
+	u32 count = 0, ret;
+
+	if (!(oct->sriov_info.vf_drv_loaded_mask & (1ULL << vfidx)))
+		return -1;
+
+	if (sizeof(struct oct_vf_stats) > sizeof(mbox_cmd.data))
+		return -1;
+
+	mbox_cmd.msg.u64 = 0;
+	mbox_cmd.msg.s.type = OCTEON_MBOX_REQUEST;
+	mbox_cmd.msg.s.resp_needed = 1;
+	mbox_cmd.msg.s.cmd = OCTEON_GET_VF_STATS;
+	mbox_cmd.msg.s.len = 1;
+	mbox_cmd.q_no = vfidx * oct->sriov_info.rings_per_vf;
+	mbox_cmd.recv_len = 0;
+	mbox_cmd.recv_status = 0;
+	mbox_cmd.fn = (octeon_mbox_callback_t)cn23xx_get_vf_stats_callback;
+	ctx.stats = stats;
+	atomic_set(&ctx.status, 0);
+	mbox_cmd.fn_arg = (void *)&ctx;
+	memset(mbox_cmd.data, 0, sizeof(mbox_cmd.data));
+	octeon_mbox_write(oct, &mbox_cmd);
+
+	do {
+		schedule_timeout_uninterruptible(1);
+	} while ((atomic_read(&ctx.status) == 0) && (count++ < timeout));
+
+	ret = atomic_read(&ctx.status);
+	if (ret == 0) {
+		octeon_mbox_cancel(oct, 0);
+		dev_err(&oct->pci_dev->dev, "Unable to get stats from VF-%d, timedout\n",
+			vfidx);
+		return -1;
+	}
+
+	return 0;
+}
diff --git a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.h b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.h
index 2aba5247b6d8..63b3de4f2bfe 100644
--- a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.h
+++ b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.h
@@ -43,6 +43,15 @@ struct octeon_cn23xx_pf {
 
 #define CN23XX_SLI_DEF_BP			0x40
 
+struct oct_vf_stats {
+	u64 rx_packets;
+	u64 tx_packets;
+	u64 rx_bytes;
+	u64 tx_bytes;
+	u64 broadcast;
+	u64 multicast;
+};
+
 int setup_cn23xx_octeon_pf_device(struct octeon_device *oct);
 
 int validate_cn23xx_pf_config_info(struct octeon_device *oct,
@@ -56,4 +65,7 @@ int cn23xx_fw_loaded(struct octeon_device *oct);
 
 void cn23xx_tell_vf_its_macaddr_changed(struct octeon_device *oct, int vfidx,
 					u8 *mac);
+
+int cn23xx_get_vf_stats(struct octeon_device *oct, int ifidx,
+			struct oct_vf_stats *stats);
 #endif
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index 44c2a0c1bbdd..f3891ae11b02 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -3326,6 +3326,31 @@ static const struct switchdev_ops lio_pf_switchdev_ops = {
 	.switchdev_port_attr_get = lio_pf_switchdev_attr_get,
 };
 
+static int liquidio_get_vf_stats(struct net_device *netdev, int vfidx,
+				 struct ifla_vf_stats *vf_stats)
+{
+	struct lio *lio = GET_LIO(netdev);
+	struct octeon_device *oct = lio->oct_dev;
+	struct oct_vf_stats stats;
+	int ret;
+
+	if (vfidx < 0 || vfidx >= oct->sriov_info.num_vfs_alloced)
+		return -EINVAL;
+
+	memset(&stats, 0, sizeof(struct oct_vf_stats));
+	ret = cn23xx_get_vf_stats(oct, vfidx, &stats);
+	if (!ret) {
+		vf_stats->rx_packets = stats.rx_packets;
+		vf_stats->tx_packets = stats.tx_packets;
+		vf_stats->rx_bytes = stats.rx_bytes;
+		vf_stats->tx_bytes = stats.tx_bytes;
+		vf_stats->broadcast = stats.broadcast;
+		vf_stats->multicast = stats.multicast;
+	}
+
+	return ret;
+}
+
 static const struct net_device_ops lionetdevops = {
 	.ndo_open		= liquidio_open,
 	.ndo_stop		= liquidio_stop,
@@ -3348,6 +3373,7 @@ static const struct net_device_ops lionetdevops = {
 	.ndo_get_vf_config	= liquidio_get_vf_config,
 	.ndo_set_vf_trust	= liquidio_set_vf_trust,
 	.ndo_set_vf_link_state  = liquidio_set_vf_link_state,
+	.ndo_get_vf_stats	= liquidio_get_vf_stats,
 };
 
 /** \brief Entry point for the liquidio module
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_mailbox.c b/drivers/net/ethernet/cavium/liquidio/octeon_mailbox.c
index 28e74ee23ff8..021d99cd1665 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_mailbox.c
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_mailbox.c
@@ -24,6 +24,7 @@
 #include "octeon_device.h"
 #include "octeon_main.h"
 #include "octeon_mailbox.h"
+#include "cn23xx_pf_device.h"
 
 /**
  * octeon_mbox_read:
@@ -205,6 +206,26 @@ int octeon_mbox_write(struct octeon_device *oct,
 	return ret;
 }
 
+static void get_vf_stats(struct octeon_device *oct,
+			 struct oct_vf_stats *stats)
+{
+	int i;
+
+	for (i = 0; i < oct->num_iqs; i++) {
+		if (!oct->instr_queue[i])
+			continue;
+		stats->tx_packets += oct->instr_queue[i]->stats.tx_done;
+		stats->tx_bytes += oct->instr_queue[i]->stats.tx_tot_bytes;
+	}
+
+	for (i = 0; i < oct->num_oqs; i++) {
+		if (!oct->droq[i])
+			continue;
+		stats->rx_packets += oct->droq[i]->stats.rx_pkts_received;
+		stats->rx_bytes += oct->droq[i]->stats.rx_bytes_received;
+	}
+}
+
 /**
  * octeon_mbox_process_cmd:
  * @mbox: Pointer mailbox
@@ -250,6 +271,15 @@ static int octeon_mbox_process_cmd(struct octeon_mbox *mbox,
 						     mbox_cmd->msg.s.params);
 		break;
 
+	case OCTEON_GET_VF_STATS:
+		dev_dbg(&oct->pci_dev->dev, "Got VF stats request. Sending data back\n");
+		mbox_cmd->msg.s.type = OCTEON_MBOX_RESPONSE;
+		mbox_cmd->msg.s.resp_needed = 1;
+		mbox_cmd->msg.s.len = 1 +
+			sizeof(struct oct_vf_stats) / sizeof(u64);
+		get_vf_stats(oct, (struct oct_vf_stats *)mbox_cmd->data);
+		octeon_mbox_write(oct, mbox_cmd);
+		break;
 	default:
 		break;
 	}
@@ -322,3 +352,25 @@ int octeon_mbox_process_message(struct octeon_mbox *mbox)
 
 	return 0;
 }
+
+int octeon_mbox_cancel(struct octeon_device *oct, int q_no)
+{
+	struct octeon_mbox *mbox = oct->mbox[q_no];
+	struct octeon_mbox_cmd *mbox_cmd;
+	unsigned long flags = 0;
+
+	spin_lock_irqsave(&mbox->lock, flags);
+	mbox_cmd = &mbox->mbox_resp;
+
+	if (!(mbox->state & OCTEON_MBOX_STATE_RESPONSE_PENDING)) {
+		spin_unlock_irqrestore(&mbox->lock, flags);
+		return 1;
+	}
+
+	mbox->state = OCTEON_MBOX_STATE_IDLE;
+	memset(mbox_cmd, 0, sizeof(*mbox_cmd));
+	writeq(OCTEON_PFVFSIG, mbox->mbox_read_reg);
+	spin_unlock_irqrestore(&mbox->lock, flags);
+
+	return 0;
+}
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_mailbox.h b/drivers/net/ethernet/cavium/liquidio/octeon_mailbox.h
index 1def22afeff1..d92bd7e16477 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_mailbox.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_mailbox.h
@@ -25,6 +25,7 @@
 #define OCTEON_VF_ACTIVE		0x1
 #define OCTEON_VF_FLR_REQUEST		0x2
 #define OCTEON_PF_CHANGED_VF_MACADDR	0x4
+#define OCTEON_GET_VF_STATS		0x8
 
 /*Macro for Read acknowldgement*/
 #define OCTEON_PFVFACK			0xffffffffffffffffULL
@@ -107,9 +108,15 @@ struct octeon_mbox {
 
 };
 
+struct oct_vf_stats_ctx {
+	atomic_t status;
+	struct oct_vf_stats *stats;
+};
+
 int octeon_mbox_read(struct octeon_mbox *mbox);
 int octeon_mbox_write(struct octeon_device *oct,
 		      struct octeon_mbox_cmd *mbox_cmd);
 int octeon_mbox_process_message(struct octeon_mbox *mbox);
+int octeon_mbox_cancel(struct octeon_device *oct, int q_no);
 
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From a4dfa72d0acd1c99a160e25c099849ae37ad13fd Mon Sep 17 00:00:00 2001
From: GhantaKrishnamurthy MohanKrishna
 <mohan.krishna.ghanta.krishnamurthy@ericsson.com>
Date: Thu, 19 Apr 2018 11:06:18 +0200
Subject: tipc: set default MTU for UDP media

Currently, all bearers are configured with MTU value same as the
underlying L2 device. However, in case of bearers with media type
UDP, higher throughput is possible with a fixed and higher emulated
MTU value than adapting to the underlying L2 MTU.

In this commit, we introduce a parameter mtu in struct tipc_media
and a default value is set for UDP. A default value of 14k
was determined by experimentation and found to have a higher throughput
than 16k. MTU for UDP bearers are assigned the above set value of
media MTU.

Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: GhantaKrishnamurthy MohanKrishna <mohan.krishna.ghanta.krishnamurthy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tipc_config.h | 5 +++++
 net/tipc/udp_media.c             | 4 ++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/tipc_config.h b/include/uapi/linux/tipc_config.h
index 3f29e3c8ed06..4b2c93b1934c 100644
--- a/include/uapi/linux/tipc_config.h
+++ b/include/uapi/linux/tipc_config.h
@@ -185,6 +185,11 @@
 #define TIPC_DEF_LINK_WIN 50
 #define TIPC_MAX_LINK_WIN 8191
 
+/*
+ * Default MTU for UDP media
+ */
+
+#define TIPC_DEF_LINK_UDP_MTU 14000
 
 struct tipc_node_info {
 	__be32 addr;			/* network address of node */
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index e7d91f5d5cae..9783101bc4a9 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -713,8 +713,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
 			err = -EINVAL;
 			goto err;
 		}
-		b->mtu = dev->mtu - sizeof(struct iphdr)
-			- sizeof(struct udphdr);
+		b->mtu = b->media->mtu;
 #if IS_ENABLED(CONFIG_IPV6)
 	} else if (local.proto == htons(ETH_P_IPV6)) {
 		udp_conf.family = AF_INET6;
@@ -803,6 +802,7 @@ struct tipc_media udp_media_info = {
 	.priority	= TIPC_DEF_LINK_PRI,
 	.tolerance	= TIPC_DEF_LINK_TOL,
 	.window		= TIPC_DEF_LINK_WIN,
+	.mtu		= TIPC_DEF_LINK_UDP_MTU,
 	.type_id	= TIPC_MEDIA_TYPE_UDP,
 	.hwaddr_len	= 0,
 	.name		= "udp"
-- 
cgit v1.2.3-59-g8ed1b


From 901271e0403af638c224987c2a4e55cebade7e91 Mon Sep 17 00:00:00 2001
From: GhantaKrishnamurthy MohanKrishna
 <mohan.krishna.ghanta.krishnamurthy@ericsson.com>
Date: Thu, 19 Apr 2018 11:06:19 +0200
Subject: tipc: implement configuration of UDP media MTU

In previous commit, we changed the default emulated MTU for UDP bearers
to 14k.

This commit adds the functionality to set/change the default value
by configuring new MTU for UDP media. UDP bearer(s) have to be disabled
and enabled back for the new MTU to take effect.

Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: GhantaKrishnamurthy MohanKrishna <mohan.krishna.ghanta.krishnamurthy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tipc_netlink.h |  1 +
 net/tipc/bearer.c                 | 13 +++++++++++++
 net/tipc/bearer.h                 |  3 +++
 net/tipc/udp_media.h              | 14 ++++++++++++++
 4 files changed, 31 insertions(+)

diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h
index 0affb682e5e3..85c11982c89b 100644
--- a/include/uapi/linux/tipc_netlink.h
+++ b/include/uapi/linux/tipc_netlink.h
@@ -266,6 +266,7 @@ enum {
 	TIPC_NLA_PROP_PRIO,		/* u32 */
 	TIPC_NLA_PROP_TOL,		/* u32 */
 	TIPC_NLA_PROP_WIN,		/* u32 */
+	TIPC_NLA_PROP_MTU,		/* u32 */
 
 	__TIPC_NLA_PROP_MAX,
 	TIPC_NLA_PROP_MAX = __TIPC_NLA_PROP_MAX - 1
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index f7d47c89d658..a22caf9e5a18 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -1029,6 +1029,9 @@ static int __tipc_nl_add_media(struct tipc_nl_msg *msg,
 		goto prop_msg_full;
 	if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->window))
 		goto prop_msg_full;
+	if (media->type_id == TIPC_MEDIA_TYPE_UDP)
+		if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, media->mtu))
+			goto prop_msg_full;
 
 	nla_nest_end(msg->skb, prop);
 	nla_nest_end(msg->skb, attrs);
@@ -1158,6 +1161,16 @@ int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info)
 			m->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]);
 		if (props[TIPC_NLA_PROP_WIN])
 			m->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]);
+		if (props[TIPC_NLA_PROP_MTU]) {
+			if (m->type_id != TIPC_MEDIA_TYPE_UDP)
+				return -EINVAL;
+#ifdef CONFIG_TIPC_MEDIA_UDP
+			if (tipc_udp_mtu_bad(nla_get_u32
+					     (props[TIPC_NLA_PROP_MTU])))
+				return -EINVAL;
+			m->mtu = nla_get_u32(props[TIPC_NLA_PROP_MTU]);
+#endif
+		}
 	}
 
 	return 0;
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index 6efcee63a381..394290cbbb1d 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -94,6 +94,8 @@ struct tipc_bearer;
  * @priority: default link (and bearer) priority
  * @tolerance: default time (in ms) before declaring link failure
  * @window: default window (in packets) before declaring link congestion
+ * @mtu: max packet size bearer can support for media type not dependent on
+ * underlying device MTU
  * @type_id: TIPC media identifier
  * @hwaddr_len: TIPC media address len
  * @name: media name
@@ -118,6 +120,7 @@ struct tipc_media {
 	u32 priority;
 	u32 tolerance;
 	u32 window;
+	u32 mtu;
 	u32 type_id;
 	u32 hwaddr_len;
 	char name[TIPC_MAX_MEDIA_NAME];
diff --git a/net/tipc/udp_media.h b/net/tipc/udp_media.h
index 281bbae87726..e7455cc73e16 100644
--- a/net/tipc/udp_media.h
+++ b/net/tipc/udp_media.h
@@ -38,9 +38,23 @@
 #ifndef _TIPC_UDP_MEDIA_H
 #define _TIPC_UDP_MEDIA_H
 
+#include <linux/ip.h>
+#include <linux/udp.h>
+
 int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr);
 int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b);
 int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb);
 
+/* check if configured MTU is too low for tipc headers */
+static inline bool tipc_udp_mtu_bad(u32 mtu)
+{
+	if (mtu >= (TIPC_MIN_BEARER_MTU + sizeof(struct iphdr) +
+	    sizeof(struct udphdr)))
+		return false;
+
+	pr_warn("MTU too low for tipc bearer\n");
+	return true;
+}
+
 #endif
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 682cd3cf946b66bace4aa1037f49f0093ff182ce Mon Sep 17 00:00:00 2001
From: GhantaKrishnamurthy MohanKrishna
 <mohan.krishna.ghanta.krishnamurthy@ericsson.com>
Date: Thu, 19 Apr 2018 11:06:20 +0200
Subject: tipc: confgiure and apply UDP bearer MTU on running links

Currently, we have option to configure MTU of UDP media. The configured
MTU takes effect on the links going up after that moment. I.e, a user
has to reset bearer to have new value applied across its links. This is
confusing and disturbing on a running cluster.

We now introduce the functionality to change the default UDP bearer MTU
in struct tipc_bearer. Additionally, the links are updated dynamically,
without any need for a reset, when bearer value is changed. We leverage
the existing per-link functionality and the design being symetrical to
the confguration of link tolerance.

Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: GhantaKrishnamurthy MohanKrishna <mohan.krishna.ghanta.krishnamurthy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/bearer.c | 16 +++++++++++++++-
 net/tipc/node.c   | 12 +++++++++---
 net/tipc/node.h   |  2 +-
 3 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index a22caf9e5a18..2dfb492a7c94 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -697,6 +697,9 @@ static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg,
 		goto prop_msg_full;
 	if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bearer->window))
 		goto prop_msg_full;
+	if (bearer->media->type_id == TIPC_MEDIA_TYPE_UDP)
+		if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, bearer->mtu))
+			goto prop_msg_full;
 
 	nla_nest_end(msg->skb, prop);
 
@@ -979,12 +982,23 @@ int __tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info)
 
 		if (props[TIPC_NLA_PROP_TOL]) {
 			b->tolerance = nla_get_u32(props[TIPC_NLA_PROP_TOL]);
-			tipc_node_apply_tolerance(net, b);
+			tipc_node_apply_property(net, b, TIPC_NLA_PROP_TOL);
 		}
 		if (props[TIPC_NLA_PROP_PRIO])
 			b->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]);
 		if (props[TIPC_NLA_PROP_WIN])
 			b->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]);
+		if (props[TIPC_NLA_PROP_MTU]) {
+			if (b->media->type_id != TIPC_MEDIA_TYPE_UDP)
+				return -EINVAL;
+#ifdef CONFIG_TIPC_MEDIA_UDP
+			if (tipc_udp_mtu_bad(nla_get_u32
+					     (props[TIPC_NLA_PROP_MTU])))
+				return -EINVAL;
+			b->mtu = nla_get_u32(props[TIPC_NLA_PROP_MTU]);
+			tipc_node_apply_property(net, b, TIPC_NLA_PROP_MTU);
+#endif
+		}
 	}
 
 	return 0;
diff --git a/net/tipc/node.c b/net/tipc/node.c
index c77dd2f3c589..b71e4e376bb9 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -1681,7 +1681,8 @@ discard:
 	kfree_skb(skb);
 }
 
-void tipc_node_apply_tolerance(struct net *net, struct tipc_bearer *b)
+void tipc_node_apply_property(struct net *net, struct tipc_bearer *b,
+			      int prop)
 {
 	struct tipc_net *tn = tipc_net(net);
 	int bearer_id = b->identity;
@@ -1696,8 +1697,13 @@ void tipc_node_apply_tolerance(struct net *net, struct tipc_bearer *b)
 	list_for_each_entry_rcu(n, &tn->node_list, list) {
 		tipc_node_write_lock(n);
 		e = &n->links[bearer_id];
-		if (e->link)
-			tipc_link_set_tolerance(e->link, b->tolerance, &xmitq);
+		if (e->link) {
+			if (prop == TIPC_NLA_PROP_TOL)
+				tipc_link_set_tolerance(e->link, b->tolerance,
+							&xmitq);
+			else if (prop == TIPC_NLA_PROP_MTU)
+				tipc_link_set_mtu(e->link, b->mtu);
+		}
 		tipc_node_write_unlock(n);
 		tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr);
 	}
diff --git a/net/tipc/node.h b/net/tipc/node.h
index f24b83500df1..bb271a37c93f 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -67,7 +67,7 @@ void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128,
 			  struct tipc_media_addr *maddr,
 			  bool *respond, bool *dupl_addr);
 void tipc_node_delete_links(struct net *net, int bearer_id);
-void tipc_node_apply_tolerance(struct net *net, struct tipc_bearer *b);
+void tipc_node_apply_property(struct net *net, struct tipc_bearer *b, int prop);
 int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 node,
 			   char *linkname, size_t len);
 int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode,
-- 
cgit v1.2.3-59-g8ed1b


From f1d22a1e0595639bcfef9941ebc0e3cfe9711009 Mon Sep 17 00:00:00 2001
From: George Wilkie <gwilkie@vyatta.att-mail.com>
Date: Thu, 19 Apr 2018 11:34:14 +0100
Subject: team: account for oper state

Account for operational state when determining port linkup state,
as per Documentation/networking/operstates.txt.

Signed-off-by: George Wilkie <gwilkie@vyatta.att-mail.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/team/team.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index a6c6ce19eeee..8a8611095ca0 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -2918,7 +2918,7 @@ static int team_device_event(struct notifier_block *unused,
 	case NETDEV_CHANGE:
 		if (netif_running(port->dev))
 			team_port_change_check(port,
-					       !!netif_carrier_ok(port->dev));
+					       !!netif_oper_up(port->dev));
 		break;
 	case NETDEV_UNREGISTER:
 		team_del_slave(port->team->dev, dev);
-- 
cgit v1.2.3-59-g8ed1b


From 4c52a889ab8ea7e965dc0ba40968768c5e91972f Mon Sep 17 00:00:00 2001
From: Alexey Kodanev <alexey.kodanev@oracle.com>
Date: Thu, 19 Apr 2018 15:42:29 +0300
Subject: geneve: remove white-space before '#if IS_ENABLED(CONFIG_IPV6)'

Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index b919e89a9b93..45acdc9f986e 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -1261,7 +1261,7 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[],
 	}
 
 	if (data[IFLA_GENEVE_REMOTE6]) {
- #if IS_ENABLED(CONFIG_IPV6)
+#if IS_ENABLED(CONFIG_IPV6)
 		if (changelink && (ip_tunnel_info_af(info) == AF_INET)) {
 			attrtype = IFLA_GENEVE_REMOTE6;
 			goto change_notsup;
-- 
cgit v1.2.3-59-g8ed1b


From 5edbea6987b305346a6ba9cf854e148b4ae814f0 Mon Sep 17 00:00:00 2001
From: Alexey Kodanev <alexey.kodanev@oracle.com>
Date: Thu, 19 Apr 2018 15:42:30 +0300
Subject: geneve: cleanup hard coded value for Ethernet header length

Use ETH_HLEN instead and introduce two new macros: GENEVE_IPV4_HLEN
and GENEVE_IPV6_HLEN that include Ethernet header length, corresponded
IP header length and GENEVE_BASE_HLEN.

Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 45acdc9f986e..b650f845af25 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -36,6 +36,8 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
 
 #define GENEVE_VER 0
 #define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr))
+#define GENEVE_IPV4_HLEN (ETH_HLEN + sizeof(struct iphdr) + GENEVE_BASE_HLEN)
+#define GENEVE_IPV6_HLEN (ETH_HLEN + sizeof(struct ipv6hdr) + GENEVE_BASE_HLEN)
 
 /* per-network namespace private data for this module */
 struct geneve_net {
@@ -826,8 +828,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 		return PTR_ERR(rt);
 
 	if (skb_dst(skb)) {
-		int mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr) -
-			  GENEVE_BASE_HLEN - info->options_len - 14;
+		int mtu = dst_mtu(&rt->dst) - GENEVE_IPV4_HLEN -
+			  info->options_len;
 
 		skb_dst_update_pmtu(skb, mtu);
 	}
@@ -872,8 +874,7 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 		return PTR_ERR(dst);
 
 	if (skb_dst(skb)) {
-		int mtu = dst_mtu(dst) - sizeof(struct ipv6hdr) -
-			  GENEVE_BASE_HLEN - info->options_len - 14;
+		int mtu = dst_mtu(dst) - GENEVE_IPV6_HLEN - info->options_len;
 
 		skb_dst_update_pmtu(skb, mtu);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 321acc1c68c13ce113531a8bc3dc5d43814c4ae2 Mon Sep 17 00:00:00 2001
From: Alexey Kodanev <alexey.kodanev@oracle.com>
Date: Thu, 19 Apr 2018 15:42:31 +0300
Subject: geneve: check MTU for a minimum in geneve_change_mtu()

geneve_change_mtu() will be used not only as ndo_change_mtu() callback,
but also to verify a user specified MTU on a new link creation in the
next patch.

Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index b650f845af25..ae649f699a92 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -942,11 +942,10 @@ tx_error:
 
 static int geneve_change_mtu(struct net_device *dev, int new_mtu)
 {
-	/* Only possible if called internally, ndo_change_mtu path's new_mtu
-	 * is guaranteed to be between dev->min_mtu and dev->max_mtu.
-	 */
 	if (new_mtu > dev->max_mtu)
 		new_mtu = dev->max_mtu;
+	else if (new_mtu < dev->min_mtu)
+		new_mtu = dev->min_mtu;
 
 	dev->mtu = new_mtu;
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From c40e89fd358e94a55d6c1475afbea17b5580f601 Mon Sep 17 00:00:00 2001
From: Alexey Kodanev <alexey.kodanev@oracle.com>
Date: Thu, 19 Apr 2018 15:42:32 +0300
Subject: geneve: configure MTU based on a lower device

Currently, on a new link creation or when 'remote' address parameter
is updated, an MTU is not changed and always equals 1500. When a lower
device has a larger MTU, it might not be efficient, e.g. for UDP, and
requires the manual MTU adjustments to match the MTU of the lower
device.

This patch tries to automate this process, finds a lower device using
the 'remote' address parameter, then uses its MTU to tune GENEVE's MTU:
  * on a new link creation
  * when 'remote' parameter is changed

Also with this patch, the MTU from a user, on a new link creation, is
passed to geneve_change_mtu() where it is verified, and MTU adjustments
with a lower device is skipped in that case. Prior that change, it was
possible to set the invalid MTU values on a new link creation.

Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 53 insertions(+), 3 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index ae649f699a92..750eaa53bf0c 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -1387,6 +1387,48 @@ change_notsup:
 	return -EOPNOTSUPP;
 }
 
+static void geneve_link_config(struct net_device *dev,
+			       struct ip_tunnel_info *info, struct nlattr *tb[])
+{
+	struct geneve_dev *geneve = netdev_priv(dev);
+	int ldev_mtu = 0;
+
+	if (tb[IFLA_MTU]) {
+		geneve_change_mtu(dev, nla_get_u32(tb[IFLA_MTU]));
+		return;
+	}
+
+	switch (ip_tunnel_info_af(info)) {
+	case AF_INET: {
+		struct flowi4 fl4 = { .daddr = info->key.u.ipv4.dst };
+		struct rtable *rt = ip_route_output_key(geneve->net, &fl4);
+
+		if (!IS_ERR(rt) && rt->dst.dev) {
+			ldev_mtu = rt->dst.dev->mtu - GENEVE_IPV4_HLEN;
+			ip_rt_put(rt);
+		}
+		break;
+	}
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6: {
+		struct rt6_info *rt = rt6_lookup(geneve->net,
+						 &info->key.u.ipv6.dst, NULL, 0,
+						 NULL, 0);
+
+		if (rt && rt->dst.dev)
+			ldev_mtu = rt->dst.dev->mtu - GENEVE_IPV6_HLEN;
+		ip6_rt_put(rt);
+		break;
+	}
+#endif
+	}
+
+	if (ldev_mtu <= 0)
+		return;
+
+	geneve_change_mtu(dev, ldev_mtu - info->options_len);
+}
+
 static int geneve_newlink(struct net *net, struct net_device *dev,
 			  struct nlattr *tb[], struct nlattr *data[],
 			  struct netlink_ext_ack *extack)
@@ -1402,8 +1444,14 @@ static int geneve_newlink(struct net *net, struct net_device *dev,
 	if (err)
 		return err;
 
-	return geneve_configure(net, dev, extack, &info, metadata,
-				use_udp6_rx_checksums);
+	err = geneve_configure(net, dev, extack, &info, metadata,
+			       use_udp6_rx_checksums);
+	if (err)
+		return err;
+
+	geneve_link_config(dev, &info, tb);
+
+	return 0;
 }
 
 /* Quiesces the geneve device data path for both TX and RX.
@@ -1477,8 +1525,10 @@ static int geneve_changelink(struct net_device *dev, struct nlattr *tb[],
 	if (err)
 		return err;
 
-	if (!geneve_dst_addr_equal(&geneve->info, &info))
+	if (!geneve_dst_addr_equal(&geneve->info, &info)) {
 		dst_cache_reset(&info.dst_cache);
+		geneve_link_config(dev, &info, tb);
+	}
 
 	geneve_quiesce(geneve, &gs4, &gs6);
 	geneve->info = info;
-- 
cgit v1.2.3-59-g8ed1b


From 7425d8220f8d0c2127aec75677f652a26f86bc95 Mon Sep 17 00:00:00 2001
From: Shahed Shaikh <shahed.shaikh@cavium.com>
Date: Thu, 19 Apr 2018 05:50:11 -0700
Subject: qed* : use trust mode to allow VF to override forced MAC

As per existing behavior, when PF sets a MAC address for a VF
(also called as forced MAC), VF is not allowed to change its
MAC address afterwards.
This puts the limitation on few use cases such as bonding of VFs,
where bonding driver asks VF to change its MAC address.

This patch uses a VF trust mode to allow VF to change its MAC address
in spite PF has set a forced MAC for that VF.

Signed-off-by: Shahed Shaikh <shahed.shaikh@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_sriov.c    | 210 +++++++++++++++++++++++--
 drivers/net/ethernet/qlogic/qede/qede_filter.c |   3 +-
 2 files changed, 195 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index 5acb91b3564c..77376fda8eae 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -48,7 +48,7 @@ static int qed_sriov_eqe_event(struct qed_hwfn *p_hwfn,
 			       u8 opcode,
 			       __le16 echo,
 			       union event_ring_data *data, u8 fw_return_code);
-
+static int qed_iov_bulletin_set_mac(struct qed_hwfn *p_hwfn, u8 *mac, int vfid);
 
 static u8 qed_vf_calculate_legacy(struct qed_vf_info *p_vf)
 {
@@ -1790,7 +1790,8 @@ static int qed_iov_configure_vport_forced(struct qed_hwfn *p_hwfn,
 	if (!p_vf->vport_instance)
 		return -EINVAL;
 
-	if (events & BIT(MAC_ADDR_FORCED)) {
+	if ((events & BIT(MAC_ADDR_FORCED)) ||
+	    p_vf->p_vf_info.is_trusted_configured) {
 		/* Since there's no way [currently] of removing the MAC,
 		 * we can always assume this means we need to force it.
 		 */
@@ -1809,8 +1810,12 @@ static int qed_iov_configure_vport_forced(struct qed_hwfn *p_hwfn,
 				  "PF failed to configure MAC for VF\n");
 			return rc;
 		}
-
-		p_vf->configured_features |= 1 << MAC_ADDR_FORCED;
+		if (p_vf->p_vf_info.is_trusted_configured)
+			p_vf->configured_features |=
+				BIT(VFPF_BULLETIN_MAC_ADDR);
+		else
+			p_vf->configured_features |=
+				BIT(MAC_ADDR_FORCED);
 	}
 
 	if (events & BIT(VLAN_ADDR_FORCED)) {
@@ -3170,6 +3175,10 @@ static int qed_iov_vf_update_mac_shadow(struct qed_hwfn *p_hwfn,
 	if (p_vf->bulletin.p_virt->valid_bitmap & BIT(MAC_ADDR_FORCED))
 		return 0;
 
+	/* Don't keep track of shadow copy since we don't intend to restore. */
+	if (p_vf->p_vf_info.is_trusted_configured)
+		return 0;
+
 	/* First remove entries and then add new ones */
 	if (p_params->opcode == QED_FILTER_REMOVE) {
 		for (i = 0; i < QED_ETH_VF_NUM_MAC_FILTERS; i++) {
@@ -3244,9 +3253,17 @@ static int qed_iov_chk_ucast(struct qed_hwfn *hwfn,
 
 	/* No real decision to make; Store the configured MAC */
 	if (params->type == QED_FILTER_MAC ||
-	    params->type == QED_FILTER_MAC_VLAN)
+	    params->type == QED_FILTER_MAC_VLAN) {
 		ether_addr_copy(vf->mac, params->mac);
 
+		if (vf->is_trusted_configured) {
+			qed_iov_bulletin_set_mac(hwfn, vf->mac, vfid);
+
+			/* Update and post bulleitin again */
+			qed_schedule_iov(hwfn, QED_IOV_WQ_BULLETIN_UPDATE_FLAG);
+		}
+	}
+
 	return 0;
 }
 
@@ -4081,16 +4098,60 @@ static void qed_iov_bulletin_set_forced_mac(struct qed_hwfn *p_hwfn,
 		return;
 	}
 
-	feature = 1 << MAC_ADDR_FORCED;
+	if (vf_info->p_vf_info.is_trusted_configured) {
+		feature = BIT(VFPF_BULLETIN_MAC_ADDR);
+		/* Trust mode will disable Forced MAC */
+		vf_info->bulletin.p_virt->valid_bitmap &=
+			~BIT(MAC_ADDR_FORCED);
+	} else {
+		feature = BIT(MAC_ADDR_FORCED);
+		/* Forced MAC will disable MAC_ADDR */
+		vf_info->bulletin.p_virt->valid_bitmap &=
+			~BIT(VFPF_BULLETIN_MAC_ADDR);
+	}
+
 	memcpy(vf_info->bulletin.p_virt->mac, mac, ETH_ALEN);
 
 	vf_info->bulletin.p_virt->valid_bitmap |= feature;
-	/* Forced MAC will disable MAC_ADDR */
-	vf_info->bulletin.p_virt->valid_bitmap &= ~BIT(VFPF_BULLETIN_MAC_ADDR);
 
 	qed_iov_configure_vport_forced(p_hwfn, vf_info, feature);
 }
 
+static int qed_iov_bulletin_set_mac(struct qed_hwfn *p_hwfn, u8 *mac, int vfid)
+{
+	struct qed_vf_info *vf_info;
+	u64 feature;
+
+	vf_info = qed_iov_get_vf_info(p_hwfn, (u16)vfid, true);
+	if (!vf_info) {
+		DP_NOTICE(p_hwfn->cdev, "Can not set MAC, invalid vfid [%d]\n",
+			  vfid);
+		return -EINVAL;
+	}
+
+	if (vf_info->b_malicious) {
+		DP_NOTICE(p_hwfn->cdev, "Can't set MAC to malicious VF [%d]\n",
+			  vfid);
+		return -EINVAL;
+	}
+
+	if (vf_info->bulletin.p_virt->valid_bitmap & BIT(MAC_ADDR_FORCED)) {
+		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+			   "Can not set MAC, Forced MAC is configured\n");
+		return -EINVAL;
+	}
+
+	feature = BIT(VFPF_BULLETIN_MAC_ADDR);
+	ether_addr_copy(vf_info->bulletin.p_virt->mac, mac);
+
+	vf_info->bulletin.p_virt->valid_bitmap |= feature;
+
+	if (vf_info->p_vf_info.is_trusted_configured)
+		qed_iov_configure_vport_forced(p_hwfn, vf_info, feature);
+
+	return 0;
+}
+
 static void qed_iov_bulletin_set_forced_vlan(struct qed_hwfn *p_hwfn,
 					     u16 pvid, int vfid)
 {
@@ -4204,6 +4265,21 @@ out:
 	return rc;
 }
 
+static u8 *qed_iov_bulletin_get_mac(struct qed_hwfn *p_hwfn, u16 rel_vf_id)
+{
+	struct qed_vf_info *p_vf;
+
+	p_vf = qed_iov_get_vf_info(p_hwfn, rel_vf_id, true);
+	if (!p_vf || !p_vf->bulletin.p_virt)
+		return NULL;
+
+	if (!(p_vf->bulletin.p_virt->valid_bitmap &
+	      BIT(VFPF_BULLETIN_MAC_ADDR)))
+		return NULL;
+
+	return p_vf->bulletin.p_virt->mac;
+}
+
 static u8 *qed_iov_bulletin_get_forced_mac(struct qed_hwfn *p_hwfn,
 					   u16 rel_vf_id)
 {
@@ -4493,8 +4569,12 @@ static int qed_sriov_pf_set_mac(struct qed_dev *cdev, u8 *mac, int vfid)
 		if (!vf_info)
 			continue;
 
-		/* Set the forced MAC, and schedule the IOV task */
-		ether_addr_copy(vf_info->forced_mac, mac);
+		/* Set the MAC, and schedule the IOV task */
+		if (vf_info->is_trusted_configured)
+			ether_addr_copy(vf_info->mac, mac);
+		else
+			ether_addr_copy(vf_info->forced_mac, mac);
+
 		qed_schedule_iov(hwfn, QED_IOV_WQ_SET_UNICAST_FILTER_FLAG);
 	}
 
@@ -4802,6 +4882,33 @@ static void qed_handle_vf_msg(struct qed_hwfn *hwfn)
 	qed_ptt_release(hwfn, ptt);
 }
 
+static bool qed_pf_validate_req_vf_mac(struct qed_hwfn *hwfn,
+				       u8 *mac,
+				       struct qed_public_vf_info *info)
+{
+	if (info->is_trusted_configured) {
+		if (is_valid_ether_addr(info->mac) &&
+		    (!mac || !ether_addr_equal(mac, info->mac)))
+			return true;
+	} else {
+		if (is_valid_ether_addr(info->forced_mac) &&
+		    (!mac || !ether_addr_equal(mac, info->forced_mac)))
+			return true;
+	}
+
+	return false;
+}
+
+static void qed_set_bulletin_mac(struct qed_hwfn *hwfn,
+				 struct qed_public_vf_info *info,
+				 int vfid)
+{
+	if (info->is_trusted_configured)
+		qed_iov_bulletin_set_mac(hwfn, info->mac, vfid);
+	else
+		qed_iov_bulletin_set_forced_mac(hwfn, info->forced_mac, vfid);
+}
+
 static void qed_handle_pf_set_vf_unicast(struct qed_hwfn *hwfn)
 {
 	int i;
@@ -4816,18 +4923,20 @@ static void qed_handle_pf_set_vf_unicast(struct qed_hwfn *hwfn)
 			continue;
 
 		/* Update data on bulletin board */
-		mac = qed_iov_bulletin_get_forced_mac(hwfn, i);
-		if (is_valid_ether_addr(info->forced_mac) &&
-		    (!mac || !ether_addr_equal(mac, info->forced_mac))) {
+		if (info->is_trusted_configured)
+			mac = qed_iov_bulletin_get_mac(hwfn, i);
+		else
+			mac = qed_iov_bulletin_get_forced_mac(hwfn, i);
+
+		if (qed_pf_validate_req_vf_mac(hwfn, mac, info)) {
 			DP_VERBOSE(hwfn,
 				   QED_MSG_IOV,
 				   "Handling PF setting of VF MAC to VF 0x%02x [Abs 0x%02x]\n",
 				   i,
 				   hwfn->cdev->p_iov_info->first_vf_in_pf + i);
 
-			/* Update bulletin board with forced MAC */
-			qed_iov_bulletin_set_forced_mac(hwfn,
-							info->forced_mac, i);
+			/* Update bulletin board with MAC */
+			qed_set_bulletin_mac(hwfn, info, i);
 			update = true;
 		}
 
@@ -4867,6 +4976,72 @@ static void qed_handle_bulletin_post(struct qed_hwfn *hwfn)
 	qed_ptt_release(hwfn, ptt);
 }
 
+static void qed_update_mac_for_vf_trust_change(struct qed_hwfn *hwfn, int vf_id)
+{
+	struct qed_public_vf_info *vf_info;
+	struct qed_vf_info *vf;
+	u8 *force_mac;
+	int i;
+
+	vf_info = qed_iov_get_public_vf_info(hwfn, vf_id, true);
+	vf = qed_iov_get_vf_info(hwfn, vf_id, true);
+
+	if (!vf_info || !vf)
+		return;
+
+	/* Force MAC converted to generic MAC in case of VF trust on */
+	if (vf_info->is_trusted_configured &&
+	    (vf->bulletin.p_virt->valid_bitmap & BIT(MAC_ADDR_FORCED))) {
+		force_mac = qed_iov_bulletin_get_forced_mac(hwfn, vf_id);
+
+		if (force_mac) {
+			/* Clear existing shadow copy of MAC to have a clean
+			 * slate.
+			 */
+			for (i = 0; i < QED_ETH_VF_NUM_MAC_FILTERS; i++) {
+				if (ether_addr_equal(vf->shadow_config.macs[i],
+						     vf_info->mac)) {
+					memset(vf->shadow_config.macs[i], 0,
+					       ETH_ALEN);
+					DP_VERBOSE(hwfn, QED_MSG_IOV,
+						   "Shadow MAC %pM removed for VF 0x%02x, VF trust mode is ON\n",
+						    vf_info->mac, vf_id);
+					break;
+				}
+			}
+
+			ether_addr_copy(vf_info->mac, force_mac);
+			memset(vf_info->forced_mac, 0, ETH_ALEN);
+			vf->bulletin.p_virt->valid_bitmap &=
+					~BIT(MAC_ADDR_FORCED);
+			qed_schedule_iov(hwfn, QED_IOV_WQ_BULLETIN_UPDATE_FLAG);
+		}
+	}
+
+	/* Update shadow copy with VF MAC when trust mode is turned off */
+	if (!vf_info->is_trusted_configured) {
+		u8 empty_mac[ETH_ALEN];
+
+		memset(empty_mac, 0, ETH_ALEN);
+		for (i = 0; i < QED_ETH_VF_NUM_MAC_FILTERS; i++) {
+			if (ether_addr_equal(vf->shadow_config.macs[i],
+					     empty_mac)) {
+				ether_addr_copy(vf->shadow_config.macs[i],
+						vf_info->mac);
+				DP_VERBOSE(hwfn, QED_MSG_IOV,
+					   "Shadow is updated with %pM for VF 0x%02x, VF trust mode is OFF\n",
+					    vf_info->mac, vf_id);
+				break;
+			}
+		}
+		/* Clear bulletin when trust mode is turned off,
+		 * to have a clean slate for next (normal) operations.
+		 */
+		qed_iov_bulletin_set_mac(hwfn, empty_mac, vf_id);
+		qed_schedule_iov(hwfn, QED_IOV_WQ_BULLETIN_UPDATE_FLAG);
+	}
+}
+
 static void qed_iov_handle_trust_change(struct qed_hwfn *hwfn)
 {
 	struct qed_sp_vport_update_params params;
@@ -4890,6 +5065,9 @@ static void qed_iov_handle_trust_change(struct qed_hwfn *hwfn)
 			continue;
 		vf_info->is_trusted_configured = vf_info->is_trusted_request;
 
+		/* Handle forced MAC mode */
+		qed_update_mac_for_vf_trust_change(hwfn, i);
+
 		/* Validate that the VF has a configured vport */
 		vf = qed_iov_get_vf_info(hwfn, i, true);
 		if (!vf->vport_instance)
diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c
index 6687e04d1558..8094f035b705 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_filter.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c
@@ -550,8 +550,7 @@ void qede_force_mac(void *dev, u8 *mac, bool forced)
 
 	__qede_lock(edev);
 
-	/* MAC hints take effect only if we haven't set one already */
-	if (is_valid_ether_addr(edev->ndev->dev_addr) && !forced) {
+	if (!is_valid_ether_addr(mac)) {
 		__qede_unlock(edev);
 		return;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 809c45a091d93e05c6e9b5d53bb3f1185273286b Mon Sep 17 00:00:00 2001
From: Shahed Shaikh <shahed.shaikh@cavium.com>
Date: Thu, 19 Apr 2018 05:50:12 -0700
Subject: qed* : Add new TLV to request PF to update MAC in bulletin board

There may be a need for VF driver to request PF to explicitly update its
bulletin with a MAC address.
e.g. When user assigns a MAC address to VF while VF is still down,
and PF's bulletin board contains different MAC address, in this case,
when VF's interface is brought up, it gets loaded with MAC address from
bulletin board which is not desirable.

To handle this corner case, we need a new TLV to request PF to update
its bulletin board with suggested MAC.

This request will be honored only for trusted VFs.

Signed-off-by: Shahed Shaikh <shahed.shaikh@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_l2.c       | 19 +++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_sriov.c    | 37 ++++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_vf.c       | 29 ++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_vf.h       | 21 +++++++++++++++
 drivers/net/ethernet/qlogic/qede/qede_filter.c |  4 +++
 include/linux/qed/qed_eth_if.h                 |  1 +
 6 files changed, 111 insertions(+)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index e874504e8b28..8b1b7e8ca56c 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -2850,6 +2850,24 @@ static int qed_fp_cqe_completion(struct qed_dev *dev,
 				      cqe);
 }
 
+static int qed_req_bulletin_update_mac(struct qed_dev *cdev, u8 *mac)
+{
+	int i, ret;
+
+	if (IS_PF(cdev))
+		return 0;
+
+	for_each_hwfn(cdev, i) {
+		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
+
+		ret = qed_vf_pf_bulletin_update_mac(p_hwfn, mac);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
 #ifdef CONFIG_QED_SRIOV
 extern const struct qed_iov_hv_ops qed_iov_ops_pass;
 #endif
@@ -2887,6 +2905,7 @@ static const struct qed_eth_ops qed_eth_ops_pass = {
 	.ntuple_filter_config = &qed_ntuple_arfs_filter_config,
 	.configure_arfs_searcher = &qed_configure_arfs_searcher,
 	.get_coalesce = &qed_get_coalesce,
+	.req_bulletin_update_mac = &qed_req_bulletin_update_mac,
 };
 
 const struct qed_eth_ops *qed_get_eth_ops(void)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index 77376fda8eae..f01bf52bc381 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -3820,6 +3820,40 @@ static void qed_iov_get_link(struct qed_hwfn *p_hwfn,
 		__qed_vf_get_link_caps(p_hwfn, p_caps, p_bulletin);
 }
 
+static int
+qed_iov_vf_pf_bulletin_update_mac(struct qed_hwfn *p_hwfn,
+				  struct qed_ptt *p_ptt,
+				  struct qed_vf_info *p_vf)
+{
+	struct qed_bulletin_content *p_bulletin = p_vf->bulletin.p_virt;
+	struct qed_iov_vf_mbx *mbx = &p_vf->vf_mbx;
+	struct vfpf_bulletin_update_mac_tlv *p_req;
+	u8 status = PFVF_STATUS_SUCCESS;
+	int rc = 0;
+
+	if (!p_vf->p_vf_info.is_trusted_configured) {
+		DP_VERBOSE(p_hwfn,
+			   QED_MSG_IOV,
+			   "Blocking bulletin update request from untrusted VF[%d]\n",
+			   p_vf->abs_vf_id);
+		status = PFVF_STATUS_NOT_SUPPORTED;
+		rc = -EINVAL;
+		goto send_status;
+	}
+
+	p_req = &mbx->req_virt->bulletin_update_mac;
+	ether_addr_copy(p_bulletin->mac, p_req->mac);
+	DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+		   "Updated bulletin of VF[%d] with requested MAC[%pM]\n",
+		   p_vf->abs_vf_id, p_req->mac);
+
+send_status:
+	qed_iov_prepare_resp(p_hwfn, p_ptt, p_vf,
+			     CHANNEL_TLV_BULLETIN_UPDATE_MAC,
+			     sizeof(struct pfvf_def_resp_tlv), status);
+	return rc;
+}
+
 static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn,
 				    struct qed_ptt *p_ptt, int vfid)
 {
@@ -3899,6 +3933,9 @@ static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn,
 		case CHANNEL_TLV_COALESCE_READ:
 			qed_iov_vf_pf_get_coalesce(p_hwfn, p_ptt, p_vf);
 			break;
+		case CHANNEL_TLV_BULLETIN_UPDATE_MAC:
+			qed_iov_vf_pf_bulletin_update_mac(p_hwfn, p_ptt, p_vf);
+			break;
 		}
 	} else if (qed_iov_tlv_supported(mbx->first_tlv.tl.type)) {
 		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c
index 91b5e9f02a62..2d7fcd6a0777 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c
@@ -1374,6 +1374,35 @@ exit:
 	return rc;
 }
 
+int
+qed_vf_pf_bulletin_update_mac(struct qed_hwfn *p_hwfn,
+			      u8 *p_mac)
+{
+	struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info;
+	struct vfpf_bulletin_update_mac_tlv *p_req;
+	struct pfvf_def_resp_tlv *p_resp;
+	int rc;
+
+	if (!p_mac)
+		return -EINVAL;
+
+	/* clear mailbox and prep header tlv */
+	p_req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_BULLETIN_UPDATE_MAC,
+			       sizeof(*p_req));
+	ether_addr_copy(p_req->mac, p_mac);
+	DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+		   "Requesting bulletin update for MAC[%pM]\n", p_mac);
+
+	/* add list termination tlv */
+	qed_add_tlv(p_hwfn, &p_iov->offset, CHANNEL_TLV_LIST_END,
+		    sizeof(struct channel_list_end_tlv));
+
+	p_resp = &p_iov->pf2vf_reply->default_resp;
+	rc = qed_send_msg2pf(p_hwfn, &p_resp->hdr.status, sizeof(*p_resp));
+	qed_vf_pf_req_end(p_hwfn, rc);
+	return rc;
+}
+
 int
 qed_vf_pf_set_coalesce(struct qed_hwfn *p_hwfn,
 		       u16 rx_coal, u16 tx_coal, struct qed_queue_cid *p_cid)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.h b/drivers/net/ethernet/qlogic/qed/qed_vf.h
index 97d44dfb38ca..4f05d5eb3cf5 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.h
@@ -518,6 +518,12 @@ struct pfvf_read_coal_resp_tlv {
 	u8 padding[6];
 };
 
+struct vfpf_bulletin_update_mac_tlv {
+	struct vfpf_first_tlv first_tlv;
+	u8 mac[ETH_ALEN];
+	u8 padding[2];
+};
+
 union vfpf_tlvs {
 	struct vfpf_first_tlv first_tlv;
 	struct vfpf_acquire_tlv acquire;
@@ -532,6 +538,7 @@ union vfpf_tlvs {
 	struct vfpf_update_tunn_param_tlv tunn_param_update;
 	struct vfpf_update_coalesce update_coalesce;
 	struct vfpf_read_coal_req_tlv read_coal_req;
+	struct vfpf_bulletin_update_mac_tlv bulletin_update_mac;
 	struct tlv_buffer_size tlv_buf_size;
 };
 
@@ -650,6 +657,7 @@ enum {
 	CHANNEL_TLV_COALESCE_UPDATE,
 	CHANNEL_TLV_QID,
 	CHANNEL_TLV_COALESCE_READ,
+	CHANNEL_TLV_BULLETIN_UPDATE_MAC,
 	CHANNEL_TLV_MAX,
 
 	/* Required for iterating over vport-update tlvs.
@@ -1042,6 +1050,13 @@ int qed_vf_pf_tunnel_param_update(struct qed_hwfn *p_hwfn,
 				  struct qed_tunnel_info *p_tunn);
 
 u32 qed_vf_hw_bar_size(struct qed_hwfn *p_hwfn, enum BAR_ID bar_id);
+/**
+ * @brief - Ask PF to update the MAC address in it's bulletin board
+ *
+ * @param p_mac - mac address to be updated in bulletin board
+ */
+int qed_vf_pf_bulletin_update_mac(struct qed_hwfn *p_hwfn, u8 *p_mac);
+
 #else
 static inline void qed_vf_get_link_params(struct qed_hwfn *p_hwfn,
 					  struct qed_mcp_link_params *params)
@@ -1228,6 +1243,12 @@ static inline int qed_vf_pf_tunnel_param_update(struct qed_hwfn *p_hwfn,
 	return -EINVAL;
 }
 
+static inline int qed_vf_pf_bulletin_update_mac(struct qed_hwfn *p_hwfn,
+						u8 *p_mac)
+{
+	return -EINVAL;
+}
+
 static inline u32
 qed_vf_hw_bar_size(struct qed_hwfn  *p_hwfn,
 		   enum BAR_ID bar_id)
diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c
index 8094f035b705..43569b1839be 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_filter.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c
@@ -1160,6 +1160,10 @@ int qede_set_mac_addr(struct net_device *ndev, void *p)
 	if (edev->state != QEDE_STATE_OPEN) {
 		DP_VERBOSE(edev, NETIF_MSG_IFDOWN,
 			   "The device is currently down\n");
+		/* Ask PF to explicitly update a copy in bulletin board */
+		if (IS_VF(edev) && edev->ops->req_bulletin_update_mac)
+			edev->ops->req_bulletin_update_mac(edev->cdev,
+							   ndev->dev_addr);
 		goto out;
 	}
 
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index 147d08ccf813..7f9756fe9180 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -352,6 +352,7 @@ struct qed_eth_ops {
 	int (*configure_arfs_searcher)(struct qed_dev *cdev,
 				       enum qed_filter_config_mode mode);
 	int (*get_coalesce)(struct qed_dev *cdev, u16 *coal, void *handle);
+	int (*req_bulletin_update_mac)(struct qed_dev *cdev, u8 *mac);
 };
 
 const struct qed_eth_ops *qed_get_eth_ops(void);
-- 
cgit v1.2.3-59-g8ed1b


From 263243d6c2573cd761ef7ab773d3c467db0122f6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 19 Apr 2018 09:14:53 -0700
Subject: net/ipv6: Fix ip6_convert_metrics() bug

If ip6_convert_metrics() fails to allocate memory, it should not
overwrite rt->fib6_metrics or we risk a crash later as syzbot found.

BUG: KASAN: null-ptr-deref in atomic_read include/asm-generic/atomic-instrumented.h:21 [inline]
BUG: KASAN: null-ptr-deref in refcount_sub_and_test+0x92/0x330 lib/refcount.c:179
Read of size 4 at addr 0000000000000044 by task syzkaller832429/4487

CPU: 1 PID: 4487 Comm: syzkaller832429 Not tainted 4.16.0+ #6
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1b9/0x294 lib/dump_stack.c:113
 kasan_report_error mm/kasan/report.c:352 [inline]
 kasan_report.cold.7+0x6d/0x2fe mm/kasan/report.c:412
 check_memory_region_inline mm/kasan/kasan.c:260 [inline]
 check_memory_region+0x13e/0x1b0 mm/kasan/kasan.c:267
 kasan_check_read+0x11/0x20 mm/kasan/kasan.c:272
 atomic_read include/asm-generic/atomic-instrumented.h:21 [inline]
 refcount_sub_and_test+0x92/0x330 lib/refcount.c:179
 refcount_dec_and_test+0x1a/0x20 lib/refcount.c:212
 fib6_info_destroy+0x2d0/0x3c0 net/ipv6/ip6_fib.c:206
 fib6_info_release include/net/ip6_fib.h:304 [inline]
 ip6_route_info_create+0x677/0x3240 net/ipv6/route.c:3020
 ip6_route_add+0x23/0xb0 net/ipv6/route.c:3030
 inet6_rtm_newroute+0x142/0x160 net/ipv6/route.c:4406
 rtnetlink_rcv_msg+0x466/0xc10 net/core/rtnetlink.c:4648
 netlink_rcv_skb+0x172/0x440 net/netlink/af_netlink.c:2448
 rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:4666
 netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline]
 netlink_unicast+0x58b/0x740 net/netlink/af_netlink.c:1336
 netlink_sendmsg+0x9f0/0xfa0 net/netlink/af_netlink.c:1901
 sock_sendmsg_nosec net/socket.c:629 [inline]
 sock_sendmsg+0xd5/0x120 net/socket.c:639
 ___sys_sendmsg+0x805/0x940 net/socket.c:2117
 __sys_sendmsg+0x115/0x270 net/socket.c:2155
 SYSC_sendmsg net/socket.c:2164 [inline]
 SyS_sendmsg+0x29/0x30 net/socket.c:2162
 do_syscall_64+0x29e/0x9d0 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x42/0xb7

Fixes: d4ead6b34b67 ("net/ipv6: move metrics from dst to rt6_info")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: David Ahern <dsa@cumulusnetworks.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Acked-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 08cae7323776..884d9cee790f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2607,21 +2607,19 @@ out:
 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
 			       struct fib6_config *cfg)
 {
-	int err = 0;
+	struct dst_metrics *p;
 
-	if (cfg->fc_mx) {
-		rt->fib6_metrics = kzalloc(sizeof(*rt->fib6_metrics),
-					   GFP_KERNEL);
-		if (unlikely(!rt->fib6_metrics))
-			return -ENOMEM;
+	if (!cfg->fc_mx)
+		return 0;
 
-		refcount_set(&rt->fib6_metrics->refcnt, 1);
+	p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
+	if (unlikely(!p))
+		return -ENOMEM;
 
-		err = ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len,
-					 rt->fib6_metrics->metrics);
-	}
+	refcount_set(&p->refcnt, 1);
+	rt->fib6_metrics = p;
 
-	return err;
+	return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
 }
 
 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
-- 
cgit v1.2.3-59-g8ed1b


From a83762d97980f13b1ec7f2514b28666c0dac3af6 Mon Sep 17 00:00:00 2001
From: Bert Kenward <bkenward@solarflare.com>
Date: Thu, 19 Apr 2018 17:37:25 +0100
Subject: sfc: set and clear interrupt affinity hints

Use cpumask_local_spread to provide interrupt affinity hints
for each queue. This will spread interrupts across NUMA local
CPUs first, extending to remote nodes if needed.

Signed-off-by: Bert Kenward <bkenward@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/sfc/efx.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 692dd729ee2a..c30b9e26f131 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -1551,6 +1551,38 @@ static int efx_probe_interrupts(struct efx_nic *efx)
 	return 0;
 }
 
+#if defined(CONFIG_SMP)
+static void efx_set_interrupt_affinity(struct efx_nic *efx)
+{
+	struct efx_channel *channel;
+	unsigned int cpu;
+
+	efx_for_each_channel(channel, efx) {
+		cpu = cpumask_local_spread(channel->channel,
+					   pcibus_to_node(efx->pci_dev->bus));
+		irq_set_affinity_hint(channel->irq, cpumask_of(cpu));
+	}
+}
+
+static void efx_clear_interrupt_affinity(struct efx_nic *efx)
+{
+	struct efx_channel *channel;
+
+	efx_for_each_channel(channel, efx)
+		irq_set_affinity_hint(channel->irq, NULL);
+}
+#else
+static void
+efx_set_interrupt_affinity(struct efx_nic *efx __attribute__ ((unused)))
+{
+}
+
+static void
+efx_clear_interrupt_affinity(struct efx_nic *efx __attribute__ ((unused)))
+{
+}
+#endif /* CONFIG_SMP */
+
 static int efx_soft_enable_interrupts(struct efx_nic *efx)
 {
 	struct efx_channel *channel, *end_channel;
@@ -3165,6 +3197,7 @@ static void efx_pci_remove_main(struct efx_nic *efx)
 	cancel_work_sync(&efx->reset_work);
 
 	efx_disable_interrupts(efx);
+	efx_clear_interrupt_affinity(efx);
 	efx_nic_fini_interrupt(efx);
 	efx_fini_port(efx);
 	efx->type->fini(efx);
@@ -3314,6 +3347,8 @@ static int efx_pci_probe_main(struct efx_nic *efx)
 	rc = efx_nic_init_interrupt(efx);
 	if (rc)
 		goto fail5;
+
+	efx_set_interrupt_affinity(efx);
 	rc = efx_enable_interrupts(efx);
 	if (rc)
 		goto fail6;
@@ -3321,6 +3356,7 @@ static int efx_pci_probe_main(struct efx_nic *efx)
 	return 0;
 
  fail6:
+	efx_clear_interrupt_affinity(efx);
 	efx_nic_fini_interrupt(efx);
  fail5:
 	efx_fini_port(efx);
-- 
cgit v1.2.3-59-g8ed1b


From 760db29bdc97b73ff60b091315ad787b1deb5cf5 Mon Sep 17 00:00:00 2001
From: Phil Elwell <phil@raspberrypi.org>
Date: Thu, 19 Apr 2018 17:59:38 +0100
Subject: lan78xx: Read MAC address from DT if present

There is a standard mechanism for locating and using a MAC address from
the Device Tree. Use this facility in the lan78xx driver to support
applications without programmed EEPROM or OTP. At the same time,
regularise the handling of the different address sources.

Signed-off-by: Phil Elwell <phil@raspberrypi.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/lan78xx.c | 42 ++++++++++++++++++++----------------------
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index 0867f7275852..a823f010de30 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -37,6 +37,7 @@
 #include <linux/irqchip/chained_irq.h>
 #include <linux/microchipphy.h>
 #include <linux/phy.h>
+#include <linux/of_net.h>
 #include "lan78xx.h"
 
 #define DRIVER_AUTHOR	"WOOJUNG HUH <woojung.huh@microchip.com>"
@@ -1652,34 +1653,31 @@ static void lan78xx_init_mac_address(struct lan78xx_net *dev)
 	addr[5] = (addr_hi >> 8) & 0xFF;
 
 	if (!is_valid_ether_addr(addr)) {
-		/* reading mac address from EEPROM or OTP */
-		if ((lan78xx_read_eeprom(dev, EEPROM_MAC_OFFSET, ETH_ALEN,
-					 addr) == 0) ||
-		    (lan78xx_read_otp(dev, EEPROM_MAC_OFFSET, ETH_ALEN,
-				      addr) == 0)) {
-			if (is_valid_ether_addr(addr)) {
-				/* eeprom values are valid so use them */
-				netif_dbg(dev, ifup, dev->net,
-					  "MAC address read from EEPROM");
-			} else {
-				/* generate random MAC */
-				random_ether_addr(addr);
-				netif_dbg(dev, ifup, dev->net,
-					  "MAC address set to random addr");
-			}
-
-			addr_lo = addr[0] | (addr[1] << 8) |
-				  (addr[2] << 16) | (addr[3] << 24);
-			addr_hi = addr[4] | (addr[5] << 8);
-
-			ret = lan78xx_write_reg(dev, RX_ADDRL, addr_lo);
-			ret = lan78xx_write_reg(dev, RX_ADDRH, addr_hi);
+		if (!eth_platform_get_mac_address(&dev->udev->dev, addr)) {
+			/* valid address present in Device Tree */
+			netif_dbg(dev, ifup, dev->net,
+				  "MAC address read from Device Tree");
+		} else if (((lan78xx_read_eeprom(dev, EEPROM_MAC_OFFSET,
+						 ETH_ALEN, addr) == 0) ||
+			    (lan78xx_read_otp(dev, EEPROM_MAC_OFFSET,
+					      ETH_ALEN, addr) == 0)) &&
+			   is_valid_ether_addr(addr)) {
+			/* eeprom values are valid so use them */
+			netif_dbg(dev, ifup, dev->net,
+				  "MAC address read from EEPROM");
 		} else {
 			/* generate random MAC */
 			random_ether_addr(addr);
 			netif_dbg(dev, ifup, dev->net,
 				  "MAC address set to random addr");
 		}
+
+		addr_lo = addr[0] | (addr[1] << 8) |
+			  (addr[2] << 16) | (addr[3] << 24);
+		addr_hi = addr[4] | (addr[5] << 8);
+
+		ret = lan78xx_write_reg(dev, RX_ADDRL, addr_lo);
+		ret = lan78xx_write_reg(dev, RX_ADDRH, addr_hi);
 	}
 
 	ret = lan78xx_write_reg(dev, MAF_LO(0), addr_lo);
-- 
cgit v1.2.3-59-g8ed1b


From 1827b0678863bc97a1653fdf5308762b2aefcd56 Mon Sep 17 00:00:00 2001
From: Phil Elwell <phil@raspberrypi.org>
Date: Thu, 19 Apr 2018 17:59:39 +0100
Subject: lan78xx: Read LED states from Device Tree

Add support for DT property "microchip,led-modes", a vector of zero
to four cells (u32s) in the range 0-15, each of which sets the mode
for one of the LEDs. Some possible values are:

    0=link/activity          1=link1000/activity
    2=link100/activity       3=link10/activity
    4=link100/1000/activity  5=link10/1000/activity
    6=link10/100/activity    14=off    15=on

These values are given symbolic constants in a dt-bindings header.

Also use the presence of the DT property to indicate that the
LEDs should be enabled - necessary in the event that no valid OTP
or EEPROM is available.

Signed-off-by: Phil Elwell <phil@raspberrypi.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS                                 |  1 +
 drivers/net/phy/microchip.c                 | 25 ++++++++++++++++++++++
 drivers/net/usb/lan78xx.c                   | 32 ++++++++++++++++++++++++++++-
 include/dt-bindings/net/microchip-lan78xx.h | 21 +++++++++++++++++++
 include/linux/microchipphy.h                |  3 +++
 5 files changed, 81 insertions(+), 1 deletion(-)
 create mode 100644 include/dt-bindings/net/microchip-lan78xx.h

diff --git a/MAINTAINERS b/MAINTAINERS
index a7321687cae9..c952d3076a65 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14572,6 +14572,7 @@ M:	Microchip Linux Driver Support <UNGLinuxDriver@microchip.com>
 L:	netdev@vger.kernel.org
 S:	Maintained
 F:	drivers/net/usb/lan78xx.*
+F:	include/dt-bindings/net/microchip-lan78xx.h
 
 USB MASS STORAGE DRIVER
 M:	Alan Stern <stern@rowland.harvard.edu>
diff --git a/drivers/net/phy/microchip.c b/drivers/net/phy/microchip.c
index 0f293ef28935..ef5e160fe9dc 100644
--- a/drivers/net/phy/microchip.c
+++ b/drivers/net/phy/microchip.c
@@ -20,6 +20,8 @@
 #include <linux/ethtool.h>
 #include <linux/phy.h>
 #include <linux/microchipphy.h>
+#include <linux/of.h>
+#include <dt-bindings/net/microchip-lan78xx.h>
 
 #define DRIVER_AUTHOR	"WOOJUNG HUH <woojung.huh@microchip.com>"
 #define DRIVER_DESC	"Microchip LAN88XX PHY driver"
@@ -70,6 +72,8 @@ static int lan88xx_probe(struct phy_device *phydev)
 {
 	struct device *dev = &phydev->mdio.dev;
 	struct lan88xx_priv *priv;
+	u32 led_modes[4];
+	int len;
 
 	priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
 	if (!priv)
@@ -77,6 +81,27 @@ static int lan88xx_probe(struct phy_device *phydev)
 
 	priv->wolopts = 0;
 
+	len = of_property_read_variable_u32_array(dev->of_node,
+						  "microchip,led-modes",
+						  led_modes,
+						  0,
+						  ARRAY_SIZE(led_modes));
+	if (len >= 0) {
+		u32 reg = 0;
+		int i;
+
+		for (i = 0; i < len; i++) {
+			if (led_modes[i] > 15)
+				return -EINVAL;
+			reg |= led_modes[i] << (i * 4);
+		}
+		for (; i < ARRAY_SIZE(led_modes); i++)
+			reg |= LAN78XX_FORCE_LED_OFF << (i * 4);
+		(void)phy_write(phydev, LAN78XX_PHY_LED_MODE_SELECT, reg);
+	} else if (len == -EOVERFLOW) {
+		return -EINVAL;
+	}
+
 	/* these values can be used to identify internal PHY */
 	priv->chip_id = phy_read_mmd(phydev, 3, LAN88XX_MMD3_CHIP_ID);
 	priv->chip_rev = phy_read_mmd(phydev, 3, LAN88XX_MMD3_CHIP_REV);
diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index a823f010de30..6b03b973083e 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -37,6 +37,7 @@
 #include <linux/irqchip/chained_irq.h>
 #include <linux/microchipphy.h>
 #include <linux/phy.h>
+#include <linux/of_mdio.h>
 #include <linux/of_net.h>
 #include "lan78xx.h"
 
@@ -1760,6 +1761,7 @@ done:
 
 static int lan78xx_mdio_init(struct lan78xx_net *dev)
 {
+	struct device_node *node;
 	int ret;
 
 	dev->mdiobus = mdiobus_alloc();
@@ -1788,7 +1790,13 @@ static int lan78xx_mdio_init(struct lan78xx_net *dev)
 		break;
 	}
 
-	ret = mdiobus_register(dev->mdiobus);
+	node = of_get_child_by_name(dev->udev->dev.of_node, "mdio");
+	if (node) {
+		ret = of_mdiobus_register(dev->mdiobus, node);
+		of_node_put(node);
+	} else {
+		ret = mdiobus_register(dev->mdiobus);
+	}
 	if (ret) {
 		netdev_err(dev->net, "can't register MDIO bus\n");
 		goto exit1;
@@ -2077,6 +2085,28 @@ static int lan78xx_phy_init(struct lan78xx_net *dev)
 	mii_adv = (u32)mii_advertise_flowctrl(dev->fc_request_control);
 	phydev->advertising |= mii_adv_to_ethtool_adv_t(mii_adv);
 
+	if (phydev->mdio.dev.of_node) {
+		u32 reg;
+		int len;
+
+		len = of_property_count_elems_of_size(phydev->mdio.dev.of_node,
+						      "microchip,led-modes",
+						      sizeof(u32));
+		if (len >= 0) {
+			/* Ensure the appropriate LEDs are enabled */
+			lan78xx_read_reg(dev, HW_CFG, &reg);
+			reg &= ~(HW_CFG_LED0_EN_ |
+				 HW_CFG_LED1_EN_ |
+				 HW_CFG_LED2_EN_ |
+				 HW_CFG_LED3_EN_);
+			reg |= (len > 0) * HW_CFG_LED0_EN_ |
+				(len > 1) * HW_CFG_LED1_EN_ |
+				(len > 2) * HW_CFG_LED2_EN_ |
+				(len > 3) * HW_CFG_LED3_EN_;
+			lan78xx_write_reg(dev, HW_CFG, reg);
+		}
+	}
+
 	genphy_config_aneg(phydev);
 
 	dev->fc_autoneg = phydev->autoneg;
diff --git a/include/dt-bindings/net/microchip-lan78xx.h b/include/dt-bindings/net/microchip-lan78xx.h
new file mode 100644
index 000000000000..0742ff075307
--- /dev/null
+++ b/include/dt-bindings/net/microchip-lan78xx.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _DT_BINDINGS_MICROCHIP_LAN78XX_H
+#define _DT_BINDINGS_MICROCHIP_LAN78XX_H
+
+/* LED modes for LAN7800/LAN7850 embedded PHY */
+
+#define LAN78XX_LINK_ACTIVITY           0
+#define LAN78XX_LINK_1000_ACTIVITY      1
+#define LAN78XX_LINK_100_ACTIVITY       2
+#define LAN78XX_LINK_10_ACTIVITY        3
+#define LAN78XX_LINK_100_1000_ACTIVITY  4
+#define LAN78XX_LINK_10_1000_ACTIVITY   5
+#define LAN78XX_LINK_10_100_ACTIVITY    6
+#define LAN78XX_DUPLEX_COLLISION        8
+#define LAN78XX_COLLISION               9
+#define LAN78XX_ACTIVITY                10
+#define LAN78XX_AUTONEG_FAULT           12
+#define LAN78XX_FORCE_LED_OFF           14
+#define LAN78XX_FORCE_LED_ON            15
+
+#endif
diff --git a/include/linux/microchipphy.h b/include/linux/microchipphy.h
index eb492d47f717..8e4015e7f8d3 100644
--- a/include/linux/microchipphy.h
+++ b/include/linux/microchipphy.h
@@ -70,4 +70,7 @@
 #define	LAN88XX_MMD3_CHIP_ID			(32877)
 #define	LAN88XX_MMD3_CHIP_REV			(32878)
 
+/* Registers specific to the LAN7800/LAN7850 embedded phy */
+#define LAN78XX_PHY_LED_MODE_SELECT		(0x1D)
+
 #endif /* _MICROCHIPPHY_H */
-- 
cgit v1.2.3-59-g8ed1b


From 01d26589dee4b23376642fba333539605c52d324 Mon Sep 17 00:00:00 2001
From: Phil Elwell <phil@raspberrypi.org>
Date: Thu, 19 Apr 2018 17:59:40 +0100
Subject: dt-bindings: Document the DT bindings for lan78xx

The Microchip LAN78XX family of devices are Ethernet controllers with
a USB interface. Despite being discoverable devices it can be useful to
be able to configure them from Device Tree, particularly in low-cost
applications without an EEPROM or programmed OTP.

Document the supported properties in a bindings file.

Signed-off-by: Phil Elwell <phil@raspberrypi.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../devicetree/bindings/net/microchip,lan78xx.txt  | 54 ++++++++++++++++++++++
 MAINTAINERS                                        |  1 +
 2 files changed, 55 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/microchip,lan78xx.txt

diff --git a/Documentation/devicetree/bindings/net/microchip,lan78xx.txt b/Documentation/devicetree/bindings/net/microchip,lan78xx.txt
new file mode 100644
index 000000000000..76786a0f6d3d
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/microchip,lan78xx.txt
@@ -0,0 +1,54 @@
+Microchip LAN78xx Gigabit Ethernet controller
+
+The LAN78XX devices are usually configured by programming their OTP or with
+an external EEPROM, but some platforms (e.g. Raspberry Pi 3 B+) have neither.
+The Device Tree properties, if present, override the OTP and EEPROM.
+
+Required properties:
+- compatible: Should be one of "usb424,7800", "usb424,7801" or "usb424,7850".
+
+Optional properties:
+- local-mac-address:   see ethernet.txt
+- mac-address:         see ethernet.txt
+
+Optional properties of the embedded PHY:
+- microchip,led-modes: a 0..4 element vector, with each element configuring
+  the operating mode of an LED. Omitted LEDs are turned off. Allowed values
+  are defined in "include/dt-bindings/net/microchip-lan78xx.h".
+
+Example:
+
+/* Based on the configuration for a Raspberry Pi 3 B+ */
+&usb {
+	usb-port@1 {
+		compatible = "usb424,2514";
+		reg = <1>;
+		#address-cells = <1>;
+		#size-cells = <0>;
+
+		usb-port@1 {
+			compatible = "usb424,2514";
+			reg = <1>;
+			#address-cells = <1>;
+			#size-cells = <0>;
+
+			ethernet: ethernet@1 {
+				compatible = "usb424,7800";
+				reg = <1>;
+				local-mac-address = [ 00 11 22 33 44 55 ];
+
+				mdio {
+					#address-cells = <0x1>;
+					#size-cells = <0x0>;
+					eth_phy: ethernet-phy@1 {
+						reg = <1>;
+						microchip,led-modes = <
+							LAN78XX_LINK_1000_ACTIVITY
+							LAN78XX_LINK_10_100_ACTIVITY
+						>;
+					};
+				};
+			};
+		};
+	};
+};
diff --git a/MAINTAINERS b/MAINTAINERS
index c952d3076a65..81465707d8a8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14571,6 +14571,7 @@ M:	Woojung Huh <woojung.huh@microchip.com>
 M:	Microchip Linux Driver Support <UNGLinuxDriver@microchip.com>
 L:	netdev@vger.kernel.org
 S:	Maintained
+F:	Documentation/devicetree/bindings/net/microchip,lan78xx.txt
 F:	drivers/net/usb/lan78xx.*
 F:	include/dt-bindings/net/microchip-lan78xx.h
 
-- 
cgit v1.2.3-59-g8ed1b


From 496218656f9857d801512efdec1d609ebbe8a83b Mon Sep 17 00:00:00 2001
From: Raghuram Chary J <raghuramchary.jallipalli@microchip.com>
Date: Fri, 20 Apr 2018 11:43:50 +0530
Subject: lan78xx: Add support to dump lan78xx registers

In order to dump lan78xx family registers using ethtool, add
support at lan78xx driver level.

Signed-off-by: Raghuram Chary J <raghuramchary.jallipalli@microchip.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/lan78xx.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index 6b03b973083e..c59f8afd0d73 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -280,6 +280,30 @@ struct lan78xx_statstage64 {
 	u64 eee_tx_lpi_time;
 };
 
+static u32 lan78xx_regs[] = {
+	ID_REV,
+	INT_STS,
+	HW_CFG,
+	PMT_CTL,
+	E2P_CMD,
+	E2P_DATA,
+	USB_STATUS,
+	VLAN_TYPE,
+	MAC_CR,
+	MAC_RX,
+	MAC_TX,
+	FLOW,
+	ERR_STS,
+	MII_ACC,
+	MII_DATA,
+	EEE_TX_LPI_REQ_DLY,
+	EEE_TW_TX_SYS,
+	EEE_TX_LPI_REM_DLY,
+	WUCSR
+};
+
+#define PHY_REG_SIZE (32 * sizeof(u32))
+
 struct lan78xx_net;
 
 struct lan78xx_priv {
@@ -1607,6 +1631,34 @@ exit:
 	return ret;
 }
 
+static int lan78xx_get_regs_len(struct net_device *netdev)
+{
+	if (!netdev->phydev)
+		return (sizeof(lan78xx_regs));
+	else
+		return (sizeof(lan78xx_regs) + PHY_REG_SIZE);
+}
+
+static void
+lan78xx_get_regs(struct net_device *netdev, struct ethtool_regs *regs,
+		 void *buf)
+{
+	u32 *data = buf;
+	int i, j;
+	struct lan78xx_net *dev = netdev_priv(netdev);
+
+	/* Read Device/MAC registers */
+	for (i = 0; i < (sizeof(lan78xx_regs) / sizeof(u32)); i++)
+		lan78xx_read_reg(dev, lan78xx_regs[i], &data[i]);
+
+	if (!netdev->phydev)
+		return;
+
+	/* Read PHY registers */
+	for (j = 0; j < 32; i++, j++)
+		data[i] = phy_read(netdev->phydev, j);
+}
+
 static const struct ethtool_ops lan78xx_ethtool_ops = {
 	.get_link	= lan78xx_get_link,
 	.nway_reset	= phy_ethtool_nway_reset,
@@ -1627,6 +1679,8 @@ static const struct ethtool_ops lan78xx_ethtool_ops = {
 	.set_pauseparam	= lan78xx_set_pause,
 	.get_link_ksettings = lan78xx_get_link_ksettings,
 	.set_link_ksettings = lan78xx_set_link_ksettings,
+	.get_regs_len	= lan78xx_get_regs_len,
+	.get_regs	= lan78xx_get_regs,
 };
 
 static int lan78xx_ioctl(struct net_device *netdev, struct ifreq *rq, int cmd)
-- 
cgit v1.2.3-59-g8ed1b


From cf1a1e07fc8bb29947ad3c9568d73aee3f851431 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 20 Apr 2018 13:18:16 +0200
Subject: tun: do not compute the rxhash, if not needed

Currently, the tun driver, in absence of an eBPF steering program,
always compute the rxhash in its rx path, even when such value
is later unused due to additional checks (

This changeset moves the all the related checks just before the
__skb_get_hash_symmetric(), so that the latter is no more computed
when unneeded.

Also replace an unneeded RCU section with rcu_access_pointer().

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 1e58be152d5c..091ace726763 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -525,11 +525,6 @@ static void tun_flow_update(struct tun_struct *tun, u32 rxhash,
 
 	rcu_read_lock();
 
-	/* We may get a very small possibility of OOO during switching, not
-	 * worth to optimize.*/
-	if (tun->numqueues == 1 || tfile->detached)
-		goto unlock;
-
 	e = tun_flow_find(head, rxhash);
 	if (likely(e)) {
 		/* TODO: keep queueing to old queue until it's empty? */
@@ -548,7 +543,6 @@ static void tun_flow_update(struct tun_struct *tun, u32 rxhash,
 		spin_unlock_bh(&tun->lock);
 	}
 
-unlock:
 	rcu_read_unlock();
 }
 
@@ -1937,10 +1931,13 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		rcu_read_unlock();
 	}
 
-	rcu_read_lock();
-	if (!rcu_dereference(tun->steering_prog))
+	/* Compute the costly rx hash only if needed for flow updates.
+	 * We may get a very small possibility of OOO during switching, not
+	 * worth to optimize.
+	 */
+	if (!rcu_access_pointer(tun->steering_prog) && tun->numqueues > 1 &&
+	    !tfile->detached)
 		rxhash = __skb_get_hash_symmetric(skb);
-	rcu_read_unlock();
 
 	if (frags) {
 		/* Exercise flow dissector code path. */
-- 
cgit v1.2.3-59-g8ed1b


From 07cb9623ee7d15cc9968969ac247edae6972fb8f Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Mon, 26 Feb 2018 10:15:10 +0100
Subject: ipv6: make ip6_dst_mtu_forward inline

Just like ip_dst_mtu_maybe_forward(), to avoid a dependency with ipv6.ko.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/ip6_route.h | 21 +++++++++++++++++++++
 include/net/ipv6.h      |  2 --
 net/ipv6/ip6_output.c   | 22 ----------------------
 3 files changed, 21 insertions(+), 24 deletions(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index d5fb1e4ae7ac..376928c26d2d 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -279,6 +279,27 @@ static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *
 	       !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate);
 }
 
+static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
+{
+	struct inet6_dev *idev;
+	unsigned int mtu;
+
+	if (dst_metric_locked(dst, RTAX_MTU)) {
+		mtu = dst_metric_raw(dst, RTAX_MTU);
+		if (mtu)
+			return mtu;
+	}
+
+	mtu = IPV6_MIN_MTU;
+	rcu_read_lock();
+	idev = __in6_dev_get(dst->dev);
+	if (idev)
+		mtu = idev->cnf.mtu6;
+	rcu_read_unlock();
+
+	return mtu;
+}
+
 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
 				   struct net_device *dev, struct sk_buff *skb,
 				   const void *daddr);
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 68b167d98879..765441867cfa 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -958,8 +958,6 @@ static inline struct sk_buff *ip6_finish_skb(struct sock *sk)
 			      &inet6_sk(sk)->cork);
 }
 
-unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst);
-
 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
 		   struct flowi6 *fl6);
 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 3db47986ef38..cec49e137dbb 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -383,28 +383,6 @@ static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 	return dst_output(net, sk, skb);
 }
 
-unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
-{
-	unsigned int mtu;
-	struct inet6_dev *idev;
-
-	if (dst_metric_locked(dst, RTAX_MTU)) {
-		mtu = dst_metric_raw(dst, RTAX_MTU);
-		if (mtu)
-			return mtu;
-	}
-
-	mtu = IPV6_MIN_MTU;
-	rcu_read_lock();
-	idev = __in6_dev_get(dst->dev);
-	if (idev)
-		mtu = idev->cnf.mtu6;
-	rcu_read_unlock();
-
-	return mtu;
-}
-EXPORT_SYMBOL_GPL(ip6_dst_mtu_forward);
-
 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 {
 	if (skb->len <= mtu)
-- 
cgit v1.2.3-59-g8ed1b


From 4f3780c004ef608477a534558eb16f46c5a1c534 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Mon, 26 Feb 2018 10:15:11 +0100
Subject: netfilter: nf_flow_table: cache mtu in struct flow_offload_tuple

Reduces the number of cache lines touched in the offload forwarding
path. This is safe because PMTU limits are bypassed for the forwarding
path (see commit f87c10a8aa1e for more details).

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h   |  2 ++
 net/ipv4/netfilter/nf_flow_table_ipv4.c | 17 +++--------------
 net/ipv6/netfilter/nf_flow_table_ipv6.c | 17 +++--------------
 net/netfilter/nf_flow_table.c           |  8 ++++++--
 4 files changed, 14 insertions(+), 30 deletions(-)

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index 09ba67598991..76ee5c81b752 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -55,6 +55,8 @@ struct flow_offload_tuple {
 
 	int				oifidx;
 
+	u16				mtu;
+
 	struct dst_entry		*dst_cache;
 };
 
diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c
index 0cd46bffa469..461b1815e633 100644
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -178,7 +178,7 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
 }
 
 /* Based on ip_exceeds_mtu(). */
-static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
 {
 	if (skb->len <= mtu)
 		return false;
@@ -192,17 +192,6 @@ static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
 	return true;
 }
 
-static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rtable *rt)
-{
-	u32 mtu;
-
-	mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
-	if (__nf_flow_exceeds_mtu(skb, mtu))
-		return true;
-
-	return false;
-}
-
 unsigned int
 nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 			const struct nf_hook_state *state)
@@ -233,9 +222,9 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 
 	dir = tuplehash->tuple.dir;
 	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
-
 	rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
-	if (unlikely(nf_flow_exceeds_mtu(skb, rt)))
+
+	if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
 		return NF_ACCEPT;
 
 	if (skb_try_make_writable(skb, sizeof(*iph)))
diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c
index 207cb35569b1..0e6328490142 100644
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -173,7 +173,7 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
 }
 
 /* Based on ip_exceeds_mtu(). */
-static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
 {
 	if (skb->len <= mtu)
 		return false;
@@ -184,17 +184,6 @@ static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
 	return true;
 }
 
-static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rt6_info *rt)
-{
-	u32 mtu;
-
-	mtu = ip6_dst_mtu_forward(&rt->dst);
-	if (__nf_flow_exceeds_mtu(skb, mtu))
-		return true;
-
-	return false;
-}
-
 unsigned int
 nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 			  const struct nf_hook_state *state)
@@ -225,9 +214,9 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 
 	dir = tuplehash->tuple.dir;
 	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
-
 	rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache;
-	if (unlikely(nf_flow_exceeds_mtu(skb, rt)))
+
+	if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
 		return NF_ACCEPT;
 
 	if (skb_try_make_writable(skb, sizeof(*ip6h)))
diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table.c
index db0673a40b97..7403a0dfddf7 100644
--- a/net/netfilter/nf_flow_table.c
+++ b/net/netfilter/nf_flow_table.c
@@ -4,6 +4,8 @@
 #include <linux/netfilter.h>
 #include <linux/rhashtable.h>
 #include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/ip6_route.h>
 #include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nf_flow_table.h>
 #include <net/netfilter/nf_conntrack.h>
@@ -23,6 +25,7 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
 {
 	struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
 	struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple;
+	struct dst_entry *dst = route->tuple[dir].dst;
 
 	ft->dir = dir;
 
@@ -30,10 +33,12 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
 	case NFPROTO_IPV4:
 		ft->src_v4 = ctt->src.u3.in;
 		ft->dst_v4 = ctt->dst.u3.in;
+		ft->mtu = ip_dst_mtu_maybe_forward(dst, true);
 		break;
 	case NFPROTO_IPV6:
 		ft->src_v6 = ctt->src.u3.in6;
 		ft->dst_v6 = ctt->dst.u3.in6;
+		ft->mtu = ip6_dst_mtu_forward(dst);
 		break;
 	}
 
@@ -44,8 +49,7 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
 
 	ft->iifidx = route->tuple[dir].ifindex;
 	ft->oifidx = route->tuple[!dir].ifindex;
-
-	ft->dst_cache = route->tuple[dir].dst;
+	ft->dst_cache = dst;
 }
 
 struct flow_offload *
-- 
cgit v1.2.3-59-g8ed1b


From 1a999d899bc26abee26d70c10762c81b9d684d2b Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Mon, 26 Feb 2018 10:15:12 +0100
Subject: netfilter: nf_flow_table: rename nf_flow_table.c to
 nf_flow_table_core.c

Preparation for adding more code to the same module

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/Makefile             |   2 +
 net/netfilter/nf_flow_table.c      | 432 -------------------------------------
 net/netfilter/nf_flow_table_core.c | 432 +++++++++++++++++++++++++++++++++++++
 3 files changed, 434 insertions(+), 432 deletions(-)
 delete mode 100644 net/netfilter/nf_flow_table.c
 create mode 100644 net/netfilter/nf_flow_table_core.c

diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index fd32bd2c9521..700c5d51e405 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -111,6 +111,8 @@ obj-$(CONFIG_NFT_FWD_NETDEV)	+= nft_fwd_netdev.o
 
 # flow table infrastructure
 obj-$(CONFIG_NF_FLOW_TABLE)	+= nf_flow_table.o
+nf_flow_table-objs := nf_flow_table_core.o
+
 obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o
 
 # generic X tables 
diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table.c
deleted file mode 100644
index 7403a0dfddf7..000000000000
--- a/net/netfilter/nf_flow_table.c
+++ /dev/null
@@ -1,432 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/netfilter.h>
-#include <linux/rhashtable.h>
-#include <linux/netdevice.h>
-#include <net/ip.h>
-#include <net/ip6_route.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nf_flow_table.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_conntrack_tuple.h>
-
-struct flow_offload_entry {
-	struct flow_offload	flow;
-	struct nf_conn		*ct;
-	struct rcu_head		rcu_head;
-};
-
-static void
-flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
-		      struct nf_flow_route *route,
-		      enum flow_offload_tuple_dir dir)
-{
-	struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
-	struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple;
-	struct dst_entry *dst = route->tuple[dir].dst;
-
-	ft->dir = dir;
-
-	switch (ctt->src.l3num) {
-	case NFPROTO_IPV4:
-		ft->src_v4 = ctt->src.u3.in;
-		ft->dst_v4 = ctt->dst.u3.in;
-		ft->mtu = ip_dst_mtu_maybe_forward(dst, true);
-		break;
-	case NFPROTO_IPV6:
-		ft->src_v6 = ctt->src.u3.in6;
-		ft->dst_v6 = ctt->dst.u3.in6;
-		ft->mtu = ip6_dst_mtu_forward(dst);
-		break;
-	}
-
-	ft->l3proto = ctt->src.l3num;
-	ft->l4proto = ctt->dst.protonum;
-	ft->src_port = ctt->src.u.tcp.port;
-	ft->dst_port = ctt->dst.u.tcp.port;
-
-	ft->iifidx = route->tuple[dir].ifindex;
-	ft->oifidx = route->tuple[!dir].ifindex;
-	ft->dst_cache = dst;
-}
-
-struct flow_offload *
-flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
-{
-	struct flow_offload_entry *entry;
-	struct flow_offload *flow;
-
-	if (unlikely(nf_ct_is_dying(ct) ||
-	    !atomic_inc_not_zero(&ct->ct_general.use)))
-		return NULL;
-
-	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
-	if (!entry)
-		goto err_ct_refcnt;
-
-	flow = &entry->flow;
-
-	if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst))
-		goto err_dst_cache_original;
-
-	if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst))
-		goto err_dst_cache_reply;
-
-	entry->ct = ct;
-
-	flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_ORIGINAL);
-	flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_REPLY);
-
-	if (ct->status & IPS_SRC_NAT)
-		flow->flags |= FLOW_OFFLOAD_SNAT;
-	else if (ct->status & IPS_DST_NAT)
-		flow->flags |= FLOW_OFFLOAD_DNAT;
-
-	return flow;
-
-err_dst_cache_reply:
-	dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst);
-err_dst_cache_original:
-	kfree(entry);
-err_ct_refcnt:
-	nf_ct_put(ct);
-
-	return NULL;
-}
-EXPORT_SYMBOL_GPL(flow_offload_alloc);
-
-void flow_offload_free(struct flow_offload *flow)
-{
-	struct flow_offload_entry *e;
-
-	dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
-	dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
-	e = container_of(flow, struct flow_offload_entry, flow);
-	nf_ct_delete(e->ct, 0, 0);
-	nf_ct_put(e->ct);
-	kfree_rcu(e, rcu_head);
-}
-EXPORT_SYMBOL_GPL(flow_offload_free);
-
-void flow_offload_dead(struct flow_offload *flow)
-{
-	flow->flags |= FLOW_OFFLOAD_DYING;
-}
-EXPORT_SYMBOL_GPL(flow_offload_dead);
-
-int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
-{
-	flow->timeout = (u32)jiffies;
-
-	rhashtable_insert_fast(&flow_table->rhashtable,
-			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
-			       *flow_table->type->params);
-	rhashtable_insert_fast(&flow_table->rhashtable,
-			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
-			       *flow_table->type->params);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(flow_offload_add);
-
-static void flow_offload_del(struct nf_flowtable *flow_table,
-			     struct flow_offload *flow)
-{
-	rhashtable_remove_fast(&flow_table->rhashtable,
-			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
-			       *flow_table->type->params);
-	rhashtable_remove_fast(&flow_table->rhashtable,
-			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
-			       *flow_table->type->params);
-
-	flow_offload_free(flow);
-}
-
-struct flow_offload_tuple_rhash *
-flow_offload_lookup(struct nf_flowtable *flow_table,
-		    struct flow_offload_tuple *tuple)
-{
-	return rhashtable_lookup_fast(&flow_table->rhashtable, tuple,
-				      *flow_table->type->params);
-}
-EXPORT_SYMBOL_GPL(flow_offload_lookup);
-
-int nf_flow_table_iterate(struct nf_flowtable *flow_table,
-			  void (*iter)(struct flow_offload *flow, void *data),
-			  void *data)
-{
-	struct flow_offload_tuple_rhash *tuplehash;
-	struct rhashtable_iter hti;
-	struct flow_offload *flow;
-	int err;
-
-	err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL);
-	if (err)
-		return err;
-
-	rhashtable_walk_start(&hti);
-
-	while ((tuplehash = rhashtable_walk_next(&hti))) {
-		if (IS_ERR(tuplehash)) {
-			err = PTR_ERR(tuplehash);
-			if (err != -EAGAIN)
-				goto out;
-
-			continue;
-		}
-		if (tuplehash->tuple.dir)
-			continue;
-
-		flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
-
-		iter(flow, data);
-	}
-out:
-	rhashtable_walk_stop(&hti);
-	rhashtable_walk_exit(&hti);
-
-	return err;
-}
-EXPORT_SYMBOL_GPL(nf_flow_table_iterate);
-
-static inline bool nf_flow_has_expired(const struct flow_offload *flow)
-{
-	return (__s32)(flow->timeout - (u32)jiffies) <= 0;
-}
-
-static inline bool nf_flow_is_dying(const struct flow_offload *flow)
-{
-	return flow->flags & FLOW_OFFLOAD_DYING;
-}
-
-static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table)
-{
-	struct flow_offload_tuple_rhash *tuplehash;
-	struct rhashtable_iter hti;
-	struct flow_offload *flow;
-	int err;
-
-	err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL);
-	if (err)
-		return 0;
-
-	rhashtable_walk_start(&hti);
-
-	while ((tuplehash = rhashtable_walk_next(&hti))) {
-		if (IS_ERR(tuplehash)) {
-			err = PTR_ERR(tuplehash);
-			if (err != -EAGAIN)
-				goto out;
-
-			continue;
-		}
-		if (tuplehash->tuple.dir)
-			continue;
-
-		flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
-
-		if (nf_flow_has_expired(flow) ||
-		    nf_flow_is_dying(flow))
-			flow_offload_del(flow_table, flow);
-	}
-out:
-	rhashtable_walk_stop(&hti);
-	rhashtable_walk_exit(&hti);
-
-	return 1;
-}
-
-void nf_flow_offload_work_gc(struct work_struct *work)
-{
-	struct nf_flowtable *flow_table;
-
-	flow_table = container_of(work, struct nf_flowtable, gc_work.work);
-	nf_flow_offload_gc_step(flow_table);
-	queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
-}
-EXPORT_SYMBOL_GPL(nf_flow_offload_work_gc);
-
-static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
-{
-	const struct flow_offload_tuple *tuple = data;
-
-	return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
-}
-
-static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
-{
-	const struct flow_offload_tuple_rhash *tuplehash = data;
-
-	return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
-}
-
-static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
-					const void *ptr)
-{
-	const struct flow_offload_tuple *tuple = arg->key;
-	const struct flow_offload_tuple_rhash *x = ptr;
-
-	if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
-		return 1;
-
-	return 0;
-}
-
-const struct rhashtable_params nf_flow_offload_rhash_params = {
-	.head_offset		= offsetof(struct flow_offload_tuple_rhash, node),
-	.hashfn			= flow_offload_hash,
-	.obj_hashfn		= flow_offload_hash_obj,
-	.obj_cmpfn		= flow_offload_hash_cmp,
-	.automatic_shrinking	= true,
-};
-EXPORT_SYMBOL_GPL(nf_flow_offload_rhash_params);
-
-static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
-				__be16 port, __be16 new_port)
-{
-	struct tcphdr *tcph;
-
-	if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
-	    skb_try_make_writable(skb, thoff + sizeof(*tcph)))
-		return -1;
-
-	tcph = (void *)(skb_network_header(skb) + thoff);
-	inet_proto_csum_replace2(&tcph->check, skb, port, new_port, true);
-
-	return 0;
-}
-
-static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
-				__be16 port, __be16 new_port)
-{
-	struct udphdr *udph;
-
-	if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
-	    skb_try_make_writable(skb, thoff + sizeof(*udph)))
-		return -1;
-
-	udph = (void *)(skb_network_header(skb) + thoff);
-	if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
-		inet_proto_csum_replace2(&udph->check, skb, port,
-					 new_port, true);
-		if (!udph->check)
-			udph->check = CSUM_MANGLED_0;
-	}
-
-	return 0;
-}
-
-static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
-			    u8 protocol, __be16 port, __be16 new_port)
-{
-	switch (protocol) {
-	case IPPROTO_TCP:
-		if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0)
-			return NF_DROP;
-		break;
-	case IPPROTO_UDP:
-		if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0)
-			return NF_DROP;
-		break;
-	}
-
-	return 0;
-}
-
-int nf_flow_snat_port(const struct flow_offload *flow,
-		      struct sk_buff *skb, unsigned int thoff,
-		      u8 protocol, enum flow_offload_tuple_dir dir)
-{
-	struct flow_ports *hdr;
-	__be16 port, new_port;
-
-	if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
-	    skb_try_make_writable(skb, thoff + sizeof(*hdr)))
-		return -1;
-
-	hdr = (void *)(skb_network_header(skb) + thoff);
-
-	switch (dir) {
-	case FLOW_OFFLOAD_DIR_ORIGINAL:
-		port = hdr->source;
-		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port;
-		hdr->source = new_port;
-		break;
-	case FLOW_OFFLOAD_DIR_REPLY:
-		port = hdr->dest;
-		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
-		hdr->dest = new_port;
-		break;
-	default:
-		return -1;
-	}
-
-	return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
-}
-EXPORT_SYMBOL_GPL(nf_flow_snat_port);
-
-int nf_flow_dnat_port(const struct flow_offload *flow,
-		      struct sk_buff *skb, unsigned int thoff,
-		      u8 protocol, enum flow_offload_tuple_dir dir)
-{
-	struct flow_ports *hdr;
-	__be16 port, new_port;
-
-	if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
-	    skb_try_make_writable(skb, thoff + sizeof(*hdr)))
-		return -1;
-
-	hdr = (void *)(skb_network_header(skb) + thoff);
-
-	switch (dir) {
-	case FLOW_OFFLOAD_DIR_ORIGINAL:
-		port = hdr->dest;
-		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port;
-		hdr->dest = new_port;
-		break;
-	case FLOW_OFFLOAD_DIR_REPLY:
-		port = hdr->source;
-		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
-		hdr->source = new_port;
-		break;
-	default:
-		return -1;
-	}
-
-	return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
-}
-EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
-
-static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
-{
-	struct net_device *dev = data;
-
-	if (dev && flow->tuplehash[0].tuple.iifidx != dev->ifindex)
-		return;
-
-	flow_offload_dead(flow);
-}
-
-static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable,
-					  void *data)
-{
-	nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, data);
-	flush_delayed_work(&flowtable->gc_work);
-}
-
-void nf_flow_table_cleanup(struct net *net, struct net_device *dev)
-{
-	nft_flow_table_iterate(net, nf_flow_table_iterate_cleanup, dev);
-}
-EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
-
-void nf_flow_table_free(struct nf_flowtable *flow_table)
-{
-	nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
-	WARN_ON(!nf_flow_offload_gc_step(flow_table));
-}
-EXPORT_SYMBOL_GPL(nf_flow_table_free);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
new file mode 100644
index 000000000000..7403a0dfddf7
--- /dev/null
+++ b/net/netfilter/nf_flow_table_core.c
@@ -0,0 +1,432 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rhashtable.h>
+#include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/ip6_route.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_flow_table.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+
+struct flow_offload_entry {
+	struct flow_offload	flow;
+	struct nf_conn		*ct;
+	struct rcu_head		rcu_head;
+};
+
+static void
+flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
+		      struct nf_flow_route *route,
+		      enum flow_offload_tuple_dir dir)
+{
+	struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
+	struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple;
+	struct dst_entry *dst = route->tuple[dir].dst;
+
+	ft->dir = dir;
+
+	switch (ctt->src.l3num) {
+	case NFPROTO_IPV4:
+		ft->src_v4 = ctt->src.u3.in;
+		ft->dst_v4 = ctt->dst.u3.in;
+		ft->mtu = ip_dst_mtu_maybe_forward(dst, true);
+		break;
+	case NFPROTO_IPV6:
+		ft->src_v6 = ctt->src.u3.in6;
+		ft->dst_v6 = ctt->dst.u3.in6;
+		ft->mtu = ip6_dst_mtu_forward(dst);
+		break;
+	}
+
+	ft->l3proto = ctt->src.l3num;
+	ft->l4proto = ctt->dst.protonum;
+	ft->src_port = ctt->src.u.tcp.port;
+	ft->dst_port = ctt->dst.u.tcp.port;
+
+	ft->iifidx = route->tuple[dir].ifindex;
+	ft->oifidx = route->tuple[!dir].ifindex;
+	ft->dst_cache = dst;
+}
+
+struct flow_offload *
+flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
+{
+	struct flow_offload_entry *entry;
+	struct flow_offload *flow;
+
+	if (unlikely(nf_ct_is_dying(ct) ||
+	    !atomic_inc_not_zero(&ct->ct_general.use)))
+		return NULL;
+
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+	if (!entry)
+		goto err_ct_refcnt;
+
+	flow = &entry->flow;
+
+	if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst))
+		goto err_dst_cache_original;
+
+	if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst))
+		goto err_dst_cache_reply;
+
+	entry->ct = ct;
+
+	flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_ORIGINAL);
+	flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_REPLY);
+
+	if (ct->status & IPS_SRC_NAT)
+		flow->flags |= FLOW_OFFLOAD_SNAT;
+	else if (ct->status & IPS_DST_NAT)
+		flow->flags |= FLOW_OFFLOAD_DNAT;
+
+	return flow;
+
+err_dst_cache_reply:
+	dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst);
+err_dst_cache_original:
+	kfree(entry);
+err_ct_refcnt:
+	nf_ct_put(ct);
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(flow_offload_alloc);
+
+void flow_offload_free(struct flow_offload *flow)
+{
+	struct flow_offload_entry *e;
+
+	dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
+	dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
+	e = container_of(flow, struct flow_offload_entry, flow);
+	nf_ct_delete(e->ct, 0, 0);
+	nf_ct_put(e->ct);
+	kfree_rcu(e, rcu_head);
+}
+EXPORT_SYMBOL_GPL(flow_offload_free);
+
+void flow_offload_dead(struct flow_offload *flow)
+{
+	flow->flags |= FLOW_OFFLOAD_DYING;
+}
+EXPORT_SYMBOL_GPL(flow_offload_dead);
+
+int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
+{
+	flow->timeout = (u32)jiffies;
+
+	rhashtable_insert_fast(&flow_table->rhashtable,
+			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
+			       *flow_table->type->params);
+	rhashtable_insert_fast(&flow_table->rhashtable,
+			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
+			       *flow_table->type->params);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(flow_offload_add);
+
+static void flow_offload_del(struct nf_flowtable *flow_table,
+			     struct flow_offload *flow)
+{
+	rhashtable_remove_fast(&flow_table->rhashtable,
+			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
+			       *flow_table->type->params);
+	rhashtable_remove_fast(&flow_table->rhashtable,
+			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
+			       *flow_table->type->params);
+
+	flow_offload_free(flow);
+}
+
+struct flow_offload_tuple_rhash *
+flow_offload_lookup(struct nf_flowtable *flow_table,
+		    struct flow_offload_tuple *tuple)
+{
+	return rhashtable_lookup_fast(&flow_table->rhashtable, tuple,
+				      *flow_table->type->params);
+}
+EXPORT_SYMBOL_GPL(flow_offload_lookup);
+
+int nf_flow_table_iterate(struct nf_flowtable *flow_table,
+			  void (*iter)(struct flow_offload *flow, void *data),
+			  void *data)
+{
+	struct flow_offload_tuple_rhash *tuplehash;
+	struct rhashtable_iter hti;
+	struct flow_offload *flow;
+	int err;
+
+	err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL);
+	if (err)
+		return err;
+
+	rhashtable_walk_start(&hti);
+
+	while ((tuplehash = rhashtable_walk_next(&hti))) {
+		if (IS_ERR(tuplehash)) {
+			err = PTR_ERR(tuplehash);
+			if (err != -EAGAIN)
+				goto out;
+
+			continue;
+		}
+		if (tuplehash->tuple.dir)
+			continue;
+
+		flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
+
+		iter(flow, data);
+	}
+out:
+	rhashtable_walk_stop(&hti);
+	rhashtable_walk_exit(&hti);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(nf_flow_table_iterate);
+
+static inline bool nf_flow_has_expired(const struct flow_offload *flow)
+{
+	return (__s32)(flow->timeout - (u32)jiffies) <= 0;
+}
+
+static inline bool nf_flow_is_dying(const struct flow_offload *flow)
+{
+	return flow->flags & FLOW_OFFLOAD_DYING;
+}
+
+static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table)
+{
+	struct flow_offload_tuple_rhash *tuplehash;
+	struct rhashtable_iter hti;
+	struct flow_offload *flow;
+	int err;
+
+	err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL);
+	if (err)
+		return 0;
+
+	rhashtable_walk_start(&hti);
+
+	while ((tuplehash = rhashtable_walk_next(&hti))) {
+		if (IS_ERR(tuplehash)) {
+			err = PTR_ERR(tuplehash);
+			if (err != -EAGAIN)
+				goto out;
+
+			continue;
+		}
+		if (tuplehash->tuple.dir)
+			continue;
+
+		flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
+
+		if (nf_flow_has_expired(flow) ||
+		    nf_flow_is_dying(flow))
+			flow_offload_del(flow_table, flow);
+	}
+out:
+	rhashtable_walk_stop(&hti);
+	rhashtable_walk_exit(&hti);
+
+	return 1;
+}
+
+void nf_flow_offload_work_gc(struct work_struct *work)
+{
+	struct nf_flowtable *flow_table;
+
+	flow_table = container_of(work, struct nf_flowtable, gc_work.work);
+	nf_flow_offload_gc_step(flow_table);
+	queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
+}
+EXPORT_SYMBOL_GPL(nf_flow_offload_work_gc);
+
+static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
+{
+	const struct flow_offload_tuple *tuple = data;
+
+	return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
+}
+
+static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
+{
+	const struct flow_offload_tuple_rhash *tuplehash = data;
+
+	return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
+}
+
+static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
+					const void *ptr)
+{
+	const struct flow_offload_tuple *tuple = arg->key;
+	const struct flow_offload_tuple_rhash *x = ptr;
+
+	if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
+		return 1;
+
+	return 0;
+}
+
+const struct rhashtable_params nf_flow_offload_rhash_params = {
+	.head_offset		= offsetof(struct flow_offload_tuple_rhash, node),
+	.hashfn			= flow_offload_hash,
+	.obj_hashfn		= flow_offload_hash_obj,
+	.obj_cmpfn		= flow_offload_hash_cmp,
+	.automatic_shrinking	= true,
+};
+EXPORT_SYMBOL_GPL(nf_flow_offload_rhash_params);
+
+static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
+				__be16 port, __be16 new_port)
+{
+	struct tcphdr *tcph;
+
+	if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
+	    skb_try_make_writable(skb, thoff + sizeof(*tcph)))
+		return -1;
+
+	tcph = (void *)(skb_network_header(skb) + thoff);
+	inet_proto_csum_replace2(&tcph->check, skb, port, new_port, true);
+
+	return 0;
+}
+
+static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
+				__be16 port, __be16 new_port)
+{
+	struct udphdr *udph;
+
+	if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
+	    skb_try_make_writable(skb, thoff + sizeof(*udph)))
+		return -1;
+
+	udph = (void *)(skb_network_header(skb) + thoff);
+	if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+		inet_proto_csum_replace2(&udph->check, skb, port,
+					 new_port, true);
+		if (!udph->check)
+			udph->check = CSUM_MANGLED_0;
+	}
+
+	return 0;
+}
+
+static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
+			    u8 protocol, __be16 port, __be16 new_port)
+{
+	switch (protocol) {
+	case IPPROTO_TCP:
+		if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0)
+			return NF_DROP;
+		break;
+	case IPPROTO_UDP:
+		if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0)
+			return NF_DROP;
+		break;
+	}
+
+	return 0;
+}
+
+int nf_flow_snat_port(const struct flow_offload *flow,
+		      struct sk_buff *skb, unsigned int thoff,
+		      u8 protocol, enum flow_offload_tuple_dir dir)
+{
+	struct flow_ports *hdr;
+	__be16 port, new_port;
+
+	if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
+	    skb_try_make_writable(skb, thoff + sizeof(*hdr)))
+		return -1;
+
+	hdr = (void *)(skb_network_header(skb) + thoff);
+
+	switch (dir) {
+	case FLOW_OFFLOAD_DIR_ORIGINAL:
+		port = hdr->source;
+		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port;
+		hdr->source = new_port;
+		break;
+	case FLOW_OFFLOAD_DIR_REPLY:
+		port = hdr->dest;
+		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
+		hdr->dest = new_port;
+		break;
+	default:
+		return -1;
+	}
+
+	return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
+}
+EXPORT_SYMBOL_GPL(nf_flow_snat_port);
+
+int nf_flow_dnat_port(const struct flow_offload *flow,
+		      struct sk_buff *skb, unsigned int thoff,
+		      u8 protocol, enum flow_offload_tuple_dir dir)
+{
+	struct flow_ports *hdr;
+	__be16 port, new_port;
+
+	if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
+	    skb_try_make_writable(skb, thoff + sizeof(*hdr)))
+		return -1;
+
+	hdr = (void *)(skb_network_header(skb) + thoff);
+
+	switch (dir) {
+	case FLOW_OFFLOAD_DIR_ORIGINAL:
+		port = hdr->dest;
+		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port;
+		hdr->dest = new_port;
+		break;
+	case FLOW_OFFLOAD_DIR_REPLY:
+		port = hdr->source;
+		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
+		hdr->source = new_port;
+		break;
+	default:
+		return -1;
+	}
+
+	return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
+}
+EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
+
+static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
+{
+	struct net_device *dev = data;
+
+	if (dev && flow->tuplehash[0].tuple.iifidx != dev->ifindex)
+		return;
+
+	flow_offload_dead(flow);
+}
+
+static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable,
+					  void *data)
+{
+	nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, data);
+	flush_delayed_work(&flowtable->gc_work);
+}
+
+void nf_flow_table_cleanup(struct net *net, struct net_device *dev)
+{
+	nft_flow_table_iterate(net, nf_flow_table_iterate_cleanup, dev);
+}
+EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
+
+void nf_flow_table_free(struct nf_flowtable *flow_table)
+{
+	nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
+	WARN_ON(!nf_flow_offload_gc_step(flow_table));
+}
+EXPORT_SYMBOL_GPL(nf_flow_table_free);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
-- 
cgit v1.2.3-59-g8ed1b


From 6a3e030f08e1b700aa6d1ebdc7ebe4c44a2ef67a Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 20 Apr 2018 15:37:57 -0700
Subject: net/ipv6: Clean up rt expires helpers

rt6_clean_expires and rt6_set_expires are no longer used. Removed them.
rt6_update_expires has 1 caller in route.c, so move it from the header.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 21 ---------------------
 net/ipv6/route.c      |  9 +++++++++
 2 files changed, 9 insertions(+), 21 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 642211174692..327a74cd6a0d 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -223,27 +223,6 @@ static inline bool fib6_check_expired(const struct fib6_info *f6i)
 	return false;
 }
 
-static inline void rt6_clean_expires(struct rt6_info *rt)
-{
-	rt->rt6i_flags &= ~RTF_EXPIRES;
-	rt->dst.expires = 0;
-}
-
-static inline void rt6_set_expires(struct rt6_info *rt, unsigned long expires)
-{
-	rt->dst.expires = expires;
-	rt->rt6i_flags |= RTF_EXPIRES;
-}
-
-static inline void rt6_update_expires(struct rt6_info *rt0, int timeout)
-{
-	if (!(rt0->rt6i_flags & RTF_EXPIRES) && rt0->from)
-		rt0->dst.expires = rt0->from->expires;
-
-	dst_set_expires(&rt0->dst, timeout);
-	rt0->rt6i_flags |= RTF_EXPIRES;
-}
-
 /* Function to safely get fn->sernum for passed in rt
  * and store result in passed in cookie.
  * Return true if we can get cookie safely
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 884d9cee790f..062dd4d8232c 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2221,6 +2221,15 @@ static void ip6_link_failure(struct sk_buff *skb)
 	}
 }
 
+static void rt6_update_expires(struct rt6_info *rt0, int timeout)
+{
+	if (!(rt0->rt6i_flags & RTF_EXPIRES) && rt0->from)
+		rt0->dst.expires = rt0->from->expires;
+
+	dst_set_expires(&rt0->dst, timeout);
+	rt0->rt6i_flags |= RTF_EXPIRES;
+}
+
 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
 {
 	struct net *net = dev_net(rt->dst.dev);
-- 
cgit v1.2.3-59-g8ed1b


From a269f1a764bb3abf5c73499fb74bc7ab1c2e986c Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 20 Apr 2018 15:37:58 -0700
Subject: net/ipv6: Rename rt6_get_cookie_safe

rt6_get_cookie_safe takes a fib6_info and checks the sernum of
the node. Update the name to reflect its purpose.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 6 +++---
 net/ipv6/route.c      | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 327a74cd6a0d..dd1481ed8bdb 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -228,8 +228,8 @@ static inline bool fib6_check_expired(const struct fib6_info *f6i)
  * Return true if we can get cookie safely
  * Return false if not
  */
-static inline bool rt6_get_cookie_safe(const struct fib6_info *f6i,
-				       u32 *cookie)
+static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i,
+					u32 *cookie)
 {
 	struct fib6_node *fn;
 	bool status = false;
@@ -254,7 +254,7 @@ static inline u32 rt6_get_cookie(const struct rt6_info *rt)
 
 	if (rt->rt6i_flags & RTF_PCPU ||
 	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
-		rt6_get_cookie_safe(rt->from, &cookie);
+		fib6_get_cookie_safe(rt->from, &cookie);
 
 	return cookie;
 }
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 062dd4d8232c..2d6fcfe11c82 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2128,7 +2128,7 @@ static bool fib6_check(struct fib6_info *f6i, u32 cookie)
 {
 	u32 rt_cookie = 0;
 
-	if ((f6i && !rt6_get_cookie_safe(f6i, &rt_cookie)) ||
+	if ((f6i && !fib6_get_cookie_safe(f6i, &rt_cookie)) ||
 	     rt_cookie != cookie)
 		return false;
 
@@ -2142,7 +2142,7 @@ static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
 {
 	u32 rt_cookie = 0;
 
-	if ((rt->from && !rt6_get_cookie_safe(rt->from, &rt_cookie)) ||
+	if ((rt->from && !fib6_get_cookie_safe(rt->from, &rt_cookie)) ||
 	    rt_cookie != cookie)
 		return NULL;
 
-- 
cgit v1.2.3-59-g8ed1b


From 4d85cd0c2a1ce22a9aca1fa668ba11043c834bbc Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 20 Apr 2018 15:37:59 -0700
Subject: net/ipv6: Move rcu_read_lock to callers of ip6_rt_cache_alloc

A later patch protects 'from' in rt6_info and this simplifies the
locking needed by it.

With the move, the fib6_info_hold for the uncached_rt is no longer
needed since the rcu_lock is still held.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 2d6fcfe11c82..f8b22183b7fd 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1164,10 +1164,8 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
 	 *	Clone the route.
 	 */
 
-	rcu_read_lock();
 	dev = ip6_rt_get_dev_rcu(ort);
 	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
-	rcu_read_unlock();
 	if (!rt)
 		return NULL;
 
@@ -1855,14 +1853,11 @@ redo_rt6_select:
 		 * the daddr in the skb during the neighbor look-up is different
 		 * from the fl6->daddr used to look-up route here.
 		 */
-
 		struct rt6_info *uncached_rt;
 
-		fib6_info_hold(f6i);
-		rcu_read_unlock();
-
 		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
-		fib6_info_release(f6i);
+
+		rcu_read_unlock();
 
 		if (uncached_rt) {
 			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
@@ -2280,7 +2275,9 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
 	} else if (daddr) {
 		struct rt6_info *nrt6;
 
+		rcu_read_lock();
 		nrt6 = ip6_rt_cache_alloc(rt6->from, daddr, saddr);
+		rcu_read_unlock();
 		if (nrt6) {
 			rt6_do_update_pmtu(nrt6, mtu);
 			if (rt6_insert_exception(nrt6, rt6->from))
@@ -3299,7 +3296,9 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 				     NEIGH_UPDATE_F_ISROUTER)),
 		     NDISC_REDIRECT, &ndopts);
 
+	rcu_read_lock();
 	nrt = ip6_rt_cache_alloc(rt->from, &msg->dest, NULL);
+	rcu_read_unlock();
 	if (!nrt)
 		goto out;
 
-- 
cgit v1.2.3-59-g8ed1b


From a87b7dc9f7fa7c87cbd575361553e8e9e4ab4473 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 20 Apr 2018 15:38:00 -0700
Subject: net/ipv6: Move rcu locking to callers of fib6_get_cookie_safe

A later patch protects 'from' in rt6_info and this simplifies the
locking needed by it.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  6 ++++--
 net/ipv6/route.c      | 13 ++++++++++---
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index dd1481ed8bdb..dc3505fb62b3 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -234,7 +234,6 @@ static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i,
 	struct fib6_node *fn;
 	bool status = false;
 
-	rcu_read_lock();
 	fn = rcu_dereference(f6i->fib6_node);
 
 	if (fn) {
@@ -244,7 +243,6 @@ static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i,
 		status = true;
 	}
 
-	rcu_read_unlock();
 	return status;
 }
 
@@ -252,10 +250,14 @@ static inline u32 rt6_get_cookie(const struct rt6_info *rt)
 {
 	u32 cookie = 0;
 
+	rcu_read_lock();
+
 	if (rt->rt6i_flags & RTF_PCPU ||
 	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
 		fib6_get_cookie_safe(rt->from, &cookie);
 
+	rcu_read_unlock();
+
 	return cookie;
 }
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f8b22183b7fd..810a37ac043b 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2159,9 +2159,12 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
 
 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
 {
+	struct dst_entry *dst_ret;
 	struct rt6_info *rt;
 
-	rt = (struct rt6_info *) dst;
+	rt = container_of(dst, struct rt6_info, dst);
+
+	rcu_read_lock();
 
 	/* All IPV6 dsts are created with ->obsolete set to the value
 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
@@ -2170,9 +2173,13 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
 
 	if (rt->rt6i_flags & RTF_PCPU ||
 	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
-		return rt6_dst_from_check(rt, cookie);
+		dst_ret = rt6_dst_from_check(rt, cookie);
 	else
-		return rt6_check(rt, cookie);
+		dst_ret = rt6_check(rt, cookie);
+
+	rcu_read_unlock();
+
+	return dst_ret;
 }
 
 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
-- 
cgit v1.2.3-59-g8ed1b


From 5bcaa41b964ba2cda6dde64a126bfb7da1bd88f8 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 20 Apr 2018 15:38:01 -0700
Subject: net/ipv6: Move release of fib6_info from pcpu routes to helper

Code move only; no functional change intended.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 41 +++++++++++++++++++++++------------------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index f936e91a8c65..64ca02b7745f 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -860,6 +860,27 @@ insert_above:
 	return ln;
 }
 
+static void fib6_drop_pcpu_from(struct fib6_info *f6i,
+				const struct fib6_table *table)
+{
+	int cpu;
+
+	/* release the reference to this fib entry from
+	 * all of its cached pcpu routes
+	 */
+	for_each_possible_cpu(cpu) {
+		struct rt6_info **ppcpu_rt;
+		struct rt6_info *pcpu_rt;
+
+		ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
+		pcpu_rt = *ppcpu_rt;
+		if (pcpu_rt) {
+			fib6_info_release(pcpu_rt->from);
+			pcpu_rt->from = NULL;
+		}
+	}
+}
+
 static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
 			  struct net *net)
 {
@@ -887,24 +908,8 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
 				    lockdep_is_held(&table->tb6_lock));
 		}
 
-		if (rt->rt6i_pcpu) {
-			int cpu;
-
-			/* release the reference to this fib entry from
-			 * all of its cached pcpu routes
-			 */
-			for_each_possible_cpu(cpu) {
-				struct rt6_info **ppcpu_rt;
-				struct rt6_info *pcpu_rt;
-
-				ppcpu_rt = per_cpu_ptr(rt->rt6i_pcpu, cpu);
-				pcpu_rt = *ppcpu_rt;
-				if (pcpu_rt) {
-					fib6_info_release(pcpu_rt->from);
-					pcpu_rt->from = NULL;
-				}
-			}
-		}
+		if (rt->rt6i_pcpu)
+			fib6_drop_pcpu_from(rt, table);
 	}
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From a68886a691804d3f6d479ebf6825480fbafb6a00 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 20 Apr 2018 15:38:02 -0700
Subject: net/ipv6: Make from in rt6_info rcu protected

When a dst entry is created from a fib entry, the 'from' in rt6_info
is set to the fib entry. The 'from' reference is used most notably for
cookie checking - making sure stale dst entries are updated if the
fib entry is changed.

When a fib entry is deleted, the pcpu routes on it are walked releasing
the fib6_info reference. This is needed for the fib6_info cleanup to
happen and to make sure all device references are released in a timely
manner.

There is a race window when a FIB entry is deleted and the 'from' on the
pcpu route is dropped and the pcpu route hits a cookie check. Handle
this race using rcu on from.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 10 +++---
 net/ipv6/ip6_fib.c    |  8 +++--
 net/ipv6/ip6_output.c |  9 +++--
 net/ipv6/route.c      | 96 ++++++++++++++++++++++++++++++++++++---------------
 4 files changed, 88 insertions(+), 35 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index dc3505fb62b3..1af450d4e923 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -174,7 +174,7 @@ struct fib6_info {
 
 struct rt6_info {
 	struct dst_entry		dst;
-	struct fib6_info		*from;
+	struct fib6_info __rcu		*from;
 
 	struct rt6key			rt6i_dst;
 	struct rt6key			rt6i_src;
@@ -248,13 +248,15 @@ static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i,
 
 static inline u32 rt6_get_cookie(const struct rt6_info *rt)
 {
+	struct fib6_info *from;
 	u32 cookie = 0;
 
 	rcu_read_lock();
 
-	if (rt->rt6i_flags & RTF_PCPU ||
-	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
-		fib6_get_cookie_safe(rt->from, &cookie);
+	from = rcu_dereference(rt->from);
+	if (from && (rt->rt6i_flags & RTF_PCPU ||
+	    unlikely(!list_empty(&rt->rt6i_uncached))))
+		fib6_get_cookie_safe(from, &cookie);
 
 	rcu_read_unlock();
 
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 64ca02b7745f..6421c893466e 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -875,8 +875,12 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i,
 		ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
 		pcpu_rt = *ppcpu_rt;
 		if (pcpu_rt) {
-			fib6_info_release(pcpu_rt->from);
-			pcpu_rt->from = NULL;
+			struct fib6_info *from;
+
+			from = rcu_dereference_protected(pcpu_rt->from,
+					     lockdep_is_held(&table->tb6_lock));
+			rcu_assign_pointer(pcpu_rt->from, NULL);
+			fib6_info_release(from);
 		}
 	}
 }
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 3db47986ef38..6a477d54f8c7 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -962,16 +962,21 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
 	 * that's why we try it again later.
 	 */
 	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
+		struct fib6_info *from;
 		struct rt6_info *rt;
 		bool had_dst = *dst != NULL;
 
 		if (!had_dst)
 			*dst = ip6_route_output(net, sk, fl6);
 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
-		err = ip6_route_get_saddr(net, rt ? rt->from : NULL,
-					  &fl6->daddr,
+
+		rcu_read_lock();
+		from = rt ? rcu_dereference(rt->from) : NULL;
+		err = ip6_route_get_saddr(net, from, &fl6->daddr,
 					  sk ? inet6_sk(sk)->srcprefs : 0,
 					  &fl6->saddr);
+		rcu_read_unlock();
+
 		if (err)
 			goto out_err_release;
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 810a37ac043b..004d00fe2fe5 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -359,7 +359,7 @@ EXPORT_SYMBOL(ip6_dst_alloc);
 static void ip6_dst_destroy(struct dst_entry *dst)
 {
 	struct rt6_info *rt = (struct rt6_info *)dst;
-	struct fib6_info *from = rt->from;
+	struct fib6_info *from;
 	struct inet6_dev *idev;
 
 	dst_destroy_metrics_generic(dst);
@@ -371,8 +371,11 @@ static void ip6_dst_destroy(struct dst_entry *dst)
 		in6_dev_put(idev);
 	}
 
-	rt->from = NULL;
+	rcu_read_lock();
+	from = rcu_dereference(rt->from);
+	rcu_assign_pointer(rt->from, NULL);
 	fib6_info_release(from);
+	rcu_read_unlock();
 }
 
 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
@@ -402,12 +405,16 @@ static bool __rt6_check_expired(const struct rt6_info *rt)
 
 static bool rt6_check_expired(const struct rt6_info *rt)
 {
+	struct fib6_info *from;
+
+	from = rcu_dereference(rt->from);
+
 	if (rt->rt6i_flags & RTF_EXPIRES) {
 		if (time_after(jiffies, rt->dst.expires))
 			return true;
-	} else if (rt->from) {
+	} else if (from) {
 		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
-			fib6_check_expired(rt->from);
+			fib6_check_expired(from);
 	}
 	return false;
 }
@@ -963,7 +970,7 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
 {
 	rt->rt6i_flags &= ~RTF_EXPIRES;
 	fib6_info_hold(from);
-	rt->from = from;
+	rcu_assign_pointer(rt->from, from);
 	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
 	if (from->fib6_metrics != &dst_default_metrics) {
 		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
@@ -2133,11 +2140,13 @@ static bool fib6_check(struct fib6_info *f6i, u32 cookie)
 	return true;
 }
 
-static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
+static struct dst_entry *rt6_check(struct rt6_info *rt,
+				   struct fib6_info *from,
+				   u32 cookie)
 {
 	u32 rt_cookie = 0;
 
-	if ((rt->from && !fib6_get_cookie_safe(rt->from, &rt_cookie)) ||
+	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
 	    rt_cookie != cookie)
 		return NULL;
 
@@ -2147,11 +2156,13 @@ static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
 	return &rt->dst;
 }
 
-static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
+static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
+					    struct fib6_info *from,
+					    u32 cookie)
 {
 	if (!__rt6_check_expired(rt) &&
 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
-	    fib6_check(rt->from, cookie))
+	    fib6_check(from, cookie))
 		return &rt->dst;
 	else
 		return NULL;
@@ -2160,6 +2171,7 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
 {
 	struct dst_entry *dst_ret;
+	struct fib6_info *from;
 	struct rt6_info *rt;
 
 	rt = container_of(dst, struct rt6_info, dst);
@@ -2171,11 +2183,13 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
 	 * into this function always.
 	 */
 
-	if (rt->rt6i_flags & RTF_PCPU ||
-	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
-		dst_ret = rt6_dst_from_check(rt, cookie);
+	from = rcu_dereference(rt->from);
+
+	if (from && (rt->rt6i_flags & RTF_PCPU ||
+	    unlikely(!list_empty(&rt->rt6i_uncached))))
+		dst_ret = rt6_dst_from_check(rt, from, cookie);
 	else
-		dst_ret = rt6_check(rt, cookie);
+		dst_ret = rt6_check(rt, from, cookie);
 
 	rcu_read_unlock();
 
@@ -2211,13 +2225,17 @@ static void ip6_link_failure(struct sk_buff *skb)
 		if (rt->rt6i_flags & RTF_CACHE) {
 			if (dst_hold_safe(&rt->dst))
 				rt6_remove_exception_rt(rt);
-		} else if (rt->from) {
+		} else {
+			struct fib6_info *from;
 			struct fib6_node *fn;
 
 			rcu_read_lock();
-			fn = rcu_dereference(rt->from->fib6_node);
-			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
-				fn->fn_sernum = -1;
+			from = rcu_dereference(rt->from);
+			if (from) {
+				fn = rcu_dereference(from->fib6_node);
+				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
+					fn->fn_sernum = -1;
+			}
 			rcu_read_unlock();
 		}
 	}
@@ -2225,8 +2243,15 @@ static void ip6_link_failure(struct sk_buff *skb)
 
 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
 {
-	if (!(rt0->rt6i_flags & RTF_EXPIRES) && rt0->from)
-		rt0->dst.expires = rt0->from->expires;
+	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
+		struct fib6_info *from;
+
+		rcu_read_lock();
+		from = rcu_dereference(rt0->from);
+		if (from)
+			rt0->dst.expires = from->expires;
+		rcu_read_unlock();
+	}
 
 	dst_set_expires(&rt0->dst, timeout);
 	rt0->rt6i_flags |= RTF_EXPIRES;
@@ -2243,8 +2268,14 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
 
 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
 {
+	bool from_set;
+
+	rcu_read_lock();
+	from_set = !!rcu_dereference(rt->from);
+	rcu_read_unlock();
+
 	return !(rt->rt6i_flags & RTF_CACHE) &&
-		(rt->rt6i_flags & RTF_PCPU || rt->from);
+		(rt->rt6i_flags & RTF_PCPU || from_set);
 }
 
 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
@@ -2280,16 +2311,18 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
 		if (rt6->rt6i_flags & RTF_CACHE)
 			rt6_update_exception_stamp_rt(rt6);
 	} else if (daddr) {
+		struct fib6_info *from;
 		struct rt6_info *nrt6;
 
 		rcu_read_lock();
-		nrt6 = ip6_rt_cache_alloc(rt6->from, daddr, saddr);
-		rcu_read_unlock();
+		from = rcu_dereference(rt6->from);
+		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
 		if (nrt6) {
 			rt6_do_update_pmtu(nrt6, mtu);
-			if (rt6_insert_exception(nrt6, rt6->from))
+			if (rt6_insert_exception(nrt6, from))
 				dst_release_immediate(&nrt6->dst);
 		}
+		rcu_read_unlock();
 	}
 }
 
@@ -3222,6 +3255,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 	struct ndisc_options ndopts;
 	struct inet6_dev *in6_dev;
 	struct neighbour *neigh;
+	struct fib6_info *from;
 	struct rd_msg *msg;
 	int optlen, on_link;
 	u8 *lladdr;
@@ -3304,7 +3338,8 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 		     NDISC_REDIRECT, &ndopts);
 
 	rcu_read_lock();
-	nrt = ip6_rt_cache_alloc(rt->from, &msg->dest, NULL);
+	from = rcu_dereference(rt->from);
+	nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
 	rcu_read_unlock();
 	if (!nrt)
 		goto out;
@@ -4687,6 +4722,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	struct net *net = sock_net(in_skb->sk);
 	struct nlattr *tb[RTA_MAX+1];
 	int err, iif = 0, oif = 0;
+	struct fib6_info *from;
 	struct dst_entry *dst;
 	struct rt6_info *rt;
 	struct sk_buff *skb;
@@ -4783,15 +4819,21 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	}
 
 	skb_dst_set(skb, &rt->dst);
+
+	rcu_read_lock();
+	from = rcu_dereference(rt->from);
+
 	if (fibmatch)
-		err = rt6_fill_node(net, skb, rt->from, NULL, NULL, NULL, iif,
+		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
 				    nlh->nlmsg_seq, 0);
 	else
-		err = rt6_fill_node(net, skb, rt->from, dst,
-				    &fl6.daddr, &fl6.saddr, iif, RTM_NEWROUTE,
+		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
+				    &fl6.saddr, iif, RTM_NEWROUTE,
 				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
 				    0);
+	rcu_read_unlock();
+
 	if (err < 0) {
 		kfree_skb(skb);
 		goto errout;
-- 
cgit v1.2.3-59-g8ed1b


From 8ae869714ba377c93d5dc546b97c0aeaba90b3c9 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 20 Apr 2018 15:38:03 -0700
Subject: net/ipv6: Remove unncessary check on f6i in fib6_check

Dan reported an imbalance in fib6_check on use of f6i and checking
whether it is null. Since fib6_check is only called if f6i is non-null,
remove the unnecessary check.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 004d00fe2fe5..0407bbc5a028 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2130,8 +2130,7 @@ static bool fib6_check(struct fib6_info *f6i, u32 cookie)
 {
 	u32 rt_cookie = 0;
 
-	if ((f6i && !fib6_get_cookie_safe(f6i, &rt_cookie)) ||
-	     rt_cookie != cookie)
+	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
 		return false;
 
 	if (fib6_check_expired(f6i))
-- 
cgit v1.2.3-59-g8ed1b


From f546a99d62e5d4550a73c2ca0af51212a4fd46df Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <sw@simonwunderlich.de>
Date: Thu, 19 Apr 2018 19:23:35 +0200
Subject: batman-adv: Start new development cycle

Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/main.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 057a28a9fe88..8da3c9336111 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -25,7 +25,7 @@
 #define BATADV_DRIVER_DEVICE "batman-adv"
 
 #ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2018.1"
+#define BATADV_SOURCE_VERSION "2018.2"
 #endif
 
 /* B.A.T.M.A.N. parameters */
-- 
cgit v1.2.3-59-g8ed1b


From 1ba93211e2042bdbecc54971929a97f2fb0ed25e Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 18 Mar 2018 10:34:29 +0100
Subject: batman-adv: Disable CONFIG_BATMAN_ADV_DEBUGFS by default

All tools which were known to the batman-adv development team are
supporting the batman-adv netlink interface since a while. Also debugfs is
not supported for batman-adv interfaces in any non-default netns. Thus
disabling CONFIG_BATMAN_ADV_DEBUGFS by default should not cause problems on
most systems. It is still possible to enable it in case it is still
required in a specific setup.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index e4e2e02b7380..bee034a95208 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -94,13 +94,13 @@ config BATMAN_ADV_DEBUGFS
 	bool "batman-adv debugfs entries"
 	depends on BATMAN_ADV
 	depends on DEBUG_FS
-	default y
+	default n
 	help
 	  Enable this to export routing related debug tables via debugfs.
 	  The information for each soft-interface and used hard-interface can be
 	  found under batman_adv/
 
-	  If unsure, say Y.
+	  If unsure, say N.
 
 config BATMAN_ADV_DEBUG
 	bool "B.A.T.M.A.N. debugging"
-- 
cgit v1.2.3-59-g8ed1b


From f26e4e98b15322522bc6221325bf14cbead65d1b Mon Sep 17 00:00:00 2001
From: Linus Lüssing <linus.luessing@c0d3.blue>
Date: Sun, 25 Mar 2018 00:32:04 +0100
Subject: batman-adv: Avoid old nodes disabling multicast optimizations
 completely
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of disabling multicast optimizations mesh-wide once a node with
no multicast optimizations capabilities joins the mesh, do the
following:

Just insert such nodes into the WANT_ALL_IPV4/IPV6 lists. This is
sufficient to avoid multicast packet loss to such unsupportive nodes.

Signed-off-by: Linus Lüssing <linus.luessing@c0d3.blue>
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/multicast.c      | 29 ++++++-----------------------
 net/batman-adv/soft-interface.c |  1 -
 net/batman-adv/types.h          |  3 ---
 3 files changed, 6 insertions(+), 27 deletions(-)

diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index a11d3d89f012..36fd7b06c7cc 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -815,9 +815,6 @@ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv,
 	if (!atomic_read(&bat_priv->multicast_mode))
 		return -EINVAL;
 
-	if (atomic_read(&bat_priv->mcast.num_disabled))
-		return -EINVAL;
-
 	switch (ntohs(ethhdr->h_proto)) {
 	case ETH_P_IP:
 		return batadv_mcast_forw_mode_check_ipv4(bat_priv, skb,
@@ -1183,33 +1180,23 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv,
 {
 	bool orig_mcast_enabled = !(flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
 	u8 mcast_flags = BATADV_NO_FLAGS;
-	bool orig_initialized;
 
 	if (orig_mcast_enabled && tvlv_value &&
 	    tvlv_value_len >= sizeof(mcast_flags))
 		mcast_flags = *(u8 *)tvlv_value;
 
+	if (!orig_mcast_enabled) {
+		mcast_flags |= BATADV_MCAST_WANT_ALL_IPV4;
+		mcast_flags |= BATADV_MCAST_WANT_ALL_IPV6;
+	}
+
 	spin_lock_bh(&orig->mcast_handler_lock);
-	orig_initialized = test_bit(BATADV_ORIG_CAPA_HAS_MCAST,
-				    &orig->capa_initialized);
 
-	/* If mcast support is turned on decrease the disabled mcast node
-	 * counter only if we had increased it for this node before. If this
-	 * is a completely new orig_node no need to decrease the counter.
-	 */
 	if (orig_mcast_enabled &&
 	    !test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities)) {
-		if (orig_initialized)
-			atomic_dec(&bat_priv->mcast.num_disabled);
 		set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities);
-	/* If mcast support is being switched off or if this is an initial
-	 * OGM without mcast support then increase the disabled mcast
-	 * node counter.
-	 */
 	} else if (!orig_mcast_enabled &&
-		   (test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities) ||
-		    !orig_initialized)) {
-		atomic_inc(&bat_priv->mcast.num_disabled);
+		   test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities)) {
 		clear_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities);
 	}
 
@@ -1595,10 +1582,6 @@ void batadv_mcast_purge_orig(struct batadv_orig_node *orig)
 
 	spin_lock_bh(&orig->mcast_handler_lock);
 
-	if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities) &&
-	    test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capa_initialized))
-		atomic_dec(&bat_priv->mcast.num_disabled);
-
 	batadv_mcast_want_unsnoop_update(bat_priv, orig, BATADV_NO_FLAGS);
 	batadv_mcast_want_ipv4_update(bat_priv, orig, BATADV_NO_FLAGS);
 	batadv_mcast_want_ipv6_update(bat_priv, orig, BATADV_NO_FLAGS);
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index edeffcb9f3a2..67065e35de51 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -796,7 +796,6 @@ static int batadv_softif_init_late(struct net_device *dev)
 	bat_priv->mcast.querier_ipv6.shadowing = false;
 	bat_priv->mcast.flags = BATADV_NO_FLAGS;
 	atomic_set(&bat_priv->multicast_mode, 1);
-	atomic_set(&bat_priv->mcast.num_disabled, 0);
 	atomic_set(&bat_priv->mcast.num_want_all_unsnoopables, 0);
 	atomic_set(&bat_priv->mcast.num_want_all_ipv4, 0);
 	atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0);
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 476b052ad982..0174f79e955a 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1212,9 +1212,6 @@ struct batadv_priv_mcast {
 	/** @bridged: whether the soft interface has a bridge on top */
 	bool bridged;
 
-	/** @num_disabled: number of nodes that have no mcast tvlv */
-	atomic_t num_disabled;
-
 	/**
 	 * @num_want_all_unsnoopables: number of nodes wanting unsnoopable IP
 	 *  traffic
-- 
cgit v1.2.3-59-g8ed1b


From 6486379d8716ea6e68576b6d81d85d9a60f71a87 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Thu, 19 Apr 2018 21:01:07 +0200
Subject: batman-adv: Avoid bool in structures

Using the bool type for structure member is considered inappropriate [1]
for the kernel. Its size is not well defined (but usually 1 byte but maybe
also 4 byte).

[1] https://lkml.org/lkml/2017/11/21/384

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/types.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 0174f79e955a..3b3fc146b30d 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1160,13 +1160,13 @@ struct batadv_priv_dat {
  */
 struct batadv_mcast_querier_state {
 	/** @exists: whether a querier exists in the mesh */
-	bool exists;
+	unsigned char exists:1;
 
 	/**
 	 * @shadowing: if a querier exists, whether it is potentially shadowing
 	 *  multicast listeners (i.e. querier is behind our own bridge segment)
 	 */
-	bool shadowing;
+	unsigned char shadowing:1;
 };
 
 /**
@@ -1207,10 +1207,10 @@ struct batadv_priv_mcast {
 	u8 flags;
 
 	/** @enabled: whether the multicast tvlv is currently enabled */
-	bool enabled;
+	unsigned char enabled:1;
 
 	/** @bridged: whether the soft interface has a bridge on top */
-	bool bridged;
+	unsigned char bridged:1;
 
 	/**
 	 * @num_want_all_unsnoopables: number of nodes wanting unsnoopable IP
@@ -1389,7 +1389,7 @@ struct batadv_tp_vars {
 	atomic_t dup_acks;
 
 	/** @fast_recovery: true if in Fast Recovery mode */
-	bool fast_recovery;
+	unsigned char fast_recovery:1;
 
 	/** @recover: last sent seqno when entering Fast Recovery */
 	u32 recover;
@@ -2046,10 +2046,10 @@ struct batadv_skb_cb {
 	 * @decoded: Marks a skb as decoded, which is checked when searching for
 	 *  coding opportunities in network-coding.c
 	 */
-	bool decoded;
+	unsigned char decoded:1;
 
 	/** @num_bcasts: Counter for broadcast packet retransmissions */
-	unsigned int num_bcasts;
+	unsigned char num_bcasts;
 };
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From dc06338df9a8443dcff6c7be03ab792ea682d9ab Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Thu, 19 Apr 2018 21:01:08 +0200
Subject: batman-adv: Remove unused dentry without DEBUGFS

The debug_dir variable in the main structures is only accessed when
CONFIG_BATMAN_ADV_DEBUGFS is enabled. It can be dropped to potentially save
some bytes of memory.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/types.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 3b3fc146b30d..360357f83f20 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -215,10 +215,12 @@ struct batadv_hard_iface {
 	struct batadv_hard_iface_bat_v bat_v;
 #endif
 
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
 	/**
 	 * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs
 	 */
 	struct dentry *debug_dir;
+#endif
 
 	/**
 	 * @neigh_list: list of unique single hop neighbors via this interface
@@ -1242,10 +1244,12 @@ struct batadv_priv_nc {
 	/** @work: work queue callback item for cleanup */
 	struct delayed_work work;
 
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
 	/**
 	 * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs
 	 */
 	struct dentry *debug_dir;
+#endif
 
 	/**
 	 * @min_tq: only consider neighbors for encoding if neigh_tq > min_tq
@@ -1598,8 +1602,10 @@ struct batadv_priv {
 	/** @mesh_obj: kobject for sysfs mesh subdirectory */
 	struct kobject *mesh_obj;
 
+#ifdef CONFIG_BATMAN_ADV_DEBUGFS
 	/** @debug_dir: dentry for debugfs batman-adv subdirectory */
 	struct dentry *debug_dir;
+#endif
 
 	/** @forw_bat_list: list of aggregated OGMs that will be forwarded */
 	struct hlist_head forw_bat_list;
-- 
cgit v1.2.3-59-g8ed1b


From 48d7a07ba355c7056e66adcdd35379d12e48252b Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Fri, 20 Apr 2018 08:48:47 -0700
Subject: hv_netvsc: select needed ucs2_string routine

The conversion of rndis friendly name to utf8 uses a standard
kernel routine which is optional in config. Therefore build
would fail for some configurations. Resolve by selecting needed
library.

Fixes: 0fe554a46a0f ("hv_netvsc: propogate Hyper-V friendly name into interface alias")
Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/hyperv/Kconfig b/drivers/net/hyperv/Kconfig
index 936968d23559..0765d5f61714 100644
--- a/drivers/net/hyperv/Kconfig
+++ b/drivers/net/hyperv/Kconfig
@@ -1,5 +1,6 @@
 config HYPERV_NET
 	tristate "Microsoft Hyper-V virtual network driver"
 	depends on HYPERV
+	select UCS2_STRING
 	help
 	  Select this option to enable the Hyper-V virtual network driver.
-- 
cgit v1.2.3-59-g8ed1b


From aaa6452763cf944da72f16e0736554accb364703 Mon Sep 17 00:00:00 2001
From: "Nikita V. Shirokov" <tehnerd@tehnerd.com>
Date: Sun, 22 Apr 2018 21:16:48 -0700
Subject: bpf: fix virtio-net's length calc for XDP_PASS

In commit 6870de435b90 ("bpf: make virtio compatible w/
bpf_xdp_adjust_tail") i didn't account for vi->hdr_len during new
packet's length calculation after bpf_prog_run in receive_mergeable.
because of this all packets, if they were passed to the kernel,
were truncated by 12 bytes.

Fixes:6870de435b90 ("bpf: make virtio compatible w/ bpf_xdp_adjust_tail")
Reported-by: David Ahern <dsahern@gmail.com>

Signed-off-by: Nikita V. Shirokov <tehnerd@tehnerd.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/virtio_net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 3b5991734118..f34794a76c4d 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -765,7 +765,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			/* recalculate len if xdp.data or xdp.data_end were
 			 * adjusted
 			 */
-			len = xdp.data_end - xdp.data;
+			len = xdp.data_end - xdp.data + vi->hdr_len;
 			/* We can only create skb based on xdp_page. */
 			if (unlikely(xdp_page != page)) {
 				rcu_read_unlock();
-- 
cgit v1.2.3-59-g8ed1b


From fbcf93ebcaef7d09881ee308b52cd84f5e43c622 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Sat, 21 Apr 2018 09:48:23 -0700
Subject: bpf: btf: Clean up btf.h in uapi

This patch cleans up btf.h in uapi:
1) Rename "name" to "name_off" to better reflect it is an offset to the
   string section instead of a char array.
2) Remove unused value BTF_FLAGS_COMPR and BTF_MAGIC_SWAP

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/btf.h       |  8 +++-----
 kernel/bpf/btf.c               | 20 ++++++++++----------
 tools/include/uapi/linux/btf.h |  8 +++-----
 tools/lib/bpf/btf.c            |  2 +-
 4 files changed, 17 insertions(+), 21 deletions(-)

diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h
index 74a30b1090df..bcb56ee47014 100644
--- a/include/uapi/linux/btf.h
+++ b/include/uapi/linux/btf.h
@@ -6,9 +6,7 @@
 #include <linux/types.h>
 
 #define BTF_MAGIC	0xeB9F
-#define BTF_MAGIC_SWAP	0x9FeB
 #define BTF_VERSION	1
-#define BTF_FLAGS_COMPR	0x01
 
 struct btf_header {
 	__u16	magic;
@@ -43,7 +41,7 @@ struct btf_header {
 #define BTF_STR_OFFSET(ref)	((ref) & BTF_MAX_NAME_OFFSET)
 
 struct btf_type {
-	__u32 name;
+	__u32 name_off;
 	/* "info" bits arrangement
 	 * bits  0-15: vlen (e.g. # of struct's members)
 	 * bits 16-23: unused
@@ -105,7 +103,7 @@ struct btf_type {
  * info in "struct btf_type").
  */
 struct btf_enum {
-	__u32	name;
+	__u32	name_off;
 	__s32	val;
 };
 
@@ -122,7 +120,7 @@ struct btf_array {
  * "struct btf_type").
  */
 struct btf_member {
-	__u32	name;
+	__u32	name_off;
 	__u32	type;
 	__u32	offset;	/* offset in bits */
 };
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index eb56ac760547..22e1046a1a86 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -473,7 +473,7 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
 	__btf_verifier_log(log, "[%u] %s %s%s",
 			   env->log_type_id,
 			   btf_kind_str[kind],
-			   btf_name_by_offset(btf, t->name),
+			   btf_name_by_offset(btf, t->name_off),
 			   log_details ? " " : "");
 
 	if (log_details)
@@ -517,7 +517,7 @@ static void btf_verifier_log_member(struct btf_verifier_env *env,
 		btf_verifier_log_type(env, struct_type, NULL);
 
 	__btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u",
-			   btf_name_by_offset(btf, member->name),
+			   btf_name_by_offset(btf, member->name_off),
 			   member->type, member->offset);
 
 	if (fmt && *fmt) {
@@ -1419,10 +1419,10 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
 	btf_verifier_log_type(env, t, NULL);
 
 	for_each_member(i, t, member) {
-		if (!btf_name_offset_valid(btf, member->name)) {
+		if (!btf_name_offset_valid(btf, member->name_off)) {
 			btf_verifier_log_member(env, t, member,
 						"Invalid member name_offset:%u",
-						member->name);
+						member->name_off);
 			return -EINVAL;
 		}
 
@@ -1605,14 +1605,14 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
 	btf_verifier_log_type(env, t, NULL);
 
 	for (i = 0; i < nr_enums; i++) {
-		if (!btf_name_offset_valid(btf, enums[i].name)) {
+		if (!btf_name_offset_valid(btf, enums[i].name_off)) {
 			btf_verifier_log(env, "\tInvalid name_offset:%u",
-					 enums[i].name);
+					 enums[i].name_off);
 			return -EINVAL;
 		}
 
 		btf_verifier_log(env, "\t%s val=%d\n",
-				 btf_name_by_offset(btf, enums[i].name),
+				 btf_name_by_offset(btf, enums[i].name_off),
 				 enums[i].val);
 	}
 
@@ -1636,7 +1636,7 @@ static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t,
 	for (i = 0; i < nr_enums; i++) {
 		if (v == enums[i].val) {
 			seq_printf(m, "%s",
-				   btf_name_by_offset(btf, enums[i].name));
+				   btf_name_by_offset(btf, enums[i].name_off));
 			return;
 		}
 	}
@@ -1687,9 +1687,9 @@ static s32 btf_check_meta(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
-	if (!btf_name_offset_valid(env->btf, t->name)) {
+	if (!btf_name_offset_valid(env->btf, t->name_off)) {
 		btf_verifier_log(env, "[%u] Invalid name_offset:%u",
-				 env->log_type_id, t->name);
+				 env->log_type_id, t->name_off);
 		return -EINVAL;
 	}
 
diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h
index 74a30b1090df..bcb56ee47014 100644
--- a/tools/include/uapi/linux/btf.h
+++ b/tools/include/uapi/linux/btf.h
@@ -6,9 +6,7 @@
 #include <linux/types.h>
 
 #define BTF_MAGIC	0xeB9F
-#define BTF_MAGIC_SWAP	0x9FeB
 #define BTF_VERSION	1
-#define BTF_FLAGS_COMPR	0x01
 
 struct btf_header {
 	__u16	magic;
@@ -43,7 +41,7 @@ struct btf_header {
 #define BTF_STR_OFFSET(ref)	((ref) & BTF_MAX_NAME_OFFSET)
 
 struct btf_type {
-	__u32 name;
+	__u32 name_off;
 	/* "info" bits arrangement
 	 * bits  0-15: vlen (e.g. # of struct's members)
 	 * bits 16-23: unused
@@ -105,7 +103,7 @@ struct btf_type {
  * info in "struct btf_type").
  */
 struct btf_enum {
-	__u32	name;
+	__u32	name_off;
 	__s32	val;
 };
 
@@ -122,7 +120,7 @@ struct btf_array {
  * "struct btf_type").
  */
 struct btf_member {
-	__u32	name;
+	__u32	name_off;
 	__u32	type;
 	__u32	offset;	/* offset in bits */
 };
diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 58b6255abc7a..2bac710e3194 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -281,7 +281,7 @@ int32_t btf__find_by_name(const struct btf *btf, const char *type_name)
 
 	for (i = 1; i <= btf->nr_types; i++) {
 		const struct btf_type *t = btf->types[i];
-		const char *name = btf_name_by_offset(btf, t->name);
+		const char *name = btf_name_by_offset(btf, t->name_off);
 
 		if (name && !strcmp(type_name, name))
 			return i;
-- 
cgit v1.2.3-59-g8ed1b


From 6163849d289be6ff2acd2fb520da303dec3219f0 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Fri, 20 Apr 2018 23:18:26 +0800
Subject: net: introduce a new tracepoint for tcp_rcv_space_adjust

tcp_rcv_space_adjust is called every time data is copied to user space,
introducing a tcp tracepoint for which could show us when the packet is
copied to user.

When a tcp packet arrives, tcp_rcv_established() will be called and with
the existed tracepoint tcp_probe we could get the time when this packet
arrives.
Then this packet will be copied to user, and tcp_rcv_space_adjust will
be called and with this new introduced tracepoint we could get the time
when this packet is copied to user.
With these two tracepoints, we could figure out whether the user program
processes this packet immediately or there's latency.

Hence in the printk message, sk_cookie is printed as a key to relate
tcp_rcv_space_adjust with tcp_probe.

Maybe we could export sockfd in this new tracepoint as well, then we
could relate this new tracepoint with epoll/read/recv* tracepoints, and
finally that could show us the whole lifespan of this packet. But we
could also implement that with pid as these functions are executed in
process context.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/tcp.h | 30 +++++++++++++++++++++---------
 net/ipv4/tcp_input.c       |  2 ++
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 3dd68029d77a..c1a5284713b8 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -10,6 +10,7 @@
 #include <linux/tracepoint.h>
 #include <net/ipv6.h>
 #include <net/tcp.h>
+#include <linux/sock_diag.h>
 
 #define TP_STORE_V4MAPPED(__entry, saddr, daddr)		\
 	do {							\
@@ -113,7 +114,7 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset,
  */
 DECLARE_EVENT_CLASS(tcp_event_sk,
 
-	TP_PROTO(const struct sock *sk),
+	TP_PROTO(struct sock *sk),
 
 	TP_ARGS(sk),
 
@@ -125,6 +126,7 @@ DECLARE_EVENT_CLASS(tcp_event_sk,
 		__array(__u8, daddr, 4)
 		__array(__u8, saddr_v6, 16)
 		__array(__u8, daddr_v6, 16)
+		__field(__u64, sock_cookie)
 	),
 
 	TP_fast_assign(
@@ -144,24 +146,34 @@ DECLARE_EVENT_CLASS(tcp_event_sk,
 
 		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
 			       sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
+
+		__entry->sock_cookie = sock_gen_cookie(sk);
 	),
 
-	TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c",
+	TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c sock_cookie=%llx",
 		  __entry->sport, __entry->dport,
 		  __entry->saddr, __entry->daddr,
-		  __entry->saddr_v6, __entry->daddr_v6)
+		  __entry->saddr_v6, __entry->daddr_v6,
+		  __entry->sock_cookie)
 );
 
 DEFINE_EVENT(tcp_event_sk, tcp_receive_reset,
 
-	TP_PROTO(const struct sock *sk),
+	TP_PROTO(struct sock *sk),
 
 	TP_ARGS(sk)
 );
 
 DEFINE_EVENT(tcp_event_sk, tcp_destroy_sock,
 
-	TP_PROTO(const struct sock *sk),
+	TP_PROTO(struct sock *sk),
+
+	TP_ARGS(sk)
+);
+
+DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust,
+
+	TP_PROTO(struct sock *sk),
 
 	TP_ARGS(sk)
 );
@@ -232,6 +244,7 @@ TRACE_EVENT(tcp_probe,
 		__field(__u32, snd_wnd)
 		__field(__u32, srtt)
 		__field(__u32, rcv_wnd)
+		__field(__u64, sock_cookie)
 	),
 
 	TP_fast_assign(
@@ -256,15 +269,14 @@ TRACE_EVENT(tcp_probe,
 		__entry->rcv_wnd = tp->rcv_wnd;
 		__entry->ssthresh = tcp_current_ssthresh(sk);
 		__entry->srtt = tp->srtt_us >> 3;
+		__entry->sock_cookie = sock_gen_cookie(sk);
 	),
 
-	TP_printk("src=%pISpc dest=%pISpc mark=%#x length=%d snd_nxt=%#x "
-		  "snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u "
-		  "rcv_wnd=%u",
+	TP_printk("src=%pISpc dest=%pISpc mark=%#x length=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx",
 		  __entry->saddr, __entry->daddr, __entry->mark,
 		  __entry->length, __entry->snd_nxt, __entry->snd_una,
 		  __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd,
-		  __entry->srtt, __entry->rcv_wnd)
+		  __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie)
 );
 
 #endif /* _TRACE_TCP_H */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0396fb919b5d..5a17cfc75326 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -582,6 +582,8 @@ void tcp_rcv_space_adjust(struct sock *sk)
 	u32 copied;
 	int time;
 
+	trace_tcp_rcv_space_adjust(sk);
+
 	tcp_mstamp_refresh(tp);
 	time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
 	if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
-- 
cgit v1.2.3-59-g8ed1b


From a56e6bcd34b55182159d16fcf6d6105578dfa192 Mon Sep 17 00:00:00 2001
From: Roman Mashak <mrv@mojatatu.com>
Date: Fri, 20 Apr 2018 23:56:51 -0400
Subject: tc-testing: updated ife test cases

Signed-off-by: Roman Mashak <mrv@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../selftests/tc-testing/tc-tests/actions/ife.json | 1036 +++++++++++++++++++-
 1 file changed, 1024 insertions(+), 12 deletions(-)

diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json b/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json
index 9f34f0753969..03777730ef29 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json
@@ -1,7 +1,7 @@
 [
     {
-        "id": "a568",
-        "name": "Add action with ife type",
+        "id": "7682",
+        "name": "Create valid ife encode action with mark and pass control",
         "category": [
             "actions",
             "ife"
@@ -12,21 +12,878 @@
                 0,
                 1,
                 255
-            ],
-            "$TC actions add action ife encode type 0xDEAD index 1"
+            ]
         ],
-        "cmdUnderTest": "$TC actions get action ife index 1",
+        "cmdUnderTest": "$TC actions add action ife encode allow mark pass index 2",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 2",
+        "matchPattern": "action order [0-9]*: ife encode action pass.*type 0xED3E.*allow mark.*index 2",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action skbedit"
+        ]
+    },
+    {
+        "id": "ef47",
+        "name": "Create valid ife encode action with mark and pipe control",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode use mark 10 pipe index 2",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 2",
+        "matchPattern": "action order [0-9]*: ife encode action pipe.*type 0xED3E.*use mark.*index 2",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action skbedit"
+        ]
+    },
+    {
+        "id": "df43",
+        "name": "Create valid ife encode action with mark and continue control",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode allow mark continue index 2",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 2",
+        "matchPattern": "action order [0-9]*: ife encode action continue.*type 0xED3E.*allow mark.*index 2",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action skbedit"
+        ]
+    },
+    {
+        "id": "e4cf",
+        "name": "Create valid ife encode action with mark and drop control",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode use mark 789 drop index 2",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 2",
+        "matchPattern": "action order [0-9]*: ife encode action drop.*type 0xED3E.*use mark 789.*index 2",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action skbedit"
+        ]
+    },
+    {
+        "id": "ccba",
+        "name": "Create valid ife encode action with mark and reclassify control",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode use mark 656768 reclassify index 2",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 2",
+        "matchPattern": "action order [0-9]*: ife encode action reclassify.*type 0xED3E.*use mark 656768.*index 2",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action skbedit"
+        ]
+    },
+    {
+        "id": "a1cf",
+        "name": "Create valid ife encode action with mark and jump control",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode use mark 65 jump 1 index 2",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 2",
+        "matchPattern": "action order [0-9]*: ife encode action jump 1.*type 0xED3E.*use mark 65.*index 2",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action skbedit"
+        ]
+    },
+    {
+        "id": "cb3d",
+        "name": "Create valid ife encode action with mark value at 32-bit maximum",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode use mark 4294967295 reclassify index 90",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 90",
+        "matchPattern": "action order [0-9]*: ife encode action reclassify.*type 0xED3E.*use mark 4294967295.*index 90",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action skbedit"
+        ]
+    },
+    {
+        "id": "1efb",
+        "name": "Create ife encode action with mark value exceeding 32-bit maximum",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode use mark 4294967295999 pipe index 90",
+        "expExitCode": "255",
+        "verifyCmd": "$TC actions get action ife index 90",
+        "matchPattern": "action order [0-9]*: ife encode action pipe.*type 0xED3E.*use mark 4294967295999.*index 90",
+        "matchCount": "0",
+        "teardown": []
+    },
+    {
+        "id": "95ed",
+        "name": "Create valid ife encode action with prio and pass control",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode allow prio pass index 9",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 9",
+        "matchPattern": "action order [0-9]*: ife encode action pass.*type 0xED3E.*allow prio.*index 9",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action skbedit"
+        ]
+    },
+    {
+        "id": "aa17",
+        "name": "Create valid ife encode action with prio and pipe control",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode use prio 7 pipe index 9",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 9",
+        "matchPattern": "action order [0-9]*: ife encode action pipe.*type 0xED3E.*use prio 7.*index 9",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action skbedit"
+        ]
+    },
+    {
+        "id": "74c7",
+        "name": "Create valid ife encode action with prio and continue control",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode use prio 3 continue index 9",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 9",
+        "matchPattern": "action order [0-9]*: ife encode action continue.*type 0xED3E.*use prio 3.*index 9",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action skbedit"
+        ]
+    },
+    {
+        "id": "7a97",
+        "name": "Create valid ife encode action with prio and drop control",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode allow prio drop index 9",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 9",
+        "matchPattern": "action order [0-9]*: ife encode action drop.*type 0xED3E.*allow prio.*index 9",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action skbedit"
+        ]
+    },
+    {
+        "id": "f66b",
+        "name": "Create valid ife encode action with prio and reclassify control",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode use prio 998877 reclassify index 9",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 9",
+        "matchPattern": "action order [0-9]*: ife encode action reclassify.*type 0xED3E.*use prio 998877.*index 9",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action skbedit"
+        ]
+    },
+    {
+        "id": "3056",
+        "name": "Create valid ife encode action with prio and jump control",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode use prio 998877 jump 10 index 9",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 9",
+        "matchPattern": "action order [0-9]*: ife encode action jump 10.*type 0xED3E.*use prio 998877.*index 9",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action skbedit"
+        ]
+    },
+    {
+        "id": "7dd3",
+        "name": "Create valid ife encode action with prio value at 32-bit maximum",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode use prio 4294967295 reclassify index 99",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 99",
+        "matchPattern": "action order [0-9]*: ife encode action reclassify.*type 0xED3E.*use prio 4294967295.*index 99",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action skbedit"
+        ]
+    },
+    {
+        "id": "2ca1",
+        "name": "Create ife encode action with prio value exceeding 32-bit maximum",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode use prio 4294967298 pipe index 99",
+        "expExitCode": "255",
+        "verifyCmd": "$TC actions get action ife index 99",
+        "matchPattern": "action order [0-9]*: ife encode action pipe.*type 0xED3E.*use prio 4294967298.*index 99",
+        "matchCount": "0",
+        "teardown": []
+    },
+    {
+        "id": "05bb",
+        "name": "Create valid ife encode action with tcindex and pass control",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode allow tcindex pass index 1",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 1",
+        "matchPattern": "action order [0-9]*: ife encode action pass.*type 0xED3E.*allow tcindex.*index 1",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action ife"
+        ]
+    },
+    {
+        "id": "ce65",
+        "name": "Create valid ife encode action with tcindex and pipe control",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode use tcindex 111 pipe index 1",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 1",
+        "matchPattern": "action order [0-9]*: ife encode action pipe.*type 0xED3E.*use tcindex 111.*index 1",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action ife"
+        ]
+    },
+    {
+        "id": "09cd",
+        "name": "Create valid ife encode action with tcindex and continue control",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode use tcindex 1 continue index 1",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 1",
+        "matchPattern": "action order [0-9]*: ife encode action continue.*type 0xED3E.*use tcindex 1.*index 1",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action ife"
+        ]
+    },
+    {
+        "id": "8eb5",
+        "name": "Create valid ife encode action with tcindex and continue control",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode use tcindex 1 continue index 1",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 1",
+        "matchPattern": "action order [0-9]*: ife encode action continue.*type 0xED3E.*use tcindex 1.*index 1",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action ife"
+        ]
+    },
+    {
+        "id": "451a",
+        "name": "Create valid ife encode action with tcindex and drop control",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode allow tcindex drop index 77",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 77",
+        "matchPattern": "action order [0-9]*: ife encode action drop.*type 0xED3E.*allow tcindex.*index 77",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action ife"
+        ]
+    },
+    {
+        "id": "d76c",
+        "name": "Create valid ife encode action with tcindex and reclassify control",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode allow tcindex reclassify index 77",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 77",
+        "matchPattern": "action order [0-9]*: ife encode action reclassify.*type 0xED3E.*allow tcindex.*index 77",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action ife"
+        ]
+    },
+    {
+        "id": "e731",
+        "name": "Create valid ife encode action with tcindex and jump control",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode allow tcindex jump 999 index 77",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 77",
+        "matchPattern": "action order [0-9]*: ife encode action jump 999.*type 0xED3E.*allow tcindex.*index 77",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action ife"
+        ]
+    },
+    {
+        "id": "b7b8",
+        "name": "Create valid ife encode action with tcindex value at 16-bit maximum",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode use tcindex 65535 pass index 1",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 1",
+        "matchPattern": "action order [0-9]*: ife encode action pass.*type 0xED3E.*use tcindex 65535.*index 1",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action skbedit"
+        ]
+    },
+    {
+        "id": "d0d8",
+        "name": "Create ife encode action with tcindex value exceeding 16-bit maximum",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode use tcindex 65539 pipe index 1",
+        "expExitCode": "255",
+        "verifyCmd": "$TC actions get action ife index 1",
+        "matchPattern": "action order [0-9]*: ife encode action pipe.*type 0xED3E.*use tcindex 65539.*index 1",
+        "matchCount": "0",
+        "teardown": []
+    },
+    {
+        "id": "2a9c",
+        "name": "Create valid ife encode action with mac src parameter",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode allow mark src 00:11:22:33:44:55 pipe index 1",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 1",
+        "matchPattern": "action order [0-9]*: ife encode action pipe.*type 0xED3E.*allow mark src 00:11:22:33:44:55.*index 1",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action ife"
+        ]
+    },
+    {
+        "id": "cf5c",
+        "name": "Create valid ife encode action with mac dst parameter",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode use prio 9876 dst 00:11:22:33:44:55 reclassify index 1",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 1",
+        "matchPattern": "action order [0-9]*: ife encode action reclassify.*type 0xED3E.*use prio 9876 dst 00:11:22:33:44:55.*index 1",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action ife"
+        ]
+    },
+    {
+        "id": "2353",
+        "name": "Create valid ife encode action with mac src and mac dst parameters",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode allow tcindex src 00:aa:bb:cc:dd:ee dst 00:11:22:33:44:55 pass index 11",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 11",
+        "matchPattern": "action order [0-9]*: ife encode action pass.*type 0xED3E.*allow tcindex dst 00:11:22:33:44:55 src 00:aa:bb:cc:dd:ee .*index 11",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action ife"
+        ]
+    },
+    {
+        "id": "552c",
+        "name": "Create valid ife encode action with mark and type parameters",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode use mark 7 type 0xfefe pass index 1",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 1",
+        "matchPattern": "action order [0-9]*: ife encode action pass.*type 0xFEFE.*use mark 7.*index 1",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action ife"
+        ]
+    },
+    {
+        "id": "0421",
+        "name": "Create valid ife encode action with prio and type parameters",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode use prio 444 type 0xabba pipe index 21",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 21",
+        "matchPattern": "action order [0-9]*: ife encode action pipe.*type 0xABBA.*use prio 444.*index 21",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action ife"
+        ]
+    },
+    {
+        "id": "4017",
+        "name": "Create valid ife encode action with tcindex and type parameters",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode use tcindex 5000 type 0xabcd reclassify index 21",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 21",
+        "matchPattern": "action order [0-9]*: ife encode action reclassify.*type 0xABCD.*use tcindex 5000.*index 21",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action ife"
+        ]
+    },
+    {
+        "id": "fac3",
+        "name": "Create valid ife encode action with index at 32-bit maximnum",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode allow mark pass index 4294967295",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 4294967295",
+        "matchPattern": "action order [0-9]*: ife encode action pass.*type 0xED3E.*allow mark.*index 4294967295",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action ife"
+        ]
+    },
+    {
+        "id": "7c25",
+        "name": "Create valid ife decode action with pass control",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife decode pass index 1",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 1",
+        "matchPattern": "action order [0-9]*: ife decode action pass.*type 0x0.*allow mark allow tcindex allow prio.*index 1",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action ife"
+        ]
+    },
+    {
+        "id": "dccb",
+        "name": "Create valid ife decode action with pipe control",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife decode pipe index 1",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 1",
+        "matchPattern": "action order [0-9]*: ife decode action pipe.*type 0x0.*allow mark allow tcindex allow prio.*index 1",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action ife"
+        ]
+    },
+    {
+        "id": "7bb9",
+        "name": "Create valid ife decode action with continue control",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife decode continue index 1",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 1",
+        "matchPattern": "action order [0-9]*: ife decode action continue.*type 0x0.*allow mark allow tcindex allow prio.*index 1",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action ife"
+        ]
+    },
+    {
+        "id": "d9ad",
+        "name": "Create valid ife decode action with drop control",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife decode drop index 1",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 1",
+        "matchPattern": "action order [0-9]*: ife decode action drop.*type 0x0.*allow mark allow tcindex allow prio.*index 1",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action ife"
+        ]
+    },
+    {
+        "id": "219f",
+        "name": "Create valid ife decode action with reclassify control",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife decode reclassify index 1",
         "expExitCode": "0",
         "verifyCmd": "$TC actions get action ife index 1",
-        "matchPattern": "type 0xDEAD",
+        "matchPattern": "action order [0-9]*: ife decode action reclassify.*type 0x0.*allow mark allow tcindex allow prio.*index 1",
         "matchCount": "1",
         "teardown": [
             "$TC actions flush action ife"
         ]
     },
     {
-        "id": "b983",
-        "name": "Add action without ife type",
+        "id": "8f44",
+        "name": "Create valid ife decode action with jump control",
         "category": [
             "actions",
             "ife"
@@ -37,16 +894,171 @@
                 0,
                 1,
                 255
-            ],
-            "$TC actions add action ife encode index 1"
+            ]
         ],
-        "cmdUnderTest": "$TC actions get action ife index 1",
+        "cmdUnderTest": "$TC actions add action ife decode jump 10 index 1",
         "expExitCode": "0",
         "verifyCmd": "$TC actions get action ife index 1",
-        "matchPattern": "type 0xED3E",
+        "matchPattern": "action order [0-9]*: ife decode action jump 10.*type 0x0.*allow mark allow tcindex allow prio.*index 1",
         "matchCount": "1",
         "teardown": [
             "$TC actions flush action ife"
         ]
+    },
+    {
+        "id": "56cf",
+        "name": "Create ife encode action with index exceeding 32-bit maximum",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode allow mark pass index 4294967295999",
+        "expExitCode": "255",
+        "verifyCmd": "$TC actions get action ife index 4294967295999",
+        "matchPattern": "action order [0-9]*: ife encode action pass.*type 0xED3E.*allow mark.*index 4294967295999",
+        "matchCount": "0",
+        "teardown": []
+    },
+    {
+        "id": "ee94",
+        "name": "Create ife encode action with invalid control",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode allow mark kuka index 4",
+        "expExitCode": "255",
+        "verifyCmd": "$TC actions get action ife index 4",
+        "matchPattern": "action order [0-9]*: ife encode action kuka.*type 0xED3E.*allow mark.*index 4",
+        "matchCount": "0",
+        "teardown": []
+    },
+    {
+        "id": "b330",
+        "name": "Create ife encode action with cookie",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode allow prio pipe index 4 cookie aabbccddeeff112233445566778800a1",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action ife index 4",
+        "matchPattern": "action order [0-9]*: ife encode action pipe.*type 0xED3E.*allow prio.*index 4.*cookie aabbccddeeff112233445566778800a1",
+        "matchCount": "1",
+        "teardown": [
+           "$TC actions flush action ife"
+        ]
+    },
+    {
+        "id": "bbc0",
+        "name": "Create ife encode action with invalid argument",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode allow foo pipe index 4",
+        "expExitCode": "255",
+        "verifyCmd": "$TC actions get action ife index 4",
+        "matchPattern": "action order [0-9]*: ife encode action pipe.*type 0xED3E.*allow foo.*index 4",
+        "matchCount": "0",
+        "teardown": []
+    },
+    {
+        "id": "d54a",
+        "name": "Create ife encode action with invalid type argument",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode allow prio type 70000 pipe index 4",
+        "expExitCode": "255",
+        "verifyCmd": "$TC actions get action ife index 4",
+        "matchPattern": "action order [0-9]*: ife encode action pipe.*type 0x11170.*allow prio.*index 4",
+        "matchCount": "0",
+        "teardown": []
+    },
+    {
+        "id": "7ee0",
+        "name": "Create ife encode action with invalid mac src argument",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode allow prio src 00:11:22:33:44:pp pipe index 4",
+        "expExitCode": "255",
+        "verifyCmd": "$TC actions get action ife index 4",
+        "matchPattern": "action order [0-9]*: ife encode action pipe.*allow prio.*index 4",
+        "matchCount": "0",
+        "teardown": []
+    },
+    {
+        "id": "0a7d",
+        "name": "Create ife encode action with invalid mac dst argument",
+        "category": [
+            "actions",
+            "ife"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action ife",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action ife encode allow prio dst 00.111-22:33:44:aa pipe index 4",
+        "expExitCode": "255",
+        "verifyCmd": "$TC actions get action ife index 4",
+        "matchPattern": "action order [0-9]*: ife encode action pipe.*allow prio.*index 4",
+        "matchCount": "0",
+        "teardown": []
     }
 ]
-- 
cgit v1.2.3-59-g8ed1b


From f9d4b0c1e9695e3de7af3768205bacc27312320c Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Sat, 21 Apr 2018 09:41:30 -0700
Subject: fib_rules: move common handling of newrule delrule msgs into
 fib_nl2rule

This reduces code duplication in the fib rule add and del paths.
Get rid of validate_rulemsg. This became obvious when adding duplicate
extack support in fib newrule/delrule error paths.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/fib_rules.c | 436 +++++++++++++++++++++++----------------------------
 1 file changed, 192 insertions(+), 244 deletions(-)

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 33958f84c173..67801104e9a8 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -387,206 +387,185 @@ unsigned int fib_rules_seq_read(struct net *net, int family)
 }
 EXPORT_SYMBOL_GPL(fib_rules_seq_read);
 
-static int validate_rulemsg(struct fib_rule_hdr *frh, struct nlattr **tb,
-			    struct fib_rules_ops *ops)
-{
-	int err = -EINVAL;
-
-	if (frh->src_len)
-		if (tb[FRA_SRC] == NULL ||
-		    frh->src_len > (ops->addr_size * 8) ||
-		    nla_len(tb[FRA_SRC]) != ops->addr_size)
-			goto errout;
-
-	if (frh->dst_len)
-		if (tb[FRA_DST] == NULL ||
-		    frh->dst_len > (ops->addr_size * 8) ||
-		    nla_len(tb[FRA_DST]) != ops->addr_size)
-			goto errout;
-
-	err = 0;
-errout:
-	return err;
-}
-
-static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
-		       struct nlattr **tb, struct fib_rule *rule)
+static struct fib_rule *rule_find(struct fib_rules_ops *ops,
+				  struct fib_rule_hdr *frh,
+				  struct nlattr **tb,
+				  struct fib_rule *rule,
+				  bool user_priority)
 {
 	struct fib_rule *r;
 
 	list_for_each_entry(r, &ops->rules_list, list) {
-		if (r->action != rule->action)
+		if (rule->action && r->action != rule->action)
 			continue;
 
-		if (r->table != rule->table)
+		if (rule->table && r->table != rule->table)
 			continue;
 
-		if (r->pref != rule->pref)
+		if (user_priority && r->pref != rule->pref)
 			continue;
 
-		if (memcmp(r->iifname, rule->iifname, IFNAMSIZ))
+		if (rule->iifname[0] &&
+		    memcmp(r->iifname, rule->iifname, IFNAMSIZ))
 			continue;
 
-		if (memcmp(r->oifname, rule->oifname, IFNAMSIZ))
+		if (rule->oifname[0] &&
+		    memcmp(r->oifname, rule->oifname, IFNAMSIZ))
 			continue;
 
-		if (r->mark != rule->mark)
+		if (rule->mark && r->mark != rule->mark)
 			continue;
 
-		if (r->mark_mask != rule->mark_mask)
+		if (rule->mark_mask && r->mark_mask != rule->mark_mask)
 			continue;
 
-		if (r->tun_id != rule->tun_id)
+		if (rule->tun_id && r->tun_id != rule->tun_id)
 			continue;
 
 		if (r->fr_net != rule->fr_net)
 			continue;
 
-		if (r->l3mdev != rule->l3mdev)
+		if (rule->l3mdev && r->l3mdev != rule->l3mdev)
 			continue;
 
-		if (!uid_eq(r->uid_range.start, rule->uid_range.start) ||
-		    !uid_eq(r->uid_range.end, rule->uid_range.end))
+		if (uid_range_set(&rule->uid_range) &&
+		    (!uid_eq(r->uid_range.start, rule->uid_range.start) ||
+		    !uid_eq(r->uid_range.end, rule->uid_range.end)))
 			continue;
 
-		if (r->ip_proto != rule->ip_proto)
+		if (rule->ip_proto && r->ip_proto != rule->ip_proto)
 			continue;
 
-		if (!fib_rule_port_range_compare(&r->sport_range,
+		if (fib_rule_port_range_set(&rule->sport_range) &&
+		    !fib_rule_port_range_compare(&r->sport_range,
 						 &rule->sport_range))
 			continue;
 
-		if (!fib_rule_port_range_compare(&r->dport_range,
+		if (fib_rule_port_range_set(&rule->dport_range) &&
+		    !fib_rule_port_range_compare(&r->dport_range,
 						 &rule->dport_range))
 			continue;
 
 		if (!ops->compare(r, frh, tb))
 			continue;
-		return 1;
+		return r;
 	}
-	return 0;
+
+	return NULL;
 }
 
-int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
-		   struct netlink_ext_ack *extack)
+static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
+		       struct netlink_ext_ack *extack,
+		       struct fib_rules_ops *ops,
+		       struct nlattr *tb[],
+		       struct fib_rule **rule,
+		       bool *user_priority)
 {
 	struct net *net = sock_net(skb->sk);
 	struct fib_rule_hdr *frh = nlmsg_data(nlh);
-	struct fib_rules_ops *ops = NULL;
-	struct fib_rule *rule, *r, *last = NULL;
-	struct nlattr *tb[FRA_MAX+1];
-	int err = -EINVAL, unresolved = 0;
-
-	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
-		goto errout;
-
-	ops = lookup_rules_ops(net, frh->family);
-	if (ops == NULL) {
-		err = -EAFNOSUPPORT;
-		goto errout;
-	}
+	struct fib_rule *nlrule = NULL;
+	int err = -EINVAL;
 
-	err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
-	if (err < 0)
-		goto errout;
+	if (frh->src_len)
+		if (!tb[FRA_SRC] ||
+		    frh->src_len > (ops->addr_size * 8) ||
+		    nla_len(tb[FRA_SRC]) != ops->addr_size)
+			goto errout;
 
-	err = validate_rulemsg(frh, tb, ops);
-	if (err < 0)
-		goto errout;
+	if (frh->dst_len)
+		if (!tb[FRA_DST] ||
+		    frh->dst_len > (ops->addr_size * 8) ||
+		    nla_len(tb[FRA_DST]) != ops->addr_size)
+			goto errout;
 
-	rule = kzalloc(ops->rule_size, GFP_KERNEL);
-	if (rule == NULL) {
+	nlrule = kzalloc(ops->rule_size, GFP_KERNEL);
+	if (!nlrule) {
 		err = -ENOMEM;
 		goto errout;
 	}
-	refcount_set(&rule->refcnt, 1);
-	rule->fr_net = net;
+	refcount_set(&nlrule->refcnt, 1);
+	nlrule->fr_net = net;
 
-	rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY])
-	                              : fib_default_rule_pref(ops);
+	if (tb[FRA_PRIORITY]) {
+		nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]);
+		*user_priority = true;
+	} else {
+		nlrule->pref = fib_default_rule_pref(ops);
+	}
 
-	rule->proto = tb[FRA_PROTOCOL] ?
+	nlrule->proto = tb[FRA_PROTOCOL] ?
 		nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC;
 
 	if (tb[FRA_IIFNAME]) {
 		struct net_device *dev;
 
-		rule->iifindex = -1;
-		nla_strlcpy(rule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
-		dev = __dev_get_by_name(net, rule->iifname);
+		nlrule->iifindex = -1;
+		nla_strlcpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
+		dev = __dev_get_by_name(net, nlrule->iifname);
 		if (dev)
-			rule->iifindex = dev->ifindex;
+			nlrule->iifindex = dev->ifindex;
 	}
 
 	if (tb[FRA_OIFNAME]) {
 		struct net_device *dev;
 
-		rule->oifindex = -1;
-		nla_strlcpy(rule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
-		dev = __dev_get_by_name(net, rule->oifname);
+		nlrule->oifindex = -1;
+		nla_strlcpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
+		dev = __dev_get_by_name(net, nlrule->oifname);
 		if (dev)
-			rule->oifindex = dev->ifindex;
+			nlrule->oifindex = dev->ifindex;
 	}
 
 	if (tb[FRA_FWMARK]) {
-		rule->mark = nla_get_u32(tb[FRA_FWMARK]);
-		if (rule->mark)
+		nlrule->mark = nla_get_u32(tb[FRA_FWMARK]);
+		if (nlrule->mark)
 			/* compatibility: if the mark value is non-zero all bits
 			 * are compared unless a mask is explicitly specified.
 			 */
-			rule->mark_mask = 0xFFFFFFFF;
+			nlrule->mark_mask = 0xFFFFFFFF;
 	}
 
 	if (tb[FRA_FWMASK])
-		rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);
+		nlrule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);
 
 	if (tb[FRA_TUN_ID])
-		rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]);
+		nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]);
 
 	err = -EINVAL;
 	if (tb[FRA_L3MDEV]) {
 #ifdef CONFIG_NET_L3_MASTER_DEV
-		rule->l3mdev = nla_get_u8(tb[FRA_L3MDEV]);
-		if (rule->l3mdev != 1)
+		nlrule->l3mdev = nla_get_u8(tb[FRA_L3MDEV]);
+		if (nlrule->l3mdev != 1)
 #endif
 			goto errout_free;
 	}
 
-	rule->action = frh->action;
-	rule->flags = frh->flags;
-	rule->table = frh_get_table(frh, tb);
+	nlrule->action = frh->action;
+	nlrule->flags = frh->flags;
+	nlrule->table = frh_get_table(frh, tb);
 	if (tb[FRA_SUPPRESS_PREFIXLEN])
-		rule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]);
+		nlrule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]);
 	else
-		rule->suppress_prefixlen = -1;
+		nlrule->suppress_prefixlen = -1;
 
 	if (tb[FRA_SUPPRESS_IFGROUP])
-		rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]);
+		nlrule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]);
 	else
-		rule->suppress_ifgroup = -1;
+		nlrule->suppress_ifgroup = -1;
 
 	if (tb[FRA_GOTO]) {
-		if (rule->action != FR_ACT_GOTO)
+		if (nlrule->action != FR_ACT_GOTO)
 			goto errout_free;
 
-		rule->target = nla_get_u32(tb[FRA_GOTO]);
+		nlrule->target = nla_get_u32(tb[FRA_GOTO]);
 		/* Backward jumps are prohibited to avoid endless loops */
-		if (rule->target <= rule->pref)
+		if (nlrule->target <= nlrule->pref)
 			goto errout_free;
-
-		list_for_each_entry(r, &ops->rules_list, list) {
-			if (r->pref == rule->target) {
-				RCU_INIT_POINTER(rule->ctarget, r);
-				break;
-			}
-		}
-
-		if (rcu_dereference_protected(rule->ctarget, 1) == NULL)
-			unresolved = 1;
-	} else if (rule->action == FR_ACT_GOTO)
+	} else if (nlrule->action == FR_ACT_GOTO) {
 		goto errout_free;
+	}
 
-	if (rule->l3mdev && rule->table)
+	if (nlrule->l3mdev && nlrule->table)
 		goto errout_free;
 
 	if (tb[FRA_UID_RANGE]) {
@@ -595,34 +574,72 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 			goto errout_free;
 		}
 
-		rule->uid_range = nla_get_kuid_range(tb);
+		nlrule->uid_range = nla_get_kuid_range(tb);
 
-		if (!uid_range_set(&rule->uid_range) ||
-		    !uid_lte(rule->uid_range.start, rule->uid_range.end))
+		if (!uid_range_set(&nlrule->uid_range) ||
+		    !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end))
 			goto errout_free;
 	} else {
-		rule->uid_range = fib_kuid_range_unset;
+		nlrule->uid_range = fib_kuid_range_unset;
 	}
 
 	if (tb[FRA_IP_PROTO])
-		rule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]);
+		nlrule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]);
 
 	if (tb[FRA_SPORT_RANGE]) {
 		err = nla_get_port_range(tb[FRA_SPORT_RANGE],
-					 &rule->sport_range);
+					 &nlrule->sport_range);
 		if (err)
 			goto errout_free;
 	}
 
 	if (tb[FRA_DPORT_RANGE]) {
 		err = nla_get_port_range(tb[FRA_DPORT_RANGE],
-					 &rule->dport_range);
+					 &nlrule->dport_range);
 		if (err)
 			goto errout_free;
 	}
 
+	*rule = nlrule;
+
+	return 0;
+
+errout_free:
+	kfree(nlrule);
+errout:
+	return err;
+}
+
+int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
+		   struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(skb->sk);
+	struct fib_rule_hdr *frh = nlmsg_data(nlh);
+	struct fib_rules_ops *ops = NULL;
+	struct fib_rule *rule = NULL, *r, *last = NULL;
+	struct nlattr *tb[FRA_MAX + 1];
+	int err = -EINVAL, unresolved = 0;
+	bool user_priority = false;
+
+	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
+		goto errout;
+
+	ops = lookup_rules_ops(net, frh->family);
+	if (!ops) {
+		err = -EAFNOSUPPORT;
+		goto errout;
+	}
+
+	err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
+	if (err < 0)
+		goto errout;
+
+	err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority);
+	if (err)
+		goto errout;
+
 	if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
-	    rule_exists(ops, frh, tb, rule)) {
+	    rule_find(ops, frh, tb, rule, user_priority)) {
 		err = -EEXIST;
 		goto errout_free;
 	}
@@ -636,6 +653,16 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		goto errout_free;
 
+	list_for_each_entry(r, &ops->rules_list, list) {
+		if (r->pref == rule->target) {
+			RCU_INIT_POINTER(rule->ctarget, r);
+			break;
+		}
+	}
+
+	if (rcu_dereference_protected(rule->ctarget, 1) == NULL)
+		unresolved = 1;
+
 	list_for_each_entry(r, &ops->rules_list, list) {
 		if (r->pref > rule->pref)
 			break;
@@ -690,13 +717,11 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 {
 	struct net *net = sock_net(skb->sk);
 	struct fib_rule_hdr *frh = nlmsg_data(nlh);
-	struct fib_rule_port_range sprange = {0, 0};
-	struct fib_rule_port_range dprange = {0, 0};
 	struct fib_rules_ops *ops = NULL;
-	struct fib_rule *rule, *r;
+	struct fib_rule *rule = NULL, *r, *nlrule = NULL;
 	struct nlattr *tb[FRA_MAX+1];
-	struct fib_kuid_range range;
 	int err = -EINVAL;
+	bool user_priority = false;
 
 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
 		goto errout;
@@ -711,150 +736,73 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		goto errout;
 
-	err = validate_rulemsg(frh, tb, ops);
-	if (err < 0)
+	err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority);
+	if (err)
 		goto errout;
 
-	if (tb[FRA_UID_RANGE]) {
-		range = nla_get_kuid_range(tb);
-		if (!uid_range_set(&range)) {
-			err = -EINVAL;
-			goto errout;
-		}
-	} else {
-		range = fib_kuid_range_unset;
+	rule = rule_find(ops, frh, tb, nlrule, user_priority);
+	if (!rule) {
+		err = -ENOENT;
+		goto errout;
 	}
 
-	if (tb[FRA_SPORT_RANGE]) {
-		err = nla_get_port_range(tb[FRA_SPORT_RANGE],
-					 &sprange);
-		if (err)
-			goto errout;
+	if (rule->flags & FIB_RULE_PERMANENT) {
+		err = -EPERM;
+		goto errout;
 	}
 
-	if (tb[FRA_DPORT_RANGE]) {
-		err = nla_get_port_range(tb[FRA_DPORT_RANGE],
-					 &dprange);
+	if (ops->delete) {
+		err = ops->delete(rule);
 		if (err)
 			goto errout;
 	}
 
-	list_for_each_entry(rule, &ops->rules_list, list) {
-		if (tb[FRA_PROTOCOL] &&
-		    (rule->proto != nla_get_u8(tb[FRA_PROTOCOL])))
-			continue;
-
-		if (frh->action && (frh->action != rule->action))
-			continue;
-
-		if (frh_get_table(frh, tb) &&
-		    (frh_get_table(frh, tb) != rule->table))
-			continue;
-
-		if (tb[FRA_PRIORITY] &&
-		    (rule->pref != nla_get_u32(tb[FRA_PRIORITY])))
-			continue;
-
-		if (tb[FRA_IIFNAME] &&
-		    nla_strcmp(tb[FRA_IIFNAME], rule->iifname))
-			continue;
-
-		if (tb[FRA_OIFNAME] &&
-		    nla_strcmp(tb[FRA_OIFNAME], rule->oifname))
-			continue;
-
-		if (tb[FRA_FWMARK] &&
-		    (rule->mark != nla_get_u32(tb[FRA_FWMARK])))
-			continue;
-
-		if (tb[FRA_FWMASK] &&
-		    (rule->mark_mask != nla_get_u32(tb[FRA_FWMASK])))
-			continue;
-
-		if (tb[FRA_TUN_ID] &&
-		    (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID])))
-			continue;
-
-		if (tb[FRA_L3MDEV] &&
-		    (rule->l3mdev != nla_get_u8(tb[FRA_L3MDEV])))
-			continue;
-
-		if (uid_range_set(&range) &&
-		    (!uid_eq(rule->uid_range.start, range.start) ||
-		     !uid_eq(rule->uid_range.end, range.end)))
-			continue;
-
-		if (tb[FRA_IP_PROTO] &&
-		    (rule->ip_proto != nla_get_u8(tb[FRA_IP_PROTO])))
-			continue;
-
-		if (fib_rule_port_range_set(&sprange) &&
-		    !fib_rule_port_range_compare(&rule->sport_range, &sprange))
-			continue;
-
-		if (fib_rule_port_range_set(&dprange) &&
-		    !fib_rule_port_range_compare(&rule->dport_range, &dprange))
-			continue;
-
-		if (!ops->compare(rule, frh, tb))
-			continue;
-
-		if (rule->flags & FIB_RULE_PERMANENT) {
-			err = -EPERM;
-			goto errout;
-		}
-
-		if (ops->delete) {
-			err = ops->delete(rule);
-			if (err)
-				goto errout;
-		}
-
-		if (rule->tun_id)
-			ip_tunnel_unneed_metadata();
+	if (rule->tun_id)
+		ip_tunnel_unneed_metadata();
 
-		list_del_rcu(&rule->list);
+	list_del_rcu(&rule->list);
 
-		if (rule->action == FR_ACT_GOTO) {
-			ops->nr_goto_rules--;
-			if (rtnl_dereference(rule->ctarget) == NULL)
-				ops->unresolved_rules--;
-		}
+	if (rule->action == FR_ACT_GOTO) {
+		ops->nr_goto_rules--;
+		if (rtnl_dereference(rule->ctarget) == NULL)
+			ops->unresolved_rules--;
+	}
 
-		/*
-		 * Check if this rule is a target to any of them. If so,
-		 * adjust to the next one with the same preference or
-		 * disable them. As this operation is eventually very
-		 * expensive, it is only performed if goto rules, except
-		 * current if it is goto rule, have actually been added.
-		 */
-		if (ops->nr_goto_rules > 0) {
-			struct fib_rule *n;
-
-			n = list_next_entry(rule, list);
-			if (&n->list == &ops->rules_list || n->pref != rule->pref)
-				n = NULL;
-			list_for_each_entry(r, &ops->rules_list, list) {
-				if (rtnl_dereference(r->ctarget) != rule)
-					continue;
-				rcu_assign_pointer(r->ctarget, n);
-				if (!n)
-					ops->unresolved_rules++;
-			}
+	/*
+	 * Check if this rule is a target to any of them. If so,
+	 * adjust to the next one with the same preference or
+	 * disable them. As this operation is eventually very
+	 * expensive, it is only performed if goto rules, except
+	 * current if it is goto rule, have actually been added.
+	 */
+	if (ops->nr_goto_rules > 0) {
+		struct fib_rule *n;
+
+		n = list_next_entry(rule, list);
+		if (&n->list == &ops->rules_list || n->pref != rule->pref)
+			n = NULL;
+		list_for_each_entry(r, &ops->rules_list, list) {
+			if (rtnl_dereference(r->ctarget) != rule)
+				continue;
+			rcu_assign_pointer(r->ctarget, n);
+			if (!n)
+				ops->unresolved_rules++;
 		}
-
-		call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops,
-					NULL);
-		notify_rule_change(RTM_DELRULE, rule, ops, nlh,
-				   NETLINK_CB(skb).portid);
-		fib_rule_put(rule);
-		flush_route_cache(ops);
-		rules_ops_put(ops);
-		return 0;
 	}
 
-	err = -ENOENT;
+	call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops,
+				NULL);
+	notify_rule_change(RTM_DELRULE, rule, ops, nlh,
+			   NETLINK_CB(skb).portid);
+	fib_rule_put(rule);
+	flush_route_cache(ops);
+	rules_ops_put(ops);
+	kfree(nlrule);
+	return 0;
+
 errout:
+	if (nlrule)
+		kfree(nlrule);
 	rules_ops_put(ops);
 	return err;
 }
-- 
cgit v1.2.3-59-g8ed1b


From b16fb418b1bf2a9f14d5d2a4fe29bde1f5550b37 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Sat, 21 Apr 2018 09:41:31 -0700
Subject: net: fib_rules: add extack support

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/fib_rules.h |  3 ++-
 net/core/fib_rules.c    | 55 +++++++++++++++++++++++++++++++++++++------------
 net/decnet/dn_rules.c   |  7 +++++--
 net/ipv4/fib_rules.c    |  7 +++++--
 net/ipv4/ipmr.c         |  3 ++-
 net/ipv6/fib6_rules.c   |  7 +++++--
 net/ipv6/ip6mr.c        |  3 ++-
 7 files changed, 63 insertions(+), 22 deletions(-)

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index e5cfcfc7dd93..b473df5b9512 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -75,7 +75,8 @@ struct fib_rules_ops {
 	int			(*configure)(struct fib_rule *,
 					     struct sk_buff *,
 					     struct fib_rule_hdr *,
-					     struct nlattr **);
+					     struct nlattr **,
+					     struct netlink_ext_ack *);
 	int			(*delete)(struct fib_rule *);
 	int			(*compare)(struct fib_rule *,
 					   struct fib_rule_hdr *,
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 67801104e9a8..ebd9351b213a 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -469,14 +469,18 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (frh->src_len)
 		if (!tb[FRA_SRC] ||
 		    frh->src_len > (ops->addr_size * 8) ||
-		    nla_len(tb[FRA_SRC]) != ops->addr_size)
+		    nla_len(tb[FRA_SRC]) != ops->addr_size) {
+			NL_SET_ERR_MSG(extack, "Invalid source address");
 			goto errout;
+	}
 
 	if (frh->dst_len)
 		if (!tb[FRA_DST] ||
 		    frh->dst_len > (ops->addr_size * 8) ||
-		    nla_len(tb[FRA_DST]) != ops->addr_size)
+		    nla_len(tb[FRA_DST]) != ops->addr_size) {
+			NL_SET_ERR_MSG(extack, "Invalid dst address");
 			goto errout;
+	}
 
 	nlrule = kzalloc(ops->rule_size, GFP_KERNEL);
 	if (!nlrule) {
@@ -537,6 +541,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
 		nlrule->l3mdev = nla_get_u8(tb[FRA_L3MDEV]);
 		if (nlrule->l3mdev != 1)
 #endif
+			NL_SET_ERR_MSG(extack, "Invalid l3mdev");
 			goto errout_free;
 	}
 
@@ -554,31 +559,41 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
 		nlrule->suppress_ifgroup = -1;
 
 	if (tb[FRA_GOTO]) {
-		if (nlrule->action != FR_ACT_GOTO)
+		if (nlrule->action != FR_ACT_GOTO) {
+			NL_SET_ERR_MSG(extack, "Unexpected goto");
 			goto errout_free;
+		}
 
 		nlrule->target = nla_get_u32(tb[FRA_GOTO]);
 		/* Backward jumps are prohibited to avoid endless loops */
-		if (nlrule->target <= nlrule->pref)
+		if (nlrule->target <= nlrule->pref) {
+			NL_SET_ERR_MSG(extack, "Backward goto not supported");
 			goto errout_free;
+		}
 	} else if (nlrule->action == FR_ACT_GOTO) {
+		NL_SET_ERR_MSG(extack, "Missing goto target for action goto");
 		goto errout_free;
 	}
 
-	if (nlrule->l3mdev && nlrule->table)
+	if (nlrule->l3mdev && nlrule->table) {
+		NL_SET_ERR_MSG(extack, "l3mdev and table are mutually exclusive");
 		goto errout_free;
+	}
 
 	if (tb[FRA_UID_RANGE]) {
 		if (current_user_ns() != net->user_ns) {
 			err = -EPERM;
+			NL_SET_ERR_MSG(extack, "No permission to set uid");
 			goto errout_free;
 		}
 
 		nlrule->uid_range = nla_get_kuid_range(tb);
 
 		if (!uid_range_set(&nlrule->uid_range) ||
-		    !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end))
+		    !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end)) {
+			NL_SET_ERR_MSG(extack, "Invalid uid range");
 			goto errout_free;
+		}
 	} else {
 		nlrule->uid_range = fib_kuid_range_unset;
 	}
@@ -589,15 +604,19 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (tb[FRA_SPORT_RANGE]) {
 		err = nla_get_port_range(tb[FRA_SPORT_RANGE],
 					 &nlrule->sport_range);
-		if (err)
+		if (err) {
+			NL_SET_ERR_MSG(extack, "Invalid sport range");
 			goto errout_free;
+		}
 	}
 
 	if (tb[FRA_DPORT_RANGE]) {
 		err = nla_get_port_range(tb[FRA_DPORT_RANGE],
 					 &nlrule->dport_range);
-		if (err)
+		if (err) {
+			NL_SET_ERR_MSG(extack, "Invalid dport range");
 			goto errout_free;
+		}
 	}
 
 	*rule = nlrule;
@@ -621,18 +640,23 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 	int err = -EINVAL, unresolved = 0;
 	bool user_priority = false;
 
-	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
+	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
+		NL_SET_ERR_MSG(extack, "Invalid msg length");
 		goto errout;
+	}
 
 	ops = lookup_rules_ops(net, frh->family);
 	if (!ops) {
 		err = -EAFNOSUPPORT;
+		NL_SET_ERR_MSG(extack, "Rule family not supported");
 		goto errout;
 	}
 
 	err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
-	if (err < 0)
+	if (err < 0) {
+		NL_SET_ERR_MSG(extack, "Error parsing msg");
 		goto errout;
+	}
 
 	err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority);
 	if (err)
@@ -644,7 +668,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 		goto errout_free;
 	}
 
-	err = ops->configure(rule, skb, frh, tb);
+	err = ops->configure(rule, skb, frh, tb, extack);
 	if (err < 0)
 		goto errout_free;
 
@@ -723,18 +747,23 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 	int err = -EINVAL;
 	bool user_priority = false;
 
-	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
+	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
+		NL_SET_ERR_MSG(extack, "Invalid msg length");
 		goto errout;
+	}
 
 	ops = lookup_rules_ops(net, frh->family);
 	if (ops == NULL) {
 		err = -EAFNOSUPPORT;
+		NL_SET_ERR_MSG(extack, "Rule family not supported");
 		goto errout;
 	}
 
 	err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
-	if (err < 0)
+	if (err < 0) {
+		NL_SET_ERR_MSG(extack, "Error parsing msg");
 		goto errout;
+	}
 
 	err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority);
 	if (err)
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index c795c3f509c9..72236695db3d 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -121,13 +121,16 @@ static int dn_fib_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
 
 static int dn_fib_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 				 struct fib_rule_hdr *frh,
-				 struct nlattr **tb)
+				 struct nlattr **tb,
+				 struct netlink_ext_ack *extack)
 {
 	int err = -EINVAL;
 	struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
 
-	if (frh->tos)
+	if (frh->tos) {
+		NL_SET_ERR_MSG(extack, "Invalid tos value");
 		goto  errout;
+	}
 
 	if (rule->table == RT_TABLE_UNSPEC) {
 		if (rule->action == FR_ACT_TO_TBL) {
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 737d11bc8838..f8eb78d042a4 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -213,14 +213,17 @@ static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = {
 
 static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 			       struct fib_rule_hdr *frh,
-			       struct nlattr **tb)
+			       struct nlattr **tb,
+			       struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
 	int err = -EINVAL;
 	struct fib4_rule *rule4 = (struct fib4_rule *) rule;
 
-	if (frh->tos & ~IPTOS_TOS_MASK)
+	if (frh->tos & ~IPTOS_TOS_MASK) {
+		NL_SET_ERR_MSG(extack, "Invalid tos");
 		goto errout;
+	}
 
 	/* split local/main if they are not already split */
 	err = fib_unmerge(net);
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 2fb4de3f7f66..38e092eafc97 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -201,7 +201,8 @@ static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = {
 };
 
 static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
-			       struct fib_rule_hdr *frh, struct nlattr **tb)
+			       struct fib_rule_hdr *frh, struct nlattr **tb,
+			       struct netlink_ext_ack *extack)
 {
 	return 0;
 }
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index df113c7b5fc8..6547fc6491a6 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -245,15 +245,18 @@ static const struct nla_policy fib6_rule_policy[FRA_MAX+1] = {
 
 static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 			       struct fib_rule_hdr *frh,
-			       struct nlattr **tb)
+			       struct nlattr **tb,
+			       struct netlink_ext_ack *extack)
 {
 	int err = -EINVAL;
 	struct net *net = sock_net(skb->sk);
 	struct fib6_rule *rule6 = (struct fib6_rule *) rule;
 
 	if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) {
-		if (rule->table == RT6_TABLE_UNSPEC)
+		if (rule->table == RT6_TABLE_UNSPEC) {
+			NL_SET_ERR_MSG(extack, "Invalid table");
 			goto errout;
+		}
 
 		if (fib6_new_table(net, rule->table) == NULL) {
 			err = -ENOBUFS;
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 298fd8b6ed17..20a419ee8000 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -180,7 +180,8 @@ static const struct nla_policy ip6mr_rule_policy[FRA_MAX + 1] = {
 };
 
 static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
-				struct fib_rule_hdr *frh, struct nlattr **tb)
+				struct fib_rule_hdr *frh, struct nlattr **tb,
+				struct netlink_ext_ack *extack)
 {
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From c6849a3ac17e336811f1d5bba991d2a9bdc47af1 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Sun, 22 Apr 2018 21:50:04 +0800
Subject: net: init sk_cookie for inet socket

With sk_cookie we can identify a socket, that is very helpful for
traceing and statistic, i.e. tcp tracepiont and ebpf.
So we'd better init it by default for inet socket.
When using it, we just need call atomic64_read(&sk->sk_cookie).

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sock_diag.h | 9 +++++++++
 net/ipv4/tcp_input.c      | 8 +++++++-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h
index 15fe980a27ea..5c916e6dff36 100644
--- a/include/linux/sock_diag.h
+++ b/include/linux/sock_diag.h
@@ -25,6 +25,15 @@ void sock_diag_unregister(const struct sock_diag_handler *h);
 void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh));
 void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh));
 
+static inline
+void sock_init_cookie(struct sock *sk)
+{
+	u64 res;
+
+	res = atomic64_inc_return(&sock_net(sk)->cookie_gen);
+	atomic64_set(&sk->sk_cookie, res);
+}
+
 u64 sock_gen_cookie(struct sock *sk);
 int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie);
 void sock_diag_save_cookie(struct sock *sk, __u32 *cookie);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5a17cfc75326..17b78582ba63 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -78,6 +78,7 @@
 #include <linux/errqueue.h>
 #include <trace/events/tcp.h>
 #include <linux/static_key.h>
+#include <linux/sock_diag.h>
 
 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
 
@@ -6190,10 +6191,15 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
 #if IS_ENABLED(CONFIG_IPV6)
 		ireq->pktopts = NULL;
 #endif
-		atomic64_set(&ireq->ir_cookie, 0);
 		ireq->ireq_state = TCP_NEW_SYN_RECV;
 		write_pnet(&ireq->ireq_net, sock_net(sk_listener));
 		ireq->ireq_family = sk_listener->sk_family;
+
+		BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
+					offsetof(struct sock, sk_cookie));
+		BUILD_BUG_ON(offsetof(struct inet_request_sock, ireq_net) !=
+					offsetof(struct sock, sk_net));
+		sock_init_cookie((struct sock *)ireq);
 	}
 
 	return req;
-- 
cgit v1.2.3-59-g8ed1b


From 22148df0d0bdbd5d1cca4e069e17bf49b070498f Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sun, 22 Apr 2018 17:15:15 +0200
Subject: r8169: don't use netif_info et al before net_device has been
 registered

There's no benefit in using netif_info et al before the net_device has
been registered. We get messages like
r8169 0000:03:00.0 (unnamed net_device) (uninitialized): [message]
Therefore use dev_info/dev_err instead.

As a side effect we don't need parameter dev for function
rtl8169_get_mac_version() any longer.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index fcd42d0387b9..a5d00ee94245 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -2483,7 +2483,7 @@ static const struct ethtool_ops rtl8169_ethtool_ops = {
 };
 
 static void rtl8169_get_mac_version(struct rtl8169_private *tp,
-				    struct net_device *dev, u8 default_version)
+				    u8 default_version)
 {
 	/*
 	 * The driver currently handles the 8168Bf and the 8168Be identically
@@ -2588,8 +2588,8 @@ static void rtl8169_get_mac_version(struct rtl8169_private *tp,
 	tp->mac_version = p->mac_version;
 
 	if (tp->mac_version == RTL_GIGA_MAC_NONE) {
-		netif_notice(tp, probe, dev,
-			     "unknown MAC, using family default\n");
+		dev_notice(tp_to_dev(tp),
+			   "unknown MAC, using family default\n");
 		tp->mac_version = default_version;
 	} else if (tp->mac_version == RTL_GIGA_MAC_VER_42) {
 		tp->mac_version = tp->mii.supports_gmii ?
@@ -8107,40 +8107,39 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	/* enable device (incl. PCI PM wakeup and hotplug setup) */
 	rc = pcim_enable_device(pdev);
 	if (rc < 0) {
-		netif_err(tp, probe, dev, "enable failure\n");
+		dev_err(&pdev->dev, "enable failure\n");
 		return rc;
 	}
 
 	if (pcim_set_mwi(pdev) < 0)
-		netif_info(tp, probe, dev, "Mem-Wr-Inval unavailable\n");
+		dev_info(&pdev->dev, "Mem-Wr-Inval unavailable\n");
 
 	/* use first MMIO region */
 	region = ffs(pci_select_bars(pdev, IORESOURCE_MEM)) - 1;
 	if (region < 0) {
-		netif_err(tp, probe, dev, "no MMIO resource found\n");
+		dev_err(&pdev->dev, "no MMIO resource found\n");
 		return -ENODEV;
 	}
 
 	/* check for weird/broken PCI region reporting */
 	if (pci_resource_len(pdev, region) < R8169_REGS_SIZE) {
-		netif_err(tp, probe, dev,
-			  "Invalid PCI region size(s), aborting\n");
+		dev_err(&pdev->dev, "Invalid PCI region size(s), aborting\n");
 		return -ENODEV;
 	}
 
 	rc = pcim_iomap_regions(pdev, BIT(region), MODULENAME);
 	if (rc < 0) {
-		netif_err(tp, probe, dev, "cannot remap MMIO, aborting\n");
+		dev_err(&pdev->dev, "cannot remap MMIO, aborting\n");
 		return rc;
 	}
 
 	tp->mmio_addr = pcim_iomap_table(pdev)[region];
 
 	if (!pci_is_pcie(pdev))
-		netif_info(tp, probe, dev, "not PCI Express\n");
+		dev_info(&pdev->dev, "not PCI Express\n");
 
 	/* Identify chip attached to board */
-	rtl8169_get_mac_version(tp, dev, cfg->default_ver);
+	rtl8169_get_mac_version(tp, cfg->default_ver);
 
 	tp->cp_cmd = 0;
 
@@ -8157,7 +8156,7 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	} else {
 		rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
 		if (rc < 0) {
-			netif_err(tp, probe, dev, "DMA configuration failed\n");
+			dev_err(&pdev->dev, "DMA configuration failed\n");
 			return rc;
 		}
 	}
@@ -8185,7 +8184,7 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	rc = rtl_alloc_irq(tp);
 	if (rc < 0) {
-		netif_err(tp, probe, dev, "Can't allocate interrupt\n");
+		dev_err(&pdev->dev, "Can't allocate interrupt\n");
 		return rc;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 5f0456b43140af9413397cc11d03d18b9f2fc2fc Mon Sep 17 00:00:00 2001
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Mon, 23 Apr 2018 09:05:15 +0100
Subject: net: stmmac: Implement logic to automatically select HW Interface

Move all the core version detection to a common place ("hwif.c") and
implement a table which can be used to lookup the correct callbacks for
each IP version.

This simplifies the initialization flow of each IP version and eases
future implementation of new IP versions.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Vitor Soares <soares@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/Makefile       |   3 +-
 drivers/net/ethernet/stmicro/stmmac/common.h       |  30 +--
 drivers/net/ethernet/stmicro/stmmac/dwmac1000.h    |   1 -
 .../net/ethernet/stmicro/stmmac/dwmac1000_core.c   |  29 +--
 .../net/ethernet/stmicro/stmmac/dwmac100_core.c    |  23 +--
 drivers/net/ethernet/stmicro/stmmac/dwmac4.h       |   1 -
 drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c  |  41 ++--
 drivers/net/ethernet/stmicro/stmmac/hwif.c         | 220 +++++++++++++++++++++
 drivers/net/ethernet/stmicro/stmmac/hwif.h         |  17 ++
 drivers/net/ethernet/stmicro/stmmac/stmmac.h       |   1 +
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  |  77 +-------
 11 files changed, 279 insertions(+), 164 deletions(-)
 create mode 100644 drivers/net/ethernet/stmicro/stmmac/hwif.c

diff --git a/drivers/net/ethernet/stmicro/stmmac/Makefile b/drivers/net/ethernet/stmicro/stmmac/Makefile
index 972e4ef6d414..e3b578b4f7cb 100644
--- a/drivers/net/ethernet/stmicro/stmmac/Makefile
+++ b/drivers/net/ethernet/stmicro/stmmac/Makefile
@@ -4,7 +4,8 @@ stmmac-objs:= stmmac_main.o stmmac_ethtool.o stmmac_mdio.o ring_mode.o	\
 	      chain_mode.o dwmac_lib.o dwmac1000_core.o dwmac1000_dma.o	\
 	      dwmac100_core.o dwmac100_dma.o enh_desc.o norm_desc.o	\
 	      mmc_core.o stmmac_hwtstamp.o stmmac_ptp.o dwmac4_descs.o	\
-	      dwmac4_dma.o dwmac4_lib.o dwmac4_core.o dwmac5.o $(stmmac-y)
+	      dwmac4_dma.o dwmac4_lib.o dwmac4_core.o dwmac5.o hwif.o \
+	      $(stmmac-y)
 
 # Ordering matters. Generic driver must be last.
 obj-$(CONFIG_STMMAC_PLATFORM)	+= stmmac-platform.o
diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index 59673c6d2e52..627e905b6d76 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -39,6 +39,7 @@
 #define	DWMAC_CORE_3_40	0x34
 #define	DWMAC_CORE_3_50	0x35
 #define	DWMAC_CORE_4_00	0x40
+#define DWMAC_CORE_4_10	0x41
 #define DWMAC_CORE_5_00 0x50
 #define DWMAC_CORE_5_10 0x51
 #define STMMAC_CHAN0	0	/* Always supported and default for all chips */
@@ -428,12 +429,9 @@ struct stmmac_rx_routing {
 	u32 reg_shift;
 };
 
-struct mac_device_info *dwmac1000_setup(void __iomem *ioaddr, int mcbins,
-					int perfect_uc_entries,
-					int *synopsys_id);
-struct mac_device_info *dwmac100_setup(void __iomem *ioaddr, int *synopsys_id);
-struct mac_device_info *dwmac4_setup(void __iomem *ioaddr, int mcbins,
-				     int perfect_uc_entries, int *synopsys_id);
+int dwmac100_setup(struct stmmac_priv *priv);
+int dwmac1000_setup(struct stmmac_priv *priv);
+int dwmac4_setup(struct stmmac_priv *priv);
 
 void stmmac_set_mac_addr(void __iomem *ioaddr, u8 addr[6],
 			 unsigned int high, unsigned int low);
@@ -453,24 +451,4 @@ extern const struct stmmac_mode_ops ring_mode_ops;
 extern const struct stmmac_mode_ops chain_mode_ops;
 extern const struct stmmac_desc_ops dwmac4_desc_ops;
 
-/**
- * stmmac_get_synopsys_id - return the SYINID.
- * @priv: driver private structure
- * Description: this simple function is to decode and return the SYINID
- * starting from the HW core register.
- */
-static inline u32 stmmac_get_synopsys_id(u32 hwid)
-{
-	/* Check Synopsys Id (not available on old chips) */
-	if (likely(hwid)) {
-		u32 uid = ((hwid & 0x0000ff00) >> 8);
-		u32 synid = (hwid & 0x000000ff);
-
-		pr_info("stmmac - user ID: 0x%x, Synopsys ID: 0x%x\n",
-			uid, synid);
-
-		return synid;
-	}
-	return 0;
-}
 #endif /* __COMMON_H__ */
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h
index c02d36629c52..184ca13c8f79 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h
@@ -29,7 +29,6 @@
 #define GMAC_MII_DATA		0x00000014	/* MII Data */
 #define GMAC_FLOW_CTRL		0x00000018	/* Flow Control */
 #define GMAC_VLAN_TAG		0x0000001c	/* VLAN Tag */
-#define GMAC_VERSION		0x00000020	/* GMAC CORE Version */
 #define GMAC_DEBUG		0x00000024	/* GMAC debug register */
 #define GMAC_WAKEUP_FILTER	0x00000028	/* Wake-up Frame Filter */
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
index ef10baf14186..0877bde6e860 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
@@ -27,6 +27,7 @@
 #include <linux/ethtool.h>
 #include <net/dsa.h>
 #include <asm/io.h>
+#include "stmmac.h"
 #include "stmmac_pcs.h"
 #include "dwmac1000.h"
 
@@ -498,7 +499,7 @@ static void dwmac1000_debug(void __iomem *ioaddr, struct stmmac_extra_stats *x,
 		x->mac_gmii_rx_proto_engine++;
 }
 
-static const struct stmmac_ops dwmac1000_ops = {
+const struct stmmac_ops dwmac1000_ops = {
 	.core_init = dwmac1000_core_init,
 	.set_mac = stmmac_set_mac,
 	.rx_ipc = dwmac1000_rx_ipc_enable,
@@ -519,28 +520,21 @@ static const struct stmmac_ops dwmac1000_ops = {
 	.pcs_get_adv_lp = dwmac1000_get_adv_lp,
 };
 
-struct mac_device_info *dwmac1000_setup(void __iomem *ioaddr, int mcbins,
-					int perfect_uc_entries,
-					int *synopsys_id)
+int dwmac1000_setup(struct stmmac_priv *priv)
 {
-	struct mac_device_info *mac;
-	u32 hwid = readl(ioaddr + GMAC_VERSION);
+	struct mac_device_info *mac = priv->hw;
 
-	mac = kzalloc(sizeof(const struct mac_device_info), GFP_KERNEL);
-	if (!mac)
-		return NULL;
+	dev_info(priv->device, "\tDWMAC1000\n");
 
-	mac->pcsr = ioaddr;
-	mac->multicast_filter_bins = mcbins;
-	mac->unicast_filter_entries = perfect_uc_entries;
+	priv->dev->priv_flags |= IFF_UNICAST_FLT;
+	mac->pcsr = priv->ioaddr;
+	mac->multicast_filter_bins = priv->plat->multicast_filter_bins;
+	mac->unicast_filter_entries = priv->plat->unicast_filter_entries;
 	mac->mcast_bits_log2 = 0;
 
 	if (mac->multicast_filter_bins)
 		mac->mcast_bits_log2 = ilog2(mac->multicast_filter_bins);
 
-	mac->mac = &dwmac1000_ops;
-	mac->dma = &dwmac1000_dma_ops;
-
 	mac->link.duplex = GMAC_CONTROL_DM;
 	mac->link.speed10 = GMAC_CONTROL_PS;
 	mac->link.speed100 = GMAC_CONTROL_PS | GMAC_CONTROL_FES;
@@ -555,8 +549,5 @@ struct mac_device_info *dwmac1000_setup(void __iomem *ioaddr, int mcbins,
 	mac->mii.clk_csr_shift = 2;
 	mac->mii.clk_csr_mask = GENMASK(5, 2);
 
-	/* Get and dump the chip ID */
-	*synopsys_id = stmmac_get_synopsys_id(hwid);
-
-	return mac;
+	return 0;
 }
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c
index 91b23f9db31a..b735143987e1 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c
@@ -27,6 +27,7 @@
 #include <linux/crc32.h>
 #include <net/dsa.h>
 #include <asm/io.h>
+#include "stmmac.h"
 #include "dwmac100.h"
 
 static void dwmac100_core_init(struct mac_device_info *hw,
@@ -159,7 +160,7 @@ static void dwmac100_pmt(struct mac_device_info *hw, unsigned long mode)
 	return;
 }
 
-static const struct stmmac_ops dwmac100_ops = {
+const struct stmmac_ops dwmac100_ops = {
 	.core_init = dwmac100_core_init,
 	.set_mac = stmmac_set_mac,
 	.rx_ipc = dwmac100_rx_ipc_enable,
@@ -172,20 +173,13 @@ static const struct stmmac_ops dwmac100_ops = {
 	.get_umac_addr = dwmac100_get_umac_addr,
 };
 
-struct mac_device_info *dwmac100_setup(void __iomem *ioaddr, int *synopsys_id)
+int dwmac100_setup(struct stmmac_priv *priv)
 {
-	struct mac_device_info *mac;
+	struct mac_device_info *mac = priv->hw;
 
-	mac = kzalloc(sizeof(const struct mac_device_info), GFP_KERNEL);
-	if (!mac)
-		return NULL;
-
-	pr_info("\tDWMAC100\n");
-
-	mac->pcsr = ioaddr;
-	mac->mac = &dwmac100_ops;
-	mac->dma = &dwmac100_dma_ops;
+	dev_info(priv->device, "\tDWMAC100\n");
 
+	mac->pcsr = priv->ioaddr;
 	mac->link.duplex = MAC_CONTROL_F;
 	mac->link.speed10 = 0;
 	mac->link.speed100 = 0;
@@ -200,8 +194,5 @@ struct mac_device_info *dwmac100_setup(void __iomem *ioaddr, int *synopsys_id)
 	mac->mii.clk_csr_shift = 2;
 	mac->mii.clk_csr_mask = GENMASK(5, 2);
 
-	/* Synopsys Id is not available on old chips */
-	*synopsys_id = 0;
-
-	return mac;
+	return 0;
 }
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
index dedd40613090..03eab9077c1c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
@@ -34,7 +34,6 @@
 #define GMAC_PCS_BASE			0x000000e0
 #define GMAC_PHYIF_CONTROL_STATUS	0x000000f8
 #define GMAC_PMT			0x000000c0
-#define GMAC_VERSION			0x00000110
 #define GMAC_DEBUG			0x00000114
 #define GMAC_HW_FEATURE0		0x0000011c
 #define GMAC_HW_FEATURE1		0x00000120
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
index 517b1f6736a8..7289b3b47d8e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
@@ -18,6 +18,7 @@
 #include <linux/ethtool.h>
 #include <linux/io.h>
 #include <net/dsa.h>
+#include "stmmac.h"
 #include "stmmac_pcs.h"
 #include "dwmac4.h"
 #include "dwmac5.h"
@@ -700,7 +701,7 @@ static void dwmac4_debug(void __iomem *ioaddr, struct stmmac_extra_stats *x,
 		x->mac_gmii_rx_proto_engine++;
 }
 
-static const struct stmmac_ops dwmac4_ops = {
+const struct stmmac_ops dwmac4_ops = {
 	.core_init = dwmac4_core_init,
 	.set_mac = stmmac_set_mac,
 	.rx_ipc = dwmac4_rx_ipc_enable,
@@ -731,7 +732,7 @@ static const struct stmmac_ops dwmac4_ops = {
 	.set_filter = dwmac4_set_filter,
 };
 
-static const struct stmmac_ops dwmac410_ops = {
+const struct stmmac_ops dwmac410_ops = {
 	.core_init = dwmac4_core_init,
 	.set_mac = stmmac_dwmac4_set_mac,
 	.rx_ipc = dwmac4_rx_ipc_enable,
@@ -762,7 +763,7 @@ static const struct stmmac_ops dwmac410_ops = {
 	.set_filter = dwmac4_set_filter,
 };
 
-static const struct stmmac_ops dwmac510_ops = {
+const struct stmmac_ops dwmac510_ops = {
 	.core_init = dwmac4_core_init,
 	.set_mac = stmmac_dwmac4_set_mac,
 	.rx_ipc = dwmac4_rx_ipc_enable,
@@ -796,19 +797,16 @@ static const struct stmmac_ops dwmac510_ops = {
 	.safety_feat_dump = dwmac5_safety_feat_dump,
 };
 
-struct mac_device_info *dwmac4_setup(void __iomem *ioaddr, int mcbins,
-				     int perfect_uc_entries, int *synopsys_id)
+int dwmac4_setup(struct stmmac_priv *priv)
 {
-	struct mac_device_info *mac;
-	u32 hwid = readl(ioaddr + GMAC_VERSION);
+	struct mac_device_info *mac = priv->hw;
 
-	mac = kzalloc(sizeof(const struct mac_device_info), GFP_KERNEL);
-	if (!mac)
-		return NULL;
+	dev_info(priv->device, "\tDWMAC4/5\n");
 
-	mac->pcsr = ioaddr;
-	mac->multicast_filter_bins = mcbins;
-	mac->unicast_filter_entries = perfect_uc_entries;
+	priv->dev->priv_flags |= IFF_UNICAST_FLT;
+	mac->pcsr = priv->ioaddr;
+	mac->multicast_filter_bins = priv->plat->multicast_filter_bins;
+	mac->unicast_filter_entries = priv->plat->unicast_filter_entries;
 	mac->mcast_bits_log2 = 0;
 
 	if (mac->multicast_filter_bins)
@@ -828,20 +826,5 @@ struct mac_device_info *dwmac4_setup(void __iomem *ioaddr, int mcbins,
 	mac->mii.clk_csr_shift = 8;
 	mac->mii.clk_csr_mask = GENMASK(11, 8);
 
-	/* Get and dump the chip ID */
-	*synopsys_id = stmmac_get_synopsys_id(hwid);
-
-	if (*synopsys_id > DWMAC_CORE_4_00)
-		mac->dma = &dwmac410_dma_ops;
-	else
-		mac->dma = &dwmac4_dma_ops;
-
-	if (*synopsys_id >= DWMAC_CORE_5_10)
-		mac->mac = &dwmac510_ops;
-	else if (*synopsys_id >= DWMAC_CORE_4_00)
-		mac->mac = &dwmac410_ops;
-	else
-		mac->mac = &dwmac4_ops;
-
-	return mac;
+	return 0;
 }
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.c b/drivers/net/ethernet/stmicro/stmmac/hwif.c
new file mode 100644
index 000000000000..2b0a7e79de00
--- /dev/null
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.c
@@ -0,0 +1,220 @@
+// SPDX-License-Identifier: (GPL-2.0 OR MIT)
+/*
+ * Copyright (c) 2018 Synopsys, Inc. and/or its affiliates.
+ * stmmac HW Interface Handling
+ */
+
+#include "common.h"
+#include "stmmac.h"
+
+static u32 stmmac_get_id(struct stmmac_priv *priv, u32 id_reg)
+{
+	u32 reg = readl(priv->ioaddr + id_reg);
+
+	if (!reg) {
+		dev_info(priv->device, "Version ID not available\n");
+		return 0x0;
+	}
+
+	dev_info(priv->device, "User ID: 0x%x, Synopsys ID: 0x%x\n",
+			(unsigned int)(reg & GENMASK(15, 8)) >> 8,
+			(unsigned int)(reg & GENMASK(7, 0)));
+	return reg & GENMASK(7, 0);
+}
+
+static void stmmac_dwmac_mode_quirk(struct stmmac_priv *priv)
+{
+	struct mac_device_info *mac = priv->hw;
+
+	if (priv->chain_mode) {
+		dev_info(priv->device, "Chain mode enabled\n");
+		priv->mode = STMMAC_CHAIN_MODE;
+		mac->mode = &chain_mode_ops;
+	} else {
+		dev_info(priv->device, "Ring mode enabled\n");
+		priv->mode = STMMAC_RING_MODE;
+		mac->mode = &ring_mode_ops;
+	}
+}
+
+static int stmmac_dwmac1_quirks(struct stmmac_priv *priv)
+{
+	struct mac_device_info *mac = priv->hw;
+
+	if (priv->plat->enh_desc) {
+		dev_info(priv->device, "Enhanced/Alternate descriptors\n");
+
+		/* GMAC older than 3.50 has no extended descriptors */
+		if (priv->synopsys_id >= DWMAC_CORE_3_50) {
+			dev_info(priv->device, "Enabled extended descriptors\n");
+			priv->extend_desc = 1;
+		} else {
+			dev_warn(priv->device, "Extended descriptors not supported\n");
+		}
+
+		mac->desc = &enh_desc_ops;
+	} else {
+		dev_info(priv->device, "Normal descriptors\n");
+		mac->desc = &ndesc_ops;
+	}
+
+	stmmac_dwmac_mode_quirk(priv);
+	return 0;
+}
+
+static int stmmac_dwmac4_quirks(struct stmmac_priv *priv)
+{
+	stmmac_dwmac_mode_quirk(priv);
+	return 0;
+}
+
+static const struct stmmac_hwif_entry {
+	bool gmac;
+	bool gmac4;
+	u32 min_id;
+	const void *desc;
+	const void *dma;
+	const void *mac;
+	const void *hwtimestamp;
+	const void *mode;
+	int (*setup)(struct stmmac_priv *priv);
+	int (*quirks)(struct stmmac_priv *priv);
+} stmmac_hw[] = {
+	/* NOTE: New HW versions shall go to the end of this table */
+	{
+		.gmac = false,
+		.gmac4 = false,
+		.min_id = 0,
+		.desc = NULL,
+		.dma = &dwmac100_dma_ops,
+		.mac = &dwmac100_ops,
+		.hwtimestamp = &stmmac_ptp,
+		.mode = NULL,
+		.setup = dwmac100_setup,
+		.quirks = stmmac_dwmac1_quirks,
+	}, {
+		.gmac = true,
+		.gmac4 = false,
+		.min_id = 0,
+		.desc = NULL,
+		.dma = &dwmac1000_dma_ops,
+		.mac = &dwmac1000_ops,
+		.hwtimestamp = &stmmac_ptp,
+		.mode = NULL,
+		.setup = dwmac1000_setup,
+		.quirks = stmmac_dwmac1_quirks,
+	}, {
+		.gmac = false,
+		.gmac4 = true,
+		.min_id = 0,
+		.desc = &dwmac4_desc_ops,
+		.dma = &dwmac4_dma_ops,
+		.mac = &dwmac4_ops,
+		.hwtimestamp = &stmmac_ptp,
+		.mode = NULL,
+		.setup = dwmac4_setup,
+		.quirks = stmmac_dwmac4_quirks,
+	}, {
+		.gmac = false,
+		.gmac4 = true,
+		.min_id = DWMAC_CORE_4_00,
+		.desc = &dwmac4_desc_ops,
+		.dma = &dwmac4_dma_ops,
+		.mac = &dwmac410_ops,
+		.hwtimestamp = &stmmac_ptp,
+		.mode = &dwmac4_ring_mode_ops,
+		.setup = dwmac4_setup,
+		.quirks = NULL,
+	}, {
+		.gmac = false,
+		.gmac4 = true,
+		.min_id = DWMAC_CORE_4_10,
+		.desc = &dwmac4_desc_ops,
+		.dma = &dwmac410_dma_ops,
+		.mac = &dwmac410_ops,
+		.hwtimestamp = &stmmac_ptp,
+		.mode = &dwmac4_ring_mode_ops,
+		.setup = dwmac4_setup,
+		.quirks = NULL,
+	}, {
+		.gmac = false,
+		.gmac4 = true,
+		.min_id = DWMAC_CORE_5_10,
+		.desc = &dwmac4_desc_ops,
+		.dma = &dwmac410_dma_ops,
+		.mac = &dwmac510_ops,
+		.hwtimestamp = &stmmac_ptp,
+		.mode = &dwmac4_ring_mode_ops,
+		.setup = dwmac4_setup,
+		.quirks = NULL,
+	}
+};
+
+int stmmac_hwif_init(struct stmmac_priv *priv)
+{
+	bool needs_gmac4 = priv->plat->has_gmac4;
+	bool needs_gmac = priv->plat->has_gmac;
+	const struct stmmac_hwif_entry *entry;
+	struct mac_device_info *mac;
+	int i, ret;
+	u32 id;
+
+	if (needs_gmac) {
+		id = stmmac_get_id(priv, GMAC_VERSION);
+	} else {
+		id = stmmac_get_id(priv, GMAC4_VERSION);
+	}
+
+	/* Save ID for later use */
+	priv->synopsys_id = id;
+
+	/* Check for HW specific setup first */
+	if (priv->plat->setup) {
+		priv->hw = priv->plat->setup(priv);
+		if (!priv->hw)
+			return -ENOMEM;
+		return 0;
+	}
+
+	mac = devm_kzalloc(priv->device, sizeof(*mac), GFP_KERNEL);
+	if (!mac)
+		return -ENOMEM;
+
+	/* Fallback to generic HW */
+	for (i = ARRAY_SIZE(stmmac_hw) - 1; i >= 0; i--) {
+		entry = &stmmac_hw[i];
+
+		if (needs_gmac ^ entry->gmac)
+			continue;
+		if (needs_gmac4 ^ entry->gmac4)
+			continue;
+		if (id < entry->min_id)
+			continue;
+
+		mac->desc = entry->desc;
+		mac->dma = entry->dma;
+		mac->mac = entry->mac;
+		mac->ptp = entry->hwtimestamp;
+		mac->mode = entry->mode;
+
+		priv->hw = mac;
+
+		/* Entry found */
+		ret = entry->setup(priv);
+		if (ret)
+			return ret;
+
+		/* Run quirks, if needed */
+		if (entry->quirks) {
+			ret = entry->quirks(priv);
+			if (ret)
+				return ret;
+		}
+
+		return 0;
+	}
+
+	dev_err(priv->device, "Failed to find HW IF (id=0x%x, gmac=%d/%d)\n",
+			id, needs_gmac, needs_gmac4);
+	return -EINVAL;
+}
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index f81ded4a9946..bfad61607f07 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -418,4 +418,21 @@ struct stmmac_mode_ops {
 #define stmmac_clean_desc3(__priv, __args...) \
 	stmmac_do_void_callback(__priv, mode, clean_desc3, __args)
 
+struct stmmac_priv;
+
+extern const struct stmmac_ops dwmac100_ops;
+extern const struct stmmac_dma_ops dwmac100_dma_ops;
+extern const struct stmmac_ops dwmac1000_ops;
+extern const struct stmmac_dma_ops dwmac1000_dma_ops;
+extern const struct stmmac_ops dwmac4_ops;
+extern const struct stmmac_dma_ops dwmac4_dma_ops;
+extern const struct stmmac_ops dwmac410_ops;
+extern const struct stmmac_dma_ops dwmac410_dma_ops;
+extern const struct stmmac_ops dwmac510_ops;
+
+#define GMAC_VERSION		0x00000020	/* GMAC CORE Version */
+#define GMAC4_VERSION		0x00000110	/* GMAC4+ CORE Version */
+
+int stmmac_hwif_init(struct stmmac_priv *priv);
+
 #endif /* __STMMAC_HWIF_H__ */
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index da50451f8999..2443f20e07bf 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -130,6 +130,7 @@ struct stmmac_priv {
 	int eee_active;
 	int tx_lpi_timer;
 	unsigned int mode;
+	unsigned int chain_mode;
 	int extend_desc;
 	struct ptp_clock *ptp_clock;
 	struct ptp_clock_info ptp_clock_ops;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 48b55407a953..0135fd3aa6ef 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -769,7 +769,6 @@ static int stmmac_init_ptp(struct stmmac_priv *priv)
 		netdev_info(priv->dev,
 			    "IEEE 1588-2008 Advanced Timestamp supported\n");
 
-	priv->hw->ptp = &stmmac_ptp;
 	priv->hwts_tx_en = 0;
 	priv->hwts_rx_en = 0;
 
@@ -2121,32 +2120,6 @@ static void stmmac_mmc_setup(struct stmmac_priv *priv)
 		netdev_info(priv->dev, "No MAC Management Counters available\n");
 }
 
-/**
- * stmmac_selec_desc_mode - to select among: normal/alternate/extend descriptors
- * @priv: driver private structure
- * Description: select the Enhanced/Alternate or Normal descriptors.
- * In case of Enhanced/Alternate, it checks if the extended descriptors are
- * supported by the HW capability register.
- */
-static void stmmac_selec_desc_mode(struct stmmac_priv *priv)
-{
-	if (priv->plat->enh_desc) {
-		dev_info(priv->device, "Enhanced/Alternate descriptors\n");
-
-		/* GMAC older than 3.50 has no extended descriptors */
-		if (priv->synopsys_id >= DWMAC_CORE_3_50) {
-			dev_info(priv->device, "Enabled extended descriptors\n");
-			priv->extend_desc = 1;
-		} else
-			dev_warn(priv->device, "Extended descriptors not supported\n");
-
-		priv->hw->desc = &enh_desc_ops;
-	} else {
-		dev_info(priv->device, "Normal descriptors\n");
-		priv->hw->desc = &ndesc_ops;
-	}
-}
-
 /**
  * stmmac_get_hw_features - get MAC capabilities from the HW cap. register.
  * @priv: driver private structure
@@ -4098,49 +4071,17 @@ static void stmmac_service_task(struct work_struct *work)
  */
 static int stmmac_hw_init(struct stmmac_priv *priv)
 {
-	struct mac_device_info *mac;
-
-	/* Identify the MAC HW device */
-	if (priv->plat->setup) {
-		mac = priv->plat->setup(priv);
-	} else if (priv->plat->has_gmac) {
-		priv->dev->priv_flags |= IFF_UNICAST_FLT;
-		mac = dwmac1000_setup(priv->ioaddr,
-				      priv->plat->multicast_filter_bins,
-				      priv->plat->unicast_filter_entries,
-				      &priv->synopsys_id);
-	} else if (priv->plat->has_gmac4) {
-		priv->dev->priv_flags |= IFF_UNICAST_FLT;
-		mac = dwmac4_setup(priv->ioaddr,
-				   priv->plat->multicast_filter_bins,
-				   priv->plat->unicast_filter_entries,
-				   &priv->synopsys_id);
-	} else {
-		mac = dwmac100_setup(priv->ioaddr, &priv->synopsys_id);
-	}
-	if (!mac)
-		return -ENOMEM;
-
-	priv->hw = mac;
+	int ret;
 
 	/* dwmac-sun8i only work in chain mode */
 	if (priv->plat->has_sun8i)
 		chain_mode = 1;
+	priv->chain_mode = chain_mode;
 
-	/* To use the chained or ring mode */
-	if (priv->synopsys_id >= DWMAC_CORE_4_00) {
-		priv->hw->mode = &dwmac4_ring_mode_ops;
-	} else {
-		if (chain_mode) {
-			priv->hw->mode = &chain_mode_ops;
-			dev_info(priv->device, "Chain mode enabled\n");
-			priv->mode = STMMAC_CHAIN_MODE;
-		} else {
-			priv->hw->mode = &ring_mode_ops;
-			dev_info(priv->device, "Ring mode enabled\n");
-			priv->mode = STMMAC_RING_MODE;
-		}
-	}
+	/* Initialize HW Interface */
+	ret = stmmac_hwif_init(priv);
+	if (ret)
+		return ret;
 
 	/* Get the HW capability (new GMAC newer than 3.50a) */
 	priv->hw_cap_support = stmmac_get_hw_features(priv);
@@ -4174,12 +4115,6 @@ static int stmmac_hw_init(struct stmmac_priv *priv)
 		dev_info(priv->device, "No HW DMA feature register supported\n");
 	}
 
-	/* To use alternate (extended), normal or GMAC4 descriptor structures */
-	if (priv->synopsys_id >= DWMAC_CORE_4_00)
-		priv->hw->desc = &dwmac4_desc_ops;
-	else
-		stmmac_selec_desc_mode(priv);
-
 	if (priv->plat->rx_coe) {
 		priv->hw->rx_csum = priv->plat->rx_coe;
 		dev_info(priv->device, "RX Checksum Offload Engine supported\n");
-- 
cgit v1.2.3-59-g8ed1b


From b60bfdfec5b8ec88552e75c8bd99f1ebfa66a6e0 Mon Sep 17 00:00:00 2001
From: Denis Bolotin <denis.bolotin@cavium.com>
Date: Mon, 23 Apr 2018 14:56:04 +0300
Subject: qed: Delete unused parameter p_ptt from mcp APIs

Since nvm images attributes are cached during driver load, acquiring ptt
is not needed when calling qed_mcp_get_nvm_image().

Signed-off-by: Denis Bolotin <denis.bolotin@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_main.c | 9 +--------
 drivers/net/ethernet/qlogic/qed/qed_mcp.c  | 4 +---
 drivers/net/ethernet/qlogic/qed/qed_mcp.h  | 2 --
 3 files changed, 2 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 9854aa9139af..d1d3787affe8 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -1894,15 +1894,8 @@ static int qed_nvm_get_image(struct qed_dev *cdev, enum qed_nvm_images type,
 			     u8 *buf, u16 len)
 {
 	struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev);
-	struct qed_ptt *ptt = qed_ptt_acquire(hwfn);
-	int rc;
-
-	if (!ptt)
-		return -EAGAIN;
 
-	rc = qed_mcp_get_nvm_image(hwfn, ptt, type, buf, len);
-	qed_ptt_release(hwfn, ptt);
-	return rc;
+	return qed_mcp_get_nvm_image(hwfn, type, buf, len);
 }
 
 static int qed_set_coalesce(struct qed_dev *cdev, u16 rx_coal, u16 tx_coal,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index ec0d425766a7..1377ad172fb3 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -2531,7 +2531,6 @@ err0:
 
 static int
 qed_mcp_get_nvm_image_att(struct qed_hwfn *p_hwfn,
-			  struct qed_ptt *p_ptt,
 			  enum qed_nvm_images image_id,
 			  struct qed_nvm_image_att *p_image_att)
 {
@@ -2569,7 +2568,6 @@ qed_mcp_get_nvm_image_att(struct qed_hwfn *p_hwfn,
 }
 
 int qed_mcp_get_nvm_image(struct qed_hwfn *p_hwfn,
-			  struct qed_ptt *p_ptt,
 			  enum qed_nvm_images image_id,
 			  u8 *p_buffer, u32 buffer_len)
 {
@@ -2578,7 +2576,7 @@ int qed_mcp_get_nvm_image(struct qed_hwfn *p_hwfn,
 
 	memset(p_buffer, 0, buffer_len);
 
-	rc = qed_mcp_get_nvm_image_att(p_hwfn, p_ptt, image_id, &image_att);
+	rc = qed_mcp_get_nvm_image_att(p_hwfn, image_id, &image_att);
 	if (rc)
 		return rc;
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
index 8a5c988d0c3c..dd62c38b3bd3 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
@@ -486,7 +486,6 @@ struct qed_nvm_image_att {
  * @brief Allows reading a whole nvram image
  *
  * @param p_hwfn
- * @param p_ptt
  * @param image_id - image requested for reading
  * @param p_buffer - allocated buffer into which to fill data
  * @param buffer_len - length of the allocated buffer.
@@ -494,7 +493,6 @@ struct qed_nvm_image_att {
  * @return 0 iff p_buffer now contains the nvram image.
  */
 int qed_mcp_get_nvm_image(struct qed_hwfn *p_hwfn,
-			  struct qed_ptt *p_ptt,
 			  enum qed_nvm_images image_id,
 			  u8 *p_buffer, u32 buffer_len);
 
-- 
cgit v1.2.3-59-g8ed1b


From 1ac4329a1cff2e0bb12b71c13ad53a0e05bc87a6 Mon Sep 17 00:00:00 2001
From: Denis Bolotin <denis.bolotin@cavium.com>
Date: Mon, 23 Apr 2018 14:56:05 +0300
Subject: qed: Add configuration information to register dump and debug data

Configuration information is added to the debug data collection, in
addition to register dump.
Added qed_dbg_nvm_image() that receives an image type, allocates a
buffer and reads the image. The images are saved in the buffers and the
dump size is updated.

Signed-off-by: Denis Bolotin <denis.bolotin@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_debug.c | 113 +++++++++++++++++++++++++++-
 drivers/net/ethernet/qlogic/qed/qed_mcp.c   |  14 +++-
 drivers/net/ethernet/qlogic/qed/qed_mcp.h   |  14 ++++
 include/linux/qed/qed_if.h                  |   3 +
 4 files changed, 139 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.c b/drivers/net/ethernet/qlogic/qed/qed_debug.c
index 4926c5532fba..b3211c7d38c2 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_debug.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_debug.c
@@ -7778,6 +7778,57 @@ int qed_dbg_igu_fifo_size(struct qed_dev *cdev)
 	return qed_dbg_feature_size(cdev, DBG_FEATURE_IGU_FIFO);
 }
 
+int qed_dbg_nvm_image_length(struct qed_hwfn *p_hwfn,
+			     enum qed_nvm_images image_id, u32 *length)
+{
+	struct qed_nvm_image_att image_att;
+	int rc;
+
+	*length = 0;
+	rc = qed_mcp_get_nvm_image_att(p_hwfn, image_id, &image_att);
+	if (rc)
+		return rc;
+
+	*length = image_att.length;
+
+	return rc;
+}
+
+int qed_dbg_nvm_image(struct qed_dev *cdev, void *buffer,
+		      u32 *num_dumped_bytes, enum qed_nvm_images image_id)
+{
+	struct qed_hwfn *p_hwfn =
+		&cdev->hwfns[cdev->dbg_params.engine_for_debug];
+	u32 len_rounded, i;
+	__be32 val;
+	int rc;
+
+	*num_dumped_bytes = 0;
+	rc = qed_dbg_nvm_image_length(p_hwfn, image_id, &len_rounded);
+	if (rc)
+		return rc;
+
+	DP_NOTICE(p_hwfn->cdev,
+		  "Collecting a debug feature [\"nvram image %d\"]\n",
+		  image_id);
+
+	len_rounded = roundup(len_rounded, sizeof(u32));
+	rc = qed_mcp_get_nvm_image(p_hwfn, image_id, buffer, len_rounded);
+	if (rc)
+		return rc;
+
+	/* QED_NVM_IMAGE_NVM_META image is not swapped like other images */
+	if (image_id != QED_NVM_IMAGE_NVM_META)
+		for (i = 0; i < len_rounded; i += 4) {
+			val = cpu_to_be32(*(u32 *)(buffer + i));
+			*(u32 *)(buffer + i) = val;
+		}
+
+	*num_dumped_bytes = len_rounded;
+
+	return rc;
+}
+
 int qed_dbg_protection_override(struct qed_dev *cdev, void *buffer,
 				u32 *num_dumped_bytes)
 {
@@ -7831,6 +7882,9 @@ enum debug_print_features {
 	IGU_FIFO = 6,
 	PHY = 7,
 	FW_ASSERTS = 8,
+	NVM_CFG1 = 9,
+	DEFAULT_CFG = 10,
+	NVM_META = 11,
 };
 
 static u32 qed_calc_regdump_header(enum debug_print_features feature,
@@ -7965,13 +8019,61 @@ int qed_dbg_all_data(struct qed_dev *cdev, void *buffer)
 		DP_ERR(cdev, "qed_dbg_mcp_trace failed. rc = %d\n", rc);
 	}
 
+	/* nvm cfg1 */
+	rc = qed_dbg_nvm_image(cdev,
+			       (u8 *)buffer + offset + REGDUMP_HEADER_SIZE,
+			       &feature_size, QED_NVM_IMAGE_NVM_CFG1);
+	if (!rc) {
+		*(u32 *)((u8 *)buffer + offset) =
+		    qed_calc_regdump_header(NVM_CFG1, cur_engine,
+					    feature_size, omit_engine);
+		offset += (feature_size + REGDUMP_HEADER_SIZE);
+	} else if (rc != -ENOENT) {
+		DP_ERR(cdev,
+		       "qed_dbg_nvm_image failed for image  %d (%s), rc = %d\n",
+		       QED_NVM_IMAGE_NVM_CFG1, "QED_NVM_IMAGE_NVM_CFG1", rc);
+	}
+
+	/* nvm default */
+	rc = qed_dbg_nvm_image(cdev,
+			       (u8 *)buffer + offset + REGDUMP_HEADER_SIZE,
+			       &feature_size, QED_NVM_IMAGE_DEFAULT_CFG);
+	if (!rc) {
+		*(u32 *)((u8 *)buffer + offset) =
+		    qed_calc_regdump_header(DEFAULT_CFG, cur_engine,
+					    feature_size, omit_engine);
+		offset += (feature_size + REGDUMP_HEADER_SIZE);
+	} else if (rc != -ENOENT) {
+		DP_ERR(cdev,
+		       "qed_dbg_nvm_image failed for image %d (%s), rc = %d\n",
+		       QED_NVM_IMAGE_DEFAULT_CFG, "QED_NVM_IMAGE_DEFAULT_CFG",
+		       rc);
+	}
+
+	/* nvm meta */
+	rc = qed_dbg_nvm_image(cdev,
+			       (u8 *)buffer + offset + REGDUMP_HEADER_SIZE,
+			       &feature_size, QED_NVM_IMAGE_NVM_META);
+	if (!rc) {
+		*(u32 *)((u8 *)buffer + offset) =
+		    qed_calc_regdump_header(NVM_META, cur_engine,
+					    feature_size, omit_engine);
+		offset += (feature_size + REGDUMP_HEADER_SIZE);
+	} else if (rc != -ENOENT) {
+		DP_ERR(cdev,
+		       "qed_dbg_nvm_image failed for image %d (%s), rc = %d\n",
+		       QED_NVM_IMAGE_NVM_META, "QED_NVM_IMAGE_NVM_META", rc);
+	}
+
 	return 0;
 }
 
 int qed_dbg_all_data_size(struct qed_dev *cdev)
 {
+	struct qed_hwfn *p_hwfn =
+		&cdev->hwfns[cdev->dbg_params.engine_for_debug];
+	u32 regs_len = 0, image_len = 0;
 	u8 cur_engine, org_engine;
-	u32 regs_len = 0;
 
 	org_engine = qed_get_debug_engine(cdev);
 	for (cur_engine = 0; cur_engine < cdev->num_hwfns; cur_engine++) {
@@ -7993,6 +8095,15 @@ int qed_dbg_all_data_size(struct qed_dev *cdev)
 
 	/* Engine common */
 	regs_len += REGDUMP_HEADER_SIZE + qed_dbg_mcp_trace_size(cdev);
+	qed_dbg_nvm_image_length(p_hwfn, QED_NVM_IMAGE_NVM_CFG1, &image_len);
+	if (image_len)
+		regs_len += REGDUMP_HEADER_SIZE + image_len;
+	qed_dbg_nvm_image_length(p_hwfn, QED_NVM_IMAGE_DEFAULT_CFG, &image_len);
+	if (image_len)
+		regs_len += REGDUMP_HEADER_SIZE + image_len;
+	qed_dbg_nvm_image_length(p_hwfn, QED_NVM_IMAGE_NVM_META, &image_len);
+	if (image_len)
+		regs_len += REGDUMP_HEADER_SIZE + image_len;
 
 	return regs_len;
 }
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index 1377ad172fb3..0550f0ee11b3 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -2529,7 +2529,7 @@ err0:
 	return rc;
 }
 
-static int
+int
 qed_mcp_get_nvm_image_att(struct qed_hwfn *p_hwfn,
 			  enum qed_nvm_images image_id,
 			  struct qed_nvm_image_att *p_image_att)
@@ -2545,6 +2545,15 @@ qed_mcp_get_nvm_image_att(struct qed_hwfn *p_hwfn,
 	case QED_NVM_IMAGE_FCOE_CFG:
 		type = NVM_TYPE_FCOE_CFG;
 		break;
+	case QED_NVM_IMAGE_NVM_CFG1:
+		type = NVM_TYPE_NVM_CFG1;
+		break;
+	case QED_NVM_IMAGE_DEFAULT_CFG:
+		type = NVM_TYPE_DEFAULT_CFG;
+		break;
+	case QED_NVM_IMAGE_NVM_META:
+		type = NVM_TYPE_META;
+		break;
 	default:
 		DP_NOTICE(p_hwfn, "Unknown request of image_id %08x\n",
 			  image_id);
@@ -2588,9 +2597,6 @@ int qed_mcp_get_nvm_image(struct qed_hwfn *p_hwfn,
 		return -EINVAL;
 	}
 
-	/* Each NVM image is suffixed by CRC; Upper-layer has no need for it */
-	image_att.length -= 4;
-
 	if (image_att.length > buffer_len) {
 		DP_VERBOSE(p_hwfn,
 			   QED_MSG_STORAGE,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
index dd62c38b3bd3..3af3896420b9 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
@@ -482,6 +482,20 @@ struct qed_nvm_image_att {
 	u32 length;
 };
 
+/**
+ * @brief Allows reading a whole nvram image
+ *
+ * @param p_hwfn
+ * @param image_id - image to get attributes for
+ * @param p_image_att - image attributes structure into which to fill data
+ *
+ * @return int - 0 - operation was successful.
+ */
+int
+qed_mcp_get_nvm_image_att(struct qed_hwfn *p_hwfn,
+			  enum qed_nvm_images image_id,
+			  struct qed_nvm_image_att *p_image_att);
+
 /**
  * @brief Allows reading a whole nvram image
  *
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index b5b2bc9eacca..e53f9c7c2809 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -159,6 +159,9 @@ struct qed_dcbx_get {
 enum qed_nvm_images {
 	QED_NVM_IMAGE_ISCSI_CFG,
 	QED_NVM_IMAGE_FCOE_CFG,
+	QED_NVM_IMAGE_NVM_CFG1,
+	QED_NVM_IMAGE_DEFAULT_CFG,
+	QED_NVM_IMAGE_NVM_META,
 };
 
 struct qed_link_eee_params {
-- 
cgit v1.2.3-59-g8ed1b


From c3c14da0288de79ccef2c2380cf5737077fa64ed Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 23 Apr 2018 11:32:06 -0700
Subject: net/ipv6: add rcu locking to ip6_negative_advice

syzbot reported a suspicious rcu_dereference_check:
  __dump_stack lib/dump_stack.c:77 [inline]
  dump_stack+0x1b9/0x294 lib/dump_stack.c:113
  lockdep_rcu_suspicious+0x14a/0x153 kernel/locking/lockdep.c:4592
  rt6_check_expired+0x38b/0x3e0 net/ipv6/route.c:410
  ip6_negative_advice+0x67/0xc0 net/ipv6/route.c:2204
  dst_negative_advice include/net/sock.h:1786 [inline]
  sock_setsockopt+0x138f/0x1fe0 net/core/sock.c:1051
  __sys_setsockopt+0x2df/0x390 net/socket.c:1899
  SYSC_setsockopt net/socket.c:1914 [inline]
  SyS_setsockopt+0x34/0x50 net/socket.c:1911

Add rcu locking around call to rt6_check_expired in
ip6_negative_advice.

Fixes: a68886a69180 ("net/ipv6: Make from in rt6_info rcu protected")
Reported-by: syzbot+2422c9e35796659d2273@syzkaller.appspotmail.com
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 0407bbc5a028..354a5b8d016f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2201,10 +2201,12 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
 
 	if (rt) {
 		if (rt->rt6i_flags & RTF_CACHE) {
+			rcu_read_lock();
 			if (rt6_check_expired(rt)) {
 				rt6_remove_exception_rt(rt);
 				dst = NULL;
 			}
+			rcu_read_unlock();
 		} else {
 			dst_release(dst);
 			dst = NULL;
-- 
cgit v1.2.3-59-g8ed1b


From 8a14e46f140230ca889c2b0a4753ae8ea45fd770 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 23 Apr 2018 11:32:07 -0700
Subject: net/ipv6: Fix missing rcu dereferences on from

kbuild test robot reported 2 uses of rt->from not properly accessed
using rcu_dereference:
1. add rcu_dereference_protected to rt6_remove_exception_rt and make
   sure it is always called with rcu lock held.

2. change rt6_do_redirect to take a reference on 'from' when accessed
   the first time so it can be used the sceond time outside of the lock

Fixes: a68886a69180 ("net/ipv6: Make from in rt6_info rcu protected")
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 354a5b8d016f..ac3e51631c65 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1541,11 +1541,13 @@ static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
 static int rt6_remove_exception_rt(struct rt6_info *rt)
 {
 	struct rt6_exception_bucket *bucket;
-	struct fib6_info *from = rt->from;
 	struct in6_addr *src_key = NULL;
 	struct rt6_exception *rt6_ex;
+	struct fib6_info *from;
 	int err;
 
+	from = rcu_dereference_protected(rt->from,
+					 lockdep_is_held(&rt6_exception_lock));
 	if (!from ||
 	    !(rt->rt6i_flags & RTF_CACHE))
 		return -EINVAL;
@@ -2223,6 +2225,7 @@ static void ip6_link_failure(struct sk_buff *skb)
 
 	rt = (struct rt6_info *) skb_dst(skb);
 	if (rt) {
+		rcu_read_lock();
 		if (rt->rt6i_flags & RTF_CACHE) {
 			if (dst_hold_safe(&rt->dst))
 				rt6_remove_exception_rt(rt);
@@ -2230,15 +2233,14 @@ static void ip6_link_failure(struct sk_buff *skb)
 			struct fib6_info *from;
 			struct fib6_node *fn;
 
-			rcu_read_lock();
 			from = rcu_dereference(rt->from);
 			if (from) {
 				fn = rcu_dereference(from->fib6_node);
 				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
 					fn->fn_sernum = -1;
 			}
-			rcu_read_unlock();
 		}
+		rcu_read_unlock();
 	}
 }
 
@@ -3340,8 +3342,10 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 
 	rcu_read_lock();
 	from = rcu_dereference(rt->from);
-	nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
+	fib6_info_hold(from);
 	rcu_read_unlock();
+
+	nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
 	if (!nrt)
 		goto out;
 
@@ -3355,7 +3359,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 	 * a cached route because rt6_insert_exception() will
 	 * takes care of it
 	 */
-	if (rt6_insert_exception(nrt, rt->from)) {
+	if (rt6_insert_exception(nrt, from)) {
 		dst_release_immediate(&nrt->dst);
 		goto out;
 	}
@@ -3367,6 +3371,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
 
 out:
+	fib6_info_release(from);
 	neigh_release(neigh);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 064223c1231ce508efaded6576ffdb07de9307b5 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Mon, 23 Apr 2018 13:49:38 +0100
Subject: dca: make function dca_common_get_tag static

Function dca_common_get_tag is local to the source and does not need to be
in global scope, so make it static.

Cleans up sparse warning:
drivers/dca/dca-core.c:273:4: warning: symbol 'dca_common_get_tag' was
not declared. Should it be static?

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/dca/dca-core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/dca/dca-core.c b/drivers/dca/dca-core.c
index 7afbb28d6a0f..1bc5ffb338c8 100644
--- a/drivers/dca/dca-core.c
+++ b/drivers/dca/dca-core.c
@@ -270,7 +270,7 @@ EXPORT_SYMBOL_GPL(dca_remove_requester);
  * @dev - the device that wants dca service
  * @cpu - the cpuid as returned by get_cpu()
  */
-u8 dca_common_get_tag(struct device *dev, int cpu)
+static u8 dca_common_get_tag(struct device *dev, int cpu)
 {
 	struct dca_provider *dca;
 	u8 tag;
-- 
cgit v1.2.3-59-g8ed1b


From b300fcf883aca9b96f9281cd05df4899fae10917 Mon Sep 17 00:00:00 2001
From: Anders Roxell <anders.roxell@linaro.org>
Date: Mon, 23 Apr 2018 16:00:50 +0200
Subject: selftests: net: update .gitignore with missing test

Fixes: 192dc405f308 ("selftests: net: add tcp_mmap program")
Signed-off-by: Anders Roxell <anders.roxell@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/.gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index c612d6e38c62..5559f6add046 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -7,3 +7,4 @@ reuseport_bpf_cpu
 reuseport_bpf_numa
 reuseport_dualstack
 reuseaddr_conflict
+tcp_mmap
-- 
cgit v1.2.3-59-g8ed1b


From 9c20b9372fbaf6f7d4c05f5f925806a7928f0c73 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Mon, 23 Apr 2018 20:08:41 -0700
Subject: net: fib_rules: fix l3mdev netlink attr processing

Fixes: b16fb418b1bf ("net: fib_rules: add extack support")
Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/fib_rules.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index ebd9351b213a..2271c80fd967 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -541,8 +541,10 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
 		nlrule->l3mdev = nla_get_u8(tb[FRA_L3MDEV]);
 		if (nlrule->l3mdev != 1)
 #endif
+		{
 			NL_SET_ERR_MSG(extack, "Invalid l3mdev");
 			goto errout_free;
+		}
 	}
 
 	nlrule->action = frh->action;
-- 
cgit v1.2.3-59-g8ed1b


From cea19a6ce8bf0518d156beea419d822021cc3705 Mon Sep 17 00:00:00 2001
From: Carl Huang <cjhuang@codeaurora.org>
Date: Thu, 19 Apr 2018 19:39:38 +0300
Subject: ath10k: add WMI_SERVICE_AVAILABLE_EVENT support

Add WMI_SERVICE_AVAILABLE_EVENT to extend WMI_SERVICE_READY_EVENT,
the 128bit service map in WMI_SERVICE_READY_EVENT is not enough
for firmware to notice new WLAN service to host driver. Hereby,
for thoese new WLAN service, firmware will notice host driver by
WMI_SERVICE_AVAILABLE_EVENT.

Signed-off-by: Alan Liu <alanliu@codeaurora.org>
Signed-off-by: Carl Huang <cjhuang@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/wmi-ops.h |  24 +++
 drivers/net/wireless/ath/ath10k/wmi-tlv.c |  38 ++++
 drivers/net/wireless/ath/ath10k/wmi-tlv.h | 346 +++++++++++++++++++++++++++++-
 drivers/net/wireless/ath/ath10k/wmi.c     |  21 +-
 drivers/net/wireless/ath/ath10k/wmi.h     |   8 +
 5 files changed, 431 insertions(+), 6 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/wmi-ops.h b/drivers/net/wireless/ath/ath10k/wmi-ops.h
index c35e45340b4f..4f7e0ba071a2 100644
--- a/drivers/net/wireless/ath/ath10k/wmi-ops.h
+++ b/drivers/net/wireless/ath/ath10k/wmi-ops.h
@@ -25,6 +25,7 @@ struct sk_buff;
 struct wmi_ops {
 	void (*rx)(struct ath10k *ar, struct sk_buff *skb);
 	void (*map_svc)(const __le32 *in, unsigned long *out, size_t len);
+	void (*map_svc_ext)(const __le32 *in, unsigned long *out, size_t len);
 
 	int (*pull_scan)(struct ath10k *ar, struct sk_buff *skb,
 			 struct wmi_scan_ev_arg *arg);
@@ -54,6 +55,9 @@ struct wmi_ops {
 			      struct wmi_wow_ev_arg *arg);
 	int (*pull_echo_ev)(struct ath10k *ar, struct sk_buff *skb,
 			    struct wmi_echo_ev_arg *arg);
+	int (*pull_svc_avail)(struct ath10k *ar, struct sk_buff *skb,
+			      struct wmi_svc_avail_ev_arg *arg);
+
 	enum wmi_txbf_conf (*get_txbf_conf_scheme)(struct ath10k *ar);
 
 	struct sk_buff *(*gen_pdev_suspend)(struct ath10k *ar, u32 suspend_opt);
@@ -229,6 +233,17 @@ ath10k_wmi_map_svc(struct ath10k *ar, const __le32 *in, unsigned long *out,
 	return 0;
 }
 
+static inline int
+ath10k_wmi_map_svc_ext(struct ath10k *ar, const __le32 *in, unsigned long *out,
+		       size_t len)
+{
+	if (!ar->wmi.ops->map_svc_ext)
+		return -EOPNOTSUPP;
+
+	ar->wmi.ops->map_svc_ext(in, out, len);
+	return 0;
+}
+
 static inline int
 ath10k_wmi_pull_scan(struct ath10k *ar, struct sk_buff *skb,
 		     struct wmi_scan_ev_arg *arg)
@@ -329,6 +344,15 @@ ath10k_wmi_pull_rdy(struct ath10k *ar, struct sk_buff *skb,
 	return ar->wmi.ops->pull_rdy(ar, skb, arg);
 }
 
+static inline int
+ath10k_wmi_pull_svc_avail(struct ath10k *ar, struct sk_buff *skb,
+			  struct wmi_svc_avail_ev_arg *arg)
+{
+	if (!ar->wmi.ops->pull_svc_avail)
+		return -EOPNOTSUPP;
+	return ar->wmi.ops->pull_svc_avail(ar, skb, arg);
+}
+
 static inline int
 ath10k_wmi_pull_fw_stats(struct ath10k *ar, struct sk_buff *skb,
 			 struct ath10k_fw_stats *stats)
diff --git a/drivers/net/wireless/ath/ath10k/wmi-tlv.c b/drivers/net/wireless/ath/ath10k/wmi-tlv.c
index 9d1b0a459069..31866e3b74fc 100644
--- a/drivers/net/wireless/ath/ath10k/wmi-tlv.c
+++ b/drivers/net/wireless/ath/ath10k/wmi-tlv.c
@@ -594,6 +594,9 @@ static void ath10k_wmi_tlv_op_rx(struct ath10k *ar, struct sk_buff *skb)
 	case WMI_TLV_READY_EVENTID:
 		ath10k_wmi_event_ready(ar, skb);
 		break;
+	case WMI_TLV_SERVICE_AVAILABLE_EVENTID:
+		ath10k_wmi_event_service_available(ar, skb);
+		break;
 	case WMI_TLV_OFFLOAD_BCN_TX_STATUS_EVENTID:
 		ath10k_wmi_tlv_event_bcn_tx_status(ar, skb);
 		break;
@@ -1117,6 +1120,39 @@ static int ath10k_wmi_tlv_op_pull_rdy_ev(struct ath10k *ar,
 	return 0;
 }
 
+static int ath10k_wmi_tlv_svc_avail_parse(struct ath10k *ar, u16 tag, u16 len,
+					  const void *ptr, void *data)
+{
+	struct wmi_svc_avail_ev_arg *arg = data;
+
+	switch (tag) {
+	case WMI_TLV_TAG_STRUCT_SERVICE_AVAILABLE_EVENT:
+		arg->service_map_ext_len = *(__le32 *)ptr;
+		arg->service_map_ext = ptr + sizeof(__le32);
+		return 0;
+	default:
+		break;
+	}
+	return -EPROTO;
+}
+
+static int ath10k_wmi_tlv_op_pull_svc_avail(struct ath10k *ar,
+					    struct sk_buff *skb,
+					    struct wmi_svc_avail_ev_arg *arg)
+{
+	int ret;
+
+	ret = ath10k_wmi_tlv_iter(ar, skb->data, skb->len,
+				  ath10k_wmi_tlv_svc_avail_parse, arg);
+
+	if (ret) {
+		ath10k_warn(ar, "failed to parse svc_avail tlv: %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
 static void ath10k_wmi_tlv_pull_vdev_stats(const struct wmi_tlv_vdev_stats *src,
 					   struct ath10k_fw_stats_vdev *dst)
 {
@@ -3740,6 +3776,7 @@ static struct wmi_vdev_param_map wmi_tlv_vdev_param_map = {
 static const struct wmi_ops wmi_tlv_ops = {
 	.rx = ath10k_wmi_tlv_op_rx,
 	.map_svc = wmi_tlv_svc_map,
+	.map_svc_ext = wmi_tlv_svc_map_ext,
 
 	.pull_scan = ath10k_wmi_tlv_op_pull_scan_ev,
 	.pull_mgmt_rx = ath10k_wmi_tlv_op_pull_mgmt_rx_ev,
@@ -3751,6 +3788,7 @@ static const struct wmi_ops wmi_tlv_ops = {
 	.pull_phyerr = ath10k_wmi_op_pull_phyerr_ev,
 	.pull_svc_rdy = ath10k_wmi_tlv_op_pull_svc_rdy_ev,
 	.pull_rdy = ath10k_wmi_tlv_op_pull_rdy_ev,
+	.pull_svc_avail = ath10k_wmi_tlv_op_pull_svc_avail,
 	.pull_fw_stats = ath10k_wmi_tlv_op_pull_fw_stats,
 	.pull_roam_ev = ath10k_wmi_tlv_op_pull_roam_ev,
 	.pull_wow_event = ath10k_wmi_tlv_op_pull_wow_ev,
diff --git a/drivers/net/wireless/ath/ath10k/wmi-tlv.h b/drivers/net/wireless/ath/ath10k/wmi-tlv.h
index fa3773ec7c68..529b91b17061 100644
--- a/drivers/net/wireless/ath/ath10k/wmi-tlv.h
+++ b/drivers/net/wireless/ath/ath10k/wmi-tlv.h
@@ -295,6 +295,7 @@ enum wmi_tlv_cmd_id {
 enum wmi_tlv_event_id {
 	WMI_TLV_SERVICE_READY_EVENTID = 0x1,
 	WMI_TLV_READY_EVENTID,
+	WMI_TLV_SERVICE_AVAILABLE_EVENTID,
 	WMI_TLV_SCAN_EVENTID = WMI_TLV_EV(WMI_TLV_GRP_SCAN),
 	WMI_TLV_PDEV_TPC_CONFIG_EVENTID = WMI_TLV_EV(WMI_TLV_GRP_PDEV),
 	WMI_TLV_CHAN_INFO_EVENTID,
@@ -949,6 +950,275 @@ enum wmi_tlv_tag {
 	WMI_TLV_TAG_STRUCT_PACKET_FILTER_ENABLE,
 	WMI_TLV_TAG_STRUCT_SAP_SET_BLACKLIST_PARAM_CMD,
 	WMI_TLV_TAG_STRUCT_MGMT_TX_CMD,
+	WMI_TLV_TAG_STRUCT_MGMT_TX_COMPL_EVENT,
+	WMI_TLV_TAG_STRUCT_SOC_SET_ANTENNA_MODE_CMD,
+	WMI_TLV_TAG_STRUCT_WOW_UDP_SVC_OFLD_CMD,
+	WMI_TLV_TAG_STRUCT_LRO_INFO_CMD,
+	WMI_TLV_TAG_STRUCT_ROAM_EARLYSTOP_RSSI_THRES_PARAM,
+	WMI_TLV_TAG_STRUCT_SERVICE_READY_EXT_EVENT,
+	WMI_TLV_TAG_STRUCT_MAWC_SENSOR_REPORT_IND_CMD,
+	WMI_TLV_TAG_STRUCT_MAWC_ENABLE_SENSOR_EVENT,
+	WMI_TLV_TAG_STRUCT_ROAM_CONFIGURE_MAWC_CMD,
+	WMI_TLV_TAG_STRUCT_NLO_CONFIGURE_MAWC_CMD,
+	WMI_TLV_TAG_STRUCT_EXTSCAN_CONFIGURE_MAWC_CMD,
+	WMI_TLV_TAG_STRUCT_PEER_ASSOC_CONF_EVENT,
+	WMI_TLV_TAG_STRUCT_WOW_HOSTWAKEUP_GPIO_PIN_PATTERN_CONFIG_CMD,
+	WMI_TLV_TAG_STRUCT_AP_PS_EGAP_PARAM_CMD,
+	WMI_TLV_TAG_STRUCT_AP_PS_EGAP_INFO_EVENT,
+	WMI_TLV_TAG_STRUCT_PMF_OFFLOAD_SET_SA_QUERY_CMD,
+	WMI_TLV_TAG_STRUCT_TRANSFER_DATA_TO_FLASH_CMD,
+	WMI_TLV_TAG_STRUCT_TRANSFER_DATA_TO_FLASH_COMPLETE_EVENT,
+	WMI_TLV_TAG_STRUCT_SCPC_EVENT,
+	WMI_TLV_TAG_STRUCT_AP_PS_EGAP_INFO_CHAINMASK_LIST,
+	WMI_TLV_TAG_STRUCT_STA_SMPS_FORCE_MODE_COMPLETE_EVENT,
+	WMI_TLV_TAG_STRUCT_BPF_GET_CAPABILITY_CMD,
+	WMI_TLV_TAG_STRUCT_BPF_CAPABILITY_INFO_EVT,
+	WMI_TLV_TAG_STRUCT_BPF_GET_VDEV_STATS_CMD,
+	WMI_TLV_TAG_STRUCT_BPF_VDEV_STATS_INFO_EVT,
+	WMI_TLV_TAG_STRUCT_BPF_SET_VDEV_INSTRUCTIONS_CMD,
+	WMI_TLV_TAG_STRUCT_BPF_DEL_VDEV_INSTRUCTIONS_CMD,
+	WMI_TLV_TAG_STRUCT_VDEV_DELETE_RESP_EVENT,
+	WMI_TLV_TAG_STRUCT_PEER_DELETE_RESP_EVENT,
+	WMI_TLV_TAG_STRUCT_ROAM_DENSE_THRES_PARAM,
+	WMI_TLV_TAG_STRUCT_ENLO_CANDIDATE_SCORE_PARAM,
+	WMI_TLV_TAG_STRUCT_PEER_UPDATE_WDS_ENTRY_CMD,
+	WMI_TLV_TAG_STRUCT_VDEV_CONFIG_RATEMASK,
+	WMI_TLV_TAG_STRUCT_PDEV_FIPS_CMD,
+	WMI_TLV_TAG_STRUCT_PDEV_SMART_ANT_ENABLE_CMD,
+	WMI_TLV_TAG_STRUCT_PDEV_SMART_ANT_SET_RX_ANTENNA_CMD,
+	WMI_TLV_TAG_STRUCT_PEER_SMART_ANT_SET_TX_ANTENNA_CMD,
+	WMI_TLV_TAG_STRUCT_PEER_SMART_ANT_SET_TRAIN_ANTENNA_CMD,
+	WMI_TLV_TAG_STRUCT_PEER_SMART_ANT_SET_NODE_CONFIG_OPS_CMD,
+	WMI_TLV_TAG_STRUCT_PDEV_SET_ANT_SWITCH_TBL_CMD,
+	WMI_TLV_TAG_STRUCT_PDEV_SET_CTL_TABLE_CMD,
+	WMI_TLV_TAG_STRUCT_PDEV_SET_MIMOGAIN_TABLE_CMD,
+	WMI_TLV_TAG_STRUCT_FWTEST_SET_PARAM_CMD,
+	WMI_TLV_TAG_STRUCT_PEER_ATF_REQUEST,
+	WMI_TLV_TAG_STRUCT_VDEV_ATF_REQUEST,
+	WMI_TLV_TAG_STRUCT_PDEV_GET_ANI_CCK_CONFIG_CMD,
+	WMI_TLV_TAG_STRUCT_PDEV_GET_ANI_OFDM_CONFIG_CMD,
+	WMI_TLV_TAG_STRUCT_INST_RSSI_STATS_RESP,
+	WMI_TLV_TAG_STRUCT_MED_UTIL_REPORT_EVENT,
+	WMI_TLV_TAG_STRUCT_PEER_STA_PS_STATECHANGE_EVENT,
+	WMI_TLV_TAG_STRUCT_WDS_ADDR_EVENT,
+	WMI_TLV_TAG_STRUCT_PEER_RATECODE_LIST_EVENT,
+	WMI_TLV_TAG_STRUCT_PDEV_NFCAL_POWER_ALL_CHANNELS_EVENT,
+	WMI_TLV_TAG_STRUCT_PDEV_TPC_EVENT,
+	WMI_TLV_TAG_STRUCT_ANI_OFDM_EVENT,
+	WMI_TLV_TAG_STRUCT_ANI_CCK_EVENT,
+	WMI_TLV_TAG_STRUCT_PDEV_CHANNEL_HOPPING_EVENT,
+	WMI_TLV_TAG_STRUCT_PDEV_FIPS_EVENT,
+	WMI_TLV_TAG_STRUCT_ATF_PEER_INFO,
+	WMI_TLV_TAG_STRUCT_PDEV_GET_TPC_CMD,
+	WMI_TLV_TAG_STRUCT_VDEV_FILTER_NRP_CONFIG_CMD,
+	WMI_TLV_TAG_STRUCT_QBOOST_CFG_CMD,
+	WMI_TLV_TAG_STRUCT_PDEV_SMART_ANT_GPIO_HANDLE,
+	WMI_TLV_TAG_STRUCT_PEER_SMART_ANT_SET_TX_ANTENNA_SERIES,
+	WMI_TLV_TAG_STRUCT_PEER_SMART_ANT_SET_TRAIN_ANTENNA_PARAM,
+	WMI_TLV_TAG_STRUCT_PDEV_SET_ANT_CTRL_CHAIN,
+	WMI_TLV_TAG_STRUCT_PEER_CCK_OFDM_RATE_INFO,
+	WMI_TLV_TAG_STRUCT_PEER_MCS_RATE_INFO,
+	WMI_TLV_TAG_STRUCT_PDEV_NFCAL_POWER_ALL_CHANNELS_NFDBR,
+	WMI_TLV_TAG_STRUCT_PDEV_NFCAL_POWER_ALL_CHANNELS_NFDBM,
+	WMI_TLV_TAG_STRUCT_PDEV_NFCAL_POWER_ALL_CHANNELS_FREQNUM,
+	WMI_TLV_TAG_STRUCT_MU_REPORT_TOTAL_MU,
+	WMI_TLV_TAG_STRUCT_VDEV_SET_DSCP_TID_MAP_CMD,
+	WMI_TLV_TAG_STRUCT_ROAM_SET_MBO,
+	WMI_TLV_TAG_STRUCT_MIB_STATS_ENABLE_CMD,
+	WMI_TLV_TAG_STRUCT_NAN_DISC_IFACE_CREATED_EVENT,
+	WMI_TLV_TAG_STRUCT_NAN_DISC_IFACE_DELETED_EVENT,
+	WMI_TLV_TAG_STRUCT_NAN_STARTED_CLUSTER_EVENT,
+	WMI_TLV_TAG_STRUCT_NAN_JOINED_CLUSTER_EVENT,
+	WMI_TLV_TAG_STRUCT_NDI_GET_CAP_REQ,
+	WMI_TLV_TAG_STRUCT_NDP_INITIATOR_REQ,
+	WMI_TLV_TAG_STRUCT_NDP_RESPONDER_REQ,
+	WMI_TLV_TAG_STRUCT_NDP_END_REQ,
+	WMI_TLV_TAG_STRUCT_NDI_CAP_RSP_EVENT,
+	WMI_TLV_TAG_STRUCT_NDP_INITIATOR_RSP_EVENT,
+	WMI_TLV_TAG_STRUCT_NDP_RESPONDER_RSP_EVENT,
+	WMI_TLV_TAG_STRUCT_NDP_END_RSP_EVENT,
+	WMI_TLV_TAG_STRUCT_NDP_INDICATION_EVENT,
+	WMI_TLV_TAG_STRUCT_NDP_CONFIRM_EVENT,
+	WMI_TLV_TAG_STRUCT_NDP_END_INDICATION_EVENT,
+	WMI_TLV_TAG_STRUCT_VDEV_SET_QUIET_CMD,
+	WMI_TLV_TAG_STRUCT_PDEV_SET_PCL_CMD,
+	WMI_TLV_TAG_STRUCT_PDEV_SET_HW_MODE_CMD,
+	WMI_TLV_TAG_STRUCT_PDEV_SET_MAC_CONFIG_CMD,
+	WMI_TLV_TAG_STRUCT_PDEV_SET_ANTENNA_MODE_CMD,
+	WMI_TLV_TAG_STRUCT_PDEV_SET_HW_MODE_RESPONSE_EVENT,
+	WMI_TLV_TAG_STRUCT_PDEV_HW_MODE_TRANSITION_EVENT,
+	WMI_TLV_TAG_STRUCT_PDEV_SET_HW_MODE_RESPONSE_VDEV_MAC_ENTRY,
+	WMI_TLV_TAG_STRUCT_PDEV_SET_MAC_CONFIG_RESPONSE_EVENT,
+	WMI_TLV_TAG_STRUCT_COEX_CONFIG_CMD,
+	WMI_TLV_TAG_STRUCT_CONFIG_ENHANCED_MCAST_FILTER,
+	WMI_TLV_TAG_STRUCT_CHAN_AVOID_RPT_ALLOW_CMD,
+	WMI_TLV_TAG_STRUCT_SET_PERIODIC_CHANNEL_STATS_CONFIG,
+	WMI_TLV_TAG_STRUCT_VDEV_SET_CUSTOM_AGGR_SIZE_CMD,
+	WMI_TLV_TAG_STRUCT_PDEV_WAL_POWER_DEBUG_CMD,
+	WMI_TLV_TAG_STRUCT_MAC_PHY_CAPABILITIES,
+	WMI_TLV_TAG_STRUCT_HW_MODE_CAPABILITIES,
+	WMI_TLV_TAG_STRUCT_SOC_MAC_PHY_HW_MODE_CAPS,
+	WMI_TLV_TAG_STRUCT_HAL_REG_CAPABILITIES_EXT,
+	WMI_TLV_TAG_STRUCT_SOC_HAL_REG_CAPABILITIES,
+	WMI_TLV_TAG_STRUCT_VDEV_WISA_CMD,
+	WMI_TLV_TAG_STRUCT_TX_POWER_LEVEL_STATS_EVT,
+	WMI_TLV_TAG_STRUCT_SCAN_ADAPTIVE_DWELL_PARAMETERS_TLV,
+	WMI_TLV_TAG_STRUCT_SCAN_ADAPTIVE_DWELL_CONFIG,
+	WMI_TLV_TAG_STRUCT_WOW_SET_ACTION_WAKE_UP_CMD,
+	WMI_TLV_TAG_STRUCT_NDP_END_RSP_PER_NDI,
+	WMI_TLV_TAG_STRUCT_PEER_BWF_REQUEST,
+	WMI_TLV_TAG_STRUCT_BWF_PEER_INFO,
+	WMI_TLV_TAG_STRUCT_DBGLOG_TIME_STAMP_SYNC_CMD,
+	WMI_TLV_TAG_STRUCT_RMC_SET_LEADER_CMD,
+	WMI_TLV_TAG_STRUCT_RMC_MANUAL_LEADER_EVENT,
+	WMI_TLV_TAG_STRUCT_PER_CHAIN_RSSI_STATS,
+	WMI_TLV_TAG_STRUCT_RSSI_STATS,
+	WMI_TLV_TAG_STRUCT_P2P_LO_START_CMD,
+	WMI_TLV_TAG_STRUCT_P2P_LO_STOP_CMD,
+	WMI_TLV_TAG_STRUCT_P2P_LO_STOPPED_EVENT,
+	WMI_TLV_TAG_STRUCT_PEER_REORDER_QUEUE_SETUP_CMD,
+	WMI_TLV_TAG_STRUCT_PEER_REORDER_QUEUE_REMOVE_CMD,
+	WMI_TLV_TAG_STRUCT_SET_MULTIPLE_MCAST_FILTER_CMD,
+	WMI_TLV_TAG_STRUCT_MGMT_TX_COMPL_BUNDLE_EVENT,
+	WMI_TLV_TAG_STRUCT_READ_DATA_FROM_FLASH_CMD,
+	WMI_TLV_TAG_STRUCT_READ_DATA_FROM_FLASH_EVENT,
+	WMI_TLV_TAG_STRUCT_PDEV_SET_REORDER_TIMEOUT_VAL_CMD,
+	WMI_TLV_TAG_STRUCT_PEER_SET_RX_BLOCKSIZE_CMD,
+	WMI_TLV_TAG_STRUCT_PDEV_SET_WAKEUP_CONFIG_CMDID,
+	WMI_TLV_TAG_STRUCT_TLV_BUF_LEN_PARAM,
+	WMI_TLV_TAG_STRUCT_SERVICE_AVAILABLE_EVENT,
+	WMI_TLV_TAG_STRUCT_PEER_ANTDIV_INFO_REQ_CMD,
+	WMI_TLV_TAG_STRUCT_PEER_ANTDIV_INFO_EVENT,
+	WMI_TLV_TAG_STRUCT_PEER_ANTDIV_INFO,
+	WMI_TLV_TAG_STRUCT_PDEV_GET_ANTDIV_STATUS_CMD,
+	WMI_TLV_TAG_STRUCT_PDEV_ANTDIV_STATUS_EVENT,
+	WMI_TLV_TAG_STRUCT_MNT_FILTER_CMD,
+	WMI_TLV_TAG_STRUCT_GET_CHIP_POWER_STATS_CMD,
+	WMI_TLV_TAG_STRUCT_PDEV_CHIP_POWER_STATS_EVENT,
+	WMI_TLV_TAG_STRUCT_COEX_GET_ANTENNA_ISOLATION_CMD,
+	WMI_TLV_TAG_STRUCT_COEX_REPORT_ISOLATION_EVENT,
+	WMI_TLV_TAG_STRUCT_CHAN_CCA_STATS,
+	WMI_TLV_TAG_STRUCT_PEER_SIGNAL_STATS,
+	WMI_TLV_TAG_STRUCT_TX_STATS,
+	WMI_TLV_TAG_STRUCT_PEER_AC_TX_STATS,
+	WMI_TLV_TAG_STRUCT_RX_STATS,
+	WMI_TLV_TAG_STRUCT_PEER_AC_RX_STATS,
+	WMI_TLV_TAG_STRUCT_REPORT_STATS_EVENT,
+	WMI_TLV_TAG_STRUCT_CHAN_CCA_STATS_THRESH,
+	WMI_TLV_TAG_STRUCT_PEER_SIGNAL_STATS_THRESH,
+	WMI_TLV_TAG_STRUCT_TX_STATS_THRESH,
+	WMI_TLV_TAG_STRUCT_RX_STATS_THRESH,
+	WMI_TLV_TAG_STRUCT_PDEV_SET_STATS_THRESHOLD_CMD,
+	WMI_TLV_TAG_STRUCT_REQUEST_WLAN_STATS_CMD,
+	WMI_TLV_TAG_STRUCT_RX_AGGR_FAILURE_EVENT,
+	WMI_TLV_TAG_STRUCT_RX_AGGR_FAILURE_INFO,
+	WMI_TLV_TAG_STRUCT_VDEV_ENCRYPT_DECRYPT_DATA_REQ_CMD,
+	WMI_TLV_TAG_STRUCT_VDEV_ENCRYPT_DECRYPT_DATA_RESP_EVENT,
+	WMI_TLV_TAG_STRUCT_PDEV_BAND_TO_MAC,
+	WMI_TLV_TAG_STRUCT_TBTT_OFFSET_INFO,
+	WMI_TLV_TAG_STRUCT_TBTT_OFFSET_EXT_EVENT,
+	WMI_TLV_TAG_STRUCT_SAR_LIMITS_CMD,
+	WMI_TLV_TAG_STRUCT_SAR_LIMIT_CMD_ROW,
+	WMI_TLV_TAG_STRUCT_PDEV_DFS_PHYERR_OFFLOAD_ENABLE_CMD,
+	WMI_TLV_TAG_STRUCT_PDEV_DFS_PHYERR_OFFLOAD_DISABLE_CMD,
+	WMI_TLV_TAG_STRUCT_VDEV_ADFS_CH_CFG_CMD,
+	WMI_TLV_TAG_STRUCT_VDEV_ADFS_OCAC_ABORT_CMD,
+	WMI_TLV_TAG_STRUCT_PDEV_DFS_RADAR_DETECTION_EVENT,
+	WMI_TLV_TAG_STRUCT_VDEV_ADFS_OCAC_COMPLETE_EVENT,
+	WMI_TLV_TAG_STRUCT_VDEV_DFS_CAC_COMPLETE_EVENT,
+	WMI_TLV_TAG_STRUCT_VENDOR_OUI,
+	WMI_TLV_TAG_STRUCT_REQUEST_RCPI_CMD,
+	WMI_TLV_TAG_STRUCT_UPDATE_RCPI_EVENT,
+	WMI_TLV_TAG_STRUCT_REQUEST_PEER_STATS_INFO_CMD,
+	WMI_TLV_TAG_STRUCT_PEER_STATS_INFO,
+	WMI_TLV_TAG_STRUCT_PEER_STATS_INFO_EVENT,
+	WMI_TLV_TAG_STRUCT_PKGID_EVENT,
+	WMI_TLV_TAG_STRUCT_CONNECTED_NLO_RSSI_PARAMS,
+	WMI_TLV_TAG_STRUCT_SET_CURRENT_COUNTRY_CMD,
+	WMI_TLV_TAG_STRUCT_REGULATORY_RULE_STRUCT,
+	WMI_TLV_TAG_STRUCT_REG_CHAN_LIST_CC_EVENT,
+	WMI_TLV_TAG_STRUCT_11D_SCAN_START_CMD,
+	WMI_TLV_TAG_STRUCT_11D_SCAN_STOP_CMD,
+	WMI_TLV_TAG_STRUCT_11D_NEW_COUNTRY_EVENT,
+	WMI_TLV_TAG_STRUCT_REQUEST_RADIO_CHAN_STATS_CMD,
+	WMI_TLV_TAG_STRUCT_RADIO_CHAN_STATS,
+	WMI_TLV_TAG_STRUCT_RADIO_CHAN_STATS_EVENT,
+	WMI_TLV_TAG_STRUCT_ROAM_PER_CONFIG,
+	WMI_TLV_TAG_STRUCT_VDEV_ADD_MAC_ADDR_TO_RX_FILTER_CMD,
+	WMI_TLV_TAG_STRUCT_VDEV_ADD_MAC_ADDR_TO_RX_FILTER_STATUS_EVENT,
+	WMI_TLV_TAG_STRUCT_BPF_SET_VDEV_ACTIVE_MODE_CMD,
+	WMI_TLV_TAG_STRUCT_HW_DATA_FILTER_CMD,
+	WMI_TLV_TAG_STRUCT_CONNECTED_NLO_BSS_BAND_RSSI_PREF,
+	WMI_TLV_TAG_STRUCT_PEER_OPER_MODE_CHANGE_EVENT,
+	WMI_TLV_TAG_STRUCT_CHIP_POWER_SAVE_FAILURE_DETECTED,
+	WMI_TLV_TAG_STRUCT_PDEV_MULTIPLE_VDEV_RESTART_REQUEST_CMD,
+	WMI_TLV_TAG_STRUCT_PDEV_CSA_SWITCH_COUNT_STATUS_EVENT,
+	WMI_TLV_TAG_STRUCT_PDEV_UPDATE_PKT_ROUTING_CMD,
+	WMI_TLV_TAG_STRUCT_PDEV_CHECK_CAL_VERSION_CMD,
+	WMI_TLV_TAG_STRUCT_PDEV_CHECK_CAL_VERSION_EVENT,
+	WMI_TLV_TAG_STRUCT_PDEV_SET_DIVERSITY_GAIN_CMD,
+	WMI_TLV_TAG_STRUCT_MAC_PHY_CHAINMASK_COMBO,
+	WMI_TLV_TAG_STRUCT_MAC_PHY_CHAINMASK_CAPABILITY,
+	WMI_TLV_TAG_STRUCT_VDEV_SET_ARP_STATS_CMD,
+	WMI_TLV_TAG_STRUCT_VDEV_GET_ARP_STATS_CMD,
+	WMI_TLV_TAG_STRUCT_VDEV_GET_ARP_STATS_EVENT,
+	WMI_TLV_TAG_STRUCT_IFACE_OFFLOAD_STATS,
+	WMI_TLV_TAG_STRUCT_REQUEST_STATS_CMD_SUB_STRUCT_PARAM,
+	WMI_TLV_TAG_STRUCT_RSSI_CTL_EXT,
+	WMI_TLV_TAG_STRUCT_SINGLE_PHYERR_EXT_RX_HDR,
+	WMI_TLV_TAG_STRUCT_COEX_BT_ACTIVITY_EVENT,
+	WMI_TLV_TAG_STRUCT_VDEV_GET_TX_POWER_CMD,
+	WMI_TLV_TAG_STRUCT_VDEV_TX_POWER_EVENT,
+	WMI_TLV_TAG_STRUCT_OFFCHAN_DATA_TX_COMPL_EVENT,
+	WMI_TLV_TAG_STRUCT_OFFCHAN_DATA_TX_SEND_CMD,
+	WMI_TLV_TAG_STRUCT_TX_SEND_PARAMS,
+	WMI_TLV_TAG_STRUCT_HE_RATE_SET,
+	WMI_TLV_TAG_STRUCT_CONGESTION_STATS,
+	WMI_TLV_TAG_STRUCT_SET_INIT_COUNTRY_CMD,
+	WMI_TLV_TAG_STRUCT_SCAN_DBS_DUTY_CYCLE,
+	WMI_TLV_TAG_STRUCT_SCAN_DBS_DUTY_CYCLE_PARAM_TLV,
+	WMI_TLV_TAG_STRUCT_PDEV_DIV_GET_RSSI_ANTID,
+	WMI_TLV_TAG_STRUCT_THERM_THROT_CONFIG_REQUEST,
+	WMI_TLV_TAG_STRUCT_THERM_THROT_LEVEL_CONFIG_INFO,
+	WMI_TLV_TAG_STRUCT_THERM_THROT_STATS_EVENT,
+	WMI_TLV_TAG_STRUCT_THERM_THROT_LEVEL_STATS_INFO,
+	WMI_TLV_TAG_STRUCT_PDEV_DIV_RSSI_ANTID_EVENT,
+	WMI_TLV_TAG_STRUCT_OEM_DMA_RING_CAPABILITIES,
+	WMI_TLV_TAG_STRUCT_OEM_DMA_RING_CFG_REQ,
+	WMI_TLV_TAG_STRUCT_OEM_DMA_RING_CFG_RSP,
+	WMI_TLV_TAG_STRUCT_OEM_INDIRECT_DATA,
+	WMI_TLV_TAG_STRUCT_OEM_DMA_BUF_RELEASE,
+	WMI_TLV_TAG_STRUCT_OEM_DMA_BUF_RELEASE_ENTRY,
+	WMI_TLV_TAG_STRUCT_PDEV_BSS_CHAN_INFO_REQUEST,
+	WMI_TLV_TAG_STRUCT_PDEV_BSS_CHAN_INFO_EVENT,
+	WMI_TLV_TAG_STRUCT_ROAM_LCA_DISALLOW_CONFIG_TLV_PARAM,
+	WMI_TLV_TAG_STRUCT_VDEV_LIMIT_OFFCHAN_CMD,
+	WMI_TLV_TAG_STRUCT_ROAM_RSSI_REJECTION_OCE_CONFIG_PARAM,
+	WMI_TLV_TAG_STRUCT_UNIT_TEST_EVENT,
+	WMI_TLV_TAG_STRUCT_ROAM_FILS_OFFLOAD_TLV_PARAM,
+	WMI_TLV_TAG_STRUCT_PDEV_UPDATE_PMK_CACHE_CMD,
+	WMI_TLV_TAG_STRUCT_PMK_CACHE,
+	WMI_TLV_TAG_STRUCT_PDEV_UPDATE_FILS_HLP_PKT_CMD,
+	WMI_TLV_TAG_STRUCT_ROAM_FILS_SYNCH_TLV_PARAM,
+	WMI_TLV_TAG_STRUCT_GTK_OFFLOAD_EXTENDED_TLV_PARAM,
+	WMI_TLV_TAG_STRUCT_ROAM_BG_SCAN_ROAMING_PARAM,
+	WMI_TLV_TAG_STRUCT_OIC_PING_OFFLOAD_PARAMS_CMD,
+	WMI_TLV_TAG_STRUCT_OIC_PING_OFFLOAD_SET_ENABLE_CMD,
+	WMI_TLV_TAG_STRUCT_OIC_PING_HANDOFF_EVENT,
+	WMI_TLV_TAG_STRUCT_DHCP_LEASE_RENEW_OFFLOAD_CMD,
+	WMI_TLV_TAG_STRUCT_DHCP_LEASE_RENEW_EVENT,
+	WMI_TLV_TAG_STRUCT_BTM_CONFIG,
+	WMI_TLV_TAG_STRUCT_DEBUG_MESG_FW_DATA_STALL_PARAM,
+	WMI_TLV_TAG_STRUCT_WLM_CONFIG_CMD,
+	WMI_TLV_TAG_STRUCT_PDEV_UPDATE_CTLTABLE_REQUEST,
+	WMI_TLV_TAG_STRUCT_PDEV_UPDATE_CTLTABLE_EVENT,
+	WMI_TLV_TAG_STRUCT_ROAM_CND_SCORING_PARAM,
+	WMI_TLV_TAG_STRUCT_PDEV_CONFIG_VENDOR_OUI_ACTION,
+	WMI_TLV_TAG_STRUCT_VENDOR_OUI_EXT,
+	WMI_TLV_TAG_STRUCT_ROAM_SYNCH_FRAME_EVENT,
+	WMI_TLV_TAG_STRUCT_FD_SEND_FROM_HOST_CMD,
+	WMI_TLV_TAG_STRUCT_ENABLE_FILS_CMD,
+	WMI_TLV_TAG_STRUCT_HOST_SWFDA_EVENT,
 
 	WMI_TLV_TAG_MAX
 };
@@ -1068,16 +1338,74 @@ enum wmi_tlv_service {
 	WMI_TLV_SERVICE_WLAN_STATS_REPORT,
 	WMI_TLV_SERVICE_TX_MSDU_ID_NEW_PARTITION_SUPPORT,
 	WMI_TLV_SERVICE_DFS_PHYERR_OFFLOAD,
+	WMI_TLV_SERVICE_RCPI_SUPPORT,
+	WMI_TLV_SERVICE_FW_MEM_DUMP_SUPPORT,
+	WMI_TLV_SERVICE_PEER_STATS_INFO,
+	WMI_TLV_SERVICE_REGULATORY_DB,
+	WMI_TLV_SERVICE_11D_OFFLOAD,
+	WMI_TLV_SERVICE_HW_DATA_FILTERING,
+	WMI_TLV_SERVICE_MULTIPLE_VDEV_RESTART,
+	WMI_TLV_SERVICE_PKT_ROUTING,
+	WMI_TLV_SERVICE_CHECK_CAL_VERSION,
+	WMI_TLV_SERVICE_OFFCHAN_TX_WMI,
+	WMI_TLV_SERVICE_8SS_TX_BFEE,
+	WMI_TLV_SERVICE_EXTENDED_NSS_SUPPORT,
+	WMI_TLV_SERVICE_ACK_TIMEOUT,
+	WMI_TLV_SERVICE_PDEV_BSS_CHANNEL_INFO_64,
+	WMI_TLV_MAX_SERVICE = 128,
+
+/* NOTE:
+ * The above service flags are delivered in the wmi_service_bitmap field
+ * of the WMI_TLV_SERVICE_READY_EVENT message.
+ * The below service flags are delivered in a WMI_TLV_SERVICE_AVAILABLE_EVENT
+ * message rather than in the WMI_TLV_SERVICE_READY_EVENT message's
+ * wmi_service_bitmap field.
+ * The WMI_TLV_SERVICE_AVAILABLE_EVENT message immediately precedes the
+ * WMI_TLV_SERVICE_READY_EVENT message.
+ */
+
+	WMI_TLV_SERVICE_CHAN_LOAD_INFO = 128,
+	WMI_TLV_SERVICE_TX_PPDU_INFO_STATS_SUPPORT,
+	WMI_TLV_SERVICE_VDEV_LIMIT_OFFCHAN_SUPPORT,
+	WMI_TLV_SERVICE_FILS_SUPPORT,
+	WMI_TLV_SERVICE_WLAN_OIC_PING_OFFLOAD,
+	WMI_TLV_SERVICE_WLAN_DHCP_RENEW,
+	WMI_TLV_SERVICE_MAWC_SUPPORT,
+	WMI_TLV_SERVICE_VDEV_LATENCY_CONFIG,
+	WMI_TLV_SERVICE_PDEV_UPDATE_CTLTABLE_SUPPORT,
+	WMI_TLV_SERVICE_PKTLOG_SUPPORT_OVER_HTT,
+	WMI_TLV_SERVICE_VDEV_MULTI_GROUP_KEY_SUPPORT,
+	WMI_TLV_SERVICE_SCAN_PHYMODE_SUPPORT,
+	WMI_TLV_SERVICE_THERM_THROT,
+	WMI_TLV_SERVICE_BCN_OFFLOAD_START_STOP_SUPPORT,
+	WMI_TLV_SERVICE_WOW_WAKEUP_BY_TIMER_PATTERN,
+	WMI_TLV_SERVICE_PEER_MAP_UNMAP_V2_SUPPORT = 143,
+	WMI_TLV_SERVICE_OFFCHAN_DATA_TID_SUPPORT = 144,
+	WMI_TLV_SERVICE_RX_PROMISC_ENABLE_SUPPORT = 145,
+	WMI_TLV_SERVICE_SUPPORT_DIRECT_DMA = 146,
+	WMI_TLV_SERVICE_AP_OBSS_DETECTION_OFFLOAD = 147,
+	WMI_TLV_SERVICE_11K_NEIGHBOUR_REPORT_SUPPORT = 148,
+	WMI_TLV_SERVICE_LISTEN_INTERVAL_OFFLOAD_SUPPORT = 149,
+	WMI_TLV_SERVICE_BSS_COLOR_OFFLOAD = 150,
+	WMI_TLV_SERVICE_RUNTIME_DPD_RECAL = 151,
+	WMI_TLV_SERVICE_STA_TWT = 152,
+	WMI_TLV_SERVICE_AP_TWT = 153,
+	WMI_TLV_SERVICE_GMAC_OFFLOAD_SUPPORT = 154,
+	WMI_TLV_SERVICE_SPOOF_MAC_SUPPORT = 155,
+
+	WMI_TLV_MAX_EXT_SERVICE = 256,
 };
 
-#define WMI_SERVICE_IS_ENABLED(wmi_svc_bmap, svc_id, len) \
-	((svc_id) < (len) && \
-	 __le32_to_cpu((wmi_svc_bmap)[(svc_id) / (sizeof(u32))]) & \
-	 BIT((svc_id) % (sizeof(u32))))
+#define WMI_TLV_EXT_SERVICE_IS_ENABLED(wmi_svc_bmap, svc_id, len) \
+	((svc_id) < (WMI_TLV_MAX_EXT_SERVICE) && \
+	 (svc_id) >= (len) && \
+	__le32_to_cpu((wmi_svc_bmap)[((svc_id) - (len)) / 32]) & \
+	BIT(((((svc_id) - (len)) % 32) & 0x1f)))
 
 #define SVCMAP(x, y, len) \
 	do { \
-		if (WMI_SERVICE_IS_ENABLED((in), (x), (len))) \
+		if ((WMI_SERVICE_IS_ENABLED((in), (x), (len))) || \
+			(WMI_TLV_EXT_SERVICE_IS_ENABLED((in), (x), (len)))) \
 			__set_bit(y, out); \
 	} while (0)
 
@@ -1228,6 +1556,14 @@ wmi_tlv_svc_map(const __le32 *in, unsigned long *out, size_t len)
 	       WMI_SERVICE_MGMT_TX_WMI, len);
 }
 
+static inline void
+wmi_tlv_svc_map_ext(const __le32 *in, unsigned long *out, size_t len)
+{
+	SVCMAP(WMI_TLV_SERVICE_SPOOF_MAC_SUPPORT,
+	       WMI_SERVICE_SPOOF_MAC_SUPPORT,
+	       WMI_TLV_MAX_SERVICE);
+}
+
 #undef SVCMAP
 
 struct wmi_tlv {
diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c
index e12dd28fcf5d..fa07ef69122b 100644
--- a/drivers/net/wireless/ath/ath10k/wmi.c
+++ b/drivers/net/wireless/ath/ath10k/wmi.c
@@ -5059,7 +5059,6 @@ static void ath10k_wmi_event_service_ready_work(struct work_struct *work)
 		return;
 	}
 
-	memset(&ar->wmi.svc_map, 0, sizeof(ar->wmi.svc_map));
 	ath10k_wmi_map_svc(ar, arg.service_map, ar->wmi.svc_map,
 			   arg.service_map_len);
 
@@ -5269,6 +5268,21 @@ int ath10k_wmi_event_ready(struct ath10k *ar, struct sk_buff *skb)
 	return 0;
 }
 
+void ath10k_wmi_event_service_available(struct ath10k *ar, struct sk_buff *skb)
+{
+	int ret;
+	struct wmi_svc_avail_ev_arg arg = {};
+
+	ret = ath10k_wmi_pull_svc_avail(ar, skb, &arg);
+	if (ret) {
+		ath10k_warn(ar, "failed to parse servive available event: %d\n",
+			    ret);
+	}
+
+	ath10k_wmi_map_svc_ext(ar, arg.service_map_ext, ar->wmi.svc_map,
+			       __le32_to_cpu(arg.service_map_ext_len));
+}
+
 static int ath10k_wmi_event_temperature(struct ath10k *ar, struct sk_buff *skb)
 {
 	const struct wmi_pdev_temperature_event *ev;
@@ -5465,6 +5479,9 @@ static void ath10k_wmi_op_rx(struct ath10k *ar, struct sk_buff *skb)
 		ath10k_wmi_event_ready(ar, skb);
 		ath10k_wmi_queue_set_coverage_class_work(ar);
 		break;
+	case WMI_SERVICE_AVAILABLE_EVENTID:
+		ath10k_wmi_event_service_available(ar, skb);
+		break;
 	default:
 		ath10k_warn(ar, "Unknown eventid: %d\n", id);
 		break;
@@ -5880,6 +5897,8 @@ int ath10k_wmi_connect(struct ath10k *ar)
 	struct ath10k_htc_svc_conn_req conn_req;
 	struct ath10k_htc_svc_conn_resp conn_resp;
 
+	memset(&ar->wmi.svc_map, 0, sizeof(ar->wmi.svc_map));
+
 	memset(&conn_req, 0, sizeof(conn_req));
 	memset(&conn_resp, 0, sizeof(conn_resp));
 
diff --git a/drivers/net/wireless/ath/ath10k/wmi.h b/drivers/net/wireless/ath/ath10k/wmi.h
index 3cc129d812a4..7e2f24fdcf1c 100644
--- a/drivers/net/wireless/ath/ath10k/wmi.h
+++ b/drivers/net/wireless/ath/ath10k/wmi.h
@@ -202,6 +202,7 @@ enum wmi_service {
 	WMI_SERVICE_HOST_DFS_CHECK_SUPPORT,
 	WMI_SERVICE_TPC_STATS_FINAL,
 	WMI_SERVICE_RESET_CHIP,
+	WMI_SERVICE_SPOOF_MAC_SUPPORT,
 
 	/* keep last */
 	WMI_SERVICE_MAX,
@@ -1190,6 +1191,7 @@ enum wmi_cmd_id {
 enum wmi_event_id {
 	WMI_SERVICE_READY_EVENTID = 0x1,
 	WMI_READY_EVENTID,
+	WMI_SERVICE_AVAILABLE_EVENTID,
 
 	/* Scan specific events */
 	WMI_SCAN_EVENTID = WMI_EVT_GRP_START_ID(WMI_GRP_SCAN),
@@ -6639,6 +6641,11 @@ struct wmi_svc_rdy_ev_arg {
 	const struct wlan_host_mem_req *mem_reqs[WMI_MAX_MEM_REQS];
 };
 
+struct wmi_svc_avail_ev_arg {
+	__le32 service_map_ext_len;
+	const __le32 *service_map_ext;
+};
+
 struct wmi_rdy_ev_arg {
 	__le32 sw_version;
 	__le32 abi_version;
@@ -7059,6 +7066,7 @@ void ath10k_wmi_event_vdev_standby_req(struct ath10k *ar, struct sk_buff *skb);
 void ath10k_wmi_event_vdev_resume_req(struct ath10k *ar, struct sk_buff *skb);
 void ath10k_wmi_event_service_ready(struct ath10k *ar, struct sk_buff *skb);
 int ath10k_wmi_event_ready(struct ath10k *ar, struct sk_buff *skb);
+void ath10k_wmi_event_service_available(struct ath10k *ar, struct sk_buff *skb);
 int ath10k_wmi_op_pull_phyerr_ev(struct ath10k *ar, const void *phyerr_buf,
 				 int left_len, struct wmi_phyerr_ev_arg *arg);
 void ath10k_wmi_main_op_fw_stats_fill(struct ath10k *ar,
-- 
cgit v1.2.3-59-g8ed1b


From 60e1d0fb290197fe505dff6e4e3b7e4d258dbf60 Mon Sep 17 00:00:00 2001
From: Carl Huang <cjhuang@codeaurora.org>
Date: Thu, 19 Apr 2018 19:39:40 +0300
Subject: ath10k: support MAC address randomization in scan

The ath10k reports the random_mac_addr capability to upper layer
based on the service bit firmware reported. Driver sets the
spoofed flag in scan_ctrl_flag to firmware if upper layer has
enabled this feature in scan request.

Test with QCA6174 hw3.0 and firmware-6.bin_WLAN.RM.4.4.1-00102-QCARMSWP-1,
but QCA9377 is also affected.

Signed-off-by: Carl Huang <cjhuang@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/mac.c     | 17 +++++++++++++++++
 drivers/net/wireless/ath/ath10k/wmi-ops.h | 22 ++++++++++++++++++++++
 drivers/net/wireless/ath/ath10k/wmi-tlv.c | 25 +++++++++++++++++++++++++
 drivers/net/wireless/ath/ath10k/wmi-tlv.h | 11 +++++++++++
 drivers/net/wireless/ath/ath10k/wmi.c     |  5 +++++
 drivers/net/wireless/ath/ath10k/wmi.h     |  9 +++++++++
 6 files changed, 89 insertions(+)

diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index fc3320f94f45..c71cf5b81385 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -5717,6 +5717,12 @@ static int ath10k_hw_scan(struct ieee80211_hw *hw,
 		arg.scan_ctrl_flags |= WMI_SCAN_FLAG_PASSIVE;
 	}
 
+	if (req->flags & NL80211_SCAN_FLAG_RANDOM_ADDR) {
+		arg.scan_ctrl_flags |=  WMI_SCAN_ADD_SPOOFED_MAC_IN_PROBE_REQ;
+		ether_addr_copy(arg.mac_addr.addr, req->mac_addr);
+		ether_addr_copy(arg.mac_mask.addr, req->mac_addr_mask);
+	}
+
 	if (req->n_channels) {
 		arg.n_channels = req->n_channels;
 		for (i = 0; i < arg.n_channels; i++)
@@ -8433,6 +8439,17 @@ int ath10k_mac_register(struct ath10k *ar)
 		goto err_dfs_detector_exit;
 	}
 
+	if (test_bit(WMI_SERVICE_SPOOF_MAC_SUPPORT, ar->wmi.svc_map)) {
+		ret = ath10k_wmi_scan_prob_req_oui(ar, ar->mac_addr);
+		if (ret) {
+			ath10k_err(ar, "failed to set prob req oui: %i\n", ret);
+			goto err_dfs_detector_exit;
+		}
+
+		ar->hw->wiphy->features |=
+			NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR;
+	}
+
 	ar->hw->wiphy->cipher_suites = cipher_suites;
 
 	/* QCA988x and QCA6174 family chips do not support CCMP-256, GCMP-128
diff --git a/drivers/net/wireless/ath/ath10k/wmi-ops.h b/drivers/net/wireless/ath/ath10k/wmi-ops.h
index 4f7e0ba071a2..e37d16b31afe 100644
--- a/drivers/net/wireless/ath/ath10k/wmi-ops.h
+++ b/drivers/net/wireless/ath/ath10k/wmi-ops.h
@@ -119,6 +119,8 @@ struct wmi_ops {
 					 u32 value);
 	struct sk_buff *(*gen_scan_chan_list)(struct ath10k *ar,
 					      const struct wmi_scan_chan_list_arg *arg);
+	struct sk_buff *(*gen_scan_prob_req_oui)(struct ath10k *ar,
+						 u32 prob_req_oui);
 	struct sk_buff *(*gen_beacon_dma)(struct ath10k *ar, u32 vdev_id,
 					  const void *bcn, size_t bcn_len,
 					  u32 bcn_paddr, bool dtim_zero,
@@ -914,6 +916,26 @@ ath10k_wmi_scan_chan_list(struct ath10k *ar,
 	return ath10k_wmi_cmd_send(ar, skb, ar->wmi.cmd->scan_chan_list_cmdid);
 }
 
+static inline int
+ath10k_wmi_scan_prob_req_oui(struct ath10k *ar, const u8 mac_addr[ETH_ALEN])
+{
+	struct sk_buff *skb;
+	u32 prob_req_oui;
+
+	prob_req_oui = (((u32)mac_addr[0]) << 16) |
+		       (((u32)mac_addr[1]) << 8) | mac_addr[2];
+
+	if (!ar->wmi.ops->gen_scan_prob_req_oui)
+		return -EOPNOTSUPP;
+
+	skb = ar->wmi.ops->gen_scan_prob_req_oui(ar, prob_req_oui);
+	if (IS_ERR(skb))
+		return PTR_ERR(skb);
+
+	return ath10k_wmi_cmd_send(ar, skb,
+			ar->wmi.cmd->scan_prob_req_oui_cmdid);
+}
+
 static inline int
 ath10k_wmi_peer_assoc(struct ath10k *ar,
 		      const struct wmi_peer_assoc_complete_arg *arg)
diff --git a/drivers/net/wireless/ath/ath10k/wmi-tlv.c b/drivers/net/wireless/ath/ath10k/wmi-tlv.c
index 31866e3b74fc..01f4eb201330 100644
--- a/drivers/net/wireless/ath/ath10k/wmi-tlv.c
+++ b/drivers/net/wireless/ath/ath10k/wmi-tlv.c
@@ -1636,6 +1636,8 @@ ath10k_wmi_tlv_op_gen_start_scan(struct ath10k *ar,
 	cmd->num_bssids = __cpu_to_le32(arg->n_bssids);
 	cmd->ie_len = __cpu_to_le32(arg->ie_len);
 	cmd->num_probes = __cpu_to_le32(3);
+	ether_addr_copy(cmd->mac_addr.addr, arg->mac_addr.addr);
+	ether_addr_copy(cmd->mac_mask.addr, arg->mac_mask.addr);
 
 	/* FIXME: There are some scan flag inconsistencies across firmwares,
 	 * e.g. WMI-TLV inverts the logic behind the following flag.
@@ -2482,6 +2484,27 @@ ath10k_wmi_tlv_op_gen_scan_chan_list(struct ath10k *ar,
 	return skb;
 }
 
+static struct sk_buff *
+ath10k_wmi_tlv_op_gen_scan_prob_req_oui(struct ath10k *ar, u32 prob_req_oui)
+{
+	struct wmi_scan_prob_req_oui_cmd *cmd;
+	struct wmi_tlv *tlv;
+	struct sk_buff *skb;
+
+	skb = ath10k_wmi_alloc_skb(ar, sizeof(*tlv) + sizeof(*cmd));
+	if (!skb)
+		return ERR_PTR(-ENOMEM);
+
+	tlv = (void *)skb->data;
+	tlv->tag = __cpu_to_le16(WMI_TLV_TAG_STRUCT_SCAN_PROB_REQ_OUI_CMD);
+	tlv->len = __cpu_to_le16(sizeof(*cmd));
+	cmd = (void *)tlv->value;
+	cmd->prob_req_oui = __cpu_to_le32(prob_req_oui);
+
+	ath10k_dbg(ar, ATH10K_DBG_WMI, "wmi tlv scan prob req oui\n");
+	return skb;
+}
+
 static struct sk_buff *
 ath10k_wmi_tlv_op_gen_beacon_dma(struct ath10k *ar, u32 vdev_id,
 				 const void *bcn, size_t bcn_len,
@@ -3452,6 +3475,7 @@ static struct wmi_cmd_map wmi_tlv_cmd_map = {
 	.stop_scan_cmdid = WMI_TLV_STOP_SCAN_CMDID,
 	.scan_chan_list_cmdid = WMI_TLV_SCAN_CHAN_LIST_CMDID,
 	.scan_sch_prio_tbl_cmdid = WMI_TLV_SCAN_SCH_PRIO_TBL_CMDID,
+	.scan_prob_req_oui_cmdid = WMI_TLV_SCAN_PROB_REQ_OUI_CMDID,
 	.pdev_set_regdomain_cmdid = WMI_TLV_PDEV_SET_REGDOMAIN_CMDID,
 	.pdev_set_channel_cmdid = WMI_TLV_PDEV_SET_CHANNEL_CMDID,
 	.pdev_set_param_cmdid = WMI_TLV_PDEV_SET_PARAM_CMDID,
@@ -3820,6 +3844,7 @@ static const struct wmi_ops wmi_tlv_ops = {
 	.gen_set_sta_ps = ath10k_wmi_tlv_op_gen_set_sta_ps,
 	.gen_set_ap_ps = ath10k_wmi_tlv_op_gen_set_ap_ps,
 	.gen_scan_chan_list = ath10k_wmi_tlv_op_gen_scan_chan_list,
+	.gen_scan_prob_req_oui = ath10k_wmi_tlv_op_gen_scan_prob_req_oui,
 	.gen_beacon_dma = ath10k_wmi_tlv_op_gen_beacon_dma,
 	.gen_pdev_set_wmm = ath10k_wmi_tlv_op_gen_pdev_set_wmm,
 	.gen_request_stats = ath10k_wmi_tlv_op_gen_request_stats,
diff --git a/drivers/net/wireless/ath/ath10k/wmi-tlv.h b/drivers/net/wireless/ath/ath10k/wmi-tlv.h
index 529b91b17061..954c50bb3f66 100644
--- a/drivers/net/wireless/ath/ath10k/wmi-tlv.h
+++ b/drivers/net/wireless/ath/ath10k/wmi-tlv.h
@@ -1706,6 +1706,15 @@ struct wmi_tlv_scan_chan_list_cmd {
 	__le32 num_scan_chans;
 } __packed;
 
+struct wmi_scan_prob_req_oui_cmd {
+/* OUI to be used in Probe Request frame when random MAC address is
+ * requested part of scan parameters. This is applied to both FW internal
+ * scans and host initiated scans. Host can request for random MAC address
+ * with WMI_SCAN_ADD_SPOOFED_MAC_IN_PROBE_REQ flag.
+ */
+	__le32 prob_req_oui;
+}  __packed;
+
 struct wmi_tlv_start_scan_cmd {
 	struct wmi_start_scan_common common;
 	__le32 burst_duration_ms;
@@ -1714,6 +1723,8 @@ struct wmi_tlv_start_scan_cmd {
 	__le32 num_ssids;
 	__le32 ie_len;
 	__le32 num_probes;
+	struct wmi_mac_addr mac_addr;
+	struct wmi_mac_addr mac_mask;
 } __packed;
 
 struct wmi_tlv_vdev_start_cmd {
diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c
index fa07ef69122b..df2e92a6c9bd 100644
--- a/drivers/net/wireless/ath/ath10k/wmi.c
+++ b/drivers/net/wireless/ath/ath10k/wmi.c
@@ -42,6 +42,7 @@ static struct wmi_cmd_map wmi_cmd_map = {
 	.stop_scan_cmdid = WMI_STOP_SCAN_CMDID,
 	.scan_chan_list_cmdid = WMI_SCAN_CHAN_LIST_CMDID,
 	.scan_sch_prio_tbl_cmdid = WMI_SCAN_SCH_PRIO_TBL_CMDID,
+	.scan_prob_req_oui_cmdid = WMI_CMD_UNSUPPORTED,
 	.pdev_set_regdomain_cmdid = WMI_PDEV_SET_REGDOMAIN_CMDID,
 	.pdev_set_channel_cmdid = WMI_PDEV_SET_CHANNEL_CMDID,
 	.pdev_set_param_cmdid = WMI_PDEV_SET_PARAM_CMDID,
@@ -207,6 +208,7 @@ static struct wmi_cmd_map wmi_10x_cmd_map = {
 	.stop_scan_cmdid = WMI_10X_STOP_SCAN_CMDID,
 	.scan_chan_list_cmdid = WMI_10X_SCAN_CHAN_LIST_CMDID,
 	.scan_sch_prio_tbl_cmdid = WMI_CMD_UNSUPPORTED,
+	.scan_prob_req_oui_cmdid = WMI_CMD_UNSUPPORTED,
 	.pdev_set_regdomain_cmdid = WMI_10X_PDEV_SET_REGDOMAIN_CMDID,
 	.pdev_set_channel_cmdid = WMI_10X_PDEV_SET_CHANNEL_CMDID,
 	.pdev_set_param_cmdid = WMI_10X_PDEV_SET_PARAM_CMDID,
@@ -374,6 +376,7 @@ static struct wmi_cmd_map wmi_10_2_4_cmd_map = {
 	.stop_scan_cmdid = WMI_10_2_STOP_SCAN_CMDID,
 	.scan_chan_list_cmdid = WMI_10_2_SCAN_CHAN_LIST_CMDID,
 	.scan_sch_prio_tbl_cmdid = WMI_CMD_UNSUPPORTED,
+	.scan_prob_req_oui_cmdid = WMI_CMD_UNSUPPORTED,
 	.pdev_set_regdomain_cmdid = WMI_10_2_PDEV_SET_REGDOMAIN_CMDID,
 	.pdev_set_channel_cmdid = WMI_10_2_PDEV_SET_CHANNEL_CMDID,
 	.pdev_set_param_cmdid = WMI_10_2_PDEV_SET_PARAM_CMDID,
@@ -541,6 +544,7 @@ static struct wmi_cmd_map wmi_10_4_cmd_map = {
 	.stop_scan_cmdid = WMI_10_4_STOP_SCAN_CMDID,
 	.scan_chan_list_cmdid = WMI_10_4_SCAN_CHAN_LIST_CMDID,
 	.scan_sch_prio_tbl_cmdid = WMI_10_4_SCAN_SCH_PRIO_TBL_CMDID,
+	.scan_prob_req_oui_cmdid = WMI_CMD_UNSUPPORTED,
 	.pdev_set_regdomain_cmdid = WMI_10_4_PDEV_SET_REGDOMAIN_CMDID,
 	.pdev_set_channel_cmdid = WMI_10_4_PDEV_SET_CHANNEL_CMDID,
 	.pdev_set_param_cmdid = WMI_10_4_PDEV_SET_PARAM_CMDID,
@@ -1338,6 +1342,7 @@ static struct wmi_cmd_map wmi_10_2_cmd_map = {
 	.stop_scan_cmdid = WMI_10_2_STOP_SCAN_CMDID,
 	.scan_chan_list_cmdid = WMI_10_2_SCAN_CHAN_LIST_CMDID,
 	.scan_sch_prio_tbl_cmdid = WMI_CMD_UNSUPPORTED,
+	.scan_prob_req_oui_cmdid = WMI_CMD_UNSUPPORTED,
 	.pdev_set_regdomain_cmdid = WMI_10_2_PDEV_SET_REGDOMAIN_CMDID,
 	.pdev_set_channel_cmdid = WMI_10_2_PDEV_SET_CHANNEL_CMDID,
 	.pdev_set_param_cmdid = WMI_10_2_PDEV_SET_PARAM_CMDID,
diff --git a/drivers/net/wireless/ath/ath10k/wmi.h b/drivers/net/wireless/ath/ath10k/wmi.h
index 7e2f24fdcf1c..2705973e39c6 100644
--- a/drivers/net/wireless/ath/ath10k/wmi.h
+++ b/drivers/net/wireless/ath/ath10k/wmi.h
@@ -791,6 +791,7 @@ struct wmi_cmd_map {
 	u32 stop_scan_cmdid;
 	u32 scan_chan_list_cmdid;
 	u32 scan_sch_prio_tbl_cmdid;
+	u32 scan_prob_req_oui_cmdid;
 	u32 pdev_set_regdomain_cmdid;
 	u32 pdev_set_channel_cmdid;
 	u32 pdev_set_param_cmdid;
@@ -3168,6 +3169,8 @@ struct wmi_start_scan_arg {
 	u16 channels[64];
 	struct wmi_ssid_arg ssids[WLAN_SCAN_PARAMS_MAX_SSID];
 	struct wmi_bssid_arg bssids[WLAN_SCAN_PARAMS_MAX_BSSID];
+	struct wmi_mac_addr mac_addr;
+	struct wmi_mac_addr mac_mask;
 };
 
 /* scan control flags */
@@ -3191,6 +3194,12 @@ struct wmi_start_scan_arg {
  */
 #define WMI_SCAN_CONTINUE_ON_ERROR 0x80
 
+/* Use random MAC address for TA for Probe Request frame and add
+ * OUI specified by WMI_SCAN_PROB_REQ_OUI_CMDID to the Probe Request frame.
+ * if OUI is not set by WMI_SCAN_PROB_REQ_OUI_CMDID then the flag is ignored.
+ */
+#define WMI_SCAN_ADD_SPOOFED_MAC_IN_PROBE_REQ   0x1000
+
 /* WMI_SCAN_CLASS_MASK must be the same value as IEEE80211_SCAN_CLASS_MASK */
 #define WMI_SCAN_CLASS_MASK 0xFF000000
 
-- 
cgit v1.2.3-59-g8ed1b


From fa3440fa2fa1531fb2aafb6d51f2bc5ac70a6247 Mon Sep 17 00:00:00 2001
From: Wen Gong <wgong@codeaurora.org>
Date: Thu, 19 Apr 2018 19:40:07 +0300
Subject: ath10k: convert wow pattern from 802.3 to 802.11

When trying to set wow wakeup patterns it fails with this command:

iw phyxx wowlan enable patterns offset xx+ IP address xx.xx.xx.xx

The reason is that the wow pattern from upper layer is in 802.3 format
for this case, it need to convert it to 802.11 format. The input
offset parameter is used for 802.3, but the actual offset firmware
need depends on rx_decap_mode, so that it needs to be recalculated.
Pattern of 802.3 packet is not same with 802.11 packet. If the
rx_decap_mode is ATH10K_HW_TXRX_NATIVE_WIFI, then firmware will
receive data packet with 802.11 format from hardware.

Tested with QCA6174 hw3.0 with firmware
WLAN.RM.4.4.1-00099-QCARMSWPZ-1, but this will also affect QCA9377.
This has always failed, so it's not a regression with new firmware
releases.

Signed-off-by: Wen Gong <wgong@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/wmi.h |   4 +
 drivers/net/wireless/ath/ath10k/wow.c | 138 ++++++++++++++++++++++++++++++++--
 2 files changed, 137 insertions(+), 5 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/wmi.h b/drivers/net/wireless/ath/ath10k/wmi.h
index 2705973e39c6..16a39244a34f 100644
--- a/drivers/net/wireless/ath/ath10k/wmi.h
+++ b/drivers/net/wireless/ath/ath10k/wmi.h
@@ -6835,6 +6835,10 @@ struct wmi_wow_ev_arg {
 #define WOW_MIN_PATTERN_SIZE	1
 #define WOW_MAX_PATTERN_SIZE	148
 #define WOW_MAX_PKT_OFFSET	128
+#define WOW_HDR_LEN	(sizeof(struct ieee80211_hdr_3addr) + \
+	sizeof(struct rfc1042_hdr))
+#define WOW_MAX_REDUCE	(WOW_HDR_LEN - sizeof(struct ethhdr) - \
+	offsetof(struct ieee80211_hdr_3addr, addr1))
 
 enum wmi_tdls_state {
 	WMI_TDLS_DISABLE,
diff --git a/drivers/net/wireless/ath/ath10k/wow.c b/drivers/net/wireless/ath/ath10k/wow.c
index c4cbccb29b31..a6b179f88d36 100644
--- a/drivers/net/wireless/ath/ath10k/wow.c
+++ b/drivers/net/wireless/ath/ath10k/wow.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2015-2017 Qualcomm Atheros, Inc.
+ * Copyright (c) 2018, The Linux Foundation. All rights reserved.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -76,6 +77,109 @@ static int ath10k_wow_cleanup(struct ath10k *ar)
 	return 0;
 }
 
+/**
+ * Convert a 802.3 format to a 802.11 format.
+ *         +------------+-----------+--------+----------------+
+ * 802.3:  |dest mac(6B)|src mac(6B)|type(2B)|     body...    |
+ *         +------------+-----------+--------+----------------+
+ *                |__         |_______    |____________  |________
+ *                   |                |                |          |
+ *         +--+------------+----+-----------+---------------+-----------+
+ * 802.11: |4B|dest mac(6B)| 6B |src mac(6B)|  8B  |type(2B)|  body...  |
+ *         +--+------------+----+-----------+---------------+-----------+
+ */
+static void ath10k_wow_convert_8023_to_80211
+					(struct cfg80211_pkt_pattern *new,
+					const struct cfg80211_pkt_pattern *old)
+{
+	u8 hdr_8023_pattern[ETH_HLEN] = {};
+	u8 hdr_8023_bit_mask[ETH_HLEN] = {};
+	u8 hdr_80211_pattern[WOW_HDR_LEN] = {};
+	u8 hdr_80211_bit_mask[WOW_HDR_LEN] = {};
+
+	int total_len = old->pkt_offset + old->pattern_len;
+	int hdr_80211_end_offset;
+
+	struct ieee80211_hdr_3addr *new_hdr_pattern =
+		(struct ieee80211_hdr_3addr *)hdr_80211_pattern;
+	struct ieee80211_hdr_3addr *new_hdr_mask =
+		(struct ieee80211_hdr_3addr *)hdr_80211_bit_mask;
+	struct ethhdr *old_hdr_pattern = (struct ethhdr *)hdr_8023_pattern;
+	struct ethhdr *old_hdr_mask = (struct ethhdr *)hdr_8023_bit_mask;
+	int hdr_len = sizeof(*new_hdr_pattern);
+
+	struct rfc1042_hdr *new_rfc_pattern =
+		(struct rfc1042_hdr *)(hdr_80211_pattern + hdr_len);
+	struct rfc1042_hdr *new_rfc_mask =
+		(struct rfc1042_hdr *)(hdr_80211_bit_mask + hdr_len);
+	int rfc_len = sizeof(*new_rfc_pattern);
+
+	memcpy(hdr_8023_pattern + old->pkt_offset,
+	       old->pattern, ETH_HLEN - old->pkt_offset);
+	memcpy(hdr_8023_bit_mask + old->pkt_offset,
+	       old->mask, ETH_HLEN - old->pkt_offset);
+
+	/* Copy destination address */
+	memcpy(new_hdr_pattern->addr1, old_hdr_pattern->h_dest, ETH_ALEN);
+	memcpy(new_hdr_mask->addr1, old_hdr_mask->h_dest, ETH_ALEN);
+
+	/* Copy source address */
+	memcpy(new_hdr_pattern->addr3, old_hdr_pattern->h_source, ETH_ALEN);
+	memcpy(new_hdr_mask->addr3, old_hdr_mask->h_source, ETH_ALEN);
+
+	/* Copy logic link type */
+	memcpy(&new_rfc_pattern->snap_type,
+	       &old_hdr_pattern->h_proto,
+	       sizeof(old_hdr_pattern->h_proto));
+	memcpy(&new_rfc_mask->snap_type,
+	       &old_hdr_mask->h_proto,
+	       sizeof(old_hdr_mask->h_proto));
+
+	/* Caculate new pkt_offset */
+	if (old->pkt_offset < ETH_ALEN)
+		new->pkt_offset = old->pkt_offset +
+			offsetof(struct ieee80211_hdr_3addr, addr1);
+	else if (old->pkt_offset < offsetof(struct ethhdr, h_proto))
+		new->pkt_offset = old->pkt_offset +
+			offsetof(struct ieee80211_hdr_3addr, addr3) -
+			offsetof(struct ethhdr, h_source);
+	else
+		new->pkt_offset = old->pkt_offset + hdr_len + rfc_len - ETH_HLEN;
+
+	/* Caculate new hdr end offset */
+	if (total_len > ETH_HLEN)
+		hdr_80211_end_offset = hdr_len + rfc_len;
+	else if (total_len > offsetof(struct ethhdr, h_proto))
+		hdr_80211_end_offset = hdr_len + rfc_len + total_len - ETH_HLEN;
+	else if (total_len > ETH_ALEN)
+		hdr_80211_end_offset = total_len - ETH_ALEN +
+			offsetof(struct ieee80211_hdr_3addr, addr3);
+	else
+		hdr_80211_end_offset = total_len +
+			offsetof(struct ieee80211_hdr_3addr, addr1);
+
+	new->pattern_len = hdr_80211_end_offset - new->pkt_offset;
+
+	memcpy((u8 *)new->pattern,
+	       hdr_80211_pattern + new->pkt_offset,
+	       new->pattern_len);
+	memcpy((u8 *)new->mask,
+	       hdr_80211_bit_mask + new->pkt_offset,
+	       new->pattern_len);
+
+	if (total_len > ETH_HLEN) {
+		/* Copy frame body */
+		memcpy((u8 *)new->pattern + new->pattern_len,
+		       (void *)old->pattern + ETH_HLEN - old->pkt_offset,
+		       total_len - ETH_HLEN);
+		memcpy((u8 *)new->mask + new->pattern_len,
+		       (void *)old->mask + ETH_HLEN - old->pkt_offset,
+		       total_len - ETH_HLEN);
+
+		new->pattern_len += total_len - ETH_HLEN;
+	}
+}
+
 static int ath10k_vif_wow_set_wakeups(struct ath10k_vif *arvif,
 				      struct cfg80211_wowlan *wowlan)
 {
@@ -116,22 +220,40 @@ static int ath10k_vif_wow_set_wakeups(struct ath10k_vif *arvif,
 
 	for (i = 0; i < wowlan->n_patterns; i++) {
 		u8 bitmask[WOW_MAX_PATTERN_SIZE] = {};
+		u8 ath_pattern[WOW_MAX_PATTERN_SIZE] = {};
+		u8 ath_bitmask[WOW_MAX_PATTERN_SIZE] = {};
+		struct cfg80211_pkt_pattern new_pattern = {};
+		struct cfg80211_pkt_pattern old_pattern = patterns[i];
 		int j;
 
+		new_pattern.pattern = ath_pattern;
+		new_pattern.mask = ath_bitmask;
 		if (patterns[i].pattern_len > WOW_MAX_PATTERN_SIZE)
 			continue;
-
 		/* convert bytemask to bitmask */
 		for (j = 0; j < patterns[i].pattern_len; j++)
 			if (patterns[i].mask[j / 8] & BIT(j % 8))
 				bitmask[j] = 0xff;
+		old_pattern.mask = bitmask;
+		new_pattern = old_pattern;
+
+		if (ar->wmi.rx_decap_mode == ATH10K_HW_TXRX_NATIVE_WIFI) {
+			if (patterns[i].pkt_offset < ETH_HLEN)
+				ath10k_wow_convert_8023_to_80211(&new_pattern,
+								 &old_pattern);
+			else
+				new_pattern.pkt_offset += WOW_HDR_LEN - ETH_HLEN;
+		}
+
+		if (WARN_ON(new_pattern.pattern_len > WOW_MAX_PATTERN_SIZE))
+			return -EINVAL;
 
 		ret = ath10k_wmi_wow_add_pattern(ar, arvif->vdev_id,
 						 pattern_id,
-						 patterns[i].pattern,
-						 bitmask,
-						 patterns[i].pattern_len,
-						 patterns[i].pkt_offset);
+						 new_pattern.pattern,
+						 new_pattern.mask,
+						 new_pattern.pattern_len,
+						 new_pattern.pkt_offset);
 		if (ret) {
 			ath10k_warn(ar, "failed to add pattern %i to vdev %i: %d\n",
 				    pattern_id,
@@ -345,6 +467,12 @@ int ath10k_wow_init(struct ath10k *ar)
 		return -EINVAL;
 
 	ar->wow.wowlan_support = ath10k_wowlan_support;
+
+	if (ar->wmi.rx_decap_mode == ATH10K_HW_TXRX_NATIVE_WIFI) {
+		ar->wow.wowlan_support.pattern_max_len -= WOW_MAX_REDUCE;
+		ar->wow.wowlan_support.max_pkt_offset -= WOW_MAX_REDUCE;
+	}
+
 	ar->wow.wowlan_support.n_patterns = ar->wow.max_num_patterns;
 	ar->hw->wiphy->wowlan = &ar->wow.wowlan_support;
 
-- 
cgit v1.2.3-59-g8ed1b


From 0fc8bb50bbfc82d113218c803e1965838dcdbc89 Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Thu, 19 Apr 2018 19:40:11 +0300
Subject: wcn36xx: pass correct BSS index when deleting BSS keys

The firmware message to delete BSS keys expects a BSS index to be passed.
This field is currently hard-coded to 0. Fix this by passing in the index
we received from the firmware when the BSS was configured.

The encryption type in that message also needs to be set to what was used
when the key was set, so the assignment of vif_priv->encrypt_type is now
done after the firmware command was sent. This reportedly fixes the
following error in AP mode:

  wcn36xx: ERROR hal_remove_bsskey response failed err=6

Also, AFAIU, when a BSS is deleted, the firmware apparently drops all the
keys associated with it. Trying to remove the key explicitly afterwards
will hence lead to the following message:

  wcn36xx: ERROR hal_remove_bsskey response failed err=16

This is now suppressed with an extra check for the BSS index validity.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wcn36xx/main.c | 10 +++++++---
 drivers/net/wireless/ath/wcn36xx/smd.c  |  6 ++++--
 drivers/net/wireless/ath/wcn36xx/smd.h  |  2 ++
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/drivers/net/wireless/ath/wcn36xx/main.c b/drivers/net/wireless/ath/wcn36xx/main.c
index 32bbd6e2fd09..749aef3e2b85 100644
--- a/drivers/net/wireless/ath/wcn36xx/main.c
+++ b/drivers/net/wireless/ath/wcn36xx/main.c
@@ -549,6 +549,7 @@ static int wcn36xx_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd,
 		} else {
 			wcn36xx_smd_set_bsskey(wcn,
 				vif_priv->encrypt_type,
+				vif_priv->bss_index,
 				key_conf->keyidx,
 				key_conf->keylen,
 				key);
@@ -566,10 +567,13 @@ static int wcn36xx_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd,
 		break;
 	case DISABLE_KEY:
 		if (!(IEEE80211_KEY_FLAG_PAIRWISE & key_conf->flags)) {
+			if (vif_priv->bss_index != WCN36XX_HAL_BSS_INVALID_IDX)
+				wcn36xx_smd_remove_bsskey(wcn,
+					vif_priv->encrypt_type,
+					vif_priv->bss_index,
+					key_conf->keyidx);
+
 			vif_priv->encrypt_type = WCN36XX_HAL_ED_NONE;
-			wcn36xx_smd_remove_bsskey(wcn,
-				vif_priv->encrypt_type,
-				key_conf->keyidx);
 		} else {
 			sta_priv->is_data_encrypted = false;
 			/* do not remove key if disassociated */
diff --git a/drivers/net/wireless/ath/wcn36xx/smd.c b/drivers/net/wireless/ath/wcn36xx/smd.c
index 9827a1e1124b..297a785d0785 100644
--- a/drivers/net/wireless/ath/wcn36xx/smd.c
+++ b/drivers/net/wireless/ath/wcn36xx/smd.c
@@ -1636,6 +1636,7 @@ out:
 
 int wcn36xx_smd_set_bsskey(struct wcn36xx *wcn,
 			   enum ani_ed_type enc_type,
+			   u8 bssidx,
 			   u8 keyidx,
 			   u8 keylen,
 			   u8 *key)
@@ -1645,7 +1646,7 @@ int wcn36xx_smd_set_bsskey(struct wcn36xx *wcn,
 
 	mutex_lock(&wcn->hal_mutex);
 	INIT_HAL_MSG(msg_body, WCN36XX_HAL_SET_BSSKEY_REQ);
-	msg_body.bss_idx = 0;
+	msg_body.bss_idx = bssidx;
 	msg_body.enc_type = enc_type;
 	msg_body.num_keys = 1;
 	msg_body.keys[0].id = keyidx;
@@ -1706,6 +1707,7 @@ out:
 
 int wcn36xx_smd_remove_bsskey(struct wcn36xx *wcn,
 			      enum ani_ed_type enc_type,
+			      u8 bssidx,
 			      u8 keyidx)
 {
 	struct wcn36xx_hal_remove_bss_key_req_msg msg_body;
@@ -1713,7 +1715,7 @@ int wcn36xx_smd_remove_bsskey(struct wcn36xx *wcn,
 
 	mutex_lock(&wcn->hal_mutex);
 	INIT_HAL_MSG(msg_body, WCN36XX_HAL_RMV_BSSKEY_REQ);
-	msg_body.bss_idx = 0;
+	msg_body.bss_idx = bssidx;
 	msg_body.enc_type = enc_type;
 	msg_body.key_id = keyidx;
 
diff --git a/drivers/net/wireless/ath/wcn36xx/smd.h b/drivers/net/wireless/ath/wcn36xx/smd.h
index 8076edf40ac8..61bb8d43138c 100644
--- a/drivers/net/wireless/ath/wcn36xx/smd.h
+++ b/drivers/net/wireless/ath/wcn36xx/smd.h
@@ -97,6 +97,7 @@ int wcn36xx_smd_set_stakey(struct wcn36xx *wcn,
 			   u8 sta_index);
 int wcn36xx_smd_set_bsskey(struct wcn36xx *wcn,
 			   enum ani_ed_type enc_type,
+			   u8 bssidx,
 			   u8 keyidx,
 			   u8 keylen,
 			   u8 *key);
@@ -106,6 +107,7 @@ int wcn36xx_smd_remove_stakey(struct wcn36xx *wcn,
 			      u8 sta_index);
 int wcn36xx_smd_remove_bsskey(struct wcn36xx *wcn,
 			      enum ani_ed_type enc_type,
+			      u8 bssidx,
 			      u8 keyidx);
 int wcn36xx_smd_enter_bmps(struct wcn36xx *wcn, struct ieee80211_vif *vif);
 int wcn36xx_smd_exit_bmps(struct wcn36xx *wcn, struct ieee80211_vif *vif);
-- 
cgit v1.2.3-59-g8ed1b


From 20529b33aa12224e4399b4486e5ab617cf8f3e5e Mon Sep 17 00:00:00 2001
From: Rakesh Pillai <pillair@codeaurora.org>
Date: Tue, 17 Apr 2018 14:54:26 +0530
Subject: ath10k: enable hw checksum for wcn3990

By default ath10k driver enables the support for HW_CHECKSUM
(NETIF_F_HW_CSUM). Since the TCP/UDP checksum calculation is not enabled
in the wcn3990 firmware the checksum is incorrect in the TCP/UDP packets
and all patckets are dropped. But due note that wcn3990 support in
ath10k is still incomplete so this isn't a critical fix (yet).

Enable hw checksum calculations in wcn3990 hardware by
setting the proper flags in msdu descriptor tso flags.

Signed-off-by: Rakesh Pillai <pillair@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/htt.h    | 13 +++++++++++++
 drivers/net/wireless/ath/ath10k/htt_tx.c |  7 +++++--
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/htt.h b/drivers/net/wireless/ath/ath10k/htt.h
index 4e5fe539eb77..5d3ff80f3a1f 100644
--- a/drivers/net/wireless/ath/ath10k/htt.h
+++ b/drivers/net/wireless/ath/ath10k/htt.h
@@ -128,6 +128,19 @@ struct htt_msdu_ext_desc_64 {
 				 | HTT_MSDU_EXT_DESC_FLAG_TCP_IPV4_CSUM_ENABLE \
 				 | HTT_MSDU_EXT_DESC_FLAG_TCP_IPV6_CSUM_ENABLE)
 
+#define HTT_MSDU_EXT_DESC_FLAG_IPV4_CSUM_ENABLE_64		BIT(16)
+#define HTT_MSDU_EXT_DESC_FLAG_UDP_IPV4_CSUM_ENABLE_64		BIT(17)
+#define HTT_MSDU_EXT_DESC_FLAG_UDP_IPV6_CSUM_ENABLE_64		BIT(18)
+#define HTT_MSDU_EXT_DESC_FLAG_TCP_IPV4_CSUM_ENABLE_64		BIT(19)
+#define HTT_MSDU_EXT_DESC_FLAG_TCP_IPV6_CSUM_ENABLE_64		BIT(20)
+#define HTT_MSDU_EXT_DESC_FLAG_PARTIAL_CSUM_ENABLE_64		BIT(21)
+
+#define HTT_MSDU_CHECKSUM_ENABLE_64  (HTT_MSDU_EXT_DESC_FLAG_IPV4_CSUM_ENABLE_64 \
+				     | HTT_MSDU_EXT_DESC_FLAG_UDP_IPV4_CSUM_ENABLE_64 \
+				     | HTT_MSDU_EXT_DESC_FLAG_UDP_IPV6_CSUM_ENABLE_64 \
+				     | HTT_MSDU_EXT_DESC_FLAG_TCP_IPV4_CSUM_ENABLE_64 \
+				     | HTT_MSDU_EXT_DESC_FLAG_TCP_IPV6_CSUM_ENABLE_64)
+
 enum htt_data_tx_desc_flags0 {
 	HTT_DATA_TX_DESC_FLAGS0_MAC_HDR_PRESENT = 1 << 0,
 	HTT_DATA_TX_DESC_FLAGS0_NO_AGGR         = 1 << 1,
diff --git a/drivers/net/wireless/ath/ath10k/htt_tx.c b/drivers/net/wireless/ath/ath10k/htt_tx.c
index a086958c39b6..5d8b97a0ccaa 100644
--- a/drivers/net/wireless/ath/ath10k/htt_tx.c
+++ b/drivers/net/wireless/ath/ath10k/htt_tx.c
@@ -1475,8 +1475,11 @@ static int ath10k_htt_tx_64(struct ath10k_htt *htt,
 	    !test_bit(ATH10K_FLAG_RAW_MODE, &ar->dev_flags)) {
 		flags1 |= HTT_DATA_TX_DESC_FLAGS1_CKSUM_L3_OFFLOAD;
 		flags1 |= HTT_DATA_TX_DESC_FLAGS1_CKSUM_L4_OFFLOAD;
-		if (ar->hw_params.continuous_frag_desc)
-			ext_desc->flags |= HTT_MSDU_CHECKSUM_ENABLE;
+		if (ar->hw_params.continuous_frag_desc) {
+			memset(ext_desc->tso_flag, 0, sizeof(ext_desc->tso_flag));
+			ext_desc->tso_flag[3] |=
+				__cpu_to_le32(HTT_MSDU_CHECKSUM_ENABLE_64);
+		}
 	}
 
 	/* Prevent firmware from sending up tx inspection requests. There's
-- 
cgit v1.2.3-59-g8ed1b


From b2e40d7ab8e2d36b455113df96dcbab0407e35c0 Mon Sep 17 00:00:00 2001
From: Rakesh Pillai <pillair@codeaurora.org>
Date: Tue, 17 Apr 2018 17:36:58 +0530
Subject: ath10k: add hw params for shadow register support

wcn3990 supports shadow register for ce write.

Add a hw param for shadow register support.

Signed-off-by: Rakesh Pillai <pillair@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/core.c | 14 ++++++++++++++
 drivers/net/wireless/ath/ath10k/hw.h   |  4 ++++
 2 files changed, 18 insertions(+)

diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c
index 64674d8ce457..a8bb21173d2c 100644
--- a/drivers/net/wireless/ath/ath10k/core.c
+++ b/drivers/net/wireless/ath/ath10k/core.c
@@ -90,6 +90,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.num_wds_entries = 0x20,
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
+		.shadow_reg_support = false,
 	},
 	{
 		.id = QCA988X_HW_2_0_VERSION,
@@ -120,6 +121,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
 		.per_ce_irq = false,
+		.shadow_reg_support = false,
 	},
 	{
 		.id = QCA9887_HW_1_0_VERSION,
@@ -150,6 +152,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
 		.per_ce_irq = false,
+		.shadow_reg_support = false,
 	},
 	{
 		.id = QCA6174_HW_2_1_VERSION,
@@ -179,6 +182,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
 		.per_ce_irq = false,
+		.shadow_reg_support = false,
 	},
 	{
 		.id = QCA6174_HW_2_1_VERSION,
@@ -208,6 +212,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
 		.per_ce_irq = false,
+		.shadow_reg_support = false,
 	},
 	{
 		.id = QCA6174_HW_3_0_VERSION,
@@ -237,6 +242,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
 		.per_ce_irq = false,
+		.shadow_reg_support = false,
 	},
 	{
 		.id = QCA6174_HW_3_2_VERSION,
@@ -269,6 +275,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
 		.per_ce_irq = false,
+		.shadow_reg_support = false,
 	},
 	{
 		.id = QCA99X0_HW_2_0_DEV_VERSION,
@@ -304,6 +311,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
 		.per_ce_irq = false,
+		.shadow_reg_support = false,
 	},
 	{
 		.id = QCA9984_HW_1_0_DEV_VERSION,
@@ -344,6 +352,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
 		.per_ce_irq = false,
+		.shadow_reg_support = false,
 	},
 	{
 		.id = QCA9888_HW_2_0_DEV_VERSION,
@@ -383,6 +392,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
 		.per_ce_irq = false,
+		.shadow_reg_support = false,
 	},
 	{
 		.id = QCA9377_HW_1_0_DEV_VERSION,
@@ -412,6 +422,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
 		.per_ce_irq = false,
+		.shadow_reg_support = false,
 	},
 	{
 		.id = QCA9377_HW_1_1_DEV_VERSION,
@@ -443,6 +454,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
 		.per_ce_irq = false,
+		.shadow_reg_support = false,
 	},
 	{
 		.id = QCA4019_HW_1_0_DEV_VERSION,
@@ -479,6 +491,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
 		.per_ce_irq = false,
+		.shadow_reg_support = false,
 	},
 	{
 		.id = WCN3990_HW_1_0_DEV_VERSION,
@@ -500,6 +513,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.target_64bit = true,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL_DUAL_MAC,
 		.per_ce_irq = true,
+		.shadow_reg_support = true,
 	},
 };
 
diff --git a/drivers/net/wireless/ath/ath10k/hw.h b/drivers/net/wireless/ath/ath10k/hw.h
index b025a1bf2fde..dcefde76956c 100644
--- a/drivers/net/wireless/ath/ath10k/hw.h
+++ b/drivers/net/wireless/ath/ath10k/hw.h
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2005-2011 Atheros Communications Inc.
  * Copyright (c) 2011-2017 Qualcomm Atheros, Inc.
+ * Copyright (c) 2018 The Linux Foundation. All rights reserved.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -571,6 +572,9 @@ struct ath10k_hw_params {
 
 	/* target supporting per ce IRQ */
 	bool per_ce_irq;
+
+	/* target supporting shadow register for ce write */
+	bool shadow_reg_support;
 };
 
 struct htt_rx_desc;
-- 
cgit v1.2.3-59-g8ed1b


From b7ba83f7c414e583fdf82a1b1b95d2376cdb4b45 Mon Sep 17 00:00:00 2001
From: Rakesh Pillai <pillair@codeaurora.org>
Date: Tue, 17 Apr 2018 17:36:59 +0530
Subject: ath10k: add support for shadow register for WNC3990

WCN3990 needs shadow register write operation support
for copy engine for regular operation in powersave mode.
Add support for copy engine shadow register write in
datapath tx for WCN3990

Signed-off-by: Rakesh Pillai <pillair@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/ce.c | 143 ++++++++++++++++++++++++++++++++++-
 drivers/net/wireless/ath/ath10k/ce.h |   4 +
 2 files changed, 145 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/ce.c b/drivers/net/wireless/ath/ath10k/ce.c
index 3a15923bdd26..020405b6d408 100644
--- a/drivers/net/wireless/ath/ath10k/ce.c
+++ b/drivers/net/wireless/ath/ath10k/ce.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2005-2011 Atheros Communications Inc.
  * Copyright (c) 2011-2017 Qualcomm Atheros, Inc.
+ * Copyright (c) 2018 The Linux Foundation. All rights reserved.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -58,6 +59,74 @@
  * the buffer is sent/received.
  */
 
+static inline u32 shadow_sr_wr_ind_addr(struct ath10k *ar,
+					struct ath10k_ce_pipe *ce_state)
+{
+	u32 ce_id = ce_state->id;
+	u32 addr = 0;
+
+	switch (ce_id) {
+	case 0:
+		addr = 0x00032000;
+		break;
+	case 3:
+		addr = 0x0003200C;
+		break;
+	case 4:
+		addr = 0x00032010;
+		break;
+	case 5:
+		addr = 0x00032014;
+		break;
+	case 7:
+		addr = 0x0003201C;
+		break;
+	default:
+		ath10k_warn(ar, "invalid CE id: %d", ce_id);
+		break;
+	}
+	return addr;
+}
+
+static inline u32 shadow_dst_wr_ind_addr(struct ath10k *ar,
+					 struct ath10k_ce_pipe *ce_state)
+{
+	u32 ce_id = ce_state->id;
+	u32 addr = 0;
+
+	switch (ce_id) {
+	case 1:
+		addr = 0x00032034;
+		break;
+	case 2:
+		addr = 0x00032038;
+		break;
+	case 5:
+		addr = 0x00032044;
+		break;
+	case 7:
+		addr = 0x0003204C;
+		break;
+	case 8:
+		addr = 0x00032050;
+		break;
+	case 9:
+		addr = 0x00032054;
+		break;
+	case 10:
+		addr = 0x00032058;
+		break;
+	case 11:
+		addr = 0x0003205C;
+		break;
+	default:
+		ath10k_warn(ar, "invalid CE id: %d", ce_id);
+		break;
+	}
+
+	return addr;
+}
+
 static inline unsigned int
 ath10k_set_ring_byte(unsigned int offset,
 		     struct ath10k_hw_ce_regs_addr_map *addr_map)
@@ -123,6 +192,22 @@ static inline u32 ath10k_ce_src_ring_read_index_get(struct ath10k *ar,
 				ar->hw_ce_regs->current_srri_addr);
 }
 
+static inline void
+ath10k_ce_shadow_src_ring_write_index_set(struct ath10k *ar,
+					  struct ath10k_ce_pipe *ce_state,
+					  unsigned int value)
+{
+	ath10k_ce_write32(ar, shadow_sr_wr_ind_addr(ar, ce_state), value);
+}
+
+static inline void
+ath10k_ce_shadow_dest_ring_write_index_set(struct ath10k *ar,
+					   struct ath10k_ce_pipe *ce_state,
+					   unsigned int value)
+{
+	ath10k_ce_write32(ar, shadow_dst_wr_ind_addr(ar, ce_state), value);
+}
+
 static inline void ath10k_ce_src_ring_base_addr_set(struct ath10k *ar,
 						    u32 ce_ctrl_addr,
 						    unsigned int addr)
@@ -376,8 +461,14 @@ static int _ath10k_ce_send_nolock(struct ath10k_ce_pipe *ce_state,
 	write_index = CE_RING_IDX_INCR(nentries_mask, write_index);
 
 	/* WORKAROUND */
-	if (!(flags & CE_SEND_FLAG_GATHER))
-		ath10k_ce_src_ring_write_index_set(ar, ctrl_addr, write_index);
+	if (!(flags & CE_SEND_FLAG_GATHER)) {
+		if (ar->hw_params.shadow_reg_support)
+			ath10k_ce_shadow_src_ring_write_index_set(ar, ce_state,
+								  write_index);
+		else
+			ath10k_ce_src_ring_write_index_set(ar, ctrl_addr,
+							   write_index);
+	}
 
 	src_ring->write_index = write_index;
 exit:
@@ -1251,6 +1342,22 @@ static int ath10k_ce_init_dest_ring(struct ath10k *ar,
 	return 0;
 }
 
+static int ath10k_ce_alloc_shadow_base(struct ath10k *ar,
+				       struct ath10k_ce_ring *src_ring,
+				       u32 nentries)
+{
+	src_ring->shadow_base_unaligned = kcalloc(nentries,
+						  sizeof(struct ce_desc),
+						  GFP_KERNEL);
+	if (!src_ring->shadow_base_unaligned)
+		return -ENOMEM;
+
+	src_ring->shadow_base = (struct ce_desc *)
+			PTR_ALIGN(src_ring->shadow_base_unaligned,
+				  CE_DESC_RING_ALIGN);
+	return 0;
+}
+
 static struct ath10k_ce_ring *
 ath10k_ce_alloc_src_ring(struct ath10k *ar, unsigned int ce_id,
 			 const struct ce_attr *attr)
@@ -1258,6 +1365,7 @@ ath10k_ce_alloc_src_ring(struct ath10k *ar, unsigned int ce_id,
 	struct ath10k_ce_ring *src_ring;
 	u32 nentries = attr->src_nentries;
 	dma_addr_t base_addr;
+	int ret;
 
 	nentries = roundup_pow_of_two(nentries);
 
@@ -1294,6 +1402,19 @@ ath10k_ce_alloc_src_ring(struct ath10k *ar, unsigned int ce_id,
 			ALIGN(src_ring->base_addr_ce_space_unaligned,
 			      CE_DESC_RING_ALIGN);
 
+	if (ar->hw_params.shadow_reg_support) {
+		ret = ath10k_ce_alloc_shadow_base(ar, src_ring, nentries);
+		if (ret) {
+			dma_free_coherent(ar->dev,
+					  (nentries * sizeof(struct ce_desc) +
+					   CE_DESC_RING_ALIGN),
+					  src_ring->base_addr_owner_space_unaligned,
+					  base_addr);
+			kfree(src_ring);
+			return ERR_PTR(ret);
+		}
+	}
+
 	return src_ring;
 }
 
@@ -1304,6 +1425,7 @@ ath10k_ce_alloc_src_ring_64(struct ath10k *ar, unsigned int ce_id,
 	struct ath10k_ce_ring *src_ring;
 	u32 nentries = attr->src_nentries;
 	dma_addr_t base_addr;
+	int ret;
 
 	nentries = roundup_pow_of_two(nentries);
 
@@ -1339,6 +1461,19 @@ ath10k_ce_alloc_src_ring_64(struct ath10k *ar, unsigned int ce_id,
 			ALIGN(src_ring->base_addr_ce_space_unaligned,
 			      CE_DESC_RING_ALIGN);
 
+	if (ar->hw_params.shadow_reg_support) {
+		ret = ath10k_ce_alloc_shadow_base(ar, src_ring, nentries);
+		if (ret) {
+			dma_free_coherent(ar->dev,
+					  (nentries * sizeof(struct ce_desc) +
+					   CE_DESC_RING_ALIGN),
+					  src_ring->base_addr_owner_space_unaligned,
+					  base_addr);
+			kfree(src_ring);
+			return ERR_PTR(ret);
+		}
+	}
+
 	return src_ring;
 }
 
@@ -1505,6 +1640,8 @@ static void _ath10k_ce_free_pipe(struct ath10k *ar, int ce_id)
 	struct ath10k_ce_pipe *ce_state = &ce->ce_states[ce_id];
 
 	if (ce_state->src_ring) {
+		if (ar->hw_params.shadow_reg_support)
+			kfree(ce_state->src_ring->shadow_base_unaligned);
 		dma_free_coherent(ar->dev,
 				  (ce_state->src_ring->nentries *
 				   sizeof(struct ce_desc) +
@@ -1534,6 +1671,8 @@ static void _ath10k_ce_free_pipe_64(struct ath10k *ar, int ce_id)
 	struct ath10k_ce_pipe *ce_state = &ce->ce_states[ce_id];
 
 	if (ce_state->src_ring) {
+		if (ar->hw_params.shadow_reg_support)
+			kfree(ce_state->src_ring->shadow_base_unaligned);
 		dma_free_coherent(ar->dev,
 				  (ce_state->src_ring->nentries *
 				   sizeof(struct ce_desc_64) +
diff --git a/drivers/net/wireless/ath/ath10k/ce.h b/drivers/net/wireless/ath/ath10k/ce.h
index d8f9da334529..9aea89133209 100644
--- a/drivers/net/wireless/ath/ath10k/ce.h
+++ b/drivers/net/wireless/ath/ath10k/ce.h
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2005-2011 Atheros Communications Inc.
  * Copyright (c) 2011-2017 Qualcomm Atheros, Inc.
+ * Copyright (c) 2018 The Linux Foundation. All rights reserved.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -113,6 +114,9 @@ struct ath10k_ce_ring {
 	/* CE address space */
 	u32 base_addr_ce_space;
 
+	char *shadow_base_unaligned;
+	struct ce_desc *shadow_base;
+
 	/* keep last */
 	void *per_transfer_context[0];
 };
-- 
cgit v1.2.3-59-g8ed1b


From 4945af5b264fbdbdb5a9021b8a6a179d0c7a33b2 Mon Sep 17 00:00:00 2001
From: Govind Singh <govinds@codeaurora.org>
Date: Tue, 17 Apr 2018 17:37:00 +0530
Subject: ath10k: enable SRRI/DRRI support on ddr for WCN3990

SRRI/DRRI are not mapped in the HW Shadow block and can lead
to un-clocked access if common subsystem in the target is
powered down due to idle mode.

To mitigate this problem SRRI/DRRI can be read from
DDR instead of doing an actual hardware read.
Host allocates non cached memory on ddr and configures
the physical address of this memory to the CE hardware.
The hardware updates the RRI on this particular location.
Read SRRI/DRRI from DDR location instead of
direct target read.

Enable retention restore on ddr using hw params to enable
in specific targets.

Signed-off-by: Govind Singh <govinds@codeaurora.org>
Signed-off-by: Rakesh Pillai <pillair@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/ce.c   | 102 +++++++++++++++++++++++++++++++--
 drivers/net/wireless/ath/ath10k/ce.h   |  10 ++++
 drivers/net/wireless/ath/ath10k/core.c |  14 +++++
 drivers/net/wireless/ath/ath10k/hw.c   |   9 ++-
 drivers/net/wireless/ath/ath10k/hw.h   |  13 ++++-
 drivers/net/wireless/ath/ath10k/snoc.c |   3 +
 6 files changed, 142 insertions(+), 9 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/ce.c b/drivers/net/wireless/ath/ath10k/ce.c
index 020405b6d408..3b96a43fbda4 100644
--- a/drivers/net/wireless/ath/ath10k/ce.c
+++ b/drivers/net/wireless/ath/ath10k/ce.c
@@ -185,11 +185,30 @@ static inline u32 ath10k_ce_src_ring_write_index_get(struct ath10k *ar,
 				ar->hw_ce_regs->sr_wr_index_addr);
 }
 
+static inline u32 ath10k_ce_src_ring_read_index_from_ddr(struct ath10k *ar,
+							 u32 ce_id)
+{
+	struct ath10k_ce *ce = ath10k_ce_priv(ar);
+
+	return ce->vaddr_rri[ce_id] & CE_DDR_RRI_MASK;
+}
+
 static inline u32 ath10k_ce_src_ring_read_index_get(struct ath10k *ar,
 						    u32 ce_ctrl_addr)
 {
-	return ath10k_ce_read32(ar, ce_ctrl_addr +
-				ar->hw_ce_regs->current_srri_addr);
+	struct ath10k_ce *ce = ath10k_ce_priv(ar);
+	u32 ce_id = COPY_ENGINE_ID(ce_ctrl_addr);
+	struct ath10k_ce_pipe *ce_state = &ce->ce_states[ce_id];
+	u32 index;
+
+	if (ar->hw_params.rri_on_ddr &&
+	    (ce_state->attr_flags & CE_ATTR_DIS_INTR))
+		index = ath10k_ce_src_ring_read_index_from_ddr(ar, ce_id);
+	else
+		index = ath10k_ce_read32(ar, ce_ctrl_addr +
+					 ar->hw_ce_regs->current_srri_addr);
+
+	return index;
 }
 
 static inline void
@@ -266,11 +285,31 @@ static inline void ath10k_ce_dest_ring_byte_swap_set(struct ath10k *ar,
 			  ath10k_set_ring_byte(n, ctrl_regs->dst_ring));
 }
 
+static inline
+	u32 ath10k_ce_dest_ring_read_index_from_ddr(struct ath10k *ar, u32 ce_id)
+{
+	struct ath10k_ce *ce = ath10k_ce_priv(ar);
+
+	return (ce->vaddr_rri[ce_id] >> CE_DDR_DRRI_SHIFT) &
+		CE_DDR_RRI_MASK;
+}
+
 static inline u32 ath10k_ce_dest_ring_read_index_get(struct ath10k *ar,
 						     u32 ce_ctrl_addr)
 {
-	return ath10k_ce_read32(ar, ce_ctrl_addr +
-				ar->hw_ce_regs->current_drri_addr);
+	struct ath10k_ce *ce = ath10k_ce_priv(ar);
+	u32 ce_id = COPY_ENGINE_ID(ce_ctrl_addr);
+	struct ath10k_ce_pipe *ce_state = &ce->ce_states[ce_id];
+	u32 index;
+
+	if (ar->hw_params.rri_on_ddr &&
+	    (ce_state->attr_flags & CE_ATTR_DIS_INTR))
+		index = ath10k_ce_dest_ring_read_index_from_ddr(ar, ce_id);
+	else
+		index = ath10k_ce_read32(ar, ce_ctrl_addr +
+					 ar->hw_ce_regs->current_drri_addr);
+
+	return index;
 }
 
 static inline void ath10k_ce_dest_ring_base_addr_set(struct ath10k *ar,
@@ -486,7 +525,7 @@ static int _ath10k_ce_send_nolock_64(struct ath10k_ce_pipe *ce_state,
 	struct ath10k_ce_ring *src_ring = ce_state->src_ring;
 	struct ce_desc_64 *desc, sdesc;
 	unsigned int nentries_mask = src_ring->nentries_mask;
-	unsigned int sw_index = src_ring->sw_index;
+	unsigned int sw_index;
 	unsigned int write_index = src_ring->write_index;
 	u32 ctrl_addr = ce_state->ctrl_addr;
 	__le32 *addr;
@@ -500,6 +539,11 @@ static int _ath10k_ce_send_nolock_64(struct ath10k_ce_pipe *ce_state,
 		ath10k_warn(ar, "%s: send more we can (nbytes: %d, max: %d)\n",
 			    __func__, nbytes, ce_state->src_sz_max);
 
+	if (ar->hw_params.rri_on_ddr)
+		sw_index = ath10k_ce_src_ring_read_index_from_ddr(ar, ce_state->id);
+	else
+		sw_index = src_ring->sw_index;
+
 	if (unlikely(CE_RING_DELTA(nentries_mask,
 				   write_index, sw_index - 1) <= 0)) {
 		ret = -ENOSR;
@@ -1016,7 +1060,10 @@ int ath10k_ce_completed_send_next_nolock(struct ath10k_ce_pipe *ce_state,
 		src_ring->hw_index = read_index;
 	}
 
-	read_index = src_ring->hw_index;
+	if (ar->hw_params.rri_on_ddr)
+		read_index = ath10k_ce_src_ring_read_index_get(ar, ctrl_addr);
+	else
+		read_index = src_ring->hw_index;
 
 	if (read_index == sw_index)
 		return -EIO;
@@ -1841,3 +1888,46 @@ int ath10k_ce_alloc_pipe(struct ath10k *ar, int ce_id,
 	return 0;
 }
 EXPORT_SYMBOL(ath10k_ce_alloc_pipe);
+
+void ath10k_ce_alloc_rri(struct ath10k *ar)
+{
+	int i;
+	u32 value;
+	u32 ctrl1_regs;
+	u32 ce_base_addr;
+	struct ath10k_ce *ce = ath10k_ce_priv(ar);
+
+	ce->vaddr_rri = dma_alloc_coherent(ar->dev,
+					   (CE_COUNT * sizeof(u32)),
+					   &ce->paddr_rri, GFP_KERNEL);
+
+	if (!ce->vaddr_rri)
+		return;
+
+	ath10k_ce_write32(ar, ar->hw_ce_regs->ce_rri_low,
+			  lower_32_bits(ce->paddr_rri));
+	ath10k_ce_write32(ar, ar->hw_ce_regs->ce_rri_high,
+			  (upper_32_bits(ce->paddr_rri) &
+			  CE_DESC_FLAGS_GET_MASK));
+
+	for (i = 0; i < CE_COUNT; i++) {
+		ctrl1_regs = ar->hw_ce_regs->ctrl1_regs->addr;
+		ce_base_addr = ath10k_ce_base_address(ar, i);
+		value = ath10k_ce_read32(ar, ce_base_addr + ctrl1_regs);
+		value |= ar->hw_ce_regs->upd->mask;
+		ath10k_ce_write32(ar, ce_base_addr + ctrl1_regs, value);
+	}
+
+	memset(ce->vaddr_rri, 0, CE_COUNT * sizeof(u32));
+}
+EXPORT_SYMBOL(ath10k_ce_alloc_rri);
+
+void ath10k_ce_free_rri(struct ath10k *ar)
+{
+	struct ath10k_ce *ce = ath10k_ce_priv(ar);
+
+	dma_free_coherent(ar->dev, (CE_COUNT * sizeof(u32)),
+			  ce->vaddr_rri,
+			  ce->paddr_rri);
+}
+EXPORT_SYMBOL(ath10k_ce_free_rri);
diff --git a/drivers/net/wireless/ath/ath10k/ce.h b/drivers/net/wireless/ath/ath10k/ce.h
index 9aea89133209..dbeffaef6024 100644
--- a/drivers/net/wireless/ath/ath10k/ce.h
+++ b/drivers/net/wireless/ath/ath10k/ce.h
@@ -49,6 +49,9 @@ struct ath10k_ce_pipe;
 #define CE_DESC_FLAGS_META_DATA_MASK ar->hw_values->ce_desc_meta_data_mask
 #define CE_DESC_FLAGS_META_DATA_LSB  ar->hw_values->ce_desc_meta_data_lsb
 
+#define CE_DDR_RRI_MASK			GENMASK(15, 0)
+#define CE_DDR_DRRI_SHIFT		16
+
 struct ce_desc {
 	__le32 addr;
 	__le16 nbytes;
@@ -157,6 +160,8 @@ struct ath10k_ce {
 	spinlock_t ce_lock;
 	const struct ath10k_bus_ops *bus_ops;
 	struct ath10k_ce_pipe ce_states[CE_COUNT_MAX];
+	u32 *vaddr_rri;
+	dma_addr_t paddr_rri;
 };
 
 /*==================Send====================*/
@@ -265,6 +270,8 @@ int ath10k_ce_disable_interrupts(struct ath10k *ar);
 void ath10k_ce_enable_interrupts(struct ath10k *ar);
 void ath10k_ce_dump_registers(struct ath10k *ar,
 			      struct ath10k_fw_crash_data *crash_data);
+void ath10k_ce_alloc_rri(struct ath10k *ar);
+void ath10k_ce_free_rri(struct ath10k *ar);
 
 /* ce_attr.flags values */
 /* Use NonSnooping PCIe accesses? */
@@ -331,6 +338,9 @@ static inline u32 ath10k_ce_base_address(struct ath10k *ar, unsigned int ce_id)
 	return CE0_BASE_ADDRESS + (CE1_BASE_ADDRESS - CE0_BASE_ADDRESS) * ce_id;
 }
 
+#define COPY_ENGINE_ID(COPY_ENGINE_BASE_ADDRESS) (((COPY_ENGINE_BASE_ADDRESS) \
+		- CE0_BASE_ADDRESS) / (CE1_BASE_ADDRESS - CE0_BASE_ADDRESS))
+
 #define CE_SRC_RING_TO_DESC(baddr, idx) \
 	(&(((struct ce_desc *)baddr)[idx]))
 
diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c
index a8bb21173d2c..4cf54a7ef09a 100644
--- a/drivers/net/wireless/ath/ath10k/core.c
+++ b/drivers/net/wireless/ath/ath10k/core.c
@@ -91,6 +91,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
 		.shadow_reg_support = false,
+		.rri_on_ddr = false,
 	},
 	{
 		.id = QCA988X_HW_2_0_VERSION,
@@ -122,6 +123,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
 		.per_ce_irq = false,
 		.shadow_reg_support = false,
+		.rri_on_ddr = false,
 	},
 	{
 		.id = QCA9887_HW_1_0_VERSION,
@@ -153,6 +155,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
 		.per_ce_irq = false,
 		.shadow_reg_support = false,
+		.rri_on_ddr = false,
 	},
 	{
 		.id = QCA6174_HW_2_1_VERSION,
@@ -183,6 +186,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
 		.per_ce_irq = false,
 		.shadow_reg_support = false,
+		.rri_on_ddr = false,
 	},
 	{
 		.id = QCA6174_HW_2_1_VERSION,
@@ -213,6 +217,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
 		.per_ce_irq = false,
 		.shadow_reg_support = false,
+		.rri_on_ddr = false,
 	},
 	{
 		.id = QCA6174_HW_3_0_VERSION,
@@ -243,6 +248,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
 		.per_ce_irq = false,
 		.shadow_reg_support = false,
+		.rri_on_ddr = false,
 	},
 	{
 		.id = QCA6174_HW_3_2_VERSION,
@@ -276,6 +282,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
 		.per_ce_irq = false,
 		.shadow_reg_support = false,
+		.rri_on_ddr = false,
 	},
 	{
 		.id = QCA99X0_HW_2_0_DEV_VERSION,
@@ -312,6 +319,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
 		.per_ce_irq = false,
 		.shadow_reg_support = false,
+		.rri_on_ddr = false,
 	},
 	{
 		.id = QCA9984_HW_1_0_DEV_VERSION,
@@ -353,6 +361,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
 		.per_ce_irq = false,
 		.shadow_reg_support = false,
+		.rri_on_ddr = false,
 	},
 	{
 		.id = QCA9888_HW_2_0_DEV_VERSION,
@@ -393,6 +402,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
 		.per_ce_irq = false,
 		.shadow_reg_support = false,
+		.rri_on_ddr = false,
 	},
 	{
 		.id = QCA9377_HW_1_0_DEV_VERSION,
@@ -423,6 +433,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
 		.per_ce_irq = false,
 		.shadow_reg_support = false,
+		.rri_on_ddr = false,
 	},
 	{
 		.id = QCA9377_HW_1_1_DEV_VERSION,
@@ -455,6 +466,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
 		.per_ce_irq = false,
 		.shadow_reg_support = false,
+		.rri_on_ddr = false,
 	},
 	{
 		.id = QCA4019_HW_1_0_DEV_VERSION,
@@ -492,6 +504,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
 		.per_ce_irq = false,
 		.shadow_reg_support = false,
+		.rri_on_ddr = false,
 	},
 	{
 		.id = WCN3990_HW_1_0_DEV_VERSION,
@@ -514,6 +527,7 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL_DUAL_MAC,
 		.per_ce_irq = true,
 		.shadow_reg_support = true,
+		.rri_on_ddr = true,
 	},
 };
 
diff --git a/drivers/net/wireless/ath/ath10k/hw.c b/drivers/net/wireless/ath/ath10k/hw.c
index 497ac33e0fbf..677535b3d207 100644
--- a/drivers/net/wireless/ath/ath10k/hw.c
+++ b/drivers/net/wireless/ath/ath10k/hw.c
@@ -310,6 +310,12 @@ static struct ath10k_hw_ce_dst_src_wm_regs wcn3990_wm_dst_ring = {
 	.wm_high	= &wcn3990_dst_wm_high,
 };
 
+static struct ath10k_hw_ce_ctrl1_upd wcn3990_ctrl1_upd = {
+	.shift = 19,
+	.mask = 0x00080000,
+	.enable = 0x00000000,
+};
+
 const struct ath10k_hw_ce_regs wcn3990_ce_regs = {
 	.sr_base_addr		= 0x00000000,
 	.sr_size_addr		= 0x00000008,
@@ -320,8 +326,6 @@ const struct ath10k_hw_ce_regs wcn3990_ce_regs = {
 	.dst_wr_index_addr	= 0x00000040,
 	.current_srri_addr	= 0x00000044,
 	.current_drri_addr	= 0x00000048,
-	.ddr_addr_for_rri_low	= 0x00000004,
-	.ddr_addr_for_rri_high	= 0x00000008,
 	.ce_rri_low		= 0x0024C004,
 	.ce_rri_high		= 0x0024C008,
 	.host_ie_addr		= 0x0000002c,
@@ -331,6 +335,7 @@ const struct ath10k_hw_ce_regs wcn3990_ce_regs = {
 	.misc_regs		= &wcn3990_misc_reg,
 	.wm_srcr		= &wcn3990_wm_src_ring,
 	.wm_dstr		= &wcn3990_wm_dst_ring,
+	.upd			= &wcn3990_ctrl1_upd,
 };
 
 const struct ath10k_hw_values wcn3990_values = {
diff --git a/drivers/net/wireless/ath/ath10k/hw.h b/drivers/net/wireless/ath/ath10k/hw.h
index dcefde76956c..b8bdabe73073 100644
--- a/drivers/net/wireless/ath/ath10k/hw.h
+++ b/drivers/net/wireless/ath/ath10k/hw.h
@@ -336,6 +336,12 @@ struct ath10k_hw_ce_dst_src_wm_regs {
 	struct ath10k_hw_ce_regs_addr_map *wm_low;
 	struct ath10k_hw_ce_regs_addr_map *wm_high; };
 
+struct ath10k_hw_ce_ctrl1_upd {
+	u32 shift;
+	u32 mask;
+	u32 enable;
+};
+
 struct ath10k_hw_ce_regs {
 	u32 sr_base_addr;
 	u32 sr_size_addr;
@@ -358,7 +364,9 @@ struct ath10k_hw_ce_regs {
 	struct ath10k_hw_ce_cmd_halt *cmd_halt;
 	struct ath10k_hw_ce_host_ie *host_ie;
 	struct ath10k_hw_ce_dst_src_wm_regs *wm_srcr;
-	struct ath10k_hw_ce_dst_src_wm_regs *wm_dstr; };
+	struct ath10k_hw_ce_dst_src_wm_regs *wm_dstr;
+	struct ath10k_hw_ce_ctrl1_upd *upd;
+};
 
 struct ath10k_hw_values {
 	u32 rtc_state_val_on;
@@ -575,6 +583,9 @@ struct ath10k_hw_params {
 
 	/* target supporting shadow register for ce write */
 	bool shadow_reg_support;
+
+	/* target supporting retention restore on ddr */
+	bool rri_on_ddr;
 };
 
 struct htt_rx_desc;
diff --git a/drivers/net/wireless/ath/ath10k/snoc.c b/drivers/net/wireless/ath/ath10k/snoc.c
index 2e490ff124f1..47a4d2a5bd4c 100644
--- a/drivers/net/wireless/ath/ath10k/snoc.c
+++ b/drivers/net/wireless/ath/ath10k/snoc.c
@@ -767,6 +767,7 @@ static void ath10k_snoc_hif_power_down(struct ath10k *ar)
 	ath10k_dbg(ar, ATH10K_DBG_BOOT, "boot hif power down\n");
 
 	ath10k_snoc_wlan_disable(ar);
+	ath10k_ce_free_rri(ar);
 }
 
 static int ath10k_snoc_hif_power_up(struct ath10k *ar)
@@ -782,6 +783,8 @@ static int ath10k_snoc_hif_power_up(struct ath10k *ar)
 		return ret;
 	}
 
+	ath10k_ce_alloc_rri(ar);
+
 	ret = ath10k_snoc_init_pipes(ar);
 	if (ret) {
 		ath10k_err(ar, "failed to initialize CE: %d\n", ret);
-- 
cgit v1.2.3-59-g8ed1b


From d5cded16fdc02a31f9f73d899329896218c594aa Mon Sep 17 00:00:00 2001
From: Govind Singh <govinds@codeaurora.org>
Date: Tue, 17 Apr 2018 17:37:01 +0530
Subject: ath10k: enable sta idle power save

Enable sta power save in fw for the targets that
supports idle power save. The idle ps enable command
will be ignored by the firmware which does not support
this feature.

Signed-off-by: Govind Singh <govinds@codeaurora.org>
Signed-off-by: Rakesh Pillai <pillair@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/mac.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index c71cf5b81385..3d7119ad7c7a 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -4679,6 +4679,13 @@ static int ath10k_start(struct ieee80211_hw *hw)
 		}
 	}
 
+	param = ar->wmi.pdev_param->idle_ps_config;
+	ret = ath10k_wmi_pdev_set_param(ar, param, 1);
+	if (ret && ret != -EOPNOTSUPP) {
+		ath10k_warn(ar, "failed to enable idle_ps_config: %d\n", ret);
+		goto err_core_stop;
+	}
+
 	__ath10k_set_antenna(ar, ar->cfg_tx_chainmask, ar->cfg_rx_chainmask);
 
 	/*
-- 
cgit v1.2.3-59-g8ed1b


From 14ca3c98aa12842615a36812f29dc39b60811c93 Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Tue, 17 Apr 2018 15:23:33 +0200
Subject: wcn36xx: abort scan request when 'dequeued' indicator is sent

When the firmware sends a WCN36XX_HAL_SCAN_IND_DEQUEUED indication,
the request is apparently no longer valid. Attempts to stop the hardware
scan request subsequently will lead to the following error message, and
the hardware is no longer able to communicate with any AP:

[   57.917186] wcn36xx: ERROR hal_stop_scan_offload response failed err=5

Interpreting this indicator message as scan abortion fixes this.

While at it, add a newline to a debug print.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wcn36xx/smd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/ath/wcn36xx/smd.c b/drivers/net/wireless/ath/wcn36xx/smd.c
index 297a785d0785..b7964f79d837 100644
--- a/drivers/net/wireless/ath/wcn36xx/smd.c
+++ b/drivers/net/wireless/ath/wcn36xx/smd.c
@@ -2140,10 +2140,11 @@ static int wcn36xx_smd_hw_scan_ind(struct wcn36xx *wcn, void *buf, size_t len)
 		return -EIO;
 	}
 
-	wcn36xx_dbg(WCN36XX_DBG_HAL, "scan indication (type %x)", rsp->type);
+	wcn36xx_dbg(WCN36XX_DBG_HAL, "scan indication (type %x)\n", rsp->type);
 
 	switch (rsp->type) {
 	case WCN36XX_HAL_SCAN_IND_FAILED:
+	case WCN36XX_HAL_SCAN_IND_DEQUEUED:
 		scan_info.aborted = true;
 		/* fall through */
 	case WCN36XX_HAL_SCAN_IND_COMPLETED:
@@ -2156,7 +2157,6 @@ static int wcn36xx_smd_hw_scan_ind(struct wcn36xx *wcn, void *buf, size_t len)
 		break;
 	case WCN36XX_HAL_SCAN_IND_STARTED:
 	case WCN36XX_HAL_SCAN_IND_FOREIGN_CHANNEL:
-	case WCN36XX_HAL_SCAN_IND_DEQUEUED:
 	case WCN36XX_HAL_SCAN_IND_PREEMPTED:
 	case WCN36XX_HAL_SCAN_IND_RESTARTED:
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From 80c764d32122eceb26754d9ffa9718c124c8072e Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Tue, 17 Apr 2018 15:23:34 +0200
Subject: wcn36xx: cancel pending scan request when interface goes down

When the network interface goes down while a scan request is still
pending that can't be stopped due to firmware hickups, wcn->scan_req
remains set, even though the hardware is deinitialized. This results
in -EBUSY for all scan requests after the interface was brought up
again.

Fix this by explicitly completing pending scan requests in
wcn36xx_stop().

Signed-off-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wcn36xx/main.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/net/wireless/ath/wcn36xx/main.c b/drivers/net/wireless/ath/wcn36xx/main.c
index 749aef3e2b85..08b6939d3f57 100644
--- a/drivers/net/wireless/ath/wcn36xx/main.c
+++ b/drivers/net/wireless/ath/wcn36xx/main.c
@@ -353,6 +353,19 @@ static void wcn36xx_stop(struct ieee80211_hw *hw)
 
 	wcn36xx_dbg(WCN36XX_DBG_MAC, "mac stop\n");
 
+	cancel_work_sync(&wcn->scan_work);
+
+	mutex_lock(&wcn->scan_lock);
+	if (wcn->scan_req) {
+		struct cfg80211_scan_info scan_info = {
+			.aborted = true,
+		};
+
+		ieee80211_scan_completed(wcn->hw, &scan_info);
+	}
+	wcn->scan_req = NULL;
+	mutex_unlock(&wcn->scan_lock);
+
 	wcn36xx_debugfs_exit(wcn);
 	wcn36xx_smd_stop(wcn);
 	wcn36xx_dxe_deinit(wcn);
-- 
cgit v1.2.3-59-g8ed1b


From 89722f5767ac71b66e3077b6a158495eb316a43c Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Tue, 17 Apr 2018 15:23:35 +0200
Subject: wcn36xx: handle scan cancellation when firmware support is missing

For firmwares that don't have the SCAN_OFFLOAD feature bit set, do
not call into wcn36xx_smd_stop_hw_scan(). Instead, stop the asynchronous
work and call into ieee80211_scan_completed() immediately.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wcn36xx/main.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wireless/ath/wcn36xx/main.c b/drivers/net/wireless/ath/wcn36xx/main.c
index 08b6939d3f57..e3b91b3b38ef 100644
--- a/drivers/net/wireless/ath/wcn36xx/main.c
+++ b/drivers/net/wireless/ath/wcn36xx/main.c
@@ -687,10 +687,18 @@ static void wcn36xx_cancel_hw_scan(struct ieee80211_hw *hw,
 	wcn->scan_aborted = true;
 	mutex_unlock(&wcn->scan_lock);
 
-	/* ieee80211_scan_completed will be called on FW scan indication */
-	wcn36xx_smd_stop_hw_scan(wcn);
+	if (get_feat_caps(wcn->fw_feat_caps, SCAN_OFFLOAD)) {
+		/* ieee80211_scan_completed will be called on FW scan
+		 * indication */
+		wcn36xx_smd_stop_hw_scan(wcn);
+	} else {
+		struct cfg80211_scan_info scan_info = {
+			.aborted = true,
+		};
 
-	cancel_work_sync(&wcn->scan_work);
+		cancel_work_sync(&wcn->scan_work);
+		ieee80211_scan_completed(wcn->hw, &scan_info);
+	}
 }
 
 static void wcn36xx_update_allowed_rates(struct ieee80211_sta *sta,
-- 
cgit v1.2.3-59-g8ed1b


From 5a425c8b78e5a3d4850035c737f4f453f528320a Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Tue, 17 Apr 2018 15:23:36 +0200
Subject: wcn36xx: send bss_type in scan requests

Pass the bss_type of the currently configured BSS in the message for the
scan request. Therefore, that setting needs to be kept in struct
wcn36xx_vif.

This seems to be only interesting when scanning for a specific SSID
and doesn't matter for regular wildcard scans.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wcn36xx/smd.c     | 5 ++++-
 drivers/net/wireless/ath/wcn36xx/wcn36xx.h | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/wcn36xx/smd.c b/drivers/net/wireless/ath/wcn36xx/smd.c
index b7964f79d837..9c8ce5e0454f 100644
--- a/drivers/net/wireless/ath/wcn36xx/smd.c
+++ b/drivers/net/wireless/ath/wcn36xx/smd.c
@@ -620,6 +620,7 @@ out:
 int wcn36xx_smd_start_hw_scan(struct wcn36xx *wcn, struct ieee80211_vif *vif,
 			      struct cfg80211_scan_request *req)
 {
+	struct wcn36xx_vif *vif_priv = wcn36xx_vif_to_priv(vif);
 	struct wcn36xx_hal_start_scan_offload_req_msg msg_body;
 	int ret, i;
 
@@ -631,6 +632,7 @@ int wcn36xx_smd_start_hw_scan(struct wcn36xx *wcn, struct ieee80211_vif *vif,
 	msg_body.max_ch_time = 100;
 	msg_body.scan_hidden = 1;
 	memcpy(msg_body.mac, vif->addr, ETH_ALEN);
+	msg_body.bss_type = vif_priv->bss_type;
 	msg_body.p2p_search = vif->p2p;
 
 	msg_body.num_ssid = min_t(u8, req->n_ssids, ARRAY_SIZE(msg_body.ssids));
@@ -1399,9 +1401,10 @@ int wcn36xx_smd_config_bss(struct wcn36xx *wcn, struct ieee80211_vif *vif,
 	bss->spectrum_mgt_enable = 0;
 	bss->tx_mgmt_power = 0;
 	bss->max_tx_power = WCN36XX_MAX_POWER(wcn);
-
 	bss->action = update;
 
+	vif_priv->bss_type = bss->bss_type;
+
 	wcn36xx_dbg(WCN36XX_DBG_HAL,
 		    "hal config bss bssid %pM self_mac_addr %pM bss_type %d oper_mode %d nw_type %d\n",
 		    bss->bssid, bss->self_mac_addr, bss->bss_type,
diff --git a/drivers/net/wireless/ath/wcn36xx/wcn36xx.h b/drivers/net/wireless/ath/wcn36xx/wcn36xx.h
index 5854adf43f3a..22e05cb4eb98 100644
--- a/drivers/net/wireless/ath/wcn36xx/wcn36xx.h
+++ b/drivers/net/wireless/ath/wcn36xx/wcn36xx.h
@@ -123,6 +123,7 @@ struct wcn36xx_vif {
 	bool is_joining;
 	bool sta_assoc;
 	struct wcn36xx_hal_mac_ssid ssid;
+	enum wcn36xx_hal_bss_type bss_type;
 
 	/* Power management */
 	enum wcn36xx_power_state pw_state;
-- 
cgit v1.2.3-59-g8ed1b


From 4836ec42b07433fc8bdaf36a18f74f7a89659bb4 Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Tue, 17 Apr 2018 15:23:37 +0200
Subject: wcn36xx: pass information elements in scan requests

When the wifi driver core passes IE elements in the scan request, append
them to the firmware message. The driver currently tells the core that
it is capable of attaching up to WCN36XX_MAX_SCAN_IE_LEN octets, but
doesn't actually pass them to the the hardware.

Note that this patch doesn't fix a bug that was observed. The change is
merely done for the sake of completeness as the hardware supports
appending IEs in scans. Tests show that network scans work fine with
this patch applied.

Some defines were moved around to avoid cyclic include dependencies.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wcn36xx/hal.h     |  8 +++++++-
 drivers/net/wireless/ath/wcn36xx/smd.c     | 11 +++++++++++
 drivers/net/wireless/ath/wcn36xx/wcn36xx.h |  6 ------
 3 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/drivers/net/wireless/ath/wcn36xx/hal.h b/drivers/net/wireless/ath/wcn36xx/hal.h
index 182963522941..2aed6c233508 100644
--- a/drivers/net/wireless/ath/wcn36xx/hal.h
+++ b/drivers/net/wireless/ath/wcn36xx/hal.h
@@ -88,6 +88,12 @@
 /* version string max length (including NULL) */
 #define WCN36XX_HAL_VERSION_LENGTH  64
 
+/* How many frames until we start a-mpdu TX session */
+#define WCN36XX_AMPDU_START_THRESH	20
+
+#define WCN36XX_MAX_SCAN_SSIDS		9
+#define WCN36XX_MAX_SCAN_IE_LEN		500
+
 /* message types for messages exchanged between WDI and HAL */
 enum wcn36xx_hal_host_msg_type {
 	/* Init/De-Init */
@@ -1170,7 +1176,7 @@ struct wcn36xx_hal_start_scan_offload_req_msg {
 
 	/* IE field */
 	u16 ie_len;
-	u8 ie[0];
+	u8 ie[WCN36XX_MAX_SCAN_IE_LEN];
 } __packed;
 
 struct wcn36xx_hal_start_scan_offload_rsp_msg {
diff --git a/drivers/net/wireless/ath/wcn36xx/smd.c b/drivers/net/wireless/ath/wcn36xx/smd.c
index 9c8ce5e0454f..ea74f2b92df5 100644
--- a/drivers/net/wireless/ath/wcn36xx/smd.c
+++ b/drivers/net/wireless/ath/wcn36xx/smd.c
@@ -624,6 +624,9 @@ int wcn36xx_smd_start_hw_scan(struct wcn36xx *wcn, struct ieee80211_vif *vif,
 	struct wcn36xx_hal_start_scan_offload_req_msg msg_body;
 	int ret, i;
 
+	if (req->ie_len > WCN36XX_MAX_SCAN_IE_LEN)
+		return -EINVAL;
+
 	mutex_lock(&wcn->hal_mutex);
 	INIT_HAL_MSG(msg_body, WCN36XX_HAL_START_SCAN_OFFLOAD_REQ);
 
@@ -648,6 +651,14 @@ int wcn36xx_smd_start_hw_scan(struct wcn36xx *wcn, struct ieee80211_vif *vif,
 	for (i = 0; i < msg_body.num_channel; i++)
 		msg_body.channels[i] = req->channels[i]->hw_value;
 
+	msg_body.header.len -= WCN36XX_MAX_SCAN_IE_LEN;
+
+	if (req->ie_len > 0) {
+		msg_body.ie_len = req->ie_len;
+		msg_body.header.len += req->ie_len;
+		memcpy(msg_body.ie, req->ie, req->ie_len);
+	}
+
 	PREPARE_HAL_BUF(wcn->hal_buf, msg_body);
 
 	wcn36xx_dbg(WCN36XX_DBG_HAL,
diff --git a/drivers/net/wireless/ath/wcn36xx/wcn36xx.h b/drivers/net/wireless/ath/wcn36xx/wcn36xx.h
index 22e05cb4eb98..9343989d1169 100644
--- a/drivers/net/wireless/ath/wcn36xx/wcn36xx.h
+++ b/drivers/net/wireless/ath/wcn36xx/wcn36xx.h
@@ -32,12 +32,6 @@
 #define WLAN_NV_FILE               "wlan/prima/WCNSS_qcom_wlan_nv.bin"
 #define WCN36XX_AGGR_BUFFER_SIZE 64
 
-/* How many frames until we start a-mpdu TX session */
-#define WCN36XX_AMPDU_START_THRESH	20
-
-#define WCN36XX_MAX_SCAN_SSIDS		9
-#define WCN36XX_MAX_SCAN_IE_LEN		500
-
 extern unsigned int wcn36xx_dbg_mask;
 
 enum wcn36xx_debug_mask {
-- 
cgit v1.2.3-59-g8ed1b


From 7d208687176292f4cba4dbb850087a0d6ed2b414 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Mon, 26 Feb 2018 10:15:13 +0100
Subject: netfilter: nf_flow_table: move ipv4 offload hook code to
 nf_flow_table

Allows some minor code sharing with the ipv6 hook code and is also
useful as preparation for adding iptables support for offload

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nf_flow_table_ipv4.c | 241 -------------------------------
 net/netfilter/Makefile                  |   2 +-
 net/netfilter/nf_flow_table_ip.c        | 246 ++++++++++++++++++++++++++++++++
 3 files changed, 247 insertions(+), 242 deletions(-)
 create mode 100644 net/netfilter/nf_flow_table_ip.c

diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c
index 461b1815e633..b6e43ff0c7b7 100644
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -2,249 +2,8 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/netfilter.h>
-#include <linux/rhashtable.h>
-#include <linux/ip.h>
-#include <linux/netdevice.h>
-#include <net/ip.h>
-#include <net/neighbour.h>
 #include <net/netfilter/nf_flow_table.h>
 #include <net/netfilter/nf_tables.h>
-/* For layer 4 checksum field offset. */
-#include <linux/tcp.h>
-#include <linux/udp.h>
-
-static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
-			      __be32 addr, __be32 new_addr)
-{
-	struct tcphdr *tcph;
-
-	if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
-	    skb_try_make_writable(skb, thoff + sizeof(*tcph)))
-		return -1;
-
-	tcph = (void *)(skb_network_header(skb) + thoff);
-	inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true);
-
-	return 0;
-}
-
-static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
-			      __be32 addr, __be32 new_addr)
-{
-	struct udphdr *udph;
-
-	if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
-	    skb_try_make_writable(skb, thoff + sizeof(*udph)))
-		return -1;
-
-	udph = (void *)(skb_network_header(skb) + thoff);
-	if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
-		inet_proto_csum_replace4(&udph->check, skb, addr,
-					 new_addr, true);
-		if (!udph->check)
-			udph->check = CSUM_MANGLED_0;
-	}
-
-	return 0;
-}
-
-static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph,
-				  unsigned int thoff, __be32 addr,
-				  __be32 new_addr)
-{
-	switch (iph->protocol) {
-	case IPPROTO_TCP:
-		if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0)
-			return NF_DROP;
-		break;
-	case IPPROTO_UDP:
-		if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0)
-			return NF_DROP;
-		break;
-	}
-
-	return 0;
-}
-
-static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb,
-			   struct iphdr *iph, unsigned int thoff,
-			   enum flow_offload_tuple_dir dir)
-{
-	__be32 addr, new_addr;
-
-	switch (dir) {
-	case FLOW_OFFLOAD_DIR_ORIGINAL:
-		addr = iph->saddr;
-		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr;
-		iph->saddr = new_addr;
-		break;
-	case FLOW_OFFLOAD_DIR_REPLY:
-		addr = iph->daddr;
-		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr;
-		iph->daddr = new_addr;
-		break;
-	default:
-		return -1;
-	}
-	csum_replace4(&iph->check, addr, new_addr);
-
-	return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
-}
-
-static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb,
-			   struct iphdr *iph, unsigned int thoff,
-			   enum flow_offload_tuple_dir dir)
-{
-	__be32 addr, new_addr;
-
-	switch (dir) {
-	case FLOW_OFFLOAD_DIR_ORIGINAL:
-		addr = iph->daddr;
-		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr;
-		iph->daddr = new_addr;
-		break;
-	case FLOW_OFFLOAD_DIR_REPLY:
-		addr = iph->saddr;
-		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr;
-		iph->saddr = new_addr;
-		break;
-	default:
-		return -1;
-	}
-	csum_replace4(&iph->check, addr, new_addr);
-
-	return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
-}
-
-static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
-			  enum flow_offload_tuple_dir dir)
-{
-	struct iphdr *iph = ip_hdr(skb);
-	unsigned int thoff = iph->ihl * 4;
-
-	if (flow->flags & FLOW_OFFLOAD_SNAT &&
-	    (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
-	     nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0))
-		return -1;
-	if (flow->flags & FLOW_OFFLOAD_DNAT &&
-	    (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
-	     nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0))
-		return -1;
-
-	return 0;
-}
-
-static bool ip_has_options(unsigned int thoff)
-{
-	return thoff != sizeof(struct iphdr);
-}
-
-static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
-			    struct flow_offload_tuple *tuple)
-{
-	struct flow_ports *ports;
-	unsigned int thoff;
-	struct iphdr *iph;
-
-	if (!pskb_may_pull(skb, sizeof(*iph)))
-		return -1;
-
-	iph = ip_hdr(skb);
-	thoff = iph->ihl * 4;
-
-	if (ip_is_fragment(iph) ||
-	    unlikely(ip_has_options(thoff)))
-		return -1;
-
-	if (iph->protocol != IPPROTO_TCP &&
-	    iph->protocol != IPPROTO_UDP)
-		return -1;
-
-	thoff = iph->ihl * 4;
-	if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
-		return -1;
-
-	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
-
-	tuple->src_v4.s_addr	= iph->saddr;
-	tuple->dst_v4.s_addr	= iph->daddr;
-	tuple->src_port		= ports->source;
-	tuple->dst_port		= ports->dest;
-	tuple->l3proto		= AF_INET;
-	tuple->l4proto		= iph->protocol;
-	tuple->iifidx		= dev->ifindex;
-
-	return 0;
-}
-
-/* Based on ip_exceeds_mtu(). */
-static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
-{
-	if (skb->len <= mtu)
-		return false;
-
-	if ((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0)
-		return false;
-
-	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
-		return false;
-
-	return true;
-}
-
-unsigned int
-nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
-			const struct nf_hook_state *state)
-{
-	struct flow_offload_tuple_rhash *tuplehash;
-	struct nf_flowtable *flow_table = priv;
-	struct flow_offload_tuple tuple = {};
-	enum flow_offload_tuple_dir dir;
-	struct flow_offload *flow;
-	struct net_device *outdev;
-	const struct rtable *rt;
-	struct iphdr *iph;
-	__be32 nexthop;
-
-	if (skb->protocol != htons(ETH_P_IP))
-		return NF_ACCEPT;
-
-	if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0)
-		return NF_ACCEPT;
-
-	tuplehash = flow_offload_lookup(flow_table, &tuple);
-	if (tuplehash == NULL)
-		return NF_ACCEPT;
-
-	outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
-	if (!outdev)
-		return NF_ACCEPT;
-
-	dir = tuplehash->tuple.dir;
-	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
-	rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
-
-	if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
-		return NF_ACCEPT;
-
-	if (skb_try_make_writable(skb, sizeof(*iph)))
-		return NF_DROP;
-
-	if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
-	    nf_flow_nat_ip(flow, skb, dir) < 0)
-		return NF_DROP;
-
-	flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
-	iph = ip_hdr(skb);
-	ip_decrease_ttl(iph);
-
-	skb->dev = outdev;
-	nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
-	neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
-
-	return NF_STOLEN;
-}
-EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook);
 
 static struct nf_flowtable_type flowtable_ipv4 = {
 	.family		= NFPROTO_IPV4,
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 700c5d51e405..3103ed1efe17 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -111,7 +111,7 @@ obj-$(CONFIG_NFT_FWD_NETDEV)	+= nft_fwd_netdev.o
 
 # flow table infrastructure
 obj-$(CONFIG_NF_FLOW_TABLE)	+= nf_flow_table.o
-nf_flow_table-objs := nf_flow_table_core.o
+nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o
 
 obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o
 
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
new file mode 100644
index 000000000000..034fda963392
--- /dev/null
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -0,0 +1,246 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rhashtable.h>
+#include <linux/ip.h>
+#include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/neighbour.h>
+#include <net/netfilter/nf_flow_table.h>
+/* For layer 4 checksum field offset. */
+#include <linux/tcp.h>
+#include <linux/udp.h>
+
+static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
+			      __be32 addr, __be32 new_addr)
+{
+	struct tcphdr *tcph;
+
+	if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
+	    skb_try_make_writable(skb, thoff + sizeof(*tcph)))
+		return -1;
+
+	tcph = (void *)(skb_network_header(skb) + thoff);
+	inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true);
+
+	return 0;
+}
+
+static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
+			      __be32 addr, __be32 new_addr)
+{
+	struct udphdr *udph;
+
+	if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
+	    skb_try_make_writable(skb, thoff + sizeof(*udph)))
+		return -1;
+
+	udph = (void *)(skb_network_header(skb) + thoff);
+	if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+		inet_proto_csum_replace4(&udph->check, skb, addr,
+					 new_addr, true);
+		if (!udph->check)
+			udph->check = CSUM_MANGLED_0;
+	}
+
+	return 0;
+}
+
+static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph,
+				  unsigned int thoff, __be32 addr,
+				  __be32 new_addr)
+{
+	switch (iph->protocol) {
+	case IPPROTO_TCP:
+		if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0)
+			return NF_DROP;
+		break;
+	case IPPROTO_UDP:
+		if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0)
+			return NF_DROP;
+		break;
+	}
+
+	return 0;
+}
+
+static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+			   struct iphdr *iph, unsigned int thoff,
+			   enum flow_offload_tuple_dir dir)
+{
+	__be32 addr, new_addr;
+
+	switch (dir) {
+	case FLOW_OFFLOAD_DIR_ORIGINAL:
+		addr = iph->saddr;
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr;
+		iph->saddr = new_addr;
+		break;
+	case FLOW_OFFLOAD_DIR_REPLY:
+		addr = iph->daddr;
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr;
+		iph->daddr = new_addr;
+		break;
+	default:
+		return -1;
+	}
+	csum_replace4(&iph->check, addr, new_addr);
+
+	return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
+}
+
+static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+			   struct iphdr *iph, unsigned int thoff,
+			   enum flow_offload_tuple_dir dir)
+{
+	__be32 addr, new_addr;
+
+	switch (dir) {
+	case FLOW_OFFLOAD_DIR_ORIGINAL:
+		addr = iph->daddr;
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr;
+		iph->daddr = new_addr;
+		break;
+	case FLOW_OFFLOAD_DIR_REPLY:
+		addr = iph->saddr;
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr;
+		iph->saddr = new_addr;
+		break;
+	default:
+		return -1;
+	}
+	csum_replace4(&iph->check, addr, new_addr);
+
+	return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
+}
+
+static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+			  enum flow_offload_tuple_dir dir)
+{
+	struct iphdr *iph = ip_hdr(skb);
+	unsigned int thoff = iph->ihl * 4;
+
+	if (flow->flags & FLOW_OFFLOAD_SNAT &&
+	    (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
+	     nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0))
+		return -1;
+	if (flow->flags & FLOW_OFFLOAD_DNAT &&
+	    (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
+	     nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0))
+		return -1;
+
+	return 0;
+}
+
+static bool ip_has_options(unsigned int thoff)
+{
+	return thoff != sizeof(struct iphdr);
+}
+
+static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
+			    struct flow_offload_tuple *tuple)
+{
+	struct flow_ports *ports;
+	unsigned int thoff;
+	struct iphdr *iph;
+
+	if (!pskb_may_pull(skb, sizeof(*iph)))
+		return -1;
+
+	iph = ip_hdr(skb);
+	thoff = iph->ihl * 4;
+
+	if (ip_is_fragment(iph) ||
+	    unlikely(ip_has_options(thoff)))
+		return -1;
+
+	if (iph->protocol != IPPROTO_TCP &&
+	    iph->protocol != IPPROTO_UDP)
+		return -1;
+
+	thoff = iph->ihl * 4;
+	if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
+		return -1;
+
+	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+
+	tuple->src_v4.s_addr	= iph->saddr;
+	tuple->dst_v4.s_addr	= iph->daddr;
+	tuple->src_port		= ports->source;
+	tuple->dst_port		= ports->dest;
+	tuple->l3proto		= AF_INET;
+	tuple->l4proto		= iph->protocol;
+	tuple->iifidx		= dev->ifindex;
+
+	return 0;
+}
+
+/* Based on ip_exceeds_mtu(). */
+static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+{
+	if (skb->len <= mtu)
+		return false;
+
+	if ((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0)
+		return false;
+
+	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
+		return false;
+
+	return true;
+}
+
+unsigned int
+nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
+			const struct nf_hook_state *state)
+{
+	struct flow_offload_tuple_rhash *tuplehash;
+	struct nf_flowtable *flow_table = priv;
+	struct flow_offload_tuple tuple = {};
+	enum flow_offload_tuple_dir dir;
+	struct flow_offload *flow;
+	struct net_device *outdev;
+	const struct rtable *rt;
+	struct iphdr *iph;
+	__be32 nexthop;
+
+	if (skb->protocol != htons(ETH_P_IP))
+		return NF_ACCEPT;
+
+	if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0)
+		return NF_ACCEPT;
+
+	tuplehash = flow_offload_lookup(flow_table, &tuple);
+	if (tuplehash == NULL)
+		return NF_ACCEPT;
+
+	outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
+	if (!outdev)
+		return NF_ACCEPT;
+
+	dir = tuplehash->tuple.dir;
+	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+	rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
+
+	if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
+		return NF_ACCEPT;
+
+	if (skb_try_make_writable(skb, sizeof(*iph)))
+		return NF_DROP;
+
+	if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
+	    nf_flow_nat_ip(flow, skb, dir) < 0)
+		return NF_DROP;
+
+	flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
+	iph = ip_hdr(skb);
+	ip_decrease_ttl(iph);
+
+	skb->dev = outdev;
+	nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
+	neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
+
+	return NF_STOLEN;
+}
+EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook);
-- 
cgit v1.2.3-59-g8ed1b


From 3aeb51d7e7b8dcc641a4238b162ed72fdc2c0b23 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Mon, 26 Feb 2018 10:15:14 +0100
Subject: netfilter: nf_flow_table: move ip header check out of
 nf_flow_exceeds_mtu

Allows the function to be shared with the IPv6 hook code

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_flow_table_ip.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 034fda963392..103263e0c7c2 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -182,9 +182,6 @@ static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
 	if (skb->len <= mtu)
 		return false;
 
-	if ((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0)
-		return false;
-
 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
 		return false;
 
@@ -223,7 +220,8 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
 	rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
 
-	if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
+	if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)) &&
+	    (ip_hdr(skb)->frag_off & htons(IP_DF)) != 0)
 		return NF_ACCEPT;
 
 	if (skb_try_make_writable(skb, sizeof(*iph)))
-- 
cgit v1.2.3-59-g8ed1b


From a908fdec3dda2902edef18658d5cb39127d1051e Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Mon, 26 Feb 2018 10:15:15 +0100
Subject: netfilter: nf_flow_table: move ipv6 offload hook code to
 nf_flow_table

Useful as preparation for adding iptables support for offload.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/nf_flow_table_ipv6.c | 232 --------------------------------
 net/netfilter/nf_flow_table_ip.c        | 215 +++++++++++++++++++++++++++++
 2 files changed, 215 insertions(+), 232 deletions(-)

diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c
index 0e6328490142..f1804ce8d561 100644
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -3,240 +3,8 @@
 #include <linux/module.h>
 #include <linux/netfilter.h>
 #include <linux/rhashtable.h>
-#include <linux/ipv6.h>
-#include <linux/netdevice.h>
-#include <net/ipv6.h>
-#include <net/ip6_route.h>
-#include <net/neighbour.h>
 #include <net/netfilter/nf_flow_table.h>
 #include <net/netfilter/nf_tables.h>
-/* For layer 4 checksum field offset. */
-#include <linux/tcp.h>
-#include <linux/udp.h>
-
-static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff,
-				struct in6_addr *addr,
-				struct in6_addr *new_addr)
-{
-	struct tcphdr *tcph;
-
-	if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
-	    skb_try_make_writable(skb, thoff + sizeof(*tcph)))
-		return -1;
-
-	tcph = (void *)(skb_network_header(skb) + thoff);
-	inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32,
-				  new_addr->s6_addr32, true);
-
-	return 0;
-}
-
-static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
-				struct in6_addr *addr,
-				struct in6_addr *new_addr)
-{
-	struct udphdr *udph;
-
-	if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
-	    skb_try_make_writable(skb, thoff + sizeof(*udph)))
-		return -1;
-
-	udph = (void *)(skb_network_header(skb) + thoff);
-	if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
-		inet_proto_csum_replace16(&udph->check, skb, addr->s6_addr32,
-					  new_addr->s6_addr32, true);
-		if (!udph->check)
-			udph->check = CSUM_MANGLED_0;
-	}
-
-	return 0;
-}
-
-static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h,
-				    unsigned int thoff, struct in6_addr *addr,
-				    struct in6_addr *new_addr)
-{
-	switch (ip6h->nexthdr) {
-	case IPPROTO_TCP:
-		if (nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr) < 0)
-			return NF_DROP;
-		break;
-	case IPPROTO_UDP:
-		if (nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr) < 0)
-			return NF_DROP;
-		break;
-	}
-
-	return 0;
-}
-
-static int nf_flow_snat_ipv6(const struct flow_offload *flow,
-			     struct sk_buff *skb, struct ipv6hdr *ip6h,
-			     unsigned int thoff,
-			     enum flow_offload_tuple_dir dir)
-{
-	struct in6_addr addr, new_addr;
-
-	switch (dir) {
-	case FLOW_OFFLOAD_DIR_ORIGINAL:
-		addr = ip6h->saddr;
-		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6;
-		ip6h->saddr = new_addr;
-		break;
-	case FLOW_OFFLOAD_DIR_REPLY:
-		addr = ip6h->daddr;
-		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6;
-		ip6h->daddr = new_addr;
-		break;
-	default:
-		return -1;
-	}
-
-	return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
-}
-
-static int nf_flow_dnat_ipv6(const struct flow_offload *flow,
-			     struct sk_buff *skb, struct ipv6hdr *ip6h,
-			     unsigned int thoff,
-			     enum flow_offload_tuple_dir dir)
-{
-	struct in6_addr addr, new_addr;
-
-	switch (dir) {
-	case FLOW_OFFLOAD_DIR_ORIGINAL:
-		addr = ip6h->daddr;
-		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6;
-		ip6h->daddr = new_addr;
-		break;
-	case FLOW_OFFLOAD_DIR_REPLY:
-		addr = ip6h->saddr;
-		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6;
-		ip6h->saddr = new_addr;
-		break;
-	default:
-		return -1;
-	}
-
-	return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
-}
-
-static int nf_flow_nat_ipv6(const struct flow_offload *flow,
-			    struct sk_buff *skb,
-			    enum flow_offload_tuple_dir dir)
-{
-	struct ipv6hdr *ip6h = ipv6_hdr(skb);
-	unsigned int thoff = sizeof(*ip6h);
-
-	if (flow->flags & FLOW_OFFLOAD_SNAT &&
-	    (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
-	     nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
-		return -1;
-	if (flow->flags & FLOW_OFFLOAD_DNAT &&
-	    (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
-	     nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
-		return -1;
-
-	return 0;
-}
-
-static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
-			      struct flow_offload_tuple *tuple)
-{
-	struct flow_ports *ports;
-	struct ipv6hdr *ip6h;
-	unsigned int thoff;
-
-	if (!pskb_may_pull(skb, sizeof(*ip6h)))
-		return -1;
-
-	ip6h = ipv6_hdr(skb);
-
-	if (ip6h->nexthdr != IPPROTO_TCP &&
-	    ip6h->nexthdr != IPPROTO_UDP)
-		return -1;
-
-	thoff = sizeof(*ip6h);
-	if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
-		return -1;
-
-	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
-
-	tuple->src_v6		= ip6h->saddr;
-	tuple->dst_v6		= ip6h->daddr;
-	tuple->src_port		= ports->source;
-	tuple->dst_port		= ports->dest;
-	tuple->l3proto		= AF_INET6;
-	tuple->l4proto		= ip6h->nexthdr;
-	tuple->iifidx		= dev->ifindex;
-
-	return 0;
-}
-
-/* Based on ip_exceeds_mtu(). */
-static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
-{
-	if (skb->len <= mtu)
-		return false;
-
-	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
-		return false;
-
-	return true;
-}
-
-unsigned int
-nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
-			  const struct nf_hook_state *state)
-{
-	struct flow_offload_tuple_rhash *tuplehash;
-	struct nf_flowtable *flow_table = priv;
-	struct flow_offload_tuple tuple = {};
-	enum flow_offload_tuple_dir dir;
-	struct flow_offload *flow;
-	struct net_device *outdev;
-	struct in6_addr *nexthop;
-	struct ipv6hdr *ip6h;
-	struct rt6_info *rt;
-
-	if (skb->protocol != htons(ETH_P_IPV6))
-		return NF_ACCEPT;
-
-	if (nf_flow_tuple_ipv6(skb, state->in, &tuple) < 0)
-		return NF_ACCEPT;
-
-	tuplehash = flow_offload_lookup(flow_table, &tuple);
-	if (tuplehash == NULL)
-		return NF_ACCEPT;
-
-	outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
-	if (!outdev)
-		return NF_ACCEPT;
-
-	dir = tuplehash->tuple.dir;
-	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
-	rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache;
-
-	if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
-		return NF_ACCEPT;
-
-	if (skb_try_make_writable(skb, sizeof(*ip6h)))
-		return NF_DROP;
-
-	if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
-	    nf_flow_nat_ipv6(flow, skb, dir) < 0)
-		return NF_DROP;
-
-	flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
-	ip6h = ipv6_hdr(skb);
-	ip6h->hop_limit--;
-
-	skb->dev = outdev;
-	nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
-	neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
-
-	return NF_STOLEN;
-}
-EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook);
 
 static struct nf_flowtable_type flowtable_ipv6 = {
 	.family		= NFPROTO_IPV6,
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 103263e0c7c2..dc570fb7641d 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -4,8 +4,11 @@
 #include <linux/netfilter.h>
 #include <linux/rhashtable.h>
 #include <linux/ip.h>
+#include <linux/ipv6.h>
 #include <linux/netdevice.h>
 #include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
 #include <net/neighbour.h>
 #include <net/netfilter/nf_flow_table.h>
 /* For layer 4 checksum field offset. */
@@ -242,3 +245,215 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 	return NF_STOLEN;
 }
 EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook);
+
+static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff,
+				struct in6_addr *addr,
+				struct in6_addr *new_addr)
+{
+	struct tcphdr *tcph;
+
+	if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
+	    skb_try_make_writable(skb, thoff + sizeof(*tcph)))
+		return -1;
+
+	tcph = (void *)(skb_network_header(skb) + thoff);
+	inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32,
+				  new_addr->s6_addr32, true);
+
+	return 0;
+}
+
+static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
+				struct in6_addr *addr,
+				struct in6_addr *new_addr)
+{
+	struct udphdr *udph;
+
+	if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
+	    skb_try_make_writable(skb, thoff + sizeof(*udph)))
+		return -1;
+
+	udph = (void *)(skb_network_header(skb) + thoff);
+	if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+		inet_proto_csum_replace16(&udph->check, skb, addr->s6_addr32,
+					  new_addr->s6_addr32, true);
+		if (!udph->check)
+			udph->check = CSUM_MANGLED_0;
+	}
+
+	return 0;
+}
+
+static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h,
+				    unsigned int thoff, struct in6_addr *addr,
+				    struct in6_addr *new_addr)
+{
+	switch (ip6h->nexthdr) {
+	case IPPROTO_TCP:
+		if (nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr) < 0)
+			return NF_DROP;
+		break;
+	case IPPROTO_UDP:
+		if (nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr) < 0)
+			return NF_DROP;
+		break;
+	}
+
+	return 0;
+}
+
+static int nf_flow_snat_ipv6(const struct flow_offload *flow,
+			     struct sk_buff *skb, struct ipv6hdr *ip6h,
+			     unsigned int thoff,
+			     enum flow_offload_tuple_dir dir)
+{
+	struct in6_addr addr, new_addr;
+
+	switch (dir) {
+	case FLOW_OFFLOAD_DIR_ORIGINAL:
+		addr = ip6h->saddr;
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6;
+		ip6h->saddr = new_addr;
+		break;
+	case FLOW_OFFLOAD_DIR_REPLY:
+		addr = ip6h->daddr;
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6;
+		ip6h->daddr = new_addr;
+		break;
+	default:
+		return -1;
+	}
+
+	return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
+}
+
+static int nf_flow_dnat_ipv6(const struct flow_offload *flow,
+			     struct sk_buff *skb, struct ipv6hdr *ip6h,
+			     unsigned int thoff,
+			     enum flow_offload_tuple_dir dir)
+{
+	struct in6_addr addr, new_addr;
+
+	switch (dir) {
+	case FLOW_OFFLOAD_DIR_ORIGINAL:
+		addr = ip6h->daddr;
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6;
+		ip6h->daddr = new_addr;
+		break;
+	case FLOW_OFFLOAD_DIR_REPLY:
+		addr = ip6h->saddr;
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6;
+		ip6h->saddr = new_addr;
+		break;
+	default:
+		return -1;
+	}
+
+	return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
+}
+
+static int nf_flow_nat_ipv6(const struct flow_offload *flow,
+			    struct sk_buff *skb,
+			    enum flow_offload_tuple_dir dir)
+{
+	struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	unsigned int thoff = sizeof(*ip6h);
+
+	if (flow->flags & FLOW_OFFLOAD_SNAT &&
+	    (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
+	     nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
+		return -1;
+	if (flow->flags & FLOW_OFFLOAD_DNAT &&
+	    (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
+	     nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
+		return -1;
+
+	return 0;
+}
+
+static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
+			      struct flow_offload_tuple *tuple)
+{
+	struct flow_ports *ports;
+	struct ipv6hdr *ip6h;
+	unsigned int thoff;
+
+	if (!pskb_may_pull(skb, sizeof(*ip6h)))
+		return -1;
+
+	ip6h = ipv6_hdr(skb);
+
+	if (ip6h->nexthdr != IPPROTO_TCP &&
+	    ip6h->nexthdr != IPPROTO_UDP)
+		return -1;
+
+	thoff = sizeof(*ip6h);
+	if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
+		return -1;
+
+	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+
+	tuple->src_v6		= ip6h->saddr;
+	tuple->dst_v6		= ip6h->daddr;
+	tuple->src_port		= ports->source;
+	tuple->dst_port		= ports->dest;
+	tuple->l3proto		= AF_INET6;
+	tuple->l4proto		= ip6h->nexthdr;
+	tuple->iifidx		= dev->ifindex;
+
+	return 0;
+}
+
+unsigned int
+nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
+			  const struct nf_hook_state *state)
+{
+	struct flow_offload_tuple_rhash *tuplehash;
+	struct nf_flowtable *flow_table = priv;
+	struct flow_offload_tuple tuple = {};
+	enum flow_offload_tuple_dir dir;
+	struct flow_offload *flow;
+	struct net_device *outdev;
+	struct in6_addr *nexthop;
+	struct ipv6hdr *ip6h;
+	struct rt6_info *rt;
+
+	if (skb->protocol != htons(ETH_P_IPV6))
+		return NF_ACCEPT;
+
+	if (nf_flow_tuple_ipv6(skb, state->in, &tuple) < 0)
+		return NF_ACCEPT;
+
+	tuplehash = flow_offload_lookup(flow_table, &tuple);
+	if (tuplehash == NULL)
+		return NF_ACCEPT;
+
+	outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
+	if (!outdev)
+		return NF_ACCEPT;
+
+	dir = tuplehash->tuple.dir;
+	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+	rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache;
+
+	if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
+		return NF_ACCEPT;
+
+	if (skb_try_make_writable(skb, sizeof(*ip6h)))
+		return NF_DROP;
+
+	if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
+	    nf_flow_nat_ipv6(flow, skb, dir) < 0)
+		return NF_DROP;
+
+	flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
+	ip6h = ipv6_hdr(skb);
+	ip6h->hop_limit--;
+
+	skb->dev = outdev;
+	nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
+	neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
+
+	return NF_STOLEN;
+}
+EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook);
-- 
cgit v1.2.3-59-g8ed1b


From 1e80380b14681f229fc6bcd3d53f001f4fc0a463 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Mon, 26 Feb 2018 10:15:16 +0100
Subject: netfilter: nf_flow_table: relax mixed ipv4/ipv6 flowtable
 dependencies

Since the offload hook code was moved, this table no longer depends on
the IPv4 and IPv6 flowtable modules

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/Kconfig | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 704b3832dbad..d20664b02ae4 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -666,8 +666,7 @@ endif # NF_TABLES
 
 config NF_FLOW_TABLE_INET
 	tristate "Netfilter flow table mixed IPv4/IPv6 module"
-	depends on NF_FLOW_TABLE_IPV4
-	depends on NF_FLOW_TABLE_IPV6
+	depends on NF_FLOW_TABLE
 	help
           This option adds the flow table mixed IPv4/IPv6 support.
 
-- 
cgit v1.2.3-59-g8ed1b


From a268de77faf6881756b4943b287fd78ec05a7d1e Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Mon, 26 Feb 2018 10:15:17 +0100
Subject: netfilter: nf_flow_table: move init code to nf_flow_table_core.c

Reduces duplication of .gc and .params in flowtable type definitions and
makes the API clearer

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h   |   6 +-
 net/ipv4/netfilter/nf_flow_table_ipv4.c |   3 +-
 net/ipv6/netfilter/nf_flow_table_ipv6.c |   3 +-
 net/netfilter/nf_flow_table_core.c      | 102 +++++++++++++++++++-------------
 net/netfilter/nf_flow_table_inet.c      |   3 +-
 net/netfilter/nf_tables_api.c           |  22 +++----
 6 files changed, 74 insertions(+), 65 deletions(-)

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index 76ee5c81b752..f876e32a60b8 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -14,9 +14,8 @@ struct nf_flowtable;
 struct nf_flowtable_type {
 	struct list_head		list;
 	int				family;
-	void				(*gc)(struct work_struct *work);
+	int				(*init)(struct nf_flowtable *ft);
 	void				(*free)(struct nf_flowtable *ft);
-	const struct rhashtable_params	*params;
 	nf_hookfn			*hook;
 	struct module			*owner;
 };
@@ -100,9 +99,8 @@ int nf_flow_table_iterate(struct nf_flowtable *flow_table,
 
 void nf_flow_table_cleanup(struct net *net, struct net_device *dev);
 
+int nf_flow_table_init(struct nf_flowtable *flow_table);
 void nf_flow_table_free(struct nf_flowtable *flow_table);
-void nf_flow_offload_work_gc(struct work_struct *work);
-extern const struct rhashtable_params nf_flow_offload_rhash_params;
 
 void flow_offload_dead(struct flow_offload *flow);
 
diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c
index b6e43ff0c7b7..e1e56d7123d2 100644
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -7,8 +7,7 @@
 
 static struct nf_flowtable_type flowtable_ipv4 = {
 	.family		= NFPROTO_IPV4,
-	.params		= &nf_flow_offload_rhash_params,
-	.gc		= nf_flow_offload_work_gc,
+	.init		= nf_flow_table_init,
 	.free		= nf_flow_table_free,
 	.hook		= nf_flow_offload_ip_hook,
 	.owner		= THIS_MODULE,
diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c
index f1804ce8d561..c511d206bf9b 100644
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -8,8 +8,7 @@
 
 static struct nf_flowtable_type flowtable_ipv6 = {
 	.family		= NFPROTO_IPV6,
-	.params		= &nf_flow_offload_rhash_params,
-	.gc		= nf_flow_offload_work_gc,
+	.init		= nf_flow_table_init,
 	.free		= nf_flow_table_free,
 	.hook		= nf_flow_offload_ipv6_hook,
 	.owner		= THIS_MODULE,
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 7403a0dfddf7..09d1be669c39 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -116,16 +116,50 @@ void flow_offload_dead(struct flow_offload *flow)
 }
 EXPORT_SYMBOL_GPL(flow_offload_dead);
 
+static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
+{
+	const struct flow_offload_tuple *tuple = data;
+
+	return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
+}
+
+static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
+{
+	const struct flow_offload_tuple_rhash *tuplehash = data;
+
+	return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
+}
+
+static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
+					const void *ptr)
+{
+	const struct flow_offload_tuple *tuple = arg->key;
+	const struct flow_offload_tuple_rhash *x = ptr;
+
+	if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
+		return 1;
+
+	return 0;
+}
+
+static const struct rhashtable_params nf_flow_offload_rhash_params = {
+	.head_offset		= offsetof(struct flow_offload_tuple_rhash, node),
+	.hashfn			= flow_offload_hash,
+	.obj_hashfn		= flow_offload_hash_obj,
+	.obj_cmpfn		= flow_offload_hash_cmp,
+	.automatic_shrinking	= true,
+};
+
 int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
 {
 	flow->timeout = (u32)jiffies;
 
 	rhashtable_insert_fast(&flow_table->rhashtable,
 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
-			       *flow_table->type->params);
+			       nf_flow_offload_rhash_params);
 	rhashtable_insert_fast(&flow_table->rhashtable,
 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
-			       *flow_table->type->params);
+			       nf_flow_offload_rhash_params);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(flow_offload_add);
@@ -135,10 +169,10 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
 {
 	rhashtable_remove_fast(&flow_table->rhashtable,
 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
-			       *flow_table->type->params);
+			       nf_flow_offload_rhash_params);
 	rhashtable_remove_fast(&flow_table->rhashtable,
 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
-			       *flow_table->type->params);
+			       nf_flow_offload_rhash_params);
 
 	flow_offload_free(flow);
 }
@@ -148,7 +182,7 @@ flow_offload_lookup(struct nf_flowtable *flow_table,
 		    struct flow_offload_tuple *tuple)
 {
 	return rhashtable_lookup_fast(&flow_table->rhashtable, tuple,
-				      *flow_table->type->params);
+				      nf_flow_offload_rhash_params);
 }
 EXPORT_SYMBOL_GPL(flow_offload_lookup);
 
@@ -237,7 +271,7 @@ out:
 	return 1;
 }
 
-void nf_flow_offload_work_gc(struct work_struct *work)
+static void nf_flow_offload_work_gc(struct work_struct *work)
 {
 	struct nf_flowtable *flow_table;
 
@@ -245,42 +279,6 @@ void nf_flow_offload_work_gc(struct work_struct *work)
 	nf_flow_offload_gc_step(flow_table);
 	queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
 }
-EXPORT_SYMBOL_GPL(nf_flow_offload_work_gc);
-
-static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
-{
-	const struct flow_offload_tuple *tuple = data;
-
-	return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
-}
-
-static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
-{
-	const struct flow_offload_tuple_rhash *tuplehash = data;
-
-	return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
-}
-
-static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
-					const void *ptr)
-{
-	const struct flow_offload_tuple *tuple = arg->key;
-	const struct flow_offload_tuple_rhash *x = ptr;
-
-	if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
-		return 1;
-
-	return 0;
-}
-
-const struct rhashtable_params nf_flow_offload_rhash_params = {
-	.head_offset		= offsetof(struct flow_offload_tuple_rhash, node),
-	.hashfn			= flow_offload_hash,
-	.obj_hashfn		= flow_offload_hash_obj,
-	.obj_cmpfn		= flow_offload_hash_cmp,
-	.automatic_shrinking	= true,
-};
-EXPORT_SYMBOL_GPL(nf_flow_offload_rhash_params);
 
 static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
 				__be16 port, __be16 new_port)
@@ -398,6 +396,24 @@ int nf_flow_dnat_port(const struct flow_offload *flow,
 }
 EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
 
+int nf_flow_table_init(struct nf_flowtable *flowtable)
+{
+	int err;
+
+	INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
+
+	err = rhashtable_init(&flowtable->rhashtable,
+			      &nf_flow_offload_rhash_params);
+	if (err < 0)
+		return err;
+
+	queue_delayed_work(system_power_efficient_wq,
+			   &flowtable->gc_work, HZ);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_flow_table_init);
+
 static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
 {
 	struct net_device *dev = data;
@@ -423,8 +439,10 @@ EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
 
 void nf_flow_table_free(struct nf_flowtable *flow_table)
 {
+	cancel_delayed_work_sync(&flow_table->gc_work);
 	nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
 	WARN_ON(!nf_flow_offload_gc_step(flow_table));
+	rhashtable_destroy(&flow_table->rhashtable);
 }
 EXPORT_SYMBOL_GPL(nf_flow_table_free);
 
diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c
index 375a1881d93d..99771aa7e7ea 100644
--- a/net/netfilter/nf_flow_table_inet.c
+++ b/net/netfilter/nf_flow_table_inet.c
@@ -22,8 +22,7 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb,
 
 static struct nf_flowtable_type flowtable_inet = {
 	.family		= NFPROTO_INET,
-	.params		= &nf_flow_offload_rhash_params,
-	.gc		= nf_flow_offload_work_gc,
+	.init		= nf_flow_table_init,
 	.free		= nf_flow_table_free,
 	.hook		= nf_flow_offload_inet_hook,
 	.owner		= THIS_MODULE,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 9134cc429ad4..6cd9955916e5 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5150,14 +5150,14 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
 	}
 
 	flowtable->data.type = type;
-	err = rhashtable_init(&flowtable->data.rhashtable, type->params);
+	err = type->init(&flowtable->data);
 	if (err < 0)
 		goto err3;
 
 	err = nf_tables_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK],
 					     flowtable);
 	if (err < 0)
-		goto err3;
+		goto err4;
 
 	for (i = 0; i < flowtable->ops_len; i++) {
 		if (!flowtable->ops[i].dev)
@@ -5171,37 +5171,35 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
 				if (flowtable->ops[i].dev == ft->ops[k].dev &&
 				    flowtable->ops[i].pf == ft->ops[k].pf) {
 					err = -EBUSY;
-					goto err4;
+					goto err5;
 				}
 			}
 		}
 
 		err = nf_register_net_hook(net, &flowtable->ops[i]);
 		if (err < 0)
-			goto err4;
+			goto err5;
 	}
 
 	err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable);
 	if (err < 0)
-		goto err5;
-
-	INIT_DEFERRABLE_WORK(&flowtable->data.gc_work, type->gc);
-	queue_delayed_work(system_power_efficient_wq,
-			   &flowtable->data.gc_work, HZ);
+		goto err6;
 
 	list_add_tail_rcu(&flowtable->list, &table->flowtables);
 	table->use++;
 
 	return 0;
-err5:
+err6:
 	i = flowtable->ops_len;
-err4:
+err5:
 	for (k = i - 1; k >= 0; k--) {
 		kfree(flowtable->dev_name[k]);
 		nf_unregister_net_hook(net, &flowtable->ops[k]);
 	}
 
 	kfree(flowtable->ops);
+err4:
+	flowtable->data.type->free(&flowtable->data);
 err3:
 	module_put(type->owner);
 err2:
@@ -5485,11 +5483,9 @@ err:
 
 static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
 {
-	cancel_delayed_work_sync(&flowtable->data.gc_work);
 	kfree(flowtable->ops);
 	kfree(flowtable->name);
 	flowtable->data.type->free(&flowtable->data);
-	rhashtable_destroy(&flowtable->data.rhashtable);
 	module_put(flowtable->data.type->owner);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 17857d9299225abb55bf6a61eb73f72de1c94625 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Mon, 26 Feb 2018 10:15:18 +0100
Subject: netfilter: nf_flow_table: fix priv pointer for netdev hook

The offload ip hook expects a pointer to the flowtable, not to the
rhashtable. Since the rhashtable is the first member, this is safe for
the moment, but breaks as soon as the structure layout changes

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 6cd9955916e5..517bb93c00fb 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5019,7 +5019,7 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx,
 		flowtable->ops[i].pf		= NFPROTO_NETDEV;
 		flowtable->ops[i].hooknum	= hooknum;
 		flowtable->ops[i].priority	= priority;
-		flowtable->ops[i].priv		= &flowtable->data.rhashtable;
+		flowtable->ops[i].priv		= &flowtable->data;
 		flowtable->ops[i].hook		= flowtable->data.type->hook;
 		flowtable->ops[i].dev		= dev_array[i];
 		flowtable->dev_name[i]		= kstrdup(dev_array[i]->name,
-- 
cgit v1.2.3-59-g8ed1b


From 84453a90252ca0cd7d1bd229199a40c58bfe431e Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Mon, 26 Feb 2018 10:15:19 +0100
Subject: netfilter: nf_flow_table: track flow tables in nf_flow_table directly

Avoids having nf_flow_table depend on nftables (useful for future
iptables backport work)

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h |  1 +
 include/net/netfilter/nf_tables.h     |  3 ---
 net/netfilter/nf_flow_table_core.c    | 21 ++++++++++++++++++---
 net/netfilter/nf_tables_api.c         | 17 -----------------
 4 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index f876e32a60b8..ab408adba688 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -21,6 +21,7 @@ struct nf_flowtable_type {
 };
 
 struct nf_flowtable {
+	struct list_head		list;
 	struct rhashtable		rhashtable;
 	const struct nf_flowtable_type	*type;
 	struct delayed_work		gc_work;
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index cd368d1b8cb8..2f2062ae1c45 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1109,9 +1109,6 @@ struct nft_flowtable {
 struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table,
 						 const struct nlattr *nla,
 						 u8 genmask);
-void nft_flow_table_iterate(struct net *net,
-			    void (*iter)(struct nf_flowtable *flowtable, void *data),
-			    void *data);
 
 void nft_register_flowtable_type(struct nf_flowtable_type *type);
 void nft_unregister_flowtable_type(struct nf_flowtable_type *type);
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 09d1be669c39..e761359b56a9 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -18,6 +18,9 @@ struct flow_offload_entry {
 	struct rcu_head		rcu_head;
 };
 
+static DEFINE_MUTEX(flowtable_lock);
+static LIST_HEAD(flowtables);
+
 static void
 flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
 		      struct nf_flow_route *route,
@@ -410,6 +413,10 @@ int nf_flow_table_init(struct nf_flowtable *flowtable)
 	queue_delayed_work(system_power_efficient_wq,
 			   &flowtable->gc_work, HZ);
 
+	mutex_lock(&flowtable_lock);
+	list_add(&flowtable->list, &flowtables);
+	mutex_unlock(&flowtable_lock);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(nf_flow_table_init);
@@ -425,20 +432,28 @@ static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
 }
 
 static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable,
-					  void *data)
+					  struct net_device *dev)
 {
-	nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, data);
+	nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev);
 	flush_delayed_work(&flowtable->gc_work);
 }
 
 void nf_flow_table_cleanup(struct net *net, struct net_device *dev)
 {
-	nft_flow_table_iterate(net, nf_flow_table_iterate_cleanup, dev);
+	struct nf_flowtable *flowtable;
+
+	mutex_lock(&flowtable_lock);
+	list_for_each_entry(flowtable, &flowtables, list)
+		nf_flow_table_iterate_cleanup(flowtable, dev);
+	mutex_unlock(&flowtable_lock);
 }
 EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
 
 void nf_flow_table_free(struct nf_flowtable *flow_table)
 {
+	mutex_lock(&flowtable_lock);
+	list_del(&flow_table->list);
+	mutex_unlock(&flowtable_lock);
 	cancel_delayed_work_sync(&flow_table->gc_work);
 	nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
 	WARN_ON(!nf_flow_offload_gc_step(flow_table));
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 517bb93c00fb..16b67f54b3d2 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5060,23 +5060,6 @@ static const struct nf_flowtable_type *nft_flowtable_type_get(u8 family)
 	return ERR_PTR(-ENOENT);
 }
 
-void nft_flow_table_iterate(struct net *net,
-			    void (*iter)(struct nf_flowtable *flowtable, void *data),
-			    void *data)
-{
-	struct nft_flowtable *flowtable;
-	const struct nft_table *table;
-
-	nfnl_lock(NFNL_SUBSYS_NFTABLES);
-	list_for_each_entry(table, &net->nft.tables, list) {
-		list_for_each_entry(flowtable, &table->flowtables, list) {
-			iter(&flowtable->data, data);
-		}
-	}
-	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-}
-EXPORT_SYMBOL_GPL(nft_flow_table_iterate);
-
 static void nft_unregister_flowtable_net_hooks(struct net *net,
 					       struct nft_flowtable *flowtable)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 6bdc3c68d94c5d6adc675ee55361962e9dd2489d Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Mon, 26 Feb 2018 10:15:20 +0100
Subject: netfilter: nf_flow_table: make flow_offload_dead inline

It is too trivial to keep as a separate exported function

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h | 5 ++++-
 net/netfilter/nf_flow_table_core.c    | 6 ------
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index ab408adba688..5aa49524ebef 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -103,7 +103,10 @@ void nf_flow_table_cleanup(struct net *net, struct net_device *dev);
 int nf_flow_table_init(struct nf_flowtable *flow_table);
 void nf_flow_table_free(struct nf_flowtable *flow_table);
 
-void flow_offload_dead(struct flow_offload *flow);
+static inline void flow_offload_dead(struct flow_offload *flow)
+{
+	flow->flags |= FLOW_OFFLOAD_DYING;
+}
 
 int nf_flow_snat_port(const struct flow_offload *flow,
 		      struct sk_buff *skb, unsigned int thoff,
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index e761359b56a9..0d38f20fd226 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -113,12 +113,6 @@ void flow_offload_free(struct flow_offload *flow)
 }
 EXPORT_SYMBOL_GPL(flow_offload_free);
 
-void flow_offload_dead(struct flow_offload *flow)
-{
-	flow->flags |= FLOW_OFFLOAD_DYING;
-}
-EXPORT_SYMBOL_GPL(flow_offload_dead);
-
 static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
 {
 	const struct flow_offload_tuple *tuple = data;
-- 
cgit v1.2.3-59-g8ed1b


From 59c466dd68e796f3a7a0709d90c72ce2d84e29c2 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Mon, 26 Feb 2018 10:15:21 +0100
Subject: netfilter: nf_flow_table: add a new flow state for tearing down
 offloading

On cleanup, this will be treated differently from FLOW_OFFLOAD_DYING:

If FLOW_OFFLOAD_DYING is set, the connection is going away, so both the
offload state and the connection tracking entry will be deleted.

If FLOW_OFFLOAD_TEARDOWN is set, the connection remains alive, but
the offload state is torn down. This is useful for cases that require
more complex state tracking / timeout handling on TCP, or if the
connection has been idle for too long.

Support for sending flows back to the slow path will be implemented in
a following patch

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h |  2 ++
 net/netfilter/nf_flow_table_core.c    | 22 ++++++++++++++--------
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index 5aa49524ebef..ba9fa4592f2b 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -68,6 +68,7 @@ struct flow_offload_tuple_rhash {
 #define FLOW_OFFLOAD_SNAT	0x1
 #define FLOW_OFFLOAD_DNAT	0x2
 #define FLOW_OFFLOAD_DYING	0x4
+#define FLOW_OFFLOAD_TEARDOWN	0x8
 
 struct flow_offload {
 	struct flow_offload_tuple_rhash		tuplehash[FLOW_OFFLOAD_DIR_MAX];
@@ -103,6 +104,7 @@ void nf_flow_table_cleanup(struct net *net, struct net_device *dev);
 int nf_flow_table_init(struct nf_flowtable *flow_table);
 void nf_flow_table_free(struct nf_flowtable *flow_table);
 
+void flow_offload_teardown(struct flow_offload *flow);
 static inline void flow_offload_dead(struct flow_offload *flow)
 {
 	flow->flags |= FLOW_OFFLOAD_DYING;
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 0d38f20fd226..5a81e4f771e9 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -174,6 +174,12 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
 	flow_offload_free(flow);
 }
 
+void flow_offload_teardown(struct flow_offload *flow)
+{
+	flow->flags |= FLOW_OFFLOAD_TEARDOWN;
+}
+EXPORT_SYMBOL_GPL(flow_offload_teardown);
+
 struct flow_offload_tuple_rhash *
 flow_offload_lookup(struct nf_flowtable *flow_table,
 		    struct flow_offload_tuple *tuple)
@@ -226,11 +232,6 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow)
 	return (__s32)(flow->timeout - (u32)jiffies) <= 0;
 }
 
-static inline bool nf_flow_is_dying(const struct flow_offload *flow)
-{
-	return flow->flags & FLOW_OFFLOAD_DYING;
-}
-
 static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table)
 {
 	struct flow_offload_tuple_rhash *tuplehash;
@@ -258,7 +259,8 @@ static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table)
 		flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
 
 		if (nf_flow_has_expired(flow) ||
-		    nf_flow_is_dying(flow))
+		    (flow->flags & (FLOW_OFFLOAD_DYING |
+				    FLOW_OFFLOAD_TEARDOWN)))
 			flow_offload_del(flow_table, flow);
 	}
 out:
@@ -419,10 +421,14 @@ static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
 {
 	struct net_device *dev = data;
 
-	if (dev && flow->tuplehash[0].tuple.iifidx != dev->ifindex)
+	if (!dev) {
+		flow_offload_teardown(flow);
 		return;
+	}
 
-	flow_offload_dead(flow);
+	if (flow->tuplehash[0].tuple.iifidx == dev->ifindex ||
+	    flow->tuplehash[1].tuple.iifidx == dev->ifindex)
+		flow_offload_dead(flow);
 }
 
 static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable,
-- 
cgit v1.2.3-59-g8ed1b


From ba03137f4c0c96d1f1f498632f868a2b743b155a Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Mon, 26 Feb 2018 10:15:22 +0100
Subject: netfilter: nf_flow_table: in flow_offload_lookup, skip entries being
 deleted

Preparation for sending flows back to the slow path

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_flow_table_core.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 5a81e4f771e9..ff5e17a15963 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -184,8 +184,21 @@ struct flow_offload_tuple_rhash *
 flow_offload_lookup(struct nf_flowtable *flow_table,
 		    struct flow_offload_tuple *tuple)
 {
-	return rhashtable_lookup_fast(&flow_table->rhashtable, tuple,
-				      nf_flow_offload_rhash_params);
+	struct flow_offload_tuple_rhash *tuplehash;
+	struct flow_offload *flow;
+	int dir;
+
+	tuplehash = rhashtable_lookup_fast(&flow_table->rhashtable, tuple,
+					   nf_flow_offload_rhash_params);
+	if (!tuplehash)
+		return NULL;
+
+	dir = tuplehash->tuple.dir;
+	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+	if (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))
+		return NULL;
+
+	return tuplehash;
 }
 EXPORT_SYMBOL_GPL(flow_offload_lookup);
 
-- 
cgit v1.2.3-59-g8ed1b


From da5984e51063a2c8723a36c29f090e03b3ff2a35 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Mon, 26 Feb 2018 10:15:23 +0100
Subject: netfilter: nf_flow_table: add support for sending flows back to the
 slow path

Since conntrack hasn't seen any packets from the offloaded flow in a
while, and the timeout for offloaded flows is set to an extremely long
value, we need to fix up the state before we can send a flow back to the
slow path.

For TCP, reset td_maxwin in both directions, which makes it resync its
state on the next packets.

Use the regular timeout for TCP and UDP established connections.

This allows the slow path to take over again once the offload state has
been torn down

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_flow_table_core.c | 50 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index ff5e17a15963..0699981a8511 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -100,6 +100,43 @@ err_ct_refcnt:
 }
 EXPORT_SYMBOL_GPL(flow_offload_alloc);
 
+static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
+{
+	tcp->state = TCP_CONNTRACK_ESTABLISHED;
+	tcp->seen[0].td_maxwin = 0;
+	tcp->seen[1].td_maxwin = 0;
+}
+
+static void flow_offload_fixup_ct_state(struct nf_conn *ct)
+{
+	const struct nf_conntrack_l4proto *l4proto;
+	struct net *net = nf_ct_net(ct);
+	unsigned int *timeouts;
+	unsigned int timeout;
+	int l4num;
+
+	l4num = nf_ct_protonum(ct);
+	if (l4num == IPPROTO_TCP)
+		flow_offload_fixup_tcp(&ct->proto.tcp);
+
+	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), l4num);
+	if (!l4proto)
+		return;
+
+	timeouts = l4proto->get_timeouts(net);
+	if (!timeouts)
+		return;
+
+	if (l4num == IPPROTO_TCP)
+		timeout = timeouts[TCP_CONNTRACK_ESTABLISHED];
+	else if (l4num == IPPROTO_UDP)
+		timeout = timeouts[UDP_CT_REPLIED];
+	else
+		return;
+
+	ct->timeout = nfct_time_stamp + timeout;
+}
+
 void flow_offload_free(struct flow_offload *flow)
 {
 	struct flow_offload_entry *e;
@@ -107,7 +144,8 @@ void flow_offload_free(struct flow_offload *flow)
 	dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
 	dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
 	e = container_of(flow, struct flow_offload_entry, flow);
-	nf_ct_delete(e->ct, 0, 0);
+	if (flow->flags & FLOW_OFFLOAD_DYING)
+		nf_ct_delete(e->ct, 0, 0);
 	nf_ct_put(e->ct);
 	kfree_rcu(e, rcu_head);
 }
@@ -164,6 +202,8 @@ EXPORT_SYMBOL_GPL(flow_offload_add);
 static void flow_offload_del(struct nf_flowtable *flow_table,
 			     struct flow_offload *flow)
 {
+	struct flow_offload_entry *e;
+
 	rhashtable_remove_fast(&flow_table->rhashtable,
 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
 			       nf_flow_offload_rhash_params);
@@ -171,12 +211,20 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
 			       nf_flow_offload_rhash_params);
 
+	e = container_of(flow, struct flow_offload_entry, flow);
+	clear_bit(IPS_OFFLOAD_BIT, &e->ct->status);
+
 	flow_offload_free(flow);
 }
 
 void flow_offload_teardown(struct flow_offload *flow)
 {
+	struct flow_offload_entry *e;
+
 	flow->flags |= FLOW_OFFLOAD_TEARDOWN;
+
+	e = container_of(flow, struct flow_offload_entry, flow);
+	flow_offload_fixup_ct_state(e->ct);
 }
 EXPORT_SYMBOL_GPL(flow_offload_teardown);
 
-- 
cgit v1.2.3-59-g8ed1b


From b6f27d322a0a7b588fe6369fa5fa87bd7c553bd3 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Mon, 26 Feb 2018 10:15:24 +0100
Subject: netfilter: nf_flow_table: tear down TCP flows if RST or FIN was seen

Allow the slow path to handle the shutdown of the connection with proper
timeouts. The packet containing RST/FIN is also sent to the slow path
and the TCP conntrack module will update its state.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_flow_table_ip.c | 30 +++++++++++++++++++++++++++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index dc570fb7641d..692c75ef5cb7 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -15,6 +15,23 @@
 #include <linux/tcp.h>
 #include <linux/udp.h>
 
+static int nf_flow_tcp_state_check(struct flow_offload *flow,
+				   struct sk_buff *skb, unsigned int thoff)
+{
+	struct tcphdr *tcph;
+
+	if (!pskb_may_pull(skb, thoff + sizeof(*tcph)))
+		return -1;
+
+	tcph = (void *)(skb_network_header(skb) + thoff);
+	if (unlikely(tcph->fin || tcph->rst)) {
+		flow_offload_teardown(flow);
+		return -1;
+	}
+
+	return 0;
+}
+
 static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
 			      __be32 addr, __be32 new_addr)
 {
@@ -119,10 +136,9 @@ static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb,
 }
 
 static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
-			  enum flow_offload_tuple_dir dir)
+			  unsigned int thoff, enum flow_offload_tuple_dir dir)
 {
 	struct iphdr *iph = ip_hdr(skb);
-	unsigned int thoff = iph->ihl * 4;
 
 	if (flow->flags & FLOW_OFFLOAD_SNAT &&
 	    (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
@@ -202,6 +218,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 	struct flow_offload *flow;
 	struct net_device *outdev;
 	const struct rtable *rt;
+	unsigned int thoff;
 	struct iphdr *iph;
 	__be32 nexthop;
 
@@ -230,8 +247,12 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 	if (skb_try_make_writable(skb, sizeof(*iph)))
 		return NF_DROP;
 
+	thoff = ip_hdr(skb)->ihl * 4;
+	if (nf_flow_tcp_state_check(flow, skb, thoff))
+		return NF_ACCEPT;
+
 	if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
-	    nf_flow_nat_ip(flow, skb, dir) < 0)
+	    nf_flow_nat_ip(flow, skb, thoff, dir) < 0)
 		return NF_DROP;
 
 	flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
@@ -439,6 +460,9 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 	if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
 		return NF_ACCEPT;
 
+	if (nf_flow_tcp_state_check(flow, skb, sizeof(*ip6h)))
+		return NF_ACCEPT;
+
 	if (skb_try_make_writable(skb, sizeof(*ip6h)))
 		return NF_DROP;
 
-- 
cgit v1.2.3-59-g8ed1b


From 33894c367dbd5bd3ec5b26c643ca022798226dd9 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 23 Mar 2018 19:15:37 +0100
Subject: netfilter: nf_flow_table: add missing condition for TCP state check

Avoid looking at unrelated fields in UDP packets

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_flow_table_ip.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 692c75ef5cb7..82451b7e0acb 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -15,11 +15,14 @@
 #include <linux/tcp.h>
 #include <linux/udp.h>
 
-static int nf_flow_tcp_state_check(struct flow_offload *flow,
-				   struct sk_buff *skb, unsigned int thoff)
+static int nf_flow_state_check(struct flow_offload *flow, int proto,
+			       struct sk_buff *skb, unsigned int thoff)
 {
 	struct tcphdr *tcph;
 
+	if (proto != IPPROTO_TCP)
+		return 0;
+
 	if (!pskb_may_pull(skb, thoff + sizeof(*tcph)))
 		return -1;
 
@@ -248,7 +251,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 		return NF_DROP;
 
 	thoff = ip_hdr(skb)->ihl * 4;
-	if (nf_flow_tcp_state_check(flow, skb, thoff))
+	if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff))
 		return NF_ACCEPT;
 
 	if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
@@ -460,7 +463,8 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 	if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
 		return NF_ACCEPT;
 
-	if (nf_flow_tcp_state_check(flow, skb, sizeof(*ip6h)))
+	if (nf_flow_state_check(flow, ipv6_hdr(skb)->nexthdr, skb,
+				sizeof(*ip6h)))
 		return NF_ACCEPT;
 
 	if (skb_try_make_writable(skb, sizeof(*ip6h)))
-- 
cgit v1.2.3-59-g8ed1b


From df1e202531235f82a98e5b6c377a93a1e7d45673 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 23 Mar 2018 19:15:38 +0100
Subject: netfilter: nf_flow_table: fix offloading connections with SNAT+DNAT

Pass all NAT types to the flow offload struct, otherwise parts of the
address/port pair do not get translated properly, causing connection
stalls

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_flow_table_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 0699981a8511..eb0d1658ac05 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -84,7 +84,7 @@ flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
 
 	if (ct->status & IPS_SRC_NAT)
 		flow->flags |= FLOW_OFFLOAD_SNAT;
-	else if (ct->status & IPS_DST_NAT)
+	if (ct->status & IPS_DST_NAT)
 		flow->flags |= FLOW_OFFLOAD_DNAT;
 
 	return flow;
-- 
cgit v1.2.3-59-g8ed1b


From cac20fcdf146b82d02e412d7a345f5826279cd82 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 28 Mar 2018 12:06:51 +0200
Subject: netfilter: nf_tables: simplify lookup functions

Replace the nf_tables_ prefix by nft_ and merge code into single lookup
function whenever possible. In many cases we go over the 80-chars
boundary function names, this save us ~50 LoC.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  12 +-
 net/netfilter/nf_tables_api.c     | 249 +++++++++++++++-----------------------
 net/netfilter/nft_flow_offload.c  |   5 +-
 net/netfilter/nft_objref.c        |   4 +-
 4 files changed, 110 insertions(+), 160 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 2f2062ae1c45..123e82a2f8bb 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1015,9 +1015,9 @@ static inline void *nft_obj_data(const struct nft_object *obj)
 
 #define nft_expr_obj(expr)	*((struct nft_object **)nft_expr_priv(expr))
 
-struct nft_object *nf_tables_obj_lookup(const struct nft_table *table,
-					const struct nlattr *nla, u32 objtype,
-					u8 genmask);
+struct nft_object *nft_obj_lookup(const struct nft_table *table,
+				  const struct nlattr *nla, u32 objtype,
+				  u8 genmask);
 
 void nft_obj_notify(struct net *net, struct nft_table *table,
 		    struct nft_object *obj, u32 portid, u32 seq,
@@ -1106,9 +1106,9 @@ struct nft_flowtable {
 	struct nf_flowtable		data;
 };
 
-struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table,
-						 const struct nlattr *nla,
-						 u8 genmask);
+struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table,
+					   const struct nlattr *nla,
+					   u8 genmask);
 
 void nft_register_flowtable_type(struct nf_flowtable_type *type);
 void nft_unregister_flowtable_type(struct nf_flowtable_type *type);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 16b67f54b3d2..f65e650b61aa 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -386,13 +386,17 @@ static struct nft_table *nft_table_lookup(const struct net *net,
 {
 	struct nft_table *table;
 
+	if (nla == NULL)
+		return ERR_PTR(-EINVAL);
+
 	list_for_each_entry(table, &net->nft.tables, list) {
 		if (!nla_strcmp(nla, table->name) &&
 		    table->family == family &&
 		    nft_active_genmask(table, genmask))
 			return table;
 	}
-	return NULL;
+
+	return ERR_PTR(-ENOENT);
 }
 
 static struct nft_table *nft_table_lookup_byhandle(const struct net *net,
@@ -406,37 +410,6 @@ static struct nft_table *nft_table_lookup_byhandle(const struct net *net,
 		    nft_active_genmask(table, genmask))
 			return table;
 	}
-	return NULL;
-}
-
-static struct nft_table *nf_tables_table_lookup(const struct net *net,
-						const struct nlattr *nla,
-						u8 family, u8 genmask)
-{
-	struct nft_table *table;
-
-	if (nla == NULL)
-		return ERR_PTR(-EINVAL);
-
-	table = nft_table_lookup(net, nla, family, genmask);
-	if (table != NULL)
-		return table;
-
-	return ERR_PTR(-ENOENT);
-}
-
-static struct nft_table *nf_tables_table_lookup_byhandle(const struct net *net,
-							 const struct nlattr *nla,
-							 u8 genmask)
-{
-	struct nft_table *table;
-
-	if (nla == NULL)
-		return ERR_PTR(-EINVAL);
-
-	table = nft_table_lookup_byhandle(net, nla, genmask);
-	if (table != NULL)
-		return table;
 
 	return ERR_PTR(-ENOENT);
 }
@@ -608,8 +581,7 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk,
 		return netlink_dump_start(nlsk, skb, nlh, &c);
 	}
 
-	table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], family,
-				       genmask);
+	table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -735,7 +707,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
 	int err;
 
 	name = nla[NFTA_TABLE_NAME];
-	table = nf_tables_table_lookup(net, name, family, genmask);
+	table = nft_table_lookup(net, name, family, genmask);
 	if (IS_ERR(table)) {
 		if (PTR_ERR(table) != -ENOENT)
 			return PTR_ERR(table);
@@ -893,12 +865,11 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk,
 		return nft_flush(&ctx, family);
 
 	if (nla[NFTA_TABLE_HANDLE])
-		table = nf_tables_table_lookup_byhandle(net,
-							nla[NFTA_TABLE_HANDLE],
-							genmask);
+		table = nft_table_lookup_byhandle(net, nla[NFTA_TABLE_HANDLE],
+						  genmask);
 	else
-		table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME],
-					       family, genmask);
+		table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family,
+					 genmask);
 
 	if (IS_ERR(table))
 		return PTR_ERR(table);
@@ -949,8 +920,7 @@ EXPORT_SYMBOL_GPL(nft_unregister_chain_type);
  */
 
 static struct nft_chain *
-nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle,
-				u8 genmask)
+nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask)
 {
 	struct nft_chain *chain;
 
@@ -963,9 +933,8 @@ nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle,
 	return ERR_PTR(-ENOENT);
 }
 
-static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table,
-						const struct nlattr *nla,
-						u8 genmask)
+static struct nft_chain *nft_chain_lookup(const struct nft_table *table,
+					  const struct nlattr *nla, u8 genmask)
 {
 	struct nft_chain *chain;
 
@@ -1194,12 +1163,11 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
 		return netlink_dump_start(nlsk, skb, nlh, &c);
 	}
 
-	table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family,
-				       genmask);
+	table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-	chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
+	chain = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
 	if (IS_ERR(chain))
 		return PTR_ERR(chain);
 
@@ -1513,8 +1481,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 	    nla[NFTA_CHAIN_NAME]) {
 		struct nft_chain *chain2;
 
-		chain2 = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME],
-						genmask);
+		chain2 = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
 		if (!IS_ERR(chain2))
 			return -EEXIST;
 	}
@@ -1576,8 +1543,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 
 	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
 
-	table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family,
-				       genmask);
+	table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -1586,11 +1552,11 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 
 	if (nla[NFTA_CHAIN_HANDLE]) {
 		handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE]));
-		chain = nf_tables_chain_lookup_byhandle(table, handle, genmask);
+		chain = nft_chain_lookup_byhandle(table, handle, genmask);
 		if (IS_ERR(chain))
 			return PTR_ERR(chain);
 	} else {
-		chain = nf_tables_chain_lookup(table, name, genmask);
+		chain = nft_chain_lookup(table, name, genmask);
 		if (IS_ERR(chain)) {
 			if (PTR_ERR(chain) != -ENOENT)
 				return PTR_ERR(chain);
@@ -1647,16 +1613,15 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
 	u32 use;
 	int err;
 
-	table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family,
-				       genmask);
+	table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
 	if (nla[NFTA_CHAIN_HANDLE]) {
 		handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE]));
-		chain = nf_tables_chain_lookup_byhandle(table, handle, genmask);
+		chain = nft_chain_lookup_byhandle(table, handle, genmask);
 	} else {
-		chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
+		chain = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
 	}
 	if (IS_ERR(chain))
 		return PTR_ERR(chain);
@@ -1939,8 +1904,8 @@ void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr)
  * Rules
  */
 
-static struct nft_rule *__nf_tables_rule_lookup(const struct nft_chain *chain,
-						u64 handle)
+static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain,
+					  u64 handle)
 {
 	struct nft_rule *rule;
 
@@ -1953,13 +1918,13 @@ static struct nft_rule *__nf_tables_rule_lookup(const struct nft_chain *chain,
 	return ERR_PTR(-ENOENT);
 }
 
-static struct nft_rule *nf_tables_rule_lookup(const struct nft_chain *chain,
-					      const struct nlattr *nla)
+static struct nft_rule *nft_rule_lookup(const struct nft_chain *chain,
+					const struct nlattr *nla)
 {
 	if (nla == NULL)
 		return ERR_PTR(-EINVAL);
 
-	return __nf_tables_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla)));
+	return __nft_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla)));
 }
 
 static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
@@ -2191,16 +2156,15 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
 		return netlink_dump_start(nlsk, skb, nlh, &c);
 	}
 
-	table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family,
-				       genmask);
+	table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-	chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
+	chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
 	if (IS_ERR(chain))
 		return PTR_ERR(chain);
 
-	rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
+	rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
 	if (IS_ERR(rule))
 		return PTR_ERR(rule);
 
@@ -2265,18 +2229,17 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 
 	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
 
-	table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family,
-				       genmask);
+	table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-	chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
+	chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
 	if (IS_ERR(chain))
 		return PTR_ERR(chain);
 
 	if (nla[NFTA_RULE_HANDLE]) {
 		handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE]));
-		rule = __nf_tables_rule_lookup(chain, handle);
+		rule = __nft_rule_lookup(chain, handle);
 		if (IS_ERR(rule))
 			return PTR_ERR(rule);
 
@@ -2300,7 +2263,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 			return -EOPNOTSUPP;
 
 		pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION]));
-		old_rule = __nf_tables_rule_lookup(chain, pos_handle);
+		old_rule = __nft_rule_lookup(chain, pos_handle);
 		if (IS_ERR(old_rule))
 			return PTR_ERR(old_rule);
 	}
@@ -2435,14 +2398,12 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
 	int family = nfmsg->nfgen_family, err = 0;
 	struct nft_ctx ctx;
 
-	table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family,
-				       genmask);
+	table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
 	if (nla[NFTA_RULE_CHAIN]) {
-		chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN],
-					       genmask);
+		chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
 		if (IS_ERR(chain))
 			return PTR_ERR(chain);
 	}
@@ -2451,8 +2412,7 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
 
 	if (chain) {
 		if (nla[NFTA_RULE_HANDLE]) {
-			rule = nf_tables_rule_lookup(chain,
-						     nla[NFTA_RULE_HANDLE]);
+			rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
 			if (IS_ERR(rule))
 				return PTR_ERR(rule);
 
@@ -2635,8 +2595,8 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net,
 	struct nft_table *table = NULL;
 
 	if (nla[NFTA_SET_TABLE] != NULL) {
-		table = nf_tables_table_lookup(net, nla[NFTA_SET_TABLE],
-					       family, genmask);
+		table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family,
+					 genmask);
 		if (IS_ERR(table))
 			return PTR_ERR(table);
 	}
@@ -2645,8 +2605,8 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net,
 	return 0;
 }
 
-static struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
-					    const struct nlattr *nla, u8 genmask)
+static struct nft_set *nft_set_lookup(const struct nft_table *table,
+				      const struct nlattr *nla, u8 genmask)
 {
 	struct nft_set *set;
 
@@ -2661,14 +2621,12 @@ static struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
 	return ERR_PTR(-ENOENT);
 }
 
-static struct nft_set *nf_tables_set_lookup_byhandle(const struct nft_table *table,
-						     const struct nlattr *nla, u8 genmask)
+static struct nft_set *nft_set_lookup_byhandle(const struct nft_table *table,
+					       const struct nlattr *nla,
+					       u8 genmask)
 {
 	struct nft_set *set;
 
-	if (nla == NULL)
-		return ERR_PTR(-EINVAL);
-
 	list_for_each_entry(set, &table->sets, list) {
 		if (be64_to_cpu(nla_get_be64(nla)) == set->handle &&
 		    nft_active_genmask(set, genmask))
@@ -2677,9 +2635,8 @@ static struct nft_set *nf_tables_set_lookup_byhandle(const struct nft_table *tab
 	return ERR_PTR(-ENOENT);
 }
 
-static struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
-						 const struct nlattr *nla,
-						 u8 genmask)
+static struct nft_set *nft_set_lookup_byid(const struct net *net,
+					   const struct nlattr *nla, u8 genmask)
 {
 	struct nft_trans *trans;
 	u32 id = ntohl(nla_get_be32(nla));
@@ -2703,12 +2660,12 @@ struct nft_set *nft_set_lookup_global(const struct net *net,
 {
 	struct nft_set *set;
 
-	set = nf_tables_set_lookup(table, nla_set_name, genmask);
+	set = nft_set_lookup(table, nla_set_name, genmask);
 	if (IS_ERR(set)) {
 		if (!nla_set_id)
 			return set;
 
-		set = nf_tables_set_lookup_byid(net, nla_set_id, genmask);
+		set = nft_set_lookup_byid(net, nla_set_id, genmask);
 	}
 	return set;
 }
@@ -2980,7 +2937,7 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
 	if (!nla[NFTA_SET_TABLE])
 		return -EINVAL;
 
-	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask);
+	set = nft_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask);
 	if (IS_ERR(set))
 		return PTR_ERR(set);
 
@@ -3132,14 +3089,13 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 
 	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
 
-	table = nf_tables_table_lookup(net, nla[NFTA_SET_TABLE], family,
-				       genmask);
+	table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
 	nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
 
-	set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME], genmask);
+	set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask);
 	if (IS_ERR(set)) {
 		if (PTR_ERR(set) != -ENOENT)
 			return PTR_ERR(set);
@@ -3262,9 +3218,10 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk,
 		return err;
 
 	if (nla[NFTA_SET_HANDLE])
-		set = nf_tables_set_lookup_byhandle(ctx.table, nla[NFTA_SET_HANDLE], genmask);
+		set = nft_set_lookup_byhandle(ctx.table, nla[NFTA_SET_HANDLE],
+					      genmask);
 	else
-		set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask);
+		set = nft_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask);
 	if (IS_ERR(set))
 		return PTR_ERR(set);
 
@@ -3404,8 +3361,8 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net,
 	int family = nfmsg->nfgen_family;
 	struct nft_table *table;
 
-	table = nf_tables_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE],
-				       family, genmask);
+	table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
+				 genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -3741,8 +3698,7 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
 	if (err < 0)
 		return err;
 
-	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET],
-				   genmask);
+	set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
 	if (IS_ERR(set))
 		return PTR_ERR(set);
 
@@ -3954,8 +3910,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 			err = -EINVAL;
 			goto err2;
 		}
-		obj = nf_tables_obj_lookup(ctx->table, nla[NFTA_SET_ELEM_OBJREF],
-					   set->objtype, genmask);
+		obj = nft_obj_lookup(ctx->table, nla[NFTA_SET_ELEM_OBJREF],
+				     set->objtype, genmask);
 		if (IS_ERR(obj)) {
 			err = PTR_ERR(obj);
 			goto err2;
@@ -4284,8 +4240,7 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
 	if (err < 0)
 		return err;
 
-	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET],
-				   genmask);
+	set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
 	if (IS_ERR(set))
 		return PTR_ERR(set);
 	if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
@@ -4373,9 +4328,9 @@ void nft_unregister_obj(struct nft_object_type *obj_type)
 }
 EXPORT_SYMBOL_GPL(nft_unregister_obj);
 
-struct nft_object *nf_tables_obj_lookup(const struct nft_table *table,
-					const struct nlattr *nla,
-					u32 objtype, u8 genmask)
+struct nft_object *nft_obj_lookup(const struct nft_table *table,
+				  const struct nlattr *nla, u32 objtype,
+				  u8 genmask)
 {
 	struct nft_object *obj;
 
@@ -4387,11 +4342,11 @@ struct nft_object *nf_tables_obj_lookup(const struct nft_table *table,
 	}
 	return ERR_PTR(-ENOENT);
 }
-EXPORT_SYMBOL_GPL(nf_tables_obj_lookup);
+EXPORT_SYMBOL_GPL(nft_obj_lookup);
 
-static struct nft_object *nf_tables_obj_lookup_byhandle(const struct nft_table *table,
-							const struct nlattr *nla,
-							u32 objtype, u8 genmask)
+static struct nft_object *nft_obj_lookup_byhandle(const struct nft_table *table,
+						  const struct nlattr *nla,
+						  u32 objtype, u8 genmask)
 {
 	struct nft_object *obj;
 
@@ -4535,13 +4490,12 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
 	    !nla[NFTA_OBJ_DATA])
 		return -EINVAL;
 
-	table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family,
-				       genmask);
+	table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
 	objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
-	obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
+	obj = nft_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
 	if (IS_ERR(obj)) {
 		err = PTR_ERR(obj);
 		if (err != -ENOENT)
@@ -4761,13 +4715,12 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
 	    !nla[NFTA_OBJ_TYPE])
 		return -EINVAL;
 
-	table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family,
-				       genmask);
+	table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
 	objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
-	obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
+	obj = nft_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
 	if (IS_ERR(obj))
 		return PTR_ERR(obj);
 
@@ -4817,18 +4770,17 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
 	    (!nla[NFTA_OBJ_NAME] && !nla[NFTA_OBJ_HANDLE]))
 		return -EINVAL;
 
-	table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family,
-				       genmask);
+	table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
 	objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
 	if (nla[NFTA_OBJ_HANDLE])
-		obj = nf_tables_obj_lookup_byhandle(table, nla[NFTA_OBJ_HANDLE],
-						    objtype, genmask);
+		obj = nft_obj_lookup_byhandle(table, nla[NFTA_OBJ_HANDLE],
+					      objtype, genmask);
 	else
-		obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME],
-					   objtype, genmask);
+		obj = nft_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype,
+				     genmask);
 	if (IS_ERR(obj))
 		return PTR_ERR(obj);
 	if (obj->use > 0)
@@ -4903,9 +4855,8 @@ static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = {
 	[NFTA_FLOWTABLE_HANDLE]		= { .type = NLA_U64 },
 };
 
-struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table,
-						 const struct nlattr *nla,
-						 u8 genmask)
+struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table,
+					   const struct nlattr *nla, u8 genmask)
 {
 	struct nft_flowtable *flowtable;
 
@@ -4916,11 +4867,11 @@ struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table,
 	}
 	return ERR_PTR(-ENOENT);
 }
-EXPORT_SYMBOL_GPL(nf_tables_flowtable_lookup);
+EXPORT_SYMBOL_GPL(nft_flowtable_lookup);
 
 static struct nft_flowtable *
-nf_tables_flowtable_lookup_byhandle(const struct nft_table *table,
-				    const struct nlattr *nla, u8 genmask)
+nft_flowtable_lookup_byhandle(const struct nft_table *table,
+			      const struct nlattr *nla, u8 genmask)
 {
        struct nft_flowtable *flowtable;
 
@@ -5093,13 +5044,13 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
 	    !nla[NFTA_FLOWTABLE_HOOK])
 		return -EINVAL;
 
-	table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE],
-				       family, genmask);
+	table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
+				 genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-	flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
-					       genmask);
+	flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
+					 genmask);
 	if (IS_ERR(flowtable)) {
 		err = PTR_ERR(flowtable);
 		if (err != -ENOENT)
@@ -5210,19 +5161,19 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
 	     !nla[NFTA_FLOWTABLE_HANDLE]))
 		return -EINVAL;
 
-	table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE],
-				       family, genmask);
+	table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
+				 genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
 	if (nla[NFTA_FLOWTABLE_HANDLE])
-		flowtable = nf_tables_flowtable_lookup_byhandle(table,
-								nla[NFTA_FLOWTABLE_HANDLE],
-								genmask);
+		flowtable = nft_flowtable_lookup_byhandle(table,
+							  nla[NFTA_FLOWTABLE_HANDLE],
+							  genmask);
 	else
-		flowtable = nf_tables_flowtable_lookup(table,
-						       nla[NFTA_FLOWTABLE_NAME],
-						       genmask);
+		flowtable = nft_flowtable_lookup(table,
+						 nla[NFTA_FLOWTABLE_NAME],
+						 genmask);
 	if (IS_ERR(flowtable))
                 return PTR_ERR(flowtable);
 	if (flowtable->use > 0)
@@ -5407,13 +5358,13 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
 	if (!nla[NFTA_FLOWTABLE_NAME])
 		return -EINVAL;
 
-	table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE],
-				       family, genmask);
+	table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
+				 genmask);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-	flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
-					       genmask);
+	flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
+					 genmask);
 	if (IS_ERR(flowtable))
 		return PTR_ERR(flowtable);
 
@@ -6382,8 +6333,8 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
 	case NFT_GOTO:
 		if (!tb[NFTA_VERDICT_CHAIN])
 			return -EINVAL;
-		chain = nf_tables_chain_lookup(ctx->table,
-					       tb[NFTA_VERDICT_CHAIN], genmask);
+		chain = nft_chain_lookup(ctx->table, tb[NFTA_VERDICT_CHAIN],
+					 genmask);
 		if (IS_ERR(chain))
 			return PTR_ERR(chain);
 		if (nft_is_base_chain(chain))
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index b65829b2be22..d6bab8c3cbb0 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -142,9 +142,8 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx,
 	if (!tb[NFTA_FLOW_TABLE_NAME])
 		return -EINVAL;
 
-	flowtable = nf_tables_flowtable_lookup(ctx->table,
-					       tb[NFTA_FLOW_TABLE_NAME],
-					       genmask);
+	flowtable = nft_flowtable_lookup(ctx->table, tb[NFTA_FLOW_TABLE_NAME],
+					 genmask);
 	if (IS_ERR(flowtable))
 		return PTR_ERR(flowtable);
 
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
index 0b02407773ad..cdf348f751ec 100644
--- a/net/netfilter/nft_objref.c
+++ b/net/netfilter/nft_objref.c
@@ -38,8 +38,8 @@ static int nft_objref_init(const struct nft_ctx *ctx,
 		return -EINVAL;
 
 	objtype = ntohl(nla_get_be32(tb[NFTA_OBJREF_IMM_TYPE]));
-	obj = nf_tables_obj_lookup(ctx->table, tb[NFTA_OBJREF_IMM_NAME], objtype,
-				   genmask);
+	obj = nft_obj_lookup(ctx->table, tb[NFTA_OBJREF_IMM_NAME], objtype,
+			     genmask);
 	if (IS_ERR(obj))
 		return -ENOENT;
 
-- 
cgit v1.2.3-59-g8ed1b


From 36dd1bcc07e52946dfdf42cf2ca701b919a3db27 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 28 Mar 2018 12:06:52 +0200
Subject: netfilter: nf_tables: initial support for extended ACK reporting

Keep it simple to start with, just report attribute offsets that can be
useful to userspace when representating errors to users.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 299 +++++++++++++++++++++++++++++-------------
 1 file changed, 206 insertions(+), 93 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index f65e650b61aa..2f14cadd9922 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -582,8 +582,10 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk,
 	}
 
 	table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask);
-	if (IS_ERR(table))
+	if (IS_ERR(table)) {
+		NL_SET_BAD_ATTR(extack, nla[NFTA_TABLE_NAME]);
 		return PTR_ERR(table);
+	}
 
 	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 	if (!skb2)
@@ -699,21 +701,23 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	u8 genmask = nft_genmask_next(net);
-	const struct nlattr *name;
-	struct nft_table *table;
 	int family = nfmsg->nfgen_family;
+	const struct nlattr *attr;
+	struct nft_table *table;
 	u32 flags = 0;
 	struct nft_ctx ctx;
 	int err;
 
-	name = nla[NFTA_TABLE_NAME];
-	table = nft_table_lookup(net, name, family, genmask);
+	attr = nla[NFTA_TABLE_NAME];
+	table = nft_table_lookup(net, attr, family, genmask);
 	if (IS_ERR(table)) {
 		if (PTR_ERR(table) != -ENOENT)
 			return PTR_ERR(table);
 	} else {
-		if (nlh->nlmsg_flags & NLM_F_EXCL)
+		if (nlh->nlmsg_flags & NLM_F_EXCL) {
+			NL_SET_BAD_ATTR(extack, attr);
 			return -EEXIST;
+		}
 		if (nlh->nlmsg_flags & NLM_F_REPLACE)
 			return -EOPNOTSUPP;
 
@@ -732,7 +736,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
 	if (table == NULL)
 		goto err_kzalloc;
 
-	table->name = nla_strdup(name, GFP_KERNEL);
+	table->name = nla_strdup(attr, GFP_KERNEL);
 	if (table->name == NULL)
 		goto err_strdup;
 
@@ -855,8 +859,9 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk,
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	u8 genmask = nft_genmask_next(net);
-	struct nft_table *table;
 	int family = nfmsg->nfgen_family;
+	const struct nlattr *attr;
+	struct nft_table *table;
 	struct nft_ctx ctx;
 
 	nft_ctx_init(&ctx, net, skb, nlh, 0, NULL, NULL, nla);
@@ -864,15 +869,18 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk,
 	    (!nla[NFTA_TABLE_NAME] && !nla[NFTA_TABLE_HANDLE]))
 		return nft_flush(&ctx, family);
 
-	if (nla[NFTA_TABLE_HANDLE])
-		table = nft_table_lookup_byhandle(net, nla[NFTA_TABLE_HANDLE],
-						  genmask);
-	else
-		table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family,
-					 genmask);
+	if (nla[NFTA_TABLE_HANDLE]) {
+		attr = nla[NFTA_TABLE_HANDLE];
+		table = nft_table_lookup_byhandle(net, attr, genmask);
+	} else {
+		attr = nla[NFTA_TABLE_NAME];
+		table = nft_table_lookup(net, attr, family, genmask);
+	}
 
-	if (IS_ERR(table))
+	if (IS_ERR(table)) {
+		NL_SET_BAD_ATTR(extack, attr);
 		return PTR_ERR(table);
+	}
 
 	if (nlh->nlmsg_flags & NLM_F_NONREC &&
 	    table->use > 0)
@@ -1164,12 +1172,16 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
 	}
 
 	table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
-	if (IS_ERR(table))
+	if (IS_ERR(table)) {
+		NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
 		return PTR_ERR(table);
+	}
 
 	chain = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
-	if (IS_ERR(chain))
+	if (IS_ERR(chain)) {
+		NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
 		return PTR_ERR(chain);
+	}
 
 	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 	if (!skb2)
@@ -1531,9 +1543,9 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 			      struct netlink_ext_ack *extack)
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
-	const struct nlattr * uninitialized_var(name);
 	u8 genmask = nft_genmask_next(net);
 	int family = nfmsg->nfgen_family;
+	const struct nlattr *attr;
 	struct nft_table *table;
 	struct nft_chain *chain;
 	u8 policy = NF_ACCEPT;
@@ -1544,34 +1556,45 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
 
 	table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
-	if (IS_ERR(table))
+	if (IS_ERR(table)) {
+		NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
 		return PTR_ERR(table);
+	}
 
 	chain = NULL;
-	name = nla[NFTA_CHAIN_NAME];
+	attr = nla[NFTA_CHAIN_NAME];
 
 	if (nla[NFTA_CHAIN_HANDLE]) {
 		handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE]));
 		chain = nft_chain_lookup_byhandle(table, handle, genmask);
-		if (IS_ERR(chain))
+		if (IS_ERR(chain)) {
+			NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_HANDLE]);
 			return PTR_ERR(chain);
+		}
+		attr = nla[NFTA_CHAIN_HANDLE];
 	} else {
-		chain = nft_chain_lookup(table, name, genmask);
+		chain = nft_chain_lookup(table, attr, genmask);
 		if (IS_ERR(chain)) {
-			if (PTR_ERR(chain) != -ENOENT)
+			if (PTR_ERR(chain) != -ENOENT) {
+				NL_SET_BAD_ATTR(extack, attr);
 				return PTR_ERR(chain);
+			}
 			chain = NULL;
 		}
 	}
 
 	if (nla[NFTA_CHAIN_POLICY]) {
 		if (chain != NULL &&
-		    !nft_is_base_chain(chain))
+		    !nft_is_base_chain(chain)) {
+			NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_POLICY]);
 			return -EOPNOTSUPP;
+		}
 
 		if (chain == NULL &&
-		    nla[NFTA_CHAIN_HOOK] == NULL)
+		    nla[NFTA_CHAIN_HOOK] == NULL) {
+			NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_POLICY]);
 			return -EOPNOTSUPP;
+		}
 
 		policy = ntohl(nla_get_be32(nla[NFTA_CHAIN_POLICY]));
 		switch (policy) {
@@ -1586,8 +1609,10 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 	nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
 
 	if (chain != NULL) {
-		if (nlh->nlmsg_flags & NLM_F_EXCL)
+		if (nlh->nlmsg_flags & NLM_F_EXCL) {
+			NL_SET_BAD_ATTR(extack, attr);
 			return -EEXIST;
+		}
 		if (nlh->nlmsg_flags & NLM_F_REPLACE)
 			return -EOPNOTSUPP;
 
@@ -1604,27 +1629,34 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	u8 genmask = nft_genmask_next(net);
+	int family = nfmsg->nfgen_family;
+	const struct nlattr *attr;
 	struct nft_table *table;
 	struct nft_chain *chain;
 	struct nft_rule *rule;
-	int family = nfmsg->nfgen_family;
 	struct nft_ctx ctx;
 	u64 handle;
 	u32 use;
 	int err;
 
 	table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
-	if (IS_ERR(table))
+	if (IS_ERR(table)) {
+		NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
 		return PTR_ERR(table);
+	}
 
 	if (nla[NFTA_CHAIN_HANDLE]) {
-		handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE]));
+		attr = nla[NFTA_CHAIN_HANDLE];
+		handle = be64_to_cpu(nla_get_be64(attr));
 		chain = nft_chain_lookup_byhandle(table, handle, genmask);
 	} else {
-		chain = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
+		attr = nla[NFTA_CHAIN_NAME];
+		chain = nft_chain_lookup(table, attr, genmask);
 	}
-	if (IS_ERR(chain))
+	if (IS_ERR(chain)) {
+		NL_SET_BAD_ATTR(extack, attr);
 		return PTR_ERR(chain);
+	}
 
 	if (nlh->nlmsg_flags & NLM_F_NONREC &&
 	    chain->use > 0)
@@ -1646,8 +1678,10 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
 	/* There are rules and elements that are still holding references to us,
 	 * we cannot do a recursive removal in this case.
 	 */
-	if (use > 0)
+	if (use > 0) {
+		NL_SET_BAD_ATTR(extack, attr);
 		return -EBUSY;
+	}
 
 	return nft_delchain(&ctx);
 }
@@ -2157,16 +2191,22 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
 	}
 
 	table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
-	if (IS_ERR(table))
+	if (IS_ERR(table)) {
+		NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
 		return PTR_ERR(table);
+	}
 
 	chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
-	if (IS_ERR(chain))
+	if (IS_ERR(chain)) {
+		NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
 		return PTR_ERR(chain);
+	}
 
 	rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
-	if (IS_ERR(rule))
+	if (IS_ERR(rule)) {
+		NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
 		return PTR_ERR(rule);
+	}
 
 	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 	if (!skb2)
@@ -2230,21 +2270,29 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
 
 	table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
-	if (IS_ERR(table))
+	if (IS_ERR(table)) {
+		NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
 		return PTR_ERR(table);
+	}
 
 	chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
-	if (IS_ERR(chain))
+	if (IS_ERR(chain)) {
+		NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
 		return PTR_ERR(chain);
+	}
 
 	if (nla[NFTA_RULE_HANDLE]) {
 		handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE]));
 		rule = __nft_rule_lookup(chain, handle);
-		if (IS_ERR(rule))
+		if (IS_ERR(rule)) {
+			NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
 			return PTR_ERR(rule);
+		}
 
-		if (nlh->nlmsg_flags & NLM_F_EXCL)
+		if (nlh->nlmsg_flags & NLM_F_EXCL) {
+			NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
 			return -EEXIST;
+		}
 		if (nlh->nlmsg_flags & NLM_F_REPLACE)
 			old_rule = rule;
 		else
@@ -2264,8 +2312,10 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 
 		pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION]));
 		old_rule = __nft_rule_lookup(chain, pos_handle);
-		if (IS_ERR(old_rule))
+		if (IS_ERR(old_rule)) {
+			NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]);
 			return PTR_ERR(old_rule);
+		}
 	}
 
 	nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
@@ -2399,13 +2449,17 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
 	struct nft_ctx ctx;
 
 	table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
-	if (IS_ERR(table))
+	if (IS_ERR(table)) {
+		NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
 		return PTR_ERR(table);
+	}
 
 	if (nla[NFTA_RULE_CHAIN]) {
 		chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
-		if (IS_ERR(chain))
+		if (IS_ERR(chain)) {
+			NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
 			return PTR_ERR(chain);
+		}
 	}
 
 	nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
@@ -2413,14 +2467,18 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
 	if (chain) {
 		if (nla[NFTA_RULE_HANDLE]) {
 			rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
-			if (IS_ERR(rule))
+			if (IS_ERR(rule)) {
+				NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
 				return PTR_ERR(rule);
+			}
 
 			err = nft_delrule(&ctx, rule);
 		} else if (nla[NFTA_RULE_ID]) {
 			rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_ID]);
-			if (IS_ERR(rule))
+			if (IS_ERR(rule)) {
+				NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_ID]);
 				return PTR_ERR(rule);
+			}
 
 			err = nft_delrule(&ctx, rule);
 		} else {
@@ -2588,6 +2646,7 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net,
 				     const struct sk_buff *skb,
 				     const struct nlmsghdr *nlh,
 				     const struct nlattr * const nla[],
+				     struct netlink_ext_ack *extack,
 				     u8 genmask)
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
@@ -2597,8 +2656,10 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net,
 	if (nla[NFTA_SET_TABLE] != NULL) {
 		table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family,
 					 genmask);
-		if (IS_ERR(table))
+		if (IS_ERR(table)) {
+			NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
 			return PTR_ERR(table);
+		}
 	}
 
 	nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla);
@@ -2910,7 +2971,8 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
 	int err;
 
 	/* Verify existence before starting dump */
-	err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, genmask);
+	err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack,
+					genmask);
 	if (err < 0)
 		return err;
 
@@ -3090,20 +3152,27 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
 
 	table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask);
-	if (IS_ERR(table))
+	if (IS_ERR(table)) {
+		NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
 		return PTR_ERR(table);
+	}
 
 	nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
 
 	set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask);
 	if (IS_ERR(set)) {
-		if (PTR_ERR(set) != -ENOENT)
+		if (PTR_ERR(set) != -ENOENT) {
+			NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
 			return PTR_ERR(set);
+		}
 	} else {
-		if (nlh->nlmsg_flags & NLM_F_EXCL)
+		if (nlh->nlmsg_flags & NLM_F_EXCL) {
+			NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
 			return -EEXIST;
+		}
 		if (nlh->nlmsg_flags & NLM_F_REPLACE)
 			return -EOPNOTSUPP;
+
 		return 0;
 	}
 
@@ -3204,6 +3273,7 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk,
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	u8 genmask = nft_genmask_next(net);
+	const struct nlattr *attr;
 	struct nft_set *set;
 	struct nft_ctx ctx;
 	int err;
@@ -3213,21 +3283,28 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk,
 	if (nla[NFTA_SET_TABLE] == NULL)
 		return -EINVAL;
 
-	err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, genmask);
+	err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack,
+					genmask);
 	if (err < 0)
 		return err;
 
-	if (nla[NFTA_SET_HANDLE])
-		set = nft_set_lookup_byhandle(ctx.table, nla[NFTA_SET_HANDLE],
-					      genmask);
-	else
-		set = nft_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask);
-	if (IS_ERR(set))
-		return PTR_ERR(set);
+	if (nla[NFTA_SET_HANDLE]) {
+		attr = nla[NFTA_SET_HANDLE];
+		set = nft_set_lookup_byhandle(ctx.table, attr, genmask);
+	} else {
+		attr = nla[NFTA_SET_NAME];
+		set = nft_set_lookup(ctx.table, attr, genmask);
+	}
 
+	if (IS_ERR(set)) {
+		NL_SET_BAD_ATTR(extack, attr);
+		return PTR_ERR(set);
+	}
 	if (!list_empty(&set->bindings) ||
-	    (nlh->nlmsg_flags & NLM_F_NONREC && atomic_read(&set->nelems) > 0))
+	    (nlh->nlmsg_flags & NLM_F_NONREC && atomic_read(&set->nelems) > 0)) {
+		NL_SET_BAD_ATTR(extack, attr);
 		return -EBUSY;
+	}
 
 	return nft_delset(&ctx, set);
 }
@@ -3355,6 +3432,7 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net,
 				      const struct sk_buff *skb,
 				      const struct nlmsghdr *nlh,
 				      const struct nlattr * const nla[],
+				      struct netlink_ext_ack *extack,
 				      u8 genmask)
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
@@ -3363,8 +3441,10 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net,
 
 	table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
 				 genmask);
-	if (IS_ERR(table))
+	if (IS_ERR(table)) {
+		NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]);
 		return PTR_ERR(table);
+	}
 
 	nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla);
 	return 0;
@@ -3694,7 +3774,8 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
 	struct nft_ctx ctx;
 	int rem, err = 0;
 
-	err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask);
+	err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack,
+					 genmask);
 	if (err < 0)
 		return err;
 
@@ -4048,7 +4129,8 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
 	if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
 		return -EINVAL;
 
-	err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask);
+	err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack,
+					 genmask);
 	if (err < 0)
 		return err;
 
@@ -4236,7 +4318,8 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
 	struct nft_ctx ctx;
 	int rem, err = 0;
 
-	err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask);
+	err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack,
+					 genmask);
 	if (err < 0)
 		return err;
 
@@ -4491,20 +4574,24 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
 		return -EINVAL;
 
 	table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask);
-	if (IS_ERR(table))
+	if (IS_ERR(table)) {
+		NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
 		return PTR_ERR(table);
+	}
 
 	objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
 	obj = nft_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
 	if (IS_ERR(obj)) {
 		err = PTR_ERR(obj);
-		if (err != -ENOENT)
+		if (err != -ENOENT) {
+			NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
 			return err;
-
+		}
 	} else {
-		if (nlh->nlmsg_flags & NLM_F_EXCL)
+		if (nlh->nlmsg_flags & NLM_F_EXCL) {
+			NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
 			return -EEXIST;
-
+		}
 		return 0;
 	}
 
@@ -4716,13 +4803,17 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
 		return -EINVAL;
 
 	table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask);
-	if (IS_ERR(table))
+	if (IS_ERR(table)) {
+		NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
 		return PTR_ERR(table);
+	}
 
 	objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
 	obj = nft_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
-	if (IS_ERR(obj))
+	if (IS_ERR(obj)) {
+		NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
 		return PTR_ERR(obj);
+	}
 
 	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 	if (!skb2)
@@ -4761,6 +4852,7 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	u8 genmask = nft_genmask_next(net);
 	int family = nfmsg->nfgen_family;
+	const struct nlattr *attr;
 	struct nft_table *table;
 	struct nft_object *obj;
 	struct nft_ctx ctx;
@@ -4771,20 +4863,28 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
 		return -EINVAL;
 
 	table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask);
-	if (IS_ERR(table))
+	if (IS_ERR(table)) {
+		NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
 		return PTR_ERR(table);
+	}
 
 	objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
-	if (nla[NFTA_OBJ_HANDLE])
-		obj = nft_obj_lookup_byhandle(table, nla[NFTA_OBJ_HANDLE],
-					      objtype, genmask);
-	else
-		obj = nft_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype,
-				     genmask);
-	if (IS_ERR(obj))
+	if (nla[NFTA_OBJ_HANDLE]) {
+		attr = nla[NFTA_OBJ_HANDLE];
+		obj = nft_obj_lookup_byhandle(table, attr, objtype, genmask);
+	} else {
+		attr = nla[NFTA_OBJ_NAME];
+		obj = nft_obj_lookup(table, attr, objtype, genmask);
+	}
+
+	if (IS_ERR(obj)) {
+		NL_SET_BAD_ATTR(extack, attr);
 		return PTR_ERR(obj);
-	if (obj->use > 0)
+	}
+	if (obj->use > 0) {
+		NL_SET_BAD_ATTR(extack, attr);
 		return -EBUSY;
+	}
 
 	nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
 
@@ -5046,18 +5146,24 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
 
 	table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
 				 genmask);
-	if (IS_ERR(table))
+	if (IS_ERR(table)) {
+		NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]);
 		return PTR_ERR(table);
+	}
 
 	flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
 					 genmask);
 	if (IS_ERR(flowtable)) {
 		err = PTR_ERR(flowtable);
-		if (err != -ENOENT)
+		if (err != -ENOENT) {
+			NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
 			return err;
+		}
 	} else {
-		if (nlh->nlmsg_flags & NLM_F_EXCL)
+		if (nlh->nlmsg_flags & NLM_F_EXCL) {
+			NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
 			return -EEXIST;
+		}
 
 		return 0;
 	}
@@ -5153,6 +5259,7 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
 	u8 genmask = nft_genmask_next(net);
 	int family = nfmsg->nfgen_family;
 	struct nft_flowtable *flowtable;
+	const struct nlattr *attr;
 	struct nft_table *table;
 	struct nft_ctx ctx;
 
@@ -5163,21 +5270,27 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
 
 	table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
 				 genmask);
-	if (IS_ERR(table))
+	if (IS_ERR(table)) {
+		NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]);
 		return PTR_ERR(table);
+	}
 
-	if (nla[NFTA_FLOWTABLE_HANDLE])
-		flowtable = nft_flowtable_lookup_byhandle(table,
-							  nla[NFTA_FLOWTABLE_HANDLE],
-							  genmask);
-	else
-		flowtable = nft_flowtable_lookup(table,
-						 nla[NFTA_FLOWTABLE_NAME],
-						 genmask);
-	if (IS_ERR(flowtable))
-                return PTR_ERR(flowtable);
-	if (flowtable->use > 0)
+	if (nla[NFTA_FLOWTABLE_HANDLE]) {
+		attr = nla[NFTA_FLOWTABLE_HANDLE];
+		flowtable = nft_flowtable_lookup_byhandle(table, attr, genmask);
+	} else {
+		attr = nla[NFTA_FLOWTABLE_NAME];
+		flowtable = nft_flowtable_lookup(table, attr, genmask);
+	}
+
+	if (IS_ERR(flowtable)) {
+		NL_SET_BAD_ATTR(extack, attr);
+		return PTR_ERR(flowtable);
+	}
+	if (flowtable->use > 0) {
+		NL_SET_BAD_ATTR(extack, attr);
 		return -EBUSY;
+	}
 
 	nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
 
-- 
cgit v1.2.3-59-g8ed1b


From 71cc0873e0e0a4c6dca899c42e3ac143f7960d8e Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Tue, 3 Apr 2018 23:15:39 +0200
Subject: netfilter: nf_tables: Simplify set backend selection

Drop nft_set_type's ability to act as a container of multiple backend
implementations it chooses from. Instead consolidate the whole selection
logic in nft_select_set_ops() and the actual backend provided estimate()
callback.

This turns nf_tables_set_types into a list containing all available
backends which is traversed when selecting one matching userspace
requested criteria.

Also, this change allows to embed nft_set_ops structure into
nft_set_type and pull flags field into the latter as it's only used
during selection phase.

A crucial part of this change is to make sure the new layout respects
hash backend constraints formerly enforced by nft_hash_select_ops()
function: This is achieved by introduction of a specific estimate()
callback for nft_hash_fast_ops which returns false for key lengths != 4.
In turn, nft_hash_estimate() is changed to return false for key lengths
== 4 so it won't be chosen by accident. Also, both callbacks must return
false for unbounded sets as their size estimate depends on a known
maximum element count.

Note that this patch partially reverts commit 4f2921ca21b71 ("netfilter:
nf_tables: meter: pick a set backend that supports updates") by making
nft_set_ops_candidate() not explicitly look for an update callback but
make NFT_SET_EVAL a regular backend feature flag which is checked along
with the others. This way all feature requirements are checked in one
go.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  34 ++++-----
 net/netfilter/nf_tables_api.c     |  25 +++----
 net/netfilter/nft_set_bitmap.c    |  34 ++++-----
 net/netfilter/nft_set_hash.c      | 153 +++++++++++++++++++++-----------------
 net/netfilter/nft_set_rbtree.c    |  36 ++++-----
 5 files changed, 139 insertions(+), 143 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 123e82a2f8bb..de77d36e36b3 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -275,23 +275,6 @@ struct nft_set_estimate {
 	enum nft_set_class	space;
 };
 
-/**
- *      struct nft_set_type - nf_tables set type
- *
- *      @select_ops: function to select nft_set_ops
- *      @ops: default ops, used when no select_ops functions is present
- *      @list: used internally
- *      @owner: module reference
- */
-struct nft_set_type {
-	const struct nft_set_ops	*(*select_ops)(const struct nft_ctx *,
-						       const struct nft_set_desc *desc,
-						       u32 flags);
-	const struct nft_set_ops	*ops;
-	struct list_head		list;
-	struct module			*owner;
-};
-
 struct nft_set_ext;
 struct nft_expr;
 
@@ -310,7 +293,6 @@ struct nft_expr;
  *	@init: initialize private data of new set instance
  *	@destroy: destroy private data of set instance
  *	@elemsize: element private size
- *	@features: features supported by the implementation
  */
 struct nft_set_ops {
 	bool				(*lookup)(const struct net *net,
@@ -361,9 +343,23 @@ struct nft_set_ops {
 	void				(*destroy)(const struct nft_set *set);
 
 	unsigned int			elemsize;
+};
+
+/**
+ *      struct nft_set_type - nf_tables set type
+ *
+ *      @ops: set ops for this type
+ *      @list: used internally
+ *      @owner: module reference
+ *      @features: features supported by the implementation
+ */
+struct nft_set_type {
+	const struct nft_set_ops	ops;
+	struct list_head		list;
+	struct module			*owner;
 	u32				features;
-	const struct nft_set_type	*type;
 };
+#define to_set_type(o) container_of(o, struct nft_set_type, ops)
 
 int nft_register_set(struct nft_set_type *type);
 void nft_unregister_set(struct nft_set_type *type);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 2f14cadd9922..9ce35acf491d 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2523,14 +2523,12 @@ void nft_unregister_set(struct nft_set_type *type)
 EXPORT_SYMBOL_GPL(nft_unregister_set);
 
 #define NFT_SET_FEATURES	(NFT_SET_INTERVAL | NFT_SET_MAP | \
-				 NFT_SET_TIMEOUT | NFT_SET_OBJECT)
+				 NFT_SET_TIMEOUT | NFT_SET_OBJECT | \
+				 NFT_SET_EVAL)
 
-static bool nft_set_ops_candidate(const struct nft_set_ops *ops, u32 flags)
+static bool nft_set_ops_candidate(const struct nft_set_type *type, u32 flags)
 {
-	if ((flags & NFT_SET_EVAL) && !ops->update)
-		return false;
-
-	return (flags & ops->features) == (flags & NFT_SET_FEATURES);
+	return (flags & type->features) == (flags & NFT_SET_FEATURES);
 }
 
 /*
@@ -2567,14 +2565,9 @@ nft_select_set_ops(const struct nft_ctx *ctx,
 	best.space  = ~0;
 
 	list_for_each_entry(type, &nf_tables_set_types, list) {
-		if (!type->select_ops)
-			ops = type->ops;
-		else
-			ops = type->select_ops(ctx, desc, flags);
-		if (!ops)
-			continue;
+		ops = &type->ops;
 
-		if (!nft_set_ops_candidate(ops, flags))
+		if (!nft_set_ops_candidate(type, flags))
 			continue;
 		if (!ops->estimate(desc, flags, &est))
 			continue;
@@ -2605,7 +2598,7 @@ nft_select_set_ops(const struct nft_ctx *ctx,
 		if (!try_module_get(type->owner))
 			continue;
 		if (bops != NULL)
-			module_put(bops->type->owner);
+			module_put(to_set_type(bops)->owner);
 
 		bops = ops;
 		best = est;
@@ -3247,14 +3240,14 @@ err3:
 err2:
 	kvfree(set);
 err1:
-	module_put(ops->type->owner);
+	module_put(to_set_type(ops)->owner);
 	return err;
 }
 
 static void nft_set_destroy(struct nft_set *set)
 {
 	set->ops->destroy(set);
-	module_put(set->ops->type->owner);
+	module_put(to_set_type(set->ops)->owner);
 	kfree(set->name);
 	kvfree(set);
 }
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index 45fb2752fb63..d6626e01c7ee 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -296,27 +296,23 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
 	return true;
 }
 
-static struct nft_set_type nft_bitmap_type;
-static struct nft_set_ops nft_bitmap_ops __read_mostly = {
-	.type		= &nft_bitmap_type,
-	.privsize	= nft_bitmap_privsize,
-	.elemsize	= offsetof(struct nft_bitmap_elem, ext),
-	.estimate	= nft_bitmap_estimate,
-	.init		= nft_bitmap_init,
-	.destroy	= nft_bitmap_destroy,
-	.insert		= nft_bitmap_insert,
-	.remove		= nft_bitmap_remove,
-	.deactivate	= nft_bitmap_deactivate,
-	.flush		= nft_bitmap_flush,
-	.activate	= nft_bitmap_activate,
-	.lookup		= nft_bitmap_lookup,
-	.walk		= nft_bitmap_walk,
-	.get		= nft_bitmap_get,
-};
-
 static struct nft_set_type nft_bitmap_type __read_mostly = {
-	.ops		= &nft_bitmap_ops,
 	.owner		= THIS_MODULE,
+	.ops		= {
+		.privsize	= nft_bitmap_privsize,
+		.elemsize	= offsetof(struct nft_bitmap_elem, ext),
+		.estimate	= nft_bitmap_estimate,
+		.init		= nft_bitmap_init,
+		.destroy	= nft_bitmap_destroy,
+		.insert		= nft_bitmap_insert,
+		.remove		= nft_bitmap_remove,
+		.deactivate	= nft_bitmap_deactivate,
+		.flush		= nft_bitmap_flush,
+		.activate	= nft_bitmap_activate,
+		.lookup		= nft_bitmap_lookup,
+		.walk		= nft_bitmap_walk,
+		.get		= nft_bitmap_get,
+	},
 };
 
 static int __init nft_bitmap_module_init(void)
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index fc9c6d5d64cd..dbf1f4ad077c 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -605,6 +605,12 @@ static void nft_hash_destroy(const struct nft_set *set)
 static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
 			      struct nft_set_estimate *est)
 {
+	if (!desc->size)
+		return false;
+
+	if (desc->klen == 4)
+		return false;
+
 	est->size   = sizeof(struct nft_hash) +
 		      nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
 		      desc->size * sizeof(struct nft_hash_elem);
@@ -614,91 +620,100 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
 	return true;
 }
 
-static struct nft_set_type nft_hash_type;
-static struct nft_set_ops nft_rhash_ops __read_mostly = {
-	.type		= &nft_hash_type,
-	.privsize       = nft_rhash_privsize,
-	.elemsize	= offsetof(struct nft_rhash_elem, ext),
-	.estimate	= nft_rhash_estimate,
-	.init		= nft_rhash_init,
-	.destroy	= nft_rhash_destroy,
-	.insert		= nft_rhash_insert,
-	.activate	= nft_rhash_activate,
-	.deactivate	= nft_rhash_deactivate,
-	.flush		= nft_rhash_flush,
-	.remove		= nft_rhash_remove,
-	.lookup		= nft_rhash_lookup,
-	.update		= nft_rhash_update,
-	.walk		= nft_rhash_walk,
-	.get		= nft_rhash_get,
-	.features	= NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
-};
+static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features,
+			      struct nft_set_estimate *est)
+{
+	if (!desc->size)
+		return false;
 
-static struct nft_set_ops nft_hash_ops __read_mostly = {
-	.type		= &nft_hash_type,
-	.privsize       = nft_hash_privsize,
-	.elemsize	= offsetof(struct nft_hash_elem, ext),
-	.estimate	= nft_hash_estimate,
-	.init		= nft_hash_init,
-	.destroy	= nft_hash_destroy,
-	.insert		= nft_hash_insert,
-	.activate	= nft_hash_activate,
-	.deactivate	= nft_hash_deactivate,
-	.flush		= nft_hash_flush,
-	.remove		= nft_hash_remove,
-	.lookup		= nft_hash_lookup,
-	.walk		= nft_hash_walk,
-	.get		= nft_hash_get,
-	.features	= NFT_SET_MAP | NFT_SET_OBJECT,
-};
+	if (desc->klen != 4)
+		return false;
 
-static struct nft_set_ops nft_hash_fast_ops __read_mostly = {
-	.type		= &nft_hash_type,
-	.privsize       = nft_hash_privsize,
-	.elemsize	= offsetof(struct nft_hash_elem, ext),
-	.estimate	= nft_hash_estimate,
-	.init		= nft_hash_init,
-	.destroy	= nft_hash_destroy,
-	.insert		= nft_hash_insert,
-	.activate	= nft_hash_activate,
-	.deactivate	= nft_hash_deactivate,
-	.flush		= nft_hash_flush,
-	.remove		= nft_hash_remove,
-	.lookup		= nft_hash_lookup_fast,
-	.walk		= nft_hash_walk,
-	.get		= nft_hash_get,
-	.features	= NFT_SET_MAP | NFT_SET_OBJECT,
-};
-
-static const struct nft_set_ops *
-nft_hash_select_ops(const struct nft_ctx *ctx, const struct nft_set_desc *desc,
-		    u32 flags)
-{
-	if (desc->size && !(flags & (NFT_SET_EVAL | NFT_SET_TIMEOUT))) {
-		switch (desc->klen) {
-		case 4:
-			return &nft_hash_fast_ops;
-		default:
-			return &nft_hash_ops;
-		}
-	}
+	est->size   = sizeof(struct nft_hash) +
+		      nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
+		      desc->size * sizeof(struct nft_hash_elem);
+	est->lookup = NFT_SET_CLASS_O_1;
+	est->space  = NFT_SET_CLASS_O_N;
 
-	return &nft_rhash_ops;
+	return true;
 }
 
+static struct nft_set_type nft_rhash_type __read_mostly = {
+	.owner		= THIS_MODULE,
+	.features	= NFT_SET_MAP | NFT_SET_OBJECT |
+			  NFT_SET_TIMEOUT | NFT_SET_EVAL,
+	.ops		= {
+		.privsize       = nft_rhash_privsize,
+		.elemsize	= offsetof(struct nft_rhash_elem, ext),
+		.estimate	= nft_rhash_estimate,
+		.init		= nft_rhash_init,
+		.destroy	= nft_rhash_destroy,
+		.insert		= nft_rhash_insert,
+		.activate	= nft_rhash_activate,
+		.deactivate	= nft_rhash_deactivate,
+		.flush		= nft_rhash_flush,
+		.remove		= nft_rhash_remove,
+		.lookup		= nft_rhash_lookup,
+		.update		= nft_rhash_update,
+		.walk		= nft_rhash_walk,
+		.get		= nft_rhash_get,
+	},
+};
+
 static struct nft_set_type nft_hash_type __read_mostly = {
-	.select_ops	= nft_hash_select_ops,
 	.owner		= THIS_MODULE,
+	.features	= NFT_SET_MAP | NFT_SET_OBJECT,
+	.ops		= {
+		.privsize       = nft_hash_privsize,
+		.elemsize	= offsetof(struct nft_hash_elem, ext),
+		.estimate	= nft_hash_estimate,
+		.init		= nft_hash_init,
+		.destroy	= nft_hash_destroy,
+		.insert		= nft_hash_insert,
+		.activate	= nft_hash_activate,
+		.deactivate	= nft_hash_deactivate,
+		.flush		= nft_hash_flush,
+		.remove		= nft_hash_remove,
+		.lookup		= nft_hash_lookup,
+		.walk		= nft_hash_walk,
+		.get		= nft_hash_get,
+	},
+};
+
+static struct nft_set_type nft_hash_fast_type __read_mostly = {
+	.owner		= THIS_MODULE,
+	.features	= NFT_SET_MAP | NFT_SET_OBJECT,
+	.ops		= {
+		.privsize       = nft_hash_privsize,
+		.elemsize	= offsetof(struct nft_hash_elem, ext),
+		.estimate	= nft_hash_fast_estimate,
+		.init		= nft_hash_init,
+		.destroy	= nft_hash_destroy,
+		.insert		= nft_hash_insert,
+		.activate	= nft_hash_activate,
+		.deactivate	= nft_hash_deactivate,
+		.flush		= nft_hash_flush,
+		.remove		= nft_hash_remove,
+		.lookup		= nft_hash_lookup_fast,
+		.walk		= nft_hash_walk,
+		.get		= nft_hash_get,
+	},
 };
 
 static int __init nft_hash_module_init(void)
 {
-	return nft_register_set(&nft_hash_type);
+	if (nft_register_set(&nft_hash_fast_type) ||
+	    nft_register_set(&nft_hash_type) ||
+	    nft_register_set(&nft_rhash_type))
+		return 1;
+	return 0;
 }
 
 static void __exit nft_hash_module_exit(void)
 {
+	nft_unregister_set(&nft_rhash_type);
 	nft_unregister_set(&nft_hash_type);
+	nft_unregister_set(&nft_hash_fast_type);
 }
 
 module_init(nft_hash_module_init);
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index e6f08bc5f359..22c57d7612c4 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -393,28 +393,24 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
 	return true;
 }
 
-static struct nft_set_type nft_rbtree_type;
-static struct nft_set_ops nft_rbtree_ops __read_mostly = {
-	.type		= &nft_rbtree_type,
-	.privsize	= nft_rbtree_privsize,
-	.elemsize	= offsetof(struct nft_rbtree_elem, ext),
-	.estimate	= nft_rbtree_estimate,
-	.init		= nft_rbtree_init,
-	.destroy	= nft_rbtree_destroy,
-	.insert		= nft_rbtree_insert,
-	.remove		= nft_rbtree_remove,
-	.deactivate	= nft_rbtree_deactivate,
-	.flush		= nft_rbtree_flush,
-	.activate	= nft_rbtree_activate,
-	.lookup		= nft_rbtree_lookup,
-	.walk		= nft_rbtree_walk,
-	.get		= nft_rbtree_get,
-	.features	= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT,
-};
-
 static struct nft_set_type nft_rbtree_type __read_mostly = {
-	.ops		= &nft_rbtree_ops,
 	.owner		= THIS_MODULE,
+	.features	= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT,
+	.ops		= {
+		.privsize	= nft_rbtree_privsize,
+		.elemsize	= offsetof(struct nft_rbtree_elem, ext),
+		.estimate	= nft_rbtree_estimate,
+		.init		= nft_rbtree_init,
+		.destroy	= nft_rbtree_destroy,
+		.insert		= nft_rbtree_insert,
+		.remove		= nft_rbtree_remove,
+		.deactivate	= nft_rbtree_deactivate,
+		.flush		= nft_rbtree_flush,
+		.activate	= nft_rbtree_activate,
+		.lookup		= nft_rbtree_lookup,
+		.walk		= nft_rbtree_walk,
+		.get		= nft_rbtree_get,
+	},
 };
 
 static int __init nft_rbtree_module_init(void)
-- 
cgit v1.2.3-59-g8ed1b


From 2eb0f624b709e78ec8e2f4c3412947703db99301 Mon Sep 17 00:00:00 2001
From: Thierry Du Tre <thierry@dtsystems.be>
Date: Wed, 4 Apr 2018 15:38:22 +0200
Subject: netfilter: add NAT support for shifted portmap ranges

This is a patch proposal to support shifted ranges in portmaps.  (i.e. tcp/udp
incoming port 5000-5100 on WAN redirected to LAN 192.168.1.5:2000-2100)

Currently DNAT only works for single port or identical port ranges.  (i.e.
ports 5000-5100 on WAN interface redirected to a LAN host while original
destination port is not altered) When different port ranges are configured,
either 'random' mode should be used, or else all incoming connections are
mapped onto the first port in the redirect range. (in described example
WAN:5000-5100 will all be mapped to 192.168.1.5:2000)

This patch introduces a new mode indicated by flag NF_NAT_RANGE_PROTO_OFFSET
which uses a base port value to calculate an offset with the destination port
present in the incoming stream. That offset is then applied as index within the
redirect port range (index modulo rangewidth to handle range overflow).

In described example the base port would be 5000. An incoming stream with
destination port 5004 would result in an offset value 4 which means that the
NAT'ed stream will be using destination port 2004.

Other possibilities include deterministic mapping of larger or multiple ranges
to a smaller range : WAN:5000-5999 -> LAN:5000-5099 (maps WAN port 5*xx to port
51xx)

This patch does not change any current behavior. It just adds new NAT proto
range functionality which must be selected via the specific flag when intended
to use.

A patch for iptables (libipt_DNAT.c + libip6t_DNAT.c) will also be proposed
which makes this functionality immediately available.

Signed-off-by: Thierry Du Tre <thierry@dtsystems.be>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/ipv4/nf_nat_masquerade.h |  2 +-
 include/net/netfilter/ipv6/nf_nat_masquerade.h |  2 +-
 include/net/netfilter/nf_nat.h                 |  2 +-
 include/net/netfilter/nf_nat_l3proto.h         |  4 +-
 include/net/netfilter/nf_nat_l4proto.h         |  8 +--
 include/net/netfilter/nf_nat_redirect.h        |  2 +-
 include/uapi/linux/netfilter/nf_nat.h          | 12 ++++-
 net/ipv4/netfilter/ipt_MASQUERADE.c            |  2 +-
 net/ipv4/netfilter/nf_nat_h323.c               |  4 +-
 net/ipv4/netfilter/nf_nat_l3proto_ipv4.c       |  4 +-
 net/ipv4/netfilter/nf_nat_masquerade_ipv4.c    |  4 +-
 net/ipv4/netfilter/nf_nat_pptp.c               |  2 +-
 net/ipv4/netfilter/nf_nat_proto_gre.c          |  2 +-
 net/ipv4/netfilter/nf_nat_proto_icmp.c         |  2 +-
 net/ipv4/netfilter/nft_masq_ipv4.c             |  2 +-
 net/ipv6/netfilter/ip6t_MASQUERADE.c           |  2 +-
 net/ipv6/netfilter/nf_nat_l3proto_ipv6.c       |  4 +-
 net/ipv6/netfilter/nf_nat_masquerade_ipv6.c    |  4 +-
 net/ipv6/netfilter/nf_nat_proto_icmpv6.c       |  2 +-
 net/ipv6/netfilter/nft_masq_ipv6.c             |  2 +-
 net/ipv6/netfilter/nft_redir_ipv6.c            |  2 +-
 net/netfilter/nf_nat_core.c                    | 27 +++++-----
 net/netfilter/nf_nat_helper.c                  |  2 +-
 net/netfilter/nf_nat_proto_common.c            |  9 ++--
 net/netfilter/nf_nat_proto_dccp.c              |  2 +-
 net/netfilter/nf_nat_proto_sctp.c              |  2 +-
 net/netfilter/nf_nat_proto_tcp.c               |  2 +-
 net/netfilter/nf_nat_proto_udp.c               |  4 +-
 net/netfilter/nf_nat_proto_unknown.c           |  2 +-
 net/netfilter/nf_nat_redirect.c                |  6 +--
 net/netfilter/nf_nat_sip.c                     |  2 +-
 net/netfilter/nft_nat.c                        |  2 +-
 net/netfilter/xt_NETMAP.c                      |  8 +--
 net/netfilter/xt_REDIRECT.c                    |  2 +-
 net/netfilter/xt_nat.c                         | 72 +++++++++++++++++++++++---
 net/openvswitch/conntrack.c                    |  4 +-
 36 files changed, 145 insertions(+), 71 deletions(-)

diff --git a/include/net/netfilter/ipv4/nf_nat_masquerade.h b/include/net/netfilter/ipv4/nf_nat_masquerade.h
index ebd869473603..cd24be4c4a99 100644
--- a/include/net/netfilter/ipv4/nf_nat_masquerade.h
+++ b/include/net/netfilter/ipv4/nf_nat_masquerade.h
@@ -6,7 +6,7 @@
 
 unsigned int
 nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
-		       const struct nf_nat_range *range,
+		       const struct nf_nat_range2 *range,
 		       const struct net_device *out);
 
 void nf_nat_masquerade_ipv4_register_notifier(void);
diff --git a/include/net/netfilter/ipv6/nf_nat_masquerade.h b/include/net/netfilter/ipv6/nf_nat_masquerade.h
index 1ed4f2631ed6..0c3b5ebf0bb8 100644
--- a/include/net/netfilter/ipv6/nf_nat_masquerade.h
+++ b/include/net/netfilter/ipv6/nf_nat_masquerade.h
@@ -3,7 +3,7 @@
 #define _NF_NAT_MASQUERADE_IPV6_H_
 
 unsigned int
-nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range,
+nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
 		       const struct net_device *out);
 void nf_nat_masquerade_ipv6_register_notifier(void);
 void nf_nat_masquerade_ipv6_unregister_notifier(void);
diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h
index 207a467e7ca6..da3d601cadee 100644
--- a/include/net/netfilter/nf_nat.h
+++ b/include/net/netfilter/nf_nat.h
@@ -39,7 +39,7 @@ struct nf_conn_nat {
 
 /* Set up the info structure to map into this range. */
 unsigned int nf_nat_setup_info(struct nf_conn *ct,
-			       const struct nf_nat_range *range,
+			       const struct nf_nat_range2 *range,
 			       enum nf_nat_manip_type maniptype);
 
 extern unsigned int nf_nat_alloc_null_binding(struct nf_conn *ct,
diff --git a/include/net/netfilter/nf_nat_l3proto.h b/include/net/netfilter/nf_nat_l3proto.h
index ce7c2b4e64bb..ac47098a61dc 100644
--- a/include/net/netfilter/nf_nat_l3proto.h
+++ b/include/net/netfilter/nf_nat_l3proto.h
@@ -7,7 +7,7 @@ struct nf_nat_l3proto {
 	u8	l3proto;
 
 	bool	(*in_range)(const struct nf_conntrack_tuple *t,
-			    const struct nf_nat_range *range);
+			    const struct nf_nat_range2 *range);
 
 	u32 	(*secure_port)(const struct nf_conntrack_tuple *t, __be16);
 
@@ -33,7 +33,7 @@ struct nf_nat_l3proto {
 				  struct flowi *fl);
 
 	int	(*nlattr_to_range)(struct nlattr *tb[],
-				   struct nf_nat_range *range);
+				   struct nf_nat_range2 *range);
 };
 
 int nf_nat_l3proto_register(const struct nf_nat_l3proto *);
diff --git a/include/net/netfilter/nf_nat_l4proto.h b/include/net/netfilter/nf_nat_l4proto.h
index 67835ff8a2d9..b4d6b29bca62 100644
--- a/include/net/netfilter/nf_nat_l4proto.h
+++ b/include/net/netfilter/nf_nat_l4proto.h
@@ -34,12 +34,12 @@ struct nf_nat_l4proto {
 	 */
 	void (*unique_tuple)(const struct nf_nat_l3proto *l3proto,
 			     struct nf_conntrack_tuple *tuple,
-			     const struct nf_nat_range *range,
+			     const struct nf_nat_range2 *range,
 			     enum nf_nat_manip_type maniptype,
 			     const struct nf_conn *ct);
 
 	int (*nlattr_to_range)(struct nlattr *tb[],
-			       struct nf_nat_range *range);
+			       struct nf_nat_range2 *range);
 };
 
 /* Protocol registration. */
@@ -72,11 +72,11 @@ bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple,
 
 void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
 				 struct nf_conntrack_tuple *tuple,
-				 const struct nf_nat_range *range,
+				 const struct nf_nat_range2 *range,
 				 enum nf_nat_manip_type maniptype,
 				 const struct nf_conn *ct, u16 *rover);
 
 int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[],
-				   struct nf_nat_range *range);
+				   struct nf_nat_range2 *range);
 
 #endif /*_NF_NAT_L4PROTO_H*/
diff --git a/include/net/netfilter/nf_nat_redirect.h b/include/net/netfilter/nf_nat_redirect.h
index 5ddabb08c472..c129aacc8ae8 100644
--- a/include/net/netfilter/nf_nat_redirect.h
+++ b/include/net/netfilter/nf_nat_redirect.h
@@ -7,7 +7,7 @@ nf_nat_redirect_ipv4(struct sk_buff *skb,
 		     const struct nf_nat_ipv4_multi_range_compat *mr,
 		     unsigned int hooknum);
 unsigned int
-nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range,
+nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
 		     unsigned int hooknum);
 
 #endif /* _NF_NAT_REDIRECT_H_ */
diff --git a/include/uapi/linux/netfilter/nf_nat.h b/include/uapi/linux/netfilter/nf_nat.h
index a33000da7229..4a95c0db14d4 100644
--- a/include/uapi/linux/netfilter/nf_nat.h
+++ b/include/uapi/linux/netfilter/nf_nat.h
@@ -10,6 +10,7 @@
 #define NF_NAT_RANGE_PROTO_RANDOM		(1 << 2)
 #define NF_NAT_RANGE_PERSISTENT			(1 << 3)
 #define NF_NAT_RANGE_PROTO_RANDOM_FULLY		(1 << 4)
+#define NF_NAT_RANGE_PROTO_OFFSET		(1 << 5)
 
 #define NF_NAT_RANGE_PROTO_RANDOM_ALL		\
 	(NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY)
@@ -17,7 +18,7 @@
 #define NF_NAT_RANGE_MASK					\
 	(NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED |	\
 	 NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PERSISTENT |	\
-	 NF_NAT_RANGE_PROTO_RANDOM_FULLY)
+	 NF_NAT_RANGE_PROTO_RANDOM_FULLY | NF_NAT_RANGE_PROTO_OFFSET)
 
 struct nf_nat_ipv4_range {
 	unsigned int			flags;
@@ -40,4 +41,13 @@ struct nf_nat_range {
 	union nf_conntrack_man_proto	max_proto;
 };
 
+struct nf_nat_range2 {
+	unsigned int			flags;
+	union nf_inet_addr		min_addr;
+	union nf_inet_addr		max_addr;
+	union nf_conntrack_man_proto	min_proto;
+	union nf_conntrack_man_proto	max_proto;
+	union nf_conntrack_man_proto	base_proto;
+};
+
 #endif /* _NETFILTER_NF_NAT_H */
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index a03e4e7ef5f9..ce1512b02cb2 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -47,7 +47,7 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par)
 static unsigned int
 masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
-	struct nf_nat_range range;
+	struct nf_nat_range2 range;
 	const struct nf_nat_ipv4_multi_range_compat *mr;
 
 	mr = par->targinfo;
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index ac8342dcb55e..4e6b53ab6c33 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -395,7 +395,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
 static void ip_nat_q931_expect(struct nf_conn *new,
 			       struct nf_conntrack_expect *this)
 {
-	struct nf_nat_range range;
+	struct nf_nat_range2 range;
 
 	if (this->tuple.src.u3.ip != 0) {	/* Only accept calls from GK */
 		nf_nat_follow_master(new, this);
@@ -497,7 +497,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
 static void ip_nat_callforwarding_expect(struct nf_conn *new,
 					 struct nf_conntrack_expect *this)
 {
-	struct nf_nat_range range;
+	struct nf_nat_range2 range;
 
 	/* This must be a fresh one. */
 	BUG_ON(new->status & IPS_NAT_DONE_MASK);
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index f7ff6a364d7b..4346336cee4c 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -63,7 +63,7 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
 #endif /* CONFIG_XFRM */
 
 static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t,
-				 const struct nf_nat_range *range)
+				 const struct nf_nat_range2 *range)
 {
 	return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
 	       ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);
@@ -143,7 +143,7 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
 
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
-				       struct nf_nat_range *range)
+				       struct nf_nat_range2 *range)
 {
 	if (tb[CTA_NAT_V4_MINIP]) {
 		range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]);
diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
index 0c366aad89cb..f538c5001547 100644
--- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
@@ -24,13 +24,13 @@
 
 unsigned int
 nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
-		       const struct nf_nat_range *range,
+		       const struct nf_nat_range2 *range,
 		       const struct net_device *out)
 {
 	struct nf_conn *ct;
 	struct nf_conn_nat *nat;
 	enum ip_conntrack_info ctinfo;
-	struct nf_nat_range newrange;
+	struct nf_nat_range2 newrange;
 	const struct rtable *rt;
 	__be32 newsrc, nh;
 
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 8a69363b4884..5d259a12e25f 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -48,7 +48,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
 	struct nf_conntrack_tuple t = {};
 	const struct nf_ct_pptp_master *ct_pptp_info;
 	const struct nf_nat_pptp *nat_pptp_info;
-	struct nf_nat_range range;
+	struct nf_nat_range2 range;
 	struct nf_conn_nat *nat;
 
 	nat = nf_ct_nat_ext_add(ct);
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index edf05002d674..00fda6331ce5 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -41,7 +41,7 @@ MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
 static void
 gre_unique_tuple(const struct nf_nat_l3proto *l3proto,
 		 struct nf_conntrack_tuple *tuple,
-		 const struct nf_nat_range *range,
+		 const struct nf_nat_range2 *range,
 		 enum nf_nat_manip_type maniptype,
 		 const struct nf_conn *ct)
 {
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index 7b98baa13ede..6d7cf1d79baf 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -30,7 +30,7 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple,
 static void
 icmp_unique_tuple(const struct nf_nat_l3proto *l3proto,
 		  struct nf_conntrack_tuple *tuple,
-		  const struct nf_nat_range *range,
+		  const struct nf_nat_range2 *range,
 		  enum nf_nat_manip_type maniptype,
 		  const struct nf_conn *ct)
 {
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
index f18677277119..f1193e1e928a 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -21,7 +21,7 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr,
 			       const struct nft_pktinfo *pkt)
 {
 	struct nft_masq *priv = nft_expr_priv(expr);
-	struct nf_nat_range range;
+	struct nf_nat_range2 range;
 
 	memset(&range, 0, sizeof(range));
 	range.flags = priv->flags;
diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c
index 92c0047e7e33..491f808e356a 100644
--- a/net/ipv6/netfilter/ip6t_MASQUERADE.c
+++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c
@@ -29,7 +29,7 @@ masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 
 static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par)
 {
-	const struct nf_nat_range *range = par->targinfo;
+	const struct nf_nat_range2 *range = par->targinfo;
 
 	if (range->flags & NF_NAT_RANGE_MAP_IPS)
 		return -EINVAL;
diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
index 6b7f075f811f..56d75eb5448f 100644
--- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
@@ -62,7 +62,7 @@ static void nf_nat_ipv6_decode_session(struct sk_buff *skb,
 #endif
 
 static bool nf_nat_ipv6_in_range(const struct nf_conntrack_tuple *t,
-				 const struct nf_nat_range *range)
+				 const struct nf_nat_range2 *range)
 {
 	return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 &&
 	       ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0;
@@ -151,7 +151,7 @@ static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb,
 
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[],
-				       struct nf_nat_range *range)
+				       struct nf_nat_range2 *range)
 {
 	if (tb[CTA_NAT_V6_MINIP]) {
 		nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP],
diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
index 98f61fcb9108..9dfc2b90c362 100644
--- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
@@ -26,14 +26,14 @@
 static atomic_t v6_worker_count;
 
 unsigned int
-nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range,
+nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
 		       const struct net_device *out)
 {
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn_nat *nat;
 	struct in6_addr src;
 	struct nf_conn *ct;
-	struct nf_nat_range newrange;
+	struct nf_nat_range2 newrange;
 
 	ct = nf_ct_get(skb, &ctinfo);
 	WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
diff --git a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
index 57593b00c5b4..d9bf42ba44fa 100644
--- a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
@@ -32,7 +32,7 @@ icmpv6_in_range(const struct nf_conntrack_tuple *tuple,
 static void
 icmpv6_unique_tuple(const struct nf_nat_l3proto *l3proto,
 		    struct nf_conntrack_tuple *tuple,
-		    const struct nf_nat_range *range,
+		    const struct nf_nat_range2 *range,
 		    enum nf_nat_manip_type maniptype,
 		    const struct nf_conn *ct)
 {
diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c
index 4146536e9c15..dd0122f3cffe 100644
--- a/net/ipv6/netfilter/nft_masq_ipv6.c
+++ b/net/ipv6/netfilter/nft_masq_ipv6.c
@@ -22,7 +22,7 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr,
 			       const struct nft_pktinfo *pkt)
 {
 	struct nft_masq *priv = nft_expr_priv(expr);
-	struct nf_nat_range range;
+	struct nf_nat_range2 range;
 
 	memset(&range, 0, sizeof(range));
 	range.flags = priv->flags;
diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c b/net/ipv6/netfilter/nft_redir_ipv6.c
index a27e424f690d..74269865acc8 100644
--- a/net/ipv6/netfilter/nft_redir_ipv6.c
+++ b/net/ipv6/netfilter/nft_redir_ipv6.c
@@ -22,7 +22,7 @@ static void nft_redir_ipv6_eval(const struct nft_expr *expr,
 				const struct nft_pktinfo *pkt)
 {
 	struct nft_redir *priv = nft_expr_priv(expr);
-	struct nf_nat_range range;
+	struct nf_nat_range2 range;
 
 	memset(&range, 0, sizeof(range));
 	if (priv->sreg_proto_min) {
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 617693ff9f4c..37b3c9913b08 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -157,7 +157,7 @@ EXPORT_SYMBOL(nf_nat_used_tuple);
 static int in_range(const struct nf_nat_l3proto *l3proto,
 		    const struct nf_nat_l4proto *l4proto,
 		    const struct nf_conntrack_tuple *tuple,
-		    const struct nf_nat_range *range)
+		    const struct nf_nat_range2 *range)
 {
 	/* If we are supposed to map IPs, then we must be in the
 	 * range specified, otherwise let this drag us onto a new src IP.
@@ -194,7 +194,7 @@ find_appropriate_src(struct net *net,
 		     const struct nf_nat_l4proto *l4proto,
 		     const struct nf_conntrack_tuple *tuple,
 		     struct nf_conntrack_tuple *result,
-		     const struct nf_nat_range *range)
+		     const struct nf_nat_range2 *range)
 {
 	unsigned int h = hash_by_src(net, tuple);
 	const struct nf_conn *ct;
@@ -224,7 +224,7 @@ find_appropriate_src(struct net *net,
 static void
 find_best_ips_proto(const struct nf_conntrack_zone *zone,
 		    struct nf_conntrack_tuple *tuple,
-		    const struct nf_nat_range *range,
+		    const struct nf_nat_range2 *range,
 		    const struct nf_conn *ct,
 		    enum nf_nat_manip_type maniptype)
 {
@@ -298,7 +298,7 @@ find_best_ips_proto(const struct nf_conntrack_zone *zone,
 static void
 get_unique_tuple(struct nf_conntrack_tuple *tuple,
 		 const struct nf_conntrack_tuple *orig_tuple,
-		 const struct nf_nat_range *range,
+		 const struct nf_nat_range2 *range,
 		 struct nf_conn *ct,
 		 enum nf_nat_manip_type maniptype)
 {
@@ -349,9 +349,10 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
 	/* Only bother mapping if it's not already in range and unique */
 	if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
 		if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
-			if (l4proto->in_range(tuple, maniptype,
-					      &range->min_proto,
-					      &range->max_proto) &&
+			if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
+			    l4proto->in_range(tuple, maniptype,
+			          &range->min_proto,
+			          &range->max_proto) &&
 			    (range->min_proto.all == range->max_proto.all ||
 			     !nf_nat_used_tuple(tuple, ct)))
 				goto out;
@@ -360,7 +361,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
 		}
 	}
 
-	/* Last change: get protocol to try to obtain unique tuple. */
+	/* Last chance: get protocol to try to obtain unique tuple. */
 	l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct);
 out:
 	rcu_read_unlock();
@@ -381,7 +382,7 @@ EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add);
 
 unsigned int
 nf_nat_setup_info(struct nf_conn *ct,
-		  const struct nf_nat_range *range,
+		  const struct nf_nat_range2 *range,
 		  enum nf_nat_manip_type maniptype)
 {
 	struct net *net = nf_ct_net(ct);
@@ -459,7 +460,7 @@ __nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip)
 		(manip == NF_NAT_MANIP_SRC ?
 		ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 :
 		ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3);
-	struct nf_nat_range range = {
+	struct nf_nat_range2 range = {
 		.flags		= NF_NAT_RANGE_MAP_IPS,
 		.min_addr	= ip,
 		.max_addr	= ip,
@@ -702,7 +703,7 @@ static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
 
 static int nfnetlink_parse_nat_proto(struct nlattr *attr,
 				     const struct nf_conn *ct,
-				     struct nf_nat_range *range)
+				     struct nf_nat_range2 *range)
 {
 	struct nlattr *tb[CTA_PROTONAT_MAX+1];
 	const struct nf_nat_l4proto *l4proto;
@@ -730,7 +731,7 @@ static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
 
 static int
 nfnetlink_parse_nat(const struct nlattr *nat,
-		    const struct nf_conn *ct, struct nf_nat_range *range,
+		    const struct nf_conn *ct, struct nf_nat_range2 *range,
 		    const struct nf_nat_l3proto *l3proto)
 {
 	struct nlattr *tb[CTA_NAT_MAX+1];
@@ -758,7 +759,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct,
 			  enum nf_nat_manip_type manip,
 			  const struct nlattr *attr)
 {
-	struct nf_nat_range range;
+	struct nf_nat_range2 range;
 	const struct nf_nat_l3proto *l3proto;
 	int err;
 
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
index 607a373379b4..99606baedda4 100644
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -191,7 +191,7 @@ EXPORT_SYMBOL(nf_nat_mangle_udp_packet);
 void nf_nat_follow_master(struct nf_conn *ct,
 			  struct nf_conntrack_expect *exp)
 {
-	struct nf_nat_range range;
+	struct nf_nat_range2 range;
 
 	/* This must be a fresh one. */
 	BUG_ON(ct->status & IPS_NAT_DONE_MASK);
diff --git a/net/netfilter/nf_nat_proto_common.c b/net/netfilter/nf_nat_proto_common.c
index 7d7466dbf663..5d849d835561 100644
--- a/net/netfilter/nf_nat_proto_common.c
+++ b/net/netfilter/nf_nat_proto_common.c
@@ -36,7 +36,7 @@ EXPORT_SYMBOL_GPL(nf_nat_l4proto_in_range);
 
 void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
 				 struct nf_conntrack_tuple *tuple,
-				 const struct nf_nat_range *range,
+				 const struct nf_nat_range2 *range,
 				 enum nf_nat_manip_type maniptype,
 				 const struct nf_conn *ct,
 				 u16 *rover)
@@ -83,6 +83,8 @@ void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
 						  : tuple->src.u.all);
 	} else if (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY) {
 		off = prandom_u32();
+	} else if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) {
+		off = (ntohs(*portptr) - ntohs(range->base_proto.all));
 	} else {
 		off = *rover;
 	}
@@ -91,7 +93,8 @@ void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
 		*portptr = htons(min + off % range_size);
 		if (++i != range_size && nf_nat_used_tuple(tuple, ct))
 			continue;
-		if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL))
+		if (!(range->flags & (NF_NAT_RANGE_PROTO_RANDOM_ALL|
+					NF_NAT_RANGE_PROTO_OFFSET)))
 			*rover = off;
 		return;
 	}
@@ -100,7 +103,7 @@ EXPORT_SYMBOL_GPL(nf_nat_l4proto_unique_tuple);
 
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[],
-				   struct nf_nat_range *range)
+				   struct nf_nat_range2 *range)
 {
 	if (tb[CTA_PROTONAT_PORT_MIN]) {
 		range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]);
diff --git a/net/netfilter/nf_nat_proto_dccp.c b/net/netfilter/nf_nat_proto_dccp.c
index 269fcd5dc34c..67ea0d83aa5a 100644
--- a/net/netfilter/nf_nat_proto_dccp.c
+++ b/net/netfilter/nf_nat_proto_dccp.c
@@ -23,7 +23,7 @@ static u_int16_t dccp_port_rover;
 static void
 dccp_unique_tuple(const struct nf_nat_l3proto *l3proto,
 		  struct nf_conntrack_tuple *tuple,
-		  const struct nf_nat_range *range,
+		  const struct nf_nat_range2 *range,
 		  enum nf_nat_manip_type maniptype,
 		  const struct nf_conn *ct)
 {
diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c
index c57ee3240b1d..1c5d9b65fbba 100644
--- a/net/netfilter/nf_nat_proto_sctp.c
+++ b/net/netfilter/nf_nat_proto_sctp.c
@@ -17,7 +17,7 @@ static u_int16_t nf_sctp_port_rover;
 static void
 sctp_unique_tuple(const struct nf_nat_l3proto *l3proto,
 		  struct nf_conntrack_tuple *tuple,
-		  const struct nf_nat_range *range,
+		  const struct nf_nat_range2 *range,
 		  enum nf_nat_manip_type maniptype,
 		  const struct nf_conn *ct)
 {
diff --git a/net/netfilter/nf_nat_proto_tcp.c b/net/netfilter/nf_nat_proto_tcp.c
index 4f8820fc5148..f15fcd475f98 100644
--- a/net/netfilter/nf_nat_proto_tcp.c
+++ b/net/netfilter/nf_nat_proto_tcp.c
@@ -23,7 +23,7 @@ static u16 tcp_port_rover;
 static void
 tcp_unique_tuple(const struct nf_nat_l3proto *l3proto,
 		 struct nf_conntrack_tuple *tuple,
-		 const struct nf_nat_range *range,
+		 const struct nf_nat_range2 *range,
 		 enum nf_nat_manip_type maniptype,
 		 const struct nf_conn *ct)
 {
diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c
index edd4a77dc09a..5790f70a83b2 100644
--- a/net/netfilter/nf_nat_proto_udp.c
+++ b/net/netfilter/nf_nat_proto_udp.c
@@ -22,7 +22,7 @@ static u16 udp_port_rover;
 static void
 udp_unique_tuple(const struct nf_nat_l3proto *l3proto,
 		 struct nf_conntrack_tuple *tuple,
-		 const struct nf_nat_range *range,
+		 const struct nf_nat_range2 *range,
 		 enum nf_nat_manip_type maniptype,
 		 const struct nf_conn *ct)
 {
@@ -100,7 +100,7 @@ static bool udplite_manip_pkt(struct sk_buff *skb,
 static void
 udplite_unique_tuple(const struct nf_nat_l3proto *l3proto,
 		     struct nf_conntrack_tuple *tuple,
-		     const struct nf_nat_range *range,
+		     const struct nf_nat_range2 *range,
 		     enum nf_nat_manip_type maniptype,
 		     const struct nf_conn *ct)
 {
diff --git a/net/netfilter/nf_nat_proto_unknown.c b/net/netfilter/nf_nat_proto_unknown.c
index 6e494d584412..c5db3e251232 100644
--- a/net/netfilter/nf_nat_proto_unknown.c
+++ b/net/netfilter/nf_nat_proto_unknown.c
@@ -27,7 +27,7 @@ static bool unknown_in_range(const struct nf_conntrack_tuple *tuple,
 
 static void unknown_unique_tuple(const struct nf_nat_l3proto *l3proto,
 				 struct nf_conntrack_tuple *tuple,
-				 const struct nf_nat_range *range,
+				 const struct nf_nat_range2 *range,
 				 enum nf_nat_manip_type maniptype,
 				 const struct nf_conn *ct)
 {
diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c
index 25b06b959118..7c4bb0a773ca 100644
--- a/net/netfilter/nf_nat_redirect.c
+++ b/net/netfilter/nf_nat_redirect.c
@@ -36,7 +36,7 @@ nf_nat_redirect_ipv4(struct sk_buff *skb,
 	struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
 	__be32 newdst;
-	struct nf_nat_range newrange;
+	struct nf_nat_range2 newrange;
 
 	WARN_ON(hooknum != NF_INET_PRE_ROUTING &&
 		hooknum != NF_INET_LOCAL_OUT);
@@ -82,10 +82,10 @@ EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv4);
 static const struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
 
 unsigned int
-nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range,
+nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
 		     unsigned int hooknum)
 {
-	struct nf_nat_range newrange;
+	struct nf_nat_range2 newrange;
 	struct in6_addr newdst;
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct;
diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c
index 791fac4fd745..1f3086074981 100644
--- a/net/netfilter/nf_nat_sip.c
+++ b/net/netfilter/nf_nat_sip.c
@@ -316,7 +316,7 @@ static void nf_nat_sip_seq_adjust(struct sk_buff *skb, unsigned int protoff,
 static void nf_nat_sip_expected(struct nf_conn *ct,
 				struct nf_conntrack_expect *exp)
 {
-	struct nf_nat_range range;
+	struct nf_nat_range2 range;
 
 	/* This must be a fresh one. */
 	BUG_ON(ct->status & IPS_NAT_DONE_MASK);
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index 1f36954c2ba9..c15807d10b91 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -43,7 +43,7 @@ static void nft_nat_eval(const struct nft_expr *expr,
 	const struct nft_nat *priv = nft_expr_priv(expr);
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo);
-	struct nf_nat_range range;
+	struct nf_nat_range2 range;
 
 	memset(&range, 0, sizeof(range));
 	if (priv->sreg_addr_min) {
diff --git a/net/netfilter/xt_NETMAP.c b/net/netfilter/xt_NETMAP.c
index 58aa9dd3c5b7..1d437875e15a 100644
--- a/net/netfilter/xt_NETMAP.c
+++ b/net/netfilter/xt_NETMAP.c
@@ -21,8 +21,8 @@
 static unsigned int
 netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 {
-	const struct nf_nat_range *range = par->targinfo;
-	struct nf_nat_range newrange;
+	const struct nf_nat_range2 *range = par->targinfo;
+	struct nf_nat_range2 newrange;
 	struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
 	union nf_inet_addr new_addr, netmask;
@@ -56,7 +56,7 @@ netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 
 static int netmap_tg6_checkentry(const struct xt_tgchk_param *par)
 {
-	const struct nf_nat_range *range = par->targinfo;
+	const struct nf_nat_range2 *range = par->targinfo;
 
 	if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
 		return -EINVAL;
@@ -75,7 +75,7 @@ netmap_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 	enum ip_conntrack_info ctinfo;
 	__be32 new_ip, netmask;
 	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
-	struct nf_nat_range newrange;
+	struct nf_nat_range2 newrange;
 
 	WARN_ON(xt_hooknum(par) != NF_INET_PRE_ROUTING &&
 		xt_hooknum(par) != NF_INET_POST_ROUTING &&
diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c
index 98a4c6d4f1cb..5ce9461e979c 100644
--- a/net/netfilter/xt_REDIRECT.c
+++ b/net/netfilter/xt_REDIRECT.c
@@ -36,7 +36,7 @@ redirect_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 
 static int redirect_tg6_checkentry(const struct xt_tgchk_param *par)
 {
-	const struct nf_nat_range *range = par->targinfo;
+	const struct nf_nat_range2 *range = par->targinfo;
 
 	if (range->flags & NF_NAT_RANGE_MAP_IPS)
 		return -EINVAL;
diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c
index bdb689cdc829..8af9707f8789 100644
--- a/net/netfilter/xt_nat.c
+++ b/net/netfilter/xt_nat.c
@@ -37,11 +37,12 @@ static void xt_nat_destroy(const struct xt_tgdtor_param *par)
 	nf_ct_netns_put(par->net, par->family);
 }
 
-static void xt_nat_convert_range(struct nf_nat_range *dst,
+static void xt_nat_convert_range(struct nf_nat_range2 *dst,
 				 const struct nf_nat_ipv4_range *src)
 {
 	memset(&dst->min_addr, 0, sizeof(dst->min_addr));
 	memset(&dst->max_addr, 0, sizeof(dst->max_addr));
+	memset(&dst->base_proto, 0, sizeof(dst->base_proto));
 
 	dst->flags	 = src->flags;
 	dst->min_addr.ip = src->min_ip;
@@ -54,7 +55,7 @@ static unsigned int
 xt_snat_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
-	struct nf_nat_range range;
+	struct nf_nat_range2 range;
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct;
 
@@ -71,7 +72,7 @@ static unsigned int
 xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
-	struct nf_nat_range range;
+	struct nf_nat_range2 range;
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct;
 
@@ -86,7 +87,8 @@ xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
 static unsigned int
 xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
 {
-	const struct nf_nat_range *range = par->targinfo;
+	const struct nf_nat_range *range_v1 = par->targinfo;
+	struct nf_nat_range2 range;
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct;
 
@@ -95,13 +97,49 @@ xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
 		 (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
 		  ctinfo == IP_CT_RELATED_REPLY)));
 
-	return nf_nat_setup_info(ct, range, NF_NAT_MANIP_SRC);
+	memcpy(&range, range_v1, sizeof(*range_v1));
+	memset(&range.base_proto, 0, sizeof(range.base_proto));
+
+	return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
 }
 
 static unsigned int
 xt_dnat_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
 {
-	const struct nf_nat_range *range = par->targinfo;
+	const struct nf_nat_range *range_v1 = par->targinfo;
+	struct nf_nat_range2 range;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	WARN_ON(!(ct != NULL &&
+		 (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)));
+
+	memcpy(&range, range_v1, sizeof(*range_v1));
+	memset(&range.base_proto, 0, sizeof(range.base_proto));
+
+	return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
+}
+
+static unsigned int
+xt_snat_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct nf_nat_range2 *range = par->targinfo;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	WARN_ON(!(ct != NULL &&
+		 (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
+		  ctinfo == IP_CT_RELATED_REPLY)));
+
+	return nf_nat_setup_info(ct, range, NF_NAT_MANIP_SRC);
+}
+
+static unsigned int
+xt_dnat_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct nf_nat_range2 *range = par->targinfo;
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct;
 
@@ -163,6 +201,28 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = {
 				  (1 << NF_INET_LOCAL_OUT),
 		.me		= THIS_MODULE,
 	},
+	{
+		.name		= "SNAT",
+		.revision	= 2,
+		.checkentry	= xt_nat_checkentry,
+		.destroy	= xt_nat_destroy,
+		.target		= xt_snat_target_v2,
+		.targetsize	= sizeof(struct nf_nat_range2),
+		.table		= "nat",
+		.hooks		= (1 << NF_INET_POST_ROUTING) |
+				  (1 << NF_INET_LOCAL_IN),
+		.me		= THIS_MODULE,
+	},
+	{
+		.name		= "DNAT",
+		.revision	= 2,
+		.target		= xt_dnat_target_v2,
+		.targetsize	= sizeof(struct nf_nat_range2),
+		.table		= "nat",
+		.hooks		= (1 << NF_INET_PRE_ROUTING) |
+				  (1 << NF_INET_LOCAL_OUT),
+		.me		= THIS_MODULE,
+	},
 };
 
 static int __init xt_nat_init(void)
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index c5904f629091..02fc343feb66 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -72,7 +72,7 @@ struct ovs_conntrack_info {
 	struct md_mark mark;
 	struct md_labels labels;
 #ifdef CONFIG_NF_NAT_NEEDED
-	struct nf_nat_range range;  /* Only present for SRC NAT and DST NAT. */
+	struct nf_nat_range2 range;  /* Only present for SRC NAT and DST NAT. */
 #endif
 };
 
@@ -710,7 +710,7 @@ static bool skb_nfct_cached(struct net *net,
  */
 static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
 			      enum ip_conntrack_info ctinfo,
-			      const struct nf_nat_range *range,
+			      const struct nf_nat_range2 *range,
 			      enum nf_nat_manip_type maniptype)
 {
 	int hooknum, nh_off, err = NF_ACCEPT;
-- 
cgit v1.2.3-59-g8ed1b


From 35341a61597ea160b0b8d8a91052c62eb4be8dba Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Sun, 8 Apr 2018 23:59:34 +0900
Subject: netfilter: add __exit mark to helper modules

There are no __exit mark in the helper modules.
because these exit functions used to be called by init function
but now that is not. so we can add __exit mark.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_ftp.c  | 3 +--
 net/netfilter/nf_conntrack_irc.c  | 6 +-----
 net/netfilter/nf_conntrack_sane.c | 3 +--
 net/netfilter/nf_conntrack_sip.c  | 2 +-
 net/netfilter/nf_conntrack_tftp.c | 2 +-
 5 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index f0e9a7511e1a..a11c304fb771 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -566,8 +566,7 @@ static const struct nf_conntrack_expect_policy ftp_exp_policy = {
 	.timeout	= 5 * 60,
 };
 
-/* don't make this __exit, since it's called from __init ! */
-static void nf_conntrack_ftp_fini(void)
+static void __exit nf_conntrack_ftp_fini(void)
 {
 	nf_conntrack_helpers_unregister(ftp, ports_c * 2);
 	kfree(ftp_buffer);
diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c
index 5523acce9d69..4099f4d79bae 100644
--- a/net/netfilter/nf_conntrack_irc.c
+++ b/net/netfilter/nf_conntrack_irc.c
@@ -232,8 +232,6 @@ static int help(struct sk_buff *skb, unsigned int protoff,
 static struct nf_conntrack_helper irc[MAX_PORTS] __read_mostly;
 static struct nf_conntrack_expect_policy irc_exp_policy;
 
-static void nf_conntrack_irc_fini(void);
-
 static int __init nf_conntrack_irc_init(void)
 {
 	int i, ret;
@@ -276,9 +274,7 @@ static int __init nf_conntrack_irc_init(void)
 	return 0;
 }
 
-/* This function is intentionally _NOT_ defined as __exit, because
- * it is needed by the init function */
-static void nf_conntrack_irc_fini(void)
+static void __exit nf_conntrack_irc_fini(void)
 {
 	nf_conntrack_helpers_unregister(irc, ports_c);
 	kfree(irc_buffer);
diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c
index ae457f39d5ce..5072ff96ab33 100644
--- a/net/netfilter/nf_conntrack_sane.c
+++ b/net/netfilter/nf_conntrack_sane.c
@@ -173,8 +173,7 @@ static const struct nf_conntrack_expect_policy sane_exp_policy = {
 	.timeout	= 5 * 60,
 };
 
-/* don't make this __exit, since it's called from __init ! */
-static void nf_conntrack_sane_fini(void)
+static void __exit nf_conntrack_sane_fini(void)
 {
 	nf_conntrack_helpers_unregister(sane, ports_c * 2);
 	kfree(sane_buffer);
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 4dbb5bad4363..148ce1a52cc7 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -1609,7 +1609,7 @@ static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1
 	},
 };
 
-static void nf_conntrack_sip_fini(void)
+static void __exit nf_conntrack_sip_fini(void)
 {
 	nf_conntrack_helpers_unregister(sip, ports_c * 4);
 }
diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c
index 0ec6779fd5d9..548b673b3625 100644
--- a/net/netfilter/nf_conntrack_tftp.c
+++ b/net/netfilter/nf_conntrack_tftp.c
@@ -104,7 +104,7 @@ static const struct nf_conntrack_expect_policy tftp_exp_policy = {
 	.timeout	= 5 * 60,
 };
 
-static void nf_conntrack_tftp_fini(void)
+static void __exit nf_conntrack_tftp_fini(void)
 {
 	nf_conntrack_helpers_unregister(tftp, ports_c * 2);
 }
-- 
cgit v1.2.3-59-g8ed1b


From e4de6ead16fe607082af87ecd4cce7f9690ea632 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Mon, 9 Apr 2018 00:00:21 +0900
Subject: netfilter: ebtables: add ebt_free_table_info function

A ebt_free_table_info frees all of chainstacks.
It similar to xt_free_table_info. this inline function
reduces code line.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/ebtables.c | 39 +++++++++++++++------------------------
 1 file changed, 15 insertions(+), 24 deletions(-)

diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 032e0fe45940..355410b13316 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -343,6 +343,16 @@ find_table_lock(struct net *net, const char *name, int *error,
 				"ebtable_", error, mutex);
 }
 
+static inline void ebt_free_table_info(struct ebt_table_info *info)
+{
+	int i;
+
+	if (info->chainstack) {
+		for_each_possible_cpu(i)
+			vfree(info->chainstack[i]);
+		vfree(info->chainstack);
+	}
+}
 static inline int
 ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par,
 		unsigned int *cnt)
@@ -975,7 +985,7 @@ static void get_counters(const struct ebt_counter *oldcounters,
 static int do_replace_finish(struct net *net, struct ebt_replace *repl,
 			      struct ebt_table_info *newinfo)
 {
-	int ret, i;
+	int ret;
 	struct ebt_counter *counterstmp = NULL;
 	/* used to be able to unlock earlier */
 	struct ebt_table_info *table;
@@ -1051,13 +1061,8 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl,
 			  ebt_cleanup_entry, net, NULL);
 
 	vfree(table->entries);
-	if (table->chainstack) {
-		for_each_possible_cpu(i)
-			vfree(table->chainstack[i]);
-		vfree(table->chainstack);
-	}
+	ebt_free_table_info(table);
 	vfree(table);
-
 	vfree(counterstmp);
 
 #ifdef CONFIG_AUDIT
@@ -1078,11 +1083,7 @@ free_iterate:
 free_counterstmp:
 	vfree(counterstmp);
 	/* can be initialized in translate_table() */
-	if (newinfo->chainstack) {
-		for_each_possible_cpu(i)
-			vfree(newinfo->chainstack[i]);
-		vfree(newinfo->chainstack);
-	}
+	ebt_free_table_info(newinfo);
 	return ret;
 }
 
@@ -1147,8 +1148,6 @@ free_newinfo:
 
 static void __ebt_unregister_table(struct net *net, struct ebt_table *table)
 {
-	int i;
-
 	mutex_lock(&ebt_mutex);
 	list_del(&table->list);
 	mutex_unlock(&ebt_mutex);
@@ -1157,11 +1156,7 @@ static void __ebt_unregister_table(struct net *net, struct ebt_table *table)
 	if (table->private->nentries)
 		module_put(table->me);
 	vfree(table->private->entries);
-	if (table->private->chainstack) {
-		for_each_possible_cpu(i)
-			vfree(table->private->chainstack[i]);
-		vfree(table->private->chainstack);
-	}
+	ebt_free_table_info(table->private);
 	vfree(table->private);
 	kfree(table);
 }
@@ -1263,11 +1258,7 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table,
 free_unlock:
 	mutex_unlock(&ebt_mutex);
 free_chainstack:
-	if (newinfo->chainstack) {
-		for_each_possible_cpu(i)
-			vfree(newinfo->chainstack[i]);
-		vfree(newinfo->chainstack);
-	}
+	ebt_free_table_info(newinfo);
 	vfree(newinfo->entries);
 free_newinfo:
 	vfree(newinfo);
-- 
cgit v1.2.3-59-g8ed1b


From cd9a5a15808403a7895c51b1378168d6c75cf8a6 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Mon, 9 Apr 2018 00:00:57 +0900
Subject: netfilter: ebtables: remove EBT_MATCH and EBT_NOMATCH

EBT_MATCH and EBT_NOMATCH are used to change return value.
match functions(ebt_xxx.c) return false when received frame is not matched
and returns true when received frame is matched.
but, EBT_MATCH_ITERATE understands oppositely.
so, to change return value, EBT_MATCH and EBT_NOMATCH are used.
but, we can use operation '!' simply.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_bridge/ebtables.h | 4 ----
 net/bridge/netfilter/ebtables.c           | 2 +-
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h
index 0773b5a032f1..c6935be7c6ca 100644
--- a/include/linux/netfilter_bridge/ebtables.h
+++ b/include/linux/netfilter_bridge/ebtables.h
@@ -17,10 +17,6 @@
 #include <linux/if_ether.h>
 #include <uapi/linux/netfilter_bridge/ebtables.h>
 
-/* return values for match() functions */
-#define EBT_MATCH 0
-#define EBT_NOMATCH 1
-
 struct ebt_match {
 	struct list_head list;
 	const char name[EBT_FUNCTION_MAXNAMELEN];
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 355410b13316..7c07221369c0 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -101,7 +101,7 @@ ebt_do_match(struct ebt_entry_match *m, const struct sk_buff *skb,
 {
 	par->match     = m->u.match;
 	par->matchinfo = m->data;
-	return m->u.match->match(skb, par) ? EBT_MATCH : EBT_NOMATCH;
+	return !m->u.match->match(skb, par);
 }
 
 static inline int
-- 
cgit v1.2.3-59-g8ed1b


From 4351bef053c0335dca1b1ae140384712c0e35e41 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Mon, 9 Apr 2018 00:01:24 +0900
Subject: netfilter: x_tables: remove duplicate ip6t_get_target function call

In the check_target, ip6t_get_target is called twice.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/ip6_tables.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 65c9e1a58305..7097bbf95843 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -528,7 +528,6 @@ static int check_target(struct ip6t_entry *e, struct net *net, const char *name)
 		.family    = NFPROTO_IPV6,
 	};
 
-	t = ip6t_get_target(e);
 	return xt_check_target(&par, t->u.target_size - sizeof(*t),
 			       e->ipv6.proto,
 			       e->ipv6.invflags & IP6T_INV_PROTO);
-- 
cgit v1.2.3-59-g8ed1b


From a1d768f1a00db556e2aae9f92bdb38671e601da5 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Fri, 13 Apr 2018 23:09:58 +0900
Subject: netfilter: ebtables: add ebt_get_target and ebt_get_target_c

ebt_get_target similar to {ip/ip6/arp}t_get_target.
and ebt_get_target_c similar to {ip/ip6/arp}t_get_target_c.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter_bridge/ebtables.h |  6 ++++++
 net/bridge/netfilter/ebtables.c                | 22 +++++++++++++---------
 2 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/include/uapi/linux/netfilter_bridge/ebtables.h b/include/uapi/linux/netfilter_bridge/ebtables.h
index 0c7dc8315013..3b86c14ea49d 100644
--- a/include/uapi/linux/netfilter_bridge/ebtables.h
+++ b/include/uapi/linux/netfilter_bridge/ebtables.h
@@ -191,6 +191,12 @@ struct ebt_entry {
 	unsigned char elems[0] __attribute__ ((aligned (__alignof__(struct ebt_replace))));
 };
 
+static __inline__ struct ebt_entry_target *
+ebt_get_target(struct ebt_entry *e)
+{
+	return (void *)e + e->target_offset;
+}
+
 /* {g,s}etsockopt numbers */
 #define EBT_BASE_CTL            128
 
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 7c07221369c0..9be240129448 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -177,6 +177,12 @@ struct ebt_entry *ebt_next_entry(const struct ebt_entry *entry)
 	return (void *)entry + entry->next_offset;
 }
 
+static inline const struct ebt_entry_target *
+ebt_get_target_c(const struct ebt_entry *e)
+{
+	return ebt_get_target((struct ebt_entry *)e);
+}
+
 /* Do some firewalling */
 unsigned int ebt_do_table(struct sk_buff *skb,
 			  const struct nf_hook_state *state,
@@ -230,8 +236,7 @@ unsigned int ebt_do_table(struct sk_buff *skb,
 		 */
 		EBT_WATCHER_ITERATE(point, ebt_do_watcher, skb, &acpar);
 
-		t = (struct ebt_entry_target *)
-		   (((char *)point) + point->target_offset);
+		t = ebt_get_target_c(point);
 		/* standard target */
 		if (!t->u.target->target)
 			verdict = ((struct ebt_standard_target *)t)->verdict;
@@ -637,7 +642,7 @@ ebt_cleanup_entry(struct ebt_entry *e, struct net *net, unsigned int *cnt)
 		return 1;
 	EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, net, NULL);
 	EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, NULL);
-	t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
+	t = ebt_get_target(e);
 
 	par.net      = net;
 	par.target   = t->u.target;
@@ -716,7 +721,7 @@ ebt_check_entry(struct ebt_entry *e, struct net *net,
 	ret = EBT_WATCHER_ITERATE(e, ebt_check_watcher, &tgpar, &j);
 	if (ret != 0)
 		goto cleanup_watchers;
-	t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
+	t = ebt_get_target(e);
 	gap = e->next_offset - e->target_offset;
 
 	target = xt_request_find_target(NFPROTO_BRIDGE, t->u.name, 0);
@@ -789,8 +794,7 @@ static int check_chainloops(const struct ebt_entries *chain, struct ebt_cl_stack
 			if (pos == nentries)
 				continue;
 		}
-		t = (struct ebt_entry_target *)
-		   (((char *)e) + e->target_offset);
+		t = ebt_get_target_c(e);
 		if (strcmp(t->u.name, EBT_STANDARD_TARGET))
 			goto letscontinue;
 		if (e->target_offset + sizeof(struct ebt_standard_target) >
@@ -1396,7 +1400,7 @@ static inline int ebt_entry_to_user(struct ebt_entry *e, const char *base,
 		return -EFAULT;
 
 	hlp = ubase + (((char *)e + e->target_offset) - base);
-	t = (struct ebt_entry_target *)(((char *)e) + e->target_offset);
+	t = ebt_get_target_c(e);
 
 	ret = EBT_MATCH_ITERATE(e, ebt_match_to_user, base, ubase);
 	if (ret != 0)
@@ -1737,7 +1741,7 @@ static int compat_copy_entry_to_user(struct ebt_entry *e, void __user **dstptr,
 		return ret;
 	target_offset = e->target_offset - (origsize - *size);
 
-	t = (struct ebt_entry_target *) ((char *) e + e->target_offset);
+	t = ebt_get_target(e);
 
 	ret = compat_target_to_user(t, dstptr, size);
 	if (ret)
@@ -1785,7 +1789,7 @@ static int compat_calc_entry(const struct ebt_entry *e,
 	EBT_MATCH_ITERATE(e, compat_calc_match, &off);
 	EBT_WATCHER_ITERATE(e, compat_calc_watcher, &off);
 
-	t = (const struct ebt_entry_target *) ((char *) e + e->target_offset);
+	t = ebt_get_target_c(e);
 
 	off += xt_compat_target_offset(t->u.target);
 	off += ebt_compat_entry_padsize();
-- 
cgit v1.2.3-59-g8ed1b


From dc3c09d327220db44dce8664584c31f068c53a4a Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Fri, 13 Apr 2018 23:10:20 +0900
Subject: netfilter: xtables: use ipt_get_target_c instead of ipt_get_target

ipt_get_target is used to get struct xt_entry_target
and ipt_get_target_c is used to get const struct xt_entry_target.
However in the ipt_do_table, ipt_get_target is used to get
const struct xt_entry_target. it should be replaced by ipt_get_target_c.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ip_tables.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 44b308d93ec2..444f125f3974 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -300,7 +300,7 @@ ipt_do_table(struct sk_buff *skb,
 		counter = xt_get_this_cpu_counter(&e->counters);
 		ADD_COUNTER(*counter, skb->len, 1);
 
-		t = ipt_get_target(e);
+		t = ipt_get_target_c(e);
 		WARN_ON(!t->u.kernel.target);
 
 #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
-- 
cgit v1.2.3-59-g8ed1b


From 8e1102d5a1596dca10f51e3de800809944f8816d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 16 Apr 2018 18:04:49 +0200
Subject: netfilter: nf_tables: support timeouts larger than 23 days

Marco De Benedetto says:
 I would like to use a timeout of 30 days for elements in a set but it
 seems there is a some kind of problem above 24d20h31m23s.

Fix this by using 'jiffies64' for timeout handling to get same behaviour
on 32 and 64bit systems.

nftables passes timeouts as u64 in milliseconds to the kernel,
but on kernel side we used a mixture of 'long' and jiffies conversions
rather than u64 and jiffies64.

Bugzilla: https://bugzilla.netfilter.org/show_bug.cgi?id=1237
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  4 ++--
 net/netfilter/nf_tables_api.c     | 50 +++++++++++++++++++++++++++++----------
 2 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index de77d36e36b3..435c9e3b9181 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -585,7 +585,7 @@ static inline u64 *nft_set_ext_timeout(const struct nft_set_ext *ext)
 	return nft_set_ext(ext, NFT_SET_EXT_TIMEOUT);
 }
 
-static inline unsigned long *nft_set_ext_expiration(const struct nft_set_ext *ext)
+static inline u64 *nft_set_ext_expiration(const struct nft_set_ext *ext)
 {
 	return nft_set_ext(ext, NFT_SET_EXT_EXPIRATION);
 }
@@ -603,7 +603,7 @@ static inline struct nft_expr *nft_set_ext_expr(const struct nft_set_ext *ext)
 static inline bool nft_set_elem_expired(const struct nft_set_ext *ext)
 {
 	return nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION) &&
-	       time_is_before_eq_jiffies(*nft_set_ext_expiration(ext));
+	       time_is_before_eq_jiffies64(*nft_set_ext_expiration(ext));
 }
 
 static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 9ce35acf491d..d57aeea89a79 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2779,6 +2779,27 @@ cont:
 	return 0;
 }
 
+static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result)
+{
+	u64 ms = be64_to_cpu(nla_get_be64(nla));
+	u64 max = (u64)(~((u64)0));
+
+	max = div_u64(max, NSEC_PER_MSEC);
+	if (ms >= max)
+		return -ERANGE;
+
+	ms *= NSEC_PER_MSEC;
+	*result = nsecs_to_jiffies64(ms);
+	return 0;
+}
+
+static u64 nf_jiffies64_to_msecs(u64 input)
+{
+	u64 ms = jiffies64_to_nsecs(input);
+
+	return cpu_to_be64(div_u64(ms, NSEC_PER_MSEC));
+}
+
 static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
 			      const struct nft_set *set, u16 event, u16 flags)
 {
@@ -2826,7 +2847,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
 
 	if (set->timeout &&
 	    nla_put_be64(skb, NFTA_SET_TIMEOUT,
-			 cpu_to_be64(jiffies_to_msecs(set->timeout)),
+			 nf_jiffies64_to_msecs(set->timeout),
 			 NFTA_SET_PAD))
 		goto nla_put_failure;
 	if (set->gc_int &&
@@ -3122,8 +3143,10 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 	if (nla[NFTA_SET_TIMEOUT] != NULL) {
 		if (!(flags & NFT_SET_TIMEOUT))
 			return -EINVAL;
-		timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64(
-						nla[NFTA_SET_TIMEOUT])));
+
+		err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &timeout);
+		if (err)
+			return err;
 	}
 	gc_int = 0;
 	if (nla[NFTA_SET_GC_INTERVAL] != NULL) {
@@ -3387,8 +3410,8 @@ const struct nft_set_ext_type nft_set_ext_types[] = {
 		.align	= __alignof__(u64),
 	},
 	[NFT_SET_EXT_EXPIRATION]	= {
-		.len	= sizeof(unsigned long),
-		.align	= __alignof__(unsigned long),
+		.len	= sizeof(u64),
+		.align	= __alignof__(u64),
 	},
 	[NFT_SET_EXT_USERDATA]		= {
 		.len	= sizeof(struct nft_userdata),
@@ -3481,22 +3504,21 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
 
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) &&
 	    nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT,
-			 cpu_to_be64(jiffies_to_msecs(
-						*nft_set_ext_timeout(ext))),
+			 nf_jiffies64_to_msecs(*nft_set_ext_timeout(ext)),
 			 NFTA_SET_ELEM_PAD))
 		goto nla_put_failure;
 
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
-		unsigned long expires, now = jiffies;
+		u64 expires, now = get_jiffies_64();
 
 		expires = *nft_set_ext_expiration(ext);
-		if (time_before(now, expires))
+		if (time_before64(now, expires))
 			expires -= now;
 		else
 			expires = 0;
 
 		if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION,
-				 cpu_to_be64(jiffies_to_msecs(expires)),
+				 nf_jiffies64_to_msecs(expires),
 				 NFTA_SET_ELEM_PAD))
 			goto nla_put_failure;
 	}
@@ -3871,7 +3893,7 @@ void *nft_set_elem_init(const struct nft_set *set,
 		memcpy(nft_set_ext_data(ext), data, set->dlen);
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION))
 		*nft_set_ext_expiration(ext) =
-			jiffies + timeout;
+			get_jiffies_64() + timeout;
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT))
 		*nft_set_ext_timeout(ext) = timeout;
 
@@ -3958,8 +3980,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) {
 		if (!(set->flags & NFT_SET_TIMEOUT))
 			return -EINVAL;
-		timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64(
-					nla[NFTA_SET_ELEM_TIMEOUT])));
+		err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_TIMEOUT],
+					    &timeout);
+		if (err)
+			return err;
 	} else if (set->flags & NFT_SET_TIMEOUT) {
 		timeout = set->timeout;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 99a0efbeeb83482893f7d5df343a2d2eb591933d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 16 Apr 2018 18:52:58 +0200
Subject: netfilter: nf_tables: always use an upper set size for dynsets

nft rejects rules that lack a timeout and a size limit when they're used
to add elements from packet path.

Pick a sane upperlimit instead of rejecting outright.
The upperlimit is visible to userspace, just as if it would have been
given during set declaration.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_dynset.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 04863fad05dd..5cc3509659c6 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -36,7 +36,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr,
 	u64 timeout;
 	void *elem;
 
-	if (set->size && !atomic_add_unless(&set->nelems, 1, set->size))
+	if (!atomic_add_unless(&set->nelems, 1, set->size))
 		return NULL;
 
 	timeout = priv->timeout ? : set->timeout;
@@ -216,6 +216,9 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
 	if (err < 0)
 		goto err1;
 
+	if (set->size == 0)
+		set->size = 0xffff;
+
 	priv->set = set;
 	return 0;
 
-- 
cgit v1.2.3-59-g8ed1b


From bd2bbdb497dba24b9ca7f6257c83e496c64b6e9d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 16 Apr 2018 19:15:53 +0200
Subject: netfilter: merge meta_bridge into nft_meta

It overcomplicates things for no reason.
nft_meta_bridge only offers retrieval of bridge port interface name.

Because of this being its own module, we had to export all nft_meta
functions, which we can then make static again (which even reduces
the size of nft_meta -- including bridge port retrieval...):

before:
   text    data     bss     dec     hex filename
   1838     832       0    2670     a6e net/bridge/netfilter/nft_meta_bridge.ko
   6147     936       1    7084    1bac net/netfilter/nft_meta.ko

after:
   5826     936       1    6763    1a6b net/netfilter/nft_meta.ko

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nft_meta.h       |  44 -----------
 net/bridge/netfilter/Kconfig           |   7 --
 net/bridge/netfilter/Makefile          |   1 -
 net/bridge/netfilter/nft_meta_bridge.c | 135 ---------------------------------
 net/netfilter/nft_meta.c               |  90 ++++++++++++++--------
 5 files changed, 58 insertions(+), 219 deletions(-)
 delete mode 100644 include/net/netfilter/nft_meta.h
 delete mode 100644 net/bridge/netfilter/nft_meta_bridge.c

diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h
deleted file mode 100644
index 5c69e9b09388..000000000000
--- a/include/net/netfilter/nft_meta.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _NFT_META_H_
-#define _NFT_META_H_
-
-struct nft_meta {
-	enum nft_meta_keys	key:8;
-	union {
-		enum nft_registers	dreg:8;
-		enum nft_registers	sreg:8;
-	};
-};
-
-extern const struct nla_policy nft_meta_policy[];
-
-int nft_meta_get_init(const struct nft_ctx *ctx,
-		      const struct nft_expr *expr,
-		      const struct nlattr * const tb[]);
-
-int nft_meta_set_init(const struct nft_ctx *ctx,
-		      const struct nft_expr *expr,
-		      const struct nlattr * const tb[]);
-
-int nft_meta_get_dump(struct sk_buff *skb,
-		      const struct nft_expr *expr);
-
-int nft_meta_set_dump(struct sk_buff *skb,
-		      const struct nft_expr *expr);
-
-void nft_meta_get_eval(const struct nft_expr *expr,
-		       struct nft_regs *regs,
-		       const struct nft_pktinfo *pkt);
-
-void nft_meta_set_eval(const struct nft_expr *expr,
-		       struct nft_regs *regs,
-		       const struct nft_pktinfo *pkt);
-
-void nft_meta_set_destroy(const struct nft_ctx *ctx,
-			  const struct nft_expr *expr);
-
-int nft_meta_set_validate(const struct nft_ctx *ctx,
-			  const struct nft_expr *expr,
-			  const struct nft_data **data);
-
-#endif
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index f212447794bd..9a0159aebe1a 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -8,13 +8,6 @@ menuconfig NF_TABLES_BRIDGE
 	bool "Ethernet Bridge nf_tables support"
 
 if NF_TABLES_BRIDGE
-
-config NFT_BRIDGE_META
-	tristate "Netfilter nf_table bridge meta support"
-	depends on NFT_META
-	help
-	  Add support for bridge dedicated meta key.
-
 config NFT_BRIDGE_REJECT
 	tristate "Netfilter nf_tables bridge reject support"
 	depends on NFT_REJECT && NFT_REJECT_IPV4 && NFT_REJECT_IPV6
diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile
index 4bc758dd4a8c..9b868861f21a 100644
--- a/net/bridge/netfilter/Makefile
+++ b/net/bridge/netfilter/Makefile
@@ -3,7 +3,6 @@
 # Makefile for the netfilter modules for Link Layer filtering on a bridge.
 #
 
-obj-$(CONFIG_NFT_BRIDGE_META)  += nft_meta_bridge.o
 obj-$(CONFIG_NFT_BRIDGE_REJECT)  += nft_reject_bridge.o
 
 # packet logging
diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c
deleted file mode 100644
index bb63c9aed55d..000000000000
--- a/net/bridge/netfilter/nft_meta_bridge.c
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2014 Intel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/netlink.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nft_meta.h>
-
-#include "../br_private.h"
-
-static void nft_meta_bridge_get_eval(const struct nft_expr *expr,
-				     struct nft_regs *regs,
-				     const struct nft_pktinfo *pkt)
-{
-	const struct nft_meta *priv = nft_expr_priv(expr);
-	const struct net_device *in = nft_in(pkt), *out = nft_out(pkt);
-	u32 *dest = &regs->data[priv->dreg];
-	const struct net_bridge_port *p;
-
-	switch (priv->key) {
-	case NFT_META_BRI_IIFNAME:
-		if (in == NULL || (p = br_port_get_rcu(in)) == NULL)
-			goto err;
-		break;
-	case NFT_META_BRI_OIFNAME:
-		if (out == NULL || (p = br_port_get_rcu(out)) == NULL)
-			goto err;
-		break;
-	default:
-		goto out;
-	}
-
-	strncpy((char *)dest, p->br->dev->name, IFNAMSIZ);
-	return;
-out:
-	return nft_meta_get_eval(expr, regs, pkt);
-err:
-	regs->verdict.code = NFT_BREAK;
-}
-
-static int nft_meta_bridge_get_init(const struct nft_ctx *ctx,
-				    const struct nft_expr *expr,
-				    const struct nlattr * const tb[])
-{
-	struct nft_meta *priv = nft_expr_priv(expr);
-	unsigned int len;
-
-	priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY]));
-	switch (priv->key) {
-	case NFT_META_BRI_IIFNAME:
-	case NFT_META_BRI_OIFNAME:
-		len = IFNAMSIZ;
-		break;
-	default:
-		return nft_meta_get_init(ctx, expr, tb);
-	}
-
-	priv->dreg = nft_parse_register(tb[NFTA_META_DREG]);
-	return nft_validate_register_store(ctx, priv->dreg, NULL,
-					   NFT_DATA_VALUE, len);
-}
-
-static struct nft_expr_type nft_meta_bridge_type;
-static const struct nft_expr_ops nft_meta_bridge_get_ops = {
-	.type		= &nft_meta_bridge_type,
-	.size		= NFT_EXPR_SIZE(sizeof(struct nft_meta)),
-	.eval		= nft_meta_bridge_get_eval,
-	.init		= nft_meta_bridge_get_init,
-	.dump		= nft_meta_get_dump,
-};
-
-static const struct nft_expr_ops nft_meta_bridge_set_ops = {
-	.type		= &nft_meta_bridge_type,
-	.size		= NFT_EXPR_SIZE(sizeof(struct nft_meta)),
-	.eval		= nft_meta_set_eval,
-	.init		= nft_meta_set_init,
-	.destroy	= nft_meta_set_destroy,
-	.dump		= nft_meta_set_dump,
-	.validate	= nft_meta_set_validate,
-};
-
-static const struct nft_expr_ops *
-nft_meta_bridge_select_ops(const struct nft_ctx *ctx,
-			   const struct nlattr * const tb[])
-{
-	if (tb[NFTA_META_KEY] == NULL)
-		return ERR_PTR(-EINVAL);
-
-	if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG])
-		return ERR_PTR(-EINVAL);
-
-	if (tb[NFTA_META_DREG])
-		return &nft_meta_bridge_get_ops;
-
-	if (tb[NFTA_META_SREG])
-		return &nft_meta_bridge_set_ops;
-
-	return ERR_PTR(-EINVAL);
-}
-
-static struct nft_expr_type nft_meta_bridge_type __read_mostly = {
-	.family         = NFPROTO_BRIDGE,
-	.name           = "meta",
-	.select_ops     = nft_meta_bridge_select_ops,
-	.policy         = nft_meta_policy,
-	.maxattr        = NFTA_META_MAX,
-	.owner          = THIS_MODULE,
-};
-
-static int __init nft_meta_bridge_module_init(void)
-{
-	return nft_register_expr(&nft_meta_bridge_type);
-}
-
-static void __exit nft_meta_bridge_module_exit(void)
-{
-	nft_unregister_expr(&nft_meta_bridge_type);
-}
-
-module_init(nft_meta_bridge_module_init);
-module_exit(nft_meta_bridge_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>");
-MODULE_ALIAS_NFT_AF_EXPR(AF_BRIDGE, "meta");
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 8fb91940e2e7..6c0b82628117 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -1,5 +1,7 @@
 /*
  * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2014 Intel Corporation
+ * Author: Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -24,21 +26,35 @@
 #include <net/tcp_states.h> /* for TCP_TIME_WAIT */
 #include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nf_tables_core.h>
-#include <net/netfilter/nft_meta.h>
 
 #include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */
 
+struct nft_meta {
+	enum nft_meta_keys	key:8;
+	union {
+		enum nft_registers	dreg:8;
+		enum nft_registers	sreg:8;
+	};
+};
+
 static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state);
 
-void nft_meta_get_eval(const struct nft_expr *expr,
-		       struct nft_regs *regs,
-		       const struct nft_pktinfo *pkt)
+#ifdef CONFIG_NF_TABLES_BRIDGE
+#include "../bridge/br_private.h"
+#endif
+
+static void nft_meta_get_eval(const struct nft_expr *expr,
+			      struct nft_regs *regs,
+			      const struct nft_pktinfo *pkt)
 {
 	const struct nft_meta *priv = nft_expr_priv(expr);
 	const struct sk_buff *skb = pkt->skb;
 	const struct net_device *in = nft_in(pkt), *out = nft_out(pkt);
 	struct sock *sk;
 	u32 *dest = &regs->data[priv->dreg];
+#ifdef CONFIG_NF_TABLES_BRIDGE
+	const struct net_bridge_port *p;
+#endif
 
 	switch (priv->key) {
 	case NFT_META_LEN:
@@ -214,6 +230,18 @@ void nft_meta_get_eval(const struct nft_expr *expr,
 	case NFT_META_SECPATH:
 		nft_reg_store8(dest, !!skb->sp);
 		break;
+#endif
+#ifdef CONFIG_NF_TABLES_BRIDGE
+	case NFT_META_BRI_IIFNAME:
+		if (in == NULL || (p = br_port_get_rcu(in)) == NULL)
+			goto err;
+		strncpy((char *)dest, p->br->dev->name, IFNAMSIZ);
+		return;
+	case NFT_META_BRI_OIFNAME:
+		if (out == NULL || (p = br_port_get_rcu(out)) == NULL)
+			goto err;
+		strncpy((char *)dest, p->br->dev->name, IFNAMSIZ);
+		return;
 #endif
 	default:
 		WARN_ON(1);
@@ -224,11 +252,10 @@ void nft_meta_get_eval(const struct nft_expr *expr,
 err:
 	regs->verdict.code = NFT_BREAK;
 }
-EXPORT_SYMBOL_GPL(nft_meta_get_eval);
 
-void nft_meta_set_eval(const struct nft_expr *expr,
-		       struct nft_regs *regs,
-		       const struct nft_pktinfo *pkt)
+static void nft_meta_set_eval(const struct nft_expr *expr,
+			      struct nft_regs *regs,
+			       const struct nft_pktinfo *pkt)
 {
 	const struct nft_meta *meta = nft_expr_priv(expr);
 	struct sk_buff *skb = pkt->skb;
@@ -258,18 +285,16 @@ void nft_meta_set_eval(const struct nft_expr *expr,
 		WARN_ON(1);
 	}
 }
-EXPORT_SYMBOL_GPL(nft_meta_set_eval);
 
-const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = {
+static const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = {
 	[NFTA_META_DREG]	= { .type = NLA_U32 },
 	[NFTA_META_KEY]		= { .type = NLA_U32 },
 	[NFTA_META_SREG]	= { .type = NLA_U32 },
 };
-EXPORT_SYMBOL_GPL(nft_meta_policy);
 
-int nft_meta_get_init(const struct nft_ctx *ctx,
-		      const struct nft_expr *expr,
-		      const struct nlattr * const tb[])
+static int nft_meta_get_init(const struct nft_ctx *ctx,
+			     const struct nft_expr *expr,
+			     const struct nlattr * const tb[])
 {
 	struct nft_meta *priv = nft_expr_priv(expr);
 	unsigned int len;
@@ -317,6 +342,14 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
 	case NFT_META_SECPATH:
 		len = sizeof(u8);
 		break;
+#endif
+#ifdef CONFIG_NF_TABLES_BRIDGE
+	case NFT_META_BRI_IIFNAME:
+	case NFT_META_BRI_OIFNAME:
+		if (ctx->family != NFPROTO_BRIDGE)
+			return -EOPNOTSUPP;
+		len = IFNAMSIZ;
+		break;
 #endif
 	default:
 		return -EOPNOTSUPP;
@@ -326,7 +359,6 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
 	return nft_validate_register_store(ctx, priv->dreg, NULL,
 					   NFT_DATA_VALUE, len);
 }
-EXPORT_SYMBOL_GPL(nft_meta_get_init);
 
 static int nft_meta_get_validate(const struct nft_ctx *ctx,
 				 const struct nft_expr *expr,
@@ -360,9 +392,9 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx,
 #endif
 }
 
-int nft_meta_set_validate(const struct nft_ctx *ctx,
-			  const struct nft_expr *expr,
-			  const struct nft_data **data)
+static int nft_meta_set_validate(const struct nft_ctx *ctx,
+				 const struct nft_expr *expr,
+				 const struct nft_data **data)
 {
 	struct nft_meta *priv = nft_expr_priv(expr);
 	unsigned int hooks;
@@ -388,11 +420,10 @@ int nft_meta_set_validate(const struct nft_ctx *ctx,
 
 	return nft_chain_validate_hooks(ctx->chain, hooks);
 }
-EXPORT_SYMBOL_GPL(nft_meta_set_validate);
 
-int nft_meta_set_init(const struct nft_ctx *ctx,
-		      const struct nft_expr *expr,
-		      const struct nlattr * const tb[])
+static int nft_meta_set_init(const struct nft_ctx *ctx,
+			     const struct nft_expr *expr,
+			     const struct nlattr * const tb[])
 {
 	struct nft_meta *priv = nft_expr_priv(expr);
 	unsigned int len;
@@ -424,10 +455,9 @@ int nft_meta_set_init(const struct nft_ctx *ctx,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(nft_meta_set_init);
 
-int nft_meta_get_dump(struct sk_buff *skb,
-		      const struct nft_expr *expr)
+static int nft_meta_get_dump(struct sk_buff *skb,
+			     const struct nft_expr *expr)
 {
 	const struct nft_meta *priv = nft_expr_priv(expr);
 
@@ -440,10 +470,8 @@ int nft_meta_get_dump(struct sk_buff *skb,
 nla_put_failure:
 	return -1;
 }
-EXPORT_SYMBOL_GPL(nft_meta_get_dump);
 
-int nft_meta_set_dump(struct sk_buff *skb,
-		      const struct nft_expr *expr)
+static int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
 {
 	const struct nft_meta *priv = nft_expr_priv(expr);
 
@@ -457,17 +485,15 @@ int nft_meta_set_dump(struct sk_buff *skb,
 nla_put_failure:
 	return -1;
 }
-EXPORT_SYMBOL_GPL(nft_meta_set_dump);
 
-void nft_meta_set_destroy(const struct nft_ctx *ctx,
-			  const struct nft_expr *expr)
+static void nft_meta_set_destroy(const struct nft_ctx *ctx,
+				 const struct nft_expr *expr)
 {
 	const struct nft_meta *priv = nft_expr_priv(expr);
 
 	if (priv->key == NFT_META_NFTRACE)
 		static_branch_dec(&nft_trace_enabled);
 }
-EXPORT_SYMBOL_GPL(nft_meta_set_destroy);
 
 static struct nft_expr_type nft_meta_type;
 static const struct nft_expr_ops nft_meta_get_ops = {
-- 
cgit v1.2.3-59-g8ed1b


From db688c24eada63b1efe6d0d7d835e5c3bdd71fd3 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 24 Apr 2018 10:34:36 +0200
Subject: vhost_net: use packet weight for rx handler, too

Similar to commit a2ac99905f1e ("vhost-net: set packet weight of
tx polling to 2 * vq size"), we need a packet-based limit for
handler_rx, too - elsewhere, under rx flood with small packets,
tx can be delayed for a very long time, even without busypolling.

The pkt limit applied to handle_rx must be the same applied by
handle_tx, or we will get unfair scheduling between rx and tx.
Tying such limit to the queue length makes it less effective for
large queue length values and can introduce large process
scheduler latencies, so a constant valued is used - likewise
the existing bytes limit.

The selected limit has been validated with PVP[1] performance
test with different queue sizes:

queue size		256	512	1024

baseline		366	354	362
weight 128		715	723	670
weight 256		740	745	733
weight 512		600	460	583
weight 1024		423	427	418

A packet weight of 256 gives peek performances in under all the
tested scenarios.

No measurable regression in unidirectional performance tests has
been detected.

[1] https://developers.redhat.com/blog/2017/06/05/measuring-and-comparing-open-vswitch-performance/

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/vhost/net.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index bbf38befefb2..c4b49fca4871 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -46,8 +46,10 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
 #define VHOST_NET_WEIGHT 0x80000
 
 /* Max number of packets transferred before requeueing the job.
- * Using this limit prevents one virtqueue from starving rx. */
-#define VHOST_NET_PKT_WEIGHT(vq) ((vq)->num * 2)
+ * Using this limit prevents one virtqueue from starving others with small
+ * pkts.
+ */
+#define VHOST_NET_PKT_WEIGHT 256
 
 /* MAX number of TX used buffers for outstanding zerocopy */
 #define VHOST_MAX_PEND 128
@@ -587,7 +589,7 @@ static void handle_tx(struct vhost_net *net)
 			vhost_zerocopy_signal_used(net, vq);
 		vhost_net_tx_packet(net);
 		if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
-		    unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT(vq))) {
+		    unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT)) {
 			vhost_poll_queue(&vq->poll);
 			break;
 		}
@@ -769,6 +771,7 @@ static void handle_rx(struct vhost_net *net)
 	struct socket *sock;
 	struct iov_iter fixup;
 	__virtio16 num_buffers;
+	int recv_pkts = 0;
 
 	mutex_lock_nested(&vq->mutex, 0);
 	sock = vq->private_data;
@@ -872,7 +875,8 @@ static void handle_rx(struct vhost_net *net)
 		if (unlikely(vq_log))
 			vhost_log_write(vq, vq_log, log, vhost_len);
 		total_len += vhost_len;
-		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
+		if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
+		    unlikely(++recv_pkts >= VHOST_NET_PKT_WEIGHT)) {
 			vhost_poll_queue(&vq->poll);
 			goto out;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From 026a807c2de37aa826748c2ffa1969fc778406b2 Mon Sep 17 00:00:00 2001
From: Tal Gilboa <talgi@mellanox.com>
Date: Tue, 24 Apr 2018 13:36:01 +0300
Subject: net/dim: Rename *_get_profile() functions to *_get_rx_moderation()

Preparation for introducing adaptive TX to net DIM.

Signed-off-by: Tal Gilboa <talgi@mellanox.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bcmsysport.c        |  6 +++---
 drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c     |  8 ++++----
 drivers/net/ethernet/broadcom/genet/bcmgenet.c    |  6 +++---
 drivers/net/ethernet/mellanox/mlx5/core/en_dim.c  |  6 +++---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |  6 ++++--
 include/linux/net_dim.h                           | 12 ++++++------
 6 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
index f9a3c1a76d5d..effc651a2a2f 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -654,7 +654,7 @@ static int bcm_sysport_set_coalesce(struct net_device *dev,
 	pkts = priv->rx_max_coalesced_frames;
 
 	if (ec->use_adaptive_rx_coalesce && !priv->dim.use_dim) {
-		moder = net_dim_get_def_profile(priv->dim.dim.mode);
+		moder = net_dim_get_def_rx_moderation(priv->dim.dim.mode);
 		usecs = moder.usec;
 		pkts = moder.pkts;
 	}
@@ -1064,7 +1064,7 @@ static void bcm_sysport_dim_work(struct work_struct *work)
 	struct bcm_sysport_priv *priv =
 			container_of(ndim, struct bcm_sysport_priv, dim);
 	struct net_dim_cq_moder cur_profile =
-				net_dim_get_profile(dim->mode, dim->profile_ix);
+			net_dim_get_rx_moderation(dim->mode, dim->profile_ix);
 
 	bcm_sysport_set_rx_coalesce(priv, cur_profile.usec, cur_profile.pkts);
 	dim->state = NET_DIM_START_MEASURE;
@@ -1437,7 +1437,7 @@ static void bcm_sysport_init_rx_coalesce(struct bcm_sysport_priv *priv)
 
 	/* If DIM was enabled, re-apply default parameters */
 	if (dim->use_dim) {
-		moder = net_dim_get_def_profile(dim->dim.mode);
+		moder = net_dim_get_def_rx_moderation(dim->dim.mode);
 		usecs = moder.usec;
 		pkts = moder.pkts;
 	}
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c
index 408dd190331e..afa97c8bb081 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c
@@ -21,11 +21,11 @@ void bnxt_dim_work(struct work_struct *work)
 	struct bnxt_napi *bnapi = container_of(cpr,
 					       struct bnxt_napi,
 					       cp_ring);
-	struct net_dim_cq_moder cur_profile = net_dim_get_profile(dim->mode,
-								  dim->profile_ix);
+	struct net_dim_cq_moder cur_moder =
+		net_dim_get_rx_moderation(dim->mode, dim->profile_ix);
 
-	cpr->rx_ring_coal.coal_ticks = cur_profile.usec;
-	cpr->rx_ring_coal.coal_bufs = cur_profile.pkts;
+	cpr->rx_ring_coal.coal_ticks = cur_moder.usec;
+	cpr->rx_ring_coal.coal_bufs = cur_moder.pkts;
 
 	bnxt_hwrm_set_ring_coal(bnapi->bp, bnapi);
 	dim->state = NET_DIM_START_MEASURE;
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index 0445f2c0c629..20c1681bb1af 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -652,7 +652,7 @@ static void bcmgenet_set_ring_rx_coalesce(struct bcmgenet_rx_ring *ring,
 	pkts = ring->rx_max_coalesced_frames;
 
 	if (ec->use_adaptive_rx_coalesce && !ring->dim.use_dim) {
-		moder = net_dim_get_def_profile(ring->dim.dim.mode);
+		moder = net_dim_get_def_rx_moderation(ring->dim.dim.mode);
 		usecs = moder.usec;
 		pkts = moder.pkts;
 	}
@@ -1925,7 +1925,7 @@ static void bcmgenet_dim_work(struct work_struct *work)
 	struct bcmgenet_rx_ring *ring =
 			container_of(ndim, struct bcmgenet_rx_ring, dim);
 	struct net_dim_cq_moder cur_profile =
-			net_dim_get_profile(dim->mode, dim->profile_ix);
+			net_dim_get_rx_moderation(dim->mode, dim->profile_ix);
 
 	bcmgenet_set_rx_coalesce(ring, cur_profile.usec, cur_profile.pkts);
 	dim->state = NET_DIM_START_MEASURE;
@@ -2102,7 +2102,7 @@ static void bcmgenet_init_rx_coalesce(struct bcmgenet_rx_ring *ring)
 
 	/* If DIM was enabled, re-apply default parameters */
 	if (dim->use_dim) {
-		moder = net_dim_get_def_profile(dim->dim.mode);
+		moder = net_dim_get_def_rx_moderation(dim->dim.mode);
 		usecs = moder.usec;
 		pkts = moder.pkts;
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
index 602851ab5b14..1b286e1985d2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
@@ -38,11 +38,11 @@ void mlx5e_rx_dim_work(struct work_struct *work)
 	struct net_dim *dim = container_of(work, struct net_dim,
 					   work);
 	struct mlx5e_rq *rq = container_of(dim, struct mlx5e_rq, dim);
-	struct net_dim_cq_moder cur_profile = net_dim_get_profile(dim->mode,
-								  dim->profile_ix);
+	struct net_dim_cq_moder cur_moder =
+		net_dim_get_rx_moderation(dim->mode, dim->profile_ix);
 
 	mlx5_core_modify_cq_moderation(rq->mdev, &rq->cq.mcq,
-				       cur_profile.usec, cur_profile.pkts);
+				       cur_moder.usec, cur_moder.pkts);
 
 	dim->state = NET_DIM_START_MEASURE;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index f10037419978..a69cc1cc7bfb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -4119,12 +4119,14 @@ void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode)
 		switch (cq_period_mode) {
 		case MLX5_CQ_PERIOD_MODE_START_FROM_CQE:
 			params->rx_cq_moderation =
-				net_dim_get_def_profile(NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE);
+				net_dim_get_def_rx_moderation(
+					NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE);
 			break;
 		case MLX5_CQ_PERIOD_MODE_START_FROM_EQE:
 		default:
 			params->rx_cq_moderation =
-				net_dim_get_def_profile(NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE);
+				net_dim_get_def_rx_moderation(
+					NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE);
 		}
 	}
 
diff --git a/include/linux/net_dim.h b/include/linux/net_dim.h
index 29ed8fd6379a..7ca3c4deb3a6 100644
--- a/include/linux/net_dim.h
+++ b/include/linux/net_dim.h
@@ -129,17 +129,17 @@ profile[NET_DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
 	NET_DIM_CQE_PROFILES,
 };
 
-static inline struct net_dim_cq_moder net_dim_get_profile(u8 cq_period_mode,
-							  int ix)
+static inline struct net_dim_cq_moder
+net_dim_get_rx_moderation(u8 cq_period_mode, int ix)
 {
-	struct net_dim_cq_moder cq_moder;
+	struct net_dim_cq_moder cq_moder = profile[cq_period_mode][ix];
 
-	cq_moder = profile[cq_period_mode][ix];
 	cq_moder.cq_period_mode = cq_period_mode;
 	return cq_moder;
 }
 
-static inline struct net_dim_cq_moder net_dim_get_def_profile(u8 rx_cq_period_mode)
+static inline struct net_dim_cq_moder
+net_dim_get_def_rx_moderation(u8 rx_cq_period_mode)
 {
 	int default_profile_ix;
 
@@ -148,7 +148,7 @@ static inline struct net_dim_cq_moder net_dim_get_def_profile(u8 rx_cq_period_mo
 	else /* NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE */
 		default_profile_ix = NET_DIM_DEF_PROFILE_EQE;
 
-	return net_dim_get_profile(rx_cq_period_mode, default_profile_ix);
+	return net_dim_get_rx_moderation(rx_cq_period_mode, default_profile_ix);
 }
 
 static inline bool net_dim_on_top(struct net_dim *dim)
-- 
cgit v1.2.3-59-g8ed1b


From 623ad755226c2b4fb312772bae9cdeccd8219ee3 Mon Sep 17 00:00:00 2001
From: Tal Gilboa <talgi@mellanox.com>
Date: Tue, 24 Apr 2018 13:36:02 +0300
Subject: net/dim: Support adaptive TX moderation

Interrupt moderation for TX traffic requires different profiles than RX
interrupt moderation. The main goal here is to reduce interrupt rate and
allow better payload aggregation by keeping SKBs in the TX queue a bit
longer. Ping-pong behavior would get a profile with a short timer, so
latency wouldn't increase for these scenarios. There might be a slight
degradation in bandwidth for single stream with large message sizes, since
net.ipv4.tcp_limit_output_bytes is limiting the allowed TX traffic, but
with many streams performance is always improved.

Signed-off-by: Tal Gilboa <talgi@mellanox.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net_dim.h | 63 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 50 insertions(+), 13 deletions(-)

diff --git a/include/linux/net_dim.h b/include/linux/net_dim.h
index 7ca3c4deb3a6..db99240d00bd 100644
--- a/include/linux/net_dim.h
+++ b/include/linux/net_dim.h
@@ -103,11 +103,12 @@ enum {
 #define NET_DIM_PARAMS_NUM_PROFILES 5
 /* Adaptive moderation profiles */
 #define NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE 256
+#define NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE 128
 #define NET_DIM_DEF_PROFILE_CQE 1
 #define NET_DIM_DEF_PROFILE_EQE 1
 
 /* All profiles sizes must be NET_PARAMS_DIM_NUM_PROFILES */
-#define NET_DIM_EQE_PROFILES { \
+#define NET_DIM_RX_EQE_PROFILES { \
 	{1,   NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
 	{8,   NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
 	{64,  NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
@@ -115,7 +116,7 @@ enum {
 	{256, NET_DIM_DEFAULT_RX_CQ_MODERATION_PKTS_FROM_EQE}, \
 }
 
-#define NET_DIM_CQE_PROFILES { \
+#define NET_DIM_RX_CQE_PROFILES { \
 	{2,  256},             \
 	{8,  128},             \
 	{16, 64},              \
@@ -123,32 +124,68 @@ enum {
 	{64, 64}               \
 }
 
+#define NET_DIM_TX_EQE_PROFILES { \
+	{1,   NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
+	{8,   NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
+	{32,  NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
+	{64,  NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE},  \
+	{128, NET_DIM_DEFAULT_TX_CQ_MODERATION_PKTS_FROM_EQE}   \
+}
+
+#define NET_DIM_TX_CQE_PROFILES { \
+	{5,  128},  \
+	{8,  64},  \
+	{16, 32},  \
+	{32, 32},  \
+	{64, 32}   \
+}
+
 static const struct net_dim_cq_moder
-profile[NET_DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
-	NET_DIM_EQE_PROFILES,
-	NET_DIM_CQE_PROFILES,
+rx_profile[NET_DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
+	NET_DIM_RX_EQE_PROFILES,
+	NET_DIM_RX_CQE_PROFILES,
+};
+
+static const struct net_dim_cq_moder
+tx_profile[NET_DIM_CQ_PERIOD_NUM_MODES][NET_DIM_PARAMS_NUM_PROFILES] = {
+	NET_DIM_TX_EQE_PROFILES,
+	NET_DIM_TX_CQE_PROFILES,
 };
 
 static inline struct net_dim_cq_moder
 net_dim_get_rx_moderation(u8 cq_period_mode, int ix)
 {
-	struct net_dim_cq_moder cq_moder = profile[cq_period_mode][ix];
+	struct net_dim_cq_moder cq_moder = rx_profile[cq_period_mode][ix];
 
 	cq_moder.cq_period_mode = cq_period_mode;
 	return cq_moder;
 }
 
 static inline struct net_dim_cq_moder
-net_dim_get_def_rx_moderation(u8 rx_cq_period_mode)
+net_dim_get_def_rx_moderation(u8 cq_period_mode)
+{
+	u8 profile_ix = cq_period_mode == NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE ?
+			NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE;
+
+	return net_dim_get_rx_moderation(cq_period_mode, profile_ix);
+}
+
+static inline struct net_dim_cq_moder
+net_dim_get_tx_moderation(u8 cq_period_mode, int ix)
 {
-	int default_profile_ix;
+	struct net_dim_cq_moder cq_moder = tx_profile[cq_period_mode][ix];
 
-	if (rx_cq_period_mode == NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE)
-		default_profile_ix = NET_DIM_DEF_PROFILE_CQE;
-	else /* NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE */
-		default_profile_ix = NET_DIM_DEF_PROFILE_EQE;
+	cq_moder.cq_period_mode = cq_period_mode;
+	return cq_moder;
+}
+
+static inline struct net_dim_cq_moder
+net_dim_get_def_tx_moderation(u8 cq_period_mode)
+{
+	u8 profile_ix = cq_period_mode == NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE ?
+			NET_DIM_DEF_PROFILE_CQE : NET_DIM_DEF_PROFILE_EQE;
 
-	return net_dim_get_rx_moderation(rx_cq_period_mode, default_profile_ix);
+	return net_dim_get_tx_moderation(cq_period_mode, profile_ix);
 }
 
 static inline bool net_dim_on_top(struct net_dim *dim)
-- 
cgit v1.2.3-59-g8ed1b


From cbce4f44479856cd4621e3dce531cfb078357b1f Mon Sep 17 00:00:00 2001
From: Tal Gilboa <talgi@mellanox.com>
Date: Tue, 24 Apr 2018 13:36:03 +0300
Subject: net/mlx5e: Enable adaptive-TX moderation

Add support for adaptive TX moderation. This greatly reduces TX interrupt
rate and increases bandwidth, mostly for TCP bandwidth over ARM
architecture (below). There is a slight single stream TCP with very large
message sizes degradation (x86). In this case if there's any moderation on
transmitted packets the bandwidth would reduce due to hitting TCP output limit.
Since this is a synthetic case, this is still worth doing.

Performance improvement (ConnectX-4Lx 40GbE, ARM)
TCP 64B bandwidth with 1-50 streams increased 6-35%.
TCP 64B bandwidth with 100-500 streams increased 20-70%.

Performance improvement (ConnectX-5 100GbE, x86)
Bandwidth: increased up to 40% (1024B with 10s of streams).
Interrupt rate: reduced up to 50% (1024B with 1000s of streams).

Performance degradation (ConnectX-5 100GbE, x86)
Bandwidth: up to 10% decrease single stream TCP (1MB message size from
51Gb/s to 47Gb/s).

Signed-off-by: Tal Gilboa <talgi@mellanox.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  4 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_dim.c   | 24 +++++--
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 35 +++++++---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 81 +++++++++++++---------
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c  | 37 +++++++---
 5 files changed, 125 insertions(+), 56 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 3317a4da87cb..56c748ca2a09 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -241,6 +241,7 @@ struct mlx5e_params {
 	bool vlan_strip_disable;
 	bool scatter_fcs_en;
 	bool rx_dim_enabled;
+	bool tx_dim_enabled;
 	u32 lro_timeout;
 	u32 pflags;
 	struct bpf_prog *xdp_prog;
@@ -330,6 +331,7 @@ enum {
 	MLX5E_SQ_STATE_ENABLED,
 	MLX5E_SQ_STATE_RECOVERING,
 	MLX5E_SQ_STATE_IPSEC,
+	MLX5E_SQ_STATE_AM,
 };
 
 struct mlx5e_sq_wqe_info {
@@ -342,6 +344,7 @@ struct mlx5e_txqsq {
 	/* dirtied @completion */
 	u16                        cc;
 	u32                        dma_fifo_cc;
+	struct net_dim             dim; /* Adaptive Moderation */
 
 	/* dirtied @xmit */
 	u16                        pc ____cacheline_aligned_in_smp;
@@ -1111,4 +1114,5 @@ void mlx5e_build_nic_params(struct mlx5_core_dev *mdev,
 			    u16 max_channels, u16 mtu);
 u8 mlx5e_params_calculate_tx_min_inline(struct mlx5_core_dev *mdev);
 void mlx5e_rx_dim_work(struct work_struct *work);
+void mlx5e_tx_dim_work(struct work_struct *work);
 #endif /* __MLX5_EN_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
index 1b286e1985d2..d67adf70a97b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
@@ -33,16 +33,30 @@
 #include <linux/net_dim.h>
 #include "en.h"
 
+static void
+mlx5e_complete_dim_work(struct net_dim *dim, struct net_dim_cq_moder moder,
+			struct mlx5_core_dev *mdev, struct mlx5_core_cq *mcq)
+{
+	mlx5_core_modify_cq_moderation(mdev, mcq, moder.usec, moder.pkts);
+	dim->state = NET_DIM_START_MEASURE;
+}
+
 void mlx5e_rx_dim_work(struct work_struct *work)
 {
-	struct net_dim *dim = container_of(work, struct net_dim,
-					   work);
+	struct net_dim *dim = container_of(work, struct net_dim, work);
 	struct mlx5e_rq *rq = container_of(dim, struct mlx5e_rq, dim);
 	struct net_dim_cq_moder cur_moder =
 		net_dim_get_rx_moderation(dim->mode, dim->profile_ix);
 
-	mlx5_core_modify_cq_moderation(rq->mdev, &rq->cq.mcq,
-				       cur_moder.usec, cur_moder.pkts);
+	mlx5e_complete_dim_work(dim, cur_moder, rq->mdev, &rq->cq.mcq);
+}
 
-	dim->state = NET_DIM_START_MEASURE;
+void mlx5e_tx_dim_work(struct work_struct *work)
+{
+	struct net_dim *dim = container_of(work, struct net_dim, work);
+	struct mlx5e_txqsq *sq = container_of(dim, struct mlx5e_txqsq, dim);
+	struct net_dim_cq_moder cur_moder =
+		net_dim_get_tx_moderation(dim->mode, dim->profile_ix);
+
+	mlx5e_complete_dim_work(dim, cur_moder, sq->cq.mdev, &sq->cq.mcq);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 37fd0245b6c1..2b786c4d3dab 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -389,14 +389,20 @@ static int mlx5e_set_channels(struct net_device *dev,
 int mlx5e_ethtool_get_coalesce(struct mlx5e_priv *priv,
 			       struct ethtool_coalesce *coal)
 {
+	struct net_dim_cq_moder *rx_moder, *tx_moder;
+
 	if (!MLX5_CAP_GEN(priv->mdev, cq_moderation))
 		return -EOPNOTSUPP;
 
-	coal->rx_coalesce_usecs       = priv->channels.params.rx_cq_moderation.usec;
-	coal->rx_max_coalesced_frames = priv->channels.params.rx_cq_moderation.pkts;
-	coal->tx_coalesce_usecs       = priv->channels.params.tx_cq_moderation.usec;
-	coal->tx_max_coalesced_frames = priv->channels.params.tx_cq_moderation.pkts;
-	coal->use_adaptive_rx_coalesce = priv->channels.params.rx_dim_enabled;
+	rx_moder = &priv->channels.params.rx_cq_moderation;
+	coal->rx_coalesce_usecs		= rx_moder->usec;
+	coal->rx_max_coalesced_frames	= rx_moder->pkts;
+	coal->use_adaptive_rx_coalesce	= priv->channels.params.rx_dim_enabled;
+
+	tx_moder = &priv->channels.params.tx_cq_moderation;
+	coal->tx_coalesce_usecs		= tx_moder->usec;
+	coal->tx_max_coalesced_frames	= tx_moder->pkts;
+	coal->use_adaptive_tx_coalesce	= priv->channels.params.tx_dim_enabled;
 
 	return 0;
 }
@@ -438,6 +444,7 @@ mlx5e_set_priv_channels_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesc
 int mlx5e_ethtool_set_coalesce(struct mlx5e_priv *priv,
 			       struct ethtool_coalesce *coal)
 {
+	struct net_dim_cq_moder *rx_moder, *tx_moder;
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5e_channels new_channels = {};
 	int err = 0;
@@ -463,11 +470,15 @@ int mlx5e_ethtool_set_coalesce(struct mlx5e_priv *priv,
 	mutex_lock(&priv->state_lock);
 	new_channels.params = priv->channels.params;
 
-	new_channels.params.tx_cq_moderation.usec = coal->tx_coalesce_usecs;
-	new_channels.params.tx_cq_moderation.pkts = coal->tx_max_coalesced_frames;
-	new_channels.params.rx_cq_moderation.usec = coal->rx_coalesce_usecs;
-	new_channels.params.rx_cq_moderation.pkts = coal->rx_max_coalesced_frames;
-	new_channels.params.rx_dim_enabled        = !!coal->use_adaptive_rx_coalesce;
+	rx_moder          = &new_channels.params.rx_cq_moderation;
+	rx_moder->usec    = coal->rx_coalesce_usecs;
+	rx_moder->pkts    = coal->rx_max_coalesced_frames;
+	new_channels.params.rx_dim_enabled = !!coal->use_adaptive_rx_coalesce;
+
+	tx_moder          = &new_channels.params.tx_cq_moderation;
+	tx_moder->usec    = coal->tx_coalesce_usecs;
+	tx_moder->pkts    = coal->tx_max_coalesced_frames;
+	new_channels.params.tx_dim_enabled = !!coal->use_adaptive_tx_coalesce;
 
 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
 		priv->channels.params = new_channels.params;
@@ -475,7 +486,9 @@ int mlx5e_ethtool_set_coalesce(struct mlx5e_priv *priv,
 	}
 	/* we are opened */
 
-	reset = !!coal->use_adaptive_rx_coalesce != priv->channels.params.rx_dim_enabled;
+	reset = (!!coal->use_adaptive_rx_coalesce != priv->channels.params.rx_dim_enabled) ||
+		(!!coal->use_adaptive_tx_coalesce != priv->channels.params.tx_dim_enabled);
+
 	if (!reset) {
 		mlx5e_set_priv_channels_coalesce(priv, coal);
 		priv->channels.params = new_channels.params;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index a69cc1cc7bfb..f1fe490ed794 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1025,6 +1025,9 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
 	if (err)
 		goto err_sq_wq_destroy;
 
+	INIT_WORK(&sq->dim.work, mlx5e_tx_dim_work);
+	sq->dim.mode = params->tx_cq_moderation.cq_period_mode;
+
 	sq->edge = (sq->wq.sz_m1 + 1) - MLX5_SEND_WQE_MAX_WQEBBS;
 
 	return 0;
@@ -1188,6 +1191,9 @@ static int mlx5e_open_txqsq(struct mlx5e_channel *c,
 	if (tx_rate)
 		mlx5e_set_sq_maxrate(c->netdev, sq, tx_rate);
 
+	if (params->tx_dim_enabled)
+		sq->state |= BIT(MLX5E_SQ_STATE_AM);
+
 	return 0;
 
 err_free_txqsq:
@@ -4084,18 +4090,48 @@ static bool slow_pci_heuristic(struct mlx5_core_dev *mdev)
 		link_speed > MLX5E_SLOW_PCI_RATIO * pci_bw;
 }
 
-void mlx5e_set_tx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode)
+static struct net_dim_cq_moder mlx5e_get_def_tx_moderation(u8 cq_period_mode)
 {
-	params->tx_cq_moderation.cq_period_mode = cq_period_mode;
+	struct net_dim_cq_moder moder;
+
+	moder.cq_period_mode = cq_period_mode;
+	moder.pkts = MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS;
+	moder.usec = MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC;
+	if (cq_period_mode == MLX5_CQ_PERIOD_MODE_START_FROM_CQE)
+		moder.usec = MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC_FROM_CQE;
+
+	return moder;
+}
 
-	params->tx_cq_moderation.pkts =
-		MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS;
-	params->tx_cq_moderation.usec =
-		MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC;
+static struct net_dim_cq_moder mlx5e_get_def_rx_moderation(u8 cq_period_mode)
+{
+	struct net_dim_cq_moder moder;
 
+	moder.cq_period_mode = cq_period_mode;
+	moder.pkts = MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS;
+	moder.usec = MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC;
 	if (cq_period_mode == MLX5_CQ_PERIOD_MODE_START_FROM_CQE)
-		params->tx_cq_moderation.usec =
-			MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC_FROM_CQE;
+		moder.usec = MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE;
+
+	return moder;
+}
+
+static u8 mlx5_to_net_dim_cq_period_mode(u8 cq_period_mode)
+{
+	return cq_period_mode == MLX5_CQ_PERIOD_MODE_START_FROM_CQE ?
+		NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE :
+		NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+}
+
+void mlx5e_set_tx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode)
+{
+	if (params->tx_dim_enabled) {
+		u8 dim_period_mode = mlx5_to_net_dim_cq_period_mode(cq_period_mode);
+
+		params->tx_cq_moderation = net_dim_get_def_tx_moderation(dim_period_mode);
+	} else {
+		params->tx_cq_moderation = mlx5e_get_def_tx_moderation(cq_period_mode);
+	}
 
 	MLX5E_SET_PFLAG(params, MLX5E_PFLAG_TX_CQE_BASED_MODER,
 			params->tx_cq_moderation.cq_period_mode ==
@@ -4104,30 +4140,12 @@ void mlx5e_set_tx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode)
 
 void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode)
 {
-	params->rx_cq_moderation.cq_period_mode = cq_period_mode;
-
-	params->rx_cq_moderation.pkts =
-		MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS;
-	params->rx_cq_moderation.usec =
-		MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC;
-
-	if (cq_period_mode == MLX5_CQ_PERIOD_MODE_START_FROM_CQE)
-		params->rx_cq_moderation.usec =
-			MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE;
-
 	if (params->rx_dim_enabled) {
-		switch (cq_period_mode) {
-		case MLX5_CQ_PERIOD_MODE_START_FROM_CQE:
-			params->rx_cq_moderation =
-				net_dim_get_def_rx_moderation(
-					NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE);
-			break;
-		case MLX5_CQ_PERIOD_MODE_START_FROM_EQE:
-		default:
-			params->rx_cq_moderation =
-				net_dim_get_def_rx_moderation(
-					NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE);
-		}
+		u8 dim_period_mode = mlx5_to_net_dim_cq_period_mode(cq_period_mode);
+
+		params->rx_cq_moderation = net_dim_get_def_rx_moderation(dim_period_mode);
+	} else {
+		params->rx_cq_moderation = mlx5e_get_def_rx_moderation(cq_period_mode);
 	}
 
 	MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_CQE_BASED_MODER,
@@ -4191,6 +4209,7 @@ void mlx5e_build_nic_params(struct mlx5_core_dev *mdev,
 			MLX5_CQ_PERIOD_MODE_START_FROM_CQE :
 			MLX5_CQ_PERIOD_MODE_START_FROM_EQE;
 	params->rx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation);
+	params->tx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation);
 	mlx5e_set_rx_cq_mode_params(params, rx_cq_period_mode);
 	mlx5e_set_tx_cq_mode_params(params, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index f292bb346985..900661d45c8a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -44,6 +44,30 @@ static inline bool mlx5e_channel_no_affinity_change(struct mlx5e_channel *c)
 	return cpumask_test_cpu(current_cpu, aff);
 }
 
+static void mlx5e_handle_tx_dim(struct mlx5e_txqsq *sq)
+{
+	struct net_dim_sample dim_sample;
+
+	if (unlikely(!MLX5E_TEST_BIT(sq->state, MLX5E_SQ_STATE_AM)))
+		return;
+
+	net_dim_sample(sq->cq.event_ctr, sq->stats.packets, sq->stats.bytes,
+		       &dim_sample);
+	net_dim(&sq->dim, dim_sample);
+}
+
+static void mlx5e_handle_rx_dim(struct mlx5e_rq *rq)
+{
+	struct net_dim_sample dim_sample;
+
+	if (unlikely(!MLX5E_TEST_BIT(rq->state, MLX5E_RQ_STATE_AM)))
+		return;
+
+	net_dim_sample(rq->cq.event_ctr, rq->stats.packets, rq->stats.bytes,
+		       &dim_sample);
+	net_dim(&rq->dim, dim_sample);
+}
+
 int mlx5e_napi_poll(struct napi_struct *napi, int budget)
 {
 	struct mlx5e_channel *c = container_of(napi, struct mlx5e_channel,
@@ -75,18 +99,13 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
 	if (unlikely(!napi_complete_done(napi, work_done)))
 		return work_done;
 
-	for (i = 0; i < c->num_tc; i++)
+	for (i = 0; i < c->num_tc; i++) {
+		mlx5e_handle_tx_dim(&c->sq[i]);
 		mlx5e_cq_arm(&c->sq[i].cq);
-
-	if (MLX5E_TEST_BIT(c->rq.state, MLX5E_RQ_STATE_AM)) {
-		struct net_dim_sample dim_sample;
-		net_dim_sample(c->rq.cq.event_ctr,
-			       c->rq.stats.packets,
-			       c->rq.stats.bytes,
-			       &dim_sample);
-		net_dim(&c->rq.dim, dim_sample);
 	}
 
+	mlx5e_handle_rx_dim(&c->rq);
+
 	mlx5e_cq_arm(&c->rq.cq);
 	mlx5e_cq_arm(&c->icosq.cq);
 
-- 
cgit v1.2.3-59-g8ed1b


From a06ac0d67d9fda7c255476c6391032319030045d Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Tue, 24 Apr 2018 23:07:45 +0800
Subject: Revert "net: init sk_cookie for inet socket"

This reverts commit <c6849a3ac17e> ("net: init sk_cookie for inet socket")

Per discussion with Eric, when update sock_net(sk)->cookie_gen, the
whole cache cache line will be invalidated, as this cache line is shared
with all cpus, that may cause great performace hit.

Bellow is the data form Eric.
"Performance is reduced from ~5 Mpps to ~3.8 Mpps with 16 RX queues on
my host" when running synflood test.

Have to revert it to prevent from cache line false sharing.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sock_diag.h | 9 ---------
 net/ipv4/tcp_input.c      | 8 +-------
 2 files changed, 1 insertion(+), 16 deletions(-)

diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h
index 5c916e6dff36..15fe980a27ea 100644
--- a/include/linux/sock_diag.h
+++ b/include/linux/sock_diag.h
@@ -25,15 +25,6 @@ void sock_diag_unregister(const struct sock_diag_handler *h);
 void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh));
 void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh));
 
-static inline
-void sock_init_cookie(struct sock *sk)
-{
-	u64 res;
-
-	res = atomic64_inc_return(&sock_net(sk)->cookie_gen);
-	atomic64_set(&sk->sk_cookie, res);
-}
-
 u64 sock_gen_cookie(struct sock *sk);
 int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie);
 void sock_diag_save_cookie(struct sock *sk, __u32 *cookie);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 17b78582ba63..5a17cfc75326 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -78,7 +78,6 @@
 #include <linux/errqueue.h>
 #include <trace/events/tcp.h>
 #include <linux/static_key.h>
-#include <linux/sock_diag.h>
 
 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
 
@@ -6191,15 +6190,10 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
 #if IS_ENABLED(CONFIG_IPV6)
 		ireq->pktopts = NULL;
 #endif
+		atomic64_set(&ireq->ir_cookie, 0);
 		ireq->ireq_state = TCP_NEW_SYN_RECV;
 		write_pnet(&ireq->ireq_net, sock_net(sk_listener));
 		ireq->ireq_family = sk_listener->sk_family;
-
-		BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
-					offsetof(struct sock, sk_cookie));
-		BUILD_BUG_ON(offsetof(struct inet_request_sock, ireq_net) !=
-					offsetof(struct sock, sk_net));
-		sock_init_cookie((struct sock *)ireq);
 	}
 
 	return req;
-- 
cgit v1.2.3-59-g8ed1b


From 8c2320e84c193d4d42b8446986afa2797fb411e1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 23 Apr 2018 14:46:25 -0700
Subject: tcp: md5: only call tp->af_specific->md5_lookup() for md5 sockets

RETPOLINE made calls to tp->af_specific->md5_lookup() quite expensive,
given they have no result.
We can omit the calls for sockets that have no md5 keys.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 383cac0ff0ec..95feffb6d53f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -585,14 +585,15 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 	unsigned int remaining = MAX_TCP_OPTION_SPACE;
 	struct tcp_fastopen_request *fastopen = tp->fastopen_req;
 
+	*md5 = NULL;
 #ifdef CONFIG_TCP_MD5SIG
-	*md5 = tp->af_specific->md5_lookup(sk, sk);
-	if (*md5) {
-		opts->options |= OPTION_MD5;
-		remaining -= TCPOLEN_MD5SIG_ALIGNED;
+	if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
+		*md5 = tp->af_specific->md5_lookup(sk, sk);
+		if (*md5) {
+			opts->options |= OPTION_MD5;
+			remaining -= TCPOLEN_MD5SIG_ALIGNED;
+		}
 	}
-#else
-	*md5 = NULL;
 #endif
 
 	/* We always get an MSS option.  The option bytes which will be seen in
@@ -720,14 +721,15 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 
 	opts->options = 0;
 
+	*md5 = NULL;
 #ifdef CONFIG_TCP_MD5SIG
-	*md5 = tp->af_specific->md5_lookup(sk, sk);
-	if (unlikely(*md5)) {
-		opts->options |= OPTION_MD5;
-		size += TCPOLEN_MD5SIG_ALIGNED;
+	if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
+		*md5 = tp->af_specific->md5_lookup(sk, sk);
+		if (*md5) {
+			opts->options |= OPTION_MD5;
+			size += TCPOLEN_MD5SIG_ALIGNED;
+		}
 	}
-#else
-	*md5 = NULL;
 #endif
 
 	if (likely(tp->rx_opt.tstamp_ok)) {
-- 
cgit v1.2.3-59-g8ed1b


From 0c6f69a5e3641bccfe21fd0eeafad45a8c6db987 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Tue, 24 Apr 2018 08:29:13 +1000
Subject: rhashtable: remove outdated comments about grow_decision etc

grow_decision and shink_decision no longer exist, so remove
the remaining references to them.

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable.h | 33 ++++++++++++++-------------------
 1 file changed, 14 insertions(+), 19 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 1f8ad121eb43..87d443a5b11d 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -836,9 +836,8 @@ out:
  *
  * It is safe to call this function from atomic context.
  *
- * Will trigger an automatic deferred table resizing if the size grows
- * beyond the watermark indicated by grow_decision() which can be passed
- * to rhashtable_init().
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
  */
 static inline int rhashtable_insert_fast(
 	struct rhashtable *ht, struct rhash_head *obj,
@@ -866,9 +865,8 @@ static inline int rhashtable_insert_fast(
  *
  * It is safe to call this function from atomic context.
  *
- * Will trigger an automatic deferred table resizing if the size grows
- * beyond the watermark indicated by grow_decision() which can be passed
- * to rhashtable_init().
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
  */
 static inline int rhltable_insert_key(
 	struct rhltable *hlt, const void *key, struct rhlist_head *list,
@@ -890,9 +888,8 @@ static inline int rhltable_insert_key(
  *
  * It is safe to call this function from atomic context.
  *
- * Will trigger an automatic deferred table resizing if the size grows
- * beyond the watermark indicated by grow_decision() which can be passed
- * to rhashtable_init().
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
  */
 static inline int rhltable_insert(
 	struct rhltable *hlt, struct rhlist_head *list,
@@ -922,9 +919,8 @@ static inline int rhltable_insert(
  *
  * It is safe to call this function from atomic context.
  *
- * Will trigger an automatic deferred table resizing if the size grows
- * beyond the watermark indicated by grow_decision() which can be passed
- * to rhashtable_init().
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
  */
 static inline int rhashtable_lookup_insert_fast(
 	struct rhashtable *ht, struct rhash_head *obj,
@@ -981,9 +977,8 @@ static inline void *rhashtable_lookup_get_insert_fast(
  *
  * Lookups may occur in parallel with hashtable mutations and resizing.
  *
- * Will trigger an automatic deferred table resizing if the size grows
- * beyond the watermark indicated by grow_decision() which can be passed
- * to rhashtable_init().
+ * Will trigger an automatic deferred table resizing if residency in the
+ * table grows beyond 70%.
  *
  * Returns zero on success.
  */
@@ -1134,8 +1129,8 @@ static inline int __rhashtable_remove_fast(
  * walk the bucket chain upon removal. The removal operation is thus
  * considerable slow if the hash table is not correctly sized.
  *
- * Will automatically shrink the table via rhashtable_expand() if the
- * shrink_decision function specified at rhashtable_init() returns true.
+ * Will automatically shrink the table if permitted when residency drops
+ * below 30%.
  *
  * Returns zero on success, -ENOENT if the entry could not be found.
  */
@@ -1156,8 +1151,8 @@ static inline int rhashtable_remove_fast(
  * walk the bucket chain upon removal. The removal operation is thus
  * considerable slow if the hash table is not correctly sized.
  *
- * Will automatically shrink the table via rhashtable_expand() if the
- * shrink_decision function specified at rhashtable_init() returns true.
+ * Will automatically shrink the table if permitted when residency drops
+ * below 30%
  *
  * Returns zero on success, -ENOENT if the entry could not be found.
  */
-- 
cgit v1.2.3-59-g8ed1b


From 82266e98dd4d8e7d5b8e4a0fedeb91f2eb29d306 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Tue, 24 Apr 2018 08:29:13 +1000
Subject: rhashtable: Revise incorrect comment on r{hl, hash}table_walk_enter()

Neither rhashtable_walk_enter() or rhltable_walk_enter() sleep, though
they do take a spinlock without irq protection.
So revise the comments to accurately state the contexts in which
these functions can be called.

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable.h | 5 +++--
 lib/rhashtable.c           | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 87d443a5b11d..4e1f535c2034 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -1268,8 +1268,9 @@ static inline int rhashtable_walk_init(struct rhashtable *ht,
  * For a completely stable walk you should construct your own data
  * structure outside the hash table.
  *
- * This function may sleep so you must not call it from interrupt
- * context or with spin locks held.
+ * This function may be called from any process context, including
+ * non-preemptable context, but cannot be called from softirq or
+ * hardirq context.
  *
  * You must call rhashtable_walk_exit after this function returns.
  */
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 2b2b79974b61..6d490f51174e 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -668,8 +668,9 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
  * For a completely stable walk you should construct your own data
  * structure outside the hash table.
  *
- * This function may sleep so you must not call it from interrupt
- * context or with spin locks held.
+ * This function may be called from any process context, including
+ * non-preemptable context, but cannot be called from softirq or
+ * hardirq context.
  *
  * You must call rhashtable_walk_exit after this function returns.
  */
-- 
cgit v1.2.3-59-g8ed1b


From b41cc04b662ac96bbb291fb66b7b8aab5bc0a8c9 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Tue, 24 Apr 2018 08:29:13 +1000
Subject: rhashtable: reset iter when rhashtable_walk_start sees new table

The documentation claims that when rhashtable_walk_start_check()
detects a resize event, it will rewind back to the beginning
of the table.  This is not true.  We need to set ->slot and
->skip to be zero for it to be true.

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 6d490f51174e..81edf1ab38ab 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -737,6 +737,8 @@ int rhashtable_walk_start_check(struct rhashtable_iter *iter)
 
 	if (!iter->walker.tbl && !iter->end_of_table) {
 		iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht);
+		iter->slot = 0;
+		iter->skip = 0;
 		return -EAGAIN;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 5d240a8936f6a1d3ece06701e8c4d830a2eca8a8 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Tue, 24 Apr 2018 08:29:13 +1000
Subject: rhashtable: improve rhashtable_walk stability when stop/start used.

When a walk of an rhashtable is interrupted with rhastable_walk_stop()
and then rhashtable_walk_start(), the location to restart from is based
on a 'skip' count in the current hash chain, and this can be incorrect
if insertions or deletions have happened.  This does not happen when
the walk is not stopped and started as iter->p is a placeholder which
is safe to use while holding the RCU read lock.

In rhashtable_walk_start() we can revalidate that 'p' is still in the
same hash chain.  If it isn't then the current method is still used.

With this patch, if a rhashtable walker ensures that the current
object remains in the table over a stop/start period (possibly by
elevating the reference count if that is sufficient), it can be sure
that a walk will not miss objects that were in the hashtable for the
whole time of the walk.

rhashtable_walk_start() may not find the object even though it is
still in the hashtable if a rehash has moved it to a new table.  In
this case it will (eventually) get -EAGAIN and will need to proceed
through the whole table again to be sure to see everything at least
once.

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/rhashtable.c | 44 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 41 insertions(+), 3 deletions(-)

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 81edf1ab38ab..9427b5766134 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -727,6 +727,7 @@ int rhashtable_walk_start_check(struct rhashtable_iter *iter)
 	__acquires(RCU)
 {
 	struct rhashtable *ht = iter->ht;
+	bool rhlist = ht->rhlist;
 
 	rcu_read_lock();
 
@@ -735,13 +736,52 @@ int rhashtable_walk_start_check(struct rhashtable_iter *iter)
 		list_del(&iter->walker.list);
 	spin_unlock(&ht->lock);
 
-	if (!iter->walker.tbl && !iter->end_of_table) {
+	if (iter->end_of_table)
+		return 0;
+	if (!iter->walker.tbl) {
 		iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht);
 		iter->slot = 0;
 		iter->skip = 0;
 		return -EAGAIN;
 	}
 
+	if (iter->p && !rhlist) {
+		/*
+		 * We need to validate that 'p' is still in the table, and
+		 * if so, update 'skip'
+		 */
+		struct rhash_head *p;
+		int skip = 0;
+		rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
+			skip++;
+			if (p == iter->p) {
+				iter->skip = skip;
+				goto found;
+			}
+		}
+		iter->p = NULL;
+	} else if (iter->p && rhlist) {
+		/* Need to validate that 'list' is still in the table, and
+		 * if so, update 'skip' and 'p'.
+		 */
+		struct rhash_head *p;
+		struct rhlist_head *list;
+		int skip = 0;
+		rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
+			for (list = container_of(p, struct rhlist_head, rhead);
+			     list;
+			     list = rcu_dereference(list->next)) {
+				skip++;
+				if (list == iter->list) {
+					iter->p = p;
+					skip = skip;
+					goto found;
+				}
+			}
+		}
+		iter->p = NULL;
+	}
+found:
 	return 0;
 }
 EXPORT_SYMBOL_GPL(rhashtable_walk_start_check);
@@ -917,8 +957,6 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter)
 		iter->walker.tbl = NULL;
 	spin_unlock(&ht->lock);
 
-	iter->p = NULL;
-
 out:
 	rcu_read_unlock();
 }
-- 
cgit v1.2.3-59-g8ed1b


From fbb93020b89bc0270a9ca3c2479bbbbc548d9ee5 Mon Sep 17 00:00:00 2001
From: Dmitry Lebed <dlebed@quantenna.com>
Date: Mon, 26 Mar 2018 16:36:33 +0300
Subject: qtnfmac: add DFS offload support

DFS offload support implemented:
- DFS_OFFLOAD feature is advertised depending on HW capabilities
- CAC_STARTED event forwarding from HW implemented
- start_radar_detection() callback now returning -ENOTSUPP
  if DFS_OFFLOAD is enabled

Signed-off-by: Dmitry Lebed <dlebed@quantenna.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/quantenna/qtnfmac/cfg80211.c |  9 +++++++++
 drivers/net/wireless/quantenna/qtnfmac/event.c    | 11 +++++++++++
 drivers/net/wireless/quantenna/qtnfmac/qlink.h    |  7 +++++--
 3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
index 0398bece5782..5122dc798064 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
@@ -813,6 +813,9 @@ static int qtnf_start_radar_detection(struct wiphy *wiphy,
 	struct qtnf_vif *vif = qtnf_netdev_get_priv(ndev);
 	int ret;
 
+	if (wiphy_ext_feature_isset(wiphy, NL80211_EXT_FEATURE_DFS_OFFLOAD))
+		return -ENOTSUPP;
+
 	ret = qtnf_cmd_start_cac(vif, chandef, cac_time_ms);
 	if (ret)
 		pr_err("%s: failed to start CAC ret=%d\n", ndev->name, ret);
@@ -909,6 +912,9 @@ struct wiphy *qtnf_wiphy_allocate(struct qtnf_bus *bus)
 {
 	struct wiphy *wiphy;
 
+	if (bus->hw_info.hw_capab & QLINK_HW_CAPAB_DFS_OFFLOAD)
+		qtn_cfg80211_ops.start_radar_detection = NULL;
+
 	wiphy = wiphy_new(&qtn_cfg80211_ops, sizeof(struct qtnf_wmac));
 	if (!wiphy)
 		return NULL;
@@ -982,6 +988,9 @@ int qtnf_wiphy_register(struct qtnf_hw_info *hw_info, struct qtnf_wmac *mac)
 			WIPHY_FLAG_AP_UAPSD |
 			WIPHY_FLAG_HAS_CHANNEL_SWITCH;
 
+	if (hw_info->hw_capab & QLINK_HW_CAPAB_DFS_OFFLOAD)
+		wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_DFS_OFFLOAD);
+
 	wiphy->probe_resp_offload = NL80211_PROBE_RESP_OFFLOAD_SUPPORT_WPS |
 				    NL80211_PROBE_RESP_OFFLOAD_SUPPORT_WPS2;
 
diff --git a/drivers/net/wireless/quantenna/qtnfmac/event.c b/drivers/net/wireless/quantenna/qtnfmac/event.c
index bcd415f96412..cb2a6c12f870 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/event.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/event.c
@@ -443,6 +443,17 @@ static int qtnf_event_handle_radar(struct qtnf_vif *vif,
 		cfg80211_cac_event(vif->netdev, &chandef,
 				   NL80211_RADAR_CAC_ABORTED, GFP_KERNEL);
 		break;
+	case QLINK_RADAR_CAC_STARTED:
+		if (vif->wdev.cac_started)
+			break;
+
+		if (!wiphy_ext_feature_isset(wiphy,
+					     NL80211_EXT_FEATURE_DFS_OFFLOAD))
+			break;
+
+		cfg80211_cac_event(vif->netdev, &chandef,
+				   NL80211_RADAR_CAC_STARTED, GFP_KERNEL);
+		break;
 	default:
 		pr_warn("%s: unhandled radar event %u\n",
 			vif->netdev->name, ev->event);
diff --git a/drivers/net/wireless/quantenna/qtnfmac/qlink.h b/drivers/net/wireless/quantenna/qtnfmac/qlink.h
index 9bf3ae4d1b3b..9ab27e158023 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/qlink.h
+++ b/drivers/net/wireless/quantenna/qtnfmac/qlink.h
@@ -68,10 +68,12 @@ struct qlink_msg_header {
  * @QLINK_HW_CAPAB_STA_INACT_TIMEOUT: device implements a logic to kick-out
  *	associated STAs due to inactivity. Inactivity timeout period is taken
  *	from QLINK_CMD_START_AP parameters.
+ * @QLINK_HW_CAPAB_DFS_OFFLOAD: device implements DFS offload functionality
  */
 enum qlink_hw_capab {
-	QLINK_HW_CAPAB_REG_UPDATE = BIT(0),
-	QLINK_HW_CAPAB_STA_INACT_TIMEOUT = BIT(1),
+	QLINK_HW_CAPAB_REG_UPDATE		= BIT(0),
+	QLINK_HW_CAPAB_STA_INACT_TIMEOUT	= BIT(1),
+	QLINK_HW_CAPAB_DFS_OFFLOAD		= BIT(2),
 };
 
 enum qlink_iface_type {
@@ -1031,6 +1033,7 @@ enum qlink_radar_event {
 	QLINK_RADAR_CAC_ABORTED,
 	QLINK_RADAR_NOP_FINISHED,
 	QLINK_RADAR_PRE_CAC_EXPIRED,
+	QLINK_RADAR_CAC_STARTED,
 };
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 0fb1abc3a0b8f83b312b3c00e98643a17df69e87 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Thu, 29 Mar 2018 16:38:18 +0100
Subject: cw1200: fix spelling mistake: "Mailformed" -> "Malformed"

Trivial fix to spelling mistake in wiphy_warn warning message text

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/st/cw1200/txrx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/st/cw1200/txrx.c b/drivers/net/wireless/st/cw1200/txrx.c
index e9050b41157a..f7b1b0062db3 100644
--- a/drivers/net/wireless/st/cw1200/txrx.c
+++ b/drivers/net/wireless/st/cw1200/txrx.c
@@ -1069,7 +1069,7 @@ void cw1200_rx_cb(struct cw1200_common *priv,
 	}
 
 	if (skb->len < sizeof(struct ieee80211_pspoll)) {
-		wiphy_warn(priv->hw->wiphy, "Mailformed SDU rx'ed. Size is lesser than IEEE header.\n");
+		wiphy_warn(priv->hw->wiphy, "Malformed SDU rx'ed. Size is lesser than IEEE header.\n");
 		goto drop;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 5dc3638735f1c8a6ccd7f6ea361546567deb5f0f Mon Sep 17 00:00:00 2001
From: Prameela Rani Garnepudi <prameela.j04cs@gmail.com>
Date: Thu, 29 Mar 2018 19:44:49 +0530
Subject: rsi: move xtend_desc structure from rsi_main.h to rsi_mgmt.h

All descriptor structures are in rsi_mgmt.h except this
extended descriptor structure. Hence moving it to rsi_mgmt.h
and also renaming to rsi_xtend_desc.

Signed-off-by: Prameela Rani Garnepudi <prameela.j04cs@gmail.com>
Signed-off-by: Amitkumar Karwar <amit.karwar@redpinesignals.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/rsi/rsi_91x_hal.c | 12 ++++++------
 drivers/net/wireless/rsi/rsi_main.h    |  6 ------
 drivers/net/wireless/rsi/rsi_mgmt.h    |  6 ++++++
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/net/wireless/rsi/rsi_91x_hal.c b/drivers/net/wireless/rsi/rsi_91x_hal.c
index de608ae365a4..ce6a86deea65 100644
--- a/drivers/net/wireless/rsi/rsi_91x_hal.c
+++ b/drivers/net/wireless/rsi/rsi_91x_hal.c
@@ -55,7 +55,7 @@ static int rsi_prepare_mgmt_desc(struct rsi_common *common, struct sk_buff *skb)
 	struct rsi_mgmt_desc *mgmt_desc;
 	struct skb_info *tx_params;
 	struct ieee80211_bss_conf *bss = NULL;
-	struct xtended_desc *xtend_desc = NULL;
+	struct rsi_xtended_desc *xtend_desc = NULL;
 	u8 header_size;
 	u32 dword_align_bytes = 0;
 
@@ -69,7 +69,7 @@ static int rsi_prepare_mgmt_desc(struct rsi_common *common, struct sk_buff *skb)
 	vif = tx_params->vif;
 
 	/* Update header size */
-	header_size = FRAME_DESC_SZ + sizeof(struct xtended_desc);
+	header_size = FRAME_DESC_SZ + sizeof(struct rsi_xtended_desc);
 	if (header_size > skb_headroom(skb)) {
 		rsi_dbg(ERR_ZONE,
 			"%s: Failed to add extended descriptor\n",
@@ -92,7 +92,7 @@ static int rsi_prepare_mgmt_desc(struct rsi_common *common, struct sk_buff *skb)
 	wh = (struct ieee80211_hdr *)&skb->data[header_size];
 
 	mgmt_desc = (struct rsi_mgmt_desc *)skb->data;
-	xtend_desc = (struct xtended_desc *)&skb->data[FRAME_DESC_SZ];
+	xtend_desc = (struct rsi_xtended_desc *)&skb->data[FRAME_DESC_SZ];
 
 	rsi_set_len_qno(&mgmt_desc->len_qno, (skb->len - FRAME_DESC_SZ),
 			RSI_WIFI_MGMT_Q);
@@ -158,7 +158,7 @@ static int rsi_prepare_data_desc(struct rsi_common *common, struct sk_buff *skb)
 	struct skb_info *tx_params;
 	struct ieee80211_bss_conf *bss;
 	struct rsi_data_desc *data_desc;
-	struct xtended_desc *xtend_desc;
+	struct rsi_xtended_desc *xtend_desc;
 	u8 ieee80211_size = MIN_802_11_HDR_LEN;
 	u8 header_size;
 	u8 vap_id = 0;
@@ -170,7 +170,7 @@ static int rsi_prepare_data_desc(struct rsi_common *common, struct sk_buff *skb)
 	bss = &vif->bss_conf;
 	tx_params = (struct skb_info *)info->driver_data;
 
-	header_size = FRAME_DESC_SZ + sizeof(struct xtended_desc);
+	header_size = FRAME_DESC_SZ + sizeof(struct rsi_xtended_desc);
 	if (header_size > skb_headroom(skb)) {
 		rsi_dbg(ERR_ZONE, "%s: Unable to send pkt\n", __func__);
 		return -ENOSPC;
@@ -188,7 +188,7 @@ static int rsi_prepare_data_desc(struct rsi_common *common, struct sk_buff *skb)
 	data_desc = (struct rsi_data_desc *)skb->data;
 	memset(data_desc, 0, header_size);
 
-	xtend_desc = (struct xtended_desc *)&skb->data[FRAME_DESC_SZ];
+	xtend_desc = (struct rsi_xtended_desc *)&skb->data[FRAME_DESC_SZ];
 	wh = (struct ieee80211_hdr *)&skb->data[header_size];
 	seq_num = IEEE80211_SEQ_TO_SN(le16_to_cpu(wh->seq_ctrl));
 
diff --git a/drivers/net/wireless/rsi/rsi_main.h b/drivers/net/wireless/rsi/rsi_main.h
index ef4fa323694b..b81cdbf05e5b 100644
--- a/drivers/net/wireless/rsi/rsi_main.h
+++ b/drivers/net/wireless/rsi/rsi_main.h
@@ -190,12 +190,6 @@ struct cqm_info {
 	u32 rssi_hyst;
 };
 
-struct xtended_desc {
-	u8 confirm_frame_type;
-	u8 retry_cnt;
-	u16 reserved;
-};
-
 enum rsi_dfs_regions {
 	RSI_REGION_FCC = 0,
 	RSI_REGION_ETSI,
diff --git a/drivers/net/wireless/rsi/rsi_mgmt.h b/drivers/net/wireless/rsi/rsi_mgmt.h
index cf6567ae5bbe..6726e84a6773 100644
--- a/drivers/net/wireless/rsi/rsi_mgmt.h
+++ b/drivers/net/wireless/rsi/rsi_mgmt.h
@@ -301,6 +301,12 @@ struct rsi_mac_frame {
 #define ENCAP_MGMT_PKT			BIT(7)
 #define DESC_IMMEDIATE_WAKEUP		BIT(15)
 
+struct rsi_xtended_desc {
+	u8 confirm_frame_type;
+	u8 retry_cnt;
+	u16 reserved;
+};
+
 struct rsi_cmd_desc_dword0 {
 	__le16 len_qno;
 	u8 frame_type;
-- 
cgit v1.2.3-59-g8ed1b


From 1be05eb5e4dc8e3812ae72bdd55e6fa4de3ee948 Mon Sep 17 00:00:00 2001
From: Prameela Rani Garnepudi <prameela.j04cs@gmail.com>
Date: Thu, 29 Mar 2018 19:44:50 +0530
Subject: rsi: move descriptor preparation to core

Descriptors preparation is moved to core instead of HAL to
avoid synchronization issues in sending TX frames

Signed-off-by: Prameela Rani Garnepudi <prameela.j04cs@gmail.com>
Signed-off-by: Amitkumar Karwar <amit.karwar@redpinesignals.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/rsi/rsi_91x_core.c | 11 +++++++++
 drivers/net/wireless/rsi/rsi_91x_hal.c  | 43 +++++++++++++++++----------------
 drivers/net/wireless/rsi/rsi_hal.h      |  2 ++
 3 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/drivers/net/wireless/rsi/rsi_91x_core.c b/drivers/net/wireless/rsi/rsi_91x_core.c
index 5dafd2e1306c..f5d17568685b 100644
--- a/drivers/net/wireless/rsi/rsi_91x_core.c
+++ b/drivers/net/wireless/rsi/rsi_91x_core.c
@@ -413,6 +413,11 @@ void rsi_core_xmit(struct rsi_common *common, struct sk_buff *skb)
 	    (ieee80211_is_qos_nullfunc(wh->frame_control))) {
 		q_num = MGMT_SOFT_Q;
 		skb->priority = q_num;
+
+		if (rsi_prepare_mgmt_desc(common, skb)) {
+			rsi_dbg(ERR_ZONE, "Failed to prepare desc\n");
+			goto xmit_fail;
+		}
 	} else {
 		if (ieee80211_is_data_qos(wh->frame_control)) {
 			tid = (skb->data[24] & IEEE80211_QOS_TID);
@@ -433,6 +438,8 @@ void rsi_core_xmit(struct rsi_common *common, struct sk_buff *skb)
 			if (!rsta)
 				goto xmit_fail;
 			tx_params->sta_id = rsta->sta_id;
+		} else {
+			tx_params->sta_id = 0;
 		}
 
 		if (rsta) {
@@ -443,6 +450,10 @@ void rsi_core_xmit(struct rsi_common *common, struct sk_buff *skb)
 							      tid, 0);
 			}
 		}
+		if (rsi_prepare_data_desc(common, skb)) {
+			rsi_dbg(ERR_ZONE, "Failed to prepare data desc\n");
+			goto xmit_fail;
+		}
 	}
 
 	if ((q_num < MGMT_SOFT_Q) &&
diff --git a/drivers/net/wireless/rsi/rsi_91x_hal.c b/drivers/net/wireless/rsi/rsi_91x_hal.c
index ce6a86deea65..43d7d6c7ccb8 100644
--- a/drivers/net/wireless/rsi/rsi_91x_hal.c
+++ b/drivers/net/wireless/rsi/rsi_91x_hal.c
@@ -45,7 +45,7 @@ int rsi_send_pkt_to_bus(struct rsi_common *common, struct sk_buff *skb)
 	return status;
 }
 
-static int rsi_prepare_mgmt_desc(struct rsi_common *common, struct sk_buff *skb)
+int rsi_prepare_mgmt_desc(struct rsi_common *common, struct sk_buff *skb)
 {
 	struct rsi_hw *adapter = common->priv;
 	struct ieee80211_hdr *wh = NULL;
@@ -113,17 +113,6 @@ static int rsi_prepare_mgmt_desc(struct rsi_common *common, struct sk_buff *skb)
 	if (conf_is_ht40(conf))
 		mgmt_desc->bbp_info = cpu_to_le16(FULL40M_ENABLE);
 
-	if (ieee80211_is_probe_req(wh->frame_control)) {
-		if (!bss->assoc) {
-			rsi_dbg(INFO_ZONE,
-				"%s: blocking mgmt queue\n", __func__);
-			mgmt_desc->misc_flags = RSI_DESC_REQUIRE_CFM_TO_HOST;
-			xtend_desc->confirm_frame_type = PROBEREQ_CONFIRM;
-			common->mgmt_q_block = true;
-			rsi_dbg(INFO_ZONE, "Mgmt queue blocked\n");
-		}
-	}
-
 	if (ieee80211_is_probe_resp(wh->frame_control)) {
 		mgmt_desc->misc_flags |= (RSI_ADD_DELTA_TSF_VAP_ID |
 					  RSI_FETCH_RETRY_CNT_FRM_HST);
@@ -149,7 +138,7 @@ static int rsi_prepare_mgmt_desc(struct rsi_common *common, struct sk_buff *skb)
 }
 
 /* This function prepares descriptor for given data packet */
-static int rsi_prepare_data_desc(struct rsi_common *common, struct sk_buff *skb)
+int rsi_prepare_data_desc(struct rsi_common *common, struct sk_buff *skb)
 {
 	struct rsi_hw *adapter = common->priv;
 	struct ieee80211_vif *vif;
@@ -301,10 +290,6 @@ int rsi_send_data_pkt(struct rsi_common *common, struct sk_buff *skb)
 	    (!bss->assoc))
 		goto err;
 
-	status = rsi_prepare_data_desc(common, skb);
-	if (status)
-		goto err;
-
 	status = rsi_send_pkt_to_bus(common, skb);
 	if (status)
 		rsi_dbg(ERR_ZONE, "%s: Failed to write pkt\n", __func__);
@@ -327,12 +312,18 @@ int rsi_send_mgmt_pkt(struct rsi_common *common,
 		      struct sk_buff *skb)
 {
 	struct rsi_hw *adapter = common->priv;
+	struct ieee80211_bss_conf *bss;
+	struct ieee80211_hdr *wh;
 	struct ieee80211_tx_info *info;
 	struct skb_info *tx_params;
+	struct rsi_mgmt_desc *mgmt_desc;
+	struct rsi_xtended_desc *xtend_desc;
 	int status = -E2BIG;
+	u8 header_size;
 
 	info = IEEE80211_SKB_CB(skb);
 	tx_params = (struct skb_info *)info->driver_data;
+	header_size = tx_params->internal_hdr_size;
 
 	if (tx_params->flags & INTERNAL_MGMT_PKT) {
 		status = adapter->host_intf_ops->write_pkt(common->priv,
@@ -346,15 +337,25 @@ int rsi_send_mgmt_pkt(struct rsi_common *common,
 		return status;
 	}
 
-	if (FRAME_DESC_SZ > skb_headroom(skb))
-		goto err;
+	bss = &info->control.vif->bss_conf;
+	wh = (struct ieee80211_hdr *)&skb->data[header_size];
+	mgmt_desc = (struct rsi_mgmt_desc *)skb->data;
+	xtend_desc = (struct rsi_xtended_desc *)&skb->data[FRAME_DESC_SZ];
+
+	/* Indicate to firmware to give cfm for probe */
+	if (ieee80211_is_probe_req(wh->frame_control) && !bss->assoc) {
+		rsi_dbg(INFO_ZONE,
+			"%s: blocking mgmt queue\n", __func__);
+		mgmt_desc->misc_flags = RSI_DESC_REQUIRE_CFM_TO_HOST;
+		xtend_desc->confirm_frame_type = PROBEREQ_CONFIRM;
+		common->mgmt_q_block = true;
+		rsi_dbg(INFO_ZONE, "Mgmt queue blocked\n");
+	}
 
-	rsi_prepare_mgmt_desc(common, skb);
 	status = rsi_send_pkt_to_bus(common, skb);
 	if (status)
 		rsi_dbg(ERR_ZONE, "%s: Failed to write the packet\n", __func__);
 
-err:
 	rsi_indicate_tx_status(common->priv, skb, status);
 	return status;
 }
diff --git a/drivers/net/wireless/rsi/rsi_hal.h b/drivers/net/wireless/rsi/rsi_hal.h
index 786dccd0b732..d6c2baa56e9f 100644
--- a/drivers/net/wireless/rsi/rsi_hal.h
+++ b/drivers/net/wireless/rsi/rsi_hal.h
@@ -167,6 +167,8 @@ struct rsi_bt_desc {
 } __packed;
 
 int rsi_hal_device_init(struct rsi_hw *adapter);
+int rsi_prepare_mgmt_desc(struct rsi_common *common, struct sk_buff *skb);
+int rsi_prepare_data_desc(struct rsi_common *common, struct sk_buff *skb);
 int rsi_prepare_beacon(struct rsi_common *common, struct sk_buff *skb);
 int rsi_send_pkt_to_bus(struct rsi_common *common, struct sk_buff *skb);
 int rsi_send_bt_pkt(struct rsi_common *common, struct sk_buff *skb);
-- 
cgit v1.2.3-59-g8ed1b


From 62dc108d5f7c4443ce88e1d6157e8547edbc07b3 Mon Sep 17 00:00:00 2001
From: Prameela Rani Garnepudi <prameela.j04cs@gmail.com>
Date: Thu, 29 Mar 2018 19:44:51 +0530
Subject: rsi: enable 80MHz clock by default

80MHz clock for device should be enabled by default in
TX command frame radio capabilities.

Signed-off-by: Prameela Rani Garnepudi <prameela.j04cs@gmail.com>
Signed-off-by: Amitkumar Karwar <amit.karwar@redpinesignals.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/rsi/rsi_91x_mgmt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/rsi/rsi_91x_mgmt.c b/drivers/net/wireless/rsi/rsi_91x_mgmt.c
index c21fca750fd4..9207da09a4f7 100644
--- a/drivers/net/wireless/rsi/rsi_91x_mgmt.c
+++ b/drivers/net/wireless/rsi/rsi_91x_mgmt.c
@@ -325,8 +325,8 @@ static int rsi_load_radio_caps(struct rsi_common *common)
 	radio_caps->channel_num = common->channel;
 	radio_caps->rf_model = RSI_RF_TYPE;
 
+	radio_caps->radio_cfg_info = RSI_LMAC_CLOCK_80MHZ;
 	if (common->channel_width == BW_40MHZ) {
-		radio_caps->radio_cfg_info = RSI_LMAC_CLOCK_80MHZ;
 		radio_caps->radio_cfg_info |= RSI_ENABLE_40MHZ;
 
 		if (common->fsm_state == FSM_MAC_INIT_DONE) {
-- 
cgit v1.2.3-59-g8ed1b


From 4fd6c4762f372852d65ad737ab35996395295dce Mon Sep 17 00:00:00 2001
From: Prameela Rani Garnepudi <prameela.j04cs@gmail.com>
Date: Thu, 29 Mar 2018 19:44:52 +0530
Subject: rsi: roaming enhancements

To support roaming below changes are done:
* Station notify frame is send to firmware after sending assoc
  request. This will avoid dropping of first EAPOL frame due to
  delay in creation of station control block in firmware.
* Data queues are unblocked after sending station notify in open
  mode, after configuring key in WEP mode, and after receiving
  EAPOL4 confirm in WPA mode.
* Initial EAPOL frames priority is chaged to MGMT, rekey EAPOL
  frames priority changed to VO.

Signed-off-by: Prameela Rani Garnepudi <prameela.j04cs@gmail.com>
Signed-off-by: Amitkumar Karwar <amit.karwar@redpinesignals.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/rsi/rsi_91x_core.c     | 16 +++++++++++++++
 drivers/net/wireless/rsi/rsi_91x_hal.c      | 18 ++++++++++++++++
 drivers/net/wireless/rsi/rsi_91x_mac80211.c | 18 +++++++++++++---
 drivers/net/wireless/rsi/rsi_91x_mgmt.c     | 32 +++++++++++++++++------------
 drivers/net/wireless/rsi/rsi_main.h         |  1 +
 drivers/net/wireless/rsi/rsi_mgmt.h         | 10 ++++++++-
 6 files changed, 78 insertions(+), 17 deletions(-)

diff --git a/drivers/net/wireless/rsi/rsi_91x_core.c b/drivers/net/wireless/rsi/rsi_91x_core.c
index f5d17568685b..3ca468b9f2b8 100644
--- a/drivers/net/wireless/rsi/rsi_91x_core.c
+++ b/drivers/net/wireless/rsi/rsi_91x_core.c
@@ -411,6 +411,18 @@ void rsi_core_xmit(struct rsi_common *common, struct sk_buff *skb)
 	if ((ieee80211_is_mgmt(wh->frame_control)) ||
 	    (ieee80211_is_ctl(wh->frame_control)) ||
 	    (ieee80211_is_qos_nullfunc(wh->frame_control))) {
+		if (ieee80211_is_assoc_req(wh->frame_control) ||
+		    ieee80211_is_reassoc_req(wh->frame_control)) {
+			struct ieee80211_bss_conf *bss = &vif->bss_conf;
+
+			common->eapol4_confirm = false;
+			rsi_hal_send_sta_notify_frame(common,
+						      RSI_IFTYPE_STATION,
+						      STA_CONNECTED, bss->bssid,
+						      bss->qos, bss->aid, 0,
+						      vif);
+		}
+
 		q_num = MGMT_SOFT_Q;
 		skb->priority = q_num;
 
@@ -450,6 +462,10 @@ void rsi_core_xmit(struct rsi_common *common, struct sk_buff *skb)
 							      tid, 0);
 			}
 		}
+		if (skb->protocol == cpu_to_be16(ETH_P_PAE)) {
+			q_num = MGMT_SOFT_Q;
+			skb->priority = q_num;
+		}
 		if (rsi_prepare_data_desc(common, skb)) {
 			rsi_dbg(ERR_ZONE, "Failed to prepare data desc\n");
 			goto xmit_fail;
diff --git a/drivers/net/wireless/rsi/rsi_91x_hal.c b/drivers/net/wireless/rsi/rsi_91x_hal.c
index 43d7d6c7ccb8..b7c54037f369 100644
--- a/drivers/net/wireless/rsi/rsi_91x_hal.c
+++ b/drivers/net/wireless/rsi/rsi_91x_hal.c
@@ -232,6 +232,18 @@ int rsi_prepare_data_desc(struct rsi_common *common, struct sk_buff *skb)
 		data_desc->misc_flags |= RSI_FETCH_RETRY_CNT_FRM_HST;
 #define EAPOL_RETRY_CNT 15
 		xtend_desc->retry_cnt = EAPOL_RETRY_CNT;
+
+		if (common->eapol4_confirm)
+			skb->priority = VO_Q;
+		else
+			rsi_set_len_qno(&data_desc->len_qno,
+					(skb->len - FRAME_DESC_SZ),
+					RSI_WIFI_MGMT_Q);
+		if ((skb->len - header_size) == EAPOL4_PACKET_LEN) {
+			data_desc->misc_flags |=
+				RSI_DESC_REQUIRE_CFM_TO_HOST;
+			xtend_desc->confirm_frame_type = EAPOL4_CONFIRM;
+		}
 	}
 
 	data_desc->mac_flags = cpu_to_le16(seq_num & 0xfff);
@@ -271,8 +283,11 @@ int rsi_send_data_pkt(struct rsi_common *common, struct sk_buff *skb)
 	struct rsi_hw *adapter = common->priv;
 	struct ieee80211_vif *vif;
 	struct ieee80211_tx_info *info;
+	struct skb_info *tx_params;
 	struct ieee80211_bss_conf *bss;
+	struct ieee80211_hdr *wh;
 	int status = -EINVAL;
+	u8 header_size;
 
 	if (!skb)
 		return 0;
@@ -284,6 +299,9 @@ int rsi_send_data_pkt(struct rsi_common *common, struct sk_buff *skb)
 		goto err;
 	vif = info->control.vif;
 	bss = &vif->bss_conf;
+	tx_params = (struct skb_info *)info->driver_data;
+	header_size = tx_params->internal_hdr_size;
+	wh = (struct ieee80211_hdr *)&skb->data[header_size];
 
 	if (((vif->type == NL80211_IFTYPE_STATION) ||
 	     (vif->type == NL80211_IFTYPE_P2P_CLIENT)) &&
diff --git a/drivers/net/wireless/rsi/rsi_91x_mac80211.c b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
index 32f5cb46fd4f..5edc3a9d8eeb 100644
--- a/drivers/net/wireless/rsi/rsi_91x_mac80211.c
+++ b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
@@ -737,7 +737,8 @@ static void rsi_mac80211_bss_info_changed(struct ieee80211_hw *hw,
 				      bss_conf->bssid,
 				      bss_conf->qos,
 				      bss_conf->aid,
-				      NULL, 0, vif);
+				      NULL, 0,
+				      bss_conf->assoc_capability, vif);
 		adapter->ps_info.dtim_interval_duration = bss->dtim_period;
 		adapter->ps_info.listen_interval = conf->listen_interval;
 
@@ -914,6 +915,17 @@ static int rsi_hal_key_config(struct ieee80211_hw *hw,
 				key->cipher,
 				sta_id,
 				vif);
+	if (status)
+		return status;
+
+	if (vif->type == NL80211_IFTYPE_STATION && key->key &&
+	    (key->cipher == WLAN_CIPHER_SUITE_WEP104 ||
+	     key->cipher == WLAN_CIPHER_SUITE_WEP40)) {
+		if (!rsi_send_block_unblock_frame(adapter->priv, false))
+			adapter->priv->hw_data_qs_blocked = false;
+	}
+
+	return 0;
 }
 
 /**
@@ -1391,7 +1403,7 @@ static int rsi_mac80211_sta_add(struct ieee80211_hw *hw,
 			rsi_dbg(INFO_ZONE, "Indicate bss status to device\n");
 			rsi_inform_bss_status(common, RSI_OPMODE_AP, 1,
 					      sta->addr, sta->wme, sta->aid,
-					      sta, sta_idx, vif);
+					      sta, sta_idx, 0, vif);
 
 			if (common->key) {
 				struct ieee80211_key_conf *key = common->key;
@@ -1469,7 +1481,7 @@ static int rsi_mac80211_sta_remove(struct ieee80211_hw *hw,
 				rsi_inform_bss_status(common, RSI_OPMODE_AP, 0,
 						      sta->addr, sta->wme,
 						      sta->aid, sta, sta_idx,
-						      vif);
+						      0, vif);
 				rsta->sta = NULL;
 				rsta->sta_id = -1;
 				for (cnt = 0; cnt < IEEE80211_NUM_TIDS; cnt++)
diff --git a/drivers/net/wireless/rsi/rsi_91x_mgmt.c b/drivers/net/wireless/rsi/rsi_91x_mgmt.c
index 9207da09a4f7..0757adcb3f4a 100644
--- a/drivers/net/wireless/rsi/rsi_91x_mgmt.c
+++ b/drivers/net/wireless/rsi/rsi_91x_mgmt.c
@@ -454,14 +454,10 @@ static int rsi_mgmt_pkt_to_core(struct rsi_common *common,
  *
  * Return: status: 0 on success, corresponding negative error code on failure.
  */
-static int rsi_hal_send_sta_notify_frame(struct rsi_common *common,
-					 enum opmode opmode,
-					 u8 notify_event,
-					 const unsigned char *bssid,
-					 u8 qos_enable,
-					 u16 aid,
-					 u16 sta_id,
-					 struct ieee80211_vif *vif)
+int rsi_hal_send_sta_notify_frame(struct rsi_common *common, enum opmode opmode,
+				  u8 notify_event, const unsigned char *bssid,
+				  u8 qos_enable, u16 aid, u16 sta_id,
+				  struct ieee80211_vif *vif)
 {
 	struct sk_buff *skb = NULL;
 	struct rsi_peer_notify *peer_notify;
@@ -1328,6 +1324,7 @@ void rsi_inform_bss_status(struct rsi_common *common,
 			   u16 aid,
 			   struct ieee80211_sta *sta,
 			   u16 sta_id,
+			   u16 assoc_cap,
 			   struct ieee80211_vif *vif)
 {
 	if (status) {
@@ -1342,10 +1339,10 @@ void rsi_inform_bss_status(struct rsi_common *common,
 					      vif);
 		if (common->min_rate == 0xffff)
 			rsi_send_auto_rate_request(common, sta, sta_id, vif);
-		if (opmode == RSI_OPMODE_STA) {
-			if (!rsi_send_block_unblock_frame(common, false))
-				common->hw_data_qs_blocked = false;
-		}
+		if (opmode == RSI_OPMODE_STA &&
+		    !(assoc_cap & WLAN_CAPABILITY_PRIVACY) &&
+		    !rsi_send_block_unblock_frame(common, false))
+			common->hw_data_qs_blocked = false;
 	} else {
 		if (opmode == RSI_OPMODE_STA)
 			common->hw_data_qs_blocked = true;
@@ -1850,10 +1847,19 @@ int rsi_mgmt_pkt_recv(struct rsi_common *common, u8 *msg)
 			__func__);
 		return rsi_handle_card_ready(common, msg);
 	case TX_STATUS_IND:
-		if (msg[15] == PROBEREQ_CONFIRM) {
+		switch (msg[RSI_TX_STATUS_TYPE]) {
+		case PROBEREQ_CONFIRM:
 			common->mgmt_q_block = false;
 			rsi_dbg(FSM_ZONE, "%s: Probe confirm received\n",
 				__func__);
+			break;
+		case EAPOL4_CONFIRM:
+			if (msg[RSI_TX_STATUS]) {
+				common->eapol4_confirm = true;
+				if (!rsi_send_block_unblock_frame(common,
+								  false))
+					common->hw_data_qs_blocked = false;
+			}
 		}
 		break;
 	case BEACON_EVENT_IND:
diff --git a/drivers/net/wireless/rsi/rsi_main.h b/drivers/net/wireless/rsi/rsi_main.h
index b81cdbf05e5b..a084f224bb03 100644
--- a/drivers/net/wireless/rsi/rsi_main.h
+++ b/drivers/net/wireless/rsi/rsi_main.h
@@ -287,6 +287,7 @@ struct rsi_common {
 	struct timer_list roc_timer;
 	struct ieee80211_vif *roc_vif;
 
+	bool eapol4_confirm;
 	void *bt_adapter;
 };
 
diff --git a/drivers/net/wireless/rsi/rsi_mgmt.h b/drivers/net/wireless/rsi/rsi_mgmt.h
index 6726e84a6773..5f946f31723f 100644
--- a/drivers/net/wireless/rsi/rsi_mgmt.h
+++ b/drivers/net/wireless/rsi/rsi_mgmt.h
@@ -33,6 +33,7 @@
 #define WMM_SHORT_SLOT_TIME             9
 #define SIFS_DURATION                   16
 
+#define EAPOL4_PACKET_LEN		0x85
 #define KEY_TYPE_CLEAR                  0
 #define RSI_PAIRWISE_KEY                1
 #define RSI_GROUP_KEY                   2
@@ -62,9 +63,12 @@
 #define RX_DOT11_MGMT                   0x02
 #define TX_STATUS_IND                   0x04
 #define BEACON_EVENT_IND		0x08
+#define EAPOL4_CONFIRM                  1
 #define PROBEREQ_CONFIRM                2
 #define CARD_READY_IND                  0x00
 #define SLEEP_NOTIFY_IND                0x06
+#define RSI_TX_STATUS_TYPE		15
+#define RSI_TX_STATUS			12
 
 #define RSI_DELETE_PEER                 0x0
 #define RSI_ADD_PEER                    0x1
@@ -660,10 +664,14 @@ int rsi_set_channel(struct rsi_common *common,
 		    struct ieee80211_channel *channel);
 int rsi_send_vap_dynamic_update(struct rsi_common *common);
 int rsi_send_block_unblock_frame(struct rsi_common *common, bool event);
+int rsi_hal_send_sta_notify_frame(struct rsi_common *common, enum opmode opmode,
+				  u8 notify_event, const unsigned char *bssid,
+				  u8 qos_enable, u16 aid, u16 sta_id,
+				  struct ieee80211_vif *vif);
 void rsi_inform_bss_status(struct rsi_common *common, enum opmode opmode,
 			   u8 status, const u8 *addr, u8 qos_enable, u16 aid,
 			   struct ieee80211_sta *sta, u16 sta_id,
-			   struct ieee80211_vif *vif);
+			   u16 assoc_cap, struct ieee80211_vif *vif);
 void rsi_indicate_pkt_to_os(struct rsi_common *common, struct sk_buff *skb);
 int rsi_mac80211_attach(struct rsi_common *common);
 void rsi_indicate_tx_status(struct rsi_hw *common, struct sk_buff *skb,
-- 
cgit v1.2.3-59-g8ed1b


From 350fcdb834576162ab55519ed86572c00ea786a7 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Fri, 6 Apr 2018 11:37:17 +0300
Subject: rsi: remove unecessary PTR_ALIGN()s

The issue here is that we allocate "data" and then set
"data = PTR_ALIGN(data, 8);" and then we free the aligned pointer
instead of the original pointer.

kmalloc() pointers are already ARCH_SLAB_MINALIGN aligned which is 8 or
more on everything except certain Xtensa variants.  We decided that if
the Xtensa people ever notice a bug here then we'll tell them the bug is
on their side.  ;)

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/rsi/rsi_91x_sdio.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/net/wireless/rsi/rsi_91x_sdio.c b/drivers/net/wireless/rsi/rsi_91x_sdio.c
index d76e69c0beaa..8ef00582c6ea 100644
--- a/drivers/net/wireless/rsi/rsi_91x_sdio.c
+++ b/drivers/net/wireless/rsi/rsi_91x_sdio.c
@@ -660,8 +660,6 @@ static int rsi_sdio_master_reg_read(struct rsi_hw *adapter, u32 addr,
 	if (!data)
 		return -ENOMEM;
 
-	data = PTR_ALIGN(data, 8);
-
 	ms_addr = (addr >> 16);
 	status = rsi_sdio_master_access_msword(adapter, ms_addr);
 	if (status < 0) {
@@ -724,8 +722,6 @@ static int rsi_sdio_master_reg_write(struct rsi_hw *adapter,
 	if (!data_aligned)
 		return -ENOMEM;
 
-	data_aligned = PTR_ALIGN(data_aligned, 8);
-
 	if (size == 2) {
 		*data_aligned = ((data << 16) | (data & 0xFFFF));
 	} else if (size == 1) {
-- 
cgit v1.2.3-59-g8ed1b


From 16d3bb7b2f37b917b3c55e83d230a199f5c51864 Mon Sep 17 00:00:00 2001
From: Amitkumar Karwar <amit.karwar@redpinesignals.com>
Date: Tue, 10 Apr 2018 20:34:11 +0530
Subject: rsi: disable fw watchdog timer during reset

Firmware's watchdog timer should be disabled as a part of reset
sequence. This change fixes a firmware hang issue observed during
stress tests.

Signed-off-by: Amitkumar Karwar <amit.karwar@redpinesignals.com>
Signed-off-by: Siva Rebbagondla <siva.rebbagondla@redpinesignals.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/rsi/rsi_91x_usb.c | 7 +++++++
 drivers/net/wireless/rsi/rsi_hal.h     | 1 +
 drivers/net/wireless/rsi/rsi_usb.h     | 1 +
 3 files changed, 9 insertions(+)

diff --git a/drivers/net/wireless/rsi/rsi_91x_usb.c b/drivers/net/wireless/rsi/rsi_91x_usb.c
index 7b8bae313aa9..b065438f51b2 100644
--- a/drivers/net/wireless/rsi/rsi_91x_usb.c
+++ b/drivers/net/wireless/rsi/rsi_91x_usb.c
@@ -687,6 +687,13 @@ static int rsi_reset_card(struct rsi_hw *adapter)
 	 */
 	msleep(100);
 
+	if (rsi_usb_master_reg_write(adapter, SWBL_REGOUT,
+				     RSI_FW_WDT_DISABLE_REQ,
+				     RSI_COMMON_REG_SIZE) < 0) {
+		rsi_dbg(ERR_ZONE, "Disabling firmware watchdog timer failed\n");
+		goto fail;
+	}
+
 	ret = usb_ulp_read_write(adapter, RSI_WATCH_DOG_TIMER_1,
 				 RSI_ULP_WRITE_2, 32);
 	if (ret < 0)
diff --git a/drivers/net/wireless/rsi/rsi_hal.h b/drivers/net/wireless/rsi/rsi_hal.h
index d6c2baa56e9f..327638cdd30b 100644
--- a/drivers/net/wireless/rsi/rsi_hal.h
+++ b/drivers/net/wireless/rsi/rsi_hal.h
@@ -115,6 +115,7 @@
 #define FW_FLASH_OFFSET			0x820
 #define LMAC_VER_OFFSET			(FW_FLASH_OFFSET + 0x200)
 #define MAX_DWORD_ALIGN_BYTES		64
+#define RSI_COMMON_REG_SIZE		2
 
 struct bl_header {
 	__le32 flags;
diff --git a/drivers/net/wireless/rsi/rsi_usb.h b/drivers/net/wireless/rsi/rsi_usb.h
index a88d59295a98..b6fe79f0a513 100644
--- a/drivers/net/wireless/rsi/rsi_usb.h
+++ b/drivers/net/wireless/rsi/rsi_usb.h
@@ -26,6 +26,7 @@
 #define RSI_USB_READY_MAGIC_NUM      0xab
 #define FW_STATUS_REG                0x41050012
 #define RSI_TA_HOLD_REG              0x22000844
+#define RSI_FW_WDT_DISABLE_REQ	     0x69
 
 #define USB_VENDOR_REGISTER_READ     0x15
 #define USB_VENDOR_REGISTER_WRITE    0x16
-- 
cgit v1.2.3-59-g8ed1b


From 8c1475bdfc3ae3b5ee46d7c026be5e4409e05ea2 Mon Sep 17 00:00:00 2001
From: Amitkumar Karwar <amit.karwar@redpinesignals.com>
Date: Tue, 10 Apr 2018 20:34:12 +0530
Subject: rsi: device bootup parameter configuration

Some device configuration flags need to be enabled while sending 'bootup
params' internal frame to firmware. This patch takes care of it.

Signed-off-by: Amitkumar Karwar <amit.karwar@redpinesignals.com>
Signed-off-by: Siva Rebbagondla <siva.rebbagondla@redpinesignals.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/rsi/rsi_boot_params.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/rsi/rsi_boot_params.h b/drivers/net/wireless/rsi/rsi_boot_params.h
index 238ee96434ec..ad903b22440e 100644
--- a/drivers/net/wireless/rsi/rsi_boot_params.h
+++ b/drivers/net/wireless/rsi/rsi_boot_params.h
@@ -46,7 +46,8 @@
 	(((TA_PLL_M_VAL_20 + 1) * 40) / \
 	 ((TA_PLL_N_VAL_20 + 1) * (TA_PLL_P_VAL_20 + 1)))
 #define VALID_20 \
-	(WIFI_PLL960_CONFIGS | WIFI_AFEPLL_CONFIGS | WIFI_SWITCH_CLK_CONFIGS)
+	(WIFI_TAPLL_CONFIGS | WIFI_PLL960_CONFIGS | WIFI_AFEPLL_CONFIGS | \
+	 WIFI_SWITCH_CLK_CONFIGS | BOOTUP_MODE_INFO | CRYSTAL_GOOD_TIME)
 #define UMAC_CLK_40BW   \
 	(((TA_PLL_M_VAL_40 + 1) * 40) / \
 	 ((TA_PLL_N_VAL_40 + 1) * (TA_PLL_P_VAL_40 + 1)))
-- 
cgit v1.2.3-59-g8ed1b


From cbbfdd6c700f7c840df73199fac0c77a31b6d944 Mon Sep 17 00:00:00 2001
From: Amitkumar Karwar <amit.karwar@redpinesignals.com>
Date: Tue, 10 Apr 2018 20:34:13 +0530
Subject: rsi: use appropriate interface for power save configuration

Power save request should be sent on station interface. Virtual
interface which is connected should be preferred. This patch
resolves device not entering power save problem in certain
situations

Signed-off-by: Amitkumar Karwar <amit.karwar@redpinesignals.com>
Signed-off-by: Siva Rebbagondla <siva.rebbagondla@redpinesignals.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/rsi/rsi_91x_mac80211.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wireless/rsi/rsi_91x_mac80211.c b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
index 5edc3a9d8eeb..77aa3bb357a0 100644
--- a/drivers/net/wireless/rsi/rsi_91x_mac80211.c
+++ b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
@@ -614,7 +614,7 @@ static int rsi_mac80211_config(struct ieee80211_hw *hw,
 
 	/* Power save parameters */
 	if (changed & IEEE80211_CONF_CHANGE_PS) {
-		struct ieee80211_vif *vif;
+		struct ieee80211_vif *vif, *sta_vif = NULL;
 		unsigned long flags;
 		int i, set_ps = 1;
 
@@ -628,13 +628,17 @@ static int rsi_mac80211_config(struct ieee80211_hw *hw,
 				set_ps = 0;
 				break;
 			}
+			if ((vif->type == NL80211_IFTYPE_STATION ||
+			     vif->type == NL80211_IFTYPE_P2P_CLIENT) &&
+			    (!sta_vif || vif->bss_conf.assoc))
+				sta_vif = vif;
 		}
-		if (set_ps) {
+		if (set_ps && sta_vif) {
 			spin_lock_irqsave(&adapter->ps_lock, flags);
 			if (conf->flags & IEEE80211_CONF_PS)
-				rsi_enable_ps(adapter, vif);
+				rsi_enable_ps(adapter, sta_vif);
 			else
-				rsi_disable_ps(adapter, vif);
+				rsi_disable_ps(adapter, sta_vif);
 			spin_unlock_irqrestore(&adapter->ps_lock, flags);
 		}
 	}
-- 
cgit v1.2.3-59-g8ed1b


From a55e50f0672ec16c944b7e4f1168476661ea25a0 Mon Sep 17 00:00:00 2001
From: Amitkumar Karwar <amit.karwar@redpinesignals.com>
Date: Tue, 10 Apr 2018 20:34:14 +0530
Subject: rsi: increase max supported aggregation subframes

Maximum number of supported aggregation subframes has been increased
to 8. This is the optimal number for the driver.

Signed-off-by: Amitkumar Karwar <amit.karwar@redpinesignals.com>
Signed-off-by: Siva Rebbagondla <siva.rebbagondla@redpinesignals.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/rsi/rsi_91x_mac80211.c | 3 ++-
 drivers/net/wireless/rsi/rsi_mgmt.h         | 3 +++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/rsi/rsi_91x_mac80211.c b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
index 77aa3bb357a0..2a777435a735 100644
--- a/drivers/net/wireless/rsi/rsi_91x_mac80211.c
+++ b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
@@ -1955,7 +1955,8 @@ int rsi_mac80211_attach(struct rsi_common *common)
 	hw->uapsd_queues = RSI_IEEE80211_UAPSD_QUEUES;
 	hw->uapsd_max_sp_len = IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL;
 
-	hw->max_tx_aggregation_subframes = 6;
+	hw->max_tx_aggregation_subframes = RSI_MAX_TX_AGGR_FRMS;
+	hw->max_rx_aggregation_subframes = RSI_MAX_RX_AGGR_FRMS;
 	rsi_register_rates_channels(adapter, NL80211_BAND_2GHZ);
 	rsi_register_rates_channels(adapter, NL80211_BAND_5GHZ);
 	hw->rate_control_algorithm = "AARF";
diff --git a/drivers/net/wireless/rsi/rsi_mgmt.h b/drivers/net/wireless/rsi/rsi_mgmt.h
index 5f946f31723f..14620935c925 100644
--- a/drivers/net/wireless/rsi/rsi_mgmt.h
+++ b/drivers/net/wireless/rsi/rsi_mgmt.h
@@ -225,6 +225,9 @@
 #define RSI_WOW_DISCONNECT		BIT(5)
 #endif
 
+#define RSI_MAX_TX_AGGR_FRMS		8
+#define RSI_MAX_RX_AGGR_FRMS		8
+
 enum opmode {
 	RSI_OPMODE_UNSUPPORTED = -1,
 	RSI_OPMODE_AP = 0,
-- 
cgit v1.2.3-59-g8ed1b


From 3334306a086e6784ad32f395e5dcc9272ca0ce6e Mon Sep 17 00:00:00 2001
From: Amitkumar Karwar <amit.karwar@redpinesignals.com>
Date: Tue, 10 Apr 2018 20:34:15 +0530
Subject: rsi: parse TID from data frame correctly

Currently TID is extracted by checking at specific offset in data frame.
This approach doesn't work for some of the frames. This patch uses mac80211
API and do it correctly

Signed-off-by: Amitkumar Karwar <amit.karwar@redpinesignals.com>
Signed-off-by: Siva Rebbagondla <siva.rebbagondla@redpinesignals.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/rsi/rsi_91x_core.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/rsi/rsi_91x_core.c b/drivers/net/wireless/rsi/rsi_91x_core.c
index 3ca468b9f2b8..1f1b97220d43 100644
--- a/drivers/net/wireless/rsi/rsi_91x_core.c
+++ b/drivers/net/wireless/rsi/rsi_91x_core.c
@@ -432,7 +432,9 @@ void rsi_core_xmit(struct rsi_common *common, struct sk_buff *skb)
 		}
 	} else {
 		if (ieee80211_is_data_qos(wh->frame_control)) {
-			tid = (skb->data[24] & IEEE80211_QOS_TID);
+			u8 *qos = ieee80211_get_qos_ctl(wh);
+
+			tid = *qos & IEEE80211_QOS_CTL_TID_MASK;
 			skb->priority = TID_TO_WME_AC(tid);
 		} else {
 			tid = IEEE80211_NONQOS_TID;
-- 
cgit v1.2.3-59-g8ed1b


From 1e9c410f26a2ec3201b0f10141275afcbdaf6b69 Mon Sep 17 00:00:00 2001
From: Amitkumar Karwar <amit.karwar@redpinesignals.com>
Date: Tue, 10 Apr 2018 20:34:16 +0530
Subject: rsi: enable power save by default for coex

Power save is by default enabled for WLAN and BT coex mode.

Signed-off-by: Amitkumar Karwar <amit.karwar@redpinesignals.com>
Signed-off-by: Siva Rebbagondla <siva.rebbagondla@redpinesignals.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/rsi/rsi_91x_mac80211.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/wireless/rsi/rsi_91x_mac80211.c b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
index 2a777435a735..bb497bd63563 100644
--- a/drivers/net/wireless/rsi/rsi_91x_mac80211.c
+++ b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
@@ -2008,6 +2008,9 @@ int rsi_mac80211_attach(struct rsi_common *common)
 	wiphy->iface_combinations = rsi_iface_combinations;
 	wiphy->n_iface_combinations = ARRAY_SIZE(rsi_iface_combinations);
 
+	if (common->coex_mode > 1)
+		wiphy->flags |= WIPHY_FLAG_PS_ON_BY_DEFAULT;
+
 	status = ieee80211_register_hw(hw);
 	if (status)
 		return status;
-- 
cgit v1.2.3-59-g8ed1b


From f0b147b8f1d27c9fa543704b21c67a7a3723c7d2 Mon Sep 17 00:00:00 2001
From: Amitkumar Karwar <amit.karwar@redpinesignals.com>
Date: Tue, 10 Apr 2018 20:34:17 +0530
Subject: rsi: advertise 5GHz support based on device capability

Currently 5GHz gets advertised even for the device which supports
only 2.4Ghz band. This patch fixes the issue

Signed-off-by: Amitkumar Karwar <amit.karwar@redpinesignals.com>
Signed-off-by: Siva Rebbagondla <siva.rebbagondla@redpinesignals.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/rsi/rsi_91x_mac80211.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wireless/rsi/rsi_91x_mac80211.c b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
index bb497bd63563..766d874cc6e2 100644
--- a/drivers/net/wireless/rsi/rsi_91x_mac80211.c
+++ b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
@@ -1957,8 +1957,6 @@ int rsi_mac80211_attach(struct rsi_common *common)
 
 	hw->max_tx_aggregation_subframes = RSI_MAX_TX_AGGR_FRMS;
 	hw->max_rx_aggregation_subframes = RSI_MAX_RX_AGGR_FRMS;
-	rsi_register_rates_channels(adapter, NL80211_BAND_2GHZ);
-	rsi_register_rates_channels(adapter, NL80211_BAND_5GHZ);
 	hw->rate_control_algorithm = "AARF";
 
 	SET_IEEE80211_PERM_ADDR(hw, common->mac_addr);
@@ -1979,10 +1977,15 @@ int rsi_mac80211_attach(struct rsi_common *common)
 
 	wiphy->available_antennas_rx = 1;
 	wiphy->available_antennas_tx = 1;
+
+	rsi_register_rates_channels(adapter, NL80211_BAND_2GHZ);
 	wiphy->bands[NL80211_BAND_2GHZ] =
 		&adapter->sbands[NL80211_BAND_2GHZ];
-	wiphy->bands[NL80211_BAND_5GHZ] =
-		&adapter->sbands[NL80211_BAND_5GHZ];
+	if (common->num_supp_bands > 1) {
+		rsi_register_rates_channels(adapter, NL80211_BAND_5GHZ);
+		wiphy->bands[NL80211_BAND_5GHZ] =
+			&adapter->sbands[NL80211_BAND_5GHZ];
+	}
 
 	/* AP Parameters */
 	wiphy->max_ap_assoc_sta = rsi_max_ap_stas[common->oper_mode - 1];
-- 
cgit v1.2.3-59-g8ed1b


From f700546682a62a87a9615121a37ee7452dab4b76 Mon Sep 17 00:00:00 2001
From: Siva Rebbagondla <siva.rebbagondla@redpinesignals.com>
Date: Wed, 11 Apr 2018 12:13:31 +0530
Subject: rsi: fix nommu_map_sg overflow kernel panic

Following overflow kernel panic is observed on some platforms while
loading the driver. It is fixed if dynamically allocated memory is
passed to SDIO instead of static one

[  927.513963] nommu_map_sg: overflow 17d54064ba7c+20 of device mask ffffffff
[  927.517712] Modules linked in: rsi_sdio(+) cmac bnep arc4 rsi_91x mac80211 cfg80211
	       btrsi rfcomm bluetooth ecdh_generic snd_soc_sst_bytcr_rt5660
[  927.517861] CPU: 0 PID: 1624 Comm: insmod Tainted: G W 4.15.0-1000 #1
[  927.517870] RIP: 0010:sdhci_send_command+0x5f0/0xa90 [sdhci]
[  927.517873] RSP: 0000:ffffac3fc064b6d8 EFLAGS: 00010086
[  927.517895] Call Trace:
[  927.517908]  ? __schedule+0x3cd/0x890
[  927.517915]  ? mod_timer+0x17b/0x3c0
[  927.517922]  sdhci_request+0x7c/0xf0 [sdhci]
[  927.517928]  __mmc_start_request+0x5a/0x170
[  927.517932]  mmc_start_request+0x74/0x90
[  927.517936]  mmc_wait_for_req+0x87/0xe0
[  927.517940]  mmc_io_rw_extended+0x2fd/0x330
[  927.517946]  ? mmc_wait_data_done+0x30/0x30
[  927.517951]  sdio_io_rw_ext_helper+0x160/0x210
[  927.517956]  sdio_writesb+0x1d/0x20
[  927.517966]	rsi_sdio_write_register_multiple+0x68/0x110 [rsi_sdio]
[  927.517976]  rsi_hal_device_init+0x357/0x910 [rsi_91x]
[  927.517983]  ? rsi_hal_device_init+0x357/0x910 [rsi_91x]
[  927.517990]  rsi_probe+0x2c6/0x450 [rsi_sdio]
[  927.517995]  sdio_bus_probe+0xfc/0x110
[  927.518000]  driver_probe_device+0x2b3/0x490
[  927.518005]  __driver_attach+0xdf/0xf0
[  927.518008]  ? driver_probe_device+0x490/0x490
[  927.518014]  bus_for_each_dev+0x6c/0xc0
[  927.518018]  driver_attach+0x1e/0x20
[  927.518021]  bus_add_driver+0x1f4/0x270
[  927.518028]  ? rsi_sdio_ack_intr+0x50/0x50 [rsi_sdio]
[  927.518031]  driver_register+0x60/0xe0
[  927.518038]  ? rsi_sdio_ack_intr+0x50/0x50 [rsi_sdio]
[  927.518041]  sdio_register_driver+0x20/0x30
[  927.518047]  rsi_module_init+0x16/0x40 [rsi_sdio]

Signed-off-by: Siva Rebbagondla <siva.rebbagondla@redpinesignals.com>
Signed-off-by: Amitkumar Karwar <amit.karwar@redpinesignals.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/rsi/rsi_91x_hal.c  | 35 ++++++++++++++++++++-------------
 drivers/net/wireless/rsi/rsi_91x_sdio.c | 21 +++++++++++++-------
 drivers/net/wireless/rsi/rsi_sdio.h     |  2 +-
 3 files changed, 36 insertions(+), 22 deletions(-)

diff --git a/drivers/net/wireless/rsi/rsi_91x_hal.c b/drivers/net/wireless/rsi/rsi_91x_hal.c
index b7c54037f369..0761e61591bd 100644
--- a/drivers/net/wireless/rsi/rsi_91x_hal.c
+++ b/drivers/net/wireless/rsi/rsi_91x_hal.c
@@ -635,28 +635,32 @@ static int bl_write_header(struct rsi_hw *adapter, u8 *flash_content,
 			   u32 content_size)
 {
 	struct rsi_host_intf_ops *hif_ops = adapter->host_intf_ops;
-	struct bl_header bl_hdr;
+	struct bl_header *bl_hdr;
 	u32 write_addr, write_len;
 	int status;
 
-	bl_hdr.flags = 0;
-	bl_hdr.image_no = cpu_to_le32(adapter->priv->coex_mode);
-	bl_hdr.check_sum = cpu_to_le32(
-				*(u32 *)&flash_content[CHECK_SUM_OFFSET]);
-	bl_hdr.flash_start_address = cpu_to_le32(
-					*(u32 *)&flash_content[ADDR_OFFSET]);
-	bl_hdr.flash_len = cpu_to_le32(*(u32 *)&flash_content[LEN_OFFSET]);
+	bl_hdr = kzalloc(sizeof(*bl_hdr), GFP_KERNEL);
+	if (!bl_hdr)
+		return -ENOMEM;
+
+	bl_hdr->flags = 0;
+	bl_hdr->image_no = cpu_to_le32(adapter->priv->coex_mode);
+	bl_hdr->check_sum =
+		cpu_to_le32(*(u32 *)&flash_content[CHECK_SUM_OFFSET]);
+	bl_hdr->flash_start_address =
+		cpu_to_le32(*(u32 *)&flash_content[ADDR_OFFSET]);
+	bl_hdr->flash_len = cpu_to_le32(*(u32 *)&flash_content[LEN_OFFSET]);
 	write_len = sizeof(struct bl_header);
 
 	if (adapter->rsi_host_intf == RSI_HOST_INTF_USB) {
 		write_addr = PING_BUFFER_ADDRESS;
 		status = hif_ops->write_reg_multiple(adapter, write_addr,
-						 (u8 *)&bl_hdr, write_len);
+						 (u8 *)bl_hdr, write_len);
 		if (status < 0) {
 			rsi_dbg(ERR_ZONE,
 				"%s: Failed to load Version/CRC structure\n",
 				__func__);
-			return status;
+			goto fail;
 		}
 	} else {
 		write_addr = PING_BUFFER_ADDRESS >> 16;
@@ -665,20 +669,23 @@ static int bl_write_header(struct rsi_hw *adapter, u8 *flash_content,
 			rsi_dbg(ERR_ZONE,
 				"%s: Unable to set ms word to common reg\n",
 				__func__);
-			return status;
+			goto fail;
 		}
 		write_addr = RSI_SD_REQUEST_MASTER |
 			     (PING_BUFFER_ADDRESS & 0xFFFF);
 		status = hif_ops->write_reg_multiple(adapter, write_addr,
-						 (u8 *)&bl_hdr, write_len);
+						 (u8 *)bl_hdr, write_len);
 		if (status < 0) {
 			rsi_dbg(ERR_ZONE,
 				"%s: Failed to load Version/CRC structure\n",
 				__func__);
-			return status;
+			goto fail;
 		}
 	}
-	return 0;
+	status = 0;
+fail:
+	kfree(bl_hdr);
+	return status;
 }
 
 static u32 read_flash_capacity(struct rsi_hw *adapter)
diff --git a/drivers/net/wireless/rsi/rsi_91x_sdio.c b/drivers/net/wireless/rsi/rsi_91x_sdio.c
index 8ef00582c6ea..f7f282098696 100644
--- a/drivers/net/wireless/rsi/rsi_91x_sdio.c
+++ b/drivers/net/wireless/rsi/rsi_91x_sdio.c
@@ -1038,17 +1038,21 @@ static void ulp_read_write(struct rsi_hw *adapter, u16 addr, u32 data,
 /*This function resets and re-initializes the chip.*/
 static void rsi_reset_chip(struct rsi_hw *adapter)
 {
-	__le32 data;
+	u8 *data;
 	u8 sdio_interrupt_status = 0;
 	u8 request = 1;
 	int ret;
 
+	data = kzalloc(sizeof(u32), GFP_KERNEL);
+	if (!data)
+		return;
+
 	rsi_dbg(INFO_ZONE, "Writing disable to wakeup register\n");
 	ret =  rsi_sdio_write_register(adapter, 0, SDIO_WAKEUP_REG, &request);
 	if (ret < 0) {
 		rsi_dbg(ERR_ZONE,
 			"%s: Failed to write SDIO wakeup register\n", __func__);
-		return;
+		goto err;
 	}
 	msleep(20);
 	ret =  rsi_sdio_read_register(adapter, RSI_FN1_INT_REGISTER,
@@ -1056,7 +1060,7 @@ static void rsi_reset_chip(struct rsi_hw *adapter)
 	if (ret < 0) {
 		rsi_dbg(ERR_ZONE, "%s: Failed to Read Intr Status Register\n",
 			__func__);
-		return;
+		goto err;
 	}
 	rsi_dbg(INFO_ZONE, "%s: Intr Status Register value = %d\n",
 		__func__, sdio_interrupt_status);
@@ -1066,17 +1070,17 @@ static void rsi_reset_chip(struct rsi_hw *adapter)
 		rsi_dbg(ERR_ZONE,
 			"%s: Unable to set ms word to common reg\n",
 			__func__);
-		return;
+		goto err;
 	}
 
-	data = TA_HOLD_THREAD_VALUE;
+	put_unaligned_le32(TA_HOLD_THREAD_VALUE, data);
 	if (rsi_sdio_write_register_multiple(adapter, TA_HOLD_THREAD_REG |
 					     RSI_SD_REQUEST_MASTER,
-					     (u8 *)&data, 4)) {
+					     data, 4)) {
 		rsi_dbg(ERR_ZONE,
 			"%s: Unable to hold Thread-Arch processor threads\n",
 			__func__);
-		return;
+		goto err;
 	}
 
 	/* This msleep will ensure Thread-Arch processor to go to hold
@@ -1097,6 +1101,9 @@ static void rsi_reset_chip(struct rsi_hw *adapter)
 	 * read write operations to complete for chip reset.
 	 */
 	msleep(500);
+err:
+	kfree(data);
+	return;
 }
 
 /**
diff --git a/drivers/net/wireless/rsi/rsi_sdio.h b/drivers/net/wireless/rsi/rsi_sdio.h
index ead8e7c4df3a..353dbdf31e75 100644
--- a/drivers/net/wireless/rsi/rsi_sdio.h
+++ b/drivers/net/wireless/rsi/rsi_sdio.h
@@ -87,7 +87,7 @@ enum sdio_interrupt_type {
 #define TA_SOFT_RST_CLR              0
 #define TA_SOFT_RST_SET              BIT(0)
 #define TA_PC_ZERO                   0
-#define TA_HOLD_THREAD_VALUE         cpu_to_le32(0xF)
+#define TA_HOLD_THREAD_VALUE         0xF
 #define TA_RELEASE_THREAD_VALUE      cpu_to_le32(0xF)
 #define TA_BASE_ADDR                 0x2200
 #define MISC_CFG_BASE_ADDR           0x4105
-- 
cgit v1.2.3-59-g8ed1b


From 78e450719c702784e42af6da912d3692fd3da0cb Mon Sep 17 00:00:00 2001
From: Siva Rebbagondla <siva.rebbagondla@redpinesignals.com>
Date: Wed, 11 Apr 2018 12:13:32 +0530
Subject: rsi: Fix 'invalid vdd' warning in mmc

While performing cleanup, driver is messing with card->ocr
value by not masking rocr against ocr_avail. Below panic
is observed with some of the SDIO host controllers due to
this. Issue is resolved by reverting incorrect modifications
to vdd.

[  927.423821] mmc1: Invalid vdd 0x1f
[  927.423925] Modules linked in: rsi_sdio(+) cmac bnep arc4 rsi_91x
	       mac80211 cfg80211 btrsi rfcomm bluetooth ecdh_generic
[  927.424073] CPU: 0 PID: 1624 Comm: insmod Tainted: G		W        4.15.0-1000-caracalla #1
[  927.424075] Hardware name: Dell Inc. Edge Gateway	3003/      , BIOS 01.00.06 01/22/2018
[  927.424082] RIP: 0010:sdhci_set_power_noreg+0xdd/0x190[sdhci]
[  927.424085] RSP: 0018:ffffac3fc064b930 EFLAGS:  00010282
[  927.424107] Call Trace:
[  927.424118]  sdhci_set_power+0x5a/0x60 [sdhci]
[  927.424125]  sdhci_set_ios+0x360/0x3b0 [sdhci]
[  927.424133]  mmc_set_initial_state+0x92/0x120
[  927.424137]  mmc_power_up.part.34+0x33/0x1d0
[  927.424141]  mmc_power_up+0x17/0x20
[  927.424147]  mmc_sdio_runtime_resume+0x2d/0x50
[  927.424151]  mmc_runtime_resume+0x17/0x20
[  927.424156]  __rpm_callback+0xc4/0x200
[  927.424161]  ? idr_alloc_cyclic+0x57/0xd0
[  927.424165]  ? mmc_runtime_suspend+0x20/0x20
[  927.424169]  rpm_callback+0x24/0x80
[  927.424172]  ? mmc_runtime_suspend+0x20/0x20
[  927.424176]  rpm_resume+0x4b3/0x6c0
[  927.424181]  __pm_runtime_resume+0x4e/0x80
[  927.424188]  driver_probe_device+0x41/0x490
[  927.424192]  __driver_attach+0xdf/0xf0
[  927.424196]  ? driver_probe_device+0x490/0x490
[  927.424201]  bus_for_each_dev+0x6c/0xc0
[  927.424205]  driver_attach+0x1e/0x20
[  927.424209]  bus_add_driver+0x1f4/0x270
[  927.424217]  ? rsi_sdio_ack_intr+0x50/0x50 [rsi_sdio]
[  927.424221]  driver_register+0x60/0xe0
[  927.424227]  ? rsi_sdio_ack_intr+0x50/0x50 [rsi_sdio]
[  927.424231]  sdio_register_driver+0x20/0x30
[  927.424237]  rsi_module_init+0x16/0x40 [rsi_sdio]

Signed-off-by: Siva Rebbagondla <siva.rebbagondla@redpinesignals.com>
Signed-off-by: Amitkumar Karwar <amit.karwar@redpinesignals.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/rsi/rsi_91x_sdio.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/wireless/rsi/rsi_91x_sdio.c b/drivers/net/wireless/rsi/rsi_91x_sdio.c
index f7f282098696..416981d99229 100644
--- a/drivers/net/wireless/rsi/rsi_91x_sdio.c
+++ b/drivers/net/wireless/rsi/rsi_91x_sdio.c
@@ -170,7 +170,6 @@ static void rsi_reset_card(struct sdio_func *pfunction)
 	int err;
 	struct mmc_card *card = pfunction->card;
 	struct mmc_host *host = card->host;
-	s32 bit = (fls(host->ocr_avail) - 1);
 	u8 cmd52_resp;
 	u32 clock, resp, i;
 	u16 rca;
@@ -190,7 +189,6 @@ static void rsi_reset_card(struct sdio_func *pfunction)
 	msleep(20);
 
 	/* Initialize the SDIO card */
-	host->ios.vdd = bit;
 	host->ios.chip_select = MMC_CS_DONTCARE;
 	host->ios.bus_mode = MMC_BUSMODE_OPENDRAIN;
 	host->ios.power_mode = MMC_POWER_UP;
-- 
cgit v1.2.3-59-g8ed1b


From 8cf304a85378b54b8a86bf2ed130da56dd59b251 Mon Sep 17 00:00:00 2001
From: Ping-Ke Shih <pkshih@realtek.com>
Date: Mon, 9 Apr 2018 15:51:31 +0800
Subject: rtlwifi: btcoex: remove identical statements within if-else branches

Since the statements are identical, we can safely remove the statements.
The commit 42e74946f016 ("rtlwifi: btcoexist: Fix if == else warnings in
halbtc8821a1ant.c") had fixed the statements, but the commit c6821613e653
("rtlwifi: btcoex: follow linux coding style") that converted coding style
and upgraded btcoex didn't include the fix.

Reported-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 .../realtek/rtlwifi/btcoexist/halbtc8821a1ant.c       | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8821a1ant.c b/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8821a1ant.c
index 202597cf8915..b5d65872db84 100644
--- a/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8821a1ant.c
+++ b/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8821a1ant.c
@@ -1584,11 +1584,7 @@ static void btc8821a1ant_act_bt_sco_hid_only_busy(struct btc_coexist *btcoexist,
 	/* tdma and coex table */
 	btc8821a1ant_ps_tdma(btcoexist, NORMAL_EXEC, true, 5);
 
-	if (BT_8821A_1ANT_WIFI_STATUS_NON_CONNECTED_ASSO_AUTH_SCAN ==
-	    wifi_status)
-		btc8821a1ant_coex_table_with_type(btcoexist, NORMAL_EXEC, 1);
-	else
-		btc8821a1ant_coex_table_with_type(btcoexist, NORMAL_EXEC, 1);
+	btc8821a1ant_coex_table_with_type(btcoexist, NORMAL_EXEC, 1);
 }
 
 static void btc8821a1ant_act_wifi_con_bt_acl_busy(struct btc_coexist *btcoexist,
@@ -1991,16 +1987,9 @@ static void btc8821a1ant_run_coexist_mechanism(struct btc_coexist *btcoexist)
 			wifi_rssi_state =
 				btc8821a1ant_wifi_rssi_state(btcoexist, 1, 2,
 							     30, 0);
-			if ((wifi_rssi_state == BTC_RSSI_STATE_HIGH) ||
-			    (wifi_rssi_state == BTC_RSSI_STATE_STAY_HIGH)) {
-				btc8821a1ant_limited_tx(btcoexist,
-							NORMAL_EXEC, 1, 1,
-							0, 1);
-			} else {
-				btc8821a1ant_limited_tx(btcoexist,
-							NORMAL_EXEC, 1, 1,
-							0, 1);
-			}
+			btc8821a1ant_limited_tx(btcoexist,
+						NORMAL_EXEC, 1, 1,
+						0, 1);
 		} else {
 			btc8821a1ant_limited_tx(btcoexist, NORMAL_EXEC,
 						0, 0, 0, 0);
-- 
cgit v1.2.3-59-g8ed1b


From 5156b054bcdab19dc2681fb606011378e9dd37e2 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Fri, 30 Mar 2018 16:12:23 -0500
Subject: mt7601u: phy: mark expected switch fall-through

In preparation to enabling -Wimplicit-fallthrough, mark switch cases
where we are expecting to fall through.

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Acked-by: Jakub Kicinski <kubakici@wp.pl>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt7601u/phy.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wireless/mediatek/mt7601u/phy.c b/drivers/net/wireless/mediatek/mt7601u/phy.c
index ca09a5d4305e..9a90f1f09fb0 100644
--- a/drivers/net/wireless/mediatek/mt7601u/phy.c
+++ b/drivers/net/wireless/mediatek/mt7601u/phy.c
@@ -795,6 +795,7 @@ mt7601u_phy_rf_pa_mode_val(struct mt7601u_dev *dev, int phy_mode, int tx_rate)
 	switch (phy_mode) {
 	case MT_PHY_TYPE_OFDM:
 		tx_rate += 4;
+		/* fall through */
 	case MT_PHY_TYPE_CCK:
 		reg = dev->rf_pa_mode[0];
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From 2cb161094ef940f191cf60cd512783269572b4a7 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Tue, 3 Apr 2018 16:45:31 +0200
Subject: mt76x2: fix tssi initialization for 5GHz band

Fix mcu initial configuration for tssi calibration on 5GHz band

Fixes: 7bc04215a66b ("mt76: add driver code for MT76x2e")
Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Acked-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/mt76x2_phy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_phy.c b/drivers/net/wireless/mediatek/mt76/mt76x2_phy.c
index fcc37eb7ce0b..f28c55746cea 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_phy.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_phy.c
@@ -180,7 +180,7 @@ mt76x2_phy_tssi_init_cal(struct mt76x2_dev *dev)
 	if (mt76x2_channel_silent(dev))
 		return false;
 
-	if (chan->band == NL80211_BAND_2GHZ)
+	if (chan->band == NL80211_BAND_5GHZ)
 		flag |= BIT(0);
 
 	if (mt76x2_ext_pa_enabled(dev, chan->band))
-- 
cgit v1.2.3-59-g8ed1b


From 00dfae9a0ab7594b77d39725ebff677900695d70 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Tue, 3 Apr 2018 17:01:43 +0200
Subject: mt76x2: make mt76x2_mac_reset routine static

Add static qualifier to mt76x2_mac_reset routine and remove the
prototype from mt76x2_mac.h since it is used just in mt76x2_init.c

Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Acked-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/mt76x2_init.c | 2 +-
 drivers/net/wireless/mediatek/mt76/mt76x2_mac.h  | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_init.c b/drivers/net/wireless/mediatek/mt76/mt76x2_init.c
index 934c331d995e..359b105235b3 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_init.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_init.c
@@ -228,7 +228,7 @@ mt76x2_init_beacon_offsets(struct mt76x2_dev *dev)
 		mt76_wr(dev, MT_BCN_OFFSET(i), regs[i]);
 }
 
-int mt76x2_mac_reset(struct mt76x2_dev *dev, bool hard)
+static int mt76x2_mac_reset(struct mt76x2_dev *dev, bool hard)
 {
 	static const u8 null_addr[ETH_ALEN] = {};
 	const u8 *macaddr = dev->mt76.macaddr;
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_mac.h b/drivers/net/wireless/mediatek/mt76/mt76x2_mac.h
index 8a8a25e32d5f..c048cd06df6b 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_mac.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_mac.h
@@ -157,7 +157,6 @@ mt76x2_skb_tx_info(struct sk_buff *skb)
 	return (void *) info->status.status_driver_data;
 }
 
-int mt76x2_mac_reset(struct mt76x2_dev *dev, bool hard);
 int mt76x2_mac_start(struct mt76x2_dev *dev);
 void mt76x2_mac_stop(struct mt76x2_dev *dev, bool force);
 void mt76x2_mac_resume(struct mt76x2_dev *dev);
-- 
cgit v1.2.3-59-g8ed1b


From 11b2a25f02b700027aa89ff27c807ec756df01f6 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 3 Apr 2018 21:52:48 +0200
Subject: mt76: stop tx queues from the driver callback instead of common code

Allows the driver to control whether to send a BlockAckReq after waking
up. MT76x2 needs this set to true, for MT7603 (to be submitted later)
it needs to be false.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/mac80211.c    | 6 ++----
 drivers/net/wireless/mediatek/mt76/mt76x2_main.c | 1 +
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c
index 4f30cdcd2b53..962ed8234065 100644
--- a/drivers/net/wireless/mediatek/mt76/mac80211.c
+++ b/drivers/net/wireless/mediatek/mt76/mac80211.c
@@ -541,12 +541,10 @@ mt76_check_ps(struct mt76_dev *dev, struct sk_buff *skb)
 	if (!!test_bit(MT_WCID_FLAG_PS, &wcid->flags) == ps)
 		return;
 
-	if (ps) {
+	if (ps)
 		set_bit(MT_WCID_FLAG_PS, &wcid->flags);
-		mt76_stop_tx_queues(dev, sta, true);
-	} else {
+	else
 		clear_bit(MT_WCID_FLAG_PS, &wcid->flags);
-	}
 
 	ieee80211_sta_ps_transition(sta, ps);
 	dev->drv->sta_ps(dev, sta, ps);
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_main.c b/drivers/net/wireless/mediatek/mt76/mt76x2_main.c
index 73c127f92613..81c58f865c64 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_main.c
@@ -321,6 +321,7 @@ mt76x2_sta_ps(struct mt76_dev *mdev, struct ieee80211_sta *sta, bool ps)
 	struct mt76x2_dev *dev = container_of(mdev, struct mt76x2_dev, mt76);
 	int idx = msta->wcid.idx;
 
+	mt76_stop_tx_queues(&dev->mt76, sta, true);
 	mt76x2_mac_wcid_set_drop(dev, idx, ps);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 49149d3f1c6bfb314db1f386cf5067e5151ee0aa Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 3 Apr 2018 21:52:49 +0200
Subject: mt76: add missing VHT maximum A-MPDU length capability

Signficantly improves throughput with some clients

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/mac80211.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c
index 962ed8234065..ec531ce50d01 100644
--- a/drivers/net/wireless/mediatek/mt76/mac80211.c
+++ b/drivers/net/wireless/mediatek/mt76/mac80211.c
@@ -213,7 +213,8 @@ mt76_init_sband(struct mt76_dev *dev, struct mt76_sband *msband,
 	vht_cap->vht_supported = true;
 	vht_cap->cap |= IEEE80211_VHT_CAP_RXLDPC |
 			IEEE80211_VHT_CAP_RXSTBC_1 |
-			IEEE80211_VHT_CAP_SHORT_GI_80;
+			IEEE80211_VHT_CAP_SHORT_GI_80 |
+			(3 << IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT);
 
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 9f67c277a80f37a6594ae4ea3b78157773e1a23b Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 3 Apr 2018 21:52:50 +0200
Subject: mt76: toggle driver station powersave bit before notifying mac80211

Avoids race conditions from mac80211 enqueueing tx packets before the
tx-drop bit is cleared

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/mac80211.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c
index ec531ce50d01..eb49e0a6758c 100644
--- a/drivers/net/wireless/mediatek/mt76/mac80211.c
+++ b/drivers/net/wireless/mediatek/mt76/mac80211.c
@@ -547,8 +547,8 @@ mt76_check_ps(struct mt76_dev *dev, struct sk_buff *skb)
 	else
 		clear_bit(MT_WCID_FLAG_PS, &wcid->flags);
 
-	ieee80211_sta_ps_transition(sta, ps);
 	dev->drv->sta_ps(dev, sta, ps);
+	ieee80211_sta_ps_transition(sta, ps);
 }
 
 void mt76_rx_complete(struct mt76_dev *dev, struct sk_buff_head *frames,
-- 
cgit v1.2.3-59-g8ed1b


From 80f28994f7d9c01e2af3d8584874cfc1904a7553 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 3 Apr 2018 21:52:51 +0200
Subject: mt76: rework tx power handling

There were a number of issues in the existing tx power handling code:

1. The EEPROM target power for chain 0 refers to the actual output power
after the channel and bandwidth delta have been added to the initial
channel gain. This means the delta values should not be added/subtracted
for the rate power calculation

2. When power is reduced significantly, the initial channel gain
underflows very quickly, while the per-rate power offsets are high.
This miscalculation causes effective tx power to be increased when it
should actually be lowered.

Fix this by trying to adjust the channel gain to the lowest power from
the rate table. In case of under- or overflow, compensate by adding an
appropriate delta to the rate power values.

This makes power configuration more accurate on a wider range of
configurable values

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/mt76x2_phy.c | 67 ++++++++++++++++---------
 1 file changed, 42 insertions(+), 25 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_phy.c b/drivers/net/wireless/mediatek/mt76/mt76x2_phy.c
index f28c55746cea..a4280a9f6956 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_phy.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_phy.c
@@ -73,16 +73,6 @@ int mt76x2_phy_get_rssi(struct mt76x2_dev *dev, s8 rssi, int chain)
 	return rssi;
 }
 
-static u8
-mt76x2_txpower_check(int value)
-{
-	if (value < 0)
-		return 0;
-	if (value > 0x2f)
-		return 0x2f;
-	return value;
-}
-
 static void
 mt76x2_add_rate_power_offset(struct mt76_rate_power *r, int offset)
 {
@@ -102,6 +92,26 @@ mt76x2_limit_rate_power(struct mt76_rate_power *r, int limit)
 			r->all[i] = limit;
 }
 
+static int
+mt76x2_get_min_rate_power(struct mt76_rate_power *r)
+{
+	int i;
+	s8 ret = 0;
+
+	for (i = 0; i < sizeof(r->all); i++) {
+		if (!r->all[i])
+			continue;
+
+		if (ret)
+			ret = min(ret, r->all[i]);
+		else
+			ret = r->all[i];
+	}
+
+	return ret;
+}
+
+
 void mt76x2_phy_set_txpower(struct mt76x2_dev *dev)
 {
 	enum nl80211_chan_width width = dev->mt76.chandef.width;
@@ -109,6 +119,7 @@ void mt76x2_phy_set_txpower(struct mt76x2_dev *dev)
 	struct mt76x2_tx_power_info txp;
 	int txp_0, txp_1, delta = 0;
 	struct mt76_rate_power t = {};
+	int base_power, gain;
 
 	mt76x2_get_power_info(dev, &txp, chan);
 
@@ -117,26 +128,32 @@ void mt76x2_phy_set_txpower(struct mt76x2_dev *dev)
 	else if (width == NL80211_CHAN_WIDTH_80)
 		delta = txp.delta_bw80;
 
-	if (txp.target_power > dev->txpower_conf)
-		delta -= txp.target_power - dev->txpower_conf;
-
 	mt76x2_get_rate_power(dev, &t, chan);
-	mt76x2_add_rate_power_offset(&t, txp.chain[0].target_power +
-				   txp.chain[0].delta);
+	mt76x2_add_rate_power_offset(&t, txp.chain[0].target_power);
 	mt76x2_limit_rate_power(&t, dev->txpower_conf);
 	dev->txpower_cur = mt76x2_get_max_rate_power(&t);
-	mt76x2_add_rate_power_offset(&t, -(txp.chain[0].target_power +
-					 txp.chain[0].delta + delta));
-	dev->target_power = txp.chain[0].target_power;
-	dev->target_power_delta[0] = txp.chain[0].delta + delta;
-	dev->target_power_delta[1] = txp.chain[1].delta + delta;
-	dev->rate_power = t;
 
-	txp_0 = mt76x2_txpower_check(txp.chain[0].target_power +
-				   txp.chain[0].delta + delta);
+	base_power = mt76x2_get_min_rate_power(&t);
+	delta += base_power - txp.chain[0].target_power;
+	txp_0 = txp.chain[0].target_power + txp.chain[0].delta + delta;
+	txp_1 = txp.chain[1].target_power + txp.chain[1].delta + delta;
+
+	gain = min(txp_0, txp_1);
+	if (gain < 0) {
+		base_power -= gain;
+		txp_0 -= gain;
+		txp_1 -= gain;
+	} else if (gain > 0x2f) {
+		base_power -= gain - 0x2f;
+		txp_0 = 0x2f;
+		txp_1 = 0x2f;
+	}
 
-	txp_1 = mt76x2_txpower_check(txp.chain[1].target_power +
-				   txp.chain[1].delta + delta);
+	mt76x2_add_rate_power_offset(&t, -base_power);
+	dev->target_power = txp.chain[0].target_power;
+	dev->target_power_delta[0] = txp_0 - txp.chain[0].target_power;
+	dev->target_power_delta[1] = txp_1 - txp.chain[0].target_power;
+	dev->rate_power = t;
 
 	mt76_rmw_field(dev, MT_TX_ALC_CFG_0, MT_TX_ALC_CFG_0_CH_INIT_0, txp_0);
 	mt76_rmw_field(dev, MT_TX_ALC_CFG_0, MT_TX_ALC_CFG_0_CH_INIT_1, txp_1);
-- 
cgit v1.2.3-59-g8ed1b


From 07073a2768ac8ef1360f715a68c3f7b7d1f23850 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 3 Apr 2018 21:52:52 +0200
Subject: mt76: fix potential sleep in atomic context

Use cancel_delayed_work instead of cancel_delayed_work_sync

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/agg-rx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/mediatek/mt76/agg-rx.c b/drivers/net/wireless/mediatek/mt76/agg-rx.c
index fcb208d1f276..cbac42cb536c 100644
--- a/drivers/net/wireless/mediatek/mt76/agg-rx.c
+++ b/drivers/net/wireless/mediatek/mt76/agg-rx.c
@@ -273,7 +273,7 @@ static void mt76_rx_aggr_shutdown(struct mt76_dev *dev, struct mt76_rx_tid *tid)
 
 	spin_unlock_bh(&tid->lock);
 
-	cancel_delayed_work_sync(&tid->reorder_work);
+	cancel_delayed_work(&tid->reorder_work);
 }
 
 void mt76_rx_aggr_stop(struct mt76_dev *dev, struct mt76_wcid *wcid, u8 tidno)
-- 
cgit v1.2.3-59-g8ed1b


From 18efed59fabc0d0f6dd9888b5f0e8102c8332685 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 3 Apr 2018 21:52:53 +0200
Subject: mt76: set RX_FLAG_DUP_VALIDATED for A-MPDU reordered packets

Required for fast-rx and allows mac80211 to skip an unnnecessary check.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/agg-rx.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wireless/mediatek/mt76/agg-rx.c b/drivers/net/wireless/mediatek/mt76/agg-rx.c
index cbac42cb536c..6657ec8928de 100644
--- a/drivers/net/wireless/mediatek/mt76/agg-rx.c
+++ b/drivers/net/wireless/mediatek/mt76/agg-rx.c
@@ -169,6 +169,7 @@ void mt76_rx_aggr_reorder(struct sk_buff *skb, struct sk_buff_head *frames)
 	if (!tid)
 		return;
 
+	status->flag |= RX_FLAG_DUP_VALIDATED;
 	spin_lock_bh(&tid->lock);
 
 	if (tid->stopped)
-- 
cgit v1.2.3-59-g8ed1b


From 1af83148a4fc31858383e8ee867f5b9f9c2432cd Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 3 Apr 2018 21:52:54 +0200
Subject: mt76: check qos ack policy before reordering packets

Do not attempt to reorder packets not part of a BA session

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/agg-rx.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/mediatek/mt76/agg-rx.c b/drivers/net/wireless/mediatek/mt76/agg-rx.c
index 6657ec8928de..dbf4057d2d3e 100644
--- a/drivers/net/wireless/mediatek/mt76/agg-rx.c
+++ b/drivers/net/wireless/mediatek/mt76/agg-rx.c
@@ -147,12 +147,13 @@ mt76_rx_aggr_check_ctl(struct sk_buff *skb, struct sk_buff_head *frames)
 void mt76_rx_aggr_reorder(struct sk_buff *skb, struct sk_buff_head *frames)
 {
 	struct mt76_rx_status *status = (struct mt76_rx_status *) skb->cb;
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
 	struct mt76_wcid *wcid = status->wcid;
 	struct ieee80211_sta *sta;
 	struct mt76_rx_tid *tid;
 	bool sn_less;
 	u16 seqno, head, size;
-	u8 idx;
+	u8 ackp, idx;
 
 	__skb_queue_tail(frames, skb);
 
@@ -165,6 +166,12 @@ void mt76_rx_aggr_reorder(struct sk_buff *skb, struct sk_buff_head *frames)
 		return;
 	}
 
+	/* not part of a BA session */
+	ackp = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_ACK_POLICY_MASK;
+	if (ackp != IEEE80211_QOS_CTL_ACK_POLICY_BLOCKACK &&
+	    ackp != IEEE80211_QOS_CTL_ACK_POLICY_NORMAL)
+		return;
+
 	tid = rcu_dereference(wcid->aggr[status->tid]);
 	if (!tid)
 		return;
-- 
cgit v1.2.3-59-g8ed1b


From cf7b411dcea8849e1802bd1ac35249b33050afed Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Tue, 3 Apr 2018 22:30:19 +0200
Subject: mt76x2: remove unnecessary MT_TX_ALC_CFG_4 configuration

Remove unnecessary MT_TX_ALC_CFG_4 configuration since the register value
will be properly set according to the usage of an external power amplifier

Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Acked-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/mt76x2_phy.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_phy.c b/drivers/net/wireless/mediatek/mt76/mt76x2_phy.c
index a4280a9f6956..79c698ada818 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_phy.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_phy.c
@@ -274,7 +274,6 @@ mt76x2_phy_set_txpower_regs(struct mt76x2_dev *dev, enum nl80211_band band)
 			mt76_wr(dev, MT_TX_ALC_CFG_2, 0x1b0f0400);
 			mt76_wr(dev, MT_TX_ALC_CFG_3, 0x1b0f0476);
 		}
-		mt76_wr(dev, MT_TX_ALC_CFG_4, 0);
 
 		if (mt76x2_ext_pa_enabled(dev, band))
 			pa_mode_adj = 0x04000000;
-- 
cgit v1.2.3-59-g8ed1b


From bcb0f68ae2eda3f4e124aa25bd0f5ae7afaba33c Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Wed, 4 Apr 2018 10:23:46 +0200
Subject: mt76x2: fix tx_alc_enabled check

Fix mt76x2_temp_tx_alc_enabled routine since in order to enable tx_alc
temperature compensation it necessary to take into account BIT(15) of
MT_EE_TX_POWER_EXT_PA_5G eeprom info

Fixes: 7bc04215a66b ("mt76: add driver code for MT76x2e")
Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/mt76x2_eeprom.c | 6 +-----
 drivers/net/wireless/mediatek/mt76/mt76x2_eeprom.h | 6 ++++++
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_eeprom.c b/drivers/net/wireless/mediatek/mt76/mt76x2_eeprom.c
index 5bb50027c1e8..95d5f7d888f0 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_eeprom.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_eeprom.c
@@ -609,17 +609,13 @@ int mt76x2_get_temp_comp(struct mt76x2_dev *dev, struct mt76x2_temp_comp *t)
 
 	memset(t, 0, sizeof(*t));
 
-	val = mt76x2_eeprom_get(dev, MT_EE_NIC_CONF_1);
-	if (!(val & MT_EE_NIC_CONF_1_TEMP_TX_ALC))
+	if (!mt76x2_temp_tx_alc_enabled(dev))
 		return -EINVAL;
 
 	if (!mt76x2_ext_pa_enabled(dev, band))
 		return -EINVAL;
 
 	val = mt76x2_eeprom_get(dev, MT_EE_TX_POWER_EXT_PA_5G) >> 8;
-	if (!(val & BIT(7)))
-		return -EINVAL;
-
 	t->temp_25_ref = val & 0x7f;
 	if (band == NL80211_BAND_5GHZ) {
 		slope = mt76x2_eeprom_get(dev, MT_EE_RF_TEMP_COMP_SLOPE_5G);
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_eeprom.h b/drivers/net/wireless/mediatek/mt76/mt76x2_eeprom.h
index d79122728dca..aa0b0c040375 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_eeprom.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_eeprom.h
@@ -159,6 +159,12 @@ void mt76x2_read_rx_gain(struct mt76x2_dev *dev);
 static inline bool
 mt76x2_temp_tx_alc_enabled(struct mt76x2_dev *dev)
 {
+	u16 val;
+
+	val = mt76x2_eeprom_get(dev, MT_EE_TX_POWER_EXT_PA_5G);
+	if (!(val & BIT(15)))
+		return false;
+
 	return mt76x2_eeprom_get(dev, MT_EE_NIC_CONF_1) &
 	       MT_EE_NIC_CONF_1_TEMP_TX_ALC;
 }
-- 
cgit v1.2.3-59-g8ed1b


From b9e5d4feb4fc0ec0823e5da3bdea1c01d38f448e Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Wed, 4 Apr 2018 10:38:32 +0200
Subject: mt76x2: set default values in TX_ALC_CFG_{1, 2} for tempetaure
 compensation

Initialize default values for temperature compensation in TX_ALC_CFG_{1,2}
if tssi has been enabled

Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/mt76x2_phy.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_phy.c b/drivers/net/wireless/mediatek/mt76/mt76x2_phy.c
index 79c698ada818..038d1fe6c726 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_phy.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_phy.c
@@ -677,6 +677,14 @@ int mt76x2_phy_set_channel(struct mt76x2_dev *dev,
 	memcpy(dev->cal.agc_gain_cur, dev->cal.agc_gain_init,
 	       sizeof(dev->cal.agc_gain_cur));
 
+	/* init default values for temp compensation */
+	if (mt76x2_tssi_enabled(dev)) {
+		mt76_rmw_field(dev, MT_TX_ALC_CFG_1, MT_TX_ALC_CFG_1_TEMP_COMP,
+			       0x38);
+		mt76_rmw_field(dev, MT_TX_ALC_CFG_2, MT_TX_ALC_CFG_2_TEMP_COMP,
+			       0x38);
+	}
+
 	ieee80211_queue_delayed_work(mt76_hw(dev), &dev->cal_work,
 				     MT_CALIBRATE_INTERVAL);
 
-- 
cgit v1.2.3-59-g8ed1b


From b305a6ab02475f52ef604b36e4cecd3bf4aa5eb7 Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Mon, 16 Apr 2018 13:56:17 +0200
Subject: mt7601u: use EWMA to calculate avg_rssi

avr_rssi is not calculated correctly as we do not divide result
by 256 (mt76 sum avg_rssi1 and avg_rssi2 and divide by 512).
However dividing by 256 will make avg_rssi almost the same as
last rssi value - not really an average. So use EWMA to calculate
avg_rssi. I've chosen weight_rcp=4 to convergence quicker on signal
strength changes.

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Acked-by: Jakub Kicinski <kubakici@wp.pl>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt7601u/mac.c     |  4 ++--
 drivers/net/wireless/mediatek/mt7601u/mt7601u.h |  5 ++++-
 drivers/net/wireless/mediatek/mt7601u/phy.c     | 10 +++++++---
 3 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt7601u/mac.c b/drivers/net/wireless/mediatek/mt7601u/mac.c
index d55d7040a56d..148c36d3d2e5 100644
--- a/drivers/net/wireless/mediatek/mt7601u/mac.c
+++ b/drivers/net/wireless/mediatek/mt7601u/mac.c
@@ -453,7 +453,7 @@ mt7601u_rx_monitor_beacon(struct mt7601u_dev *dev, struct mt7601u_rxwi *rxwi,
 {
 	dev->bcn_freq_off = rxwi->freq_off;
 	dev->bcn_phy_mode = FIELD_GET(MT_RXWI_RATE_PHY, rate);
-	dev->avg_rssi = (dev->avg_rssi * 15) / 16 + (rssi << 8);
+	ewma_rssi_add(&dev->avg_rssi, -rssi);
 }
 
 static int
@@ -503,7 +503,7 @@ u32 mt76_mac_process_rx(struct mt7601u_dev *dev, struct sk_buff *skb,
 	if (mt7601u_rx_is_our_beacon(dev, data))
 		mt7601u_rx_monitor_beacon(dev, rxwi, rate, rssi);
 	else if (rxwi->rxinfo & cpu_to_le32(MT_RXINFO_U2M))
-		dev->avg_rssi = (dev->avg_rssi * 15) / 16 + (rssi << 8);
+		ewma_rssi_add(&dev->avg_rssi, -rssi);
 	spin_unlock_bh(&dev->con_mon_lock);
 
 	return len;
diff --git a/drivers/net/wireless/mediatek/mt7601u/mt7601u.h b/drivers/net/wireless/mediatek/mt7601u/mt7601u.h
index 9233744451a9..db317d8c1652 100644
--- a/drivers/net/wireless/mediatek/mt7601u/mt7601u.h
+++ b/drivers/net/wireless/mediatek/mt7601u/mt7601u.h
@@ -23,6 +23,7 @@
 #include <linux/completion.h>
 #include <net/mac80211.h>
 #include <linux/debugfs.h>
+#include <linux/average.h>
 
 #include "regs.h"
 
@@ -138,6 +139,8 @@ enum {
 	MT7601U_STATE_MORE_STATS,
 };
 
+DECLARE_EWMA(rssi, 10, 4);
+
 /**
  * struct mt7601u_dev - adapter structure
  * @lock:		protects @wcid->tx_rate.
@@ -220,7 +223,7 @@ struct mt7601u_dev {
 	s8 bcn_freq_off;
 	u8 bcn_phy_mode;
 
-	int avg_rssi; /* starts at 0 and converges */
+	struct ewma_rssi avg_rssi;
 
 	u8 agc_save;
 
diff --git a/drivers/net/wireless/mediatek/mt7601u/phy.c b/drivers/net/wireless/mediatek/mt7601u/phy.c
index 9a90f1f09fb0..9d2f9a776ef1 100644
--- a/drivers/net/wireless/mediatek/mt7601u/phy.c
+++ b/drivers/net/wireless/mediatek/mt7601u/phy.c
@@ -975,6 +975,7 @@ void mt7601u_agc_restore(struct mt7601u_dev *dev)
 static void mt7601u_agc_tune(struct mt7601u_dev *dev)
 {
 	u8 val = mt7601u_agc_default(dev);
+	long avg_rssi;
 
 	if (test_bit(MT7601U_STATE_SCANNING, &dev->state))
 		return;
@@ -984,9 +985,12 @@ static void mt7601u_agc_tune(struct mt7601u_dev *dev)
 	 *	 Rssi updates are only on beacons and U2M so should work...
 	 */
 	spin_lock_bh(&dev->con_mon_lock);
-	if (dev->avg_rssi <= -70)
+	avg_rssi = ewma_rssi_read(&dev->avg_rssi);
+	WARN_ON_ONCE(avg_rssi == 0);
+	avg_rssi = -avg_rssi;
+	if (avg_rssi <= -70)
 		val -= 0x20;
-	else if (dev->avg_rssi <= -60)
+	else if (avg_rssi <= -60)
 		val -= 0x10;
 	spin_unlock_bh(&dev->con_mon_lock);
 
@@ -1102,7 +1106,7 @@ void mt7601u_phy_con_cal_onoff(struct mt7601u_dev *dev,
 	/* Start/stop collecting beacon data */
 	spin_lock_bh(&dev->con_mon_lock);
 	ether_addr_copy(dev->ap_bssid, info->bssid);
-	dev->avg_rssi = 0;
+	ewma_rssi_init(&dev->avg_rssi);
 	dev->bcn_freq_off = MT_FREQ_OFFSET_INVALID;
 	spin_unlock_bh(&dev->con_mon_lock);
 
-- 
cgit v1.2.3-59-g8ed1b


From 2019c39a2d6081e09436254fba320708e8127691 Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Mon, 16 Apr 2018 13:56:18 +0200
Subject: mt7601u: run calibration works after finishing scanning

When finishing scanning we switch to operational channel sill with
SCANNING flag. This mean that we never perform calibration works after
scanning. To fix the problem queue calibration works on
.sw_scan_complete() routine.

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Acked-by: Jakub Kicinski <kubakici@wp.pl>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt7601u/main.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/wireless/mediatek/mt7601u/main.c b/drivers/net/wireless/mediatek/mt7601u/main.c
index 3c9ea40d9584..7b21016012c3 100644
--- a/drivers/net/wireless/mediatek/mt7601u/main.c
+++ b/drivers/net/wireless/mediatek/mt7601u/main.c
@@ -288,6 +288,12 @@ mt7601u_sw_scan_complete(struct ieee80211_hw *hw,
 
 	mt7601u_agc_restore(dev);
 	clear_bit(MT7601U_STATE_SCANNING, &dev->state);
+
+	ieee80211_queue_delayed_work(dev->hw, &dev->cal_work,
+				     MT_CALIBRATE_INTERVAL);
+	if (dev->freq_cal.enabled)
+		ieee80211_queue_delayed_work(dev->hw, &dev->freq_cal.work,
+					     MT_FREQ_CAL_INIT_DELAY);
 }
 
 static int
-- 
cgit v1.2.3-59-g8ed1b


From cc6603aaeebf75751f5c3b79bc38ab62a0a55e7e Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Tue, 17 Apr 2018 01:05:59 +0200
Subject: mt76x2: fix TXD_INFO bitmask definition

Despite this issue is not harmful since it is not actually used in
mt76 driver, fix DMA txinfo bitmask definition

Fixes: 7bc04215a66b ('mt76: add driver code for MT76x2e')
Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/mt76x2_dma.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_dma.h b/drivers/net/wireless/mediatek/mt76/mt76x2_dma.h
index 47f79d83fcb4..e9d426bbf91a 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_dma.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_dma.h
@@ -19,7 +19,7 @@
 
 #include "dma.h"
 
-#define MT_TXD_INFO_LEN			GENMASK(13, 0)
+#define MT_TXD_INFO_LEN			GENMASK(15, 0)
 #define MT_TXD_INFO_NEXT_VLD		BIT(16)
 #define MT_TXD_INFO_TX_BURST		BIT(17)
 #define MT_TXD_INFO_80211		BIT(19)
@@ -27,9 +27,8 @@
 #define MT_TXD_INFO_CSO			BIT(21)
 #define MT_TXD_INFO_WIV			BIT(24)
 #define MT_TXD_INFO_QSEL		GENMASK(26, 25)
-#define MT_TXD_INFO_TCO			BIT(29)
-#define MT_TXD_INFO_UCO			BIT(30)
-#define MT_TXD_INFO_ICO			BIT(31)
+#define MT_TXD_INFO_DPORT		GENMASK(29, 27)
+#define MT_TXD_INFO_TYPE		GENMASK(31, 30)
 
 #define MT_RX_FCE_INFO_LEN		GENMASK(13, 0)
 #define MT_RX_FCE_INFO_SELF_GEN		BIT(15)
-- 
cgit v1.2.3-59-g8ed1b


From 660de409e25ba5bf34e8fdf2e13b69f1a7bd7213 Mon Sep 17 00:00:00 2001
From: Chris Novakovic <chris@chrisn.me.uk>
Date: Tue, 24 Apr 2018 03:56:32 +0100
Subject: ipconfig: Document setting of NIS domain name

ic_do_bootp_ext() is responsible for parsing the "ip=" and "nfsaddrs="
kernel parameters. If a "." character is found in parameter 4 (the
client's hostname), everything before the first "." is used as the
hostname, and everything after it is used as the NIS domain name (but
not necessarily the DNS domain name).

Document this behaviour in Documentation/filesystems/nfs/nfsroot.txt,
as it is not made explicit.

Signed-off-by: Chris Novakovic <chris@chrisn.me.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/filesystems/nfs/nfsroot.txt | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/Documentation/filesystems/nfs/nfsroot.txt b/Documentation/filesystems/nfs/nfsroot.txt
index 5efae00f6c7f..1513e5d663fd 100644
--- a/Documentation/filesystems/nfs/nfsroot.txt
+++ b/Documentation/filesystems/nfs/nfsroot.txt
@@ -123,10 +123,13 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
 
 		Default:  Determined using autoconfiguration.
 
-  <hostname>	Name of the client. May be supplied by autoconfiguration,
-  		but its absence will not trigger autoconfiguration.
-		If specified and DHCP is used, the user provided hostname will
-		be carried in the DHCP request to hopefully update DNS record.
+  <hostname>	Name of the client. If a '.' character is present, anything
+		before the first '.' is used as the client's hostname, and anything
+		after it is used as its NIS domain name. May be supplied by
+		autoconfiguration, but its absence will not trigger autoconfiguration.
+		If specified and DHCP is used, the user-provided hostname (and NIS
+		domain name, if present) will be carried in the DHCP request; this
+		may cause a DNS record to be created or updated for the client.
 
   		Default: Client IP address is used in ASCII notation.
 
-- 
cgit v1.2.3-59-g8ed1b


From e18bdc83aec4b7e187d9c54dc442a6ab3efac26d Mon Sep 17 00:00:00 2001
From: Chris Novakovic <chris@chrisn.me.uk>
Date: Tue, 24 Apr 2018 03:56:33 +0100
Subject: ipconfig: Tidy up reporting of name servers

Commit 5e953778a2aab04929a5e7b69f53dc26e39b079e ("ipconfig: add
nameserver IPs to kernel-parameter ip=") adds the IP addresses of
discovered name servers to the summary printed by ipconfig when
configuration is complete. It appears the intention in ip_auto_config()
was to print the name servers on a new line (especially given the
spacing and lack of comma before "nameserver0="), but they're actually
printed on the same line as the NFS root filesystem configuration
summary:

  [    0.686186] IP-Config: Complete:
  [    0.686226]      device=eth0, hwaddr=xx:xx:xx:xx:xx:xx, ipaddr=10.0.0.2, mask=255.255.255.0, gw=10.0.0.1
  [    0.686328]      host=test, domain=example.com, nis-domain=(none)
  [    0.686386]      bootserver=10.0.0.1, rootserver=10.0.0.1, rootpath=     nameserver0=10.0.0.1

This makes it harder to read and parse ipconfig's output. Instead, print
the name servers on a separate line:

  [    0.791250] IP-Config: Complete:
  [    0.791289]      device=eth0, hwaddr=xx:xx:xx:xx:xx:xx, ipaddr=10.0.0.2, mask=255.255.255.0, gw=10.0.0.1
  [    0.791407]      host=test, domain=example.com, nis-domain=(none)
  [    0.791475]      bootserver=10.0.0.1, rootserver=10.0.0.1, rootpath=
  [    0.791476]      nameserver0=10.0.0.1

Signed-off-by: Chris Novakovic <chris@chrisn.me.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipconfig.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 43f620feb1c4..d0ea0ecc9008 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1481,16 +1481,19 @@ static int __init ip_auto_config(void)
 		&ic_servaddr, &root_server_addr, root_server_path);
 	if (ic_dev_mtu)
 		pr_cont(", mtu=%d", ic_dev_mtu);
-	for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
+	/* Name servers (if any): */
+	for (i = 0; i < CONF_NAMESERVERS_MAX; i++) {
 		if (ic_nameservers[i] != NONE) {
-			pr_cont("     nameserver%u=%pI4",
-				i, &ic_nameservers[i]);
-			break;
+			if (i == 0)
+				pr_info("     nameserver%u=%pI4",
+					i, &ic_nameservers[i]);
+			else
+				pr_cont(", nameserver%u=%pI4",
+					i, &ic_nameservers[i]);
 		}
-	for (i++; i < CONF_NAMESERVERS_MAX; i++)
-		if (ic_nameservers[i] != NONE)
-			pr_cont(", nameserver%u=%pI4", i, &ic_nameservers[i]);
-	pr_cont("\n");
+		if (i + 1 == CONF_NAMESERVERS_MAX)
+			pr_cont("\n");
+	}
 #endif /* !SILENT */
 
 	/*
-- 
cgit v1.2.3-59-g8ed1b


From 4e1a8af28d56a4194cf3f17c69d9d21183246f3a Mon Sep 17 00:00:00 2001
From: Chris Novakovic <chris@chrisn.me.uk>
Date: Tue, 24 Apr 2018 03:56:34 +0100
Subject: ipconfig: BOOTP: Don't request IEN-116 name servers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When ipconfig is autoconfigured via BOOTP, the request packet
initialised by ic_bootp_init_ext() allocates 8 bytes for tag 5 ("Name
Server" [1, §3.7]), but tag 5 in the response isn't processed by
ic_do_bootp_ext(). Instead, allocate the 8 bytes to tag 6 ("Domain Name
Server" [1, §3.8]), which is processed by ic_do_bootp_ext(), and appears
to have been the intended tag to request.

This won't cause any breakage for existing users, as tag 5 responses
provided by BOOTP servers weren't being processed anyway.

[1] RFC 2132, "DHCP Options and BOOTP Vendor Extensions":
    https://tools.ietf.org/rfc/rfc2132.txt

Signed-off-by: Chris Novakovic <chris@chrisn.me.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipconfig.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index d0ea0ecc9008..bcf3c4f9882d 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -721,7 +721,7 @@ static void __init ic_bootp_init_ext(u8 *e)
 	*e++ = 3;		/* Default gateway request */
 	*e++ = 4;
 	e += 4;
-	*e++ = 5;		/* Name server request */
+	*e++ = 6;		/* (DNS) name server request */
 	*e++ = 8;
 	e += 8;
 	*e++ = 12;		/* Host name request */
-- 
cgit v1.2.3-59-g8ed1b


From de1fa15b6642f0b66f1844b9b464a8c28e84347c Mon Sep 17 00:00:00 2001
From: Chris Novakovic <chris@chrisn.me.uk>
Date: Tue, 24 Apr 2018 03:56:35 +0100
Subject: ipconfig: BOOTP: Request CONF_NAMESERVERS_MAX name servers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When ipconfig is autoconfigured via BOOTP, the request packet
initialised by ic_bootp_init_ext() always allocates 8 bytes for the name
server option, limiting the BOOTP server to responding with at most 2
name servers even though ipconfig in fact supports an arbitrary number
of name servers (as defined by CONF_NAMESERVERS_MAX, which is currently
3).

Only request name servers in the request packet if CONF_NAMESERVERS_MAX
is positive (to comply with [1, §3.8]), and allocate enough space in the
packet for CONF_NAMESERVERS_MAX name servers to indicate the maximum
number we can accept in response.

[1] RFC 2132, "DHCP Options and BOOTP Vendor Extensions":
    https://tools.ietf.org/rfc/rfc2132.txt

Signed-off-by: Chris Novakovic <chris@chrisn.me.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipconfig.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index bcf3c4f9882d..0f460d6d3cce 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -721,9 +721,11 @@ static void __init ic_bootp_init_ext(u8 *e)
 	*e++ = 3;		/* Default gateway request */
 	*e++ = 4;
 	e += 4;
+#if CONF_NAMESERVERS_MAX > 0
 	*e++ = 6;		/* (DNS) name server request */
-	*e++ = 8;
-	e += 8;
+	*e++ = 4 * CONF_NAMESERVERS_MAX;
+	e += 4 * CONF_NAMESERVERS_MAX;
+#endif
 	*e++ = 12;		/* Host name request */
 	*e++ = 32;
 	e += 32;
-- 
cgit v1.2.3-59-g8ed1b


From 8b0b37c5644e1c0e0feac5bbf673337cefa3efb2 Mon Sep 17 00:00:00 2001
From: Chris Novakovic <chris@chrisn.me.uk>
Date: Tue, 24 Apr 2018 03:56:36 +0100
Subject: ipconfig: Document /proc/net/pnp

Fully document the format used by the /proc/net/pnp file written by
ipconfig, explain where its values originate from, and clarify that the
tertiary name server IP and DNS domain name are only written to the file
when autoconfiguration is used.

Signed-off-by: Chris Novakovic <chris@chrisn.me.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/filesystems/nfs/nfsroot.txt | 34 ++++++++++++++++++++++++++-----
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/Documentation/filesystems/nfs/nfsroot.txt b/Documentation/filesystems/nfs/nfsroot.txt
index 1513e5d663fd..a1030bea60d3 100644
--- a/Documentation/filesystems/nfs/nfsroot.txt
+++ b/Documentation/filesystems/nfs/nfsroot.txt
@@ -110,6 +110,9 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
 		will not be triggered if it is missing and NFS root is not
 		in operation.
 
+		Value is exported to /proc/net/pnp with the prefix "bootserver "
+		(see below).
+
 		Default: Determined using autoconfiguration.
 		         The address of the autoconfiguration server is used.
 
@@ -165,12 +168,33 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
 
                 Default: any
 
-  <dns0-ip>	IP address of first nameserver.
-		Value gets exported by /proc/net/pnp which is often linked
-		on embedded systems by /etc/resolv.conf.
+  <dns0-ip>	IP address of primary nameserver.
+		Value is exported to /proc/net/pnp with the prefix "nameserver "
+		(see below).
+
+		Default: None if not using autoconfiguration; determined
+		automatically if using autoconfiguration.
+
+  <dns1-ip>	IP address of secondary nameserver.
+		See <dns0-ip>.
+
+  After configuration (whether manual or automatic) is complete, a file is
+  created at /proc/net/pnp in the following format; lines are omitted if
+  their respective value is empty following configuration.
+
+	#PROTO: <DHCP|BOOTP|RARP|MANUAL>	(depending on configuration method)
+	domain <dns-domain>			(if autoconfigured, the DNS domain)
+	nameserver <dns0-ip>			(primary name server IP)
+	nameserver <dns1-ip>			(secondary name server IP)
+	nameserver <dns2-ip>			(tertiary name server IP)
+	bootserver <server-ip>			(NFS server IP)
+
+  <dns-domain> and <dns2-ip> are requested during autoconfiguration; they
+  cannot be specified as part of the "ip=" kernel command line parameter.
 
-  <dns1-ip>	IP address of second nameserver.
-		Same as above.
+  Because the "domain" and "nameserver" options are recognised by DNS
+  resolvers, /etc/resolv.conf is often linked to /proc/net/pnp on systems
+  that use an NFS root filesystem.
 
 
 nfsrootdebug
-- 
cgit v1.2.3-59-g8ed1b


From 300eec7c0a2495f771709c7642aa15f7cc148b83 Mon Sep 17 00:00:00 2001
From: Chris Novakovic <chris@chrisn.me.uk>
Date: Tue, 24 Apr 2018 03:56:37 +0100
Subject: ipconfig: Correctly initialise ic_nameservers

ic_nameservers, which stores the list of name servers discovered by
ipconfig, is initialised (i.e. has all of its elements set to NONE, or
0xffffffff) by ic_nameservers_predef() in the following scenarios:

 - before the "ip=" and "nfsaddrs=" kernel command line parameters are
   parsed (in ip_auto_config_setup());
 - before autoconfiguring via DHCP or BOOTP (in ic_bootp_init()), in
   order to clear any values that may have been set after parsing "ip="
   or "nfsaddrs=" and are no longer needed.

This means that ic_nameservers_predef() is not called when neither "ip="
nor "nfsaddrs=" is specified on the kernel command line. In this
scenario, every element in ic_nameservers remains set to 0x00000000,
which is indistinguishable from ANY and causes pnp_seq_show() to write
the following (bogus) information to /proc/net/pnp:

  #MANUAL
  nameserver 0.0.0.0
  nameserver 0.0.0.0
  nameserver 0.0.0.0

This is potentially problematic for systems that blindly link
/etc/resolv.conf to /proc/net/pnp.

Ensure that ic_nameservers is also initialised when neither "ip=" nor
"nfsaddrs=" are specified by calling ic_nameservers_predef() in
ip_auto_config(), but only when ip_auto_config_setup() was not called
earlier. This causes the following to be written to /proc/net/pnp, and
is consistent with what gets written when ipconfig is configured
manually but no name servers are specified on the kernel command line:

  #MANUAL

Signed-off-by: Chris Novakovic <chris@chrisn.me.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipconfig.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 0f460d6d3cce..e11dfd29a929 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -750,6 +750,11 @@ static void __init ic_bootp_init_ext(u8 *e)
  */
 static inline void __init ic_bootp_init(void)
 {
+	/* Re-initialise all name servers to NONE, in case any were set via the
+	 * "ip=" or "nfsaddrs=" kernel command line parameters: any IP addresses
+	 * specified there will already have been decoded but are no longer
+	 * needed
+	 */
 	ic_nameservers_predef();
 
 	dev_add_pack(&bootp_packet_type);
@@ -1370,6 +1375,13 @@ static int __init ip_auto_config(void)
 	int err;
 	unsigned int i;
 
+	/* Initialise all name servers to NONE (but only if the "ip=" or
+	 * "nfsaddrs=" kernel command line parameters weren't decoded, otherwise
+	 * we'll overwrite the IP addresses specified there)
+	 */
+	if (ic_set_manually == 0)
+		ic_nameservers_predef();
+
 #ifdef CONFIG_PROC_FS
 	proc_create("pnp", 0444, init_net.proc_net, &pnp_seq_fops);
 #endif /* CONFIG_PROC_FS */
@@ -1593,6 +1605,7 @@ static int __init ip_auto_config_setup(char *addrs)
 		return 1;
 	}
 
+	/* Initialise all name servers to NONE */
 	ic_nameservers_predef();
 
 	/* Parse string for static IP assignment.  */
-- 
cgit v1.2.3-59-g8ed1b


From 4d019b3f80dc147fd9d177e7e0337fc66e3c0032 Mon Sep 17 00:00:00 2001
From: Chris Novakovic <chris@chrisn.me.uk>
Date: Tue, 24 Apr 2018 03:56:38 +0100
Subject: ipconfig: Create /proc/net/ipconfig directory

To allow ipconfig to report IP configuration details to user space
processes without cluttering /proc/net, create a new subdirectory
/proc/net/ipconfig. All files containing IP configuration details should
be written to this directory.

Signed-off-by: Chris Novakovic <chris@chrisn.me.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipconfig.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index e11dfd29a929..9abf833f3a99 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -158,6 +158,9 @@ static u8 ic_domain[64];		/* DNS (not NIS) domain name */
  * Private state.
  */
 
+/* proc_dir_entry for /proc/net/ipconfig */
+static struct proc_dir_entry *ipconfig_dir;
+
 /* Name of user-selected boot device */
 static char user_dev_name[IFNAMSIZ] __initdata = { 0, };
 
@@ -1301,6 +1304,16 @@ static const struct file_operations pnp_seq_fops = {
 	.llseek		= seq_lseek,
 	.release	= single_release,
 };
+
+/* Create the /proc/net/ipconfig directory */
+static int ipconfig_proc_net_init(void)
+{
+	ipconfig_dir = proc_net_mkdir(&init_net, "ipconfig", init_net.proc_net);
+	if (!ipconfig_dir)
+		return -ENOMEM;
+
+	return 0;
+}
 #endif /* CONFIG_PROC_FS */
 
 /*
@@ -1384,6 +1397,8 @@ static int __init ip_auto_config(void)
 
 #ifdef CONFIG_PROC_FS
 	proc_create("pnp", 0444, init_net.proc_net, &pnp_seq_fops);
+
+	ipconfig_proc_net_init();
 #endif /* CONFIG_PROC_FS */
 
 	if (!ic_enable)
-- 
cgit v1.2.3-59-g8ed1b


From c04d2cb2009f87cba7431c4ed3d85a602f71658e Mon Sep 17 00:00:00 2001
From: Chris Novakovic <chris@chrisn.me.uk>
Date: Tue, 24 Apr 2018 03:56:39 +0100
Subject: ipconfig: Write NTP server IPs to /proc/net/ipconfig/ntp_servers

Distributed filesystems are most effective when the server and client
clocks are synchronised. Embedded devices often use NFS for their
root filesystem but typically do not contain an RTC, so the clocks of
the NFS server and the embedded device will be out-of-sync when the root
filesystem is mounted (and may not be synchronised until late in the
boot process).

Extend ipconfig with the ability to export IP addresses of NTP servers
it discovers to /proc/net/ipconfig/ntp_servers. They can be supplied as
follows:

 - If ipconfig is configured manually via the "ip=" or "nfsaddrs="
   kernel command line parameters, one NTP server can be specified in
   the new "<ntp0-ip>" parameter.
 - If ipconfig is autoconfigured via DHCP, request DHCP option 42 in
   the DHCPDISCOVER message, and record the IP addresses of up to three
   NTP servers sent by the responding DHCP server in the subsequent
   DHCPOFFER message.

ipconfig will only write the NTP server IP addresses it discovers to
/proc/net/ipconfig/ntp_servers, one per line (in the order received from
the DHCP server, if DHCP autoconfiguration is used); making use of these
NTP servers is the responsibility of a user space process (e.g. an
initrd/initram script that invokes an NTP client before mounting an NFS
root filesystem).

Signed-off-by: Chris Novakovic <chris@chrisn.me.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/filesystems/nfs/nfsroot.txt |  35 +++++++--
 net/ipv4/ipconfig.c                       | 118 +++++++++++++++++++++++++++---
 2 files changed, 136 insertions(+), 17 deletions(-)

diff --git a/Documentation/filesystems/nfs/nfsroot.txt b/Documentation/filesystems/nfs/nfsroot.txt
index a1030bea60d3..d2963123eb1c 100644
--- a/Documentation/filesystems/nfs/nfsroot.txt
+++ b/Documentation/filesystems/nfs/nfsroot.txt
@@ -5,6 +5,7 @@ Written 1996 by Gero Kuhlmann <gero@gkminix.han.de>
 Updated 1997 by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
 Updated 2006 by Nico Schottelius <nico-kernel-nfsroot@schottelius.org>
 Updated 2006 by Horms <horms@verge.net.au>
+Updated 2018 by Chris Novakovic <chris@chrisn.me.uk>
 
 
@@ -79,7 +80,7 @@ nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
 
 
 ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
-   <dns0-ip>:<dns1-ip>
+   <dns0-ip>:<dns1-ip>:<ntp0-ip>
 
   This parameter tells the kernel how to configure IP addresses of devices
   and also how to set up the IP routing table. It was originally called
@@ -178,9 +179,18 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
   <dns1-ip>	IP address of secondary nameserver.
 		See <dns0-ip>.
 
-  After configuration (whether manual or automatic) is complete, a file is
-  created at /proc/net/pnp in the following format; lines are omitted if
-  their respective value is empty following configuration.
+  <ntp0-ip>	IP address of a Network Time Protocol (NTP) server.
+		Value is exported to /proc/net/ipconfig/ntp_servers, but is
+		otherwise unused (see below).
+
+		Default: None if not using autoconfiguration; determined
+		automatically if using autoconfiguration.
+
+  After configuration (whether manual or automatic) is complete, two files
+  are created in the following format; lines are omitted if their respective
+  value is empty following configuration:
+
+  - /proc/net/pnp:
 
 	#PROTO: <DHCP|BOOTP|RARP|MANUAL>	(depending on configuration method)
 	domain <dns-domain>			(if autoconfigured, the DNS domain)
@@ -189,13 +199,26 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
 	nameserver <dns2-ip>			(tertiary name server IP)
 	bootserver <server-ip>			(NFS server IP)
 
-  <dns-domain> and <dns2-ip> are requested during autoconfiguration; they
-  cannot be specified as part of the "ip=" kernel command line parameter.
+  - /proc/net/ipconfig/ntp_servers:
+
+	<ntp0-ip>				(NTP server IP)
+	<ntp1-ip>				(NTP server IP)
+	<ntp2-ip>				(NTP server IP)
+
+  <dns-domain> and <dns2-ip> (in /proc/net/pnp) and <ntp1-ip> and <ntp2-ip>
+  (in /proc/net/ipconfig/ntp_servers) are requested during autoconfiguration;
+  they cannot be specified as part of the "ip=" kernel command line parameter.
 
   Because the "domain" and "nameserver" options are recognised by DNS
   resolvers, /etc/resolv.conf is often linked to /proc/net/pnp on systems
   that use an NFS root filesystem.
 
+  Note that the kernel will not synchronise the system time with any NTP
+  servers it discovers; this is the responsibility of a user space process
+  (e.g. an initrd/initramfs script that passes the IP addresses listed in
+  /proc/net/ipconfig/ntp_servers to an NTP client before mounting the real
+  root filesystem if it is on NFS).
+
 
 nfsrootdebug
 
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 9abf833f3a99..d839d74853fc 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -28,6 +28,9 @@
  *
  *  Multiple Nameservers in /proc/net/pnp
  *              --  Josef Siemes <jsiemes@web.de>, Aug 2002
+ *
+ *  NTP servers in /proc/net/ipconfig/ntp_servers
+ *              --  Chris Novakovic <chris@chrisn.me.uk>, April 2018
  */
 
 #include <linux/types.h>
@@ -93,6 +96,7 @@
 #define CONF_TIMEOUT_MAX	(HZ*30)	/* Maximum allowed timeout */
 #define CONF_NAMESERVERS_MAX   3       /* Maximum number of nameservers
 					   - '3' from resolv.h */
+#define CONF_NTP_SERVERS_MAX   3	/* Maximum number of NTP servers */
 
 #define NONE cpu_to_be32(INADDR_NONE)
 #define ANY cpu_to_be32(INADDR_ANY)
@@ -152,6 +156,7 @@ static int ic_proto_used;			/* Protocol used, if any */
 #define ic_proto_used 0
 #endif
 static __be32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */
+static __be32 ic_ntp_servers[CONF_NTP_SERVERS_MAX]; /* NTP server IP addresses */
 static u8 ic_domain[64];		/* DNS (not NIS) domain name */
 
 /*
@@ -579,6 +584,15 @@ static inline void __init ic_nameservers_predef(void)
 		ic_nameservers[i] = NONE;
 }
 
+/* Predefine NTP servers */
+static inline void __init ic_ntp_servers_predef(void)
+{
+	int i;
+
+	for (i = 0; i < CONF_NTP_SERVERS_MAX; i++)
+		ic_ntp_servers[i] = NONE;
+}
+
 /*
  *	DHCP/BOOTP support.
  */
@@ -674,6 +688,7 @@ ic_dhcp_init_options(u8 *options, struct ic_device *d)
 			17,	/* Boot path */
 			26,	/* MTU */
 			40,	/* NIS domain name */
+			42,	/* NTP servers */
 		};
 
 		*e++ = 55;	/* Parameter request list */
@@ -753,12 +768,13 @@ static void __init ic_bootp_init_ext(u8 *e)
  */
 static inline void __init ic_bootp_init(void)
 {
-	/* Re-initialise all name servers to NONE, in case any were set via the
-	 * "ip=" or "nfsaddrs=" kernel command line parameters: any IP addresses
-	 * specified there will already have been decoded but are no longer
-	 * needed
+	/* Re-initialise all name servers and NTP servers to NONE, in case any
+	 * were set via the "ip=" or "nfsaddrs=" kernel command line parameters:
+	 * any IP addresses specified there will already have been decoded but
+	 * are no longer needed
 	 */
 	ic_nameservers_predef();
+	ic_ntp_servers_predef();
 
 	dev_add_pack(&bootp_packet_type);
 }
@@ -922,6 +938,15 @@ static void __init ic_do_bootp_ext(u8 *ext)
 		ic_bootp_string(utsname()->domainname, ext+1, *ext,
 				__NEW_UTS_LEN);
 		break;
+	case 42:	/* NTP servers */
+		servers = *ext / 4;
+		if (servers > CONF_NTP_SERVERS_MAX)
+			servers = CONF_NTP_SERVERS_MAX;
+		for (i = 0; i < servers; i++) {
+			if (ic_ntp_servers[i] == NONE)
+				memcpy(&ic_ntp_servers[i], ext+1+4*i, 4);
+		}
+		break;
 	}
 }
 
@@ -1268,6 +1293,7 @@ static int __init ic_dynamic(void)
 
 #ifdef CONFIG_PROC_FS
 
+/* Name servers: */
 static int pnp_seq_show(struct seq_file *seq, void *v)
 {
 	int i;
@@ -1306,7 +1332,7 @@ static const struct file_operations pnp_seq_fops = {
 };
 
 /* Create the /proc/net/ipconfig directory */
-static int ipconfig_proc_net_init(void)
+static int __init ipconfig_proc_net_init(void)
 {
 	ipconfig_dir = proc_net_mkdir(&init_net, "ipconfig", init_net.proc_net);
 	if (!ipconfig_dir)
@@ -1314,6 +1340,52 @@ static int ipconfig_proc_net_init(void)
 
 	return 0;
 }
+
+/* Create a new file under /proc/net/ipconfig */
+static int ipconfig_proc_net_create(const char *name,
+				    const struct file_operations *fops)
+{
+	char *pname;
+	struct proc_dir_entry *p;
+
+	if (!ipconfig_dir)
+		return -ENOMEM;
+
+	pname = kasprintf(GFP_KERNEL, "%s%s", "ipconfig/", name);
+	if (!pname)
+		return -ENOMEM;
+
+	p = proc_create(pname, 0444, init_net.proc_net, fops);
+	kfree(pname);
+	if (!p)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/* Write NTP server IP addresses to /proc/net/ipconfig/ntp_servers */
+static int ntp_servers_seq_show(struct seq_file *seq, void *v)
+{
+	int i;
+
+	for (i = 0; i < CONF_NTP_SERVERS_MAX; i++) {
+		if (ic_ntp_servers[i] != NONE)
+			seq_printf(seq, "%pI4\n", &ic_ntp_servers[i]);
+	}
+	return 0;
+}
+
+static int ntp_servers_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ntp_servers_seq_show, NULL);
+}
+
+static const struct file_operations ntp_servers_seq_fops = {
+	.open		= ntp_servers_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
 #endif /* CONFIG_PROC_FS */
 
 /*
@@ -1388,17 +1460,20 @@ static int __init ip_auto_config(void)
 	int err;
 	unsigned int i;
 
-	/* Initialise all name servers to NONE (but only if the "ip=" or
-	 * "nfsaddrs=" kernel command line parameters weren't decoded, otherwise
-	 * we'll overwrite the IP addresses specified there)
+	/* Initialise all name servers and NTP servers to NONE (but only if the
+	 * "ip=" or "nfsaddrs=" kernel command line parameters weren't decoded,
+	 * otherwise we'll overwrite the IP addresses specified there)
 	 */
-	if (ic_set_manually == 0)
+	if (ic_set_manually == 0) {
 		ic_nameservers_predef();
+		ic_ntp_servers_predef();
+	}
 
 #ifdef CONFIG_PROC_FS
 	proc_create("pnp", 0444, init_net.proc_net, &pnp_seq_fops);
 
-	ipconfig_proc_net_init();
+	if (ipconfig_proc_net_init() == 0)
+		ipconfig_proc_net_create("ntp_servers", &ntp_servers_seq_fops);
 #endif /* CONFIG_PROC_FS */
 
 	if (!ic_enable)
@@ -1523,6 +1598,19 @@ static int __init ip_auto_config(void)
 		if (i + 1 == CONF_NAMESERVERS_MAX)
 			pr_cont("\n");
 	}
+	/* NTP servers (if any): */
+	for (i = 0; i < CONF_NTP_SERVERS_MAX; i++) {
+		if (ic_ntp_servers[i] != NONE) {
+			if (i == 0)
+				pr_info("     ntpserver%u=%pI4",
+					i, &ic_ntp_servers[i]);
+			else
+				pr_cont(", ntpserver%u=%pI4",
+					i, &ic_ntp_servers[i]);
+		}
+		if (i + 1 == CONF_NTP_SERVERS_MAX)
+			pr_cont("\n");
+	}
 #endif /* !SILENT */
 
 	/*
@@ -1620,8 +1708,9 @@ static int __init ip_auto_config_setup(char *addrs)
 		return 1;
 	}
 
-	/* Initialise all name servers to NONE */
+	/* Initialise all name servers and NTP servers to NONE */
 	ic_nameservers_predef();
+	ic_ntp_servers_predef();
 
 	/* Parse string for static IP assignment.  */
 	ip = addrs;
@@ -1680,6 +1769,13 @@ static int __init ip_auto_config_setup(char *addrs)
 						ic_nameservers[1] = NONE;
 				}
 				break;
+			case 9:
+				if (CONF_NTP_SERVERS_MAX >= 1) {
+					ic_ntp_servers[0] = in_aton(ip);
+					if (ic_ntp_servers[0] == ANY)
+						ic_ntp_servers[0] = NONE;
+				}
+				break;
 			}
 		}
 		ip = cp;
-- 
cgit v1.2.3-59-g8ed1b


From c7d852e301d834949e570920db808201bdc51a22 Mon Sep 17 00:00:00 2001
From: Denis Bolotin <denis.bolotin@cavium.com>
Date: Tue, 24 Apr 2018 15:32:53 +0300
Subject: qed: Fix copying 2 strings

The strscpy() was a recent fix (net: qed: use correct strncpy() size) to
prevent passing the length of the source buffer to strncpy() and guarantee
null termination.
It misses the goal of overwriting only the first 3 characters in
"???_BIG_RAM" and "???_RAM" while keeping the rest of the string.
Use strncpy() with the length of 3, without null termination.

Signed-off-by: Denis Bolotin <denis.bolotin@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_debug.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.c b/drivers/net/ethernet/qlogic/qed/qed_debug.c
index b3211c7d38c2..39124b594a36 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_debug.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_debug.c
@@ -419,6 +419,7 @@ struct phy_defs {
 #define NUM_RSS_MEM_TYPES		5
 
 #define NUM_BIG_RAM_TYPES		3
+#define BIG_RAM_NAME_LEN		3
 
 #define NUM_PHY_TBUS_ADDRESSES		2048
 #define PHY_DUMP_SIZE_DWORDS		(NUM_PHY_TBUS_ADDRESSES / 2)
@@ -3650,8 +3651,8 @@ static u32 qed_grc_dump_big_ram(struct qed_hwfn *p_hwfn,
 		     BIT(big_ram->is_256b_bit_offset[dev_data->chip_id]) ? 256
 									 : 128;
 
-	strscpy(type_name, big_ram->instance_name, sizeof(type_name));
-	strscpy(mem_name, big_ram->instance_name, sizeof(mem_name));
+	strncpy(type_name, big_ram->instance_name, BIG_RAM_NAME_LEN);
+	strncpy(mem_name, big_ram->instance_name, BIG_RAM_NAME_LEN);
 
 	/* Dump memory header */
 	offset += qed_grc_dump_mem_hdr(p_hwfn,
-- 
cgit v1.2.3-59-g8ed1b


From 080aaddae5b378ebb0f06d297bf05cf4ec5b47ac Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Tue, 24 Apr 2018 12:39:45 +0100
Subject: fsl/fman_port: remove redundant check on port->rev_info.major

The check port->rev_info.major >= 6 is being performed twice, thus
the inner second check is always true and is redundant, hence it
can be removed. Detected by cppcheck.

drivers/net/ethernet/freescale/fman/fman_port.c:1394]: (warning)
Identical inner 'if' condition is always true.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/freescale/fman/fman_port.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fman/fman_port.c b/drivers/net/ethernet/freescale/fman/fman_port.c
index 6552d68ea6e1..ce6e24c74978 100644
--- a/drivers/net/ethernet/freescale/fman/fman_port.c
+++ b/drivers/net/ethernet/freescale/fman/fman_port.c
@@ -1391,12 +1391,10 @@ int fman_port_config(struct fman_port *port, struct fman_port_params *params)
 		/* FM_WRONG_RESET_VALUES_ERRATA_FMAN_A005127 Errata
 		 * workaround
 		 */
-		if (port->rev_info.major >= 6) {
-			u32 reg;
+		u32 reg;
 
-			reg = 0x00001013;
-			iowrite32be(reg, &port->bmi_regs->tx.fmbm_tfp);
-		}
+		reg = 0x00001013;
+		iowrite32be(reg, &port->bmi_regs->tx.fmbm_tfp);
 	}
 
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From 95ad7544ad3f6cf585e5b8587e7d520fa3a43ad5 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Tue, 24 Apr 2018 13:36:58 +0100
Subject: net/tls: remove redundant second null check on sgout

A duplicated null check on sgout is redundant as it is known to be
already true because of the identical earlier check. Remove it.
Detected by cppcheck:

net/tls/tls_sw.c:696: (warning) Identical inner 'if' condition is always
true.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_sw.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 71e79597f940..6ed1c02cfc94 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -693,8 +693,7 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb,
 	if (!sgout) {
 		nsg = skb_cow_data(skb, 0, &unused) + 1;
 		sgin = kmalloc_array(nsg, sizeof(*sgin), sk->sk_allocation);
-		if (!sgout)
-			sgout = sgin;
+		sgout = sgin;
 	}
 
 	sg_init_table(sgin, nsg);
-- 
cgit v1.2.3-59-g8ed1b


From 091311debcf02ff80a76d70b524eba4b2ffe66f1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 24 Apr 2018 09:22:49 -0700
Subject: net/ipv6: fix LOCKDEP issue in rt6_remove_exception_rt()

rt6_remove_exception_rt() is called under rcu_read_lock() only.

We lock rt6_exception_lock a bit later, so we do not hold
rt6_exception_lock yet.

Fixes: 8a14e46f1402 ("net/ipv6: Fix missing rcu dereferences on from")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: David Ahern <dsahern@gmail.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ac3e51631c65..432c4bcc1111 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1546,8 +1546,7 @@ static int rt6_remove_exception_rt(struct rt6_info *rt)
 	struct fib6_info *from;
 	int err;
 
-	from = rcu_dereference_protected(rt->from,
-					 lockdep_is_held(&rt6_exception_lock));
+	from = rcu_dereference(rt->from);
 	if (!from ||
 	    !(rt->rt6i_flags & RTF_CACHE))
 		return -EINVAL;
-- 
cgit v1.2.3-59-g8ed1b


From 16f4faa4f06ff3b4e214922d55ac33ab6e2bdbdc Mon Sep 17 00:00:00 2001
From: Srinivas Jampala <srinivasa.jampala@cavium.com>
Date: Tue, 24 Apr 2018 10:23:27 -0700
Subject: liquidio: Swap VF representor Tx and Rx statistics

Swap VF representor tx and rx interface statistics since it is a
virtual switchdev port and tx for VM should be rx for VF representor
and vice-versa.

Signed-off-by: Srinivas Jampala <srinivasa.jampala@cavium.com>
Acked-by: Derek Chickles <derek.chickles@cavium.com>
Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c
index 2adafa366d3f..ddd7431579f4 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c
@@ -201,13 +201,14 @@ lio_vf_rep_get_stats64(struct net_device *dev,
 {
 	struct lio_vf_rep_desc *vf_rep = netdev_priv(dev);
 
-	stats64->tx_packets = vf_rep->stats.tx_packets;
-	stats64->tx_bytes   = vf_rep->stats.tx_bytes;
-	stats64->tx_dropped = vf_rep->stats.tx_dropped;
-
-	stats64->rx_packets = vf_rep->stats.rx_packets;
-	stats64->rx_bytes   = vf_rep->stats.rx_bytes;
-	stats64->rx_dropped = vf_rep->stats.rx_dropped;
+	/* Swap tx and rx stats as VF rep is a switch port */
+	stats64->tx_packets = vf_rep->stats.rx_packets;
+	stats64->tx_bytes   = vf_rep->stats.rx_bytes;
+	stats64->tx_dropped = vf_rep->stats.rx_dropped;
+
+	stats64->rx_packets = vf_rep->stats.tx_packets;
+	stats64->rx_bytes   = vf_rep->stats.tx_bytes;
+	stats64->rx_dropped = vf_rep->stats.tx_dropped;
 }
 
 static int
-- 
cgit v1.2.3-59-g8ed1b


From 12bed760a78da6e12ac8252fec64d019a9eac523 Mon Sep 17 00:00:00 2001
From: Eyal Birger <eyal.birger@gmail.com>
Date: Tue, 24 Apr 2018 17:50:29 +0300
Subject: bpf: add helper for getting xfrm states

This commit introduces a helper which allows fetching xfrm state
parameters by eBPF programs attached to TC.

Prototype:
bpf_skb_get_xfrm_state(skb, index, xfrm_state, size, flags)

skb: pointer to skb
index: the index in the skb xfrm_state secpath array
xfrm_state: pointer to 'struct bpf_xfrm_state'
size: size of 'struct bpf_xfrm_state'
flags: reserved for future extensions

The helper returns 0 on success. Non zero if no xfrm state at the index
is found - or non exists at all.

struct bpf_xfrm_state currently includes the SPI, peer IPv4/IPv6
address and the reqid; it can be further extended by adding elements to
its end - indicating the populated fields by the 'size' argument -
keeping backwards compatibility.

Typical usage:

struct bpf_xfrm_state x = {};
bpf_skb_get_xfrm_state(skb, 0, &x, sizeof(x), 0);
...

Signed-off-by: Eyal Birger <eyal.birger@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 25 ++++++++++++++++++++++++-
 net/core/filter.c        | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c8383a289f7b..e6679393b687 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -774,6 +774,15 @@ union bpf_attr {
  *     @xdp_md: pointer to xdp_md
  *     @delta: A negative integer to be added to xdp_md.data_end
  *     Return: 0 on success or negative on error
+ *
+ * int bpf_skb_get_xfrm_state(skb, index, xfrm_state, size, flags)
+ *     retrieve XFRM state
+ *     @skb: pointer to skb
+ *     @index: index of the xfrm state in the secpath
+ *     @key: pointer to 'struct bpf_xfrm_state'
+ *     @size: size of 'struct bpf_xfrm_state'
+ *     @flags: room for future extensions
+ *     Return: 0 on success or negative error
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -841,7 +850,8 @@ union bpf_attr {
 	FN(msg_cork_bytes),		\
 	FN(msg_pull_data),		\
 	FN(bind),			\
-	FN(xdp_adjust_tail),
+	FN(xdp_adjust_tail),		\
+	FN(skb_get_xfrm_state),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -947,6 +957,19 @@ struct bpf_tunnel_key {
 	__u32 tunnel_label;
 };
 
+/* user accessible mirror of in-kernel xfrm_state.
+ * new fields can only be added to the end of this structure
+ */
+struct bpf_xfrm_state {
+	__u32 reqid;
+	__u32 spi;	/* Stored in network byte order */
+	__u16 family;
+	union {
+		__u32 remote_ipv4;	/* Stored in network byte order */
+		__u32 remote_ipv6[4];	/* Stored in network byte order */
+	};
+};
+
 /* Generic BPF return codes which all BPF program types may support.
  * The values are binary compatible with their TC_ACT_* counter-part to
  * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT
diff --git a/net/core/filter.c b/net/core/filter.c
index e25bc4a3aa1a..8e45c6c7ab08 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -57,6 +57,7 @@
 #include <net/sock_reuseport.h>
 #include <net/busy_poll.h>
 #include <net/tcp.h>
+#include <net/xfrm.h>
 #include <linux/bpf_trace.h>
 
 /**
@@ -3743,6 +3744,49 @@ static const struct bpf_func_proto bpf_bind_proto = {
 	.arg3_type	= ARG_CONST_SIZE,
 };
 
+#ifdef CONFIG_XFRM
+BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index,
+	   struct bpf_xfrm_state *, to, u32, size, u64, flags)
+{
+	const struct sec_path *sp = skb_sec_path(skb);
+	const struct xfrm_state *x;
+
+	if (!sp || unlikely(index >= sp->len || flags))
+		goto err_clear;
+
+	x = sp->xvec[index];
+
+	if (unlikely(size != sizeof(struct bpf_xfrm_state)))
+		goto err_clear;
+
+	to->reqid = x->props.reqid;
+	to->spi = x->id.spi;
+	to->family = x->props.family;
+	if (to->family == AF_INET6) {
+		memcpy(to->remote_ipv6, x->props.saddr.a6,
+		       sizeof(to->remote_ipv6));
+	} else {
+		to->remote_ipv4 = x->props.saddr.a4;
+	}
+
+	return 0;
+err_clear:
+	memset(to, 0, size);
+	return -EINVAL;
+}
+
+static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
+	.func		= bpf_skb_get_xfrm_state,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg4_type	= ARG_CONST_SIZE,
+	.arg5_type	= ARG_ANYTHING,
+};
+#endif
+
 static const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
@@ -3884,6 +3928,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_socket_cookie_proto;
 	case BPF_FUNC_get_socket_uid:
 		return &bpf_get_socket_uid_proto;
+#ifdef CONFIG_XFRM
+	case BPF_FUNC_skb_get_xfrm_state:
+		return &bpf_skb_get_xfrm_state_proto;
+#endif
 	default:
 		return bpf_base_func_proto(func_id);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 29a36f9eef30b7f008b722a753a84b314f981037 Mon Sep 17 00:00:00 2001
From: Eyal Birger <eyal.birger@gmail.com>
Date: Tue, 24 Apr 2018 17:50:30 +0300
Subject: samples/bpf: extend test_tunnel_bpf.sh with xfrm state test

Add a test for fetching xfrm state parameters from a tc program running
on ingress.

Signed-off-by: Eyal Birger <eyal.birger@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/bpf/tcbpf2_kern.c                 | 16 +++++++
 samples/bpf/test_tunnel_bpf.sh            | 71 +++++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h            | 25 ++++++++++-
 tools/testing/selftests/bpf/bpf_helpers.h |  4 +-
 4 files changed, 114 insertions(+), 2 deletions(-)

diff --git a/samples/bpf/tcbpf2_kern.c b/samples/bpf/tcbpf2_kern.c
index 9a8db7bd6db4..fa260c750fb1 100644
--- a/samples/bpf/tcbpf2_kern.c
+++ b/samples/bpf/tcbpf2_kern.c
@@ -593,4 +593,20 @@ int _ip6ip6_get_tunnel(struct __sk_buff *skb)
 	return TC_ACT_OK;
 }
 
+SEC("xfrm_get_state")
+int _xfrm_get_state(struct __sk_buff *skb)
+{
+	struct bpf_xfrm_state x;
+	char fmt[] = "reqid %d spi 0x%x remote ip 0x%x\n";
+	int ret;
+
+	ret = bpf_skb_get_xfrm_state(skb, 0, &x, sizeof(x), 0);
+	if (ret < 0)
+		return TC_ACT_OK;
+
+	bpf_trace_printk(fmt, sizeof(fmt), x.reqid, bpf_ntohl(x.spi),
+			 bpf_ntohl(x.remote_ipv4));
+	return TC_ACT_OK;
+}
+
 char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/test_tunnel_bpf.sh b/samples/bpf/test_tunnel_bpf.sh
index c265863ccdf9..9c534dc07b36 100755
--- a/samples/bpf/test_tunnel_bpf.sh
+++ b/samples/bpf/test_tunnel_bpf.sh
@@ -155,6 +155,57 @@ function add_ipip_tunnel {
 	ip addr add dev $DEV 10.1.1.200/24
 }
 
+function setup_xfrm_tunnel {
+	auth=0x$(printf '1%.0s' {1..40})
+	enc=0x$(printf '2%.0s' {1..32})
+	spi_in_to_out=0x1
+	spi_out_to_in=0x2
+	# in namespace
+	# in -> out
+	ip netns exec at_ns0 \
+		ip xfrm state add src 172.16.1.100 dst 172.16.1.200 proto esp \
+			spi $spi_in_to_out reqid 1 mode tunnel \
+			auth-trunc 'hmac(sha1)' $auth 96 enc 'cbc(aes)' $enc
+	ip netns exec at_ns0 \
+		ip xfrm policy add src 10.1.1.100/32 dst 10.1.1.200/32 dir out \
+		tmpl src 172.16.1.100 dst 172.16.1.200 proto esp reqid 1 \
+		mode tunnel
+	# out -> in
+	ip netns exec at_ns0 \
+		ip xfrm state add src 172.16.1.200 dst 172.16.1.100 proto esp \
+			spi $spi_out_to_in reqid 2 mode tunnel \
+			auth-trunc 'hmac(sha1)' $auth 96 enc 'cbc(aes)' $enc
+	ip netns exec at_ns0 \
+		ip xfrm policy add src 10.1.1.200/32 dst 10.1.1.100/32 dir in \
+		tmpl src 172.16.1.200 dst 172.16.1.100 proto esp reqid 2 \
+		mode tunnel
+	# address & route
+	ip netns exec at_ns0 \
+		ip addr add dev veth0 10.1.1.100/32
+	ip netns exec at_ns0 \
+		ip route add 10.1.1.200 dev veth0 via 172.16.1.200 \
+			src 10.1.1.100
+
+	# out of namespace
+	# in -> out
+	ip xfrm state add src 172.16.1.100 dst 172.16.1.200 proto esp \
+		spi $spi_in_to_out reqid 1 mode tunnel \
+		auth-trunc 'hmac(sha1)' $auth 96  enc 'cbc(aes)' $enc
+	ip xfrm policy add src 10.1.1.100/32 dst 10.1.1.200/32 dir in \
+		tmpl src 172.16.1.100 dst 172.16.1.200 proto esp reqid 1 \
+		mode tunnel
+	# out -> in
+	ip xfrm state add src 172.16.1.200 dst 172.16.1.100 proto esp \
+		spi $spi_out_to_in reqid 2 mode tunnel \
+		auth-trunc 'hmac(sha1)' $auth 96  enc 'cbc(aes)' $enc
+	ip xfrm policy add src 10.1.1.200/32 dst 10.1.1.100/32 dir out \
+		tmpl src 172.16.1.200 dst 172.16.1.100 proto esp reqid 2 \
+		mode tunnel
+	# address & route
+	ip addr add dev veth1 10.1.1.200/32
+	ip route add 10.1.1.100 dev veth1 via 172.16.1.100 src 10.1.1.200
+}
+
 function attach_bpf {
 	DEV=$1
 	SET_TUNNEL=$2
@@ -278,6 +329,22 @@ function test_ipip {
 	cleanup
 }
 
+function test_xfrm_tunnel {
+	config_device
+        tcpdump -nei veth1 ip &
+	output=$(mktemp)
+	cat /sys/kernel/debug/tracing/trace_pipe | tee $output &
+        setup_xfrm_tunnel
+	tc qdisc add dev veth1 clsact
+	tc filter add dev veth1 proto ip ingress bpf da obj tcbpf2_kern.o \
+		sec xfrm_get_state
+	ip netns exec at_ns0 ping -c 1 10.1.1.200
+	grep "reqid 1" $output
+	grep "spi 0x1" $output
+	grep "remote ip 0xac100164" $output
+	cleanup
+}
+
 function cleanup {
 	set +ex
 	pkill iperf
@@ -291,6 +358,8 @@ function cleanup {
 	ip link del geneve11
 	ip link del erspan11
 	ip link del ip6erspan11
+	ip x s flush
+	ip x p flush
 	pkill tcpdump
 	pkill cat
 	set -ex
@@ -316,4 +385,6 @@ echo "Testing GENEVE tunnel..."
 test_geneve
 echo "Testing IPIP tunnel..."
 test_ipip
+echo "Testing IPSec tunnel..."
+test_xfrm_tunnel
 echo "*** PASS ***"
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 7f7fbb9d0253..5841ed41b30c 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -774,6 +774,15 @@ union bpf_attr {
  *     @xdp_md: pointer to xdp_md
  *     @delta: A negative integer to be added to xdp_md.data_end
  *     Return: 0 on success or negative on error
+ *
+ * int bpf_skb_get_xfrm_state(skb, index, xfrm_state, size, flags)
+ *     retrieve XFRM state
+ *     @skb: pointer to skb
+ *     @index: index of the xfrm state in the secpath
+ *     @key: pointer to 'struct bpf_xfrm_state'
+ *     @size: size of 'struct bpf_xfrm_state'
+ *     @flags: room for future extensions
+ *     Return: 0 on success or negative error
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -841,7 +850,8 @@ union bpf_attr {
 	FN(msg_cork_bytes),		\
 	FN(msg_pull_data),		\
 	FN(bind),			\
-	FN(xdp_adjust_tail),
+	FN(xdp_adjust_tail),		\
+	FN(skb_get_xfrm_state),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -946,6 +956,19 @@ struct bpf_tunnel_key {
 	__u32 tunnel_label;
 };
 
+/* user accessible mirror of in-kernel xfrm_state.
+ * new fields can only be added to the end of this structure
+ */
+struct bpf_xfrm_state {
+	__u32 reqid;
+	__u32 spi;	/* Stored in network byte order */
+	__u16 family;
+	union {
+		__u32 remote_ipv4;	/* Stored in network byte order */
+		__u32 remote_ipv6[4];	/* Stored in network byte order */
+	};
+};
+
 /* Generic BPF return codes which all BPF program types may support.
  * The values are binary compatible with their TC_ACT_* counter-part to
  * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 9271576bdc8f..69d7b918e66a 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -98,7 +98,9 @@ static int (*bpf_bind)(void *ctx, void *addr, int addr_len) =
 	(void *) BPF_FUNC_bind;
 static int (*bpf_xdp_adjust_tail)(void *ctx, int offset) =
 	(void *) BPF_FUNC_xdp_adjust_tail;
-
+static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state,
+				     int size, int flags) =
+	(void *) BPF_FUNC_skb_get_xfrm_state;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
-- 
cgit v1.2.3-59-g8ed1b


From d71962f3e627b5941804036755c844fabfb65ff5 Mon Sep 17 00:00:00 2001
From: Paul Chaignon <paul.chaignon@orange.com>
Date: Tue, 24 Apr 2018 15:07:54 +0200
Subject: bpf: allow map helpers access to map values directly

Helpers that expect ARG_PTR_TO_MAP_KEY and ARG_PTR_TO_MAP_VALUE can only
access stack and packet memory.  Allow these helpers to directly access
map values by passing registers of type PTR_TO_MAP_VALUE.

This change removes the need for an extra copy to the stack when using a
map value to perform a second map lookup, as in the following:

struct bpf_map_def SEC("maps") infobyreq = {
    .type = BPF_MAP_TYPE_HASHMAP,
    .key_size = sizeof(struct request *),
    .value_size = sizeof(struct info_t),
    .max_entries = 1024,
};
struct bpf_map_def SEC("maps") counts = {
    .type = BPF_MAP_TYPE_HASHMAP,
    .key_size = sizeof(struct info_t),
    .value_size = sizeof(u64),
    .max_entries = 1024,
};
SEC("kprobe/blk_account_io_start")
int bpf_blk_account_io_start(struct pt_regs *ctx)
{
    struct info_t *info = bpf_map_lookup_elem(&infobyreq, &ctx->di);
    u64 *count = bpf_map_lookup_elem(&counts, info);
    (*count)++;
}

Signed-off-by: Paul Chaignon <paul.chaignon@orange.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/verifier.c | 24 +++++++-----------------
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5dd1dcb902bf..eb1a596aebd3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1914,7 +1914,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 	if (arg_type == ARG_PTR_TO_MAP_KEY ||
 	    arg_type == ARG_PTR_TO_MAP_VALUE) {
 		expected_type = PTR_TO_STACK;
-		if (!type_is_pkt_pointer(type) &&
+		if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE &&
 		    type != expected_type)
 			goto err_type;
 	} else if (arg_type == ARG_CONST_SIZE ||
@@ -1966,14 +1966,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			verbose(env, "invalid map_ptr to access map->key\n");
 			return -EACCES;
 		}
-		if (type_is_pkt_pointer(type))
-			err = check_packet_access(env, regno, reg->off,
-						  meta->map_ptr->key_size,
-						  false);
-		else
-			err = check_stack_boundary(env, regno,
-						   meta->map_ptr->key_size,
-						   false, NULL);
+		err = check_helper_mem_access(env, regno,
+					      meta->map_ptr->key_size, false,
+					      NULL);
 	} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
 		/* bpf_map_xxx(..., map_ptr, ..., value) call:
 		 * check [value, value + map->value_size) validity
@@ -1983,14 +1978,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			verbose(env, "invalid map_ptr to access map->value\n");
 			return -EACCES;
 		}
-		if (type_is_pkt_pointer(type))
-			err = check_packet_access(env, regno, reg->off,
-						  meta->map_ptr->value_size,
-						  false);
-		else
-			err = check_stack_boundary(env, regno,
-						   meta->map_ptr->value_size,
-						   false, NULL);
+		err = check_helper_mem_access(env, regno,
+					      meta->map_ptr->value_size, false,
+					      NULL);
 	} else if (arg_type_is_mem_size(arg_type)) {
 		bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
 
-- 
cgit v1.2.3-59-g8ed1b


From 5f90dd6aaea48abc02b5ba0c4530f5c71bc1ab15 Mon Sep 17 00:00:00 2001
From: Paul Chaignon <paul.chaignon@orange.com>
Date: Tue, 24 Apr 2018 15:08:19 +0200
Subject: tools/bpf: add verifier tests for accesses to map values

This patch adds new test cases for accesses to map values from map
helpers.

Signed-off-by: Paul Chaignon <paul.chaignon@orange.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_verifier.c | 266 ++++++++++++++++++++++++++++
 1 file changed, 266 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 3e7718b1a9ae..165e9ddfa446 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -64,6 +64,7 @@ struct bpf_test {
 	struct bpf_insn	insns[MAX_INSNS];
 	int fixup_map1[MAX_FIXUPS];
 	int fixup_map2[MAX_FIXUPS];
+	int fixup_map3[MAX_FIXUPS];
 	int fixup_prog[MAX_FIXUPS];
 	int fixup_map_in_map[MAX_FIXUPS];
 	const char *errstr;
@@ -88,6 +89,11 @@ struct test_val {
 	int foo[MAX_ENTRIES];
 };
 
+struct other_val {
+	long long foo;
+	long long bar;
+};
+
 static struct bpf_test tests[] = {
 	{
 		"add+sub+mul",
@@ -5593,6 +5599,257 @@ static struct bpf_test tests[] = {
 		.errstr = "R1 min value is negative",
 		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
 	},
+	{
+		"map lookup helper access to map",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map3 = { 3, 8 },
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
+	{
+		"map update helper access to map",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),
+			BPF_MOV64_IMM(BPF_REG_4, 0),
+			BPF_MOV64_REG(BPF_REG_3, BPF_REG_0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_update_elem),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map3 = { 3, 10 },
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
+	{
+		"map update helper access to map: wrong size",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),
+			BPF_MOV64_IMM(BPF_REG_4, 0),
+			BPF_MOV64_REG(BPF_REG_3, BPF_REG_0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_update_elem),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.fixup_map3 = { 10 },
+		.result = REJECT,
+		.errstr = "invalid access to map value, value_size=8 off=0 size=16",
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
+	{
+		"map helper access to adjusted map (via const imm)",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2,
+				      offsetof(struct other_val, bar)),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map3 = { 3, 9 },
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
+	{
+		"map helper access to adjusted map (via const imm): out-of-bound 1",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2,
+				      sizeof(struct other_val) - 4),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map3 = { 3, 9 },
+		.result = REJECT,
+		.errstr = "invalid access to map value, value_size=16 off=12 size=8",
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
+	{
+		"map helper access to adjusted map (via const imm): out-of-bound 2",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map3 = { 3, 9 },
+		.result = REJECT,
+		.errstr = "invalid access to map value, value_size=16 off=-4 size=8",
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
+	{
+		"map helper access to adjusted map (via const reg)",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+			BPF_MOV64_IMM(BPF_REG_3,
+				      offsetof(struct other_val, bar)),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_3),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map3 = { 3, 10 },
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
+	{
+		"map helper access to adjusted map (via const reg): out-of-bound 1",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+			BPF_MOV64_IMM(BPF_REG_3,
+				      sizeof(struct other_val) - 4),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_3),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map3 = { 3, 10 },
+		.result = REJECT,
+		.errstr = "invalid access to map value, value_size=16 off=12 size=8",
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
+	{
+		"map helper access to adjusted map (via const reg): out-of-bound 2",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+			BPF_MOV64_IMM(BPF_REG_3, -4),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_3),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map3 = { 3, 10 },
+		.result = REJECT,
+		.errstr = "invalid access to map value, value_size=16 off=-4 size=8",
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
+	{
+		"map helper access to adjusted map (via variable)",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0),
+			BPF_JMP_IMM(BPF_JGT, BPF_REG_3,
+				    offsetof(struct other_val, bar), 4),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_3),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map3 = { 3, 11 },
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
+	{
+		"map helper access to adjusted map (via variable): no max check",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_3),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map3 = { 3, 10 },
+		.result = REJECT,
+		.errstr = "R2 unbounded memory access, make sure to bounds check any array access into a map",
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
+	{
+		"map helper access to adjusted map (via variable): wrong max check",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0),
+			BPF_JMP_IMM(BPF_JGT, BPF_REG_3,
+				    offsetof(struct other_val, bar) + 1, 4),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_3),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map3 = { 3, 11 },
+		.result = REJECT,
+		.errstr = "invalid access to map value, value_size=16 off=9 size=8",
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
 	{
 		"map element value is preserved across register spilling",
 		.insns = {
@@ -11533,6 +11790,7 @@ static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog,
 {
 	int *fixup_map1 = test->fixup_map1;
 	int *fixup_map2 = test->fixup_map2;
+	int *fixup_map3 = test->fixup_map3;
 	int *fixup_prog = test->fixup_prog;
 	int *fixup_map_in_map = test->fixup_map_in_map;
 
@@ -11556,6 +11814,14 @@ static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog,
 		} while (*fixup_map2);
 	}
 
+	if (*fixup_map3) {
+		map_fds[1] = create_map(sizeof(struct other_val), 1);
+		do {
+			prog[*fixup_map3].imm = map_fds[1];
+			fixup_map3++;
+		} while (*fixup_map3);
+	}
+
 	if (*fixup_prog) {
 		map_fds[2] = create_prog_array();
 		do {
-- 
cgit v1.2.3-59-g8ed1b


From b6fd9cf796e6dd0090807e2cdc279d1d71fdb6fa Mon Sep 17 00:00:00 2001
From: Anders Roxell <anders.roxell@linaro.org>
Date: Tue, 24 Apr 2018 00:53:05 +0200
Subject: selftests: bpf: update .gitignore with missing file

Fixes: c0fa1b6c3efc ("bpf: btf: Add BTF tests")
Signed-off-by: Anders Roxell <anders.roxell@linaro.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/.gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 9cf83f895d98..da19f0562bf8 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -12,3 +12,4 @@ test_tcpbpf_user
 test_verifier_log
 feature
 test_libbpf_open
+test_btf
-- 
cgit v1.2.3-59-g8ed1b


From 6595c7426acbcc2c4a79cfe409b2c840fb2ebed3 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Tue, 24 Apr 2018 14:45:04 -0700
Subject: tools/bpf: remove test_sock_addr from TEST_GEN_PROGS

Since test_sock_addr is not supposed to run by itself,
remove it from TEST_GEN_PROGS and add it to
TEST_GEN_PROGS_EXTENDED. This way, run_tests will
not run test_sock_addr. The corresponding test to run
is test_sock_addr.sh.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 0b72cc7596f1..64037ee2eed0 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -24,7 +24,7 @@ urandom_read: urandom_read.c
 # Order correspond to 'make run_tests' order
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
 	test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \
-	test_sock test_sock_addr test_btf
+	test_sock test_btf
 
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
 	test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
@@ -43,7 +43,7 @@ TEST_PROGS := test_kmod.sh \
 	test_sock_addr.sh
 
 # Compile but not part of 'make run_tests'
-TEST_GEN_PROGS_EXTENDED = test_libbpf_open
+TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr
 
 include ../lib.mk
 
-- 
cgit v1.2.3-59-g8ed1b


From 15f66a91a66bfc5d2ae95b9a3356a3fe114ca2f3 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 23 Apr 2018 14:30:28 -0700
Subject: bpf: sockmap, code sockmap_test in C

By moving sockmap_test from shell script into C we can run it directly
from selftests, but we can also push the input/output around in proper
structures.

However, keep the CLI options around because they are useful for
debugging when a paticular pattern of msghdr or sockmap options
trips up the sockmap code path.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/sockmap/sockmap_user.c | 209 ++++++++++++++++++++++-------------------
 1 file changed, 113 insertions(+), 96 deletions(-)

diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c
index 6f2334912283..649f06a5bfd2 100644
--- a/samples/sockmap/sockmap_user.c
+++ b/samples/sockmap/sockmap_user.c
@@ -52,6 +52,8 @@ void running_handler(int a);
 #define S1_PORT 10000
 #define S2_PORT 10001
 
+#define BPF_FILENAME "sockmap_kern.o"
+
 /* global sockets */
 int s1, s2, c1, c2, p1, p2;
 
@@ -226,6 +228,9 @@ struct sockmap_options {
 	bool sendpage;
 	bool data_test;
 	bool drop_expected;
+	int iov_count;
+	int iov_length;
+	int rate;
 };
 
 static int msg_loop_sendpage(int fd, int iov_length, int cnt,
@@ -409,12 +414,14 @@ static inline float recvdBps(struct msg_stats s)
 	return s.bytes_recvd / (s.end.tv_sec - s.start.tv_sec);
 }
 
-static int sendmsg_test(int iov_count, int iov_buf, int cnt,
-			struct sockmap_options *opt)
+static int sendmsg_test(struct sockmap_options *opt)
 {
 	float sent_Bps = 0, recvd_Bps = 0;
 	int rx_fd, txpid, rxpid, err = 0;
 	struct msg_stats s = {0};
+	int iov_count = opt->iov_count;
+	int iov_buf = opt->iov_length;
+	int cnt = opt->rate;
 	int status;
 
 	errno = 0;
@@ -568,98 +575,13 @@ enum {
 	SENDPAGE,
 };
 
-int main(int argc, char **argv)
+static int run_options(struct sockmap_options options, int cg_fd,  int test)
 {
-	int iov_count = 1, length = 1024, rate = 1, tx_prog_fd;
-	struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
-	int opt, longindex, err, cg_fd = 0;
-	struct sockmap_options options = {0};
-	int test = PING_PONG;
+	char *bpf_file = BPF_FILENAME;
+	int err, tx_prog_fd;
 	char filename[256];
 
-	while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:",
-				  long_options, &longindex)) != -1) {
-		switch (opt) {
-		case 's':
-			txmsg_start = atoi(optarg);
-			break;
-		case 'e':
-			txmsg_end = atoi(optarg);
-			break;
-		case 'a':
-			txmsg_apply = atoi(optarg);
-			break;
-		case 'k':
-			txmsg_cork = atoi(optarg);
-			break;
-		case 'c':
-			cg_fd = open(optarg, O_DIRECTORY, O_RDONLY);
-			if (cg_fd < 0) {
-				fprintf(stderr,
-					"ERROR: (%i) open cg path failed: %s\n",
-					cg_fd, optarg);
-				return cg_fd;
-			}
-			break;
-		case 'r':
-			rate = atoi(optarg);
-			break;
-		case 'v':
-			options.verbose = 1;
-			break;
-		case 'i':
-			iov_count = atoi(optarg);
-			break;
-		case 'l':
-			length = atoi(optarg);
-			break;
-		case 'd':
-			options.data_test = true;
-			break;
-		case 't':
-			if (strcmp(optarg, "ping") == 0) {
-				test = PING_PONG;
-			} else if (strcmp(optarg, "sendmsg") == 0) {
-				test = SENDMSG;
-			} else if (strcmp(optarg, "base") == 0) {
-				test = BASE;
-			} else if (strcmp(optarg, "base_sendpage") == 0) {
-				test = BASE_SENDPAGE;
-			} else if (strcmp(optarg, "sendpage") == 0) {
-				test = SENDPAGE;
-			} else {
-				usage(argv);
-				return -1;
-			}
-			break;
-		case 0:
-			break;
-		case 'h':
-		default:
-			usage(argv);
-			return -1;
-		}
-	}
-
-	if (!cg_fd) {
-		fprintf(stderr, "%s requires cgroup option: --cgroup <path>\n",
-			argv[0]);
-		return -1;
-	}
-
-	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
-		perror("setrlimit(RLIMIT_MEMLOCK)");
-		return 1;
-	}
-
-	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
-
-	running = 1;
-
-	/* catch SIGINT */
-	signal(SIGINT, running_handler);
-
-	if (load_bpf_file(filename)) {
+	if (load_bpf_file(bpf_file)) {
 		fprintf(stderr, "load_bpf_file: (%s) %s\n",
 			filename, strerror(errno));
 		return 1;
@@ -857,23 +779,23 @@ run:
 		options.drop_expected = true;
 
 	if (test == PING_PONG)
-		err = forever_ping_pong(rate, &options);
+		err = forever_ping_pong(options.rate, &options);
 	else if (test == SENDMSG) {
 		options.base = false;
 		options.sendpage = false;
-		err = sendmsg_test(iov_count, length, rate, &options);
+		err = sendmsg_test(&options);
 	} else if (test == SENDPAGE) {
 		options.base = false;
 		options.sendpage = true;
-		err = sendmsg_test(iov_count, length, rate, &options);
+		err = sendmsg_test(&options);
 	} else if (test == BASE) {
 		options.base = true;
 		options.sendpage = false;
-		err = sendmsg_test(iov_count, length, rate, &options);
+		err = sendmsg_test(&options);
 	} else if (test == BASE_SENDPAGE) {
 		options.base = true;
 		options.sendpage = true;
-		err = sendmsg_test(iov_count, length, rate, &options);
+		err = sendmsg_test(&options);
 	} else
 		fprintf(stderr, "unknown test\n");
 out:
@@ -888,6 +810,101 @@ out:
 	return err;
 }
 
+int main(int argc, char **argv)
+{
+	struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
+	int iov_count = 1, length = 1024, rate = 1;
+	struct sockmap_options options = {0};
+	int opt, longindex, cg_fd = 0;
+	int test = PING_PONG;
+
+	while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:",
+				  long_options, &longindex)) != -1) {
+		switch (opt) {
+		case 's':
+			txmsg_start = atoi(optarg);
+			break;
+		case 'e':
+			txmsg_end = atoi(optarg);
+			break;
+		case 'a':
+			txmsg_apply = atoi(optarg);
+			break;
+		case 'k':
+			txmsg_cork = atoi(optarg);
+			break;
+		case 'c':
+			cg_fd = open(optarg, O_DIRECTORY, O_RDONLY);
+			if (cg_fd < 0) {
+				fprintf(stderr,
+					"ERROR: (%i) open cg path failed: %s\n",
+					cg_fd, optarg);
+				return cg_fd;
+			}
+			break;
+		case 'r':
+			rate = atoi(optarg);
+			break;
+		case 'v':
+			options.verbose = 1;
+			break;
+		case 'i':
+			iov_count = atoi(optarg);
+			break;
+		case 'l':
+			length = atoi(optarg);
+			break;
+		case 'd':
+			options.data_test = true;
+			break;
+		case 't':
+			if (strcmp(optarg, "ping") == 0) {
+				test = PING_PONG;
+			} else if (strcmp(optarg, "sendmsg") == 0) {
+				test = SENDMSG;
+			} else if (strcmp(optarg, "base") == 0) {
+				test = BASE;
+			} else if (strcmp(optarg, "base_sendpage") == 0) {
+				test = BASE_SENDPAGE;
+			} else if (strcmp(optarg, "sendpage") == 0) {
+				test = SENDPAGE;
+			} else {
+				usage(argv);
+				return -1;
+			}
+			break;
+		case 0:
+			break;
+		case 'h':
+		default:
+			usage(argv);
+			return -1;
+		}
+	}
+
+	if (!cg_fd) {
+		fprintf(stderr, "%s requires cgroup option: --cgroup <path>\n",
+			argv[0]);
+		return -1;
+	}
+
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		perror("setrlimit(RLIMIT_MEMLOCK)");
+		return 1;
+	}
+
+	running = 1;
+
+	/* catch SIGINT */
+	signal(SIGINT, running_handler);
+
+	options.iov_count = iov_count;
+	options.iov_length = length;
+	options.rate = rate;
+
+	return run_options(options, cg_fd, test);
+}
+
 void running_handler(int a)
 {
 	running = 0;
-- 
cgit v1.2.3-59-g8ed1b


From 5d9ffeae5edc7ae5137059f90715de70c45af553 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 23 Apr 2018 14:30:33 -0700
Subject: bpf: sockmap, add a set of tests to run by default

If no options are passed to sockmap after this patch we run a set of
tests using various options and sendmsg/sendpage sizes. This replaces
the sockmap_test.sh script.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/sockmap/sockmap_user.c | 563 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 512 insertions(+), 51 deletions(-)

diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c
index 649f06a5bfd2..22595b43ecc5 100644
--- a/samples/sockmap/sockmap_user.c
+++ b/samples/sockmap/sockmap_user.c
@@ -25,6 +25,7 @@
 #include <fcntl.h>
 #include <sys/wait.h>
 #include <time.h>
+#include <sched.h>
 
 #include <sys/time.h>
 #include <sys/resource.h>
@@ -53,9 +54,11 @@ void running_handler(int a);
 #define S2_PORT 10001
 
 #define BPF_FILENAME "sockmap_kern.o"
+#define BPF_CGRP "/mnt/cgroup2/"
 
 /* global sockets */
 int s1, s2, c1, c2, p1, p2;
+int test_cnt;
 
 int txmsg_pass;
 int txmsg_noisy;
@@ -109,7 +112,7 @@ static void usage(char *argv[])
 	printf("\n");
 }
 
-static int sockmap_init_sockets(void)
+static int sockmap_init_sockets(int verbose)
 {
 	int i, err, one = 1;
 	struct sockaddr_in addr;
@@ -209,9 +212,11 @@ static int sockmap_init_sockets(void)
 		return errno;
 	}
 
-	printf("connected sockets: c1 <-> p1, c2 <-> p2\n");
-	printf("cgroups binding: c1(%i) <-> s1(%i) - - - c2(%i) <-> s2(%i)\n",
-		c1, s1, c2, s2);
+	if (verbose) {
+		printf("connected sockets: c1 <-> p1, c2 <-> p2\n");
+		printf("cgroups binding: c1(%i) <-> s1(%i) - - - c2(%i) <-> s2(%i)\n",
+			c1, s1, c2, s2);
+	}
 	return 0;
 }
 
@@ -329,10 +334,12 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 		clock_gettime(CLOCK_MONOTONIC, &s->end);
 	} else {
 		int slct, recv, max_fd = fd;
+		int fd_flags = O_NONBLOCK;
 		struct timeval timeout;
 		float total_bytes;
 		fd_set w;
 
+		fcntl(fd, fd_flags);
 		total_bytes = (float)iov_count * (float)iov_length * (float)cnt;
 		err = clock_gettime(CLOCK_MONOTONIC, &s->start);
 		if (err < 0)
@@ -351,7 +358,8 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 				clock_gettime(CLOCK_MONOTONIC, &s->end);
 				goto out_errno;
 			} else if (!slct) {
-				fprintf(stderr, "unexpected timeout\n");
+				if (opt->verbose)
+					fprintf(stderr, "unexpected timeout\n");
 				errno = -EIO;
 				clock_gettime(CLOCK_MONOTONIC, &s->end);
 				goto out_errno;
@@ -440,7 +448,7 @@ static int sendmsg_test(struct sockmap_options *opt)
 			iov_count = 1;
 		err = msg_loop(rx_fd, iov_count, iov_buf,
 			       cnt, &s, false, opt);
-		if (err)
+		if (err && opt->verbose)
 			fprintf(stderr,
 				"msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n",
 				iov_count, iov_buf, cnt, err);
@@ -450,10 +458,11 @@ static int sendmsg_test(struct sockmap_options *opt)
 			sent_Bps = sentBps(s);
 			recvd_Bps = recvdBps(s);
 		}
-		fprintf(stdout,
-			"rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s\n",
-			s.bytes_sent, sent_Bps, sent_Bps/giga,
-			s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
+		if (opt->verbose)
+			fprintf(stdout,
+				"rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s\n",
+				s.bytes_sent, sent_Bps, sent_Bps/giga,
+				s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
 		exit(1);
 	} else if (rxpid == -1) {
 		perror("msg_loop_rx: ");
@@ -477,10 +486,11 @@ static int sendmsg_test(struct sockmap_options *opt)
 			sent_Bps = sentBps(s);
 			recvd_Bps = recvdBps(s);
 		}
-		fprintf(stdout,
-			"tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB %fB/s %fGB/s\n",
-			s.bytes_sent, sent_Bps, sent_Bps/giga,
-			s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
+		if (opt->verbose)
+			fprintf(stdout,
+				"tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB %fB/s %fGB/s\n",
+				s.bytes_sent, sent_Bps, sent_Bps/giga,
+				s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
 		exit(1);
 	} else if (txpid == -1) {
 		perror("msg_loop_tx: ");
@@ -575,17 +585,9 @@ enum {
 	SENDPAGE,
 };
 
-static int run_options(struct sockmap_options options, int cg_fd,  int test)
+static int run_options(struct sockmap_options *options, int cg_fd,  int test)
 {
-	char *bpf_file = BPF_FILENAME;
-	int err, tx_prog_fd;
-	char filename[256];
-
-	if (load_bpf_file(bpf_file)) {
-		fprintf(stderr, "load_bpf_file: (%s) %s\n",
-			filename, strerror(errno));
-		return 1;
-	}
+	int i, key, next_key, err, tx_prog_fd = -1, zero = 0;
 
 	/* If base test skip BPF setup */
 	if (test == BASE || test == BASE_SENDPAGE)
@@ -617,7 +619,7 @@ static int run_options(struct sockmap_options options, int cg_fd,  int test)
 	}
 
 run:
-	err = sockmap_init_sockets();
+	err = sockmap_init_sockets(options->verbose);
 	if (err) {
 		fprintf(stderr, "ERROR: test socket failed: %d\n", err);
 		goto out;
@@ -651,7 +653,7 @@ run:
 			fprintf(stderr,
 				"ERROR: bpf_prog_attach (txmsg): %d (%s)\n",
 				err, strerror(errno));
-			return err;
+			goto out;
 		}
 
 		err = bpf_map_update_elem(map_fd[1], &i, &c1, BPF_ANY);
@@ -659,7 +661,7 @@ run:
 			fprintf(stderr,
 				"ERROR: bpf_map_update_elem (txmsg):  %d (%s\n",
 				err, strerror(errno));
-			return err;
+			goto out;
 		}
 
 		if (txmsg_redir || txmsg_redir_noisy)
@@ -672,7 +674,7 @@ run:
 			fprintf(stderr,
 				"ERROR: bpf_map_update_elem (txmsg):  %d (%s\n",
 				err, strerror(errno));
-			return err;
+			goto out;
 		}
 
 		if (txmsg_apply) {
@@ -682,7 +684,7 @@ run:
 				fprintf(stderr,
 					"ERROR: bpf_map_update_elem (apply_bytes):  %d (%s\n",
 					err, strerror(errno));
-				return err;
+				goto out;
 			}
 		}
 
@@ -693,7 +695,7 @@ run:
 				fprintf(stderr,
 					"ERROR: bpf_map_update_elem (cork_bytes):  %d (%s\n",
 					err, strerror(errno));
-				return err;
+				goto out;
 			}
 		}
 
@@ -704,7 +706,7 @@ run:
 				fprintf(stderr,
 					"ERROR: bpf_map_update_elem (txmsg_start):  %d (%s)\n",
 					err, strerror(errno));
-				return err;
+				goto out;
 			}
 		}
 
@@ -716,7 +718,7 @@ run:
 				fprintf(stderr,
 					"ERROR: bpf_map_update_elem (txmsg_end):  %d (%s)\n",
 					err, strerror(errno));
-				return err;
+				goto out;
 			}
 		}
 
@@ -776,36 +778,484 @@ run:
 	}
 
 	if (txmsg_drop)
-		options.drop_expected = true;
+		options->drop_expected = true;
 
 	if (test == PING_PONG)
-		err = forever_ping_pong(options.rate, &options);
+		err = forever_ping_pong(options->rate, options);
 	else if (test == SENDMSG) {
-		options.base = false;
-		options.sendpage = false;
-		err = sendmsg_test(&options);
+		options->base = false;
+		options->sendpage = false;
+		err = sendmsg_test(options);
 	} else if (test == SENDPAGE) {
-		options.base = false;
-		options.sendpage = true;
-		err = sendmsg_test(&options);
+		options->base = false;
+		options->sendpage = true;
+		err = sendmsg_test(options);
 	} else if (test == BASE) {
-		options.base = true;
-		options.sendpage = false;
-		err = sendmsg_test(&options);
+		options->base = true;
+		options->sendpage = false;
+		err = sendmsg_test(options);
 	} else if (test == BASE_SENDPAGE) {
-		options.base = true;
-		options.sendpage = true;
-		err = sendmsg_test(&options);
+		options->base = true;
+		options->sendpage = true;
+		err = sendmsg_test(options);
 	} else
 		fprintf(stderr, "unknown test\n");
 out:
+	/* Detatch and zero all the maps */
 	bpf_prog_detach2(prog_fd[2], cg_fd, BPF_CGROUP_SOCK_OPS);
+	bpf_prog_detach2(prog_fd[0], map_fd[0], BPF_SK_SKB_STREAM_PARSER);
+	bpf_prog_detach2(prog_fd[1], map_fd[0], BPF_SK_SKB_STREAM_VERDICT);
+	if (tx_prog_fd >= 0)
+		bpf_prog_detach2(tx_prog_fd, map_fd[1], BPF_SK_MSG_VERDICT);
+
+	for (i = 0; i < 8; i++) {
+		key = next_key = 0;
+		bpf_map_update_elem(map_fd[i], &key, &zero, BPF_ANY);
+		while (bpf_map_get_next_key(map_fd[i], &key, &next_key) == 0) {
+			bpf_map_update_elem(map_fd[i], &key, &zero, BPF_ANY);
+			key = next_key;
+		}
+	}
+
 	close(s1);
 	close(s2);
 	close(p1);
 	close(p2);
 	close(c1);
 	close(c2);
+	return err;
+}
+
+static char *test_to_str(int test)
+{
+	switch (test) {
+	case SENDMSG:
+		return "sendmsg";
+	case SENDPAGE:
+		return "sendpage";
+	}
+	return "unknown";
+}
+
+#define OPTSTRING 60
+static void test_options(char *options)
+{
+	memset(options, 0, OPTSTRING);
+
+	if (txmsg_pass)
+		strncat(options, "pass,", OPTSTRING);
+	if (txmsg_noisy)
+		strncat(options, "pass_noisy,", OPTSTRING);
+	if (txmsg_redir)
+		strncat(options, "redir,", OPTSTRING);
+	if (txmsg_redir_noisy)
+		strncat(options, "redir_noisy,", OPTSTRING);
+	if (txmsg_drop)
+		strncat(options, "drop,", OPTSTRING);
+	if (txmsg_apply)
+		strncat(options, "apply,", OPTSTRING);
+	if (txmsg_cork)
+		strncat(options, "cork,", OPTSTRING);
+	if (txmsg_start)
+		strncat(options, "start,", OPTSTRING);
+	if (txmsg_end)
+		strncat(options, "end,", OPTSTRING);
+	if (txmsg_ingress)
+		strncat(options, "ingress,", OPTSTRING);
+	if (txmsg_skb)
+		strncat(options, "skb,", OPTSTRING);
+}
+
+static int __test_exec(int cgrp, int test, struct sockmap_options *opt)
+{
+	char *options = calloc(60, sizeof(char));
+	int err;
+
+	if (test == SENDPAGE)
+		opt->sendpage = true;
+	else
+		opt->sendpage = false;
+
+	if (txmsg_drop)
+		opt->drop_expected = true;
+	else
+		opt->drop_expected = false;
+
+	test_options(options);
+
+	fprintf(stdout,
+		"[TEST %i]: (%i, %i, %i, %s, %s): ",
+		test_cnt, opt->rate, opt->iov_count, opt->iov_length,
+		test_to_str(test), options);
+	fflush(stdout);
+	err = run_options(opt, cgrp, test);
+	fprintf(stdout, "%s\n", !err ? "PASS" : "FAILED");
+	test_cnt++;
+	free(options);
+	return err;
+}
+
+static int test_exec(int cgrp, struct sockmap_options *opt)
+{
+	int err = __test_exec(cgrp, SENDMSG, opt);
+
+	sched_yield();
+	if (err)
+		goto out;
+
+	err = __test_exec(cgrp, SENDPAGE, opt);
+	sched_yield();
+out:
+	return err;
+}
+
+static int test_loop(int cgrp)
+{
+	struct sockmap_options opt;
+
+	int err, i, l, r;
+
+	opt.verbose = 0;
+	opt.base = false;
+	opt.sendpage = false;
+	opt.data_test = false;
+	opt.drop_expected = false;
+	opt.iov_count = 0;
+	opt.iov_length = 0;
+	opt.rate = 0;
+
+	for (r = 1; r < 100; r += 33) {
+		for (i = 1; i < 100; i += 33) {
+			for (l = 1; l < 100; l += 33) {
+				opt.rate = r;
+				opt.iov_count = i;
+				opt.iov_length = l;
+				err = test_exec(cgrp, &opt);
+				if (err)
+					goto out;
+			}
+		}
+	}
+
+out:
+	return err;
+}
+
+static int test_txmsg(int cgrp)
+{
+	int err;
+
+	txmsg_pass = txmsg_noisy = txmsg_redir_noisy = txmsg_drop = 0;
+	txmsg_apply = txmsg_cork = 0;
+	txmsg_ingress = txmsg_skb = 0;
+
+	txmsg_pass = 1;
+	err = test_loop(cgrp);
+	txmsg_pass = 0;
+	if (err)
+		goto out;
+
+	txmsg_redir = 1;
+	err = test_loop(cgrp);
+	txmsg_redir = 0;
+	if (err)
+		goto out;
+
+	txmsg_drop = 1;
+	err = test_loop(cgrp);
+	txmsg_drop = 0;
+	if (err)
+		goto out;
+
+	txmsg_redir = 1;
+	txmsg_ingress = 1;
+	err = test_loop(cgrp);
+	txmsg_redir = 0;
+	txmsg_ingress = 0;
+	if (err)
+		goto out;
+out:
+	txmsg_pass = 0;
+	txmsg_redir = 0;
+	txmsg_drop = 0;
+	return err;
+}
+
+static int test_send(struct sockmap_options *opt, int cgrp)
+{
+	int err;
+
+	opt->iov_length = 1;
+	opt->iov_count = 1;
+	opt->rate = 1;
+	err = test_exec(cgrp, opt);
+	if (err)
+		goto out;
+
+	opt->iov_length = 1;
+	opt->iov_count = 1024;
+	opt->rate = 1;
+	err = test_exec(cgrp, opt);
+	if (err)
+		goto out;
+
+	opt->iov_length = 1024;
+	opt->iov_count = 1;
+	opt->rate = 1;
+	err = test_exec(cgrp, opt);
+	if (err)
+		goto out;
+
+	opt->iov_length = 1;
+	opt->iov_count = 1;
+	opt->rate = 1024;
+	err = test_exec(cgrp, opt);
+	if (err)
+		goto out;
+
+	opt->iov_length = 256;
+	opt->iov_count = 1024;
+	opt->rate = 10;
+	err = test_exec(cgrp, opt);
+	if (err)
+		goto out;
+
+	opt->rate = 100;
+	opt->iov_count = 1;
+	opt->iov_length = 5;
+	err = test_exec(cgrp, opt);
+	if (err)
+		goto out;
+out:
+	return err;
+}
+
+static int test_mixed(int cgrp)
+{
+	struct sockmap_options opt = {0};
+	int err;
+
+	txmsg_pass = txmsg_noisy = txmsg_redir_noisy = txmsg_drop = 0;
+	txmsg_apply = txmsg_cork = 0;
+	txmsg_start = txmsg_end = 0;
+	/* Test small and large iov_count values with pass/redir/apply/cork */
+	txmsg_pass = 1;
+	txmsg_redir = 0;
+	txmsg_apply = 1;
+	txmsg_cork = 0;
+	err = test_send(&opt, cgrp);
+	if (err)
+		goto out;
+
+	txmsg_pass = 1;
+	txmsg_redir = 0;
+	txmsg_apply = 0;
+	txmsg_cork = 1;
+	err = test_send(&opt, cgrp);
+	if (err)
+		goto out;
+
+	txmsg_pass = 1;
+	txmsg_redir = 0;
+	txmsg_apply = 1;
+	txmsg_cork = 1;
+	err = test_send(&opt, cgrp);
+	if (err)
+		goto out;
+
+	txmsg_pass = 1;
+	txmsg_redir = 0;
+	txmsg_apply = 1024;
+	txmsg_cork = 0;
+	err = test_send(&opt, cgrp);
+	if (err)
+		goto out;
+
+	txmsg_pass = 1;
+	txmsg_redir = 0;
+	txmsg_apply = 0;
+	txmsg_cork = 1024;
+	err = test_send(&opt, cgrp);
+	if (err)
+		goto out;
+
+	txmsg_pass = 1;
+	txmsg_redir = 0;
+	txmsg_apply = 1024;
+	txmsg_cork = 1024;
+	err = test_send(&opt, cgrp);
+	if (err)
+		goto out;
+
+	txmsg_pass = 1;
+	txmsg_redir = 0;
+	txmsg_cork = 4096;
+	txmsg_apply = 4096;
+	err = test_send(&opt, cgrp);
+	if (err)
+		goto out;
+
+	txmsg_pass = 0;
+	txmsg_redir = 1;
+	txmsg_apply = 1;
+	txmsg_cork = 0;
+	err = test_send(&opt, cgrp);
+	if (err)
+		goto out;
+
+	txmsg_pass = 0;
+	txmsg_redir = 1;
+	txmsg_apply = 0;
+	txmsg_cork = 1;
+	err = test_send(&opt, cgrp);
+	if (err)
+		goto out;
+
+	txmsg_pass = 0;
+	txmsg_redir = 1;
+	txmsg_apply = 1024;
+	txmsg_cork = 0;
+	err = test_send(&opt, cgrp);
+	if (err)
+		goto out;
+
+	txmsg_pass = 0;
+	txmsg_redir = 1;
+	txmsg_apply = 0;
+	txmsg_cork = 1024;
+	err = test_send(&opt, cgrp);
+	if (err)
+		goto out;
+
+	txmsg_pass = 0;
+	txmsg_redir = 1;
+	txmsg_apply = 1024;
+	txmsg_cork = 1024;
+	err = test_send(&opt, cgrp);
+	if (err)
+		goto out;
+
+	txmsg_pass = 0;
+	txmsg_redir = 1;
+	txmsg_cork = 4096;
+	txmsg_apply = 4096;
+	err = test_send(&opt, cgrp);
+	if (err)
+		goto out;
+out:
+	return err;
+}
+
+static int test_start_end(int cgrp)
+{
+	struct sockmap_options opt = {0};
+	int err, i;
+
+	/* Test basic start/end with lots of iov_count and iov_lengths */
+	txmsg_start = 1;
+	txmsg_end = 2;
+	err = test_txmsg(cgrp);
+	if (err)
+		goto out;
+
+	/* Test start/end with cork */
+	opt.rate = 16;
+	opt.iov_count = 1;
+	opt.iov_length = 100;
+	txmsg_cork = 1600;
+
+	for (i = 99; i <= 1600; i += 100) {
+		txmsg_start = 0;
+		txmsg_end = i;
+		err = test_exec(cgrp, &opt);
+		if (err)
+			goto out;
+	}
+
+	/* Test start/end with cork but pull data in middle */
+	for (i = 199; i <= 1600; i += 100) {
+		txmsg_start = 100;
+		txmsg_end = i;
+		err = test_exec(cgrp, &opt);
+		if (err)
+			goto out;
+	}
+
+	/* Test start/end with cork pulling last sg entry */
+	txmsg_start = 1500;
+	txmsg_end = 1600;
+	err = test_exec(cgrp, &opt);
+	if (err)
+		goto out;
+
+	/* Test start/end pull of single byte in last page */
+	txmsg_start = 1111;
+	txmsg_end = 1112;
+	err = test_exec(cgrp, &opt);
+	if (err)
+		goto out;
+
+	/* Test start/end with end < start */
+	txmsg_start = 1111;
+	txmsg_end = 0;
+	err = test_exec(cgrp, &opt);
+	if (err)
+		goto out;
+
+	/* Test start/end with end > data */
+	txmsg_start = 0;
+	txmsg_end = 1601;
+	err = test_exec(cgrp, &opt);
+	if (err)
+		goto out;
+
+	/* Test start/end with start > data */
+	txmsg_start = 1601;
+	txmsg_end = 1600;
+	err = test_exec(cgrp, &opt);
+
+out:
+	txmsg_start = 0;
+	txmsg_end = 0;
+	return err;
+}
+
+static int test_suite(void)
+{
+	char *bpf_file = BPF_FILENAME;
+	int cg_fd, err;
+
+	cg_fd = open(BPF_CGRP, O_DIRECTORY, O_RDONLY);
+	if (cg_fd < 0) {
+		fprintf(stderr,
+			"ERROR: (%i) open cg path failed: %s\n",
+			cg_fd, optarg);
+		return cg_fd;
+	}
+
+	if (load_bpf_file(bpf_file)) {
+		fprintf(stderr, "load_bpf_file: (%s) %s\n",
+			bpf_file, strerror(errno));
+		return 1;
+	}
+
+	/* Tests basic commands and APIs with range of iov values */
+	txmsg_start = txmsg_end = 0;
+	err = test_txmsg(cg_fd);
+	if (err)
+		goto out;
+
+	/* Tests interesting combinations of APIs used together */
+	err = test_mixed(cg_fd);
+	if (err)
+		goto out;
+
+	/* Tests pull_data API using start/end API */
+	err = test_start_end(cg_fd);
+	if (err)
+		goto out;
+
+out:
 	close(cg_fd);
 	return err;
 }
@@ -815,9 +1265,18 @@ int main(int argc, char **argv)
 	struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
 	int iov_count = 1, length = 1024, rate = 1;
 	struct sockmap_options options = {0};
-	int opt, longindex, cg_fd = 0;
+	int opt, longindex, err, cg_fd = 0;
+	char *bpf_file = BPF_FILENAME;
 	int test = PING_PONG;
 
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		perror("setrlimit(RLIMIT_MEMLOCK)");
+		return 1;
+	}
+
+	if (argc < 2)
+		return test_suite();
+
 	while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:",
 				  long_options, &longindex)) != -1) {
 		switch (opt) {
@@ -888,11 +1347,11 @@ int main(int argc, char **argv)
 		return -1;
 	}
 
-	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
-		perror("setrlimit(RLIMIT_MEMLOCK)");
+	if (load_bpf_file(bpf_file)) {
+		fprintf(stderr, "load_bpf_file: (%s) %s\n",
+			bpf_file, strerror(errno));
 		return 1;
 	}
-
 	running = 1;
 
 	/* catch SIGINT */
@@ -902,7 +1361,9 @@ int main(int argc, char **argv)
 	options.iov_length = length;
 	options.rate = rate;
 
-	return run_options(options, cg_fd, test);
+	err = run_options(&options, cg_fd, test);
+	close(cg_fd);
+	return err;
 }
 
 void running_handler(int a)
-- 
cgit v1.2.3-59-g8ed1b


From 16962b2404ac88cde0281fe2176d6ae3820ed320 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 23 Apr 2018 14:30:38 -0700
Subject: bpf: sockmap, add selftests

This adds a new test program test_sockmap which is the old sample
sockmap program. By moving the sample program here we can now run it
as part of the self tests suite. To support this a populate_progs()
routine is added to load programs and maps which was previously done
with load_bpf_file(). This is needed because self test libs do not
provide a similar routine. Also we now use the cgroup_helpers
routines to manage cgroup use instead of manually creating one and
supplying it to the CLI.

Notice we keep the CLI around though because it is useful for dbg
and specialized testing.

To run use ./test_sockmap and the result should be,

Summary 660 PASSED, 0 SKIPPED, 0 FAILED

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/include/uapi/linux/bpf.h                  |    1 +
 tools/include/uapi/linux/if_link.h              |   39 +
 tools/lib/bpf/libbpf.c                          |    4 +-
 tools/lib/bpf/libbpf.h                          |    2 +
 tools/testing/selftests/bpf/Makefile            |    5 +-
 tools/testing/selftests/bpf/test_sockmap.c      | 1465 +++++++++++++++++++++++
 tools/testing/selftests/bpf/test_sockmap_kern.c |  340 ++++++
 7 files changed, 1852 insertions(+), 4 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_sockmap.c
 create mode 100644 tools/testing/selftests/bpf/test_sockmap_kern.c

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 5841ed41b30c..e6679393b687 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -894,6 +894,7 @@ enum bpf_func_id {
 /* BPF_FUNC_skb_set_tunnel_key flags. */
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
 #define BPF_F_DONT_FRAGMENT		(1ULL << 2)
+#define BPF_F_SEQ_NUMBER		(1ULL << 3)
 
 /* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and
  * BPF_FUNC_perf_event_read_value flags.
diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
index 6d9447700e18..68699f654118 100644
--- a/tools/include/uapi/linux/if_link.h
+++ b/tools/include/uapi/linux/if_link.h
@@ -941,4 +941,43 @@ enum {
 	IFLA_EVENT_BONDING_OPTIONS,	/* change in bonding options */
 };
 
+/* tun section */
+
+enum {
+	IFLA_TUN_UNSPEC,
+	IFLA_TUN_OWNER,
+	IFLA_TUN_GROUP,
+	IFLA_TUN_TYPE,
+	IFLA_TUN_PI,
+	IFLA_TUN_VNET_HDR,
+	IFLA_TUN_PERSIST,
+	IFLA_TUN_MULTI_QUEUE,
+	IFLA_TUN_NUM_QUEUES,
+	IFLA_TUN_NUM_DISABLED_QUEUES,
+	__IFLA_TUN_MAX,
+};
+
+#define IFLA_TUN_MAX (__IFLA_TUN_MAX - 1)
+
+/* rmnet section */
+
+#define RMNET_FLAGS_INGRESS_DEAGGREGATION         (1U << 0)
+#define RMNET_FLAGS_INGRESS_MAP_COMMANDS          (1U << 1)
+#define RMNET_FLAGS_INGRESS_MAP_CKSUMV4           (1U << 2)
+#define RMNET_FLAGS_EGRESS_MAP_CKSUMV4            (1U << 3)
+
+enum {
+	IFLA_RMNET_UNSPEC,
+	IFLA_RMNET_MUX_ID,
+	IFLA_RMNET_FLAGS,
+	__IFLA_RMNET_MAX,
+};
+
+#define IFLA_RMNET_MAX	(__IFLA_RMNET_MAX - 1)
+
+struct ifla_rmnet_flags {
+	__u32	flags;
+	__u32	mask;
+};
+
 #endif /* _UAPI_LINUX_IF_LINK_H */
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 6513e0b08795..7bcdca13083a 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1961,8 +1961,8 @@ BPF_PROG_TYPE_FNS(raw_tracepoint, BPF_PROG_TYPE_RAW_TRACEPOINT);
 BPF_PROG_TYPE_FNS(xdp, BPF_PROG_TYPE_XDP);
 BPF_PROG_TYPE_FNS(perf_event, BPF_PROG_TYPE_PERF_EVENT);
 
-static void bpf_program__set_expected_attach_type(struct bpf_program *prog,
-						 enum bpf_attach_type type)
+void bpf_program__set_expected_attach_type(struct bpf_program *prog,
+					   enum bpf_attach_type type)
 {
 	prog->expected_attach_type = type;
 }
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index d6ac4fa6f472..197f9ce2248c 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -193,6 +193,8 @@ int bpf_program__set_sched_act(struct bpf_program *prog);
 int bpf_program__set_xdp(struct bpf_program *prog);
 int bpf_program__set_perf_event(struct bpf_program *prog);
 void bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type);
+void bpf_program__set_expected_attach_type(struct bpf_program *prog,
+					   enum bpf_attach_type type);
 
 bool bpf_program__is_socket_filter(struct bpf_program *prog);
 bool bpf_program__is_tracepoint(struct bpf_program *prog);
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 64037ee2eed0..0c19d5e08f08 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -24,7 +24,7 @@ urandom_read: urandom_read.c
 # Order correspond to 'make run_tests' order
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
 	test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \
-	test_sock test_btf
+	test_sock test_btf test_sockmap
 
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
 	test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
@@ -32,7 +32,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \
 	sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \
 	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \
-	test_btf_haskv.o test_btf_nokv.o
+	test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
@@ -56,6 +56,7 @@ $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/libbpf.a
 $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c
 $(OUTPUT)/test_sock: cgroup_helpers.c
 $(OUTPUT)/test_sock_addr: cgroup_helpers.c
+$(OUTPUT)/test_sockmap: cgroup_helpers.c
 
 .PHONY: force
 
diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
new file mode 100644
index 000000000000..6d63a1cdc649
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -0,0 +1,1465 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2017-2018 Covalent IO, Inc. http://covalent.io
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/select.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/ioctl.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <fcntl.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <sched.h>
+
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/types.h>
+#include <sys/sendfile.h>
+
+#include <linux/netlink.h>
+#include <linux/socket.h>
+#include <linux/sock_diag.h>
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <assert.h>
+#include <libgen.h>
+
+#include <getopt.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "bpf_util.h"
+#include "bpf_rlimit.h"
+#include "cgroup_helpers.h"
+
+int running;
+static void running_handler(int a);
+
+/* randomly selected ports for testing on lo */
+#define S1_PORT 10000
+#define S2_PORT 10001
+
+#define BPF_FILENAME "test_sockmap_kern.o"
+#define CG_PATH "/sockmap"
+
+/* global sockets */
+int s1, s2, c1, c2, p1, p2;
+int test_cnt;
+int passed;
+int failed;
+int map_fd[8];
+struct bpf_map *maps[8];
+int prog_fd[11];
+
+int txmsg_pass;
+int txmsg_noisy;
+int txmsg_redir;
+int txmsg_redir_noisy;
+int txmsg_drop;
+int txmsg_apply;
+int txmsg_cork;
+int txmsg_start;
+int txmsg_end;
+int txmsg_ingress;
+int txmsg_skb;
+
+static const struct option long_options[] = {
+	{"help",	no_argument,		NULL, 'h' },
+	{"cgroup",	required_argument,	NULL, 'c' },
+	{"rate",	required_argument,	NULL, 'r' },
+	{"verbose",	no_argument,		NULL, 'v' },
+	{"iov_count",	required_argument,	NULL, 'i' },
+	{"length",	required_argument,	NULL, 'l' },
+	{"test",	required_argument,	NULL, 't' },
+	{"data_test",   no_argument,		NULL, 'd' },
+	{"txmsg",		no_argument,	&txmsg_pass,  1  },
+	{"txmsg_noisy",		no_argument,	&txmsg_noisy, 1  },
+	{"txmsg_redir",		no_argument,	&txmsg_redir, 1  },
+	{"txmsg_redir_noisy",	no_argument,	&txmsg_redir_noisy, 1},
+	{"txmsg_drop",		no_argument,	&txmsg_drop, 1 },
+	{"txmsg_apply",	required_argument,	NULL, 'a'},
+	{"txmsg_cork",	required_argument,	NULL, 'k'},
+	{"txmsg_start", required_argument,	NULL, 's'},
+	{"txmsg_end",	required_argument,	NULL, 'e'},
+	{"txmsg_ingress", no_argument,		&txmsg_ingress, 1 },
+	{"txmsg_skb", no_argument,		&txmsg_skb, 1 },
+	{0, 0, NULL, 0 }
+};
+
+static void usage(char *argv[])
+{
+	int i;
+
+	printf(" Usage: %s --cgroup <cgroup_path>\n", argv[0]);
+	printf(" options:\n");
+	for (i = 0; long_options[i].name != 0; i++) {
+		printf(" --%-12s", long_options[i].name);
+		if (long_options[i].flag != NULL)
+			printf(" flag (internal value:%d)\n",
+				*long_options[i].flag);
+		else
+			printf(" -%c\n", long_options[i].val);
+	}
+	printf("\n");
+}
+
+static int sockmap_init_sockets(int verbose)
+{
+	int i, err, one = 1;
+	struct sockaddr_in addr;
+	int *fds[4] = {&s1, &s2, &c1, &c2};
+
+	s1 = s2 = p1 = p2 = c1 = c2 = 0;
+
+	/* Init sockets */
+	for (i = 0; i < 4; i++) {
+		*fds[i] = socket(AF_INET, SOCK_STREAM, 0);
+		if (*fds[i] < 0) {
+			perror("socket s1 failed()");
+			return errno;
+		}
+	}
+
+	/* Allow reuse */
+	for (i = 0; i < 2; i++) {
+		err = setsockopt(*fds[i], SOL_SOCKET, SO_REUSEADDR,
+				 (char *)&one, sizeof(one));
+		if (err) {
+			perror("setsockopt failed()");
+			return errno;
+		}
+	}
+
+	/* Non-blocking sockets */
+	for (i = 0; i < 2; i++) {
+		err = ioctl(*fds[i], FIONBIO, (char *)&one);
+		if (err < 0) {
+			perror("ioctl s1 failed()");
+			return errno;
+		}
+	}
+
+	/* Bind server sockets */
+	memset(&addr, 0, sizeof(struct sockaddr_in));
+	addr.sin_family = AF_INET;
+	addr.sin_addr.s_addr = inet_addr("127.0.0.1");
+
+	addr.sin_port = htons(S1_PORT);
+	err = bind(s1, (struct sockaddr *)&addr, sizeof(addr));
+	if (err < 0) {
+		perror("bind s1 failed()\n");
+		return errno;
+	}
+
+	addr.sin_port = htons(S2_PORT);
+	err = bind(s2, (struct sockaddr *)&addr, sizeof(addr));
+	if (err < 0) {
+		perror("bind s2 failed()\n");
+		return errno;
+	}
+
+	/* Listen server sockets */
+	addr.sin_port = htons(S1_PORT);
+	err = listen(s1, 32);
+	if (err < 0) {
+		perror("listen s1 failed()\n");
+		return errno;
+	}
+
+	addr.sin_port = htons(S2_PORT);
+	err = listen(s2, 32);
+	if (err < 0) {
+		perror("listen s1 failed()\n");
+		return errno;
+	}
+
+	/* Initiate Connect */
+	addr.sin_port = htons(S1_PORT);
+	err = connect(c1, (struct sockaddr *)&addr, sizeof(addr));
+	if (err < 0 && errno != EINPROGRESS) {
+		perror("connect c1 failed()\n");
+		return errno;
+	}
+
+	addr.sin_port = htons(S2_PORT);
+	err = connect(c2, (struct sockaddr *)&addr, sizeof(addr));
+	if (err < 0 && errno != EINPROGRESS) {
+		perror("connect c2 failed()\n");
+		return errno;
+	} else if (err < 0) {
+		err = 0;
+	}
+
+	/* Accept Connecrtions */
+	p1 = accept(s1, NULL, NULL);
+	if (p1 < 0) {
+		perror("accept s1 failed()\n");
+		return errno;
+	}
+
+	p2 = accept(s2, NULL, NULL);
+	if (p2 < 0) {
+		perror("accept s1 failed()\n");
+		return errno;
+	}
+
+	if (verbose) {
+		printf("connected sockets: c1 <-> p1, c2 <-> p2\n");
+		printf("cgroups binding: c1(%i) <-> s1(%i) - - - c2(%i) <-> s2(%i)\n",
+			c1, s1, c2, s2);
+	}
+	return 0;
+}
+
+struct msg_stats {
+	size_t bytes_sent;
+	size_t bytes_recvd;
+	struct timespec start;
+	struct timespec end;
+};
+
+struct sockmap_options {
+	int verbose;
+	bool base;
+	bool sendpage;
+	bool data_test;
+	bool drop_expected;
+	int iov_count;
+	int iov_length;
+	int rate;
+};
+
+static int msg_loop_sendpage(int fd, int iov_length, int cnt,
+			     struct msg_stats *s,
+			     struct sockmap_options *opt)
+{
+	bool drop = opt->drop_expected;
+	unsigned char k = 0;
+	FILE *file;
+	int i, fp;
+
+	file = fopen(".sendpage_tst.tmp", "w+");
+	for (i = 0; i < iov_length * cnt; i++, k++)
+		fwrite(&k, sizeof(char), 1, file);
+	fflush(file);
+	fseek(file, 0, SEEK_SET);
+	fclose(file);
+
+	fp = open(".sendpage_tst.tmp", O_RDONLY);
+	clock_gettime(CLOCK_MONOTONIC, &s->start);
+	for (i = 0; i < cnt; i++) {
+		int sent = sendfile(fd, fp, NULL, iov_length);
+
+		if (!drop && sent < 0) {
+			perror("send loop error:");
+			close(fp);
+			return sent;
+		} else if (drop && sent >= 0) {
+			printf("sendpage loop error expected: %i\n", sent);
+			close(fp);
+			return -EIO;
+		}
+
+		if (sent > 0)
+			s->bytes_sent += sent;
+	}
+	clock_gettime(CLOCK_MONOTONIC, &s->end);
+	close(fp);
+	return 0;
+}
+
+static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
+		    struct msg_stats *s, bool tx,
+		    struct sockmap_options *opt)
+{
+	struct msghdr msg = {0};
+	int err, i, flags = MSG_NOSIGNAL;
+	struct iovec *iov;
+	unsigned char k;
+	bool data_test = opt->data_test;
+	bool drop = opt->drop_expected;
+
+	iov = calloc(iov_count, sizeof(struct iovec));
+	if (!iov)
+		return errno;
+
+	k = 0;
+	for (i = 0; i < iov_count; i++) {
+		unsigned char *d = calloc(iov_length, sizeof(char));
+
+		if (!d) {
+			fprintf(stderr, "iov_count %i/%i OOM\n", i, iov_count);
+			goto out_errno;
+		}
+		iov[i].iov_base = d;
+		iov[i].iov_len = iov_length;
+
+		if (data_test && tx) {
+			int j;
+
+			for (j = 0; j < iov_length; j++)
+				d[j] = k++;
+		}
+	}
+
+	msg.msg_iov = iov;
+	msg.msg_iovlen = iov_count;
+	k = 0;
+
+	if (tx) {
+		clock_gettime(CLOCK_MONOTONIC, &s->start);
+		for (i = 0; i < cnt; i++) {
+			int sent = sendmsg(fd, &msg, flags);
+
+			if (!drop && sent < 0) {
+				perror("send loop error:");
+				goto out_errno;
+			} else if (drop && sent >= 0) {
+				printf("send loop error expected: %i\n", sent);
+				errno = -EIO;
+				goto out_errno;
+			}
+			if (sent > 0)
+				s->bytes_sent += sent;
+		}
+		clock_gettime(CLOCK_MONOTONIC, &s->end);
+	} else {
+		int slct, recv, max_fd = fd;
+		int fd_flags = O_NONBLOCK;
+		struct timeval timeout;
+		float total_bytes;
+		fd_set w;
+
+		fcntl(fd, fd_flags);
+		total_bytes = (float)iov_count * (float)iov_length * (float)cnt;
+		err = clock_gettime(CLOCK_MONOTONIC, &s->start);
+		if (err < 0)
+			perror("recv start time: ");
+		while (s->bytes_recvd < total_bytes) {
+			timeout.tv_sec = 1;
+			timeout.tv_usec = 0;
+
+			/* FD sets */
+			FD_ZERO(&w);
+			FD_SET(fd, &w);
+
+			slct = select(max_fd + 1, &w, NULL, NULL, &timeout);
+			if (slct == -1) {
+				perror("select()");
+				clock_gettime(CLOCK_MONOTONIC, &s->end);
+				goto out_errno;
+			} else if (!slct) {
+				if (opt->verbose)
+					fprintf(stderr, "unexpected timeout\n");
+				errno = -EIO;
+				clock_gettime(CLOCK_MONOTONIC, &s->end);
+				goto out_errno;
+			}
+
+			recv = recvmsg(fd, &msg, flags);
+			if (recv < 0) {
+				if (errno != EWOULDBLOCK) {
+					clock_gettime(CLOCK_MONOTONIC, &s->end);
+					perror("recv failed()\n");
+					goto out_errno;
+				}
+			}
+
+			s->bytes_recvd += recv;
+
+			if (data_test) {
+				int j;
+
+				for (i = 0; i < msg.msg_iovlen; i++) {
+					unsigned char *d = iov[i].iov_base;
+
+					for (j = 0;
+					     j < iov[i].iov_len && recv; j++) {
+						if (d[j] != k++) {
+							errno = -EIO;
+							fprintf(stderr,
+								"detected data corruption @iov[%i]:%i %02x != %02x, %02x ?= %02x\n",
+								i, j, d[j], k - 1, d[j+1], k + 1);
+							goto out_errno;
+						}
+						recv--;
+					}
+				}
+			}
+		}
+		clock_gettime(CLOCK_MONOTONIC, &s->end);
+	}
+
+	for (i = 0; i < iov_count; i++)
+		free(iov[i].iov_base);
+	free(iov);
+	return 0;
+out_errno:
+	for (i = 0; i < iov_count; i++)
+		free(iov[i].iov_base);
+	free(iov);
+	return errno;
+}
+
+static float giga = 1000000000;
+
+static inline float sentBps(struct msg_stats s)
+{
+	return s.bytes_sent / (s.end.tv_sec - s.start.tv_sec);
+}
+
+static inline float recvdBps(struct msg_stats s)
+{
+	return s.bytes_recvd / (s.end.tv_sec - s.start.tv_sec);
+}
+
+static int sendmsg_test(struct sockmap_options *opt)
+{
+	float sent_Bps = 0, recvd_Bps = 0;
+	int rx_fd, txpid, rxpid, err = 0;
+	struct msg_stats s = {0};
+	int iov_count = opt->iov_count;
+	int iov_buf = opt->iov_length;
+	int cnt = opt->rate;
+	int status;
+
+	errno = 0;
+
+	if (opt->base)
+		rx_fd = p1;
+	else
+		rx_fd = p2;
+
+	rxpid = fork();
+	if (rxpid == 0) {
+		if (opt->drop_expected)
+			exit(1);
+
+		if (opt->sendpage)
+			iov_count = 1;
+		err = msg_loop(rx_fd, iov_count, iov_buf,
+			       cnt, &s, false, opt);
+		if (err && opt->verbose)
+			fprintf(stderr,
+				"msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n",
+				iov_count, iov_buf, cnt, err);
+		shutdown(p2, SHUT_RDWR);
+		shutdown(p1, SHUT_RDWR);
+		if (s.end.tv_sec - s.start.tv_sec) {
+			sent_Bps = sentBps(s);
+			recvd_Bps = recvdBps(s);
+		}
+		if (opt->verbose)
+			fprintf(stdout,
+				"rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s\n",
+				s.bytes_sent, sent_Bps, sent_Bps/giga,
+				s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
+		exit(1);
+	} else if (rxpid == -1) {
+		perror("msg_loop_rx: ");
+		return errno;
+	}
+
+	txpid = fork();
+	if (txpid == 0) {
+		if (opt->sendpage)
+			err = msg_loop_sendpage(c1, iov_buf, cnt, &s, opt);
+		else
+			err = msg_loop(c1, iov_count, iov_buf,
+				       cnt, &s, true, opt);
+
+		if (err)
+			fprintf(stderr,
+				"msg_loop_tx: iov_count %i iov_buf %i cnt %i err %i\n",
+				iov_count, iov_buf, cnt, err);
+		shutdown(c1, SHUT_RDWR);
+		if (s.end.tv_sec - s.start.tv_sec) {
+			sent_Bps = sentBps(s);
+			recvd_Bps = recvdBps(s);
+		}
+		if (opt->verbose)
+			fprintf(stdout,
+				"tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB %fB/s %fGB/s\n",
+				s.bytes_sent, sent_Bps, sent_Bps/giga,
+				s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
+		exit(1);
+	} else if (txpid == -1) {
+		perror("msg_loop_tx: ");
+		return errno;
+	}
+
+	assert(waitpid(rxpid, &status, 0) == rxpid);
+	assert(waitpid(txpid, &status, 0) == txpid);
+	return err;
+}
+
+static int forever_ping_pong(int rate, struct sockmap_options *opt)
+{
+	struct timeval timeout;
+	char buf[1024] = {0};
+	int sc;
+
+	timeout.tv_sec = 10;
+	timeout.tv_usec = 0;
+
+	/* Ping/Pong data from client to server */
+	sc = send(c1, buf, sizeof(buf), 0);
+	if (sc < 0) {
+		perror("send failed()\n");
+		return sc;
+	}
+
+	do {
+		int s, rc, i, max_fd = p2;
+		fd_set w;
+
+		/* FD sets */
+		FD_ZERO(&w);
+		FD_SET(c1, &w);
+		FD_SET(c2, &w);
+		FD_SET(p1, &w);
+		FD_SET(p2, &w);
+
+		s = select(max_fd + 1, &w, NULL, NULL, &timeout);
+		if (s == -1) {
+			perror("select()");
+			break;
+		} else if (!s) {
+			fprintf(stderr, "unexpected timeout\n");
+			break;
+		}
+
+		for (i = 0; i <= max_fd && s > 0; ++i) {
+			if (!FD_ISSET(i, &w))
+				continue;
+
+			s--;
+
+			rc = recv(i, buf, sizeof(buf), 0);
+			if (rc < 0) {
+				if (errno != EWOULDBLOCK) {
+					perror("recv failed()\n");
+					return rc;
+				}
+			}
+
+			if (rc == 0) {
+				close(i);
+				break;
+			}
+
+			sc = send(i, buf, rc, 0);
+			if (sc < 0) {
+				perror("send failed()\n");
+				return sc;
+			}
+		}
+
+		if (rate)
+			sleep(rate);
+
+		if (opt->verbose) {
+			printf(".");
+			fflush(stdout);
+
+		}
+	} while (running);
+
+	return 0;
+}
+
+enum {
+	PING_PONG,
+	SENDMSG,
+	BASE,
+	BASE_SENDPAGE,
+	SENDPAGE,
+};
+
+static int run_options(struct sockmap_options *options, int cg_fd,  int test)
+{
+	int i, key, next_key, err, tx_prog_fd = -1, zero = 0;
+
+	/* If base test skip BPF setup */
+	if (test == BASE || test == BASE_SENDPAGE)
+		goto run;
+
+	/* Attach programs to sockmap */
+	err = bpf_prog_attach(prog_fd[0], map_fd[0],
+				BPF_SK_SKB_STREAM_PARSER, 0);
+	if (err) {
+		fprintf(stderr,
+			"ERROR: bpf_prog_attach (sockmap %i->%i): %d (%s)\n",
+			prog_fd[0], map_fd[0], err, strerror(errno));
+		return err;
+	}
+
+	err = bpf_prog_attach(prog_fd[1], map_fd[0],
+				BPF_SK_SKB_STREAM_VERDICT, 0);
+	if (err) {
+		fprintf(stderr, "ERROR: bpf_prog_attach (sockmap): %d (%s)\n",
+			err, strerror(errno));
+		return err;
+	}
+
+	/* Attach to cgroups */
+	err = bpf_prog_attach(prog_fd[2], cg_fd, BPF_CGROUP_SOCK_OPS, 0);
+	if (err) {
+		fprintf(stderr, "ERROR: bpf_prog_attach (groups): %d (%s)\n",
+			err, strerror(errno));
+		return err;
+	}
+
+run:
+	err = sockmap_init_sockets(options->verbose);
+	if (err) {
+		fprintf(stderr, "ERROR: test socket failed: %d\n", err);
+		goto out;
+	}
+
+	/* Attach txmsg program to sockmap */
+	if (txmsg_pass)
+		tx_prog_fd = prog_fd[3];
+	else if (txmsg_noisy)
+		tx_prog_fd = prog_fd[4];
+	else if (txmsg_redir)
+		tx_prog_fd = prog_fd[5];
+	else if (txmsg_redir_noisy)
+		tx_prog_fd = prog_fd[6];
+	else if (txmsg_drop)
+		tx_prog_fd = prog_fd[9];
+	/* apply and cork must be last */
+	else if (txmsg_apply)
+		tx_prog_fd = prog_fd[7];
+	else if (txmsg_cork)
+		tx_prog_fd = prog_fd[8];
+	else
+		tx_prog_fd = 0;
+
+	if (tx_prog_fd) {
+		int redir_fd, i = 0;
+
+		err = bpf_prog_attach(tx_prog_fd,
+				      map_fd[1], BPF_SK_MSG_VERDICT, 0);
+		if (err) {
+			fprintf(stderr,
+				"ERROR: bpf_prog_attach (txmsg): %d (%s)\n",
+				err, strerror(errno));
+			goto out;
+		}
+
+		err = bpf_map_update_elem(map_fd[1], &i, &c1, BPF_ANY);
+		if (err) {
+			fprintf(stderr,
+				"ERROR: bpf_map_update_elem (txmsg):  %d (%s\n",
+				err, strerror(errno));
+			goto out;
+		}
+
+		if (txmsg_redir || txmsg_redir_noisy)
+			redir_fd = c2;
+		else
+			redir_fd = c1;
+
+		err = bpf_map_update_elem(map_fd[2], &i, &redir_fd, BPF_ANY);
+		if (err) {
+			fprintf(stderr,
+				"ERROR: bpf_map_update_elem (txmsg):  %d (%s\n",
+				err, strerror(errno));
+			goto out;
+		}
+
+		if (txmsg_apply) {
+			err = bpf_map_update_elem(map_fd[3],
+						  &i, &txmsg_apply, BPF_ANY);
+			if (err) {
+				fprintf(stderr,
+					"ERROR: bpf_map_update_elem (apply_bytes):  %d (%s\n",
+					err, strerror(errno));
+				goto out;
+			}
+		}
+
+		if (txmsg_cork) {
+			err = bpf_map_update_elem(map_fd[4],
+						  &i, &txmsg_cork, BPF_ANY);
+			if (err) {
+				fprintf(stderr,
+					"ERROR: bpf_map_update_elem (cork_bytes):  %d (%s\n",
+					err, strerror(errno));
+				goto out;
+			}
+		}
+
+		if (txmsg_start) {
+			err = bpf_map_update_elem(map_fd[5],
+						  &i, &txmsg_start, BPF_ANY);
+			if (err) {
+				fprintf(stderr,
+					"ERROR: bpf_map_update_elem (txmsg_start):  %d (%s)\n",
+					err, strerror(errno));
+				goto out;
+			}
+		}
+
+		if (txmsg_end) {
+			i = 1;
+			err = bpf_map_update_elem(map_fd[5],
+						  &i, &txmsg_end, BPF_ANY);
+			if (err) {
+				fprintf(stderr,
+					"ERROR: bpf_map_update_elem (txmsg_end):  %d (%s)\n",
+					err, strerror(errno));
+				goto out;
+			}
+		}
+
+		if (txmsg_ingress) {
+			int in = BPF_F_INGRESS;
+
+			i = 0;
+			err = bpf_map_update_elem(map_fd[6], &i, &in, BPF_ANY);
+			if (err) {
+				fprintf(stderr,
+					"ERROR: bpf_map_update_elem (txmsg_ingress): %d (%s)\n",
+					err, strerror(errno));
+			}
+			i = 1;
+			err = bpf_map_update_elem(map_fd[1], &i, &p1, BPF_ANY);
+			if (err) {
+				fprintf(stderr,
+					"ERROR: bpf_map_update_elem (p1 txmsg): %d (%s)\n",
+					err, strerror(errno));
+			}
+			err = bpf_map_update_elem(map_fd[2], &i, &p1, BPF_ANY);
+			if (err) {
+				fprintf(stderr,
+					"ERROR: bpf_map_update_elem (p1 redir): %d (%s)\n",
+					err, strerror(errno));
+			}
+
+			i = 2;
+			err = bpf_map_update_elem(map_fd[2], &i, &p2, BPF_ANY);
+			if (err) {
+				fprintf(stderr,
+					"ERROR: bpf_map_update_elem (p2 txmsg): %d (%s)\n",
+					err, strerror(errno));
+			}
+		}
+
+		if (txmsg_skb) {
+			int skb_fd = (test == SENDMSG || test == SENDPAGE) ?
+					p2 : p1;
+			int ingress = BPF_F_INGRESS;
+
+			i = 0;
+			err = bpf_map_update_elem(map_fd[7],
+						  &i, &ingress, BPF_ANY);
+			if (err) {
+				fprintf(stderr,
+					"ERROR: bpf_map_update_elem (txmsg_ingress): %d (%s)\n",
+					err, strerror(errno));
+			}
+
+			i = 3;
+			err = bpf_map_update_elem(map_fd[0],
+						  &i, &skb_fd, BPF_ANY);
+			if (err) {
+				fprintf(stderr,
+					"ERROR: bpf_map_update_elem (c1 sockmap): %d (%s)\n",
+					err, strerror(errno));
+			}
+		}
+	}
+
+	if (txmsg_drop)
+		options->drop_expected = true;
+
+	if (test == PING_PONG)
+		err = forever_ping_pong(options->rate, options);
+	else if (test == SENDMSG) {
+		options->base = false;
+		options->sendpage = false;
+		err = sendmsg_test(options);
+	} else if (test == SENDPAGE) {
+		options->base = false;
+		options->sendpage = true;
+		err = sendmsg_test(options);
+	} else if (test == BASE) {
+		options->base = true;
+		options->sendpage = false;
+		err = sendmsg_test(options);
+	} else if (test == BASE_SENDPAGE) {
+		options->base = true;
+		options->sendpage = true;
+		err = sendmsg_test(options);
+	} else
+		fprintf(stderr, "unknown test\n");
+out:
+	/* Detatch and zero all the maps */
+	bpf_prog_detach2(prog_fd[2], cg_fd, BPF_CGROUP_SOCK_OPS);
+	bpf_prog_detach2(prog_fd[0], map_fd[0], BPF_SK_SKB_STREAM_PARSER);
+	bpf_prog_detach2(prog_fd[1], map_fd[0], BPF_SK_SKB_STREAM_VERDICT);
+	if (tx_prog_fd >= 0)
+		bpf_prog_detach2(tx_prog_fd, map_fd[1], BPF_SK_MSG_VERDICT);
+
+	for (i = 0; i < 8; i++) {
+		key = next_key = 0;
+		bpf_map_update_elem(map_fd[i], &key, &zero, BPF_ANY);
+		while (bpf_map_get_next_key(map_fd[i], &key, &next_key) == 0) {
+			bpf_map_update_elem(map_fd[i], &key, &zero, BPF_ANY);
+			key = next_key;
+		}
+	}
+
+	close(s1);
+	close(s2);
+	close(p1);
+	close(p2);
+	close(c1);
+	close(c2);
+	return err;
+}
+
+static char *test_to_str(int test)
+{
+	switch (test) {
+	case SENDMSG:
+		return "sendmsg";
+	case SENDPAGE:
+		return "sendpage";
+	}
+	return "unknown";
+}
+
+#define OPTSTRING 60
+static void test_options(char *options)
+{
+	memset(options, 0, OPTSTRING);
+
+	if (txmsg_pass)
+		strncat(options, "pass,", OPTSTRING);
+	if (txmsg_noisy)
+		strncat(options, "pass_noisy,", OPTSTRING);
+	if (txmsg_redir)
+		strncat(options, "redir,", OPTSTRING);
+	if (txmsg_redir_noisy)
+		strncat(options, "redir_noisy,", OPTSTRING);
+	if (txmsg_drop)
+		strncat(options, "drop,", OPTSTRING);
+	if (txmsg_apply)
+		strncat(options, "apply,", OPTSTRING);
+	if (txmsg_cork)
+		strncat(options, "cork,", OPTSTRING);
+	if (txmsg_start)
+		strncat(options, "start,", OPTSTRING);
+	if (txmsg_end)
+		strncat(options, "end,", OPTSTRING);
+	if (txmsg_ingress)
+		strncat(options, "ingress,", OPTSTRING);
+	if (txmsg_skb)
+		strncat(options, "skb,", OPTSTRING);
+}
+
+static int __test_exec(int cgrp, int test, struct sockmap_options *opt)
+{
+	char *options = calloc(60, sizeof(char));
+	int err;
+
+	if (test == SENDPAGE)
+		opt->sendpage = true;
+	else
+		opt->sendpage = false;
+
+	if (txmsg_drop)
+		opt->drop_expected = true;
+	else
+		opt->drop_expected = false;
+
+	test_options(options);
+
+	fprintf(stdout,
+		"[TEST %i]: (%i, %i, %i, %s, %s): ",
+		test_cnt, opt->rate, opt->iov_count, opt->iov_length,
+		test_to_str(test), options);
+	fflush(stdout);
+	err = run_options(opt, cgrp, test);
+	fprintf(stdout, "%s\n", !err ? "PASS" : "FAILED");
+	test_cnt++;
+	!err ? passed++ : failed++;
+	free(options);
+	return err;
+}
+
+static int test_exec(int cgrp, struct sockmap_options *opt)
+{
+	int err = __test_exec(cgrp, SENDMSG, opt);
+
+	sched_yield();
+	if (err)
+		goto out;
+
+	err = __test_exec(cgrp, SENDPAGE, opt);
+	sched_yield();
+out:
+	return err;
+}
+
+static int test_loop(int cgrp)
+{
+	struct sockmap_options opt;
+
+	int err, i, l, r;
+
+	opt.verbose = 0;
+	opt.base = false;
+	opt.sendpage = false;
+	opt.data_test = false;
+	opt.drop_expected = false;
+	opt.iov_count = 0;
+	opt.iov_length = 0;
+	opt.rate = 0;
+
+	for (r = 1; r < 100; r += 33) {
+		for (i = 1; i < 100; i += 33) {
+			for (l = 1; l < 100; l += 33) {
+				opt.rate = r;
+				opt.iov_count = i;
+				opt.iov_length = l;
+				err = test_exec(cgrp, &opt);
+				if (err)
+					goto out;
+			}
+		}
+	}
+
+out:
+	return err;
+}
+
+static int test_txmsg(int cgrp)
+{
+	int err;
+
+	txmsg_pass = txmsg_noisy = txmsg_redir_noisy = txmsg_drop = 0;
+	txmsg_apply = txmsg_cork = 0;
+	txmsg_ingress = txmsg_skb = 0;
+
+	txmsg_pass = 1;
+	err = test_loop(cgrp);
+	txmsg_pass = 0;
+	if (err)
+		goto out;
+
+	txmsg_redir = 1;
+	err = test_loop(cgrp);
+	txmsg_redir = 0;
+	if (err)
+		goto out;
+
+	txmsg_drop = 1;
+	err = test_loop(cgrp);
+	txmsg_drop = 0;
+	if (err)
+		goto out;
+
+	txmsg_redir = 1;
+	txmsg_ingress = 1;
+	err = test_loop(cgrp);
+	txmsg_redir = 0;
+	txmsg_ingress = 0;
+	if (err)
+		goto out;
+out:
+	txmsg_pass = 0;
+	txmsg_redir = 0;
+	txmsg_drop = 0;
+	return err;
+}
+
+static int test_send(struct sockmap_options *opt, int cgrp)
+{
+	int err;
+
+	opt->iov_length = 1;
+	opt->iov_count = 1;
+	opt->rate = 1;
+	err = test_exec(cgrp, opt);
+	if (err)
+		goto out;
+
+	opt->iov_length = 1;
+	opt->iov_count = 1024;
+	opt->rate = 1;
+	err = test_exec(cgrp, opt);
+	if (err)
+		goto out;
+
+	opt->iov_length = 1024;
+	opt->iov_count = 1;
+	opt->rate = 1;
+	err = test_exec(cgrp, opt);
+	if (err)
+		goto out;
+
+	opt->iov_length = 1;
+	opt->iov_count = 1;
+	opt->rate = 1024;
+	err = test_exec(cgrp, opt);
+	if (err)
+		goto out;
+
+	opt->iov_length = 256;
+	opt->iov_count = 1024;
+	opt->rate = 10;
+	err = test_exec(cgrp, opt);
+	if (err)
+		goto out;
+
+	opt->rate = 100;
+	opt->iov_count = 1;
+	opt->iov_length = 5;
+	err = test_exec(cgrp, opt);
+	if (err)
+		goto out;
+out:
+	return err;
+}
+
+static int test_mixed(int cgrp)
+{
+	struct sockmap_options opt = {0};
+	int err;
+
+	txmsg_pass = txmsg_noisy = txmsg_redir_noisy = txmsg_drop = 0;
+	txmsg_apply = txmsg_cork = 0;
+	txmsg_start = txmsg_end = 0;
+	/* Test small and large iov_count values with pass/redir/apply/cork */
+	txmsg_pass = 1;
+	txmsg_redir = 0;
+	txmsg_apply = 1;
+	txmsg_cork = 0;
+	err = test_send(&opt, cgrp);
+	if (err)
+		goto out;
+
+	txmsg_pass = 1;
+	txmsg_redir = 0;
+	txmsg_apply = 0;
+	txmsg_cork = 1;
+	err = test_send(&opt, cgrp);
+	if (err)
+		goto out;
+
+	txmsg_pass = 1;
+	txmsg_redir = 0;
+	txmsg_apply = 1;
+	txmsg_cork = 1;
+	err = test_send(&opt, cgrp);
+	if (err)
+		goto out;
+
+	txmsg_pass = 1;
+	txmsg_redir = 0;
+	txmsg_apply = 1024;
+	txmsg_cork = 0;
+	err = test_send(&opt, cgrp);
+	if (err)
+		goto out;
+
+	txmsg_pass = 1;
+	txmsg_redir = 0;
+	txmsg_apply = 0;
+	txmsg_cork = 1024;
+	err = test_send(&opt, cgrp);
+	if (err)
+		goto out;
+
+	txmsg_pass = 1;
+	txmsg_redir = 0;
+	txmsg_apply = 1024;
+	txmsg_cork = 1024;
+	err = test_send(&opt, cgrp);
+	if (err)
+		goto out;
+
+	txmsg_pass = 1;
+	txmsg_redir = 0;
+	txmsg_cork = 4096;
+	txmsg_apply = 4096;
+	err = test_send(&opt, cgrp);
+	if (err)
+		goto out;
+
+	txmsg_pass = 0;
+	txmsg_redir = 1;
+	txmsg_apply = 1;
+	txmsg_cork = 0;
+	err = test_send(&opt, cgrp);
+	if (err)
+		goto out;
+
+	txmsg_pass = 0;
+	txmsg_redir = 1;
+	txmsg_apply = 0;
+	txmsg_cork = 1;
+	err = test_send(&opt, cgrp);
+	if (err)
+		goto out;
+
+	txmsg_pass = 0;
+	txmsg_redir = 1;
+	txmsg_apply = 1024;
+	txmsg_cork = 0;
+	err = test_send(&opt, cgrp);
+	if (err)
+		goto out;
+
+	txmsg_pass = 0;
+	txmsg_redir = 1;
+	txmsg_apply = 0;
+	txmsg_cork = 1024;
+	err = test_send(&opt, cgrp);
+	if (err)
+		goto out;
+
+	txmsg_pass = 0;
+	txmsg_redir = 1;
+	txmsg_apply = 1024;
+	txmsg_cork = 1024;
+	err = test_send(&opt, cgrp);
+	if (err)
+		goto out;
+
+	txmsg_pass = 0;
+	txmsg_redir = 1;
+	txmsg_cork = 4096;
+	txmsg_apply = 4096;
+	err = test_send(&opt, cgrp);
+	if (err)
+		goto out;
+out:
+	return err;
+}
+
+static int test_start_end(int cgrp)
+{
+	struct sockmap_options opt = {0};
+	int err, i;
+
+	/* Test basic start/end with lots of iov_count and iov_lengths */
+	txmsg_start = 1;
+	txmsg_end = 2;
+	err = test_txmsg(cgrp);
+	if (err)
+		goto out;
+
+	/* Test start/end with cork */
+	opt.rate = 16;
+	opt.iov_count = 1;
+	opt.iov_length = 100;
+	txmsg_cork = 1600;
+
+	for (i = 99; i <= 1600; i += 100) {
+		txmsg_start = 0;
+		txmsg_end = i;
+		err = test_exec(cgrp, &opt);
+		if (err)
+			goto out;
+	}
+
+	/* Test start/end with cork but pull data in middle */
+	for (i = 199; i <= 1600; i += 100) {
+		txmsg_start = 100;
+		txmsg_end = i;
+		err = test_exec(cgrp, &opt);
+		if (err)
+			goto out;
+	}
+
+	/* Test start/end with cork pulling last sg entry */
+	txmsg_start = 1500;
+	txmsg_end = 1600;
+	err = test_exec(cgrp, &opt);
+	if (err)
+		goto out;
+
+	/* Test start/end pull of single byte in last page */
+	txmsg_start = 1111;
+	txmsg_end = 1112;
+	err = test_exec(cgrp, &opt);
+	if (err)
+		goto out;
+
+	/* Test start/end with end < start */
+	txmsg_start = 1111;
+	txmsg_end = 0;
+	err = test_exec(cgrp, &opt);
+	if (err)
+		goto out;
+
+	/* Test start/end with end > data */
+	txmsg_start = 0;
+	txmsg_end = 1601;
+	err = test_exec(cgrp, &opt);
+	if (err)
+		goto out;
+
+	/* Test start/end with start > data */
+	txmsg_start = 1601;
+	txmsg_end = 1600;
+	err = test_exec(cgrp, &opt);
+
+out:
+	txmsg_start = 0;
+	txmsg_end = 0;
+	return err;
+}
+
+char *map_names[] = {
+	"sock_map",
+	"sock_map_txmsg",
+	"sock_map_redir",
+	"sock_apply_bytes",
+	"sock_cork_bytes",
+	"sock_pull_bytes",
+	"sock_redir_flags",
+	"sock_skb_opts",
+};
+
+int prog_attach_type[] = {
+	BPF_SK_SKB_STREAM_PARSER,
+	BPF_SK_SKB_STREAM_VERDICT,
+	BPF_CGROUP_SOCK_OPS,
+	BPF_SK_MSG_VERDICT,
+	BPF_SK_MSG_VERDICT,
+	BPF_SK_MSG_VERDICT,
+	BPF_SK_MSG_VERDICT,
+	BPF_SK_MSG_VERDICT,
+	BPF_SK_MSG_VERDICT,
+	BPF_SK_MSG_VERDICT,
+};
+
+int prog_type[] = {
+	BPF_PROG_TYPE_SK_SKB,
+	BPF_PROG_TYPE_SK_SKB,
+	BPF_PROG_TYPE_SOCK_OPS,
+	BPF_PROG_TYPE_SK_MSG,
+	BPF_PROG_TYPE_SK_MSG,
+	BPF_PROG_TYPE_SK_MSG,
+	BPF_PROG_TYPE_SK_MSG,
+	BPF_PROG_TYPE_SK_MSG,
+	BPF_PROG_TYPE_SK_MSG,
+	BPF_PROG_TYPE_SK_MSG,
+};
+
+static int populate_progs(void)
+{
+	char *bpf_file = BPF_FILENAME;
+	struct bpf_program *prog;
+	struct bpf_object *obj;
+	int i = 0;
+	long err;
+
+	obj = bpf_object__open(bpf_file);
+	err = libbpf_get_error(obj);
+	if (err) {
+		char err_buf[256];
+
+		libbpf_strerror(err, err_buf, sizeof(err_buf));
+		printf("Unable to load eBPF objects in file '%s' : %s\n",
+		       bpf_file, err_buf);
+		return -1;
+	}
+
+	bpf_object__for_each_program(prog, obj) {
+		bpf_program__set_type(prog, prog_type[i]);
+		bpf_program__set_expected_attach_type(prog,
+						      prog_attach_type[i]);
+		i++;
+	}
+
+	i = bpf_object__load(obj);
+	i = 0;
+	bpf_object__for_each_program(prog, obj) {
+		prog_fd[i] = bpf_program__fd(prog);
+		i++;
+	}
+
+	for (i = 0; i < sizeof(map_fd)/sizeof(int); i++) {
+		maps[i] = bpf_object__find_map_by_name(obj, map_names[i]);
+		map_fd[i] = bpf_map__fd(maps[i]);
+		if (map_fd[i] < 0) {
+			fprintf(stderr, "load_bpf_file: (%i) %s\n",
+				map_fd[i], strerror(errno));
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+static int test_suite(void)
+{
+	int cg_fd, err;
+
+	err = populate_progs();
+	if (err < 0) {
+		fprintf(stderr, "ERROR: (%i) load bpf failed\n", err);
+		return err;
+	}
+
+	if (setup_cgroup_environment()) {
+		fprintf(stderr, "ERROR: cgroup env failed\n");
+		return -EINVAL;
+	}
+
+	cg_fd = create_and_get_cgroup(CG_PATH);
+	if (cg_fd < 0) {
+		fprintf(stderr,
+			"ERROR: (%i) open cg path failed: %s\n",
+			cg_fd, optarg);
+		return cg_fd;
+	}
+
+	/* Tests basic commands and APIs with range of iov values */
+	txmsg_start = txmsg_end = 0;
+	err = test_txmsg(cg_fd);
+	if (err)
+		goto out;
+
+	/* Tests interesting combinations of APIs used together */
+	err = test_mixed(cg_fd);
+	if (err)
+		goto out;
+
+	/* Tests pull_data API using start/end API */
+	err = test_start_end(cg_fd);
+	if (err)
+		goto out;
+
+out:
+	printf("Summary: %i PASSED %i FAILED\n", passed, failed);
+	close(cg_fd);
+	return err;
+}
+
+int main(int argc, char **argv)
+{
+	struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
+	int iov_count = 1, length = 1024, rate = 1;
+	struct sockmap_options options = {0};
+	int opt, longindex, err, cg_fd = 0;
+	char *bpf_file = BPF_FILENAME;
+	int test = PING_PONG;
+
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		perror("setrlimit(RLIMIT_MEMLOCK)");
+		return 1;
+	}
+
+	if (argc < 2)
+		return test_suite();
+
+	while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:",
+				  long_options, &longindex)) != -1) {
+		switch (opt) {
+		case 's':
+			txmsg_start = atoi(optarg);
+			break;
+		case 'e':
+			txmsg_end = atoi(optarg);
+			break;
+		case 'a':
+			txmsg_apply = atoi(optarg);
+			break;
+		case 'k':
+			txmsg_cork = atoi(optarg);
+			break;
+		case 'c':
+			cg_fd = open(optarg, O_DIRECTORY, O_RDONLY);
+			if (cg_fd < 0) {
+				fprintf(stderr,
+					"ERROR: (%i) open cg path failed: %s\n",
+					cg_fd, optarg);
+				return cg_fd;
+			}
+			break;
+		case 'r':
+			rate = atoi(optarg);
+			break;
+		case 'v':
+			options.verbose = 1;
+			break;
+		case 'i':
+			iov_count = atoi(optarg);
+			break;
+		case 'l':
+			length = atoi(optarg);
+			break;
+		case 'd':
+			options.data_test = true;
+			break;
+		case 't':
+			if (strcmp(optarg, "ping") == 0) {
+				test = PING_PONG;
+			} else if (strcmp(optarg, "sendmsg") == 0) {
+				test = SENDMSG;
+			} else if (strcmp(optarg, "base") == 0) {
+				test = BASE;
+			} else if (strcmp(optarg, "base_sendpage") == 0) {
+				test = BASE_SENDPAGE;
+			} else if (strcmp(optarg, "sendpage") == 0) {
+				test = SENDPAGE;
+			} else {
+				usage(argv);
+				return -1;
+			}
+			break;
+		case 0:
+			break;
+		case 'h':
+		default:
+			usage(argv);
+			return -1;
+		}
+	}
+
+	if (!cg_fd) {
+		fprintf(stderr, "%s requires cgroup option: --cgroup <path>\n",
+			argv[0]);
+		return -1;
+	}
+
+	err = populate_progs();
+	if (err) {
+		fprintf(stderr, "populate program: (%s) %s\n",
+			bpf_file, strerror(errno));
+		return 1;
+	}
+	running = 1;
+
+	/* catch SIGINT */
+	signal(SIGINT, running_handler);
+
+	options.iov_count = iov_count;
+	options.iov_length = length;
+	options.rate = rate;
+
+	err = run_options(&options, cg_fd, test);
+	close(cg_fd);
+	return err;
+}
+
+void running_handler(int a)
+{
+	running = 0;
+}
diff --git a/tools/testing/selftests/bpf/test_sockmap_kern.c b/tools/testing/selftests/bpf/test_sockmap_kern.c
new file mode 100644
index 000000000000..33de97e2b6b6
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_sockmap_kern.c
@@ -0,0 +1,340 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2017-2018 Covalent IO, Inc. http://covalent.io
+#include <stddef.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/pkt_cls.h>
+#include <sys/socket.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+/* Sockmap sample program connects a client and a backend together
+ * using cgroups.
+ *
+ *    client:X <---> frontend:80 client:X <---> backend:80
+ *
+ * For simplicity we hard code values here and bind 1:1. The hard
+ * coded values are part of the setup in sockmap.sh script that
+ * is associated with this BPF program.
+ *
+ * The bpf_printk is verbose and prints information as connections
+ * are established and verdicts are decided.
+ */
+
+#define bpf_printk(fmt, ...)					\
+({								\
+	       char ____fmt[] = fmt;				\
+	       bpf_trace_printk(____fmt, sizeof(____fmt),	\
+				##__VA_ARGS__);			\
+})
+
+struct bpf_map_def SEC("maps") sock_map = {
+	.type = BPF_MAP_TYPE_SOCKMAP,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 20,
+};
+
+struct bpf_map_def SEC("maps") sock_map_txmsg = {
+	.type = BPF_MAP_TYPE_SOCKMAP,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 20,
+};
+
+struct bpf_map_def SEC("maps") sock_map_redir = {
+	.type = BPF_MAP_TYPE_SOCKMAP,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 20,
+};
+
+struct bpf_map_def SEC("maps") sock_apply_bytes = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 1
+};
+
+struct bpf_map_def SEC("maps") sock_cork_bytes = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 1
+};
+
+struct bpf_map_def SEC("maps") sock_pull_bytes = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 2
+};
+
+struct bpf_map_def SEC("maps") sock_redir_flags = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 1
+};
+
+struct bpf_map_def SEC("maps") sock_skb_opts = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 1
+};
+
+SEC("sk_skb1")
+int bpf_prog1(struct __sk_buff *skb)
+{
+	return skb->len;
+}
+
+SEC("sk_skb2")
+int bpf_prog2(struct __sk_buff *skb)
+{
+	__u32 lport = skb->local_port;
+	__u32 rport = skb->remote_port;
+	int len, *f, ret, zero = 0;
+	__u64 flags = 0;
+
+	if (lport == 10000)
+		ret = 10;
+	else
+		ret = 1;
+
+	len = (__u32)skb->data_end - (__u32)skb->data;
+	f = bpf_map_lookup_elem(&sock_skb_opts, &zero);
+	if (f && *f) {
+		ret = 3;
+		flags = *f;
+	}
+
+	bpf_printk("sk_skb2: redirect(%iB) flags=%i\n",
+		   len, flags);
+	return bpf_sk_redirect_map(skb, &sock_map, ret, flags);
+}
+
+SEC("sockops")
+int bpf_sockmap(struct bpf_sock_ops *skops)
+{
+	__u32 lport, rport;
+	int op, err = 0, index, key, ret;
+
+
+	op = (int) skops->op;
+
+	switch (op) {
+	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+		lport = skops->local_port;
+		rport = skops->remote_port;
+
+		if (lport == 10000) {
+			ret = 1;
+			err = bpf_sock_map_update(skops, &sock_map, &ret,
+						  BPF_NOEXIST);
+			bpf_printk("passive(%i -> %i) map ctx update err: %d\n",
+				   lport, bpf_ntohl(rport), err);
+		}
+		break;
+	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+		lport = skops->local_port;
+		rport = skops->remote_port;
+
+		if (bpf_ntohl(rport) == 10001) {
+			ret = 10;
+			err = bpf_sock_map_update(skops, &sock_map, &ret,
+						  BPF_NOEXIST);
+			bpf_printk("active(%i -> %i) map ctx update err: %d\n",
+				   lport, bpf_ntohl(rport), err);
+		}
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+SEC("sk_msg1")
+int bpf_prog4(struct sk_msg_md *msg)
+{
+	int *bytes, zero = 0, one = 1;
+	int *start, *end;
+
+	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+	if (bytes)
+		bpf_msg_apply_bytes(msg, *bytes);
+	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+	if (bytes)
+		bpf_msg_cork_bytes(msg, *bytes);
+	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
+	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
+	if (start && end)
+		bpf_msg_pull_data(msg, *start, *end, 0);
+	return SK_PASS;
+}
+
+SEC("sk_msg2")
+int bpf_prog5(struct sk_msg_md *msg)
+{
+	int err1 = -1, err2 = -1, zero = 0, one = 1;
+	int *bytes, *start, *end, len1, len2;
+
+	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+	if (bytes)
+		err1 = bpf_msg_apply_bytes(msg, *bytes);
+	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+	if (bytes)
+		err2 = bpf_msg_cork_bytes(msg, *bytes);
+	len1 = (__u64)msg->data_end - (__u64)msg->data;
+	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
+	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
+	if (start && end) {
+		int err;
+
+		bpf_printk("sk_msg2: pull(%i:%i)\n",
+			   start ? *start : 0, end ? *end : 0);
+		err = bpf_msg_pull_data(msg, *start, *end, 0);
+		if (err)
+			bpf_printk("sk_msg2: pull_data err %i\n",
+				   err);
+		len2 = (__u64)msg->data_end - (__u64)msg->data;
+		bpf_printk("sk_msg2: length update %i->%i\n",
+			   len1, len2);
+	}
+	bpf_printk("sk_msg2: data length %i err1 %i err2 %i\n",
+		   len1, err1, err2);
+	return SK_PASS;
+}
+
+SEC("sk_msg3")
+int bpf_prog6(struct sk_msg_md *msg)
+{
+	int *bytes, zero = 0, one = 1, key = 0;
+	int *start, *end, *f;
+	__u64 flags = 0;
+
+	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+	if (bytes)
+		bpf_msg_apply_bytes(msg, *bytes);
+	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+	if (bytes)
+		bpf_msg_cork_bytes(msg, *bytes);
+	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
+	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
+	if (start && end)
+		bpf_msg_pull_data(msg, *start, *end, 0);
+	f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
+	if (f && *f) {
+		key = 2;
+		flags = *f;
+	}
+	return bpf_msg_redirect_map(msg, &sock_map_redir, key, flags);
+}
+
+SEC("sk_msg4")
+int bpf_prog7(struct sk_msg_md *msg)
+{
+	int err1 = 0, err2 = 0, zero = 0, one = 1, key = 0;
+	int *f, *bytes, *start, *end, len1, len2;
+	__u64 flags = 0;
+
+		int err;
+	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+	if (bytes)
+		err1 = bpf_msg_apply_bytes(msg, *bytes);
+	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+	if (bytes)
+		err2 = bpf_msg_cork_bytes(msg, *bytes);
+	len1 = (__u64)msg->data_end - (__u64)msg->data;
+	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
+	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
+	if (start && end) {
+
+		bpf_printk("sk_msg2: pull(%i:%i)\n",
+			   start ? *start : 0, end ? *end : 0);
+		err = bpf_msg_pull_data(msg, *start, *end, 0);
+		if (err)
+			bpf_printk("sk_msg2: pull_data err %i\n",
+				   err);
+		len2 = (__u64)msg->data_end - (__u64)msg->data;
+		bpf_printk("sk_msg2: length update %i->%i\n",
+			   len1, len2);
+	}
+	f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
+	if (f && *f) {
+		key = 2;
+		flags = *f;
+	}
+	bpf_printk("sk_msg3: redirect(%iB) flags=%i err=%i\n",
+		   len1, flags, err1 ? err1 : err2);
+	err = bpf_msg_redirect_map(msg, &sock_map_redir, key, flags);
+	bpf_printk("sk_msg3: err %i\n", err);
+	return err;
+}
+
+SEC("sk_msg5")
+int bpf_prog8(struct sk_msg_md *msg)
+{
+	void *data_end = (void *)(long) msg->data_end;
+	void *data = (void *)(long) msg->data;
+	int ret = 0, *bytes, zero = 0;
+
+	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+	if (bytes) {
+		ret = bpf_msg_apply_bytes(msg, *bytes);
+		if (ret)
+			return SK_DROP;
+	} else {
+		return SK_DROP;
+	}
+	return SK_PASS;
+}
+SEC("sk_msg6")
+int bpf_prog9(struct sk_msg_md *msg)
+{
+	void *data_end = (void *)(long) msg->data_end;
+	void *data = (void *)(long) msg->data;
+	int ret = 0, *bytes, zero = 0;
+
+	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+	if (bytes) {
+		if (((__u64)data_end - (__u64)data) >= *bytes)
+			return SK_PASS;
+		ret = bpf_msg_cork_bytes(msg, *bytes);
+		if (ret)
+			return SK_DROP;
+	}
+	return SK_PASS;
+}
+
+SEC("sk_msg7")
+int bpf_prog10(struct sk_msg_md *msg)
+{
+	int *bytes, zero = 0, one = 1;
+	int *start, *end;
+
+	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+	if (bytes)
+		bpf_msg_apply_bytes(msg, *bytes);
+	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+	if (bytes)
+		bpf_msg_cork_bytes(msg, *bytes);
+	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
+	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
+	if (start && end)
+		bpf_msg_pull_data(msg, *start, *end, 0);
+
+	return SK_DROP;
+}
+
+int _version SEC("version") = 1;
+char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3-59-g8ed1b


From 2e04eb1dd1caf4eaa0998e928f1fb896e35b01f2 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 23 Apr 2018 14:30:43 -0700
Subject: bpf: sockmap, remove samples program

The BPF sample sockmap is redundant now that equivelant tests exist
in the BPF selftests. Lets remove this sample and only keep the
selftest version that will be run as part of the selftest suite.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/sockmap/Makefile        |   75 ---
 samples/sockmap/sockmap_kern.c  |  341 ----------
 samples/sockmap/sockmap_test.sh |  488 --------------
 samples/sockmap/sockmap_user.c  | 1372 ---------------------------------------
 4 files changed, 2276 deletions(-)
 delete mode 100644 samples/sockmap/Makefile
 delete mode 100644 samples/sockmap/sockmap_kern.c
 delete mode 100755 samples/sockmap/sockmap_test.sh
 delete mode 100644 samples/sockmap/sockmap_user.c

diff --git a/samples/sockmap/Makefile b/samples/sockmap/Makefile
deleted file mode 100644
index 9bf2881bd11b..000000000000
--- a/samples/sockmap/Makefile
+++ /dev/null
@@ -1,75 +0,0 @@
-# List of programs to build
-hostprogs-y := sockmap
-
-# Libbpf dependencies
-LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
-
-HOSTCFLAGS += -I$(objtree)/usr/include
-HOSTCFLAGS += -I$(srctree)/tools/lib/
-HOSTCFLAGS += -I$(srctree)/tools/testing/selftests/bpf/
-HOSTCFLAGS += -I$(srctree)/tools/lib/ -I$(srctree)/tools/include
-HOSTCFLAGS += -I$(srctree)/tools/perf
-
-sockmap-objs := ../bpf/bpf_load.o $(LIBBPF) sockmap_user.o
-
-# Tell kbuild to always build the programs
-always := $(hostprogs-y)
-always += sockmap_kern.o
-
-HOSTLOADLIBES_sockmap += -lelf -lpthread
-
-# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
-#  make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
-LLC ?= llc
-CLANG ?= clang
-
-# Trick to allow make to be run from this directory
-all:
-	$(MAKE) -C ../../ $(CURDIR)/
-
-clean:
-	$(MAKE) -C ../../ M=$(CURDIR) clean
-	@rm -f *~
-
-$(obj)/syscall_nrs.s:	$(src)/syscall_nrs.c
-	$(call if_changed_dep,cc_s_c)
-
-$(obj)/syscall_nrs.h:	$(obj)/syscall_nrs.s FORCE
-	$(call filechk,offsets,__SYSCALL_NRS_H__)
-
-clean-files += syscall_nrs.h
-
-FORCE:
-
-
-# Verify LLVM compiler tools are available and bpf target is supported by llc
-.PHONY: verify_cmds verify_target_bpf $(CLANG) $(LLC)
-
-verify_cmds: $(CLANG) $(LLC)
-	@for TOOL in $^ ; do \
-		if ! (which -- "$${TOOL}" > /dev/null 2>&1); then \
-			echo "*** ERROR: Cannot find LLVM tool $${TOOL}" ;\
-			exit 1; \
-		else true; fi; \
-	done
-
-verify_target_bpf: verify_cmds
-	@if ! (${LLC} -march=bpf -mattr=help > /dev/null 2>&1); then \
-		echo "*** ERROR: LLVM (${LLC}) does not support 'bpf' target" ;\
-		echo "   NOTICE: LLVM version >= 3.7.1 required" ;\
-		exit 2; \
-	else true; fi
-
-$(src)/*.c: verify_target_bpf
-
-# asm/sysreg.h - inline assembly used by it is incompatible with llvm.
-# But, there is no easy way to fix it, so just exclude it since it is
-# useless for BPF samples.
-$(obj)/%.o: $(src)/%.c
-	$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) -I$(obj) \
-		-D__KERNEL__ -D__ASM_SYSREG_H -Wno-unused-value -Wno-pointer-sign \
-		-Wno-compare-distinct-pointer-types \
-		-Wno-gnu-variable-sized-type-not-at-end \
-		-Wno-address-of-packed-member -Wno-tautological-compare \
-		-Wno-unknown-warning-option \
-		-O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@
diff --git a/samples/sockmap/sockmap_kern.c b/samples/sockmap/sockmap_kern.c
deleted file mode 100644
index 9ff8bc5dc206..000000000000
--- a/samples/sockmap/sockmap_kern.c
+++ /dev/null
@@ -1,341 +0,0 @@
-/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- */
-#include <uapi/linux/bpf.h>
-#include <uapi/linux/if_ether.h>
-#include <uapi/linux/if_packet.h>
-#include <uapi/linux/ip.h>
-#include "../../tools/testing/selftests/bpf/bpf_helpers.h"
-#include "../../tools/testing/selftests/bpf/bpf_endian.h"
-
-/* Sockmap sample program connects a client and a backend together
- * using cgroups.
- *
- *    client:X <---> frontend:80 client:X <---> backend:80
- *
- * For simplicity we hard code values here and bind 1:1. The hard
- * coded values are part of the setup in sockmap.sh script that
- * is associated with this BPF program.
- *
- * The bpf_printk is verbose and prints information as connections
- * are established and verdicts are decided.
- */
-
-#define bpf_printk(fmt, ...)					\
-({								\
-	       char ____fmt[] = fmt;				\
-	       bpf_trace_printk(____fmt, sizeof(____fmt),	\
-				##__VA_ARGS__);			\
-})
-
-struct bpf_map_def SEC("maps") sock_map = {
-	.type = BPF_MAP_TYPE_SOCKMAP,
-	.key_size = sizeof(int),
-	.value_size = sizeof(int),
-	.max_entries = 20,
-};
-
-struct bpf_map_def SEC("maps") sock_map_txmsg = {
-	.type = BPF_MAP_TYPE_SOCKMAP,
-	.key_size = sizeof(int),
-	.value_size = sizeof(int),
-	.max_entries = 20,
-};
-
-struct bpf_map_def SEC("maps") sock_map_redir = {
-	.type = BPF_MAP_TYPE_SOCKMAP,
-	.key_size = sizeof(int),
-	.value_size = sizeof(int),
-	.max_entries = 20,
-};
-
-struct bpf_map_def SEC("maps") sock_apply_bytes = {
-	.type = BPF_MAP_TYPE_ARRAY,
-	.key_size = sizeof(int),
-	.value_size = sizeof(int),
-	.max_entries = 1
-};
-
-struct bpf_map_def SEC("maps") sock_cork_bytes = {
-	.type = BPF_MAP_TYPE_ARRAY,
-	.key_size = sizeof(int),
-	.value_size = sizeof(int),
-	.max_entries = 1
-};
-
-struct bpf_map_def SEC("maps") sock_pull_bytes = {
-	.type = BPF_MAP_TYPE_ARRAY,
-	.key_size = sizeof(int),
-	.value_size = sizeof(int),
-	.max_entries = 2
-};
-
-struct bpf_map_def SEC("maps") sock_redir_flags = {
-	.type = BPF_MAP_TYPE_ARRAY,
-	.key_size = sizeof(int),
-	.value_size = sizeof(int),
-	.max_entries = 1
-};
-
-struct bpf_map_def SEC("maps") sock_skb_opts = {
-	.type = BPF_MAP_TYPE_ARRAY,
-	.key_size = sizeof(int),
-	.value_size = sizeof(int),
-	.max_entries = 1
-};
-
-SEC("sk_skb1")
-int bpf_prog1(struct __sk_buff *skb)
-{
-	return skb->len;
-}
-
-SEC("sk_skb2")
-int bpf_prog2(struct __sk_buff *skb)
-{
-	__u32 lport = skb->local_port;
-	__u32 rport = skb->remote_port;
-	int len, *f, ret, zero = 0;
-	__u64 flags = 0;
-
-	if (lport == 10000)
-		ret = 10;
-	else
-		ret = 1;
-
-	len = (__u32)skb->data_end - (__u32)skb->data;
-	f = bpf_map_lookup_elem(&sock_skb_opts, &zero);
-	if (f && *f) {
-		ret = 3;
-		flags = *f;
-	}
-
-	bpf_printk("sk_skb2: redirect(%iB) flags=%i\n",
-		   len, flags);
-	return bpf_sk_redirect_map(skb, &sock_map, ret, flags);
-}
-
-SEC("sockops")
-int bpf_sockmap(struct bpf_sock_ops *skops)
-{
-	__u32 lport, rport;
-	int op, err = 0, index, key, ret;
-
-
-	op = (int) skops->op;
-
-	switch (op) {
-	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
-		lport = skops->local_port;
-		rport = skops->remote_port;
-
-		if (lport == 10000) {
-			ret = 1;
-			err = bpf_sock_map_update(skops, &sock_map, &ret,
-						  BPF_NOEXIST);
-			bpf_printk("passive(%i -> %i) map ctx update err: %d\n",
-				   lport, bpf_ntohl(rport), err);
-		}
-		break;
-	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
-		lport = skops->local_port;
-		rport = skops->remote_port;
-
-		if (bpf_ntohl(rport) == 10001) {
-			ret = 10;
-			err = bpf_sock_map_update(skops, &sock_map, &ret,
-						  BPF_NOEXIST);
-			bpf_printk("active(%i -> %i) map ctx update err: %d\n",
-				   lport, bpf_ntohl(rport), err);
-		}
-		break;
-	default:
-		break;
-	}
-
-	return 0;
-}
-
-SEC("sk_msg1")
-int bpf_prog4(struct sk_msg_md *msg)
-{
-	int *bytes, zero = 0, one = 1;
-	int *start, *end;
-
-	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
-	if (bytes)
-		bpf_msg_apply_bytes(msg, *bytes);
-	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
-	if (bytes)
-		bpf_msg_cork_bytes(msg, *bytes);
-	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
-	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
-	if (start && end)
-		bpf_msg_pull_data(msg, *start, *end, 0);
-	return SK_PASS;
-}
-
-SEC("sk_msg2")
-int bpf_prog5(struct sk_msg_md *msg)
-{
-	int err1 = -1, err2 = -1, zero = 0, one = 1;
-	int *bytes, *start, *end, len1, len2;
-
-	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
-	if (bytes)
-		err1 = bpf_msg_apply_bytes(msg, *bytes);
-	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
-	if (bytes)
-		err2 = bpf_msg_cork_bytes(msg, *bytes);
-	len1 = (__u64)msg->data_end - (__u64)msg->data;
-	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
-	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
-	if (start && end) {
-		int err;
-
-		bpf_printk("sk_msg2: pull(%i:%i)\n",
-			   start ? *start : 0, end ? *end : 0);
-		err = bpf_msg_pull_data(msg, *start, *end, 0);
-		if (err)
-			bpf_printk("sk_msg2: pull_data err %i\n",
-				   err);
-		len2 = (__u64)msg->data_end - (__u64)msg->data;
-		bpf_printk("sk_msg2: length update %i->%i\n",
-			   len1, len2);
-	}
-	bpf_printk("sk_msg2: data length %i err1 %i err2 %i\n",
-		   len1, err1, err2);
-	return SK_PASS;
-}
-
-SEC("sk_msg3")
-int bpf_prog6(struct sk_msg_md *msg)
-{
-	int *bytes, zero = 0, one = 1, key = 0;
-	int *start, *end, *f;
-	__u64 flags = 0;
-
-	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
-	if (bytes)
-		bpf_msg_apply_bytes(msg, *bytes);
-	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
-	if (bytes)
-		bpf_msg_cork_bytes(msg, *bytes);
-	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
-	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
-	if (start && end)
-		bpf_msg_pull_data(msg, *start, *end, 0);
-	f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
-	if (f && *f) {
-		key = 2;
-		flags = *f;
-	}
-	return bpf_msg_redirect_map(msg, &sock_map_redir, key, flags);
-}
-
-SEC("sk_msg4")
-int bpf_prog7(struct sk_msg_md *msg)
-{
-	int err1 = 0, err2 = 0, zero = 0, one = 1, key = 0;
-	int *f, *bytes, *start, *end, len1, len2;
-	__u64 flags = 0;
-
-		int err;
-	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
-	if (bytes)
-		err1 = bpf_msg_apply_bytes(msg, *bytes);
-	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
-	if (bytes)
-		err2 = bpf_msg_cork_bytes(msg, *bytes);
-	len1 = (__u64)msg->data_end - (__u64)msg->data;
-	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
-	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
-	if (start && end) {
-
-		bpf_printk("sk_msg2: pull(%i:%i)\n",
-			   start ? *start : 0, end ? *end : 0);
-		err = bpf_msg_pull_data(msg, *start, *end, 0);
-		if (err)
-			bpf_printk("sk_msg2: pull_data err %i\n",
-				   err);
-		len2 = (__u64)msg->data_end - (__u64)msg->data;
-		bpf_printk("sk_msg2: length update %i->%i\n",
-			   len1, len2);
-	}
-	f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
-	if (f && *f) {
-		key = 2;
-		flags = *f;
-	}
-	bpf_printk("sk_msg3: redirect(%iB) flags=%i err=%i\n",
-		   len1, flags, err1 ? err1 : err2);
-	err = bpf_msg_redirect_map(msg, &sock_map_redir, key, flags);
-	bpf_printk("sk_msg3: err %i\n", err);
-	return err;
-}
-
-SEC("sk_msg5")
-int bpf_prog8(struct sk_msg_md *msg)
-{
-	void *data_end = (void *)(long) msg->data_end;
-	void *data = (void *)(long) msg->data;
-	int ret = 0, *bytes, zero = 0;
-
-	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
-	if (bytes) {
-		ret = bpf_msg_apply_bytes(msg, *bytes);
-		if (ret)
-			return SK_DROP;
-	} else {
-		return SK_DROP;
-	}
-	return SK_PASS;
-}
-SEC("sk_msg6")
-int bpf_prog9(struct sk_msg_md *msg)
-{
-	void *data_end = (void *)(long) msg->data_end;
-	void *data = (void *)(long) msg->data;
-	int ret = 0, *bytes, zero = 0;
-
-	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
-	if (bytes) {
-		if (((__u64)data_end - (__u64)data) >= *bytes)
-			return SK_PASS;
-		ret = bpf_msg_cork_bytes(msg, *bytes);
-		if (ret)
-			return SK_DROP;
-	}
-	return SK_PASS;
-}
-
-SEC("sk_msg7")
-int bpf_prog10(struct sk_msg_md *msg)
-{
-	int *bytes, zero = 0, one = 1;
-	int *start, *end;
-
-	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
-	if (bytes)
-		bpf_msg_apply_bytes(msg, *bytes);
-	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
-	if (bytes)
-		bpf_msg_cork_bytes(msg, *bytes);
-	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
-	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
-	if (start && end)
-		bpf_msg_pull_data(msg, *start, *end, 0);
-
-	return SK_DROP;
-}
-
-
-char _license[] SEC("license") = "GPL";
diff --git a/samples/sockmap/sockmap_test.sh b/samples/sockmap/sockmap_test.sh
deleted file mode 100755
index ace75f070eb8..000000000000
--- a/samples/sockmap/sockmap_test.sh
+++ /dev/null
@@ -1,488 +0,0 @@
-#Test a bunch of positive cases to verify basic functionality
-for prog in  "--txmsg_redir --txmsg_skb" "--txmsg_redir --txmsg_ingress" "--txmsg" "--txmsg_redir" "--txmsg_redir --txmsg_ingress" "--txmsg_drop"; do
-for t in "sendmsg" "sendpage"; do
-for r in 1 10 100; do
-	for i in 1 10 100; do
-		for l in 1 10 100; do
-			TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-			echo $TEST
-			$TEST
-			sleep 2
-		done
-	done
-done
-done
-done
-
-#Test max iov
-t="sendmsg"
-r=1
-i=1024
-l=1
-prog="--txmsg"
-
-TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-echo $TEST
-$TEST
-sleep 2
-prog="--txmsg_redir"
-TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-echo $TEST
-$TEST
-
-# Test max iov with 1k send
-
-t="sendmsg"
-r=1
-i=1024
-l=1024
-prog="--txmsg"
-
-TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-echo $TEST
-$TEST
-sleep 2
-prog="--txmsg_redir"
-TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-echo $TEST
-$TEST
-sleep 2
-
-# Test apply with 1B
-r=1
-i=1024
-l=1024
-prog="--txmsg_apply 1"
-
-for t in "sendmsg" "sendpage"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-# Test apply with larger value than send
-r=1
-i=8
-l=1024
-prog="--txmsg_apply 2048"
-
-for t in "sendmsg" "sendpage"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-# Test apply with apply that never reaches limit
-r=1024
-i=1
-l=1
-prog="--txmsg_apply 2048"
-
-for t in "sendmsg" "sendpage"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-# Test apply and redirect with 1B
-r=1
-i=1024
-l=1024
-prog="--txmsg_redir --txmsg_apply 1"
-
-for t in "sendmsg" "sendpage"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-prog="--txmsg_redir --txmsg_apply 1 --txmsg_ingress"
-
-for t in "sendmsg" "sendpage"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-prog="--txmsg_redir --txmsg_apply 1 --txmsg_skb"
-
-for t in "sendmsg" "sendpage"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-
-# Test apply and redirect with larger value than send
-r=1
-i=8
-l=1024
-prog="--txmsg_redir --txmsg_apply 2048"
-
-for t in "sendmsg" "sendpage"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-prog="--txmsg_redir --txmsg_apply 2048 --txmsg_ingress"
-
-for t in "sendmsg" "sendpage"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-prog="--txmsg_redir --txmsg_apply 2048 --txmsg_skb"
-
-for t in "sendmsg" "sendpage"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-
-# Test apply and redirect with apply that never reaches limit
-r=1024
-i=1
-l=1
-prog="--txmsg_apply 2048"
-
-for t in "sendmsg" "sendpage"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-# Test cork with 1B not really useful but test it anyways
-r=1
-i=1024
-l=1024
-prog="--txmsg_cork 1"
-
-for t in "sendpage" "sendmsg"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-# Test cork with a more reasonable 100B
-r=1
-i=1000
-l=1000
-prog="--txmsg_cork 100"
-
-for t in "sendpage" "sendmsg"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-# Test cork with larger value than send
-r=1
-i=8
-l=1024
-prog="--txmsg_cork 2048"
-
-for t in "sendpage" "sendmsg"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-# Test cork with cork that never reaches limit
-r=1024
-i=1
-l=1
-prog="--txmsg_cork 2048"
-
-for t in "sendpage" "sendmsg"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-r=1
-i=1024
-l=1024
-prog="--txmsg_redir --txmsg_cork 1"
-
-for t in "sendpage" "sendmsg"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-# Test cork with a more reasonable 100B
-r=1
-i=1000
-l=1000
-prog="--txmsg_redir --txmsg_cork 100"
-
-for t in "sendpage" "sendmsg"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-# Test cork with larger value than send
-r=1
-i=8
-l=1024
-prog="--txmsg_redir --txmsg_cork 2048"
-
-for t in "sendpage" "sendmsg"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-# Test cork with cork that never reaches limit
-r=1024
-i=1
-l=1
-prog="--txmsg_cork 2048"
-
-for t in "sendpage" "sendmsg"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-
-# mix and match cork and apply not really useful but valid programs
-
-# Test apply < cork
-r=100
-i=1
-l=5
-prog="--txmsg_apply 10 --txmsg_cork 100"
-for t in "sendpage" "sendmsg"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-# Try again with larger sizes so we hit overflow case
-r=100
-i=1000
-l=2048
-prog="--txmsg_apply 4096 --txmsg_cork 8096"
-for t in "sendpage" "sendmsg"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-# Test apply > cork
-r=100
-i=1
-l=5
-prog="--txmsg_apply 100 --txmsg_cork 10"
-for t in "sendpage" "sendmsg"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-# Again with larger sizes so we hit overflow cases
-r=100
-i=1000
-l=2048
-prog="--txmsg_apply 8096 --txmsg_cork 4096"
-for t in "sendpage" "sendmsg"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-
-# Test apply = cork
-r=100
-i=1
-l=5
-prog="--txmsg_apply 10 --txmsg_cork 10"
-for t in "sendpage" "sendmsg"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-r=100
-i=1000
-l=2048
-prog="--txmsg_apply 4096 --txmsg_cork 4096"
-for t in "sendpage" "sendmsg"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-# Test apply < cork
-r=100
-i=1
-l=5
-prog="--txmsg_redir --txmsg_apply 10 --txmsg_cork 100"
-for t in "sendpage" "sendmsg"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-# Try again with larger sizes so we hit overflow case
-r=100
-i=1000
-l=2048
-prog="--txmsg_redir --txmsg_apply 4096 --txmsg_cork 8096"
-for t in "sendpage" "sendmsg"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-# Test apply > cork
-r=100
-i=1
-l=5
-prog="--txmsg_redir --txmsg_apply 100 --txmsg_cork 10"
-for t in "sendpage" "sendmsg"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-# Again with larger sizes so we hit overflow cases
-r=100
-i=1000
-l=2048
-prog="--txmsg_redir --txmsg_apply 8096 --txmsg_cork 4096"
-for t in "sendpage" "sendmsg"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-
-# Test apply = cork
-r=100
-i=1
-l=5
-prog="--txmsg_redir --txmsg_apply 10 --txmsg_cork 10"
-for t in "sendpage" "sendmsg"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-r=100
-i=1000
-l=2048
-prog="--txmsg_redir --txmsg_apply 4096 --txmsg_cork 4096"
-for t in "sendpage" "sendmsg"; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-# Tests for bpf_msg_pull_data()
-for i in `seq 99 100 1600`; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
-		--txmsg --txmsg_start 0 --txmsg_end $i --txmsg_cork 1600"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-for i in `seq 199 100 1600`; do
-	TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
-		--txmsg --txmsg_start 100 --txmsg_end $i --txmsg_cork 1600"
-	echo $TEST
-	$TEST
-	sleep 2
-done
-
-TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
-	--txmsg --txmsg_start 1500 --txmsg_end 1600 --txmsg_cork 1600"
-echo $TEST
-$TEST
-sleep 2
-
-TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
-	--txmsg --txmsg_start 1111 --txmsg_end 1112 --txmsg_cork 1600"
-echo $TEST
-$TEST
-sleep 2
-
-TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
-	--txmsg --txmsg_start 1111 --txmsg_end 0 --txmsg_cork 1600"
-echo $TEST
-$TEST
-sleep 2
-
-TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
-	--txmsg --txmsg_start 0 --txmsg_end 1601 --txmsg_cork 1600"
-echo $TEST
-$TEST
-sleep 2
-
-TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \
-	--txmsg --txmsg_start 0 --txmsg_end 1601 --txmsg_cork 1602"
-echo $TEST
-$TEST
-sleep 2
-
-# Run through gamut again with start and end
-for prog in "--txmsg" "--txmsg_redir" "--txmsg_drop"; do
-for t in "sendmsg" "sendpage"; do
-for r in 1 10 100; do
-	for i in 1 10 100; do
-		for l in 1 10 100; do
-			TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog --txmsg_start 1 --txmsg_end 2"
-			echo $TEST
-			$TEST
-			sleep 2
-		done
-	done
-done
-done
-done
-
-# Some specific tests to cover specific code paths
-./sockmap --cgroup /mnt/cgroup2/ -t sendpage \
-	-r 5 -i 1 -l 1 --txmsg_redir --txmsg_cork 5 --txmsg_apply 3
-./sockmap --cgroup /mnt/cgroup2/ -t sendmsg \
-	-r 5 -i 1 -l 1 --txmsg_redir --txmsg_cork 5 --txmsg_apply 3
-./sockmap --cgroup /mnt/cgroup2/ -t sendpage \
-	-r 5 -i 1 -l 1 --txmsg_redir --txmsg_cork 5 --txmsg_apply 5
-./sockmap --cgroup /mnt/cgroup2/ -t sendmsg \
-	-r 5 -i 1 -l 1 --txmsg_redir --txmsg_cork 5 --txmsg_apply 5
diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c
deleted file mode 100644
index 22595b43ecc5..000000000000
--- a/samples/sockmap/sockmap_user.c
+++ /dev/null
@@ -1,1372 +0,0 @@
-/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/socket.h>
-#include <sys/ioctl.h>
-#include <sys/select.h>
-#include <netinet/in.h>
-#include <arpa/inet.h>
-#include <unistd.h>
-#include <string.h>
-#include <errno.h>
-#include <sys/ioctl.h>
-#include <stdbool.h>
-#include <signal.h>
-#include <fcntl.h>
-#include <sys/wait.h>
-#include <time.h>
-#include <sched.h>
-
-#include <sys/time.h>
-#include <sys/resource.h>
-#include <sys/types.h>
-#include <sys/sendfile.h>
-
-#include <linux/netlink.h>
-#include <linux/socket.h>
-#include <linux/sock_diag.h>
-#include <linux/bpf.h>
-#include <linux/if_link.h>
-#include <assert.h>
-#include <libgen.h>
-
-#include <getopt.h>
-
-#include "../bpf/bpf_load.h"
-#include "../bpf/bpf_util.h"
-#include "../bpf/libbpf.h"
-
-int running;
-void running_handler(int a);
-
-/* randomly selected ports for testing on lo */
-#define S1_PORT 10000
-#define S2_PORT 10001
-
-#define BPF_FILENAME "sockmap_kern.o"
-#define BPF_CGRP "/mnt/cgroup2/"
-
-/* global sockets */
-int s1, s2, c1, c2, p1, p2;
-int test_cnt;
-
-int txmsg_pass;
-int txmsg_noisy;
-int txmsg_redir;
-int txmsg_redir_noisy;
-int txmsg_drop;
-int txmsg_apply;
-int txmsg_cork;
-int txmsg_start;
-int txmsg_end;
-int txmsg_ingress;
-int txmsg_skb;
-
-static const struct option long_options[] = {
-	{"help",	no_argument,		NULL, 'h' },
-	{"cgroup",	required_argument,	NULL, 'c' },
-	{"rate",	required_argument,	NULL, 'r' },
-	{"verbose",	no_argument,		NULL, 'v' },
-	{"iov_count",	required_argument,	NULL, 'i' },
-	{"length",	required_argument,	NULL, 'l' },
-	{"test",	required_argument,	NULL, 't' },
-	{"data_test",   no_argument,		NULL, 'd' },
-	{"txmsg",		no_argument,	&txmsg_pass,  1  },
-	{"txmsg_noisy",		no_argument,	&txmsg_noisy, 1  },
-	{"txmsg_redir",		no_argument,	&txmsg_redir, 1  },
-	{"txmsg_redir_noisy",	no_argument,	&txmsg_redir_noisy, 1},
-	{"txmsg_drop",		no_argument,	&txmsg_drop, 1 },
-	{"txmsg_apply",	required_argument,	NULL, 'a'},
-	{"txmsg_cork",	required_argument,	NULL, 'k'},
-	{"txmsg_start", required_argument,	NULL, 's'},
-	{"txmsg_end",	required_argument,	NULL, 'e'},
-	{"txmsg_ingress", no_argument,		&txmsg_ingress, 1 },
-	{"txmsg_skb", no_argument,		&txmsg_skb, 1 },
-	{0, 0, NULL, 0 }
-};
-
-static void usage(char *argv[])
-{
-	int i;
-
-	printf(" Usage: %s --cgroup <cgroup_path>\n", argv[0]);
-	printf(" options:\n");
-	for (i = 0; long_options[i].name != 0; i++) {
-		printf(" --%-12s", long_options[i].name);
-		if (long_options[i].flag != NULL)
-			printf(" flag (internal value:%d)\n",
-				*long_options[i].flag);
-		else
-			printf(" -%c\n", long_options[i].val);
-	}
-	printf("\n");
-}
-
-static int sockmap_init_sockets(int verbose)
-{
-	int i, err, one = 1;
-	struct sockaddr_in addr;
-	int *fds[4] = {&s1, &s2, &c1, &c2};
-
-	s1 = s2 = p1 = p2 = c1 = c2 = 0;
-
-	/* Init sockets */
-	for (i = 0; i < 4; i++) {
-		*fds[i] = socket(AF_INET, SOCK_STREAM, 0);
-		if (*fds[i] < 0) {
-			perror("socket s1 failed()");
-			return errno;
-		}
-	}
-
-	/* Allow reuse */
-	for (i = 0; i < 2; i++) {
-		err = setsockopt(*fds[i], SOL_SOCKET, SO_REUSEADDR,
-				 (char *)&one, sizeof(one));
-		if (err) {
-			perror("setsockopt failed()");
-			return errno;
-		}
-	}
-
-	/* Non-blocking sockets */
-	for (i = 0; i < 2; i++) {
-		err = ioctl(*fds[i], FIONBIO, (char *)&one);
-		if (err < 0) {
-			perror("ioctl s1 failed()");
-			return errno;
-		}
-	}
-
-	/* Bind server sockets */
-	memset(&addr, 0, sizeof(struct sockaddr_in));
-	addr.sin_family = AF_INET;
-	addr.sin_addr.s_addr = inet_addr("127.0.0.1");
-
-	addr.sin_port = htons(S1_PORT);
-	err = bind(s1, (struct sockaddr *)&addr, sizeof(addr));
-	if (err < 0) {
-		perror("bind s1 failed()\n");
-		return errno;
-	}
-
-	addr.sin_port = htons(S2_PORT);
-	err = bind(s2, (struct sockaddr *)&addr, sizeof(addr));
-	if (err < 0) {
-		perror("bind s2 failed()\n");
-		return errno;
-	}
-
-	/* Listen server sockets */
-	addr.sin_port = htons(S1_PORT);
-	err = listen(s1, 32);
-	if (err < 0) {
-		perror("listen s1 failed()\n");
-		return errno;
-	}
-
-	addr.sin_port = htons(S2_PORT);
-	err = listen(s2, 32);
-	if (err < 0) {
-		perror("listen s1 failed()\n");
-		return errno;
-	}
-
-	/* Initiate Connect */
-	addr.sin_port = htons(S1_PORT);
-	err = connect(c1, (struct sockaddr *)&addr, sizeof(addr));
-	if (err < 0 && errno != EINPROGRESS) {
-		perror("connect c1 failed()\n");
-		return errno;
-	}
-
-	addr.sin_port = htons(S2_PORT);
-	err = connect(c2, (struct sockaddr *)&addr, sizeof(addr));
-	if (err < 0 && errno != EINPROGRESS) {
-		perror("connect c2 failed()\n");
-		return errno;
-	} else if (err < 0) {
-		err = 0;
-	}
-
-	/* Accept Connecrtions */
-	p1 = accept(s1, NULL, NULL);
-	if (p1 < 0) {
-		perror("accept s1 failed()\n");
-		return errno;
-	}
-
-	p2 = accept(s2, NULL, NULL);
-	if (p2 < 0) {
-		perror("accept s1 failed()\n");
-		return errno;
-	}
-
-	if (verbose) {
-		printf("connected sockets: c1 <-> p1, c2 <-> p2\n");
-		printf("cgroups binding: c1(%i) <-> s1(%i) - - - c2(%i) <-> s2(%i)\n",
-			c1, s1, c2, s2);
-	}
-	return 0;
-}
-
-struct msg_stats {
-	size_t bytes_sent;
-	size_t bytes_recvd;
-	struct timespec start;
-	struct timespec end;
-};
-
-struct sockmap_options {
-	int verbose;
-	bool base;
-	bool sendpage;
-	bool data_test;
-	bool drop_expected;
-	int iov_count;
-	int iov_length;
-	int rate;
-};
-
-static int msg_loop_sendpage(int fd, int iov_length, int cnt,
-			     struct msg_stats *s,
-			     struct sockmap_options *opt)
-{
-	bool drop = opt->drop_expected;
-	unsigned char k = 0;
-	FILE *file;
-	int i, fp;
-
-	file = fopen(".sendpage_tst.tmp", "w+");
-	for (i = 0; i < iov_length * cnt; i++, k++)
-		fwrite(&k, sizeof(char), 1, file);
-	fflush(file);
-	fseek(file, 0, SEEK_SET);
-	fclose(file);
-
-	fp = open(".sendpage_tst.tmp", O_RDONLY);
-	clock_gettime(CLOCK_MONOTONIC, &s->start);
-	for (i = 0; i < cnt; i++) {
-		int sent = sendfile(fd, fp, NULL, iov_length);
-
-		if (!drop && sent < 0) {
-			perror("send loop error:");
-			close(fp);
-			return sent;
-		} else if (drop && sent >= 0) {
-			printf("sendpage loop error expected: %i\n", sent);
-			close(fp);
-			return -EIO;
-		}
-
-		if (sent > 0)
-			s->bytes_sent += sent;
-	}
-	clock_gettime(CLOCK_MONOTONIC, &s->end);
-	close(fp);
-	return 0;
-}
-
-static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
-		    struct msg_stats *s, bool tx,
-		    struct sockmap_options *opt)
-{
-	struct msghdr msg = {0};
-	int err, i, flags = MSG_NOSIGNAL;
-	struct iovec *iov;
-	unsigned char k;
-	bool data_test = opt->data_test;
-	bool drop = opt->drop_expected;
-
-	iov = calloc(iov_count, sizeof(struct iovec));
-	if (!iov)
-		return errno;
-
-	k = 0;
-	for (i = 0; i < iov_count; i++) {
-		unsigned char *d = calloc(iov_length, sizeof(char));
-
-		if (!d) {
-			fprintf(stderr, "iov_count %i/%i OOM\n", i, iov_count);
-			goto out_errno;
-		}
-		iov[i].iov_base = d;
-		iov[i].iov_len = iov_length;
-
-		if (data_test && tx) {
-			int j;
-
-			for (j = 0; j < iov_length; j++)
-				d[j] = k++;
-		}
-	}
-
-	msg.msg_iov = iov;
-	msg.msg_iovlen = iov_count;
-	k = 0;
-
-	if (tx) {
-		clock_gettime(CLOCK_MONOTONIC, &s->start);
-		for (i = 0; i < cnt; i++) {
-			int sent = sendmsg(fd, &msg, flags);
-
-			if (!drop && sent < 0) {
-				perror("send loop error:");
-				goto out_errno;
-			} else if (drop && sent >= 0) {
-				printf("send loop error expected: %i\n", sent);
-				errno = -EIO;
-				goto out_errno;
-			}
-			if (sent > 0)
-				s->bytes_sent += sent;
-		}
-		clock_gettime(CLOCK_MONOTONIC, &s->end);
-	} else {
-		int slct, recv, max_fd = fd;
-		int fd_flags = O_NONBLOCK;
-		struct timeval timeout;
-		float total_bytes;
-		fd_set w;
-
-		fcntl(fd, fd_flags);
-		total_bytes = (float)iov_count * (float)iov_length * (float)cnt;
-		err = clock_gettime(CLOCK_MONOTONIC, &s->start);
-		if (err < 0)
-			perror("recv start time: ");
-		while (s->bytes_recvd < total_bytes) {
-			timeout.tv_sec = 1;
-			timeout.tv_usec = 0;
-
-			/* FD sets */
-			FD_ZERO(&w);
-			FD_SET(fd, &w);
-
-			slct = select(max_fd + 1, &w, NULL, NULL, &timeout);
-			if (slct == -1) {
-				perror("select()");
-				clock_gettime(CLOCK_MONOTONIC, &s->end);
-				goto out_errno;
-			} else if (!slct) {
-				if (opt->verbose)
-					fprintf(stderr, "unexpected timeout\n");
-				errno = -EIO;
-				clock_gettime(CLOCK_MONOTONIC, &s->end);
-				goto out_errno;
-			}
-
-			recv = recvmsg(fd, &msg, flags);
-			if (recv < 0) {
-				if (errno != EWOULDBLOCK) {
-					clock_gettime(CLOCK_MONOTONIC, &s->end);
-					perror("recv failed()\n");
-					goto out_errno;
-				}
-			}
-
-			s->bytes_recvd += recv;
-
-			if (data_test) {
-				int j;
-
-				for (i = 0; i < msg.msg_iovlen; i++) {
-					unsigned char *d = iov[i].iov_base;
-
-					for (j = 0;
-					     j < iov[i].iov_len && recv; j++) {
-						if (d[j] != k++) {
-							errno = -EIO;
-							fprintf(stderr,
-								"detected data corruption @iov[%i]:%i %02x != %02x, %02x ?= %02x\n",
-								i, j, d[j], k - 1, d[j+1], k + 1);
-							goto out_errno;
-						}
-						recv--;
-					}
-				}
-			}
-		}
-		clock_gettime(CLOCK_MONOTONIC, &s->end);
-	}
-
-	for (i = 0; i < iov_count; i++)
-		free(iov[i].iov_base);
-	free(iov);
-	return 0;
-out_errno:
-	for (i = 0; i < iov_count; i++)
-		free(iov[i].iov_base);
-	free(iov);
-	return errno;
-}
-
-static float giga = 1000000000;
-
-static inline float sentBps(struct msg_stats s)
-{
-	return s.bytes_sent / (s.end.tv_sec - s.start.tv_sec);
-}
-
-static inline float recvdBps(struct msg_stats s)
-{
-	return s.bytes_recvd / (s.end.tv_sec - s.start.tv_sec);
-}
-
-static int sendmsg_test(struct sockmap_options *opt)
-{
-	float sent_Bps = 0, recvd_Bps = 0;
-	int rx_fd, txpid, rxpid, err = 0;
-	struct msg_stats s = {0};
-	int iov_count = opt->iov_count;
-	int iov_buf = opt->iov_length;
-	int cnt = opt->rate;
-	int status;
-
-	errno = 0;
-
-	if (opt->base)
-		rx_fd = p1;
-	else
-		rx_fd = p2;
-
-	rxpid = fork();
-	if (rxpid == 0) {
-		if (opt->drop_expected)
-			exit(1);
-
-		if (opt->sendpage)
-			iov_count = 1;
-		err = msg_loop(rx_fd, iov_count, iov_buf,
-			       cnt, &s, false, opt);
-		if (err && opt->verbose)
-			fprintf(stderr,
-				"msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n",
-				iov_count, iov_buf, cnt, err);
-		shutdown(p2, SHUT_RDWR);
-		shutdown(p1, SHUT_RDWR);
-		if (s.end.tv_sec - s.start.tv_sec) {
-			sent_Bps = sentBps(s);
-			recvd_Bps = recvdBps(s);
-		}
-		if (opt->verbose)
-			fprintf(stdout,
-				"rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s\n",
-				s.bytes_sent, sent_Bps, sent_Bps/giga,
-				s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
-		exit(1);
-	} else if (rxpid == -1) {
-		perror("msg_loop_rx: ");
-		return errno;
-	}
-
-	txpid = fork();
-	if (txpid == 0) {
-		if (opt->sendpage)
-			err = msg_loop_sendpage(c1, iov_buf, cnt, &s, opt);
-		else
-			err = msg_loop(c1, iov_count, iov_buf,
-				       cnt, &s, true, opt);
-
-		if (err)
-			fprintf(stderr,
-				"msg_loop_tx: iov_count %i iov_buf %i cnt %i err %i\n",
-				iov_count, iov_buf, cnt, err);
-		shutdown(c1, SHUT_RDWR);
-		if (s.end.tv_sec - s.start.tv_sec) {
-			sent_Bps = sentBps(s);
-			recvd_Bps = recvdBps(s);
-		}
-		if (opt->verbose)
-			fprintf(stdout,
-				"tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB %fB/s %fGB/s\n",
-				s.bytes_sent, sent_Bps, sent_Bps/giga,
-				s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
-		exit(1);
-	} else if (txpid == -1) {
-		perror("msg_loop_tx: ");
-		return errno;
-	}
-
-	assert(waitpid(rxpid, &status, 0) == rxpid);
-	assert(waitpid(txpid, &status, 0) == txpid);
-	return err;
-}
-
-static int forever_ping_pong(int rate, struct sockmap_options *opt)
-{
-	struct timeval timeout;
-	char buf[1024] = {0};
-	int sc;
-
-	timeout.tv_sec = 10;
-	timeout.tv_usec = 0;
-
-	/* Ping/Pong data from client to server */
-	sc = send(c1, buf, sizeof(buf), 0);
-	if (sc < 0) {
-		perror("send failed()\n");
-		return sc;
-	}
-
-	do {
-		int s, rc, i, max_fd = p2;
-		fd_set w;
-
-		/* FD sets */
-		FD_ZERO(&w);
-		FD_SET(c1, &w);
-		FD_SET(c2, &w);
-		FD_SET(p1, &w);
-		FD_SET(p2, &w);
-
-		s = select(max_fd + 1, &w, NULL, NULL, &timeout);
-		if (s == -1) {
-			perror("select()");
-			break;
-		} else if (!s) {
-			fprintf(stderr, "unexpected timeout\n");
-			break;
-		}
-
-		for (i = 0; i <= max_fd && s > 0; ++i) {
-			if (!FD_ISSET(i, &w))
-				continue;
-
-			s--;
-
-			rc = recv(i, buf, sizeof(buf), 0);
-			if (rc < 0) {
-				if (errno != EWOULDBLOCK) {
-					perror("recv failed()\n");
-					return rc;
-				}
-			}
-
-			if (rc == 0) {
-				close(i);
-				break;
-			}
-
-			sc = send(i, buf, rc, 0);
-			if (sc < 0) {
-				perror("send failed()\n");
-				return sc;
-			}
-		}
-
-		if (rate)
-			sleep(rate);
-
-		if (opt->verbose) {
-			printf(".");
-			fflush(stdout);
-
-		}
-	} while (running);
-
-	return 0;
-}
-
-enum {
-	PING_PONG,
-	SENDMSG,
-	BASE,
-	BASE_SENDPAGE,
-	SENDPAGE,
-};
-
-static int run_options(struct sockmap_options *options, int cg_fd,  int test)
-{
-	int i, key, next_key, err, tx_prog_fd = -1, zero = 0;
-
-	/* If base test skip BPF setup */
-	if (test == BASE || test == BASE_SENDPAGE)
-		goto run;
-
-	/* Attach programs to sockmap */
-	err = bpf_prog_attach(prog_fd[0], map_fd[0],
-				BPF_SK_SKB_STREAM_PARSER, 0);
-	if (err) {
-		fprintf(stderr, "ERROR: bpf_prog_attach (sockmap): %d (%s)\n",
-			err, strerror(errno));
-		return err;
-	}
-
-	err = bpf_prog_attach(prog_fd[1], map_fd[0],
-				BPF_SK_SKB_STREAM_VERDICT, 0);
-	if (err) {
-		fprintf(stderr, "ERROR: bpf_prog_attach (sockmap): %d (%s)\n",
-			err, strerror(errno));
-		return err;
-	}
-
-	/* Attach to cgroups */
-	err = bpf_prog_attach(prog_fd[2], cg_fd, BPF_CGROUP_SOCK_OPS, 0);
-	if (err) {
-		fprintf(stderr, "ERROR: bpf_prog_attach (groups): %d (%s)\n",
-			err, strerror(errno));
-		return err;
-	}
-
-run:
-	err = sockmap_init_sockets(options->verbose);
-	if (err) {
-		fprintf(stderr, "ERROR: test socket failed: %d\n", err);
-		goto out;
-	}
-
-	/* Attach txmsg program to sockmap */
-	if (txmsg_pass)
-		tx_prog_fd = prog_fd[3];
-	else if (txmsg_noisy)
-		tx_prog_fd = prog_fd[4];
-	else if (txmsg_redir)
-		tx_prog_fd = prog_fd[5];
-	else if (txmsg_redir_noisy)
-		tx_prog_fd = prog_fd[6];
-	else if (txmsg_drop)
-		tx_prog_fd = prog_fd[9];
-	/* apply and cork must be last */
-	else if (txmsg_apply)
-		tx_prog_fd = prog_fd[7];
-	else if (txmsg_cork)
-		tx_prog_fd = prog_fd[8];
-	else
-		tx_prog_fd = 0;
-
-	if (tx_prog_fd) {
-		int redir_fd, i = 0;
-
-		err = bpf_prog_attach(tx_prog_fd,
-				      map_fd[1], BPF_SK_MSG_VERDICT, 0);
-		if (err) {
-			fprintf(stderr,
-				"ERROR: bpf_prog_attach (txmsg): %d (%s)\n",
-				err, strerror(errno));
-			goto out;
-		}
-
-		err = bpf_map_update_elem(map_fd[1], &i, &c1, BPF_ANY);
-		if (err) {
-			fprintf(stderr,
-				"ERROR: bpf_map_update_elem (txmsg):  %d (%s\n",
-				err, strerror(errno));
-			goto out;
-		}
-
-		if (txmsg_redir || txmsg_redir_noisy)
-			redir_fd = c2;
-		else
-			redir_fd = c1;
-
-		err = bpf_map_update_elem(map_fd[2], &i, &redir_fd, BPF_ANY);
-		if (err) {
-			fprintf(stderr,
-				"ERROR: bpf_map_update_elem (txmsg):  %d (%s\n",
-				err, strerror(errno));
-			goto out;
-		}
-
-		if (txmsg_apply) {
-			err = bpf_map_update_elem(map_fd[3],
-						  &i, &txmsg_apply, BPF_ANY);
-			if (err) {
-				fprintf(stderr,
-					"ERROR: bpf_map_update_elem (apply_bytes):  %d (%s\n",
-					err, strerror(errno));
-				goto out;
-			}
-		}
-
-		if (txmsg_cork) {
-			err = bpf_map_update_elem(map_fd[4],
-						  &i, &txmsg_cork, BPF_ANY);
-			if (err) {
-				fprintf(stderr,
-					"ERROR: bpf_map_update_elem (cork_bytes):  %d (%s\n",
-					err, strerror(errno));
-				goto out;
-			}
-		}
-
-		if (txmsg_start) {
-			err = bpf_map_update_elem(map_fd[5],
-						  &i, &txmsg_start, BPF_ANY);
-			if (err) {
-				fprintf(stderr,
-					"ERROR: bpf_map_update_elem (txmsg_start):  %d (%s)\n",
-					err, strerror(errno));
-				goto out;
-			}
-		}
-
-		if (txmsg_end) {
-			i = 1;
-			err = bpf_map_update_elem(map_fd[5],
-						  &i, &txmsg_end, BPF_ANY);
-			if (err) {
-				fprintf(stderr,
-					"ERROR: bpf_map_update_elem (txmsg_end):  %d (%s)\n",
-					err, strerror(errno));
-				goto out;
-			}
-		}
-
-		if (txmsg_ingress) {
-			int in = BPF_F_INGRESS;
-
-			i = 0;
-			err = bpf_map_update_elem(map_fd[6], &i, &in, BPF_ANY);
-			if (err) {
-				fprintf(stderr,
-					"ERROR: bpf_map_update_elem (txmsg_ingress): %d (%s)\n",
-					err, strerror(errno));
-			}
-			i = 1;
-			err = bpf_map_update_elem(map_fd[1], &i, &p1, BPF_ANY);
-			if (err) {
-				fprintf(stderr,
-					"ERROR: bpf_map_update_elem (p1 txmsg): %d (%s)\n",
-					err, strerror(errno));
-			}
-			err = bpf_map_update_elem(map_fd[2], &i, &p1, BPF_ANY);
-			if (err) {
-				fprintf(stderr,
-					"ERROR: bpf_map_update_elem (p1 redir): %d (%s)\n",
-					err, strerror(errno));
-			}
-
-			i = 2;
-			err = bpf_map_update_elem(map_fd[2], &i, &p2, BPF_ANY);
-			if (err) {
-				fprintf(stderr,
-					"ERROR: bpf_map_update_elem (p2 txmsg): %d (%s)\n",
-					err, strerror(errno));
-			}
-		}
-
-		if (txmsg_skb) {
-			int skb_fd = (test == SENDMSG || test == SENDPAGE) ? p2 : p1;
-			int ingress = BPF_F_INGRESS;
-
-			i = 0;
-			err = bpf_map_update_elem(map_fd[7], &i, &ingress, BPF_ANY);
-			if (err) {
-				fprintf(stderr,
-					"ERROR: bpf_map_update_elem (txmsg_ingress): %d (%s)\n",
-					err, strerror(errno));
-			}
-
-			i = 3;
-			err = bpf_map_update_elem(map_fd[0], &i, &skb_fd, BPF_ANY);
-			if (err) {
-				fprintf(stderr,
-					"ERROR: bpf_map_update_elem (c1 sockmap): %d (%s)\n",
-					err, strerror(errno));
-			}
-		}
-	}
-
-	if (txmsg_drop)
-		options->drop_expected = true;
-
-	if (test == PING_PONG)
-		err = forever_ping_pong(options->rate, options);
-	else if (test == SENDMSG) {
-		options->base = false;
-		options->sendpage = false;
-		err = sendmsg_test(options);
-	} else if (test == SENDPAGE) {
-		options->base = false;
-		options->sendpage = true;
-		err = sendmsg_test(options);
-	} else if (test == BASE) {
-		options->base = true;
-		options->sendpage = false;
-		err = sendmsg_test(options);
-	} else if (test == BASE_SENDPAGE) {
-		options->base = true;
-		options->sendpage = true;
-		err = sendmsg_test(options);
-	} else
-		fprintf(stderr, "unknown test\n");
-out:
-	/* Detatch and zero all the maps */
-	bpf_prog_detach2(prog_fd[2], cg_fd, BPF_CGROUP_SOCK_OPS);
-	bpf_prog_detach2(prog_fd[0], map_fd[0], BPF_SK_SKB_STREAM_PARSER);
-	bpf_prog_detach2(prog_fd[1], map_fd[0], BPF_SK_SKB_STREAM_VERDICT);
-	if (tx_prog_fd >= 0)
-		bpf_prog_detach2(tx_prog_fd, map_fd[1], BPF_SK_MSG_VERDICT);
-
-	for (i = 0; i < 8; i++) {
-		key = next_key = 0;
-		bpf_map_update_elem(map_fd[i], &key, &zero, BPF_ANY);
-		while (bpf_map_get_next_key(map_fd[i], &key, &next_key) == 0) {
-			bpf_map_update_elem(map_fd[i], &key, &zero, BPF_ANY);
-			key = next_key;
-		}
-	}
-
-	close(s1);
-	close(s2);
-	close(p1);
-	close(p2);
-	close(c1);
-	close(c2);
-	return err;
-}
-
-static char *test_to_str(int test)
-{
-	switch (test) {
-	case SENDMSG:
-		return "sendmsg";
-	case SENDPAGE:
-		return "sendpage";
-	}
-	return "unknown";
-}
-
-#define OPTSTRING 60
-static void test_options(char *options)
-{
-	memset(options, 0, OPTSTRING);
-
-	if (txmsg_pass)
-		strncat(options, "pass,", OPTSTRING);
-	if (txmsg_noisy)
-		strncat(options, "pass_noisy,", OPTSTRING);
-	if (txmsg_redir)
-		strncat(options, "redir,", OPTSTRING);
-	if (txmsg_redir_noisy)
-		strncat(options, "redir_noisy,", OPTSTRING);
-	if (txmsg_drop)
-		strncat(options, "drop,", OPTSTRING);
-	if (txmsg_apply)
-		strncat(options, "apply,", OPTSTRING);
-	if (txmsg_cork)
-		strncat(options, "cork,", OPTSTRING);
-	if (txmsg_start)
-		strncat(options, "start,", OPTSTRING);
-	if (txmsg_end)
-		strncat(options, "end,", OPTSTRING);
-	if (txmsg_ingress)
-		strncat(options, "ingress,", OPTSTRING);
-	if (txmsg_skb)
-		strncat(options, "skb,", OPTSTRING);
-}
-
-static int __test_exec(int cgrp, int test, struct sockmap_options *opt)
-{
-	char *options = calloc(60, sizeof(char));
-	int err;
-
-	if (test == SENDPAGE)
-		opt->sendpage = true;
-	else
-		opt->sendpage = false;
-
-	if (txmsg_drop)
-		opt->drop_expected = true;
-	else
-		opt->drop_expected = false;
-
-	test_options(options);
-
-	fprintf(stdout,
-		"[TEST %i]: (%i, %i, %i, %s, %s): ",
-		test_cnt, opt->rate, opt->iov_count, opt->iov_length,
-		test_to_str(test), options);
-	fflush(stdout);
-	err = run_options(opt, cgrp, test);
-	fprintf(stdout, "%s\n", !err ? "PASS" : "FAILED");
-	test_cnt++;
-	free(options);
-	return err;
-}
-
-static int test_exec(int cgrp, struct sockmap_options *opt)
-{
-	int err = __test_exec(cgrp, SENDMSG, opt);
-
-	sched_yield();
-	if (err)
-		goto out;
-
-	err = __test_exec(cgrp, SENDPAGE, opt);
-	sched_yield();
-out:
-	return err;
-}
-
-static int test_loop(int cgrp)
-{
-	struct sockmap_options opt;
-
-	int err, i, l, r;
-
-	opt.verbose = 0;
-	opt.base = false;
-	opt.sendpage = false;
-	opt.data_test = false;
-	opt.drop_expected = false;
-	opt.iov_count = 0;
-	opt.iov_length = 0;
-	opt.rate = 0;
-
-	for (r = 1; r < 100; r += 33) {
-		for (i = 1; i < 100; i += 33) {
-			for (l = 1; l < 100; l += 33) {
-				opt.rate = r;
-				opt.iov_count = i;
-				opt.iov_length = l;
-				err = test_exec(cgrp, &opt);
-				if (err)
-					goto out;
-			}
-		}
-	}
-
-out:
-	return err;
-}
-
-static int test_txmsg(int cgrp)
-{
-	int err;
-
-	txmsg_pass = txmsg_noisy = txmsg_redir_noisy = txmsg_drop = 0;
-	txmsg_apply = txmsg_cork = 0;
-	txmsg_ingress = txmsg_skb = 0;
-
-	txmsg_pass = 1;
-	err = test_loop(cgrp);
-	txmsg_pass = 0;
-	if (err)
-		goto out;
-
-	txmsg_redir = 1;
-	err = test_loop(cgrp);
-	txmsg_redir = 0;
-	if (err)
-		goto out;
-
-	txmsg_drop = 1;
-	err = test_loop(cgrp);
-	txmsg_drop = 0;
-	if (err)
-		goto out;
-
-	txmsg_redir = 1;
-	txmsg_ingress = 1;
-	err = test_loop(cgrp);
-	txmsg_redir = 0;
-	txmsg_ingress = 0;
-	if (err)
-		goto out;
-out:
-	txmsg_pass = 0;
-	txmsg_redir = 0;
-	txmsg_drop = 0;
-	return err;
-}
-
-static int test_send(struct sockmap_options *opt, int cgrp)
-{
-	int err;
-
-	opt->iov_length = 1;
-	opt->iov_count = 1;
-	opt->rate = 1;
-	err = test_exec(cgrp, opt);
-	if (err)
-		goto out;
-
-	opt->iov_length = 1;
-	opt->iov_count = 1024;
-	opt->rate = 1;
-	err = test_exec(cgrp, opt);
-	if (err)
-		goto out;
-
-	opt->iov_length = 1024;
-	opt->iov_count = 1;
-	opt->rate = 1;
-	err = test_exec(cgrp, opt);
-	if (err)
-		goto out;
-
-	opt->iov_length = 1;
-	opt->iov_count = 1;
-	opt->rate = 1024;
-	err = test_exec(cgrp, opt);
-	if (err)
-		goto out;
-
-	opt->iov_length = 256;
-	opt->iov_count = 1024;
-	opt->rate = 10;
-	err = test_exec(cgrp, opt);
-	if (err)
-		goto out;
-
-	opt->rate = 100;
-	opt->iov_count = 1;
-	opt->iov_length = 5;
-	err = test_exec(cgrp, opt);
-	if (err)
-		goto out;
-out:
-	return err;
-}
-
-static int test_mixed(int cgrp)
-{
-	struct sockmap_options opt = {0};
-	int err;
-
-	txmsg_pass = txmsg_noisy = txmsg_redir_noisy = txmsg_drop = 0;
-	txmsg_apply = txmsg_cork = 0;
-	txmsg_start = txmsg_end = 0;
-	/* Test small and large iov_count values with pass/redir/apply/cork */
-	txmsg_pass = 1;
-	txmsg_redir = 0;
-	txmsg_apply = 1;
-	txmsg_cork = 0;
-	err = test_send(&opt, cgrp);
-	if (err)
-		goto out;
-
-	txmsg_pass = 1;
-	txmsg_redir = 0;
-	txmsg_apply = 0;
-	txmsg_cork = 1;
-	err = test_send(&opt, cgrp);
-	if (err)
-		goto out;
-
-	txmsg_pass = 1;
-	txmsg_redir = 0;
-	txmsg_apply = 1;
-	txmsg_cork = 1;
-	err = test_send(&opt, cgrp);
-	if (err)
-		goto out;
-
-	txmsg_pass = 1;
-	txmsg_redir = 0;
-	txmsg_apply = 1024;
-	txmsg_cork = 0;
-	err = test_send(&opt, cgrp);
-	if (err)
-		goto out;
-
-	txmsg_pass = 1;
-	txmsg_redir = 0;
-	txmsg_apply = 0;
-	txmsg_cork = 1024;
-	err = test_send(&opt, cgrp);
-	if (err)
-		goto out;
-
-	txmsg_pass = 1;
-	txmsg_redir = 0;
-	txmsg_apply = 1024;
-	txmsg_cork = 1024;
-	err = test_send(&opt, cgrp);
-	if (err)
-		goto out;
-
-	txmsg_pass = 1;
-	txmsg_redir = 0;
-	txmsg_cork = 4096;
-	txmsg_apply = 4096;
-	err = test_send(&opt, cgrp);
-	if (err)
-		goto out;
-
-	txmsg_pass = 0;
-	txmsg_redir = 1;
-	txmsg_apply = 1;
-	txmsg_cork = 0;
-	err = test_send(&opt, cgrp);
-	if (err)
-		goto out;
-
-	txmsg_pass = 0;
-	txmsg_redir = 1;
-	txmsg_apply = 0;
-	txmsg_cork = 1;
-	err = test_send(&opt, cgrp);
-	if (err)
-		goto out;
-
-	txmsg_pass = 0;
-	txmsg_redir = 1;
-	txmsg_apply = 1024;
-	txmsg_cork = 0;
-	err = test_send(&opt, cgrp);
-	if (err)
-		goto out;
-
-	txmsg_pass = 0;
-	txmsg_redir = 1;
-	txmsg_apply = 0;
-	txmsg_cork = 1024;
-	err = test_send(&opt, cgrp);
-	if (err)
-		goto out;
-
-	txmsg_pass = 0;
-	txmsg_redir = 1;
-	txmsg_apply = 1024;
-	txmsg_cork = 1024;
-	err = test_send(&opt, cgrp);
-	if (err)
-		goto out;
-
-	txmsg_pass = 0;
-	txmsg_redir = 1;
-	txmsg_cork = 4096;
-	txmsg_apply = 4096;
-	err = test_send(&opt, cgrp);
-	if (err)
-		goto out;
-out:
-	return err;
-}
-
-static int test_start_end(int cgrp)
-{
-	struct sockmap_options opt = {0};
-	int err, i;
-
-	/* Test basic start/end with lots of iov_count and iov_lengths */
-	txmsg_start = 1;
-	txmsg_end = 2;
-	err = test_txmsg(cgrp);
-	if (err)
-		goto out;
-
-	/* Test start/end with cork */
-	opt.rate = 16;
-	opt.iov_count = 1;
-	opt.iov_length = 100;
-	txmsg_cork = 1600;
-
-	for (i = 99; i <= 1600; i += 100) {
-		txmsg_start = 0;
-		txmsg_end = i;
-		err = test_exec(cgrp, &opt);
-		if (err)
-			goto out;
-	}
-
-	/* Test start/end with cork but pull data in middle */
-	for (i = 199; i <= 1600; i += 100) {
-		txmsg_start = 100;
-		txmsg_end = i;
-		err = test_exec(cgrp, &opt);
-		if (err)
-			goto out;
-	}
-
-	/* Test start/end with cork pulling last sg entry */
-	txmsg_start = 1500;
-	txmsg_end = 1600;
-	err = test_exec(cgrp, &opt);
-	if (err)
-		goto out;
-
-	/* Test start/end pull of single byte in last page */
-	txmsg_start = 1111;
-	txmsg_end = 1112;
-	err = test_exec(cgrp, &opt);
-	if (err)
-		goto out;
-
-	/* Test start/end with end < start */
-	txmsg_start = 1111;
-	txmsg_end = 0;
-	err = test_exec(cgrp, &opt);
-	if (err)
-		goto out;
-
-	/* Test start/end with end > data */
-	txmsg_start = 0;
-	txmsg_end = 1601;
-	err = test_exec(cgrp, &opt);
-	if (err)
-		goto out;
-
-	/* Test start/end with start > data */
-	txmsg_start = 1601;
-	txmsg_end = 1600;
-	err = test_exec(cgrp, &opt);
-
-out:
-	txmsg_start = 0;
-	txmsg_end = 0;
-	return err;
-}
-
-static int test_suite(void)
-{
-	char *bpf_file = BPF_FILENAME;
-	int cg_fd, err;
-
-	cg_fd = open(BPF_CGRP, O_DIRECTORY, O_RDONLY);
-	if (cg_fd < 0) {
-		fprintf(stderr,
-			"ERROR: (%i) open cg path failed: %s\n",
-			cg_fd, optarg);
-		return cg_fd;
-	}
-
-	if (load_bpf_file(bpf_file)) {
-		fprintf(stderr, "load_bpf_file: (%s) %s\n",
-			bpf_file, strerror(errno));
-		return 1;
-	}
-
-	/* Tests basic commands and APIs with range of iov values */
-	txmsg_start = txmsg_end = 0;
-	err = test_txmsg(cg_fd);
-	if (err)
-		goto out;
-
-	/* Tests interesting combinations of APIs used together */
-	err = test_mixed(cg_fd);
-	if (err)
-		goto out;
-
-	/* Tests pull_data API using start/end API */
-	err = test_start_end(cg_fd);
-	if (err)
-		goto out;
-
-out:
-	close(cg_fd);
-	return err;
-}
-
-int main(int argc, char **argv)
-{
-	struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
-	int iov_count = 1, length = 1024, rate = 1;
-	struct sockmap_options options = {0};
-	int opt, longindex, err, cg_fd = 0;
-	char *bpf_file = BPF_FILENAME;
-	int test = PING_PONG;
-
-	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
-		perror("setrlimit(RLIMIT_MEMLOCK)");
-		return 1;
-	}
-
-	if (argc < 2)
-		return test_suite();
-
-	while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:",
-				  long_options, &longindex)) != -1) {
-		switch (opt) {
-		case 's':
-			txmsg_start = atoi(optarg);
-			break;
-		case 'e':
-			txmsg_end = atoi(optarg);
-			break;
-		case 'a':
-			txmsg_apply = atoi(optarg);
-			break;
-		case 'k':
-			txmsg_cork = atoi(optarg);
-			break;
-		case 'c':
-			cg_fd = open(optarg, O_DIRECTORY, O_RDONLY);
-			if (cg_fd < 0) {
-				fprintf(stderr,
-					"ERROR: (%i) open cg path failed: %s\n",
-					cg_fd, optarg);
-				return cg_fd;
-			}
-			break;
-		case 'r':
-			rate = atoi(optarg);
-			break;
-		case 'v':
-			options.verbose = 1;
-			break;
-		case 'i':
-			iov_count = atoi(optarg);
-			break;
-		case 'l':
-			length = atoi(optarg);
-			break;
-		case 'd':
-			options.data_test = true;
-			break;
-		case 't':
-			if (strcmp(optarg, "ping") == 0) {
-				test = PING_PONG;
-			} else if (strcmp(optarg, "sendmsg") == 0) {
-				test = SENDMSG;
-			} else if (strcmp(optarg, "base") == 0) {
-				test = BASE;
-			} else if (strcmp(optarg, "base_sendpage") == 0) {
-				test = BASE_SENDPAGE;
-			} else if (strcmp(optarg, "sendpage") == 0) {
-				test = SENDPAGE;
-			} else {
-				usage(argv);
-				return -1;
-			}
-			break;
-		case 0:
-			break;
-		case 'h':
-		default:
-			usage(argv);
-			return -1;
-		}
-	}
-
-	if (!cg_fd) {
-		fprintf(stderr, "%s requires cgroup option: --cgroup <path>\n",
-			argv[0]);
-		return -1;
-	}
-
-	if (load_bpf_file(bpf_file)) {
-		fprintf(stderr, "load_bpf_file: (%s) %s\n",
-			bpf_file, strerror(errno));
-		return 1;
-	}
-	running = 1;
-
-	/* catch SIGINT */
-	signal(SIGINT, running_handler);
-
-	options.iov_count = iov_count;
-	options.iov_length = length;
-	options.rate = rate;
-
-	err = run_options(&options, cg_fd, test);
-	close(cg_fd);
-	return err;
-}
-
-void running_handler(int a)
-{
-	running = 0;
-}
-- 
cgit v1.2.3-59-g8ed1b


From a18fda1a62c3313685ec222802d5b6743dbd560e Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Tue, 24 Apr 2018 16:28:18 -0700
Subject: bpf: reduce runtime of test_sockmap tests

When test_sockmap was running outside of selftests and was not being
run by build bots it was reasonable to spend significant amount of
time running various tests. The number of tests is high because many
different I/O iterators are run.

However, now that test_sockmap is part of selftests rather than
iterate through all I/O sides only test a minimal set of min/max
values along with a few "normal" I/O ops. Also remove the long
running tests. They can be run from other test frameworks on a regular
cadence.

This significanly reduces runtime of test_sockmap.

Before:

$ time sudo ./test_sockmap  > /dev/null

real    4m47.521s
user    0m0.370s
sys     0m3.131s

After:

$ time sudo ./test_sockmap  > /dev/null

real    0m0.514s
user    0m0.104s
sys     0m0.430s

The CLI is still available for users that want to test the long
running tests that do the larger send/recv tests.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_sockmap.c | 33 +++++++++++++++---------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 6d63a1cdc649..29c022d23f4e 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -344,8 +344,8 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 		if (err < 0)
 			perror("recv start time: ");
 		while (s->bytes_recvd < total_bytes) {
-			timeout.tv_sec = 1;
-			timeout.tv_usec = 0;
+			timeout.tv_sec = 0;
+			timeout.tv_usec = 10;
 
 			/* FD sets */
 			FD_ZERO(&w);
@@ -903,12 +903,10 @@ static int test_exec(int cgrp, struct sockmap_options *opt)
 {
 	int err = __test_exec(cgrp, SENDMSG, opt);
 
-	sched_yield();
 	if (err)
 		goto out;
 
 	err = __test_exec(cgrp, SENDPAGE, opt);
-	sched_yield();
 out:
 	return err;
 }
@@ -928,19 +926,18 @@ static int test_loop(int cgrp)
 	opt.iov_length = 0;
 	opt.rate = 0;
 
-	for (r = 1; r < 100; r += 33) {
-		for (i = 1; i < 100; i += 33) {
-			for (l = 1; l < 100; l += 33) {
-				opt.rate = r;
-				opt.iov_count = i;
-				opt.iov_length = l;
-				err = test_exec(cgrp, &opt);
-				if (err)
-					goto out;
-			}
+	r = 1;
+	for (i = 1; i < 100; i += 33) {
+		for (l = 1; l < 100; l += 33) {
+			opt.rate = r;
+			opt.iov_count = i;
+			opt.iov_length = l;
+			err = test_exec(cgrp, &opt);
+			if (err)
+				goto out;
 		}
 	}
-
+	sched_yield();
 out:
 	return err;
 }
@@ -1031,6 +1028,7 @@ static int test_send(struct sockmap_options *opt, int cgrp)
 	if (err)
 		goto out;
 out:
+	sched_yield();
 	return err;
 }
 
@@ -1168,7 +1166,7 @@ static int test_start_end(int cgrp)
 	opt.iov_length = 100;
 	txmsg_cork = 1600;
 
-	for (i = 99; i <= 1600; i += 100) {
+	for (i = 99; i <= 1600; i += 500) {
 		txmsg_start = 0;
 		txmsg_end = i;
 		err = test_exec(cgrp, &opt);
@@ -1177,7 +1175,7 @@ static int test_start_end(int cgrp)
 	}
 
 	/* Test start/end with cork but pull data in middle */
-	for (i = 199; i <= 1600; i += 100) {
+	for (i = 199; i <= 1600; i += 500) {
 		txmsg_start = 100;
 		txmsg_end = i;
 		err = test_exec(cgrp, &opt);
@@ -1221,6 +1219,7 @@ static int test_start_end(int cgrp)
 out:
 	txmsg_start = 0;
 	txmsg_end = 0;
+	sched_yield();
 	return err;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 9c9e53233c281ed6c00425b165c4dd7fd0881cd5 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 24 Apr 2018 21:22:36 -0700
Subject: nfp: bpf: remove double space

Whitespace cleanup - remove double space.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 29b4e5f8c102..9cc638718272 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -1400,7 +1400,7 @@ map_call_stack_common(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	if (!load_lm_ptr)
 		return 0;
 
-	emit_csr_wr(nfp_prog, stack_reg(nfp_prog),  NFP_CSR_ACT_LM_ADDR0);
+	emit_csr_wr(nfp_prog, stack_reg(nfp_prog), NFP_CSR_ACT_LM_ADDR0);
 	wrp_nops(nfp_prog, 3);
 
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From 6c59500c2dbfae0e3c90854ef443948d2889495d Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 24 Apr 2018 21:22:37 -0700
Subject: nfp: bpf: optimize add/sub of a negative constant

NFP instruction set can fit small immediates into the instruction.
Negative integers, however, will never fit because they will have
highest bit set.  If we swap the ALU op between ADD and SUB and
negate the constant we have a better chance of fitting small negative
integers into the instruction itself and saving one or two cycles.

immed[gprB_21, 0xfffffffc]
alu[gprA_4, gprA_4, +, gprB_21], gpr_wrboth
immed[gprB_21, 0xffffffff]
alu[gprA_5, gprA_5, +carry, gprB_21], gpr_wrboth

now becomes:

alu[gprA_4, gprA_4, -, 4], gpr_wrboth
alu[gprA_5, gprA_5, -carry, 0], gpr_wrboth

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 35 ++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 9cc638718272..a5590988fc69 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -2777,6 +2777,40 @@ static void nfp_bpf_opt_reg_init(struct nfp_prog *nfp_prog)
 	}
 }
 
+/* abs(insn.imm) will fit better into unrestricted reg immediate -
+ * convert add/sub of a negative number into a sub/add of a positive one.
+ */
+static void nfp_bpf_opt_neg_add_sub(struct nfp_prog *nfp_prog)
+{
+	struct nfp_insn_meta *meta;
+
+	list_for_each_entry(meta, &nfp_prog->insns, l) {
+		struct bpf_insn insn = meta->insn;
+
+		if (meta->skip)
+			continue;
+
+		if (BPF_CLASS(insn.code) != BPF_ALU &&
+		    BPF_CLASS(insn.code) != BPF_ALU64)
+			continue;
+		if (BPF_SRC(insn.code) != BPF_K)
+			continue;
+		if (insn.imm >= 0)
+			continue;
+
+		if (BPF_OP(insn.code) == BPF_ADD)
+			insn.code = BPF_CLASS(insn.code) | BPF_SUB;
+		else if (BPF_OP(insn.code) == BPF_SUB)
+			insn.code = BPF_CLASS(insn.code) | BPF_ADD;
+		else
+			continue;
+
+		meta->insn.code = insn.code | BPF_K;
+
+		meta->insn.imm = -insn.imm;
+	}
+}
+
 /* Remove masking after load since our load guarantees this is not needed */
 static void nfp_bpf_opt_ld_mask(struct nfp_prog *nfp_prog)
 {
@@ -3212,6 +3246,7 @@ static int nfp_bpf_optimize(struct nfp_prog *nfp_prog)
 {
 	nfp_bpf_opt_reg_init(nfp_prog);
 
+	nfp_bpf_opt_neg_add_sub(nfp_prog);
 	nfp_bpf_opt_ld_mask(nfp_prog);
 	nfp_bpf_opt_ld_shift(nfp_prog);
 	nfp_bpf_opt_ldst_gather(nfp_prog);
-- 
cgit v1.2.3-59-g8ed1b


From 61dd8f0007799e88d35624f63e24e98a978df9d9 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 24 Apr 2018 21:22:38 -0700
Subject: nfp: bpf: tabularize generations of compare operations

There are quite a few compare instructions now, use a table
to translate BPF instruction code to NFP instruction parameters
instead of parameterizing helpers.  This saves LOC and makes
future extensions easier.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 168 ++++++++++-----------------
 1 file changed, 61 insertions(+), 107 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index a5590988fc69..5b8da7a67df4 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -1214,45 +1214,79 @@ wrp_test_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 	return 0;
 }
 
-static int
-wrp_cmp_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
-	    enum br_mask br_mask, bool swap)
+static const struct jmp_code_map {
+	enum br_mask br_mask;
+	bool swap;
+} jmp_code_map[] = {
+	[BPF_JGT >> 4]	= { BR_BLO, true },
+	[BPF_JGE >> 4]	= { BR_BHS, false },
+	[BPF_JLT >> 4]	= { BR_BLO, false },
+	[BPF_JLE >> 4]	= { BR_BHS, true },
+	[BPF_JSGT >> 4]	= { BR_BLT, true },
+	[BPF_JSGE >> 4]	= { BR_BGE, false },
+	[BPF_JSLT >> 4]	= { BR_BLT, false },
+	[BPF_JSLE >> 4]	= { BR_BGE, true },
+};
+
+static const struct jmp_code_map *nfp_jmp_code_get(struct nfp_insn_meta *meta)
+{
+	unsigned int op;
+
+	op = BPF_OP(meta->insn.code) >> 4;
+	/* br_mask of 0 is BR_BEQ which we don't use in jump code table */
+	if (WARN_ONCE(op >= ARRAY_SIZE(jmp_code_map) ||
+		      !jmp_code_map[op].br_mask,
+		      "no code found for jump instruction"))
+		return NULL;
+
+	return &jmp_code_map[op];
+}
+
+static int cmp_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	const struct bpf_insn *insn = &meta->insn;
 	u64 imm = insn->imm; /* sign extend */
+	const struct jmp_code_map *code;
 	u8 reg = insn->dst_reg * 2;
 	swreg tmp_reg;
 
+	code = nfp_jmp_code_get(meta);
+	if (!code)
+		return -EINVAL;
+
 	tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
-	if (!swap)
+	if (!code->swap)
 		emit_alu(nfp_prog, reg_none(), reg_a(reg), ALU_OP_SUB, tmp_reg);
 	else
 		emit_alu(nfp_prog, reg_none(), tmp_reg, ALU_OP_SUB, reg_a(reg));
 
 	tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
-	if (!swap)
+	if (!code->swap)
 		emit_alu(nfp_prog, reg_none(),
 			 reg_a(reg + 1), ALU_OP_SUB_C, tmp_reg);
 	else
 		emit_alu(nfp_prog, reg_none(),
 			 tmp_reg, ALU_OP_SUB_C, reg_a(reg + 1));
 
-	emit_br(nfp_prog, br_mask, insn->off, 0);
+	emit_br(nfp_prog, code->br_mask, insn->off, 0);
 
 	return 0;
 }
 
-static int
-wrp_cmp_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
-	    enum br_mask br_mask, bool swap)
+static int cmp_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	const struct bpf_insn *insn = &meta->insn;
+	const struct jmp_code_map *code;
 	u8 areg, breg;
 
+	code = nfp_jmp_code_get(meta);
+	if (!code)
+		return -EINVAL;
+
 	areg = insn->dst_reg * 2;
 	breg = insn->src_reg * 2;
 
-	if (swap) {
+	if (code->swap) {
 		areg ^= breg;
 		breg ^= areg;
 		areg ^= breg;
@@ -1261,7 +1295,7 @@ wrp_cmp_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 	emit_alu(nfp_prog, reg_none(), reg_a(areg), ALU_OP_SUB, reg_b(breg));
 	emit_alu(nfp_prog, reg_none(),
 		 reg_a(areg + 1), ALU_OP_SUB_C, reg_b(breg + 1));
-	emit_br(nfp_prog, br_mask, insn->off, 0);
+	emit_br(nfp_prog, code->br_mask, insn->off, 0);
 
 	return 0;
 }
@@ -2283,46 +2317,6 @@ static int jeq_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	return 0;
 }
 
-static int jgt_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_imm(nfp_prog, meta, BR_BLO, true);
-}
-
-static int jge_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_imm(nfp_prog, meta, BR_BHS, false);
-}
-
-static int jlt_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_imm(nfp_prog, meta, BR_BLO, false);
-}
-
-static int jle_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_imm(nfp_prog, meta, BR_BHS, true);
-}
-
-static int jsgt_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_imm(nfp_prog, meta, BR_BLT, true);
-}
-
-static int jsge_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_imm(nfp_prog, meta, BR_BGE, false);
-}
-
-static int jslt_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_imm(nfp_prog, meta, BR_BLT, false);
-}
-
-static int jsle_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_imm(nfp_prog, meta, BR_BGE, true);
-}
-
 static int jset_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	const struct bpf_insn *insn = &meta->insn;
@@ -2392,46 +2386,6 @@ static int jeq_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	return 0;
 }
 
-static int jgt_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_reg(nfp_prog, meta, BR_BLO, true);
-}
-
-static int jge_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_reg(nfp_prog, meta, BR_BHS, false);
-}
-
-static int jlt_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_reg(nfp_prog, meta, BR_BLO, false);
-}
-
-static int jle_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_reg(nfp_prog, meta, BR_BHS, true);
-}
-
-static int jsgt_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_reg(nfp_prog, meta, BR_BLT, true);
-}
-
-static int jsge_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_reg(nfp_prog, meta, BR_BGE, false);
-}
-
-static int jslt_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_reg(nfp_prog, meta, BR_BLT, false);
-}
-
-static int jsle_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	return wrp_cmp_reg(nfp_prog, meta, BR_BGE, true);
-}
-
 static int jset_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	return wrp_test_reg(nfp_prog, meta, ALU_OP_AND, BR_BNE);
@@ -2520,25 +2474,25 @@ static const instr_cb_t instr_cb[256] = {
 	[BPF_ST | BPF_MEM | BPF_DW] =	mem_st8,
 	[BPF_JMP | BPF_JA | BPF_K] =	jump,
 	[BPF_JMP | BPF_JEQ | BPF_K] =	jeq_imm,
-	[BPF_JMP | BPF_JGT | BPF_K] =	jgt_imm,
-	[BPF_JMP | BPF_JGE | BPF_K] =	jge_imm,
-	[BPF_JMP | BPF_JLT | BPF_K] =	jlt_imm,
-	[BPF_JMP | BPF_JLE | BPF_K] =	jle_imm,
-	[BPF_JMP | BPF_JSGT | BPF_K] =  jsgt_imm,
-	[BPF_JMP | BPF_JSGE | BPF_K] =  jsge_imm,
-	[BPF_JMP | BPF_JSLT | BPF_K] =  jslt_imm,
-	[BPF_JMP | BPF_JSLE | BPF_K] =  jsle_imm,
+	[BPF_JMP | BPF_JGT | BPF_K] =	cmp_imm,
+	[BPF_JMP | BPF_JGE | BPF_K] =	cmp_imm,
+	[BPF_JMP | BPF_JLT | BPF_K] =	cmp_imm,
+	[BPF_JMP | BPF_JLE | BPF_K] =	cmp_imm,
+	[BPF_JMP | BPF_JSGT | BPF_K] =  cmp_imm,
+	[BPF_JMP | BPF_JSGE | BPF_K] =  cmp_imm,
+	[BPF_JMP | BPF_JSLT | BPF_K] =  cmp_imm,
+	[BPF_JMP | BPF_JSLE | BPF_K] =  cmp_imm,
 	[BPF_JMP | BPF_JSET | BPF_K] =	jset_imm,
 	[BPF_JMP | BPF_JNE | BPF_K] =	jne_imm,
 	[BPF_JMP | BPF_JEQ | BPF_X] =	jeq_reg,
-	[BPF_JMP | BPF_JGT | BPF_X] =	jgt_reg,
-	[BPF_JMP | BPF_JGE | BPF_X] =	jge_reg,
-	[BPF_JMP | BPF_JLT | BPF_X] =	jlt_reg,
-	[BPF_JMP | BPF_JLE | BPF_X] =	jle_reg,
-	[BPF_JMP | BPF_JSGT | BPF_X] =  jsgt_reg,
-	[BPF_JMP | BPF_JSGE | BPF_X] =  jsge_reg,
-	[BPF_JMP | BPF_JSLT | BPF_X] =  jslt_reg,
-	[BPF_JMP | BPF_JSLE | BPF_X] =  jsle_reg,
+	[BPF_JMP | BPF_JGT | BPF_X] =	cmp_reg,
+	[BPF_JMP | BPF_JGE | BPF_X] =	cmp_reg,
+	[BPF_JMP | BPF_JLT | BPF_X] =	cmp_reg,
+	[BPF_JMP | BPF_JLE | BPF_X] =	cmp_reg,
+	[BPF_JMP | BPF_JSGT | BPF_X] =  cmp_reg,
+	[BPF_JMP | BPF_JSGE | BPF_X] =  cmp_reg,
+	[BPF_JMP | BPF_JSLT | BPF_X] =  cmp_reg,
+	[BPF_JMP | BPF_JSLE | BPF_X] =  cmp_reg,
 	[BPF_JMP | BPF_JSET | BPF_X] =	jset_reg,
 	[BPF_JMP | BPF_JNE | BPF_X] =	jne_reg,
 	[BPF_JMP | BPF_CALL] =		call,
-- 
cgit v1.2.3-59-g8ed1b


From 7bdc97be9075c074be0f0aa9c59a8d2238224743 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 24 Apr 2018 21:22:39 -0700
Subject: nfp: bpf: optimize comparisons to negative constants

Comparison instruction requires a subtraction.  If the constant
is negative we are more likely to fit it into a NFP instruction
directly if we change the sign and use addition.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c  | 42 +++++++++++++++++++--------
 drivers/net/ethernet/netronome/nfp/bpf/main.h |  6 +++-
 2 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 5b8da7a67df4..65f0791cae0c 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -1247,6 +1247,7 @@ static int cmp_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	const struct bpf_insn *insn = &meta->insn;
 	u64 imm = insn->imm; /* sign extend */
 	const struct jmp_code_map *code;
+	enum alu_op alu_op, carry_op;
 	u8 reg = insn->dst_reg * 2;
 	swreg tmp_reg;
 
@@ -1254,19 +1255,22 @@ static int cmp_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	if (!code)
 		return -EINVAL;
 
+	alu_op = meta->jump_neg_op ? ALU_OP_ADD : ALU_OP_SUB;
+	carry_op = meta->jump_neg_op ? ALU_OP_ADD_C : ALU_OP_SUB_C;
+
 	tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
 	if (!code->swap)
-		emit_alu(nfp_prog, reg_none(), reg_a(reg), ALU_OP_SUB, tmp_reg);
+		emit_alu(nfp_prog, reg_none(), reg_a(reg), alu_op, tmp_reg);
 	else
-		emit_alu(nfp_prog, reg_none(), tmp_reg, ALU_OP_SUB, reg_a(reg));
+		emit_alu(nfp_prog, reg_none(), tmp_reg, alu_op, reg_a(reg));
 
 	tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
 	if (!code->swap)
 		emit_alu(nfp_prog, reg_none(),
-			 reg_a(reg + 1), ALU_OP_SUB_C, tmp_reg);
+			 reg_a(reg + 1), carry_op, tmp_reg);
 	else
 		emit_alu(nfp_prog, reg_none(),
-			 tmp_reg, ALU_OP_SUB_C, reg_a(reg + 1));
+			 tmp_reg, carry_op, reg_a(reg + 1));
 
 	emit_br(nfp_prog, code->br_mask, insn->off, 0);
 
@@ -2745,21 +2749,35 @@ static void nfp_bpf_opt_neg_add_sub(struct nfp_prog *nfp_prog)
 			continue;
 
 		if (BPF_CLASS(insn.code) != BPF_ALU &&
-		    BPF_CLASS(insn.code) != BPF_ALU64)
+		    BPF_CLASS(insn.code) != BPF_ALU64 &&
+		    BPF_CLASS(insn.code) != BPF_JMP)
 			continue;
 		if (BPF_SRC(insn.code) != BPF_K)
 			continue;
 		if (insn.imm >= 0)
 			continue;
 
-		if (BPF_OP(insn.code) == BPF_ADD)
-			insn.code = BPF_CLASS(insn.code) | BPF_SUB;
-		else if (BPF_OP(insn.code) == BPF_SUB)
-			insn.code = BPF_CLASS(insn.code) | BPF_ADD;
-		else
-			continue;
+		if (BPF_CLASS(insn.code) == BPF_JMP) {
+			switch (BPF_OP(insn.code)) {
+			case BPF_JGE:
+			case BPF_JSGE:
+			case BPF_JLT:
+			case BPF_JSLT:
+				meta->jump_neg_op = true;
+				break;
+			default:
+				continue;
+			}
+		} else {
+			if (BPF_OP(insn.code) == BPF_ADD)
+				insn.code = BPF_CLASS(insn.code) | BPF_SUB;
+			else if (BPF_OP(insn.code) == BPF_SUB)
+				insn.code = BPF_CLASS(insn.code) | BPF_ADD;
+			else
+				continue;
 
-		meta->insn.code = insn.code | BPF_K;
+			meta->insn.code = insn.code | BPF_K;
+		}
 
 		meta->insn.imm = -insn.imm;
 	}
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 4981c8944ca3..68b5d326483d 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -236,6 +236,7 @@ struct nfp_bpf_reg_state {
  * @xadd_over_16bit: 16bit immediate is not guaranteed
  * @xadd_maybe_16bit: 16bit immediate is possible
  * @jmp_dst: destination info for jump instructions
+ * @jump_neg_op: jump instruction has inverted immediate, use ADD instead of SUB
  * @func_id: function id for call instructions
  * @arg1: arg1 for call instructions
  * @arg2: arg2 for call instructions
@@ -264,7 +265,10 @@ struct nfp_insn_meta {
 			bool xadd_maybe_16bit;
 		};
 		/* jump */
-		struct nfp_insn_meta *jmp_dst;
+		struct {
+			struct nfp_insn_meta *jmp_dst;
+			bool jump_neg_op;
+		};
 		/* function calls */
 		struct {
 			u32 func_id;
-- 
cgit v1.2.3-59-g8ed1b


From d6c38be09af0c2805cb858e3a20a2b29b2c04735 Mon Sep 17 00:00:00 2001
From: Xinming Hu <huxm@marvell.com>
Date: Mon, 2 Apr 2018 20:31:22 +0800
Subject: mwifiex: uap: filter duplicate ERP IE

Firmware parse and attach ERP IE from bss configuration,
do not set again from tail IE.

Signed-off-by: Xinming Hu <huxm@marvell.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/marvell/mwifiex/ie.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wireless/marvell/mwifiex/ie.c b/drivers/net/wireless/marvell/mwifiex/ie.c
index 922e3d69fd84..b10baacb51c9 100644
--- a/drivers/net/wireless/marvell/mwifiex/ie.c
+++ b/drivers/net/wireless/marvell/mwifiex/ie.c
@@ -349,6 +349,7 @@ static int mwifiex_uap_parse_tail_ies(struct mwifiex_private *priv,
 		case WLAN_EID_SUPP_RATES:
 		case WLAN_EID_COUNTRY:
 		case WLAN_EID_PWR_CONSTRAINT:
+		case WLAN_EID_ERP_INFO:
 		case WLAN_EID_EXT_SUPP_RATES:
 		case WLAN_EID_HT_CAPABILITY:
 		case WLAN_EID_HT_OPERATION:
-- 
cgit v1.2.3-59-g8ed1b


From c1003538bf6b28a4bae8e511672fa5abaabe683c Mon Sep 17 00:00:00 2001
From: Xinming Hu <huxm@marvell.com>
Date: Mon, 2 Apr 2018 20:31:23 +0800
Subject: mwifiex: uap: support cfg80211 ignore_broadcast_ssid=2

Firmware already support hidden ssid and keep ssid length,
Open the capability in driver.

Signed-off-by: Xinming Hu <huxm@marvell.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/marvell/mwifiex/cfg80211.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
index 7f7e9de2db1c..4857b75e54a7 100644
--- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c
+++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
@@ -1979,7 +1979,8 @@ static int mwifiex_cfg80211_start_ap(struct wiphy *wiphy,
 		bss_cfg->bcast_ssid_ctl = 0;
 		break;
 	case NL80211_HIDDEN_SSID_ZERO_CONTENTS:
-		/* firmware doesn't support this type of hidden SSID */
+		bss_cfg->bcast_ssid_ctl = 2;
+		break;
 	default:
 		kfree(bss_cfg);
 		return -EINVAL;
-- 
cgit v1.2.3-59-g8ed1b


From 01eca2842874b9a85b7cd1e1b0e5b34a5d53a21f Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 5 Apr 2018 14:17:19 +0300
Subject: mwifiex: pcie: tighten a check in mwifiex_pcie_process_event_ready()

If "evt_len" is 1 then we try to memcpy() negative 3 bytes and it would
cause memory corruption.

Fixes: d930faee141b ("mwifiex: add support for Marvell pcie8766 chipset")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/marvell/mwifiex/pcie.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/marvell/mwifiex/pcie.c b/drivers/net/wireless/marvell/mwifiex/pcie.c
index 97a6199692ab..7538543d46fa 100644
--- a/drivers/net/wireless/marvell/mwifiex/pcie.c
+++ b/drivers/net/wireless/marvell/mwifiex/pcie.c
@@ -1881,7 +1881,8 @@ static int mwifiex_pcie_process_event_ready(struct mwifiex_adapter *adapter)
 		mwifiex_dbg(adapter, EVENT,
 			    "info: Event length: %d\n", evt_len);
 
-		if ((evt_len > 0) && (evt_len  < MAX_EVENT_SIZE))
+		if (evt_len > MWIFIEX_EVENT_HEADER_LEN &&
+		    evt_len < MAX_EVENT_SIZE)
 			memcpy(adapter->event_body, skb_cmd->data +
 			       MWIFIEX_EVENT_HEADER_LEN, evt_len -
 			       MWIFIEX_EVENT_HEADER_LEN);
-- 
cgit v1.2.3-59-g8ed1b


From 1f589e2510d5df1192dca7c089103a2cbd028101 Mon Sep 17 00:00:00 2001
From: Dan Haab <dhaab@luxul.com>
Date: Tue, 3 Apr 2018 10:21:56 +0200
Subject: brcmfmac: add support for BCM4366E chipset

BCM4366E is a wireless chipset with a BCM43664 ChipCommon. It's
supported by the same firmware as 4366c0.

Signed-off-by: Dan Haab <dan.haab@luxul.com>
[arend: rebase patch and remove unnecessary definition]
Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c       | 1 +
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c       | 1 +
 drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h | 1 +
 3 files changed, 3 insertions(+)

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c
index 3b829fed8631..927d62b3d41b 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c
@@ -689,6 +689,7 @@ static u32 brcmf_chip_tcm_rambase(struct brcmf_chip_priv *ci)
 	case BRCM_CC_43525_CHIP_ID:
 	case BRCM_CC_4365_CHIP_ID:
 	case BRCM_CC_4366_CHIP_ID:
+	case BRCM_CC_43664_CHIP_ID:
 		return 0x200000;
 	case CY_CC_4373_CHIP_ID:
 		return 0x160000;
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
index 091c191ce259..6e25ff94dd07 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
@@ -75,6 +75,7 @@ static struct brcmf_firmware_mapping brcmf_pcie_fwnames[] = {
 	BRCMF_FW_ENTRY(BRCM_CC_4365_CHIP_ID, 0xFFFFFFF0, 4365C),
 	BRCMF_FW_ENTRY(BRCM_CC_4366_CHIP_ID, 0x0000000F, 4366B),
 	BRCMF_FW_ENTRY(BRCM_CC_4366_CHIP_ID, 0xFFFFFFF0, 4366C),
+	BRCMF_FW_ENTRY(BRCM_CC_43664_CHIP_ID, 0xFFFFFFF0, 4366C),
 	BRCMF_FW_ENTRY(BRCM_CC_4371_CHIP_ID, 0xFFFFFFFF, 4371),
 };
 
diff --git a/drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h b/drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h
index 57544a3a3ce4..686f7a85a045 100644
--- a/drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h
+++ b/drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h
@@ -57,6 +57,7 @@
 #define BRCM_CC_43602_CHIP_ID		43602
 #define BRCM_CC_4365_CHIP_ID		0x4365
 #define BRCM_CC_4366_CHIP_ID		0x4366
+#define BRCM_CC_43664_CHIP_ID		43664
 #define BRCM_CC_4371_CHIP_ID		0x4371
 #define CY_CC_4373_CHIP_ID		0x4373
 
-- 
cgit v1.2.3-59-g8ed1b


From 863683cfbbfcd48ae88e2af880f52b91332f1f26 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Wed, 4 Apr 2018 19:09:44 -0500
Subject: brcmsmac: phy_lcn: remove duplicate code

Remove and refactor some code in order to avoid having identical code
for different branches.

Notice that this piece of code hasn't been modified since 2011.

Addresses-Coverity-ID: 1226756 ("Identical code for different branches")
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_lcn.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_lcn.c b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_lcn.c
index 93d4cde0eb31..9d830d27b229 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_lcn.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_lcn.c
@@ -3388,13 +3388,8 @@ void wlc_lcnphy_deaf_mode(struct brcms_phy *pi, bool mode)
 	u8 phybw40;
 	phybw40 = CHSPEC_IS40(pi->radio_chanspec);
 
-	if (LCNREV_LT(pi->pubpi.phy_rev, 2)) {
-		mod_phy_reg(pi, 0x4b0, (0x1 << 5), (mode) << 5);
-		mod_phy_reg(pi, 0x4b1, (0x1 << 9), 0 << 9);
-	} else {
-		mod_phy_reg(pi, 0x4b0, (0x1 << 5), (mode) << 5);
-		mod_phy_reg(pi, 0x4b1, (0x1 << 9), 0 << 9);
-	}
+	mod_phy_reg(pi, 0x4b0, (0x1 << 5), (mode) << 5);
+	mod_phy_reg(pi, 0x4b1, (0x1 << 9), 0 << 9);
 
 	if (phybw40 == 0) {
 		mod_phy_reg((pi), 0x410,
-- 
cgit v1.2.3-59-g8ed1b


From 9d81ecde4dce37cd9e147c24c9bc0cd1d4c259d5 Mon Sep 17 00:00:00 2001
From: Xose Vazquez Perez <xose.vazquez@gmail.com>
Date: Thu, 5 Apr 2018 14:22:32 +0200
Subject: rt2x00: rt2800: add antenna diversity for RT5370G

RT5370G has hardware RX antenna diversity like RT5390R.
Based on DPO_RT5572_LinuxSTA_2.6.1.3_20121022.tar.bz2 manufacturer driver:
https://d86o2zu8ugzlg.cloudfront.net/mediatek-craft/drivers/DPO_RT5572_LinuxSTA_2.6.1.3_20121022.tar.bz2

Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Helmut Schaa <helmut.schaa@googlemail.com>
Cc: Gertjan van Wingerde <gwingerde@gmail.com>
Cc: Jakub Kicinski <jakub.kicinski@netronome.com>
Cc: wireless newbie <wnewbie72@gmail.com>
Cc: Kalle Valo <kvalo@codeaurora.org>
Cc: linux wireless ml <linux-wireless@vger.kernel.org>
Signed-off-by: Xose Vazquez Perez <xose.vazquez@gmail.com>
Acked-by: Stanislaw Gruszka <sgruszka@redhat.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ralink/rt2x00/rt2800.h    | 1 +
 drivers/net/wireless/ralink/rt2x00/rt2800lib.c | 9 ++++++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800.h b/drivers/net/wireless/ralink/rt2x00/rt2800.h
index 6a8c93fb6a43..7a133e94b1bf 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2800.h
+++ b/drivers/net/wireless/ralink/rt2x00/rt2800.h
@@ -94,6 +94,7 @@
 #define REV_RT3390E			0x0211
 #define REV_RT3593E			0x0211
 #define REV_RT5390F			0x0502
+#define REV_RT5370G			0x0503
 #define REV_RT5390R			0x1502
 #define REV_RT5592C			0x0221
 
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800lib.c b/drivers/net/wireless/ralink/rt2x00/rt2800lib.c
index 429d07b651dd..e827dc522580 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2800lib.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2800lib.c
@@ -6220,8 +6220,9 @@ static void rt2800_init_bbp_53xx(struct rt2x00_dev *rt2x00dev)
 		rt2800_register_write(rt2x00dev, GPIO_CTRL, reg);
 	}
 
-	/* This chip has hardware antenna diversity*/
-	if (rt2x00_rt_rev_gte(rt2x00dev, RT5390, REV_RT5390R)) {
+	/* These chips have hardware RX antenna diversity */
+	if (rt2x00_rt_rev_gte(rt2x00dev, RT5390, REV_RT5390R) ||
+	    rt2x00_rt_rev_gte(rt2x00dev, RT5390, REV_RT5370G)) {
 		rt2800_bbp_write(rt2x00dev, 150, 0); /* Disable Antenna Software OFDM */
 		rt2800_bbp_write(rt2x00dev, 151, 0); /* Disable Antenna Software CCK */
 		rt2800_bbp_write(rt2x00dev, 154, 0); /* Clear previously selected antenna */
@@ -8748,7 +8749,9 @@ static int rt2800_init_eeprom(struct rt2x00_dev *rt2x00dev)
 		rt2x00dev->default_ant.rx = ANTENNA_A;
 	}
 
-	if (rt2x00_rt_rev_gte(rt2x00dev, RT5390, REV_RT5390R)) {
+	/* These chips have hardware RX antenna diversity */
+	if (rt2x00_rt_rev_gte(rt2x00dev, RT5390, REV_RT5390R) ||
+	    rt2x00_rt_rev_gte(rt2x00dev, RT5390, REV_RT5370G)) {
 		rt2x00dev->default_ant.tx = ANTENNA_HW_DIVERSITY; /* Unused */
 		rt2x00dev->default_ant.rx = ANTENNA_HW_DIVERSITY; /* Unused */
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 3763770044640caeb1101cdea40697cc0814403c Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Thu, 5 Apr 2018 10:49:49 -0500
Subject: qtnfmac: pearl: pcie: fix memory leak in qtnf_fw_work_handler

In case memory resources for fw were succesfully allocated, release
them before jumping to fw_load_fail.

Addresses-Coverity-ID: 1466092 ("Resource leak")
Fixes: c3b2f7ca4186 ("qtnfmac: implement asynchronous firmware loading")
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Reviewed-by: Sergey Matyukevich <sergey.matyukevich.os@quantenna.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/quantenna/qtnfmac/pearl/pcie.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/wireless/quantenna/qtnfmac/pearl/pcie.c b/drivers/net/wireless/quantenna/qtnfmac/pearl/pcie.c
index f117904d9120..6c1e139bb8f7 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/pearl/pcie.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/pearl/pcie.c
@@ -1185,6 +1185,10 @@ static void qtnf_fw_work_handler(struct work_struct *work)
 	if (qtnf_poll_state(&priv->bda->bda_ep_state, QTN_EP_FW_LOADRDY,
 			    QTN_FW_DL_TIMEOUT_MS)) {
 		pr_err("card is not ready\n");
+
+		if (!flashboot)
+			release_firmware(fw);
+
 		goto fw_load_fail;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From b056b83c06e1b01b7091ba81c5883038a0fc2f46 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Tue, 3 Apr 2018 17:15:58 -0400
Subject: ixgbe: Drop support for macvlan specific unicast lists

Drop the code for handling macvlan specific unicast lists. It isn't needed
since we don't take any efforts to maintain it when we bring the interface
up and it takes the slow path anyway.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 31 ---------------------------
 1 file changed, 31 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 51e7d82a5860..c8f59430a5a1 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -4900,36 +4900,6 @@ int ixgbe_del_mac_filter(struct ixgbe_adapter *adapter,
 	return -ENOMEM;
 }
 
-/**
- * ixgbe_write_uc_addr_list - write unicast addresses to RAR table
- * @netdev: network interface device structure
- * @vfn: pool to associate with unicast addresses
- *
- * Writes unicast address list to the RAR table.
- * Returns: -ENOMEM on failure/insufficient address space
- *                0 on no addresses written
- *                X on writing X addresses to the RAR table
- **/
-static int ixgbe_write_uc_addr_list(struct net_device *netdev, int vfn)
-{
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
-	int count = 0;
-
-	/* return ENOMEM indicating insufficient memory for addresses */
-	if (netdev_uc_count(netdev) > ixgbe_available_rars(adapter, vfn))
-		return -ENOMEM;
-
-	if (!netdev_uc_empty(netdev)) {
-		struct netdev_hw_addr *ha;
-		netdev_for_each_uc_addr(ha, netdev) {
-			ixgbe_del_mac_filter(adapter, ha->addr, vfn);
-			ixgbe_add_mac_filter(adapter, ha->addr, vfn);
-			count++;
-		}
-	}
-	return count;
-}
-
 static int ixgbe_uc_sync(struct net_device *netdev, const unsigned char *addr)
 {
 	struct ixgbe_adapter *adapter = netdev_priv(netdev);
@@ -5328,7 +5298,6 @@ static void ixgbe_macvlan_set_rx_mode(struct net_device *dev, unsigned int pool,
 		vmolr |= IXGBE_VMOLR_ROMPE;
 		hw->mac.ops.update_mc_addr_list(hw, dev);
 	}
-	ixgbe_write_uc_addr_list(adapter->netdev, pool);
 	IXGBE_WRITE_REG(hw, IXGBE_VMOLR(pool), vmolr);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 7d775f63470c3b6ddf34c770c973293ab925a7bb Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Tue, 3 Apr 2018 17:16:03 -0400
Subject: macvlan: Rename fwd_priv to accel_priv and add accessor function

This change renames the fwd_priv member to accel_priv as this more
accurately reflects the actual purpose of this value. In addition I am
adding an accessor which will allow us to further abstract this in the
future if needed.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 10 ++++-----
 drivers/net/macvlan.c                         | 31 ++++++++++++++++-----------
 include/linux/if_macvlan.h                    |  8 ++++++-
 3 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index c8f59430a5a1..0fbeb2f48711 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -5398,10 +5398,9 @@ static int ixgbe_fwd_ring_up(struct net_device *vdev,
 static int ixgbe_upper_dev_walk(struct net_device *upper, void *data)
 {
 	if (netif_is_macvlan(upper)) {
-		struct macvlan_dev *dfwd = netdev_priv(upper);
-		struct ixgbe_fwd_adapter *vadapter = dfwd->fwd_priv;
+		struct ixgbe_fwd_adapter *vadapter = macvlan_accel_priv(upper);
 
-		if (dfwd->fwd_priv)
+		if (vadapter)
 			ixgbe_fwd_ring_up(upper, vadapter);
 	}
 
@@ -8983,13 +8982,12 @@ struct upper_walk_data {
 static int get_macvlan_queue(struct net_device *upper, void *_data)
 {
 	if (netif_is_macvlan(upper)) {
-		struct macvlan_dev *dfwd = netdev_priv(upper);
-		struct ixgbe_fwd_adapter *vadapter = dfwd->fwd_priv;
+		struct ixgbe_fwd_adapter *vadapter = macvlan_accel_priv(upper);
 		struct upper_walk_data *data = _data;
 		struct ixgbe_adapter *adapter = data->adapter;
 		int ifindex = data->ifindex;
 
-		if (vadapter && vadapter->netdev->ifindex == ifindex) {
+		if (vadapter && upper->ifindex == ifindex) {
 			data->queue = adapter->rx_ring[vadapter->rx_base_queue]->reg_idx;
 			data->action = data->queue;
 			return 1;
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 725f4b4afc6d..7ddc94ff4109 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -559,9 +559,9 @@ static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
 	if (unlikely(netpoll_tx_running(dev)))
 		return macvlan_netpoll_send_skb(vlan, skb);
 
-	if (vlan->fwd_priv) {
+	if (vlan->accel_priv) {
 		skb->dev = vlan->lowerdev;
-		ret = dev_queue_xmit_accel(skb, vlan->fwd_priv);
+		ret = dev_queue_xmit_accel(skb, vlan->accel_priv);
 	} else {
 		ret = macvlan_queue_xmit(skb, dev);
 	}
@@ -613,16 +613,23 @@ static int macvlan_open(struct net_device *dev)
 		goto hash_add;
 	}
 
+	err = -EBUSY;
+	if (macvlan_addr_busy(vlan->port, dev->dev_addr))
+		goto out;
+
+	/* Attempt to populate accel_priv which is used to offload the L2
+	 * forwarding requests for unicast packets.
+	 */
 	if (lowerdev->features & NETIF_F_HW_L2FW_DOFFLOAD) {
-		vlan->fwd_priv =
+		vlan->accel_priv =
 		      lowerdev->netdev_ops->ndo_dfwd_add_station(lowerdev, dev);
 
 		/* If we get a NULL pointer back, or if we get an error
 		 * then we should just fall through to the non accelerated path
 		 */
-		if (IS_ERR_OR_NULL(vlan->fwd_priv)) {
-			vlan->fwd_priv = NULL;
-		} else
+		if (IS_ERR_OR_NULL(vlan->accel_priv))
+			vlan->accel_priv = NULL;
+		else
 			return 0;
 	}
 
@@ -655,10 +662,10 @@ clear_multi:
 del_unicast:
 	dev_uc_del(lowerdev, dev->dev_addr);
 out:
-	if (vlan->fwd_priv) {
+	if (vlan->accel_priv) {
 		lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev,
-							   vlan->fwd_priv);
-		vlan->fwd_priv = NULL;
+							   vlan->accel_priv);
+		vlan->accel_priv = NULL;
 	}
 	return err;
 }
@@ -668,10 +675,10 @@ static int macvlan_stop(struct net_device *dev)
 	struct macvlan_dev *vlan = netdev_priv(dev);
 	struct net_device *lowerdev = vlan->lowerdev;
 
-	if (vlan->fwd_priv) {
+	if (vlan->accel_priv) {
 		lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev,
-							   vlan->fwd_priv);
-		vlan->fwd_priv = NULL;
+							   vlan->accel_priv);
+		vlan->accel_priv = NULL;
 		return 0;
 	}
 
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index 4cb7aeeafce0..c5106ce8273a 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -21,7 +21,7 @@ struct macvlan_dev {
 	struct hlist_node	hlist;
 	struct macvlan_port	*port;
 	struct net_device	*lowerdev;
-	void			*fwd_priv;
+	void			*accel_priv;
 	struct vlan_pcpu_stats __percpu *pcpu_stats;
 
 	DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ);
@@ -86,4 +86,10 @@ macvlan_dev_real_dev(const struct net_device *dev)
 }
 #endif
 
+static inline void *macvlan_accel_priv(struct net_device *dev)
+{
+	struct macvlan_dev *macvlan = netdev_priv(dev);
+
+	return macvlan->accel_priv;
+}
 #endif /* _LINUX_IF_MACVLAN_H */
-- 
cgit v1.2.3-59-g8ed1b


From 81d4e91cd599ed7fd378ca5463d6d9b05214b8b2 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Tue, 3 Apr 2018 17:16:09 -0400
Subject: macvlan: Use software path for offloaded local, broadcast, and
 multicast traffic

This change makes it so that we use a software path for packets that are
going to be locally switched between two macvlan interfaces on the same
device. In addition we resort to software replication of broadcast and
multicast packets instead of offloading that to hardware.

The general idea is that using the device for east/west traffic local to
the system is extremely inefficient. We can only support up to whatever the
PCIe limit is for any given device so this caps us at somewhere around 20G
for devices supported by ixgbe. This is compounded even further when you
take broadcast and multicast into account as a single 10G port can come to
a crawl as a packet is replicated up to 60+ times in some cases. In order
to get away from that I am implementing changes so that we handle
broadcast/multicast replication and east/west local traffic all in
software.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k_netdev.c |  4 +--
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c   | 39 ++++++--------------
 drivers/net/macvlan.c                           | 47 +++++++++++--------------
 3 files changed, 33 insertions(+), 57 deletions(-)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
index 45793491d4ba..ee645bacfe93 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
@@ -1254,7 +1254,7 @@ void fm10k_restore_rx_state(struct fm10k_intfc *interface)
 			glort = l2_accel->dglort + 1 + i;
 
 			hw->mac.ops.update_xcast_mode(hw, glort,
-						      FM10K_XCAST_MODE_MULTI);
+						      FM10K_XCAST_MODE_NONE);
 			fm10k_queue_mac_request(interface, glort,
 						sdev->dev_addr,
 						hw->mac.default_vid, true);
@@ -1515,7 +1515,7 @@ static void *fm10k_dfwd_add_station(struct net_device *dev,
 
 	if (fm10k_host_mbx_ready(interface)) {
 		hw->mac.ops.update_xcast_mode(hw, glort,
-					      FM10K_XCAST_MODE_MULTI);
+					      FM10K_XCAST_MODE_NONE);
 		fm10k_queue_mac_request(interface, glort, sdev->dev_addr,
 					hw->mac.default_vid, true);
 	}
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 0fbeb2f48711..4d4fa46a1e7c 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -4219,7 +4219,8 @@ static void ixgbe_setup_psrtype(struct ixgbe_adapter *adapter)
 static void ixgbe_configure_virtualization(struct ixgbe_adapter *adapter)
 {
 	struct ixgbe_hw *hw = &adapter->hw;
-	u32 reg_offset, vf_shift;
+	u16 pool = adapter->num_rx_pools;
+	u32 reg_offset, vf_shift, vmolr;
 	u32 gcr_ext, vmdctl;
 	int i;
 
@@ -4233,6 +4234,13 @@ static void ixgbe_configure_virtualization(struct ixgbe_adapter *adapter)
 	vmdctl |= IXGBE_VT_CTL_REPLEN;
 	IXGBE_WRITE_REG(hw, IXGBE_VT_CTL, vmdctl);
 
+	/* accept untagged packets until a vlan tag is
+	 * specifically set for the VMDQ queue/pool
+	 */
+	vmolr = IXGBE_VMOLR_AUPE;
+	while (pool--)
+		IXGBE_WRITE_REG(hw, IXGBE_VMOLR(VMDQ_P(pool)), vmolr);
+
 	vf_shift = VMDQ_P(0) % 32;
 	reg_offset = (VMDQ_P(0) >= 32) ? 1 : 0;
 
@@ -5279,28 +5287,6 @@ static void ixgbe_fdir_filter_restore(struct ixgbe_adapter *adapter)
 	spin_unlock(&adapter->fdir_perfect_lock);
 }
 
-static void ixgbe_macvlan_set_rx_mode(struct net_device *dev, unsigned int pool,
-				      struct ixgbe_adapter *adapter)
-{
-	struct ixgbe_hw *hw = &adapter->hw;
-	u32 vmolr;
-
-	/* No unicast promiscuous support for VMDQ devices. */
-	vmolr = IXGBE_READ_REG(hw, IXGBE_VMOLR(pool));
-	vmolr |= (IXGBE_VMOLR_ROMPE | IXGBE_VMOLR_BAM | IXGBE_VMOLR_AUPE);
-
-	/* clear the affected bit */
-	vmolr &= ~IXGBE_VMOLR_MPE;
-
-	if (dev->flags & IFF_ALLMULTI) {
-		vmolr |= IXGBE_VMOLR_MPE;
-	} else {
-		vmolr |= IXGBE_VMOLR_ROMPE;
-		hw->mac.ops.update_mc_addr_list(hw, dev);
-	}
-	IXGBE_WRITE_REG(hw, IXGBE_VMOLR(pool), vmolr);
-}
-
 /**
  * ixgbe_clean_rx_ring - Free Rx Buffers per Queue
  * @rx_ring: ring to free buffers from
@@ -5384,10 +5370,8 @@ static int ixgbe_fwd_ring_up(struct net_device *vdev,
 	 */
 	err = ixgbe_add_mac_filter(adapter, vdev->dev_addr,
 				   VMDQ_P(accel->pool));
-	if (err >= 0) {
-		ixgbe_macvlan_set_rx_mode(vdev, accel->pool, adapter);
+	if (err >= 0)
 		return 0;
-	}
 
 	for (i = 0; i < adapter->num_rx_queues_per_pool; i++)
 		adapter->rx_ring[baseq + i]->netdev = NULL;
@@ -9832,9 +9816,6 @@ static void ixgbe_fwd_del(struct net_device *pdev, void *priv)
 	ixgbe_del_mac_filter(adapter, accel->netdev->dev_addr,
 			     VMDQ_P(accel->pool));
 
-	/* disable ability to receive packets for this pool */
-	IXGBE_WRITE_REG(&adapter->hw, IXGBE_VMOLR(accel->pool), 0);
-
 	/* Allow remaining Rx packets to get flushed out of the
 	 * Rx FIFO before we drop the netdev for the ring.
 	 */
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 7ddc94ff4109..adde8fc45588 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -514,6 +514,7 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
 	const struct macvlan_dev *vlan = netdev_priv(dev);
 	const struct macvlan_port *port = vlan->port;
 	const struct macvlan_dev *dest;
+	void *accel_priv = NULL;
 
 	if (vlan->mode == MACVLAN_MODE_BRIDGE) {
 		const struct ethhdr *eth = (void *)skb->data;
@@ -533,9 +534,14 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
 		}
 	}
 
+	/* For packets that are non-multicast and not bridged we will pass
+	 * the necessary information so that the lowerdev can distinguish
+	 * the source of the packets via the accel_priv value.
+	 */
+	accel_priv = vlan->accel_priv;
 xmit_world:
 	skb->dev = vlan->lowerdev;
-	return dev_queue_xmit(skb);
+	return dev_queue_xmit_accel(skb, accel_priv);
 }
 
 static inline netdev_tx_t macvlan_netpoll_send_skb(struct macvlan_dev *vlan, struct sk_buff *skb)
@@ -552,19 +558,14 @@ static inline netdev_tx_t macvlan_netpoll_send_skb(struct macvlan_dev *vlan, str
 static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
 				      struct net_device *dev)
 {
+	struct macvlan_dev *vlan = netdev_priv(dev);
 	unsigned int len = skb->len;
 	int ret;
-	struct macvlan_dev *vlan = netdev_priv(dev);
 
 	if (unlikely(netpoll_tx_running(dev)))
 		return macvlan_netpoll_send_skb(vlan, skb);
 
-	if (vlan->accel_priv) {
-		skb->dev = vlan->lowerdev;
-		ret = dev_queue_xmit_accel(skb, vlan->accel_priv);
-	} else {
-		ret = macvlan_queue_xmit(skb, dev);
-	}
+	ret = macvlan_queue_xmit(skb, dev);
 
 	if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
 		struct vlan_pcpu_stats *pcpu_stats;
@@ -620,26 +621,20 @@ static int macvlan_open(struct net_device *dev)
 	/* Attempt to populate accel_priv which is used to offload the L2
 	 * forwarding requests for unicast packets.
 	 */
-	if (lowerdev->features & NETIF_F_HW_L2FW_DOFFLOAD) {
+	if (lowerdev->features & NETIF_F_HW_L2FW_DOFFLOAD)
 		vlan->accel_priv =
 		      lowerdev->netdev_ops->ndo_dfwd_add_station(lowerdev, dev);
 
-		/* If we get a NULL pointer back, or if we get an error
-		 * then we should just fall through to the non accelerated path
-		 */
-		if (IS_ERR_OR_NULL(vlan->accel_priv))
-			vlan->accel_priv = NULL;
-		else
-			return 0;
+	/* If earlier attempt to offload failed, or accel_priv is not
+	 * populated we must add the unicast address to the lower device.
+	 */
+	if (IS_ERR_OR_NULL(vlan->accel_priv)) {
+		vlan->accel_priv = NULL;
+		err = dev_uc_add(lowerdev, dev->dev_addr);
+		if (err < 0)
+			goto out;
 	}
 
-	err = -EBUSY;
-	if (macvlan_addr_busy(vlan->port, dev->dev_addr))
-		goto out;
-
-	err = dev_uc_add(lowerdev, dev->dev_addr);
-	if (err < 0)
-		goto out;
 	if (dev->flags & IFF_ALLMULTI) {
 		err = dev_set_allmulti(lowerdev, 1);
 		if (err < 0)
@@ -660,13 +655,14 @@ clear_multi:
 	if (dev->flags & IFF_ALLMULTI)
 		dev_set_allmulti(lowerdev, -1);
 del_unicast:
-	dev_uc_del(lowerdev, dev->dev_addr);
-out:
 	if (vlan->accel_priv) {
 		lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev,
 							   vlan->accel_priv);
 		vlan->accel_priv = NULL;
+	} else {
+		dev_uc_del(lowerdev, dev->dev_addr);
 	}
+out:
 	return err;
 }
 
@@ -679,7 +675,6 @@ static int macvlan_stop(struct net_device *dev)
 		lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev,
 							   vlan->accel_priv);
 		vlan->accel_priv = NULL;
-		return 0;
 	}
 
 	dev_uc_unsync(lowerdev, dev);
-- 
cgit v1.2.3-59-g8ed1b


From 8d80ac43dea3877a945d15dc09a6c6efd4dc27a3 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Tue, 3 Apr 2018 17:16:14 -0400
Subject: ixgbe/fm10k: Drop tracking stats for macvlan broadcast/multicast

Drop dead code now that we shouldn't be receiving broadcast or multicast
frames on the queues associated to the macvlan netdev.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k_main.c | 7 +++----
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 7 +++----
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
index df8607097e4a..c51d61f5f715 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
@@ -445,15 +445,14 @@ static void fm10k_type_trans(struct fm10k_ring *rx_ring,
 			l2_accel = NULL;
 	}
 
-	skb->protocol = eth_type_trans(skb, dev);
-
 	/* Record Rx queue, or update macvlan statistics */
 	if (!l2_accel)
 		skb_record_rx_queue(skb, rx_ring->queue_index);
 	else
 		macvlan_count_rx(netdev_priv(dev), skb->len + ETH_HLEN, true,
-				 (skb->pkt_type == PACKET_BROADCAST) ||
-				 (skb->pkt_type == PACKET_MULTICAST));
+				 false);
+
+	skb->protocol = eth_type_trans(skb, dev);
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 4d4fa46a1e7c..2c648214148e 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1768,15 +1768,14 @@ static void ixgbe_process_skb_fields(struct ixgbe_ring *rx_ring,
 	if (ixgbe_test_staterr(rx_desc, IXGBE_RXDADV_STAT_SECP))
 		ixgbe_ipsec_rx(rx_ring, rx_desc, skb);
 
-	skb->protocol = eth_type_trans(skb, dev);
-
 	/* record Rx queue, or update MACVLAN statistics */
 	if (netif_is_ixgbe(dev))
 		skb_record_rx_queue(skb, rx_ring->queue_index);
 	else
 		macvlan_count_rx(netdev_priv(dev), skb->len + ETH_HLEN, true,
-				 (skb->pkt_type == PACKET_BROADCAST) ||
-				 (skb->pkt_type == PACKET_MULTICAST));
+				 false);
+
+	skb->protocol = eth_type_trans(skb, dev);
 }
 
 static void ixgbe_rx_skb(struct ixgbe_q_vector *q_vector,
-- 
cgit v1.2.3-59-g8ed1b


From a222311283641c9a476426ccacb2a3c82bfe5aaa Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Tue, 3 Apr 2018 17:16:19 -0400
Subject: macvlan: macvlan_count_rx shouldn't be static inline AND extern

It doesn't make sense to define macvlan_count_rx as a static inline and
then add a forward declaration after that as an extern. I am dropping the
extern declaration since it seems like it is something that likely got
missed when the function was made an inline.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 include/linux/if_macvlan.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index c5106ce8273a..80089d6c5a0a 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -61,10 +61,6 @@ extern int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
 				  struct nlattr *tb[], struct nlattr *data[],
 				  struct netlink_ext_ack *extack);
 
-extern void macvlan_count_rx(const struct macvlan_dev *vlan,
-			     unsigned int len, bool success,
-			     bool multicast);
-
 extern void macvlan_dellink(struct net_device *dev, struct list_head *head);
 
 extern int macvlan_link_register(struct rtnl_link_ops *ops);
-- 
cgit v1.2.3-59-g8ed1b


From 6cb1937d4eff6936089139169377fda208a2701a Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Tue, 3 Apr 2018 17:16:25 -0400
Subject: macvlan: Add function to test for destination filtering support

This patch adds a function indicating if a given macvlan can fully supports
destination filtering, especially as it relates to unicast traffic. For
those macvlan interfaces that do not support destination filtering such
passthru or source mode filtering we should not be enabling offload
support.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 include/linux/if_macvlan.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index 80089d6c5a0a..0221390668a0 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -88,4 +88,13 @@ static inline void *macvlan_accel_priv(struct net_device *dev)
 
 	return macvlan->accel_priv;
 }
+
+static inline bool macvlan_supports_dest_filter(struct net_device *dev)
+{
+	struct macvlan_dev *macvlan = netdev_priv(dev);
+
+	return macvlan->mode == MACVLAN_MODE_PRIVATE ||
+	       macvlan->mode == MACVLAN_MODE_VEPA ||
+	       macvlan->mode == MACVLAN_MODE_BRIDGE;
+}
 #endif /* _LINUX_IF_MACVLAN_H */
-- 
cgit v1.2.3-59-g8ed1b


From 53cd4d8e4dfb231fa5ba125b6c0666b82a0504fa Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Tue, 3 Apr 2018 17:16:30 -0400
Subject: macvlan: Provide function for interfaces to release HW offload

This patch provides a basic function to allow a lower device to disable
macvlan offload if it was previously enabled on a given macvlan. The idea
here is to allow for recovery from failure should the lowerdev run out of
resources.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 include/linux/if_macvlan.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index 0221390668a0..2e55e4cdbd8a 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -97,4 +97,12 @@ static inline bool macvlan_supports_dest_filter(struct net_device *dev)
 	       macvlan->mode == MACVLAN_MODE_VEPA ||
 	       macvlan->mode == MACVLAN_MODE_BRIDGE;
 }
+
+static inline int macvlan_release_l2fw_offload(struct net_device *dev)
+{
+	struct macvlan_dev *macvlan = netdev_priv(dev);
+
+	macvlan->accel_priv = NULL;
+	return dev_uc_add(macvlan->lowerdev, dev->dev_addr);
+}
 #endif /* _LINUX_IF_MACVLAN_H */
-- 
cgit v1.2.3-59-g8ed1b


From 3335915d07f74e6391ce33959e2f25cab273bf73 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Tue, 3 Apr 2018 17:16:35 -0400
Subject: ixgbe/fm10k: Only support macvlan offload for types that support
 destination filtering

Both the ixgbe and fm10k drivers support destination filtering.

Instead of adding a ton of complexity to support either source or passthru
mode we can instead just avoid offloading them for now. Doing this we avoid
leaking packets into interfaces that aren't meant to receive them.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k_netdev.c | 8 ++++++++
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c   | 7 +++++++
 2 files changed, 15 insertions(+)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
index ee645bacfe93..26e749766337 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
@@ -22,6 +22,7 @@
 #include "fm10k.h"
 #include <linux/vmalloc.h>
 #include <net/udp_tunnel.h>
+#include <linux/if_macvlan.h>
 
 /**
  * fm10k_setup_tx_resources - allocate Tx resources (Descriptors)
@@ -1449,6 +1450,13 @@ static void *fm10k_dfwd_add_station(struct net_device *dev,
 	int size = 0, i;
 	u16 glort;
 
+	/* The hardware supported by fm10k only filters on the destination MAC
+	 * address. In order to avoid issues we only support offloading modes
+	 * where the hardware can actually provide the functionality.
+	 */
+	if (!macvlan_supports_dest_filter(sdev))
+		return ERR_PTR(-EMEDIUMTYPE);
+
 	/* allocate l2 accel structure if it is not available */
 	if (!l2_accel) {
 		/* verify there is enough free GLORTs to support l2_accel */
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 2c648214148e..76f54771fdb2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -9760,6 +9760,13 @@ static void *ixgbe_fwd_add(struct net_device *pdev, struct net_device *vdev)
 	unsigned int limit;
 	int pool, err;
 
+	/* The hardware supported by ixgbe only filters on the destination MAC
+	 * address. In order to avoid issues we only support offloading modes
+	 * where the hardware can actually provide the functionality.
+	 */
+	if (!macvlan_supports_dest_filter(vdev))
+		return ERR_PTR(-EMEDIUMTYPE);
+
 	/* Hardware has a limited number of available pools. Each VF, and the
 	 * PF require a pool. Check to ensure we don't attempt to use more
 	 * then the available number of pools.
-- 
cgit v1.2.3-59-g8ed1b


From 865255b5a2e5a905a1cd8ff8444604af61ee79d8 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Tue, 3 Apr 2018 17:16:40 -0400
Subject: ixgbe: Drop real_adapter from l2 fwd acceleration structure

This patch drops the real_adapter member from the fwd_adapter structure.
The general idea behind the change is that the real_adapter is carrying
unnecessary data since we could always just grab the adapter structure
from netdev_priv(macvlan->lowerdev) if we really needed to get at it.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h      |  1 -
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 28 +++++++++++++++------------
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 7dd5038cfcc4..8fccca57cd6a 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -305,7 +305,6 @@ enum ixgbe_ring_state_t {
 struct ixgbe_fwd_adapter {
 	unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];
 	struct net_device *netdev;
-	struct ixgbe_adapter *real_adapter;
 	unsigned int tx_base_queue;
 	unsigned int rx_base_queue;
 	int pool;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 76f54771fdb2..f00720582e52 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -5338,10 +5338,10 @@ static void ixgbe_clean_rx_ring(struct ixgbe_ring *rx_ring)
 	rx_ring->next_to_use = 0;
 }
 
-static int ixgbe_fwd_ring_up(struct net_device *vdev,
+static int ixgbe_fwd_ring_up(struct ixgbe_adapter *adapter,
 			     struct ixgbe_fwd_adapter *accel)
 {
-	struct ixgbe_adapter *adapter = accel->real_adapter;
+	struct net_device *vdev = accel->netdev;
 	int i, baseq, err;
 
 	if (!test_bit(accel->pool, adapter->fwd_bitmask))
@@ -5378,14 +5378,19 @@ static int ixgbe_fwd_ring_up(struct net_device *vdev,
 	return err;
 }
 
-static int ixgbe_upper_dev_walk(struct net_device *upper, void *data)
+static int ixgbe_macvlan_up(struct net_device *vdev, void *data)
 {
-	if (netif_is_macvlan(upper)) {
-		struct ixgbe_fwd_adapter *vadapter = macvlan_accel_priv(upper);
+	struct ixgbe_adapter *adapter = data;
+	struct ixgbe_fwd_adapter *accel;
 
-		if (vadapter)
-			ixgbe_fwd_ring_up(upper, vadapter);
-	}
+	if (!netif_is_macvlan(vdev))
+		return 0;
+
+	accel = macvlan_accel_priv(vdev);
+	if (!accel)
+		return 0;
+
+	ixgbe_fwd_ring_up(adapter, accel);
 
 	return 0;
 }
@@ -5393,7 +5398,7 @@ static int ixgbe_upper_dev_walk(struct net_device *upper, void *data)
 static void ixgbe_configure_dfwd(struct ixgbe_adapter *adapter)
 {
 	netdev_walk_all_upper_dev_rcu(adapter->netdev,
-				      ixgbe_upper_dev_walk, NULL);
+				      ixgbe_macvlan_up, adapter);
 }
 
 static void ixgbe_configure(struct ixgbe_adapter *adapter)
@@ -9792,13 +9797,12 @@ static void *ixgbe_fwd_add(struct net_device *pdev, struct net_device *vdev)
 	adapter->ring_feature[RING_F_VMDQ].limit = limit + 1;
 
 	fwd_adapter->pool = pool;
-	fwd_adapter->real_adapter = adapter;
 
 	/* Force reinit of ring allocation with VMDQ enabled */
 	err = ixgbe_setup_tc(pdev, adapter->hw_tcs);
 
 	if (!err && netif_running(pdev))
-		err = ixgbe_fwd_ring_up(vdev, fwd_adapter);
+		err = ixgbe_fwd_ring_up(adapter, fwd_adapter);
 
 	if (!err)
 		return fwd_adapter;
@@ -9814,7 +9818,7 @@ static void *ixgbe_fwd_add(struct net_device *pdev, struct net_device *vdev)
 static void ixgbe_fwd_del(struct net_device *pdev, void *priv)
 {
 	struct ixgbe_fwd_adapter *accel = priv;
-	struct ixgbe_adapter *adapter = accel->real_adapter;
+	struct ixgbe_adapter *adapter = netdev_priv(pdev);
 	unsigned int rxbase = accel->rx_base_queue;
 	unsigned int limit, i;
 
-- 
cgit v1.2.3-59-g8ed1b


From 8315ef6f395cc4e14171c59e1176711976deeb84 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Tue, 3 Apr 2018 17:16:45 -0400
Subject: ixgbe: Avoid performing unnecessary resets for macvlan offload

The original implementation for macvlan offload has us performing a full
port reset every time we added a new macvlan. This shouldn't be necessary
and can be avoided with a few behavior changes.

This patches updates the logic for the queues so that we have essentially 3
possible configurations for macvlan offload. They consist of 15 macvlans
with 4 queues per macvlan, 31 macvlans with 2 queues per macvlan, and 63
macvlans with 1 queue per macvlan. As macvlans are added you will encounter
up to 3 total resets if you add all the way up to 63, and after that the
device will stay in the mode supporting up to 63 macvlans until the L2FW
flag is cleared.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  | 192 +++++++++++++++++--------
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c |   5 +-
 2 files changed, 135 insertions(+), 62 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index f00720582e52..b6e5cea84949 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -5344,15 +5344,11 @@ static int ixgbe_fwd_ring_up(struct ixgbe_adapter *adapter,
 	struct net_device *vdev = accel->netdev;
 	int i, baseq, err;
 
-	if (!test_bit(accel->pool, adapter->fwd_bitmask))
-		return 0;
-
 	baseq = accel->pool * adapter->num_rx_queues_per_pool;
 	netdev_dbg(vdev, "pool %i:%i queues %i:%i\n",
 		   accel->pool, adapter->num_rx_pools,
 		   baseq, baseq + adapter->num_rx_queues_per_pool);
 
-	accel->netdev = vdev;
 	accel->rx_base_queue = baseq;
 	accel->tx_base_queue = baseq;
 
@@ -5372,9 +5368,17 @@ static int ixgbe_fwd_ring_up(struct ixgbe_adapter *adapter,
 	if (err >= 0)
 		return 0;
 
+	/* if we cannot add the MAC rule then disable the offload */
+	macvlan_release_l2fw_offload(vdev);
+
 	for (i = 0; i < adapter->num_rx_queues_per_pool; i++)
 		adapter->rx_ring[baseq + i]->netdev = NULL;
 
+	netdev_err(vdev, "L2FW offload disabled due to L2 filter error\n");
+
+	clear_bit(accel->pool, adapter->fwd_bitmask);
+	kfree(accel);
+
 	return err;
 }
 
@@ -8799,6 +8803,49 @@ static void ixgbe_set_prio_tc_map(struct ixgbe_adapter *adapter)
 }
 
 #endif /* CONFIG_IXGBE_DCB */
+static int ixgbe_reassign_macvlan_pool(struct net_device *vdev, void *data)
+{
+	struct ixgbe_adapter *adapter = data;
+	struct ixgbe_fwd_adapter *accel;
+	int pool;
+
+	/* we only care about macvlans... */
+	if (!netif_is_macvlan(vdev))
+		return 0;
+
+	/* that have hardware offload enabled... */
+	accel = macvlan_accel_priv(vdev);
+	if (!accel)
+		return 0;
+
+	/* If we can relocate to a different bit do so */
+	pool = find_first_zero_bit(adapter->fwd_bitmask, adapter->num_rx_pools);
+	if (pool < adapter->num_rx_pools) {
+		set_bit(pool, adapter->fwd_bitmask);
+		accel->pool = pool;
+		return 0;
+	}
+
+	/* if we cannot find a free pool then disable the offload */
+	netdev_err(vdev, "L2FW offload disabled due to lack of queue resources\n");
+	macvlan_release_l2fw_offload(vdev);
+	kfree(accel);
+
+	return 0;
+}
+
+static void ixgbe_defrag_macvlan_pools(struct net_device *dev)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(dev);
+
+	/* flush any stale bits out of the fwd bitmask */
+	bitmap_clear(adapter->fwd_bitmask, 1, 63);
+
+	/* walk through upper devices reassigning pools */
+	netdev_walk_all_upper_dev_rcu(dev, ixgbe_reassign_macvlan_pool,
+				      adapter);
+}
+
 /**
  * ixgbe_setup_tc - configure net_device for multiple traffic classes
  *
@@ -8866,6 +8913,8 @@ int ixgbe_setup_tc(struct net_device *dev, u8 tc)
 #endif /* CONFIG_IXGBE_DCB */
 	ixgbe_init_interrupt_scheme(adapter);
 
+	ixgbe_defrag_macvlan_pools(dev);
+
 	if (netif_running(dev))
 		return ixgbe_open(dev);
 
@@ -9415,6 +9464,22 @@ static netdev_features_t ixgbe_fix_features(struct net_device *netdev,
 	return features;
 }
 
+static void ixgbe_reset_l2fw_offload(struct ixgbe_adapter *adapter)
+{
+	int rss = min_t(int, ixgbe_max_rss_indices(adapter),
+			num_online_cpus());
+
+	/* go back to full RSS if we're not running SR-IOV */
+	if (!adapter->ring_feature[RING_F_VMDQ].offset)
+		adapter->flags &= ~(IXGBE_FLAG_VMDQ_ENABLED |
+				    IXGBE_FLAG_SRIOV_ENABLED);
+
+	adapter->ring_feature[RING_F_RSS].limit = rss;
+	adapter->ring_feature[RING_F_VMDQ].limit = 1;
+
+	ixgbe_setup_tc(adapter->netdev, adapter->hw_tcs);
+}
+
 static int ixgbe_set_features(struct net_device *netdev,
 			      netdev_features_t features)
 {
@@ -9495,7 +9560,9 @@ static int ixgbe_set_features(struct net_device *netdev,
 		}
 	}
 
-	if (need_reset)
+	if ((changed & NETIF_F_HW_L2FW_DOFFLOAD) && adapter->num_rx_pools > 1)
+		ixgbe_reset_l2fw_offload(adapter);
+	else if (need_reset)
 		ixgbe_do_reset(netdev);
 	else if (changed & (NETIF_F_HW_VLAN_CTAG_RX |
 			    NETIF_F_HW_VLAN_CTAG_FILTER))
@@ -9758,11 +9825,9 @@ static int ixgbe_ndo_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 
 static void *ixgbe_fwd_add(struct net_device *pdev, struct net_device *vdev)
 {
-	struct ixgbe_fwd_adapter *fwd_adapter = NULL;
 	struct ixgbe_adapter *adapter = netdev_priv(pdev);
-	int used_pools = adapter->num_vfs + adapter->num_rx_pools;
+	struct ixgbe_fwd_adapter *accel;
 	int tcs = adapter->hw_tcs ? : 1;
-	unsigned int limit;
 	int pool, err;
 
 	/* The hardware supported by ixgbe only filters on the destination MAC
@@ -9772,47 +9837,73 @@ static void *ixgbe_fwd_add(struct net_device *pdev, struct net_device *vdev)
 	if (!macvlan_supports_dest_filter(vdev))
 		return ERR_PTR(-EMEDIUMTYPE);
 
-	/* Hardware has a limited number of available pools. Each VF, and the
-	 * PF require a pool. Check to ensure we don't attempt to use more
-	 * then the available number of pools.
-	 */
-	if (used_pools >= IXGBE_MAX_VF_FUNCTIONS)
-		return ERR_PTR(-EINVAL);
+	pool = find_first_zero_bit(adapter->fwd_bitmask, adapter->num_rx_pools);
+	if (pool == adapter->num_rx_pools) {
+		u16 used_pools = adapter->num_vfs + adapter->num_rx_pools;
+		u16 reserved_pools;
+
+		if (((adapter->flags & IXGBE_FLAG_DCB_ENABLED) &&
+		     adapter->num_rx_pools >= (MAX_TX_QUEUES / tcs)) ||
+		    adapter->num_rx_pools > IXGBE_MAX_MACVLANS)
+			return ERR_PTR(-EBUSY);
+
+		/* Hardware has a limited number of available pools. Each VF,
+		 * and the PF require a pool. Check to ensure we don't
+		 * attempt to use more then the available number of pools.
+		 */
+		if (used_pools >= IXGBE_MAX_VF_FUNCTIONS)
+			return ERR_PTR(-EBUSY);
 
-	if (((adapter->flags & IXGBE_FLAG_DCB_ENABLED) &&
-	      adapter->num_rx_pools >= (MAX_TX_QUEUES / tcs)) ||
-	    (adapter->num_rx_pools > IXGBE_MAX_MACVLANS))
-		return ERR_PTR(-EBUSY);
+		/* Enable VMDq flag so device will be set in VM mode */
+		adapter->flags |= IXGBE_FLAG_VMDQ_ENABLED |
+				  IXGBE_FLAG_SRIOV_ENABLED;
 
-	fwd_adapter = kzalloc(sizeof(*fwd_adapter), GFP_KERNEL);
-	if (!fwd_adapter)
-		return ERR_PTR(-ENOMEM);
+		/* Try to reserve as many queues per pool as possible,
+		 * we start with the configurations that support 4 queues
+		 * per pools, followed by 2, and then by just 1 per pool.
+		 */
+		if (used_pools < 32 && adapter->num_rx_pools < 16)
+			reserved_pools = min_t(u16,
+					       32 - used_pools,
+					       16 - adapter->num_rx_pools);
+		else if (adapter->num_rx_pools < 32)
+			reserved_pools = min_t(u16,
+					       64 - used_pools,
+					       32 - adapter->num_rx_pools);
+		else
+			reserved_pools = 64 - used_pools;
 
-	pool = find_first_zero_bit(adapter->fwd_bitmask, adapter->num_rx_pools);
-	set_bit(pool, adapter->fwd_bitmask);
-	limit = find_last_bit(adapter->fwd_bitmask, adapter->num_rx_pools + 1);
 
-	/* Enable VMDq flag so device will be set in VM mode */
-	adapter->flags |= IXGBE_FLAG_VMDQ_ENABLED | IXGBE_FLAG_SRIOV_ENABLED;
-	adapter->ring_feature[RING_F_VMDQ].limit = limit + 1;
+		if (!reserved_pools)
+			return ERR_PTR(-EBUSY);
 
-	fwd_adapter->pool = pool;
+		adapter->ring_feature[RING_F_VMDQ].limit += reserved_pools;
 
-	/* Force reinit of ring allocation with VMDQ enabled */
-	err = ixgbe_setup_tc(pdev, adapter->hw_tcs);
+		/* Force reinit of ring allocation with VMDQ enabled */
+		err = ixgbe_setup_tc(pdev, adapter->hw_tcs);
+		if (err)
+			return ERR_PTR(err);
 
-	if (!err && netif_running(pdev))
-		err = ixgbe_fwd_ring_up(adapter, fwd_adapter);
+		if (pool >= adapter->num_rx_pools)
+			return ERR_PTR(-ENOMEM);
+	}
 
-	if (!err)
-		return fwd_adapter;
+	accel = kzalloc(sizeof(*accel), GFP_KERNEL);
+	if (!accel)
+		return ERR_PTR(-ENOMEM);
+
+	set_bit(pool, adapter->fwd_bitmask);
+	accel->pool = pool;
+	accel->netdev = vdev;
 
-	/* unwind counter and free adapter struct */
-	netdev_info(pdev,
-		    "%s: dfwd hardware acceleration failed\n", vdev->name);
-	clear_bit(pool, adapter->fwd_bitmask);
-	kfree(fwd_adapter);
-	return ERR_PTR(err);
+	if (!netif_running(pdev))
+		return accel;
+
+	err = ixgbe_fwd_ring_up(adapter, accel);
+	if (err)
+		return ERR_PTR(err);
+
+	return accel;
 }
 
 static void ixgbe_fwd_del(struct net_device *pdev, void *priv)
@@ -9820,7 +9911,7 @@ static void ixgbe_fwd_del(struct net_device *pdev, void *priv)
 	struct ixgbe_fwd_adapter *accel = priv;
 	struct ixgbe_adapter *adapter = netdev_priv(pdev);
 	unsigned int rxbase = accel->rx_base_queue;
-	unsigned int limit, i;
+	unsigned int i;
 
 	/* delete unicast filter associated with offloaded interface */
 	ixgbe_del_mac_filter(adapter, accel->netdev->dev_addr,
@@ -9844,25 +9935,6 @@ static void ixgbe_fwd_del(struct net_device *pdev, void *priv)
 	}
 
 	clear_bit(accel->pool, adapter->fwd_bitmask);
-	limit = find_last_bit(adapter->fwd_bitmask, adapter->num_rx_pools);
-	adapter->ring_feature[RING_F_VMDQ].limit = limit + 1;
-
-	/* go back to full RSS if we're done with our VMQs */
-	if (adapter->ring_feature[RING_F_VMDQ].limit == 1) {
-		int rss = min_t(int, ixgbe_max_rss_indices(adapter),
-				num_online_cpus());
-
-		adapter->flags &= ~IXGBE_FLAG_VMDQ_ENABLED;
-		adapter->flags &= ~IXGBE_FLAG_SRIOV_ENABLED;
-		adapter->ring_feature[RING_F_RSS].limit = rss;
-	}
-
-	ixgbe_setup_tc(pdev, adapter->hw_tcs);
-	netdev_dbg(pdev, "pool %i:%i queues %i:%i\n",
-		   accel->pool, adapter->num_rx_pools,
-		   accel->rx_base_queue,
-		   accel->rx_base_queue +
-		   adapter->num_rx_queues_per_pool);
 	kfree(accel);
 }
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
index 008aa073a679..bfc4171cd3f9 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
@@ -266,7 +266,7 @@ int ixgbe_disable_sriov(struct ixgbe_adapter *adapter)
 #endif
 
 	/* Disable VMDq flag so device will be set in VM mode */
-	if (adapter->ring_feature[RING_F_VMDQ].limit == 1) {
+	if (bitmap_weight(adapter->fwd_bitmask, adapter->num_rx_pools) == 1) {
 		adapter->flags &= ~IXGBE_FLAG_VMDQ_ENABLED;
 		adapter->flags &= ~IXGBE_FLAG_SRIOV_ENABLED;
 		rss = min_t(int, ixgbe_max_rss_indices(adapter),
@@ -312,7 +312,8 @@ static int ixgbe_pci_sriov_enable(struct pci_dev *dev, int num_vfs)
 	 * other values out of range.
 	 */
 	num_tc = adapter->hw_tcs;
-	num_rx_pools = adapter->num_rx_pools;
+	num_rx_pools = bitmap_weight(adapter->fwd_bitmask,
+				     adapter->num_rx_pools);
 	limit = (num_tc > 4) ? IXGBE_MAX_VFS_8TC :
 		(num_tc > 1) ? IXGBE_MAX_VFS_4TC : IXGBE_MAX_VFS_1TC;
 
-- 
cgit v1.2.3-59-g8ed1b


From 83dd693f36509c2f2b6df8dca28b6dadc346f41f Mon Sep 17 00:00:00 2001
From: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Date: Tue, 10 Apr 2018 10:49:50 -0700
Subject: igb: Fix not adding filter elements to the list

Because the order of the parameters passes to 'hlist_add_behind()' was
inverted, the 'parent' node was added "behind" the 'input', as input
is not in the list, this causes the 'input' node to be lost.

Fixes: 0e71def25281 ("igb: add support of RX network flow classification")
Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_ethtool.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c
index e77ba0d5866d..5975d432836f 100644
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
@@ -2865,7 +2865,7 @@ static int igb_update_ethtool_nfc_entry(struct igb_adapter *adapter,
 
 	/* add filter to the list */
 	if (parent)
-		hlist_add_behind(&parent->nfc_node, &input->nfc_node);
+		hlist_add_behind(&input->nfc_node, &parent->nfc_node);
 	else
 		hlist_add_head(&input->nfc_node, &adapter->nfc_filter_list);
 
-- 
cgit v1.2.3-59-g8ed1b


From b5facfdba14ccb440bd1eac20870a8f23afa17f3 Mon Sep 17 00:00:00 2001
From: Ahmed Abdelsalam <amsalam20@gmail.com>
Date: Tue, 24 Apr 2018 20:23:16 +0200
Subject: ipv6: sr: Compute flowlabel for outer IPv6 header of seg6 encap mode

ECMP (equal-cost multipath) hashes are typically computed on the packets'
5-tuple(src IP, dst IP, src port, dst port, L4 proto).

For encapsulated packets, the L4 data is not readily available and ECMP
hashing will often revert to (src IP, dst IP). This will lead to traffic
polarization on a single ECMP path, causing congestion and waste of network
capacity.

In IPv6, the 20-bit flow label field is also used as part of the ECMP hash.
In the lack of L4 data, the hashing will be on (src IP, dst IP, flow
label). Having a non-zero flow label is thus important for proper traffic
load balancing when L4 data is unavailable (i.e., when packets are
encapsulated).

Currently, the seg6_do_srh_encap() function extracts the original packet's
flow label and set it as the outer IPv6 flow label. There are two issues
with this behaviour:

a) There is no guarantee that the inner flow label is set by the source.
b) If the original packet is not IPv6, the flow label will be set to
zero (e.g., IPv4 or L2 encap).

This patch adds a function, named seg6_make_flowlabel(), that computes a
flow label from a given skb. It supports IPv6, IPv4 and L2 payloads, and
leverages the per namespace 'seg6_flowlabel" sysctl value.

The currently support behaviours are as follows:
-1 set flowlabel to zero.
0 copy flowlabel from Inner paceket in case of Inner IPv6
(Set flowlabel to 0 in case IPv4/L2)
1 Compute the flowlabel using seg6_make_flowlabel()

This patch has been tested for IPv6, IPv4, and L2 traffic.

Signed-off-by: Ahmed Abdelsalam <amsalam20@gmail.com>
Acked-by: David Lebrun <dlebrun@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv6.h   |  1 +
 net/ipv6/seg6_iptunnel.c   | 24 ++++++++++++++++++++++--
 net/ipv6/sysctl_net_ipv6.c |  8 ++++++++
 3 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 97b3a54579c8..c978a31b0f84 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -43,6 +43,7 @@ struct netns_sysctl_ipv6 {
 	int max_hbh_opts_cnt;
 	int max_dst_opts_len;
 	int max_hbh_opts_len;
+	int seg6_flowlabel;
 };
 
 struct netns_ipv6 {
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index 5fe139484919..9898926ce30d 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -91,6 +91,24 @@ static void set_tun_src(struct net *net, struct net_device *dev,
 	rcu_read_unlock();
 }
 
+/* Compute flowlabel for outer IPv6 header */
+static __be32 seg6_make_flowlabel(struct net *net, struct sk_buff *skb,
+				  struct ipv6hdr *inner_hdr)
+{
+	int do_flowlabel = net->ipv6.sysctl.seg6_flowlabel;
+	__be32 flowlabel = 0;
+	u32 hash;
+
+	if (do_flowlabel > 0) {
+		hash = skb_get_hash(skb);
+		rol32(hash, 16);
+		flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK;
+	} else if (!do_flowlabel && skb->protocol == htons(ETH_P_IPV6)) {
+		flowlabel = ip6_flowlabel(inner_hdr);
+	}
+	return flowlabel;
+}
+
 /* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */
 int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
 {
@@ -99,6 +117,7 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
 	struct ipv6hdr *hdr, *inner_hdr;
 	struct ipv6_sr_hdr *isrh;
 	int hdrlen, tot_len, err;
+	__be32 flowlabel;
 
 	hdrlen = (osrh->hdrlen + 1) << 3;
 	tot_len = hdrlen + sizeof(*hdr);
@@ -119,12 +138,13 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
 	 * decapsulation will overwrite inner hlim with outer hlim
 	 */
 
+	flowlabel = seg6_make_flowlabel(net, skb, inner_hdr);
 	if (skb->protocol == htons(ETH_P_IPV6)) {
 		ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)),
-			     ip6_flowlabel(inner_hdr));
+			     flowlabel);
 		hdr->hop_limit = inner_hdr->hop_limit;
 	} else {
-		ip6_flow_hdr(hdr, 0, 0);
+		ip6_flow_hdr(hdr, 0, flowlabel);
 		hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb));
 	}
 
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index 6fbdef630152..e15cd37024fd 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -152,6 +152,13 @@ static struct ctl_table ipv6_table_template[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
+	{
+		.procname	= "seg6_flowlabel",
+		.data		= &init_net.ipv6.sysctl.seg6_flowlabel,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{ }
 };
 
@@ -217,6 +224,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net)
 	ipv6_table[12].data = &net->ipv6.sysctl.max_dst_opts_len;
 	ipv6_table[13].data = &net->ipv6.sysctl.max_hbh_opts_len;
 	ipv6_table[14].data = &net->ipv6.sysctl.multipath_hash_policy,
+	ipv6_table[15].data = &net->ipv6.sysctl.seg6_flowlabel;
 
 	ipv6_route_table = ipv6_route_sysctl_init(net);
 	if (!ipv6_route_table)
-- 
cgit v1.2.3-59-g8ed1b


From 0aef78aa7b39829a2a46c1c0e6e96276d744e07a Mon Sep 17 00:00:00 2001
From: Ivan Vecera <cera@cera.cz>
Date: Tue, 24 Apr 2018 19:51:29 +0200
Subject: ipv6: addrconf: don't evaluate keep_addr_on_down twice

The addrconf_ifdown() evaluates keep_addr_on_down state twice. There
is no need to do it.

Cc: David Ahern <dsahern@gmail.com>
Signed-off-by: Ivan Vecera <cera@cera.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 7b4d7bbf2c17..fbfd71a2d9c8 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3622,8 +3622,7 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 	struct net *net = dev_net(dev);
 	struct inet6_dev *idev;
 	struct inet6_ifaddr *ifa, *tmp;
-	int _keep_addr;
-	bool keep_addr;
+	bool keep_addr = false;
 	int state, i;
 
 	ASSERT_RTNL();
@@ -3649,15 +3648,18 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 
 	}
 
-	/* aggregate the system setting and interface setting */
-	_keep_addr = net->ipv6.devconf_all->keep_addr_on_down;
-	if (!_keep_addr)
-		_keep_addr = idev->cnf.keep_addr_on_down;
-
 	/* combine the user config with event to determine if permanent
 	 * addresses are to be removed from address hash table
 	 */
-	keep_addr = !(how || _keep_addr <= 0 || idev->cnf.disable_ipv6);
+	if (!how && !idev->cnf.disable_ipv6) {
+		/* aggregate the system setting and interface setting */
+		int _keep_addr = net->ipv6.devconf_all->keep_addr_on_down;
+
+		if (!_keep_addr)
+			_keep_addr = idev->cnf.keep_addr_on_down;
+
+		keep_addr = (_keep_addr > 0);
+	}
 
 	/* Step 2: clear hash table */
 	for (i = 0; i < IN6_ADDR_HSIZE; i++) {
@@ -3707,11 +3709,6 @@ restart:
 		write_lock_bh(&idev->lock);
 	}
 
-	/* re-combine the user config with event to determine if permanent
-	 * addresses are to be removed from the interface list
-	 */
-	keep_addr = (!how && _keep_addr > 0 && !idev->cnf.disable_ipv6);
-
 	list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) {
 		struct fib6_info *rt = NULL;
 		bool keep;
-- 
cgit v1.2.3-59-g8ed1b


From 9ce33e46531d4b9f94b0fa135781e27c7c4e32e8 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Tue, 24 Apr 2018 13:49:34 -0700
Subject: neighbour: support for NTF_EXT_LEARNED flag

This patch extends NTF_EXT_LEARNED support to the neighbour system.
Example use-case: An Ethernet VPN implementation (eg in FRR routing suite)
can use this flag to add dynamic reachable external neigh entires
learned via control plane. The use of neigh NTF_EXT_LEARNED in this
patch is consistent with its use with bridge and vxlan fdb entries.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/neighbour.h | 19 ++++++++++++++++++-
 net/core/neighbour.c    |  8 +++++++-
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index e421f86af043..6c1eecd56a4d 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -246,6 +246,7 @@ static inline void *neighbour_priv(const struct neighbour *n)
 #define NEIGH_UPDATE_F_OVERRIDE			0x00000001
 #define NEIGH_UPDATE_F_WEAK_OVERRIDE		0x00000002
 #define NEIGH_UPDATE_F_OVERRIDE_ISROUTER	0x00000004
+#define NEIGH_UPDATE_F_EXT_LEARNED		0x20000000
 #define NEIGH_UPDATE_F_ISROUTER			0x40000000
 #define NEIGH_UPDATE_F_ADMIN			0x80000000
 
@@ -526,5 +527,21 @@ static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
 	} while (read_seqretry(&n->ha_lock, seq));
 }
 
-
+static inline void neigh_update_ext_learned(struct neighbour *neigh, u32 flags,
+					    int *notify)
+{
+	u8 ndm_flags = 0;
+
+	if (!(flags & NEIGH_UPDATE_F_ADMIN))
+		return;
+
+	ndm_flags |= (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0;
+	if ((neigh->flags ^ ndm_flags) & NTF_EXT_LEARNED) {
+		if (ndm_flags & NTF_EXT_LEARNED)
+			neigh->flags |= NTF_EXT_LEARNED;
+		else
+			neigh->flags &= ~NTF_EXT_LEARNED;
+		*notify = 1;
+	}
+}
 #endif
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index ce519861be59..5afae29367c1 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -820,7 +820,8 @@ static void neigh_periodic_work(struct work_struct *work)
 			write_lock(&n->lock);
 
 			state = n->nud_state;
-			if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
+			if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) ||
+			    (n->flags & NTF_EXT_LEARNED)) {
 				write_unlock(&n->lock);
 				goto next_elt;
 			}
@@ -1136,6 +1137,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
 	if (neigh->dead)
 		goto out;
 
+	neigh_update_ext_learned(neigh, flags, &notify);
+
 	if (!(new & NUD_VALID)) {
 		neigh_del_timer(neigh);
 		if (old & NUD_CONNECTED)
@@ -1781,6 +1784,9 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
 			flags &= ~NEIGH_UPDATE_F_OVERRIDE;
 	}
 
+	if (ndm->ndm_flags & NTF_EXT_LEARNED)
+		flags |= NEIGH_UPDATE_F_EXT_LEARNED;
+
 	if (ndm->ndm_flags & NTF_USE) {
 		neigh_event_send(neigh, NULL);
 		err = 0;
-- 
cgit v1.2.3-59-g8ed1b


From 47b3ba5175e940c9be5fd6aec05d6e7e048ff823 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Tue, 24 Apr 2018 18:17:34 -0300
Subject: sctp: fix const parameter violation in sctp_make_sack

sctp_make_sack() make changes to the asoc and this cast is just
bypassing the const attribute. As there is no need to have the const
there, just remove it and fix the violation.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Reviewed-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sm.h    | 2 +-
 net/sctp/sm_make_chunk.c | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 2d0e782c9055..f4b657478a30 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -207,7 +207,7 @@ struct sctp_chunk *sctp_make_datafrag_empty(const struct sctp_association *asoc,
 					    int len, __u8 flags, gfp_t gfp);
 struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc,
 				  const __u32 lowest_tsn);
-struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc);
+struct sctp_chunk *sctp_make_sack(struct sctp_association *asoc);
 struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc,
 				      const struct sctp_chunk *chunk);
 struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc,
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 5a4fb1dc8400..db93eabd6ef5 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -779,10 +779,9 @@ struct sctp_chunk *sctp_make_datafrag_empty(const struct sctp_association *asoc,
  * association.  This reports on which TSN's we've seen to date,
  * including duplicates and gaps.
  */
-struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc)
+struct sctp_chunk *sctp_make_sack(struct sctp_association *asoc)
 {
 	struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map;
-	struct sctp_association *aptr = (struct sctp_association *)asoc;
 	struct sctp_gap_ack_block gabs[SCTP_MAX_GABS];
 	__u16 num_gabs, num_dup_tsns;
 	struct sctp_transport *trans;
@@ -857,7 +856,7 @@ struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc)
 
 	/* Add the duplicate TSN information.  */
 	if (num_dup_tsns) {
-		aptr->stats.idupchunks += num_dup_tsns;
+		asoc->stats.idupchunks += num_dup_tsns;
 		sctp_addto_chunk(retval, sizeof(__u32) * num_dup_tsns,
 				 sctp_tsnmap_get_dups(map));
 	}
@@ -869,11 +868,11 @@ struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc)
 	 * association so no transport will match after a wrap event like this,
 	 * Until the next sack
 	 */
-	if (++aptr->peer.sack_generation == 0) {
+	if (++asoc->peer.sack_generation == 0) {
 		list_for_each_entry(trans, &asoc->peer.transport_addr_list,
 				    transports)
 			trans->sack_generation = 0;
-		aptr->peer.sack_generation = 1;
+		asoc->peer.sack_generation = 1;
 	}
 nodata:
 	return retval;
-- 
cgit v1.2.3-59-g8ed1b


From 51446780fc33e45cb790c05a7fa2c5bf7e8bc53b Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Tue, 24 Apr 2018 18:17:35 -0300
Subject: sctp: fix identification of new acks for SFR-CACC

It's currently written as:

if (!tchunk->tsn_gap_acked) {   [1]
	tchunk->tsn_gap_acked = 1;
	...
}

if (TSN_lte(tsn, sack_ctsn)) {
	if (!tchunk->tsn_gap_acked) {
		/* SFR-CACC processing */
		...
	}
}

Which causes the SFR-CACC processing on ack reception to never process,
as tchunk->tsn_gap_acked is always true by then. Block [1] was
moved to that position by the commit marked below.

This patch fixes it by doing SFR-CACC processing earlier, before
tsn_gap_acked is set to true.

Fixes: 31b02e154940 ("sctp: Failover transmitted list on transport delete")
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Reviewed-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/outqueue.c | 48 +++++++++++++++++++++++-------------------------
 1 file changed, 23 insertions(+), 25 deletions(-)

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index f211b3db6a35..dee7cbd54831 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -1457,7 +1457,7 @@ static void sctp_check_transmitted(struct sctp_outq *q,
 			 * the outstanding bytes for this chunk, so only
 			 * count bytes associated with a transport.
 			 */
-			if (transport) {
+			if (transport && !tchunk->tsn_gap_acked) {
 				/* If this chunk is being used for RTT
 				 * measurement, calculate the RTT and update
 				 * the RTO using this value.
@@ -1469,14 +1469,34 @@ static void sctp_check_transmitted(struct sctp_outq *q,
 				 * first instance of the packet or a later
 				 * instance).
 				 */
-				if (!tchunk->tsn_gap_acked &&
-				    !sctp_chunk_retransmitted(tchunk) &&
+				if (!sctp_chunk_retransmitted(tchunk) &&
 				    tchunk->rtt_in_progress) {
 					tchunk->rtt_in_progress = 0;
 					rtt = jiffies - tchunk->sent_at;
 					sctp_transport_update_rto(transport,
 								  rtt);
 				}
+
+				if (TSN_lte(tsn, sack_ctsn)) {
+					/*
+					 * SFR-CACC algorithm:
+					 * 2) If the SACK contains gap acks
+					 * and the flag CHANGEOVER_ACTIVE is
+					 * set the receiver of the SACK MUST
+					 * take the following action:
+					 *
+					 * B) For each TSN t being acked that
+					 * has not been acked in any SACK so
+					 * far, set cacc_saw_newack to 1 for
+					 * the destination that the TSN was
+					 * sent to.
+					 */
+					if (sack->num_gap_ack_blocks &&
+					    q->asoc->peer.primary_path->cacc.
+					    changeover_active)
+						transport->cacc.cacc_saw_newack
+							= 1;
+				}
 			}
 
 			/* If the chunk hasn't been marked as ACKED,
@@ -1508,28 +1528,6 @@ static void sctp_check_transmitted(struct sctp_outq *q,
 				restart_timer = 1;
 				forward_progress = true;
 
-				if (!tchunk->tsn_gap_acked) {
-					/*
-					 * SFR-CACC algorithm:
-					 * 2) If the SACK contains gap acks
-					 * and the flag CHANGEOVER_ACTIVE is
-					 * set the receiver of the SACK MUST
-					 * take the following action:
-					 *
-					 * B) For each TSN t being acked that
-					 * has not been acked in any SACK so
-					 * far, set cacc_saw_newack to 1 for
-					 * the destination that the TSN was
-					 * sent to.
-					 */
-					if (transport &&
-					    sack->num_gap_ack_blocks &&
-					    q->asoc->peer.primary_path->cacc.
-					    changeover_active)
-						transport->cacc.cacc_saw_newack
-							= 1;
-				}
-
 				list_add_tail(&tchunk->transmitted_list,
 					      &q->sacked);
 			} else {
-- 
cgit v1.2.3-59-g8ed1b


From c77bbc648faf4a6647c4f4446ca82e231475ffb5 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 24 Apr 2018 18:36:07 -0700
Subject: net: rules: Move l3mdev attribute validation to a helper

Move the check on FRA_L3MDEV attribute to helper to improve the
readability of fib_nl2rule. Update the extack messages to be
clear when the configuration option is disabled versus an invalid
value has been passed.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/fib_rules.c | 34 ++++++++++++++++++++++++----------
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 2271c80fd967..126ffc5bc630 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -454,6 +454,27 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops,
 	return NULL;
 }
 
+#ifdef CONFIG_NET_L3_MASTER_DEV
+static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule,
+			      struct netlink_ext_ack *extack)
+{
+	nlrule->l3mdev = nla_get_u8(nla);
+	if (nlrule->l3mdev != 1) {
+		NL_SET_ERR_MSG(extack, "Invalid l3mdev attribute");
+		return -1;
+	}
+
+	return 0;
+}
+#else
+static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule,
+			      struct netlink_ext_ack *extack)
+{
+	NL_SET_ERR_MSG(extack, "l3mdev support is not enabled in kernel");
+	return -1;
+}
+#endif
+
 static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
 		       struct netlink_ext_ack *extack,
 		       struct fib_rules_ops *ops,
@@ -536,16 +557,9 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
 		nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]);
 
 	err = -EINVAL;
-	if (tb[FRA_L3MDEV]) {
-#ifdef CONFIG_NET_L3_MASTER_DEV
-		nlrule->l3mdev = nla_get_u8(tb[FRA_L3MDEV]);
-		if (nlrule->l3mdev != 1)
-#endif
-		{
-			NL_SET_ERR_MSG(extack, "Invalid l3mdev");
-			goto errout_free;
-		}
-	}
+	if (tb[FRA_L3MDEV] &&
+	    fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0)
+		goto errout_free;
 
 	nlrule->action = frh->action;
 	nlrule->flags = frh->flags;
-- 
cgit v1.2.3-59-g8ed1b


From 4dc93fcf0b95dc3fda4db917effae31fbb8ad2a8 Mon Sep 17 00:00:00 2001
From: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Date: Tue, 10 Apr 2018 10:49:51 -0700
Subject: igb: Fix queue selection on MAC filters on i210

On the RAH registers there are semantic differences on the meaning of
the "queue" parameter for traffic steering depending on the controller
model: there is the 82575 meaning, which "queue" means a RX Hardware
Queue, and the i350 meaning, where it is a reception pool.

The previous behaviour was having no effect for i210 based controllers
because the QSEL bit of the RAH register wasn't being set.

This patch separates the condition in discrete cases, so the different
handling is clearer.

Fixes: 83c21335c876 ("igb: improve MAC filter handling")
Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index cce7ada89255..9afee130c2aa 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -8763,12 +8763,17 @@ static void igb_rar_set_index(struct igb_adapter *adapter, u32 index)
 		if (is_valid_ether_addr(addr))
 			rar_high |= E1000_RAH_AV;
 
-		if (hw->mac.type == e1000_82575)
+		switch (hw->mac.type) {
+		case e1000_82575:
+		case e1000_i210:
 			rar_high |= E1000_RAH_POOL_1 *
 				    adapter->mac_table[index].queue;
-		else
+			break;
+		default:
 			rar_high |= E1000_RAH_POOL_1 <<
 				    adapter->mac_table[index].queue;
+			break;
+		}
 	}
 
 	wr32(E1000_RAL(index), rar_low);
-- 
cgit v1.2.3-59-g8ed1b


From 6995ddc49405bd4afe2aa7937f38abd2ee706084 Mon Sep 17 00:00:00 2001
From: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Date: Tue, 10 Apr 2018 10:49:52 -0700
Subject: igb: Enable the hardware traffic class feature bit for igb models

This will allow functionality depending on the hardware being traffic
class aware to work. In particular the tc-flower offloading checks
verifies that this bit is set.

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 9afee130c2aa..0facaa73f626 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -2822,6 +2822,9 @@ static int igb_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (hw->mac.type >= e1000_82576)
 		netdev->features |= NETIF_F_SCTP_CRC;
 
+	if (hw->mac.type >= e1000_i350)
+		netdev->features |= NETIF_F_HW_TC;
+
 #define IGB_GSO_PARTIAL_FEATURES (NETIF_F_GSO_GRE | \
 				  NETIF_F_GSO_GRE_CSUM | \
 				  NETIF_F_GSO_IPXIP4 | \
-- 
cgit v1.2.3-59-g8ed1b


From 6bf9e4d39a582a4beea1844d6955cb2ebc979a67 Mon Sep 17 00:00:00 2001
From: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
Date: Tue, 24 Apr 2018 15:18:46 +0200
Subject: batman-adv: fix batadv_interface_tx()'s return type

The method ndo_start_xmit() is defined as returning an 'netdev_tx_t',
which is a typedef for an enum type, but the implementation in this
driver returns an 'int'.

Fix this by returning 'netdev_tx_t' in this driver too.

Signed-off-by: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
[sven@narfation.org: fixed alignment]
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/soft-interface.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 67065e35de51..1485263a348b 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -188,8 +188,8 @@ static void batadv_interface_set_rx_mode(struct net_device *dev)
 {
 }
 
-static int batadv_interface_tx(struct sk_buff *skb,
-			       struct net_device *soft_iface)
+static netdev_tx_t batadv_interface_tx(struct sk_buff *skb,
+				       struct net_device *soft_iface)
 {
 	struct ethhdr *ethhdr;
 	struct batadv_priv *bat_priv = netdev_priv(soft_iface);
-- 
cgit v1.2.3-59-g8ed1b


From 1d717cf4115e283aaf6c38174ecb7646f96abab5 Mon Sep 17 00:00:00 2001
From: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Date: Tue, 10 Apr 2018 10:49:53 -0700
Subject: igb: Add support for MAC address filters specifying source addresses

Makes it possible to direct packets to queues based on their source
address. Documents the expected usage of the 'flags' parameter.

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/e1000_defines.h |  1 +
 drivers/net/ethernet/intel/igb/igb.h           |  1 +
 drivers/net/ethernet/intel/igb/igb_main.c      | 40 ++++++++++++++++++++++----
 3 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/e1000_defines.h b/drivers/net/ethernet/intel/igb/e1000_defines.h
index 98534f765e0e..5417edbe3125 100644
--- a/drivers/net/ethernet/intel/igb/e1000_defines.h
+++ b/drivers/net/ethernet/intel/igb/e1000_defines.h
@@ -491,6 +491,7 @@
  * manageability enabled, allowing us room for 15 multicast addresses.
  */
 #define E1000_RAH_AV  0x80000000        /* Receive descriptor valid */
+#define E1000_RAH_ASEL_SRC_ADDR 0x00010000
 #define E1000_RAL_MAC_ADDR_LEN 4
 #define E1000_RAH_MAC_ADDR_LEN 2
 #define E1000_RAH_POOL_MASK 0x03FC0000
diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index 8dbc399b345e..1ff58f7a5c7d 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -474,6 +474,7 @@ struct igb_mac_addr {
 
 #define IGB_MAC_STATE_DEFAULT	0x1
 #define IGB_MAC_STATE_IN_USE	0x2
+#define IGB_MAC_STATE_SRC_ADDR	0x4
 
 /* board specific private data structure */
 struct igb_adapter {
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 0facaa73f626..902db2096bfd 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -6859,8 +6859,14 @@ static void igb_set_default_mac_filter(struct igb_adapter *adapter)
 	igb_rar_set_index(adapter, 0);
 }
 
-static int igb_add_mac_filter(struct igb_adapter *adapter, const u8 *addr,
-			      const u8 queue)
+/* Add a MAC filter for 'addr' directing matching traffic to 'queue',
+ * 'flags' is used to indicate what kind of match is made, match is by
+ * default for the destination address, if matching by source address
+ * is desired the flag IGB_MAC_STATE_SRC_ADDR can be used.
+ */
+static int igb_add_mac_filter_flags(struct igb_adapter *adapter,
+				    const u8 *addr, const u8 queue,
+				    const u8 flags)
 {
 	struct e1000_hw *hw = &adapter->hw;
 	int rar_entries = hw->mac.rar_entry_count -
@@ -6880,7 +6886,7 @@ static int igb_add_mac_filter(struct igb_adapter *adapter, const u8 *addr,
 
 		ether_addr_copy(adapter->mac_table[i].addr, addr);
 		adapter->mac_table[i].queue = queue;
-		adapter->mac_table[i].state |= IGB_MAC_STATE_IN_USE;
+		adapter->mac_table[i].state |= IGB_MAC_STATE_IN_USE | flags;
 
 		igb_rar_set_index(adapter, i);
 		return i;
@@ -6889,8 +6895,21 @@ static int igb_add_mac_filter(struct igb_adapter *adapter, const u8 *addr,
 	return -ENOSPC;
 }
 
-static int igb_del_mac_filter(struct igb_adapter *adapter, const u8 *addr,
+static int igb_add_mac_filter(struct igb_adapter *adapter, const u8 *addr,
 			      const u8 queue)
+{
+	return igb_add_mac_filter_flags(adapter, addr, queue, 0);
+}
+
+/* Remove a MAC filter for 'addr' directing matching traffic to
+ * 'queue', 'flags' is used to indicate what kind of match need to be
+ * removed, match is by default for the destination address, if
+ * matching by source address is to be removed the flag
+ * IGB_MAC_STATE_SRC_ADDR can be used.
+ */
+static int igb_del_mac_filter_flags(struct igb_adapter *adapter,
+				    const u8 *addr, const u8 queue,
+				    const u8 flags)
 {
 	struct e1000_hw *hw = &adapter->hw;
 	int rar_entries = hw->mac.rar_entry_count -
@@ -6907,12 +6926,14 @@ static int igb_del_mac_filter(struct igb_adapter *adapter, const u8 *addr,
 	for (i = 0; i < rar_entries; i++) {
 		if (!(adapter->mac_table[i].state & IGB_MAC_STATE_IN_USE))
 			continue;
+		if ((adapter->mac_table[i].state & flags) != flags)
+			continue;
 		if (adapter->mac_table[i].queue != queue)
 			continue;
 		if (!ether_addr_equal(adapter->mac_table[i].addr, addr))
 			continue;
 
-		adapter->mac_table[i].state &= ~IGB_MAC_STATE_IN_USE;
+		adapter->mac_table[i].state = 0;
 		memset(adapter->mac_table[i].addr, 0, ETH_ALEN);
 		adapter->mac_table[i].queue = 0;
 
@@ -6923,6 +6944,12 @@ static int igb_del_mac_filter(struct igb_adapter *adapter, const u8 *addr,
 	return -ENOENT;
 }
 
+static int igb_del_mac_filter(struct igb_adapter *adapter, const u8 *addr,
+			      const u8 queue)
+{
+	return igb_del_mac_filter_flags(adapter, addr, queue, 0);
+}
+
 static int igb_uc_sync(struct net_device *netdev, const unsigned char *addr)
 {
 	struct igb_adapter *adapter = netdev_priv(netdev);
@@ -8766,6 +8793,9 @@ static void igb_rar_set_index(struct igb_adapter *adapter, u32 index)
 		if (is_valid_ether_addr(addr))
 			rar_high |= E1000_RAH_AV;
 
+		if (adapter->mac_table[index].state & IGB_MAC_STATE_SRC_ADDR)
+			rar_high |= E1000_RAH_ASEL_SRC_ADDR;
+
 		switch (hw->mac.type) {
 		case e1000_82575:
 		case e1000_i210:
-- 
cgit v1.2.3-59-g8ed1b


From 0a8238998345549011c2a0da43cd6a35ea7bbebe Mon Sep 17 00:00:00 2001
From: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Date: Tue, 10 Apr 2018 10:49:54 -0700
Subject: igb: Add support for enabling queue steering in filters

On some igb models (82575 and i210) the MAC address filters can
control to which queue the packet will be assigned.

This extends the 'state' with one more state to signify that queue
selection should be enabled for that filter.

As 82575 parts are no longer easily obtained (and this was developed
against i210), only support for the i210 model is enabled.

These functions are exported and will be used in the next patch.

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/e1000_defines.h |  1 +
 drivers/net/ethernet/intel/igb/igb.h           |  6 ++++++
 drivers/net/ethernet/intel/igb/igb_main.c      | 26 ++++++++++++++++++++++++++
 3 files changed, 33 insertions(+)

diff --git a/drivers/net/ethernet/intel/igb/e1000_defines.h b/drivers/net/ethernet/intel/igb/e1000_defines.h
index 5417edbe3125..d3d1d868e7ba 100644
--- a/drivers/net/ethernet/intel/igb/e1000_defines.h
+++ b/drivers/net/ethernet/intel/igb/e1000_defines.h
@@ -492,6 +492,7 @@
  */
 #define E1000_RAH_AV  0x80000000        /* Receive descriptor valid */
 #define E1000_RAH_ASEL_SRC_ADDR 0x00010000
+#define E1000_RAH_QSEL_ENABLE 0x10000000
 #define E1000_RAL_MAC_ADDR_LEN 4
 #define E1000_RAH_MAC_ADDR_LEN 2
 #define E1000_RAH_POOL_MASK 0x03FC0000
diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index 1ff58f7a5c7d..c71845a450f7 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -475,6 +475,7 @@ struct igb_mac_addr {
 #define IGB_MAC_STATE_DEFAULT	0x1
 #define IGB_MAC_STATE_IN_USE	0x2
 #define IGB_MAC_STATE_SRC_ADDR	0x4
+#define IGB_MAC_STATE_QUEUE_STEERING 0x8
 
 /* board specific private data structure */
 struct igb_adapter {
@@ -740,4 +741,9 @@ int igb_add_filter(struct igb_adapter *adapter,
 int igb_erase_filter(struct igb_adapter *adapter,
 		     struct igb_nfc_filter *input);
 
+int igb_add_mac_steering_filter(struct igb_adapter *adapter,
+				const u8 *addr, u8 queue, u8 flags);
+int igb_del_mac_steering_filter(struct igb_adapter *adapter,
+				const u8 *addr, u8 queue, u8 flags);
+
 #endif /* _IGB_H_ */
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 902db2096bfd..21e76d649068 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -6950,6 +6950,28 @@ static int igb_del_mac_filter(struct igb_adapter *adapter, const u8 *addr,
 	return igb_del_mac_filter_flags(adapter, addr, queue, 0);
 }
 
+int igb_add_mac_steering_filter(struct igb_adapter *adapter,
+				const u8 *addr, u8 queue, u8 flags)
+{
+	struct e1000_hw *hw = &adapter->hw;
+
+	/* In theory, this should be supported on 82575 as well, but
+	 * that part wasn't easily accessible during development.
+	 */
+	if (hw->mac.type != e1000_i210)
+		return -EOPNOTSUPP;
+
+	return igb_add_mac_filter_flags(adapter, addr, queue,
+					IGB_MAC_STATE_QUEUE_STEERING | flags);
+}
+
+int igb_del_mac_steering_filter(struct igb_adapter *adapter,
+				const u8 *addr, u8 queue, u8 flags)
+{
+	return igb_del_mac_filter_flags(adapter, addr, queue,
+					IGB_MAC_STATE_QUEUE_STEERING | flags);
+}
+
 static int igb_uc_sync(struct net_device *netdev, const unsigned char *addr)
 {
 	struct igb_adapter *adapter = netdev_priv(netdev);
@@ -8799,6 +8821,10 @@ static void igb_rar_set_index(struct igb_adapter *adapter, u32 index)
 		switch (hw->mac.type) {
 		case e1000_82575:
 		case e1000_i210:
+			if (adapter->mac_table[index].state &
+			    IGB_MAC_STATE_QUEUE_STEERING)
+				rar_high |= E1000_RAH_QSEL_ENABLE;
+
 			rar_high |= E1000_RAH_POOL_1 *
 				    adapter->mac_table[index].queue;
 			break;
-- 
cgit v1.2.3-59-g8ed1b


From 872f923c5b38226a0c6825487742098b1968d463 Mon Sep 17 00:00:00 2001
From: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Date: Tue, 10 Apr 2018 10:49:55 -0700
Subject: igb: Allow filters to be added for the local MAC address

Users expect that when adding a steering filter for the local MAC
address, that all the traffic directed to that address will go to some
queue.

Currently, it's not possible to configure entries in the "in use"
state, which is the normal state of the local MAC address entry (it is
the default), this patch allows to override the steering configuration
of "in use" entries, if the filter to be added match the address and
address type (source or destination) of an existing entry.

There is a bit of a special handling for entries referring to the
local MAC address, when they are removed, only the steering
configuration is reset.

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 40 +++++++++++++++++++++++++++----
 1 file changed, 36 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 21e76d649068..9a5c2dc3cf5a 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -6859,6 +6859,27 @@ static void igb_set_default_mac_filter(struct igb_adapter *adapter)
 	igb_rar_set_index(adapter, 0);
 }
 
+/* If the filter to be added and an already existing filter express
+ * the same address and address type, it should be possible to only
+ * override the other configurations, for example the queue to steer
+ * traffic.
+ */
+static bool igb_mac_entry_can_be_used(const struct igb_mac_addr *entry,
+				      const u8 *addr, const u8 flags)
+{
+	if (!(entry->state & IGB_MAC_STATE_IN_USE))
+		return true;
+
+	if ((entry->state & IGB_MAC_STATE_SRC_ADDR) !=
+	    (flags & IGB_MAC_STATE_SRC_ADDR))
+		return false;
+
+	if (!ether_addr_equal(addr, entry->addr))
+		return false;
+
+	return true;
+}
+
 /* Add a MAC filter for 'addr' directing matching traffic to 'queue',
  * 'flags' is used to indicate what kind of match is made, match is by
  * default for the destination address, if matching by source address
@@ -6881,7 +6902,8 @@ static int igb_add_mac_filter_flags(struct igb_adapter *adapter,
 	 * addresses.
 	 */
 	for (i = 0; i < rar_entries; i++) {
-		if (adapter->mac_table[i].state & IGB_MAC_STATE_IN_USE)
+		if (!igb_mac_entry_can_be_used(&adapter->mac_table[i],
+					       addr, flags))
 			continue;
 
 		ether_addr_copy(adapter->mac_table[i].addr, addr);
@@ -6933,9 +6955,19 @@ static int igb_del_mac_filter_flags(struct igb_adapter *adapter,
 		if (!ether_addr_equal(adapter->mac_table[i].addr, addr))
 			continue;
 
-		adapter->mac_table[i].state = 0;
-		memset(adapter->mac_table[i].addr, 0, ETH_ALEN);
-		adapter->mac_table[i].queue = 0;
+		/* When a filter for the default address is "deleted",
+		 * we return it to its initial configuration
+		 */
+		if (adapter->mac_table[i].state & IGB_MAC_STATE_DEFAULT) {
+			adapter->mac_table[i].state =
+				IGB_MAC_STATE_DEFAULT | IGB_MAC_STATE_IN_USE;
+			adapter->mac_table[i].queue =
+				adapter->vfs_allocated_count;
+		} else {
+			adapter->mac_table[i].state = 0;
+			adapter->mac_table[i].queue = 0;
+			memset(adapter->mac_table[i].addr, 0, ETH_ALEN);
+		}
 
 		igb_rar_set_index(adapter, i);
 		return 0;
-- 
cgit v1.2.3-59-g8ed1b


From bae51fefe2ac6ec2fe27f9d041aa3be87cae85af Mon Sep 17 00:00:00 2001
From: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Date: Tue, 10 Apr 2018 10:49:56 -0700
Subject: igb: Enable nfc filters to specify MAC addresses

This allows igb_add_filter()/igb_erase_filter() to work on filters
that include MAC addresses (both source and destination).

For now, this only exposes the functionality, the next commit glues
ethtool into this. Later in this series, these APIs are used to allow
offloading of cls_flower filters.

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb.h         |  4 ++++
 drivers/net/ethernet/intel/igb/igb_ethtool.c | 28 ++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index c71845a450f7..0eac1df499f8 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -442,6 +442,8 @@ struct hwmon_buff {
 enum igb_filter_match_flags {
 	IGB_FILTER_FLAG_ETHER_TYPE = 0x1,
 	IGB_FILTER_FLAG_VLAN_TCI   = 0x2,
+	IGB_FILTER_FLAG_SRC_MAC_ADDR   = 0x4,
+	IGB_FILTER_FLAG_DST_MAC_ADDR   = 0x8,
 };
 
 #define IGB_MAX_RXNFC_FILTERS 16
@@ -456,6 +458,8 @@ struct igb_nfc_input {
 	u8 match_flags;
 	__be16 etype;
 	__be16 vlan_tci;
+	u8 src_addr[ETH_ALEN];
+	u8 dst_addr[ETH_ALEN];
 };
 
 struct igb_nfc_filter {
diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c
index 5975d432836f..31b2960a7869 100644
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
@@ -2776,6 +2776,25 @@ int igb_add_filter(struct igb_adapter *adapter, struct igb_nfc_filter *input)
 			return err;
 	}
 
+	if (input->filter.match_flags & IGB_FILTER_FLAG_DST_MAC_ADDR) {
+		err = igb_add_mac_steering_filter(adapter,
+						  input->filter.dst_addr,
+						  input->action, 0);
+		err = min_t(int, err, 0);
+		if (err)
+			return err;
+	}
+
+	if (input->filter.match_flags & IGB_FILTER_FLAG_SRC_MAC_ADDR) {
+		err = igb_add_mac_steering_filter(adapter,
+						  input->filter.src_addr,
+						  input->action,
+						  IGB_MAC_STATE_SRC_ADDR);
+		err = min_t(int, err, 0);
+		if (err)
+			return err;
+	}
+
 	if (input->filter.match_flags & IGB_FILTER_FLAG_VLAN_TCI)
 		err = igb_rxnfc_write_vlan_prio_filter(adapter, input);
 
@@ -2824,6 +2843,15 @@ int igb_erase_filter(struct igb_adapter *adapter, struct igb_nfc_filter *input)
 		igb_clear_vlan_prio_filter(adapter,
 					   ntohs(input->filter.vlan_tci));
 
+	if (input->filter.match_flags & IGB_FILTER_FLAG_SRC_MAC_ADDR)
+		igb_del_mac_steering_filter(adapter, input->filter.src_addr,
+					    input->action,
+					    IGB_MAC_STATE_SRC_ADDR);
+
+	if (input->filter.match_flags & IGB_FILTER_FLAG_DST_MAC_ADDR)
+		igb_del_mac_steering_filter(adapter, input->filter.dst_addr,
+					    input->action, 0);
+
 	return 0;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From b4a38d4276e1c3ca46141d6424a8da86bc5b74b0 Mon Sep 17 00:00:00 2001
From: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Date: Tue, 10 Apr 2018 10:49:57 -0700
Subject: igb: Add MAC address support for ethtool nftuple filters

This adds the capability of configuring the queue steering of arriving
packets based on their source and destination MAC addresses.

Source address steering (i.e. driving traffic to a specific queue),
for the i210, does not work, but filtering does (i.e. accepting
traffic based on the source address). So, trying to add a filter
specifying only a source address will be an error.

In practical terms this adds support for the following use cases,
characterized by these examples:

$ ethtool -N eth0 flow-type ether dst aa:aa:aa:aa:aa:aa action 0
(this will direct packets with destination address "aa:aa:aa:aa:aa:aa"
to the RX queue 0)

$ ethtool -N eth0 flow-type ether src 44:44:44:44:44:44 \
  	     	  	    	  proto 0x22f0 action 3
(this will direct packets with source address "44:44:44:44:44:44" and
ethertype 0x22f0 to the RX queue 3)

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_ethtool.c | 43 +++++++++++++++++++++++++---
 1 file changed, 39 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c
index 31b2960a7869..6697c273ab59 100644
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
@@ -2495,6 +2495,23 @@ static int igb_get_ethtool_nfc_entry(struct igb_adapter *adapter,
 			fsp->h_ext.vlan_tci = rule->filter.vlan_tci;
 			fsp->m_ext.vlan_tci = htons(VLAN_PRIO_MASK);
 		}
+		if (rule->filter.match_flags & IGB_FILTER_FLAG_DST_MAC_ADDR) {
+			ether_addr_copy(fsp->h_u.ether_spec.h_dest,
+					rule->filter.dst_addr);
+			/* As we only support matching by the full
+			 * mask, return the mask to userspace
+			 */
+			eth_broadcast_addr(fsp->m_u.ether_spec.h_dest);
+		}
+		if (rule->filter.match_flags & IGB_FILTER_FLAG_SRC_MAC_ADDR) {
+			ether_addr_copy(fsp->h_u.ether_spec.h_source,
+					rule->filter.src_addr);
+			/* As we only support matching by the full
+			 * mask, return the mask to userspace
+			 */
+			eth_broadcast_addr(fsp->m_u.ether_spec.h_source);
+		}
+
 		return 0;
 	}
 	return -EINVAL;
@@ -2768,8 +2785,16 @@ static int igb_rxnfc_write_vlan_prio_filter(struct igb_adapter *adapter,
 
 int igb_add_filter(struct igb_adapter *adapter, struct igb_nfc_filter *input)
 {
+	struct e1000_hw *hw = &adapter->hw;
 	int err = -EINVAL;
 
+	if (hw->mac.type == e1000_i210 &&
+	    !(input->filter.match_flags & ~IGB_FILTER_FLAG_SRC_MAC_ADDR)) {
+		dev_err(&adapter->pdev->dev,
+			"i210 doesn't support flow classification rules specifying only source addresses.\n");
+		return -EOPNOTSUPP;
+	}
+
 	if (input->filter.match_flags & IGB_FILTER_FLAG_ETHER_TYPE) {
 		err = igb_rxnfc_write_etype_filter(adapter, input);
 		if (err)
@@ -2933,10 +2958,6 @@ static int igb_add_ethtool_nfc_entry(struct igb_adapter *adapter,
 	if ((fsp->flow_type & ~FLOW_EXT) != ETHER_FLOW)
 		return -EINVAL;
 
-	if (fsp->m_u.ether_spec.h_proto != ETHER_TYPE_FULL_MASK &&
-	    fsp->m_ext.vlan_tci != htons(VLAN_PRIO_MASK))
-		return -EINVAL;
-
 	input = kzalloc(sizeof(*input), GFP_KERNEL);
 	if (!input)
 		return -ENOMEM;
@@ -2946,6 +2967,20 @@ static int igb_add_ethtool_nfc_entry(struct igb_adapter *adapter,
 		input->filter.match_flags = IGB_FILTER_FLAG_ETHER_TYPE;
 	}
 
+	/* Only support matching addresses by the full mask */
+	if (is_broadcast_ether_addr(fsp->m_u.ether_spec.h_source)) {
+		input->filter.match_flags |= IGB_FILTER_FLAG_SRC_MAC_ADDR;
+		ether_addr_copy(input->filter.src_addr,
+				fsp->h_u.ether_spec.h_source);
+	}
+
+	/* Only support matching addresses by the full mask */
+	if (is_broadcast_ether_addr(fsp->m_u.ether_spec.h_dest)) {
+		input->filter.match_flags |= IGB_FILTER_FLAG_DST_MAC_ADDR;
+		ether_addr_copy(input->filter.dst_addr,
+				fsp->h_u.ether_spec.h_dest);
+	}
+
 	if ((fsp->flow_type & FLOW_EXT) && fsp->m_ext.vlan_tci) {
 		if (fsp->m_ext.vlan_tci != htons(VLAN_PRIO_MASK)) {
 			err = -EINVAL;
-- 
cgit v1.2.3-59-g8ed1b


From f8f3d34e5470c48a10c27c4f6f65e149e10d04ec Mon Sep 17 00:00:00 2001
From: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Date: Tue, 10 Apr 2018 10:49:58 -0700
Subject: igb: Add the skeletons for tc-flower offloading

This adds basic functions needed to implement offloading for filters
created by tc-flower.

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 66 +++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 9a5c2dc3cf5a..26b0f0efc97e 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -36,6 +36,7 @@
 #include <net/checksum.h>
 #include <net/ip6_checksum.h>
 #include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
 #include <linux/net_tstamp.h>
 #include <linux/mii.h>
 #include <linux/ethtool.h>
@@ -2513,6 +2514,69 @@ static int igb_offload_cbs(struct igb_adapter *adapter,
 	return 0;
 }
 
+static int igb_configure_clsflower(struct igb_adapter *adapter,
+				   struct tc_cls_flower_offload *cls_flower)
+{
+	return -EOPNOTSUPP;
+}
+
+static int igb_delete_clsflower(struct igb_adapter *adapter,
+				struct tc_cls_flower_offload *cls_flower)
+{
+	return -EOPNOTSUPP;
+}
+
+static int igb_setup_tc_cls_flower(struct igb_adapter *adapter,
+				   struct tc_cls_flower_offload *cls_flower)
+{
+	switch (cls_flower->command) {
+	case TC_CLSFLOWER_REPLACE:
+		return igb_configure_clsflower(adapter, cls_flower);
+	case TC_CLSFLOWER_DESTROY:
+		return igb_delete_clsflower(adapter, cls_flower);
+	case TC_CLSFLOWER_STATS:
+		return -EOPNOTSUPP;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int igb_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
+				 void *cb_priv)
+{
+	struct igb_adapter *adapter = cb_priv;
+
+	if (!tc_cls_can_offload_and_chain0(adapter->netdev, type_data))
+		return -EOPNOTSUPP;
+
+	switch (type) {
+	case TC_SETUP_CLSFLOWER:
+		return igb_setup_tc_cls_flower(adapter, type_data);
+
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int igb_setup_tc_block(struct igb_adapter *adapter,
+			      struct tc_block_offload *f)
+{
+	if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
+		return -EOPNOTSUPP;
+
+	switch (f->command) {
+	case TC_BLOCK_BIND:
+		return tcf_block_cb_register(f->block, igb_setup_tc_block_cb,
+					     adapter, adapter);
+	case TC_BLOCK_UNBIND:
+		tcf_block_cb_unregister(f->block, igb_setup_tc_block_cb,
+					adapter);
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
 static int igb_setup_tc(struct net_device *dev, enum tc_setup_type type,
 			void *type_data)
 {
@@ -2521,6 +2585,8 @@ static int igb_setup_tc(struct net_device *dev, enum tc_setup_type type,
 	switch (type) {
 	case TC_SETUP_QDISC_CBS:
 		return igb_offload_cbs(adapter, type_data);
+	case TC_SETUP_BLOCK:
+		return igb_setup_tc_block(adapter, type_data);
 
 	default:
 		return -EOPNOTSUPP;
-- 
cgit v1.2.3-59-g8ed1b


From 3e3e9fd8b6f0dd2d387d0dc666b770fe0dc36b33 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 24 Apr 2018 21:17:01 -0700
Subject: nfp: reset local locks on init

NFP locks record the owner when held, for PCIe devices the owner
ID will be the PCIe link number.  When driver loads it should scan
known locks and if they indicate that they are held by local
endpoint but the driver doesn't hold them - release them.

Locks can be left taken for instance when kernel gets kexec-ed or
after a crash.  Management FW tries to clean up stale locks too,
but it currently depends on PCIe link going down which doesn't
always happen.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_main.c      |  5 ++
 drivers/net/ethernet/netronome/nfp/nfpcore/nfp.h   |  2 +
 .../net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h   |  2 +
 .../net/ethernet/netronome/nfp/nfpcore/nfp_mutex.c | 45 +++++++++++++++++
 .../ethernet/netronome/nfp/nfpcore/nfp_resource.c  | 59 ++++++++++++++++++++++
 5 files changed, 113 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.c b/drivers/net/ethernet/netronome/nfp/nfp_main.c
index c4b1f344b4da..0ade122805ad 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_main.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_main.c
@@ -486,6 +486,10 @@ static int nfp_pci_probe(struct pci_dev *pdev,
 		goto err_disable_msix;
 	}
 
+	err = nfp_resource_table_init(pf->cpp);
+	if (err)
+		goto err_cpp_free;
+
 	pf->hwinfo = nfp_hwinfo_read(pf->cpp);
 
 	dev_info(&pdev->dev, "Assembly: %s%s%s-%s CPLD: %s\n",
@@ -548,6 +552,7 @@ err_fw_unload:
 	vfree(pf->dumpspec);
 err_hwinfo_free:
 	kfree(pf->hwinfo);
+err_cpp_free:
 	nfp_cpp_free(pf->cpp);
 err_disable_msix:
 	destroy_workqueue(pf->wq);
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp.h b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp.h
index ced62d112aa2..f44d0a857314 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp.h
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp.h
@@ -94,6 +94,8 @@ int nfp_nsp_read_sensors(struct nfp_nsp *state, unsigned int sensor_mask,
 /* MAC Statistics Accumulator */
 #define NFP_RESOURCE_MAC_STATISTICS	"mac.stat"
 
+int nfp_resource_table_init(struct nfp_cpp *cpp);
+
 struct nfp_resource *
 nfp_resource_acquire(struct nfp_cpp *cpp, const char *name);
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h
index c8f2c064cce3..4e19add1c539 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h
@@ -295,6 +295,8 @@ void nfp_cpp_mutex_free(struct nfp_cpp_mutex *mutex);
 int nfp_cpp_mutex_lock(struct nfp_cpp_mutex *mutex);
 int nfp_cpp_mutex_unlock(struct nfp_cpp_mutex *mutex);
 int nfp_cpp_mutex_trylock(struct nfp_cpp_mutex *mutex);
+int nfp_cpp_mutex_reclaim(struct nfp_cpp *cpp, int target,
+			  unsigned long long address);
 
 /**
  * nfp_cppcore_pcie_unit() - Get PCI Unit of a CPP handle
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_mutex.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_mutex.c
index cb28ac03e4ca..c88bf673cb76 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_mutex.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_mutex.c
@@ -59,6 +59,11 @@ static u32 nfp_mutex_unlocked(u16 interface)
 	return (u32)interface << 16 | 0x0000;
 }
 
+static u32 nfp_mutex_owner(u32 val)
+{
+	return val >> 16;
+}
+
 static bool nfp_mutex_is_locked(u32 val)
 {
 	return (val & 0xffff) == 0x000f;
@@ -351,3 +356,43 @@ int nfp_cpp_mutex_trylock(struct nfp_cpp_mutex *mutex)
 
 	return nfp_mutex_is_locked(tmp) ? -EBUSY : -EINVAL;
 }
+
+/**
+ * nfp_cpp_mutex_reclaim() - Unlock mutex if held by local endpoint
+ * @cpp:	NFP CPP handle
+ * @target:	NFP CPP target ID (ie NFP_CPP_TARGET_CLS or NFP_CPP_TARGET_MU)
+ * @address:	Offset into the address space of the NFP CPP target ID
+ *
+ * Release lock if held by local system.  Extreme care is advised, call only
+ * when no local lock users can exist.
+ *
+ * Return:      0 if the lock was OK, 1 if locked by us, -errno on invalid mutex
+ */
+int nfp_cpp_mutex_reclaim(struct nfp_cpp *cpp, int target,
+			  unsigned long long address)
+{
+	const u32 mur = NFP_CPP_ID(target, 3, 0);	/* atomic_read */
+	const u32 muw = NFP_CPP_ID(target, 4, 0);	/* atomic_write */
+	u16 interface = nfp_cpp_interface(cpp);
+	int err;
+	u32 tmp;
+
+	err = nfp_cpp_mutex_validate(interface, &target, address);
+	if (err)
+		return err;
+
+	/* Check lock */
+	err = nfp_cpp_readl(cpp, mur, address, &tmp);
+	if (err < 0)
+		return err;
+
+	if (nfp_mutex_is_unlocked(tmp) || nfp_mutex_owner(tmp) != interface)
+		return 0;
+
+	/* Bust the lock */
+	err = nfp_cpp_writel(cpp, muw, address, nfp_mutex_unlocked(interface));
+	if (err < 0)
+		return err;
+
+	return 1;
+}
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.c
index 7e14725055c7..2dd89dba9311 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.c
@@ -338,3 +338,62 @@ u64 nfp_resource_size(struct nfp_resource *res)
 {
 	return res->size;
 }
+
+/**
+ * nfp_resource_table_init() - Run initial checks on the resource table
+ * @cpp:	NFP CPP handle
+ *
+ * Start-of-day init procedure for resource table.  Must be called before
+ * any local resource table users may exist.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int nfp_resource_table_init(struct nfp_cpp *cpp)
+{
+	struct nfp_cpp_mutex *dev_mutex;
+	int i, err;
+
+	err = nfp_cpp_mutex_reclaim(cpp, NFP_RESOURCE_TBL_TARGET,
+				    NFP_RESOURCE_TBL_BASE);
+	if (err < 0) {
+		nfp_err(cpp, "Error: failed to reclaim resource table mutex\n");
+		return err;
+	}
+	if (err)
+		nfp_warn(cpp, "Warning: busted main resource table mutex\n");
+
+	dev_mutex = nfp_cpp_mutex_alloc(cpp, NFP_RESOURCE_TBL_TARGET,
+					NFP_RESOURCE_TBL_BASE,
+					NFP_RESOURCE_TBL_KEY);
+	if (!dev_mutex)
+		return -ENOMEM;
+
+	if (nfp_cpp_mutex_lock(dev_mutex)) {
+		nfp_err(cpp, "Error: failed to claim resource table mutex\n");
+		nfp_cpp_mutex_free(dev_mutex);
+		return -EINVAL;
+	}
+
+	/* Resource 0 is the dev_mutex, start from 1 */
+	for (i = 1; i < NFP_RESOURCE_TBL_ENTRIES; i++) {
+		u64 addr = NFP_RESOURCE_TBL_BASE +
+			sizeof(struct nfp_resource_entry) * i;
+
+		err = nfp_cpp_mutex_reclaim(cpp, NFP_RESOURCE_TBL_TARGET, addr);
+		if (err < 0) {
+			nfp_err(cpp,
+				"Error: failed to reclaim resource %d mutex\n",
+				i);
+			goto err_unlock;
+		}
+		if (err)
+			nfp_warn(cpp, "Warning: busted resource %d mutex\n", i);
+	}
+
+	err = 0;
+err_unlock:
+	nfp_cpp_mutex_unlock(dev_mutex);
+	nfp_cpp_mutex_free(dev_mutex);
+
+	return err;
+}
-- 
cgit v1.2.3-59-g8ed1b


From dd92a7d1afef6229812a8980e89a22cfc9ddca94 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 24 Apr 2018 21:17:02 -0700
Subject: nfp: print PCIe link bandwidth on probe

To aid debugging of performance issues caused by limited PCIe
bandwidth print the PCIe link information on probe.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c
index cd678323bacb..a0e336bd1d85 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c
@@ -1330,6 +1330,7 @@ struct nfp_cpp *nfp_cpp_from_nfp6000_pcie(struct pci_dev *pdev)
 	/*  Finished with card initialization. */
 	dev_info(&pdev->dev,
 		 "Netronome Flow Processor NFP4000/NFP6000 PCIe Card Probe\n");
+	pcie_print_link_status(pdev);
 
 	nfp = kzalloc(sizeof(*nfp), GFP_KERNEL);
 	if (!nfp) {
-- 
cgit v1.2.3-59-g8ed1b


From 54a4a03439305f701834cef1e4bd108548388d83 Mon Sep 17 00:00:00 2001
From: John Hurley <john.hurley@netronome.com>
Date: Tue, 24 Apr 2018 21:17:03 -0700
Subject: nfp: flower: support offloading multiple rules with same cookie

When multiple netdevs are attached to a tc offload block and register for
callbacks, a rule added to the block will be propogated to all netdevs.
Previously these were detected as duplicates (based on cookie) and
rejected. Modify the rule nfp lookup function to optionally include an
ingress netdev and a host context along with the cookie value when
searching for a rule. When a new rule is passed to the driver, the netdev
the rule is to be attached to is considered when searching for dublicates.
When a stats update is received from HW, the host context is used
alongside the cookie to map to the correct host rule.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/flower/main.h   |  8 +++++--
 .../net/ethernet/netronome/nfp/flower/metadata.c   | 20 +++++++++-------
 .../net/ethernet/netronome/nfp/flower/offload.c    | 27 ++++++++++++++++------
 3 files changed, 38 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h
index c67e1b54c614..9e6804bc9b40 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.h
@@ -47,6 +47,7 @@
 struct net_device;
 struct nfp_app;
 
+#define NFP_FL_STATS_CTX_DONT_CARE	cpu_to_be32(0xffffffff)
 #define NFP_FL_STATS_ENTRY_RS		BIT(20)
 #define NFP_FL_STATS_ELEM_RS		4
 #define NFP_FL_REPEATED_HASH_MAX	BIT(17)
@@ -189,6 +190,7 @@ struct nfp_fl_payload {
 	spinlock_t lock; /* lock stats */
 	struct nfp_fl_stats stats;
 	__be32 nfp_tun_ipv4_addr;
+	struct net_device *ingress_dev;
 	char *unmasked_data;
 	char *mask_data;
 	char *action_data;
@@ -216,12 +218,14 @@ int nfp_flower_compile_action(struct tc_cls_flower_offload *flow,
 			      struct nfp_fl_payload *nfp_flow);
 int nfp_compile_flow_metadata(struct nfp_app *app,
 			      struct tc_cls_flower_offload *flow,
-			      struct nfp_fl_payload *nfp_flow);
+			      struct nfp_fl_payload *nfp_flow,
+			      struct net_device *netdev);
 int nfp_modify_flow_metadata(struct nfp_app *app,
 			     struct nfp_fl_payload *nfp_flow);
 
 struct nfp_fl_payload *
-nfp_flower_search_fl_table(struct nfp_app *app, unsigned long tc_flower_cookie);
+nfp_flower_search_fl_table(struct nfp_app *app, unsigned long tc_flower_cookie,
+			   struct net_device *netdev, __be32 host_ctx);
 struct nfp_fl_payload *
 nfp_flower_remove_fl_table(struct nfp_app *app, unsigned long tc_flower_cookie);
 
diff --git a/drivers/net/ethernet/netronome/nfp/flower/metadata.c b/drivers/net/ethernet/netronome/nfp/flower/metadata.c
index db977cf8e933..21668aa435e8 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/metadata.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/metadata.c
@@ -99,14 +99,18 @@ static int nfp_get_stats_entry(struct nfp_app *app, u32 *stats_context_id)
 
 /* Must be called with either RTNL or rcu_read_lock */
 struct nfp_fl_payload *
-nfp_flower_search_fl_table(struct nfp_app *app, unsigned long tc_flower_cookie)
+nfp_flower_search_fl_table(struct nfp_app *app, unsigned long tc_flower_cookie,
+			   struct net_device *netdev, __be32 host_ctx)
 {
 	struct nfp_flower_priv *priv = app->priv;
 	struct nfp_fl_payload *flower_entry;
 
 	hash_for_each_possible_rcu(priv->flow_table, flower_entry, link,
 				   tc_flower_cookie)
-		if (flower_entry->tc_flower_cookie == tc_flower_cookie)
+		if (flower_entry->tc_flower_cookie == tc_flower_cookie &&
+		    (!netdev || flower_entry->ingress_dev == netdev) &&
+		    (host_ctx == NFP_FL_STATS_CTX_DONT_CARE ||
+		     flower_entry->meta.host_ctx_id == host_ctx))
 			return flower_entry;
 
 	return NULL;
@@ -121,13 +125,11 @@ nfp_flower_update_stats(struct nfp_app *app, struct nfp_fl_stats_frame *stats)
 	flower_cookie = be64_to_cpu(stats->stats_cookie);
 
 	rcu_read_lock();
-	nfp_flow = nfp_flower_search_fl_table(app, flower_cookie);
+	nfp_flow = nfp_flower_search_fl_table(app, flower_cookie, NULL,
+					      stats->stats_con_id);
 	if (!nfp_flow)
 		goto exit_rcu_unlock;
 
-	if (nfp_flow->meta.host_ctx_id != stats->stats_con_id)
-		goto exit_rcu_unlock;
-
 	spin_lock(&nfp_flow->lock);
 	nfp_flow->stats.pkts += be32_to_cpu(stats->pkt_count);
 	nfp_flow->stats.bytes += be64_to_cpu(stats->byte_count);
@@ -317,7 +319,8 @@ nfp_check_mask_remove(struct nfp_app *app, char *mask_data, u32 mask_len,
 
 int nfp_compile_flow_metadata(struct nfp_app *app,
 			      struct tc_cls_flower_offload *flow,
-			      struct nfp_fl_payload *nfp_flow)
+			      struct nfp_fl_payload *nfp_flow,
+			      struct net_device *netdev)
 {
 	struct nfp_flower_priv *priv = app->priv;
 	struct nfp_fl_payload *check_entry;
@@ -348,7 +351,8 @@ int nfp_compile_flow_metadata(struct nfp_app *app,
 	nfp_flow->stats.bytes = 0;
 	nfp_flow->stats.used = jiffies;
 
-	check_entry = nfp_flower_search_fl_table(app, flow->cookie);
+	check_entry = nfp_flower_search_fl_table(app, flow->cookie, netdev,
+						 NFP_FL_STATS_CTX_DONT_CARE);
 	if (check_entry) {
 		if (nfp_release_stats_entry(app, stats_cxt))
 			return -EINVAL;
diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c
index 114d2ab02a38..bdc82e11a31e 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c
@@ -419,6 +419,8 @@ nfp_flower_add_offload(struct nfp_app *app, struct net_device *netdev,
 		goto err_free_key_ls;
 	}
 
+	flow_pay->ingress_dev = egress ? NULL : netdev;
+
 	err = nfp_flower_compile_flow_match(flow, key_layer, netdev, flow_pay,
 					    tun_type);
 	if (err)
@@ -428,7 +430,8 @@ nfp_flower_add_offload(struct nfp_app *app, struct net_device *netdev,
 	if (err)
 		goto err_destroy_flow;
 
-	err = nfp_compile_flow_metadata(app, flow, flow_pay);
+	err = nfp_compile_flow_metadata(app, flow, flow_pay,
+					flow_pay->ingress_dev);
 	if (err)
 		goto err_destroy_flow;
 
@@ -462,6 +465,7 @@ err_free_key_ls:
  * @app:	Pointer to the APP handle
  * @netdev:	netdev structure.
  * @flow:	TC flower classifier offload structure
+ * @egress:	Netdev is the egress dev.
  *
  * Removes a flow from the repeated hash structure and clears the
  * action payload.
@@ -470,13 +474,16 @@ err_free_key_ls:
  */
 static int
 nfp_flower_del_offload(struct nfp_app *app, struct net_device *netdev,
-		       struct tc_cls_flower_offload *flow)
+		       struct tc_cls_flower_offload *flow, bool egress)
 {
 	struct nfp_port *port = nfp_port_from_netdev(netdev);
 	struct nfp_fl_payload *nfp_flow;
+	struct net_device *ingr_dev;
 	int err;
 
-	nfp_flow = nfp_flower_search_fl_table(app, flow->cookie);
+	ingr_dev = egress ? NULL : netdev;
+	nfp_flow = nfp_flower_search_fl_table(app, flow->cookie, ingr_dev,
+					      NFP_FL_STATS_CTX_DONT_CARE);
 	if (!nfp_flow)
 		return -ENOENT;
 
@@ -505,7 +512,9 @@ err_free_flow:
 /**
  * nfp_flower_get_stats() - Populates flow stats obtained from hardware.
  * @app:	Pointer to the APP handle
+ * @netdev:	Netdev structure.
  * @flow:	TC flower classifier offload structure
+ * @egress:	Netdev is the egress dev.
  *
  * Populates a flow statistics structure which which corresponds to a
  * specific flow.
@@ -513,11 +522,15 @@ err_free_flow:
  * Return: negative value on error, 0 if stats populated successfully.
  */
 static int
-nfp_flower_get_stats(struct nfp_app *app, struct tc_cls_flower_offload *flow)
+nfp_flower_get_stats(struct nfp_app *app, struct net_device *netdev,
+		     struct tc_cls_flower_offload *flow, bool egress)
 {
 	struct nfp_fl_payload *nfp_flow;
+	struct net_device *ingr_dev;
 
-	nfp_flow = nfp_flower_search_fl_table(app, flow->cookie);
+	ingr_dev = egress ? NULL : netdev;
+	nfp_flow = nfp_flower_search_fl_table(app, flow->cookie, ingr_dev,
+					      NFP_FL_STATS_CTX_DONT_CARE);
 	if (!nfp_flow)
 		return -EINVAL;
 
@@ -543,9 +556,9 @@ nfp_flower_repr_offload(struct nfp_app *app, struct net_device *netdev,
 	case TC_CLSFLOWER_REPLACE:
 		return nfp_flower_add_offload(app, netdev, flower, egress);
 	case TC_CLSFLOWER_DESTROY:
-		return nfp_flower_del_offload(app, netdev, flower);
+		return nfp_flower_del_offload(app, netdev, flower, egress);
 	case TC_CLSFLOWER_STATS:
-		return nfp_flower_get_stats(app, flower);
+		return nfp_flower_get_stats(app, netdev, flower, egress);
 	}
 
 	return -EOPNOTSUPP;
-- 
cgit v1.2.3-59-g8ed1b


From c50647d3e8fd4149ecd78e9a234336e727f48107 Mon Sep 17 00:00:00 2001
From: John Hurley <john.hurley@netronome.com>
Date: Tue, 24 Apr 2018 21:17:04 -0700
Subject: nfp: flower: ignore duplicate cb requests for same rule

If a flower rule has a repr both as ingress and egress port then 2
callbacks may be generated for the same rule request.

Add an indicator to each flow as to whether or not it was added from an
ingress registered cb. If so then ignore add/del/stat requests to it from
an egress cb.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/flower/main.h   |  1 +
 .../net/ethernet/netronome/nfp/flower/offload.c    | 23 +++++++++++++++++++---
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h
index 9e6804bc9b40..733ff53cc601 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.h
@@ -194,6 +194,7 @@ struct nfp_fl_payload {
 	char *unmasked_data;
 	char *mask_data;
 	char *action_data;
+	bool ingress_offload;
 };
 
 struct nfp_fl_stats_frame {
diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c
index bdc82e11a31e..70ec9d821b91 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c
@@ -345,7 +345,7 @@ nfp_flower_calculate_key_layers(struct nfp_app *app,
 }
 
 static struct nfp_fl_payload *
-nfp_flower_allocate_new(struct nfp_fl_key_ls *key_layer)
+nfp_flower_allocate_new(struct nfp_fl_key_ls *key_layer, bool egress)
 {
 	struct nfp_fl_payload *flow_pay;
 
@@ -371,6 +371,8 @@ nfp_flower_allocate_new(struct nfp_fl_key_ls *key_layer)
 	flow_pay->meta.flags = 0;
 	spin_lock_init(&flow_pay->lock);
 
+	flow_pay->ingress_offload = !egress;
+
 	return flow_pay;
 
 err_free_mask:
@@ -402,8 +404,20 @@ nfp_flower_add_offload(struct nfp_app *app, struct net_device *netdev,
 	struct nfp_flower_priv *priv = app->priv;
 	struct nfp_fl_payload *flow_pay;
 	struct nfp_fl_key_ls *key_layer;
+	struct net_device *ingr_dev;
 	int err;
 
+	ingr_dev = egress ? NULL : netdev;
+	flow_pay = nfp_flower_search_fl_table(app, flow->cookie, ingr_dev,
+					      NFP_FL_STATS_CTX_DONT_CARE);
+	if (flow_pay) {
+		/* Ignore as duplicate if it has been added by different cb. */
+		if (flow_pay->ingress_offload && egress)
+			return 0;
+		else
+			return -EOPNOTSUPP;
+	}
+
 	key_layer = kmalloc(sizeof(*key_layer), GFP_KERNEL);
 	if (!key_layer)
 		return -ENOMEM;
@@ -413,7 +427,7 @@ nfp_flower_add_offload(struct nfp_app *app, struct net_device *netdev,
 	if (err)
 		goto err_free_key_ls;
 
-	flow_pay = nfp_flower_allocate_new(key_layer);
+	flow_pay = nfp_flower_allocate_new(key_layer, egress);
 	if (!flow_pay) {
 		err = -ENOMEM;
 		goto err_free_key_ls;
@@ -485,7 +499,7 @@ nfp_flower_del_offload(struct nfp_app *app, struct net_device *netdev,
 	nfp_flow = nfp_flower_search_fl_table(app, flow->cookie, ingr_dev,
 					      NFP_FL_STATS_CTX_DONT_CARE);
 	if (!nfp_flow)
-		return -ENOENT;
+		return egress ? 0 : -ENOENT;
 
 	err = nfp_modify_flow_metadata(app, nfp_flow);
 	if (err)
@@ -534,6 +548,9 @@ nfp_flower_get_stats(struct nfp_app *app, struct net_device *netdev,
 	if (!nfp_flow)
 		return -EINVAL;
 
+	if (nfp_flow->ingress_offload && egress)
+		return 0;
+
 	spin_lock_bh(&nfp_flow->lock);
 	tcf_exts_stats_update(flow->exts, nfp_flow->stats.bytes,
 			      nfp_flow->stats.pkts, nfp_flow->stats.used);
-- 
cgit v1.2.3-59-g8ed1b


From e086be9ab21cc2a0894e0f72ff8c974aba0b296e Mon Sep 17 00:00:00 2001
From: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Date: Tue, 10 Apr 2018 10:49:59 -0700
Subject: igb: Add support for adding offloaded clsflower filters

This allows filters added by tc-flower and specifying MAC addresses,
Ethernet types, and the VLAN priority field, to be offloaded to the
controller.

This reuses most of the infrastructure used by ethtool, but clsflower
filters are kept in a separated list, so they are invisible to
ethtool.

To setup clsflower offloading:

$ tc qdisc replace dev eth0 handle 100: parent root mqprio \
     	   	   num_tc 3 map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 \
		   queues 1@0 1@1 2@2 hw 0
(clsflower offloading depends on the netword driver to be configured
with multiple traffic classes, we use mqprio's 'num_tc' parameter to
set it to 3)

$ tc qdisc add dev eth0 ingress

Examples of filters:

$ tc filter add dev eth0 parent ffff: flower \
     	    dst_mac aa:aa:aa:aa:aa:aa \
	    hw_tc 2 skip_sw
(just a simple filter filtering for the destination MAC address and
steering that traffic to queue 2)

$ tc filter add dev enp2s0 parent ffff: proto 0x22f0 flower \
     	    src_mac cc:cc:cc:cc:cc:cc \
	    hw_tc 1 skip_sw
(as the i210 doesn't support steering traffic based on the source
address alone, we need to use another steering traffic, in this case
we are using the ethernet type (0x22f0) to steer traffic to queue 1)

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb.h      |   2 +
 drivers/net/ethernet/intel/igb/igb_main.c | 188 +++++++++++++++++++++++++++++-
 2 files changed, 188 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index 0eac1df499f8..990a7fb32e4e 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -465,6 +465,7 @@ struct igb_nfc_input {
 struct igb_nfc_filter {
 	struct hlist_node nfc_node;
 	struct igb_nfc_input filter;
+	unsigned long cookie;
 	u16 etype_reg_index;
 	u16 sw_idx;
 	u16 action;
@@ -604,6 +605,7 @@ struct igb_adapter {
 
 	/* RX network flow classification support */
 	struct hlist_head nfc_filter_list;
+	struct hlist_head cls_flower_list;
 	unsigned int nfc_filter_count;
 	/* lock for RX network flow classification filter */
 	spinlock_t nfc_lock;
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 26b0f0efc97e..3ce43207a35f 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -2514,16 +2514,197 @@ static int igb_offload_cbs(struct igb_adapter *adapter,
 	return 0;
 }
 
+#define ETHER_TYPE_FULL_MASK ((__force __be16)~0)
+#define VLAN_PRIO_FULL_MASK (0x07)
+
+static int igb_parse_cls_flower(struct igb_adapter *adapter,
+				struct tc_cls_flower_offload *f,
+				int traffic_class,
+				struct igb_nfc_filter *input)
+{
+	struct netlink_ext_ack *extack = f->common.extack;
+
+	if (f->dissector->used_keys &
+	    ~(BIT(FLOW_DISSECTOR_KEY_BASIC) |
+	      BIT(FLOW_DISSECTOR_KEY_CONTROL) |
+	      BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) |
+	      BIT(FLOW_DISSECTOR_KEY_VLAN))) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Unsupported key used, only BASIC, CONTROL, ETH_ADDRS and VLAN are supported");
+		return -EOPNOTSUPP;
+	}
+
+	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
+		struct flow_dissector_key_eth_addrs *key, *mask;
+
+		key = skb_flow_dissector_target(f->dissector,
+						FLOW_DISSECTOR_KEY_ETH_ADDRS,
+						f->key);
+		mask = skb_flow_dissector_target(f->dissector,
+						 FLOW_DISSECTOR_KEY_ETH_ADDRS,
+						 f->mask);
+
+		if (!is_zero_ether_addr(mask->dst)) {
+			if (!is_broadcast_ether_addr(mask->dst)) {
+				NL_SET_ERR_MSG_MOD(extack, "Only full masks are supported for destination MAC address");
+				return -EINVAL;
+			}
+
+			input->filter.match_flags |=
+				IGB_FILTER_FLAG_DST_MAC_ADDR;
+			ether_addr_copy(input->filter.dst_addr, key->dst);
+		}
+
+		if (!is_zero_ether_addr(mask->src)) {
+			if (!is_broadcast_ether_addr(mask->src)) {
+				NL_SET_ERR_MSG_MOD(extack, "Only full masks are supported for source MAC address");
+				return -EINVAL;
+			}
+
+			input->filter.match_flags |=
+				IGB_FILTER_FLAG_SRC_MAC_ADDR;
+			ether_addr_copy(input->filter.src_addr, key->src);
+		}
+	}
+
+	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_BASIC)) {
+		struct flow_dissector_key_basic *key, *mask;
+
+		key = skb_flow_dissector_target(f->dissector,
+						FLOW_DISSECTOR_KEY_BASIC,
+						f->key);
+		mask = skb_flow_dissector_target(f->dissector,
+						 FLOW_DISSECTOR_KEY_BASIC,
+						 f->mask);
+
+		if (mask->n_proto) {
+			if (mask->n_proto != ETHER_TYPE_FULL_MASK) {
+				NL_SET_ERR_MSG_MOD(extack, "Only full mask is supported for EtherType filter");
+				return -EINVAL;
+			}
+
+			input->filter.match_flags |= IGB_FILTER_FLAG_ETHER_TYPE;
+			input->filter.etype = key->n_proto;
+		}
+	}
+
+	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_VLAN)) {
+		struct flow_dissector_key_vlan *key, *mask;
+
+		key = skb_flow_dissector_target(f->dissector,
+						FLOW_DISSECTOR_KEY_VLAN,
+						f->key);
+		mask = skb_flow_dissector_target(f->dissector,
+						 FLOW_DISSECTOR_KEY_VLAN,
+						 f->mask);
+
+		if (mask->vlan_priority) {
+			if (mask->vlan_priority != VLAN_PRIO_FULL_MASK) {
+				NL_SET_ERR_MSG_MOD(extack, "Only full mask is supported for VLAN priority");
+				return -EINVAL;
+			}
+
+			input->filter.match_flags |= IGB_FILTER_FLAG_VLAN_TCI;
+			input->filter.vlan_tci = key->vlan_priority;
+		}
+	}
+
+	input->action = traffic_class;
+	input->cookie = f->cookie;
+
+	return 0;
+}
+
 static int igb_configure_clsflower(struct igb_adapter *adapter,
 				   struct tc_cls_flower_offload *cls_flower)
 {
-	return -EOPNOTSUPP;
+	struct netlink_ext_ack *extack = cls_flower->common.extack;
+	struct igb_nfc_filter *filter, *f;
+	int err, tc;
+
+	tc = tc_classid_to_hwtc(adapter->netdev, cls_flower->classid);
+	if (tc < 0) {
+		NL_SET_ERR_MSG_MOD(extack, "Invalid traffic class");
+		return -EINVAL;
+	}
+
+	filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+	if (!filter)
+		return -ENOMEM;
+
+	err = igb_parse_cls_flower(adapter, cls_flower, tc, filter);
+	if (err < 0)
+		goto err_parse;
+
+	spin_lock(&adapter->nfc_lock);
+
+	hlist_for_each_entry(f, &adapter->nfc_filter_list, nfc_node) {
+		if (!memcmp(&f->filter, &filter->filter, sizeof(f->filter))) {
+			err = -EEXIST;
+			NL_SET_ERR_MSG_MOD(extack,
+					   "This filter is already set in ethtool");
+			goto err_locked;
+		}
+	}
+
+	hlist_for_each_entry(f, &adapter->cls_flower_list, nfc_node) {
+		if (!memcmp(&f->filter, &filter->filter, sizeof(f->filter))) {
+			err = -EEXIST;
+			NL_SET_ERR_MSG_MOD(extack,
+					   "This filter is already set in cls_flower");
+			goto err_locked;
+		}
+	}
+
+	err = igb_add_filter(adapter, filter);
+	if (err < 0) {
+		NL_SET_ERR_MSG_MOD(extack, "Could not add filter to the adapter");
+		goto err_locked;
+	}
+
+	hlist_add_head(&filter->nfc_node, &adapter->cls_flower_list);
+
+	spin_unlock(&adapter->nfc_lock);
+
+	return 0;
+
+err_locked:
+	spin_unlock(&adapter->nfc_lock);
+
+err_parse:
+	kfree(filter);
+
+	return err;
 }
 
 static int igb_delete_clsflower(struct igb_adapter *adapter,
 				struct tc_cls_flower_offload *cls_flower)
 {
-	return -EOPNOTSUPP;
+	struct igb_nfc_filter *filter;
+	int err;
+
+	spin_lock(&adapter->nfc_lock);
+
+	hlist_for_each_entry(filter, &adapter->cls_flower_list, nfc_node)
+		if (filter->cookie == cls_flower->cookie)
+			break;
+
+	if (!filter) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	err = igb_erase_filter(adapter, filter);
+	if (err < 0)
+		goto out;
+
+	hlist_del(&filter->nfc_node);
+	kfree(filter);
+
+out:
+	spin_unlock(&adapter->nfc_lock);
+
+	return err;
 }
 
 static int igb_setup_tc_cls_flower(struct igb_adapter *adapter,
@@ -9368,6 +9549,9 @@ static void igb_nfc_filter_exit(struct igb_adapter *adapter)
 	hlist_for_each_entry(rule, &adapter->nfc_filter_list, nfc_node)
 		igb_erase_filter(adapter, rule);
 
+	hlist_for_each_entry(rule, &adapter->cls_flower_list, nfc_node)
+		igb_erase_filter(adapter, rule);
+
 	spin_unlock(&adapter->nfc_lock);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 22e15b6f9c010ffded63e6c9720a656eb3d73835 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 25 Apr 2018 17:46:47 +0800
Subject: sctp: remove the unused sctp_assoc_is_match function

After Commit 4f0087812648 ("sctp: apply rhashtable api to send/recv
path"), there's no place using sctp_assoc_is_match, so remove it.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h |  4 ----
 net/sctp/associola.c       | 25 -------------------------
 2 files changed, 29 deletions(-)

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index a0ec462bc1a9..05594b248e52 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -2091,10 +2091,6 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 				  enum sctp_transport_cmd command,
 				  sctp_sn_error_t error);
 struct sctp_transport *sctp_assoc_lookup_tsn(struct sctp_association *, __u32);
-struct sctp_transport *sctp_assoc_is_match(struct sctp_association *,
-					   struct net *,
-					   const union sctp_addr *,
-					   const union sctp_addr *);
 void sctp_assoc_migrate(struct sctp_association *, struct sock *);
 int sctp_assoc_update(struct sctp_association *old,
 		      struct sctp_association *new);
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 837806dd5799..a8f3b088fcb2 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -988,31 +988,6 @@ out:
 	return match;
 }
 
-/* Is this the association we are looking for? */
-struct sctp_transport *sctp_assoc_is_match(struct sctp_association *asoc,
-					   struct net *net,
-					   const union sctp_addr *laddr,
-					   const union sctp_addr *paddr)
-{
-	struct sctp_transport *transport;
-
-	if ((htons(asoc->base.bind_addr.port) == laddr->v4.sin_port) &&
-	    (htons(asoc->peer.port) == paddr->v4.sin_port) &&
-	    net_eq(sock_net(asoc->base.sk), net)) {
-		transport = sctp_assoc_lookup_paddr(asoc, paddr);
-		if (!transport)
-			goto out;
-
-		if (sctp_bind_addr_match(&asoc->base.bind_addr, laddr,
-					 sctp_sk(asoc->base.sk)))
-			goto out;
-	}
-	transport = NULL;
-
-out:
-	return transport;
-}
-
 /* Do delayed input processing.  This is scheduled by sctp_rcv(). */
 static void sctp_assoc_bh_rcv(struct work_struct *work)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 4d7b44833e9a129d3220152727a19cceea3a2ac4 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Wed, 25 Apr 2018 11:31:13 +0100
Subject: net: amd8111e: remove redundant duplicated if statement

There are two identical nested if statements, the second is redundant
and can be removed. Also clean up white space formatting.

Cleans up cppcheck warning:
drivers/net/ethernet/amd/amd8111e.c:1080: (warning) Identical inner 'if'
condition is always true.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/amd8111e.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/amd/amd8111e.c b/drivers/net/ethernet/amd/amd8111e.c
index c99e3e845ac0..a90080f12e67 100644
--- a/drivers/net/ethernet/amd/amd8111e.c
+++ b/drivers/net/ethernet/amd/amd8111e.c
@@ -1074,16 +1074,12 @@ static int amd8111e_calc_coalesce(struct net_device *dev)
 				amd8111e_set_coalesce(dev,TX_INTR_COAL);
 				coal_conf->tx_coal_type = MEDIUM_COALESCE;
 			}
-
-		}
-		else if(tx_pkt_size >= 1024){
-			if (tx_pkt_size >= 1024){
-				if(coal_conf->tx_coal_type !=  HIGH_COALESCE){
-					coal_conf->tx_timeout = 4;
-					coal_conf->tx_event_count = 8;
-					amd8111e_set_coalesce(dev,TX_INTR_COAL);
-					coal_conf->tx_coal_type = HIGH_COALESCE;
-				}
+		} else if (tx_pkt_size >= 1024) {
+			if (coal_conf->tx_coal_type != HIGH_COALESCE) {
+				coal_conf->tx_timeout = 4;
+				coal_conf->tx_event_count = 8;
+				amd8111e_set_coalesce(dev, TX_INTR_COAL);
+				coal_conf->tx_coal_type = HIGH_COALESCE;
 			}
 		}
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 6f5d295909c9e10a715c4f9f6301ffa701398112 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Wed, 25 Apr 2018 11:43:07 +0100
Subject: mkiss: remove redundant check for len > 0

The check for len > 0 is always true and hence is redundant as
this check is already being made to execute the code inside the
while-loop. Hence it is redundant and can be removed.

Cleans up cppcheck warning:
drivers/net/hamradio/mkiss.c:220: (warning) Identical inner 'if'
condition is always true.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hamradio/mkiss.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c
index c180b480f8ef..13e4c1eff353 100644
--- a/drivers/net/hamradio/mkiss.c
+++ b/drivers/net/hamradio/mkiss.c
@@ -217,7 +217,7 @@ static int kiss_esc_crc(unsigned char *s, unsigned char *d, unsigned short crc,
 			c = *s++;
 		else if (len > 1)
 			c = crc >> 8;
-		else if (len > 0)
+		else
 			c = crc & 0xff;
 
 		len--;
-- 
cgit v1.2.3-59-g8ed1b


From c926ca160506d45c57cc30ee70d9df4cc8911713 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 25 Apr 2018 07:46:39 -0700
Subject: xfrm: remove VLA usage in __xfrm6_sort()

In the quest to remove all stack VLA usage removed from the kernel[1],
just use XFRM_MAX_DEPTH as already done for the "class" array. In one
case, it'll do this loop up to 5, the other caller up to 6.

[1] https://lkml.org/lkml/2018/3/7/621

Co-developed-by: Andreas Christoforou <andreaschristofo@gmail.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Acked-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv6/xfrm6_state.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index 16f434791763..5bdca3d5d6b7 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -60,11 +60,9 @@ xfrm6_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
 static int
 __xfrm6_sort(void **dst, void **src, int n, int (*cmp)(void *p), int maxclass)
 {
-	int i;
+	int count[XFRM_MAX_DEPTH] = { };
 	int class[XFRM_MAX_DEPTH];
-	int count[maxclass];
-
-	memset(count, 0, sizeof(count));
+	int i;
 
 	for (i = 0; i < n; i++) {
 		int c;
-- 
cgit v1.2.3-59-g8ed1b


From a8cbb46f831d2c101feccdd0e0daf3627b8c1dca Mon Sep 17 00:00:00 2001
From: Golan Ben Ami <golan.ben.ami@intel.com>
Date: Sun, 22 Oct 2017 15:58:26 +0300
Subject: iwlwifi: allow different csr flags for different device families

Different device families may have different flag values
for passing a message to the fw (i.e. SW_RESET).
In order to keep the code readable, and avoid conditioning
upon the family, store a value for each flag, which indicates
the bit that needs to be enabled.

Signed-off-by: Golan Ben Ami <golan.ben.ami@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/cfg/1000.c      |  8 ++-
 drivers/net/wireless/intel/iwlwifi/cfg/2000.c      | 13 +++--
 drivers/net/wireless/intel/iwlwifi/cfg/22000.c     |  7 +++
 drivers/net/wireless/intel/iwlwifi/cfg/5000.c      |  8 ++-
 drivers/net/wireless/intel/iwlwifi/cfg/6000.c      | 19 +++++--
 drivers/net/wireless/intel/iwlwifi/cfg/7000.c      |  5 +-
 drivers/net/wireless/intel/iwlwifi/cfg/8000.c      |  5 +-
 drivers/net/wireless/intel/iwlwifi/cfg/9000.c      |  3 +-
 drivers/net/wireless/intel/iwlwifi/iwl-config.h    | 65 ++++++++++++++++++++++
 drivers/net/wireless/intel/iwlwifi/iwl-csr.h       | 28 +---------
 .../net/wireless/intel/iwlwifi/iwl-eeprom-read.c   |  8 ++-
 drivers/net/wireless/intel/iwlwifi/mvm/utils.c     |  8 ++-
 drivers/net/wireless/intel/iwlwifi/pcie/rx.c       |  3 +-
 .../net/wireless/intel/iwlwifi/pcie/trans-gen2.c   | 15 +++--
 drivers/net/wireless/intel/iwlwifi/pcie/trans.c    | 65 ++++++++++++----------
 drivers/net/wireless/intel/iwlwifi/pcie/tx.c       | 16 +++---
 16 files changed, 184 insertions(+), 92 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/1000.c b/drivers/net/wireless/intel/iwlwifi/cfg/1000.c
index b2573b1d1506..591687984962 100644
--- a/drivers/net/wireless/intel/iwlwifi/cfg/1000.c
+++ b/drivers/net/wireless/intel/iwlwifi/cfg/1000.c
@@ -1,6 +1,7 @@
 /******************************************************************************
  *
  * Copyright(c) 2008 - 2014 Intel Corporation. All rights reserved.
+ * Copyright(c) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of version 2 of the GNU General Public License as
@@ -27,7 +28,6 @@
 #include <linux/module.h>
 #include <linux/stringify.h>
 #include "iwl-config.h"
-#include "iwl-csr.h"
 #include "iwl-agn-hw.h"
 
 /* Highest firmware API version supported */
@@ -91,7 +91,8 @@ static const struct iwl_eeprom_params iwl1000_eeprom_params = {
 	.base_params = &iwl1000_base_params,			\
 	.eeprom_params = &iwl1000_eeprom_params,		\
 	.led_mode = IWL_LED_BLINK,				\
-	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K
+	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K,	\
+	.csr = &iwl_csr_v1
 
 const struct iwl_cfg iwl1000_bgn_cfg = {
 	.name = "Intel(R) Centrino(R) Wireless-N 1000 BGN",
@@ -117,7 +118,8 @@ const struct iwl_cfg iwl1000_bg_cfg = {
 	.eeprom_params = &iwl1000_eeprom_params,		\
 	.led_mode = IWL_LED_RF_STATE,				\
 	.rx_with_siso_diversity = true,				\
-	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K
+	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K,	\
+	.csr = &iwl_csr_v1
 
 const struct iwl_cfg iwl100_bgn_cfg = {
 	.name = "Intel(R) Centrino(R) Wireless-N 100 BGN",
diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/2000.c b/drivers/net/wireless/intel/iwlwifi/cfg/2000.c
index 1b32ad413b9e..a63ca8820568 100644
--- a/drivers/net/wireless/intel/iwlwifi/cfg/2000.c
+++ b/drivers/net/wireless/intel/iwlwifi/cfg/2000.c
@@ -1,6 +1,7 @@
 /******************************************************************************
  *
  * Copyright(c) 2008 - 2014 Intel Corporation. All rights reserved.
+ * Copyright(c) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of version 2 of the GNU General Public License as
@@ -115,7 +116,8 @@ static const struct iwl_eeprom_params iwl20x0_eeprom_params = {
 	.base_params = &iwl2000_base_params,			\
 	.eeprom_params = &iwl20x0_eeprom_params,		\
 	.led_mode = IWL_LED_RF_STATE,				\
-	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K
+	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K,	\
+	.csr = &iwl_csr_v1
 
 
 const struct iwl_cfg iwl2000_2bgn_cfg = {
@@ -142,7 +144,8 @@ const struct iwl_cfg iwl2000_2bgn_d_cfg = {
 	.base_params = &iwl2030_base_params,			\
 	.eeprom_params = &iwl20x0_eeprom_params,		\
 	.led_mode = IWL_LED_RF_STATE,				\
-	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K
+	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K,	\
+	.csr = &iwl_csr_v1
 
 const struct iwl_cfg iwl2030_2bgn_cfg = {
 	.name = "Intel(R) Centrino(R) Wireless-N 2230 BGN",
@@ -163,7 +166,8 @@ const struct iwl_cfg iwl2030_2bgn_cfg = {
 	.eeprom_params = &iwl20x0_eeprom_params,		\
 	.led_mode = IWL_LED_RF_STATE,				\
 	.rx_with_siso_diversity = true,				\
-	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K
+	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K,	\
+	.csr = &iwl_csr_v1
 
 const struct iwl_cfg iwl105_bgn_cfg = {
 	.name = "Intel(R) Centrino(R) Wireless-N 105 BGN",
@@ -190,7 +194,8 @@ const struct iwl_cfg iwl105_bgn_d_cfg = {
 	.eeprom_params = &iwl20x0_eeprom_params,		\
 	.led_mode = IWL_LED_RF_STATE,				\
 	.rx_with_siso_diversity = true,				\
-	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K
+	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K,	\
+	.csr = &iwl_csr_v1
 
 const struct iwl_cfg iwl135_bgn_cfg = {
 	.name = "Intel(R) Centrino(R) Wireless-N 135 BGN",
diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/22000.c b/drivers/net/wireless/intel/iwlwifi/cfg/22000.c
index f2b6e0e8d787..afc78a1e1cfc 100644
--- a/drivers/net/wireless/intel/iwlwifi/cfg/22000.c
+++ b/drivers/net/wireless/intel/iwlwifi/cfg/22000.c
@@ -143,6 +143,7 @@ const struct iwl_cfg iwl22000_2ac_cfg_hr = {
 	.name = "Intel(R) Dual Band Wireless AC 22000",
 	.fw_name_pre = IWL_22000_HR_FW_PRE,
 	IWL_DEVICE_22000,
+	.csr = &iwl_csr_v1,
 	.ht_params = &iwl_22000_ht_params,
 	.nvm_ver = IWL_22000_NVM_VERSION,
 	.nvm_calib_ver = IWL_22000_TX_POWER_VERSION,
@@ -153,6 +154,7 @@ const struct iwl_cfg iwl22000_2ac_cfg_hr_cdb = {
 	.name = "Intel(R) Dual Band Wireless AC 22000",
 	.fw_name_pre = IWL_22000_HR_CDB_FW_PRE,
 	IWL_DEVICE_22000,
+	.csr = &iwl_csr_v1,
 	.ht_params = &iwl_22000_ht_params,
 	.nvm_ver = IWL_22000_NVM_VERSION,
 	.nvm_calib_ver = IWL_22000_TX_POWER_VERSION,
@@ -164,6 +166,7 @@ const struct iwl_cfg iwl22000_2ac_cfg_jf = {
 	.name = "Intel(R) Dual Band Wireless AC 22000",
 	.fw_name_pre = IWL_22000_JF_FW_PRE,
 	IWL_DEVICE_22000,
+	.csr = &iwl_csr_v1,
 	.ht_params = &iwl_22000_ht_params,
 	.nvm_ver = IWL_22000_NVM_VERSION,
 	.nvm_calib_ver = IWL_22000_TX_POWER_VERSION,
@@ -174,6 +177,7 @@ const struct iwl_cfg iwl22000_2ax_cfg_hr = {
 	.name = "Intel(R) Dual Band Wireless AX 22000",
 	.fw_name_pre = IWL_22000_HR_FW_PRE,
 	IWL_DEVICE_22000,
+	.csr = &iwl_csr_v1,
 	.ht_params = &iwl_22000_ht_params,
 	.nvm_ver = IWL_22000_NVM_VERSION,
 	.nvm_calib_ver = IWL_22000_TX_POWER_VERSION,
@@ -184,6 +188,7 @@ const struct iwl_cfg iwl22000_2ax_cfg_qnj_hr_f0 = {
 	.name = "Intel(R) Dual Band Wireless AX 22000",
 	.fw_name_pre = IWL_22000_HR_F0_FW_PRE,
 	IWL_DEVICE_22000,
+	.csr = &iwl_csr_v1,
 	.ht_params = &iwl_22000_ht_params,
 	.nvm_ver = IWL_22000_NVM_VERSION,
 	.nvm_calib_ver = IWL_22000_TX_POWER_VERSION,
@@ -194,6 +199,7 @@ const struct iwl_cfg iwl22000_2ax_cfg_qnj_jf_b0 = {
 	.name = "Intel(R) Dual Band Wireless AX 22000",
 	.fw_name_pre = IWL_22000_JF_B0_FW_PRE,
 	IWL_DEVICE_22000,
+	.csr = &iwl_csr_v1,
 	.ht_params = &iwl_22000_ht_params,
 	.nvm_ver = IWL_22000_NVM_VERSION,
 	.nvm_calib_ver = IWL_22000_TX_POWER_VERSION,
@@ -204,6 +210,7 @@ const struct iwl_cfg iwl22000_2ax_cfg_qnj_hr_a0 = {
 	.name = "Intel(R) Dual Band Wireless AX 22000",
 	.fw_name_pre = IWL_22000_HR_A0_FW_PRE,
 	IWL_DEVICE_22000,
+	.csr = &iwl_csr_v1,
 	.ht_params = &iwl_22000_ht_params,
 	.nvm_ver = IWL_22000_NVM_VERSION,
 	.nvm_calib_ver = IWL_22000_TX_POWER_VERSION,
diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/5000.c b/drivers/net/wireless/intel/iwlwifi/cfg/5000.c
index 4aa8f0a05c8a..a224f1be1ec2 100644
--- a/drivers/net/wireless/intel/iwlwifi/cfg/5000.c
+++ b/drivers/net/wireless/intel/iwlwifi/cfg/5000.c
@@ -1,6 +1,7 @@
 /******************************************************************************
  *
  * Copyright(c) 2007 - 2014 Intel Corporation. All rights reserved.
+ * Copyright(c) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of version 2 of the GNU General Public License as
@@ -28,7 +29,6 @@
 #include <linux/stringify.h>
 #include "iwl-config.h"
 #include "iwl-agn-hw.h"
-#include "iwl-csr.h"
 
 /* Highest firmware API version supported */
 #define IWL5000_UCODE_API_MAX 5
@@ -89,7 +89,8 @@ static const struct iwl_eeprom_params iwl5000_eeprom_params = {
 	.base_params = &iwl5000_base_params,			\
 	.eeprom_params = &iwl5000_eeprom_params,		\
 	.led_mode = IWL_LED_BLINK,				\
-	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K
+	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K,	\
+	.csr = &iwl_csr_v1
 
 const struct iwl_cfg iwl5300_agn_cfg = {
 	.name = "Intel(R) Ultimate N WiFi Link 5300 AGN",
@@ -153,7 +154,8 @@ const struct iwl_cfg iwl5350_agn_cfg = {
 	.eeprom_params = &iwl5000_eeprom_params,		\
 	.led_mode = IWL_LED_BLINK,				\
 	.internal_wimax_coex = true,				\
-	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K
+	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K,	\
+	.csr = &iwl_csr_v1
 
 const struct iwl_cfg iwl5150_agn_cfg = {
 	.name = "Intel(R) WiMAX/WiFi Link 5150 AGN",
diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/6000.c b/drivers/net/wireless/intel/iwlwifi/cfg/6000.c
index 39335b7b0c16..51cec0bb75fc 100644
--- a/drivers/net/wireless/intel/iwlwifi/cfg/6000.c
+++ b/drivers/net/wireless/intel/iwlwifi/cfg/6000.c
@@ -1,6 +1,7 @@
 /******************************************************************************
  *
  * Copyright(c) 2008 - 2014 Intel Corporation. All rights reserved.
+ * Copyright(c) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of version 2 of the GNU General Public License as
@@ -135,7 +136,8 @@ static const struct iwl_eeprom_params iwl6000_eeprom_params = {
 	.base_params = &iwl6000_g2_base_params,			\
 	.eeprom_params = &iwl6000_eeprom_params,		\
 	.led_mode = IWL_LED_RF_STATE,				\
-	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K
+	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K,	\
+	.csr = &iwl_csr_v1
 
 const struct iwl_cfg iwl6005_2agn_cfg = {
 	.name = "Intel(R) Centrino(R) Advanced-N 6205 AGN",
@@ -189,7 +191,8 @@ const struct iwl_cfg iwl6005_2agn_mow2_cfg = {
 	.base_params = &iwl6000_g2_base_params,			\
 	.eeprom_params = &iwl6000_eeprom_params,		\
 	.led_mode = IWL_LED_RF_STATE,				\
-	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K
+	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K,	\
+	.csr = &iwl_csr_v1
 
 const struct iwl_cfg iwl6030_2agn_cfg = {
 	.name = "Intel(R) Centrino(R) Advanced-N 6230 AGN",
@@ -225,7 +228,8 @@ const struct iwl_cfg iwl6030_2bg_cfg = {
 	.base_params = &iwl6000_g2_base_params,			\
 	.eeprom_params = &iwl6000_eeprom_params,		\
 	.led_mode = IWL_LED_RF_STATE,				\
-	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K
+	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K,	\
+	.csr = &iwl_csr_v1
 
 const struct iwl_cfg iwl6035_2agn_cfg = {
 	.name = "Intel(R) Centrino(R) Advanced-N 6235 AGN",
@@ -280,7 +284,8 @@ const struct iwl_cfg iwl130_bg_cfg = {
 	.base_params = &iwl6000_base_params,			\
 	.eeprom_params = &iwl6000_eeprom_params,		\
 	.led_mode = IWL_LED_BLINK,				\
-	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K
+	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K,	\
+	.csr = &iwl_csr_v1
 
 const struct iwl_cfg iwl6000i_2agn_cfg = {
 	.name = "Intel(R) Centrino(R) Advanced-N 6200 AGN",
@@ -313,7 +318,8 @@ const struct iwl_cfg iwl6000i_2bg_cfg = {
 	.eeprom_params = &iwl6000_eeprom_params,		\
 	.led_mode = IWL_LED_BLINK,				\
 	.internal_wimax_coex = true,				\
-	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K
+	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K,	\
+	.csr = &iwl_csr_v1
 
 const struct iwl_cfg iwl6050_2agn_cfg = {
 	.name = "Intel(R) Centrino(R) Advanced-N + WiMAX 6250 AGN",
@@ -339,7 +345,8 @@ const struct iwl_cfg iwl6050_2abg_cfg = {
 	.eeprom_params = &iwl6000_eeprom_params,		\
 	.led_mode = IWL_LED_BLINK,				\
 	.internal_wimax_coex = true,				\
-	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K
+	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K,	\
+	.csr = &iwl_csr_v1
 
 const struct iwl_cfg iwl6150_bgn_cfg = {
 	.name = "Intel(R) Centrino(R) Wireless-N + WiMAX 6150 BGN",
diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/7000.c b/drivers/net/wireless/intel/iwlwifi/cfg/7000.c
index ce741beec1fc..8e9684e8e603 100644
--- a/drivers/net/wireless/intel/iwlwifi/cfg/7000.c
+++ b/drivers/net/wireless/intel/iwlwifi/cfg/7000.c
@@ -8,6 +8,7 @@
  * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2014 Intel Mobile Communications GmbH
  * Copyright(c) 2015 Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -35,6 +36,7 @@
  * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2014 Intel Mobile Communications GmbH
  * Copyright(c) 2015 Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -167,7 +169,8 @@ static const struct iwl_ht_params iwl7000_ht_params = {
 	.nvm_hw_section_num = NVM_HW_SECTION_NUM_FAMILY_7000,	\
 	.non_shared_ant = ANT_A,				\
 	.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K,	\
-	.dccm_offset = IWL7000_DCCM_OFFSET
+	.dccm_offset = IWL7000_DCCM_OFFSET,			\
+	.csr = &iwl_csr_v1
 
 #define IWL_DEVICE_7000						\
 	IWL_DEVICE_7000_COMMON,					\
diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/8000.c b/drivers/net/wireless/intel/iwlwifi/cfg/8000.c
index 3f4d9bac9f73..e8299b2ab6bd 100644
--- a/drivers/net/wireless/intel/iwlwifi/cfg/8000.c
+++ b/drivers/net/wireless/intel/iwlwifi/cfg/8000.c
@@ -8,6 +8,7 @@
  * Copyright(c) 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2014 - 2015 Intel Mobile Communications GmbH
  * Copyright(c) 2016 Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -34,6 +35,7 @@
  *
  * Copyright(c) 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2014 - 2015 Intel Mobile Communications GmbH
+ * Copyright(c) 2018 Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -158,7 +160,8 @@ static const struct iwl_tt_params iwl8000_tt_params = {
 	.apmg_not_supported = true,					\
 	.nvm_type = IWL_NVM_EXT,					\
 	.dbgc_supported = true,						\
-	.min_umac_error_event_table = 0x800000
+	.min_umac_error_event_table = 0x800000,				\
+	.csr = &iwl_csr_v1
 
 #define IWL_DEVICE_8000							\
 	IWL_DEVICE_8000_COMMON,						\
diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/9000.c b/drivers/net/wireless/intel/iwlwifi/cfg/9000.c
index e1c869a1f8cc..fe376fce3444 100644
--- a/drivers/net/wireless/intel/iwlwifi/cfg/9000.c
+++ b/drivers/net/wireless/intel/iwlwifi/cfg/9000.c
@@ -156,7 +156,8 @@ static const struct iwl_tt_params iwl9000_tt_params = {
 	.rf_id = true,							\
 	.nvm_type = IWL_NVM_EXT,					\
 	.dbgc_supported = true,						\
-	.min_umac_error_event_table = 0x800000
+	.min_umac_error_event_table = 0x800000,				\
+	.csr = &iwl_csr_v1
 
 const struct iwl_cfg iwl9160_2ac_cfg = {
 	.name = "Intel(R) Dual Band Wireless AC 9160",
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-config.h b/drivers/net/wireless/intel/iwlwifi/iwl-config.h
index 14e69e7a2287..7f0b7cea8b9b 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-config.h
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-config.h
@@ -7,6 +7,7 @@
  *
  * Copyright(c) 2007 - 2014 Intel Corporation. All rights reserved.
  * Copyright (C) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -33,6 +34,7 @@
  *
  * Copyright(c) 2005 - 2014 Intel Corporation. All rights reserved.
  * Copyright (C) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -69,6 +71,7 @@
 #include <linux/netdevice.h>
 #include <linux/ieee80211.h>
 #include <linux/nl80211.h>
+#include "iwl-csr.h"
 
 enum iwl_device_family {
 	IWL_DEVICE_FAMILY_UNDEFINED,
@@ -284,6 +287,44 @@ struct iwl_pwr_tx_backoff {
 	u32 backoff;
 };
 
+/**
+ * struct iwl_csr_params
+ *
+ * @flag_sw_reset: reset the device
+ * @flag_mac_clock_ready:
+ *	Indicates MAC (ucode processor, etc.) is powered up and can run.
+ *	Internal resources are accessible.
+ *	NOTE:  This does not indicate that the processor is actually running.
+ *	NOTE:  This does not indicate that device has completed
+ *	       init or post-power-down restore of internal SRAM memory.
+ *	       Use CSR_UCODE_DRV_GP1_BIT_MAC_SLEEP as indication that
+ *	       SRAM is restored and uCode is in normal operation mode.
+ *	       This note is relevant only for pre 5xxx devices.
+ *	NOTE:  After device reset, this bit remains "0" until host sets
+ *	       INIT_DONE
+ * @flag_init_done: Host sets this to put device into fully operational
+ *	D0 power mode. Host resets this after SW_RESET to put device into
+ *	low power mode.
+ * @flag_mac_access_req: Host sets this to request and maintain MAC wakeup,
+ *	to allow host access to device-internal resources. Host must wait for
+ *	mac_clock_ready (and !GOING_TO_SLEEP) before accessing non-CSR device
+ *	registers.
+ * @flag_val_mac_access_en: mac access is enabled
+ * @flag_master_dis: disable master
+ * @flag_stop_master: stop master
+ * @addr_sw_reset: address for resetting the device
+ */
+struct iwl_csr_params {
+	u8 flag_sw_reset;
+	u8 flag_mac_clock_ready;
+	u8 flag_init_done;
+	u8 flag_mac_access_req;
+	u8 flag_val_mac_access_en;
+	u8 flag_master_dis;
+	u8 flag_stop_master;
+	u8 addr_sw_reset;
+};
+
 /**
  * struct iwl_cfg
  * @name: Official name of the device
@@ -316,6 +357,7 @@ struct iwl_pwr_tx_backoff {
  * @mac_addr_from_csr: read HW address from CSR registers
  * @features: hw features, any combination of feature_whitelist
  * @pwr_tx_backoffs: translation table between power limits and backoffs
+ * @csr: csr flags and addresses that are different across devices
  * @max_rx_agg_size: max RX aggregation size of the ADDBA request/response
  * @max_tx_agg_size: max TX aggregation size of the ADDBA request/response
  * @max_ht_ampdu_factor: the exponent of the max length of A-MPDU that the
@@ -354,6 +396,7 @@ struct iwl_cfg {
 	const struct iwl_pwr_tx_backoff *pwr_tx_backoffs;
 	const char *default_nvm_file_C_step;
 	const struct iwl_tt_params *thermal_params;
+	const struct iwl_csr_params *csr;
 	enum iwl_device_family device_family;
 	enum iwl_led_mode led_mode;
 	enum iwl_nvm_type nvm_type;
@@ -400,6 +443,28 @@ struct iwl_cfg {
 	u32 extra_phy_cfg_flags;
 };
 
+static const struct iwl_csr_params iwl_csr_v1 = {
+	.flag_mac_clock_ready = 0,
+	.flag_val_mac_access_en = 0,
+	.flag_init_done = 2,
+	.flag_mac_access_req = 3,
+	.flag_sw_reset = 7,
+	.flag_master_dis = 8,
+	.flag_stop_master = 9,
+	.addr_sw_reset = (CSR_BASE + 0x020)
+};
+
+static const struct iwl_csr_params iwl_csr_v2 = {
+	.flag_init_done = 6,
+	.flag_mac_clock_ready = 20,
+	.flag_val_mac_access_en = 20,
+	.flag_mac_access_req = 21,
+	.flag_master_dis = 28,
+	.flag_stop_master = 29,
+	.flag_sw_reset = 31,
+	.addr_sw_reset = (CSR_BASE + 0x024)
+};
+
 /*
  * This list declares the config structures for all devices.
  */
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-csr.h b/drivers/net/wireless/intel/iwlwifi/iwl-csr.h
index 4f0d070eda54..ba971d3946e2 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-csr.h
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-csr.h
@@ -8,6 +8,7 @@
  * Copyright(c) 2005 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2014 Intel Mobile Communications GmbH
  * Copyright(c) 2016        Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -34,6 +35,7 @@
  *
  * Copyright(c) 2005 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2014 Intel Mobile Communications GmbH
+ * Copyright(c) 2018 Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -257,7 +259,6 @@
 /* RESET */
 #define CSR_RESET_REG_FLAG_NEVO_RESET                (0x00000001)
 #define CSR_RESET_REG_FLAG_FORCE_NMI                 (0x00000002)
-#define CSR_RESET_REG_FLAG_SW_RESET                  (0x00000080)
 #define CSR_RESET_REG_FLAG_MASTER_DISABLED           (0x00000100)
 #define CSR_RESET_REG_FLAG_STOP_MASTER               (0x00000200)
 #define CSR_RESET_LINK_PWR_MGMT_DISABLED             (0x80000000)
@@ -280,35 +281,10 @@
  *     4:  GOING_TO_SLEEP
  *         Indicates MAC is entering a power-saving sleep power-down.
  *         Not a good time to access device-internal resources.
- *     3:  MAC_ACCESS_REQ
- *         Host sets this to request and maintain MAC wakeup, to allow host
- *         access to device-internal resources.  Host must wait for
- *         MAC_CLOCK_READY (and !GOING_TO_SLEEP) before accessing non-CSR
- *         device registers.
- *     2:  INIT_DONE
- *         Host sets this to put device into fully operational D0 power mode.
- *         Host resets this after SW_RESET to put device into low power mode.
- *     0:  MAC_CLOCK_READY
- *         Indicates MAC (ucode processor, etc.) is powered up and can run.
- *         Internal resources are accessible.
- *         NOTE:  This does not indicate that the processor is actually running.
- *         NOTE:  This does not indicate that device has completed
- *                init or post-power-down restore of internal SRAM memory.
- *                Use CSR_UCODE_DRV_GP1_BIT_MAC_SLEEP as indication that
- *                SRAM is restored and uCode is in normal operation mode.
- *                Later devices (5xxx/6xxx/1xxx) use non-volatile SRAM, and
- *                do not need to save/restore it.
- *         NOTE:  After device reset, this bit remains "0" until host sets
- *                INIT_DONE
  */
-#define CSR_GP_CNTRL_REG_FLAG_MAC_CLOCK_READY        (0x00000001)
-#define CSR_GP_CNTRL_REG_FLAG_INIT_DONE              (0x00000004)
-#define CSR_GP_CNTRL_REG_FLAG_MAC_ACCESS_REQ         (0x00000008)
 #define CSR_GP_CNTRL_REG_FLAG_GOING_TO_SLEEP         (0x00000010)
 #define CSR_GP_CNTRL_REG_FLAG_XTAL_ON		     (0x00000400)
 
-#define CSR_GP_CNTRL_REG_VAL_MAC_ACCESS_EN           (0x00000001)
-
 #define CSR_GP_CNTRL_REG_MSK_POWER_SAVE_TYPE         (0x07000000)
 #define CSR_GP_CNTRL_REG_FLAG_RFKILL_WAKE_L1A_EN     (0x04000000)
 #define CSR_GP_CNTRL_REG_FLAG_HW_RF_KILL_SW          (0x08000000)
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-eeprom-read.c b/drivers/net/wireless/intel/iwlwifi/iwl-eeprom-read.c
index f2cea1c7befc..ac965c34a2f8 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-eeprom-read.c
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-eeprom-read.c
@@ -6,6 +6,7 @@
  * GPL LICENSE SUMMARY
  *
  * Copyright(c) 2008 - 2014 Intel Corporation. All rights reserved.
+ * Copyright(c) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -31,6 +32,7 @@
  * BSD LICENSE
  *
  * Copyright(c) 2005 - 2014 Intel Corporation. All rights reserved.
+ * Copyright(c) 2018 Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -199,12 +201,12 @@ static int iwl_init_otp_access(struct iwl_trans *trans)
 	/* Enable 40MHz radio clock */
 	iwl_write32(trans, CSR_GP_CNTRL,
 		    iwl_read32(trans, CSR_GP_CNTRL) |
-		    CSR_GP_CNTRL_REG_FLAG_INIT_DONE);
+		    BIT(trans->cfg->csr->flag_init_done));
 
 	/* wait for clock to be ready */
 	ret = iwl_poll_bit(trans, CSR_GP_CNTRL,
-			   CSR_GP_CNTRL_REG_FLAG_MAC_CLOCK_READY,
-			   CSR_GP_CNTRL_REG_FLAG_MAC_CLOCK_READY,
+			   BIT(trans->cfg->csr->flag_mac_clock_ready),
+			   BIT(trans->cfg->csr->flag_mac_clock_ready),
 			   25000);
 	if (ret < 0) {
 		IWL_ERR(trans, "Time out access OTP\n");
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
index 0497c7a44def..b002a7afb5f5 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
@@ -8,6 +8,7 @@
  * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2014 Intel Mobile Communications GmbH
  * Copyright (C) 2015 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -35,6 +36,7 @@
  * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2014 Intel Mobile Communications GmbH
  * Copyright (C) 2015 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -520,15 +522,15 @@ static void iwl_mvm_dump_lmac_error_log(struct iwl_mvm *mvm, u32 base)
 
 		/* set INIT_DONE flag */
 		iwl_set_bit(trans, CSR_GP_CNTRL,
-			    CSR_GP_CNTRL_REG_FLAG_INIT_DONE);
+			    BIT(trans->cfg->csr->flag_init_done));
 
 		/* and wait for clock stabilization */
 		if (trans->cfg->device_family == IWL_DEVICE_FAMILY_8000)
 			udelay(2);
 
 		err = iwl_poll_bit(trans, CSR_GP_CNTRL,
-				   CSR_GP_CNTRL_REG_FLAG_MAC_CLOCK_READY,
-				   CSR_GP_CNTRL_REG_FLAG_MAC_CLOCK_READY,
+				   BIT(trans->cfg->csr->flag_mac_clock_ready),
+				   BIT(trans->cfg->csr->flag_mac_clock_ready),
 				   25000);
 		if (err < 0) {
 			IWL_DEBUG_INFO(trans,
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c
index f25ce3a1ea50..f772d70a65e4 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c
@@ -3,6 +3,7 @@
  * Copyright(c) 2003 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH
  * Copyright(c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  *
  * Portions of this file are derived from the ipw3945 project, as well
  * as portions of the ieee80211 subsystem header files.
@@ -201,7 +202,7 @@ static void iwl_pcie_rxq_inc_wr_ptr(struct iwl_trans *trans,
 			IWL_DEBUG_INFO(trans, "Rx queue requesting wakeup, GP1 = 0x%x\n",
 				       reg);
 			iwl_set_bit(trans, CSR_GP_CNTRL,
-				    CSR_GP_CNTRL_REG_FLAG_MAC_ACCESS_REQ);
+				    BIT(trans->cfg->csr->flag_mac_access_req));
 			rxq->need_update = true;
 			return;
 		}
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans-gen2.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans-gen2.c
index cb4012541f45..b8e8dac2895d 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/trans-gen2.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans-gen2.c
@@ -6,6 +6,7 @@
  * GPL LICENSE SUMMARY
  *
  * Copyright(c) 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -19,6 +20,7 @@
  * BSD LICENSE
  *
  * Copyright(c) 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -92,7 +94,8 @@ static int iwl_pcie_gen2_apm_init(struct iwl_trans *trans)
 	 * Set "initialization complete" bit to move adapter from
 	 * D0U* --> D0A* (powered-up active) state.
 	 */
-	iwl_set_bit(trans, CSR_GP_CNTRL, CSR_GP_CNTRL_REG_FLAG_INIT_DONE);
+	iwl_set_bit(trans, CSR_GP_CNTRL,
+		    BIT(trans->cfg->csr->flag_init_done));
 
 	/*
 	 * Wait for clock stabilization; once stabilized, access to
@@ -100,8 +103,9 @@ static int iwl_pcie_gen2_apm_init(struct iwl_trans *trans)
 	 * and accesses to uCode SRAM.
 	 */
 	ret = iwl_poll_bit(trans, CSR_GP_CNTRL,
-			   CSR_GP_CNTRL_REG_FLAG_MAC_CLOCK_READY,
-			   CSR_GP_CNTRL_REG_FLAG_MAC_CLOCK_READY, 25000);
+			   BIT(trans->cfg->csr->flag_mac_clock_ready),
+			   BIT(trans->cfg->csr->flag_mac_clock_ready),
+			   25000);
 	if (ret < 0) {
 		IWL_DEBUG_INFO(trans, "Failed to init the card\n");
 		return ret;
@@ -143,7 +147,8 @@ static void iwl_pcie_gen2_apm_stop(struct iwl_trans *trans, bool op_mode_leave)
 	 * Clear "initialization complete" bit to move adapter from
 	 * D0A* (powered-up Active) --> D0U* (Uninitialized) state.
 	 */
-	iwl_clear_bit(trans, CSR_GP_CNTRL, CSR_GP_CNTRL_REG_FLAG_INIT_DONE);
+	iwl_clear_bit(trans, CSR_GP_CNTRL,
+		      BIT(trans->cfg->csr->flag_init_done));
 }
 
 void _iwl_trans_pcie_gen2_stop_device(struct iwl_trans *trans, bool low_power)
@@ -187,7 +192,7 @@ void _iwl_trans_pcie_gen2_stop_device(struct iwl_trans *trans, bool low_power)
 
 	/* Make sure (redundant) we've released our request to stay awake */
 	iwl_clear_bit(trans, CSR_GP_CNTRL,
-		      CSR_GP_CNTRL_REG_FLAG_MAC_ACCESS_REQ);
+		      BIT(trans->cfg->csr->flag_mac_access_req));
 
 	/* Stop the device, and put it in low power state */
 	iwl_pcie_gen2_apm_stop(trans, false);
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c
index f8a0234d332c..623476603cd4 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c
@@ -8,6 +8,7 @@
  * Copyright(c) 2007 - 2015 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH
  * Copyright(c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -35,6 +36,7 @@
  * Copyright(c) 2005 - 2015 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH
  * Copyright(c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -179,7 +181,8 @@ out:
 static void iwl_trans_pcie_sw_reset(struct iwl_trans *trans)
 {
 	/* Reset entire device - do controller reset (results in SHRD_HW_RST) */
-	iwl_set_bit(trans, CSR_RESET, CSR_RESET_REG_FLAG_SW_RESET);
+	iwl_set_bit(trans, trans->cfg->csr->addr_sw_reset,
+		    BIT(trans->cfg->csr->flag_sw_reset));
 	usleep_range(5000, 6000);
 }
 
@@ -372,7 +375,8 @@ static int iwl_pcie_apm_init(struct iwl_trans *trans)
 	 * Set "initialization complete" bit to move adapter from
 	 * D0U* --> D0A* (powered-up active) state.
 	 */
-	iwl_set_bit(trans, CSR_GP_CNTRL, CSR_GP_CNTRL_REG_FLAG_INIT_DONE);
+	iwl_set_bit(trans, CSR_GP_CNTRL,
+		    BIT(trans->cfg->csr->flag_init_done));
 
 	/*
 	 * Wait for clock stabilization; once stabilized, access to
@@ -380,8 +384,9 @@ static int iwl_pcie_apm_init(struct iwl_trans *trans)
 	 * and accesses to uCode SRAM.
 	 */
 	ret = iwl_poll_bit(trans, CSR_GP_CNTRL,
-			   CSR_GP_CNTRL_REG_FLAG_MAC_CLOCK_READY,
-			   CSR_GP_CNTRL_REG_FLAG_MAC_CLOCK_READY, 25000);
+			   BIT(trans->cfg->csr->flag_mac_clock_ready),
+			   BIT(trans->cfg->csr->flag_mac_clock_ready),
+			   25000);
 	if (ret < 0) {
 		IWL_ERR(trans, "Failed to init the card\n");
 		return ret;
@@ -459,15 +464,16 @@ static void iwl_pcie_apm_lp_xtal_enable(struct iwl_trans *trans)
 	 * Set "initialization complete" bit to move adapter from
 	 * D0U* --> D0A* (powered-up active) state.
 	 */
-	iwl_set_bit(trans, CSR_GP_CNTRL, CSR_GP_CNTRL_REG_FLAG_INIT_DONE);
+	iwl_set_bit(trans, CSR_GP_CNTRL,
+		    BIT(trans->cfg->csr->flag_init_done));
 
 	/*
 	 * Wait for clock stabilization; once stabilized, access to
 	 * device-internal resources is possible.
 	 */
 	ret = iwl_poll_bit(trans, CSR_GP_CNTRL,
-			   CSR_GP_CNTRL_REG_FLAG_MAC_CLOCK_READY,
-			   CSR_GP_CNTRL_REG_FLAG_MAC_CLOCK_READY,
+			   BIT(trans->cfg->csr->flag_mac_clock_ready),
+			   BIT(trans->cfg->csr->flag_mac_clock_ready),
 			   25000);
 	if (WARN_ON(ret < 0)) {
 		IWL_ERR(trans, "Access time out - failed to enable LP XTAL\n");
@@ -519,7 +525,7 @@ static void iwl_pcie_apm_lp_xtal_enable(struct iwl_trans *trans)
 	 * D0A* (powered-up Active) --> D0U* (Uninitialized) state.
 	 */
 	iwl_clear_bit(trans, CSR_GP_CNTRL,
-		      CSR_GP_CNTRL_REG_FLAG_INIT_DONE);
+		      BIT(trans->cfg->csr->flag_init_done));
 
 	/* Activates XTAL resources monitor */
 	__iwl_trans_pcie_set_bit(trans, CSR_MONITOR_CFG_REG,
@@ -541,11 +547,12 @@ void iwl_pcie_apm_stop_master(struct iwl_trans *trans)
 	int ret;
 
 	/* stop device's busmaster DMA activity */
-	iwl_set_bit(trans, CSR_RESET, CSR_RESET_REG_FLAG_STOP_MASTER);
+	iwl_set_bit(trans, trans->cfg->csr->addr_sw_reset,
+		    BIT(trans->cfg->csr->flag_stop_master));
 
-	ret = iwl_poll_bit(trans, CSR_RESET,
-			   CSR_RESET_REG_FLAG_MASTER_DISABLED,
-			   CSR_RESET_REG_FLAG_MASTER_DISABLED, 100);
+	ret = iwl_poll_bit(trans, trans->cfg->csr->addr_sw_reset,
+			   BIT(trans->cfg->csr->flag_master_dis),
+			   BIT(trans->cfg->csr->flag_master_dis), 100);
 	if (ret < 0)
 		IWL_WARN(trans, "Master Disable Timed Out, 100 usec\n");
 
@@ -594,7 +601,7 @@ static void iwl_pcie_apm_stop(struct iwl_trans *trans, bool op_mode_leave)
 	 * D0A* (powered-up Active) --> D0U* (Uninitialized) state.
 	 */
 	iwl_clear_bit(trans, CSR_GP_CNTRL,
-		      CSR_GP_CNTRL_REG_FLAG_INIT_DONE);
+		      BIT(trans->cfg->csr->flag_init_done));
 }
 
 static int iwl_pcie_nic_init(struct iwl_trans *trans)
@@ -1267,7 +1274,7 @@ static void _iwl_trans_pcie_stop_device(struct iwl_trans *trans, bool low_power)
 
 	/* Make sure (redundant) we've released our request to stay awake */
 	iwl_clear_bit(trans, CSR_GP_CNTRL,
-		      CSR_GP_CNTRL_REG_FLAG_MAC_ACCESS_REQ);
+		      BIT(trans->cfg->csr->flag_mac_access_req));
 
 	/* Stop the device, and put it in low power state */
 	iwl_pcie_apm_stop(trans, false);
@@ -1497,9 +1504,9 @@ static void iwl_trans_pcie_d3_suspend(struct iwl_trans *trans, bool test,
 	iwl_pcie_synchronize_irqs(trans);
 
 	iwl_clear_bit(trans, CSR_GP_CNTRL,
-		      CSR_GP_CNTRL_REG_FLAG_MAC_ACCESS_REQ);
+		      BIT(trans->cfg->csr->flag_mac_access_req));
 	iwl_clear_bit(trans, CSR_GP_CNTRL,
-		      CSR_GP_CNTRL_REG_FLAG_INIT_DONE);
+		      BIT(trans->cfg->csr->flag_init_done));
 
 	iwl_pcie_enable_rx_wake(trans, false);
 
@@ -1543,15 +1550,17 @@ static int iwl_trans_pcie_d3_resume(struct iwl_trans *trans,
 		iwl_pcie_reset_ict(trans);
 	iwl_enable_interrupts(trans);
 
-	iwl_set_bit(trans, CSR_GP_CNTRL, CSR_GP_CNTRL_REG_FLAG_MAC_ACCESS_REQ);
-	iwl_set_bit(trans, CSR_GP_CNTRL, CSR_GP_CNTRL_REG_FLAG_INIT_DONE);
+	iwl_set_bit(trans, CSR_GP_CNTRL,
+		    BIT(trans->cfg->csr->flag_mac_access_req));
+	iwl_set_bit(trans, CSR_GP_CNTRL,
+		    BIT(trans->cfg->csr->flag_init_done));
 
 	if (trans->cfg->device_family >= IWL_DEVICE_FAMILY_8000)
 		udelay(2);
 
 	ret = iwl_poll_bit(trans, CSR_GP_CNTRL,
-			   CSR_GP_CNTRL_REG_FLAG_MAC_CLOCK_READY,
-			   CSR_GP_CNTRL_REG_FLAG_MAC_CLOCK_READY,
+			   BIT(trans->cfg->csr->flag_mac_clock_ready),
+			   BIT(trans->cfg->csr->flag_mac_clock_ready),
 			   25000);
 	if (ret < 0) {
 		IWL_ERR(trans, "Failed to resume the device (mac ready)\n");
@@ -1562,7 +1571,7 @@ static int iwl_trans_pcie_d3_resume(struct iwl_trans *trans,
 
 	if (!reset) {
 		iwl_clear_bit(trans, CSR_GP_CNTRL,
-			      CSR_GP_CNTRL_REG_FLAG_MAC_ACCESS_REQ);
+			      BIT(trans->cfg->csr->flag_mac_access_req));
 	} else {
 		iwl_trans_pcie_tx_reset(trans);
 
@@ -1939,7 +1948,7 @@ static bool iwl_trans_pcie_grab_nic_access(struct iwl_trans *trans,
 
 	/* this bit wakes up the NIC */
 	__iwl_trans_pcie_set_bit(trans, CSR_GP_CNTRL,
-				 CSR_GP_CNTRL_REG_FLAG_MAC_ACCESS_REQ);
+				 BIT(trans->cfg->csr->flag_mac_access_req));
 	if (trans->cfg->device_family >= IWL_DEVICE_FAMILY_8000)
 		udelay(2);
 
@@ -1964,8 +1973,8 @@ static bool iwl_trans_pcie_grab_nic_access(struct iwl_trans *trans,
 	 * and do not save/restore SRAM when power cycling.
 	 */
 	ret = iwl_poll_bit(trans, CSR_GP_CNTRL,
-			   CSR_GP_CNTRL_REG_VAL_MAC_ACCESS_EN,
-			   (CSR_GP_CNTRL_REG_FLAG_MAC_CLOCK_READY |
+			   BIT(trans->cfg->csr->flag_val_mac_access_en),
+			   (BIT(trans->cfg->csr->flag_mac_clock_ready) |
 			    CSR_GP_CNTRL_REG_FLAG_GOING_TO_SLEEP), 15000);
 	if (unlikely(ret < 0)) {
 		iwl_trans_pcie_dump_regs(trans);
@@ -2003,7 +2012,7 @@ static void iwl_trans_pcie_release_nic_access(struct iwl_trans *trans,
 		goto out;
 
 	__iwl_trans_pcie_clear_bit(trans, CSR_GP_CNTRL,
-				   CSR_GP_CNTRL_REG_FLAG_MAC_ACCESS_REQ);
+				   BIT(trans->cfg->csr->flag_mac_access_req));
 	/*
 	 * Above we read the CSR_GP_CNTRL register, which will flush
 	 * any previous writes, but we need the write that clears the
@@ -3232,12 +3241,12 @@ struct iwl_trans *iwl_trans_pcie_alloc(struct pci_dev *pdev,
 		 * id located at the AUX bus MISC address space.
 		 */
 		iwl_set_bit(trans, CSR_GP_CNTRL,
-			    CSR_GP_CNTRL_REG_FLAG_INIT_DONE);
+			    BIT(trans->cfg->csr->flag_init_done));
 		udelay(2);
 
 		ret = iwl_poll_bit(trans, CSR_GP_CNTRL,
-				   CSR_GP_CNTRL_REG_FLAG_MAC_CLOCK_READY,
-				   CSR_GP_CNTRL_REG_FLAG_MAC_CLOCK_READY,
+				   BIT(trans->cfg->csr->flag_mac_clock_ready),
+				   BIT(trans->cfg->csr->flag_mac_clock_ready),
 				   25000);
 		if (ret < 0) {
 			IWL_DEBUG_INFO(trans, "Failed to wake up the nic\n");
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
index cf468b9f5a82..1cbb8f470afd 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
@@ -3,6 +3,7 @@
  * Copyright(c) 2003 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH
  * Copyright(c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  *
  * Portions of this file are derived from the ipw3945 project, as well
  * as portions of the ieee80211 subsystem header files.
@@ -273,7 +274,7 @@ static void iwl_pcie_txq_inc_wr_ptr(struct iwl_trans *trans,
 			IWL_DEBUG_INFO(trans, "Tx queue %d requesting wakeup, GP1 = 0x%x\n",
 				       txq_id, reg);
 			iwl_set_bit(trans, CSR_GP_CNTRL,
-				    CSR_GP_CNTRL_REG_FLAG_MAC_ACCESS_REQ);
+				    BIT(trans->cfg->csr->flag_mac_access_req));
 			txq->need_update = true;
 			return;
 		}
@@ -611,7 +612,7 @@ static void iwl_pcie_clear_cmd_in_flight(struct iwl_trans *trans)
 
 	trans_pcie->cmd_hold_nic_awake = false;
 	__iwl_trans_pcie_clear_bit(trans, CSR_GP_CNTRL,
-				   CSR_GP_CNTRL_REG_FLAG_MAC_ACCESS_REQ);
+				   BIT(trans->cfg->csr->flag_mac_access_req));
 }
 
 /*
@@ -1171,6 +1172,7 @@ static int iwl_pcie_set_cmd_in_flight(struct iwl_trans *trans,
 				      const struct iwl_host_cmd *cmd)
 {
 	struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans);
+	const struct iwl_cfg *cfg = trans->cfg;
 	int ret;
 
 	lockdep_assert_held(&trans_pcie->reg_lock);
@@ -1188,19 +1190,19 @@ static int iwl_pcie_set_cmd_in_flight(struct iwl_trans *trans,
 	 * returned. This needs to be done only on NICs that have
 	 * apmg_wake_up_wa set.
 	 */
-	if (trans->cfg->base_params->apmg_wake_up_wa &&
+	if (cfg->base_params->apmg_wake_up_wa &&
 	    !trans_pcie->cmd_hold_nic_awake) {
 		__iwl_trans_pcie_set_bit(trans, CSR_GP_CNTRL,
-					 CSR_GP_CNTRL_REG_FLAG_MAC_ACCESS_REQ);
+					 BIT(cfg->csr->flag_mac_access_req));
 
 		ret = iwl_poll_bit(trans, CSR_GP_CNTRL,
-				   CSR_GP_CNTRL_REG_VAL_MAC_ACCESS_EN,
-				   (CSR_GP_CNTRL_REG_FLAG_MAC_CLOCK_READY |
+				   BIT(cfg->csr->flag_val_mac_access_en),
+				   (BIT(cfg->csr->flag_mac_clock_ready) |
 				    CSR_GP_CNTRL_REG_FLAG_GOING_TO_SLEEP),
 				   15000);
 		if (ret < 0) {
 			__iwl_trans_pcie_clear_bit(trans, CSR_GP_CNTRL,
-					CSR_GP_CNTRL_REG_FLAG_MAC_ACCESS_REQ);
+					BIT(cfg->csr->flag_mac_access_req));
 			IWL_ERR(trans, "Failed to wake NIC for hcmd\n");
 			return -EIO;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From 132db31ca9ee305ca1725db361096ab2230511c7 Mon Sep 17 00:00:00 2001
From: Golan Ben-Ami <golan.ben.ami@intel.com>
Date: Mon, 17 Jul 2017 19:42:35 +0300
Subject: iwlwifi: introduce Image Loader (IML) - new firmware image

In future devices a new image will be introduced - IML. The IML, image
loader, is loaded by the ROM, and as part of the new self-init flow,
loads the rest of the firmware images to the device.

Store the image, so the ROM can load it to the device.

Signed-off-by: Golan Ben Ami <golan.ben.ami@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/fw/file.h   | 3 +++
 drivers/net/wireless/intel/iwlwifi/fw/img.h    | 6 ++++++
 drivers/net/wireless/intel/iwlwifi/iwl-drv.c   | 8 ++++++++
 drivers/net/wireless/intel/iwlwifi/iwl-trans.h | 5 +++++
 drivers/net/wireless/intel/iwlwifi/mvm/ops.c   | 3 +++
 5 files changed, 25 insertions(+)

diff --git a/drivers/net/wireless/intel/iwlwifi/fw/file.h b/drivers/net/wireless/intel/iwlwifi/fw/file.h
index 9b2805e1e3b1..9d939cbaf6c6 100644
--- a/drivers/net/wireless/intel/iwlwifi/fw/file.h
+++ b/drivers/net/wireless/intel/iwlwifi/fw/file.h
@@ -8,6 +8,7 @@
  * Copyright(c) 2008 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH
  * Copyright(c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -35,6 +36,7 @@
  * Copyright(c) 2005 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH
  * Copyright(c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -143,6 +145,7 @@ enum iwl_ucode_tlv_type {
 	IWL_UCODE_TLV_FW_DBG_TRIGGER	= 40,
 	IWL_UCODE_TLV_FW_GSCAN_CAPA	= 50,
 	IWL_UCODE_TLV_FW_MEM_SEG	= 51,
+	IWL_UCODE_TLV_IML		= 52,
 };
 
 struct iwl_ucode_tlv {
diff --git a/drivers/net/wireless/intel/iwlwifi/fw/img.h b/drivers/net/wireless/intel/iwlwifi/fw/img.h
index b23ffe12ad84..f4912382b6af 100644
--- a/drivers/net/wireless/intel/iwlwifi/fw/img.h
+++ b/drivers/net/wireless/intel/iwlwifi/fw/img.h
@@ -8,6 +8,7 @@
  * Copyright(c) 2008 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH
  * Copyright(c) 2016        Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -35,6 +36,7 @@
  * Copyright(c) 2005 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH
  * Copyright(c) 2016        Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -241,6 +243,8 @@ enum iwl_fw_type {
  * @ucode_ver: ucode version from the ucode file
  * @fw_version: firmware version string
  * @img: ucode image like ucode_rt, ucode_init, ucode_wowlan.
+ * @iml_len: length of the image loader image
+ * @iml: image loader fw image
  * @ucode_capa: capabilities parsed from the ucode file.
  * @enhance_sensitivity_table: device can do enhanced sensitivity.
  * @init_evtlog_ptr: event log offset for init ucode.
@@ -267,6 +271,8 @@ struct iwl_fw {
 
 	/* ucode images */
 	struct fw_img img[IWL_UCODE_TYPE_MAX];
+	size_t iml_len;
+	u8 *iml;
 
 	struct iwl_ucode_capabilities ucode_capa;
 	bool enhance_sensitivity_table;
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-drv.c b/drivers/net/wireless/intel/iwlwifi/iwl-drv.c
index aa2d5c14e202..4f451a6f20b9 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-drv.c
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-drv.c
@@ -179,6 +179,7 @@ static void iwl_dealloc_ucode(struct iwl_drv *drv)
 	for (i = 0; i < ARRAY_SIZE(drv->fw.dbg_trigger_tlv); i++)
 		kfree(drv->fw.dbg_trigger_tlv[i]);
 	kfree(drv->fw.dbg_mem_tlv);
+	kfree(drv->fw.iml);
 
 	for (i = 0; i < IWL_UCODE_TYPE_MAX; i++)
 		iwl_free_fw_img(drv, drv->fw.img + i);
@@ -1126,6 +1127,13 @@ static int iwl_parse_tlv_firmware(struct iwl_drv *drv,
 			pieces->n_dbg_mem_tlv++;
 			break;
 			}
+		case IWL_UCODE_TLV_IML: {
+			drv->fw.iml_len = tlv_len;
+			drv->fw.iml = kmemdup(tlv_data, tlv_len, GFP_KERNEL);
+			if (!drv->fw.iml)
+				return -ENOMEM;
+			break;
+			}
 		default:
 			IWL_DEBUG_INFO(drv, "unknown TLV: %d\n", tlv_type);
 			break;
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-trans.h b/drivers/net/wireless/intel/iwlwifi/iwl-trans.h
index a92832711683..1b9c627ee34d 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-trans.h
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-trans.h
@@ -691,6 +691,8 @@ enum iwl_plat_pm_mode {
  * @wide_cmd_header: true when ucode supports wide command header format
  * @num_rx_queues: number of RX queues allocated by the transport;
  *	the transport must set this before calling iwl_drv_start()
+ * @iml_len: the length of the image loader
+ * @iml: a pointer to the image loader itself
  * @dev_cmd_pool: pool for Tx cmd allocation - for internal use only.
  *	The user should use iwl_trans_{alloc,free}_tx_cmd.
  * @rx_mpdu_cmd: MPDU RX command ID, must be assigned by opmode before
@@ -735,6 +737,9 @@ struct iwl_trans {
 
 	u8 num_rx_queues;
 
+	size_t iml_len;
+	u8 *iml;
+
 	/* The following fields are internal only */
 	struct kmem_cache *dev_cmd_pool;
 	char dev_cmd_pool_name[50];
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c
index de46e6258a5c..ff1e518096c5 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c
@@ -739,6 +739,9 @@ iwl_op_mode_mvm_start(struct iwl_trans *trans, const struct iwl_cfg *cfg,
 	       sizeof(trans->dbg_conf_tlv));
 	trans->dbg_trigger_tlv = mvm->fw->dbg_trigger_tlv;
 
+	trans->iml = mvm->fw->iml;
+	trans->iml_len = mvm->fw->iml_len;
+
 	/* set up notification wait support */
 	iwl_notification_wait_init(&mvm->notif_wait);
 
-- 
cgit v1.2.3-59-g8ed1b


From 4efc272a45c112bf539da14d480be0448abe4fac Mon Sep 17 00:00:00 2001
From: Luca Coelho <luciano.coelho@intel.com>
Date: Tue, 13 Feb 2018 13:52:43 +0200
Subject: iwlwifi: cfg: remove unnecessary cfg data in non-dvm devices

The max_data_size and max_inst_size values are only needed for DVM
devices.  Remove the assignment to those fields in 7000 and newer
families so we can also remove the otherwise unnecessary inclusion of
iwl-agn.h headers.

Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/cfg/22000.c  |  3 ---
 drivers/net/wireless/intel/iwlwifi/cfg/7000.c   | 11 ++++-------
 drivers/net/wireless/intel/iwlwifi/cfg/8000.c   |  9 +++------
 drivers/net/wireless/intel/iwlwifi/cfg/9000.c   |  3 ---
 drivers/net/wireless/intel/iwlwifi/iwl-config.h |  4 ++--
 5 files changed, 9 insertions(+), 21 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/22000.c b/drivers/net/wireless/intel/iwlwifi/cfg/22000.c
index afc78a1e1cfc..d4ba66aecdc9 100644
--- a/drivers/net/wireless/intel/iwlwifi/cfg/22000.c
+++ b/drivers/net/wireless/intel/iwlwifi/cfg/22000.c
@@ -54,7 +54,6 @@
 #include <linux/module.h>
 #include <linux/stringify.h>
 #include "iwl-config.h"
-#include "iwl-agn-hw.h"
 
 /* Highest firmware API version supported */
 #define IWL_22000_UCODE_API_MAX	38
@@ -115,8 +114,6 @@ static const struct iwl_ht_params iwl_22000_ht_params = {
 	.ucode_api_max = IWL_22000_UCODE_API_MAX,			\
 	.ucode_api_min = IWL_22000_UCODE_API_MIN,			\
 	.device_family = IWL_DEVICE_FAMILY_22000,			\
-	.max_inst_size = IWL60_RTC_INST_SIZE,				\
-	.max_data_size = IWL60_RTC_DATA_SIZE,				\
 	.base_params = &iwl_22000_base_params,				\
 	.led_mode = IWL_LED_RF_STATE,					\
 	.nvm_hw_section_num = NVM_HW_SECTION_NUM_FAMILY_22000,		\
diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/7000.c b/drivers/net/wireless/intel/iwlwifi/cfg/7000.c
index 8e9684e8e603..69bfa827e82a 100644
--- a/drivers/net/wireless/intel/iwlwifi/cfg/7000.c
+++ b/drivers/net/wireless/intel/iwlwifi/cfg/7000.c
@@ -7,8 +7,8 @@
  *
  * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2014 Intel Mobile Communications GmbH
- * Copyright(c) 2015 Intel Deutschland GmbH
- * Copyright(c) 2018 Intel Corporation
+ * Copyright(c) 2015        Intel Deutschland GmbH
+ * Copyright(c) 2018        Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -35,8 +35,8 @@
  *
  * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2014 Intel Mobile Communications GmbH
- * Copyright(c) 2015 Intel Deutschland GmbH
- * Copyright(c) 2018 Intel Corporation
+ * Copyright(c) 2015        Intel Deutschland GmbH
+ * Copyright(c) 2018        Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -70,7 +70,6 @@
 #include <linux/module.h>
 #include <linux/stringify.h>
 #include "iwl-config.h"
-#include "iwl-agn-hw.h"
 
 /* Highest firmware API version supported */
 #define IWL7260_UCODE_API_MAX	17
@@ -162,8 +161,6 @@ static const struct iwl_ht_params iwl7000_ht_params = {
 
 #define IWL_DEVICE_7000_COMMON					\
 	.device_family = IWL_DEVICE_FAMILY_7000,		\
-	.max_inst_size = IWL60_RTC_INST_SIZE,			\
-	.max_data_size = IWL60_RTC_DATA_SIZE,			\
 	.base_params = &iwl7000_base_params,			\
 	.led_mode = IWL_LED_RF_STATE,				\
 	.nvm_hw_section_num = NVM_HW_SECTION_NUM_FAMILY_7000,	\
diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/8000.c b/drivers/net/wireless/intel/iwlwifi/cfg/8000.c
index e8299b2ab6bd..7262e973e0d6 100644
--- a/drivers/net/wireless/intel/iwlwifi/cfg/8000.c
+++ b/drivers/net/wireless/intel/iwlwifi/cfg/8000.c
@@ -7,8 +7,8 @@
  *
  * Copyright(c) 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2014 - 2015 Intel Mobile Communications GmbH
- * Copyright(c) 2016 Intel Deutschland GmbH
- * Copyright(c) 2018 Intel Corporation
+ * Copyright(c) 2016        Intel Deutschland GmbH
+ * Copyright(c) 2018        Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -35,7 +35,7 @@
  *
  * Copyright(c) 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2014 - 2015 Intel Mobile Communications GmbH
- * Copyright(c) 2018 Intel Corporation
+ * Copyright(c) 2018        Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -69,7 +69,6 @@
 #include <linux/module.h>
 #include <linux/stringify.h>
 #include "iwl-config.h"
-#include "iwl-agn-hw.h"
 
 /* Highest firmware API version supported */
 #define IWL8000_UCODE_API_MAX	36
@@ -142,8 +141,6 @@ static const struct iwl_tt_params iwl8000_tt_params = {
 
 #define IWL_DEVICE_8000_COMMON						\
 	.device_family = IWL_DEVICE_FAMILY_8000,			\
-	.max_inst_size = IWL60_RTC_INST_SIZE,				\
-	.max_data_size = IWL60_RTC_DATA_SIZE,				\
 	.base_params = &iwl8000_base_params,				\
 	.led_mode = IWL_LED_RF_STATE,					\
 	.nvm_hw_section_num = NVM_HW_SECTION_NUM_FAMILY_8000,		\
diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/9000.c b/drivers/net/wireless/intel/iwlwifi/cfg/9000.c
index fe376fce3444..9706624911f8 100644
--- a/drivers/net/wireless/intel/iwlwifi/cfg/9000.c
+++ b/drivers/net/wireless/intel/iwlwifi/cfg/9000.c
@@ -54,7 +54,6 @@
 #include <linux/module.h>
 #include <linux/stringify.h>
 #include "iwl-config.h"
-#include "iwl-agn-hw.h"
 #include "fw/file.h"
 
 /* Highest firmware API version supported */
@@ -135,8 +134,6 @@ static const struct iwl_tt_params iwl9000_tt_params = {
 	.ucode_api_max = IWL9000_UCODE_API_MAX,				\
 	.ucode_api_min = IWL9000_UCODE_API_MIN,				\
 	.device_family = IWL_DEVICE_FAMILY_9000,			\
-	.max_inst_size = IWL60_RTC_INST_SIZE,				\
-	.max_data_size = IWL60_RTC_DATA_SIZE,				\
 	.base_params = &iwl9000_base_params,				\
 	.led_mode = IWL_LED_RF_STATE,					\
 	.nvm_hw_section_num = NVM_HW_SECTION_NUM_FAMILY_9000,		\
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-config.h b/drivers/net/wireless/intel/iwlwifi/iwl-config.h
index 7f0b7cea8b9b..7752794d0437 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-config.h
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-config.h
@@ -337,8 +337,8 @@ struct iwl_csr_params {
  *	next step. Supported only in integrated solutions.
  * @ucode_api_max: Highest version of uCode API supported by driver.
  * @ucode_api_min: Lowest version of uCode API supported by driver.
- * @max_inst_size: The maximal length of the fw inst section
- * @max_data_size: The maximal length of the fw data section
+ * @max_inst_size: The maximal length of the fw inst section (only DVM)
+ * @max_data_size: The maximal length of the fw data section (only DVM)
  * @valid_tx_ant: valid transmit antenna
  * @valid_rx_ant: valid receive antenna
  * @non_shared_ant: the antenna that is for WiFi only
-- 
cgit v1.2.3-59-g8ed1b


From bf1ad8978bbfd8579dfd10d2e7742d0f554e449a Mon Sep 17 00:00:00 2001
From: Eliad Peller <eliadx.peller@intel.com>
Date: Tue, 13 Feb 2018 18:41:46 +0200
Subject: iwlwifi: pcie: allow sending pre-built A-MSDUs

In case of A-MSDUs, the trans layer is taking care of building
the subframes (out of the given skb), according to the given gso_size.

However, in some testing flows, we want to build the whole A-MSDU
frame in a different place (e.g. userspace), and ask the driver
to send it as-is.

In case of gso_size==0, simply treat the frame as normal-frame,
although the A-MSDU flag is set.

Signed-off-by: Eliad Peller <eliadx.peller@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/pcie/tx.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
index 1cbb8f470afd..473fe7ccb07c 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
@@ -2397,7 +2397,13 @@ int iwl_trans_pcie_tx(struct iwl_trans *trans, struct sk_buff *skb,
 		goto out_err;
 	iwl_pcie_txq_build_tfd(trans, txq, tb1_phys, tb1_len, false);
 
-	if (amsdu) {
+	/*
+	 * If gso_size wasn't set, don't give the frame "amsdu treatment"
+	 * (adding subframes, etc.).
+	 * This can happen in some testing flows when the amsdu was already
+	 * pre-built, and we just need to send the resulting skb.
+	 */
+	if (amsdu && skb_shinfo(skb)->gso_size) {
 		if (unlikely(iwl_fill_data_tbs_amsdu(trans, skb, txq, hdr_len,
 						     out_meta, dev_cmd,
 						     tb1_len)))
-- 
cgit v1.2.3-59-g8ed1b


From 18e8f43ff13e0dd0a485bc35bea91df37c26c4bb Mon Sep 17 00:00:00 2001
From: Golan Ben Ami <golan.ben.ami@intel.com>
Date: Wed, 22 Nov 2017 11:40:15 +0200
Subject: iwlwifi: support new csr addresses for hw address

In future devices we use different csr addresses for hw addresses.
Update csr addresses to support new and legacy devices.

Signed-off-by: Golan Ben Ami <golan.ben.ami@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/iwl-config.h    | 20 ++++++++++++++++++--
 drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c | 14 ++++++++++----
 2 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-config.h b/drivers/net/wireless/intel/iwlwifi/iwl-config.h
index 7752794d0437..cfd14e5ea59c 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-config.h
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-config.h
@@ -313,6 +313,10 @@ struct iwl_pwr_tx_backoff {
  * @flag_master_dis: disable master
  * @flag_stop_master: stop master
  * @addr_sw_reset: address for resetting the device
+ * @mac_addr0_otp: first part of MAC address from OTP
+ * @mac_addr1_otp: second part of MAC address from OTP
+ * @mac_addr0_strap: first part of MAC address from strap
+ * @mac_addr1_strap: second part of MAC address from strap
  */
 struct iwl_csr_params {
 	u8 flag_sw_reset;
@@ -323,6 +327,10 @@ struct iwl_csr_params {
 	u8 flag_master_dis;
 	u8 flag_stop_master;
 	u8 addr_sw_reset;
+	u32 mac_addr0_otp;
+	u32 mac_addr1_otp;
+	u32 mac_addr0_strap;
+	u32 mac_addr1_strap;
 };
 
 /**
@@ -451,7 +459,11 @@ static const struct iwl_csr_params iwl_csr_v1 = {
 	.flag_sw_reset = 7,
 	.flag_master_dis = 8,
 	.flag_stop_master = 9,
-	.addr_sw_reset = (CSR_BASE + 0x020)
+	.addr_sw_reset = (CSR_BASE + 0x020),
+	.mac_addr0_otp = 0x380,
+	.mac_addr1_otp = 0x384,
+	.mac_addr0_strap = 0x388,
+	.mac_addr1_strap = 0x38C
 };
 
 static const struct iwl_csr_params iwl_csr_v2 = {
@@ -462,7 +474,11 @@ static const struct iwl_csr_params iwl_csr_v2 = {
 	.flag_master_dis = 28,
 	.flag_stop_master = 29,
 	.flag_sw_reset = 31,
-	.addr_sw_reset = (CSR_BASE + 0x024)
+	.addr_sw_reset = (CSR_BASE + 0x024),
+	.mac_addr0_otp = 0x30,
+	.mac_addr1_otp = 0x34,
+	.mac_addr0_strap = 0x38,
+	.mac_addr1_strap = 0x3C
 };
 
 /*
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
index 1f31d7bfdffc..0f9d56420c42 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
@@ -579,8 +579,12 @@ static void iwl_flip_hw_address(__le32 mac_addr0, __le32 mac_addr1, u8 *dest)
 static void iwl_set_hw_address_from_csr(struct iwl_trans *trans,
 					struct iwl_nvm_data *data)
 {
-	__le32 mac_addr0 = cpu_to_le32(iwl_read32(trans, CSR_MAC_ADDR0_STRAP));
-	__le32 mac_addr1 = cpu_to_le32(iwl_read32(trans, CSR_MAC_ADDR1_STRAP));
+	__le32 mac_addr0 =
+		cpu_to_le32(iwl_read32(trans,
+				       trans->cfg->csr->mac_addr0_strap));
+	__le32 mac_addr1 =
+		cpu_to_le32(iwl_read32(trans,
+				       trans->cfg->csr->mac_addr1_strap));
 
 	iwl_flip_hw_address(mac_addr0, mac_addr1, data->hw_addr);
 	/*
@@ -590,8 +594,10 @@ static void iwl_set_hw_address_from_csr(struct iwl_trans *trans,
 	if (is_valid_ether_addr(data->hw_addr))
 		return;
 
-	mac_addr0 = cpu_to_le32(iwl_read32(trans, CSR_MAC_ADDR0_OTP));
-	mac_addr1 = cpu_to_le32(iwl_read32(trans, CSR_MAC_ADDR1_OTP));
+	mac_addr0 = cpu_to_le32(iwl_read32(trans,
+					   trans->cfg->csr->mac_addr0_otp));
+	mac_addr1 = cpu_to_le32(iwl_read32(trans,
+					   trans->cfg->csr->mac_addr1_otp));
 
 	iwl_flip_hw_address(mac_addr0, mac_addr1, data->hw_addr);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 1e5b7750315a8c2242444acda495555e7d52a6bf Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 20 Feb 2018 16:32:36 +0100
Subject: iwlwifi: mvm: move skb padding reservation earlier

Future changes will require moving the HE radiotap data into
the SKB head, but this means we need to have the alignment
reservation done before that. To prepare, move the alignment
reservation earlier here.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
index 34791628cfb3..bb63e75a9b7f 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
@@ -151,17 +151,9 @@ static void iwl_mvm_create_skb(struct sk_buff *skb, struct ieee80211_hdr *hdr,
 	unsigned int hdrlen = ieee80211_hdrlen(hdr->frame_control);
 
 	if (desc->mac_flags2 & IWL_RX_MPDU_MFLG2_PAD) {
+		len -= 2;
 		pad_len = 2;
-
-		/*
-		 * If the device inserted padding it means that (it thought)
-		 * the 802.11 header wasn't a multiple of 4 bytes long. In
-		 * this case, reserve two bytes at the start of the SKB to
-		 * align the payload properly in case we end up copying it.
-		 */
-		skb_reserve(skb, pad_len);
 	}
-	len -= pad_len;
 
 	/* If frame is small enough to fit in skb->head, pull it completely.
 	 * If not, only pull ieee80211_hdr (including crypto if present, and
@@ -866,6 +858,16 @@ void iwl_mvm_rx_mpdu_mq(struct iwl_mvm *mvm, struct napi_struct *napi,
 		return;
 	}
 
+	if (desc->mac_flags2 & IWL_RX_MPDU_MFLG2_PAD) {
+		/*
+		 * If the device inserted padding it means that (it thought)
+		 * the 802.11 header wasn't a multiple of 4 bytes long. In
+		 * this case, reserve two bytes at the start of the SKB to
+		 * align the payload properly in case we end up copying it.
+		 */
+		skb_reserve(skb, 2);
+	}
+
 	rx_status = IEEE80211_SKB_RXCB(skb);
 
 	if (iwl_mvm_rx_crypto(mvm, hdr, rx_status, desc,
-- 
cgit v1.2.3-59-g8ed1b


From 9039d985811d5b109b58b202b7594fd24e433fed Mon Sep 17 00:00:00 2001
From: Luca Coelho <luciano.coelho@intel.com>
Date: Tue, 13 Feb 2018 11:09:40 +0200
Subject: iwlwifi: fw: harden page loading code

The page loading code trusts the data provided in the firmware images
a bit too much and may cause a buffer overflow or copy unknown data if
the block sizes don't match what we expect.

To prevent potential problems, harden the code by checking if the
sizes we are copying are what we expect.

Cc: stable@vger.kernel.org
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/fw/paging.c | 49 +++++++++++++++++++++-----
 1 file changed, 41 insertions(+), 8 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/fw/paging.c b/drivers/net/wireless/intel/iwlwifi/fw/paging.c
index 1fec8e3a6b35..6afcfd1f0eec 100644
--- a/drivers/net/wireless/intel/iwlwifi/fw/paging.c
+++ b/drivers/net/wireless/intel/iwlwifi/fw/paging.c
@@ -8,6 +8,7 @@
  * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH
  * Copyright(c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018        Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -30,6 +31,7 @@
  * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH
  * Copyright(c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018        Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -163,7 +165,7 @@ static int iwl_alloc_fw_paging_mem(struct iwl_fw_runtime *fwrt,
 static int iwl_fill_paging_mem(struct iwl_fw_runtime *fwrt,
 			       const struct fw_img *image)
 {
-	int sec_idx, idx;
+	int sec_idx, idx, ret;
 	u32 offset = 0;
 
 	/*
@@ -190,17 +192,23 @@ static int iwl_fill_paging_mem(struct iwl_fw_runtime *fwrt,
 	 */
 	if (sec_idx >= image->num_sec - 1) {
 		IWL_ERR(fwrt, "Paging: Missing CSS and/or paging sections\n");
-		iwl_free_fw_paging(fwrt);
-		return -EINVAL;
+		ret = -EINVAL;
+		goto err;
 	}
 
 	/* copy the CSS block to the dram */
 	IWL_DEBUG_FW(fwrt, "Paging: load paging CSS to FW, sec = %d\n",
 		     sec_idx);
 
+	if (image->sec[sec_idx].len > fwrt->fw_paging_db[0].fw_paging_size) {
+		IWL_ERR(fwrt, "CSS block is larger than paging size\n");
+		ret = -EINVAL;
+		goto err;
+	}
+
 	memcpy(page_address(fwrt->fw_paging_db[0].fw_paging_block),
 	       image->sec[sec_idx].data,
-	       fwrt->fw_paging_db[0].fw_paging_size);
+	       image->sec[sec_idx].len);
 	dma_sync_single_for_device(fwrt->trans->dev,
 				   fwrt->fw_paging_db[0].fw_paging_phys,
 				   fwrt->fw_paging_db[0].fw_paging_size,
@@ -221,6 +229,14 @@ static int iwl_fill_paging_mem(struct iwl_fw_runtime *fwrt,
 	for (idx = 1; idx < fwrt->num_of_paging_blk; idx++) {
 		struct iwl_fw_paging *block = &fwrt->fw_paging_db[idx];
 
+		if (block->fw_paging_size > image->sec[sec_idx].len - offset) {
+			IWL_ERR(fwrt,
+				"Paging: paging size is larger than remaining data in block %d\n",
+				idx);
+			ret = -EINVAL;
+			goto err;
+		}
+
 		memcpy(page_address(block->fw_paging_block),
 		       image->sec[sec_idx].data + offset,
 		       block->fw_paging_size);
@@ -231,19 +247,32 @@ static int iwl_fill_paging_mem(struct iwl_fw_runtime *fwrt,
 
 		IWL_DEBUG_FW(fwrt,
 			     "Paging: copied %d paging bytes to block %d\n",
-			     fwrt->fw_paging_db[idx].fw_paging_size,
-			     idx);
+			     block->fw_paging_size, idx);
 
-		offset += fwrt->fw_paging_db[idx].fw_paging_size;
+		offset += block->fw_paging_size;
+
+		if (offset > image->sec[sec_idx].len) {
+			IWL_ERR(fwrt,
+				"Paging: offset goes over section size\n");
+			ret = -EINVAL;
+			goto err;
+		}
 	}
 
 	/* copy the last paging block */
 	if (fwrt->num_of_pages_in_last_blk > 0) {
 		struct iwl_fw_paging *block = &fwrt->fw_paging_db[idx];
 
+		if (image->sec[sec_idx].len - offset > block->fw_paging_size) {
+			IWL_ERR(fwrt,
+				"Paging: last block is larger than paging size\n");
+			ret = -EINVAL;
+			goto err;
+		}
+
 		memcpy(page_address(block->fw_paging_block),
 		       image->sec[sec_idx].data + offset,
-		       FW_PAGING_SIZE * fwrt->num_of_pages_in_last_blk);
+		       image->sec[sec_idx].len - offset);
 		dma_sync_single_for_device(fwrt->trans->dev,
 					   block->fw_paging_phys,
 					   block->fw_paging_size,
@@ -255,6 +284,10 @@ static int iwl_fill_paging_mem(struct iwl_fw_runtime *fwrt,
 	}
 
 	return 0;
+
+err:
+	iwl_free_fw_paging(fwrt);
+	return ret;
 }
 
 static int iwl_save_fw_paging(struct iwl_fw_runtime *fwrt,
-- 
cgit v1.2.3-59-g8ed1b


From de460ddd8b795ceca8a826771ef8feb016bb438d Mon Sep 17 00:00:00 2001
From: Luca Coelho <luciano.coelho@intel.com>
Date: Wed, 14 Feb 2018 08:54:40 +0200
Subject: iwlwifi: fw: combine loading of last page block into main copy loop

Now that we check and copy only the actual size of the page block,
there is no need to treat the last block separately.  Remove the
mostly duplicate code and make the main copy loop handle also the last
block.

Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/fw/paging.c | 69 ++++++++++----------------
 1 file changed, 26 insertions(+), 43 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/fw/paging.c b/drivers/net/wireless/intel/iwlwifi/fw/paging.c
index 6afcfd1f0eec..9b8dd7fe7112 100644
--- a/drivers/net/wireless/intel/iwlwifi/fw/paging.c
+++ b/drivers/net/wireless/intel/iwlwifi/fw/paging.c
@@ -221,25 +221,39 @@ static int iwl_fill_paging_mem(struct iwl_fw_runtime *fwrt,
 	sec_idx++;
 
 	/*
-	 * copy the paging blocks to the dram
-	 * loop index start from 1 since that CSS block already copied to dram
-	 * and CSS index is 0.
-	 * loop stop at num_of_paging_blk since that last block is not full.
+	 * Copy the paging blocks to the dram.  The loop index starts
+	 * from 1 since the CSS block (index 0) was already copied to
+	 * dram.  We use num_of_paging_blk + 1 to account for that.
 	 */
-	for (idx = 1; idx < fwrt->num_of_paging_blk; idx++) {
+	for (idx = 1; idx < fwrt->num_of_paging_blk + 1; idx++) {
 		struct iwl_fw_paging *block = &fwrt->fw_paging_db[idx];
-
-		if (block->fw_paging_size > image->sec[sec_idx].len - offset) {
+		int remaining = image->sec[sec_idx].len - offset;
+		int len = block->fw_paging_size;
+
+		/*
+		 * For the last block, we copy all that is remaining,
+		 * for all other blocks, we copy fw_paging_size at a
+		 * time. */
+		if (idx == fwrt->num_of_paging_blk) {
+			len = remaining;
+			if (remaining !=
+			    fwrt->num_of_pages_in_last_blk * FW_PAGING_SIZE) {
+				IWL_ERR(fwrt,
+					"Paging: last block contains more data than expected %d\n",
+					remaining);
+				ret = -EINVAL;
+				goto err;
+			}
+		} else if (block->fw_paging_size > remaining) {
 			IWL_ERR(fwrt,
-				"Paging: paging size is larger than remaining data in block %d\n",
-				idx);
+				"Paging: not enough data in other in block %d (%d)\n",
+				idx, remaining);
 			ret = -EINVAL;
 			goto err;
 		}
 
 		memcpy(page_address(block->fw_paging_block),
-		       image->sec[sec_idx].data + offset,
-		       block->fw_paging_size);
+		       image->sec[sec_idx].data + offset, len);
 		dma_sync_single_for_device(fwrt->trans->dev,
 					   block->fw_paging_phys,
 					   block->fw_paging_size,
@@ -247,40 +261,9 @@ static int iwl_fill_paging_mem(struct iwl_fw_runtime *fwrt,
 
 		IWL_DEBUG_FW(fwrt,
 			     "Paging: copied %d paging bytes to block %d\n",
-			     block->fw_paging_size, idx);
+			     len, idx);
 
 		offset += block->fw_paging_size;
-
-		if (offset > image->sec[sec_idx].len) {
-			IWL_ERR(fwrt,
-				"Paging: offset goes over section size\n");
-			ret = -EINVAL;
-			goto err;
-		}
-	}
-
-	/* copy the last paging block */
-	if (fwrt->num_of_pages_in_last_blk > 0) {
-		struct iwl_fw_paging *block = &fwrt->fw_paging_db[idx];
-
-		if (image->sec[sec_idx].len - offset > block->fw_paging_size) {
-			IWL_ERR(fwrt,
-				"Paging: last block is larger than paging size\n");
-			ret = -EINVAL;
-			goto err;
-		}
-
-		memcpy(page_address(block->fw_paging_block),
-		       image->sec[sec_idx].data + offset,
-		       image->sec[sec_idx].len - offset);
-		dma_sync_single_for_device(fwrt->trans->dev,
-					   block->fw_paging_phys,
-					   block->fw_paging_size,
-					   DMA_BIDIRECTIONAL);
-
-		IWL_DEBUG_FW(fwrt,
-			     "Paging: copied %d pages in the last block %d\n",
-			     fwrt->num_of_pages_in_last_blk, idx);
 	}
 
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From 49564a806fc5551ef22140e9aa4e29c6788d3eb0 Mon Sep 17 00:00:00 2001
From: Luca Coelho <luciano.coelho@intel.com>
Date: Tue, 12 Dec 2017 08:58:41 +0200
Subject: iwlwifi: pcie: remove non-responsive device

If we fail to to grab NIC access because the device is not responding
(i.e. CSR_GP_CNTRL returns 0xFFFFFFFF), remove the device from the PCI
bus, to avoid any further damage, and to let the user space rescan.

In order to inform the userspace that a rescan is needed, we send a
kobject uevent with "INACCESSIBLE".

This functionality is disabled by default, but can be enabled via a
new module parameter called "remove_when_gone".  In the future we may
change this module parameter to include 3 modes instead: do nothing;
auto-rescan or; send uevent.

Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Rajat Jain <rajatja@google.com>
---
 drivers/net/wireless/intel/iwlwifi/iwl-drv.c       |  6 ++
 drivers/net/wireless/intel/iwlwifi/iwl-modparams.h |  2 +
 drivers/net/wireless/intel/iwlwifi/pcie/internal.h |  5 ++
 drivers/net/wireless/intel/iwlwifi/pcie/trans.c    | 74 +++++++++++++++++++++-
 4 files changed, 84 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-drv.c b/drivers/net/wireless/intel/iwlwifi/iwl-drv.c
index 4f451a6f20b9..c59ce4f8a5ed 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-drv.c
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-drv.c
@@ -1850,3 +1850,9 @@ MODULE_PARM_DESC(d0i3_timeout, "Timeout to D0i3 entry when idle (ms)");
 
 module_param_named(disable_11ac, iwlwifi_mod_params.disable_11ac, bool, 0444);
 MODULE_PARM_DESC(disable_11ac, "Disable VHT capabilities (default: false)");
+
+module_param_named(remove_when_gone,
+		   iwlwifi_mod_params.remove_when_gone, bool,
+		   0444);
+MODULE_PARM_DESC(remove_when_gone,
+		 "Remove dev from PCIe bus if it is deemed inaccessible (default: false)");
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-modparams.h b/drivers/net/wireless/intel/iwlwifi/iwl-modparams.h
index a41c46e63eb1..a7dd8a8cddf9 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-modparams.h
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-modparams.h
@@ -122,6 +122,7 @@ enum iwl_uapsd_disable {
  * @lar_disable: disable LAR (regulatory), default = 0
  * @fw_monitor: allow to use firmware monitor
  * @disable_11ac: disable VHT capabilities, default = false.
+ * @remove_when_gone: remove an inaccessible device from the PCIe bus.
  */
 struct iwl_mod_params {
 	int swcrypto;
@@ -143,6 +144,7 @@ struct iwl_mod_params {
 	bool lar_disable;
 	bool fw_monitor;
 	bool disable_11ac;
+	bool remove_when_gone;
 };
 
 #endif /* #__iwl_modparams_h__ */
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h
index cda66340d357..45ea32796cda 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h
@@ -383,6 +383,8 @@ struct iwl_self_init_dram {
  * @hw_init_mask: initial unmasked hw causes
  * @fh_mask: current unmasked fh causes
  * @hw_mask: current unmasked hw causes
+ * @in_rescan: true if we have triggered a device rescan
+ * @scheduled_for_removal: true if we have scheduled a device removal
  */
 struct iwl_trans_pcie {
 	struct iwl_rxq *rxq;
@@ -464,6 +466,9 @@ struct iwl_trans_pcie {
 	u32 fh_mask;
 	u32 hw_mask;
 	cpumask_t affinity_mask[IWL_MAX_RX_HW_QUEUES];
+	u16 tx_cmd_queue_size;
+	bool in_rescan;
+	bool scheduled_for_removal;
 };
 
 static inline struct iwl_trans_pcie *
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c
index 623476603cd4..6e9a9ecfb11c 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c
@@ -75,6 +75,7 @@
 #include <linux/gfp.h>
 #include <linux/vmalloc.h>
 #include <linux/pm_runtime.h>
+#include <linux/module.h>
 
 #include "iwl-drv.h"
 #include "iwl-trans.h"
@@ -1935,6 +1936,29 @@ static void iwl_trans_pcie_set_pmi(struct iwl_trans *trans, bool state)
 		clear_bit(STATUS_TPOWER_PMI, &trans->status);
 }
 
+struct iwl_trans_pcie_removal {
+	struct pci_dev *pdev;
+	struct work_struct work;
+};
+
+static void iwl_trans_pcie_removal_wk(struct work_struct *wk)
+{
+	struct iwl_trans_pcie_removal *removal =
+		container_of(wk, struct iwl_trans_pcie_removal, work);
+	struct pci_dev *pdev = removal->pdev;
+	char *prop[] = {"EVENT=INACCESSIBLE", NULL};
+
+	dev_err(&pdev->dev, "Device gone - attempting removal\n");
+	kobject_uevent_env(&pdev->dev.kobj, KOBJ_CHANGE, prop);
+	pci_lock_rescan_remove();
+	pci_dev_put(pdev);
+	pci_stop_and_remove_bus_device(pdev);
+	pci_unlock_rescan_remove();
+
+	kfree(removal);
+	module_put(THIS_MODULE);
+}
+
 static bool iwl_trans_pcie_grab_nic_access(struct iwl_trans *trans,
 					   unsigned long *flags)
 {
@@ -1977,11 +2001,55 @@ static bool iwl_trans_pcie_grab_nic_access(struct iwl_trans *trans,
 			   (BIT(trans->cfg->csr->flag_mac_clock_ready) |
 			    CSR_GP_CNTRL_REG_FLAG_GOING_TO_SLEEP), 15000);
 	if (unlikely(ret < 0)) {
-		iwl_trans_pcie_dump_regs(trans);
-		iwl_write32(trans, CSR_RESET, CSR_RESET_REG_FLAG_FORCE_NMI);
+		u32 cntrl = iwl_read32(trans, CSR_GP_CNTRL);
+
 		WARN_ONCE(1,
 			  "Timeout waiting for hardware access (CSR_GP_CNTRL 0x%08x)\n",
-			  iwl_read32(trans, CSR_GP_CNTRL));
+			  cntrl);
+
+		iwl_trans_pcie_dump_regs(trans);
+
+		if (iwlwifi_mod_params.remove_when_gone && cntrl == ~0U) {
+			struct iwl_trans_pcie_removal *removal;
+
+			if (trans_pcie->scheduled_for_removal)
+				goto err;
+
+			IWL_ERR(trans, "Device gone - scheduling removal!\n");
+
+			/*
+			 * get a module reference to avoid doing this
+			 * while unloading anyway and to avoid
+			 * scheduling a work with code that's being
+			 * removed.
+			 */
+			if (!try_module_get(THIS_MODULE)) {
+				IWL_ERR(trans,
+					"Module is being unloaded - abort\n");
+				goto err;
+			}
+
+			removal = kzalloc(sizeof(*removal), GFP_ATOMIC);
+			if (!removal) {
+				module_put(THIS_MODULE);
+				goto err;
+			}
+			/*
+			 * we don't need to clear this flag, because
+			 * the trans will be freed and reallocated.
+			*/
+			trans_pcie->scheduled_for_removal = true;
+
+			removal->pdev = to_pci_dev(trans->dev);
+			INIT_WORK(&removal->work, iwl_trans_pcie_removal_wk);
+			pci_dev_get(removal->pdev);
+			schedule_work(&removal->work);
+		} else {
+			iwl_write32(trans, CSR_RESET,
+				    CSR_RESET_REG_FLAG_FORCE_NMI);
+		}
+
+err:
 		spin_unlock_irqrestore(&trans_pcie->reg_lock, *flags);
 		return false;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From c3039b10df74818bdd1d7e9d2b8706c855c4394b Mon Sep 17 00:00:00 2001
From: Luca Coelho <luciano.coelho@intel.com>
Date: Thu, 22 Feb 2018 14:50:06 +0200
Subject: iwlwifi: make bitfield a u32 instead of u16

The bitfield we use in struct iwl_cfg contains 17 bits, but we declare
it as u16.  The compiler doesn't seem to have any problems with that
(it probably automatically makes it u32), but it's cleaner to use a
type that is long enough for all the flags.

Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/iwl-config.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-config.h b/drivers/net/wireless/intel/iwlwifi/iwl-config.h
index cfd14e5ea59c..c503b26793f6 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-config.h
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-config.h
@@ -420,7 +420,7 @@ struct iwl_cfg {
 	u32 soc_latency;
 	u16 nvm_ver;
 	u16 nvm_calib_ver;
-	u16 rx_with_siso_diversity:1,
+	u32 rx_with_siso_diversity:1,
 	    bt_shared_single_ant:1,
 	    internal_wimax_coex:1,
 	    host_interrupt_operation_mode:1,
-- 
cgit v1.2.3-59-g8ed1b


From 755a654cada277b1851737845f63c7ab2059c982 Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Sun, 4 Feb 2018 15:01:37 +0200
Subject: iwlwifi: mvm: remove check for non low latency TIDs

Firmware will only send non low-latency TIDs in the
bitmap, so the check is now redundant.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/tx.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
index d0916f2552e2..df4c60496f72 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
@@ -803,7 +803,6 @@ static int iwl_mvm_tx_tso(struct iwl_mvm *mvm, struct sk_buff *skb,
 		return iwl_mvm_tx_tso_segment(skb, 1, netdev_flags, mpdus_skb);
 
 	if (iwl_mvm_vif_low_latency(iwl_mvm_vif_from_mac80211(mvmsta->vif)) ||
-	    tid_to_mac80211_ac[tid] < IEEE80211_AC_BE ||
 	    !(mvmsta->amsdu_enabled & BIT(tid)))
 		return iwl_mvm_tx_tso_segment(skb, 1, netdev_flags, mpdus_skb);
 
-- 
cgit v1.2.3-59-g8ed1b


From d3a6f7fb97fc34a38e40cc56392e701598f99863 Mon Sep 17 00:00:00 2001
From: Eliad Peller <eliadx.peller@intel.com>
Date: Thu, 22 Feb 2018 18:45:49 +0200
Subject: iwlwifi: mvm: set wakeup filters for wowlan "any" configuration

In case of "any" wowlan trigger is configured, no valid wakeup filter
was configured.

Moreover, the fw assumes there's no connection when there are no configured
wakeup filters.
This leads to the station info not being updated on D3 command, causing
rate_n_flags to be 0 when the offloading code sends tx frame (triggering
SYSASSERT_102C due to invalid antenna param)

Note: "any" trigger is currently assumed to only be used when entering
d0i3 (which has a different flow). However, we still reach this code
when using d3_test.

Signed-off-by: Eliad Peller <eliadx.peller@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/d3.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c
index 80a9a7cb83be..3fcf489f3120 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c
@@ -8,6 +8,7 @@
  * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH
  * Copyright(c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018        Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -18,11 +19,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110,
- * USA
- *
  * The full GNU General Public License is included in this distribution
  * in the file called COPYING.
  *
@@ -35,6 +31,7 @@
  * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH
  * Copyright(c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018        Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -693,6 +690,14 @@ iwl_mvm_get_wowlan_config(struct iwl_mvm *mvm,
 				    IWL_WOWLAN_WAKEUP_LINK_CHANGE);
 	}
 
+	if (wowlan->any) {
+		wowlan_config_cmd->wakeup_filter |=
+			cpu_to_le32(IWL_WOWLAN_WAKEUP_BEACON_MISS |
+				    IWL_WOWLAN_WAKEUP_LINK_CHANGE |
+				    IWL_WOWLAN_WAKEUP_RX_FRAME |
+				    IWL_WOWLAN_WAKEUP_BCN_FILTERING);
+	}
+
 	return 0;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 1cd7884dfd78df6284d27b008823b0b4a808f196 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 26 Apr 2018 13:42:15 -0400
Subject: udp: expose inet cork to udp

UDP segmentation offload needs access to inet_cork in the udp layer.
Pass the struct to ip(6)_make_skb instead of allocating it on the
stack in that function itself.

This patch is a noop otherwise.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h      |  2 +-
 include/net/ipv6.h    |  1 +
 net/ipv4/ip_output.c  | 17 ++++++++---------
 net/ipv4/udp.c        |  4 +++-
 net/ipv6/ip6_output.c | 20 ++++++++++----------
 net/ipv6/udp.c        |  3 ++-
 6 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index dc4a2d6e58a5..7ec543a64bbc 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -171,7 +171,7 @@ struct sk_buff *ip_make_skb(struct sock *sk, struct flowi4 *fl4,
 					int len, int odd, struct sk_buff *skb),
 			    void *from, int length, int transhdrlen,
 			    struct ipcm_cookie *ipc, struct rtable **rtp,
-			    unsigned int flags);
+			    struct inet_cork *cork, unsigned int flags);
 
 static inline struct sk_buff *ip_finish_skb(struct sock *sk, struct flowi4 *fl4)
 {
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 68b167d98879..0dd722cab037 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -950,6 +950,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
 			     void *from, int length, int transhdrlen,
 			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
 			     struct rt6_info *rt, unsigned int flags,
+			     struct inet_cork_full *cork,
 			     const struct sockcm_cookie *sockc);
 
 static inline struct sk_buff *ip6_finish_skb(struct sock *sk)
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 83c73bab2c3d..2883ff1e909c 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1470,9 +1470,8 @@ struct sk_buff *ip_make_skb(struct sock *sk,
 					int len, int odd, struct sk_buff *skb),
 			    void *from, int length, int transhdrlen,
 			    struct ipcm_cookie *ipc, struct rtable **rtp,
-			    unsigned int flags)
+			    struct inet_cork *cork, unsigned int flags)
 {
-	struct inet_cork cork;
 	struct sk_buff_head queue;
 	int err;
 
@@ -1481,22 +1480,22 @@ struct sk_buff *ip_make_skb(struct sock *sk,
 
 	__skb_queue_head_init(&queue);
 
-	cork.flags = 0;
-	cork.addr = 0;
-	cork.opt = NULL;
-	err = ip_setup_cork(sk, &cork, ipc, rtp);
+	cork->flags = 0;
+	cork->addr = 0;
+	cork->opt = NULL;
+	err = ip_setup_cork(sk, cork, ipc, rtp);
 	if (err)
 		return ERR_PTR(err);
 
-	err = __ip_append_data(sk, fl4, &queue, &cork,
+	err = __ip_append_data(sk, fl4, &queue, cork,
 			       &current->task_frag, getfrag,
 			       from, length, transhdrlen, flags);
 	if (err) {
-		__ip_flush_pending_frames(sk, &queue, &cork);
+		__ip_flush_pending_frames(sk, &queue, cork);
 		return ERR_PTR(err);
 	}
 
-	return __ip_make_skb(sk, fl4, &queue, &cork);
+	return __ip_make_skb(sk, fl4, &queue, cork);
 }
 
 /*
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 24b5c59b1c53..6b9d8017b319 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1030,9 +1030,11 @@ back_from_confirm:
 
 	/* Lockless fast path for the non-corking case. */
 	if (!corkreq) {
+		struct inet_cork cork;
+
 		skb = ip_make_skb(sk, fl4, getfrag, msg, ulen,
 				  sizeof(struct udphdr), &ipc, &rt,
-				  msg->msg_flags);
+				  &cork, msg->msg_flags);
 		err = PTR_ERR(skb);
 		if (!IS_ERR_OR_NULL(skb))
 			err = udp_send_skb(skb, fl4);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 6a477d54f8c7..7fa1db447405 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1755,9 +1755,9 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
 			     void *from, int length, int transhdrlen,
 			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
 			     struct rt6_info *rt, unsigned int flags,
+			     struct inet_cork_full *cork,
 			     const struct sockcm_cookie *sockc)
 {
-	struct inet_cork_full cork;
 	struct inet6_cork v6_cork;
 	struct sk_buff_head queue;
 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
@@ -1768,27 +1768,27 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
 
 	__skb_queue_head_init(&queue);
 
-	cork.base.flags = 0;
-	cork.base.addr = 0;
-	cork.base.opt = NULL;
-	cork.base.dst = NULL;
+	cork->base.flags = 0;
+	cork->base.addr = 0;
+	cork->base.opt = NULL;
+	cork->base.dst = NULL;
 	v6_cork.opt = NULL;
-	err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
+	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
 	if (err) {
-		ip6_cork_release(&cork, &v6_cork);
+		ip6_cork_release(cork, &v6_cork);
 		return ERR_PTR(err);
 	}
 	if (ipc6->dontfrag < 0)
 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
 
-	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
+	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
 				&current->task_frag, getfrag, from,
 				length + exthdrlen, transhdrlen + exthdrlen,
 				flags, ipc6, sockc);
 	if (err) {
-		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
+		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
 		return ERR_PTR(err);
 	}
 
-	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
+	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
 }
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 4ec76a87aeb8..824797f8d1ab 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1324,12 +1324,13 @@ back_from_confirm:
 
 	/* Lockless fast path for the non-corking case */
 	if (!corkreq) {
+		struct inet_cork_full cork;
 		struct sk_buff *skb;
 
 		skb = ip6_make_skb(sk, getfrag, msg, ulen,
 				   sizeof(struct udphdr), &ipc6,
 				   &fl6, (struct rt6_info *)dst,
-				   msg->msg_flags, &sockc);
+				   msg->msg_flags, &cork, &sockc);
 		err = PTR_ERR(skb);
 		if (!IS_ERR_OR_NULL(skb))
 			err = udp_v6_send_skb(skb, &fl6);
-- 
cgit v1.2.3-59-g8ed1b


From ee80d1ebe5ba7f4bd74959c873119175a4fc08d3 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 26 Apr 2018 13:42:16 -0400
Subject: udp: add udp gso

Implement generic segmentation offload support for udp datagrams. A
follow-up patch adds support to the protocol stack to generate such
packets.

UDP GSO is not UFO. UFO fragments a single large datagram. GSO splits
a large payload into a number of discrete UDP datagrams.

The implementation adds a GSO type SKB_UDP_GSO_L4 to differentiate it
from UFO (SKB_UDP_GSO).

IPPROTO_UDPLITE is excluded, as that protocol has no gso handler
registered.

[ Export __udp_gso_segment for ipv6. -DaveM ]

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  2 ++
 include/net/udp.h      |  4 ++++
 net/core/skbuff.c      |  2 ++
 net/ipv4/udp_offload.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++-
 net/ipv6/ip6_offload.c |  6 ++++--
 net/ipv6/udp_offload.c | 19 +++++++++++++++++-
 6 files changed, 82 insertions(+), 4 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d274059529eb..a4a5c0c5cba8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -573,6 +573,8 @@ enum {
 	SKB_GSO_ESP = 1 << 15,
 
 	SKB_GSO_UDP = 1 << 16,
+
+	SKB_GSO_UDP_L4 = 1 << 17,
 };
 
 #if BITS_PER_LONG > 32
diff --git a/include/net/udp.h b/include/net/udp.h
index 0676b272f6ac..741d888d0fdb 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -174,6 +174,10 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
 				 struct udphdr *uh, udp_lookup_t lookup);
 int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup);
 
+struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
+				  netdev_features_t features,
+				  unsigned int mss, __sum16 check);
+
 static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb)
 {
 	struct udphdr *uh;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ff49e352deea..c647cfe114e0 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4940,6 +4940,8 @@ static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
 		thlen = tcp_hdrlen(skb);
 	} else if (unlikely(skb_is_gso_sctp(skb))) {
 		thlen = sizeof(struct sctphdr);
+	} else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
+		thlen = sizeof(struct udphdr);
 	}
 	/* UFO sets gso_size to the size of the fragmentation
 	 * payload, i.e. the size of the L4 (UDP) header is already
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index ea6e6e7df0ee..6c2b7640f6f3 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -187,6 +187,54 @@ out_unlock:
 }
 EXPORT_SYMBOL(skb_udp_tunnel_segment);
 
+struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
+				  netdev_features_t features,
+				  unsigned int mss, __sum16 check)
+{
+	struct sk_buff *segs, *seg;
+	unsigned int hdrlen;
+	struct udphdr *uh;
+
+	if (gso_skb->len <= sizeof(*uh) + mss)
+		return ERR_PTR(-EINVAL);
+
+	hdrlen = gso_skb->data - skb_mac_header(gso_skb);
+	skb_pull(gso_skb, sizeof(*uh));
+
+	segs = skb_segment(gso_skb, features);
+	if (unlikely(IS_ERR_OR_NULL(segs)))
+		return segs;
+
+	for (seg = segs; seg; seg = seg->next) {
+		uh = udp_hdr(seg);
+		uh->len = htons(seg->len - hdrlen);
+		uh->check = check;
+
+		/* last packet can be partial gso_size */
+		if (!seg->next)
+			csum_replace2(&uh->check, htons(mss),
+				      htons(seg->len - hdrlen - sizeof(*uh)));
+	}
+
+	return segs;
+}
+EXPORT_SYMBOL_GPL(__udp_gso_segment);
+
+static struct sk_buff *__udp4_gso_segment(struct sk_buff *gso_skb,
+					  netdev_features_t features)
+{
+	const struct iphdr *iph = ip_hdr(gso_skb);
+	unsigned int mss = skb_shinfo(gso_skb)->gso_size;
+
+	if (!can_checksum_protocol(features, htons(ETH_P_IP)))
+		return ERR_PTR(-EIO);
+
+	return __udp_gso_segment(gso_skb, features, mss,
+				 udp_v4_check(sizeof(struct udphdr) + mss,
+					      iph->saddr, iph->daddr, 0));
+}
+EXPORT_SYMBOL_GPL(__udp4_gso_segment);
+
 static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
 					 netdev_features_t features)
 {
@@ -203,12 +251,15 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
 		goto out;
 	}
 
-	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP))
+	if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_UDP | SKB_GSO_UDP_L4)))
 		goto out;
 
 	if (!pskb_may_pull(skb, sizeof(struct udphdr)))
 		goto out;
 
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
+		return __udp4_gso_segment(skb, features);
+
 	mss = skb_shinfo(skb)->gso_size;
 	if (unlikely(skb->len <= mss))
 		goto out;
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 4a87f9428ca5..5b3f2f89ef41 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -88,9 +88,11 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 
 	if (skb->encapsulation &&
 	    skb_shinfo(skb)->gso_type & (SKB_GSO_IPXIP4 | SKB_GSO_IPXIP6))
-		udpfrag = proto == IPPROTO_UDP && encap;
+		udpfrag = proto == IPPROTO_UDP && encap &&
+			  (skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
 	else
-		udpfrag = proto == IPPROTO_UDP && !skb->encapsulation;
+		udpfrag = proto == IPPROTO_UDP && !skb->encapsulation &&
+			  (skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
 
 	ops = rcu_dereference(inet6_offloads[proto]);
 	if (likely(ops && ops->callbacks.gso_segment)) {
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 2a04dc9c781b..f7b85b1e6b3e 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -17,6 +17,20 @@
 #include <net/ip6_checksum.h>
 #include "ip6_offload.h"
 
+static struct sk_buff *__udp6_gso_segment(struct sk_buff *gso_skb,
+					  netdev_features_t features)
+{
+	const struct ipv6hdr *ip6h = ipv6_hdr(gso_skb);
+	unsigned int mss = skb_shinfo(gso_skb)->gso_size;
+
+	if (!can_checksum_protocol(features, htons(ETH_P_IPV6)))
+		return ERR_PTR(-EIO);
+
+	return __udp_gso_segment(gso_skb, features, mss,
+				 udp_v6_check(sizeof(struct udphdr) + mss,
+					      &ip6h->saddr, &ip6h->daddr, 0));
+}
+
 static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 					 netdev_features_t features)
 {
@@ -42,12 +56,15 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 		const struct ipv6hdr *ipv6h;
 		struct udphdr *uh;
 
-		if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP))
+		if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_UDP | SKB_GSO_UDP_L4)))
 			goto out;
 
 		if (!pskb_may_pull(skb, sizeof(struct udphdr)))
 			goto out;
 
+		if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
+			return __udp6_gso_segment(skb, features);
+
 		/* Do software UFO. Complete and fill in the UDP checksum as HW cannot
 		 * do checksum of UDP packets sent as multiple IP fragments.
 		 */
-- 
cgit v1.2.3-59-g8ed1b


From bec1f6f697362c5bc635dacd7ac8499d0a10a4e7 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 26 Apr 2018 13:42:17 -0400
Subject: udp: generate gso with UDP_SEGMENT

Support generic segmentation offload for udp datagrams. Callers can
concatenate and send at once the payload of multiple datagrams with
the same destination.

To set segment size, the caller sets socket option UDP_SEGMENT to the
length of each discrete payload. This value must be smaller than or
equal to the relevant MTU.

A follow-up patch adds cmsg UDP_SEGMENT to specify segment size on a
per send call basis.

Total byte length may then exceed MTU. If not an exact multiple of
segment size, the last segment will be shorter.

The implementation adds a gso_size field to the udp socket, ip(v6)
cmsg cookie and inet_cork structure to be able to set the value at
setsockopt or cmsg time and to work with both lockless and corked
paths.

Initial benchmark numbers show UDP GSO about as expensive as TCP GSO.

    tcp tso
     3197 MB/s 54232 msg/s 54232 calls/s
         6,457,754,262      cycles

    tcp gso
     1765 MB/s 29939 msg/s 29939 calls/s
        11,203,021,806      cycles

    tcp without tso/gso *
      739 MB/s 12548 msg/s 12548 calls/s
        11,205,483,630      cycles

    udp
      876 MB/s 14873 msg/s 624666 calls/s
        11,205,777,429      cycles

    udp gso
     2139 MB/s 36282 msg/s 36282 calls/s
        11,204,374,561      cycles

   [*] after reverting commit 0a6b2a1dc2a2
       ("tcp: switch to GSO being always on")

Measured total system cycles ('-a') for one core while pinning both
the network receive path and benchmark process to that core:

  perf stat -a -C 12 -e cycles \
    ./udpgso_bench_tx -C 12 -4 -D "$DST" -l 4

Note the reduction in calls/s with GSO. Bytes per syscall drops
increases from 1470 to 61818.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/udp.h      |  3 +++
 include/net/inet_sock.h  |  1 +
 include/net/ip.h         |  1 +
 include/net/ipv6.h       |  1 +
 include/uapi/linux/udp.h |  1 +
 net/ipv4/ip_output.c     |  9 ++++++---
 net/ipv4/udp.c           | 33 ++++++++++++++++++++++++++++++---
 net/ipv6/ip6_output.c    |  6 ++++--
 net/ipv6/udp.c           | 23 ++++++++++++++++++++---
 9 files changed, 67 insertions(+), 11 deletions(-)

diff --git a/include/linux/udp.h b/include/linux/udp.h
index eaea63bc79bb..ca840345571b 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -55,6 +55,7 @@ struct udp_sock {
 	 * when the socket is uncorked.
 	 */
 	__u16		 len;		/* total length of pending frames */
+	__u16		 gso_size;
 	/*
 	 * Fields specific to UDP-Lite.
 	 */
@@ -87,6 +88,8 @@ struct udp_sock {
 	int		forward_deficit;
 };
 
+#define UDP_MAX_SEGMENTS	(1 << 6UL)
+
 static inline struct udp_sock *udp_sk(const struct sock *sk)
 {
 	return (struct udp_sock *)sk;
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 0a671c32d6b9..83d5b3c2ac42 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -147,6 +147,7 @@ struct inet_cork {
 	__u8			ttl;
 	__s16			tos;
 	char			priority;
+	__u16			gso_size;
 };
 
 struct inet_cork_full {
diff --git a/include/net/ip.h b/include/net/ip.h
index 7ec543a64bbc..bada1f1f871e 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -76,6 +76,7 @@ struct ipcm_cookie {
 	__u8			ttl;
 	__s16			tos;
 	char			priority;
+	__u16			gso_size;
 };
 
 #define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb))
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 0dd722cab037..0a872a7c33c8 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -298,6 +298,7 @@ struct ipcm6_cookie {
 	__s16 tclass;
 	__s8  dontfrag;
 	struct ipv6_txoptions *opt;
+	__u16 gso_size;
 };
 
 static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np)
diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h
index efb7b5991c2f..09d00f8c442b 100644
--- a/include/uapi/linux/udp.h
+++ b/include/uapi/linux/udp.h
@@ -32,6 +32,7 @@ struct udphdr {
 #define UDP_ENCAP	100	/* Set the socket to accept encapsulated packets */
 #define UDP_NO_CHECK6_TX 101	/* Disable sending checksum for UDP6X */
 #define UDP_NO_CHECK6_RX 102	/* Disable accpeting checksum for UDP6 */
+#define UDP_SEGMENT	103	/* Set GSO segmentation size */
 
 /* UDP encapsulation types */
 #define UDP_ENCAP_ESPINUDP_NON_IKE	1 /* draft-ietf-ipsec-nat-t-ike-00/01 */
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 2883ff1e909c..da4abbee10f7 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -882,7 +882,8 @@ static int __ip_append_data(struct sock *sk,
 	skb = skb_peek_tail(queue);
 
 	exthdrlen = !skb ? rt->dst.header_len : 0;
-	mtu = cork->fragsize;
+	mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
+
 	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
 	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
 		tskey = sk->sk_tskey++;
@@ -906,7 +907,7 @@ static int __ip_append_data(struct sock *sk,
 	if (transhdrlen &&
 	    length + fragheaderlen <= mtu &&
 	    rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
-	    !(flags & MSG_MORE) &&
+	    (!(flags & MSG_MORE) || cork->gso_size) &&
 	    !exthdrlen)
 		csummode = CHECKSUM_PARTIAL;
 
@@ -1135,6 +1136,8 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
 	*rtp = NULL;
 	cork->fragsize = ip_sk_use_pmtu(sk) ?
 			 dst_mtu(&rt->dst) : rt->dst.dev->mtu;
+
+	cork->gso_size = sk->sk_type == SOCK_DGRAM ? ipc->gso_size : 0;
 	cork->dst = &rt->dst;
 	cork->length = 0;
 	cork->ttl = ipc->ttl;
@@ -1214,7 +1217,7 @@ ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
 		return -EOPNOTSUPP;
 
 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
-	mtu = cork->fragsize;
+	mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
 
 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 6b9d8017b319..bda022c5480b 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -757,7 +757,8 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb,
 }
 EXPORT_SYMBOL(udp_set_csum);
 
-static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
+static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
+			struct inet_cork *cork)
 {
 	struct sock *sk = skb->sk;
 	struct inet_sock *inet = inet_sk(sk);
@@ -777,6 +778,21 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
 	uh->len = htons(len);
 	uh->check = 0;
 
+	if (cork->gso_size) {
+		const int hlen = skb_network_header_len(skb) +
+				 sizeof(struct udphdr);
+
+		if (hlen + cork->gso_size > cork->fragsize)
+			return -EINVAL;
+		if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS)
+			return -EINVAL;
+		if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite)
+			return -EIO;
+
+		skb_shinfo(skb)->gso_size = cork->gso_size;
+		skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+	}
+
 	if (is_udplite)  				 /*     UDP-Lite      */
 		csum = udplite_csum(skb);
 
@@ -828,7 +844,7 @@ int udp_push_pending_frames(struct sock *sk)
 	if (!skb)
 		goto out;
 
-	err = udp_send_skb(skb, fl4);
+	err = udp_send_skb(skb, fl4, &inet->cork.base);
 
 out:
 	up->len = 0;
@@ -922,6 +938,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	ipc.sockc.tsflags = sk->sk_tsflags;
 	ipc.addr = inet->inet_saddr;
 	ipc.oif = sk->sk_bound_dev_if;
+	ipc.gso_size = up->gso_size;
 
 	if (msg->msg_controllen) {
 		err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6);
@@ -1037,7 +1054,7 @@ back_from_confirm:
 				  &cork, msg->msg_flags);
 		err = PTR_ERR(skb);
 		if (!IS_ERR_OR_NULL(skb))
-			err = udp_send_skb(skb, fl4);
+			err = udp_send_skb(skb, fl4, &cork);
 		goto out;
 	}
 
@@ -2367,6 +2384,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 		up->no_check6_rx = valbool;
 		break;
 
+	case UDP_SEGMENT:
+		if (val < 0 || val > USHRT_MAX)
+			return -EINVAL;
+		up->gso_size = val;
+		break;
+
 	/*
 	 * 	UDP-Lite's partial checksum coverage (RFC 3828).
 	 */
@@ -2457,6 +2480,10 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
 		val = up->no_check6_rx;
 		break;
 
+	case UDP_SEGMENT:
+		val = up->gso_size;
+		break;
+
 	/* The following two cannot be changed on UDP sockets, the return is
 	 * always 0 (which corresponds to the full checksum coverage of UDP). */
 	case UDPLITE_SEND_CSCOV:
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 7fa1db447405..a1c4a78132d2 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1240,6 +1240,8 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
 	if (mtu < IPV6_MIN_MTU)
 		return -EINVAL;
 	cork->base.fragsize = mtu;
+	cork->base.gso_size = sk->sk_type == SOCK_DGRAM ? ipc6->gso_size : 0;
+
 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
 		cork->base.flags |= IPCORK_ALLFRAG;
 	cork->base.length = 0;
@@ -1281,7 +1283,7 @@ static int __ip6_append_data(struct sock *sk,
 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
 	}
 
-	mtu = cork->fragsize;
+	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
 	orig_mtu = mtu;
 
 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
@@ -1329,7 +1331,7 @@ emsgsize:
 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
 	    headersize == sizeof(struct ipv6hdr) &&
 	    length <= mtu - headersize &&
-	    !(flags & MSG_MORE) &&
+	    (!(flags & MSG_MORE) || cork->gso_size) &&
 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
 		csummode = CHECKSUM_PARTIAL;
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 824797f8d1ab..86b7dd58d4b4 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1023,7 +1023,8 @@ static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb,
  *	Sending
  */
 
-static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6)
+static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
+			   struct inet_cork *cork)
 {
 	struct sock *sk = skb->sk;
 	struct udphdr *uh;
@@ -1042,6 +1043,21 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6)
 	uh->len = htons(len);
 	uh->check = 0;
 
+	if (cork->gso_size) {
+		const int hlen = skb_network_header_len(skb) +
+				 sizeof(struct udphdr);
+
+		if (hlen + cork->gso_size > cork->fragsize)
+			return -EINVAL;
+		if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS)
+			return -EINVAL;
+		if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite)
+			return -EIO;
+
+		skb_shinfo(skb)->gso_size = cork->gso_size;
+		skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+	}
+
 	if (is_udplite)
 		csum = udplite_csum(skb);
 	else if (udp_sk(sk)->no_check6_tx) {   /* UDP csum disabled */
@@ -1093,7 +1109,7 @@ static int udp_v6_push_pending_frames(struct sock *sk)
 	if (!skb)
 		goto out;
 
-	err = udp_v6_send_skb(skb, &fl6);
+	err = udp_v6_send_skb(skb, &fl6, &inet_sk(sk)->cork.base);
 
 out:
 	up->len = 0;
@@ -1127,6 +1143,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	ipc6.hlimit = -1;
 	ipc6.tclass = -1;
 	ipc6.dontfrag = -1;
+	ipc6.gso_size = up->gso_size;
 	sockc.tsflags = sk->sk_tsflags;
 
 	/* destination address check */
@@ -1333,7 +1350,7 @@ back_from_confirm:
 				   msg->msg_flags, &cork, &sockc);
 		err = PTR_ERR(skb);
 		if (!IS_ERR_OR_NULL(skb))
-			err = udp_v6_send_skb(skb, &fl6);
+			err = udp_v6_send_skb(skb, &fl6, &cork.base);
 		goto out;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From ad405857b174ed31a97982bb129c320d03321cf5 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 26 Apr 2018 13:42:18 -0400
Subject: udp: better wmem accounting on gso

skb_segment by default transfers allocated wmem from the gso skb
to the tail of the segment list. This underreports real truesize
of the list, especially if the tail might be dropped.

Similar to tcp_gso_segment, update wmem_alloc with the aggregate
list truesize and make each segment responsible for its own
share by setting skb->destructor.

Clear gso_skb->destructor prior to calling skb_segment to skip
the default assignment to tail.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp_offload.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 6c2b7640f6f3..dc5158cba66e 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -191,6 +191,8 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 				  netdev_features_t features,
 				  unsigned int mss, __sum16 check)
 {
+	struct sock *sk = gso_skb->sk;
+	unsigned int sum_truesize = 0;
 	struct sk_buff *segs, *seg;
 	unsigned int hdrlen;
 	struct udphdr *uh;
@@ -201,9 +203,15 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 	hdrlen = gso_skb->data - skb_mac_header(gso_skb);
 	skb_pull(gso_skb, sizeof(*uh));
 
+	/* clear destructor to avoid skb_segment assigning it to tail */
+	WARN_ON_ONCE(gso_skb->destructor != sock_wfree);
+	gso_skb->destructor = NULL;
+
 	segs = skb_segment(gso_skb, features);
-	if (unlikely(IS_ERR_OR_NULL(segs)))
+	if (unlikely(IS_ERR_OR_NULL(segs))) {
+		gso_skb->destructor = sock_wfree;
 		return segs;
+	}
 
 	for (seg = segs; seg; seg = seg->next) {
 		uh = udp_hdr(seg);
@@ -214,8 +222,14 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 		if (!seg->next)
 			csum_replace2(&uh->check, htons(mss),
 				      htons(seg->len - hdrlen - sizeof(*uh)));
+
+		seg->destructor = sock_wfree;
+		seg->sk = sk;
+		sum_truesize += seg->truesize;
 	}
 
+	refcount_add(sum_truesize - gso_skb->truesize, &sk->sk_wmem_alloc);
+
 	return segs;
 }
 EXPORT_SYMBOL_GPL(__udp_gso_segment);
-- 
cgit v1.2.3-59-g8ed1b


From 15e36f5b8e982debe43e425d2e12d34e022d51e9 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 26 Apr 2018 13:42:19 -0400
Subject: udp: paged allocation with gso

When sending large datagrams that are later segmented, store data in
page frags to avoid copying from linear in skb_segment.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c  | 15 +++++++++++----
 net/ipv6/ip6_output.c | 19 ++++++++++++++-----
 2 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index da4abbee10f7..f2338e40c37d 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -878,11 +878,13 @@ static int __ip_append_data(struct sock *sk,
 	struct rtable *rt = (struct rtable *)cork->dst;
 	unsigned int wmem_alloc_delta = 0;
 	u32 tskey = 0;
+	bool paged;
 
 	skb = skb_peek_tail(queue);
 
 	exthdrlen = !skb ? rt->dst.header_len : 0;
 	mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
+	paged = !!cork->gso_size;
 
 	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
 	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
@@ -934,6 +936,7 @@ static int __ip_append_data(struct sock *sk,
 			unsigned int fraglen;
 			unsigned int fraggap;
 			unsigned int alloclen;
+			unsigned int pagedlen = 0;
 			struct sk_buff *skb_prev;
 alloc_new_skb:
 			skb_prev = skb;
@@ -954,8 +957,12 @@ alloc_new_skb:
 			if ((flags & MSG_MORE) &&
 			    !(rt->dst.dev->features&NETIF_F_SG))
 				alloclen = mtu;
-			else
+			else if (!paged)
 				alloclen = fraglen;
+			else {
+				alloclen = min_t(int, fraglen, MAX_HEADER);
+				pagedlen = fraglen - alloclen;
+			}
 
 			alloclen += exthdrlen;
 
@@ -999,7 +1006,7 @@ alloc_new_skb:
 			/*
 			 *	Find where to start putting bytes.
 			 */
-			data = skb_put(skb, fraglen + exthdrlen);
+			data = skb_put(skb, fraglen + exthdrlen - pagedlen);
 			skb_set_network_header(skb, exthdrlen);
 			skb->transport_header = (skb->network_header +
 						 fragheaderlen);
@@ -1015,7 +1022,7 @@ alloc_new_skb:
 				pskb_trim_unique(skb_prev, maxfraglen);
 			}
 
-			copy = datalen - transhdrlen - fraggap;
+			copy = datalen - transhdrlen - fraggap - pagedlen;
 			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
 				err = -EFAULT;
 				kfree_skb(skb);
@@ -1023,7 +1030,7 @@ alloc_new_skb:
 			}
 
 			offset += copy;
-			length -= datalen - fraggap;
+			length -= copy + transhdrlen;
 			transhdrlen = 0;
 			exthdrlen = 0;
 			csummode = CHECKSUM_NONE;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index a1c4a78132d2..dfd8af41824e 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1276,6 +1276,7 @@ static int __ip6_append_data(struct sock *sk,
 	int csummode = CHECKSUM_NONE;
 	unsigned int maxnonfragsize, headersize;
 	unsigned int wmem_alloc_delta = 0;
+	bool paged;
 
 	skb = skb_peek_tail(queue);
 	if (!skb) {
@@ -1283,6 +1284,7 @@ static int __ip6_append_data(struct sock *sk,
 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
 	}
 
+	paged = !!cork->gso_size;
 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
 	orig_mtu = mtu;
 
@@ -1374,6 +1376,7 @@ emsgsize:
 			unsigned int fraglen;
 			unsigned int fraggap;
 			unsigned int alloclen;
+			unsigned int pagedlen = 0;
 alloc_new_skb:
 			/* There's no room in the current skb */
 			if (skb)
@@ -1396,11 +1399,17 @@ alloc_new_skb:
 
 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
+			fraglen = datalen + fragheaderlen;
+
 			if ((flags & MSG_MORE) &&
 			    !(rt->dst.dev->features&NETIF_F_SG))
 				alloclen = mtu;
-			else
-				alloclen = datalen + fragheaderlen;
+			else if (!paged)
+				alloclen = fraglen;
+			else {
+				alloclen = min_t(int, fraglen, MAX_HEADER);
+				pagedlen = fraglen - alloclen;
+			}
 
 			alloclen += dst_exthdrlen;
 
@@ -1422,7 +1431,7 @@ alloc_new_skb:
 			 */
 			alloclen += sizeof(struct frag_hdr);
 
-			copy = datalen - transhdrlen - fraggap;
+			copy = datalen - transhdrlen - fraggap - pagedlen;
 			if (copy < 0) {
 				err = -EINVAL;
 				goto error;
@@ -1461,7 +1470,7 @@ alloc_new_skb:
 			/*
 			 *	Find where to start putting bytes
 			 */
-			data = skb_put(skb, fraglen);
+			data = skb_put(skb, fraglen - pagedlen);
 			skb_set_network_header(skb, exthdrlen);
 			data += fragheaderlen;
 			skb->transport_header = (skb->network_header +
@@ -1484,7 +1493,7 @@ alloc_new_skb:
 			}
 
 			offset += copy;
-			length -= datalen - fraggap;
+			length -= copy + transhdrlen;
 			transhdrlen = 0;
 			exthdrlen = 0;
 			dst_exthdrlen = 0;
-- 
cgit v1.2.3-59-g8ed1b


From 2e8de8576343ab540856082916bfb84d17288b08 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 26 Apr 2018 13:42:20 -0400
Subject: udp: add gso segment cmsg

Allow specifying segment size in the send call.

The new control message performs the same function as socket option
UDP_SEGMENT while avoiding the extra system call.

[ Export udp_cmsg_send for ipv6. -DaveM ]

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/udp.h |  1 +
 net/ipv4/udp.c    | 44 ++++++++++++++++++++++++++++++++++++++++++--
 net/ipv6/udp.c    |  5 ++++-
 3 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/include/net/udp.h b/include/net/udp.h
index 741d888d0fdb..05990746810e 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -273,6 +273,7 @@ int udp_abort(struct sock *sk, int err);
 int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
 int udp_push_pending_frames(struct sock *sk);
 void udp_flush_pending_frames(struct sock *sk);
+int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size);
 void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst);
 int udp_rcv(struct sk_buff *skb);
 int udp_ioctl(struct sock *sk, int cmd, unsigned long arg);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index bda022c5480b..794aeafeb782 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -853,6 +853,43 @@ out:
 }
 EXPORT_SYMBOL(udp_push_pending_frames);
 
+static int __udp_cmsg_send(struct cmsghdr *cmsg, u16 *gso_size)
+{
+	switch (cmsg->cmsg_type) {
+	case UDP_SEGMENT:
+		if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u16)))
+			return -EINVAL;
+		*gso_size = *(__u16 *)CMSG_DATA(cmsg);
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size)
+{
+	struct cmsghdr *cmsg;
+	bool need_ip = false;
+	int err;
+
+	for_each_cmsghdr(cmsg, msg) {
+		if (!CMSG_OK(msg, cmsg))
+			return -EINVAL;
+
+		if (cmsg->cmsg_level != SOL_UDP) {
+			need_ip = true;
+			continue;
+		}
+
+		err = __udp_cmsg_send(cmsg, gso_size);
+		if (err)
+			return err;
+	}
+
+	return need_ip;
+}
+EXPORT_SYMBOL_GPL(udp_cmsg_send);
+
 int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 {
 	struct inet_sock *inet = inet_sk(sk);
@@ -941,8 +978,11 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	ipc.gso_size = up->gso_size;
 
 	if (msg->msg_controllen) {
-		err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6);
-		if (unlikely(err)) {
+		err = udp_cmsg_send(sk, msg, &ipc.gso_size);
+		if (err > 0)
+			err = ip_cmsg_send(sk, msg, &ipc,
+					   sk->sk_family == AF_INET6);
+		if (unlikely(err < 0)) {
 			kfree(ipc.opt);
 			return err;
 		}
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 86b7dd58d4b4..6acfdd3e442b 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1276,7 +1276,10 @@ do_udp_sendmsg:
 		opt->tot_len = sizeof(*opt);
 		ipc6.opt = opt;
 
-		err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6, &sockc);
+		err = udp_cmsg_send(sk, msg, &ipc6.gso_size);
+		if (err > 0)
+			err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6,
+						    &ipc6, &sockc);
 		if (err < 0) {
 			fl6_sock_release(flowlabel);
 			return err;
-- 
cgit v1.2.3-59-g8ed1b


From 83aa025f535f76733e334e3d2a4d8577c8441a7e Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 26 Apr 2018 13:42:21 -0400
Subject: udp: add gso support to virtual devices

Virtual devices such as tunnels and bonding can handle large packets.
Only segment packets when reaching a physical or loopback device.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/netdev-features.txt | 7 +++++++
 include/linux/netdev_features.h              | 5 ++++-
 include/linux/netdevice.h                    | 1 +
 net/core/ethtool.c                           | 1 +
 4 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/Documentation/networking/netdev-features.txt b/Documentation/networking/netdev-features.txt
index c77f9d57eb91..c4a54c162547 100644
--- a/Documentation/networking/netdev-features.txt
+++ b/Documentation/networking/netdev-features.txt
@@ -113,6 +113,13 @@ whatever headers there might be.
 NETIF_F_TSO_ECN means that hardware can properly split packets with CWR bit
 set, be it TCPv4 (when NETIF_F_TSO is enabled) or TCPv6 (NETIF_F_TSO6).
 
+ * Transmit UDP segmentation offload
+
+NETIF_F_GSO_UDP_GSO_L4 accepts a single UDP header with a payload that exceeds
+gso_size. On segmentation, it segments the payload on gso_size boundaries and
+replicates the network and UDP headers (fixing up the last one if less than
+gso_size).
+
  * Transmit DMA from high memory
 
 On platforms where this is relevant, NETIF_F_HIGHDMA signals that
diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 35b79f47a13d..fe2f3b30960e 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -55,8 +55,9 @@ enum {
 	NETIF_F_GSO_SCTP_BIT,		/* ... SCTP fragmentation */
 	NETIF_F_GSO_ESP_BIT,		/* ... ESP with TSO */
 	NETIF_F_GSO_UDP_BIT,		/* ... UFO, deprecated except tuntap */
+	NETIF_F_GSO_UDP_L4_BIT,		/* ... UDP payload GSO (not UFO) */
 	/**/NETIF_F_GSO_LAST =		/* last bit, see GSO_MASK */
-		NETIF_F_GSO_UDP_BIT,
+		NETIF_F_GSO_UDP_L4_BIT,
 
 	NETIF_F_FCOE_CRC_BIT,		/* FCoE CRC32 */
 	NETIF_F_SCTP_CRC_BIT,		/* SCTP checksum offload */
@@ -147,6 +148,7 @@ enum {
 #define NETIF_F_HW_ESP_TX_CSUM	__NETIF_F(HW_ESP_TX_CSUM)
 #define	NETIF_F_RX_UDP_TUNNEL_PORT  __NETIF_F(RX_UDP_TUNNEL_PORT)
 #define NETIF_F_HW_TLS_RECORD	__NETIF_F(HW_TLS_RECORD)
+#define NETIF_F_GSO_UDP_L4	__NETIF_F(GSO_UDP_L4)
 
 #define for_each_netdev_feature(mask_addr, bit)	\
 	for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT)
@@ -216,6 +218,7 @@ enum {
 				 NETIF_F_GSO_GRE_CSUM |			\
 				 NETIF_F_GSO_IPXIP4 |			\
 				 NETIF_F_GSO_IPXIP6 |			\
+				 NETIF_F_GSO_UDP_L4 |			\
 				 NETIF_F_GSO_UDP_TUNNEL |		\
 				 NETIF_F_GSO_UDP_TUNNEL_CSUM)
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 14e0777ffcfb..366c32891158 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4186,6 +4186,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type)
 	BUILD_BUG_ON(SKB_GSO_SCTP    != (NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_ESP != (NETIF_F_GSO_ESP >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_GSO_UDP >> NETIF_F_GSO_SHIFT));
+	BUILD_BUG_ON(SKB_GSO_UDP_L4 != (NETIF_F_GSO_UDP_L4 >> NETIF_F_GSO_SHIFT));
 
 	return (features & feature) == feature;
 }
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 03416e6dd5d7..4650fd6d678c 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -92,6 +92,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_GSO_PARTIAL_BIT] =	 "tx-gso-partial",
 	[NETIF_F_GSO_SCTP_BIT] =	 "tx-sctp-segmentation",
 	[NETIF_F_GSO_ESP_BIT] =		 "tx-esp-segmentation",
+	[NETIF_F_GSO_UDP_L4_BIT] =	 "tx-udp-segmentation",
 
 	[NETIF_F_FCOE_CRC_BIT] =         "tx-checksum-fcoe-crc",
 	[NETIF_F_SCTP_CRC_BIT] =        "tx-checksum-sctp",
-- 
cgit v1.2.3-59-g8ed1b


From a160725780e3e086cf936722b0267408f17fceaa Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 26 Apr 2018 13:42:22 -0400
Subject: selftests: udp gso

Validate udp gso, including edge cases (such as min/max gso sizes).

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/.gitignore |   1 +
 tools/testing/selftests/net/Makefile   |   3 +-
 tools/testing/selftests/net/udpgso.c   | 485 +++++++++++++++++++++++++++++++++
 tools/testing/selftests/net/udpgso.sh  |  16 ++
 4 files changed, 504 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/net/udpgso.c
 create mode 100755 tools/testing/selftests/net/udpgso.sh

diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index 5559f6add046..67a26240407c 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -8,3 +8,4 @@ reuseport_bpf_numa
 reuseport_dualstack
 reuseaddr_conflict
 tcp_mmap
+udpgso
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index c3761c35f542..db3303348315 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -5,12 +5,13 @@ CFLAGS =  -Wall -Wl,--no-as-needed -O2 -g
 CFLAGS += -I../../../../usr/include/
 
 TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh
-TEST_PROGS += fib_tests.sh fib-onlink-tests.sh in_netns.sh pmtu.sh
+TEST_PROGS += fib_tests.sh fib-onlink-tests.sh in_netns.sh pmtu.sh udpgso.sh
 TEST_GEN_FILES =  socket
 TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy
 TEST_GEN_FILES += tcp_mmap
 TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
 TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict
+TEST_GEN_PROGS += udpgso
 
 include ../lib.mk
 
diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c
new file mode 100644
index 000000000000..58cabfc66bf8
--- /dev/null
+++ b/tools/testing/selftests/net/udpgso.c
@@ -0,0 +1,485 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <stddef.h>
+#include <arpa/inet.h>
+#include <error.h>
+#include <errno.h>
+#include <net/if.h>
+#include <linux/in.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <netinet/if_ether.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/udp.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#ifndef ETH_MAX_MTU
+#define ETH_MAX_MTU	0xFFFFU
+#endif
+
+#ifndef UDP_SEGMENT
+#define UDP_SEGMENT		103
+#endif
+
+#define CONST_MTU_TEST	1500
+
+#define CONST_HDRLEN_V4		(sizeof(struct iphdr) + sizeof(struct udphdr))
+#define CONST_HDRLEN_V6		(sizeof(struct ip6_hdr) + sizeof(struct udphdr))
+
+#define CONST_MSS_V4		(CONST_MTU_TEST - CONST_HDRLEN_V4)
+#define CONST_MSS_V6		(CONST_MTU_TEST - CONST_HDRLEN_V6)
+
+#define CONST_MAX_SEGS_V4	(ETH_MAX_MTU / CONST_MSS_V4)
+#define CONST_MAX_SEGS_V6	(ETH_MAX_MTU / CONST_MSS_V6)
+
+static bool		cfg_do_ipv4;
+static bool		cfg_do_ipv6;
+static bool		cfg_do_connectionless;
+static bool		cfg_do_setsockopt;
+static int		cfg_specific_test_id = -1;
+
+static const char	cfg_ifname[] = "lo";
+static unsigned short	cfg_port = 9000;
+
+static char buf[ETH_MAX_MTU];
+
+struct testcase {
+	int tlen;		/* send() buffer size, may exceed mss */
+	bool tfail;		/* send() call is expected to fail */
+	int gso_len;		/* mss after applying gso */
+	int r_num_mss;		/* recv(): number of calls of full mss */
+	int r_len_last;		/* recv(): size of last non-mss dgram, if any */
+};
+
+const struct in6_addr addr6 = IN6ADDR_LOOPBACK_INIT;
+const struct in_addr addr4 = { .s_addr = __constant_htonl(INADDR_LOOPBACK + 2) };
+
+struct testcase testcases_v4[] = {
+	{
+		/* no GSO: send a single byte */
+		.tlen = 1,
+		.r_len_last = 1,
+	},
+	{
+		/* no GSO: send a single MSS */
+		.tlen = CONST_MSS_V4,
+		.r_num_mss = 1,
+	},
+	{
+		/* no GSO: send a single MSS + 1B: fail */
+		.tlen = CONST_MSS_V4 + 1,
+		.tfail = true,
+	},
+	{
+		/* send a single MSS: will fail with GSO, because the segment
+		 * logic in udp4_ufo_fragment demands a gso skb to be > MTU
+		 */
+		.tlen = CONST_MSS_V4,
+		.gso_len = CONST_MSS_V4,
+		.tfail = true,
+		.r_num_mss = 1,
+	},
+	{
+		/* send a single MSS + 1B */
+		.tlen = CONST_MSS_V4 + 1,
+		.gso_len = CONST_MSS_V4,
+		.r_num_mss = 1,
+		.r_len_last = 1,
+	},
+	{
+		/* send exactly 2 MSS */
+		.tlen = CONST_MSS_V4 * 2,
+		.gso_len = CONST_MSS_V4,
+		.r_num_mss = 2,
+	},
+	{
+		/* send 2 MSS + 1B */
+		.tlen = (CONST_MSS_V4 * 2) + 1,
+		.gso_len = CONST_MSS_V4,
+		.r_num_mss = 2,
+		.r_len_last = 1,
+	},
+	{
+		/* send MAX segs */
+		.tlen = (ETH_MAX_MTU / CONST_MSS_V4) * CONST_MSS_V4,
+		.gso_len = CONST_MSS_V4,
+		.r_num_mss = (ETH_MAX_MTU / CONST_MSS_V4),
+	},
+
+	{
+		/* send MAX bytes */
+		.tlen = ETH_MAX_MTU - CONST_HDRLEN_V4,
+		.gso_len = CONST_MSS_V4,
+		.r_num_mss = CONST_MAX_SEGS_V4,
+		.r_len_last = ETH_MAX_MTU - CONST_HDRLEN_V4 -
+			      (CONST_MAX_SEGS_V4 * CONST_MSS_V4),
+	},
+	{
+		/* send MAX + 1: fail */
+		.tlen = ETH_MAX_MTU - CONST_HDRLEN_V4 + 1,
+		.gso_len = CONST_MSS_V4,
+		.tfail = true,
+	},
+	{
+		/* EOL */
+	}
+};
+
+#ifndef IP6_MAX_MTU
+#define IP6_MAX_MTU	(ETH_MAX_MTU + sizeof(struct ip6_hdr))
+#endif
+
+struct testcase testcases_v6[] = {
+	{
+		/* no GSO: send a single byte */
+		.tlen = 1,
+		.r_len_last = 1,
+	},
+	{
+		/* no GSO: send a single MSS */
+		.tlen = CONST_MSS_V6,
+		.r_num_mss = 1,
+	},
+	{
+		/* no GSO: send a single MSS + 1B: fail */
+		.tlen = CONST_MSS_V6 + 1,
+		.tfail = true,
+	},
+	{
+		/* send a single MSS: will fail with GSO, because the segment
+		 * logic in udp4_ufo_fragment demands a gso skb to be > MTU
+		 */
+		.tlen = CONST_MSS_V6,
+		.gso_len = CONST_MSS_V6,
+		.tfail = true,
+		.r_num_mss = 1,
+	},
+	{
+		/* send a single MSS + 1B */
+		.tlen = CONST_MSS_V6 + 1,
+		.gso_len = CONST_MSS_V6,
+		.r_num_mss = 1,
+		.r_len_last = 1,
+	},
+	{
+		/* send exactly 2 MSS */
+		.tlen = CONST_MSS_V6 * 2,
+		.gso_len = CONST_MSS_V6,
+		.r_num_mss = 2,
+	},
+	{
+		/* send 2 MSS + 1B */
+		.tlen = (CONST_MSS_V6 * 2) + 1,
+		.gso_len = CONST_MSS_V6,
+		.r_num_mss = 2,
+		.r_len_last = 1,
+	},
+	{
+		/* send MAX segs */
+		.tlen = (IP6_MAX_MTU / CONST_MSS_V6) * CONST_MSS_V6,
+		.gso_len = CONST_MSS_V6,
+		.r_num_mss = (IP6_MAX_MTU / CONST_MSS_V6),
+	},
+
+	{
+		/* send MAX bytes */
+		.tlen = IP6_MAX_MTU - CONST_HDRLEN_V6,
+		.gso_len = CONST_MSS_V6,
+		.r_num_mss = CONST_MAX_SEGS_V6,
+		.r_len_last = IP6_MAX_MTU - CONST_HDRLEN_V6 -
+			      (CONST_MAX_SEGS_V6 * CONST_MSS_V6),
+	},
+	{
+		/* send MAX + 1: fail */
+		.tlen = IP6_MAX_MTU - CONST_HDRLEN_V6 + 1,
+		.gso_len = CONST_MSS_V6,
+		.tfail = true,
+	},
+	{
+		/* EOL */
+	}
+};
+
+static unsigned int get_device_mtu(int fd, const char *ifname)
+{
+	struct ifreq ifr;
+
+	memset(&ifr, 0, sizeof(ifr));
+
+	strcpy(ifr.ifr_name, ifname);
+
+	if (ioctl(fd, SIOCGIFMTU, &ifr))
+		error(1, errno, "ioctl get mtu");
+
+	return ifr.ifr_mtu;
+}
+
+static void __set_device_mtu(int fd, const char *ifname, unsigned int mtu)
+{
+	struct ifreq ifr;
+
+	memset(&ifr, 0, sizeof(ifr));
+
+	ifr.ifr_mtu = mtu;
+	strcpy(ifr.ifr_name, ifname);
+
+	if (ioctl(fd, SIOCSIFMTU, &ifr))
+		error(1, errno, "ioctl set mtu");
+}
+
+static void set_device_mtu(int fd, int mtu)
+{
+	int val;
+
+	val = get_device_mtu(fd, cfg_ifname);
+	fprintf(stderr, "device mtu (orig): %u\n", val);
+
+	__set_device_mtu(fd, cfg_ifname, mtu);
+	val = get_device_mtu(fd, cfg_ifname);
+	if (val != mtu)
+		error(1, 0, "unable to set device mtu to %u\n", val);
+
+	fprintf(stderr, "device mtu (test): %u\n", val);
+}
+
+static void set_pmtu_discover(int fd, bool is_ipv4)
+{
+	int level, name, val;
+
+	if (is_ipv4) {
+		level	= SOL_IP;
+		name	= IP_MTU_DISCOVER;
+		val	= IP_PMTUDISC_DO;
+	} else {
+		level	= SOL_IPV6;
+		name	= IPV6_MTU_DISCOVER;
+		val	= IPV6_PMTUDISC_DO;
+	}
+
+	if (setsockopt(fd, level, name, &val, sizeof(val)))
+		error(1, errno, "setsockopt path mtu");
+}
+
+static bool send_one(int fd, int len, int gso_len,
+		     struct sockaddr *addr, socklen_t alen)
+{
+	char control[CMSG_SPACE(sizeof(uint16_t))] = {0};
+	struct msghdr msg = {0};
+	struct iovec iov = {0};
+	struct cmsghdr *cm;
+	int ret;
+
+	iov.iov_base = buf;
+	iov.iov_len = len;
+
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+
+	msg.msg_name = addr;
+	msg.msg_namelen = alen;
+
+	if (gso_len && !cfg_do_setsockopt) {
+		msg.msg_control = control;
+		msg.msg_controllen = sizeof(control);
+
+		cm = CMSG_FIRSTHDR(&msg);
+		cm->cmsg_level = SOL_UDP;
+		cm->cmsg_type = UDP_SEGMENT;
+		cm->cmsg_len = CMSG_LEN(sizeof(uint16_t));
+		*((uint16_t *) CMSG_DATA(cm)) = gso_len;
+	}
+
+	ret = sendmsg(fd, &msg, 0);
+	if (ret == -1 && (errno == EMSGSIZE || errno == ENOMEM))
+		return false;
+	if (ret == -1)
+		error(1, errno, "sendmsg");
+	if (ret != len)
+		error(1, 0, "sendto: %d != %u", ret, len);
+
+	return true;
+}
+
+static int recv_one(int fd, int flags)
+{
+	int ret;
+
+	ret = recv(fd, buf, sizeof(buf), flags);
+	if (ret == -1 && errno == EAGAIN && (flags & MSG_DONTWAIT))
+		return 0;
+	if (ret == -1)
+		error(1, errno, "recv");
+
+	return ret;
+}
+
+static void run_one(struct testcase *test, int fdt, int fdr,
+		    struct sockaddr *addr, socklen_t alen)
+{
+	int i, ret, val, mss;
+	bool sent;
+
+	fprintf(stderr, "ipv%d tx:%d gso:%d %s\n",
+			addr->sa_family == AF_INET ? 4 : 6,
+			test->tlen, test->gso_len,
+			test->tfail ? "(fail)" : "");
+
+	val = test->gso_len;
+	if (cfg_do_setsockopt) {
+		if (setsockopt(fdt, SOL_UDP, UDP_SEGMENT, &val, sizeof(val)))
+			error(1, errno, "setsockopt udp segment");
+	}
+
+	sent = send_one(fdt, test->tlen, test->gso_len, addr, alen);
+	if (sent && test->tfail)
+		error(1, 0, "send succeeded while expecting failure");
+	if (!sent && !test->tfail)
+		error(1, 0, "send failed while expecting success");
+	if (!sent)
+		return;
+
+	mss = addr->sa_family == AF_INET ? CONST_MSS_V4 : CONST_MSS_V6;
+
+	/* Recv all full MSS datagrams */
+	for (i = 0; i < test->r_num_mss; i++) {
+		ret = recv_one(fdr, 0);
+		if (ret != mss)
+			error(1, 0, "recv.%d: %d != %d", i, ret, mss);
+	}
+
+	/* Recv the non-full last datagram, if tlen was not a multiple of mss */
+	if (test->r_len_last) {
+		ret = recv_one(fdr, 0);
+		if (ret != test->r_len_last)
+			error(1, 0, "recv.%d: %d != %d (last)",
+			      i, ret, test->r_len_last);
+	}
+
+	/* Verify received all data */
+	ret = recv_one(fdr, MSG_DONTWAIT);
+	if (ret)
+		error(1, 0, "recv: unexpected datagram");
+}
+
+static void run_all(int fdt, int fdr, struct sockaddr *addr, socklen_t alen)
+{
+	struct testcase *tests, *test;
+
+	tests = addr->sa_family == AF_INET ? testcases_v4 : testcases_v6;
+
+	for (test = tests; test->tlen; test++) {
+		/* if a specific test is given, then skip all others */
+		if (cfg_specific_test_id == -1 ||
+		    cfg_specific_test_id == test - tests)
+			run_one(test, fdt, fdr, addr, alen);
+	}
+}
+
+static void run_test(struct sockaddr *addr, socklen_t alen)
+{
+	struct timeval tv = { .tv_usec = 100 * 1000 };
+	int fdr, fdt;
+
+	fdr = socket(addr->sa_family, SOCK_DGRAM, 0);
+	if (fdr == -1)
+		error(1, errno, "socket r");
+
+	if (bind(fdr, addr, alen))
+		error(1, errno, "bind");
+
+	/* Have tests fail quickly instead of hang */
+	if (setsockopt(fdr, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)))
+		error(1, errno, "setsockopt rcv timeout");
+
+	fdt = socket(addr->sa_family, SOCK_DGRAM, 0);
+	if (fdt == -1)
+		error(1, errno, "socket t");
+
+	/* Do not fragment these datagrams: only succeed if GSO works */
+	set_pmtu_discover(fdt, addr->sa_family == AF_INET);
+
+	if (cfg_do_connectionless) {
+		set_device_mtu(fdt, CONST_MTU_TEST);
+		run_all(fdt, fdr, addr, alen);
+	}
+
+	if (close(fdt))
+		error(1, errno, "close t");
+	if (close(fdr))
+		error(1, errno, "close r");
+}
+
+static void run_test_v4(void)
+{
+	struct sockaddr_in addr = {0};
+
+	addr.sin_family = AF_INET;
+	addr.sin_port = htons(cfg_port);
+	addr.sin_addr = addr4;
+
+	run_test((void *)&addr, sizeof(addr));
+}
+
+static void run_test_v6(void)
+{
+	struct sockaddr_in6 addr = {0};
+
+	addr.sin6_family = AF_INET6;
+	addr.sin6_port = htons(cfg_port);
+	addr.sin6_addr = addr6;
+
+	run_test((void *)&addr, sizeof(addr));
+}
+
+static void parse_opts(int argc, char **argv)
+{
+	int c;
+
+	while ((c = getopt(argc, argv, "46Cst:")) != -1) {
+		switch (c) {
+		case '4':
+			cfg_do_ipv4 = true;
+			break;
+		case '6':
+			cfg_do_ipv6 = true;
+			break;
+		case 'C':
+			cfg_do_connectionless = true;
+			break;
+		case 's':
+			cfg_do_setsockopt = true;
+			break;
+		case 't':
+			cfg_specific_test_id = strtoul(optarg, NULL, 0);
+			break;
+		default:
+			error(1, 0, "%s: parse error", argv[0]);
+		}
+	}
+}
+
+int main(int argc, char **argv)
+{
+	parse_opts(argc, argv);
+
+	if (cfg_do_ipv4)
+		run_test_v4();
+	if (cfg_do_ipv6)
+		run_test_v6();
+
+	fprintf(stderr, "OK\n");
+	return 0;
+}
diff --git a/tools/testing/selftests/net/udpgso.sh b/tools/testing/selftests/net/udpgso.sh
new file mode 100755
index 000000000000..7977b97e060c
--- /dev/null
+++ b/tools/testing/selftests/net/udpgso.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run a series of udpgso regression tests
+
+echo "ipv4 cmsg"
+./in_netns.sh ./udpgso -4 -C
+
+echo "ipv4 setsockopt"
+./in_netns.sh ./udpgso -4 -C -s
+
+echo "ipv6 cmsg"
+./in_netns.sh ./udpgso -6 -C
+
+echo "ipv6 setsockopt"
+./in_netns.sh ./udpgso -6 -C -s
-- 
cgit v1.2.3-59-g8ed1b


From e5b2d91c2d12c197e9863def909a46302b45aa01 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 26 Apr 2018 13:42:23 -0400
Subject: selftests: udp gso with connected sockets

Connected sockets use path mtu instead of device mtu.

Test this path by inserting a route mtu that is lower than the device
mtu. Verify that the path mtu for the connection matches this lower
number, then run the same test as in the connectionless case.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/udpgso.c  | 117 +++++++++++++++++++++++++++++++++-
 tools/testing/selftests/net/udpgso.sh |   7 ++
 2 files changed, 122 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c
index 58cabfc66bf8..52ebad2abea9 100644
--- a/tools/testing/selftests/net/udpgso.c
+++ b/tools/testing/selftests/net/udpgso.c
@@ -47,6 +47,7 @@
 
 static bool		cfg_do_ipv4;
 static bool		cfg_do_ipv6;
+static bool		cfg_do_connected;
 static bool		cfg_do_connectionless;
 static bool		cfg_do_setsockopt;
 static int		cfg_specific_test_id = -1;
@@ -273,6 +274,101 @@ static void set_pmtu_discover(int fd, bool is_ipv4)
 		error(1, errno, "setsockopt path mtu");
 }
 
+static unsigned int get_path_mtu(int fd, bool is_ipv4)
+{
+	socklen_t vallen;
+	unsigned int mtu;
+	int ret;
+
+	vallen = sizeof(mtu);
+	if (is_ipv4)
+		ret = getsockopt(fd, SOL_IP, IP_MTU, &mtu, &vallen);
+	else
+		ret = getsockopt(fd, SOL_IPV6, IPV6_MTU, &mtu, &vallen);
+
+	if (ret)
+		error(1, errno, "getsockopt mtu");
+
+
+	fprintf(stderr, "path mtu (read):  %u\n", mtu);
+	return mtu;
+}
+
+/* very wordy version of system("ip route add dev lo mtu 1500 127.0.0.3/32") */
+static void set_route_mtu(int mtu, bool is_ipv4)
+{
+	struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
+	struct nlmsghdr *nh;
+	struct rtattr *rta;
+	struct rtmsg *rt;
+	char data[NLMSG_ALIGN(sizeof(*nh)) +
+		  NLMSG_ALIGN(sizeof(*rt)) +
+		  NLMSG_ALIGN(RTA_LENGTH(sizeof(addr6))) +
+		  NLMSG_ALIGN(RTA_LENGTH(sizeof(int))) +
+		  NLMSG_ALIGN(RTA_LENGTH(0) + RTA_LENGTH(sizeof(int)))];
+	int fd, ret, alen, off = 0;
+
+	alen = is_ipv4 ? sizeof(addr4) : sizeof(addr6);
+
+	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+	if (fd == -1)
+		error(1, errno, "socket netlink");
+
+	memset(data, 0, sizeof(data));
+
+	nh = (void *)data;
+	nh->nlmsg_type = RTM_NEWROUTE;
+	nh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE;
+	off += NLMSG_ALIGN(sizeof(*nh));
+
+	rt = (void *)(data + off);
+	rt->rtm_family = is_ipv4 ? AF_INET : AF_INET6;
+	rt->rtm_table = RT_TABLE_MAIN;
+	rt->rtm_dst_len = alen << 3;
+	rt->rtm_protocol = RTPROT_BOOT;
+	rt->rtm_scope = RT_SCOPE_UNIVERSE;
+	rt->rtm_type = RTN_UNICAST;
+	off += NLMSG_ALIGN(sizeof(*rt));
+
+	rta = (void *)(data + off);
+	rta->rta_type = RTA_DST;
+	rta->rta_len = RTA_LENGTH(alen);
+	if (is_ipv4)
+		memcpy(RTA_DATA(rta), &addr4, alen);
+	else
+		memcpy(RTA_DATA(rta), &addr6, alen);
+	off += NLMSG_ALIGN(rta->rta_len);
+
+	rta = (void *)(data + off);
+	rta->rta_type = RTA_OIF;
+	rta->rta_len = RTA_LENGTH(sizeof(int));
+	*((int *)(RTA_DATA(rta))) = 1; //if_nametoindex("lo");
+	off += NLMSG_ALIGN(rta->rta_len);
+
+	/* MTU is a subtype in a metrics type */
+	rta = (void *)(data + off);
+	rta->rta_type = RTA_METRICS;
+	rta->rta_len = RTA_LENGTH(0) + RTA_LENGTH(sizeof(int));
+	off += NLMSG_ALIGN(rta->rta_len);
+
+	/* now fill MTU subtype. Note that it fits within above rta_len */
+	rta = (void *)(((char *) rta) + RTA_LENGTH(0));
+	rta->rta_type = RTAX_MTU;
+	rta->rta_len = RTA_LENGTH(sizeof(int));
+	*((int *)(RTA_DATA(rta))) = mtu;
+
+	nh->nlmsg_len = off;
+
+	ret = sendto(fd, data, off, 0, (void *)&nladdr, sizeof(nladdr));
+	if (ret != off)
+		error(1, errno, "send netlink: %uB != %uB\n", ret, off);
+
+	if (close(fd))
+		error(1, errno, "close netlink");
+
+	fprintf(stderr, "route mtu (test): %u\n", mtu);
+}
+
 static bool send_one(int fd, int len, int gso_len,
 		     struct sockaddr *addr, socklen_t alen)
 {
@@ -391,7 +487,7 @@ static void run_all(int fdt, int fdr, struct sockaddr *addr, socklen_t alen)
 static void run_test(struct sockaddr *addr, socklen_t alen)
 {
 	struct timeval tv = { .tv_usec = 100 * 1000 };
-	int fdr, fdt;
+	int fdr, fdt, val;
 
 	fdr = socket(addr->sa_family, SOCK_DGRAM, 0);
 	if (fdr == -1)
@@ -416,6 +512,20 @@ static void run_test(struct sockaddr *addr, socklen_t alen)
 		run_all(fdt, fdr, addr, alen);
 	}
 
+	if (cfg_do_connected) {
+		set_device_mtu(fdt, CONST_MTU_TEST + 100);
+		set_route_mtu(CONST_MTU_TEST, addr->sa_family == AF_INET);
+
+		if (connect(fdt, addr, alen))
+			error(1, errno, "connect");
+
+		val = get_path_mtu(fdt, addr->sa_family == AF_INET);
+		if (val != CONST_MTU_TEST)
+			error(1, 0, "bad path mtu %u\n", val);
+
+		run_all(fdt, fdr, addr, 0 /* use connected addr */);
+	}
+
 	if (close(fdt))
 		error(1, errno, "close t");
 	if (close(fdr))
@@ -448,7 +558,7 @@ static void parse_opts(int argc, char **argv)
 {
 	int c;
 
-	while ((c = getopt(argc, argv, "46Cst:")) != -1) {
+	while ((c = getopt(argc, argv, "46cCst:")) != -1) {
 		switch (c) {
 		case '4':
 			cfg_do_ipv4 = true;
@@ -456,6 +566,9 @@ static void parse_opts(int argc, char **argv)
 		case '6':
 			cfg_do_ipv6 = true;
 			break;
+		case 'c':
+			cfg_do_connected = true;
+			break;
 		case 'C':
 			cfg_do_connectionless = true;
 			break;
diff --git a/tools/testing/selftests/net/udpgso.sh b/tools/testing/selftests/net/udpgso.sh
index 7977b97e060c..7cdf0e7c1dde 100755
--- a/tools/testing/selftests/net/udpgso.sh
+++ b/tools/testing/selftests/net/udpgso.sh
@@ -14,3 +14,10 @@ echo "ipv6 cmsg"
 
 echo "ipv6 setsockopt"
 ./in_netns.sh ./udpgso -6 -C -s
+
+echo "ipv4 connected"
+./in_netns.sh ./udpgso -4 -c
+
+# blocked on 2nd loopback address
+# echo "ipv6 connected"
+# ./in_netns.sh ./udpgso -6 -c
-- 
cgit v1.2.3-59-g8ed1b


From 3f12817fe3827b83e757617c2ff99f0ab088f5b9 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 26 Apr 2018 13:42:24 -0400
Subject: selftests: udp gso with corking

Corked sockets take a different path to construct a udp datagram than
the lockless fast path. Test this alternate path.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/udpgso.c  | 42 ++++++++++++++++++++++++++---------
 tools/testing/selftests/net/udpgso.sh |  6 +++++
 2 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c
index 52ebad2abea9..48a0592db938 100644
--- a/tools/testing/selftests/net/udpgso.c
+++ b/tools/testing/selftests/net/udpgso.c
@@ -49,6 +49,7 @@ static bool		cfg_do_ipv4;
 static bool		cfg_do_ipv6;
 static bool		cfg_do_connected;
 static bool		cfg_do_connectionless;
+static bool		cfg_do_msgmore;
 static bool		cfg_do_setsockopt;
 static int		cfg_specific_test_id = -1;
 
@@ -369,6 +370,23 @@ static void set_route_mtu(int mtu, bool is_ipv4)
 	fprintf(stderr, "route mtu (test): %u\n", mtu);
 }
 
+static bool __send_one(int fd, struct msghdr *msg, int flags)
+{
+	int ret;
+
+	ret = sendmsg(fd, msg, flags);
+	if (ret == -1 && (errno == EMSGSIZE || errno == ENOMEM))
+		return false;
+	if (ret == -1)
+		error(1, errno, "sendmsg");
+	if (ret != msg->msg_iov->iov_len)
+		error(1, 0, "sendto: %d != %lu", ret, msg->msg_iov->iov_len);
+	if (msg->msg_flags)
+		error(1, 0, "sendmsg: return flags 0x%x\n", msg->msg_flags);
+
+	return true;
+}
+
 static bool send_one(int fd, int len, int gso_len,
 		     struct sockaddr *addr, socklen_t alen)
 {
@@ -376,7 +394,6 @@ static bool send_one(int fd, int len, int gso_len,
 	struct msghdr msg = {0};
 	struct iovec iov = {0};
 	struct cmsghdr *cm;
-	int ret;
 
 	iov.iov_base = buf;
 	iov.iov_len = len;
@@ -398,15 +415,17 @@ static bool send_one(int fd, int len, int gso_len,
 		*((uint16_t *) CMSG_DATA(cm)) = gso_len;
 	}
 
-	ret = sendmsg(fd, &msg, 0);
-	if (ret == -1 && (errno == EMSGSIZE || errno == ENOMEM))
-		return false;
-	if (ret == -1)
-		error(1, errno, "sendmsg");
-	if (ret != len)
-		error(1, 0, "sendto: %d != %u", ret, len);
+	/* If MSG_MORE, send 1 byte followed by remainder */
+	if (cfg_do_msgmore && len > 1) {
+		iov.iov_len = 1;
+		if (!__send_one(fd, &msg, MSG_MORE))
+			error(1, 0, "send 1B failed");
 
-	return true;
+		iov.iov_base++;
+		iov.iov_len = len - 1;
+	}
+
+	return __send_one(fd, &msg, 0);
 }
 
 static int recv_one(int fd, int flags)
@@ -558,7 +577,7 @@ static void parse_opts(int argc, char **argv)
 {
 	int c;
 
-	while ((c = getopt(argc, argv, "46cCst:")) != -1) {
+	while ((c = getopt(argc, argv, "46cCmst:")) != -1) {
 		switch (c) {
 		case '4':
 			cfg_do_ipv4 = true;
@@ -572,6 +591,9 @@ static void parse_opts(int argc, char **argv)
 		case 'C':
 			cfg_do_connectionless = true;
 			break;
+		case 'm':
+			cfg_do_msgmore = true;
+			break;
 		case 's':
 			cfg_do_setsockopt = true;
 			break;
diff --git a/tools/testing/selftests/net/udpgso.sh b/tools/testing/selftests/net/udpgso.sh
index 7cdf0e7c1dde..fec24f584fe9 100755
--- a/tools/testing/selftests/net/udpgso.sh
+++ b/tools/testing/selftests/net/udpgso.sh
@@ -21,3 +21,9 @@ echo "ipv4 connected"
 # blocked on 2nd loopback address
 # echo "ipv6 connected"
 # ./in_netns.sh ./udpgso -6 -c
+
+echo "ipv4 msg_more"
+./in_netns.sh ./udpgso -4 -C -m
+
+echo "ipv6 msg_more"
+./in_netns.sh ./udpgso -6 -C -m
-- 
cgit v1.2.3-59-g8ed1b


From 3a687bef148d94da13dc25981bb7b1e26a1ab125 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 26 Apr 2018 13:42:25 -0400
Subject: selftests: udp gso benchmark

Send udp data between a source and sink, optionally with udp gso.
The two processes are expected to be run on separate hosts.

A script is included that runs them together over loopback in a
single namespace for functionality testing.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/.gitignore        |   2 +
 tools/testing/selftests/net/Makefile          |   3 +-
 tools/testing/selftests/net/udpgso_bench.sh   |  74 +++++
 tools/testing/selftests/net/udpgso_bench_rx.c | 265 ++++++++++++++++
 tools/testing/selftests/net/udpgso_bench_tx.c | 420 ++++++++++++++++++++++++++
 5 files changed, 763 insertions(+), 1 deletion(-)
 create mode 100755 tools/testing/selftests/net/udpgso_bench.sh
 create mode 100644 tools/testing/selftests/net/udpgso_bench_rx.c
 create mode 100644 tools/testing/selftests/net/udpgso_bench_tx.c

diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index 67a26240407c..f0e6c35a93ae 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -9,3 +9,5 @@ reuseport_dualstack
 reuseaddr_conflict
 tcp_mmap
 udpgso
+udpgso_bench_rx
+udpgso_bench_tx
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index db3303348315..df9102ec7b7a 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -6,12 +6,13 @@ CFLAGS += -I../../../../usr/include/
 
 TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh
 TEST_PROGS += fib_tests.sh fib-onlink-tests.sh in_netns.sh pmtu.sh udpgso.sh
+TEST_PROGS += udpgso_bench.sh
 TEST_GEN_FILES =  socket
 TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy
 TEST_GEN_FILES += tcp_mmap
 TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
 TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict
-TEST_GEN_PROGS += udpgso
+TEST_GEN_PROGS += udpgso udpgso_bench_tx udpgso_bench_rx
 
 include ../lib.mk
 
diff --git a/tools/testing/selftests/net/udpgso_bench.sh b/tools/testing/selftests/net/udpgso_bench.sh
new file mode 100755
index 000000000000..792fa4d0285e
--- /dev/null
+++ b/tools/testing/selftests/net/udpgso_bench.sh
@@ -0,0 +1,74 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run a series of udpgso benchmarks
+
+wake_children() {
+	local -r jobs="$(jobs -p)"
+
+	if [[ "${jobs}" != "" ]]; then
+		kill -1 ${jobs} 2>/dev/null
+	fi
+}
+trap wake_children EXIT
+
+run_one() {
+	local -r args=$@
+
+	./udpgso_bench_rx &
+	./udpgso_bench_rx -t &
+
+	./udpgso_bench_tx ${args}
+}
+
+run_in_netns() {
+	local -r args=$@
+
+	./in_netns.sh $0 __subprocess ${args}
+}
+
+run_udp() {
+	local -r args=$@
+
+	echo "udp"
+	run_in_netns ${args}
+
+	echo "udp gso"
+	run_in_netns ${args} -S
+
+	echo "udp gso zerocopy"
+	run_in_netns ${args} -S -z
+}
+
+run_tcp() {
+	local -r args=$@
+
+	echo "tcp"
+	run_in_netns ${args} -t
+
+	echo "tcp zerocopy"
+	run_in_netns ${args} -t -z
+}
+
+run_all() {
+	local -r core_args="-l 4"
+	local -r ipv4_args="${core_args} -4 -D 127.0.0.1"
+	local -r ipv6_args="${core_args} -6 -D ::1"
+
+	echo "ipv4"
+	run_tcp "${ipv4_args}"
+	run_udp "${ipv4_args}"
+
+	echo "ipv6"
+	run_tcp "${ipv4_args}"
+	run_udp "${ipv6_args}"
+}
+
+if [[ $# -eq 0 ]]; then
+	run_all
+elif [[ $1 == "__subprocess" ]]; then
+	shift
+	run_one $@
+else
+	run_in_netns $@
+fi
diff --git a/tools/testing/selftests/net/udpgso_bench_rx.c b/tools/testing/selftests/net/udpgso_bench_rx.c
new file mode 100644
index 000000000000..727cf67a3f75
--- /dev/null
+++ b/tools/testing/selftests/net/udpgso_bench_rx.c
@@ -0,0 +1,265 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <error.h>
+#include <errno.h>
+#include <limits.h>
+#include <linux/errqueue.h>
+#include <linux/if_packet.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+#include <poll.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+static int  cfg_port		= 8000;
+static bool cfg_tcp;
+static bool cfg_verify;
+
+static bool interrupted;
+static unsigned long packets, bytes;
+
+static void sigint_handler(int signum)
+{
+	if (signum == SIGINT)
+		interrupted = true;
+}
+
+static unsigned long gettimeofday_ms(void)
+{
+	struct timeval tv;
+
+	gettimeofday(&tv, NULL);
+	return (tv.tv_sec * 1000) + (tv.tv_usec / 1000);
+}
+
+static void do_poll(int fd)
+{
+	struct pollfd pfd;
+	int ret;
+
+	pfd.events = POLLIN;
+	pfd.revents = 0;
+	pfd.fd = fd;
+
+	do {
+		ret = poll(&pfd, 1, 10);
+		if (ret == -1)
+			error(1, errno, "poll");
+		if (ret == 0)
+			continue;
+		if (pfd.revents != POLLIN)
+			error(1, errno, "poll: 0x%x expected 0x%x\n",
+					pfd.revents, POLLIN);
+	} while (!ret && !interrupted);
+}
+
+static int do_socket(bool do_tcp)
+{
+	struct sockaddr_in6 addr = {0};
+	int fd, val;
+
+	fd = socket(PF_INET6, cfg_tcp ? SOCK_STREAM : SOCK_DGRAM, 0);
+	if (fd == -1)
+		error(1, errno, "socket");
+
+	val = 1 << 21;
+	if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)))
+		error(1, errno, "setsockopt rcvbuf");
+	val = 1;
+	if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &val, sizeof(val)))
+		error(1, errno, "setsockopt reuseport");
+
+	addr.sin6_family =	PF_INET6;
+	addr.sin6_port =	htons(cfg_port);
+	addr.sin6_addr =	in6addr_any;
+	if (bind(fd, (void *) &addr, sizeof(addr)))
+		error(1, errno, "bind");
+
+	if (do_tcp) {
+		int accept_fd = fd;
+
+		if (listen(accept_fd, 1))
+			error(1, errno, "listen");
+
+		do_poll(accept_fd);
+
+		fd = accept(accept_fd, NULL, NULL);
+		if (fd == -1)
+			error(1, errno, "accept");
+		if (close(accept_fd))
+			error(1, errno, "close accept fd");
+	}
+
+	return fd;
+}
+
+/* Flush all outstanding bytes for the tcp receive queue */
+static void do_flush_tcp(int fd)
+{
+	int ret;
+
+	while (true) {
+		/* MSG_TRUNC flushes up to len bytes */
+		ret = recv(fd, NULL, 1 << 21, MSG_TRUNC | MSG_DONTWAIT);
+		if (ret == -1 && errno == EAGAIN)
+			return;
+		if (ret == -1)
+			error(1, errno, "flush");
+		if (ret == 0) {
+			/* client detached */
+			exit(0);
+		}
+
+		packets++;
+		bytes += ret;
+	}
+
+}
+
+static char sanitized_char(char val)
+{
+	return (val >= 'a' && val <= 'z') ? val : '.';
+}
+
+static void do_verify_udp(const char *data, int len)
+{
+	char cur = data[0];
+	int i;
+
+	/* verify contents */
+	if (cur < 'a' || cur > 'z')
+		error(1, 0, "data initial byte out of range");
+
+	for (i = 1; i < len; i++) {
+		if (cur == 'z')
+			cur = 'a';
+		else
+			cur++;
+
+		if (data[i] != cur)
+			error(1, 0, "data[%d]: len %d, %c(%hhu) != %c(%hhu)\n",
+			      i, len,
+			      sanitized_char(data[i]), data[i],
+			      sanitized_char(cur), cur);
+	}
+}
+
+/* Flush all outstanding datagrams. Verify first few bytes of each. */
+static void do_flush_udp(int fd)
+{
+	static char rbuf[ETH_DATA_LEN];
+	int ret, len, budget = 256;
+
+	len = cfg_verify ? sizeof(rbuf) : 0;
+	while (budget--) {
+		/* MSG_TRUNC will make return value full datagram length */
+		ret = recv(fd, rbuf, len, MSG_TRUNC | MSG_DONTWAIT);
+		if (ret == -1 && errno == EAGAIN)
+			return;
+		if (ret == -1)
+			error(1, errno, "recv");
+		if (len) {
+			if (ret == 0)
+				error(1, errno, "recv: 0 byte datagram\n");
+
+			do_verify_udp(rbuf, ret);
+		}
+
+		packets++;
+		bytes += ret;
+	}
+}
+
+static void usage(const char *filepath)
+{
+	error(1, 0, "Usage: %s [-tv] [-p port]", filepath);
+}
+
+static void parse_opts(int argc, char **argv)
+{
+	int c;
+
+	while ((c = getopt(argc, argv, "ptv")) != -1) {
+		switch (c) {
+		case 'p':
+			cfg_port = htons(strtoul(optarg, NULL, 0));
+			break;
+		case 't':
+			cfg_tcp = true;
+			break;
+		case 'v':
+			cfg_verify = true;
+			break;
+		}
+	}
+
+	if (optind != argc)
+		usage(argv[0]);
+
+	if (cfg_tcp && cfg_verify)
+		error(1, 0, "TODO: implement verify mode for tcp");
+}
+
+static void do_recv(void)
+{
+	unsigned long tnow, treport;
+	int fd;
+
+	fd = do_socket(cfg_tcp);
+
+	treport = gettimeofday_ms() + 1000;
+	do {
+		do_poll(fd);
+
+		if (cfg_tcp)
+			do_flush_tcp(fd);
+		else
+			do_flush_udp(fd);
+
+		tnow = gettimeofday_ms();
+		if (tnow > treport) {
+			if (packets)
+				fprintf(stderr,
+					"%s rx: %6lu MB/s %8lu calls/s\n",
+					cfg_tcp ? "tcp" : "udp",
+					bytes >> 20, packets);
+			bytes = packets = 0;
+			treport = tnow + 1000;
+		}
+
+	} while (!interrupted);
+
+	if (close(fd))
+		error(1, errno, "close");
+}
+
+int main(int argc, char **argv)
+{
+	parse_opts(argc, argv);
+
+	signal(SIGINT, sigint_handler);
+
+	do_recv();
+
+	return 0;
+}
diff --git a/tools/testing/selftests/net/udpgso_bench_tx.c b/tools/testing/selftests/net/udpgso_bench_tx.c
new file mode 100644
index 000000000000..e821564053cf
--- /dev/null
+++ b/tools/testing/selftests/net/udpgso_bench_tx.c
@@ -0,0 +1,420 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <error.h>
+#include <netinet/if_ether.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/udp.h>
+#include <poll.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#ifndef ETH_MAX_MTU
+#define ETH_MAX_MTU 0xFFFFU
+#endif
+
+#ifndef UDP_SEGMENT
+#define UDP_SEGMENT		103
+#endif
+
+#ifndef SO_ZEROCOPY
+#define SO_ZEROCOPY	60
+#endif
+
+#ifndef MSG_ZEROCOPY
+#define MSG_ZEROCOPY	0x4000000
+#endif
+
+#define NUM_PKT		100
+
+static bool	cfg_cache_trash;
+static int	cfg_cpu		= -1;
+static int	cfg_connected	= true;
+static int	cfg_family	= PF_UNSPEC;
+static uint16_t	cfg_mss;
+static int	cfg_payload_len	= (1472 * 42);
+static int	cfg_port	= 8000;
+static int	cfg_runtime_ms	= -1;
+static bool	cfg_segment;
+static bool	cfg_sendmmsg;
+static bool	cfg_tcp;
+static bool	cfg_zerocopy;
+
+static socklen_t cfg_alen;
+static struct sockaddr_storage cfg_dst_addr;
+
+static bool interrupted;
+static char buf[NUM_PKT][ETH_MAX_MTU];
+
+static void sigint_handler(int signum)
+{
+	if (signum == SIGINT)
+		interrupted = true;
+}
+
+static unsigned long gettimeofday_ms(void)
+{
+	struct timeval tv;
+
+	gettimeofday(&tv, NULL);
+	return (tv.tv_sec * 1000) + (tv.tv_usec / 1000);
+}
+
+static int set_cpu(int cpu)
+{
+	cpu_set_t mask;
+
+	CPU_ZERO(&mask);
+	CPU_SET(cpu, &mask);
+	if (sched_setaffinity(0, sizeof(mask), &mask))
+		error(1, 0, "setaffinity %d", cpu);
+
+	return 0;
+}
+
+static void setup_sockaddr(int domain, const char *str_addr, void *sockaddr)
+{
+	struct sockaddr_in6 *addr6 = (void *) sockaddr;
+	struct sockaddr_in *addr4 = (void *) sockaddr;
+
+	switch (domain) {
+	case PF_INET:
+		addr4->sin_family = AF_INET;
+		addr4->sin_port = htons(cfg_port);
+		if (inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1)
+			error(1, 0, "ipv4 parse error: %s", str_addr);
+		break;
+	case PF_INET6:
+		addr6->sin6_family = AF_INET6;
+		addr6->sin6_port = htons(cfg_port);
+		if (inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1)
+			error(1, 0, "ipv6 parse error: %s", str_addr);
+		break;
+	default:
+		error(1, 0, "illegal domain");
+	}
+}
+
+static void flush_zerocopy(int fd)
+{
+	struct msghdr msg = {0};	/* flush */
+	int ret;
+
+	while (1) {
+		ret = recvmsg(fd, &msg, MSG_ERRQUEUE);
+		if (ret == -1 && errno == EAGAIN)
+			break;
+		if (ret == -1)
+			error(1, errno, "errqueue");
+		if (msg.msg_flags != (MSG_ERRQUEUE | MSG_CTRUNC))
+			error(1, 0, "errqueue: flags 0x%x\n", msg.msg_flags);
+		msg.msg_flags = 0;
+	}
+}
+
+static int send_tcp(int fd, char *data)
+{
+	int ret, done = 0, count = 0;
+
+	while (done < cfg_payload_len) {
+		ret = send(fd, data + done, cfg_payload_len - done,
+			   cfg_zerocopy ? MSG_ZEROCOPY : 0);
+		if (ret == -1)
+			error(1, errno, "write");
+
+		done += ret;
+		count++;
+	}
+
+	return count;
+}
+
+static int send_udp(int fd, char *data)
+{
+	int ret, total_len, len, count = 0;
+
+	total_len = cfg_payload_len;
+
+	while (total_len) {
+		len = total_len < cfg_mss ? total_len : cfg_mss;
+
+		ret = sendto(fd, data, len, cfg_zerocopy ? MSG_ZEROCOPY : 0,
+			     cfg_connected ? NULL : (void *)&cfg_dst_addr,
+			     cfg_connected ? 0 : cfg_alen);
+		if (ret == -1)
+			error(1, errno, "write");
+		if (ret != len)
+			error(1, errno, "write: %uB != %uB\n", ret, len);
+
+		total_len -= len;
+		count++;
+	}
+
+	return count;
+}
+
+static int send_udp_sendmmsg(int fd, char *data)
+{
+	const int max_nr_msg = ETH_MAX_MTU / ETH_DATA_LEN;
+	struct mmsghdr mmsgs[max_nr_msg];
+	struct iovec iov[max_nr_msg];
+	unsigned int off = 0, left;
+	int i = 0, ret;
+
+	memset(mmsgs, 0, sizeof(mmsgs));
+
+	left = cfg_payload_len;
+	while (left) {
+		if (i == max_nr_msg)
+			error(1, 0, "sendmmsg: exceeds max_nr_msg");
+
+		iov[i].iov_base = data + off;
+		iov[i].iov_len = cfg_mss < left ? cfg_mss : left;
+
+		mmsgs[i].msg_hdr.msg_iov = iov + i;
+		mmsgs[i].msg_hdr.msg_iovlen = 1;
+
+		off += iov[i].iov_len;
+		left -= iov[i].iov_len;
+		i++;
+	}
+
+	ret = sendmmsg(fd, mmsgs, i, cfg_zerocopy ? MSG_ZEROCOPY : 0);
+	if (ret == -1)
+		error(1, errno, "sendmmsg");
+
+	return ret;
+}
+
+static void send_udp_segment_cmsg(struct cmsghdr *cm)
+{
+	uint16_t *valp;
+
+	cm->cmsg_level = SOL_UDP;
+	cm->cmsg_type = UDP_SEGMENT;
+	cm->cmsg_len = CMSG_LEN(sizeof(cfg_mss));
+	valp = (void *)CMSG_DATA(cm);
+	*valp = cfg_mss;
+}
+
+static int send_udp_segment(int fd, char *data)
+{
+	char control[CMSG_SPACE(sizeof(cfg_mss))] = {0};
+	struct msghdr msg = {0};
+	struct iovec iov = {0};
+	int ret;
+
+	iov.iov_base = data;
+	iov.iov_len = cfg_payload_len;
+
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+
+	msg.msg_control = control;
+	msg.msg_controllen = sizeof(control);
+	send_udp_segment_cmsg(CMSG_FIRSTHDR(&msg));
+
+	msg.msg_name = (void *)&cfg_dst_addr;
+	msg.msg_namelen = cfg_alen;
+
+	ret = sendmsg(fd, &msg, cfg_zerocopy ? MSG_ZEROCOPY : 0);
+	if (ret == -1)
+		error(1, errno, "sendmsg");
+	if (ret != iov.iov_len)
+		error(1, 0, "sendmsg: %u != %lu\n", ret, iov.iov_len);
+
+	return 1;
+}
+
+static void usage(const char *filepath)
+{
+	error(1, 0, "Usage: %s [-46cmStuz] [-C cpu] [-D dst ip] [-l secs] [-p port] [-s sendsize]",
+		    filepath);
+}
+
+static void parse_opts(int argc, char **argv)
+{
+	int max_len, hdrlen;
+	int c;
+
+	while ((c = getopt(argc, argv, "46cC:D:l:mp:s:Stuz")) != -1) {
+		switch (c) {
+		case '4':
+			if (cfg_family != PF_UNSPEC)
+				error(1, 0, "Pass one of -4 or -6");
+			cfg_family = PF_INET;
+			cfg_alen = sizeof(struct sockaddr_in);
+			break;
+		case '6':
+			if (cfg_family != PF_UNSPEC)
+				error(1, 0, "Pass one of -4 or -6");
+			cfg_family = PF_INET6;
+			cfg_alen = sizeof(struct sockaddr_in6);
+			break;
+		case 'c':
+			cfg_cache_trash = true;
+			break;
+		case 'C':
+			cfg_cpu = strtol(optarg, NULL, 0);
+			break;
+		case 'D':
+			setup_sockaddr(cfg_family, optarg, &cfg_dst_addr);
+			break;
+		case 'l':
+			cfg_runtime_ms = strtoul(optarg, NULL, 10) * 1000;
+			break;
+		case 'm':
+			cfg_sendmmsg = true;
+			break;
+		case 'p':
+			cfg_port = strtoul(optarg, NULL, 0);
+			break;
+		case 's':
+			cfg_payload_len = strtoul(optarg, NULL, 0);
+			break;
+		case 'S':
+			cfg_segment = true;
+			break;
+		case 't':
+			cfg_tcp = true;
+			break;
+		case 'u':
+			cfg_connected = false;
+			break;
+		case 'z':
+			cfg_zerocopy = true;
+			break;
+		}
+	}
+
+	if (optind != argc)
+		usage(argv[0]);
+
+	if (cfg_family == PF_UNSPEC)
+		error(1, 0, "must pass one of -4 or -6");
+	if (cfg_tcp && !cfg_connected)
+		error(1, 0, "connectionless tcp makes no sense");
+	if (cfg_segment && cfg_sendmmsg)
+		error(1, 0, "cannot combine segment offload and sendmmsg");
+
+	if (cfg_family == PF_INET)
+		hdrlen = sizeof(struct iphdr) + sizeof(struct udphdr);
+	else
+		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct udphdr);
+
+	cfg_mss = ETH_DATA_LEN - hdrlen;
+	max_len = ETH_MAX_MTU - hdrlen;
+
+	if (cfg_payload_len > max_len)
+		error(1, 0, "payload length %u exceeds max %u",
+		      cfg_payload_len, max_len);
+}
+
+static void set_pmtu_discover(int fd, bool is_ipv4)
+{
+	int level, name, val;
+
+	if (is_ipv4) {
+		level	= SOL_IP;
+		name	= IP_MTU_DISCOVER;
+		val	= IP_PMTUDISC_DO;
+	} else {
+		level	= SOL_IPV6;
+		name	= IPV6_MTU_DISCOVER;
+		val	= IPV6_PMTUDISC_DO;
+	}
+
+	if (setsockopt(fd, level, name, &val, sizeof(val)))
+		error(1, errno, "setsockopt path mtu");
+}
+
+int main(int argc, char **argv)
+{
+	unsigned long num_msgs, num_sends;
+	unsigned long tnow, treport, tstop;
+	int fd, i, val;
+
+	parse_opts(argc, argv);
+
+	if (cfg_cpu > 0)
+		set_cpu(cfg_cpu);
+
+	for (i = 0; i < sizeof(buf[0]); i++)
+		buf[0][i] = 'a' + (i % 26);
+	for (i = 1; i < NUM_PKT; i++)
+		memcpy(buf[i], buf[0], sizeof(buf[0]));
+
+	signal(SIGINT, sigint_handler);
+
+	fd = socket(cfg_family, cfg_tcp ? SOCK_STREAM : SOCK_DGRAM, 0);
+	if (fd == -1)
+		error(1, errno, "socket");
+
+	if (cfg_zerocopy) {
+		val = 1;
+		if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &val, sizeof(val)))
+			error(1, errno, "setsockopt zerocopy");
+	}
+
+	if (cfg_connected &&
+	    connect(fd, (void *)&cfg_dst_addr, cfg_alen))
+		error(1, errno, "connect");
+
+	if (cfg_segment)
+		set_pmtu_discover(fd, cfg_family == PF_INET);
+
+	num_msgs = num_sends = 0;
+	tnow = gettimeofday_ms();
+	tstop = tnow + cfg_runtime_ms;
+	treport = tnow + 1000;
+
+	i = 0;
+	do {
+		if (cfg_tcp)
+			num_sends += send_tcp(fd, buf[i]);
+		else if (cfg_segment)
+			num_sends += send_udp_segment(fd, buf[i]);
+		else if (cfg_sendmmsg)
+			num_sends += send_udp_sendmmsg(fd, buf[i]);
+		else
+			num_sends += send_udp(fd, buf[i]);
+		num_msgs++;
+
+		if (cfg_zerocopy && ((num_msgs & 0xF) == 0))
+			flush_zerocopy(fd);
+
+		tnow = gettimeofday_ms();
+		if (tnow > treport) {
+			fprintf(stderr,
+				"%s tx: %6lu MB/s %8lu calls/s %6lu msg/s\n",
+				cfg_tcp ? "tcp" : "udp",
+				(num_msgs * cfg_payload_len) >> 20,
+				num_sends, num_msgs);
+			num_msgs = num_sends = 0;
+			treport = tnow + 1000;
+		}
+
+		/* cold cache when writing buffer */
+		if (cfg_cache_trash)
+			i = ++i < NUM_PKT ? i : 0;
+
+	} while (!interrupted && (cfg_runtime_ms == -1 || tnow < tstop));
+
+	if (close(fd))
+		error(1, errno, "close");
+
+	return 0;
+}
-- 
cgit v1.2.3-59-g8ed1b


From b85fab0e67b162014cd328cb4e2a8e8ae382cb8a Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 25 Apr 2018 19:41:06 +0200
Subject: bpf: Add gpl_compatible flag to struct bpf_prog_info

Adding gpl_compatible flag to struct bpf_prog_info
so it can be dumped via bpf_prog_get_info_by_fd and
displayed via bpftool progs dump.

Alexei noticed 4-byte hole in struct bpf_prog_info,
so we put the u32 flags field in there, and we can
keep adding bit fields in there without breaking
user space.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 1 +
 kernel/bpf/syscall.c     | 1 +
 2 files changed, 2 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e6679393b687..da8801860c7d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1060,6 +1060,7 @@ struct bpf_prog_info {
 	__aligned_u64 map_ids;
 	char name[BPF_OBJ_NAME_LEN];
 	__u32 ifindex;
+	__u32 gpl_compatible:1;
 	__u64 netns_dev;
 	__u64 netns_ino;
 } __attribute__((aligned(8)));
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index fe23dc5a3ec4..7bb4ff1c770a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1914,6 +1914,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	info.load_time = prog->aux->load_time;
 	info.created_by_uid = from_kuid_munged(current_user_ns(),
 					       prog->aux->user->uid);
+	info.gpl_compatible = prog->gpl_compatible;
 
 	memcpy(info.tag, prog->tag, sizeof(prog->tag));
 	memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
-- 
cgit v1.2.3-59-g8ed1b


From fb6ef42b5c8e4ce92c2c3c36398ad7574b290951 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 25 Apr 2018 19:41:07 +0200
Subject: tools, bpf: Sync bpf.h uapi header

Syncing the bpf.h uapi header with tools.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/include/uapi/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e6679393b687..da8801860c7d 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1060,6 +1060,7 @@ struct bpf_prog_info {
 	__aligned_u64 map_ids;
 	char name[BPF_OBJ_NAME_LEN];
 	__u32 ifindex;
+	__u32 gpl_compatible:1;
 	__u64 netns_dev;
 	__u64 netns_ino;
 } __attribute__((aligned(8)));
-- 
cgit v1.2.3-59-g8ed1b


From 9b984a20ca84337af81cdab92d1e8ae37007894a Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Thu, 26 Apr 2018 10:18:01 +0200
Subject: tools, bpftool: Display license GPL compatible in prog show/list

Display the license "gpl" string in bpftool prog command, like:

  # bpftool prog list
  5: tracepoint  name func  tag 57cd311f2e27366b  gpl
          loaded_at Apr 26/09:37  uid 0
          xlated 16B  not jited  memlock 4096B

  # bpftool --json --pretty prog show
  [{
          "id": 5,
          "type": "tracepoint",
          "name": "func",
          "tag": "57cd311f2e27366b",
          "gpl_compatible": true,
          "loaded_at": "Apr 26/09:37",
          "uid": 0,
          "bytes_xlated": 16,
          "jited": false,
          "bytes_memlock": 4096
      }
  ]

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/Documentation/bpftool-prog.rst | 3 ++-
 tools/bpf/bpftool/prog.c                         | 3 +++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index 67ca6c69376c..43d34a5c3ec5 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -95,7 +95,7 @@ EXAMPLES
 **# bpftool prog show**
 ::
 
-  10: xdp  name some_prog  tag 005a3d2123620c8b
+  10: xdp  name some_prog  tag 005a3d2123620c8b  gpl
 	loaded_at Sep 29/20:11  uid 0
 	xlated 528B  jited 370B  memlock 4096B  map_ids 10
 
@@ -108,6 +108,7 @@ EXAMPLES
                 "id": 10,
                 "type": "xdp",
                 "tag": "005a3d2123620c8b",
+                "gpl_compatible": true,
                 "loaded_at": "Sep 29/20:11",
                 "uid": 0,
                 "bytes_xlated": 528,
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 548adb9b7317..e71a0a11afde 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -235,6 +235,8 @@ static void print_prog_json(struct bpf_prog_info *info, int fd)
 		     info->tag[0], info->tag[1], info->tag[2], info->tag[3],
 		     info->tag[4], info->tag[5], info->tag[6], info->tag[7]);
 
+	jsonw_bool_field(json_wtr, "gpl_compatible", info->gpl_compatible);
+
 	print_dev_json(info->ifindex, info->netns_dev, info->netns_ino);
 
 	if (info->load_time) {
@@ -295,6 +297,7 @@ static void print_prog_plain(struct bpf_prog_info *info, int fd)
 	printf("tag ");
 	fprint_hex(stdout, info->tag, BPF_TAG_SIZE, "");
 	print_dev_plain(info->ifindex, info->netns_dev, info->netns_ino);
+	printf("%s", info->gpl_compatible ? "  gpl" : "");
 	printf("\n");
 
 	if (info->load_time) {
-- 
cgit v1.2.3-59-g8ed1b


From f761312023a1f0d625e34daadd54c3f9cb2a4ac2 Mon Sep 17 00:00:00 2001
From: "Nikita V. Shirokov" <tehnerd@tehnerd.com>
Date: Wed, 25 Apr 2018 07:15:03 -0700
Subject: bpf: fix xdp_generic for bpf_adjust_tail usecase

When bpf_adjust_tail was introduced for generic xdp, it changed skb's tail
pointer, so it was pointing to the new "end of the packet". However skb's
len field wasn't properly modified, so on the wire ethernet frame had
original (or even bigger, if adjust_head was used) size. This diff is
fixing this.

Fixes: 198d83bb3 (" bpf: make generic xdp compatible w/ bpf_xdp_adjust_tail")
Signed-off-by: Nikita V. Shirokov <tehnerd@tehnerd.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/dev.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index c624a04dad1f..8f8931b93140 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4057,8 +4057,10 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 	 * pckt.
 	 */
 	off = orig_data_end - xdp.data_end;
-	if (off != 0)
+	if (off != 0) {
 		skb_set_tail_pointer(skb, xdp.data_end - xdp.data);
+		skb->len -= off;
+	}
 
 	switch (act) {
 	case XDP_REDIRECT:
-- 
cgit v1.2.3-59-g8ed1b


From 8a22543c8e7093104d7c22190746b9492bfbd897 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 16 Apr 2018 19:15:54 +0200
Subject: netfilter: nf_tables: make meta expression builtin

size net/netfilter/nft_meta.ko
   text    data     bss     dec     hex filename
   5826     936       1    6763    1a6b net/netfilter/nft_meta.ko
  96407    2064     400   98871   18237 net/netfilter/nf_tables.ko

after:
 100826    2240     401  103467   1942b net/netfilter/nf_tables.ko

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables_core.h |  1 +
 net/netfilter/Kconfig                  |  6 ------
 net/netfilter/Makefile                 |  3 +--
 net/netfilter/nf_tables_core.c         |  1 +
 net/netfilter/nft_meta.c               | 22 +---------------------
 5 files changed, 4 insertions(+), 29 deletions(-)

diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index ea5aab568be8..3339cce8f585 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -10,6 +10,7 @@ extern struct nft_expr_type nft_byteorder_type;
 extern struct nft_expr_type nft_payload_type;
 extern struct nft_expr_type nft_dynset_type;
 extern struct nft_expr_type nft_range_type;
+extern struct nft_expr_type nft_meta_type;
 
 int nf_tables_core_module_init(void);
 void nf_tables_core_module_exit(void);
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index d20664b02ae4..29a13c7a5af2 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -480,12 +480,6 @@ config NFT_EXTHDR
 	  This option adds the "exthdr" expression that you can use to match
 	  IPv6 extension headers and tcp options.
 
-config NFT_META
-	tristate "Netfilter nf_tables meta module"
-	help
-	  This option adds the "meta" expression that you can use to match and
-	  to set packet metainformation such as the packet mark.
-
 config NFT_RT
 	tristate "Netfilter nf_tables routing module"
 	help
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 3103ed1efe17..89634c389fe7 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -76,12 +76,11 @@ obj-$(CONFIG_NF_DUP_NETDEV)	+= nf_dup_netdev.o
 nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
 		  nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \
 		  nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \
-		  nft_dynset.o
+		  nft_dynset.o nft_meta.o
 
 obj-$(CONFIG_NF_TABLES)		+= nf_tables.o
 obj-$(CONFIG_NFT_COMPAT)	+= nft_compat.o
 obj-$(CONFIG_NFT_EXTHDR)	+= nft_exthdr.o
-obj-$(CONFIG_NFT_META)		+= nft_meta.o
 obj-$(CONFIG_NFT_RT)		+= nft_rt.o
 obj-$(CONFIG_NFT_NUMGEN)	+= nft_numgen.o
 obj-$(CONFIG_NFT_CT)		+= nft_ct.o
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index dfd0bf3810d2..b67d6577f767 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -251,6 +251,7 @@ static struct nft_expr_type *nft_basic_types[] = {
 	&nft_payload_type,
 	&nft_dynset_type,
 	&nft_range_type,
+	&nft_meta_type,
 };
 
 int __init nf_tables_core_module_init(void)
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 6c0b82628117..5348bd058c88 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -11,8 +11,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
 #include <linux/netlink.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter/nf_tables.h>
@@ -495,7 +493,6 @@ static void nft_meta_set_destroy(const struct nft_ctx *ctx,
 		static_branch_dec(&nft_trace_enabled);
 }
 
-static struct nft_expr_type nft_meta_type;
 static const struct nft_expr_ops nft_meta_get_ops = {
 	.type		= &nft_meta_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_meta)),
@@ -534,27 +531,10 @@ nft_meta_select_ops(const struct nft_ctx *ctx,
 	return ERR_PTR(-EINVAL);
 }
 
-static struct nft_expr_type nft_meta_type __read_mostly = {
+struct nft_expr_type nft_meta_type __read_mostly = {
 	.name		= "meta",
 	.select_ops	= nft_meta_select_ops,
 	.policy		= nft_meta_policy,
 	.maxattr	= NFTA_META_MAX,
 	.owner		= THIS_MODULE,
 };
-
-static int __init nft_meta_module_init(void)
-{
-	return nft_register_expr(&nft_meta_type);
-}
-
-static void __exit nft_meta_module_exit(void)
-{
-	nft_unregister_expr(&nft_meta_type);
-}
-
-module_init(nft_meta_module_init);
-module_exit(nft_meta_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_EXPR("meta");
-- 
cgit v1.2.3-59-g8ed1b


From ae1bc6a9f398d5e0310387eb077a0d9ce1fb21f5 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 16 Apr 2018 19:15:55 +0200
Subject: netfilter: nf_tables: merge rt expression into nft core

before:
   text    data     bss     dec     hex filename
   2657     844       0    3501     dad net/netfilter/nft_rt.ko
 100826    2240     401  103467   1942b net/netfilter/nf_tables.ko
after:
   2657     844       0    3501     dad net/netfilter/nft_rt.ko
 102456    2316     401  105173   19ad5 net/netfilter/nf_tables.ko

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables_core.h |  1 +
 net/netfilter/Kconfig                  |  6 ------
 net/netfilter/Makefile                 |  3 +--
 net/netfilter/nf_tables_core.c         |  1 +
 net/netfilter/nft_rt.c                 | 22 +---------------------
 5 files changed, 4 insertions(+), 29 deletions(-)

diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index 3339cce8f585..d6a358ae3749 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -11,6 +11,7 @@ extern struct nft_expr_type nft_payload_type;
 extern struct nft_expr_type nft_dynset_type;
 extern struct nft_expr_type nft_range_type;
 extern struct nft_expr_type nft_meta_type;
+extern struct nft_expr_type nft_rt_type;
 
 int nf_tables_core_module_init(void);
 void nf_tables_core_module_exit(void);
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 29a13c7a5af2..771f1a4f3376 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -480,12 +480,6 @@ config NFT_EXTHDR
 	  This option adds the "exthdr" expression that you can use to match
 	  IPv6 extension headers and tcp options.
 
-config NFT_RT
-	tristate "Netfilter nf_tables routing module"
-	help
-	  This option adds the "rt" expression that you can use to match
-	  packet routing information such as the packet nexthop.
-
 config NFT_NUMGEN
 	tristate "Netfilter nf_tables number generator module"
 	help
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 89634c389fe7..128dbcfaa194 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -76,12 +76,11 @@ obj-$(CONFIG_NF_DUP_NETDEV)	+= nf_dup_netdev.o
 nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
 		  nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \
 		  nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \
-		  nft_dynset.o nft_meta.o
+		  nft_dynset.o nft_meta.o nft_rt.o
 
 obj-$(CONFIG_NF_TABLES)		+= nf_tables.o
 obj-$(CONFIG_NFT_COMPAT)	+= nft_compat.o
 obj-$(CONFIG_NFT_EXTHDR)	+= nft_exthdr.o
-obj-$(CONFIG_NFT_RT)		+= nft_rt.o
 obj-$(CONFIG_NFT_NUMGEN)	+= nft_numgen.o
 obj-$(CONFIG_NFT_CT)		+= nft_ct.o
 obj-$(CONFIG_NFT_FLOW_OFFLOAD)	+= nft_flow_offload.o
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index b67d6577f767..481ce2c0bbbf 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -252,6 +252,7 @@ static struct nft_expr_type *nft_basic_types[] = {
 	&nft_dynset_type,
 	&nft_range_type,
 	&nft_meta_type,
+	&nft_rt_type,
 };
 
 int __init nf_tables_core_module_init(void)
diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c
index 11a2071b6dd4..76dba9f6b6f6 100644
--- a/net/netfilter/nft_rt.c
+++ b/net/netfilter/nft_rt.c
@@ -7,8 +7,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
 #include <linux/netlink.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter/nf_tables.h>
@@ -179,7 +177,6 @@ static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *exp
 	return nft_chain_validate_hooks(ctx->chain, hooks);
 }
 
-static struct nft_expr_type nft_rt_type;
 static const struct nft_expr_ops nft_rt_get_ops = {
 	.type		= &nft_rt_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_rt)),
@@ -189,27 +186,10 @@ static const struct nft_expr_ops nft_rt_get_ops = {
 	.validate	= nft_rt_validate,
 };
 
-static struct nft_expr_type nft_rt_type __read_mostly = {
+struct nft_expr_type nft_rt_type __read_mostly = {
 	.name		= "rt",
 	.ops		= &nft_rt_get_ops,
 	.policy		= nft_rt_policy,
 	.maxattr	= NFTA_RT_MAX,
 	.owner		= THIS_MODULE,
 };
-
-static int __init nft_rt_module_init(void)
-{
-	return nft_register_expr(&nft_rt_type);
-}
-
-static void __exit nft_rt_module_exit(void)
-{
-	nft_unregister_expr(&nft_rt_type);
-}
-
-module_init(nft_rt_module_init);
-module_exit(nft_rt_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Anders K. Pedersen <akp@cohaesio.com>");
-MODULE_ALIAS_NFT_EXPR("rt");
-- 
cgit v1.2.3-59-g8ed1b


From d0103158cf7c9190860dabd12b85ccad3c6e3455 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 16 Apr 2018 19:15:56 +0200
Subject: netfilter: nf_tables: merge exthdr expression into nft core

before:
   text    data     bss     dec     hex filename
   5056     844       0    5900    170c net/netfilter/nft_exthdr.ko
 102456    2316     401  105173   19ad5 net/netfilter/nf_tables.ko

after:
 106410    2392     401  109203   1aa93 net/netfilter/nf_tables.ko

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables_core.h |  1 +
 net/netfilter/Kconfig                  |  6 ------
 net/netfilter/Makefile                 |  3 +--
 net/netfilter/nf_tables_core.c         |  1 +
 net/netfilter/nft_exthdr.c             | 23 ++---------------------
 5 files changed, 5 insertions(+), 29 deletions(-)

diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index d6a358ae3749..cd6915b6c054 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -12,6 +12,7 @@ extern struct nft_expr_type nft_dynset_type;
 extern struct nft_expr_type nft_range_type;
 extern struct nft_expr_type nft_meta_type;
 extern struct nft_expr_type nft_rt_type;
+extern struct nft_expr_type nft_exthdr_type;
 
 int nf_tables_core_module_init(void);
 void nf_tables_core_module_exit(void);
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 771f1a4f3376..f66586fb41cd 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -474,12 +474,6 @@ config NF_TABLES_NETDEV
 	help
 	  This option enables support for the "netdev" table.
 
-config NFT_EXTHDR
-	tristate "Netfilter nf_tables exthdr module"
-	help
-	  This option adds the "exthdr" expression that you can use to match
-	  IPv6 extension headers and tcp options.
-
 config NFT_NUMGEN
 	tristate "Netfilter nf_tables number generator module"
 	help
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 128dbcfaa194..b37ce0bc9ab7 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -76,11 +76,10 @@ obj-$(CONFIG_NF_DUP_NETDEV)	+= nf_dup_netdev.o
 nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
 		  nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \
 		  nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \
-		  nft_dynset.o nft_meta.o nft_rt.o
+		  nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o
 
 obj-$(CONFIG_NF_TABLES)		+= nf_tables.o
 obj-$(CONFIG_NFT_COMPAT)	+= nft_compat.o
-obj-$(CONFIG_NFT_EXTHDR)	+= nft_exthdr.o
 obj-$(CONFIG_NFT_NUMGEN)	+= nft_numgen.o
 obj-$(CONFIG_NFT_CT)		+= nft_ct.o
 obj-$(CONFIG_NFT_FLOW_OFFLOAD)	+= nft_flow_offload.o
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 481ce2c0bbbf..9cf47c4cb9d5 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -253,6 +253,7 @@ static struct nft_expr_type *nft_basic_types[] = {
 	&nft_range_type,
 	&nft_meta_type,
 	&nft_rt_type,
+	&nft_exthdr_type,
 };
 
 int __init nf_tables_core_module_init(void)
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index 47ec1046ad11..a940c9fd9045 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -10,11 +10,10 @@
 
 #include <asm/unaligned.h>
 #include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
 #include <linux/netlink.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
 #include <net/netfilter/nf_tables.h>
 #include <net/tcp.h>
 
@@ -353,7 +352,6 @@ static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr)
 	return nft_exthdr_dump_common(skb, priv);
 }
 
-static struct nft_expr_type nft_exthdr_type;
 static const struct nft_expr_ops nft_exthdr_ipv6_ops = {
 	.type		= &nft_exthdr_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
@@ -407,27 +405,10 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx,
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
-static struct nft_expr_type nft_exthdr_type __read_mostly = {
+struct nft_expr_type nft_exthdr_type __read_mostly = {
 	.name		= "exthdr",
 	.select_ops	= nft_exthdr_select_ops,
 	.policy		= nft_exthdr_policy,
 	.maxattr	= NFTA_EXTHDR_MAX,
 	.owner		= THIS_MODULE,
 };
-
-static int __init nft_exthdr_module_init(void)
-{
-	return nft_register_expr(&nft_exthdr_type);
-}
-
-static void __exit nft_exthdr_module_exit(void)
-{
-	nft_unregister_expr(&nft_exthdr_type);
-}
-
-module_init(nft_exthdr_module_init);
-module_exit(nft_exthdr_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_EXPR("exthdr");
-- 
cgit v1.2.3-59-g8ed1b


From 933a741e3b8295b9192a07367ebc15cbb4bdf957 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Thu, 26 Apr 2018 14:01:39 -0700
Subject: selftests/bpf: bpf tunnel test.

The patch migrates the original tests at samples/bpf/tcbpf2_kern.c
and samples/bpf/test_tunnel_bpf.sh to selftests.  There are a couple
changes from the original:
    1) add ipv6 vxlan, ipv6 geneve, ipv6 ipip tests
    2) simplify the original ipip tests (remove iperf tests)
    3) improve documentation
    4) use bpf_ntoh* and bpf_hton* api

In summary, 'test_tunnel_kern.o' contains the following bpf program:
  GRE: gre_set_tunnel, gre_get_tunnel
  IP6GRE: ip6gretap_set_tunnel, ip6gretap_get_tunnel
  ERSPAN: erspan_set_tunnel, erspan_get_tunnel
  IP6ERSPAN: ip4ip6erspan_set_tunnel, ip4ip6erspan_get_tunnel
  VXLAN: vxlan_set_tunnel, vxlan_get_tunnel
  IP6VXLAN: ip6vxlan_set_tunnel, ip6vxlan_get_tunnel
  GENEVE: geneve_set_tunnel, geneve_get_tunnel
  IP6GENEVE: ip6geneve_set_tunnel, ip6geneve_get_tunnel
  IPIP: ipip_set_tunnel, ipip_get_tunnel
  IP6IP: ipip6_set_tunnel, ipip6_get_tunnel,
         ip6ip6_set_tunnel, ip6ip6_get_tunnel
  XFRM: xfrm_get_state

Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/Makefile           |   5 +-
 tools/testing/selftests/bpf/test_tunnel.sh     | 729 +++++++++++++++++++++++++
 tools/testing/selftests/bpf/test_tunnel_kern.c | 713 ++++++++++++++++++++++++
 3 files changed, 1445 insertions(+), 2 deletions(-)
 create mode 100755 tools/testing/selftests/bpf/test_tunnel.sh
 create mode 100644 tools/testing/selftests/bpf/test_tunnel_kern.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 0c19d5e08f08..b64a7a39cbc8 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -32,7 +32,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \
 	sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \
 	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \
-	test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o
+	test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
@@ -40,7 +40,8 @@ TEST_PROGS := test_kmod.sh \
 	test_xdp_redirect.sh \
 	test_xdp_meta.sh \
 	test_offload.py \
-	test_sock_addr.sh
+	test_sock_addr.sh \
+	test_tunnel.sh
 
 # Compile but not part of 'make run_tests'
 TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr
diff --git a/tools/testing/selftests/bpf/test_tunnel.sh b/tools/testing/selftests/bpf/test_tunnel.sh
new file mode 100755
index 000000000000..aeb2901f21f4
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tunnel.sh
@@ -0,0 +1,729 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# End-to-end eBPF tunnel test suite
+#   The script tests BPF network tunnel implementation.
+#
+# Topology:
+# ---------
+#     root namespace   |     at_ns0 namespace
+#                      |
+#      -----------     |     -----------
+#      | tnl dev |     |     | tnl dev |  (overlay network)
+#      -----------     |     -----------
+#      metadata-mode   |     native-mode
+#       with bpf       |
+#                      |
+#      ----------      |     ----------
+#      |  veth1  | --------- |  veth0  |  (underlay network)
+#      ----------    peer    ----------
+#
+#
+# Device Configuration
+# --------------------
+# Root namespace with metadata-mode tunnel + BPF
+# Device names and addresses:
+# 	veth1 IP: 172.16.1.200, IPv6: 00::22 (underlay)
+# 	tunnel dev <type>11, ex: gre11, IPv4: 10.1.1.200 (overlay)
+#
+# Namespace at_ns0 with native tunnel
+# Device names and addresses:
+# 	veth0 IPv4: 172.16.1.100, IPv6: 00::11 (underlay)
+# 	tunnel dev <type>00, ex: gre00, IPv4: 10.1.1.100 (overlay)
+#
+#
+# End-to-end ping packet flow
+# ---------------------------
+# Most of the tests start by namespace creation, device configuration,
+# then ping the underlay and overlay network.  When doing 'ping 10.1.1.100'
+# from root namespace, the following operations happen:
+# 1) Route lookup shows 10.1.1.100/24 belongs to tnl dev, fwd to tnl dev.
+# 2) Tnl device's egress BPF program is triggered and set the tunnel metadata,
+#    with remote_ip=172.16.1.200 and others.
+# 3) Outer tunnel header is prepended and route the packet to veth1's egress
+# 4) veth0's ingress queue receive the tunneled packet at namespace at_ns0
+# 5) Tunnel protocol handler, ex: vxlan_rcv, decap the packet
+# 6) Forward the packet to the overlay tnl dev
+
+PING_ARG="-c 3 -w 10 -q"
+ret=0
+GREEN='\033[0;92m'
+RED='\033[0;31m'
+NC='\033[0m' # No Color
+
+config_device()
+{
+	ip netns add at_ns0
+	ip link add veth0 type veth peer name veth1
+	ip link set veth0 netns at_ns0
+	ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0
+	ip netns exec at_ns0 ip link set dev veth0 up
+	ip link set dev veth1 up mtu 1500
+	ip addr add dev veth1 172.16.1.200/24
+}
+
+add_gre_tunnel()
+{
+	# at_ns0 namespace
+	ip netns exec at_ns0 \
+        ip link add dev $DEV_NS type $TYPE seq key 2 \
+		local 172.16.1.100 remote 172.16.1.200
+	ip netns exec at_ns0 ip link set dev $DEV_NS up
+	ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+
+	# root namespace
+	ip link add dev $DEV type $TYPE key 2 external
+	ip link set dev $DEV up
+	ip addr add dev $DEV 10.1.1.200/24
+}
+
+add_ip6gretap_tunnel()
+{
+
+	# assign ipv6 address
+	ip netns exec at_ns0 ip addr add ::11/96 dev veth0
+	ip netns exec at_ns0 ip link set dev veth0 up
+	ip addr add dev veth1 ::22/96
+	ip link set dev veth1 up
+
+	# at_ns0 namespace
+	ip netns exec at_ns0 \
+		ip link add dev $DEV_NS type $TYPE seq flowlabel 0xbcdef key 2 \
+		local ::11 remote ::22
+
+	ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+	ip netns exec at_ns0 ip addr add dev $DEV_NS fc80::100/96
+	ip netns exec at_ns0 ip link set dev $DEV_NS up
+
+	# root namespace
+	ip link add dev $DEV type $TYPE external
+	ip addr add dev $DEV 10.1.1.200/24
+	ip addr add dev $DEV fc80::200/24
+	ip link set dev $DEV up
+}
+
+add_erspan_tunnel()
+{
+	# at_ns0 namespace
+	if [ "$1" == "v1" ]; then
+		ip netns exec at_ns0 \
+		ip link add dev $DEV_NS type $TYPE seq key 2 \
+		local 172.16.1.100 remote 172.16.1.200 \
+		erspan_ver 1 erspan 123
+	else
+		ip netns exec at_ns0 \
+		ip link add dev $DEV_NS type $TYPE seq key 2 \
+		local 172.16.1.100 remote 172.16.1.200 \
+		erspan_ver 2 erspan_dir egress erspan_hwid 3
+	fi
+	ip netns exec at_ns0 ip link set dev $DEV_NS up
+	ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+
+	# root namespace
+	ip link add dev $DEV type $TYPE external
+	ip link set dev $DEV up
+	ip addr add dev $DEV 10.1.1.200/24
+}
+
+add_ip6erspan_tunnel()
+{
+
+	# assign ipv6 address
+	ip netns exec at_ns0 ip addr add ::11/96 dev veth0
+	ip netns exec at_ns0 ip link set dev veth0 up
+	ip addr add dev veth1 ::22/96
+	ip link set dev veth1 up
+
+	# at_ns0 namespace
+	if [ "$1" == "v1" ]; then
+		ip netns exec at_ns0 \
+		ip link add dev $DEV_NS type $TYPE seq key 2 \
+		local ::11 remote ::22 \
+		erspan_ver 1 erspan 123
+	else
+		ip netns exec at_ns0 \
+		ip link add dev $DEV_NS type $TYPE seq key 2 \
+		local ::11 remote ::22 \
+		erspan_ver 2 erspan_dir egress erspan_hwid 7
+	fi
+	ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+	ip netns exec at_ns0 ip link set dev $DEV_NS up
+
+	# root namespace
+	ip link add dev $DEV type $TYPE external
+	ip addr add dev $DEV 10.1.1.200/24
+	ip link set dev $DEV up
+}
+
+add_vxlan_tunnel()
+{
+	# Set static ARP entry here because iptables set-mark works
+	# on L3 packet, as a result not applying to ARP packets,
+	# causing errors at get_tunnel_{key/opt}.
+
+	# at_ns0 namespace
+	ip netns exec at_ns0 \
+		ip link add dev $DEV_NS type $TYPE \
+		id 2 dstport 4789 gbp remote 172.16.1.200
+	ip netns exec at_ns0 \
+		ip link set dev $DEV_NS address 52:54:00:d9:01:00 up
+	ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+	ip netns exec at_ns0 arp -s 10.1.1.200 52:54:00:d9:02:00
+	ip netns exec at_ns0 iptables -A OUTPUT -j MARK --set-mark 0x800FF
+
+	# root namespace
+	ip link add dev $DEV type $TYPE external gbp dstport 4789
+	ip link set dev $DEV address 52:54:00:d9:02:00 up
+	ip addr add dev $DEV 10.1.1.200/24
+	arp -s 10.1.1.100 52:54:00:d9:01:00
+}
+
+add_ip6vxlan_tunnel()
+{
+	#ip netns exec at_ns0 ip -4 addr del 172.16.1.100 dev veth0
+	ip netns exec at_ns0 ip -6 addr add ::11/96 dev veth0
+	ip netns exec at_ns0 ip link set dev veth0 up
+	#ip -4 addr del 172.16.1.200 dev veth1
+	ip -6 addr add dev veth1 ::22/96
+	ip link set dev veth1 up
+
+	# at_ns0 namespace
+	ip netns exec at_ns0 \
+		ip link add dev $DEV_NS type $TYPE id 22 dstport 4789 \
+		local ::11 remote ::22
+	ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+	ip netns exec at_ns0 ip link set dev $DEV_NS up
+
+	# root namespace
+	ip link add dev $DEV type $TYPE external dstport 4789
+	ip addr add dev $DEV 10.1.1.200/24
+	ip link set dev $DEV up
+}
+
+add_geneve_tunnel()
+{
+	# at_ns0 namespace
+	ip netns exec at_ns0 \
+		ip link add dev $DEV_NS type $TYPE \
+		id 2 dstport 6081 remote 172.16.1.200
+	ip netns exec at_ns0 ip link set dev $DEV_NS up
+	ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+
+	# root namespace
+	ip link add dev $DEV type $TYPE dstport 6081 external
+	ip link set dev $DEV up
+	ip addr add dev $DEV 10.1.1.200/24
+}
+
+add_ip6geneve_tunnel()
+{
+	ip netns exec at_ns0 ip addr add ::11/96 dev veth0
+	ip netns exec at_ns0 ip link set dev veth0 up
+	ip addr add dev veth1 ::22/96
+	ip link set dev veth1 up
+
+	# at_ns0 namespace
+	ip netns exec at_ns0 \
+		ip link add dev $DEV_NS type $TYPE id 22 \
+		remote ::22     # geneve has no local option
+	ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+	ip netns exec at_ns0 ip link set dev $DEV_NS up
+
+	# root namespace
+	ip link add dev $DEV type $TYPE external
+	ip addr add dev $DEV 10.1.1.200/24
+	ip link set dev $DEV up
+}
+
+add_ipip_tunnel()
+{
+	# at_ns0 namespace
+	ip netns exec at_ns0 \
+		ip link add dev $DEV_NS type $TYPE \
+		local 172.16.1.100 remote 172.16.1.200
+	ip netns exec at_ns0 ip link set dev $DEV_NS up
+	ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+
+	# root namespace
+	ip link add dev $DEV type $TYPE external
+	ip link set dev $DEV up
+	ip addr add dev $DEV 10.1.1.200/24
+}
+
+add_ipip6tnl_tunnel()
+{
+	ip netns exec at_ns0 ip addr add ::11/96 dev veth0
+	ip netns exec at_ns0 ip link set dev veth0 up
+	ip addr add dev veth1 ::22/96
+	ip link set dev veth1 up
+
+	# at_ns0 namespace
+	ip netns exec at_ns0 \
+		ip link add dev $DEV_NS type $TYPE \
+		local ::11 remote ::22
+	ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+	ip netns exec at_ns0 ip link set dev $DEV_NS up
+
+	# root namespace
+	ip link add dev $DEV type $TYPE external
+	ip addr add dev $DEV 10.1.1.200/24
+	ip link set dev $DEV up
+}
+
+test_gre()
+{
+	TYPE=gretap
+	DEV_NS=gretap00
+	DEV=gretap11
+	ret=0
+
+	check $TYPE
+	config_device
+	add_gre_tunnel
+	attach_bpf $DEV gre_set_tunnel gre_get_tunnel
+	ping $PING_ARG 10.1.1.100
+	check_err $?
+	ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
+	check_err $?
+	cleanup
+
+        if [ $ret -ne 0 ]; then
+                echo -e ${RED}"FAIL: $TYPE"${NC}
+                return 1
+        fi
+        echo -e ${GREEN}"PASS: $TYPE"${NC}
+}
+
+test_ip6gre()
+{
+	TYPE=ip6gre
+	DEV_NS=ip6gre00
+	DEV=ip6gre11
+	ret=0
+
+	check $TYPE
+	config_device
+	# reuse the ip6gretap function
+	add_ip6gretap_tunnel
+	attach_bpf $DEV ip6gretap_set_tunnel ip6gretap_get_tunnel
+	# underlay
+	ping6 $PING_ARG ::11
+	# overlay: ipv4 over ipv6
+	ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
+	ping $PING_ARG 10.1.1.100
+	check_err $?
+	# overlay: ipv6 over ipv6
+	ip netns exec at_ns0 ping6 $PING_ARG fc80::200
+	check_err $?
+	cleanup
+
+        if [ $ret -ne 0 ]; then
+                echo -e ${RED}"FAIL: $TYPE"${NC}
+                return 1
+        fi
+        echo -e ${GREEN}"PASS: $TYPE"${NC}
+}
+
+test_ip6gretap()
+{
+	TYPE=ip6gretap
+	DEV_NS=ip6gretap00
+	DEV=ip6gretap11
+	ret=0
+
+	check $TYPE
+	config_device
+	add_ip6gretap_tunnel
+	attach_bpf $DEV ip6gretap_set_tunnel ip6gretap_get_tunnel
+	# underlay
+	ping6 $PING_ARG ::11
+	# overlay: ipv4 over ipv6
+	ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
+	ping $PING_ARG 10.1.1.100
+	check_err $?
+	# overlay: ipv6 over ipv6
+	ip netns exec at_ns0 ping6 $PING_ARG fc80::200
+	check_err $?
+	cleanup
+
+	if [ $ret -ne 0 ]; then
+                echo -e ${RED}"FAIL: $TYPE"${NC}
+                return 1
+        fi
+        echo -e ${GREEN}"PASS: $TYPE"${NC}
+}
+
+test_erspan()
+{
+	TYPE=erspan
+	DEV_NS=erspan00
+	DEV=erspan11
+	ret=0
+
+	check $TYPE
+	config_device
+	add_erspan_tunnel $1
+	attach_bpf $DEV erspan_set_tunnel erspan_get_tunnel
+	ping $PING_ARG 10.1.1.100
+	check_err $?
+	ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
+	check_err $?
+	cleanup
+
+	if [ $ret -ne 0 ]; then
+                echo -e ${RED}"FAIL: $TYPE"${NC}
+                return 1
+        fi
+        echo -e ${GREEN}"PASS: $TYPE"${NC}
+}
+
+test_ip6erspan()
+{
+	TYPE=ip6erspan
+	DEV_NS=ip6erspan00
+	DEV=ip6erspan11
+	ret=0
+
+	check $TYPE
+	config_device
+	add_ip6erspan_tunnel $1
+	attach_bpf $DEV ip4ip6erspan_set_tunnel ip4ip6erspan_get_tunnel
+	ping6 $PING_ARG ::11
+	ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
+	check_err $?
+	cleanup
+
+	if [ $ret -ne 0 ]; then
+                echo -e ${RED}"FAIL: $TYPE"${NC}
+                return 1
+        fi
+        echo -e ${GREEN}"PASS: $TYPE"${NC}
+}
+
+test_vxlan()
+{
+	TYPE=vxlan
+	DEV_NS=vxlan00
+	DEV=vxlan11
+	ret=0
+
+	check $TYPE
+	config_device
+	add_vxlan_tunnel
+	attach_bpf $DEV vxlan_set_tunnel vxlan_get_tunnel
+	ping $PING_ARG 10.1.1.100
+	check_err $?
+	ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
+	check_err $?
+	cleanup
+
+	if [ $ret -ne 0 ]; then
+                echo -e ${RED}"FAIL: $TYPE"${NC}
+                return 1
+        fi
+        echo -e ${GREEN}"PASS: $TYPE"${NC}
+}
+
+test_ip6vxlan()
+{
+	TYPE=vxlan
+	DEV_NS=ip6vxlan00
+	DEV=ip6vxlan11
+	ret=0
+
+	check $TYPE
+	config_device
+	add_ip6vxlan_tunnel
+	ip link set dev veth1 mtu 1500
+	attach_bpf $DEV ip6vxlan_set_tunnel ip6vxlan_get_tunnel
+	# underlay
+	ping6 $PING_ARG ::11
+	# ip4 over ip6
+	ping $PING_ARG 10.1.1.100
+	check_err $?
+	ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
+	check_err $?
+	cleanup
+
+	if [ $ret -ne 0 ]; then
+                echo -e ${RED}"FAIL: ip6$TYPE"${NC}
+                return 1
+        fi
+        echo -e ${GREEN}"PASS: ip6$TYPE"${NC}
+}
+
+test_geneve()
+{
+	TYPE=geneve
+	DEV_NS=geneve00
+	DEV=geneve11
+	ret=0
+
+	check $TYPE
+	config_device
+	add_geneve_tunnel
+	attach_bpf $DEV geneve_set_tunnel geneve_get_tunnel
+	ping $PING_ARG 10.1.1.100
+	check_err $?
+	ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
+	check_err $?
+	cleanup
+
+	if [ $ret -ne 0 ]; then
+                echo -e ${RED}"FAIL: $TYPE"${NC}
+                return 1
+        fi
+        echo -e ${GREEN}"PASS: $TYPE"${NC}
+}
+
+test_ip6geneve()
+{
+	TYPE=geneve
+	DEV_NS=ip6geneve00
+	DEV=ip6geneve11
+	ret=0
+
+	check $TYPE
+	config_device
+	add_ip6geneve_tunnel
+	attach_bpf $DEV ip6geneve_set_tunnel ip6geneve_get_tunnel
+	ping $PING_ARG 10.1.1.100
+	check_err $?
+	ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
+	check_err $?
+	cleanup
+
+	if [ $ret -ne 0 ]; then
+                echo -e ${RED}"FAIL: ip6$TYPE"${NC}
+                return 1
+        fi
+        echo -e ${GREEN}"PASS: ip6$TYPE"${NC}
+}
+
+test_ipip()
+{
+	TYPE=ipip
+	DEV_NS=ipip00
+	DEV=ipip11
+	ret=0
+
+	check $TYPE
+	config_device
+	add_ipip_tunnel
+	ip link set dev veth1 mtu 1500
+	attach_bpf $DEV ipip_set_tunnel ipip_get_tunnel
+	ping $PING_ARG 10.1.1.100
+	check_err $?
+	ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
+	check_err $?
+	cleanup
+
+	if [ $ret -ne 0 ]; then
+                echo -e ${RED}"FAIL: $TYPE"${NC}
+                return 1
+        fi
+        echo -e ${GREEN}"PASS: $TYPE"${NC}
+}
+
+test_ipip6()
+{
+	TYPE=ip6tnl
+	DEV_NS=ipip6tnl00
+	DEV=ipip6tnl11
+	ret=0
+
+	check $TYPE
+	config_device
+	add_ipip6tnl_tunnel
+	ip link set dev veth1 mtu 1500
+	attach_bpf $DEV ipip6_set_tunnel ipip6_get_tunnel
+	# underlay
+	ping6 $PING_ARG ::11
+	# ip4 over ip6
+	ping $PING_ARG 10.1.1.100
+	check_err $?
+	ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
+	check_err $?
+	cleanup
+
+	if [ $ret -ne 0 ]; then
+                echo -e ${RED}"FAIL: $TYPE"${NC}
+                return 1
+        fi
+        echo -e ${GREEN}"PASS: $TYPE"${NC}
+}
+
+setup_xfrm_tunnel()
+{
+	auth=0x$(printf '1%.0s' {1..40})
+	enc=0x$(printf '2%.0s' {1..32})
+	spi_in_to_out=0x1
+	spi_out_to_in=0x2
+	# at_ns0 namespace
+	# at_ns0 -> root
+	ip netns exec at_ns0 \
+		ip xfrm state add src 172.16.1.100 dst 172.16.1.200 proto esp \
+			spi $spi_in_to_out reqid 1 mode tunnel \
+			auth-trunc 'hmac(sha1)' $auth 96 enc 'cbc(aes)' $enc
+	ip netns exec at_ns0 \
+		ip xfrm policy add src 10.1.1.100/32 dst 10.1.1.200/32 dir out \
+		tmpl src 172.16.1.100 dst 172.16.1.200 proto esp reqid 1 \
+		mode tunnel
+	# root -> at_ns0
+	ip netns exec at_ns0 \
+		ip xfrm state add src 172.16.1.200 dst 172.16.1.100 proto esp \
+			spi $spi_out_to_in reqid 2 mode tunnel \
+			auth-trunc 'hmac(sha1)' $auth 96 enc 'cbc(aes)' $enc
+	ip netns exec at_ns0 \
+		ip xfrm policy add src 10.1.1.200/32 dst 10.1.1.100/32 dir in \
+		tmpl src 172.16.1.200 dst 172.16.1.100 proto esp reqid 2 \
+		mode tunnel
+	# address & route
+	ip netns exec at_ns0 \
+		ip addr add dev veth0 10.1.1.100/32
+	ip netns exec at_ns0 \
+		ip route add 10.1.1.200 dev veth0 via 172.16.1.200 \
+			src 10.1.1.100
+
+	# root namespace
+	# at_ns0 -> root
+	ip xfrm state add src 172.16.1.100 dst 172.16.1.200 proto esp \
+		spi $spi_in_to_out reqid 1 mode tunnel \
+		auth-trunc 'hmac(sha1)' $auth 96  enc 'cbc(aes)' $enc
+	ip xfrm policy add src 10.1.1.100/32 dst 10.1.1.200/32 dir in \
+		tmpl src 172.16.1.100 dst 172.16.1.200 proto esp reqid 1 \
+		mode tunnel
+	# root -> at_ns0
+	ip xfrm state add src 172.16.1.200 dst 172.16.1.100 proto esp \
+		spi $spi_out_to_in reqid 2 mode tunnel \
+		auth-trunc 'hmac(sha1)' $auth 96  enc 'cbc(aes)' $enc
+	ip xfrm policy add src 10.1.1.200/32 dst 10.1.1.100/32 dir out \
+		tmpl src 172.16.1.200 dst 172.16.1.100 proto esp reqid 2 \
+		mode tunnel
+	# address & route
+	ip addr add dev veth1 10.1.1.200/32
+	ip route add 10.1.1.100 dev veth1 via 172.16.1.100 src 10.1.1.200
+}
+
+test_xfrm_tunnel()
+{
+	config_device
+        #tcpdump -nei veth1 ip &
+	output=$(mktemp)
+	cat /sys/kernel/debug/tracing/trace_pipe | tee $output &
+        setup_xfrm_tunnel
+	tc qdisc add dev veth1 clsact
+	tc filter add dev veth1 proto ip ingress bpf da obj test_tunnel_kern.o \
+		sec xfrm_get_state
+	ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
+	sleep 1
+	grep "reqid 1" $output
+	check_err $?
+	grep "spi 0x1" $output
+	check_err $?
+	grep "remote ip 0xac100164" $output
+	check_err $?
+	cleanup
+
+	if [ $ret -ne 0 ]; then
+                echo -e ${RED}"FAIL: xfrm tunnel"${NC}
+                return 1
+        fi
+        echo -e ${GREEN}"PASS: xfrm tunnel"${NC}
+}
+
+attach_bpf()
+{
+	DEV=$1
+	SET=$2
+	GET=$3
+	tc qdisc add dev $DEV clsact
+	tc filter add dev $DEV egress bpf da obj test_tunnel_kern.o sec $SET
+	tc filter add dev $DEV ingress bpf da obj test_tunnel_kern.o sec $GET
+}
+
+cleanup()
+{
+	ip netns delete at_ns0 2> /dev/null
+	ip link del veth1 2> /dev/null
+	ip link del ipip11 2> /dev/null
+	ip link del ipip6tnl11 2> /dev/null
+	ip link del gretap11 2> /dev/null
+	ip link del ip6gre11 2> /dev/null
+	ip link del ip6gretap11 2> /dev/null
+	ip link del vxlan11 2> /dev/null
+	ip link del ip6vxlan11 2> /dev/null
+	ip link del geneve11 2> /dev/null
+	ip link del ip6geneve11 2> /dev/null
+	ip link del erspan11 2> /dev/null
+	ip link del ip6erspan11 2> /dev/null
+}
+
+cleanup_exit()
+{
+	echo "CATCH SIGKILL or SIGINT, cleanup and exit"
+	cleanup
+	exit 0
+}
+
+check()
+{
+	ip link help $1 2>&1 | grep -q "^Usage:"
+	if [ $? -ne 0 ];then
+		echo "SKIP $1: iproute2 not support"
+	cleanup
+	return 1
+	fi
+}
+
+enable_debug()
+{
+	echo 'file ip_gre.c +p' > /sys/kernel/debug/dynamic_debug/control
+	echo 'file ip6_gre.c +p' > /sys/kernel/debug/dynamic_debug/control
+	echo 'file vxlan.c +p' > /sys/kernel/debug/dynamic_debug/control
+	echo 'file geneve.c +p' > /sys/kernel/debug/dynamic_debug/control
+	echo 'file ipip.c +p' > /sys/kernel/debug/dynamic_debug/control
+}
+
+check_err()
+{
+	if [ $ret -eq 0 ]; then
+		ret=$1
+	fi
+}
+
+bpf_tunnel_test()
+{
+	echo "Testing GRE tunnel..."
+	test_gre
+	echo "Testing IP6GRE tunnel..."
+	test_ip6gre
+	echo "Testing IP6GRETAP tunnel..."
+	test_ip6gretap
+	echo "Testing ERSPAN tunnel..."
+	test_erspan v2
+	echo "Testing IP6ERSPAN tunnel..."
+	test_ip6erspan v2
+	echo "Testing VXLAN tunnel..."
+	test_vxlan
+	echo "Testing IP6VXLAN tunnel..."
+	test_ip6vxlan
+	echo "Testing GENEVE tunnel..."
+	test_geneve
+	echo "Testing IP6GENEVE tunnel..."
+	test_ip6geneve
+	echo "Testing IPIP tunnel..."
+	test_ipip
+	echo "Testing IPIP6 tunnel..."
+	test_ipip6
+	echo "Testing IPSec tunnel..."
+	test_xfrm_tunnel
+}
+
+trap cleanup 0 3 6
+trap cleanup_exit 2 9
+
+cleanup
+bpf_tunnel_test
+
+exit 0
diff --git a/tools/testing/selftests/bpf/test_tunnel_kern.c b/tools/testing/selftests/bpf/test_tunnel_kern.c
new file mode 100644
index 000000000000..504df69c83df
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tunnel_kern.c
@@ -0,0 +1,713 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2016 VMware
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stddef.h>
+#include <string.h>
+#include <arpa/inet.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/types.h>
+#include <linux/tcp.h>
+#include <linux/socket.h>
+#include <linux/pkt_cls.h>
+#include <linux/erspan.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define ERROR(ret) do {\
+		char fmt[] = "ERROR line:%d ret:%d\n";\
+		bpf_trace_printk(fmt, sizeof(fmt), __LINE__, ret); \
+	} while (0)
+
+int _version SEC("version") = 1;
+
+struct geneve_opt {
+	__be16	opt_class;
+	__u8	type;
+	__u8	length:5;
+	__u8	r3:1;
+	__u8	r2:1;
+	__u8	r1:1;
+	__u8	opt_data[8]; /* hard-coded to 8 byte */
+};
+
+struct vxlan_metadata {
+	__u32     gbp;
+};
+
+SEC("gre_set_tunnel")
+int _gre_set_tunnel(struct __sk_buff *skb)
+{
+	int ret;
+	struct bpf_tunnel_key key;
+
+	__builtin_memset(&key, 0x0, sizeof(key));
+	key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
+	key.tunnel_id = 2;
+	key.tunnel_tos = 0;
+	key.tunnel_ttl = 64;
+
+	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
+				     BPF_F_ZERO_CSUM_TX | BPF_F_SEQ_NUMBER);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	return TC_ACT_OK;
+}
+
+SEC("gre_get_tunnel")
+int _gre_get_tunnel(struct __sk_buff *skb)
+{
+	int ret;
+	struct bpf_tunnel_key key;
+	char fmt[] = "key %d remote ip 0x%x\n";
+
+	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	bpf_trace_printk(fmt, sizeof(fmt), key.tunnel_id, key.remote_ipv4);
+	return TC_ACT_OK;
+}
+
+SEC("ip6gretap_set_tunnel")
+int _ip6gretap_set_tunnel(struct __sk_buff *skb)
+{
+	struct bpf_tunnel_key key;
+	int ret;
+
+	__builtin_memset(&key, 0x0, sizeof(key));
+	key.remote_ipv6[3] = bpf_htonl(0x11); /* ::11 */
+	key.tunnel_id = 2;
+	key.tunnel_tos = 0;
+	key.tunnel_ttl = 64;
+	key.tunnel_label = 0xabcde;
+
+	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
+				     BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
+				     BPF_F_SEQ_NUMBER);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	return TC_ACT_OK;
+}
+
+SEC("ip6gretap_get_tunnel")
+int _ip6gretap_get_tunnel(struct __sk_buff *skb)
+{
+	char fmt[] = "key %d remote ip6 ::%x label %x\n";
+	struct bpf_tunnel_key key;
+	int ret;
+
+	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key),
+				     BPF_F_TUNINFO_IPV6);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	bpf_trace_printk(fmt, sizeof(fmt),
+			 key.tunnel_id, key.remote_ipv6[3], key.tunnel_label);
+
+	return TC_ACT_OK;
+}
+
+SEC("erspan_set_tunnel")
+int _erspan_set_tunnel(struct __sk_buff *skb)
+{
+	struct bpf_tunnel_key key;
+	struct erspan_metadata md;
+	int ret;
+
+	__builtin_memset(&key, 0x0, sizeof(key));
+	key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
+	key.tunnel_id = 2;
+	key.tunnel_tos = 0;
+	key.tunnel_ttl = 64;
+
+	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
+				     BPF_F_ZERO_CSUM_TX);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	__builtin_memset(&md, 0, sizeof(md));
+#ifdef ERSPAN_V1
+	md.version = 1;
+	md.u.index = bpf_htonl(123);
+#else
+	__u8 direction = 1;
+	__u8 hwid = 7;
+
+	md.version = 2;
+	md.u.md2.dir = direction;
+	md.u.md2.hwid = hwid & 0xf;
+	md.u.md2.hwid_upper = (hwid >> 4) & 0x3;
+#endif
+
+	ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md));
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	return TC_ACT_OK;
+}
+
+SEC("erspan_get_tunnel")
+int _erspan_get_tunnel(struct __sk_buff *skb)
+{
+	char fmt[] = "key %d remote ip 0x%x erspan version %d\n";
+	struct bpf_tunnel_key key;
+	struct erspan_metadata md;
+	__u32 index;
+	int ret;
+
+	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	ret = bpf_skb_get_tunnel_opt(skb, &md, sizeof(md));
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	bpf_trace_printk(fmt, sizeof(fmt),
+			key.tunnel_id, key.remote_ipv4, md.version);
+
+#ifdef ERSPAN_V1
+	char fmt2[] = "\tindex %x\n";
+
+	index = bpf_ntohl(md.u.index);
+	bpf_trace_printk(fmt2, sizeof(fmt2), index);
+#else
+	char fmt2[] = "\tdirection %d hwid %x timestamp %u\n";
+
+	bpf_trace_printk(fmt2, sizeof(fmt2),
+			 md.u.md2.dir,
+			 (md.u.md2.hwid_upper << 4) + md.u.md2.hwid,
+			 bpf_ntohl(md.u.md2.timestamp));
+#endif
+
+	return TC_ACT_OK;
+}
+
+SEC("ip4ip6erspan_set_tunnel")
+int _ip4ip6erspan_set_tunnel(struct __sk_buff *skb)
+{
+	struct bpf_tunnel_key key;
+	struct erspan_metadata md;
+	int ret;
+
+	__builtin_memset(&key, 0x0, sizeof(key));
+	key.remote_ipv6[3] = bpf_htonl(0x11);
+	key.tunnel_id = 2;
+	key.tunnel_tos = 0;
+	key.tunnel_ttl = 64;
+
+	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
+				     BPF_F_TUNINFO_IPV6);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	__builtin_memset(&md, 0, sizeof(md));
+
+#ifdef ERSPAN_V1
+	md.u.index = bpf_htonl(123);
+	md.version = 1;
+#else
+	__u8 direction = 0;
+	__u8 hwid = 17;
+
+	md.version = 2;
+	md.u.md2.dir = direction;
+	md.u.md2.hwid = hwid & 0xf;
+	md.u.md2.hwid_upper = (hwid >> 4) & 0x3;
+#endif
+
+	ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md));
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	return TC_ACT_OK;
+}
+
+SEC("ip4ip6erspan_get_tunnel")
+int _ip4ip6erspan_get_tunnel(struct __sk_buff *skb)
+{
+	char fmt[] = "ip6erspan get key %d remote ip6 ::%x erspan version %d\n";
+	struct bpf_tunnel_key key;
+	struct erspan_metadata md;
+	__u32 index;
+	int ret;
+
+	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key),
+				     BPF_F_TUNINFO_IPV6);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	ret = bpf_skb_get_tunnel_opt(skb, &md, sizeof(md));
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	bpf_trace_printk(fmt, sizeof(fmt),
+			key.tunnel_id, key.remote_ipv4, md.version);
+
+#ifdef ERSPAN_V1
+	char fmt2[] = "\tindex %x\n";
+
+	index = bpf_ntohl(md.u.index);
+	bpf_trace_printk(fmt2, sizeof(fmt2), index);
+#else
+	char fmt2[] = "\tdirection %d hwid %x timestamp %u\n";
+
+	bpf_trace_printk(fmt2, sizeof(fmt2),
+			 md.u.md2.dir,
+			 (md.u.md2.hwid_upper << 4) + md.u.md2.hwid,
+			 bpf_ntohl(md.u.md2.timestamp));
+#endif
+
+	return TC_ACT_OK;
+}
+
+SEC("vxlan_set_tunnel")
+int _vxlan_set_tunnel(struct __sk_buff *skb)
+{
+	int ret;
+	struct bpf_tunnel_key key;
+	struct vxlan_metadata md;
+
+	__builtin_memset(&key, 0x0, sizeof(key));
+	key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
+	key.tunnel_id = 2;
+	key.tunnel_tos = 0;
+	key.tunnel_ttl = 64;
+
+	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
+				     BPF_F_ZERO_CSUM_TX);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	md.gbp = 0x800FF; /* Set VXLAN Group Policy extension */
+	ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md));
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	return TC_ACT_OK;
+}
+
+SEC("vxlan_get_tunnel")
+int _vxlan_get_tunnel(struct __sk_buff *skb)
+{
+	int ret;
+	struct bpf_tunnel_key key;
+	struct vxlan_metadata md;
+	char fmt[] = "key %d remote ip 0x%x vxlan gbp 0x%x\n";
+
+	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	ret = bpf_skb_get_tunnel_opt(skb, &md, sizeof(md));
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	bpf_trace_printk(fmt, sizeof(fmt),
+			key.tunnel_id, key.remote_ipv4, md.gbp);
+
+	return TC_ACT_OK;
+}
+
+SEC("ip6vxlan_set_tunnel")
+int _ip6vxlan_set_tunnel(struct __sk_buff *skb)
+{
+	struct bpf_tunnel_key key;
+	int ret;
+
+	__builtin_memset(&key, 0x0, sizeof(key));
+	key.remote_ipv6[3] = bpf_htonl(0x11); /* ::11 */
+	key.tunnel_id = 22;
+	key.tunnel_tos = 0;
+	key.tunnel_ttl = 64;
+
+	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
+				     BPF_F_TUNINFO_IPV6);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	return TC_ACT_OK;
+}
+
+SEC("ip6vxlan_get_tunnel")
+int _ip6vxlan_get_tunnel(struct __sk_buff *skb)
+{
+	char fmt[] = "key %d remote ip6 ::%x label %x\n";
+	struct bpf_tunnel_key key;
+	int ret;
+
+	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key),
+				     BPF_F_TUNINFO_IPV6);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	bpf_trace_printk(fmt, sizeof(fmt),
+			 key.tunnel_id, key.remote_ipv6[3], key.tunnel_label);
+
+	return TC_ACT_OK;
+}
+
+SEC("geneve_set_tunnel")
+int _geneve_set_tunnel(struct __sk_buff *skb)
+{
+	int ret, ret2;
+	struct bpf_tunnel_key key;
+	struct geneve_opt gopt;
+
+	__builtin_memset(&key, 0x0, sizeof(key));
+	key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
+	key.tunnel_id = 2;
+	key.tunnel_tos = 0;
+	key.tunnel_ttl = 64;
+
+	__builtin_memset(&gopt, 0x0, sizeof(gopt));
+	gopt.opt_class = bpf_htons(0x102); /* Open Virtual Networking (OVN) */
+	gopt.type = 0x08;
+	gopt.r1 = 0;
+	gopt.r2 = 0;
+	gopt.r3 = 0;
+	gopt.length = 2; /* 4-byte multiple */
+	*(int *) &gopt.opt_data = bpf_htonl(0xdeadbeef);
+
+	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
+				     BPF_F_ZERO_CSUM_TX);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	ret = bpf_skb_set_tunnel_opt(skb, &gopt, sizeof(gopt));
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	return TC_ACT_OK;
+}
+
+SEC("geneve_get_tunnel")
+int _geneve_get_tunnel(struct __sk_buff *skb)
+{
+	int ret;
+	struct bpf_tunnel_key key;
+	struct geneve_opt gopt;
+	char fmt[] = "key %d remote ip 0x%x geneve class 0x%x\n";
+
+	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	ret = bpf_skb_get_tunnel_opt(skb, &gopt, sizeof(gopt));
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	bpf_trace_printk(fmt, sizeof(fmt),
+			key.tunnel_id, key.remote_ipv4, gopt.opt_class);
+	return TC_ACT_OK;
+}
+
+SEC("ip6geneve_set_tunnel")
+int _ip6geneve_set_tunnel(struct __sk_buff *skb)
+{
+	struct bpf_tunnel_key key;
+	struct geneve_opt gopt;
+	int ret;
+
+	__builtin_memset(&key, 0x0, sizeof(key));
+	key.remote_ipv6[3] = bpf_htonl(0x11); /* ::11 */
+	key.tunnel_id = 22;
+	key.tunnel_tos = 0;
+	key.tunnel_ttl = 64;
+
+	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
+				     BPF_F_TUNINFO_IPV6);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	__builtin_memset(&gopt, 0x0, sizeof(gopt));
+	gopt.opt_class = bpf_htons(0x102); /* Open Virtual Networking (OVN) */
+	gopt.type = 0x08;
+	gopt.r1 = 0;
+	gopt.r2 = 0;
+	gopt.r3 = 0;
+	gopt.length = 2; /* 4-byte multiple */
+	*(int *) &gopt.opt_data = bpf_htonl(0xfeedbeef);
+
+	ret = bpf_skb_set_tunnel_opt(skb, &gopt, sizeof(gopt));
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	return TC_ACT_OK;
+}
+
+SEC("ip6geneve_get_tunnel")
+int _ip6geneve_get_tunnel(struct __sk_buff *skb)
+{
+	char fmt[] = "key %d remote ip 0x%x geneve class 0x%x\n";
+	struct bpf_tunnel_key key;
+	struct geneve_opt gopt;
+	int ret;
+
+	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key),
+				     BPF_F_TUNINFO_IPV6);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	ret = bpf_skb_get_tunnel_opt(skb, &gopt, sizeof(gopt));
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	bpf_trace_printk(fmt, sizeof(fmt),
+			key.tunnel_id, key.remote_ipv4, gopt.opt_class);
+
+	return TC_ACT_OK;
+}
+
+SEC("ipip_set_tunnel")
+int _ipip_set_tunnel(struct __sk_buff *skb)
+{
+	struct bpf_tunnel_key key = {};
+	void *data = (void *)(long)skb->data;
+	struct iphdr *iph = data;
+	struct tcphdr *tcp = data + sizeof(*iph);
+	void *data_end = (void *)(long)skb->data_end;
+	int ret;
+
+	/* single length check */
+	if (data + sizeof(*iph) + sizeof(*tcp) > data_end) {
+		ERROR(1);
+		return TC_ACT_SHOT;
+	}
+
+	key.tunnel_ttl = 64;
+	if (iph->protocol == IPPROTO_ICMP) {
+		key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
+	} else {
+		if (iph->protocol != IPPROTO_TCP || iph->ihl != 5)
+			return TC_ACT_SHOT;
+
+		if (tcp->dest == bpf_htons(5200))
+			key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
+		else if (tcp->dest == bpf_htons(5201))
+			key.remote_ipv4 = 0xac100165; /* 172.16.1.101 */
+		else
+			return TC_ACT_SHOT;
+	}
+
+	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	return TC_ACT_OK;
+}
+
+SEC("ipip_get_tunnel")
+int _ipip_get_tunnel(struct __sk_buff *skb)
+{
+	int ret;
+	struct bpf_tunnel_key key;
+	char fmt[] = "remote ip 0x%x\n";
+
+	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	bpf_trace_printk(fmt, sizeof(fmt), key.remote_ipv4);
+	return TC_ACT_OK;
+}
+
+SEC("ipip6_set_tunnel")
+int _ipip6_set_tunnel(struct __sk_buff *skb)
+{
+	struct bpf_tunnel_key key = {};
+	void *data = (void *)(long)skb->data;
+	struct iphdr *iph = data;
+	struct tcphdr *tcp = data + sizeof(*iph);
+	void *data_end = (void *)(long)skb->data_end;
+	int ret;
+
+	/* single length check */
+	if (data + sizeof(*iph) + sizeof(*tcp) > data_end) {
+		ERROR(1);
+		return TC_ACT_SHOT;
+	}
+
+	__builtin_memset(&key, 0x0, sizeof(key));
+	key.remote_ipv6[3] = bpf_htonl(0x11); /* ::11 */
+	key.tunnel_ttl = 64;
+
+	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
+				     BPF_F_TUNINFO_IPV6);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	return TC_ACT_OK;
+}
+
+SEC("ipip6_get_tunnel")
+int _ipip6_get_tunnel(struct __sk_buff *skb)
+{
+	int ret;
+	struct bpf_tunnel_key key;
+	char fmt[] = "remote ip6 %x::%x\n";
+
+	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key),
+				     BPF_F_TUNINFO_IPV6);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	bpf_trace_printk(fmt, sizeof(fmt), bpf_htonl(key.remote_ipv6[0]),
+			 bpf_htonl(key.remote_ipv6[3]));
+	return TC_ACT_OK;
+}
+
+SEC("ip6ip6_set_tunnel")
+int _ip6ip6_set_tunnel(struct __sk_buff *skb)
+{
+	struct bpf_tunnel_key key = {};
+	void *data = (void *)(long)skb->data;
+	struct ipv6hdr *iph = data;
+	struct tcphdr *tcp = data + sizeof(*iph);
+	void *data_end = (void *)(long)skb->data_end;
+	int ret;
+
+	/* single length check */
+	if (data + sizeof(*iph) + sizeof(*tcp) > data_end) {
+		ERROR(1);
+		return TC_ACT_SHOT;
+	}
+
+	key.remote_ipv6[0] = bpf_htonl(0x2401db00);
+	key.tunnel_ttl = 64;
+
+	if (iph->nexthdr == 58 /* NEXTHDR_ICMP */) {
+		key.remote_ipv6[3] = bpf_htonl(1);
+	} else {
+		if (iph->nexthdr != 6 /* NEXTHDR_TCP */) {
+			ERROR(iph->nexthdr);
+			return TC_ACT_SHOT;
+		}
+
+		if (tcp->dest == bpf_htons(5200)) {
+			key.remote_ipv6[3] = bpf_htonl(1);
+		} else if (tcp->dest == bpf_htons(5201)) {
+			key.remote_ipv6[3] = bpf_htonl(2);
+		} else {
+			ERROR(tcp->dest);
+			return TC_ACT_SHOT;
+		}
+	}
+
+	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
+				     BPF_F_TUNINFO_IPV6);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	return TC_ACT_OK;
+}
+
+SEC("ip6ip6_get_tunnel")
+int _ip6ip6_get_tunnel(struct __sk_buff *skb)
+{
+	int ret;
+	struct bpf_tunnel_key key;
+	char fmt[] = "remote ip6 %x::%x\n";
+
+	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key),
+				     BPF_F_TUNINFO_IPV6);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	bpf_trace_printk(fmt, sizeof(fmt), bpf_htonl(key.remote_ipv6[0]),
+			 bpf_htonl(key.remote_ipv6[3]));
+	return TC_ACT_OK;
+}
+
+SEC("xfrm_get_state")
+int _xfrm_get_state(struct __sk_buff *skb)
+{
+	struct bpf_xfrm_state x;
+	char fmt[] = "reqid %d spi 0x%x remote ip 0x%x\n";
+	int ret;
+
+	ret = bpf_skb_get_xfrm_state(skb, 0, &x, sizeof(x), 0);
+	if (ret < 0)
+		return TC_ACT_OK;
+
+	bpf_trace_printk(fmt, sizeof(fmt), x.reqid, bpf_ntohl(x.spi),
+			 bpf_ntohl(x.remote_ipv4));
+	return TC_ACT_OK;
+}
+
+char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3-59-g8ed1b


From b05cd74043236e7427a71fc7c59deee4588a7c91 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Thu, 26 Apr 2018 14:01:40 -0700
Subject: samples/bpf: remove the bpf tunnel testsuite.

Move the testsuite to
selftests/bpf/{test_tunnel_kern.c, test_tunnel.sh}

Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/bpf/Makefile           |   1 -
 samples/bpf/tcbpf2_kern.c      | 612 -----------------------------------------
 samples/bpf/test_tunnel_bpf.sh | 390 --------------------------
 3 files changed, 1003 deletions(-)
 delete mode 100644 samples/bpf/tcbpf2_kern.c
 delete mode 100755 samples/bpf/test_tunnel_bpf.sh

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index aa8c392e2e52..b853581592fd 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -114,7 +114,6 @@ always += sock_flags_kern.o
 always += test_probe_write_user_kern.o
 always += trace_output_kern.o
 always += tcbpf1_kern.o
-always += tcbpf2_kern.o
 always += tc_l2_redirect_kern.o
 always += lathist_kern.o
 always += offwaketime_kern.o
diff --git a/samples/bpf/tcbpf2_kern.c b/samples/bpf/tcbpf2_kern.c
deleted file mode 100644
index fa260c750fb1..000000000000
--- a/samples/bpf/tcbpf2_kern.c
+++ /dev/null
@@ -1,612 +0,0 @@
-/* Copyright (c) 2016 VMware
- * Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- */
-#define KBUILD_MODNAME "foo"
-#include <uapi/linux/bpf.h>
-#include <uapi/linux/if_ether.h>
-#include <uapi/linux/if_packet.h>
-#include <uapi/linux/ip.h>
-#include <uapi/linux/ipv6.h>
-#include <uapi/linux/in.h>
-#include <uapi/linux/tcp.h>
-#include <uapi/linux/filter.h>
-#include <uapi/linux/pkt_cls.h>
-#include <uapi/linux/erspan.h>
-#include <net/ipv6.h>
-#include "bpf_helpers.h"
-#include "bpf_endian.h"
-
-#define _htonl __builtin_bswap32
-#define ERROR(ret) do {\
-		char fmt[] = "ERROR line:%d ret:%d\n";\
-		bpf_trace_printk(fmt, sizeof(fmt), __LINE__, ret); \
-	} while(0)
-
-struct geneve_opt {
-	__be16	opt_class;
-	u8	type;
-	u8	length:5;
-	u8	r3:1;
-	u8	r2:1;
-	u8	r1:1;
-	u8	opt_data[8]; /* hard-coded to 8 byte */
-};
-
-struct vxlan_metadata {
-	u32     gbp;
-};
-
-SEC("gre_set_tunnel")
-int _gre_set_tunnel(struct __sk_buff *skb)
-{
-	int ret;
-	struct bpf_tunnel_key key;
-
-	__builtin_memset(&key, 0x0, sizeof(key));
-	key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
-	key.tunnel_id = 2;
-	key.tunnel_tos = 0;
-	key.tunnel_ttl = 64;
-
-	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
-				     BPF_F_ZERO_CSUM_TX | BPF_F_SEQ_NUMBER);
-	if (ret < 0) {
-		ERROR(ret);
-		return TC_ACT_SHOT;
-	}
-
-	return TC_ACT_OK;
-}
-
-SEC("gre_get_tunnel")
-int _gre_get_tunnel(struct __sk_buff *skb)
-{
-	int ret;
-	struct bpf_tunnel_key key;
-	char fmt[] = "key %d remote ip 0x%x\n";
-
-	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
-	if (ret < 0) {
-		ERROR(ret);
-		return TC_ACT_SHOT;
-	}
-
-	bpf_trace_printk(fmt, sizeof(fmt), key.tunnel_id, key.remote_ipv4);
-	return TC_ACT_OK;
-}
-
-SEC("ip6gretap_set_tunnel")
-int _ip6gretap_set_tunnel(struct __sk_buff *skb)
-{
-	struct bpf_tunnel_key key;
-	int ret;
-
-	__builtin_memset(&key, 0x0, sizeof(key));
-	key.remote_ipv6[3] = _htonl(0x11); /* ::11 */
-	key.tunnel_id = 2;
-	key.tunnel_tos = 0;
-	key.tunnel_ttl = 64;
-	key.tunnel_label = 0xabcde;
-
-	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
-				     BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
-				     BPF_F_SEQ_NUMBER);
-	if (ret < 0) {
-		ERROR(ret);
-		return TC_ACT_SHOT;
-	}
-
-	return TC_ACT_OK;
-}
-
-SEC("ip6gretap_get_tunnel")
-int _ip6gretap_get_tunnel(struct __sk_buff *skb)
-{
-	char fmt[] = "key %d remote ip6 ::%x label %x\n";
-	struct bpf_tunnel_key key;
-	int ret;
-
-	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key),
-				     BPF_F_TUNINFO_IPV6);
-	if (ret < 0) {
-		ERROR(ret);
-		return TC_ACT_SHOT;
-	}
-
-	bpf_trace_printk(fmt, sizeof(fmt),
-			 key.tunnel_id, key.remote_ipv6[3], key.tunnel_label);
-
-	return TC_ACT_OK;
-}
-
-SEC("erspan_set_tunnel")
-int _erspan_set_tunnel(struct __sk_buff *skb)
-{
-	struct bpf_tunnel_key key;
-	struct erspan_metadata md;
-	int ret;
-
-	__builtin_memset(&key, 0x0, sizeof(key));
-	key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
-	key.tunnel_id = 2;
-	key.tunnel_tos = 0;
-	key.tunnel_ttl = 64;
-
-	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_ZERO_CSUM_TX);
-	if (ret < 0) {
-		ERROR(ret);
-		return TC_ACT_SHOT;
-	}
-
-	__builtin_memset(&md, 0, sizeof(md));
-#ifdef ERSPAN_V1
-	md.version = 1;
-	md.u.index = bpf_htonl(123);
-#else
-	u8 direction = 1;
-	u8 hwid = 7;
-
-	md.version = 2;
-	md.u.md2.dir = direction;
-	md.u.md2.hwid = hwid & 0xf;
-	md.u.md2.hwid_upper = (hwid >> 4) & 0x3;
-#endif
-
-	ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md));
-	if (ret < 0) {
-		ERROR(ret);
-		return TC_ACT_SHOT;
-	}
-
-	return TC_ACT_OK;
-}
-
-SEC("erspan_get_tunnel")
-int _erspan_get_tunnel(struct __sk_buff *skb)
-{
-	char fmt[] = "key %d remote ip 0x%x erspan version %d\n";
-	struct bpf_tunnel_key key;
-	struct erspan_metadata md;
-	u32 index;
-	int ret;
-
-	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
-	if (ret < 0) {
-		ERROR(ret);
-		return TC_ACT_SHOT;
-	}
-
-	ret = bpf_skb_get_tunnel_opt(skb, &md, sizeof(md));
-	if (ret < 0) {
-		ERROR(ret);
-		return TC_ACT_SHOT;
-	}
-
-	bpf_trace_printk(fmt, sizeof(fmt),
-			key.tunnel_id, key.remote_ipv4, md.version);
-
-#ifdef ERSPAN_V1
-	char fmt2[] = "\tindex %x\n";
-
-	index = bpf_ntohl(md.u.index);
-	bpf_trace_printk(fmt2, sizeof(fmt2), index);
-#else
-	char fmt2[] = "\tdirection %d hwid %x timestamp %u\n";
-
-	bpf_trace_printk(fmt2, sizeof(fmt2),
-			 md.u.md2.dir,
-			 (md.u.md2.hwid_upper << 4) + md.u.md2.hwid,
-			 bpf_ntohl(md.u.md2.timestamp));
-#endif
-
-	return TC_ACT_OK;
-}
-
-SEC("ip4ip6erspan_set_tunnel")
-int _ip4ip6erspan_set_tunnel(struct __sk_buff *skb)
-{
-	struct bpf_tunnel_key key;
-	struct erspan_metadata md;
-	int ret;
-
-	__builtin_memset(&key, 0x0, sizeof(key));
-	key.remote_ipv6[3] = _htonl(0x11);
-	key.tunnel_id = 2;
-	key.tunnel_tos = 0;
-	key.tunnel_ttl = 64;
-
-	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
-				     BPF_F_TUNINFO_IPV6);
-	if (ret < 0) {
-		ERROR(ret);
-		return TC_ACT_SHOT;
-	}
-
-	__builtin_memset(&md, 0, sizeof(md));
-
-#ifdef ERSPAN_V1
-	md.u.index = htonl(123);
-	md.version = 1;
-#else
-	u8 direction = 0;
-	u8 hwid = 17;
-
-	md.version = 2;
-	md.u.md2.dir = direction;
-	md.u.md2.hwid = hwid & 0xf;
-	md.u.md2.hwid_upper = (hwid >> 4) & 0x3;
-#endif
-
-	ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md));
-	if (ret < 0) {
-		ERROR(ret);
-		return TC_ACT_SHOT;
-	}
-
-	return TC_ACT_OK;
-}
-
-SEC("ip4ip6erspan_get_tunnel")
-int _ip4ip6erspan_get_tunnel(struct __sk_buff *skb)
-{
-	char fmt[] = "ip6erspan get key %d remote ip6 ::%x erspan version %d\n";
-	struct bpf_tunnel_key key;
-	struct erspan_metadata md;
-	u32 index;
-	int ret;
-
-	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6);
-	if (ret < 0) {
-		ERROR(ret);
-		return TC_ACT_SHOT;
-	}
-
-	ret = bpf_skb_get_tunnel_opt(skb, &md, sizeof(md));
-	if (ret < 0) {
-		ERROR(ret);
-		return TC_ACT_SHOT;
-	}
-
-	bpf_trace_printk(fmt, sizeof(fmt),
-			key.tunnel_id, key.remote_ipv4, md.version);
-
-#ifdef ERSPAN_V1
-	char fmt2[] = "\tindex %x\n";
-
-	index = bpf_ntohl(md.u.index);
-	bpf_trace_printk(fmt2, sizeof(fmt2), index);
-#else
-	char fmt2[] = "\tdirection %d hwid %x timestamp %u\n";
-
-	bpf_trace_printk(fmt2, sizeof(fmt2),
-			 md.u.md2.dir,
-			 (md.u.md2.hwid_upper << 4) + md.u.md2.hwid,
-			 bpf_ntohl(md.u.md2.timestamp));
-#endif
-
-	return TC_ACT_OK;
-}
-
-SEC("vxlan_set_tunnel")
-int _vxlan_set_tunnel(struct __sk_buff *skb)
-{
-	int ret;
-	struct bpf_tunnel_key key;
-	struct vxlan_metadata md;
-
-	__builtin_memset(&key, 0x0, sizeof(key));
-	key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
-	key.tunnel_id = 2;
-	key.tunnel_tos = 0;
-	key.tunnel_ttl = 64;
-
-	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_ZERO_CSUM_TX);
-	if (ret < 0) {
-		ERROR(ret);
-		return TC_ACT_SHOT;
-	}
-
-	md.gbp = 0x800FF; /* Set VXLAN Group Policy extension */
-	ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md));
-	if (ret < 0) {
-		ERROR(ret);
-		return TC_ACT_SHOT;
-	}
-
-	return TC_ACT_OK;
-}
-
-SEC("vxlan_get_tunnel")
-int _vxlan_get_tunnel(struct __sk_buff *skb)
-{
-	int ret;
-	struct bpf_tunnel_key key;
-	struct vxlan_metadata md;
-	char fmt[] = "key %d remote ip 0x%x vxlan gbp 0x%x\n";
-
-	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
-	if (ret < 0) {
-		ERROR(ret);
-		return TC_ACT_SHOT;
-	}
-
-	ret = bpf_skb_get_tunnel_opt(skb, &md, sizeof(md));
-	if (ret < 0) {
-		ERROR(ret);
-		return TC_ACT_SHOT;
-	}
-
-	bpf_trace_printk(fmt, sizeof(fmt),
-			key.tunnel_id, key.remote_ipv4, md.gbp);
-
-	return TC_ACT_OK;
-}
-
-SEC("geneve_set_tunnel")
-int _geneve_set_tunnel(struct __sk_buff *skb)
-{
-	int ret, ret2;
-	struct bpf_tunnel_key key;
-	struct geneve_opt gopt;
-
-	__builtin_memset(&key, 0x0, sizeof(key));
-	key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
-	key.tunnel_id = 2;
-	key.tunnel_tos = 0;
-	key.tunnel_ttl = 64;
-
-	__builtin_memset(&gopt, 0x0, sizeof(gopt));
-	gopt.opt_class = 0x102; /* Open Virtual Networking (OVN) */
-	gopt.type = 0x08;
-	gopt.r1 = 0;
-	gopt.r2 = 0;
-	gopt.r3 = 0;
-	gopt.length = 2; /* 4-byte multiple */
-	*(int *) &gopt.opt_data = 0xdeadbeef;
-
-	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_ZERO_CSUM_TX);
-	if (ret < 0) {
-		ERROR(ret);
-		return TC_ACT_SHOT;
-	}
-
-	ret = bpf_skb_set_tunnel_opt(skb, &gopt, sizeof(gopt));
-	if (ret < 0) {
-		ERROR(ret);
-		return TC_ACT_SHOT;
-	}
-
-	return TC_ACT_OK;
-}
-
-SEC("geneve_get_tunnel")
-int _geneve_get_tunnel(struct __sk_buff *skb)
-{
-	int ret;
-	struct bpf_tunnel_key key;
-	struct geneve_opt gopt;
-	char fmt[] = "key %d remote ip 0x%x geneve class 0x%x\n";
-
-	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
-	if (ret < 0) {
-		ERROR(ret);
-		return TC_ACT_SHOT;
-	}
-
-	ret = bpf_skb_get_tunnel_opt(skb, &gopt, sizeof(gopt));
-	if (ret < 0) {
-		ERROR(ret);
-		return TC_ACT_SHOT;
-	}
-
-	bpf_trace_printk(fmt, sizeof(fmt),
-			key.tunnel_id, key.remote_ipv4, gopt.opt_class);
-	return TC_ACT_OK;
-}
-
-SEC("ipip_set_tunnel")
-int _ipip_set_tunnel(struct __sk_buff *skb)
-{
-	struct bpf_tunnel_key key = {};
-	void *data = (void *)(long)skb->data;
-	struct iphdr *iph = data;
-	struct tcphdr *tcp = data + sizeof(*iph);
-	void *data_end = (void *)(long)skb->data_end;
-	int ret;
-
-	/* single length check */
-	if (data + sizeof(*iph) + sizeof(*tcp) > data_end) {
-		ERROR(1);
-		return TC_ACT_SHOT;
-	}
-
-	key.tunnel_ttl = 64;
-	if (iph->protocol == IPPROTO_ICMP) {
-		key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
-	} else {
-		if (iph->protocol != IPPROTO_TCP || iph->ihl != 5)
-			return TC_ACT_SHOT;
-
-		if (tcp->dest == htons(5200))
-			key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
-		else if (tcp->dest == htons(5201))
-			key.remote_ipv4 = 0xac100165; /* 172.16.1.101 */
-		else
-			return TC_ACT_SHOT;
-	}
-
-	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0);
-	if (ret < 0) {
-		ERROR(ret);
-		return TC_ACT_SHOT;
-	}
-
-	return TC_ACT_OK;
-}
-
-SEC("ipip_get_tunnel")
-int _ipip_get_tunnel(struct __sk_buff *skb)
-{
-	int ret;
-	struct bpf_tunnel_key key;
-	char fmt[] = "remote ip 0x%x\n";
-
-	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
-	if (ret < 0) {
-		ERROR(ret);
-		return TC_ACT_SHOT;
-	}
-
-	bpf_trace_printk(fmt, sizeof(fmt), key.remote_ipv4);
-	return TC_ACT_OK;
-}
-
-SEC("ipip6_set_tunnel")
-int _ipip6_set_tunnel(struct __sk_buff *skb)
-{
-	struct bpf_tunnel_key key = {};
-	void *data = (void *)(long)skb->data;
-	struct iphdr *iph = data;
-	struct tcphdr *tcp = data + sizeof(*iph);
-	void *data_end = (void *)(long)skb->data_end;
-	int ret;
-
-	/* single length check */
-	if (data + sizeof(*iph) + sizeof(*tcp) > data_end) {
-		ERROR(1);
-		return TC_ACT_SHOT;
-	}
-
-	key.remote_ipv6[0] = _htonl(0x2401db00);
-	key.tunnel_ttl = 64;
-
-	if (iph->protocol == IPPROTO_ICMP) {
-		key.remote_ipv6[3] = _htonl(1);
-	} else {
-		if (iph->protocol != IPPROTO_TCP || iph->ihl != 5) {
-			ERROR(iph->protocol);
-			return TC_ACT_SHOT;
-		}
-
-		if (tcp->dest == htons(5200)) {
-			key.remote_ipv6[3] = _htonl(1);
-		} else if (tcp->dest == htons(5201)) {
-			key.remote_ipv6[3] = _htonl(2);
-		} else {
-			ERROR(tcp->dest);
-			return TC_ACT_SHOT;
-		}
-	}
-
-	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6);
-	if (ret < 0) {
-		ERROR(ret);
-		return TC_ACT_SHOT;
-	}
-
-	return TC_ACT_OK;
-}
-
-SEC("ipip6_get_tunnel")
-int _ipip6_get_tunnel(struct __sk_buff *skb)
-{
-	int ret;
-	struct bpf_tunnel_key key;
-	char fmt[] = "remote ip6 %x::%x\n";
-
-	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6);
-	if (ret < 0) {
-		ERROR(ret);
-		return TC_ACT_SHOT;
-	}
-
-	bpf_trace_printk(fmt, sizeof(fmt), _htonl(key.remote_ipv6[0]),
-			 _htonl(key.remote_ipv6[3]));
-	return TC_ACT_OK;
-}
-
-SEC("ip6ip6_set_tunnel")
-int _ip6ip6_set_tunnel(struct __sk_buff *skb)
-{
-	struct bpf_tunnel_key key = {};
-	void *data = (void *)(long)skb->data;
-	struct ipv6hdr *iph = data;
-	struct tcphdr *tcp = data + sizeof(*iph);
-	void *data_end = (void *)(long)skb->data_end;
-	int ret;
-
-	/* single length check */
-	if (data + sizeof(*iph) + sizeof(*tcp) > data_end) {
-		ERROR(1);
-		return TC_ACT_SHOT;
-	}
-
-	key.remote_ipv6[0] = _htonl(0x2401db00);
-	key.tunnel_ttl = 64;
-
-	if (iph->nexthdr == NEXTHDR_ICMP) {
-		key.remote_ipv6[3] = _htonl(1);
-	} else {
-		if (iph->nexthdr != NEXTHDR_TCP) {
-			ERROR(iph->nexthdr);
-			return TC_ACT_SHOT;
-		}
-
-		if (tcp->dest == htons(5200)) {
-			key.remote_ipv6[3] = _htonl(1);
-		} else if (tcp->dest == htons(5201)) {
-			key.remote_ipv6[3] = _htonl(2);
-		} else {
-			ERROR(tcp->dest);
-			return TC_ACT_SHOT;
-		}
-	}
-
-	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6);
-	if (ret < 0) {
-		ERROR(ret);
-		return TC_ACT_SHOT;
-	}
-
-	return TC_ACT_OK;
-}
-
-SEC("ip6ip6_get_tunnel")
-int _ip6ip6_get_tunnel(struct __sk_buff *skb)
-{
-	int ret;
-	struct bpf_tunnel_key key;
-	char fmt[] = "remote ip6 %x::%x\n";
-
-	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6);
-	if (ret < 0) {
-		ERROR(ret);
-		return TC_ACT_SHOT;
-	}
-
-	bpf_trace_printk(fmt, sizeof(fmt), _htonl(key.remote_ipv6[0]),
-			 _htonl(key.remote_ipv6[3]));
-	return TC_ACT_OK;
-}
-
-SEC("xfrm_get_state")
-int _xfrm_get_state(struct __sk_buff *skb)
-{
-	struct bpf_xfrm_state x;
-	char fmt[] = "reqid %d spi 0x%x remote ip 0x%x\n";
-	int ret;
-
-	ret = bpf_skb_get_xfrm_state(skb, 0, &x, sizeof(x), 0);
-	if (ret < 0)
-		return TC_ACT_OK;
-
-	bpf_trace_printk(fmt, sizeof(fmt), x.reqid, bpf_ntohl(x.spi),
-			 bpf_ntohl(x.remote_ipv4));
-	return TC_ACT_OK;
-}
-
-char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/test_tunnel_bpf.sh b/samples/bpf/test_tunnel_bpf.sh
deleted file mode 100755
index 9c534dc07b36..000000000000
--- a/samples/bpf/test_tunnel_bpf.sh
+++ /dev/null
@@ -1,390 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-# In Namespace 0 (at_ns0) using native tunnel
-# Overlay IP: 10.1.1.100
-# local 192.16.1.100 remote 192.16.1.200
-# veth0 IP: 172.16.1.100, tunnel dev <type>00
-
-# Out of Namespace using BPF set/get on lwtunnel
-# Overlay IP: 10.1.1.200
-# local 172.16.1.200 remote 172.16.1.100
-# veth1 IP: 172.16.1.200, tunnel dev <type>11
-
-function config_device {
-	ip netns add at_ns0
-	ip link add veth0 type veth peer name veth1
-	ip link set veth0 netns at_ns0
-	ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0
-	ip netns exec at_ns0 ip link set dev veth0 up
-	ip link set dev veth1 up mtu 1500
-	ip addr add dev veth1 172.16.1.200/24
-}
-
-function add_gre_tunnel {
-	# in namespace
-	ip netns exec at_ns0 \
-        ip link add dev $DEV_NS type $TYPE seq key 2 \
-		local 172.16.1.100 remote 172.16.1.200
-	ip netns exec at_ns0 ip link set dev $DEV_NS up
-	ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
-
-	# out of namespace
-	ip link add dev $DEV type $TYPE key 2 external
-	ip link set dev $DEV up
-	ip addr add dev $DEV 10.1.1.200/24
-}
-
-function add_ip6gretap_tunnel {
-
-	# assign ipv6 address
-	ip netns exec at_ns0 ip addr add ::11/96 dev veth0
-	ip netns exec at_ns0 ip link set dev veth0 up
-	ip addr add dev veth1 ::22/96
-	ip link set dev veth1 up
-
-	# in namespace
-	ip netns exec at_ns0 \
-		ip link add dev $DEV_NS type $TYPE seq flowlabel 0xbcdef key 2 \
-		local ::11 remote ::22
-
-	ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
-	ip netns exec at_ns0 ip addr add dev $DEV_NS fc80::100/96
-	ip netns exec at_ns0 ip link set dev $DEV_NS up
-
-	# out of namespace
-	ip link add dev $DEV type $TYPE external
-	ip addr add dev $DEV 10.1.1.200/24
-	ip addr add dev $DEV fc80::200/24
-	ip link set dev $DEV up
-}
-
-function add_erspan_tunnel {
-	# in namespace
-	if [ "$1" == "v1" ]; then
-		ip netns exec at_ns0 \
-		ip link add dev $DEV_NS type $TYPE seq key 2 \
-		local 172.16.1.100 remote 172.16.1.200 \
-		erspan_ver 1 erspan 123
-	else
-		ip netns exec at_ns0 \
-		ip link add dev $DEV_NS type $TYPE seq key 2 \
-		local 172.16.1.100 remote 172.16.1.200 \
-		erspan_ver 2 erspan_dir egress erspan_hwid 3
-	fi
-	ip netns exec at_ns0 ip link set dev $DEV_NS up
-	ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
-
-	# out of namespace
-	ip link add dev $DEV type $TYPE external
-	ip link set dev $DEV up
-	ip addr add dev $DEV 10.1.1.200/24
-}
-
-function add_ip6erspan_tunnel {
-
-	# assign ipv6 address
-	ip netns exec at_ns0 ip addr add ::11/96 dev veth0
-	ip netns exec at_ns0 ip link set dev veth0 up
-	ip addr add dev veth1 ::22/96
-	ip link set dev veth1 up
-
-	# in namespace
-	if [ "$1" == "v1" ]; then
-		ip netns exec at_ns0 \
-		ip link add dev $DEV_NS type $TYPE seq key 2 \
-		local ::11 remote ::22 \
-		erspan_ver 1 erspan 123
-	else
-		ip netns exec at_ns0 \
-		ip link add dev $DEV_NS type $TYPE seq key 2 \
-		local ::11 remote ::22 \
-		erspan_ver 2 erspan_dir egress erspan_hwid 7
-	fi
-	ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
-	ip netns exec at_ns0 ip link set dev $DEV_NS up
-
-	# out of namespace
-	ip link add dev $DEV type $TYPE external
-	ip addr add dev $DEV 10.1.1.200/24
-	ip link set dev $DEV up
-}
-
-function add_vxlan_tunnel {
-	# Set static ARP entry here because iptables set-mark works
-	# on L3 packet, as a result not applying to ARP packets,
-	# causing errors at get_tunnel_{key/opt}.
-
-	# in namespace
-	ip netns exec at_ns0 \
-		ip link add dev $DEV_NS type $TYPE id 2 dstport 4789 gbp remote 172.16.1.200
-	ip netns exec at_ns0 ip link set dev $DEV_NS address 52:54:00:d9:01:00 up
-	ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
-	ip netns exec at_ns0 arp -s 10.1.1.200 52:54:00:d9:02:00
-	ip netns exec at_ns0 iptables -A OUTPUT -j MARK --set-mark 0x800FF
-
-	# out of namespace
-	ip link add dev $DEV type $TYPE external gbp dstport 4789
-	ip link set dev $DEV address 52:54:00:d9:02:00 up
-	ip addr add dev $DEV 10.1.1.200/24
-	arp -s 10.1.1.100 52:54:00:d9:01:00
-}
-
-function add_geneve_tunnel {
-	# in namespace
-	ip netns exec at_ns0 \
-		ip link add dev $DEV_NS type $TYPE id 2 dstport 6081 remote 172.16.1.200
-	ip netns exec at_ns0 ip link set dev $DEV_NS up
-	ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
-
-	# out of namespace
-	ip link add dev $DEV type $TYPE dstport 6081 external
-	ip link set dev $DEV up
-	ip addr add dev $DEV 10.1.1.200/24
-}
-
-function add_ipip_tunnel {
-	# in namespace
-	ip netns exec at_ns0 \
-		ip link add dev $DEV_NS type $TYPE local 172.16.1.100 remote 172.16.1.200
-	ip netns exec at_ns0 ip link set dev $DEV_NS up
-	ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
-
-	# out of namespace
-	ip link add dev $DEV type $TYPE external
-	ip link set dev $DEV up
-	ip addr add dev $DEV 10.1.1.200/24
-}
-
-function setup_xfrm_tunnel {
-	auth=0x$(printf '1%.0s' {1..40})
-	enc=0x$(printf '2%.0s' {1..32})
-	spi_in_to_out=0x1
-	spi_out_to_in=0x2
-	# in namespace
-	# in -> out
-	ip netns exec at_ns0 \
-		ip xfrm state add src 172.16.1.100 dst 172.16.1.200 proto esp \
-			spi $spi_in_to_out reqid 1 mode tunnel \
-			auth-trunc 'hmac(sha1)' $auth 96 enc 'cbc(aes)' $enc
-	ip netns exec at_ns0 \
-		ip xfrm policy add src 10.1.1.100/32 dst 10.1.1.200/32 dir out \
-		tmpl src 172.16.1.100 dst 172.16.1.200 proto esp reqid 1 \
-		mode tunnel
-	# out -> in
-	ip netns exec at_ns0 \
-		ip xfrm state add src 172.16.1.200 dst 172.16.1.100 proto esp \
-			spi $spi_out_to_in reqid 2 mode tunnel \
-			auth-trunc 'hmac(sha1)' $auth 96 enc 'cbc(aes)' $enc
-	ip netns exec at_ns0 \
-		ip xfrm policy add src 10.1.1.200/32 dst 10.1.1.100/32 dir in \
-		tmpl src 172.16.1.200 dst 172.16.1.100 proto esp reqid 2 \
-		mode tunnel
-	# address & route
-	ip netns exec at_ns0 \
-		ip addr add dev veth0 10.1.1.100/32
-	ip netns exec at_ns0 \
-		ip route add 10.1.1.200 dev veth0 via 172.16.1.200 \
-			src 10.1.1.100
-
-	# out of namespace
-	# in -> out
-	ip xfrm state add src 172.16.1.100 dst 172.16.1.200 proto esp \
-		spi $spi_in_to_out reqid 1 mode tunnel \
-		auth-trunc 'hmac(sha1)' $auth 96  enc 'cbc(aes)' $enc
-	ip xfrm policy add src 10.1.1.100/32 dst 10.1.1.200/32 dir in \
-		tmpl src 172.16.1.100 dst 172.16.1.200 proto esp reqid 1 \
-		mode tunnel
-	# out -> in
-	ip xfrm state add src 172.16.1.200 dst 172.16.1.100 proto esp \
-		spi $spi_out_to_in reqid 2 mode tunnel \
-		auth-trunc 'hmac(sha1)' $auth 96  enc 'cbc(aes)' $enc
-	ip xfrm policy add src 10.1.1.200/32 dst 10.1.1.100/32 dir out \
-		tmpl src 172.16.1.200 dst 172.16.1.100 proto esp reqid 2 \
-		mode tunnel
-	# address & route
-	ip addr add dev veth1 10.1.1.200/32
-	ip route add 10.1.1.100 dev veth1 via 172.16.1.100 src 10.1.1.200
-}
-
-function attach_bpf {
-	DEV=$1
-	SET_TUNNEL=$2
-	GET_TUNNEL=$3
-	tc qdisc add dev $DEV clsact
-	tc filter add dev $DEV egress bpf da obj tcbpf2_kern.o sec $SET_TUNNEL
-	tc filter add dev $DEV ingress bpf da obj tcbpf2_kern.o sec $GET_TUNNEL
-}
-
-function test_gre {
-	TYPE=gretap
-	DEV_NS=gretap00
-	DEV=gretap11
-	config_device
-	add_gre_tunnel
-	attach_bpf $DEV gre_set_tunnel gre_get_tunnel
-	ping -c 1 10.1.1.100
-	ip netns exec at_ns0 ping -c 1 10.1.1.200
-	cleanup
-}
-
-function test_ip6gre {
-	TYPE=ip6gre
-	DEV_NS=ip6gre00
-	DEV=ip6gre11
-	config_device
-	# reuse the ip6gretap function
-	add_ip6gretap_tunnel
-	attach_bpf $DEV ip6gretap_set_tunnel ip6gretap_get_tunnel
-	# underlay
-	ping6 -c 4 ::11
-	# overlay: ipv4 over ipv6
-	ip netns exec at_ns0 ping -c 1 10.1.1.200
-	ping -c 1 10.1.1.100
-	# overlay: ipv6 over ipv6
-	ip netns exec at_ns0 ping6 -c 1 fc80::200
-	cleanup
-}
-
-function test_ip6gretap {
-	TYPE=ip6gretap
-	DEV_NS=ip6gretap00
-	DEV=ip6gretap11
-	config_device
-	add_ip6gretap_tunnel
-	attach_bpf $DEV ip6gretap_set_tunnel ip6gretap_get_tunnel
-	# underlay
-	ping6 -c 4 ::11
-	# overlay: ipv4 over ipv6
-	ip netns exec at_ns0 ping -i .2 -c 1 10.1.1.200
-	ping -c 1 10.1.1.100
-	# overlay: ipv6 over ipv6
-	ip netns exec at_ns0 ping6 -c 1 fc80::200
-	cleanup
-}
-
-function test_erspan {
-	TYPE=erspan
-	DEV_NS=erspan00
-	DEV=erspan11
-	config_device
-	add_erspan_tunnel $1
-	attach_bpf $DEV erspan_set_tunnel erspan_get_tunnel
-	ping -c 1 10.1.1.100
-	ip netns exec at_ns0 ping -c 1 10.1.1.200
-	cleanup
-}
-
-function test_ip6erspan {
-	TYPE=ip6erspan
-	DEV_NS=ip6erspan00
-	DEV=ip6erspan11
-	config_device
-	add_ip6erspan_tunnel $1
-	attach_bpf $DEV ip4ip6erspan_set_tunnel ip4ip6erspan_get_tunnel
-	ping6 -c 3 ::11
-	ip netns exec at_ns0 ping -c 1 10.1.1.200
-	cleanup
-}
-
-function test_vxlan {
-	TYPE=vxlan
-	DEV_NS=vxlan00
-	DEV=vxlan11
-	config_device
-	add_vxlan_tunnel
-	attach_bpf $DEV vxlan_set_tunnel vxlan_get_tunnel
-	ping -c 1 10.1.1.100
-	ip netns exec at_ns0 ping -c 1 10.1.1.200
-	cleanup
-}
-
-function test_geneve {
-	TYPE=geneve
-	DEV_NS=geneve00
-	DEV=geneve11
-	config_device
-	add_geneve_tunnel
-	attach_bpf $DEV geneve_set_tunnel geneve_get_tunnel
-	ping -c 1 10.1.1.100
-	ip netns exec at_ns0 ping -c 1 10.1.1.200
-	cleanup
-}
-
-function test_ipip {
-	TYPE=ipip
-	DEV_NS=ipip00
-	DEV=ipip11
-	config_device
-	tcpdump -nei veth1 &
-	cat /sys/kernel/debug/tracing/trace_pipe &
-	add_ipip_tunnel
-	ethtool -K veth1 gso off gro off rx off tx off
-	ip link set dev veth1 mtu 1500
-	attach_bpf $DEV ipip_set_tunnel ipip_get_tunnel
-	ping -c 1 10.1.1.100
-	ip netns exec at_ns0 ping -c 1 10.1.1.200
-	ip netns exec at_ns0 iperf -sD -p 5200 > /dev/null
-	sleep 0.2
-	iperf -c 10.1.1.100 -n 5k -p 5200
-	cleanup
-}
-
-function test_xfrm_tunnel {
-	config_device
-        tcpdump -nei veth1 ip &
-	output=$(mktemp)
-	cat /sys/kernel/debug/tracing/trace_pipe | tee $output &
-        setup_xfrm_tunnel
-	tc qdisc add dev veth1 clsact
-	tc filter add dev veth1 proto ip ingress bpf da obj tcbpf2_kern.o \
-		sec xfrm_get_state
-	ip netns exec at_ns0 ping -c 1 10.1.1.200
-	grep "reqid 1" $output
-	grep "spi 0x1" $output
-	grep "remote ip 0xac100164" $output
-	cleanup
-}
-
-function cleanup {
-	set +ex
-	pkill iperf
-	ip netns delete at_ns0
-	ip link del veth1
-	ip link del ipip11
-	ip link del gretap11
-	ip link del ip6gre11
-	ip link del ip6gretap11
-	ip link del vxlan11
-	ip link del geneve11
-	ip link del erspan11
-	ip link del ip6erspan11
-	ip x s flush
-	ip x p flush
-	pkill tcpdump
-	pkill cat
-	set -ex
-}
-
-trap cleanup 0 2 3 6 9
-cleanup
-echo "Testing GRE tunnel..."
-test_gre
-echo "Testing IP6GRE tunnel..."
-test_ip6gre
-echo "Testing IP6GRETAP tunnel..."
-test_ip6gretap
-echo "Testing ERSPAN tunnel..."
-test_erspan v1
-test_erspan v2
-echo "Testing IP6ERSPAN tunnel..."
-test_ip6erspan v1
-test_ip6erspan v2
-echo "Testing VXLAN tunnel..."
-test_vxlan
-echo "Testing GENEVE tunnel..."
-test_geneve
-echo "Testing IPIP tunnel..."
-test_ipip
-echo "Testing IPSec tunnel..."
-test_xfrm_tunnel
-echo "*** PASS ***"
-- 
cgit v1.2.3-59-g8ed1b


From 3aa1409a7b160f9444945c0df1cb079df82be84e Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Mon, 23 Apr 2018 13:53:41 -0700
Subject: ipvs: initialize tbl->entries after allocation

tbl->entries is not initialized after kmalloc(), therefore
causes an uninit-value warning in ip_vs_lblc_check_expire()
as reported by syzbot.

Reported-by: <syzbot+3dfdea57819073a04f21@syzkaller.appspotmail.com>
Cc: Simon Horman <horms@verge.net.au>
Cc: Julian Anastasov <ja@ssi.bg>
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipvs/ip_vs_lblcr.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 9b6a6c9e9cfa..542c4949937a 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -535,6 +535,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
 	tbl->counter = 1;
 	tbl->dead = false;
 	tbl->svc = svc;
+	atomic_set(&tbl->entries, 0);
 
 	/*
 	 *    Hook periodic timer for garbage collection
-- 
cgit v1.2.3-59-g8ed1b


From 8b2ebb6cf064247d60cccbf1750610ac9bb2e672 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Mon, 23 Apr 2018 14:04:45 -0700
Subject: ipvs: initialize tbl->entries in ip_vs_lblc_init_svc()

Similarly, tbl->entries is not initialized after kmalloc(),
therefore causes an uninit-value warning in ip_vs_lblc_check_expire(),
as reported by syzbot.

Reported-by: <syzbot+3e9695f147fb529aa9bc@syzkaller.appspotmail.com>
Cc: Simon Horman <horms@verge.net.au>
Cc: Julian Anastasov <ja@ssi.bg>
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipvs/ip_vs_lblc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 08147fc6400c..b9f375e6dc93 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -372,6 +372,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
 	tbl->counter = 1;
 	tbl->dead = false;
 	tbl->svc = svc;
+	atomic_set(&tbl->entries, 0);
 
 	/*
 	 *    Hook periodic timer for garbage collection
-- 
cgit v1.2.3-59-g8ed1b


From 56a092c895054a6b423781d788339775bd2bda10 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Wed, 25 Apr 2018 18:16:52 +0100
Subject: bpf: add script and prepare bpf.h for new helpers documentation

Remove previous "overview" of eBPF helpers from user bpf.h header.
Replace it by a comment explaining how to process the new documentation
(to come in following patches) with a Python script to produce RST, then
man page documentation.

Also add the aforementioned Python script under scripts/. It is used to
process include/uapi/linux/bpf.h and to extract helper descriptions, to
turn it into a RST document that can further be processed with rst2man
to produce a man page. The script takes one "--filename <path/to/file>"
option. If the script is launched from scripts/ in the kernel root
directory, it should be able to find the location of the header to
parse, and "--filename <path/to/file>" is then optional. If it cannot
find the file, then the option becomes mandatory. RST-formatted
documentation is printed to standard output.

Typical workflow for producing the final man page would be:

    $ ./scripts/bpf_helpers_doc.py \
            --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst
    $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7
    $ man /tmp/bpf-helpers.7

Note that the tool kernel-doc cannot be used to document eBPF helpers,
whose signatures are not available directly in the header files
(pre-processor directives are used to produce them at the beginning of
the compilation process).

v4:
- Also remove overviews for newly added bpf_xdp_adjust_tail() and
  bpf_skb_get_xfrm_state().
- Remove vague statement about what helpers are restricted to GPL
  programs in "LICENSE" section for man page footer.
- Replace license boilerplate with SPDX tag for Python script.

v3:
- Change license for man page.
- Remove "for safety reasons" from man page header text.
- Change "packets metadata" to "packets" in man page header text.
- Move and fix comment on helpers introducing no overhead.
- Remove "NOTES" section from man page footer.
- Add "LICENSE" section to man page footer.
- Edit description of file include/uapi/linux/bpf.h in man page footer.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h   | 422 ++-------------------------------------------
 scripts/bpf_helpers_doc.py | 421 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 437 insertions(+), 406 deletions(-)
 create mode 100755 scripts/bpf_helpers_doc.py

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index da8801860c7d..dd43fc98c982 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -377,412 +377,22 @@ union bpf_attr {
 	};
 } __attribute__((aligned(8)));
 
-/* BPF helper function descriptions:
- *
- * void *bpf_map_lookup_elem(&map, &key)
- *     Return: Map value or NULL
- *
- * int bpf_map_update_elem(&map, &key, &value, flags)
- *     Return: 0 on success or negative error
- *
- * int bpf_map_delete_elem(&map, &key)
- *     Return: 0 on success or negative error
- *
- * int bpf_probe_read(void *dst, int size, void *src)
- *     Return: 0 on success or negative error
- *
- * u64 bpf_ktime_get_ns(void)
- *     Return: current ktime
- *
- * int bpf_trace_printk(const char *fmt, int fmt_size, ...)
- *     Return: length of buffer written or negative error
- *
- * u32 bpf_prandom_u32(void)
- *     Return: random value
- *
- * u32 bpf_raw_smp_processor_id(void)
- *     Return: SMP processor ID
- *
- * int bpf_skb_store_bytes(skb, offset, from, len, flags)
- *     store bytes into packet
- *     @skb: pointer to skb
- *     @offset: offset within packet from skb->mac_header
- *     @from: pointer where to copy bytes from
- *     @len: number of bytes to store into packet
- *     @flags: bit 0 - if true, recompute skb->csum
- *             other bits - reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_l3_csum_replace(skb, offset, from, to, flags)
- *     recompute IP checksum
- *     @skb: pointer to skb
- *     @offset: offset within packet where IP checksum is located
- *     @from: old value of header field
- *     @to: new value of header field
- *     @flags: bits 0-3 - size of header field
- *             other bits - reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_l4_csum_replace(skb, offset, from, to, flags)
- *     recompute TCP/UDP checksum
- *     @skb: pointer to skb
- *     @offset: offset within packet where TCP/UDP checksum is located
- *     @from: old value of header field
- *     @to: new value of header field
- *     @flags: bits 0-3 - size of header field
- *             bit 4 - is pseudo header
- *             other bits - reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_tail_call(ctx, prog_array_map, index)
- *     jump into another BPF program
- *     @ctx: context pointer passed to next program
- *     @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY
- *     @index: 32-bit index inside array that selects specific program to run
- *     Return: 0 on success or negative error
- *
- * int bpf_clone_redirect(skb, ifindex, flags)
- *     redirect to another netdev
- *     @skb: pointer to skb
- *     @ifindex: ifindex of the net device
- *     @flags: bit 0 - if set, redirect to ingress instead of egress
- *             other bits - reserved
- *     Return: 0 on success or negative error
- *
- * u64 bpf_get_current_pid_tgid(void)
- *     Return: current->tgid << 32 | current->pid
- *
- * u64 bpf_get_current_uid_gid(void)
- *     Return: current_gid << 32 | current_uid
- *
- * int bpf_get_current_comm(char *buf, int size_of_buf)
- *     stores current->comm into buf
- *     Return: 0 on success or negative error
- *
- * u32 bpf_get_cgroup_classid(skb)
- *     retrieve a proc's classid
- *     @skb: pointer to skb
- *     Return: classid if != 0
- *
- * int bpf_skb_vlan_push(skb, vlan_proto, vlan_tci)
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_vlan_pop(skb)
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_get_tunnel_key(skb, key, size, flags)
- * int bpf_skb_set_tunnel_key(skb, key, size, flags)
- *     retrieve or populate tunnel metadata
- *     @skb: pointer to skb
- *     @key: pointer to 'struct bpf_tunnel_key'
- *     @size: size of 'struct bpf_tunnel_key'
- *     @flags: room for future extensions
- *     Return: 0 on success or negative error
- *
- * u64 bpf_perf_event_read(map, flags)
- *     read perf event counter value
- *     @map: pointer to perf_event_array map
- *     @flags: index of event in the map or bitmask flags
- *     Return: value of perf event counter read or error code
- *
- * int bpf_redirect(ifindex, flags)
- *     redirect to another netdev
- *     @ifindex: ifindex of the net device
- *     @flags:
- *	  cls_bpf:
- *          bit 0 - if set, redirect to ingress instead of egress
- *          other bits - reserved
- *	  xdp_bpf:
- *	    all bits - reserved
- *     Return: cls_bpf: TC_ACT_REDIRECT on success or TC_ACT_SHOT on error
- *	       xdp_bfp: XDP_REDIRECT on success or XDP_ABORT on error
- * int bpf_redirect_map(map, key, flags)
- *     redirect to endpoint in map
- *     @map: pointer to dev map
- *     @key: index in map to lookup
- *     @flags: --
- *     Return: XDP_REDIRECT on success or XDP_ABORT on error
- *
- * u32 bpf_get_route_realm(skb)
- *     retrieve a dst's tclassid
- *     @skb: pointer to skb
- *     Return: realm if != 0
- *
- * int bpf_perf_event_output(ctx, map, flags, data, size)
- *     output perf raw sample
- *     @ctx: struct pt_regs*
- *     @map: pointer to perf_event_array map
- *     @flags: index of event in the map or bitmask flags
- *     @data: data on stack to be output as raw data
- *     @size: size of data
- *     Return: 0 on success or negative error
- *
- * int bpf_get_stackid(ctx, map, flags)
- *     walk user or kernel stack and return id
- *     @ctx: struct pt_regs*
- *     @map: pointer to stack_trace map
- *     @flags: bits 0-7 - numer of stack frames to skip
- *             bit 8 - collect user stack instead of kernel
- *             bit 9 - compare stacks by hash only
- *             bit 10 - if two different stacks hash into the same stackid
- *                      discard old
- *             other bits - reserved
- *     Return: >= 0 stackid on success or negative error
- *
- * s64 bpf_csum_diff(from, from_size, to, to_size, seed)
- *     calculate csum diff
- *     @from: raw from buffer
- *     @from_size: length of from buffer
- *     @to: raw to buffer
- *     @to_size: length of to buffer
- *     @seed: optional seed
- *     Return: csum result or negative error code
- *
- * int bpf_skb_get_tunnel_opt(skb, opt, size)
- *     retrieve tunnel options metadata
- *     @skb: pointer to skb
- *     @opt: pointer to raw tunnel option data
- *     @size: size of @opt
- *     Return: option size
- *
- * int bpf_skb_set_tunnel_opt(skb, opt, size)
- *     populate tunnel options metadata
- *     @skb: pointer to skb
- *     @opt: pointer to raw tunnel option data
- *     @size: size of @opt
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_change_proto(skb, proto, flags)
- *     Change protocol of the skb. Currently supported is v4 -> v6,
- *     v6 -> v4 transitions. The helper will also resize the skb. eBPF
- *     program is expected to fill the new headers via skb_store_bytes
- *     and lX_csum_replace.
- *     @skb: pointer to skb
- *     @proto: new skb->protocol type
- *     @flags: reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_change_type(skb, type)
- *     Change packet type of skb.
- *     @skb: pointer to skb
- *     @type: new skb->pkt_type type
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_under_cgroup(skb, map, index)
- *     Check cgroup2 membership of skb
- *     @skb: pointer to skb
- *     @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
- *     @index: index of the cgroup in the bpf_map
- *     Return:
- *       == 0 skb failed the cgroup2 descendant test
- *       == 1 skb succeeded the cgroup2 descendant test
- *        < 0 error
- *
- * u32 bpf_get_hash_recalc(skb)
- *     Retrieve and possibly recalculate skb->hash.
- *     @skb: pointer to skb
- *     Return: hash
- *
- * u64 bpf_get_current_task(void)
- *     Returns current task_struct
- *     Return: current
- *
- * int bpf_probe_write_user(void *dst, void *src, int len)
- *     safely attempt to write to a location
- *     @dst: destination address in userspace
- *     @src: source address on stack
- *     @len: number of bytes to copy
- *     Return: 0 on success or negative error
- *
- * int bpf_current_task_under_cgroup(map, index)
- *     Check cgroup2 membership of current task
- *     @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
- *     @index: index of the cgroup in the bpf_map
- *     Return:
- *       == 0 current failed the cgroup2 descendant test
- *       == 1 current succeeded the cgroup2 descendant test
- *        < 0 error
- *
- * int bpf_skb_change_tail(skb, len, flags)
- *     The helper will resize the skb to the given new size, to be used f.e.
- *     with control messages.
- *     @skb: pointer to skb
- *     @len: new skb length
- *     @flags: reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_pull_data(skb, len)
- *     The helper will pull in non-linear data in case the skb is non-linear
- *     and not all of len are part of the linear section. Only needed for
- *     read/write with direct packet access.
- *     @skb: pointer to skb
- *     @len: len to make read/writeable
- *     Return: 0 on success or negative error
- *
- * s64 bpf_csum_update(skb, csum)
- *     Adds csum into skb->csum in case of CHECKSUM_COMPLETE.
- *     @skb: pointer to skb
- *     @csum: csum to add
- *     Return: csum on success or negative error
- *
- * void bpf_set_hash_invalid(skb)
- *     Invalidate current skb->hash.
- *     @skb: pointer to skb
- *
- * int bpf_get_numa_node_id()
- *     Return: Id of current NUMA node.
- *
- * int bpf_skb_change_head()
- *     Grows headroom of skb and adjusts MAC header offset accordingly.
- *     Will extends/reallocae as required automatically.
- *     May change skb data pointer and will thus invalidate any check
- *     performed for direct packet access.
- *     @skb: pointer to skb
- *     @len: length of header to be pushed in front
- *     @flags: Flags (unused for now)
- *     Return: 0 on success or negative error
- *
- * int bpf_xdp_adjust_head(xdp_md, delta)
- *     Adjust the xdp_md.data by delta
- *     @xdp_md: pointer to xdp_md
- *     @delta: An positive/negative integer to be added to xdp_md.data
- *     Return: 0 on success or negative on error
- *
- * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr)
- *     Copy a NUL terminated string from unsafe address. In case the string
- *     length is smaller than size, the target is not padded with further NUL
- *     bytes. In case the string length is larger than size, just count-1
- *     bytes are copied and the last byte is set to NUL.
- *     @dst: destination address
- *     @size: maximum number of bytes to copy, including the trailing NUL
- *     @unsafe_ptr: unsafe address
- *     Return:
- *       > 0 length of the string including the trailing NUL on success
- *       < 0 error
- *
- * u64 bpf_get_socket_cookie(skb)
- *     Get the cookie for the socket stored inside sk_buff.
- *     @skb: pointer to skb
- *     Return: 8 Bytes non-decreasing number on success or 0 if the socket
- *     field is missing inside sk_buff
- *
- * u32 bpf_get_socket_uid(skb)
- *     Get the owner uid of the socket stored inside sk_buff.
- *     @skb: pointer to skb
- *     Return: uid of the socket owner on success or overflowuid if failed.
- *
- * u32 bpf_set_hash(skb, hash)
- *     Set full skb->hash.
- *     @skb: pointer to skb
- *     @hash: hash to set
- *
- * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen)
- *     Calls setsockopt. Not all opts are available, only those with
- *     integer optvals plus TCP_CONGESTION.
- *     Supported levels: SOL_SOCKET and IPPROTO_TCP
- *     @bpf_socket: pointer to bpf_socket
- *     @level: SOL_SOCKET or IPPROTO_TCP
- *     @optname: option name
- *     @optval: pointer to option value
- *     @optlen: length of optval in bytes
- *     Return: 0 or negative error
- *
- * int bpf_getsockopt(bpf_socket, level, optname, optval, optlen)
- *     Calls getsockopt. Not all opts are available.
- *     Supported levels: IPPROTO_TCP
- *     @bpf_socket: pointer to bpf_socket
- *     @level: IPPROTO_TCP
- *     @optname: option name
- *     @optval: pointer to option value
- *     @optlen: length of optval in bytes
- *     Return: 0 or negative error
- *
- * int bpf_sock_ops_cb_flags_set(bpf_sock_ops, flags)
- *     Set callback flags for sock_ops
- *     @bpf_sock_ops: pointer to bpf_sock_ops_kern struct
- *     @flags: flags value
- *     Return: 0 for no error
- *             -EINVAL if there is no full tcp socket
- *             bits in flags that are not supported by current kernel
- *
- * int bpf_skb_adjust_room(skb, len_diff, mode, flags)
- *     Grow or shrink room in sk_buff.
- *     @skb: pointer to skb
- *     @len_diff: (signed) amount of room to grow/shrink
- *     @mode: operation mode (enum bpf_adj_room_mode)
- *     @flags: reserved for future use
- *     Return: 0 on success or negative error code
- *
- * int bpf_sk_redirect_map(map, key, flags)
- *     Redirect skb to a sock in map using key as a lookup key for the
- *     sock in map.
- *     @map: pointer to sockmap
- *     @key: key to lookup sock in map
- *     @flags: reserved for future use
- *     Return: SK_PASS
- *
- * int bpf_sock_map_update(skops, map, key, flags)
- *	@skops: pointer to bpf_sock_ops
- *	@map: pointer to sockmap to update
- *	@key: key to insert/update sock in map
- *	@flags: same flags as map update elem
- *
- * int bpf_xdp_adjust_meta(xdp_md, delta)
- *     Adjust the xdp_md.data_meta by delta
- *     @xdp_md: pointer to xdp_md
- *     @delta: An positive/negative integer to be added to xdp_md.data_meta
- *     Return: 0 on success or negative on error
- *
- * int bpf_perf_event_read_value(map, flags, buf, buf_size)
- *     read perf event counter value and perf event enabled/running time
- *     @map: pointer to perf_event_array map
- *     @flags: index of event in the map or bitmask flags
- *     @buf: buf to fill
- *     @buf_size: size of the buf
- *     Return: 0 on success or negative error code
- *
- * int bpf_perf_prog_read_value(ctx, buf, buf_size)
- *     read perf prog attached perf event counter and enabled/running time
- *     @ctx: pointer to ctx
- *     @buf: buf to fill
- *     @buf_size: size of the buf
- *     Return : 0 on success or negative error code
- *
- * int bpf_override_return(pt_regs, rc)
- *	@pt_regs: pointer to struct pt_regs
- *	@rc: the return value to set
- *
- * int bpf_msg_redirect_map(map, key, flags)
- *     Redirect msg to a sock in map using key as a lookup key for the
- *     sock in map.
- *     @map: pointer to sockmap
- *     @key: key to lookup sock in map
- *     @flags: reserved for future use
- *     Return: SK_PASS
- *
- * int bpf_bind(ctx, addr, addr_len)
- *     Bind socket to address. Only binding to IP is supported, no port can be
- *     set in addr.
- *     @ctx: pointer to context of type bpf_sock_addr
- *     @addr: pointer to struct sockaddr to bind socket to
- *     @addr_len: length of sockaddr structure
- *     Return: 0 on success or negative error code
- *
- * int bpf_xdp_adjust_tail(xdp_md, delta)
- *     Adjust the xdp_md.data_end by delta. Only shrinking of packet's
- *     size is supported.
- *     @xdp_md: pointer to xdp_md
- *     @delta: A negative integer to be added to xdp_md.data_end
- *     Return: 0 on success or negative on error
- *
- * int bpf_skb_get_xfrm_state(skb, index, xfrm_state, size, flags)
- *     retrieve XFRM state
- *     @skb: pointer to skb
- *     @index: index of the xfrm state in the secpath
- *     @key: pointer to 'struct bpf_xfrm_state'
- *     @size: size of 'struct bpf_xfrm_state'
- *     @flags: room for future extensions
- *     Return: 0 on success or negative error
+/* The description below is an attempt at providing documentation to eBPF
+ * developers about the multiple available eBPF helper functions. It can be
+ * parsed and used to produce a manual page. The workflow is the following,
+ * and requires the rst2man utility:
+ *
+ *     $ ./scripts/bpf_helpers_doc.py \
+ *             --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst
+ *     $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7
+ *     $ man /tmp/bpf-helpers.7
+ *
+ * Note that in order to produce this external documentation, some RST
+ * formatting is used in the descriptions to get "bold" and "italics" in
+ * manual pages. Also note that the few trailing white spaces are
+ * intentional, removing them would break paragraphs for rst2man.
+ *
+ * Start of BPF helper function descriptions:
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
new file mode 100755
index 000000000000..30ba0fee36e4
--- /dev/null
+++ b/scripts/bpf_helpers_doc.py
@@ -0,0 +1,421 @@
+#!/usr/bin/python3
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Copyright (C) 2018 Netronome Systems, Inc.
+
+# In case user attempts to run with Python 2.
+from __future__ import print_function
+
+import argparse
+import re
+import sys, os
+
+class NoHelperFound(BaseException):
+    pass
+
+class ParsingError(BaseException):
+    def __init__(self, line='<line not provided>', reader=None):
+        if reader:
+            BaseException.__init__(self,
+                                   'Error at file offset %d, parsing line: %s' %
+                                   (reader.tell(), line))
+        else:
+            BaseException.__init__(self, 'Error parsing line: %s' % line)
+
+class Helper(object):
+    """
+    An object representing the description of an eBPF helper function.
+    @proto: function prototype of the helper function
+    @desc: textual description of the helper function
+    @ret: description of the return value of the helper function
+    """
+    def __init__(self, proto='', desc='', ret=''):
+        self.proto = proto
+        self.desc = desc
+        self.ret = ret
+
+    def proto_break_down(self):
+        """
+        Break down helper function protocol into smaller chunks: return type,
+        name, distincts arguments.
+        """
+        arg_re = re.compile('^((const )?(struct )?(\w+|...))( (\**)(\w+))?$')
+        res = {}
+        proto_re = re.compile('^(.+) (\**)(\w+)\(((([^,]+)(, )?){1,5})\)$')
+
+        capture = proto_re.match(self.proto)
+        res['ret_type'] = capture.group(1)
+        res['ret_star'] = capture.group(2)
+        res['name']     = capture.group(3)
+        res['args'] = []
+
+        args    = capture.group(4).split(', ')
+        for a in args:
+            capture = arg_re.match(a)
+            res['args'].append({
+                'type' : capture.group(1),
+                'star' : capture.group(6),
+                'name' : capture.group(7)
+            })
+
+        return res
+
+class HeaderParser(object):
+    """
+    An object used to parse a file in order to extract the documentation of a
+    list of eBPF helper functions. All the helpers that can be retrieved are
+    stored as Helper object, in the self.helpers() array.
+    @filename: name of file to parse, usually include/uapi/linux/bpf.h in the
+               kernel tree
+    """
+    def __init__(self, filename):
+        self.reader = open(filename, 'r')
+        self.line = ''
+        self.helpers = []
+
+    def parse_helper(self):
+        proto    = self.parse_proto()
+        desc     = self.parse_desc()
+        ret      = self.parse_ret()
+        return Helper(proto=proto, desc=desc, ret=ret)
+
+    def parse_proto(self):
+        # Argument can be of shape:
+        #   - "void"
+        #   - "type  name"
+        #   - "type *name"
+        #   - Same as above, with "const" and/or "struct" in front of type
+        #   - "..." (undefined number of arguments, for bpf_trace_printk())
+        # There is at least one term ("void"), and at most five arguments.
+        p = re.compile('^ \* ((.+) \**\w+\((((const )?(struct )?(\w+|\.\.\.)( \**\w+)?)(, )?){1,5}\))$')
+        capture = p.match(self.line)
+        if not capture:
+            raise NoHelperFound
+        self.line = self.reader.readline()
+        return capture.group(1)
+
+    def parse_desc(self):
+        p = re.compile('^ \* \tDescription$')
+        capture = p.match(self.line)
+        if not capture:
+            # Helper can have empty description and we might be parsing another
+            # attribute: return but do not consume.
+            return ''
+        # Description can be several lines, some of them possibly empty, and it
+        # stops when another subsection title is met.
+        desc = ''
+        while True:
+            self.line = self.reader.readline()
+            if self.line == ' *\n':
+                desc += '\n'
+            else:
+                p = re.compile('^ \* \t\t(.*)')
+                capture = p.match(self.line)
+                if capture:
+                    desc += capture.group(1) + '\n'
+                else:
+                    break
+        return desc
+
+    def parse_ret(self):
+        p = re.compile('^ \* \tReturn$')
+        capture = p.match(self.line)
+        if not capture:
+            # Helper can have empty retval and we might be parsing another
+            # attribute: return but do not consume.
+            return ''
+        # Return value description can be several lines, some of them possibly
+        # empty, and it stops when another subsection title is met.
+        ret = ''
+        while True:
+            self.line = self.reader.readline()
+            if self.line == ' *\n':
+                ret += '\n'
+            else:
+                p = re.compile('^ \* \t\t(.*)')
+                capture = p.match(self.line)
+                if capture:
+                    ret += capture.group(1) + '\n'
+                else:
+                    break
+        return ret
+
+    def run(self):
+        # Advance to start of helper function descriptions.
+        offset = self.reader.read().find('* Start of BPF helper function descriptions:')
+        if offset == -1:
+            raise Exception('Could not find start of eBPF helper descriptions list')
+        self.reader.seek(offset)
+        self.reader.readline()
+        self.reader.readline()
+        self.line = self.reader.readline()
+
+        while True:
+            try:
+                helper = self.parse_helper()
+                self.helpers.append(helper)
+            except NoHelperFound:
+                break
+
+        self.reader.close()
+        print('Parsed description of %d helper function(s)' % len(self.helpers),
+              file=sys.stderr)
+
+###############################################################################
+
+class Printer(object):
+    """
+    A generic class for printers. Printers should be created with an array of
+    Helper objects, and implement a way to print them in the desired fashion.
+    @helpers: array of Helper objects to print to standard output
+    """
+    def __init__(self, helpers):
+        self.helpers = helpers
+
+    def print_header(self):
+        pass
+
+    def print_footer(self):
+        pass
+
+    def print_one(self, helper):
+        pass
+
+    def print_all(self):
+        self.print_header()
+        for helper in self.helpers:
+            self.print_one(helper)
+        self.print_footer()
+
+class PrinterRST(Printer):
+    """
+    A printer for dumping collected information about helpers as a ReStructured
+    Text page compatible with the rst2man program, which can be used to
+    generate a manual page for the helpers.
+    @helpers: array of Helper objects to print to standard output
+    """
+    def print_header(self):
+        header = '''\
+.. Copyright (C) All BPF authors and contributors from 2014 to present.
+.. See git log include/uapi/linux/bpf.h in kernel tree for details.
+.. 
+.. %%%LICENSE_START(VERBATIM)
+.. Permission is granted to make and distribute verbatim copies of this
+.. manual provided the copyright notice and this permission notice are
+.. preserved on all copies.
+.. 
+.. Permission is granted to copy and distribute modified versions of this
+.. manual under the conditions for verbatim copying, provided that the
+.. entire resulting derived work is distributed under the terms of a
+.. permission notice identical to this one.
+.. 
+.. Since the Linux kernel and libraries are constantly changing, this
+.. manual page may be incorrect or out-of-date.  The author(s) assume no
+.. responsibility for errors or omissions, or for damages resulting from
+.. the use of the information contained herein.  The author(s) may not
+.. have taken the same level of care in the production of this manual,
+.. which is licensed free of charge, as they might when working
+.. professionally.
+.. 
+.. Formatted or processed versions of this manual, if unaccompanied by
+.. the source, must acknowledge the copyright and authors of this work.
+.. %%%LICENSE_END
+.. 
+.. Please do not edit this file. It was generated from the documentation
+.. located in file include/uapi/linux/bpf.h of the Linux kernel sources
+.. (helpers description), and from scripts/bpf_helpers_doc.py in the same
+.. repository (header and footer).
+
+===========
+BPF-HELPERS
+===========
+-------------------------------------------------------------------------------
+list of eBPF helper functions
+-------------------------------------------------------------------------------
+
+:Manual section: 7
+
+DESCRIPTION
+===========
+
+The extended Berkeley Packet Filter (eBPF) subsystem consists in programs
+written in a pseudo-assembly language, then attached to one of the several
+kernel hooks and run in reaction of specific events. This framework differs
+from the older, "classic" BPF (or "cBPF") in several aspects, one of them being
+the ability to call special functions (or "helpers") from within a program.
+These functions are restricted to a white-list of helpers defined in the
+kernel.
+
+These helpers are used by eBPF programs to interact with the system, or with
+the context in which they work. For instance, they can be used to print
+debugging messages, to get the time since the system was booted, to interact
+with eBPF maps, or to manipulate network packets. Since there are several eBPF
+program types, and that they do not run in the same context, each program type
+can only call a subset of those helpers.
+
+Due to eBPF conventions, a helper can not have more than five arguments.
+
+Internally, eBPF programs call directly into the compiled helper functions
+without requiring any foreign-function interface. As a result, calling helpers
+introduces no overhead, thus offering excellent performance.
+
+This document is an attempt to list and document the helpers available to eBPF
+developers. They are sorted by chronological order (the oldest helpers in the
+kernel at the top).
+
+HELPERS
+=======
+'''
+        print(header)
+
+    def print_footer(self):
+        footer = '''
+EXAMPLES
+========
+
+Example usage for most of the eBPF helpers listed in this manual page are
+available within the Linux kernel sources, at the following locations:
+
+* *samples/bpf/*
+* *tools/testing/selftests/bpf/*
+
+LICENSE
+=======
+
+eBPF programs can have an associated license, passed along with the bytecode
+instructions to the kernel when the programs are loaded. The format for that
+string is identical to the one in use for kernel modules (Dual licenses, such
+as "Dual BSD/GPL", may be used). Some helper functions are only accessible to
+programs that are compatible with the GNU Privacy License (GPL).
+
+In order to use such helpers, the eBPF program must be loaded with the correct
+license string passed (via **attr**) to the **bpf**\ () system call, and this
+generally translates into the C source code of the program containing a line
+similar to the following:
+
+::
+
+	char ____license[] __attribute__((section("license"), used)) = "GPL";
+
+IMPLEMENTATION
+==============
+
+This manual page is an effort to document the existing eBPF helper functions.
+But as of this writing, the BPF sub-system is under heavy development. New eBPF
+program or map types are added, along with new helper functions. Some helpers
+are occasionally made available for additional program types. So in spite of
+the efforts of the community, this page might not be up-to-date. If you want to
+check by yourself what helper functions exist in your kernel, or what types of
+programs they can support, here are some files among the kernel tree that you
+may be interested in:
+
+* *include/uapi/linux/bpf.h* is the main BPF header. It contains the full list
+  of all helper functions, as well as many other BPF definitions including most
+  of the flags, structs or constants used by the helpers.
+* *net/core/filter.c* contains the definition of most network-related helper
+  functions, and the list of program types from which they can be used.
+* *kernel/trace/bpf_trace.c* is the equivalent for most tracing program-related
+  helpers.
+* *kernel/bpf/verifier.c* contains the functions used to check that valid types
+  of eBPF maps are used with a given helper function.
+* *kernel/bpf/* directory contains other files in which additional helpers are
+  defined (for cgroups, sockmaps, etc.).
+
+Compatibility between helper functions and program types can generally be found
+in the files where helper functions are defined. Look for the **struct
+bpf_func_proto** objects and for functions returning them: these functions
+contain a list of helpers that a given program type can call. Note that the
+**default:** label of the **switch ... case** used to filter helpers can call
+other functions, themselves allowing access to additional helpers. The
+requirement for GPL license is also in those **struct bpf_func_proto**.
+
+Compatibility between helper functions and map types can be found in the
+**check_map_func_compatibility**\ () function in file *kernel/bpf/verifier.c*.
+
+Helper functions that invalidate the checks on **data** and **data_end**
+pointers for network processing are listed in function
+**bpf_helper_changes_pkt_data**\ () in file *net/core/filter.c*.
+
+SEE ALSO
+========
+
+**bpf**\ (2),
+**cgroups**\ (7),
+**ip**\ (8),
+**perf_event_open**\ (2),
+**sendmsg**\ (2),
+**socket**\ (7),
+**tc-bpf**\ (8)'''
+        print(footer)
+
+    def print_proto(self, helper):
+        """
+        Format function protocol with bold and italics markers. This makes RST
+        file less readable, but gives nice results in the manual page.
+        """
+        proto = helper.proto_break_down()
+
+        print('**%s %s%s(' % (proto['ret_type'],
+                              proto['ret_star'].replace('*', '\\*'),
+                              proto['name']),
+              end='')
+
+        comma = ''
+        for a in proto['args']:
+            one_arg = '{}{}'.format(comma, a['type'])
+            if a['name']:
+                if a['star']:
+                    one_arg += ' {}**\ '.format(a['star'].replace('*', '\\*'))
+                else:
+                    one_arg += '** '
+                one_arg += '*{}*\\ **'.format(a['name'])
+            comma = ', '
+            print(one_arg, end='')
+
+        print(')**')
+
+    def print_one(self, helper):
+        self.print_proto(helper)
+
+        if (helper.desc):
+            print('\tDescription')
+            # Do not strip all newline characters: formatted code at the end of
+            # a section must be followed by a blank line.
+            for line in re.sub('\n$', '', helper.desc, count=1).split('\n'):
+                print('{}{}'.format('\t\t' if line else '', line))
+
+        if (helper.ret):
+            print('\tReturn')
+            for line in helper.ret.rstrip().split('\n'):
+                print('{}{}'.format('\t\t' if line else '', line))
+
+        print('')
+
+###############################################################################
+
+# If script is launched from scripts/ from kernel tree and can access
+# ../include/uapi/linux/bpf.h, use it as a default name for the file to parse,
+# otherwise the --filename argument will be required from the command line.
+script = os.path.abspath(sys.argv[0])
+linuxRoot = os.path.dirname(os.path.dirname(script))
+bpfh = os.path.join(linuxRoot, 'include/uapi/linux/bpf.h')
+
+argParser = argparse.ArgumentParser(description="""
+Parse eBPF header file and generate documentation for eBPF helper functions.
+The RST-formatted output produced can be turned into a manual page with the
+rst2man utility.
+""")
+if (os.path.isfile(bpfh)):
+    argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h',
+                           default=bpfh)
+else:
+    argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h')
+args = argParser.parse_args()
+
+# Parse file.
+headerParser = HeaderParser(args.filename)
+headerParser.run()
+
+# Print formatted output to standard output.
+printer = PrinterRST(headerParser.helpers)
+printer.print_all()
-- 
cgit v1.2.3-59-g8ed1b


From ad4a522349e5c4fa6af63df376256749dc489fb8 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Wed, 25 Apr 2018 18:16:53 +0100
Subject: bpf: add documentation for eBPF helpers (01-11)

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions, all
written by Alexei:

- bpf_map_lookup_elem()
- bpf_map_update_elem()
- bpf_map_delete_elem()
- bpf_probe_read()
- bpf_ktime_get_ns()
- bpf_trace_printk()
- bpf_skb_store_bytes()
- bpf_l3_csum_replace()
- bpf_l4_csum_replace()
- bpf_tail_call()
- bpf_clone_redirect()

v4:
- bpf_map_lookup_elem(): Add "const" qualifier for key.
- bpf_map_update_elem(): Add "const" qualifier for key and value.
- bpf_map_lookup_elem(): Add "const" qualifier for key.
- bpf_skb_store_bytes(): Clarify comment about invalidated verifier
  checks.
- bpf_l3_csum_replace(): Mention L3 instead of just IP, and add a note
  about bpf_csum_diff().
- bpf_l4_csum_replace(): Mention L4 instead of just TCP/UDP, and add a
  note about bpf_csum_diff().
- bpf_tail_call(): Bring minor edits to description.
- bpf_clone_redirect(): Add a note about the relation with
  bpf_redirect(). Also clarify comment about invalidated verifier
  checks.

v3:
- bpf_map_lookup_elem(): Fix description of restrictions for flags
  related to the existence of the entry.
- bpf_trace_printk(): State that trace_pipe can be configured. Fix
  return value in case an unknown format specifier is met. Add a note on
  kernel log notice when the helper is used. Edit example.
- bpf_tail_call(): Improve comment on stack inheritance.
- bpf_clone_redirect(): Improve description of BPF_F_INGRESS flag.

Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 230 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 230 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index dd43fc98c982..96925e949a24 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -393,6 +393,236 @@ union bpf_attr {
  * intentional, removing them would break paragraphs for rst2man.
  *
  * Start of BPF helper function descriptions:
+ *
+ * void *bpf_map_lookup_elem(struct bpf_map *map, const void *key)
+ * 	Description
+ * 		Perform a lookup in *map* for an entry associated to *key*.
+ * 	Return
+ * 		Map value associated to *key*, or **NULL** if no entry was
+ * 		found.
+ *
+ * int bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags)
+ * 	Description
+ * 		Add or update the value of the entry associated to *key* in
+ * 		*map* with *value*. *flags* is one of:
+ *
+ * 		**BPF_NOEXIST**
+ * 			The entry for *key* must not exist in the map.
+ * 		**BPF_EXIST**
+ * 			The entry for *key* must already exist in the map.
+ * 		**BPF_ANY**
+ * 			No condition on the existence of the entry for *key*.
+ *
+ * 		Flag value **BPF_NOEXIST** cannot be used for maps of types
+ * 		**BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY**  (all
+ * 		elements always exist), the helper would return an error.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_delete_elem(struct bpf_map *map, const void *key)
+ * 	Description
+ * 		Delete entry with *key* from *map*.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_probe_read(void *dst, u32 size, const void *src)
+ * 	Description
+ * 		For tracing programs, safely attempt to read *size* bytes from
+ * 		address *src* and store the data in *dst*.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_ktime_get_ns(void)
+ * 	Description
+ * 		Return the time elapsed since system boot, in nanoseconds.
+ * 	Return
+ * 		Current *ktime*.
+ *
+ * int bpf_trace_printk(const char *fmt, u32 fmt_size, ...)
+ * 	Description
+ * 		This helper is a "printk()-like" facility for debugging. It
+ * 		prints a message defined by format *fmt* (of size *fmt_size*)
+ * 		to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if
+ * 		available. It can take up to three additional **u64**
+ * 		arguments (as an eBPF helpers, the total number of arguments is
+ * 		limited to five).
+ *
+ * 		Each time the helper is called, it appends a line to the trace.
+ * 		The format of the trace is customizable, and the exact output
+ * 		one will get depends on the options set in
+ * 		*\/sys/kernel/debug/tracing/trace_options* (see also the
+ * 		*README* file under the same directory). However, it usually
+ * 		defaults to something like:
+ *
+ * 		::
+ *
+ * 			telnet-470   [001] .N.. 419421.045894: 0x00000001: <formatted msg>
+ *
+ * 		In the above:
+ *
+ * 			* ``telnet`` is the name of the current task.
+ * 			* ``470`` is the PID of the current task.
+ * 			* ``001`` is the CPU number on which the task is
+ * 			  running.
+ * 			* In ``.N..``, each character refers to a set of
+ * 			  options (whether irqs are enabled, scheduling
+ * 			  options, whether hard/softirqs are running, level of
+ * 			  preempt_disabled respectively). **N** means that
+ * 			  **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED**
+ * 			  are set.
+ * 			* ``419421.045894`` is a timestamp.
+ * 			* ``0x00000001`` is a fake value used by BPF for the
+ * 			  instruction pointer register.
+ * 			* ``<formatted msg>`` is the message formatted with
+ * 			  *fmt*.
+ *
+ * 		The conversion specifiers supported by *fmt* are similar, but
+ * 		more limited than for printk(). They are **%d**, **%i**,
+ * 		**%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**,
+ * 		**%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size
+ * 		of field, padding with zeroes, etc.) is available, and the
+ * 		helper will return **-EINVAL** (but print nothing) if it
+ * 		encounters an unknown specifier.
+ *
+ * 		Also, note that **bpf_trace_printk**\ () is slow, and should
+ * 		only be used for debugging purposes. For this reason, a notice
+ * 		bloc (spanning several lines) is printed to kernel logs and
+ * 		states that the helper should not be used "for production use"
+ * 		the first time this helper is used (or more precisely, when
+ * 		**trace_printk**\ () buffers are allocated). For passing values
+ * 		to user space, perf events should be preferred.
+ * 	Return
+ * 		The number of bytes written to the buffer, or a negative error
+ * 		in case of failure.
+ *
+ * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags)
+ * 	Description
+ * 		Store *len* bytes from address *from* into the packet
+ * 		associated to *skb*, at *offset*. *flags* are a combination of
+ * 		**BPF_F_RECOMPUTE_CSUM** (automatically recompute the
+ * 		checksum for the packet after storing the bytes) and
+ * 		**BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\
+ * 		**->swhash** and *skb*\ **->l4hash** to 0).
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size)
+ * 	Description
+ * 		Recompute the layer 3 (e.g. IP) checksum for the packet
+ * 		associated to *skb*. Computation is incremental, so the helper
+ * 		must know the former value of the header field that was
+ * 		modified (*from*), the new value of this field (*to*), and the
+ * 		number of bytes (2 or 4) for this field, stored in *size*.
+ * 		Alternatively, it is possible to store the difference between
+ * 		the previous and the new values of the header field in *to*, by
+ * 		setting *from* and *size* to 0. For both methods, *offset*
+ * 		indicates the location of the IP checksum within the packet.
+ *
+ * 		This helper works in combination with **bpf_csum_diff**\ (),
+ * 		which does not update the checksum in-place, but offers more
+ * 		flexibility and can handle sizes larger than 2 or 4 for the
+ * 		checksum to update.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags)
+ * 	Description
+ * 		Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the
+ * 		packet associated to *skb*. Computation is incremental, so the
+ * 		helper must know the former value of the header field that was
+ * 		modified (*from*), the new value of this field (*to*), and the
+ * 		number of bytes (2 or 4) for this field, stored on the lowest
+ * 		four bits of *flags*. Alternatively, it is possible to store
+ * 		the difference between the previous and the new values of the
+ * 		header field in *to*, by setting *from* and the four lowest
+ * 		bits of *flags* to 0. For both methods, *offset* indicates the
+ * 		location of the IP checksum within the packet. In addition to
+ * 		the size of the field, *flags* can be added (bitwise OR) actual
+ * 		flags. With **BPF_F_MARK_MANGLED_0**, a null checksum is left
+ * 		untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and
+ * 		for updates resulting in a null checksum the value is set to
+ * 		**CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates
+ * 		the checksum is to be computed against a pseudo-header.
+ *
+ * 		This helper works in combination with **bpf_csum_diff**\ (),
+ * 		which does not update the checksum in-place, but offers more
+ * 		flexibility and can handle sizes larger than 2 or 4 for the
+ * 		checksum to update.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index)
+ * 	Description
+ * 		This special helper is used to trigger a "tail call", or in
+ * 		other words, to jump into another eBPF program. The same stack
+ * 		frame is used (but values on stack and in registers for the
+ * 		caller are not accessible to the callee). This mechanism allows
+ * 		for program chaining, either for raising the maximum number of
+ * 		available eBPF instructions, or to execute given programs in
+ * 		conditional blocks. For security reasons, there is an upper
+ * 		limit to the number of successive tail calls that can be
+ * 		performed.
+ *
+ * 		Upon call of this helper, the program attempts to jump into a
+ * 		program referenced at index *index* in *prog_array_map*, a
+ * 		special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes
+ * 		*ctx*, a pointer to the context.
+ *
+ * 		If the call succeeds, the kernel immediately runs the first
+ * 		instruction of the new program. This is not a function call,
+ * 		and it never returns to the previous program. If the call
+ * 		fails, then the helper has no effect, and the caller continues
+ * 		to run its subsequent instructions. A call can fail if the
+ * 		destination program for the jump does not exist (i.e. *index*
+ * 		is superior to the number of entries in *prog_array_map*), or
+ * 		if the maximum number of tail calls has been reached for this
+ * 		chain of programs. This limit is defined in the kernel by the
+ * 		macro **MAX_TAIL_CALL_CNT** (not accessible to user space),
+ * 		which is currently set to 32.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags)
+ * 	Description
+ * 		Clone and redirect the packet associated to *skb* to another
+ * 		net device of index *ifindex*. Both ingress and egress
+ * 		interfaces can be used for redirection. The **BPF_F_INGRESS**
+ * 		value in *flags* is used to make the distinction (ingress path
+ * 		is selected if the flag is present, egress path otherwise).
+ * 		This is the only flag supported for now.
+ *
+ * 		In comparison with **bpf_redirect**\ () helper,
+ * 		**bpf_clone_redirect**\ () has the associated cost of
+ * 		duplicating the packet buffer, but this can be executed out of
+ * 		the eBPF program. Conversely, **bpf_redirect**\ () is more
+ * 		efficient, but it is handled through an action code where the
+ * 		redirection happens only after the eBPF program has returned.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
-- 
cgit v1.2.3-59-g8ed1b


From c456dec4d2656b70de7371e3a6ae60b7ec7ab74f Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Wed, 25 Apr 2018 18:16:54 +0100
Subject: bpf: add documentation for eBPF helpers (12-22)

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions, all
written by Alexei:

- bpf_get_current_pid_tgid()
- bpf_get_current_uid_gid()
- bpf_get_current_comm()
- bpf_skb_vlan_push()
- bpf_skb_vlan_pop()
- bpf_skb_get_tunnel_key()
- bpf_skb_set_tunnel_key()
- bpf_redirect()
- bpf_perf_event_output()
- bpf_get_stackid()
- bpf_get_current_task()

v4:
- bpf_redirect(): Fix typo: "XDP_ABORT" changed to "XDP_ABORTED". Add
  note on bpf_redirect_map() providing better performance. Replace "Save
  for" with "Except for".
- bpf_skb_vlan_push(): Clarify comment about invalidated verifier
  checks.
- bpf_skb_vlan_pop(): Clarify comment about invalidated verifier
  checks.
- bpf_skb_get_tunnel_key(): Add notes on tunnel_id, "collect metadata"
  mode, and example tunneling protocols with which it can be used.
- bpf_skb_set_tunnel_key(): Add a reference to the description of
  bpf_skb_get_tunnel_key().
- bpf_perf_event_output(): Specify that, and for what purpose, the
  helper can be used with programs attached to TC and XDP.

v3:
- bpf_skb_get_tunnel_key(): Change and improve description and example.
- bpf_redirect(): Improve description of BPF_F_INGRESS flag.
- bpf_perf_event_output(): Fix first sentence of description. Delete
  wrong statement on context being evaluated as a struct pt_reg. Remove
  the long yet incomplete example.
- bpf_get_stackid(): Add a note about PERF_MAX_STACK_DEPTH being
  configurable.

Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 254 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 254 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 96925e949a24..465ea273729a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -623,6 +623,260 @@ union bpf_attr {
  * 		direct packet access.
  * 	Return
  * 		0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_get_current_pid_tgid(void)
+ * 	Return
+ * 		A 64-bit integer containing the current tgid and pid, and
+ * 		created as such:
+ * 		*current_task*\ **->tgid << 32 \|**
+ * 		*current_task*\ **->pid**.
+ *
+ * u64 bpf_get_current_uid_gid(void)
+ * 	Return
+ * 		A 64-bit integer containing the current GID and UID, and
+ * 		created as such: *current_gid* **<< 32 \|** *current_uid*.
+ *
+ * int bpf_get_current_comm(char *buf, u32 size_of_buf)
+ * 	Description
+ * 		Copy the **comm** attribute of the current task into *buf* of
+ * 		*size_of_buf*. The **comm** attribute contains the name of
+ * 		the executable (excluding the path) for the current task. The
+ * 		*size_of_buf* must be strictly positive. On success, the
+ * 		helper makes sure that the *buf* is NUL-terminated. On failure,
+ * 		it is filled with zeroes.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
+ * 	Description
+ * 		Push a *vlan_tci* (VLAN tag control information) of protocol
+ * 		*vlan_proto* to the packet associated to *skb*, then update
+ * 		the checksum. Note that if *vlan_proto* is different from
+ * 		**ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to
+ * 		be **ETH_P_8021Q**.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_vlan_pop(struct sk_buff *skb)
+ * 	Description
+ * 		Pop a VLAN header from the packet associated to *skb*.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
+ * 	Description
+ * 		Get tunnel metadata. This helper takes a pointer *key* to an
+ * 		empty **struct bpf_tunnel_key** of **size**, that will be
+ * 		filled with tunnel metadata for the packet associated to *skb*.
+ * 		The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which
+ * 		indicates that the tunnel is based on IPv6 protocol instead of
+ * 		IPv4.
+ *
+ * 		The **struct bpf_tunnel_key** is an object that generalizes the
+ * 		principal parameters used by various tunneling protocols into a
+ * 		single struct. This way, it can be used to easily make a
+ * 		decision based on the contents of the encapsulation header,
+ * 		"summarized" in this struct. In particular, it holds the IP
+ * 		address of the remote end (IPv4 or IPv6, depending on the case)
+ * 		in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also,
+ * 		this struct exposes the *key*\ **->tunnel_id**, which is
+ * 		generally mapped to a VNI (Virtual Network Identifier), making
+ * 		it programmable together with the **bpf_skb_set_tunnel_key**\
+ * 		() helper.
+ *
+ * 		Let's imagine that the following code is part of a program
+ * 		attached to the TC ingress interface, on one end of a GRE
+ * 		tunnel, and is supposed to filter out all messages coming from
+ * 		remote ends with IPv4 address other than 10.0.0.1:
+ *
+ * 		::
+ *
+ * 			int ret;
+ * 			struct bpf_tunnel_key key = {};
+ * 			
+ * 			ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
+ * 			if (ret < 0)
+ * 				return TC_ACT_SHOT;	// drop packet
+ * 			
+ * 			if (key.remote_ipv4 != 0x0a000001)
+ * 				return TC_ACT_SHOT;	// drop packet
+ * 			
+ * 			return TC_ACT_OK;		// accept packet
+ *
+ * 		This interface can also be used with all encapsulation devices
+ * 		that can operate in "collect metadata" mode: instead of having
+ * 		one network device per specific configuration, the "collect
+ * 		metadata" mode only requires a single device where the
+ * 		configuration can be extracted from this helper.
+ *
+ * 		This can be used together with various tunnels such as VXLan,
+ * 		Geneve, GRE or IP in IP (IPIP).
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
+ * 	Description
+ * 		Populate tunnel metadata for packet associated to *skb.* The
+ * 		tunnel metadata is set to the contents of *key*, of *size*. The
+ * 		*flags* can be set to a combination of the following values:
+ *
+ * 		**BPF_F_TUNINFO_IPV6**
+ * 			Indicate that the tunnel is based on IPv6 protocol
+ * 			instead of IPv4.
+ * 		**BPF_F_ZERO_CSUM_TX**
+ * 			For IPv4 packets, add a flag to tunnel metadata
+ * 			indicating that checksum computation should be skipped
+ * 			and checksum set to zeroes.
+ * 		**BPF_F_DONT_FRAGMENT**
+ * 			Add a flag to tunnel metadata indicating that the
+ * 			packet should not be fragmented.
+ * 		**BPF_F_SEQ_NUMBER**
+ * 			Add a flag to tunnel metadata indicating that a
+ * 			sequence number should be added to tunnel header before
+ * 			sending the packet. This flag was added for GRE
+ * 			encapsulation, but might be used with other protocols
+ * 			as well in the future.
+ *
+ * 		Here is a typical usage on the transmit path:
+ *
+ * 		::
+ *
+ * 			struct bpf_tunnel_key key;
+ * 			     populate key ...
+ * 			bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0);
+ * 			bpf_clone_redirect(skb, vxlan_dev_ifindex, 0);
+ *
+ * 		See also the description of the **bpf_skb_get_tunnel_key**\ ()
+ * 		helper for additional information.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_redirect(u32 ifindex, u64 flags)
+ * 	Description
+ * 		Redirect the packet to another net device of index *ifindex*.
+ * 		This helper is somewhat similar to **bpf_clone_redirect**\
+ * 		(), except that the packet is not cloned, which provides
+ * 		increased performance.
+ *
+ * 		Except for XDP, both ingress and egress interfaces can be used
+ * 		for redirection. The **BPF_F_INGRESS** value in *flags* is used
+ * 		to make the distinction (ingress path is selected if the flag
+ * 		is present, egress path otherwise). Currently, XDP only
+ * 		supports redirection to the egress interface, and accepts no
+ * 		flag at all.
+ *
+ * 		The same effect can be attained with the more generic
+ * 		**bpf_redirect_map**\ (), which requires specific maps to be
+ * 		used but offers better performance.
+ * 	Return
+ * 		For XDP, the helper returns **XDP_REDIRECT** on success or
+ * 		**XDP_ABORTED** on error. For other program types, the values
+ * 		are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on
+ * 		error.
+ *
+ * int bpf_perf_event_output(struct pt_reg *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
+ * 	Description
+ * 		Write raw *data* blob into a special BPF perf event held by
+ * 		*map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
+ * 		event must have the following attributes: **PERF_SAMPLE_RAW**
+ * 		as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
+ * 		**PERF_COUNT_SW_BPF_OUTPUT** as **config**.
+ *
+ * 		The *flags* are used to indicate the index in *map* for which
+ * 		the value must be put, masked with **BPF_F_INDEX_MASK**.
+ * 		Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
+ * 		to indicate that the index of the current CPU core should be
+ * 		used.
+ *
+ * 		The value to write, of *size*, is passed through eBPF stack and
+ * 		pointed by *data*.
+ *
+ * 		The context of the program *ctx* needs also be passed to the
+ * 		helper.
+ *
+ * 		On user space, a program willing to read the values needs to
+ * 		call **perf_event_open**\ () on the perf event (either for
+ * 		one or for all CPUs) and to store the file descriptor into the
+ * 		*map*. This must be done before the eBPF program can send data
+ * 		into it. An example is available in file
+ * 		*samples/bpf/trace_output_user.c* in the Linux kernel source
+ * 		tree (the eBPF program counterpart is in
+ * 		*samples/bpf/trace_output_kern.c*).
+ *
+ * 		**bpf_perf_event_output**\ () achieves better performance
+ * 		than **bpf_trace_printk**\ () for sharing data with user
+ * 		space, and is much better suitable for streaming data from eBPF
+ * 		programs.
+ *
+ * 		Note that this helper is not restricted to tracing use cases
+ * 		and can be used with programs attached to TC or XDP as well,
+ * 		where it allows for passing data to user space listeners. Data
+ * 		can be:
+ *
+ * 		* Only custom structs,
+ * 		* Only the packet payload, or
+ * 		* A combination of both.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_get_stackid(struct pt_reg *ctx, struct bpf_map *map, u64 flags)
+ * 	Description
+ * 		Walk a user or a kernel stack and return its id. To achieve
+ * 		this, the helper needs *ctx*, which is a pointer to the context
+ * 		on which the tracing program is executed, and a pointer to a
+ * 		*map* of type **BPF_MAP_TYPE_STACK_TRACE**.
+ *
+ * 		The last argument, *flags*, holds the number of stack frames to
+ * 		skip (from 0 to 255), masked with
+ * 		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ * 		a combination of the following flags:
+ *
+ * 		**BPF_F_USER_STACK**
+ * 			Collect a user space stack instead of a kernel stack.
+ * 		**BPF_F_FAST_STACK_CMP**
+ * 			Compare stacks by hash only.
+ * 		**BPF_F_REUSE_STACKID**
+ * 			If two different stacks hash into the same *stackid*,
+ * 			discard the old one.
+ *
+ * 		The stack id retrieved is a 32 bit long integer handle which
+ * 		can be further combined with other data (including other stack
+ * 		ids) and used as a key into maps. This can be useful for
+ * 		generating a variety of graphs (such as flame graphs or off-cpu
+ * 		graphs).
+ *
+ * 		For walking a stack, this helper is an improvement over
+ * 		**bpf_probe_read**\ (), which can be used with unrolled loops
+ * 		but is not efficient and consumes a lot of eBPF instructions.
+ * 		Instead, **bpf_get_stackid**\ () can collect up to
+ * 		**PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that
+ * 		this limit can be controlled with the **sysctl** program, and
+ * 		that it should be manually increased in order to profile long
+ * 		user stacks (such as stacks for Java programs). To do so, use:
+ *
+ * 		::
+ *
+ * 			# sysctl kernel.perf_event_max_stack=<new value>
+ *
+ * 	Return
+ * 		The positive or null stack id on success, or a negative error
+ * 		in case of failure.
+ *
+ * u64 bpf_get_current_task(void)
+ * 	Return
+ * 		A pointer to the current task struct.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
-- 
cgit v1.2.3-59-g8ed1b


From 1fdd08bedc56d2a927c5f230e2d63f17ca1068f1 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Wed, 25 Apr 2018 18:16:55 +0100
Subject: bpf: add documentation for eBPF helpers (23-32)

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions, all
written by Daniel:

- bpf_get_prandom_u32()
- bpf_get_smp_processor_id()
- bpf_get_cgroup_classid()
- bpf_get_route_realm()
- bpf_skb_load_bytes()
- bpf_csum_diff()
- bpf_skb_get_tunnel_opt()
- bpf_skb_set_tunnel_opt()
- bpf_skb_change_proto()
- bpf_skb_change_type()

v4:
- bpf_get_prandom_u32(): Warn that the prng is not cryptographically
  secure.
- bpf_get_smp_processor_id(): Fix a typo (case).
- bpf_get_cgroup_classid(): Clarify description. Add notes on the helper
  being limited to cgroup v1, and to egress path.
- bpf_get_route_realm(): Add comparison with bpf_get_cgroup_classid().
  Add a note about usage with TC and advantage of clsact. Fix a typo in
  return value ("sdb" instead of "skb").
- bpf_skb_load_bytes(): Make explicit loading large data loads it to the
  eBPF stack.
- bpf_csum_diff(): Add a note on seed that can be cascaded. Link to
  bpf_l3|l4_csum_replace().
- bpf_skb_get_tunnel_opt(): Add a note about usage with "collect
  metadata" mode, and example of this with Geneve.
- bpf_skb_set_tunnel_opt(): Add a link to bpf_skb_get_tunnel_opt()
  description.
- bpf_skb_change_proto(): Mention that the main use case is NAT64.
  Clarify comment about invalidated verifier checks.

v3:
- bpf_get_prandom_u32(): Fix helper name :(. Add description, including
  a note on the internal random state.
- bpf_get_smp_processor_id(): Add description, including a note on the
  processor id remaining stable during program run.
- bpf_get_cgroup_classid(): State that CONFIG_CGROUP_NET_CLASSID is
  required to use the helper. Add a reference to related documentation.
  State that placing a task in net_cls controller disables cgroup-bpf.
- bpf_get_route_realm(): State that CONFIG_CGROUP_NET_CLASSID is
  required to use this helper.
- bpf_skb_load_bytes(): Fix comment on current use cases for the helper.

Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 197 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 197 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 465ea273729a..7352abf72f50 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -495,6 +495,27 @@ union bpf_attr {
  * 		The number of bytes written to the buffer, or a negative error
  * 		in case of failure.
  *
+ * u32 bpf_get_prandom_u32(void)
+ * 	Description
+ * 		Get a pseudo-random number.
+ *
+ * 		From a security point of view, this helper uses its own
+ * 		pseudo-random internal state, and cannot be used to infer the
+ * 		seed of other random functions in the kernel. However, it is
+ * 		essential to note that the generator used by the helper is not
+ * 		cryptographically secure.
+ * 	Return
+ * 		A random 32-bit unsigned value.
+ *
+ * u32 bpf_get_smp_processor_id(void)
+ * 	Description
+ * 		Get the SMP (symmetric multiprocessing) processor id. Note that
+ * 		all programs run with preemption disabled, which means that the
+ * 		SMP processor id is stable during all the execution of the
+ * 		program.
+ * 	Return
+ * 		The SMP id of the processor running the program.
+ *
  * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags)
  * 	Description
  * 		Store *len* bytes from address *from* into the packet
@@ -647,6 +668,32 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * u32 bpf_get_cgroup_classid(struct sk_buff *skb)
+ * 	Description
+ * 		Retrieve the classid for the current task, i.e. for the net_cls
+ * 		cgroup to which *skb* belongs.
+ *
+ * 		This helper can be used on TC egress path, but not on ingress.
+ *
+ * 		The net_cls cgroup provides an interface to tag network packets
+ * 		based on a user-provided identifier for all traffic coming from
+ * 		the tasks belonging to the related cgroup. See also the related
+ * 		kernel documentation, available from the Linux sources in file
+ * 		*Documentation/cgroup-v1/net_cls.txt*.
+ *
+ * 		The Linux kernel has two versions for cgroups: there are
+ * 		cgroups v1 and cgroups v2. Both are available to users, who can
+ * 		use a mixture of them, but note that the net_cls cgroup is for
+ * 		cgroup v1 only. This makes it incompatible with BPF programs
+ * 		run on cgroups, which is a cgroup-v2-only feature (a socket can
+ * 		only hold data for one version of cgroups at a time).
+ *
+ * 		This helper is only available is the kernel was compiled with
+ * 		the **CONFIG_CGROUP_NET_CLASSID** configuration option set to
+ * 		"**y**" or to "**m**".
+ * 	Return
+ * 		The classid, or 0 for the default unconfigured classid.
+ *
  * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
  * 	Description
  * 		Push a *vlan_tci* (VLAN tag control information) of protocol
@@ -786,6 +833,30 @@ union bpf_attr {
  * 		are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on
  * 		error.
  *
+ * u32 bpf_get_route_realm(struct sk_buff *skb)
+ * 	Description
+ * 		Retrieve the realm or the route, that is to say the
+ * 		**tclassid** field of the destination for the *skb*. The
+ * 		indentifier retrieved is a user-provided tag, similar to the
+ * 		one used with the net_cls cgroup (see description for
+ * 		**bpf_get_cgroup_classid**\ () helper), but here this tag is
+ * 		held by a route (a destination entry), not by a task.
+ *
+ * 		Retrieving this identifier works with the clsact TC egress hook
+ * 		(see also **tc-bpf(8)**), or alternatively on conventional
+ * 		classful egress qdiscs, but not on TC ingress path. In case of
+ * 		clsact TC egress hook, this has the advantage that, internally,
+ * 		the destination entry has not been dropped yet in the transmit
+ * 		path. Therefore, the destination entry does not need to be
+ * 		artificially held via **netif_keep_dst**\ () for a classful
+ * 		qdisc until the *skb* is freed.
+ *
+ * 		This helper is available only if the kernel was compiled with
+ * 		**CONFIG_IP_ROUTE_CLASSID** configuration option.
+ * 	Return
+ * 		The realm of the route for the packet associated to *skb*, or 0
+ * 		if none was found.
+ *
  * int bpf_perf_event_output(struct pt_reg *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
  * 	Description
  * 		Write raw *data* blob into a special BPF perf event held by
@@ -831,6 +902,23 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * int bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
+ * 	Description
+ * 		This helper was provided as an easy way to load data from a
+ * 		packet. It can be used to load *len* bytes from *offset* from
+ * 		the packet associated to *skb*, into the buffer pointed by
+ * 		*to*.
+ *
+ * 		Since Linux 4.7, usage of this helper has mostly been replaced
+ * 		by "direct packet access", enabling packet data to be
+ * 		manipulated with *skb*\ **->data** and *skb*\ **->data_end**
+ * 		pointing respectively to the first byte of packet data and to
+ * 		the byte after the last byte of packet data. However, it
+ * 		remains useful if one wishes to read large quantities of data
+ * 		at once from a packet into the eBPF stack.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  * int bpf_get_stackid(struct pt_reg *ctx, struct bpf_map *map, u64 flags)
  * 	Description
  * 		Walk a user or a kernel stack and return its id. To achieve
@@ -874,6 +962,115 @@ union bpf_attr {
  * 		The positive or null stack id on success, or a negative error
  * 		in case of failure.
  *
+ * s64 bpf_csum_diff(__be32 *from, u32 from_size, __be32 *to, u32 to_size, __wsum seed)
+ * 	Description
+ * 		Compute a checksum difference, from the raw buffer pointed by
+ * 		*from*, of length *from_size* (that must be a multiple of 4),
+ * 		towards the raw buffer pointed by *to*, of size *to_size*
+ * 		(same remark). An optional *seed* can be added to the value
+ * 		(this can be cascaded, the seed may come from a previous call
+ * 		to the helper).
+ *
+ * 		This is flexible enough to be used in several ways:
+ *
+ * 		* With *from_size* == 0, *to_size* > 0 and *seed* set to
+ * 		  checksum, it can be used when pushing new data.
+ * 		* With *from_size* > 0, *to_size* == 0 and *seed* set to
+ * 		  checksum, it can be used when removing data from a packet.
+ * 		* With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it
+ * 		  can be used to compute a diff. Note that *from_size* and
+ * 		  *to_size* do not need to be equal.
+ *
+ * 		This helper can be used in combination with
+ * 		**bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to
+ * 		which one can feed in the difference computed with
+ * 		**bpf_csum_diff**\ ().
+ * 	Return
+ * 		The checksum result, or a negative error code in case of
+ * 		failure.
+ *
+ * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
+ * 	Description
+ * 		Retrieve tunnel options metadata for the packet associated to
+ * 		*skb*, and store the raw tunnel option data to the buffer *opt*
+ * 		of *size*.
+ *
+ * 		This helper can be used with encapsulation devices that can
+ * 		operate in "collect metadata" mode (please refer to the related
+ * 		note in the description of **bpf_skb_get_tunnel_key**\ () for
+ * 		more details). A particular example where this can be used is
+ * 		in combination with the Geneve encapsulation protocol, where it
+ * 		allows for pushing (with **bpf_skb_get_tunnel_opt**\ () helper)
+ * 		and retrieving arbitrary TLVs (Type-Length-Value headers) from
+ * 		the eBPF program. This allows for full customization of these
+ * 		headers.
+ * 	Return
+ * 		The size of the option data retrieved.
+ *
+ * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
+ * 	Description
+ * 		Set tunnel options metadata for the packet associated to *skb*
+ * 		to the option data contained in the raw buffer *opt* of *size*.
+ *
+ * 		See also the description of the **bpf_skb_get_tunnel_opt**\ ()
+ * 		helper for additional information.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags)
+ * 	Description
+ * 		Change the protocol of the *skb* to *proto*. Currently
+ * 		supported are transition from IPv4 to IPv6, and from IPv6 to
+ * 		IPv4. The helper takes care of the groundwork for the
+ * 		transition, including resizing the socket buffer. The eBPF
+ * 		program is expected to fill the new headers, if any, via
+ * 		**skb_store_bytes**\ () and to recompute the checksums with
+ * 		**bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\
+ * 		(). The main case for this helper is to perform NAT64
+ * 		operations out of an eBPF program.
+ *
+ * 		Internally, the GSO type is marked as dodgy so that headers are
+ * 		checked and segments are recalculated by the GSO/GRO engine.
+ * 		The size for GSO target is adapted as well.
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_change_type(struct sk_buff *skb, u32 type)
+ * 	Description
+ * 		Change the packet type for the packet associated to *skb*. This
+ * 		comes down to setting *skb*\ **->pkt_type** to *type*, except
+ * 		the eBPF program does not have a write access to *skb*\
+ * 		**->pkt_type** beside this helper. Using a helper here allows
+ * 		for graceful handling of errors.
+ *
+ * 		The major use case is to change incoming *skb*s to
+ * 		**PACKET_HOST** in a programmatic way instead of having to
+ * 		recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for
+ * 		example.
+ *
+ * 		Note that *type* only allows certain values. At this time, they
+ * 		are:
+ *
+ * 		**PACKET_HOST**
+ * 			Packet is for us.
+ * 		**PACKET_BROADCAST**
+ * 			Send packet to all.
+ * 		**PACKET_MULTICAST**
+ * 			Send packet to group.
+ * 		**PACKET_OTHERHOST**
+ * 			Send packet to someone else.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  * u64 bpf_get_current_task(void)
  * 	Return
  * 		A pointer to the current task struct.
-- 
cgit v1.2.3-59-g8ed1b


From fa15601ab31e5a90bb2a7dbdb43083470a8e787b Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Wed, 25 Apr 2018 18:16:56 +0100
Subject: bpf: add documentation for eBPF helpers (33-41)

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions, all
written by Daniel:

- bpf_get_hash_recalc()
- bpf_skb_change_tail()
- bpf_skb_pull_data()
- bpf_csum_update()
- bpf_set_hash_invalid()
- bpf_get_numa_node_id()
- bpf_set_hash()
- bpf_skb_adjust_room()
- bpf_xdp_adjust_meta()

v4:
- bpf_skb_change_tail(): Clarify comment about invalidated verifier
  checks.
- bpf_skb_pull_data(): Clarify the motivation for using this helper or
  bpf_skb_load_bytes(), on non-linear buffers. Fix RST formatting for
  *skb*. Clarify comment about invalidated verifier checks.
- bpf_csum_update(): Fix description of checksum (entire packet, not IP
  checksum). Fix a typo: "header" instead of "helper".
- bpf_set_hash_invalid(): Mention bpf_get_hash_recalc().
- bpf_get_numa_node_id(): State that the helper is not restricted to
  programs attached to sockets.
- bpf_skb_adjust_room(): Clarify comment about invalidated verifier
  checks.
- bpf_xdp_adjust_meta(): Clarify comment about invalidated verifier
  checks.

Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 164 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 164 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7352abf72f50..b9c28f63f511 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1071,9 +1071,173 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * u32 bpf_get_hash_recalc(struct sk_buff *skb)
+ * 	Description
+ * 		Retrieve the hash of the packet, *skb*\ **->hash**. If it is
+ * 		not set, in particular if the hash was cleared due to mangling,
+ * 		recompute this hash. Later accesses to the hash can be done
+ * 		directly with *skb*\ **->hash**.
+ *
+ * 		Calling **bpf_set_hash_invalid**\ (), changing a packet
+ * 		prototype with **bpf_skb_change_proto**\ (), or calling
+ * 		**bpf_skb_store_bytes**\ () with the
+ * 		**BPF_F_INVALIDATE_HASH** are actions susceptible to clear
+ * 		the hash and to trigger a new computation for the next call to
+ * 		**bpf_get_hash_recalc**\ ().
+ * 	Return
+ * 		The 32-bit hash.
+ *
  * u64 bpf_get_current_task(void)
  * 	Return
  * 		A pointer to the current task struct.
+ *
+ * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags)
+ * 	Description
+ * 		Resize (trim or grow) the packet associated to *skb* to the
+ * 		new *len*. The *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		The basic idea is that the helper performs the needed work to
+ * 		change the size of the packet, then the eBPF program rewrites
+ * 		the rest via helpers like **bpf_skb_store_bytes**\ (),
+ * 		**bpf_l3_csum_replace**\ (), **bpf_l3_csum_replace**\ ()
+ * 		and others. This helper is a slow path utility intended for
+ * 		replies with control messages. And because it is targeted for
+ * 		slow path, the helper itself can afford to be slow: it
+ * 		implicitly linearizes, unclones and drops offloads from the
+ * 		*skb*.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_pull_data(struct sk_buff *skb, u32 len)
+ * 	Description
+ * 		Pull in non-linear data in case the *skb* is non-linear and not
+ * 		all of *len* are part of the linear section. Make *len* bytes
+ * 		from *skb* readable and writable. If a zero value is passed for
+ * 		*len*, then the whole length of the *skb* is pulled.
+ *
+ * 		This helper is only needed for reading and writing with direct
+ * 		packet access.
+ *
+ * 		For direct packet access, testing that offsets to access
+ * 		are within packet boundaries (test on *skb*\ **->data_end**) is
+ * 		susceptible to fail if offsets are invalid, or if the requested
+ * 		data is in non-linear parts of the *skb*. On failure the
+ * 		program can just bail out, or in the case of a non-linear
+ * 		buffer, use a helper to make the data available. The
+ * 		**bpf_skb_load_bytes**\ () helper is a first solution to access
+ * 		the data. Another one consists in using **bpf_skb_pull_data**
+ * 		to pull in once the non-linear parts, then retesting and
+ * 		eventually access the data.
+ *
+ * 		At the same time, this also makes sure the *skb* is uncloned,
+ * 		which is a necessary condition for direct write. As this needs
+ * 		to be an invariant for the write part only, the verifier
+ * 		detects writes and adds a prologue that is calling
+ * 		**bpf_skb_pull_data()** to effectively unclone the *skb* from
+ * 		the very beginning in case it is indeed cloned.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * s64 bpf_csum_update(struct sk_buff *skb, __wsum csum)
+ * 	Description
+ * 		Add the checksum *csum* into *skb*\ **->csum** in case the
+ * 		driver has supplied a checksum for the entire packet into that
+ * 		field. Return an error otherwise. This helper is intended to be
+ * 		used in combination with **bpf_csum_diff**\ (), in particular
+ * 		when the checksum needs to be updated after data has been
+ * 		written into the packet through direct packet access.
+ * 	Return
+ * 		The checksum on success, or a negative error code in case of
+ * 		failure.
+ *
+ * void bpf_set_hash_invalid(struct sk_buff *skb)
+ * 	Description
+ * 		Invalidate the current *skb*\ **->hash**. It can be used after
+ * 		mangling on headers through direct packet access, in order to
+ * 		indicate that the hash is outdated and to trigger a
+ * 		recalculation the next time the kernel tries to access this
+ * 		hash or when the **bpf_get_hash_recalc**\ () helper is called.
+ *
+ * int bpf_get_numa_node_id(void)
+ * 	Description
+ * 		Return the id of the current NUMA node. The primary use case
+ * 		for this helper is the selection of sockets for the local NUMA
+ * 		node, when the program is attached to sockets using the
+ * 		**SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**),
+ * 		but the helper is also available to other eBPF program types,
+ * 		similarly to **bpf_get_smp_processor_id**\ ().
+ * 	Return
+ * 		The id of current NUMA node.
+ *
+ * u32 bpf_set_hash(struct sk_buff *skb, u32 hash)
+ * 	Description
+ * 		Set the full hash for *skb* (set the field *skb*\ **->hash**)
+ * 		to value *hash*.
+ * 	Return
+ * 		0
+ *
+ * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags)
+ * 	Description
+ * 		Grow or shrink the room for data in the packet associated to
+ * 		*skb* by *len_diff*, and according to the selected *mode*.
+ *
+ * 		There is a single supported mode at this time:
+ *
+ * 		* **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
+ * 		  (room space is added or removed below the layer 3 header).
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta)
+ * 	Description
+ * 		Adjust the address pointed by *xdp_md*\ **->data_meta** by
+ * 		*delta* (which can be positive or negative). Note that this
+ * 		operation modifies the address stored in *xdp_md*\ **->data**,
+ * 		so the latter must be loaded only after the helper has been
+ * 		called.
+ *
+ * 		The use of *xdp_md*\ **->data_meta** is optional and programs
+ * 		are not required to use it. The rationale is that when the
+ * 		packet is processed with XDP (e.g. as DoS filter), it is
+ * 		possible to push further meta data along with it before passing
+ * 		to the stack, and to give the guarantee that an ingress eBPF
+ * 		program attached as a TC classifier on the same device can pick
+ * 		this up for further post-processing. Since TC works with socket
+ * 		buffers, it remains possible to set from XDP the **mark** or
+ * 		**priority** pointers, or other pointers for the socket buffer.
+ * 		Having this scratch space generic and programmable allows for
+ * 		more flexibility as the user is free to store whatever meta
+ * 		data they need.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
-- 
cgit v1.2.3-59-g8ed1b


From c6b5fb8690faccf770d2f344094db1a8e482148f Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Wed, 25 Apr 2018 18:16:57 +0100
Subject: bpf: add documentation for eBPF helpers (42-50)

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions:

Helper from Kaixu:
- bpf_perf_event_read()

Helpers from Martin:
- bpf_skb_under_cgroup()
- bpf_xdp_adjust_head()

Helpers from Sargun:
- bpf_probe_write_user()
- bpf_current_task_under_cgroup()

Helper from Thomas:
- bpf_skb_change_head()

Helper from Gianluca:
- bpf_probe_read_str()

Helpers from Chenbo:
- bpf_get_socket_cookie()
- bpf_get_socket_uid()

v4:
- bpf_perf_event_read(): State that bpf_perf_event_read_value() should
  be preferred over this helper.
- bpf_skb_change_head(): Clarify comment about invalidated verifier
  checks.
- bpf_xdp_adjust_head(): Clarify comment about invalidated verifier
  checks.
- bpf_probe_write_user(): Add that dst must be a valid user space
  address.
- bpf_get_socket_cookie(): Improve description by making clearer that
  the cockie belongs to the socket, and state that it remains stable for
  the life of the socket.

v3:
- bpf_perf_event_read(): Fix time of selection for perf event type in
  description. Remove occurences of "cores" to avoid confusion with
  "CPU".

Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Sargun Dhillon <sargun@sargun.me>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Gianluca Borello <g.borello@gmail.com>
Cc: Chenbo Feng <fengc@google.com>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
[for bpf_skb_under_cgroup(), bpf_xdp_adjust_head()]
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 172 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 172 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b9c28f63f511..0d4b55e85e07 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -810,6 +810,35 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * u64 bpf_perf_event_read(struct bpf_map *map, u64 flags)
+ * 	Description
+ * 		Read the value of a perf event counter. This helper relies on a
+ * 		*map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of
+ * 		the perf event counter is selected when *map* is updated with
+ * 		perf event file descriptors. The *map* is an array whose size
+ * 		is the number of available CPUs, and each cell contains a value
+ * 		relative to one CPU. The value to retrieve is indicated by
+ * 		*flags*, that contains the index of the CPU to look up, masked
+ * 		with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
+ * 		**BPF_F_CURRENT_CPU** to indicate that the value for the
+ * 		current CPU should be retrieved.
+ *
+ * 		Note that before Linux 4.13, only hardware perf event can be
+ * 		retrieved.
+ *
+ * 		Also, be aware that the newer helper
+ * 		**bpf_perf_event_read_value**\ () is recommended over
+ * 		**bpf_perf_event_read*\ () in general. The latter has some ABI
+ * 		quirks where error and counter value are used as a return code
+ * 		(which is wrong to do since ranges may overlap). This issue is
+ * 		fixed with bpf_perf_event_read_value(), which at the same time
+ * 		provides more features over the **bpf_perf_event_read**\ ()
+ * 		interface. Please refer to the description of
+ * 		**bpf_perf_event_read_value**\ () for details.
+ * 	Return
+ * 		The value of the perf event counter read from the map, or a
+ * 		negative error code in case of failure.
+ *
  * int bpf_redirect(u32 ifindex, u64 flags)
  * 	Description
  * 		Redirect the packet to another net device of index *ifindex*.
@@ -1071,6 +1100,17 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * int bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index)
+ * 	Description
+ * 		Check whether *skb* is a descendant of the cgroup2 held by
+ * 		*map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
+ * 	Return
+ * 		The return value depends on the result of the test, and can be:
+ *
+ * 		* 0, if the *skb* failed the cgroup2 descendant test.
+ * 		* 1, if the *skb* succeeded the cgroup2 descendant test.
+ * 		* A negative error code, if an error occurred.
+ *
  * u32 bpf_get_hash_recalc(struct sk_buff *skb)
  * 	Description
  * 		Retrieve the hash of the packet, *skb*\ **->hash**. If it is
@@ -1091,6 +1131,37 @@ union bpf_attr {
  * 	Return
  * 		A pointer to the current task struct.
  *
+ * int bpf_probe_write_user(void *dst, const void *src, u32 len)
+ * 	Description
+ * 		Attempt in a safe way to write *len* bytes from the buffer
+ * 		*src* to *dst* in memory. It only works for threads that are in
+ * 		user context, and *dst* must be a valid user space address.
+ *
+ * 		This helper should not be used to implement any kind of
+ * 		security mechanism because of TOC-TOU attacks, but rather to
+ * 		debug, divert, and manipulate execution of semi-cooperative
+ * 		processes.
+ *
+ * 		Keep in mind that this feature is meant for experiments, and it
+ * 		has a risk of crashing the system and running programs.
+ * 		Therefore, when an eBPF program using this helper is attached,
+ * 		a warning including PID and process name is printed to kernel
+ * 		logs.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_current_task_under_cgroup(struct bpf_map *map, u32 index)
+ * 	Description
+ * 		Check whether the probe is being run is the context of a given
+ * 		subset of the cgroup2 hierarchy. The cgroup2 to test is held by
+ * 		*map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
+ * 	Return
+ * 		The return value depends on the result of the test, and can be:
+ *
+ * 		* 0, if the *skb* task belongs to the cgroup2.
+ * 		* 1, if the *skb* task does not belong to the cgroup2.
+ * 		* A negative error code, if an error occurred.
+ *
  * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags)
  * 	Description
  * 		Resize (trim or grow) the packet associated to *skb* to the
@@ -1182,6 +1253,107 @@ union bpf_attr {
  * 	Return
  * 		The id of current NUMA node.
  *
+ * int bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags)
+ * 	Description
+ * 		Grows headroom of packet associated to *skb* and adjusts the
+ * 		offset of the MAC header accordingly, adding *len* bytes of
+ * 		space. It automatically extends and reallocates memory as
+ * 		required.
+ *
+ * 		This helper can be used on a layer 3 *skb* to push a MAC header
+ * 		for redirection into a layer 2 device.
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta)
+ * 	Description
+ * 		Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that
+ * 		it is possible to use a negative value for *delta*. This helper
+ * 		can be used to prepare the packet for pushing or popping
+ * 		headers.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr)
+ * 	Description
+ * 		Copy a NUL terminated string from an unsafe address
+ * 		*unsafe_ptr* to *dst*. The *size* should include the
+ * 		terminating NUL byte. In case the string length is smaller than
+ * 		*size*, the target is not padded with further NUL bytes. If the
+ * 		string length is larger than *size*, just *size*-1 bytes are
+ * 		copied and the last byte is set to NUL.
+ *
+ * 		On success, the length of the copied string is returned. This
+ * 		makes this helper useful in tracing programs for reading
+ * 		strings, and more importantly to get its length at runtime. See
+ * 		the following snippet:
+ *
+ * 		::
+ *
+ * 			SEC("kprobe/sys_open")
+ * 			void bpf_sys_open(struct pt_regs *ctx)
+ * 			{
+ * 			        char buf[PATHLEN]; // PATHLEN is defined to 256
+ * 			        int res = bpf_probe_read_str(buf, sizeof(buf),
+ * 				                             ctx->di);
+ *
+ * 				// Consume buf, for example push it to
+ * 				// userspace via bpf_perf_event_output(); we
+ * 				// can use res (the string length) as event
+ * 				// size, after checking its boundaries.
+ * 			}
+ *
+ * 		In comparison, using **bpf_probe_read()** helper here instead
+ * 		to read the string would require to estimate the length at
+ * 		compile time, and would often result in copying more memory
+ * 		than necessary.
+ *
+ * 		Another useful use case is when parsing individual process
+ * 		arguments or individual environment variables navigating
+ * 		*current*\ **->mm->arg_start** and *current*\
+ * 		**->mm->env_start**: using this helper and the return value,
+ * 		one can quickly iterate at the right offset of the memory area.
+ * 	Return
+ * 		On success, the strictly positive length of the string,
+ * 		including the trailing NUL character. On error, a negative
+ * 		value.
+ *
+ * u64 bpf_get_socket_cookie(struct sk_buff *skb)
+ * 	Description
+ * 		If the **struct sk_buff** pointed by *skb* has a known socket,
+ * 		retrieve the cookie (generated by the kernel) of this socket.
+ * 		If no cookie has been set yet, generate a new cookie. Once
+ * 		generated, the socket cookie remains stable for the life of the
+ * 		socket. This helper can be useful for monitoring per socket
+ * 		networking traffic statistics as it provides a unique socket
+ * 		identifier per namespace.
+ * 	Return
+ * 		A 8-byte long non-decreasing number on success, or 0 if the
+ * 		socket field is missing inside *skb*.
+ *
+ * u32 bpf_get_socket_uid(struct sk_buff *skb)
+ * 	Return
+ * 		The owner UID of the socket associated to *skb*. If the socket
+ * 		is **NULL**, or if it is not a full socket (i.e. if it is a
+ * 		time-wait or a request socket instead), **overflowuid** value
+ * 		is returned (note that **overflowuid** might also be the actual
+ * 		UID value for the socket).
+ *
  * u32 bpf_set_hash(struct sk_buff *skb, u32 hash)
  * 	Description
  * 		Set the full hash for *skb* (set the field *skb*\ **->hash**)
-- 
cgit v1.2.3-59-g8ed1b


From 7aa79a869dd6d3aa345ea8ffd54085c980f5b1c3 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Wed, 25 Apr 2018 18:16:58 +0100
Subject: bpf: add documentation for eBPF helpers (51-57)

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions:

Helpers from Lawrence:
- bpf_setsockopt()
- bpf_getsockopt()
- bpf_sock_ops_cb_flags_set()

Helpers from Yonghong:
- bpf_perf_event_read_value()
- bpf_perf_prog_read_value()

Helper from Josef:
- bpf_override_return()

Helper from Andrey:
- bpf_bind()

v4:
- bpf_perf_event_read_value(): State that this helper should be
  preferred over bpf_perf_event_read().

v3:
- bpf_perf_event_read_value(): Fix time of selection for perf event type
  in description. Remove occurences of "cores" to avoid confusion with
  "CPU".
- bpf_bind(): Remove last paragraph of description, which was off topic.

Cc: Lawrence Brakmo <brakmo@fb.com>
Cc: Yonghong Song <yhs@fb.com>
Cc: Josef Bacik <jbacik@fb.com>
Cc: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Yonghong Song <yhs@fb.com>
[for bpf_perf_event_read_value(), bpf_perf_prog_read_value()]
Acked-by: Andrey Ignatov <rdna@fb.com>
[for bpf_bind()]
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 180 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 180 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0d4b55e85e07..4aaee0169421 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1361,6 +1361,28 @@ union bpf_attr {
  * 	Return
  * 		0
  *
+ * int bpf_setsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
+ * 	Description
+ * 		Emulate a call to **setsockopt()** on the socket associated to
+ * 		*bpf_socket*, which must be a full socket. The *level* at
+ * 		which the option resides and the name *optname* of the option
+ * 		must be specified, see **setsockopt(2)** for more information.
+ * 		The option value of length *optlen* is pointed by *optval*.
+ *
+ * 		This helper actually implements a subset of **setsockopt()**.
+ * 		It supports the following *level*\ s:
+ *
+ * 		* **SOL_SOCKET**, which supports the following *optname*\ s:
+ * 		  **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**,
+ * 		  **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**.
+ * 		* **IPPROTO_TCP**, which supports the following *optname*\ s:
+ * 		  **TCP_CONGESTION**, **TCP_BPF_IW**,
+ * 		  **TCP_BPF_SNDCWND_CLAMP**.
+ * 		* **IPPROTO_IP**, which supports *optname* **IP_TOS**.
+ * 		* **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags)
  * 	Description
  * 		Grow or shrink the room for data in the packet associated to
@@ -1410,6 +1432,164 @@ union bpf_attr {
  * 		direct packet access.
  * 	Return
  * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size)
+ * 	Description
+ * 		Read the value of a perf event counter, and store it into *buf*
+ * 		of size *buf_size*. This helper relies on a *map* of type
+ * 		**BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event
+ * 		counter is selected when *map* is updated with perf event file
+ * 		descriptors. The *map* is an array whose size is the number of
+ * 		available CPUs, and each cell contains a value relative to one
+ * 		CPU. The value to retrieve is indicated by *flags*, that
+ * 		contains the index of the CPU to look up, masked with
+ * 		**BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
+ * 		**BPF_F_CURRENT_CPU** to indicate that the value for the
+ * 		current CPU should be retrieved.
+ *
+ * 		This helper behaves in a way close to
+ * 		**bpf_perf_event_read**\ () helper, save that instead of
+ * 		just returning the value observed, it fills the *buf*
+ * 		structure. This allows for additional data to be retrieved: in
+ * 		particular, the enabled and running times (in *buf*\
+ * 		**->enabled** and *buf*\ **->running**, respectively) are
+ * 		copied. In general, **bpf_perf_event_read_value**\ () is
+ * 		recommended over **bpf_perf_event_read**\ (), which has some
+ * 		ABI issues and provides fewer functionalities.
+ *
+ * 		These values are interesting, because hardware PMU (Performance
+ * 		Monitoring Unit) counters are limited resources. When there are
+ * 		more PMU based perf events opened than available counters,
+ * 		kernel will multiplex these events so each event gets certain
+ * 		percentage (but not all) of the PMU time. In case that
+ * 		multiplexing happens, the number of samples or counter value
+ * 		will not reflect the case compared to when no multiplexing
+ * 		occurs. This makes comparison between different runs difficult.
+ * 		Typically, the counter value should be normalized before
+ * 		comparing to other experiments. The usual normalization is done
+ * 		as follows.
+ *
+ * 		::
+ *
+ * 			normalized_counter = counter * t_enabled / t_running
+ *
+ * 		Where t_enabled is the time enabled for event and t_running is
+ * 		the time running for event since last normalization. The
+ * 		enabled and running times are accumulated since the perf event
+ * 		open. To achieve scaling factor between two invocations of an
+ * 		eBPF program, users can can use CPU id as the key (which is
+ * 		typical for perf array usage model) to remember the previous
+ * 		value and do the calculation inside the eBPF program.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_perf_prog_read_value(struct bpf_perf_event_data_kern *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
+ * 	Description
+ * 		For en eBPF program attached to a perf event, retrieve the
+ * 		value of the event counter associated to *ctx* and store it in
+ * 		the structure pointed by *buf* and of size *buf_size*. Enabled
+ * 		and running times are also stored in the structure (see
+ * 		description of helper **bpf_perf_event_read_value**\ () for
+ * 		more details).
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_getsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
+ * 	Description
+ * 		Emulate a call to **getsockopt()** on the socket associated to
+ * 		*bpf_socket*, which must be a full socket. The *level* at
+ * 		which the option resides and the name *optname* of the option
+ * 		must be specified, see **getsockopt(2)** for more information.
+ * 		The retrieved value is stored in the structure pointed by
+ * 		*opval* and of length *optlen*.
+ *
+ * 		This helper actually implements a subset of **getsockopt()**.
+ * 		It supports the following *level*\ s:
+ *
+ * 		* **IPPROTO_TCP**, which supports *optname*
+ * 		  **TCP_CONGESTION**.
+ * 		* **IPPROTO_IP**, which supports *optname* **IP_TOS**.
+ * 		* **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_override_return(struct pt_reg *regs, u64 rc)
+ * 	Description
+ * 		Used for error injection, this helper uses kprobes to override
+ * 		the return value of the probed function, and to set it to *rc*.
+ * 		The first argument is the context *regs* on which the kprobe
+ * 		works.
+ *
+ * 		This helper works by setting setting the PC (program counter)
+ * 		to an override function which is run in place of the original
+ * 		probed function. This means the probed function is not run at
+ * 		all. The replacement function just returns with the required
+ * 		value.
+ *
+ * 		This helper has security implications, and thus is subject to
+ * 		restrictions. It is only available if the kernel was compiled
+ * 		with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration
+ * 		option, and in this case it only works on functions tagged with
+ * 		**ALLOW_ERROR_INJECTION** in the kernel code.
+ *
+ * 		Also, the helper is only available for the architectures having
+ * 		the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing,
+ * 		x86 architecture is the only one to support this feature.
+ * 	Return
+ * 		0
+ *
+ * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops_kern *bpf_sock, int argval)
+ * 	Description
+ * 		Attempt to set the value of the **bpf_sock_ops_cb_flags** field
+ * 		for the full TCP socket associated to *bpf_sock_ops* to
+ * 		*argval*.
+ *
+ * 		The primary use of this field is to determine if there should
+ * 		be calls to eBPF programs of type
+ * 		**BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP
+ * 		code. A program of the same type can change its value, per
+ * 		connection and as necessary, when the connection is
+ * 		established. This field is directly accessible for reading, but
+ * 		this helper must be used for updates in order to return an
+ * 		error if an eBPF program tries to set a callback that is not
+ * 		supported in the current kernel.
+ *
+ * 		The supported callback values that *argval* can combine are:
+ *
+ * 		* **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out)
+ * 		* **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
+ * 		* **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
+ *
+ * 		Here are some examples of where one could call such eBPF
+ * 		program:
+ *
+ * 		* When RTO fires.
+ * 		* When a packet is retransmitted.
+ * 		* When the connection terminates.
+ * 		* When a packet is sent.
+ * 		* When a packet is received.
+ * 	Return
+ * 		Code **-EINVAL** if the socket is not a full TCP socket;
+ * 		otherwise, a positive number containing the bits that could not
+ * 		be set is returned (which comes down to 0 if all bits were set
+ * 		as required).
+ *
+ * int bpf_bind(struct bpf_sock_addr_kern *ctx, struct sockaddr *addr, int addr_len)
+ * 	Description
+ * 		Bind the socket associated to *ctx* to the address pointed by
+ * 		*addr*, of length *addr_len*. This allows for making outgoing
+ * 		connection from the desired IP address, which can be useful for
+ * 		example when all processes inside a cgroup should use one
+ * 		single IP address on a host that has multiple IP configured.
+ *
+ * 		This helper works for IPv4 and IPv6, TCP and UDP sockets. The
+ * 		domain (*addr*\ **->sa_family**) must be **AF_INET** (or
+ * 		**AF_INET6**). Looking for a free port to bind to can be
+ * 		expensive, therefore binding to port is not permitted by the
+ * 		helper: *addr*\ **->sin_port** (or **sin6_port**, respectively)
+ * 		must be set to zero.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
-- 
cgit v1.2.3-59-g8ed1b


From ab12704099c2001915843a8b7885567ee6f1800e Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Wed, 25 Apr 2018 18:16:59 +0100
Subject: bpf: add documentation for eBPF helpers (58-64)

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions, all
written by John:

- bpf_redirect_map()
- bpf_sk_redirect_map()
- bpf_sock_map_update()
- bpf_msg_redirect_map()
- bpf_msg_apply_bytes()
- bpf_msg_cork_bytes()
- bpf_msg_pull_data()

v4:
- bpf_redirect_map(): Fix typos: "XDP_ABORT" changed to "XDP_ABORTED",
  "his" to "this". Also add a paragraph on performance improvement over
  bpf_redirect() helper.

v3:
- bpf_sk_redirect_map(): Improve description of BPF_F_INGRESS flag.
- bpf_msg_redirect_map(): Improve description of BPF_F_INGRESS flag.
- bpf_redirect_map(): Fix note on CPU redirection, not fully implemented
  for generic XDP but supported on native XDP.
- bpf_msg_pull_data(): Clarify comment about invalidated verifier
  checks.

Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 147 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 147 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4aaee0169421..27940a36501a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1404,6 +1404,56 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags)
+ * 	Description
+ * 		Redirect the packet to the endpoint referenced by *map* at
+ * 		index *key*. Depending on its type, this *map* can contain
+ * 		references to net devices (for forwarding packets through other
+ * 		ports), or to CPUs (for redirecting XDP frames to another CPU;
+ * 		but this is only implemented for native XDP (with driver
+ * 		support) as of this writing).
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		When used to redirect packets to net devices, this helper
+ * 		provides a high performance increase over **bpf_redirect**\ ().
+ * 		This is due to various implementation details of the underlying
+ * 		mechanisms, one of which is the fact that **bpf_redirect_map**\
+ * 		() tries to send packet as a "bulk" to the device.
+ * 	Return
+ * 		**XDP_REDIRECT** on success, or **XDP_ABORTED** on error.
+ *
+ * int bpf_sk_redirect_map(struct bpf_map *map, u32 key, u64 flags)
+ * 	Description
+ * 		Redirect the packet to the socket referenced by *map* (of type
+ * 		**BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and
+ * 		egress interfaces can be used for redirection. The
+ * 		**BPF_F_INGRESS** value in *flags* is used to make the
+ * 		distinction (ingress path is selected if the flag is present,
+ * 		egress path otherwise). This is the only flag supported for now.
+ * 	Return
+ * 		**SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_sock_map_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
+ * 	Description
+ * 		Add an entry to, or update a *map* referencing sockets. The
+ * 		*skops* is used as a new value for the entry associated to
+ * 		*key*. *flags* is one of:
+ *
+ * 		**BPF_NOEXIST**
+ * 			The entry for *key* must not exist in the map.
+ * 		**BPF_EXIST**
+ * 			The entry for *key* must already exist in the map.
+ * 		**BPF_ANY**
+ * 			No condition on the existence of the entry for *key*.
+ *
+ * 		If the *map* has eBPF programs (parser and verdict), those will
+ * 		be inherited by the socket being added. If the socket is
+ * 		already attached to eBPF programs, this results in an error.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta)
  * 	Description
  * 		Adjust the address pointed by *xdp_md*\ **->data_meta** by
@@ -1574,6 +1624,103 @@ union bpf_attr {
  * 		be set is returned (which comes down to 0 if all bits were set
  * 		as required).
  *
+ * int bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags)
+ * 	Description
+ * 		This helper is used in programs implementing policies at the
+ * 		socket level. If the message *msg* is allowed to pass (i.e. if
+ * 		the verdict eBPF program returns **SK_PASS**), redirect it to
+ * 		the socket referenced by *map* (of type
+ * 		**BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and
+ * 		egress interfaces can be used for redirection. The
+ * 		**BPF_F_INGRESS** value in *flags* is used to make the
+ * 		distinction (ingress path is selected if the flag is present,
+ * 		egress path otherwise). This is the only flag supported for now.
+ * 	Return
+ * 		**SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes)
+ * 	Description
+ * 		For socket policies, apply the verdict of the eBPF program to
+ * 		the next *bytes* (number of bytes) of message *msg*.
+ *
+ * 		For example, this helper can be used in the following cases:
+ *
+ * 		* A single **sendmsg**\ () or **sendfile**\ () system call
+ * 		  contains multiple logical messages that the eBPF program is
+ * 		  supposed to read and for which it should apply a verdict.
+ * 		* An eBPF program only cares to read the first *bytes* of a
+ * 		  *msg*. If the message has a large payload, then setting up
+ * 		  and calling the eBPF program repeatedly for all bytes, even
+ * 		  though the verdict is already known, would create unnecessary
+ * 		  overhead.
+ *
+ * 		When called from within an eBPF program, the helper sets a
+ * 		counter internal to the BPF infrastructure, that is used to
+ * 		apply the last verdict to the next *bytes*. If *bytes* is
+ * 		smaller than the current data being processed from a
+ * 		**sendmsg**\ () or **sendfile**\ () system call, the first
+ * 		*bytes* will be sent and the eBPF program will be re-run with
+ * 		the pointer for start of data pointing to byte number *bytes*
+ * 		**+ 1**. If *bytes* is larger than the current data being
+ * 		processed, then the eBPF verdict will be applied to multiple
+ * 		**sendmsg**\ () or **sendfile**\ () calls until *bytes* are
+ * 		consumed.
+ *
+ * 		Note that if a socket closes with the internal counter holding
+ * 		a non-zero value, this is not a problem because data is not
+ * 		being buffered for *bytes* and is sent as it is received.
+ * 	Return
+ * 		0
+ *
+ * int bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes)
+ * 	Description
+ * 		For socket policies, prevent the execution of the verdict eBPF
+ * 		program for message *msg* until *bytes* (byte number) have been
+ * 		accumulated.
+ *
+ * 		This can be used when one needs a specific number of bytes
+ * 		before a verdict can be assigned, even if the data spans
+ * 		multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme
+ * 		case would be a user calling **sendmsg**\ () repeatedly with
+ * 		1-byte long message segments. Obviously, this is bad for
+ * 		performance, but it is still valid. If the eBPF program needs
+ * 		*bytes* bytes to validate a header, this helper can be used to
+ * 		prevent the eBPF program to be called again until *bytes* have
+ * 		been accumulated.
+ * 	Return
+ * 		0
+ *
+ * int bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags)
+ * 	Description
+ * 		For socket policies, pull in non-linear data from user space
+ * 		for *msg* and set pointers *msg*\ **->data** and *msg*\
+ * 		**->data_end** to *start* and *end* bytes offsets into *msg*,
+ * 		respectively.
+ *
+ * 		If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
+ * 		*msg* it can only parse data that the (**data**, **data_end**)
+ * 		pointers have already consumed. For **sendmsg**\ () hooks this
+ * 		is likely the first scatterlist element. But for calls relying
+ * 		on the **sendpage** handler (e.g. **sendfile**\ ()) this will
+ * 		be the range (**0**, **0**) because the data is shared with
+ * 		user space and by default the objective is to avoid allowing
+ * 		user space to modify data while (or after) eBPF verdict is
+ * 		being decided. This helper can be used to pull in data and to
+ * 		set the start and end pointer to given values. Data will be
+ * 		copied if necessary (i.e. if data was not linear and if start
+ * 		and end pointers do not point to the same chunk).
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  * int bpf_bind(struct bpf_sock_addr_kern *ctx, struct sockaddr *addr, int addr_len)
  * 	Description
  * 		Bind the socket associated to *ctx* to the address pointed by
-- 
cgit v1.2.3-59-g8ed1b


From 2d020dd771762d96d158a9deed41bc6b7480a8fe Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Wed, 25 Apr 2018 18:17:00 +0100
Subject: bpf: add documentation for eBPF helpers (65-66)

Add documentation for eBPF helper functions to bpf.h user header file.
This documentation can be parsed with the Python script provided in
another commit of the patch series, in order to provide a RST document
that can later be converted into a man page.

The objective is to make the documentation easily understandable and
accessible to all eBPF developers, including beginners.

This patch contains descriptions for the following helper functions:

Helper from Nikita:
- bpf_xdp_adjust_tail()

Helper from Eyal:
- bpf_skb_get_xfrm_state()

v4:
- New patch (helpers did not exist yet for previous versions).

Cc: Nikita V. Shirokov <tehnerd@tehnerd.com>
Cc: Eyal Birger <eyal.birger@gmail.com>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 27940a36501a..da77a9388947 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1737,6 +1737,36 @@ union bpf_attr {
  * 		must be set to zero.
  * 	Return
  * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta)
+ * 	Description
+ * 		Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is
+ * 		only possible to shrink the packet as of this writing,
+ * 		therefore *delta* must be a negative integer.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags)
+ * 	Description
+ * 		Retrieve the XFRM state (IP transform framework, see also
+ * 		**ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*.
+ *
+ * 		The retrieved value is stored in the **struct bpf_xfrm_state**
+ * 		pointed by *xfrm_state* and of length *size*.
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		This helper is available only if the kernel was compiled with
+ * 		**CONFIG_XFRM** configuration option.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
-- 
cgit v1.2.3-59-g8ed1b


From 9cde0c8892b9bba07b27f2317d8d02707dc6ff92 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Wed, 25 Apr 2018 18:17:01 +0100
Subject: bpf: update bpf.h uapi header for tools

Update tools/include/uapi/linux/bpf.h file in order to reflect the
changes for BPF helper functions documentation introduced in previous
commits.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/include/uapi/linux/bpf.h | 1776 +++++++++++++++++++++++++++++++---------
 1 file changed, 1380 insertions(+), 396 deletions(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index da8801860c7d..da77a9388947 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -377,412 +377,1396 @@ union bpf_attr {
 	};
 } __attribute__((aligned(8)));
 
-/* BPF helper function descriptions:
- *
- * void *bpf_map_lookup_elem(&map, &key)
- *     Return: Map value or NULL
- *
- * int bpf_map_update_elem(&map, &key, &value, flags)
- *     Return: 0 on success or negative error
- *
- * int bpf_map_delete_elem(&map, &key)
- *     Return: 0 on success or negative error
- *
- * int bpf_probe_read(void *dst, int size, void *src)
- *     Return: 0 on success or negative error
+/* The description below is an attempt at providing documentation to eBPF
+ * developers about the multiple available eBPF helper functions. It can be
+ * parsed and used to produce a manual page. The workflow is the following,
+ * and requires the rst2man utility:
+ *
+ *     $ ./scripts/bpf_helpers_doc.py \
+ *             --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst
+ *     $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7
+ *     $ man /tmp/bpf-helpers.7
+ *
+ * Note that in order to produce this external documentation, some RST
+ * formatting is used in the descriptions to get "bold" and "italics" in
+ * manual pages. Also note that the few trailing white spaces are
+ * intentional, removing them would break paragraphs for rst2man.
+ *
+ * Start of BPF helper function descriptions:
+ *
+ * void *bpf_map_lookup_elem(struct bpf_map *map, const void *key)
+ * 	Description
+ * 		Perform a lookup in *map* for an entry associated to *key*.
+ * 	Return
+ * 		Map value associated to *key*, or **NULL** if no entry was
+ * 		found.
+ *
+ * int bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags)
+ * 	Description
+ * 		Add or update the value of the entry associated to *key* in
+ * 		*map* with *value*. *flags* is one of:
+ *
+ * 		**BPF_NOEXIST**
+ * 			The entry for *key* must not exist in the map.
+ * 		**BPF_EXIST**
+ * 			The entry for *key* must already exist in the map.
+ * 		**BPF_ANY**
+ * 			No condition on the existence of the entry for *key*.
+ *
+ * 		Flag value **BPF_NOEXIST** cannot be used for maps of types
+ * 		**BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY**  (all
+ * 		elements always exist), the helper would return an error.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_delete_elem(struct bpf_map *map, const void *key)
+ * 	Description
+ * 		Delete entry with *key* from *map*.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_probe_read(void *dst, u32 size, const void *src)
+ * 	Description
+ * 		For tracing programs, safely attempt to read *size* bytes from
+ * 		address *src* and store the data in *dst*.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
  *
  * u64 bpf_ktime_get_ns(void)
- *     Return: current ktime
- *
- * int bpf_trace_printk(const char *fmt, int fmt_size, ...)
- *     Return: length of buffer written or negative error
- *
- * u32 bpf_prandom_u32(void)
- *     Return: random value
- *
- * u32 bpf_raw_smp_processor_id(void)
- *     Return: SMP processor ID
- *
- * int bpf_skb_store_bytes(skb, offset, from, len, flags)
- *     store bytes into packet
- *     @skb: pointer to skb
- *     @offset: offset within packet from skb->mac_header
- *     @from: pointer where to copy bytes from
- *     @len: number of bytes to store into packet
- *     @flags: bit 0 - if true, recompute skb->csum
- *             other bits - reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_l3_csum_replace(skb, offset, from, to, flags)
- *     recompute IP checksum
- *     @skb: pointer to skb
- *     @offset: offset within packet where IP checksum is located
- *     @from: old value of header field
- *     @to: new value of header field
- *     @flags: bits 0-3 - size of header field
- *             other bits - reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_l4_csum_replace(skb, offset, from, to, flags)
- *     recompute TCP/UDP checksum
- *     @skb: pointer to skb
- *     @offset: offset within packet where TCP/UDP checksum is located
- *     @from: old value of header field
- *     @to: new value of header field
- *     @flags: bits 0-3 - size of header field
- *             bit 4 - is pseudo header
- *             other bits - reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_tail_call(ctx, prog_array_map, index)
- *     jump into another BPF program
- *     @ctx: context pointer passed to next program
- *     @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY
- *     @index: 32-bit index inside array that selects specific program to run
- *     Return: 0 on success or negative error
- *
- * int bpf_clone_redirect(skb, ifindex, flags)
- *     redirect to another netdev
- *     @skb: pointer to skb
- *     @ifindex: ifindex of the net device
- *     @flags: bit 0 - if set, redirect to ingress instead of egress
- *             other bits - reserved
- *     Return: 0 on success or negative error
+ * 	Description
+ * 		Return the time elapsed since system boot, in nanoseconds.
+ * 	Return
+ * 		Current *ktime*.
+ *
+ * int bpf_trace_printk(const char *fmt, u32 fmt_size, ...)
+ * 	Description
+ * 		This helper is a "printk()-like" facility for debugging. It
+ * 		prints a message defined by format *fmt* (of size *fmt_size*)
+ * 		to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if
+ * 		available. It can take up to three additional **u64**
+ * 		arguments (as an eBPF helpers, the total number of arguments is
+ * 		limited to five).
+ *
+ * 		Each time the helper is called, it appends a line to the trace.
+ * 		The format of the trace is customizable, and the exact output
+ * 		one will get depends on the options set in
+ * 		*\/sys/kernel/debug/tracing/trace_options* (see also the
+ * 		*README* file under the same directory). However, it usually
+ * 		defaults to something like:
+ *
+ * 		::
+ *
+ * 			telnet-470   [001] .N.. 419421.045894: 0x00000001: <formatted msg>
+ *
+ * 		In the above:
+ *
+ * 			* ``telnet`` is the name of the current task.
+ * 			* ``470`` is the PID of the current task.
+ * 			* ``001`` is the CPU number on which the task is
+ * 			  running.
+ * 			* In ``.N..``, each character refers to a set of
+ * 			  options (whether irqs are enabled, scheduling
+ * 			  options, whether hard/softirqs are running, level of
+ * 			  preempt_disabled respectively). **N** means that
+ * 			  **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED**
+ * 			  are set.
+ * 			* ``419421.045894`` is a timestamp.
+ * 			* ``0x00000001`` is a fake value used by BPF for the
+ * 			  instruction pointer register.
+ * 			* ``<formatted msg>`` is the message formatted with
+ * 			  *fmt*.
+ *
+ * 		The conversion specifiers supported by *fmt* are similar, but
+ * 		more limited than for printk(). They are **%d**, **%i**,
+ * 		**%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**,
+ * 		**%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size
+ * 		of field, padding with zeroes, etc.) is available, and the
+ * 		helper will return **-EINVAL** (but print nothing) if it
+ * 		encounters an unknown specifier.
+ *
+ * 		Also, note that **bpf_trace_printk**\ () is slow, and should
+ * 		only be used for debugging purposes. For this reason, a notice
+ * 		bloc (spanning several lines) is printed to kernel logs and
+ * 		states that the helper should not be used "for production use"
+ * 		the first time this helper is used (or more precisely, when
+ * 		**trace_printk**\ () buffers are allocated). For passing values
+ * 		to user space, perf events should be preferred.
+ * 	Return
+ * 		The number of bytes written to the buffer, or a negative error
+ * 		in case of failure.
+ *
+ * u32 bpf_get_prandom_u32(void)
+ * 	Description
+ * 		Get a pseudo-random number.
+ *
+ * 		From a security point of view, this helper uses its own
+ * 		pseudo-random internal state, and cannot be used to infer the
+ * 		seed of other random functions in the kernel. However, it is
+ * 		essential to note that the generator used by the helper is not
+ * 		cryptographically secure.
+ * 	Return
+ * 		A random 32-bit unsigned value.
+ *
+ * u32 bpf_get_smp_processor_id(void)
+ * 	Description
+ * 		Get the SMP (symmetric multiprocessing) processor id. Note that
+ * 		all programs run with preemption disabled, which means that the
+ * 		SMP processor id is stable during all the execution of the
+ * 		program.
+ * 	Return
+ * 		The SMP id of the processor running the program.
+ *
+ * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags)
+ * 	Description
+ * 		Store *len* bytes from address *from* into the packet
+ * 		associated to *skb*, at *offset*. *flags* are a combination of
+ * 		**BPF_F_RECOMPUTE_CSUM** (automatically recompute the
+ * 		checksum for the packet after storing the bytes) and
+ * 		**BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\
+ * 		**->swhash** and *skb*\ **->l4hash** to 0).
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size)
+ * 	Description
+ * 		Recompute the layer 3 (e.g. IP) checksum for the packet
+ * 		associated to *skb*. Computation is incremental, so the helper
+ * 		must know the former value of the header field that was
+ * 		modified (*from*), the new value of this field (*to*), and the
+ * 		number of bytes (2 or 4) for this field, stored in *size*.
+ * 		Alternatively, it is possible to store the difference between
+ * 		the previous and the new values of the header field in *to*, by
+ * 		setting *from* and *size* to 0. For both methods, *offset*
+ * 		indicates the location of the IP checksum within the packet.
+ *
+ * 		This helper works in combination with **bpf_csum_diff**\ (),
+ * 		which does not update the checksum in-place, but offers more
+ * 		flexibility and can handle sizes larger than 2 or 4 for the
+ * 		checksum to update.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags)
+ * 	Description
+ * 		Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the
+ * 		packet associated to *skb*. Computation is incremental, so the
+ * 		helper must know the former value of the header field that was
+ * 		modified (*from*), the new value of this field (*to*), and the
+ * 		number of bytes (2 or 4) for this field, stored on the lowest
+ * 		four bits of *flags*. Alternatively, it is possible to store
+ * 		the difference between the previous and the new values of the
+ * 		header field in *to*, by setting *from* and the four lowest
+ * 		bits of *flags* to 0. For both methods, *offset* indicates the
+ * 		location of the IP checksum within the packet. In addition to
+ * 		the size of the field, *flags* can be added (bitwise OR) actual
+ * 		flags. With **BPF_F_MARK_MANGLED_0**, a null checksum is left
+ * 		untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and
+ * 		for updates resulting in a null checksum the value is set to
+ * 		**CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates
+ * 		the checksum is to be computed against a pseudo-header.
+ *
+ * 		This helper works in combination with **bpf_csum_diff**\ (),
+ * 		which does not update the checksum in-place, but offers more
+ * 		flexibility and can handle sizes larger than 2 or 4 for the
+ * 		checksum to update.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index)
+ * 	Description
+ * 		This special helper is used to trigger a "tail call", or in
+ * 		other words, to jump into another eBPF program. The same stack
+ * 		frame is used (but values on stack and in registers for the
+ * 		caller are not accessible to the callee). This mechanism allows
+ * 		for program chaining, either for raising the maximum number of
+ * 		available eBPF instructions, or to execute given programs in
+ * 		conditional blocks. For security reasons, there is an upper
+ * 		limit to the number of successive tail calls that can be
+ * 		performed.
+ *
+ * 		Upon call of this helper, the program attempts to jump into a
+ * 		program referenced at index *index* in *prog_array_map*, a
+ * 		special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes
+ * 		*ctx*, a pointer to the context.
+ *
+ * 		If the call succeeds, the kernel immediately runs the first
+ * 		instruction of the new program. This is not a function call,
+ * 		and it never returns to the previous program. If the call
+ * 		fails, then the helper has no effect, and the caller continues
+ * 		to run its subsequent instructions. A call can fail if the
+ * 		destination program for the jump does not exist (i.e. *index*
+ * 		is superior to the number of entries in *prog_array_map*), or
+ * 		if the maximum number of tail calls has been reached for this
+ * 		chain of programs. This limit is defined in the kernel by the
+ * 		macro **MAX_TAIL_CALL_CNT** (not accessible to user space),
+ * 		which is currently set to 32.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags)
+ * 	Description
+ * 		Clone and redirect the packet associated to *skb* to another
+ * 		net device of index *ifindex*. Both ingress and egress
+ * 		interfaces can be used for redirection. The **BPF_F_INGRESS**
+ * 		value in *flags* is used to make the distinction (ingress path
+ * 		is selected if the flag is present, egress path otherwise).
+ * 		This is the only flag supported for now.
+ *
+ * 		In comparison with **bpf_redirect**\ () helper,
+ * 		**bpf_clone_redirect**\ () has the associated cost of
+ * 		duplicating the packet buffer, but this can be executed out of
+ * 		the eBPF program. Conversely, **bpf_redirect**\ () is more
+ * 		efficient, but it is handled through an action code where the
+ * 		redirection happens only after the eBPF program has returned.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
  *
  * u64 bpf_get_current_pid_tgid(void)
- *     Return: current->tgid << 32 | current->pid
+ * 	Return
+ * 		A 64-bit integer containing the current tgid and pid, and
+ * 		created as such:
+ * 		*current_task*\ **->tgid << 32 \|**
+ * 		*current_task*\ **->pid**.
  *
  * u64 bpf_get_current_uid_gid(void)
- *     Return: current_gid << 32 | current_uid
- *
- * int bpf_get_current_comm(char *buf, int size_of_buf)
- *     stores current->comm into buf
- *     Return: 0 on success or negative error
- *
- * u32 bpf_get_cgroup_classid(skb)
- *     retrieve a proc's classid
- *     @skb: pointer to skb
- *     Return: classid if != 0
- *
- * int bpf_skb_vlan_push(skb, vlan_proto, vlan_tci)
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_vlan_pop(skb)
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_get_tunnel_key(skb, key, size, flags)
- * int bpf_skb_set_tunnel_key(skb, key, size, flags)
- *     retrieve or populate tunnel metadata
- *     @skb: pointer to skb
- *     @key: pointer to 'struct bpf_tunnel_key'
- *     @size: size of 'struct bpf_tunnel_key'
- *     @flags: room for future extensions
- *     Return: 0 on success or negative error
- *
- * u64 bpf_perf_event_read(map, flags)
- *     read perf event counter value
- *     @map: pointer to perf_event_array map
- *     @flags: index of event in the map or bitmask flags
- *     Return: value of perf event counter read or error code
- *
- * int bpf_redirect(ifindex, flags)
- *     redirect to another netdev
- *     @ifindex: ifindex of the net device
- *     @flags:
- *	  cls_bpf:
- *          bit 0 - if set, redirect to ingress instead of egress
- *          other bits - reserved
- *	  xdp_bpf:
- *	    all bits - reserved
- *     Return: cls_bpf: TC_ACT_REDIRECT on success or TC_ACT_SHOT on error
- *	       xdp_bfp: XDP_REDIRECT on success or XDP_ABORT on error
- * int bpf_redirect_map(map, key, flags)
- *     redirect to endpoint in map
- *     @map: pointer to dev map
- *     @key: index in map to lookup
- *     @flags: --
- *     Return: XDP_REDIRECT on success or XDP_ABORT on error
- *
- * u32 bpf_get_route_realm(skb)
- *     retrieve a dst's tclassid
- *     @skb: pointer to skb
- *     Return: realm if != 0
- *
- * int bpf_perf_event_output(ctx, map, flags, data, size)
- *     output perf raw sample
- *     @ctx: struct pt_regs*
- *     @map: pointer to perf_event_array map
- *     @flags: index of event in the map or bitmask flags
- *     @data: data on stack to be output as raw data
- *     @size: size of data
- *     Return: 0 on success or negative error
- *
- * int bpf_get_stackid(ctx, map, flags)
- *     walk user or kernel stack and return id
- *     @ctx: struct pt_regs*
- *     @map: pointer to stack_trace map
- *     @flags: bits 0-7 - numer of stack frames to skip
- *             bit 8 - collect user stack instead of kernel
- *             bit 9 - compare stacks by hash only
- *             bit 10 - if two different stacks hash into the same stackid
- *                      discard old
- *             other bits - reserved
- *     Return: >= 0 stackid on success or negative error
- *
- * s64 bpf_csum_diff(from, from_size, to, to_size, seed)
- *     calculate csum diff
- *     @from: raw from buffer
- *     @from_size: length of from buffer
- *     @to: raw to buffer
- *     @to_size: length of to buffer
- *     @seed: optional seed
- *     Return: csum result or negative error code
- *
- * int bpf_skb_get_tunnel_opt(skb, opt, size)
- *     retrieve tunnel options metadata
- *     @skb: pointer to skb
- *     @opt: pointer to raw tunnel option data
- *     @size: size of @opt
- *     Return: option size
- *
- * int bpf_skb_set_tunnel_opt(skb, opt, size)
- *     populate tunnel options metadata
- *     @skb: pointer to skb
- *     @opt: pointer to raw tunnel option data
- *     @size: size of @opt
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_change_proto(skb, proto, flags)
- *     Change protocol of the skb. Currently supported is v4 -> v6,
- *     v6 -> v4 transitions. The helper will also resize the skb. eBPF
- *     program is expected to fill the new headers via skb_store_bytes
- *     and lX_csum_replace.
- *     @skb: pointer to skb
- *     @proto: new skb->protocol type
- *     @flags: reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_change_type(skb, type)
- *     Change packet type of skb.
- *     @skb: pointer to skb
- *     @type: new skb->pkt_type type
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_under_cgroup(skb, map, index)
- *     Check cgroup2 membership of skb
- *     @skb: pointer to skb
- *     @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
- *     @index: index of the cgroup in the bpf_map
- *     Return:
- *       == 0 skb failed the cgroup2 descendant test
- *       == 1 skb succeeded the cgroup2 descendant test
- *        < 0 error
- *
- * u32 bpf_get_hash_recalc(skb)
- *     Retrieve and possibly recalculate skb->hash.
- *     @skb: pointer to skb
- *     Return: hash
+ * 	Return
+ * 		A 64-bit integer containing the current GID and UID, and
+ * 		created as such: *current_gid* **<< 32 \|** *current_uid*.
+ *
+ * int bpf_get_current_comm(char *buf, u32 size_of_buf)
+ * 	Description
+ * 		Copy the **comm** attribute of the current task into *buf* of
+ * 		*size_of_buf*. The **comm** attribute contains the name of
+ * 		the executable (excluding the path) for the current task. The
+ * 		*size_of_buf* must be strictly positive. On success, the
+ * 		helper makes sure that the *buf* is NUL-terminated. On failure,
+ * 		it is filled with zeroes.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * u32 bpf_get_cgroup_classid(struct sk_buff *skb)
+ * 	Description
+ * 		Retrieve the classid for the current task, i.e. for the net_cls
+ * 		cgroup to which *skb* belongs.
+ *
+ * 		This helper can be used on TC egress path, but not on ingress.
+ *
+ * 		The net_cls cgroup provides an interface to tag network packets
+ * 		based on a user-provided identifier for all traffic coming from
+ * 		the tasks belonging to the related cgroup. See also the related
+ * 		kernel documentation, available from the Linux sources in file
+ * 		*Documentation/cgroup-v1/net_cls.txt*.
+ *
+ * 		The Linux kernel has two versions for cgroups: there are
+ * 		cgroups v1 and cgroups v2. Both are available to users, who can
+ * 		use a mixture of them, but note that the net_cls cgroup is for
+ * 		cgroup v1 only. This makes it incompatible with BPF programs
+ * 		run on cgroups, which is a cgroup-v2-only feature (a socket can
+ * 		only hold data for one version of cgroups at a time).
+ *
+ * 		This helper is only available is the kernel was compiled with
+ * 		the **CONFIG_CGROUP_NET_CLASSID** configuration option set to
+ * 		"**y**" or to "**m**".
+ * 	Return
+ * 		The classid, or 0 for the default unconfigured classid.
+ *
+ * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
+ * 	Description
+ * 		Push a *vlan_tci* (VLAN tag control information) of protocol
+ * 		*vlan_proto* to the packet associated to *skb*, then update
+ * 		the checksum. Note that if *vlan_proto* is different from
+ * 		**ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to
+ * 		be **ETH_P_8021Q**.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_vlan_pop(struct sk_buff *skb)
+ * 	Description
+ * 		Pop a VLAN header from the packet associated to *skb*.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
+ * 	Description
+ * 		Get tunnel metadata. This helper takes a pointer *key* to an
+ * 		empty **struct bpf_tunnel_key** of **size**, that will be
+ * 		filled with tunnel metadata for the packet associated to *skb*.
+ * 		The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which
+ * 		indicates that the tunnel is based on IPv6 protocol instead of
+ * 		IPv4.
+ *
+ * 		The **struct bpf_tunnel_key** is an object that generalizes the
+ * 		principal parameters used by various tunneling protocols into a
+ * 		single struct. This way, it can be used to easily make a
+ * 		decision based on the contents of the encapsulation header,
+ * 		"summarized" in this struct. In particular, it holds the IP
+ * 		address of the remote end (IPv4 or IPv6, depending on the case)
+ * 		in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also,
+ * 		this struct exposes the *key*\ **->tunnel_id**, which is
+ * 		generally mapped to a VNI (Virtual Network Identifier), making
+ * 		it programmable together with the **bpf_skb_set_tunnel_key**\
+ * 		() helper.
+ *
+ * 		Let's imagine that the following code is part of a program
+ * 		attached to the TC ingress interface, on one end of a GRE
+ * 		tunnel, and is supposed to filter out all messages coming from
+ * 		remote ends with IPv4 address other than 10.0.0.1:
+ *
+ * 		::
+ *
+ * 			int ret;
+ * 			struct bpf_tunnel_key key = {};
+ * 			
+ * 			ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
+ * 			if (ret < 0)
+ * 				return TC_ACT_SHOT;	// drop packet
+ * 			
+ * 			if (key.remote_ipv4 != 0x0a000001)
+ * 				return TC_ACT_SHOT;	// drop packet
+ * 			
+ * 			return TC_ACT_OK;		// accept packet
+ *
+ * 		This interface can also be used with all encapsulation devices
+ * 		that can operate in "collect metadata" mode: instead of having
+ * 		one network device per specific configuration, the "collect
+ * 		metadata" mode only requires a single device where the
+ * 		configuration can be extracted from this helper.
+ *
+ * 		This can be used together with various tunnels such as VXLan,
+ * 		Geneve, GRE or IP in IP (IPIP).
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
+ * 	Description
+ * 		Populate tunnel metadata for packet associated to *skb.* The
+ * 		tunnel metadata is set to the contents of *key*, of *size*. The
+ * 		*flags* can be set to a combination of the following values:
+ *
+ * 		**BPF_F_TUNINFO_IPV6**
+ * 			Indicate that the tunnel is based on IPv6 protocol
+ * 			instead of IPv4.
+ * 		**BPF_F_ZERO_CSUM_TX**
+ * 			For IPv4 packets, add a flag to tunnel metadata
+ * 			indicating that checksum computation should be skipped
+ * 			and checksum set to zeroes.
+ * 		**BPF_F_DONT_FRAGMENT**
+ * 			Add a flag to tunnel metadata indicating that the
+ * 			packet should not be fragmented.
+ * 		**BPF_F_SEQ_NUMBER**
+ * 			Add a flag to tunnel metadata indicating that a
+ * 			sequence number should be added to tunnel header before
+ * 			sending the packet. This flag was added for GRE
+ * 			encapsulation, but might be used with other protocols
+ * 			as well in the future.
+ *
+ * 		Here is a typical usage on the transmit path:
+ *
+ * 		::
+ *
+ * 			struct bpf_tunnel_key key;
+ * 			     populate key ...
+ * 			bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0);
+ * 			bpf_clone_redirect(skb, vxlan_dev_ifindex, 0);
+ *
+ * 		See also the description of the **bpf_skb_get_tunnel_key**\ ()
+ * 		helper for additional information.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_perf_event_read(struct bpf_map *map, u64 flags)
+ * 	Description
+ * 		Read the value of a perf event counter. This helper relies on a
+ * 		*map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of
+ * 		the perf event counter is selected when *map* is updated with
+ * 		perf event file descriptors. The *map* is an array whose size
+ * 		is the number of available CPUs, and each cell contains a value
+ * 		relative to one CPU. The value to retrieve is indicated by
+ * 		*flags*, that contains the index of the CPU to look up, masked
+ * 		with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
+ * 		**BPF_F_CURRENT_CPU** to indicate that the value for the
+ * 		current CPU should be retrieved.
+ *
+ * 		Note that before Linux 4.13, only hardware perf event can be
+ * 		retrieved.
+ *
+ * 		Also, be aware that the newer helper
+ * 		**bpf_perf_event_read_value**\ () is recommended over
+ * 		**bpf_perf_event_read*\ () in general. The latter has some ABI
+ * 		quirks where error and counter value are used as a return code
+ * 		(which is wrong to do since ranges may overlap). This issue is
+ * 		fixed with bpf_perf_event_read_value(), which at the same time
+ * 		provides more features over the **bpf_perf_event_read**\ ()
+ * 		interface. Please refer to the description of
+ * 		**bpf_perf_event_read_value**\ () for details.
+ * 	Return
+ * 		The value of the perf event counter read from the map, or a
+ * 		negative error code in case of failure.
+ *
+ * int bpf_redirect(u32 ifindex, u64 flags)
+ * 	Description
+ * 		Redirect the packet to another net device of index *ifindex*.
+ * 		This helper is somewhat similar to **bpf_clone_redirect**\
+ * 		(), except that the packet is not cloned, which provides
+ * 		increased performance.
+ *
+ * 		Except for XDP, both ingress and egress interfaces can be used
+ * 		for redirection. The **BPF_F_INGRESS** value in *flags* is used
+ * 		to make the distinction (ingress path is selected if the flag
+ * 		is present, egress path otherwise). Currently, XDP only
+ * 		supports redirection to the egress interface, and accepts no
+ * 		flag at all.
+ *
+ * 		The same effect can be attained with the more generic
+ * 		**bpf_redirect_map**\ (), which requires specific maps to be
+ * 		used but offers better performance.
+ * 	Return
+ * 		For XDP, the helper returns **XDP_REDIRECT** on success or
+ * 		**XDP_ABORTED** on error. For other program types, the values
+ * 		are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on
+ * 		error.
+ *
+ * u32 bpf_get_route_realm(struct sk_buff *skb)
+ * 	Description
+ * 		Retrieve the realm or the route, that is to say the
+ * 		**tclassid** field of the destination for the *skb*. The
+ * 		indentifier retrieved is a user-provided tag, similar to the
+ * 		one used with the net_cls cgroup (see description for
+ * 		**bpf_get_cgroup_classid**\ () helper), but here this tag is
+ * 		held by a route (a destination entry), not by a task.
+ *
+ * 		Retrieving this identifier works with the clsact TC egress hook
+ * 		(see also **tc-bpf(8)**), or alternatively on conventional
+ * 		classful egress qdiscs, but not on TC ingress path. In case of
+ * 		clsact TC egress hook, this has the advantage that, internally,
+ * 		the destination entry has not been dropped yet in the transmit
+ * 		path. Therefore, the destination entry does not need to be
+ * 		artificially held via **netif_keep_dst**\ () for a classful
+ * 		qdisc until the *skb* is freed.
+ *
+ * 		This helper is available only if the kernel was compiled with
+ * 		**CONFIG_IP_ROUTE_CLASSID** configuration option.
+ * 	Return
+ * 		The realm of the route for the packet associated to *skb*, or 0
+ * 		if none was found.
+ *
+ * int bpf_perf_event_output(struct pt_reg *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
+ * 	Description
+ * 		Write raw *data* blob into a special BPF perf event held by
+ * 		*map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
+ * 		event must have the following attributes: **PERF_SAMPLE_RAW**
+ * 		as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
+ * 		**PERF_COUNT_SW_BPF_OUTPUT** as **config**.
+ *
+ * 		The *flags* are used to indicate the index in *map* for which
+ * 		the value must be put, masked with **BPF_F_INDEX_MASK**.
+ * 		Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
+ * 		to indicate that the index of the current CPU core should be
+ * 		used.
+ *
+ * 		The value to write, of *size*, is passed through eBPF stack and
+ * 		pointed by *data*.
+ *
+ * 		The context of the program *ctx* needs also be passed to the
+ * 		helper.
+ *
+ * 		On user space, a program willing to read the values needs to
+ * 		call **perf_event_open**\ () on the perf event (either for
+ * 		one or for all CPUs) and to store the file descriptor into the
+ * 		*map*. This must be done before the eBPF program can send data
+ * 		into it. An example is available in file
+ * 		*samples/bpf/trace_output_user.c* in the Linux kernel source
+ * 		tree (the eBPF program counterpart is in
+ * 		*samples/bpf/trace_output_kern.c*).
+ *
+ * 		**bpf_perf_event_output**\ () achieves better performance
+ * 		than **bpf_trace_printk**\ () for sharing data with user
+ * 		space, and is much better suitable for streaming data from eBPF
+ * 		programs.
+ *
+ * 		Note that this helper is not restricted to tracing use cases
+ * 		and can be used with programs attached to TC or XDP as well,
+ * 		where it allows for passing data to user space listeners. Data
+ * 		can be:
+ *
+ * 		* Only custom structs,
+ * 		* Only the packet payload, or
+ * 		* A combination of both.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
+ * 	Description
+ * 		This helper was provided as an easy way to load data from a
+ * 		packet. It can be used to load *len* bytes from *offset* from
+ * 		the packet associated to *skb*, into the buffer pointed by
+ * 		*to*.
+ *
+ * 		Since Linux 4.7, usage of this helper has mostly been replaced
+ * 		by "direct packet access", enabling packet data to be
+ * 		manipulated with *skb*\ **->data** and *skb*\ **->data_end**
+ * 		pointing respectively to the first byte of packet data and to
+ * 		the byte after the last byte of packet data. However, it
+ * 		remains useful if one wishes to read large quantities of data
+ * 		at once from a packet into the eBPF stack.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_get_stackid(struct pt_reg *ctx, struct bpf_map *map, u64 flags)
+ * 	Description
+ * 		Walk a user or a kernel stack and return its id. To achieve
+ * 		this, the helper needs *ctx*, which is a pointer to the context
+ * 		on which the tracing program is executed, and a pointer to a
+ * 		*map* of type **BPF_MAP_TYPE_STACK_TRACE**.
+ *
+ * 		The last argument, *flags*, holds the number of stack frames to
+ * 		skip (from 0 to 255), masked with
+ * 		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ * 		a combination of the following flags:
+ *
+ * 		**BPF_F_USER_STACK**
+ * 			Collect a user space stack instead of a kernel stack.
+ * 		**BPF_F_FAST_STACK_CMP**
+ * 			Compare stacks by hash only.
+ * 		**BPF_F_REUSE_STACKID**
+ * 			If two different stacks hash into the same *stackid*,
+ * 			discard the old one.
+ *
+ * 		The stack id retrieved is a 32 bit long integer handle which
+ * 		can be further combined with other data (including other stack
+ * 		ids) and used as a key into maps. This can be useful for
+ * 		generating a variety of graphs (such as flame graphs or off-cpu
+ * 		graphs).
+ *
+ * 		For walking a stack, this helper is an improvement over
+ * 		**bpf_probe_read**\ (), which can be used with unrolled loops
+ * 		but is not efficient and consumes a lot of eBPF instructions.
+ * 		Instead, **bpf_get_stackid**\ () can collect up to
+ * 		**PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that
+ * 		this limit can be controlled with the **sysctl** program, and
+ * 		that it should be manually increased in order to profile long
+ * 		user stacks (such as stacks for Java programs). To do so, use:
+ *
+ * 		::
+ *
+ * 			# sysctl kernel.perf_event_max_stack=<new value>
+ *
+ * 	Return
+ * 		The positive or null stack id on success, or a negative error
+ * 		in case of failure.
+ *
+ * s64 bpf_csum_diff(__be32 *from, u32 from_size, __be32 *to, u32 to_size, __wsum seed)
+ * 	Description
+ * 		Compute a checksum difference, from the raw buffer pointed by
+ * 		*from*, of length *from_size* (that must be a multiple of 4),
+ * 		towards the raw buffer pointed by *to*, of size *to_size*
+ * 		(same remark). An optional *seed* can be added to the value
+ * 		(this can be cascaded, the seed may come from a previous call
+ * 		to the helper).
+ *
+ * 		This is flexible enough to be used in several ways:
+ *
+ * 		* With *from_size* == 0, *to_size* > 0 and *seed* set to
+ * 		  checksum, it can be used when pushing new data.
+ * 		* With *from_size* > 0, *to_size* == 0 and *seed* set to
+ * 		  checksum, it can be used when removing data from a packet.
+ * 		* With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it
+ * 		  can be used to compute a diff. Note that *from_size* and
+ * 		  *to_size* do not need to be equal.
+ *
+ * 		This helper can be used in combination with
+ * 		**bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to
+ * 		which one can feed in the difference computed with
+ * 		**bpf_csum_diff**\ ().
+ * 	Return
+ * 		The checksum result, or a negative error code in case of
+ * 		failure.
+ *
+ * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
+ * 	Description
+ * 		Retrieve tunnel options metadata for the packet associated to
+ * 		*skb*, and store the raw tunnel option data to the buffer *opt*
+ * 		of *size*.
+ *
+ * 		This helper can be used with encapsulation devices that can
+ * 		operate in "collect metadata" mode (please refer to the related
+ * 		note in the description of **bpf_skb_get_tunnel_key**\ () for
+ * 		more details). A particular example where this can be used is
+ * 		in combination with the Geneve encapsulation protocol, where it
+ * 		allows for pushing (with **bpf_skb_get_tunnel_opt**\ () helper)
+ * 		and retrieving arbitrary TLVs (Type-Length-Value headers) from
+ * 		the eBPF program. This allows for full customization of these
+ * 		headers.
+ * 	Return
+ * 		The size of the option data retrieved.
+ *
+ * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
+ * 	Description
+ * 		Set tunnel options metadata for the packet associated to *skb*
+ * 		to the option data contained in the raw buffer *opt* of *size*.
+ *
+ * 		See also the description of the **bpf_skb_get_tunnel_opt**\ ()
+ * 		helper for additional information.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags)
+ * 	Description
+ * 		Change the protocol of the *skb* to *proto*. Currently
+ * 		supported are transition from IPv4 to IPv6, and from IPv6 to
+ * 		IPv4. The helper takes care of the groundwork for the
+ * 		transition, including resizing the socket buffer. The eBPF
+ * 		program is expected to fill the new headers, if any, via
+ * 		**skb_store_bytes**\ () and to recompute the checksums with
+ * 		**bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\
+ * 		(). The main case for this helper is to perform NAT64
+ * 		operations out of an eBPF program.
+ *
+ * 		Internally, the GSO type is marked as dodgy so that headers are
+ * 		checked and segments are recalculated by the GSO/GRO engine.
+ * 		The size for GSO target is adapted as well.
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_change_type(struct sk_buff *skb, u32 type)
+ * 	Description
+ * 		Change the packet type for the packet associated to *skb*. This
+ * 		comes down to setting *skb*\ **->pkt_type** to *type*, except
+ * 		the eBPF program does not have a write access to *skb*\
+ * 		**->pkt_type** beside this helper. Using a helper here allows
+ * 		for graceful handling of errors.
+ *
+ * 		The major use case is to change incoming *skb*s to
+ * 		**PACKET_HOST** in a programmatic way instead of having to
+ * 		recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for
+ * 		example.
+ *
+ * 		Note that *type* only allows certain values. At this time, they
+ * 		are:
+ *
+ * 		**PACKET_HOST**
+ * 			Packet is for us.
+ * 		**PACKET_BROADCAST**
+ * 			Send packet to all.
+ * 		**PACKET_MULTICAST**
+ * 			Send packet to group.
+ * 		**PACKET_OTHERHOST**
+ * 			Send packet to someone else.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index)
+ * 	Description
+ * 		Check whether *skb* is a descendant of the cgroup2 held by
+ * 		*map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
+ * 	Return
+ * 		The return value depends on the result of the test, and can be:
+ *
+ * 		* 0, if the *skb* failed the cgroup2 descendant test.
+ * 		* 1, if the *skb* succeeded the cgroup2 descendant test.
+ * 		* A negative error code, if an error occurred.
+ *
+ * u32 bpf_get_hash_recalc(struct sk_buff *skb)
+ * 	Description
+ * 		Retrieve the hash of the packet, *skb*\ **->hash**. If it is
+ * 		not set, in particular if the hash was cleared due to mangling,
+ * 		recompute this hash. Later accesses to the hash can be done
+ * 		directly with *skb*\ **->hash**.
+ *
+ * 		Calling **bpf_set_hash_invalid**\ (), changing a packet
+ * 		prototype with **bpf_skb_change_proto**\ (), or calling
+ * 		**bpf_skb_store_bytes**\ () with the
+ * 		**BPF_F_INVALIDATE_HASH** are actions susceptible to clear
+ * 		the hash and to trigger a new computation for the next call to
+ * 		**bpf_get_hash_recalc**\ ().
+ * 	Return
+ * 		The 32-bit hash.
  *
  * u64 bpf_get_current_task(void)
- *     Returns current task_struct
- *     Return: current
- *
- * int bpf_probe_write_user(void *dst, void *src, int len)
- *     safely attempt to write to a location
- *     @dst: destination address in userspace
- *     @src: source address on stack
- *     @len: number of bytes to copy
- *     Return: 0 on success or negative error
- *
- * int bpf_current_task_under_cgroup(map, index)
- *     Check cgroup2 membership of current task
- *     @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
- *     @index: index of the cgroup in the bpf_map
- *     Return:
- *       == 0 current failed the cgroup2 descendant test
- *       == 1 current succeeded the cgroup2 descendant test
- *        < 0 error
- *
- * int bpf_skb_change_tail(skb, len, flags)
- *     The helper will resize the skb to the given new size, to be used f.e.
- *     with control messages.
- *     @skb: pointer to skb
- *     @len: new skb length
- *     @flags: reserved
- *     Return: 0 on success or negative error
- *
- * int bpf_skb_pull_data(skb, len)
- *     The helper will pull in non-linear data in case the skb is non-linear
- *     and not all of len are part of the linear section. Only needed for
- *     read/write with direct packet access.
- *     @skb: pointer to skb
- *     @len: len to make read/writeable
- *     Return: 0 on success or negative error
- *
- * s64 bpf_csum_update(skb, csum)
- *     Adds csum into skb->csum in case of CHECKSUM_COMPLETE.
- *     @skb: pointer to skb
- *     @csum: csum to add
- *     Return: csum on success or negative error
- *
- * void bpf_set_hash_invalid(skb)
- *     Invalidate current skb->hash.
- *     @skb: pointer to skb
- *
- * int bpf_get_numa_node_id()
- *     Return: Id of current NUMA node.
- *
- * int bpf_skb_change_head()
- *     Grows headroom of skb and adjusts MAC header offset accordingly.
- *     Will extends/reallocae as required automatically.
- *     May change skb data pointer and will thus invalidate any check
- *     performed for direct packet access.
- *     @skb: pointer to skb
- *     @len: length of header to be pushed in front
- *     @flags: Flags (unused for now)
- *     Return: 0 on success or negative error
- *
- * int bpf_xdp_adjust_head(xdp_md, delta)
- *     Adjust the xdp_md.data by delta
- *     @xdp_md: pointer to xdp_md
- *     @delta: An positive/negative integer to be added to xdp_md.data
- *     Return: 0 on success or negative on error
+ * 	Return
+ * 		A pointer to the current task struct.
+ *
+ * int bpf_probe_write_user(void *dst, const void *src, u32 len)
+ * 	Description
+ * 		Attempt in a safe way to write *len* bytes from the buffer
+ * 		*src* to *dst* in memory. It only works for threads that are in
+ * 		user context, and *dst* must be a valid user space address.
+ *
+ * 		This helper should not be used to implement any kind of
+ * 		security mechanism because of TOC-TOU attacks, but rather to
+ * 		debug, divert, and manipulate execution of semi-cooperative
+ * 		processes.
+ *
+ * 		Keep in mind that this feature is meant for experiments, and it
+ * 		has a risk of crashing the system and running programs.
+ * 		Therefore, when an eBPF program using this helper is attached,
+ * 		a warning including PID and process name is printed to kernel
+ * 		logs.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_current_task_under_cgroup(struct bpf_map *map, u32 index)
+ * 	Description
+ * 		Check whether the probe is being run is the context of a given
+ * 		subset of the cgroup2 hierarchy. The cgroup2 to test is held by
+ * 		*map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
+ * 	Return
+ * 		The return value depends on the result of the test, and can be:
+ *
+ * 		* 0, if the *skb* task belongs to the cgroup2.
+ * 		* 1, if the *skb* task does not belong to the cgroup2.
+ * 		* A negative error code, if an error occurred.
+ *
+ * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags)
+ * 	Description
+ * 		Resize (trim or grow) the packet associated to *skb* to the
+ * 		new *len*. The *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		The basic idea is that the helper performs the needed work to
+ * 		change the size of the packet, then the eBPF program rewrites
+ * 		the rest via helpers like **bpf_skb_store_bytes**\ (),
+ * 		**bpf_l3_csum_replace**\ (), **bpf_l3_csum_replace**\ ()
+ * 		and others. This helper is a slow path utility intended for
+ * 		replies with control messages. And because it is targeted for
+ * 		slow path, the helper itself can afford to be slow: it
+ * 		implicitly linearizes, unclones and drops offloads from the
+ * 		*skb*.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_pull_data(struct sk_buff *skb, u32 len)
+ * 	Description
+ * 		Pull in non-linear data in case the *skb* is non-linear and not
+ * 		all of *len* are part of the linear section. Make *len* bytes
+ * 		from *skb* readable and writable. If a zero value is passed for
+ * 		*len*, then the whole length of the *skb* is pulled.
+ *
+ * 		This helper is only needed for reading and writing with direct
+ * 		packet access.
+ *
+ * 		For direct packet access, testing that offsets to access
+ * 		are within packet boundaries (test on *skb*\ **->data_end**) is
+ * 		susceptible to fail if offsets are invalid, or if the requested
+ * 		data is in non-linear parts of the *skb*. On failure the
+ * 		program can just bail out, or in the case of a non-linear
+ * 		buffer, use a helper to make the data available. The
+ * 		**bpf_skb_load_bytes**\ () helper is a first solution to access
+ * 		the data. Another one consists in using **bpf_skb_pull_data**
+ * 		to pull in once the non-linear parts, then retesting and
+ * 		eventually access the data.
+ *
+ * 		At the same time, this also makes sure the *skb* is uncloned,
+ * 		which is a necessary condition for direct write. As this needs
+ * 		to be an invariant for the write part only, the verifier
+ * 		detects writes and adds a prologue that is calling
+ * 		**bpf_skb_pull_data()** to effectively unclone the *skb* from
+ * 		the very beginning in case it is indeed cloned.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * s64 bpf_csum_update(struct sk_buff *skb, __wsum csum)
+ * 	Description
+ * 		Add the checksum *csum* into *skb*\ **->csum** in case the
+ * 		driver has supplied a checksum for the entire packet into that
+ * 		field. Return an error otherwise. This helper is intended to be
+ * 		used in combination with **bpf_csum_diff**\ (), in particular
+ * 		when the checksum needs to be updated after data has been
+ * 		written into the packet through direct packet access.
+ * 	Return
+ * 		The checksum on success, or a negative error code in case of
+ * 		failure.
+ *
+ * void bpf_set_hash_invalid(struct sk_buff *skb)
+ * 	Description
+ * 		Invalidate the current *skb*\ **->hash**. It can be used after
+ * 		mangling on headers through direct packet access, in order to
+ * 		indicate that the hash is outdated and to trigger a
+ * 		recalculation the next time the kernel tries to access this
+ * 		hash or when the **bpf_get_hash_recalc**\ () helper is called.
+ *
+ * int bpf_get_numa_node_id(void)
+ * 	Description
+ * 		Return the id of the current NUMA node. The primary use case
+ * 		for this helper is the selection of sockets for the local NUMA
+ * 		node, when the program is attached to sockets using the
+ * 		**SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**),
+ * 		but the helper is also available to other eBPF program types,
+ * 		similarly to **bpf_get_smp_processor_id**\ ().
+ * 	Return
+ * 		The id of current NUMA node.
+ *
+ * int bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags)
+ * 	Description
+ * 		Grows headroom of packet associated to *skb* and adjusts the
+ * 		offset of the MAC header accordingly, adding *len* bytes of
+ * 		space. It automatically extends and reallocates memory as
+ * 		required.
+ *
+ * 		This helper can be used on a layer 3 *skb* to push a MAC header
+ * 		for redirection into a layer 2 device.
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta)
+ * 	Description
+ * 		Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that
+ * 		it is possible to use a negative value for *delta*. This helper
+ * 		can be used to prepare the packet for pushing or popping
+ * 		headers.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
  *
  * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr)
- *     Copy a NUL terminated string from unsafe address. In case the string
- *     length is smaller than size, the target is not padded with further NUL
- *     bytes. In case the string length is larger than size, just count-1
- *     bytes are copied and the last byte is set to NUL.
- *     @dst: destination address
- *     @size: maximum number of bytes to copy, including the trailing NUL
- *     @unsafe_ptr: unsafe address
- *     Return:
- *       > 0 length of the string including the trailing NUL on success
- *       < 0 error
- *
- * u64 bpf_get_socket_cookie(skb)
- *     Get the cookie for the socket stored inside sk_buff.
- *     @skb: pointer to skb
- *     Return: 8 Bytes non-decreasing number on success or 0 if the socket
- *     field is missing inside sk_buff
- *
- * u32 bpf_get_socket_uid(skb)
- *     Get the owner uid of the socket stored inside sk_buff.
- *     @skb: pointer to skb
- *     Return: uid of the socket owner on success or overflowuid if failed.
- *
- * u32 bpf_set_hash(skb, hash)
- *     Set full skb->hash.
- *     @skb: pointer to skb
- *     @hash: hash to set
- *
- * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen)
- *     Calls setsockopt. Not all opts are available, only those with
- *     integer optvals plus TCP_CONGESTION.
- *     Supported levels: SOL_SOCKET and IPPROTO_TCP
- *     @bpf_socket: pointer to bpf_socket
- *     @level: SOL_SOCKET or IPPROTO_TCP
- *     @optname: option name
- *     @optval: pointer to option value
- *     @optlen: length of optval in bytes
- *     Return: 0 or negative error
- *
- * int bpf_getsockopt(bpf_socket, level, optname, optval, optlen)
- *     Calls getsockopt. Not all opts are available.
- *     Supported levels: IPPROTO_TCP
- *     @bpf_socket: pointer to bpf_socket
- *     @level: IPPROTO_TCP
- *     @optname: option name
- *     @optval: pointer to option value
- *     @optlen: length of optval in bytes
- *     Return: 0 or negative error
- *
- * int bpf_sock_ops_cb_flags_set(bpf_sock_ops, flags)
- *     Set callback flags for sock_ops
- *     @bpf_sock_ops: pointer to bpf_sock_ops_kern struct
- *     @flags: flags value
- *     Return: 0 for no error
- *             -EINVAL if there is no full tcp socket
- *             bits in flags that are not supported by current kernel
- *
- * int bpf_skb_adjust_room(skb, len_diff, mode, flags)
- *     Grow or shrink room in sk_buff.
- *     @skb: pointer to skb
- *     @len_diff: (signed) amount of room to grow/shrink
- *     @mode: operation mode (enum bpf_adj_room_mode)
- *     @flags: reserved for future use
- *     Return: 0 on success or negative error code
- *
- * int bpf_sk_redirect_map(map, key, flags)
- *     Redirect skb to a sock in map using key as a lookup key for the
- *     sock in map.
- *     @map: pointer to sockmap
- *     @key: key to lookup sock in map
- *     @flags: reserved for future use
- *     Return: SK_PASS
- *
- * int bpf_sock_map_update(skops, map, key, flags)
- *	@skops: pointer to bpf_sock_ops
- *	@map: pointer to sockmap to update
- *	@key: key to insert/update sock in map
- *	@flags: same flags as map update elem
- *
- * int bpf_xdp_adjust_meta(xdp_md, delta)
- *     Adjust the xdp_md.data_meta by delta
- *     @xdp_md: pointer to xdp_md
- *     @delta: An positive/negative integer to be added to xdp_md.data_meta
- *     Return: 0 on success or negative on error
- *
- * int bpf_perf_event_read_value(map, flags, buf, buf_size)
- *     read perf event counter value and perf event enabled/running time
- *     @map: pointer to perf_event_array map
- *     @flags: index of event in the map or bitmask flags
- *     @buf: buf to fill
- *     @buf_size: size of the buf
- *     Return: 0 on success or negative error code
- *
- * int bpf_perf_prog_read_value(ctx, buf, buf_size)
- *     read perf prog attached perf event counter and enabled/running time
- *     @ctx: pointer to ctx
- *     @buf: buf to fill
- *     @buf_size: size of the buf
- *     Return : 0 on success or negative error code
- *
- * int bpf_override_return(pt_regs, rc)
- *	@pt_regs: pointer to struct pt_regs
- *	@rc: the return value to set
- *
- * int bpf_msg_redirect_map(map, key, flags)
- *     Redirect msg to a sock in map using key as a lookup key for the
- *     sock in map.
- *     @map: pointer to sockmap
- *     @key: key to lookup sock in map
- *     @flags: reserved for future use
- *     Return: SK_PASS
- *
- * int bpf_bind(ctx, addr, addr_len)
- *     Bind socket to address. Only binding to IP is supported, no port can be
- *     set in addr.
- *     @ctx: pointer to context of type bpf_sock_addr
- *     @addr: pointer to struct sockaddr to bind socket to
- *     @addr_len: length of sockaddr structure
- *     Return: 0 on success or negative error code
- *
- * int bpf_xdp_adjust_tail(xdp_md, delta)
- *     Adjust the xdp_md.data_end by delta. Only shrinking of packet's
- *     size is supported.
- *     @xdp_md: pointer to xdp_md
- *     @delta: A negative integer to be added to xdp_md.data_end
- *     Return: 0 on success or negative on error
- *
- * int bpf_skb_get_xfrm_state(skb, index, xfrm_state, size, flags)
- *     retrieve XFRM state
- *     @skb: pointer to skb
- *     @index: index of the xfrm state in the secpath
- *     @key: pointer to 'struct bpf_xfrm_state'
- *     @size: size of 'struct bpf_xfrm_state'
- *     @flags: room for future extensions
- *     Return: 0 on success or negative error
+ * 	Description
+ * 		Copy a NUL terminated string from an unsafe address
+ * 		*unsafe_ptr* to *dst*. The *size* should include the
+ * 		terminating NUL byte. In case the string length is smaller than
+ * 		*size*, the target is not padded with further NUL bytes. If the
+ * 		string length is larger than *size*, just *size*-1 bytes are
+ * 		copied and the last byte is set to NUL.
+ *
+ * 		On success, the length of the copied string is returned. This
+ * 		makes this helper useful in tracing programs for reading
+ * 		strings, and more importantly to get its length at runtime. See
+ * 		the following snippet:
+ *
+ * 		::
+ *
+ * 			SEC("kprobe/sys_open")
+ * 			void bpf_sys_open(struct pt_regs *ctx)
+ * 			{
+ * 			        char buf[PATHLEN]; // PATHLEN is defined to 256
+ * 			        int res = bpf_probe_read_str(buf, sizeof(buf),
+ * 				                             ctx->di);
+ *
+ * 				// Consume buf, for example push it to
+ * 				// userspace via bpf_perf_event_output(); we
+ * 				// can use res (the string length) as event
+ * 				// size, after checking its boundaries.
+ * 			}
+ *
+ * 		In comparison, using **bpf_probe_read()** helper here instead
+ * 		to read the string would require to estimate the length at
+ * 		compile time, and would often result in copying more memory
+ * 		than necessary.
+ *
+ * 		Another useful use case is when parsing individual process
+ * 		arguments or individual environment variables navigating
+ * 		*current*\ **->mm->arg_start** and *current*\
+ * 		**->mm->env_start**: using this helper and the return value,
+ * 		one can quickly iterate at the right offset of the memory area.
+ * 	Return
+ * 		On success, the strictly positive length of the string,
+ * 		including the trailing NUL character. On error, a negative
+ * 		value.
+ *
+ * u64 bpf_get_socket_cookie(struct sk_buff *skb)
+ * 	Description
+ * 		If the **struct sk_buff** pointed by *skb* has a known socket,
+ * 		retrieve the cookie (generated by the kernel) of this socket.
+ * 		If no cookie has been set yet, generate a new cookie. Once
+ * 		generated, the socket cookie remains stable for the life of the
+ * 		socket. This helper can be useful for monitoring per socket
+ * 		networking traffic statistics as it provides a unique socket
+ * 		identifier per namespace.
+ * 	Return
+ * 		A 8-byte long non-decreasing number on success, or 0 if the
+ * 		socket field is missing inside *skb*.
+ *
+ * u32 bpf_get_socket_uid(struct sk_buff *skb)
+ * 	Return
+ * 		The owner UID of the socket associated to *skb*. If the socket
+ * 		is **NULL**, or if it is not a full socket (i.e. if it is a
+ * 		time-wait or a request socket instead), **overflowuid** value
+ * 		is returned (note that **overflowuid** might also be the actual
+ * 		UID value for the socket).
+ *
+ * u32 bpf_set_hash(struct sk_buff *skb, u32 hash)
+ * 	Description
+ * 		Set the full hash for *skb* (set the field *skb*\ **->hash**)
+ * 		to value *hash*.
+ * 	Return
+ * 		0
+ *
+ * int bpf_setsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
+ * 	Description
+ * 		Emulate a call to **setsockopt()** on the socket associated to
+ * 		*bpf_socket*, which must be a full socket. The *level* at
+ * 		which the option resides and the name *optname* of the option
+ * 		must be specified, see **setsockopt(2)** for more information.
+ * 		The option value of length *optlen* is pointed by *optval*.
+ *
+ * 		This helper actually implements a subset of **setsockopt()**.
+ * 		It supports the following *level*\ s:
+ *
+ * 		* **SOL_SOCKET**, which supports the following *optname*\ s:
+ * 		  **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**,
+ * 		  **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**.
+ * 		* **IPPROTO_TCP**, which supports the following *optname*\ s:
+ * 		  **TCP_CONGESTION**, **TCP_BPF_IW**,
+ * 		  **TCP_BPF_SNDCWND_CLAMP**.
+ * 		* **IPPROTO_IP**, which supports *optname* **IP_TOS**.
+ * 		* **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags)
+ * 	Description
+ * 		Grow or shrink the room for data in the packet associated to
+ * 		*skb* by *len_diff*, and according to the selected *mode*.
+ *
+ * 		There is a single supported mode at this time:
+ *
+ * 		* **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
+ * 		  (room space is added or removed below the layer 3 header).
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags)
+ * 	Description
+ * 		Redirect the packet to the endpoint referenced by *map* at
+ * 		index *key*. Depending on its type, this *map* can contain
+ * 		references to net devices (for forwarding packets through other
+ * 		ports), or to CPUs (for redirecting XDP frames to another CPU;
+ * 		but this is only implemented for native XDP (with driver
+ * 		support) as of this writing).
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		When used to redirect packets to net devices, this helper
+ * 		provides a high performance increase over **bpf_redirect**\ ().
+ * 		This is due to various implementation details of the underlying
+ * 		mechanisms, one of which is the fact that **bpf_redirect_map**\
+ * 		() tries to send packet as a "bulk" to the device.
+ * 	Return
+ * 		**XDP_REDIRECT** on success, or **XDP_ABORTED** on error.
+ *
+ * int bpf_sk_redirect_map(struct bpf_map *map, u32 key, u64 flags)
+ * 	Description
+ * 		Redirect the packet to the socket referenced by *map* (of type
+ * 		**BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and
+ * 		egress interfaces can be used for redirection. The
+ * 		**BPF_F_INGRESS** value in *flags* is used to make the
+ * 		distinction (ingress path is selected if the flag is present,
+ * 		egress path otherwise). This is the only flag supported for now.
+ * 	Return
+ * 		**SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_sock_map_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
+ * 	Description
+ * 		Add an entry to, or update a *map* referencing sockets. The
+ * 		*skops* is used as a new value for the entry associated to
+ * 		*key*. *flags* is one of:
+ *
+ * 		**BPF_NOEXIST**
+ * 			The entry for *key* must not exist in the map.
+ * 		**BPF_EXIST**
+ * 			The entry for *key* must already exist in the map.
+ * 		**BPF_ANY**
+ * 			No condition on the existence of the entry for *key*.
+ *
+ * 		If the *map* has eBPF programs (parser and verdict), those will
+ * 		be inherited by the socket being added. If the socket is
+ * 		already attached to eBPF programs, this results in an error.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta)
+ * 	Description
+ * 		Adjust the address pointed by *xdp_md*\ **->data_meta** by
+ * 		*delta* (which can be positive or negative). Note that this
+ * 		operation modifies the address stored in *xdp_md*\ **->data**,
+ * 		so the latter must be loaded only after the helper has been
+ * 		called.
+ *
+ * 		The use of *xdp_md*\ **->data_meta** is optional and programs
+ * 		are not required to use it. The rationale is that when the
+ * 		packet is processed with XDP (e.g. as DoS filter), it is
+ * 		possible to push further meta data along with it before passing
+ * 		to the stack, and to give the guarantee that an ingress eBPF
+ * 		program attached as a TC classifier on the same device can pick
+ * 		this up for further post-processing. Since TC works with socket
+ * 		buffers, it remains possible to set from XDP the **mark** or
+ * 		**priority** pointers, or other pointers for the socket buffer.
+ * 		Having this scratch space generic and programmable allows for
+ * 		more flexibility as the user is free to store whatever meta
+ * 		data they need.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size)
+ * 	Description
+ * 		Read the value of a perf event counter, and store it into *buf*
+ * 		of size *buf_size*. This helper relies on a *map* of type
+ * 		**BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event
+ * 		counter is selected when *map* is updated with perf event file
+ * 		descriptors. The *map* is an array whose size is the number of
+ * 		available CPUs, and each cell contains a value relative to one
+ * 		CPU. The value to retrieve is indicated by *flags*, that
+ * 		contains the index of the CPU to look up, masked with
+ * 		**BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
+ * 		**BPF_F_CURRENT_CPU** to indicate that the value for the
+ * 		current CPU should be retrieved.
+ *
+ * 		This helper behaves in a way close to
+ * 		**bpf_perf_event_read**\ () helper, save that instead of
+ * 		just returning the value observed, it fills the *buf*
+ * 		structure. This allows for additional data to be retrieved: in
+ * 		particular, the enabled and running times (in *buf*\
+ * 		**->enabled** and *buf*\ **->running**, respectively) are
+ * 		copied. In general, **bpf_perf_event_read_value**\ () is
+ * 		recommended over **bpf_perf_event_read**\ (), which has some
+ * 		ABI issues and provides fewer functionalities.
+ *
+ * 		These values are interesting, because hardware PMU (Performance
+ * 		Monitoring Unit) counters are limited resources. When there are
+ * 		more PMU based perf events opened than available counters,
+ * 		kernel will multiplex these events so each event gets certain
+ * 		percentage (but not all) of the PMU time. In case that
+ * 		multiplexing happens, the number of samples or counter value
+ * 		will not reflect the case compared to when no multiplexing
+ * 		occurs. This makes comparison between different runs difficult.
+ * 		Typically, the counter value should be normalized before
+ * 		comparing to other experiments. The usual normalization is done
+ * 		as follows.
+ *
+ * 		::
+ *
+ * 			normalized_counter = counter * t_enabled / t_running
+ *
+ * 		Where t_enabled is the time enabled for event and t_running is
+ * 		the time running for event since last normalization. The
+ * 		enabled and running times are accumulated since the perf event
+ * 		open. To achieve scaling factor between two invocations of an
+ * 		eBPF program, users can can use CPU id as the key (which is
+ * 		typical for perf array usage model) to remember the previous
+ * 		value and do the calculation inside the eBPF program.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_perf_prog_read_value(struct bpf_perf_event_data_kern *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
+ * 	Description
+ * 		For en eBPF program attached to a perf event, retrieve the
+ * 		value of the event counter associated to *ctx* and store it in
+ * 		the structure pointed by *buf* and of size *buf_size*. Enabled
+ * 		and running times are also stored in the structure (see
+ * 		description of helper **bpf_perf_event_read_value**\ () for
+ * 		more details).
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_getsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
+ * 	Description
+ * 		Emulate a call to **getsockopt()** on the socket associated to
+ * 		*bpf_socket*, which must be a full socket. The *level* at
+ * 		which the option resides and the name *optname* of the option
+ * 		must be specified, see **getsockopt(2)** for more information.
+ * 		The retrieved value is stored in the structure pointed by
+ * 		*opval* and of length *optlen*.
+ *
+ * 		This helper actually implements a subset of **getsockopt()**.
+ * 		It supports the following *level*\ s:
+ *
+ * 		* **IPPROTO_TCP**, which supports *optname*
+ * 		  **TCP_CONGESTION**.
+ * 		* **IPPROTO_IP**, which supports *optname* **IP_TOS**.
+ * 		* **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_override_return(struct pt_reg *regs, u64 rc)
+ * 	Description
+ * 		Used for error injection, this helper uses kprobes to override
+ * 		the return value of the probed function, and to set it to *rc*.
+ * 		The first argument is the context *regs* on which the kprobe
+ * 		works.
+ *
+ * 		This helper works by setting setting the PC (program counter)
+ * 		to an override function which is run in place of the original
+ * 		probed function. This means the probed function is not run at
+ * 		all. The replacement function just returns with the required
+ * 		value.
+ *
+ * 		This helper has security implications, and thus is subject to
+ * 		restrictions. It is only available if the kernel was compiled
+ * 		with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration
+ * 		option, and in this case it only works on functions tagged with
+ * 		**ALLOW_ERROR_INJECTION** in the kernel code.
+ *
+ * 		Also, the helper is only available for the architectures having
+ * 		the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing,
+ * 		x86 architecture is the only one to support this feature.
+ * 	Return
+ * 		0
+ *
+ * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops_kern *bpf_sock, int argval)
+ * 	Description
+ * 		Attempt to set the value of the **bpf_sock_ops_cb_flags** field
+ * 		for the full TCP socket associated to *bpf_sock_ops* to
+ * 		*argval*.
+ *
+ * 		The primary use of this field is to determine if there should
+ * 		be calls to eBPF programs of type
+ * 		**BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP
+ * 		code. A program of the same type can change its value, per
+ * 		connection and as necessary, when the connection is
+ * 		established. This field is directly accessible for reading, but
+ * 		this helper must be used for updates in order to return an
+ * 		error if an eBPF program tries to set a callback that is not
+ * 		supported in the current kernel.
+ *
+ * 		The supported callback values that *argval* can combine are:
+ *
+ * 		* **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out)
+ * 		* **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
+ * 		* **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
+ *
+ * 		Here are some examples of where one could call such eBPF
+ * 		program:
+ *
+ * 		* When RTO fires.
+ * 		* When a packet is retransmitted.
+ * 		* When the connection terminates.
+ * 		* When a packet is sent.
+ * 		* When a packet is received.
+ * 	Return
+ * 		Code **-EINVAL** if the socket is not a full TCP socket;
+ * 		otherwise, a positive number containing the bits that could not
+ * 		be set is returned (which comes down to 0 if all bits were set
+ * 		as required).
+ *
+ * int bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags)
+ * 	Description
+ * 		This helper is used in programs implementing policies at the
+ * 		socket level. If the message *msg* is allowed to pass (i.e. if
+ * 		the verdict eBPF program returns **SK_PASS**), redirect it to
+ * 		the socket referenced by *map* (of type
+ * 		**BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and
+ * 		egress interfaces can be used for redirection. The
+ * 		**BPF_F_INGRESS** value in *flags* is used to make the
+ * 		distinction (ingress path is selected if the flag is present,
+ * 		egress path otherwise). This is the only flag supported for now.
+ * 	Return
+ * 		**SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes)
+ * 	Description
+ * 		For socket policies, apply the verdict of the eBPF program to
+ * 		the next *bytes* (number of bytes) of message *msg*.
+ *
+ * 		For example, this helper can be used in the following cases:
+ *
+ * 		* A single **sendmsg**\ () or **sendfile**\ () system call
+ * 		  contains multiple logical messages that the eBPF program is
+ * 		  supposed to read and for which it should apply a verdict.
+ * 		* An eBPF program only cares to read the first *bytes* of a
+ * 		  *msg*. If the message has a large payload, then setting up
+ * 		  and calling the eBPF program repeatedly for all bytes, even
+ * 		  though the verdict is already known, would create unnecessary
+ * 		  overhead.
+ *
+ * 		When called from within an eBPF program, the helper sets a
+ * 		counter internal to the BPF infrastructure, that is used to
+ * 		apply the last verdict to the next *bytes*. If *bytes* is
+ * 		smaller than the current data being processed from a
+ * 		**sendmsg**\ () or **sendfile**\ () system call, the first
+ * 		*bytes* will be sent and the eBPF program will be re-run with
+ * 		the pointer for start of data pointing to byte number *bytes*
+ * 		**+ 1**. If *bytes* is larger than the current data being
+ * 		processed, then the eBPF verdict will be applied to multiple
+ * 		**sendmsg**\ () or **sendfile**\ () calls until *bytes* are
+ * 		consumed.
+ *
+ * 		Note that if a socket closes with the internal counter holding
+ * 		a non-zero value, this is not a problem because data is not
+ * 		being buffered for *bytes* and is sent as it is received.
+ * 	Return
+ * 		0
+ *
+ * int bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes)
+ * 	Description
+ * 		For socket policies, prevent the execution of the verdict eBPF
+ * 		program for message *msg* until *bytes* (byte number) have been
+ * 		accumulated.
+ *
+ * 		This can be used when one needs a specific number of bytes
+ * 		before a verdict can be assigned, even if the data spans
+ * 		multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme
+ * 		case would be a user calling **sendmsg**\ () repeatedly with
+ * 		1-byte long message segments. Obviously, this is bad for
+ * 		performance, but it is still valid. If the eBPF program needs
+ * 		*bytes* bytes to validate a header, this helper can be used to
+ * 		prevent the eBPF program to be called again until *bytes* have
+ * 		been accumulated.
+ * 	Return
+ * 		0
+ *
+ * int bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags)
+ * 	Description
+ * 		For socket policies, pull in non-linear data from user space
+ * 		for *msg* and set pointers *msg*\ **->data** and *msg*\
+ * 		**->data_end** to *start* and *end* bytes offsets into *msg*,
+ * 		respectively.
+ *
+ * 		If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
+ * 		*msg* it can only parse data that the (**data**, **data_end**)
+ * 		pointers have already consumed. For **sendmsg**\ () hooks this
+ * 		is likely the first scatterlist element. But for calls relying
+ * 		on the **sendpage** handler (e.g. **sendfile**\ ()) this will
+ * 		be the range (**0**, **0**) because the data is shared with
+ * 		user space and by default the objective is to avoid allowing
+ * 		user space to modify data while (or after) eBPF verdict is
+ * 		being decided. This helper can be used to pull in data and to
+ * 		set the start and end pointer to given values. Data will be
+ * 		copied if necessary (i.e. if data was not linear and if start
+ * 		and end pointers do not point to the same chunk).
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_bind(struct bpf_sock_addr_kern *ctx, struct sockaddr *addr, int addr_len)
+ * 	Description
+ * 		Bind the socket associated to *ctx* to the address pointed by
+ * 		*addr*, of length *addr_len*. This allows for making outgoing
+ * 		connection from the desired IP address, which can be useful for
+ * 		example when all processes inside a cgroup should use one
+ * 		single IP address on a host that has multiple IP configured.
+ *
+ * 		This helper works for IPv4 and IPv6, TCP and UDP sockets. The
+ * 		domain (*addr*\ **->sa_family**) must be **AF_INET** (or
+ * 		**AF_INET6**). Looking for a free port to bind to can be
+ * 		expensive, therefore binding to port is not permitted by the
+ * 		helper: *addr*\ **->sin_port** (or **sin6_port**, respectively)
+ * 		must be set to zero.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta)
+ * 	Description
+ * 		Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is
+ * 		only possible to shrink the packet as of this writing,
+ * 		therefore *delta* must be a negative integer.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags)
+ * 	Description
+ * 		Retrieve the XFRM state (IP transform framework, see also
+ * 		**ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*.
+ *
+ * 		The retrieved value is stored in the **struct bpf_xfrm_state**
+ * 		pointed by *xfrm_state* and of length *size*.
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		This helper is available only if the kernel was compiled with
+ * 		**CONFIG_XFRM** configuration option.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
-- 
cgit v1.2.3-59-g8ed1b


From c0885f61bbb6a89c35397d3a8fe49c35822cde81 Mon Sep 17 00:00:00 2001
From: Wang Sheng-Hui <shhuiw@foxmail.com>
Date: Wed, 25 Apr 2018 10:07:13 +0800
Subject: samples, bpf: remove redundant ret assignment in bpf_load_program()

2 redundant ret assignments removed:

* 'ret = 1' before the logic 'if (data_maps)', and if any errors jump to
  label 'done'. No 'ret = 1' needed before the error jump.

* After the '/* load programs */' part, if everything goes well, then
  the BPF code will be loaded and 'ret' set to 0 by load_and_attach().
  If something goes wrong, 'ret' set to none-O, the redundant 'ret = 0'
  after the for clause will make the error skipped.

  For example, if some BPF code cannot provide supported program types
  in ELF SEC("unknown"), the for clause will not call load_and_attach()
  to load the BPF code. 1 should be returned to callees instead of 0.

Signed-off-by: Wang Sheng-Hui <shhuiw@foxmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/bpf/bpf_load.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index bebe4188b4b3..feca497d6afd 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -549,7 +549,6 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
 		if (nr_maps < 0) {
 			printf("Error: Failed loading ELF maps (errno:%d):%s\n",
 			       nr_maps, strerror(-nr_maps));
-			ret = 1;
 			goto done;
 		}
 		if (load_maps(map_data, nr_maps, fixup_map))
@@ -615,7 +614,6 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
 		}
 	}
 
-	ret = 0;
 done:
 	close(fd);
 	return ret;
-- 
cgit v1.2.3-59-g8ed1b


From 2c25fc9a503adef4279951382fc9d47b59977f59 Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@linaro.org>
Date: Fri, 27 Apr 2018 18:02:54 +0800
Subject: bpf, doc: Update bpf_jit_enable limitation for
 CONFIG_BPF_JIT_ALWAYS_ON

When CONFIG_BPF_JIT_ALWAYS_ON is enabled, kernel has limitation for
bpf_jit_enable, so it has fixed value 1 and we cannot set it to 2
for JIT opcode dumping; this patch is to update the doc for it.

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Leo Yan <leo.yan@linaro.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 Documentation/networking/filter.txt | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
index fd55c7de9991..5032e1263bc9 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -483,6 +483,12 @@ Example output from dmesg:
 [ 3389.935851] JIT code: 00000030: 00 e8 28 94 ff e0 83 f8 01 75 07 b8 ff ff 00 00
 [ 3389.935852] JIT code: 00000040: eb 02 31 c0 c9 c3
 
+When CONFIG_BPF_JIT_ALWAYS_ON is enabled, bpf_jit_enable is permanently set to 1 and
+setting any other value than that will return in failure. This is even the case for
+setting bpf_jit_enable to 2, since dumping the final JIT image into the kernel log
+is discouraged and introspection through bpftool (under tools/bpf/bpftool/) is the
+generally recommended approach instead.
+
 In the kernel source tree under tools/bpf/, there's bpf_jit_disasm for
 generating disassembly out of the kernel log's hexdump:
 
-- 
cgit v1.2.3-59-g8ed1b


From 2f177c1628c3f54cdfcc8093cd4b5297f4baeec4 Mon Sep 17 00:00:00 2001
From: Venkateswara Naralasetty <vnaralas@codeaurora.org>
Date: Wed, 25 Apr 2018 11:36:40 +0300
Subject: ath10k: fix information leak in debugfs

During write to some of debugfs in ath10k, few variables exposing stack
data when process user input. which leads to possible information leak.

This patch fix this issue by initializing buffer and checks
the return valure of 'simple_write_to_buffer'.

Signed-off-by: Venkateswara Naralasetty <vnaralas@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/debug.c       | 20 +++++++++---------
 drivers/net/wireless/ath/ath10k/debugfs_sta.c | 30 +++++++++++++--------------
 2 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/debug.c b/drivers/net/wireless/ath/ath10k/debug.c
index bac832ce1873..57d22cd976e5 100644
--- a/drivers/net/wireless/ath/ath10k/debug.c
+++ b/drivers/net/wireless/ath/ath10k/debug.c
@@ -987,13 +987,13 @@ static ssize_t ath10k_write_htt_max_amsdu_ampdu(struct file *file,
 {
 	struct ath10k *ar = file->private_data;
 	int res;
-	char buf[64];
+	char buf[64] = {0};
 	unsigned int amsdu, ampdu;
 
-	simple_write_to_buffer(buf, sizeof(buf) - 1, ppos, user_buf, count);
-
-	/* make sure that buf is null terminated */
-	buf[sizeof(buf) - 1] = 0;
+	res = simple_write_to_buffer(buf, sizeof(buf) - 1, ppos,
+				     user_buf, count);
+	if (res <= 0)
+		return res;
 
 	res = sscanf(buf, "%u %u", &amsdu, &ampdu);
 
@@ -1043,14 +1043,14 @@ static ssize_t ath10k_write_fw_dbglog(struct file *file,
 {
 	struct ath10k *ar = file->private_data;
 	int ret;
-	char buf[96];
+	char buf[96] = {0};
 	unsigned int log_level;
 	u64 mask;
 
-	simple_write_to_buffer(buf, sizeof(buf) - 1, ppos, user_buf, count);
-
-	/* make sure that buf is null terminated */
-	buf[sizeof(buf) - 1] = 0;
+	ret = simple_write_to_buffer(buf, sizeof(buf) - 1, ppos,
+				     user_buf, count);
+	if (ret <= 0)
+		return ret;
 
 	ret = sscanf(buf, "%llx %u", &mask, &log_level);
 
diff --git a/drivers/net/wireless/ath/ath10k/debugfs_sta.c b/drivers/net/wireless/ath/ath10k/debugfs_sta.c
index 8f688f136c22..a63c97e2c50c 100644
--- a/drivers/net/wireless/ath/ath10k/debugfs_sta.c
+++ b/drivers/net/wireless/ath/ath10k/debugfs_sta.c
@@ -254,12 +254,12 @@ static ssize_t ath10k_dbg_sta_write_addba(struct file *file,
 	struct ath10k *ar = arsta->arvif->ar;
 	u32 tid, buf_size;
 	int ret;
-	char buf[64];
+	char buf[64] = {0};
 
-	simple_write_to_buffer(buf, sizeof(buf) - 1, ppos, user_buf, count);
-
-	/* make sure that buf is null terminated */
-	buf[sizeof(buf) - 1] = '\0';
+	ret = simple_write_to_buffer(buf, sizeof(buf) - 1, ppos,
+				     user_buf, count);
+	if (ret <= 0)
+		return ret;
 
 	ret = sscanf(buf, "%u %u", &tid, &buf_size);
 	if (ret != 2)
@@ -305,12 +305,12 @@ static ssize_t ath10k_dbg_sta_write_addba_resp(struct file *file,
 	struct ath10k *ar = arsta->arvif->ar;
 	u32 tid, status;
 	int ret;
-	char buf[64];
-
-	simple_write_to_buffer(buf, sizeof(buf) - 1, ppos, user_buf, count);
+	char buf[64] = {0};
 
-	/* make sure that buf is null terminated */
-	buf[sizeof(buf) - 1] = '\0';
+	ret = simple_write_to_buffer(buf, sizeof(buf) - 1, ppos,
+				     user_buf, count);
+	if (ret <= 0)
+		return ret;
 
 	ret = sscanf(buf, "%u %u", &tid, &status);
 	if (ret != 2)
@@ -355,12 +355,12 @@ static ssize_t ath10k_dbg_sta_write_delba(struct file *file,
 	struct ath10k *ar = arsta->arvif->ar;
 	u32 tid, initiator, reason;
 	int ret;
-	char buf[64];
-
-	simple_write_to_buffer(buf, sizeof(buf) - 1, ppos, user_buf, count);
+	char buf[64] = {0};
 
-	/* make sure that buf is null terminated */
-	buf[sizeof(buf) - 1] = '\0';
+	ret = simple_write_to_buffer(buf, sizeof(buf) - 1, ppos,
+				     user_buf, count);
+	if (ret <= 0)
+		return ret;
 
 	ret = sscanf(buf, "%u %u %u", &tid, &initiator, &reason);
 	if (ret != 3)
-- 
cgit v1.2.3-59-g8ed1b


From 4b190675ad06f5a6ecbeef0b01890c5fb372e3eb Mon Sep 17 00:00:00 2001
From: Tamizh Chelvam <tamizhr@codeaurora.org>
Date: Wed, 25 Apr 2018 11:36:44 +0300
Subject: ath10k: fix kernel panic while reading tpc_stats

When attempt to read tpc_stats for the chipsets which support
more than 3 tx chain will trigger kernel panic(kernel stack is corrupted)
due to writing values on rate_code array out of range.
This patch changes the array size depends on the WMI_TPC_TX_N_CHAIN and
added check to avoid write values on the array if the num tx chain
get in tpc config event is greater than WMI_TPC_TX_N_CHAIN.

Tested on QCA9984 with firmware-5.bin_10.4-3.5.3-00057

Kernel panic log :

[  323.510944] Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: bf90c654
[  323.510944]
[  323.524390] CPU: 0 PID: 1908 Comm: cat Not tainted 3.14.77 #31
[  323.530224] [<c021db48>] (unwind_backtrace) from [<c021ac08>] (show_stack+0x10/0x14)
[  323.537941] [<c021ac08>] (show_stack) from [<c03c53c0>] (dump_stack+0x80/0xa0)
[  323.545146] [<c03c53c0>] (dump_stack) from [<c022e4ac>] (panic+0x84/0x1e4)
[  323.552000] [<c022e4ac>] (panic) from [<c022e61c>] (__stack_chk_fail+0x10/0x14)
[  323.559350] [<c022e61c>] (__stack_chk_fail) from [<bf90c654>] (ath10k_wmi_event_pdev_tpc_config+0x424/0x438 [ath10k_core])
[  323.570471] [<bf90c654>] (ath10k_wmi_event_pdev_tpc_config [ath10k_core]) from [<bf90d800>] (ath10k_wmi_10_4_op_rx+0x2f0/0x39c [ath10k_core])
[  323.583047] [<bf90d800>] (ath10k_wmi_10_4_op_rx [ath10k_core]) from [<bf8fcc18>] (ath10k_htc_rx_completion_handler+0x170/0x1a0 [ath10k_core])
[  323.595702] [<bf8fcc18>] (ath10k_htc_rx_completion_handler [ath10k_core]) from [<bf961f44>] (ath10k_pci_hif_send_complete_check+0x1f0/0x220 [ath10k_pci])
[  323.609421] [<bf961f44>] (ath10k_pci_hif_send_complete_check [ath10k_pci]) from [<bf96562c>] (ath10k_ce_per_engine_service+0x74/0xc4 [ath10k_pci])
[  323.622490] [<bf96562c>] (ath10k_ce_per_engine_service [ath10k_pci]) from [<bf9656f0>] (ath10k_ce_per_engine_service_any+0x74/0x80 [ath10k_pci])
[  323.635423] [<bf9656f0>] (ath10k_ce_per_engine_service_any [ath10k_pci]) from [<bf96365c>] (ath10k_pci_napi_poll+0x44/0xe8 [ath10k_pci])
[  323.647665] [<bf96365c>] (ath10k_pci_napi_poll [ath10k_pci]) from [<c0599994>] (net_rx_action+0xac/0x160)
[  323.657208] [<c0599994>] (net_rx_action) from [<c02324a4>] (__do_softirq+0x104/0x294)
[  323.665017] [<c02324a4>] (__do_softirq) from [<c0232920>] (irq_exit+0x9c/0x11c)
[  323.672314] [<c0232920>] (irq_exit) from [<c0217fc0>] (handle_IRQ+0x6c/0x90)
[  323.679341] [<c0217fc0>] (handle_IRQ) from [<c02084e0>] (gic_handle_irq+0x3c/0x60)
[  323.686893] [<c02084e0>] (gic_handle_irq) from [<c02095c0>] (__irq_svc+0x40/0x70)
[  323.694349] Exception stack(0xdd489c58 to 0xdd489ca0)
[  323.699384] 9c40:                                                       00000000 a0000013
[  323.707547] 9c60: 00000000 dc4bce40 60000013 ddc1d800 dd488000 00000990 00000000 c085c800
[  323.715707] 9c80: 00000000 dd489d44 0000092d dd489ca0 c026e664 c026e668 60000013 ffffffff
[  323.723877] [<c02095c0>] (__irq_svc) from [<c026e668>] (rcu_note_context_switch+0x170/0x184)
[  323.732298] [<c026e668>] (rcu_note_context_switch) from [<c020e928>] (__schedule+0x50/0x4d4)
[  323.740716] [<c020e928>] (__schedule) from [<c020e490>] (schedule_timeout+0x148/0x178)
[  323.748611] [<c020e490>] (schedule_timeout) from [<c020f804>] (wait_for_common+0x114/0x154)
[  323.756972] [<c020f804>] (wait_for_common) from [<bf8f6ef0>] (ath10k_tpc_stats_open+0xc8/0x340 [ath10k_core])
[  323.766873] [<bf8f6ef0>] (ath10k_tpc_stats_open [ath10k_core]) from [<c02bb598>] (do_dentry_open+0x1ac/0x274)
[  323.776741] [<c02bb598>] (do_dentry_open) from [<c02c838c>] (do_last+0x8c0/0xb08)
[  323.784201] [<c02c838c>] (do_last) from [<c02c87e4>] (path_openat+0x210/0x598)
[  323.791408] [<c02c87e4>] (path_openat) from [<c02c9d1c>] (do_filp_open+0x2c/0x78)
[  323.798873] [<c02c9d1c>] (do_filp_open) from [<c02bc85c>] (do_sys_open+0x114/0x1b4)
[  323.806509] [<c02bc85c>] (do_sys_open) from [<c0208c80>] (ret_fast_syscall+0x0/0x44)
[  323.814241] CPU1: stopping
[  323.816927] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 3.14.77 #31
[  323.823008] [<c021db48>] (unwind_backtrace) from [<c021ac08>] (show_stack+0x10/0x14)
[  323.830731] [<c021ac08>] (show_stack) from [<c03c53c0>] (dump_stack+0x80/0xa0)
[  323.837934] [<c03c53c0>] (dump_stack) from [<c021cfac>] (handle_IPI+0xb8/0x140)
[  323.845224] [<c021cfac>] (handle_IPI) from [<c02084fc>] (gic_handle_irq+0x58/0x60)
[  323.852774] [<c02084fc>] (gic_handle_irq) from [<c02095c0>] (__irq_svc+0x40/0x70)
[  323.860233] Exception stack(0xdd499fa0 to 0xdd499fe8)
[  323.865273] 9fa0: ffffffed 00000000 1d3c9000 00000000 dd498000 dd498030 10c0387d c08b62c8
[  323.873432] 9fc0: 4220406a 512f04d0 00000000 00000000 00000001 dd499fe8 c021838c c0218390
[  323.881588] 9fe0: 60000013 ffffffff
[  323.885070] [<c02095c0>] (__irq_svc) from [<c0218390>] (arch_cpu_idle+0x30/0x50)
[  323.892454] [<c0218390>] (arch_cpu_idle) from [<c026500c>] (cpu_startup_entry+0xa4/0x108)
[  323.900690] [<c026500c>] (cpu_startup_entry) from [<422085a4>] (0x422085a4)

Signed-off-by: Tamizh chelvam <tamizhr@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/debug.c | 8 +++++++-
 drivers/net/wireless/ath/ath10k/wmi.c   | 6 ++++++
 drivers/net/wireless/ath/ath10k/wmi.h   | 2 +-
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/debug.c b/drivers/net/wireless/ath/ath10k/debug.c
index 57d22cd976e5..0d98c93a3aba 100644
--- a/drivers/net/wireless/ath/ath10k/debug.c
+++ b/drivers/net/wireless/ath/ath10k/debug.c
@@ -1519,7 +1519,13 @@ static void ath10k_tpc_stats_print(struct ath10k_tpc_stats *tpc_stats,
 	*len += scnprintf(buf + *len, buf_len - *len,
 			  "********************************\n");
 	*len += scnprintf(buf + *len, buf_len - *len,
-			  "No.  Preamble Rate_code tpc_value1 tpc_value2 tpc_value3\n");
+			  "No.  Preamble Rate_code ");
+
+	for (i = 0; i < WMI_TPC_TX_N_CHAIN; i++)
+		*len += scnprintf(buf + *len, buf_len - *len,
+				  "tpc_value%d ", i);
+
+	*len += scnprintf(buf + *len, buf_len - *len, "\n");
 
 	for (i = 0; i < tpc_stats->rate_max; i++) {
 		*len += scnprintf(buf + *len, buf_len - *len,
diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c
index df2e92a6c9bd..d7dba1791a1a 100644
--- a/drivers/net/wireless/ath/ath10k/wmi.c
+++ b/drivers/net/wireless/ath/ath10k/wmi.c
@@ -4484,6 +4484,12 @@ void ath10k_wmi_event_pdev_tpc_config(struct ath10k *ar, struct sk_buff *skb)
 
 	num_tx_chain = __le32_to_cpu(ev->num_tx_chain);
 
+	if (num_tx_chain > WMI_TPC_TX_N_CHAIN) {
+		ath10k_warn(ar, "number of tx chain is %d greater than TPC configured tx chain %d\n",
+			    num_tx_chain, WMI_TPC_TX_N_CHAIN);
+		return;
+	}
+
 	ath10k_wmi_tpc_config_get_rate_code(rate_code, pream_table,
 					    num_tx_chain);
 
diff --git a/drivers/net/wireless/ath/ath10k/wmi.h b/drivers/net/wireless/ath/ath10k/wmi.h
index 16a39244a34f..700b12f46baf 100644
--- a/drivers/net/wireless/ath/ath10k/wmi.h
+++ b/drivers/net/wireless/ath/ath10k/wmi.h
@@ -4026,9 +4026,9 @@ struct wmi_pdev_get_tpc_config_cmd {
 } __packed;
 
 #define WMI_TPC_CONFIG_PARAM		1
-#define WMI_TPC_RATE_MAX		160
 #define WMI_TPC_FINAL_RATE_MAX		240
 #define WMI_TPC_TX_N_CHAIN		4
+#define WMI_TPC_RATE_MAX               (WMI_TPC_TX_N_CHAIN * 65)
 #define WMI_TPC_PREAM_TABLE_MAX		10
 #define WMI_TPC_FLAG			3
 #define WMI_TPC_BUF_SIZE		10
-- 
cgit v1.2.3-59-g8ed1b


From 378b1d65070f3c989a662555eda8564cdd937b4a Mon Sep 17 00:00:00 2001
From: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
Date: Tue, 24 Apr 2018 15:18:59 +0200
Subject: ath6kl: fix ath6kl_data_tx()'s return type

The method ndo_start_xmit() is defined as returning an 'netdev_tx_t',
which is a typedef for an enum type, but the implementation in this
driver returns an 'int'.

Fix this by returning 'netdev_tx_t' in this driver too.

Signed-off-by: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath6kl/core.h | 2 +-
 drivers/net/wireless/ath/ath6kl/txrx.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/ath/ath6kl/core.h b/drivers/net/wireless/ath/ath6kl/core.h
index e23d450babd2..0d30e762c090 100644
--- a/drivers/net/wireless/ath/ath6kl/core.h
+++ b/drivers/net/wireless/ath/ath6kl/core.h
@@ -914,7 +914,7 @@ void ath6kl_tx_data_cleanup(struct ath6kl *ar);
 
 struct ath6kl_cookie *ath6kl_alloc_cookie(struct ath6kl *ar);
 void ath6kl_free_cookie(struct ath6kl *ar, struct ath6kl_cookie *cookie);
-int ath6kl_data_tx(struct sk_buff *skb, struct net_device *dev);
+netdev_tx_t ath6kl_data_tx(struct sk_buff *skb, struct net_device *dev);
 
 struct aggr_info *aggr_init(struct ath6kl_vif *vif);
 void aggr_conn_init(struct ath6kl_vif *vif, struct aggr_info *aggr_info,
diff --git a/drivers/net/wireless/ath/ath6kl/txrx.c b/drivers/net/wireless/ath/ath6kl/txrx.c
index 8da9506f8c2b..618d12ed4b40 100644
--- a/drivers/net/wireless/ath/ath6kl/txrx.c
+++ b/drivers/net/wireless/ath/ath6kl/txrx.c
@@ -353,7 +353,7 @@ fail_ctrl_tx:
 	return status;
 }
 
-int ath6kl_data_tx(struct sk_buff *skb, struct net_device *dev)
+netdev_tx_t ath6kl_data_tx(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ath6kl *ar = ath6kl_priv(dev);
 	struct ath6kl_cookie *cookie = NULL;
-- 
cgit v1.2.3-59-g8ed1b


From 785281342d0ce06ec04e4b730fb29b6320139edc Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Thu, 26 Apr 2018 10:12:44 +0100
Subject: ath10k: fix spelling mistake: "servive" -> "service"

Trivial fix to spelling mistake in ath10k_warn warning message text

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/wmi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c
index d7dba1791a1a..bb8e9e259a56 100644
--- a/drivers/net/wireless/ath/ath10k/wmi.c
+++ b/drivers/net/wireless/ath/ath10k/wmi.c
@@ -5286,7 +5286,7 @@ void ath10k_wmi_event_service_available(struct ath10k *ar, struct sk_buff *skb)
 
 	ret = ath10k_wmi_pull_svc_avail(ar, skb, &arg);
 	if (ret) {
-		ath10k_warn(ar, "failed to parse servive available event: %d\n",
+		ath10k_warn(ar, "failed to parse service available event: %d\n",
 			    ret);
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From e60a92590187d01f1c98254d2d7e8f613ebd31dd Mon Sep 17 00:00:00 2001
From: Niklas Cassel <niklas.cassel@linaro.org>
Date: Thu, 26 Apr 2018 14:35:02 +0200
Subject: ath10k: sdio: jump to correct label in error handling path

Jump to the correct label in error handling path.
At this point of execution create_singlethread_workqueue() has succeeded,
so it should be properly destroyed.

Jump label was renamed in commit ec2c64e20257 ("ath10k: sdio: fix memory
leak for probe allocations").
However, the bug was originally introduced in commit d96db25d2025
("ath10k: add initial SDIO support").

Fixes: d96db25d2025 ("ath10k: add initial SDIO support")
Signed-off-by: Niklas Cassel <niklas.cassel@linaro.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/sdio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath10k/sdio.c b/drivers/net/wireless/ath/ath10k/sdio.c
index 2d04c54a4153..d612ce8c9cff 100644
--- a/drivers/net/wireless/ath/ath10k/sdio.c
+++ b/drivers/net/wireless/ath/ath10k/sdio.c
@@ -2011,7 +2011,7 @@ static int ath10k_sdio_probe(struct sdio_func *func,
 		ret = -ENODEV;
 		ath10k_err(ar, "unsupported device id %u (0x%x)\n",
 			   dev_id_base, id->device);
-		goto err_core_destroy;
+		goto err_free_wq;
 	}
 
 	ar->id.vendor = id->vendor;
-- 
cgit v1.2.3-59-g8ed1b


From 3e5cf362c34b14c8d01d19d4b821fb35e1779862 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Wed, 25 Apr 2018 19:29:36 +0200
Subject: tipc: introduce ioctl for fetching node identity

After the introduction of a 128-bit node identity it may be difficult
for a user to correlate between this identity and the generated node
hash address.

We now try to make this easier by introducing a new ioctl() call for
fetching a node identity by using the hash value as key. This will
be particularly useful when we extend some of the commands in the
'tipc' tool, but we also expect regular user applications to need
this feature.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tipc.h | 12 ++++++++----
 net/tipc/node.c           | 21 +++++++++++++++++++++
 net/tipc/node.h           |  1 +
 net/tipc/socket.c         | 13 +++++++++++--
 4 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h
index bf6d28677cfe..6b2fd4d9655f 100644
--- a/include/uapi/linux/tipc.h
+++ b/include/uapi/linux/tipc.h
@@ -209,16 +209,16 @@ struct tipc_group_req {
  * The string formatting for each name element is:
  * media: media
  * interface: media:interface name
- * link: Z.C.N:interface-Z.C.N:interface
- *
+ * link: node:interface-node:interface
  */
-
+#define TIPC_NODEID_LEN         16
 #define TIPC_MAX_MEDIA_NAME	16
 #define TIPC_MAX_IF_NAME	16
 #define TIPC_MAX_BEARER_NAME	32
 #define TIPC_MAX_LINK_NAME	68
 
-#define SIOCGETLINKNAME		SIOCPROTOPRIVATE
+#define SIOCGETLINKNAME        SIOCPROTOPRIVATE
+#define SIOCGETNODEID          (SIOCPROTOPRIVATE + 1)
 
 struct tipc_sioc_ln_req {
 	__u32 peer;
@@ -226,6 +226,10 @@ struct tipc_sioc_ln_req {
 	char linkname[TIPC_MAX_LINK_NAME];
 };
 
+struct tipc_sioc_nodeid_req {
+	__u32 peer;
+	char node_id[TIPC_NODEID_LEN];
+};
 
 /* The macros and functions below are deprecated:
  */
diff --git a/net/tipc/node.c b/net/tipc/node.c
index e9c52e1416c5..81e6dd0cd1ca 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -195,6 +195,27 @@ int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel)
 	return mtu;
 }
 
+bool tipc_node_get_id(struct net *net, u32 addr, u8 *id)
+{
+	u8 *own_id = tipc_own_id(net);
+	struct tipc_node *n;
+
+	if (!own_id)
+		return true;
+
+	if (addr == tipc_own_addr(net)) {
+		memcpy(id, own_id, TIPC_NODEID_LEN);
+		return true;
+	}
+	n = tipc_node_find(net, addr);
+	if (!n)
+		return false;
+
+	memcpy(id, &n->peer_id, TIPC_NODEID_LEN);
+	tipc_node_put(n);
+	return true;
+}
+
 u16 tipc_node_get_capabilities(struct net *net, u32 addr)
 {
 	struct tipc_node *n;
diff --git a/net/tipc/node.h b/net/tipc/node.h
index bb271a37c93f..846c8f240872 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -60,6 +60,7 @@ enum {
 #define INVALID_BEARER_ID -1
 
 void tipc_node_stop(struct net *net);
+bool tipc_node_get_id(struct net *net, u32 addr, u8 *id);
 u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr);
 void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128,
 			  struct tipc_bearer *bearer,
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 252a52ae0893..c4992002fd68 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -2973,7 +2973,8 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt,
 
 static int tipc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 {
-	struct sock *sk = sock->sk;
+	struct net *net = sock_net(sock->sk);
+	struct tipc_sioc_nodeid_req nr = {0};
 	struct tipc_sioc_ln_req lnr;
 	void __user *argp = (void __user *)arg;
 
@@ -2981,7 +2982,7 @@ static int tipc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	case SIOCGETLINKNAME:
 		if (copy_from_user(&lnr, argp, sizeof(lnr)))
 			return -EFAULT;
-		if (!tipc_node_get_linkname(sock_net(sk),
+		if (!tipc_node_get_linkname(net,
 					    lnr.bearer_id & 0xffff, lnr.peer,
 					    lnr.linkname, TIPC_MAX_LINK_NAME)) {
 			if (copy_to_user(argp, &lnr, sizeof(lnr)))
@@ -2989,6 +2990,14 @@ static int tipc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 			return 0;
 		}
 		return -EADDRNOTAVAIL;
+	case SIOCGETNODEID:
+		if (copy_from_user(&nr, argp, sizeof(nr)))
+			return -EFAULT;
+		if (!tipc_node_get_id(net, nr.peer, nr.node_id))
+			return -EADDRNOTAVAIL;
+		if (copy_to_user(argp, &nr, sizeof(nr)))
+			return -EFAULT;
+		return 0;
 	default:
 		return -ENOIOCTLCMD;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 834944073301e85001c3ed9913027ca47c6f889b Mon Sep 17 00:00:00 2001
From: Guillaume Nault <g.nault@alphalink.fr>
Date: Wed, 25 Apr 2018 19:54:14 +0200
Subject: l2tp: consistent reference counting in procfs and debufs

The 'pppol2tp' procfs and 'l2tp/tunnels' debugfs files handle reference
counting of sessions differently than for tunnels.

For consistency, use the same mechanism for handling both sessions and
tunnels. That is, drop the reference on the previous session just
before looking up the next one (rather than in .show()). If necessary
(if dump stops before *_next_session() returns NULL), drop the last
reference in .stop().

Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_debugfs.c | 20 +++++++++++++-------
 net/l2tp/l2tp_ppp.c     | 21 +++++++++++++--------
 2 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index 7f1e842ef05a..e87686f7d63c 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -57,6 +57,10 @@ static void l2tp_dfs_next_tunnel(struct l2tp_dfs_seq_data *pd)
 
 static void l2tp_dfs_next_session(struct l2tp_dfs_seq_data *pd)
 {
+	/* Drop reference taken during previous invocation */
+	if (pd->session)
+		l2tp_session_dec_refcount(pd->session);
+
 	pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx);
 	pd->session_idx++;
 
@@ -105,11 +109,16 @@ static void l2tp_dfs_seq_stop(struct seq_file *p, void *v)
 	if (!pd || pd == SEQ_START_TOKEN)
 		return;
 
-	/* Drop reference taken by last invocation of l2tp_dfs_next_tunnel() */
+	/* Drop reference taken by last invocation of l2tp_dfs_next_session()
+	 * or l2tp_dfs_next_tunnel().
+	 */
+	if (pd->session) {
+		l2tp_session_dec_refcount(pd->session);
+		pd->session = NULL;
+	}
 	if (pd->tunnel) {
 		l2tp_tunnel_dec_refcount(pd->tunnel);
 		pd->tunnel = NULL;
-		pd->session = NULL;
 	}
 }
 
@@ -250,13 +259,10 @@ static int l2tp_dfs_seq_show(struct seq_file *m, void *v)
 		goto out;
 	}
 
-	/* Show the tunnel or session context */
-	if (!pd->session) {
+	if (!pd->session)
 		l2tp_dfs_seq_tunnel_show(m, pd->tunnel);
-	} else {
+	else
 		l2tp_dfs_seq_session_show(m, pd->session);
-		l2tp_session_dec_refcount(pd->session);
-	}
 
 out:
 	return 0;
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 1fd9e145076a..f951c768dcf2 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -1576,6 +1576,10 @@ static void pppol2tp_next_tunnel(struct net *net, struct pppol2tp_seq_data *pd)
 
 static void pppol2tp_next_session(struct net *net, struct pppol2tp_seq_data *pd)
 {
+	/* Drop reference taken during previous invocation */
+	if (pd->session)
+		l2tp_session_dec_refcount(pd->session);
+
 	pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx);
 	pd->session_idx++;
 
@@ -1624,11 +1628,16 @@ static void pppol2tp_seq_stop(struct seq_file *p, void *v)
 	if (!pd || pd == SEQ_START_TOKEN)
 		return;
 
-	/* Drop reference taken by last invocation of pppol2tp_next_tunnel() */
+	/* Drop reference taken by last invocation of pppol2tp_next_session()
+	 * or pppol2tp_next_tunnel().
+	 */
+	if (pd->session) {
+		l2tp_session_dec_refcount(pd->session);
+		pd->session = NULL;
+	}
 	if (pd->tunnel) {
 		l2tp_tunnel_dec_refcount(pd->tunnel);
 		pd->tunnel = NULL;
-		pd->session = NULL;
 	}
 }
 
@@ -1723,14 +1732,10 @@ static int pppol2tp_seq_show(struct seq_file *m, void *v)
 		goto out;
 	}
 
-	/* Show the tunnel or session context.
-	 */
-	if (!pd->session) {
+	if (!pd->session)
 		pppol2tp_seq_tunnel_show(m, pd->tunnel);
-	} else {
+	else
 		pppol2tp_seq_session_show(m, pd->session);
-		l2tp_session_dec_refcount(pd->session);
-	}
 
 out:
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From c59530d0d5dccc96795af12c139f618182cf98db Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed, 25 Apr 2018 12:12:47 -0700
Subject: net: Move PHY statistics code into PHY library helpers

In order to make it possible for network device drivers that do not
necessarily have a phy_device attached, but still report PHY statistics,
have a preliminary refactoring consisting in creating helper functions
that encapsulate the PHY device driver knowledge within PHYLIB.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/phy.h   | 20 ++++++++++++++++++++
 net/core/ethtool.c    | 38 ++++++++------------------------------
 3 files changed, 76 insertions(+), 30 deletions(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 05c1e8ef15e6..a98ed12c0009 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -1277,3 +1277,51 @@ int phy_ethtool_nway_reset(struct net_device *ndev)
 	return phy_restart_aneg(phydev);
 }
 EXPORT_SYMBOL(phy_ethtool_nway_reset);
+
+int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data)
+{
+	if (!phydev->drv)
+		return -EIO;
+
+	mutex_lock(&phydev->lock);
+	phydev->drv->get_strings(phydev, data);
+	mutex_unlock(&phydev->lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(phy_ethtool_get_strings);
+
+int phy_ethtool_get_sset_count(struct phy_device *phydev)
+{
+	int ret;
+
+	if (!phydev->drv)
+		return -EIO;
+
+	if (phydev->drv->get_sset_count &&
+	    phydev->drv->get_strings &&
+	    phydev->drv->get_stats) {
+		mutex_lock(&phydev->lock);
+		ret = phydev->drv->get_sset_count(phydev);
+		mutex_unlock(&phydev->lock);
+
+		return ret;
+	}
+
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(phy_ethtool_get_sset_count);
+
+int phy_ethtool_get_stats(struct phy_device *phydev,
+			  struct ethtool_stats *stats, u64 *data)
+{
+	if (!phydev->drv)
+		return -EIO;
+
+	mutex_lock(&phydev->lock);
+	phydev->drv->get_stats(phydev, stats, data);
+	mutex_unlock(&phydev->lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(phy_ethtool_get_stats);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index f0b5870a6d40..6ca81395c545 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1066,6 +1066,26 @@ int phy_ethtool_nway_reset(struct net_device *ndev);
 #if IS_ENABLED(CONFIG_PHYLIB)
 int __init mdio_bus_init(void);
 void mdio_bus_exit(void);
+int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data);
+int phy_ethtool_get_sset_count(struct phy_device *phydev);
+int phy_ethtool_get_stats(struct phy_device *phydev,
+			  struct ethtool_stats *stats, u64 *data);
+#else
+int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data)
+{
+	return -EOPNOTSUPP;
+}
+
+int phy_ethtool_get_sset_count(struct phy_device *phydev)
+{
+	return -EOPNOTSUPP;
+}
+
+int phy_ethtool_get_stats(struct phy_device *phydev,
+			  struct ethtool_stats *stats, u64 *data)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 extern struct bus_type mdio_bus_type;
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 4650fd6d678c..efcd8e620796 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -211,23 +211,6 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
 	return ret;
 }
 
-static int phy_get_sset_count(struct phy_device *phydev)
-{
-	int ret;
-
-	if (phydev->drv->get_sset_count &&
-	    phydev->drv->get_strings &&
-	    phydev->drv->get_stats) {
-		mutex_lock(&phydev->lock);
-		ret = phydev->drv->get_sset_count(phydev);
-		mutex_unlock(&phydev->lock);
-
-		return ret;
-	}
-
-	return -EOPNOTSUPP;
-}
-
 static int __ethtool_get_sset_count(struct net_device *dev, int sset)
 {
 	const struct ethtool_ops *ops = dev->ethtool_ops;
@@ -246,7 +229,7 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset)
 
 	if (sset == ETH_SS_PHY_STATS) {
 		if (dev->phydev)
-			return phy_get_sset_count(dev->phydev);
+			return phy_ethtool_get_sset_count(dev->phydev);
 		else
 			return -EOPNOTSUPP;
 	}
@@ -273,15 +256,10 @@ static void __ethtool_get_strings(struct net_device *dev,
 	else if (stringset == ETH_SS_PHY_TUNABLES)
 		memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings));
 	else if (stringset == ETH_SS_PHY_STATS) {
-		struct phy_device *phydev = dev->phydev;
-
-		if (phydev) {
-			mutex_lock(&phydev->lock);
-			phydev->drv->get_strings(phydev, data);
-			mutex_unlock(&phydev->lock);
-		} else {
+		if (dev->phydev)
+			phy_ethtool_get_strings(dev->phydev, data);
+		else
 			return;
-		}
 	} else
 		/* ops->get_strings is valid because checked earlier */
 		ops->get_strings(dev, stringset, data);
@@ -2002,7 +1980,7 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
 	if (!phydev)
 		return -EOPNOTSUPP;
 
-	n_stats = phy_get_sset_count(phydev);
+	n_stats = phy_ethtool_get_sset_count(dev->phydev);
 	if (n_stats < 0)
 		return n_stats;
 	if (n_stats > S32_MAX / sizeof(u64))
@@ -2017,9 +1995,9 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
 	if (n_stats && !data)
 		return -ENOMEM;
 
-	mutex_lock(&phydev->lock);
-	phydev->drv->get_stats(phydev, &stats, data);
-	mutex_unlock(&phydev->lock);
+	ret = phy_ethtool_get_stats(dev->phydev, &stats, data);
+	if (ret < 0)
+		return ret;
 
 	ret = -EFAULT;
 	if (copy_to_user(useraddr, &stats, sizeof(stats)))
-- 
cgit v1.2.3-59-g8ed1b


From 9994338227179eaf8db6f4493504e108b1fae5fc Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed, 25 Apr 2018 12:12:48 -0700
Subject: net: Allow network devices to have PHY statistics

Add a new callback: get_ethtool_phy_stats() which allows network device
drivers not making use of the PHY library to return PHY statistics.
Update ethtool_get_phy_stats(), __ethtool_get_sset_count() and
__ethtool_get_strings() accordingly to interogate the network device
about ETH_SS_PHY_STATS.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h |  5 +++++
 net/core/ethtool.c      | 39 +++++++++++++++++++++------------------
 2 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index b32cd2062f18..f8a2245b70ac 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -312,6 +312,9 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
  *	by kernel. Returns a negative error code or zero.
  * @get_fecparam: Get the network device Forward Error Correction parameters.
  * @set_fecparam: Set the network device Forward Error Correction parameters.
+ * @get_ethtool_phy_stats: Return extended statistics about the PHY device.
+ *	This is only useful if the device maintains PHY statistics and
+ *	cannot use the standard PHY library helpers.
  *
  * All operations are optional (i.e. the function pointer may be set
  * to %NULL) and callers must take this into account.  Callers must
@@ -407,5 +410,7 @@ struct ethtool_ops {
 				      struct ethtool_fecparam *);
 	int	(*set_fecparam)(struct net_device *,
 				      struct ethtool_fecparam *);
+	void	(*get_ethtool_phy_stats)(struct net_device *,
+					 struct ethtool_stats *, u64 *);
 };
 #endif /* _LINUX_ETHTOOL_H */
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index efcd8e620796..b849fdae7e87 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -227,12 +227,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset)
 	if (sset == ETH_SS_PHY_TUNABLES)
 		return ARRAY_SIZE(phy_tunable_strings);
 
-	if (sset == ETH_SS_PHY_STATS) {
-		if (dev->phydev)
-			return phy_ethtool_get_sset_count(dev->phydev);
-		else
-			return -EOPNOTSUPP;
-	}
+	if (sset == ETH_SS_PHY_STATS && dev->phydev &&
+	    !ops->get_ethtool_phy_stats)
+		return phy_ethtool_get_sset_count(dev->phydev);
 
 	if (ops->get_sset_count && ops->get_strings)
 		return ops->get_sset_count(dev, sset);
@@ -255,12 +252,10 @@ static void __ethtool_get_strings(struct net_device *dev,
 		memcpy(data, tunable_strings, sizeof(tunable_strings));
 	else if (stringset == ETH_SS_PHY_TUNABLES)
 		memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings));
-	else if (stringset == ETH_SS_PHY_STATS) {
-		if (dev->phydev)
-			phy_ethtool_get_strings(dev->phydev, data);
-		else
-			return;
-	} else
+	else if (stringset == ETH_SS_PHY_STATS && dev->phydev &&
+		 !ops->get_ethtool_phy_stats)
+		phy_ethtool_get_strings(dev->phydev, data);
+	else
 		/* ops->get_strings is valid because checked earlier */
 		ops->get_strings(dev, stringset, data);
 }
@@ -1972,15 +1967,19 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
 
 static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
 {
-	struct ethtool_stats stats;
+	const struct ethtool_ops *ops = dev->ethtool_ops;
 	struct phy_device *phydev = dev->phydev;
+	struct ethtool_stats stats;
 	u64 *data;
 	int ret, n_stats;
 
-	if (!phydev)
+	if (!phydev && (!ops->get_ethtool_phy_stats || !ops->get_sset_count))
 		return -EOPNOTSUPP;
 
-	n_stats = phy_ethtool_get_sset_count(dev->phydev);
+	if (dev->phydev && !ops->get_ethtool_phy_stats)
+		n_stats = phy_ethtool_get_sset_count(dev->phydev);
+	else
+		n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS);
 	if (n_stats < 0)
 		return n_stats;
 	if (n_stats > S32_MAX / sizeof(u64))
@@ -1995,9 +1994,13 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
 	if (n_stats && !data)
 		return -ENOMEM;
 
-	ret = phy_ethtool_get_stats(dev->phydev, &stats, data);
-	if (ret < 0)
-		return ret;
+	if (dev->phydev && !ops->get_ethtool_phy_stats) {
+		ret = phy_ethtool_get_stats(dev->phydev, &stats, data);
+		if (ret < 0)
+			return ret;
+	} else {
+		ops->get_ethtool_phy_stats(dev, &stats, data);
+	}
 
 	ret = -EFAULT;
 	if (copy_to_user(useraddr, &stats, sizeof(stats)))
-- 
cgit v1.2.3-59-g8ed1b


From 1d1e79f1c6a57dc3750d328ea38a4c385d4edee8 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed, 25 Apr 2018 12:12:49 -0700
Subject: net: dsa: Do not check for ethtool_ops validity

This is completely redundant with what netdev_set_default_ethtool_ops()
does, we are always guaranteed to have a valid dev->ethtool_ops pointer,
however, within that structure, not all function calls may be populated,
so we still have to check them individually.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/master.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/dsa/master.c b/net/dsa/master.c
index 90e6df0351eb..9ec16b39ed15 100644
--- a/net/dsa/master.c
+++ b/net/dsa/master.c
@@ -22,7 +22,7 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev,
 	int port = cpu_dp->index;
 	int count = 0;
 
-	if (ops && ops->get_sset_count && ops->get_ethtool_stats) {
+	if (ops->get_sset_count && ops->get_ethtool_stats) {
 		count = ops->get_sset_count(dev, ETH_SS_STATS);
 		ops->get_ethtool_stats(dev, stats, data);
 	}
@@ -38,7 +38,7 @@ static int dsa_master_get_sset_count(struct net_device *dev, int sset)
 	struct dsa_switch *ds = cpu_dp->ds;
 	int count = 0;
 
-	if (ops && ops->get_sset_count)
+	if (ops->get_sset_count)
 		count += ops->get_sset_count(dev, sset);
 
 	if (sset == ETH_SS_STATS && ds->ops->get_sset_count)
@@ -64,7 +64,7 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset,
 	/* We do not want to be NULL-terminated, since this is a prefix */
 	pfx[sizeof(pfx) - 1] = '_';
 
-	if (ops && ops->get_sset_count && ops->get_strings) {
+	if (ops->get_sset_count && ops->get_strings) {
 		mcount = ops->get_sset_count(dev, ETH_SS_STATS);
 		ops->get_strings(dev, stringset, data);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 89f09048348936a9a8c5131c8538cc6ed26fd44c Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed, 25 Apr 2018 12:12:50 -0700
Subject: net: dsa: Pass stringset to ethtool operations

Up until now we largely assumed that we were interested in ETH_SS_STATS
type of strings for all ethtool operations, this is about to change with
the introduction of additional string sets, e.g: ETH_SS_PHY_STATS.
Update all functions to take an appropriate stringset argument and act
on it when it is different than ETH_SS_STATS for now.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/b53/b53_common.c       | 11 +++++++++--
 drivers/net/dsa/b53/b53_priv.h         |  5 +++--
 drivers/net/dsa/dsa_loop.c             | 11 +++++++++--
 drivers/net/dsa/lan9303-core.c         | 11 +++++++++--
 drivers/net/dsa/microchip/ksz_common.c | 11 +++++++++--
 drivers/net/dsa/mt7530.c               | 11 +++++++++--
 drivers/net/dsa/mv88e6xxx/chip.c       | 10 ++++++++--
 drivers/net/dsa/qca8k.c                | 10 ++++++++--
 include/net/dsa.h                      |  5 +++--
 net/dsa/master.c                       | 21 +++++++++++++--------
 net/dsa/slave.c                        |  5 +++--
 11 files changed, 83 insertions(+), 28 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 78616787f2a3..726b2d8c6fe9 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -806,13 +806,17 @@ static unsigned int b53_get_mib_size(struct b53_device *dev)
 		return B53_MIBS_SIZE;
 }
 
-void b53_get_strings(struct dsa_switch *ds, int port, uint8_t *data)
+void b53_get_strings(struct dsa_switch *ds, int port, u32 stringset,
+		     uint8_t *data)
 {
 	struct b53_device *dev = ds->priv;
 	const struct b53_mib_desc *mibs = b53_get_mib(dev);
 	unsigned int mib_size = b53_get_mib_size(dev);
 	unsigned int i;
 
+	if (stringset != ETH_SS_STATS)
+		return;
+
 	for (i = 0; i < mib_size; i++)
 		strlcpy(data + i * ETH_GSTRING_LEN,
 			mibs[i].name, ETH_GSTRING_LEN);
@@ -852,10 +856,13 @@ void b53_get_ethtool_stats(struct dsa_switch *ds, int port, uint64_t *data)
 }
 EXPORT_SYMBOL(b53_get_ethtool_stats);
 
-int b53_get_sset_count(struct dsa_switch *ds, int port)
+int b53_get_sset_count(struct dsa_switch *ds, int port, int sset)
 {
 	struct b53_device *dev = ds->priv;
 
+	if (sset != ETH_SS_STATS)
+		return 0;
+
 	return b53_get_mib_size(dev);
 }
 EXPORT_SYMBOL(b53_get_sset_count);
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index 1187ebd79287..b933d5cb5c2d 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -286,9 +286,10 @@ static inline int b53_switch_get_reset_gpio(struct b53_device *dev)
 /* Exported functions towards other drivers */
 void b53_imp_vlan_setup(struct dsa_switch *ds, int cpu_port);
 int b53_configure_vlan(struct dsa_switch *ds);
-void b53_get_strings(struct dsa_switch *ds, int port, uint8_t *data);
+void b53_get_strings(struct dsa_switch *ds, int port, u32 stringset,
+		     uint8_t *data);
 void b53_get_ethtool_stats(struct dsa_switch *ds, int port, uint64_t *data);
-int b53_get_sset_count(struct dsa_switch *ds, int port);
+int b53_get_sset_count(struct dsa_switch *ds, int port, int sset);
 int b53_br_join(struct dsa_switch *ds, int port, struct net_device *bridge);
 void b53_br_leave(struct dsa_switch *ds, int port, struct net_device *bridge);
 void b53_br_set_stp_state(struct dsa_switch *ds, int port, u8 state);
diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c
index f77be9f85cb3..9354cc08d3fd 100644
--- a/drivers/net/dsa/dsa_loop.c
+++ b/drivers/net/dsa/dsa_loop.c
@@ -86,16 +86,23 @@ static int dsa_loop_setup(struct dsa_switch *ds)
 	return 0;
 }
 
-static int dsa_loop_get_sset_count(struct dsa_switch *ds, int port)
+static int dsa_loop_get_sset_count(struct dsa_switch *ds, int port, int sset)
 {
+	if (sset != ETH_SS_STATS)
+		return 0;
+
 	return __DSA_LOOP_CNT_MAX;
 }
 
-static void dsa_loop_get_strings(struct dsa_switch *ds, int port, uint8_t *data)
+static void dsa_loop_get_strings(struct dsa_switch *ds, int port,
+				 u32 stringset, uint8_t *data)
 {
 	struct dsa_loop_priv *ps = ds->priv;
 	unsigned int i;
 
+	if (stringset != ETH_SS_STATS)
+		return;
+
 	for (i = 0; i < __DSA_LOOP_CNT_MAX; i++)
 		memcpy(data + i * ETH_GSTRING_LEN,
 		       ps->ports[port].mib[i].name, ETH_GSTRING_LEN);
diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c
index fefa454f3e56..b4f6e1a67dd9 100644
--- a/drivers/net/dsa/lan9303-core.c
+++ b/drivers/net/dsa/lan9303-core.c
@@ -977,10 +977,14 @@ static const struct lan9303_mib_desc lan9303_mib[] = {
 	{ .offset = LAN9303_MAC_TX_LATECOL_0, .name = "TxLateCol", },
 };
 
-static void lan9303_get_strings(struct dsa_switch *ds, int port, uint8_t *data)
+static void lan9303_get_strings(struct dsa_switch *ds, int port,
+				u32 stringset, uint8_t *data)
 {
 	unsigned int u;
 
+	if (stringset != ETH_SS_STATS)
+		return;
+
 	for (u = 0; u < ARRAY_SIZE(lan9303_mib); u++) {
 		strncpy(data + u * ETH_GSTRING_LEN, lan9303_mib[u].name,
 			ETH_GSTRING_LEN);
@@ -1007,8 +1011,11 @@ static void lan9303_get_ethtool_stats(struct dsa_switch *ds, int port,
 	}
 }
 
-static int lan9303_get_sset_count(struct dsa_switch *ds, int port)
+static int lan9303_get_sset_count(struct dsa_switch *ds, int port, int sset)
 {
+	if (sset != ETH_SS_STATS)
+		return 0;
+
 	return ARRAY_SIZE(lan9303_mib);
 }
 
diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index bcb3e6c734f2..7210c49b7922 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -439,15 +439,22 @@ static void ksz_disable_port(struct dsa_switch *ds, int port,
 	ksz_port_cfg(dev, port, REG_PORT_CTRL_0, PORT_MAC_LOOPBACK, true);
 }
 
-static int ksz_sset_count(struct dsa_switch *ds, int port)
+static int ksz_sset_count(struct dsa_switch *ds, int port, int sset)
 {
+	if (sset != ETH_SS_STATS)
+		return 0;
+
 	return TOTAL_SWITCH_COUNTER_NUM;
 }
 
-static void ksz_get_strings(struct dsa_switch *ds, int port, uint8_t *buf)
+static void ksz_get_strings(struct dsa_switch *ds, int port,
+			    u32 stringset, uint8_t *buf)
 {
 	int i;
 
+	if (stringset != ETH_SS_STATS)
+		return;
+
 	for (i = 0; i < TOTAL_SWITCH_COUNTER_NUM; i++) {
 		memcpy(buf + i * ETH_GSTRING_LEN, mib_names[i].string,
 		       ETH_GSTRING_LEN);
diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c
index 80a4dbc3a499..62e486652e62 100644
--- a/drivers/net/dsa/mt7530.c
+++ b/drivers/net/dsa/mt7530.c
@@ -573,10 +573,14 @@ static int mt7530_phy_write(struct dsa_switch *ds, int port, int regnum,
 }
 
 static void
-mt7530_get_strings(struct dsa_switch *ds, int port, uint8_t *data)
+mt7530_get_strings(struct dsa_switch *ds, int port, u32 stringset,
+		   uint8_t *data)
 {
 	int i;
 
+	if (stringset != ETH_SS_STATS)
+		return;
+
 	for (i = 0; i < ARRAY_SIZE(mt7530_mib); i++)
 		strncpy(data + i * ETH_GSTRING_LEN, mt7530_mib[i].name,
 			ETH_GSTRING_LEN);
@@ -604,8 +608,11 @@ mt7530_get_ethtool_stats(struct dsa_switch *ds, int port,
 }
 
 static int
-mt7530_get_sset_count(struct dsa_switch *ds, int port)
+mt7530_get_sset_count(struct dsa_switch *ds, int port, int sset)
 {
+	if (sset != ETH_SS_STATS)
+		return 0;
+
 	return ARRAY_SIZE(mt7530_mib);
 }
 
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 3d2091099f7f..8f92ccc0dd54 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -742,11 +742,14 @@ static void mv88e6xxx_atu_vtu_get_strings(uint8_t *data)
 }
 
 static void mv88e6xxx_get_strings(struct dsa_switch *ds, int port,
-				  uint8_t *data)
+				  u32 stringset, uint8_t *data)
 {
 	struct mv88e6xxx_chip *chip = ds->priv;
 	int count = 0;
 
+	if (stringset != ETH_SS_STATS)
+		return;
+
 	mutex_lock(&chip->reg_lock);
 
 	if (chip->info->ops->stats_get_strings)
@@ -789,12 +792,15 @@ static int mv88e6320_stats_get_sset_count(struct mv88e6xxx_chip *chip)
 					      STATS_TYPE_BANK1);
 }
 
-static int mv88e6xxx_get_sset_count(struct dsa_switch *ds, int port)
+static int mv88e6xxx_get_sset_count(struct dsa_switch *ds, int port, int sset)
 {
 	struct mv88e6xxx_chip *chip = ds->priv;
 	int serdes_count = 0;
 	int count = 0;
 
+	if (sset != ETH_SS_STATS)
+		return 0;
+
 	mutex_lock(&chip->reg_lock);
 	if (chip->info->ops->stats_get_sset_count)
 		count = chip->info->ops->stats_get_sset_count(chip);
diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c
index 600d5ad1fbde..757b6d90ea36 100644
--- a/drivers/net/dsa/qca8k.c
+++ b/drivers/net/dsa/qca8k.c
@@ -600,10 +600,13 @@ qca8k_phy_write(struct dsa_switch *ds, int phy, int regnum, u16 val)
 }
 
 static void
-qca8k_get_strings(struct dsa_switch *ds, int port, uint8_t *data)
+qca8k_get_strings(struct dsa_switch *ds, int port, u32 stringset, uint8_t *data)
 {
 	int i;
 
+	if (stringset != ETH_SS_STATS)
+		return;
+
 	for (i = 0; i < ARRAY_SIZE(ar8327_mib); i++)
 		strncpy(data + i * ETH_GSTRING_LEN, ar8327_mib[i].name,
 			ETH_GSTRING_LEN);
@@ -631,8 +634,11 @@ qca8k_get_ethtool_stats(struct dsa_switch *ds, int port,
 }
 
 static int
-qca8k_get_sset_count(struct dsa_switch *ds, int port)
+qca8k_get_sset_count(struct dsa_switch *ds, int port, int sset)
 {
+	if (sset != ETH_SS_STATS)
+		return 0;
+
 	return ARRAY_SIZE(ar8327_mib);
 }
 
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 60fb4ec8ba61..0bc0aad1b02e 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -356,10 +356,11 @@ struct dsa_switch_ops {
 	/*
 	 * ethtool hardware statistics.
 	 */
-	void	(*get_strings)(struct dsa_switch *ds, int port, uint8_t *data);
+	void	(*get_strings)(struct dsa_switch *ds, int port,
+			       u32 stringset, uint8_t *data);
 	void	(*get_ethtool_stats)(struct dsa_switch *ds,
 				     int port, uint64_t *data);
-	int	(*get_sset_count)(struct dsa_switch *ds, int port);
+	int	(*get_sset_count)(struct dsa_switch *ds, int port, int sset);
 
 	/*
 	 * ethtool Wake-on-LAN
diff --git a/net/dsa/master.c b/net/dsa/master.c
index 9ec16b39ed15..8d27687fd0ca 100644
--- a/net/dsa/master.c
+++ b/net/dsa/master.c
@@ -38,11 +38,14 @@ static int dsa_master_get_sset_count(struct net_device *dev, int sset)
 	struct dsa_switch *ds = cpu_dp->ds;
 	int count = 0;
 
-	if (ops->get_sset_count)
-		count += ops->get_sset_count(dev, sset);
+	if (ops->get_sset_count) {
+		count = ops->get_sset_count(dev, sset);
+		if (count < 0)
+			count = 0;
+	}
 
-	if (sset == ETH_SS_STATS && ds->ops->get_sset_count)
-		count += ds->ops->get_sset_count(ds, cpu_dp->index);
+	if (ds->ops->get_sset_count)
+		count += ds->ops->get_sset_count(ds, cpu_dp->index, sset);
 
 	return count;
 }
@@ -65,18 +68,20 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset,
 	pfx[sizeof(pfx) - 1] = '_';
 
 	if (ops->get_sset_count && ops->get_strings) {
-		mcount = ops->get_sset_count(dev, ETH_SS_STATS);
+		mcount = ops->get_sset_count(dev, stringset);
+		if (mcount < 0)
+			mcount = 0;
 		ops->get_strings(dev, stringset, data);
 	}
 
-	if (stringset == ETH_SS_STATS && ds->ops->get_strings) {
+	if (ds->ops->get_strings) {
 		ndata = data + mcount * len;
 		/* This function copies ETH_GSTRINGS_LEN bytes, we will mangle
 		 * the output after to prepend our CPU port prefix we
 		 * constructed earlier
 		 */
-		ds->ops->get_strings(ds, port, ndata);
-		count = ds->ops->get_sset_count(ds, port);
+		ds->ops->get_strings(ds, port, stringset, ndata);
+		count = ds->ops->get_sset_count(ds, port, stringset);
 		for (i = 0; i < count; i++) {
 			memmove(ndata + (i * len + sizeof(pfx)),
 				ndata + i * len, len - sizeof(pfx));
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 18561af7a8f1..f3fb3a0880b1 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -560,7 +560,8 @@ static void dsa_slave_get_strings(struct net_device *dev,
 		strncpy(data + 2 * len, "rx_packets", len);
 		strncpy(data + 3 * len, "rx_bytes", len);
 		if (ds->ops->get_strings)
-			ds->ops->get_strings(ds, dp->index, data + 4 * len);
+			ds->ops->get_strings(ds, dp->index, stringset,
+					     data + 4 * len);
 	}
 }
 
@@ -605,7 +606,7 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
 
 		count = 4;
 		if (ds->ops->get_sset_count)
-			count += ds->ops->get_sset_count(ds, dp->index);
+			count += ds->ops->get_sset_count(ds, dp->index, sset);
 
 		return count;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 6207a78c098a183df1620c1e2cbf620a4cc7c3e3 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed, 25 Apr 2018 12:12:51 -0700
Subject: net: dsa: Add helper function to obtain PHY device of a given port

In preparation for having more call sites attempting to obtain a
reference against a PHY device corresponding to a particular port,
introduce a helper function for that purpose.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/port.c | 33 ++++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/net/dsa/port.c b/net/dsa/port.c
index 7acc1169d75e..5e2a88720a9a 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -273,25 +273,38 @@ int dsa_port_vlan_del(struct dsa_port *dp,
 	return 0;
 }
 
-static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable)
+static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp)
 {
-	struct device_node *port_dn = dp->dn;
 	struct device_node *phy_dn;
-	struct dsa_switch *ds = dp->ds;
 	struct phy_device *phydev;
-	int port = dp->index;
-	int err = 0;
 
-	phy_dn = of_parse_phandle(port_dn, "phy-handle", 0);
+	phy_dn = of_parse_phandle(dp->dn, "phy-handle", 0);
 	if (!phy_dn)
-		return 0;
+		return NULL;
 
 	phydev = of_phy_find_device(phy_dn);
 	if (!phydev) {
-		err = -EPROBE_DEFER;
-		goto err_put_of;
+		of_node_put(phy_dn);
+		return ERR_PTR(-EPROBE_DEFER);
 	}
 
+	return phydev;
+}
+
+static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable)
+{
+	struct dsa_switch *ds = dp->ds;
+	struct phy_device *phydev;
+	int port = dp->index;
+	int err = 0;
+
+	phydev = dsa_port_get_phy_device(dp);
+	if (!phydev)
+		return 0;
+
+	if (IS_ERR(phydev))
+		return PTR_ERR(phydev);
+
 	if (enable) {
 		err = genphy_config_init(phydev);
 		if (err < 0)
@@ -317,8 +330,6 @@ static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable)
 
 err_put_dev:
 	put_device(&phydev->mdio.dev);
-err_put_of:
-	of_node_put(phy_dn);
 	return err;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From cf963573039a333eff7156629e61a06e59da61cf Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed, 25 Apr 2018 12:12:52 -0700
Subject: net: dsa: Allow providing PHY statistics from CPU port

Implement the same type of ethtool diversion that we have for
ETH_SS_STATS and make it work with ETH_SS_PHY_STATS. This allows
providing PHY level statistics for CPU ports that are directly
connecting to a PHY device.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h |  7 +++++++
 net/dsa/master.c  | 47 ++++++++++++++++++++++++++++++++++++++++-----
 net/dsa/port.c    | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 106 insertions(+), 5 deletions(-)

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 0bc0aad1b02e..462e9741b210 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -361,6 +361,8 @@ struct dsa_switch_ops {
 	void	(*get_ethtool_stats)(struct dsa_switch *ds,
 				     int port, uint64_t *data);
 	int	(*get_sset_count)(struct dsa_switch *ds, int port, int sset);
+	void	(*get_ethtool_phy_stats)(struct dsa_switch *ds,
+					 int port, uint64_t *data);
 
 	/*
 	 * ethtool Wake-on-LAN
@@ -589,4 +591,9 @@ static inline int call_dsa_notifiers(unsigned long val, struct net_device *dev,
 #define BRCM_TAG_GET_PORT(v)		((v) >> 8)
 #define BRCM_TAG_GET_QUEUE(v)		((v) & 0xff)
 
+
+int dsa_port_get_phy_strings(struct dsa_port *dp, uint8_t *data);
+int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data);
+int dsa_port_get_phy_sset_count(struct dsa_port *dp);
+
 #endif
diff --git a/net/dsa/master.c b/net/dsa/master.c
index 8d27687fd0ca..c90ee3227dea 100644
--- a/net/dsa/master.c
+++ b/net/dsa/master.c
@@ -31,6 +31,32 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev,
 		ds->ops->get_ethtool_stats(ds, port, data + count);
 }
 
+static void dsa_master_get_ethtool_phy_stats(struct net_device *dev,
+					     struct ethtool_stats *stats,
+					     uint64_t *data)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+	struct dsa_switch *ds = cpu_dp->ds;
+	int port = cpu_dp->index;
+	int count = 0;
+
+	if (dev->phydev && !ops->get_ethtool_phy_stats) {
+		count = phy_ethtool_get_sset_count(dev->phydev);
+		if (count >= 0)
+			phy_ethtool_get_stats(dev->phydev, stats, data);
+	} else if (ops->get_sset_count && ops->get_ethtool_phy_stats) {
+		count = ops->get_sset_count(dev, ETH_SS_PHY_STATS);
+		ops->get_ethtool_phy_stats(dev, stats, data);
+	}
+
+	if (count < 0)
+		count = 0;
+
+	if (ds->ops->get_ethtool_phy_stats)
+		ds->ops->get_ethtool_phy_stats(ds, port, data + count);
+}
+
 static int dsa_master_get_sset_count(struct net_device *dev, int sset)
 {
 	struct dsa_port *cpu_dp = dev->dsa_ptr;
@@ -38,11 +64,14 @@ static int dsa_master_get_sset_count(struct net_device *dev, int sset)
 	struct dsa_switch *ds = cpu_dp->ds;
 	int count = 0;
 
-	if (ops->get_sset_count) {
+	if (sset == ETH_SS_PHY_STATS && dev->phydev &&
+	    !ops->get_ethtool_phy_stats)
+		count = phy_ethtool_get_sset_count(dev->phydev);
+	else if (ops->get_sset_count)
 		count = ops->get_sset_count(dev, sset);
-		if (count < 0)
-			count = 0;
-	}
+
+	if (count < 0)
+		count = 0;
 
 	if (ds->ops->get_sset_count)
 		count += ds->ops->get_sset_count(ds, cpu_dp->index, sset);
@@ -67,7 +96,14 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset,
 	/* We do not want to be NULL-terminated, since this is a prefix */
 	pfx[sizeof(pfx) - 1] = '_';
 
-	if (ops->get_sset_count && ops->get_strings) {
+	if (stringset == ETH_SS_PHY_STATS && dev->phydev &&
+	    !ops->get_ethtool_phy_stats) {
+		mcount = phy_ethtool_get_sset_count(dev->phydev);
+		if (mcount < 0)
+			mcount = 0;
+		else
+			phy_ethtool_get_strings(dev->phydev, data);
+	} else if (ops->get_sset_count && ops->get_strings) {
 		mcount = ops->get_sset_count(dev, stringset);
 		if (mcount < 0)
 			mcount = 0;
@@ -107,6 +143,7 @@ static int dsa_master_ethtool_setup(struct net_device *dev)
 	ops->get_sset_count = dsa_master_get_sset_count;
 	ops->get_ethtool_stats = dsa_master_get_ethtool_stats;
 	ops->get_strings = dsa_master_get_strings;
+	ops->get_ethtool_phy_stats = dsa_master_get_ethtool_phy_stats;
 
 	dev->ethtool_ops = ops;
 
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 5e2a88720a9a..2413beb995be 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -383,3 +383,60 @@ void dsa_port_link_unregister_of(struct dsa_port *dp)
 	else
 		dsa_port_setup_phy_of(dp, false);
 }
+
+int dsa_port_get_phy_strings(struct dsa_port *dp, uint8_t *data)
+{
+	struct phy_device *phydev;
+	int ret = -EOPNOTSUPP;
+
+	if (of_phy_is_fixed_link(dp->dn))
+		return ret;
+
+	phydev = dsa_port_get_phy_device(dp);
+	if (IS_ERR_OR_NULL(phydev))
+		return ret;
+
+	ret = phy_ethtool_get_strings(phydev, data);
+	put_device(&phydev->mdio.dev);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dsa_port_get_phy_strings);
+
+int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data)
+{
+	struct phy_device *phydev;
+	int ret = -EOPNOTSUPP;
+
+	if (of_phy_is_fixed_link(dp->dn))
+		return ret;
+
+	phydev = dsa_port_get_phy_device(dp);
+	if (IS_ERR_OR_NULL(phydev))
+		return ret;
+
+	ret = phy_ethtool_get_stats(phydev, NULL, data);
+	put_device(&phydev->mdio.dev);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dsa_port_get_ethtool_phy_stats);
+
+int dsa_port_get_phy_sset_count(struct dsa_port *dp)
+{
+	struct phy_device *phydev;
+	int ret = -EOPNOTSUPP;
+
+	if (of_phy_is_fixed_link(dp->dn))
+		return ret;
+
+	phydev = dsa_port_get_phy_device(dp);
+	if (IS_ERR_OR_NULL(phydev))
+		return ret;
+
+	ret = phy_ethtool_get_sset_count(phydev);
+	put_device(&phydev->mdio.dev);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dsa_port_get_phy_sset_count);
-- 
cgit v1.2.3-59-g8ed1b


From c7d28c9df292a49904446dca15b2037ee8f874af Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed, 25 Apr 2018 12:12:53 -0700
Subject: net: dsa: b53: Add support for reading PHY statistics

Allow the b53 driver to return PHY statistics when the CPU port used is
different than 5, 7 or 8, because those are typically PHY-less on most
devices. This is useful for debugging link problems between the switch
and an external host when using a non standard CPU port number (e.g: 4).

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/b53/b53_common.c | 78 ++++++++++++++++++++++++++++++++++------
 drivers/net/dsa/b53/b53_priv.h   |  1 +
 drivers/net/dsa/bcm_sf2.c        |  1 +
 3 files changed, 70 insertions(+), 10 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 726b2d8c6fe9..9f561fe505cb 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -806,20 +806,39 @@ static unsigned int b53_get_mib_size(struct b53_device *dev)
 		return B53_MIBS_SIZE;
 }
 
+static struct phy_device *b53_get_phy_device(struct dsa_switch *ds, int port)
+{
+	/* These ports typically do not have built-in PHYs */
+	switch (port) {
+	case B53_CPU_PORT_25:
+	case 7:
+	case B53_CPU_PORT:
+		return NULL;
+	}
+
+	return mdiobus_get_phy(ds->slave_mii_bus, port);
+}
+
 void b53_get_strings(struct dsa_switch *ds, int port, u32 stringset,
 		     uint8_t *data)
 {
 	struct b53_device *dev = ds->priv;
 	const struct b53_mib_desc *mibs = b53_get_mib(dev);
 	unsigned int mib_size = b53_get_mib_size(dev);
+	struct phy_device *phydev;
 	unsigned int i;
 
-	if (stringset != ETH_SS_STATS)
-		return;
+	if (stringset == ETH_SS_STATS) {
+		for (i = 0; i < mib_size; i++)
+			strlcpy(data + i * ETH_GSTRING_LEN,
+				mibs[i].name, ETH_GSTRING_LEN);
+	} else if (stringset == ETH_SS_PHY_STATS) {
+		phydev = b53_get_phy_device(ds, port);
+		if (!phydev)
+			return;
 
-	for (i = 0; i < mib_size; i++)
-		strlcpy(data + i * ETH_GSTRING_LEN,
-			mibs[i].name, ETH_GSTRING_LEN);
+		phy_ethtool_get_strings(phydev, data);
+	}
 }
 EXPORT_SYMBOL(b53_get_strings);
 
@@ -856,14 +875,34 @@ void b53_get_ethtool_stats(struct dsa_switch *ds, int port, uint64_t *data)
 }
 EXPORT_SYMBOL(b53_get_ethtool_stats);
 
+void b53_get_ethtool_phy_stats(struct dsa_switch *ds, int port, uint64_t *data)
+{
+	struct phy_device *phydev;
+
+	phydev = b53_get_phy_device(ds, port);
+	if (!phydev)
+		return;
+
+	phy_ethtool_get_stats(phydev, NULL, data);
+}
+EXPORT_SYMBOL(b53_get_ethtool_phy_stats);
+
 int b53_get_sset_count(struct dsa_switch *ds, int port, int sset)
 {
 	struct b53_device *dev = ds->priv;
+	struct phy_device *phydev;
 
-	if (sset != ETH_SS_STATS)
-		return 0;
+	if (sset == ETH_SS_STATS) {
+		return b53_get_mib_size(dev);
+	} else if (sset == ETH_SS_PHY_STATS) {
+		phydev = b53_get_phy_device(ds, port);
+		if (!phydev)
+			return 0;
+
+		return phy_ethtool_get_sset_count(phydev);
+	}
 
-	return b53_get_mib_size(dev);
+	return 0;
 }
 EXPORT_SYMBOL(b53_get_sset_count);
 
@@ -1484,7 +1523,7 @@ void b53_br_fast_age(struct dsa_switch *ds, int port)
 }
 EXPORT_SYMBOL(b53_br_fast_age);
 
-static bool b53_can_enable_brcm_tags(struct dsa_switch *ds, int port)
+static bool b53_possible_cpu_port(struct dsa_switch *ds, int port)
 {
 	/* Broadcom switches will accept enabling Broadcom tags on the
 	 * following ports: 5, 7 and 8, any other port is not supported
@@ -1496,10 +1535,19 @@ static bool b53_can_enable_brcm_tags(struct dsa_switch *ds, int port)
 		return true;
 	}
 
-	dev_warn(ds->dev, "Port %d is not Broadcom tag capable\n", port);
 	return false;
 }
 
+static bool b53_can_enable_brcm_tags(struct dsa_switch *ds, int port)
+{
+	bool ret = b53_possible_cpu_port(ds, port);
+
+	if (!ret)
+		dev_warn(ds->dev, "Port %d is not Broadcom tag capable\n",
+			 port);
+	return ret;
+}
+
 enum dsa_tag_protocol b53_get_tag_protocol(struct dsa_switch *ds, int port)
 {
 	struct b53_device *dev = ds->priv;
@@ -1657,6 +1705,7 @@ static const struct dsa_switch_ops b53_switch_ops = {
 	.get_strings		= b53_get_strings,
 	.get_ethtool_stats	= b53_get_ethtool_stats,
 	.get_sset_count		= b53_get_sset_count,
+	.get_ethtool_phy_stats	= b53_get_ethtool_phy_stats,
 	.phy_read		= b53_phy_read16,
 	.phy_write		= b53_phy_write16,
 	.adjust_link		= b53_adjust_link,
@@ -1961,6 +2010,15 @@ static int b53_switch_init(struct b53_device *dev)
 	dev->num_ports = dev->cpu_port + 1;
 	dev->enabled_ports |= BIT(dev->cpu_port);
 
+	/* Include non standard CPU port built-in PHYs to be probed */
+	if (is539x(dev) || is531x5(dev)) {
+		for (i = 0; i < dev->num_ports; i++) {
+			if (!(dev->ds->phys_mii_mask & BIT(i)) &&
+			    !b53_possible_cpu_port(dev->ds, i))
+				dev->ds->phys_mii_mask |= BIT(i);
+		}
+	}
+
 	dev->ports = devm_kzalloc(dev->dev,
 				  sizeof(struct b53_port) * dev->num_ports,
 				  GFP_KERNEL);
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index b933d5cb5c2d..cc284a514de9 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -290,6 +290,7 @@ void b53_get_strings(struct dsa_switch *ds, int port, u32 stringset,
 		     uint8_t *data);
 void b53_get_ethtool_stats(struct dsa_switch *ds, int port, uint64_t *data);
 int b53_get_sset_count(struct dsa_switch *ds, int port, int sset);
+void b53_get_ethtool_phy_stats(struct dsa_switch *ds, int port, uint64_t *data);
 int b53_br_join(struct dsa_switch *ds, int port, struct net_device *bridge);
 void b53_br_leave(struct dsa_switch *ds, int port, struct net_device *bridge);
 void b53_br_set_stp_state(struct dsa_switch *ds, int port, u8 state);
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 0378eded31f2..97236cfcbae4 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -859,6 +859,7 @@ static const struct dsa_switch_ops bcm_sf2_ops = {
 	.get_strings		= b53_get_strings,
 	.get_ethtool_stats	= b53_get_ethtool_stats,
 	.get_sset_count		= b53_get_sset_count,
+	.get_ethtool_phy_stats	= b53_get_ethtool_phy_stats,
 	.get_phy_flags		= bcm_sf2_sw_get_phy_flags,
 	.adjust_link		= bcm_sf2_sw_adjust_link,
 	.fixed_link_update	= bcm_sf2_sw_fixed_link_update,
-- 
cgit v1.2.3-59-g8ed1b


From 96cbddcd52e76d9052948e408b17bedc8aa1c11a Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed, 25 Apr 2018 12:12:54 -0700
Subject: net: dsa: loop: Hook PHY statistics

We just return the same statistics through ethtool_get_stats() and
ethtool_get_phy_stats() for simplicity since this is just a mock-up driver.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/dsa_loop.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c
index 9354cc08d3fd..58f14af04639 100644
--- a/drivers/net/dsa/dsa_loop.c
+++ b/drivers/net/dsa/dsa_loop.c
@@ -88,7 +88,7 @@ static int dsa_loop_setup(struct dsa_switch *ds)
 
 static int dsa_loop_get_sset_count(struct dsa_switch *ds, int port, int sset)
 {
-	if (sset != ETH_SS_STATS)
+	if (sset != ETH_SS_STATS && sset != ETH_SS_PHY_STATS)
 		return 0;
 
 	return __DSA_LOOP_CNT_MAX;
@@ -100,7 +100,7 @@ static void dsa_loop_get_strings(struct dsa_switch *ds, int port,
 	struct dsa_loop_priv *ps = ds->priv;
 	unsigned int i;
 
-	if (stringset != ETH_SS_STATS)
+	if (stringset != ETH_SS_STATS && stringset != ETH_SS_PHY_STATS)
 		return;
 
 	for (i = 0; i < __DSA_LOOP_CNT_MAX; i++)
@@ -263,6 +263,7 @@ static const struct dsa_switch_ops dsa_loop_driver = {
 	.get_strings		= dsa_loop_get_strings,
 	.get_ethtool_stats	= dsa_loop_get_ethtool_stats,
 	.get_sset_count		= dsa_loop_get_sset_count,
+	.get_ethtool_phy_stats	= dsa_loop_get_ethtool_stats,
 	.phy_read		= dsa_loop_phy_read,
 	.phy_write		= dsa_loop_phy_write,
 	.port_bridge_join	= dsa_loop_port_bridge_join,
-- 
cgit v1.2.3-59-g8ed1b


From 1ecd6e8ad9964273d2b540196784bc3c9b17053b Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 27 Apr 2018 13:11:25 -0400
Subject: phy: Temporary build fix after phylib changes.

Make PHYLIB boolean, because we reference phylib provided symbols now
from net/core/ethtool.c and therefore 'm' doesn't work.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/Kconfig | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index edb8b9ab827f..7c5e8c1e9370 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -9,7 +9,6 @@ menuconfig MDIO_DEVICE
 
 config MDIO_BUS
 	tristate
-	default m if PHYLIB=m
 	default MDIO_DEVICE
 	help
 	  This internal symbol is used for link time dependencies and it
@@ -171,7 +170,7 @@ config PHYLINK
 	  autonegotiation modes.
 
 menuconfig PHYLIB
-	tristate "PHY Device support and infrastructure"
+	bool "PHY Device support and infrastructure"
 	depends on NETDEVICES
 	select MDIO_DEVICE
 	help
-- 
cgit v1.2.3-59-g8ed1b


From f058ca6b06d4c4685201de3760b1962641aad445 Mon Sep 17 00:00:00 2001
From: Pradeep Nalla <pradeep.nalla@cavium.com>
Date: Wed, 25 Apr 2018 17:00:12 -0700
Subject: liquidio: move a couple of functions to lio_core.c

To support the next patch in this series which has code that calls
octnet_get_link_stats from two different .c files, move that function (and
its dependency octnet_nic_stats_callback) to lio_core.c.  Remove
octnet_get_link_stats's static declaration and add its function prototype
in octeon_network.h.

Signed-off-by: Pradeep Nalla <pradeep.nalla@cavium.com>
Acked-by: Raghu Vatsavayi <raghu.vatsavayi@cavium.com>
Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/cavium/liquidio/lio_core.c    | 152 ++++++++++++++++++++
 drivers/net/ethernet/cavium/liquidio/lio_ethtool.c | 157 ---------------------
 .../net/ethernet/cavium/liquidio/octeon_network.h  |   2 +
 3 files changed, 154 insertions(+), 157 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_core.c b/drivers/net/ethernet/cavium/liquidio/lio_core.c
index 2a94eee943b2..334a068b6a62 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_core.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_core.c
@@ -1169,3 +1169,155 @@ int lio_wait_for_clean_oq(struct octeon_device *oct)
 
 	return pending_pkts;
 }
+
+static void
+octnet_nic_stats_callback(struct octeon_device *oct_dev,
+			  u32 status, void *ptr)
+{
+	struct octeon_soft_command *sc = (struct octeon_soft_command *)ptr;
+	struct oct_nic_stats_resp *resp =
+	    (struct oct_nic_stats_resp *)sc->virtrptr;
+	struct oct_nic_stats_ctrl *ctrl =
+	    (struct oct_nic_stats_ctrl *)sc->ctxptr;
+	struct nic_rx_stats *rsp_rstats = &resp->stats.fromwire;
+	struct nic_tx_stats *rsp_tstats = &resp->stats.fromhost;
+	struct nic_rx_stats *rstats = &oct_dev->link_stats.fromwire;
+	struct nic_tx_stats *tstats = &oct_dev->link_stats.fromhost;
+
+	if (status != OCTEON_REQUEST_TIMEOUT && !resp->status) {
+		octeon_swap_8B_data((u64 *)&resp->stats,
+				    (sizeof(struct oct_link_stats)) >> 3);
+
+		/* RX link-level stats */
+		rstats->total_rcvd = rsp_rstats->total_rcvd;
+		rstats->bytes_rcvd = rsp_rstats->bytes_rcvd;
+		rstats->total_bcst = rsp_rstats->total_bcst;
+		rstats->total_mcst = rsp_rstats->total_mcst;
+		rstats->runts      = rsp_rstats->runts;
+		rstats->ctl_rcvd   = rsp_rstats->ctl_rcvd;
+		/* Accounts for over/under-run of buffers */
+		rstats->fifo_err  = rsp_rstats->fifo_err;
+		rstats->dmac_drop = rsp_rstats->dmac_drop;
+		rstats->fcs_err   = rsp_rstats->fcs_err;
+		rstats->jabber_err = rsp_rstats->jabber_err;
+		rstats->l2_err    = rsp_rstats->l2_err;
+		rstats->frame_err = rsp_rstats->frame_err;
+		rstats->red_drops = rsp_rstats->red_drops;
+
+		/* RX firmware stats */
+		rstats->fw_total_rcvd = rsp_rstats->fw_total_rcvd;
+		rstats->fw_total_fwd = rsp_rstats->fw_total_fwd;
+		rstats->fw_err_pko = rsp_rstats->fw_err_pko;
+		rstats->fw_err_link = rsp_rstats->fw_err_link;
+		rstats->fw_err_drop = rsp_rstats->fw_err_drop;
+		rstats->fw_rx_vxlan = rsp_rstats->fw_rx_vxlan;
+		rstats->fw_rx_vxlan_err = rsp_rstats->fw_rx_vxlan_err;
+
+		/* Number of packets that are LROed      */
+		rstats->fw_lro_pkts = rsp_rstats->fw_lro_pkts;
+		/* Number of octets that are LROed       */
+		rstats->fw_lro_octs = rsp_rstats->fw_lro_octs;
+		/* Number of LRO packets formed          */
+		rstats->fw_total_lro = rsp_rstats->fw_total_lro;
+		/* Number of times lRO of packet aborted */
+		rstats->fw_lro_aborts = rsp_rstats->fw_lro_aborts;
+		rstats->fw_lro_aborts_port = rsp_rstats->fw_lro_aborts_port;
+		rstats->fw_lro_aborts_seq = rsp_rstats->fw_lro_aborts_seq;
+		rstats->fw_lro_aborts_tsval = rsp_rstats->fw_lro_aborts_tsval;
+		rstats->fw_lro_aborts_timer = rsp_rstats->fw_lro_aborts_timer;
+		/* intrmod: packet forward rate */
+		rstats->fwd_rate = rsp_rstats->fwd_rate;
+
+		/* TX link-level stats */
+		tstats->total_pkts_sent = rsp_tstats->total_pkts_sent;
+		tstats->total_bytes_sent = rsp_tstats->total_bytes_sent;
+		tstats->mcast_pkts_sent = rsp_tstats->mcast_pkts_sent;
+		tstats->bcast_pkts_sent = rsp_tstats->bcast_pkts_sent;
+		tstats->ctl_sent = rsp_tstats->ctl_sent;
+		/* Packets sent after one collision*/
+		tstats->one_collision_sent = rsp_tstats->one_collision_sent;
+		/* Packets sent after multiple collision*/
+		tstats->multi_collision_sent = rsp_tstats->multi_collision_sent;
+		/* Packets not sent due to max collisions */
+		tstats->max_collision_fail = rsp_tstats->max_collision_fail;
+		/* Packets not sent due to max deferrals */
+		tstats->max_deferral_fail = rsp_tstats->max_deferral_fail;
+		/* Accounts for over/under-run of buffers */
+		tstats->fifo_err = rsp_tstats->fifo_err;
+		tstats->runts = rsp_tstats->runts;
+		/* Total number of collisions detected */
+		tstats->total_collisions = rsp_tstats->total_collisions;
+
+		/* firmware stats */
+		tstats->fw_total_sent = rsp_tstats->fw_total_sent;
+		tstats->fw_total_fwd = rsp_tstats->fw_total_fwd;
+		tstats->fw_err_pko = rsp_tstats->fw_err_pko;
+		tstats->fw_err_pki = rsp_tstats->fw_err_pki;
+		tstats->fw_err_link = rsp_tstats->fw_err_link;
+		tstats->fw_err_drop = rsp_tstats->fw_err_drop;
+		tstats->fw_tso = rsp_tstats->fw_tso;
+		tstats->fw_tso_fwd = rsp_tstats->fw_tso_fwd;
+		tstats->fw_err_tso = rsp_tstats->fw_err_tso;
+		tstats->fw_tx_vxlan = rsp_tstats->fw_tx_vxlan;
+
+		resp->status = 1;
+	} else {
+		resp->status = -1;
+	}
+	complete(&ctrl->complete);
+}
+
+int octnet_get_link_stats(struct net_device *netdev)
+{
+	struct lio *lio = GET_LIO(netdev);
+	struct octeon_device *oct_dev = lio->oct_dev;
+	struct octeon_soft_command *sc;
+	struct oct_nic_stats_ctrl *ctrl;
+	struct oct_nic_stats_resp *resp;
+	int retval;
+
+	/* Alloc soft command */
+	sc = (struct octeon_soft_command *)
+		octeon_alloc_soft_command(oct_dev,
+					  0,
+					  sizeof(struct oct_nic_stats_resp),
+					  sizeof(struct octnic_ctrl_pkt));
+
+	if (!sc)
+		return -ENOMEM;
+
+	resp = (struct oct_nic_stats_resp *)sc->virtrptr;
+	memset(resp, 0, sizeof(struct oct_nic_stats_resp));
+
+	ctrl = (struct oct_nic_stats_ctrl *)sc->ctxptr;
+	memset(ctrl, 0, sizeof(struct oct_nic_stats_ctrl));
+	ctrl->netdev = netdev;
+	init_completion(&ctrl->complete);
+
+	sc->iq_no = lio->linfo.txpciq[0].s.q_no;
+
+	octeon_prepare_soft_command(oct_dev, sc, OPCODE_NIC,
+				    OPCODE_NIC_PORT_STATS, 0, 0, 0);
+
+	sc->callback = octnet_nic_stats_callback;
+	sc->callback_arg = sc;
+	sc->wait_time = 500;	/*in milli seconds*/
+
+	retval = octeon_send_soft_command(oct_dev, sc);
+	if (retval == IQ_SEND_FAILED) {
+		octeon_free_soft_command(oct_dev, sc);
+		return -EINVAL;
+	}
+
+	wait_for_completion_timeout(&ctrl->complete, msecs_to_jiffies(1000));
+
+	if (resp->status != 1) {
+		octeon_free_soft_command(oct_dev, sc);
+
+		return -EINVAL;
+	}
+
+	octeon_free_soft_command(oct_dev, sc);
+
+	return 0;
+}
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
index 9926a12dd805..351549c83472 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
@@ -32,7 +32,6 @@
 #include "cn23xx_vf_device.h"
 
 static int lio_reset_queues(struct net_device *netdev, uint32_t num_qs);
-static int octnet_get_link_stats(struct net_device *netdev);
 
 struct oct_intrmod_context {
 	int octeon_id;
@@ -1776,162 +1775,6 @@ static int octnet_set_intrmod_cfg(struct lio *lio,
 	return -EINTR;
 }
 
-static void
-octnet_nic_stats_callback(struct octeon_device *oct_dev,
-			  u32 status, void *ptr)
-{
-	struct octeon_soft_command *sc = (struct octeon_soft_command *)ptr;
-	struct oct_nic_stats_resp *resp =
-	    (struct oct_nic_stats_resp *)sc->virtrptr;
-	struct oct_nic_stats_ctrl *ctrl =
-	    (struct oct_nic_stats_ctrl *)sc->ctxptr;
-	struct nic_rx_stats *rsp_rstats = &resp->stats.fromwire;
-	struct nic_tx_stats *rsp_tstats = &resp->stats.fromhost;
-
-	struct nic_rx_stats *rstats = &oct_dev->link_stats.fromwire;
-	struct nic_tx_stats *tstats = &oct_dev->link_stats.fromhost;
-
-	if ((status != OCTEON_REQUEST_TIMEOUT) && !resp->status) {
-		octeon_swap_8B_data((u64 *)&resp->stats,
-				    (sizeof(struct oct_link_stats)) >> 3);
-
-		/* RX link-level stats */
-		rstats->total_rcvd = rsp_rstats->total_rcvd;
-		rstats->bytes_rcvd = rsp_rstats->bytes_rcvd;
-		rstats->total_bcst = rsp_rstats->total_bcst;
-		rstats->total_mcst = rsp_rstats->total_mcst;
-		rstats->runts      = rsp_rstats->runts;
-		rstats->ctl_rcvd   = rsp_rstats->ctl_rcvd;
-		/* Accounts for over/under-run of buffers */
-		rstats->fifo_err  = rsp_rstats->fifo_err;
-		rstats->dmac_drop = rsp_rstats->dmac_drop;
-		rstats->fcs_err   = rsp_rstats->fcs_err;
-		rstats->jabber_err = rsp_rstats->jabber_err;
-		rstats->l2_err    = rsp_rstats->l2_err;
-		rstats->frame_err = rsp_rstats->frame_err;
-		rstats->red_drops = rsp_rstats->red_drops;
-
-		/* RX firmware stats */
-		rstats->fw_total_rcvd = rsp_rstats->fw_total_rcvd;
-		rstats->fw_total_fwd = rsp_rstats->fw_total_fwd;
-		rstats->fw_err_pko = rsp_rstats->fw_err_pko;
-		rstats->fw_err_link = rsp_rstats->fw_err_link;
-		rstats->fw_err_drop = rsp_rstats->fw_err_drop;
-		rstats->fw_rx_vxlan = rsp_rstats->fw_rx_vxlan;
-		rstats->fw_rx_vxlan_err = rsp_rstats->fw_rx_vxlan_err;
-
-		/* Number of packets that are LROed      */
-		rstats->fw_lro_pkts = rsp_rstats->fw_lro_pkts;
-		/* Number of octets that are LROed       */
-		rstats->fw_lro_octs = rsp_rstats->fw_lro_octs;
-		/* Number of LRO packets formed          */
-		rstats->fw_total_lro = rsp_rstats->fw_total_lro;
-		/* Number of times lRO of packet aborted */
-		rstats->fw_lro_aborts = rsp_rstats->fw_lro_aborts;
-		rstats->fw_lro_aborts_port = rsp_rstats->fw_lro_aborts_port;
-		rstats->fw_lro_aborts_seq = rsp_rstats->fw_lro_aborts_seq;
-		rstats->fw_lro_aborts_tsval = rsp_rstats->fw_lro_aborts_tsval;
-		rstats->fw_lro_aborts_timer = rsp_rstats->fw_lro_aborts_timer;
-		/* intrmod: packet forward rate */
-		rstats->fwd_rate = rsp_rstats->fwd_rate;
-
-		/* TX link-level stats */
-		tstats->total_pkts_sent = rsp_tstats->total_pkts_sent;
-		tstats->total_bytes_sent = rsp_tstats->total_bytes_sent;
-		tstats->mcast_pkts_sent = rsp_tstats->mcast_pkts_sent;
-		tstats->bcast_pkts_sent = rsp_tstats->bcast_pkts_sent;
-		tstats->ctl_sent = rsp_tstats->ctl_sent;
-		/* Packets sent after one collision*/
-		tstats->one_collision_sent = rsp_tstats->one_collision_sent;
-		/* Packets sent after multiple collision*/
-		tstats->multi_collision_sent = rsp_tstats->multi_collision_sent;
-		/* Packets not sent due to max collisions */
-		tstats->max_collision_fail = rsp_tstats->max_collision_fail;
-		/* Packets not sent due to max deferrals */
-		tstats->max_deferral_fail = rsp_tstats->max_deferral_fail;
-		/* Accounts for over/under-run of buffers */
-		tstats->fifo_err = rsp_tstats->fifo_err;
-		tstats->runts = rsp_tstats->runts;
-		/* Total number of collisions detected */
-		tstats->total_collisions = rsp_tstats->total_collisions;
-
-		/* firmware stats */
-		tstats->fw_total_sent = rsp_tstats->fw_total_sent;
-		tstats->fw_total_fwd = rsp_tstats->fw_total_fwd;
-		tstats->fw_err_pko = rsp_tstats->fw_err_pko;
-		tstats->fw_err_pki = rsp_tstats->fw_err_pki;
-		tstats->fw_err_link = rsp_tstats->fw_err_link;
-		tstats->fw_err_drop = rsp_tstats->fw_err_drop;
-		tstats->fw_tso = rsp_tstats->fw_tso;
-		tstats->fw_tso_fwd = rsp_tstats->fw_tso_fwd;
-		tstats->fw_err_tso = rsp_tstats->fw_err_tso;
-		tstats->fw_tx_vxlan = rsp_tstats->fw_tx_vxlan;
-
-		resp->status = 1;
-	} else {
-		resp->status = -1;
-	}
-	complete(&ctrl->complete);
-}
-
-/*  Configure interrupt moderation parameters */
-static int octnet_get_link_stats(struct net_device *netdev)
-{
-	struct lio *lio = GET_LIO(netdev);
-	struct octeon_device *oct_dev = lio->oct_dev;
-
-	struct octeon_soft_command *sc;
-	struct oct_nic_stats_ctrl *ctrl;
-	struct oct_nic_stats_resp *resp;
-
-	int retval;
-
-	/* Alloc soft command */
-	sc = (struct octeon_soft_command *)
-		octeon_alloc_soft_command(oct_dev,
-					  0,
-					  sizeof(struct oct_nic_stats_resp),
-					  sizeof(struct octnic_ctrl_pkt));
-
-	if (!sc)
-		return -ENOMEM;
-
-	resp = (struct oct_nic_stats_resp *)sc->virtrptr;
-	memset(resp, 0, sizeof(struct oct_nic_stats_resp));
-
-	ctrl = (struct oct_nic_stats_ctrl *)sc->ctxptr;
-	memset(ctrl, 0, sizeof(struct oct_nic_stats_ctrl));
-	ctrl->netdev = netdev;
-	init_completion(&ctrl->complete);
-
-	sc->iq_no = lio->linfo.txpciq[0].s.q_no;
-
-	octeon_prepare_soft_command(oct_dev, sc, OPCODE_NIC,
-				    OPCODE_NIC_PORT_STATS, 0, 0, 0);
-
-	sc->callback = octnet_nic_stats_callback;
-	sc->callback_arg = sc;
-	sc->wait_time = 500;	/*in milli seconds*/
-
-	retval = octeon_send_soft_command(oct_dev, sc);
-	if (retval == IQ_SEND_FAILED) {
-		octeon_free_soft_command(oct_dev, sc);
-		return -EINVAL;
-	}
-
-	wait_for_completion_timeout(&ctrl->complete, msecs_to_jiffies(1000));
-
-	if (resp->status != 1) {
-		octeon_free_soft_command(oct_dev, sc);
-
-		return -EINVAL;
-	}
-
-	octeon_free_soft_command(oct_dev, sc);
-
-	return 0;
-}
-
 static int lio_get_intr_coalesce(struct net_device *netdev,
 				 struct ethtool_coalesce *intr_coal)
 {
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_network.h b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
index 4069710796a8..d090edd1a4a5 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_network.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
@@ -190,6 +190,8 @@ irqreturn_t liquidio_msix_intr_handler(int irq __attribute__((unused)),
 
 int octeon_setup_interrupt(struct octeon_device *oct, u32 num_ioqs);
 
+int octnet_get_link_stats(struct net_device *netdev);
+
 int lio_wait_for_clean_oq(struct octeon_device *oct);
 /**
  * \brief Register ethtool operations
-- 
cgit v1.2.3-59-g8ed1b


From 80002347d6f51c45e49eb545ec7ae7077d46faf8 Mon Sep 17 00:00:00 2001
From: Pradeep Nalla <pradeep.nalla@cavium.com>
Date: Wed, 25 Apr 2018 17:00:22 -0700
Subject: liquidio: add support for ndo_get_stats64 instead of ndo_get_stats

Support ndo_get_stats64 instead of ndo_get_stats.  Also add stats for
multicast and broadcast packets.

Signed-off-by: Pradeep Nalla <pradeep.nalla@cavium.com>
Acked-by: Raghu Vatsavayi <raghu.vatsavayi@cavium.com>
Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/cavium/liquidio/lio_core.c    |  4 +
 drivers/net/ethernet/cavium/liquidio/lio_ethtool.c | 98 +++++++++++++---------
 drivers/net/ethernet/cavium/liquidio/lio_main.c    | 52 ++++++++----
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 47 +++++++----
 .../net/ethernet/cavium/liquidio/liquidio_common.h |  5 ++
 5 files changed, 135 insertions(+), 71 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_core.c b/drivers/net/ethernet/cavium/liquidio/lio_core.c
index 334a068b6a62..844e288d60fe 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_core.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_core.c
@@ -1207,6 +1207,8 @@ octnet_nic_stats_callback(struct octeon_device *oct_dev,
 		/* RX firmware stats */
 		rstats->fw_total_rcvd = rsp_rstats->fw_total_rcvd;
 		rstats->fw_total_fwd = rsp_rstats->fw_total_fwd;
+		rstats->fw_total_mcast = rsp_rstats->fw_total_mcast;
+		rstats->fw_total_bcast = rsp_rstats->fw_total_bcast;
 		rstats->fw_err_pko = rsp_rstats->fw_err_pko;
 		rstats->fw_err_link = rsp_rstats->fw_err_link;
 		rstats->fw_err_drop = rsp_rstats->fw_err_drop;
@@ -1251,6 +1253,8 @@ octnet_nic_stats_callback(struct octeon_device *oct_dev,
 		/* firmware stats */
 		tstats->fw_total_sent = rsp_tstats->fw_total_sent;
 		tstats->fw_total_fwd = rsp_tstats->fw_total_fwd;
+		tstats->fw_total_mcast_sent = rsp_tstats->fw_total_mcast_sent;
+		tstats->fw_total_bcast_sent = rsp_tstats->fw_total_bcast_sent;
 		tstats->fw_err_pko = rsp_tstats->fw_err_pko;
 		tstats->fw_err_pki = rsp_tstats->fw_err_pki;
 		tstats->fw_err_link = rsp_tstats->fw_err_link;
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
index 351549c83472..64c817a63a01 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
@@ -112,6 +112,9 @@ static const char oct_stats_strings[][ETH_GSTRING_LEN] = {
 	"tx_tso_err",
 	"tx_vxlan",
 
+	"tx_mcast",
+	"tx_bcast",
+
 	"mac_tx_total_pkts",
 	"mac_tx_total_bytes",
 	"mac_tx_mcast_pkts",
@@ -127,6 +130,8 @@ static const char oct_stats_strings[][ETH_GSTRING_LEN] = {
 
 	"rx_total_rcvd",
 	"rx_total_fwd",
+	"rx_mcast",
+	"rx_bcast",
 	"rx_jabber_err",
 	"rx_l2_err",
 	"rx_frame_err",
@@ -171,6 +176,10 @@ static const char oct_vf_stats_strings[][ETH_GSTRING_LEN] = {
 	"tx_errors",
 	"rx_dropped",
 	"tx_dropped",
+	"rx_mcast",
+	"tx_mcast",
+	"rx_bcast",
+	"tx_bcast",
 	"link_state_changes",
 };
 
@@ -1056,50 +1065,48 @@ lio_get_ethtool_stats(struct net_device *netdev,
 {
 	struct lio *lio = GET_LIO(netdev);
 	struct octeon_device *oct_dev = lio->oct_dev;
-	struct net_device_stats *netstats = &netdev->stats;
+	struct rtnl_link_stats64 lstats;
 	int i = 0, j;
 
 	if (ifstate_check(lio, LIO_IFSTATE_RESETTING))
 		return;
 
-	netdev->netdev_ops->ndo_get_stats(netdev);
-	octnet_get_link_stats(netdev);
-
+	netdev->netdev_ops->ndo_get_stats64(netdev, &lstats);
 	/*sum of oct->droq[oq_no]->stats->rx_pkts_received */
-	data[i++] = CVM_CAST64(netstats->rx_packets);
+	data[i++] = lstats.rx_packets;
 	/*sum of oct->instr_queue[iq_no]->stats.tx_done */
-	data[i++] = CVM_CAST64(netstats->tx_packets);
+	data[i++] = lstats.tx_packets;
 	/*sum of oct->droq[oq_no]->stats->rx_bytes_received */
-	data[i++] = CVM_CAST64(netstats->rx_bytes);
+	data[i++] = lstats.rx_bytes;
 	/*sum of oct->instr_queue[iq_no]->stats.tx_tot_bytes */
-	data[i++] = CVM_CAST64(netstats->tx_bytes);
-	data[i++] = CVM_CAST64(netstats->rx_errors +
-			       oct_dev->link_stats.fromwire.fcs_err +
-			       oct_dev->link_stats.fromwire.jabber_err +
-			       oct_dev->link_stats.fromwire.l2_err +
-			       oct_dev->link_stats.fromwire.frame_err);
-	data[i++] = CVM_CAST64(netstats->tx_errors);
+	data[i++] = lstats.tx_bytes;
+	data[i++] = lstats.rx_errors +
+			oct_dev->link_stats.fromwire.fcs_err +
+			oct_dev->link_stats.fromwire.jabber_err +
+			oct_dev->link_stats.fromwire.l2_err +
+			oct_dev->link_stats.fromwire.frame_err;
+	data[i++] = lstats.tx_errors;
 	/*sum of oct->droq[oq_no]->stats->rx_dropped +
 	 *oct->droq[oq_no]->stats->dropped_nodispatch +
 	 *oct->droq[oq_no]->stats->dropped_toomany +
 	 *oct->droq[oq_no]->stats->dropped_nomem
 	 */
-	data[i++] = CVM_CAST64(netstats->rx_dropped +
-			       oct_dev->link_stats.fromwire.fifo_err +
-			       oct_dev->link_stats.fromwire.dmac_drop +
-			       oct_dev->link_stats.fromwire.red_drops +
-			       oct_dev->link_stats.fromwire.fw_err_pko +
-			       oct_dev->link_stats.fromwire.fw_err_link +
-			       oct_dev->link_stats.fromwire.fw_err_drop);
+	data[i++] = lstats.rx_dropped +
+			oct_dev->link_stats.fromwire.fifo_err +
+			oct_dev->link_stats.fromwire.dmac_drop +
+			oct_dev->link_stats.fromwire.red_drops +
+			oct_dev->link_stats.fromwire.fw_err_pko +
+			oct_dev->link_stats.fromwire.fw_err_link +
+			oct_dev->link_stats.fromwire.fw_err_drop;
 	/*sum of oct->instr_queue[iq_no]->stats.tx_dropped */
-	data[i++] = CVM_CAST64(netstats->tx_dropped +
-			       oct_dev->link_stats.fromhost.max_collision_fail +
-			       oct_dev->link_stats.fromhost.max_deferral_fail +
-			       oct_dev->link_stats.fromhost.total_collisions +
-			       oct_dev->link_stats.fromhost.fw_err_pko +
-			       oct_dev->link_stats.fromhost.fw_err_link +
-			       oct_dev->link_stats.fromhost.fw_err_drop +
-			       oct_dev->link_stats.fromhost.fw_err_pki);
+	data[i++] = lstats.tx_dropped +
+			oct_dev->link_stats.fromhost.max_collision_fail +
+			oct_dev->link_stats.fromhost.max_deferral_fail +
+			oct_dev->link_stats.fromhost.total_collisions +
+			oct_dev->link_stats.fromhost.fw_err_pko +
+			oct_dev->link_stats.fromhost.fw_err_link +
+			oct_dev->link_stats.fromhost.fw_err_drop +
+			oct_dev->link_stats.fromhost.fw_err_pki;
 
 	/* firmware tx stats */
 	/*per_core_stats[cvmx_get_core_num()].link_stats[mdata->from_ifidx].
@@ -1134,6 +1141,10 @@ lio_get_ethtool_stats(struct net_device *netdev,
 	 */
 	data[i++] = CVM_CAST64(oct_dev->link_stats.fromhost.fw_tx_vxlan);
 
+	/* Multicast packets sent by this port */
+	data[i++] = oct_dev->link_stats.fromhost.fw_total_mcast_sent;
+	data[i++] = oct_dev->link_stats.fromhost.fw_total_bcast_sent;
+
 	/* mac tx statistics */
 	/*CVMX_BGXX_CMRX_TX_STAT5 */
 	data[i++] = CVM_CAST64(oct_dev->link_stats.fromhost.total_pkts_sent);
@@ -1170,6 +1181,9 @@ lio_get_ethtool_stats(struct net_device *netdev,
 	 *fw_total_fwd
 	 */
 	data[i++] = CVM_CAST64(oct_dev->link_stats.fromwire.fw_total_fwd);
+	/* Multicast packets received on this port */
+	data[i++] = oct_dev->link_stats.fromwire.fw_total_mcast;
+	data[i++] = oct_dev->link_stats.fromwire.fw_total_bcast;
 	/*per_core_stats[core_id].link_stats[ifidx].fromwire.jabber_err */
 	data[i++] = CVM_CAST64(oct_dev->link_stats.fromwire.jabber_err);
 	/*per_core_stats[core_id].link_stats[ifidx].fromwire.l2_err */
@@ -1338,7 +1352,7 @@ static void lio_vf_get_ethtool_stats(struct net_device *netdev,
 				     __attribute__((unused)),
 				     u64 *data)
 {
-	struct net_device_stats *netstats = &netdev->stats;
+	struct rtnl_link_stats64 lstats;
 	struct lio *lio = GET_LIO(netdev);
 	struct octeon_device *oct_dev = lio->oct_dev;
 	int i = 0, j, vj;
@@ -1346,25 +1360,31 @@ static void lio_vf_get_ethtool_stats(struct net_device *netdev,
 	if (ifstate_check(lio, LIO_IFSTATE_RESETTING))
 		return;
 
-	netdev->netdev_ops->ndo_get_stats(netdev);
+	netdev->netdev_ops->ndo_get_stats64(netdev, &lstats);
 	/* sum of oct->droq[oq_no]->stats->rx_pkts_received */
-	data[i++] = CVM_CAST64(netstats->rx_packets);
+	data[i++] = lstats.rx_packets;
 	/* sum of oct->instr_queue[iq_no]->stats.tx_done */
-	data[i++] = CVM_CAST64(netstats->tx_packets);
+	data[i++] = lstats.tx_packets;
 	/* sum of oct->droq[oq_no]->stats->rx_bytes_received */
-	data[i++] = CVM_CAST64(netstats->rx_bytes);
+	data[i++] = lstats.rx_bytes;
 	/* sum of oct->instr_queue[iq_no]->stats.tx_tot_bytes */
-	data[i++] = CVM_CAST64(netstats->tx_bytes);
-	data[i++] = CVM_CAST64(netstats->rx_errors);
-	data[i++] = CVM_CAST64(netstats->tx_errors);
+	data[i++] = lstats.tx_bytes;
+	data[i++] = lstats.rx_errors;
+	data[i++] = lstats.tx_errors;
 	 /* sum of oct->droq[oq_no]->stats->rx_dropped +
 	  * oct->droq[oq_no]->stats->dropped_nodispatch +
 	  * oct->droq[oq_no]->stats->dropped_toomany +
 	  * oct->droq[oq_no]->stats->dropped_nomem
 	  */
-	data[i++] = CVM_CAST64(netstats->rx_dropped);
+	data[i++] = lstats.rx_dropped;
 	/* sum of oct->instr_queue[iq_no]->stats.tx_dropped */
-	data[i++] = CVM_CAST64(netstats->tx_dropped);
+	data[i++] = lstats.tx_dropped;
+
+	data[i++] = oct_dev->link_stats.fromwire.fw_total_mcast;
+	data[i++] = oct_dev->link_stats.fromhost.fw_total_mcast_sent;
+	data[i++] = oct_dev->link_stats.fromwire.fw_total_bcast;
+	data[i++] = oct_dev->link_stats.fromhost.fw_total_bcast_sent;
+
 	/* lio->link_changes */
 	data[i++] = CVM_CAST64(lio->link_changes);
 
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index f3891ae11b02..fe3edf13c8f0 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -2252,14 +2252,11 @@ static int liquidio_set_mac(struct net_device *netdev, void *p)
 	return 0;
 }
 
-/**
- * \brief Net device get_stats
- * @param netdev network device
- */
-static struct net_device_stats *liquidio_get_stats(struct net_device *netdev)
+static void
+liquidio_get_stats64(struct net_device *netdev,
+		     struct rtnl_link_stats64 *lstats)
 {
 	struct lio *lio = GET_LIO(netdev);
-	struct net_device_stats *stats = &netdev->stats;
 	struct octeon_device *oct;
 	u64 pkts = 0, drop = 0, bytes = 0;
 	struct oct_droq_stats *oq_stats;
@@ -2269,7 +2266,7 @@ static struct net_device_stats *liquidio_get_stats(struct net_device *netdev)
 	oct = lio->oct_dev;
 
 	if (ifstate_check(lio, LIO_IFSTATE_RESETTING))
-		return stats;
+		return;
 
 	for (i = 0; i < oct->num_iqs; i++) {
 		iq_no = lio->linfo.txpciq[i].s.q_no;
@@ -2279,9 +2276,9 @@ static struct net_device_stats *liquidio_get_stats(struct net_device *netdev)
 		bytes += iq_stats->tx_tot_bytes;
 	}
 
-	stats->tx_packets = pkts;
-	stats->tx_bytes = bytes;
-	stats->tx_dropped = drop;
+	lstats->tx_packets = pkts;
+	lstats->tx_bytes = bytes;
+	lstats->tx_dropped = drop;
 
 	pkts = 0;
 	drop = 0;
@@ -2298,11 +2295,34 @@ static struct net_device_stats *liquidio_get_stats(struct net_device *netdev)
 		bytes += oq_stats->rx_bytes_received;
 	}
 
-	stats->rx_bytes = bytes;
-	stats->rx_packets = pkts;
-	stats->rx_dropped = drop;
-
-	return stats;
+	lstats->rx_bytes = bytes;
+	lstats->rx_packets = pkts;
+	lstats->rx_dropped = drop;
+
+	octnet_get_link_stats(netdev);
+	lstats->multicast = oct->link_stats.fromwire.fw_total_mcast;
+	lstats->collisions = oct->link_stats.fromhost.total_collisions;
+
+	/* detailed rx_errors: */
+	lstats->rx_length_errors = oct->link_stats.fromwire.l2_err;
+	/* recved pkt with crc error    */
+	lstats->rx_crc_errors = oct->link_stats.fromwire.fcs_err;
+	/* recv'd frame alignment error */
+	lstats->rx_frame_errors = oct->link_stats.fromwire.frame_err;
+	/* recv'r fifo overrun */
+	lstats->rx_fifo_errors = oct->link_stats.fromwire.fifo_err;
+
+	lstats->rx_errors = lstats->rx_length_errors + lstats->rx_crc_errors +
+		lstats->rx_frame_errors + lstats->rx_fifo_errors;
+
+	/* detailed tx_errors */
+	lstats->tx_aborted_errors = oct->link_stats.fromhost.fw_err_pko;
+	lstats->tx_carrier_errors = oct->link_stats.fromhost.fw_err_link;
+	lstats->tx_fifo_errors = oct->link_stats.fromhost.fifo_err;
+
+	lstats->tx_errors = lstats->tx_aborted_errors +
+		lstats->tx_carrier_errors +
+		lstats->tx_fifo_errors;
 }
 
 /**
@@ -3355,7 +3375,7 @@ static const struct net_device_ops lionetdevops = {
 	.ndo_open		= liquidio_open,
 	.ndo_stop		= liquidio_stop,
 	.ndo_start_xmit		= liquidio_xmit,
-	.ndo_get_stats		= liquidio_get_stats,
+	.ndo_get_stats64	= liquidio_get_stats64,
 	.ndo_set_mac_address	= liquidio_set_mac,
 	.ndo_set_rx_mode	= liquidio_set_mcast_list,
 	.ndo_tx_timeout		= liquidio_tx_timeout,
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index f92dfa411de6..b7b91d15ce3f 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -1336,24 +1336,21 @@ static int liquidio_set_mac(struct net_device *netdev, void *p)
 	return 0;
 }
 
-/**
- * \brief Net device get_stats
- * @param netdev network device
- */
-static struct net_device_stats *liquidio_get_stats(struct net_device *netdev)
+static void
+liquidio_get_stats64(struct net_device *netdev,
+		     struct rtnl_link_stats64 *lstats)
 {
 	struct lio *lio = GET_LIO(netdev);
-	struct net_device_stats *stats = &netdev->stats;
+	struct octeon_device *oct;
 	u64 pkts = 0, drop = 0, bytes = 0;
 	struct oct_droq_stats *oq_stats;
 	struct oct_iq_stats *iq_stats;
-	struct octeon_device *oct;
 	int i, iq_no, oq_no;
 
 	oct = lio->oct_dev;
 
 	if (ifstate_check(lio, LIO_IFSTATE_RESETTING))
-		return stats;
+		return;
 
 	for (i = 0; i < oct->num_iqs; i++) {
 		iq_no = lio->linfo.txpciq[i].s.q_no;
@@ -1363,9 +1360,9 @@ static struct net_device_stats *liquidio_get_stats(struct net_device *netdev)
 		bytes += iq_stats->tx_tot_bytes;
 	}
 
-	stats->tx_packets = pkts;
-	stats->tx_bytes = bytes;
-	stats->tx_dropped = drop;
+	lstats->tx_packets = pkts;
+	lstats->tx_bytes = bytes;
+	lstats->tx_dropped = drop;
 
 	pkts = 0;
 	drop = 0;
@@ -1382,11 +1379,29 @@ static struct net_device_stats *liquidio_get_stats(struct net_device *netdev)
 		bytes += oq_stats->rx_bytes_received;
 	}
 
-	stats->rx_bytes = bytes;
-	stats->rx_packets = pkts;
-	stats->rx_dropped = drop;
+	lstats->rx_bytes = bytes;
+	lstats->rx_packets = pkts;
+	lstats->rx_dropped = drop;
+
+	octnet_get_link_stats(netdev);
+	lstats->multicast = oct->link_stats.fromwire.fw_total_mcast;
+
+	/* detailed rx_errors: */
+	lstats->rx_length_errors = oct->link_stats.fromwire.l2_err;
+	/* recved pkt with crc error */
+	lstats->rx_crc_errors = oct->link_stats.fromwire.fcs_err;
+	/* recv'd frame alignment error */
+	lstats->rx_frame_errors = oct->link_stats.fromwire.frame_err;
+
+	lstats->rx_errors = lstats->rx_length_errors + lstats->rx_crc_errors +
+			    lstats->rx_frame_errors;
+
+	/* detailed tx_errors */
+	lstats->tx_aborted_errors = oct->link_stats.fromhost.fw_err_pko;
+	lstats->tx_carrier_errors = oct->link_stats.fromhost.fw_err_link;
 
-	return stats;
+	lstats->tx_errors = lstats->tx_aborted_errors +
+		lstats->tx_carrier_errors;
 }
 
 /**
@@ -2034,7 +2049,7 @@ static const struct net_device_ops lionetdevops = {
 	.ndo_open		= liquidio_open,
 	.ndo_stop		= liquidio_stop,
 	.ndo_start_xmit		= liquidio_xmit,
-	.ndo_get_stats		= liquidio_get_stats,
+	.ndo_get_stats64	= liquidio_get_stats64,
 	.ndo_set_mac_address	= liquidio_set_mac,
 	.ndo_set_rx_mode	= liquidio_set_mcast_list,
 	.ndo_tx_timeout		= liquidio_tx_timeout,
diff --git a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
index 34a94daca590..2166744b6812 100644
--- a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
+++ b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
@@ -802,6 +802,9 @@ struct nic_rx_stats {
 	u64 fw_total_rcvd;
 	u64 fw_total_fwd;
 	u64 fw_total_fwd_bytes;
+	u64 fw_total_mcast;
+	u64 fw_total_bcast;
+
 	u64 fw_err_pko;
 	u64 fw_err_link;
 	u64 fw_err_drop;
@@ -858,6 +861,8 @@ struct nic_tx_stats {
 	u64 fw_total_sent;
 	u64 fw_total_fwd;
 	u64 fw_total_fwd_bytes;
+	u64 fw_total_mcast_sent;
+	u64 fw_total_bcast_sent;
 	u64 fw_err_pko;
 	u64 fw_err_link;
 	u64 fw_err_drop;
-- 
cgit v1.2.3-59-g8ed1b


From d8fb1648fcf2d5ded71239fc16d69c447af6f814 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Thu, 26 Apr 2018 11:07:05 +0800
Subject: bridge: use hlist_entry_safe

Use hlist_entry_safe() instead of open-coding it.

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Reviewed-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_forward.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index b4eed113d2ec..7a7fd672ccf2 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -274,8 +274,7 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
 		struct net_bridge_port *port, *lport, *rport;
 
 		lport = p ? p->port : NULL;
-		rport = rp ? hlist_entry(rp, struct net_bridge_port, rlist) :
-			     NULL;
+		rport = hlist_entry_safe(rp, struct net_bridge_port, rlist);
 
 		if ((unsigned long)lport > (unsigned long)rport) {
 			port = lport;
-- 
cgit v1.2.3-59-g8ed1b


From 7bcd64eb8cb56dc2a3b56c0b8f55c8cd908c1c14 Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Date: Thu, 26 Apr 2018 09:42:12 +0200
Subject: s390/qeth: skip QDIO queue handler indirection

Both qeth sub drivers use the same QDIO queue handlers, there's no need
to expose them via the driver's discipline. No functional change.

Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/qeth_core.h      |  9 ---------
 drivers/s390/net/qeth_core_main.c | 29 +++++++++++++----------------
 drivers/s390/net/qeth_l2_main.c   |  3 ---
 drivers/s390/net/qeth_l3_main.c   |  3 ---
 4 files changed, 13 insertions(+), 31 deletions(-)

diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index 78b98b3e7efa..8469446df62c 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -712,9 +712,6 @@ enum qeth_discipline_id {
 
 struct qeth_discipline {
 	const struct device_type *devtype;
-	void (*start_poll)(struct ccw_device *, int, unsigned long);
-	qdio_handler_t *input_handler;
-	qdio_handler_t *output_handler;
 	int (*process_rx_buffer)(struct qeth_card *card, int budget, int *done);
 	int (*recover)(void *ptr);
 	int (*setup) (struct ccwgroup_device *);
@@ -921,13 +918,7 @@ struct sk_buff *qeth_core_get_next_skb(struct qeth_card *,
 		struct qeth_qdio_buffer *, struct qdio_buffer_element **, int *,
 		struct qeth_hdr **);
 void qeth_schedule_recovery(struct qeth_card *);
-void qeth_qdio_start_poll(struct ccw_device *, int, unsigned long);
 int qeth_poll(struct napi_struct *napi, int budget);
-void qeth_qdio_input_handler(struct ccw_device *,
-		unsigned int, unsigned int, int,
-		int, unsigned long);
-void qeth_qdio_output_handler(struct ccw_device *, unsigned int,
-			int, int, int, unsigned long);
 void qeth_clear_ipacmd_list(struct qeth_card *);
 int qeth_qdio_clear_card(struct qeth_card *, int);
 void qeth_clear_working_pool_list(struct qeth_card *);
diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
index dffd820731f2..5323c675e57c 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -3588,15 +3588,14 @@ static void qeth_check_outbound_queue(struct qeth_qdio_out_q *queue)
 	}
 }
 
-void qeth_qdio_start_poll(struct ccw_device *ccwdev, int queue,
-		unsigned long card_ptr)
+static void qeth_qdio_start_poll(struct ccw_device *ccwdev, int queue,
+				 unsigned long card_ptr)
 {
 	struct qeth_card *card = (struct qeth_card *)card_ptr;
 
 	if (card->dev && (card->dev->flags & IFF_UP))
 		napi_schedule(&card->napi);
 }
-EXPORT_SYMBOL_GPL(qeth_qdio_start_poll);
 
 int qeth_configure_cq(struct qeth_card *card, enum qeth_cq cq)
 {
@@ -3698,9 +3697,10 @@ out:
 	return;
 }
 
-void qeth_qdio_input_handler(struct ccw_device *ccwdev, unsigned int qdio_err,
-		unsigned int queue, int first_elem, int count,
-		unsigned long card_ptr)
+static void qeth_qdio_input_handler(struct ccw_device *ccwdev,
+				    unsigned int qdio_err, int queue,
+				    int first_elem, int count,
+				    unsigned long card_ptr)
 {
 	struct qeth_card *card = (struct qeth_card *)card_ptr;
 
@@ -3711,14 +3711,12 @@ void qeth_qdio_input_handler(struct ccw_device *ccwdev, unsigned int qdio_err,
 		qeth_qdio_cq_handler(card, qdio_err, queue, first_elem, count);
 	else if (qdio_err)
 		qeth_schedule_recovery(card);
-
-
 }
-EXPORT_SYMBOL_GPL(qeth_qdio_input_handler);
 
-void qeth_qdio_output_handler(struct ccw_device *ccwdev,
-		unsigned int qdio_error, int __queue, int first_element,
-		int count, unsigned long card_ptr)
+static void qeth_qdio_output_handler(struct ccw_device *ccwdev,
+				     unsigned int qdio_error, int __queue,
+				     int first_element, int count,
+				     unsigned long card_ptr)
 {
 	struct qeth_card *card        = (struct qeth_card *) card_ptr;
 	struct qeth_qdio_out_q *queue = card->qdio.out_qs[__queue];
@@ -3787,7 +3785,6 @@ void qeth_qdio_output_handler(struct ccw_device *ccwdev,
 		card->perf_stats.outbound_handler_time += qeth_get_micros() -
 			card->perf_stats.outbound_handler_start_time;
 }
-EXPORT_SYMBOL_GPL(qeth_qdio_output_handler);
 
 /* We cannot use outbound queue 3 for unicast packets on HiperSockets */
 static inline int qeth_cut_iqd_prio(struct qeth_card *card, int queue_num)
@@ -4995,7 +4992,7 @@ static int qeth_qdio_establish(struct qeth_card *card)
 		goto out_free_in_sbals;
 	}
 	for (i = 0; i < card->qdio.no_in_queues; ++i)
-		queue_start_poll[i] = card->discipline->start_poll;
+		queue_start_poll[i] = qeth_qdio_start_poll;
 
 	qeth_qdio_establish_cq(card, in_sbal_ptrs, queue_start_poll);
 
@@ -5019,8 +5016,8 @@ static int qeth_qdio_establish(struct qeth_card *card)
 	init_data.qib_param_field        = qib_param_field;
 	init_data.no_input_qs            = card->qdio.no_in_queues;
 	init_data.no_output_qs           = card->qdio.no_out_queues;
-	init_data.input_handler 	 = card->discipline->input_handler;
-	init_data.output_handler	 = card->discipline->output_handler;
+	init_data.input_handler		 = qeth_qdio_input_handler;
+	init_data.output_handler	 = qeth_qdio_output_handler;
 	init_data.queue_start_poll_array = queue_start_poll;
 	init_data.int_parm               = (unsigned long) card;
 	init_data.input_sbal_addr_array  = (void **) in_sbal_ptrs;
diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c
index b8079f2a65b3..dc9976064181 100644
--- a/drivers/s390/net/qeth_l2_main.c
+++ b/drivers/s390/net/qeth_l2_main.c
@@ -1315,9 +1315,6 @@ static int qeth_l2_control_event(struct qeth_card *card,
 
 struct qeth_discipline qeth_l2_discipline = {
 	.devtype = &qeth_l2_devtype,
-	.start_poll = qeth_qdio_start_poll,
-	.input_handler = (qdio_handler_t *) qeth_qdio_input_handler,
-	.output_handler = (qdio_handler_t *) qeth_qdio_output_handler,
 	.process_rx_buffer = qeth_l2_process_inbound_buffer,
 	.recover = qeth_l2_recover,
 	.setup = qeth_l2_probe_device,
diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index c1a16a74aa83..f33df82f1237 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -2960,9 +2960,6 @@ static int qeth_l3_control_event(struct qeth_card *card,
 
 struct qeth_discipline qeth_l3_discipline = {
 	.devtype = &qeth_l3_devtype,
-	.start_poll = qeth_qdio_start_poll,
-	.input_handler = (qdio_handler_t *) qeth_qdio_input_handler,
-	.output_handler = (qdio_handler_t *) qeth_qdio_output_handler,
 	.process_rx_buffer = qeth_l3_process_inbound_buffer,
 	.recover = qeth_l3_recover,
 	.setup = qeth_l3_probe_device,
-- 
cgit v1.2.3-59-g8ed1b


From d4ac024688aec2fb726bf5ddad991d214111d50a Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Date: Thu, 26 Apr 2018 09:42:13 +0200
Subject: s390/qeth: convert vlan spinlock to mutex

As the vid_list is only accessed from process context, there's no need to
protect it with a spinlock.

Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/qeth_core.h      |  2 +-
 drivers/s390/net/qeth_core_main.c |  2 +-
 drivers/s390/net/qeth_l2_main.c   | 15 ++++++++-------
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index 8469446df62c..6d7a72da71dd 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -777,9 +777,9 @@ struct qeth_card {
 	struct qeth_card_options options;
 
 	wait_queue_head_t wait_q;
-	spinlock_t vlanlock;
 	spinlock_t mclock;
 	unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];
+	struct mutex vid_list_mutex;		/* vid_list */
 	struct list_head vid_list;
 	DECLARE_HASHTABLE(mac_htable, 4);
 	DECLARE_HASHTABLE(ip_htable, 4);
diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
index 5323c675e57c..7feb50ac0a8b 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -1467,13 +1467,13 @@ static int qeth_setup_card(struct qeth_card *card)
 	card->lan_online = 0;
 	card->read_or_write_problem = 0;
 	card->dev = NULL;
-	spin_lock_init(&card->vlanlock);
 	spin_lock_init(&card->mclock);
 	spin_lock_init(&card->lock);
 	spin_lock_init(&card->ip_lock);
 	spin_lock_init(&card->thread_mask_lock);
 	mutex_init(&card->conf_mutex);
 	mutex_init(&card->discipline_mutex);
+	mutex_init(&card->vid_list_mutex);
 	card->thread_start_mask = 0;
 	card->thread_allowed_mask = 0;
 	card->thread_running_mask = 0;
diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c
index dc9976064181..896c919e7658 100644
--- a/drivers/s390/net/qeth_l2_main.c
+++ b/drivers/s390/net/qeth_l2_main.c
@@ -297,12 +297,13 @@ static int qeth_l2_send_setdelvlan(struct qeth_card *card, __u16 i,
 static void qeth_l2_process_vlans(struct qeth_card *card)
 {
 	struct qeth_vlan_vid *id;
+
 	QETH_CARD_TEXT(card, 3, "L2prcvln");
-	spin_lock_bh(&card->vlanlock);
+	mutex_lock(&card->vid_list_mutex);
 	list_for_each_entry(id, &card->vid_list, list) {
 		qeth_l2_send_setdelvlan(card, id->vid, IPA_CMD_SETVLAN);
 	}
-	spin_unlock_bh(&card->vlanlock);
+	mutex_unlock(&card->vid_list_mutex);
 }
 
 static int qeth_l2_vlan_rx_add_vid(struct net_device *dev,
@@ -319,7 +320,7 @@ static int qeth_l2_vlan_rx_add_vid(struct net_device *dev,
 		QETH_CARD_TEXT(card, 3, "aidREC");
 		return 0;
 	}
-	id = kmalloc(sizeof(struct qeth_vlan_vid), GFP_ATOMIC);
+	id = kmalloc(sizeof(*id), GFP_KERNEL);
 	if (id) {
 		id->vid = vid;
 		rc = qeth_l2_send_setdelvlan(card, vid, IPA_CMD_SETVLAN);
@@ -327,9 +328,9 @@ static int qeth_l2_vlan_rx_add_vid(struct net_device *dev,
 			kfree(id);
 			return rc;
 		}
-		spin_lock_bh(&card->vlanlock);
+		mutex_lock(&card->vid_list_mutex);
 		list_add_tail(&id->list, &card->vid_list);
-		spin_unlock_bh(&card->vlanlock);
+		mutex_unlock(&card->vid_list_mutex);
 	} else {
 		return -ENOMEM;
 	}
@@ -348,7 +349,7 @@ static int qeth_l2_vlan_rx_kill_vid(struct net_device *dev,
 		QETH_CARD_TEXT(card, 3, "kidREC");
 		return 0;
 	}
-	spin_lock_bh(&card->vlanlock);
+	mutex_lock(&card->vid_list_mutex);
 	list_for_each_entry(id, &card->vid_list, list) {
 		if (id->vid == vid) {
 			list_del(&id->list);
@@ -356,7 +357,7 @@ static int qeth_l2_vlan_rx_kill_vid(struct net_device *dev,
 			break;
 		}
 	}
-	spin_unlock_bh(&card->vlanlock);
+	mutex_unlock(&card->vid_list_mutex);
 	if (tmpid) {
 		rc = qeth_l2_send_setdelvlan(card, vid, IPA_CMD_DELVLAN);
 		kfree(tmpid);
-- 
cgit v1.2.3-59-g8ed1b


From d8de0ddfade012ee2e3030c60ff288b78ec0a717 Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Date: Thu, 26 Apr 2018 09:42:14 +0200
Subject: s390/qeth: don't worry about IPs on VLAN removal

When removing a VLAN ID on a L3 device, the driver currently attempts to
walk and unregister the VLAN device's IP addresses.
This can be safely removed - before qeth_l3_vlan_rx_kill_vid() even gets
called, we receive an inet[6]addr event for each IP on the device and
qeth_l3_handle_ip_event() unregisters the address accordingly.

Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/qeth_l3_main.c | 87 -----------------------------------------
 1 file changed, 87 deletions(-)

diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index f33df82f1237..53c5470be7d5 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -1293,91 +1293,6 @@ static void qeth_l3_add_multicast_ipv6(struct qeth_card *card)
 	in6_dev_put(in6_dev);
 }
 
-static void qeth_l3_free_vlan_addresses4(struct qeth_card *card,
-			unsigned short vid)
-{
-	struct in_device *in_dev;
-	struct in_ifaddr *ifa;
-	struct qeth_ipaddr *addr;
-	struct net_device *netdev;
-
-	QETH_CARD_TEXT(card, 4, "frvaddr4");
-
-	netdev = __vlan_find_dev_deep_rcu(card->dev, htons(ETH_P_8021Q), vid);
-	if (!netdev)
-		return;
-	in_dev = in_dev_get(netdev);
-	if (!in_dev)
-		return;
-
-	addr = qeth_l3_get_addr_buffer(QETH_PROT_IPV4);
-	if (!addr)
-		goto out;
-
-	spin_lock_bh(&card->ip_lock);
-
-	for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
-		addr->u.a4.addr = be32_to_cpu(ifa->ifa_address);
-		addr->u.a4.mask = be32_to_cpu(ifa->ifa_mask);
-		addr->type = QETH_IP_TYPE_NORMAL;
-		qeth_l3_delete_ip(card, addr);
-	}
-
-	spin_unlock_bh(&card->ip_lock);
-
-	kfree(addr);
-out:
-	in_dev_put(in_dev);
-}
-
-static void qeth_l3_free_vlan_addresses6(struct qeth_card *card,
-					 unsigned short vid)
-{
-	struct inet6_dev *in6_dev;
-	struct inet6_ifaddr *ifa;
-	struct qeth_ipaddr *addr;
-	struct net_device *netdev;
-
-	QETH_CARD_TEXT(card, 4, "frvaddr6");
-
-	netdev = __vlan_find_dev_deep_rcu(card->dev, htons(ETH_P_8021Q), vid);
-	if (!netdev)
-		return;
-
-	in6_dev = in6_dev_get(netdev);
-	if (!in6_dev)
-		return;
-
-	addr = qeth_l3_get_addr_buffer(QETH_PROT_IPV6);
-	if (!addr)
-		goto out;
-
-	spin_lock_bh(&card->ip_lock);
-
-	list_for_each_entry(ifa, &in6_dev->addr_list, if_list) {
-		memcpy(&addr->u.a6.addr, &ifa->addr,
-		       sizeof(struct in6_addr));
-		addr->u.a6.pfxlen = ifa->prefix_len;
-		addr->type = QETH_IP_TYPE_NORMAL;
-		qeth_l3_delete_ip(card, addr);
-	}
-
-	spin_unlock_bh(&card->ip_lock);
-
-	kfree(addr);
-out:
-	in6_dev_put(in6_dev);
-}
-
-static void qeth_l3_free_vlan_addresses(struct qeth_card *card,
-			unsigned short vid)
-{
-	rcu_read_lock();
-	qeth_l3_free_vlan_addresses4(card, vid);
-	qeth_l3_free_vlan_addresses6(card, vid);
-	rcu_read_unlock();
-}
-
 static int qeth_l3_vlan_rx_add_vid(struct net_device *dev,
 				   __be16 proto, u16 vid)
 {
@@ -1398,8 +1313,6 @@ static int qeth_l3_vlan_rx_kill_vid(struct net_device *dev,
 		QETH_CARD_TEXT(card, 3, "kidREC");
 		return 0;
 	}
-	/* unregister IP addresses of vlan device */
-	qeth_l3_free_vlan_addresses(card, vid);
 	clear_bit(vid, card->active_vlans);
 	qeth_l3_set_rx_mode(dev);
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From 71c4689780f8f3e46f0110a2683f7dc36b356f68 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ursula.braun@de.ibm.com>
Date: Thu, 26 Apr 2018 09:42:15 +0200
Subject: s390/net: set HW port number in netdevice

struct net_device contains a dev_port field. Store the OSA port number
in this field.

Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Reviewed-by: Julian Wiedmann <jwi@linux.ibm.com>
Signed-off-by: Julian Wiedmann <jwi@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/lcs.c           | 3 +++
 drivers/s390/net/qeth_core_sys.c | 2 ++
 drivers/s390/net/qeth_l2_main.c  | 1 +
 drivers/s390/net/qeth_l3_main.c  | 1 +
 4 files changed, 7 insertions(+)

diff --git a/drivers/s390/net/lcs.c b/drivers/s390/net/lcs.c
index 0ee8f33efb54..2d9fe7e4ee40 100644
--- a/drivers/s390/net/lcs.c
+++ b/drivers/s390/net/lcs.c
@@ -1928,6 +1928,8 @@ lcs_portno_store (struct device *dev, struct device_attribute *attr, const char
 		return -EINVAL;
         /* TODO: sanity checks */
         card->portno = value;
+	if (card->dev)
+		card->dev->dev_port = card->portno;
 
         return count;
 
@@ -2158,6 +2160,7 @@ lcs_new_device(struct ccwgroup_device *ccwgdev)
 	card->dev = dev;
 	card->dev->ml_priv = card;
 	card->dev->netdev_ops = &lcs_netdev_ops;
+	card->dev->dev_port = card->portno;
 	memcpy(card->dev->dev_addr, card->mac, LCS_MAC_LENGTH);
 #ifdef CONFIG_IP_MULTICAST
 	if (!lcs_check_multicast_support(card))
diff --git a/drivers/s390/net/qeth_core_sys.c b/drivers/s390/net/qeth_core_sys.c
index ae81534de912..c3f18afb368b 100644
--- a/drivers/s390/net/qeth_core_sys.c
+++ b/drivers/s390/net/qeth_core_sys.c
@@ -144,6 +144,8 @@ static ssize_t qeth_dev_portno_store(struct device *dev,
 		goto out;
 	}
 	card->info.portno = portno;
+	if (card->dev)
+		card->dev->dev_port = portno;
 out:
 	mutex_unlock(&card->conf_mutex);
 	return rc ? rc : count;
diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c
index 896c919e7658..3239b2546ac7 100644
--- a/drivers/s390/net/qeth_l2_main.c
+++ b/drivers/s390/net/qeth_l2_main.c
@@ -984,6 +984,7 @@ static int qeth_l2_setup_netdev(struct qeth_card *card)
 	card->dev->mtu = card->info.initial_mtu;
 	card->dev->min_mtu = 64;
 	card->dev->max_mtu = ETH_MAX_MTU;
+	card->dev->dev_port = card->info.portno;
 	card->dev->netdev_ops = &qeth_l2_netdev_ops;
 	if (card->info.type == QETH_CARD_TYPE_OSN) {
 		card->dev->ethtool_ops = &qeth_l2_osn_ops;
diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index 53c5470be7d5..64f27de079ca 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -2576,6 +2576,7 @@ static int qeth_l3_setup_netdev(struct qeth_card *card)
 	card->dev->mtu = card->info.initial_mtu;
 	card->dev->min_mtu = 64;
 	card->dev->max_mtu = ETH_MAX_MTU;
+	card->dev->dev_port = card->info.portno;
 	card->dev->ethtool_ops = &qeth_l3_ethtool_ops;
 	card->dev->features |=	NETIF_F_HW_VLAN_CTAG_TX |
 				NETIF_F_HW_VLAN_CTAG_RX |
-- 
cgit v1.2.3-59-g8ed1b


From 1b81143281117ef238691860ec6c45c2cddb2383 Mon Sep 17 00:00:00 2001
From: Kittipon Meesompop <kmeesomp@linux.vnet.ibm.com>
Date: Thu, 26 Apr 2018 09:42:16 +0200
Subject: s390/qeth: de-indent else after return

Trivial cleanup, in preparation for a subsequent patch.

Signed-off-by: Kittipon Meesompop <kmeesomp@linux.vnet.ibm.com>
Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/qeth_l3_main.c | 43 ++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index 64f27de079ca..0e8ed8787fd3 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -2533,28 +2533,27 @@ static int qeth_l3_setup_netdev(struct qeth_card *card)
 		    (card->info.link_type == QETH_LINK_TYPE_HSTR)) {
 			pr_info("qeth_l3: ignoring TR device\n");
 			return -ENODEV;
-		} else {
-			card->dev = alloc_etherdev(0);
-			if (!card->dev)
-				return -ENODEV;
-			card->dev->netdev_ops = &qeth_l3_osa_netdev_ops;
-
-			/*IPv6 address autoconfiguration stuff*/
-			qeth_l3_get_unique_id(card);
-			if (!(card->info.unique_id & UNIQUE_ID_NOT_BY_CARD))
-				card->dev->dev_id = card->info.unique_id &
-							 0xffff;
-
-			card->dev->hw_features |= NETIF_F_SG;
-			card->dev->vlan_features |= NETIF_F_SG;
-
-			if (!card->info.guestlan) {
-				card->dev->features |= NETIF_F_SG;
-				card->dev->hw_features |= NETIF_F_TSO |
-					NETIF_F_RXCSUM | NETIF_F_IP_CSUM;
-				card->dev->vlan_features |= NETIF_F_TSO |
-					NETIF_F_RXCSUM | NETIF_F_IP_CSUM;
-			}
+		}
+
+		card->dev = alloc_etherdev(0);
+		if (!card->dev)
+			return -ENODEV;
+		card->dev->netdev_ops = &qeth_l3_osa_netdev_ops;
+
+		/*IPv6 address autoconfiguration stuff*/
+		qeth_l3_get_unique_id(card);
+		if (!(card->info.unique_id & UNIQUE_ID_NOT_BY_CARD))
+			card->dev->dev_id = card->info.unique_id & 0xffff;
+
+		card->dev->hw_features |= NETIF_F_SG;
+		card->dev->vlan_features |= NETIF_F_SG;
+
+		if (!card->info.guestlan) {
+			card->dev->features |= NETIF_F_SG;
+			card->dev->hw_features |= NETIF_F_TSO |
+				NETIF_F_RXCSUM | NETIF_F_IP_CSUM;
+			card->dev->vlan_features |= NETIF_F_TSO |
+				NETIF_F_RXCSUM | NETIF_F_IP_CSUM;
 		}
 	} else if (card->info.type == QETH_CARD_TYPE_IQD) {
 		card->dev = alloc_netdev(0, "hsi%d", NET_NAME_UNKNOWN,
-- 
cgit v1.2.3-59-g8ed1b


From 6195b936610018d11bbb67bd97158d50bdaa2767 Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Date: Thu, 26 Apr 2018 09:42:17 +0200
Subject: s390/qeth: extract csum offload helpers

This consolidates the checksum offload code that was duplicated
over the two qeth subdrivers.

Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/qeth_core.h    | 21 +++++++++++++++++++++
 drivers/s390/net/qeth_l2_main.c | 35 ++++++-----------------------------
 drivers/s390/net/qeth_l3_main.c | 36 ++++++------------------------------
 3 files changed, 33 insertions(+), 59 deletions(-)

diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index 6d7a72da71dd..a8114758075f 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -864,6 +864,27 @@ static inline int qeth_get_ip_version(struct sk_buff *skb)
 	}
 }
 
+static inline void qeth_rx_csum(struct qeth_card *card, struct sk_buff *skb,
+				u8 flags)
+{
+	if ((card->dev->features & NETIF_F_RXCSUM) &&
+	    (flags & QETH_HDR_EXT_CSUM_HDR_REQ) &&
+	    (flags & QETH_HDR_EXT_CSUM_TRANSP_REQ))
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	else
+		skb->ip_summed = CHECKSUM_NONE;
+}
+
+static inline void qeth_tx_csum(struct sk_buff *skb, u8 *flags)
+{
+	*flags |= QETH_HDR_EXT_CSUM_TRANSP_REQ;
+	if (ip_hdr(skb)->protocol == IPPROTO_UDP)
+		*flags |= QETH_HDR_EXT_UDP;
+	/* some HW requires combined L3+L4 csum offload: */
+	*flags |= QETH_HDR_EXT_CSUM_HDR_REQ;
+	ip_hdr(skb)->check = 0;
+}
+
 static inline void qeth_put_buffer_pool_entry(struct qeth_card *card,
 		struct qeth_buffer_pool_entry *entry)
 {
diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c
index 3239b2546ac7..945df56434fd 100644
--- a/drivers/s390/net/qeth_l2_main.c
+++ b/drivers/s390/net/qeth_l2_main.c
@@ -17,7 +17,6 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/etherdevice.h>
-#include <linux/ip.h>
 #include <linux/list.h>
 #include <linux/hash.h>
 #include <linux/hashtable.h>
@@ -195,23 +194,6 @@ static int qeth_l2_get_cast_type(struct qeth_card *card, struct sk_buff *skb)
 	return RTN_UNSPEC;
 }
 
-static void qeth_l2_hdr_csum(struct qeth_card *card, struct qeth_hdr *hdr,
-			     struct sk_buff *skb)
-{
-	struct iphdr *iph = ip_hdr(skb);
-
-	/* tcph->check contains already the pseudo hdr checksum
-	 * so just set the header flags
-	 */
-	if (iph->protocol == IPPROTO_UDP)
-		hdr->hdr.l2.flags[1] |= QETH_HDR_EXT_UDP;
-	hdr->hdr.l2.flags[1] |= QETH_HDR_EXT_CSUM_TRANSP_REQ |
-		QETH_HDR_EXT_CSUM_HDR_REQ;
-	iph->check = 0;
-	if (card->options.performance_stats)
-		card->perf_stats.tx_csum++;
-}
-
 static void qeth_l2_fill_header(struct qeth_hdr *hdr, struct sk_buff *skb,
 				int cast_type, unsigned int data_len)
 {
@@ -424,15 +406,7 @@ static int qeth_l2_process_inbound_buffer(struct qeth_card *card,
 		switch (hdr->hdr.l2.id) {
 		case QETH_HEADER_TYPE_LAYER2:
 			skb->protocol = eth_type_trans(skb, skb->dev);
-			if ((card->dev->features & NETIF_F_RXCSUM)
-			   && ((hdr->hdr.l2.flags[1] &
-				(QETH_HDR_EXT_CSUM_HDR_REQ |
-				   QETH_HDR_EXT_CSUM_TRANSP_REQ)) ==
-				(QETH_HDR_EXT_CSUM_HDR_REQ |
-				   QETH_HDR_EXT_CSUM_TRANSP_REQ)))
-				skb->ip_summed = CHECKSUM_UNNECESSARY;
-			else
-				skb->ip_summed = CHECKSUM_NONE;
+			qeth_rx_csum(card, skb, hdr->hdr.l2.flags[1]);
 			if (skb->protocol == htons(ETH_P_802_2))
 				*((__u32 *)skb->cb) = ++card->seqno.pkt_seqno;
 			len = skb->len;
@@ -724,8 +698,11 @@ static int qeth_l2_xmit_osa(struct qeth_card *card, struct sk_buff *skb,
 		hdr_elements = 1;
 	}
 	qeth_l2_fill_header(hdr, skb, cast_type, skb->len - push_len);
-	if (skb->ip_summed == CHECKSUM_PARTIAL)
-		qeth_l2_hdr_csum(card, hdr, skb);
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		qeth_tx_csum(skb, &hdr->hdr.l2.flags[1]);
+		if (card->options.performance_stats)
+			card->perf_stats.tx_csum++;
+	}
 
 	elements = qeth_get_elements_no(card, skb, hdr_elements, 0);
 	if (!elements) {
diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index 0e8ed8787fd3..bc359e322ae7 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -1367,17 +1367,7 @@ static void qeth_l3_rebuild_skb(struct qeth_card *card, struct sk_buff *skb,
 		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), tag);
 	}
 
-	if (card->dev->features & NETIF_F_RXCSUM) {
-		if ((hdr->hdr.l3.ext_flags &
-		    (QETH_HDR_EXT_CSUM_HDR_REQ |
-		     QETH_HDR_EXT_CSUM_TRANSP_REQ)) ==
-		    (QETH_HDR_EXT_CSUM_HDR_REQ |
-		     QETH_HDR_EXT_CSUM_TRANSP_REQ))
-			skb->ip_summed = CHECKSUM_UNNECESSARY;
-		else
-			skb->ip_summed = CHECKSUM_NONE;
-	} else
-		skb->ip_summed = CHECKSUM_NONE;
+	qeth_rx_csum(card, skb, hdr->hdr.l3.ext_flags);
 }
 
 static int qeth_l3_process_inbound_buffer(struct qeth_card *card,
@@ -2123,23 +2113,6 @@ static void qeth_l3_fill_header(struct qeth_card *card, struct qeth_hdr *hdr,
 	rcu_read_unlock();
 }
 
-static void qeth_l3_hdr_csum(struct qeth_card *card, struct qeth_hdr *hdr,
-			     struct sk_buff *skb)
-{
-	struct iphdr *iph = ip_hdr(skb);
-
-	/* tcph->check contains already the pseudo hdr checksum
-	 * so just set the header flags
-	 */
-	if (iph->protocol == IPPROTO_UDP)
-		hdr->hdr.l3.ext_flags |= QETH_HDR_EXT_UDP;
-	hdr->hdr.l3.ext_flags |= QETH_HDR_EXT_CSUM_TRANSP_REQ |
-		QETH_HDR_EXT_CSUM_HDR_REQ;
-	iph->check = 0;
-	if (card->options.performance_stats)
-		card->perf_stats.tx_csum++;
-}
-
 static void qeth_tso_fill_header(struct qeth_card *card,
 		struct qeth_hdr *qhdr, struct sk_buff *skb)
 {
@@ -2331,8 +2304,11 @@ static netdev_tx_t qeth_l3_hard_start_xmit(struct sk_buff *skb,
 			}
 		}
 
-		if (skb->ip_summed == CHECKSUM_PARTIAL)
-			qeth_l3_hdr_csum(card, hdr, new_skb);
+		if (new_skb->ip_summed == CHECKSUM_PARTIAL) {
+			qeth_tx_csum(new_skb, &hdr->hdr.l3.ext_flags);
+			if (card->options.performance_stats)
+				card->perf_stats.tx_csum++;
+		}
 	}
 
 	elements = use_tso ?
-- 
cgit v1.2.3-59-g8ed1b


From b339c24ebfca87711237231984a9d8b9dcd3a81a Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Date: Thu, 26 Apr 2018 09:42:18 +0200
Subject: s390/qeth: disregard IPv4 header for RX csum offload

The kernel does its own validation of the IPv4 header checksum,
drivers/HW are not required to handle this.

Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/qeth_core.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index a8114758075f..2cfeae485a08 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -868,7 +868,6 @@ static inline void qeth_rx_csum(struct qeth_card *card, struct sk_buff *skb,
 				u8 flags)
 {
 	if ((card->dev->features & NETIF_F_RXCSUM) &&
-	    (flags & QETH_HDR_EXT_CSUM_HDR_REQ) &&
 	    (flags & QETH_HDR_EXT_CSUM_TRANSP_REQ))
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
 	else
-- 
cgit v1.2.3-59-g8ed1b


From 3aade31b2f2dce54d988a42d2e1974c82b25fccf Mon Sep 17 00:00:00 2001
From: Kittipon Meesompop <kmeesomp@linux.vnet.ibm.com>
Date: Thu, 26 Apr 2018 09:42:19 +0200
Subject: s390/qeth: add stats counter for RX csum offload

This matches the statistics we gather for the TX offload path.

Signed-off-by: Kittipon Meesompop <kmeesomp@linux.vnet.ibm.com>
Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/qeth_core.h      | 8 ++++++--
 drivers/s390/net/qeth_core_main.c | 4 +++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index 2cfeae485a08..e8108f6891b2 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -148,6 +148,7 @@ struct qeth_perf_stats {
 	unsigned int tx_csum;
 	unsigned int tx_lin;
 	unsigned int tx_linfail;
+	unsigned int rx_csum;
 };
 
 /* Routing stuff */
@@ -868,10 +869,13 @@ static inline void qeth_rx_csum(struct qeth_card *card, struct sk_buff *skb,
 				u8 flags)
 {
 	if ((card->dev->features & NETIF_F_RXCSUM) &&
-	    (flags & QETH_HDR_EXT_CSUM_TRANSP_REQ))
+	    (flags & QETH_HDR_EXT_CSUM_TRANSP_REQ)) {
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
-	else
+		if (card->options.performance_stats)
+			card->perf_stats.rx_csum++;
+	} else {
 		skb->ip_summed = CHECKSUM_NONE;
+	}
 }
 
 static inline void qeth_tx_csum(struct sk_buff *skb, u8 *flags)
diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
index 7feb50ac0a8b..b90f38d78ad9 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -6005,7 +6005,8 @@ static struct {
 	{"tx lin"},
 	{"tx linfail"},
 	{"cq handler count"},
-	{"cq handler time"}
+	{"cq handler time"},
+	{"rx csum"}
 };
 
 int qeth_core_get_sset_count(struct net_device *dev, int stringset)
@@ -6067,6 +6068,7 @@ void qeth_core_get_ethtool_stats(struct net_device *dev,
 	data[35] = card->perf_stats.tx_linfail;
 	data[36] = card->perf_stats.cq_cnt;
 	data[37] = card->perf_stats.cq_time;
+	data[38] = card->perf_stats.rx_csum;
 }
 EXPORT_SYMBOL_GPL(qeth_core_get_ethtool_stats);
 
-- 
cgit v1.2.3-59-g8ed1b


From ee75fb8615497c885a62337aeba375211e3baece Mon Sep 17 00:00:00 2001
From: Kittipon Meesompop <kmeesomp@linux.vnet.ibm.com>
Date: Thu, 26 Apr 2018 09:42:20 +0200
Subject: s390/qeth: query IPv6 assists during hardsetup

For new functionality, the L2 subdriver will start using IPv6 assists.
So move the query from the L3 subdriver into the common setup path.

Signed-off-by: Kittipon Meesompop <kmeesomp@linux.vnet.ibm.com>
Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/qeth_core_main.c | 5 +++++
 drivers/s390/net/qeth_l3_main.c   | 8 --------
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
index b90f38d78ad9..9ec1bb7f8b84 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -5201,6 +5201,11 @@ retriable:
 	rc = qeth_query_ipassists(card, QETH_PROT_IPV4);
 	if (rc == -ENOMEM)
 		goto out;
+	if (qeth_is_supported(card, IPA_IPV6)) {
+		rc = qeth_query_ipassists(card, QETH_PROT_IPV6);
+		if (rc == -ENOMEM)
+			goto out;
+	}
 	if (qeth_is_supported(card, IPA_SETADAPTERPARMS)) {
 		rc = qeth_query_setadapterparms(card);
 		if (rc < 0) {
diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index bc359e322ae7..9d9c743e7433 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -851,14 +851,6 @@ static int qeth_l3_softsetup_ipv6(struct qeth_card *card)
 
 	QETH_CARD_TEXT(card, 3, "softipv6");
 
-	rc = qeth_query_ipassists(card, QETH_PROT_IPV6);
-	if (rc) {
-		dev_err(&card->gdev->dev,
-			"Activating IPv6 support for %s failed\n",
-			QETH_CARD_IFNAME(card));
-		return rc;
-	}
-
 	if (card->info.type == QETH_CARD_TYPE_IQD)
 		goto out;
 
-- 
cgit v1.2.3-59-g8ed1b


From a8155b009f133445e874fb84c43f85091b345617 Mon Sep 17 00:00:00 2001
From: Kittipon Meesompop <kmeesomp@linux.vnet.ibm.com>
Date: Thu, 26 Apr 2018 09:42:21 +0200
Subject: s390/qeth: extend Checksum Offload Assists for IPv6

Add some wrappers to make the protocol-specific Assist code a little
more generic, and use them for sending protocol-agnostic commands in
the Checksum Offload Assist code.

Signed-off-by: Kittipon Meesompop <kmeesomp@linux.vnet.ibm.com>
Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/qeth_core.h      | 23 ++++++++++++--
 drivers/s390/net/qeth_core_main.c | 65 +++++++++++++++++++++------------------
 drivers/s390/net/qeth_l3_main.c   | 24 +++------------
 3 files changed, 60 insertions(+), 52 deletions(-)

diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index e8108f6891b2..7cbc9bf5d6b0 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -900,6 +900,27 @@ static inline int qeth_is_diagass_supported(struct qeth_card *card,
 	return card->info.diagass_support & (__u32)cmd;
 }
 
+int qeth_send_simple_setassparms_prot(struct qeth_card *card,
+				      enum qeth_ipa_funcs ipa_func,
+				      u16 cmd_code, long data,
+				      enum qeth_prot_versions prot);
+/* IPv4 variant */
+static inline int qeth_send_simple_setassparms(struct qeth_card *card,
+					       enum qeth_ipa_funcs ipa_func,
+					       u16 cmd_code, long data)
+{
+	return qeth_send_simple_setassparms_prot(card, ipa_func, cmd_code,
+						 data, QETH_PROT_IPV4);
+}
+
+static inline int qeth_send_simple_setassparms_v6(struct qeth_card *card,
+						  enum qeth_ipa_funcs ipa_func,
+						  u16 cmd_code, long data)
+{
+	return qeth_send_simple_setassparms_prot(card, ipa_func, cmd_code,
+						 data, QETH_PROT_IPV6);
+}
+
 extern struct qeth_discipline qeth_l2_discipline;
 extern struct qeth_discipline qeth_l3_discipline;
 extern const struct attribute_group *qeth_generic_attr_groups[];
@@ -994,8 +1015,6 @@ int qeth_hw_trap(struct qeth_card *, enum qeth_diags_trap_action);
 int qeth_query_ipassists(struct qeth_card *, enum qeth_prot_versions prot);
 void qeth_trace_features(struct qeth_card *);
 void qeth_close_dev(struct qeth_card *);
-int qeth_send_simple_setassparms(struct qeth_card *, enum qeth_ipa_funcs,
-				 __u16, long);
 int qeth_send_setassparms(struct qeth_card *, struct qeth_cmd_buffer *, __u16,
 			  long,
 			  int (*reply_cb)(struct qeth_card *,
diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
index 9ec1bb7f8b84..5e4a509822f1 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -5513,26 +5513,26 @@ int qeth_send_setassparms(struct qeth_card *card,
 }
 EXPORT_SYMBOL_GPL(qeth_send_setassparms);
 
-int qeth_send_simple_setassparms(struct qeth_card *card,
-				 enum qeth_ipa_funcs ipa_func,
-				 __u16 cmd_code, long data)
+int qeth_send_simple_setassparms_prot(struct qeth_card *card,
+				      enum qeth_ipa_funcs ipa_func,
+				      u16 cmd_code, long data,
+				      enum qeth_prot_versions prot)
 {
 	int rc;
 	int length = 0;
 	struct qeth_cmd_buffer *iob;
 
-	QETH_CARD_TEXT(card, 4, "simassp4");
+	QETH_CARD_TEXT_(card, 4, "simassp%i", prot);
 	if (data)
 		length = sizeof(__u32);
-	iob = qeth_get_setassparms_cmd(card, ipa_func, cmd_code,
-				       length, QETH_PROT_IPV4);
+	iob = qeth_get_setassparms_cmd(card, ipa_func, cmd_code, length, prot);
 	if (!iob)
 		return -ENOMEM;
 	rc = qeth_send_setassparms(card, iob, length, data,
 				   qeth_setassparms_cb, NULL);
 	return rc;
 }
-EXPORT_SYMBOL_GPL(qeth_send_simple_setassparms);
+EXPORT_SYMBOL_GPL(qeth_send_simple_setassparms_prot);
 
 static void qeth_unregister_dbf_views(void)
 {
@@ -6330,14 +6330,15 @@ static int qeth_ipa_checksum_run_cmd_cb(struct qeth_card *card,
 static int qeth_ipa_checksum_run_cmd(struct qeth_card *card,
 				     enum qeth_ipa_funcs ipa_func,
 				     __u16 cmd_code, long data,
-				     struct qeth_checksum_cmd *chksum_cb)
+				     struct qeth_checksum_cmd *chksum_cb,
+				     enum qeth_prot_versions prot)
 {
 	struct qeth_cmd_buffer *iob;
 	int rc = -ENOMEM;
 
 	QETH_CARD_TEXT(card, 4, "chkdocmd");
 	iob = qeth_get_setassparms_cmd(card, ipa_func, cmd_code,
-				       sizeof(__u32), QETH_PROT_IPV4);
+				       sizeof(__u32), prot);
 	if (iob)
 		rc = qeth_send_setassparms(card, iob, sizeof(__u32), data,
 					   qeth_ipa_checksum_run_cmd_cb,
@@ -6345,7 +6346,8 @@ static int qeth_ipa_checksum_run_cmd(struct qeth_card *card,
 	return rc;
 }
 
-static int qeth_send_checksum_on(struct qeth_card *card, int cstype)
+static int qeth_send_checksum_on(struct qeth_card *card, int cstype,
+				 enum qeth_prot_versions prot)
 {
 	const __u32 required_features = QETH_IPA_CHECKSUM_IP_HDR |
 					QETH_IPA_CHECKSUM_UDP |
@@ -6354,7 +6356,7 @@ static int qeth_send_checksum_on(struct qeth_card *card, int cstype)
 	int rc;
 
 	rc = qeth_ipa_checksum_run_cmd(card, cstype, IPA_CMD_ASS_START, 0,
-				       &chksum_cb);
+				       &chksum_cb, prot);
 	if (!rc) {
 		if ((required_features & chksum_cb.supported) !=
 		    required_features)
@@ -6366,37 +6368,42 @@ static int qeth_send_checksum_on(struct qeth_card *card, int cstype)
 				 QETH_CARD_IFNAME(card));
 	}
 	if (rc) {
-		qeth_send_simple_setassparms(card, cstype, IPA_CMD_ASS_STOP, 0);
+		qeth_send_simple_setassparms_prot(card, cstype,
+						  IPA_CMD_ASS_STOP, 0, prot);
 		dev_warn(&card->gdev->dev,
-			 "Starting HW checksumming for %s failed, using SW checksumming\n",
-			 QETH_CARD_IFNAME(card));
+			 "Starting HW IPv%d checksumming for %s failed, using SW checksumming\n",
+			 prot, QETH_CARD_IFNAME(card));
 		return rc;
 	}
 	rc = qeth_ipa_checksum_run_cmd(card, cstype, IPA_CMD_ASS_ENABLE,
-				       chksum_cb.supported, &chksum_cb);
+				       chksum_cb.supported, &chksum_cb,
+				       prot);
 	if (!rc) {
 		if ((required_features & chksum_cb.enabled) !=
 		    required_features)
 			rc = -EIO;
 	}
 	if (rc) {
-		qeth_send_simple_setassparms(card, cstype, IPA_CMD_ASS_STOP, 0);
+		qeth_send_simple_setassparms_prot(card, cstype,
+						  IPA_CMD_ASS_STOP, 0, prot);
 		dev_warn(&card->gdev->dev,
-			 "Enabling HW checksumming for %s failed, using SW checksumming\n",
-			 QETH_CARD_IFNAME(card));
+			 "Enabling HW IPv%d checksumming for %s failed, using SW checksumming\n",
+			 prot, QETH_CARD_IFNAME(card));
 		return rc;
 	}
 
-	dev_info(&card->gdev->dev, "HW Checksumming (%sbound) enabled\n",
-		 cstype == IPA_INBOUND_CHECKSUM ? "in" : "out");
+	dev_info(&card->gdev->dev, "HW Checksumming (%sbound IPv%d) enabled\n",
+		 cstype == IPA_INBOUND_CHECKSUM ? "in" : "out", prot);
 	return 0;
 }
 
-static int qeth_set_ipa_csum(struct qeth_card *card, int on, int cstype)
+static int qeth_set_ipa_csum(struct qeth_card *card, bool on, int cstype,
+			     enum qeth_prot_versions prot)
 {
-	int rc = (on) ? qeth_send_checksum_on(card, cstype)
-		      : qeth_send_simple_setassparms(card, cstype,
-						     IPA_CMD_ASS_STOP, 0);
+	int rc = (on) ? qeth_send_checksum_on(card, cstype, prot)
+		      : qeth_send_simple_setassparms_prot(card, cstype,
+							  IPA_CMD_ASS_STOP, 0,
+							  prot);
 	return rc ? -EIO : 0;
 }
 
@@ -6459,16 +6466,14 @@ int qeth_set_features(struct net_device *dev, netdev_features_t features)
 	QETH_DBF_HEX(SETUP, 2, &features, sizeof(features));
 
 	if ((changed & NETIF_F_IP_CSUM)) {
-		rc = qeth_set_ipa_csum(card,
-				       features & NETIF_F_IP_CSUM ? 1 : 0,
-				       IPA_OUTBOUND_CHECKSUM);
+		rc = qeth_set_ipa_csum(card, features & NETIF_F_IP_CSUM,
+				       IPA_OUTBOUND_CHECKSUM, QETH_PROT_IPV4);
 		if (rc)
 			changed ^= NETIF_F_IP_CSUM;
 	}
 	if ((changed & NETIF_F_RXCSUM)) {
-		rc = qeth_set_ipa_csum(card,
-					features & NETIF_F_RXCSUM ? 1 : 0,
-					IPA_INBOUND_CHECKSUM);
+		rc = qeth_set_ipa_csum(card, features & NETIF_F_RXCSUM,
+				       IPA_INBOUND_CHECKSUM, QETH_PROT_IPV4);
 		if (rc)
 			changed ^= NETIF_F_RXCSUM;
 	}
diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index 9d9c743e7433..dd233fe3d6c4 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -735,22 +735,6 @@ static int qeth_l3_setadapter_parms(struct qeth_card *card)
 	return rc;
 }
 
-static int qeth_l3_send_simple_setassparms_ipv6(struct qeth_card *card,
-		enum qeth_ipa_funcs ipa_func, __u16 cmd_code)
-{
-	int rc;
-	struct qeth_cmd_buffer *iob;
-
-	QETH_CARD_TEXT(card, 4, "simassp6");
-	iob = qeth_get_setassparms_cmd(card, ipa_func, cmd_code,
-				       0, QETH_PROT_IPV6);
-	if (!iob)
-		return -ENOMEM;
-	rc = qeth_send_setassparms(card, iob, 0, 0,
-				   qeth_setassparms_cb, NULL);
-	return rc;
-}
-
 static int qeth_l3_start_ipa_arp_processing(struct qeth_card *card)
 {
 	int rc;
@@ -862,16 +846,16 @@ static int qeth_l3_softsetup_ipv6(struct qeth_card *card)
 			QETH_CARD_IFNAME(card));
 		return rc;
 	}
-	rc = qeth_l3_send_simple_setassparms_ipv6(card, IPA_IPV6,
-					       IPA_CMD_ASS_START);
+	rc = qeth_send_simple_setassparms_v6(card, IPA_IPV6,
+					     IPA_CMD_ASS_START, 0);
 	if (rc) {
 		dev_err(&card->gdev->dev,
 			"Activating IPv6 support for %s failed\n",
 			 QETH_CARD_IFNAME(card));
 		return rc;
 	}
-	rc = qeth_l3_send_simple_setassparms_ipv6(card, IPA_PASSTHRU,
-					       IPA_CMD_ASS_START);
+	rc = qeth_send_simple_setassparms_v6(card, IPA_PASSTHRU,
+					     IPA_CMD_ASS_START, 0);
 	if (rc) {
 		dev_warn(&card->gdev->dev,
 			"Enabling the passthrough mode for %s failed\n",
-- 
cgit v1.2.3-59-g8ed1b


From 571f9dd8026b44fe52d9ca9ed6a68c53aad1f3ba Mon Sep 17 00:00:00 2001
From: Kittipon Meesompop <kmeesomp@linux.vnet.ibm.com>
Date: Thu, 26 Apr 2018 09:42:22 +0200
Subject: s390/qeth: add IPv6 TX checksum offload support

Check if a qeth device supports IPv6 TX checksum offload, and advertise
NETIF_F_IPV6_CSUM accordingly. Add support for setting the relevant bits
in IPv6 packet descriptors.

Currently this has only limited use (ie. UDP, or Jumbo Frames). For any
TCP traffic with a standard MSS, the TCP checksum gets calculated
as part of the linear GSO segmentation.

Signed-off-by: Kittipon Meesompop <kmeesomp@linux.vnet.ibm.com>
Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/qeth_core.h      | 13 ++++++++-----
 drivers/s390/net/qeth_core_main.c | 18 +++++++++++++-----
 drivers/s390/net/qeth_core_mpc.h  |  1 +
 drivers/s390/net/qeth_l2_main.c   | 14 ++++++++++----
 drivers/s390/net/qeth_l3_main.c   |  7 ++++++-
 5 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index 7cbc9bf5d6b0..2a5fec55bf60 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -878,14 +878,17 @@ static inline void qeth_rx_csum(struct qeth_card *card, struct sk_buff *skb,
 	}
 }
 
-static inline void qeth_tx_csum(struct sk_buff *skb, u8 *flags)
+static inline void qeth_tx_csum(struct sk_buff *skb, u8 *flags, int ipv)
 {
 	*flags |= QETH_HDR_EXT_CSUM_TRANSP_REQ;
-	if (ip_hdr(skb)->protocol == IPPROTO_UDP)
+	if ((ipv == 4 && ip_hdr(skb)->protocol == IPPROTO_UDP) ||
+	    (ipv == 6 && ipv6_hdr(skb)->nexthdr == IPPROTO_UDP))
 		*flags |= QETH_HDR_EXT_UDP;
-	/* some HW requires combined L3+L4 csum offload: */
-	*flags |= QETH_HDR_EXT_CSUM_HDR_REQ;
-	ip_hdr(skb)->check = 0;
+	if (ipv == 4) {
+		/* some HW requires combined L3+L4 csum offload: */
+		*flags |= QETH_HDR_EXT_CSUM_HDR_REQ;
+		ip_hdr(skb)->check = 0;
+	}
 }
 
 static inline void qeth_put_buffer_pool_entry(struct qeth_card *card,
diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
index 5e4a509822f1..55b05d9c1cb6 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -6349,12 +6349,12 @@ static int qeth_ipa_checksum_run_cmd(struct qeth_card *card,
 static int qeth_send_checksum_on(struct qeth_card *card, int cstype,
 				 enum qeth_prot_versions prot)
 {
-	const __u32 required_features = QETH_IPA_CHECKSUM_IP_HDR |
-					QETH_IPA_CHECKSUM_UDP |
-					QETH_IPA_CHECKSUM_TCP;
+	u32 required_features = QETH_IPA_CHECKSUM_UDP | QETH_IPA_CHECKSUM_TCP;
 	struct qeth_checksum_cmd chksum_cb;
 	int rc;
 
+	if (prot == QETH_PROT_IPV4)
+		required_features |= QETH_IPA_CHECKSUM_IP_HDR;
 	rc = qeth_ipa_checksum_run_cmd(card, cstype, IPA_CMD_ASS_START, 0,
 				       &chksum_cb, prot);
 	if (!rc) {
@@ -6430,8 +6430,8 @@ static int qeth_set_ipa_tso(struct qeth_card *card, int on)
 	return rc;
 }
 
-#define QETH_HW_FEATURES (NETIF_F_RXCSUM | NETIF_F_IP_CSUM | NETIF_F_TSO)
-
+#define QETH_HW_FEATURES (NETIF_F_RXCSUM | NETIF_F_IP_CSUM | NETIF_F_TSO | \
+			  NETIF_F_IPV6_CSUM)
 /**
  * qeth_recover_features() - Restore device features after recovery
  * @dev:	the recovering net_device
@@ -6471,6 +6471,12 @@ int qeth_set_features(struct net_device *dev, netdev_features_t features)
 		if (rc)
 			changed ^= NETIF_F_IP_CSUM;
 	}
+	if (changed & NETIF_F_IPV6_CSUM) {
+		rc = qeth_set_ipa_csum(card, features & NETIF_F_IPV6_CSUM,
+				       IPA_OUTBOUND_CHECKSUM, QETH_PROT_IPV6);
+		if (rc)
+			changed ^= NETIF_F_IPV6_CSUM;
+	}
 	if ((changed & NETIF_F_RXCSUM)) {
 		rc = qeth_set_ipa_csum(card, features & NETIF_F_RXCSUM,
 				       IPA_INBOUND_CHECKSUM, QETH_PROT_IPV4);
@@ -6500,6 +6506,8 @@ netdev_features_t qeth_fix_features(struct net_device *dev,
 	QETH_DBF_TEXT(SETUP, 2, "fixfeat");
 	if (!qeth_is_supported(card, IPA_OUTBOUND_CHECKSUM))
 		features &= ~NETIF_F_IP_CSUM;
+	if (!qeth_is_supported6(card, IPA_OUTBOUND_CHECKSUM_V6))
+		features &= ~NETIF_F_IPV6_CSUM;
 	if (!qeth_is_supported(card, IPA_INBOUND_CHECKSUM))
 		features &= ~NETIF_F_RXCSUM;
 	if (!qeth_is_supported(card, IPA_OUTBOUND_TSO))
diff --git a/drivers/s390/net/qeth_core_mpc.h b/drivers/s390/net/qeth_core_mpc.h
index f4d1ec0b8f5a..af3c35fbfa9e 100644
--- a/drivers/s390/net/qeth_core_mpc.h
+++ b/drivers/s390/net/qeth_core_mpc.h
@@ -246,6 +246,7 @@ enum qeth_ipa_funcs {
 	IPA_QUERY_ARP_ASSIST	= 0x00040000L,
 	IPA_INBOUND_TSO         = 0x00080000L,
 	IPA_OUTBOUND_TSO        = 0x00100000L,
+	IPA_OUTBOUND_CHECKSUM_V6 = 0x00800000L,
 };
 
 /* SETIP/DELIP IPA Command: ***************************************************/
diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c
index 945df56434fd..5b1780fa4cb5 100644
--- a/drivers/s390/net/qeth_l2_main.c
+++ b/drivers/s390/net/qeth_l2_main.c
@@ -660,7 +660,8 @@ out:
 }
 
 static int qeth_l2_xmit_osa(struct qeth_card *card, struct sk_buff *skb,
-			    struct qeth_qdio_out_q *queue, int cast_type)
+			    struct qeth_qdio_out_q *queue, int cast_type,
+			    int ipv)
 {
 	int push_len = sizeof(struct qeth_hdr);
 	unsigned int elements, nr_frags;
@@ -699,7 +700,7 @@ static int qeth_l2_xmit_osa(struct qeth_card *card, struct sk_buff *skb,
 	}
 	qeth_l2_fill_header(hdr, skb, cast_type, skb->len - push_len);
 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
-		qeth_tx_csum(skb, &hdr->hdr.l2.flags[1]);
+		qeth_tx_csum(skb, &hdr->hdr.l2.flags[1], ipv);
 		if (card->options.performance_stats)
 			card->perf_stats.tx_csum++;
 	}
@@ -754,6 +755,7 @@ static netdev_tx_t qeth_l2_hard_start_xmit(struct sk_buff *skb,
 {
 	struct qeth_card *card = dev->ml_priv;
 	int cast_type = qeth_l2_get_cast_type(card, skb);
+	int ipv = qeth_get_ip_version(skb);
 	struct qeth_qdio_out_q *queue;
 	int tx_bytes = skb->len;
 	int rc;
@@ -761,7 +763,7 @@ static netdev_tx_t qeth_l2_hard_start_xmit(struct sk_buff *skb,
 	if (card->qdio.do_prio_queueing || (cast_type &&
 					card->info.is_multicast_different))
 		queue = card->qdio.out_qs[qeth_get_priority_queue(card, skb,
-					qeth_get_ip_version(skb), cast_type)];
+					ipv, cast_type)];
 	else
 		queue = card->qdio.out_qs[card->qdio.default_out_queue];
 
@@ -784,7 +786,7 @@ static netdev_tx_t qeth_l2_hard_start_xmit(struct sk_buff *skb,
 		rc = qeth_l2_xmit_iqd(card, skb, queue, cast_type);
 		break;
 	default:
-		rc = qeth_l2_xmit_osa(card, skb, queue, cast_type);
+		rc = qeth_l2_xmit_osa(card, skb, queue, cast_type, ipv);
 	}
 
 	if (!rc) {
@@ -995,6 +997,10 @@ static int qeth_l2_setup_netdev(struct qeth_card *card)
 			card->dev->vlan_features |= NETIF_F_RXCSUM;
 		}
 	}
+	if (qeth_is_supported6(card, IPA_OUTBOUND_CHECKSUM_V6)) {
+		card->dev->hw_features |= NETIF_F_IPV6_CSUM;
+		card->dev->vlan_features |= NETIF_F_IPV6_CSUM;
+	}
 
 	card->info.broadcast_capable = 1;
 	qeth_l2_request_initial_mac(card);
diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index dd233fe3d6c4..e7fa479adf47 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -2281,7 +2281,7 @@ static netdev_tx_t qeth_l3_hard_start_xmit(struct sk_buff *skb,
 		}
 
 		if (new_skb->ip_summed == CHECKSUM_PARTIAL) {
-			qeth_tx_csum(new_skb, &hdr->hdr.l3.ext_flags);
+			qeth_tx_csum(new_skb, &hdr->hdr.l3.ext_flags, ipv);
 			if (card->options.performance_stats)
 				card->perf_stats.tx_csum++;
 		}
@@ -2507,6 +2507,11 @@ static int qeth_l3_setup_netdev(struct qeth_card *card)
 			card->dev->vlan_features |= NETIF_F_TSO |
 				NETIF_F_RXCSUM | NETIF_F_IP_CSUM;
 		}
+
+		if (qeth_is_supported6(card, IPA_OUTBOUND_CHECKSUM_V6)) {
+			card->dev->hw_features |= NETIF_F_IPV6_CSUM;
+			card->dev->vlan_features |= NETIF_F_IPV6_CSUM;
+		}
 	} else if (card->info.type == QETH_CARD_TYPE_IQD) {
 		card->dev = alloc_netdev(0, "hsi%d", NET_NAME_UNKNOWN,
 					 ether_setup);
-- 
cgit v1.2.3-59-g8ed1b


From d7e6ed97b534e8b5706f4ef6d51c44f13608333c Mon Sep 17 00:00:00 2001
From: Kittipon Meesompop <kmeesomp@linux.vnet.ibm.com>
Date: Thu, 26 Apr 2018 09:42:23 +0200
Subject: s390/qeth: add IPv6 RX checksum offload support

Check if a qeth device supports IPv6 RX checksum offload, and hook it up
into the existing NETIF_F_RXCSUM support.
As NETIF_F_RXCSUM is now backed by a combination of HW Assists, we need
to be a little smarter when dealing with errors during a configuration
change:
- switching on NETIF_F_RXCSUM only makes sense if at least one HW Assist
  was enabled successfully.
- for switching off NETIF_F_RXCSUM, all available HW Assists need to be
  deactivated.

Signed-off-by: Kittipon Meesompop <kmeesomp@linux.vnet.ibm.com>
Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/qeth_core_main.c | 31 +++++++++++++++++++++++++++----
 drivers/s390/net/qeth_core_mpc.h  |  1 +
 drivers/s390/net/qeth_l2_main.c   |  9 +++++----
 3 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
index 55b05d9c1cb6..06415b6a8f68 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -6430,6 +6430,29 @@ static int qeth_set_ipa_tso(struct qeth_card *card, int on)
 	return rc;
 }
 
+static int qeth_set_ipa_rx_csum(struct qeth_card *card, bool on)
+{
+	int rc_ipv4 = (on) ? -EOPNOTSUPP : 0;
+	int rc_ipv6;
+
+	if (qeth_is_supported(card, IPA_INBOUND_CHECKSUM))
+		rc_ipv4 = qeth_set_ipa_csum(card, on, IPA_INBOUND_CHECKSUM,
+					    QETH_PROT_IPV4);
+	if (!qeth_is_supported6(card, IPA_INBOUND_CHECKSUM_V6))
+		/* no/one Offload Assist available, so the rc is trivial */
+		return rc_ipv4;
+
+	rc_ipv6 = qeth_set_ipa_csum(card, on, IPA_INBOUND_CHECKSUM,
+				    QETH_PROT_IPV6);
+
+	if (on)
+		/* enable: success if any Assist is active */
+		return (rc_ipv6) ? rc_ipv4 : 0;
+
+	/* disable: failure if any Assist is still active */
+	return (rc_ipv6) ? rc_ipv6 : rc_ipv4;
+}
+
 #define QETH_HW_FEATURES (NETIF_F_RXCSUM | NETIF_F_IP_CSUM | NETIF_F_TSO | \
 			  NETIF_F_IPV6_CSUM)
 /**
@@ -6477,9 +6500,8 @@ int qeth_set_features(struct net_device *dev, netdev_features_t features)
 		if (rc)
 			changed ^= NETIF_F_IPV6_CSUM;
 	}
-	if ((changed & NETIF_F_RXCSUM)) {
-		rc = qeth_set_ipa_csum(card, features & NETIF_F_RXCSUM,
-				       IPA_INBOUND_CHECKSUM, QETH_PROT_IPV4);
+	if (changed & NETIF_F_RXCSUM) {
+		rc = qeth_set_ipa_rx_csum(card, features & NETIF_F_RXCSUM);
 		if (rc)
 			changed ^= NETIF_F_RXCSUM;
 	}
@@ -6508,7 +6530,8 @@ netdev_features_t qeth_fix_features(struct net_device *dev,
 		features &= ~NETIF_F_IP_CSUM;
 	if (!qeth_is_supported6(card, IPA_OUTBOUND_CHECKSUM_V6))
 		features &= ~NETIF_F_IPV6_CSUM;
-	if (!qeth_is_supported(card, IPA_INBOUND_CHECKSUM))
+	if (!qeth_is_supported(card, IPA_INBOUND_CHECKSUM) &&
+	    !qeth_is_supported6(card, IPA_INBOUND_CHECKSUM_V6))
 		features &= ~NETIF_F_RXCSUM;
 	if (!qeth_is_supported(card, IPA_OUTBOUND_TSO))
 		features &= ~NETIF_F_TSO;
diff --git a/drivers/s390/net/qeth_core_mpc.h b/drivers/s390/net/qeth_core_mpc.h
index af3c35fbfa9e..878e62f35169 100644
--- a/drivers/s390/net/qeth_core_mpc.h
+++ b/drivers/s390/net/qeth_core_mpc.h
@@ -246,6 +246,7 @@ enum qeth_ipa_funcs {
 	IPA_QUERY_ARP_ASSIST	= 0x00040000L,
 	IPA_INBOUND_TSO         = 0x00080000L,
 	IPA_OUTBOUND_TSO        = 0x00100000L,
+	IPA_INBOUND_CHECKSUM_V6 = 0x00400000L,
 	IPA_OUTBOUND_CHECKSUM_V6 = 0x00800000L,
 };
 
diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c
index 5b1780fa4cb5..810d69bd9991 100644
--- a/drivers/s390/net/qeth_l2_main.c
+++ b/drivers/s390/net/qeth_l2_main.c
@@ -992,15 +992,16 @@ static int qeth_l2_setup_netdev(struct qeth_card *card)
 			card->dev->hw_features |= NETIF_F_IP_CSUM;
 			card->dev->vlan_features |= NETIF_F_IP_CSUM;
 		}
-		if (qeth_is_supported(card, IPA_INBOUND_CHECKSUM)) {
-			card->dev->hw_features |= NETIF_F_RXCSUM;
-			card->dev->vlan_features |= NETIF_F_RXCSUM;
-		}
 	}
 	if (qeth_is_supported6(card, IPA_OUTBOUND_CHECKSUM_V6)) {
 		card->dev->hw_features |= NETIF_F_IPV6_CSUM;
 		card->dev->vlan_features |= NETIF_F_IPV6_CSUM;
 	}
+	if (qeth_is_supported(card, IPA_INBOUND_CHECKSUM) ||
+	    qeth_is_supported6(card, IPA_INBOUND_CHECKSUM_V6)) {
+		card->dev->hw_features |= NETIF_F_RXCSUM;
+		card->dev->vlan_features |= NETIF_F_RXCSUM;
+	}
 
 	card->info.broadcast_capable = 1;
 	qeth_l2_request_initial_mac(card);
-- 
cgit v1.2.3-59-g8ed1b


From 21b1702af12eb62bb40d994c9c1546ecacfc8456 Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Date: Thu, 26 Apr 2018 09:42:24 +0200
Subject: s390/qeth: improve fallback to random MAC address

If READ MAC fails to fetch a valid MAC address, allow some more device
types (IQD and z/VM OSD) to fall back to a random address.
Also use eth_hw_addr_random(), for indicating to userspace that the
address type is NET_ADDR_RANDOM.

Note that while z/VM has various protection schemes to prohibit
custom addresses on its NICs, they are all optional. So we should at
least give it a try.

Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/qeth_l2_main.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c
index 810d69bd9991..a7cb37da6a21 100644
--- a/drivers/s390/net/qeth_l2_main.c
+++ b/drivers/s390/net/qeth_l2_main.c
@@ -439,7 +439,6 @@ static int qeth_l2_process_inbound_buffer(struct qeth_card *card,
 static int qeth_l2_request_initial_mac(struct qeth_card *card)
 {
 	int rc = 0;
-	char vendor_pre[] = {0x02, 0x00, 0x00};
 
 	QETH_DBF_TEXT(SETUP, 2, "l2reqmac");
 	QETH_DBF_TEXT_(SETUP, 2, "doL2%s", CARD_BUS_ID(card));
@@ -459,16 +458,20 @@ static int qeth_l2_request_initial_mac(struct qeth_card *card)
 	    card->info.type == QETH_CARD_TYPE_OSX ||
 	    card->info.guestlan) {
 		rc = qeth_setadpparms_change_macaddr(card);
-		if (rc) {
-			QETH_DBF_MESSAGE(2, "couldn't get MAC address on "
-				"device %s: x%x\n", CARD_BUS_ID(card), rc);
-			QETH_DBF_TEXT_(SETUP, 2, "1err%04x", rc);
-			return rc;
-		}
-	} else {
-		eth_random_addr(card->dev->dev_addr);
-		memcpy(card->dev->dev_addr, vendor_pre, 3);
+		if (!rc)
+			goto out;
+		QETH_DBF_MESSAGE(2, "READ_MAC Assist failed on device %s: x%x\n",
+				 CARD_BUS_ID(card), rc);
+		QETH_DBF_TEXT_(SETUP, 2, "1err%04x", rc);
+		/* fall back once more: */
 	}
+
+	/* some devices don't support a custom MAC address: */
+	if (card->info.type == QETH_CARD_TYPE_OSM ||
+	    card->info.type == QETH_CARD_TYPE_OSX)
+		return (rc) ? rc : -EADDRNOTAVAIL;
+	eth_hw_addr_random(card->dev);
+
 out:
 	QETH_DBF_HEX(SETUP, 2, card->dev->dev_addr, card->dev->addr_len);
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From 094be0927ff3e802b3f26b7f66c563a4b93a4ab5 Mon Sep 17 00:00:00 2001
From: Tobias Regnery <tobias.regnery@gmail.com>
Date: Thu, 26 Apr 2018 12:36:36 +0200
Subject: geneve: fix build with modular IPV6

Commit c40e89fd358e ("geneve: configure MTU based on a lower device") added
an IS_ENABLED(CONFIG_IPV6) to geneve, leading to the following link error
with CONFIG_GENEVE=y and CONFIG_IPV6=m:

drivers/net/geneve.o: In function `geneve_link_config':
geneve.c:(.text+0x14c): undefined reference to `rt6_lookup'

Fix this by adding a Kconfig dependency and forcing GENEVE to be a module
when IPV6 is a module.

Fixes: c40e89fd358e ("geneve: configure MTU based on a lower device")
Signed-off-by: Tobias Regnery <tobias.regnery@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 891846655000..a029b27fd002 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -198,6 +198,7 @@ config VXLAN
 config GENEVE
        tristate "Generic Network Virtualization Encapsulation"
        depends on INET && NET_UDP_TUNNEL
+       depends on IPV6 || !IPV6
        select NET_IP_TUNNEL
        select GRO_CELLS
        ---help---
-- 
cgit v1.2.3-59-g8ed1b


From 3f5ecd8a90dd553167838098d92206cd9d6142e8 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Thu, 26 Apr 2018 15:18:38 +0300
Subject: net: Fix coccinelle warning

kbuild test robot says:

  >coccinelle warnings: (new ones prefixed by >>)
  >>> net/core/dev.c:1588:2-3: Unneeded semicolon

So, let's remove it.

Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 8f8931b93140..0a2d46424069 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1587,7 +1587,7 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd)
 	N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
 	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
 	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
-	};
+	}
 #undef N
 	return "UNKNOWN_NETDEV_EVENT";
 }
-- 
cgit v1.2.3-59-g8ed1b


From 51dce24bcdbdc493a87a17bcaf898b1f1d2fa600 Mon Sep 17 00:00:00 2001
From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Thu, 26 Apr 2018 08:08:09 -0700
Subject: net: intel: Cleanup the copyright/license headers

After many years of having a ~30 line copyright and license header to our
source files, we are finally able to reduce that to one line with the
advent of the SPDX identifier.

Also caught a few files missing the SPDX license identifier, so fixed
them up.

Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Acked-by: Shannon Nelson <shannon.nelson@oracle.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/intel/e100.c                  | 28 +-------------------
 drivers/net/ethernet/intel/e1000/Makefile          | 26 -------------------
 drivers/net/ethernet/intel/e1000/e1000.h           | 29 +--------------------
 drivers/net/ethernet/intel/e1000/e1000_ethtool.c   | 23 +----------------
 drivers/net/ethernet/intel/e1000/e1000_hw.c        | 28 +-------------------
 drivers/net/ethernet/intel/e1000/e1000_hw.h        | 28 +-------------------
 drivers/net/ethernet/intel/e1000/e1000_main.c      | 28 +-------------------
 drivers/net/ethernet/intel/e1000/e1000_osdep.h     | 29 +--------------------
 drivers/net/ethernet/intel/e1000/e1000_param.c     | 28 +-------------------
 drivers/net/ethernet/intel/e1000e/80003es2lan.c    | 21 +--------------
 drivers/net/ethernet/intel/e1000e/80003es2lan.h    | 21 +--------------
 drivers/net/ethernet/intel/e1000e/82571.c          | 21 +--------------
 drivers/net/ethernet/intel/e1000e/82571.h          | 21 +--------------
 drivers/net/ethernet/intel/e1000e/Makefile         | 27 +------------------
 drivers/net/ethernet/intel/e1000e/defines.h        | 21 +--------------
 drivers/net/ethernet/intel/e1000e/e1000.h          | 21 +--------------
 drivers/net/ethernet/intel/e1000e/ethtool.c        | 21 +--------------
 drivers/net/ethernet/intel/e1000e/hw.h             | 21 +--------------
 drivers/net/ethernet/intel/e1000e/ich8lan.c        | 21 +--------------
 drivers/net/ethernet/intel/e1000e/ich8lan.h        | 21 +--------------
 drivers/net/ethernet/intel/e1000e/mac.c            | 21 +--------------
 drivers/net/ethernet/intel/e1000e/mac.h            | 21 +--------------
 drivers/net/ethernet/intel/e1000e/manage.c         | 21 +--------------
 drivers/net/ethernet/intel/e1000e/manage.h         | 21 +--------------
 drivers/net/ethernet/intel/e1000e/netdev.c         | 21 +--------------
 drivers/net/ethernet/intel/e1000e/nvm.c            | 21 +--------------
 drivers/net/ethernet/intel/e1000e/nvm.h            | 21 +--------------
 drivers/net/ethernet/intel/e1000e/param.c          | 21 +--------------
 drivers/net/ethernet/intel/e1000e/phy.c            | 21 +--------------
 drivers/net/ethernet/intel/e1000e/phy.h            | 21 +--------------
 drivers/net/ethernet/intel/e1000e/ptp.c            | 21 +--------------
 drivers/net/ethernet/intel/e1000e/regs.h           | 21 +--------------
 drivers/net/ethernet/intel/fm10k/Makefile          | 23 +----------------
 drivers/net/ethernet/intel/fm10k/fm10k.h           | 20 +--------------
 drivers/net/ethernet/intel/fm10k/fm10k_common.c    | 20 +--------------
 drivers/net/ethernet/intel/fm10k/fm10k_common.h    | 20 +--------------
 drivers/net/ethernet/intel/fm10k/fm10k_dcbnl.c     | 20 +--------------
 drivers/net/ethernet/intel/fm10k/fm10k_debugfs.c   | 20 +--------------
 drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c   | 20 +--------------
 drivers/net/ethernet/intel/fm10k/fm10k_iov.c       | 20 +--------------
 drivers/net/ethernet/intel/fm10k/fm10k_main.c      | 20 +--------------
 drivers/net/ethernet/intel/fm10k/fm10k_mbx.c       | 20 +--------------
 drivers/net/ethernet/intel/fm10k/fm10k_mbx.h       | 20 +--------------
 drivers/net/ethernet/intel/fm10k/fm10k_netdev.c    | 20 +--------------
 drivers/net/ethernet/intel/fm10k/fm10k_pci.c       | 20 +--------------
 drivers/net/ethernet/intel/fm10k/fm10k_pf.c        | 20 +--------------
 drivers/net/ethernet/intel/fm10k/fm10k_pf.h        | 20 +--------------
 drivers/net/ethernet/intel/fm10k/fm10k_tlv.c       | 20 +--------------
 drivers/net/ethernet/intel/fm10k/fm10k_tlv.h       | 20 +--------------
 drivers/net/ethernet/intel/fm10k/fm10k_type.h      | 20 +--------------
 drivers/net/ethernet/intel/fm10k/fm10k_vf.c        | 20 +--------------
 drivers/net/ethernet/intel/fm10k/fm10k_vf.h        | 20 +--------------
 drivers/net/ethernet/intel/i40e/Makefile           | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e.h             | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_adminq.c      | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_adminq.h      | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h  | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_alloc.h       | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_client.c      | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_client.h      | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_common.c      | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_dcb.c         | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_dcb.h         | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_dcb_nl.c      | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_debugfs.c     | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_devids.h      | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_diag.c        | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_diag.h        | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c     | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_hmc.c         | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_hmc.h         | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_lan_hmc.c     | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_lan_hmc.h     | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_main.c        | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_nvm.c         | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_osdep.h       | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_prototype.h   | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_ptp.c         | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_register.h    | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_status.h      | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_trace.h       | 23 +----------------
 drivers/net/ethernet/intel/i40e/i40e_txrx.c        | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_txrx.h        | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_type.h        | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 26 +------------------
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h | 26 +------------------
 drivers/net/ethernet/intel/i40evf/Makefile         | 26 +------------------
 drivers/net/ethernet/intel/i40evf/i40e_adminq.c    | 26 +------------------
 drivers/net/ethernet/intel/i40evf/i40e_adminq.h    | 26 +------------------
 .../net/ethernet/intel/i40evf/i40e_adminq_cmd.h    | 26 +------------------
 drivers/net/ethernet/intel/i40evf/i40e_alloc.h     | 26 +------------------
 drivers/net/ethernet/intel/i40evf/i40e_common.c    | 26 +------------------
 drivers/net/ethernet/intel/i40evf/i40e_devids.h    | 26 +------------------
 drivers/net/ethernet/intel/i40evf/i40e_hmc.h       | 26 +------------------
 drivers/net/ethernet/intel/i40evf/i40e_lan_hmc.h   | 26 +------------------
 drivers/net/ethernet/intel/i40evf/i40e_osdep.h     | 26 +------------------
 drivers/net/ethernet/intel/i40evf/i40e_prototype.h | 26 +------------------
 drivers/net/ethernet/intel/i40evf/i40e_register.h  | 26 +------------------
 drivers/net/ethernet/intel/i40evf/i40e_status.h    | 26 +------------------
 drivers/net/ethernet/intel/i40evf/i40e_trace.h     | 23 +----------------
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c      | 26 +------------------
 drivers/net/ethernet/intel/i40evf/i40e_txrx.h      | 26 +------------------
 drivers/net/ethernet/intel/i40evf/i40e_type.h      | 26 +------------------
 drivers/net/ethernet/intel/i40evf/i40evf.h         | 26 +------------------
 drivers/net/ethernet/intel/i40evf/i40evf_client.c  |  2 ++
 drivers/net/ethernet/intel/i40evf/i40evf_client.h  |  2 ++
 drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c | 26 +------------------
 drivers/net/ethernet/intel/i40evf/i40evf_main.c    | 26 +------------------
 .../net/ethernet/intel/i40evf/i40evf_virtchnl.c    | 26 +------------------
 drivers/net/ethernet/intel/igb/Makefile            | 28 +-------------------
 drivers/net/ethernet/intel/igb/e1000_82575.c       | 23 +----------------
 drivers/net/ethernet/intel/igb/e1000_82575.h       | 23 +----------------
 drivers/net/ethernet/intel/igb/e1000_defines.h     | 23 +----------------
 drivers/net/ethernet/intel/igb/e1000_hw.h          | 22 +---------------
 drivers/net/ethernet/intel/igb/e1000_i210.c        | 23 +----------------
 drivers/net/ethernet/intel/igb/e1000_i210.h        | 23 +----------------
 drivers/net/ethernet/intel/igb/e1000_mac.c         | 23 +----------------
 drivers/net/ethernet/intel/igb/e1000_mac.h         | 23 +----------------
 drivers/net/ethernet/intel/igb/e1000_mbx.c         | 23 +----------------
 drivers/net/ethernet/intel/igb/e1000_mbx.h         | 23 +----------------
 drivers/net/ethernet/intel/igb/e1000_nvm.c         | 22 +---------------
 drivers/net/ethernet/intel/igb/e1000_nvm.h         | 23 +----------------
 drivers/net/ethernet/intel/igb/e1000_phy.c         | 23 +----------------
 drivers/net/ethernet/intel/igb/e1000_phy.h         | 23 +----------------
 drivers/net/ethernet/intel/igb/e1000_regs.h        | 23 +----------------
 drivers/net/ethernet/intel/igb/igb.h               | 23 +----------------
 drivers/net/ethernet/intel/igb/igb_ethtool.c       | 23 +----------------
 drivers/net/ethernet/intel/igb/igb_hwmon.c         | 23 +----------------
 drivers/net/ethernet/intel/igb/igb_main.c          | 23 +----------------
 drivers/net/ethernet/intel/igb/igb_ptp.c           | 19 ++------------
 drivers/net/ethernet/intel/igbvf/Makefile          | 28 +-------------------
 drivers/net/ethernet/intel/igbvf/defines.h         | 26 +------------------
 drivers/net/ethernet/intel/igbvf/ethtool.c         | 26 +------------------
 drivers/net/ethernet/intel/igbvf/igbvf.h           | 26 +------------------
 drivers/net/ethernet/intel/igbvf/mbx.c             | 26 +------------------
 drivers/net/ethernet/intel/igbvf/mbx.h             | 26 +------------------
 drivers/net/ethernet/intel/igbvf/netdev.c          | 26 +------------------
 drivers/net/ethernet/intel/igbvf/regs.h            | 26 +------------------
 drivers/net/ethernet/intel/igbvf/vf.c              | 26 +------------------
 drivers/net/ethernet/intel/igbvf/vf.h              | 26 +------------------
 drivers/net/ethernet/intel/ixgb/Makefile           | 27 -------------------
 drivers/net/ethernet/intel/ixgb/ixgb.h             | 28 +-------------------
 drivers/net/ethernet/intel/ixgb/ixgb_ee.c          | 29 ++-------------------
 drivers/net/ethernet/intel/ixgb/ixgb_ee.h          | 28 +-------------------
 drivers/net/ethernet/intel/ixgb/ixgb_ethtool.c     | 29 ++-------------------
 drivers/net/ethernet/intel/ixgb/ixgb_hw.c          | 29 ++-------------------
 drivers/net/ethernet/intel/ixgb/ixgb_hw.h          | 28 +-------------------
 drivers/net/ethernet/intel/ixgb/ixgb_ids.h         | 28 +-------------------
 drivers/net/ethernet/intel/ixgb/ixgb_main.c        | 29 ++-------------------
 drivers/net/ethernet/intel/ixgb/ixgb_osdep.h       | 28 +-------------------
 drivers/net/ethernet/intel/ixgb/ixgb_param.c       | 29 ++-------------------
 drivers/net/ethernet/intel/ixgbe/Makefile          | 29 +--------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe.h           | 28 +-------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c     | 28 +-------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c     | 28 +-------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_common.c    | 28 +-------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_common.h    | 28 +-------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_dcb.c       | 30 ++--------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_dcb.h       | 28 +-------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_82598.c | 28 +-------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_82598.h | 28 +-------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_82599.c | 29 ++-------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_82599.h | 28 +-------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_nl.c    | 29 ++-------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c   | 28 ++------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c   | 29 ++-------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c      | 29 ++-------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.h      | 28 +-------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c     | 28 ++------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.h     | 27 +------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c       | 29 ++-------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c      | 29 ++-------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.c       | 29 ++-------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h       | 28 +-------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_model.h     | 26 +------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c       | 29 ++-------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_phy.h       | 28 +-------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c       | 28 ++------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c     | 29 ++-------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h     | 28 +-------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_sysfs.c     | 29 ++-------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_type.h      | 28 +-------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c      | 29 ++-------------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_x540.h      | 24 +----------------
 drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c      | 26 +++----------------
 drivers/net/ethernet/intel/ixgbevf/Makefile        | 28 +-------------------
 drivers/net/ethernet/intel/ixgbevf/defines.h       | 26 +------------------
 drivers/net/ethernet/intel/ixgbevf/ethtool.c       | 27 ++-----------------
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h       | 26 +------------------
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c  | 27 ++-----------------
 drivers/net/ethernet/intel/ixgbevf/mbx.c           | 27 ++-----------------
 drivers/net/ethernet/intel/ixgbevf/mbx.h           | 26 +------------------
 drivers/net/ethernet/intel/ixgbevf/regs.h          | 26 +------------------
 drivers/net/ethernet/intel/ixgbevf/vf.c            | 27 ++-----------------
 drivers/net/ethernet/intel/ixgbevf/vf.h            | 26 +------------------
 195 files changed, 222 insertions(+), 4624 deletions(-)

diff --git a/drivers/net/ethernet/intel/e100.c b/drivers/net/ethernet/intel/e100.c
index 41ad56edfb96..27d5f27163d2 100644
--- a/drivers/net/ethernet/intel/e100.c
+++ b/drivers/net/ethernet/intel/e100.c
@@ -1,31 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
-
-  Intel PRO/100 Linux driver
-  Copyright(c) 1999 - 2006 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2006 Intel Corporation. */
 
 /*
  *	e100.c: Intel(R) PRO/100 ethernet driver
diff --git a/drivers/net/ethernet/intel/e1000/Makefile b/drivers/net/ethernet/intel/e1000/Makefile
index c7caadd3c8af..314c52d44b7c 100644
--- a/drivers/net/ethernet/intel/e1000/Makefile
+++ b/drivers/net/ethernet/intel/e1000/Makefile
@@ -1,31 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-################################################################################
-#
-# Intel PRO/1000 Linux driver
 # Copyright(c) 1999 - 2006 Intel Corporation.
-#
-# This program is free software; you can redistribute it and/or modify it
-# under the terms and conditions of the GNU General Public License,
-# version 2, as published by the Free Software Foundation.
-#
-# This program is distributed in the hope it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-# more details.
-#
-# You should have received a copy of the GNU General Public License along with
-# this program; if not, write to the Free Software Foundation, Inc.,
-# 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-#
-# The full GNU General Public License is included in this distribution in
-# the file called "COPYING".
-#
-# Contact Information:
-# Linux NICS <linux.nics@intel.com>
-# e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-# Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-#
-################################################################################
 
 #
 # Makefile for the Intel(R) PRO/1000 ethernet driver
diff --git a/drivers/net/ethernet/intel/e1000/e1000.h b/drivers/net/ethernet/intel/e1000/e1000.h
index 3a0feea2df54..c40729b2c184 100644
--- a/drivers/net/ethernet/intel/e1000/e1000.h
+++ b/drivers/net/ethernet/intel/e1000/e1000.h
@@ -1,32 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel PRO/1000 Linux driver
-  Copyright(c) 1999 - 2006 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
-
+/* Copyright(c) 1999 - 2006 Intel Corporation. */
 
 /* Linux PRO/1000 Ethernet Driver main header file */
 
diff --git a/drivers/net/ethernet/intel/e1000/e1000_ethtool.c b/drivers/net/ethernet/intel/e1000/e1000_ethtool.c
index 3e80ca170dd7..5d365a986bb0 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_ethtool.c
+++ b/drivers/net/ethernet/intel/e1000/e1000_ethtool.c
@@ -1,26 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
- * Intel PRO/1000 Linux driver
- * Copyright(c) 1999 - 2006 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * Linux NICS <linux.nics@intel.com>
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 1999 - 2006 Intel Corporation. */
 
 /* ethtool support for e1000 */
 
diff --git a/drivers/net/ethernet/intel/e1000/e1000_hw.c b/drivers/net/ethernet/intel/e1000/e1000_hw.c
index 6e7e923d57bf..48428d6a00be 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_hw.c
+++ b/drivers/net/ethernet/intel/e1000/e1000_hw.c
@@ -1,31 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
-*
-  Intel PRO/1000 Linux driver
-  Copyright(c) 1999 - 2006 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
- */
+/* Copyright(c) 1999 - 2006 Intel Corporation. */
 
 /* e1000_hw.c
  * Shared functions for accessing and configuring the MAC
diff --git a/drivers/net/ethernet/intel/e1000/e1000_hw.h b/drivers/net/ethernet/intel/e1000/e1000_hw.h
index f09c569ec19b..b57a04954ccf 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_hw.h
+++ b/drivers/net/ethernet/intel/e1000/e1000_hw.h
@@ -1,31 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel PRO/1000 Linux driver
-  Copyright(c) 1999 - 2006 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2006 Intel Corporation. */
 
 /* e1000_hw.h
  * Structures, enums, and macros for the MAC
diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
index d5eb19b86a0a..2110d5f2da19 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_main.c
+++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
@@ -1,31 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
-
-  Intel PRO/1000 Linux driver
-  Copyright(c) 1999 - 2006 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2006 Intel Corporation. */
 
 #include "e1000.h"
 #include <net/ip6_checksum.h>
diff --git a/drivers/net/ethernet/intel/e1000/e1000_osdep.h b/drivers/net/ethernet/intel/e1000/e1000_osdep.h
index ae0559b8b011..e966bb290797 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_osdep.h
+++ b/drivers/net/ethernet/intel/e1000/e1000_osdep.h
@@ -1,32 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel PRO/1000 Linux driver
-  Copyright(c) 1999 - 2006 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
-
+/* Copyright(c) 1999 - 2006 Intel Corporation. */
 
 /* glue for the OS independent part of e1000
  * includes register access macros
diff --git a/drivers/net/ethernet/intel/e1000/e1000_param.c b/drivers/net/ethernet/intel/e1000/e1000_param.c
index 345f23927bcc..d3f29ffe1e47 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_param.c
+++ b/drivers/net/ethernet/intel/e1000/e1000_param.c
@@ -1,31 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
-
-  Intel PRO/1000 Linux driver
-  Copyright(c) 1999 - 2006 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2006 Intel Corporation. */
 
 #include "e1000.h"
 
diff --git a/drivers/net/ethernet/intel/e1000e/80003es2lan.c b/drivers/net/ethernet/intel/e1000e/80003es2lan.c
index 953e99df420c..257bd59bc9c6 100644
--- a/drivers/net/ethernet/intel/e1000e/80003es2lan.c
+++ b/drivers/net/ethernet/intel/e1000e/80003es2lan.c
@@ -1,24 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel PRO/1000 Linux driver
- * Copyright(c) 1999 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * Linux NICS <linux.nics@intel.com>
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 /* 80003ES2LAN Gigabit Ethernet Controller (Copper)
  * 80003ES2LAN Gigabit Ethernet Controller (Serdes)
diff --git a/drivers/net/ethernet/intel/e1000e/80003es2lan.h b/drivers/net/ethernet/intel/e1000e/80003es2lan.h
index ee6d1256fda4..aa9d639c6cbb 100644
--- a/drivers/net/ethernet/intel/e1000e/80003es2lan.h
+++ b/drivers/net/ethernet/intel/e1000e/80003es2lan.h
@@ -1,24 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel PRO/1000 Linux driver
- * Copyright(c) 1999 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * Linux NICS <linux.nics@intel.com>
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef _E1000E_80003ES2LAN_H_
 #define _E1000E_80003ES2LAN_H_
diff --git a/drivers/net/ethernet/intel/e1000e/82571.c b/drivers/net/ethernet/intel/e1000e/82571.c
index 924f2c8dfa6c..b9309302c29e 100644
--- a/drivers/net/ethernet/intel/e1000e/82571.c
+++ b/drivers/net/ethernet/intel/e1000e/82571.c
@@ -1,24 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel PRO/1000 Linux driver
- * Copyright(c) 1999 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * Linux NICS <linux.nics@intel.com>
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 /* 82571EB Gigabit Ethernet Controller
  * 82571EB Gigabit Ethernet Controller (Copper)
diff --git a/drivers/net/ethernet/intel/e1000e/82571.h b/drivers/net/ethernet/intel/e1000e/82571.h
index 9a24c645f726..834c238d02db 100644
--- a/drivers/net/ethernet/intel/e1000e/82571.h
+++ b/drivers/net/ethernet/intel/e1000e/82571.h
@@ -1,24 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel PRO/1000 Linux driver
- * Copyright(c) 1999 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * Linux NICS <linux.nics@intel.com>
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef _E1000E_82571_H_
 #define _E1000E_82571_H_
diff --git a/drivers/net/ethernet/intel/e1000e/Makefile b/drivers/net/ethernet/intel/e1000e/Makefile
index 24e391a4ac68..44e58b6e7660 100644
--- a/drivers/net/ethernet/intel/e1000e/Makefile
+++ b/drivers/net/ethernet/intel/e1000e/Makefile
@@ -1,30 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-################################################################################
-#
-# Intel PRO/1000 Linux driver
-# Copyright(c) 1999 - 2014 Intel Corporation.
-#
-# This program is free software; you can redistribute it and/or modify it
-# under the terms and conditions of the GNU General Public License,
-# version 2, as published by the Free Software Foundation.
-#
-# This program is distributed in the hope it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-# more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, see <http://www.gnu.org/licenses/>.
-#
-# The full GNU General Public License is included in this distribution in
-# the file called "COPYING".
-#
-# Contact Information:
-# Linux NICS <linux.nics@intel.com>
-# e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-# Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-#
-################################################################################
+# Copyright(c) 1999 - 2018 Intel Corporation.
 
 #
 # Makefile for the Intel(R) PRO/1000 ethernet driver
diff --git a/drivers/net/ethernet/intel/e1000e/defines.h b/drivers/net/ethernet/intel/e1000e/defines.h
index 22883015a695..fd550dee4982 100644
--- a/drivers/net/ethernet/intel/e1000e/defines.h
+++ b/drivers/net/ethernet/intel/e1000e/defines.h
@@ -1,24 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel PRO/1000 Linux driver
- * Copyright(c) 1999 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * Linux NICS <linux.nics@intel.com>
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef _E1000_DEFINES_H_
 #define _E1000_DEFINES_H_
diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h
index da88555ba1fd..c760dc72c520 100644
--- a/drivers/net/ethernet/intel/e1000e/e1000.h
+++ b/drivers/net/ethernet/intel/e1000e/e1000.h
@@ -1,24 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel PRO/1000 Linux driver
- * Copyright(c) 1999 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * Linux NICS <linux.nics@intel.com>
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 /* Linux PRO/1000 Ethernet Driver main header file */
 
diff --git a/drivers/net/ethernet/intel/e1000e/ethtool.c b/drivers/net/ethernet/intel/e1000e/ethtool.c
index 64dc0c11147f..e084cb734eb1 100644
--- a/drivers/net/ethernet/intel/e1000e/ethtool.c
+++ b/drivers/net/ethernet/intel/e1000e/ethtool.c
@@ -1,24 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel PRO/1000 Linux driver
- * Copyright(c) 1999 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * Linux NICS <linux.nics@intel.com>
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 /* ethtool support for e1000 */
 
diff --git a/drivers/net/ethernet/intel/e1000e/hw.h b/drivers/net/ethernet/intel/e1000e/hw.h
index 21802396bed6..eff75bd8a8f0 100644
--- a/drivers/net/ethernet/intel/e1000e/hw.h
+++ b/drivers/net/ethernet/intel/e1000e/hw.h
@@ -1,24 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel PRO/1000 Linux driver
- * Copyright(c) 1999 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * Linux NICS <linux.nics@intel.com>
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef _E1000_HW_H_
 #define _E1000_HW_H_
diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c
index 1551d6ce5341..cdae0efde8e6 100644
--- a/drivers/net/ethernet/intel/e1000e/ich8lan.c
+++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c
@@ -1,24 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel PRO/1000 Linux driver
- * Copyright(c) 1999 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * Linux NICS <linux.nics@intel.com>
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 /* 82562G 10/100 Network Connection
  * 82562G-2 10/100 Network Connection
diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.h b/drivers/net/ethernet/intel/e1000e/ich8lan.h
index 3c4f82c21084..eb09c755fa17 100644
--- a/drivers/net/ethernet/intel/e1000e/ich8lan.h
+++ b/drivers/net/ethernet/intel/e1000e/ich8lan.h
@@ -1,24 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel PRO/1000 Linux driver
- * Copyright(c) 1999 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * Linux NICS <linux.nics@intel.com>
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef _E1000E_ICH8LAN_H_
 #define _E1000E_ICH8LAN_H_
diff --git a/drivers/net/ethernet/intel/e1000e/mac.c b/drivers/net/ethernet/intel/e1000e/mac.c
index b293464a9f27..4abd55d646c5 100644
--- a/drivers/net/ethernet/intel/e1000e/mac.c
+++ b/drivers/net/ethernet/intel/e1000e/mac.c
@@ -1,24 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel PRO/1000 Linux driver
- * Copyright(c) 1999 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * Linux NICS <linux.nics@intel.com>
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #include "e1000.h"
 
diff --git a/drivers/net/ethernet/intel/e1000e/mac.h b/drivers/net/ethernet/intel/e1000e/mac.h
index cb0abf6c76a5..6ab261119801 100644
--- a/drivers/net/ethernet/intel/e1000e/mac.h
+++ b/drivers/net/ethernet/intel/e1000e/mac.h
@@ -1,24 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel PRO/1000 Linux driver
- * Copyright(c) 1999 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * Linux NICS <linux.nics@intel.com>
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef _E1000E_MAC_H_
 #define _E1000E_MAC_H_
diff --git a/drivers/net/ethernet/intel/e1000e/manage.c b/drivers/net/ethernet/intel/e1000e/manage.c
index e027660aeb92..c4c9b20bc51f 100644
--- a/drivers/net/ethernet/intel/e1000e/manage.c
+++ b/drivers/net/ethernet/intel/e1000e/manage.c
@@ -1,24 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel PRO/1000 Linux driver
- * Copyright(c) 1999 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * Linux NICS <linux.nics@intel.com>
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #include "e1000.h"
 
diff --git a/drivers/net/ethernet/intel/e1000e/manage.h b/drivers/net/ethernet/intel/e1000e/manage.h
index 3268f2e58593..d868aad806d4 100644
--- a/drivers/net/ethernet/intel/e1000e/manage.h
+++ b/drivers/net/ethernet/intel/e1000e/manage.h
@@ -1,24 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel PRO/1000 Linux driver
- * Copyright(c) 1999 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * Linux NICS <linux.nics@intel.com>
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef _E1000E_MANAGE_H_
 #define _E1000E_MANAGE_H_
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index ec4a9759a6f2..d3fef7fefea8 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -1,24 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel PRO/1000 Linux driver
- * Copyright(c) 1999 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * Linux NICS <linux.nics@intel.com>
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
diff --git a/drivers/net/ethernet/intel/e1000e/nvm.c b/drivers/net/ethernet/intel/e1000e/nvm.c
index 68949bb41b7b..937f9af22d26 100644
--- a/drivers/net/ethernet/intel/e1000e/nvm.c
+++ b/drivers/net/ethernet/intel/e1000e/nvm.c
@@ -1,24 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel PRO/1000 Linux driver
- * Copyright(c) 1999 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * Linux NICS <linux.nics@intel.com>
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #include "e1000.h"
 
diff --git a/drivers/net/ethernet/intel/e1000e/nvm.h b/drivers/net/ethernet/intel/e1000e/nvm.h
index 8e082028be7d..6a30dfea4117 100644
--- a/drivers/net/ethernet/intel/e1000e/nvm.h
+++ b/drivers/net/ethernet/intel/e1000e/nvm.h
@@ -1,24 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel PRO/1000 Linux driver
- * Copyright(c) 1999 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * Linux NICS <linux.nics@intel.com>
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef _E1000E_NVM_H_
 #define _E1000E_NVM_H_
diff --git a/drivers/net/ethernet/intel/e1000e/param.c b/drivers/net/ethernet/intel/e1000e/param.c
index 2def33eba9e6..098369fd3e65 100644
--- a/drivers/net/ethernet/intel/e1000e/param.c
+++ b/drivers/net/ethernet/intel/e1000e/param.c
@@ -1,24 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel PRO/1000 Linux driver
- * Copyright(c) 1999 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * Linux NICS <linux.nics@intel.com>
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #include <linux/netdevice.h>
 #include <linux/module.h>
diff --git a/drivers/net/ethernet/intel/e1000e/phy.c b/drivers/net/ethernet/intel/e1000e/phy.c
index b8226ed0e338..42233019255a 100644
--- a/drivers/net/ethernet/intel/e1000e/phy.c
+++ b/drivers/net/ethernet/intel/e1000e/phy.c
@@ -1,24 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel PRO/1000 Linux driver
- * Copyright(c) 1999 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * Linux NICS <linux.nics@intel.com>
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #include "e1000.h"
 
diff --git a/drivers/net/ethernet/intel/e1000e/phy.h b/drivers/net/ethernet/intel/e1000e/phy.h
index d4180b5e9196..c48777d09523 100644
--- a/drivers/net/ethernet/intel/e1000e/phy.h
+++ b/drivers/net/ethernet/intel/e1000e/phy.h
@@ -1,24 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel PRO/1000 Linux driver
- * Copyright(c) 1999 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * Linux NICS <linux.nics@intel.com>
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef _E1000E_PHY_H_
 #define _E1000E_PHY_H_
diff --git a/drivers/net/ethernet/intel/e1000e/ptp.c b/drivers/net/ethernet/intel/e1000e/ptp.c
index f941e5085f44..37c76945ad9b 100644
--- a/drivers/net/ethernet/intel/e1000e/ptp.c
+++ b/drivers/net/ethernet/intel/e1000e/ptp.c
@@ -1,24 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel PRO/1000 Linux driver
- * Copyright(c) 1999 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * Linux NICS <linux.nics@intel.com>
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 /* PTP 1588 Hardware Clock (PHC)
  * Derived from PTP Hardware Clock driver for Intel 82576 and 82580 (igb)
diff --git a/drivers/net/ethernet/intel/e1000e/regs.h b/drivers/net/ethernet/intel/e1000e/regs.h
index 16afc3c2a986..47f5ca793970 100644
--- a/drivers/net/ethernet/intel/e1000e/regs.h
+++ b/drivers/net/ethernet/intel/e1000e/regs.h
@@ -1,24 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel PRO/1000 Linux driver
- * Copyright(c) 1999 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * Linux NICS <linux.nics@intel.com>
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef _E1000E_REGS_H_
 #define _E1000E_REGS_H_
diff --git a/drivers/net/ethernet/intel/fm10k/Makefile b/drivers/net/ethernet/intel/fm10k/Makefile
index 93277cb99cb7..26a9746ccb14 100644
--- a/drivers/net/ethernet/intel/fm10k/Makefile
+++ b/drivers/net/ethernet/intel/fm10k/Makefile
@@ -1,26 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-################################################################################
-#
-# Intel(R) Ethernet Switch Host Interface Driver
-# Copyright(c) 2013 - 2016 Intel Corporation.
-#
-# This program is free software; you can redistribute it and/or modify it
-# under the terms and conditions of the GNU General Public License,
-# version 2, as published by the Free Software Foundation.
-#
-# This program is distributed in the hope it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-# more details.
-#
-# The full GNU General Public License is included in this distribution in
-# the file called "COPYING".
-#
-# Contact Information:
-# e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-# Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-#
-################################################################################
+# Copyright(c) 2013 - 2018 Intel Corporation.
 
 #
 # Makefile for the Intel(R) Ethernet Switch Host Interface Driver
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h b/drivers/net/ethernet/intel/fm10k/fm10k.h
index a9cdf763c59d..a903a0ba45e1 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k.h
+++ b/drivers/net/ethernet/intel/fm10k/fm10k.h
@@ -1,23 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel(R) Ethernet Switch Host Interface Driver
- * Copyright(c) 2013 - 2017 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _FM10K_H_
 #define _FM10K_H_
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_common.c b/drivers/net/ethernet/intel/fm10k/fm10k_common.c
index e303d88720ef..f51a63fca513 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_common.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_common.c
@@ -1,23 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel(R) Ethernet Switch Host Interface Driver
- * Copyright(c) 2013 - 2018 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include "fm10k_common.h"
 
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_common.h b/drivers/net/ethernet/intel/fm10k/fm10k_common.h
index 2bdb24d2ca9d..4c48fb73b3e7 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_common.h
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_common.h
@@ -1,23 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel(R) Ethernet Switch Host Interface Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _FM10K_COMMON_H_
 #define _FM10K_COMMON_H_
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_dcbnl.c b/drivers/net/ethernet/intel/fm10k/fm10k_dcbnl.c
index c4f733452ef2..20768ac7f17e 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_dcbnl.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_dcbnl.c
@@ -1,23 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel(R) Ethernet Switch Host Interface Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include "fm10k.h"
 
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_debugfs.c b/drivers/net/ethernet/intel/fm10k/fm10k_debugfs.c
index 43e8d839831f..dca104121c05 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_debugfs.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_debugfs.c
@@ -1,23 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel(R) Ethernet Switch Host Interface Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include "fm10k.h"
 
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
index 28b6b4e56487..eeac2b75a195 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
@@ -1,23 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel(R) Ethernet Switch Host Interface Driver
- * Copyright(c) 2013 - 2017 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include <linux/vmalloc.h>
 
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c
index 30395f5e5e87..e707d717012f 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_iov.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_iov.c
@@ -1,23 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel(R) Ethernet Switch Host Interface Driver
- * Copyright(c) 2013 - 2017 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include "fm10k.h"
 #include "fm10k_vf.h"
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
index c51d61f5f715..3f536541f45f 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
@@ -1,23 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel(R) Ethernet Switch Host Interface Driver
- * Copyright(c) 2013 - 2017 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include <linux/types.h>
 #include <linux/module.h>
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_mbx.c b/drivers/net/ethernet/intel/fm10k/fm10k_mbx.c
index c01bf30a0c9e..21021fe4f1c3 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_mbx.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_mbx.c
@@ -1,23 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel(R) Ethernet Switch Host Interface Driver
- * Copyright(c) 2013 - 2017 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include "fm10k_common.h"
 
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_mbx.h b/drivers/net/ethernet/intel/fm10k/fm10k_mbx.h
index 007e1dfa9b7a..56d1abff04e2 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_mbx.h
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_mbx.h
@@ -1,23 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel(R) Ethernet Switch Host Interface Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _FM10K_MBX_H_
 #define _FM10K_MBX_H_
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
index 26e749766337..c879af72bbf5 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
@@ -1,23 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel(R) Ethernet Switch Host Interface Driver
- * Copyright(c) 2013 - 2018 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include "fm10k.h"
 #include <linux/vmalloc.h>
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
index c4a2b688b38b..15071e4adb98 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
@@ -1,23 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel(R) Ethernet Switch Host Interface Driver
- * Copyright(c) 2013 - 2018 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include <linux/module.h>
 #include <linux/interrupt.h>
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
index 7ba54c534f8c..8f0a99b6a537 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_pf.c
@@ -1,23 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel(R) Ethernet Switch Host Interface Driver
- * Copyright(c) 2013 - 2018 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include "fm10k_pf.h"
 #include "fm10k_vf.h"
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pf.h b/drivers/net/ethernet/intel/fm10k/fm10k_pf.h
index ae81f9a16602..8e814df709d2 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_pf.h
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_pf.h
@@ -1,23 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel(R) Ethernet Switch Host Interface Driver
- * Copyright(c) 2013 - 2017 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _FM10K_PF_H_
 #define _FM10K_PF_H_
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_tlv.c b/drivers/net/ethernet/intel/fm10k/fm10k_tlv.c
index 725ecb7abccd..2a7a40bf2b1c 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_tlv.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_tlv.c
@@ -1,23 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel(R) Ethernet Switch Host Interface Driver
- * Copyright(c) 2013 - 2018 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include "fm10k_tlv.h"
 
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_tlv.h b/drivers/net/ethernet/intel/fm10k/fm10k_tlv.h
index 5d2ee759507e..160bc5b78f99 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_tlv.h
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_tlv.h
@@ -1,23 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel(R) Ethernet Switch Host Interface Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _FM10K_TLV_H_
 #define _FM10K_TLV_H_
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_type.h b/drivers/net/ethernet/intel/fm10k/fm10k_type.h
index dd23af11e2c1..3e608e493f9d 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_type.h
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_type.h
@@ -1,23 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel(R) Ethernet Switch Host Interface Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _FM10K_TYPE_H_
 #define _FM10K_TYPE_H_
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_vf.c b/drivers/net/ethernet/intel/fm10k/fm10k_vf.c
index f06913630b39..a8519c1f0406 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_vf.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_vf.c
@@ -1,23 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel(R) Ethernet Switch Host Interface Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include "fm10k_vf.h"
 
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_vf.h b/drivers/net/ethernet/intel/fm10k/fm10k_vf.h
index 66a66b73a2f1..787d0d570a28 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_vf.h
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_vf.h
@@ -1,23 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel(R) Ethernet Switch Host Interface Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _FM10K_VF_H_
 #define _FM10K_VF_H_
diff --git a/drivers/net/ethernet/intel/i40e/Makefile b/drivers/net/ethernet/intel/i40e/Makefile
index 75437768a07c..14397e7e9925 100644
--- a/drivers/net/ethernet/intel/i40e/Makefile
+++ b/drivers/net/ethernet/intel/i40e/Makefile
@@ -1,29 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-################################################################################
-#
-# Intel Ethernet Controller XL710 Family Linux Driver
-# Copyright(c) 2013 - 2015 Intel Corporation.
-#
-# This program is free software; you can redistribute it and/or modify it
-# under the terms and conditions of the GNU General Public License,
-# version 2, as published by the Free Software Foundation.
-#
-# This program is distributed in the hope it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-# more details.
-#
-# You should have received a copy of the GNU General Public License along
-# with this program.  If not, see <http://www.gnu.org/licenses/>.
-#
-# The full GNU General Public License is included in this distribution in
-# the file called "COPYING".
-#
-# Contact Information:
-# e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-# Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-#
-################################################################################
+# Copyright(c) 2013 - 2018 Intel Corporation.
 
 #
 # Makefile for the Intel(R) Ethernet Connection XL710 (i40e.ko) driver
diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index a44139c1de80..f573108faec3 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2017 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_H_
 #define _I40E_H_
diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq.c b/drivers/net/ethernet/intel/i40e/i40e_adminq.c
index 843fc7781ef8..ddbea79d18e5 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq.c
@@ -1,29 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include "i40e_status.h"
 #include "i40e_type.h"
diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq.h b/drivers/net/ethernet/intel/i40e/i40e_adminq.h
index 0a8749ee9fd3..edec3df78971 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_ADMINQ_H_
 #define _I40E_ADMINQ_H_
diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
index 0244923edeb8..7d888e05f96f 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2017 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_ADMINQ_CMD_H_
 #define _I40E_ADMINQ_CMD_H_
diff --git a/drivers/net/ethernet/intel/i40e/i40e_alloc.h b/drivers/net/ethernet/intel/i40e/i40e_alloc.h
index abed0c52e782..cb8689222c8b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_alloc.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_alloc.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_ALLOC_H_
 #define _I40E_ALLOC_H_
diff --git a/drivers/net/ethernet/intel/i40e/i40e_client.c b/drivers/net/ethernet/intel/i40e/i40e_client.c
index d8ce4999864f..2041757f948c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_client.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_client.c
@@ -1,29 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2017 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include <linux/list.h>
 #include <linux/errno.h>
diff --git a/drivers/net/ethernet/intel/i40e/i40e_client.h b/drivers/net/ethernet/intel/i40e/i40e_client.h
index 9d464d40bc17..72994baf4941 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_client.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_client.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_CLIENT_H_
 #define _I40E_CLIENT_H_
diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c
index c0a3dae8a2db..6f8fd70d606a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -1,29 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include "i40e_type.h"
 #include "i40e_adminq.h"
diff --git a/drivers/net/ethernet/intel/i40e/i40e_dcb.c b/drivers/net/ethernet/intel/i40e/i40e_dcb.c
index 9fec728dc4b9..69e7d4967b1c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_dcb.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_dcb.c
@@ -1,29 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2017 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include "i40e_adminq.h"
 #include "i40e_prototype.h"
diff --git a/drivers/net/ethernet/intel/i40e/i40e_dcb.h b/drivers/net/ethernet/intel/i40e/i40e_dcb.h
index 4f806386cb22..2b748a60a843 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_dcb.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_dcb.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_DCB_H_
 #define _I40E_DCB_H_
diff --git a/drivers/net/ethernet/intel/i40e/i40e_dcb_nl.c b/drivers/net/ethernet/intel/i40e/i40e_dcb_nl.c
index 502818e3da78..a09d723d5ca0 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_dcb_nl.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_dcb_nl.c
@@ -1,29 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifdef CONFIG_I40E_DCB
 #include "i40e.h"
diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
index d494dcaf18d0..94774a2a511f 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
@@ -1,29 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifdef CONFIG_DEBUG_FS
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_devids.h b/drivers/net/ethernet/intel/i40e/i40e_devids.h
index ad6a66ccb576..334b05ff685a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_devids.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_devids.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_DEVIDS_H_
 #define _I40E_DEVIDS_H_
diff --git a/drivers/net/ethernet/intel/i40e/i40e_diag.c b/drivers/net/ethernet/intel/i40e/i40e_diag.c
index df3e60470f8b..ef4d3762bf37 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_diag.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_diag.c
@@ -1,29 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include "i40e_diag.h"
 #include "i40e_prototype.h"
diff --git a/drivers/net/ethernet/intel/i40e/i40e_diag.h b/drivers/net/ethernet/intel/i40e/i40e_diag.h
index be8341763475..c3340f320a18 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_diag.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_diag.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_DIAG_H_
 #define _I40E_DIAG_H_
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index b974482ff630..94dbe2c419ca 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -1,29 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 /* ethtool support for i40e */
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_hmc.c b/drivers/net/ethernet/intel/i40e/i40e_hmc.c
index 6d4b590f851b..e40f3b59e42b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_hmc.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_hmc.c
@@ -1,29 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include "i40e_osdep.h"
 #include "i40e_register.h"
diff --git a/drivers/net/ethernet/intel/i40e/i40e_hmc.h b/drivers/net/ethernet/intel/i40e/i40e_hmc.h
index 7b5fd33d70ae..1c78de838857 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_hmc.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_hmc.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_HMC_H_
 #define _I40E_HMC_H_
diff --git a/drivers/net/ethernet/intel/i40e/i40e_lan_hmc.c b/drivers/net/ethernet/intel/i40e/i40e_lan_hmc.c
index cd40dc487b38..994011c38fb4 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_lan_hmc.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_lan_hmc.c
@@ -1,29 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include "i40e_osdep.h"
 #include "i40e_register.h"
diff --git a/drivers/net/ethernet/intel/i40e/i40e_lan_hmc.h b/drivers/net/ethernet/intel/i40e/i40e_lan_hmc.h
index 79e1396735d9..c46a2c449e60 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_lan_hmc.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_lan_hmc.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_LAN_HMC_H_
 #define _I40E_LAN_HMC_H_
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 16229998fb1e..41bad112a907 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -1,29 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2017 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include <linux/etherdevice.h>
 #include <linux/of_net.h>
diff --git a/drivers/net/ethernet/intel/i40e/i40e_nvm.c b/drivers/net/ethernet/intel/i40e/i40e_nvm.c
index ba9687c03795..51554df9cb9a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_nvm.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_nvm.c
@@ -1,29 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include "i40e_prototype.h"
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_osdep.h b/drivers/net/ethernet/intel/i40e/i40e_osdep.h
index 9c3c3b0d3ac4..a07574bff550 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_osdep.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_osdep.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_OSDEP_H_
 #define _I40E_OSDEP_H_
diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h b/drivers/net/ethernet/intel/i40e/i40e_prototype.h
index 2ec24188d6e2..3170655cdeb9 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_prototype.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_PROTOTYPE_H_
 #define _I40E_PROTOTYPE_H_
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ptp.c b/drivers/net/ethernet/intel/i40e/i40e_ptp.c
index 5b47dd1f75a5..47d10ad39c18 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ptp.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ptp.c
@@ -1,29 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include "i40e.h"
 #include <linux/ptp_classify.h>
diff --git a/drivers/net/ethernet/intel/i40e/i40e_register.h b/drivers/net/ethernet/intel/i40e/i40e_register.h
index b3e206e49cc2..52e3680c57f8 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_register.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_register.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_REGISTER_H_
 #define _I40E_REGISTER_H_
diff --git a/drivers/net/ethernet/intel/i40e/i40e_status.h b/drivers/net/ethernet/intel/i40e/i40e_status.h
index 10c86f63dc52..77be0702d07c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_status.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_status.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_STATUS_H_
 #define _I40E_STATUS_H_
diff --git a/drivers/net/ethernet/intel/i40e/i40e_trace.h b/drivers/net/ethernet/intel/i40e/i40e_trace.h
index 410ba13bcf21..424f02077e2e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_trace.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_trace.h
@@ -1,26 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel(R) 40-10 Gigabit Ethernet Connection Network Driver
- * Copyright(c) 2013 - 2017 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 /* Modeled on trace-events-sample.h */
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 87fb27ab9c24..6959897c789e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -1,29 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include <linux/prefetch.h>
 #include <net/busy_poll.h>
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index 4bf318b8be85..fdd2c55f03a6 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_TXRX_H_
 #define _I40E_TXRX_H_
diff --git a/drivers/net/ethernet/intel/i40e/i40e_type.h b/drivers/net/ethernet/intel/i40e/i40e_type.h
index bfb80092b352..40968a4216a7 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_type.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_type.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_TYPE_H_
 #define _I40E_TYPE_H_
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index 35173cbe80f7..3e7ab9fbeb0c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -1,29 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include "i40e.h"
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h
index 57f727bb9e36..bf67d62e2b5f 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Driver
- * Copyright(c) 2013 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_VIRTCHNL_PF_H_
 #define _I40E_VIRTCHNL_PF_H_
diff --git a/drivers/net/ethernet/intel/i40evf/Makefile b/drivers/net/ethernet/intel/i40evf/Makefile
index 1e89c5487676..3c5c6e962280 100644
--- a/drivers/net/ethernet/intel/i40evf/Makefile
+++ b/drivers/net/ethernet/intel/i40evf/Makefile
@@ -1,29 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-################################################################################
-#
-# Intel Ethernet Controller XL710 Family Linux Virtual Function Driver
-# Copyright(c) 2013 - 2014 Intel Corporation.
-#
-# This program is free software; you can redistribute it and/or modify it
-# under the terms and conditions of the GNU General Public License,
-# version 2, as published by the Free Software Foundation.
-#
-# This program is distributed in the hope it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-# more details.
-#
-# You should have received a copy of the GNU General Public License along
-# with this program.  If not, see <http://www.gnu.org/licenses/>.
-#
-# The full GNU General Public License is included in this distribution in
-# the file called "COPYING".
-#
-# Contact Information:
-# e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-# Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-#
-################################################################################
+# Copyright(c) 2013 - 2018 Intel Corporation.
 
 #
 ## Makefile for the Intel(R) 40GbE VF driver
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_adminq.c b/drivers/net/ethernet/intel/i40evf/i40e_adminq.c
index 6fd677efa9da..c355120dfdfd 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_adminq.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_adminq.c
@@ -1,29 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include "i40e_status.h"
 #include "i40e_type.h"
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_adminq.h b/drivers/net/ethernet/intel/i40evf/i40e_adminq.h
index a7137c165256..1f264b9b6805 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_adminq.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_adminq.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver
- * Copyright(c) 2013 - 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_ADMINQ_H_
 #define _I40E_ADMINQ_H_
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h
index 439e71882049..aa81e87cd471 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver
- * Copyright(c) 2013 - 2017 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_ADMINQ_CMD_H_
 #define _I40E_ADMINQ_CMD_H_
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_alloc.h b/drivers/net/ethernet/intel/i40evf/i40e_alloc.h
index 7e0fddd8af36..cb8689222c8b 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_alloc.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_alloc.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver
- * Copyright(c) 2013 - 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_ALLOC_H_
 #define _I40E_ALLOC_H_
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_common.c b/drivers/net/ethernet/intel/i40evf/i40e_common.c
index 67140cdbcd7a..0841b2d1ca6d 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_common.c
@@ -1,29 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver
- * Copyright(c) 2013 - 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include "i40e_type.h"
 #include "i40e_adminq.h"
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_devids.h b/drivers/net/ethernet/intel/i40evf/i40e_devids.h
index 352dd3f3eb6a..f300bf271824 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_devids.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_devids.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver
- * Copyright(c) 2013 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_DEVIDS_H_
 #define _I40E_DEVIDS_H_
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_hmc.h b/drivers/net/ethernet/intel/i40evf/i40e_hmc.h
index 7432596164f4..1c78de838857 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_hmc.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_hmc.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver
- * Copyright(c) 2013 - 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_HMC_H_
 #define _I40E_HMC_H_
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_lan_hmc.h b/drivers/net/ethernet/intel/i40evf/i40e_lan_hmc.h
index ddac0e4908d3..82b00f70a632 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_lan_hmc.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_lan_hmc.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver
- * Copyright(c) 2013 - 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_LAN_HMC_H_
 #define _I40E_LAN_HMC_H_
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_osdep.h b/drivers/net/ethernet/intel/i40evf/i40e_osdep.h
index 8668ad6c1a65..3ddddb46455b 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_osdep.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_osdep.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver
- * Copyright(c) 2013 - 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_OSDEP_H_
 #define _I40E_OSDEP_H_
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_prototype.h b/drivers/net/ethernet/intel/i40evf/i40e_prototype.h
index 72501bd0f1a9..a358f4b9d5aa 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_prototype.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_prototype.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver
- * Copyright(c) 2013 - 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_PROTOTYPE_H_
 #define _I40E_PROTOTYPE_H_
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_register.h b/drivers/net/ethernet/intel/i40evf/i40e_register.h
index c9c935659758..49e1f57d99cc 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_register.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_register.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver
- * Copyright(c) 2013 - 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_REGISTER_H_
 #define _I40E_REGISTER_H_
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_status.h b/drivers/net/ethernet/intel/i40evf/i40e_status.h
index 0d7993ecb99a..77be0702d07c 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_status.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_status.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver
- * Copyright(c) 2013 - 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_STATUS_H_
 #define _I40E_STATUS_H_
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_trace.h b/drivers/net/ethernet/intel/i40evf/i40e_trace.h
index ece01dd12a3c..d7a4e68820a8 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_trace.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_trace.h
@@ -1,26 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel(R) 40-10 Gigabit Ethernet Virtual Function Driver
- * Copyright(c) 2013 - 2017 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 /* Modeled on trace-events-sample.h */
 
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
index 12bd937861e7..87101f565be1 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
@@ -1,29 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include <linux/prefetch.h>
 #include <net/busy_poll.h>
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.h b/drivers/net/ethernet/intel/i40evf/i40e_txrx.h
index 5790897eae2e..3b5a63b3236e 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_TXRX_H_
 #define _I40E_TXRX_H_
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_type.h b/drivers/net/ethernet/intel/i40evf/i40e_type.h
index 449de4b0058e..c77cb9f672d8 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_type.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_type.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver
- * Copyright(c) 2013 - 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40E_TYPE_H_
 #define _I40E_TYPE_H_
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf.h b/drivers/net/ethernet/intel/i40evf/i40evf.h
index 3a7a1e77bf39..98b834932dd3 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf.h
+++ b/drivers/net/ethernet/intel/i40evf/i40evf.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #ifndef _I40EVF_H_
 #define _I40EVF_H_
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_client.c b/drivers/net/ethernet/intel/i40evf/i40evf_client.c
index da60ce12b33d..a30d093a52d6 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_client.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_client.c
@@ -1,4 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
+
 #include <linux/list.h>
 #include <linux/errno.h>
 
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_client.h b/drivers/net/ethernet/intel/i40evf/i40evf_client.h
index 15a10da5bd4a..fc6592c3de9c 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_client.h
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_client.h
@@ -1,4 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
+
 #ifndef _I40E_CLIENT_H_
 #define _I40E_CLIENT_H_
 
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c b/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c
index dc4cde274fb8..404595efea78 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c
@@ -1,29 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 /* ethtool support for i40evf */
 #include "i40evf.h"
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
index 5f71532be7f1..97cda4a8f8e0 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
@@ -1,29 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver
- * Copyright(c) 2013 - 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include "i40evf.h"
 #include "i40e_prototype.h"
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c b/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c
index 26a59890532f..8bcefb7f7ac0 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c
@@ -1,29 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
- *
- * Intel Ethernet Controller XL710 Family Linux Virtual Function Driver
- * Copyright(c) 2013 - 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include "i40evf.h"
 #include "i40e_prototype.h"
diff --git a/drivers/net/ethernet/intel/igb/Makefile b/drivers/net/ethernet/intel/igb/Makefile
index c48583e98ac1..394c1e0656b9 100644
--- a/drivers/net/ethernet/intel/igb/Makefile
+++ b/drivers/net/ethernet/intel/igb/Makefile
@@ -1,31 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-################################################################################
-#
-# Intel 82575 PCI-Express Ethernet Linux driver
-# Copyright(c) 1999 - 2014 Intel Corporation.
-#
-# This program is free software; you can redistribute it and/or modify it
-# under the terms and conditions of the GNU General Public License,
-# version 2, as published by the Free Software Foundation.
-#
-# This program is distributed in the hope it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-# more details.
-#
-# You should have received a copy of the GNU General Public License along with
-# this program; if not, see <http://www.gnu.org/licenses/>.
-#
-# The full GNU General Public License is included in this distribution in
-# the file called "COPYING".
-#
-# Contact Information:
-# Linux NICS <linux.nics@intel.com>
-# e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-# Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-#
-################################################################################
-
+# Copyright(c) 1999 - 2018 Intel Corporation.
 #
 # Makefile for the Intel(R) 82575 PCI-Express ethernet driver
 #
diff --git a/drivers/net/ethernet/intel/igb/e1000_82575.c b/drivers/net/ethernet/intel/igb/e1000_82575.c
index dd9b6cac220d..b13b42e5a1d9 100644
--- a/drivers/net/ethernet/intel/igb/e1000_82575.c
+++ b/drivers/net/ethernet/intel/igb/e1000_82575.c
@@ -1,26 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel(R) Gigabit Ethernet Linux driver
- * Copyright(c) 2007-2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2007 - 2018 Intel Corporation. */
 
 /* e1000_82575
  * e1000_82576
diff --git a/drivers/net/ethernet/intel/igb/e1000_82575.h b/drivers/net/ethernet/intel/igb/e1000_82575.h
index e53ebe97d709..6ad775b1a4c5 100644
--- a/drivers/net/ethernet/intel/igb/e1000_82575.h
+++ b/drivers/net/ethernet/intel/igb/e1000_82575.h
@@ -1,26 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel(R) Gigabit Ethernet Linux driver
- * Copyright(c) 2007-2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2007 - 2018 Intel Corporation. */
 
 #ifndef _E1000_82575_H_
 #define _E1000_82575_H_
diff --git a/drivers/net/ethernet/intel/igb/e1000_defines.h b/drivers/net/ethernet/intel/igb/e1000_defines.h
index d3d1d868e7ba..252440a418dc 100644
--- a/drivers/net/ethernet/intel/igb/e1000_defines.h
+++ b/drivers/net/ethernet/intel/igb/e1000_defines.h
@@ -1,26 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel(R) Gigabit Ethernet Linux driver
- * Copyright(c) 2007-2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2007 - 2018 Intel Corporation. */
 
 #ifndef _E1000_DEFINES_H_
 #define _E1000_DEFINES_H_
diff --git a/drivers/net/ethernet/intel/igb/e1000_hw.h b/drivers/net/ethernet/intel/igb/e1000_hw.h
index ff835e1e853d..5d87957b2627 100644
--- a/drivers/net/ethernet/intel/igb/e1000_hw.h
+++ b/drivers/net/ethernet/intel/igb/e1000_hw.h
@@ -1,25 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel(R) Gigabit Ethernet Linux driver
- * Copyright(c) 2007-2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2007 - 2018 Intel Corporation. */
 
 #ifndef _E1000_HW_H_
 #define _E1000_HW_H_
diff --git a/drivers/net/ethernet/intel/igb/e1000_i210.c b/drivers/net/ethernet/intel/igb/e1000_i210.c
index 6f548247e6d8..c54ebedca6da 100644
--- a/drivers/net/ethernet/intel/igb/e1000_i210.c
+++ b/drivers/net/ethernet/intel/igb/e1000_i210.c
@@ -1,26 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel(R) Gigabit Ethernet Linux driver
- * Copyright(c) 2007-2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2007 - 2018 Intel Corporation. */
 
 /* e1000_i210
  * e1000_i211
diff --git a/drivers/net/ethernet/intel/igb/e1000_i210.h b/drivers/net/ethernet/intel/igb/e1000_i210.h
index 56f015ccb206..5c437fdc49ee 100644
--- a/drivers/net/ethernet/intel/igb/e1000_i210.h
+++ b/drivers/net/ethernet/intel/igb/e1000_i210.h
@@ -1,26 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel(R) Gigabit Ethernet Linux driver
- * Copyright(c) 2007-2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2007 - 2018 Intel Corporation. */
 
 #ifndef _E1000_I210_H_
 #define _E1000_I210_H_
diff --git a/drivers/net/ethernet/intel/igb/e1000_mac.c b/drivers/net/ethernet/intel/igb/e1000_mac.c
index 298afa0d9159..79ee0a747260 100644
--- a/drivers/net/ethernet/intel/igb/e1000_mac.c
+++ b/drivers/net/ethernet/intel/igb/e1000_mac.c
@@ -1,26 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel(R) Gigabit Ethernet Linux driver
- * Copyright(c) 2007-2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2007 - 2018 Intel Corporation. */
 
 #include <linux/if_ether.h>
 #include <linux/delay.h>
diff --git a/drivers/net/ethernet/intel/igb/e1000_mac.h b/drivers/net/ethernet/intel/igb/e1000_mac.h
index 04d80c765aee..6e110f28f922 100644
--- a/drivers/net/ethernet/intel/igb/e1000_mac.h
+++ b/drivers/net/ethernet/intel/igb/e1000_mac.h
@@ -1,26 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel(R) Gigabit Ethernet Linux driver
- * Copyright(c) 2007-2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2007 - 2018 Intel Corporation. */
 
 #ifndef _E1000_MAC_H_
 #define _E1000_MAC_H_
diff --git a/drivers/net/ethernet/intel/igb/e1000_mbx.c b/drivers/net/ethernet/intel/igb/e1000_mbx.c
index ef42f1689b3b..46debd991bfe 100644
--- a/drivers/net/ethernet/intel/igb/e1000_mbx.c
+++ b/drivers/net/ethernet/intel/igb/e1000_mbx.c
@@ -1,26 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel(R) Gigabit Ethernet Linux driver
- * Copyright(c) 2007-2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2007 - 2018 Intel Corporation. */
 
 #include "e1000_mbx.h"
 
diff --git a/drivers/net/ethernet/intel/igb/e1000_mbx.h b/drivers/net/ethernet/intel/igb/e1000_mbx.h
index 4f0ecd28354d..178e60ec71d4 100644
--- a/drivers/net/ethernet/intel/igb/e1000_mbx.h
+++ b/drivers/net/ethernet/intel/igb/e1000_mbx.h
@@ -1,26 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel(R) Gigabit Ethernet Linux driver
- * Copyright(c) 2007-2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2007 - 2018 Intel Corporation. */
 
 #ifndef _E1000_MBX_H_
 #define _E1000_MBX_H_
diff --git a/drivers/net/ethernet/intel/igb/e1000_nvm.c b/drivers/net/ethernet/intel/igb/e1000_nvm.c
index e4596f151cd4..09f4dcb09632 100644
--- a/drivers/net/ethernet/intel/igb/e1000_nvm.c
+++ b/drivers/net/ethernet/intel/igb/e1000_nvm.c
@@ -1,25 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel(R) Gigabit Ethernet Linux driver
- * Copyright(c) 2007-2014 Intel Corporation.
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2007 - 2018 Intel Corporation. */
 
 #include <linux/if_ether.h>
 #include <linux/delay.h>
diff --git a/drivers/net/ethernet/intel/igb/e1000_nvm.h b/drivers/net/ethernet/intel/igb/e1000_nvm.h
index dde68cd54a53..091cddf4ada8 100644
--- a/drivers/net/ethernet/intel/igb/e1000_nvm.h
+++ b/drivers/net/ethernet/intel/igb/e1000_nvm.h
@@ -1,26 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel(R) Gigabit Ethernet Linux driver
- * Copyright(c) 2007-2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2007 - 2018 Intel Corporation. */
 
 #ifndef _E1000_NVM_H_
 #define _E1000_NVM_H_
diff --git a/drivers/net/ethernet/intel/igb/e1000_phy.c b/drivers/net/ethernet/intel/igb/e1000_phy.c
index 4ec61243da82..2be0e762ec69 100644
--- a/drivers/net/ethernet/intel/igb/e1000_phy.c
+++ b/drivers/net/ethernet/intel/igb/e1000_phy.c
@@ -1,26 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel(R) Gigabit Ethernet Linux driver
- * Copyright(c) 2007-2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2007 - 2018 Intel Corporation. */
 
 #include <linux/if_ether.h>
 #include <linux/delay.h>
diff --git a/drivers/net/ethernet/intel/igb/e1000_phy.h b/drivers/net/ethernet/intel/igb/e1000_phy.h
index 856d2cda0643..5894e4b1d0a8 100644
--- a/drivers/net/ethernet/intel/igb/e1000_phy.h
+++ b/drivers/net/ethernet/intel/igb/e1000_phy.h
@@ -1,26 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel(R) Gigabit Ethernet Linux driver
- * Copyright(c) 2007-2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2007 - 2018 Intel Corporation. */
 
 #ifndef _E1000_PHY_H_
 #define _E1000_PHY_H_
diff --git a/drivers/net/ethernet/intel/igb/e1000_regs.h b/drivers/net/ethernet/intel/igb/e1000_regs.h
index e8fa8c6530e0..0ad737d2f289 100644
--- a/drivers/net/ethernet/intel/igb/e1000_regs.h
+++ b/drivers/net/ethernet/intel/igb/e1000_regs.h
@@ -1,26 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel(R) Gigabit Ethernet Linux driver
- * Copyright(c) 2007-2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2007 - 2018 Intel Corporation. */
 
 #ifndef _E1000_REGS_H_
 #define _E1000_REGS_H_
diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index 990a7fb32e4e..9643b5b3d444 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -1,26 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Intel(R) Gigabit Ethernet Linux driver
- * Copyright(c) 2007-2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2007 - 2018 Intel Corporation. */
 
 /* Linux PRO/1000 Ethernet Driver main header file */
 
diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c
index 6697c273ab59..2d798499d35e 100644
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
@@ -1,26 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel(R) Gigabit Ethernet Linux driver
- * Copyright(c) 2007-2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2007 - 2018 Intel Corporation. */
 
 /* ethtool support for igb */
 
diff --git a/drivers/net/ethernet/intel/igb/igb_hwmon.c b/drivers/net/ethernet/intel/igb/igb_hwmon.c
index bebe43b3a836..3b83747b2700 100644
--- a/drivers/net/ethernet/intel/igb/igb_hwmon.c
+++ b/drivers/net/ethernet/intel/igb/igb_hwmon.c
@@ -1,26 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel(R) Gigabit Ethernet Linux driver
- * Copyright(c) 2007-2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2007 - 2018 Intel Corporation. */
 
 #include "igb.h"
 #include "e1000_82575.h"
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 3ce43207a35f..78574c06635b 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -1,26 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Intel(R) Gigabit Ethernet Linux driver
- * Copyright(c) 2007-2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- */
+/* Copyright(c) 2007 - 2018 Intel Corporation. */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
diff --git a/drivers/net/ethernet/intel/igb/igb_ptp.c b/drivers/net/ethernet/intel/igb/igb_ptp.c
index 7454b9895a65..9f4d700e09df 100644
--- a/drivers/net/ethernet/intel/igb/igb_ptp.c
+++ b/drivers/net/ethernet/intel/igb/igb_ptp.c
@@ -1,21 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0+
-/* PTP Hardware Clock (PHC) driver for the Intel 82576 and 82580
- *
- * Copyright (C) 2011 Richard Cochran <richardcochran@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, see <http://www.gnu.org/licenses/>.
- */
+/* Copyright (C) 2011 Richard Cochran <richardcochran@gmail.com> */
+
 #include <linux/module.h>
 #include <linux/device.h>
 #include <linux/pci.h>
diff --git a/drivers/net/ethernet/intel/igbvf/Makefile b/drivers/net/ethernet/intel/igbvf/Makefile
index efe29dae384a..afd3e36eae75 100644
--- a/drivers/net/ethernet/intel/igbvf/Makefile
+++ b/drivers/net/ethernet/intel/igbvf/Makefile
@@ -1,31 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-################################################################################
-#
-# Intel(R) 82576 Virtual Function Linux driver
-# Copyright(c) 2009 - 2012 Intel Corporation.
-#
-# This program is free software; you can redistribute it and/or modify it
-# under the terms and conditions of the GNU General Public License,
-# version 2, as published by the Free Software Foundation.
-#
-# This program is distributed in the hope it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-# more details.
-#
-# You should have received a copy of the GNU General Public License along with
-# this program; if not, write to the Free Software Foundation, Inc.,
-# 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-#
-# The full GNU General Public License is included in this distribution in
-# the file called "COPYING".
-#
-# Contact Information:
-# e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-# Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-#
-################################################################################
-
+# Copyright(c) 2009 - 2018 Intel Corporation.
 #
 # Makefile for the Intel(R) 82576 VF ethernet driver
 #
diff --git a/drivers/net/ethernet/intel/igbvf/defines.h b/drivers/net/ethernet/intel/igbvf/defines.h
index 04bcfec0641b..4437f832412d 100644
--- a/drivers/net/ethernet/intel/igbvf/defines.h
+++ b/drivers/net/ethernet/intel/igbvf/defines.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel(R) 82576 Virtual Function Linux driver
-  Copyright(c) 1999 - 2012 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, see <http://www.gnu.org/licenses/>.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef _E1000_DEFINES_H_
 #define _E1000_DEFINES_H_
diff --git a/drivers/net/ethernet/intel/igbvf/ethtool.c b/drivers/net/ethernet/intel/igbvf/ethtool.c
index ca39e3cccaeb..3ae358b35227 100644
--- a/drivers/net/ethernet/intel/igbvf/ethtool.c
+++ b/drivers/net/ethernet/intel/igbvf/ethtool.c
@@ -1,29 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
-
-  Intel(R) 82576 Virtual Function Linux driver
-  Copyright(c) 2009 - 2012 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, see <http://www.gnu.org/licenses/>.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 2009 - 2018 Intel Corporation. */
 
 /* ethtool support for igbvf */
 
diff --git a/drivers/net/ethernet/intel/igbvf/igbvf.h b/drivers/net/ethernet/intel/igbvf/igbvf.h
index f5bf248e22eb..eee26a3be90b 100644
--- a/drivers/net/ethernet/intel/igbvf/igbvf.h
+++ b/drivers/net/ethernet/intel/igbvf/igbvf.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel(R) 82576 Virtual Function Linux driver
-  Copyright(c) 2009 - 2012 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, see <http://www.gnu.org/licenses/>.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 2009 - 2018 Intel Corporation. */
 
 /* Linux PRO/1000 Ethernet Driver main header file */
 
diff --git a/drivers/net/ethernet/intel/igbvf/mbx.c b/drivers/net/ethernet/intel/igbvf/mbx.c
index 9195884096f8..163e5838f7c2 100644
--- a/drivers/net/ethernet/intel/igbvf/mbx.c
+++ b/drivers/net/ethernet/intel/igbvf/mbx.c
@@ -1,29 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
-
-  Intel(R) 82576 Virtual Function Linux driver
-  Copyright(c) 2009 - 2012 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, see <http://www.gnu.org/licenses/>.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 2009 - 2018 Intel Corporation. */
 
 #include "mbx.h"
 
diff --git a/drivers/net/ethernet/intel/igbvf/mbx.h b/drivers/net/ethernet/intel/igbvf/mbx.h
index 479b062fe9ee..e5b31818d565 100644
--- a/drivers/net/ethernet/intel/igbvf/mbx.h
+++ b/drivers/net/ethernet/intel/igbvf/mbx.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel(R) 82576 Virtual Function Linux driver
-  Copyright(c) 1999 - 2012 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, see <http://www.gnu.org/licenses/>.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef _E1000_MBX_H_
 #define _E1000_MBX_H_
diff --git a/drivers/net/ethernet/intel/igbvf/netdev.c b/drivers/net/ethernet/intel/igbvf/netdev.c
index e2b7502f1953..f818f060e5a7 100644
--- a/drivers/net/ethernet/intel/igbvf/netdev.c
+++ b/drivers/net/ethernet/intel/igbvf/netdev.c
@@ -1,29 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
-
-  Intel(R) 82576 Virtual Function Linux driver
-  Copyright(c) 2009 - 2012 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, see <http://www.gnu.org/licenses/>.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 2009 - 2018 Intel Corporation. */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
diff --git a/drivers/net/ethernet/intel/igbvf/regs.h b/drivers/net/ethernet/intel/igbvf/regs.h
index 614e52409f11..625a309a3355 100644
--- a/drivers/net/ethernet/intel/igbvf/regs.h
+++ b/drivers/net/ethernet/intel/igbvf/regs.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel(R) 82576 Virtual Function Linux driver
-  Copyright(c) 2009 - 2012 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, see <http://www.gnu.org/licenses/>.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 2009 - 2018 Intel Corporation. */
 
 #ifndef _E1000_REGS_H_
 #define _E1000_REGS_H_
diff --git a/drivers/net/ethernet/intel/igbvf/vf.c b/drivers/net/ethernet/intel/igbvf/vf.c
index bfe8d8297b2e..b8ba3f94c363 100644
--- a/drivers/net/ethernet/intel/igbvf/vf.c
+++ b/drivers/net/ethernet/intel/igbvf/vf.c
@@ -1,29 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
-
-  Intel(R) 82576 Virtual Function Linux driver
-  Copyright(c) 2009 - 2012 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, see <http://www.gnu.org/licenses/>.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 2009 - 2018 Intel Corporation. */
 
 #include "vf.h"
 
diff --git a/drivers/net/ethernet/intel/igbvf/vf.h b/drivers/net/ethernet/intel/igbvf/vf.h
index 193b50026246..c71b0d7dbcee 100644
--- a/drivers/net/ethernet/intel/igbvf/vf.h
+++ b/drivers/net/ethernet/intel/igbvf/vf.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel(R) 82576 Virtual Function Linux driver
-  Copyright(c) 2009 - 2012 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, see <http://www.gnu.org/licenses/>.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 2009 - 2018 Intel Corporation. */
 
 #ifndef _E1000_VF_H_
 #define _E1000_VF_H_
diff --git a/drivers/net/ethernet/intel/ixgb/Makefile b/drivers/net/ethernet/intel/ixgb/Makefile
index 1b42dd554dd2..2433e9300a33 100644
--- a/drivers/net/ethernet/intel/ixgb/Makefile
+++ b/drivers/net/ethernet/intel/ixgb/Makefile
@@ -1,32 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-################################################################################
-#
-# Intel PRO/10GbE Linux driver
 # Copyright(c) 1999 - 2008 Intel Corporation.
-#
-# This program is free software; you can redistribute it and/or modify it
-# under the terms and conditions of the GNU General Public License,
-# version 2, as published by the Free Software Foundation.
-#
-# This program is distributed in the hope it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-# more details.
-#
-# You should have received a copy of the GNU General Public License along with
-# this program; if not, write to the Free Software Foundation, Inc.,
-# 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-#
-# The full GNU General Public License is included in this distribution in
-# the file called "COPYING".
-#
-# Contact Information:
-# Linux NICS <linux.nics@intel.com>
-# e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-# Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-#
-################################################################################
-
 #
 # Makefile for the Intel(R) PRO/10GbE ethernet driver
 #
diff --git a/drivers/net/ethernet/intel/ixgb/ixgb.h b/drivers/net/ethernet/intel/ixgb/ixgb.h
index 92022841755f..e85271b68410 100644
--- a/drivers/net/ethernet/intel/ixgb/ixgb.h
+++ b/drivers/net/ethernet/intel/ixgb/ixgb.h
@@ -1,31 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel PRO/10GbE Linux driver
-  Copyright(c) 1999 - 2008 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2008 Intel Corporation. */
 
 #ifndef _IXGB_H_
 #define _IXGB_H_
diff --git a/drivers/net/ethernet/intel/ixgb/ixgb_ee.c b/drivers/net/ethernet/intel/ixgb/ixgb_ee.c
index eca216b9b859..129286fc1634 100644
--- a/drivers/net/ethernet/intel/ixgb/ixgb_ee.c
+++ b/drivers/net/ethernet/intel/ixgb/ixgb_ee.c
@@ -1,30 +1,5 @@
-/*******************************************************************************
-
-  Intel PRO/10GbE Linux driver
-  Copyright(c) 1999 - 2008 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 1999 - 2008 Intel Corporation. */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
diff --git a/drivers/net/ethernet/intel/ixgb/ixgb_ee.h b/drivers/net/ethernet/intel/ixgb/ixgb_ee.h
index 475297a810fe..3ee0a09e5d0a 100644
--- a/drivers/net/ethernet/intel/ixgb/ixgb_ee.h
+++ b/drivers/net/ethernet/intel/ixgb/ixgb_ee.h
@@ -1,31 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel PRO/10GbE Linux driver
-  Copyright(c) 1999 - 2008 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2008 Intel Corporation. */
 
 #ifndef _IXGB_EE_H_
 #define _IXGB_EE_H_
diff --git a/drivers/net/ethernet/intel/ixgb/ixgb_ethtool.c b/drivers/net/ethernet/intel/ixgb/ixgb_ethtool.c
index d10a0d242dda..43744bf0fc1c 100644
--- a/drivers/net/ethernet/intel/ixgb/ixgb_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgb/ixgb_ethtool.c
@@ -1,30 +1,5 @@
-/*******************************************************************************
-
-  Intel PRO/10GbE Linux driver
-  Copyright(c) 1999 - 2008 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 1999 - 2008 Intel Corporation. */
 
 /* ethtool support for ixgb */
 
diff --git a/drivers/net/ethernet/intel/ixgb/ixgb_hw.c b/drivers/net/ethernet/intel/ixgb/ixgb_hw.c
index bf9a220f71fb..cbaa933ef30d 100644
--- a/drivers/net/ethernet/intel/ixgb/ixgb_hw.c
+++ b/drivers/net/ethernet/intel/ixgb/ixgb_hw.c
@@ -1,30 +1,5 @@
-/*******************************************************************************
-
-  Intel PRO/10GbE Linux driver
-  Copyright(c) 1999 - 2008 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 1999 - 2008 Intel Corporation. */
 
 /* ixgb_hw.c
  * Shared functions for accessing and configuring the adapter
diff --git a/drivers/net/ethernet/intel/ixgb/ixgb_hw.h b/drivers/net/ethernet/intel/ixgb/ixgb_hw.h
index 19f36d87ef61..6064583095da 100644
--- a/drivers/net/ethernet/intel/ixgb/ixgb_hw.h
+++ b/drivers/net/ethernet/intel/ixgb/ixgb_hw.h
@@ -1,31 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel PRO/10GbE Linux driver
-  Copyright(c) 1999 - 2008 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2008 Intel Corporation. */
 
 #ifndef _IXGB_HW_H_
 #define _IXGB_HW_H_
diff --git a/drivers/net/ethernet/intel/ixgb/ixgb_ids.h b/drivers/net/ethernet/intel/ixgb/ixgb_ids.h
index 24e849902d60..9695b8215f01 100644
--- a/drivers/net/ethernet/intel/ixgb/ixgb_ids.h
+++ b/drivers/net/ethernet/intel/ixgb/ixgb_ids.h
@@ -1,31 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel PRO/10GbE Linux driver
-  Copyright(c) 1999 - 2008 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2008 Intel Corporation. */
 
 #ifndef _IXGB_IDS_H_
 #define _IXGB_IDS_H_
diff --git a/drivers/net/ethernet/intel/ixgb/ixgb_main.c b/drivers/net/ethernet/intel/ixgb/ixgb_main.c
index 2353c383f0a7..62f2173bc20e 100644
--- a/drivers/net/ethernet/intel/ixgb/ixgb_main.c
+++ b/drivers/net/ethernet/intel/ixgb/ixgb_main.c
@@ -1,30 +1,5 @@
-/*******************************************************************************
-
-  Intel PRO/10GbE Linux driver
-  Copyright(c) 1999 - 2008 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 1999 - 2008 Intel Corporation. */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
diff --git a/drivers/net/ethernet/intel/ixgb/ixgb_osdep.h b/drivers/net/ethernet/intel/ixgb/ixgb_osdep.h
index b1710379192e..7bd54efa698d 100644
--- a/drivers/net/ethernet/intel/ixgb/ixgb_osdep.h
+++ b/drivers/net/ethernet/intel/ixgb/ixgb_osdep.h
@@ -1,31 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel PRO/10GbE Linux driver
-  Copyright(c) 1999 - 2008 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2008 Intel Corporation. */
 
 /* glue for the OS independent part of ixgb
  * includes register access macros
diff --git a/drivers/net/ethernet/intel/ixgb/ixgb_param.c b/drivers/net/ethernet/intel/ixgb/ixgb_param.c
index 04a60640ddda..f0cadd532c53 100644
--- a/drivers/net/ethernet/intel/ixgb/ixgb_param.c
+++ b/drivers/net/ethernet/intel/ixgb/ixgb_param.c
@@ -1,30 +1,5 @@
-/*******************************************************************************
-
-  Intel PRO/10GbE Linux driver
-  Copyright(c) 1999 - 2008 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 1999 - 2008 Intel Corporation. */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
diff --git a/drivers/net/ethernet/intel/ixgbe/Makefile b/drivers/net/ethernet/intel/ixgbe/Makefile
index 4cd96c88cb5d..5414685189ce 100644
--- a/drivers/net/ethernet/intel/ixgbe/Makefile
+++ b/drivers/net/ethernet/intel/ixgbe/Makefile
@@ -1,32 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-################################################################################
-#
-# Intel 10 Gigabit PCI Express Linux driver
-# Copyright(c) 1999 - 2013 Intel Corporation.
-#
-# This program is free software; you can redistribute it and/or modify it
-# under the terms and conditions of the GNU General Public License,
-# version 2, as published by the Free Software Foundation.
-#
-# This program is distributed in the hope it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-# more details.
-#
-# You should have received a copy of the GNU General Public License along with
-# this program; if not, write to the Free Software Foundation, Inc.,
-# 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-#
-# The full GNU General Public License is included in this distribution in
-# the file called "COPYING".
-#
-# Contact Information:
-# Linux NICS <linux.nics@intel.com>
-# e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-# Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-#
-################################################################################
-
+# Copyright(c) 1999 - 2018 Intel Corporation.
 #
 # Makefile for the Intel(R) 10GbE PCI Express ethernet driver
 #
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 8fccca57cd6a..fc534e91c6b2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -1,31 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2016 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef _IXGBE_H_
 #define _IXGBE_H_
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c
index cb0fe5fedb33..eee277c1bedf 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c
@@ -1,31 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
-
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2016 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #include <linux/pci.h>
 #include <linux/delay.h>
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c
index 66a74f4651e8..42f63b943ea0 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c
@@ -1,31 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
-
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2016 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #include <linux/pci.h>
 #include <linux/delay.h>
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
index 633be93f3dbb..8d038837f72b 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
@@ -1,31 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
-
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2016 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #include <linux/pci.h>
 #include <linux/delay.h>
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h
index 2b311382167a..4b531e8ae38a 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h
@@ -1,31 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2016 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef _IXGBE_COMMON_H_
 #define _IXGBE_COMMON_H_
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb.c
index aaea8282bfd2..d26cea5b43bd 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb.c
@@ -1,31 +1,5 @@
-/*******************************************************************************
-
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2016 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
-
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #include "ixgbe.h"
 #include "ixgbe_type.h"
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb.h
index 73b6362d4327..60cd5863bf5e 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb.h
@@ -1,31 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2013 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef _DCB_CONFIG_H_
 #define _DCB_CONFIG_H_
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_82598.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_82598.c
index 085130626330..379ae747cdce 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_82598.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_82598.c
@@ -1,31 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/*******************************************************************************
-
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2013 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #include "ixgbe.h"
 #include "ixgbe_type.h"
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_82598.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_82598.h
index 7edce607f901..fdca41abb44c 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_82598.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_82598.h
@@ -1,31 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2013 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef _DCB_82598_CONFIG_H_
 #define _DCB_82598_CONFIG_H_
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_82599.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_82599.c
index 1eed6811e914..7948849840a5 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_82599.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_82599.c
@@ -1,30 +1,5 @@
-/*******************************************************************************
-
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2013 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #include "ixgbe.h"
 #include "ixgbe_type.h"
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_82599.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_82599.h
index fa030f0abc18..c6f084883cab 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_82599.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_82599.h
@@ -1,31 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2013 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef _DCB_82599_CONFIG_H_
 #define _DCB_82599_CONFIG_H_
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_nl.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_nl.c
index b33f3f87e4b1..c00332d2e02a 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_nl.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_nl.c
@@ -1,30 +1,5 @@
-/*******************************************************************************
-
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2014 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #include "ixgbe.h"
 #include <linux/dcbnl.h>
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c
index ad54080488ee..55fe8114fe99 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c
@@ -1,30 +1,6 @@
-/*******************************************************************************
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2013 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
 #include <linux/debugfs.h>
 #include <linux/module.h>
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index c0e6ab42e0e1..bdd179c29ea4 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -1,30 +1,5 @@
-/*******************************************************************************
-
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2016 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 /* ethtool support for ixgbe */
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c
index 7a09a40e4472..b5219e024f70 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c
@@ -1,30 +1,5 @@
-/*******************************************************************************
-
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2014 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #include "ixgbe.h"
 #include <linux/if_ether.h>
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.h
index cf1919901514..724f5382329f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.h
@@ -1,31 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2013 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef _IXGBE_FCOE_H
 #define _IXGBE_FCOE_H
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 68af127987bc..41af2b81e960 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -1,29 +1,5 @@
-/*******************************************************************************
- *
- * Intel 10 Gigabit PCI Express Linux driver
- * Copyright(c) 2017 Oracle and/or its affiliates. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * Linux NICS <linux.nics@intel.com>
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2017 Oracle and/or its affiliates. All rights reserved. */
 
 #include "ixgbe.h"
 #include <net/xfrm.h>
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.h
index 4f099f516645..9ef7faadda69 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.h
@@ -1,30 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 2017 Oracle and/or its affiliates. All rights reserved.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program.  If not, see <http://www.gnu.org/licenses/>.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 2017 Oracle and/or its affiliates. All rights reserved. */
 
 #ifndef _IXGBE_IPSEC_H_
 #define _IXGBE_IPSEC_H_
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
index ed4cbe94c355..893a9206e718 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
@@ -1,30 +1,5 @@
-/*******************************************************************************
-
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2016 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #include "ixgbe.h"
 #include "ixgbe_sriov.h"
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index b6e5cea84949..6652b201df5b 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1,30 +1,5 @@
-/*******************************************************************************
-
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2016 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #include <linux/types.h>
 #include <linux/module.h>
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.c
index a0cb84381cd0..5679293e53f7 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.c
@@ -1,30 +1,5 @@
-/*******************************************************************************
-
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2016 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #include <linux/pci.h>
 #include <linux/delay.h>
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h
index c4628b663590..e085b6520dac 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h
@@ -1,31 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2016 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef _IXGBE_MBX_H_
 #define _IXGBE_MBX_H_
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_model.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_model.h
index 72446644f9fa..fcae520647ce 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_model.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_model.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel 10 Gigabit PCI Express Linux drive
- * Copyright(c) 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The full GNU General Public License is included in this distribution in
- * the file called "COPYING".
- *
- * Contact Information:
- * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- * Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef _IXGBE_MODEL_H_
 #define _IXGBE_MODEL_H_
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c
index 91bde90f9265..919a7af84b42 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c
@@ -1,30 +1,5 @@
-/*******************************************************************************
-
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2014 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #include <linux/pci.h>
 #include <linux/delay.h>
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.h
index d6a7e77348c5..64e44e01c973 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.h
@@ -1,31 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2016 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef _IXGBE_PHY_H_
 #define _IXGBE_PHY_H_
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
index f6cc9166082a..b3e0d8bb5cbd 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
@@ -1,30 +1,6 @@
-/*******************************************************************************
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2016 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
 #include "ixgbe.h"
 #include <linux/ptp_classify.h>
 #include <linux/clocksource.h>
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
index bfc4171cd3f9..2649c06d877b 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
@@ -1,30 +1,5 @@
-/*******************************************************************************
-
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2015 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #include <linux/types.h>
 #include <linux/module.h>
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h
index e30d1f07e891..3ec21923c89c 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h
@@ -1,31 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2013 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef _IXGBE_SRIOV_H_
 #define _IXGBE_SRIOV_H_
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sysfs.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sysfs.c
index 24766e125592..204844288c16 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sysfs.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sysfs.c
@@ -1,30 +1,5 @@
-/*******************************************************************************
-
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2013 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #include "ixgbe.h"
 #include "ixgbe_common.h"
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
index 2daa81e6e9b2..e8ed37749ab1 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
@@ -1,31 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2016 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef _IXGBE_TYPE_H_
 #define _IXGBE_TYPE_H_
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c
index b8c5fd2a2115..de563cfd294d 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c
@@ -1,30 +1,5 @@
-/*******************************************************************************
-
-  Intel 10 Gigabit PCI Express Linux driver
-  Copyright(c) 1999 - 2016 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc.,
-  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  Linux NICS <linux.nics@intel.com>
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #include <linux/pci.h>
 #include <linux/delay.h>
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.h
index 182d640e9f7a..e246c0d2a427 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.h
@@ -1,27 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
- *
- * Intel 10 Gigabit PCI Express Linux driver
- *  Copyright(c) 1999 - 2014 Intel Corporation.
- *
- *  This program is free software; you can redistribute it and/or modify it
- *  under the terms and conditions of the GNU General Public License,
- *  version 2, as published by the Free Software Foundation.
- *
- *  This program is distributed in the hope it will be useful, but WITHOUT
- *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- *  more details.
- *
- *  The full GNU General Public License is included in this distribution in
- *  the file called "COPYING".
- *
- *  Contact Information:
- *  Linux NICS <linux.nics@intel.com>
- *  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- *  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- *****************************************************************************/
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #include "ixgbe_type.h"
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
index 3123267dfba9..959a37599a6b 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
@@ -1,26 +1,6 @@
-/*******************************************************************************
- *
- *  Intel 10 Gigabit PCI Express Linux driver
- *  Copyright(c) 1999 - 2016 Intel Corporation.
- *
- *  This program is free software; you can redistribute it and/or modify it
- *  under the terms and conditions of the GNU General Public License,
- *  version 2, as published by the Free Software Foundation.
- *
- *  This program is distributed in the hope it will be useful, but WITHOUT
- *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- *  more details.
- *
- *  The full GNU General Public License is included in this distribution in
- *  the file called "COPYING".
- *
- *  Contact Information:
- *  Linux NICS <linux.nics@intel.com>
- *  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- *  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
- *
- ******************************************************************************/
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
+
 #include "ixgbe_x540.h"
 #include "ixgbe_type.h"
 #include "ixgbe_common.h"
diff --git a/drivers/net/ethernet/intel/ixgbevf/Makefile b/drivers/net/ethernet/intel/ixgbevf/Makefile
index bb47814cfa90..aba1e6a37a6a 100644
--- a/drivers/net/ethernet/intel/ixgbevf/Makefile
+++ b/drivers/net/ethernet/intel/ixgbevf/Makefile
@@ -1,31 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-################################################################################
-#
-# Intel 82599 Virtual Function driver
-# Copyright(c) 1999 - 2012 Intel Corporation.
-#
-# This program is free software; you can redistribute it and/or modify it
-# under the terms and conditions of the GNU General Public License,
-# version 2, as published by the Free Software Foundation.
-#
-# This program is distributed in the hope it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-# more details.
-#
-# You should have received a copy of the GNU General Public License along with
-# this program; if not, write to the Free Software Foundation, Inc.,
-# 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-#
-# The full GNU General Public License is included in this distribution in
-# the file called "COPYING".
-#
-# Contact Information:
-# e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-# Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-#
-################################################################################
-
+# Copyright(c) 1999 - 2018 Intel Corporation.
 #
 # Makefile for the Intel(R) 82599 VF ethernet driver
 #
diff --git a/drivers/net/ethernet/intel/ixgbevf/defines.h b/drivers/net/ethernet/intel/ixgbevf/defines.h
index 71c828842b11..700d8eb2f6f8 100644
--- a/drivers/net/ethernet/intel/ixgbevf/defines.h
+++ b/drivers/net/ethernet/intel/ixgbevf/defines.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel 82599 Virtual Function driver
-  Copyright(c) 1999 - 2015 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, see <http://www.gnu.org/licenses/>.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef _IXGBEVF_DEFINES_H_
 #define _IXGBEVF_DEFINES_H_
diff --git a/drivers/net/ethernet/intel/ixgbevf/ethtool.c b/drivers/net/ethernet/intel/ixgbevf/ethtool.c
index 8e7d6c6f5c92..e7813d76527c 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ethtool.c
@@ -1,28 +1,5 @@
-/*******************************************************************************
-
-  Intel 82599 Virtual Function driver
-  Copyright(c) 1999 - 2018 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, see <http://www.gnu.org/licenses/>.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 /* ethtool support for ixgbevf */
 
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
index 447ce1d5e0e3..70c75681495f 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel 82599 Virtual Function driver
-  Copyright(c) 1999 - 2018 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, see <http://www.gnu.org/licenses/>.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef _IXGBEVF_H_
 #define _IXGBEVF_H_
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index e3d04f226d57..e91c3d1a43ce 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -1,28 +1,5 @@
-/*******************************************************************************
-
-  Intel 82599 Virtual Function driver
-  Copyright(c) 1999 - 2018 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, see <http://www.gnu.org/licenses/>.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 /******************************************************************************
  Copyright (c)2006 - 2007 Myricom, Inc. for some LRO specific code
diff --git a/drivers/net/ethernet/intel/ixgbevf/mbx.c b/drivers/net/ethernet/intel/ixgbevf/mbx.c
index 2819abc454c7..6bc1953263b9 100644
--- a/drivers/net/ethernet/intel/ixgbevf/mbx.c
+++ b/drivers/net/ethernet/intel/ixgbevf/mbx.c
@@ -1,28 +1,5 @@
-/*******************************************************************************
-
-  Intel 82599 Virtual Function driver
-  Copyright(c) 1999 - 2015 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, see <http://www.gnu.org/licenses/>.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #include "mbx.h"
 #include "ixgbevf.h"
diff --git a/drivers/net/ethernet/intel/ixgbevf/mbx.h b/drivers/net/ethernet/intel/ixgbevf/mbx.h
index 5ec947fe3d09..bfd9ae150808 100644
--- a/drivers/net/ethernet/intel/ixgbevf/mbx.h
+++ b/drivers/net/ethernet/intel/ixgbevf/mbx.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel 82599 Virtual Function driver
-  Copyright(c) 1999 - 2015 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, see <http://www.gnu.org/licenses/>.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef _IXGBE_MBX_H_
 #define _IXGBE_MBX_H_
diff --git a/drivers/net/ethernet/intel/ixgbevf/regs.h b/drivers/net/ethernet/intel/ixgbevf/regs.h
index 278f73980501..68d16ae5b65a 100644
--- a/drivers/net/ethernet/intel/ixgbevf/regs.h
+++ b/drivers/net/ethernet/intel/ixgbevf/regs.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel 82599 Virtual Function driver
-  Copyright(c) 1999 - 2015 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, see <http://www.gnu.org/licenses/>.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef _IXGBEVF_REGS_H_
 #define _IXGBEVF_REGS_H_
diff --git a/drivers/net/ethernet/intel/ixgbevf/vf.c b/drivers/net/ethernet/intel/ixgbevf/vf.c
index 38d3a327c1bc..bf0577e819e1 100644
--- a/drivers/net/ethernet/intel/ixgbevf/vf.c
+++ b/drivers/net/ethernet/intel/ixgbevf/vf.c
@@ -1,28 +1,5 @@
-/*******************************************************************************
-
-  Intel 82599 Virtual Function driver
-  Copyright(c) 1999 - 2015 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, see <http://www.gnu.org/licenses/>.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #include "vf.h"
 #include "ixgbevf.h"
diff --git a/drivers/net/ethernet/intel/ixgbevf/vf.h b/drivers/net/ethernet/intel/ixgbevf/vf.h
index 194fbdaa4519..d1e9e306653b 100644
--- a/drivers/net/ethernet/intel/ixgbevf/vf.h
+++ b/drivers/net/ethernet/intel/ixgbevf/vf.h
@@ -1,29 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*******************************************************************************
-
-  Intel 82599 Virtual Function driver
-  Copyright(c) 1999 - 2015 Intel Corporation.
-
-  This program is free software; you can redistribute it and/or modify it
-  under the terms and conditions of the GNU General Public License,
-  version 2, as published by the Free Software Foundation.
-
-  This program is distributed in the hope it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, see <http://www.gnu.org/licenses/>.
-
-  The full GNU General Public License is included in this distribution in
-  the file called "COPYING".
-
-  Contact Information:
-  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
-  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
-*******************************************************************************/
+/* Copyright(c) 1999 - 2018 Intel Corporation. */
 
 #ifndef __IXGBE_VF_H__
 #define __IXGBE_VF_H__
-- 
cgit v1.2.3-59-g8ed1b


From 3382576106014bf865d341efab3d94fb28d1fc63 Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Thu, 26 Apr 2018 17:18:20 +0200
Subject: net/smc: fix structure size

The struct smc_cdc_msg must be defined as packed so the
size is 44 bytes.
And change the structure size check so sizeof is checked.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_cdc.c | 2 +-
 net/smc/smc_cdc.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index b42395d24cba..42ad57365eca 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -82,7 +82,7 @@ static inline void smc_cdc_add_pending_send(struct smc_connection *conn,
 		sizeof(struct smc_cdc_msg) > SMC_WR_BUF_SIZE,
 		"must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_cdc_msg)");
 	BUILD_BUG_ON_MSG(
-		offsetof(struct smc_cdc_msg, reserved) > SMC_WR_TX_SIZE,
+		sizeof(struct smc_cdc_msg) != SMC_WR_TX_SIZE,
 		"must adapt SMC_WR_TX_SIZE to sizeof(struct smc_cdc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()");
 	BUILD_BUG_ON_MSG(
 		sizeof(struct smc_cdc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
index ab240b37ad11..d2012fd22100 100644
--- a/net/smc/smc_cdc.h
+++ b/net/smc/smc_cdc.h
@@ -48,7 +48,7 @@ struct smc_cdc_msg {
 	struct smc_cdc_producer_flags	prod_flags;
 	struct smc_cdc_conn_state_flags	conn_state_flags;
 	u8				reserved[18];
-} __aligned(8);
+} __packed;					/* format defined in RFC7609 */
 
 static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn)
 {
-- 
cgit v1.2.3-59-g8ed1b


From ee9dfbef02d186a90f3a4876b276701966a92d10 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Thu, 26 Apr 2018 17:18:21 +0200
Subject: net/smc: handle sockopts forcing fallback

Several TCP sockopts do not work for SMC. One example are the
TCP_FASTOPEN sockopts, since SMC-connection setup is based on the TCP
three-way-handshake.
If the SMC socket is still in state SMC_INIT, such sockopts trigger
fallback to TCP. Otherwise an error is returned.

Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 50 insertions(+), 4 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 4470501374bf..d274be7265ea 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -391,6 +391,9 @@ static int smc_connect_rdma(struct smc_sock *smc)
 
 	sock_hold(&smc->sk); /* sock put in passive closing */
 
+	if (smc->use_fallback)
+		goto out_connected;
+
 	if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
 		/* peer has not signalled SMC-capability */
 		smc->use_fallback = true;
@@ -790,6 +793,9 @@ static void smc_listen_work(struct work_struct *work)
 	int rc = 0;
 	u8 ibport;
 
+	if (new_smc->use_fallback)
+		goto out_connected;
+
 	/* check if peer is smc capable */
 	if (!tcp_sk(newclcsock->sk)->syn_smc) {
 		new_smc->use_fallback = true;
@@ -968,7 +974,7 @@ static void smc_tcp_listen_work(struct work_struct *work)
 			continue;
 
 		new_smc->listen_smc = lsmc;
-		new_smc->use_fallback = false; /* assume rdma capability first*/
+		new_smc->use_fallback = lsmc->use_fallback;
 		sock_hold(lsk); /* sock_put in smc_listen_work */
 		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
 		smc_copy_sock_settings_to_smc(new_smc);
@@ -1004,7 +1010,8 @@ static int smc_listen(struct socket *sock, int backlog)
 	 * them to the clc socket -- copy smc socket options to clc socket
 	 */
 	smc_copy_sock_settings_to_clc(smc);
-	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
+	if (!smc->use_fallback)
+		tcp_sk(smc->clcsock->sk)->syn_smc = 1;
 
 	rc = kernel_listen(smc->clcsock, backlog);
 	if (rc)
@@ -1097,6 +1104,16 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
 	    (sk->sk_state != SMC_INIT))
 		goto out;
+
+	if (msg->msg_flags & MSG_FASTOPEN) {
+		if (sk->sk_state == SMC_INIT) {
+			smc->use_fallback = true;
+		} else {
+			rc = -EINVAL;
+			goto out;
+		}
+	}
+
 	if (smc->use_fallback)
 		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
 	else
@@ -1274,14 +1291,43 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
 {
 	struct sock *sk = sock->sk;
 	struct smc_sock *smc;
+	int rc;
 
 	smc = smc_sk(sk);
 
 	/* generic setsockopts reaching us here always apply to the
 	 * CLC socket
 	 */
-	return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
-					     optval, optlen);
+	rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
+					   optval, optlen);
+	if (smc->clcsock->sk->sk_err) {
+		sk->sk_err = smc->clcsock->sk->sk_err;
+		sk->sk_error_report(sk);
+	}
+	if (rc)
+		return rc;
+
+	lock_sock(sk);
+	switch (optname) {
+	case TCP_ULP:
+	case TCP_FASTOPEN:
+	case TCP_FASTOPEN_CONNECT:
+	case TCP_FASTOPEN_KEY:
+	case TCP_FASTOPEN_NO_COOKIE:
+		/* option not supported by SMC */
+		if (sk->sk_state == SMC_INIT) {
+			smc->use_fallback = true;
+		} else {
+			if (!smc->use_fallback)
+				rc = -EINVAL;
+		}
+		break;
+	default:
+		break;
+	}
+	release_sock(sk);
+
+	return rc;
 }
 
 static int smc_getsockopt(struct socket *sock, int level, int optname,
-- 
cgit v1.2.3-59-g8ed1b


From 01d2f7e2cdd31becffafa0cb82809a5e36558ec0 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Thu, 26 Apr 2018 17:18:22 +0200
Subject: net/smc: sockopts TCP_NODELAY and TCP_CORK

Setting sockopt TCP_NODELAY or resetting sockopt TCP_CORK
triggers data transfer.

For a corked SMC socket RDMA writes are deferred, if there is
still sufficient send buffer space available.

Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c | 20 +++++++++++++++++++-
 net/smc/smc_tx.c | 24 +++++++++++++++++++++---
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index d274be7265ea..9d8b381281e3 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1291,7 +1291,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
 {
 	struct sock *sk = sock->sk;
 	struct smc_sock *smc;
-	int rc;
+	int val, rc;
 
 	smc = smc_sk(sk);
 
@@ -1307,6 +1307,10 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
 	if (rc)
 		return rc;
 
+	if (optlen < sizeof(int))
+		return rc;
+	get_user(val, (int __user *)optval);
+
 	lock_sock(sk);
 	switch (optname) {
 	case TCP_ULP:
@@ -1322,6 +1326,20 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
 				rc = -EINVAL;
 		}
 		break;
+	case TCP_NODELAY:
+		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
+			if (val)
+				mod_delayed_work(system_wq, &smc->conn.tx_work,
+						 0);
+		}
+		break;
+	case TCP_CORK:
+		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
+			if (!val)
+				mod_delayed_work(system_wq, &smc->conn.tx_work,
+						 0);
+		}
+		break;
 	default:
 		break;
 	}
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 72f004c9c9b1..58dfe0bd9d60 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -19,6 +19,7 @@
 #include <linux/sched/signal.h>
 
 #include <net/sock.h>
+#include <net/tcp.h>
 
 #include "smc.h"
 #include "smc_wr.h"
@@ -26,6 +27,7 @@
 #include "smc_tx.h"
 
 #define SMC_TX_WORK_DELAY	HZ
+#define SMC_TX_CORK_DELAY	(HZ >> 2)	/* 250 ms */
 
 /***************************** sndbuf producer *******************************/
 
@@ -115,6 +117,13 @@ static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
 	return rc;
 }
 
+static bool smc_tx_is_corked(struct smc_sock *smc)
+{
+	struct tcp_sock *tp = tcp_sk(smc->clcsock->sk);
+
+	return (tp->nonagle & TCP_NAGLE_CORK) ? true : false;
+}
+
 /* sndbuf producer: main API called by socket layer.
  * called under sock lock.
  */
@@ -209,7 +218,16 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
 		/* since we just produced more new data into sndbuf,
 		 * trigger sndbuf consumer: RDMA write into peer RMBE and CDC
 		 */
-		smc_tx_sndbuf_nonempty(conn);
+		if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) &&
+		    (atomic_read(&conn->sndbuf_space) >
+						(conn->sndbuf_size >> 1)))
+			/* for a corked socket defer the RDMA writes if there
+			 * is still sufficient sndbuf_space available
+			 */
+			schedule_delayed_work(&conn->tx_work,
+					      SMC_TX_CORK_DELAY);
+		else
+			smc_tx_sndbuf_nonempty(conn);
 	} /* while (msg_data_left(msg)) */
 
 	return send_done;
@@ -409,8 +427,8 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
 			}
 			rc = 0;
 			if (conn->alert_token_local) /* connection healthy */
-				schedule_delayed_work(&conn->tx_work,
-						      SMC_TX_WORK_DELAY);
+				mod_delayed_work(system_wq, &conn->tx_work,
+						 SMC_TX_WORK_DELAY);
 		}
 		goto out_unlock;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From abb190f194d082cbb7520e692d78d3ddf050e7b1 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Thu, 26 Apr 2018 17:18:23 +0200
Subject: net/smc: handle sockopt TCP_DEFER_ACCEPT

If sockopt TCP_DEFER_ACCEPT is set, the accept is delayed till
data is available.

Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c | 26 +++++++++++++++++++++++++-
 net/smc/smc.h    |  4 ++++
 net/smc/smc_rx.c |  2 +-
 net/smc/smc_rx.h |  1 +
 4 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 9d8b381281e3..20aa4175b9f8 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1044,6 +1044,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock,
 
 	if (lsmc->sk.sk_state != SMC_LISTEN) {
 		rc = -EINVAL;
+		release_sock(sk);
 		goto out;
 	}
 
@@ -1071,9 +1072,29 @@ static int smc_accept(struct socket *sock, struct socket *new_sock,
 
 	if (!rc)
 		rc = sock_error(nsk);
+	release_sock(sk);
+	if (rc)
+		goto out;
+
+	if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
+		/* wait till data arrives on the socket */
+		timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
+								MSEC_PER_SEC);
+		if (smc_sk(nsk)->use_fallback) {
+			struct sock *clcsk = smc_sk(nsk)->clcsock->sk;
+
+			lock_sock(clcsk);
+			if (skb_queue_empty(&clcsk->sk_receive_queue))
+				sk_wait_data(clcsk, &timeo, NULL);
+			release_sock(clcsk);
+		} else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
+			lock_sock(nsk);
+			smc_rx_wait_data(smc_sk(nsk), &timeo);
+			release_sock(nsk);
+		}
+	}
 
 out:
-	release_sock(sk);
 	sock_put(sk); /* sock_hold above */
 	return rc;
 }
@@ -1340,6 +1361,9 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
 						 0);
 		}
 		break;
+	case TCP_DEFER_ACCEPT:
+		smc->sockopt_defer_accept = val;
+		break;
 	default:
 		break;
 	}
diff --git a/net/smc/smc.h b/net/smc/smc.h
index e4829a2f46ba..2405e889b93d 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -180,6 +180,10 @@ struct smc_sock {				/* smc sock container */
 	struct list_head	accept_q;	/* sockets to be accepted */
 	spinlock_t		accept_q_lock;	/* protects accept_q */
 	bool			use_fallback;	/* fallback to tcp */
+	int			sockopt_defer_accept;
+						/* sockopt TCP_DEFER_ACCEPT
+						 * value
+						 */
 	u8			wait_close_tx_prepared : 1;
 						/* shutdown wr or close
 						 * started, waiting for unsent
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
index eff4e0d0bb31..af851d8df1f8 100644
--- a/net/smc/smc_rx.c
+++ b/net/smc/smc_rx.c
@@ -51,7 +51,7 @@ static void smc_rx_data_ready(struct sock *sk)
  * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown.
  * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted).
  */
-static int smc_rx_wait_data(struct smc_sock *smc, long *timeo)
+int smc_rx_wait_data(struct smc_sock *smc, long *timeo)
 {
 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 	struct smc_connection *conn = &smc->conn;
diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h
index 3a32b59bf06c..0b75a6b470e6 100644
--- a/net/smc/smc_rx.h
+++ b/net/smc/smc_rx.h
@@ -20,5 +20,6 @@
 void smc_rx_init(struct smc_sock *smc);
 int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
 		   int flags);
+int smc_rx_wait_data(struct smc_sock *smc, long *timeo);
 
 #endif /* SMC_RX_H */
-- 
cgit v1.2.3-59-g8ed1b


From c36207bd8774768fcb1de27bd06f3cd866db2421 Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Thu, 26 Apr 2018 09:58:10 -0700
Subject: tcp: remove mss check in tcp_select_initial_window()

In tcp_select_initial_window(), we only set rcv_wnd to
tcp_default_init_rwnd() if current mss > (1 << wscale). Otherwise,
rcv_wnd is kept at the full receive space of the socket which is a
value way larger than tcp_default_init_rwnd().
With larger initial rcv_wnd value, receive buffer autotuning logic
takes longer to kick in and increase the receive buffer.

In a TCP throughput test where receiver has rmem[2] set to 125MB
(wscale is 11), we see the connection gets recvbuf limited at the
beginning of the connection and gets less throughput overall.

Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 95feffb6d53f..d07c0dcc99aa 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -229,11 +229,9 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
 		}
 	}
 
-	if (mss > (1 << *rcv_wscale)) {
-		if (!init_rcv_wnd) /* Use default unless specified otherwise */
-			init_rcv_wnd = tcp_default_init_rwnd(mss);
-		*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
-	}
+	if (!init_rcv_wnd) /* Use default unless specified otherwise */
+		init_rcv_wnd = tcp_default_init_rwnd(mss);
+	*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
 
 	/* Set the clamp no higher than max representable value */
 	(*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
-- 
cgit v1.2.3-59-g8ed1b


From 5a643c861d0f20b1b1bf8c694a24759d9219f1e1 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Thu, 26 Apr 2018 19:41:02 +0200
Subject: selftests: pmtu: Minimum MTU for vti6 is 68

A vti6 interface can carry IPv4 packets too.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/pmtu.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
index 1e428781a625..7651fd4d86fe 100755
--- a/tools/testing/selftests/net/pmtu.sh
+++ b/tools/testing/selftests/net/pmtu.sh
@@ -368,7 +368,7 @@ test_pmtu_vti6_link_add_mtu() {
 
 	fail=0
 
-	min=1280
+	min=68			# vti6 can carry IPv4 packets too
 	max=$((65535 - 40))
 	# Check invalid values first
 	for v in $((min - 1)) $((max + 1)); do
@@ -384,7 +384,7 @@ test_pmtu_vti6_link_add_mtu() {
 	done
 
 	# Now check valid values
-	for v in 1280 1300 $((65535 - 40)); do
+	for v in 68 1280 1300 $((65535 - 40)); do
 		${ns_a} ip link add vti6_a mtu ${v} type vti6 local ${veth6_a_addr} remote ${veth6_b_addr} key 10
 		mtu="$(link_get_mtu "${ns_a}" vti6_a)"
 		${ns_a} ip link del vti6_a
-- 
cgit v1.2.3-59-g8ed1b


From c45698f89626177f8c51409142cbe9c5bbed4af7 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Thu, 26 Apr 2018 16:58:50 -0300
Subject: sctp: remove old and unused SCTP_MIN_PMTU

This value is not used anywhere in the code. In essence it is a
duplicate of SCTP_DEFAULT_MINSEGMENT, which is used by the stack.

SCTP_MIN_PMTU value makes more sense, but we should not change to it now
as it would risk breaking applications.

So this patch removes SCTP_MIN_PMTU and adjust the comment above it.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/constants.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 20ff237c5eb2..86f034b524d4 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -254,11 +254,10 @@ enum { SCTP_ARBITRARY_COOKIE_ECHO_LEN = 200 };
 #define SCTP_TSN_MAP_SIZE 4096
 
 /* We will not record more than this many duplicate TSNs between two
- * SACKs.  The minimum PMTU is 576.  Remove all the headers and there
- * is enough room for 131 duplicate reports.  Round down to the
+ * SACKs.  The minimum PMTU is 512.  Remove all the headers and there
+ * is enough room for 117 duplicate reports.  Round down to the
  * nearest power of 2.
  */
-enum { SCTP_MIN_PMTU = 576 };
 enum { SCTP_MAX_DUP_TSNS = 16 };
 enum { SCTP_MAX_GABS = 16 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 800e00c12733fe2ed565206dcdb515fa2f705b9f Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Thu, 26 Apr 2018 16:58:51 -0300
Subject: sctp: move transport pathmtu calc away of sctp_assoc_add_peer

There was only one case that sctp_assoc_add_peer couldn't handle, which
is when SPP_PMTUD_DISABLE is set and pathmtu not initialized.
So add this situation to sctp_transport_route and reuse what was
already in there.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/associola.c | 9 +--------
 net/sctp/transport.c | 8 ++++++--
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index a8f3b088fcb2..b3aa95222bd5 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -652,15 +652,8 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 	 */
 	peer->param_flags = asoc->param_flags;
 
-	sctp_transport_route(peer, NULL, sp);
-
 	/* Initialize the pmtu of the transport. */
-	if (peer->param_flags & SPP_PMTUD_DISABLE) {
-		if (asoc->pathmtu)
-			peer->pathmtu = asoc->pathmtu;
-		else
-			peer->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
-	}
+	sctp_transport_route(peer, NULL, sp);
 
 	/* If this is the first transport addr on this association,
 	 * initialize the association PMTU to the peer's PMTU.
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 47f82bd794d9..c5fc3aed08a1 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -307,11 +307,15 @@ void sctp_transport_route(struct sctp_transport *transport,
 		 * association's active path for getsockname().
 		 */
 		if (asoc && (!asoc->peer.primary_path ||
-				(transport == asoc->peer.active_path)))
+			     (transport == asoc->peer.active_path)))
 			opt->pf->to_sk_saddr(&transport->saddr,
 					     asoc->base.sk);
-	} else
+	} else if ((transport->param_flags & SPP_PMTUD_DISABLE) &&
+		   asoc && asoc->pathmtu) {
+		transport->pathmtu = asoc->pathmtu;
+	} else {
 		transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
+	}
 }
 
 /* Hold a reference to a transport.  */
-- 
cgit v1.2.3-59-g8ed1b


From c88da20f95ad0bfa49e3e9035c26aaac0b05d11d Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Thu, 26 Apr 2018 16:58:52 -0300
Subject: sctp: remove an if() that is always true

As noticed by Xin Long, the if() here is always true as PMTU can never
be 0.

Reported-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/associola.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index b3aa95222bd5..c5ed09cfa842 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1397,10 +1397,8 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc)
 			pmtu = t->pathmtu;
 	}
 
-	if (pmtu) {
-		asoc->pathmtu = pmtu;
-		asoc->frag_point = sctp_frag_point(asoc, pmtu);
-	}
+	asoc->pathmtu = pmtu;
+	asoc->frag_point = sctp_frag_point(asoc, pmtu);
 
 	pr_debug("%s: asoc:%p, pmtu:%d, frag_point:%d\n", __func__, asoc,
 		 asoc->pathmtu, asoc->frag_point);
-- 
cgit v1.2.3-59-g8ed1b


From c4b2893dae6427ce1e528033383c94cbf81e80d8 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Thu, 26 Apr 2018 16:58:53 -0300
Subject: sctp: introduce sctp_assoc_set_pmtu

All changes to asoc PMTU should now go through this wrapper, making it
easier to track them and to do other actions upon it.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h |  1 +
 net/sctp/associola.c       | 21 +++++++++++++--------
 net/sctp/socket.c          |  2 +-
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 05594b248e52..c5e244be8f1e 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -2097,6 +2097,7 @@ int sctp_assoc_update(struct sctp_association *old,
 
 __u32 sctp_association_get_next_tsn(struct sctp_association *);
 
+void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu);
 void sctp_assoc_sync_pmtu(struct sctp_association *asoc);
 void sctp_assoc_rwnd_increase(struct sctp_association *, unsigned int);
 void sctp_assoc_rwnd_decrease(struct sctp_association *, unsigned int);
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index c5ed09cfa842..85b362324084 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -660,13 +660,9 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 	 * If not and the current association PMTU is higher than the new
 	 * peer's PMTU, reset the association PMTU to the new peer's PMTU.
 	 */
-	if (asoc->pathmtu)
-		asoc->pathmtu = min_t(int, peer->pathmtu, asoc->pathmtu);
-	else
-		asoc->pathmtu = peer->pathmtu;
-
-	pr_debug("%s: association:%p PMTU set to %d\n", __func__, asoc,
-		 asoc->pathmtu);
+	sctp_assoc_set_pmtu(asoc, asoc->pathmtu ?
+				  min_t(int, peer->pathmtu, asoc->pathmtu) :
+				  peer->pathmtu);
 
 	peer->pmtu_pending = 0;
 
@@ -1374,6 +1370,15 @@ sctp_assoc_choose_alter_transport(struct sctp_association *asoc,
 	}
 }
 
+void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu)
+{
+	if (asoc->pathmtu != pmtu)
+		asoc->pathmtu = pmtu;
+
+	pr_debug("%s: asoc:%p, pmtu:%d, frag_point:%d\n", __func__, asoc,
+		 asoc->pathmtu, asoc->frag_point);
+}
+
 /* Update the association's pmtu and frag_point by going through all the
  * transports. This routine is called when a transport's PMTU has changed.
  */
@@ -1397,7 +1402,7 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc)
 			pmtu = t->pathmtu;
 	}
 
-	asoc->pathmtu = pmtu;
+	sctp_assoc_set_pmtu(asoc, pmtu);
 	asoc->frag_point = sctp_frag_point(asoc, pmtu);
 
 	pr_debug("%s: asoc:%p, pmtu:%d, frag_point:%d\n", __func__, asoc,
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 80835ac26d2c..eeec81d5c485 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2539,7 +2539,7 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
 			trans->pathmtu = params->spp_pathmtu;
 			sctp_assoc_sync_pmtu(asoc);
 		} else if (asoc) {
-			asoc->pathmtu = params->spp_pathmtu;
+			sctp_assoc_set_pmtu(asoc, params->spp_pathmtu);
 		} else {
 			sp->pathmtu = params->spp_pathmtu;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From feddd6c1af30ab11d73ce0e4e76b40dfc899dbda Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Thu, 26 Apr 2018 16:58:54 -0300
Subject: sctp: introduce sctp_mtu_payload

When given a MTU, this function calculates how much payload we can carry
on it. Without a MTU, it calculates the amount of header overhead we
have.

So that when we have extra overhead, like the one added for IP options
on SELinux patches, it is easier to handle it.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sctp.h | 19 +++++++++++++++++++
 net/sctp/output.c       | 25 ++++++++++---------------
 net/sctp/socket.c       |  7 ++-----
 3 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 28b996d63490..0b98e4683f10 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -607,6 +607,25 @@ static inline struct dst_entry *sctp_transport_dst_check(struct sctp_transport *
 	return t->dst;
 }
 
+/* Calculate max payload size given a MTU, or the total overhead if
+ * given MTU is zero
+ */
+static inline __u32 sctp_mtu_payload(const struct sctp_sock *sp,
+				     __u32 mtu, __u32 extra)
+{
+	__u32 overhead = sizeof(struct sctphdr) + extra;
+
+	if (sp)
+		overhead += sp->pf->af->net_header_len;
+	else
+		overhead += sizeof(struct ipv6hdr);
+
+	if (WARN_ON_ONCE(mtu && mtu <= overhead))
+		mtu = overhead;
+
+	return mtu ? mtu - overhead : overhead;
+}
+
 static inline bool sctp_transport_pmtu_check(struct sctp_transport *t)
 {
 	__u32 pmtu = max_t(size_t, SCTP_TRUNC4(dst_mtu(t->dst)),
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 690d8557bb7b..bf4226c3cc1d 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -90,8 +90,8 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag,
 {
 	struct sctp_transport *tp = packet->transport;
 	struct sctp_association *asoc = tp->asoc;
+	struct sctp_sock *sp = NULL;
 	struct sock *sk;
-	size_t overhead = sizeof(struct ipv6hdr) + sizeof(struct sctphdr);
 
 	pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag);
 	packet->vtag = vtag;
@@ -102,25 +102,20 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag,
 
 	/* set packet max_size with pathmtu, then calculate overhead */
 	packet->max_size = tp->pathmtu;
+
 	if (asoc) {
-		struct sctp_sock *sp = sctp_sk(asoc->base.sk);
-		struct sctp_af *af = sp->pf->af;
-
-		overhead = af->net_header_len +
-			   af->ip_options_len(asoc->base.sk);
-		overhead += sizeof(struct sctphdr);
-		packet->overhead = overhead;
-		packet->size = overhead;
-	} else {
-		packet->overhead = overhead;
-		packet->size = overhead;
-		return;
+		sk = asoc->base.sk;
+		sp = sctp_sk(sk);
 	}
+	packet->overhead = sctp_mtu_payload(sp, 0, 0);
+	packet->size = packet->overhead;
+
+	if (!asoc)
+		return;
 
 	/* update dst or transport pathmtu if in need */
-	sk = asoc->base.sk;
 	if (!sctp_transport_dst_check(tp)) {
-		sctp_transport_route(tp, NULL, sctp_sk(sk));
+		sctp_transport_route(tp, NULL, sp);
 		if (asoc->param_flags & SPP_PMTUD_ENABLE)
 			sctp_assoc_sync_pmtu(asoc);
 	} else if (!sctp_transport_pmtu_check(tp)) {
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index eeec81d5c485..b9d14f57146b 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3234,11 +3234,8 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
 	if (val) {
 		int min_len, max_len;
 
-		min_len = SCTP_DEFAULT_MINSEGMENT - af->net_header_len;
-		min_len -= af->ip_options_len(sk);
-		min_len -= sizeof(struct sctphdr) +
-			   sizeof(struct sctp_data_chunk);
-
+		min_len = sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT,
+					   sizeof(struct sctp_data_chunk));
 		max_len = SCTP_MAX_CHUNK_LEN - sizeof(struct sctp_data_chunk);
 
 		if (val < min_len || val > max_len)
-- 
cgit v1.2.3-59-g8ed1b


From 2f5e3c9df6938b823664869ec87af3da8df272f6 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Thu, 26 Apr 2018 16:58:55 -0300
Subject: sctp: introduce sctp_assoc_update_frag_point

and avoid the open-coded versions of it.

Now sctp_datamsg_from_user can just re-use asoc->frag_point as it will
always be updated.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sctp.h    | 20 --------------------
 include/net/sctp/structs.h |  1 +
 net/sctp/associola.c       | 24 +++++++++++++++++-------
 net/sctp/chunk.c           | 12 +-----------
 net/sctp/socket.c          |  2 +-
 5 files changed, 20 insertions(+), 39 deletions(-)

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 0b98e4683f10..350c65620a4e 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -428,26 +428,6 @@ static inline int sctp_list_single_entry(struct list_head *head)
 	return (head->next != head) && (head->next == head->prev);
 }
 
-/* Break down data chunks at this point.  */
-static inline int sctp_frag_point(const struct sctp_association *asoc, int pmtu)
-{
-	struct sctp_sock *sp = sctp_sk(asoc->base.sk);
-	struct sctp_af *af = sp->pf->af;
-	int frag = pmtu;
-
-	frag -= af->ip_options_len(asoc->base.sk);
-	frag -= af->net_header_len;
-	frag -= sizeof(struct sctphdr) + sctp_datachk_len(&asoc->stream);
-
-	if (asoc->user_frag)
-		frag = min_t(int, frag, asoc->user_frag);
-
-	frag = SCTP_TRUNC4(min_t(int, frag, SCTP_MAX_CHUNK_LEN -
-					    sctp_datachk_len(&asoc->stream)));
-
-	return frag;
-}
-
 static inline void sctp_assoc_pending_pmtu(struct sctp_association *asoc)
 {
 	sctp_assoc_sync_pmtu(asoc);
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index c5e244be8f1e..ebf809eed33a 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -2097,6 +2097,7 @@ int sctp_assoc_update(struct sctp_association *old,
 
 __u32 sctp_association_get_next_tsn(struct sctp_association *);
 
+void sctp_assoc_update_frag_point(struct sctp_association *asoc);
 void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu);
 void sctp_assoc_sync_pmtu(struct sctp_association *asoc);
 void sctp_assoc_rwnd_increase(struct sctp_association *, unsigned int);
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 85b362324084..a29025418b96 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -666,8 +666,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 
 	peer->pmtu_pending = 0;
 
-	asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu);
-
 	/* The asoc->peer.port might not be meaningful yet, but
 	 * initialize the packet structure anyway.
 	 */
@@ -1370,10 +1368,26 @@ sctp_assoc_choose_alter_transport(struct sctp_association *asoc,
 	}
 }
 
+void sctp_assoc_update_frag_point(struct sctp_association *asoc)
+{
+	int frag = sctp_mtu_payload(sctp_sk(asoc->base.sk), asoc->pathmtu,
+				    sctp_datachk_len(&asoc->stream));
+
+	if (asoc->user_frag)
+		frag = min_t(int, frag, asoc->user_frag);
+
+	frag = min_t(int, frag, SCTP_MAX_CHUNK_LEN -
+				sctp_datachk_len(&asoc->stream));
+
+	asoc->frag_point = SCTP_TRUNC4(frag);
+}
+
 void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu)
 {
-	if (asoc->pathmtu != pmtu)
+	if (asoc->pathmtu != pmtu) {
 		asoc->pathmtu = pmtu;
+		sctp_assoc_update_frag_point(asoc);
+	}
 
 	pr_debug("%s: asoc:%p, pmtu:%d, frag_point:%d\n", __func__, asoc,
 		 asoc->pathmtu, asoc->frag_point);
@@ -1403,10 +1417,6 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc)
 	}
 
 	sctp_assoc_set_pmtu(asoc, pmtu);
-	asoc->frag_point = sctp_frag_point(asoc, pmtu);
-
-	pr_debug("%s: asoc:%p, pmtu:%d, frag_point:%d\n", __func__, asoc,
-		 asoc->pathmtu, asoc->frag_point);
 }
 
 /* Should we send a SACK to update our peer? */
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index be296d633e95..79daa98208c3 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -172,8 +172,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 	struct list_head *pos, *temp;
 	struct sctp_chunk *chunk;
 	struct sctp_datamsg *msg;
-	struct sctp_sock *sp;
-	struct sctp_af *af;
 	int err;
 
 	msg = sctp_datamsg_new(GFP_KERNEL);
@@ -192,12 +190,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 	/* This is the biggest possible DATA chunk that can fit into
 	 * the packet
 	 */
-	sp = sctp_sk(asoc->base.sk);
-	af = sp->pf->af;
-	max_data = asoc->pathmtu - af->net_header_len -
-		   sizeof(struct sctphdr) - sctp_datachk_len(&asoc->stream) -
-		   af->ip_options_len(asoc->base.sk);
-	max_data = SCTP_TRUNC4(max_data);
+	max_data = asoc->frag_point;
 
 	/* If the the peer requested that we authenticate DATA chunks
 	 * we need to account for bundling of the AUTH chunks along with
@@ -222,9 +215,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 		}
 	}
 
-	/* Check what's our max considering the above */
-	max_data = min_t(size_t, max_data, asoc->frag_point);
-
 	/* Set first_len and then account for possible bundles on first frag */
 	first_len = max_data;
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index b9d14f57146b..21bf457be3ea 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3251,7 +3251,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
 			       sctp_datachk_len(&asoc->stream);
 		}
 		asoc->user_frag = val;
-		asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu);
+		sctp_assoc_update_frag_point(asoc);
 	} else {
 		if (params.assoc_id && sctp_style(sk, UDP))
 			return -EINVAL;
-- 
cgit v1.2.3-59-g8ed1b


From 2521680e1830c21033efe48322829044c6e6b32b Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Thu, 26 Apr 2018 16:58:56 -0300
Subject: sctp: remove sctp_assoc_pending_pmtu

No need for this helper.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sctp.h | 6 ------
 net/sctp/socket.c       | 6 ++++--
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 350c65620a4e..e327acad8e7d 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -428,12 +428,6 @@ static inline int sctp_list_single_entry(struct list_head *head)
 	return (head->next != head) && (head->next == head->prev);
 }
 
-static inline void sctp_assoc_pending_pmtu(struct sctp_association *asoc)
-{
-	sctp_assoc_sync_pmtu(asoc);
-	asoc->pmtu_pending = 0;
-}
-
 static inline bool sctp_chunk_pending(const struct sctp_chunk *chunk)
 {
 	return !list_empty(&chunk->list);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 21bf457be3ea..a93b60a28cc5 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1918,8 +1918,10 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
 		goto err;
 	}
 
-	if (asoc->pmtu_pending)
-		sctp_assoc_pending_pmtu(asoc);
+	if (asoc->pmtu_pending) {
+		sctp_assoc_sync_pmtu(asoc);
+		asoc->pmtu_pending = 0;
+	}
 
 	if (sctp_wspace(asoc) < msg_len)
 		sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc));
-- 
cgit v1.2.3-59-g8ed1b


From 6ff0f871c20ec1769a481edca86f23c76b2b06d3 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Thu, 26 Apr 2018 16:58:57 -0300
Subject: sctp: introduce sctp_dst_mtu

Which makes sure that the MTU respects the minimum value of
SCTP_DEFAULT_MINSEGMENT and that it is correctly aligned.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sctp.h | 9 +++++++--
 net/sctp/associola.c    | 6 ++----
 net/sctp/transport.c    | 6 +++---
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index e327acad8e7d..4965cbfa7d92 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -600,10 +600,15 @@ static inline __u32 sctp_mtu_payload(const struct sctp_sock *sp,
 	return mtu ? mtu - overhead : overhead;
 }
 
+static inline __u32 sctp_dst_mtu(const struct dst_entry *dst)
+{
+	return SCTP_TRUNC4(max_t(__u32, dst_mtu(dst),
+				 SCTP_DEFAULT_MINSEGMENT));
+}
+
 static inline bool sctp_transport_pmtu_check(struct sctp_transport *t)
 {
-	__u32 pmtu = max_t(size_t, SCTP_TRUNC4(dst_mtu(t->dst)),
-			   SCTP_DEFAULT_MINSEGMENT);
+	__u32 pmtu = sctp_dst_mtu(t->dst);
 
 	if (t->pathmtu == pmtu)
 		return true;
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index a29025418b96..039fdb862b17 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1405,11 +1405,9 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc)
 		return;
 
 	/* Get the lowest pmtu of all the transports. */
-	list_for_each_entry(t, &asoc->peer.transport_addr_list,
-				transports) {
+	list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) {
 		if (t->pmtu_pending && t->dst) {
-			sctp_transport_update_pmtu(
-					t, SCTP_TRUNC4(dst_mtu(t->dst)));
+			sctp_transport_update_pmtu(t, sctp_dst_mtu(t->dst));
 			t->pmtu_pending = 0;
 		}
 		if (!pmtu || (t->pathmtu < pmtu))
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index c5fc3aed08a1..ed73a9d91b83 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -242,9 +242,9 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
 						&transport->fl, sk);
 	}
 
-	if (transport->dst) {
-		transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst));
-	} else
+	if (transport->dst)
+		transport->pathmtu = sctp_dst_mtu(transport->dst);
+	else
 		transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 22d7be267eaa8114dcc28d66c1c347f667d7878a Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Thu, 26 Apr 2018 16:58:58 -0300
Subject: sctp: remove sctp_transport_pmtu_check

We are now keeping the MTU information synced between asoc, transport
and dst, which makes the check at sctp_packet_config() not needed
anymore. As it was the sole caller to this function, lets remove it.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sctp.h | 12 ------------
 net/sctp/output.c       |  3 ---
 2 files changed, 15 deletions(-)

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 4965cbfa7d92..f66d44350007 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -606,16 +606,4 @@ static inline __u32 sctp_dst_mtu(const struct dst_entry *dst)
 				 SCTP_DEFAULT_MINSEGMENT));
 }
 
-static inline bool sctp_transport_pmtu_check(struct sctp_transport *t)
-{
-	__u32 pmtu = sctp_dst_mtu(t->dst);
-
-	if (t->pathmtu == pmtu)
-		return true;
-
-	t->pathmtu = pmtu;
-
-	return false;
-}
-
 #endif /* __net_sctp_h__ */
diff --git a/net/sctp/output.c b/net/sctp/output.c
index bf4226c3cc1d..e672dee302c7 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -118,9 +118,6 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag,
 		sctp_transport_route(tp, NULL, sp);
 		if (asoc->param_flags & SPP_PMTUD_ENABLE)
 			sctp_assoc_sync_pmtu(asoc);
-	} else if (!sctp_transport_pmtu_check(tp)) {
-		if (asoc->param_flags & SPP_PMTUD_ENABLE)
-			sctp_assoc_sync_pmtu(asoc);
 	}
 
 	/* If there a is a prepend chunk stick it on the list before
-- 
cgit v1.2.3-59-g8ed1b


From 6e91b578bf3f9e19c250835cba97a4be38ffcb31 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Thu, 26 Apr 2018 16:58:59 -0300
Subject: sctp: re-use sctp_transport_pmtu in sctp_transport_route

sctp_transport_route currently is very similar to sctp_transport_pmtu plus
a few other bits.

This patch reuses sctp_transport_pmtu in sctp_transport_route and removes
the duplicated code.

Also, as all calls to sctp_transport_route were forcing the dst release
before calling it, let's just include such release too.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c    |  6 ++----
 net/sctp/transport.c | 35 +++++++++++++++++------------------
 2 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index a93b60a28cc5..bb08d44b838b 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -644,16 +644,15 @@ static int sctp_send_asconf_add_ip(struct sock		*sk,
 
 			list_for_each_entry(trans,
 			    &asoc->peer.transport_addr_list, transports) {
-				/* Clear the source and route cache */
-				sctp_transport_dst_release(trans);
 				trans->cwnd = min(4*asoc->pathmtu, max_t(__u32,
 				    2*asoc->pathmtu, 4380));
 				trans->ssthresh = asoc->peer.i.a_rwnd;
 				trans->rto = asoc->rto_initial;
 				sctp_max_rto(asoc, trans);
 				trans->rtt = trans->srtt = trans->rttvar = 0;
+				/* Clear the source and route cache */
 				sctp_transport_route(trans, NULL,
-				    sctp_sk(asoc->base.sk));
+						     sctp_sk(asoc->base.sk));
 			}
 		}
 		retval = sctp_send_asconf(asoc, chunk);
@@ -896,7 +895,6 @@ skip_mkasconf:
 		 */
 		list_for_each_entry(transport, &asoc->peer.transport_addr_list,
 					transports) {
-			sctp_transport_dst_release(transport);
 			sctp_transport_route(transport, NULL,
 					     sctp_sk(asoc->base.sk));
 		}
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index ed73a9d91b83..4a95e260b674 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -242,6 +242,15 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
 						&transport->fl, sk);
 	}
 
+	if (transport->param_flags & SPP_PMTUD_DISABLE) {
+		struct sctp_association *asoc = transport->asoc;
+
+		if (!transport->pathmtu && asoc && asoc->pathmtu)
+			transport->pathmtu = asoc->pathmtu;
+		if (transport->pathmtu)
+			return;
+	}
+
 	if (transport->dst)
 		transport->pathmtu = sctp_dst_mtu(transport->dst);
 	else
@@ -290,6 +299,7 @@ void sctp_transport_route(struct sctp_transport *transport,
 	struct sctp_association *asoc = transport->asoc;
 	struct sctp_af *af = transport->af_specific;
 
+	sctp_transport_dst_release(transport);
 	af->get_dst(transport, saddr, &transport->fl, sctp_opt2sk(opt));
 
 	if (saddr)
@@ -297,25 +307,14 @@ void sctp_transport_route(struct sctp_transport *transport,
 	else
 		af->get_saddr(opt, transport, &transport->fl);
 
-	if ((transport->param_flags & SPP_PMTUD_DISABLE) && transport->pathmtu) {
-		return;
-	}
-	if (transport->dst) {
-		transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst));
+	sctp_transport_pmtu(transport, sctp_opt2sk(opt));
 
-		/* Initialize sk->sk_rcv_saddr, if the transport is the
-		 * association's active path for getsockname().
-		 */
-		if (asoc && (!asoc->peer.primary_path ||
-			     (transport == asoc->peer.active_path)))
-			opt->pf->to_sk_saddr(&transport->saddr,
-					     asoc->base.sk);
-	} else if ((transport->param_flags & SPP_PMTUD_DISABLE) &&
-		   asoc && asoc->pathmtu) {
-		transport->pathmtu = asoc->pathmtu;
-	} else {
-		transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
-	}
+	/* Initialize sk->sk_rcv_saddr, if the transport is the
+	 * association's active path for getsockname().
+	 */
+	if (transport->dst && asoc &&
+	    (!asoc->peer.primary_path || transport == asoc->peer.active_path))
+		opt->pf->to_sk_saddr(&transport->saddr, asoc->base.sk);
 }
 
 /* Hold a reference to a transport.  */
-- 
cgit v1.2.3-59-g8ed1b


From 63d01330aad9531a491db1bed5f15cc6b7fd1a78 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Thu, 26 Apr 2018 16:59:00 -0300
Subject: sctp: honor PMTU_DISABLED when handling icmp

sctp_sendmsg() could trigger PMTU updates even when PMTU_DISABLED was
set, as pmtu_pending could be set unconditionally during icmp handling
if the socket was in use by the application.

This patch fixes it by checking for PMTU_DISABLED when handling such
deferred updates.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index bb08d44b838b..ad8965835d8d 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1893,6 +1893,7 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
 				struct sctp_sndrcvinfo *sinfo)
 {
 	struct sock *sk = asoc->base.sk;
+	struct sctp_sock *sp = sctp_sk(sk);
 	struct net *net = sock_net(sk);
 	struct sctp_datamsg *datamsg;
 	bool wait_connect = false;
@@ -1911,13 +1912,14 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
 			goto err;
 	}
 
-	if (sctp_sk(sk)->disable_fragments && msg_len > asoc->frag_point) {
+	if (sp->disable_fragments && msg_len > asoc->frag_point) {
 		err = -EMSGSIZE;
 		goto err;
 	}
 
 	if (asoc->pmtu_pending) {
-		sctp_assoc_sync_pmtu(asoc);
+		if (sp->param_flags & SPP_PMTUD_ENABLE)
+			sctp_assoc_sync_pmtu(asoc);
 		asoc->pmtu_pending = 0;
 	}
 
@@ -1936,7 +1938,7 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
 		if (err)
 			goto err;
 
-		if (sctp_sk(sk)->strm_interleave) {
+		if (sp->strm_interleave) {
 			timeo = sock_sndtimeo(sk, 0);
 			err = sctp_wait_for_connect(asoc, &timeo);
 			if (err)
-- 
cgit v1.2.3-59-g8ed1b


From 439ef0309cf4b743d76edc2abeca72b27ee1d996 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Thu, 26 Apr 2018 16:59:01 -0300
Subject: sctp: consider idata chunks when setting SCTP_MAXSEG

When setting SCTP_MAXSEG sock option, it should consider which kind of
data chunk is being used if the asoc is already available, so that the
limit better reflect reality.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index ad8965835d8d..2d35c8ea2470 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3233,18 +3233,21 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
 		return -EINVAL;
 	}
 
+	asoc = sctp_id2assoc(sk, params.assoc_id);
+
 	if (val) {
 		int min_len, max_len;
+		__u16 datasize = asoc ? sctp_datachk_len(&asoc->stream) :
+				 sizeof(struct sctp_data_chunk);
 
 		min_len = sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT,
-					   sizeof(struct sctp_data_chunk));
-		max_len = SCTP_MAX_CHUNK_LEN - sizeof(struct sctp_data_chunk);
+					   datasize);
+		max_len = SCTP_MAX_CHUNK_LEN - datasize;
 
 		if (val < min_len || val > max_len)
 			return -EINVAL;
 	}
 
-	asoc = sctp_id2assoc(sk, params.assoc_id);
 	if (asoc) {
 		if (val == 0) {
 			val = asoc->pathmtu - af->net_header_len;
-- 
cgit v1.2.3-59-g8ed1b


From 38687b56c51b535024d46d6b5375163a6a40a196 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Thu, 26 Apr 2018 16:59:02 -0300
Subject: sctp: allow unsetting sockopt MAXSEG

RFC 6458 Section 8.1.16 says that setting MAXSEG as 0 means that the user
is not limiting it, and not that it should set to the *current* maximum,
as we are doing.

This patch thus allow setting it as 0, effectively removing the user
limit.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 2d35c8ea2470..1b4593b842b0 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3211,7 +3211,6 @@ static int sctp_setsockopt_mappedv4(struct sock *sk, char __user *optval, unsign
 static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned int optlen)
 {
 	struct sctp_sock *sp = sctp_sk(sk);
-	struct sctp_af *af = sp->pf->af;
 	struct sctp_assoc_value params;
 	struct sctp_association *asoc;
 	int val;
@@ -3249,12 +3248,6 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
 	}
 
 	if (asoc) {
-		if (val == 0) {
-			val = asoc->pathmtu - af->net_header_len;
-			val -= af->ip_options_len(sk);
-			val -= sizeof(struct sctphdr) +
-			       sctp_datachk_len(&asoc->stream);
-		}
 		asoc->user_frag = val;
 		sctp_assoc_update_frag_point(asoc);
 	} else {
-- 
cgit v1.2.3-59-g8ed1b


From c347b9273c26453fa8c9e1acb92f45546d888745 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Thu, 26 Apr 2018 14:34:25 -0700
Subject: hv_netvsc: simplify receive side calling arguments

The calls up from the napi poll reading the receive ring had many
places where an argument was being recreated. I.e the caller already
had the value and wasn't passing it, then the callee would use
known relationship to determine the same value. Simpler and faster
to just pass arguments needed.

Also, add const in a couple places where message is being only read.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/netvsc.c | 58 ++++++++++++++++++++-------------------------
 1 file changed, 26 insertions(+), 32 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index e7308958b7a9..d2ee66c259a7 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -652,16 +652,14 @@ static inline void netvsc_free_send_slot(struct netvsc_device *net_device,
 	sync_change_bit(index, net_device->send_section_map);
 }
 
-static void netvsc_send_tx_complete(struct netvsc_device *net_device,
-				    struct vmbus_channel *incoming_channel,
-				    struct hv_device *device,
+static void netvsc_send_tx_complete(struct net_device *ndev,
+				    struct netvsc_device *net_device,
+				    struct vmbus_channel *channel,
 				    const struct vmpacket_descriptor *desc,
 				    int budget)
 {
 	struct sk_buff *skb = (struct sk_buff *)(unsigned long)desc->trans_id;
-	struct net_device *ndev = hv_get_drvdata(device);
 	struct net_device_context *ndev_ctx = netdev_priv(ndev);
-	struct vmbus_channel *channel = device->channel;
 	u16 q_idx = 0;
 	int queue_sends;
 
@@ -675,7 +673,6 @@ static void netvsc_send_tx_complete(struct netvsc_device *net_device,
 		if (send_index != NETVSC_INVALID_INDEX)
 			netvsc_free_send_slot(net_device, send_index);
 		q_idx = packet->q_idx;
-		channel = incoming_channel;
 
 		tx_stats = &net_device->chan_table[q_idx].tx_stats;
 
@@ -705,14 +702,13 @@ static void netvsc_send_tx_complete(struct netvsc_device *net_device,
 	}
 }
 
-static void netvsc_send_completion(struct netvsc_device *net_device,
+static void netvsc_send_completion(struct net_device *ndev,
+				   struct netvsc_device *net_device,
 				   struct vmbus_channel *incoming_channel,
-				   struct hv_device *device,
 				   const struct vmpacket_descriptor *desc,
 				   int budget)
 {
-	struct nvsp_message *nvsp_packet = hv_pkt_data(desc);
-	struct net_device *ndev = hv_get_drvdata(device);
+	const struct nvsp_message *nvsp_packet = hv_pkt_data(desc);
 
 	switch (nvsp_packet->hdr.msg_type) {
 	case NVSP_MSG_TYPE_INIT_COMPLETE:
@@ -726,8 +722,8 @@ static void netvsc_send_completion(struct netvsc_device *net_device,
 		break;
 
 	case NVSP_MSG1_TYPE_SEND_RNDIS_PKT_COMPLETE:
-		netvsc_send_tx_complete(net_device, incoming_channel,
-					device, desc, budget);
+		netvsc_send_tx_complete(ndev, net_device, incoming_channel,
+					desc, budget);
 		break;
 
 	default:
@@ -1092,12 +1088,11 @@ static void enq_receive_complete(struct net_device *ndev,
 
 static int netvsc_receive(struct net_device *ndev,
 			  struct netvsc_device *net_device,
-			  struct net_device_context *net_device_ctx,
-			  struct hv_device *device,
 			  struct vmbus_channel *channel,
 			  const struct vmpacket_descriptor *desc,
-			  struct nvsp_message *nvsp)
+			  const struct nvsp_message *nvsp)
 {
+	struct net_device_context *net_device_ctx = netdev_priv(ndev);
 	const struct vmtransfer_page_packet_header *vmxferpage_packet
 		= container_of(desc, const struct vmtransfer_page_packet_header, d);
 	u16 q_idx = channel->offermsg.offer.sub_channel_index;
@@ -1158,13 +1153,12 @@ static int netvsc_receive(struct net_device *ndev,
 	return count;
 }
 
-static void netvsc_send_table(struct hv_device *hdev,
-			      struct nvsp_message *nvmsg)
+static void netvsc_send_table(struct net_device *ndev,
+			      const struct nvsp_message *nvmsg)
 {
-	struct net_device *ndev = hv_get_drvdata(hdev);
 	struct net_device_context *net_device_ctx = netdev_priv(ndev);
-	int i;
 	u32 count, *tab;
+	int i;
 
 	count = nvmsg->msg.v5_msg.send_table.count;
 	if (count != VRSS_SEND_TAB_SIZE) {
@@ -1179,24 +1173,25 @@ static void netvsc_send_table(struct hv_device *hdev,
 		net_device_ctx->tx_table[i] = tab[i];
 }
 
-static void netvsc_send_vf(struct net_device_context *net_device_ctx,
-			   struct nvsp_message *nvmsg)
+static void netvsc_send_vf(struct net_device *ndev,
+			   const struct nvsp_message *nvmsg)
 {
+	struct net_device_context *net_device_ctx = netdev_priv(ndev);
+
 	net_device_ctx->vf_alloc = nvmsg->msg.v4_msg.vf_assoc.allocated;
 	net_device_ctx->vf_serial = nvmsg->msg.v4_msg.vf_assoc.serial;
 }
 
-static inline void netvsc_receive_inband(struct hv_device *hdev,
-				 struct net_device_context *net_device_ctx,
-				 struct nvsp_message *nvmsg)
+static  void netvsc_receive_inband(struct net_device *ndev,
+				   const struct nvsp_message *nvmsg)
 {
 	switch (nvmsg->hdr.msg_type) {
 	case NVSP_MSG5_TYPE_SEND_INDIRECTION_TABLE:
-		netvsc_send_table(hdev, nvmsg);
+		netvsc_send_table(ndev, nvmsg);
 		break;
 
 	case NVSP_MSG4_TYPE_SEND_VF_ASSOCIATION:
-		netvsc_send_vf(net_device_ctx, nvmsg);
+		netvsc_send_vf(ndev, nvmsg);
 		break;
 	}
 }
@@ -1208,24 +1203,23 @@ static int netvsc_process_raw_pkt(struct hv_device *device,
 				  const struct vmpacket_descriptor *desc,
 				  int budget)
 {
-	struct net_device_context *net_device_ctx = netdev_priv(ndev);
-	struct nvsp_message *nvmsg = hv_pkt_data(desc);
+	const struct nvsp_message *nvmsg = hv_pkt_data(desc);
 
 	trace_nvsp_recv(ndev, channel, nvmsg);
 
 	switch (desc->type) {
 	case VM_PKT_COMP:
-		netvsc_send_completion(net_device, channel, device,
+		netvsc_send_completion(ndev, net_device, channel,
 				       desc, budget);
 		break;
 
 	case VM_PKT_DATA_USING_XFER_PAGES:
-		return netvsc_receive(ndev, net_device, net_device_ctx,
-				      device, channel, desc, nvmsg);
+		return netvsc_receive(ndev, net_device, channel,
+				      desc, nvmsg);
 		break;
 
 	case VM_PKT_DATA_INBAND:
-		netvsc_receive_inband(device, net_device_ctx, nvmsg);
+		netvsc_receive_inband(ndev, nvmsg);
 		break;
 
 	default:
-- 
cgit v1.2.3-59-g8ed1b


From 2e8ef77ee0ff1117251a48f79d2d57d65afd0495 Mon Sep 17 00:00:00 2001
From: Michael Chan <michael.chan@broadcom.com>
Date: Thu, 26 Apr 2018 17:44:31 -0400
Subject: bnxt_en: Add TC to hardware QoS queue mapping logic.

The current driver maps MQPRIO traffic classes directly 1:1 to the
internal hardware queues (TC0 maps to hardware queue 0, etc).  This
direct mapping requires the internal hardware queues to be reconfigured
from lossless to lossy and vice versa when necessary.  This
involves reconfiguring internal buffer thresholds which is
disruptive and not always reliable.

Implement a new scheme to map TCs to internal hardware queues by
matching up their PFC requirements.  This will eliminate the need
to reconfigure a hardware queue internal buffers at run time.  After
remapping, the NIC is closed and opened for the new TC to hardware
queues to take effect.

This patch only adds the basic mapping logic.

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c     |  5 ++-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h     |  1 +
 drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c | 65 +++++++++++++++++----------
 3 files changed, 47 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index f83769d8047b..bda618d06118 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -2383,6 +2383,7 @@ static int bnxt_alloc_tx_rings(struct bnxt *bp)
 	for (i = 0, j = 0; i < bp->tx_nr_rings; i++) {
 		struct bnxt_tx_ring_info *txr = &bp->tx_ring[i];
 		struct bnxt_ring_struct *ring;
+		u8 qidx;
 
 		ring = &txr->tx_ring_struct;
 
@@ -2411,7 +2412,8 @@ static int bnxt_alloc_tx_rings(struct bnxt *bp)
 
 			memset(txr->tx_push, 0, sizeof(struct tx_push_bd));
 		}
-		ring->queue_id = bp->q_info[j].queue_id;
+		qidx = bp->tc_to_qidx[j];
+		ring->queue_id = bp->q_info[qidx].queue_id;
 		if (i < bp->tx_nr_rings_xdp)
 			continue;
 		if (i % bp->tx_nr_rings_per_tc == (bp->tx_nr_rings_per_tc - 1))
@@ -5309,6 +5311,7 @@ static int bnxt_hwrm_queue_qportcfg(struct bnxt *bp)
 	for (i = 0; i < bp->max_tc; i++) {
 		bp->q_info[i].queue_id = *qptr++;
 		bp->q_info[i].queue_profile = *qptr++;
+		bp->tc_to_qidx[i] = i;
 	}
 
 qportcfg_exit:
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 3d55d3b56865..057f8a2b5d13 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1242,6 +1242,7 @@ struct bnxt {
 	u8			max_tc;
 	u8			max_lltc;	/* lossless TCs */
 	struct bnxt_queue_info	q_info[BNXT_MAX_QUEUE];
+	u8			tc_to_qidx[BNXT_MAX_QUEUE];
 
 	unsigned int		current_interval;
 #define BNXT_TIMER_INTERVAL	HZ
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c
index 3c746f2d9ed8..1b72f8a044e1 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c
@@ -21,6 +21,21 @@
 #include "bnxt_dcb.h"
 
 #ifdef CONFIG_BNXT_DCB
+static int bnxt_queue_to_tc(struct bnxt *bp, u8 queue_id)
+{
+	int i, j;
+
+	for (i = 0; i < bp->max_tc; i++) {
+		if (bp->q_info[i].queue_id == queue_id) {
+			for (j = 0; j < bp->max_tc; j++) {
+				if (bp->tc_to_qidx[j] == i)
+					return j;
+			}
+		}
+	}
+	return -EINVAL;
+}
+
 static int bnxt_hwrm_queue_pri2cos_cfg(struct bnxt *bp, struct ieee_ets *ets)
 {
 	struct hwrm_queue_pri2cos_cfg_input req = {0};
@@ -33,10 +48,13 @@ static int bnxt_hwrm_queue_pri2cos_cfg(struct bnxt *bp, struct ieee_ets *ets)
 
 	pri2cos = &req.pri0_cos_queue_id;
 	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		u8 qidx;
+
 		req.enables |= cpu_to_le32(
 			QUEUE_PRI2COS_CFG_REQ_ENABLES_PRI0_COS_QUEUE_ID << i);
 
-		pri2cos[i] = bp->q_info[ets->prio_tc[i]].queue_id;
+		qidx = bp->tc_to_qidx[ets->prio_tc[i]];
+		pri2cos[i] = bp->q_info[qidx].queue_id;
 	}
 	rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
 	return rc;
@@ -55,17 +73,15 @@ static int bnxt_hwrm_queue_pri2cos_qcfg(struct bnxt *bp, struct ieee_ets *ets)
 	rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
 	if (!rc) {
 		u8 *pri2cos = &resp->pri0_cos_queue_id;
-		int i, j;
+		int i;
 
 		for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
 			u8 queue_id = pri2cos[i];
+			int tc;
 
-			for (j = 0; j < bp->max_tc; j++) {
-				if (bp->q_info[j].queue_id == queue_id) {
-					ets->prio_tc[i] = j;
-					break;
-				}
-			}
+			tc = bnxt_queue_to_tc(bp, queue_id);
+			if (tc >= 0)
+				ets->prio_tc[i] = tc;
 		}
 	}
 	mutex_unlock(&bp->hwrm_cmd_lock);
@@ -81,13 +97,15 @@ static int bnxt_hwrm_queue_cos2bw_cfg(struct bnxt *bp, struct ieee_ets *ets,
 	void *data;
 
 	bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_QUEUE_COS2BW_CFG, -1, -1);
-	data = &req.unused_0;
-	for (i = 0; i < max_tc; i++, data += sizeof(cos2bw) - 4) {
+	for (i = 0; i < max_tc; i++) {
+		u8 qidx;
+
 		req.enables |= cpu_to_le32(
 			QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID0_VALID << i);
 
 		memset(&cos2bw, 0, sizeof(cos2bw));
-		cos2bw.queue_id = bp->q_info[i].queue_id;
+		qidx = bp->tc_to_qidx[i];
+		cos2bw.queue_id = bp->q_info[qidx].queue_id;
 		if (ets->tc_tsa[i] == IEEE_8021QAZ_TSA_STRICT) {
 			cos2bw.tsa =
 				QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_TSA_ASSIGN_SP;
@@ -103,8 +121,9 @@ static int bnxt_hwrm_queue_cos2bw_cfg(struct bnxt *bp, struct ieee_ets *ets,
 				cpu_to_le32((ets->tc_tx_bw[i] * 100) |
 					    BW_VALUE_UNIT_PERCENT1_100);
 		}
+		data = &req.unused_0 + qidx * (sizeof(cos2bw) - 4);
 		memcpy(data, &cos2bw.queue_id, sizeof(cos2bw) - 4);
-		if (i == 0) {
+		if (qidx == 0) {
 			req.queue_id0 = cos2bw.queue_id;
 			req.unused_0 = 0;
 		}
@@ -132,22 +151,22 @@ static int bnxt_hwrm_queue_cos2bw_qcfg(struct bnxt *bp, struct ieee_ets *ets)
 
 	data = &resp->queue_id0 + offsetof(struct bnxt_cos2bw_cfg, queue_id);
 	for (i = 0; i < bp->max_tc; i++, data += sizeof(cos2bw) - 4) {
-		int j;
+		int tc;
 
 		memcpy(&cos2bw.queue_id, data, sizeof(cos2bw) - 4);
 		if (i == 0)
 			cos2bw.queue_id = resp->queue_id0;
 
-		for (j = 0; j < bp->max_tc; j++) {
-			if (bp->q_info[j].queue_id != cos2bw.queue_id)
-				continue;
-			if (cos2bw.tsa ==
-			    QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_TSA_ASSIGN_SP) {
-				ets->tc_tsa[j] = IEEE_8021QAZ_TSA_STRICT;
-			} else {
-				ets->tc_tsa[j] = IEEE_8021QAZ_TSA_ETS;
-				ets->tc_tx_bw[j] = cos2bw.bw_weight;
-			}
+		tc = bnxt_queue_to_tc(bp, cos2bw.queue_id);
+		if (tc < 0)
+			continue;
+
+		if (cos2bw.tsa ==
+		    QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_TSA_ASSIGN_SP) {
+			ets->tc_tsa[tc] = IEEE_8021QAZ_TSA_STRICT;
+		} else {
+			ets->tc_tsa[tc] = IEEE_8021QAZ_TSA_ETS;
+			ets->tc_tx_bw[tc] = cos2bw.bw_weight;
 		}
 	}
 	mutex_unlock(&bp->hwrm_cmd_lock);
-- 
cgit v1.2.3-59-g8ed1b


From d31cd579a45c44ede9e56c2f6d33537ba395a49b Mon Sep 17 00:00:00 2001
From: Michael Chan <michael.chan@broadcom.com>
Date: Thu, 26 Apr 2018 17:44:32 -0400
Subject: bnxt_en: Remap TC to hardware queues when configuring PFC.

Initially, the MQPRIO TCs are mapped 1:1 directly to the hardware
queues.  Some of these hardware queues are configured to be lossless.
When PFC is enabled on one of more TCs, we now need to remap the
TCs that have PFC enabled to the lossless hardware queues.

After remapping, we need to close and open the NIC for the new
mapping to take effect.  We also need to reprogram all ETS parameters.

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c | 101 +++++++++++++++-----------
 1 file changed, 60 insertions(+), 41 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c
index 1b72f8a044e1..d5bc72cecde3 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c
@@ -173,44 +173,59 @@ static int bnxt_hwrm_queue_cos2bw_qcfg(struct bnxt *bp, struct ieee_ets *ets)
 	return 0;
 }
 
-static int bnxt_hwrm_queue_cfg(struct bnxt *bp, unsigned int lltc_mask)
+static int bnxt_queue_remap(struct bnxt *bp, unsigned int lltc_mask)
 {
-	struct hwrm_queue_cfg_input req = {0};
-	int i;
+	unsigned long qmap = 0;
+	int max = bp->max_tc;
+	int i, j, rc;
 
-	if (netif_running(bp->dev))
-		bnxt_tx_disable(bp);
+	/* Assign lossless TCs first */
+	for (i = 0, j = 0; i < max; ) {
+		if (lltc_mask & (1 << i)) {
+			if (BNXT_LLQ(bp->q_info[j].queue_profile)) {
+				bp->tc_to_qidx[i] = j;
+				__set_bit(j, &qmap);
+				i++;
+			}
+			j++;
+			continue;
+		}
+		i++;
+	}
 
-	bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_QUEUE_CFG, -1, -1);
-	req.flags = cpu_to_le32(QUEUE_CFG_REQ_FLAGS_PATH_BIDIR);
-	req.enables = cpu_to_le32(QUEUE_CFG_REQ_ENABLES_SERVICE_PROFILE);
+	for (i = 0, j = 0; i < max; i++) {
+		if (lltc_mask & (1 << i))
+			continue;
+		j = find_next_zero_bit(&qmap, max, j);
+		bp->tc_to_qidx[i] = j;
+		__set_bit(j, &qmap);
+		j++;
+	}
 
-	/* Configure lossless queues to lossy first */
-	req.service_profile = QUEUE_CFG_REQ_SERVICE_PROFILE_LOSSY;
-	for (i = 0; i < bp->max_tc; i++) {
-		if (BNXT_LLQ(bp->q_info[i].queue_profile)) {
-			req.queue_id = cpu_to_le32(bp->q_info[i].queue_id);
-			hwrm_send_message(bp, &req, sizeof(req),
-					  HWRM_CMD_TIMEOUT);
-			bp->q_info[i].queue_profile =
-				QUEUE_CFG_REQ_SERVICE_PROFILE_LOSSY;
+	if (netif_running(bp->dev)) {
+		bnxt_close_nic(bp, false, false);
+		rc = bnxt_open_nic(bp, false, false);
+		if (rc) {
+			netdev_warn(bp->dev, "failed to open NIC, rc = %d\n", rc);
+			return rc;
 		}
 	}
-
-	/* Now configure desired queues to lossless */
-	req.service_profile = QUEUE_CFG_REQ_SERVICE_PROFILE_LOSSLESS;
-	for (i = 0; i < bp->max_tc; i++) {
-		if (lltc_mask & (1 << i)) {
-			req.queue_id = cpu_to_le32(bp->q_info[i].queue_id);
-			hwrm_send_message(bp, &req, sizeof(req),
-					  HWRM_CMD_TIMEOUT);
-			bp->q_info[i].queue_profile =
-				QUEUE_CFG_REQ_SERVICE_PROFILE_LOSSLESS;
+	if (bp->ieee_ets) {
+		int tc = netdev_get_num_tc(bp->dev);
+
+		if (!tc)
+			tc = 1;
+		rc = bnxt_hwrm_queue_cos2bw_cfg(bp, bp->ieee_ets, tc);
+		if (rc) {
+			netdev_warn(bp->dev, "failed to config BW, rc = %d\n", rc);
+			return rc;
+		}
+		rc = bnxt_hwrm_queue_pri2cos_cfg(bp, bp->ieee_ets);
+		if (rc) {
+			netdev_warn(bp->dev, "failed to config prio, rc = %d\n", rc);
+			return rc;
 		}
 	}
-	if (netif_running(bp->dev))
-		bnxt_tx_enable(bp);
-
 	return 0;
 }
 
@@ -220,7 +235,7 @@ static int bnxt_hwrm_queue_pfc_cfg(struct bnxt *bp, struct ieee_pfc *pfc)
 	struct ieee_ets *my_ets = bp->ieee_ets;
 	unsigned int tc_mask = 0, pri_mask = 0;
 	u8 i, pri, lltc_count = 0;
-	bool need_q_recfg = false;
+	bool need_q_remap = false;
 	int rc;
 
 	if (!my_ets)
@@ -240,21 +255,25 @@ static int bnxt_hwrm_queue_pfc_cfg(struct bnxt *bp, struct ieee_pfc *pfc)
 	if (lltc_count > bp->max_lltc)
 		return -EINVAL;
 
-	bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_QUEUE_PFCENABLE_CFG, -1, -1);
-	req.flags = cpu_to_le32(pri_mask);
-	rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
-	if (rc)
-		return rc;
-
 	for (i = 0; i < bp->max_tc; i++) {
 		if (tc_mask & (1 << i)) {
-			if (!BNXT_LLQ(bp->q_info[i].queue_profile))
-				need_q_recfg = true;
+			u8 qidx = bp->tc_to_qidx[i];
+
+			if (!BNXT_LLQ(bp->q_info[qidx].queue_profile)) {
+				need_q_remap = true;
+				break;
+			}
 		}
 	}
 
-	if (need_q_recfg)
-		rc = bnxt_hwrm_queue_cfg(bp, tc_mask);
+	if (need_q_remap)
+		rc = bnxt_queue_remap(bp, tc_mask);
+
+	bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_QUEUE_PFCENABLE_CFG, -1, -1);
+	req.flags = cpu_to_le32(pri_mask);
+	rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+	if (rc)
+		return rc;
 
 	return rc;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 59895f596b13b4b09f739bf8470a5028a5ff2b9a Mon Sep 17 00:00:00 2001
From: Michael Chan <michael.chan@broadcom.com>
Date: Thu, 26 Apr 2018 17:44:33 -0400
Subject: bnxt_en: Check the lengths of encapsulated firmware responses.

Firmware messages that are forwarded from PF to VFs are encapsulated.
The size of these encapsulated messages must not exceed the maximum
defined message size.  Add appropriate checks to avoid oversize
messages.  Firmware messages may be expanded in future specs and
this will provide some guardrails to avoid data corruption.

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c |  9 +++++++++
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h | 12 ++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
index f952963d594e..18ee471c0002 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
@@ -809,6 +809,9 @@ static int bnxt_hwrm_fwd_resp(struct bnxt *bp, struct bnxt_vf_info *vf,
 	struct hwrm_fwd_resp_input req = {0};
 	struct hwrm_fwd_resp_output *resp = bp->hwrm_cmd_resp_addr;
 
+	if (BNXT_FWD_RESP_SIZE_ERR(msg_size))
+		return -EINVAL;
+
 	bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FWD_RESP, -1, -1);
 
 	/* Set the new target id */
@@ -845,6 +848,9 @@ static int bnxt_hwrm_fwd_err_resp(struct bnxt *bp, struct bnxt_vf_info *vf,
 	struct hwrm_reject_fwd_resp_input req = {0};
 	struct hwrm_reject_fwd_resp_output *resp = bp->hwrm_cmd_resp_addr;
 
+	if (BNXT_REJ_FWD_RESP_SIZE_ERR(msg_size))
+		return -EINVAL;
+
 	bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_REJECT_FWD_RESP, -1, -1);
 	/* Set the new target id */
 	req.target_id = cpu_to_le16(vf->fw_fid);
@@ -877,6 +883,9 @@ static int bnxt_hwrm_exec_fwd_resp(struct bnxt *bp, struct bnxt_vf_info *vf,
 	struct hwrm_exec_fwd_resp_input req = {0};
 	struct hwrm_exec_fwd_resp_output *resp = bp->hwrm_cmd_resp_addr;
 
+	if (BNXT_EXEC_FWD_RESP_SIZE_ERR(msg_size))
+		return -EINVAL;
+
 	bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_EXEC_FWD_RESP, -1, -1);
 	/* Set the new target id */
 	req.target_id = cpu_to_le16(vf->fw_fid);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h
index d10f6f6c7860..6f6d85010ecf 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h
@@ -11,6 +11,18 @@
 #ifndef BNXT_SRIOV_H
 #define BNXT_SRIOV_H
 
+#define BNXT_FWD_RESP_SIZE_ERR(n)					\
+	((offsetof(struct hwrm_fwd_resp_input, encap_resp) + n) >	\
+	 sizeof(struct hwrm_fwd_resp_input))
+
+#define BNXT_EXEC_FWD_RESP_SIZE_ERR(n)					\
+	((offsetof(struct hwrm_exec_fwd_resp_input, encap_request) + n) >\
+	 offsetof(struct hwrm_exec_fwd_resp_input, encap_resp_target_id))
+
+#define BNXT_REJ_FWD_RESP_SIZE_ERR(n)					\
+	((offsetof(struct hwrm_reject_fwd_resp_input, encap_request) + n) >\
+	 offsetof(struct hwrm_reject_fwd_resp_input, encap_resp_target_id))
+
 int bnxt_get_vf_config(struct net_device *, int, struct ifla_vf_info *);
 int bnxt_set_vf_mac(struct net_device *, int, u8 *);
 int bnxt_set_vf_vlan(struct net_device *, int, u16, u8, __be16);
-- 
cgit v1.2.3-59-g8ed1b


From ca2c39e2ec04e78ca6eb5162621cb9a5b897ca16 Mon Sep 17 00:00:00 2001
From: Michael Chan <michael.chan@broadcom.com>
Date: Thu, 26 Apr 2018 17:44:34 -0400
Subject: bnxt_en: Do not set firmware time from VF driver on older firmware.

Older firmware will reject this call and cause an error message to
be printed by the VF driver.

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index bda618d06118..aff4b4eed1cb 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -5379,7 +5379,8 @@ int bnxt_hwrm_fw_set_time(struct bnxt *bp)
 	struct tm tm;
 	time64_t now = ktime_get_real_seconds();
 
-	if (bp->hwrm_spec_code < 0x10400)
+	if ((BNXT_VF(bp) && bp->hwrm_spec_code < 0x10901) ||
+	    bp->hwrm_spec_code < 0x10400)
 		return -EOPNOTSUPP;
 
 	time64_to_tm(now, 0, &tm);
-- 
cgit v1.2.3-59-g8ed1b


From 2727c888f2f8bef071e9a07d6e2f018840d0a834 Mon Sep 17 00:00:00 2001
From: Michael Chan <michael.chan@broadcom.com>
Date: Thu, 26 Apr 2018 17:44:35 -0400
Subject: bnxt_en: Simplify ring alloc/free error messages.

Replace switch statements printing different messages for every ring type
with a common message.

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 43 +++++--------------------------
 1 file changed, 6 insertions(+), 37 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index aff4b4eed1cb..b83c2ac17620 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -4336,26 +4336,9 @@ static int hwrm_ring_alloc_send_msg(struct bnxt *bp,
 	mutex_unlock(&bp->hwrm_cmd_lock);
 
 	if (rc || err) {
-		switch (ring_type) {
-		case RING_FREE_REQ_RING_TYPE_L2_CMPL:
-			netdev_err(bp->dev, "hwrm_ring_alloc cp failed. rc:%x err:%x\n",
-				   rc, err);
-			return -1;
-
-		case RING_FREE_REQ_RING_TYPE_RX:
-			netdev_err(bp->dev, "hwrm_ring_alloc rx failed. rc:%x err:%x\n",
-				   rc, err);
-			return -1;
-
-		case RING_FREE_REQ_RING_TYPE_TX:
-			netdev_err(bp->dev, "hwrm_ring_alloc tx failed. rc:%x err:%x\n",
-				   rc, err);
-			return -1;
-
-		default:
-			netdev_err(bp->dev, "Invalid ring\n");
-			return -1;
-		}
+		netdev_err(bp->dev, "hwrm_ring_alloc type %d failed. rc:%x err:%x\n",
+			   ring_type, rc, err);
+		return -EIO;
 	}
 	ring->fw_ring_id = ring_id;
 	return rc;
@@ -4479,23 +4462,9 @@ static int hwrm_ring_free_send_msg(struct bnxt *bp,
 	mutex_unlock(&bp->hwrm_cmd_lock);
 
 	if (rc || error_code) {
-		switch (ring_type) {
-		case RING_FREE_REQ_RING_TYPE_L2_CMPL:
-			netdev_err(bp->dev, "hwrm_ring_free cp failed. rc:%d\n",
-				   rc);
-			return rc;
-		case RING_FREE_REQ_RING_TYPE_RX:
-			netdev_err(bp->dev, "hwrm_ring_free rx failed. rc:%d\n",
-				   rc);
-			return rc;
-		case RING_FREE_REQ_RING_TYPE_TX:
-			netdev_err(bp->dev, "hwrm_ring_free tx failed. rc:%d\n",
-				   rc);
-			return rc;
-		default:
-			netdev_err(bp->dev, "Invalid ring\n");
-			return -1;
-		}
+		netdev_err(bp->dev, "hwrm_ring_free type %d failed. rc:%x err:%x\n",
+			   ring_type, rc, error_code);
+		return -EIO;
 	}
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 20c1d28e106c0b526ae015fcac8e1e254bff091c Mon Sep 17 00:00:00 2001
From: Vasundhara Volam <vasundhara-v.volam@broadcom.com>
Date: Thu, 26 Apr 2018 17:44:36 -0400
Subject: bnxt_en: Display function level rx/tx_discard_pkts via ethtool

Add counters to display sum of rx/tx_discard_pkts of all rings as
function level statistics via ethtool.

Signed-off-by: Vasundhara Volam <vasundhara-v.volam@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 33 +++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index 8ba14ae00e8f..0ea8466d531b 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -140,6 +140,19 @@ reset_coalesce:
 #define BNXT_RX_STATS_EXT_ENTRY(counter)	\
 	{ BNXT_RX_STATS_EXT_OFFSET(counter), __stringify(counter) }
 
+enum {
+	RX_TOTAL_DISCARDS,
+	TX_TOTAL_DISCARDS,
+};
+
+static struct {
+	u64			counter;
+	char			string[ETH_GSTRING_LEN];
+} bnxt_sw_func_stats[] = {
+	{0, "rx_total_discard_pkts"},
+	{0, "tx_total_discard_pkts"},
+};
+
 static const struct {
 	long offset;
 	char string[ETH_GSTRING_LEN];
@@ -237,6 +250,7 @@ static const struct {
 	BNXT_RX_STATS_EXT_ENTRY(resume_roce_pause_events),
 };
 
+#define BNXT_NUM_SW_FUNC_STATS	ARRAY_SIZE(bnxt_sw_func_stats)
 #define BNXT_NUM_PORT_STATS ARRAY_SIZE(bnxt_port_stats_arr)
 #define BNXT_NUM_PORT_STATS_EXT ARRAY_SIZE(bnxt_port_stats_ext_arr)
 
@@ -244,6 +258,8 @@ static int bnxt_get_num_stats(struct bnxt *bp)
 {
 	int num_stats = BNXT_NUM_STATS * bp->cp_nr_rings;
 
+	num_stats += BNXT_NUM_SW_FUNC_STATS;
+
 	if (bp->flags & BNXT_FLAG_PORT_STATS)
 		num_stats += BNXT_NUM_PORT_STATS;
 
@@ -279,6 +295,9 @@ static void bnxt_get_ethtool_stats(struct net_device *dev,
 	if (!bp->bnapi)
 		return;
 
+	for (i = 0; i < BNXT_NUM_SW_FUNC_STATS; i++)
+		bnxt_sw_func_stats[i].counter = 0;
+
 	for (i = 0; i < bp->cp_nr_rings; i++) {
 		struct bnxt_napi *bnapi = bp->bnapi[i];
 		struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring;
@@ -288,7 +307,16 @@ static void bnxt_get_ethtool_stats(struct net_device *dev,
 		for (k = 0; k < stat_fields; j++, k++)
 			buf[j] = le64_to_cpu(hw_stats[k]);
 		buf[j++] = cpr->rx_l4_csum_errors;
+
+		bnxt_sw_func_stats[RX_TOTAL_DISCARDS].counter +=
+			le64_to_cpu(cpr->hw_stats->rx_discard_pkts);
+		bnxt_sw_func_stats[TX_TOTAL_DISCARDS].counter +=
+			le64_to_cpu(cpr->hw_stats->tx_discard_pkts);
 	}
+
+	for (i = 0; i < BNXT_NUM_SW_FUNC_STATS; i++, j++)
+		buf[j] = bnxt_sw_func_stats[i].counter;
+
 	if (bp->flags & BNXT_FLAG_PORT_STATS) {
 		__le64 *port_stats = (__le64 *)bp->hw_rx_port_stats;
 
@@ -359,6 +387,11 @@ static void bnxt_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
 			sprintf(buf, "[%d]: rx_l4_csum_errors", i);
 			buf += ETH_GSTRING_LEN;
 		}
+		for (i = 0; i < BNXT_NUM_SW_FUNC_STATS; i++) {
+			strcpy(buf, bnxt_sw_func_stats[i].string);
+			buf += ETH_GSTRING_LEN;
+		}
+
 		if (bp->flags & BNXT_FLAG_PORT_STATS) {
 			for (i = 0; i < BNXT_NUM_PORT_STATS; i++) {
 				strcpy(buf, bnxt_port_stats_arr[i].string);
-- 
cgit v1.2.3-59-g8ed1b


From 4cebbaca12514986039b2ac7d30e36ecd2222f64 Mon Sep 17 00:00:00 2001
From: Michael Chan <michael.chan@broadcom.com>
Date: Thu, 26 Apr 2018 17:44:37 -0400
Subject: bnxt_en: Do not allow VF to read EEPROM.

Firmware does not allow the operation and would return failure, causing
a warning in dmesg.  So check for VF and disallow it in the driver.

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index 0ea8466d531b..a699ca5493ef 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -1818,6 +1818,11 @@ static int nvm_get_dir_info(struct net_device *dev, u32 *entries, u32 *length)
 
 static int bnxt_get_eeprom_len(struct net_device *dev)
 {
+	struct bnxt *bp = netdev_priv(dev);
+
+	if (BNXT_VF(bp))
+		return 0;
+
 	/* The -1 return value allows the entire 32-bit range of offsets to be
 	 * passed via the ethtool command-line utility.
 	 */
-- 
cgit v1.2.3-59-g8ed1b


From 05abe4ddf0010e15419f5a6758b5bf44b7790982 Mon Sep 17 00:00:00 2001
From: Andy Gospodarek <gospo@broadcom.com>
Date: Thu, 26 Apr 2018 17:44:38 -0400
Subject: bnxt_en: Increase RING_IDLE minimum threshold to 50

This keeps the RING_IDLE flag set in hardware for higher coalesce
settings by default and improved latency.

Signed-off-by: Andy Gospodarek <gospo@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index b83c2ac17620..a221a10c4c29 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -7702,7 +7702,7 @@ static void bnxt_init_dflt_coal(struct bnxt *bp)
 	coal->coal_bufs = 30;
 	coal->coal_ticks_irq = 1;
 	coal->coal_bufs_irq = 2;
-	coal->idle_thresh = 25;
+	coal->idle_thresh = 50;
 	coal->bufs_per_record = 2;
 	coal->budget = 64;		/* NAPI budget */
 
-- 
cgit v1.2.3-59-g8ed1b


From 9751e8e714872aa650b030e52a9fafbb694a3714 Mon Sep 17 00:00:00 2001
From: Andy Gospodarek <gospo@broadcom.com>
Date: Thu, 26 Apr 2018 17:44:39 -0400
Subject: bnxt_en: reduce timeout on initial HWRM calls

Testing with DIM enabled on older kernels indicated that firmware calls
were slower than expected.  More detailed analysis indicated that the
default 25us delay was higher than necessary.  Reducing the time spend in
usleep_range() for the first several calls would reduce the overall
latency of firmware calls on newer Intel processors.

Signed-off-by: Andy Gospodarek <gospo@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 26 +++++++++++++++++++++++---
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  6 ++++++
 2 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index a221a10c4c29..ff9a5cd77138 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -3495,15 +3495,29 @@ static int bnxt_hwrm_do_send_msg(struct bnxt *bp, void *msg, u32 msg_len,
 
 	if (!timeout)
 		timeout = DFLT_HWRM_CMD_TIMEOUT;
+	/* convert timeout to usec */
+	timeout *= 1000;
 
 	i = 0;
-	tmo_count = timeout * 40;
+	/* Short timeout for the first few iterations:
+	 * number of loops = number of loops for short timeout +
+	 * number of loops for standard timeout.
+	 */
+	tmo_count = HWRM_SHORT_TIMEOUT_COUNTER;
+	timeout = timeout - HWRM_SHORT_MIN_TIMEOUT * HWRM_SHORT_TIMEOUT_COUNTER;
+	tmo_count += DIV_ROUND_UP(timeout, HWRM_MIN_TIMEOUT);
 	resp_len = bp->hwrm_cmd_resp_addr + HWRM_RESP_LEN_OFFSET;
 	if (intr_process) {
 		/* Wait until hwrm response cmpl interrupt is processed */
 		while (bp->hwrm_intr_seq_id != HWRM_SEQ_ID_INVALID &&
 		       i++ < tmo_count) {
-			usleep_range(25, 40);
+			/* on first few passes, just barely sleep */
+			if (i < HWRM_SHORT_TIMEOUT_COUNTER)
+				usleep_range(HWRM_SHORT_MIN_TIMEOUT,
+					     HWRM_SHORT_MAX_TIMEOUT);
+			else
+				usleep_range(HWRM_MIN_TIMEOUT,
+					     HWRM_MAX_TIMEOUT);
 		}
 
 		if (bp->hwrm_intr_seq_id != HWRM_SEQ_ID_INVALID) {
@@ -3521,7 +3535,13 @@ static int bnxt_hwrm_do_send_msg(struct bnxt *bp, void *msg, u32 msg_len,
 			      HWRM_RESP_LEN_SFT;
 			if (len)
 				break;
-			usleep_range(25, 40);
+			/* on first few passes, just barely sleep */
+			if (i < DFLT_HWRM_CMD_TIMEOUT)
+				usleep_range(HWRM_SHORT_MIN_TIMEOUT,
+					     HWRM_SHORT_MAX_TIMEOUT);
+			else
+				usleep_range(HWRM_MIN_TIMEOUT,
+					     HWRM_MAX_TIMEOUT);
 		}
 
 		if (i >= tmo_count) {
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 057f8a2b5d13..7fa4a458edd0 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -532,6 +532,12 @@ struct rx_tpa_end_cmp_ext {
 #define BNXT_HWRM_REQ_MAX_SIZE		128
 #define BNXT_HWRM_REQS_PER_PAGE		(BNXT_PAGE_SIZE /	\
 					 BNXT_HWRM_REQ_MAX_SIZE)
+#define HWRM_SHORT_MIN_TIMEOUT		3
+#define HWRM_SHORT_MAX_TIMEOUT		10
+#define HWRM_SHORT_TIMEOUT_COUNTER	5
+
+#define HWRM_MIN_TIMEOUT		25
+#define HWRM_MAX_TIMEOUT		40
 
 #define BNXT_RX_EVENT	1
 #define BNXT_AGG_EVENT	2
-- 
cgit v1.2.3-59-g8ed1b


From cabfb09d87bd7980cb4e39bd2ce679a788eb7e7a Mon Sep 17 00:00:00 2001
From: Andy Gospodarek <gospo@broadcom.com>
Date: Thu, 26 Apr 2018 17:44:40 -0400
Subject: bnxt_en: add debugfs support for DIM

This adds debugfs support for bnxt_en with the purpose of allowing users
to examine the current DIM profile in use for each receive queue.  This
was instrumental in debugging issues found with DIM and ensuring that
the profiles we expect to use are the profiles being used.

Signed-off-by: Andy Gospodarek <gospo@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/Makefile       |   1 +
 drivers/net/ethernet/broadcom/bnxt/bnxt.c         |   6 ++
 drivers/net/ethernet/broadcom/bnxt/bnxt.h         |   2 +
 drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c | 124 ++++++++++++++++++++++
 drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.h |  23 ++++
 5 files changed, 156 insertions(+)
 create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c
 create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.h

diff --git a/drivers/net/ethernet/broadcom/bnxt/Makefile b/drivers/net/ethernet/broadcom/bnxt/Makefile
index 7c560d545c03..5a779b19d149 100644
--- a/drivers/net/ethernet/broadcom/bnxt/Makefile
+++ b/drivers/net/ethernet/broadcom/bnxt/Makefile
@@ -2,3 +2,4 @@ obj-$(CONFIG_BNXT) += bnxt_en.o
 
 bnxt_en-y := bnxt.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o bnxt_ulp.o bnxt_xdp.o bnxt_vfr.o bnxt_devlink.o bnxt_dim.o
 bnxt_en-$(CONFIG_BNXT_FLOWER_OFFLOAD) += bnxt_tc.o
+bnxt_en-$(CONFIG_DEBUG_FS) += bnxt_debugfs.o
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index ff9a5cd77138..a45e692cd0ca 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -62,6 +62,7 @@
 #include "bnxt_vfr.h"
 #include "bnxt_tc.h"
 #include "bnxt_devlink.h"
+#include "bnxt_debugfs.h"
 
 #define BNXT_TX_TIMEOUT		(5 * HZ)
 
@@ -6870,6 +6871,7 @@ static int __bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init)
 	}
 
 	bnxt_enable_napi(bp);
+	bnxt_debug_dev_init(bp);
 
 	rc = bnxt_init_nic(bp, irq_re_init);
 	if (rc) {
@@ -6902,6 +6904,7 @@ static int __bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init)
 	return 0;
 
 open_err:
+	bnxt_debug_dev_exit(bp);
 	bnxt_disable_napi(bp);
 	bnxt_del_napi(bp);
 
@@ -6995,6 +6998,7 @@ static void __bnxt_close_nic(struct bnxt *bp, bool irq_re_init,
 
 	/* TODO CHIMP_FW: Link/PHY related cleanup if (link_re_init) */
 
+	bnxt_debug_dev_exit(bp);
 	bnxt_disable_napi(bp);
 	del_timer_sync(&bp->timer);
 	bnxt_free_skbs(bp);
@@ -9071,6 +9075,7 @@ static struct pci_driver bnxt_pci_driver = {
 
 static int __init bnxt_init(void)
 {
+	bnxt_debug_init();
 	return pci_register_driver(&bnxt_pci_driver);
 }
 
@@ -9079,6 +9084,7 @@ static void __exit bnxt_exit(void)
 	pci_unregister_driver(&bnxt_pci_driver);
 	if (bnxt_pf_wq)
 		destroy_workqueue(bnxt_pf_wq);
+	bnxt_debug_exit();
 }
 
 module_init(bnxt_init);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 7fa4a458edd0..8df1d8b9d2e3 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1391,6 +1391,8 @@ struct bnxt {
 	u16			*cfa_code_map; /* cfa_code -> vf_idx map */
 	u8			switch_id[8];
 	struct bnxt_tc_info	*tc_info;
+	struct dentry		*debugfs_pdev;
+	struct dentry		*debugfs_dim;
 };
 
 #define BNXT_RX_STATS_OFFSET(counter)			\
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c
new file mode 100644
index 000000000000..94e208e9789f
--- /dev/null
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c
@@ -0,0 +1,124 @@
+/* Broadcom NetXtreme-C/E network driver.
+ *
+ * Copyright (c) 2017-2018 Broadcom Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include "bnxt_hsi.h"
+#include <linux/net_dim.h>
+#include "bnxt.h"
+#include "bnxt_debugfs.h"
+
+static struct dentry *bnxt_debug_mnt;
+
+static ssize_t debugfs_dim_read(struct file *filep,
+				char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	struct net_dim *dim = filep->private_data;
+	int len;
+	char *buf;
+
+	if (*ppos)
+		return 0;
+	if (!dim)
+		return -ENODEV;
+	buf = kasprintf(GFP_KERNEL,
+			"state = %d\n" \
+			"profile_ix = %d\n" \
+			"mode = %d\n" \
+			"tune_state = %d\n" \
+			"steps_right = %d\n" \
+			"steps_left = %d\n" \
+			"tired = %d\n",
+			dim->state,
+			dim->profile_ix,
+			dim->mode,
+			dim->tune_state,
+			dim->steps_right,
+			dim->steps_left,
+			dim->tired);
+	if (!buf)
+		return -ENOMEM;
+	if (count < strlen(buf)) {
+		kfree(buf);
+		return -ENOSPC;
+	}
+	len = simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
+	kfree(buf);
+	return len;
+}
+
+static const struct file_operations debugfs_dim_fops = {
+	.owner = THIS_MODULE,
+	.open = simple_open,
+	.read = debugfs_dim_read,
+};
+
+static struct dentry *debugfs_dim_ring_init(struct net_dim *dim, int ring_idx,
+					    struct dentry *dd)
+{
+	static char qname[16];
+
+	snprintf(qname, 10, "%d", ring_idx);
+	return debugfs_create_file(qname, 0600, dd,
+				   dim, &debugfs_dim_fops);
+}
+
+void bnxt_debug_dev_init(struct bnxt *bp)
+{
+	const char *pname = pci_name(bp->pdev);
+	struct dentry *pdevf;
+	int i;
+
+	bp->debugfs_pdev = debugfs_create_dir(pname, bnxt_debug_mnt);
+	if (bp->debugfs_pdev) {
+		pdevf = debugfs_create_dir("dim", bp->debugfs_pdev);
+		if (!pdevf) {
+			pr_err("failed to create debugfs entry %s/dim\n",
+			       pname);
+			return;
+		}
+		bp->debugfs_dim = pdevf;
+		/* create files for each rx ring */
+		for (i = 0; i < bp->cp_nr_rings; i++) {
+			struct bnxt_cp_ring_info *cpr = &bp->bnapi[i]->cp_ring;
+
+			if (cpr && bp->bnapi[i]->rx_ring) {
+				pdevf = debugfs_dim_ring_init(&cpr->dim, i,
+							      bp->debugfs_dim);
+				if (!pdevf)
+					pr_err("failed to create debugfs entry %s/dim/%d\n",
+					       pname, i);
+			}
+		}
+	} else {
+		pr_err("failed to create debugfs entry %s\n", pname);
+	}
+}
+
+void bnxt_debug_dev_exit(struct bnxt *bp)
+{
+	if (bp) {
+		debugfs_remove_recursive(bp->debugfs_pdev);
+		bp->debugfs_pdev = NULL;
+	}
+}
+
+void bnxt_debug_init(void)
+{
+	bnxt_debug_mnt = debugfs_create_dir("bnxt_en", NULL);
+	if (!bnxt_debug_mnt)
+		pr_err("failed to init bnxt_en debugfs\n");
+}
+
+void bnxt_debug_exit(void)
+{
+	debugfs_remove_recursive(bnxt_debug_mnt);
+}
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.h
new file mode 100644
index 000000000000..d0bb4887acd0
--- /dev/null
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.h
@@ -0,0 +1,23 @@
+/* Broadcom NetXtreme-C/E network driver.
+ *
+ * Copyright (c) 2017-2018 Broadcom Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ */
+
+#include "bnxt_hsi.h"
+#include "bnxt.h"
+
+#ifdef CONFIG_DEBUG_FS
+void bnxt_debug_init(void);
+void bnxt_debug_exit(void);
+void bnxt_debug_dev_init(struct bnxt *bp);
+void bnxt_debug_dev_exit(struct bnxt *bp);
+#else
+static inline void bnxt_debug_init(void) {}
+static inline void bnxt_debug_exit(void) {}
+static inline void bnxt_debug_dev_init(struct bnxt *bp) {}
+static inline void bnxt_debug_dev_exit(struct bnxt *bp) {}
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From d8c09f19accb89fc08b246339abb005455e4c846 Mon Sep 17 00:00:00 2001
From: Michael Chan <michael.chan@broadcom.com>
Date: Thu, 26 Apr 2018 17:44:41 -0400
Subject: bnxt_en: Reserve rings in bnxt_set_channels() if device is down.

The current code does not reserve rings during ethtool -L when the device
is down.  The rings will be reserved when the device is later opened.

Change it to reserve rings during ethtool -L when the device is down.
This provides a better guarantee that the device open will be successful
when the rings are reserved ahead of time.

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index a699ca5493ef..ad98b78f5aa1 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -584,6 +584,8 @@ static int bnxt_set_channels(struct net_device *dev,
 			 * to renable
 			 */
 		}
+	} else {
+		rc = bnxt_reserve_rings(bp);
 	}
 
 	return rc;
-- 
cgit v1.2.3-59-g8ed1b


From 2773dfb201e18722265c38dacdea6ecadf933064 Mon Sep 17 00:00:00 2001
From: Michael Chan <michael.chan@broadcom.com>
Date: Thu, 26 Apr 2018 17:44:42 -0400
Subject: bnxt_en: Don't reserve rings on VF when min rings were not
 provisioned by PF.

When rings are more limited and the PF has not provisioned minimum
guaranteed rings to the VF, do not reserve rings during driver probe.
Wait till device open before reserving rings when they will be used.
Device open will succeed if some minimum rings can be successfully
reserved and allocated.

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index a45e692cd0ca..0884e4902668 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -5952,6 +5952,9 @@ static int bnxt_init_msix(struct bnxt *bp)
 	if (total_vecs > max)
 		total_vecs = max;
 
+	if (!total_vecs)
+		return 0;
+
 	msix_ent = kcalloc(total_vecs, sizeof(struct msix_entry), GFP_KERNEL);
 	if (!msix_ent)
 		return -ENOMEM;
@@ -7276,6 +7279,25 @@ skip_uc:
 	return rc;
 }
 
+static bool bnxt_can_reserve_rings(struct bnxt *bp)
+{
+#ifdef CONFIG_BNXT_SRIOV
+	if ((bp->flags & BNXT_FLAG_NEW_RM) && BNXT_VF(bp)) {
+		struct bnxt_hw_resc *hw_resc = &bp->hw_resc;
+
+		/* No minimum rings were provisioned by the PF.  Don't
+		 * reserve rings by default when device is down.
+		 */
+		if (hw_resc->min_tx_rings || hw_resc->resv_tx_rings)
+			return true;
+
+		if (!netif_running(bp->dev))
+			return false;
+	}
+#endif
+	return true;
+}
+
 /* If the chip and firmware supports RFS */
 static bool bnxt_rfs_supported(struct bnxt *bp)
 {
@@ -7292,7 +7314,7 @@ static bool bnxt_rfs_capable(struct bnxt *bp)
 #ifdef CONFIG_RFS_ACCEL
 	int vnics, max_vnics, max_rss_ctxs;
 
-	if (!(bp->flags & BNXT_FLAG_MSIX_CAP))
+	if (!(bp->flags & BNXT_FLAG_MSIX_CAP) || !bnxt_can_reserve_rings(bp))
 		return false;
 
 	vnics = 1 + bp->rx_nr_rings;
@@ -8526,6 +8548,9 @@ static int bnxt_set_dflt_rings(struct bnxt *bp, bool sh)
 {
 	int dflt_rings, max_rx_rings, max_tx_rings, rc;
 
+	if (!bnxt_can_reserve_rings(bp))
+		return 0;
+
 	if (sh)
 		bp->flags |= BNXT_FLAG_SHARED_RINGS;
 	dflt_rings = netif_get_num_default_rss_queues();
-- 
cgit v1.2.3-59-g8ed1b


From 86c3380d9b1e2a3fcc87d34cea12991b81032b9f Mon Sep 17 00:00:00 2001
From: Michael Chan <michael.chan@broadcom.com>
Date: Thu, 26 Apr 2018 17:44:43 -0400
Subject: bnxt_en: Reserve RSS and L2 contexts for VF.

For completeness and correctness, the VF driver needs to reserve these
RSS and L2 contexts.

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c       |  4 ++++
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 10 +++++-----
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h |  5 +++++
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 0884e4902668..fee1c0df146f 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -4713,6 +4713,10 @@ bnxt_hwrm_reserve_vf_rings(struct bnxt *bp, int tx_rings, int rx_rings,
 
 	__bnxt_hwrm_reserve_vf_rings(bp, &req, tx_rings, rx_rings, ring_grps,
 				     cp_rings, vnics);
+	req.enables |= cpu_to_le32(FUNC_VF_CFG_REQ_ENABLES_NUM_RSSCOS_CTXS |
+				   FUNC_VF_CFG_REQ_ENABLES_NUM_L2_CTXS);
+	req.num_rsscos_ctxs = cpu_to_le16(BNXT_VF_MAX_RSS_CTX);
+	req.num_l2_ctxs = cpu_to_le16(BNXT_VF_MAX_L2_CTX);
 	rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
 	if (rc)
 		return -ENOMEM;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
index 18ee471c0002..cc21d874eb6e 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
@@ -462,13 +462,13 @@ static int bnxt_hwrm_func_vf_resc_cfg(struct bnxt *bp, int num_vfs)
 	vf_vnics = hw_resc->max_vnics - bp->nr_vnics;
 	vf_vnics = min_t(u16, vf_vnics, vf_rx_rings);
 
-	req.min_rsscos_ctx = cpu_to_le16(1);
-	req.max_rsscos_ctx = cpu_to_le16(1);
+	req.min_rsscos_ctx = cpu_to_le16(BNXT_VF_MIN_RSS_CTX);
+	req.max_rsscos_ctx = cpu_to_le16(BNXT_VF_MAX_RSS_CTX);
 	if (pf->vf_resv_strategy == BNXT_VF_RESV_STRATEGY_MINIMAL) {
 		req.min_cmpl_rings = cpu_to_le16(1);
 		req.min_tx_rings = cpu_to_le16(1);
 		req.min_rx_rings = cpu_to_le16(1);
-		req.min_l2_ctxs = cpu_to_le16(1);
+		req.min_l2_ctxs = cpu_to_le16(BNXT_VF_MIN_L2_CTX);
 		req.min_vnics = cpu_to_le16(1);
 		req.min_stat_ctx = cpu_to_le16(1);
 		req.min_hw_ring_grps = cpu_to_le16(1);
@@ -483,7 +483,7 @@ static int bnxt_hwrm_func_vf_resc_cfg(struct bnxt *bp, int num_vfs)
 		req.min_cmpl_rings = cpu_to_le16(vf_cp_rings);
 		req.min_tx_rings = cpu_to_le16(vf_tx_rings);
 		req.min_rx_rings = cpu_to_le16(vf_rx_rings);
-		req.min_l2_ctxs = cpu_to_le16(4);
+		req.min_l2_ctxs = cpu_to_le16(BNXT_VF_MAX_L2_CTX);
 		req.min_vnics = cpu_to_le16(vf_vnics);
 		req.min_stat_ctx = cpu_to_le16(vf_stat_ctx);
 		req.min_hw_ring_grps = cpu_to_le16(vf_ring_grps);
@@ -491,7 +491,7 @@ static int bnxt_hwrm_func_vf_resc_cfg(struct bnxt *bp, int num_vfs)
 	req.max_cmpl_rings = cpu_to_le16(vf_cp_rings);
 	req.max_tx_rings = cpu_to_le16(vf_tx_rings);
 	req.max_rx_rings = cpu_to_le16(vf_rx_rings);
-	req.max_l2_ctxs = cpu_to_le16(4);
+	req.max_l2_ctxs = cpu_to_le16(BNXT_VF_MAX_L2_CTX);
 	req.max_vnics = cpu_to_le16(vf_vnics);
 	req.max_stat_ctx = cpu_to_le16(vf_stat_ctx);
 	req.max_hw_ring_grps = cpu_to_le16(vf_ring_grps);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h
index 6f6d85010ecf..e9b20cd19881 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.h
@@ -23,6 +23,11 @@
 	((offsetof(struct hwrm_reject_fwd_resp_input, encap_request) + n) >\
 	 offsetof(struct hwrm_reject_fwd_resp_input, encap_resp_target_id))
 
+#define BNXT_VF_MIN_RSS_CTX	1
+#define BNXT_VF_MAX_RSS_CTX	1
+#define BNXT_VF_MIN_L2_CTX	1
+#define BNXT_VF_MAX_L2_CTX	4
+
 int bnxt_get_vf_config(struct net_device *, int, struct ifla_vf_info *);
 int bnxt_set_vf_mac(struct net_device *, int, u8 *);
 int bnxt_set_vf_vlan(struct net_device *, int, u16, u8, __be16);
-- 
cgit v1.2.3-59-g8ed1b


From 47558acd56a74c1ac598093930a5559270bf8c09 Mon Sep 17 00:00:00 2001
From: Michael Chan <michael.chan@broadcom.com>
Date: Thu, 26 Apr 2018 17:44:44 -0400
Subject: bnxt_en: Reserve rings at driver open if none was reserved at probe
 time.

Add logic to reserve default rings at driver open time if none was
reserved during probe time.  This will happen when the PF driver did
not provision minimum rings to the VF, due to more limited resources.

Driver open will only succeed if some minimum rings can be reserved.

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index fee1c0df146f..efe5c7203f60 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -6844,6 +6844,8 @@ static void bnxt_preset_reg_win(struct bnxt *bp)
 	}
 }
 
+static int bnxt_init_dflt_ring_mode(struct bnxt *bp);
+
 static int __bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init)
 {
 	int rc = 0;
@@ -6851,6 +6853,12 @@ static int __bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init)
 	bnxt_preset_reg_win(bp);
 	netif_carrier_off(bp->dev);
 	if (irq_re_init) {
+		/* Reserve rings now if none were reserved at driver probe. */
+		rc = bnxt_init_dflt_ring_mode(bp);
+		if (rc) {
+			netdev_err(bp->dev, "Failed to reserve default rings at open\n");
+			return rc;
+		}
 		rc = bnxt_reserve_rings(bp);
 		if (rc)
 			return rc;
@@ -8600,6 +8608,29 @@ static int bnxt_set_dflt_rings(struct bnxt *bp, bool sh)
 	return rc;
 }
 
+static int bnxt_init_dflt_ring_mode(struct bnxt *bp)
+{
+	int rc;
+
+	if (bp->tx_nr_rings)
+		return 0;
+
+	rc = bnxt_set_dflt_rings(bp, true);
+	if (rc) {
+		netdev_err(bp->dev, "Not enough rings available.\n");
+		return rc;
+	}
+	rc = bnxt_init_int_mode(bp);
+	if (rc)
+		return rc;
+	bp->tx_nr_rings_per_tc = bp->tx_nr_rings;
+	if (bnxt_rfs_supported(bp) && bnxt_rfs_capable(bp)) {
+		bp->flags |= BNXT_FLAG_RFS;
+		bp->dev->features |= NETIF_F_NTUPLE;
+	}
+	return 0;
+}
+
 int bnxt_restore_pf_fw_resources(struct bnxt *bp)
 {
 	int rc;
-- 
cgit v1.2.3-59-g8ed1b


From 7d4cbae04f835525295c501770155c755228eb55 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Fri, 27 Apr 2018 01:17:56 +0200
Subject: selftests: forwarding: Add libs for gretap mirror testing

To simplify implementation of mirror-to-gretap tests, extend lib.sh with
several new functions that might potentially be useful more
broadly (although right now the mirroring tests will be the only
client).

Also add mirror_lib.sh with code useful for mirroring tests,
mirror_gre_lib.sh with code specifically useful for mirror-to-gretap
tests, and mirror_gre_topo.sh that primes a given test with a good
baseline topology that the test can then tweak to its liking.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/forwarding/lib.sh      |  96 +++++++++++++++
 .../selftests/net/forwarding/mirror_gre_lib.sh     |  85 ++++++++++++++
 .../net/forwarding/mirror_gre_topo_lib.sh          | 129 +++++++++++++++++++++
 .../testing/selftests/net/forwarding/mirror_lib.sh |  40 +++++++
 4 files changed, 350 insertions(+)
 create mode 100644 tools/testing/selftests/net/forwarding/mirror_gre_lib.sh
 create mode 100644 tools/testing/selftests/net/forwarding/mirror_gre_topo_lib.sh
 create mode 100644 tools/testing/selftests/net/forwarding/mirror_lib.sh

diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index 1ac6c62271f3..a066ca536ac4 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -321,6 +321,25 @@ simple_if_fini()
 	vrf_destroy $vrf_name
 }
 
+tunnel_create()
+{
+	local name=$1; shift
+	local type=$1; shift
+	local local=$1; shift
+	local remote=$1; shift
+
+	ip link add name $name type $type \
+	   local $local remote $remote "$@"
+	ip link set dev $name up
+}
+
+tunnel_destroy()
+{
+	local name=$1; shift
+
+	ip link del dev $name
+}
+
 master_name_get()
 {
 	local if_name=$1
@@ -335,6 +354,15 @@ link_stats_tx_packets_get()
        ip -j -s link show dev $if_name | jq '.[]["stats64"]["tx"]["packets"]'
 }
 
+tc_rule_stats_get()
+{
+	local dev=$1; shift
+	local pref=$1; shift
+
+	tc -j -s filter show dev $dev ingress pref $pref |
+	jq '.[1].options.actions[].stats.packets'
+}
+
 mac_get()
 {
 	local if_name=$1
@@ -381,6 +409,74 @@ tc_offload_check()
 	return 0
 }
 
+slow_path_trap_install()
+{
+	local dev=$1; shift
+	local direction=$1; shift
+
+	if [ "${tcflags/skip_hw}" != "$tcflags" ]; then
+		# For slow-path testing, we need to install a trap to get to
+		# slow path the packets that would otherwise be switched in HW.
+		tc filter add dev $dev $direction pref 1 \
+		   flower skip_sw action trap
+	fi
+}
+
+slow_path_trap_uninstall()
+{
+	local dev=$1; shift
+	local direction=$1; shift
+
+	if [ "${tcflags/skip_hw}" != "$tcflags" ]; then
+		tc filter del dev $dev $direction pref 1 flower skip_sw
+	fi
+}
+
+__icmp_capture_add_del()
+{
+	local add_del=$1; shift
+	local pref=$1; shift
+	local vsuf=$1; shift
+	local tundev=$1; shift
+	local filter=$1; shift
+
+	tc filter $add_del dev "$tundev" ingress \
+	   proto ip$vsuf pref $pref \
+	   flower ip_proto icmp$vsuf $filter \
+	   action pass
+}
+
+icmp_capture_install()
+{
+	__icmp_capture_add_del add 100 "" "$@"
+}
+
+icmp_capture_uninstall()
+{
+	__icmp_capture_add_del del 100 "" "$@"
+}
+
+icmp6_capture_install()
+{
+	__icmp_capture_add_del add 100 v6 "$@"
+}
+
+icmp6_capture_uninstall()
+{
+	__icmp_capture_add_del del 100 v6 "$@"
+}
+
+matchall_sink_create()
+{
+	local dev=$1; shift
+
+	tc qdisc add dev $dev clsact
+	tc filter add dev $dev ingress \
+	   pref 10000 \
+	   matchall \
+	   action drop
+}
+
 ##############################################################################
 # Tests
 
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh b/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh
new file mode 100644
index 000000000000..207ffd167dba
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh
@@ -0,0 +1,85 @@
+# SPDX-License-Identifier: GPL-2.0
+
+do_test_span_gre_dir_ips()
+{
+	local expect=$1; shift
+	local tundev=$1; shift
+	local direction=$1; shift
+	local ip1=$1; shift
+	local ip2=$1; shift
+
+	icmp_capture_install h3-$tundev
+	mirror_test v$h1 $ip1 $ip2 h3-$tundev 100 $expect
+	mirror_test v$h2 $ip2 $ip1 h3-$tundev 100 $expect
+	icmp_capture_uninstall h3-$tundev
+}
+
+quick_test_span_gre_dir_ips()
+{
+	do_test_span_gre_dir_ips 10 "$@"
+}
+
+fail_test_span_gre_dir_ips()
+{
+	do_test_span_gre_dir_ips 0 "$@"
+}
+
+test_span_gre_dir_ips()
+{
+	local tundev=$1; shift
+	local direction=$1; shift
+	local forward_type=$1; shift
+	local backward_type=$1; shift
+	local ip1=$1; shift
+	local ip2=$1; shift
+
+	quick_test_span_gre_dir_ips "$tundev" "$direction" "$ip1" "$ip2"
+
+	icmp_capture_install h3-$tundev "type $forward_type"
+	mirror_test v$h1 $ip1 $ip2 h3-$tundev 100 10
+	icmp_capture_uninstall h3-$tundev
+
+	icmp_capture_install h3-$tundev "type $backward_type"
+	mirror_test v$h2 $ip2 $ip1 h3-$tundev 100 10
+	icmp_capture_uninstall h3-$tundev
+}
+
+full_test_span_gre_dir_ips()
+{
+	local tundev=$1; shift
+	local direction=$1; shift
+	local forward_type=$1; shift
+	local backward_type=$1; shift
+	local what=$1; shift
+	local ip1=$1; shift
+	local ip2=$1; shift
+
+	RET=0
+
+	mirror_install $swp1 $direction $tundev "matchall $tcflags"
+	test_span_gre_dir_ips "$tundev" "$direction" "$forward_type" \
+			      "$backward_type" "$ip1" "$ip2"
+	mirror_uninstall $swp1 $direction
+
+	log_test "$direction $what ($tcflags)"
+}
+
+quick_test_span_gre_dir()
+{
+	quick_test_span_gre_dir_ips "$@" 192.0.2.1 192.0.2.2
+}
+
+fail_test_span_gre_dir()
+{
+	fail_test_span_gre_dir_ips "$@" 192.0.2.1 192.0.2.2
+}
+
+test_span_gre_dir()
+{
+	test_span_gre_dir_ips "$@" 192.0.2.1 192.0.2.2
+}
+
+full_test_span_gre_dir()
+{
+	full_test_span_gre_dir_ips "$@" 192.0.2.1 192.0.2.2
+}
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_topo_lib.sh b/tools/testing/selftests/net/forwarding/mirror_gre_topo_lib.sh
new file mode 100644
index 000000000000..b3ceda2b4197
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_topo_lib.sh
@@ -0,0 +1,129 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# This is the standard topology for testing mirroring to gretap and ip6gretap
+# netdevices. The tests that use it tweak it in one way or another--importantly,
+# $swp3 and $h3 need to have addresses set up.
+#
+#   +---------------------+                             +---------------------+
+#   | H1                  |                             |                  H2 |
+#   |     + $h1           |                             |           $h2 +     |
+#   |     | 192.0.2.1/28  |                             |  192.0.2.2/28 |     |
+#   +-----|---------------+                             +---------------|-----+
+#         |                                                             |
+#   +-----|-------------------------------------------------------------|-----+
+#   | SW  o--> mirror                                                   |     |
+#   | +---|-------------------------------------------------------------|---+ |
+#   | |   + $swp1                    BR                           $swp2 +   | |
+#   | +---------------------------------------------------------------------+ |
+#   |                                                                         |
+#   |     + $swp3               + gt6 (ip6gretap)      + gt4 (gretap)         |
+#   |     |                     : loc=2001:db8:2::1    : loc=192.0.2.129      |
+#   |     |                     : rem=2001:db8:2::2    : rem=192.0.2.130      |
+#   |     |                     : ttl=100              : ttl=100              |
+#   |     |                     : tos=inherit          : tos=inherit          |
+#   |     |                     :                      :                      |
+#   +-----|---------------------:----------------------:----------------------+
+#         |                     :                      :
+#   +-----|---------------------:----------------------:----------------------+
+#   | H3  + $h3                 + h3-gt6 (ip6gretap)   + h3-gt4 (gretap)      |
+#   |                             loc=2001:db8:2::2      loc=192.0.2.130      |
+#   |                             rem=2001:db8:2::1      rem=192.0.2.129      |
+#   |                             ttl=100                ttl=100              |
+#   |                             tos=inherit            tos=inherit          |
+#   |                                                                         |
+#   +-------------------------------------------------------------------------+
+
+mirror_gre_topo_h1_create()
+{
+	simple_if_init $h1 192.0.2.1/28
+}
+
+mirror_gre_topo_h1_destroy()
+{
+	simple_if_fini $h1 192.0.2.1/28
+}
+
+mirror_gre_topo_h2_create()
+{
+	simple_if_init $h2 192.0.2.2/28
+}
+
+mirror_gre_topo_h2_destroy()
+{
+	simple_if_fini $h2 192.0.2.2/28
+}
+
+mirror_gre_topo_h3_create()
+{
+	simple_if_init $h3
+
+	tunnel_create h3-gt4 gretap 192.0.2.130 192.0.2.129
+	ip link set h3-gt4 vrf v$h3
+	matchall_sink_create h3-gt4
+
+	tunnel_create h3-gt6 ip6gretap 2001:db8:2::2 2001:db8:2::1
+	ip link set h3-gt6 vrf v$h3
+	matchall_sink_create h3-gt6
+}
+
+mirror_gre_topo_h3_destroy()
+{
+	tunnel_destroy h3-gt6
+	tunnel_destroy h3-gt4
+
+	simple_if_fini $h3
+}
+
+mirror_gre_topo_switch_create()
+{
+	ip link set dev $swp3 up
+
+	ip link add name br1 type bridge vlan_filtering 1
+	ip link set dev br1 up
+
+	ip link set dev $swp1 master br1
+	ip link set dev $swp1 up
+
+	ip link set dev $swp2 master br1
+	ip link set dev $swp2 up
+
+	tunnel_create gt4 gretap 192.0.2.129 192.0.2.130 \
+		      ttl 100 tos inherit
+
+	tunnel_create gt6 ip6gretap 2001:db8:2::1 2001:db8:2::2 \
+		      ttl 100 tos inherit allow-localremote
+
+	tc qdisc add dev $swp1 clsact
+}
+
+mirror_gre_topo_switch_destroy()
+{
+	tc qdisc del dev $swp1 clsact
+
+	tunnel_destroy gt6
+	tunnel_destroy gt4
+
+	ip link set dev $swp1 down
+	ip link set dev $swp2 down
+	ip link del dev br1
+
+	ip link set dev $swp3 down
+}
+
+mirror_gre_topo_create()
+{
+	mirror_gre_topo_h1_create
+	mirror_gre_topo_h2_create
+	mirror_gre_topo_h3_create
+
+	mirror_gre_topo_switch_create
+}
+
+mirror_gre_topo_destroy()
+{
+	mirror_gre_topo_switch_destroy
+
+	mirror_gre_topo_h3_destroy
+	mirror_gre_topo_h2_destroy
+	mirror_gre_topo_h1_destroy
+}
diff --git a/tools/testing/selftests/net/forwarding/mirror_lib.sh b/tools/testing/selftests/net/forwarding/mirror_lib.sh
new file mode 100644
index 000000000000..e5028a5725e3
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_lib.sh
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: GPL-2.0
+
+mirror_install()
+{
+	local from_dev=$1; shift
+	local direction=$1; shift
+	local to_dev=$1; shift
+	local filter=$1; shift
+
+	tc filter add dev $from_dev $direction \
+	   pref 1000 $filter \
+	   action mirred egress mirror dev $to_dev
+}
+
+mirror_uninstall()
+{
+	local from_dev=$1; shift
+	local direction=$1; shift
+
+	tc filter del dev $swp1 $direction pref 1000
+}
+
+mirror_test()
+{
+	local vrf_name=$1; shift
+	local sip=$1; shift
+	local dip=$1; shift
+	local dev=$1; shift
+	local pref=$1; shift
+	local expect=$1; shift
+
+	local t0=$(tc_rule_stats_get $dev $pref)
+	ip vrf exec $vrf_name \
+	   ${PING} ${sip:+-I $sip} $dip -c 10 -i 0.1 -w 2 &> /dev/null
+	local t1=$(tc_rule_stats_get $dev $pref)
+	local delta=$((t1 - t0))
+	# Tolerate a couple stray extra packets.
+	((expect <= delta && delta <= expect + 2))
+	check_err $? "Expected to capture $expect packets, got $delta."
+}
-- 
cgit v1.2.3-59-g8ed1b


From ba8d39871a10568afe27f21adf9be29d8e2bea95 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Fri, 27 Apr 2018 01:18:23 +0200
Subject: selftests: forwarding: Add test for mirror to gretap

Add a test for basic mirroring to gretap and ip6gretap netdevices.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../testing/selftests/net/forwarding/mirror_gre.sh | 139 +++++++++++++++++++++
 1 file changed, 139 insertions(+)
 create mode 100755 tools/testing/selftests/net/forwarding/mirror_gre.sh

diff --git a/tools/testing/selftests/net/forwarding/mirror_gre.sh b/tools/testing/selftests/net/forwarding/mirror_gre.sh
new file mode 100755
index 000000000000..a8abc736f67c
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_gre.sh
@@ -0,0 +1,139 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test uses standard topology for testing gretap. See
+# mirror_gre_topo_lib.sh for more details.
+#
+# Test for "tc action mirred egress mirror" when the device to mirror to is a
+# gretap or ip6gretap netdevice. Expect that the packets come out encapsulated,
+# and another gretap / ip6gretap netdevice is then capable of decapsulating the
+# traffic. Test that the payload is what is expected (ICMP ping request or
+# reply, depending on test).
+
+NUM_NETIFS=6
+source lib.sh
+source mirror_lib.sh
+source mirror_gre_lib.sh
+source mirror_gre_topo_lib.sh
+
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	swp2=${NETIFS[p3]}
+	h2=${NETIFS[p4]}
+
+	swp3=${NETIFS[p5]}
+	h3=${NETIFS[p6]}
+
+	vrf_prepare
+	mirror_gre_topo_create
+
+	ip address add dev $swp3 192.0.2.129/28
+	ip address add dev $h3 192.0.2.130/28
+
+	ip address add dev $swp3 2001:db8:2::1/64
+	ip address add dev $h3 2001:db8:2::2/64
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	ip address del dev $h3 2001:db8:2::2/64
+	ip address del dev $swp3 2001:db8:2::1/64
+
+	ip address del dev $h3 192.0.2.130/28
+	ip address del dev $swp3 192.0.2.129/28
+
+	mirror_gre_topo_destroy
+	vrf_cleanup
+}
+
+test_span_gre_mac()
+{
+	local tundev=$1; shift
+	local direction=$1; shift
+	local prot=$1; shift
+	local what=$1; shift
+
+	local swp3mac=$(mac_get $swp3)
+	local h3mac=$(mac_get $h3)
+
+	RET=0
+
+	mirror_install $swp1 $direction $tundev "matchall $tcflags"
+	tc qdisc add dev $h3 clsact
+	tc filter add dev $h3 ingress pref 77 prot $prot \
+		flower ip_proto 0x2f src_mac $swp3mac dst_mac $h3mac \
+		action pass
+
+	mirror_test v$h1 192.0.2.1 192.0.2.2 $h3 77 10
+
+	tc filter del dev $h3 ingress pref 77
+	tc qdisc del dev $h3 clsact
+	mirror_uninstall $swp1 $direction
+
+	log_test "$direction $what: envelope MAC ($tcflags)"
+}
+
+test_two_spans()
+{
+	RET=0
+
+	mirror_install $swp1 ingress gt4 "matchall $tcflags"
+	mirror_install $swp1 egress gt6 "matchall $tcflags"
+	quick_test_span_gre_dir gt4 ingress
+	quick_test_span_gre_dir gt6 egress
+
+	mirror_uninstall $swp1 ingress
+	fail_test_span_gre_dir gt4 ingress
+	quick_test_span_gre_dir gt6 egress
+
+	mirror_install $swp1 ingress gt4 "matchall $tcflags"
+	mirror_uninstall $swp1 egress
+	quick_test_span_gre_dir gt4 ingress
+	fail_test_span_gre_dir gt6 egress
+
+	mirror_uninstall $swp1 ingress
+	log_test "two simultaneously configured mirrors ($tcflags)"
+}
+
+test_all()
+{
+	slow_path_trap_install $swp1 ingress
+	slow_path_trap_install $swp1 egress
+
+	full_test_span_gre_dir gt4 ingress 8 0 "mirror to gretap"
+	full_test_span_gre_dir gt6 ingress 8 0 "mirror to ip6gretap"
+	full_test_span_gre_dir gt4 egress 0 8 "mirror to gretap"
+	full_test_span_gre_dir gt6 egress 0 8 "mirror to ip6gretap"
+
+	test_span_gre_mac gt4 ingress ip "mirror to gretap"
+	test_span_gre_mac gt6 ingress ipv6 "mirror to ip6gretap"
+	test_span_gre_mac gt4 egress ip "mirror to gretap"
+	test_span_gre_mac gt6 egress ipv6 "mirror to ip6gretap"
+
+	test_two_spans
+
+	slow_path_trap_uninstall $swp1 egress
+	slow_path_trap_uninstall $swp1 ingress
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tcflags="skip_hw"
+test_all
+
+if ! tc_offload_check; then
+	echo "WARN: Could not test offloaded functionality"
+else
+	tcflags="skip_sw"
+	test_all
+fi
+
+exit $EXIT_STATUS
-- 
cgit v1.2.3-59-g8ed1b


From 304f009cc3c17b6aa089531833a7598d88858a16 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Fri, 27 Apr 2018 01:19:15 +0200
Subject: selftests: forwarding: Test gretap mirror with next-hop remote

Test mirror to a gretap and an ip6gretap netdevice such that the remote
address of the tunnel is reachable through a next-hop route.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../selftests/net/forwarding/mirror_gre_nh.sh      | 117 +++++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100755 tools/testing/selftests/net/forwarding/mirror_gre_nh.sh

diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_nh.sh b/tools/testing/selftests/net/forwarding/mirror_gre_nh.sh
new file mode 100755
index 000000000000..9ac70978541f
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_nh.sh
@@ -0,0 +1,117 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test uses standard topology for testing gretap. See
+# mirror_gre_topo_lib.sh for more details.
+#
+# Test that gretap and ip6gretap mirroring works when the other tunnel endpoint
+# is reachable through a next-hop route (as opposed to directly-attached route).
+
+NUM_NETIFS=6
+source lib.sh
+source mirror_lib.sh
+source mirror_gre_lib.sh
+source mirror_gre_topo_lib.sh
+
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	swp2=${NETIFS[p3]}
+	h2=${NETIFS[p4]}
+
+	swp3=${NETIFS[p5]}
+	h3=${NETIFS[p6]}
+
+	vrf_prepare
+	mirror_gre_topo_create
+
+	ip address add dev $swp3 192.0.2.161/28
+	ip address add dev $h3 192.0.2.162/28
+	ip address add dev gt4 192.0.2.129/32
+	ip address add dev h3-gt4 192.0.2.130/32
+
+	# IPv6 route can't be added after address. Such routes are rejected due
+	# to the gateway address having been configured on the local system. It
+	# works the other way around though.
+	ip address add dev $swp3 2001:db8:4::1/64
+	ip -6 route add 2001:db8:2::2/128 via 2001:db8:4::2
+	ip address add dev $h3 2001:db8:4::2/64
+	ip address add dev gt6 2001:db8:2::1
+	ip address add dev h3-gt6 2001:db8:2::2
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	ip -6 route del 2001:db8:2::2/128 via 2001:db8:4::2
+	ip address del dev $h3 2001:db8:4::2/64
+	ip address del dev $swp3 2001:db8:4::1/64
+
+	ip address del dev $h3 192.0.2.162/28
+	ip address del dev $swp3 192.0.2.161/28
+
+	mirror_gre_topo_destroy
+	vrf_cleanup
+}
+
+test_gretap()
+{
+	RET=0
+	mirror_install $swp1 ingress gt4 "matchall $tcflags"
+
+	# For IPv4, test that there's no mirroring without the route directing
+	# the traffic to tunnel remote address. Then add it and test that
+	# mirroring starts. For IPv6 we can't test this due to the limitation
+	# that routes for locally-specified IPv6 addresses can't be added.
+	fail_test_span_gre_dir gt4 ingress
+
+	ip route add 192.0.2.130/32 via 192.0.2.162
+	quick_test_span_gre_dir gt4 ingress
+	ip route del 192.0.2.130/32 via 192.0.2.162
+
+	mirror_uninstall $swp1 ingress
+	log_test "mirror to gre with next-hop remote ($tcflags)"
+}
+
+test_ip6gretap()
+{
+	RET=0
+
+	mirror_install $swp1 ingress gt6 "matchall $tcflags"
+	quick_test_span_gre_dir gt6 ingress
+	mirror_uninstall $swp1 ingress
+
+	log_test "mirror to ip6gre with next-hop remote ($tcflags)"
+}
+
+test_all()
+{
+	slow_path_trap_install $swp1 ingress
+	slow_path_trap_install $swp1 egress
+
+	test_gretap
+	test_ip6gretap
+
+	slow_path_trap_uninstall $swp1 egress
+	slow_path_trap_uninstall $swp1 ingress
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tcflags="skip_hw"
+test_all
+
+if ! tc_offload_check; then
+	echo "WARN: Could not test offloaded functionality"
+else
+	tcflags="skip_sw"
+	test_all
+fi
+
+exit $EXIT_STATUS
-- 
cgit v1.2.3-59-g8ed1b


From 16585cbe20953ca6d1ad1feb69cadbd51abd5256 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Fri, 27 Apr 2018 01:19:55 +0200
Subject: selftests: forwarding: Test mirror to gretap w/ bound dev

Test mirroring to a gretap and an ip6gretap netdevice with a bound
device, where the tunnel device and the bound device are in different
VRFs (an overlay / underlay configuration).

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../selftests/net/forwarding/mirror_gre_bound.sh   | 213 +++++++++++++++++++++
 1 file changed, 213 insertions(+)
 create mode 100755 tools/testing/selftests/net/forwarding/mirror_gre_bound.sh

diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_bound.sh b/tools/testing/selftests/net/forwarding/mirror_gre_bound.sh
new file mode 100755
index 000000000000..3708ac0f400a
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_bound.sh
@@ -0,0 +1,213 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+#   +---------------------+                             +---------------------+
+#   | H1                  |                             |                  H2 |
+#   |     + $h1           |                             |           $h2 +     |
+#   |     | 192.0.2.1/28  |                             |  192.0.2.2/28 |     |
+#   +-----|---------------+                             +---------------|-----+
+#         |                                                             |
+#   +-----|-------------------------------------------------------------|-----+
+#   | SW  o--> mirror                                                   |     |
+#   | +---|-------------------------------------------------------------|---+ |
+#   | |   + $swp1                    BR                           $swp2 +   | |
+#   | +---------------------------------------------------------------------+ |
+#   |                                                                         |
+#   | +---------------------------------------------------------------------+ |
+#   | | OL                      + gt6 (ip6gretap)      + gt4 (gretap)       | |
+#   | |                         : loc=2001:db8:2::1    : loc=192.0.2.129    | |
+#   | |                         : rem=2001:db8:2::2    : rem=192.0.2.130    | |
+#   | |                         : ttl=100              : ttl=100            | |
+#   | |                         : tos=inherit          : tos=inherit        | |
+#   | +-------------------------:--|-------------------:--|-----------------+ |
+#   |                           :  |                   :  |                   |
+#   | +-------------------------:--|-------------------:--|-----------------+ |
+#   | | UL                      :  |,---------------------'                 | |
+#   | |   + $swp3               :  ||                  :                    | |
+#   | |   | 192.0.2.129/28      :  vv                  :                    | |
+#   | |   | 2001:db8:2::1/64    :  + ul (dummy)        :                    | |
+#   | +---|---------------------:----------------------:--------------------+ |
+#   +-----|---------------------:----------------------:----------------------+
+#         |                     :                      :
+#   +-----|---------------------:----------------------:----------------------+
+#   | H3  + $h3                 + h3-gt6 (ip6gretap)   + h3-gt4 (gretap)      |
+#   |       192.0.2.130/28        loc=2001:db8:2::2      loc=192.0.2.130      |
+#   |       2001:db8:2::2/64      rem=2001:db8:2::1      rem=192.0.2.129      |
+#   |                             ttl=100                ttl=100              |
+#   |                             tos=inherit            tos=inherit          |
+#   |                                                                         |
+#   +-------------------------------------------------------------------------+
+#
+# This tests mirroring to gretap and ip6gretap configured in an overlay /
+# underlay manner, i.e. with a bound dummy device that marks underlay VRF where
+# the encapsulated packed should be routed.
+
+NUM_NETIFS=6
+source lib.sh
+source mirror_lib.sh
+source mirror_gre_lib.sh
+
+h1_create()
+{
+	simple_if_init $h1 192.0.2.1/28
+}
+
+h1_destroy()
+{
+	simple_if_fini $h1 192.0.2.1/28
+}
+
+h2_create()
+{
+	simple_if_init $h2 192.0.2.2/28
+}
+
+h2_destroy()
+{
+	simple_if_fini $h2 192.0.2.2/28
+}
+
+h3_create()
+{
+	simple_if_init $h3 192.0.2.130/28 2001:db8:2::2/64
+
+	tunnel_create h3-gt4 gretap 192.0.2.130 192.0.2.129
+	ip link set h3-gt4 vrf v$h3
+	matchall_sink_create h3-gt4
+
+	tunnel_create h3-gt6 ip6gretap 2001:db8:2::2 2001:db8:2::1
+	ip link set h3-gt6 vrf v$h3
+	matchall_sink_create h3-gt6
+}
+
+h3_destroy()
+{
+	tunnel_destroy h3-gt6
+	tunnel_destroy h3-gt4
+
+	simple_if_fini $h3 192.0.2.130/28 2001:db8:2::2/64
+}
+
+switch_create()
+{
+	# Bridge between H1 and H2.
+
+	ip link add name br1 type bridge vlan_filtering 1
+	ip link set dev br1 up
+
+	ip link set dev $swp1 master br1
+	ip link set dev $swp1 up
+
+	ip link set dev $swp2 master br1
+	ip link set dev $swp2 up
+
+	tc qdisc add dev $swp1 clsact
+
+	# Underlay.
+
+	simple_if_init $swp3 192.0.2.129/28 2001:db8:2::1/64
+
+	ip link add name ul type dummy
+	ip link set dev ul master v$swp3
+	ip link set dev ul up
+
+	# Overlay.
+
+	vrf_create vrf-ol
+	ip link set dev vrf-ol up
+
+	tunnel_create gt4 gretap 192.0.2.129 192.0.2.130 \
+		      ttl 100 tos inherit dev ul
+	ip link set dev gt4 master vrf-ol
+	ip link set dev gt4 up
+
+	tunnel_create gt6 ip6gretap 2001:db8:2::1 2001:db8:2::2 \
+		      ttl 100 tos inherit dev ul allow-localremote
+	ip link set dev gt6 master vrf-ol
+	ip link set dev gt6 up
+}
+
+switch_destroy()
+{
+	vrf_destroy vrf-ol
+
+	tunnel_destroy gt6
+	tunnel_destroy gt4
+
+	simple_if_fini $swp3 192.0.2.129/28 2001:db8:2::1/64
+
+	ip link del dev ul
+
+	tc qdisc del dev $swp1 clsact
+
+	ip link set dev $swp1 down
+	ip link set dev $swp2 down
+	ip link del dev br1
+}
+
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	swp2=${NETIFS[p3]}
+	h2=${NETIFS[p4]}
+
+	swp3=${NETIFS[p5]}
+	h3=${NETIFS[p6]}
+
+	vrf_prepare
+
+	h1_create
+	h2_create
+	h3_create
+
+	switch_create
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	switch_destroy
+
+	h3_destroy
+	h2_destroy
+	h1_destroy
+
+	vrf_cleanup
+}
+
+test_all()
+{
+	RET=0
+
+	slow_path_trap_install $swp1 ingress
+	slow_path_trap_install $swp1 egress
+
+	full_test_span_gre_dir gt4 ingress 8 0 "mirror to gretap w/ UL"
+	full_test_span_gre_dir gt6 ingress 8 0 "mirror to ip6gretap w/ UL"
+
+	full_test_span_gre_dir gt4 egress  0 8 "mirror to gretap w/ UL"
+	full_test_span_gre_dir gt6 egress  0 8 "mirror to ip6gretap w/ UL"
+
+	slow_path_trap_uninstall $swp1 egress
+	slow_path_trap_uninstall $swp1 ingress
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tcflags="skip_hw"
+test_all
+
+if ! tc_offload_check; then
+	echo "WARN: Could not test offloaded functionality"
+else
+	tcflags="skip_sw"
+	test_all
+fi
+
+exit $EXIT_STATUS
-- 
cgit v1.2.3-59-g8ed1b


From 16608bfe3907341c768ac7f53fa61b682f5744e7 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Fri, 27 Apr 2018 01:20:56 +0200
Subject: selftests: forwarding: Test flower mirror to gretap

Add a test for mirroring to a gretap and an ip6gretap netdevices such
that the mirroring action is triggered by a flower match.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../selftests/net/forwarding/mirror_gre_flower.sh  | 116 +++++++++++++++++++++
 1 file changed, 116 insertions(+)
 create mode 100755 tools/testing/selftests/net/forwarding/mirror_gre_flower.sh

diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_flower.sh b/tools/testing/selftests/net/forwarding/mirror_gre_flower.sh
new file mode 100755
index 000000000000..178a42d771aa
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_flower.sh
@@ -0,0 +1,116 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test uses standard topology for testing gretap. See
+# mirror_gre_topo_lib.sh for more details.
+#
+# This tests flower-triggered mirroring to gretap and ip6gretap netdevices. The
+# interfaces on H1 and H2 have two addresses each. Flower match on one of the
+# addresses is configured with mirror action. It is expected that when pinging
+# this address, mirroring takes place, whereas when pinging the other one,
+# there's no mirroring.
+
+NUM_NETIFS=6
+source lib.sh
+source mirror_lib.sh
+source mirror_gre_lib.sh
+source mirror_gre_topo_lib.sh
+
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	swp2=${NETIFS[p3]}
+	h2=${NETIFS[p4]}
+
+	swp3=${NETIFS[p5]}
+	h3=${NETIFS[p6]}
+
+	vrf_prepare
+	mirror_gre_topo_create
+
+	ip address add dev $swp3 192.0.2.129/28
+	ip address add dev $h3 192.0.2.130/28
+
+	ip address add dev $swp3 2001:db8:2::1/64
+	ip address add dev $h3 2001:db8:2::2/64
+
+	ip address add dev $h1 192.0.2.3/28
+	ip address add dev $h2 192.0.2.4/28
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	ip address del dev $h2 192.0.2.4/28
+	ip address del dev $h1 192.0.2.3/28
+
+	ip address del dev $h3 2001:db8:2::2/64
+	ip address del dev $swp3 2001:db8:2::1/64
+
+	ip address del dev $h3 192.0.2.130/28
+	ip address del dev $swp3 192.0.2.129/28
+
+	mirror_gre_topo_destroy
+	vrf_cleanup
+}
+
+test_span_gre_dir_acl()
+{
+	test_span_gre_dir_ips "$@" 192.0.2.3 192.0.2.4
+}
+
+full_test_span_gre_dir_acl()
+{
+	local tundev=$1; shift
+	local direction=$1; shift
+	local forward_type=$1; shift
+	local backward_type=$1; shift
+	local match_dip=$1; shift
+	local what=$1; shift
+
+	mirror_install $swp1 $direction $tundev \
+		       "protocol ip flower $tcflags dst_ip $match_dip"
+	fail_test_span_gre_dir $tundev $direction
+	test_span_gre_dir_acl "$tundev" "$direction" \
+			  "$forward_type" "$backward_type"
+	mirror_uninstall $swp1 $direction
+
+	log_test "$direction $what ($tcflags)"
+}
+
+test_all()
+{
+	RET=0
+
+	slow_path_trap_install $swp1 ingress
+	slow_path_trap_install $swp1 egress
+
+	full_test_span_gre_dir_acl gt4 ingress 8 0 192.0.2.4 "ACL mirror to gretap"
+	full_test_span_gre_dir_acl gt6 ingress 8 0 192.0.2.4 "ACL mirror to ip6gretap"
+
+	full_test_span_gre_dir_acl gt4 egress 0 8 192.0.2.3 "ACL mirror to gretap"
+	full_test_span_gre_dir_acl gt6 egress 0 8 192.0.2.3 "ACL mirror to ip6gretap"
+
+	slow_path_trap_uninstall $swp1 egress
+	slow_path_trap_uninstall $swp1 ingress
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tcflags="skip_hw"
+test_all
+
+if ! tc_offload_check; then
+	echo "WARN: Could not test offloaded functionality"
+else
+	tcflags="skip_sw"
+	test_all
+fi
+
+exit $EXIT_STATUS
-- 
cgit v1.2.3-59-g8ed1b


From ff68e6fb048ea5bb3ef8baef256e891e8d0299ee Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Fri, 27 Apr 2018 01:21:23 +0200
Subject: selftests: forwarding: Test neighbor updates when mirroring to gretap

Test that when a mirror to gretap or ip6gretap netdevice is configured,
changes to neighbors are reflected.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../selftests/net/forwarding/mirror_gre_neigh.sh   | 101 +++++++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100755 tools/testing/selftests/net/forwarding/mirror_gre_neigh.sh

diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_neigh.sh b/tools/testing/selftests/net/forwarding/mirror_gre_neigh.sh
new file mode 100755
index 000000000000..1ca29ba4f338
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_neigh.sh
@@ -0,0 +1,101 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test uses standard topology for testing gretap. See
+# mirror_gre_topo_lib.sh for more details.
+#
+# Test for mirroring to gretap and ip6gretap, such that the neighbor entry for
+# the tunnel remote address has invalid address at the time that the mirroring
+# is set up. Later on, the neighbor is deleted and it is expected to be
+# reinitialized using the usual ARP process, and the mirroring offload updated.
+
+NUM_NETIFS=6
+source lib.sh
+source mirror_lib.sh
+source mirror_gre_lib.sh
+source mirror_gre_topo_lib.sh
+
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	swp2=${NETIFS[p3]}
+	h2=${NETIFS[p4]}
+
+	swp3=${NETIFS[p5]}
+	h3=${NETIFS[p6]}
+
+	vrf_prepare
+	mirror_gre_topo_create
+
+	ip address add dev $swp3 192.0.2.129/28
+	ip address add dev $h3 192.0.2.130/28
+
+	ip address add dev $swp3 2001:db8:2::1/64
+	ip address add dev $h3 2001:db8:2::2/64
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	ip address del dev $h3 2001:db8:2::2/64
+	ip address del dev $swp3 2001:db8:2::1/64
+
+	ip address del dev $h3 192.0.2.130/28
+	ip address del dev $swp3 192.0.2.129/28
+
+	mirror_gre_topo_destroy
+	vrf_cleanup
+}
+
+test_span_gre_neigh()
+{
+	local addr=$1; shift
+	local tundev=$1; shift
+	local direction=$1; shift
+	local what=$1; shift
+
+	RET=0
+
+	ip neigh replace dev $swp3 $addr lladdr 00:11:22:33:44:55
+	mirror_install $swp1 $direction $tundev "matchall $tcflags"
+	fail_test_span_gre_dir $tundev ingress
+	ip neigh del dev $swp3 $addr
+	quick_test_span_gre_dir $tundev ingress
+	mirror_uninstall $swp1 $direction
+
+	log_test "$direction $what: neighbor change ($tcflags)"
+}
+
+test_all()
+{
+	slow_path_trap_install $swp1 ingress
+	slow_path_trap_install $swp1 egress
+
+	test_span_gre_neigh 192.0.2.130 gt4 ingress "mirror to gretap"
+	test_span_gre_neigh 192.0.2.130 gt4 egress "mirror to gretap"
+	test_span_gre_neigh 2001:db8:2::2 gt6 ingress "mirror to ip6gretap"
+	test_span_gre_neigh 2001:db8:2::2 gt6 egress "mirror to ip6gretap"
+
+	slow_path_trap_uninstall $swp1 egress
+	slow_path_trap_uninstall $swp1 ingress
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tcflags="skip_hw"
+test_all
+
+if ! tc_offload_check; then
+	echo "WARN: Could not test offloaded functionality"
+else
+	tcflags="skip_sw"
+	test_all
+fi
+
+exit $EXIT_STATUS
-- 
cgit v1.2.3-59-g8ed1b


From 45315673e0c5656498e8bdeec6d6db170cabc4d3 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Fri, 27 Apr 2018 01:22:25 +0200
Subject: selftests: forwarding: Test changes in mirror-to-gretap

These tests set up mirroring in a situation that the configuration is
incorrect, i.e. mirrored packets, if any, are not supposed to reach
destination tunnel device. Then the configuration is rectified and
mirroring is checked to have started working.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../selftests/net/forwarding/mirror_gre_changes.sh | 194 +++++++++++++++++++++
 1 file changed, 194 insertions(+)
 create mode 100755 tools/testing/selftests/net/forwarding/mirror_gre_changes.sh

diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh b/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh
new file mode 100755
index 000000000000..0ed288ac76d2
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh
@@ -0,0 +1,194 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test uses standard topology for testing gretap. See
+# mirror_gre_topo_lib.sh for more details.
+#
+# Test how mirrors to gretap and ip6gretap react to changes to relevant
+# configuration.
+
+NUM_NETIFS=6
+source lib.sh
+source mirror_lib.sh
+source mirror_gre_lib.sh
+source mirror_gre_topo_lib.sh
+
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	swp2=${NETIFS[p3]}
+	h2=${NETIFS[p4]}
+
+	swp3=${NETIFS[p5]}
+	h3=${NETIFS[p6]}
+
+	vrf_prepare
+	mirror_gre_topo_create
+
+	# This test downs $swp3, which deletes the configured IPv6 address
+	# unless this sysctl is set.
+	local key=net.ipv6.conf.$swp3.keep_addr_on_down
+	SWP3_KEEP_ADDR_ON_DOWN=$(sysctl -n $key)
+	sysctl -qw $key=1
+
+	ip address add dev $swp3 192.0.2.129/28
+	ip address add dev $h3 192.0.2.130/28
+
+	ip address add dev $swp3 2001:db8:2::1/64
+	ip address add dev $h3 2001:db8:2::2/64
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	ip address del dev $h3 2001:db8:2::2/64
+	ip address del dev $swp3 2001:db8:2::1/64
+
+	ip address del dev $h3 192.0.2.130/28
+	ip address del dev $swp3 192.0.2.129/28
+
+	local key=net.ipv6.conf.$swp3.keep_addr_on_down
+	sysctl -qw $key=$SWP3_KEEP_ADDR_ON_DOWN
+
+	mirror_gre_topo_destroy
+	vrf_cleanup
+}
+
+test_span_gre_ttl()
+{
+	local tundev=$1; shift
+	local type=$1; shift
+	local prot=$1; shift
+	local what=$1; shift
+
+	RET=0
+
+	mirror_install $swp1 ingress $tundev "matchall $tcflags"
+	tc qdisc add dev $h3 clsact
+	tc filter add dev $h3 ingress pref 77 prot $prot \
+		flower ip_ttl 50 action pass
+
+	mirror_test v$h1 192.0.2.1 192.0.2.2 $h3 77 0
+
+	ip link set dev $tundev type $type ttl 50
+	mirror_test v$h1 192.0.2.1 192.0.2.2 $h3 77 10
+
+	ip link set dev $tundev type $type ttl 100
+	tc filter del dev $h3 ingress pref 77
+	tc qdisc del dev $h3 clsact
+	mirror_uninstall $swp1 ingress
+
+	log_test "$what: TTL change ($tcflags)"
+}
+
+test_span_gre_tun_up()
+{
+	local tundev=$1; shift
+	local what=$1; shift
+
+	RET=0
+
+	ip link set dev $tundev down
+	mirror_install $swp1 ingress $tundev "matchall $tcflags"
+	fail_test_span_gre_dir $tundev ingress
+
+	ip link set dev $tundev up
+
+	quick_test_span_gre_dir $tundev ingress
+	mirror_uninstall $swp1 ingress
+
+	log_test "$what: tunnel down/up ($tcflags)"
+}
+
+test_span_gre_egress_up()
+{
+	local tundev=$1; shift
+	local remote_ip=$1; shift
+	local what=$1; shift
+
+	RET=0
+
+	ip link set dev $swp3 down
+	mirror_install $swp1 ingress $tundev "matchall $tcflags"
+	fail_test_span_gre_dir $tundev ingress
+
+	# After setting the device up, wait for neighbor to get resolved so that
+	# we can expect mirroring to work.
+	ip link set dev $swp3 up
+	while true; do
+		ip neigh sh dev $swp3 $remote_ip nud reachable |
+		    grep -q ^
+		if [[ $? -ne 0 ]]; then
+			sleep 1
+		else
+			break
+		fi
+	done
+
+	quick_test_span_gre_dir $tundev ingress
+	mirror_uninstall $swp1 ingress
+
+	log_test "$what: egress down/up ($tcflags)"
+}
+
+test_span_gre_remote_ip()
+{
+	local tundev=$1; shift
+	local type=$1; shift
+	local correct_ip=$1; shift
+	local wrong_ip=$1; shift
+	local what=$1; shift
+
+	RET=0
+
+	ip link set dev $tundev type $type remote $wrong_ip
+	mirror_install $swp1 ingress $tundev "matchall $tcflags"
+	fail_test_span_gre_dir $tundev ingress
+
+	ip link set dev $tundev type $type remote $correct_ip
+	quick_test_span_gre_dir $tundev ingress
+	mirror_uninstall $swp1 ingress
+
+	log_test "$what: remote address change ($tcflags)"
+}
+
+test_all()
+{
+	slow_path_trap_install $swp1 ingress
+	slow_path_trap_install $swp1 egress
+
+	test_span_gre_ttl gt4 gretap ip "mirror to gretap"
+	test_span_gre_ttl gt6 ip6gretap ipv6 "mirror to ip6gretap"
+
+	test_span_gre_tun_up gt4 "mirror to gretap"
+	test_span_gre_tun_up gt6 "mirror to ip6gretap"
+
+	test_span_gre_egress_up gt4 192.0.2.130 "mirror to gretap"
+	test_span_gre_egress_up gt6 2001:db8:2::2 "mirror to ip6gretap"
+
+	test_span_gre_remote_ip gt4 gretap 192.0.2.130 192.0.2.132 "mirror to gretap"
+	test_span_gre_remote_ip gt6 ip6gretap 2001:db8:2::2 2001:db8:2::4 "mirror to ip6gretap"
+
+	slow_path_trap_uninstall $swp1 egress
+	slow_path_trap_uninstall $swp1 ingress
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tcflags="skip_hw"
+test_all
+
+if ! tc_offload_check; then
+	echo "WARN: Could not test offloaded functionality"
+else
+	tcflags="skip_sw"
+	test_all
+fi
+
+exit $EXIT_STATUS
-- 
cgit v1.2.3-59-g8ed1b


From 28fb4e59a47d7f1f0c7a26d2ed3a671c26158536 Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Thu, 26 Apr 2018 22:28:49 -0700
Subject: net: qrtr: Expose tunneling endpoint to user space

This implements a misc character device named "qrtr-tun" for the purpose
of allowing user space applications to implement endpoints in the qrtr
network.

This allows more advanced (and dynamic) testing of the qrtr code as well
as opens up the ability of tunneling qrtr over a network or USB link.

Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/qrtr/Kconfig  |   7 +++
 net/qrtr/Makefile |   2 +
 net/qrtr/tun.c    | 161 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 170 insertions(+)
 create mode 100644 net/qrtr/tun.c

diff --git a/net/qrtr/Kconfig b/net/qrtr/Kconfig
index 326fd97444f5..1944834d225c 100644
--- a/net/qrtr/Kconfig
+++ b/net/qrtr/Kconfig
@@ -21,4 +21,11 @@ config QRTR_SMD
 	  Say Y here to support SMD based ipcrouter channels.  SMD is the
 	  most common transport for IPC Router.
 
+config QRTR_TUN
+	tristate "TUN device for Qualcomm IPC Router"
+	---help---
+	  Say Y here to expose a character device that allows user space to
+	  implement endpoints of QRTR, for purpose of tunneling data to other
+	  hosts or testing purposes.
+
 endif # QRTR
diff --git a/net/qrtr/Makefile b/net/qrtr/Makefile
index ab09e40f7c74..be012bfd3e52 100644
--- a/net/qrtr/Makefile
+++ b/net/qrtr/Makefile
@@ -2,3 +2,5 @@ obj-$(CONFIG_QRTR) := qrtr.o
 
 obj-$(CONFIG_QRTR_SMD) += qrtr-smd.o
 qrtr-smd-y	:= smd.o
+obj-$(CONFIG_QRTR_TUN) += qrtr-tun.o
+qrtr-tun-y	:= tun.o
diff --git a/net/qrtr/tun.c b/net/qrtr/tun.c
new file mode 100644
index 000000000000..ccff1e544c21
--- /dev/null
+++ b/net/qrtr/tun.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Linaro Ltd */
+
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/poll.h>
+#include <linux/skbuff.h>
+#include <linux/uaccess.h>
+
+#include "qrtr.h"
+
+struct qrtr_tun {
+	struct qrtr_endpoint ep;
+
+	struct sk_buff_head queue;
+	wait_queue_head_t readq;
+};
+
+static int qrtr_tun_send(struct qrtr_endpoint *ep, struct sk_buff *skb)
+{
+	struct qrtr_tun *tun = container_of(ep, struct qrtr_tun, ep);
+
+	skb_queue_tail(&tun->queue, skb);
+
+	/* wake up any blocking processes, waiting for new data */
+	wake_up_interruptible(&tun->readq);
+
+	return 0;
+}
+
+static int qrtr_tun_open(struct inode *inode, struct file *filp)
+{
+	struct qrtr_tun *tun;
+
+	tun = kzalloc(sizeof(*tun), GFP_KERNEL);
+	if (!tun)
+		return -ENOMEM;
+
+	skb_queue_head_init(&tun->queue);
+	init_waitqueue_head(&tun->readq);
+
+	tun->ep.xmit = qrtr_tun_send;
+
+	filp->private_data = tun;
+
+	return qrtr_endpoint_register(&tun->ep, QRTR_EP_NID_AUTO);
+}
+
+static ssize_t qrtr_tun_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct file *filp = iocb->ki_filp;
+	struct qrtr_tun *tun = filp->private_data;
+	struct sk_buff *skb;
+	int count;
+
+	while (!(skb = skb_dequeue(&tun->queue))) {
+		if (filp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		/* Wait until we get data or the endpoint goes away */
+		if (wait_event_interruptible(tun->readq,
+					     !skb_queue_empty(&tun->queue)))
+			return -ERESTARTSYS;
+	}
+
+	count = min_t(size_t, iov_iter_count(to), skb->len);
+	if (copy_to_iter(skb->data, count, to) != count)
+		count = -EFAULT;
+
+	kfree_skb(skb);
+
+	return count;
+}
+
+static ssize_t qrtr_tun_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *filp = iocb->ki_filp;
+	struct qrtr_tun *tun = filp->private_data;
+	size_t len = iov_iter_count(from);
+	ssize_t ret;
+	void *kbuf;
+
+	kbuf = kzalloc(len, GFP_KERNEL);
+	if (!kbuf)
+		return -ENOMEM;
+
+	if (!copy_from_iter_full(kbuf, len, from))
+		return -EFAULT;
+
+	ret = qrtr_endpoint_post(&tun->ep, kbuf, len);
+
+	return ret < 0 ? ret : len;
+}
+
+static __poll_t qrtr_tun_poll(struct file *filp, poll_table *wait)
+{
+	struct qrtr_tun *tun = filp->private_data;
+	__poll_t mask = 0;
+
+	poll_wait(filp, &tun->readq, wait);
+
+	if (!skb_queue_empty(&tun->queue))
+		mask |= EPOLLIN | EPOLLRDNORM;
+
+	return mask;
+}
+
+static int qrtr_tun_release(struct inode *inode, struct file *filp)
+{
+	struct qrtr_tun *tun = filp->private_data;
+	struct sk_buff *skb;
+
+	qrtr_endpoint_unregister(&tun->ep);
+
+	/* Discard all SKBs */
+	while (!skb_queue_empty(&tun->queue)) {
+		skb = skb_dequeue(&tun->queue);
+		kfree_skb(skb);
+	}
+
+	kfree(tun);
+
+	return 0;
+}
+
+static const struct file_operations qrtr_tun_ops = {
+	.owner = THIS_MODULE,
+	.open = qrtr_tun_open,
+	.poll = qrtr_tun_poll,
+	.read_iter = qrtr_tun_read_iter,
+	.write_iter = qrtr_tun_write_iter,
+	.release = qrtr_tun_release,
+};
+
+static struct miscdevice qrtr_tun_miscdev = {
+	MISC_DYNAMIC_MINOR,
+	"qrtr-tun",
+	&qrtr_tun_ops,
+};
+
+static int __init qrtr_tun_init(void)
+{
+	int ret;
+
+	ret = misc_register(&qrtr_tun_miscdev);
+	if (ret)
+		pr_err("failed to register Qualcomm IPC Router tun device\n");
+
+	return ret;
+}
+
+static void __exit qrtr_tun_exit(void)
+{
+	misc_deregister(&qrtr_tun_miscdev);
+}
+
+module_init(qrtr_tun_init);
+module_exit(qrtr_tun_exit);
+
+MODULE_DESCRIPTION("Qualcomm IPC Router TUN device");
+MODULE_LICENSE("GPL v2");
-- 
cgit v1.2.3-59-g8ed1b


From 80e95f472fade809daf8110861a5553fba98cfe5 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Fri, 27 Apr 2018 15:36:18 +0800
Subject: ptp_pch: use helpers function for converting between ns and timespec

use ns_to_timespec64() and timespec64_to_ns() instead of open coding

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/ptp_pch.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/ptp/ptp_pch.c b/drivers/ptp/ptp_pch.c
index b3285175f20f..78ccf936d356 100644
--- a/drivers/ptp/ptp_pch.c
+++ b/drivers/ptp/ptp_pch.c
@@ -452,7 +452,6 @@ static int ptp_pch_adjtime(struct ptp_clock_info *ptp, s64 delta)
 static int ptp_pch_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts)
 {
 	u64 ns;
-	u32 remainder;
 	unsigned long flags;
 	struct pch_dev *pch_dev = container_of(ptp, struct pch_dev, caps);
 	struct pch_ts_regs __iomem *regs = pch_dev->regs;
@@ -461,8 +460,7 @@ static int ptp_pch_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts)
 	ns = pch_systime_read(regs);
 	spin_unlock_irqrestore(&pch_dev->register_lock, flags);
 
-	ts->tv_sec = div_u64_rem(ns, 1000000000, &remainder);
-	ts->tv_nsec = remainder;
+	*ts = ns_to_timespec64(ns);
 	return 0;
 }
 
@@ -474,8 +472,7 @@ static int ptp_pch_settime(struct ptp_clock_info *ptp,
 	struct pch_dev *pch_dev = container_of(ptp, struct pch_dev, caps);
 	struct pch_ts_regs __iomem *regs = pch_dev->regs;
 
-	ns = ts->tv_sec * 1000000000ULL;
-	ns += ts->tv_nsec;
+	ns = timespec64_to_ns(ts);
 
 	spin_lock_irqsave(&pch_dev->register_lock, flags);
 	pch_systime_write(regs, ns);
-- 
cgit v1.2.3-59-g8ed1b


From 6c3442f5f85827e40e314a74a24b1428e78748c8 Mon Sep 17 00:00:00 2001
From: Jisheng Zhang <Jisheng.Zhang@synaptics.com>
Date: Fri, 27 Apr 2018 16:18:58 +0800
Subject: drivers: net: replace UINT64_MAX with U64_MAX

U64_MAX is well defined now while the UINT64_MAX is not, so we fall
back to drivers' own definition as below:

	#ifndef UINT64_MAX
	#define UINT64_MAX             (u64)(~((u64)0))
	#endif

I believe this is in one phy driver then copied and pasted to other phy
drivers.

Replace the UINT64_MAX with U64_MAX to clean up the source code.

Signed-off-by: Jisheng Zhang <Jisheng.Zhang@synaptics.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/chip.c | 6 +++---
 drivers/net/dsa/mv88e6xxx/chip.h | 4 ----
 drivers/net/phy/bcm-phy-lib.c    | 6 +-----
 drivers/net/phy/marvell.c        | 5 +----
 drivers/net/phy/micrel.c         | 5 +----
 drivers/net/phy/smsc.c           | 5 +----
 6 files changed, 7 insertions(+), 24 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 8f92ccc0dd54..a13a7005ffa0 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -665,13 +665,13 @@ static uint64_t _mv88e6xxx_get_ethtool_stat(struct mv88e6xxx_chip *chip,
 	case STATS_TYPE_PORT:
 		err = mv88e6xxx_port_read(chip, port, s->reg, &reg);
 		if (err)
-			return UINT64_MAX;
+			return U64_MAX;
 
 		low = reg;
 		if (s->size == 4) {
 			err = mv88e6xxx_port_read(chip, port, s->reg + 1, &reg);
 			if (err)
-				return UINT64_MAX;
+				return U64_MAX;
 			high = reg;
 		}
 		break;
@@ -685,7 +685,7 @@ static uint64_t _mv88e6xxx_get_ethtool_stat(struct mv88e6xxx_chip *chip,
 			mv88e6xxx_g1_stats_read(chip, reg + 1, &high);
 		break;
 	default:
-		return UINT64_MAX;
+		return U64_MAX;
 	}
 	value = (((u64)high) << 16) | low;
 	return value;
diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h
index 80490f66bc06..4163c8099d0b 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.h
+++ b/drivers/net/dsa/mv88e6xxx/chip.h
@@ -21,10 +21,6 @@
 #include <linux/timecounter.h>
 #include <net/dsa.h>
 
-#ifndef UINT64_MAX
-#define UINT64_MAX		(u64)(~((u64)0))
-#endif
-
 #define SMI_CMD			0x00
 #define SMI_CMD_BUSY		BIT(15)
 #define SMI_CMD_CLAUSE_22	BIT(12)
diff --git a/drivers/net/phy/bcm-phy-lib.c b/drivers/net/phy/bcm-phy-lib.c
index 5ad130c3da43..0876aec7328c 100644
--- a/drivers/net/phy/bcm-phy-lib.c
+++ b/drivers/net/phy/bcm-phy-lib.c
@@ -346,10 +346,6 @@ void bcm_phy_get_strings(struct phy_device *phydev, u8 *data)
 }
 EXPORT_SYMBOL_GPL(bcm_phy_get_strings);
 
-#ifndef UINT64_MAX
-#define UINT64_MAX              (u64)(~((u64)0))
-#endif
-
 /* Caller is supposed to provide appropriate storage for the library code to
  * access the shadow copy
  */
@@ -362,7 +358,7 @@ static u64 bcm_phy_get_stat(struct phy_device *phydev, u64 *shadow,
 
 	val = phy_read(phydev, stat.reg);
 	if (val < 0) {
-		ret = UINT64_MAX;
+		ret = U64_MAX;
 	} else {
 		val >>= stat.shift;
 		val = val & ((1 << stat.bits) - 1);
diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index 25e2a099b71c..b8f57e9b9379 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -1482,9 +1482,6 @@ static void marvell_get_strings(struct phy_device *phydev, u8 *data)
 	}
 }
 
-#ifndef UINT64_MAX
-#define UINT64_MAX		(u64)(~((u64)0))
-#endif
 static u64 marvell_get_stat(struct phy_device *phydev, int i)
 {
 	struct marvell_hw_stat stat = marvell_hw_stats[i];
@@ -1494,7 +1491,7 @@ static u64 marvell_get_stat(struct phy_device *phydev, int i)
 
 	val = phy_read_paged(phydev, stat.page, stat.reg);
 	if (val < 0) {
-		ret = UINT64_MAX;
+		ret = U64_MAX;
 	} else {
 		val = val & ((1 << stat.bits) - 1);
 		priv->stats[i] += val;
diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index f41b224a9cdb..de31c5170a5b 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -650,9 +650,6 @@ static void kszphy_get_strings(struct phy_device *phydev, u8 *data)
 	}
 }
 
-#ifndef UINT64_MAX
-#define UINT64_MAX              (u64)(~((u64)0))
-#endif
 static u64 kszphy_get_stat(struct phy_device *phydev, int i)
 {
 	struct kszphy_hw_stat stat = kszphy_hw_stats[i];
@@ -662,7 +659,7 @@ static u64 kszphy_get_stat(struct phy_device *phydev, int i)
 
 	val = phy_read(phydev, stat.reg);
 	if (val < 0) {
-		ret = UINT64_MAX;
+		ret = U64_MAX;
 	} else {
 		val = val & ((1 << stat.bits) - 1);
 		priv->stats[i] += val;
diff --git a/drivers/net/phy/smsc.c b/drivers/net/phy/smsc.c
index be399d645224..c328208388da 100644
--- a/drivers/net/phy/smsc.c
+++ b/drivers/net/phy/smsc.c
@@ -168,9 +168,6 @@ static void smsc_get_strings(struct phy_device *phydev, u8 *data)
 	}
 }
 
-#ifndef UINT64_MAX
-#define UINT64_MAX              (u64)(~((u64)0))
-#endif
 static u64 smsc_get_stat(struct phy_device *phydev, int i)
 {
 	struct smsc_hw_stat stat = smsc_hw_stats[i];
@@ -179,7 +176,7 @@ static u64 smsc_get_stat(struct phy_device *phydev, int i)
 
 	val = phy_read(phydev, stat.reg);
 	if (val < 0)
-		ret = UINT64_MAX;
+		ret = U64_MAX;
 	else
 		ret = val;
 
-- 
cgit v1.2.3-59-g8ed1b


From a6dc6670cd7e0bba580026443cbf77fdd370c791 Mon Sep 17 00:00:00 2001
From: Ahmed Abdelsalam <amsalam20@gmail.com>
Date: Fri, 27 Apr 2018 17:51:48 +0200
Subject: ipv6: sr: Add documentation for seg_flowlabel sysctl

This patch adds a documentation for seg_flowlabel sysctl into
Documentation/networking/ip-sysctl.txt

Signed-off-by: Ahmed Abdelsalam <amsalam20@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index b583a73cf95f..b2f463e0cb33 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1428,6 +1428,19 @@ ip6frag_low_thresh - INTEGER
 ip6frag_time - INTEGER
 	Time in seconds to keep an IPv6 fragment in memory.
 
+IPv6 Segment Routing:
+
+seg6_flowlabel - INTEGER
+	Controls the behaviour of computing the flowlabel of outer
+	IPv6 header in case of SR T.encaps
+
+	-1 set flowlabel to zero.
+	0 copy flowlabel from Inner packet in case of Inner IPv6
+		(Set flowlabel to 0 in case IPv4/L2)
+	1 Compute the flowlabel using seg6_make_flowlabel()
+
+	Default is 0.
+
 conf/default/*:
 	Change the interface-specific default settings.
 
-- 
cgit v1.2.3-59-g8ed1b


From af201bab50a89aa6cf4df952b2c3bf55895c8eee Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Fri, 27 Apr 2018 11:12:10 -0400
Subject: udp: remove stray export symbol

UDP GSO needs to export __udp_gso_segment to call it from ipv6.

I accidentally exported static ipv4 function __udp4_gso_segment.
Remove that EXPORT_SYMBOL_GPL.

Fixes: ee80d1ebe5ba ("udp: add udp gso")
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp_offload.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index dc5158cba66e..f78fb3673472 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -247,7 +247,6 @@ static struct sk_buff *__udp4_gso_segment(struct sk_buff *gso_skb,
 				 udp_v4_check(sizeof(struct udphdr) + mss,
 					      iph->saddr, iph->daddr, 0));
 }
-EXPORT_SYMBOL_GPL(__udp4_gso_segment);
 
 static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
 					 netdev_features_t features)
-- 
cgit v1.2.3-59-g8ed1b


From 9e8d438e8ba43a38def2890a65a26917f2233a83 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Fri, 27 Apr 2018 12:41:49 -0700
Subject: net: phy: Fix modular PHYLIB build

After commit c59530d0d5dc ("net: Move PHY statistics code into PHY
library helpers") we made net/core/ethtool.c reference symbols which are
part of the library which can be modular. David introduced a temporary
fix with 1ecd6e8ad996 ("phy: Temporary build fix after phylib changes.")
which would prevent such modularity.

This is not desireable of course, so instead, just inline the functions
into include/linux/phy.h to keep both options available.

Fixes: c59530d0d5dc ("net: Move PHY statistics code into PHY library helpers")
Fixes: 1ecd6e8ad996 ("phy: Temporary build fix after phylib changes.")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/Kconfig |  3 ++-
 drivers/net/phy/phy.c   | 48 -----------------------------------------------
 include/linux/phy.h     | 50 +++++++++++++++++++++++++++++++++++++------------
 3 files changed, 40 insertions(+), 61 deletions(-)

diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index 7c5e8c1e9370..edb8b9ab827f 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -9,6 +9,7 @@ menuconfig MDIO_DEVICE
 
 config MDIO_BUS
 	tristate
+	default m if PHYLIB=m
 	default MDIO_DEVICE
 	help
 	  This internal symbol is used for link time dependencies and it
@@ -170,7 +171,7 @@ config PHYLINK
 	  autonegotiation modes.
 
 menuconfig PHYLIB
-	bool "PHY Device support and infrastructure"
+	tristate "PHY Device support and infrastructure"
 	depends on NETDEVICES
 	select MDIO_DEVICE
 	help
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index a98ed12c0009..05c1e8ef15e6 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -1277,51 +1277,3 @@ int phy_ethtool_nway_reset(struct net_device *ndev)
 	return phy_restart_aneg(phydev);
 }
 EXPORT_SYMBOL(phy_ethtool_nway_reset);
-
-int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data)
-{
-	if (!phydev->drv)
-		return -EIO;
-
-	mutex_lock(&phydev->lock);
-	phydev->drv->get_strings(phydev, data);
-	mutex_unlock(&phydev->lock);
-
-	return 0;
-}
-EXPORT_SYMBOL(phy_ethtool_get_strings);
-
-int phy_ethtool_get_sset_count(struct phy_device *phydev)
-{
-	int ret;
-
-	if (!phydev->drv)
-		return -EIO;
-
-	if (phydev->drv->get_sset_count &&
-	    phydev->drv->get_strings &&
-	    phydev->drv->get_stats) {
-		mutex_lock(&phydev->lock);
-		ret = phydev->drv->get_sset_count(phydev);
-		mutex_unlock(&phydev->lock);
-
-		return ret;
-	}
-
-	return -EOPNOTSUPP;
-}
-EXPORT_SYMBOL(phy_ethtool_get_sset_count);
-
-int phy_ethtool_get_stats(struct phy_device *phydev,
-			  struct ethtool_stats *stats, u64 *data)
-{
-	if (!phydev->drv)
-		return -EIO;
-
-	mutex_lock(&phydev->lock);
-	phydev->drv->get_stats(phydev, stats, data);
-	mutex_unlock(&phydev->lock);
-
-	return 0;
-}
-EXPORT_SYMBOL(phy_ethtool_get_stats);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 6ca81395c545..073235e70442 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1066,27 +1066,53 @@ int phy_ethtool_nway_reset(struct net_device *ndev);
 #if IS_ENABLED(CONFIG_PHYLIB)
 int __init mdio_bus_init(void);
 void mdio_bus_exit(void);
-int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data);
-int phy_ethtool_get_sset_count(struct phy_device *phydev);
-int phy_ethtool_get_stats(struct phy_device *phydev,
-			  struct ethtool_stats *stats, u64 *data);
-#else
-int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data)
+#endif
+
+/* Inline function for use within net/core/ethtool.c (built-in) */
+static inline int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data)
 {
-	return -EOPNOTSUPP;
+	if (!phydev->drv)
+		return -EIO;
+
+	mutex_lock(&phydev->lock);
+	phydev->drv->get_strings(phydev, data);
+	mutex_unlock(&phydev->lock);
+
+	return 0;
 }
 
-int phy_ethtool_get_sset_count(struct phy_device *phydev)
+static inline int phy_ethtool_get_sset_count(struct phy_device *phydev)
 {
+	int ret;
+
+	if (!phydev->drv)
+		return -EIO;
+
+	if (phydev->drv->get_sset_count &&
+	    phydev->drv->get_strings &&
+	    phydev->drv->get_stats) {
+		mutex_lock(&phydev->lock);
+		ret = phydev->drv->get_sset_count(phydev);
+		mutex_unlock(&phydev->lock);
+
+		return ret;
+	}
+
 	return -EOPNOTSUPP;
 }
 
-int phy_ethtool_get_stats(struct phy_device *phydev,
-			  struct ethtool_stats *stats, u64 *data)
+static inline int phy_ethtool_get_stats(struct phy_device *phydev,
+					struct ethtool_stats *stats, u64 *data)
 {
-	return -EOPNOTSUPP;
+	if (!phydev->drv)
+		return -EIO;
+
+	mutex_lock(&phydev->lock);
+	phydev->drv->get_stats(phydev, stats, data);
+	mutex_unlock(&phydev->lock);
+
+	return 0;
 }
-#endif
 
 extern struct bus_type mdio_bus_type;
 
-- 
cgit v1.2.3-59-g8ed1b


From 5f4126327494c189767ca64b222abadb07c55e3d Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sat, 28 Apr 2018 22:28:07 -0700
Subject: bpf: change prototype for stack_map_get_build_id_offset

This patch didn't incur functionality change. The function prototype
got changed so that the same function can be reused later.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/stackmap.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 57eeb1234b67..04f6ec1679f0 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -262,16 +262,11 @@ out:
 	return ret;
 }
 
-static void stack_map_get_build_id_offset(struct bpf_map *map,
-					  struct stack_map_bucket *bucket,
+static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 					  u64 *ips, u32 trace_nr, bool user)
 {
 	int i;
 	struct vm_area_struct *vma;
-	struct bpf_stack_build_id *id_offs;
-
-	bucket->nr = trace_nr;
-	id_offs = (struct bpf_stack_build_id *)bucket->data;
 
 	/*
 	 * We cannot do up_read() in nmi context, so build_id lookup is
@@ -361,8 +356,10 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 			pcpu_freelist_pop(&smap->freelist);
 		if (unlikely(!new_bucket))
 			return -ENOMEM;
-		stack_map_get_build_id_offset(map, new_bucket, ips,
-					      trace_nr, user);
+		new_bucket->nr = trace_nr;
+		stack_map_get_build_id_offset(
+			(struct bpf_stack_build_id *)new_bucket->data,
+			ips, trace_nr, user);
 		trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
 		if (hash_matches && bucket->nr == trace_nr &&
 		    memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
-- 
cgit v1.2.3-59-g8ed1b


From c195651e565ae7f41a68acb7d4aa7390ad215de1 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sat, 28 Apr 2018 22:28:08 -0700
Subject: bpf: add bpf_get_stack helper

Currently, stackmap and bpf_get_stackid helper are provided
for bpf program to get the stack trace. This approach has
a limitation though. If two stack traces have the same hash,
only one will get stored in the stackmap table,
so some stack traces are missing from user perspective.

This patch implements a new helper, bpf_get_stack, will
send stack traces directly to bpf program. The bpf program
is able to see all stack traces, and then can do in-kernel
processing or send stack traces to user space through
shared map or bpf_perf_event_output.

Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h      |  1 +
 include/linux/filter.h   |  3 ++-
 include/uapi/linux/bpf.h | 42 ++++++++++++++++++++++++++++--
 kernel/bpf/core.c        |  5 ++++
 kernel/bpf/stackmap.c    | 67 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c    | 19 ++++++++++++++
 kernel/trace/bpf_trace.c | 50 +++++++++++++++++++++++++++++++++++-
 7 files changed, 183 insertions(+), 4 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 38ebbc61ed99..c553f6f9c6b0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -692,6 +692,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
+extern const struct bpf_func_proto bpf_get_stack_proto;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;
 
 /* Shared helpers among cBPF and eBPF. */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 4da8b2308174..64899c04c1a6 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -468,7 +468,8 @@ struct bpf_prog {
 				dst_needed:1,	/* Do we need dst entry? */
 				blinded:1,	/* Was blinded */
 				is_func:1,	/* program is a bpf function */
-				kprobe_override:1; /* Do we override a kprobe? */
+				kprobe_override:1, /* Do we override a kprobe? */
+				has_callchain_buf:1; /* callchain buffer allocated? */
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	enum bpf_attach_type	expected_attach_type; /* For some prog types */
 	u32			len;		/* Number of filter blocks */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index da77a9388947..1afb606a18b9 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1767,6 +1767,40 @@ union bpf_attr {
  * 		**CONFIG_XFRM** configuration option.
  * 	Return
  * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags)
+ * 	Description
+ *		Return a user or a kernel stack in bpf program provided buffer.
+ *		To achieve this, the helper needs *ctx*, which is a pointer
+ *		to the context on which the tracing program is executed.
+ *		To store the stacktrace, the bpf program provides *buf* with
+ *		a nonnegative *size*.
+ *
+ *		The last argument, *flags*, holds the number of stack frames to
+ *		skip (from 0 to 255), masked with
+ *		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ *		the following flags:
+ *
+ *		**BPF_F_USER_STACK**
+ *			Collect a user space stack instead of a kernel stack.
+ *		**BPF_F_USER_BUILD_ID**
+ *			Collect buildid+offset instead of ips for user stack,
+ *			only valid if **BPF_F_USER_STACK** is also specified.
+ *
+ *		**bpf_get_stack**\ () can collect up to
+ *		**PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
+ *		to sufficient large buffer size. Note that
+ *		this limit can be controlled with the **sysctl** program, and
+ *		that it should be manually increased in order to profile long
+ *		user stacks (such as stacks for Java programs). To do so, use:
+ *
+ *	::
+ *
+ *		# sysctl kernel.perf_event_max_stack=<new value>
+ *
+ * 	Return
+ * 		a non-negative value equal to or less than size on success, or
+ * 		a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1835,7 +1869,8 @@ union bpf_attr {
 	FN(msg_pull_data),		\
 	FN(bind),			\
 	FN(xdp_adjust_tail),		\
-	FN(skb_get_xfrm_state),
+	FN(skb_get_xfrm_state),		\
+	FN(get_stack),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -1869,11 +1904,14 @@ enum bpf_func_id {
 /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
 #define BPF_F_TUNINFO_IPV6		(1ULL << 0)
 
-/* BPF_FUNC_get_stackid flags. */
+/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */
 #define BPF_F_SKIP_FIELD_MASK		0xffULL
 #define BPF_F_USER_STACK		(1ULL << 8)
+/* flags used by BPF_FUNC_get_stackid only. */
 #define BPF_F_FAST_STACK_CMP		(1ULL << 9)
 #define BPF_F_REUSE_STACKID		(1ULL << 10)
+/* flags used by BPF_FUNC_get_stack only. */
+#define BPF_F_USER_BUILD_ID		(1ULL << 11)
 
 /* BPF_FUNC_skb_set_tunnel_key flags. */
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ba03ec39efb3..9349a5db3cf2 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -31,6 +31,7 @@
 #include <linux/rbtree_latch.h>
 #include <linux/kallsyms.h>
 #include <linux/rcupdate.h>
+#include <linux/perf_event.h>
 
 #include <asm/unaligned.h>
 
@@ -1722,6 +1723,10 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 	aux = container_of(work, struct bpf_prog_aux, work);
 	if (bpf_prog_is_dev_bound(aux))
 		bpf_prog_offload_destroy(aux->prog);
+#ifdef CONFIG_PERF_EVENTS
+	if (aux->prog->has_callchain_buf)
+		put_callchain_buffers();
+#endif
 	for (i = 0; i < aux->func_cnt; i++)
 		bpf_jit_free(aux->func[i]);
 	if (aux->func_cnt) {
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 04f6ec1679f0..3ba102b41512 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -402,6 +402,73 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
+	   u64, flags)
+{
+	u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
+	bool user_build_id = flags & BPF_F_USER_BUILD_ID;
+	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+	bool user = flags & BPF_F_USER_STACK;
+	struct perf_callchain_entry *trace;
+	bool kernel = !user;
+	int err = -EINVAL;
+	u64 *ips;
+
+	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+			       BPF_F_USER_BUILD_ID)))
+		goto clear;
+	if (kernel && user_build_id)
+		goto clear;
+
+	elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
+					    : sizeof(u64);
+	if (unlikely(size % elem_size))
+		goto clear;
+
+	num_elem = size / elem_size;
+	if (sysctl_perf_event_max_stack < num_elem)
+		init_nr = 0;
+	else
+		init_nr = sysctl_perf_event_max_stack - num_elem;
+	trace = get_perf_callchain(regs, init_nr, kernel, user,
+				   sysctl_perf_event_max_stack, false, false);
+	if (unlikely(!trace))
+		goto err_fault;
+
+	trace_nr = trace->nr - init_nr;
+	if (trace_nr < skip)
+		goto err_fault;
+
+	trace_nr -= skip;
+	trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
+	copy_len = trace_nr * elem_size;
+	ips = trace->ip + skip + init_nr;
+	if (user && user_build_id)
+		stack_map_get_build_id_offset(buf, ips, trace_nr, user);
+	else
+		memcpy(buf, ips, copy_len);
+
+	if (size > copy_len)
+		memset(buf + copy_len, 0, size - copy_len);
+	return copy_len;
+
+err_fault:
+	err = -EFAULT;
+clear:
+	memset(buf, 0, size);
+	return err;
+}
+
+const struct bpf_func_proto bpf_get_stack_proto = {
+	.func		= bpf_get_stack,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 /* Called from eBPF program */
 static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
 {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index eb1a596aebd3..253f6bdb9117 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -22,6 +22,7 @@
 #include <linux/stringify.h>
 #include <linux/bsearch.h>
 #include <linux/sort.h>
+#include <linux/perf_event.h>
 
 #include "disasm.h"
 
@@ -2450,6 +2451,24 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	if (err)
 		return err;
 
+	if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) {
+		const char *err_str;
+
+#ifdef CONFIG_PERF_EVENTS
+		err = get_callchain_buffers(sysctl_perf_event_max_stack);
+		err_str = "cannot get callchain buffer for func %s#%d\n";
+#else
+		err = -ENOTSUPP;
+		err_str = "func %s#%d not supported without CONFIG_PERF_EVENTS\n";
+#endif
+		if (err) {
+			verbose(env, err_str, func_id_name(func_id), func_id);
+			return err;
+		}
+
+		env->prog->has_callchain_buf = true;
+	}
+
 	if (changes_data)
 		clear_all_pkt_pointers(env);
 	return 0;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 56ba0f2a01db..46d866e9937c 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -20,6 +20,7 @@
 #include "trace.h"
 
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
 /**
  * trace_call_bpf - invoke BPF program
@@ -577,6 +578,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto;
 	case BPF_FUNC_perf_event_read_value:
 		return &bpf_perf_event_read_value_proto;
 #ifdef CONFIG_BPF_KPROBE_OVERRIDE
@@ -664,6 +667,25 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size,
+	   u64, flags)
+{
+	struct pt_regs *regs = *(struct pt_regs **)tp_buff;
+
+	return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+			     (unsigned long) size, flags, 0);
+}
+
+static const struct bpf_func_proto bpf_get_stack_proto_tp = {
+	.func		= bpf_get_stack_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -672,6 +694,8 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_tp;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto_tp;
 	default:
 		return tracing_func_proto(func_id, prog);
 	}
@@ -734,6 +758,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_tp;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto_tp;
 	case BPF_FUNC_perf_prog_read_value:
 		return &bpf_perf_prog_read_value_proto;
 	default:
@@ -744,7 +770,7 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 /*
  * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
  * to avoid potential recursive reuse issue when/if tracepoints are added
- * inside bpf_*_event_output and/or bpf_get_stack_id
+ * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack
  */
 static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs);
 BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
@@ -787,6 +813,26 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
+	   void *, buf, u32, size, u64, flags)
+{
+	struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+
+	perf_fetch_caller_regs(regs);
+	return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+			     (unsigned long) size, flags, 0);
+}
+
+static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
+	.func		= bpf_get_stack_raw_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -795,6 +841,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto_raw_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_raw_tp;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto_raw_tp;
 	default:
 		return tracing_func_proto(func_id, prog);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 849fa50662fbc8b476d652f8a4e6bdda17b37859 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sat, 28 Apr 2018 22:28:09 -0700
Subject: bpf/verifier: refine retval R0 state for bpf_get_stack helper

The special property of return values for helpers bpf_get_stack
and bpf_probe_read_str are captured in verifier.
Both helpers return a negative error code or
a length, which is equal to or smaller than the buffer
size argument. This additional information in the
verifier can avoid the condition such as "retval > bufsize"
in the bpf program. For example, for the code blow,
    usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
    if (usize < 0 || usize > max_len)
        return 0;
The verifier may have the following errors:
    52: (85) call bpf_get_stack#65
     R0=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R1_w=ctx(id=0,off=0,imm=0)
     R2_w=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R3_w=inv800 R4_w=inv256
     R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
     R9_w=inv800 R10=fp0,call_-1
    53: (bf) r8 = r0
    54: (bf) r1 = r8
    55: (67) r1 <<= 32
    56: (bf) r2 = r1
    57: (77) r2 >>= 32
    58: (25) if r2 > 0x31f goto pc+33
     R0=inv(id=0) R1=inv(id=0,smax_value=9223372032559808512,
                         umax_value=18446744069414584320,
                         var_off=(0x0; 0xffffffff00000000))
     R2=inv(id=0,umax_value=799,var_off=(0x0; 0x3ff))
     R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
     R8=inv(id=0) R9=inv800 R10=fp0,call_-1
    59: (1f) r9 -= r8
    60: (c7) r1 s>>= 32
    61: (bf) r2 = r7
    62: (0f) r2 += r1
    math between map_value pointer and register with unbounded
    min value is not allowed
The failure is due to llvm compiler optimization where register "r2",
which is a copy of "r1", is tested for condition while later on "r1"
is used for map_ptr operation. The verifier is not able to track such
inst sequence effectively.

Without the "usize > max_len" condition, there is no llvm optimization
and the below generated code passed verifier:
    52: (85) call bpf_get_stack#65
     R0=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R1_w=ctx(id=0,off=0,imm=0)
     R2_w=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R3_w=inv800 R4_w=inv256
     R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
     R9_w=inv800 R10=fp0,call_-1
    53: (b7) r1 = 0
    54: (bf) r8 = r0
    55: (67) r8 <<= 32
    56: (c7) r8 s>>= 32
    57: (6d) if r1 s> r8 goto pc+24
     R0=inv(id=0,umax_value=800,var_off=(0x0; 0x3ff))
     R1=inv0 R6=ctx(id=0,off=0,imm=0)
     R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
     R8=inv(id=0,umax_value=800,var_off=(0x0; 0x3ff)) R9=inv800
     R10=fp0,call_-1
    58: (bf) r2 = r7
    59: (0f) r2 += r8
    60: (1f) r9 -= r8
    61: (bf) r1 = r6

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 253f6bdb9117..988400e283b7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -165,6 +165,8 @@ struct bpf_call_arg_meta {
 	bool pkt_access;
 	int regno;
 	int access_size;
+	s64 msize_smax_value;
+	u64 msize_umax_value;
 };
 
 static DEFINE_MUTEX(bpf_verifier_lock);
@@ -1985,6 +1987,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 	} else if (arg_type_is_mem_size(arg_type)) {
 		bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
 
+		/* remember the mem_size which may be used later
+		 * to refine return values.
+		 */
+		meta->msize_smax_value = reg->smax_value;
+		meta->msize_umax_value = reg->umax_value;
+
 		/* The register is SCALAR_VALUE; the access check
 		 * happens using its boundaries.
 		 */
@@ -2324,6 +2332,23 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 	return 0;
 }
 
+static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
+				   int func_id,
+				   struct bpf_call_arg_meta *meta)
+{
+	struct bpf_reg_state *ret_reg = &regs[BPF_REG_0];
+
+	if (ret_type != RET_INTEGER ||
+	    (func_id != BPF_FUNC_get_stack &&
+	     func_id != BPF_FUNC_probe_read_str))
+		return;
+
+	ret_reg->smax_value = meta->msize_smax_value;
+	ret_reg->umax_value = meta->msize_umax_value;
+	__reg_deduce_bounds(ret_reg);
+	__reg_bound_offset(ret_reg);
+}
+
 static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 {
 	const struct bpf_func_proto *fn = NULL;
@@ -2447,6 +2472,8 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		return -EINVAL;
 	}
 
+	do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
+
 	err = check_map_func_compatibility(env, meta.map_ptr, func_id);
 	if (err)
 		return err;
-- 
cgit v1.2.3-59-g8ed1b


From afbe1a5b79cd18f0ace9107e6b4cd7aa193e5e52 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sat, 28 Apr 2018 22:28:10 -0700
Subject: bpf: remove never-hit branches in verifier adjust_scalar_min_max_vals

In verifier function adjust_scalar_min_max_vals,
when src_known is false and the opcode is BPF_LSH/BPF_RSH,
early return will happen in the function. So remove
the branch in handling BPF_LSH/BPF_RSH when src_known is false.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 988400e283b7..6e3f859b3abf 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2940,10 +2940,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 			dst_reg->umin_value <<= umin_val;
 			dst_reg->umax_value <<= umax_val;
 		}
-		if (src_known)
-			dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
-		else
-			dst_reg->var_off = tnum_lshift(tnum_unknown, umin_val);
+		dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
 		/* We may learn something more from the var_off */
 		__update_reg_bounds(dst_reg);
 		break;
@@ -2971,11 +2968,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 		 */
 		dst_reg->smin_value = S64_MIN;
 		dst_reg->smax_value = S64_MAX;
-		if (src_known)
-			dst_reg->var_off = tnum_rshift(dst_reg->var_off,
-						       umin_val);
-		else
-			dst_reg->var_off = tnum_rshift(tnum_unknown, umin_val);
+		dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val);
 		dst_reg->umin_value >>= umax_val;
 		dst_reg->umax_value >>= umin_val;
 		/* We may learn something more from the var_off */
-- 
cgit v1.2.3-59-g8ed1b


From 9cbe1f5a32dcd6d0508326f7d9098e5bc380a4fe Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sat, 28 Apr 2018 22:28:11 -0700
Subject: bpf/verifier: improve register value range tracking with ARSH

When helpers like bpf_get_stack returns an int value
and later on used for arithmetic computation, the LSH and ARSH
operations are often required to get proper sign extension into
64-bit. For example, without this patch:
    54: R0=inv(id=0,umax_value=800)
    54: (bf) r8 = r0
    55: R0=inv(id=0,umax_value=800) R8_w=inv(id=0,umax_value=800)
    55: (67) r8 <<= 32
    56: R8_w=inv(id=0,umax_value=3435973836800,var_off=(0x0; 0x3ff00000000))
    56: (c7) r8 s>>= 32
    57: R8=inv(id=0)
With this patch:
    54: R0=inv(id=0,umax_value=800)
    54: (bf) r8 = r0
    55: R0=inv(id=0,umax_value=800) R8_w=inv(id=0,umax_value=800)
    55: (67) r8 <<= 32
    56: R8_w=inv(id=0,umax_value=3435973836800,var_off=(0x0; 0x3ff00000000))
    56: (c7) r8 s>>= 32
    57: R8=inv(id=0, umax_value=800,var_off=(0x0; 0x3ff))
With better range of "R8", later on when "R8" is added to other register,
e.g., a map pointer or scalar-value register, the better register
range can be derived and verifier failure may be avoided.

In our later example,
    ......
    usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
    if (usize < 0)
        return 0;
    ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0);
    ......
Without improving ARSH value range tracking, the register representing
"max_len - usize" will have smin_value equal to S64_MIN and will be
rejected by verifier.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/tnum.h  |  4 +++-
 kernel/bpf/tnum.c     | 10 ++++++++++
 kernel/bpf/verifier.c | 23 +++++++++++++++++++++++
 3 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/include/linux/tnum.h b/include/linux/tnum.h
index 0d2d3da46139..c7dc2b5902c0 100644
--- a/include/linux/tnum.h
+++ b/include/linux/tnum.h
@@ -23,8 +23,10 @@ struct tnum tnum_range(u64 min, u64 max);
 /* Arithmetic and logical ops */
 /* Shift a tnum left (by a fixed shift) */
 struct tnum tnum_lshift(struct tnum a, u8 shift);
-/* Shift a tnum right (by a fixed shift) */
+/* Shift (rsh) a tnum right (by a fixed shift) */
 struct tnum tnum_rshift(struct tnum a, u8 shift);
+/* Shift (arsh) a tnum right (by a fixed min_shift) */
+struct tnum tnum_arshift(struct tnum a, u8 min_shift);
 /* Add two tnums, return @a + @b */
 struct tnum tnum_add(struct tnum a, struct tnum b);
 /* Subtract two tnums, return @a - @b */
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index 1f4bf68c12db..938d41211be7 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -43,6 +43,16 @@ struct tnum tnum_rshift(struct tnum a, u8 shift)
 	return TNUM(a.value >> shift, a.mask >> shift);
 }
 
+struct tnum tnum_arshift(struct tnum a, u8 min_shift)
+{
+	/* if a.value is negative, arithmetic shifting by minimum shift
+	 * will have larger negative offset compared to more shifting.
+	 * If a.value is nonnegative, arithmetic shifting by minimum shift
+	 * will have larger positive offset compare to more shifting.
+	 */
+	return TNUM((s64)a.value >> min_shift, (s64)a.mask >> min_shift);
+}
+
 struct tnum tnum_add(struct tnum a, struct tnum b)
 {
 	u64 sm, sv, sigma, chi, mu;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6e3f859b3abf..712d8655e916 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2974,6 +2974,29 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 		/* We may learn something more from the var_off */
 		__update_reg_bounds(dst_reg);
 		break;
+	case BPF_ARSH:
+		if (umax_val >= insn_bitness) {
+			/* Shifts greater than 31 or 63 are undefined.
+			 * This includes shifts by a negative number.
+			 */
+			mark_reg_unknown(env, regs, insn->dst_reg);
+			break;
+		}
+
+		/* Upon reaching here, src_known is true and
+		 * umax_val is equal to umin_val.
+		 */
+		dst_reg->smin_value >>= umin_val;
+		dst_reg->smax_value >>= umin_val;
+		dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val);
+
+		/* blow away the dst_reg umin_value/umax_value and rely on
+		 * dst_reg var_off to refine the result.
+		 */
+		dst_reg->umin_value = 0;
+		dst_reg->umax_value = U64_MAX;
+		__update_reg_bounds(dst_reg);
+		break;
 	default:
 		mark_reg_unknown(env, regs, insn->dst_reg);
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From de2ff05f48afcde816ff4edb217417f62f624ab5 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sat, 28 Apr 2018 22:28:12 -0700
Subject: tools/bpf: add bpf_get_stack helper to tools headers

The tools header file bpf.h is synced with kernel uapi bpf.h.
The new helper is also added to bpf_helpers.h.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/include/uapi/linux/bpf.h            | 42 +++++++++++++++++++++++++++++--
 tools/testing/selftests/bpf/bpf_helpers.h |  2 ++
 2 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index da77a9388947..1afb606a18b9 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1767,6 +1767,40 @@ union bpf_attr {
  * 		**CONFIG_XFRM** configuration option.
  * 	Return
  * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags)
+ * 	Description
+ *		Return a user or a kernel stack in bpf program provided buffer.
+ *		To achieve this, the helper needs *ctx*, which is a pointer
+ *		to the context on which the tracing program is executed.
+ *		To store the stacktrace, the bpf program provides *buf* with
+ *		a nonnegative *size*.
+ *
+ *		The last argument, *flags*, holds the number of stack frames to
+ *		skip (from 0 to 255), masked with
+ *		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ *		the following flags:
+ *
+ *		**BPF_F_USER_STACK**
+ *			Collect a user space stack instead of a kernel stack.
+ *		**BPF_F_USER_BUILD_ID**
+ *			Collect buildid+offset instead of ips for user stack,
+ *			only valid if **BPF_F_USER_STACK** is also specified.
+ *
+ *		**bpf_get_stack**\ () can collect up to
+ *		**PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
+ *		to sufficient large buffer size. Note that
+ *		this limit can be controlled with the **sysctl** program, and
+ *		that it should be manually increased in order to profile long
+ *		user stacks (such as stacks for Java programs). To do so, use:
+ *
+ *	::
+ *
+ *		# sysctl kernel.perf_event_max_stack=<new value>
+ *
+ * 	Return
+ * 		a non-negative value equal to or less than size on success, or
+ * 		a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1835,7 +1869,8 @@ union bpf_attr {
 	FN(msg_pull_data),		\
 	FN(bind),			\
 	FN(xdp_adjust_tail),		\
-	FN(skb_get_xfrm_state),
+	FN(skb_get_xfrm_state),		\
+	FN(get_stack),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -1869,11 +1904,14 @@ enum bpf_func_id {
 /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
 #define BPF_F_TUNINFO_IPV6		(1ULL << 0)
 
-/* BPF_FUNC_get_stackid flags. */
+/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */
 #define BPF_F_SKIP_FIELD_MASK		0xffULL
 #define BPF_F_USER_STACK		(1ULL << 8)
+/* flags used by BPF_FUNC_get_stackid only. */
 #define BPF_F_FAST_STACK_CMP		(1ULL << 9)
 #define BPF_F_REUSE_STACKID		(1ULL << 10)
+/* flags used by BPF_FUNC_get_stack only. */
+#define BPF_F_USER_BUILD_ID		(1ULL << 11)
 
 /* BPF_FUNC_skb_set_tunnel_key flags. */
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 69d7b918e66a..265f8e0e8ada 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -101,6 +101,8 @@ static int (*bpf_xdp_adjust_tail)(void *ctx, int offset) =
 static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state,
 				     int size, int flags) =
 	(void *) BPF_FUNC_skb_get_xfrm_state;
+static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) =
+	(void *) BPF_FUNC_get_stack;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
-- 
cgit v1.2.3-59-g8ed1b


From 28dbf861deacb0321604bf1c5e1ccc34dd215669 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sat, 28 Apr 2018 22:28:13 -0700
Subject: samples/bpf: move common-purpose trace functions to selftests

There is no functionality change in this patch. The common-purpose
trace functions, including perf_event polling and ksym lookup,
are moved from trace_output_user.c and bpf_load.c to
selftests/bpf/trace_helpers.c so that these function can
be reused later in selftests.

Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/Makefile                        |  11 +-
 samples/bpf/bpf_load.c                      |  63 ----------
 samples/bpf/bpf_load.h                      |   7 --
 samples/bpf/offwaketime_user.c              |   1 +
 samples/bpf/sampleip_user.c                 |   1 +
 samples/bpf/spintest_user.c                 |   1 +
 samples/bpf/trace_event_user.c              |   1 +
 samples/bpf/trace_output_user.c             | 110 ++---------------
 tools/testing/selftests/bpf/trace_helpers.c | 180 ++++++++++++++++++++++++++++
 tools/testing/selftests/bpf/trace_helpers.h |  23 ++++
 10 files changed, 223 insertions(+), 175 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/trace_helpers.c
 create mode 100644 tools/testing/selftests/bpf/trace_helpers.h

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index b853581592fd..5e31770ac087 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -49,6 +49,7 @@ hostprogs-y += xdp_adjust_tail
 # Libbpf dependencies
 LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
 CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o
+TRACE_HELPERS := ../../tools/testing/selftests/bpf/trace_helpers.o
 
 test_lru_dist-objs := test_lru_dist.o $(LIBBPF)
 sock_example-objs := sock_example.o $(LIBBPF)
@@ -65,10 +66,10 @@ tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o
 tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
 load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
 test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
-trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
+trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o $(TRACE_HELPERS)
 lathist-objs := bpf_load.o $(LIBBPF) lathist_user.o
-offwaketime-objs := bpf_load.o $(LIBBPF) offwaketime_user.o
-spintest-objs := bpf_load.o $(LIBBPF) spintest_user.o
+offwaketime-objs := bpf_load.o $(LIBBPF) offwaketime_user.o $(TRACE_HELPERS)
+spintest-objs := bpf_load.o $(LIBBPF) spintest_user.o $(TRACE_HELPERS)
 map_perf_test-objs := bpf_load.o $(LIBBPF) map_perf_test_user.o
 test_overhead-objs := bpf_load.o $(LIBBPF) test_overhead_user.o
 test_cgrp2_array_pin-objs := $(LIBBPF) test_cgrp2_array_pin.o
@@ -82,8 +83,8 @@ xdp2-objs := bpf_load.o $(LIBBPF) xdp1_user.o
 xdp_router_ipv4-objs := bpf_load.o $(LIBBPF) xdp_router_ipv4_user.o
 test_current_task_under_cgroup-objs := bpf_load.o $(LIBBPF) $(CGROUP_HELPERS) \
 				       test_current_task_under_cgroup_user.o
-trace_event-objs := bpf_load.o $(LIBBPF) trace_event_user.o
-sampleip-objs := bpf_load.o $(LIBBPF) sampleip_user.o
+trace_event-objs := bpf_load.o $(LIBBPF) trace_event_user.o $(TRACE_HELPERS)
+sampleip-objs := bpf_load.o $(LIBBPF) sampleip_user.o $(TRACE_HELPERS)
 tc_l2_redirect-objs := bpf_load.o $(LIBBPF) tc_l2_redirect_user.o
 lwt_len_hist-objs := bpf_load.o $(LIBBPF) lwt_len_hist_user.o
 xdp_tx_iptunnel-objs := bpf_load.o $(LIBBPF) xdp_tx_iptunnel_user.o
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index feca497d6afd..a27ef3c42e4e 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -648,66 +648,3 @@ void read_trace_pipe(void)
 		}
 	}
 }
-
-#define MAX_SYMS 300000
-static struct ksym syms[MAX_SYMS];
-static int sym_cnt;
-
-static int ksym_cmp(const void *p1, const void *p2)
-{
-	return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr;
-}
-
-int load_kallsyms(void)
-{
-	FILE *f = fopen("/proc/kallsyms", "r");
-	char func[256], buf[256];
-	char symbol;
-	void *addr;
-	int i = 0;
-
-	if (!f)
-		return -ENOENT;
-
-	while (!feof(f)) {
-		if (!fgets(buf, sizeof(buf), f))
-			break;
-		if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3)
-			break;
-		if (!addr)
-			continue;
-		syms[i].addr = (long) addr;
-		syms[i].name = strdup(func);
-		i++;
-	}
-	sym_cnt = i;
-	qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp);
-	return 0;
-}
-
-struct ksym *ksym_search(long key)
-{
-	int start = 0, end = sym_cnt;
-	int result;
-
-	while (start < end) {
-		size_t mid = start + (end - start) / 2;
-
-		result = key - syms[mid].addr;
-		if (result < 0)
-			end = mid;
-		else if (result > 0)
-			start = mid + 1;
-		else
-			return &syms[mid];
-	}
-
-	if (start >= 1 && syms[start - 1].addr < key &&
-	    key < syms[start].addr)
-		/* valid ksym */
-		return &syms[start - 1];
-
-	/* out of range. return _stext */
-	return &syms[0];
-}
-
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index 453c200b389b..2c3d0b448632 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -54,12 +54,5 @@ int load_bpf_file(char *path);
 int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map);
 
 void read_trace_pipe(void);
-struct ksym {
-	long addr;
-	char *name;
-};
-
-int load_kallsyms(void);
-struct ksym *ksym_search(long key);
 int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags);
 #endif
diff --git a/samples/bpf/offwaketime_user.c b/samples/bpf/offwaketime_user.c
index 512f87a5fd20..f06063af9fcb 100644
--- a/samples/bpf/offwaketime_user.c
+++ b/samples/bpf/offwaketime_user.c
@@ -17,6 +17,7 @@
 #include <sys/resource.h>
 #include "libbpf.h"
 #include "bpf_load.h"
+#include "trace_helpers.h"
 
 #define PRINT_RAW_ADDR 0
 
diff --git a/samples/bpf/sampleip_user.c b/samples/bpf/sampleip_user.c
index 4ed690b907ff..60c2b73d1b4d 100644
--- a/samples/bpf/sampleip_user.c
+++ b/samples/bpf/sampleip_user.c
@@ -22,6 +22,7 @@
 #include "libbpf.h"
 #include "bpf_load.h"
 #include "perf-sys.h"
+#include "trace_helpers.h"
 
 #define DEFAULT_FREQ	99
 #define DEFAULT_SECS	5
diff --git a/samples/bpf/spintest_user.c b/samples/bpf/spintest_user.c
index 3d736219a31c..8d3e9cfa1909 100644
--- a/samples/bpf/spintest_user.c
+++ b/samples/bpf/spintest_user.c
@@ -7,6 +7,7 @@
 #include <sys/resource.h>
 #include "libbpf.h"
 #include "bpf_load.h"
+#include "trace_helpers.h"
 
 int main(int ac, char **argv)
 {
diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c
index 56f7a259a7c9..1fa1becfa641 100644
--- a/samples/bpf/trace_event_user.c
+++ b/samples/bpf/trace_event_user.c
@@ -21,6 +21,7 @@
 #include "libbpf.h"
 #include "bpf_load.h"
 #include "perf-sys.h"
+#include "trace_helpers.h"
 
 #define SAMPLE_FREQ 50
 
diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c
index ccca1e348017..5e78c2ecd08d 100644
--- a/samples/bpf/trace_output_user.c
+++ b/samples/bpf/trace_output_user.c
@@ -21,100 +21,10 @@
 #include "libbpf.h"
 #include "bpf_load.h"
 #include "perf-sys.h"
+#include "trace_helpers.h"
 
 static int pmu_fd;
 
-int page_size;
-int page_cnt = 8;
-volatile struct perf_event_mmap_page *header;
-
-typedef void (*print_fn)(void *data, int size);
-
-static int perf_event_mmap(int fd)
-{
-	void *base;
-	int mmap_size;
-
-	page_size = getpagesize();
-	mmap_size = page_size * (page_cnt + 1);
-
-	base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-	if (base == MAP_FAILED) {
-		printf("mmap err\n");
-		return -1;
-	}
-
-	header = base;
-	return 0;
-}
-
-static int perf_event_poll(int fd)
-{
-	struct pollfd pfd = { .fd = fd, .events = POLLIN };
-
-	return poll(&pfd, 1, 1000);
-}
-
-struct perf_event_sample {
-	struct perf_event_header header;
-	__u32 size;
-	char data[];
-};
-
-static void perf_event_read(print_fn fn)
-{
-	__u64 data_tail = header->data_tail;
-	__u64 data_head = header->data_head;
-	__u64 buffer_size = page_cnt * page_size;
-	void *base, *begin, *end;
-	char buf[256];
-
-	asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
-	if (data_head == data_tail)
-		return;
-
-	base = ((char *)header) + page_size;
-
-	begin = base + data_tail % buffer_size;
-	end = base + data_head % buffer_size;
-
-	while (begin != end) {
-		struct perf_event_sample *e;
-
-		e = begin;
-		if (begin + e->header.size > base + buffer_size) {
-			long len = base + buffer_size - begin;
-
-			assert(len < e->header.size);
-			memcpy(buf, begin, len);
-			memcpy(buf + len, base, e->header.size - len);
-			e = (void *) buf;
-			begin = base + e->header.size - len;
-		} else if (begin + e->header.size == base + buffer_size) {
-			begin = base;
-		} else {
-			begin += e->header.size;
-		}
-
-		if (e->header.type == PERF_RECORD_SAMPLE) {
-			fn(e->data, e->size);
-		} else if (e->header.type == PERF_RECORD_LOST) {
-			struct {
-				struct perf_event_header header;
-				__u64 id;
-				__u64 lost;
-			} *lost = (void *) e;
-			printf("lost %lld events\n", lost->lost);
-		} else {
-			printf("unknown event type=%d size=%d\n",
-			       e->header.type, e->header.size);
-		}
-	}
-
-	__sync_synchronize(); /* smp_mb() */
-	header->data_tail = data_head;
-}
-
 static __u64 time_get_ns(void)
 {
 	struct timespec ts;
@@ -127,7 +37,7 @@ static __u64 start_time;
 
 #define MAX_CNT 100000ll
 
-static void print_bpf_output(void *data, int size)
+static int print_bpf_output(void *data, int size)
 {
 	static __u64 cnt;
 	struct {
@@ -138,7 +48,7 @@ static void print_bpf_output(void *data, int size)
 	if (e->cookie != 0x12345678) {
 		printf("BUG pid %llx cookie %llx sized %d\n",
 		       e->pid, e->cookie, size);
-		kill(0, SIGINT);
+		return PERF_EVENT_ERROR;
 	}
 
 	cnt++;
@@ -146,8 +56,10 @@ static void print_bpf_output(void *data, int size)
 	if (cnt == MAX_CNT) {
 		printf("recv %lld events per sec\n",
 		       MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
-		kill(0, SIGINT);
+		return PERF_EVENT_DONE;
 	}
+
+	return PERF_EVENT_CONT;
 }
 
 static void test_bpf_perf_event(void)
@@ -170,6 +82,7 @@ int main(int argc, char **argv)
 {
 	char filename[256];
 	FILE *f;
+	int ret;
 
 	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
 
@@ -187,10 +100,7 @@ int main(int argc, char **argv)
 	(void) f;
 
 	start_time = time_get_ns();
-	for (;;) {
-		perf_event_poll(pmu_fd);
-		perf_event_read(print_bpf_output);
-	}
-
-	return 0;
+	ret = perf_event_poller(pmu_fd, print_bpf_output);
+	kill(0, SIGINT);
+	return ret;
 }
diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
new file mode 100644
index 000000000000..ad025bd75f1c
--- /dev/null
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <errno.h>
+#include <poll.h>
+#include <unistd.h>
+#include <linux/perf_event.h>
+#include <sys/mman.h>
+#include "trace_helpers.h"
+
+#define MAX_SYMS 300000
+static struct ksym syms[MAX_SYMS];
+static int sym_cnt;
+
+static int ksym_cmp(const void *p1, const void *p2)
+{
+	return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr;
+}
+
+int load_kallsyms(void)
+{
+	FILE *f = fopen("/proc/kallsyms", "r");
+	char func[256], buf[256];
+	char symbol;
+	void *addr;
+	int i = 0;
+
+	if (!f)
+		return -ENOENT;
+
+	while (!feof(f)) {
+		if (!fgets(buf, sizeof(buf), f))
+			break;
+		if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3)
+			break;
+		if (!addr)
+			continue;
+		syms[i].addr = (long) addr;
+		syms[i].name = strdup(func);
+		i++;
+	}
+	sym_cnt = i;
+	qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp);
+	return 0;
+}
+
+struct ksym *ksym_search(long key)
+{
+	int start = 0, end = sym_cnt;
+	int result;
+
+	while (start < end) {
+		size_t mid = start + (end - start) / 2;
+
+		result = key - syms[mid].addr;
+		if (result < 0)
+			end = mid;
+		else if (result > 0)
+			start = mid + 1;
+		else
+			return &syms[mid];
+	}
+
+	if (start >= 1 && syms[start - 1].addr < key &&
+	    key < syms[start].addr)
+		/* valid ksym */
+		return &syms[start - 1];
+
+	/* out of range. return _stext */
+	return &syms[0];
+}
+
+static int page_size;
+static int page_cnt = 8;
+static volatile struct perf_event_mmap_page *header;
+
+int perf_event_mmap(int fd)
+{
+	void *base;
+	int mmap_size;
+
+	page_size = getpagesize();
+	mmap_size = page_size * (page_cnt + 1);
+
+	base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if (base == MAP_FAILED) {
+		printf("mmap err\n");
+		return -1;
+	}
+
+	header = base;
+	return 0;
+}
+
+static int perf_event_poll(int fd)
+{
+	struct pollfd pfd = { .fd = fd, .events = POLLIN };
+
+	return poll(&pfd, 1, 1000);
+}
+
+struct perf_event_sample {
+	struct perf_event_header header;
+	__u32 size;
+	char data[];
+};
+
+static int perf_event_read(perf_event_print_fn fn)
+{
+	__u64 data_tail = header->data_tail;
+	__u64 data_head = header->data_head;
+	__u64 buffer_size = page_cnt * page_size;
+	void *base, *begin, *end;
+	char buf[256];
+	int ret;
+
+	asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
+	if (data_head == data_tail)
+		return PERF_EVENT_CONT;
+
+	base = ((char *)header) + page_size;
+
+	begin = base + data_tail % buffer_size;
+	end = base + data_head % buffer_size;
+
+	while (begin != end) {
+		struct perf_event_sample *e;
+
+		e = begin;
+		if (begin + e->header.size > base + buffer_size) {
+			long len = base + buffer_size - begin;
+
+			assert(len < e->header.size);
+			memcpy(buf, begin, len);
+			memcpy(buf + len, base, e->header.size - len);
+			e = (void *) buf;
+			begin = base + e->header.size - len;
+		} else if (begin + e->header.size == base + buffer_size) {
+			begin = base;
+		} else {
+			begin += e->header.size;
+		}
+
+		if (e->header.type == PERF_RECORD_SAMPLE) {
+			ret = fn(e->data, e->size);
+			if (ret != PERF_EVENT_CONT)
+				return ret;
+		} else if (e->header.type == PERF_RECORD_LOST) {
+			struct {
+				struct perf_event_header header;
+				__u64 id;
+				__u64 lost;
+			} *lost = (void *) e;
+			printf("lost %lld events\n", lost->lost);
+		} else {
+			printf("unknown event type=%d size=%d\n",
+			       e->header.type, e->header.size);
+		}
+	}
+
+	__sync_synchronize(); /* smp_mb() */
+	header->data_tail = data_head;
+	return PERF_EVENT_CONT;
+}
+
+int perf_event_poller(int fd, perf_event_print_fn output_fn)
+{
+	int ret;
+
+	for (;;) {
+		perf_event_poll(fd);
+		ret = perf_event_read(output_fn);
+		if (ret != PERF_EVENT_CONT)
+			return ret;
+	}
+
+	return PERF_EVENT_DONE;
+}
diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h
new file mode 100644
index 000000000000..fe3eefd21e86
--- /dev/null
+++ b/tools/testing/selftests/bpf/trace_helpers.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __TRACE_HELPER_H
+#define __TRACE_HELPER_H
+
+struct ksym {
+	long addr;
+	char *name;
+};
+
+int load_kallsyms(void);
+struct ksym *ksym_search(long key);
+
+typedef int (*perf_event_print_fn)(void *data, int size);
+
+/* return code for perf_event_print_fn */
+#define PERF_EVENT_DONE		0
+#define PERF_EVENT_ERROR	-1
+#define PERF_EVENT_CONT		-2
+
+int perf_event_mmap(int fd);
+/* return PERF_EVENT_DONE or PERF_EVENT_ERROR */
+int perf_event_poller(int fd, perf_event_print_fn output_fn);
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From 2abe611c5f031f06ab783102e62c0ae9dfd93e80 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sat, 28 Apr 2018 22:28:14 -0700
Subject: tools/bpf: add a verifier test case for bpf_get_stack helper and ARSH

The test_verifier already has a few ARSH test cases.
This patch adds a new test case which takes advantage of newly
improved verifier behavior for bpf_get_stack and ARSH.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/test_verifier.c | 45 +++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 165e9ddfa446..1acafe26498b 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -11680,6 +11680,51 @@ static struct bpf_test tests[] = {
 		.errstr = "BPF_XADD stores into R2 packet",
 		.prog_type = BPF_PROG_TYPE_XDP,
 	},
+	{
+		"bpf_get_stack return R0 within range",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 28),
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+			BPF_MOV64_IMM(BPF_REG_9, sizeof(struct test_val)),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+			BPF_MOV64_IMM(BPF_REG_3, sizeof(struct test_val)),
+			BPF_MOV64_IMM(BPF_REG_4, 256),
+			BPF_EMIT_CALL(BPF_FUNC_get_stack),
+			BPF_MOV64_IMM(BPF_REG_1, 0),
+			BPF_MOV64_REG(BPF_REG_8, BPF_REG_0),
+			BPF_ALU64_IMM(BPF_LSH, BPF_REG_8, 32),
+			BPF_ALU64_IMM(BPF_ARSH, BPF_REG_8, 32),
+			BPF_JMP_REG(BPF_JSLT, BPF_REG_1, BPF_REG_8, 16),
+			BPF_ALU64_REG(BPF_SUB, BPF_REG_9, BPF_REG_8),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_8),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_9),
+			BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32),
+			BPF_ALU64_IMM(BPF_ARSH, BPF_REG_1, 32),
+			BPF_MOV64_REG(BPF_REG_3, BPF_REG_2),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_1),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+			BPF_MOV64_IMM(BPF_REG_5, sizeof(struct test_val)),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_5),
+			BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_1, 4),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_MOV64_REG(BPF_REG_3, BPF_REG_9),
+			BPF_MOV64_IMM(BPF_REG_4, 0),
+			BPF_EMIT_CALL(BPF_FUNC_get_stack),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map2 = { 4 },
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)
-- 
cgit v1.2.3-59-g8ed1b


From 173965fbfba596c02fa128966c2a33cb88afcd7f Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sat, 28 Apr 2018 22:28:15 -0700
Subject: tools/bpf: add a test for bpf_get_stack with raw tracepoint prog

The test attached a raw_tracepoint program to raw_syscalls/sys_enter.
It tested to get stack for user space, kernel space and user
space with build_id request. It also tested to get user
and kernel stack into the same buffer with back-to-back
bpf_get_stack helper calls.

If jit is not enabled, the user space application will check
to ensure that the kernel function for raw_tracepoint
___bpf_prog_run is part of the stack.

If jit is enabled, we did not have a reliable way to
verify the kernel stack, so just assume the kernel stack
is good when the kernel stack size is greater than 0.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/Makefile               |   4 +-
 tools/testing/selftests/bpf/test_get_stack_rawtp.c | 102 ++++++++++++
 tools/testing/selftests/bpf/test_progs.c           | 172 +++++++++++++++++++--
 3 files changed, 266 insertions(+), 12 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_get_stack_rawtp.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index b64a7a39cbc8..9d762184b805 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -32,7 +32,8 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \
 	sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \
 	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \
-	test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o
+	test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \
+	test_get_stack_rawtp.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
@@ -58,6 +59,7 @@ $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c
 $(OUTPUT)/test_sock: cgroup_helpers.c
 $(OUTPUT)/test_sock_addr: cgroup_helpers.c
 $(OUTPUT)/test_sockmap: cgroup_helpers.c
+$(OUTPUT)/test_progs: trace_helpers.c
 
 .PHONY: force
 
diff --git a/tools/testing/selftests/bpf/test_get_stack_rawtp.c b/tools/testing/selftests/bpf/test_get_stack_rawtp.c
new file mode 100644
index 000000000000..f6d9f238e00a
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_get_stack_rawtp.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+/* Permit pretty deep stack traces */
+#define MAX_STACK_RAWTP 100
+struct stack_trace_t {
+	int pid;
+	int kern_stack_size;
+	int user_stack_size;
+	int user_stack_buildid_size;
+	__u64 kern_stack[MAX_STACK_RAWTP];
+	__u64 user_stack[MAX_STACK_RAWTP];
+	struct bpf_stack_build_id user_stack_buildid[MAX_STACK_RAWTP];
+};
+
+struct bpf_map_def SEC("maps") perfmap = {
+	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(__u32),
+	.max_entries = 2,
+};
+
+struct bpf_map_def SEC("maps") stackdata_map = {
+	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(struct stack_trace_t),
+	.max_entries = 1,
+};
+
+/* Allocate per-cpu space twice the needed. For the code below
+ *   usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
+ *   if (usize < 0)
+ *     return 0;
+ *   ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0);
+ *
+ * If we have value_size = MAX_STACK_RAWTP * sizeof(__u64),
+ * verifier will complain that access "raw_data + usize"
+ * with size "max_len - usize" may be out of bound.
+ * The maximum "raw_data + usize" is "raw_data + max_len"
+ * and the maximum "max_len - usize" is "max_len", verifier
+ * concludes that the maximum buffer access range is
+ * "raw_data[0...max_len * 2 - 1]" and hence reject the program.
+ *
+ * Doubling the to-be-used max buffer size can fix this verifier
+ * issue and avoid complicated C programming massaging.
+ * This is an acceptable workaround since there is one entry here.
+ */
+struct bpf_map_def SEC("maps") rawdata_map = {
+	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = MAX_STACK_RAWTP * sizeof(__u64) * 2,
+	.max_entries = 1,
+};
+
+SEC("tracepoint/raw_syscalls/sys_enter")
+int bpf_prog1(void *ctx)
+{
+	int max_len, max_buildid_len, usize, ksize, total_size;
+	struct stack_trace_t *data;
+	void *raw_data;
+	__u32 key = 0;
+
+	data = bpf_map_lookup_elem(&stackdata_map, &key);
+	if (!data)
+		return 0;
+
+	max_len = MAX_STACK_RAWTP * sizeof(__u64);
+	max_buildid_len = MAX_STACK_RAWTP * sizeof(struct bpf_stack_build_id);
+	data->pid = bpf_get_current_pid_tgid();
+	data->kern_stack_size = bpf_get_stack(ctx, data->kern_stack,
+					      max_len, 0);
+	data->user_stack_size = bpf_get_stack(ctx, data->user_stack, max_len,
+					    BPF_F_USER_STACK);
+	data->user_stack_buildid_size = bpf_get_stack(
+		ctx, data->user_stack_buildid, max_buildid_len,
+		BPF_F_USER_STACK | BPF_F_USER_BUILD_ID);
+	bpf_perf_event_output(ctx, &perfmap, 0, data, sizeof(*data));
+
+	/* write both kernel and user stacks to the same buffer */
+	raw_data = bpf_map_lookup_elem(&rawdata_map, &key);
+	if (!raw_data)
+		return 0;
+
+	usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
+	if (usize < 0)
+		return 0;
+
+	ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0);
+	if (ksize < 0)
+		return 0;
+
+	total_size = usize + ksize;
+	if (total_size > 0 && total_size <= max_len)
+		bpf_perf_event_output(ctx, &perfmap, 0, raw_data, total_size);
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1; /* ignored by tracepoints, required by libbpf.a */
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index eedda98d7bb1..0ddbf34b2438 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -38,8 +38,10 @@ typedef __u16 __sum16;
 #include "bpf_util.h"
 #include "bpf_endian.h"
 #include "bpf_rlimit.h"
+#include "trace_helpers.h"
 
 static int error_cnt, pass_cnt;
+static bool jit_enabled;
 
 #define MAGIC_BYTES 123
 
@@ -391,13 +393,30 @@ static inline __u64 ptr_to_u64(const void *ptr)
 	return (__u64) (unsigned long) ptr;
 }
 
+static bool is_jit_enabled(void)
+{
+	const char *jit_sysctl = "/proc/sys/net/core/bpf_jit_enable";
+	bool enabled = false;
+	int sysctl_fd;
+
+	sysctl_fd = open(jit_sysctl, 0, O_RDONLY);
+	if (sysctl_fd != -1) {
+		char tmpc;
+
+		if (read(sysctl_fd, &tmpc, sizeof(tmpc)) == 1)
+			enabled = (tmpc != '0');
+		close(sysctl_fd);
+	}
+
+	return enabled;
+}
+
 static void test_bpf_obj_id(void)
 {
 	const __u64 array_magic_value = 0xfaceb00c;
 	const __u32 array_key = 0;
 	const int nr_iters = 2;
 	const char *file = "./test_obj_id.o";
-	const char *jit_sysctl = "/proc/sys/net/core/bpf_jit_enable";
 	const char *expected_prog_name = "test_obj_id";
 	const char *expected_map_name = "test_map_id";
 	const __u64 nsec_per_sec = 1000000000;
@@ -414,20 +433,11 @@ static void test_bpf_obj_id(void)
 	char jited_insns[128], xlated_insns[128], zeros[128];
 	__u32 i, next_id, info_len, nr_id_found, duration = 0;
 	struct timespec real_time_ts, boot_time_ts;
-	int sysctl_fd, jit_enabled = 0, err = 0;
+	int err = 0;
 	__u64 array_value;
 	uid_t my_uid = getuid();
 	time_t now, load_time;
 
-	sysctl_fd = open(jit_sysctl, 0, O_RDONLY);
-	if (sysctl_fd != -1) {
-		char tmpc;
-
-		if (read(sysctl_fd, &tmpc, sizeof(tmpc)) == 1)
-			jit_enabled = (tmpc != '0');
-		close(sysctl_fd);
-	}
-
 	err = bpf_prog_get_fd_by_id(0);
 	CHECK(err >= 0 || errno != ENOENT,
 	      "get-fd-by-notexist-prog-id", "err %d errno %d\n", err, errno);
@@ -1204,8 +1214,147 @@ out:
 	return;
 }
 
+#define MAX_CNT_RAWTP	10ull
+#define MAX_STACK_RAWTP	100
+struct get_stack_trace_t {
+	int pid;
+	int kern_stack_size;
+	int user_stack_size;
+	int user_stack_buildid_size;
+	__u64 kern_stack[MAX_STACK_RAWTP];
+	__u64 user_stack[MAX_STACK_RAWTP];
+	struct bpf_stack_build_id user_stack_buildid[MAX_STACK_RAWTP];
+};
+
+static int get_stack_print_output(void *data, int size)
+{
+	bool good_kern_stack = false, good_user_stack = false;
+	const char *nonjit_func = "___bpf_prog_run";
+	struct get_stack_trace_t *e = data;
+	int i, num_stack;
+	static __u64 cnt;
+	struct ksym *ks;
+
+	cnt++;
+
+	if (size < sizeof(struct get_stack_trace_t)) {
+		__u64 *raw_data = data;
+		bool found = false;
+
+		num_stack = size / sizeof(__u64);
+		/* If jit is enabled, we do not have a good way to
+		 * verify the sanity of the kernel stack. So we
+		 * just assume it is good if the stack is not empty.
+		 * This could be improved in the future.
+		 */
+		if (jit_enabled) {
+			found = num_stack > 0;
+		} else {
+			for (i = 0; i < num_stack; i++) {
+				ks = ksym_search(raw_data[i]);
+				if (strcmp(ks->name, nonjit_func) == 0) {
+					found = true;
+					break;
+				}
+			}
+		}
+		if (found) {
+			good_kern_stack = true;
+			good_user_stack = true;
+		}
+	} else {
+		num_stack = e->kern_stack_size / sizeof(__u64);
+		if (jit_enabled) {
+			good_kern_stack = num_stack > 0;
+		} else {
+			for (i = 0; i < num_stack; i++) {
+				ks = ksym_search(e->kern_stack[i]);
+				if (strcmp(ks->name, nonjit_func) == 0) {
+					good_kern_stack = true;
+					break;
+				}
+			}
+		}
+		if (e->user_stack_size > 0 && e->user_stack_buildid_size > 0)
+			good_user_stack = true;
+	}
+	if (!good_kern_stack || !good_user_stack)
+		return PERF_EVENT_ERROR;
+
+	if (cnt == MAX_CNT_RAWTP)
+		return PERF_EVENT_DONE;
+
+	return PERF_EVENT_CONT;
+}
+
+static void test_get_stack_raw_tp(void)
+{
+	const char *file = "./test_get_stack_rawtp.o";
+	int i, efd, err, prog_fd, pmu_fd, perfmap_fd;
+	struct perf_event_attr attr = {};
+	struct timespec tv = {0, 10};
+	__u32 key = 0, duration = 0;
+	struct bpf_object *obj;
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd);
+	if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno))
+		return;
+
+	efd = bpf_raw_tracepoint_open("sys_enter", prog_fd);
+	if (CHECK(efd < 0, "raw_tp_open", "err %d errno %d\n", efd, errno))
+		goto close_prog;
+
+	perfmap_fd = bpf_find_map(__func__, obj, "perfmap");
+	if (CHECK(perfmap_fd < 0, "bpf_find_map", "err %d errno %d\n",
+		  perfmap_fd, errno))
+		goto close_prog;
+
+	err = load_kallsyms();
+	if (CHECK(err < 0, "load_kallsyms", "err %d errno %d\n", err, errno))
+		goto close_prog;
+
+	attr.sample_type = PERF_SAMPLE_RAW;
+	attr.type = PERF_TYPE_SOFTWARE;
+	attr.config = PERF_COUNT_SW_BPF_OUTPUT;
+	pmu_fd = syscall(__NR_perf_event_open, &attr, getpid()/*pid*/, -1/*cpu*/,
+			 -1/*group_fd*/, 0);
+	if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n", pmu_fd,
+		  errno))
+		goto close_prog;
+
+	err = bpf_map_update_elem(perfmap_fd, &key, &pmu_fd, BPF_ANY);
+	if (CHECK(err < 0, "bpf_map_update_elem", "err %d errno %d\n", err,
+		  errno))
+		goto close_prog;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+	if (CHECK(err < 0, "ioctl PERF_EVENT_IOC_ENABLE", "err %d errno %d\n",
+		  err, errno))
+		goto close_prog;
+
+	err = perf_event_mmap(pmu_fd);
+	if (CHECK(err < 0, "perf_event_mmap", "err %d errno %d\n", err, errno))
+		goto close_prog;
+
+	/* trigger some syscall action */
+	for (i = 0; i < MAX_CNT_RAWTP; i++)
+		nanosleep(&tv, NULL);
+
+	err = perf_event_poller(pmu_fd, get_stack_print_output);
+	if (CHECK(err < 0, "perf_event_poller", "err %d errno %d\n", err, errno))
+		goto close_prog;
+
+	goto close_prog_noerr;
+close_prog:
+	error_cnt++;
+close_prog_noerr:
+	bpf_object__close(obj);
+}
+
 int main(void)
 {
+	jit_enabled = is_jit_enabled();
+
 	test_pkt_access();
 	test_xdp();
 	test_xdp_adjust_tail();
@@ -1219,6 +1368,7 @@ int main(void)
 	test_stacktrace_map();
 	test_stacktrace_build_id();
 	test_stacktrace_map_raw_tp();
+	test_get_stack_raw_tp();
 
 	printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt);
 	return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
-- 
cgit v1.2.3-59-g8ed1b


From 79b45350131057250236e162ce7b3c0b291dc0a4 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sat, 28 Apr 2018 22:28:16 -0700
Subject: tools/bpf: add a test for bpf_get_stack with tracepoint prog

The test_stacktrace_map and test_stacktrace_build_id are
enhanced to call bpf_get_stack in the helper to get the
stack trace as well.  The stack traces from bpf_get_stack
and bpf_get_stackid are compared to ensure that for the
same stack as represented as the same hash, their ip addresses
or build id's must be the same.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/test_progs.c           | 70 ++++++++++++++++++++--
 .../selftests/bpf/test_stacktrace_build_id.c       | 20 ++++++-
 tools/testing/selftests/bpf/test_stacktrace_map.c  | 19 +++++-
 3 files changed, 98 insertions(+), 11 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 0ddbf34b2438..aa336f0abebc 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -906,11 +906,47 @@ static int compare_map_keys(int map1_fd, int map2_fd)
 	return 0;
 }
 
+static int compare_stack_ips(int smap_fd, int amap_fd, int stack_trace_len)
+{
+	__u32 key, next_key, *cur_key_p, *next_key_p;
+	char *val_buf1, *val_buf2;
+	int i, err = 0;
+
+	val_buf1 = malloc(stack_trace_len);
+	val_buf2 = malloc(stack_trace_len);
+	cur_key_p = NULL;
+	next_key_p = &key;
+	while (bpf_map_get_next_key(smap_fd, cur_key_p, next_key_p) == 0) {
+		err = bpf_map_lookup_elem(smap_fd, next_key_p, val_buf1);
+		if (err)
+			goto out;
+		err = bpf_map_lookup_elem(amap_fd, next_key_p, val_buf2);
+		if (err)
+			goto out;
+		for (i = 0; i < stack_trace_len; i++) {
+			if (val_buf1[i] != val_buf2[i]) {
+				err = -1;
+				goto out;
+			}
+		}
+		key = *next_key_p;
+		cur_key_p = &key;
+		next_key_p = &next_key;
+	}
+	if (errno != ENOENT)
+		err = -1;
+
+out:
+	free(val_buf1);
+	free(val_buf2);
+	return err;
+}
+
 static void test_stacktrace_map()
 {
-	int control_map_fd, stackid_hmap_fd, stackmap_fd;
+	int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd;
 	const char *file = "./test_stacktrace_map.o";
-	int bytes, efd, err, pmu_fd, prog_fd;
+	int bytes, efd, err, pmu_fd, prog_fd, stack_trace_len;
 	struct perf_event_attr attr = {};
 	__u32 key, val, duration = 0;
 	struct bpf_object *obj;
@@ -966,6 +1002,10 @@ static void test_stacktrace_map()
 	if (stackmap_fd < 0)
 		goto disable_pmu;
 
+	stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap");
+	if (stack_amap_fd < 0)
+		goto disable_pmu;
+
 	/* give some time for bpf program run */
 	sleep(1);
 
@@ -987,6 +1027,12 @@ static void test_stacktrace_map()
 		  "err %d errno %d\n", err, errno))
 		goto disable_pmu_noerr;
 
+	stack_trace_len = PERF_MAX_STACK_DEPTH * sizeof(__u64);
+	err = compare_stack_ips(stackmap_fd, stack_amap_fd, stack_trace_len);
+	if (CHECK(err, "compare_stack_ips stackmap vs. stack_amap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu_noerr;
+
 	goto disable_pmu_noerr;
 disable_pmu:
 	error_cnt++;
@@ -1080,9 +1126,9 @@ err:
 
 static void test_stacktrace_build_id(void)
 {
-	int control_map_fd, stackid_hmap_fd, stackmap_fd;
+	int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd;
 	const char *file = "./test_stacktrace_build_id.o";
-	int bytes, efd, err, pmu_fd, prog_fd;
+	int bytes, efd, err, pmu_fd, prog_fd, stack_trace_len;
 	struct perf_event_attr attr = {};
 	__u32 key, previous_key, val, duration = 0;
 	struct bpf_object *obj;
@@ -1147,6 +1193,11 @@ static void test_stacktrace_build_id(void)
 		  err, errno))
 		goto disable_pmu;
 
+	stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap");
+	if (CHECK(stack_amap_fd < 0, "bpf_find_map stack_amap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
 	assert(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null")
 	       == 0);
 	assert(system("./urandom_read if=/dev/urandom of=/dev/zero count=4 2> /dev/null") == 0);
@@ -1198,8 +1249,15 @@ static void test_stacktrace_build_id(void)
 		previous_key = key;
 	} while (bpf_map_get_next_key(stackmap_fd, &previous_key, &key) == 0);
 
-	CHECK(build_id_matches < 1, "build id match",
-	      "Didn't find expected build ID from the map");
+	if (CHECK(build_id_matches < 1, "build id match",
+		  "Didn't find expected build ID from the map"))
+		goto disable_pmu;
+
+	stack_trace_len = PERF_MAX_STACK_DEPTH
+		* sizeof(struct bpf_stack_build_id);
+	err = compare_stack_ips(stackmap_fd, stack_amap_fd, stack_trace_len);
+	CHECK(err, "compare_stack_ips stackmap vs. stack_amap",
+	      "err %d errno %d\n", err, errno);
 
 disable_pmu:
 	ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE);
diff --git a/tools/testing/selftests/bpf/test_stacktrace_build_id.c b/tools/testing/selftests/bpf/test_stacktrace_build_id.c
index b755bd783ce5..d86c281e957f 100644
--- a/tools/testing/selftests/bpf/test_stacktrace_build_id.c
+++ b/tools/testing/selftests/bpf/test_stacktrace_build_id.c
@@ -19,7 +19,7 @@ struct bpf_map_def SEC("maps") stackid_hmap = {
 	.type = BPF_MAP_TYPE_HASH,
 	.key_size = sizeof(__u32),
 	.value_size = sizeof(__u32),
-	.max_entries = 10000,
+	.max_entries = 16384,
 };
 
 struct bpf_map_def SEC("maps") stackmap = {
@@ -31,6 +31,14 @@ struct bpf_map_def SEC("maps") stackmap = {
 	.map_flags = BPF_F_STACK_BUILD_ID,
 };
 
+struct bpf_map_def SEC("maps") stack_amap = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(struct bpf_stack_build_id)
+		* PERF_MAX_STACK_DEPTH,
+	.max_entries = 128,
+};
+
 /* taken from /sys/kernel/debug/tracing/events/random/urandom_read/format */
 struct random_urandom_args {
 	unsigned long long pad;
@@ -42,7 +50,10 @@ struct random_urandom_args {
 SEC("tracepoint/random/urandom_read")
 int oncpu(struct random_urandom_args *args)
 {
+	__u32 max_len = sizeof(struct bpf_stack_build_id)
+			* PERF_MAX_STACK_DEPTH;
 	__u32 key = 0, val = 0, *value_p;
+	void *stack_p;
 
 	value_p = bpf_map_lookup_elem(&control_map, &key);
 	if (value_p && *value_p)
@@ -50,8 +61,13 @@ int oncpu(struct random_urandom_args *args)
 
 	/* The size of stackmap and stackid_hmap should be the same */
 	key = bpf_get_stackid(args, &stackmap, BPF_F_USER_STACK);
-	if ((int)key >= 0)
+	if ((int)key >= 0) {
 		bpf_map_update_elem(&stackid_hmap, &key, &val, 0);
+		stack_p = bpf_map_lookup_elem(&stack_amap, &key);
+		if (stack_p)
+			bpf_get_stack(args, stack_p, max_len,
+				      BPF_F_USER_STACK | BPF_F_USER_BUILD_ID);
+	}
 
 	return 0;
 }
diff --git a/tools/testing/selftests/bpf/test_stacktrace_map.c b/tools/testing/selftests/bpf/test_stacktrace_map.c
index 76d85c5d08bd..af111af7ca1a 100644
--- a/tools/testing/selftests/bpf/test_stacktrace_map.c
+++ b/tools/testing/selftests/bpf/test_stacktrace_map.c
@@ -19,14 +19,21 @@ struct bpf_map_def SEC("maps") stackid_hmap = {
 	.type = BPF_MAP_TYPE_HASH,
 	.key_size = sizeof(__u32),
 	.value_size = sizeof(__u32),
-	.max_entries = 10000,
+	.max_entries = 16384,
 };
 
 struct bpf_map_def SEC("maps") stackmap = {
 	.type = BPF_MAP_TYPE_STACK_TRACE,
 	.key_size = sizeof(__u32),
 	.value_size = sizeof(__u64) * PERF_MAX_STACK_DEPTH,
-	.max_entries = 10000,
+	.max_entries = 16384,
+};
+
+struct bpf_map_def SEC("maps") stack_amap = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(__u64) * PERF_MAX_STACK_DEPTH,
+	.max_entries = 16384,
 };
 
 /* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */
@@ -44,7 +51,9 @@ struct sched_switch_args {
 SEC("tracepoint/sched/sched_switch")
 int oncpu(struct sched_switch_args *ctx)
 {
+	__u32 max_len = PERF_MAX_STACK_DEPTH * sizeof(__u64);
 	__u32 key = 0, val = 0, *value_p;
+	void *stack_p;
 
 	value_p = bpf_map_lookup_elem(&control_map, &key);
 	if (value_p && *value_p)
@@ -52,8 +61,12 @@ int oncpu(struct sched_switch_args *ctx)
 
 	/* The size of stackmap and stackid_hmap should be the same */
 	key = bpf_get_stackid(ctx, &stackmap, 0);
-	if ((int)key >= 0)
+	if ((int)key >= 0) {
 		bpf_map_update_elem(&stackid_hmap, &key, &val, 0);
+		stack_p = bpf_map_lookup_elem(&stack_amap, &key);
+		if (stack_p)
+			bpf_get_stack(ctx, stack_p, max_len, 0);
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From a3ef8e9a4d7cc26d7528d50d10c5720b523b07c9 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Sat, 28 Apr 2018 16:06:19 -0700
Subject: bpf: Fix helpers ctx struct types in uapi doc

Helpers may operate on two types of ctx structures: user visible ones
(e.g. `struct bpf_sock_ops`) when used in user programs, and kernel ones
(e.g. `struct bpf_sock_ops_kern`) in kernel implementation.

UAPI documentation must refer to only user visible structures.

The patch replaces references to `_kern` structures in BPF helpers
description by corresponding user visible structures.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1afb606a18b9..23b334bba1a6 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1361,7 +1361,7 @@ union bpf_attr {
  * 	Return
  * 		0
  *
- * int bpf_setsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
+ * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen)
  * 	Description
  * 		Emulate a call to **setsockopt()** on the socket associated to
  * 		*bpf_socket*, which must be a full socket. The *level* at
@@ -1435,7 +1435,7 @@ union bpf_attr {
  * 	Return
  * 		**SK_PASS** on success, or **SK_DROP** on error.
  *
- * int bpf_sock_map_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
+ * int bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags)
  * 	Description
  * 		Add an entry to, or update a *map* referencing sockets. The
  * 		*skops* is used as a new value for the entry associated to
@@ -1533,7 +1533,7 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_perf_prog_read_value(struct bpf_perf_event_data_kern *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
+ * int bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
  * 	Description
  * 		For en eBPF program attached to a perf event, retrieve the
  * 		value of the event counter associated to *ctx* and store it in
@@ -1544,7 +1544,7 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_getsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
+ * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen)
  * 	Description
  * 		Emulate a call to **getsockopt()** on the socket associated to
  * 		*bpf_socket*, which must be a full socket. The *level* at
@@ -1588,7 +1588,7 @@ union bpf_attr {
  * 	Return
  * 		0
  *
- * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops_kern *bpf_sock, int argval)
+ * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval)
  * 	Description
  * 		Attempt to set the value of the **bpf_sock_ops_cb_flags** field
  * 		for the full TCP socket associated to *bpf_sock_ops* to
@@ -1721,7 +1721,7 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_bind(struct bpf_sock_addr_kern *ctx, struct sockaddr *addr, int addr_len)
+ * int bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len)
  * 	Description
  * 		Bind the socket associated to *ctx* to the address pointed by
  * 		*addr*, of length *addr_len*. This allows for making outgoing
-- 
cgit v1.2.3-59-g8ed1b


From 96871b9f6fc28847becf60e07477e44f6f74794f Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Sat, 28 Apr 2018 16:06:20 -0700
Subject: bpf: Sync bpf.h to tools/

The patch syncs bpf.h to tools/.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/include/uapi/linux/bpf.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 1afb606a18b9..23b334bba1a6 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1361,7 +1361,7 @@ union bpf_attr {
  * 	Return
  * 		0
  *
- * int bpf_setsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
+ * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen)
  * 	Description
  * 		Emulate a call to **setsockopt()** on the socket associated to
  * 		*bpf_socket*, which must be a full socket. The *level* at
@@ -1435,7 +1435,7 @@ union bpf_attr {
  * 	Return
  * 		**SK_PASS** on success, or **SK_DROP** on error.
  *
- * int bpf_sock_map_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
+ * int bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags)
  * 	Description
  * 		Add an entry to, or update a *map* referencing sockets. The
  * 		*skops* is used as a new value for the entry associated to
@@ -1533,7 +1533,7 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_perf_prog_read_value(struct bpf_perf_event_data_kern *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
+ * int bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
  * 	Description
  * 		For en eBPF program attached to a perf event, retrieve the
  * 		value of the event counter associated to *ctx* and store it in
@@ -1544,7 +1544,7 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_getsockopt(struct bpf_sock_ops_kern *bpf_socket, int level, int optname, char *optval, int optlen)
+ * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen)
  * 	Description
  * 		Emulate a call to **getsockopt()** on the socket associated to
  * 		*bpf_socket*, which must be a full socket. The *level* at
@@ -1588,7 +1588,7 @@ union bpf_attr {
  * 	Return
  * 		0
  *
- * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops_kern *bpf_sock, int argval)
+ * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval)
  * 	Description
  * 		Attempt to set the value of the **bpf_sock_ops_cb_flags** field
  * 		for the full TCP socket associated to *bpf_sock_ops* to
@@ -1721,7 +1721,7 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_bind(struct bpf_sock_addr_kern *ctx, struct sockaddr *addr, int addr_len)
+ * int bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len)
  * 	Description
  * 		Bind the socket associated to *ctx* to the address pointed by
  * 		*addr*, of length *addr_len*. This allows for making outgoing
-- 
cgit v1.2.3-59-g8ed1b


From 7ef3771205302a71a8bb4a2286ef98e5cade5d1a Mon Sep 17 00:00:00 2001
From: Teng Qin <qinteng@fb.com>
Date: Sat, 28 Apr 2018 23:39:29 -0700
Subject: bpf: Allow bpf_current_task_under_cgroup in interrupt

Currently, the bpf_current_task_under_cgroup helper has a check where if
the BPF program is running in_interrupt(), it will return -EINVAL. This
prevents the helper to be used in many useful scenarios, particularly
BPF programs attached to Perf Events.

This commit removes the check. Tested a few NMI (Perf Event) and some
softirq context, the helper returns the correct result.

Signed-off-by: Teng Qin <qinteng@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/trace/bpf_trace.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 46d866e9937c..ce2cbbff27e4 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -475,8 +475,6 @@ BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	struct cgroup *cgrp;
 
-	if (unlikely(in_interrupt()))
-		return -EINVAL;
 	if (unlikely(idx >= array->map.max_entries))
 		return -E2BIG;
 
-- 
cgit v1.2.3-59-g8ed1b


From b28f872dc48a4d55cba9d51cf3bac03ca32b2747 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Thu, 26 Apr 2018 21:56:44 -0400
Subject: net: dsa: mv88e6xxx: move trunk setup

Move the trunking setup out of Global 2 specific setup into the top
level mv88e6xxx_setup function.

Note that the 88E6390 family calls this LAG instead of Trunk and
supports 32 possible ID routing vectors, with LAG ID bit 4 being placed
in Global 2 register 0x1D...

We don't need Trunk (or LAG) IDs for the moment, thus keep it simple.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/chip.c    | 13 +++++++++++++
 drivers/net/dsa/mv88e6xxx/global2.c |  7 +------
 drivers/net/dsa/mv88e6xxx/global2.h |  7 +++++++
 3 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index a13a7005ffa0..02ff41338491 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -1026,6 +1026,15 @@ static void mv88e6xxx_port_stp_state_set(struct dsa_switch *ds, int port,
 		dev_err(ds->dev, "p%d: failed to update state\n", port);
 }
 
+static int mv88e6xxx_trunk_setup(struct mv88e6xxx_chip *chip)
+{
+	/* Clear all trunk masks and mapping */
+	if (chip->info->global2_addr)
+		return mv88e6xxx_g2_trunk_clear(chip);
+
+	return 0;
+}
+
 static int mv88e6xxx_pot_setup(struct mv88e6xxx_chip *chip)
 {
 	if (chip->info->ops->pot_clear)
@@ -2239,6 +2248,10 @@ static int mv88e6xxx_setup(struct dsa_switch *ds)
 	if (err)
 		goto unlock;
 
+	err = mv88e6xxx_trunk_setup(chip);
+	if (err)
+		goto unlock;
+
 	/* Setup PTP Hardware Clock and timestamping */
 	if (chip->info->ptp_support) {
 		err = mv88e6xxx_ptp_setup(chip);
diff --git a/drivers/net/dsa/mv88e6xxx/global2.c b/drivers/net/dsa/mv88e6xxx/global2.c
index 0ce627fded48..3f24763aedc5 100644
--- a/drivers/net/dsa/mv88e6xxx/global2.c
+++ b/drivers/net/dsa/mv88e6xxx/global2.c
@@ -174,7 +174,7 @@ static int mv88e6xxx_g2_trunk_mapping_write(struct mv88e6xxx_chip *chip, int id,
 	return mv88e6xxx_g2_update(chip, MV88E6XXX_G2_TRUNK_MAPPING, val);
 }
 
-static int mv88e6xxx_g2_clear_trunk(struct mv88e6xxx_chip *chip)
+int mv88e6xxx_g2_trunk_clear(struct mv88e6xxx_chip *chip)
 {
 	const u16 port_mask = BIT(mv88e6xxx_num_ports(chip)) - 1;
 	int i, err;
@@ -1159,10 +1159,5 @@ int mv88e6xxx_g2_setup(struct mv88e6xxx_chip *chip)
 	if (err)
 		return err;
 
-	/* Clear all trunk masks and mapping. */
-	err = mv88e6xxx_g2_clear_trunk(chip);
-	if (err)
-		return err;
-
 	return 0;
 }
diff --git a/drivers/net/dsa/mv88e6xxx/global2.h b/drivers/net/dsa/mv88e6xxx/global2.h
index 520ec70d32e8..7bd4ab31a93e 100644
--- a/drivers/net/dsa/mv88e6xxx/global2.h
+++ b/drivers/net/dsa/mv88e6xxx/global2.h
@@ -327,6 +327,8 @@ int mv88e6352_g2_mgmt_rsvd2cpu(struct mv88e6xxx_chip *chip);
 
 int mv88e6xxx_g2_pot_clear(struct mv88e6xxx_chip *chip);
 
+int mv88e6xxx_g2_trunk_clear(struct mv88e6xxx_chip *chip);
+
 extern const struct mv88e6xxx_irq_ops mv88e6097_watchdog_ops;
 extern const struct mv88e6xxx_irq_ops mv88e6390_watchdog_ops;
 
@@ -495,6 +497,11 @@ static inline int mv88e6xxx_g2_scratch_gpio_set_smi(struct mv88e6xxx_chip *chip,
 	return -EOPNOTSUPP;
 }
 
+static inline int mv88e6xxx_g2_trunk_clear(struct mv88e6xxx_chip *chip)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif /* CONFIG_NET_DSA_MV88E6XXX_GLOBAL2 */
 
 #endif /* _MV88E6XXX_GLOBAL2_H */
-- 
cgit v1.2.3-59-g8ed1b


From c7f047b6c7bc3f6740a8fb1f2d03ecb19022fdc0 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Thu, 26 Apr 2018 21:56:45 -0400
Subject: net: dsa: mv88e6xxx: move device mapping setup

Move the Device Mapping setup out of the specific Global 2 code,
into the top level device setup function.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/chip.c    | 27 +++++++++++++++++++++++++++
 drivers/net/dsa/mv88e6xxx/global2.c | 37 ++++++-------------------------------
 drivers/net/dsa/mv88e6xxx/global2.h | 12 +++++++++++-
 3 files changed, 44 insertions(+), 32 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 02ff41338491..d1ca4d6ef9be 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -1026,6 +1026,29 @@ static void mv88e6xxx_port_stp_state_set(struct dsa_switch *ds, int port,
 		dev_err(ds->dev, "p%d: failed to update state\n", port);
 }
 
+static int mv88e6xxx_devmap_setup(struct mv88e6xxx_chip *chip)
+{
+	int target, port;
+	int err;
+
+	if (!chip->info->global2_addr)
+		return 0;
+
+	/* Initialize the routing port to the 32 possible target devices */
+	for (target = 0; target < 32; target++) {
+		port = 0x1f;
+		if (target < DSA_MAX_SWITCHES)
+			if (chip->ds->rtable[target] != DSA_RTABLE_NONE)
+				port = chip->ds->rtable[target];
+
+		err = mv88e6xxx_g2_device_mapping_write(chip, target, port);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 static int mv88e6xxx_trunk_setup(struct mv88e6xxx_chip *chip)
 {
 	/* Clear all trunk masks and mapping */
@@ -2252,6 +2275,10 @@ static int mv88e6xxx_setup(struct dsa_switch *ds)
 	if (err)
 		goto unlock;
 
+	err = mv88e6xxx_devmap_setup(chip);
+	if (err)
+		goto unlock;
+
 	/* Setup PTP Hardware Clock and timestamping */
 	if (chip->info->ptp_support) {
 		err = mv88e6xxx_ptp_setup(chip);
diff --git a/drivers/net/dsa/mv88e6xxx/global2.c b/drivers/net/dsa/mv88e6xxx/global2.c
index 3f24763aedc5..96e74d8d500d 100644
--- a/drivers/net/dsa/mv88e6xxx/global2.c
+++ b/drivers/net/dsa/mv88e6xxx/global2.c
@@ -119,37 +119,17 @@ int mv88e6352_g2_mgmt_rsvd2cpu(struct mv88e6xxx_chip *chip)
 
 /* Offset 0x06: Device Mapping Table register */
 
-static int mv88e6xxx_g2_device_mapping_write(struct mv88e6xxx_chip *chip,
-					     int target, int port)
+int mv88e6xxx_g2_device_mapping_write(struct mv88e6xxx_chip *chip, int target,
+				      int port)
 {
-	u16 val = (target << 8) | (port & 0xf);
+	u16 val = (target << 8) | (port & 0x1f);
+	/* Modern chips use 5 bits to define a device mapping port,
+	 * but bit 4 is reserved on older chips, so it is safe to use.
+	 */
 
 	return mv88e6xxx_g2_update(chip, MV88E6XXX_G2_DEVICE_MAPPING, val);
 }
 
-static int mv88e6xxx_g2_set_device_mapping(struct mv88e6xxx_chip *chip)
-{
-	int target, port;
-	int err;
-
-	/* Initialize the routing port to the 32 possible target devices */
-	for (target = 0; target < 32; ++target) {
-		port = 0xf;
-
-		if (target < DSA_MAX_SWITCHES) {
-			port = chip->ds->rtable[target];
-			if (port == DSA_RTABLE_NONE)
-				port = 0xf;
-		}
-
-		err = mv88e6xxx_g2_device_mapping_write(chip, target, port);
-		if (err)
-			break;
-	}
-
-	return err;
-}
-
 /* Offset 0x07: Trunk Mask Table register */
 
 static int mv88e6xxx_g2_trunk_mask_write(struct mv88e6xxx_chip *chip, int num,
@@ -1154,10 +1134,5 @@ int mv88e6xxx_g2_setup(struct mv88e6xxx_chip *chip)
 	if (err)
 		return err;
 
-	/* Program the DSA routing table. */
-	err = mv88e6xxx_g2_set_device_mapping(chip);
-	if (err)
-		return err;
-
 	return 0;
 }
diff --git a/drivers/net/dsa/mv88e6xxx/global2.h b/drivers/net/dsa/mv88e6xxx/global2.h
index 7bd4ab31a93e..46913a289255 100644
--- a/drivers/net/dsa/mv88e6xxx/global2.h
+++ b/drivers/net/dsa/mv88e6xxx/global2.h
@@ -60,7 +60,8 @@
 #define MV88E6XXX_G2_DEVICE_MAPPING		0x06
 #define MV88E6XXX_G2_DEVICE_MAPPING_UPDATE	0x8000
 #define MV88E6XXX_G2_DEVICE_MAPPING_DEV_MASK	0x1f00
-#define MV88E6XXX_G2_DEVICE_MAPPING_PORT_MASK	0x000f
+#define MV88E6352_G2_DEVICE_MAPPING_PORT_MASK	0x000f
+#define MV88E6390_G2_DEVICE_MAPPING_PORT_MASK	0x001f
 
 /* Offset 0x07: Trunk Mask Table Register */
 #define MV88E6XXX_G2_TRUNK_MASK			0x07
@@ -329,6 +330,9 @@ int mv88e6xxx_g2_pot_clear(struct mv88e6xxx_chip *chip);
 
 int mv88e6xxx_g2_trunk_clear(struct mv88e6xxx_chip *chip);
 
+int mv88e6xxx_g2_device_mapping_write(struct mv88e6xxx_chip *chip, int target,
+				      int port);
+
 extern const struct mv88e6xxx_irq_ops mv88e6097_watchdog_ops;
 extern const struct mv88e6xxx_irq_ops mv88e6390_watchdog_ops;
 
@@ -502,6 +506,12 @@ static inline int mv88e6xxx_g2_trunk_clear(struct mv88e6xxx_chip *chip)
 	return -EOPNOTSUPP;
 }
 
+static inline int mv88e6xxx_g2_device_mapping_write(struct mv88e6xxx_chip *chip,
+						    int target, int port)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif /* CONFIG_NET_DSA_MV88E6XXX_GLOBAL2 */
 
 #endif /* _MV88E6XXX_GLOBAL2_H */
-- 
cgit v1.2.3-59-g8ed1b


From 5d49d60307aaa69541beafdd17bd3d0ea238732b Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Thu, 26 Apr 2018 21:56:46 -0400
Subject: net: dsa: mv88e6xxx: remove Global 2 setup

The remaining values written to the Switch Management Register in the
mv88e6xxx_g2_setup function are specific to 88E6352 and older, and are
the default values anyway.

Thus remove completely this function. The mv88e6xxx driver no more
contains setup code to access arbitrary Global 2 registers.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/chip.c    |  7 -------
 drivers/net/dsa/mv88e6xxx/global2.c | 18 ------------------
 drivers/net/dsa/mv88e6xxx/global2.h |  6 ------
 3 files changed, 31 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index d1ca4d6ef9be..9d62e4acc01b 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -2228,13 +2228,6 @@ static int mv88e6xxx_setup(struct dsa_switch *ds)
 	if (err)
 		goto unlock;
 
-	/* Setup Switch Global 2 Registers */
-	if (chip->info->global2_addr) {
-		err = mv88e6xxx_g2_setup(chip);
-		if (err)
-			goto unlock;
-	}
-
 	err = mv88e6xxx_irl_setup(chip);
 	if (err)
 		goto unlock;
diff --git a/drivers/net/dsa/mv88e6xxx/global2.c b/drivers/net/dsa/mv88e6xxx/global2.c
index 96e74d8d500d..e6d658181b27 100644
--- a/drivers/net/dsa/mv88e6xxx/global2.c
+++ b/drivers/net/dsa/mv88e6xxx/global2.c
@@ -1118,21 +1118,3 @@ void mv88e6xxx_g2_irq_mdio_free(struct mv88e6xxx_chip *chip,
 	for (phy = 0; phy < chip->info->num_internal_phys; phy++)
 		irq_dispose_mapping(bus->irq[phy]);
 }
-
-int mv88e6xxx_g2_setup(struct mv88e6xxx_chip *chip)
-{
-	u16 reg;
-	int err;
-
-	/* Ignore removed tag data on doubly tagged packets, disable
-	 * flow control messages, force flow control priority to the
-	 * highest, and send all special multicast frames to the CPU
-	 * port at the highest priority.
-	 */
-	reg = MV88E6XXX_G2_SWITCH_MGMT_FORCE_FLOW_CTL_PRI | (0x7 << 4);
-	err = mv88e6xxx_g2_write(chip, MV88E6XXX_G2_SWITCH_MGMT, reg);
-	if (err)
-		return err;
-
-	return 0;
-}
diff --git a/drivers/net/dsa/mv88e6xxx/global2.h b/drivers/net/dsa/mv88e6xxx/global2.h
index 46913a289255..37e8ce2c72a0 100644
--- a/drivers/net/dsa/mv88e6xxx/global2.h
+++ b/drivers/net/dsa/mv88e6xxx/global2.h
@@ -314,7 +314,6 @@ int mv88e6xxx_g2_pvt_write(struct mv88e6xxx_chip *chip, int src_dev,
 			   int src_port, u16 data);
 int mv88e6xxx_g2_misc_4_bit_port(struct mv88e6xxx_chip *chip);
 
-int mv88e6xxx_g2_setup(struct mv88e6xxx_chip *chip);
 int mv88e6xxx_g2_irq_setup(struct mv88e6xxx_chip *chip);
 void mv88e6xxx_g2_irq_free(struct mv88e6xxx_chip *chip);
 
@@ -447,11 +446,6 @@ static inline int mv88e6xxx_g2_misc_4_bit_port(struct mv88e6xxx_chip *chip)
 	return -EOPNOTSUPP;
 }
 
-static inline int mv88e6xxx_g2_setup(struct mv88e6xxx_chip *chip)
-{
-	return -EOPNOTSUPP;
-}
-
 static inline int mv88e6xxx_g2_irq_setup(struct mv88e6xxx_chip *chip)
 {
 	return -EOPNOTSUPP;
-- 
cgit v1.2.3-59-g8ed1b


From 05255b823a6173525587f29c4e8f1ca33fd7677d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 27 Apr 2018 08:58:08 -0700
Subject: tcp: add TCP_ZEROCOPY_RECEIVE support for zerocopy receive

When adding tcp mmap() implementation, I forgot that socket lock
had to be taken before current->mm->mmap_sem. syzbot eventually caught
the bug.

Since we can not lock the socket in tcp mmap() handler we have to
split the operation in two phases.

1) mmap() on a tcp socket simply reserves VMA space, and nothing else.
  This operation does not involve any TCP locking.

2) getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) implements
 the transfert of pages from skbs to one VMA.
  This operation only uses down_read(&current->mm->mmap_sem) after
  holding TCP lock, thus solving the lockdep issue.

This new implementation was suggested by Andy Lutomirski with great details.

Benefits are :

- Better scalability, in case multiple threads reuse VMAS
   (without mmap()/munmap() calls) since mmap_sem wont be write locked.

- Better error recovery.
   The previous mmap() model had to provide the expected size of the
   mapping. If for some reason one part could not be mapped (partial MSS),
   the whole operation had to be aborted.
   With the tcp_zerocopy_receive struct, kernel can report how
   many bytes were successfuly mapped, and how many bytes should
   be read to skip the problematic sequence.

- No more memory allocation to hold an array of page pointers.
  16 MB mappings needed 32 KB for this array, potentially using vmalloc() :/

- skbs are freed while mmap_sem has been released

Following patch makes the change in tcp_mmap tool to demonstrate
one possible use of mmap() and setsockopt(... TCP_ZEROCOPY_RECEIVE ...)

Note that memcg might require additional changes.

Fixes: 93ab6cc69162 ("tcp: implement mmap() for zero copy receive")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Suggested-by: Andy Lutomirski <luto@kernel.org>
Cc: linux-mm@kvack.org
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tcp.h |   8 ++
 net/ipv4/af_inet.c       |   2 +
 net/ipv4/tcp.c           | 194 +++++++++++++++++++++++++----------------------
 net/ipv6/af_inet6.c      |   2 +
 4 files changed, 116 insertions(+), 90 deletions(-)

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 379b08700a54..e9e8373b34b9 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -122,6 +122,7 @@ enum {
 #define TCP_MD5SIG_EXT		32	/* TCP MD5 Signature with extensions */
 #define TCP_FASTOPEN_KEY	33	/* Set the key for Fast Open (cookie) */
 #define TCP_FASTOPEN_NO_COOKIE	34	/* Enable TFO without a TFO cookie */
+#define TCP_ZEROCOPY_RECEIVE	35
 
 struct tcp_repair_opt {
 	__u32	opt_code;
@@ -276,4 +277,11 @@ struct tcp_diag_md5sig {
 	__u8	tcpm_key[TCP_MD5SIG_MAXKEYLEN];
 };
 
+/* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */
+
+struct tcp_zerocopy_receive {
+	__u64 address;		/* in: address of mapping */
+	__u32 length;		/* in/out: number of bytes to map/mapped */
+	__u32 recv_skip_hint;	/* out: amount of bytes to skip */
+};
 #endif /* _UAPI_LINUX_TCP_H */
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 3ebf599cebae..b403499fdabe 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -994,7 +994,9 @@ const struct proto_ops inet_stream_ops = {
 	.getsockopt	   = sock_common_getsockopt,
 	.sendmsg	   = inet_sendmsg,
 	.recvmsg	   = inet_recvmsg,
+#ifdef CONFIG_MMU
 	.mmap		   = tcp_mmap,
+#endif
 	.sendpage	   = inet_sendpage,
 	.splice_read	   = tcp_splice_read,
 	.read_sock	   = tcp_read_sock,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index dfd090ea54ad..4028ddd14dd5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1726,118 +1726,113 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
 }
 EXPORT_SYMBOL(tcp_set_rcvlowat);
 
-/* When user wants to mmap X pages, we first need to perform the mapping
- * before freeing any skbs in receive queue, otherwise user would be unable
- * to fallback to standard recvmsg(). This happens if some data in the
- * requested block is not exactly fitting in a page.
- *
- * We only support order-0 pages for the moment.
- * mmap() on TCP is very strict, there is no point
- * trying to accommodate with pathological layouts.
- */
+#ifdef CONFIG_MMU
+static const struct vm_operations_struct tcp_vm_ops = {
+};
+
 int tcp_mmap(struct file *file, struct socket *sock,
 	     struct vm_area_struct *vma)
 {
-	unsigned long size = vma->vm_end - vma->vm_start;
-	unsigned int nr_pages = size >> PAGE_SHIFT;
-	struct page **pages_array = NULL;
-	u32 seq, len, offset, nr = 0;
-	struct sock *sk = sock->sk;
-	const skb_frag_t *frags;
+	if (vma->vm_flags & (VM_WRITE | VM_EXEC))
+		return -EPERM;
+	vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
+
+	/* Instruct vm_insert_page() to not down_read(mmap_sem) */
+	vma->vm_flags |= VM_MIXEDMAP;
+
+	vma->vm_ops = &tcp_vm_ops;
+	return 0;
+}
+EXPORT_SYMBOL(tcp_mmap);
+
+static int tcp_zerocopy_receive(struct sock *sk,
+				struct tcp_zerocopy_receive *zc)
+{
+	unsigned long address = (unsigned long)zc->address;
+	const skb_frag_t *frags = NULL;
+	u32 length = 0, seq, offset;
+	struct vm_area_struct *vma;
+	struct sk_buff *skb = NULL;
 	struct tcp_sock *tp;
-	struct sk_buff *skb;
 	int ret;
 
-	if (vma->vm_pgoff || !nr_pages)
+	if (address & (PAGE_SIZE - 1) || address != zc->address)
 		return -EINVAL;
 
-	if (vma->vm_flags & VM_WRITE)
-		return -EPERM;
-	/* TODO: Maybe the following is not needed if pages are COW */
-	vma->vm_flags &= ~VM_MAYWRITE;
-
-	lock_sock(sk);
-
-	ret = -ENOTCONN;
 	if (sk->sk_state == TCP_LISTEN)
-		goto out;
+		return -ENOTCONN;
 
 	sock_rps_record_flow(sk);
 
-	if (tcp_inq(sk) < size) {
-		ret = sock_flag(sk, SOCK_DONE) ? -EIO : -EAGAIN;
+	down_read(&current->mm->mmap_sem);
+
+	ret = -EINVAL;
+	vma = find_vma(current->mm, address);
+	if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops)
 		goto out;
-	}
+	zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
+
 	tp = tcp_sk(sk);
 	seq = tp->copied_seq;
-	/* Abort if urgent data is in the area */
-	if (unlikely(tp->urg_data)) {
-		u32 urg_offset = tp->urg_seq - seq;
+	zc->length = min_t(u32, zc->length, tcp_inq(sk));
+	zc->length &= ~(PAGE_SIZE - 1);
 
-		ret = -EINVAL;
-		if (urg_offset < size)
-			goto out;
-	}
-	ret = -ENOMEM;
-	pages_array = kvmalloc_array(nr_pages, sizeof(struct page *),
-				     GFP_KERNEL);
-	if (!pages_array)
-		goto out;
-	skb = tcp_recv_skb(sk, seq, &offset);
-	ret = -EINVAL;
-skb_start:
-	/* We do not support anything not in page frags */
-	offset -= skb_headlen(skb);
-	if ((int)offset < 0)
-		goto out;
-	if (skb_has_frag_list(skb))
-		goto out;
-	len = skb->data_len - offset;
-	frags = skb_shinfo(skb)->frags;
-	while (offset) {
-		if (frags->size > offset)
-			goto out;
-		offset -= frags->size;
-		frags++;
-	}
-	while (nr < nr_pages) {
-		if (len) {
-			if (len < PAGE_SIZE)
-				goto out;
-			if (frags->size != PAGE_SIZE || frags->page_offset)
-				goto out;
-			pages_array[nr++] = skb_frag_page(frags);
-			frags++;
-			len -= PAGE_SIZE;
-			seq += PAGE_SIZE;
-			continue;
+	zap_page_range(vma, address, zc->length);
+
+	zc->recv_skip_hint = 0;
+	ret = 0;
+	while (length + PAGE_SIZE <= zc->length) {
+		if (zc->recv_skip_hint < PAGE_SIZE) {
+			if (skb) {
+				skb = skb->next;
+				offset = seq - TCP_SKB_CB(skb)->seq;
+			} else {
+				skb = tcp_recv_skb(sk, seq, &offset);
+			}
+
+			zc->recv_skip_hint = skb->len - offset;
+			offset -= skb_headlen(skb);
+			if ((int)offset < 0 || skb_has_frag_list(skb))
+				break;
+			frags = skb_shinfo(skb)->frags;
+			while (offset) {
+				if (frags->size > offset)
+					goto out;
+				offset -= frags->size;
+				frags++;
+			}
 		}
-		skb = skb->next;
-		offset = seq - TCP_SKB_CB(skb)->seq;
-		goto skb_start;
-	}
-	/* OK, we have a full set of pages ready to be inserted into vma */
-	for (nr = 0; nr < nr_pages; nr++) {
-		ret = vm_insert_page(vma, vma->vm_start + (nr << PAGE_SHIFT),
-				     pages_array[nr]);
+		if (frags->size != PAGE_SIZE || frags->page_offset)
+			break;
+		ret = vm_insert_page(vma, address + length,
+				     skb_frag_page(frags));
 		if (ret)
-			goto out;
+			break;
+		length += PAGE_SIZE;
+		seq += PAGE_SIZE;
+		zc->recv_skip_hint -= PAGE_SIZE;
+		frags++;
 	}
-	/* operation is complete, we can 'consume' all skbs */
-	tp->copied_seq = seq;
-	tcp_rcv_space_adjust(sk);
-
-	/* Clean up data we have read: This will do ACK frames. */
-	tcp_recv_skb(sk, seq, &offset);
-	tcp_cleanup_rbuf(sk, size);
-
-	ret = 0;
 out:
-	release_sock(sk);
-	kvfree(pages_array);
+	up_read(&current->mm->mmap_sem);
+	if (length) {
+		tp->copied_seq = seq;
+		tcp_rcv_space_adjust(sk);
+
+		/* Clean up data we have read: This will do ACK frames. */
+		tcp_recv_skb(sk, seq, &offset);
+		tcp_cleanup_rbuf(sk, length);
+		ret = 0;
+		if (length == zc->length)
+			zc->recv_skip_hint = 0;
+	} else {
+		if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
+			ret = -EIO;
+	}
+	zc->length = length;
 	return ret;
 }
-EXPORT_SYMBOL(tcp_mmap);
+#endif
 
 static void tcp_update_recv_tstamps(struct sk_buff *skb,
 				    struct scm_timestamping *tss)
@@ -3472,6 +3467,25 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 		}
 		return 0;
 	}
+#ifdef CONFIG_MMU
+	case TCP_ZEROCOPY_RECEIVE: {
+		struct tcp_zerocopy_receive zc;
+		int err;
+
+		if (get_user(len, optlen))
+			return -EFAULT;
+		if (len != sizeof(zc))
+			return -EINVAL;
+		if (copy_from_user(&zc, optval, len))
+			return -EFAULT;
+		lock_sock(sk);
+		err = tcp_zerocopy_receive(sk, &zc);
+		release_sock(sk);
+		if (!err && copy_to_user(optval, &zc, len))
+			err = -EFAULT;
+		return err;
+	}
+#endif
 	default:
 		return -ENOPROTOOPT;
 	}
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 36d622c477b1..d0af96e0d109 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -578,7 +578,9 @@ const struct proto_ops inet6_stream_ops = {
 	.getsockopt	   = sock_common_getsockopt,	/* ok		*/
 	.sendmsg	   = inet_sendmsg,		/* ok		*/
 	.recvmsg	   = inet_recvmsg,		/* ok		*/
+#ifdef CONFIG_MMU
 	.mmap		   = tcp_mmap,
+#endif
 	.sendpage	   = inet_sendpage,
 	.sendmsg_locked    = tcp_sendmsg_locked,
 	.sendpage_locked   = tcp_sendpage_locked,
-- 
cgit v1.2.3-59-g8ed1b


From aacb0c2e524c38d7ab48b0beb86427a36807079f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 27 Apr 2018 08:58:09 -0700
Subject: selftests: net: tcp_mmap must use TCP_ZEROCOPY_RECEIVE

After prior kernel change, mmap() on TCP socket only reserves VMA.

We have to use getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...)
to perform the transfert of pages from skbs in TCP receive queue into such VMA.

struct tcp_zerocopy_receive {
	__u64 address;		/* in: address of mapping */
	__u32 length;		/* in/out: number of bytes to map/mapped */
	__u32 recv_skip_hint;	/* out: amount of bytes to skip */
};

After a successful getsockopt(...TCP_ZEROCOPY_RECEIVE...), @length contains
number of bytes that were mapped, and @recv_skip_hint contains number of bytes
that should be read using conventional read()/recv()/recvmsg() system calls,
to skip a sequence of bytes that can not be mapped, because not properly page
aligned.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Andy Lutomirski <luto@kernel.org>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/tcp_mmap.c | 66 +++++++++++++++++++---------------
 1 file changed, 38 insertions(+), 28 deletions(-)

diff --git a/tools/testing/selftests/net/tcp_mmap.c b/tools/testing/selftests/net/tcp_mmap.c
index dea342fe6f4e..77f762780199 100644
--- a/tools/testing/selftests/net/tcp_mmap.c
+++ b/tools/testing/selftests/net/tcp_mmap.c
@@ -76,9 +76,10 @@
 #include <time.h>
 #include <sys/time.h>
 #include <netinet/in.h>
-#include <netinet/tcp.h>
 #include <arpa/inet.h>
 #include <poll.h>
+#include <linux/tcp.h>
+#include <assert.h>
 
 #ifndef MSG_ZEROCOPY
 #define MSG_ZEROCOPY    0x4000000
@@ -134,11 +135,12 @@ void hash_zone(void *zone, unsigned int length)
 void *child_thread(void *arg)
 {
 	unsigned long total_mmap = 0, total = 0;
+	struct tcp_zerocopy_receive zc;
 	unsigned long delta_usec;
 	int flags = MAP_SHARED;
 	struct timeval t0, t1;
 	char *buffer = NULL;
-	void *oaddr = NULL;
+	void *addr = NULL;
 	double throughput;
 	struct rusage ru;
 	int lu, fd;
@@ -153,41 +155,46 @@ void *child_thread(void *arg)
 		perror("malloc");
 		goto error;
 	}
+	if (zflg) {
+		addr = mmap(NULL, chunk_size, PROT_READ, flags, fd, 0);
+		if (addr == (void *)-1)
+			zflg = 0;
+	}
 	while (1) {
 		struct pollfd pfd = { .fd = fd, .events = POLLIN, };
 		int sub;
 
 		poll(&pfd, 1, 10000);
 		if (zflg) {
-			void *naddr;
-
-			naddr = mmap(oaddr, chunk_size, PROT_READ, flags, fd, 0);
-			if (naddr == (void *)-1) {
-				if (errno == EAGAIN) {
-					/* That is if SO_RCVLOWAT is buggy */
-					usleep(1000);
-					continue;
-				}
-				if (errno == EINVAL) {
-					flags = MAP_SHARED;
-					oaddr = NULL;
-					goto fallback;
-				}
-				if (errno != EIO)
-					perror("mmap()");
+			socklen_t zc_len = sizeof(zc);
+			int res;
+
+			zc.address = (__u64)addr;
+			zc.length = chunk_size;
+			zc.recv_skip_hint = 0;
+			res = getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE,
+					 &zc, &zc_len);
+			if (res == -1)
 				break;
+
+			if (zc.length) {
+				assert(zc.length <= chunk_size);
+				total_mmap += zc.length;
+				if (xflg)
+					hash_zone(addr, zc.length);
+				total += zc.length;
 			}
-			total_mmap += chunk_size;
-			if (xflg)
-				hash_zone(naddr, chunk_size);
-			total += chunk_size;
-			if (!keepflag) {
-				flags |= MAP_FIXED;
-				oaddr = naddr;
+			if (zc.recv_skip_hint) {
+				assert(zc.recv_skip_hint <= chunk_size);
+				lu = read(fd, buffer, zc.recv_skip_hint);
+				if (lu > 0) {
+					if (xflg)
+						hash_zone(buffer, lu);
+					total += lu;
+				}
 			}
 			continue;
 		}
-fallback:
 		sub = 0;
 		while (sub < chunk_size) {
 			lu = read(fd, buffer + sub, chunk_size - sub);
@@ -228,6 +235,8 @@ end:
 error:
 	free(buffer);
 	close(fd);
+	if (zflg)
+		munmap(addr, chunk_size);
 	pthread_exit(0);
 }
 
@@ -371,7 +380,8 @@ int main(int argc, char *argv[])
 		setup_sockaddr(cfg_family, host, &listenaddr);
 
 		if (mss &&
-		    setsockopt(fdlisten, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) {
+		    setsockopt(fdlisten, IPPROTO_TCP, TCP_MAXSEG,
+			       &mss, sizeof(mss)) == -1) {
 			perror("setsockopt TCP_MAXSEG");
 			exit(1);
 		}
@@ -402,7 +412,7 @@ int main(int argc, char *argv[])
 	setup_sockaddr(cfg_family, host, &addr);
 
 	if (mss &&
-	    setsockopt(fd, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) {
+	    setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) {
 		perror("setsockopt TCP_MAXSEG");
 		exit(1);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 89b36fb5e53244701f7ddec0ece0e9a48fdb3b11 Mon Sep 17 00:00:00 2001
From: Raghuram Chary J <raghuramchary.jallipalli@microchip.com>
Date: Sat, 28 Apr 2018 11:33:14 +0530
Subject: lan78xx: Lan7801 Support for Fixed PHY

Adding Fixed PHY support to the lan78xx driver.

Signed-off-by: Raghuram Chary J <raghuramchary.jallipalli@microchip.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/Kconfig   |   1 +
 drivers/net/usb/lan78xx.c | 104 +++++++++++++++++++++++++++++++++-------------
 2 files changed, 77 insertions(+), 28 deletions(-)

diff --git a/drivers/net/usb/Kconfig b/drivers/net/usb/Kconfig
index f28bd74ac275..418b0904cecb 100644
--- a/drivers/net/usb/Kconfig
+++ b/drivers/net/usb/Kconfig
@@ -111,6 +111,7 @@ config USB_LAN78XX
 	select MII
 	select PHYLIB
 	select MICROCHIP_PHY
+	select FIXED_PHY
 	help
 	  This option adds support for Microchip LAN78XX based USB 2
 	  & USB 3 10/100/1000 Ethernet adapters.
diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index c59f8afd0d73..81dfd10c3b92 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -36,7 +36,7 @@
 #include <linux/irq.h>
 #include <linux/irqchip/chained_irq.h>
 #include <linux/microchipphy.h>
-#include <linux/phy.h>
+#include <linux/phy_fixed.h>
 #include <linux/of_mdio.h>
 #include <linux/of_net.h>
 #include "lan78xx.h"
@@ -2063,52 +2063,91 @@ static int ksz9031rnx_fixup(struct phy_device *phydev)
 	return 1;
 }
 
-static int lan78xx_phy_init(struct lan78xx_net *dev)
+static struct phy_device *lan7801_phy_init(struct lan78xx_net *dev)
 {
+	u32 buf;
 	int ret;
-	u32 mii_adv;
+	struct fixed_phy_status fphy_status = {
+		.link = 1,
+		.speed = SPEED_1000,
+		.duplex = DUPLEX_FULL,
+	};
 	struct phy_device *phydev;
 
 	phydev = phy_find_first(dev->mdiobus);
 	if (!phydev) {
-		netdev_err(dev->net, "no PHY found\n");
-		return -EIO;
-	}
-
-	if ((dev->chipid == ID_REV_CHIP_ID_7800_) ||
-	    (dev->chipid == ID_REV_CHIP_ID_7850_)) {
-		phydev->is_internal = true;
-		dev->interface = PHY_INTERFACE_MODE_GMII;
-
-	} else if (dev->chipid == ID_REV_CHIP_ID_7801_) {
+		netdev_dbg(dev->net, "PHY Not Found!! Registering Fixed PHY\n");
+		phydev = fixed_phy_register(PHY_POLL, &fphy_status, -1,
+					    NULL);
+		if (IS_ERR(phydev)) {
+			netdev_err(dev->net, "No PHY/fixed_PHY found\n");
+			return NULL;
+		}
+		netdev_dbg(dev->net, "Registered FIXED PHY\n");
+		dev->interface = PHY_INTERFACE_MODE_RGMII;
+		ret = lan78xx_write_reg(dev, MAC_RGMII_ID,
+					MAC_RGMII_ID_TXC_DELAY_EN_);
+		ret = lan78xx_write_reg(dev, RGMII_TX_BYP_DLL, 0x3D00);
+		ret = lan78xx_read_reg(dev, HW_CFG, &buf);
+		buf |= HW_CFG_CLK125_EN_;
+		buf |= HW_CFG_REFCLK25_EN_;
+		ret = lan78xx_write_reg(dev, HW_CFG, buf);
+	} else {
 		if (!phydev->drv) {
 			netdev_err(dev->net, "no PHY driver found\n");
-			return -EIO;
+			return NULL;
 		}
-
 		dev->interface = PHY_INTERFACE_MODE_RGMII;
-
 		/* external PHY fixup for KSZ9031RNX */
 		ret = phy_register_fixup_for_uid(PHY_KSZ9031RNX, 0xfffffff0,
 						 ksz9031rnx_fixup);
 		if (ret < 0) {
 			netdev_err(dev->net, "fail to register fixup\n");
-			return ret;
+			return NULL;
 		}
 		/* external PHY fixup for LAN8835 */
 		ret = phy_register_fixup_for_uid(PHY_LAN8835, 0xfffffff0,
 						 lan8835_fixup);
 		if (ret < 0) {
 			netdev_err(dev->net, "fail to register fixup\n");
-			return ret;
+			return NULL;
 		}
 		/* add more external PHY fixup here if needed */
 
 		phydev->is_internal = false;
-	} else {
-		netdev_err(dev->net, "unknown ID found\n");
-		ret = -EIO;
-		goto error;
+	}
+	return phydev;
+}
+
+static int lan78xx_phy_init(struct lan78xx_net *dev)
+{
+	int ret;
+	u32 mii_adv;
+	struct phy_device *phydev;
+
+	switch (dev->chipid) {
+	case ID_REV_CHIP_ID_7801_:
+		phydev = lan7801_phy_init(dev);
+		if (!phydev) {
+			netdev_err(dev->net, "lan7801: PHY Init Failed");
+			return -EIO;
+		}
+		break;
+
+	case ID_REV_CHIP_ID_7800_:
+	case ID_REV_CHIP_ID_7850_:
+		phydev = phy_find_first(dev->mdiobus);
+		if (!phydev) {
+			netdev_err(dev->net, "no PHY found\n");
+			return -EIO;
+		}
+		phydev->is_internal = true;
+		dev->interface = PHY_INTERFACE_MODE_GMII;
+		break;
+
+	default:
+		netdev_err(dev->net, "Unknown CHIP ID found\n");
+		return -EIO;
 	}
 
 	/* if phyirq is not set, use polling mode in phylib */
@@ -2127,6 +2166,16 @@ static int lan78xx_phy_init(struct lan78xx_net *dev)
 	if (ret) {
 		netdev_err(dev->net, "can't attach PHY to %s\n",
 			   dev->mdiobus->id);
+		if (dev->chipid == ID_REV_CHIP_ID_7801_) {
+			if (phy_is_pseudo_fixed_link(phydev)) {
+				fixed_phy_unregister(phydev);
+			} else {
+				phy_unregister_fixup_for_uid(PHY_KSZ9031RNX,
+							     0xfffffff0);
+				phy_unregister_fixup_for_uid(PHY_LAN8835,
+							     0xfffffff0);
+			}
+		}
 		return -EIO;
 	}
 
@@ -2166,12 +2215,6 @@ static int lan78xx_phy_init(struct lan78xx_net *dev)
 	dev->fc_autoneg = phydev->autoneg;
 
 	return 0;
-
-error:
-	phy_unregister_fixup_for_uid(PHY_KSZ9031RNX, 0xfffffff0);
-	phy_unregister_fixup_for_uid(PHY_LAN8835, 0xfffffff0);
-
-	return ret;
 }
 
 static int lan78xx_set_rx_max_frame_length(struct lan78xx_net *dev, int size)
@@ -3569,6 +3612,7 @@ static void lan78xx_disconnect(struct usb_interface *intf)
 	struct lan78xx_net		*dev;
 	struct usb_device		*udev;
 	struct net_device		*net;
+	struct phy_device		*phydev;
 
 	dev = usb_get_intfdata(intf);
 	usb_set_intfdata(intf, NULL);
@@ -3577,12 +3621,16 @@ static void lan78xx_disconnect(struct usb_interface *intf)
 
 	udev = interface_to_usbdev(intf);
 	net = dev->net;
+	phydev = net->phydev;
 
 	phy_unregister_fixup_for_uid(PHY_KSZ9031RNX, 0xfffffff0);
 	phy_unregister_fixup_for_uid(PHY_LAN8835, 0xfffffff0);
 
 	phy_disconnect(net->phydev);
 
+	if (phy_is_pseudo_fixed_link(phydev))
+		fixed_phy_unregister(phydev);
+
 	unregister_netdev(net);
 
 	cancel_delayed_work_sync(&dev->wq);
-- 
cgit v1.2.3-59-g8ed1b


From e92258c761130c57567b7f051ea664957ce245ba Mon Sep 17 00:00:00 2001
From: Raghuram Chary J <raghuramchary.jallipalli@microchip.com>
Date: Sat, 28 Apr 2018 11:33:15 +0530
Subject: lan78xx: Remove DRIVER_VERSION for lan78xx driver

Remove driver version info from the lan78xx driver.

Signed-off-by: Raghuram Chary J <raghuramchary.jallipalli@microchip.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/lan78xx.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index 81dfd10c3b92..54f8db887e3d 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -44,7 +44,6 @@
 #define DRIVER_AUTHOR	"WOOJUNG HUH <woojung.huh@microchip.com>"
 #define DRIVER_DESC	"LAN78XX USB 3.0 Gigabit Ethernet Devices"
 #define DRIVER_NAME	"lan78xx"
-#define DRIVER_VERSION	"1.0.6"
 
 #define TX_TIMEOUT_JIFFIES		(5 * HZ)
 #define THROTTLE_JIFFIES		(HZ / 8)
@@ -1503,7 +1502,6 @@ static void lan78xx_get_drvinfo(struct net_device *net,
 	struct lan78xx_net *dev = netdev_priv(net);
 
 	strncpy(info->driver, DRIVER_NAME, sizeof(info->driver));
-	strncpy(info->version, DRIVER_VERSION, sizeof(info->version));
 	usb_make_path(dev->udev, info->bus_info, sizeof(info->bus_info));
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 7670ed7a25943a77af64ec4a7247a504e98fe812 Mon Sep 17 00:00:00 2001
From: Raghuram Chary J <raghuramchary.jallipalli@microchip.com>
Date: Sat, 28 Apr 2018 11:33:16 +0530
Subject: lan78xx: Modify error messages

Modify the error messages when phy registration fails.

Signed-off-by: Raghuram Chary J <raghuramchary.jallipalli@microchip.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/lan78xx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index 54f8db887e3d..91761436709a 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -2100,14 +2100,14 @@ static struct phy_device *lan7801_phy_init(struct lan78xx_net *dev)
 		ret = phy_register_fixup_for_uid(PHY_KSZ9031RNX, 0xfffffff0,
 						 ksz9031rnx_fixup);
 		if (ret < 0) {
-			netdev_err(dev->net, "fail to register fixup\n");
+			netdev_err(dev->net, "Failed to register fixup for PHY_KSZ9031RNX\n");
 			return NULL;
 		}
 		/* external PHY fixup for LAN8835 */
 		ret = phy_register_fixup_for_uid(PHY_LAN8835, 0xfffffff0,
 						 lan8835_fixup);
 		if (ret < 0) {
-			netdev_err(dev->net, "fail to register fixup\n");
+			netdev_err(dev->net, "Failed to register fixup for PHY_LAN8835\n");
 			return NULL;
 		}
 		/* add more external PHY fixup here if needed */
-- 
cgit v1.2.3-59-g8ed1b


From d80d8d55734998a22a064e7cd59ffc66dcc86fb2 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Fri, 27 Apr 2018 14:06:35 -0400
Subject: opa_vnic: Just use skb_get_hash instead of skb_tx_hash

This patch is meant to clean up how the opa_vnic is obtaining entropy from
Tx packets.

The code as it was written was claiming to get 16 bits of hash, but from
what I can tell it was only ever actually getting 14 bits as it was limited
to 0 - (2^15 - 1). It then was folding the result to get a 8 bit value for
entropy.

Instead of throwing away all that input I am cutting out the middle man and
instead having the code call skb_get_hash directly and then folding the 32
bit value into a 8 bit value using a pair of shifts and XOR operations.

Execution wise this new approach should provide more entropy and be faster
since we are bypassing the reciprocal multiplication to reduce the 32b
value to 16b and instead just using a shift/XOR combination.

In addition we can drop the unneeded adapter value from the call to get the
entropy since the netdev itself isn't even needed.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c    | 21 ++++++++++-----------
 drivers/infiniband/ulp/opa_vnic/opa_vnic_internal.h |  2 +-
 drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c   |  2 +-
 3 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c
index 4be3aef40bd2..267da8215e08 100644
--- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c
+++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c
@@ -443,17 +443,16 @@ static u8 opa_vnic_get_rc(struct __opa_veswport_info *info,
 }
 
 /* opa_vnic_calc_entropy - calculate the packet entropy */
-u8 opa_vnic_calc_entropy(struct opa_vnic_adapter *adapter, struct sk_buff *skb)
+u8 opa_vnic_calc_entropy(struct sk_buff *skb)
 {
-	u16 hash16;
-
-	/*
-	 * Get flow based 16-bit hash and then XOR the upper and lower bytes
-	 * to get the entropy.
-	 * __skb_tx_hash limits qcount to 16 bits. Hence, get 15-bit hash.
-	 */
-	hash16 = __skb_tx_hash(adapter->netdev, skb, BIT(15));
-	return (u8)((hash16 >> 8) ^ (hash16 & 0xff));
+	u32 hash = skb_get_hash(skb);
+
+	/* store XOR of all bytes in lower 8 bits */
+	hash ^= hash >> 8;
+	hash ^= hash >> 16;
+
+	/* return lower 8 bits as entropy */
+	return (u8)(hash & 0xFF);
 }
 
 /* opa_vnic_get_def_port - get default port based on entropy */
@@ -490,7 +489,7 @@ void opa_vnic_encap_skb(struct opa_vnic_adapter *adapter, struct sk_buff *skb)
 
 	hdr = skb_push(skb, OPA_VNIC_HDR_LEN);
 
-	entropy = opa_vnic_calc_entropy(adapter, skb);
+	entropy = opa_vnic_calc_entropy(skb);
 	def_port = opa_vnic_get_def_port(adapter, entropy);
 	len = opa_vnic_wire_length(skb);
 	dlid = opa_vnic_get_dlid(adapter, skb, def_port);
diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_internal.h b/drivers/infiniband/ulp/opa_vnic/opa_vnic_internal.h
index afd95f432262..43ac61ffef4a 100644
--- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_internal.h
+++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_internal.h
@@ -299,7 +299,7 @@ struct opa_vnic_adapter *opa_vnic_add_netdev(struct ib_device *ibdev,
 void opa_vnic_rem_netdev(struct opa_vnic_adapter *adapter);
 void opa_vnic_encap_skb(struct opa_vnic_adapter *adapter, struct sk_buff *skb);
 u8 opa_vnic_get_vl(struct opa_vnic_adapter *adapter, struct sk_buff *skb);
-u8 opa_vnic_calc_entropy(struct opa_vnic_adapter *adapter, struct sk_buff *skb);
+u8 opa_vnic_calc_entropy(struct sk_buff *skb);
 void opa_vnic_process_vema_config(struct opa_vnic_adapter *adapter);
 void opa_vnic_release_mac_tbl(struct opa_vnic_adapter *adapter);
 void opa_vnic_query_mac_tbl(struct opa_vnic_adapter *adapter,
diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c
index ce57e0f10289..0c8aec62a425 100644
--- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c
+++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c
@@ -104,7 +104,7 @@ static u16 opa_vnic_select_queue(struct net_device *netdev, struct sk_buff *skb,
 
 	/* pass entropy and vl as metadata in skb */
 	mdata = skb_push(skb, sizeof(*mdata));
-	mdata->entropy =  opa_vnic_calc_entropy(adapter, skb);
+	mdata->entropy = opa_vnic_calc_entropy(skb);
 	mdata->vl = opa_vnic_get_vl(adapter, skb);
 	rc = adapter->rn_ops->ndo_select_queue(netdev, skb,
 					       accel_priv, fallback);
-- 
cgit v1.2.3-59-g8ed1b


From b86629ebd5447c1e263d89be765ba6fd747c5e4d Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Fri, 27 Apr 2018 14:06:45 -0400
Subject: mlx4: Don't bother using skb_tx_hash in mlx4_en_select_queue

The code in the fallback path has supported XDP in conjunction with the Tx
traffic classification for TCs for over a year now. So instead of just
calling skb_tx_hash for every packet we are better off using the fallback
since that will record the Tx queue to the socket and then that can be used
instead of having to recompute the hash every time.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/en_tx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index 6b6853773848..0227786308af 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -694,7 +694,7 @@ u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
 	u16 rings_p_up = priv->num_tx_rings_p_up;
 
 	if (netdev_get_num_tc(dev))
-		return skb_tx_hash(dev, skb);
+		return fallback(dev, skb);
 
 	return fallback(dev, skb) % rings_p_up;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 1b837d489e06a5289753ddbee99cfbc26d251d6d Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Fri, 27 Apr 2018 14:06:53 -0400
Subject: net: Revoke export for __skb_tx_hash, update it to just be static
 skb_tx_hash

I am dropping the export of __skb_tx_hash as after my patches nobody is
using it outside of the net/core/dev.c file. In addition I am renaming and
repurposing it to just be a static declaration of skb_tx_hash since that
was the only user for it at this point. By doing this the compiler can
inline it into __netdev_pick_tx as that will improve performance.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 13 -------------
 net/core/dev.c            | 10 ++++------
 2 files changed, 4 insertions(+), 19 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 366c32891158..82f5a9aba578 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3213,19 +3213,6 @@ static inline int netif_set_xps_queue(struct net_device *dev,
 }
 #endif
 
-u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
-		  unsigned int num_tx_queues);
-
-/*
- * Returns a Tx hash for the given packet when dev->real_num_tx_queues is used
- * as a distribution range limit for the returned value.
- */
-static inline u16 skb_tx_hash(const struct net_device *dev,
-			      struct sk_buff *skb)
-{
-	return __skb_tx_hash(dev, skb, dev->real_num_tx_queues);
-}
-
 /**
  *	netif_is_multiqueue - test if device has multiple transmit queues
  *	@dev: network device
diff --git a/net/core/dev.c b/net/core/dev.c
index 0a2d46424069..25ceecfdd8fe 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2615,17 +2615,16 @@ EXPORT_SYMBOL(netif_device_attach);
  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
  * to be used as a distribution range.
  */
-u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
-		  unsigned int num_tx_queues)
+static u16 skb_tx_hash(const struct net_device *dev, struct sk_buff *skb)
 {
 	u32 hash;
 	u16 qoffset = 0;
-	u16 qcount = num_tx_queues;
+	u16 qcount = dev->real_num_tx_queues;
 
 	if (skb_rx_queue_recorded(skb)) {
 		hash = skb_get_rx_queue(skb);
-		while (unlikely(hash >= num_tx_queues))
-			hash -= num_tx_queues;
+		while (unlikely(hash >= qcount))
+			hash -= qcount;
 		return hash;
 	}
 
@@ -2638,7 +2637,6 @@ u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
 
 	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
 }
-EXPORT_SYMBOL(__skb_tx_hash);
 
 static void skb_warn_bad_offload(const struct sk_buff *skb)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 3ac305c386f698abccd0523c64a8aef248c89bc6 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Fri, 27 Apr 2018 13:11:14 -0700
Subject: net: core: Assert the size of netdev_featres_t

We have about 53 netdev_features_t bits defined and counting, add a
build time check to catch when an u64 type will not be enough and we
will have to convert that to a bitmap. This is done in
register_netdevice() for convenience.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 6 ++++++
 net/core/dev.c            | 1 +
 2 files changed, 7 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 82f5a9aba578..9e09dd897b74 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4108,6 +4108,12 @@ const char *netdev_drivername(const struct net_device *dev);
 
 void linkwatch_run_queue(void);
 
+static inline void netdev_features_size_check(void)
+{
+	BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
+		     NETDEV_FEATURE_COUNT);
+}
+
 static inline netdev_features_t netdev_intersect_features(netdev_features_t f1,
 							  netdev_features_t f2)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index 25ceecfdd8fe..e01c21a88cae 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7879,6 +7879,7 @@ int register_netdevice(struct net_device *dev)
 	int ret;
 	struct net *net = dev_net(dev);
 
+	netdev_features_size_check();
 	BUG_ON(dev_boot_phase);
 	ASSERT_RTNL();
 
-- 
cgit v1.2.3-59-g8ed1b


From 34745aed515c1d6040110ff82378056533518eb6 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sun, 29 Apr 2018 19:27:48 -0700
Subject: samples/bpf: fix kprobe attachment issue on x64

Commit d5a00528b58c ("syscalls/core, syscalls/x86: Rename
struct pt_regs-based sys_*() to __x64_sys_*()") renamed a lot
of syscall function sys_*() to __x64_sys_*().
This caused several kprobe based samples/bpf tests failing.

This patch fixed the problem in bpf_load.c.
For x86_64 architecture, function name __x64_sys_*() will be
first used for kprobe event creation. If the creation is successful,
it will be used. Otherwise, function name sys_*() will be used
for kprobe event creation.

Fixes: d5a00528b58c ("syscalls/core, syscalls/x86: Rename struct pt_regs-based sys_*() to __x64_sys_*()")
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/bpf_load.c | 34 ++++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index a27ef3c42e4e..da9bccfaf391 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -145,6 +145,9 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 	}
 
 	if (is_kprobe || is_kretprobe) {
+		bool need_normal_check = true;
+		const char *event_prefix = "";
+
 		if (is_kprobe)
 			event += 7;
 		else
@@ -158,18 +161,33 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 		if (isdigit(*event))
 			return populate_prog_array(event, fd);
 
-		snprintf(buf, sizeof(buf),
-			 "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events",
-			 is_kprobe ? 'p' : 'r', event, event);
-		err = system(buf);
-		if (err < 0) {
-			printf("failed to create kprobe '%s' error '%s'\n",
-			       event, strerror(errno));
-			return -1;
+#ifdef __x86_64__
+		if (strncmp(event, "sys_", 4) == 0) {
+			snprintf(buf, sizeof(buf),
+				 "echo '%c:__x64_%s __x64_%s' >> /sys/kernel/debug/tracing/kprobe_events",
+				 is_kprobe ? 'p' : 'r', event, event);
+			err = system(buf);
+			if (err >= 0) {
+				need_normal_check = false;
+				event_prefix = "__x64_";
+			}
+		}
+#endif
+		if (need_normal_check) {
+			snprintf(buf, sizeof(buf),
+				 "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events",
+				 is_kprobe ? 'p' : 'r', event, event);
+			err = system(buf);
+			if (err < 0) {
+				printf("failed to create kprobe '%s' error '%s'\n",
+				       event, strerror(errno));
+				return -1;
+			}
 		}
 
 		strcpy(buf, DEBUGFS);
 		strcat(buf, "events/kprobes/");
+		strcat(buf, event_prefix);
 		strcat(buf, event);
 		strcat(buf, "/id");
 	} else if (is_tracepoint) {
-- 
cgit v1.2.3-59-g8ed1b


From 4d220ed0f8140c478ab7b0a14d96821da639b646 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Sat, 28 Apr 2018 19:56:37 -0700
Subject: bpf: remove tracepoints from bpf core

tracepoints to bpf core were added as a way to provide introspection
to bpf programs and maps, but after some time it became clear that
this approach is inadequate, so prog_id, map_id and corresponding
get_next_id, get_fd_by_id, get_info_by_fd, prog_query APIs were
introduced and fully adopted by bpftool and other applications.
The tracepoints in bpf core started to rot and causing syzbot warnings:
WARNING: CPU: 0 PID: 3008 at kernel/trace/trace_event_perf.c:274
Kernel panic - not syncing: panic_on_warn set ...
perf_trace_bpf_map_keyval+0x260/0xbd0 include/trace/events/bpf.h:228
trace_bpf_map_update_elem include/trace/events/bpf.h:274 [inline]
map_update_elem kernel/bpf/syscall.c:597 [inline]
SYSC_bpf kernel/bpf/syscall.c:1478 [inline]
Hence this patch deletes tracepoints in bpf core.

Reported-by: Eric Biggers <ebiggers3@gmail.com>
Reported-by: syzbot <bot+a9dbb3c3e64b62536a4bc5ee7bbd4ca627566188@syzkaller.appspotmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 MAINTAINERS                |   1 -
 include/linux/bpf_trace.h  |   1 -
 include/trace/events/bpf.h | 355 ---------------------------------------------
 kernel/bpf/core.c          |   6 -
 kernel/bpf/inode.c         |  16 +-
 kernel/bpf/syscall.c       |  15 +-
 6 files changed, 2 insertions(+), 392 deletions(-)
 delete mode 100644 include/trace/events/bpf.h

diff --git a/MAINTAINERS b/MAINTAINERS
index a52800867850..537fd17a211b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2727,7 +2727,6 @@ F:	Documentation/networking/filter.txt
 F:	Documentation/bpf/
 F:	include/linux/bpf*
 F:	include/linux/filter.h
-F:	include/trace/events/bpf.h
 F:	include/trace/events/xdp.h
 F:	include/uapi/linux/bpf*
 F:	include/uapi/linux/filter.h
diff --git a/include/linux/bpf_trace.h b/include/linux/bpf_trace.h
index e6fe98ae3794..ddf896abcfb6 100644
--- a/include/linux/bpf_trace.h
+++ b/include/linux/bpf_trace.h
@@ -2,7 +2,6 @@
 #ifndef __LINUX_BPF_TRACE_H__
 #define __LINUX_BPF_TRACE_H__
 
-#include <trace/events/bpf.h>
 #include <trace/events/xdp.h>
 
 #endif /* __LINUX_BPF_TRACE_H__ */
diff --git a/include/trace/events/bpf.h b/include/trace/events/bpf.h
deleted file mode 100644
index 150185647e6b..000000000000
--- a/include/trace/events/bpf.h
+++ /dev/null
@@ -1,355 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM bpf
-
-#if !defined(_TRACE_BPF_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_BPF_H
-
-/* These are only used within the BPF_SYSCALL code */
-#ifdef CONFIG_BPF_SYSCALL
-
-#include <linux/filter.h>
-#include <linux/bpf.h>
-#include <linux/fs.h>
-#include <linux/tracepoint.h>
-
-#define __PROG_TYPE_MAP(FN)	\
-	FN(SOCKET_FILTER)	\
-	FN(KPROBE)		\
-	FN(SCHED_CLS)		\
-	FN(SCHED_ACT)		\
-	FN(TRACEPOINT)		\
-	FN(XDP)			\
-	FN(PERF_EVENT)		\
-	FN(CGROUP_SKB)		\
-	FN(CGROUP_SOCK)		\
-	FN(LWT_IN)		\
-	FN(LWT_OUT)		\
-	FN(LWT_XMIT)
-
-#define __MAP_TYPE_MAP(FN)	\
-	FN(HASH)		\
-	FN(ARRAY)		\
-	FN(PROG_ARRAY)		\
-	FN(PERF_EVENT_ARRAY)	\
-	FN(PERCPU_HASH)		\
-	FN(PERCPU_ARRAY)	\
-	FN(STACK_TRACE)		\
-	FN(CGROUP_ARRAY)	\
-	FN(LRU_HASH)		\
-	FN(LRU_PERCPU_HASH)	\
-	FN(LPM_TRIE)
-
-#define __PROG_TYPE_TP_FN(x)	\
-	TRACE_DEFINE_ENUM(BPF_PROG_TYPE_##x);
-#define __PROG_TYPE_SYM_FN(x)	\
-	{ BPF_PROG_TYPE_##x, #x },
-#define __PROG_TYPE_SYM_TAB	\
-	__PROG_TYPE_MAP(__PROG_TYPE_SYM_FN) { -1, 0 }
-__PROG_TYPE_MAP(__PROG_TYPE_TP_FN)
-
-#define __MAP_TYPE_TP_FN(x)	\
-	TRACE_DEFINE_ENUM(BPF_MAP_TYPE_##x);
-#define __MAP_TYPE_SYM_FN(x)	\
-	{ BPF_MAP_TYPE_##x, #x },
-#define __MAP_TYPE_SYM_TAB	\
-	__MAP_TYPE_MAP(__MAP_TYPE_SYM_FN) { -1, 0 }
-__MAP_TYPE_MAP(__MAP_TYPE_TP_FN)
-
-DECLARE_EVENT_CLASS(bpf_prog_event,
-
-	TP_PROTO(const struct bpf_prog *prg),
-
-	TP_ARGS(prg),
-
-	TP_STRUCT__entry(
-		__array(u8, prog_tag, 8)
-		__field(u32, type)
-	),
-
-	TP_fast_assign(
-		BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag));
-		memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag));
-		__entry->type = prg->type;
-	),
-
-	TP_printk("prog=%s type=%s",
-		  __print_hex_str(__entry->prog_tag, 8),
-		  __print_symbolic(__entry->type, __PROG_TYPE_SYM_TAB))
-);
-
-DEFINE_EVENT(bpf_prog_event, bpf_prog_get_type,
-
-	TP_PROTO(const struct bpf_prog *prg),
-
-	TP_ARGS(prg)
-);
-
-DEFINE_EVENT(bpf_prog_event, bpf_prog_put_rcu,
-
-	TP_PROTO(const struct bpf_prog *prg),
-
-	TP_ARGS(prg)
-);
-
-TRACE_EVENT(bpf_prog_load,
-
-	TP_PROTO(const struct bpf_prog *prg, int ufd),
-
-	TP_ARGS(prg, ufd),
-
-	TP_STRUCT__entry(
-		__array(u8, prog_tag, 8)
-		__field(u32, type)
-		__field(int, ufd)
-	),
-
-	TP_fast_assign(
-		BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag));
-		memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag));
-		__entry->type = prg->type;
-		__entry->ufd  = ufd;
-	),
-
-	TP_printk("prog=%s type=%s ufd=%d",
-		  __print_hex_str(__entry->prog_tag, 8),
-		  __print_symbolic(__entry->type, __PROG_TYPE_SYM_TAB),
-		  __entry->ufd)
-);
-
-TRACE_EVENT(bpf_map_create,
-
-	TP_PROTO(const struct bpf_map *map, int ufd),
-
-	TP_ARGS(map, ufd),
-
-	TP_STRUCT__entry(
-		__field(u32, type)
-		__field(u32, size_key)
-		__field(u32, size_value)
-		__field(u32, max_entries)
-		__field(u32, flags)
-		__field(int, ufd)
-	),
-
-	TP_fast_assign(
-		__entry->type        = map->map_type;
-		__entry->size_key    = map->key_size;
-		__entry->size_value  = map->value_size;
-		__entry->max_entries = map->max_entries;
-		__entry->flags       = map->map_flags;
-		__entry->ufd         = ufd;
-	),
-
-	TP_printk("map type=%s ufd=%d key=%u val=%u max=%u flags=%x",
-		  __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB),
-		  __entry->ufd, __entry->size_key, __entry->size_value,
-		  __entry->max_entries, __entry->flags)
-);
-
-DECLARE_EVENT_CLASS(bpf_obj_prog,
-
-	TP_PROTO(const struct bpf_prog *prg, int ufd,
-		 const struct filename *pname),
-
-	TP_ARGS(prg, ufd, pname),
-
-	TP_STRUCT__entry(
-		__array(u8, prog_tag, 8)
-		__field(int, ufd)
-		__string(path, pname->name)
-	),
-
-	TP_fast_assign(
-		BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag));
-		memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag));
-		__assign_str(path, pname->name);
-		__entry->ufd = ufd;
-	),
-
-	TP_printk("prog=%s path=%s ufd=%d",
-		  __print_hex_str(__entry->prog_tag, 8),
-		  __get_str(path), __entry->ufd)
-);
-
-DEFINE_EVENT(bpf_obj_prog, bpf_obj_pin_prog,
-
-	TP_PROTO(const struct bpf_prog *prg, int ufd,
-		 const struct filename *pname),
-
-	TP_ARGS(prg, ufd, pname)
-);
-
-DEFINE_EVENT(bpf_obj_prog, bpf_obj_get_prog,
-
-	TP_PROTO(const struct bpf_prog *prg, int ufd,
-		 const struct filename *pname),
-
-	TP_ARGS(prg, ufd, pname)
-);
-
-DECLARE_EVENT_CLASS(bpf_obj_map,
-
-	TP_PROTO(const struct bpf_map *map, int ufd,
-		 const struct filename *pname),
-
-	TP_ARGS(map, ufd, pname),
-
-	TP_STRUCT__entry(
-		__field(u32, type)
-		__field(int, ufd)
-		__string(path, pname->name)
-	),
-
-	TP_fast_assign(
-		__assign_str(path, pname->name);
-		__entry->type = map->map_type;
-		__entry->ufd  = ufd;
-	),
-
-	TP_printk("map type=%s ufd=%d path=%s",
-		  __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB),
-		  __entry->ufd, __get_str(path))
-);
-
-DEFINE_EVENT(bpf_obj_map, bpf_obj_pin_map,
-
-	TP_PROTO(const struct bpf_map *map, int ufd,
-		 const struct filename *pname),
-
-	TP_ARGS(map, ufd, pname)
-);
-
-DEFINE_EVENT(bpf_obj_map, bpf_obj_get_map,
-
-	TP_PROTO(const struct bpf_map *map, int ufd,
-		 const struct filename *pname),
-
-	TP_ARGS(map, ufd, pname)
-);
-
-DECLARE_EVENT_CLASS(bpf_map_keyval,
-
-	TP_PROTO(const struct bpf_map *map, int ufd,
-		 const void *key, const void *val),
-
-	TP_ARGS(map, ufd, key, val),
-
-	TP_STRUCT__entry(
-		__field(u32, type)
-		__field(u32, key_len)
-		__dynamic_array(u8, key, map->key_size)
-		__field(bool, key_trunc)
-		__field(u32, val_len)
-		__dynamic_array(u8, val, map->value_size)
-		__field(bool, val_trunc)
-		__field(int, ufd)
-	),
-
-	TP_fast_assign(
-		memcpy(__get_dynamic_array(key), key, map->key_size);
-		memcpy(__get_dynamic_array(val), val, map->value_size);
-		__entry->type      = map->map_type;
-		__entry->key_len   = min(map->key_size, 16U);
-		__entry->key_trunc = map->key_size != __entry->key_len;
-		__entry->val_len   = min(map->value_size, 16U);
-		__entry->val_trunc = map->value_size != __entry->val_len;
-		__entry->ufd       = ufd;
-	),
-
-	TP_printk("map type=%s ufd=%d key=[%s%s] val=[%s%s]",
-		  __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB),
-		  __entry->ufd,
-		  __print_hex(__get_dynamic_array(key), __entry->key_len),
-		  __entry->key_trunc ? " ..." : "",
-		  __print_hex(__get_dynamic_array(val), __entry->val_len),
-		  __entry->val_trunc ? " ..." : "")
-);
-
-DEFINE_EVENT(bpf_map_keyval, bpf_map_lookup_elem,
-
-	TP_PROTO(const struct bpf_map *map, int ufd,
-		 const void *key, const void *val),
-
-	TP_ARGS(map, ufd, key, val)
-);
-
-DEFINE_EVENT(bpf_map_keyval, bpf_map_update_elem,
-
-	TP_PROTO(const struct bpf_map *map, int ufd,
-		 const void *key, const void *val),
-
-	TP_ARGS(map, ufd, key, val)
-);
-
-TRACE_EVENT(bpf_map_delete_elem,
-
-	TP_PROTO(const struct bpf_map *map, int ufd,
-		 const void *key),
-
-	TP_ARGS(map, ufd, key),
-
-	TP_STRUCT__entry(
-		__field(u32, type)
-		__field(u32, key_len)
-		__dynamic_array(u8, key, map->key_size)
-		__field(bool, key_trunc)
-		__field(int, ufd)
-	),
-
-	TP_fast_assign(
-		memcpy(__get_dynamic_array(key), key, map->key_size);
-		__entry->type      = map->map_type;
-		__entry->key_len   = min(map->key_size, 16U);
-		__entry->key_trunc = map->key_size != __entry->key_len;
-		__entry->ufd       = ufd;
-	),
-
-	TP_printk("map type=%s ufd=%d key=[%s%s]",
-		  __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB),
-		  __entry->ufd,
-		  __print_hex(__get_dynamic_array(key), __entry->key_len),
-		  __entry->key_trunc ? " ..." : "")
-);
-
-TRACE_EVENT(bpf_map_next_key,
-
-	TP_PROTO(const struct bpf_map *map, int ufd,
-		 const void *key, const void *key_next),
-
-	TP_ARGS(map, ufd, key, key_next),
-
-	TP_STRUCT__entry(
-		__field(u32, type)
-		__field(u32, key_len)
-		__dynamic_array(u8, key, map->key_size)
-		__dynamic_array(u8, nxt, map->key_size)
-		__field(bool, key_trunc)
-		__field(bool, key_null)
-		__field(int, ufd)
-	),
-
-	TP_fast_assign(
-		if (key)
-			memcpy(__get_dynamic_array(key), key, map->key_size);
-		__entry->key_null = !key;
-		memcpy(__get_dynamic_array(nxt), key_next, map->key_size);
-		__entry->type      = map->map_type;
-		__entry->key_len   = min(map->key_size, 16U);
-		__entry->key_trunc = map->key_size != __entry->key_len;
-		__entry->ufd       = ufd;
-	),
-
-	TP_printk("map type=%s ufd=%d key=[%s%s] next=[%s%s]",
-		  __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB),
-		  __entry->ufd,
-		  __entry->key_null ? "NULL" : __print_hex(__get_dynamic_array(key),
-							   __entry->key_len),
-		  __entry->key_trunc && !__entry->key_null ? " ..." : "",
-		  __print_hex(__get_dynamic_array(nxt), __entry->key_len),
-		  __entry->key_trunc ? " ..." : "")
-);
-#endif /* CONFIG_BPF_SYSCALL */
-#endif /* _TRACE_BPF_H */
-
-#include <trace/define_trace.h>
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 9349a5db3cf2..90feeba3a1a1 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1845,9 +1845,3 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
 #include <linux/bpf_trace.h>
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
-
-/* These are only used within the BPF_SYSCALL code */
-#ifdef CONFIG_BPF_SYSCALL
-EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type);
-EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu);
-#endif
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index a41343009ccc..ed13645bd80c 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -429,13 +429,6 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
 	ret = bpf_obj_do_pin(pname, raw, type);
 	if (ret != 0)
 		bpf_any_put(raw, type);
-	if ((trace_bpf_obj_pin_prog_enabled() ||
-	     trace_bpf_obj_pin_map_enabled()) && !ret) {
-		if (type == BPF_TYPE_PROG)
-			trace_bpf_obj_pin_prog(raw, ufd, pname);
-		if (type == BPF_TYPE_MAP)
-			trace_bpf_obj_pin_map(raw, ufd, pname);
-	}
 out:
 	putname(pname);
 	return ret;
@@ -502,15 +495,8 @@ int bpf_obj_get_user(const char __user *pathname, int flags)
 	else
 		goto out;
 
-	if (ret < 0) {
+	if (ret < 0)
 		bpf_any_put(raw, type);
-	} else if (trace_bpf_obj_get_prog_enabled() ||
-		   trace_bpf_obj_get_map_enabled()) {
-		if (type == BPF_TYPE_PROG)
-			trace_bpf_obj_get_prog(raw, ret, pname);
-		if (type == BPF_TYPE_MAP)
-			trace_bpf_obj_get_map(raw, ret, pname);
-	}
 out:
 	putname(pname);
 	return ret;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0bd2944eafb9..263e13ede029 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -503,7 +503,6 @@ static int map_create(union bpf_attr *attr)
 		return err;
 	}
 
-	trace_bpf_map_create(map, err);
 	return err;
 
 free_map:
@@ -663,7 +662,6 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (copy_to_user(uvalue, value, value_size) != 0)
 		goto free_value;
 
-	trace_bpf_map_lookup_elem(map, ufd, key, value);
 	err = 0;
 
 free_value:
@@ -760,8 +758,6 @@ static int map_update_elem(union bpf_attr *attr)
 	__this_cpu_dec(bpf_prog_active);
 	preempt_enable();
 out:
-	if (!err)
-		trace_bpf_map_update_elem(map, ufd, key, value);
 free_value:
 	kfree(value);
 free_key:
@@ -814,8 +810,6 @@ static int map_delete_elem(union bpf_attr *attr)
 	__this_cpu_dec(bpf_prog_active);
 	preempt_enable();
 out:
-	if (!err)
-		trace_bpf_map_delete_elem(map, ufd, key);
 	kfree(key);
 err_put:
 	fdput(f);
@@ -879,7 +873,6 @@ out:
 	if (copy_to_user(unext_key, next_key, map->key_size) != 0)
 		goto free_next_key;
 
-	trace_bpf_map_next_key(map, ufd, key, next_key);
 	err = 0;
 
 free_next_key:
@@ -1027,7 +1020,6 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
 	if (atomic_dec_and_test(&prog->aux->refcnt)) {
 		int i;
 
-		trace_bpf_prog_put_rcu(prog);
 		/* bpf_prog_free_id() must be called first */
 		bpf_prog_free_id(prog, do_idr_lock);
 
@@ -1194,11 +1186,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
 struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
 				       bool attach_drv)
 {
-	struct bpf_prog *prog = __bpf_prog_get(ufd, &type, attach_drv);
-
-	if (!IS_ERR(prog))
-		trace_bpf_prog_get_type(prog);
-	return prog;
+	return __bpf_prog_get(ufd, &type, attach_drv);
 }
 EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
 
@@ -1373,7 +1361,6 @@ static int bpf_prog_load(union bpf_attr *attr)
 	}
 
 	bpf_prog_kallsyms_add(prog);
-	trace_bpf_prog_load(prog, err);
 	return err;
 
 free_used_maps:
-- 
cgit v1.2.3-59-g8ed1b


From 97389373d555cdc03ed346d7dc6f714a1586c705 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Wed, 18 Apr 2018 00:27:18 +0200
Subject: mt76x2: fix is_mt7612 routine

Fix is_mt7612 routine since asic version is set in mt76_dev revision
field and not in mt76x2_dev one.
Moreover remove mt76x2_dev rev field since it is never used in the
driver

Fixes: 7bc04215a66b ('mt76: add driver code for MT76x2e')
Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Acked-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/mt76x2.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2.h b/drivers/net/wireless/mediatek/mt76/mt76x2.h
index 783b8122ec3c..a5d1255e4b9c 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2.h
@@ -117,7 +117,6 @@ struct mt76x2_dev {
 	u8 beacon_mask;
 	u8 beacon_data_mask;
 
-	u32 rev;
 	u32 rxfilter;
 
 	u16 chainmask;
@@ -151,7 +150,7 @@ struct mt76x2_sta {
 
 static inline bool is_mt7612(struct mt76x2_dev *dev)
 {
-	return (dev->rev >> 16) == 0x7612;
+	return mt76_chip(&dev->mt76) == 0x7612;
 }
 
 void mt76x2_set_irq_mask(struct mt76x2_dev *dev, u32 clear, u32 set);
-- 
cgit v1.2.3-59-g8ed1b


From c3d7c82a8bb017e43cafe8eaf7c8309f85ceb781 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Wed, 25 Apr 2018 11:11:21 +0200
Subject: mt76: fix concurrent rx calls on A-MPDU release

Add a spinlock in mt76_rx_complete. Without this, multiple stats updates
could happen in parallel, which can lead to deadlocks. There are
probably more corner cases fixed by this change.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/mac80211.c    | 2 ++
 drivers/net/wireless/mediatek/mt76/mt76.h        | 1 +
 drivers/net/wireless/mediatek/mt76/mt76x2_init.c | 1 +
 3 files changed, 4 insertions(+)

diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c
index eb49e0a6758c..915e61733131 100644
--- a/drivers/net/wireless/mediatek/mt76/mac80211.c
+++ b/drivers/net/wireless/mediatek/mt76/mac80211.c
@@ -561,6 +561,7 @@ void mt76_rx_complete(struct mt76_dev *dev, struct sk_buff_head *frames,
 	if (queue >= 0)
 	    napi = &dev->napi[queue];
 
+	spin_lock(&dev->rx_lock);
 	while ((skb = __skb_dequeue(frames)) != NULL) {
 		if (mt76_check_ccmp_pn(skb)) {
 			dev_kfree_skb(skb);
@@ -570,6 +571,7 @@ void mt76_rx_complete(struct mt76_dev *dev, struct sk_buff_head *frames,
 		sta = mt76_rx_convert(skb);
 		ieee80211_rx_napi(dev->hw, sta, skb, napi);
 	}
+	spin_unlock(&dev->rx_lock);
 }
 
 void mt76_rx_poll_complete(struct mt76_dev *dev, enum mt76_rxq_id q)
diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h
index 065ff78059c3..a74e6eef51e9 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76.h
@@ -241,6 +241,7 @@ struct mt76_dev {
 	struct device *dev;
 
 	struct net_device napi_dev;
+	spinlock_t rx_lock;
 	struct napi_struct napi[__MT_RXQ_MAX];
 	struct sk_buff_head rx_skb[__MT_RXQ_MAX];
 
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_init.c b/drivers/net/wireless/mediatek/mt76/mt76x2_init.c
index 359b105235b3..d21e4a7c1bb9 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_init.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_init.c
@@ -645,6 +645,7 @@ struct mt76x2_dev *mt76x2_alloc_device(struct device *pdev)
 	dev->mt76.drv = &drv_ops;
 	mutex_init(&dev->mutex);
 	spin_lock_init(&dev->irq_lock);
+	spin_lock_init(&dev->mt76.rx_lock);
 
 	return dev;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 9febfa67ca1566294ea5fd1f95ebf02181e7a233 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Wed, 25 Apr 2018 11:11:22 +0200
Subject: mt76: add rcu locking in tid reorder function

Avoids having the tid or station entry disappear prematurely.
Also cancel the reorder work earlier to avoid further processing delayed
by waiting for the lock to be released

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/agg-rx.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/agg-rx.c b/drivers/net/wireless/mediatek/mt76/agg-rx.c
index dbf4057d2d3e..b67acc6189bf 100644
--- a/drivers/net/wireless/mediatek/mt76/agg-rx.c
+++ b/drivers/net/wireless/mediatek/mt76/agg-rx.c
@@ -103,6 +103,7 @@ mt76_rx_aggr_reorder_work(struct work_struct *work)
 	__skb_queue_head_init(&frames);
 
 	local_bh_disable();
+	rcu_read_lock();
 
 	spin_lock(&tid->lock);
 	mt76_rx_aggr_check_release(tid, &frames);
@@ -114,6 +115,7 @@ mt76_rx_aggr_reorder_work(struct work_struct *work)
 					     REORDER_TIMEOUT);
 	mt76_rx_complete(dev, &frames, -1);
 
+	rcu_read_unlock();
 	local_bh_enable();
 }
 
@@ -266,6 +268,8 @@ static void mt76_rx_aggr_shutdown(struct mt76_dev *dev, struct mt76_rx_tid *tid)
 	u8 size = tid->size;
 	int i;
 
+	cancel_delayed_work(&tid->reorder_work);
+
 	spin_lock_bh(&tid->lock);
 
 	tid->stopped = true;
@@ -280,8 +284,6 @@ static void mt76_rx_aggr_shutdown(struct mt76_dev *dev, struct mt76_rx_tid *tid)
 	}
 
 	spin_unlock_bh(&tid->lock);
-
-	cancel_delayed_work(&tid->reorder_work);
 }
 
 void mt76_rx_aggr_stop(struct mt76_dev *dev, struct mt76_wcid *wcid, u8 tidno)
-- 
cgit v1.2.3-59-g8ed1b


From 1d868b70e06a2319fdda46cc46ec7c6762557543 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Wed, 25 Apr 2018 11:11:23 +0200
Subject: mt76: add rcu locking around tx scheduling

Fixes a reported lockdep error in mac80211:

[  179.867321] =============================
[  179.871510] WARNING: suspicious RCU usage
[  179.875528] 4.14.32 #0 Not tainted
[  179.878924] -----------------------------
[  179.882981] backports-2017-11-01/net/mac80211/tx.c:594 suspicious rcu_dereference_check() usage!
[  179.891785]
[  179.891785] other info that might help us debug this:
[  179.891785]
[  179.899824]
[  179.899824] rcu_scheduler_active = 2, debug_locks = 1
[  179.906343] 2 locks held by ksoftirqd/0/7:
[  179.910479]  #0:  (&(&q->lock)->rlock){+.-.}, at: [<86b207a4>] mt76_dma_tx_cleanup+0x64/0x354 [mt76]
[  179.919734]  #1:  (&(&fq->lock)->rlock){+.-.}, at: [<87238410>] ieee80211_tx_dequeue+0x54/0xc3c [mac80211]
[  179.929890]
[  179.929890] stack backtrace:
[  179.934257] CPU: 0 PID: 7 Comm: ksoftirqd/0 Not tainted 4.14.32 #0
[  179.940421] Stack : 00000000 00000000 00000000 00000000 80e0fce2 00000036 00000000 00000000
[  179.948864]         87c3d24c 80696377 8061039c 00000000 00000007 00000001 87c5db78 6534689d
[  179.957306]         00000000 00000000 80e10000 87c5da74 00000001 0000015a 00000007 00000000
[  179.965748]         00000000 806a0000 000e4171 00000000 00000000 00000000 ffffffff 00000001
[  179.974189]         806c0000 8692b240 86b000d0 87316fe4 00000001 802c9a68 00000000 80700000
[  179.982632]         ...
[  179.985104] Call Trace:
[  179.987582] [<80010a48>] show_stack+0x58/0x100
[  179.992040] [<804c2c58>] dump_stack+0xe8/0x170
[  179.996868] [<87234a04>] ieee80211_tx_h_select_key+0xa8/0x5b8 [mac80211]
[  180.004299] [<87238d44>] ieee80211_tx_dequeue+0x988/0xc3c [mac80211]
[  180.011048] [<86b230dc>] mt76_txq_schedule+0x110/0x3a4 [mt76]
[  180.016821] [<86b209d0>] mt76_dma_tx_cleanup+0x290/0x354 [mt76]
[  180.022777] [<86be2e60>] mt7603_tx_tasklet+0x40/0x6c [mt7603e]
[  180.028637] [<80037058>] tasklet_action+0x110/0x1ec
[  180.033532] [<804e1dac>] __do_softirq+0x164/0x35c
[  180.038235] [<80037174>] run_ksoftirqd+0x40/0x84
[  180.042870] [<800580c8>] smpboot_thread_fn+0x1a8/0x1d8
[  180.048023] [<800542e8>] kthread+0x130/0x144
[  180.052297] [<8000b1f8>] ret_from_kernel_thread+0x14/0x1c

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/tx.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/wireless/mediatek/mt76/tx.c b/drivers/net/wireless/mediatek/mt76/tx.c
index 4eef69bd8a9e..6aca794cf998 100644
--- a/drivers/net/wireless/mediatek/mt76/tx.c
+++ b/drivers/net/wireless/mediatek/mt76/tx.c
@@ -422,12 +422,14 @@ void mt76_txq_schedule(struct mt76_dev *dev, struct mt76_queue *hwq)
 {
 	int len;
 
+	rcu_read_lock();
 	do {
 		if (hwq->swq_queued >= 4 || list_empty(&hwq->swq))
 			break;
 
 		len = mt76_txq_schedule_list(dev, hwq);
 	} while (len > 0);
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(mt76_txq_schedule);
 
-- 
cgit v1.2.3-59-g8ed1b


From 95135e8c0b4add86d3ce1ab5a24881b673c2db04 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Wed, 25 Apr 2018 11:11:24 +0200
Subject: mt76: check for pending reset before attempting to schedule tx

The check within mt76_txq_send_burst is not enough, as it happens after
a first frame has already been queued up

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/tx.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/wireless/mediatek/mt76/tx.c b/drivers/net/wireless/mediatek/mt76/tx.c
index 6aca794cf998..7ecd2d7c5db4 100644
--- a/drivers/net/wireless/mediatek/mt76/tx.c
+++ b/drivers/net/wireless/mediatek/mt76/tx.c
@@ -385,6 +385,10 @@ restart:
 		bool empty = false;
 		int cur;
 
+		if (test_bit(MT76_SCANNING, &dev->state) ||
+		    test_bit(MT76_RESET, &dev->state))
+			return -EBUSY;
+
 		mtxq = list_first_entry(&hwq->swq, struct mt76_txq, list);
 		if (mtxq->send_bar && mtxq->aggr) {
 			struct ieee80211_txq *txq = mtxq_to_txq(mtxq);
-- 
cgit v1.2.3-59-g8ed1b


From c126e1995f79fb487aa99bbead0f23716b43db01 Mon Sep 17 00:00:00 2001
From: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
Date: Tue, 24 Apr 2018 15:18:02 +0200
Subject: mwifiex: fix mwifiex_hard_start_xmit()'s return type

The method ndo_start_xmit() is defined as returning an 'netdev_tx_t',
which is a typedef for an enum type, but the implementation in this
driver returns an 'int'.

Fix this by returning 'netdev_tx_t' in this driver too.

Signed-off-by: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/marvell/mwifiex/main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c
index b6484582845a..802fb3ecce97 100644
--- a/drivers/net/wireless/marvell/mwifiex/main.c
+++ b/drivers/net/wireless/marvell/mwifiex/main.c
@@ -858,7 +858,7 @@ mwifiex_clone_skb_for_tx_status(struct mwifiex_private *priv,
 /*
  * CFG802.11 network device handler for data transmission.
  */
-static int
+static netdev_tx_t
 mwifiex_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct mwifiex_private *priv = mwifiex_netdev_get_priv(dev);
-- 
cgit v1.2.3-59-g8ed1b


From 307857db47ebdcd56ecf26ff084cf9b966a67e17 Mon Sep 17 00:00:00 2001
From: Xinming Hu <huxm@marvell.com>
Date: Wed, 25 Apr 2018 17:38:12 +0800
Subject: mwifiex: make firmware mac address consistent with host configuration

For user configurated mac address, directly set to firmware with no change.

Signed-off-by: Xinming Hu <huxm@marvell.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/marvell/mwifiex/cfg80211.c |  4 +--
 drivers/net/wireless/marvell/mwifiex/main.c     | 38 ++++++++++++++-----------
 drivers/net/wireless/marvell/mwifiex/main.h     |  3 +-
 3 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
index 4857b75e54a7..54a2297010d2 100644
--- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c
+++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
@@ -929,7 +929,7 @@ mwifiex_init_new_priv_params(struct mwifiex_private *priv,
 	adapter->rx_locked = false;
 	spin_unlock_irqrestore(&adapter->rx_proc_lock, flags);
 
-	mwifiex_set_mac_address(priv, dev);
+	mwifiex_set_mac_address(priv, dev, false, NULL);
 
 	return 0;
 }
@@ -2979,7 +2979,7 @@ struct wireless_dev *mwifiex_add_virtual_intf(struct wiphy *wiphy,
 	priv->netdev = dev;
 
 	if (!adapter->mfg_mode) {
-		mwifiex_set_mac_address(priv, dev);
+		mwifiex_set_mac_address(priv, dev, false, NULL);
 
 		ret = mwifiex_send_cmd(priv, HostCmd_CMD_SET_BSS_MODE,
 				       HostCmd_ACT_GEN_SET, 0, NULL, true);
diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c
index 802fb3ecce97..99eaff97315c 100644
--- a/drivers/net/wireless/marvell/mwifiex/main.c
+++ b/drivers/net/wireless/marvell/mwifiex/main.c
@@ -940,28 +940,35 @@ mwifiex_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
 }
 
 int mwifiex_set_mac_address(struct mwifiex_private *priv,
-			    struct net_device *dev)
+			    struct net_device *dev, bool external,
+			    u8 *new_mac)
 {
 	int ret;
 	u64 mac_addr, old_mac_addr;
 
-	if (priv->bss_type == MWIFIEX_BSS_TYPE_ANY)
-		return -ENOTSUPP;
+	old_mac_addr = ether_addr_to_u64(priv->curr_addr);
 
-	mac_addr = ether_addr_to_u64(priv->curr_addr);
-	old_mac_addr = mac_addr;
+	if (external) {
+		mac_addr = ether_addr_to_u64(new_mac);
+	} else {
+		/* Internal mac address change */
+		if (priv->bss_type == MWIFIEX_BSS_TYPE_ANY)
+			return -ENOTSUPP;
 
-	if (priv->bss_type == MWIFIEX_BSS_TYPE_P2P)
-		mac_addr |= BIT_ULL(MWIFIEX_MAC_LOCAL_ADMIN_BIT);
+		mac_addr = old_mac_addr;
 
-	if (mwifiex_get_intf_num(priv->adapter, priv->bss_type) > 1) {
-		/* Set mac address based on bss_type/bss_num */
-		mac_addr ^= BIT_ULL(priv->bss_type + 8);
-		mac_addr += priv->bss_num;
-	}
+		if (priv->bss_type == MWIFIEX_BSS_TYPE_P2P)
+			mac_addr |= BIT_ULL(MWIFIEX_MAC_LOCAL_ADMIN_BIT);
 
-	if (mac_addr == old_mac_addr)
-		goto done;
+		if (mwifiex_get_intf_num(priv->adapter, priv->bss_type) > 1) {
+			/* Set mac address based on bss_type/bss_num */
+			mac_addr ^= BIT_ULL(priv->bss_type + 8);
+			mac_addr += priv->bss_num;
+		}
+
+		if (mac_addr == old_mac_addr)
+			goto done;
+	}
 
 	u64_to_ether_addr(mac_addr, priv->curr_addr);
 
@@ -989,8 +996,7 @@ mwifiex_ndo_set_mac_address(struct net_device *dev, void *addr)
 	struct mwifiex_private *priv = mwifiex_netdev_get_priv(dev);
 	struct sockaddr *hw_addr = addr;
 
-	memcpy(priv->curr_addr, hw_addr->sa_data, ETH_ALEN);
-	return mwifiex_set_mac_address(priv, dev);
+	return mwifiex_set_mac_address(priv, dev, true, hw_addr->sa_data);
 }
 
 /*
diff --git a/drivers/net/wireless/marvell/mwifiex/main.h b/drivers/net/wireless/marvell/mwifiex/main.h
index 9bde181700dc..7c95c1279548 100644
--- a/drivers/net/wireless/marvell/mwifiex/main.h
+++ b/drivers/net/wireless/marvell/mwifiex/main.h
@@ -1709,7 +1709,8 @@ void mwifiex_process_multi_chan_event(struct mwifiex_private *priv,
 				      struct sk_buff *event_skb);
 void mwifiex_multi_chan_resync(struct mwifiex_adapter *adapter);
 int mwifiex_set_mac_address(struct mwifiex_private *priv,
-			    struct net_device *dev);
+			    struct net_device *dev,
+			    bool external, u8 *new_mac);
 void mwifiex_devdump_tmo_func(unsigned long function_context);
 
 #ifdef CONFIG_DEBUG_FS
-- 
cgit v1.2.3-59-g8ed1b


From c3b2a34b82db21bf46e00fbfa1b3c3ccaa0ec18f Mon Sep 17 00:00:00 2001
From: Xinming Hu <huxm@marvell.com>
Date: Wed, 25 Apr 2018 17:38:13 +0800
Subject: mwifiex: always configure firmware mac address during changing
 virtual interface

When interface type changed, firmware using a new connction pointer.
We need Re-configure the mac address.

Signed-off-by: Xinming Hu <huxm@marvell.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/marvell/mwifiex/main.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c
index 99eaff97315c..40c80e38a74b 100644
--- a/drivers/net/wireless/marvell/mwifiex/main.c
+++ b/drivers/net/wireless/marvell/mwifiex/main.c
@@ -965,9 +965,6 @@ int mwifiex_set_mac_address(struct mwifiex_private *priv,
 			mac_addr ^= BIT_ULL(priv->bss_type + 8);
 			mac_addr += priv->bss_num;
 		}
-
-		if (mac_addr == old_mac_addr)
-			goto done;
 	}
 
 	u64_to_ether_addr(mac_addr, priv->curr_addr);
@@ -983,7 +980,6 @@ int mwifiex_set_mac_address(struct mwifiex_private *priv,
 		return ret;
 	}
 
-done:
 	ether_addr_copy(dev->dev_addr, priv->curr_addr);
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 7cce13954f0e6b16b4c18a3985387018f2e3e44e Mon Sep 17 00:00:00 2001
From: Xinming Hu <huxm@marvell.com>
Date: Wed, 25 Apr 2018 17:38:14 +0800
Subject: mwifiex: keep user configured mac address during changing virtual
 interface

During changing virtual interface, keep using previous net device
mac address.

Signed-off-by: Xinming Hu <huxm@marvell.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/marvell/mwifiex/main.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c
index 40c80e38a74b..510f6b8e717d 100644
--- a/drivers/net/wireless/marvell/mwifiex/main.c
+++ b/drivers/net/wireless/marvell/mwifiex/main.c
@@ -1333,7 +1333,10 @@ void mwifiex_init_priv_params(struct mwifiex_private *priv,
 	priv->assocresp_idx = MWIFIEX_AUTO_IDX_MASK;
 	priv->gen_idx = MWIFIEX_AUTO_IDX_MASK;
 	priv->num_tx_timeout = 0;
-	ether_addr_copy(priv->curr_addr, priv->adapter->perm_addr);
+	if (is_valid_ether_addr(dev->dev_addr))
+		ether_addr_copy(priv->curr_addr, dev->dev_addr);
+	else
+		ether_addr_copy(priv->curr_addr, priv->adapter->perm_addr);
 
 	if (GET_BSS_ROLE(priv) == MWIFIEX_BSS_ROLE_STA ||
 	    GET_BSS_ROLE(priv) == MWIFIEX_BSS_ROLE_UAP) {
-- 
cgit v1.2.3-59-g8ed1b


From 6e1d8d1470b2c9335715f7d52e864f0bd91a5f59 Mon Sep 17 00:00:00 2001
From: Jia-Ju Bai <baijiaju1990@gmail.com>
Date: Tue, 10 Apr 2018 21:54:19 +0800
Subject: net: wireless: b43legacy: Replace GFP_ATOMIC with GFP_KERNEL in
 dma_tx_fragment

dma_tx_fragment() is never called in atomic context.

dma_tx_fragment() is only called by b43legacy_dma_tx(), which is
only called by b43legacy_tx_work().
b43legacy_tx_work() is only set a parameter of INIT_WORK() in
b43legacy_wireless_init().

Despite never getting called from atomic context,
dma_tx_fragment() calls alloc_skb() with GFP_ATOMIC,
which does not sleep for allocation.
GFP_ATOMIC is not necessary and can be replaced with GFP_KERNEL,
which can sleep and improve the possibility of sucessful allocation.

This is found by a static analysis tool named DCNS written by myself.
And I also manually check it.

Signed-off-by: Jia-Ju Bai <baijiaju1990@gmail.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/broadcom/b43legacy/dma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/broadcom/b43legacy/dma.c b/drivers/net/wireless/broadcom/b43legacy/dma.c
index cfa617ddb2f1..2f0c64cef65f 100644
--- a/drivers/net/wireless/broadcom/b43legacy/dma.c
+++ b/drivers/net/wireless/broadcom/b43legacy/dma.c
@@ -1064,7 +1064,7 @@ static int dma_tx_fragment(struct b43legacy_dmaring *ring,
 	meta->dmaaddr = map_descbuffer(ring, skb->data, skb->len, 1);
 	/* create a bounce buffer in zone_dma on mapping failure. */
 	if (b43legacy_dma_mapping_error(ring, meta->dmaaddr, skb->len, 1)) {
-		bounce_skb = alloc_skb(skb->len, GFP_ATOMIC | GFP_DMA);
+		bounce_skb = alloc_skb(skb->len, GFP_KERNEL | GFP_DMA);
 		if (!bounce_skb) {
 			ring->current_slot = old_top_slot;
 			ring->used_slots = old_used_slots;
-- 
cgit v1.2.3-59-g8ed1b


From 3ff6ee28535f2f6698cc2b3acfd2af6942036067 Mon Sep 17 00:00:00 2001
From: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
Date: Tue, 24 Apr 2018 15:18:04 +0200
Subject: qtnfmac: fix qtnf_netdev_hard_start_xmit()'s return type

The method ndo_start_xmit() is defined as returning an 'netdev_tx_t',
which is a typedef for an enum type, but the implementation in this
driver returns an 'int'.

Fix this by returning 'netdev_tx_t' in this driver too.

Signed-off-by: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
Reviewed-by: Sergey Matyukevich <sergey.matyukevich.os@quantenna.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/quantenna/qtnfmac/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/quantenna/qtnfmac/core.c b/drivers/net/wireless/quantenna/qtnfmac/core.c
index cf26c15a84f8..b3bfb4faa918 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/core.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/core.c
@@ -76,7 +76,7 @@ static int qtnf_netdev_close(struct net_device *ndev)
 
 /* Netdev handler for data transmission.
  */
-static int
+static netdev_tx_t
 qtnf_netdev_hard_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 {
 	struct qtnf_vif *vif;
-- 
cgit v1.2.3-59-g8ed1b


From 60f36637bbbdeea199d644eda284dba5144d2377 Mon Sep 17 00:00:00 2001
From: Eyal Reizer <eyalreizer@gmail.com>
Date: Thu, 26 Apr 2018 08:47:11 +0300
Subject: wlcore: sdio: allow pm to handle sdio power

pm_runtime handles sdio power on and power off transitions.
An old workaround for trying to control the power explicitly from the
driver was in fact causing failures on suspend/resume as the mmc layer
already power the module on resume.

In case of resume pm_runtime_get sync returns a positive device's usage
count causing the driver to try an re-initialize an already initialized
device. This was causing sdio bus failure on resume.

Remove this manual power on/off sequence as it is in-fact not needed.

Signed-off-by: Eyal Reizer <eyalr@ti.com>
Acked-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ti/wlcore/sdio.c | 27 ++++++---------------------
 1 file changed, 6 insertions(+), 21 deletions(-)

diff --git a/drivers/net/wireless/ti/wlcore/sdio.c b/drivers/net/wireless/ti/wlcore/sdio.c
index 1f727babbea0..6dbe61d47dc3 100644
--- a/drivers/net/wireless/ti/wlcore/sdio.c
+++ b/drivers/net/wireless/ti/wlcore/sdio.c
@@ -155,17 +155,11 @@ static int wl12xx_sdio_power_on(struct wl12xx_sdio_glue *glue)
 	struct mmc_card *card = func->card;
 
 	ret = pm_runtime_get_sync(&card->dev);
-	if (ret) {
-		/*
-		 * Runtime PM might be temporarily disabled, or the device
-		 * might have a positive reference counter. Make sure it is
-		 * really powered on.
-		 */
-		ret = mmc_power_restore_host(card->host);
-		if (ret < 0) {
-			pm_runtime_put_sync(&card->dev);
-			goto out;
-		}
+	if (ret < 0) {
+		pm_runtime_put_noidle(&card->dev);
+		dev_err(glue->dev, "%s: failed to get_sync(%d)\n",
+			__func__, ret);
+		goto out;
 	}
 
 	sdio_claim_host(func);
@@ -178,7 +172,6 @@ out:
 
 static int wl12xx_sdio_power_off(struct wl12xx_sdio_glue *glue)
 {
-	int ret;
 	struct sdio_func *func = dev_to_sdio_func(glue->dev);
 	struct mmc_card *card = func->card;
 
@@ -186,16 +179,8 @@ static int wl12xx_sdio_power_off(struct wl12xx_sdio_glue *glue)
 	sdio_disable_func(func);
 	sdio_release_host(func);
 
-	/* Power off the card manually in case it wasn't powered off above */
-	ret = mmc_power_save_host(card->host);
-	if (ret < 0)
-		goto out;
-
 	/* Let runtime PM know the card is powered off */
-	pm_runtime_put_sync(&card->dev);
-
-out:
-	return ret;
+	return pm_runtime_put_sync(&card->dev);
 }
 
 static int wl12xx_sdio_set_power(struct device *child, bool enable)
-- 
cgit v1.2.3-59-g8ed1b


From e1fd7ceec194225a822760a40dfa73efdd59881b Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Thu, 26 Apr 2018 08:01:38 -0500
Subject: rsi_91x: fix structurally dead code

Function rsi_hal_key_config returns before reaching code at line
922 if (status), hence this code is structurally dead.

Fix this by storing the value returned by rsi_hal_load_key
into _status_ for its further evaluation and use.

Addresses-Coverity-ID: 1468409 ("Structurally dead code")
Fixes: 4fd6c4762f37 ("rsi: roaming enhancements")
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/rsi/rsi_91x_mac80211.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/wireless/rsi/rsi_91x_mac80211.c b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
index 766d874cc6e2..80e7f4f4f188 100644
--- a/drivers/net/wireless/rsi/rsi_91x_mac80211.c
+++ b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
@@ -911,14 +911,14 @@ static int rsi_hal_key_config(struct ieee80211_hw *hw,
 		}
 	}
 
-	return rsi_hal_load_key(adapter->priv,
-				key->key,
-				key->keylen,
-				key_type,
-				key->keyidx,
-				key->cipher,
-				sta_id,
-				vif);
+	status = rsi_hal_load_key(adapter->priv,
+				  key->key,
+				  key->keylen,
+				  key_type,
+				  key->keyidx,
+				  key->cipher,
+				  sta_id,
+				  vif);
 	if (status)
 		return status;
 
-- 
cgit v1.2.3-59-g8ed1b


From 48c6b5c9c1180b7f8b35edeaef9b3aa3e3b6c9d5 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Thu, 26 Apr 2018 08:13:24 -0500
Subject: rsi_91x: fix uninitialized variable

There is a potential execution path in which variable ret is returned
without being properly initialized previously.

Fix this by storing the value returned by function
rsi_usb_master_reg_write into _ret_.

Addresses-Coverity-ID: 1468407 ("Uninitialized scalar variable")
Fixes: 16d3bb7b2f37 ("rsi: disable fw watchdog timer during reset")
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/rsi/rsi_91x_usb.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wireless/rsi/rsi_91x_usb.c b/drivers/net/wireless/rsi/rsi_91x_usb.c
index b065438f51b2..6ce6b754df12 100644
--- a/drivers/net/wireless/rsi/rsi_91x_usb.c
+++ b/drivers/net/wireless/rsi/rsi_91x_usb.c
@@ -687,9 +687,10 @@ static int rsi_reset_card(struct rsi_hw *adapter)
 	 */
 	msleep(100);
 
-	if (rsi_usb_master_reg_write(adapter, SWBL_REGOUT,
-				     RSI_FW_WDT_DISABLE_REQ,
-				     RSI_COMMON_REG_SIZE) < 0) {
+	ret = rsi_usb_master_reg_write(adapter, SWBL_REGOUT,
+				       RSI_FW_WDT_DISABLE_REQ,
+				       RSI_COMMON_REG_SIZE);
+	if (ret < 0) {
 		rsi_dbg(ERR_ZONE, "Disabling firmware watchdog timer failed\n");
 		goto fail;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From cc282a0c5f0823bfcf084103445c69894129227c Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Wed, 18 Apr 2018 12:47:50 +0100
Subject: rt2x00: fix spelling mistake in various macros, UKNOWN -> UNKNOWN

Rename several macros that contain mispellings of UNKNOWN

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: Stanislaw Gruszka <sgruszka@redhat.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ralink/rt2x00/rt2800.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800.h b/drivers/net/wireless/ralink/rt2x00/rt2800.h
index 7a133e94b1bf..b05ed2f3025a 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2800.h
+++ b/drivers/net/wireless/ralink/rt2x00/rt2800.h
@@ -1194,10 +1194,10 @@
 #define TX_PWR_CFG_3_MCS13		FIELD32(0x000000f0)
 #define TX_PWR_CFG_3_MCS14		FIELD32(0x00000f00)
 #define TX_PWR_CFG_3_MCS15		FIELD32(0x0000f000)
-#define TX_PWR_CFG_3_UKNOWN1		FIELD32(0x000f0000)
-#define TX_PWR_CFG_3_UKNOWN2		FIELD32(0x00f00000)
-#define TX_PWR_CFG_3_UKNOWN3		FIELD32(0x0f000000)
-#define TX_PWR_CFG_3_UKNOWN4		FIELD32(0xf0000000)
+#define TX_PWR_CFG_3_UNKNOWN1		FIELD32(0x000f0000)
+#define TX_PWR_CFG_3_UNKNOWN2		FIELD32(0x00f00000)
+#define TX_PWR_CFG_3_UNKNOWN3		FIELD32(0x0f000000)
+#define TX_PWR_CFG_3_UNKNOWN4		FIELD32(0xf0000000)
 /* bits for 3T devices */
 #define TX_PWR_CFG_3_MCS12_CH0		FIELD32(0x0000000f)
 #define TX_PWR_CFG_3_MCS12_CH1		FIELD32(0x000000f0)
@@ -1217,10 +1217,10 @@
  * TX_PWR_CFG_4:
  */
 #define TX_PWR_CFG_4			0x1324
-#define TX_PWR_CFG_4_UKNOWN5		FIELD32(0x0000000f)
-#define TX_PWR_CFG_4_UKNOWN6		FIELD32(0x000000f0)
-#define TX_PWR_CFG_4_UKNOWN7		FIELD32(0x00000f00)
-#define TX_PWR_CFG_4_UKNOWN8		FIELD32(0x0000f000)
+#define TX_PWR_CFG_4_UNKNOWN5		FIELD32(0x0000000f)
+#define TX_PWR_CFG_4_UNKNOWN6		FIELD32(0x000000f0)
+#define TX_PWR_CFG_4_UNKNOWN7		FIELD32(0x00000f00)
+#define TX_PWR_CFG_4_UNKNOWN8		FIELD32(0x0000f000)
 /* bits for 3T devices */
 #define TX_PWR_CFG_4_STBC4_CH0		FIELD32(0x0000000f)
 #define TX_PWR_CFG_4_STBC4_CH1		FIELD32(0x000000f0)
-- 
cgit v1.2.3-59-g8ed1b


From 3ea0a58cf9cf66e53f52184446fd881f828709e4 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Sat, 28 Apr 2018 23:31:08 +0100
Subject: ipw2100: fix spelling mistake: "decsribed" -> "described"

Trivial fix to spelling mistake in comment and in the ord_data text

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/intel/ipw2x00/ipw2100.c | 2 +-
 drivers/net/wireless/intel/ipw2x00/ipw2100.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c
index 236b52423506..7c4f550a1475 100644
--- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c
+++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c
@@ -3732,7 +3732,7 @@ IPW2100_ORD(STAT_TX_HOST_REQUESTS, "requested Host Tx's (MSDU)"),
 	    IPW2100_ORD(ASSOCIATED_AP_PTR,
 				"0 if not associated, else pointer to AP table entry"),
 	    IPW2100_ORD(AVAILABLE_AP_CNT,
-				"AP's decsribed in the AP table"),
+				"AP's described in the AP table"),
 	    IPW2100_ORD(AP_LIST_PTR, "Ptr to list of available APs"),
 	    IPW2100_ORD(STAT_AP_ASSNS, "associations"),
 	    IPW2100_ORD(STAT_ASSN_FAIL, "association failures"),
diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.h b/drivers/net/wireless/intel/ipw2x00/ipw2100.h
index 193947865efd..ce3e35f6b60f 100644
--- a/drivers/net/wireless/intel/ipw2x00/ipw2100.h
+++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.h
@@ -1009,7 +1009,7 @@ typedef enum _ORDINAL_TABLE_1 {	// NS - means Not Supported by FW
 	IPW_ORD_STAT_PERCENT_RETRIES,	// current calculation of % missed tx retries
 	IPW_ORD_ASSOCIATED_AP_PTR,	// If associated, this is ptr to the associated
 	// AP table entry. set to 0 if not associated
-	IPW_ORD_AVAILABLE_AP_CNT,	// # of AP's decsribed in the AP table
+	IPW_ORD_AVAILABLE_AP_CNT,	// # of AP's described in the AP table
 	IPW_ORD_AP_LIST_PTR,	// Ptr to list of available APs
 	IPW_ORD_STAT_AP_ASSNS,	// # of associations
 	IPW_ORD_STAT_ASSN_FAIL,	// # of association failures
-- 
cgit v1.2.3-59-g8ed1b


From cb746e47837ad0f35c8ae28e9aacc8eb07916d2a Mon Sep 17 00:00:00 2001
From: Arend Van Spriel <arend.vanspriel@broadcom.com>
Date: Thu, 26 Apr 2018 12:16:47 +0200
Subject: brcmfmac: check p2pdev mac address uniqueness

The mac address for p2pdev must be different from the primary interface
due to firmware requirement. Add an explicit check for this requirement
if user-space provides a mac address.

Reviewed-by: Hante Meuleman <hante.meuleman@broadcom.com>
Reviewed-by: Pieter-Paul Giesberts <pieter-paul.giesberts@broadcom.com>
Reviewed-by: Franky Lin <franky.lin@broadcom.com>
Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c
index bcef208a81a5..4b2149b48362 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c
@@ -2073,6 +2073,13 @@ static struct wireless_dev *brcmf_p2p_create_p2pdev(struct brcmf_p2p_info *p2p,
 	}
 
 	pri_ifp = p2p->bss_idx[P2PAPI_BSSCFG_PRIMARY].vif->ifp;
+
+	/* firmware requires unique mac address for p2pdev interface */
+	if (addr && ether_addr_equal(addr, pri_ifp->mac_addr)) {
+		brcmf_err("discovery vif must be different from primary interface\n");
+		return ERR_PTR(-EINVAL);
+	}
+
 	brcmf_p2p_generate_bss_mac(p2p, addr);
 	brcmf_p2p_set_firmware(pri_ifp, p2p->dev_addr);
 
-- 
cgit v1.2.3-59-g8ed1b


From 7742fce4c007141617dab9bcb90034b3c0fe2347 Mon Sep 17 00:00:00 2001
From: Franky Lin <franky.lin@broadcom.com>
Date: Thu, 26 Apr 2018 12:18:35 +0200
Subject: brcmfmac: reports boottime_ns while informing bss

Provides a timestamp in bss information so user space can see when the
bss info was updated. Since tsf is not available from the dongle events
boottime is reported instead.

Reported-by: Dmitry Shmidt <dimitrysh@google.com>
Reviewed-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Franky Lin <franky.lin@broadcom.com>
Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 .../broadcom/brcm80211/brcmfmac/cfg80211.c         | 26 +++++++++++-----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
index 89b86251910e..5d891cb4d625 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
@@ -2728,7 +2728,6 @@ static s32 brcmf_inform_single_bss(struct brcmf_cfg80211_info *cfg,
 				   struct brcmf_bss_info_le *bi)
 {
 	struct wiphy *wiphy = cfg_to_wiphy(cfg);
-	struct ieee80211_channel *notify_channel;
 	struct cfg80211_bss *bss;
 	struct ieee80211_supported_band *band;
 	struct brcmu_chan ch;
@@ -2738,7 +2737,7 @@ static s32 brcmf_inform_single_bss(struct brcmf_cfg80211_info *cfg,
 	u16 notify_interval;
 	u8 *notify_ie;
 	size_t notify_ielen;
-	s32 notify_signal;
+	struct cfg80211_inform_bss bss_data = { 0 };
 
 	if (le32_to_cpu(bi->length) > WL_BSS_INFO_MAX) {
 		brcmf_err("Bss info is larger than buffer. Discarding\n");
@@ -2758,27 +2757,28 @@ static s32 brcmf_inform_single_bss(struct brcmf_cfg80211_info *cfg,
 		band = wiphy->bands[NL80211_BAND_5GHZ];
 
 	freq = ieee80211_channel_to_frequency(channel, band->band);
-	notify_channel = ieee80211_get_channel(wiphy, freq);
+	bss_data.chan = ieee80211_get_channel(wiphy, freq);
+	bss_data.scan_width = NL80211_BSS_CHAN_WIDTH_20;
+	bss_data.boottime_ns = ktime_to_ns(ktime_get_boottime());
 
 	notify_capability = le16_to_cpu(bi->capability);
 	notify_interval = le16_to_cpu(bi->beacon_period);
 	notify_ie = (u8 *)bi + le16_to_cpu(bi->ie_offset);
 	notify_ielen = le32_to_cpu(bi->ie_length);
-	notify_signal = (s16)le16_to_cpu(bi->RSSI) * 100;
+	bss_data.signal = (s16)le16_to_cpu(bi->RSSI) * 100;
 
 	brcmf_dbg(CONN, "bssid: %pM\n", bi->BSSID);
 	brcmf_dbg(CONN, "Channel: %d(%d)\n", channel, freq);
 	brcmf_dbg(CONN, "Capability: %X\n", notify_capability);
 	brcmf_dbg(CONN, "Beacon interval: %d\n", notify_interval);
-	brcmf_dbg(CONN, "Signal: %d\n", notify_signal);
-
-	bss = cfg80211_inform_bss(wiphy, notify_channel,
-				  CFG80211_BSS_FTYPE_UNKNOWN,
-				  (const u8 *)bi->BSSID,
-				  0, notify_capability,
-				  notify_interval, notify_ie,
-				  notify_ielen, notify_signal,
-				  GFP_KERNEL);
+	brcmf_dbg(CONN, "Signal: %d\n", bss_data.signal);
+
+	bss = cfg80211_inform_bss_data(wiphy, &bss_data,
+				       CFG80211_BSS_FTYPE_UNKNOWN,
+				       (const u8 *)bi->BSSID,
+				       0, notify_capability,
+				       notify_interval, notify_ie,
+				       notify_ielen, GFP_KERNEL);
 
 	if (!bss)
 		return -ENOMEM;
-- 
cgit v1.2.3-59-g8ed1b


From aed14219067ab96e5eeb7730e9bceed10d9be989 Mon Sep 17 00:00:00 2001
From: Franky Lin <franky.lin@broadcom.com>
Date: Thu, 26 Apr 2018 12:16:48 +0200
Subject: brcmfmac: use nl80211_band directly to get ieee80211 channel

The enum nl80211_band used to retrieve wiphy->bands is the same as
wiphy->bands->band which is checked by wiphy_register(). So it can be used
directly as parameter of ieee80211_channel_to_frequency().

Reviewed-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Franky Lin <franky.lin@broadcom.com>
Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
index 5d891cb4d625..ff6b3514c501 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
@@ -2729,7 +2729,7 @@ static s32 brcmf_inform_single_bss(struct brcmf_cfg80211_info *cfg,
 {
 	struct wiphy *wiphy = cfg_to_wiphy(cfg);
 	struct cfg80211_bss *bss;
-	struct ieee80211_supported_band *band;
+	enum nl80211_band band;
 	struct brcmu_chan ch;
 	u16 channel;
 	u32 freq;
@@ -2752,11 +2752,11 @@ static s32 brcmf_inform_single_bss(struct brcmf_cfg80211_info *cfg,
 	channel = bi->ctl_ch;
 
 	if (channel <= CH_MAX_2G_CHANNEL)
-		band = wiphy->bands[NL80211_BAND_2GHZ];
+		band = NL80211_BAND_2GHZ;
 	else
-		band = wiphy->bands[NL80211_BAND_5GHZ];
+		band = NL80211_BAND_5GHZ;
 
-	freq = ieee80211_channel_to_frequency(channel, band->band);
+	freq = ieee80211_channel_to_frequency(channel, band);
 	bss_data.chan = ieee80211_get_channel(wiphy, freq);
 	bss_data.scan_width = NL80211_BSS_CHAN_WIDTH_20;
 	bss_data.boottime_ns = ktime_to_ns(ktime_get_boottime());
-- 
cgit v1.2.3-59-g8ed1b


From ff68c9f9c06d1fd437c8f90fc05ca28c47f7d85e Mon Sep 17 00:00:00 2001
From: Arend Van Spriel <arend.vanspriel@broadcom.com>
Date: Thu, 26 Apr 2018 12:16:49 +0200
Subject: brcmfmac: constify firmware mapping tables

The information in the firmware mapping does not need to be modified
so it can be static const.

Reviewed-by: Hante Meuleman <hante.meuleman@broadcom.com>
Reviewed-by: Pieter-Paul Giesberts <pieter-paul.giesberts@broadcom.com>
Reviewed-by: Franky Lin <franky.lin@broadcom.com>
Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/firmware.c | 2 +-
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/firmware.h | 2 +-
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c     | 2 +-
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c     | 2 +-
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c      | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/firmware.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/firmware.c
index 94e177d7c9b5..9095b830ae4d 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/firmware.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/firmware.c
@@ -634,7 +634,7 @@ int brcmf_fw_get_firmwares(struct device *dev, struct brcmf_fw_request *req,
 
 struct brcmf_fw_request *
 brcmf_fw_alloc_request(u32 chip, u32 chiprev,
-		       struct brcmf_firmware_mapping mapping_table[],
+		       const struct brcmf_firmware_mapping mapping_table[],
 		       u32 table_size, struct brcmf_fw_name *fwnames,
 		       u32 n_fwnames)
 {
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/firmware.h b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/firmware.h
index 79a21095c349..2893e56910f0 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/firmware.h
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/firmware.h
@@ -80,7 +80,7 @@ struct brcmf_fw_name {
 
 struct brcmf_fw_request *
 brcmf_fw_alloc_request(u32 chip, u32 chiprev,
-		       struct brcmf_firmware_mapping mapping_table[],
+		       const struct brcmf_firmware_mapping mapping_table[],
 		       u32 table_size, struct brcmf_fw_name *fwnames,
 		       u32 n_fwnames);
 
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
index 6e25ff94dd07..9708647b99e7 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
@@ -59,7 +59,7 @@ BRCMF_FW_DEF(4366B, "brcmfmac4366b-pcie");
 BRCMF_FW_DEF(4366C, "brcmfmac4366c-pcie");
 BRCMF_FW_DEF(4371, "brcmfmac4371-pcie");
 
-static struct brcmf_firmware_mapping brcmf_pcie_fwnames[] = {
+static const struct brcmf_firmware_mapping brcmf_pcie_fwnames[] = {
 	BRCMF_FW_ENTRY(BRCM_CC_43602_CHIP_ID, 0xFFFFFFFF, 43602),
 	BRCMF_FW_ENTRY(BRCM_CC_43465_CHIP_ID, 0xFFFFFFF0, 4366C),
 	BRCMF_FW_ENTRY(BRCM_CC_4350_CHIP_ID, 0x000000FF, 4350C),
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
index 1037df7297bb..412a05b9a2b2 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
@@ -619,7 +619,7 @@ BRCMF_FW_DEF(4354, "brcmfmac4354-sdio");
 BRCMF_FW_DEF(4356, "brcmfmac4356-sdio");
 BRCMF_FW_DEF(4373, "brcmfmac4373-sdio");
 
-static struct brcmf_firmware_mapping brcmf_sdio_fwnames[] = {
+static const struct brcmf_firmware_mapping brcmf_sdio_fwnames[] = {
 	BRCMF_FW_ENTRY(BRCM_CC_43143_CHIP_ID, 0xFFFFFFFF, 43143),
 	BRCMF_FW_ENTRY(BRCM_CC_43241_CHIP_ID, 0x0000001F, 43241B0),
 	BRCMF_FW_ENTRY(BRCM_CC_43241_CHIP_ID, 0x00000020, 43241B4),
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c
index a0873adcc01c..a4308c6e72d7 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c
@@ -52,7 +52,7 @@ BRCMF_FW_DEF(43242A, "brcmfmac43242a");
 BRCMF_FW_DEF(43569, "brcmfmac43569");
 BRCMF_FW_DEF(4373, "brcmfmac4373");
 
-static struct brcmf_firmware_mapping brcmf_usb_fwnames[] = {
+static const struct brcmf_firmware_mapping brcmf_usb_fwnames[] = {
 	BRCMF_FW_ENTRY(BRCM_CC_43143_CHIP_ID, 0xFFFFFFFF, 43143),
 	BRCMF_FW_ENTRY(BRCM_CC_43235_CHIP_ID, 0x00000008, 43236B),
 	BRCMF_FW_ENTRY(BRCM_CC_43236_CHIP_ID, 0x00000008, 43236B),
-- 
cgit v1.2.3-59-g8ed1b


From 84ad327d18debe19b8d509059b61db445d048b02 Mon Sep 17 00:00:00 2001
From: Franky Lin <franky.lin@broadcom.com>
Date: Thu, 26 Apr 2018 12:16:50 +0200
Subject: brcmfmac: add hostready indication

A hostready signal is introduced to inform firmware through mailbox
doorbell1 when common ring initialized or D3 exited.

Reviewed-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Franky Lin <franky.lin@broadcom.com>
Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
index 9708647b99e7..71b69af9294c 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
@@ -105,7 +105,8 @@ static const struct brcmf_firmware_mapping brcmf_pcie_fwnames[] = {
 #define BRCMF_PCIE_PCIE2REG_MAILBOXMASK		0x4C
 #define BRCMF_PCIE_PCIE2REG_CONFIGADDR		0x120
 #define BRCMF_PCIE_PCIE2REG_CONFIGDATA		0x124
-#define BRCMF_PCIE_PCIE2REG_H2D_MAILBOX		0x140
+#define BRCMF_PCIE_PCIE2REG_H2D_MAILBOX_0	0x140
+#define BRCMF_PCIE_PCIE2REG_H2D_MAILBOX_1	0x144
 
 #define BRCMF_PCIE2_INTA			0x01
 #define BRCMF_PCIE2_INTB			0x02
@@ -140,6 +141,7 @@ static const struct brcmf_firmware_mapping brcmf_pcie_fwnames[] = {
 #define BRCMF_PCIE_SHARED_VERSION_MASK		0x00FF
 #define BRCMF_PCIE_SHARED_DMA_INDEX		0x10000
 #define BRCMF_PCIE_SHARED_DMA_2B_IDX		0x100000
+#define BRCMF_PCIE_SHARED_HOSTRDY_DB1		0x10000000
 
 #define BRCMF_PCIE_FLAGS_HTOD_SPLIT		0x4000
 #define BRCMF_PCIE_FLAGS_DTOH_SPLIT		0x8000
@@ -782,6 +784,12 @@ static void brcmf_pcie_intr_enable(struct brcmf_pciedev_info *devinfo)
 			       BRCMF_PCIE_MB_INT_FN0_1);
 }
 
+static void brcmf_pcie_hostready(struct brcmf_pciedev_info *devinfo)
+{
+	if (devinfo->shared.flags & BRCMF_PCIE_SHARED_HOSTRDY_DB1)
+		brcmf_pcie_write_reg32(devinfo,
+				       BRCMF_PCIE_PCIE2REG_H2D_MAILBOX_1, 1);
+}
 
 static irqreturn_t brcmf_pcie_quick_check_isr(int irq, void *arg)
 {
@@ -924,7 +932,7 @@ static int brcmf_pcie_ring_mb_ring_bell(void *ctx)
 
 	brcmf_dbg(PCIE, "RING !\n");
 	/* Any arbitrary value will do, lets use 1 */
-	brcmf_pcie_write_reg32(devinfo, BRCMF_PCIE_PCIE2REG_H2D_MAILBOX, 1);
+	brcmf_pcie_write_reg32(devinfo, BRCMF_PCIE_PCIE2REG_H2D_MAILBOX_0, 1);
 
 	return 0;
 }
@@ -1728,6 +1736,7 @@ static void brcmf_pcie_setup(struct device *dev, int ret,
 	init_waitqueue_head(&devinfo->mbdata_resp_wait);
 
 	brcmf_pcie_intr_enable(devinfo);
+	brcmf_pcie_hostready(devinfo);
 	if (brcmf_attach(&devinfo->pdev->dev, devinfo->settings) == 0)
 		return;
 
@@ -1950,6 +1959,7 @@ static int brcmf_pcie_pm_leave_D3(struct device *dev)
 		brcmf_pcie_select_core(devinfo, BCMA_CORE_PCIE2);
 		brcmf_bus_change_state(bus, BRCMF_BUS_UP);
 		brcmf_pcie_intr_enable(devinfo);
+		brcmf_pcie_hostready(devinfo);
 		return 0;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From f56324baf329bc9362a52ad77a4a1a0f3356d1bc Mon Sep 17 00:00:00 2001
From: Franky Lin <franky.lin@broadcom.com>
Date: Thu, 26 Apr 2018 12:16:51 +0200
Subject: brcmfmac: coarse support for PCIe shared structure rev7

Revision 7 of PCIe dongle interface increases the item size of tx and rx
complete rings to accommodate extra payload for new feature. This patch
simply bump up the size of these two rings without adding the support
for utilizing the new space. This makes brcmfmac compatible with rev7
firmware.

Reviewed-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Franky Lin <franky.lin@broadcom.com>
Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 .../wireless/broadcom/brcm80211/brcmfmac/msgbuf.h  |  6 ++++--
 .../wireless/broadcom/brcm80211/brcmfmac/pcie.c    | 23 ++++++++++++++++++----
 2 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/msgbuf.h b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/msgbuf.h
index f93ba6be1ef8..692235d25277 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/msgbuf.h
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/msgbuf.h
@@ -27,8 +27,10 @@
 #define BRCMF_H2D_MSGRING_CONTROL_SUBMIT_ITEMSIZE	40
 #define BRCMF_H2D_MSGRING_RXPOST_SUBMIT_ITEMSIZE	32
 #define BRCMF_D2H_MSGRING_CONTROL_COMPLETE_ITEMSIZE	24
-#define BRCMF_D2H_MSGRING_TX_COMPLETE_ITEMSIZE		16
-#define BRCMF_D2H_MSGRING_RX_COMPLETE_ITEMSIZE		32
+#define BRCMF_D2H_MSGRING_TX_COMPLETE_ITEMSIZE_PRE_V7	16
+#define BRCMF_D2H_MSGRING_TX_COMPLETE_ITEMSIZE		24
+#define BRCMF_D2H_MSGRING_RX_COMPLETE_ITEMSIZE_PRE_V7	32
+#define BRCMF_D2H_MSGRING_RX_COMPLETE_ITEMSIZE		40
 #define BRCMF_H2D_TXFLOWRING_ITEMSIZE			48
 
 struct msgbuf_buf_addr {
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
index 71b69af9294c..f0797aeada67 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
@@ -136,8 +136,9 @@ static const struct brcmf_firmware_mapping brcmf_pcie_fwnames[] = {
 						 BRCMF_PCIE_MB_INT_D2H3_DB0 | \
 						 BRCMF_PCIE_MB_INT_D2H3_DB1)
 
+#define BRCMF_PCIE_SHARED_VERSION_7		7
 #define BRCMF_PCIE_MIN_SHARED_VERSION		5
-#define BRCMF_PCIE_MAX_SHARED_VERSION		6
+#define BRCMF_PCIE_MAX_SHARED_VERSION		BRCMF_PCIE_SHARED_VERSION_7
 #define BRCMF_PCIE_SHARED_VERSION_MASK		0x00FF
 #define BRCMF_PCIE_SHARED_DMA_INDEX		0x10000
 #define BRCMF_PCIE_SHARED_DMA_2B_IDX		0x100000
@@ -318,6 +319,14 @@ static const u32 brcmf_ring_max_item[BRCMF_NROF_COMMON_MSGRINGS] = {
 	BRCMF_D2H_MSGRING_RX_COMPLETE_MAX_ITEM
 };
 
+static const u32 brcmf_ring_itemsize_pre_v7[BRCMF_NROF_COMMON_MSGRINGS] = {
+	BRCMF_H2D_MSGRING_CONTROL_SUBMIT_ITEMSIZE,
+	BRCMF_H2D_MSGRING_RXPOST_SUBMIT_ITEMSIZE,
+	BRCMF_D2H_MSGRING_CONTROL_COMPLETE_ITEMSIZE,
+	BRCMF_D2H_MSGRING_TX_COMPLETE_ITEMSIZE_PRE_V7,
+	BRCMF_D2H_MSGRING_RX_COMPLETE_ITEMSIZE_PRE_V7
+};
+
 static const u32 brcmf_ring_itemsize[BRCMF_NROF_COMMON_MSGRINGS] = {
 	BRCMF_H2D_MSGRING_CONTROL_SUBMIT_ITEMSIZE,
 	BRCMF_H2D_MSGRING_RXPOST_SUBMIT_ITEMSIZE,
@@ -1007,8 +1016,14 @@ brcmf_pcie_alloc_dma_and_ring(struct brcmf_pciedev_info *devinfo, u32 ring_id,
 	struct brcmf_pcie_ringbuf *ring;
 	u32 size;
 	u32 addr;
+	const u32 *ring_itemsize_array;
+
+	if (devinfo->shared.version < BRCMF_PCIE_SHARED_VERSION_7)
+		ring_itemsize_array = brcmf_ring_itemsize_pre_v7;
+	else
+		ring_itemsize_array = brcmf_ring_itemsize;
 
-	size = brcmf_ring_max_item[ring_id] * brcmf_ring_itemsize[ring_id];
+	size = brcmf_ring_max_item[ring_id] * ring_itemsize_array[ring_id];
 	dma_buf = brcmf_pcie_init_dmabuffer_for_device(devinfo, size,
 			tcm_ring_phys_addr + BRCMF_RING_MEM_BASE_ADDR_OFFSET,
 			&dma_handle);
@@ -1018,7 +1033,7 @@ brcmf_pcie_alloc_dma_and_ring(struct brcmf_pciedev_info *devinfo, u32 ring_id,
 	addr = tcm_ring_phys_addr + BRCMF_RING_MAX_ITEM_OFFSET;
 	brcmf_pcie_write_tcm16(devinfo, addr, brcmf_ring_max_item[ring_id]);
 	addr = tcm_ring_phys_addr + BRCMF_RING_LEN_ITEMS_OFFSET;
-	brcmf_pcie_write_tcm16(devinfo, addr, brcmf_ring_itemsize[ring_id]);
+	brcmf_pcie_write_tcm16(devinfo, addr, ring_itemsize_array[ring_id]);
 
 	ring = kzalloc(sizeof(*ring), GFP_KERNEL);
 	if (!ring) {
@@ -1027,7 +1042,7 @@ brcmf_pcie_alloc_dma_and_ring(struct brcmf_pciedev_info *devinfo, u32 ring_id,
 		return NULL;
 	}
 	brcmf_commonring_config(&ring->commonring, brcmf_ring_max_item[ring_id],
-				brcmf_ring_itemsize[ring_id], dma_buf);
+				ring_itemsize_array[ring_id], dma_buf);
 	ring->dma_handle = dma_handle;
 	ring->devinfo = devinfo;
 	brcmf_commonring_register_cb(&ring->commonring,
-- 
cgit v1.2.3-59-g8ed1b


From 3bd5a09b529c03bac354c9d48e688ed2aca934fd Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Mon, 30 Apr 2018 11:39:03 +0100
Subject: bpf: fix formatting for bpf_perf_event_read() helper doc

Some edits brought to the last iteration of BPF helper functions
documentation introduced an error with RST formatting. As a result, most
of one paragraph is rendered in bold text when only the name of a helper
should be. Fix it, and fix formatting of another function name in the
same paragraph.

Fixes: c6b5fb8690fa ("bpf: add documentation for eBPF helpers (42-50)")
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 23b334bba1a6..530ff6588d8f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -828,12 +828,12 @@ union bpf_attr {
  *
  * 		Also, be aware that the newer helper
  * 		**bpf_perf_event_read_value**\ () is recommended over
- * 		**bpf_perf_event_read*\ () in general. The latter has some ABI
+ * 		**bpf_perf_event_read**\ () in general. The latter has some ABI
  * 		quirks where error and counter value are used as a return code
  * 		(which is wrong to do since ranges may overlap). This issue is
- * 		fixed with bpf_perf_event_read_value(), which at the same time
- * 		provides more features over the **bpf_perf_event_read**\ ()
- * 		interface. Please refer to the description of
+ * 		fixed with **bpf_perf_event_read_value**\ (), which at the same
+ * 		time provides more features over the **bpf_perf_event_read**\
+ * 		() interface. Please refer to the description of
  * 		**bpf_perf_event_read_value**\ () for details.
  * 	Return
  * 		The value of the perf event counter read from the map, or a
-- 
cgit v1.2.3-59-g8ed1b


From 79552fbc0f9dc20dc022be7cc48eb3761623fa56 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Mon, 30 Apr 2018 11:39:04 +0100
Subject: bpf: fix formatting for bpf_get_stack() helper doc

Fix formatting (indent) for bpf_get_stack() helper documentation, so
that the doc is rendered correctly with the Python script.

Fixes: c195651e565a ("bpf: add bpf_get_stack helper")
Cc: Yonghong Song <yhs@fb.com>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 54 ++++++++++++++++++++++++------------------------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 530ff6588d8f..8daef7326bb7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1770,33 +1770,33 @@ union bpf_attr {
  *
  * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags)
  * 	Description
- *		Return a user or a kernel stack in bpf program provided buffer.
- *		To achieve this, the helper needs *ctx*, which is a pointer
- *		to the context on which the tracing program is executed.
- *		To store the stacktrace, the bpf program provides *buf* with
- *		a nonnegative *size*.
- *
- *		The last argument, *flags*, holds the number of stack frames to
- *		skip (from 0 to 255), masked with
- *		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
- *		the following flags:
- *
- *		**BPF_F_USER_STACK**
- *			Collect a user space stack instead of a kernel stack.
- *		**BPF_F_USER_BUILD_ID**
- *			Collect buildid+offset instead of ips for user stack,
- *			only valid if **BPF_F_USER_STACK** is also specified.
- *
- *		**bpf_get_stack**\ () can collect up to
- *		**PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
- *		to sufficient large buffer size. Note that
- *		this limit can be controlled with the **sysctl** program, and
- *		that it should be manually increased in order to profile long
- *		user stacks (such as stacks for Java programs). To do so, use:
- *
- *	::
- *
- *		# sysctl kernel.perf_event_max_stack=<new value>
+ * 		Return a user or a kernel stack in bpf program provided buffer.
+ * 		To achieve this, the helper needs *ctx*, which is a pointer
+ * 		to the context on which the tracing program is executed.
+ * 		To store the stacktrace, the bpf program provides *buf* with
+ * 		a nonnegative *size*.
+ *
+ * 		The last argument, *flags*, holds the number of stack frames to
+ * 		skip (from 0 to 255), masked with
+ * 		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ * 		the following flags:
+ *
+ * 		**BPF_F_USER_STACK**
+ * 			Collect a user space stack instead of a kernel stack.
+ * 		**BPF_F_USER_BUILD_ID**
+ * 			Collect buildid+offset instead of ips for user stack,
+ * 			only valid if **BPF_F_USER_STACK** is also specified.
+ *
+ * 		**bpf_get_stack**\ () can collect up to
+ * 		**PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
+ * 		to sufficient large buffer size. Note that
+ * 		this limit can be controlled with the **sysctl** program, and
+ * 		that it should be manually increased in order to profile long
+ * 		user stacks (such as stacks for Java programs). To do so, use:
+ *
+ * 		::
+ *
+ * 			# sysctl kernel.perf_event_max_stack=<new value>
  *
  * 	Return
  * 		a non-negative value equal to or less than size on success, or
-- 
cgit v1.2.3-59-g8ed1b


From a56497d3d5cf081e00fde944a1de129442e7b065 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Mon, 30 Apr 2018 11:39:05 +0100
Subject: bpf: update bpf.h uapi header for tools

Bring fixes for eBPF helper documentation formatting to bpf.h under
tools/ as well.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/include/uapi/linux/bpf.h | 62 +++++++++++++++++++++---------------------
 1 file changed, 31 insertions(+), 31 deletions(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 23b334bba1a6..8daef7326bb7 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -828,12 +828,12 @@ union bpf_attr {
  *
  * 		Also, be aware that the newer helper
  * 		**bpf_perf_event_read_value**\ () is recommended over
- * 		**bpf_perf_event_read*\ () in general. The latter has some ABI
+ * 		**bpf_perf_event_read**\ () in general. The latter has some ABI
  * 		quirks where error and counter value are used as a return code
  * 		(which is wrong to do since ranges may overlap). This issue is
- * 		fixed with bpf_perf_event_read_value(), which at the same time
- * 		provides more features over the **bpf_perf_event_read**\ ()
- * 		interface. Please refer to the description of
+ * 		fixed with **bpf_perf_event_read_value**\ (), which at the same
+ * 		time provides more features over the **bpf_perf_event_read**\
+ * 		() interface. Please refer to the description of
  * 		**bpf_perf_event_read_value**\ () for details.
  * 	Return
  * 		The value of the perf event counter read from the map, or a
@@ -1770,33 +1770,33 @@ union bpf_attr {
  *
  * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags)
  * 	Description
- *		Return a user or a kernel stack in bpf program provided buffer.
- *		To achieve this, the helper needs *ctx*, which is a pointer
- *		to the context on which the tracing program is executed.
- *		To store the stacktrace, the bpf program provides *buf* with
- *		a nonnegative *size*.
- *
- *		The last argument, *flags*, holds the number of stack frames to
- *		skip (from 0 to 255), masked with
- *		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
- *		the following flags:
- *
- *		**BPF_F_USER_STACK**
- *			Collect a user space stack instead of a kernel stack.
- *		**BPF_F_USER_BUILD_ID**
- *			Collect buildid+offset instead of ips for user stack,
- *			only valid if **BPF_F_USER_STACK** is also specified.
- *
- *		**bpf_get_stack**\ () can collect up to
- *		**PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
- *		to sufficient large buffer size. Note that
- *		this limit can be controlled with the **sysctl** program, and
- *		that it should be manually increased in order to profile long
- *		user stacks (such as stacks for Java programs). To do so, use:
- *
- *	::
- *
- *		# sysctl kernel.perf_event_max_stack=<new value>
+ * 		Return a user or a kernel stack in bpf program provided buffer.
+ * 		To achieve this, the helper needs *ctx*, which is a pointer
+ * 		to the context on which the tracing program is executed.
+ * 		To store the stacktrace, the bpf program provides *buf* with
+ * 		a nonnegative *size*.
+ *
+ * 		The last argument, *flags*, holds the number of stack frames to
+ * 		skip (from 0 to 255), masked with
+ * 		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ * 		the following flags:
+ *
+ * 		**BPF_F_USER_STACK**
+ * 			Collect a user space stack instead of a kernel stack.
+ * 		**BPF_F_USER_BUILD_ID**
+ * 			Collect buildid+offset instead of ips for user stack,
+ * 			only valid if **BPF_F_USER_STACK** is also specified.
+ *
+ * 		**bpf_get_stack**\ () can collect up to
+ * 		**PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
+ * 		to sufficient large buffer size. Note that
+ * 		this limit can be controlled with the **sysctl** program, and
+ * 		that it should be manually increased in order to profile long
+ * 		user stacks (such as stacks for Java programs). To do so, use:
+ *
+ * 		::
+ *
+ * 			# sysctl kernel.perf_event_max_stack=<new value>
  *
  * 	Return
  * 		a non-negative value equal to or less than size on success, or
-- 
cgit v1.2.3-59-g8ed1b


From 592a4cebc2bccb23880087a21c0626ab7481626d Mon Sep 17 00:00:00 2001
From: Intiyaz Basha <intiyaz.basha@cavium.com>
Date: Fri, 27 Apr 2018 23:32:39 -0700
Subject: liquidio: Moved common function if_cfg_callback to lio_core.c

Moved common function if_cfg_callback to lio_core.c
and renamed it to lio_if_cfg_callback.

Signed-off-by: Intiyaz Basha <intiyaz.basha@cavium.com>
Acked-by: Derek Chickles <derek.chickles@cavium.com>
Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/cavium/liquidio/lio_core.c    | 32 ++++++++++++++++++++
 drivers/net/ethernet/cavium/liquidio/lio_main.c    | 35 +---------------------
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 34 +--------------------
 .../net/ethernet/cavium/liquidio/octeon_network.h  |  4 +++
 4 files changed, 38 insertions(+), 67 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_core.c b/drivers/net/ethernet/cavium/liquidio/lio_core.c
index 844e288d60fe..e9532e28ffac 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_core.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_core.c
@@ -29,6 +29,38 @@
 /* OOM task polling interval */
 #define LIO_OOM_POLL_INTERVAL_MS 250
 
+/**
+ * \brief Callback for getting interface configuration
+ * @param status status of request
+ * @param buf pointer to resp structure
+ */
+void lio_if_cfg_callback(struct octeon_device *oct,
+			 u32 status __attribute__((unused)), void *buf)
+{
+	struct octeon_soft_command *sc = (struct octeon_soft_command *)buf;
+	struct liquidio_if_cfg_context *ctx;
+	struct liquidio_if_cfg_resp *resp;
+
+	resp = (struct liquidio_if_cfg_resp *)sc->virtrptr;
+	ctx = (struct liquidio_if_cfg_context *)sc->ctxptr;
+
+	oct = lio_get_device(ctx->octeon_id);
+	if (resp->status)
+		dev_err(&oct->pci_dev->dev, "nic if cfg instruction failed. Status: %llx\n",
+			CVM_CAST64(resp->status));
+	WRITE_ONCE(ctx->cond, 1);
+
+	snprintf(oct->fw_info.liquidio_firmware_version, 32, "%s",
+		 resp->cfg_info.liquidio_firmware_version);
+
+	/* This barrier is required to be sure that the response has been
+	 * written fully before waking up the handler
+	 */
+	wmb();
+
+	wake_up_interruptible(&ctx->wc);
+}
+
 int liquidio_set_feature(struct net_device *netdev, int cmd, u16 param1)
 {
 	struct lio *lio = GET_LIO(netdev);
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index fe3edf13c8f0..4ac3c4b8ce49 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -1941,39 +1941,6 @@ static int load_firmware(struct octeon_device *oct)
 	return ret;
 }
 
-/**
- * \brief Callback for getting interface configuration
- * @param status status of request
- * @param buf pointer to resp structure
- */
-static void if_cfg_callback(struct octeon_device *oct,
-			    u32 status __attribute__((unused)),
-			    void *buf)
-{
-	struct octeon_soft_command *sc = (struct octeon_soft_command *)buf;
-	struct liquidio_if_cfg_resp *resp;
-	struct liquidio_if_cfg_context *ctx;
-
-	resp = (struct liquidio_if_cfg_resp *)sc->virtrptr;
-	ctx = (struct liquidio_if_cfg_context *)sc->ctxptr;
-
-	oct = lio_get_device(ctx->octeon_id);
-	if (resp->status)
-		dev_err(&oct->pci_dev->dev, "nic if cfg instruction failed. Status: 0x%llx (0x%08x)\n",
-			CVM_CAST64(resp->status), status);
-	WRITE_ONCE(ctx->cond, 1);
-
-	snprintf(oct->fw_info.liquidio_firmware_version, 32, "%s",
-		 resp->cfg_info.liquidio_firmware_version);
-
-	/* This barrier is required to be sure that the response has been
-	 * written fully before waking up the handler
-	 */
-	wmb();
-
-	wake_up_interruptible(&ctx->wc);
-}
-
 /**
  * \brief Poll routine for checking transmit queue status
  * @param work work_struct data structure
@@ -3576,7 +3543,7 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
 					    OPCODE_NIC_IF_CFG, 0,
 					    if_cfg.u64, 0);
 
-		sc->callback = if_cfg_callback;
+		sc->callback = lio_if_cfg_callback;
 		sc->callback_arg = sc;
 		sc->wait_time = 3000;
 
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index b7b91d15ce3f..1d3661b55a25 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -1058,38 +1058,6 @@ static void free_netsgbuf_with_resp(void *buf)
 	/* Don't free the skb yet */
 }
 
-/**
- * \brief Callback for getting interface configuration
- * @param status status of request
- * @param buf pointer to resp structure
- */
-static void if_cfg_callback(struct octeon_device *oct,
-			    u32 status __attribute__((unused)), void *buf)
-{
-	struct octeon_soft_command *sc = (struct octeon_soft_command *)buf;
-	struct liquidio_if_cfg_context *ctx;
-	struct liquidio_if_cfg_resp *resp;
-
-	resp = (struct liquidio_if_cfg_resp *)sc->virtrptr;
-	ctx = (struct liquidio_if_cfg_context *)sc->ctxptr;
-
-	oct = lio_get_device(ctx->octeon_id);
-	if (resp->status)
-		dev_err(&oct->pci_dev->dev, "nic if cfg instruction failed. Status: %llx\n",
-			CVM_CAST64(resp->status));
-	WRITE_ONCE(ctx->cond, 1);
-
-	snprintf(oct->fw_info.liquidio_firmware_version, 32, "%s",
-		 resp->cfg_info.liquidio_firmware_version);
-
-	/* This barrier is required to be sure that the response has been
-	 * written fully before waking up the handler
-	 */
-	wmb();
-
-	wake_up_interruptible(&ctx->wc);
-}
-
 /**
  * \brief Net device open for LiquidIO
  * @param netdev network device
@@ -2171,7 +2139,7 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
 					    OPCODE_NIC_IF_CFG, 0, if_cfg.u64,
 					    0);
 
-		sc->callback = if_cfg_callback;
+		sc->callback = lio_if_cfg_callback;
 		sc->callback_arg = sc;
 		sc->wait_time = 5000;
 
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_network.h b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
index d090edd1a4a5..11907e928472 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_network.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
@@ -199,6 +199,10 @@ int lio_wait_for_clean_oq(struct octeon_device *oct);
  */
 void liquidio_set_ethtool_ops(struct net_device *netdev);
 
+void lio_if_cfg_callback(struct octeon_device *oct,
+			 u32 status __attribute__((unused)),
+			 void *buf);
+
 /**
  * \brief Net device change_mtu
  * @param netdev network device
-- 
cgit v1.2.3-59-g8ed1b


From 85a0cd81863469484f22a9b69c7f4440989d32b8 Mon Sep 17 00:00:00 2001
From: Intiyaz Basha <intiyaz.basha@cavium.com>
Date: Fri, 27 Apr 2018 23:32:45 -0700
Subject: liquidio: Moved common function list_delete_head to octeon_network.h

Moved common function list_delete_head to octeon_network.h
and renamed it to lio_list_delete_head

Signed-off-by: Intiyaz Basha <intiyaz.basha@cavium.com>
Acked-by: Derek Chickles <derek.chickles@cavium.com>
Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/cavium/liquidio/lio_main.c    | 23 ++------------------
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 25 +++-------------------
 .../net/ethernet/cavium/liquidio/octeon_network.h  | 19 ++++++++++++++++
 3 files changed, 24 insertions(+), 43 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index 4ac3c4b8ce49..c11724504ab6 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -541,25 +541,6 @@ static inline int check_txq_status(struct lio *lio)
 	return ret_val;
 }
 
-/**
- * Remove the node at the head of the list. The list would be empty at
- * the end of this call if there are no more nodes in the list.
- */
-static inline struct list_head *list_delete_head(struct list_head *root)
-{
-	struct list_head *node;
-
-	if ((root->prev == root) && (root->next == root))
-		node = NULL;
-	else
-		node = root->next;
-
-	if (node)
-		list_del(node);
-
-	return node;
-}
-
 /**
  * \brief Delete gather lists
  * @param lio per-network private data
@@ -578,7 +559,7 @@ static void delete_glists(struct lio *lio)
 	for (i = 0; i < lio->linfo.num_txpciq; i++) {
 		do {
 			g = (struct octnic_gather *)
-				list_delete_head(&lio->glist[i]);
+				lio_list_delete_head(&lio->glist[i]);
 			if (g)
 				kfree(g);
 		} while (g);
@@ -2590,7 +2571,7 @@ static int liquidio_xmit(struct sk_buff *skb, struct net_device *netdev)
 
 		spin_lock(&lio->glist_lock[q_idx]);
 		g = (struct octnic_gather *)
-			list_delete_head(&lio->glist[q_idx]);
+			lio_list_delete_head(&lio->glist[q_idx]);
 		spin_unlock(&lio->glist_lock[q_idx]);
 
 		if (!g) {
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index 1d3661b55a25..0d9756177a2a 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -284,25 +284,6 @@ static struct pci_driver liquidio_vf_pci_driver = {
 	.err_handler	= &liquidio_vf_err_handler,    /* For AER */
 };
 
-/**
- * Remove the node at the head of the list. The list would be empty at
- * the end of this call if there are no more nodes in the list.
- */
-static struct list_head *list_delete_head(struct list_head *root)
-{
-	struct list_head *node;
-
-	if ((root->prev == root) && (root->next == root))
-		node = NULL;
-	else
-		node = root->next;
-
-	if (node)
-		list_del(node);
-
-	return node;
-}
-
 /**
  * \brief Delete gather lists
  * @param lio per-network private data
@@ -321,7 +302,7 @@ static void delete_glists(struct lio *lio)
 	for (i = 0; i < lio->linfo.num_txpciq; i++) {
 		do {
 			g = (struct octnic_gather *)
-			    list_delete_head(&lio->glist[i]);
+			    lio_list_delete_head(&lio->glist[i]);
 			kfree(g);
 		} while (g);
 
@@ -1644,8 +1625,8 @@ static int liquidio_xmit(struct sk_buff *skb, struct net_device *netdev)
 		int i, frags;
 
 		spin_lock(&lio->glist_lock[q_idx]);
-		g = (struct octnic_gather *)list_delete_head(
-		    &lio->glist[q_idx]);
+		g = (struct octnic_gather *)
+			lio_list_delete_head(&lio->glist[q_idx]);
 		spin_unlock(&lio->glist_lock[q_idx]);
 
 		if (!g) {
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_network.h b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
index 11907e928472..1d9392bbe5ef 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_network.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
@@ -565,4 +565,23 @@ static inline int skb_iq(struct lio *lio, struct sk_buff *skb)
 	return skb->queue_mapping % lio->linfo.num_txpciq;
 }
 
+/**
+ * Remove the node at the head of the list. The list would be empty at
+ * the end of this call if there are no more nodes in the list.
+ */
+static inline struct list_head *lio_list_delete_head(struct list_head *root)
+{
+	struct list_head *node;
+
+	if (root->prev == root && root->next == root)
+		node = NULL;
+	else
+		node = root->next;
+
+	if (node)
+		list_del(node);
+
+	return node;
+}
+
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From fd311f1e7548cf45a273d46aa9c9c8d8330d803c Mon Sep 17 00:00:00 2001
From: Intiyaz Basha <intiyaz.basha@cavium.com>
Date: Fri, 27 Apr 2018 23:32:49 -0700
Subject: liquidio: Moved common function delete_glists to lio_core.c

Moved common function delete_glists to lio_core.c
and renamed it to lio_delete_glists

Signed-off-by: Intiyaz Basha <intiyaz.basha@cavium.com>
Acked-by: Derek Chickles <derek.chickles@cavium.com>
Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/cavium/liquidio/lio_core.c    | 41 ++++++++++++++++++
 drivers/net/ethernet/cavium/liquidio/lio_main.c    | 50 ++--------------------
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 49 ++-------------------
 .../net/ethernet/cavium/liquidio/octeon_network.h  |  2 +
 4 files changed, 51 insertions(+), 91 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_core.c b/drivers/net/ethernet/cavium/liquidio/lio_core.c
index e9532e28ffac..669b4f2d45e0 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_core.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_core.c
@@ -61,6 +61,47 @@ void lio_if_cfg_callback(struct octeon_device *oct,
 	wake_up_interruptible(&ctx->wc);
 }
 
+/**
+ * \brief Delete gather lists
+ * @param lio per-network private data
+ */
+void lio_delete_glists(struct lio *lio)
+{
+	struct octnic_gather *g;
+	int i;
+
+	kfree(lio->glist_lock);
+	lio->glist_lock = NULL;
+
+	if (!lio->glist)
+		return;
+
+	for (i = 0; i < lio->linfo.num_txpciq; i++) {
+		do {
+			g = (struct octnic_gather *)
+			    lio_list_delete_head(&lio->glist[i]);
+			kfree(g);
+		} while (g);
+
+		if (lio->glists_virt_base && lio->glists_virt_base[i] &&
+		    lio->glists_dma_base && lio->glists_dma_base[i]) {
+			lio_dma_free(lio->oct_dev,
+				     lio->glist_entry_size * lio->tx_qsize,
+				     lio->glists_virt_base[i],
+				     lio->glists_dma_base[i]);
+		}
+	}
+
+	kfree(lio->glists_virt_base);
+	lio->glists_virt_base = NULL;
+
+	kfree(lio->glists_dma_base);
+	lio->glists_dma_base = NULL;
+
+	kfree(lio->glist);
+	lio->glist = NULL;
+}
+
 int liquidio_set_feature(struct net_device *netdev, int cmd, u16 param1)
 {
 	struct lio *lio = GET_LIO(netdev);
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index c11724504ab6..cb5df7c41e92 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -541,48 +541,6 @@ static inline int check_txq_status(struct lio *lio)
 	return ret_val;
 }
 
-/**
- * \brief Delete gather lists
- * @param lio per-network private data
- */
-static void delete_glists(struct lio *lio)
-{
-	struct octnic_gather *g;
-	int i;
-
-	kfree(lio->glist_lock);
-	lio->glist_lock = NULL;
-
-	if (!lio->glist)
-		return;
-
-	for (i = 0; i < lio->linfo.num_txpciq; i++) {
-		do {
-			g = (struct octnic_gather *)
-				lio_list_delete_head(&lio->glist[i]);
-			if (g)
-				kfree(g);
-		} while (g);
-
-		if (lio->glists_virt_base && lio->glists_virt_base[i] &&
-		    lio->glists_dma_base && lio->glists_dma_base[i]) {
-			lio_dma_free(lio->oct_dev,
-				     lio->glist_entry_size * lio->tx_qsize,
-				     lio->glists_virt_base[i],
-				     lio->glists_dma_base[i]);
-		}
-	}
-
-	kfree(lio->glists_virt_base);
-	lio->glists_virt_base = NULL;
-
-	kfree(lio->glists_dma_base);
-	lio->glists_dma_base = NULL;
-
-	kfree(lio->glist);
-	lio->glist = NULL;
-}
-
 /**
  * \brief Setup gather lists
  * @param lio per-network private data
@@ -617,7 +575,7 @@ static int setup_glists(struct octeon_device *oct, struct lio *lio, int num_iqs)
 				       GFP_KERNEL);
 
 	if (!lio->glists_virt_base || !lio->glists_dma_base) {
-		delete_glists(lio);
+		lio_delete_glists(lio);
 		return -ENOMEM;
 	}
 
@@ -634,7 +592,7 @@ static int setup_glists(struct octeon_device *oct, struct lio *lio, int num_iqs)
 				      &lio->glists_dma_base[i]);
 
 		if (!lio->glists_virt_base[i]) {
-			delete_glists(lio);
+			lio_delete_glists(lio);
 			return -ENOMEM;
 		}
 
@@ -656,7 +614,7 @@ static int setup_glists(struct octeon_device *oct, struct lio *lio, int num_iqs)
 		}
 
 		if (j != lio->tx_qsize) {
-			delete_glists(lio);
+			lio_delete_glists(lio);
 			return -ENOMEM;
 		}
 	}
@@ -1452,7 +1410,7 @@ static void liquidio_destroy_nic_device(struct octeon_device *oct, int ifidx)
 
 	cleanup_rx_oom_poll_fn(netdev);
 
-	delete_glists(lio);
+	lio_delete_glists(lio);
 
 	free_netdev(netdev);
 
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index 0d9756177a2a..ab47fbbe3570 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -284,47 +284,6 @@ static struct pci_driver liquidio_vf_pci_driver = {
 	.err_handler	= &liquidio_vf_err_handler,    /* For AER */
 };
 
-/**
- * \brief Delete gather lists
- * @param lio per-network private data
- */
-static void delete_glists(struct lio *lio)
-{
-	struct octnic_gather *g;
-	int i;
-
-	kfree(lio->glist_lock);
-	lio->glist_lock = NULL;
-
-	if (!lio->glist)
-		return;
-
-	for (i = 0; i < lio->linfo.num_txpciq; i++) {
-		do {
-			g = (struct octnic_gather *)
-			    lio_list_delete_head(&lio->glist[i]);
-			kfree(g);
-		} while (g);
-
-		if (lio->glists_virt_base && lio->glists_virt_base[i] &&
-		    lio->glists_dma_base && lio->glists_dma_base[i]) {
-			lio_dma_free(lio->oct_dev,
-				     lio->glist_entry_size * lio->tx_qsize,
-				     lio->glists_virt_base[i],
-				     lio->glists_dma_base[i]);
-		}
-	}
-
-	kfree(lio->glists_virt_base);
-	lio->glists_virt_base = NULL;
-
-	kfree(lio->glists_dma_base);
-	lio->glists_dma_base = NULL;
-
-	kfree(lio->glist);
-	lio->glist = NULL;
-}
-
 /**
  * \brief Setup gather lists
  * @param lio per-network private data
@@ -359,7 +318,7 @@ static int setup_glists(struct lio *lio, int num_iqs)
 				       GFP_KERNEL);
 
 	if (!lio->glists_virt_base || !lio->glists_dma_base) {
-		delete_glists(lio);
+		lio_delete_glists(lio);
 		return -ENOMEM;
 	}
 
@@ -374,7 +333,7 @@ static int setup_glists(struct lio *lio, int num_iqs)
 				      &lio->glists_dma_base[i]);
 
 		if (!lio->glists_virt_base[i]) {
-			delete_glists(lio);
+			lio_delete_glists(lio);
 			return -ENOMEM;
 		}
 
@@ -393,7 +352,7 @@ static int setup_glists(struct lio *lio, int num_iqs)
 		}
 
 		if (j != lio->tx_qsize) {
-			delete_glists(lio);
+			lio_delete_glists(lio);
 			return -ENOMEM;
 		}
 	}
@@ -837,7 +796,7 @@ static void liquidio_destroy_nic_device(struct octeon_device *oct, int ifidx)
 
 	cleanup_link_status_change_wq(netdev);
 
-	delete_glists(lio);
+	lio_delete_glists(lio);
 
 	free_netdev(netdev);
 
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_network.h b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
index 1d9392bbe5ef..777af06a8570 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_network.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
@@ -203,6 +203,8 @@ void lio_if_cfg_callback(struct octeon_device *oct,
 			 u32 status __attribute__((unused)),
 			 void *buf);
 
+void lio_delete_glists(struct lio *lio);
+
 /**
  * \brief Net device change_mtu
  * @param netdev network device
-- 
cgit v1.2.3-59-g8ed1b


From a72b2c8ced35315e4f0fdd6f2c4c12f96bb0dc2e Mon Sep 17 00:00:00 2001
From: Intiyaz Basha <intiyaz.basha@cavium.com>
Date: Fri, 27 Apr 2018 23:32:51 -0700
Subject: liquidio: Moved common definition octnic_gather to octeon_network.h

Moving common definition octnic_gather to octeon_network.h

Signed-off-by: Intiyaz Basha <intiyaz.basha@cavium.com>
Acked-by: Derek Chickles <derek.chickles@cavium.com>
Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/cavium/liquidio/lio_main.c     | 21 ---------------------
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c  | 18 ------------------
 .../net/ethernet/cavium/liquidio/octeon_network.h   | 21 +++++++++++++++++++++
 3 files changed, 21 insertions(+), 39 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index cb5df7c41e92..00d5634c7668 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -144,27 +144,6 @@ union tx_info {
 #define OCTNIC_GSO_MAX_SIZE                                                    \
 	(CN23XX_DEFAULT_INPUT_JABBER - OCTNIC_GSO_MAX_HEADER_SIZE)
 
-/** Structure of a node in list of gather components maintained by
- * NIC driver for each network device.
- */
-struct octnic_gather {
-	/** List manipulation. Next and prev pointers. */
-	struct list_head list;
-
-	/** Size of the gather component at sg in bytes. */
-	int sg_size;
-
-	/** Number of bytes that sg was adjusted to make it 8B-aligned. */
-	int adjust;
-
-	/** Gather component that can accommodate max sized fragment list
-	 *  received from the IP layer.
-	 */
-	struct octeon_sg_entry *sg;
-
-	dma_addr_t sg_dma_ptr;
-};
-
 struct handshake {
 	struct completion init;
 	struct completion started;
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index ab47fbbe3570..c4ba7ee9d4f0 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -75,24 +75,6 @@ union tx_info {
 #define OCTNIC_GSO_MAX_SIZE \
 		(CN23XX_DEFAULT_INPUT_JABBER - OCTNIC_GSO_MAX_HEADER_SIZE)
 
-struct octnic_gather {
-	/* List manipulation. Next and prev pointers. */
-	struct list_head list;
-
-	/* Size of the gather component at sg in bytes. */
-	int sg_size;
-
-	/* Number of bytes that sg was adjusted to make it 8B-aligned. */
-	int adjust;
-
-	/* Gather component that can accommodate max sized fragment list
-	 * received from the IP layer.
-	 */
-	struct octeon_sg_entry *sg;
-
-	dma_addr_t sg_dma_ptr;
-};
-
 static int
 liquidio_vf_probe(struct pci_dev *pdev, const struct pci_device_id *ent);
 static void liquidio_vf_remove(struct pci_dev *pdev);
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_network.h b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
index 777af06a8570..caf25b88cb53 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_network.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
@@ -47,6 +47,27 @@ struct liquidio_if_cfg_resp {
 	u64 status;
 };
 
+/* Structure of a node in list of gather components maintained by
+ * NIC driver for each network device.
+ */
+struct octnic_gather {
+	/* List manipulation. Next and prev pointers. */
+	struct list_head list;
+
+	/* Size of the gather component at sg in bytes. */
+	int sg_size;
+
+	/* Number of bytes that sg was adjusted to make it 8B-aligned. */
+	int adjust;
+
+	/* Gather component that can accommodate max sized fragment list
+	 * received from the IP layer.
+	 */
+	struct octeon_sg_entry *sg;
+
+	dma_addr_t sg_dma_ptr;
+};
+
 struct oct_nic_stats_resp {
 	u64     rh;
 	struct oct_link_stats stats;
-- 
cgit v1.2.3-59-g8ed1b


From 128ea39439341d4f60bda1740a59ce34bcc19e4c Mon Sep 17 00:00:00 2001
From: Intiyaz Basha <intiyaz.basha@cavium.com>
Date: Fri, 27 Apr 2018 23:32:55 -0700
Subject: liquidio: Moved common function setup_glists to lio_core.c

Moved common function setup_glists to lio_core.c
and reamed it to lio_setup_glists

Signed-off-by: Intiyaz Basha <intiyaz.basha@cavium.com>
Acked-by: Derek Chickles <derek.chickles@cavium.com>
Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/cavium/liquidio/lio_core.c    | 83 +++++++++++++++++++++
 drivers/net/ethernet/cavium/liquidio/lio_main.c    | 85 +---------------------
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 80 +-------------------
 .../net/ethernet/cavium/liquidio/octeon_network.h  |  2 +
 4 files changed, 87 insertions(+), 163 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_core.c b/drivers/net/ethernet/cavium/liquidio/lio_core.c
index 669b4f2d45e0..b805d54d8e00 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_core.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_core.c
@@ -29,6 +29,8 @@
 /* OOM task polling interval */
 #define LIO_OOM_POLL_INTERVAL_MS 250
 
+#define OCTNIC_MAX_SG  MAX_SKB_FRAGS
+
 /**
  * \brief Callback for getting interface configuration
  * @param status status of request
@@ -102,6 +104,87 @@ void lio_delete_glists(struct lio *lio)
 	lio->glist = NULL;
 }
 
+/**
+ * \brief Setup gather lists
+ * @param lio per-network private data
+ */
+int lio_setup_glists(struct octeon_device *oct, struct lio *lio, int num_iqs)
+{
+	struct octnic_gather *g;
+	int i, j;
+
+	lio->glist_lock =
+	    kcalloc(num_iqs, sizeof(*lio->glist_lock), GFP_KERNEL);
+	if (!lio->glist_lock)
+		return -ENOMEM;
+
+	lio->glist =
+	    kcalloc(num_iqs, sizeof(*lio->glist), GFP_KERNEL);
+	if (!lio->glist) {
+		kfree(lio->glist_lock);
+		lio->glist_lock = NULL;
+		return -ENOMEM;
+	}
+
+	lio->glist_entry_size =
+		ROUNDUP8((ROUNDUP4(OCTNIC_MAX_SG) >> 2) * OCT_SG_ENTRY_SIZE);
+
+	/* allocate memory to store virtual and dma base address of
+	 * per glist consistent memory
+	 */
+	lio->glists_virt_base = kcalloc(num_iqs, sizeof(*lio->glists_virt_base),
+					GFP_KERNEL);
+	lio->glists_dma_base = kcalloc(num_iqs, sizeof(*lio->glists_dma_base),
+				       GFP_KERNEL);
+
+	if (!lio->glists_virt_base || !lio->glists_dma_base) {
+		lio_delete_glists(lio);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < num_iqs; i++) {
+		int numa_node = dev_to_node(&oct->pci_dev->dev);
+
+		spin_lock_init(&lio->glist_lock[i]);
+
+		INIT_LIST_HEAD(&lio->glist[i]);
+
+		lio->glists_virt_base[i] =
+			lio_dma_alloc(oct,
+				      lio->glist_entry_size * lio->tx_qsize,
+				      &lio->glists_dma_base[i]);
+
+		if (!lio->glists_virt_base[i]) {
+			lio_delete_glists(lio);
+			return -ENOMEM;
+		}
+
+		for (j = 0; j < lio->tx_qsize; j++) {
+			g = kzalloc_node(sizeof(*g), GFP_KERNEL,
+					 numa_node);
+			if (!g)
+				g = kzalloc(sizeof(*g), GFP_KERNEL);
+			if (!g)
+				break;
+
+			g->sg = lio->glists_virt_base[i] +
+				(j * lio->glist_entry_size);
+
+			g->sg_dma_ptr = lio->glists_dma_base[i] +
+					(j * lio->glist_entry_size);
+
+			list_add_tail(&g->list, &lio->glist[i]);
+		}
+
+		if (j != lio->tx_qsize) {
+			lio_delete_glists(lio);
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
 int liquidio_set_feature(struct net_device *netdev, int cmd, u16 param1)
 {
 	struct lio *lio = GET_LIO(netdev);
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index 00d5634c7668..4b8e29f4e621 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -138,8 +138,6 @@ union tx_info {
  * by this structure in the NIC module.
  */
 
-#define OCTNIC_MAX_SG  (MAX_SKB_FRAGS)
-
 #define OCTNIC_GSO_MAX_HEADER_SIZE 128
 #define OCTNIC_GSO_MAX_SIZE                                                    \
 	(CN23XX_DEFAULT_INPUT_JABBER - OCTNIC_GSO_MAX_HEADER_SIZE)
@@ -520,87 +518,6 @@ static inline int check_txq_status(struct lio *lio)
 	return ret_val;
 }
 
-/**
- * \brief Setup gather lists
- * @param lio per-network private data
- */
-static int setup_glists(struct octeon_device *oct, struct lio *lio, int num_iqs)
-{
-	int i, j;
-	struct octnic_gather *g;
-
-	lio->glist_lock = kcalloc(num_iqs, sizeof(*lio->glist_lock),
-				  GFP_KERNEL);
-	if (!lio->glist_lock)
-		return -ENOMEM;
-
-	lio->glist = kcalloc(num_iqs, sizeof(*lio->glist),
-			     GFP_KERNEL);
-	if (!lio->glist) {
-		kfree(lio->glist_lock);
-		lio->glist_lock = NULL;
-		return -ENOMEM;
-	}
-
-	lio->glist_entry_size =
-		ROUNDUP8((ROUNDUP4(OCTNIC_MAX_SG) >> 2) * OCT_SG_ENTRY_SIZE);
-
-	/* allocate memory to store virtual and dma base address of
-	 * per glist consistent memory
-	 */
-	lio->glists_virt_base = kcalloc(num_iqs, sizeof(*lio->glists_virt_base),
-					GFP_KERNEL);
-	lio->glists_dma_base = kcalloc(num_iqs, sizeof(*lio->glists_dma_base),
-				       GFP_KERNEL);
-
-	if (!lio->glists_virt_base || !lio->glists_dma_base) {
-		lio_delete_glists(lio);
-		return -ENOMEM;
-	}
-
-	for (i = 0; i < num_iqs; i++) {
-		int numa_node = dev_to_node(&oct->pci_dev->dev);
-
-		spin_lock_init(&lio->glist_lock[i]);
-
-		INIT_LIST_HEAD(&lio->glist[i]);
-
-		lio->glists_virt_base[i] =
-			lio_dma_alloc(oct,
-				      lio->glist_entry_size * lio->tx_qsize,
-				      &lio->glists_dma_base[i]);
-
-		if (!lio->glists_virt_base[i]) {
-			lio_delete_glists(lio);
-			return -ENOMEM;
-		}
-
-		for (j = 0; j < lio->tx_qsize; j++) {
-			g = kzalloc_node(sizeof(*g), GFP_KERNEL,
-					 numa_node);
-			if (!g)
-				g = kzalloc(sizeof(*g), GFP_KERNEL);
-			if (!g)
-				break;
-
-			g->sg = lio->glists_virt_base[i] +
-				(j * lio->glist_entry_size);
-
-			g->sg_dma_ptr = lio->glists_dma_base[i] +
-					(j * lio->glist_entry_size);
-
-			list_add_tail(&g->list, &lio->glist[i]);
-		}
-
-		if (j != lio->tx_qsize) {
-			lio_delete_glists(lio);
-			return -ENOMEM;
-		}
-	}
-
-	return 0;
-}
-
 /**
  * \brief Print link information
  * @param netdev network device
@@ -3657,7 +3574,7 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
 		lio->tx_qsize = octeon_get_tx_qsize(octeon_dev, lio->txq);
 		lio->rx_qsize = octeon_get_rx_qsize(octeon_dev, lio->rxq);
 
-		if (setup_glists(octeon_dev, lio, num_iqueues)) {
+		if (lio_setup_glists(octeon_dev, lio, num_iqueues)) {
 			dev_err(&octeon_dev->pci_dev->dev,
 				"Gather list allocation failed\n");
 			goto setup_nic_dev_fail;
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index c4ba7ee9d4f0..be28b8f1f67d 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -69,8 +69,6 @@ union tx_info {
 	} s;
 };
 
-#define OCTNIC_MAX_SG  (MAX_SKB_FRAGS)
-
 #define OCTNIC_GSO_MAX_HEADER_SIZE 128
 #define OCTNIC_GSO_MAX_SIZE \
 		(CN23XX_DEFAULT_INPUT_JABBER - OCTNIC_GSO_MAX_HEADER_SIZE)
@@ -266,82 +264,6 @@ static struct pci_driver liquidio_vf_pci_driver = {
 	.err_handler	= &liquidio_vf_err_handler,    /* For AER */
 };
 
-/**
- * \brief Setup gather lists
- * @param lio per-network private data
- */
-static int setup_glists(struct lio *lio, int num_iqs)
-{
-	struct octnic_gather *g;
-	int i, j;
-
-	lio->glist_lock =
-	    kzalloc(sizeof(*lio->glist_lock) * num_iqs, GFP_KERNEL);
-	if (!lio->glist_lock)
-		return -ENOMEM;
-
-	lio->glist =
-	    kzalloc(sizeof(*lio->glist) * num_iqs, GFP_KERNEL);
-	if (!lio->glist) {
-		kfree(lio->glist_lock);
-		lio->glist_lock = NULL;
-		return -ENOMEM;
-	}
-
-	lio->glist_entry_size =
-		ROUNDUP8((ROUNDUP4(OCTNIC_MAX_SG) >> 2) * OCT_SG_ENTRY_SIZE);
-
-	/* allocate memory to store virtual and dma base address of
-	 * per glist consistent memory
-	 */
-	lio->glists_virt_base = kcalloc(num_iqs, sizeof(*lio->glists_virt_base),
-					GFP_KERNEL);
-	lio->glists_dma_base = kcalloc(num_iqs, sizeof(*lio->glists_dma_base),
-				       GFP_KERNEL);
-
-	if (!lio->glists_virt_base || !lio->glists_dma_base) {
-		lio_delete_glists(lio);
-		return -ENOMEM;
-	}
-
-	for (i = 0; i < num_iqs; i++) {
-		spin_lock_init(&lio->glist_lock[i]);
-
-		INIT_LIST_HEAD(&lio->glist[i]);
-
-		lio->glists_virt_base[i] =
-			lio_dma_alloc(lio->oct_dev,
-				      lio->glist_entry_size * lio->tx_qsize,
-				      &lio->glists_dma_base[i]);
-
-		if (!lio->glists_virt_base[i]) {
-			lio_delete_glists(lio);
-			return -ENOMEM;
-		}
-
-		for (j = 0; j < lio->tx_qsize; j++) {
-			g = kzalloc(sizeof(*g), GFP_KERNEL);
-			if (!g)
-				break;
-
-			g->sg = lio->glists_virt_base[i] +
-				(j * lio->glist_entry_size);
-
-			g->sg_dma_ptr = lio->glists_dma_base[i] +
-					(j * lio->glist_entry_size);
-
-			list_add_tail(&g->list, &lio->glist[i]);
-		}
-
-		if (j != lio->tx_qsize) {
-			lio_delete_glists(lio);
-			return -ENOMEM;
-		}
-	}
-
-	return 0;
-}
-
 /**
  * \brief Print link information
  * @param netdev network device
@@ -2226,7 +2148,7 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
 		lio->tx_qsize = octeon_get_tx_qsize(octeon_dev, lio->txq);
 		lio->rx_qsize = octeon_get_rx_qsize(octeon_dev, lio->rxq);
 
-		if (setup_glists(lio, num_iqueues)) {
+		if (lio_setup_glists(octeon_dev, lio, num_iqueues)) {
 			dev_err(&octeon_dev->pci_dev->dev,
 				"Gather list allocation failed\n");
 			goto setup_nic_dev_fail;
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_network.h b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
index caf25b88cb53..f8c1091639ae 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_network.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
@@ -226,6 +226,8 @@ void lio_if_cfg_callback(struct octeon_device *oct,
 
 void lio_delete_glists(struct lio *lio);
 
+int lio_setup_glists(struct octeon_device *oct, struct lio *lio, int num_qs);
+
 /**
  * \brief Net device change_mtu
  * @param netdev network device
-- 
cgit v1.2.3-59-g8ed1b


From c33c997346c34ea7b89aec99524ad9632a2f1e0c Mon Sep 17 00:00:00 2001
From: Intiyaz Basha <intiyaz.basha@cavium.com>
Date: Fri, 27 Apr 2018 23:32:57 -0700
Subject: liquidio: enhanced ethtool --set-channels feature

Enhancing driver to accept max supported queues for ethtool --set-channels

Signed-off-by: Intiyaz Basha <intiyaz.basha@cavium.com>
Acked-by: Derek Chickles <derek.chickles@cavium.com>
Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../ethernet/cavium/liquidio/cn23xx_pf_device.c    |   6 +-
 .../ethernet/cavium/liquidio/cn23xx_pf_device.h    |   2 +
 drivers/net/ethernet/cavium/liquidio/lio_core.c    |   4 +-
 drivers/net/ethernet/cavium/liquidio/lio_ethtool.c | 263 +++++++++++++++++++--
 drivers/net/ethernet/cavium/liquidio/lio_main.c    |  64 +++--
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c |   8 +-
 .../net/ethernet/cavium/liquidio/liquidio_common.h |   1 +
 .../net/ethernet/cavium/liquidio/octeon_device.c   |  12 +-
 .../net/ethernet/cavium/liquidio/octeon_device.h   |   2 +-
 .../net/ethernet/cavium/liquidio/octeon_network.h  |  12 +-
 10 files changed, 316 insertions(+), 58 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c
index bc9861c90ea3..929d485a3a2f 100644
--- a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c
+++ b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c
@@ -1245,7 +1245,7 @@ static void cn23xx_setup_reg_address(struct octeon_device *oct)
 	    CN23XX_SLI_MAC_PF_INT_ENB64(oct->pcie_port, oct->pf_num);
 }
 
-static int cn23xx_sriov_config(struct octeon_device *oct)
+int cn23xx_sriov_config(struct octeon_device *oct)
 {
 	struct octeon_cn23xx_pf *cn23xx = (struct octeon_cn23xx_pf *)oct->chip;
 	u32 max_rings, total_rings, max_vfs, rings_per_vf;
@@ -1269,8 +1269,8 @@ static int cn23xx_sriov_config(struct octeon_device *oct)
 		break;
 	}
 
-	if (max_rings <= num_present_cpus())
-		num_pf_rings = 1;
+	if (oct->sriov_info.num_pf_rings)
+		num_pf_rings = oct->sriov_info.num_pf_rings;
 	else
 		num_pf_rings = num_present_cpus();
 
diff --git a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.h b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.h
index 63b3de4f2bfe..e6f31d0d5c0b 100644
--- a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.h
+++ b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.h
@@ -61,6 +61,8 @@ u32 cn23xx_pf_get_oq_ticks(struct octeon_device *oct, u32 time_intr_in_us);
 
 void cn23xx_dump_pf_initialized_regs(struct octeon_device *oct);
 
+int cn23xx_sriov_config(struct octeon_device *oct);
+
 int cn23xx_fw_loaded(struct octeon_device *oct);
 
 void cn23xx_tell_vf_its_macaddr_changed(struct octeon_device *oct, int vfidx,
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_core.c b/drivers/net/ethernet/cavium/liquidio/lio_core.c
index b805d54d8e00..6821afcdc365 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_core.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_core.c
@@ -78,7 +78,7 @@ void lio_delete_glists(struct lio *lio)
 	if (!lio->glist)
 		return;
 
-	for (i = 0; i < lio->linfo.num_txpciq; i++) {
+	for (i = 0; i < lio->oct_dev->num_iqs; i++) {
 		do {
 			g = (struct octnic_gather *)
 			    lio_list_delete_head(&lio->glist[i]);
@@ -1036,8 +1036,8 @@ int octeon_setup_interrupt(struct octeon_device *oct, u32 num_ioqs)
 	int num_ioq_vectors;
 	int irqret, err;
 
-	oct->num_msix_irqs = num_ioqs;
 	if (oct->msix_on) {
+		oct->num_msix_irqs = num_ioqs;
 		if (OCTEON_CN23XX_PF(oct)) {
 			num_interrupts = MAX_IOQ_INTERRUPTS_PER_PF + 1;
 
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
index 64c817a63a01..fff60ca20c35 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
@@ -361,7 +361,14 @@ lio_ethtool_get_channels(struct net_device *dev,
 		rx_count = CFG_GET_NUM_RXQS_NIC_IF(conf6x, lio->ifidx);
 		tx_count = CFG_GET_NUM_TXQS_NIC_IF(conf6x, lio->ifidx);
 	} else if (OCTEON_CN23XX_PF(oct)) {
-		max_combined = lio->linfo.num_txpciq;
+		if (oct->sriov_info.sriov_enabled) {
+			max_combined = lio->linfo.num_txpciq;
+		} else {
+			struct octeon_config *conf23_pf =
+				CHIP_CONF(oct, cn23xx_pf);
+
+			max_combined = CFG_GET_IQ_MAX_Q(conf23_pf);
+		}
 		combined_count = oct->num_iqs;
 	} else if (OCTEON_CN23XX_VF(oct)) {
 		u64 reg_val = 0ULL;
@@ -425,9 +432,15 @@ lio_irq_reallocate_irqs(struct octeon_device *oct, uint32_t num_ioqs)
 
 	kfree(oct->irq_name_storage);
 	oct->irq_name_storage = NULL;
+
+	if (octeon_allocate_ioq_vector(oct, num_ioqs)) {
+		dev_err(&oct->pci_dev->dev, "OCTEON: ioq vector allocation failed\n");
+		return -1;
+	}
+
 	if (octeon_setup_interrupt(oct, num_ioqs)) {
 		dev_info(&oct->pci_dev->dev, "Setup interrupt failed\n");
-		return 1;
+		return -1;
 	}
 
 	/* Enable Octeon device interrupts */
@@ -457,7 +470,16 @@ lio_ethtool_set_channels(struct net_device *dev,
 	combined_count = channel->combined_count;
 
 	if (OCTEON_CN23XX_PF(oct)) {
-		max_combined = channel->max_combined;
+		if (oct->sriov_info.sriov_enabled) {
+			max_combined = lio->linfo.num_txpciq;
+		} else {
+			struct octeon_config *conf23_pf =
+				CHIP_CONF(oct,
+					  cn23xx_pf);
+
+			max_combined =
+				CFG_GET_IQ_MAX_Q(conf23_pf);
+		}
 	} else if (OCTEON_CN23XX_VF(oct)) {
 		u64 reg_val = 0ULL;
 		u64 ctrl = CN23XX_VF_SLI_IQ_PKT_CONTROL64(0);
@@ -485,7 +507,6 @@ lio_ethtool_set_channels(struct net_device *dev,
 	if (lio_reset_queues(dev, combined_count))
 		return -EINVAL;
 
-	lio_irq_reallocate_irqs(oct, combined_count);
 	if (stopped)
 		dev->netdev_ops->ndo_open(dev);
 
@@ -824,12 +845,120 @@ lio_ethtool_get_ringparam(struct net_device *netdev,
 	ering->rx_jumbo_max_pending = 0;
 }
 
+static int lio_23xx_reconfigure_queue_count(struct lio *lio)
+{
+	struct octeon_device *oct = lio->oct_dev;
+	struct liquidio_if_cfg_context *ctx;
+	u32 resp_size, ctx_size, data_size;
+	struct liquidio_if_cfg_resp *resp;
+	struct octeon_soft_command *sc;
+	union oct_nic_if_cfg if_cfg;
+	struct lio_version *vdata;
+	u32 ifidx_or_pfnum;
+	int retval;
+	int j;
+
+	resp_size = sizeof(struct liquidio_if_cfg_resp);
+	ctx_size = sizeof(struct liquidio_if_cfg_context);
+	data_size = sizeof(struct lio_version);
+	sc = (struct octeon_soft_command *)
+		octeon_alloc_soft_command(oct, data_size,
+					  resp_size, ctx_size);
+	if (!sc) {
+		dev_err(&oct->pci_dev->dev, "%s: Failed to allocate soft command\n",
+			__func__);
+		return -1;
+	}
+
+	resp = (struct liquidio_if_cfg_resp *)sc->virtrptr;
+	ctx  = (struct liquidio_if_cfg_context *)sc->ctxptr;
+	vdata = (struct lio_version *)sc->virtdptr;
+
+	vdata->major = (__force u16)cpu_to_be16(LIQUIDIO_BASE_MAJOR_VERSION);
+	vdata->minor = (__force u16)cpu_to_be16(LIQUIDIO_BASE_MINOR_VERSION);
+	vdata->micro = (__force u16)cpu_to_be16(LIQUIDIO_BASE_MICRO_VERSION);
+
+	ifidx_or_pfnum = oct->pf_num;
+	WRITE_ONCE(ctx->cond, 0);
+	ctx->octeon_id = lio_get_device_id(oct);
+	init_waitqueue_head(&ctx->wc);
+
+	if_cfg.u64 = 0;
+	if_cfg.s.num_iqueues = oct->sriov_info.num_pf_rings;
+	if_cfg.s.num_oqueues = oct->sriov_info.num_pf_rings;
+	if_cfg.s.base_queue = oct->sriov_info.pf_srn;
+	if_cfg.s.gmx_port_id = oct->pf_num;
+
+	sc->iq_no = 0;
+	octeon_prepare_soft_command(oct, sc, OPCODE_NIC,
+				    OPCODE_NIC_QCOUNT_UPDATE, 0,
+				    if_cfg.u64, 0);
+	sc->callback = lio_if_cfg_callback;
+	sc->callback_arg = sc;
+	sc->wait_time = LIO_IFCFG_WAIT_TIME;
+
+	retval = octeon_send_soft_command(oct, sc);
+	if (retval == IQ_SEND_FAILED) {
+		dev_err(&oct->pci_dev->dev,
+			"iq/oq config failed status: %x\n",
+			retval);
+		goto qcount_update_fail;
+	}
+
+	if (sleep_cond(&ctx->wc, &ctx->cond) == -EINTR) {
+		dev_err(&oct->pci_dev->dev, "Wait interrupted\n");
+		return -1;
+	}
+
+	retval = resp->status;
+	if (retval) {
+		dev_err(&oct->pci_dev->dev, "iq/oq config failed\n");
+		goto qcount_update_fail;
+	}
+
+	octeon_swap_8B_data((u64 *)(&resp->cfg_info),
+			    (sizeof(struct liquidio_if_cfg_info)) >> 3);
+
+	lio->ifidx = ifidx_or_pfnum;
+	lio->linfo.num_rxpciq = hweight64(resp->cfg_info.iqmask);
+	lio->linfo.num_txpciq = hweight64(resp->cfg_info.iqmask);
+	for (j = 0; j < lio->linfo.num_rxpciq; j++) {
+		lio->linfo.rxpciq[j].u64 =
+			resp->cfg_info.linfo.rxpciq[j].u64;
+	}
+
+	for (j = 0; j < lio->linfo.num_txpciq; j++) {
+		lio->linfo.txpciq[j].u64 =
+			resp->cfg_info.linfo.txpciq[j].u64;
+	}
+
+	lio->linfo.hw_addr = resp->cfg_info.linfo.hw_addr;
+	lio->linfo.gmxport = resp->cfg_info.linfo.gmxport;
+	lio->linfo.link.u64 = resp->cfg_info.linfo.link.u64;
+	lio->txq = lio->linfo.txpciq[0].s.q_no;
+	lio->rxq = lio->linfo.rxpciq[0].s.q_no;
+
+	octeon_free_soft_command(oct, sc);
+	dev_info(&oct->pci_dev->dev, "Queue count updated to %d\n",
+		 lio->linfo.num_rxpciq);
+
+	return 0;
+
+qcount_update_fail:
+	octeon_free_soft_command(oct, sc);
+
+	return -1;
+}
+
 static int lio_reset_queues(struct net_device *netdev, uint32_t num_qs)
 {
 	struct lio *lio = GET_LIO(netdev);
 	struct octeon_device *oct = lio->oct_dev;
+	int i, queue_count_update = 0;
 	struct napi_struct *napi, *n;
-	int i, update = 0;
+	int ret;
+
+	schedule_timeout_uninterruptible(msecs_to_jiffies(100));
 
 	if (wait_for_pending_requests(oct))
 		dev_err(&oct->pci_dev->dev, "There were pending requests\n");
@@ -838,7 +967,7 @@ static int lio_reset_queues(struct net_device *netdev, uint32_t num_qs)
 		dev_err(&oct->pci_dev->dev, "IQ had pending instructions\n");
 
 	if (octeon_set_io_queues_off(oct)) {
-		dev_err(&oct->pci_dev->dev, "setting io queues off failed\n");
+		dev_err(&oct->pci_dev->dev, "Setting io queues off failed\n");
 		return -1;
 	}
 
@@ -851,9 +980,40 @@ static int lio_reset_queues(struct net_device *netdev, uint32_t num_qs)
 		netif_napi_del(napi);
 
 	if (num_qs != oct->num_iqs) {
-		netif_set_real_num_rx_queues(netdev, num_qs);
-		netif_set_real_num_tx_queues(netdev, num_qs);
-		update = 1;
+		ret = netif_set_real_num_rx_queues(netdev, num_qs);
+		if (ret) {
+			dev_err(&oct->pci_dev->dev,
+				"Setting real number rx failed\n");
+			return ret;
+		}
+
+		ret = netif_set_real_num_tx_queues(netdev, num_qs);
+		if (ret) {
+			dev_err(&oct->pci_dev->dev,
+				"Setting real number tx failed\n");
+			return ret;
+		}
+
+		/* The value of queue_count_update decides whether it is the
+		 * queue count or the descriptor count that is being
+		 * re-configured.
+		 */
+		queue_count_update = 1;
+	}
+
+	/* Re-configuration of queues can happen in two scenarios, SRIOV enabled
+	 * and SRIOV disabled. Few things like recreating queue zero, resetting
+	 * glists and IRQs are required for both. For the latter, some more
+	 * steps like updating sriov_info for the octeon device need to be done.
+	 */
+	if (queue_count_update) {
+		lio_delete_glists(lio);
+
+		/* Delete mbox for PF which is SRIOV disabled because sriov_info
+		 * will be now changed.
+		 */
+		if ((OCTEON_CN23XX_PF(oct)) && !oct->sriov_info.sriov_enabled)
+			oct->fn_list.free_mbox(oct);
 	}
 
 	for (i = 0; i < MAX_OCTEON_OUTPUT_QUEUES(oct); i++) {
@@ -868,24 +1028,91 @@ static int lio_reset_queues(struct net_device *netdev, uint32_t num_qs)
 		octeon_delete_instr_queue(oct, i);
 	}
 
+	if (queue_count_update) {
+		/* For PF re-configure sriov related information */
+		if ((OCTEON_CN23XX_PF(oct)) &&
+		    !oct->sriov_info.sriov_enabled) {
+			oct->sriov_info.num_pf_rings = num_qs;
+			if (cn23xx_sriov_config(oct)) {
+				dev_err(&oct->pci_dev->dev,
+					"Queue reset aborted: SRIOV config failed\n");
+				return -1;
+			}
+
+			num_qs = oct->sriov_info.num_pf_rings;
+		}
+	}
+
 	if (oct->fn_list.setup_device_regs(oct)) {
 		dev_err(&oct->pci_dev->dev, "Failed to configure device registers\n");
 		return -1;
 	}
 
-	if (liquidio_setup_io_queues(oct, 0, num_qs, num_qs)) {
-		dev_err(&oct->pci_dev->dev, "IO queues initialization failed\n");
-		return -1;
+	/* The following are needed in case of queue count re-configuration and
+	 * not for descriptor count re-configuration.
+	 */
+	if (queue_count_update) {
+		if (octeon_setup_instr_queues(oct))
+			return -1;
+
+		if (octeon_setup_output_queues(oct))
+			return -1;
+
+		/* Recreating mbox for PF that is SRIOV disabled */
+		if (OCTEON_CN23XX_PF(oct) && !oct->sriov_info.sriov_enabled) {
+			if (oct->fn_list.setup_mbox(oct)) {
+				dev_err(&oct->pci_dev->dev, "Mailbox setup failed\n");
+				return -1;
+			}
+		}
+
+		/* Deleting and recreating IRQs whether the interface is SRIOV
+		 * enabled or disabled.
+		 */
+		if (lio_irq_reallocate_irqs(oct, num_qs)) {
+			dev_err(&oct->pci_dev->dev, "IRQs could not be allocated\n");
+			return -1;
+		}
+
+		/* Enable the input and output queues for this Octeon device */
+		if (oct->fn_list.enable_io_queues(oct)) {
+			dev_err(&oct->pci_dev->dev, "Failed to enable input/output queues\n");
+			return -1;
+		}
+
+		for (i = 0; i < oct->num_oqs; i++)
+			writel(oct->droq[i]->max_count,
+			       oct->droq[i]->pkts_credit_reg);
+
+		/* Informing firmware about the new queue count. It is required
+		 * for firmware to allocate more number of queues than those at
+		 * load time.
+		 */
+		if (OCTEON_CN23XX_PF(oct) && !oct->sriov_info.sriov_enabled) {
+			if (lio_23xx_reconfigure_queue_count(lio))
+				return -1;
+		}
 	}
 
-	/* Enable the input and output queues for this Octeon device */
-	if (oct->fn_list.enable_io_queues(oct)) {
-		dev_err(&oct->pci_dev->dev, "Failed to enable input/output queues");
+	/* Once firmware is aware of the new value, queues can be recreated */
+	if (liquidio_setup_io_queues(oct, 0, num_qs, num_qs)) {
+		dev_err(&oct->pci_dev->dev, "I/O queues creation failed\n");
 		return -1;
 	}
 
-	if (update && lio_send_queue_count_update(netdev, num_qs))
-		return -1;
+	if (queue_count_update) {
+		if (lio_setup_glists(oct, lio, num_qs)) {
+			dev_err(&oct->pci_dev->dev, "Gather list allocation failed\n");
+			return -1;
+		}
+
+		/* Send firmware the information about new number of queues
+		 * if the interface is a VF or a PF that is SRIOV enabled.
+		 */
+		if (oct->sriov_info.sriov_enabled || OCTEON_CN23XX_VF(oct))
+			if (lio_send_queue_count_update(netdev, num_qs))
+				return -1;
+	}
 
 	return 0;
 }
@@ -930,7 +1157,7 @@ static int lio_ethtool_set_ringparam(struct net_device *netdev,
 		CFG_SET_NUM_RX_DESCS_NIC_IF(octeon_get_conf(oct), lio->ifidx,
 					    rx_count);
 
-	if (lio_reset_queues(netdev, lio->linfo.num_txpciq))
+	if (lio_reset_queues(netdev, oct->num_iqs))
 		goto err_lio_reset_queues;
 
 	if (stopped)
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index 4b8e29f4e621..ee75048b3937 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -497,7 +497,7 @@ static void liquidio_deinit_pci(void)
  */
 static inline int check_txq_status(struct lio *lio)
 {
-	int numqs = lio->netdev->num_tx_queues;
+	int numqs = lio->netdev->real_num_tx_queues;
 	int ret_val = 0;
 	int q, iq;
 
@@ -1521,7 +1521,7 @@ static void free_netsgbuf(void *buf)
 		i++;
 	}
 
-	iq = skb_iq(lio, skb);
+	iq = skb_iq(lio->oct_dev, skb);
 	spin_lock(&lio->glist_lock[iq]);
 	list_add_tail(&g->list, &lio->glist[iq]);
 	spin_unlock(&lio->glist_lock[iq]);
@@ -1564,7 +1564,7 @@ static void free_netsgbuf_with_resp(void *buf)
 		i++;
 	}
 
-	iq = skb_iq(lio, skb);
+	iq = skb_iq(lio->oct_dev, skb);
 
 	spin_lock(&lio->glist_lock[iq]);
 	list_add_tail(&g->list, &lio->glist[iq]);
@@ -1851,11 +1851,6 @@ static int liquidio_open(struct net_device *netdev)
 
 	ifstate_set(lio, LIO_IFSTATE_RUNNING);
 
-	/* Ready for link status updates */
-	lio->intf_open = 1;
-
-	netif_info(lio, ifup, lio->netdev, "Interface Open, ready for traffic\n");
-
 	if (OCTEON_CN23XX_PF(oct)) {
 		if (!oct->msix_on)
 			if (setup_tx_poll_fn(netdev))
@@ -1865,7 +1860,12 @@ static int liquidio_open(struct net_device *netdev)
 			return -1;
 	}
 
-	start_txqs(netdev);
+	netif_tx_start_all_queues(netdev);
+
+	/* Ready for link status updates */
+	lio->intf_open = 1;
+
+	netif_info(lio, ifup, lio->netdev, "Interface Open, ready for traffic\n");
 
 	/* tell Octeon to start forwarding packets to host */
 	send_rx_ctrl_cmd(lio, 1);
@@ -1888,11 +1888,15 @@ static int liquidio_stop(struct net_device *netdev)
 
 	ifstate_reset(lio, LIO_IFSTATE_RUNNING);
 
-	netif_tx_disable(netdev);
+	/* Stop any link updates */
+	lio->intf_open = 0;
+
+	stop_txqs(netdev);
 
 	/* Inform that netif carrier is down */
 	netif_carrier_off(netdev);
-	lio->intf_open = 0;
+	netif_tx_disable(netdev);
+
 	lio->linfo.link.s.link_up = 0;
 	lio->link_changes++;
 
@@ -2332,7 +2336,7 @@ static int liquidio_xmit(struct sk_buff *skb, struct net_device *netdev)
 	lio = GET_LIO(netdev);
 	oct = lio->oct_dev;
 
-	q_idx = skb_iq(lio, skb);
+	q_idx = skb_iq(oct, skb);
 	tag = q_idx;
 	iq_no = lio->linfo.txpciq[q_idx].s.q_no;
 
@@ -3298,6 +3302,7 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
 	struct liquidio_if_cfg_resp *resp;
 	struct octdev_props *props;
 	int retval, num_iqueues, num_oqueues;
+	int max_num_queues = 0;
 	union oct_nic_if_cfg if_cfg;
 	unsigned int base_queue;
 	unsigned int gmx_port_id;
@@ -3380,7 +3385,7 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
 
 		sc->callback = lio_if_cfg_callback;
 		sc->callback_arg = sc;
-		sc->wait_time = 3000;
+		sc->wait_time = LIO_IFCFG_WAIT_TIME;
 
 		retval = octeon_send_soft_command(octeon_dev, sc);
 		if (retval == IQ_SEND_FAILED) {
@@ -3434,11 +3439,20 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
 				resp->cfg_info.oqmask);
 			goto setup_nic_dev_fail;
 		}
+
+		if (OCTEON_CN6XXX(octeon_dev)) {
+			max_num_queues = CFG_GET_IQ_MAX_Q(CHIP_CONF(octeon_dev,
+								    cn6xxx));
+		} else if (OCTEON_CN23XX_PF(octeon_dev)) {
+			max_num_queues = CFG_GET_IQ_MAX_Q(CHIP_CONF(octeon_dev,
+								    cn23xx_pf));
+		}
+
 		dev_dbg(&octeon_dev->pci_dev->dev,
-			"interface %d, iqmask %016llx, oqmask %016llx, numiqueues %d, numoqueues %d\n",
+			"interface %d, iqmask %016llx, oqmask %016llx, numiqueues %d, numoqueues %d max_num_queues: %d\n",
 			i, resp->cfg_info.iqmask, resp->cfg_info.oqmask,
-			num_iqueues, num_oqueues);
-		netdev = alloc_etherdev_mq(LIO_SIZE, num_iqueues);
+			num_iqueues, num_oqueues, max_num_queues);
+		netdev = alloc_etherdev_mq(LIO_SIZE, max_num_queues);
 
 		if (!netdev) {
 			dev_err(&octeon_dev->pci_dev->dev, "Device allocation failed\n");
@@ -3453,6 +3467,20 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
 		netdev->netdev_ops = &lionetdevops;
 		SWITCHDEV_SET_OPS(netdev, &lio_pf_switchdev_ops);
 
+		retval = netif_set_real_num_rx_queues(netdev, num_oqueues);
+		if (retval) {
+			dev_err(&octeon_dev->pci_dev->dev,
+				"setting real number rx failed\n");
+			goto setup_nic_dev_fail;
+		}
+
+		retval = netif_set_real_num_tx_queues(netdev, num_iqueues);
+		if (retval) {
+			dev_err(&octeon_dev->pci_dev->dev,
+				"setting real number tx failed\n");
+			goto setup_nic_dev_fail;
+		}
+
 		lio = GET_LIO(netdev);
 
 		memset(lio, 0, sizeof(struct lio));
@@ -4073,7 +4101,9 @@ static int octeon_device_init(struct octeon_device *octeon_dev)
 		}
 		atomic_set(&octeon_dev->status, OCT_DEV_MBOX_SETUP_DONE);
 
-		if (octeon_allocate_ioq_vector(octeon_dev)) {
+		if (octeon_allocate_ioq_vector
+				(octeon_dev,
+				 octeon_dev->sriov_info.num_pf_rings)) {
 			dev_err(&octeon_dev->pci_dev->dev, "OCTEON: ioq vector allocation failed\n");
 			return 1;
 		}
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index be28b8f1f67d..08b682b1c8ad 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -849,7 +849,7 @@ static void free_netsgbuf(void *buf)
 		i++;
 	}
 
-	iq = skb_iq(lio, skb);
+	iq = skb_iq(lio->oct_dev, skb);
 
 	spin_lock(&lio->glist_lock[iq]);
 	list_add_tail(&g->list, &lio->glist[iq]);
@@ -893,7 +893,7 @@ static void free_netsgbuf_with_resp(void *buf)
 		i++;
 	}
 
-	iq = skb_iq(lio, skb);
+	iq = skb_iq(lio->oct_dev, skb);
 
 	spin_lock(&lio->glist_lock[iq]);
 	list_add_tail(&g->list, &lio->glist[iq]);
@@ -1407,7 +1407,7 @@ static int liquidio_xmit(struct sk_buff *skb, struct net_device *netdev)
 	lio = GET_LIO(netdev);
 	oct = lio->oct_dev;
 
-	q_idx = skb_iq(lio, skb);
+	q_idx = skb_iq(lio->oct_dev, skb);
 	tag = q_idx;
 	iq_no = lio->linfo.txpciq[q_idx].s.q_no;
 
@@ -2339,7 +2339,7 @@ static int octeon_device_init(struct octeon_device *oct)
 	}
 	atomic_set(&oct->status, OCT_DEV_MBOX_SETUP_DONE);
 
-	if (octeon_allocate_ioq_vector(oct)) {
+	if (octeon_allocate_ioq_vector(oct, oct->sriov_info.rings_per_vf)) {
 		dev_err(&oct->pci_dev->dev, "ioq vector allocation failed\n");
 		return 1;
 	}
diff --git a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
index 2166744b6812..026570499dcf 100644
--- a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
+++ b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
@@ -84,6 +84,7 @@ enum octeon_tag_type {
 #define OPCODE_NIC_IF_CFG              0x09
 #define OPCODE_NIC_VF_DRV_NOTICE       0x0A
 #define OPCODE_NIC_INTRMOD_PARAMS      0x0B
+#define OPCODE_NIC_QCOUNT_UPDATE       0x12
 #define OPCODE_NIC_SET_TRUSTED_VF	0x13
 #define OPCODE_NIC_SYNC_OCTEON_TIME	0x14
 #define VF_DRV_LOADED                  1
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.c b/drivers/net/ethernet/cavium/liquidio/octeon_device.c
index f38abf626412..f878a552fef3 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_device.c
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.c
@@ -824,23 +824,18 @@ int octeon_deregister_device(struct octeon_device *oct)
 }
 
 int
-octeon_allocate_ioq_vector(struct octeon_device  *oct)
+octeon_allocate_ioq_vector(struct octeon_device *oct, u32 num_ioqs)
 {
-	int i, num_ioqs = 0;
 	struct octeon_ioq_vector *ioq_vector;
 	int cpu_num;
 	int size;
-
-	if (OCTEON_CN23XX_PF(oct))
-		num_ioqs = oct->sriov_info.num_pf_rings;
-	else if (OCTEON_CN23XX_VF(oct))
-		num_ioqs = oct->sriov_info.rings_per_vf;
+	int i;
 
 	size = sizeof(struct octeon_ioq_vector) * num_ioqs;
 
 	oct->ioq_vector = vzalloc(size);
 	if (!oct->ioq_vector)
-		return 1;
+		return -1;
 	for (i = 0; i < num_ioqs; i++) {
 		ioq_vector		= &oct->ioq_vector[i];
 		ioq_vector->oct_dev	= oct;
@@ -856,6 +851,7 @@ octeon_allocate_ioq_vector(struct octeon_device  *oct)
 		else
 			ioq_vector->ioq_num	= i;
 	}
+
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.h b/drivers/net/ethernet/cavium/liquidio/octeon_device.h
index 91937cc5c1d7..9430c0a0bb8c 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_device.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.h
@@ -867,7 +867,7 @@ void *oct_get_config_info(struct octeon_device *oct, u16 card_type);
 struct octeon_config *octeon_get_conf(struct octeon_device *oct);
 
 void octeon_free_ioq_vector(struct octeon_device *oct);
-int octeon_allocate_ioq_vector(struct octeon_device  *oct);
+int octeon_allocate_ioq_vector(struct octeon_device  *oct, u32 num_ioqs);
 void lio_enable_irq(struct octeon_droq *droq, struct octeon_instr_queue *iq);
 
 /* LiquidIO driver pivate flags */
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_network.h b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
index f8c1091639ae..8571f11e3c8f 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_network.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
@@ -47,6 +47,8 @@ struct liquidio_if_cfg_resp {
 	u64 status;
 };
 
+#define LIO_IFCFG_WAIT_TIME    3000 /* In milli seconds */
+
 /* Structure of a node in list of gather components maintained by
  * NIC driver for each network device.
  */
@@ -546,7 +548,7 @@ static inline void stop_txqs(struct net_device *netdev)
 {
 	int i;
 
-	for (i = 0; i < netdev->num_tx_queues; i++)
+	for (i = 0; i < netdev->real_num_tx_queues; i++)
 		netif_stop_subqueue(netdev, i);
 }
 
@@ -559,7 +561,7 @@ static inline void wake_txqs(struct net_device *netdev)
 	struct lio *lio = GET_LIO(netdev);
 	int i, qno;
 
-	for (i = 0; i < netdev->num_tx_queues; i++) {
+	for (i = 0; i < netdev->real_num_tx_queues; i++) {
 		qno = lio->linfo.txpciq[i % lio->oct_dev->num_iqs].s.q_no;
 
 		if (__netif_subqueue_stopped(netdev, i)) {
@@ -580,14 +582,14 @@ static inline void start_txqs(struct net_device *netdev)
 	int i;
 
 	if (lio->linfo.link.s.link_up) {
-		for (i = 0; i < netdev->num_tx_queues; i++)
+		for (i = 0; i < netdev->real_num_tx_queues; i++)
 			netif_start_subqueue(netdev, i);
 	}
 }
 
-static inline int skb_iq(struct lio *lio, struct sk_buff *skb)
+static inline int skb_iq(struct octeon_device *oct, struct sk_buff *skb)
 {
-	return skb->queue_mapping % lio->linfo.num_txpciq;
+	return skb->queue_mapping % oct->num_iqs;
 }
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 76c2a96d42ca3bdac12c463ff27fec3bb2982e3f Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Sat, 28 Apr 2018 10:52:16 +0100
Subject: liquidio: fix spelling mistake: "mac_tx_multi_collison" ->
 "mac_tx_multi_collision"

Trivial fix to spelling mistake in oct_stats_strings text

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/cavium/liquidio/lio_ethtool.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
index fff60ca20c35..a1d84304a608 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
@@ -122,7 +122,7 @@ static const char oct_stats_strings[][ETH_GSTRING_LEN] = {
 	"mac_tx_ctl_packets",
 	"mac_tx_total_collisions",
 	"mac_tx_one_collision",
-	"mac_tx_multi_collison",
+	"mac_tx_multi_collision",
 	"mac_tx_max_collision_fail",
 	"mac_tx_max_deferal_fail",
 	"mac_tx_fifo_err",
-- 
cgit v1.2.3-59-g8ed1b


From e66267483eab88a05103f2b9df6a5682b87f38b0 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 28 Apr 2018 22:19:08 +0200
Subject: r8169: remove unneeded call to __rtl8169_set_features in rtl_open

RxChkSum and RxVlan aren't touched outside __rtl8169_set_features
(except in probe), so they are always in sync with dev->features.
And the RxConfig flags are set in rtl_set_rx_mode() which is
called via dev_set_rx_mode() from __dev_open().
Therefore we can safely remove this call.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index a5d00ee94245..d2656224fc49 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -7637,8 +7637,6 @@ static int rtl_open(struct net_device *dev)
 
 	rtl8169_init_phy(dev, tp);
 
-	__rtl8169_set_features(dev, dev->features);
-
 	rtl_pll_power_up(tp);
 
 	rtl_hw_start(tp);
-- 
cgit v1.2.3-59-g8ed1b


From a3984578bf0ed578085bd3c382867c0121bb1534 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 28 Apr 2018 22:19:15 +0200
Subject: r8169: improve rtl8169_set_features

__rtl8169_set_features is used in rtl8169_set_features only, so we
can inline it. In addition:
- Remove check (features ^ dev->features), __netdev_update_features
  check's already that requested features differ from current ones.
- Don't mask out unsupported flags, there's no benefit in it.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index d2656224fc49..411d12bee50c 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -1935,12 +1935,14 @@ static netdev_features_t rtl8169_fix_features(struct net_device *dev,
 	return features;
 }
 
-static void __rtl8169_set_features(struct net_device *dev,
-				   netdev_features_t features)
+static int rtl8169_set_features(struct net_device *dev,
+				netdev_features_t features)
 {
 	struct rtl8169_private *tp = netdev_priv(dev);
 	u32 rx_config;
 
+	rtl_lock_work(tp);
+
 	rx_config = RTL_R32(tp, RxConfig);
 	if (features & NETIF_F_RXALL)
 		rx_config |= (AcceptErr | AcceptRunt);
@@ -1963,24 +1965,12 @@ static void __rtl8169_set_features(struct net_device *dev,
 
 	RTL_W16(tp, CPlusCmd, tp->cp_cmd);
 	RTL_R16(tp, CPlusCmd);
-}
 
-static int rtl8169_set_features(struct net_device *dev,
-				netdev_features_t features)
-{
-	struct rtl8169_private *tp = netdev_priv(dev);
-
-	features &= NETIF_F_RXALL | NETIF_F_RXCSUM | NETIF_F_HW_VLAN_CTAG_RX;
-
-	rtl_lock_work(tp);
-	if (features ^ dev->features)
-		__rtl8169_set_features(dev, features);
 	rtl_unlock_work(tp);
 
 	return 0;
 }
 
-
 static inline u32 rtl8169_tx_vlan_tag(struct sk_buff *skb)
 {
 	return (skb_vlan_tag_present(skb)) ?
-- 
cgit v1.2.3-59-g8ed1b


From 9a3c81fa616ba5dc10eec59017ab1f7f0cc35ee9 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 28 Apr 2018 22:19:21 +0200
Subject: r8169: replace magic number for INTT mask with a constant

Use a proper constant for INTT bit mask.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 411d12bee50c..1dd189ddec3e 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -599,6 +599,7 @@ enum rtl_register_content {
 	RxChkSum	= (1 << 5),
 	PCIDAC		= (1 << 4),
 	PCIMulRW	= (1 << 3),
+#define INTT_MASK	GENMASK(1, 0)
 	INTT_0		= 0x0000,	// 8168
 	INTT_1		= 0x0001,	// 8168
 	INTT_2		= 0x0002,	// 8168
@@ -2344,7 +2345,7 @@ static int rtl_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec)
 	if (IS_ERR(ci))
 		return PTR_ERR(ci);
 
-	scale = &ci->scalev[RTL_R16(tp, CPlusCmd) & 3];
+	scale = &ci->scalev[RTL_R16(tp, CPlusCmd) & INTT_MASK];
 
 	/* read IntrMitigate and adjust according to scale */
 	for (w = RTL_R16(tp, IntrMitigate); w; w >>= RTL_COALESCE_SHIFT, p++) {
@@ -2443,7 +2444,7 @@ static int rtl_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec)
 
 	RTL_W16(tp, IntrMitigate, swab16(w));
 
-	tp->cp_cmd = (tp->cp_cmd & ~3) | cp01;
+	tp->cp_cmd = (tp->cp_cmd & ~INTT_MASK) | cp01;
 	RTL_W16(tp, CPlusCmd, tp->cp_cmd);
 	RTL_R16(tp, CPlusCmd);
 
-- 
cgit v1.2.3-59-g8ed1b


From 0ae0974eb3d08e2166dd83aa669b63cb84a5ced3 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 28 Apr 2018 22:19:26 +0200
Subject: r8169: improve CPlusCmd handling

tp->cp_cmd is supposed to reflect the current value of the CplusCmd
register. Several (quite old) changes however directly change this
register w/o updating tp->cp_cmd. Also we have places in the code
reading this register where we could use the cached value.

In addition:
- Properly initialize tp->cmd with the register value.
- In rtl_hw_start_8169 remove one setting of PCIMulRW because it's
  set unconditionally anyway a few lines later.
- In rtl_hw_start_8168 properly mask out the INTT bits before
  setting INTT_1. So far we rely on both bits being zero.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 42 +++++++++++++++---------------------
 1 file changed, 17 insertions(+), 25 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 1dd189ddec3e..868dee7d1131 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -1962,8 +1962,6 @@ static int rtl8169_set_features(struct net_device *dev,
 	else
 		tp->cp_cmd &= ~RxVlan;
 
-	tp->cp_cmd |= RTL_R16(tp, CPlusCmd) & ~(RxVlan | RxChkSum);
-
 	RTL_W16(tp, CPlusCmd, tp->cp_cmd);
 	RTL_R16(tp, CPlusCmd);
 
@@ -2345,7 +2343,7 @@ static int rtl_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec)
 	if (IS_ERR(ci))
 		return PTR_ERR(ci);
 
-	scale = &ci->scalev[RTL_R16(tp, CPlusCmd) & INTT_MASK];
+	scale = &ci->scalev[tp->cp_cmd & INTT_MASK];
 
 	/* read IntrMitigate and adjust according to scale */
 	for (w = RTL_R16(tp, IntrMitigate); w; w >>= RTL_COALESCE_SHIFT, p++) {
@@ -4841,7 +4839,7 @@ static void r8168_pll_power_down(struct rtl8169_private *tp)
 
 	if ((tp->mac_version == RTL_GIGA_MAC_VER_23 ||
 	     tp->mac_version == RTL_GIGA_MAC_VER_24) &&
-	    (RTL_R16(tp, CPlusCmd) & ASF)) {
+	    (tp->cp_cmd & ASF)) {
 		return;
 	}
 
@@ -5321,15 +5319,6 @@ static void rtl_set_rx_tx_desc_registers(struct rtl8169_private *tp)
 	RTL_W32(tp, RxDescAddrLow, ((u64) tp->RxPhyAddr) & DMA_BIT_MASK(32));
 }
 
-static u16 rtl_rw_cpluscmd(struct rtl8169_private *tp)
-{
-	u16 cmd;
-
-	cmd = RTL_R16(tp, CPlusCmd);
-	RTL_W16(tp, CPlusCmd, cmd);
-	return cmd;
-}
-
 static void rtl_set_rx_max_size(struct rtl8169_private *tp)
 {
 	/* Low hurts. Let's disable the filtering. */
@@ -5415,10 +5404,8 @@ static void rtl_set_rx_mode(struct net_device *dev)
 
 static void rtl_hw_start_8169(struct rtl8169_private *tp)
 {
-	if (tp->mac_version == RTL_GIGA_MAC_VER_05) {
-		RTL_W16(tp, CPlusCmd, RTL_R16(tp, CPlusCmd) | PCIMulRW);
+	if (tp->mac_version == RTL_GIGA_MAC_VER_05)
 		pci_write_config_byte(tp->pci_dev, PCI_CACHE_LINE_SIZE, 0x08);
-	}
 
 	RTL_W8(tp, Cfg9346, Cfg9346_Unlock);
 	if (tp->mac_version == RTL_GIGA_MAC_VER_01 ||
@@ -5439,7 +5426,7 @@ static void rtl_hw_start_8169(struct rtl8169_private *tp)
 	    tp->mac_version == RTL_GIGA_MAC_VER_04)
 		rtl_set_rx_tx_config_registers(tp);
 
-	tp->cp_cmd |= rtl_rw_cpluscmd(tp) | PCIMulRW;
+	tp->cp_cmd |= PCIMulRW;
 
 	if (tp->mac_version == RTL_GIGA_MAC_VER_02 ||
 	    tp->mac_version == RTL_GIGA_MAC_VER_03) {
@@ -5671,7 +5658,8 @@ static void rtl_hw_start_8168bb(struct rtl8169_private *tp)
 {
 	RTL_W8(tp, Config3, RTL_R8(tp, Config3) & ~Beacon_en);
 
-	RTL_W16(tp, CPlusCmd, RTL_R16(tp, CPlusCmd) & ~R8168_CPCMD_QUIRK_MASK);
+	tp->cp_cmd &= ~R8168_CPCMD_QUIRK_MASK;
+	RTL_W16(tp, CPlusCmd, tp->cp_cmd);
 
 	if (tp->dev->mtu <= ETH_DATA_LEN) {
 		rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B |
@@ -5699,7 +5687,8 @@ static void __rtl_hw_start_8168cp(struct rtl8169_private *tp)
 
 	rtl_disable_clock_request(tp);
 
-	RTL_W16(tp, CPlusCmd, RTL_R16(tp, CPlusCmd) & ~R8168_CPCMD_QUIRK_MASK);
+	tp->cp_cmd &= ~R8168_CPCMD_QUIRK_MASK;
+	RTL_W16(tp, CPlusCmd, tp->cp_cmd);
 }
 
 static void rtl_hw_start_8168cp_1(struct rtl8169_private *tp)
@@ -5728,7 +5717,8 @@ static void rtl_hw_start_8168cp_2(struct rtl8169_private *tp)
 	if (tp->dev->mtu <= ETH_DATA_LEN)
 		rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
-	RTL_W16(tp, CPlusCmd, RTL_R16(tp, CPlusCmd) & ~R8168_CPCMD_QUIRK_MASK);
+	tp->cp_cmd &= ~R8168_CPCMD_QUIRK_MASK;
+	RTL_W16(tp, CPlusCmd, tp->cp_cmd);
 }
 
 static void rtl_hw_start_8168cp_3(struct rtl8169_private *tp)
@@ -5745,7 +5735,8 @@ static void rtl_hw_start_8168cp_3(struct rtl8169_private *tp)
 	if (tp->dev->mtu <= ETH_DATA_LEN)
 		rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
-	RTL_W16(tp, CPlusCmd, RTL_R16(tp, CPlusCmd) & ~R8168_CPCMD_QUIRK_MASK);
+	tp->cp_cmd &= ~R8168_CPCMD_QUIRK_MASK;
+	RTL_W16(tp, CPlusCmd, tp->cp_cmd);
 }
 
 static void rtl_hw_start_8168c_1(struct rtl8169_private *tp)
@@ -5802,7 +5793,8 @@ static void rtl_hw_start_8168d(struct rtl8169_private *tp)
 	if (tp->dev->mtu <= ETH_DATA_LEN)
 		rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
-	RTL_W16(tp, CPlusCmd, RTL_R16(tp, CPlusCmd) & ~R8168_CPCMD_QUIRK_MASK);
+	tp->cp_cmd &= ~R8168_CPCMD_QUIRK_MASK;
+	RTL_W16(tp, CPlusCmd, tp->cp_cmd);
 }
 
 static void rtl_hw_start_8168dp(struct rtl8169_private *tp)
@@ -6271,8 +6263,8 @@ static void rtl_hw_start_8168(struct rtl8169_private *tp)
 
 	rtl_set_rx_max_size(tp);
 
-	tp->cp_cmd |= RTL_R16(tp, CPlusCmd) | PktCntrDisable | INTT_1;
-
+	tp->cp_cmd &= ~INTT_MASK;
+	tp->cp_cmd |= PktCntrDisable | INTT_1;
 	RTL_W16(tp, CPlusCmd, tp->cp_cmd);
 
 	RTL_W16(tp, IntrMitigate, 0x5151);
@@ -8130,7 +8122,7 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	/* Identify chip attached to board */
 	rtl8169_get_mac_version(tp, cfg->default_ver);
 
-	tp->cp_cmd = 0;
+	tp->cp_cmd = RTL_R16(tp, CPlusCmd);
 
 	if ((sizeof(dma_addr_t) > 4) &&
 	    (use_dac == 1 || (use_dac == -1 && pci_is_pcie(pdev) &&
-- 
cgit v1.2.3-59-g8ed1b


From 12d42c505eb18c0d1f70ec36cea022bb195e7ba7 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 28 Apr 2018 22:19:30 +0200
Subject: r8169: improve handling of CPCMD quirk mask

Both quirk masks are the same, so we can merge them. The quirk mask
includes most bits so it's actually easier to define a mask with
the bits to keep.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 35 +++++++----------------------------
 1 file changed, 7 insertions(+), 28 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 868dee7d1131..cf7a7db53791 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -690,6 +690,7 @@ enum rtl_rx_desc_bit {
 };
 
 #define RsvdMask	0x3fffc000
+#define CPCMD_QUIRK_MASK	(Normal_mode | RxVlan | RxChkSum | INTT_MASK)
 
 struct TxDesc {
 	__le32 opts1;
@@ -5643,22 +5644,11 @@ static void rtl_pcie_state_l2l3_enable(struct rtl8169_private *tp, bool enable)
 	RTL_W8(tp, Config3, data);
 }
 
-#define R8168_CPCMD_QUIRK_MASK (\
-	EnableBist | \
-	Mac_dbgo_oe | \
-	Force_half_dup | \
-	Force_rxflow_en | \
-	Force_txflow_en | \
-	Cxpl_dbg_sel | \
-	ASF | \
-	PktCntrDisable | \
-	Mac_dbgo_sel)
-
 static void rtl_hw_start_8168bb(struct rtl8169_private *tp)
 {
 	RTL_W8(tp, Config3, RTL_R8(tp, Config3) & ~Beacon_en);
 
-	tp->cp_cmd &= ~R8168_CPCMD_QUIRK_MASK;
+	tp->cp_cmd &= CPCMD_QUIRK_MASK;
 	RTL_W16(tp, CPlusCmd, tp->cp_cmd);
 
 	if (tp->dev->mtu <= ETH_DATA_LEN) {
@@ -5687,7 +5677,7 @@ static void __rtl_hw_start_8168cp(struct rtl8169_private *tp)
 
 	rtl_disable_clock_request(tp);
 
-	tp->cp_cmd &= ~R8168_CPCMD_QUIRK_MASK;
+	tp->cp_cmd &= CPCMD_QUIRK_MASK;
 	RTL_W16(tp, CPlusCmd, tp->cp_cmd);
 }
 
@@ -5717,7 +5707,7 @@ static void rtl_hw_start_8168cp_2(struct rtl8169_private *tp)
 	if (tp->dev->mtu <= ETH_DATA_LEN)
 		rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
-	tp->cp_cmd &= ~R8168_CPCMD_QUIRK_MASK;
+	tp->cp_cmd &= CPCMD_QUIRK_MASK;
 	RTL_W16(tp, CPlusCmd, tp->cp_cmd);
 }
 
@@ -5735,7 +5725,7 @@ static void rtl_hw_start_8168cp_3(struct rtl8169_private *tp)
 	if (tp->dev->mtu <= ETH_DATA_LEN)
 		rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
-	tp->cp_cmd &= ~R8168_CPCMD_QUIRK_MASK;
+	tp->cp_cmd &= CPCMD_QUIRK_MASK;
 	RTL_W16(tp, CPlusCmd, tp->cp_cmd);
 }
 
@@ -5793,7 +5783,7 @@ static void rtl_hw_start_8168d(struct rtl8169_private *tp)
 	if (tp->dev->mtu <= ETH_DATA_LEN)
 		rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
-	tp->cp_cmd &= ~R8168_CPCMD_QUIRK_MASK;
+	tp->cp_cmd &= CPCMD_QUIRK_MASK;
 	RTL_W16(tp, CPlusCmd, tp->cp_cmd);
 }
 
@@ -6394,17 +6384,6 @@ static void rtl_hw_start_8168(struct rtl8169_private *tp)
 	RTL_W16(tp, MultiIntr, RTL_R16(tp, MultiIntr) & 0xf000);
 }
 
-#define R810X_CPCMD_QUIRK_MASK (\
-	EnableBist | \
-	Mac_dbgo_oe | \
-	Force_half_dup | \
-	Force_rxflow_en | \
-	Force_txflow_en | \
-	Cxpl_dbg_sel | \
-	ASF | \
-	PktCntrDisable | \
-	Mac_dbgo_sel)
-
 static void rtl_hw_start_8102e_1(struct rtl8169_private *tp)
 {
 	static const struct ephy_info e_info_8102e_1[] = {
@@ -6544,7 +6523,7 @@ static void rtl_hw_start_8101(struct rtl8169_private *tp)
 
 	rtl_set_rx_max_size(tp);
 
-	tp->cp_cmd &= ~R810X_CPCMD_QUIRK_MASK;
+	tp->cp_cmd &= CPCMD_QUIRK_MASK;
 	RTL_W16(tp, CPlusCmd, tp->cp_cmd);
 
 	rtl_set_rx_tx_desc_registers(tp);
-- 
cgit v1.2.3-59-g8ed1b


From 3559d81e76bfe3803e89f2e04cf6ef7ab4f3aace Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 28 Apr 2018 22:19:40 +0200
Subject: r8169: simplify rtl_hw_start_8169

Currently done:
- if mac_version in (01, 02, 03, 04)
	RTL_W8(tp, ChipCmd, CmdTxEnb | CmdRxEnb);
- if mac_version in (01, 02, 03, 04)
	rtl_set_rx_tx_config_registers(tp);
- if mac_version not in (01, 02, 03, 04)
	RTL_W8(tp, ChipCmd, CmdTxEnb | CmdRxEnb);
	rtl_set_rx_tx_config_registers(tp);

So we do exactly the same independent of chip version and can simplify
the code.

In addition remove the call to rtl_init_rxcfg(), it's called in
rtl_init_one() already and the set bits are never touched later.
rtl_init_8168/8101 don't include this call either.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 22 ++--------------------
 1 file changed, 2 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index cf7a7db53791..8c816f6c6fc5 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -5409,24 +5409,11 @@ static void rtl_hw_start_8169(struct rtl8169_private *tp)
 		pci_write_config_byte(tp->pci_dev, PCI_CACHE_LINE_SIZE, 0x08);
 
 	RTL_W8(tp, Cfg9346, Cfg9346_Unlock);
-	if (tp->mac_version == RTL_GIGA_MAC_VER_01 ||
-	    tp->mac_version == RTL_GIGA_MAC_VER_02 ||
-	    tp->mac_version == RTL_GIGA_MAC_VER_03 ||
-	    tp->mac_version == RTL_GIGA_MAC_VER_04)
-		RTL_W8(tp, ChipCmd, CmdTxEnb | CmdRxEnb);
-
-	rtl_init_rxcfg(tp);
 
 	RTL_W8(tp, EarlyTxThres, NoEarlyTx);
 
 	rtl_set_rx_max_size(tp);
 
-	if (tp->mac_version == RTL_GIGA_MAC_VER_01 ||
-	    tp->mac_version == RTL_GIGA_MAC_VER_02 ||
-	    tp->mac_version == RTL_GIGA_MAC_VER_03 ||
-	    tp->mac_version == RTL_GIGA_MAC_VER_04)
-		rtl_set_rx_tx_config_registers(tp);
-
 	tp->cp_cmd |= PCIMulRW;
 
 	if (tp->mac_version == RTL_GIGA_MAC_VER_02 ||
@@ -5447,14 +5434,9 @@ static void rtl_hw_start_8169(struct rtl8169_private *tp)
 	RTL_W16(tp, IntrMitigate, 0x0000);
 
 	rtl_set_rx_tx_desc_registers(tp);
+	rtl_set_rx_tx_config_registers(tp);
 
-	if (tp->mac_version != RTL_GIGA_MAC_VER_01 &&
-	    tp->mac_version != RTL_GIGA_MAC_VER_02 &&
-	    tp->mac_version != RTL_GIGA_MAC_VER_03 &&
-	    tp->mac_version != RTL_GIGA_MAC_VER_04) {
-		RTL_W8(tp, ChipCmd, CmdTxEnb | CmdRxEnb);
-		rtl_set_rx_tx_config_registers(tp);
-	}
+	RTL_W8(tp, ChipCmd, CmdTxEnb | CmdRxEnb);
 
 	RTL_W8(tp, Cfg9346, Cfg9346_Lock);
 
-- 
cgit v1.2.3-59-g8ed1b


From 82d3ff6dd1994d54fe11714ffd4c696cfcacbea1 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 28 Apr 2018 22:19:43 +0200
Subject: r8169: remove calls to rtl_set_rx_mode

__dev_open() calls the ndo_set_rx_mode callback anyway, so we don't
have to do it here too.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 8c816f6c6fc5..c7b9301aede1 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -5445,8 +5445,6 @@ static void rtl_hw_start_8169(struct rtl8169_private *tp)
 
 	RTL_W32(tp, RxMissed, 0);
 
-	rtl_set_rx_mode(tp->dev);
-
 	/* no early-rx interrupts */
 	RTL_W16(tp, MultiIntr, RTL_R16(tp, MultiIntr) & 0xf000);
 }
@@ -6361,8 +6359,6 @@ static void rtl_hw_start_8168(struct rtl8169_private *tp)
 
 	RTL_W8(tp, ChipCmd, CmdTxEnb | CmdRxEnb);
 
-	rtl_set_rx_mode(tp->dev);
-
 	RTL_W16(tp, MultiIntr, RTL_R16(tp, MultiIntr) & 0xf000);
 }
 
@@ -6554,8 +6550,6 @@ static void rtl_hw_start_8101(struct rtl8169_private *tp)
 
 	RTL_W8(tp, ChipCmd, CmdTxEnb | CmdRxEnb);
 
-	rtl_set_rx_mode(tp->dev);
-
 	RTL_R8(tp, IntrMask);
 
 	RTL_W16(tp, MultiIntr, RTL_R16(tp, MultiIntr) & 0xf000);
-- 
cgit v1.2.3-59-g8ed1b


From 4fd48c4ac0a0578862819295222a825c97686ac7 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 28 Apr 2018 22:19:47 +0200
Subject: r8169: move common initializations to tp->hw_start

The chip-specific init code includes quite some calls which are
identical for all chips. So move these calls to tp->hw_start().

In addition move rtl_set_rx_max_size() a little to make sure it's
defined before it's used. Unfortunately the diff generated by git
is a little bit hard to read.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 74 +++++++++---------------------------
 1 file changed, 19 insertions(+), 55 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index c7b9301aede1..66f10d119b12 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -5301,10 +5301,10 @@ static void rtl_set_rx_tx_config_registers(struct rtl8169_private *tp)
 		(InterFrameGap << TxInterFrameGapShift));
 }
 
-static void rtl_hw_start(struct  rtl8169_private *tp)
+static void rtl_set_rx_max_size(struct rtl8169_private *tp)
 {
-	tp->hw_start(tp);
-	rtl_irq_enable_all(tp);
+	/* Low hurts. Let's disable the filtering. */
+	RTL_W16(tp, RxMaxSize, R8169_RX_BUF_SIZE + 1);
 }
 
 static void rtl_set_rx_tx_desc_registers(struct rtl8169_private *tp)
@@ -5320,10 +5320,23 @@ static void rtl_set_rx_tx_desc_registers(struct rtl8169_private *tp)
 	RTL_W32(tp, RxDescAddrLow, ((u64) tp->RxPhyAddr) & DMA_BIT_MASK(32));
 }
 
-static void rtl_set_rx_max_size(struct rtl8169_private *tp)
+static void rtl_hw_start(struct  rtl8169_private *tp)
 {
-	/* Low hurts. Let's disable the filtering. */
-	RTL_W16(tp, RxMaxSize, R8169_RX_BUF_SIZE + 1);
+	RTL_W8(tp, Cfg9346, Cfg9346_Unlock);
+
+	tp->hw_start(tp);
+
+	rtl_set_rx_max_size(tp);
+	rtl_set_rx_tx_desc_registers(tp);
+	rtl_set_rx_tx_config_registers(tp);
+	RTL_W8(tp, Cfg9346, Cfg9346_Lock);
+
+	/* Initially a 10 us delay. Turned it into a PCI commit. - FR */
+	RTL_R8(tp, IntrMask);
+	RTL_W8(tp, ChipCmd, CmdTxEnb | CmdRxEnb);
+	/* no early-rx interrupts */
+	RTL_W16(tp, MultiIntr, RTL_R16(tp, MultiIntr) & 0xf000);
+	rtl_irq_enable_all(tp);
 }
 
 static void rtl8169_set_magic_reg(struct rtl8169_private *tp, unsigned mac_version)
@@ -5408,12 +5421,8 @@ static void rtl_hw_start_8169(struct rtl8169_private *tp)
 	if (tp->mac_version == RTL_GIGA_MAC_VER_05)
 		pci_write_config_byte(tp->pci_dev, PCI_CACHE_LINE_SIZE, 0x08);
 
-	RTL_W8(tp, Cfg9346, Cfg9346_Unlock);
-
 	RTL_W8(tp, EarlyTxThres, NoEarlyTx);
 
-	rtl_set_rx_max_size(tp);
-
 	tp->cp_cmd |= PCIMulRW;
 
 	if (tp->mac_version == RTL_GIGA_MAC_VER_02 ||
@@ -5433,20 +5442,7 @@ static void rtl_hw_start_8169(struct rtl8169_private *tp)
 	 */
 	RTL_W16(tp, IntrMitigate, 0x0000);
 
-	rtl_set_rx_tx_desc_registers(tp);
-	rtl_set_rx_tx_config_registers(tp);
-
-	RTL_W8(tp, ChipCmd, CmdTxEnb | CmdRxEnb);
-
-	RTL_W8(tp, Cfg9346, Cfg9346_Lock);
-
-	/* Initially a 10 us delay. Turned it into a PCI commit. - FR */
-	RTL_R8(tp, IntrMask);
-
 	RTL_W32(tp, RxMissed, 0);
-
-	/* no early-rx interrupts */
-	RTL_W16(tp, MultiIntr, RTL_R16(tp, MultiIntr) & 0xf000);
 }
 
 static void rtl_csi_write(struct rtl8169_private *tp, int addr, int value)
@@ -6227,12 +6223,8 @@ static void rtl_hw_start_8168ep_3(struct rtl8169_private *tp)
 
 static void rtl_hw_start_8168(struct rtl8169_private *tp)
 {
-	RTL_W8(tp, Cfg9346, Cfg9346_Unlock);
-
 	RTL_W8(tp, MaxTxPacketSize, TxPacketMax);
 
-	rtl_set_rx_max_size(tp);
-
 	tp->cp_cmd &= ~INTT_MASK;
 	tp->cp_cmd |= PktCntrDisable | INTT_1;
 	RTL_W16(tp, CPlusCmd, tp->cp_cmd);
@@ -6245,12 +6237,6 @@ static void rtl_hw_start_8168(struct rtl8169_private *tp)
 		tp->event_slow &= ~RxOverflow;
 	}
 
-	rtl_set_rx_tx_desc_registers(tp);
-
-	rtl_set_rx_tx_config_registers(tp);
-
-	RTL_R8(tp, IntrMask);
-
 	switch (tp->mac_version) {
 	case RTL_GIGA_MAC_VER_11:
 		rtl_hw_start_8168bb(tp);
@@ -6354,12 +6340,6 @@ static void rtl_hw_start_8168(struct rtl8169_private *tp)
 		       tp->dev->name, tp->mac_version);
 		break;
 	}
-
-	RTL_W8(tp, Cfg9346, Cfg9346_Lock);
-
-	RTL_W8(tp, ChipCmd, CmdTxEnb | CmdRxEnb);
-
-	RTL_W16(tp, MultiIntr, RTL_R16(tp, MultiIntr) & 0xf000);
 }
 
 static void rtl_hw_start_8102e_1(struct rtl8169_private *tp)
@@ -6495,19 +6475,11 @@ static void rtl_hw_start_8101(struct rtl8169_private *tp)
 		pcie_capability_set_word(tp->pci_dev, PCI_EXP_DEVCTL,
 					 PCI_EXP_DEVCTL_NOSNOOP_EN);
 
-	RTL_W8(tp, Cfg9346, Cfg9346_Unlock);
-
 	RTL_W8(tp, MaxTxPacketSize, TxPacketMax);
 
-	rtl_set_rx_max_size(tp);
-
 	tp->cp_cmd &= CPCMD_QUIRK_MASK;
 	RTL_W16(tp, CPlusCmd, tp->cp_cmd);
 
-	rtl_set_rx_tx_desc_registers(tp);
-
-	rtl_set_rx_tx_config_registers(tp);
-
 	switch (tp->mac_version) {
 	case RTL_GIGA_MAC_VER_07:
 		rtl_hw_start_8102e_1(tp);
@@ -6544,15 +6516,7 @@ static void rtl_hw_start_8101(struct rtl8169_private *tp)
 		break;
 	}
 
-	RTL_W8(tp, Cfg9346, Cfg9346_Lock);
-
 	RTL_W16(tp, IntrMitigate, 0x0000);
-
-	RTL_W8(tp, ChipCmd, CmdTxEnb | CmdRxEnb);
-
-	RTL_R8(tp, IntrMask);
-
-	RTL_W16(tp, MultiIntr, RTL_R16(tp, MultiIntr) & 0xf000);
 }
 
 static int rtl8169_change_mtu(struct net_device *dev, int new_mtu)
-- 
cgit v1.2.3-59-g8ed1b


From 04a1e08cc734da60445c417e314a0f35a642861b Mon Sep 17 00:00:00 2001
From: Jia-Ju Bai <baijiaju1990@gmail.com>
Date: Wed, 11 Apr 2018 10:08:18 +0800
Subject: i40evf: Replace GFP_ATOMIC with GFP_KERNEL in i40evf_add_vlan

i40evf_add_vlan() is never called in atomic context.

i40evf_add_vlan() is only called by i40evf_vlan_rx_add_vid(),
which is only set as ".ndo_vlan_rx_add_vid" in struct net_device_ops.
".ndo_vlan_rx_add_vid" is not called in atomic context.

Despite never getting called from atomic context,
i40evf_add_vlan() calls kzalloc() with GFP_ATOMIC,
which does not sleep for allocation.
GFP_ATOMIC is not necessary and can be replaced with GFP_KERNEL,
which can sleep and improve the possibility of sucessful allocation.

This is found by a static analysis tool named DCNS written by myself.
And I also manually check it.

Signed-off-by: Jia-Ju Bai <baijiaju1990@gmail.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40evf/i40evf_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
index 97cda4a8f8e0..8f775a82e2fa 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
@@ -681,7 +681,7 @@ i40evf_vlan_filter *i40evf_add_vlan(struct i40evf_adapter *adapter, u16 vlan)
 
 	f = i40evf_find_vlan(adapter, vlan);
 	if (!f) {
-		f = kzalloc(sizeof(*f), GFP_ATOMIC);
+		f = kzalloc(sizeof(*f), GFP_KERNEL);
 		if (!f)
 			goto clearout;
 
-- 
cgit v1.2.3-59-g8ed1b


From 1baf5ebf8954d9bff8fa4e7dd6c416a0cebdb9e2 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Fri, 27 Apr 2018 14:16:32 -0700
Subject: erspan: auto detect truncated packets.

Currently the truncated bit is set only when the mirrored packet
is larger than mtu.  For certain cases, the packet might already
been truncated before sending to the erspan tunnel.  In this case,
the patch detect whether the IP header's total length is larger
than the actual skb->len.  If true, this indicated that the
mirrored packet is truncated and set the erspan truncate bit.

I tested the patch using bpf_skb_change_tail helper function to
shrink the packet size and send to erspan tunnel.

Reported-by: Xiaoyan Jin <xiaoyanj@vmware.com>
Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c  | 6 ++++++
 net/ipv6/ip6_gre.c | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 9c169bb2444d..dfe5b22f6ed4 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -578,6 +578,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 	int tunnel_hlen;
 	int version;
 	__be16 df;
+	int nhoff;
 
 	tun_info = skb_tunnel_info(skb);
 	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
@@ -605,6 +606,11 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 		truncate = true;
 	}
 
+	nhoff = skb_network_header(skb) - skb_mac_header(skb);
+	if (skb->protocol == htons(ETH_P_IP) &&
+	    (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
+		truncate = true;
+
 	if (version == 1) {
 		erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)),
 				    ntohl(md->u.index), truncate, true);
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 69727bc168cb..ac7ce85df667 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -896,6 +896,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 	struct flowi6 fl6;
 	int err = -EINVAL;
 	__u32 mtu;
+	int nhoff;
 
 	if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr))
 		goto tx_err;
@@ -908,6 +909,11 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 		truncate = true;
 	}
 
+	nhoff = skb_network_header(skb) - skb_mac_header(skb);
+	if (skb->protocol == htons(ETH_P_IP) &&
+	    (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
+		truncate = true;
+
 	if (skb_cow_head(skb, dev->needed_headroom))
 		goto tx_err;
 
-- 
cgit v1.2.3-59-g8ed1b


From a6a188e48959829fc6d64a1b52762314bcaf6417 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Sat, 28 Apr 2018 12:35:22 +0800
Subject: libcxgb,cxgb4: use __skb_put_zero to simplfy code

use helper __skb_put_zero to replace the pattern of __skb_put() && memset()

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c |  3 +--
 drivers/net/ethernet/chelsio/cxgb4/srq.c          |  3 +--
 drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.h | 15 +++++----------
 3 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
index db92f1858060..aae980205ece 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
@@ -64,8 +64,7 @@ static int set_tcb_field(struct adapter *adap, struct filter_entry *f,
 	if (!skb)
 		return -ENOMEM;
 
-	req = (struct cpl_set_tcb_field *)__skb_put(skb, sizeof(*req));
-	memset(req, 0, sizeof(*req));
+	req = (struct cpl_set_tcb_field *)__skb_put_zero(skb, sizeof(*req));
 	INIT_TP_WR_CPL(req, CPL_SET_TCB_FIELD, ftid);
 	req->reply_ctrl = htons(REPLY_CHAN_V(0) |
 				QUEUENO_V(adap->sge.fw_evtq.abs_id) |
diff --git a/drivers/net/ethernet/chelsio/cxgb4/srq.c b/drivers/net/ethernet/chelsio/cxgb4/srq.c
index 6228a5708307..82b70a565e24 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/srq.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/srq.c
@@ -84,8 +84,7 @@ int cxgb4_get_srq_entry(struct net_device *dev,
 	if (!skb)
 		return -ENOMEM;
 	req = (struct cpl_srq_table_req *)
-		__skb_put(skb, sizeof(*req));
-	memset(req, 0, sizeof(*req));
+		__skb_put_zero(skb, sizeof(*req));
 	INIT_TP_WR(req, 0);
 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SRQ_TABLE_REQ,
 					      TID_TID_V(srq_idx) |
diff --git a/drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.h b/drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.h
index 4b5aacc09cab..240ba9d4c399 100644
--- a/drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.h
+++ b/drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.h
@@ -90,8 +90,7 @@ cxgb_mk_tid_release(struct sk_buff *skb, u32 len, u32 tid, u16 chan)
 {
 	struct cpl_tid_release *req;
 
-	req = __skb_put(skb, len);
-	memset(req, 0, len);
+	req = __skb_put_zero(skb, len);
 
 	INIT_TP_WR(req, tid);
 	OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
@@ -104,8 +103,7 @@ cxgb_mk_close_con_req(struct sk_buff *skb, u32 len, u32 tid, u16 chan,
 {
 	struct cpl_close_con_req *req;
 
-	req = __skb_put(skb, len);
-	memset(req, 0, len);
+	req = __skb_put_zero(skb, len);
 
 	INIT_TP_WR(req, tid);
 	OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
@@ -119,8 +117,7 @@ cxgb_mk_abort_req(struct sk_buff *skb, u32 len, u32 tid, u16 chan,
 {
 	struct cpl_abort_req *req;
 
-	req = __skb_put(skb, len);
-	memset(req, 0, len);
+	req = __skb_put_zero(skb, len);
 
 	INIT_TP_WR(req, tid);
 	OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
@@ -134,8 +131,7 @@ cxgb_mk_abort_rpl(struct sk_buff *skb, u32 len, u32 tid, u16 chan)
 {
 	struct cpl_abort_rpl *rpl;
 
-	rpl = __skb_put(skb, len);
-	memset(rpl, 0, len);
+	rpl = __skb_put_zero(skb, len);
 
 	INIT_TP_WR(rpl, tid);
 	OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
@@ -149,8 +145,7 @@ cxgb_mk_rx_data_ack(struct sk_buff *skb, u32 len, u32 tid, u16 chan,
 {
 	struct cpl_rx_data_ack *req;
 
-	req = __skb_put(skb, len);
-	memset(req, 0, len);
+	req = __skb_put_zero(skb, len);
 
 	INIT_TP_WR(req, tid);
 	OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
-- 
cgit v1.2.3-59-g8ed1b


From c0dd967818a2b8106771a7c9fb7ef48d7be8a9d4 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Mon, 30 Apr 2018 07:26:23 -0700
Subject: tools, include: Grab a copy of linux/erspan.h

Bring the erspan uapi header file so BPF tunnel helpers can use it.

Fixes: 933a741e3b82 ("selftests/bpf: bpf tunnel test.")
Reported-by: Yonghong Song <yhs@fb.com>
Signed-off-by: William Tu <u9012063@gmail.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/include/uapi/linux/erspan.h | 52 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 tools/include/uapi/linux/erspan.h

diff --git a/tools/include/uapi/linux/erspan.h b/tools/include/uapi/linux/erspan.h
new file mode 100644
index 000000000000..841573019ae1
--- /dev/null
+++ b/tools/include/uapi/linux/erspan.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * ERSPAN Tunnel Metadata
+ *
+ * Copyright (c) 2018 VMware
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * Userspace API for metadata mode ERSPAN tunnel
+ */
+#ifndef _UAPI_ERSPAN_H
+#define _UAPI_ERSPAN_H
+
+#include <linux/types.h>	/* For __beXX in userspace */
+#include <asm/byteorder.h>
+
+/* ERSPAN version 2 metadata header */
+struct erspan_md2 {
+	__be32 timestamp;
+	__be16 sgt;	/* security group tag */
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8	hwid_upper:2,
+		ft:5,
+		p:1;
+	__u8	o:1,
+		gra:2,
+		dir:1,
+		hwid:4;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	__u8	p:1,
+		ft:5,
+		hwid_upper:2;
+	__u8	hwid:4,
+		dir:1,
+		gra:2,
+		o:1;
+#else
+#error "Please fix <asm/byteorder.h>"
+#endif
+};
+
+struct erspan_metadata {
+	int version;
+	union {
+		__be32 index;		/* Version 1 (type II)*/
+		struct erspan_md2 md2;	/* Version 2 (type III) */
+	} u;
+};
+
+#endif /* _UAPI_ERSPAN_H */
-- 
cgit v1.2.3-59-g8ed1b


From f5254429e1756ad7fede0249c9b779e37b6c967f Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Fri, 20 Apr 2018 01:41:33 -0700
Subject: i40e/i40evf: cleanup incorrect function doxygen comments

Recent versions of the Linux kernel now warn about incorrect parameter
definitions for function comments. Fix up several function comments to
correctly reflect the current function arguments. This cleans up the
warnings and helps ensure our documentation is accurate.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_client.c      |  6 ++--
 drivers/net/ethernet/intel/i40e/i40e_common.c      | 37 +++++++++++++---------
 drivers/net/ethernet/intel/i40e/i40e_dcb_nl.c      | 11 ++++---
 drivers/net/ethernet/intel/i40e/i40e_debugfs.c     |  8 ++---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c     | 24 ++++++++------
 drivers/net/ethernet/intel/i40e/i40e_hmc.c         |  1 -
 drivers/net/ethernet/intel/i40e/i40e_main.c        | 22 +++++++++----
 drivers/net/ethernet/intel/i40e/i40e_nvm.c         |  1 +
 drivers/net/ethernet/intel/i40e/i40e_ptp.c         |  4 +--
 drivers/net/ethernet/intel/i40e/i40e_txrx.c        |  6 ++--
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 16 +++++-----
 drivers/net/ethernet/intel/i40evf/i40e_common.c    |  1 +
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c      |  4 ++-
 drivers/net/ethernet/intel/i40evf/i40evf_client.c  |  4 +--
 drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c |  7 ++--
 drivers/net/ethernet/intel/i40evf/i40evf_main.c    |  5 ++-
 .../net/ethernet/intel/i40evf/i40evf_virtchnl.c    | 11 +------
 17 files changed, 95 insertions(+), 73 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_client.c b/drivers/net/ethernet/intel/i40e/i40e_client.c
index 2041757f948c..5f3b8b9ff511 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_client.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_client.c
@@ -40,7 +40,7 @@ static struct i40e_ops i40e_lan_ops = {
 /**
  * i40e_client_get_params - Get the params that can change at runtime
  * @vsi: the VSI with the message
- * @param: clinet param struct
+ * @params: client param struct
  *
  **/
 static
@@ -566,7 +566,7 @@ static int i40e_client_virtchnl_send(struct i40e_info *ldev,
  * i40e_client_setup_qvlist
  * @ldev: pointer to L2 context.
  * @client: Client pointer.
- * @qv_info: queue and vector list
+ * @qvlist_info: queue and vector list
  *
  * Return 0 on success or < 0 on error
  **/
@@ -641,7 +641,7 @@ err:
  * i40e_client_request_reset
  * @ldev: pointer to L2 context.
  * @client: Client pointer.
- * @level: reset level
+ * @reset_level: reset level
  **/
 static void i40e_client_request_reset(struct i40e_info *ldev,
 				      struct i40e_client *client,
diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c
index 6f8fd70d606a..eb2d1530d331 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -1671,6 +1671,8 @@ enum i40e_status_code i40e_aq_set_phy_config(struct i40e_hw *hw,
 /**
  * i40e_set_fc
  * @hw: pointer to the hw struct
+ * @aq_failures: buffer to return AdminQ failure information
+ * @atomic_restart: whether to enable atomic link restart
  *
  * Set the requested flow control mode using set_phy_config.
  **/
@@ -2807,8 +2809,8 @@ i40e_status i40e_aq_remove_macvlan(struct i40e_hw *hw, u16 seid,
  * @mr_list: list of mirrored VSI SEIDs or VLAN IDs
  * @cmd_details: pointer to command details structure or NULL
  * @rule_id: Rule ID returned from FW
- * @rule_used: Number of rules used in internal switch
- * @rule_free: Number of rules free in internal switch
+ * @rules_used: Number of rules used in internal switch
+ * @rules_free: Number of rules free in internal switch
  *
  * Add/Delete a mirror rule to a specific switch. Mirror rules are supported for
  * VEBs/VEPA elements only
@@ -2868,8 +2870,8 @@ static i40e_status i40e_mirrorrule_op(struct i40e_hw *hw,
  * @mr_list: list of mirrored VSI SEIDs or VLAN IDs
  * @cmd_details: pointer to command details structure or NULL
  * @rule_id: Rule ID returned from FW
- * @rule_used: Number of rules used in internal switch
- * @rule_free: Number of rules free in internal switch
+ * @rules_used: Number of rules used in internal switch
+ * @rules_free: Number of rules free in internal switch
  *
  * Add mirror rule. Mirror rules are supported for VEBs or VEPA elements only
  **/
@@ -2899,8 +2901,8 @@ i40e_status i40e_aq_add_mirrorrule(struct i40e_hw *hw, u16 sw_seid,
  *		add_mirrorrule.
  * @mr_list: list of mirrored VLAN IDs to be removed
  * @cmd_details: pointer to command details structure or NULL
- * @rule_used: Number of rules used in internal switch
- * @rule_free: Number of rules free in internal switch
+ * @rules_used: Number of rules used in internal switch
+ * @rules_free: Number of rules free in internal switch
  *
  * Delete a mirror rule. Mirror rules are supported for VEBs/VEPA elements only
  **/
@@ -3648,6 +3650,8 @@ i40e_status i40e_aq_stop_lldp(struct i40e_hw *hw, bool shutdown_agent,
 /**
  * i40e_aq_start_lldp
  * @hw: pointer to the hw struct
+ * @buff: buffer for result
+ * @buff_size: buffer size
  * @cmd_details: pointer to command details structure or NULL
  *
  * Start the embedded LLDP Agent on all ports.
@@ -3728,7 +3732,6 @@ i40e_status i40e_aq_get_cee_dcb_config(struct i40e_hw *hw,
  * i40e_aq_add_udp_tunnel
  * @hw: pointer to the hw struct
  * @udp_port: the UDP port to add in Host byte order
- * @header_len: length of the tunneling header length in DWords
  * @protocol_index: protocol index type
  * @filter_index: pointer to filter index
  * @cmd_details: pointer to command details structure or NULL
@@ -3947,6 +3950,7 @@ i40e_status i40e_aq_config_vsi_tc_bw(struct i40e_hw *hw,
  * @hw: pointer to the hw struct
  * @seid: seid of the switching component connected to Physical Port
  * @ets_data: Buffer holding ETS parameters
+ * @opcode: Tx scheduler AQ command opcode
  * @cmd_details: pointer to command details structure or NULL
  **/
 i40e_status i40e_aq_config_switch_comp_ets(struct i40e_hw *hw,
@@ -4290,10 +4294,10 @@ i40e_status i40e_aq_add_rem_control_packet_filter(struct i40e_hw *hw,
  * @hw: pointer to the hw struct
  * @seid: VSI seid to add ethertype filter from
  **/
-#define I40E_FLOW_CONTROL_ETHTYPE 0x8808
 void i40e_add_filter_to_drop_tx_flow_control_frames(struct i40e_hw *hw,
 						    u16 seid)
 {
+#define I40E_FLOW_CONTROL_ETHTYPE 0x8808
 	u16 flag = I40E_AQC_ADD_CONTROL_PACKET_FLAGS_IGNORE_MAC |
 		   I40E_AQC_ADD_CONTROL_PACKET_FLAGS_DROP |
 		   I40E_AQC_ADD_CONTROL_PACKET_FLAGS_TX;
@@ -4424,6 +4428,7 @@ void i40e_set_pci_config_data(struct i40e_hw *hw, u16 link_status)
  * @ret_buff_size: actual buffer size returned
  * @ret_next_table: next block to read
  * @ret_next_index: next index to read
+ * @cmd_details: pointer to command details structure or NULL
  *
  * Dump internal FW/HW data for debug purposes.
  *
@@ -4550,7 +4555,7 @@ i40e_status i40e_aq_configure_partition_bw(struct i40e_hw *hw,
  * i40e_read_phy_register_clause22
  * @hw: pointer to the HW structure
  * @reg: register address in the page
- * @phy_adr: PHY address on MDIO interface
+ * @phy_addr: PHY address on MDIO interface
  * @value: PHY register value
  *
  * Reads specified PHY register value
@@ -4595,7 +4600,7 @@ i40e_status i40e_read_phy_register_clause22(struct i40e_hw *hw,
  * i40e_write_phy_register_clause22
  * @hw: pointer to the HW structure
  * @reg: register address in the page
- * @phy_adr: PHY address on MDIO interface
+ * @phy_addr: PHY address on MDIO interface
  * @value: PHY register value
  *
  * Writes specified PHY register value
@@ -4636,7 +4641,7 @@ i40e_status i40e_write_phy_register_clause22(struct i40e_hw *hw,
  * @hw: pointer to the HW structure
  * @page: registers page number
  * @reg: register address in the page
- * @phy_adr: PHY address on MDIO interface
+ * @phy_addr: PHY address on MDIO interface
  * @value: PHY register value
  *
  * Reads specified PHY register value
@@ -4710,7 +4715,7 @@ phy_read_end:
  * @hw: pointer to the HW structure
  * @page: registers page number
  * @reg: register address in the page
- * @phy_adr: PHY address on MDIO interface
+ * @phy_addr: PHY address on MDIO interface
  * @value: PHY register value
  *
  * Writes value to specified PHY register
@@ -4777,7 +4782,7 @@ phy_write_end:
  * @hw: pointer to the HW structure
  * @page: registers page number
  * @reg: register address in the page
- * @phy_adr: PHY address on MDIO interface
+ * @phy_addr: PHY address on MDIO interface
  * @value: PHY register value
  *
  * Writes value to specified PHY register
@@ -4813,7 +4818,7 @@ i40e_status i40e_write_phy_register(struct i40e_hw *hw,
  * @hw: pointer to the HW structure
  * @page: registers page number
  * @reg: register address in the page
- * @phy_adr: PHY address on MDIO interface
+ * @phy_addr: PHY address on MDIO interface
  * @value: PHY register value
  *
  * Reads specified PHY register value
@@ -4848,7 +4853,6 @@ i40e_status i40e_read_phy_register(struct i40e_hw *hw,
  * i40e_get_phy_address
  * @hw: pointer to the HW structure
  * @dev_num: PHY port num that address we want
- * @phy_addr: Returned PHY address
  *
  * Gets PHY address for current port
  **/
@@ -5058,7 +5062,9 @@ i40e_status i40e_led_get_phy(struct i40e_hw *hw, u16 *led_addr,
  * i40e_led_set_phy
  * @hw: pointer to the HW structure
  * @on: true or false
+ * @led_addr: address of led register to use
  * @mode: original val plus bit for set or ignore
+ *
  * Set led's on or off when controlled by the PHY
  *
  **/
@@ -5347,6 +5353,7 @@ i40e_status_code i40e_aq_write_ddp(struct i40e_hw *hw, void *buff,
  * @hw: pointer to the hw struct
  * @buff: command buffer (size in bytes = buff_size)
  * @buff_size: buffer size in bytes
+ * @flags: AdminQ command flags
  * @cmd_details: pointer to command details structure or NULL
  **/
 enum
diff --git a/drivers/net/ethernet/intel/i40e/i40e_dcb_nl.c b/drivers/net/ethernet/intel/i40e/i40e_dcb_nl.c
index a09d723d5ca0..9deae9a35423 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_dcb_nl.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_dcb_nl.c
@@ -23,7 +23,7 @@ static void i40e_get_pfc_delay(struct i40e_hw *hw, u16 *delay)
 
 /**
  * i40e_dcbnl_ieee_getets - retrieve local IEEE ETS configuration
- * @netdev: the corresponding netdev
+ * @dev: the corresponding netdev
  * @ets: structure to hold the ETS information
  *
  * Returns local IEEE ETS configuration
@@ -62,8 +62,8 @@ static int i40e_dcbnl_ieee_getets(struct net_device *dev,
 
 /**
  * i40e_dcbnl_ieee_getpfc - retrieve local IEEE PFC configuration
- * @netdev: the corresponding netdev
- * @ets: structure to hold the PFC information
+ * @dev: the corresponding netdev
+ * @pfc: structure to hold the PFC information
  *
  * Returns local IEEE PFC configuration
  **/
@@ -95,7 +95,7 @@ static int i40e_dcbnl_ieee_getpfc(struct net_device *dev,
 
 /**
  * i40e_dcbnl_getdcbx - retrieve current DCBx capability
- * @netdev: the corresponding netdev
+ * @dev: the corresponding netdev
  *
  * Returns DCBx capability features
  **/
@@ -108,7 +108,8 @@ static u8 i40e_dcbnl_getdcbx(struct net_device *dev)
 
 /**
  * i40e_dcbnl_get_perm_hw_addr - MAC address used by DCBx
- * @netdev: the corresponding netdev
+ * @dev: the corresponding netdev
+ * @perm_addr: buffer to store the MAC address
  *
  * Returns the SAN MAC address used for LLDP exchange
  **/
diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
index 94774a2a511f..56b911a5dd8b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
@@ -12,8 +12,8 @@ static struct dentry *i40e_dbg_root;
 
 /**
  * i40e_dbg_find_vsi - searches for the vsi with the given seid
- * @pf - the PF structure to search for the vsi
- * @seid - seid of the vsi it is searching for
+ * @pf: the PF structure to search for the vsi
+ * @seid: seid of the vsi it is searching for
  **/
 static struct i40e_vsi *i40e_dbg_find_vsi(struct i40e_pf *pf, int seid)
 {
@@ -31,8 +31,8 @@ static struct i40e_vsi *i40e_dbg_find_vsi(struct i40e_pf *pf, int seid)
 
 /**
  * i40e_dbg_find_veb - searches for the veb with the given seid
- * @pf - the PF structure to search for the veb
- * @seid - seid of the veb it is searching for
+ * @pf: the PF structure to search for the veb
+ * @seid: seid of the veb it is searching for
  **/
 static struct i40e_veb *i40e_dbg_find_veb(struct i40e_pf *pf, int seid)
 {
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 94dbe2c419ca..c1bbfb913e49 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -1055,6 +1055,9 @@ static int i40e_nway_reset(struct net_device *netdev)
 
 /**
  * i40e_get_pauseparam -  Get Flow Control status
+ * @netdev: netdevice structure
+ * @pause: buffer to return pause parameters
+ *
  * Return tx/rx-pause status
  **/
 static void i40e_get_pauseparam(struct net_device *netdev,
@@ -2526,7 +2529,7 @@ static int i40e_get_rss_hash_opts(struct i40e_pf *pf, struct ethtool_rxnfc *cmd)
 /**
  * i40e_check_mask - Check whether a mask field is set
  * @mask: the full mask value
- * @field; mask of the field to check
+ * @field: mask of the field to check
  *
  * If the given mask is fully set, return positive value. If the mask for the
  * field is fully unset, return zero. Otherwise return a negative error code.
@@ -2597,6 +2600,7 @@ static int i40e_parse_rx_flow_user_data(struct ethtool_rx_flow_spec *fsp,
 /**
  * i40e_fill_rx_flow_user_data - Fill in user-defined data field
  * @fsp: pointer to rx_flow specification
+ * @data: pointer to return userdef data
  *
  * Reads the userdef data structure and properly fills in the user defined
  * fields of the rx_flow_spec.
@@ -2775,6 +2779,7 @@ no_input_set:
  * i40e_get_rxnfc - command to get RX flow classification rules
  * @netdev: network interface device structure
  * @cmd: ethtool rxnfc command
+ * @rule_locs: pointer to store rule data
  *
  * Returns Success if the command is supported.
  **/
@@ -2816,7 +2821,7 @@ static int i40e_get_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd,
 /**
  * i40e_get_rss_hash_bits - Read RSS Hash bits from register
  * @nfc: pointer to user request
- * @i_setc bits currently set
+ * @i_setc: bits currently set
  *
  * Returns value of bits to be set per user request
  **/
@@ -2861,7 +2866,7 @@ static u64 i40e_get_rss_hash_bits(struct ethtool_rxnfc *nfc, u64 i_setc)
 /**
  * i40e_set_rss_hash_opt - Enable/Disable flow types for RSS hash
  * @pf: pointer to the physical function struct
- * @cmd: ethtool rxnfc command
+ * @nfc: ethtool rxnfc command
  *
  * Returns Success if the flow input set is supported.
  **/
@@ -3260,7 +3265,7 @@ static int i40e_add_flex_offset(struct list_head *flex_pit_list,
  * __i40e_reprogram_flex_pit - Re-program specific FLX_PIT table
  * @pf: Pointer to the PF structure
  * @flex_pit_list: list of flexible src offsets in use
- * #flex_pit_start: index to first entry for this section of the table
+ * @flex_pit_start: index to first entry for this section of the table
  *
  * In order to handle flexible data, the hardware uses a table of values
  * called the FLX_PIT table. This table is used to indicate which sections of
@@ -3374,7 +3379,7 @@ static void i40e_reprogram_flex_pit(struct i40e_pf *pf)
 
 /**
  * i40e_flow_str - Converts a flow_type into a human readable string
- * @flow_type: the flow type from a flow specification
+ * @fsp: the flow specification
  *
  * Currently only flow types we support are included here, and the string
  * value attempts to match what ethtool would use to configure this flow type.
@@ -4079,7 +4084,7 @@ static unsigned int i40e_max_channels(struct i40e_vsi *vsi)
 
 /**
  * i40e_get_channels - Get the current channels enabled and max supported etc.
- * @netdev: network interface device structure
+ * @dev: network interface device structure
  * @ch: ethtool channels structure
  *
  * We don't support separate tx and rx queues as channels. The other count
@@ -4088,7 +4093,7 @@ static unsigned int i40e_max_channels(struct i40e_vsi *vsi)
  * q_vectors since we support a lot more queue pairs than q_vectors.
  **/
 static void i40e_get_channels(struct net_device *dev,
-			       struct ethtool_channels *ch)
+			      struct ethtool_channels *ch)
 {
 	struct i40e_netdev_priv *np = netdev_priv(dev);
 	struct i40e_vsi *vsi = np->vsi;
@@ -4107,14 +4112,14 @@ static void i40e_get_channels(struct net_device *dev,
 
 /**
  * i40e_set_channels - Set the new channels count.
- * @netdev: network interface device structure
+ * @dev: network interface device structure
  * @ch: ethtool channels structure
  *
  * The new channels count may not be the same as requested by the user
  * since it gets rounded down to a power of 2 value.
  **/
 static int i40e_set_channels(struct net_device *dev,
-			      struct ethtool_channels *ch)
+			     struct ethtool_channels *ch)
 {
 	const u8 drop = I40E_FILTER_PROGRAM_DESC_DEST_DROP_PACKET;
 	struct i40e_netdev_priv *np = netdev_priv(dev);
@@ -4249,6 +4254,7 @@ out:
  * @netdev: network interface device structure
  * @indir: indirection table
  * @key: hash key
+ * @hfunc: hash function to use
  *
  * Returns -EINVAL if the table specifies an invalid queue id, otherwise
  * returns 0 after programming the table.
diff --git a/drivers/net/ethernet/intel/i40e/i40e_hmc.c b/drivers/net/ethernet/intel/i40e/i40e_hmc.c
index e40f3b59e42b..19ce93d7fd0a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_hmc.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_hmc.c
@@ -174,7 +174,6 @@ exit:
  * @hw: pointer to our HW structure
  * @hmc_info: pointer to the HMC configuration information structure
  * @idx: the page index
- * @is_pf: distinguishes a VF from a PF
  *
  * This function:
  *	1. Marks the entry in pd tabe (for paged address mode) or in sd table
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 41bad112a907..ad01bfc5ec80 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -254,8 +254,8 @@ static int i40e_put_lump(struct i40e_lump_tracking *pile, u16 index, u16 id)
 
 /**
  * i40e_find_vsi_from_id - searches for the vsi with the given id
- * @pf - the pf structure to search for the vsi
- * @id - id of the vsi it is searching for
+ * @pf: the pf structure to search for the vsi
+ * @id: id of the vsi it is searching for
  **/
 struct i40e_vsi *i40e_find_vsi_from_id(struct i40e_pf *pf, u16 id)
 {
@@ -411,6 +411,7 @@ static void i40e_get_netdev_stats_struct_tx(struct i40e_ring *ring,
 /**
  * i40e_get_netdev_stats_struct - Get statistics for netdev interface
  * @netdev: network interface device structure
+ * @stats: data structure to store statistics
  *
  * Returns the address of the device statistics structure.
  * The statistics are actually updated from the service task.
@@ -2003,7 +2004,7 @@ struct i40e_new_mac_filter *i40e_next_filter(struct i40e_new_mac_filter *next)
  * from firmware
  * @count: Number of filters added
  * @add_list: return data from fw
- * @head: pointer to first filter in current batch
+ * @add_head: pointer to first filter in current batch
  *
  * MAC filter entries from list were slated to be added to device. Returns
  * number of successful filters. Note that 0 does NOT mean success!
@@ -2110,6 +2111,7 @@ void i40e_aqc_add_filters(struct i40e_vsi *vsi, const char *vsi_name,
 /**
  * i40e_aqc_broadcast_filter - Set promiscuous broadcast flags
  * @vsi: pointer to the VSI
+ * @vsi_name: the VSI name
  * @f: filter data
  *
  * This function sets or clears the promiscuous broadcast flags for VLAN
@@ -2816,6 +2818,7 @@ void i40e_vsi_kill_vlan(struct i40e_vsi *vsi, u16 vid)
 /**
  * i40e_vlan_rx_add_vid - Add a vlan id filter to HW offload
  * @netdev: network interface to be adjusted
+ * @proto: unused protocol value
  * @vid: vlan id to be added
  *
  * net_device_ops implementation for adding vlan ids
@@ -2840,6 +2843,7 @@ static int i40e_vlan_rx_add_vid(struct net_device *netdev,
 /**
  * i40e_vlan_rx_kill_vid - Remove a vlan id filter from HW offload
  * @netdev: network interface to be adjusted
+ * @proto: unused protocol value
  * @vid: vlan id to be removed
  *
  * net_device_ops implementation for removing vlan ids
@@ -3461,7 +3465,7 @@ static void i40e_vsi_configure_msix(struct i40e_vsi *vsi)
 
 /**
  * i40e_enable_misc_int_causes - enable the non-queue interrupts
- * @hw: ptr to the hardware info
+ * @pf: pointer to private device data structure
  **/
 static void i40e_enable_misc_int_causes(struct i40e_pf *pf)
 {
@@ -5072,7 +5076,7 @@ static int i40e_vsi_get_bw_info(struct i40e_vsi *vsi)
  * i40e_vsi_configure_bw_alloc - Configure VSI BW allocation per TC
  * @vsi: the VSI being configured
  * @enabled_tc: TC bitmap
- * @bw_credits: BW shared credits per TC
+ * @bw_share: BW shared credits per TC
  *
  * Returns 0 on success, negative value on failure
  **/
@@ -6329,6 +6333,7 @@ out:
 /**
  * i40e_print_link_message - print link up or down
  * @vsi: the VSI for which link needs a message
+ * @isup: true of link is up, false otherwise
  */
 void i40e_print_link_message(struct i40e_vsi *vsi, bool isup)
 {
@@ -9980,7 +9985,7 @@ unlock_pf:
 
 /**
  * i40e_vsi_free_arrays - Free queue and vector pointer arrays for the VSI
- * @type: VSI pointer
+ * @vsi: VSI pointer
  * @free_qvectors: a bool to specify if q_vectors need to be freed.
  *
  * On error: returns error code (negative)
@@ -10776,7 +10781,7 @@ int i40e_config_rss(struct i40e_vsi *vsi, u8 *seed, u8 *lut, u16 lut_size)
  * @vsi: Pointer to VSI structure
  * @seed: Buffer to store the keys
  * @lut: Buffer to store the lookup table entries
- * lut_size: Size of buffer to store the lookup table entries
+ * @lut_size: Size of buffer to store the lookup table entries
  *
  * Returns 0 on success, negative on failure
  */
@@ -11476,6 +11481,7 @@ static int i40e_get_phys_port_id(struct net_device *netdev,
  * @tb: pointer to array of nladdr (unused)
  * @dev: the net device pointer
  * @addr: the MAC address entry being added
+ * @vid: VLAN ID
  * @flags: instructions from stack about fdb operation
  */
 static int i40e_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
@@ -11521,6 +11527,7 @@ static int i40e_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
  * i40e_ndo_bridge_setlink - Set the hardware bridge mode
  * @dev: the netdev being configured
  * @nlh: RTNL message
+ * @flags: bridge flags
  *
  * Inserts a new hardware bridge if not already created and
  * enables the bridging mode requested (VEB or VEPA). If the
@@ -14094,6 +14101,7 @@ static void i40e_remove(struct pci_dev *pdev)
 /**
  * i40e_pci_error_detected - warning that something funky happened in PCI land
  * @pdev: PCI device information struct
+ * @error: the type of PCI error
  *
  * Called to warn that something happened and the error handling steps
  * are in progress.  Allows the driver to quiesce things, be ready for
diff --git a/drivers/net/ethernet/intel/i40e/i40e_nvm.c b/drivers/net/ethernet/intel/i40e/i40e_nvm.c
index 51554df9cb9a..0299e5bbb902 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_nvm.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_nvm.c
@@ -1149,6 +1149,7 @@ void i40e_nvmupd_clear_wait_state(struct i40e_hw *hw)
  * i40e_nvmupd_check_wait_event - handle NVM update operation events
  * @hw: pointer to the hardware structure
  * @opcode: the event that just happened
+ * @desc: AdminQ descriptor
  **/
 void i40e_nvmupd_check_wait_event(struct i40e_hw *hw, u16 opcode,
 				  struct i40e_aq_desc *desc)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ptp.c b/drivers/net/ethernet/intel/i40e/i40e_ptp.c
index 47d10ad39c18..43d7c44d6d9f 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ptp.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ptp.c
@@ -483,7 +483,7 @@ void i40e_ptp_set_increment(struct i40e_pf *pf)
 /**
  * i40e_ptp_get_ts_config - ioctl interface to read the HW timestamping
  * @pf: Board private structure
- * @ifreq: ioctl data
+ * @ifr: ioctl data
  *
  * Obtain the current hardware timestamping settigs as requested. To do this,
  * keep a shadow copy of the timestamp settings rather than attempting to
@@ -627,7 +627,7 @@ static int i40e_ptp_set_timestamp_mode(struct i40e_pf *pf,
 /**
  * i40e_ptp_set_ts_config - ioctl interface to control the HW timestamping
  * @pf: Board private structure
- * @ifreq: ioctl data
+ * @ifr: ioctl data
  *
  * Respond to the user filter requests and make the appropriate hardware
  * changes here. The XL710 cannot support splitting of the Tx/Rx timestamping
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 6959897c789e..5efa68de935b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -471,7 +471,7 @@ static int i40e_add_del_fdir_ipv4(struct i40e_vsi *vsi,
 /**
  * i40e_add_del_fdir - Build raw packets to add/del fdir filter
  * @vsi: pointer to the targeted VSI
- * @cmd: command to get or set RX flow classification rules
+ * @input: filter to add or delete
  * @add: true adds a filter, false removes it
  *
  **/
@@ -689,7 +689,7 @@ void i40e_free_tx_resources(struct i40e_ring *tx_ring)
 
 /**
  * i40e_get_tx_pending - how many tx descriptors not processed
- * @tx_ring: the ring of descriptors
+ * @ring: the ring of descriptors
  * @in_sw: use SW variables
  *
  * Since there is no access to the ring head register
@@ -1771,6 +1771,8 @@ static inline int i40e_ptype_to_htype(u8 ptype)
  * i40e_rx_hash - set the hash value in the skb
  * @ring: descriptor ring
  * @rx_desc: specific descriptor
+ * @skb: skb currently being received and modified
+ * @rx_ptype: Rx packet type
  **/
 static inline void i40e_rx_hash(struct i40e_ring *ring,
 				union i40e_rx_desc *rx_desc,
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index 3e7ab9fbeb0c..2ceea63cc6cf 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -8,8 +8,8 @@
 /**
  * i40e_vc_vf_broadcast
  * @pf: pointer to the PF structure
- * @opcode: operation code
- * @retval: return value
+ * @v_opcode: operation code
+ * @v_retval: return value
  * @msg: pointer to the msg buffer
  * @msglen: msg length
  *
@@ -1639,6 +1639,7 @@ static int i40e_vc_send_resp_to_vf(struct i40e_vf *vf,
 /**
  * i40e_vc_get_version_msg
  * @vf: pointer to the VF info
+ * @msg: pointer to the msg buffer
  *
  * called from the VF to request the API version used by the PF
  **/
@@ -1682,7 +1683,6 @@ static void i40e_del_qch(struct i40e_vf *vf)
  * i40e_vc_get_vf_resources_msg
  * @vf: pointer to the VF info
  * @msg: pointer to the msg buffer
- * @msglen: msg length
  *
  * called from the VF to request its resources
  **/
@@ -1806,8 +1806,6 @@ err:
 /**
  * i40e_vc_reset_vf_msg
  * @vf: pointer to the VF info
- * @msg: pointer to the msg buffer
- * @msglen: msg length
  *
  * called from the VF to reset itself,
  * unlike other virtchnl messages, PF driver
@@ -3532,15 +3530,16 @@ err:
  * i40e_vc_process_vf_msg
  * @pf: pointer to the PF structure
  * @vf_id: source VF id
+ * @v_opcode: operation code
+ * @v_retval: unused return value code
  * @msg: pointer to the msg buffer
  * @msglen: msg length
- * @msghndl: msg handle
  *
  * called from the common aeq/arq handler to
  * process request from VF
  **/
 int i40e_vc_process_vf_msg(struct i40e_pf *pf, s16 vf_id, u32 v_opcode,
-			   u32 v_retval, u8 *msg, u16 msglen)
+			   u32 __always_unused v_retval, u8 *msg, u16 msglen)
 {
 	struct i40e_hw *hw = &pf->hw;
 	int local_vf_id = vf_id - (s16)hw->func_caps.vf_base_id;
@@ -3991,7 +3990,8 @@ error_pvid:
  * i40e_ndo_set_vf_bw
  * @netdev: network interface device structure
  * @vf_id: VF identifier
- * @tx_rate: Tx rate
+ * @min_tx_rate: Minimum Tx rate
+ * @max_tx_rate: Maximum Tx rate
  *
  * configure VF Tx rate
  **/
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_common.c b/drivers/net/ethernet/intel/i40evf/i40e_common.c
index 0841b2d1ca6d..9cef54971312 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_common.c
@@ -1231,6 +1231,7 @@ i40e_status_code i40evf_aq_write_ddp(struct i40e_hw *hw, void *buff,
  * @hw: pointer to the hw struct
  * @buff: command buffer (size in bytes = buff_size)
  * @buff_size: buffer size in bytes
+ * @flags: AdminQ command flags
  * @cmd_details: pointer to command details structure or NULL
  **/
 enum
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
index 87101f565be1..a9730711e257 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
@@ -105,7 +105,7 @@ void i40evf_free_tx_resources(struct i40e_ring *tx_ring)
 
 /**
  * i40evf_get_tx_pending - how many Tx descriptors not processed
- * @tx_ring: the ring of descriptors
+ * @ring: the ring of descriptors
  * @in_sw: is tx_pending being checked in SW or HW
  *
  * Since there is no access to the ring head register
@@ -1046,6 +1046,8 @@ static inline int i40e_ptype_to_htype(u8 ptype)
  * i40e_rx_hash - set the hash value in the skb
  * @ring: descriptor ring
  * @rx_desc: specific descriptor
+ * @skb: skb currently being received and modified
+ * @rx_ptype: Rx packet type
  **/
 static inline void i40e_rx_hash(struct i40e_ring *ring,
 				union i40e_rx_desc *rx_desc,
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_client.c b/drivers/net/ethernet/intel/i40evf/i40evf_client.c
index a30d093a52d6..3cc9d60d0d72 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_client.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_client.c
@@ -178,7 +178,6 @@ void i40evf_notify_client_close(struct i40e_vsi *vsi, bool reset)
 /**
  * i40evf_client_add_instance - add a client instance to the instance list
  * @adapter: pointer to the board struct
- * @client: pointer to a client struct in the client list.
  *
  * Returns cinst ptr on success, NULL on failure
  **/
@@ -236,7 +235,6 @@ out:
 /**
  * i40evf_client_del_instance - removes a client instance from the list
  * @adapter: pointer to the board struct
- * @client: pointer to the client struct
  *
  **/
 static
@@ -440,7 +438,7 @@ static u32 i40evf_client_virtchnl_send(struct i40e_info *ldev,
  * i40evf_client_setup_qvlist - send a message to the PF to setup iwarp qv map
  * @ldev: pointer to L2 context.
  * @client: Client pointer.
- * @qv_info: queue and vector list
+ * @qvlist_info: queue and vector list
  *
  * Return 0 on success or < 0 on error
  **/
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c b/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c
index 404595efea78..69efe0aec76a 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c
@@ -202,7 +202,7 @@ static void i40evf_get_strings(struct net_device *netdev, u32 sset, u8 *data)
 
 /**
  * i40evf_get_priv_flags - report device private flags
- * @dev: network interface device structure
+ * @netdev: network interface device structure
  *
  * The get string set count and the string set should be matched for each
  * flag returned.  Add new strings for each flag to the i40e_gstrings_priv_flags
@@ -229,7 +229,7 @@ static u32 i40evf_get_priv_flags(struct net_device *netdev)
 
 /**
  * i40evf_set_priv_flags - set private flags
- * @dev: network interface device structure
+ * @netdev: network interface device structure
  * @flags: bit flags to be set
  **/
 static int i40evf_set_priv_flags(struct net_device *netdev, u32 flags)
@@ -603,6 +603,7 @@ static int i40evf_set_per_queue_coalesce(struct net_device *netdev,
  * i40evf_get_rxnfc - command to get RX flow classification rules
  * @netdev: network interface device structure
  * @cmd: ethtool rxnfc command
+ * @rule_locs: pointer to store rule locations
  *
  * Returns Success if the command is supported.
  **/
@@ -722,6 +723,7 @@ static u32 i40evf_get_rxfh_indir_size(struct net_device *netdev)
  * @netdev: network interface device structure
  * @indir: indirection table
  * @key: hash key
+ * @hfunc: hash function in use
  *
  * Reads the indirection table directly from the hardware. Always returns 0.
  **/
@@ -750,6 +752,7 @@ static int i40evf_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
  * @netdev: network interface device structure
  * @indir: indirection table
  * @key: hash key
+ * @hfunc: hash function to use
  *
  * Returns -EINVAL if the table specifies an inavlid queue id, otherwise
  * returns 0 after programming the table.
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
index 8f775a82e2fa..28a8cc4a14cb 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
@@ -449,6 +449,7 @@ static void i40evf_irq_affinity_release(struct kref *ref) {}
 /**
  * i40evf_request_traffic_irqs - Initialize MSI-X interrupts
  * @adapter: board private structure
+ * @basename: device basename
  *
  * Allocates MSI-X vectors for tx and rx handling, and requests
  * interrupts from the kernel.
@@ -721,6 +722,7 @@ static void i40evf_del_vlan(struct i40evf_adapter *adapter, u16 vlan)
 /**
  * i40evf_vlan_rx_add_vid - Add a VLAN filter to a device
  * @netdev: network device struct
+ * @proto: unused protocol data
  * @vid: VLAN tag
  **/
 static int i40evf_vlan_rx_add_vid(struct net_device *netdev,
@@ -738,6 +740,7 @@ static int i40evf_vlan_rx_add_vid(struct net_device *netdev,
 /**
  * i40evf_vlan_rx_kill_vid - Remove a VLAN filter from a device
  * @netdev: network device struct
+ * @proto: unused protocol data
  * @vid: VLAN tag
  **/
 static int i40evf_vlan_rx_kill_vid(struct net_device *netdev,
@@ -3136,7 +3139,7 @@ static int i40evf_set_features(struct net_device *netdev,
 /**
  * i40evf_features_check - Validate encapsulated packet conforms to limits
  * @skb: skb buff
- * @netdev: This physical port's netdev
+ * @dev: This physical port's netdev
  * @features: Offload features that the stack believes apply
  **/
 static netdev_features_t i40evf_features_check(struct sk_buff *skb,
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c b/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c
index 8bcefb7f7ac0..565677de5ba3 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c
@@ -155,8 +155,7 @@ int i40evf_send_vf_config_msg(struct i40evf_adapter *adapter)
 
 /**
  * i40evf_get_vf_config
- * @hw: pointer to the hardware structure
- * @len: length of buffer
+ * @adapter: private adapter structure
  *
  * Get VF configuration from PF and populate hw structure. Must be called after
  * admin queue is initialized. Busy waits until response is received from PF,
@@ -399,8 +398,6 @@ int i40evf_request_queues(struct i40evf_adapter *adapter, int num)
 /**
  * i40evf_add_ether_addrs
  * @adapter: adapter structure
- * @addrs: the MAC address filters to add (contiguous)
- * @count: number of filters
  *
  * Request that the PF add one or more addresses to our filters.
  **/
@@ -473,8 +470,6 @@ void i40evf_add_ether_addrs(struct i40evf_adapter *adapter)
 /**
  * i40evf_del_ether_addrs
  * @adapter: adapter structure
- * @addrs: the MAC address filters to remove (contiguous)
- * @count: number of filtes
  *
  * Request that the PF remove one or more addresses from our filters.
  **/
@@ -547,8 +542,6 @@ void i40evf_del_ether_addrs(struct i40evf_adapter *adapter)
 /**
  * i40evf_add_vlans
  * @adapter: adapter structure
- * @vlans: the VLANs to add
- * @count: number of VLANs
  *
  * Request that the PF add one or more VLAN filters to our VSI.
  **/
@@ -619,8 +612,6 @@ void i40evf_add_vlans(struct i40evf_adapter *adapter)
 /**
  * i40evf_del_vlans
  * @adapter: adapter structure
- * @vlans: the VLANs to remove
- * @count: number of VLANs
  *
  * Request that the PF remove one or more VLAN filters from our VSI.
  **/
-- 
cgit v1.2.3-59-g8ed1b


From 6334f2432fdf5d5252850ff2eebbaa00232f6ab9 Mon Sep 17 00:00:00 2001
From: Mariusz Stachura <mariusz.stachura@intel.com>
Date: Fri, 20 Apr 2018 01:41:34 -0700
Subject: i40e: fix reading LLDP configuration

Previous method for reading LLDP config was based on hard-coded
offsets. It happened to work, because of structured architecture of
the NVM memory. In the new approach, known as FLAT, we need to
calculate the absolute address, instead of using relative values.
Needed defines for memory location were added.

Signed-off-by: Mariusz Stachura <mariusz.stachura@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_dcb.c    | 91 ++++++++++++++++++++++++---
 drivers/net/ethernet/intel/i40e/i40e_type.h   |  8 ++-
 drivers/net/ethernet/intel/i40evf/i40e_type.h | 10 ++-
 3 files changed, 99 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_dcb.c b/drivers/net/ethernet/intel/i40e/i40e_dcb.c
index 69e7d4967b1c..56bff8faf371 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_dcb.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_dcb.c
@@ -920,6 +920,70 @@ i40e_status i40e_init_dcb(struct i40e_hw *hw)
 	return ret;
 }
 
+/**
+ * _i40e_read_lldp_cfg - generic read of LLDP Configuration data from NVM
+ * @hw: pointer to the HW structure
+ * @lldp_cfg: pointer to hold lldp configuration variables
+ * @module: address of the module pointer
+ * @word_offset: offset of LLDP configuration
+ *
+ * Reads the LLDP configuration data from NVM using passed addresses
+ **/
+static i40e_status _i40e_read_lldp_cfg(struct i40e_hw *hw,
+				       struct i40e_lldp_variables *lldp_cfg,
+				       u8 module, u32 word_offset)
+{
+	u32 address, offset = (2 * word_offset);
+	i40e_status ret;
+	__le16 raw_mem;
+	u16 mem;
+
+	ret = i40e_acquire_nvm(hw, I40E_RESOURCE_READ);
+	if (ret)
+		return ret;
+
+	ret = i40e_aq_read_nvm(hw, 0x0, module * 2, sizeof(raw_mem), &raw_mem,
+			       true, NULL);
+	i40e_release_nvm(hw);
+	if (ret)
+		return ret;
+
+	mem = le16_to_cpu(raw_mem);
+	/* Check if this pointer needs to be read in word size or 4K sector
+	 * units.
+	 */
+	if (mem & I40E_PTR_TYPE)
+		address = (0x7FFF & mem) * 4096;
+	else
+		address = (0x7FFF & mem) * 2;
+
+	ret = i40e_acquire_nvm(hw, I40E_RESOURCE_READ);
+	if (ret)
+		goto err_lldp_cfg;
+
+	ret = i40e_aq_read_nvm(hw, module, offset, sizeof(raw_mem), &raw_mem,
+			       true, NULL);
+	i40e_release_nvm(hw);
+	if (ret)
+		return ret;
+
+	mem = le16_to_cpu(raw_mem);
+	offset = mem + word_offset;
+	offset *= 2;
+
+	ret = i40e_acquire_nvm(hw, I40E_RESOURCE_READ);
+	if (ret)
+		goto err_lldp_cfg;
+
+	ret = i40e_aq_read_nvm(hw, 0, address + offset,
+			       sizeof(struct i40e_lldp_variables), lldp_cfg,
+			       true, NULL);
+	i40e_release_nvm(hw);
+
+err_lldp_cfg:
+	return ret;
+}
+
 /**
  * i40e_read_lldp_cfg - read LLDP Configuration data from NVM
  * @hw: pointer to the HW structure
@@ -931,21 +995,34 @@ i40e_status i40e_read_lldp_cfg(struct i40e_hw *hw,
 			       struct i40e_lldp_variables *lldp_cfg)
 {
 	i40e_status ret = 0;
-	u32 offset = (2 * I40E_NVM_LLDP_CFG_PTR);
+	u32 mem;
 
 	if (!lldp_cfg)
 		return I40E_ERR_PARAM;
 
 	ret = i40e_acquire_nvm(hw, I40E_RESOURCE_READ);
 	if (ret)
-		goto err_lldp_cfg;
+		return ret;
 
-	ret = i40e_aq_read_nvm(hw, I40E_SR_EMP_MODULE_PTR, offset,
-			       sizeof(struct i40e_lldp_variables),
-			       (u8 *)lldp_cfg,
-			       true, NULL);
+	ret = i40e_aq_read_nvm(hw, I40E_SR_NVM_CONTROL_WORD, 0, sizeof(mem),
+			       &mem, true, NULL);
 	i40e_release_nvm(hw);
+	if (ret)
+		return ret;
+
+	/* Read a bit that holds information whether we are running flat or
+	 * structured NVM image. Flat image has LLDP configuration in shadow
+	 * ram, so there is a need to pass different addresses for both cases.
+	 */
+	if (mem & I40E_SR_NVM_MAP_STRUCTURE_TYPE) {
+		/* Flat NVM case */
+		ret = _i40e_read_lldp_cfg(hw, lldp_cfg, I40E_SR_EMP_MODULE_PTR,
+					  I40E_SR_LLDP_CFG_PTR);
+	} else {
+		/* Good old structured NVM image */
+		ret = _i40e_read_lldp_cfg(hw, lldp_cfg, I40E_EMP_MODULE_PTR,
+					  I40E_NVM_LLDP_CFG_PTR);
+	}
 
-err_lldp_cfg:
 	return ret;
 }
diff --git a/drivers/net/ethernet/intel/i40e/i40e_type.h b/drivers/net/ethernet/intel/i40e/i40e_type.h
index 40968a4216a7..7df969c59855 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_type.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_type.h
@@ -1294,7 +1294,8 @@ struct i40e_hw_port_stats {
 
 /* Checksum and Shadow RAM pointers */
 #define I40E_SR_NVM_CONTROL_WORD		0x00
-#define I40E_SR_EMP_MODULE_PTR			0x0F
+#define I40E_EMP_MODULE_PTR			0x0F
+#define I40E_SR_EMP_MODULE_PTR			0x48
 #define I40E_SR_PBA_FLAGS			0x15
 #define I40E_SR_PBA_BLOCK_PTR			0x16
 #define I40E_SR_BOOT_CONFIG_PTR			0x17
@@ -1313,6 +1314,8 @@ struct i40e_hw_port_stats {
 #define I40E_SR_PCIE_ALT_MODULE_MAX_SIZE	1024
 #define I40E_SR_CONTROL_WORD_1_SHIFT		0x06
 #define I40E_SR_CONTROL_WORD_1_MASK	(0x03 << I40E_SR_CONTROL_WORD_1_SHIFT)
+#define I40E_SR_CONTROL_WORD_1_NVM_BANK_VALID	BIT(5)
+#define I40E_SR_NVM_MAP_STRUCTURE_TYPE		BIT(12)
 #define I40E_PTR_TYPE				BIT(15)
 #define I40E_SR_OCP_CFG_WORD0			0x2B
 #define I40E_SR_OCP_ENABLED			BIT(15)
@@ -1430,7 +1433,8 @@ enum i40e_reset_type {
 };
 
 /* IEEE 802.1AB LLDP Agent Variables from NVM */
-#define I40E_NVM_LLDP_CFG_PTR		0xD
+#define I40E_NVM_LLDP_CFG_PTR	0x06
+#define I40E_SR_LLDP_CFG_PTR	0x31
 struct i40e_lldp_variables {
 	u16 length;
 	u16 adminstatus;
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_type.h b/drivers/net/ethernet/intel/i40evf/i40e_type.h
index c77cb9f672d8..094387db3c11 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_type.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_type.h
@@ -1233,7 +1233,8 @@ struct i40e_hw_port_stats {
 
 /* Checksum and Shadow RAM pointers */
 #define I40E_SR_NVM_CONTROL_WORD		0x00
-#define I40E_SR_EMP_MODULE_PTR			0x0F
+#define I40E_EMP_MODULE_PTR			0x0F
+#define I40E_SR_EMP_MODULE_PTR			0x48
 #define I40E_NVM_OEM_VER_OFF			0x83
 #define I40E_SR_NVM_DEV_STARTER_VERSION		0x18
 #define I40E_SR_NVM_WAKE_ON_LAN			0x19
@@ -1249,6 +1250,9 @@ struct i40e_hw_port_stats {
 #define I40E_SR_PCIE_ALT_MODULE_MAX_SIZE	1024
 #define I40E_SR_CONTROL_WORD_1_SHIFT		0x06
 #define I40E_SR_CONTROL_WORD_1_MASK	(0x03 << I40E_SR_CONTROL_WORD_1_SHIFT)
+#define I40E_SR_CONTROL_WORD_1_NVM_BANK_VALID	BIT(5)
+#define I40E_SR_NVM_MAP_STRUCTURE_TYPE		BIT(12)
+#define I40E_PTR_TYPE				BIT(15)
 
 /* Shadow RAM related */
 #define I40E_SR_SECTOR_SIZE_IN_WORDS	0x800
@@ -1362,6 +1366,10 @@ enum i40e_reset_type {
 	I40E_RESET_EMPR		= 3,
 };
 
+/* IEEE 802.1AB LLDP Agent Variables from NVM */
+#define I40E_NVM_LLDP_CFG_PTR	0x06
+#define I40E_SR_LLDP_CFG_PTR	0x31
+
 /* RSS Hash Table Size */
 #define I40E_PFQF_CTL_0_HASHLUTSIZE_512	0x00010000
 
-- 
cgit v1.2.3-59-g8ed1b


From 6df93462c2e3ed9656829078aa68214c64619042 Mon Sep 17 00:00:00 2001
From: Ahmed Abdelsalam <amsalam20@gmail.com>
Date: Sat, 28 Apr 2018 12:18:35 +0200
Subject: ipv6: sr: extract the right key values for "seg6_make_flowlabel"

The seg6_make_flowlabel() is used by seg6_do_srh_encap() to compute the
flowlabel from a given skb. It relies on skb_get_hash() which eventually
calls __skb_flow_dissect() to extract the flow_keys struct values from
the skb.

In case of IPv4 traffic, calling seg6_make_flowlabel() after skb_push(),
skb_reset_network_header(), and skb_mac_header_rebuild() will results in
flow_keys struct of all key values set to zero.

This patch calls seg6_make_flowlabel() before resetting the headers of skb
to get the right key values.

Extracted Key values are based on the type inner packet as follows:
1) IPv6 traffic: src_IP, dst_IP, L4 proto, and flowlabel of inner packet.
2) IPv4 traffic: src_IP, dst_IP, L4 proto, src_port, and dst_port
3) L2 traffic: depends on what kind of traffic carried into the L2
frame. IPv6 and IPv4 traffic works as discussed 1) and 2)

Here a hex_dump of struct flow_keys for IPv4 and IPv6 traffic
10.100.1.100: 47302 > 30.0.0.2: 5001
00000000: 14 00 02 00 00 00 00 00 08 00 11 00 00 00 00 00
00000010: 00 00 00 00 00 00 00 00 13 89 b8 c6 1e 00 00 02
00000020: 0a 64 01 64

fc00:a1:a > b2::2
00000000: 28 00 03 00 00 00 00 00 86 dd 11 00 99 f9 02 00
00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 b2 00 00
00000020: 00 00 00 00 00 00 00 00 00 00 00 02 fc 00 00 a1
00000030: 00 00 00 00 00 00 00 00 00 00 00 0a

Signed-off-by: Ahmed Abdelsalam <amsalam20@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/seg6_iptunnel.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index 9898926ce30d..eab39bd91548 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -127,6 +127,7 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
 		return err;
 
 	inner_hdr = ipv6_hdr(skb);
+	flowlabel = seg6_make_flowlabel(net, skb, inner_hdr);
 
 	skb_push(skb, tot_len);
 	skb_reset_network_header(skb);
@@ -138,7 +139,6 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
 	 * decapsulation will overwrite inner hlim with outer hlim
 	 */
 
-	flowlabel = seg6_make_flowlabel(net, skb, inner_hdr);
 	if (skb->protocol == htons(ETH_P_IPV6)) {
 		ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)),
 			     flowlabel);
-- 
cgit v1.2.3-59-g8ed1b


From 6ee4d32255865fc4b383355a8354603d60ab9f8a Mon Sep 17 00:00:00 2001
From: Jakub Pawlak <jakub.pawlak@intel.com>
Date: Fri, 20 Apr 2018 01:41:35 -0700
Subject: i40e: Add advertising 10G LR mode

The advertising 10G LR mode should be possible to set
but in the function i40e_set_link_ksettings() check for this
is missed. This patch adds check for 10000baseLR_Full
flag for 10G modes.

Signed-off-by: Jakub Pawlak <jakub.pawlak@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index c1bbfb913e49..fc6a5eef141c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -953,7 +953,9 @@ static int i40e_set_link_ksettings(struct net_device *netdev,
 	    ethtool_link_ksettings_test_link_mode(ks, advertising,
 						  10000baseCR_Full) ||
 	    ethtool_link_ksettings_test_link_mode(ks, advertising,
-						  10000baseSR_Full))
+						  10000baseSR_Full) ||
+	    ethtool_link_ksettings_test_link_mode(ks, advertising,
+						  10000baseLR_Full))
 		config.link_speed |= I40E_LINK_SPEED_10GB;
 	if (ethtool_link_ksettings_test_link_mode(ks, advertising,
 						  20000baseKR2_Full))
-- 
cgit v1.2.3-59-g8ed1b


From e4062894d5939bbba9fbed5a70a9eaf6fa397b10 Mon Sep 17 00:00:00 2001
From: Paweł Jabłoński <pawel.jablonski@intel.com>
Date: Fri, 20 Apr 2018 01:41:36 -0700
Subject: i40evf: Fix turning TSO, GSO and GRO on after
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch fixes the problem where each MTU change turns TSO,
GSO and GRO on from off state.

Now when TSO, GSO or GRO is turned off, MTU change does not
turn them on.

Signed-off-by: Paweł Jabłoński <pawel.jablonski@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40evf/i40evf_main.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
index 28a8cc4a14cb..3f04a182903d 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
@@ -3357,6 +3357,24 @@ int i40evf_process_config(struct i40evf_adapter *adapter)
 	if (vfres->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_VLAN)
 		netdev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
 
+	/* Do not turn on offloads when they are requested to be turned off.
+	 * TSO needs minimum 576 bytes to work correctly.
+	 */
+	if (netdev->wanted_features) {
+		if (!(netdev->wanted_features & NETIF_F_TSO) ||
+		    netdev->mtu < 576)
+			netdev->features &= ~NETIF_F_TSO;
+		if (!(netdev->wanted_features & NETIF_F_TSO6) ||
+		    netdev->mtu < 576)
+			netdev->features &= ~NETIF_F_TSO6;
+		if (!(netdev->wanted_features & NETIF_F_TSO_ECN))
+			netdev->features &= ~NETIF_F_TSO_ECN;
+		if (!(netdev->wanted_features & NETIF_F_GRO))
+			netdev->features &= ~NETIF_F_GRO;
+		if (!(netdev->wanted_features & NETIF_F_GSO))
+			netdev->features &= ~NETIF_F_GSO;
+	}
+
 	adapter->vsi.id = adapter->vsi_res->vsi_id;
 
 	adapter->vsi.back = adapter;
-- 
cgit v1.2.3-59-g8ed1b


From 5305d0fe2f22ec42b19f5ed205faad9b26955e5c Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Fri, 20 Apr 2018 01:41:37 -0700
Subject: i40e: Fix multiple issues with UDP tunnel offload filter
 configuration

This fixes at least 2 issues I have found with the UDP tunnel filter
configuration.

The first issue is the fact that the tunnels didn't have any sort of mutual
exclusion in place to prevent an update from racing with a user request to
add/remove a port. As such you could request to add and remove a port
before the port update code had a chance to respond which would result in a
very confusing result. To address it I have added 2 changes. First I added
the RTNL mutex wrapper around our updating of the pending, port, and
filter_index bits. Second I added logic so that we cannot use a port that
has a pending deletion since we need to free the space in hardware before
we can allow software to reuse it.

The second issue addressed is the fact that we were not recording the
actual filter index provided to us by the admin queue. As a result we were
deleting filters that were not associated with the actual filter we wanted
to delete. To fix that I added a filter_index member to the UDP port
tracking structure.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h      |  2 +
 drivers/net/ethernet/intel/i40e/i40e_main.c | 66 +++++++++++++++++++++++------
 2 files changed, 56 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index f573108faec3..70d369e9139c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -310,10 +310,12 @@ struct i40e_tc_configuration {
 	struct i40e_tc_info tc_info[I40E_MAX_TRAFFIC_CLASS];
 };
 
+#define I40E_UDP_PORT_INDEX_UNUSED	255
 struct i40e_udp_port_config {
 	/* AdminQ command interface expects port number in Host byte order */
 	u16 port;
 	u8 type;
+	u8 filter_index;
 };
 
 /* macros related to FLX_PIT */
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index ad01bfc5ec80..0babde10fa15 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -9672,9 +9672,9 @@ static void i40e_handle_mdd_event(struct i40e_pf *pf)
 	i40e_flush(hw);
 }
 
-static const char *i40e_tunnel_name(struct i40e_udp_port_config *port)
+static const char *i40e_tunnel_name(u8 type)
 {
-	switch (port->type) {
+	switch (type) {
 	case UDP_TUNNEL_TYPE_VXLAN:
 		return "vxlan";
 	case UDP_TUNNEL_TYPE_GENEVE:
@@ -9708,37 +9708,68 @@ static void i40e_sync_udp_filters(struct i40e_pf *pf)
 static void i40e_sync_udp_filters_subtask(struct i40e_pf *pf)
 {
 	struct i40e_hw *hw = &pf->hw;
-	i40e_status ret;
+	u8 filter_index, type;
 	u16 port;
 	int i;
 
 	if (!test_and_clear_bit(__I40E_UDP_FILTER_SYNC_PENDING, pf->state))
 		return;
 
+	/* acquire RTNL to maintain state of flags and port requests */
+	rtnl_lock();
+
 	for (i = 0; i < I40E_MAX_PF_UDP_OFFLOAD_PORTS; i++) {
 		if (pf->pending_udp_bitmap & BIT_ULL(i)) {
+			struct i40e_udp_port_config *udp_port;
+			i40e_status ret = 0;
+
+			udp_port = &pf->udp_ports[i];
 			pf->pending_udp_bitmap &= ~BIT_ULL(i);
-			port = pf->udp_ports[i].port;
+
+			port = READ_ONCE(udp_port->port);
+			type = READ_ONCE(udp_port->type);
+			filter_index = READ_ONCE(udp_port->filter_index);
+
+			/* release RTNL while we wait on AQ command */
+			rtnl_unlock();
+
 			if (port)
 				ret = i40e_aq_add_udp_tunnel(hw, port,
-							pf->udp_ports[i].type,
-							NULL, NULL);
-			else
-				ret = i40e_aq_del_udp_tunnel(hw, i, NULL);
+							     type,
+							     &filter_index,
+							     NULL);
+			else if (filter_index != I40E_UDP_PORT_INDEX_UNUSED)
+				ret = i40e_aq_del_udp_tunnel(hw, filter_index,
+							     NULL);
+
+			/* reacquire RTNL so we can update filter_index */
+			rtnl_lock();
 
 			if (ret) {
 				dev_info(&pf->pdev->dev,
 					 "%s %s port %d, index %d failed, err %s aq_err %s\n",
-					 i40e_tunnel_name(&pf->udp_ports[i]),
+					 i40e_tunnel_name(type),
 					 port ? "add" : "delete",
-					 port, i,
+					 port,
+					 filter_index,
 					 i40e_stat_str(&pf->hw, ret),
 					 i40e_aq_str(&pf->hw,
 						     pf->hw.aq.asq_last_status));
-				pf->udp_ports[i].port = 0;
+				if (port) {
+					/* failed to add, just reset port,
+					 * drop pending bit for any deletion
+					 */
+					udp_port->port = 0;
+					pf->pending_udp_bitmap &= ~BIT_ULL(i);
+				}
+			} else if (port) {
+				/* record filter index on success */
+				udp_port->filter_index = filter_index;
 			}
 		}
 	}
+
+	rtnl_unlock();
 }
 
 /**
@@ -11355,6 +11386,11 @@ static u8 i40e_get_udp_port_idx(struct i40e_pf *pf, u16 port)
 	u8 i;
 
 	for (i = 0; i < I40E_MAX_PF_UDP_OFFLOAD_PORTS; i++) {
+		/* Do not report ports with pending deletions as
+		 * being available.
+		 */
+		if (!port && (pf->pending_udp_bitmap & BIT_ULL(i)))
+			continue;
 		if (pf->udp_ports[i].port == port)
 			return i;
 	}
@@ -11409,6 +11445,7 @@ static void i40e_udp_tunnel_add(struct net_device *netdev,
 
 	/* New port: add it and mark its index in the bitmap */
 	pf->udp_ports[next_idx].port = port;
+	pf->udp_ports[next_idx].filter_index = I40E_UDP_PORT_INDEX_UNUSED;
 	pf->pending_udp_bitmap |= BIT_ULL(next_idx);
 	set_bit(__I40E_UDP_FILTER_SYNC_PENDING, pf->state);
 }
@@ -11450,7 +11487,12 @@ static void i40e_udp_tunnel_del(struct net_device *netdev,
 	 * and make it pending
 	 */
 	pf->udp_ports[idx].port = 0;
-	pf->pending_udp_bitmap |= BIT_ULL(idx);
+
+	/* Toggle pending bit instead of setting it. This way if we are
+	 * deleting a port that has yet to be added we just clear the pending
+	 * bit and don't have to worry about it.
+	 */
+	pf->pending_udp_bitmap ^= BIT_ULL(idx);
 	set_bit(__I40E_UDP_FILTER_SYNC_PENDING, pf->state);
 
 	return;
-- 
cgit v1.2.3-59-g8ed1b


From 830e0dd9996c4644e42412aa6c46ed8f8eab0cca Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Fri, 20 Apr 2018 01:41:38 -0700
Subject: i40e: avoid overflow in i40e_ptp_adjfreq()

When operating at 1GbE, the base incval for the PTP clock is so large
that multiplying it by numbers close to the max_adj can overflow the
u64.

Rather than attempting to limit the max_adj to a value small enough to
avoid overflow, instead calculate the incvalue adjustment based on the
40GbE incvalue, and then multiply that by the scaling factor for the
link speed.

This sacrifices a small amount of precision in the adjustment but we
avoid erratic behavior of the clock due to the overflow caused if ppb is
very near the maximum adjustment.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h     |  2 +-
 drivers/net/ethernet/intel/i40e/i40e_ptp.c | 41 ++++++++++++++++++++----------
 2 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index 70d369e9139c..1d59ab6ca90f 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -586,7 +586,7 @@ struct i40e_pf {
 	unsigned long ptp_tx_start;
 	struct hwtstamp_config tstamp_config;
 	struct mutex tmreg_lock; /* Used to protect the SYSTIME registers. */
-	u64 ptp_base_adj;
+	u32 ptp_adj_mult;
 	u32 tx_hwtstamp_timeouts;
 	u32 tx_hwtstamp_skipped;
 	u32 rx_hwtstamp_cleared;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ptp.c b/drivers/net/ethernet/intel/i40e/i40e_ptp.c
index 43d7c44d6d9f..aa3daec2049d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ptp.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ptp.c
@@ -16,9 +16,9 @@
  * At 1Gb link, the period is multiplied by 20. (32ns)
  * 1588 functionality is not supported at 100Mbps.
  */
-#define I40E_PTP_40GB_INCVAL 0x0199999999ULL
-#define I40E_PTP_10GB_INCVAL 0x0333333333ULL
-#define I40E_PTP_1GB_INCVAL  0x2000000000ULL
+#define I40E_PTP_40GB_INCVAL		0x0199999999ULL
+#define I40E_PTP_10GB_INCVAL_MULT	2
+#define I40E_PTP_1GB_INCVAL_MULT	20
 
 #define I40E_PRTTSYN_CTL1_TSYNTYPE_V1  BIT(I40E_PRTTSYN_CTL1_TSYNTYPE_SHIFT)
 #define I40E_PRTTSYN_CTL1_TSYNTYPE_V2  (2 << \
@@ -106,17 +106,24 @@ static int i40e_ptp_adjfreq(struct ptp_clock_info *ptp, s32 ppb)
 		ppb = -ppb;
 	}
 
-	smp_mb(); /* Force any pending update before accessing. */
-	adj = READ_ONCE(pf->ptp_base_adj);
-
-	freq = adj;
+	freq = I40E_PTP_40GB_INCVAL;
 	freq *= ppb;
 	diff = div_u64(freq, 1000000000ULL);
 
 	if (neg_adj)
-		adj -= diff;
+		adj = I40E_PTP_40GB_INCVAL - diff;
 	else
-		adj += diff;
+		adj = I40E_PTP_40GB_INCVAL + diff;
+
+	/* At some link speeds, the base incval is so large that directly
+	 * multiplying by ppb would result in arithmetic overflow even when
+	 * using a u64. Avoid this by instead calculating the new incval
+	 * always in terms of the 40GbE clock rate and then multiplying by the
+	 * link speed factor afterwards. This does result in slightly lower
+	 * precision at lower link speeds, but it is fairly minor.
+	 */
+	smp_mb(); /* Force any pending update before accessing. */
+	adj *= READ_ONCE(pf->ptp_adj_mult);
 
 	wr32(hw, I40E_PRTTSYN_INC_L, adj & 0xFFFFFFFF);
 	wr32(hw, I40E_PRTTSYN_INC_H, adj >> 32);
@@ -438,6 +445,7 @@ void i40e_ptp_set_increment(struct i40e_pf *pf)
 	struct i40e_link_status *hw_link_info;
 	struct i40e_hw *hw = &pf->hw;
 	u64 incval;
+	u32 mult;
 
 	hw_link_info = &hw->phy.link_info;
 
@@ -445,10 +453,10 @@ void i40e_ptp_set_increment(struct i40e_pf *pf)
 
 	switch (hw_link_info->link_speed) {
 	case I40E_LINK_SPEED_10GB:
-		incval = I40E_PTP_10GB_INCVAL;
+		mult = I40E_PTP_10GB_INCVAL_MULT;
 		break;
 	case I40E_LINK_SPEED_1GB:
-		incval = I40E_PTP_1GB_INCVAL;
+		mult = I40E_PTP_1GB_INCVAL_MULT;
 		break;
 	case I40E_LINK_SPEED_100MB:
 	{
@@ -459,15 +467,20 @@ void i40e_ptp_set_increment(struct i40e_pf *pf)
 				 "1588 functionality is not supported at 100 Mbps. Stopping the PHC.\n");
 			warn_once++;
 		}
-		incval = 0;
+		mult = 0;
 		break;
 	}
 	case I40E_LINK_SPEED_40GB:
 	default:
-		incval = I40E_PTP_40GB_INCVAL;
+		mult = 1;
 		break;
 	}
 
+	/* The increment value is calculated by taking the base 40GbE incvalue
+	 * and multiplying it by a factor based on the link speed.
+	 */
+	incval = I40E_PTP_40GB_INCVAL * mult;
+
 	/* Write the new increment value into the increment register. The
 	 * hardware will not update the clock until both registers have been
 	 * written.
@@ -476,7 +489,7 @@ void i40e_ptp_set_increment(struct i40e_pf *pf)
 	wr32(hw, I40E_PRTTSYN_INC_H, incval >> 32);
 
 	/* Update the base adjustement value. */
-	WRITE_ONCE(pf->ptp_base_adj, incval);
+	WRITE_ONCE(pf->ptp_adj_mult, mult);
 	smp_mb(); /* Force the above update. */
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From d0fda04d7e31e52f19ad7a21fb8d6700db55a6e9 Mon Sep 17 00:00:00 2001
From: Harshitha Ramamurthy <harshitha.ramamurthy@intel.com>
Date: Fri, 20 Apr 2018 01:41:39 -0700
Subject: i40e/i40evf: take into account queue map from vf when handling queues

The expectation of the ops VIRTCHNL_OP_ENABLE_QUEUES and
VIRTCHNL_OP_DISABLE_QUEUES is that the queue map sent by
the VF is taken into account when enabling/disabling
queues in the VF VSI. This patch makes sure that happens.

By breaking out the individual queue set up functions so
that they can be called directly from the i40e_virtchnl_pf.c
file, only the queues as specified by the queue bit map that
accompanies the enable/disable queues ops will be handled.

Signed-off-by: Harshitha Ramamurthy <harshitha.ramamurthy@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h             |  3 +
 drivers/net/ethernet/intel/i40e/i40e_main.c        | 40 +++++++++----
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 69 +++++++++++++++++++++-
 3 files changed, 99 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index 1d59ab6ca90f..7a80652e2500 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -987,6 +987,9 @@ void i40e_service_event_schedule(struct i40e_pf *pf);
 void i40e_notify_client_of_vf_msg(struct i40e_vsi *vsi, u32 vf_id,
 				  u8 *msg, u16 len);
 
+int i40e_control_wait_tx_q(int seid, struct i40e_pf *pf, int pf_q, bool is_xdp,
+			   bool enable);
+int i40e_control_wait_rx_q(struct i40e_pf *pf, int pf_q, bool enable);
 int i40e_vsi_start_rings(struct i40e_vsi *vsi);
 void i40e_vsi_stop_rings(struct i40e_vsi *vsi);
 void i40e_vsi_stop_rings_no_wait(struct  i40e_vsi *vsi);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 0babde10fa15..b500bbf6c43f 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -4235,8 +4235,8 @@ static void i40e_control_tx_q(struct i40e_pf *pf, int pf_q, bool enable)
  * @is_xdp: true if the queue is used for XDP
  * @enable: start or stop the queue
  **/
-static int i40e_control_wait_tx_q(int seid, struct i40e_pf *pf, int pf_q,
-				  bool is_xdp, bool enable)
+int i40e_control_wait_tx_q(int seid, struct i40e_pf *pf, int pf_q,
+			   bool is_xdp, bool enable)
 {
 	int ret;
 
@@ -4281,7 +4281,6 @@ static int i40e_vsi_control_tx(struct i40e_vsi *vsi, bool enable)
 		if (ret)
 			break;
 	}
-
 	return ret;
 }
 
@@ -4320,9 +4319,9 @@ static int i40e_pf_rxq_wait(struct i40e_pf *pf, int pf_q, bool enable)
  * @pf_q: the PF queue to configure
  * @enable: start or stop the queue
  *
- * This function enables or disables a single queue. Note that any delay
- * required after the operation is expected to be handled by the caller of
- * this function.
+ * This function enables or disables a single queue. Note that
+ * any delay required after the operation is expected to be
+ * handled by the caller of this function.
  **/
 static void i40e_control_rx_q(struct i40e_pf *pf, int pf_q, bool enable)
 {
@@ -4351,6 +4350,30 @@ static void i40e_control_rx_q(struct i40e_pf *pf, int pf_q, bool enable)
 	wr32(hw, I40E_QRX_ENA(pf_q), rx_reg);
 }
 
+/**
+ * i40e_control_wait_rx_q
+ * @pf: the PF structure
+ * @pf_q: queue being configured
+ * @enable: start or stop the rings
+ *
+ * This function enables or disables a single queue along with waiting
+ * for the change to finish. The caller of this function should handle
+ * the delays needed in the case of disabling queues.
+ **/
+int i40e_control_wait_rx_q(struct i40e_pf *pf, int pf_q, bool enable)
+{
+	int ret = 0;
+
+	i40e_control_rx_q(pf, pf_q, enable);
+
+	/* wait for the change to finish */
+	ret = i40e_pf_rxq_wait(pf, pf_q, enable);
+	if (ret)
+		return ret;
+
+	return ret;
+}
+
 /**
  * i40e_vsi_control_rx - Start or stop a VSI's rings
  * @vsi: the VSI being configured
@@ -4363,10 +4386,7 @@ static int i40e_vsi_control_rx(struct i40e_vsi *vsi, bool enable)
 
 	pf_q = vsi->base_queue;
 	for (i = 0; i < vsi->num_queue_pairs; i++, pf_q++) {
-		i40e_control_rx_q(pf, pf_q, enable);
-
-		/* wait for the change to finish */
-		ret = i40e_pf_rxq_wait(pf, pf_q, enable);
+		ret = i40e_control_wait_rx_q(pf, pf_q, enable);
 		if (ret) {
 			dev_info(&pf->pdev->dev,
 				 "VSI seid %d Rx ring %d %sable timeout\n",
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index 2ceea63cc6cf..c6d24eaede18 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -2153,6 +2153,51 @@ error_param:
 				       aq_ret);
 }
 
+/**
+ * i40e_ctrl_vf_tx_rings
+ * @vsi: the SRIOV VSI being configured
+ * @q_map: bit map of the queues to be enabled
+ * @enable: start or stop the queue
+ **/
+static int i40e_ctrl_vf_tx_rings(struct i40e_vsi *vsi, unsigned long q_map,
+				 bool enable)
+{
+	struct i40e_pf *pf = vsi->back;
+	int ret = 0;
+	u16 q_id;
+
+	for_each_set_bit(q_id, &q_map, I40E_MAX_VF_QUEUES) {
+		ret = i40e_control_wait_tx_q(vsi->seid, pf,
+					     vsi->base_queue + q_id,
+					     false /*is xdp*/, enable);
+		if (ret)
+			break;
+	}
+	return ret;
+}
+
+/**
+ * i40e_ctrl_vf_rx_rings
+ * @vsi: the SRIOV VSI being configured
+ * @q_map: bit map of the queues to be enabled
+ * @enable: start or stop the queue
+ **/
+static int i40e_ctrl_vf_rx_rings(struct i40e_vsi *vsi, unsigned long q_map,
+				 bool enable)
+{
+	struct i40e_pf *pf = vsi->back;
+	int ret = 0;
+	u16 q_id;
+
+	for_each_set_bit(q_id, &q_map, I40E_MAX_VF_QUEUES) {
+		ret = i40e_control_wait_rx_q(pf, vsi->base_queue + q_id,
+					     enable);
+		if (ret)
+			break;
+	}
+	return ret;
+}
+
 /**
  * i40e_vc_enable_queues_msg
  * @vf: pointer to the VF info
@@ -2185,8 +2230,17 @@ static int i40e_vc_enable_queues_msg(struct i40e_vf *vf, u8 *msg, u16 msglen)
 		goto error_param;
 	}
 
-	if (i40e_vsi_start_rings(pf->vsi[vf->lan_vsi_idx]))
+	/* Use the queue bit map sent by the VF */
+	if (i40e_ctrl_vf_rx_rings(pf->vsi[vf->lan_vsi_idx], vqs->rx_queues,
+				  true)) {
+		aq_ret = I40E_ERR_TIMEOUT;
+		goto error_param;
+	}
+	if (i40e_ctrl_vf_tx_rings(pf->vsi[vf->lan_vsi_idx], vqs->tx_queues,
+				  true)) {
 		aq_ret = I40E_ERR_TIMEOUT;
+		goto error_param;
+	}
 
 	/* need to start the rings for additional ADq VSI's as well */
 	if (vf->adq_enabled) {
@@ -2234,8 +2288,17 @@ static int i40e_vc_disable_queues_msg(struct i40e_vf *vf, u8 *msg, u16 msglen)
 		goto error_param;
 	}
 
-	i40e_vsi_stop_rings(pf->vsi[vf->lan_vsi_idx]);
-
+	/* Use the queue bit map sent by the VF */
+	if (i40e_ctrl_vf_tx_rings(pf->vsi[vf->lan_vsi_idx], vqs->tx_queues,
+				  false)) {
+		aq_ret = I40E_ERR_TIMEOUT;
+		goto error_param;
+	}
+	if (i40e_ctrl_vf_rx_rings(pf->vsi[vf->lan_vsi_idx], vqs->rx_queues,
+				  false)) {
+		aq_ret = I40E_ERR_TIMEOUT;
+		goto error_param;
+	}
 error_param:
 	/* send the response to the VF */
 	return i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_DISABLE_QUEUES,
-- 
cgit v1.2.3-59-g8ed1b


From bf1099b5ea853bddaa0dc4b1c594a2fdbebaf4c9 Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Fri, 20 Apr 2018 01:41:40 -0700
Subject: i40e: use %pI4b instead of byte swapping before dev_err

Fix warnings regarding restricted __be32 type usage by strictly
specifying the type of the ipv4 address being printed in the dev_err
statement.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index b500bbf6c43f..c8659fbd7111 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -7213,8 +7213,7 @@ static int i40e_parse_cls_flower(struct i40e_vsi *vsi,
 			if (mask->dst == cpu_to_be32(0xffffffff)) {
 				field_flags |= I40E_CLOUD_FIELD_IIP;
 			} else {
-				mask->dst = be32_to_cpu(mask->dst);
-				dev_err(&pf->pdev->dev, "Bad ip dst mask %pI4\n",
+				dev_err(&pf->pdev->dev, "Bad ip dst mask %pI4b\n",
 					&mask->dst);
 				return I40E_ERR_CONFIG;
 			}
@@ -7224,8 +7223,7 @@ static int i40e_parse_cls_flower(struct i40e_vsi *vsi,
 			if (mask->src == cpu_to_be32(0xffffffff)) {
 				field_flags |= I40E_CLOUD_FIELD_IIP;
 			} else {
-				mask->src = be32_to_cpu(mask->src);
-				dev_err(&pf->pdev->dev, "Bad ip src mask %pI4\n",
+				dev_err(&pf->pdev->dev, "Bad ip src mask %pI4b\n",
 					&mask->src);
 				return I40E_ERR_CONFIG;
 			}
-- 
cgit v1.2.3-59-g8ed1b


From 4d4fd36126d66d6091ca5aaabab262b5da3849c5 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Sun, 29 Apr 2018 10:56:08 +0300
Subject: net: bridge: Publish bridge accessor functions

Add a couple new functions to allow querying FDB and vlan settings of a
bridge.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_bridge.h | 28 ++++++++++++++++++++++++++++
 net/bridge/br_fdb.c       | 22 ++++++++++++++++++++++
 net/bridge/br_private.h   | 11 +++++++++++
 net/bridge/br_vlan.c      | 39 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 100 insertions(+)

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 02639ebea2f0..585d27182425 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -93,11 +93,39 @@ static inline bool br_multicast_router(const struct net_device *dev)
 
 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING)
 bool br_vlan_enabled(const struct net_device *dev);
+int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid);
+int br_vlan_get_info(const struct net_device *dev, u16 vid,
+		     struct bridge_vlan_info *p_vinfo);
 #else
 static inline bool br_vlan_enabled(const struct net_device *dev)
 {
 	return false;
 }
+
+static inline int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid)
+{
+	return -1;
+}
+
+static inline int br_vlan_get_info(const struct net_device *dev, u16 vid,
+				   struct bridge_vlan_info *p_vinfo)
+{
+	return -1;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_BRIDGE)
+struct net_device *br_fdb_find_port(const struct net_device *br_dev,
+				    const unsigned char *addr,
+				    __u16 vid);
+#else
+static inline struct net_device *
+br_fdb_find_port(const struct net_device *br_dev,
+		 const unsigned char *addr,
+		 __u16 vid)
+{
+	return NULL;
+}
 #endif
 
 #endif
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index d9e69e4514be..a1c409cacec4 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -121,6 +121,28 @@ static struct net_bridge_fdb_entry *br_fdb_find(struct net_bridge *br,
 	return fdb;
 }
 
+struct net_device *br_fdb_find_port(const struct net_device *br_dev,
+				    const unsigned char *addr,
+				    __u16 vid)
+{
+	struct net_bridge_fdb_entry *f;
+	struct net_device *dev = NULL;
+	struct net_bridge *br;
+
+	ASSERT_RTNL();
+
+	if (!netif_is_bridge_master(br_dev))
+		return NULL;
+
+	br = netdev_priv(br_dev);
+	f = br_fdb_find(br, addr, vid);
+	if (f && f->dst)
+		dev = f->dst->dev;
+
+	return dev;
+}
+EXPORT_SYMBOL_GPL(br_fdb_find_port);
+
 struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br,
 					     const unsigned char *addr,
 					     __u16 vid)
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index a7cb3ece5031..1a5093115534 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -594,11 +594,22 @@ static inline bool br_rx_handler_check_rcu(const struct net_device *dev)
 	return rcu_dereference(dev->rx_handler) == br_handle_frame;
 }
 
+static inline bool br_rx_handler_check_rtnl(const struct net_device *dev)
+{
+	return rcu_dereference_rtnl(dev->rx_handler) == br_handle_frame;
+}
+
 static inline struct net_bridge_port *br_port_get_check_rcu(const struct net_device *dev)
 {
 	return br_rx_handler_check_rcu(dev) ? br_port_get_rcu(dev) : NULL;
 }
 
+static inline struct net_bridge_port *
+br_port_get_check_rtnl(const struct net_device *dev)
+{
+	return br_rx_handler_check_rtnl(dev) ? br_port_get_rtnl_rcu(dev) : NULL;
+}
+
 /* br_ioctl.c */
 int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
 int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd,
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 9896f4975353..df37a5137c25 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -1149,3 +1149,42 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v,
 		stats->tx_packets += txpackets;
 	}
 }
+
+int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid)
+{
+	struct net_bridge_vlan_group *vg;
+
+	ASSERT_RTNL();
+	if (netif_is_bridge_master(dev))
+		vg = br_vlan_group(netdev_priv(dev));
+	else
+		return -EINVAL;
+
+	*p_pvid = br_get_pvid(vg);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(br_vlan_get_pvid);
+
+int br_vlan_get_info(const struct net_device *dev, u16 vid,
+		     struct bridge_vlan_info *p_vinfo)
+{
+	struct net_bridge_vlan_group *vg;
+	struct net_bridge_vlan *v;
+	struct net_bridge_port *p;
+
+	ASSERT_RTNL();
+	p = br_port_get_check_rtnl(dev);
+	if (p)
+		vg = nbp_vlan_group(p);
+	else
+		return -EINVAL;
+
+	v = br_vlan_find(vg, vid);
+	if (!v)
+		return -ENOENT;
+
+	p_vinfo->vid = vid;
+	p_vinfo->flags = v->flags;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(br_vlan_get_info);
-- 
cgit v1.2.3-59-g8ed1b


From 541e11595cb27591a7b3f4327472f282f482a5b0 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Sun, 29 Apr 2018 10:56:09 +0300
Subject: mlxsw: spectrum: Extract mlxsw_sp_stp_spms_state()

Instead of duplicating the decision regarding port forwarding state made
by mlxsw_sp_port_vid_stp_set(), extract the decision-making into a new
function and reuse.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 26 +++++++++++++-------------
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h |  1 +
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index ca38a30fbe91..7317fb8079d1 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -441,29 +441,29 @@ static void mlxsw_sp_txhdr_construct(struct sk_buff *skb,
 	mlxsw_tx_hdr_type_set(txhdr, MLXSW_TXHDR_TYPE_CONTROL);
 }
 
-int mlxsw_sp_port_vid_stp_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid,
-			      u8 state)
+enum mlxsw_reg_spms_state mlxsw_sp_stp_spms_state(u8 state)
 {
-	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
-	enum mlxsw_reg_spms_state spms_state;
-	char *spms_pl;
-	int err;
-
 	switch (state) {
 	case BR_STATE_FORWARDING:
-		spms_state = MLXSW_REG_SPMS_STATE_FORWARDING;
-		break;
+		return MLXSW_REG_SPMS_STATE_FORWARDING;
 	case BR_STATE_LEARNING:
-		spms_state = MLXSW_REG_SPMS_STATE_LEARNING;
-		break;
+		return MLXSW_REG_SPMS_STATE_LEARNING;
 	case BR_STATE_LISTENING: /* fall-through */
 	case BR_STATE_DISABLED: /* fall-through */
 	case BR_STATE_BLOCKING:
-		spms_state = MLXSW_REG_SPMS_STATE_DISCARDING;
-		break;
+		return MLXSW_REG_SPMS_STATE_DISCARDING;
 	default:
 		BUG();
 	}
+}
+
+int mlxsw_sp_port_vid_stp_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid,
+			      u8 state)
+{
+	enum mlxsw_reg_spms_state spms_state = mlxsw_sp_stp_spms_state(state);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char *spms_pl;
+	int err;
 
 	spms_pl = kmalloc(MLXSW_REG_SPMS_LEN, GFP_KERNEL);
 	if (!spms_pl)
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index 804d4d2c8031..4a519d8edec8 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -364,6 +364,7 @@ int __mlxsw_sp_port_headroom_set(struct mlxsw_sp_port *mlxsw_sp_port, int mtu,
 int mlxsw_sp_port_ets_maxrate_set(struct mlxsw_sp_port *mlxsw_sp_port,
 				  enum mlxsw_reg_qeec_hr hr, u8 index,
 				  u8 next_index, u32 maxrate);
+enum mlxsw_reg_spms_state mlxsw_sp_stp_spms_state(u8 stp_state);
 int mlxsw_sp_port_vid_stp_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid,
 			      u8 state);
 int mlxsw_sp_port_vp_mode_set(struct mlxsw_sp_port *mlxsw_sp_port, bool enable);
-- 
cgit v1.2.3-59-g8ed1b


From ea93c7b6083deebc026ed45b61e050c42322a2a6 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Sun, 29 Apr 2018 10:56:10 +0300
Subject: mlxsw: spectrum_switchdev: Publish two functions

Publish the existing function mlxsw_sp_bridge_port_find(), and add
another service accessor mlxsw_sp_bridge_port_stp_state(). Publish both
in a new file spectrum_switchdev.h.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../ethernet/mellanox/mlxsw/spectrum_switchdev.c   |  9 ++++-
 .../ethernet/mellanox/mlxsw/spectrum_switchdev.h   | 43 ++++++++++++++++++++++
 2 files changed, 51 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.h

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index c11c9a635866..db4aea0f8996 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -50,6 +50,7 @@
 #include <net/switchdev.h>
 
 #include "spectrum_router.h"
+#include "spectrum_switchdev.h"
 #include "spectrum.h"
 #include "core.h"
 #include "reg.h"
@@ -239,7 +240,7 @@ __mlxsw_sp_bridge_port_find(const struct mlxsw_sp_bridge_device *bridge_device,
 	return NULL;
 }
 
-static struct mlxsw_sp_bridge_port *
+struct mlxsw_sp_bridge_port *
 mlxsw_sp_bridge_port_find(struct mlxsw_sp_bridge *bridge,
 			  struct net_device *brport_dev)
 {
@@ -2297,6 +2298,12 @@ static struct notifier_block mlxsw_sp_switchdev_notifier = {
 	.notifier_call = mlxsw_sp_switchdev_event,
 };
 
+u8
+mlxsw_sp_bridge_port_stp_state(struct mlxsw_sp_bridge_port *bridge_port)
+{
+	return bridge_port->stp_state;
+}
+
 static int mlxsw_sp_fdb_init(struct mlxsw_sp *mlxsw_sp)
 {
 	struct mlxsw_sp_bridge *bridge = mlxsw_sp->bridge;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.h
new file mode 100644
index 000000000000..bc44d5effc28
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+ * drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.h
+ * Copyright (c) 2018 Mellanox Technologies. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/netdevice.h>
+
+struct mlxsw_sp_bridge;
+struct mlxsw_sp_bridge_port;
+
+struct mlxsw_sp_bridge_port *
+mlxsw_sp_bridge_port_find(struct mlxsw_sp_bridge *bridge,
+			  struct net_device *brport_dev);
+
+u8 mlxsw_sp_bridge_port_stp_state(struct mlxsw_sp_bridge_port *bridge_port);
-- 
cgit v1.2.3-59-g8ed1b


From cda880de939ed81dbd310e2d6784451609fc5913 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Sun, 29 Apr 2018 10:56:11 +0300
Subject: mlxsw: spectrum: Register SPAN before switchdev

Since switchdev events can trigger SPAN respin, it is necessary that the
data structures are available. Register SPAN first, with a commentary on
what the dependencies are.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 7317fb8079d1..94132f6cec61 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -3666,6 +3666,15 @@ static int mlxsw_sp_init(struct mlxsw_core *mlxsw_core,
 		goto err_lag_init;
 	}
 
+	/* Initialize SPAN before router and switchdev, so that those components
+	 * can call mlxsw_sp_span_respin().
+	 */
+	err = mlxsw_sp_span_init(mlxsw_sp);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Failed to init span system\n");
+		goto err_span_init;
+	}
+
 	err = mlxsw_sp_switchdev_init(mlxsw_sp);
 	if (err) {
 		dev_err(mlxsw_sp->bus_info->dev, "Failed to initialize switchdev\n");
@@ -3684,15 +3693,6 @@ static int mlxsw_sp_init(struct mlxsw_core *mlxsw_core,
 		goto err_afa_init;
 	}
 
-	err = mlxsw_sp_span_init(mlxsw_sp);
-	if (err) {
-		dev_err(mlxsw_sp->bus_info->dev, "Failed to init span system\n");
-		goto err_span_init;
-	}
-
-	/* Initialize router after SPAN is initialized, so that the FIB and
-	 * neighbor event handlers can issue SPAN respin.
-	 */
 	err = mlxsw_sp_router_init(mlxsw_sp);
 	if (err) {
 		dev_err(mlxsw_sp->bus_info->dev, "Failed to initialize router\n");
@@ -3739,14 +3739,14 @@ err_acl_init:
 err_netdev_notifier:
 	mlxsw_sp_router_fini(mlxsw_sp);
 err_router_init:
-	mlxsw_sp_span_fini(mlxsw_sp);
-err_span_init:
 	mlxsw_sp_afa_fini(mlxsw_sp);
 err_afa_init:
 	mlxsw_sp_counter_pool_fini(mlxsw_sp);
 err_counter_pool_init:
 	mlxsw_sp_switchdev_fini(mlxsw_sp);
 err_switchdev_init:
+	mlxsw_sp_span_fini(mlxsw_sp);
+err_span_init:
 	mlxsw_sp_lag_fini(mlxsw_sp);
 err_lag_init:
 	mlxsw_sp_buffers_fini(mlxsw_sp);
@@ -3768,10 +3768,10 @@ static void mlxsw_sp_fini(struct mlxsw_core *mlxsw_core)
 	mlxsw_sp_acl_fini(mlxsw_sp);
 	unregister_netdevice_notifier(&mlxsw_sp->netdevice_nb);
 	mlxsw_sp_router_fini(mlxsw_sp);
-	mlxsw_sp_span_fini(mlxsw_sp);
 	mlxsw_sp_afa_fini(mlxsw_sp);
 	mlxsw_sp_counter_pool_fini(mlxsw_sp);
 	mlxsw_sp_switchdev_fini(mlxsw_sp);
+	mlxsw_sp_span_fini(mlxsw_sp);
 	mlxsw_sp_lag_fini(mlxsw_sp);
 	mlxsw_sp_buffers_fini(mlxsw_sp);
 	mlxsw_sp_traps_fini(mlxsw_sp);
-- 
cgit v1.2.3-59-g8ed1b


From c520bc6986472621469f9dc1cec1a5dbbb4769bf Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Sun, 29 Apr 2018 10:56:12 +0300
Subject: mlxsw: Respin SPAN on switchdev events

Changes to switchdev artifact can make a SPAN entry offloadable or
unoffloadable. To that end:

- Listen to SWITCHDEV_FDB_*_TO_BRIDGE notifications in addition to
  the *_TO_DEVICE ones, to catch whatever activity is sent to the
  bridge (likely by mlxsw itself).

  On each FDB notification, respin SPAN to reconcile it with the FDB
  changes.

- Also respin on switchdev port attribute changes (which currently
  covers changes to STP state of ports) and port object additions and
  removals.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../ethernet/mellanox/mlxsw/spectrum_switchdev.c   | 63 ++++++++++++++++++++--
 1 file changed, 59 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index db4aea0f8996..1af99fe5fd32 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -49,6 +49,7 @@
 #include <linux/netlink.h>
 #include <net/switchdev.h>
 
+#include "spectrum_span.h"
 #include "spectrum_router.h"
 #include "spectrum_switchdev.h"
 #include "spectrum.h"
@@ -923,6 +924,9 @@ static int mlxsw_sp_port_attr_set(struct net_device *dev,
 		break;
 	}
 
+	if (switchdev_trans_ph_commit(trans))
+		mlxsw_sp_span_respin(mlxsw_sp_port->mlxsw_sp);
+
 	return err;
 }
 
@@ -1647,18 +1651,57 @@ mlxsw_sp_port_mrouter_update_mdb(struct mlxsw_sp_port *mlxsw_sp_port,
 	}
 }
 
+struct mlxsw_sp_span_respin_work {
+	struct work_struct work;
+	struct mlxsw_sp *mlxsw_sp;
+};
+
+static void mlxsw_sp_span_respin_work(struct work_struct *work)
+{
+	struct mlxsw_sp_span_respin_work *respin_work =
+		container_of(work, struct mlxsw_sp_span_respin_work, work);
+
+	rtnl_lock();
+	mlxsw_sp_span_respin(respin_work->mlxsw_sp);
+	rtnl_unlock();
+	kfree(respin_work);
+}
+
+static void mlxsw_sp_span_respin_schedule(struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_span_respin_work *respin_work;
+
+	respin_work = kzalloc(sizeof(*respin_work), GFP_ATOMIC);
+	if (!respin_work)
+		return;
+
+	INIT_WORK(&respin_work->work, mlxsw_sp_span_respin_work);
+	respin_work->mlxsw_sp = mlxsw_sp;
+
+	mlxsw_core_schedule_work(&respin_work->work);
+}
+
 static int mlxsw_sp_port_obj_add(struct net_device *dev,
 				 const struct switchdev_obj *obj,
 				 struct switchdev_trans *trans)
 {
 	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	const struct switchdev_obj_port_vlan *vlan;
 	int err = 0;
 
 	switch (obj->id) {
 	case SWITCHDEV_OBJ_ID_PORT_VLAN:
-		err = mlxsw_sp_port_vlans_add(mlxsw_sp_port,
-					      SWITCHDEV_OBJ_PORT_VLAN(obj),
-					      trans);
+		vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
+		err = mlxsw_sp_port_vlans_add(mlxsw_sp_port, vlan, trans);
+
+		if (switchdev_trans_ph_commit(trans)) {
+			/* The event is emitted before the changes are actually
+			 * applied to the bridge. Therefore schedule the respin
+			 * call for later, so that the respin logic sees the
+			 * updated bridge state.
+			 */
+			mlxsw_sp_span_respin_schedule(mlxsw_sp_port->mlxsw_sp);
+		}
 		break;
 	case SWITCHDEV_OBJ_ID_PORT_MDB:
 		err = mlxsw_sp_port_mdb_add(mlxsw_sp_port,
@@ -1809,6 +1852,8 @@ static int mlxsw_sp_port_obj_del(struct net_device *dev,
 		break;
 	}
 
+	mlxsw_sp_span_respin(mlxsw_sp_port->mlxsw_sp);
+
 	return err;
 }
 
@@ -2236,8 +2281,16 @@ static void mlxsw_sp_switchdev_event_work(struct work_struct *work)
 		fdb_info = &switchdev_work->fdb_info;
 		mlxsw_sp_port_fdb_set(mlxsw_sp_port, fdb_info, false);
 		break;
+	case SWITCHDEV_FDB_ADD_TO_BRIDGE: /* fall through */
+	case SWITCHDEV_FDB_DEL_TO_BRIDGE:
+		/* These events are only used to potentially update an existing
+		 * SPAN mirror.
+		 */
+		break;
 	}
 
+	mlxsw_sp_span_respin(mlxsw_sp_port->mlxsw_sp);
+
 out:
 	rtnl_unlock();
 	kfree(switchdev_work->fdb_info.addr);
@@ -2266,7 +2319,9 @@ static int mlxsw_sp_switchdev_event(struct notifier_block *unused,
 
 	switch (event) {
 	case SWITCHDEV_FDB_ADD_TO_DEVICE: /* fall through */
-	case SWITCHDEV_FDB_DEL_TO_DEVICE:
+	case SWITCHDEV_FDB_DEL_TO_DEVICE: /* fall through */
+	case SWITCHDEV_FDB_ADD_TO_BRIDGE: /* fall through */
+	case SWITCHDEV_FDB_DEL_TO_BRIDGE:
 		memcpy(&switchdev_work->fdb_info, ptr,
 		       sizeof(switchdev_work->fdb_info));
 		switchdev_work->fdb_info.addr = kzalloc(ETH_ALEN, GFP_ATOMIC);
-- 
cgit v1.2.3-59-g8ed1b


From 946a11e7408e52a6d9e5f3187c32b6c2a5f88835 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Sun, 29 Apr 2018 10:56:13 +0300
Subject: mlxsw: spectrum_span: Allow bridge for gretap mirror

When handling mirroring to a gretap or ip6gretap netdevice in mlxsw, the
underlay address (i.e. the remote address of the tunnel) may be routed
to a bridge.

In that case, look up the resolved neighbor Ethernet address in that
bridge's FDB. Then configure the offload to direct the mirrored traffic
to that port, possibly with tagging.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_span.c    | 95 ++++++++++++++++++++--
 .../net/ethernet/mellanox/mlxsw/spectrum_span.h    |  1 +
 2 files changed, 90 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
index 65a77708ff61..cd9071ee19ad 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
@@ -32,6 +32,7 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <linux/if_bridge.h>
 #include <linux/list.h>
 #include <net/arp.h>
 #include <net/gre.h>
@@ -39,8 +40,9 @@
 #include <net/ip6_tunnel.h>
 
 #include "spectrum.h"
-#include "spectrum_span.h"
 #include "spectrum_ipip.h"
+#include "spectrum_span.h"
+#include "spectrum_switchdev.h"
 
 int mlxsw_sp_span_init(struct mlxsw_sp *mlxsw_sp)
 {
@@ -167,6 +169,72 @@ mlxsw_sp_span_entry_unoffloadable(struct mlxsw_sp_span_parms *sparmsp)
 	return 0;
 }
 
+static struct net_device *
+mlxsw_sp_span_entry_bridge_8021q(const struct net_device *br_dev,
+				 unsigned char *dmac,
+				 u16 *p_vid)
+{
+	struct bridge_vlan_info vinfo;
+	struct net_device *edev;
+	u16 pvid;
+
+	if (WARN_ON(br_vlan_get_pvid(br_dev, &pvid)))
+		return NULL;
+	if (!pvid)
+		return NULL;
+
+	edev = br_fdb_find_port(br_dev, dmac, pvid);
+	if (!edev)
+		return NULL;
+
+	if (br_vlan_get_info(edev, pvid, &vinfo))
+		return NULL;
+	if (!(vinfo.flags & BRIDGE_VLAN_INFO_UNTAGGED))
+		*p_vid = pvid;
+	return edev;
+}
+
+static struct net_device *
+mlxsw_sp_span_entry_bridge_8021d(const struct net_device *br_dev,
+				 unsigned char *dmac)
+{
+	return br_fdb_find_port(br_dev, dmac, 0);
+}
+
+static struct net_device *
+mlxsw_sp_span_entry_bridge(const struct net_device *br_dev,
+			   unsigned char dmac[ETH_ALEN],
+			   u16 *p_vid)
+{
+	struct mlxsw_sp_bridge_port *bridge_port;
+	enum mlxsw_reg_spms_state spms_state;
+	struct mlxsw_sp_port *port;
+	struct net_device *dev;
+	u8 stp_state;
+
+	if (br_vlan_enabled(br_dev))
+		dev = mlxsw_sp_span_entry_bridge_8021q(br_dev, dmac, p_vid);
+	else
+		dev = mlxsw_sp_span_entry_bridge_8021d(br_dev, dmac);
+	if (!dev)
+		return NULL;
+
+	port = mlxsw_sp_port_dev_lower_find(dev);
+	if (!port)
+		return NULL;
+
+	bridge_port = mlxsw_sp_bridge_port_find(port->mlxsw_sp->bridge, dev);
+	if (!bridge_port)
+		return NULL;
+
+	stp_state = mlxsw_sp_bridge_port_stp_state(bridge_port);
+	spms_state = mlxsw_sp_stp_spms_state(stp_state);
+	if (spms_state != MLXSW_REG_SPMS_STATE_FORWARDING)
+		return NULL;
+
+	return dev;
+}
+
 static __maybe_unused int
 mlxsw_sp_span_entry_tunnel_parms_common(struct net_device *l3edev,
 					union mlxsw_sp_l3addr saddr,
@@ -177,13 +245,22 @@ mlxsw_sp_span_entry_tunnel_parms_common(struct net_device *l3edev,
 					struct mlxsw_sp_span_parms *sparmsp)
 {
 	unsigned char dmac[ETH_ALEN];
+	u16 vid = 0;
 
 	if (mlxsw_sp_l3addr_is_zero(gw))
 		gw = daddr;
 
-	if (!l3edev || !mlxsw_sp_port_dev_check(l3edev) ||
-	    mlxsw_sp_span_dmac(tbl, &gw, l3edev, dmac))
-		return mlxsw_sp_span_entry_unoffloadable(sparmsp);
+	if (!l3edev || mlxsw_sp_span_dmac(tbl, &gw, l3edev, dmac))
+		goto unoffloadable;
+
+	if (netif_is_bridge_master(l3edev)) {
+		l3edev = mlxsw_sp_span_entry_bridge(l3edev, dmac, &vid);
+		if (!l3edev)
+			goto unoffloadable;
+	}
+
+	if (!mlxsw_sp_port_dev_check(l3edev))
+		goto unoffloadable;
 
 	sparmsp->dest_port = netdev_priv(l3edev);
 	sparmsp->ttl = ttl;
@@ -191,7 +268,11 @@ mlxsw_sp_span_entry_tunnel_parms_common(struct net_device *l3edev,
 	memcpy(sparmsp->smac, l3edev->dev_addr, ETH_ALEN);
 	sparmsp->saddr = saddr;
 	sparmsp->daddr = daddr;
+	sparmsp->vid = vid;
 	return 0;
+
+unoffloadable:
+	return mlxsw_sp_span_entry_unoffloadable(sparmsp);
 }
 
 #if IS_ENABLED(CONFIG_NET_IPGRE)
@@ -268,9 +349,10 @@ mlxsw_sp_span_entry_gretap4_configure(struct mlxsw_sp_span_entry *span_entry,
 	/* Create a new port analayzer entry for local_port. */
 	mlxsw_reg_mpat_pack(mpat_pl, pa_id, local_port, true,
 			    MLXSW_REG_MPAT_SPAN_TYPE_REMOTE_ETH_L3);
+	mlxsw_reg_mpat_eth_rspan_pack(mpat_pl, sparms.vid);
 	mlxsw_reg_mpat_eth_rspan_l2_pack(mpat_pl,
 				    MLXSW_REG_MPAT_ETH_RSPAN_VERSION_NO_HEADER,
-				    sparms.dmac, false);
+				    sparms.dmac, !!sparms.vid);
 	mlxsw_reg_mpat_eth_rspan_l3_ipv4_pack(mpat_pl,
 					      sparms.ttl, sparms.smac,
 					      be32_to_cpu(sparms.saddr.addr4),
@@ -368,9 +450,10 @@ mlxsw_sp_span_entry_gretap6_configure(struct mlxsw_sp_span_entry *span_entry,
 	/* Create a new port analayzer entry for local_port. */
 	mlxsw_reg_mpat_pack(mpat_pl, pa_id, local_port, true,
 			    MLXSW_REG_MPAT_SPAN_TYPE_REMOTE_ETH_L3);
+	mlxsw_reg_mpat_eth_rspan_pack(mpat_pl, sparms.vid);
 	mlxsw_reg_mpat_eth_rspan_l2_pack(mpat_pl,
 				    MLXSW_REG_MPAT_ETH_RSPAN_VERSION_NO_HEADER,
-				    sparms.dmac, false);
+				    sparms.dmac, !!sparms.vid);
 	mlxsw_reg_mpat_eth_rspan_l3_ipv6_pack(mpat_pl, sparms.ttl, sparms.smac,
 					      sparms.saddr.addr6,
 					      sparms.daddr.addr6);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h
index 4b87ec20e658..14a6de904db1 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h
@@ -63,6 +63,7 @@ struct mlxsw_sp_span_parms {
 	unsigned char smac[ETH_ALEN];
 	union mlxsw_sp_l3addr daddr;
 	union mlxsw_sp_l3addr saddr;
+	u16 vid;
 };
 
 struct mlxsw_sp_span_entry_ops;
-- 
cgit v1.2.3-59-g8ed1b


From 154a8c46bad2aae171fa25d5fa1d04165ef69650 Mon Sep 17 00:00:00 2001
From: Sun Lianwen <sunlw.fnst@cn.fujitsu.com>
Date: Sun, 29 Apr 2018 15:05:52 +0800
Subject: change the comment of vti6_ioctl

The comment of vti6_ioctl() is wrong. which use vti6_tnl_ioctl
instead of vti6_ioctl.

Signed-off-by: Sun Lianwen <sunlw.fnst@cn.fujitsu.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv6/ip6_vti.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index c214ffec02f0..deadc4c3703b 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -743,7 +743,7 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p)
 }
 
 /**
- * vti6_tnl_ioctl - configure vti6 tunnels from userspace
+ * vti6_ioctl - configure vti6 tunnels from userspace
  *   @dev: virtual device associated with tunnel
  *   @ifr: parameters passed from userspace
  *   @cmd: command to be performed
-- 
cgit v1.2.3-59-g8ed1b


From 6dac152355d9308c9e187bf1d38d98afefcaa315 Mon Sep 17 00:00:00 2001
From: Ilya Lesokhin <ilyal@mellanox.com>
Date: Mon, 30 Apr 2018 10:16:10 +0300
Subject: tcp: Add clean acked data hook

Called when a TCP segment is acknowledged.
Could be used by application protocols who hold additional
metadata associated with the stream data.

This is required by TLS device offload to release
metadata associated with acknowledged TLS records.

Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com>
Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: Aviad Yehezkel <aviadye@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_connection_sock.h |  2 ++
 include/net/tcp.h                  |  8 ++++++++
 net/ipv4/tcp_input.c               | 25 +++++++++++++++++++++++++
 3 files changed, 35 insertions(+)

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index b68fea022a82..2ab6667275df 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -77,6 +77,7 @@ struct inet_connection_sock_af_ops {
  * @icsk_af_ops		   Operations which are AF_INET{4,6} specific
  * @icsk_ulp_ops	   Pluggable ULP control hook
  * @icsk_ulp_data	   ULP private data
+ * @icsk_clean_acked	   Clean acked data hook
  * @icsk_listen_portaddr_node	hash to the portaddr listener hashtable
  * @icsk_ca_state:	   Congestion control state
  * @icsk_retransmits:	   Number of unrecovered [RTO] timeouts
@@ -102,6 +103,7 @@ struct inet_connection_sock {
 	const struct inet_connection_sock_af_ops *icsk_af_ops;
 	const struct tcp_ulp_ops  *icsk_ulp_ops;
 	void			  *icsk_ulp_data;
+	void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq);
 	struct hlist_node         icsk_listen_portaddr_node;
 	unsigned int		  (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
 	__u8			  icsk_ca_state:6,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 833154e3df17..cf803fe0fb86 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2105,4 +2105,12 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
 #if IS_ENABLED(CONFIG_SMC)
 extern struct static_key_false tcp_have_smc;
 #endif
+
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+void clean_acked_data_enable(struct inet_connection_sock *icsk,
+			     void (*cad)(struct sock *sk, u32 ack_seq));
+void clean_acked_data_disable(struct inet_connection_sock *icsk);
+
+#endif
+
 #endif	/* _TCP_H */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8acbe5fd2098..b188e0d75edd 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -111,6 +111,25 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
 #define REXMIT_LOST	1 /* retransmit packets marked lost */
 #define REXMIT_NEW	2 /* FRTO-style transmit of unsent/new packets */
 
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+static DEFINE_STATIC_KEY_FALSE(clean_acked_data_enabled);
+
+void clean_acked_data_enable(struct inet_connection_sock *icsk,
+			     void (*cad)(struct sock *sk, u32 ack_seq))
+{
+	icsk->icsk_clean_acked = cad;
+	static_branch_inc(&clean_acked_data_enabled);
+}
+EXPORT_SYMBOL_GPL(clean_acked_data_enable);
+
+void clean_acked_data_disable(struct inet_connection_sock *icsk)
+{
+	static_branch_dec(&clean_acked_data_enabled);
+	icsk->icsk_clean_acked = NULL;
+}
+EXPORT_SYMBOL_GPL(clean_acked_data_disable);
+#endif
+
 static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
 			     unsigned int len)
 {
@@ -3560,6 +3579,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (after(ack, prior_snd_una)) {
 		flag |= FLAG_SND_UNA_ADVANCED;
 		icsk->icsk_retransmits = 0;
+
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+		if (static_branch_unlikely(&clean_acked_data_enabled))
+			if (icsk->icsk_clean_acked)
+				icsk->icsk_clean_acked(sk, ack);
+#endif
 	}
 
 	prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
-- 
cgit v1.2.3-59-g8ed1b


From 08303c189581c985e60f588ad92a041e46b6e307 Mon Sep 17 00:00:00 2001
From: Ilya Lesokhin <ilyal@mellanox.com>
Date: Mon, 30 Apr 2018 10:16:11 +0300
Subject: net: Rename and export copy_skb_header

copy_skb_header is renamed to skb_copy_header and
exported. Exposing this function give more flexibility
in copying SKBs.
skb_copy and skb_copy_expand do not give enough control
over which parts are copied.

Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com>
Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 1 +
 net/core/skbuff.c      | 9 +++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a4a5c0c5cba8..908d66e55b14 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1034,6 +1034,7 @@ static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
 struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src);
 int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask);
 struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority);
+void skb_copy_header(struct sk_buff *new, const struct sk_buff *old);
 struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t priority);
 struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
 				   gfp_t gfp_mask, bool fclone);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c647cfe114e0..c642304f178c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1305,7 +1305,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off)
 	skb->inner_mac_header += off;
 }
 
-static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
+void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
 {
 	__copy_skb_header(new, old);
 
@@ -1313,6 +1313,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
 	skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
 }
+EXPORT_SYMBOL(skb_copy_header);
 
 static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
 {
@@ -1355,7 +1356,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
 
 	BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len));
 
-	copy_skb_header(n, skb);
+	skb_copy_header(n, skb);
 	return n;
 }
 EXPORT_SYMBOL(skb_copy);
@@ -1419,7 +1420,7 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
 		skb_clone_fraglist(n);
 	}
 
-	copy_skb_header(n, skb);
+	skb_copy_header(n, skb);
 out:
 	return n;
 }
@@ -1599,7 +1600,7 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
 	BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
 			     skb->len + head_copy_len));
 
-	copy_skb_header(n, skb);
+	skb_copy_header(n, skb);
 
 	skb_headers_offset_update(n, newheadroom - oldheadroom);
 
-- 
cgit v1.2.3-59-g8ed1b


From ebf4e808fa0b22e551baf862e17c26c325c068f4 Mon Sep 17 00:00:00 2001
From: Ilya Lesokhin <ilyal@mellanox.com>
Date: Mon, 30 Apr 2018 10:16:12 +0300
Subject: net: Add Software fallback infrastructure for socket dependent
 offloads

With socket dependent offloads we rely on the netdev to transform
the transmitted packets before sending them to the wire.
When a packet from an offloaded socket is rerouted to a different
device we need to detect it and do the transformation in software.

Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com>
Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 21 +++++++++++++++++++++
 net/Kconfig        |  3 +++
 net/core/dev.c     |  4 ++++
 3 files changed, 28 insertions(+)

diff --git a/include/net/sock.h b/include/net/sock.h
index 74d725fdbe0f..3c568b36ee36 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -481,6 +481,11 @@ struct sock {
 	void			(*sk_error_report)(struct sock *sk);
 	int			(*sk_backlog_rcv)(struct sock *sk,
 						  struct sk_buff *skb);
+#ifdef CONFIG_SOCK_VALIDATE_XMIT
+	struct sk_buff*		(*sk_validate_xmit_skb)(struct sock *sk,
+							struct net_device *dev,
+							struct sk_buff *skb);
+#endif
 	void                    (*sk_destruct)(struct sock *sk);
 	struct sock_reuseport __rcu	*sk_reuseport_cb;
 	struct rcu_head		sk_rcu;
@@ -2332,6 +2337,22 @@ static inline bool sk_fullsock(const struct sock *sk)
 	return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV);
 }
 
+/* Checks if this SKB belongs to an HW offloaded socket
+ * and whether any SW fallbacks are required based on dev.
+ */
+static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb,
+						   struct net_device *dev)
+{
+#ifdef CONFIG_SOCK_VALIDATE_XMIT
+	struct sock *sk = skb->sk;
+
+	if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb)
+		skb = sk->sk_validate_xmit_skb(sk, dev, skb);
+#endif
+
+	return skb;
+}
+
 /* This helper checks if a socket is a LISTEN or NEW_SYN_RECV
  * SYNACK messages can be attached to either ones (depending on SYNCOOKIE)
  */
diff --git a/net/Kconfig b/net/Kconfig
index 6fa1a4493b8c..b62089fb1332 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -407,6 +407,9 @@ config GRO_CELLS
 	bool
 	default n
 
+config SOCK_VALIDATE_XMIT
+	bool
+
 config NET_DEVLINK
 	tristate "Network physical/parent device Netlink interface"
 	help
diff --git a/net/core/dev.c b/net/core/dev.c
index e01c21a88cae..8944e1e0059d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3112,6 +3112,10 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
 	if (unlikely(!skb))
 		goto out_null;
 
+	skb = sk_validate_xmit_skb(skb, dev);
+	if (unlikely(!skb))
+		goto out_null;
+
 	if (netif_needs_gso(skb, features)) {
 		struct sk_buff *segs;
 
-- 
cgit v1.2.3-59-g8ed1b


From a5c37c63f71b044fbe1a3f893d6f6daa36fee5f5 Mon Sep 17 00:00:00 2001
From: Ilya Lesokhin <ilyal@mellanox.com>
Date: Mon, 30 Apr 2018 10:16:13 +0300
Subject: net: Add TLS offload netdev ops

Add new netdev ops to add and delete tls context

Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com>
Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: Aviad Yehezkel <aviadye@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9e09dd897b74..a569bf5524fa 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -865,6 +865,26 @@ struct xfrmdev_ops {
 };
 #endif
 
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+enum tls_offload_ctx_dir {
+	TLS_OFFLOAD_CTX_DIR_RX,
+	TLS_OFFLOAD_CTX_DIR_TX,
+};
+
+struct tls_crypto_info;
+struct tls_context;
+
+struct tlsdev_ops {
+	int (*tls_dev_add)(struct net_device *netdev, struct sock *sk,
+			   enum tls_offload_ctx_dir direction,
+			   struct tls_crypto_info *crypto_info,
+			   u32 start_offload_tcp_sn);
+	void (*tls_dev_del)(struct net_device *netdev,
+			    struct tls_context *ctx,
+			    enum tls_offload_ctx_dir direction);
+};
+#endif
+
 struct dev_ifalias {
 	struct rcu_head rcuhead;
 	char ifalias[];
@@ -1750,6 +1770,10 @@ struct net_device {
 	const struct xfrmdev_ops *xfrmdev_ops;
 #endif
 
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+	const struct tlsdev_ops *tlsdev_ops;
+#endif
+
 	const struct header_ops *header_ops;
 
 	unsigned int		flags;
-- 
cgit v1.2.3-59-g8ed1b


From 2342a8512a1e98c286b6b36094058e4ee2261c74 Mon Sep 17 00:00:00 2001
From: Ilya Lesokhin <ilyal@mellanox.com>
Date: Mon, 30 Apr 2018 10:16:14 +0300
Subject: net: Add TLS TX offload features

This patch adds a netdev feature to configure TLS TX offloads.

Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com>
Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: Aviad Yehezkel <aviadye@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdev_features.h | 2 ++
 net/core/ethtool.c              | 1 +
 2 files changed, 3 insertions(+)

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index fe2f3b30960e..c87c3a3453c1 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -78,6 +78,7 @@ enum {
 	NETIF_F_HW_ESP_BIT,		/* Hardware ESP transformation offload */
 	NETIF_F_HW_ESP_TX_CSUM_BIT,	/* ESP with TX checksum offload */
 	NETIF_F_RX_UDP_TUNNEL_PORT_BIT, /* Offload of RX port for UDP tunnels */
+	NETIF_F_HW_TLS_TX_BIT,		/* Hardware TLS TX offload */
 
 	NETIF_F_GRO_HW_BIT,		/* Hardware Generic receive offload */
 	NETIF_F_HW_TLS_RECORD_BIT,	/* Offload TLS record */
@@ -149,6 +150,7 @@ enum {
 #define	NETIF_F_RX_UDP_TUNNEL_PORT  __NETIF_F(RX_UDP_TUNNEL_PORT)
 #define NETIF_F_HW_TLS_RECORD	__NETIF_F(HW_TLS_RECORD)
 #define NETIF_F_GSO_UDP_L4	__NETIF_F(GSO_UDP_L4)
+#define NETIF_F_HW_TLS_TX	__NETIF_F(HW_TLS_TX)
 
 #define for_each_netdev_feature(mask_addr, bit)	\
 	for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT)
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index b849fdae7e87..3c6ce8a3928a 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -110,6 +110,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_HW_ESP_TX_CSUM_BIT] =	 "esp-tx-csum-hw-offload",
 	[NETIF_F_RX_UDP_TUNNEL_PORT_BIT] =	 "rx-udp_tunnel-port-offload",
 	[NETIF_F_HW_TLS_RECORD_BIT] =	"tls-hw-record",
+	[NETIF_F_HW_TLS_TX_BIT] =	 "tls-hw-tx-offload",
 };
 
 static const char
-- 
cgit v1.2.3-59-g8ed1b


From f66de3ee2c161fcc2d66974e9671f8a2a471ab20 Mon Sep 17 00:00:00 2001
From: Boris Pismenny <borisp@mellanox.com>
Date: Mon, 30 Apr 2018 10:16:15 +0300
Subject: net/tls: Split conf to rx + tx

In TLS inline crypto, we can have one direction in software
and another in hardware. Thus, we split the TLS configuration to separate
structures for receive and transmit.

Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h  |  51 +++++++++++++-------
 net/tls/tls_main.c | 103 ++++++++++++++++++++-------------------
 net/tls/tls_sw.c   | 138 ++++++++++++++++++++++++++++++-----------------------
 3 files changed, 163 insertions(+), 129 deletions(-)

diff --git a/include/net/tls.h b/include/net/tls.h
index 3da8e13a6d96..95a8c60b36be 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -83,21 +83,10 @@ struct tls_device {
 	void (*unhash)(struct tls_device *device, struct sock *sk);
 };
 
-struct tls_sw_context {
+struct tls_sw_context_tx {
 	struct crypto_aead *aead_send;
-	struct crypto_aead *aead_recv;
 	struct crypto_wait async_wait;
 
-	/* Receive context */
-	struct strparser strp;
-	void (*saved_data_ready)(struct sock *sk);
-	unsigned int (*sk_poll)(struct file *file, struct socket *sock,
-				struct poll_table_struct *wait);
-	struct sk_buff *recv_pkt;
-	u8 control;
-	bool decrypted;
-
-	/* Sending context */
 	char aad_space[TLS_AAD_SPACE_SIZE];
 
 	unsigned int sg_plaintext_size;
@@ -114,6 +103,19 @@ struct tls_sw_context {
 	struct scatterlist sg_aead_out[2];
 };
 
+struct tls_sw_context_rx {
+	struct crypto_aead *aead_recv;
+	struct crypto_wait async_wait;
+
+	struct strparser strp;
+	void (*saved_data_ready)(struct sock *sk);
+	unsigned int (*sk_poll)(struct file *file, struct socket *sock,
+				struct poll_table_struct *wait);
+	struct sk_buff *recv_pkt;
+	u8 control;
+	bool decrypted;
+};
+
 enum {
 	TLS_PENDING_CLOSED_RECORD
 };
@@ -138,9 +140,15 @@ struct tls_context {
 		struct tls12_crypto_info_aes_gcm_128 crypto_recv_aes_gcm_128;
 	};
 
-	void *priv_ctx;
+	struct list_head list;
+	struct net_device *netdev;
+	refcount_t refcount;
+
+	void *priv_ctx_tx;
+	void *priv_ctx_rx;
 
-	u8 conf:3;
+	u8 tx_conf:3;
+	u8 rx_conf:3;
 
 	struct cipher_context tx;
 	struct cipher_context rx;
@@ -177,7 +185,8 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
 int tls_sw_sendpage(struct sock *sk, struct page *page,
 		    int offset, size_t size, int flags);
 void tls_sw_close(struct sock *sk, long timeout);
-void tls_sw_free_resources(struct sock *sk);
+void tls_sw_free_resources_tx(struct sock *sk);
+void tls_sw_free_resources_rx(struct sock *sk);
 int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		   int nonblock, int flags, int *addr_len);
 unsigned int tls_sw_poll(struct file *file, struct socket *sock,
@@ -297,16 +306,22 @@ static inline struct tls_context *tls_get_ctx(const struct sock *sk)
 	return icsk->icsk_ulp_data;
 }
 
-static inline struct tls_sw_context *tls_sw_ctx(
+static inline struct tls_sw_context_rx *tls_sw_ctx_rx(
+		const struct tls_context *tls_ctx)
+{
+	return (struct tls_sw_context_rx *)tls_ctx->priv_ctx_rx;
+}
+
+static inline struct tls_sw_context_tx *tls_sw_ctx_tx(
 		const struct tls_context *tls_ctx)
 {
-	return (struct tls_sw_context *)tls_ctx->priv_ctx;
+	return (struct tls_sw_context_tx *)tls_ctx->priv_ctx_tx;
 }
 
 static inline struct tls_offload_context *tls_offload_ctx(
 		const struct tls_context *tls_ctx)
 {
-	return (struct tls_offload_context *)tls_ctx->priv_ctx;
+	return (struct tls_offload_context *)tls_ctx->priv_ctx_tx;
 }
 
 int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg,
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 0d379970960e..545bf34ed599 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -51,12 +51,9 @@ enum {
 	TLSV6,
 	TLS_NUM_PROTS,
 };
-
 enum {
 	TLS_BASE,
-	TLS_SW_TX,
-	TLS_SW_RX,
-	TLS_SW_RXTX,
+	TLS_SW,
 	TLS_HW_RECORD,
 	TLS_NUM_CONFIG,
 };
@@ -65,14 +62,14 @@ static struct proto *saved_tcpv6_prot;
 static DEFINE_MUTEX(tcpv6_prot_mutex);
 static LIST_HEAD(device_list);
 static DEFINE_MUTEX(device_mutex);
-static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG];
+static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG][TLS_NUM_CONFIG];
 static struct proto_ops tls_sw_proto_ops;
 
-static inline void update_sk_prot(struct sock *sk, struct tls_context *ctx)
+static void update_sk_prot(struct sock *sk, struct tls_context *ctx)
 {
 	int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4;
 
-	sk->sk_prot = &tls_prots[ip_ver][ctx->conf];
+	sk->sk_prot = &tls_prots[ip_ver][ctx->tx_conf][ctx->rx_conf];
 }
 
 int wait_on_pending_writer(struct sock *sk, long *timeo)
@@ -245,10 +242,10 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
 	lock_sock(sk);
 	sk_proto_close = ctx->sk_proto_close;
 
-	if (ctx->conf == TLS_HW_RECORD)
+	if (ctx->tx_conf == TLS_HW_RECORD && ctx->rx_conf == TLS_HW_RECORD)
 		goto skip_tx_cleanup;
 
-	if (ctx->conf == TLS_BASE) {
+	if (ctx->tx_conf == TLS_BASE && ctx->rx_conf == TLS_BASE) {
 		kfree(ctx);
 		ctx = NULL;
 		goto skip_tx_cleanup;
@@ -270,15 +267,17 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
 		}
 	}
 
-	kfree(ctx->tx.rec_seq);
-	kfree(ctx->tx.iv);
-	kfree(ctx->rx.rec_seq);
-	kfree(ctx->rx.iv);
+	/* We need these for tls_sw_fallback handling of other packets */
+	if (ctx->tx_conf == TLS_SW) {
+		kfree(ctx->tx.rec_seq);
+		kfree(ctx->tx.iv);
+		tls_sw_free_resources_tx(sk);
+	}
 
-	if (ctx->conf == TLS_SW_TX ||
-	    ctx->conf == TLS_SW_RX ||
-	    ctx->conf == TLS_SW_RXTX) {
-		tls_sw_free_resources(sk);
+	if (ctx->rx_conf == TLS_SW) {
+		kfree(ctx->rx.rec_seq);
+		kfree(ctx->rx.iv);
+		tls_sw_free_resources_rx(sk);
 	}
 
 skip_tx_cleanup:
@@ -287,7 +286,8 @@ skip_tx_cleanup:
 	/* free ctx for TLS_HW_RECORD, used by tcp_set_state
 	 * for sk->sk_prot->unhash [tls_hw_unhash]
 	 */
-	if (ctx && ctx->conf == TLS_HW_RECORD)
+	if (ctx && ctx->tx_conf == TLS_HW_RECORD &&
+	    ctx->rx_conf == TLS_HW_RECORD)
 		kfree(ctx);
 }
 
@@ -441,25 +441,21 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval,
 		goto err_crypto_info;
 	}
 
-	/* currently SW is default, we will have ethtool in future */
 	if (tx) {
 		rc = tls_set_sw_offload(sk, ctx, 1);
-		if (ctx->conf == TLS_SW_RX)
-			conf = TLS_SW_RXTX;
-		else
-			conf = TLS_SW_TX;
+		conf = TLS_SW;
 	} else {
 		rc = tls_set_sw_offload(sk, ctx, 0);
-		if (ctx->conf == TLS_SW_TX)
-			conf = TLS_SW_RXTX;
-		else
-			conf = TLS_SW_RX;
+		conf = TLS_SW;
 	}
 
 	if (rc)
 		goto err_crypto_info;
 
-	ctx->conf = conf;
+	if (tx)
+		ctx->tx_conf = conf;
+	else
+		ctx->rx_conf = conf;
 	update_sk_prot(sk, ctx);
 	if (tx) {
 		ctx->sk_write_space = sk->sk_write_space;
@@ -535,7 +531,8 @@ static int tls_hw_prot(struct sock *sk)
 			ctx->hash = sk->sk_prot->hash;
 			ctx->unhash = sk->sk_prot->unhash;
 			ctx->sk_proto_close = sk->sk_prot->close;
-			ctx->conf = TLS_HW_RECORD;
+			ctx->rx_conf = TLS_HW_RECORD;
+			ctx->tx_conf = TLS_HW_RECORD;
 			update_sk_prot(sk, ctx);
 			rc = 1;
 			break;
@@ -579,29 +576,30 @@ static int tls_hw_hash(struct sock *sk)
 	return err;
 }
 
-static void build_protos(struct proto *prot, struct proto *base)
+static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
+			 struct proto *base)
 {
-	prot[TLS_BASE] = *base;
-	prot[TLS_BASE].setsockopt	= tls_setsockopt;
-	prot[TLS_BASE].getsockopt	= tls_getsockopt;
-	prot[TLS_BASE].close		= tls_sk_proto_close;
-
-	prot[TLS_SW_TX] = prot[TLS_BASE];
-	prot[TLS_SW_TX].sendmsg		= tls_sw_sendmsg;
-	prot[TLS_SW_TX].sendpage	= tls_sw_sendpage;
-
-	prot[TLS_SW_RX] = prot[TLS_BASE];
-	prot[TLS_SW_RX].recvmsg		= tls_sw_recvmsg;
-	prot[TLS_SW_RX].close		= tls_sk_proto_close;
-
-	prot[TLS_SW_RXTX] = prot[TLS_SW_TX];
-	prot[TLS_SW_RXTX].recvmsg	= tls_sw_recvmsg;
-	prot[TLS_SW_RXTX].close		= tls_sk_proto_close;
-
-	prot[TLS_HW_RECORD] = *base;
-	prot[TLS_HW_RECORD].hash	= tls_hw_hash;
-	prot[TLS_HW_RECORD].unhash	= tls_hw_unhash;
-	prot[TLS_HW_RECORD].close	= tls_sk_proto_close;
+	prot[TLS_BASE][TLS_BASE] = *base;
+	prot[TLS_BASE][TLS_BASE].setsockopt	= tls_setsockopt;
+	prot[TLS_BASE][TLS_BASE].getsockopt	= tls_getsockopt;
+	prot[TLS_BASE][TLS_BASE].close		= tls_sk_proto_close;
+
+	prot[TLS_SW][TLS_BASE] = prot[TLS_BASE][TLS_BASE];
+	prot[TLS_SW][TLS_BASE].sendmsg		= tls_sw_sendmsg;
+	prot[TLS_SW][TLS_BASE].sendpage		= tls_sw_sendpage;
+
+	prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE];
+	prot[TLS_BASE][TLS_SW].recvmsg		= tls_sw_recvmsg;
+	prot[TLS_BASE][TLS_SW].close		= tls_sk_proto_close;
+
+	prot[TLS_SW][TLS_SW] = prot[TLS_SW][TLS_BASE];
+	prot[TLS_SW][TLS_SW].recvmsg	= tls_sw_recvmsg;
+	prot[TLS_SW][TLS_SW].close	= tls_sk_proto_close;
+
+	prot[TLS_HW_RECORD][TLS_HW_RECORD] = *base;
+	prot[TLS_HW_RECORD][TLS_HW_RECORD].hash		= tls_hw_hash;
+	prot[TLS_HW_RECORD][TLS_HW_RECORD].unhash	= tls_hw_unhash;
+	prot[TLS_HW_RECORD][TLS_HW_RECORD].close	= tls_sk_proto_close;
 }
 
 static int tls_init(struct sock *sk)
@@ -643,7 +641,8 @@ static int tls_init(struct sock *sk)
 		mutex_unlock(&tcpv6_prot_mutex);
 	}
 
-	ctx->conf = TLS_BASE;
+	ctx->tx_conf = TLS_BASE;
+	ctx->rx_conf = TLS_BASE;
 	update_sk_prot(sk, ctx);
 out:
 	return rc;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 6ed1c02cfc94..5c3909c311f1 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -52,7 +52,7 @@ static int tls_do_decryption(struct sock *sk,
 			     gfp_t flags)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
-	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
 	struct strp_msg *rxm = strp_msg(skb);
 	struct aead_request *aead_req;
 
@@ -122,7 +122,7 @@ out:
 static void trim_both_sgl(struct sock *sk, int target_size)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
-	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
 
 	trim_sg(sk, ctx->sg_plaintext_data,
 		&ctx->sg_plaintext_num_elem,
@@ -141,7 +141,7 @@ static void trim_both_sgl(struct sock *sk, int target_size)
 static int alloc_encrypted_sg(struct sock *sk, int len)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
-	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
 	int rc = 0;
 
 	rc = sk_alloc_sg(sk, len,
@@ -155,7 +155,7 @@ static int alloc_encrypted_sg(struct sock *sk, int len)
 static int alloc_plaintext_sg(struct sock *sk, int len)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
-	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
 	int rc = 0;
 
 	rc = sk_alloc_sg(sk, len, ctx->sg_plaintext_data, 0,
@@ -181,7 +181,7 @@ static void free_sg(struct sock *sk, struct scatterlist *sg,
 static void tls_free_both_sg(struct sock *sk)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
-	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
 
 	free_sg(sk, ctx->sg_encrypted_data, &ctx->sg_encrypted_num_elem,
 		&ctx->sg_encrypted_size);
@@ -191,7 +191,7 @@ static void tls_free_both_sg(struct sock *sk)
 }
 
 static int tls_do_encryption(struct tls_context *tls_ctx,
-			     struct tls_sw_context *ctx, size_t data_len,
+			     struct tls_sw_context_tx *ctx, size_t data_len,
 			     gfp_t flags)
 {
 	unsigned int req_size = sizeof(struct aead_request) +
@@ -227,7 +227,7 @@ static int tls_push_record(struct sock *sk, int flags,
 			   unsigned char record_type)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
-	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
 	int rc;
 
 	sg_mark_end(ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem - 1);
@@ -339,7 +339,7 @@ static int memcopy_from_iter(struct sock *sk, struct iov_iter *from,
 			     int bytes)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
-	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
 	struct scatterlist *sg = ctx->sg_plaintext_data;
 	int copy, i, rc = 0;
 
@@ -367,7 +367,7 @@ out:
 int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
-	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
 	int ret = 0;
 	int required_size;
 	long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
@@ -522,7 +522,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
 		    int offset, size_t size, int flags)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
-	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
 	int ret = 0;
 	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 	bool eor;
@@ -636,7 +636,7 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags,
 				     long timeo, int *err)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
-	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
 	struct sk_buff *skb;
 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 
@@ -674,7 +674,7 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb,
 		       struct scatterlist *sgout)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
-	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
 	char iv[TLS_CIPHER_AES_GCM_128_SALT_SIZE + MAX_IV_SIZE];
 	struct scatterlist sgin_arr[MAX_SKB_FRAGS + 2];
 	struct scatterlist *sgin = &sgin_arr[0];
@@ -723,7 +723,7 @@ static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb,
 			       unsigned int len)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
-	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
 	struct strp_msg *rxm = strp_msg(skb);
 
 	if (len < rxm->full_len) {
@@ -749,7 +749,7 @@ int tls_sw_recvmsg(struct sock *sk,
 		   int *addr_len)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
-	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
 	unsigned char control;
 	struct strp_msg *rxm;
 	struct sk_buff *skb;
@@ -869,7 +869,7 @@ ssize_t tls_sw_splice_read(struct socket *sock,  loff_t *ppos,
 			   size_t len, unsigned int flags)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sock->sk);
-	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
 	struct strp_msg *rxm = NULL;
 	struct sock *sk = sock->sk;
 	struct sk_buff *skb;
@@ -922,7 +922,7 @@ unsigned int tls_sw_poll(struct file *file, struct socket *sock,
 	unsigned int ret;
 	struct sock *sk = sock->sk;
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
-	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
 
 	/* Grab POLLOUT and POLLHUP from the underlying socket */
 	ret = ctx->sk_poll(file, sock, wait);
@@ -938,7 +938,7 @@ unsigned int tls_sw_poll(struct file *file, struct socket *sock,
 static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
-	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
 	char header[tls_ctx->rx.prepend_size];
 	struct strp_msg *rxm = strp_msg(skb);
 	size_t cipher_overhead;
@@ -987,7 +987,7 @@ read_failure:
 static void tls_queue(struct strparser *strp, struct sk_buff *skb)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
-	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
 	struct strp_msg *rxm;
 
 	rxm = strp_msg(skb);
@@ -1003,18 +1003,28 @@ static void tls_queue(struct strparser *strp, struct sk_buff *skb)
 static void tls_data_ready(struct sock *sk)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
-	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
 
 	strp_data_ready(&ctx->strp);
 }
 
-void tls_sw_free_resources(struct sock *sk)
+void tls_sw_free_resources_tx(struct sock *sk)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
-	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
 
 	if (ctx->aead_send)
 		crypto_free_aead(ctx->aead_send);
+	tls_free_both_sg(sk);
+
+	kfree(ctx);
+}
+
+void tls_sw_free_resources_rx(struct sock *sk)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+
 	if (ctx->aead_recv) {
 		if (ctx->recv_pkt) {
 			kfree_skb(ctx->recv_pkt);
@@ -1030,10 +1040,7 @@ void tls_sw_free_resources(struct sock *sk)
 		lock_sock(sk);
 	}
 
-	tls_free_both_sg(sk);
-
 	kfree(ctx);
-	kfree(tls_ctx);
 }
 
 int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
@@ -1041,7 +1048,8 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 	char keyval[TLS_CIPHER_AES_GCM_128_KEY_SIZE];
 	struct tls_crypto_info *crypto_info;
 	struct tls12_crypto_info_aes_gcm_128 *gcm_128_info;
-	struct tls_sw_context *sw_ctx;
+	struct tls_sw_context_tx *sw_ctx_tx = NULL;
+	struct tls_sw_context_rx *sw_ctx_rx = NULL;
 	struct cipher_context *cctx;
 	struct crypto_aead **aead;
 	struct strp_callbacks cb;
@@ -1054,27 +1062,32 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 		goto out;
 	}
 
-	if (!ctx->priv_ctx) {
-		sw_ctx = kzalloc(sizeof(*sw_ctx), GFP_KERNEL);
-		if (!sw_ctx) {
+	if (tx) {
+		sw_ctx_tx = kzalloc(sizeof(*sw_ctx_tx), GFP_KERNEL);
+		if (!sw_ctx_tx) {
 			rc = -ENOMEM;
 			goto out;
 		}
-		crypto_init_wait(&sw_ctx->async_wait);
+		crypto_init_wait(&sw_ctx_tx->async_wait);
+		ctx->priv_ctx_tx = sw_ctx_tx;
 	} else {
-		sw_ctx = ctx->priv_ctx;
+		sw_ctx_rx = kzalloc(sizeof(*sw_ctx_rx), GFP_KERNEL);
+		if (!sw_ctx_rx) {
+			rc = -ENOMEM;
+			goto out;
+		}
+		crypto_init_wait(&sw_ctx_rx->async_wait);
+		ctx->priv_ctx_rx = sw_ctx_rx;
 	}
 
-	ctx->priv_ctx = (struct tls_offload_context *)sw_ctx;
-
 	if (tx) {
 		crypto_info = &ctx->crypto_send;
 		cctx = &ctx->tx;
-		aead = &sw_ctx->aead_send;
+		aead = &sw_ctx_tx->aead_send;
 	} else {
 		crypto_info = &ctx->crypto_recv;
 		cctx = &ctx->rx;
-		aead = &sw_ctx->aead_recv;
+		aead = &sw_ctx_rx->aead_recv;
 	}
 
 	switch (crypto_info->cipher_type) {
@@ -1121,22 +1134,24 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 	}
 	memcpy(cctx->rec_seq, rec_seq, rec_seq_size);
 
-	if (tx) {
-		sg_init_table(sw_ctx->sg_encrypted_data,
-			      ARRAY_SIZE(sw_ctx->sg_encrypted_data));
-		sg_init_table(sw_ctx->sg_plaintext_data,
-			      ARRAY_SIZE(sw_ctx->sg_plaintext_data));
-
-		sg_init_table(sw_ctx->sg_aead_in, 2);
-		sg_set_buf(&sw_ctx->sg_aead_in[0], sw_ctx->aad_space,
-			   sizeof(sw_ctx->aad_space));
-		sg_unmark_end(&sw_ctx->sg_aead_in[1]);
-		sg_chain(sw_ctx->sg_aead_in, 2, sw_ctx->sg_plaintext_data);
-		sg_init_table(sw_ctx->sg_aead_out, 2);
-		sg_set_buf(&sw_ctx->sg_aead_out[0], sw_ctx->aad_space,
-			   sizeof(sw_ctx->aad_space));
-		sg_unmark_end(&sw_ctx->sg_aead_out[1]);
-		sg_chain(sw_ctx->sg_aead_out, 2, sw_ctx->sg_encrypted_data);
+	if (sw_ctx_tx) {
+		sg_init_table(sw_ctx_tx->sg_encrypted_data,
+			      ARRAY_SIZE(sw_ctx_tx->sg_encrypted_data));
+		sg_init_table(sw_ctx_tx->sg_plaintext_data,
+			      ARRAY_SIZE(sw_ctx_tx->sg_plaintext_data));
+
+		sg_init_table(sw_ctx_tx->sg_aead_in, 2);
+		sg_set_buf(&sw_ctx_tx->sg_aead_in[0], sw_ctx_tx->aad_space,
+			   sizeof(sw_ctx_tx->aad_space));
+		sg_unmark_end(&sw_ctx_tx->sg_aead_in[1]);
+		sg_chain(sw_ctx_tx->sg_aead_in, 2,
+			 sw_ctx_tx->sg_plaintext_data);
+		sg_init_table(sw_ctx_tx->sg_aead_out, 2);
+		sg_set_buf(&sw_ctx_tx->sg_aead_out[0], sw_ctx_tx->aad_space,
+			   sizeof(sw_ctx_tx->aad_space));
+		sg_unmark_end(&sw_ctx_tx->sg_aead_out[1]);
+		sg_chain(sw_ctx_tx->sg_aead_out, 2,
+			 sw_ctx_tx->sg_encrypted_data);
 	}
 
 	if (!*aead) {
@@ -1161,22 +1176,22 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 	if (rc)
 		goto free_aead;
 
-	if (!tx) {
+	if (sw_ctx_rx) {
 		/* Set up strparser */
 		memset(&cb, 0, sizeof(cb));
 		cb.rcv_msg = tls_queue;
 		cb.parse_msg = tls_read_size;
 
-		strp_init(&sw_ctx->strp, sk, &cb);
+		strp_init(&sw_ctx_rx->strp, sk, &cb);
 
 		write_lock_bh(&sk->sk_callback_lock);
-		sw_ctx->saved_data_ready = sk->sk_data_ready;
+		sw_ctx_rx->saved_data_ready = sk->sk_data_ready;
 		sk->sk_data_ready = tls_data_ready;
 		write_unlock_bh(&sk->sk_callback_lock);
 
-		sw_ctx->sk_poll = sk->sk_socket->ops->poll;
+		sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll;
 
-		strp_check_rcv(&sw_ctx->strp);
+		strp_check_rcv(&sw_ctx_rx->strp);
 	}
 
 	goto out;
@@ -1188,11 +1203,16 @@ free_rec_seq:
 	kfree(cctx->rec_seq);
 	cctx->rec_seq = NULL;
 free_iv:
-	kfree(ctx->tx.iv);
-	ctx->tx.iv = NULL;
+	kfree(cctx->iv);
+	cctx->iv = NULL;
 free_priv:
-	kfree(ctx->priv_ctx);
-	ctx->priv_ctx = NULL;
+	if (tx) {
+		kfree(ctx->priv_ctx_tx);
+		ctx->priv_ctx_tx = NULL;
+	} else {
+		kfree(ctx->priv_ctx_rx);
+		ctx->priv_ctx_rx = NULL;
+	}
 out:
 	return rc;
 }
-- 
cgit v1.2.3-59-g8ed1b


From e8f69799810c32dd40c6724d829eccc70baad07f Mon Sep 17 00:00:00 2001
From: Ilya Lesokhin <ilyal@mellanox.com>
Date: Mon, 30 Apr 2018 10:16:16 +0300
Subject: net/tls: Add generic NIC offload infrastructure

This patch adds a generic infrastructure to offload TLS crypto to a
network device. It enables the kernel TLS socket to skip encryption
and authentication operations on the transmit side of the data path.
Leaving those computationally expensive operations to the NIC.

The NIC offload infrastructure builds TLS records and pushes them to
the TCP layer just like the SW KTLS implementation and using the same
API.
TCP segmentation is mostly unaffected. Currently the only exception is
that we prevent mixed SKBs where only part of the payload requires
offload. In the future we are likely to add a similar restriction
following a change cipher spec record.

The notable differences between SW KTLS and NIC offloaded TLS
implementations are as follows:
1. The offloaded implementation builds "plaintext TLS record", those
records contain plaintext instead of ciphertext and place holder bytes
instead of authentication tags.
2. The offloaded implementation maintains a mapping from TCP sequence
number to TLS records. Thus given a TCP SKB sent from a NIC offloaded
TLS socket, we can use the tls NIC offload infrastructure to obtain
enough context to encrypt the payload of the SKB.
A TLS record is released when the last byte of the record is ack'ed,
this is done through the new icsk_clean_acked callback.

The infrastructure should be extendable to support various NIC offload
implementations.  However it is currently written with the
implementation below in mind:
The NIC assumes that packets from each offloaded stream are sent as
plaintext and in-order. It keeps track of the TLS records in the TCP
stream. When a packet marked for offload is transmitted, the NIC
encrypts the payload in-place and puts authentication tags in the
relevant place holders.

The responsibility for handling out-of-order packets (i.e. TCP
retransmission, qdisc drops) falls on the netdev driver.

The netdev driver keeps track of the expected TCP SN from the NIC's
perspective.  If the next packet to transmit matches the expected TCP
SN, the driver advances the expected TCP SN, and transmits the packet
with TLS offload indication.

If the next packet to transmit does not match the expected TCP SN. The
driver calls the TLS layer to obtain the TLS record that includes the
TCP of the packet for transmission. Using this TLS record, the driver
posts a work entry on the transmit queue to reconstruct the NIC TLS
state required for the offload of the out-of-order packet. It updates
the expected TCP SN accordingly and transmits the now in-order packet.
The same queue is used for packet transmission and TLS context
reconstruction to avoid the need for flushing the transmit queue before
issuing the context reconstruction request.

Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com>
Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: Aviad Yehezkel <aviadye@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h             |  69 +++-
 net/tls/Kconfig               |  10 +
 net/tls/Makefile              |   2 +
 net/tls/tls_device.c          | 764 ++++++++++++++++++++++++++++++++++++++++++
 net/tls/tls_device_fallback.c | 450 +++++++++++++++++++++++++
 net/tls/tls_main.c            |  42 ++-
 6 files changed, 1332 insertions(+), 5 deletions(-)
 create mode 100644 net/tls/tls_device.c
 create mode 100644 net/tls/tls_device_fallback.c

diff --git a/include/net/tls.h b/include/net/tls.h
index 95a8c60b36be..8c56809eb384 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -116,6 +116,37 @@ struct tls_sw_context_rx {
 	bool decrypted;
 };
 
+struct tls_record_info {
+	struct list_head list;
+	u32 end_seq;
+	int len;
+	int num_frags;
+	skb_frag_t frags[MAX_SKB_FRAGS];
+};
+
+struct tls_offload_context {
+	struct crypto_aead *aead_send;
+	spinlock_t lock;	/* protects records list */
+	struct list_head records_list;
+	struct tls_record_info *open_record;
+	struct tls_record_info *retransmit_hint;
+	u64 hint_record_sn;
+	u64 unacked_record_sn;
+
+	struct scatterlist sg_tx_data[MAX_SKB_FRAGS];
+	void (*sk_destruct)(struct sock *sk);
+	u8 driver_state[];
+	/* The TLS layer reserves room for driver specific state
+	 * Currently the belief is that there is not enough
+	 * driver specific state to justify another layer of indirection
+	 */
+#define TLS_DRIVER_STATE_SIZE (max_t(size_t, 8, sizeof(void *)))
+};
+
+#define TLS_OFFLOAD_CONTEXT_SIZE                                               \
+	(ALIGN(sizeof(struct tls_offload_context), sizeof(void *)) +           \
+	 TLS_DRIVER_STATE_SIZE)
+
 enum {
 	TLS_PENDING_CLOSED_RECORD
 };
@@ -195,9 +226,28 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
 			   struct pipe_inode_info *pipe,
 			   size_t len, unsigned int flags);
 
-void tls_sk_destruct(struct sock *sk, struct tls_context *ctx);
-void tls_icsk_clean_acked(struct sock *sk);
+int tls_set_device_offload(struct sock *sk, struct tls_context *ctx);
+int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
+int tls_device_sendpage(struct sock *sk, struct page *page,
+			int offset, size_t size, int flags);
+void tls_device_sk_destruct(struct sock *sk);
+void tls_device_init(void);
+void tls_device_cleanup(void);
+
+struct tls_record_info *tls_get_record(struct tls_offload_context *context,
+				       u32 seq, u64 *p_record_sn);
+
+static inline bool tls_record_is_start_marker(struct tls_record_info *rec)
+{
+	return rec->len == 0;
+}
+
+static inline u32 tls_record_start_seq(struct tls_record_info *rec)
+{
+	return rec->end_seq - rec->len;
+}
 
+void tls_sk_destruct(struct sock *sk, struct tls_context *ctx);
 int tls_push_sg(struct sock *sk, struct tls_context *ctx,
 		struct scatterlist *sg, u16 first_offset,
 		int flags);
@@ -234,6 +284,13 @@ static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx)
 	return tls_ctx->pending_open_record_frags;
 }
 
+static inline bool tls_is_sk_tx_device_offloaded(struct sock *sk)
+{
+	return sk_fullsock(sk) &&
+	       /* matches smp_store_release in tls_set_device_offload */
+	       smp_load_acquire(&sk->sk_destruct) == &tls_device_sk_destruct;
+}
+
 static inline void tls_err_abort(struct sock *sk, int err)
 {
 	sk->sk_err = err;
@@ -329,4 +386,12 @@ int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg,
 void tls_register_device(struct tls_device *device);
 void tls_unregister_device(struct tls_device *device);
 
+struct sk_buff *tls_validate_xmit_skb(struct sock *sk,
+				      struct net_device *dev,
+				      struct sk_buff *skb);
+
+int tls_sw_fallback_init(struct sock *sk,
+			 struct tls_offload_context *offload_ctx,
+			 struct tls_crypto_info *crypto_info);
+
 #endif /* _TLS_OFFLOAD_H */
diff --git a/net/tls/Kconfig b/net/tls/Kconfig
index 89b8745a986f..73f05ece53d0 100644
--- a/net/tls/Kconfig
+++ b/net/tls/Kconfig
@@ -14,3 +14,13 @@ config TLS
 	encryption handling of the TLS protocol to be done in-kernel.
 
 	If unsure, say N.
+
+config TLS_DEVICE
+	bool "Transport Layer Security HW offload"
+	depends on TLS
+	select SOCK_VALIDATE_XMIT
+	default n
+	help
+	Enable kernel support for HW offload of the TLS protocol.
+
+	If unsure, say N.
diff --git a/net/tls/Makefile b/net/tls/Makefile
index a930fd1c4f7b..4d6b728a67d0 100644
--- a/net/tls/Makefile
+++ b/net/tls/Makefile
@@ -5,3 +5,5 @@
 obj-$(CONFIG_TLS) += tls.o
 
 tls-y := tls_main.o tls_sw.o
+
+tls-$(CONFIG_TLS_DEVICE) += tls_device.o tls_device_fallback.o
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
new file mode 100644
index 000000000000..ac45d6224e41
--- /dev/null
+++ b/net/tls/tls_device.c
@@ -0,0 +1,764 @@
+/* Copyright (c) 2018, Mellanox Technologies All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <crypto/aead.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <net/dst.h>
+#include <net/inet_connection_sock.h>
+#include <net/tcp.h>
+#include <net/tls.h>
+
+/* device_offload_lock is used to synchronize tls_dev_add
+ * against NETDEV_DOWN notifications.
+ */
+static DECLARE_RWSEM(device_offload_lock);
+
+static void tls_device_gc_task(struct work_struct *work);
+
+static DECLARE_WORK(tls_device_gc_work, tls_device_gc_task);
+static LIST_HEAD(tls_device_gc_list);
+static LIST_HEAD(tls_device_list);
+static DEFINE_SPINLOCK(tls_device_lock);
+
+static void tls_device_free_ctx(struct tls_context *ctx)
+{
+	struct tls_offload_context *offload_ctx = tls_offload_ctx(ctx);
+
+	kfree(offload_ctx);
+	kfree(ctx);
+}
+
+static void tls_device_gc_task(struct work_struct *work)
+{
+	struct tls_context *ctx, *tmp;
+	unsigned long flags;
+	LIST_HEAD(gc_list);
+
+	spin_lock_irqsave(&tls_device_lock, flags);
+	list_splice_init(&tls_device_gc_list, &gc_list);
+	spin_unlock_irqrestore(&tls_device_lock, flags);
+
+	list_for_each_entry_safe(ctx, tmp, &gc_list, list) {
+		struct net_device *netdev = ctx->netdev;
+
+		if (netdev) {
+			netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
+							TLS_OFFLOAD_CTX_DIR_TX);
+			dev_put(netdev);
+		}
+
+		list_del(&ctx->list);
+		tls_device_free_ctx(ctx);
+	}
+}
+
+static void tls_device_queue_ctx_destruction(struct tls_context *ctx)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&tls_device_lock, flags);
+	list_move_tail(&ctx->list, &tls_device_gc_list);
+
+	/* schedule_work inside the spinlock
+	 * to make sure tls_device_down waits for that work.
+	 */
+	schedule_work(&tls_device_gc_work);
+
+	spin_unlock_irqrestore(&tls_device_lock, flags);
+}
+
+/* We assume that the socket is already connected */
+static struct net_device *get_netdev_for_sock(struct sock *sk)
+{
+	struct dst_entry *dst = sk_dst_get(sk);
+	struct net_device *netdev = NULL;
+
+	if (likely(dst)) {
+		netdev = dst->dev;
+		dev_hold(netdev);
+	}
+
+	dst_release(dst);
+
+	return netdev;
+}
+
+static void destroy_record(struct tls_record_info *record)
+{
+	int nr_frags = record->num_frags;
+	skb_frag_t *frag;
+
+	while (nr_frags-- > 0) {
+		frag = &record->frags[nr_frags];
+		__skb_frag_unref(frag);
+	}
+	kfree(record);
+}
+
+static void delete_all_records(struct tls_offload_context *offload_ctx)
+{
+	struct tls_record_info *info, *temp;
+
+	list_for_each_entry_safe(info, temp, &offload_ctx->records_list, list) {
+		list_del(&info->list);
+		destroy_record(info);
+	}
+
+	offload_ctx->retransmit_hint = NULL;
+}
+
+static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_record_info *info, *temp;
+	struct tls_offload_context *ctx;
+	u64 deleted_records = 0;
+	unsigned long flags;
+
+	if (!tls_ctx)
+		return;
+
+	ctx = tls_offload_ctx(tls_ctx);
+
+	spin_lock_irqsave(&ctx->lock, flags);
+	info = ctx->retransmit_hint;
+	if (info && !before(acked_seq, info->end_seq)) {
+		ctx->retransmit_hint = NULL;
+		list_del(&info->list);
+		destroy_record(info);
+		deleted_records++;
+	}
+
+	list_for_each_entry_safe(info, temp, &ctx->records_list, list) {
+		if (before(acked_seq, info->end_seq))
+			break;
+		list_del(&info->list);
+
+		destroy_record(info);
+		deleted_records++;
+	}
+
+	ctx->unacked_record_sn += deleted_records;
+	spin_unlock_irqrestore(&ctx->lock, flags);
+}
+
+/* At this point, there should be no references on this
+ * socket and no in-flight SKBs associated with this
+ * socket, so it is safe to free all the resources.
+ */
+void tls_device_sk_destruct(struct sock *sk)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
+
+	if (ctx->open_record)
+		destroy_record(ctx->open_record);
+
+	delete_all_records(ctx);
+	crypto_free_aead(ctx->aead_send);
+	ctx->sk_destruct(sk);
+	clean_acked_data_disable(inet_csk(sk));
+
+	if (refcount_dec_and_test(&tls_ctx->refcount))
+		tls_device_queue_ctx_destruction(tls_ctx);
+}
+EXPORT_SYMBOL(tls_device_sk_destruct);
+
+static void tls_append_frag(struct tls_record_info *record,
+			    struct page_frag *pfrag,
+			    int size)
+{
+	skb_frag_t *frag;
+
+	frag = &record->frags[record->num_frags - 1];
+	if (frag->page.p == pfrag->page &&
+	    frag->page_offset + frag->size == pfrag->offset) {
+		frag->size += size;
+	} else {
+		++frag;
+		frag->page.p = pfrag->page;
+		frag->page_offset = pfrag->offset;
+		frag->size = size;
+		++record->num_frags;
+		get_page(pfrag->page);
+	}
+
+	pfrag->offset += size;
+	record->len += size;
+}
+
+static int tls_push_record(struct sock *sk,
+			   struct tls_context *ctx,
+			   struct tls_offload_context *offload_ctx,
+			   struct tls_record_info *record,
+			   struct page_frag *pfrag,
+			   int flags,
+			   unsigned char record_type)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct page_frag dummy_tag_frag;
+	skb_frag_t *frag;
+	int i;
+
+	/* fill prepend */
+	frag = &record->frags[0];
+	tls_fill_prepend(ctx,
+			 skb_frag_address(frag),
+			 record->len - ctx->tx.prepend_size,
+			 record_type);
+
+	/* HW doesn't care about the data in the tag, because it fills it. */
+	dummy_tag_frag.page = skb_frag_page(frag);
+	dummy_tag_frag.offset = 0;
+
+	tls_append_frag(record, &dummy_tag_frag, ctx->tx.tag_size);
+	record->end_seq = tp->write_seq + record->len;
+	spin_lock_irq(&offload_ctx->lock);
+	list_add_tail(&record->list, &offload_ctx->records_list);
+	spin_unlock_irq(&offload_ctx->lock);
+	offload_ctx->open_record = NULL;
+	set_bit(TLS_PENDING_CLOSED_RECORD, &ctx->flags);
+	tls_advance_record_sn(sk, &ctx->tx);
+
+	for (i = 0; i < record->num_frags; i++) {
+		frag = &record->frags[i];
+		sg_unmark_end(&offload_ctx->sg_tx_data[i]);
+		sg_set_page(&offload_ctx->sg_tx_data[i], skb_frag_page(frag),
+			    frag->size, frag->page_offset);
+		sk_mem_charge(sk, frag->size);
+		get_page(skb_frag_page(frag));
+	}
+	sg_mark_end(&offload_ctx->sg_tx_data[record->num_frags - 1]);
+
+	/* all ready, send */
+	return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags);
+}
+
+static int tls_create_new_record(struct tls_offload_context *offload_ctx,
+				 struct page_frag *pfrag,
+				 size_t prepend_size)
+{
+	struct tls_record_info *record;
+	skb_frag_t *frag;
+
+	record = kmalloc(sizeof(*record), GFP_KERNEL);
+	if (!record)
+		return -ENOMEM;
+
+	frag = &record->frags[0];
+	__skb_frag_set_page(frag, pfrag->page);
+	frag->page_offset = pfrag->offset;
+	skb_frag_size_set(frag, prepend_size);
+
+	get_page(pfrag->page);
+	pfrag->offset += prepend_size;
+
+	record->num_frags = 1;
+	record->len = prepend_size;
+	offload_ctx->open_record = record;
+	return 0;
+}
+
+static int tls_do_allocation(struct sock *sk,
+			     struct tls_offload_context *offload_ctx,
+			     struct page_frag *pfrag,
+			     size_t prepend_size)
+{
+	int ret;
+
+	if (!offload_ctx->open_record) {
+		if (unlikely(!skb_page_frag_refill(prepend_size, pfrag,
+						   sk->sk_allocation))) {
+			sk->sk_prot->enter_memory_pressure(sk);
+			sk_stream_moderate_sndbuf(sk);
+			return -ENOMEM;
+		}
+
+		ret = tls_create_new_record(offload_ctx, pfrag, prepend_size);
+		if (ret)
+			return ret;
+
+		if (pfrag->size > pfrag->offset)
+			return 0;
+	}
+
+	if (!sk_page_frag_refill(sk, pfrag))
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int tls_push_data(struct sock *sk,
+			 struct iov_iter *msg_iter,
+			 size_t size, int flags,
+			 unsigned char record_type)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
+	int tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST;
+	int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE);
+	struct tls_record_info *record = ctx->open_record;
+	struct page_frag *pfrag;
+	size_t orig_size = size;
+	u32 max_open_record_len;
+	int copy, rc = 0;
+	bool done = false;
+	long timeo;
+
+	if (flags &
+	    ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_NOTLAST))
+		return -ENOTSUPP;
+
+	if (sk->sk_err)
+		return -sk->sk_err;
+
+	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+	rc = tls_complete_pending_work(sk, tls_ctx, flags, &timeo);
+	if (rc < 0)
+		return rc;
+
+	pfrag = sk_page_frag(sk);
+
+	/* TLS_HEADER_SIZE is not counted as part of the TLS record, and
+	 * we need to leave room for an authentication tag.
+	 */
+	max_open_record_len = TLS_MAX_PAYLOAD_SIZE +
+			      tls_ctx->tx.prepend_size;
+	do {
+		rc = tls_do_allocation(sk, ctx, pfrag,
+				       tls_ctx->tx.prepend_size);
+		if (rc) {
+			rc = sk_stream_wait_memory(sk, &timeo);
+			if (!rc)
+				continue;
+
+			record = ctx->open_record;
+			if (!record)
+				break;
+handle_error:
+			if (record_type != TLS_RECORD_TYPE_DATA) {
+				/* avoid sending partial
+				 * record with type !=
+				 * application_data
+				 */
+				size = orig_size;
+				destroy_record(record);
+				ctx->open_record = NULL;
+			} else if (record->len > tls_ctx->tx.prepend_size) {
+				goto last_record;
+			}
+
+			break;
+		}
+
+		record = ctx->open_record;
+		copy = min_t(size_t, size, (pfrag->size - pfrag->offset));
+		copy = min_t(size_t, copy, (max_open_record_len - record->len));
+
+		if (copy_from_iter_nocache(page_address(pfrag->page) +
+					       pfrag->offset,
+					   copy, msg_iter) != copy) {
+			rc = -EFAULT;
+			goto handle_error;
+		}
+		tls_append_frag(record, pfrag, copy);
+
+		size -= copy;
+		if (!size) {
+last_record:
+			tls_push_record_flags = flags;
+			if (more) {
+				tls_ctx->pending_open_record_frags =
+						record->num_frags;
+				break;
+			}
+
+			done = true;
+		}
+
+		if (done || record->len >= max_open_record_len ||
+		    (record->num_frags >= MAX_SKB_FRAGS - 1)) {
+			rc = tls_push_record(sk,
+					     tls_ctx,
+					     ctx,
+					     record,
+					     pfrag,
+					     tls_push_record_flags,
+					     record_type);
+			if (rc < 0)
+				break;
+		}
+	} while (!done);
+
+	if (orig_size - size > 0)
+		rc = orig_size - size;
+
+	return rc;
+}
+
+int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+	unsigned char record_type = TLS_RECORD_TYPE_DATA;
+	int rc;
+
+	lock_sock(sk);
+
+	if (unlikely(msg->msg_controllen)) {
+		rc = tls_proccess_cmsg(sk, msg, &record_type);
+		if (rc)
+			goto out;
+	}
+
+	rc = tls_push_data(sk, &msg->msg_iter, size,
+			   msg->msg_flags, record_type);
+
+out:
+	release_sock(sk);
+	return rc;
+}
+
+int tls_device_sendpage(struct sock *sk, struct page *page,
+			int offset, size_t size, int flags)
+{
+	struct iov_iter	msg_iter;
+	char *kaddr = kmap(page);
+	struct kvec iov;
+	int rc;
+
+	if (flags & MSG_SENDPAGE_NOTLAST)
+		flags |= MSG_MORE;
+
+	lock_sock(sk);
+
+	if (flags & MSG_OOB) {
+		rc = -ENOTSUPP;
+		goto out;
+	}
+
+	iov.iov_base = kaddr + offset;
+	iov.iov_len = size;
+	iov_iter_kvec(&msg_iter, WRITE | ITER_KVEC, &iov, 1, size);
+	rc = tls_push_data(sk, &msg_iter, size,
+			   flags, TLS_RECORD_TYPE_DATA);
+	kunmap(page);
+
+out:
+	release_sock(sk);
+	return rc;
+}
+
+struct tls_record_info *tls_get_record(struct tls_offload_context *context,
+				       u32 seq, u64 *p_record_sn)
+{
+	u64 record_sn = context->hint_record_sn;
+	struct tls_record_info *info;
+
+	info = context->retransmit_hint;
+	if (!info ||
+	    before(seq, info->end_seq - info->len)) {
+		/* if retransmit_hint is irrelevant start
+		 * from the beggining of the list
+		 */
+		info = list_first_entry(&context->records_list,
+					struct tls_record_info, list);
+		record_sn = context->unacked_record_sn;
+	}
+
+	list_for_each_entry_from(info, &context->records_list, list) {
+		if (before(seq, info->end_seq)) {
+			if (!context->retransmit_hint ||
+			    after(info->end_seq,
+				  context->retransmit_hint->end_seq)) {
+				context->hint_record_sn = record_sn;
+				context->retransmit_hint = info;
+			}
+			*p_record_sn = record_sn;
+			return info;
+		}
+		record_sn++;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(tls_get_record);
+
+static int tls_device_push_pending_record(struct sock *sk, int flags)
+{
+	struct iov_iter	msg_iter;
+
+	iov_iter_kvec(&msg_iter, WRITE | ITER_KVEC, NULL, 0, 0);
+	return tls_push_data(sk, &msg_iter, 0, flags, TLS_RECORD_TYPE_DATA);
+}
+
+int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
+{
+	u16 nonce_size, tag_size, iv_size, rec_seq_size;
+	struct tls_record_info *start_marker_record;
+	struct tls_offload_context *offload_ctx;
+	struct tls_crypto_info *crypto_info;
+	struct net_device *netdev;
+	char *iv, *rec_seq;
+	struct sk_buff *skb;
+	int rc = -EINVAL;
+	__be64 rcd_sn;
+
+	if (!ctx)
+		goto out;
+
+	if (ctx->priv_ctx_tx) {
+		rc = -EEXIST;
+		goto out;
+	}
+
+	start_marker_record = kmalloc(sizeof(*start_marker_record), GFP_KERNEL);
+	if (!start_marker_record) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE, GFP_KERNEL);
+	if (!offload_ctx) {
+		rc = -ENOMEM;
+		goto free_marker_record;
+	}
+
+	crypto_info = &ctx->crypto_send;
+	switch (crypto_info->cipher_type) {
+	case TLS_CIPHER_AES_GCM_128:
+		nonce_size = TLS_CIPHER_AES_GCM_128_IV_SIZE;
+		tag_size = TLS_CIPHER_AES_GCM_128_TAG_SIZE;
+		iv_size = TLS_CIPHER_AES_GCM_128_IV_SIZE;
+		iv = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->iv;
+		rec_seq_size = TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE;
+		rec_seq =
+		 ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->rec_seq;
+		break;
+	default:
+		rc = -EINVAL;
+		goto free_offload_ctx;
+	}
+
+	ctx->tx.prepend_size = TLS_HEADER_SIZE + nonce_size;
+	ctx->tx.tag_size = tag_size;
+	ctx->tx.overhead_size = ctx->tx.prepend_size + ctx->tx.tag_size;
+	ctx->tx.iv_size = iv_size;
+	ctx->tx.iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
+			     GFP_KERNEL);
+	if (!ctx->tx.iv) {
+		rc = -ENOMEM;
+		goto free_offload_ctx;
+	}
+
+	memcpy(ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size);
+
+	ctx->tx.rec_seq_size = rec_seq_size;
+	ctx->tx.rec_seq = kmalloc(rec_seq_size, GFP_KERNEL);
+	if (!ctx->tx.rec_seq) {
+		rc = -ENOMEM;
+		goto free_iv;
+	}
+	memcpy(ctx->tx.rec_seq, rec_seq, rec_seq_size);
+
+	rc = tls_sw_fallback_init(sk, offload_ctx, crypto_info);
+	if (rc)
+		goto free_rec_seq;
+
+	/* start at rec_seq - 1 to account for the start marker record */
+	memcpy(&rcd_sn, ctx->tx.rec_seq, sizeof(rcd_sn));
+	offload_ctx->unacked_record_sn = be64_to_cpu(rcd_sn) - 1;
+
+	start_marker_record->end_seq = tcp_sk(sk)->write_seq;
+	start_marker_record->len = 0;
+	start_marker_record->num_frags = 0;
+
+	INIT_LIST_HEAD(&offload_ctx->records_list);
+	list_add_tail(&start_marker_record->list, &offload_ctx->records_list);
+	spin_lock_init(&offload_ctx->lock);
+
+	clean_acked_data_enable(inet_csk(sk), &tls_icsk_clean_acked);
+	ctx->push_pending_record = tls_device_push_pending_record;
+	offload_ctx->sk_destruct = sk->sk_destruct;
+
+	/* TLS offload is greatly simplified if we don't send
+	 * SKBs where only part of the payload needs to be encrypted.
+	 * So mark the last skb in the write queue as end of record.
+	 */
+	skb = tcp_write_queue_tail(sk);
+	if (skb)
+		TCP_SKB_CB(skb)->eor = 1;
+
+	refcount_set(&ctx->refcount, 1);
+
+	/* We support starting offload on multiple sockets
+	 * concurrently, so we only need a read lock here.
+	 * This lock must precede get_netdev_for_sock to prevent races between
+	 * NETDEV_DOWN and setsockopt.
+	 */
+	down_read(&device_offload_lock);
+	netdev = get_netdev_for_sock(sk);
+	if (!netdev) {
+		pr_err_ratelimited("%s: netdev not found\n", __func__);
+		rc = -EINVAL;
+		goto release_lock;
+	}
+
+	if (!(netdev->features & NETIF_F_HW_TLS_TX)) {
+		rc = -ENOTSUPP;
+		goto release_netdev;
+	}
+
+	/* Avoid offloading if the device is down
+	 * We don't want to offload new flows after
+	 * the NETDEV_DOWN event
+	 */
+	if (!(netdev->flags & IFF_UP)) {
+		rc = -EINVAL;
+		goto release_netdev;
+	}
+
+	ctx->priv_ctx_tx = offload_ctx;
+	rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_TX,
+					     &ctx->crypto_send,
+					     tcp_sk(sk)->write_seq);
+	if (rc)
+		goto release_netdev;
+
+	ctx->netdev = netdev;
+
+	spin_lock_irq(&tls_device_lock);
+	list_add_tail(&ctx->list, &tls_device_list);
+	spin_unlock_irq(&tls_device_lock);
+
+	sk->sk_validate_xmit_skb = tls_validate_xmit_skb;
+	/* following this assignment tls_is_sk_tx_device_offloaded
+	 * will return true and the context might be accessed
+	 * by the netdev's xmit function.
+	 */
+	smp_store_release(&sk->sk_destruct,
+			  &tls_device_sk_destruct);
+	up_read(&device_offload_lock);
+	goto out;
+
+release_netdev:
+	dev_put(netdev);
+release_lock:
+	up_read(&device_offload_lock);
+	clean_acked_data_disable(inet_csk(sk));
+	crypto_free_aead(offload_ctx->aead_send);
+free_rec_seq:
+	kfree(ctx->tx.rec_seq);
+free_iv:
+	kfree(ctx->tx.iv);
+free_offload_ctx:
+	kfree(offload_ctx);
+	ctx->priv_ctx_tx = NULL;
+free_marker_record:
+	kfree(start_marker_record);
+out:
+	return rc;
+}
+
+static int tls_device_down(struct net_device *netdev)
+{
+	struct tls_context *ctx, *tmp;
+	unsigned long flags;
+	LIST_HEAD(list);
+
+	/* Request a write lock to block new offload attempts */
+	down_write(&device_offload_lock);
+
+	spin_lock_irqsave(&tls_device_lock, flags);
+	list_for_each_entry_safe(ctx, tmp, &tls_device_list, list) {
+		if (ctx->netdev != netdev ||
+		    !refcount_inc_not_zero(&ctx->refcount))
+			continue;
+
+		list_move(&ctx->list, &list);
+	}
+	spin_unlock_irqrestore(&tls_device_lock, flags);
+
+	list_for_each_entry_safe(ctx, tmp, &list, list)	{
+		netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
+						TLS_OFFLOAD_CTX_DIR_TX);
+		ctx->netdev = NULL;
+		dev_put(netdev);
+		list_del_init(&ctx->list);
+
+		if (refcount_dec_and_test(&ctx->refcount))
+			tls_device_free_ctx(ctx);
+	}
+
+	up_write(&device_offload_lock);
+
+	flush_work(&tls_device_gc_work);
+
+	return NOTIFY_DONE;
+}
+
+static int tls_dev_event(struct notifier_block *this, unsigned long event,
+			 void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+	if (!(dev->features & NETIF_F_HW_TLS_TX))
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_REGISTER:
+	case NETDEV_FEAT_CHANGE:
+		if  (dev->tlsdev_ops &&
+		     dev->tlsdev_ops->tls_dev_add &&
+		     dev->tlsdev_ops->tls_dev_del)
+			return NOTIFY_DONE;
+		else
+			return NOTIFY_BAD;
+	case NETDEV_DOWN:
+		return tls_device_down(dev);
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block tls_dev_notifier = {
+	.notifier_call	= tls_dev_event,
+};
+
+void __init tls_device_init(void)
+{
+	register_netdevice_notifier(&tls_dev_notifier);
+}
+
+void __exit tls_device_cleanup(void)
+{
+	unregister_netdevice_notifier(&tls_dev_notifier);
+	flush_work(&tls_device_gc_work);
+}
diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c
new file mode 100644
index 000000000000..748914abdb60
--- /dev/null
+++ b/net/tls/tls_device_fallback.c
@@ -0,0 +1,450 @@
+/* Copyright (c) 2018, Mellanox Technologies All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <net/tls.h>
+#include <crypto/aead.h>
+#include <crypto/scatterwalk.h>
+#include <net/ip6_checksum.h>
+
+static void chain_to_walk(struct scatterlist *sg, struct scatter_walk *walk)
+{
+	struct scatterlist *src = walk->sg;
+	int diff = walk->offset - src->offset;
+
+	sg_set_page(sg, sg_page(src),
+		    src->length - diff, walk->offset);
+
+	scatterwalk_crypto_chain(sg, sg_next(src), 0, 2);
+}
+
+static int tls_enc_record(struct aead_request *aead_req,
+			  struct crypto_aead *aead, char *aad,
+			  char *iv, __be64 rcd_sn,
+			  struct scatter_walk *in,
+			  struct scatter_walk *out, int *in_len)
+{
+	unsigned char buf[TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE];
+	struct scatterlist sg_in[3];
+	struct scatterlist sg_out[3];
+	u16 len;
+	int rc;
+
+	len = min_t(int, *in_len, ARRAY_SIZE(buf));
+
+	scatterwalk_copychunks(buf, in, len, 0);
+	scatterwalk_copychunks(buf, out, len, 1);
+
+	*in_len -= len;
+	if (!*in_len)
+		return 0;
+
+	scatterwalk_pagedone(in, 0, 1);
+	scatterwalk_pagedone(out, 1, 1);
+
+	len = buf[4] | (buf[3] << 8);
+	len -= TLS_CIPHER_AES_GCM_128_IV_SIZE;
+
+	tls_make_aad(aad, len - TLS_CIPHER_AES_GCM_128_TAG_SIZE,
+		     (char *)&rcd_sn, sizeof(rcd_sn), buf[0]);
+
+	memcpy(iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, buf + TLS_HEADER_SIZE,
+	       TLS_CIPHER_AES_GCM_128_IV_SIZE);
+
+	sg_init_table(sg_in, ARRAY_SIZE(sg_in));
+	sg_init_table(sg_out, ARRAY_SIZE(sg_out));
+	sg_set_buf(sg_in, aad, TLS_AAD_SPACE_SIZE);
+	sg_set_buf(sg_out, aad, TLS_AAD_SPACE_SIZE);
+	chain_to_walk(sg_in + 1, in);
+	chain_to_walk(sg_out + 1, out);
+
+	*in_len -= len;
+	if (*in_len < 0) {
+		*in_len += TLS_CIPHER_AES_GCM_128_TAG_SIZE;
+		/* the input buffer doesn't contain the entire record.
+		 * trim len accordingly. The resulting authentication tag
+		 * will contain garbage, but we don't care, so we won't
+		 * include any of it in the output skb
+		 * Note that we assume the output buffer length
+		 * is larger then input buffer length + tag size
+		 */
+		if (*in_len < 0)
+			len += *in_len;
+
+		*in_len = 0;
+	}
+
+	if (*in_len) {
+		scatterwalk_copychunks(NULL, in, len, 2);
+		scatterwalk_pagedone(in, 0, 1);
+		scatterwalk_copychunks(NULL, out, len, 2);
+		scatterwalk_pagedone(out, 1, 1);
+	}
+
+	len -= TLS_CIPHER_AES_GCM_128_TAG_SIZE;
+	aead_request_set_crypt(aead_req, sg_in, sg_out, len, iv);
+
+	rc = crypto_aead_encrypt(aead_req);
+
+	return rc;
+}
+
+static void tls_init_aead_request(struct aead_request *aead_req,
+				  struct crypto_aead *aead)
+{
+	aead_request_set_tfm(aead_req, aead);
+	aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE);
+}
+
+static struct aead_request *tls_alloc_aead_request(struct crypto_aead *aead,
+						   gfp_t flags)
+{
+	unsigned int req_size = sizeof(struct aead_request) +
+		crypto_aead_reqsize(aead);
+	struct aead_request *aead_req;
+
+	aead_req = kzalloc(req_size, flags);
+	if (aead_req)
+		tls_init_aead_request(aead_req, aead);
+	return aead_req;
+}
+
+static int tls_enc_records(struct aead_request *aead_req,
+			   struct crypto_aead *aead, struct scatterlist *sg_in,
+			   struct scatterlist *sg_out, char *aad, char *iv,
+			   u64 rcd_sn, int len)
+{
+	struct scatter_walk out, in;
+	int rc;
+
+	scatterwalk_start(&in, sg_in);
+	scatterwalk_start(&out, sg_out);
+
+	do {
+		rc = tls_enc_record(aead_req, aead, aad, iv,
+				    cpu_to_be64(rcd_sn), &in, &out, &len);
+		rcd_sn++;
+
+	} while (rc == 0 && len);
+
+	scatterwalk_done(&in, 0, 0);
+	scatterwalk_done(&out, 1, 0);
+
+	return rc;
+}
+
+/* Can't use icsk->icsk_af_ops->send_check here because the ip addresses
+ * might have been changed by NAT.
+ */
+static void update_chksum(struct sk_buff *skb, int headln)
+{
+	struct tcphdr *th = tcp_hdr(skb);
+	int datalen = skb->len - headln;
+	const struct ipv6hdr *ipv6h;
+	const struct iphdr *iph;
+
+	/* We only changed the payload so if we are using partial we don't
+	 * need to update anything.
+	 */
+	if (likely(skb->ip_summed == CHECKSUM_PARTIAL))
+		return;
+
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	skb->csum_start = skb_transport_header(skb) - skb->head;
+	skb->csum_offset = offsetof(struct tcphdr, check);
+
+	if (skb->sk->sk_family == AF_INET6) {
+		ipv6h = ipv6_hdr(skb);
+		th->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
+					     datalen, IPPROTO_TCP, 0);
+	} else {
+		iph = ip_hdr(skb);
+		th->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen,
+					       IPPROTO_TCP, 0);
+	}
+}
+
+static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln)
+{
+	skb_copy_header(nskb, skb);
+
+	skb_put(nskb, skb->len);
+	memcpy(nskb->data, skb->data, headln);
+	update_chksum(nskb, headln);
+
+	nskb->destructor = skb->destructor;
+	nskb->sk = skb->sk;
+	skb->destructor = NULL;
+	skb->sk = NULL;
+	refcount_add(nskb->truesize - skb->truesize,
+		     &nskb->sk->sk_wmem_alloc);
+}
+
+/* This function may be called after the user socket is already
+ * closed so make sure we don't use anything freed during
+ * tls_sk_proto_close here
+ */
+
+static int fill_sg_in(struct scatterlist *sg_in,
+		      struct sk_buff *skb,
+		      struct tls_offload_context *ctx,
+		      u64 *rcd_sn,
+		      s32 *sync_size,
+		      int *resync_sgs)
+{
+	int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb);
+	int payload_len = skb->len - tcp_payload_offset;
+	u32 tcp_seq = ntohl(tcp_hdr(skb)->seq);
+	struct tls_record_info *record;
+	unsigned long flags;
+	int remaining;
+	int i;
+
+	spin_lock_irqsave(&ctx->lock, flags);
+	record = tls_get_record(ctx, tcp_seq, rcd_sn);
+	if (!record) {
+		spin_unlock_irqrestore(&ctx->lock, flags);
+		WARN(1, "Record not found for seq %u\n", tcp_seq);
+		return -EINVAL;
+	}
+
+	*sync_size = tcp_seq - tls_record_start_seq(record);
+	if (*sync_size < 0) {
+		int is_start_marker = tls_record_is_start_marker(record);
+
+		spin_unlock_irqrestore(&ctx->lock, flags);
+		/* This should only occur if the relevant record was
+		 * already acked. In that case it should be ok
+		 * to drop the packet and avoid retransmission.
+		 *
+		 * There is a corner case where the packet contains
+		 * both an acked and a non-acked record.
+		 * We currently don't handle that case and rely
+		 * on TCP to retranmit a packet that doesn't contain
+		 * already acked payload.
+		 */
+		if (!is_start_marker)
+			*sync_size = 0;
+		return -EINVAL;
+	}
+
+	remaining = *sync_size;
+	for (i = 0; remaining > 0; i++) {
+		skb_frag_t *frag = &record->frags[i];
+
+		__skb_frag_ref(frag);
+		sg_set_page(sg_in + i, skb_frag_page(frag),
+			    skb_frag_size(frag), frag->page_offset);
+
+		remaining -= skb_frag_size(frag);
+
+		if (remaining < 0)
+			sg_in[i].length += remaining;
+	}
+	*resync_sgs = i;
+
+	spin_unlock_irqrestore(&ctx->lock, flags);
+	if (skb_to_sgvec(skb, &sg_in[i], tcp_payload_offset, payload_len) < 0)
+		return -EINVAL;
+
+	return 0;
+}
+
+static void fill_sg_out(struct scatterlist sg_out[3], void *buf,
+			struct tls_context *tls_ctx,
+			struct sk_buff *nskb,
+			int tcp_payload_offset,
+			int payload_len,
+			int sync_size,
+			void *dummy_buf)
+{
+	sg_set_buf(&sg_out[0], dummy_buf, sync_size);
+	sg_set_buf(&sg_out[1], nskb->data + tcp_payload_offset, payload_len);
+	/* Add room for authentication tag produced by crypto */
+	dummy_buf += sync_size;
+	sg_set_buf(&sg_out[2], dummy_buf, TLS_CIPHER_AES_GCM_128_TAG_SIZE);
+}
+
+static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx,
+				   struct scatterlist sg_out[3],
+				   struct scatterlist *sg_in,
+				   struct sk_buff *skb,
+				   s32 sync_size, u64 rcd_sn)
+{
+	int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb);
+	struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
+	int payload_len = skb->len - tcp_payload_offset;
+	void *buf, *iv, *aad, *dummy_buf;
+	struct aead_request *aead_req;
+	struct sk_buff *nskb = NULL;
+	int buf_len;
+
+	aead_req = tls_alloc_aead_request(ctx->aead_send, GFP_ATOMIC);
+	if (!aead_req)
+		return NULL;
+
+	buf_len = TLS_CIPHER_AES_GCM_128_SALT_SIZE +
+		  TLS_CIPHER_AES_GCM_128_IV_SIZE +
+		  TLS_AAD_SPACE_SIZE +
+		  sync_size +
+		  TLS_CIPHER_AES_GCM_128_TAG_SIZE;
+	buf = kmalloc(buf_len, GFP_ATOMIC);
+	if (!buf)
+		goto free_req;
+
+	iv = buf;
+	memcpy(iv, tls_ctx->crypto_send_aes_gcm_128.salt,
+	       TLS_CIPHER_AES_GCM_128_SALT_SIZE);
+	aad = buf + TLS_CIPHER_AES_GCM_128_SALT_SIZE +
+	      TLS_CIPHER_AES_GCM_128_IV_SIZE;
+	dummy_buf = aad + TLS_AAD_SPACE_SIZE;
+
+	nskb = alloc_skb(skb_headroom(skb) + skb->len, GFP_ATOMIC);
+	if (!nskb)
+		goto free_buf;
+
+	skb_reserve(nskb, skb_headroom(skb));
+
+	fill_sg_out(sg_out, buf, tls_ctx, nskb, tcp_payload_offset,
+		    payload_len, sync_size, dummy_buf);
+
+	if (tls_enc_records(aead_req, ctx->aead_send, sg_in, sg_out, aad, iv,
+			    rcd_sn, sync_size + payload_len) < 0)
+		goto free_nskb;
+
+	complete_skb(nskb, skb, tcp_payload_offset);
+
+	/* validate_xmit_skb_list assumes that if the skb wasn't segmented
+	 * nskb->prev will point to the skb itself
+	 */
+	nskb->prev = nskb;
+
+free_buf:
+	kfree(buf);
+free_req:
+	kfree(aead_req);
+	return nskb;
+free_nskb:
+	kfree_skb(nskb);
+	nskb = NULL;
+	goto free_buf;
+}
+
+static struct sk_buff *tls_sw_fallback(struct sock *sk, struct sk_buff *skb)
+{
+	int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb);
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
+	int payload_len = skb->len - tcp_payload_offset;
+	struct scatterlist *sg_in, sg_out[3];
+	struct sk_buff *nskb = NULL;
+	int sg_in_max_elements;
+	int resync_sgs = 0;
+	s32 sync_size = 0;
+	u64 rcd_sn;
+
+	/* worst case is:
+	 * MAX_SKB_FRAGS in tls_record_info
+	 * MAX_SKB_FRAGS + 1 in SKB head and frags.
+	 */
+	sg_in_max_elements = 2 * MAX_SKB_FRAGS + 1;
+
+	if (!payload_len)
+		return skb;
+
+	sg_in = kmalloc_array(sg_in_max_elements, sizeof(*sg_in), GFP_ATOMIC);
+	if (!sg_in)
+		goto free_orig;
+
+	sg_init_table(sg_in, sg_in_max_elements);
+	sg_init_table(sg_out, ARRAY_SIZE(sg_out));
+
+	if (fill_sg_in(sg_in, skb, ctx, &rcd_sn, &sync_size, &resync_sgs)) {
+		/* bypass packets before kernel TLS socket option was set */
+		if (sync_size < 0 && payload_len <= -sync_size)
+			nskb = skb_get(skb);
+		goto put_sg;
+	}
+
+	nskb = tls_enc_skb(tls_ctx, sg_out, sg_in, skb, sync_size, rcd_sn);
+
+put_sg:
+	while (resync_sgs)
+		put_page(sg_page(&sg_in[--resync_sgs]));
+	kfree(sg_in);
+free_orig:
+	kfree_skb(skb);
+	return nskb;
+}
+
+struct sk_buff *tls_validate_xmit_skb(struct sock *sk,
+				      struct net_device *dev,
+				      struct sk_buff *skb)
+{
+	if (dev == tls_get_ctx(sk)->netdev)
+		return skb;
+
+	return tls_sw_fallback(sk, skb);
+}
+
+int tls_sw_fallback_init(struct sock *sk,
+			 struct tls_offload_context *offload_ctx,
+			 struct tls_crypto_info *crypto_info)
+{
+	const u8 *key;
+	int rc;
+
+	offload_ctx->aead_send =
+	    crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(offload_ctx->aead_send)) {
+		rc = PTR_ERR(offload_ctx->aead_send);
+		pr_err_ratelimited("crypto_alloc_aead failed rc=%d\n", rc);
+		offload_ctx->aead_send = NULL;
+		goto err_out;
+	}
+
+	key = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->key;
+
+	rc = crypto_aead_setkey(offload_ctx->aead_send, key,
+				TLS_CIPHER_AES_GCM_128_KEY_SIZE);
+	if (rc)
+		goto free_aead;
+
+	rc = crypto_aead_setauthsize(offload_ctx->aead_send,
+				     TLS_CIPHER_AES_GCM_128_TAG_SIZE);
+	if (rc)
+		goto free_aead;
+
+	return 0;
+free_aead:
+	crypto_free_aead(offload_ctx->aead_send);
+err_out:
+	return rc;
+}
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 545bf34ed599..3aafb871a0a8 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -54,6 +54,9 @@ enum {
 enum {
 	TLS_BASE,
 	TLS_SW,
+#ifdef CONFIG_TLS_DEVICE
+	TLS_HW,
+#endif
 	TLS_HW_RECORD,
 	TLS_NUM_CONFIG,
 };
@@ -280,6 +283,15 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
 		tls_sw_free_resources_rx(sk);
 	}
 
+#ifdef CONFIG_TLS_DEVICE
+	if (ctx->tx_conf != TLS_HW) {
+#else
+	{
+#endif
+		kfree(ctx);
+		ctx = NULL;
+	}
+
 skip_tx_cleanup:
 	release_sock(sk);
 	sk_proto_close(sk, timeout);
@@ -442,8 +454,16 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval,
 	}
 
 	if (tx) {
-		rc = tls_set_sw_offload(sk, ctx, 1);
-		conf = TLS_SW;
+#ifdef CONFIG_TLS_DEVICE
+		rc = tls_set_device_offload(sk, ctx);
+		conf = TLS_HW;
+		if (rc) {
+#else
+		{
+#endif
+			rc = tls_set_sw_offload(sk, ctx, 1);
+			conf = TLS_SW;
+		}
 	} else {
 		rc = tls_set_sw_offload(sk, ctx, 0);
 		conf = TLS_SW;
@@ -596,6 +616,16 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
 	prot[TLS_SW][TLS_SW].recvmsg	= tls_sw_recvmsg;
 	prot[TLS_SW][TLS_SW].close	= tls_sk_proto_close;
 
+#ifdef CONFIG_TLS_DEVICE
+	prot[TLS_HW][TLS_BASE] = prot[TLS_BASE][TLS_BASE];
+	prot[TLS_HW][TLS_BASE].sendmsg		= tls_device_sendmsg;
+	prot[TLS_HW][TLS_BASE].sendpage		= tls_device_sendpage;
+
+	prot[TLS_HW][TLS_SW] = prot[TLS_BASE][TLS_SW];
+	prot[TLS_HW][TLS_SW].sendmsg		= tls_device_sendmsg;
+	prot[TLS_HW][TLS_SW].sendpage		= tls_device_sendpage;
+#endif
+
 	prot[TLS_HW_RECORD][TLS_HW_RECORD] = *base;
 	prot[TLS_HW_RECORD][TLS_HW_RECORD].hash		= tls_hw_hash;
 	prot[TLS_HW_RECORD][TLS_HW_RECORD].unhash	= tls_hw_unhash;
@@ -630,7 +660,7 @@ static int tls_init(struct sock *sk)
 	ctx->getsockopt = sk->sk_prot->getsockopt;
 	ctx->sk_proto_close = sk->sk_prot->close;
 
-	/* Build IPv6 TLS whenever the address of tcpv6_prot changes */
+	/* Build IPv6 TLS whenever the address of tcpv6	_prot changes */
 	if (ip_ver == TLSV6 &&
 	    unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) {
 		mutex_lock(&tcpv6_prot_mutex);
@@ -680,6 +710,9 @@ static int __init tls_register(void)
 	tls_sw_proto_ops.poll = tls_sw_poll;
 	tls_sw_proto_ops.splice_read = tls_sw_splice_read;
 
+#ifdef CONFIG_TLS_DEVICE
+	tls_device_init();
+#endif
 	tcp_register_ulp(&tcp_tls_ulp_ops);
 
 	return 0;
@@ -688,6 +721,9 @@ static int __init tls_register(void)
 static void __exit tls_unregister(void)
 {
 	tcp_unregister_ulp(&tcp_tls_ulp_ops);
+#ifdef CONFIG_TLS_DEVICE
+	tls_device_cleanup();
+#endif
 }
 
 module_init(tls_register);
-- 
cgit v1.2.3-59-g8ed1b


From bb9094161b2320e431a5d8a7b9c3dc632bc92ae6 Mon Sep 17 00:00:00 2001
From: Ilya Lesokhin <ilyal@mellanox.com>
Date: Mon, 30 Apr 2018 10:16:17 +0300
Subject: net/mlx5e: Move defines out of ipsec code

The defines are not IPSEC specific.

Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com>
Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h             | 3 +++
 drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h | 3 ---
 drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c     | 5 +----
 drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h       | 2 ++
 4 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 56c748ca2a09..66ecb1c81002 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -55,6 +55,9 @@
 
 struct page_pool;
 
+#define MLX5E_METADATA_ETHER_TYPE (0x8CE4)
+#define MLX5E_METADATA_ETHER_LEN 8
+
 #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v)
 
 #define MLX5E_ETH_HARD_MTU (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h
index 1198fc1eba4c..93bf10e6508c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h
@@ -45,9 +45,6 @@
 #define MLX5E_IPSEC_SADB_RX_BITS 10
 #define MLX5E_IPSEC_ESN_SCOPE_MID 0x80000000L
 
-#define MLX5E_METADATA_ETHER_TYPE (0x8CE4)
-#define MLX5E_METADATA_ETHER_LEN 8
-
 struct mlx5e_priv;
 
 struct mlx5e_ipsec_sw_stats {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
index 0f5da499a223..3c4f1f326e13 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
@@ -43,9 +43,6 @@
 #include "fpga/sdk.h"
 #include "fpga/core.h"
 
-#define SBU_QP_QUEUE_SIZE 8
-#define MLX5_FPGA_IPSEC_CMD_TIMEOUT_MSEC	(60 * 1000)
-
 enum mlx5_fpga_ipsec_cmd_status {
 	MLX5_FPGA_IPSEC_CMD_PENDING,
 	MLX5_FPGA_IPSEC_CMD_SEND_FAIL,
@@ -258,7 +255,7 @@ static int mlx5_fpga_ipsec_cmd_wait(void *ctx)
 {
 	struct mlx5_fpga_ipsec_cmd_context *context = ctx;
 	unsigned long timeout =
-		msecs_to_jiffies(MLX5_FPGA_IPSEC_CMD_TIMEOUT_MSEC);
+		msecs_to_jiffies(MLX5_FPGA_CMD_TIMEOUT_MSEC);
 	int res;
 
 	res = wait_for_completion_timeout(&context->complete, timeout);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h
index baa537e54a49..a0573cc2fc9b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h
@@ -41,6 +41,8 @@
  * DOC: Innova SDK
  * This header defines the in-kernel API for Innova FPGA client drivers.
  */
+#define SBU_QP_QUEUE_SIZE 8
+#define MLX5_FPGA_CMD_TIMEOUT_MSEC (60 * 1000)
 
 enum mlx5_fpga_access_type {
 	MLX5_FPGA_ACCESS_TYPE_I2C = 0x0,
-- 
cgit v1.2.3-59-g8ed1b


From 1ae1732284895498b7119e42323cf12821423e6d Mon Sep 17 00:00:00 2001
From: Ilya Lesokhin <ilyal@mellanox.com>
Date: Mon, 30 Apr 2018 10:16:18 +0300
Subject: net/mlx5: Accel, Add TLS tx offload interface

Add routines for manipulating TLS TX offload contexts.

In Innova TLS, TLS contexts are added or deleted
via a command message over the SBU connection.
The HW then sends a response message over the same connection.

Add implementation for Innova TLS (FPGA-based) hardware.

These routines will be used by the TLS offload support in a later patch

mlx5/accel is a middle acceleration layer to allow mlx5e and other ULPs
to work directly with mlx5_core rather than Innova FPGA or other mlx5
acceleration providers.

In the future, when IPSec/TLS or any other acceleration gets integrated
into ConnectX chip, mlx5/accel layer will provide the integrated
acceleration, rather than the Innova one.

Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com>
Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   4 +-
 .../net/ethernet/mellanox/mlx5/core/accel/tls.c    |  71 +++
 .../net/ethernet/mellanox/mlx5/core/accel/tls.h    |  86 ++++
 .../net/ethernet/mellanox/mlx5/core/fpga/core.h    |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c | 562 +++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h |  68 +++
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |  11 +
 include/linux/mlx5/mlx5_ifc.h                      |  16 -
 include/linux/mlx5/mlx5_ifc_fpga.h                 |  77 +++
 9 files changed, 878 insertions(+), 18 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index c805769d92a9..9989e5265a45 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -8,10 +8,10 @@ mlx5_core-y :=	main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
 		fs_counters.o rl.o lag.o dev.o wq.o lib/gid.o lib/clock.o \
 		diag/fs_tracepoint.o
 
-mlx5_core-$(CONFIG_MLX5_ACCEL) += accel/ipsec.o
+mlx5_core-$(CONFIG_MLX5_ACCEL) += accel/ipsec.o accel/tls.o
 
 mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o fpga/conn.o fpga/sdk.o \
-		fpga/ipsec.o
+		fpga/ipsec.o fpga/tls.o
 
 mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \
 		en_tx.o en_rx.o en_dim.o en_txrx.o en_stats.o vxlan.o \
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c
new file mode 100644
index 000000000000..77ac19f38cbe
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/mlx5/device.h>
+
+#include "accel/tls.h"
+#include "mlx5_core.h"
+#include "fpga/tls.h"
+
+int mlx5_accel_tls_add_tx_flow(struct mlx5_core_dev *mdev, void *flow,
+			       struct tls_crypto_info *crypto_info,
+			       u32 start_offload_tcp_sn, u32 *p_swid)
+{
+	return mlx5_fpga_tls_add_tx_flow(mdev, flow, crypto_info,
+					 start_offload_tcp_sn, p_swid);
+}
+
+void mlx5_accel_tls_del_tx_flow(struct mlx5_core_dev *mdev, u32 swid)
+{
+	mlx5_fpga_tls_del_tx_flow(mdev, swid, GFP_KERNEL);
+}
+
+bool mlx5_accel_is_tls_device(struct mlx5_core_dev *mdev)
+{
+	return mlx5_fpga_is_tls_device(mdev);
+}
+
+u32 mlx5_accel_tls_device_caps(struct mlx5_core_dev *mdev)
+{
+	return mlx5_fpga_tls_device_caps(mdev);
+}
+
+int mlx5_accel_tls_init(struct mlx5_core_dev *mdev)
+{
+	return mlx5_fpga_tls_init(mdev);
+}
+
+void mlx5_accel_tls_cleanup(struct mlx5_core_dev *mdev)
+{
+	mlx5_fpga_tls_cleanup(mdev);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h
new file mode 100644
index 000000000000..6f9c9f446ecc
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __MLX5_ACCEL_TLS_H__
+#define __MLX5_ACCEL_TLS_H__
+
+#include <linux/mlx5/driver.h>
+#include <linux/tls.h>
+
+#ifdef CONFIG_MLX5_ACCEL
+
+enum {
+	MLX5_ACCEL_TLS_TX = BIT(0),
+	MLX5_ACCEL_TLS_RX = BIT(1),
+	MLX5_ACCEL_TLS_V12 = BIT(2),
+	MLX5_ACCEL_TLS_V13 = BIT(3),
+	MLX5_ACCEL_TLS_LRO = BIT(4),
+	MLX5_ACCEL_TLS_IPV6 = BIT(5),
+	MLX5_ACCEL_TLS_AES_GCM128 = BIT(30),
+	MLX5_ACCEL_TLS_AES_GCM256 = BIT(31),
+};
+
+struct mlx5_ifc_tls_flow_bits {
+	u8         src_port[0x10];
+	u8         dst_port[0x10];
+	union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits src_ipv4_src_ipv6;
+	union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits dst_ipv4_dst_ipv6;
+	u8         ipv6[0x1];
+	u8         direction_sx[0x1];
+	u8         reserved_at_2[0x1e];
+};
+
+int mlx5_accel_tls_add_tx_flow(struct mlx5_core_dev *mdev, void *flow,
+			       struct tls_crypto_info *crypto_info,
+			       u32 start_offload_tcp_sn, u32 *p_swid);
+void mlx5_accel_tls_del_tx_flow(struct mlx5_core_dev *mdev, u32 swid);
+bool mlx5_accel_is_tls_device(struct mlx5_core_dev *mdev);
+u32 mlx5_accel_tls_device_caps(struct mlx5_core_dev *mdev);
+int mlx5_accel_tls_init(struct mlx5_core_dev *mdev);
+void mlx5_accel_tls_cleanup(struct mlx5_core_dev *mdev);
+
+#else
+
+static inline int
+mlx5_accel_tls_add_tx_flow(struct mlx5_core_dev *mdev, void *flow,
+			   struct tls_crypto_info *crypto_info,
+			   u32 start_offload_tcp_sn, u32 *p_swid) { return 0; }
+static inline void mlx5_accel_tls_del_tx_flow(struct mlx5_core_dev *mdev, u32 swid) { }
+static inline bool mlx5_accel_is_tls_device(struct mlx5_core_dev *mdev) { return false; }
+static inline u32 mlx5_accel_tls_device_caps(struct mlx5_core_dev *mdev) { return 0; }
+static inline int mlx5_accel_tls_init(struct mlx5_core_dev *mdev) { return 0; }
+static inline void mlx5_accel_tls_cleanup(struct mlx5_core_dev *mdev) { }
+
+#endif
+
+#endif	/* __MLX5_ACCEL_TLS_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.h
index 82405ed84725..3e2355c8df3f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.h
@@ -53,6 +53,7 @@ struct mlx5_fpga_device {
 	} conn_res;
 
 	struct mlx5_fpga_ipsec *ipsec;
+	struct mlx5_fpga_tls *tls;
 };
 
 #define mlx5_fpga_dbg(__adev, format, ...) \
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c
new file mode 100644
index 000000000000..21048013826c
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c
@@ -0,0 +1,562 @@
+/*
+ * Copyright (c) 2018 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/mlx5/device.h>
+#include "fpga/tls.h"
+#include "fpga/cmd.h"
+#include "fpga/sdk.h"
+#include "fpga/core.h"
+#include "accel/tls.h"
+
+struct mlx5_fpga_tls_command_context;
+
+typedef void (*mlx5_fpga_tls_command_complete)
+	(struct mlx5_fpga_conn *conn, struct mlx5_fpga_device *fdev,
+	 struct mlx5_fpga_tls_command_context *ctx,
+	 struct mlx5_fpga_dma_buf *resp);
+
+struct mlx5_fpga_tls_command_context {
+	struct list_head list;
+	/* There is no guarantee on the order between the TX completion
+	 * and the command response.
+	 * The TX completion is going to touch cmd->buf even in
+	 * the case of successful transmission.
+	 * So instead of requiring separate allocations for cmd
+	 * and cmd->buf we've decided to use a reference counter
+	 */
+	refcount_t ref;
+	struct mlx5_fpga_dma_buf buf;
+	mlx5_fpga_tls_command_complete complete;
+};
+
+static void
+mlx5_fpga_tls_put_command_ctx(struct mlx5_fpga_tls_command_context *ctx)
+{
+	if (refcount_dec_and_test(&ctx->ref))
+		kfree(ctx);
+}
+
+static void mlx5_fpga_tls_cmd_complete(struct mlx5_fpga_device *fdev,
+				       struct mlx5_fpga_dma_buf *resp)
+{
+	struct mlx5_fpga_conn *conn = fdev->tls->conn;
+	struct mlx5_fpga_tls_command_context *ctx;
+	struct mlx5_fpga_tls *tls = fdev->tls;
+	unsigned long flags;
+
+	spin_lock_irqsave(&tls->pending_cmds_lock, flags);
+	ctx = list_first_entry(&tls->pending_cmds,
+			       struct mlx5_fpga_tls_command_context, list);
+	list_del(&ctx->list);
+	spin_unlock_irqrestore(&tls->pending_cmds_lock, flags);
+	ctx->complete(conn, fdev, ctx, resp);
+}
+
+static void mlx5_fpga_cmd_send_complete(struct mlx5_fpga_conn *conn,
+					struct mlx5_fpga_device *fdev,
+					struct mlx5_fpga_dma_buf *buf,
+					u8 status)
+{
+	struct mlx5_fpga_tls_command_context *ctx =
+	    container_of(buf, struct mlx5_fpga_tls_command_context, buf);
+
+	mlx5_fpga_tls_put_command_ctx(ctx);
+
+	if (unlikely(status))
+		mlx5_fpga_tls_cmd_complete(fdev, NULL);
+}
+
+static void mlx5_fpga_tls_cmd_send(struct mlx5_fpga_device *fdev,
+				   struct mlx5_fpga_tls_command_context *cmd,
+				   mlx5_fpga_tls_command_complete complete)
+{
+	struct mlx5_fpga_tls *tls = fdev->tls;
+	unsigned long flags;
+	int ret;
+
+	refcount_set(&cmd->ref, 2);
+	cmd->complete = complete;
+	cmd->buf.complete = mlx5_fpga_cmd_send_complete;
+
+	spin_lock_irqsave(&tls->pending_cmds_lock, flags);
+	/* mlx5_fpga_sbu_conn_sendmsg is called under pending_cmds_lock
+	 * to make sure commands are inserted to the tls->pending_cmds list
+	 * and the command QP in the same order.
+	 */
+	ret = mlx5_fpga_sbu_conn_sendmsg(tls->conn, &cmd->buf);
+	if (likely(!ret))
+		list_add_tail(&cmd->list, &tls->pending_cmds);
+	else
+		complete(tls->conn, fdev, cmd, NULL);
+	spin_unlock_irqrestore(&tls->pending_cmds_lock, flags);
+}
+
+/* Start of context identifiers range (inclusive) */
+#define SWID_START	0
+/* End of context identifiers range (exclusive) */
+#define SWID_END	BIT(24)
+
+static int mlx5_fpga_tls_alloc_swid(struct idr *idr, spinlock_t *idr_spinlock,
+				    void *ptr)
+{
+	int ret;
+
+	/* TLS metadata format is 1 byte for syndrome followed
+	 * by 3 bytes of swid (software ID)
+	 * swid must not exceed 3 bytes.
+	 * See tls_rxtx.c:insert_pet() for details
+	 */
+	BUILD_BUG_ON((SWID_END - 1) & 0xFF000000);
+
+	idr_preload(GFP_KERNEL);
+	spin_lock_irq(idr_spinlock);
+	ret = idr_alloc(idr, ptr, SWID_START, SWID_END, GFP_ATOMIC);
+	spin_unlock_irq(idr_spinlock);
+	idr_preload_end();
+
+	return ret;
+}
+
+static void mlx5_fpga_tls_release_swid(struct idr *idr,
+				       spinlock_t *idr_spinlock, u32 swid)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(idr_spinlock, flags);
+	idr_remove(idr, swid);
+	spin_unlock_irqrestore(idr_spinlock, flags);
+}
+
+struct mlx5_teardown_stream_context {
+	struct mlx5_fpga_tls_command_context cmd;
+	u32 swid;
+};
+
+static void
+mlx5_fpga_tls_teardown_completion(struct mlx5_fpga_conn *conn,
+				  struct mlx5_fpga_device *fdev,
+				  struct mlx5_fpga_tls_command_context *cmd,
+				  struct mlx5_fpga_dma_buf *resp)
+{
+	struct mlx5_teardown_stream_context *ctx =
+		    container_of(cmd, struct mlx5_teardown_stream_context, cmd);
+
+	if (resp) {
+		u32 syndrome = MLX5_GET(tls_resp, resp->sg[0].data, syndrome);
+
+		if (syndrome)
+			mlx5_fpga_err(fdev,
+				      "Teardown stream failed with syndrome = %d",
+				      syndrome);
+		else
+			mlx5_fpga_tls_release_swid(&fdev->tls->tx_idr,
+						   &fdev->tls->idr_spinlock,
+						   ctx->swid);
+	}
+	mlx5_fpga_tls_put_command_ctx(cmd);
+}
+
+static void mlx5_fpga_tls_flow_to_cmd(void *flow, void *cmd)
+{
+	memcpy(MLX5_ADDR_OF(tls_cmd, cmd, src_port), flow,
+	       MLX5_BYTE_OFF(tls_flow, ipv6));
+
+	MLX5_SET(tls_cmd, cmd, ipv6, MLX5_GET(tls_flow, flow, ipv6));
+	MLX5_SET(tls_cmd, cmd, direction_sx,
+		 MLX5_GET(tls_flow, flow, direction_sx));
+}
+
+void mlx5_fpga_tls_send_teardown_cmd(struct mlx5_core_dev *mdev, void *flow,
+				     u32 swid, gfp_t flags)
+{
+	struct mlx5_teardown_stream_context *ctx;
+	struct mlx5_fpga_dma_buf *buf;
+	void *cmd;
+
+	ctx = kzalloc(sizeof(*ctx) + MLX5_TLS_COMMAND_SIZE, flags);
+	if (!ctx)
+		return;
+
+	buf = &ctx->cmd.buf;
+	cmd = (ctx + 1);
+	MLX5_SET(tls_cmd, cmd, command_type, CMD_TEARDOWN_STREAM);
+	MLX5_SET(tls_cmd, cmd, swid, swid);
+
+	mlx5_fpga_tls_flow_to_cmd(flow, cmd);
+	kfree(flow);
+
+	buf->sg[0].data = cmd;
+	buf->sg[0].size = MLX5_TLS_COMMAND_SIZE;
+
+	ctx->swid = swid;
+	mlx5_fpga_tls_cmd_send(mdev->fpga, &ctx->cmd,
+			       mlx5_fpga_tls_teardown_completion);
+}
+
+void mlx5_fpga_tls_del_tx_flow(struct mlx5_core_dev *mdev, u32 swid,
+			       gfp_t flags)
+{
+	struct mlx5_fpga_tls *tls = mdev->fpga->tls;
+	void *flow;
+
+	rcu_read_lock();
+	flow = idr_find(&tls->tx_idr, swid);
+	rcu_read_unlock();
+
+	if (!flow) {
+		mlx5_fpga_err(mdev->fpga, "No flow information for swid %u\n",
+			      swid);
+		return;
+	}
+
+	mlx5_fpga_tls_send_teardown_cmd(mdev, flow, swid, flags);
+}
+
+enum mlx5_fpga_setup_stream_status {
+	MLX5_FPGA_CMD_PENDING,
+	MLX5_FPGA_CMD_SEND_FAILED,
+	MLX5_FPGA_CMD_RESPONSE_RECEIVED,
+	MLX5_FPGA_CMD_ABANDONED,
+};
+
+struct mlx5_setup_stream_context {
+	struct mlx5_fpga_tls_command_context cmd;
+	atomic_t status;
+	u32 syndrome;
+	struct completion comp;
+};
+
+static void
+mlx5_fpga_tls_setup_completion(struct mlx5_fpga_conn *conn,
+			       struct mlx5_fpga_device *fdev,
+			       struct mlx5_fpga_tls_command_context *cmd,
+			       struct mlx5_fpga_dma_buf *resp)
+{
+	struct mlx5_setup_stream_context *ctx =
+	    container_of(cmd, struct mlx5_setup_stream_context, cmd);
+	int status = MLX5_FPGA_CMD_SEND_FAILED;
+	void *tls_cmd = ctx + 1;
+
+	/* If we failed to send to command resp == NULL */
+	if (resp) {
+		ctx->syndrome = MLX5_GET(tls_resp, resp->sg[0].data, syndrome);
+		status = MLX5_FPGA_CMD_RESPONSE_RECEIVED;
+	}
+
+	status = atomic_xchg_release(&ctx->status, status);
+	if (likely(status != MLX5_FPGA_CMD_ABANDONED)) {
+		complete(&ctx->comp);
+		return;
+	}
+
+	mlx5_fpga_err(fdev, "Command was abandoned, syndrome = %u\n",
+		      ctx->syndrome);
+
+	if (!ctx->syndrome) {
+		/* The process was killed while waiting for the context to be
+		 * added, and the add completed successfully.
+		 * We need to destroy the HW context, and we can't can't reuse
+		 * the command context because we might not have received
+		 * the tx completion yet.
+		 */
+		mlx5_fpga_tls_del_tx_flow(fdev->mdev,
+					  MLX5_GET(tls_cmd, tls_cmd, swid),
+					  GFP_ATOMIC);
+	}
+
+	mlx5_fpga_tls_put_command_ctx(cmd);
+}
+
+static int mlx5_fpga_tls_setup_stream_cmd(struct mlx5_core_dev *mdev,
+					  struct mlx5_setup_stream_context *ctx)
+{
+	struct mlx5_fpga_dma_buf *buf;
+	void *cmd = ctx + 1;
+	int status, ret = 0;
+
+	buf = &ctx->cmd.buf;
+	buf->sg[0].data = cmd;
+	buf->sg[0].size = MLX5_TLS_COMMAND_SIZE;
+	MLX5_SET(tls_cmd, cmd, command_type, CMD_SETUP_STREAM);
+
+	init_completion(&ctx->comp);
+	atomic_set(&ctx->status, MLX5_FPGA_CMD_PENDING);
+	ctx->syndrome = -1;
+
+	mlx5_fpga_tls_cmd_send(mdev->fpga, &ctx->cmd,
+			       mlx5_fpga_tls_setup_completion);
+	wait_for_completion_killable(&ctx->comp);
+
+	status = atomic_xchg_acquire(&ctx->status, MLX5_FPGA_CMD_ABANDONED);
+	if (unlikely(status == MLX5_FPGA_CMD_PENDING))
+	/* ctx is going to be released in mlx5_fpga_tls_setup_completion */
+		return -EINTR;
+
+	if (unlikely(ctx->syndrome))
+		ret = -ENOMEM;
+
+	mlx5_fpga_tls_put_command_ctx(&ctx->cmd);
+	return ret;
+}
+
+static void mlx5_fpga_tls_hw_qp_recv_cb(void *cb_arg,
+					struct mlx5_fpga_dma_buf *buf)
+{
+	struct mlx5_fpga_device *fdev = (struct mlx5_fpga_device *)cb_arg;
+
+	mlx5_fpga_tls_cmd_complete(fdev, buf);
+}
+
+bool mlx5_fpga_is_tls_device(struct mlx5_core_dev *mdev)
+{
+	if (!mdev->fpga || !MLX5_CAP_GEN(mdev, fpga))
+		return false;
+
+	if (MLX5_CAP_FPGA(mdev, ieee_vendor_id) !=
+	    MLX5_FPGA_CAP_SANDBOX_VENDOR_ID_MLNX)
+		return false;
+
+	if (MLX5_CAP_FPGA(mdev, sandbox_product_id) !=
+	    MLX5_FPGA_CAP_SANDBOX_PRODUCT_ID_TLS)
+		return false;
+
+	if (MLX5_CAP_FPGA(mdev, sandbox_product_version) != 0)
+		return false;
+
+	return true;
+}
+
+static int mlx5_fpga_tls_get_caps(struct mlx5_fpga_device *fdev,
+				  u32 *p_caps)
+{
+	int err, cap_size = MLX5_ST_SZ_BYTES(tls_extended_cap);
+	u32 caps = 0;
+	void *buf;
+
+	buf = kzalloc(cap_size, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	err = mlx5_fpga_get_sbu_caps(fdev, cap_size, buf);
+	if (err)
+		goto out;
+
+	if (MLX5_GET(tls_extended_cap, buf, tx))
+		caps |= MLX5_ACCEL_TLS_TX;
+	if (MLX5_GET(tls_extended_cap, buf, rx))
+		caps |= MLX5_ACCEL_TLS_RX;
+	if (MLX5_GET(tls_extended_cap, buf, tls_v12))
+		caps |= MLX5_ACCEL_TLS_V12;
+	if (MLX5_GET(tls_extended_cap, buf, tls_v13))
+		caps |= MLX5_ACCEL_TLS_V13;
+	if (MLX5_GET(tls_extended_cap, buf, lro))
+		caps |= MLX5_ACCEL_TLS_LRO;
+	if (MLX5_GET(tls_extended_cap, buf, ipv6))
+		caps |= MLX5_ACCEL_TLS_IPV6;
+
+	if (MLX5_GET(tls_extended_cap, buf, aes_gcm_128))
+		caps |= MLX5_ACCEL_TLS_AES_GCM128;
+	if (MLX5_GET(tls_extended_cap, buf, aes_gcm_256))
+		caps |= MLX5_ACCEL_TLS_AES_GCM256;
+
+	*p_caps = caps;
+	err = 0;
+out:
+	kfree(buf);
+	return err;
+}
+
+int mlx5_fpga_tls_init(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_fpga_device *fdev = mdev->fpga;
+	struct mlx5_fpga_conn_attr init_attr = {0};
+	struct mlx5_fpga_conn *conn;
+	struct mlx5_fpga_tls *tls;
+	int err = 0;
+
+	if (!mlx5_fpga_is_tls_device(mdev) || !fdev)
+		return 0;
+
+	tls = kzalloc(sizeof(*tls), GFP_KERNEL);
+	if (!tls)
+		return -ENOMEM;
+
+	err = mlx5_fpga_tls_get_caps(fdev, &tls->caps);
+	if (err)
+		goto error;
+
+	if (!(tls->caps & (MLX5_ACCEL_TLS_TX | MLX5_ACCEL_TLS_V12 |
+				 MLX5_ACCEL_TLS_AES_GCM128))) {
+		err = -ENOTSUPP;
+		goto error;
+	}
+
+	init_attr.rx_size = SBU_QP_QUEUE_SIZE;
+	init_attr.tx_size = SBU_QP_QUEUE_SIZE;
+	init_attr.recv_cb = mlx5_fpga_tls_hw_qp_recv_cb;
+	init_attr.cb_arg = fdev;
+	conn = mlx5_fpga_sbu_conn_create(fdev, &init_attr);
+	if (IS_ERR(conn)) {
+		err = PTR_ERR(conn);
+		mlx5_fpga_err(fdev, "Error creating TLS command connection %d\n",
+			      err);
+		goto error;
+	}
+
+	tls->conn = conn;
+	spin_lock_init(&tls->pending_cmds_lock);
+	INIT_LIST_HEAD(&tls->pending_cmds);
+
+	idr_init(&tls->tx_idr);
+	spin_lock_init(&tls->idr_spinlock);
+	fdev->tls = tls;
+	return 0;
+
+error:
+	kfree(tls);
+	return err;
+}
+
+void mlx5_fpga_tls_cleanup(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_fpga_device *fdev = mdev->fpga;
+
+	if (!fdev || !fdev->tls)
+		return;
+
+	mlx5_fpga_sbu_conn_destroy(fdev->tls->conn);
+	kfree(fdev->tls);
+	fdev->tls = NULL;
+}
+
+static void mlx5_fpga_tls_set_aes_gcm128_ctx(void *cmd,
+					     struct tls_crypto_info *info,
+					     __be64 *rcd_sn)
+{
+	struct tls12_crypto_info_aes_gcm_128 *crypto_info =
+	    (struct tls12_crypto_info_aes_gcm_128 *)info;
+
+	memcpy(MLX5_ADDR_OF(tls_cmd, cmd, tls_rcd_sn), crypto_info->rec_seq,
+	       TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE);
+
+	memcpy(MLX5_ADDR_OF(tls_cmd, cmd, tls_implicit_iv),
+	       crypto_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
+	memcpy(MLX5_ADDR_OF(tls_cmd, cmd, encryption_key),
+	       crypto_info->key, TLS_CIPHER_AES_GCM_128_KEY_SIZE);
+
+	/* in AES-GCM 128 we need to write the key twice */
+	memcpy(MLX5_ADDR_OF(tls_cmd, cmd, encryption_key) +
+		   TLS_CIPHER_AES_GCM_128_KEY_SIZE,
+	       crypto_info->key, TLS_CIPHER_AES_GCM_128_KEY_SIZE);
+
+	MLX5_SET(tls_cmd, cmd, alg, MLX5_TLS_ALG_AES_GCM_128);
+}
+
+static int mlx5_fpga_tls_set_key_material(void *cmd, u32 caps,
+					  struct tls_crypto_info *crypto_info)
+{
+	__be64 rcd_sn;
+
+	switch (crypto_info->cipher_type) {
+	case TLS_CIPHER_AES_GCM_128:
+		if (!(caps & MLX5_ACCEL_TLS_AES_GCM128))
+			return -EINVAL;
+		mlx5_fpga_tls_set_aes_gcm128_ctx(cmd, crypto_info, &rcd_sn);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int mlx5_fpga_tls_add_flow(struct mlx5_core_dev *mdev, void *flow,
+				  struct tls_crypto_info *crypto_info, u32 swid,
+				  u32 tcp_sn)
+{
+	u32 caps = mlx5_fpga_tls_device_caps(mdev);
+	struct mlx5_setup_stream_context *ctx;
+	int ret = -ENOMEM;
+	size_t cmd_size;
+	void *cmd;
+
+	cmd_size = MLX5_TLS_COMMAND_SIZE + sizeof(*ctx);
+	ctx = kzalloc(cmd_size, GFP_KERNEL);
+	if (!ctx)
+		goto out;
+
+	cmd = ctx + 1;
+	ret = mlx5_fpga_tls_set_key_material(cmd, caps, crypto_info);
+	if (ret)
+		goto free_ctx;
+
+	mlx5_fpga_tls_flow_to_cmd(flow, cmd);
+
+	MLX5_SET(tls_cmd, cmd, swid, swid);
+	MLX5_SET(tls_cmd, cmd, tcp_sn, tcp_sn);
+
+	return mlx5_fpga_tls_setup_stream_cmd(mdev, ctx);
+
+free_ctx:
+	kfree(ctx);
+out:
+	return ret;
+}
+
+int mlx5_fpga_tls_add_tx_flow(struct mlx5_core_dev *mdev, void *flow,
+			      struct tls_crypto_info *crypto_info,
+			      u32 start_offload_tcp_sn, u32 *p_swid)
+{
+	struct mlx5_fpga_tls *tls = mdev->fpga->tls;
+	int ret = -ENOMEM;
+	u32 swid;
+
+	ret = mlx5_fpga_tls_alloc_swid(&tls->tx_idr, &tls->idr_spinlock, flow);
+	if (ret < 0)
+		return ret;
+
+	swid = ret;
+	MLX5_SET(tls_flow, flow, direction_sx, 1);
+
+	ret = mlx5_fpga_tls_add_flow(mdev, flow, crypto_info, swid,
+				     start_offload_tcp_sn);
+	if (ret && ret != -EINTR)
+		goto free_swid;
+
+	*p_swid = swid;
+	return 0;
+free_swid:
+	mlx5_fpga_tls_release_swid(&tls->tx_idr, &tls->idr_spinlock, swid);
+
+	return ret;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h
new file mode 100644
index 000000000000..800a214e4e49
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2018 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __MLX5_FPGA_TLS_H__
+#define __MLX5_FPGA_TLS_H__
+
+#include <linux/mlx5/driver.h>
+
+#include <net/tls.h>
+#include "fpga/core.h"
+
+struct mlx5_fpga_tls {
+	struct list_head pending_cmds;
+	spinlock_t pending_cmds_lock; /* Protects pending_cmds */
+	u32 caps;
+	struct mlx5_fpga_conn *conn;
+
+	struct idr tx_idr;
+	spinlock_t idr_spinlock; /* protects the IDR */
+};
+
+int mlx5_fpga_tls_add_tx_flow(struct mlx5_core_dev *mdev, void *flow,
+			      struct tls_crypto_info *crypto_info,
+			      u32 start_offload_tcp_sn, u32 *p_swid);
+
+void mlx5_fpga_tls_del_tx_flow(struct mlx5_core_dev *mdev, u32 swid,
+			       gfp_t flags);
+
+bool mlx5_fpga_is_tls_device(struct mlx5_core_dev *mdev);
+int mlx5_fpga_tls_init(struct mlx5_core_dev *mdev);
+void mlx5_fpga_tls_cleanup(struct mlx5_core_dev *mdev);
+
+static inline u32 mlx5_fpga_tls_device_caps(struct mlx5_core_dev *mdev)
+{
+	return mdev->fpga->tls->caps;
+}
+
+#endif /* __MLX5_FPGA_TLS_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 63a8ea31601c..b865597630ba 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -60,6 +60,7 @@
 #include "fpga/core.h"
 #include "fpga/ipsec.h"
 #include "accel/ipsec.h"
+#include "accel/tls.h"
 #include "lib/clock.h"
 
 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
@@ -1190,6 +1191,12 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 		goto err_ipsec_start;
 	}
 
+	err = mlx5_accel_tls_init(dev);
+	if (err) {
+		dev_err(&pdev->dev, "TLS device start failed %d\n", err);
+		goto err_tls_start;
+	}
+
 	err = mlx5_init_fs(dev);
 	if (err) {
 		dev_err(&pdev->dev, "Failed to init flow steering\n");
@@ -1231,6 +1238,9 @@ err_sriov:
 	mlx5_cleanup_fs(dev);
 
 err_fs:
+	mlx5_accel_tls_cleanup(dev);
+
+err_tls_start:
 	mlx5_accel_ipsec_cleanup(dev);
 
 err_ipsec_start:
@@ -1306,6 +1316,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 	mlx5_sriov_detach(dev);
 	mlx5_cleanup_fs(dev);
 	mlx5_accel_ipsec_cleanup(dev);
+	mlx5_accel_tls_cleanup(dev);
 	mlx5_fpga_device_stop(dev);
 	mlx5_irq_clear_affinity_hints(dev);
 	free_comp_eqs(dev);
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 1aad455538f4..b8918a1da11f 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -356,22 +356,6 @@ struct mlx5_ifc_odp_per_transport_service_cap_bits {
 	u8         reserved_at_6[0x1a];
 };
 
-struct mlx5_ifc_ipv4_layout_bits {
-	u8         reserved_at_0[0x60];
-
-	u8         ipv4[0x20];
-};
-
-struct mlx5_ifc_ipv6_layout_bits {
-	u8         ipv6[16][0x8];
-};
-
-union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits {
-	struct mlx5_ifc_ipv6_layout_bits ipv6_layout;
-	struct mlx5_ifc_ipv4_layout_bits ipv4_layout;
-	u8         reserved_at_0[0x80];
-};
-
 struct mlx5_ifc_fte_match_set_lyr_2_4_bits {
 	u8         smac_47_16[0x20];
 
diff --git a/include/linux/mlx5/mlx5_ifc_fpga.h b/include/linux/mlx5/mlx5_ifc_fpga.h
index ec052491ba3d..193091537cb6 100644
--- a/include/linux/mlx5/mlx5_ifc_fpga.h
+++ b/include/linux/mlx5/mlx5_ifc_fpga.h
@@ -32,12 +32,29 @@
 #ifndef MLX5_IFC_FPGA_H
 #define MLX5_IFC_FPGA_H
 
+struct mlx5_ifc_ipv4_layout_bits {
+	u8         reserved_at_0[0x60];
+
+	u8         ipv4[0x20];
+};
+
+struct mlx5_ifc_ipv6_layout_bits {
+	u8         ipv6[16][0x8];
+};
+
+union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits {
+	struct mlx5_ifc_ipv6_layout_bits ipv6_layout;
+	struct mlx5_ifc_ipv4_layout_bits ipv4_layout;
+	u8         reserved_at_0[0x80];
+};
+
 enum {
 	MLX5_FPGA_CAP_SANDBOX_VENDOR_ID_MLNX = 0x2c9,
 };
 
 enum {
 	MLX5_FPGA_CAP_SANDBOX_PRODUCT_ID_IPSEC    = 0x2,
+	MLX5_FPGA_CAP_SANDBOX_PRODUCT_ID_TLS      = 0x3,
 };
 
 struct mlx5_ifc_fpga_shell_caps_bits {
@@ -370,6 +387,27 @@ struct mlx5_ifc_fpga_destroy_qp_out_bits {
 	u8         reserved_at_40[0x40];
 };
 
+struct mlx5_ifc_tls_extended_cap_bits {
+	u8         aes_gcm_128[0x1];
+	u8         aes_gcm_256[0x1];
+	u8         reserved_at_2[0x1e];
+	u8         reserved_at_20[0x20];
+	u8         context_capacity_total[0x20];
+	u8         context_capacity_rx[0x20];
+	u8         context_capacity_tx[0x20];
+	u8         reserved_at_a0[0x10];
+	u8         tls_counter_size[0x10];
+	u8         tls_counters_addr_low[0x20];
+	u8         tls_counters_addr_high[0x20];
+	u8         rx[0x1];
+	u8         tx[0x1];
+	u8         tls_v12[0x1];
+	u8         tls_v13[0x1];
+	u8         lro[0x1];
+	u8         ipv6[0x1];
+	u8         reserved_at_106[0x1a];
+};
+
 struct mlx5_ifc_ipsec_extended_cap_bits {
 	u8         encapsulation[0x20];
 
@@ -519,4 +557,43 @@ struct mlx5_ifc_fpga_ipsec_sa {
 	__be16 reserved2;
 } __packed;
 
+enum fpga_tls_cmds {
+	CMD_SETUP_STREAM		= 0x1001,
+	CMD_TEARDOWN_STREAM		= 0x1002,
+};
+
+#define MLX5_TLS_1_2 (0)
+
+#define MLX5_TLS_ALG_AES_GCM_128 (0)
+#define MLX5_TLS_ALG_AES_GCM_256 (1)
+
+struct mlx5_ifc_tls_cmd_bits {
+	u8         command_type[0x20];
+	u8         ipv6[0x1];
+	u8         direction_sx[0x1];
+	u8         tls_version[0x2];
+	u8         reserved[0x1c];
+	u8         swid[0x20];
+	u8         src_port[0x10];
+	u8         dst_port[0x10];
+	union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits src_ipv4_src_ipv6;
+	union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits dst_ipv4_dst_ipv6;
+	u8         tls_rcd_sn[0x40];
+	u8         tcp_sn[0x20];
+	u8         tls_implicit_iv[0x20];
+	u8         tls_xor_iv[0x40];
+	u8         encryption_key[0x100];
+	u8         alg[4];
+	u8         reserved2[0x1c];
+	u8         reserved3[0x4a0];
+};
+
+struct mlx5_ifc_tls_resp_bits {
+	u8         syndrome[0x20];
+	u8         stream_id[0x20];
+	u8         reserverd[0x40];
+};
+
+#define MLX5_TLS_COMMAND_SIZE (0x100)
+
 #endif /* MLX5_IFC_FPGA_H */
-- 
cgit v1.2.3-59-g8ed1b


From c83294b9efa51e19b582a5962e1366196f684dba Mon Sep 17 00:00:00 2001
From: Ilya Lesokhin <ilyal@mellanox.com>
Date: Mon, 30 Apr 2018 10:16:19 +0300
Subject: net/mlx5e: TLS, Add Innova TLS TX support

Add NETIF_F_HW_TLS_TX capability and expose tlsdev_ops to work with the
TLS generic NIC offload infrastructure.
The NETIF_F_HW_TLS_TX capability will be added in the next patch.

Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com>
Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/Kconfig    |  11 ++
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +
 .../net/ethernet/mellanox/mlx5/core/en_accel/tls.c | 173 +++++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/en_accel/tls.h |  65 ++++++++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   3 +
 5 files changed, 254 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
index 12257034131e..ee6684779d11 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -86,3 +86,14 @@ config MLX5_EN_IPSEC
 	  Build support for IPsec cryptography-offload accelaration in the NIC.
 	  Note: Support for hardware with this capability needs to be selected
 	  for this option to become available.
+
+config MLX5_EN_TLS
+	bool "TLS cryptography-offload accelaration"
+	depends on MLX5_CORE_EN
+	depends on TLS_DEVICE
+	depends on MLX5_ACCEL
+	default n
+	---help---
+	  Build support for TLS cryptography-offload accelaration in the NIC.
+	  Note: Support for hardware with this capability needs to be selected
+	  for this option to become available.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 9989e5265a45..50872ed30c0b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -28,4 +28,6 @@ mlx5_core-$(CONFIG_MLX5_CORE_IPOIB) += ipoib/ipoib.o ipoib/ethtool.o ipoib/ipoib
 mlx5_core-$(CONFIG_MLX5_EN_IPSEC) += en_accel/ipsec.o en_accel/ipsec_rxtx.o \
 		en_accel/ipsec_stats.o
 
+mlx5_core-$(CONFIG_MLX5_EN_TLS) +=  en_accel/tls.o
+
 CFLAGS_tracepoint.o := -I$(src)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c
new file mode 100644
index 000000000000..38d88108a55a
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2018 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/netdevice.h>
+#include <net/ipv6.h>
+#include "en_accel/tls.h"
+#include "accel/tls.h"
+
+static void mlx5e_tls_set_ipv4_flow(void *flow, struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+
+	MLX5_SET(tls_flow, flow, ipv6, 0);
+	memcpy(MLX5_ADDR_OF(tls_flow, flow, dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
+	       &inet->inet_daddr, MLX5_FLD_SZ_BYTES(ipv4_layout, ipv4));
+	memcpy(MLX5_ADDR_OF(tls_flow, flow, src_ipv4_src_ipv6.ipv4_layout.ipv4),
+	       &inet->inet_rcv_saddr, MLX5_FLD_SZ_BYTES(ipv4_layout, ipv4));
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void mlx5e_tls_set_ipv6_flow(void *flow, struct sock *sk)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+
+	MLX5_SET(tls_flow, flow, ipv6, 1);
+	memcpy(MLX5_ADDR_OF(tls_flow, flow, dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+	       &sk->sk_v6_daddr, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6));
+	memcpy(MLX5_ADDR_OF(tls_flow, flow, src_ipv4_src_ipv6.ipv6_layout.ipv6),
+	       &np->saddr, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6));
+}
+#endif
+
+static void mlx5e_tls_set_flow_tcp_ports(void *flow, struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+
+	memcpy(MLX5_ADDR_OF(tls_flow, flow, src_port), &inet->inet_sport,
+	       MLX5_FLD_SZ_BYTES(tls_flow, src_port));
+	memcpy(MLX5_ADDR_OF(tls_flow, flow, dst_port), &inet->inet_dport,
+	       MLX5_FLD_SZ_BYTES(tls_flow, dst_port));
+}
+
+static int mlx5e_tls_set_flow(void *flow, struct sock *sk, u32 caps)
+{
+	switch (sk->sk_family) {
+	case AF_INET:
+		mlx5e_tls_set_ipv4_flow(flow, sk);
+		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		if (!sk->sk_ipv6only &&
+		    ipv6_addr_type(&sk->sk_v6_daddr) == IPV6_ADDR_MAPPED) {
+			mlx5e_tls_set_ipv4_flow(flow, sk);
+			break;
+		}
+		if (!(caps & MLX5_ACCEL_TLS_IPV6))
+			goto error_out;
+
+		mlx5e_tls_set_ipv6_flow(flow, sk);
+		break;
+#endif
+	default:
+		goto error_out;
+	}
+
+	mlx5e_tls_set_flow_tcp_ports(flow, sk);
+	return 0;
+error_out:
+	return -EINVAL;
+}
+
+static int mlx5e_tls_add(struct net_device *netdev, struct sock *sk,
+			 enum tls_offload_ctx_dir direction,
+			 struct tls_crypto_info *crypto_info,
+			 u32 start_offload_tcp_sn)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u32 caps = mlx5_accel_tls_device_caps(mdev);
+	int ret = -ENOMEM;
+	void *flow;
+
+	if (direction != TLS_OFFLOAD_CTX_DIR_TX)
+		return -EINVAL;
+
+	flow = kzalloc(MLX5_ST_SZ_BYTES(tls_flow), GFP_KERNEL);
+	if (!flow)
+		return ret;
+
+	ret = mlx5e_tls_set_flow(flow, sk, caps);
+	if (ret)
+		goto free_flow;
+
+	if (direction == TLS_OFFLOAD_CTX_DIR_TX) {
+		struct mlx5e_tls_offload_context *tx_ctx =
+		    mlx5e_get_tls_tx_context(tls_ctx);
+		u32 swid;
+
+		ret = mlx5_accel_tls_add_tx_flow(mdev, flow, crypto_info,
+						 start_offload_tcp_sn, &swid);
+		if (ret < 0)
+			goto free_flow;
+
+		tx_ctx->swid = htonl(swid);
+		tx_ctx->expected_seq = start_offload_tcp_sn;
+	}
+
+	return 0;
+free_flow:
+	kfree(flow);
+	return ret;
+}
+
+static void mlx5e_tls_del(struct net_device *netdev,
+			  struct tls_context *tls_ctx,
+			  enum tls_offload_ctx_dir direction)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+
+	if (direction == TLS_OFFLOAD_CTX_DIR_TX) {
+		u32 swid = ntohl(mlx5e_get_tls_tx_context(tls_ctx)->swid);
+
+		mlx5_accel_tls_del_tx_flow(priv->mdev, swid);
+	} else {
+		netdev_err(netdev, "unsupported direction %d\n", direction);
+	}
+}
+
+static const struct tlsdev_ops mlx5e_tls_ops = {
+	.tls_dev_add = mlx5e_tls_add,
+	.tls_dev_del = mlx5e_tls_del,
+};
+
+void mlx5e_tls_build_netdev(struct mlx5e_priv *priv)
+{
+	struct net_device *netdev = priv->netdev;
+
+	if (!mlx5_accel_is_tls_device(priv->mdev))
+		return;
+
+	netdev->tlsdev_ops = &mlx5e_tls_ops;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h
new file mode 100644
index 000000000000..f7216b9b98e2
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2018 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#ifndef __MLX5E_TLS_H__
+#define __MLX5E_TLS_H__
+
+#ifdef CONFIG_MLX5_EN_TLS
+
+#include <net/tls.h>
+#include "en.h"
+
+struct mlx5e_tls_offload_context {
+	struct tls_offload_context base;
+	u32 expected_seq;
+	__be32 swid;
+};
+
+static inline struct mlx5e_tls_offload_context *
+mlx5e_get_tls_tx_context(struct tls_context *tls_ctx)
+{
+	BUILD_BUG_ON(sizeof(struct mlx5e_tls_offload_context) >
+		     TLS_OFFLOAD_CONTEXT_SIZE);
+	return container_of(tls_offload_ctx(tls_ctx),
+			    struct mlx5e_tls_offload_context,
+			    base);
+}
+
+void mlx5e_tls_build_netdev(struct mlx5e_priv *priv);
+
+#else
+
+static inline void mlx5e_tls_build_netdev(struct mlx5e_priv *priv) { }
+
+#endif
+
+#endif /* __MLX5E_TLS_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index f1fe490ed794..0450c8c9b447 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -42,7 +42,9 @@
 #include "en_rep.h"
 #include "en_accel/ipsec.h"
 #include "en_accel/ipsec_rxtx.h"
+#include "en_accel/tls.h"
 #include "accel/ipsec.h"
+#include "accel/tls.h"
 #include "vxlan.h"
 
 struct mlx5e_rq_param {
@@ -4376,6 +4378,7 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
 #endif
 
 	mlx5e_ipsec_build_netdev(priv);
+	mlx5e_tls_build_netdev(priv);
 }
 
 static void mlx5e_create_q_counters(struct mlx5e_priv *priv)
-- 
cgit v1.2.3-59-g8ed1b


From bf23974104fa7aa2f3cbd2f2295f866a71875efd Mon Sep 17 00:00:00 2001
From: Ilya Lesokhin <ilyal@mellanox.com>
Date: Mon, 30 Apr 2018 10:16:20 +0300
Subject: net/mlx5e: TLS, Add Innova TLS TX offload data path

Implement the TLS tx offload data path according to the
requirements of the TLS generic NIC offload infrastructure.

Special metadata ethertype is used to pass information to
the hardware.

Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com>
Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  15 ++
 .../mellanox/mlx5/core/en_accel/en_accel.h         |  72 ++++++
 .../net/ethernet/mellanox/mlx5/core/en_accel/tls.c |   2 +
 .../mellanox/mlx5/core/en_accel/tls_rxtx.c         | 272 +++++++++++++++++++++
 .../mellanox/mlx5/core/en_accel/tls_rxtx.h         |  50 ++++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   2 +
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.c |  10 +
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h |   9 +
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c    |  37 +--
 10 files changed, 455 insertions(+), 16 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 50872ed30c0b..ec785f589666 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -28,6 +28,6 @@ mlx5_core-$(CONFIG_MLX5_CORE_IPOIB) += ipoib/ipoib.o ipoib/ethtool.o ipoib/ipoib
 mlx5_core-$(CONFIG_MLX5_EN_IPSEC) += en_accel/ipsec.o en_accel/ipsec_rxtx.o \
 		en_accel/ipsec_stats.o
 
-mlx5_core-$(CONFIG_MLX5_EN_TLS) +=  en_accel/tls.o
+mlx5_core-$(CONFIG_MLX5_EN_TLS) +=  en_accel/tls.o en_accel/tls_rxtx.o
 
 CFLAGS_tracepoint.o := -I$(src)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 66ecb1c81002..ca5b8523bc95 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -335,6 +335,7 @@ enum {
 	MLX5E_SQ_STATE_RECOVERING,
 	MLX5E_SQ_STATE_IPSEC,
 	MLX5E_SQ_STATE_AM,
+	MLX5E_SQ_STATE_TLS,
 };
 
 struct mlx5e_sq_wqe_info {
@@ -830,6 +831,8 @@ void mlx5e_build_ptys2ethtool_map(void);
 u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb,
 		       void *accel_priv, select_queue_fallback_t fallback);
 netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev);
+netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
+			  struct mlx5e_tx_wqe *wqe, u16 pi);
 
 void mlx5e_completion_event(struct mlx5_core_cq *mcq);
 void mlx5e_cq_error_event(struct mlx5_core_cq *mcq, enum mlx5_event event);
@@ -945,6 +948,18 @@ static inline bool mlx5e_tunnel_inner_ft_supported(struct mlx5_core_dev *mdev)
 		MLX5_CAP_FLOWTABLE_NIC_RX(mdev, ft_field_support.inner_ip_version));
 }
 
+static inline void mlx5e_sq_fetch_wqe(struct mlx5e_txqsq *sq,
+				      struct mlx5e_tx_wqe **wqe,
+				      u16 *pi)
+{
+	struct mlx5_wq_cyc *wq;
+
+	wq = &sq->wq;
+	*pi = sq->pc & wq->sz_m1;
+	*wqe = mlx5_wq_cyc_get_wqe(wq, *pi);
+	memset(*wqe, 0, sizeof(**wqe));
+}
+
 static inline
 struct mlx5e_tx_wqe *mlx5e_post_nop(struct mlx5_wq_cyc *wq, u32 sqn, u16 *pc)
 {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h
new file mode 100644
index 000000000000..68fcb40a2847
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2018 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __MLX5E_EN_ACCEL_H__
+#define __MLX5E_EN_ACCEL_H__
+
+#ifdef CONFIG_MLX5_ACCEL
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include "en_accel/ipsec_rxtx.h"
+#include "en_accel/tls_rxtx.h"
+#include "en.h"
+
+static inline struct sk_buff *mlx5e_accel_handle_tx(struct sk_buff *skb,
+						    struct mlx5e_txqsq *sq,
+						    struct net_device *dev,
+						    struct mlx5e_tx_wqe **wqe,
+						    u16 *pi)
+{
+#ifdef CONFIG_MLX5_EN_TLS
+	if (sq->state & BIT(MLX5E_SQ_STATE_TLS)) {
+		skb = mlx5e_tls_handle_tx_skb(dev, sq, skb, wqe, pi);
+		if (unlikely(!skb))
+			return NULL;
+	}
+#endif
+
+#ifdef CONFIG_MLX5_EN_IPSEC
+	if (sq->state & BIT(MLX5E_SQ_STATE_IPSEC)) {
+		skb = mlx5e_ipsec_handle_tx_skb(dev, *wqe, skb);
+		if (unlikely(!skb))
+			return NULL;
+	}
+#endif
+
+	return skb;
+}
+
+#endif /* CONFIG_MLX5_ACCEL */
+
+#endif /* __MLX5E_EN_ACCEL_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c
index 38d88108a55a..aa6981c98bdc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c
@@ -169,5 +169,7 @@ void mlx5e_tls_build_netdev(struct mlx5e_priv *priv)
 	if (!mlx5_accel_is_tls_device(priv->mdev))
 		return;
 
+	netdev->features |= NETIF_F_HW_TLS_TX;
+	netdev->hw_features |= NETIF_F_HW_TLS_TX;
 	netdev->tlsdev_ops = &mlx5e_tls_ops;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c
new file mode 100644
index 000000000000..49e8d455ebc3
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2018 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include "en_accel/tls.h"
+#include "en_accel/tls_rxtx.h"
+
+#define SYNDROME_OFFLOAD_REQUIRED 32
+#define SYNDROME_SYNC 33
+
+struct sync_info {
+	u64 rcd_sn;
+	s32 sync_len;
+	int nr_frags;
+	skb_frag_t frags[MAX_SKB_FRAGS];
+};
+
+struct mlx5e_tls_metadata {
+	/* One byte of syndrome followed by 3 bytes of swid */
+	__be32 syndrome_swid;
+	__be16 first_seq;
+	/* packet type ID field	*/
+	__be16 ethertype;
+} __packed;
+
+static int mlx5e_tls_add_metadata(struct sk_buff *skb, __be32 swid)
+{
+	struct mlx5e_tls_metadata *pet;
+	struct ethhdr *eth;
+
+	if (skb_cow_head(skb, sizeof(struct mlx5e_tls_metadata)))
+		return -ENOMEM;
+
+	eth = (struct ethhdr *)skb_push(skb, sizeof(struct mlx5e_tls_metadata));
+	skb->mac_header -= sizeof(struct mlx5e_tls_metadata);
+	pet = (struct mlx5e_tls_metadata *)(eth + 1);
+
+	memmove(skb->data, skb->data + sizeof(struct mlx5e_tls_metadata),
+		2 * ETH_ALEN);
+
+	eth->h_proto = cpu_to_be16(MLX5E_METADATA_ETHER_TYPE);
+	pet->syndrome_swid = htonl(SYNDROME_OFFLOAD_REQUIRED << 24) | swid;
+
+	return 0;
+}
+
+static int mlx5e_tls_get_sync_data(struct mlx5e_tls_offload_context *context,
+				   u32 tcp_seq, struct sync_info *info)
+{
+	int remaining, i = 0, ret = -EINVAL;
+	struct tls_record_info *record;
+	unsigned long flags;
+	s32 sync_size;
+
+	spin_lock_irqsave(&context->base.lock, flags);
+	record = tls_get_record(&context->base, tcp_seq, &info->rcd_sn);
+
+	if (unlikely(!record))
+		goto out;
+
+	sync_size = tcp_seq - tls_record_start_seq(record);
+	info->sync_len = sync_size;
+	if (unlikely(sync_size < 0)) {
+		if (tls_record_is_start_marker(record))
+			goto done;
+
+		goto out;
+	}
+
+	remaining = sync_size;
+	while (remaining > 0) {
+		info->frags[i] = record->frags[i];
+		__skb_frag_ref(&info->frags[i]);
+		remaining -= skb_frag_size(&info->frags[i]);
+
+		if (remaining < 0)
+			skb_frag_size_add(&info->frags[i], remaining);
+
+		i++;
+	}
+	info->nr_frags = i;
+done:
+	ret = 0;
+out:
+	spin_unlock_irqrestore(&context->base.lock, flags);
+	return ret;
+}
+
+static void mlx5e_tls_complete_sync_skb(struct sk_buff *skb,
+					struct sk_buff *nskb, u32 tcp_seq,
+					int headln, __be64 rcd_sn)
+{
+	struct mlx5e_tls_metadata *pet;
+	u8 syndrome = SYNDROME_SYNC;
+	struct iphdr *iph;
+	struct tcphdr *th;
+	int data_len, mss;
+
+	nskb->dev = skb->dev;
+	skb_reset_mac_header(nskb);
+	skb_set_network_header(nskb, skb_network_offset(skb));
+	skb_set_transport_header(nskb, skb_transport_offset(skb));
+	memcpy(nskb->data, skb->data, headln);
+	memcpy(nskb->data + headln, &rcd_sn, sizeof(rcd_sn));
+
+	iph = ip_hdr(nskb);
+	iph->tot_len = htons(nskb->len - skb_network_offset(nskb));
+	th = tcp_hdr(nskb);
+	data_len = nskb->len - headln;
+	tcp_seq -= data_len;
+	th->seq = htonl(tcp_seq);
+
+	mss = nskb->dev->mtu - (headln - skb_network_offset(nskb));
+	skb_shinfo(nskb)->gso_size = 0;
+	if (data_len > mss) {
+		skb_shinfo(nskb)->gso_size = mss;
+		skb_shinfo(nskb)->gso_segs = DIV_ROUND_UP(data_len, mss);
+	}
+	skb_shinfo(nskb)->gso_type = skb_shinfo(skb)->gso_type;
+
+	pet = (struct mlx5e_tls_metadata *)(nskb->data + sizeof(struct ethhdr));
+	memcpy(pet, &syndrome, sizeof(syndrome));
+	pet->first_seq = htons(tcp_seq);
+
+	/* MLX5 devices don't care about the checksum partial start, offset
+	 * and pseudo header
+	 */
+	nskb->ip_summed = CHECKSUM_PARTIAL;
+
+	nskb->xmit_more = 1;
+	nskb->queue_mapping = skb->queue_mapping;
+}
+
+static struct sk_buff *
+mlx5e_tls_handle_ooo(struct mlx5e_tls_offload_context *context,
+		     struct mlx5e_txqsq *sq, struct sk_buff *skb,
+		     struct mlx5e_tx_wqe **wqe,
+		     u16 *pi)
+{
+	u32 tcp_seq = ntohl(tcp_hdr(skb)->seq);
+	struct sync_info info;
+	struct sk_buff *nskb;
+	int linear_len = 0;
+	int headln;
+	int i;
+
+	sq->stats.tls_ooo++;
+
+	if (mlx5e_tls_get_sync_data(context, tcp_seq, &info))
+		/* We might get here if a retransmission reaches the driver
+		 * after the relevant record is acked.
+		 * It should be safe to drop the packet in this case
+		 */
+		goto err_out;
+
+	if (unlikely(info.sync_len < 0)) {
+		u32 payload;
+
+		headln = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		payload = skb->len - headln;
+		if (likely(payload <= -info.sync_len))
+			/* SKB payload doesn't require offload
+			 */
+			return skb;
+
+		netdev_err(skb->dev,
+			   "Can't offload from the middle of an SKB [seq: %X, offload_seq: %X, end_seq: %X]\n",
+			   tcp_seq, tcp_seq + payload + info.sync_len,
+			   tcp_seq + payload);
+		goto err_out;
+	}
+
+	if (unlikely(mlx5e_tls_add_metadata(skb, context->swid)))
+		goto err_out;
+
+	headln = skb_transport_offset(skb) + tcp_hdrlen(skb);
+	linear_len += headln + sizeof(info.rcd_sn);
+	nskb = alloc_skb(linear_len, GFP_ATOMIC);
+	if (unlikely(!nskb))
+		goto err_out;
+
+	context->expected_seq = tcp_seq + skb->len - headln;
+	skb_put(nskb, linear_len);
+	for (i = 0; i < info.nr_frags; i++)
+		skb_shinfo(nskb)->frags[i] = info.frags[i];
+
+	skb_shinfo(nskb)->nr_frags = info.nr_frags;
+	nskb->data_len = info.sync_len;
+	nskb->len += info.sync_len;
+	sq->stats.tls_resync_bytes += nskb->len;
+	mlx5e_tls_complete_sync_skb(skb, nskb, tcp_seq, headln,
+				    cpu_to_be64(info.rcd_sn));
+	mlx5e_sq_xmit(sq, nskb, *wqe, *pi);
+	mlx5e_sq_fetch_wqe(sq, wqe, pi);
+	return skb;
+
+err_out:
+	dev_kfree_skb_any(skb);
+	return NULL;
+}
+
+struct sk_buff *mlx5e_tls_handle_tx_skb(struct net_device *netdev,
+					struct mlx5e_txqsq *sq,
+					struct sk_buff *skb,
+					struct mlx5e_tx_wqe **wqe,
+					u16 *pi)
+{
+	struct mlx5e_tls_offload_context *context;
+	struct tls_context *tls_ctx;
+	u32 expected_seq;
+	int datalen;
+	u32 skb_seq;
+
+	if (!skb->sk || !tls_is_sk_tx_device_offloaded(skb->sk))
+		goto out;
+
+	datalen = skb->len - (skb_transport_offset(skb) + tcp_hdrlen(skb));
+	if (!datalen)
+		goto out;
+
+	tls_ctx = tls_get_ctx(skb->sk);
+	if (unlikely(tls_ctx->netdev != netdev))
+		goto out;
+
+	skb_seq = ntohl(tcp_hdr(skb)->seq);
+	context = mlx5e_get_tls_tx_context(tls_ctx);
+	expected_seq = context->expected_seq;
+
+	if (unlikely(expected_seq != skb_seq)) {
+		skb = mlx5e_tls_handle_ooo(context, sq, skb, wqe, pi);
+		goto out;
+	}
+
+	if (unlikely(mlx5e_tls_add_metadata(skb, context->swid))) {
+		dev_kfree_skb_any(skb);
+		skb = NULL;
+		goto out;
+	}
+
+	context->expected_seq = skb_seq + datalen;
+out:
+	return skb;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.h
new file mode 100644
index 000000000000..405dfd302225
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2018 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __MLX5E_TLS_RXTX_H__
+#define __MLX5E_TLS_RXTX_H__
+
+#ifdef CONFIG_MLX5_EN_TLS
+
+#include <linux/skbuff.h>
+#include "en.h"
+
+struct sk_buff *mlx5e_tls_handle_tx_skb(struct net_device *netdev,
+					struct mlx5e_txqsq *sq,
+					struct sk_buff *skb,
+					struct mlx5e_tx_wqe **wqe,
+					u16 *pi);
+
+#endif /* CONFIG_MLX5_EN_TLS */
+
+#endif /* __MLX5E_TLS_RXTX_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 0450c8c9b447..5f5f97ee64b2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1016,6 +1016,8 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
 	INIT_WORK(&sq->recover.recover_work, mlx5e_sq_recover);
 	if (MLX5_IPSEC_DEV(c->priv->mdev))
 		set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state);
+	if (mlx5_accel_is_tls_device(c->priv->mdev))
+		set_bit(MLX5E_SQ_STATE_TLS, &sq->state);
 
 	param->wq.db_numa_node = cpu_to_node(c->cpu);
 	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
index b08c94422907..c04cf2b4d838 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
@@ -43,6 +43,12 @@ static const struct counter_desc sw_stats_desc[] = {
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tso_inner_packets) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tso_inner_bytes) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_added_vlan_packets) },
+
+#ifdef CONFIG_MLX5_EN_TLS
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tls_ooo) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tls_resync_bytes) },
+#endif
+
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_lro_packets) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_lro_bytes) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_removed_vlan_packets) },
@@ -161,6 +167,10 @@ static void mlx5e_grp_sw_update_stats(struct mlx5e_priv *priv)
 			s->tx_csum_partial_inner += sq_stats->csum_partial_inner;
 			s->tx_csum_none		+= sq_stats->csum_none;
 			s->tx_csum_partial	+= sq_stats->csum_partial;
+#ifdef CONFIG_MLX5_EN_TLS
+			s->tx_tls_ooo		+= sq_stats->tls_ooo;
+			s->tx_tls_resync_bytes	+= sq_stats->tls_resync_bytes;
+#endif
 		}
 	}
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
index 53111a2df587..a36e6a87066b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
@@ -93,6 +93,11 @@ struct mlx5e_sw_stats {
 	u64 rx_cache_waive;
 	u64 ch_eq_rearm;
 
+#ifdef CONFIG_MLX5_EN_TLS
+	u64 tx_tls_ooo;
+	u64 tx_tls_resync_bytes;
+#endif
+
 	/* Special handling counters */
 	u64 link_down_events_phy;
 };
@@ -194,6 +199,10 @@ struct mlx5e_sq_stats {
 	u64 csum_partial_inner;
 	u64 added_vlan_packets;
 	u64 nop;
+#ifdef CONFIG_MLX5_EN_TLS
+	u64 tls_ooo;
+	u64 tls_resync_bytes;
+#endif
 	/* less likely accessed in data path */
 	u64 csum_none;
 	u64 stopped;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 20297108528a..486b6bfba8b8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -35,12 +35,21 @@
 #include <net/dsfield.h>
 #include "en.h"
 #include "ipoib/ipoib.h"
-#include "en_accel/ipsec_rxtx.h"
+#include "en_accel/en_accel.h"
 #include "lib/clock.h"
 
 #define MLX5E_SQ_NOPS_ROOM  MLX5_SEND_WQE_MAX_WQEBBS
+
+#ifndef CONFIG_MLX5_EN_TLS
 #define MLX5E_SQ_STOP_ROOM (MLX5_SEND_WQE_MAX_WQEBBS +\
 			    MLX5E_SQ_NOPS_ROOM)
+#else
+/* TLS offload requires MLX5E_SQ_STOP_ROOM to have
+ * enough room for a resync SKB, a normal SKB and a NOP
+ */
+#define MLX5E_SQ_STOP_ROOM (2 * MLX5_SEND_WQE_MAX_WQEBBS +\
+			    MLX5E_SQ_NOPS_ROOM)
+#endif
 
 static inline void mlx5e_tx_dma_unmap(struct device *pdev,
 				      struct mlx5e_sq_dma *dma)
@@ -325,8 +334,8 @@ mlx5e_txwqe_complete(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 	}
 }
 
-static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
-				 struct mlx5e_tx_wqe *wqe, u16 pi)
+netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
+			  struct mlx5e_tx_wqe *wqe, u16 pi)
 {
 	struct mlx5e_tx_wqe_info *wi   = &sq->db.wqe_info[pi];
 
@@ -399,21 +408,19 @@ dma_unmap_wqe_err:
 netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
-	struct mlx5e_txqsq *sq = priv->txq2sq[skb_get_queue_mapping(skb)];
-	struct mlx5_wq_cyc *wq = &sq->wq;
-	u16 pi = sq->pc & wq->sz_m1;
-	struct mlx5e_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(wq, pi);
+	struct mlx5e_tx_wqe *wqe;
+	struct mlx5e_txqsq *sq;
+	u16 pi;
 
-	memset(wqe, 0, sizeof(*wqe));
+	sq = priv->txq2sq[skb_get_queue_mapping(skb)];
+	mlx5e_sq_fetch_wqe(sq, &wqe, &pi);
 
-#ifdef CONFIG_MLX5_EN_IPSEC
-	if (sq->state & BIT(MLX5E_SQ_STATE_IPSEC)) {
-		skb = mlx5e_ipsec_handle_tx_skb(dev, wqe, skb);
-		if (unlikely(!skb))
-			return NETDEV_TX_OK;
-	}
+#ifdef CONFIG_MLX5_ACCEL
+	/* might send skbs and update wqe and pi */
+	skb = mlx5e_accel_handle_tx(skb, sq, dev, &wqe, &pi);
+	if (unlikely(!skb))
+		return NETDEV_TX_OK;
 #endif
-
 	return mlx5e_sq_xmit(sq, skb, wqe, pi);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 43585a41bd894925abe5015edbce475beb3d8c10 Mon Sep 17 00:00:00 2001
From: Ilya Lesokhin <ilyal@mellanox.com>
Date: Mon, 30 Apr 2018 10:16:21 +0300
Subject: net/mlx5e: TLS, Add error statistics

Add statistics for rare TLS related errors.
Since the errors are rare we have a counter per netdev
rather then per SQ.

Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com>
Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  3 +
 .../net/ethernet/mellanox/mlx5/core/en_accel/tls.c | 22 ++++++
 .../net/ethernet/mellanox/mlx5/core/en_accel/tls.h | 22 ++++++
 .../mellanox/mlx5/core/en_accel/tls_rxtx.c         | 24 +++---
 .../mellanox/mlx5/core/en_accel/tls_stats.c        | 89 ++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  4 +
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 22 ++++++
 8 files changed, 178 insertions(+), 10 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_stats.c

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index ec785f589666..a7135f5d5cf6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -28,6 +28,6 @@ mlx5_core-$(CONFIG_MLX5_CORE_IPOIB) += ipoib/ipoib.o ipoib/ethtool.o ipoib/ipoib
 mlx5_core-$(CONFIG_MLX5_EN_IPSEC) += en_accel/ipsec.o en_accel/ipsec_rxtx.o \
 		en_accel/ipsec_stats.o
 
-mlx5_core-$(CONFIG_MLX5_EN_TLS) +=  en_accel/tls.o en_accel/tls_rxtx.o
+mlx5_core-$(CONFIG_MLX5_EN_TLS) +=  en_accel/tls.o en_accel/tls_rxtx.o en_accel/tls_stats.o
 
 CFLAGS_tracepoint.o := -I$(src)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index ca5b8523bc95..9cc07da09b70 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -801,6 +801,9 @@ struct mlx5e_priv {
 #ifdef CONFIG_MLX5_EN_IPSEC
 	struct mlx5e_ipsec        *ipsec;
 #endif
+#ifdef CONFIG_MLX5_EN_TLS
+	struct mlx5e_tls          *tls;
+#endif
 };
 
 struct mlx5e_profile {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c
index aa6981c98bdc..d167845271c3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c
@@ -173,3 +173,25 @@ void mlx5e_tls_build_netdev(struct mlx5e_priv *priv)
 	netdev->hw_features |= NETIF_F_HW_TLS_TX;
 	netdev->tlsdev_ops = &mlx5e_tls_ops;
 }
+
+int mlx5e_tls_init(struct mlx5e_priv *priv)
+{
+	struct mlx5e_tls *tls = kzalloc(sizeof(*tls), GFP_KERNEL);
+
+	if (!tls)
+		return -ENOMEM;
+
+	priv->tls = tls;
+	return 0;
+}
+
+void mlx5e_tls_cleanup(struct mlx5e_priv *priv)
+{
+	struct mlx5e_tls *tls = priv->tls;
+
+	if (!tls)
+		return;
+
+	kfree(tls);
+	priv->tls = NULL;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h
index f7216b9b98e2..b6162178f621 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h
@@ -38,6 +38,17 @@
 #include <net/tls.h>
 #include "en.h"
 
+struct mlx5e_tls_sw_stats {
+	atomic64_t tx_tls_drop_metadata;
+	atomic64_t tx_tls_drop_resync_alloc;
+	atomic64_t tx_tls_drop_no_sync_data;
+	atomic64_t tx_tls_drop_bypass_required;
+};
+
+struct mlx5e_tls {
+	struct mlx5e_tls_sw_stats sw_stats;
+};
+
 struct mlx5e_tls_offload_context {
 	struct tls_offload_context base;
 	u32 expected_seq;
@@ -55,10 +66,21 @@ mlx5e_get_tls_tx_context(struct tls_context *tls_ctx)
 }
 
 void mlx5e_tls_build_netdev(struct mlx5e_priv *priv);
+int mlx5e_tls_init(struct mlx5e_priv *priv);
+void mlx5e_tls_cleanup(struct mlx5e_priv *priv);
+
+int mlx5e_tls_get_count(struct mlx5e_priv *priv);
+int mlx5e_tls_get_strings(struct mlx5e_priv *priv, uint8_t *data);
+int mlx5e_tls_get_stats(struct mlx5e_priv *priv, u64 *data);
 
 #else
 
 static inline void mlx5e_tls_build_netdev(struct mlx5e_priv *priv) { }
+static inline int mlx5e_tls_init(struct mlx5e_priv *priv) { return 0; }
+static inline void mlx5e_tls_cleanup(struct mlx5e_priv *priv) { }
+static inline int mlx5e_tls_get_count(struct mlx5e_priv *priv) { return 0; }
+static inline int mlx5e_tls_get_strings(struct mlx5e_priv *priv, uint8_t *data) { return 0; }
+static inline int mlx5e_tls_get_stats(struct mlx5e_priv *priv, u64 *data) { return 0; }
 
 #endif
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c
index 49e8d455ebc3..ad2790fb5966 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c
@@ -164,7 +164,8 @@ static struct sk_buff *
 mlx5e_tls_handle_ooo(struct mlx5e_tls_offload_context *context,
 		     struct mlx5e_txqsq *sq, struct sk_buff *skb,
 		     struct mlx5e_tx_wqe **wqe,
-		     u16 *pi)
+		     u16 *pi,
+		     struct mlx5e_tls *tls)
 {
 	u32 tcp_seq = ntohl(tcp_hdr(skb)->seq);
 	struct sync_info info;
@@ -175,12 +176,14 @@ mlx5e_tls_handle_ooo(struct mlx5e_tls_offload_context *context,
 
 	sq->stats.tls_ooo++;
 
-	if (mlx5e_tls_get_sync_data(context, tcp_seq, &info))
+	if (mlx5e_tls_get_sync_data(context, tcp_seq, &info)) {
 		/* We might get here if a retransmission reaches the driver
 		 * after the relevant record is acked.
 		 * It should be safe to drop the packet in this case
 		 */
+		atomic64_inc(&tls->sw_stats.tx_tls_drop_no_sync_data);
 		goto err_out;
+	}
 
 	if (unlikely(info.sync_len < 0)) {
 		u32 payload;
@@ -192,21 +195,22 @@ mlx5e_tls_handle_ooo(struct mlx5e_tls_offload_context *context,
 			 */
 			return skb;
 
-		netdev_err(skb->dev,
-			   "Can't offload from the middle of an SKB [seq: %X, offload_seq: %X, end_seq: %X]\n",
-			   tcp_seq, tcp_seq + payload + info.sync_len,
-			   tcp_seq + payload);
+		atomic64_inc(&tls->sw_stats.tx_tls_drop_bypass_required);
 		goto err_out;
 	}
 
-	if (unlikely(mlx5e_tls_add_metadata(skb, context->swid)))
+	if (unlikely(mlx5e_tls_add_metadata(skb, context->swid))) {
+		atomic64_inc(&tls->sw_stats.tx_tls_drop_metadata);
 		goto err_out;
+	}
 
 	headln = skb_transport_offset(skb) + tcp_hdrlen(skb);
 	linear_len += headln + sizeof(info.rcd_sn);
 	nskb = alloc_skb(linear_len, GFP_ATOMIC);
-	if (unlikely(!nskb))
+	if (unlikely(!nskb)) {
+		atomic64_inc(&tls->sw_stats.tx_tls_drop_resync_alloc);
 		goto err_out;
+	}
 
 	context->expected_seq = tcp_seq + skb->len - headln;
 	skb_put(nskb, linear_len);
@@ -234,6 +238,7 @@ struct sk_buff *mlx5e_tls_handle_tx_skb(struct net_device *netdev,
 					struct mlx5e_tx_wqe **wqe,
 					u16 *pi)
 {
+	struct mlx5e_priv *priv = netdev_priv(netdev);
 	struct mlx5e_tls_offload_context *context;
 	struct tls_context *tls_ctx;
 	u32 expected_seq;
@@ -256,11 +261,12 @@ struct sk_buff *mlx5e_tls_handle_tx_skb(struct net_device *netdev,
 	expected_seq = context->expected_seq;
 
 	if (unlikely(expected_seq != skb_seq)) {
-		skb = mlx5e_tls_handle_ooo(context, sq, skb, wqe, pi);
+		skb = mlx5e_tls_handle_ooo(context, sq, skb, wqe, pi, priv->tls);
 		goto out;
 	}
 
 	if (unlikely(mlx5e_tls_add_metadata(skb, context->swid))) {
+		atomic64_inc(&priv->tls->sw_stats.tx_tls_drop_metadata);
 		dev_kfree_skb_any(skb);
 		skb = NULL;
 		goto out;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_stats.c
new file mode 100644
index 000000000000..01468ec27446
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_stats.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2018 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/ethtool.h>
+#include <net/sock.h>
+
+#include "en.h"
+#include "accel/tls.h"
+#include "fpga/sdk.h"
+#include "en_accel/tls.h"
+
+static const struct counter_desc mlx5e_tls_sw_stats_desc[] = {
+	{ MLX5E_DECLARE_STAT(struct mlx5e_tls_sw_stats, tx_tls_drop_metadata) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_tls_sw_stats, tx_tls_drop_resync_alloc) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_tls_sw_stats, tx_tls_drop_no_sync_data) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_tls_sw_stats, tx_tls_drop_bypass_required) },
+};
+
+#define MLX5E_READ_CTR_ATOMIC64(ptr, dsc, i) \
+	atomic64_read((atomic64_t *)((char *)(ptr) + (dsc)[i].offset))
+
+#define NUM_TLS_SW_COUNTERS ARRAY_SIZE(mlx5e_tls_sw_stats_desc)
+
+int mlx5e_tls_get_count(struct mlx5e_priv *priv)
+{
+	if (!priv->tls)
+		return 0;
+
+	return NUM_TLS_SW_COUNTERS;
+}
+
+int mlx5e_tls_get_strings(struct mlx5e_priv *priv, uint8_t *data)
+{
+	unsigned int i, idx = 0;
+
+	if (!priv->tls)
+		return 0;
+
+	for (i = 0; i < NUM_TLS_SW_COUNTERS; i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN,
+		       mlx5e_tls_sw_stats_desc[i].format);
+
+	return NUM_TLS_SW_COUNTERS;
+}
+
+int mlx5e_tls_get_stats(struct mlx5e_priv *priv, u64 *data)
+{
+	int i, idx = 0;
+
+	if (!priv->tls)
+		return 0;
+
+	for (i = 0; i < NUM_TLS_SW_COUNTERS; i++)
+		data[idx++] =
+		    MLX5E_READ_CTR_ATOMIC64(&priv->tls->sw_stats,
+					    mlx5e_tls_sw_stats_desc, i);
+
+	return NUM_TLS_SW_COUNTERS;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 5f5f97ee64b2..553eb63753ff 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -4422,12 +4422,16 @@ static void mlx5e_nic_init(struct mlx5_core_dev *mdev,
 	err = mlx5e_ipsec_init(priv);
 	if (err)
 		mlx5_core_err(mdev, "IPSec initialization failed, %d\n", err);
+	err = mlx5e_tls_init(priv);
+	if (err)
+		mlx5_core_err(mdev, "TLS initialization failed, %d\n", err);
 	mlx5e_build_nic_netdev(netdev);
 	mlx5e_vxlan_init(priv);
 }
 
 static void mlx5e_nic_cleanup(struct mlx5e_priv *priv)
 {
+	mlx5e_tls_cleanup(priv);
 	mlx5e_ipsec_cleanup(priv);
 	mlx5e_vxlan_cleanup(priv);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
index c04cf2b4d838..e17919c0af08 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
@@ -32,6 +32,7 @@
 
 #include "en.h"
 #include "en_accel/ipsec.h"
+#include "en_accel/tls.h"
 
 static const struct counter_desc sw_stats_desc[] = {
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_packets) },
@@ -1075,6 +1076,22 @@ static void mlx5e_grp_ipsec_update_stats(struct mlx5e_priv *priv)
 	mlx5e_ipsec_update_stats(priv);
 }
 
+static int mlx5e_grp_tls_get_num_stats(struct mlx5e_priv *priv)
+{
+	return mlx5e_tls_get_count(priv);
+}
+
+static int mlx5e_grp_tls_fill_strings(struct mlx5e_priv *priv, u8 *data,
+				      int idx)
+{
+	return idx + mlx5e_tls_get_strings(priv, data + idx * ETH_GSTRING_LEN);
+}
+
+static int mlx5e_grp_tls_fill_stats(struct mlx5e_priv *priv, u64 *data, int idx)
+{
+	return idx + mlx5e_tls_get_stats(priv, data + idx);
+}
+
 static const struct counter_desc rq_stats_desc[] = {
 	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, packets) },
 	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, bytes) },
@@ -1277,6 +1294,11 @@ const struct mlx5e_stats_grp mlx5e_stats_grps[] = {
 		.fill_stats = mlx5e_grp_ipsec_fill_stats,
 		.update_stats = mlx5e_grp_ipsec_update_stats,
 	},
+	{
+		.get_num_stats = mlx5e_grp_tls_get_num_stats,
+		.fill_strings = mlx5e_grp_tls_fill_strings,
+		.fill_stats = mlx5e_grp_tls_fill_stats,
+	},
 	{
 		.get_num_stats = mlx5e_grp_channels_get_num_stats,
 		.fill_strings = mlx5e_grp_channels_fill_strings,
-- 
cgit v1.2.3-59-g8ed1b


From a051505c7ec025772983419cc309dc593843f3d2 Mon Sep 17 00:00:00 2001
From: Boris Pismenny <borisp@mellanox.com>
Date: Mon, 30 Apr 2018 10:16:22 +0300
Subject: MAINTAINERS: Update mlx5 innova driver maintainers

Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index a52800867850..8f0fc25f0042 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9037,26 +9037,17 @@ W:	http://www.mellanox.com
 Q:	http://patchwork.ozlabs.org/project/netdev/list/
 F:	drivers/net/ethernet/mellanox/mlx5/core/en_*
 
-MELLANOX ETHERNET INNOVA DRIVER
-M:	Ilan Tayari <ilant@mellanox.com>
-R:	Boris Pismenny <borisp@mellanox.com>
+MELLANOX ETHERNET INNOVA DRIVERS
+M:	Boris Pismenny <borisp@mellanox.com>
 L:	netdev@vger.kernel.org
 S:	Supported
 W:	http://www.mellanox.com
 Q:	http://patchwork.ozlabs.org/project/netdev/list/
+F:	drivers/net/ethernet/mellanox/mlx5/core/en_accel/*
+F:	drivers/net/ethernet/mellanox/mlx5/core/accel/*
 F:	drivers/net/ethernet/mellanox/mlx5/core/fpga/*
 F:	include/linux/mlx5/mlx5_ifc_fpga.h
 
-MELLANOX ETHERNET INNOVA IPSEC DRIVER
-M:	Ilan Tayari <ilant@mellanox.com>
-R:	Boris Pismenny <borisp@mellanox.com>
-L:	netdev@vger.kernel.org
-S:	Supported
-W:	http://www.mellanox.com
-Q:	http://patchwork.ozlabs.org/project/netdev/list/
-F:	drivers/net/ethernet/mellanox/mlx5/core/en_ipsec/*
-F:	drivers/net/ethernet/mellanox/mlx5/core/ipsec*
-
 MELLANOX ETHERNET SWITCH DRIVERS
 M:	Jiri Pirko <jiri@mellanox.com>
 M:	Ido Schimmel <idosch@mellanox.com>
-- 
cgit v1.2.3-59-g8ed1b


From f9c8141fc10324cef00f7e5a3358ccdbe1bd10b4 Mon Sep 17 00:00:00 2001
From: Boris Pismenny <borisp@mellanox.com>
Date: Mon, 30 Apr 2018 10:16:23 +0300
Subject: MAINTAINERS: Update TLS maintainers

Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 8f0fc25f0042..4ee8bec9c9bd 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9839,7 +9839,7 @@ F:	net/netfilter/xt_CONNSECMARK.c
 F:	net/netfilter/xt_SECMARK.c
 
 NETWORKING [TLS]
-M:	Ilya Lesokhin <ilyal@mellanox.com>
+M:	Boris Pismenny <borisp@mellanox.com>
 M:	Aviad Yehezkel <aviadye@mellanox.com>
 M:	Dave Watson <davejwatson@fb.com>
 L:	netdev@vger.kernel.org
-- 
cgit v1.2.3-59-g8ed1b


From 26045a7b14bc7a5455e411d820110f66557d6589 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Sun, 29 Apr 2018 12:44:11 +0200
Subject: uevent: add alloc_uevent_skb() helper

This patch adds alloc_uevent_skb() in preparation for follow up patches.

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/kobject_uevent.c | 47 ++++++++++++++++++++++++++++++++++-------------
 1 file changed, 34 insertions(+), 13 deletions(-)

diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index 15ea216a67ce..649bf60a9440 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -22,6 +22,7 @@
 #include <linux/socket.h>
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
+#include <linux/uidgid.h>
 #include <linux/uuid.h>
 #include <linux/ctype.h>
 #include <net/sock.h>
@@ -296,6 +297,38 @@ static void cleanup_uevent_env(struct subprocess_info *info)
 }
 #endif
 
+#ifdef CONFIG_NET
+static struct sk_buff *alloc_uevent_skb(struct kobj_uevent_env *env,
+					const char *action_string,
+					const char *devpath)
+{
+	struct netlink_skb_parms *parms;
+	struct sk_buff *skb = NULL;
+	char *scratch;
+	size_t len;
+
+	/* allocate message with maximum possible size */
+	len = strlen(action_string) + strlen(devpath) + 2;
+	skb = alloc_skb(len + env->buflen, GFP_KERNEL);
+	if (!skb)
+		return NULL;
+
+	/* add header */
+	scratch = skb_put(skb, len);
+	sprintf(scratch, "%s@%s", action_string, devpath);
+
+	skb_put_data(skb, env->buf, env->buflen);
+
+	parms = &NETLINK_CB(skb);
+	parms->creds.uid = GLOBAL_ROOT_UID;
+	parms->creds.gid = GLOBAL_ROOT_GID;
+	parms->dst_group = 1;
+	parms->portid = 0;
+
+	return skb;
+}
+#endif
+
 static int kobject_uevent_net_broadcast(struct kobject *kobj,
 					struct kobj_uevent_env *env,
 					const char *action_string,
@@ -314,22 +347,10 @@ static int kobject_uevent_net_broadcast(struct kobject *kobj,
 			continue;
 
 		if (!skb) {
-			/* allocate message with the maximum possible size */
-			size_t len = strlen(action_string) + strlen(devpath) + 2;
-			char *scratch;
-
 			retval = -ENOMEM;
-			skb = alloc_skb(len + env->buflen, GFP_KERNEL);
+			skb = alloc_uevent_skb(env, action_string, devpath);
 			if (!skb)
 				continue;
-
-			/* add header */
-			scratch = skb_put(skb, len);
-			sprintf(scratch, "%s@%s", action_string, devpath);
-
-			skb_put_data(skb, env->buf, env->buflen);
-
-			NETLINK_CB(skb).dst_group = 1;
 		}
 
 		retval = netlink_broadcast_filtered(uevent_sock, skb_get(skb),
-- 
cgit v1.2.3-59-g8ed1b


From a3498436b3a0f8ec289e6847e1de40b4123e1639 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Sun, 29 Apr 2018 12:44:12 +0200
Subject: netns: restrict uevents

commit 07e98962fa77 ("kobject: Send hotplug events in all network namespaces")

enabled sending hotplug events into all network namespaces back in 2010.
Over time the set of uevents that get sent into all network namespaces has
shrunk. We have now reached the point where hotplug events for all devices
that carry a namespace tag are filtered according to that namespace.
Specifically, they are filtered whenever the namespace tag of the kobject
does not match the namespace tag of the netlink socket.
Currently, only network devices carry namespace tags (i.e. network
namespace tags). Hence, uevents for network devices only show up in the
network namespace such devices are created in or moved to.

However, any uevent for a kobject that does not have a namespace tag
associated with it will not be filtered and we will broadcast it into all
network namespaces. This behavior stopped making sense when user namespaces
were introduced.

This patch simplifies and fixes couple of things:
- Split codepath for sending uevents by kobject namespace tags:
  1. Untagged kobjects - uevent_net_broadcast_untagged():
     Untagged kobjects will be broadcast into all uevent sockets recorded
     in uevent_sock_list, i.e. into all network namespacs owned by the
     intial user namespace.
  2. Tagged kobjects - uevent_net_broadcast_tagged():
     Tagged kobjects will only be broadcast into the network namespace they
     were tagged with.
  Handling of tagged kobjects in 2. does not cause any semantic changes.
  This is just splitting out the filtering logic that was handled by
  kobj_bcast_filter() before.
  Handling of untagged kobjects in 1. will cause a semantic change. The
  reasons why this is needed and ok have been discussed in [1]. Here is a
  short summary:
  - Userspace ignores uevents from network namespaces that are not owned by
    the intial user namespace:
    Uevents are filtered by userspace in a user namespace because the
    received uid != 0. Instead the uid associated with the event will be
    65534 == "nobody" because the global root uid is not mapped.
    This means we can safely and without introducing regressions modify the
    kernel to not send uevents into all network namespaces whose owning
    user namespace is not the initial user namespace because we know that
    userspace will ignore the message because of the uid anyway.
    I have a) verified that is is true for every udev implementation out
    there b) that this behavior has been present in all udev
    implementations from the very beginning.
  - Thundering herd:
    Broadcasting uevents into all network namespaces introduces significant
    overhead.
    All processes that listen to uevents running in non-initial user
    namespaces will end up responding to uevents that will be meaningless
    to them. Mainly, because non-initial user namespaces cannot easily
    manage devices unless they have a privileged host-process helping them
    out. This means that there will be a thundering herd of activity when
    there shouldn't be any.
  - Removing needless overhead/Increasing performance:
    Currently, the uevent socket for each network namespace is added to the
    global variable uevent_sock_list. The list itself needs to be protected
    by a mutex. So everytime a uevent is generated the mutex is taken on
    the list. The mutex is held *from the creation of the uevent (memory
    allocation, string creation etc. until all uevent sockets have been
    handled*. This is aggravated by the fact that for each uevent socket
    that has listeners the mc_list must be walked as well which means we're
    talking O(n^2) here. Given that a standard Linux workload usually has
    quite a lot of network namespaces and - in the face of containers - a
    lot of user namespaces this quickly becomes a performance problem (see
    "Thundering herd" above). By just recording uevent sockets of network
    namespaces that are owned by the initial user namespace we
    significantly increase performance in this codepath.
  - Injecting uevents:
    There's a valid argument that containers might be interested in
    receiving device events especially if they are delegated to them by a
    privileged userspace process. One prime example are SR-IOV enabled
    devices that are explicitly designed to be handed of to other users
    such as VMs or containers.
    This use-case can now be correctly handled since
    commit 692ec06d7c92 ("netns: send uevent messages"). This commit
    introduced the ability to send uevents from userspace. As such we can
    let a sufficiently privileged (CAP_SYS_ADMIN in the owning user
    namespace of the network namespace of the netlink socket) userspace
    process make a decision what uevents should be sent. This removes the
    need to blindly broadcast uevents into all user namespaces and provides
    a performant and safe solution to this problem.
  - Filtering logic:
    This patch filters by *owning user namespace of the network namespace a
    given task resides in* and not by user namespace of the task per se.
    This means if the user namespace of a given task is unshared but the
    network namespace is kept and is owned by the initial user namespace a
    listener that is opening the uevent socket in that network namespace
    can still listen to uevents.
- Fix permission for tagged kobjects:
  Network devices that are created or moved into a network namespace that
  is owned by a non-initial user namespace currently are send with
  INVALID_{G,U}ID in their credentials. This means that all current udev
  implementations in userspace will ignore the uevent they receive for
  them. This has lead to weird bugs whereby new devices showing up in such
  network namespaces were not recognized and did not get IPs assigned etc.
  This patch adjusts the permission to the appropriate {g,u}id in the
  respective user namespace. This way udevd is able to correctly handle
  such devices.
- Simplify filtering logic:
  do_one_broadcast() already ensures that only listeners in mc_list receive
  uevents that have the same network namespace as the uevent socket itself.
  So the filtering logic in kobj_bcast_filter is not needed (see [3]). This
  patch therefore removes kobj_bcast_filter() and replaces
  netlink_broadcast_filtered() with the simpler netlink_broadcast()
  everywhere.

[1]: https://lkml.org/lkml/2018/4/4/739
[2]: https://lkml.org/lkml/2018/4/26/767
[3]: https://lkml.org/lkml/2018/4/26/738
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/kobject_uevent.c | 137 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 95 insertions(+), 42 deletions(-)

diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index 649bf60a9440..63d0816ab23b 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -232,30 +232,6 @@ out:
 	return r;
 }
 
-#ifdef CONFIG_NET
-static int kobj_bcast_filter(struct sock *dsk, struct sk_buff *skb, void *data)
-{
-	struct kobject *kobj = data, *ksobj;
-	const struct kobj_ns_type_operations *ops;
-
-	ops = kobj_ns_ops(kobj);
-	if (!ops && kobj->kset) {
-		ksobj = &kobj->kset->kobj;
-		if (ksobj->parent != NULL)
-			ops = kobj_ns_ops(ksobj->parent);
-	}
-
-	if (ops && ops->netlink_ns && kobj->ktype->namespace) {
-		const void *sock_ns, *ns;
-		ns = kobj->ktype->namespace(kobj);
-		sock_ns = ops->netlink_ns(dsk);
-		return sock_ns != ns;
-	}
-
-	return 0;
-}
-#endif
-
 #ifdef CONFIG_UEVENT_HELPER
 static int kobj_usermode_filter(struct kobject *kobj)
 {
@@ -327,17 +303,14 @@ static struct sk_buff *alloc_uevent_skb(struct kobj_uevent_env *env,
 
 	return skb;
 }
-#endif
 
-static int kobject_uevent_net_broadcast(struct kobject *kobj,
-					struct kobj_uevent_env *env,
-					const char *action_string,
-					const char *devpath)
+static int uevent_net_broadcast_untagged(struct kobj_uevent_env *env,
+					 const char *action_string,
+					 const char *devpath)
 {
-	int retval = 0;
-#if defined(CONFIG_NET)
 	struct sk_buff *skb = NULL;
 	struct uevent_sock *ue_sk;
+	int retval = 0;
 
 	/* send netlink message */
 	list_for_each_entry(ue_sk, &uevent_sock_list, list) {
@@ -353,19 +326,93 @@ static int kobject_uevent_net_broadcast(struct kobject *kobj,
 				continue;
 		}
 
-		retval = netlink_broadcast_filtered(uevent_sock, skb_get(skb),
-						    0, 1, GFP_KERNEL,
-						    kobj_bcast_filter,
-						    kobj);
+		retval = netlink_broadcast(uevent_sock, skb_get(skb), 0, 1,
+					   GFP_KERNEL);
 		/* ENOBUFS should be handled in userspace */
 		if (retval == -ENOBUFS || retval == -ESRCH)
 			retval = 0;
 	}
 	consume_skb(skb);
-#endif
+
 	return retval;
 }
 
+static int uevent_net_broadcast_tagged(struct sock *usk,
+				       struct kobj_uevent_env *env,
+				       const char *action_string,
+				       const char *devpath)
+{
+	struct user_namespace *owning_user_ns = sock_net(usk)->user_ns;
+	struct sk_buff *skb = NULL;
+	int ret = 0;
+
+	skb = alloc_uevent_skb(env, action_string, devpath);
+	if (!skb)
+		return -ENOMEM;
+
+	/* fix credentials */
+	if (owning_user_ns != &init_user_ns) {
+		struct netlink_skb_parms *parms = &NETLINK_CB(skb);
+		kuid_t root_uid;
+		kgid_t root_gid;
+
+		/* fix uid */
+		root_uid = make_kuid(owning_user_ns, 0);
+		if (uid_valid(root_uid))
+			parms->creds.uid = root_uid;
+
+		/* fix gid */
+		root_gid = make_kgid(owning_user_ns, 0);
+		if (gid_valid(root_gid))
+			parms->creds.gid = root_gid;
+	}
+
+	ret = netlink_broadcast(usk, skb, 0, 1, GFP_KERNEL);
+	/* ENOBUFS should be handled in userspace */
+	if (ret == -ENOBUFS || ret == -ESRCH)
+		ret = 0;
+
+	return ret;
+}
+#endif
+
+static int kobject_uevent_net_broadcast(struct kobject *kobj,
+					struct kobj_uevent_env *env,
+					const char *action_string,
+					const char *devpath)
+{
+	int ret = 0;
+
+#ifdef CONFIG_NET
+	const struct kobj_ns_type_operations *ops;
+	const struct net *net = NULL;
+
+	ops = kobj_ns_ops(kobj);
+	if (!ops && kobj->kset) {
+		struct kobject *ksobj = &kobj->kset->kobj;
+		if (ksobj->parent != NULL)
+			ops = kobj_ns_ops(ksobj->parent);
+	}
+
+	/* kobjects currently only carry network namespace tags and they
+	 * are the only tag relevant here since we want to decide which
+	 * network namespaces to broadcast the uevent into.
+	 */
+	if (ops && ops->netlink_ns && kobj->ktype->namespace)
+		if (ops->type == KOBJ_NS_TYPE_NET)
+			net = kobj->ktype->namespace(kobj);
+
+	if (!net)
+		ret = uevent_net_broadcast_untagged(env, action_string,
+						    devpath);
+	else
+		ret = uevent_net_broadcast_tagged(net->uevent_sock->sk, env,
+						  action_string, devpath);
+#endif
+
+	return ret;
+}
+
 static void zap_modalias_env(struct kobj_uevent_env *env)
 {
 	static const char modalias_prefix[] = "MODALIAS=";
@@ -724,9 +771,13 @@ static int uevent_net_init(struct net *net)
 
 	net->uevent_sock = ue_sk;
 
-	mutex_lock(&uevent_sock_mutex);
-	list_add_tail(&ue_sk->list, &uevent_sock_list);
-	mutex_unlock(&uevent_sock_mutex);
+	/* Restrict uevents to initial user namespace. */
+	if (sock_net(ue_sk->sk)->user_ns == &init_user_ns) {
+		mutex_lock(&uevent_sock_mutex);
+		list_add_tail(&ue_sk->list, &uevent_sock_list);
+		mutex_unlock(&uevent_sock_mutex);
+	}
+
 	return 0;
 }
 
@@ -734,9 +785,11 @@ static void uevent_net_exit(struct net *net)
 {
 	struct uevent_sock *ue_sk = net->uevent_sock;
 
-	mutex_lock(&uevent_sock_mutex);
-	list_del(&ue_sk->list);
-	mutex_unlock(&uevent_sock_mutex);
+	if (sock_net(ue_sk->sk)->user_ns == &init_user_ns) {
+		mutex_lock(&uevent_sock_mutex);
+		list_del(&ue_sk->list);
+		mutex_unlock(&uevent_sock_mutex);
+	}
 
 	netlink_kernel_release(ue_sk->sk);
 	kfree(ue_sk);
-- 
cgit v1.2.3-59-g8ed1b


From 7e5d05e18ba1ed491c6f836edee7f0b90f3167bc Mon Sep 17 00:00:00 2001
From: Yixun Lan <yixun.lan@amlogic.com>
Date: Sat, 28 Apr 2018 10:21:10 +0000
Subject: dt-bindings: net: meson-dwmac: new compatible name for AXG SoC

We need to introduce a new compatible name for the Meson-AXG SoC
in order to support the RMII 100M ethernet PHY, since the PRG_ETH0
register of the dwmac glue layer is changed from previous old SoC.

Signed-off-by: Yixun Lan <yixun.lan@amlogic.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/devicetree/bindings/net/meson-dwmac.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/net/meson-dwmac.txt b/Documentation/devicetree/bindings/net/meson-dwmac.txt
index 61cada22ae6c..1321bb194ed9 100644
--- a/Documentation/devicetree/bindings/net/meson-dwmac.txt
+++ b/Documentation/devicetree/bindings/net/meson-dwmac.txt
@@ -11,6 +11,7 @@ Required properties on all platforms:
 			- "amlogic,meson8b-dwmac"
 			- "amlogic,meson8m2-dwmac"
 			- "amlogic,meson-gxbb-dwmac"
+			- "amlogic,meson-axg-dwmac"
 		Additionally "snps,dwmac" and any applicable more
 		detailed version number described in net/stmmac.txt
 		should be used.
-- 
cgit v1.2.3-59-g8ed1b


From efacb568c962470894dcda2bc51ea42af96a39eb Mon Sep 17 00:00:00 2001
From: Yixun Lan <yixun.lan@amlogic.com>
Date: Sat, 28 Apr 2018 10:21:11 +0000
Subject: net: stmmac: dwmac-meson: extend phy mode setting

In the Meson-AXG SoC, the phy mode setting of PRG_ETH0 in the glue layer
is extended from bit[0] to bit[2:0].
  There is no problem if we configure it to the RGMII 1000M PHY mode,
since the register setting is coincidentally compatible with previous one,
but for the RMII 100M PHY mode, the configuration need to be changed to
value - b100.
  This patch was verified with a RTL8201F 100M ethernet PHY.

Signed-off-by: Yixun Lan <yixun.lan@amlogic.com>
Acked-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/stmicro/stmmac/dwmac-meson8b.c    | 120 ++++++++++++++++++---
 1 file changed, 104 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c
index 7cb794094a70..4ff231df7322 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c
@@ -18,6 +18,7 @@
 #include <linux/io.h>
 #include <linux/ioport.h>
 #include <linux/module.h>
+#include <linux/of_device.h>
 #include <linux/of_net.h>
 #include <linux/mfd/syscon.h>
 #include <linux/platform_device.h>
@@ -29,6 +30,10 @@
 
 #define PRG_ETH0_RGMII_MODE		BIT(0)
 
+#define PRG_ETH0_EXT_PHY_MODE_MASK	GENMASK(2, 0)
+#define PRG_ETH0_EXT_RGMII_MODE		1
+#define PRG_ETH0_EXT_RMII_MODE		4
+
 /* mux to choose between fclk_div2 (bit unset) and mpll2 (bit set) */
 #define PRG_ETH0_CLK_M250_SEL_SHIFT	4
 #define PRG_ETH0_CLK_M250_SEL_MASK	GENMASK(4, 4)
@@ -47,12 +52,20 @@
 
 #define MUX_CLK_NUM_PARENTS		2
 
+struct meson8b_dwmac;
+
+struct meson8b_dwmac_data {
+	int (*set_phy_mode)(struct meson8b_dwmac *dwmac);
+};
+
 struct meson8b_dwmac {
-	struct device		*dev;
-	void __iomem		*regs;
-	phy_interface_t		phy_mode;
-	struct clk		*rgmii_tx_clk;
-	u32			tx_delay_ns;
+	struct device			*dev;
+	void __iomem			*regs;
+
+	const struct meson8b_dwmac_data	*data;
+	phy_interface_t			phy_mode;
+	struct clk			*rgmii_tx_clk;
+	u32				tx_delay_ns;
 };
 
 struct meson8b_dwmac_clk_configs {
@@ -171,6 +184,59 @@ static int meson8b_init_rgmii_tx_clk(struct meson8b_dwmac *dwmac)
 	return 0;
 }
 
+static int meson8b_set_phy_mode(struct meson8b_dwmac *dwmac)
+{
+	switch (dwmac->phy_mode) {
+	case PHY_INTERFACE_MODE_RGMII:
+	case PHY_INTERFACE_MODE_RGMII_RXID:
+	case PHY_INTERFACE_MODE_RGMII_ID:
+	case PHY_INTERFACE_MODE_RGMII_TXID:
+		/* enable RGMII mode */
+		meson8b_dwmac_mask_bits(dwmac, PRG_ETH0,
+					PRG_ETH0_RGMII_MODE,
+					PRG_ETH0_RGMII_MODE);
+		break;
+	case PHY_INTERFACE_MODE_RMII:
+		/* disable RGMII mode -> enables RMII mode */
+		meson8b_dwmac_mask_bits(dwmac, PRG_ETH0,
+					PRG_ETH0_RGMII_MODE, 0);
+		break;
+	default:
+		dev_err(dwmac->dev, "fail to set phy-mode %s\n",
+			phy_modes(dwmac->phy_mode));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int meson_axg_set_phy_mode(struct meson8b_dwmac *dwmac)
+{
+	switch (dwmac->phy_mode) {
+	case PHY_INTERFACE_MODE_RGMII:
+	case PHY_INTERFACE_MODE_RGMII_RXID:
+	case PHY_INTERFACE_MODE_RGMII_ID:
+	case PHY_INTERFACE_MODE_RGMII_TXID:
+		/* enable RGMII mode */
+		meson8b_dwmac_mask_bits(dwmac, PRG_ETH0,
+					PRG_ETH0_EXT_PHY_MODE_MASK,
+					PRG_ETH0_EXT_RGMII_MODE);
+		break;
+	case PHY_INTERFACE_MODE_RMII:
+		/* disable RGMII mode -> enables RMII mode */
+		meson8b_dwmac_mask_bits(dwmac, PRG_ETH0,
+					PRG_ETH0_EXT_PHY_MODE_MASK,
+					PRG_ETH0_EXT_RMII_MODE);
+		break;
+	default:
+		dev_err(dwmac->dev, "fail to set phy-mode %s\n",
+			phy_modes(dwmac->phy_mode));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int meson8b_init_prg_eth(struct meson8b_dwmac *dwmac)
 {
 	int ret;
@@ -188,10 +254,6 @@ static int meson8b_init_prg_eth(struct meson8b_dwmac *dwmac)
 
 	case PHY_INTERFACE_MODE_RGMII_ID:
 	case PHY_INTERFACE_MODE_RGMII_TXID:
-		/* enable RGMII mode */
-		meson8b_dwmac_mask_bits(dwmac, PRG_ETH0, PRG_ETH0_RGMII_MODE,
-					PRG_ETH0_RGMII_MODE);
-
 		/* only relevant for RMII mode -> disable in RGMII mode */
 		meson8b_dwmac_mask_bits(dwmac, PRG_ETH0,
 					PRG_ETH0_INVERTED_RMII_CLK, 0);
@@ -224,10 +286,6 @@ static int meson8b_init_prg_eth(struct meson8b_dwmac *dwmac)
 		break;
 
 	case PHY_INTERFACE_MODE_RMII:
-		/* disable RGMII mode -> enables RMII mode */
-		meson8b_dwmac_mask_bits(dwmac, PRG_ETH0, PRG_ETH0_RGMII_MODE,
-					0);
-
 		/* invert internal clk_rmii_i to generate 25/2.5 tx_rx_clk */
 		meson8b_dwmac_mask_bits(dwmac, PRG_ETH0,
 					PRG_ETH0_INVERTED_RMII_CLK,
@@ -274,6 +332,11 @@ static int meson8b_dwmac_probe(struct platform_device *pdev)
 		goto err_remove_config_dt;
 	}
 
+	dwmac->data = (const struct meson8b_dwmac_data *)
+		of_device_get_match_data(&pdev->dev);
+	if (!dwmac->data)
+		return -EINVAL;
+
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
 	dwmac->regs = devm_ioremap_resource(&pdev->dev, res);
 	if (IS_ERR(dwmac->regs)) {
@@ -298,6 +361,10 @@ static int meson8b_dwmac_probe(struct platform_device *pdev)
 	if (ret)
 		goto err_remove_config_dt;
 
+	ret = dwmac->data->set_phy_mode(dwmac);
+	if (ret)
+		goto err_remove_config_dt;
+
 	ret = meson8b_init_prg_eth(dwmac);
 	if (ret)
 		goto err_remove_config_dt;
@@ -316,10 +383,31 @@ err_remove_config_dt:
 	return ret;
 }
 
+static const struct meson8b_dwmac_data meson8b_dwmac_data = {
+	.set_phy_mode = meson8b_set_phy_mode,
+};
+
+static const struct meson8b_dwmac_data meson_axg_dwmac_data = {
+	.set_phy_mode = meson_axg_set_phy_mode,
+};
+
 static const struct of_device_id meson8b_dwmac_match[] = {
-	{ .compatible = "amlogic,meson8b-dwmac" },
-	{ .compatible = "amlogic,meson8m2-dwmac" },
-	{ .compatible = "amlogic,meson-gxbb-dwmac" },
+	{
+		.compatible = "amlogic,meson8b-dwmac",
+		.data = &meson8b_dwmac_data,
+	},
+	{
+		.compatible = "amlogic,meson8m2-dwmac",
+		.data = &meson8b_dwmac_data,
+	},
+	{
+		.compatible = "amlogic,meson-gxbb-dwmac",
+		.data = &meson8b_dwmac_data,
+	},
+	{
+		.compatible = "amlogic,meson-axg-dwmac",
+		.data = &meson_axg_dwmac_data,
+	},
 	{ }
 };
 MODULE_DEVICE_TABLE(of, meson8b_dwmac_match);
-- 
cgit v1.2.3-59-g8ed1b


From 6d3e8aa87622b30bfabbbfad7674d2464f2a9cb9 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Sun, 29 Apr 2018 12:56:31 -0300
Subject: sctp: allow sctp_init_cause to return errors

And do so if the skb doesn't have enough space for the payload.
This is a preparation for the next patch.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sm.h    |  2 +-
 net/sctp/sm_make_chunk.c | 12 +++++++++---
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index f4b657478a30..5ef1bad81ef5 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -215,7 +215,7 @@ struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc,
 struct sctp_chunk *sctp_make_shutdown_complete(
 					const struct sctp_association *asoc,
 					const struct sctp_chunk *chunk);
-void sctp_init_cause(struct sctp_chunk *chunk, __be16 cause, size_t paylen);
+int sctp_init_cause(struct sctp_chunk *chunk, __be16 cause, size_t paylen);
 struct sctp_chunk *sctp_make_abort(const struct sctp_association *asoc,
 				   const struct sctp_chunk *chunk,
 				   const size_t hint);
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index db93eabd6ef5..e518eb64ccf3 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -158,8 +158,8 @@ static const struct sctp_paramhdr prsctp_param = {
  * provided chunk, as most cause codes will be embedded inside an
  * abort chunk.
  */
-void sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code,
-		     size_t paylen)
+int sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code,
+		    size_t paylen)
 {
 	struct sctp_errhdr err;
 	__u16 len;
@@ -167,8 +167,14 @@ void sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code,
 	/* Cause code constants are now defined in network order.  */
 	err.cause = cause_code;
 	len = sizeof(err) + paylen;
-	err.length  = htons(len);
+	err.length = htons(len);
+
+	if (skb_tailroom(chunk->skb) < len)
+		return -ENOSPC;
+
 	chunk->subh.err_hdr = sctp_addto_chunk(chunk, sizeof(err), &err);
+
+	return 0;
 }
 
 /* A helper to initialize an op error inside a
-- 
cgit v1.2.3-59-g8ed1b


From 8914f4bace3f3a56f14accb45f9f4b9456670b4a Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Sun, 29 Apr 2018 12:56:32 -0300
Subject: sctp: add sctp_make_op_error_limited and reuse inner functions

The idea is quite similar to the old functions, but note that the _fixed
function wasn't "fixed" as in that it would generate a packet with a fixed
size, but rather limited/bounded to PMTU.

Also, now with sctp_mtu_payload(), we have a more accurate limit.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_make_chunk.c | 130 +++++++++++++++++------------------------------
 1 file changed, 46 insertions(+), 84 deletions(-)

diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index e518eb64ccf3..4d7b3ccea078 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -81,8 +81,6 @@ static int sctp_process_param(struct sctp_association *asoc,
 			      gfp_t gfp);
 static void *sctp_addto_param(struct sctp_chunk *chunk, int len,
 			      const void *data);
-static void  *sctp_addto_chunk_fixed(struct sctp_chunk *, int len,
-				     const void *data);
 
 /* Control chunk destructor */
 static void sctp_control_release_owner(struct sk_buff *skb)
@@ -154,9 +152,8 @@ static const struct sctp_paramhdr prsctp_param = {
 	cpu_to_be16(sizeof(struct sctp_paramhdr)),
 };
 
-/* A helper to initialize an op error inside a
- * provided chunk, as most cause codes will be embedded inside an
- * abort chunk.
+/* A helper to initialize an op error inside a provided chunk, as most
+ * cause codes will be embedded inside an abort chunk.
  */
 int sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code,
 		    size_t paylen)
@@ -177,29 +174,6 @@ int sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code,
 	return 0;
 }
 
-/* A helper to initialize an op error inside a
- * provided chunk, as most cause codes will be embedded inside an
- * abort chunk.  Differs from sctp_init_cause in that it won't oops
- * if there isn't enough space in the op error chunk
- */
-static int sctp_init_cause_fixed(struct sctp_chunk *chunk, __be16 cause_code,
-				 size_t paylen)
-{
-	struct sctp_errhdr err;
-	__u16 len;
-
-	/* Cause code constants are now defined in network order.  */
-	err.cause = cause_code;
-	len = sizeof(err) + paylen;
-	err.length  = htons(len);
-
-	if (skb_tailroom(chunk->skb) < len)
-		return -ENOSPC;
-
-	chunk->subh.err_hdr = sctp_addto_chunk_fixed(chunk, sizeof(err), &err);
-
-	return 0;
-}
 /* 3.3.2 Initiation (INIT) (1)
  *
  * This chunk is used to initiate a SCTP association between two
@@ -1263,20 +1237,26 @@ nodata:
 	return retval;
 }
 
-/* Create an Operation Error chunk of a fixed size,
- * specifically, max(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT)
- * This is a helper function to allocate an error chunk for
- * for those invalid parameter codes in which we may not want
- * to report all the errors, if the incoming chunk is large
+/* Create an Operation Error chunk of a fixed size, specifically,
+ * min(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT) - overheads.
+ * This is a helper function to allocate an error chunk for for those
+ * invalid parameter codes in which we may not want to report all the
+ * errors, if the incoming chunk is large. If it can't fit in a single
+ * packet, we ignore it.
  */
-static inline struct sctp_chunk *sctp_make_op_error_fixed(
+static inline struct sctp_chunk *sctp_make_op_error_limited(
 					const struct sctp_association *asoc,
 					const struct sctp_chunk *chunk)
 {
-	size_t size = asoc ? asoc->pathmtu : 0;
+	size_t size = SCTP_DEFAULT_MAXSEGMENT;
+	struct sctp_sock *sp = NULL;
+
+	if (asoc) {
+		size = min_t(size_t, size, asoc->pathmtu);
+		sp = sctp_sk(asoc->base.sk);
+	}
 
-	if (!size)
-		size = SCTP_DEFAULT_MAXSEGMENT;
+	size = sctp_mtu_payload(sp, size, sizeof(struct sctp_errhdr));
 
 	return sctp_make_op_error_space(asoc, chunk, size);
 }
@@ -1528,18 +1508,6 @@ void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data)
 	return target;
 }
 
-/* Append bytes to the end of a chunk. Returns NULL if there isn't sufficient
- * space in the chunk
- */
-static void *sctp_addto_chunk_fixed(struct sctp_chunk *chunk,
-				    int len, const void *data)
-{
-	if (skb_tailroom(chunk->skb) >= len)
-		return sctp_addto_chunk(chunk, len, data);
-	else
-		return NULL;
-}
-
 /* Append bytes from user space to the end of a chunk.  Will panic if
  * chunk is not big enough.
  * Returns a kernel err value.
@@ -1834,6 +1802,9 @@ no_hmac:
 		kt = ktime_get_real();
 
 	if (!asoc && ktime_before(bear_cookie->expiration, kt)) {
+		suseconds_t usecs = ktime_to_us(ktime_sub(kt, bear_cookie->expiration));
+		__be32 n = htonl(usecs);
+
 		/*
 		 * Section 3.3.10.3 Stale Cookie Error (3)
 		 *
@@ -1842,17 +1813,12 @@ no_hmac:
 		 * Stale Cookie Error:  Indicates the receipt of a valid State
 		 * Cookie that has expired.
 		 */
-		len = ntohs(chunk->chunk_hdr->length);
-		*errp = sctp_make_op_error_space(asoc, chunk, len);
-		if (*errp) {
-			suseconds_t usecs = ktime_to_us(ktime_sub(kt, bear_cookie->expiration));
-			__be32 n = htonl(usecs);
-
-			sctp_init_cause(*errp, SCTP_ERROR_STALE_COOKIE,
-					sizeof(n));
-			sctp_addto_chunk(*errp, sizeof(n), &n);
+		*errp = sctp_make_op_error(asoc, chunk,
+					   SCTP_ERROR_STALE_COOKIE, &n,
+					   sizeof(n), 0);
+		if (*errp)
 			*error = -SCTP_IERROR_STALE_COOKIE;
-		} else
+		else
 			*error = -SCTP_IERROR_NOMEM;
 
 		goto fail;
@@ -2003,12 +1969,8 @@ static int sctp_process_hn_param(const struct sctp_association *asoc,
 	if (*errp)
 		sctp_chunk_free(*errp);
 
-	*errp = sctp_make_op_error_space(asoc, chunk, len);
-
-	if (*errp) {
-		sctp_init_cause(*errp, SCTP_ERROR_DNS_FAILED, len);
-		sctp_addto_chunk(*errp, len, param.v);
-	}
+	*errp = sctp_make_op_error(asoc, chunk, SCTP_ERROR_DNS_FAILED,
+				   param.v, len, 0);
 
 	/* Stop processing this chunk. */
 	return 0;
@@ -2133,23 +2095,23 @@ static enum sctp_ierror sctp_process_unk_param(
 		/* Make an ERROR chunk, preparing enough room for
 		 * returning multiple unknown parameters.
 		 */
-		if (NULL == *errp)
-			*errp = sctp_make_op_error_fixed(asoc, chunk);
-
-		if (*errp) {
-			if (!sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM,
-					SCTP_PAD4(ntohs(param.p->length))))
-				sctp_addto_chunk_fixed(*errp,
-						SCTP_PAD4(ntohs(param.p->length)),
-						param.v);
-		} else {
-			/* If there is no memory for generating the ERROR
-			 * report as specified, an ABORT will be triggered
-			 * to the peer and the association won't be
-			 * established.
-			 */
-			retval = SCTP_IERROR_NOMEM;
+		if (!*errp) {
+			*errp = sctp_make_op_error_limited(asoc, chunk);
+			if (!*errp) {
+				/* If there is no memory for generating the
+				 * ERROR report as specified, an ABORT will be
+				 * triggered to the peer and the association
+				 * won't be established.
+				 */
+				retval = SCTP_IERROR_NOMEM;
+				break;
+			}
 		}
+
+		if (!sctp_init_cause(*errp, SCTP_ERROR_UNKNOWN_PARAM,
+				     ntohs(param.p->length)))
+			sctp_addto_chunk(*errp, ntohs(param.p->length),
+					 param.v);
 		break;
 	default:
 		break;
@@ -2225,10 +2187,10 @@ static enum sctp_ierror sctp_verify_param(struct net *net,
 		 * MUST be aborted.  The ABORT chunk SHOULD contain the error
 		 * cause 'Protocol Violation'.
 		 */
-		if (SCTP_AUTH_RANDOM_LENGTH !=
-			ntohs(param.p->length) - sizeof(struct sctp_paramhdr)) {
+		if (SCTP_AUTH_RANDOM_LENGTH != ntohs(param.p->length) -
+					       sizeof(struct sctp_paramhdr)) {
 			sctp_process_inv_paramlength(asoc, param.p,
-							chunk, err_chunk);
+						     chunk, err_chunk);
 			retval = SCTP_IERROR_ABORT;
 		}
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From 05cd271fd61a0bb64fc20c46c9c87b8272fb980c Mon Sep 17 00:00:00 2001
From: Paul Blakey <paulb@mellanox.com>
Date: Mon, 30 Apr 2018 14:28:30 +0300
Subject: cls_flower: Support multiple masks per priority

Currently flower doesn't support inserting filters with different masks
on a single priority, even if the actual flows (key + mask) inserted
aren't overlapping, as with the use case of offloading openvswitch
datapath flows. Instead one must go up one level, and assign different
priorities for each mask, which will create a different flower
instances.

This patch opens flower to support more than one mask per priority,
and a single flower instance. It does so by adding another hash table
on top of the existing one which will store the different masks,
and the filters that share it.

The user is left with the responsibility of ensuring non overlapping
flows, otherwise precedence is not guaranteed.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_flower.c | 275 +++++++++++++++++++++++++++++++------------------
 1 file changed, 174 insertions(+), 101 deletions(-)

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index d964e60c730e..eacaaf803914 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -61,16 +61,18 @@ struct fl_flow_mask_range {
 struct fl_flow_mask {
 	struct fl_flow_key key;
 	struct fl_flow_mask_range range;
-	struct rcu_head	rcu;
+	struct rhash_head ht_node;
+	struct rhashtable ht;
+	struct rhashtable_params filter_ht_params;
+	struct flow_dissector dissector;
+	struct list_head filters;
+	struct rcu_head rcu;
+	struct list_head list;
 };
 
 struct cls_fl_head {
 	struct rhashtable ht;
-	struct fl_flow_mask mask;
-	struct flow_dissector dissector;
-	bool mask_assigned;
-	struct list_head filters;
-	struct rhashtable_params ht_params;
+	struct list_head masks;
 	union {
 		struct work_struct work;
 		struct rcu_head	rcu;
@@ -79,6 +81,7 @@ struct cls_fl_head {
 };
 
 struct cls_fl_filter {
+	struct fl_flow_mask *mask;
 	struct rhash_head ht_node;
 	struct fl_flow_key mkey;
 	struct tcf_exts exts;
@@ -94,6 +97,13 @@ struct cls_fl_filter {
 	struct net_device *hw_dev;
 };
 
+static const struct rhashtable_params mask_ht_params = {
+	.key_offset = offsetof(struct fl_flow_mask, key),
+	.key_len = sizeof(struct fl_flow_key),
+	.head_offset = offsetof(struct fl_flow_mask, ht_node),
+	.automatic_shrinking = true,
+};
+
 static unsigned short int fl_mask_range(const struct fl_flow_mask *mask)
 {
 	return mask->range.end - mask->range.start;
@@ -103,13 +113,19 @@ static void fl_mask_update_range(struct fl_flow_mask *mask)
 {
 	const u8 *bytes = (const u8 *) &mask->key;
 	size_t size = sizeof(mask->key);
-	size_t i, first = 0, last = size - 1;
+	size_t i, first = 0, last;
 
-	for (i = 0; i < sizeof(mask->key); i++) {
+	for (i = 0; i < size; i++) {
+		if (bytes[i]) {
+			first = i;
+			break;
+		}
+	}
+	last = first;
+	for (i = size - 1; i != first; i--) {
 		if (bytes[i]) {
-			if (!first && i)
-				first = i;
 			last = i;
+			break;
 		}
 	}
 	mask->range.start = rounddown(first, sizeof(long));
@@ -140,12 +156,11 @@ static void fl_clear_masked_range(struct fl_flow_key *key,
 	memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask));
 }
 
-static struct cls_fl_filter *fl_lookup(struct cls_fl_head *head,
+static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
 				       struct fl_flow_key *mkey)
 {
-	return rhashtable_lookup_fast(&head->ht,
-				      fl_key_get_start(mkey, &head->mask),
-				      head->ht_params);
+	return rhashtable_lookup_fast(&mask->ht, fl_key_get_start(mkey, mask),
+				      mask->filter_ht_params);
 }
 
 static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
@@ -153,28 +168,28 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 {
 	struct cls_fl_head *head = rcu_dereference_bh(tp->root);
 	struct cls_fl_filter *f;
+	struct fl_flow_mask *mask;
 	struct fl_flow_key skb_key;
 	struct fl_flow_key skb_mkey;
 
-	if (!atomic_read(&head->ht.nelems))
-		return -1;
-
-	fl_clear_masked_range(&skb_key, &head->mask);
+	list_for_each_entry_rcu(mask, &head->masks, list) {
+		fl_clear_masked_range(&skb_key, mask);
 
-	skb_key.indev_ifindex = skb->skb_iif;
-	/* skb_flow_dissect() does not set n_proto in case an unknown protocol,
-	 * so do it rather here.
-	 */
-	skb_key.basic.n_proto = skb->protocol;
-	skb_flow_dissect_tunnel_info(skb, &head->dissector, &skb_key);
-	skb_flow_dissect(skb, &head->dissector, &skb_key, 0);
+		skb_key.indev_ifindex = skb->skb_iif;
+		/* skb_flow_dissect() does not set n_proto in case an unknown
+		 * protocol, so do it rather here.
+		 */
+		skb_key.basic.n_proto = skb->protocol;
+		skb_flow_dissect_tunnel_info(skb, &mask->dissector, &skb_key);
+		skb_flow_dissect(skb, &mask->dissector, &skb_key, 0);
 
-	fl_set_masked_key(&skb_mkey, &skb_key, &head->mask);
+		fl_set_masked_key(&skb_mkey, &skb_key, mask);
 
-	f = fl_lookup(head, &skb_mkey);
-	if (f && !tc_skip_sw(f->flags)) {
-		*res = f->res;
-		return tcf_exts_exec(skb, &f->exts, res);
+		f = fl_lookup(mask, &skb_mkey);
+		if (f && !tc_skip_sw(f->flags)) {
+			*res = f->res;
+			return tcf_exts_exec(skb, &f->exts, res);
+		}
 	}
 	return -1;
 }
@@ -187,11 +202,28 @@ static int fl_init(struct tcf_proto *tp)
 	if (!head)
 		return -ENOBUFS;
 
-	INIT_LIST_HEAD_RCU(&head->filters);
+	INIT_LIST_HEAD_RCU(&head->masks);
 	rcu_assign_pointer(tp->root, head);
 	idr_init(&head->handle_idr);
 
-	return 0;
+	return rhashtable_init(&head->ht, &mask_ht_params);
+}
+
+static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask,
+			bool async)
+{
+	if (!list_empty(&mask->filters))
+		return false;
+
+	rhashtable_remove_fast(&head->ht, &mask->ht_node, mask_ht_params);
+	rhashtable_destroy(&mask->ht);
+	list_del_rcu(&mask->list);
+	if (async)
+		kfree_rcu(mask, rcu);
+	else
+		kfree(mask);
+
+	return true;
 }
 
 static void __fl_destroy_filter(struct cls_fl_filter *f)
@@ -234,8 +266,6 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
 }
 
 static int fl_hw_replace_filter(struct tcf_proto *tp,
-				struct flow_dissector *dissector,
-				struct fl_flow_key *mask,
 				struct cls_fl_filter *f,
 				struct netlink_ext_ack *extack)
 {
@@ -247,8 +277,8 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 	tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack);
 	cls_flower.command = TC_CLSFLOWER_REPLACE;
 	cls_flower.cookie = (unsigned long) f;
-	cls_flower.dissector = dissector;
-	cls_flower.mask = mask;
+	cls_flower.dissector = &f->mask->dissector;
+	cls_flower.mask = &f->mask->key;
 	cls_flower.key = &f->mkey;
 	cls_flower.exts = &f->exts;
 	cls_flower.classid = f->res.classid;
@@ -283,28 +313,31 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
 			 &cls_flower, false);
 }
 
-static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
+static bool __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
 			struct netlink_ext_ack *extack)
 {
 	struct cls_fl_head *head = rtnl_dereference(tp->root);
+	bool async = tcf_exts_get_net(&f->exts);
+	bool last;
 
 	idr_remove(&head->handle_idr, f->handle);
 	list_del_rcu(&f->list);
+	last = fl_mask_put(head, f->mask, async);
 	if (!tc_skip_hw(f->flags))
 		fl_hw_destroy_filter(tp, f, extack);
 	tcf_unbind_filter(tp, &f->res);
-	if (tcf_exts_get_net(&f->exts))
+	if (async)
 		call_rcu(&f->rcu, fl_destroy_filter);
 	else
 		__fl_destroy_filter(f);
+
+	return last;
 }
 
 static void fl_destroy_sleepable(struct work_struct *work)
 {
 	struct cls_fl_head *head = container_of(work, struct cls_fl_head,
 						work);
-	if (head->mask_assigned)
-		rhashtable_destroy(&head->ht);
 	kfree(head);
 	module_put(THIS_MODULE);
 }
@@ -320,10 +353,15 @@ static void fl_destroy_rcu(struct rcu_head *rcu)
 static void fl_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 {
 	struct cls_fl_head *head = rtnl_dereference(tp->root);
+	struct fl_flow_mask *mask, *next_mask;
 	struct cls_fl_filter *f, *next;
 
-	list_for_each_entry_safe(f, next, &head->filters, list)
-		__fl_delete(tp, f, extack);
+	list_for_each_entry_safe(mask, next_mask, &head->masks, list) {
+		list_for_each_entry_safe(f, next, &mask->filters, list) {
+			if (__fl_delete(tp, f, extack))
+				break;
+		}
+	}
 	idr_destroy(&head->handle_idr);
 
 	__module_get(THIS_MODULE);
@@ -715,14 +753,14 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
 	return ret;
 }
 
-static bool fl_mask_eq(struct fl_flow_mask *mask1,
-		       struct fl_flow_mask *mask2)
+static void fl_mask_copy(struct fl_flow_mask *dst,
+			 struct fl_flow_mask *src)
 {
-	const long *lmask1 = fl_key_get_start(&mask1->key, mask1);
-	const long *lmask2 = fl_key_get_start(&mask2->key, mask2);
+	const void *psrc = fl_key_get_start(&src->key, src);
+	void *pdst = fl_key_get_start(&dst->key, src);
 
-	return !memcmp(&mask1->range, &mask2->range, sizeof(mask1->range)) &&
-	       !memcmp(lmask1, lmask2, fl_mask_range(mask1));
+	memcpy(pdst, psrc, fl_mask_range(src));
+	dst->range = src->range;
 }
 
 static const struct rhashtable_params fl_ht_params = {
@@ -731,14 +769,13 @@ static const struct rhashtable_params fl_ht_params = {
 	.automatic_shrinking = true,
 };
 
-static int fl_init_hashtable(struct cls_fl_head *head,
-			     struct fl_flow_mask *mask)
+static int fl_init_mask_hashtable(struct fl_flow_mask *mask)
 {
-	head->ht_params = fl_ht_params;
-	head->ht_params.key_len = fl_mask_range(mask);
-	head->ht_params.key_offset += mask->range.start;
+	mask->filter_ht_params = fl_ht_params;
+	mask->filter_ht_params.key_len = fl_mask_range(mask);
+	mask->filter_ht_params.key_offset += mask->range.start;
 
-	return rhashtable_init(&head->ht, &head->ht_params);
+	return rhashtable_init(&mask->ht, &mask->filter_ht_params);
 }
 
 #define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member)
@@ -761,8 +798,7 @@ static int fl_init_hashtable(struct cls_fl_head *head,
 			FL_KEY_SET(keys, cnt, id, member);			\
 	} while(0);
 
-static void fl_init_dissector(struct cls_fl_head *head,
-			      struct fl_flow_mask *mask)
+static void fl_init_dissector(struct fl_flow_mask *mask)
 {
 	struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX];
 	size_t cnt = 0;
@@ -802,31 +838,66 @@ static void fl_init_dissector(struct cls_fl_head *head,
 	FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
 			     FLOW_DISSECTOR_KEY_ENC_PORTS, enc_tp);
 
-	skb_flow_dissector_init(&head->dissector, keys, cnt);
+	skb_flow_dissector_init(&mask->dissector, keys, cnt);
+}
+
+static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head,
+					       struct fl_flow_mask *mask)
+{
+	struct fl_flow_mask *newmask;
+	int err;
+
+	newmask = kzalloc(sizeof(*newmask), GFP_KERNEL);
+	if (!newmask)
+		return ERR_PTR(-ENOMEM);
+
+	fl_mask_copy(newmask, mask);
+
+	err = fl_init_mask_hashtable(newmask);
+	if (err)
+		goto errout_free;
+
+	fl_init_dissector(newmask);
+
+	INIT_LIST_HEAD_RCU(&newmask->filters);
+
+	err = rhashtable_insert_fast(&head->ht, &newmask->ht_node,
+				     mask_ht_params);
+	if (err)
+		goto errout_destroy;
+
+	list_add_tail_rcu(&newmask->list, &head->masks);
+
+	return newmask;
+
+errout_destroy:
+	rhashtable_destroy(&newmask->ht);
+errout_free:
+	kfree(newmask);
+
+	return ERR_PTR(err);
 }
 
 static int fl_check_assign_mask(struct cls_fl_head *head,
+				struct cls_fl_filter *fnew,
+				struct cls_fl_filter *fold,
 				struct fl_flow_mask *mask)
 {
-	int err;
+	struct fl_flow_mask *newmask;
 
-	if (head->mask_assigned) {
-		if (!fl_mask_eq(&head->mask, mask))
+	fnew->mask = rhashtable_lookup_fast(&head->ht, mask, mask_ht_params);
+	if (!fnew->mask) {
+		if (fold)
 			return -EINVAL;
-		else
-			return 0;
-	}
 
-	/* Mask is not assigned yet. So assign it and init hashtable
-	 * according to that.
-	 */
-	err = fl_init_hashtable(head, mask);
-	if (err)
-		return err;
-	memcpy(&head->mask, mask, sizeof(head->mask));
-	head->mask_assigned = true;
+		newmask = fl_create_new_mask(head, mask);
+		if (IS_ERR(newmask))
+			return PTR_ERR(newmask);
 
-	fl_init_dissector(head, mask);
+		fnew->mask = newmask;
+	} else if (fold && fold->mask == fnew->mask) {
+		return -EINVAL;
+	}
 
 	return 0;
 }
@@ -924,30 +995,26 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 	if (err)
 		goto errout_idr;
 
-	err = fl_check_assign_mask(head, &mask);
+	err = fl_check_assign_mask(head, fnew, fold, &mask);
 	if (err)
 		goto errout_idr;
 
 	if (!tc_skip_sw(fnew->flags)) {
-		if (!fold && fl_lookup(head, &fnew->mkey)) {
+		if (!fold && fl_lookup(fnew->mask, &fnew->mkey)) {
 			err = -EEXIST;
-			goto errout_idr;
+			goto errout_mask;
 		}
 
-		err = rhashtable_insert_fast(&head->ht, &fnew->ht_node,
-					     head->ht_params);
+		err = rhashtable_insert_fast(&fnew->mask->ht, &fnew->ht_node,
+					     fnew->mask->filter_ht_params);
 		if (err)
-			goto errout_idr;
+			goto errout_mask;
 	}
 
 	if (!tc_skip_hw(fnew->flags)) {
-		err = fl_hw_replace_filter(tp,
-					   &head->dissector,
-					   &mask.key,
-					   fnew,
-					   extack);
+		err = fl_hw_replace_filter(tp, fnew, extack);
 		if (err)
-			goto errout_idr;
+			goto errout_mask;
 	}
 
 	if (!tc_in_hw(fnew->flags))
@@ -955,8 +1022,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 
 	if (fold) {
 		if (!tc_skip_sw(fold->flags))
-			rhashtable_remove_fast(&head->ht, &fold->ht_node,
-					       head->ht_params);
+			rhashtable_remove_fast(&fold->mask->ht,
+					       &fold->ht_node,
+					       fold->mask->filter_ht_params);
 		if (!tc_skip_hw(fold->flags))
 			fl_hw_destroy_filter(tp, fold, NULL);
 	}
@@ -970,12 +1038,15 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 		tcf_exts_get_net(&fold->exts);
 		call_rcu(&fold->rcu, fl_destroy_filter);
 	} else {
-		list_add_tail_rcu(&fnew->list, &head->filters);
+		list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
 	}
 
 	kfree(tb);
 	return 0;
 
+errout_mask:
+	fl_mask_put(head, fnew->mask, false);
+
 errout_idr:
 	if (fnew->handle)
 		idr_remove(&head->handle_idr, fnew->handle);
@@ -994,10 +1065,10 @@ static int fl_delete(struct tcf_proto *tp, void *arg, bool *last,
 	struct cls_fl_filter *f = arg;
 
 	if (!tc_skip_sw(f->flags))
-		rhashtable_remove_fast(&head->ht, &f->ht_node,
-				       head->ht_params);
+		rhashtable_remove_fast(&f->mask->ht, &f->ht_node,
+				       f->mask->filter_ht_params);
 	__fl_delete(tp, f, extack);
-	*last = list_empty(&head->filters);
+	*last = list_empty(&head->masks);
 	return 0;
 }
 
@@ -1005,16 +1076,19 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg)
 {
 	struct cls_fl_head *head = rtnl_dereference(tp->root);
 	struct cls_fl_filter *f;
-
-	list_for_each_entry_rcu(f, &head->filters, list) {
-		if (arg->count < arg->skip)
-			goto skip;
-		if (arg->fn(tp, f, arg) < 0) {
-			arg->stop = 1;
-			break;
-		}
+	struct fl_flow_mask *mask;
+
+	list_for_each_entry_rcu(mask, &head->masks, list) {
+		list_for_each_entry_rcu(f, &mask->filters, list) {
+			if (arg->count < arg->skip)
+				goto skip;
+			if (arg->fn(tp, f, arg) < 0) {
+				arg->stop = 1;
+				break;
+			}
 skip:
-		arg->count++;
+			arg->count++;
+		}
 	}
 }
 
@@ -1150,7 +1224,6 @@ static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask)
 static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
 		   struct sk_buff *skb, struct tcmsg *t)
 {
-	struct cls_fl_head *head = rtnl_dereference(tp->root);
 	struct cls_fl_filter *f = fh;
 	struct nlattr *nest;
 	struct fl_flow_key *key, *mask;
@@ -1169,7 +1242,7 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
 		goto nla_put_failure;
 
 	key = &f->key;
-	mask = &head->mask.key;
+	mask = &f->mask->key;
 
 	if (mask->indev_ifindex) {
 		struct net_device *dev;
-- 
cgit v1.2.3-59-g8ed1b


From a8c744a8b43733509b5625e65fee1985fbb901d7 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Mon, 30 Apr 2018 15:58:36 -0400
Subject: udp: disable gso with no_check_tx

Syzbot managed to send a udp gso packet without checksum offload into
the gso stack by disabling tx checksum (UDP_NO_CHECK6_TX). This
triggered the skb_warn_bad_offload.

  RIP: 0010:skb_warn_bad_offload+0x2bc/0x600 net/core/dev.c:2658
   skb_gso_segment include/linux/netdevice.h:4038 [inline]
   validate_xmit_skb+0x54d/0xd90 net/core/dev.c:3120
   __dev_queue_xmit+0xbf8/0x34c0 net/core/dev.c:3577
   dev_queue_xmit+0x17/0x20 net/core/dev.c:3618

UDP_NO_CHECK6_TX sets skb->ip_summed to CHECKSUM_NONE just after the
udp gso integrity checks in udp_(v6_)send_skb. Extend those checks to
catch and fail in this case.

After the integrity checks jump directly to the CHECKSUM_PARTIAL case
to avoid reading the no_check_tx flags again (a TOCTTOU race).

Fixes: bec1f6f69736 ("udp: generate gso with UDP_SEGMENT")
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 4 ++++
 net/ipv6/udp.c | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 794aeafeb782..dd3102a37ef9 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -786,11 +786,14 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
 			return -EINVAL;
 		if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS)
 			return -EINVAL;
+		if (sk->sk_no_check_tx)
+			return -EINVAL;
 		if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite)
 			return -EIO;
 
 		skb_shinfo(skb)->gso_size = cork->gso_size;
 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+		goto csum_partial;
 	}
 
 	if (is_udplite)  				 /*     UDP-Lite      */
@@ -802,6 +805,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
 		goto send;
 
 	} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
+csum_partial:
 
 		udp4_hwcsum(skb, fl4->saddr, fl4->daddr);
 		goto send;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 6acfdd3e442b..a34e28ac03a7 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1051,11 +1051,14 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
 			return -EINVAL;
 		if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS)
 			return -EINVAL;
+		if (udp_sk(sk)->no_check6_tx)
+			return -EINVAL;
 		if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite)
 			return -EIO;
 
 		skb_shinfo(skb)->gso_size = cork->gso_size;
 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+		goto csum_partial;
 	}
 
 	if (is_udplite)
@@ -1064,6 +1067,7 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
 		skb->ip_summed = CHECKSUM_NONE;
 		goto send;
 	} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
+csum_partial:
 		udp6_hwcsum_outgoing(sk, skb, &fl6->saddr, &fl6->daddr, len);
 		goto send;
 	} else
-- 
cgit v1.2.3-59-g8ed1b


From e283de3a4fa885aed11525129fd4570f92c1d1a9 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Mon, 30 Apr 2018 14:20:05 -0700
Subject: net: core: Inline netdev_features_size_check()

We do not require this inline function to be used in multiple different
locations, just inline it where it gets used in register_netdevice().

Suggested-by: David Miller <davem@davemloft.net>
Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 6 ------
 net/core/dev.c            | 3 ++-
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a569bf5524fa..46dcb5f7522f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4132,12 +4132,6 @@ const char *netdev_drivername(const struct net_device *dev);
 
 void linkwatch_run_queue(void);
 
-static inline void netdev_features_size_check(void)
-{
-	BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
-		     NETDEV_FEATURE_COUNT);
-}
-
 static inline netdev_features_t netdev_intersect_features(netdev_features_t f1,
 							  netdev_features_t f2)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index 8944e1e0059d..bb81a6e1d354 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7883,7 +7883,8 @@ int register_netdevice(struct net_device *dev)
 	int ret;
 	struct net *net = dev_net(dev);
 
-	netdev_features_size_check();
+	BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
+		     NETDEV_FEATURE_COUNT);
 	BUG_ON(dev_boot_phase);
 	ASSERT_RTNL();
 
-- 
cgit v1.2.3-59-g8ed1b


From b086ff87251b4a4c147bc3af20369514e9d0d9ad Mon Sep 17 00:00:00 2001
From: Stefan Strogin <sstrogin@cisco.com>
Date: Tue, 1 May 2018 01:04:29 +0300
Subject: connector: add parent pid and tgid to coredump and exit events

The intention is to get notified of process failures as soon
as possible, before a possible core dumping (which could be very long)
(e.g. in some process-manager). Coredump and exit process events
are perfect for such use cases (see 2b5faa4c553f "connector: Added
coredumping event to the process connector").

The problem is that for now the process-manager cannot know the parent
of a dying process using connectors. This could be useful if the
process-manager should monitor for failures only children of certain
parents, so we could filter the coredump and exit events by parent
process and/or thread ID.

Add parent pid and tgid to coredump and exit process connectors event
data.

Signed-off-by: Stefan Strogin <sstrogin@cisco.com>
Acked-by: Evgeniy Polyakov <zbr@ioremap.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/connector/cn_proc.c  | 4 ++++
 include/uapi/linux/cn_proc.h | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c
index a782ce87715c..ed5e42461094 100644
--- a/drivers/connector/cn_proc.c
+++ b/drivers/connector/cn_proc.c
@@ -262,6 +262,8 @@ void proc_coredump_connector(struct task_struct *task)
 	ev->what = PROC_EVENT_COREDUMP;
 	ev->event_data.coredump.process_pid = task->pid;
 	ev->event_data.coredump.process_tgid = task->tgid;
+	ev->event_data.coredump.parent_pid = task->real_parent->pid;
+	ev->event_data.coredump.parent_tgid = task->real_parent->tgid;
 
 	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
 	msg->ack = 0; /* not used */
@@ -288,6 +290,8 @@ void proc_exit_connector(struct task_struct *task)
 	ev->event_data.exit.process_tgid = task->tgid;
 	ev->event_data.exit.exit_code = task->exit_code;
 	ev->event_data.exit.exit_signal = task->exit_signal;
+	ev->event_data.exit.parent_pid = task->real_parent->pid;
+	ev->event_data.exit.parent_tgid = task->real_parent->tgid;
 
 	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
 	msg->ack = 0; /* not used */
diff --git a/include/uapi/linux/cn_proc.h b/include/uapi/linux/cn_proc.h
index 68ff25414700..db210625cee8 100644
--- a/include/uapi/linux/cn_proc.h
+++ b/include/uapi/linux/cn_proc.h
@@ -116,12 +116,16 @@ struct proc_event {
 		struct coredump_proc_event {
 			__kernel_pid_t process_pid;
 			__kernel_pid_t process_tgid;
+			__kernel_pid_t parent_pid;
+			__kernel_pid_t parent_tgid;
 		} coredump;
 
 		struct exit_proc_event {
 			__kernel_pid_t process_pid;
 			__kernel_pid_t process_tgid;
 			__u32 exit_code, exit_signal;
+			__kernel_pid_t parent_pid;
+			__kernel_pid_t parent_tgid;
 		} exit;
 
 	} event_data;
-- 
cgit v1.2.3-59-g8ed1b


From 206703289a5712a5645558ac7ae24bd29ca69178 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Tue, 1 May 2018 19:55:57 +0100
Subject: net: hns3: Remove error log when getting pfc stats fails

When mac supports DCB, but is in GE mode, it does not support
querying pfc stats, firmware returns error when trying to
query the pfc stats. this creates a lot of noise in the kernel
log when it prints the error log.

This patch fixes it by removing the error log, because it already
return the error to the user space, so the user should be aware of
the error.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
index 885f25cd7be4..c69ecab460f9 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
@@ -134,11 +134,8 @@ static int hclge_pfc_stats_get(struct hclge_dev *hdev,
 	}
 
 	ret = hclge_cmd_send(&hdev->hw, desc, HCLGE_TM_PFC_PKT_GET_CMD_NUM);
-	if (ret) {
-		dev_err(&hdev->pdev->dev,
-			"Get pfc pause stats fail, ret = %d.\n", ret);
+	if (ret)
 		return ret;
-	}
 
 	for (i = 0; i < HCLGE_TM_PFC_PKT_GET_CMD_NUM; i++) {
 		struct hclge_pfc_stats_cmd *pfc_stats =
-- 
cgit v1.2.3-59-g8ed1b


From 35f58fd792d7fb15f5c036c7b191b0f44d644d75 Mon Sep 17 00:00:00 2001
From: Huazhong Tan <tanhuazhong@huawei.com>
Date: Tue, 1 May 2018 19:55:58 +0100
Subject: net: hns3: fix to correctly fetch l4 protocol outer header

This patch fixes the function being used to fetch L4
protocol outer header. Mistakenly skb_inner_transport_header
API was being used earlier.

Fixes: 76ad4f0ee747 ("net: hns3: Add support of HNS3 Ethernet Driver for hip08 SoC")
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 8c55965a66ac..c4e295094da7 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -502,7 +502,7 @@ static int hns3_get_l4_protocol(struct sk_buff *skb, u8 *ol4_proto,
 
 	/* find outer header point */
 	l3.hdr = skb_network_header(skb);
-	l4_hdr = skb_inner_transport_header(skb);
+	l4_hdr = skb_transport_header(skb);
 
 	if (skb->protocol == htons(ETH_P_IPV6)) {
 		exthdr = l3.hdr + sizeof(*l3.v6);
-- 
cgit v1.2.3-59-g8ed1b


From 38e62046d4c95272e2fb001d2d72baf48fa090e9 Mon Sep 17 00:00:00 2001
From: Huazhong Tan <tanhuazhong@huawei.com>
Date: Tue, 1 May 2018 19:55:59 +0100
Subject: net: hns3: Fixes the out of bounds access in hclge_map_tqp

This patch fixes the handling of the check when number of vports
are detected to be more than available TPQs. Current handling causes
an out of bounds access in hclge_map_tqp().

Fixes: 7df7dad633e2 ("net: hns3: Refactor the mapping of tqp to vport")
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 2066dd734444..c9e80ca6d98a 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -1459,8 +1459,11 @@ static int hclge_alloc_vport(struct hclge_dev *hdev)
 	/* We need to alloc a vport for main NIC of PF */
 	num_vport = hdev->num_vmdq_vport + hdev->num_req_vfs + 1;
 
-	if (hdev->num_tqps < num_vport)
-		num_vport = hdev->num_tqps;
+	if (hdev->num_tqps < num_vport) {
+		dev_err(&hdev->pdev->dev, "tqps(%d) is less than vports(%d)",
+			hdev->num_tqps, num_vport);
+		return -EINVAL;
+	}
 
 	/* Alloc the same number of TQPs for every vport */
 	tqp_per_vport = hdev->num_tqps / num_vport;
-- 
cgit v1.2.3-59-g8ed1b


From ffd5656e182b94148998b48f748112450c9ff2b0 Mon Sep 17 00:00:00 2001
From: Huazhong Tan <tanhuazhong@huawei.com>
Date: Tue, 1 May 2018 19:56:00 +0100
Subject: net: hns3: Fixes the error legs in hclge_init_ae_dev function

This patch fixes some of the missed error legs in the initialization
function of the ae device. This might cause leaks in case of failure.

Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer
Support")
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    | 55 +++++++++++++---------
 1 file changed, 34 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index c9e80ca6d98a..b5e0c58fe0c2 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -5430,7 +5430,7 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
 	hdev = devm_kzalloc(&pdev->dev, sizeof(*hdev), GFP_KERNEL);
 	if (!hdev) {
 		ret = -ENOMEM;
-		goto err_hclge_dev;
+		goto out;
 	}
 
 	hdev->pdev = pdev;
@@ -5443,38 +5443,38 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
 	ret = hclge_pci_init(hdev);
 	if (ret) {
 		dev_err(&pdev->dev, "PCI init failed\n");
-		goto err_pci_init;
+		goto out;
 	}
 
 	/* Firmware command queue initialize */
 	ret = hclge_cmd_queue_init(hdev);
 	if (ret) {
 		dev_err(&pdev->dev, "Cmd queue init failed, ret = %d.\n", ret);
-		return ret;
+		goto err_pci_uninit;
 	}
 
 	/* Firmware command initialize */
 	ret = hclge_cmd_init(hdev);
 	if (ret)
-		goto err_cmd_init;
+		goto err_cmd_uninit;
 
 	ret = hclge_get_cap(hdev);
 	if (ret) {
 		dev_err(&pdev->dev, "get hw capability error, ret = %d.\n",
 			ret);
-		return ret;
+		goto err_cmd_uninit;
 	}
 
 	ret = hclge_configure(hdev);
 	if (ret) {
 		dev_err(&pdev->dev, "Configure dev error, ret = %d.\n", ret);
-		return ret;
+		goto err_cmd_uninit;
 	}
 
 	ret = hclge_init_msi(hdev);
 	if (ret) {
 		dev_err(&pdev->dev, "Init MSI/MSI-X error, ret = %d.\n", ret);
-		return ret;
+		goto err_cmd_uninit;
 	}
 
 	ret = hclge_misc_irq_init(hdev);
@@ -5482,69 +5482,69 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
 		dev_err(&pdev->dev,
 			"Misc IRQ(vector0) init error, ret = %d.\n",
 			ret);
-		return ret;
+		goto err_msi_uninit;
 	}
 
 	ret = hclge_alloc_tqps(hdev);
 	if (ret) {
 		dev_err(&pdev->dev, "Allocate TQPs error, ret = %d.\n", ret);
-		return ret;
+		goto err_msi_irq_uninit;
 	}
 
 	ret = hclge_alloc_vport(hdev);
 	if (ret) {
 		dev_err(&pdev->dev, "Allocate vport error, ret = %d.\n", ret);
-		return ret;
+		goto err_msi_irq_uninit;
 	}
 
 	ret = hclge_map_tqp(hdev);
 	if (ret) {
 		dev_err(&pdev->dev, "Map tqp error, ret = %d.\n", ret);
-		return ret;
+		goto err_sriov_disable;
 	}
 
 	ret = hclge_mac_mdio_config(hdev);
 	if (ret) {
 		dev_warn(&hdev->pdev->dev,
 			 "mdio config fail ret=%d\n", ret);
-		return ret;
+		goto err_sriov_disable;
 	}
 
 	ret = hclge_mac_init(hdev);
 	if (ret) {
 		dev_err(&pdev->dev, "Mac init error, ret = %d\n", ret);
-		return ret;
+		goto err_mdiobus_unreg;
 	}
 
 	ret = hclge_config_tso(hdev, HCLGE_TSO_MSS_MIN, HCLGE_TSO_MSS_MAX);
 	if (ret) {
 		dev_err(&pdev->dev, "Enable tso fail, ret =%d\n", ret);
-		return ret;
+		goto err_mdiobus_unreg;
 	}
 
 	ret = hclge_init_vlan_config(hdev);
 	if (ret) {
 		dev_err(&pdev->dev, "VLAN init fail, ret =%d\n", ret);
-		return  ret;
+		goto err_mdiobus_unreg;
 	}
 
 	ret = hclge_tm_schd_init(hdev);
 	if (ret) {
 		dev_err(&pdev->dev, "tm schd init fail, ret =%d\n", ret);
-		return ret;
+		goto err_mdiobus_unreg;
 	}
 
 	hclge_rss_init_cfg(hdev);
 	ret = hclge_rss_init_hw(hdev);
 	if (ret) {
 		dev_err(&pdev->dev, "Rss init fail, ret =%d\n", ret);
-		return ret;
+		goto err_mdiobus_unreg;
 	}
 
 	ret = init_mgr_tbl(hdev);
 	if (ret) {
 		dev_err(&pdev->dev, "manager table init fail, ret =%d\n", ret);
-		return ret;
+		goto err_mdiobus_unreg;
 	}
 
 	hclge_dcb_ops_set(hdev);
@@ -5567,11 +5567,24 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
 	pr_info("%s driver initialization finished.\n", HCLGE_DRIVER_NAME);
 	return 0;
 
-err_cmd_init:
+err_mdiobus_unreg:
+	if (hdev->hw.mac.phydev)
+		mdiobus_unregister(hdev->hw.mac.mdio_bus);
+err_sriov_disable:
+	if (IS_ENABLED(CONFIG_PCI_IOV))
+		hclge_disable_sriov(hdev);
+err_msi_irq_uninit:
+	hclge_misc_irq_uninit(hdev);
+err_msi_uninit:
+	pci_free_irq_vectors(pdev);
+err_cmd_uninit:
+	hclge_destroy_cmd_queue(&hdev->hw);
+err_pci_uninit:
+	pci_clear_master(pdev);
 	pci_release_regions(pdev);
-err_pci_init:
+	pci_disable_device(pdev);
 	pci_set_drvdata(pdev, NULL);
-err_hclge_dev:
+out:
 	return ret;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From c5ef83cbb1e9fa17a3b78fb2deb25ae8beebbf48 Mon Sep 17 00:00:00 2001
From: Huazhong Tan <tanhuazhong@huawei.com>
Date: Tue, 1 May 2018 19:56:01 +0100
Subject: net: hns3: fix for phy_addr error in hclge_mac_mdio_config

When phy exists, phy_addr must less than PHY_MAX_ADDR.
If not, hclge_mac_mdio_config should return error.
And for fiber(phy_addr=0xff), it does not need hclge_mac_mdio_config.

Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 12 +++++++-----
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c |  7 +++++--
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index b5e0c58fe0c2..cc09713b89ad 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -5503,11 +5503,13 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
 		goto err_sriov_disable;
 	}
 
-	ret = hclge_mac_mdio_config(hdev);
-	if (ret) {
-		dev_warn(&hdev->pdev->dev,
-			 "mdio config fail ret=%d\n", ret);
-		goto err_sriov_disable;
+	if (hdev->hw.mac.media_type == HNAE3_MEDIA_TYPE_COPPER) {
+		ret = hclge_mac_mdio_config(hdev);
+		if (ret) {
+			dev_err(&hdev->pdev->dev,
+				"mdio config fail ret=%d\n", ret);
+			goto err_sriov_disable;
+		}
 	}
 
 	ret = hclge_mac_init(hdev);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c
index 682c2d6618e7..9f7932e423b5 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c
@@ -140,8 +140,11 @@ int hclge_mac_mdio_config(struct hclge_dev *hdev)
 	struct mii_bus *mdio_bus;
 	int ret;
 
-	if (hdev->hw.mac.phy_addr >= PHY_MAX_ADDR)
-		return 0;
+	if (hdev->hw.mac.phy_addr >= PHY_MAX_ADDR) {
+		dev_err(&hdev->pdev->dev, "phy_addr(%d) is too large.\n",
+			hdev->hw.mac.phy_addr);
+		return -EINVAL;
+	}
 
 	mdio_bus = devm_mdiobus_alloc(&hdev->pdev->dev);
 	if (!mdio_bus)
-- 
cgit v1.2.3-59-g8ed1b


From 0c963e8c20ce368ddedc22cbfa710315d715dfae Mon Sep 17 00:00:00 2001
From: Fuyun Liang <liangfuyun1@huawei.com>
Date: Tue, 1 May 2018 19:56:02 +0100
Subject: net: hns3: Fix to support autoneg only for port attached with phy

This patch adds a check to support autoneg(ethtool -A) only when PHY
is attached with the port.

Fixes: e2cb1dec9779 ("net: hns3: Add HNS3 VF HCL(Hardware Compatibility
Layer) Support")
Signed-off-by: Fuyun Liang <liangfuyun1@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index cc09713b89ad..a4e999180117 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -5169,12 +5169,6 @@ static int hclge_set_pauseparam(struct hnae3_handle *handle, u32 auto_neg,
 	struct phy_device *phydev = hdev->hw.mac.phydev;
 	u32 fc_autoneg;
 
-	/* Only support flow control negotiation for netdev with
-	 * phy attached for now.
-	 */
-	if (!phydev)
-		return -EOPNOTSUPP;
-
 	fc_autoneg = hclge_get_autoneg(handle);
 	if (auto_neg != fc_autoneg) {
 		dev_info(&hdev->pdev->dev,
@@ -5193,6 +5187,12 @@ static int hclge_set_pauseparam(struct hnae3_handle *handle, u32 auto_neg,
 	if (!fc_autoneg)
 		return hclge_cfg_pauseparam(hdev, rx_en, tx_en);
 
+	/* Only support flow control negotiation for netdev with
+	 * phy attached for now.
+	 */
+	if (!phydev)
+		return -EOPNOTSUPP;
+
 	return phy_start_aneg(phydev);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 3ff504908f95093cc013a01d1cafb1f7fda0ab34 Mon Sep 17 00:00:00 2001
From: Huazhong Tan <tanhuazhong@huawei.com>
Date: Tue, 1 May 2018 19:56:03 +0100
Subject: net: hns3: fix a dead loop in hclge_cmd_csq_clean

If head has invlid value then a dead loop can be triggered in
hclge_cmd_csq_clean. This patch adds sanity check for this case.

Fixes: 68c0a5c70614 ("net: hns3: Add HNS3 IMP(Integrated Mgmt Proc) Cmd
Interface Support")
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c    | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
index ff13d1876d9e..fab70683bbf7 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
@@ -31,6 +31,17 @@ static int hclge_ring_space(struct hclge_cmq_ring *ring)
 	return ring->desc_num - used - 1;
 }
 
+static int is_valid_csq_clean_head(struct hclge_cmq_ring *ring, int h)
+{
+	int u = ring->next_to_use;
+	int c = ring->next_to_clean;
+
+	if (unlikely(h >= ring->desc_num))
+		return 0;
+
+	return u > c ? (h > c && h <= u) : (h > c || h <= u);
+}
+
 static int hclge_alloc_cmd_desc(struct hclge_cmq_ring *ring)
 {
 	int size  = ring->desc_num * sizeof(struct hclge_desc);
@@ -141,6 +152,7 @@ static void hclge_cmd_init_regs(struct hclge_hw *hw)
 
 static int hclge_cmd_csq_clean(struct hclge_hw *hw)
 {
+	struct hclge_dev *hdev = (struct hclge_dev *)hw->back;
 	struct hclge_cmq_ring *csq = &hw->cmq.csq;
 	u16 ntc = csq->next_to_clean;
 	struct hclge_desc *desc;
@@ -149,6 +161,13 @@ static int hclge_cmd_csq_clean(struct hclge_hw *hw)
 
 	desc = &csq->desc[ntc];
 	head = hclge_read_dev(hw, HCLGE_NIC_CSQ_HEAD_REG);
+	rmb(); /* Make sure head is ready before touch any data */
+
+	if (!is_valid_csq_clean_head(csq, head)) {
+		dev_warn(&hdev->pdev->dev, "wrong head (%d, %d-%d)\n", head,
+			   csq->next_to_use, csq->next_to_clean);
+		return 0;
+	}
 
 	while (head != ntc) {
 		memset(desc, 0, sizeof(*desc));
-- 
cgit v1.2.3-59-g8ed1b


From dc8131d846d45dabc39dadac32407b321960791f Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Tue, 1 May 2018 19:56:04 +0100
Subject: net: hns3: Fix for packet loss due wrong filter config in VLAN tbls

There are two level of vlan tables in hardware, one is port vlan
which is shared by all functions, the other one is function
vlan table, each function has it's own function vlan table.
Currently, PF sets the port vlan table, and vf sets the function
vlan table, which will cause packet lost problem.

This patch fixes this problem by setting both vlan table, and
use hdev->vlan_table to manage thet port vlan table.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    | 79 +++++++++++++++++-----
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h    |  8 ++-
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c |  7 +-
 3 files changed, 70 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index a4e999180117..77d9e4c02534 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -4543,8 +4543,9 @@ static void hclge_enable_vlan_filter(struct hnae3_handle *handle, bool enable)
 	hclge_set_vlan_filter_ctrl(hdev, HCLGE_FILTER_TYPE_VF, enable);
 }
 
-int hclge_set_vf_vlan_common(struct hclge_dev *hdev, int vfid,
-			     bool is_kill, u16 vlan, u8 qos, __be16 proto)
+static int hclge_set_vf_vlan_common(struct hclge_dev *hdev, int vfid,
+				    bool is_kill, u16 vlan, u8 qos,
+				    __be16 proto)
 {
 #define HCLGE_MAX_VF_BYTES  16
 	struct hclge_vlan_filter_vf_cfg_cmd *req0;
@@ -4602,12 +4603,9 @@ int hclge_set_vf_vlan_common(struct hclge_dev *hdev, int vfid,
 	return -EIO;
 }
 
-static int hclge_set_port_vlan_filter(struct hnae3_handle *handle,
-				      __be16 proto, u16 vlan_id,
-				      bool is_kill)
+static int hclge_set_port_vlan_filter(struct hclge_dev *hdev, __be16 proto,
+				      u16 vlan_id, bool is_kill)
 {
-	struct hclge_vport *vport = hclge_get_vport(handle);
-	struct hclge_dev *hdev = vport->back;
 	struct hclge_vlan_filter_pf_cfg_cmd *req;
 	struct hclge_desc desc;
 	u8 vlan_offset_byte_val;
@@ -4627,22 +4625,66 @@ static int hclge_set_port_vlan_filter(struct hnae3_handle *handle,
 	req->vlan_offset_bitmap[vlan_offset_byte] = vlan_offset_byte_val;
 
 	ret = hclge_cmd_send(&hdev->hw, &desc, 1);
+	if (ret)
+		dev_err(&hdev->pdev->dev,
+			"port vlan command, send fail, ret =%d.\n", ret);
+	return ret;
+}
+
+static int hclge_set_vlan_filter_hw(struct hclge_dev *hdev, __be16 proto,
+				    u16 vport_id, u16 vlan_id, u8 qos,
+				    bool is_kill)
+{
+	u16 vport_idx, vport_num = 0;
+	int ret;
+
+	ret = hclge_set_vf_vlan_common(hdev, vport_id, is_kill, vlan_id,
+				       0, proto);
 	if (ret) {
 		dev_err(&hdev->pdev->dev,
-			"port vlan command, send fail, ret =%d.\n",
-			ret);
+			"Set %d vport vlan filter config fail, ret =%d.\n",
+			vport_id, ret);
 		return ret;
 	}
 
-	ret = hclge_set_vf_vlan_common(hdev, 0, is_kill, vlan_id, 0, proto);
-	if (ret) {
+	/* vlan 0 may be added twice when 8021q module is enabled */
+	if (!is_kill && !vlan_id &&
+	    test_bit(vport_id, hdev->vlan_table[vlan_id]))
+		return 0;
+
+	if (!is_kill && test_and_set_bit(vport_id, hdev->vlan_table[vlan_id])) {
 		dev_err(&hdev->pdev->dev,
-			"Set pf vlan filter config fail, ret =%d.\n",
-			ret);
-		return -EIO;
+			"Add port vlan failed, vport %d is already in vlan %d\n",
+			vport_id, vlan_id);
+		return -EINVAL;
 	}
 
-	return 0;
+	if (is_kill &&
+	    !test_and_clear_bit(vport_id, hdev->vlan_table[vlan_id])) {
+		dev_err(&hdev->pdev->dev,
+			"Delete port vlan failed, vport %d is not in vlan %d\n",
+			vport_id, vlan_id);
+		return -EINVAL;
+	}
+
+	for_each_set_bit(vport_idx, hdev->vlan_table[vlan_id], VLAN_N_VID)
+		vport_num++;
+
+	if ((is_kill && vport_num == 0) || (!is_kill && vport_num == 1))
+		ret = hclge_set_port_vlan_filter(hdev, proto, vlan_id,
+						 is_kill);
+
+	return ret;
+}
+
+int hclge_set_vlan_filter(struct hnae3_handle *handle, __be16 proto,
+			  u16 vlan_id, bool is_kill)
+{
+	struct hclge_vport *vport = hclge_get_vport(handle);
+	struct hclge_dev *hdev = vport->back;
+
+	return hclge_set_vlan_filter_hw(hdev, proto, vport->vport_id, vlan_id,
+					0, is_kill);
 }
 
 static int hclge_set_vf_vlan_filter(struct hnae3_handle *handle, int vfid,
@@ -4656,7 +4698,7 @@ static int hclge_set_vf_vlan_filter(struct hnae3_handle *handle, int vfid,
 	if (proto != htons(ETH_P_8021Q))
 		return -EPROTONOSUPPORT;
 
-	return hclge_set_vf_vlan_common(hdev, vfid, false, vlan, qos, proto);
+	return hclge_set_vlan_filter_hw(hdev, proto, vfid, vlan, qos, false);
 }
 
 static int hclge_set_vlan_tx_offload_cfg(struct hclge_vport *vport)
@@ -4821,7 +4863,7 @@ static int hclge_init_vlan_config(struct hclge_dev *hdev)
 	}
 
 	handle = &hdev->vport[0].nic;
-	return hclge_set_port_vlan_filter(handle, htons(ETH_P_8021Q), 0, false);
+	return hclge_set_vlan_filter(handle, htons(ETH_P_8021Q), 0, false);
 }
 
 static int hclge_en_hw_strip_rxvtag(struct hnae3_handle *handle, bool enable)
@@ -5604,6 +5646,7 @@ static int hclge_reset_ae_dev(struct hnae3_ae_dev *ae_dev)
 	set_bit(HCLGE_STATE_DOWN, &hdev->state);
 
 	hclge_stats_clear(hdev);
+	memset(hdev->vlan_table, 0, sizeof(hdev->vlan_table));
 
 	ret = hclge_cmd_init(hdev);
 	if (ret) {
@@ -6221,7 +6264,7 @@ static const struct hnae3_ae_ops hclge_ops = {
 	.get_fw_version = hclge_get_fw_version,
 	.get_mdix_mode = hclge_get_mdix_mode,
 	.enable_vlan_filter = hclge_enable_vlan_filter,
-	.set_vlan_filter = hclge_set_port_vlan_filter,
+	.set_vlan_filter = hclge_set_vlan_filter,
 	.set_vf_vlan_filter = hclge_set_vf_vlan_filter,
 	.enable_hw_strip_rxvtag = hclge_en_hw_strip_rxvtag,
 	.reset_event = hclge_reset_event,
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
index 0f4157e71282..6432f754c1c6 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
@@ -12,6 +12,8 @@
 #include <linux/fs.h>
 #include <linux/types.h>
 #include <linux/phy.h>
+#include <linux/if_vlan.h>
+
 #include "hclge_cmd.h"
 #include "hnae3.h"
 
@@ -471,6 +473,7 @@ struct hclge_vlan_type_cfg {
 	u16 tx_in_vlan_type;
 };
 
+#define HCLGE_VPORT_NUM 256
 struct hclge_dev {
 	struct pci_dev *pdev;
 	struct hnae3_ae_dev *ae_dev;
@@ -562,6 +565,7 @@ struct hclge_dev {
 
 	u64 rx_pkts_for_led;
 	u64 tx_pkts_for_led;
+	unsigned long vlan_table[VLAN_N_VID][BITS_TO_LONGS(HCLGE_VPORT_NUM)];
 };
 
 /* VPort level vlan tag configuration for TX direction */
@@ -646,8 +650,8 @@ static inline int hclge_get_queue_id(struct hnae3_queue *queue)
 }
 
 int hclge_cfg_mac_speed_dup(struct hclge_dev *hdev, int speed, u8 duplex);
-int hclge_set_vf_vlan_common(struct hclge_dev *vport, int vfid,
-			     bool is_kill, u16 vlan, u8 qos, __be16 proto);
+int hclge_set_vlan_filter(struct hnae3_handle *handle, __be16 proto,
+			  u16 vlan_id, bool is_kill);
 
 int hclge_buffer_alloc(struct hclge_dev *hdev);
 int hclge_rss_init_hw(struct hclge_dev *hdev);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
index a6f7ffa9c259..7563335b0c7f 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
@@ -264,19 +264,18 @@ static int hclge_set_vf_vlan_cfg(struct hclge_vport *vport,
 				 struct hclge_mbx_vf_to_pf_cmd *mbx_req,
 				 bool gen_resp)
 {
-	struct hclge_dev *hdev = vport->back;
 	int status = 0;
 
 	if (mbx_req->msg[1] == HCLGE_MBX_VLAN_FILTER) {
+		struct hnae3_handle *handle = &vport->nic;
 		u16 vlan, proto;
 		bool is_kill;
 
 		is_kill = !!mbx_req->msg[2];
 		memcpy(&vlan, &mbx_req->msg[3], sizeof(vlan));
 		memcpy(&proto, &mbx_req->msg[5], sizeof(proto));
-		status = hclge_set_vf_vlan_common(hdev, vport->vport_id,
-						  is_kill, vlan, 0,
-						  cpu_to_be16(proto));
+		status = hclge_set_vlan_filter(handle, cpu_to_be16(proto),
+					       vlan, is_kill);
 	}
 
 	if (gen_resp)
-- 
cgit v1.2.3-59-g8ed1b


From dbecc7796cf1ade372e1dec006031feb56fff140 Mon Sep 17 00:00:00 2001
From: Xi Wang <wangxi11@huawei.com>
Date: Tue, 1 May 2018 19:56:05 +0100
Subject: net: hns3: Remove packet statistics in the range of 8192~12287

Because the current statistics for size 8192~12287 are only valid for GE,
the ranges of 8192~9216 and 9217~12287 are valid only for LGE/CGE, and are
always 0 for GE interfaces. it is easy to cause confusion when viewing the
packet statistics using the command ethtool -S.

This patch removes the 8192~12287 range of packet statistics and uses the
8192~9216 and 9217~12287 ranges for statistics. This change depends on the
firmware upgrade.

Signed-off-by: Xi Wang <wangxi11@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c |  4 ----
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h | 12 ++++++------
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 77d9e4c02534..dd5d65c9cca6 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -304,8 +304,6 @@ static const struct hclge_comm_stats_str g_mac_stats_string[] = {
 		HCLGE_MAC_STATS_FIELD_OFF(mac_tx_2048_4095_oct_pkt_num)},
 	{"mac_tx_4096_8191_oct_pkt_num",
 		HCLGE_MAC_STATS_FIELD_OFF(mac_tx_4096_8191_oct_pkt_num)},
-	{"mac_tx_8192_12287_oct_pkt_num",
-		HCLGE_MAC_STATS_FIELD_OFF(mac_tx_8192_12287_oct_pkt_num)},
 	{"mac_tx_8192_9216_oct_pkt_num",
 		HCLGE_MAC_STATS_FIELD_OFF(mac_tx_8192_9216_oct_pkt_num)},
 	{"mac_tx_9217_12287_oct_pkt_num",
@@ -356,8 +354,6 @@ static const struct hclge_comm_stats_str g_mac_stats_string[] = {
 		HCLGE_MAC_STATS_FIELD_OFF(mac_rx_2048_4095_oct_pkt_num)},
 	{"mac_rx_4096_8191_oct_pkt_num",
 		HCLGE_MAC_STATS_FIELD_OFF(mac_rx_4096_8191_oct_pkt_num)},
-	{"mac_rx_8192_12287_oct_pkt_num",
-		HCLGE_MAC_STATS_FIELD_OFF(mac_rx_8192_12287_oct_pkt_num)},
 	{"mac_rx_8192_9216_oct_pkt_num",
 		HCLGE_MAC_STATS_FIELD_OFF(mac_rx_8192_9216_oct_pkt_num)},
 	{"mac_rx_9217_12287_oct_pkt_num",
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
index 6432f754c1c6..b7ee91daea0c 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
@@ -408,9 +408,9 @@ struct hclge_mac_stats {
 	u64 mac_tx_1519_2047_oct_pkt_num;
 	u64 mac_tx_2048_4095_oct_pkt_num;
 	u64 mac_tx_4096_8191_oct_pkt_num;
-	u64 mac_tx_8192_12287_oct_pkt_num; /* valid for GE MAC only */
-	u64 mac_tx_8192_9216_oct_pkt_num; /* valid for LGE & CGE MAC only */
-	u64 mac_tx_9217_12287_oct_pkt_num; /* valid for LGE & CGE MAC */
+	u64 rsv0;
+	u64 mac_tx_8192_9216_oct_pkt_num;
+	u64 mac_tx_9217_12287_oct_pkt_num;
 	u64 mac_tx_12288_16383_oct_pkt_num;
 	u64 mac_tx_1519_max_good_oct_pkt_num;
 	u64 mac_tx_1519_max_bad_oct_pkt_num;
@@ -435,9 +435,9 @@ struct hclge_mac_stats {
 	u64 mac_rx_1519_2047_oct_pkt_num;
 	u64 mac_rx_2048_4095_oct_pkt_num;
 	u64 mac_rx_4096_8191_oct_pkt_num;
-	u64 mac_rx_8192_12287_oct_pkt_num;/* valid for GE MAC only */
-	u64 mac_rx_8192_9216_oct_pkt_num; /* valid for LGE & CGE MAC only */
-	u64 mac_rx_9217_12287_oct_pkt_num; /* valid for LGE & CGE MAC only */
+	u64 rsv1;
+	u64 mac_rx_8192_9216_oct_pkt_num;
+	u64 mac_rx_9217_12287_oct_pkt_num;
 	u64 mac_rx_12288_16383_oct_pkt_num;
 	u64 mac_rx_1519_max_good_oct_pkt_num;
 	u64 mac_rx_1519_max_bad_oct_pkt_num;
-- 
cgit v1.2.3-59-g8ed1b


From a3fe1f6f2adaa01c620793666d489a584211ecba Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Tue, 1 May 2018 21:18:38 +0100
Subject: tools: bpftool: change time format for program 'loaded at:'
 information

To make eBPF program load time easier to parse from "bpftool prog"
output for machines, change the time format used by the program. The
format now differs for plain and JSON version:

- Plain version uses a string formatted according to ISO 8601.
- JSON uses the number of seconds since the Epoch, wich is less friendly
  for humans but even easier to process.

Example output:

    # ./bpftool prog
    41298: xdp  tag a04f5eef06a7f555 dev foo
            loaded_at 2018-04-18T17:19:47+0100  uid 0
            xlated 16B  not jited  memlock 4096B

    # ./bpftool prog -p
    [{
            "id": 41298,
            "type": "xdp",
            "tag": "a04f5eef06a7f555",
            "gpl_compatible": false,
            "dev": {
                "ifindex": 14,
                "ns_dev": 3,
                "ns_inode": 4026531993,
                "ifname": "foo"
            },
            "loaded_at": 1524068387,
            "uid": 0,
            "bytes_xlated": 16,
            "jited": false,
            "bytes_memlock": 4096
        }
    ]

Previously, "Apr 18/17:19" would be used at both places.

Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/bpf/bpftool/prog.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index e71a0a11afde..9bdfdf2d3fbe 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -96,7 +96,10 @@ static void print_boot_time(__u64 nsecs, char *buf, unsigned int size)
 		return;
 	}
 
-	strftime(buf, size, "%b %d/%H:%M", &load_tm);
+	if (json_output)
+		strftime(buf, size, "%s", &load_tm);
+	else
+		strftime(buf, size, "%FT%T%z", &load_tm);
 }
 
 static int prog_fd_by_tag(unsigned char *tag)
@@ -245,7 +248,8 @@ static void print_prog_json(struct bpf_prog_info *info, int fd)
 		print_boot_time(info->load_time, buf, sizeof(buf));
 
 		/* Piggy back on load_time, since 0 uid is a valid one */
-		jsonw_string_field(json_wtr, "loaded_at", buf);
+		jsonw_name(json_wtr, "loaded_at");
+		jsonw_printf(json_wtr, "%s", buf);
 		jsonw_uint_field(json_wtr, "uid", info->created_by_uid);
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From b75eba76d3d72e2374fac999926dafef2997edd2 Mon Sep 17 00:00:00 2001
From: Soheil Hassas Yeganeh <soheil@google.com>
Date: Tue, 1 May 2018 15:39:15 -0400
Subject: tcp: send in-queue bytes in cmsg upon read

Applications with many concurrent connections, high variance
in receive queue length and tight memory bounds cannot
allocate worst-case buffer size to drain sockets. Knowing
the size of receive queue length, applications can optimize
how they allocate buffers to read from the socket.

The number of bytes pending on the socket is directly
available through ioctl(FIONREAD/SIOCINQ) and can be
approximated using getsockopt(MEMINFO) (rmem_alloc includes
skb overheads in addition to application data). But, both of
these options add an extra syscall per recvmsg. Moreover,
ioctl(FIONREAD/SIOCINQ) takes the socket lock.

Add the TCP_INQ socket option to TCP. When this socket
option is set, recvmsg() relays the number of bytes available
on the socket for reading to the application via the
TCP_CM_INQ control message.

Calculate the number of bytes after releasing the socket lock
to include the processed backlog, if any. To avoid an extra
branch in the hot path of recvmsg() for this new control
message, move all cmsg processing inside an existing branch for
processing receive timestamps. Since the socket lock is not held
when calculating the size of receive queue, TCP_INQ is a hint.
For example, it can overestimate the queue size by one byte,
if FIN is received.

With this method, applications can start reading from the socket
using a small buffer, and then use larger buffers based on the
remaining data when needed.

V3 change-log:
	As suggested by David Miller, added loads with barrier
	to check whether we have multiple threads calling recvmsg
	in parallel. When that happens we lock the socket to
	calculate inq.
V4 change-log:
	Removed inline from a static function.

Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Suggested-by: David Miller <davem@davemloft.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  2 +-
 include/uapi/linux/tcp.h |  3 +++
 net/ipv4/tcp.c           | 43 +++++++++++++++++++++++++++++++++++++++----
 3 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 20585d5c4e1c..807776928cb8 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -228,7 +228,7 @@ struct tcp_sock {
 		unused:2;
 	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
 		thin_lto    : 1,/* Use linear timeouts for thin streams */
-		unused1	    : 1,
+		recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */
 		repair      : 1,
 		frto        : 1;/* F-RTO (RFC5682) activated in CA_Loss */
 	u8	repair_queue;
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index e9e8373b34b9..29eb659aa77a 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -123,6 +123,9 @@ enum {
 #define TCP_FASTOPEN_KEY	33	/* Set the key for Fast Open (cookie) */
 #define TCP_FASTOPEN_NO_COOKIE	34	/* Enable TFO without a TFO cookie */
 #define TCP_ZEROCOPY_RECEIVE	35
+#define TCP_INQ			36	/* Notify bytes available to read as a cmsg on read */
+
+#define TCP_CM_INQ		TCP_INQ
 
 struct tcp_repair_opt {
 	__u32	opt_code;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4028ddd14dd5..868ed74a76a8 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1889,6 +1889,22 @@ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
 	}
 }
 
+static int tcp_inq_hint(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	u32 copied_seq = READ_ONCE(tp->copied_seq);
+	u32 rcv_nxt = READ_ONCE(tp->rcv_nxt);
+	int inq;
+
+	inq = rcv_nxt - copied_seq;
+	if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) {
+		lock_sock(sk);
+		inq = tp->rcv_nxt - tp->copied_seq;
+		release_sock(sk);
+	}
+	return inq;
+}
+
 /*
  *	This routine copies from a sock struct into the user buffer.
  *
@@ -1905,13 +1921,14 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 	u32 peek_seq;
 	u32 *seq;
 	unsigned long used;
-	int err;
+	int err, inq;
 	int target;		/* Read at least this many bytes */
 	long timeo;
 	struct sk_buff *skb, *last;
 	u32 urg_hole = 0;
 	struct scm_timestamping tss;
 	bool has_tss = false;
+	bool has_cmsg;
 
 	if (unlikely(flags & MSG_ERRQUEUE))
 		return inet_recv_error(sk, msg, len, addr_len);
@@ -1926,6 +1943,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 	if (sk->sk_state == TCP_LISTEN)
 		goto out;
 
+	has_cmsg = tp->recvmsg_inq;
 	timeo = sock_rcvtimeo(sk, nonblock);
 
 	/* Urgent data needs to be handled specially. */
@@ -2112,6 +2130,7 @@ skip_copy:
 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
 			tcp_update_recv_tstamps(skb, &tss);
 			has_tss = true;
+			has_cmsg = true;
 		}
 		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
 			goto found_fin_ok;
@@ -2131,13 +2150,20 @@ skip_copy:
 	 * on connected socket. I was just happy when found this 8) --ANK
 	 */
 
-	if (has_tss)
-		tcp_recv_timestamp(msg, sk, &tss);
-
 	/* Clean up data we have read: This will do ACK frames. */
 	tcp_cleanup_rbuf(sk, copied);
 
 	release_sock(sk);
+
+	if (has_cmsg) {
+		if (has_tss)
+			tcp_recv_timestamp(msg, sk, &tss);
+		if (tp->recvmsg_inq) {
+			inq = tcp_inq_hint(sk);
+			put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
+		}
+	}
+
 	return copied;
 
 out:
@@ -3006,6 +3032,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		tp->notsent_lowat = val;
 		sk->sk_write_space(sk);
 		break;
+	case TCP_INQ:
+		if (val > 1 || val < 0)
+			err = -EINVAL;
+		else
+			tp->recvmsg_inq = val;
+		break;
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -3431,6 +3463,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 	case TCP_NOTSENT_LOWAT:
 		val = tp->notsent_lowat;
 		break;
+	case TCP_INQ:
+		val = tp->recvmsg_inq;
+		break;
 	case TCP_SAVE_SYN:
 		val = tp->save_syn;
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From 702353b538f50f9a3b135473dbd7e45384bf941d Mon Sep 17 00:00:00 2001
From: Soheil Hassas Yeganeh <soheil@google.com>
Date: Tue, 1 May 2018 15:39:16 -0400
Subject: selftest: add test for TCP_INQ

Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/Makefile  |   3 +-
 tools/testing/selftests/net/tcp_inq.c | 189 ++++++++++++++++++++++++++++++++++
 2 files changed, 191 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/net/tcp_inq.c

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index df9102ec7b7a..0a1821f8dfb1 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -9,7 +9,7 @@ TEST_PROGS += fib_tests.sh fib-onlink-tests.sh in_netns.sh pmtu.sh udpgso.sh
 TEST_PROGS += udpgso_bench.sh
 TEST_GEN_FILES =  socket
 TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy
-TEST_GEN_FILES += tcp_mmap
+TEST_GEN_FILES += tcp_mmap tcp_inq
 TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
 TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict
 TEST_GEN_PROGS += udpgso udpgso_bench_tx udpgso_bench_rx
@@ -18,3 +18,4 @@ include ../lib.mk
 
 $(OUTPUT)/reuseport_bpf_numa: LDFLAGS += -lnuma
 $(OUTPUT)/tcp_mmap: LDFLAGS += -lpthread
+$(OUTPUT)/tcp_inq: LDFLAGS += -lpthread
diff --git a/tools/testing/selftests/net/tcp_inq.c b/tools/testing/selftests/net/tcp_inq.c
new file mode 100644
index 000000000000..d044b29ddabc
--- /dev/null
+++ b/tools/testing/selftests/net/tcp_inq.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright 2018 Google Inc.
+ * Author: Soheil Hassas Yeganeh (soheil@google.com)
+ *
+ * Simple example on how to use TCP_INQ and TCP_CM_INQ.
+ *
+ * License (GPLv2):
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for
+ * more details.
+ */
+#define _GNU_SOURCE
+
+#include <error.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#ifndef TCP_INQ
+#define TCP_INQ 36
+#endif
+
+#ifndef TCP_CM_INQ
+#define TCP_CM_INQ TCP_INQ
+#endif
+
+#define BUF_SIZE 8192
+#define CMSG_SIZE 32
+
+static int family = AF_INET6;
+static socklen_t addr_len = sizeof(struct sockaddr_in6);
+static int port = 4974;
+
+static void setup_loopback_addr(int family, struct sockaddr_storage *sockaddr)
+{
+	struct sockaddr_in6 *addr6 = (void *) sockaddr;
+	struct sockaddr_in *addr4 = (void *) sockaddr;
+
+	switch (family) {
+	case PF_INET:
+		memset(addr4, 0, sizeof(*addr4));
+		addr4->sin_family = AF_INET;
+		addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+		addr4->sin_port = htons(port);
+		break;
+	case PF_INET6:
+		memset(addr6, 0, sizeof(*addr6));
+		addr6->sin6_family = AF_INET6;
+		addr6->sin6_addr = in6addr_loopback;
+		addr6->sin6_port = htons(port);
+		break;
+	default:
+		error(1, 0, "illegal family");
+	}
+}
+
+void *start_server(void *arg)
+{
+	int server_fd = (int)(unsigned long)arg;
+	struct sockaddr_in addr;
+	socklen_t addrlen = sizeof(addr);
+	char *buf;
+	int fd;
+	int r;
+
+	buf = malloc(BUF_SIZE);
+
+	for (;;) {
+		fd = accept(server_fd, (struct sockaddr *)&addr, &addrlen);
+		if (fd == -1) {
+			perror("accept");
+			break;
+		}
+		do {
+			r = send(fd, buf, BUF_SIZE, 0);
+		} while (r < 0 && errno == EINTR);
+		if (r < 0)
+			perror("send");
+		if (r != BUF_SIZE)
+			fprintf(stderr, "can only send %d bytes\n", r);
+		/* TCP_INQ can overestimate in-queue by one byte if we send
+		 * the FIN packet. Sleep for 1 second, so that the client
+		 * likely invoked recvmsg().
+		 */
+		sleep(1);
+		close(fd);
+	}
+
+	free(buf);
+	close(server_fd);
+	pthread_exit(0);
+}
+
+int main(int argc, char *argv[])
+{
+	struct sockaddr_storage listen_addr, addr;
+	int c, one = 1, inq = -1;
+	pthread_t server_thread;
+	char cmsgbuf[CMSG_SIZE];
+	struct iovec iov[1];
+	struct cmsghdr *cm;
+	struct msghdr msg;
+	int server_fd, fd;
+	char *buf;
+
+	while ((c = getopt(argc, argv, "46p:")) != -1) {
+		switch (c) {
+		case '4':
+			family = PF_INET;
+			addr_len = sizeof(struct sockaddr_in);
+			break;
+		case '6':
+			family = PF_INET6;
+			addr_len = sizeof(struct sockaddr_in6);
+			break;
+		case 'p':
+			port = atoi(optarg);
+			break;
+		}
+	}
+
+	server_fd = socket(family, SOCK_STREAM, 0);
+	if (server_fd < 0)
+		error(1, errno, "server socket");
+	setup_loopback_addr(family, &listen_addr);
+	if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR,
+		       &one, sizeof(one)) != 0)
+		error(1, errno, "setsockopt(SO_REUSEADDR)");
+	if (bind(server_fd, (const struct sockaddr *)&listen_addr,
+		 addr_len) == -1)
+		error(1, errno, "bind");
+	if (listen(server_fd, 128) == -1)
+		error(1, errno, "listen");
+	if (pthread_create(&server_thread, NULL, start_server,
+			   (void *)(unsigned long)server_fd) != 0)
+		error(1, errno, "pthread_create");
+
+	fd = socket(family, SOCK_STREAM, 0);
+	if (fd < 0)
+		error(1, errno, "client socket");
+	setup_loopback_addr(family, &addr);
+	if (connect(fd, (const struct sockaddr *)&addr, addr_len) == -1)
+		error(1, errno, "connect");
+	if (setsockopt(fd, SOL_TCP, TCP_INQ, &one, sizeof(one)) != 0)
+		error(1, errno, "setsockopt(TCP_INQ)");
+
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_iov = iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = cmsgbuf;
+	msg.msg_controllen = sizeof(cmsgbuf);
+	msg.msg_flags = 0;
+
+	buf = malloc(BUF_SIZE);
+	iov[0].iov_base = buf;
+	iov[0].iov_len = BUF_SIZE / 2;
+
+	if (recvmsg(fd, &msg, 0) != iov[0].iov_len)
+		error(1, errno, "recvmsg");
+	if (msg.msg_flags & MSG_CTRUNC)
+		error(1, 0, "control message is truncated");
+
+	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
+		if (cm->cmsg_level == SOL_TCP && cm->cmsg_type == TCP_CM_INQ)
+			inq = *((int *) CMSG_DATA(cm));
+
+	if (inq != BUF_SIZE - iov[0].iov_len) {
+		fprintf(stderr, "unexpected inq: %d\n", inq);
+		exit(1);
+	}
+
+	printf("PASSED\n");
+	free(buf);
+	close(fd);
+	return 0;
+}
-- 
cgit v1.2.3-59-g8ed1b


From a2c7a98301d9f9bcfc4244de04252a04c0b68a79 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 27 Apr 2018 11:54:40 +0200
Subject: x86/bpf: Clean up non-standard comments, to make the code more
 readable

So by chance I looked into x86 assembly in arch/x86/net/bpf_jit_comp.c and
noticed the weird and inconsistent comment style it mistakenly learned from
the networking code:

 /* Multi-line comment ...
  * ... looks like this.
  */

Fix this to use the standard comment style specified in Documentation/CodingStyle
and used in arch/x86/ as well:

 /*
  * Multi-line comment ...
  * ... looks like this.
  */

Also, to quote Linus's ... more explicit views about this:

  http://article.gmane.org/gmane.linux.kernel.cryptoapi/21066

  > But no, the networking code picked *none* of the above sane formats.
  > Instead, it picked these two models that are just half-arsed
  > shit-for-brains:
  >
  >  (no)
  >      /* This is disgusting drug-induced
  >        * crap, and should die
  >        */
  >
  >   (no-no-no)
  >       /* This is also very nasty
  >        * and visually unbalanced */
  >
  > Please. The networking code actually has the *worst* possible comment
  > style. You can literally find that (no-no-no) style, which is just
  > really horribly disgusting and worse than the otherwise fairly similar
  > (d) in pretty much every way.

Also improve the comments and some other details while at it:

 - Don't mix same-line and previous-line comment style on otherwise
   identical code patterns within the same function,

 - capitalize 'BPF' and x86 register names consistently,

 - capitalize sentences consistently,

 - instead of 'x64' use 'x86-64': x64 is a Microsoft specific term,

 - use more consistent punctuation,

 - use standard coding style in macros as well,

 - fix typos and a few other minor details.

Consistent coding style is not optional, at least in arch/x86/.

No change in functionality.

( In case this commit causes conflicts with pending development code
  I'll be glad to help resolve any conflicts! )

Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: netdev@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 arch/x86/net/bpf_jit_comp.c | 233 +++++++++++++++++++++++++-------------------
 1 file changed, 133 insertions(+), 100 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index abce27ceb411..1c3c81ddeb87 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1,4 +1,5 @@
-/* bpf_jit_comp.c : BPF JIT compiler
+/*
+ * bpf_jit_comp.c: BPF JIT compiler
  *
  * Copyright (C) 2011-2013 Eric Dumazet (eric.dumazet@gmail.com)
  * Internal BPF Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
@@ -17,7 +18,7 @@
 #include <asm/nospec-branch.h>
 
 /*
- * assembly code in arch/x86/net/bpf_jit.S
+ * Assembly code in arch/x86/net/bpf_jit.S
  */
 extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
 extern u8 sk_load_word_positive_offset[], sk_load_half_positive_offset[];
@@ -45,14 +46,15 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
 #define EMIT2(b1, b2)		EMIT((b1) + ((b2) << 8), 2)
 #define EMIT3(b1, b2, b3)	EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
 #define EMIT4(b1, b2, b3, b4)   EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
+
 #define EMIT1_off32(b1, off) \
-	do {EMIT1(b1); EMIT(off, 4); } while (0)
+	do { EMIT1(b1); EMIT(off, 4); } while (0)
 #define EMIT2_off32(b1, b2, off) \
-	do {EMIT2(b1, b2); EMIT(off, 4); } while (0)
+	do { EMIT2(b1, b2); EMIT(off, 4); } while (0)
 #define EMIT3_off32(b1, b2, b3, off) \
-	do {EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
+	do { EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
 #define EMIT4_off32(b1, b2, b3, b4, off) \
-	do {EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
+	do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
 
 static bool is_imm8(int value)
 {
@@ -70,9 +72,10 @@ static bool is_uimm32(u64 value)
 }
 
 /* mov dst, src */
-#define EMIT_mov(DST, SRC) \
-	do {if (DST != SRC) \
-		EMIT3(add_2mod(0x48, DST, SRC), 0x89, add_2reg(0xC0, DST, SRC)); \
+#define EMIT_mov(DST, SRC)								 \
+	do {										 \
+		if (DST != SRC)								 \
+			EMIT3(add_2mod(0x48, DST, SRC), 0x89, add_2reg(0xC0, DST, SRC)); \
 	} while (0)
 
 static int bpf_size_to_x86_bytes(int bpf_size)
@@ -89,7 +92,8 @@ static int bpf_size_to_x86_bytes(int bpf_size)
 		return 0;
 }
 
-/* list of x86 cond jumps opcodes (. + s8)
+/*
+ * List of x86 cond jumps opcodes (. + s8)
  * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32)
  */
 #define X86_JB  0x72
@@ -106,35 +110,37 @@ static int bpf_size_to_x86_bytes(int bpf_size)
 #define CHOOSE_LOAD_FUNC(K, func) \
 	((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset)
 
-/* pick a register outside of BPF range for JIT internal work */
+/* Pick a register outside of BPF range for JIT internal work */
 #define AUX_REG (MAX_BPF_JIT_REG + 1)
 
-/* The following table maps BPF registers to x64 registers.
+/*
+ * The following table maps BPF registers to x86-64 registers.
  *
- * x64 register r12 is unused, since if used as base address
+ * x86-64 register R12 is unused, since if used as base address
  * register in load/store instructions, it always needs an
  * extra byte of encoding and is callee saved.
  *
- *  r9 caches skb->len - skb->data_len
- * r10 caches skb->data, and used for blinding (if enabled)
+ * R9  caches skb->len - skb->data_len
+ * R10 caches skb->data, and used for blinding (if enabled)
  */
 static const int reg2hex[] = {
-	[BPF_REG_0] = 0,  /* rax */
-	[BPF_REG_1] = 7,  /* rdi */
-	[BPF_REG_2] = 6,  /* rsi */
-	[BPF_REG_3] = 2,  /* rdx */
-	[BPF_REG_4] = 1,  /* rcx */
-	[BPF_REG_5] = 0,  /* r8 */
-	[BPF_REG_6] = 3,  /* rbx callee saved */
-	[BPF_REG_7] = 5,  /* r13 callee saved */
-	[BPF_REG_8] = 6,  /* r14 callee saved */
-	[BPF_REG_9] = 7,  /* r15 callee saved */
-	[BPF_REG_FP] = 5, /* rbp readonly */
-	[BPF_REG_AX] = 2, /* r10 temp register */
-	[AUX_REG] = 3,    /* r11 temp register */
+	[BPF_REG_0] = 0,  /* RAX */
+	[BPF_REG_1] = 7,  /* RDI */
+	[BPF_REG_2] = 6,  /* RSI */
+	[BPF_REG_3] = 2,  /* RDX */
+	[BPF_REG_4] = 1,  /* RCX */
+	[BPF_REG_5] = 0,  /* R8  */
+	[BPF_REG_6] = 3,  /* RBX callee saved */
+	[BPF_REG_7] = 5,  /* R13 callee saved */
+	[BPF_REG_8] = 6,  /* R14 callee saved */
+	[BPF_REG_9] = 7,  /* R15 callee saved */
+	[BPF_REG_FP] = 5, /* RBP readonly */
+	[BPF_REG_AX] = 2, /* R10 temp register */
+	[AUX_REG] = 3,    /* R11 temp register */
 };
 
-/* is_ereg() == true if BPF register 'reg' maps to x64 r8..r15
+/*
+ * is_ereg() == true if BPF register 'reg' maps to x86-64 r8..r15
  * which need extra byte of encoding.
  * rax,rcx,...,rbp have simpler encoding
  */
@@ -153,7 +159,7 @@ static bool is_axreg(u32 reg)
 	return reg == BPF_REG_0;
 }
 
-/* add modifiers if 'reg' maps to x64 registers r8..r15 */
+/* Add modifiers if 'reg' maps to x86-64 registers R8..R15 */
 static u8 add_1mod(u8 byte, u32 reg)
 {
 	if (is_ereg(reg))
@@ -170,13 +176,13 @@ static u8 add_2mod(u8 byte, u32 r1, u32 r2)
 	return byte;
 }
 
-/* encode 'dst_reg' register into x64 opcode 'byte' */
+/* Encode 'dst_reg' register into x86-64 opcode 'byte' */
 static u8 add_1reg(u8 byte, u32 dst_reg)
 {
 	return byte + reg2hex[dst_reg];
 }
 
-/* encode 'dst_reg' and 'src_reg' registers into x64 opcode 'byte' */
+/* Encode 'dst_reg' and 'src_reg' registers into x86-64 opcode 'byte' */
 static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
 {
 	return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3);
@@ -184,27 +190,28 @@ static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
 
 static void jit_fill_hole(void *area, unsigned int size)
 {
-	/* fill whole space with int3 instructions */
+	/* Fill whole space with INT3 instructions */
 	memset(area, 0xcc, size);
 }
 
 struct jit_context {
-	int cleanup_addr; /* epilogue code offset */
+	int cleanup_addr; /* Epilogue code offset */
 	bool seen_ld_abs;
 	bool seen_ax_reg;
 };
 
-/* maximum number of bytes emitted while JITing one eBPF insn */
+/* Maximum number of bytes emitted while JITing one eBPF insn */
 #define BPF_MAX_INSN_SIZE	128
 #define BPF_INSN_SAFETY		64
 
 #define AUX_STACK_SPACE \
-	(32 /* space for rbx, r13, r14, r15 */ + \
-	 8 /* space for skb_copy_bits() buffer */)
+	(32 /* Space for RBX, R13, R14, R15 */ + \
+	  8 /* Space for skb_copy_bits() buffer */)
 
 #define PROLOGUE_SIZE 37
 
-/* emit x64 prologue code for BPF program and check it's size.
+/*
+ * Emit x86-64 prologue code for BPF program and check its size.
  * bpf_tail_call helper will skip it while jumping into another program
  */
 static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
@@ -212,8 +219,11 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
 	u8 *prog = *pprog;
 	int cnt = 0;
 
-	EMIT1(0x55); /* push rbp */
-	EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */
+	/* push rbp */
+	EMIT1(0x55);
+
+	/* mov rbp,rsp */
+	EMIT3(0x48, 0x89, 0xE5);
 
 	/* sub rsp, rounded_stack_depth + AUX_STACK_SPACE */
 	EMIT3_off32(0x48, 0x81, 0xEC,
@@ -222,14 +232,15 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
 	/* sub rbp, AUX_STACK_SPACE */
 	EMIT4(0x48, 0x83, 0xED, AUX_STACK_SPACE);
 
-	/* all classic BPF filters use R6(rbx) save it */
+	/* All classic BPF filters use R6(rbx) save it */
 
 	/* mov qword ptr [rbp+0],rbx */
 	EMIT4(0x48, 0x89, 0x5D, 0);
 
-	/* bpf_convert_filter() maps classic BPF register X to R7 and uses R8
-	 * as temporary, so all tcpdump filters need to spill/fill R7(r13) and
-	 * R8(r14). R9(r15) spill could be made conditional, but there is only
+	/*
+	 * bpf_convert_filter() maps classic BPF register X to R7 and uses R8
+	 * as temporary, so all tcpdump filters need to spill/fill R7(R13) and
+	 * R8(R14). R9(R15) spill could be made conditional, but there is only
 	 * one 'bpf_error' return path out of helper functions inside bpf_jit.S
 	 * The overhead of extra spill is negligible for any filter other
 	 * than synthetic ones. Therefore not worth adding complexity.
@@ -243,9 +254,10 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
 	EMIT4(0x4C, 0x89, 0x7D, 24);
 
 	if (!ebpf_from_cbpf) {
-		/* Clear the tail call counter (tail_call_cnt): for eBPF tail
+		/*
+		 * Clear the tail call counter (tail_call_cnt): for eBPF tail
 		 * calls we need to reset the counter to 0. It's done in two
-		 * instructions, resetting rax register to 0, and moving it
+		 * instructions, resetting RAX register to 0, and moving it
 		 * to the counter location.
 		 */
 
@@ -260,7 +272,9 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
 	*pprog = prog;
 }
 
-/* generate the following code:
+/*
+ * Generate the following code:
+ *
  * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
  *   if (index >= array->map.max_entries)
  *     goto out;
@@ -278,23 +292,26 @@ static void emit_bpf_tail_call(u8 **pprog)
 	int label1, label2, label3;
 	int cnt = 0;
 
-	/* rdi - pointer to ctx
+	/*
+	 * rdi - pointer to ctx
 	 * rsi - pointer to bpf_array
 	 * rdx - index in bpf_array
 	 */
 
-	/* if (index >= array->map.max_entries)
-	 *   goto out;
+	/*
+	 * if (index >= array->map.max_entries)
+	 *	goto out;
 	 */
 	EMIT2(0x89, 0xD2);                        /* mov edx, edx */
 	EMIT3(0x39, 0x56,                         /* cmp dword ptr [rsi + 16], edx */
 	      offsetof(struct bpf_array, map.max_entries));
-#define OFFSET1 (41 + RETPOLINE_RAX_BPF_JIT_SIZE) /* number of bytes to jump */
+#define OFFSET1 (41 + RETPOLINE_RAX_BPF_JIT_SIZE) /* Number of bytes to jump */
 	EMIT2(X86_JBE, OFFSET1);                  /* jbe out */
 	label1 = cnt;
 
-	/* if (tail_call_cnt > MAX_TAIL_CALL_CNT)
-	 *   goto out;
+	/*
+	 * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+	 *	goto out;
 	 */
 	EMIT2_off32(0x8B, 0x85, 36);              /* mov eax, dword ptr [rbp + 36] */
 	EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT);     /* cmp eax, MAX_TAIL_CALL_CNT */
@@ -308,8 +325,9 @@ static void emit_bpf_tail_call(u8 **pprog)
 	EMIT4_off32(0x48, 0x8B, 0x84, 0xD6,       /* mov rax, [rsi + rdx * 8 + offsetof(...)] */
 		    offsetof(struct bpf_array, ptrs));
 
-	/* if (prog == NULL)
-	 *   goto out;
+	/*
+	 * if (prog == NULL)
+	 *	goto out;
 	 */
 	EMIT3(0x48, 0x85, 0xC0);		  /* test rax,rax */
 #define OFFSET3 (8 + RETPOLINE_RAX_BPF_JIT_SIZE)
@@ -321,7 +339,8 @@ static void emit_bpf_tail_call(u8 **pprog)
 	      offsetof(struct bpf_prog, bpf_func));
 	EMIT4(0x48, 0x83, 0xC0, PROLOGUE_SIZE);   /* add rax, prologue_size */
 
-	/* now we're ready to jump into next BPF program
+	/*
+	 * Wow we're ready to jump into next BPF program
 	 * rdi == ctx (1st arg)
 	 * rax == prog->bpf_func + prologue_size
 	 */
@@ -340,7 +359,8 @@ static void emit_load_skb_data_hlen(u8 **pprog)
 	u8 *prog = *pprog;
 	int cnt = 0;
 
-	/* r9d = skb->len - skb->data_len (headlen)
+	/*
+	 * r9d = skb->len - skb->data_len (headlen)
 	 * r10 = skb->data
 	 */
 	/* mov %r9d, off32(%rdi) */
@@ -361,7 +381,8 @@ static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
 	u8 b1, b2, b3;
 	int cnt = 0;
 
-	/* optimization: if imm32 is positive, use 'mov %eax, imm32'
+	/*
+	 * Optimization: if imm32 is positive, use 'mov %eax, imm32'
 	 * (which zero-extends imm32) to save 2 bytes.
 	 */
 	if (sign_propagate && (s32)imm32 < 0) {
@@ -373,7 +394,8 @@ static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
 		goto done;
 	}
 
-	/* optimization: if imm32 is zero, use 'xor %eax, %eax'
+	/*
+	 * Optimization: if imm32 is zero, use 'xor %eax, %eax'
 	 * to save 3 bytes.
 	 */
 	if (imm32 == 0) {
@@ -400,7 +422,8 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg,
 	int cnt = 0;
 
 	if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) {
-		/* For emitting plain u32, where sign bit must not be
+		/*
+		 * For emitting plain u32, where sign bit must not be
 		 * propagated LLVM tends to load imm64 over mov32
 		 * directly, so save couple of bytes by just doing
 		 * 'mov %eax, imm32' instead.
@@ -525,7 +548,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 			else if (is_ereg(dst_reg))
 				EMIT1(add_1mod(0x40, dst_reg));
 
-			/* b3 holds 'normal' opcode, b2 short form only valid
+			/*
+			 * b3 holds 'normal' opcode, b2 short form only valid
 			 * in case dst is eax/rax.
 			 */
 			switch (BPF_OP(insn->code)) {
@@ -593,7 +617,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 			/* mov rax, dst_reg */
 			EMIT_mov(BPF_REG_0, dst_reg);
 
-			/* xor edx, edx
+			/*
+			 * xor edx, edx
 			 * equivalent to 'xor rdx, rdx', but one byte less
 			 */
 			EMIT2(0x31, 0xd2);
@@ -655,7 +680,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 			}
 			break;
 		}
-			/* shifts */
+			/* Shifts */
 		case BPF_ALU | BPF_LSH | BPF_K:
 		case BPF_ALU | BPF_RSH | BPF_K:
 		case BPF_ALU | BPF_ARSH | BPF_K:
@@ -686,7 +711,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		case BPF_ALU64 | BPF_RSH | BPF_X:
 		case BPF_ALU64 | BPF_ARSH | BPF_X:
 
-			/* check for bad case when dst_reg == rcx */
+			/* Check for bad case when dst_reg == rcx */
 			if (dst_reg == BPF_REG_4) {
 				/* mov r11, dst_reg */
 				EMIT_mov(AUX_REG, dst_reg);
@@ -724,13 +749,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		case BPF_ALU | BPF_END | BPF_FROM_BE:
 			switch (imm32) {
 			case 16:
-				/* emit 'ror %ax, 8' to swap lower 2 bytes */
+				/* Emit 'ror %ax, 8' to swap lower 2 bytes */
 				EMIT1(0x66);
 				if (is_ereg(dst_reg))
 					EMIT1(0x41);
 				EMIT3(0xC1, add_1reg(0xC8, dst_reg), 8);
 
-				/* emit 'movzwl eax, ax' */
+				/* Emit 'movzwl eax, ax' */
 				if (is_ereg(dst_reg))
 					EMIT3(0x45, 0x0F, 0xB7);
 				else
@@ -738,7 +763,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 				EMIT1(add_2reg(0xC0, dst_reg, dst_reg));
 				break;
 			case 32:
-				/* emit 'bswap eax' to swap lower 4 bytes */
+				/* Emit 'bswap eax' to swap lower 4 bytes */
 				if (is_ereg(dst_reg))
 					EMIT2(0x41, 0x0F);
 				else
@@ -746,7 +771,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 				EMIT1(add_1reg(0xC8, dst_reg));
 				break;
 			case 64:
-				/* emit 'bswap rax' to swap 8 bytes */
+				/* Emit 'bswap rax' to swap 8 bytes */
 				EMIT3(add_1mod(0x48, dst_reg), 0x0F,
 				      add_1reg(0xC8, dst_reg));
 				break;
@@ -756,7 +781,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		case BPF_ALU | BPF_END | BPF_FROM_LE:
 			switch (imm32) {
 			case 16:
-				/* emit 'movzwl eax, ax' to zero extend 16-bit
+				/*
+				 * Emit 'movzwl eax, ax' to zero extend 16-bit
 				 * into 64 bit
 				 */
 				if (is_ereg(dst_reg))
@@ -766,7 +792,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 				EMIT1(add_2reg(0xC0, dst_reg, dst_reg));
 				break;
 			case 32:
-				/* emit 'mov eax, eax' to clear upper 32-bits */
+				/* Emit 'mov eax, eax' to clear upper 32-bits */
 				if (is_ereg(dst_reg))
 					EMIT1(0x45);
 				EMIT2(0x89, add_2reg(0xC0, dst_reg, dst_reg));
@@ -809,9 +835,9 @@ st:			if (is_imm8(insn->off))
 
 			/* STX: *(u8*)(dst_reg + off) = src_reg */
 		case BPF_STX | BPF_MEM | BPF_B:
-			/* emit 'mov byte ptr [rax + off], al' */
+			/* Emit 'mov byte ptr [rax + off], al' */
 			if (is_ereg(dst_reg) || is_ereg(src_reg) ||
-			    /* have to add extra byte for x86 SIL, DIL regs */
+			    /* We have to add extra byte for x86 SIL, DIL regs */
 			    src_reg == BPF_REG_1 || src_reg == BPF_REG_2)
 				EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88);
 			else
@@ -840,25 +866,26 @@ stx:			if (is_imm8(insn->off))
 
 			/* LDX: dst_reg = *(u8*)(src_reg + off) */
 		case BPF_LDX | BPF_MEM | BPF_B:
-			/* emit 'movzx rax, byte ptr [rax + off]' */
+			/* Emit 'movzx rax, byte ptr [rax + off]' */
 			EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6);
 			goto ldx;
 		case BPF_LDX | BPF_MEM | BPF_H:
-			/* emit 'movzx rax, word ptr [rax + off]' */
+			/* Emit 'movzx rax, word ptr [rax + off]' */
 			EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7);
 			goto ldx;
 		case BPF_LDX | BPF_MEM | BPF_W:
-			/* emit 'mov eax, dword ptr [rax+0x14]' */
+			/* Emit 'mov eax, dword ptr [rax+0x14]' */
 			if (is_ereg(dst_reg) || is_ereg(src_reg))
 				EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B);
 			else
 				EMIT1(0x8B);
 			goto ldx;
 		case BPF_LDX | BPF_MEM | BPF_DW:
-			/* emit 'mov rax, qword ptr [rax+0x14]' */
+			/* Emit 'mov rax, qword ptr [rax+0x14]' */
 			EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B);
-ldx:			/* if insn->off == 0 we can save one extra byte, but
-			 * special case of x86 r13 which always needs an offset
+ldx:			/*
+			 * If insn->off == 0 we can save one extra byte, but
+			 * special case of x86 R13 which always needs an offset
 			 * is not worth the hassle
 			 */
 			if (is_imm8(insn->off))
@@ -870,7 +897,7 @@ ldx:			/* if insn->off == 0 we can save one extra byte, but
 
 			/* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */
 		case BPF_STX | BPF_XADD | BPF_W:
-			/* emit 'lock add dword ptr [rax + off], eax' */
+			/* Emit 'lock add dword ptr [rax + off], eax' */
 			if (is_ereg(dst_reg) || is_ereg(src_reg))
 				EMIT3(0xF0, add_2mod(0x40, dst_reg, src_reg), 0x01);
 			else
@@ -897,14 +924,15 @@ xadd:			if (is_imm8(insn->off))
 				} else {
 					EMIT2(0x41, 0x52); /* push %r10 */
 					EMIT2(0x41, 0x51); /* push %r9 */
-					/* need to adjust jmp offset, since
+					/*
+					 * We need to adjust jmp offset, since
 					 * pop %r9, pop %r10 take 4 bytes after call insn
 					 */
 					jmp_offset += 4;
 				}
 			}
 			if (!imm32 || !is_simm32(jmp_offset)) {
-				pr_err("unsupported bpf func %d addr %p image %p\n",
+				pr_err("unsupported BPF func %d addr %p image %p\n",
 				       imm32, func, image);
 				return -EINVAL;
 			}
@@ -970,7 +998,7 @@ xadd:			if (is_imm8(insn->off))
 			else
 				EMIT2_off32(0x81, add_1reg(0xF8, dst_reg), imm32);
 
-emit_cond_jmp:		/* convert BPF opcode to x86 */
+emit_cond_jmp:		/* Convert BPF opcode to x86 */
 			switch (BPF_OP(insn->code)) {
 			case BPF_JEQ:
 				jmp_cond = X86_JE;
@@ -996,22 +1024,22 @@ emit_cond_jmp:		/* convert BPF opcode to x86 */
 				jmp_cond = X86_JBE;
 				break;
 			case BPF_JSGT:
-				/* signed '>', GT in x86 */
+				/* Signed '>', GT in x86 */
 				jmp_cond = X86_JG;
 				break;
 			case BPF_JSLT:
-				/* signed '<', LT in x86 */
+				/* Signed '<', LT in x86 */
 				jmp_cond = X86_JL;
 				break;
 			case BPF_JSGE:
-				/* signed '>=', GE in x86 */
+				/* Signed '>=', GE in x86 */
 				jmp_cond = X86_JGE;
 				break;
 			case BPF_JSLE:
-				/* signed '<=', LE in x86 */
+				/* Signed '<=', LE in x86 */
 				jmp_cond = X86_JLE;
 				break;
-			default: /* to silence gcc warning */
+			default: /* to silence GCC warning */
 				return -EFAULT;
 			}
 			jmp_offset = addrs[i + insn->off] - addrs[i];
@@ -1039,7 +1067,7 @@ emit_cond_jmp:		/* convert BPF opcode to x86 */
 				jmp_offset = addrs[i + insn->off] - addrs[i];
 
 			if (!jmp_offset)
-				/* optimize out nop jumps */
+				/* Optimize out nop jumps */
 				break;
 emit_jmp:
 			if (is_imm8(jmp_offset)) {
@@ -1061,7 +1089,7 @@ common_load:
 			ctx->seen_ld_abs = seen_ld_abs = true;
 			jmp_offset = func - (image + addrs[i]);
 			if (!func || !is_simm32(jmp_offset)) {
-				pr_err("unsupported bpf func %d addr %p image %p\n",
+				pr_err("unsupported BPF func %d addr %p image %p\n",
 				       imm32, func, image);
 				return -EINVAL;
 			}
@@ -1080,7 +1108,8 @@ common_load:
 						EMIT2_off32(0x81, 0xC6, imm32);
 				}
 			}
-			/* skb pointer is in R6 (%rbx), it will be copied into
+			/*
+			 * skb pointer is in R6 (%rbx), it will be copied into
 			 * %rdi if skb_copy_bits() call is necessary.
 			 * sk_load_* helpers also use %r10 and %r9d.
 			 * See bpf_jit.S
@@ -1111,7 +1140,7 @@ common_load:
 				goto emit_jmp;
 			}
 			seen_exit = true;
-			/* update cleanup_addr */
+			/* Update cleanup_addr */
 			ctx->cleanup_addr = proglen;
 			/* mov rbx, qword ptr [rbp+0] */
 			EMIT4(0x48, 0x8B, 0x5D, 0);
@@ -1129,10 +1158,11 @@ common_load:
 			break;
 
 		default:
-			/* By design x64 JIT should support all BPF instructions
+			/*
+			 * By design x86-64 JIT should support all BPF instructions.
 			 * This error will be seen if new instruction was added
-			 * to interpreter, but not to JIT
-			 * or if there is junk in bpf_prog
+			 * to the interpreter, but not to the JIT, or if there is
+			 * junk in bpf_prog.
 			 */
 			pr_err("bpf_jit: unknown opcode %02x\n", insn->code);
 			return -EINVAL;
@@ -1184,7 +1214,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 		return orig_prog;
 
 	tmp = bpf_jit_blind_constants(prog);
-	/* If blinding was requested and we failed during blinding,
+	/*
+	 * If blinding was requested and we failed during blinding,
 	 * we must fall back to the interpreter.
 	 */
 	if (IS_ERR(tmp))
@@ -1218,8 +1249,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 		goto out_addrs;
 	}
 
-	/* Before first pass, make a rough estimation of addrs[]
-	 * each bpf instruction is translated to less than 64 bytes
+	/*
+	 * Before first pass, make a rough estimation of addrs[]
+	 * each BPF instruction is translated to less than 64 bytes
 	 */
 	for (proglen = 0, i = 0; i < prog->len; i++) {
 		proglen += 64;
@@ -1228,10 +1260,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 	ctx.cleanup_addr = proglen;
 skip_init_addrs:
 
-	/* JITed image shrinks with every pass and the loop iterates
-	 * until the image stops shrinking. Very large bpf programs
+	/*
+	 * JITed image shrinks with every pass and the loop iterates
+	 * until the image stops shrinking. Very large BPF programs
 	 * may converge on the last pass. In such case do one more
-	 * pass to emit the final image
+	 * pass to emit the final image.
 	 */
 	for (pass = 0; pass < 20 || image; pass++) {
 		proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
-- 
cgit v1.2.3-59-g8ed1b


From 6c035ba7e73aba4536a1112f9a0901ab40aab460 Mon Sep 17 00:00:00 2001
From: Sean Tranchetti <stranche@codeaurora.org>
Date: Mon, 30 Apr 2018 18:01:02 -0600
Subject: udp: Complement partial checksum for GSO packet

Using the udp_v4_check() function to calculate the pseudo header
for the newly segmented UDP packets results in assigning the complement
of the value to the UDP header checksum field.

Always undo the complement the partial checksum value in order to
match the case where GSO is not used on the UDP transmit path.

Fixes: ee80d1ebe5ba ("udp: add udp gso")
Signed-off-by: Sean Tranchetti <stranche@codeaurora.org>
Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp_offload.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index f78fb3673472..006257092f06 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -223,6 +223,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 			csum_replace2(&uh->check, htons(mss),
 				      htons(seg->len - hdrlen - sizeof(*uh)));
 
+		uh->check = ~uh->check;
 		seg->destructor = sock_wfree;
 		seg->sk = sk;
 		sum_truesize += seg->truesize;
-- 
cgit v1.2.3-59-g8ed1b


From 795d8098d32b6bef3d0821588cb6e4b1f369a7a4 Mon Sep 17 00:00:00 2001
From: Raghu Vatsavayi <raghu.vatsavayi@cavium.com>
Date: Tue, 1 May 2018 10:32:10 -0700
Subject: liquidio VF: indicate that disabling rx vlan offload is not allowed

NIC firmware does not support disabling rx vlan offload, but the VF driver
incorrectly indicates that it is supported.  The PF driver already does the
correct indication by clearing the NETIF_F_HW_VLAN_CTAG_RX bit in its
netdev->hw_features.  So just do the same thing in the VF.

Signed-off-by: Raghu Vatsavayi <raghu.vatsavayi@cavium.com>
Acked-by: Prasad Kanneganti <prasad.kanneganti@cavium.com>
Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index 08b682b1c8ad..6295eeec795c 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -2100,6 +2100,7 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
 		netdev->features = (lio->dev_capability & ~NETIF_F_LRO);
 
 		netdev->hw_features = lio->dev_capability;
+		netdev->hw_features &= ~NETIF_F_HW_VLAN_CTAG_RX;
 
 		/* MTU range: 68 - 16000 */
 		netdev->min_mtu = LIO_MIN_MTU_SIZE;
-- 
cgit v1.2.3-59-g8ed1b


From 8ac60ffb9a83c8f0f6b3bab5c205d75a32914904 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 1 May 2018 14:01:30 -0700
Subject: net: stmmac: Avoid VLA usage

In the quest to remove all stack VLAs from the kernel[1], this switches
the "status" stack buffer to use the existing small (8) upper bound on
how many queues can be checked for DMA, and adds a sanity-check just to
make sure it doesn't operate under pathological conditions.

[1] http://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com

Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Jose Abreu <joabreu@synopsys.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 0135fd3aa6ef..b935478df55f 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -2022,7 +2022,11 @@ static void stmmac_dma_interrupt(struct stmmac_priv *priv)
 				tx_channel_count : rx_channel_count;
 	u32 chan;
 	bool poll_scheduled = false;
-	int status[channels_to_check];
+	int status[max_t(u32, MTL_MAX_TX_QUEUES, MTL_MAX_RX_QUEUES)];
+
+	/* Make sure we never check beyond our status buffer. */
+	if (WARN_ON_ONCE(channels_to_check > ARRAY_SIZE(status)))
+		channels_to_check = ARRAY_SIZE(status);
 
 	/* Each DMA channel can be used for rx and tx simultaneously, yet
 	 * napi_struct is embedded in struct stmmac_rx_queue rather than in a
-- 
cgit v1.2.3-59-g8ed1b


From 6f96674dbd8ca659769f1c65ca15638e50b69341 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Wed, 2 May 2018 14:20:24 +0100
Subject: bpf: relax constraints on formatting for eBPF helper documentation

The Python script used to parse and extract eBPF helpers documentation
from include/uapi/linux/bpf.h expects a very specific formatting for the
descriptions (single dot represents a space, '>' stands for a tab):

    /*
     ...
     *.int bpf_helper(list of arguments)
     *.>    Description
     *.>    >       Start of description
     *.>    >       Another line of description
     *.>    >       And yet another line of description
     *.>    Return
     *.>    >       0 on success, or a negative error in case of failure
     ...
     */

This is too strict, and painful for developers who wants to add
documentation for new helpers. Worse, it is extremely difficult to check
that the formatting is correct during reviews. Change the format
expected by the script and make it more flexible. The script now works
whether or not the initial space (right after the star) is present, and
accepts both tabs and white spaces (or a combination of both) for
indenting description sections and contents.

Concretely, something like the following would now be supported:

    /*
     ...
     *int bpf_helper(list of arguments)
     *......Description
     *.>    >       Start of description...
     *>     >       Another line of description
     *..............And yet another line of description
     *>     Return
     *.>    ........0 on success, or a negative error in case of failure
     ...
     */

While at it, remove unnecessary carets from each regex used with match()
in the script. They are redundant, as match() tries to match from the
beginning of the string by default.

v2: Remove unnecessary caret when a regex is used with match().

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 scripts/bpf_helpers_doc.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
index 30ba0fee36e4..8f59897fbda1 100755
--- a/scripts/bpf_helpers_doc.py
+++ b/scripts/bpf_helpers_doc.py
@@ -39,9 +39,9 @@ class Helper(object):
         Break down helper function protocol into smaller chunks: return type,
         name, distincts arguments.
         """
-        arg_re = re.compile('^((const )?(struct )?(\w+|...))( (\**)(\w+))?$')
+        arg_re = re.compile('((const )?(struct )?(\w+|...))( (\**)(\w+))?$')
         res = {}
-        proto_re = re.compile('^(.+) (\**)(\w+)\(((([^,]+)(, )?){1,5})\)$')
+        proto_re = re.compile('(.+) (\**)(\w+)\(((([^,]+)(, )?){1,5})\)$')
 
         capture = proto_re.match(self.proto)
         res['ret_type'] = capture.group(1)
@@ -87,7 +87,7 @@ class HeaderParser(object):
         #   - Same as above, with "const" and/or "struct" in front of type
         #   - "..." (undefined number of arguments, for bpf_trace_printk())
         # There is at least one term ("void"), and at most five arguments.
-        p = re.compile('^ \* ((.+) \**\w+\((((const )?(struct )?(\w+|\.\.\.)( \**\w+)?)(, )?){1,5}\))$')
+        p = re.compile(' \* ?((.+) \**\w+\((((const )?(struct )?(\w+|\.\.\.)( \**\w+)?)(, )?){1,5}\))$')
         capture = p.match(self.line)
         if not capture:
             raise NoHelperFound
@@ -95,7 +95,7 @@ class HeaderParser(object):
         return capture.group(1)
 
     def parse_desc(self):
-        p = re.compile('^ \* \tDescription$')
+        p = re.compile(' \* ?(?:\t| {6,8})Description$')
         capture = p.match(self.line)
         if not capture:
             # Helper can have empty description and we might be parsing another
@@ -109,7 +109,7 @@ class HeaderParser(object):
             if self.line == ' *\n':
                 desc += '\n'
             else:
-                p = re.compile('^ \* \t\t(.*)')
+                p = re.compile(' \* ?(?:\t| {6,8})(?:\t| {8})(.*)')
                 capture = p.match(self.line)
                 if capture:
                     desc += capture.group(1) + '\n'
@@ -118,7 +118,7 @@ class HeaderParser(object):
         return desc
 
     def parse_ret(self):
-        p = re.compile('^ \* \tReturn$')
+        p = re.compile(' \* ?(?:\t| {6,8})Return$')
         capture = p.match(self.line)
         if not capture:
             # Helper can have empty retval and we might be parsing another
@@ -132,7 +132,7 @@ class HeaderParser(object):
             if self.line == ' *\n':
                 ret += '\n'
             else:
-                p = re.compile('^ \* \t\t(.*)')
+                p = re.compile(' \* ?(?:\t| {6,8})(?:\t| {8})(.*)')
                 capture = p.match(self.line)
                 if capture:
                     ret += capture.group(1) + '\n'
-- 
cgit v1.2.3-59-g8ed1b


From 794451c1b57fa3871f51b62c0e430f44073e7cfc Mon Sep 17 00:00:00 2001
From: Ganesh Goudar <ganeshgr@chelsio.com>
Date: Wed, 2 May 2018 11:47:15 +0530
Subject: cxgb4: add new T5 device id's

Add device id's 0x5019, 0x501a and 0x501b for T5
cards.

Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h
index 51b18035d691..90b5274a8ba1 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h
@@ -145,6 +145,9 @@ CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN
 	CH_PCI_ID_TABLE_FENTRY(0x5016),	/* T580-OCP-SO */
 	CH_PCI_ID_TABLE_FENTRY(0x5017),	/* T520-OCP-SO */
 	CH_PCI_ID_TABLE_FENTRY(0x5018),	/* T540-BT */
+	CH_PCI_ID_TABLE_FENTRY(0x5019),	/* T540-LP-BT */
+	CH_PCI_ID_TABLE_FENTRY(0x501a),	/* T540-SO-BT */
+	CH_PCI_ID_TABLE_FENTRY(0x501b),	/* T540-SO-CR */
 	CH_PCI_ID_TABLE_FENTRY(0x5080),	/* Custom T540-cr */
 	CH_PCI_ID_TABLE_FENTRY(0x5081),	/* Custom T540-LL-cr */
 	CH_PCI_ID_TABLE_FENTRY(0x5082),	/* Custom T504-cr */
-- 
cgit v1.2.3-59-g8ed1b


From 6290182b2b3bda119ecb243fb3d6bcc98a74344f Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Wed, 2 May 2018 10:17:34 +0300
Subject: mlxsw: spectrum_router: Return an error for non-default FIB rules

Since commit 9776d32537d2 ("net: Move call_fib_rule_notifiers up in
fib_nl_newrule") it is possible to forbid the installation of
unsupported FIB rules.

Have mlxsw return an error for non-default FIB rules in addition to the
existing extack message.

Example:
# ip rule add from 198.51.100.1 table 10
Error: mlxsw_spectrum: FIB rules not supported.

Note that offload is only aborted when non-default FIB rules are already
installed and merely replayed during module initialization.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 8e4edb634b11..added380e344 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -5882,24 +5882,24 @@ static int mlxsw_sp_router_fib_rule_event(unsigned long event,
 	switch (info->family) {
 	case AF_INET:
 		if (!fib4_rule_default(rule) && !rule->l3mdev)
-			err = -1;
+			err = -EOPNOTSUPP;
 		break;
 	case AF_INET6:
 		if (!fib6_rule_default(rule) && !rule->l3mdev)
-			err = -1;
+			err = -EOPNOTSUPP;
 		break;
 	case RTNL_FAMILY_IPMR:
 		if (!ipmr_rule_default(rule) && !rule->l3mdev)
-			err = -1;
+			err = -EOPNOTSUPP;
 		break;
 	case RTNL_FAMILY_IP6MR:
 		if (!ip6mr_rule_default(rule) && !rule->l3mdev)
-			err = -1;
+			err = -EOPNOTSUPP;
 		break;
 	}
 
 	if (err < 0)
-		NL_SET_ERR_MSG_MOD(extack, "FIB rules not supported. Aborting offload");
+		NL_SET_ERR_MSG_MOD(extack, "FIB rules not supported");
 
 	return err;
 }
@@ -5926,8 +5926,8 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
 	case FIB_EVENT_RULE_DEL:
 		err = mlxsw_sp_router_fib_rule_event(event, info,
 						     router->mlxsw_sp);
-		if (!err)
-			return NOTIFY_DONE;
+		if (!err || info->extack)
+			return notifier_from_errno(err);
 	}
 
 	fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC);
-- 
cgit v1.2.3-59-g8ed1b


From 50d10711cf1b0bf82f274b0901f54e7ff030b740 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Wed, 2 May 2018 10:17:35 +0300
Subject: mlxsw: spectrum_router: Return an error for routes added after abort

We currently do not perform accounting in the driver and thus can't
reject routes before resources are exceeded.

However, in order to make users aware of the fact that routes are no
longer offloaded we can return an error for routes configured after the
abort mechanism was triggered.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index added380e344..8028d221aece 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -5928,6 +5928,13 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
 						     router->mlxsw_sp);
 		if (!err || info->extack)
 			return notifier_from_errno(err);
+		break;
+	case FIB_EVENT_ENTRY_ADD:
+		if (router->aborted) {
+			NL_SET_ERR_MSG_MOD(info->extack, "FIB offload was aborted. Not configuring route");
+			return notifier_from_errno(-EINVAL);
+		}
+		break;
 	}
 
 	fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC);
-- 
cgit v1.2.3-59-g8ed1b


From 877ae5be421de3173b1306113c3f88003ae798b3 Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Wed, 2 May 2018 16:56:44 +0200
Subject: net/smc: periodic testlink support

Add periodic LLC testlink support to ensure the link is still active.
The interval time is initialized using the value of
sysctl_tcp_keepalive_time.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c   |  6 ++++--
 net/smc/smc_core.c |  2 ++
 net/smc/smc_core.h |  4 ++++
 net/smc/smc_llc.c  | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 net/smc/smc_llc.h  |  3 +++
 net/smc/smc_wr.c   |  1 +
 6 files changed, 75 insertions(+), 3 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 20aa4175b9f8..961b8eff9553 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -294,6 +294,7 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
 
 static int smc_clnt_conf_first_link(struct smc_sock *smc)
 {
+	struct net *net = sock_net(smc->clcsock->sk);
 	struct smc_link_group *lgr = smc->conn.lgr;
 	struct smc_link *link;
 	int rest;
@@ -353,7 +354,7 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
 	if (rc < 0)
 		return SMC_CLC_DECL_TCL;
 
-	link->state = SMC_LNK_ACTIVE;
+	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
 
 	return 0;
 }
@@ -715,6 +716,7 @@ void smc_close_non_accepted(struct sock *sk)
 
 static int smc_serv_conf_first_link(struct smc_sock *smc)
 {
+	struct net *net = sock_net(smc->clcsock->sk);
 	struct smc_link_group *lgr = smc->conn.lgr;
 	struct smc_link *link;
 	int rest;
@@ -769,7 +771,7 @@ static int smc_serv_conf_first_link(struct smc_sock *smc)
 		return rc;
 	}
 
-	link->state = SMC_LNK_ACTIVE;
+	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
 
 	return 0;
 }
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index f44f6803f7ff..d9247765aff3 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -310,6 +310,7 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr)
 /* remove a link group */
 void smc_lgr_free(struct smc_link_group *lgr)
 {
+	smc_llc_link_flush(&lgr->lnk[SMC_SINGLE_LINK]);
 	smc_lgr_free_bufs(lgr);
 	smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
 	kfree(lgr);
@@ -332,6 +333,7 @@ void smc_lgr_terminate(struct smc_link_group *lgr)
 	struct rb_node *node;
 
 	smc_lgr_forget(lgr);
+	smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
 
 	write_lock_bh(&lgr->conns_lock);
 	node = rb_first(&lgr->conns_all);
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 07e2a393e6d9..97339f03ba79 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -79,6 +79,7 @@ struct smc_link {
 	dma_addr_t		wr_rx_dma_addr;	/* DMA address of wr_rx_bufs */
 	u64			wr_rx_id;	/* seq # of last recv WR */
 	u32			wr_rx_cnt;	/* number of WR recv buffers */
+	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
 
 	struct ib_reg_wr	wr_reg;		/* WR register memory region */
 	wait_queue_head_t	wr_reg_wait;	/* wait for wr_reg result */
@@ -101,6 +102,9 @@ struct smc_link {
 	int			llc_confirm_resp_rc; /* rc from conf_resp msg */
 	struct completion	llc_add;	/* wait for rx of add link */
 	struct completion	llc_add_resp;	/* wait for rx of add link rsp*/
+	struct delayed_work	llc_testlink_wrk; /* testlink worker */
+	struct completion	llc_testlink_resp; /* wait for rx of testlink */
+	int			llc_testlink_time; /* testlink interval */
 };
 
 /* For now we just allow one parallel link per link group. The SMC protocol
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index ea4b21981b4b..33b4d856f4c6 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -397,7 +397,8 @@ static void smc_llc_rx_test_link(struct smc_link *link,
 				 struct smc_llc_msg_test_link *llc)
 {
 	if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
-		/* unused as long as we don't send this type of msg */
+		if (link->state == SMC_LNK_ACTIVE)
+			complete(&link->llc_testlink_resp);
 	} else {
 		smc_llc_send_test_link(link, llc->user_data, SMC_LLC_RESP);
 	}
@@ -502,6 +503,65 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf)
 	}
 }
 
+/***************************** worker ****************************************/
+
+static void smc_llc_testlink_work(struct work_struct *work)
+{
+	struct smc_link *link = container_of(to_delayed_work(work),
+					     struct smc_link, llc_testlink_wrk);
+	unsigned long next_interval;
+	struct smc_link_group *lgr;
+	unsigned long expire_time;
+	u8 user_data[16] = { 0 };
+	int rc;
+
+	lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
+	if (link->state != SMC_LNK_ACTIVE)
+		return;		/* don't reschedule worker */
+	expire_time = link->wr_rx_tstamp + link->llc_testlink_time;
+	if (time_is_after_jiffies(expire_time)) {
+		next_interval = expire_time - jiffies;
+		goto out;
+	}
+	reinit_completion(&link->llc_testlink_resp);
+	smc_llc_send_test_link(link, user_data, SMC_LLC_REQ);
+	/* receive TEST LINK response over RoCE fabric */
+	rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp,
+						       SMC_LLC_WAIT_TIME);
+	if (rc <= 0) {
+		smc_lgr_terminate(lgr);
+		return;
+	}
+	next_interval = link->llc_testlink_time;
+out:
+	schedule_delayed_work(&link->llc_testlink_wrk, next_interval);
+}
+
+void smc_llc_link_active(struct smc_link *link, int testlink_time)
+{
+	init_completion(&link->llc_testlink_resp);
+	INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work);
+	link->state = SMC_LNK_ACTIVE;
+	if (testlink_time) {
+		link->llc_testlink_time = testlink_time * HZ;
+		schedule_delayed_work(&link->llc_testlink_wrk,
+				      link->llc_testlink_time);
+	}
+}
+
+/* called in tasklet context */
+void smc_llc_link_inactive(struct smc_link *link)
+{
+	link->state = SMC_LNK_INACTIVE;
+	cancel_delayed_work(&link->llc_testlink_wrk);
+}
+
+/* called in worker context */
+void smc_llc_link_flush(struct smc_link *link)
+{
+	cancel_delayed_work_sync(&link->llc_testlink_wrk);
+}
+
 /***************************** init, exit, misc ******************************/
 
 static struct smc_wr_rx_handler smc_llc_rx_handlers[] = {
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
index e4a7d5e234d5..d6e42116485e 100644
--- a/net/smc/smc_llc.h
+++ b/net/smc/smc_llc.h
@@ -44,6 +44,9 @@ int smc_llc_send_delete_link(struct smc_link *link,
 			     enum smc_llc_reqresp reqresp);
 int smc_llc_send_test_link(struct smc_link *lnk, u8 user_data[16],
 			   enum smc_llc_reqresp reqresp);
+void smc_llc_link_active(struct smc_link *link, int testlink_time);
+void smc_llc_link_inactive(struct smc_link *link);
+void smc_llc_link_flush(struct smc_link *link);
 int smc_llc_init(void) __init;
 
 #endif /* SMC_LLC_H */
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 1b8af23e6e2b..cc7c1bb60fe8 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -376,6 +376,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
 	for (i = 0; i < num; i++) {
 		link = wc[i].qp->qp_context;
 		if (wc[i].status == IB_WC_SUCCESS) {
+			link->wr_rx_tstamp = jiffies;
 			smc_wr_rx_demultiplex(&wc[i]);
 			smc_wr_rx_post(link); /* refill WR RX */
 		} else {
-- 
cgit v1.2.3-59-g8ed1b


From ed75986f4aaef7822f63bd40c2ce491bdd9c6dec Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Wed, 2 May 2018 16:56:45 +0200
Subject: net/smc: ipv6 support for smc_diag.c

Update smc_diag.c to support ipv6 addresses on the diagnosis interface.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_diag.c | 39 ++++++++++++++++++++++++++++++---------
 1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
index 427b91c1c964..05dd7e6d314d 100644
--- a/net/smc/smc_diag.c
+++ b/net/smc/smc_diag.c
@@ -38,17 +38,27 @@ static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk)
 {
 	struct smc_sock *smc = smc_sk(sk);
 
-	r->diag_family = sk->sk_family;
 	if (!smc->clcsock)
 		return;
 	r->id.idiag_sport = htons(smc->clcsock->sk->sk_num);
 	r->id.idiag_dport = smc->clcsock->sk->sk_dport;
 	r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if;
 	sock_diag_save_cookie(sk, r->id.idiag_cookie);
-	memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
-	memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
-	r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr;
-	r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr;
+	if (sk->sk_protocol == SMCPROTO_SMC) {
+		r->diag_family = PF_INET;
+		memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+		memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
+		r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr;
+		r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr;
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (sk->sk_protocol == SMCPROTO_SMC6) {
+		r->diag_family = PF_INET6;
+		memcpy(&r->id.idiag_src, &smc->clcsock->sk->sk_v6_rcv_saddr,
+		       sizeof(smc->clcsock->sk->sk_v6_rcv_saddr));
+		memcpy(&r->id.idiag_dst, &smc->clcsock->sk->sk_v6_daddr,
+		       sizeof(smc->clcsock->sk->sk_v6_daddr));
+#endif
+	}
 }
 
 static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
@@ -153,7 +163,8 @@ errout:
 	return -EMSGSIZE;
 }
 
-static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb,
+			       struct netlink_callback *cb)
 {
 	struct net *net = sock_net(skb->sk);
 	struct nlattr *bc = NULL;
@@ -161,8 +172,8 @@ static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	struct sock *sk;
 	int rc = 0;
 
-	read_lock(&smc_proto.h.smc_hash->lock);
-	head = &smc_proto.h.smc_hash->ht;
+	read_lock(&prot->h.smc_hash->lock);
+	head = &prot->h.smc_hash->ht;
 	if (hlist_empty(head))
 		goto out;
 
@@ -175,7 +186,17 @@ static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	}
 
 out:
-	read_unlock(&smc_proto.h.smc_hash->lock);
+	read_unlock(&prot->h.smc_hash->lock);
+	return rc;
+}
+
+static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	int rc = 0;
+
+	rc = smc_diag_dump_proto(&smc_proto, skb, cb);
+	if (!rc)
+		rc = smc_diag_dump_proto(&smc_proto6, skb, cb);
 	return rc;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 9b67e26f936cd40a551da98914ed56dc5606686b Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Wed, 2 May 2018 16:56:46 +0200
Subject: net/smc: handle ioctls SIOCINQ, SIOCOUTQ, and SIOCOUTQNSD

SIOCINQ returns the amount of unread data in the RMB.
SIOCOUTQ returns the amount of unsent or unacked sent data in the send
buffer.
SIOCOUTQNSD returns the amount of data prepared for sending, but
not yet sent.

Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c | 33 ++++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 961b8eff9553..823ea3371575 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -29,6 +29,7 @@
 #include <net/sock.h>
 #include <net/tcp.h>
 #include <net/smc.h>
+#include <asm/ioctls.h>
 
 #include "smc.h"
 #include "smc_clc.h"
@@ -1389,12 +1390,38 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd,
 		     unsigned long arg)
 {
 	struct smc_sock *smc;
+	int answ;
 
 	smc = smc_sk(sock->sk);
-	if (smc->use_fallback)
+	if (smc->use_fallback) {
+		if (!smc->clcsock)
+			return -EBADF;
 		return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
-	else
-		return sock_no_ioctl(sock, cmd, arg);
+	}
+	switch (cmd) {
+	case SIOCINQ: /* same as FIONREAD */
+		if (smc->sk.sk_state == SMC_LISTEN)
+			return -EINVAL;
+		answ = atomic_read(&smc->conn.bytes_to_rcv);
+		break;
+	case SIOCOUTQ:
+		/* output queue size (not send + not acked) */
+		if (smc->sk.sk_state == SMC_LISTEN)
+			return -EINVAL;
+		answ = smc->conn.sndbuf_size -
+					atomic_read(&smc->conn.sndbuf_space);
+		break;
+	case SIOCOUTQNSD:
+		/* output queue size (not send only) */
+		if (smc->sk.sk_state == SMC_LISTEN)
+			return -EINVAL;
+		answ = smc_tx_prepared_sends(&smc->conn);
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+
+	return put_user(answ, (int __user *)arg);
 }
 
 static ssize_t smc_sendpage(struct socket *sock, struct page *page,
-- 
cgit v1.2.3-59-g8ed1b


From cb9d43f6775457cac75544bc4197f26ac2b6f294 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Wed, 2 May 2018 16:56:47 +0200
Subject: net/smc: determine vlan_id of stacked net_device

An SMC link group is bound to a specific vlan_id. Its link uses
the RoCE-GIDs established for the specific vlan_id. This patch makes
sure the appropriate vlan_id is determined for stacked scenarios like
for instance a master bonding device with vlan devices enslaved.

Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_core.c | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index d9247765aff3..1f3ea62fac5c 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -360,7 +360,8 @@ void smc_lgr_terminate(struct smc_link_group *lgr)
 static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
 {
 	struct dst_entry *dst = sk_dst_get(clcsock->sk);
-	int rc = 0;
+	struct net_device *ndev;
+	int i, nest_lvl, rc = 0;
 
 	*vlan_id = 0;
 	if (!dst) {
@@ -372,8 +373,27 @@ static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
 		goto out_rel;
 	}
 
-	if (is_vlan_dev(dst->dev))
-		*vlan_id = vlan_dev_vlan_id(dst->dev);
+	ndev = dst->dev;
+	if (is_vlan_dev(ndev)) {
+		*vlan_id = vlan_dev_vlan_id(ndev);
+		goto out_rel;
+	}
+
+	rtnl_lock();
+	nest_lvl = dev_get_nest_level(ndev);
+	for (i = 0; i < nest_lvl; i++) {
+		struct list_head *lower = &ndev->adj_list.lower;
+
+		if (list_empty(lower))
+			break;
+		lower = lower->next;
+		ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower);
+		if (is_vlan_dev(ndev)) {
+			*vlan_id = vlan_dev_vlan_id(ndev);
+			break;
+		}
+	}
+	rtnl_unlock();
 
 out_rel:
 	dst_release(dst);
-- 
cgit v1.2.3-59-g8ed1b


From 6851d025e514089bf8d2ea80048d0c928f4bbf02 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 2 May 2018 21:39:35 +0200
Subject: r8169: remove unneeded check in r8168_pll_power_down

RTL_GIGA_MAC_VER_23/24 are configured by rtl_hw_start_8168cp_2()
and rtl_hw_start_8168cp_3() respectively which both apply
CPCMD_QUIRK_MASK, thus clearing bit ASF.

Bit ASF isn't set at any other place in the driver, therefore this
check can be removed.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 66f10d119b12..ce7843aab81d 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -4838,12 +4838,6 @@ static void r8168_pll_power_down(struct rtl8169_private *tp)
 	if (r8168_check_dash(tp))
 		return;
 
-	if ((tp->mac_version == RTL_GIGA_MAC_VER_23 ||
-	     tp->mac_version == RTL_GIGA_MAC_VER_24) &&
-	    (tp->cp_cmd & ASF)) {
-		return;
-	}
-
 	if (tp->mac_version == RTL_GIGA_MAC_VER_32 ||
 	    tp->mac_version == RTL_GIGA_MAC_VER_33)
 		rtl_ephy_write(tp, 0x19, 0xff64);
-- 
cgit v1.2.3-59-g8ed1b


From 40242e232e153dc4979721a89c286be613f2ae87 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 2 May 2018 21:39:40 +0200
Subject: r8169: remove 810x_phy_power_up/down

The functionality of 810x_phy_power_up/down is covered by the default
clause in 8168_phy_power_up/down. Therefore we don't need these
functions.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 98 ++++++++++++++++--------------------
 1 file changed, 43 insertions(+), 55 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index ce7843aab81d..58559a004850 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -4718,61 +4718,6 @@ static bool rtl_wol_pll_power_down(struct rtl8169_private *tp)
 	return true;
 }
 
-static void r810x_phy_power_down(struct rtl8169_private *tp)
-{
-	rtl_writephy(tp, 0x1f, 0x0000);
-	rtl_writephy(tp, MII_BMCR, BMCR_PDOWN);
-}
-
-static void r810x_phy_power_up(struct rtl8169_private *tp)
-{
-	rtl_writephy(tp, 0x1f, 0x0000);
-	rtl_writephy(tp, MII_BMCR, BMCR_ANENABLE);
-}
-
-static void r810x_pll_power_down(struct rtl8169_private *tp)
-{
-	if (rtl_wol_pll_power_down(tp))
-		return;
-
-	r810x_phy_power_down(tp);
-
-	switch (tp->mac_version) {
-	case RTL_GIGA_MAC_VER_07:
-	case RTL_GIGA_MAC_VER_08:
-	case RTL_GIGA_MAC_VER_09:
-	case RTL_GIGA_MAC_VER_10:
-	case RTL_GIGA_MAC_VER_13:
-	case RTL_GIGA_MAC_VER_16:
-		break;
-	default:
-		RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) & ~0x80);
-		break;
-	}
-}
-
-static void r810x_pll_power_up(struct rtl8169_private *tp)
-{
-	r810x_phy_power_up(tp);
-
-	switch (tp->mac_version) {
-	case RTL_GIGA_MAC_VER_07:
-	case RTL_GIGA_MAC_VER_08:
-	case RTL_GIGA_MAC_VER_09:
-	case RTL_GIGA_MAC_VER_10:
-	case RTL_GIGA_MAC_VER_13:
-	case RTL_GIGA_MAC_VER_16:
-		break;
-	case RTL_GIGA_MAC_VER_47:
-	case RTL_GIGA_MAC_VER_48:
-		RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) | 0xc0);
-		break;
-	default:
-		RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) | 0x80);
-		break;
-	}
-}
-
 static void r8168_phy_power_up(struct rtl8169_private *tp)
 {
 	rtl_writephy(tp, 0x1f, 0x0000);
@@ -4833,6 +4778,49 @@ static void r8168_phy_power_down(struct rtl8169_private *tp)
 	}
 }
 
+static void r810x_pll_power_down(struct rtl8169_private *tp)
+{
+	if (rtl_wol_pll_power_down(tp))
+		return;
+
+	r8168_phy_power_down(tp);
+
+	switch (tp->mac_version) {
+	case RTL_GIGA_MAC_VER_07:
+	case RTL_GIGA_MAC_VER_08:
+	case RTL_GIGA_MAC_VER_09:
+	case RTL_GIGA_MAC_VER_10:
+	case RTL_GIGA_MAC_VER_13:
+	case RTL_GIGA_MAC_VER_16:
+		break;
+	default:
+		RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) & ~0x80);
+		break;
+	}
+}
+
+static void r810x_pll_power_up(struct rtl8169_private *tp)
+{
+	r8168_phy_power_up(tp);
+
+	switch (tp->mac_version) {
+	case RTL_GIGA_MAC_VER_07:
+	case RTL_GIGA_MAC_VER_08:
+	case RTL_GIGA_MAC_VER_09:
+	case RTL_GIGA_MAC_VER_10:
+	case RTL_GIGA_MAC_VER_13:
+	case RTL_GIGA_MAC_VER_16:
+		break;
+	case RTL_GIGA_MAC_VER_47:
+	case RTL_GIGA_MAC_VER_48:
+		RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) | 0xc0);
+		break;
+	default:
+		RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) | 0x80);
+		break;
+	}
+}
+
 static void r8168_pll_power_down(struct rtl8169_private *tp)
 {
 	if (r8168_check_dash(tp))
-- 
cgit v1.2.3-59-g8ed1b


From 73570bf19faae9da0941ba4a10989dfe085119ec Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 2 May 2018 21:39:45 +0200
Subject: r8169: merge r810x_pll_power_down/up into r8168_pll_power_down/up

r810x_pll_power_down/up and r8168_pll_power_down/up have a lot in common,
so we can simplify the code by merging the former into the latter.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 61 ++++++++++--------------------------
 1 file changed, 16 insertions(+), 45 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 58559a004850..0585a9c6edcc 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -4778,49 +4778,6 @@ static void r8168_phy_power_down(struct rtl8169_private *tp)
 	}
 }
 
-static void r810x_pll_power_down(struct rtl8169_private *tp)
-{
-	if (rtl_wol_pll_power_down(tp))
-		return;
-
-	r8168_phy_power_down(tp);
-
-	switch (tp->mac_version) {
-	case RTL_GIGA_MAC_VER_07:
-	case RTL_GIGA_MAC_VER_08:
-	case RTL_GIGA_MAC_VER_09:
-	case RTL_GIGA_MAC_VER_10:
-	case RTL_GIGA_MAC_VER_13:
-	case RTL_GIGA_MAC_VER_16:
-		break;
-	default:
-		RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) & ~0x80);
-		break;
-	}
-}
-
-static void r810x_pll_power_up(struct rtl8169_private *tp)
-{
-	r8168_phy_power_up(tp);
-
-	switch (tp->mac_version) {
-	case RTL_GIGA_MAC_VER_07:
-	case RTL_GIGA_MAC_VER_08:
-	case RTL_GIGA_MAC_VER_09:
-	case RTL_GIGA_MAC_VER_10:
-	case RTL_GIGA_MAC_VER_13:
-	case RTL_GIGA_MAC_VER_16:
-		break;
-	case RTL_GIGA_MAC_VER_47:
-	case RTL_GIGA_MAC_VER_48:
-		RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) | 0xc0);
-		break;
-	default:
-		RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) | 0x80);
-		break;
-	}
-}
-
 static void r8168_pll_power_down(struct rtl8169_private *tp)
 {
 	if (r8168_check_dash(tp))
@@ -4840,12 +4797,19 @@ static void r8168_pll_power_down(struct rtl8169_private *tp)
 	case RTL_GIGA_MAC_VER_26:
 	case RTL_GIGA_MAC_VER_27:
 	case RTL_GIGA_MAC_VER_28:
+	case RTL_GIGA_MAC_VER_29:
+	case RTL_GIGA_MAC_VER_30:
 	case RTL_GIGA_MAC_VER_31:
 	case RTL_GIGA_MAC_VER_32:
 	case RTL_GIGA_MAC_VER_33:
+	case RTL_GIGA_MAC_VER_37:
+	case RTL_GIGA_MAC_VER_39:
+	case RTL_GIGA_MAC_VER_43:
 	case RTL_GIGA_MAC_VER_44:
 	case RTL_GIGA_MAC_VER_45:
 	case RTL_GIGA_MAC_VER_46:
+	case RTL_GIGA_MAC_VER_47:
+	case RTL_GIGA_MAC_VER_48:
 	case RTL_GIGA_MAC_VER_50:
 	case RTL_GIGA_MAC_VER_51:
 		RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) & ~0x80);
@@ -4867,14 +4831,21 @@ static void r8168_pll_power_up(struct rtl8169_private *tp)
 	case RTL_GIGA_MAC_VER_26:
 	case RTL_GIGA_MAC_VER_27:
 	case RTL_GIGA_MAC_VER_28:
+	case RTL_GIGA_MAC_VER_29:
+	case RTL_GIGA_MAC_VER_30:
 	case RTL_GIGA_MAC_VER_31:
 	case RTL_GIGA_MAC_VER_32:
 	case RTL_GIGA_MAC_VER_33:
+	case RTL_GIGA_MAC_VER_37:
+	case RTL_GIGA_MAC_VER_39:
+	case RTL_GIGA_MAC_VER_43:
 		RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) | 0x80);
 		break;
 	case RTL_GIGA_MAC_VER_44:
 	case RTL_GIGA_MAC_VER_45:
 	case RTL_GIGA_MAC_VER_46:
+	case RTL_GIGA_MAC_VER_47:
+	case RTL_GIGA_MAC_VER_48:
 	case RTL_GIGA_MAC_VER_50:
 	case RTL_GIGA_MAC_VER_51:
 		RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) | 0xc0);
@@ -4925,8 +4896,8 @@ static void rtl_init_pll_power_ops(struct rtl8169_private *tp)
 	case RTL_GIGA_MAC_VER_43:
 	case RTL_GIGA_MAC_VER_47:
 	case RTL_GIGA_MAC_VER_48:
-		ops->down	= r810x_pll_power_down;
-		ops->up		= r810x_pll_power_up;
+		ops->down	= r8168_pll_power_down;
+		ops->up		= r8168_pll_power_up;
 		break;
 
 	case RTL_GIGA_MAC_VER_11:
-- 
cgit v1.2.3-59-g8ed1b


From 4f447d296982e5776c4fbaf1cc56c5402b86ccb7 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 2 May 2018 21:39:47 +0200
Subject: r8169: drop member pll_power_ops from struct rtl8169_private

After merging r810x_pll_power_down/up and r8168_pll_power_down/up we
don't need member pll_power_ops any longer and can drop it, thus
simplifying the code.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 76 +++++-------------------------------
 1 file changed, 10 insertions(+), 66 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 0585a9c6edcc..3573d8256ff9 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -776,11 +776,6 @@ struct rtl8169_private {
 		int (*read)(struct rtl8169_private *, int);
 	} mdio_ops;
 
-	struct pll_power_ops {
-		void (*down)(struct rtl8169_private *);
-		void (*up)(struct rtl8169_private *);
-	} pll_power_ops;
-
 	struct jumbo_ops {
 		void (*enable)(struct rtl8169_private *);
 		void (*disable)(struct rtl8169_private *);
@@ -4871,73 +4866,23 @@ static void rtl_generic_op(struct rtl8169_private *tp,
 
 static void rtl_pll_power_down(struct rtl8169_private *tp)
 {
-	rtl_generic_op(tp, tp->pll_power_ops.down);
+	switch (tp->mac_version) {
+	case RTL_GIGA_MAC_VER_01 ... RTL_GIGA_MAC_VER_06:
+	case RTL_GIGA_MAC_VER_13 ... RTL_GIGA_MAC_VER_15:
+		break;
+	default:
+		r8168_pll_power_down(tp);
+	}
 }
 
 static void rtl_pll_power_up(struct rtl8169_private *tp)
 {
-	rtl_generic_op(tp, tp->pll_power_ops.up);
-}
-
-static void rtl_init_pll_power_ops(struct rtl8169_private *tp)
-{
-	struct pll_power_ops *ops = &tp->pll_power_ops;
-
 	switch (tp->mac_version) {
-	case RTL_GIGA_MAC_VER_07:
-	case RTL_GIGA_MAC_VER_08:
-	case RTL_GIGA_MAC_VER_09:
-	case RTL_GIGA_MAC_VER_10:
-	case RTL_GIGA_MAC_VER_16:
-	case RTL_GIGA_MAC_VER_29:
-	case RTL_GIGA_MAC_VER_30:
-	case RTL_GIGA_MAC_VER_37:
-	case RTL_GIGA_MAC_VER_39:
-	case RTL_GIGA_MAC_VER_43:
-	case RTL_GIGA_MAC_VER_47:
-	case RTL_GIGA_MAC_VER_48:
-		ops->down	= r8168_pll_power_down;
-		ops->up		= r8168_pll_power_up;
+	case RTL_GIGA_MAC_VER_01 ... RTL_GIGA_MAC_VER_06:
+	case RTL_GIGA_MAC_VER_13 ... RTL_GIGA_MAC_VER_15:
 		break;
-
-	case RTL_GIGA_MAC_VER_11:
-	case RTL_GIGA_MAC_VER_12:
-	case RTL_GIGA_MAC_VER_17:
-	case RTL_GIGA_MAC_VER_18:
-	case RTL_GIGA_MAC_VER_19:
-	case RTL_GIGA_MAC_VER_20:
-	case RTL_GIGA_MAC_VER_21:
-	case RTL_GIGA_MAC_VER_22:
-	case RTL_GIGA_MAC_VER_23:
-	case RTL_GIGA_MAC_VER_24:
-	case RTL_GIGA_MAC_VER_25:
-	case RTL_GIGA_MAC_VER_26:
-	case RTL_GIGA_MAC_VER_27:
-	case RTL_GIGA_MAC_VER_28:
-	case RTL_GIGA_MAC_VER_31:
-	case RTL_GIGA_MAC_VER_32:
-	case RTL_GIGA_MAC_VER_33:
-	case RTL_GIGA_MAC_VER_34:
-	case RTL_GIGA_MAC_VER_35:
-	case RTL_GIGA_MAC_VER_36:
-	case RTL_GIGA_MAC_VER_38:
-	case RTL_GIGA_MAC_VER_40:
-	case RTL_GIGA_MAC_VER_41:
-	case RTL_GIGA_MAC_VER_42:
-	case RTL_GIGA_MAC_VER_44:
-	case RTL_GIGA_MAC_VER_45:
-	case RTL_GIGA_MAC_VER_46:
-	case RTL_GIGA_MAC_VER_49:
-	case RTL_GIGA_MAC_VER_50:
-	case RTL_GIGA_MAC_VER_51:
-		ops->down	= r8168_pll_power_down;
-		ops->up		= r8168_pll_power_up;
-		break;
-
 	default:
-		ops->down	= NULL;
-		ops->up		= NULL;
-		break;
+		r8168_pll_power_up(tp);
 	}
 }
 
@@ -8027,7 +7972,6 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	pci_set_master(pdev);
 
 	rtl_init_mdio_ops(tp);
-	rtl_init_pll_power_ops(tp);
 	rtl_init_jumbo_ops(tp);
 	rtl_init_csi_ops(tp);
 
-- 
cgit v1.2.3-59-g8ed1b


From 2a71883c4fd73a5d235363e6f8db118532d2d001 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 2 May 2018 21:39:49 +0200
Subject: r8169: simplify code by using ranges in switch clauses

Several switch statements can be significantly simplified by using
case ranges.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 193 ++++-------------------------------
 1 file changed, 19 insertions(+), 174 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 3573d8256ff9..9c92b018b1a3 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -1611,23 +1611,8 @@ static u32 __rtl8169_get_wol(struct rtl8169_private *tp)
 	if (options & LinkUp)
 		wolopts |= WAKE_PHY;
 	switch (tp->mac_version) {
-	case RTL_GIGA_MAC_VER_34:
-	case RTL_GIGA_MAC_VER_35:
-	case RTL_GIGA_MAC_VER_36:
-	case RTL_GIGA_MAC_VER_37:
-	case RTL_GIGA_MAC_VER_38:
-	case RTL_GIGA_MAC_VER_40:
-	case RTL_GIGA_MAC_VER_41:
-	case RTL_GIGA_MAC_VER_42:
-	case RTL_GIGA_MAC_VER_43:
-	case RTL_GIGA_MAC_VER_44:
-	case RTL_GIGA_MAC_VER_45:
-	case RTL_GIGA_MAC_VER_46:
-	case RTL_GIGA_MAC_VER_47:
-	case RTL_GIGA_MAC_VER_48:
-	case RTL_GIGA_MAC_VER_49:
-	case RTL_GIGA_MAC_VER_50:
-	case RTL_GIGA_MAC_VER_51:
+	case RTL_GIGA_MAC_VER_34 ... RTL_GIGA_MAC_VER_38:
+	case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_51:
 		if (rtl_eri_read(tp, 0xdc, ERIAR_EXGMAC) & MagicPacket_v2)
 			wolopts |= WAKE_MAGIC;
 		break;
@@ -1688,23 +1673,8 @@ static void __rtl8169_set_wol(struct rtl8169_private *tp, u32 wolopts)
 	RTL_W8(tp, Cfg9346, Cfg9346_Unlock);
 
 	switch (tp->mac_version) {
-	case RTL_GIGA_MAC_VER_34:
-	case RTL_GIGA_MAC_VER_35:
-	case RTL_GIGA_MAC_VER_36:
-	case RTL_GIGA_MAC_VER_37:
-	case RTL_GIGA_MAC_VER_38:
-	case RTL_GIGA_MAC_VER_40:
-	case RTL_GIGA_MAC_VER_41:
-	case RTL_GIGA_MAC_VER_42:
-	case RTL_GIGA_MAC_VER_43:
-	case RTL_GIGA_MAC_VER_44:
-	case RTL_GIGA_MAC_VER_45:
-	case RTL_GIGA_MAC_VER_46:
-	case RTL_GIGA_MAC_VER_47:
-	case RTL_GIGA_MAC_VER_48:
-	case RTL_GIGA_MAC_VER_49:
-	case RTL_GIGA_MAC_VER_50:
-	case RTL_GIGA_MAC_VER_51:
+	case RTL_GIGA_MAC_VER_34 ... RTL_GIGA_MAC_VER_38:
+	case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_51:
 		tmp = ARRAY_SIZE(cfg) - 1;
 		if (wolopts & WAKE_MAGIC)
 			rtl_w0w1_eri(tp,
@@ -4623,18 +4593,7 @@ static void rtl_init_mdio_ops(struct rtl8169_private *tp)
 		ops->write	= r8168dp_2_mdio_write;
 		ops->read	= r8168dp_2_mdio_read;
 		break;
-	case RTL_GIGA_MAC_VER_40:
-	case RTL_GIGA_MAC_VER_41:
-	case RTL_GIGA_MAC_VER_42:
-	case RTL_GIGA_MAC_VER_43:
-	case RTL_GIGA_MAC_VER_44:
-	case RTL_GIGA_MAC_VER_45:
-	case RTL_GIGA_MAC_VER_46:
-	case RTL_GIGA_MAC_VER_47:
-	case RTL_GIGA_MAC_VER_48:
-	case RTL_GIGA_MAC_VER_49:
-	case RTL_GIGA_MAC_VER_50:
-	case RTL_GIGA_MAC_VER_51:
+	case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_51:
 		ops->write	= r8168g_mdio_write;
 		ops->read	= r8168g_mdio_read;
 		break;
@@ -4679,21 +4638,7 @@ static void rtl_wol_suspend_quirk(struct rtl8169_private *tp)
 	case RTL_GIGA_MAC_VER_32:
 	case RTL_GIGA_MAC_VER_33:
 	case RTL_GIGA_MAC_VER_34:
-	case RTL_GIGA_MAC_VER_37:
-	case RTL_GIGA_MAC_VER_38:
-	case RTL_GIGA_MAC_VER_39:
-	case RTL_GIGA_MAC_VER_40:
-	case RTL_GIGA_MAC_VER_41:
-	case RTL_GIGA_MAC_VER_42:
-	case RTL_GIGA_MAC_VER_43:
-	case RTL_GIGA_MAC_VER_44:
-	case RTL_GIGA_MAC_VER_45:
-	case RTL_GIGA_MAC_VER_46:
-	case RTL_GIGA_MAC_VER_47:
-	case RTL_GIGA_MAC_VER_48:
-	case RTL_GIGA_MAC_VER_49:
-	case RTL_GIGA_MAC_VER_50:
-	case RTL_GIGA_MAC_VER_51:
+	case RTL_GIGA_MAC_VER_37 ... RTL_GIGA_MAC_VER_51:
 		RTL_W32(tp, RxConfig, RTL_R32(tp, RxConfig) |
 			AcceptBroadcast | AcceptMulticast | AcceptMyPhys);
 		break;
@@ -4719,18 +4664,7 @@ static void r8168_phy_power_up(struct rtl8169_private *tp)
 	switch (tp->mac_version) {
 	case RTL_GIGA_MAC_VER_11:
 	case RTL_GIGA_MAC_VER_12:
-	case RTL_GIGA_MAC_VER_17:
-	case RTL_GIGA_MAC_VER_18:
-	case RTL_GIGA_MAC_VER_19:
-	case RTL_GIGA_MAC_VER_20:
-	case RTL_GIGA_MAC_VER_21:
-	case RTL_GIGA_MAC_VER_22:
-	case RTL_GIGA_MAC_VER_23:
-	case RTL_GIGA_MAC_VER_24:
-	case RTL_GIGA_MAC_VER_25:
-	case RTL_GIGA_MAC_VER_26:
-	case RTL_GIGA_MAC_VER_27:
-	case RTL_GIGA_MAC_VER_28:
+	case RTL_GIGA_MAC_VER_17 ... RTL_GIGA_MAC_VER_28:
 	case RTL_GIGA_MAC_VER_31:
 		rtl_writephy(tp, 0x0e, 0x0000);
 		break;
@@ -4753,18 +4687,7 @@ static void r8168_phy_power_down(struct rtl8169_private *tp)
 
 	case RTL_GIGA_MAC_VER_11:
 	case RTL_GIGA_MAC_VER_12:
-	case RTL_GIGA_MAC_VER_17:
-	case RTL_GIGA_MAC_VER_18:
-	case RTL_GIGA_MAC_VER_19:
-	case RTL_GIGA_MAC_VER_20:
-	case RTL_GIGA_MAC_VER_21:
-	case RTL_GIGA_MAC_VER_22:
-	case RTL_GIGA_MAC_VER_23:
-	case RTL_GIGA_MAC_VER_24:
-	case RTL_GIGA_MAC_VER_25:
-	case RTL_GIGA_MAC_VER_26:
-	case RTL_GIGA_MAC_VER_27:
-	case RTL_GIGA_MAC_VER_28:
+	case RTL_GIGA_MAC_VER_17 ... RTL_GIGA_MAC_VER_28:
 	case RTL_GIGA_MAC_VER_31:
 		rtl_writephy(tp, 0x0e, 0x0200);
 	default:
@@ -4788,15 +4711,7 @@ static void r8168_pll_power_down(struct rtl8169_private *tp)
 	r8168_phy_power_down(tp);
 
 	switch (tp->mac_version) {
-	case RTL_GIGA_MAC_VER_25:
-	case RTL_GIGA_MAC_VER_26:
-	case RTL_GIGA_MAC_VER_27:
-	case RTL_GIGA_MAC_VER_28:
-	case RTL_GIGA_MAC_VER_29:
-	case RTL_GIGA_MAC_VER_30:
-	case RTL_GIGA_MAC_VER_31:
-	case RTL_GIGA_MAC_VER_32:
-	case RTL_GIGA_MAC_VER_33:
+	case RTL_GIGA_MAC_VER_25 ... RTL_GIGA_MAC_VER_33:
 	case RTL_GIGA_MAC_VER_37:
 	case RTL_GIGA_MAC_VER_39:
 	case RTL_GIGA_MAC_VER_43:
@@ -4822,15 +4737,7 @@ static void r8168_pll_power_down(struct rtl8169_private *tp)
 static void r8168_pll_power_up(struct rtl8169_private *tp)
 {
 	switch (tp->mac_version) {
-	case RTL_GIGA_MAC_VER_25:
-	case RTL_GIGA_MAC_VER_26:
-	case RTL_GIGA_MAC_VER_27:
-	case RTL_GIGA_MAC_VER_28:
-	case RTL_GIGA_MAC_VER_29:
-	case RTL_GIGA_MAC_VER_30:
-	case RTL_GIGA_MAC_VER_31:
-	case RTL_GIGA_MAC_VER_32:
-	case RTL_GIGA_MAC_VER_33:
+	case RTL_GIGA_MAC_VER_25 ... RTL_GIGA_MAC_VER_33:
 	case RTL_GIGA_MAC_VER_37:
 	case RTL_GIGA_MAC_VER_39:
 	case RTL_GIGA_MAC_VER_43:
@@ -4889,45 +4796,16 @@ static void rtl_pll_power_up(struct rtl8169_private *tp)
 static void rtl_init_rxcfg(struct rtl8169_private *tp)
 {
 	switch (tp->mac_version) {
-	case RTL_GIGA_MAC_VER_01:
-	case RTL_GIGA_MAC_VER_02:
-	case RTL_GIGA_MAC_VER_03:
-	case RTL_GIGA_MAC_VER_04:
-	case RTL_GIGA_MAC_VER_05:
-	case RTL_GIGA_MAC_VER_06:
-	case RTL_GIGA_MAC_VER_10:
-	case RTL_GIGA_MAC_VER_11:
-	case RTL_GIGA_MAC_VER_12:
-	case RTL_GIGA_MAC_VER_13:
-	case RTL_GIGA_MAC_VER_14:
-	case RTL_GIGA_MAC_VER_15:
-	case RTL_GIGA_MAC_VER_16:
-	case RTL_GIGA_MAC_VER_17:
+	case RTL_GIGA_MAC_VER_01 ... RTL_GIGA_MAC_VER_06:
+	case RTL_GIGA_MAC_VER_10 ... RTL_GIGA_MAC_VER_17:
 		RTL_W32(tp, RxConfig, RX_FIFO_THRESH | RX_DMA_BURST);
 		break;
-	case RTL_GIGA_MAC_VER_18:
-	case RTL_GIGA_MAC_VER_19:
-	case RTL_GIGA_MAC_VER_20:
-	case RTL_GIGA_MAC_VER_21:
-	case RTL_GIGA_MAC_VER_22:
-	case RTL_GIGA_MAC_VER_23:
-	case RTL_GIGA_MAC_VER_24:
+	case RTL_GIGA_MAC_VER_18 ... RTL_GIGA_MAC_VER_24:
 	case RTL_GIGA_MAC_VER_34:
 	case RTL_GIGA_MAC_VER_35:
 		RTL_W32(tp, RxConfig, RX128_INT_EN | RX_MULTI_EN | RX_DMA_BURST);
 		break;
-	case RTL_GIGA_MAC_VER_40:
-	case RTL_GIGA_MAC_VER_41:
-	case RTL_GIGA_MAC_VER_42:
-	case RTL_GIGA_MAC_VER_43:
-	case RTL_GIGA_MAC_VER_44:
-	case RTL_GIGA_MAC_VER_45:
-	case RTL_GIGA_MAC_VER_46:
-	case RTL_GIGA_MAC_VER_47:
-	case RTL_GIGA_MAC_VER_48:
-	case RTL_GIGA_MAC_VER_49:
-	case RTL_GIGA_MAC_VER_50:
-	case RTL_GIGA_MAC_VER_51:
+	case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_51:
 		RTL_W32(tp, RxConfig, RX128_INT_EN | RX_MULTI_EN | RX_DMA_BURST | RX_EARLY_OFF);
 		break;
 	default:
@@ -5064,18 +4942,7 @@ static void rtl_init_jumbo_ops(struct rtl8169_private *tp)
 	 * No action needed for jumbo frames with 8169.
 	 * No jumbo for 810x at all.
 	 */
-	case RTL_GIGA_MAC_VER_40:
-	case RTL_GIGA_MAC_VER_41:
-	case RTL_GIGA_MAC_VER_42:
-	case RTL_GIGA_MAC_VER_43:
-	case RTL_GIGA_MAC_VER_44:
-	case RTL_GIGA_MAC_VER_45:
-	case RTL_GIGA_MAC_VER_46:
-	case RTL_GIGA_MAC_VER_47:
-	case RTL_GIGA_MAC_VER_48:
-	case RTL_GIGA_MAC_VER_49:
-	case RTL_GIGA_MAC_VER_50:
-	case RTL_GIGA_MAC_VER_51:
+	case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_51:
 	default:
 		ops->disable	= NULL;
 		ops->enable	= NULL;
@@ -5438,20 +5305,8 @@ static void rtl_init_csi_ops(struct rtl8169_private *tp)
 	struct csi_ops *ops = &tp->csi_ops;
 
 	switch (tp->mac_version) {
-	case RTL_GIGA_MAC_VER_01:
-	case RTL_GIGA_MAC_VER_02:
-	case RTL_GIGA_MAC_VER_03:
-	case RTL_GIGA_MAC_VER_04:
-	case RTL_GIGA_MAC_VER_05:
-	case RTL_GIGA_MAC_VER_06:
-	case RTL_GIGA_MAC_VER_10:
-	case RTL_GIGA_MAC_VER_11:
-	case RTL_GIGA_MAC_VER_12:
-	case RTL_GIGA_MAC_VER_13:
-	case RTL_GIGA_MAC_VER_14:
-	case RTL_GIGA_MAC_VER_15:
-	case RTL_GIGA_MAC_VER_16:
-	case RTL_GIGA_MAC_VER_17:
+	case RTL_GIGA_MAC_VER_01 ... RTL_GIGA_MAC_VER_06:
+	case RTL_GIGA_MAC_VER_10 ... RTL_GIGA_MAC_VER_17:
 		ops->write	= NULL;
 		ops->read	= NULL;
 		break;
@@ -7843,20 +7698,10 @@ static void rtl_hw_init_8168ep(struct rtl8169_private *tp)
 static void rtl_hw_initialize(struct rtl8169_private *tp)
 {
 	switch (tp->mac_version) {
-	case RTL_GIGA_MAC_VER_40:
-	case RTL_GIGA_MAC_VER_41:
-	case RTL_GIGA_MAC_VER_42:
-	case RTL_GIGA_MAC_VER_43:
-	case RTL_GIGA_MAC_VER_44:
-	case RTL_GIGA_MAC_VER_45:
-	case RTL_GIGA_MAC_VER_46:
-	case RTL_GIGA_MAC_VER_47:
-	case RTL_GIGA_MAC_VER_48:
+	case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_48:
 		rtl_hw_init_8168g(tp);
 		break;
-	case RTL_GIGA_MAC_VER_49:
-	case RTL_GIGA_MAC_VER_50:
-	case RTL_GIGA_MAC_VER_51:
+	case RTL_GIGA_MAC_VER_49 ... RTL_GIGA_MAC_VER_51:
 		rtl_hw_init_8168ep(tp);
 		break;
 	default:
-- 
cgit v1.2.3-59-g8ed1b


From b2d43e6ecf6630e39f35573dce0ff9ae5af9e1a8 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 2 May 2018 21:39:52 +0200
Subject: r8169: replace longer if statements with switch statements

Some longer if statements can be simplified by using switch
statements instead.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 54 +++++++++++-------------------------
 1 file changed, 16 insertions(+), 38 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 9c92b018b1a3..41f9b5c2bc0d 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -5028,32 +5028,21 @@ static void rtl8169_hw_reset(struct rtl8169_private *tp)
 
 	rtl_rx_close(tp);
 
-	if (tp->mac_version == RTL_GIGA_MAC_VER_27 ||
-	    tp->mac_version == RTL_GIGA_MAC_VER_28 ||
-	    tp->mac_version == RTL_GIGA_MAC_VER_31) {
+	switch (tp->mac_version) {
+	case RTL_GIGA_MAC_VER_27:
+	case RTL_GIGA_MAC_VER_28:
+	case RTL_GIGA_MAC_VER_31:
 		rtl_udelay_loop_wait_low(tp, &rtl_npq_cond, 20, 42*42);
-	} else if (tp->mac_version == RTL_GIGA_MAC_VER_34 ||
-		   tp->mac_version == RTL_GIGA_MAC_VER_35 ||
-		   tp->mac_version == RTL_GIGA_MAC_VER_36 ||
-		   tp->mac_version == RTL_GIGA_MAC_VER_37 ||
-		   tp->mac_version == RTL_GIGA_MAC_VER_38 ||
-		   tp->mac_version == RTL_GIGA_MAC_VER_40 ||
-		   tp->mac_version == RTL_GIGA_MAC_VER_41 ||
-		   tp->mac_version == RTL_GIGA_MAC_VER_42 ||
-		   tp->mac_version == RTL_GIGA_MAC_VER_43 ||
-		   tp->mac_version == RTL_GIGA_MAC_VER_44 ||
-		   tp->mac_version == RTL_GIGA_MAC_VER_45 ||
-		   tp->mac_version == RTL_GIGA_MAC_VER_46 ||
-		   tp->mac_version == RTL_GIGA_MAC_VER_47 ||
-		   tp->mac_version == RTL_GIGA_MAC_VER_48 ||
-		   tp->mac_version == RTL_GIGA_MAC_VER_49 ||
-		   tp->mac_version == RTL_GIGA_MAC_VER_50 ||
-		   tp->mac_version == RTL_GIGA_MAC_VER_51) {
+		break;
+	case RTL_GIGA_MAC_VER_34 ... RTL_GIGA_MAC_VER_38:
+	case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_51:
 		RTL_W8(tp, ChipCmd, RTL_R8(tp, ChipCmd) | StopReq);
 		rtl_udelay_loop_wait_high(tp, &rtl_txcfg_empty_cond, 100, 666);
-	} else {
+		break;
+	default:
 		RTL_W8(tp, ChipCmd, RTL_R8(tp, ChipCmd) | StopReq);
 		udelay(100);
+		break;
 	}
 
 	rtl_hw_reset(tp);
@@ -7854,29 +7843,18 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	u64_stats_init(&tp->tx_stats.syncp);
 
 	/* Get MAC address */
-	if (tp->mac_version == RTL_GIGA_MAC_VER_35 ||
-	    tp->mac_version == RTL_GIGA_MAC_VER_36 ||
-	    tp->mac_version == RTL_GIGA_MAC_VER_37 ||
-	    tp->mac_version == RTL_GIGA_MAC_VER_38 ||
-	    tp->mac_version == RTL_GIGA_MAC_VER_40 ||
-	    tp->mac_version == RTL_GIGA_MAC_VER_41 ||
-	    tp->mac_version == RTL_GIGA_MAC_VER_42 ||
-	    tp->mac_version == RTL_GIGA_MAC_VER_43 ||
-	    tp->mac_version == RTL_GIGA_MAC_VER_44 ||
-	    tp->mac_version == RTL_GIGA_MAC_VER_45 ||
-	    tp->mac_version == RTL_GIGA_MAC_VER_46 ||
-	    tp->mac_version == RTL_GIGA_MAC_VER_47 ||
-	    tp->mac_version == RTL_GIGA_MAC_VER_48 ||
-	    tp->mac_version == RTL_GIGA_MAC_VER_49 ||
-	    tp->mac_version == RTL_GIGA_MAC_VER_50 ||
-	    tp->mac_version == RTL_GIGA_MAC_VER_51) {
+	switch (tp->mac_version) {
 		u16 mac_addr[3];
-
+	case RTL_GIGA_MAC_VER_35 ... RTL_GIGA_MAC_VER_38:
+	case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_51:
 		*(u32 *)&mac_addr[0] = rtl_eri_read(tp, 0xe0, ERIAR_EXGMAC);
 		*(u16 *)&mac_addr[2] = rtl_eri_read(tp, 0xe4, ERIAR_EXGMAC);
 
 		if (is_valid_ether_addr((u8 *)mac_addr))
 			rtl_rar_set(tp, (u8 *)mac_addr);
+		break;
+	default:
+		break;
 	}
 	for (i = 0; i < ETH_ALEN; i++)
 		dev->dev_addr[i] = RTL_R8(tp, MAC0 + i);
-- 
cgit v1.2.3-59-g8ed1b


From eda40b8cbe8e4b099233ce30535cb4c835896637 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 2 May 2018 21:39:54 +0200
Subject: r8169: drop rtl_generic_op

Only two places are left where rtl_generic_op() is used, so we can
inline it and simplify the code a little.
This change also avoids the overhead of unlocking/locking in case
the respective operation isn't set.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 41f9b5c2bc0d..7703503e61ff 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -4764,13 +4764,6 @@ static void r8168_pll_power_up(struct rtl8169_private *tp)
 	r8168_phy_power_up(tp);
 }
 
-static void rtl_generic_op(struct rtl8169_private *tp,
-			   void (*op)(struct rtl8169_private *))
-{
-	if (op)
-		op(tp);
-}
-
 static void rtl_pll_power_down(struct rtl8169_private *tp)
 {
 	switch (tp->mac_version) {
@@ -4821,16 +4814,20 @@ static void rtl8169_init_ring_indexes(struct rtl8169_private *tp)
 
 static void rtl_hw_jumbo_enable(struct rtl8169_private *tp)
 {
-	RTL_W8(tp, Cfg9346, Cfg9346_Unlock);
-	rtl_generic_op(tp, tp->jumbo_ops.enable);
-	RTL_W8(tp, Cfg9346, Cfg9346_Lock);
+	if (tp->jumbo_ops.enable) {
+		RTL_W8(tp, Cfg9346, Cfg9346_Unlock);
+		tp->jumbo_ops.enable(tp);
+		RTL_W8(tp, Cfg9346, Cfg9346_Lock);
+	}
 }
 
 static void rtl_hw_jumbo_disable(struct rtl8169_private *tp)
 {
-	RTL_W8(tp, Cfg9346, Cfg9346_Unlock);
-	rtl_generic_op(tp, tp->jumbo_ops.disable);
-	RTL_W8(tp, Cfg9346, Cfg9346_Lock);
+	if (tp->jumbo_ops.disable) {
+		RTL_W8(tp, Cfg9346, Cfg9346_Unlock);
+		tp->jumbo_ops.disable(tp);
+		RTL_W8(tp, Cfg9346, Cfg9346_Lock);
+	}
 }
 
 static void r8168c_hw_jumbo_enable(struct rtl8169_private *tp)
-- 
cgit v1.2.3-59-g8ed1b


From ff1d733155da3fdba29e2ba99be428da86070989 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 2 May 2018 21:39:56 +0200
Subject: r8169: improve PCI config space access

Some chips have a non-zero function id, however instead of hardcoding
the id's (CSIAR_FUNC_NIC and CSIAR_FUNC_NIC2) we can get them
dynamically via PCI_FUNC(pci_dev->devfn). This way we can get rid
of the csi_ops.

In general csi is just a fallback mechanism for PCI config space
access in case no native access is supported. Therefore let's
try native access first.

I checked with Realtek regarding the functionality of config space
byte 0x070f and according to them it controls the L0s/L1
entrance latency.
Currently ASPM is disabled in general and therefore this value
isn't used. However we may introduce a whitelist for chips
where ASPM is known to work, therefore let's keep this code.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 132 ++++++++---------------------------
 1 file changed, 29 insertions(+), 103 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 7703503e61ff..d72b3fdf853a 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -410,13 +410,8 @@ enum rtl8168_8101_registers {
 	CSIAR			= 0x68,
 #define	CSIAR_FLAG			0x80000000
 #define	CSIAR_WRITE_CMD			0x80000000
-#define	CSIAR_BYTE_ENABLE		0x0f
-#define	CSIAR_BYTE_ENABLE_SHIFT		12
-#define	CSIAR_ADDR_MASK			0x0fff
-#define CSIAR_FUNC_CARD			0x00000000
-#define CSIAR_FUNC_SDIO			0x00010000
-#define CSIAR_FUNC_NIC			0x00020000
-#define CSIAR_FUNC_NIC2			0x00010000
+#define	CSIAR_BYTE_ENABLE		0x0000f000
+#define	CSIAR_ADDR_MASK			0x00000fff
 	PMCH			= 0x6f,
 	EPHYAR			= 0x80,
 #define	EPHYAR_FLAG			0x80000000
@@ -781,11 +776,6 @@ struct rtl8169_private {
 		void (*disable)(struct rtl8169_private *);
 	} jumbo_ops;
 
-	struct csi_ops {
-		void (*write)(struct rtl8169_private *, int, int);
-		u32 (*read)(struct rtl8169_private *, int);
-	} csi_ops;
-
 	int (*set_speed)(struct net_device *, u8 aneg, u16 sp, u8 dpx, u32 adv);
 	int (*get_link_ksettings)(struct net_device *,
 				  struct ethtool_link_ksettings *);
@@ -5196,123 +5186,60 @@ static void rtl_hw_start_8169(struct rtl8169_private *tp)
 	RTL_W32(tp, RxMissed, 0);
 }
 
-static void rtl_csi_write(struct rtl8169_private *tp, int addr, int value)
-{
-	if (tp->csi_ops.write)
-		tp->csi_ops.write(tp, addr, value);
-}
-
-static u32 rtl_csi_read(struct rtl8169_private *tp, int addr)
-{
-	return tp->csi_ops.read ? tp->csi_ops.read(tp, addr) : ~0;
-}
-
-static void rtl_csi_access_enable(struct rtl8169_private *tp, u32 bits)
-{
-	u32 csi;
-
-	csi = rtl_csi_read(tp, 0x070c) & 0x00ffffff;
-	rtl_csi_write(tp, 0x070c, csi | bits);
-}
-
-static void rtl_csi_access_enable_1(struct rtl8169_private *tp)
-{
-	rtl_csi_access_enable(tp, 0x17000000);
-}
-
-static void rtl_csi_access_enable_2(struct rtl8169_private *tp)
-{
-	rtl_csi_access_enable(tp, 0x27000000);
-}
-
 DECLARE_RTL_COND(rtl_csiar_cond)
 {
 	return RTL_R32(tp, CSIAR) & CSIAR_FLAG;
 }
 
-static void r8169_csi_write(struct rtl8169_private *tp, int addr, int value)
-{
-	RTL_W32(tp, CSIDR, value);
-	RTL_W32(tp, CSIAR, CSIAR_WRITE_CMD | (addr & CSIAR_ADDR_MASK) |
-		CSIAR_BYTE_ENABLE << CSIAR_BYTE_ENABLE_SHIFT);
-
-	rtl_udelay_loop_wait_low(tp, &rtl_csiar_cond, 10, 100);
-}
-
-static u32 r8169_csi_read(struct rtl8169_private *tp, int addr)
+static void rtl_csi_write(struct rtl8169_private *tp, int addr, int value)
 {
-	RTL_W32(tp, CSIAR, (addr & CSIAR_ADDR_MASK) |
-		CSIAR_BYTE_ENABLE << CSIAR_BYTE_ENABLE_SHIFT);
+	u32 func = PCI_FUNC(tp->pci_dev->devfn);
 
-	return rtl_udelay_loop_wait_high(tp, &rtl_csiar_cond, 10, 100) ?
-		RTL_R32(tp, CSIDR) : ~0;
-}
-
-static void r8402_csi_write(struct rtl8169_private *tp, int addr, int value)
-{
 	RTL_W32(tp, CSIDR, value);
 	RTL_W32(tp, CSIAR, CSIAR_WRITE_CMD | (addr & CSIAR_ADDR_MASK) |
-		CSIAR_BYTE_ENABLE << CSIAR_BYTE_ENABLE_SHIFT |
-		CSIAR_FUNC_NIC);
+		CSIAR_BYTE_ENABLE | func << 16);
 
 	rtl_udelay_loop_wait_low(tp, &rtl_csiar_cond, 10, 100);
 }
 
-static u32 r8402_csi_read(struct rtl8169_private *tp, int addr)
+static u32 rtl_csi_read(struct rtl8169_private *tp, int addr)
 {
-	RTL_W32(tp, CSIAR, (addr & CSIAR_ADDR_MASK) | CSIAR_FUNC_NIC |
-		CSIAR_BYTE_ENABLE << CSIAR_BYTE_ENABLE_SHIFT);
+	u32 func = PCI_FUNC(tp->pci_dev->devfn);
+
+	RTL_W32(tp, CSIAR, (addr & CSIAR_ADDR_MASK) | func << 16 |
+		CSIAR_BYTE_ENABLE);
 
 	return rtl_udelay_loop_wait_high(tp, &rtl_csiar_cond, 10, 100) ?
 		RTL_R32(tp, CSIDR) : ~0;
 }
 
-static void r8411_csi_write(struct rtl8169_private *tp, int addr, int value)
+static void rtl_csi_access_enable(struct rtl8169_private *tp, u8 val)
 {
-	RTL_W32(tp, CSIDR, value);
-	RTL_W32(tp, CSIAR, CSIAR_WRITE_CMD | (addr & CSIAR_ADDR_MASK) |
-		CSIAR_BYTE_ENABLE << CSIAR_BYTE_ENABLE_SHIFT |
-		CSIAR_FUNC_NIC2);
+	struct pci_dev *pdev = tp->pci_dev;
+	u32 csi;
 
-	rtl_udelay_loop_wait_low(tp, &rtl_csiar_cond, 10, 100);
+	/* According to Realtek the value at config space address 0x070f
+	 * controls the L0s/L1 entrance latency. We try standard ECAM access
+	 * first and if it fails fall back to CSI.
+	 */
+	if (pdev->cfg_size > 0x070f &&
+	    pci_write_config_byte(pdev, 0x070f, val) == PCIBIOS_SUCCESSFUL)
+		return;
+
+	netdev_notice_once(tp->dev,
+		"No native access to PCI extended config space, falling back to CSI\n");
+	csi = rtl_csi_read(tp, 0x070c) & 0x00ffffff;
+	rtl_csi_write(tp, 0x070c, csi | val << 24);
 }
 
-static u32 r8411_csi_read(struct rtl8169_private *tp, int addr)
+static void rtl_csi_access_enable_1(struct rtl8169_private *tp)
 {
-	RTL_W32(tp, CSIAR, (addr & CSIAR_ADDR_MASK) | CSIAR_FUNC_NIC2 |
-		CSIAR_BYTE_ENABLE << CSIAR_BYTE_ENABLE_SHIFT);
-
-	return rtl_udelay_loop_wait_high(tp, &rtl_csiar_cond, 10, 100) ?
-		RTL_R32(tp, CSIDR) : ~0;
+	rtl_csi_access_enable(tp, 0x17);
 }
 
-static void rtl_init_csi_ops(struct rtl8169_private *tp)
+static void rtl_csi_access_enable_2(struct rtl8169_private *tp)
 {
-	struct csi_ops *ops = &tp->csi_ops;
-
-	switch (tp->mac_version) {
-	case RTL_GIGA_MAC_VER_01 ... RTL_GIGA_MAC_VER_06:
-	case RTL_GIGA_MAC_VER_10 ... RTL_GIGA_MAC_VER_17:
-		ops->write	= NULL;
-		ops->read	= NULL;
-		break;
-
-	case RTL_GIGA_MAC_VER_37:
-	case RTL_GIGA_MAC_VER_38:
-		ops->write	= r8402_csi_write;
-		ops->read	= r8402_csi_read;
-		break;
-
-	case RTL_GIGA_MAC_VER_44:
-		ops->write	= r8411_csi_write;
-		ops->read	= r8411_csi_read;
-		break;
-
-	default:
-		ops->write	= r8169_csi_write;
-		ops->read	= r8169_csi_read;
-		break;
-	}
+	rtl_csi_access_enable(tp, 0x27);
 }
 
 struct ephy_info {
@@ -7804,7 +7731,6 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	rtl_init_mdio_ops(tp);
 	rtl_init_jumbo_ops(tp);
-	rtl_init_csi_ops(tp);
 
 	rtl8169_print_mac_version(tp);
 
-- 
cgit v1.2.3-59-g8ed1b


From 353af85ed811d84ca349c6d6e5df8055eeab5bd5 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 2 May 2018 21:39:59 +0200
Subject: r8169: avoid potentially misaligned access when getting mac address

Interpreting a member of an u16 array as u32 may result in a misaligned
access. Also it's not really intuitive to define a mac address variable
as array of three u16 words. Therefore use an array of six bytes that
is properly aligned for 32 bit access.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index d72b3fdf853a..6fce3cc86ae3 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -7767,14 +7767,14 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	/* Get MAC address */
 	switch (tp->mac_version) {
-		u16 mac_addr[3];
+		u8 mac_addr[ETH_ALEN] __aligned(4);
 	case RTL_GIGA_MAC_VER_35 ... RTL_GIGA_MAC_VER_38:
 	case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_51:
 		*(u32 *)&mac_addr[0] = rtl_eri_read(tp, 0xe0, ERIAR_EXGMAC);
-		*(u16 *)&mac_addr[2] = rtl_eri_read(tp, 0xe4, ERIAR_EXGMAC);
+		*(u16 *)&mac_addr[4] = rtl_eri_read(tp, 0xe4, ERIAR_EXGMAC);
 
-		if (is_valid_ether_addr((u8 *)mac_addr))
-			rtl_rar_set(tp, (u8 *)mac_addr);
+		if (is_valid_ether_addr(mac_addr))
+			rtl_rar_set(tp, mac_addr);
 		break;
 	default:
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From 4ff36466281428734791d3cc6331b8cca7c76019 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 2 May 2018 21:40:02 +0200
Subject: r8169: replace get_protocol with vlan_get_protocol

This patch is basically the same as 6e74d1749a33 ("r8152: replace
get_protocol with vlan_get_protocol"). Use vlan_get_protocol
instead of duplicating the functionality.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 6fce3cc86ae3..6d99b141a7aa 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -6510,18 +6510,6 @@ static int msdn_giant_send_check(struct sk_buff *skb)
 	return ret;
 }
 
-static inline __be16 get_protocol(struct sk_buff *skb)
-{
-	__be16 protocol;
-
-	if (skb->protocol == htons(ETH_P_8021Q))
-		protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
-	else
-		protocol = skb->protocol;
-
-	return protocol;
-}
-
 static bool rtl8169_tso_csum_v1(struct rtl8169_private *tp,
 				struct sk_buff *skb, u32 *opts)
 {
@@ -6558,7 +6546,7 @@ static bool rtl8169_tso_csum_v2(struct rtl8169_private *tp,
 			return false;
 		}
 
-		switch (get_protocol(skb)) {
+		switch (vlan_get_protocol(skb)) {
 		case htons(ETH_P_IP):
 			opts[0] |= TD1_GTSENV4;
 			break;
@@ -6590,7 +6578,7 @@ static bool rtl8169_tso_csum_v2(struct rtl8169_private *tp,
 			return false;
 		}
 
-		switch (get_protocol(skb)) {
+		switch (vlan_get_protocol(skb)) {
 		case htons(ETH_P_IP):
 			opts[1] |= TD1_IPv4_CS;
 			ip_protocol = ip_hdr(skb)->protocol;
-- 
cgit v1.2.3-59-g8ed1b


From 29e6eee192f31caf9e0af7713224fd171044cef4 Mon Sep 17 00:00:00 2001
From: Craig Dillabaugh <cdillaba@mojatatu.com>
Date: Tue, 1 May 2018 10:17:43 -0400
Subject: net sched: Implemented get_fill_size routine for act_csum.

Signed-off-by: Craig Dillabaugh <cdillaba@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_csum.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 7e28b2ce1437..526a8e491626 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -648,6 +648,11 @@ static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index,
 	return tcf_idr_search(tn, a, index);
 }
 
+static size_t tcf_csum_get_fill_size(const struct tc_action *act)
+{
+	return nla_total_size(sizeof(struct tc_csum));
+}
+
 static struct tc_action_ops act_csum_ops = {
 	.kind		= "csum",
 	.type		= TCA_ACT_CSUM,
@@ -658,6 +663,7 @@ static struct tc_action_ops act_csum_ops = {
 	.cleanup	= tcf_csum_cleanup,
 	.walk		= tcf_csum_walker,
 	.lookup		= tcf_csum_search,
+	.get_fill_size  = tcf_csum_get_fill_size,
 	.size		= sizeof(struct tcf_csum),
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 2c785b388f9124551503e23e668fb5d10ce1d75e Mon Sep 17 00:00:00 2001
From: Craig Dillabaugh <cdillaba@mojatatu.com>
Date: Tue, 1 May 2018 10:17:44 -0400
Subject: tc-testing: Updated csum action tests batch create w/wo cookies.

Signed-off-by: Craig Dillabaugh <cdillaba@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../tc-testing/tc-tests/actions/csum.json          | 74 +++++++++++++++++++++-
 1 file changed, 72 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/csum.json b/tools/testing/selftests/tc-testing/tc-tests/actions/csum.json
index 93cf8fea8ae7..3a2f51fc7fd4 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/actions/csum.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/csum.json
@@ -398,13 +398,83 @@
                 255
             ]
         ],
-        "cmdUnderTest": "for i in `seq 1 32`; do cmd=\"action csum tcp continue index $i \"; args=\"$args$cmd\"; done && $TC actions add $args",
-        "expExitCode": "255",
+        "cmdUnderTest": "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action csum tcp continue index \\$i \\\"; args=\"\\$args\\$cmd\"; done && $TC actions add \\$args\"",
+        "expExitCode": "0",
         "verifyCmd": "$TC actions ls action csum",
         "matchPattern": "^[ \t]+index [0-9]* ref",
         "matchCount": "32",
         "teardown": [
             "$TC actions flush action csum"
         ]
+    },
+    {
+        "id": "b4e9",
+        "name": "Delete batch of 32 csum actions",
+        "category": [
+            "actions",
+            "csum"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action csum",
+                0,
+                1,
+                255
+            ],
+            "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action csum tcp continue index \\$i \\\"; args=\"\\$args\\$cmd\"; done && $TC actions add \\$args\""
+        ],
+        "cmdUnderTest": "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action csum index \\$i \\\"; args=\"\\$args\\$cmd\"; done && $TC actions del \\$args\"",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions list action csum",
+        "matchPattern": "^[ \t]+index [0-9]+ ref",
+        "matchCount": "0",
+        "teardown": []
+    },
+    {
+        "id": "0015",
+        "name": "Add batch of 32 csum tcp actions with large cookies",
+        "category": [
+            "actions",
+            "csum"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action csum",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action csum tcp continue index \\$i cookie aaabbbcccdddeee \\\"; args=\"\\$args\\$cmd\"; done && $TC actions add \\$args\"",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions ls action csum",
+        "matchPattern": "^[ \t]+index [0-9]* ref",
+        "matchCount": "32",
+        "teardown": [
+            "$TC actions flush action csum"
+        ]
+    },
+    {
+        "id": "989e",
+        "name": "Delete batch of 32 csum actions with large cookies",
+        "category": [
+            "actions",
+            "csum"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action csum",
+                0,
+                1,
+                255
+            ],
+            "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action csum tcp continue index \\$i cookie aaabbbcccdddeee \\\"; args=\"\\$args\\$cmd\"; done && $TC actions add \\$args\""
+        ],
+        "cmdUnderTest": "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action csum index \\$i \\\"; args=\"\\$args\\$cmd\"; done && $TC actions del \\$args\"",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions list action csum",
+        "matchPattern": "^[ \t]+index [0-9]+ ref",
+        "matchCount": "0",
+        "teardown": []
     }
 ]
-- 
cgit v1.2.3-59-g8ed1b


From 7ccbdff13e8df624e73d775e1b710070834d47c6 Mon Sep 17 00:00:00 2001
From: Sun Lianwen <sunlw.fnst@cn.fujitsu.com>
Date: Thu, 3 May 2018 09:34:29 +0800
Subject: ip6_gre: correct the function name in ip6gre_tnl_addr_conflict()
 comment

The function name is wrong in ip6gre_tnl_addr_conflict() comment, which
use ip6_tnl_addr_conflict instead of ip6gre_tnl_addr_conflict.

Signed-off-by: Sun Lianwen <sunlw.fnst@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index ac7ce85df667..04c69e0c84b3 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -807,7 +807,7 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
 }
 
 /**
- * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own
+ * ip6gre_tnl_addr_conflict - compare packet addresses to tunnel's own
  *   @t: the outgoing tunnel device
  *   @hdr: IPv6 header from the incoming packet
  *
-- 
cgit v1.2.3-59-g8ed1b


From 03f5781be2c7b7e728d724ac70ba10799cc710d7 Mon Sep 17 00:00:00 2001
From: Wang YanQing <udknight@gmail.com>
Date: Thu, 3 May 2018 14:10:43 +0800
Subject: bpf, x86_32: add eBPF JIT compiler for ia32

The JIT compiler emits ia32 bit instructions. Currently, It supports eBPF
only. Classic BPF is supported because of the conversion by BPF core.

Almost all instructions from eBPF ISA supported except the following:
BPF_ALU64 | BPF_DIV | BPF_K
BPF_ALU64 | BPF_DIV | BPF_X
BPF_ALU64 | BPF_MOD | BPF_K
BPF_ALU64 | BPF_MOD | BPF_X
BPF_STX | BPF_XADD | BPF_W
BPF_STX | BPF_XADD | BPF_DW

It doesn't support BPF_JMP|BPF_CALL with BPF_PSEUDO_CALL at the moment.

IA32 has few general purpose registers, EAX|EDX|ECX|EBX|ESI|EDI. I use
EAX|EDX|ECX|EBX as temporary registers to simulate instructions in eBPF
ISA, and allocate ESI|EDI to BPF_REG_AX for constant blinding, all others
eBPF registers, R0-R10, are simulated through scratch space on stack.

The reasons behind the hardware registers allocation policy are:
1:MUL need EAX:EDX, shift operation need ECX, so they aren't fit
  for general eBPF 64bit register simulation.
2:We need at least 4 registers to simulate most eBPF ISA operations
  on registers operands instead of on register&memory operands.
3:We need to put BPF_REG_AX on hardware registers, or constant blinding
  will degrade jit performance heavily.

Tested on PC (Intel(R) Core(TM) i5-5200U CPU).
Testing results on i5-5200U:
1) test_bpf: Summary: 349 PASSED, 0 FAILED, [319/341 JIT'ed]
2) test_progs: Summary: 83 PASSED, 0 FAILED.
3) test_lpm: OK
4) test_lru_map: OK
5) test_verifier: Summary: 828 PASSED, 0 FAILED.

Above tests are all done in following two conditions separately:
1:bpf_jit_enable=1 and bpf_jit_harden=0
2:bpf_jit_enable=1 and bpf_jit_harden=2

Below are some numbers for this jit implementation:
Note:
  I run test_progs in kselftest 100 times continuously for every condition,
  the numbers are in format: total/times=avg.
  The numbers that test_bpf reports show almost the same relation.

a:jit_enable=0 and jit_harden=0            b:jit_enable=1 and jit_harden=0
  test_pkt_access:PASS:ipv4:15622/100=156    test_pkt_access:PASS:ipv4:10674/100=106
  test_pkt_access:PASS:ipv6:9130/100=91      test_pkt_access:PASS:ipv6:4855/100=48
  test_xdp:PASS:ipv4:240198/100=2401         test_xdp:PASS:ipv4:138912/100=1389
  test_xdp:PASS:ipv6:137326/100=1373         test_xdp:PASS:ipv6:68542/100=685
  test_l4lb:PASS:ipv4:61100/100=611          test_l4lb:PASS:ipv4:37302/100=373
  test_l4lb:PASS:ipv6:101000/100=1010        test_l4lb:PASS:ipv6:55030/100=550

c:jit_enable=1 and jit_harden=2
  test_pkt_access:PASS:ipv4:10558/100=105
  test_pkt_access:PASS:ipv6:5092/100=50
  test_xdp:PASS:ipv4:131902/100=1319
  test_xdp:PASS:ipv6:77932/100=779
  test_l4lb:PASS:ipv4:38924/100=389
  test_l4lb:PASS:ipv6:57520/100=575

The numbers show we get 30%~50% improvement.

See Documentation/networking/filter.txt for more information.

Changelog:

 Changes v5-v6:
 1:Add do {} while (0) to RETPOLINE_RAX_BPF_JIT for
   consistence reason.
 2:Clean up non-standard comments, reported by Daniel Borkmann.
 3:Fix a memory leak issue, repoted by Daniel Borkmann.

 Changes v4-v5:
 1:Delete is_on_stack, BPF_REG_AX is the only one
   on real hardware registers, so just check with
   it.
 2:Apply commit 1612a981b766 ("bpf, x64: fix JIT emission
   for dead code"), suggested by Daniel Borkmann.

 Changes v3-v4:
 1:Fix changelog in commit.
   I install llvm-6.0, then test_progs willn't report errors.
   I submit another patch:
   "bpf: fix misaligned access for BPF_PROG_TYPE_PERF_EVENT program type on x86_32 platform"
   to fix another problem, after that patch, test_verifier willn't report errors too.
 2:Fix clear r0[1] twice unnecessarily in *BPF_IND|BPF_ABS* simulation.

 Changes v2-v3:
 1:Move BPF_REG_AX to real hardware registers for performance reason.
 3:Using bpf_load_pointer instead of bpf_jit32.S, suggested by Daniel Borkmann.
 4:Delete partial codes in 1c2a088a6626, suggested by Daniel Borkmann.
 5:Some bug fixes and comments improvement.

 Changes v1-v2:
 1:Fix bug in emit_ia32_neg64.
 2:Fix bug in emit_ia32_arsh_r64.
 3:Delete filename in top level comment, suggested by Thomas Gleixner.
 4:Delete unnecessary boiler plate text, suggested by Thomas Gleixner.
 5:Rewrite some words in changelog.
 6:CodingSytle improvement and a little more comments.

Signed-off-by: Wang YanQing <udknight@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 Documentation/sysctl/net.txt         |    1 +
 arch/x86/Kconfig                     |    2 +-
 arch/x86/include/asm/nospec-branch.h |   30 +-
 arch/x86/net/Makefile                |    8 +-
 arch/x86/net/bpf_jit_comp32.c        | 2553 ++++++++++++++++++++++++++++++++++
 5 files changed, 2588 insertions(+), 6 deletions(-)
 create mode 100644 arch/x86/net/bpf_jit_comp32.c

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index 5992602469d8..9ecde517728c 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -45,6 +45,7 @@ through bpf(2) and passing a verifier in the kernel, a JIT will then
 translate these BPF proglets into native CPU instructions. There are
 two flavors of JITs, the newer eBPF JIT currently supported on:
   - x86_64
+  - x86_32
   - arm64
   - arm32
   - ppc64
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 00fcf81f2c56..1f5fa2f2c168 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -137,7 +137,7 @@ config X86
 	select HAVE_DMA_CONTIGUOUS
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS
-	select HAVE_EBPF_JIT			if X86_64
+	select HAVE_EBPF_JIT
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
 	select HAVE_EXIT_THREAD
 	select HAVE_FENTRY			if X86_64 || DYNAMIC_FTRACE
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index f928ad9b143f..2cd344d1a6e5 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -291,16 +291,20 @@ do {									\
  *    lfence
  *    jmp spec_trap
  *  do_rop:
- *    mov %rax,(%rsp)
+ *    mov %rax,(%rsp) for x86_64
+ *    mov %edx,(%esp) for x86_32
  *    retq
  *
  * Without retpolines configured:
  *
- *    jmp *%rax
+ *    jmp *%rax for x86_64
+ *    jmp *%edx for x86_32
  */
 #ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_X86_64
 # define RETPOLINE_RAX_BPF_JIT_SIZE	17
 # define RETPOLINE_RAX_BPF_JIT()				\
+do {								\
 	EMIT1_off32(0xE8, 7);	 /* callq do_rop */		\
 	/* spec_trap: */					\
 	EMIT2(0xF3, 0x90);       /* pause */			\
@@ -308,11 +312,31 @@ do {									\
 	EMIT2(0xEB, 0xF9);       /* jmp spec_trap */		\
 	/* do_rop: */						\
 	EMIT4(0x48, 0x89, 0x04, 0x24); /* mov %rax,(%rsp) */	\
-	EMIT1(0xC3);             /* retq */
+	EMIT1(0xC3);             /* retq */			\
+} while (0)
 #else
+# define RETPOLINE_EDX_BPF_JIT()				\
+do {								\
+	EMIT1_off32(0xE8, 7);	 /* call do_rop */		\
+	/* spec_trap: */					\
+	EMIT2(0xF3, 0x90);       /* pause */			\
+	EMIT3(0x0F, 0xAE, 0xE8); /* lfence */			\
+	EMIT2(0xEB, 0xF9);       /* jmp spec_trap */		\
+	/* do_rop: */						\
+	EMIT3(0x89, 0x14, 0x24); /* mov %edx,(%esp) */		\
+	EMIT1(0xC3);             /* ret */			\
+} while (0)
+#endif
+#else /* !CONFIG_RETPOLINE */
+
+#ifdef CONFIG_X86_64
 # define RETPOLINE_RAX_BPF_JIT_SIZE	2
 # define RETPOLINE_RAX_BPF_JIT()				\
 	EMIT2(0xFF, 0xE0);	 /* jmp *%rax */
+#else
+# define RETPOLINE_EDX_BPF_JIT()				\
+	EMIT2(0xFF, 0xE2) /* jmp *%edx */
+#endif
 #endif
 
 #endif /* _ASM_X86_NOSPEC_BRANCH_H_ */
diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile
index fefb4b619598..c6b464a261bb 100644
--- a/arch/x86/net/Makefile
+++ b/arch/x86/net/Makefile
@@ -1,6 +1,10 @@
 #
 # Arch-specific network modules
 #
-OBJECT_FILES_NON_STANDARD_bpf_jit.o += y
 
-obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o
+ifeq ($(CONFIG_X86_32),y)
+        obj-$(CONFIG_BPF_JIT) += bpf_jit_comp32.o
+else
+        OBJECT_FILES_NON_STANDARD_bpf_jit.o += y
+        obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o
+endif
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
new file mode 100644
index 000000000000..61e61341b777
--- /dev/null
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -0,0 +1,2553 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Just-In-Time compiler for eBPF filters on IA32 (32bit x86)
+ *
+ * Author: Wang YanQing (udknight@gmail.com)
+ * The code based on code and ideas from:
+ * Eric Dumazet (eric.dumazet@gmail.com)
+ * and from:
+ * Shubham Bansal <illusionist.neo@gmail.com>
+ */
+
+#include <linux/netdevice.h>
+#include <linux/filter.h>
+#include <linux/if_vlan.h>
+#include <asm/cacheflush.h>
+#include <asm/set_memory.h>
+#include <asm/nospec-branch.h>
+#include <linux/bpf.h>
+
+/*
+ * eBPF prog stack layout:
+ *
+ *                         high
+ * original ESP =>        +-----+
+ *                        |     | callee saved registers
+ *                        +-----+
+ *                        | ... | eBPF JIT scratch space
+ * BPF_FP,IA32_EBP  =>    +-----+
+ *                        | ... | eBPF prog stack
+ *                        +-----+
+ *                        |RSVD | JIT scratchpad
+ * current ESP =>         +-----+
+ *                        |     |
+ *                        | ... | Function call stack
+ *                        |     |
+ *                        +-----+
+ *                          low
+ *
+ * The callee saved registers:
+ *
+ *                                high
+ * original ESP =>        +------------------+ \
+ *                        |        ebp       | |
+ * current EBP =>         +------------------+ } callee saved registers
+ *                        |    ebx,esi,edi   | |
+ *                        +------------------+ /
+ *                                low
+ */
+
+static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
+{
+	if (len == 1)
+		*ptr = bytes;
+	else if (len == 2)
+		*(u16 *)ptr = bytes;
+	else {
+		*(u32 *)ptr = bytes;
+		barrier();
+	}
+	return ptr + len;
+}
+
+#define EMIT(bytes, len) \
+	do { prog = emit_code(prog, bytes, len); cnt += len; } while (0)
+
+#define EMIT1(b1)		EMIT(b1, 1)
+#define EMIT2(b1, b2)		EMIT((b1) + ((b2) << 8), 2)
+#define EMIT3(b1, b2, b3)	EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
+#define EMIT4(b1, b2, b3, b4)   \
+	EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
+
+#define EMIT1_off32(b1, off) \
+	do { EMIT1(b1); EMIT(off, 4); } while (0)
+#define EMIT2_off32(b1, b2, off) \
+	do { EMIT2(b1, b2); EMIT(off, 4); } while (0)
+#define EMIT3_off32(b1, b2, b3, off) \
+	do { EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
+#define EMIT4_off32(b1, b2, b3, b4, off) \
+	do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
+
+#define jmp_label(label, jmp_insn_len) (label - cnt - jmp_insn_len)
+
+static bool is_imm8(int value)
+{
+	return value <= 127 && value >= -128;
+}
+
+static bool is_simm32(s64 value)
+{
+	return value == (s64) (s32) value;
+}
+
+#define STACK_OFFSET(k)	(k)
+#define TCALL_CNT	(MAX_BPF_JIT_REG + 0)	/* Tail Call Count */
+
+#define IA32_EAX	(0x0)
+#define IA32_EBX	(0x3)
+#define IA32_ECX	(0x1)
+#define IA32_EDX	(0x2)
+#define IA32_ESI	(0x6)
+#define IA32_EDI	(0x7)
+#define IA32_EBP	(0x5)
+#define IA32_ESP	(0x4)
+
+/*
+ * List of x86 cond jumps opcodes (. + s8)
+ * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32)
+ */
+#define IA32_JB  0x72
+#define IA32_JAE 0x73
+#define IA32_JE  0x74
+#define IA32_JNE 0x75
+#define IA32_JBE 0x76
+#define IA32_JA  0x77
+#define IA32_JL  0x7C
+#define IA32_JGE 0x7D
+#define IA32_JLE 0x7E
+#define IA32_JG  0x7F
+
+/*
+ * Map eBPF registers to IA32 32bit registers or stack scratch space.
+ *
+ * 1. All the registers, R0-R10, are mapped to scratch space on stack.
+ * 2. We need two 64 bit temp registers to do complex operations on eBPF
+ *    registers.
+ * 3. For performance reason, the BPF_REG_AX for blinding constant, is
+ *    mapped to real hardware register pair, IA32_ESI and IA32_EDI.
+ *
+ * As the eBPF registers are all 64 bit registers and IA32 has only 32 bit
+ * registers, we have to map each eBPF registers with two IA32 32 bit regs
+ * or scratch memory space and we have to build eBPF 64 bit register from those.
+ *
+ * We use IA32_EAX, IA32_EDX, IA32_ECX, IA32_EBX as temporary registers.
+ */
+static const u8 bpf2ia32[][2] = {
+	/* Return value from in-kernel function, and exit value from eBPF */
+	[BPF_REG_0] = {STACK_OFFSET(0), STACK_OFFSET(4)},
+
+	/* The arguments from eBPF program to in-kernel function */
+	/* Stored on stack scratch space */
+	[BPF_REG_1] = {STACK_OFFSET(8), STACK_OFFSET(12)},
+	[BPF_REG_2] = {STACK_OFFSET(16), STACK_OFFSET(20)},
+	[BPF_REG_3] = {STACK_OFFSET(24), STACK_OFFSET(28)},
+	[BPF_REG_4] = {STACK_OFFSET(32), STACK_OFFSET(36)},
+	[BPF_REG_5] = {STACK_OFFSET(40), STACK_OFFSET(44)},
+
+	/* Callee saved registers that in-kernel function will preserve */
+	/* Stored on stack scratch space */
+	[BPF_REG_6] = {STACK_OFFSET(48), STACK_OFFSET(52)},
+	[BPF_REG_7] = {STACK_OFFSET(56), STACK_OFFSET(60)},
+	[BPF_REG_8] = {STACK_OFFSET(64), STACK_OFFSET(68)},
+	[BPF_REG_9] = {STACK_OFFSET(72), STACK_OFFSET(76)},
+
+	/* Read only Frame Pointer to access Stack */
+	[BPF_REG_FP] = {STACK_OFFSET(80), STACK_OFFSET(84)},
+
+	/* Temporary register for blinding constants. */
+	[BPF_REG_AX] = {IA32_ESI, IA32_EDI},
+
+	/* Tail call count. Stored on stack scratch space. */
+	[TCALL_CNT] = {STACK_OFFSET(88), STACK_OFFSET(92)},
+};
+
+#define dst_lo	dst[0]
+#define dst_hi	dst[1]
+#define src_lo	src[0]
+#define src_hi	src[1]
+
+#define STACK_ALIGNMENT	8
+/*
+ * Stack space for BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4,
+ * BPF_REG_5, BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9,
+ * BPF_REG_FP, BPF_REG_AX and Tail call counts.
+ */
+#define SCRATCH_SIZE 96
+
+/* Total stack size used in JITed code */
+#define _STACK_SIZE \
+	(stack_depth + \
+	 + SCRATCH_SIZE + \
+	 + 4 /* Extra space for skb_copy_bits buffer */)
+
+#define STACK_SIZE ALIGN(_STACK_SIZE, STACK_ALIGNMENT)
+
+/* Get the offset of eBPF REGISTERs stored on scratch space. */
+#define STACK_VAR(off) (off)
+
+/* Offset of skb_copy_bits buffer */
+#define SKB_BUFFER STACK_VAR(SCRATCH_SIZE)
+
+/* Encode 'dst_reg' register into IA32 opcode 'byte' */
+static u8 add_1reg(u8 byte, u32 dst_reg)
+{
+	return byte + dst_reg;
+}
+
+/* Encode 'dst_reg' and 'src_reg' registers into IA32 opcode 'byte' */
+static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
+{
+	return byte + dst_reg + (src_reg << 3);
+}
+
+static void jit_fill_hole(void *area, unsigned int size)
+{
+	/* Fill whole space with int3 instructions */
+	memset(area, 0xcc, size);
+}
+
+static inline void emit_ia32_mov_i(const u8 dst, const u32 val, bool dstk,
+				   u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	if (dstk) {
+		if (val == 0) {
+			/* xor eax,eax */
+			EMIT2(0x33, add_2reg(0xC0, IA32_EAX, IA32_EAX));
+			/* mov dword ptr [ebp+off],eax */
+			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+			      STACK_VAR(dst));
+		} else {
+			EMIT3_off32(0xC7, add_1reg(0x40, IA32_EBP),
+				    STACK_VAR(dst), val);
+		}
+	} else {
+		if (val == 0)
+			EMIT2(0x33, add_2reg(0xC0, dst, dst));
+		else
+			EMIT2_off32(0xC7, add_1reg(0xC0, dst),
+				    val);
+	}
+	*pprog = prog;
+}
+
+/* dst = imm (4 bytes)*/
+static inline void emit_ia32_mov_r(const u8 dst, const u8 src, bool dstk,
+				   bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 sreg = sstk ? IA32_EAX : src;
+
+	if (sstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(src));
+	if (dstk)
+		/* mov dword ptr [ebp+off],eax */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, sreg), STACK_VAR(dst));
+	else
+		/* mov dst,sreg */
+		EMIT2(0x89, add_2reg(0xC0, dst, sreg));
+
+	*pprog = prog;
+}
+
+/* dst = src */
+static inline void emit_ia32_mov_r64(const bool is64, const u8 dst[],
+				     const u8 src[], bool dstk,
+				     bool sstk, u8 **pprog)
+{
+	emit_ia32_mov_r(dst_lo, src_lo, dstk, sstk, pprog);
+	if (is64)
+		/* complete 8 byte move */
+		emit_ia32_mov_r(dst_hi, src_hi, dstk, sstk, pprog);
+	else
+		/* zero out high 4 bytes */
+		emit_ia32_mov_i(dst_hi, 0, dstk, pprog);
+}
+
+/* Sign extended move */
+static inline void emit_ia32_mov_i64(const bool is64, const u8 dst[],
+				     const u32 val, bool dstk, u8 **pprog)
+{
+	u32 hi = 0;
+
+	if (is64 && (val & (1<<31)))
+		hi = (u32)~0;
+	emit_ia32_mov_i(dst_lo, val, dstk, pprog);
+	emit_ia32_mov_i(dst_hi, hi, dstk, pprog);
+}
+
+/*
+ * ALU operation (32 bit)
+ * dst = dst * src
+ */
+static inline void emit_ia32_mul_r(const u8 dst, const u8 src, bool dstk,
+				   bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 sreg = sstk ? IA32_ECX : src;
+
+	if (sstk)
+		/* mov ecx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src));
+
+	if (dstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(dst));
+	else
+		/* mov eax,dst */
+		EMIT2(0x8B, add_2reg(0xC0, dst, IA32_EAX));
+
+
+	EMIT2(0xF7, add_1reg(0xE0, sreg));
+
+	if (dstk)
+		/* mov dword ptr [ebp+off],eax */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst));
+	else
+		/* mov dst,eax */
+		EMIT2(0x89, add_2reg(0xC0, dst, IA32_EAX));
+
+	*pprog = prog;
+}
+
+static inline void emit_ia32_to_le_r64(const u8 dst[], s32 val,
+					 bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk && val != 64) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+	switch (val) {
+	case 16:
+		/*
+		 * Emit 'movzwl eax,ax' to zero extend 16-bit
+		 * into 64 bit
+		 */
+		EMIT2(0x0F, 0xB7);
+		EMIT1(add_2reg(0xC0, dreg_lo, dreg_lo));
+		/* xor dreg_hi,dreg_hi */
+		EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+		break;
+	case 32:
+		/* xor dreg_hi,dreg_hi */
+		EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+		break;
+	case 64:
+		/* nop */
+		break;
+	}
+
+	if (dstk && val != 64) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	*pprog = prog;
+}
+
+static inline void emit_ia32_to_be_r64(const u8 dst[], s32 val,
+				       bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+	switch (val) {
+	case 16:
+		/* Emit 'ror %ax, 8' to swap lower 2 bytes */
+		EMIT1(0x66);
+		EMIT3(0xC1, add_1reg(0xC8, dreg_lo), 8);
+
+		EMIT2(0x0F, 0xB7);
+		EMIT1(add_2reg(0xC0, dreg_lo, dreg_lo));
+
+		/* xor dreg_hi,dreg_hi */
+		EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+		break;
+	case 32:
+		/* Emit 'bswap eax' to swap lower 4 bytes */
+		EMIT1(0x0F);
+		EMIT1(add_1reg(0xC8, dreg_lo));
+
+		/* xor dreg_hi,dreg_hi */
+		EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+		break;
+	case 64:
+		/* Emit 'bswap eax' to swap lower 4 bytes */
+		EMIT1(0x0F);
+		EMIT1(add_1reg(0xC8, dreg_lo));
+
+		/* Emit 'bswap edx' to swap lower 4 bytes */
+		EMIT1(0x0F);
+		EMIT1(add_1reg(0xC8, dreg_hi));
+
+		/* mov ecx,dreg_hi */
+		EMIT2(0x89, add_2reg(0xC0, IA32_ECX, dreg_hi));
+		/* mov dreg_hi,dreg_lo */
+		EMIT2(0x89, add_2reg(0xC0, dreg_hi, dreg_lo));
+		/* mov dreg_lo,ecx */
+		EMIT2(0x89, add_2reg(0xC0, dreg_lo, IA32_ECX));
+
+		break;
+	}
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	*pprog = prog;
+}
+
+/*
+ * ALU operation (32 bit)
+ * dst = dst (div|mod) src
+ */
+static inline void emit_ia32_div_mod_r(const u8 op, const u8 dst, const u8 src,
+				       bool dstk, bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	if (sstk)
+		/* mov ecx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+		      STACK_VAR(src));
+	else if (src != IA32_ECX)
+		/* mov ecx,src */
+		EMIT2(0x8B, add_2reg(0xC0, src, IA32_ECX));
+
+	if (dstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst));
+	else
+		/* mov eax,dst */
+		EMIT2(0x8B, add_2reg(0xC0, dst, IA32_EAX));
+
+	/* xor edx,edx */
+	EMIT2(0x31, add_2reg(0xC0, IA32_EDX, IA32_EDX));
+	/* div ecx */
+	EMIT2(0xF7, add_1reg(0xF0, IA32_ECX));
+
+	if (op == BPF_MOD) {
+		if (dstk)
+			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+			      STACK_VAR(dst));
+		else
+			EMIT2(0x89, add_2reg(0xC0, dst, IA32_EDX));
+	} else {
+		if (dstk)
+			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+			      STACK_VAR(dst));
+		else
+			EMIT2(0x89, add_2reg(0xC0, dst, IA32_EAX));
+	}
+	*pprog = prog;
+}
+
+/*
+ * ALU operation (32 bit)
+ * dst = dst (shift) src
+ */
+static inline void emit_ia32_shift_r(const u8 op, const u8 dst, const u8 src,
+				     bool dstk, bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg = dstk ? IA32_EAX : dst;
+	u8 b2;
+
+	if (dstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(dst));
+
+	if (sstk)
+		/* mov ecx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src));
+	else if (src != IA32_ECX)
+		/* mov ecx,src */
+		EMIT2(0x8B, add_2reg(0xC0, src, IA32_ECX));
+
+	switch (op) {
+	case BPF_LSH:
+		b2 = 0xE0; break;
+	case BPF_RSH:
+		b2 = 0xE8; break;
+	case BPF_ARSH:
+		b2 = 0xF8; break;
+	default:
+		return;
+	}
+	EMIT2(0xD3, add_1reg(b2, dreg));
+
+	if (dstk)
+		/* mov dword ptr [ebp+off],dreg */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg), STACK_VAR(dst));
+	*pprog = prog;
+}
+
+/*
+ * ALU operation (32 bit)
+ * dst = dst (op) src
+ */
+static inline void emit_ia32_alu_r(const bool is64, const bool hi, const u8 op,
+				   const u8 dst, const u8 src, bool dstk,
+				   bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 sreg = sstk ? IA32_EAX : src;
+	u8 dreg = dstk ? IA32_EDX : dst;
+
+	if (sstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(src));
+
+	if (dstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(dst));
+
+	switch (BPF_OP(op)) {
+	/* dst = dst + src */
+	case BPF_ADD:
+		if (hi && is64)
+			EMIT2(0x11, add_2reg(0xC0, dreg, sreg));
+		else
+			EMIT2(0x01, add_2reg(0xC0, dreg, sreg));
+		break;
+	/* dst = dst - src */
+	case BPF_SUB:
+		if (hi && is64)
+			EMIT2(0x19, add_2reg(0xC0, dreg, sreg));
+		else
+			EMIT2(0x29, add_2reg(0xC0, dreg, sreg));
+		break;
+	/* dst = dst | src */
+	case BPF_OR:
+		EMIT2(0x09, add_2reg(0xC0, dreg, sreg));
+		break;
+	/* dst = dst & src */
+	case BPF_AND:
+		EMIT2(0x21, add_2reg(0xC0, dreg, sreg));
+		break;
+	/* dst = dst ^ src */
+	case BPF_XOR:
+		EMIT2(0x31, add_2reg(0xC0, dreg, sreg));
+		break;
+	}
+
+	if (dstk)
+		/* mov dword ptr [ebp+off],dreg */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg),
+		      STACK_VAR(dst));
+	*pprog = prog;
+}
+
+/* ALU operation (64 bit) */
+static inline void emit_ia32_alu_r64(const bool is64, const u8 op,
+				     const u8 dst[], const u8 src[],
+				     bool dstk,  bool sstk,
+				     u8 **pprog)
+{
+	u8 *prog = *pprog;
+
+	emit_ia32_alu_r(is64, false, op, dst_lo, src_lo, dstk, sstk, &prog);
+	if (is64)
+		emit_ia32_alu_r(is64, true, op, dst_hi, src_hi, dstk, sstk,
+				&prog);
+	else
+		emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+	*pprog = prog;
+}
+
+/*
+ * ALU operation (32 bit)
+ * dst = dst (op) val
+ */
+static inline void emit_ia32_alu_i(const bool is64, const bool hi, const u8 op,
+				   const u8 dst, const s32 val, bool dstk,
+				   u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg = dstk ? IA32_EAX : dst;
+	u8 sreg = IA32_EDX;
+
+	if (dstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(dst));
+
+	if (!is_imm8(val))
+		/* mov edx,imm32*/
+		EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EDX), val);
+
+	switch (op) {
+	/* dst = dst + val */
+	case BPF_ADD:
+		if (hi && is64) {
+			if (is_imm8(val))
+				EMIT3(0x83, add_1reg(0xD0, dreg), val);
+			else
+				EMIT2(0x11, add_2reg(0xC0, dreg, sreg));
+		} else {
+			if (is_imm8(val))
+				EMIT3(0x83, add_1reg(0xC0, dreg), val);
+			else
+				EMIT2(0x01, add_2reg(0xC0, dreg, sreg));
+		}
+		break;
+	/* dst = dst - val */
+	case BPF_SUB:
+		if (hi && is64) {
+			if (is_imm8(val))
+				EMIT3(0x83, add_1reg(0xD8, dreg), val);
+			else
+				EMIT2(0x19, add_2reg(0xC0, dreg, sreg));
+		} else {
+			if (is_imm8(val))
+				EMIT3(0x83, add_1reg(0xE8, dreg), val);
+			else
+				EMIT2(0x29, add_2reg(0xC0, dreg, sreg));
+		}
+		break;
+	/* dst = dst | val */
+	case BPF_OR:
+		if (is_imm8(val))
+			EMIT3(0x83, add_1reg(0xC8, dreg), val);
+		else
+			EMIT2(0x09, add_2reg(0xC0, dreg, sreg));
+		break;
+	/* dst = dst & val */
+	case BPF_AND:
+		if (is_imm8(val))
+			EMIT3(0x83, add_1reg(0xE0, dreg), val);
+		else
+			EMIT2(0x21, add_2reg(0xC0, dreg, sreg));
+		break;
+	/* dst = dst ^ val */
+	case BPF_XOR:
+		if (is_imm8(val))
+			EMIT3(0x83, add_1reg(0xF0, dreg), val);
+		else
+			EMIT2(0x31, add_2reg(0xC0, dreg, sreg));
+		break;
+	case BPF_NEG:
+		EMIT2(0xF7, add_1reg(0xD8, dreg));
+		break;
+	}
+
+	if (dstk)
+		/* mov dword ptr [ebp+off],dreg */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg),
+		      STACK_VAR(dst));
+	*pprog = prog;
+}
+
+/* ALU operation (64 bit) */
+static inline void emit_ia32_alu_i64(const bool is64, const u8 op,
+				     const u8 dst[], const u32 val,
+				     bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	u32 hi = 0;
+
+	if (is64 && (val & (1<<31)))
+		hi = (u32)~0;
+
+	emit_ia32_alu_i(is64, false, op, dst_lo, val, dstk, &prog);
+	if (is64)
+		emit_ia32_alu_i(is64, true, op, dst_hi, hi, dstk, &prog);
+	else
+		emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+
+	*pprog = prog;
+}
+
+/* dst = ~dst (64 bit) */
+static inline void emit_ia32_neg64(const u8 dst[], bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+
+	/* xor ecx,ecx */
+	EMIT2(0x31, add_2reg(0xC0, IA32_ECX, IA32_ECX));
+	/* sub dreg_lo,ecx */
+	EMIT2(0x2B, add_2reg(0xC0, dreg_lo, IA32_ECX));
+	/* mov dreg_lo,ecx */
+	EMIT2(0x89, add_2reg(0xC0, dreg_lo, IA32_ECX));
+
+	/* xor ecx,ecx */
+	EMIT2(0x31, add_2reg(0xC0, IA32_ECX, IA32_ECX));
+	/* sbb dreg_hi,ecx */
+	EMIT2(0x19, add_2reg(0xC0, dreg_hi, IA32_ECX));
+	/* mov dreg_hi,ecx */
+	EMIT2(0x89, add_2reg(0xC0, dreg_hi, IA32_ECX));
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	*pprog = prog;
+}
+
+/* dst = dst << src */
+static inline void emit_ia32_lsh_r64(const u8 dst[], const u8 src[],
+				     bool dstk, bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	static int jmp_label1 = -1;
+	static int jmp_label2 = -1;
+	static int jmp_label3 = -1;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+
+	if (sstk)
+		/* mov ecx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+		      STACK_VAR(src_lo));
+	else
+		/* mov ecx,src_lo */
+		EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX));
+
+	/* cmp ecx,32 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32);
+	/* Jumps when >= 32 */
+	if (is_imm8(jmp_label(jmp_label1, 2)))
+		EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
+	else
+		EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6));
+
+	/* < 32 */
+	/* shl dreg_hi,cl */
+	EMIT2(0xD3, add_1reg(0xE0, dreg_hi));
+	/* mov ebx,dreg_lo */
+	EMIT2(0x8B, add_2reg(0xC0, dreg_lo, IA32_EBX));
+	/* shl dreg_lo,cl */
+	EMIT2(0xD3, add_1reg(0xE0, dreg_lo));
+
+	/* IA32_ECX = -IA32_ECX + 32 */
+	/* neg ecx */
+	EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
+	/* add ecx,32 */
+	EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
+
+	/* shr ebx,cl */
+	EMIT2(0xD3, add_1reg(0xE8, IA32_EBX));
+	/* or dreg_hi,ebx */
+	EMIT2(0x09, add_2reg(0xC0, dreg_hi, IA32_EBX));
+
+	/* goto out; */
+	if (is_imm8(jmp_label(jmp_label3, 2)))
+		EMIT2(0xEB, jmp_label(jmp_label3, 2));
+	else
+		EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+	/* >= 32 */
+	if (jmp_label1 == -1)
+		jmp_label1 = cnt;
+
+	/* cmp ecx,64 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64);
+	/* Jumps when >= 64 */
+	if (is_imm8(jmp_label(jmp_label2, 2)))
+		EMIT2(IA32_JAE, jmp_label(jmp_label2, 2));
+	else
+		EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6));
+
+	/* >= 32 && < 64 */
+	/* sub ecx,32 */
+	EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32);
+	/* shl dreg_lo,cl */
+	EMIT2(0xD3, add_1reg(0xE0, dreg_lo));
+	/* mov dreg_hi,dreg_lo */
+	EMIT2(0x89, add_2reg(0xC0, dreg_hi, dreg_lo));
+
+	/* xor dreg_lo,dreg_lo */
+	EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+
+	/* goto out; */
+	if (is_imm8(jmp_label(jmp_label3, 2)))
+		EMIT2(0xEB, jmp_label(jmp_label3, 2));
+	else
+		EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+	/* >= 64 */
+	if (jmp_label2 == -1)
+		jmp_label2 = cnt;
+	/* xor dreg_lo,dreg_lo */
+	EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+	/* xor dreg_hi,dreg_hi */
+	EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+
+	if (jmp_label3 == -1)
+		jmp_label3 = cnt;
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	/* out: */
+	*pprog = prog;
+}
+
+/* dst = dst >> src (signed)*/
+static inline void emit_ia32_arsh_r64(const u8 dst[], const u8 src[],
+				      bool dstk, bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	static int jmp_label1 = -1;
+	static int jmp_label2 = -1;
+	static int jmp_label3 = -1;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+
+	if (sstk)
+		/* mov ecx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+		      STACK_VAR(src_lo));
+	else
+		/* mov ecx,src_lo */
+		EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX));
+
+	/* cmp ecx,32 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32);
+	/* Jumps when >= 32 */
+	if (is_imm8(jmp_label(jmp_label1, 2)))
+		EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
+	else
+		EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6));
+
+	/* < 32 */
+	/* lshr dreg_lo,cl */
+	EMIT2(0xD3, add_1reg(0xE8, dreg_lo));
+	/* mov ebx,dreg_hi */
+	EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX));
+	/* ashr dreg_hi,cl */
+	EMIT2(0xD3, add_1reg(0xF8, dreg_hi));
+
+	/* IA32_ECX = -IA32_ECX + 32 */
+	/* neg ecx */
+	EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
+	/* add ecx,32 */
+	EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
+
+	/* shl ebx,cl */
+	EMIT2(0xD3, add_1reg(0xE0, IA32_EBX));
+	/* or dreg_lo,ebx */
+	EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX));
+
+	/* goto out; */
+	if (is_imm8(jmp_label(jmp_label3, 2)))
+		EMIT2(0xEB, jmp_label(jmp_label3, 2));
+	else
+		EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+	/* >= 32 */
+	if (jmp_label1 == -1)
+		jmp_label1 = cnt;
+
+	/* cmp ecx,64 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64);
+	/* Jumps when >= 64 */
+	if (is_imm8(jmp_label(jmp_label2, 2)))
+		EMIT2(IA32_JAE, jmp_label(jmp_label2, 2));
+	else
+		EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6));
+
+	/* >= 32 && < 64 */
+	/* sub ecx,32 */
+	EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32);
+	/* ashr dreg_hi,cl */
+	EMIT2(0xD3, add_1reg(0xF8, dreg_hi));
+	/* mov dreg_lo,dreg_hi */
+	EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+
+	/* ashr dreg_hi,imm8 */
+	EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
+
+	/* goto out; */
+	if (is_imm8(jmp_label(jmp_label3, 2)))
+		EMIT2(0xEB, jmp_label(jmp_label3, 2));
+	else
+		EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+	/* >= 64 */
+	if (jmp_label2 == -1)
+		jmp_label2 = cnt;
+	/* ashr dreg_hi,imm8 */
+	EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
+	/* mov dreg_lo,dreg_hi */
+	EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+
+	if (jmp_label3 == -1)
+		jmp_label3 = cnt;
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	/* out: */
+	*pprog = prog;
+}
+
+/* dst = dst >> src */
+static inline void emit_ia32_rsh_r64(const u8 dst[], const u8 src[], bool dstk,
+				     bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	static int jmp_label1 = -1;
+	static int jmp_label2 = -1;
+	static int jmp_label3 = -1;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+
+	if (sstk)
+		/* mov ecx,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+		      STACK_VAR(src_lo));
+	else
+		/* mov ecx,src_lo */
+		EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX));
+
+	/* cmp ecx,32 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32);
+	/* Jumps when >= 32 */
+	if (is_imm8(jmp_label(jmp_label1, 2)))
+		EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
+	else
+		EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6));
+
+	/* < 32 */
+	/* lshr dreg_lo,cl */
+	EMIT2(0xD3, add_1reg(0xE8, dreg_lo));
+	/* mov ebx,dreg_hi */
+	EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX));
+	/* shr dreg_hi,cl */
+	EMIT2(0xD3, add_1reg(0xE8, dreg_hi));
+
+	/* IA32_ECX = -IA32_ECX + 32 */
+	/* neg ecx */
+	EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
+	/* add ecx,32 */
+	EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
+
+	/* shl ebx,cl */
+	EMIT2(0xD3, add_1reg(0xE0, IA32_EBX));
+	/* or dreg_lo,ebx */
+	EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX));
+
+	/* goto out; */
+	if (is_imm8(jmp_label(jmp_label3, 2)))
+		EMIT2(0xEB, jmp_label(jmp_label3, 2));
+	else
+		EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+	/* >= 32 */
+	if (jmp_label1 == -1)
+		jmp_label1 = cnt;
+	/* cmp ecx,64 */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64);
+	/* Jumps when >= 64 */
+	if (is_imm8(jmp_label(jmp_label2, 2)))
+		EMIT2(IA32_JAE, jmp_label(jmp_label2, 2));
+	else
+		EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6));
+
+	/* >= 32 && < 64 */
+	/* sub ecx,32 */
+	EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32);
+	/* shr dreg_hi,cl */
+	EMIT2(0xD3, add_1reg(0xE8, dreg_hi));
+	/* mov dreg_lo,dreg_hi */
+	EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+	/* xor dreg_hi,dreg_hi */
+	EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+
+	/* goto out; */
+	if (is_imm8(jmp_label(jmp_label3, 2)))
+		EMIT2(0xEB, jmp_label(jmp_label3, 2));
+	else
+		EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+	/* >= 64 */
+	if (jmp_label2 == -1)
+		jmp_label2 = cnt;
+	/* xor dreg_lo,dreg_lo */
+	EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+	/* xor dreg_hi,dreg_hi */
+	EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+
+	if (jmp_label3 == -1)
+		jmp_label3 = cnt;
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	/* out: */
+	*pprog = prog;
+}
+
+/* dst = dst << val */
+static inline void emit_ia32_lsh_i64(const u8 dst[], const u32 val,
+				     bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+	/* Do LSH operation */
+	if (val < 32) {
+		/* shl dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xE0, dreg_hi), val);
+		/* mov ebx,dreg_lo */
+		EMIT2(0x8B, add_2reg(0xC0, dreg_lo, IA32_EBX));
+		/* shl dreg_lo,imm8 */
+		EMIT3(0xC1, add_1reg(0xE0, dreg_lo), val);
+
+		/* IA32_ECX = 32 - val */
+		/* mov ecx,val */
+		EMIT2(0xB1, val);
+		/* movzx ecx,ecx */
+		EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX));
+		/* neg ecx */
+		EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
+		/* add ecx,32 */
+		EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
+
+		/* shr ebx,cl */
+		EMIT2(0xD3, add_1reg(0xE8, IA32_EBX));
+		/* or dreg_hi,ebx */
+		EMIT2(0x09, add_2reg(0xC0, dreg_hi, IA32_EBX));
+	} else if (val >= 32 && val < 64) {
+		u32 value = val - 32;
+
+		/* shl dreg_lo,imm8 */
+		EMIT3(0xC1, add_1reg(0xE0, dreg_lo), value);
+		/* mov dreg_hi,dreg_lo */
+		EMIT2(0x89, add_2reg(0xC0, dreg_hi, dreg_lo));
+		/* xor dreg_lo,dreg_lo */
+		EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+	} else {
+		/* xor dreg_lo,dreg_lo */
+		EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+		/* xor dreg_hi,dreg_hi */
+		EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+	}
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	*pprog = prog;
+}
+
+/* dst = dst >> val */
+static inline void emit_ia32_rsh_i64(const u8 dst[], const u32 val,
+				     bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+
+	/* Do RSH operation */
+	if (val < 32) {
+		/* shr dreg_lo,imm8 */
+		EMIT3(0xC1, add_1reg(0xE8, dreg_lo), val);
+		/* mov ebx,dreg_hi */
+		EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX));
+		/* shr dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xE8, dreg_hi), val);
+
+		/* IA32_ECX = 32 - val */
+		/* mov ecx,val */
+		EMIT2(0xB1, val);
+		/* movzx ecx,ecx */
+		EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX));
+		/* neg ecx */
+		EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
+		/* add ecx,32 */
+		EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
+
+		/* shl ebx,cl */
+		EMIT2(0xD3, add_1reg(0xE0, IA32_EBX));
+		/* or dreg_lo,ebx */
+		EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX));
+	} else if (val >= 32 && val < 64) {
+		u32 value = val - 32;
+
+		/* shr dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xE8, dreg_hi), value);
+		/* mov dreg_lo,dreg_hi */
+		EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+		/* xor dreg_hi,dreg_hi */
+		EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+	} else {
+		/* xor dreg_lo,dreg_lo */
+		EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+		/* xor dreg_hi,dreg_hi */
+		EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+	}
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	*pprog = prog;
+}
+
+/* dst = dst >> val (signed) */
+static inline void emit_ia32_arsh_i64(const u8 dst[], const u32 val,
+				      bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+	u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+	if (dstk) {
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+		      STACK_VAR(dst_hi));
+	}
+	/* Do RSH operation */
+	if (val < 32) {
+		/* shr dreg_lo,imm8 */
+		EMIT3(0xC1, add_1reg(0xE8, dreg_lo), val);
+		/* mov ebx,dreg_hi */
+		EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX));
+		/* ashr dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xF8, dreg_hi), val);
+
+		/* IA32_ECX = 32 - val */
+		/* mov ecx,val */
+		EMIT2(0xB1, val);
+		/* movzx ecx,ecx */
+		EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX));
+		/* neg ecx */
+		EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
+		/* add ecx,32 */
+		EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
+
+		/* shl ebx,cl */
+		EMIT2(0xD3, add_1reg(0xE0, IA32_EBX));
+		/* or dreg_lo,ebx */
+		EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX));
+	} else if (val >= 32 && val < 64) {
+		u32 value = val - 32;
+
+		/* ashr dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xF8, dreg_hi), value);
+		/* mov dreg_lo,dreg_hi */
+		EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+
+		/* ashr dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
+	} else {
+		/* ashr dreg_hi,imm8 */
+		EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
+		/* mov dreg_lo,dreg_hi */
+		EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+	}
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],dreg_lo */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],dreg_hi */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+		      STACK_VAR(dst_hi));
+	}
+	*pprog = prog;
+}
+
+static inline void emit_ia32_mul_r64(const u8 dst[], const u8 src[], bool dstk,
+				     bool sstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	if (dstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_hi));
+	else
+		/* mov eax,dst_hi */
+		EMIT2(0x8B, add_2reg(0xC0, dst_hi, IA32_EAX));
+
+	if (sstk)
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src_lo));
+	else
+		/* mul src_lo */
+		EMIT2(0xF7, add_1reg(0xE0, src_lo));
+
+	/* mov ecx,eax */
+	EMIT2(0x89, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+	if (dstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+	else
+		/* mov eax,dst_lo */
+		EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
+
+	if (sstk)
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src_hi));
+	else
+		/* mul src_hi */
+		EMIT2(0xF7, add_1reg(0xE0, src_hi));
+
+	/* add eax,eax */
+	EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+	if (dstk)
+		/* mov eax,dword ptr [ebp+off] */
+		EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+	else
+		/* mov eax,dst_lo */
+		EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
+
+	if (sstk)
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src_lo));
+	else
+		/* mul src_lo */
+		EMIT2(0xF7, add_1reg(0xE0, src_lo));
+
+	/* add ecx,edx */
+	EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EDX));
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],eax */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],ecx */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_ECX),
+		      STACK_VAR(dst_hi));
+	} else {
+		/* mov dst_lo,eax */
+		EMIT2(0x89, add_2reg(0xC0, dst_lo, IA32_EAX));
+		/* mov dst_hi,ecx */
+		EMIT2(0x89, add_2reg(0xC0, dst_hi, IA32_ECX));
+	}
+
+	*pprog = prog;
+}
+
+static inline void emit_ia32_mul_i64(const u8 dst[], const u32 val,
+				     bool dstk, u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	u32 hi;
+
+	hi = val & (1<<31) ? (u32)~0 : 0;
+	/* movl eax,imm32 */
+	EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EAX), val);
+	if (dstk)
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_hi));
+	else
+		/* mul dst_hi */
+		EMIT2(0xF7, add_1reg(0xE0, dst_hi));
+
+	/* mov ecx,eax */
+	EMIT2(0x89, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+	/* movl eax,imm32 */
+	EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EAX), hi);
+	if (dstk)
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_lo));
+	else
+		/* mul dst_lo */
+		EMIT2(0xF7, add_1reg(0xE0, dst_lo));
+	/* add ecx,eax */
+	EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+	/* movl eax,imm32 */
+	EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EAX), val);
+	if (dstk)
+		/* mul dword ptr [ebp+off] */
+		EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_lo));
+	else
+		/* mul dst_lo */
+		EMIT2(0xF7, add_1reg(0xE0, dst_lo));
+
+	/* add ecx,edx */
+	EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EDX));
+
+	if (dstk) {
+		/* mov dword ptr [ebp+off],eax */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+		      STACK_VAR(dst_lo));
+		/* mov dword ptr [ebp+off],ecx */
+		EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_ECX),
+		      STACK_VAR(dst_hi));
+	} else {
+		/* mov dword ptr [ebp+off],eax */
+		EMIT2(0x89, add_2reg(0xC0, dst_lo, IA32_EAX));
+		/* mov dword ptr [ebp+off],ecx */
+		EMIT2(0x89, add_2reg(0xC0, dst_hi, IA32_ECX));
+	}
+
+	*pprog = prog;
+}
+
+static int bpf_size_to_x86_bytes(int bpf_size)
+{
+	if (bpf_size == BPF_W)
+		return 4;
+	else if (bpf_size == BPF_H)
+		return 2;
+	else if (bpf_size == BPF_B)
+		return 1;
+	else if (bpf_size == BPF_DW)
+		return 4; /* imm32 */
+	else
+		return 0;
+}
+
+struct jit_context {
+	int cleanup_addr; /* Epilogue code offset */
+};
+
+/* Maximum number of bytes emitted while JITing one eBPF insn */
+#define BPF_MAX_INSN_SIZE	128
+#define BPF_INSN_SAFETY		64
+
+#define PROLOGUE_SIZE 35
+
+/*
+ * Emit prologue code for BPF program and check it's size.
+ * bpf_tail_call helper will skip it while jumping into another program.
+ */
+static void emit_prologue(u8 **pprog, u32 stack_depth)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	const u8 *r1 = bpf2ia32[BPF_REG_1];
+	const u8 fplo = bpf2ia32[BPF_REG_FP][0];
+	const u8 fphi = bpf2ia32[BPF_REG_FP][1];
+	const u8 *tcc = bpf2ia32[TCALL_CNT];
+
+	/* push ebp */
+	EMIT1(0x55);
+	/* mov ebp,esp */
+	EMIT2(0x89, 0xE5);
+	/* push edi */
+	EMIT1(0x57);
+	/* push esi */
+	EMIT1(0x56);
+	/* push ebx */
+	EMIT1(0x53);
+
+	/* sub esp,STACK_SIZE */
+	EMIT2_off32(0x81, 0xEC, STACK_SIZE);
+	/* sub ebp,SCRATCH_SIZE+4+12*/
+	EMIT3(0x83, add_1reg(0xE8, IA32_EBP), SCRATCH_SIZE + 16);
+	/* xor ebx,ebx */
+	EMIT2(0x31, add_2reg(0xC0, IA32_EBX, IA32_EBX));
+
+	/* Set up BPF prog stack base register */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBP), STACK_VAR(fplo));
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(fphi));
+
+	/* Move BPF_CTX (EAX) to BPF_REG_R1 */
+	/* mov dword ptr [ebp+off],eax */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r1[0]));
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(r1[1]));
+
+	/* Initialize Tail Count */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[0]));
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[1]));
+
+	BUILD_BUG_ON(cnt != PROLOGUE_SIZE);
+	*pprog = prog;
+}
+
+/* Emit epilogue code for BPF program */
+static void emit_epilogue(u8 **pprog, u32 stack_depth)
+{
+	u8 *prog = *pprog;
+	const u8 *r0 = bpf2ia32[BPF_REG_0];
+	int cnt = 0;
+
+	/* mov eax,dword ptr [ebp+off]*/
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r0[0]));
+	/* mov edx,dword ptr [ebp+off]*/
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(r0[1]));
+
+	/* add ebp,SCRATCH_SIZE+4+12*/
+	EMIT3(0x83, add_1reg(0xC0, IA32_EBP), SCRATCH_SIZE + 16);
+
+	/* mov ebx,dword ptr [ebp-12]*/
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX), -12);
+	/* mov esi,dword ptr [ebp-8]*/
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ESI), -8);
+	/* mov edi,dword ptr [ebp-4]*/
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDI), -4);
+
+	EMIT1(0xC9); /* leave */
+	EMIT1(0xC3); /* ret */
+	*pprog = prog;
+}
+
+/*
+ * Generate the following code:
+ * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
+ *   if (index >= array->map.max_entries)
+ *     goto out;
+ *   if (++tail_call_cnt > MAX_TAIL_CALL_CNT)
+ *     goto out;
+ *   prog = array->ptrs[index];
+ *   if (prog == NULL)
+ *     goto out;
+ *   goto *(prog->bpf_func + prologue_size);
+ * out:
+ */
+static void emit_bpf_tail_call(u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	const u8 *r1 = bpf2ia32[BPF_REG_1];
+	const u8 *r2 = bpf2ia32[BPF_REG_2];
+	const u8 *r3 = bpf2ia32[BPF_REG_3];
+	const u8 *tcc = bpf2ia32[TCALL_CNT];
+	u32 lo, hi;
+	static int jmp_label1 = -1;
+
+	/*
+	 * if (index >= array->map.max_entries)
+	 *     goto out;
+	 */
+	/* mov eax,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r2[0]));
+	/* mov edx,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(r3[0]));
+
+	/* cmp dword ptr [eax+off],edx */
+	EMIT3(0x39, add_2reg(0x40, IA32_EAX, IA32_EDX),
+	      offsetof(struct bpf_array, map.max_entries));
+	/* jbe out */
+	EMIT2(IA32_JBE, jmp_label(jmp_label1, 2));
+
+	/*
+	 * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+	 *     goto out;
+	 */
+	lo = (u32)MAX_TAIL_CALL_CNT;
+	hi = (u32)((u64)MAX_TAIL_CALL_CNT >> 32);
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(tcc[0]));
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[1]));
+
+	/* cmp edx,hi */
+	EMIT3(0x83, add_1reg(0xF8, IA32_EBX), hi);
+	EMIT2(IA32_JNE, 3);
+	/* cmp ecx,lo */
+	EMIT3(0x83, add_1reg(0xF8, IA32_ECX), lo);
+
+	/* ja out */
+	EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
+
+	/* add eax,0x1 */
+	EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 0x01);
+	/* adc ebx,0x0 */
+	EMIT3(0x83, add_1reg(0xD0, IA32_EBX), 0x00);
+
+	/* mov dword ptr [ebp+off],eax */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(tcc[0]));
+	/* mov dword ptr [ebp+off],edx */
+	EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[1]));
+
+	/* prog = array->ptrs[index]; */
+	/* mov edx, [eax + edx * 4 + offsetof(...)] */
+	EMIT3_off32(0x8B, 0x94, 0x90, offsetof(struct bpf_array, ptrs));
+
+	/*
+	 * if (prog == NULL)
+	 *     goto out;
+	 */
+	/* test edx,edx */
+	EMIT2(0x85, add_2reg(0xC0, IA32_EDX, IA32_EDX));
+	/* je out */
+	EMIT2(IA32_JE, jmp_label(jmp_label1, 2));
+
+	/* goto *(prog->bpf_func + prologue_size); */
+	/* mov edx, dword ptr [edx + 32] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EDX, IA32_EDX),
+	      offsetof(struct bpf_prog, bpf_func));
+	/* add edx,prologue_size */
+	EMIT3(0x83, add_1reg(0xC0, IA32_EDX), PROLOGUE_SIZE);
+
+	/* mov eax,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r1[0]));
+
+	/*
+	 * Now we're ready to jump into next BPF program:
+	 * eax == ctx (1st arg)
+	 * edx == prog->bpf_func + prologue_size
+	 */
+	RETPOLINE_EDX_BPF_JIT();
+
+	if (jmp_label1 == -1)
+		jmp_label1 = cnt;
+
+	/* out: */
+	*pprog = prog;
+}
+
+/* Push the scratch stack register on top of the stack. */
+static inline void emit_push_r64(const u8 src[], u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	/* mov ecx,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src_hi));
+	/* push ecx */
+	EMIT1(0x51);
+
+	/* mov ecx,dword ptr [ebp+off] */
+	EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src_lo));
+	/* push ecx */
+	EMIT1(0x51);
+
+	*pprog = prog;
+}
+
+static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
+		  int oldproglen, struct jit_context *ctx)
+{
+	struct bpf_insn *insn = bpf_prog->insnsi;
+	int insn_cnt = bpf_prog->len;
+	bool seen_exit = false;
+	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
+	int i, cnt = 0;
+	int proglen = 0;
+	u8 *prog = temp;
+
+	emit_prologue(&prog, bpf_prog->aux->stack_depth);
+
+	for (i = 0; i < insn_cnt; i++, insn++) {
+		const s32 imm32 = insn->imm;
+		const bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
+		const bool dstk = insn->dst_reg == BPF_REG_AX ? false : true;
+		const bool sstk = insn->src_reg == BPF_REG_AX ? false : true;
+		const u8 code = insn->code;
+		const u8 *dst = bpf2ia32[insn->dst_reg];
+		const u8 *src = bpf2ia32[insn->src_reg];
+		const u8 *r0 = bpf2ia32[BPF_REG_0];
+		s64 jmp_offset;
+		u8 jmp_cond;
+		int ilen;
+		u8 *func;
+
+		switch (code) {
+		/* ALU operations */
+		/* dst = src */
+		case BPF_ALU | BPF_MOV | BPF_K:
+		case BPF_ALU | BPF_MOV | BPF_X:
+		case BPF_ALU64 | BPF_MOV | BPF_K:
+		case BPF_ALU64 | BPF_MOV | BPF_X:
+			switch (BPF_SRC(code)) {
+			case BPF_X:
+				emit_ia32_mov_r64(is64, dst, src, dstk,
+						  sstk, &prog);
+				break;
+			case BPF_K:
+				/* Sign-extend immediate value to dst reg */
+				emit_ia32_mov_i64(is64, dst, imm32,
+						  dstk, &prog);
+				break;
+			}
+			break;
+		/* dst = dst + src/imm */
+		/* dst = dst - src/imm */
+		/* dst = dst | src/imm */
+		/* dst = dst & src/imm */
+		/* dst = dst ^ src/imm */
+		/* dst = dst * src/imm */
+		/* dst = dst << src */
+		/* dst = dst >> src */
+		case BPF_ALU | BPF_ADD | BPF_K:
+		case BPF_ALU | BPF_ADD | BPF_X:
+		case BPF_ALU | BPF_SUB | BPF_K:
+		case BPF_ALU | BPF_SUB | BPF_X:
+		case BPF_ALU | BPF_OR | BPF_K:
+		case BPF_ALU | BPF_OR | BPF_X:
+		case BPF_ALU | BPF_AND | BPF_K:
+		case BPF_ALU | BPF_AND | BPF_X:
+		case BPF_ALU | BPF_XOR | BPF_K:
+		case BPF_ALU | BPF_XOR | BPF_X:
+		case BPF_ALU64 | BPF_ADD | BPF_K:
+		case BPF_ALU64 | BPF_ADD | BPF_X:
+		case BPF_ALU64 | BPF_SUB | BPF_K:
+		case BPF_ALU64 | BPF_SUB | BPF_X:
+		case BPF_ALU64 | BPF_OR | BPF_K:
+		case BPF_ALU64 | BPF_OR | BPF_X:
+		case BPF_ALU64 | BPF_AND | BPF_K:
+		case BPF_ALU64 | BPF_AND | BPF_X:
+		case BPF_ALU64 | BPF_XOR | BPF_K:
+		case BPF_ALU64 | BPF_XOR | BPF_X:
+			switch (BPF_SRC(code)) {
+			case BPF_X:
+				emit_ia32_alu_r64(is64, BPF_OP(code), dst,
+						  src, dstk, sstk, &prog);
+				break;
+			case BPF_K:
+				emit_ia32_alu_i64(is64, BPF_OP(code), dst,
+						  imm32, dstk, &prog);
+				break;
+			}
+			break;
+		case BPF_ALU | BPF_MUL | BPF_K:
+		case BPF_ALU | BPF_MUL | BPF_X:
+			switch (BPF_SRC(code)) {
+			case BPF_X:
+				emit_ia32_mul_r(dst_lo, src_lo, dstk,
+						sstk, &prog);
+				break;
+			case BPF_K:
+				/* mov ecx,imm32*/
+				EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX),
+					    imm32);
+				emit_ia32_mul_r(dst_lo, IA32_ECX, dstk,
+						false, &prog);
+				break;
+			}
+			emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+			break;
+		case BPF_ALU | BPF_LSH | BPF_X:
+		case BPF_ALU | BPF_RSH | BPF_X:
+		case BPF_ALU | BPF_ARSH | BPF_K:
+		case BPF_ALU | BPF_ARSH | BPF_X:
+			switch (BPF_SRC(code)) {
+			case BPF_X:
+				emit_ia32_shift_r(BPF_OP(code), dst_lo, src_lo,
+						  dstk, sstk, &prog);
+				break;
+			case BPF_K:
+				/* mov ecx,imm32*/
+				EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX),
+					    imm32);
+				emit_ia32_shift_r(BPF_OP(code), dst_lo,
+						  IA32_ECX, dstk, false,
+						  &prog);
+				break;
+			}
+			emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+			break;
+		/* dst = dst / src(imm) */
+		/* dst = dst % src(imm) */
+		case BPF_ALU | BPF_DIV | BPF_K:
+		case BPF_ALU | BPF_DIV | BPF_X:
+		case BPF_ALU | BPF_MOD | BPF_K:
+		case BPF_ALU | BPF_MOD | BPF_X:
+			switch (BPF_SRC(code)) {
+			case BPF_X:
+				emit_ia32_div_mod_r(BPF_OP(code), dst_lo,
+						    src_lo, dstk, sstk, &prog);
+				break;
+			case BPF_K:
+				/* mov ecx,imm32*/
+				EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX),
+					    imm32);
+				emit_ia32_div_mod_r(BPF_OP(code), dst_lo,
+						    IA32_ECX, dstk, false,
+						    &prog);
+				break;
+			}
+			emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+			break;
+		case BPF_ALU64 | BPF_DIV | BPF_K:
+		case BPF_ALU64 | BPF_DIV | BPF_X:
+		case BPF_ALU64 | BPF_MOD | BPF_K:
+		case BPF_ALU64 | BPF_MOD | BPF_X:
+			goto notyet;
+		/* dst = dst >> imm */
+		/* dst = dst << imm */
+		case BPF_ALU | BPF_RSH | BPF_K:
+		case BPF_ALU | BPF_LSH | BPF_K:
+			if (unlikely(imm32 > 31))
+				return -EINVAL;
+			/* mov ecx,imm32*/
+			EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32);
+			emit_ia32_shift_r(BPF_OP(code), dst_lo, IA32_ECX, dstk,
+					  false, &prog);
+			emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+			break;
+		/* dst = dst << imm */
+		case BPF_ALU64 | BPF_LSH | BPF_K:
+			if (unlikely(imm32 > 63))
+				return -EINVAL;
+			emit_ia32_lsh_i64(dst, imm32, dstk, &prog);
+			break;
+		/* dst = dst >> imm */
+		case BPF_ALU64 | BPF_RSH | BPF_K:
+			if (unlikely(imm32 > 63))
+				return -EINVAL;
+			emit_ia32_rsh_i64(dst, imm32, dstk, &prog);
+			break;
+		/* dst = dst << src */
+		case BPF_ALU64 | BPF_LSH | BPF_X:
+			emit_ia32_lsh_r64(dst, src, dstk, sstk, &prog);
+			break;
+		/* dst = dst >> src */
+		case BPF_ALU64 | BPF_RSH | BPF_X:
+			emit_ia32_rsh_r64(dst, src, dstk, sstk, &prog);
+			break;
+		/* dst = dst >> src (signed) */
+		case BPF_ALU64 | BPF_ARSH | BPF_X:
+			emit_ia32_arsh_r64(dst, src, dstk, sstk, &prog);
+			break;
+		/* dst = dst >> imm (signed) */
+		case BPF_ALU64 | BPF_ARSH | BPF_K:
+			if (unlikely(imm32 > 63))
+				return -EINVAL;
+			emit_ia32_arsh_i64(dst, imm32, dstk, &prog);
+			break;
+		/* dst = ~dst */
+		case BPF_ALU | BPF_NEG:
+			emit_ia32_alu_i(is64, false, BPF_OP(code),
+					dst_lo, 0, dstk, &prog);
+			emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+			break;
+		/* dst = ~dst (64 bit) */
+		case BPF_ALU64 | BPF_NEG:
+			emit_ia32_neg64(dst, dstk, &prog);
+			break;
+		/* dst = dst * src/imm */
+		case BPF_ALU64 | BPF_MUL | BPF_X:
+		case BPF_ALU64 | BPF_MUL | BPF_K:
+			switch (BPF_SRC(code)) {
+			case BPF_X:
+				emit_ia32_mul_r64(dst, src, dstk, sstk, &prog);
+				break;
+			case BPF_K:
+				emit_ia32_mul_i64(dst, imm32, dstk, &prog);
+				break;
+			}
+			break;
+		/* dst = htole(dst) */
+		case BPF_ALU | BPF_END | BPF_FROM_LE:
+			emit_ia32_to_le_r64(dst, imm32, dstk, &prog);
+			break;
+		/* dst = htobe(dst) */
+		case BPF_ALU | BPF_END | BPF_FROM_BE:
+			emit_ia32_to_be_r64(dst, imm32, dstk, &prog);
+			break;
+		/* dst = imm64 */
+		case BPF_LD | BPF_IMM | BPF_DW: {
+			s32 hi, lo = imm32;
+
+			hi = insn[1].imm;
+			emit_ia32_mov_i(dst_lo, lo, dstk, &prog);
+			emit_ia32_mov_i(dst_hi, hi, dstk, &prog);
+			insn++;
+			i++;
+			break;
+		}
+		/* ST: *(u8*)(dst_reg + off) = imm */
+		case BPF_ST | BPF_MEM | BPF_H:
+		case BPF_ST | BPF_MEM | BPF_B:
+		case BPF_ST | BPF_MEM | BPF_W:
+		case BPF_ST | BPF_MEM | BPF_DW:
+			if (dstk)
+				/* mov eax,dword ptr [ebp+off] */
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+				      STACK_VAR(dst_lo));
+			else
+				/* mov eax,dst_lo */
+				EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
+
+			switch (BPF_SIZE(code)) {
+			case BPF_B:
+				EMIT(0xC6, 1); break;
+			case BPF_H:
+				EMIT2(0x66, 0xC7); break;
+			case BPF_W:
+			case BPF_DW:
+				EMIT(0xC7, 1); break;
+			}
+
+			if (is_imm8(insn->off))
+				EMIT2(add_1reg(0x40, IA32_EAX), insn->off);
+			else
+				EMIT1_off32(add_1reg(0x80, IA32_EAX),
+					    insn->off);
+			EMIT(imm32, bpf_size_to_x86_bytes(BPF_SIZE(code)));
+
+			if (BPF_SIZE(code) == BPF_DW) {
+				u32 hi;
+
+				hi = imm32 & (1<<31) ? (u32)~0 : 0;
+				EMIT2_off32(0xC7, add_1reg(0x80, IA32_EAX),
+					    insn->off + 4);
+				EMIT(hi, 4);
+			}
+			break;
+
+		/* STX: *(u8*)(dst_reg + off) = src_reg */
+		case BPF_STX | BPF_MEM | BPF_B:
+		case BPF_STX | BPF_MEM | BPF_H:
+		case BPF_STX | BPF_MEM | BPF_W:
+		case BPF_STX | BPF_MEM | BPF_DW:
+			if (dstk)
+				/* mov eax,dword ptr [ebp+off] */
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+				      STACK_VAR(dst_lo));
+			else
+				/* mov eax,dst_lo */
+				EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
+
+			if (sstk)
+				/* mov edx,dword ptr [ebp+off] */
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+				      STACK_VAR(src_lo));
+			else
+				/* mov edx,src_lo */
+				EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_EDX));
+
+			switch (BPF_SIZE(code)) {
+			case BPF_B:
+				EMIT(0x88, 1); break;
+			case BPF_H:
+				EMIT2(0x66, 0x89); break;
+			case BPF_W:
+			case BPF_DW:
+				EMIT(0x89, 1); break;
+			}
+
+			if (is_imm8(insn->off))
+				EMIT2(add_2reg(0x40, IA32_EAX, IA32_EDX),
+				      insn->off);
+			else
+				EMIT1_off32(add_2reg(0x80, IA32_EAX, IA32_EDX),
+					    insn->off);
+
+			if (BPF_SIZE(code) == BPF_DW) {
+				if (sstk)
+					/* mov edi,dword ptr [ebp+off] */
+					EMIT3(0x8B, add_2reg(0x40, IA32_EBP,
+							     IA32_EDX),
+					      STACK_VAR(src_hi));
+				else
+					/* mov edi,src_hi */
+					EMIT2(0x8B, add_2reg(0xC0, src_hi,
+							     IA32_EDX));
+				EMIT1(0x89);
+				if (is_imm8(insn->off + 4)) {
+					EMIT2(add_2reg(0x40, IA32_EAX,
+						       IA32_EDX),
+					      insn->off + 4);
+				} else {
+					EMIT1(add_2reg(0x80, IA32_EAX,
+						       IA32_EDX));
+					EMIT(insn->off + 4, 4);
+				}
+			}
+			break;
+
+		/* LDX: dst_reg = *(u8*)(src_reg + off) */
+		case BPF_LDX | BPF_MEM | BPF_B:
+		case BPF_LDX | BPF_MEM | BPF_H:
+		case BPF_LDX | BPF_MEM | BPF_W:
+		case BPF_LDX | BPF_MEM | BPF_DW:
+			if (sstk)
+				/* mov eax,dword ptr [ebp+off] */
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+				      STACK_VAR(src_lo));
+			else
+				/* mov eax,dword ptr [ebp+off] */
+				EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_EAX));
+
+			switch (BPF_SIZE(code)) {
+			case BPF_B:
+				EMIT2(0x0F, 0xB6); break;
+			case BPF_H:
+				EMIT2(0x0F, 0xB7); break;
+			case BPF_W:
+			case BPF_DW:
+				EMIT(0x8B, 1); break;
+			}
+
+			if (is_imm8(insn->off))
+				EMIT2(add_2reg(0x40, IA32_EAX, IA32_EDX),
+				      insn->off);
+			else
+				EMIT1_off32(add_2reg(0x80, IA32_EAX, IA32_EDX),
+					    insn->off);
+
+			if (dstk)
+				/* mov dword ptr [ebp+off],edx */
+				EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+				      STACK_VAR(dst_lo));
+			else
+				/* mov dst_lo,edx */
+				EMIT2(0x89, add_2reg(0xC0, dst_lo, IA32_EDX));
+			switch (BPF_SIZE(code)) {
+			case BPF_B:
+			case BPF_H:
+			case BPF_W:
+				if (dstk) {
+					EMIT3(0xC7, add_1reg(0x40, IA32_EBP),
+					      STACK_VAR(dst_hi));
+					EMIT(0x0, 4);
+				} else {
+					EMIT3(0xC7, add_1reg(0xC0, dst_hi), 0);
+				}
+				break;
+			case BPF_DW:
+				EMIT2_off32(0x8B,
+					    add_2reg(0x80, IA32_EAX, IA32_EDX),
+					    insn->off + 4);
+				if (dstk)
+					EMIT3(0x89,
+					      add_2reg(0x40, IA32_EBP,
+						       IA32_EDX),
+					      STACK_VAR(dst_hi));
+				else
+					EMIT2(0x89,
+					      add_2reg(0xC0, dst_hi, IA32_EDX));
+				break;
+			default:
+				break;
+			}
+			break;
+		/* call */
+		case BPF_JMP | BPF_CALL:
+		{
+			const u8 *r1 = bpf2ia32[BPF_REG_1];
+			const u8 *r2 = bpf2ia32[BPF_REG_2];
+			const u8 *r3 = bpf2ia32[BPF_REG_3];
+			const u8 *r4 = bpf2ia32[BPF_REG_4];
+			const u8 *r5 = bpf2ia32[BPF_REG_5];
+
+			if (insn->src_reg == BPF_PSEUDO_CALL)
+				goto notyet;
+
+			func = (u8 *) __bpf_call_base + imm32;
+			jmp_offset = func - (image + addrs[i]);
+
+			if (!imm32 || !is_simm32(jmp_offset)) {
+				pr_err("unsupported BPF func %d addr %p image %p\n",
+				       imm32, func, image);
+				return -EINVAL;
+			}
+
+			/* mov eax,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+			      STACK_VAR(r1[0]));
+			/* mov edx,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+			      STACK_VAR(r1[1]));
+
+			emit_push_r64(r5, &prog);
+			emit_push_r64(r4, &prog);
+			emit_push_r64(r3, &prog);
+			emit_push_r64(r2, &prog);
+
+			EMIT1_off32(0xE8, jmp_offset + 9);
+
+			/* mov dword ptr [ebp+off],eax */
+			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+			      STACK_VAR(r0[0]));
+			/* mov dword ptr [ebp+off],edx */
+			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+			      STACK_VAR(r0[1]));
+
+			/* add esp,32 */
+			EMIT3(0x83, add_1reg(0xC0, IA32_ESP), 32);
+			break;
+		}
+		case BPF_JMP | BPF_TAIL_CALL:
+			emit_bpf_tail_call(&prog);
+			break;
+
+		/* cond jump */
+		case BPF_JMP | BPF_JEQ | BPF_X:
+		case BPF_JMP | BPF_JNE | BPF_X:
+		case BPF_JMP | BPF_JGT | BPF_X:
+		case BPF_JMP | BPF_JLT | BPF_X:
+		case BPF_JMP | BPF_JGE | BPF_X:
+		case BPF_JMP | BPF_JLE | BPF_X:
+		case BPF_JMP | BPF_JSGT | BPF_X:
+		case BPF_JMP | BPF_JSLE | BPF_X:
+		case BPF_JMP | BPF_JSLT | BPF_X:
+		case BPF_JMP | BPF_JSGE | BPF_X: {
+			u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+			u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+			u8 sreg_lo = sstk ? IA32_ECX : src_lo;
+			u8 sreg_hi = sstk ? IA32_EBX : src_hi;
+
+			if (dstk) {
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+				      STACK_VAR(dst_lo));
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+				      STACK_VAR(dst_hi));
+			}
+
+			if (sstk) {
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+				      STACK_VAR(src_lo));
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX),
+				      STACK_VAR(src_hi));
+			}
+
+			/* cmp dreg_hi,sreg_hi */
+			EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi));
+			EMIT2(IA32_JNE, 2);
+			/* cmp dreg_lo,sreg_lo */
+			EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo));
+			goto emit_cond_jmp;
+		}
+		case BPF_JMP | BPF_JSET | BPF_X: {
+			u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+			u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+			u8 sreg_lo = sstk ? IA32_ECX : src_lo;
+			u8 sreg_hi = sstk ? IA32_EBX : src_hi;
+
+			if (dstk) {
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+				      STACK_VAR(dst_lo));
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+				      STACK_VAR(dst_hi));
+			}
+
+			if (sstk) {
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+				      STACK_VAR(src_lo));
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX),
+				      STACK_VAR(src_hi));
+			}
+			/* and dreg_lo,sreg_lo */
+			EMIT2(0x23, add_2reg(0xC0, sreg_lo, dreg_lo));
+			/* and dreg_hi,sreg_hi */
+			EMIT2(0x23, add_2reg(0xC0, sreg_hi, dreg_hi));
+			/* or dreg_lo,dreg_hi */
+			EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi));
+			goto emit_cond_jmp;
+		}
+		case BPF_JMP | BPF_JSET | BPF_K: {
+			u32 hi;
+			u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+			u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+			u8 sreg_lo = IA32_ECX;
+			u8 sreg_hi = IA32_EBX;
+
+			if (dstk) {
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+				      STACK_VAR(dst_lo));
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+				      STACK_VAR(dst_hi));
+			}
+			hi = imm32 & (1<<31) ? (u32)~0 : 0;
+
+			/* mov ecx,imm32 */
+			EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32);
+			/* mov ebx,imm32 */
+			EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX), hi);
+
+			/* and dreg_lo,sreg_lo */
+			EMIT2(0x23, add_2reg(0xC0, sreg_lo, dreg_lo));
+			/* and dreg_hi,sreg_hi */
+			EMIT2(0x23, add_2reg(0xC0, sreg_hi, dreg_hi));
+			/* or dreg_lo,dreg_hi */
+			EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi));
+			goto emit_cond_jmp;
+		}
+		case BPF_JMP | BPF_JEQ | BPF_K:
+		case BPF_JMP | BPF_JNE | BPF_K:
+		case BPF_JMP | BPF_JGT | BPF_K:
+		case BPF_JMP | BPF_JLT | BPF_K:
+		case BPF_JMP | BPF_JGE | BPF_K:
+		case BPF_JMP | BPF_JLE | BPF_K:
+		case BPF_JMP | BPF_JSGT | BPF_K:
+		case BPF_JMP | BPF_JSLE | BPF_K:
+		case BPF_JMP | BPF_JSLT | BPF_K:
+		case BPF_JMP | BPF_JSGE | BPF_K: {
+			u32 hi;
+			u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+			u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+			u8 sreg_lo = IA32_ECX;
+			u8 sreg_hi = IA32_EBX;
+
+			if (dstk) {
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+				      STACK_VAR(dst_lo));
+				EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+				      STACK_VAR(dst_hi));
+			}
+
+			hi = imm32 & (1<<31) ? (u32)~0 : 0;
+			/* mov ecx,imm32 */
+			EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32);
+			/* mov ebx,imm32 */
+			EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX), hi);
+
+			/* cmp dreg_hi,sreg_hi */
+			EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi));
+			EMIT2(IA32_JNE, 2);
+			/* cmp dreg_lo,sreg_lo */
+			EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo));
+
+emit_cond_jmp:		/* Convert BPF opcode to x86 */
+			switch (BPF_OP(code)) {
+			case BPF_JEQ:
+				jmp_cond = IA32_JE;
+				break;
+			case BPF_JSET:
+			case BPF_JNE:
+				jmp_cond = IA32_JNE;
+				break;
+			case BPF_JGT:
+				/* GT is unsigned '>', JA in x86 */
+				jmp_cond = IA32_JA;
+				break;
+			case BPF_JLT:
+				/* LT is unsigned '<', JB in x86 */
+				jmp_cond = IA32_JB;
+				break;
+			case BPF_JGE:
+				/* GE is unsigned '>=', JAE in x86 */
+				jmp_cond = IA32_JAE;
+				break;
+			case BPF_JLE:
+				/* LE is unsigned '<=', JBE in x86 */
+				jmp_cond = IA32_JBE;
+				break;
+			case BPF_JSGT:
+				/* Signed '>', GT in x86 */
+				jmp_cond = IA32_JG;
+				break;
+			case BPF_JSLT:
+				/* Signed '<', LT in x86 */
+				jmp_cond = IA32_JL;
+				break;
+			case BPF_JSGE:
+				/* Signed '>=', GE in x86 */
+				jmp_cond = IA32_JGE;
+				break;
+			case BPF_JSLE:
+				/* Signed '<=', LE in x86 */
+				jmp_cond = IA32_JLE;
+				break;
+			default: /* to silence GCC warning */
+				return -EFAULT;
+			}
+			jmp_offset = addrs[i + insn->off] - addrs[i];
+			if (is_imm8(jmp_offset)) {
+				EMIT2(jmp_cond, jmp_offset);
+			} else if (is_simm32(jmp_offset)) {
+				EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset);
+			} else {
+				pr_err("cond_jmp gen bug %llx\n", jmp_offset);
+				return -EFAULT;
+			}
+
+			break;
+		}
+		case BPF_JMP | BPF_JA:
+			if (insn->off == -1)
+				/* -1 jmp instructions will always jump
+				 * backwards two bytes. Explicitly handling
+				 * this case avoids wasting too many passes
+				 * when there are long sequences of replaced
+				 * dead code.
+				 */
+				jmp_offset = -2;
+			else
+				jmp_offset = addrs[i + insn->off] - addrs[i];
+
+			if (!jmp_offset)
+				/* Optimize out nop jumps */
+				break;
+emit_jmp:
+			if (is_imm8(jmp_offset)) {
+				EMIT2(0xEB, jmp_offset);
+			} else if (is_simm32(jmp_offset)) {
+				EMIT1_off32(0xE9, jmp_offset);
+			} else {
+				pr_err("jmp gen bug %llx\n", jmp_offset);
+				return -EFAULT;
+			}
+			break;
+
+		case BPF_LD | BPF_ABS | BPF_W:
+		case BPF_LD | BPF_ABS | BPF_H:
+		case BPF_LD | BPF_ABS | BPF_B:
+		case BPF_LD | BPF_IND | BPF_W:
+		case BPF_LD | BPF_IND | BPF_H:
+		case BPF_LD | BPF_IND | BPF_B:
+		{
+			int size;
+			const u8 *r6 = bpf2ia32[BPF_REG_6];
+
+			/* Setting up first argument */
+			/* mov eax,dword ptr [ebp+off] */
+			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+			      STACK_VAR(r6[0]));
+
+			/* Setting up second argument */
+			if (BPF_MODE(code) == BPF_ABS) {
+				/* mov %edx, imm32 */
+				EMIT1_off32(0xBA, imm32);
+			} else {
+				if (sstk)
+					/* mov edx,dword ptr [ebp+off] */
+					EMIT3(0x8B, add_2reg(0x40, IA32_EBP,
+							     IA32_EDX),
+					      STACK_VAR(src_lo));
+				else
+					/* mov edx,src_lo */
+					EMIT2(0x8B, add_2reg(0xC0, src_lo,
+							     IA32_EDX));
+				if (imm32) {
+					if (is_imm8(imm32))
+						/* add %edx,imm8 */
+						EMIT3(0x83, 0xC2, imm32);
+					else
+						/* add %edx,imm32 */
+						EMIT2_off32(0x81, 0xC2, imm32);
+				}
+			}
+
+			/* Setting up third argument */
+			switch (BPF_SIZE(code)) {
+			case BPF_W:
+				size = 4;
+				break;
+			case BPF_H:
+				size = 2;
+				break;
+			case BPF_B:
+				size = 1;
+				break;
+			default:
+				return -EINVAL;
+			}
+			/* mov ecx,val */
+			EMIT2(0xB1, size);
+			/* movzx ecx,ecx */
+			EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX));
+
+			/* mov ebx,ebp */
+			EMIT2(0x8B, add_2reg(0xC0, IA32_EBP, IA32_EBX));
+			/* add %ebx,imm8 */
+			EMIT3(0x83, add_1reg(0xC0, IA32_EBX), SKB_BUFFER);
+			/* push ebx */
+			EMIT1(0x53);
+
+			/* Setting up function pointer to call */
+			/* mov ebx,imm32*/
+			EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX),
+				    (unsigned int)bpf_load_pointer);
+
+			EMIT2(0xFF, add_1reg(0xD0, IA32_EBX));
+			/* add %esp,4 */
+			EMIT3(0x83, add_1reg(0xC0, IA32_ESP), 4);
+			/* xor edx,edx */
+			EMIT2(0x33, add_2reg(0xC0, IA32_EDX, IA32_EDX));
+
+			/* mov dword ptr [ebp+off],eax */
+			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+			      STACK_VAR(r0[0]));
+			/* mov dword ptr [ebp+off],edx */
+			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+			      STACK_VAR(r0[1]));
+
+			/*
+			 * Check if return address is NULL or not.
+			 * If NULL then jump to epilogue else continue
+			 * to load the value from retn address
+			 */
+			EMIT3(0x83, add_1reg(0xF8, IA32_EAX), 0);
+			jmp_offset = ctx->cleanup_addr - addrs[i];
+
+			switch (BPF_SIZE(code)) {
+			case BPF_W:
+				jmp_offset += 7;
+				break;
+			case BPF_H:
+				jmp_offset += 10;
+				break;
+			case BPF_B:
+				jmp_offset += 6;
+				break;
+			}
+
+			EMIT2_off32(0x0F, IA32_JE + 0x10, jmp_offset);
+			/* Load value from the address */
+			switch (BPF_SIZE(code)) {
+			case BPF_W:
+				/* mov eax,[eax] */
+				EMIT2(0x8B, 0x0);
+				/* Emit 'bswap eax' */
+				EMIT2(0x0F, add_1reg(0xC8, IA32_EAX));
+				break;
+			case BPF_H:
+				EMIT3(0x0F, 0xB7, 0x0);
+				EMIT1(0x66);
+				EMIT3(0xC1, add_1reg(0xC8, IA32_EAX), 8);
+				break;
+			case BPF_B:
+				EMIT3(0x0F, 0xB6, 0x0);
+				break;
+			}
+
+			/* mov dword ptr [ebp+off],eax */
+			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+			      STACK_VAR(r0[0]));
+			break;
+		}
+		/* STX XADD: lock *(u32 *)(dst + off) += src */
+		case BPF_STX | BPF_XADD | BPF_W:
+		/* STX XADD: lock *(u64 *)(dst + off) += src */
+		case BPF_STX | BPF_XADD | BPF_DW:
+			goto notyet;
+		case BPF_JMP | BPF_EXIT:
+			if (seen_exit) {
+				jmp_offset = ctx->cleanup_addr - addrs[i];
+				goto emit_jmp;
+			}
+			seen_exit = true;
+			/* Update cleanup_addr */
+			ctx->cleanup_addr = proglen;
+			emit_epilogue(&prog, bpf_prog->aux->stack_depth);
+			break;
+notyet:
+			pr_info_once("*** NOT YET: opcode %02x ***\n", code);
+			return -EFAULT;
+		default:
+			/*
+			 * This error will be seen if new instruction was added
+			 * to interpreter, but not to JIT or if there is junk in
+			 * bpf_prog
+			 */
+			pr_err("bpf_jit: unknown opcode %02x\n", code);
+			return -EINVAL;
+		}
+
+		ilen = prog - temp;
+		if (ilen > BPF_MAX_INSN_SIZE) {
+			pr_err("bpf_jit: fatal insn size error\n");
+			return -EFAULT;
+		}
+
+		if (image) {
+			if (unlikely(proglen + ilen > oldproglen)) {
+				pr_err("bpf_jit: fatal error\n");
+				return -EFAULT;
+			}
+			memcpy(image + proglen, temp, ilen);
+		}
+		proglen += ilen;
+		addrs[i] = proglen;
+		prog = temp;
+	}
+	return proglen;
+}
+
+struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
+{
+	struct bpf_binary_header *header = NULL;
+	struct bpf_prog *tmp, *orig_prog = prog;
+	int proglen, oldproglen = 0;
+	struct jit_context ctx = {};
+	bool tmp_blinded = false;
+	u8 *image = NULL;
+	int *addrs;
+	int pass;
+	int i;
+
+	if (!prog->jit_requested)
+		return orig_prog;
+
+	tmp = bpf_jit_blind_constants(prog);
+	/*
+	 * If blinding was requested and we failed during blinding,
+	 * we must fall back to the interpreter.
+	 */
+	if (IS_ERR(tmp))
+		return orig_prog;
+	if (tmp != prog) {
+		tmp_blinded = true;
+		prog = tmp;
+	}
+
+	addrs = kmalloc(prog->len * sizeof(*addrs), GFP_KERNEL);
+	if (!addrs) {
+		prog = orig_prog;
+		goto out;
+	}
+
+	/*
+	 * Before first pass, make a rough estimation of addrs[]
+	 * each BPF instruction is translated to less than 64 bytes
+	 */
+	for (proglen = 0, i = 0; i < prog->len; i++) {
+		proglen += 64;
+		addrs[i] = proglen;
+	}
+	ctx.cleanup_addr = proglen;
+
+	/*
+	 * JITed image shrinks with every pass and the loop iterates
+	 * until the image stops shrinking. Very large BPF programs
+	 * may converge on the last pass. In such case do one more
+	 * pass to emit the final image.
+	 */
+	for (pass = 0; pass < 20 || image; pass++) {
+		proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
+		if (proglen <= 0) {
+out_image:
+			image = NULL;
+			if (header)
+				bpf_jit_binary_free(header);
+			prog = orig_prog;
+			goto out_addrs;
+		}
+		if (image) {
+			if (proglen != oldproglen) {
+				pr_err("bpf_jit: proglen=%d != oldproglen=%d\n",
+				       proglen, oldproglen);
+				goto out_image;
+			}
+			break;
+		}
+		if (proglen == oldproglen) {
+			header = bpf_jit_binary_alloc(proglen, &image,
+						      1, jit_fill_hole);
+			if (!header) {
+				prog = orig_prog;
+				goto out_addrs;
+			}
+		}
+		oldproglen = proglen;
+		cond_resched();
+	}
+
+	if (bpf_jit_enable > 1)
+		bpf_jit_dump(prog->len, proglen, pass + 1, image);
+
+	if (image) {
+		bpf_jit_binary_lock_ro(header);
+		prog->bpf_func = (void *)image;
+		prog->jited = 1;
+		prog->jited_len = proglen;
+	} else {
+		prog = orig_prog;
+	}
+
+out_addrs:
+	kfree(addrs);
+out:
+	if (tmp_blinded)
+		bpf_jit_prog_release_other(prog, prog == orig_prog ?
+					   tmp : orig_prog);
+	return prog;
+}
-- 
cgit v1.2.3-59-g8ed1b


From 5f110899ae1494d47678f88e3c53c4c5f67077c4 Mon Sep 17 00:00:00 2001
From: Ganesh Goudar <ganeshgr@chelsio.com>
Date: Thu, 3 May 2018 11:54:23 +0530
Subject: cxgb4: update latest firmware version supported

Change t4fw_version.h to update latest firmware version
number to 1.19.1.0.

Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/t4fw_version.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_version.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_version.h
index 123e2c1b65f5..4eb15ceddca3 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_version.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_version.h
@@ -36,8 +36,8 @@
 #define __T4FW_VERSION_H__
 
 #define T4FW_VERSION_MAJOR 0x01
-#define T4FW_VERSION_MINOR 0x10
-#define T4FW_VERSION_MICRO 0x3F
+#define T4FW_VERSION_MINOR 0x13
+#define T4FW_VERSION_MICRO 0x01
 #define T4FW_VERSION_BUILD 0x00
 
 #define T4FW_MIN_VERSION_MAJOR 0x01
@@ -45,8 +45,8 @@
 #define T4FW_MIN_VERSION_MICRO 0x00
 
 #define T5FW_VERSION_MAJOR 0x01
-#define T5FW_VERSION_MINOR 0x10
-#define T5FW_VERSION_MICRO 0x3F
+#define T5FW_VERSION_MINOR 0x13
+#define T5FW_VERSION_MICRO 0x01
 #define T5FW_VERSION_BUILD 0x00
 
 #define T5FW_MIN_VERSION_MAJOR 0x00
@@ -54,8 +54,8 @@
 #define T5FW_MIN_VERSION_MICRO 0x00
 
 #define T6FW_VERSION_MAJOR 0x01
-#define T6FW_VERSION_MINOR 0x10
-#define T6FW_VERSION_MICRO 0x3F
+#define T6FW_VERSION_MINOR 0x13
+#define T6FW_VERSION_MICRO 0x01
 #define T6FW_VERSION_BUILD 0x00
 
 #define T6FW_MIN_VERSION_MAJOR 0x00
-- 
cgit v1.2.3-59-g8ed1b


From 9413248753c4c5eba5a9e2548e7d98c73230763d Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Thu, 3 May 2018 10:51:32 +0300
Subject: selftests: forwarding: Increase maximum deviation in multipath test

We sometimes observe failures in the test due to too large discrepancy
between the measured and expected ratios. For example:

TEST: ECMP                                                          [FAIL]
        Too large discrepancy between expected and measured ratios
        INFO: Expected ratio 1.00 Measured ratio 1.11

Fix this by allowing an up to 15% deviation between both ratios.

Another possibility is to increase the number of generated flows, but
this will prolong the execution time of the test, which is already quite
high.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/forwarding/router_multipath.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/forwarding/router_multipath.sh b/tools/testing/selftests/net/forwarding/router_multipath.sh
index 3bc351008db6..2bd3d41354d0 100755
--- a/tools/testing/selftests/net/forwarding/router_multipath.sh
+++ b/tools/testing/selftests/net/forwarding/router_multipath.sh
@@ -191,7 +191,7 @@ multipath_eval()
        diff=$(echo $weights_ratio - $packets_ratio | bc -l)
        diff=${diff#-}
 
-       test "$(echo "$diff / $weights_ratio > 0.1" | bc -l)" -eq 0
+       test "$(echo "$diff / $weights_ratio > 0.15" | bc -l)" -eq 0
        check_err $? "Too large discrepancy between expected and measured ratios"
        log_test "$desc"
        log_info "Expected ratio $weights_ratio Measured ratio $packets_ratio"
-- 
cgit v1.2.3-59-g8ed1b


From 0eb8053c147b012dcb6c2c1deeb4d069a5b35fd9 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Thu, 3 May 2018 10:51:33 +0300
Subject: selftests: forwarding: Allow running specific tests

Similar to commit a511858c7536 ("selftests: fib_tests: Allow user to run
a specific test"), allow user to run only a subset of the tests using
the TESTS environment variable.

This is useful when not all the tests can pass on a given system.

Example:
# export TESTS="ping_ipv4 ping_ipv6"
# ./bridge_vlan_aware.sh
TEST: ping					[PASS]
TEST: ping6					[PASS]

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../selftests/net/forwarding/bridge_vlan_aware.sh  | 26 +++++++++++++---
 .../net/forwarding/bridge_vlan_unaware.sh          | 26 +++++++++++++---
 tools/testing/selftests/net/forwarding/lib.sh      |  9 ++++++
 .../testing/selftests/net/forwarding/mirror_gre.sh | 36 +++++++++++++++++-----
 .../selftests/net/forwarding/mirror_gre_bound.sh   | 23 +++++++++++---
 .../selftests/net/forwarding/mirror_gre_changes.sh | 29 ++++++++++++++---
 .../selftests/net/forwarding/mirror_gre_flower.sh  | 23 +++++++++++---
 .../selftests/net/forwarding/mirror_gre_neigh.sh   | 22 ++++++++++---
 .../selftests/net/forwarding/mirror_gre_nh.sh      |  8 +++--
 tools/testing/selftests/net/forwarding/router.sh   | 14 +++++++--
 .../selftests/net/forwarding/router_multipath.sh   | 15 +++++++--
 .../testing/selftests/net/forwarding/tc_actions.sh | 25 ++++++++++-----
 .../testing/selftests/net/forwarding/tc_chains.sh  |  7 ++---
 .../testing/selftests/net/forwarding/tc_flower.sh  | 14 +++------
 .../selftests/net/forwarding/tc_shblocks.sh        |  5 +--
 15 files changed, 219 insertions(+), 63 deletions(-)

diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh
index 75d922438bc9..d8313d0438b7 100755
--- a/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh
+++ b/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
+ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding"
 NUM_NETIFS=4
 CHECK_TC="yes"
 source lib.sh
@@ -75,14 +76,31 @@ cleanup()
 	vrf_cleanup
 }
 
+ping_ipv4()
+{
+	ping_test $h1 192.0.2.2
+}
+
+ping_ipv6()
+{
+	ping6_test $h1 2001:db8:1::2
+}
+
+learning()
+{
+	learning_test "br0" $swp1 $h1 $h2
+}
+
+flooding()
+{
+	flood_test $swp2 $h1 $h2
+}
+
 trap cleanup EXIT
 
 setup_prepare
 setup_wait
 
-ping_test $h1 192.0.2.2
-ping6_test $h1 2001:db8:1::2
-learning_test "br0" $swp1 $h1 $h2
-flood_test $swp2 $h1 $h2
+tests_run
 
 exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_unaware.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_unaware.sh
index 1cddf06f691d..c15c6c85c984 100755
--- a/tools/testing/selftests/net/forwarding/bridge_vlan_unaware.sh
+++ b/tools/testing/selftests/net/forwarding/bridge_vlan_unaware.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
+ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding"
 NUM_NETIFS=4
 source lib.sh
 
@@ -73,14 +74,31 @@ cleanup()
 	vrf_cleanup
 }
 
+ping_ipv4()
+{
+	ping_test $h1 192.0.2.2
+}
+
+ping_ipv6()
+{
+	ping6_test $h1 2001:db8:1::2
+}
+
+learning()
+{
+	learning_test "br0" $swp1 $h1 $h2
+}
+
+flooding()
+{
+	flood_test $swp2 $h1 $h2
+}
+
 trap cleanup EXIT
 
 setup_prepare
 setup_wait
 
-ping_test $h1 192.0.2.2
-ping6_test $h1 2001:db8:1::2
-learning_test "br0" $swp1 $h1 $h2
-flood_test $swp2 $h1 $h2
+tests_run
 
 exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index a066ca536ac4..061c87bbf77c 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -477,6 +477,15 @@ matchall_sink_create()
 	   action drop
 }
 
+tests_run()
+{
+	local current_test
+
+	for current_test in ${TESTS:-$ALL_TESTS}; do
+		$current_test
+	done
+}
+
 ##############################################################################
 # Tests
 
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre.sh b/tools/testing/selftests/net/forwarding/mirror_gre.sh
index a8abc736f67c..c6786d1b2b96 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre.sh
@@ -10,6 +10,14 @@
 # traffic. Test that the payload is what is expected (ICMP ping request or
 # reply, depending on test).
 
+ALL_TESTS="
+	test_gretap
+	test_ip6gretap
+	test_gretap_mac
+	test_ip6gretap_mac
+	test_two_spans
+"
+
 NUM_NETIFS=6
 source lib.sh
 source mirror_lib.sh
@@ -100,22 +108,36 @@ test_two_spans()
 	log_test "two simultaneously configured mirrors ($tcflags)"
 }
 
-test_all()
+test_gretap()
 {
-	slow_path_trap_install $swp1 ingress
-	slow_path_trap_install $swp1 egress
-
 	full_test_span_gre_dir gt4 ingress 8 0 "mirror to gretap"
-	full_test_span_gre_dir gt6 ingress 8 0 "mirror to ip6gretap"
 	full_test_span_gre_dir gt4 egress 0 8 "mirror to gretap"
+}
+
+test_ip6gretap()
+{
+	full_test_span_gre_dir gt6 ingress 8 0 "mirror to ip6gretap"
 	full_test_span_gre_dir gt6 egress 0 8 "mirror to ip6gretap"
+}
 
+test_gretap_mac()
+{
 	test_span_gre_mac gt4 ingress ip "mirror to gretap"
-	test_span_gre_mac gt6 ingress ipv6 "mirror to ip6gretap"
 	test_span_gre_mac gt4 egress ip "mirror to gretap"
+}
+
+test_ip6gretap_mac()
+{
+	test_span_gre_mac gt6 ingress ipv6 "mirror to ip6gretap"
 	test_span_gre_mac gt6 egress ipv6 "mirror to ip6gretap"
+}
 
-	test_two_spans
+test_all()
+{
+	slow_path_trap_install $swp1 ingress
+	slow_path_trap_install $swp1 egress
+
+	tests_run
 
 	slow_path_trap_uninstall $swp1 egress
 	slow_path_trap_uninstall $swp1 ingress
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_bound.sh b/tools/testing/selftests/net/forwarding/mirror_gre_bound.sh
index 3708ac0f400a..360ca133bead 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_bound.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_bound.sh
@@ -42,6 +42,11 @@
 # underlay manner, i.e. with a bound dummy device that marks underlay VRF where
 # the encapsulated packed should be routed.
 
+ALL_TESTS="
+	test_gretap
+	test_ip6gretap
+"
+
 NUM_NETIFS=6
 source lib.sh
 source mirror_lib.sh
@@ -178,6 +183,18 @@ cleanup()
 	vrf_cleanup
 }
 
+test_gretap()
+{
+	full_test_span_gre_dir gt4 ingress 8 0 "mirror to gretap w/ UL"
+	full_test_span_gre_dir gt4 egress  0 8 "mirror to gretap w/ UL"
+}
+
+test_ip6gretap()
+{
+	full_test_span_gre_dir gt6 ingress 8 0 "mirror to ip6gretap w/ UL"
+	full_test_span_gre_dir gt6 egress  0 8 "mirror to ip6gretap w/ UL"
+}
+
 test_all()
 {
 	RET=0
@@ -185,11 +202,7 @@ test_all()
 	slow_path_trap_install $swp1 ingress
 	slow_path_trap_install $swp1 egress
 
-	full_test_span_gre_dir gt4 ingress 8 0 "mirror to gretap w/ UL"
-	full_test_span_gre_dir gt6 ingress 8 0 "mirror to ip6gretap w/ UL"
-
-	full_test_span_gre_dir gt4 egress  0 8 "mirror to gretap w/ UL"
-	full_test_span_gre_dir gt6 egress  0 8 "mirror to ip6gretap w/ UL"
+	tests_run
 
 	slow_path_trap_uninstall $swp1 egress
 	slow_path_trap_uninstall $swp1 ingress
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh b/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh
index 0ed288ac76d2..fdb612f69613 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh
@@ -7,6 +7,13 @@
 # Test how mirrors to gretap and ip6gretap react to changes to relevant
 # configuration.
 
+ALL_TESTS="
+	test_ttl
+	test_tun_up
+	test_egress_up
+	test_remote_ip
+"
+
 NUM_NETIFS=6
 source lib.sh
 source mirror_lib.sh
@@ -155,22 +162,36 @@ test_span_gre_remote_ip()
 	log_test "$what: remote address change ($tcflags)"
 }
 
-test_all()
+test_ttl()
 {
-	slow_path_trap_install $swp1 ingress
-	slow_path_trap_install $swp1 egress
-
 	test_span_gre_ttl gt4 gretap ip "mirror to gretap"
 	test_span_gre_ttl gt6 ip6gretap ipv6 "mirror to ip6gretap"
+}
 
+test_tun_up()
+{
 	test_span_gre_tun_up gt4 "mirror to gretap"
 	test_span_gre_tun_up gt6 "mirror to ip6gretap"
+}
 
+test_egress_up()
+{
 	test_span_gre_egress_up gt4 192.0.2.130 "mirror to gretap"
 	test_span_gre_egress_up gt6 2001:db8:2::2 "mirror to ip6gretap"
+}
 
+test_remote_ip()
+{
 	test_span_gre_remote_ip gt4 gretap 192.0.2.130 192.0.2.132 "mirror to gretap"
 	test_span_gre_remote_ip gt6 ip6gretap 2001:db8:2::2 2001:db8:2::4 "mirror to ip6gretap"
+}
+
+test_all()
+{
+	slow_path_trap_install $swp1 ingress
+	slow_path_trap_install $swp1 egress
+
+	tests_run
 
 	slow_path_trap_uninstall $swp1 egress
 	slow_path_trap_uninstall $swp1 ingress
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_flower.sh b/tools/testing/selftests/net/forwarding/mirror_gre_flower.sh
index 178a42d771aa..2e54407d8954 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_flower.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_flower.sh
@@ -10,6 +10,11 @@
 # this address, mirroring takes place, whereas when pinging the other one,
 # there's no mirroring.
 
+ALL_TESTS="
+	test_gretap
+	test_ip6gretap
+"
+
 NUM_NETIFS=6
 source lib.sh
 source mirror_lib.sh
@@ -81,6 +86,18 @@ full_test_span_gre_dir_acl()
 	log_test "$direction $what ($tcflags)"
 }
 
+test_gretap()
+{
+	full_test_span_gre_dir_acl gt4 ingress 8 0 192.0.2.4 "ACL mirror to gretap"
+	full_test_span_gre_dir_acl gt4 egress 0 8 192.0.2.3 "ACL mirror to gretap"
+}
+
+test_ip6gretap()
+{
+	full_test_span_gre_dir_acl gt6 ingress 8 0 192.0.2.4 "ACL mirror to ip6gretap"
+	full_test_span_gre_dir_acl gt6 egress 0 8 192.0.2.3 "ACL mirror to ip6gretap"
+}
+
 test_all()
 {
 	RET=0
@@ -88,11 +105,7 @@ test_all()
 	slow_path_trap_install $swp1 ingress
 	slow_path_trap_install $swp1 egress
 
-	full_test_span_gre_dir_acl gt4 ingress 8 0 192.0.2.4 "ACL mirror to gretap"
-	full_test_span_gre_dir_acl gt6 ingress 8 0 192.0.2.4 "ACL mirror to ip6gretap"
-
-	full_test_span_gre_dir_acl gt4 egress 0 8 192.0.2.3 "ACL mirror to gretap"
-	full_test_span_gre_dir_acl gt6 egress 0 8 192.0.2.3 "ACL mirror to ip6gretap"
+	tests_run
 
 	slow_path_trap_uninstall $swp1 egress
 	slow_path_trap_uninstall $swp1 ingress
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_neigh.sh b/tools/testing/selftests/net/forwarding/mirror_gre_neigh.sh
index 1ca29ba4f338..fc0508e40fca 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_neigh.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_neigh.sh
@@ -9,6 +9,11 @@
 # is set up. Later on, the neighbor is deleted and it is expected to be
 # reinitialized using the usual ARP process, and the mirroring offload updated.
 
+ALL_TESTS="
+	test_gretap
+	test_ip6gretap
+"
+
 NUM_NETIFS=6
 source lib.sh
 source mirror_lib.sh
@@ -69,15 +74,24 @@ test_span_gre_neigh()
 	log_test "$direction $what: neighbor change ($tcflags)"
 }
 
-test_all()
+test_gretap()
 {
-	slow_path_trap_install $swp1 ingress
-	slow_path_trap_install $swp1 egress
-
 	test_span_gre_neigh 192.0.2.130 gt4 ingress "mirror to gretap"
 	test_span_gre_neigh 192.0.2.130 gt4 egress "mirror to gretap"
+}
+
+test_ip6gretap()
+{
 	test_span_gre_neigh 2001:db8:2::2 gt6 ingress "mirror to ip6gretap"
 	test_span_gre_neigh 2001:db8:2::2 gt6 egress "mirror to ip6gretap"
+}
+
+test_all()
+{
+	slow_path_trap_install $swp1 ingress
+	slow_path_trap_install $swp1 egress
+
+	tests_run
 
 	slow_path_trap_uninstall $swp1 egress
 	slow_path_trap_uninstall $swp1 ingress
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_nh.sh b/tools/testing/selftests/net/forwarding/mirror_gre_nh.sh
index 9ac70978541f..a0d1ad46a2bc 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_nh.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_nh.sh
@@ -7,6 +7,11 @@
 # Test that gretap and ip6gretap mirroring works when the other tunnel endpoint
 # is reachable through a next-hop route (as opposed to directly-attached route).
 
+ALL_TESTS="
+	test_gretap
+	test_ip6gretap
+"
+
 NUM_NETIFS=6
 source lib.sh
 source mirror_lib.sh
@@ -92,8 +97,7 @@ test_all()
 	slow_path_trap_install $swp1 ingress
 	slow_path_trap_install $swp1 egress
 
-	test_gretap
-	test_ip6gretap
+	tests_run
 
 	slow_path_trap_uninstall $swp1 egress
 	slow_path_trap_uninstall $swp1 ingress
diff --git a/tools/testing/selftests/net/forwarding/router.sh b/tools/testing/selftests/net/forwarding/router.sh
index cc6a14abfa87..a75cb51cc5bd 100755
--- a/tools/testing/selftests/net/forwarding/router.sh
+++ b/tools/testing/selftests/net/forwarding/router.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
+ALL_TESTS="ping_ipv4 ping_ipv6"
 NUM_NETIFS=4
 source lib.sh
 
@@ -114,12 +115,21 @@ cleanup()
 	vrf_cleanup
 }
 
+ping_ipv4()
+{
+	ping_test $h1 198.51.100.2
+}
+
+ping_ipv6()
+{
+	ping6_test $h1 2001:db8:2::2
+}
+
 trap cleanup EXIT
 
 setup_prepare
 setup_wait
 
-ping_test $h1 198.51.100.2
-ping6_test $h1 2001:db8:2::2
+tests_run
 
 exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/router_multipath.sh b/tools/testing/selftests/net/forwarding/router_multipath.sh
index 2bd3d41354d0..6c4376289695 100755
--- a/tools/testing/selftests/net/forwarding/router_multipath.sh
+++ b/tools/testing/selftests/net/forwarding/router_multipath.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
+ALL_TESTS="ping_ipv4 ping_ipv6 multipath_test"
 NUM_NETIFS=8
 source lib.sh
 
@@ -364,13 +365,21 @@ cleanup()
 	vrf_cleanup
 }
 
+ping_ipv4()
+{
+	ping_test $h1 198.51.100.2
+}
+
+ping_ipv6()
+{
+	ping6_test $h1 2001:db8:2::2
+}
+
 trap cleanup EXIT
 
 setup_prepare
 setup_wait
 
-ping_test $h1 198.51.100.2
-ping6_test $h1 2001:db8:2::2
-multipath_test
+tests_run
 
 exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh
index 3a6385ebd5d0..813d02d1939d 100755
--- a/tools/testing/selftests/net/forwarding/tc_actions.sh
+++ b/tools/testing/selftests/net/forwarding/tc_actions.sh
@@ -1,6 +1,8 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
+ALL_TESTS="gact_drop_and_ok_test mirred_egress_redirect_test \
+	mirred_egress_mirror_test gact_trap_test"
 NUM_NETIFS=4
 source tc_common.sh
 source lib.sh
@@ -111,6 +113,10 @@ gact_trap_test()
 {
 	RET=0
 
+	if [[ "$tcflags" != "skip_sw" ]]; then
+		return 0;
+	fi
+
 	tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \
 		skip_hw dst_ip 192.0.2.2 action drop
 	tc filter add dev $swp1 ingress protocol ip pref 3 handle 103 flower \
@@ -179,24 +185,29 @@ cleanup()
 	ip link set $swp1 address $swp1origmac
 }
 
+mirred_egress_redirect_test()
+{
+	mirred_egress_test "redirect"
+}
+
+mirred_egress_mirror_test()
+{
+	mirred_egress_test "mirror"
+}
+
 trap cleanup EXIT
 
 setup_prepare
 setup_wait
 
-gact_drop_and_ok_test
-mirred_egress_test "redirect"
-mirred_egress_test "mirror"
+tests_run
 
 tc_offload_check
 if [[ $? -ne 0 ]]; then
 	log_info "Could not test offloaded functionality"
 else
 	tcflags="skip_sw"
-	gact_drop_and_ok_test
-	mirred_egress_test "redirect"
-	mirred_egress_test "mirror"
-	gact_trap_test
+	tests_run
 fi
 
 exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/tc_chains.sh b/tools/testing/selftests/net/forwarding/tc_chains.sh
index 2fd15226974b..d2c783e94df3 100755
--- a/tools/testing/selftests/net/forwarding/tc_chains.sh
+++ b/tools/testing/selftests/net/forwarding/tc_chains.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
+ALL_TESTS="unreachable_chain_test gact_goto_chain_test"
 NUM_NETIFS=2
 source tc_common.sh
 source lib.sh
@@ -107,16 +108,14 @@ trap cleanup EXIT
 setup_prepare
 setup_wait
 
-unreachable_chain_test
-gact_goto_chain_test
+tests_run
 
 tc_offload_check
 if [[ $? -ne 0 ]]; then
 	log_info "Could not test offloaded functionality"
 else
 	tcflags="skip_sw"
-	unreachable_chain_test
-	gact_goto_chain_test
+	tests_run
 fi
 
 exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/tc_flower.sh b/tools/testing/selftests/net/forwarding/tc_flower.sh
index 0c54059f1875..20d1077e5a3d 100755
--- a/tools/testing/selftests/net/forwarding/tc_flower.sh
+++ b/tools/testing/selftests/net/forwarding/tc_flower.sh
@@ -1,6 +1,8 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
+ALL_TESTS="match_dst_mac_test match_src_mac_test match_dst_ip_test \
+	match_src_ip_test match_ip_flags_test"
 NUM_NETIFS=2
 source tc_common.sh
 source lib.sh
@@ -245,22 +247,14 @@ trap cleanup EXIT
 setup_prepare
 setup_wait
 
-match_dst_mac_test
-match_src_mac_test
-match_dst_ip_test
-match_src_ip_test
-match_ip_flags_test
+tests_run
 
 tc_offload_check
 if [[ $? -ne 0 ]]; then
 	log_info "Could not test offloaded functionality"
 else
 	tcflags="skip_sw"
-	match_dst_mac_test
-	match_src_mac_test
-	match_dst_ip_test
-	match_src_ip_test
-	match_ip_flags_test
+	tests_run
 fi
 
 exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/tc_shblocks.sh b/tools/testing/selftests/net/forwarding/tc_shblocks.sh
index 077b98048ef4..b5b917203815 100755
--- a/tools/testing/selftests/net/forwarding/tc_shblocks.sh
+++ b/tools/testing/selftests/net/forwarding/tc_shblocks.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
+ALL_TESTS="shared_block_test"
 NUM_NETIFS=4
 source tc_common.sh
 source lib.sh
@@ -109,14 +110,14 @@ trap cleanup EXIT
 setup_prepare
 setup_wait
 
-shared_block_test
+tests_run
 
 tc_offload_check
 if [[ $? -ne 0 ]]; then
 	log_info "Could not test offloaded functionality"
 else
 	tcflags="skip_sw"
-	shared_block_test
+	tests_run
 fi
 
 exit $EXIT_STATUS
-- 
cgit v1.2.3-59-g8ed1b


From f5ae57784ba805d0e8b11db1a7b2c17c5a07dd70 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 3 May 2018 12:36:59 +0200
Subject: selftests: forwarding: lib: Add sysctl_set(), sysctl_restore()

Add two helper functions: sysctl_set() to change the value of a given
sysctl setting, and sysctl_restore() to change it back to what it was.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/forwarding/lib.sh | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index 061c87bbf77c..e49e78b5cd3d 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -381,6 +381,23 @@ bridge_ageing_time_get()
 	echo $((ageing_time / 100))
 }
 
+declare -A SYSCTL_ORIG
+sysctl_set()
+{
+	local key=$1; shift
+	local value=$1; shift
+
+	SYSCTL_ORIG[$key]=$(sysctl -n $key)
+	sysctl -qw $key=$value
+}
+
+sysctl_restore()
+{
+	local key=$1; shift
+
+	sysctl -qw $key=${SYSCTL_ORIG["$key"]}
+}
+
 forwarding_enable()
 {
        ipv4_fwd=$(sysctl -n net.ipv4.conf.all.forwarding)
-- 
cgit v1.2.3-59-g8ed1b


From d51d10aa1dbdedd9254874bf37e8b0cd96ea3bde Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 3 May 2018 12:37:13 +0200
Subject: selftests: forwarding: Use sysctl_set(), sysctl_restore()

Instead of hand-managing the sysctl set and restore, use the wrappers
sysctl_set() and sysctl_restore() to do the bookkeeping automatically.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/forwarding/lib.sh                | 11 ++++-------
 tools/testing/selftests/net/forwarding/mirror_gre_changes.sh |  7 ++-----
 tools/testing/selftests/net/forwarding/router_multipath.sh   | 12 ++++--------
 3 files changed, 10 insertions(+), 20 deletions(-)

diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index e49e78b5cd3d..91041c49655b 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -400,17 +400,14 @@ sysctl_restore()
 
 forwarding_enable()
 {
-       ipv4_fwd=$(sysctl -n net.ipv4.conf.all.forwarding)
-       ipv6_fwd=$(sysctl -n net.ipv6.conf.all.forwarding)
-
-       sysctl -q -w net.ipv4.conf.all.forwarding=1
-       sysctl -q -w net.ipv6.conf.all.forwarding=1
+	sysctl_set net.ipv4.conf.all.forwarding 1
+	sysctl_set net.ipv6.conf.all.forwarding 1
 }
 
 forwarding_restore()
 {
-       sysctl -q -w net.ipv6.conf.all.forwarding=$ipv6_fwd
-       sysctl -q -w net.ipv4.conf.all.forwarding=$ipv4_fwd
+	sysctl_restore net.ipv6.conf.all.forwarding
+	sysctl_restore net.ipv4.conf.all.forwarding
 }
 
 tc_offload_check()
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh b/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh
index fdb612f69613..50ab3462af0c 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh
@@ -36,9 +36,7 @@ setup_prepare()
 
 	# This test downs $swp3, which deletes the configured IPv6 address
 	# unless this sysctl is set.
-	local key=net.ipv6.conf.$swp3.keep_addr_on_down
-	SWP3_KEEP_ADDR_ON_DOWN=$(sysctl -n $key)
-	sysctl -qw $key=1
+	sysctl_set net.ipv6.conf.$swp3.keep_addr_on_down 1
 
 	ip address add dev $swp3 192.0.2.129/28
 	ip address add dev $h3 192.0.2.130/28
@@ -57,8 +55,7 @@ cleanup()
 	ip address del dev $h3 192.0.2.130/28
 	ip address del dev $swp3 192.0.2.129/28
 
-	local key=net.ipv6.conf.$swp3.keep_addr_on_down
-	sysctl -qw $key=$SWP3_KEEP_ADDR_ON_DOWN
+	sysctl_restore net.ipv6.conf.$swp3.keep_addr_on_down
 
 	mirror_gre_topo_destroy
 	vrf_cleanup
diff --git a/tools/testing/selftests/net/forwarding/router_multipath.sh b/tools/testing/selftests/net/forwarding/router_multipath.sh
index 6c4376289695..8b6d0fb6d604 100755
--- a/tools/testing/selftests/net/forwarding/router_multipath.sh
+++ b/tools/testing/selftests/net/forwarding/router_multipath.sh
@@ -205,13 +205,11 @@ multipath4_test()
        local weight_rp13=$3
        local t0_rp12 t0_rp13 t1_rp12 t1_rp13
        local packets_rp12 packets_rp13
-       local hash_policy
 
        # Transmit multiple flows from h1 to h2 and make sure they are
        # distributed between both multipath links (rp12 and rp13)
        # according to the configured weights.
-       hash_policy=$(sysctl -n net.ipv4.fib_multipath_hash_policy)
-       sysctl -q -w net.ipv4.fib_multipath_hash_policy=1
+       sysctl_set net.ipv4.fib_multipath_hash_policy 1
        ip route replace 198.51.100.0/24 vrf vrf-r1 \
                nexthop via 169.254.2.22 dev $rp12 weight $weight_rp12 \
                nexthop via 169.254.3.23 dev $rp13 weight $weight_rp13
@@ -233,7 +231,7 @@ multipath4_test()
        ip route replace 198.51.100.0/24 vrf vrf-r1 \
                nexthop via 169.254.2.22 dev $rp12 \
                nexthop via 169.254.3.23 dev $rp13
-       sysctl -q -w net.ipv4.fib_multipath_hash_policy=$hash_policy
+       sysctl_restore net.ipv4.fib_multipath_hash_policy
 }
 
 multipath6_l4_test()
@@ -243,13 +241,11 @@ multipath6_l4_test()
        local weight_rp13=$3
        local t0_rp12 t0_rp13 t1_rp12 t1_rp13
        local packets_rp12 packets_rp13
-       local hash_policy
 
        # Transmit multiple flows from h1 to h2 and make sure they are
        # distributed between both multipath links (rp12 and rp13)
        # according to the configured weights.
-       hash_policy=$(sysctl -n net.ipv6.fib_multipath_hash_policy)
-       sysctl -q -w net.ipv6.fib_multipath_hash_policy=1
+       sysctl_set net.ipv6.fib_multipath_hash_policy 1
 
        ip route replace 2001:db8:2::/64 vrf vrf-r1 \
 	       nexthop via fe80:2::22 dev $rp12 weight $weight_rp12 \
@@ -272,7 +268,7 @@ multipath6_l4_test()
 	       nexthop via fe80:2::22 dev $rp12 \
 	       nexthop via fe80:3::23 dev $rp13
 
-       sysctl -q -w net.ipv6.fib_multipath_hash_policy=$hash_policy
+       sysctl_restore net.ipv6.fib_multipath_hash_policy
 }
 
 multipath6_test()
-- 
cgit v1.2.3-59-g8ed1b


From 7eaaf0bc52127401f7f39c9f7950f3f624763c90 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 3 May 2018 12:37:21 +0200
Subject: selftests: forwarding: mirror_gre_nh: Unset RP filter

The test fails to work if reverse-path filtering is in effect on the
mirrored-to host interface, or for all interfaces.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/forwarding/mirror_gre_nh.sh | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_nh.sh b/tools/testing/selftests/net/forwarding/mirror_gre_nh.sh
index a0d1ad46a2bc..8fa681eb90e7 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_nh.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_nh.sh
@@ -29,6 +29,9 @@ setup_prepare()
 	swp3=${NETIFS[p5]}
 	h3=${NETIFS[p6]}
 
+	sysctl_set net.ipv4.conf.all.rp_filter 0
+	sysctl_set net.ipv4.conf.$h3.rp_filter 0
+
 	vrf_prepare
 	mirror_gre_topo_create
 
@@ -60,6 +63,9 @@ cleanup()
 
 	mirror_gre_topo_destroy
 	vrf_cleanup
+
+	sysctl_restore net.ipv4.conf.$h3.rp_filter
+	sysctl_restore net.ipv4.conf.all.rp_filter
 }
 
 test_gretap()
-- 
cgit v1.2.3-59-g8ed1b


From faa1cd8298439cf56f7fd2d8647726b30c263bf0 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Thu, 3 May 2018 13:47:24 +0300
Subject: net: bridge: avoid duplicate notification on up/down/change netdev
 events

While handling netdevice events, br_device_event() sometimes uses
br_stp_(disable|enable)_port which unconditionally send a notification,
but then a second notification for the same event is sent at the end of
the br_device_event() function. To avoid sending duplicate notifications
in such cases, check if one has already been sent (i.e.
br_stp_enable/disable_port have been called).
The patch is based on a change by Satish Ashok.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br.c         | 12 ++++++++----
 net/bridge/br_if.c      | 11 ++++++++---
 net/bridge/br_private.h |  2 +-
 3 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/net/bridge/br.c b/net/bridge/br.c
index 671d13c10f6f..2ca035054664 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -34,6 +34,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 	struct net_bridge_port *p;
 	struct net_bridge *br;
+	bool notified = false;
 	bool changed_addr;
 	int err;
 
@@ -67,7 +68,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
 		break;
 
 	case NETDEV_CHANGE:
-		br_port_carrier_check(p);
+		br_port_carrier_check(p, &notified);
 		break;
 
 	case NETDEV_FEAT_CHANGE:
@@ -76,8 +77,10 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
 
 	case NETDEV_DOWN:
 		spin_lock_bh(&br->lock);
-		if (br->dev->flags & IFF_UP)
+		if (br->dev->flags & IFF_UP) {
 			br_stp_disable_port(p);
+			notified = true;
+		}
 		spin_unlock_bh(&br->lock);
 		break;
 
@@ -85,6 +88,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
 		if (netif_running(br->dev) && netif_oper_up(dev)) {
 			spin_lock_bh(&br->lock);
 			br_stp_enable_port(p);
+			notified = true;
 			spin_unlock_bh(&br->lock);
 		}
 		break;
@@ -110,8 +114,8 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
 	}
 
 	/* Events that may cause spanning tree to refresh */
-	if (event == NETDEV_CHANGEADDR || event == NETDEV_UP ||
-	    event == NETDEV_CHANGE || event == NETDEV_DOWN)
+	if (!notified && (event == NETDEV_CHANGEADDR || event == NETDEV_UP ||
+			  event == NETDEV_CHANGE || event == NETDEV_DOWN))
 		br_ifinfo_notify(RTM_NEWLINK, NULL, p);
 
 	return NOTIFY_DONE;
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 82c1a6f430b3..e3a8ea1bcbe2 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -64,7 +64,7 @@ static int port_cost(struct net_device *dev)
 
 
 /* Check for port carrier transitions. */
-void br_port_carrier_check(struct net_bridge_port *p)
+void br_port_carrier_check(struct net_bridge_port *p, bool *notified)
 {
 	struct net_device *dev = p->dev;
 	struct net_bridge *br = p->br;
@@ -73,16 +73,21 @@ void br_port_carrier_check(struct net_bridge_port *p)
 	    netif_running(dev) && netif_oper_up(dev))
 		p->path_cost = port_cost(dev);
 
+	*notified = false;
 	if (!netif_running(br->dev))
 		return;
 
 	spin_lock_bh(&br->lock);
 	if (netif_running(dev) && netif_oper_up(dev)) {
-		if (p->state == BR_STATE_DISABLED)
+		if (p->state == BR_STATE_DISABLED) {
 			br_stp_enable_port(p);
+			*notified = true;
+		}
 	} else {
-		if (p->state != BR_STATE_DISABLED)
+		if (p->state != BR_STATE_DISABLED) {
 			br_stp_disable_port(p);
+			*notified = true;
+		}
 	}
 	spin_unlock_bh(&br->lock);
 }
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 1a5093115534..0ddeeea2c6a7 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -573,7 +573,7 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
 	      enum br_pkt_type pkt_type, bool local_rcv, bool local_orig);
 
 /* br_if.c */
-void br_port_carrier_check(struct net_bridge_port *p);
+void br_port_carrier_check(struct net_bridge_port *p, bool *notified);
 int br_add_bridge(struct net *net, const char *name);
 int br_del_bridge(struct net *net, const char *name);
 int br_add_if(struct net_bridge *br, struct net_device *dev,
-- 
cgit v1.2.3-59-g8ed1b


From 9b934a3bdc8778e9cd92f65d0d08c17bb005091e Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 3 May 2018 14:59:39 +0300
Subject: mlxsw: resources: Add CQE versions resources

Add resources that FW uses to report supported CQE versions.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/resources.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/resources.h b/drivers/net/ethernet/mellanox/mlxsw/resources.h
index 087aad52c195..fd9299ccec72 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/resources.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/resources.h
@@ -43,6 +43,9 @@ enum mlxsw_res_id {
 	MLXSW_RES_ID_KVD_SINGLE_MIN_SIZE,
 	MLXSW_RES_ID_KVD_DOUBLE_MIN_SIZE,
 	MLXSW_RES_ID_MAX_TRAP_GROUPS,
+	MLXSW_RES_ID_CQE_V0,
+	MLXSW_RES_ID_CQE_V1,
+	MLXSW_RES_ID_CQE_V2,
 	MLXSW_RES_ID_COUNTER_POOL_SIZE,
 	MLXSW_RES_ID_MAX_SPAN,
 	MLXSW_RES_ID_COUNTER_SIZE_PACKETS_BYTES,
@@ -81,6 +84,9 @@ static u16 mlxsw_res_ids[] = {
 	[MLXSW_RES_ID_KVD_SINGLE_MIN_SIZE] = 0x1002,
 	[MLXSW_RES_ID_KVD_DOUBLE_MIN_SIZE] = 0x1003,
 	[MLXSW_RES_ID_MAX_TRAP_GROUPS] = 0x2201,
+	[MLXSW_RES_ID_CQE_V0] = 0x2210,
+	[MLXSW_RES_ID_CQE_V1] = 0x2211,
+	[MLXSW_RES_ID_CQE_V2] = 0x2212,
 	[MLXSW_RES_ID_COUNTER_POOL_SIZE] = 0x2410,
 	[MLXSW_RES_ID_MAX_SPAN] = 0x2420,
 	[MLXSW_RES_ID_COUNTER_SIZE_PACKETS_BYTES] = 0x2443,
-- 
cgit v1.2.3-59-g8ed1b


From b76550bbed9f9aa10b01b89698f6a5c55509b070 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 3 May 2018 14:59:40 +0300
Subject: mlxsw: pci: Introduce helpers to work with multiple CQE versions

Introduce definitions of fields in CQE version 1 and 2. Also, introduce
common helpers that would call appropriate version-specific helpers
according to the version enum passed.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/pci.c    | 72 +++++++++++++++------------
 drivers/net/ethernet/mellanox/mlxsw/pci_hw.h | 74 ++++++++++++++++++++++++----
 2 files changed, 104 insertions(+), 42 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index 3a9381977d6d..24686ba45729 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -117,6 +117,7 @@ struct mlxsw_pci_queue {
 		struct {
 			u32 comp_sdq_count;
 			u32 comp_rdq_count;
+			enum mlxsw_pci_cqe_v v;
 		} cq;
 		struct {
 			u32 ev_cmd_count;
@@ -202,24 +203,6 @@ static bool mlxsw_pci_elem_hw_owned(struct mlxsw_pci_queue *q, bool owner_bit)
 	return owner_bit != !!(q->consumer_counter & q->count);
 }
 
-static char *
-mlxsw_pci_queue_sw_elem_get(struct mlxsw_pci_queue *q,
-			    u32 (*get_elem_owner_func)(const char *))
-{
-	struct mlxsw_pci_queue_elem_info *elem_info;
-	char *elem;
-	bool owner_bit;
-
-	elem_info = mlxsw_pci_queue_elem_info_consumer_get(q);
-	elem = elem_info->elem;
-	owner_bit = get_elem_owner_func(elem);
-	if (mlxsw_pci_elem_hw_owned(q, owner_bit))
-		return NULL;
-	q->consumer_counter++;
-	rmb(); /* make sure we read owned bit before the rest of elem */
-	return elem;
-}
-
 static struct mlxsw_pci_queue_type_group *
 mlxsw_pci_queue_type_group_get(struct mlxsw_pci *mlxsw_pci,
 			       enum mlxsw_pci_queue_type q_type)
@@ -505,7 +488,7 @@ static int mlxsw_pci_cq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
 	for (i = 0; i < q->count; i++) {
 		char *elem = mlxsw_pci_queue_elem_get(q, i);
 
-		mlxsw_pci_cqe_owner_set(elem, 1);
+		mlxsw_pci_cqe_owner_set(q->u.cq.v, elem, 1);
 	}
 
 	mlxsw_cmd_mbox_sw2hw_cq_cv_set(mbox, 0); /* CQE ver 0 */
@@ -559,7 +542,7 @@ static void mlxsw_pci_cqe_sdq_handle(struct mlxsw_pci *mlxsw_pci,
 static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci,
 				     struct mlxsw_pci_queue *q,
 				     u16 consumer_counter_limit,
-				     char *cqe)
+				     enum mlxsw_pci_cqe_v cqe_v, char *cqe)
 {
 	struct pci_dev *pdev = mlxsw_pci->pdev;
 	struct mlxsw_pci_queue_elem_info *elem_info;
@@ -579,10 +562,11 @@ static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci,
 	if (q->consumer_counter++ != consumer_counter_limit)
 		dev_dbg_ratelimited(&pdev->dev, "Consumer counter does not match limit in RDQ\n");
 
-	if (mlxsw_pci_cqe_lag_get(cqe)) {
+	if (mlxsw_pci_cqe_lag_get(cqe_v, cqe)) {
 		rx_info.is_lag = true;
-		rx_info.u.lag_id = mlxsw_pci_cqe_lag_id_get(cqe);
-		rx_info.lag_port_index = mlxsw_pci_cqe_lag_port_index_get(cqe);
+		rx_info.u.lag_id = mlxsw_pci_cqe_lag_id_get(cqe_v, cqe);
+		rx_info.lag_port_index =
+			mlxsw_pci_cqe_lag_subport_get(cqe_v, cqe);
 	} else {
 		rx_info.is_lag = false;
 		rx_info.u.sys_port = mlxsw_pci_cqe_system_port_get(cqe);
@@ -591,7 +575,7 @@ static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci,
 	rx_info.trap_id = mlxsw_pci_cqe_trap_id_get(cqe);
 
 	byte_count = mlxsw_pci_cqe_byte_count_get(cqe);
-	if (mlxsw_pci_cqe_crc_get(cqe))
+	if (mlxsw_pci_cqe_crc_get(cqe_v, cqe))
 		byte_count -= ETH_FCS_LEN;
 	skb_put(skb, byte_count);
 	mlxsw_core_skb_receive(mlxsw_pci->core, skb, &rx_info);
@@ -608,7 +592,18 @@ static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci,
 
 static char *mlxsw_pci_cq_sw_cqe_get(struct mlxsw_pci_queue *q)
 {
-	return mlxsw_pci_queue_sw_elem_get(q, mlxsw_pci_cqe_owner_get);
+	struct mlxsw_pci_queue_elem_info *elem_info;
+	char *elem;
+	bool owner_bit;
+
+	elem_info = mlxsw_pci_queue_elem_info_consumer_get(q);
+	elem = elem_info->elem;
+	owner_bit = mlxsw_pci_cqe_owner_get(q->u.cq.v, elem);
+	if (mlxsw_pci_elem_hw_owned(q, owner_bit))
+		return NULL;
+	q->consumer_counter++;
+	rmb(); /* make sure we read owned bit before the rest of elem */
+	return elem;
 }
 
 static void mlxsw_pci_cq_tasklet(unsigned long data)
@@ -621,8 +616,8 @@ static void mlxsw_pci_cq_tasklet(unsigned long data)
 
 	while ((cqe = mlxsw_pci_cq_sw_cqe_get(q))) {
 		u16 wqe_counter = mlxsw_pci_cqe_wqe_counter_get(cqe);
-		u8 sendq = mlxsw_pci_cqe_sr_get(cqe);
-		u8 dqn = mlxsw_pci_cqe_dqn_get(cqe);
+		u8 sendq = mlxsw_pci_cqe_sr_get(q->u.cq.v, cqe);
+		u8 dqn = mlxsw_pci_cqe_dqn_get(q->u.cq.v, cqe);
 
 		if (sendq) {
 			struct mlxsw_pci_queue *sdq;
@@ -636,7 +631,7 @@ static void mlxsw_pci_cq_tasklet(unsigned long data)
 
 			rdq = mlxsw_pci_rdq_get(mlxsw_pci, dqn);
 			mlxsw_pci_cqe_rdq_handle(mlxsw_pci, rdq,
-						 wqe_counter, cqe);
+						 wqe_counter, q->u.cq.v, cqe);
 			q->u.cq.comp_rdq_count++;
 		}
 		if (++items == credits)
@@ -696,7 +691,18 @@ static void mlxsw_pci_eq_cmd_event(struct mlxsw_pci *mlxsw_pci, char *eqe)
 
 static char *mlxsw_pci_eq_sw_eqe_get(struct mlxsw_pci_queue *q)
 {
-	return mlxsw_pci_queue_sw_elem_get(q, mlxsw_pci_eqe_owner_get);
+	struct mlxsw_pci_queue_elem_info *elem_info;
+	char *elem;
+	bool owner_bit;
+
+	elem_info = mlxsw_pci_queue_elem_info_consumer_get(q);
+	elem = elem_info->elem;
+	owner_bit = mlxsw_pci_eqe_owner_get(elem);
+	if (mlxsw_pci_elem_hw_owned(q, owner_bit))
+		return NULL;
+	q->consumer_counter++;
+	rmb(); /* make sure we read owned bit before the rest of elem */
+	return elem;
 }
 
 static void mlxsw_pci_eq_tasklet(unsigned long data)
@@ -779,8 +785,8 @@ static const struct mlxsw_pci_queue_ops mlxsw_pci_cq_ops = {
 	.init		= mlxsw_pci_cq_init,
 	.fini		= mlxsw_pci_cq_fini,
 	.tasklet	= mlxsw_pci_cq_tasklet,
-	.elem_count	= MLXSW_PCI_CQE_COUNT,
-	.elem_size	= MLXSW_PCI_CQE_SIZE
+	.elem_count	= MLXSW_PCI_CQE01_COUNT,
+	.elem_size	= MLXSW_PCI_CQE01_SIZE
 };
 
 static const struct mlxsw_pci_queue_ops mlxsw_pci_eq_ops = {
@@ -800,6 +806,8 @@ static int mlxsw_pci_queue_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
 	int i;
 	int err;
 
+	q->u.cq.v = MLXSW_PCI_CQE_V0;
+
 	spin_lock_init(&q->lock);
 	q->num = q_num;
 	q->count = q_ops->elem_count;
@@ -938,7 +946,7 @@ static int mlxsw_pci_aqs_init(struct mlxsw_pci *mlxsw_pci, char *mbox)
 
 	if ((1 << sdq_log2sz != MLXSW_PCI_WQE_COUNT) ||
 	    (1 << rdq_log2sz != MLXSW_PCI_WQE_COUNT) ||
-	    (1 << cq_log2sz != MLXSW_PCI_CQE_COUNT) ||
+	    (1 << cq_log2sz != MLXSW_PCI_CQE01_COUNT) ||
 	    (1 << eq_log2sz != MLXSW_PCI_EQE_COUNT)) {
 		dev_err(&pdev->dev, "Unsupported number of async queue descriptors\n");
 		return -EINVAL;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h b/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h
index fb082ad21b00..963155f6a17a 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h
@@ -82,10 +82,12 @@
 #define MLXSW_PCI_AQ_PAGES	8
 #define MLXSW_PCI_AQ_SIZE	(MLXSW_PCI_PAGE_SIZE * MLXSW_PCI_AQ_PAGES)
 #define MLXSW_PCI_WQE_SIZE	32 /* 32 bytes per element */
-#define MLXSW_PCI_CQE_SIZE	16 /* 16 bytes per element */
+#define MLXSW_PCI_CQE01_SIZE	16 /* 16 bytes per element */
+#define MLXSW_PCI_CQE2_SIZE	32 /* 32 bytes per element */
 #define MLXSW_PCI_EQE_SIZE	16 /* 16 bytes per element */
 #define MLXSW_PCI_WQE_COUNT	(MLXSW_PCI_AQ_SIZE / MLXSW_PCI_WQE_SIZE)
-#define MLXSW_PCI_CQE_COUNT	(MLXSW_PCI_AQ_SIZE / MLXSW_PCI_CQE_SIZE)
+#define MLXSW_PCI_CQE01_COUNT	(MLXSW_PCI_AQ_SIZE / MLXSW_PCI_CQE01_SIZE)
+#define MLXSW_PCI_CQE2_COUNT	(MLXSW_PCI_AQ_SIZE / MLXSW_PCI_CQE2_SIZE)
 #define MLXSW_PCI_EQE_COUNT	(MLXSW_PCI_AQ_SIZE / MLXSW_PCI_EQE_SIZE)
 #define MLXSW_PCI_EQE_UPDATE_COUNT	0x80
 
@@ -126,10 +128,48 @@ MLXSW_ITEM16_INDEXED(pci, wqe, byte_count, 0x02, 0, 14, 0x02, 0x00, false);
  */
 MLXSW_ITEM64_INDEXED(pci, wqe, address, 0x08, 0, 64, 0x8, 0x0, false);
 
+enum mlxsw_pci_cqe_v {
+	MLXSW_PCI_CQE_V0,
+	MLXSW_PCI_CQE_V1,
+	MLXSW_PCI_CQE_V2,
+};
+
+#define mlxsw_pci_cqe_item_helpers(name, v0, v1, v2)				\
+static inline u32 mlxsw_pci_cqe_##name##_get(enum mlxsw_pci_cqe_v v, char *cqe)	\
+{										\
+	switch (v) {								\
+	default:								\
+	case MLXSW_PCI_CQE_V0:							\
+		return mlxsw_pci_cqe##v0##_##name##_get(cqe);			\
+	case MLXSW_PCI_CQE_V1:							\
+		return mlxsw_pci_cqe##v1##_##name##_get(cqe);			\
+	case MLXSW_PCI_CQE_V2:							\
+		return mlxsw_pci_cqe##v2##_##name##_get(cqe);			\
+	}									\
+}										\
+static inline void mlxsw_pci_cqe_##name##_set(enum mlxsw_pci_cqe_v v,		\
+					      char *cqe, u32 val)		\
+{										\
+	switch (v) {								\
+	default:								\
+	case MLXSW_PCI_CQE_V0:							\
+		mlxsw_pci_cqe##v0##_##name##_set(cqe, val);			\
+		break;								\
+	case MLXSW_PCI_CQE_V1:							\
+		mlxsw_pci_cqe##v1##_##name##_set(cqe, val);			\
+		break;								\
+	case MLXSW_PCI_CQE_V2:							\
+		mlxsw_pci_cqe##v2##_##name##_set(cqe, val);			\
+		break;								\
+	}									\
+}
+
 /* pci_cqe_lag
  * Packet arrives from a port which is a LAG
  */
-MLXSW_ITEM32(pci, cqe, lag, 0x00, 23, 1);
+MLXSW_ITEM32(pci, cqe0, lag, 0x00, 23, 1);
+MLXSW_ITEM32(pci, cqe12, lag, 0x00, 24, 1);
+mlxsw_pci_cqe_item_helpers(lag, 0, 12, 12);
 
 /* pci_cqe_system_port/lag_id
  * When lag=0: System port on which the packet was received
@@ -138,8 +178,12 @@ MLXSW_ITEM32(pci, cqe, lag, 0x00, 23, 1);
  * bits [3:0] sub_port on which the packet was received
  */
 MLXSW_ITEM32(pci, cqe, system_port, 0x00, 0, 16);
-MLXSW_ITEM32(pci, cqe, lag_id, 0x00, 4, 12);
-MLXSW_ITEM32(pci, cqe, lag_port_index, 0x00, 0, 4);
+MLXSW_ITEM32(pci, cqe0, lag_id, 0x00, 4, 12);
+MLXSW_ITEM32(pci, cqe12, lag_id, 0x00, 0, 16);
+mlxsw_pci_cqe_item_helpers(lag_id, 0, 12, 12);
+MLXSW_ITEM32(pci, cqe0, lag_subport, 0x00, 0, 4);
+MLXSW_ITEM32(pci, cqe12, lag_subport, 0x00, 16, 8);
+mlxsw_pci_cqe_item_helpers(lag_subport, 0, 12, 12);
 
 /* pci_cqe_wqe_counter
  * WQE count of the WQEs completed on the associated dqn
@@ -162,28 +206,38 @@ MLXSW_ITEM32(pci, cqe, trap_id, 0x08, 0, 9);
  * Length include CRC. Indicates the length field includes
  * the packet's CRC.
  */
-MLXSW_ITEM32(pci, cqe, crc, 0x0C, 8, 1);
+MLXSW_ITEM32(pci, cqe0, crc, 0x0C, 8, 1);
+MLXSW_ITEM32(pci, cqe12, crc, 0x0C, 9, 1);
+mlxsw_pci_cqe_item_helpers(crc, 0, 12, 12);
 
 /* pci_cqe_e
  * CQE with Error.
  */
-MLXSW_ITEM32(pci, cqe, e, 0x0C, 7, 1);
+MLXSW_ITEM32(pci, cqe0, e, 0x0C, 7, 1);
+MLXSW_ITEM32(pci, cqe12, e, 0x00, 27, 1);
+mlxsw_pci_cqe_item_helpers(e, 0, 12, 12);
 
 /* pci_cqe_sr
  * 1 - Send Queue
  * 0 - Receive Queue
  */
-MLXSW_ITEM32(pci, cqe, sr, 0x0C, 6, 1);
+MLXSW_ITEM32(pci, cqe0, sr, 0x0C, 6, 1);
+MLXSW_ITEM32(pci, cqe12, sr, 0x00, 26, 1);
+mlxsw_pci_cqe_item_helpers(sr, 0, 12, 12);
 
 /* pci_cqe_dqn
  * Descriptor Queue (DQ) Number.
  */
-MLXSW_ITEM32(pci, cqe, dqn, 0x0C, 1, 5);
+MLXSW_ITEM32(pci, cqe0, dqn, 0x0C, 1, 5);
+MLXSW_ITEM32(pci, cqe12, dqn, 0x0C, 1, 6);
+mlxsw_pci_cqe_item_helpers(dqn, 0, 12, 12);
 
 /* pci_cqe_owner
  * Ownership bit.
  */
-MLXSW_ITEM32(pci, cqe, owner, 0x0C, 0, 1);
+MLXSW_ITEM32(pci, cqe01, owner, 0x0C, 0, 1);
+MLXSW_ITEM32(pci, cqe2, owner, 0x1C, 0, 1);
+mlxsw_pci_cqe_item_helpers(owner, 01, 01, 2);
 
 /* pci_eqe_event_type
  * Event type.
-- 
cgit v1.2.3-59-g8ed1b


From 8404f6f2e8ed2bd667c0c95ee40561b63e0800f9 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 3 May 2018 14:59:41 +0300
Subject: mlxsw: pci: Allow to use CQEs of version 1 and version 2

Use previously added resources to query FW support for multiple versions
of CQEs. Use the biggest version supported. For SDQs, it has no sense to
use version 2 as it does not introduce any new features, but it is
twice the size of CQE version 1.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/cmd.h | 24 ++++++++--
 drivers/net/ethernet/mellanox/mlxsw/pci.c | 78 +++++++++++++++++++++++++++----
 2 files changed, 91 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/cmd.h b/drivers/net/ethernet/mellanox/mlxsw/cmd.h
index 479511cf79bc..669226b0759c 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/cmd.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/cmd.h
@@ -662,6 +662,12 @@ MLXSW_ITEM32(cmd_mbox, config_profile, set_kvd_hash_single_size, 0x0C, 25, 1);
  */
 MLXSW_ITEM32(cmd_mbox, config_profile, set_kvd_hash_double_size, 0x0C, 26, 1);
 
+/* cmd_mbox_config_set_cqe_version
+ * Capability bit. Setting a bit to 1 configures the profile
+ * according to the mailbox contents.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, set_cqe_version, 0x08, 0, 1);
+
 /* cmd_mbox_config_profile_max_vepa_channels
  * Maximum number of VEPA channels per port (0 through 16)
  * 0 - multi-channel VEPA is disabled
@@ -841,6 +847,14 @@ MLXSW_ITEM32_INDEXED(cmd_mbox, config_profile, swid_config_type,
 MLXSW_ITEM32_INDEXED(cmd_mbox, config_profile, swid_config_properties,
 		     0x60, 0, 8, 0x08, 0x00, false);
 
+/* cmd_mbox_config_profile_cqe_version
+ * CQE version:
+ * 0: CQE version is 0
+ * 1: CQE version is either 1 or 2
+ * CQE ver 1 or 2 is configured by Completion Queue Context field cqe_ver.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, cqe_version, 0xB0, 0, 8);
+
 /* ACCESS_REG - Access EMAD Supported Register
  * ----------------------------------
  * OpMod == 0 (N/A), INMmod == 0 (N/A)
@@ -1032,11 +1046,15 @@ static inline int mlxsw_cmd_sw2hw_cq(struct mlxsw_core *mlxsw_core,
 				 0, cq_number, in_mbox, MLXSW_CMD_MBOX_SIZE);
 }
 
-/* cmd_mbox_sw2hw_cq_cv
+enum mlxsw_cmd_mbox_sw2hw_cq_cqe_ver {
+	MLXSW_CMD_MBOX_SW2HW_CQ_CQE_VER_1,
+	MLXSW_CMD_MBOX_SW2HW_CQ_CQE_VER_2,
+};
+
+/* cmd_mbox_sw2hw_cq_cqe_ver
  * CQE Version.
- * 0 - CQE Version 0, 1 - CQE Version 1
  */
-MLXSW_ITEM32(cmd_mbox, sw2hw_cq, cv, 0x00, 28, 4);
+MLXSW_ITEM32(cmd_mbox, sw2hw_cq, cqe_ver, 0x00, 28, 4);
 
 /* cmd_mbox_sw2hw_cq_c_eqn
  * Event Queue this CQ reports completion events to.
diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index 24686ba45729..e9ce0e27aa9c 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -156,6 +156,8 @@ struct mlxsw_pci {
 	} cmd;
 	struct mlxsw_bus_info bus_info;
 	const struct pci_device_id *id;
+	enum mlxsw_pci_cqe_v max_cqe_ver; /* Maximal supported CQE version */
+	u8 num_sdq_cqs; /* Number of CQs used for SDQs */
 };
 
 static void mlxsw_pci_queue_tasklet_schedule(struct mlxsw_pci_queue *q)
@@ -477,6 +479,17 @@ static void mlxsw_pci_rdq_fini(struct mlxsw_pci *mlxsw_pci,
 	}
 }
 
+static void mlxsw_pci_cq_pre_init(struct mlxsw_pci *mlxsw_pci,
+				  struct mlxsw_pci_queue *q)
+{
+	q->u.cq.v = mlxsw_pci->max_cqe_ver;
+
+	/* For SDQ it is pointless to use CQEv2, so use CQEv1 instead */
+	if (q->u.cq.v == MLXSW_PCI_CQE_V2 &&
+	    q->num < mlxsw_pci->num_sdq_cqs)
+		q->u.cq.v = MLXSW_PCI_CQE_V1;
+}
+
 static int mlxsw_pci_cq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
 			     struct mlxsw_pci_queue *q)
 {
@@ -491,7 +504,13 @@ static int mlxsw_pci_cq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
 		mlxsw_pci_cqe_owner_set(q->u.cq.v, elem, 1);
 	}
 
-	mlxsw_cmd_mbox_sw2hw_cq_cv_set(mbox, 0); /* CQE ver 0 */
+	if (q->u.cq.v == MLXSW_PCI_CQE_V1)
+		mlxsw_cmd_mbox_sw2hw_cq_cqe_ver_set(mbox,
+				MLXSW_CMD_MBOX_SW2HW_CQ_CQE_VER_1);
+	else if (q->u.cq.v == MLXSW_PCI_CQE_V2)
+		mlxsw_cmd_mbox_sw2hw_cq_cqe_ver_set(mbox,
+				MLXSW_CMD_MBOX_SW2HW_CQ_CQE_VER_2);
+
 	mlxsw_cmd_mbox_sw2hw_cq_c_eqn_set(mbox, MLXSW_PCI_EQ_COMP_NUM);
 	mlxsw_cmd_mbox_sw2hw_cq_st_set(mbox, 0);
 	mlxsw_cmd_mbox_sw2hw_cq_log_cq_size_set(mbox, ilog2(q->count));
@@ -643,6 +662,18 @@ static void mlxsw_pci_cq_tasklet(unsigned long data)
 	}
 }
 
+static u16 mlxsw_pci_cq_elem_count(const struct mlxsw_pci_queue *q)
+{
+	return q->u.cq.v == MLXSW_PCI_CQE_V2 ? MLXSW_PCI_CQE2_COUNT :
+					       MLXSW_PCI_CQE01_COUNT;
+}
+
+static u8 mlxsw_pci_cq_elem_size(const struct mlxsw_pci_queue *q)
+{
+	return q->u.cq.v == MLXSW_PCI_CQE_V2 ? MLXSW_PCI_CQE2_SIZE :
+					       MLXSW_PCI_CQE01_SIZE;
+}
+
 static int mlxsw_pci_eq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
 			     struct mlxsw_pci_queue *q)
 {
@@ -755,11 +786,15 @@ static void mlxsw_pci_eq_tasklet(unsigned long data)
 struct mlxsw_pci_queue_ops {
 	const char *name;
 	enum mlxsw_pci_queue_type type;
+	void (*pre_init)(struct mlxsw_pci *mlxsw_pci,
+			 struct mlxsw_pci_queue *q);
 	int (*init)(struct mlxsw_pci *mlxsw_pci, char *mbox,
 		    struct mlxsw_pci_queue *q);
 	void (*fini)(struct mlxsw_pci *mlxsw_pci,
 		     struct mlxsw_pci_queue *q);
 	void (*tasklet)(unsigned long data);
+	u16 (*elem_count_f)(const struct mlxsw_pci_queue *q);
+	u8 (*elem_size_f)(const struct mlxsw_pci_queue *q);
 	u16 elem_count;
 	u8 elem_size;
 };
@@ -782,11 +817,12 @@ static const struct mlxsw_pci_queue_ops mlxsw_pci_rdq_ops = {
 
 static const struct mlxsw_pci_queue_ops mlxsw_pci_cq_ops = {
 	.type		= MLXSW_PCI_QUEUE_TYPE_CQ,
+	.pre_init	= mlxsw_pci_cq_pre_init,
 	.init		= mlxsw_pci_cq_init,
 	.fini		= mlxsw_pci_cq_fini,
 	.tasklet	= mlxsw_pci_cq_tasklet,
-	.elem_count	= MLXSW_PCI_CQE01_COUNT,
-	.elem_size	= MLXSW_PCI_CQE01_SIZE
+	.elem_count_f	= mlxsw_pci_cq_elem_count,
+	.elem_size_f	= mlxsw_pci_cq_elem_size
 };
 
 static const struct mlxsw_pci_queue_ops mlxsw_pci_eq_ops = {
@@ -806,12 +842,15 @@ static int mlxsw_pci_queue_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
 	int i;
 	int err;
 
-	q->u.cq.v = MLXSW_PCI_CQE_V0;
+	q->num = q_num;
+	if (q_ops->pre_init)
+		q_ops->pre_init(mlxsw_pci, q);
 
 	spin_lock_init(&q->lock);
-	q->num = q_num;
-	q->count = q_ops->elem_count;
-	q->elem_size = q_ops->elem_size;
+	q->count = q_ops->elem_count_f ? q_ops->elem_count_f(q) :
+					 q_ops->elem_count;
+	q->elem_size = q_ops->elem_size_f ? q_ops->elem_size_f(q) :
+					    q_ops->elem_size;
 	q->type = q_ops->type;
 	q->pci = mlxsw_pci;
 
@@ -840,7 +879,7 @@ static int mlxsw_pci_queue_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
 
 		elem_info = mlxsw_pci_queue_elem_info_get(q, i);
 		elem_info->elem =
-			__mlxsw_pci_queue_elem_get(q, q_ops->elem_size, i);
+			__mlxsw_pci_queue_elem_get(q, q->elem_size, i);
 	}
 
 	mlxsw_cmd_mbox_zero(mbox);
@@ -952,6 +991,8 @@ static int mlxsw_pci_aqs_init(struct mlxsw_pci *mlxsw_pci, char *mbox)
 		return -EINVAL;
 	}
 
+	mlxsw_pci->num_sdq_cqs = num_sdqs;
+
 	err = mlxsw_pci_queue_group_init(mlxsw_pci, mbox, &mlxsw_pci_eq_ops,
 					 num_eqs);
 	if (err) {
@@ -1192,6 +1233,11 @@ static int mlxsw_pci_config_profile(struct mlxsw_pci *mlxsw_pci, char *mbox,
 		mlxsw_pci_config_profile_swid_config(mlxsw_pci, mbox, i,
 						     &profile->swid_config[i]);
 
+	if (mlxsw_pci->max_cqe_ver > MLXSW_PCI_CQE_V0) {
+		mlxsw_cmd_mbox_config_profile_set_cqe_version_set(mbox, 1);
+		mlxsw_cmd_mbox_config_profile_cqe_version_set(mbox, 1);
+	}
+
 	return mlxsw_cmd_config_profile_set(mlxsw_pci->core, mbox);
 }
 
@@ -1386,6 +1432,21 @@ static int mlxsw_pci_init(void *bus_priv, struct mlxsw_core *mlxsw_core,
 	if (err)
 		goto err_query_resources;
 
+	if (MLXSW_CORE_RES_VALID(mlxsw_core, CQE_V2) &&
+	    MLXSW_CORE_RES_GET(mlxsw_core, CQE_V2))
+		mlxsw_pci->max_cqe_ver = MLXSW_PCI_CQE_V2;
+	else if (MLXSW_CORE_RES_VALID(mlxsw_core, CQE_V1) &&
+		 MLXSW_CORE_RES_GET(mlxsw_core, CQE_V1))
+		mlxsw_pci->max_cqe_ver = MLXSW_PCI_CQE_V1;
+	else if ((MLXSW_CORE_RES_VALID(mlxsw_core, CQE_V0) &&
+		  MLXSW_CORE_RES_GET(mlxsw_core, CQE_V0)) ||
+		 !MLXSW_CORE_RES_VALID(mlxsw_core, CQE_V0)) {
+		mlxsw_pci->max_cqe_ver = MLXSW_PCI_CQE_V0;
+	} else {
+		dev_err(&pdev->dev, "Invalid supported CQE version combination reported\n");
+		goto err_cqe_v_check;
+	}
+
 	err = mlxsw_pci_config_profile(mlxsw_pci, mbox, profile, res);
 	if (err)
 		goto err_config_profile;
@@ -1408,6 +1469,7 @@ err_request_eq_irq:
 	mlxsw_pci_aqs_fini(mlxsw_pci);
 err_aqs_init:
 err_config_profile:
+err_cqe_v_check:
 err_query_resources:
 err_boardinfo:
 	mlxsw_pci_fw_area_fini(mlxsw_pci);
-- 
cgit v1.2.3-59-g8ed1b


From 41107685b9ce3c204be4ba784c3dd6baf9e4bcf4 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 3 May 2018 14:59:42 +0300
Subject: mlxsw: pci: Check number of CQEs for CQE version 2

Check number of CQEs for CQE version 2 reported by QUERY_AQ_CAP command.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/cmd.h | 7 ++++++-
 drivers/net/ethernet/mellanox/mlxsw/pci.c | 4 ++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/cmd.h b/drivers/net/ethernet/mellanox/mlxsw/cmd.h
index 669226b0759c..8da91b023b13 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/cmd.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/cmd.h
@@ -424,10 +424,15 @@ MLXSW_ITEM32(cmd_mbox, query_aq_cap, log_max_rdq_sz, 0x04, 24, 8);
 MLXSW_ITEM32(cmd_mbox, query_aq_cap, max_num_rdqs, 0x04, 0, 8);
 
 /* cmd_mbox_query_aq_cap_log_max_cq_sz
- * Log (base 2) of max CQEs allowed on CQ.
+ * Log (base 2) of the Maximum CQEs allowed in a CQ for CQEv0 and CQEv1.
  */
 MLXSW_ITEM32(cmd_mbox, query_aq_cap, log_max_cq_sz, 0x08, 24, 8);
 
+/* cmd_mbox_query_aq_cap_log_max_cqv2_sz
+ * Log (base 2) of the Maximum CQEs allowed in a CQ for CQEv2.
+ */
+MLXSW_ITEM32(cmd_mbox, query_aq_cap, log_max_cqv2_sz, 0x08, 16, 8);
+
 /* cmd_mbox_query_aq_cap_max_num_cqs
  * Maximum number of CQs.
  */
diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index e9ce0e27aa9c..db794a1a3a7e 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -959,6 +959,7 @@ static int mlxsw_pci_aqs_init(struct mlxsw_pci *mlxsw_pci, char *mbox)
 	u8 rdq_log2sz;
 	u8 num_cqs;
 	u8 cq_log2sz;
+	u8 cqv2_log2sz;
 	u8 num_eqs;
 	u8 eq_log2sz;
 	int err;
@@ -974,6 +975,7 @@ static int mlxsw_pci_aqs_init(struct mlxsw_pci *mlxsw_pci, char *mbox)
 	rdq_log2sz = mlxsw_cmd_mbox_query_aq_cap_log_max_rdq_sz_get(mbox);
 	num_cqs = mlxsw_cmd_mbox_query_aq_cap_max_num_cqs_get(mbox);
 	cq_log2sz = mlxsw_cmd_mbox_query_aq_cap_log_max_cq_sz_get(mbox);
+	cqv2_log2sz = mlxsw_cmd_mbox_query_aq_cap_log_max_cqv2_sz_get(mbox);
 	num_eqs = mlxsw_cmd_mbox_query_aq_cap_max_num_eqs_get(mbox);
 	eq_log2sz = mlxsw_cmd_mbox_query_aq_cap_log_max_eq_sz_get(mbox);
 
@@ -986,6 +988,8 @@ static int mlxsw_pci_aqs_init(struct mlxsw_pci *mlxsw_pci, char *mbox)
 	if ((1 << sdq_log2sz != MLXSW_PCI_WQE_COUNT) ||
 	    (1 << rdq_log2sz != MLXSW_PCI_WQE_COUNT) ||
 	    (1 << cq_log2sz != MLXSW_PCI_CQE01_COUNT) ||
+	    (mlxsw_pci->max_cqe_ver == MLXSW_PCI_CQE_V2 &&
+	     (1 << cqv2_log2sz != MLXSW_PCI_CQE2_COUNT)) ||
 	    (1 << eq_log2sz != MLXSW_PCI_EQE_COUNT)) {
 		dev_err(&pdev->dev, "Unsupported number of async queue descriptors\n");
 		return -EINVAL;
-- 
cgit v1.2.3-59-g8ed1b


From 816a3bed9549fcc0c90ba97c9077e64e734f0df6 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 3 May 2018 14:43:46 +0200
Subject: switchdev: Add fdb.added_by_user to switchdev notifications

The following patch enables sending notifications also for events on FDB
entries that weren't added by the user. Give the drivers the information
necessary to distinguish between the two origins of FDB entries.

To maintain the current behavior, have switchdev-implementing drivers
bail out on notifications about non-user-added FDB entries. In case of
mlxsw driver, allow a call to mlxsw_sp_span_respin() so that SPAN over
bridge catches up with the changed FDB.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Acked-by: Ivan Vecera <ivecera@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c |  4 ++++
 drivers/net/ethernet/rocker/rocker_main.c                |  2 ++
 include/net/switchdev.h                                  |  1 +
 net/bridge/br_switchdev.c                                | 10 +++++++---
 net/dsa/slave.c                                          |  5 ++++-
 5 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index 1af99fe5fd32..3973d90cc908 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -2270,6 +2270,8 @@ static void mlxsw_sp_switchdev_event_work(struct work_struct *work)
 	switch (switchdev_work->event) {
 	case SWITCHDEV_FDB_ADD_TO_DEVICE:
 		fdb_info = &switchdev_work->fdb_info;
+		if (!fdb_info->added_by_user)
+			break;
 		err = mlxsw_sp_port_fdb_set(mlxsw_sp_port, fdb_info, true);
 		if (err)
 			break;
@@ -2279,6 +2281,8 @@ static void mlxsw_sp_switchdev_event_work(struct work_struct *work)
 		break;
 	case SWITCHDEV_FDB_DEL_TO_DEVICE:
 		fdb_info = &switchdev_work->fdb_info;
+		if (!fdb_info->added_by_user)
+			break;
 		mlxsw_sp_port_fdb_set(mlxsw_sp_port, fdb_info, false);
 		break;
 	case SWITCHDEV_FDB_ADD_TO_BRIDGE: /* fall through */
diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c
index 056cb6093630..152d6948611c 100644
--- a/drivers/net/ethernet/rocker/rocker_main.c
+++ b/drivers/net/ethernet/rocker/rocker_main.c
@@ -2783,6 +2783,8 @@ static int rocker_switchdev_event(struct notifier_block *unused,
 	switch (event) {
 	case SWITCHDEV_FDB_ADD_TO_DEVICE: /* fall through */
 	case SWITCHDEV_FDB_DEL_TO_DEVICE:
+		if (!fdb_info->added_by_user)
+			break;
 		memcpy(&switchdev_work->fdb_info, ptr,
 		       sizeof(switchdev_work->fdb_info));
 		switchdev_work->fdb_info.addr = kzalloc(ETH_ALEN, GFP_ATOMIC);
diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 39bc855d7fee..d574ce63bf22 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -155,6 +155,7 @@ struct switchdev_notifier_fdb_info {
 	struct switchdev_notifier_info info; /* must be first */
 	const unsigned char *addr;
 	u16 vid;
+	bool added_by_user;
 };
 
 static inline struct net_device *
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index ee775f4ff76c..71a03c487c4b 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -102,13 +102,15 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p,
 
 static void
 br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac,
-				u16 vid, struct net_device *dev)
+				u16 vid, struct net_device *dev,
+				bool added_by_user)
 {
 	struct switchdev_notifier_fdb_info info;
 	unsigned long notifier_type;
 
 	info.addr = mac;
 	info.vid = vid;
+	info.added_by_user = added_by_user;
 	notifier_type = adding ? SWITCHDEV_FDB_ADD_TO_DEVICE : SWITCHDEV_FDB_DEL_TO_DEVICE;
 	call_switchdev_notifiers(notifier_type, dev, &info.info);
 }
@@ -123,12 +125,14 @@ br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
 	case RTM_DELNEIGH:
 		br_switchdev_fdb_call_notifiers(false, fdb->key.addr.addr,
 						fdb->key.vlan_id,
-						fdb->dst->dev);
+						fdb->dst->dev,
+						fdb->added_by_user);
 		break;
 	case RTM_NEWNEIGH:
 		br_switchdev_fdb_call_notifiers(true, fdb->key.addr.addr,
 						fdb->key.vlan_id,
-						fdb->dst->dev);
+						fdb->dst->dev,
+						fdb->added_by_user);
 		break;
 	}
 }
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index f3fb3a0880b1..c287f1ef964c 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1441,6 +1441,7 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused,
 				     unsigned long event, void *ptr)
 {
 	struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
+	struct switchdev_notifier_fdb_info *fdb_info = ptr;
 	struct dsa_switchdev_event_work *switchdev_work;
 
 	if (!dsa_slave_dev_check(dev))
@@ -1458,8 +1459,10 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused,
 	switch (event) {
 	case SWITCHDEV_FDB_ADD_TO_DEVICE: /* fall through */
 	case SWITCHDEV_FDB_DEL_TO_DEVICE:
+		if (!fdb_info->added_by_user)
+			break;
 		if (dsa_slave_switchdev_fdb_work_init(switchdev_work,
-						      ptr))
+						      fdb_info))
 			goto err_fdb_work_init;
 		dev_hold(dev);
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From 161d82de1ff8430ef96d90d0da0e3643979791d2 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 3 May 2018 14:43:53 +0200
Subject: net: bridge: Notify about !added_by_user FDB entries

Do not automatically bail out on sending notifications about activity on
non-user-added FDB entries. Instead, notify about this activity except
for cases where the activity itself originates in a notification, to
avoid sending duplicate notifications.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Acked-by: Ivan Vecera <ivecera@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br.c           |  4 ++--
 net/bridge/br_fdb.c       | 47 ++++++++++++++++++++++++++---------------------
 net/bridge/br_private.h   |  6 ++++--
 net/bridge/br_switchdev.c |  2 +-
 4 files changed, 33 insertions(+), 26 deletions(-)

diff --git a/net/bridge/br.c b/net/bridge/br.c
index 2ca035054664..b0a0b82e2d91 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -145,7 +145,7 @@ static int br_switchdev_event(struct notifier_block *unused,
 	case SWITCHDEV_FDB_ADD_TO_BRIDGE:
 		fdb_info = ptr;
 		err = br_fdb_external_learn_add(br, p, fdb_info->addr,
-						fdb_info->vid);
+						fdb_info->vid, false);
 		if (err) {
 			err = notifier_from_errno(err);
 			break;
@@ -156,7 +156,7 @@ static int br_switchdev_event(struct notifier_block *unused,
 	case SWITCHDEV_FDB_DEL_TO_BRIDGE:
 		fdb_info = ptr;
 		err = br_fdb_external_learn_del(br, p, fdb_info->addr,
-						fdb_info->vid);
+						fdb_info->vid, false);
 		if (err)
 			err = notifier_from_errno(err);
 		break;
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index a1c409cacec4..b19e3104afd6 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -40,7 +40,7 @@ static struct kmem_cache *br_fdb_cache __read_mostly;
 static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
 		      const unsigned char *addr, u16 vid);
 static void fdb_notify(struct net_bridge *br,
-		       const struct net_bridge_fdb_entry *, int);
+		       const struct net_bridge_fdb_entry *, int, bool);
 
 int __init br_fdb_init(void)
 {
@@ -195,7 +195,8 @@ static void fdb_del_hw_addr(struct net_bridge *br, const unsigned char *addr)
 	}
 }
 
-static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f)
+static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f,
+		       bool swdev_notify)
 {
 	trace_fdb_delete(br, f);
 
@@ -205,7 +206,7 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f)
 	hlist_del_init_rcu(&f->fdb_node);
 	rhashtable_remove_fast(&br->fdb_hash_tbl, &f->rhnode,
 			       br_fdb_rht_params);
-	fdb_notify(br, f, RTM_DELNEIGH);
+	fdb_notify(br, f, RTM_DELNEIGH, swdev_notify);
 	call_rcu(&f->rcu, fdb_rcu_free);
 }
 
@@ -241,7 +242,7 @@ static void fdb_delete_local(struct net_bridge *br,
 		return;
 	}
 
-	fdb_delete(br, f);
+	fdb_delete(br, f, true);
 }
 
 void br_fdb_find_delete_local(struct net_bridge *br,
@@ -356,7 +357,7 @@ void br_fdb_cleanup(struct work_struct *work)
 		} else {
 			spin_lock_bh(&br->hash_lock);
 			if (!hlist_unhashed(&f->fdb_node))
-				fdb_delete(br, f);
+				fdb_delete(br, f, true);
 			spin_unlock_bh(&br->hash_lock);
 		}
 	}
@@ -376,7 +377,7 @@ void br_fdb_flush(struct net_bridge *br)
 	spin_lock_bh(&br->hash_lock);
 	hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) {
 		if (!f->is_static)
-			fdb_delete(br, f);
+			fdb_delete(br, f, true);
 	}
 	spin_unlock_bh(&br->hash_lock);
 }
@@ -405,7 +406,7 @@ void br_fdb_delete_by_port(struct net_bridge *br,
 		if (f->is_local)
 			fdb_delete_local(br, p, f);
 		else
-			fdb_delete(br, f);
+			fdb_delete(br, f, true);
 	}
 	spin_unlock_bh(&br->hash_lock);
 }
@@ -531,7 +532,7 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
 			return 0;
 		br_warn(br, "adding interface %s with same address as a received packet (addr:%pM, vlan:%u)\n",
 		       source ? source->dev->name : br->dev->name, addr, vid);
-		fdb_delete(br, fdb);
+		fdb_delete(br, fdb, true);
 	}
 
 	fdb = fdb_create(br, source, addr, vid, 1, 1);
@@ -539,7 +540,7 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
 		return -ENOMEM;
 
 	fdb_add_hw_addr(br, addr);
-	fdb_notify(br, fdb, RTM_NEWNEIGH);
+	fdb_notify(br, fdb, RTM_NEWNEIGH, true);
 	return 0;
 }
 
@@ -594,7 +595,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 				fdb->added_by_user = 1;
 			if (unlikely(fdb_modified)) {
 				trace_br_fdb_update(br, source, addr, vid, added_by_user);
-				fdb_notify(br, fdb, RTM_NEWNEIGH);
+				fdb_notify(br, fdb, RTM_NEWNEIGH, true);
 			}
 		}
 	} else {
@@ -605,7 +606,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 				fdb->added_by_user = 1;
 			trace_br_fdb_update(br, source, addr, vid,
 					    added_by_user);
-			fdb_notify(br, fdb, RTM_NEWNEIGH);
+			fdb_notify(br, fdb, RTM_NEWNEIGH, true);
 		}
 		/* else  we lose race and someone else inserts
 		 * it first, don't bother updating
@@ -687,13 +688,15 @@ static inline size_t fdb_nlmsg_size(void)
 }
 
 static void fdb_notify(struct net_bridge *br,
-		       const struct net_bridge_fdb_entry *fdb, int type)
+		       const struct net_bridge_fdb_entry *fdb, int type,
+		       bool swdev_notify)
 {
 	struct net *net = dev_net(br->dev);
 	struct sk_buff *skb;
 	int err = -ENOBUFS;
 
-	br_switchdev_fdb_notify(fdb, type);
+	if (swdev_notify)
+		br_switchdev_fdb_notify(fdb, type);
 
 	skb = nlmsg_new(fdb_nlmsg_size(), GFP_ATOMIC);
 	if (skb == NULL)
@@ -832,7 +835,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
 	fdb->used = jiffies;
 	if (modified) {
 		fdb->updated = jiffies;
-		fdb_notify(br, fdb, RTM_NEWNEIGH);
+		fdb_notify(br, fdb, RTM_NEWNEIGH, true);
 	}
 
 	return 0;
@@ -856,7 +859,7 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br,
 		rcu_read_unlock();
 		local_bh_enable();
 	} else if (ndm->ndm_flags & NTF_EXT_LEARNED) {
-		err = br_fdb_external_learn_add(br, p, addr, vid);
+		err = br_fdb_external_learn_add(br, p, addr, vid, true);
 	} else {
 		spin_lock_bh(&br->hash_lock);
 		err = fdb_add_entry(br, p, addr, ndm->ndm_state,
@@ -945,7 +948,7 @@ static int fdb_delete_by_addr_and_port(struct net_bridge *br,
 	if (!fdb || fdb->dst != p)
 		return -ENOENT;
 
-	fdb_delete(br, fdb);
+	fdb_delete(br, fdb, true);
 
 	return 0;
 }
@@ -1065,7 +1068,8 @@ void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p)
 }
 
 int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
-			      const unsigned char *addr, u16 vid)
+			      const unsigned char *addr, u16 vid,
+			      bool swdev_notify)
 {
 	struct net_bridge_fdb_entry *fdb;
 	bool modified = false;
@@ -1083,7 +1087,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
 			goto err_unlock;
 		}
 		fdb->added_by_external_learn = 1;
-		fdb_notify(br, fdb, RTM_NEWNEIGH);
+		fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify);
 	} else {
 		fdb->updated = jiffies;
 
@@ -1102,7 +1106,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
 		}
 
 		if (modified)
-			fdb_notify(br, fdb, RTM_NEWNEIGH);
+			fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify);
 	}
 
 err_unlock:
@@ -1112,7 +1116,8 @@ err_unlock:
 }
 
 int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
-			      const unsigned char *addr, u16 vid)
+			      const unsigned char *addr, u16 vid,
+			      bool swdev_notify)
 {
 	struct net_bridge_fdb_entry *fdb;
 	int err = 0;
@@ -1121,7 +1126,7 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
 
 	fdb = br_fdb_find(br, addr, vid);
 	if (fdb && fdb->added_by_external_learn)
-		fdb_delete(br, fdb);
+		fdb_delete(br, fdb, swdev_notify);
 	else
 		err = -ENOENT;
 
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 0ddeeea2c6a7..742f40aefdaf 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -553,9 +553,11 @@ int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
 int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p);
 void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p);
 int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
-			      const unsigned char *addr, u16 vid);
+			      const unsigned char *addr, u16 vid,
+			      bool swdev_notify);
 int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
-			      const unsigned char *addr, u16 vid);
+			      const unsigned char *addr, u16 vid,
+			      bool swdev_notify);
 void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
 			  const unsigned char *addr, u16 vid);
 
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index 71a03c487c4b..35474d49555d 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -118,7 +118,7 @@ br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac,
 void
 br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
 {
-	if (!fdb->added_by_user || !fdb->dst)
+	if (!fdb->dst)
 		return;
 
 	switch (type) {
-- 
cgit v1.2.3-59-g8ed1b


From 68e8b849b221b37a78a110a0307717d45e3593a0 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Wed, 2 May 2018 13:01:22 +0200
Subject: net: initial AF_XDP skeleton
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Buildable skeleton of AF_XDP without any functionality. Just what it
takes to register a new address family.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 MAINTAINERS                         |  8 ++++++++
 include/linux/socket.h              |  5 ++++-
 net/Kconfig                         |  1 +
 net/core/sock.c                     | 12 ++++++++----
 net/xdp/Kconfig                     |  7 +++++++
 security/selinux/hooks.c            |  4 +++-
 security/selinux/include/classmap.h |  4 +++-
 7 files changed, 34 insertions(+), 7 deletions(-)
 create mode 100644 net/xdp/Kconfig

diff --git a/MAINTAINERS b/MAINTAINERS
index 537fd17a211b..52d246fd29c9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -15424,6 +15424,14 @@ T:	git git://linuxtv.org/media_tree.git
 S:	Maintained
 F:	drivers/media/tuners/tuner-xc2028.*
 
+XDP SOCKETS (AF_XDP)
+M:	Björn Töpel <bjorn.topel@intel.com>
+M:	Magnus Karlsson <magnus.karlsson@intel.com>
+L:	netdev@vger.kernel.org
+S:	Maintained
+F:	kernel/bpf/xskmap.c
+F:	net/xdp/
+
 XEN BLOCK SUBSYSTEM
 M:	Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
 M:	Roger Pau Monné <roger.pau@citrix.com>
diff --git a/include/linux/socket.h b/include/linux/socket.h
index ea50f4a65816..7ed4713d5337 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -207,8 +207,9 @@ struct ucred {
 				 * PF_SMC protocol family that
 				 * reuses AF_INET address family
 				 */
+#define AF_XDP		44	/* XDP sockets			*/
 
-#define AF_MAX		44	/* For now.. */
+#define AF_MAX		45	/* For now.. */
 
 /* Protocol families, same as address families. */
 #define PF_UNSPEC	AF_UNSPEC
@@ -257,6 +258,7 @@ struct ucred {
 #define PF_KCM		AF_KCM
 #define PF_QIPCRTR	AF_QIPCRTR
 #define PF_SMC		AF_SMC
+#define PF_XDP		AF_XDP
 #define PF_MAX		AF_MAX
 
 /* Maximum queue length specifiable by listen.  */
@@ -338,6 +340,7 @@ struct ucred {
 #define SOL_NFC		280
 #define SOL_KCM		281
 #define SOL_TLS		282
+#define SOL_XDP		283
 
 /* IPX options */
 #define IPX_TYPE	1
diff --git a/net/Kconfig b/net/Kconfig
index 6fa1a4493b8c..86471a1c1ed4 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -59,6 +59,7 @@ source "net/tls/Kconfig"
 source "net/xfrm/Kconfig"
 source "net/iucv/Kconfig"
 source "net/smc/Kconfig"
+source "net/xdp/Kconfig"
 
 config INET
 	bool "TCP/IP networking"
diff --git a/net/core/sock.c b/net/core/sock.c
index b2c3db169ca1..e7d8b6c955c6 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -226,7 +226,8 @@ static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
-  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_MAX"
+  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
+  x "AF_MAX"
 
 static const char *const af_family_key_strings[AF_MAX+1] = {
 	_sock_locks("sk_lock-")
@@ -262,7 +263,8 @@ static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
   "rlock-AF_RXRPC" , "rlock-AF_ISDN"     , "rlock-AF_PHONET"   ,
   "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG"      ,
   "rlock-AF_NFC"   , "rlock-AF_VSOCK"    , "rlock-AF_KCM"      ,
-  "rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_MAX"
+  "rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_XDP"      ,
+  "rlock-AF_MAX"
 };
 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
   "wlock-AF_UNSPEC", "wlock-AF_UNIX"     , "wlock-AF_INET"     ,
@@ -279,7 +281,8 @@ static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
   "wlock-AF_RXRPC" , "wlock-AF_ISDN"     , "wlock-AF_PHONET"   ,
   "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG"      ,
   "wlock-AF_NFC"   , "wlock-AF_VSOCK"    , "wlock-AF_KCM"      ,
-  "wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_MAX"
+  "wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_XDP"      ,
+  "wlock-AF_MAX"
 };
 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
   "elock-AF_UNSPEC", "elock-AF_UNIX"     , "elock-AF_INET"     ,
@@ -296,7 +299,8 @@ static const char *const af_family_elock_key_strings[AF_MAX+1] = {
   "elock-AF_RXRPC" , "elock-AF_ISDN"     , "elock-AF_PHONET"   ,
   "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG"      ,
   "elock-AF_NFC"   , "elock-AF_VSOCK"    , "elock-AF_KCM"      ,
-  "elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_MAX"
+  "elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_XDP"      ,
+  "elock-AF_MAX"
 };
 
 /*
diff --git a/net/xdp/Kconfig b/net/xdp/Kconfig
new file mode 100644
index 000000000000..90e4a7152854
--- /dev/null
+++ b/net/xdp/Kconfig
@@ -0,0 +1,7 @@
+config XDP_SOCKETS
+	bool "XDP sockets"
+	depends on BPF_SYSCALL
+	default n
+	help
+	  XDP sockets allows a channel between XDP programs and
+	  userspace applications.
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 4cafe6a19167..5c508d26b367 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1471,7 +1471,9 @@ static inline u16 socket_type_to_security_class(int family, int type, int protoc
 			return SECCLASS_QIPCRTR_SOCKET;
 		case PF_SMC:
 			return SECCLASS_SMC_SOCKET;
-#if PF_MAX > 44
+		case PF_XDP:
+			return SECCLASS_XDP_SOCKET;
+#if PF_MAX > 45
 #error New address family defined, please update this function.
 #endif
 		}
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
index 7f0372426494..bd5fe0d3204a 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -240,9 +240,11 @@ struct security_class_mapping secclass_map[] = {
 	  { "manage_subnet", NULL } },
 	{ "bpf",
 	  {"map_create", "map_read", "map_write", "prog_load", "prog_run"} },
+	{ "xdp_socket",
+	  { COMMON_SOCK_PERMS, NULL } },
 	{ NULL }
   };
 
-#if PF_MAX > 44
+#if PF_MAX > 45
 #error New address family defined, please update secclass_map.
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From c0c77d8fb787cfe0c3fca689c2a30d1dad4eaba7 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Wed, 2 May 2018 13:01:23 +0200
Subject: xsk: add user memory registration support sockopt
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In this commit the base structure of the AF_XDP address family is set
up. Further, we introduce the abilty register a window of user memory
to the kernel via the XDP_UMEM_REG setsockopt syscall. The memory
window is viewed by an AF_XDP socket as a set of equally large
frames. After a user memory registration all frames are "owned" by the
user application, and not the kernel.

v2: More robust checks on umem creation and unaccount on error.
    Call set_page_dirty_lock on cleanup.
    Simplified xdp_umem_reg.

Co-authored-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/xdp_sock.h      |  31 ++++++
 include/uapi/linux/if_xdp.h |  34 ++++++
 net/Makefile                |   1 +
 net/xdp/Makefile            |   2 +
 net/xdp/xdp_umem.c          | 245 ++++++++++++++++++++++++++++++++++++++++++++
 net/xdp/xdp_umem.h          |  45 ++++++++
 net/xdp/xdp_umem_props.h    |  23 +++++
 net/xdp/xsk.c               | 215 ++++++++++++++++++++++++++++++++++++++
 8 files changed, 596 insertions(+)
 create mode 100644 include/net/xdp_sock.h
 create mode 100644 include/uapi/linux/if_xdp.h
 create mode 100644 net/xdp/Makefile
 create mode 100644 net/xdp/xdp_umem.c
 create mode 100644 net/xdp/xdp_umem.h
 create mode 100644 net/xdp/xdp_umem_props.h
 create mode 100644 net/xdp/xsk.c

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
new file mode 100644
index 000000000000..94785f5db13e
--- /dev/null
+++ b/include/net/xdp_sock.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * AF_XDP internal functions
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _LINUX_XDP_SOCK_H
+#define _LINUX_XDP_SOCK_H
+
+#include <linux/mutex.h>
+#include <net/sock.h>
+
+struct xdp_umem;
+
+struct xdp_sock {
+	/* struct sock must be the first member of struct xdp_sock */
+	struct sock sk;
+	struct xdp_umem *umem;
+	/* Protects multiple processes in the control path */
+	struct mutex mutex;
+};
+
+#endif /* _LINUX_XDP_SOCK_H */
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
new file mode 100644
index 000000000000..41252135a0fe
--- /dev/null
+++ b/include/uapi/linux/if_xdp.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+ *
+ * if_xdp: XDP socket user-space interface
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * Author(s): Björn Töpel <bjorn.topel@intel.com>
+ *	      Magnus Karlsson <magnus.karlsson@intel.com>
+ */
+
+#ifndef _LINUX_IF_XDP_H
+#define _LINUX_IF_XDP_H
+
+#include <linux/types.h>
+
+/* XDP socket options */
+#define XDP_UMEM_REG			3
+
+struct xdp_umem_reg {
+	__u64 addr; /* Start of packet data area */
+	__u64 len; /* Length of packet data area */
+	__u32 frame_size; /* Frame size */
+	__u32 frame_headroom; /* Frame head room */
+};
+
+#endif /* _LINUX_IF_XDP_H */
diff --git a/net/Makefile b/net/Makefile
index a6147c61b174..77aaddedbd29 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -85,3 +85,4 @@ obj-y				+= l3mdev/
 endif
 obj-$(CONFIG_QRTR)		+= qrtr/
 obj-$(CONFIG_NET_NCSI)		+= ncsi/
+obj-$(CONFIG_XDP_SOCKETS)	+= xdp/
diff --git a/net/xdp/Makefile b/net/xdp/Makefile
new file mode 100644
index 000000000000..a5d736640a0f
--- /dev/null
+++ b/net/xdp/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o
+
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
new file mode 100644
index 000000000000..ec8b3552be44
--- /dev/null
+++ b/net/xdp/xdp_umem.c
@@ -0,0 +1,245 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XDP user-space packet buffer
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/init.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/mm.h>
+
+#include "xdp_umem.h"
+
+#define XDP_UMEM_MIN_FRAME_SIZE 2048
+
+int xdp_umem_create(struct xdp_umem **umem)
+{
+	*umem = kzalloc(sizeof(**umem), GFP_KERNEL);
+
+	if (!(*umem))
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void xdp_umem_unpin_pages(struct xdp_umem *umem)
+{
+	unsigned int i;
+
+	if (umem->pgs) {
+		for (i = 0; i < umem->npgs; i++) {
+			struct page *page = umem->pgs[i];
+
+			set_page_dirty_lock(page);
+			put_page(page);
+		}
+
+		kfree(umem->pgs);
+		umem->pgs = NULL;
+	}
+}
+
+static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
+{
+	if (umem->user) {
+		atomic_long_sub(umem->npgs, &umem->user->locked_vm);
+		free_uid(umem->user);
+	}
+}
+
+static void xdp_umem_release(struct xdp_umem *umem)
+{
+	struct task_struct *task;
+	struct mm_struct *mm;
+
+	if (umem->pgs) {
+		xdp_umem_unpin_pages(umem);
+
+		task = get_pid_task(umem->pid, PIDTYPE_PID);
+		put_pid(umem->pid);
+		if (!task)
+			goto out;
+		mm = get_task_mm(task);
+		put_task_struct(task);
+		if (!mm)
+			goto out;
+
+		mmput(mm);
+		umem->pgs = NULL;
+	}
+
+	xdp_umem_unaccount_pages(umem);
+out:
+	kfree(umem);
+}
+
+static void xdp_umem_release_deferred(struct work_struct *work)
+{
+	struct xdp_umem *umem = container_of(work, struct xdp_umem, work);
+
+	xdp_umem_release(umem);
+}
+
+void xdp_get_umem(struct xdp_umem *umem)
+{
+	atomic_inc(&umem->users);
+}
+
+void xdp_put_umem(struct xdp_umem *umem)
+{
+	if (!umem)
+		return;
+
+	if (atomic_dec_and_test(&umem->users)) {
+		INIT_WORK(&umem->work, xdp_umem_release_deferred);
+		schedule_work(&umem->work);
+	}
+}
+
+static int xdp_umem_pin_pages(struct xdp_umem *umem)
+{
+	unsigned int gup_flags = FOLL_WRITE;
+	long npgs;
+	int err;
+
+	umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL);
+	if (!umem->pgs)
+		return -ENOMEM;
+
+	down_write(&current->mm->mmap_sem);
+	npgs = get_user_pages(umem->address, umem->npgs,
+			      gup_flags, &umem->pgs[0], NULL);
+	up_write(&current->mm->mmap_sem);
+
+	if (npgs != umem->npgs) {
+		if (npgs >= 0) {
+			umem->npgs = npgs;
+			err = -ENOMEM;
+			goto out_pin;
+		}
+		err = npgs;
+		goto out_pgs;
+	}
+	return 0;
+
+out_pin:
+	xdp_umem_unpin_pages(umem);
+out_pgs:
+	kfree(umem->pgs);
+	umem->pgs = NULL;
+	return err;
+}
+
+static int xdp_umem_account_pages(struct xdp_umem *umem)
+{
+	unsigned long lock_limit, new_npgs, old_npgs;
+
+	if (capable(CAP_IPC_LOCK))
+		return 0;
+
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	umem->user = get_uid(current_user());
+
+	do {
+		old_npgs = atomic_long_read(&umem->user->locked_vm);
+		new_npgs = old_npgs + umem->npgs;
+		if (new_npgs > lock_limit) {
+			free_uid(umem->user);
+			umem->user = NULL;
+			return -ENOBUFS;
+		}
+	} while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs,
+				     new_npgs) != old_npgs);
+	return 0;
+}
+
+int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
+{
+	u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom;
+	u64 addr = mr->addr, size = mr->len;
+	unsigned int nframes, nfpp;
+	int size_chk, err;
+
+	if (!umem)
+		return -EINVAL;
+
+	if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) {
+		/* Strictly speaking we could support this, if:
+		 * - huge pages, or*
+		 * - using an IOMMU, or
+		 * - making sure the memory area is consecutive
+		 * but for now, we simply say "computer says no".
+		 */
+		return -EINVAL;
+	}
+
+	if (!is_power_of_2(frame_size))
+		return -EINVAL;
+
+	if (!PAGE_ALIGNED(addr)) {
+		/* Memory area has to be page size aligned. For
+		 * simplicity, this might change.
+		 */
+		return -EINVAL;
+	}
+
+	if ((addr + size) < addr)
+		return -EINVAL;
+
+	nframes = size / frame_size;
+	if (nframes == 0 || nframes > UINT_MAX)
+		return -EINVAL;
+
+	nfpp = PAGE_SIZE / frame_size;
+	if (nframes < nfpp || nframes % nfpp)
+		return -EINVAL;
+
+	frame_headroom = ALIGN(frame_headroom, 64);
+
+	size_chk = frame_size - frame_headroom - XDP_PACKET_HEADROOM;
+	if (size_chk < 0)
+		return -EINVAL;
+
+	umem->pid = get_task_pid(current, PIDTYPE_PID);
+	umem->size = (size_t)size;
+	umem->address = (unsigned long)addr;
+	umem->props.frame_size = frame_size;
+	umem->props.nframes = nframes;
+	umem->frame_headroom = frame_headroom;
+	umem->npgs = size / PAGE_SIZE;
+	umem->pgs = NULL;
+	umem->user = NULL;
+
+	umem->frame_size_log2 = ilog2(frame_size);
+	umem->nfpp_mask = nfpp - 1;
+	umem->nfpplog2 = ilog2(nfpp);
+	atomic_set(&umem->users, 1);
+
+	err = xdp_umem_account_pages(umem);
+	if (err)
+		goto out;
+
+	err = xdp_umem_pin_pages(umem);
+	if (err)
+		goto out_account;
+	return 0;
+
+out_account:
+	xdp_umem_unaccount_pages(umem);
+out:
+	put_pid(umem->pid);
+	return err;
+}
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
new file mode 100644
index 000000000000..4597ae81a221
--- /dev/null
+++ b/net/xdp/xdp_umem.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * XDP user-space packet buffer
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef XDP_UMEM_H_
+#define XDP_UMEM_H_
+
+#include <linux/mm.h>
+#include <linux/if_xdp.h>
+#include <linux/workqueue.h>
+
+#include "xdp_umem_props.h"
+
+struct xdp_umem {
+	struct page **pgs;
+	struct xdp_umem_props props;
+	u32 npgs;
+	u32 frame_headroom;
+	u32 nfpp_mask;
+	u32 nfpplog2;
+	u32 frame_size_log2;
+	struct user_struct *user;
+	struct pid *pid;
+	unsigned long address;
+	size_t size;
+	atomic_t users;
+	struct work_struct work;
+};
+
+int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr);
+void xdp_get_umem(struct xdp_umem *umem);
+void xdp_put_umem(struct xdp_umem *umem);
+int xdp_umem_create(struct xdp_umem **umem);
+
+#endif /* XDP_UMEM_H_ */
diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h
new file mode 100644
index 000000000000..77fb5daf29f3
--- /dev/null
+++ b/net/xdp/xdp_umem_props.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * XDP user-space packet buffer
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef XDP_UMEM_PROPS_H_
+#define XDP_UMEM_PROPS_H_
+
+struct xdp_umem_props {
+	u32 frame_size;
+	u32 nframes;
+};
+
+#endif /* XDP_UMEM_PROPS_H_ */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
new file mode 100644
index 000000000000..84e0e867febb
--- /dev/null
+++ b/net/xdp/xsk.c
@@ -0,0 +1,215 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XDP sockets
+ *
+ * AF_XDP sockets allows a channel between XDP programs and userspace
+ * applications.
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * Author(s): Björn Töpel <bjorn.topel@intel.com>
+ *	      Magnus Karlsson <magnus.karlsson@intel.com>
+ */
+
+#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
+
+#include <linux/if_xdp.h>
+#include <linux/init.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/socket.h>
+#include <linux/file.h>
+#include <linux/uaccess.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <net/xdp_sock.h>
+
+#include "xdp_umem.h"
+
+static struct xdp_sock *xdp_sk(struct sock *sk)
+{
+	return (struct xdp_sock *)sk;
+}
+
+static int xsk_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct net *net;
+
+	if (!sk)
+		return 0;
+
+	net = sock_net(sk);
+
+	local_bh_disable();
+	sock_prot_inuse_add(net, sk->sk_prot, -1);
+	local_bh_enable();
+
+	sock_orphan(sk);
+	sock->sk = NULL;
+
+	sk_refcnt_debug_release(sk);
+	sock_put(sk);
+
+	return 0;
+}
+
+static int xsk_setsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	struct xdp_sock *xs = xdp_sk(sk);
+	int err;
+
+	if (level != SOL_XDP)
+		return -ENOPROTOOPT;
+
+	switch (optname) {
+	case XDP_UMEM_REG:
+	{
+		struct xdp_umem_reg mr;
+		struct xdp_umem *umem;
+
+		if (xs->umem)
+			return -EBUSY;
+
+		if (copy_from_user(&mr, optval, sizeof(mr)))
+			return -EFAULT;
+
+		mutex_lock(&xs->mutex);
+		err = xdp_umem_create(&umem);
+
+		err = xdp_umem_reg(umem, &mr);
+		if (err) {
+			kfree(umem);
+			mutex_unlock(&xs->mutex);
+			return err;
+		}
+
+		/* Make sure umem is ready before it can be seen by others */
+		smp_wmb();
+
+		xs->umem = umem;
+		mutex_unlock(&xs->mutex);
+		return 0;
+	}
+	default:
+		break;
+	}
+
+	return -ENOPROTOOPT;
+}
+
+static struct proto xsk_proto = {
+	.name =		"XDP",
+	.owner =	THIS_MODULE,
+	.obj_size =	sizeof(struct xdp_sock),
+};
+
+static const struct proto_ops xsk_proto_ops = {
+	.family =	PF_XDP,
+	.owner =	THIS_MODULE,
+	.release =	xsk_release,
+	.bind =		sock_no_bind,
+	.connect =	sock_no_connect,
+	.socketpair =	sock_no_socketpair,
+	.accept =	sock_no_accept,
+	.getname =	sock_no_getname,
+	.poll =		sock_no_poll,
+	.ioctl =	sock_no_ioctl,
+	.listen =	sock_no_listen,
+	.shutdown =	sock_no_shutdown,
+	.setsockopt =	xsk_setsockopt,
+	.getsockopt =	sock_no_getsockopt,
+	.sendmsg =	sock_no_sendmsg,
+	.recvmsg =	sock_no_recvmsg,
+	.mmap =		sock_no_mmap,
+	.sendpage =	sock_no_sendpage,
+};
+
+static void xsk_destruct(struct sock *sk)
+{
+	struct xdp_sock *xs = xdp_sk(sk);
+
+	if (!sock_flag(sk, SOCK_DEAD))
+		return;
+
+	xdp_put_umem(xs->umem);
+
+	sk_refcnt_debug_dec(sk);
+}
+
+static int xsk_create(struct net *net, struct socket *sock, int protocol,
+		      int kern)
+{
+	struct sock *sk;
+	struct xdp_sock *xs;
+
+	if (!ns_capable(net->user_ns, CAP_NET_RAW))
+		return -EPERM;
+	if (sock->type != SOCK_RAW)
+		return -ESOCKTNOSUPPORT;
+
+	if (protocol)
+		return -EPROTONOSUPPORT;
+
+	sock->state = SS_UNCONNECTED;
+
+	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
+	if (!sk)
+		return -ENOBUFS;
+
+	sock->ops = &xsk_proto_ops;
+
+	sock_init_data(sock, sk);
+
+	sk->sk_family = PF_XDP;
+
+	sk->sk_destruct = xsk_destruct;
+	sk_refcnt_debug_inc(sk);
+
+	xs = xdp_sk(sk);
+	mutex_init(&xs->mutex);
+
+	local_bh_disable();
+	sock_prot_inuse_add(net, &xsk_proto, 1);
+	local_bh_enable();
+
+	return 0;
+}
+
+static const struct net_proto_family xsk_family_ops = {
+	.family = PF_XDP,
+	.create = xsk_create,
+	.owner	= THIS_MODULE,
+};
+
+static int __init xsk_init(void)
+{
+	int err;
+
+	err = proto_register(&xsk_proto, 0 /* no slab */);
+	if (err)
+		goto out;
+
+	err = sock_register(&xsk_family_ops);
+	if (err)
+		goto out_proto;
+
+	return 0;
+
+out_proto:
+	proto_unregister(&xsk_proto);
+out:
+	return err;
+}
+
+fs_initcall(xsk_init);
-- 
cgit v1.2.3-59-g8ed1b


From 423f38329d267969130fb6f2c685f73d72687558 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 2 May 2018 13:01:24 +0200
Subject: xsk: add umem fill queue support and mmap

Here, we add another setsockopt for registered user memory (umem)
called XDP_UMEM_FILL_QUEUE. Using this socket option, the process can
ask the kernel to allocate a queue (ring buffer) and also mmap it
(XDP_UMEM_PGOFF_FILL_QUEUE) into the process.

The queue is used to explicitly pass ownership of umem frames from the
user process to the kernel. These frames will in a later patch be
filled in with Rx packet data by the kernel.

v2: Fixed potential crash in xsk_mmap.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/if_xdp.h | 15 +++++++++++
 net/xdp/Makefile            |  2 +-
 net/xdp/xdp_umem.c          |  5 ++++
 net/xdp/xdp_umem.h          |  2 ++
 net/xdp/xsk.c               | 65 ++++++++++++++++++++++++++++++++++++++++++++-
 net/xdp/xsk_queue.c         | 58 ++++++++++++++++++++++++++++++++++++++++
 net/xdp/xsk_queue.h         | 38 ++++++++++++++++++++++++++
 7 files changed, 183 insertions(+), 2 deletions(-)
 create mode 100644 net/xdp/xsk_queue.c
 create mode 100644 net/xdp/xsk_queue.h

diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 41252135a0fe..975661e1baca 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -23,6 +23,7 @@
 
 /* XDP socket options */
 #define XDP_UMEM_REG			3
+#define XDP_UMEM_FILL_RING		4
 
 struct xdp_umem_reg {
 	__u64 addr; /* Start of packet data area */
@@ -31,4 +32,18 @@ struct xdp_umem_reg {
 	__u32 frame_headroom; /* Frame head room */
 };
 
+/* Pgoff for mmaping the rings */
+#define XDP_UMEM_PGOFF_FILL_RING	0x100000000
+
+struct xdp_ring {
+	__u32 producer __attribute__((aligned(64)));
+	__u32 consumer __attribute__((aligned(64)));
+};
+
+/* Used for the fill and completion queues for buffers */
+struct xdp_umem_ring {
+	struct xdp_ring ptrs;
+	__u32 desc[0] __attribute__((aligned(64)));
+};
+
 #endif /* _LINUX_IF_XDP_H */
diff --git a/net/xdp/Makefile b/net/xdp/Makefile
index a5d736640a0f..074fb2b2d51c 100644
--- a/net/xdp/Makefile
+++ b/net/xdp/Makefile
@@ -1,2 +1,2 @@
-obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o
+obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o
 
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index ec8b3552be44..e1f627d0cc1c 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -65,6 +65,11 @@ static void xdp_umem_release(struct xdp_umem *umem)
 	struct task_struct *task;
 	struct mm_struct *mm;
 
+	if (umem->fq) {
+		xskq_destroy(umem->fq);
+		umem->fq = NULL;
+	}
+
 	if (umem->pgs) {
 		xdp_umem_unpin_pages(umem);
 
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index 4597ae81a221..25634b8a5c6f 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -19,9 +19,11 @@
 #include <linux/if_xdp.h>
 #include <linux/workqueue.h>
 
+#include "xsk_queue.h"
 #include "xdp_umem_props.h"
 
 struct xdp_umem {
+	struct xsk_queue *fq;
 	struct page **pgs;
 	struct xdp_umem_props props;
 	u32 npgs;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 84e0e867febb..da67a3c5c1c9 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -32,6 +32,7 @@
 #include <linux/netdevice.h>
 #include <net/xdp_sock.h>
 
+#include "xsk_queue.h"
 #include "xdp_umem.h"
 
 static struct xdp_sock *xdp_sk(struct sock *sk)
@@ -39,6 +40,21 @@ static struct xdp_sock *xdp_sk(struct sock *sk)
 	return (struct xdp_sock *)sk;
 }
 
+static int xsk_init_queue(u32 entries, struct xsk_queue **queue)
+{
+	struct xsk_queue *q;
+
+	if (entries == 0 || *queue || !is_power_of_2(entries))
+		return -EINVAL;
+
+	q = xskq_create(entries);
+	if (!q)
+		return -ENOMEM;
+
+	*queue = q;
+	return 0;
+}
+
 static int xsk_release(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
@@ -101,6 +117,23 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 		mutex_unlock(&xs->mutex);
 		return 0;
 	}
+	case XDP_UMEM_FILL_RING:
+	{
+		struct xsk_queue **q;
+		int entries;
+
+		if (!xs->umem)
+			return -EINVAL;
+
+		if (copy_from_user(&entries, optval, sizeof(entries)))
+			return -EFAULT;
+
+		mutex_lock(&xs->mutex);
+		q = &xs->umem->fq;
+		err = xsk_init_queue(entries, q);
+		mutex_unlock(&xs->mutex);
+		return err;
+	}
 	default:
 		break;
 	}
@@ -108,6 +141,36 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 	return -ENOPROTOOPT;
 }
 
+static int xsk_mmap(struct file *file, struct socket *sock,
+		    struct vm_area_struct *vma)
+{
+	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+	unsigned long size = vma->vm_end - vma->vm_start;
+	struct xdp_sock *xs = xdp_sk(sock->sk);
+	struct xsk_queue *q = NULL;
+	unsigned long pfn;
+	struct page *qpg;
+
+	if (!xs->umem)
+		return -EINVAL;
+
+	if (offset == XDP_UMEM_PGOFF_FILL_RING)
+		q = xs->umem->fq;
+	else
+		return -EINVAL;
+
+	if (!q)
+		return -EINVAL;
+
+	qpg = virt_to_head_page(q->ring);
+	if (size > (PAGE_SIZE << compound_order(qpg)))
+		return -EINVAL;
+
+	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
+	return remap_pfn_range(vma, vma->vm_start, pfn,
+			       size, vma->vm_page_prot);
+}
+
 static struct proto xsk_proto = {
 	.name =		"XDP",
 	.owner =	THIS_MODULE,
@@ -131,7 +194,7 @@ static const struct proto_ops xsk_proto_ops = {
 	.getsockopt =	sock_no_getsockopt,
 	.sendmsg =	sock_no_sendmsg,
 	.recvmsg =	sock_no_recvmsg,
-	.mmap =		sock_no_mmap,
+	.mmap =		xsk_mmap,
 	.sendpage =	sock_no_sendpage,
 };
 
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
new file mode 100644
index 000000000000..23da4f29d3fb
--- /dev/null
+++ b/net/xdp/xsk_queue.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XDP user-space ring structure
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/slab.h>
+
+#include "xsk_queue.h"
+
+static u32 xskq_umem_get_ring_size(struct xsk_queue *q)
+{
+	return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u32);
+}
+
+struct xsk_queue *xskq_create(u32 nentries)
+{
+	struct xsk_queue *q;
+	gfp_t gfp_flags;
+	size_t size;
+
+	q = kzalloc(sizeof(*q), GFP_KERNEL);
+	if (!q)
+		return NULL;
+
+	q->nentries = nentries;
+	q->ring_mask = nentries - 1;
+
+	gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN |
+		    __GFP_COMP  | __GFP_NORETRY;
+	size = xskq_umem_get_ring_size(q);
+
+	q->ring = (struct xdp_ring *)__get_free_pages(gfp_flags,
+						      get_order(size));
+	if (!q->ring) {
+		kfree(q);
+		return NULL;
+	}
+
+	return q;
+}
+
+void xskq_destroy(struct xsk_queue *q)
+{
+	if (!q)
+		return;
+
+	page_frag_free(q->ring);
+	kfree(q);
+}
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
new file mode 100644
index 000000000000..7eb556bf73be
--- /dev/null
+++ b/net/xdp/xsk_queue.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * XDP user-space ring structure
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _LINUX_XSK_QUEUE_H
+#define _LINUX_XSK_QUEUE_H
+
+#include <linux/types.h>
+#include <linux/if_xdp.h>
+
+#include "xdp_umem_props.h"
+
+struct xsk_queue {
+	struct xdp_umem_props umem_props;
+	u32 ring_mask;
+	u32 nentries;
+	u32 prod_head;
+	u32 prod_tail;
+	u32 cons_head;
+	u32 cons_tail;
+	struct xdp_ring *ring;
+	u64 invalid_descs;
+};
+
+struct xsk_queue *xskq_create(u32 nentries);
+void xskq_destroy(struct xsk_queue *q);
+
+#endif /* _LINUX_XSK_QUEUE_H */
-- 
cgit v1.2.3-59-g8ed1b


From b9b6b68e8abd101be6eb5330e4999218c696d1e8 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Wed, 2 May 2018 13:01:25 +0200
Subject: xsk: add Rx queue setup and mmap support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Another setsockopt (XDP_RX_QUEUE) is added to let the process allocate
a queue, where the kernel can pass completed Rx frames from the kernel
to user process.

The mmapping of the queue is done using the XDP_PGOFF_RX_QUEUE offset.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/xdp_sock.h      |  4 ++++
 include/uapi/linux/if_xdp.h | 16 ++++++++++++++++
 net/xdp/xsk.c               | 41 ++++++++++++++++++++++++++++++++---------
 net/xdp/xsk_queue.c         | 11 +++++++++--
 net/xdp/xsk_queue.h         |  2 +-
 5 files changed, 62 insertions(+), 12 deletions(-)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 94785f5db13e..db9a321de087 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -18,11 +18,15 @@
 #include <linux/mutex.h>
 #include <net/sock.h>
 
+struct net_device;
+struct xsk_queue;
 struct xdp_umem;
 
 struct xdp_sock {
 	/* struct sock must be the first member of struct xdp_sock */
 	struct sock sk;
+	struct xsk_queue *rx;
+	struct net_device *dev;
 	struct xdp_umem *umem;
 	/* Protects multiple processes in the control path */
 	struct mutex mutex;
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 975661e1baca..65324558829d 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -22,6 +22,7 @@
 #include <linux/types.h>
 
 /* XDP socket options */
+#define XDP_RX_RING			1
 #define XDP_UMEM_REG			3
 #define XDP_UMEM_FILL_RING		4
 
@@ -33,13 +34,28 @@ struct xdp_umem_reg {
 };
 
 /* Pgoff for mmaping the rings */
+#define XDP_PGOFF_RX_RING			  0
 #define XDP_UMEM_PGOFF_FILL_RING	0x100000000
 
+struct xdp_desc {
+	__u32 idx;
+	__u32 len;
+	__u16 offset;
+	__u8 flags;
+	__u8 padding[5];
+};
+
 struct xdp_ring {
 	__u32 producer __attribute__((aligned(64)));
 	__u32 consumer __attribute__((aligned(64)));
 };
 
+/* Used for the RX and TX queues for packets */
+struct xdp_rxtx_ring {
+	struct xdp_ring ptrs;
+	struct xdp_desc desc[0] __attribute__((aligned(64)));
+};
+
 /* Used for the fill and completion queues for buffers */
 struct xdp_umem_ring {
 	struct xdp_ring ptrs;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index da67a3c5c1c9..92bd9b7e548f 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -31,6 +31,7 @@
 #include <linux/net.h>
 #include <linux/netdevice.h>
 #include <net/xdp_sock.h>
+#include <net/xdp.h>
 
 #include "xsk_queue.h"
 #include "xdp_umem.h"
@@ -40,14 +41,15 @@ static struct xdp_sock *xdp_sk(struct sock *sk)
 	return (struct xdp_sock *)sk;
 }
 
-static int xsk_init_queue(u32 entries, struct xsk_queue **queue)
+static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
+			  bool umem_queue)
 {
 	struct xsk_queue *q;
 
 	if (entries == 0 || *queue || !is_power_of_2(entries))
 		return -EINVAL;
 
-	q = xskq_create(entries);
+	q = xskq_create(entries, umem_queue);
 	if (!q)
 		return -ENOMEM;
 
@@ -89,6 +91,22 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 		return -ENOPROTOOPT;
 
 	switch (optname) {
+	case XDP_RX_RING:
+	{
+		struct xsk_queue **q;
+		int entries;
+
+		if (optlen < sizeof(entries))
+			return -EINVAL;
+		if (copy_from_user(&entries, optval, sizeof(entries)))
+			return -EFAULT;
+
+		mutex_lock(&xs->mutex);
+		q = &xs->rx;
+		err = xsk_init_queue(entries, q, false);
+		mutex_unlock(&xs->mutex);
+		return err;
+	}
 	case XDP_UMEM_REG:
 	{
 		struct xdp_umem_reg mr;
@@ -130,7 +148,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 
 		mutex_lock(&xs->mutex);
 		q = &xs->umem->fq;
-		err = xsk_init_queue(entries, q);
+		err = xsk_init_queue(entries, q, true);
 		mutex_unlock(&xs->mutex);
 		return err;
 	}
@@ -151,13 +169,17 @@ static int xsk_mmap(struct file *file, struct socket *sock,
 	unsigned long pfn;
 	struct page *qpg;
 
-	if (!xs->umem)
-		return -EINVAL;
+	if (offset == XDP_PGOFF_RX_RING) {
+		q = xs->rx;
+	} else {
+		if (!xs->umem)
+			return -EINVAL;
 
-	if (offset == XDP_UMEM_PGOFF_FILL_RING)
-		q = xs->umem->fq;
-	else
-		return -EINVAL;
+		if (offset == XDP_UMEM_PGOFF_FILL_RING)
+			q = xs->umem->fq;
+		else
+			return -EINVAL;
+	}
 
 	if (!q)
 		return -EINVAL;
@@ -205,6 +227,7 @@ static void xsk_destruct(struct sock *sk)
 	if (!sock_flag(sk, SOCK_DEAD))
 		return;
 
+	xskq_destroy(xs->rx);
 	xdp_put_umem(xs->umem);
 
 	sk_refcnt_debug_dec(sk);
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
index 23da4f29d3fb..894f9f89afc7 100644
--- a/net/xdp/xsk_queue.c
+++ b/net/xdp/xsk_queue.c
@@ -21,7 +21,13 @@ static u32 xskq_umem_get_ring_size(struct xsk_queue *q)
 	return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u32);
 }
 
-struct xsk_queue *xskq_create(u32 nentries)
+static u32 xskq_rxtx_get_ring_size(struct xsk_queue *q)
+{
+	return (sizeof(struct xdp_ring) +
+		q->nentries * sizeof(struct xdp_desc));
+}
+
+struct xsk_queue *xskq_create(u32 nentries, bool umem_queue)
 {
 	struct xsk_queue *q;
 	gfp_t gfp_flags;
@@ -36,7 +42,8 @@ struct xsk_queue *xskq_create(u32 nentries)
 
 	gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN |
 		    __GFP_COMP  | __GFP_NORETRY;
-	size = xskq_umem_get_ring_size(q);
+	size = umem_queue ? xskq_umem_get_ring_size(q) :
+	       xskq_rxtx_get_ring_size(q);
 
 	q->ring = (struct xdp_ring *)__get_free_pages(gfp_flags,
 						      get_order(size));
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 7eb556bf73be..5439fa381763 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -32,7 +32,7 @@ struct xsk_queue {
 	u64 invalid_descs;
 };
 
-struct xsk_queue *xskq_create(u32 nentries);
+struct xsk_queue *xskq_create(u32 nentries, bool umem_queue);
 void xskq_destroy(struct xsk_queue *q);
 
 #endif /* _LINUX_XSK_QUEUE_H */
-- 
cgit v1.2.3-59-g8ed1b


From 965a990984432cd01a9eb3514c64d86f56704295 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 2 May 2018 13:01:26 +0200
Subject: xsk: add support for bind for Rx

Here, the bind syscall is added. Binding an AF_XDP socket, means
associating the socket to an umem, a netdev and a queue index. This
can be done in two ways.

The first way, creating a "socket from scratch". Create the umem using
the XDP_UMEM_REG setsockopt and an associated fill queue with
XDP_UMEM_FILL_QUEUE. Create the Rx queue using the XDP_RX_QUEUE
setsockopt. Call bind passing ifindex and queue index ("channel" in
ethtool speak).

The second way to bind a socket, is simply skipping the
umem/netdev/queue index, and passing another already setup AF_XDP
socket. The new socket will then have the same umem/netdev/queue index
as the parent so it will share the same umem. You must also set the
flags field in the socket address to XDP_SHARED_UMEM.

v2: Use PTR_ERR instead of passing error variable explicitly.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/xdp_sock.h      |   1 +
 include/uapi/linux/if_xdp.h |  11 ++++
 net/xdp/xdp_umem.c          |   5 ++
 net/xdp/xdp_umem.h          |   1 +
 net/xdp/xsk.c               | 124 +++++++++++++++++++++++++++++++++++++++++++-
 net/xdp/xsk_queue.c         |   8 +++
 net/xdp/xsk_queue.h         |   1 +
 7 files changed, 150 insertions(+), 1 deletion(-)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index db9a321de087..85d02512f59b 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -28,6 +28,7 @@ struct xdp_sock {
 	struct xsk_queue *rx;
 	struct net_device *dev;
 	struct xdp_umem *umem;
+	u16 queue_id;
 	/* Protects multiple processes in the control path */
 	struct mutex mutex;
 };
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 65324558829d..e5091881f776 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -21,6 +21,17 @@
 
 #include <linux/types.h>
 
+/* Options for the sxdp_flags field */
+#define XDP_SHARED_UMEM 1
+
+struct sockaddr_xdp {
+	__u16 sxdp_family;
+	__u32 sxdp_ifindex;
+	__u32 sxdp_queue_id;
+	__u32 sxdp_shared_umem_fd;
+	__u16 sxdp_flags;
+};
+
 /* XDP socket options */
 #define XDP_RX_RING			1
 #define XDP_UMEM_REG			3
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index e1f627d0cc1c..9bac1ad570fa 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -248,3 +248,8 @@ out:
 	put_pid(umem->pid);
 	return err;
 }
+
+bool xdp_umem_validate_queues(struct xdp_umem *umem)
+{
+	return umem->fq;
+}
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index 25634b8a5c6f..b13133e9c501 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -39,6 +39,7 @@ struct xdp_umem {
 	struct work_struct work;
 };
 
+bool xdp_umem_validate_queues(struct xdp_umem *umem);
 int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr);
 void xdp_get_umem(struct xdp_umem *umem);
 void xdp_put_umem(struct xdp_umem *umem);
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 92bd9b7e548f..bf2c97b87992 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -57,9 +57,18 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
 	return 0;
 }
 
+static void __xsk_release(struct xdp_sock *xs)
+{
+	/* Wait for driver to stop using the xdp socket. */
+	synchronize_net();
+
+	dev_put(xs->dev);
+}
+
 static int xsk_release(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
+	struct xdp_sock *xs = xdp_sk(sk);
 	struct net *net;
 
 	if (!sk)
@@ -71,6 +80,11 @@ static int xsk_release(struct socket *sock)
 	sock_prot_inuse_add(net, sk->sk_prot, -1);
 	local_bh_enable();
 
+	if (xs->dev) {
+		__xsk_release(xs);
+		xs->dev = NULL;
+	}
+
 	sock_orphan(sk);
 	sock->sk = NULL;
 
@@ -80,6 +94,114 @@ static int xsk_release(struct socket *sock)
 	return 0;
 }
 
+static struct socket *xsk_lookup_xsk_from_fd(int fd)
+{
+	struct socket *sock;
+	int err;
+
+	sock = sockfd_lookup(fd, &err);
+	if (!sock)
+		return ERR_PTR(-ENOTSOCK);
+
+	if (sock->sk->sk_family != PF_XDP) {
+		sockfd_put(sock);
+		return ERR_PTR(-ENOPROTOOPT);
+	}
+
+	return sock;
+}
+
+static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+{
+	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
+	struct sock *sk = sock->sk;
+	struct net_device *dev, *dev_curr;
+	struct xdp_sock *xs = xdp_sk(sk);
+	struct xdp_umem *old_umem = NULL;
+	int err = 0;
+
+	if (addr_len < sizeof(struct sockaddr_xdp))
+		return -EINVAL;
+	if (sxdp->sxdp_family != AF_XDP)
+		return -EINVAL;
+
+	mutex_lock(&xs->mutex);
+	dev_curr = xs->dev;
+	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
+	if (!dev) {
+		err = -ENODEV;
+		goto out_release;
+	}
+
+	if (!xs->rx) {
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
+	if (sxdp->sxdp_queue_id >= dev->num_rx_queues) {
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
+	if (sxdp->sxdp_flags & XDP_SHARED_UMEM) {
+		struct xdp_sock *umem_xs;
+		struct socket *sock;
+
+		if (xs->umem) {
+			/* We have already our own. */
+			err = -EINVAL;
+			goto out_unlock;
+		}
+
+		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
+		if (IS_ERR(sock)) {
+			err = PTR_ERR(sock);
+			goto out_unlock;
+		}
+
+		umem_xs = xdp_sk(sock->sk);
+		if (!umem_xs->umem) {
+			/* No umem to inherit. */
+			err = -EBADF;
+			sockfd_put(sock);
+			goto out_unlock;
+		} else if (umem_xs->dev != dev ||
+			   umem_xs->queue_id != sxdp->sxdp_queue_id) {
+			err = -EINVAL;
+			sockfd_put(sock);
+			goto out_unlock;
+		}
+
+		xdp_get_umem(umem_xs->umem);
+		old_umem = xs->umem;
+		xs->umem = umem_xs->umem;
+		sockfd_put(sock);
+	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
+	/* Rebind? */
+	if (dev_curr && (dev_curr != dev ||
+			 xs->queue_id != sxdp->sxdp_queue_id)) {
+		__xsk_release(xs);
+		if (old_umem)
+			xdp_put_umem(old_umem);
+	}
+
+	xs->dev = dev;
+	xs->queue_id = sxdp->sxdp_queue_id;
+
+	xskq_set_umem(xs->rx, &xs->umem->props);
+
+out_unlock:
+	if (err)
+		dev_put(dev);
+out_release:
+	mutex_unlock(&xs->mutex);
+	return err;
+}
+
 static int xsk_setsockopt(struct socket *sock, int level, int optname,
 			  char __user *optval, unsigned int optlen)
 {
@@ -203,7 +325,7 @@ static const struct proto_ops xsk_proto_ops = {
 	.family =	PF_XDP,
 	.owner =	THIS_MODULE,
 	.release =	xsk_release,
-	.bind =		sock_no_bind,
+	.bind =		xsk_bind,
 	.connect =	sock_no_connect,
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
index 894f9f89afc7..d012e5e23591 100644
--- a/net/xdp/xsk_queue.c
+++ b/net/xdp/xsk_queue.c
@@ -16,6 +16,14 @@
 
 #include "xsk_queue.h"
 
+void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props)
+{
+	if (!q)
+		return;
+
+	q->umem_props = *umem_props;
+}
+
 static u32 xskq_umem_get_ring_size(struct xsk_queue *q)
 {
 	return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u32);
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 5439fa381763..9ddd2ee07a84 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -32,6 +32,7 @@ struct xsk_queue {
 	u64 invalid_descs;
 };
 
+void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props);
 struct xsk_queue *xskq_create(u32 nentries, bool umem_queue);
 void xskq_destroy(struct xsk_queue *q);
 
-- 
cgit v1.2.3-59-g8ed1b


From c497176cb2e478f0a5713b0e05f242276e3194b5 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Wed, 2 May 2018 13:01:27 +0200
Subject: xsk: add Rx receive functions and poll support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Here the actual receive functions of AF_XDP are implemented, that in a
later commit, will be called from the XDP layers.

There's one set of functions for the XDP_DRV side and another for
XDP_SKB (generic).

A new XDP API, xdp_return_buff, is also introduced.

Adding xdp_return_buff, which is analogous to xdp_return_frame, but
acts upon an struct xdp_buff. The API will be used by AF_XDP in future
commits.

Support for the poll syscall is also implemented.

v2: xskq_validate_id did not update cons_tail.
    The entries variable was calculated twice in xskq_nb_avail.
    Squashed xdp_return_buff commit.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/xdp.h      |   1 +
 include/net/xdp_sock.h |  22 ++++++++++
 net/core/xdp.c         |  15 +++++--
 net/xdp/xdp_umem.h     |  18 ++++++++
 net/xdp/xsk.c          |  73 ++++++++++++++++++++++++++++++-
 net/xdp/xsk_queue.h    | 114 ++++++++++++++++++++++++++++++++++++++++++++++++-
 6 files changed, 238 insertions(+), 5 deletions(-)

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 137ad5f9f40f..0b689cf561c7 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -104,6 +104,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
 }
 
 void xdp_return_frame(struct xdp_frame *xdpf);
+void xdp_return_buff(struct xdp_buff *xdp);
 
 int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
 		     struct net_device *dev, u32 queue_index);
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 85d02512f59b..a0342dff6a4d 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -31,6 +31,28 @@ struct xdp_sock {
 	u16 queue_id;
 	/* Protects multiple processes in the control path */
 	struct mutex mutex;
+	u64 rx_dropped;
 };
 
+struct xdp_buff;
+#ifdef CONFIG_XDP_SOCKETS
+int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
+int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
+void xsk_flush(struct xdp_sock *xs);
+#else
+static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+{
+	return -ENOTSUPP;
+}
+
+static inline int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+{
+	return -ENOTSUPP;
+}
+
+static inline void xsk_flush(struct xdp_sock *xs)
+{
+}
+#endif /* CONFIG_XDP_SOCKETS */
+
 #endif /* _LINUX_XDP_SOCK_H */
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 0c86b53a3a63..bf6758f74339 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -308,11 +308,9 @@ err:
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
 
-void xdp_return_frame(struct xdp_frame *xdpf)
+static void xdp_return(void *data, struct xdp_mem_info *mem)
 {
-	struct xdp_mem_info *mem = &xdpf->mem;
 	struct xdp_mem_allocator *xa;
-	void *data = xdpf->data;
 	struct page *page;
 
 	switch (mem->type) {
@@ -339,4 +337,15 @@ void xdp_return_frame(struct xdp_frame *xdpf)
 		break;
 	}
 }
+
+void xdp_return_frame(struct xdp_frame *xdpf)
+{
+	xdp_return(xdpf->data, &xdpf->mem);
+}
 EXPORT_SYMBOL_GPL(xdp_return_frame);
+
+void xdp_return_buff(struct xdp_buff *xdp)
+{
+	xdp_return(xdp->data, &xdp->rxq->mem);
+}
+EXPORT_SYMBOL_GPL(xdp_return_buff);
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index b13133e9c501..c7378a11721f 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -39,6 +39,24 @@ struct xdp_umem {
 	struct work_struct work;
 };
 
+static inline char *xdp_umem_get_data(struct xdp_umem *umem, u32 idx)
+{
+	u64 pg, off;
+	char *data;
+
+	pg = idx >> umem->nfpplog2;
+	off = (idx & umem->nfpp_mask) << umem->frame_size_log2;
+
+	data = page_address(umem->pgs[pg]);
+	return data + off;
+}
+
+static inline char *xdp_umem_get_data_with_headroom(struct xdp_umem *umem,
+						    u32 idx)
+{
+	return xdp_umem_get_data(umem, idx) + umem->frame_headroom;
+}
+
 bool xdp_umem_validate_queues(struct xdp_umem *umem);
 int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr);
 void xdp_get_umem(struct xdp_umem *umem);
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index bf2c97b87992..4e1e6c581e1d 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -41,6 +41,74 @@ static struct xdp_sock *xdp_sk(struct sock *sk)
 	return (struct xdp_sock *)sk;
 }
 
+static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+{
+	u32 *id, len = xdp->data_end - xdp->data;
+	void *buffer;
+	int err = 0;
+
+	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
+		return -EINVAL;
+
+	id = xskq_peek_id(xs->umem->fq);
+	if (!id)
+		return -ENOSPC;
+
+	buffer = xdp_umem_get_data_with_headroom(xs->umem, *id);
+	memcpy(buffer, xdp->data, len);
+	err = xskq_produce_batch_desc(xs->rx, *id, len,
+				      xs->umem->frame_headroom);
+	if (!err)
+		xskq_discard_id(xs->umem->fq);
+
+	return err;
+}
+
+int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+{
+	int err;
+
+	err = __xsk_rcv(xs, xdp);
+	if (likely(!err))
+		xdp_return_buff(xdp);
+	else
+		xs->rx_dropped++;
+
+	return err;
+}
+
+void xsk_flush(struct xdp_sock *xs)
+{
+	xskq_produce_flush_desc(xs->rx);
+	xs->sk.sk_data_ready(&xs->sk);
+}
+
+int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+{
+	int err;
+
+	err = __xsk_rcv(xs, xdp);
+	if (!err)
+		xsk_flush(xs);
+	else
+		xs->rx_dropped++;
+
+	return err;
+}
+
+static unsigned int xsk_poll(struct file *file, struct socket *sock,
+			     struct poll_table_struct *wait)
+{
+	unsigned int mask = datagram_poll(file, sock, wait);
+	struct sock *sk = sock->sk;
+	struct xdp_sock *xs = xdp_sk(sk);
+
+	if (xs->rx && !xskq_empty_desc(xs->rx))
+		mask |= POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
 static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
 			  bool umem_queue)
 {
@@ -179,6 +247,9 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
 		err = -EINVAL;
 		goto out_unlock;
+	} else {
+		/* This xsk has its own umem. */
+		xskq_set_umem(xs->umem->fq, &xs->umem->props);
 	}
 
 	/* Rebind? */
@@ -330,7 +401,7 @@ static const struct proto_ops xsk_proto_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	sock_no_getname,
-	.poll =		sock_no_poll,
+	.poll =		xsk_poll,
 	.ioctl =	sock_no_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	sock_no_shutdown,
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 9ddd2ee07a84..0a9b92b4f93a 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -20,6 +20,8 @@
 
 #include "xdp_umem_props.h"
 
+#define RX_BATCH_SIZE 16
+
 struct xsk_queue {
 	struct xdp_umem_props umem_props;
 	u32 ring_mask;
@@ -32,8 +34,118 @@ struct xsk_queue {
 	u64 invalid_descs;
 };
 
+/* Common functions operating for both RXTX and umem queues */
+
+static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt)
+{
+	u32 entries = q->prod_tail - q->cons_tail;
+
+	if (entries == 0) {
+		/* Refresh the local pointer */
+		q->prod_tail = READ_ONCE(q->ring->producer);
+		entries = q->prod_tail - q->cons_tail;
+	}
+
+	return (entries > dcnt) ? dcnt : entries;
+}
+
+static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt)
+{
+	u32 free_entries = q->nentries - (producer - q->cons_tail);
+
+	if (free_entries >= dcnt)
+		return free_entries;
+
+	/* Refresh the local tail pointer */
+	q->cons_tail = READ_ONCE(q->ring->consumer);
+	return q->nentries - (producer - q->cons_tail);
+}
+
+/* UMEM queue */
+
+static inline bool xskq_is_valid_id(struct xsk_queue *q, u32 idx)
+{
+	if (unlikely(idx >= q->umem_props.nframes)) {
+		q->invalid_descs++;
+		return false;
+	}
+	return true;
+}
+
+static inline u32 *xskq_validate_id(struct xsk_queue *q)
+{
+	while (q->cons_tail != q->cons_head) {
+		struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+		unsigned int idx = q->cons_tail & q->ring_mask;
+
+		if (xskq_is_valid_id(q, ring->desc[idx]))
+			return &ring->desc[idx];
+
+		q->cons_tail++;
+	}
+
+	return NULL;
+}
+
+static inline u32 *xskq_peek_id(struct xsk_queue *q)
+{
+	struct xdp_umem_ring *ring;
+
+	if (q->cons_tail == q->cons_head) {
+		WRITE_ONCE(q->ring->consumer, q->cons_tail);
+		q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE);
+
+		/* Order consumer and data */
+		smp_rmb();
+
+		return xskq_validate_id(q);
+	}
+
+	ring = (struct xdp_umem_ring *)q->ring;
+	return &ring->desc[q->cons_tail & q->ring_mask];
+}
+
+static inline void xskq_discard_id(struct xsk_queue *q)
+{
+	q->cons_tail++;
+	(void)xskq_validate_id(q);
+}
+
+/* Rx queue */
+
+static inline int xskq_produce_batch_desc(struct xsk_queue *q,
+					  u32 id, u32 len, u16 offset)
+{
+	struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
+	unsigned int idx;
+
+	if (xskq_nb_free(q, q->prod_head, 1) == 0)
+		return -ENOSPC;
+
+	idx = (q->prod_head++) & q->ring_mask;
+	ring->desc[idx].idx = id;
+	ring->desc[idx].len = len;
+	ring->desc[idx].offset = offset;
+
+	return 0;
+}
+
+static inline void xskq_produce_flush_desc(struct xsk_queue *q)
+{
+	/* Order producer and data */
+	smp_wmb();
+
+	q->prod_tail = q->prod_head,
+	WRITE_ONCE(q->ring->producer, q->prod_tail);
+}
+
+static inline bool xskq_empty_desc(struct xsk_queue *q)
+{
+	return (xskq_nb_free(q, q->prod_tail, 1) == q->nentries);
+}
+
 void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props);
 struct xsk_queue *xskq_create(u32 nentries, bool umem_queue);
-void xskq_destroy(struct xsk_queue *q);
+void xskq_destroy(struct xsk_queue *q_ops);
 
 #endif /* _LINUX_XSK_QUEUE_H */
-- 
cgit v1.2.3-59-g8ed1b


From fbfc504a24f53f7ebe128ab55cb5dba634f4ece8 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Wed, 2 May 2018 13:01:28 +0200
Subject: bpf: introduce new bpf AF_XDP map type BPF_MAP_TYPE_XSKMAP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The xskmap is yet another BPF map, very much inspired by
dev/cpu/sockmap, and is a holder of AF_XDP sockets. A user application
adds AF_XDP sockets into the map, and by using the bpf_redirect_map
helper, an XDP program can redirect XDP frames to an AF_XDP socket.

Note that a socket that is bound to certain ifindex/queue index will
*only* accept XDP frames from that netdev/queue index. If an XDP
program tries to redirect from a netdev/queue index other than what
the socket is bound to, the frame will not be received on the socket.

A socket can reside in multiple maps.

v3: Fixed race and simplified code.
v2: Removed one indirection in map lookup.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h       |  25 +++++
 include/linux/bpf_types.h |   3 +
 include/net/xdp_sock.h    |   7 ++
 include/uapi/linux/bpf.h  |   1 +
 kernel/bpf/Makefile       |   3 +
 kernel/bpf/verifier.c     |   8 +-
 kernel/bpf/xskmap.c       | 239 ++++++++++++++++++++++++++++++++++++++++++++++
 net/xdp/xsk.c             |   5 +
 8 files changed, 289 insertions(+), 2 deletions(-)
 create mode 100644 kernel/bpf/xskmap.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c553f6f9c6b0..68ecdb4eea09 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -676,6 +676,31 @@ static inline int sock_map_prog(struct bpf_map *map,
 }
 #endif
 
+#if defined(CONFIG_XDP_SOCKETS)
+struct xdp_sock;
+struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key);
+int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
+		       struct xdp_sock *xs);
+void __xsk_map_flush(struct bpf_map *map);
+#else
+struct xdp_sock;
+static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map,
+						     u32 key)
+{
+	return NULL;
+}
+
+static inline int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
+				     struct xdp_sock *xs)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void __xsk_map_flush(struct bpf_map *map)
+{
+}
+#endif
+
 /* verifier prototypes for helper functions called from eBPF programs */
 extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
 extern const struct bpf_func_proto bpf_map_update_elem_proto;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 2b28fcf6f6ae..d7df1b323082 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -49,4 +49,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
+#if defined(CONFIG_XDP_SOCKETS)
+BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
+#endif
 #endif
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index a0342dff6a4d..ce3a2ab16b8f 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -28,6 +28,7 @@ struct xdp_sock {
 	struct xsk_queue *rx;
 	struct net_device *dev;
 	struct xdp_umem *umem;
+	struct list_head flush_node;
 	u16 queue_id;
 	/* Protects multiple processes in the control path */
 	struct mutex mutex;
@@ -39,6 +40,7 @@ struct xdp_buff;
 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 void xsk_flush(struct xdp_sock *xs);
+bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs);
 #else
 static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
@@ -53,6 +55,11 @@ static inline int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 static inline void xsk_flush(struct xdp_sock *xs)
 {
 }
+
+static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
+{
+	return false;
+}
 #endif /* CONFIG_XDP_SOCKETS */
 
 #endif /* _LINUX_XDP_SOCK_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 8daef7326bb7..a3a495052511 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -116,6 +116,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_DEVMAP,
 	BPF_MAP_TYPE_SOCKMAP,
 	BPF_MAP_TYPE_CPUMAP,
+	BPF_MAP_TYPE_XSKMAP,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 35c485fa9ea3..f27f5496d6fe 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -8,6 +8,9 @@ obj-$(CONFIG_BPF_SYSCALL) += btf.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
+ifeq ($(CONFIG_XDP_SOCKETS),y)
+obj-$(CONFIG_BPF_SYSCALL) += xskmap.o
+endif
 obj-$(CONFIG_BPF_SYSCALL) += offload.o
 ifeq ($(CONFIG_STREAM_PARSER),y)
 ifeq ($(CONFIG_INET),y)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 712d8655e916..0d91f18b2eb5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2070,8 +2070,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		if (func_id != BPF_FUNC_redirect_map)
 			goto error;
 		break;
-	/* Restrict bpf side of cpumap, open when use-cases appear */
+	/* Restrict bpf side of cpumap and xskmap, open when use-cases
+	 * appear.
+	 */
 	case BPF_MAP_TYPE_CPUMAP:
+	case BPF_MAP_TYPE_XSKMAP:
 		if (func_id != BPF_FUNC_redirect_map)
 			goto error;
 		break;
@@ -2118,7 +2121,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		break;
 	case BPF_FUNC_redirect_map:
 		if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
-		    map->map_type != BPF_MAP_TYPE_CPUMAP)
+		    map->map_type != BPF_MAP_TYPE_CPUMAP &&
+		    map->map_type != BPF_MAP_TYPE_XSKMAP)
 			goto error;
 		break;
 	case BPF_FUNC_sk_redirect_map:
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
new file mode 100644
index 000000000000..869dbb11b612
--- /dev/null
+++ b/kernel/bpf/xskmap.c
@@ -0,0 +1,239 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XSKMAP used for AF_XDP sockets
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/bpf.h>
+#include <linux/capability.h>
+#include <net/xdp_sock.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+struct xsk_map {
+	struct bpf_map map;
+	struct xdp_sock **xsk_map;
+	struct list_head __percpu *flush_list;
+};
+
+static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
+{
+	int cpu, err = -EINVAL;
+	struct xsk_map *m;
+	u64 cost;
+
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    attr->value_size != 4 ||
+	    attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
+		return ERR_PTR(-EINVAL);
+
+	m = kzalloc(sizeof(*m), GFP_USER);
+	if (!m)
+		return ERR_PTR(-ENOMEM);
+
+	bpf_map_init_from_attr(&m->map, attr);
+
+	cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);
+	cost += sizeof(struct list_head) * num_possible_cpus();
+	if (cost >= U32_MAX - PAGE_SIZE)
+		goto free_m;
+
+	m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+	/* Notice returns -EPERM on if map size is larger than memlock limit */
+	err = bpf_map_precharge_memlock(m->map.pages);
+	if (err)
+		goto free_m;
+
+	m->flush_list = alloc_percpu(struct list_head);
+	if (!m->flush_list)
+		goto free_m;
+
+	for_each_possible_cpu(cpu)
+		INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));
+
+	m->xsk_map = bpf_map_area_alloc(m->map.max_entries *
+					sizeof(struct xdp_sock *),
+					m->map.numa_node);
+	if (!m->xsk_map)
+		goto free_percpu;
+	return &m->map;
+
+free_percpu:
+	free_percpu(m->flush_list);
+free_m:
+	kfree(m);
+	return ERR_PTR(err);
+}
+
+static void xsk_map_free(struct bpf_map *map)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	int i;
+
+	synchronize_net();
+
+	for (i = 0; i < map->max_entries; i++) {
+		struct xdp_sock *xs;
+
+		xs = m->xsk_map[i];
+		if (!xs)
+			continue;
+
+		sock_put((struct sock *)xs);
+	}
+
+	free_percpu(m->flush_list);
+	bpf_map_area_free(m->xsk_map);
+	kfree(m);
+}
+
+static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	u32 index = key ? *(u32 *)key : U32_MAX;
+	u32 *next = next_key;
+
+	if (index >= m->map.max_entries) {
+		*next = 0;
+		return 0;
+	}
+
+	if (index == m->map.max_entries - 1)
+		return -ENOENT;
+	*next = index + 1;
+	return 0;
+}
+
+struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	struct xdp_sock *xs;
+
+	if (key >= map->max_entries)
+		return NULL;
+
+	xs = READ_ONCE(m->xsk_map[key]);
+	return xs;
+}
+
+int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
+		       struct xdp_sock *xs)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	struct list_head *flush_list = this_cpu_ptr(m->flush_list);
+	int err;
+
+	err = xsk_rcv(xs, xdp);
+	if (err)
+		return err;
+
+	if (!xs->flush_node.prev)
+		list_add(&xs->flush_node, flush_list);
+
+	return 0;
+}
+
+void __xsk_map_flush(struct bpf_map *map)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	struct list_head *flush_list = this_cpu_ptr(m->flush_list);
+	struct xdp_sock *xs, *tmp;
+
+	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
+		xsk_flush(xs);
+		__list_del(xs->flush_node.prev, xs->flush_node.next);
+		xs->flush_node.prev = NULL;
+	}
+}
+
+static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	return NULL;
+}
+
+static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
+			       u64 map_flags)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	u32 i = *(u32 *)key, fd = *(u32 *)value;
+	struct xdp_sock *xs, *old_xs;
+	struct socket *sock;
+	int err;
+
+	if (unlikely(map_flags > BPF_EXIST))
+		return -EINVAL;
+	if (unlikely(i >= m->map.max_entries))
+		return -E2BIG;
+	if (unlikely(map_flags == BPF_NOEXIST))
+		return -EEXIST;
+
+	sock = sockfd_lookup(fd, &err);
+	if (!sock)
+		return err;
+
+	if (sock->sk->sk_family != PF_XDP) {
+		sockfd_put(sock);
+		return -EOPNOTSUPP;
+	}
+
+	xs = (struct xdp_sock *)sock->sk;
+
+	if (!xsk_is_setup_for_bpf_map(xs)) {
+		sockfd_put(sock);
+		return -EOPNOTSUPP;
+	}
+
+	sock_hold(sock->sk);
+
+	old_xs = xchg(&m->xsk_map[i], xs);
+	if (old_xs) {
+		/* Make sure we've flushed everything. */
+		synchronize_net();
+		sock_put((struct sock *)old_xs);
+	}
+
+	sockfd_put(sock);
+	return 0;
+}
+
+static int xsk_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	struct xdp_sock *old_xs;
+	int k = *(u32 *)key;
+
+	if (k >= map->max_entries)
+		return -EINVAL;
+
+	old_xs = xchg(&m->xsk_map[k], NULL);
+	if (old_xs) {
+		/* Make sure we've flushed everything. */
+		synchronize_net();
+		sock_put((struct sock *)old_xs);
+	}
+
+	return 0;
+}
+
+const struct bpf_map_ops xsk_map_ops = {
+	.map_alloc = xsk_map_alloc,
+	.map_free = xsk_map_free,
+	.map_get_next_key = xsk_map_get_next_key,
+	.map_lookup_elem = xsk_map_lookup_elem,
+	.map_update_elem = xsk_map_update_elem,
+	.map_delete_elem = xsk_map_delete_elem,
+};
+
+
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 4e1e6c581e1d..b931a0db5588 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -41,6 +41,11 @@ static struct xdp_sock *xdp_sk(struct sock *sk)
 	return (struct xdp_sock *)sk;
 }
 
+bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
+{
+	return !!xs->rx;
+}
+
 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
 	u32 *id, len = xdp->data_end - xdp->data;
-- 
cgit v1.2.3-59-g8ed1b


From 1b1a251c83d29ec18b4a2191cce5c5b7117529ff Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Wed, 2 May 2018 13:01:29 +0200
Subject: xsk: wire up XDP_DRV side of AF_XDP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit wires up the xskmap to XDP_DRV layer.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index d3781daa26ab..40d4bbb4508d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2801,7 +2801,8 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
 {
 	int err;
 
-	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+	switch (map->map_type) {
+	case BPF_MAP_TYPE_DEVMAP: {
 		struct net_device *dev = fwd;
 		struct xdp_frame *xdpf;
 
@@ -2819,14 +2820,25 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
 		if (err)
 			return err;
 		__dev_map_insert_ctx(map, index);
-
-	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+		break;
+	}
+	case BPF_MAP_TYPE_CPUMAP: {
 		struct bpf_cpu_map_entry *rcpu = fwd;
 
 		err = cpu_map_enqueue(rcpu, xdp, dev_rx);
 		if (err)
 			return err;
 		__cpu_map_insert_ctx(map, index);
+		break;
+	}
+	case BPF_MAP_TYPE_XSKMAP: {
+		struct xdp_sock *xs = fwd;
+
+		err = __xsk_map_redirect(map, xdp, xs);
+		return err;
+	}
+	default:
+		break;
 	}
 	return 0;
 }
@@ -2845,6 +2857,9 @@ void xdp_do_flush_map(void)
 		case BPF_MAP_TYPE_CPUMAP:
 			__cpu_map_flush(map);
 			break;
+		case BPF_MAP_TYPE_XSKMAP:
+			__xsk_map_flush(map);
+			break;
 		default:
 			break;
 		}
@@ -2859,6 +2874,8 @@ static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
 		return __dev_map_lookup_elem(map, index);
 	case BPF_MAP_TYPE_CPUMAP:
 		return __cpu_map_lookup_elem(map, index);
+	case BPF_MAP_TYPE_XSKMAP:
+		return __xsk_map_lookup_elem(map, index);
 	default:
 		return NULL;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 02671e23e7b383763fe1ae4f20b56d8029f9dfc6 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Wed, 2 May 2018 13:01:30 +0200
Subject: xsk: wire up XDP_SKB side of AF_XDP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit wires up the xskmap to XDP_SKB layer.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h |  2 +-
 net/core/dev.c         | 35 +++++++++++++++++++----------------
 net/core/filter.c      | 17 ++++++++++++++---
 3 files changed, 34 insertions(+), 20 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 64899c04c1a6..b7f81e3a70cb 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -760,7 +760,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
  * This does not appear to be a real limitation for existing software.
  */
 int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
-			    struct bpf_prog *prog);
+			    struct xdp_buff *xdp, struct bpf_prog *prog);
 int xdp_do_redirect(struct net_device *dev,
 		    struct xdp_buff *xdp,
 		    struct bpf_prog *prog);
diff --git a/net/core/dev.c b/net/core/dev.c
index 8f8931b93140..aea36b5a2fed 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3994,12 +3994,12 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
 }
 
 static u32 netif_receive_generic_xdp(struct sk_buff *skb,
+				     struct xdp_buff *xdp,
 				     struct bpf_prog *xdp_prog)
 {
 	struct netdev_rx_queue *rxqueue;
 	void *orig_data, *orig_data_end;
 	u32 metalen, act = XDP_DROP;
-	struct xdp_buff xdp;
 	int hlen, off;
 	u32 mac_len;
 
@@ -4034,19 +4034,19 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 	 */
 	mac_len = skb->data - skb_mac_header(skb);
 	hlen = skb_headlen(skb) + mac_len;
-	xdp.data = skb->data - mac_len;
-	xdp.data_meta = xdp.data;
-	xdp.data_end = xdp.data + hlen;
-	xdp.data_hard_start = skb->data - skb_headroom(skb);
-	orig_data_end = xdp.data_end;
-	orig_data = xdp.data;
+	xdp->data = skb->data - mac_len;
+	xdp->data_meta = xdp->data;
+	xdp->data_end = xdp->data + hlen;
+	xdp->data_hard_start = skb->data - skb_headroom(skb);
+	orig_data_end = xdp->data_end;
+	orig_data = xdp->data;
 
 	rxqueue = netif_get_rxqueue(skb);
-	xdp.rxq = &rxqueue->xdp_rxq;
+	xdp->rxq = &rxqueue->xdp_rxq;
 
-	act = bpf_prog_run_xdp(xdp_prog, &xdp);
+	act = bpf_prog_run_xdp(xdp_prog, xdp);
 
-	off = xdp.data - orig_data;
+	off = xdp->data - orig_data;
 	if (off > 0)
 		__skb_pull(skb, off);
 	else if (off < 0)
@@ -4056,10 +4056,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 	/* check if bpf_xdp_adjust_tail was used. it can only "shrink"
 	 * pckt.
 	 */
-	off = orig_data_end - xdp.data_end;
+	off = orig_data_end - xdp->data_end;
 	if (off != 0) {
-		skb_set_tail_pointer(skb, xdp.data_end - xdp.data);
+		skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
 		skb->len -= off;
+
 	}
 
 	switch (act) {
@@ -4068,7 +4069,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 		__skb_push(skb, mac_len);
 		break;
 	case XDP_PASS:
-		metalen = xdp.data - xdp.data_meta;
+		metalen = xdp->data - xdp->data_meta;
 		if (metalen)
 			skb_metadata_set(skb, metalen);
 		break;
@@ -4118,17 +4119,19 @@ static struct static_key generic_xdp_needed __read_mostly;
 int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
 {
 	if (xdp_prog) {
-		u32 act = netif_receive_generic_xdp(skb, xdp_prog);
+		struct xdp_buff xdp;
+		u32 act;
 		int err;
 
+		act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
 		if (act != XDP_PASS) {
 			switch (act) {
 			case XDP_REDIRECT:
 				err = xdp_do_generic_redirect(skb->dev, skb,
-							      xdp_prog);
+							      &xdp, xdp_prog);
 				if (err)
 					goto out_redir;
-			/* fallthru to submit skb */
+				break;
 			case XDP_TX:
 				generic_xdp_tx(skb, xdp_prog);
 				break;
diff --git a/net/core/filter.c b/net/core/filter.c
index 40d4bbb4508d..120bc8a202d9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -59,6 +59,7 @@
 #include <net/tcp.h>
 #include <net/xfrm.h>
 #include <linux/bpf_trace.h>
+#include <net/xdp_sock.h>
 
 /**
  *	sk_filter_trim_cap - run a packet through a socket filter
@@ -2973,13 +2974,14 @@ static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd)
 
 static int xdp_do_generic_redirect_map(struct net_device *dev,
 				       struct sk_buff *skb,
+				       struct xdp_buff *xdp,
 				       struct bpf_prog *xdp_prog)
 {
 	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
 	unsigned long map_owner = ri->map_owner;
 	struct bpf_map *map = ri->map;
-	struct net_device *fwd = NULL;
 	u32 index = ri->ifindex;
+	void *fwd = NULL;
 	int err = 0;
 
 	ri->ifindex = 0;
@@ -3001,6 +3003,14 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
 		if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd))))
 			goto err;
 		skb->dev = fwd;
+		generic_xdp_tx(skb, xdp_prog);
+	} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
+		struct xdp_sock *xs = fwd;
+
+		err = xsk_generic_rcv(xs, xdp);
+		if (err)
+			goto err;
+		consume_skb(skb);
 	} else {
 		/* TODO: Handle BPF_MAP_TYPE_CPUMAP */
 		err = -EBADRQC;
@@ -3015,7 +3025,7 @@ err:
 }
 
 int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
-			    struct bpf_prog *xdp_prog)
+			    struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
 {
 	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
 	u32 index = ri->ifindex;
@@ -3023,7 +3033,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
 	int err = 0;
 
 	if (ri->map)
-		return xdp_do_generic_redirect_map(dev, skb, xdp_prog);
+		return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog);
 
 	ri->ifindex = 0;
 	fwd = dev_get_by_index_rcu(dev_net(dev), index);
@@ -3037,6 +3047,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
 
 	skb->dev = fwd;
 	_trace_xdp_redirect(dev, xdp_prog, index);
+	generic_xdp_tx(skb, xdp_prog);
 	return 0;
 err:
 	_trace_xdp_redirect_err(dev, xdp_prog, index, err);
-- 
cgit v1.2.3-59-g8ed1b


From fe2308328cd2f26ebc986f543796e7d13ae00bc4 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 2 May 2018 13:01:31 +0200
Subject: xsk: add umem completion queue support and mmap

Here, we add another setsockopt for registered user memory (umem)
called XDP_UMEM_COMPLETION_QUEUE. Using this socket option, the
process can ask the kernel to allocate a queue (ring buffer) and also
mmap it (XDP_UMEM_PGOFF_COMPLETION_QUEUE) into the process.

The queue is used to explicitly pass ownership of umem frames from the
kernel to user process. This will be used by the TX path to tell user
space that a certain frame has been transmitted and user space can use
it for something else, if it wishes.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/if_xdp.h | 2 ++
 net/xdp/xdp_umem.c          | 7 ++++++-
 net/xdp/xdp_umem.h          | 1 +
 net/xdp/xsk.c               | 7 ++++++-
 4 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index e5091881f776..71581a139f26 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -36,6 +36,7 @@ struct sockaddr_xdp {
 #define XDP_RX_RING			1
 #define XDP_UMEM_REG			3
 #define XDP_UMEM_FILL_RING		4
+#define XDP_UMEM_COMPLETION_RING	5
 
 struct xdp_umem_reg {
 	__u64 addr; /* Start of packet data area */
@@ -47,6 +48,7 @@ struct xdp_umem_reg {
 /* Pgoff for mmaping the rings */
 #define XDP_PGOFF_RX_RING			  0
 #define XDP_UMEM_PGOFF_FILL_RING	0x100000000
+#define XDP_UMEM_PGOFF_COMPLETION_RING	0x180000000
 
 struct xdp_desc {
 	__u32 idx;
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 9bac1ad570fa..881dfdefe235 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -70,6 +70,11 @@ static void xdp_umem_release(struct xdp_umem *umem)
 		umem->fq = NULL;
 	}
 
+	if (umem->cq) {
+		xskq_destroy(umem->cq);
+		umem->cq = NULL;
+	}
+
 	if (umem->pgs) {
 		xdp_umem_unpin_pages(umem);
 
@@ -251,5 +256,5 @@ out:
 
 bool xdp_umem_validate_queues(struct xdp_umem *umem)
 {
-	return umem->fq;
+	return (umem->fq && umem->cq);
 }
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index c7378a11721f..7e0b2fab8522 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -24,6 +24,7 @@
 
 struct xdp_umem {
 	struct xsk_queue *fq;
+	struct xsk_queue *cq;
 	struct page **pgs;
 	struct xdp_umem_props props;
 	u32 npgs;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index b931a0db5588..f4a2c5bc6da9 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -255,6 +255,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 	} else {
 		/* This xsk has its own umem. */
 		xskq_set_umem(xs->umem->fq, &xs->umem->props);
+		xskq_set_umem(xs->umem->cq, &xs->umem->props);
 	}
 
 	/* Rebind? */
@@ -334,6 +335,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 		return 0;
 	}
 	case XDP_UMEM_FILL_RING:
+	case XDP_UMEM_COMPLETION_RING:
 	{
 		struct xsk_queue **q;
 		int entries;
@@ -345,7 +347,8 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 			return -EFAULT;
 
 		mutex_lock(&xs->mutex);
-		q = &xs->umem->fq;
+		q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
+			&xs->umem->cq;
 		err = xsk_init_queue(entries, q, true);
 		mutex_unlock(&xs->mutex);
 		return err;
@@ -375,6 +378,8 @@ static int xsk_mmap(struct file *file, struct socket *sock,
 
 		if (offset == XDP_UMEM_PGOFF_FILL_RING)
 			q = xs->umem->fq;
+		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
+			q = xs->umem->cq;
 		else
 			return -EINVAL;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From f61459030ec7fffdaa3c462cc0f728eef11b4d05 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 2 May 2018 13:01:32 +0200
Subject: xsk: add Tx queue setup and mmap support

Another setsockopt (XDP_TX_QUEUE) is added to let the process allocate
a queue, where the user process can pass frames to be transmitted by
the kernel.

The mmapping of the queue is done using the XDP_PGOFF_TX_QUEUE offset.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/xdp_sock.h      | 1 +
 include/uapi/linux/if_xdp.h | 2 ++
 net/xdp/xsk.c               | 8 ++++++--
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index ce3a2ab16b8f..185f4928fbda 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -30,6 +30,7 @@ struct xdp_sock {
 	struct xdp_umem *umem;
 	struct list_head flush_node;
 	u16 queue_id;
+	struct xsk_queue *tx ____cacheline_aligned_in_smp;
 	/* Protects multiple processes in the control path */
 	struct mutex mutex;
 	u64 rx_dropped;
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 71581a139f26..e2ea878d025c 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -34,6 +34,7 @@ struct sockaddr_xdp {
 
 /* XDP socket options */
 #define XDP_RX_RING			1
+#define XDP_TX_RING			2
 #define XDP_UMEM_REG			3
 #define XDP_UMEM_FILL_RING		4
 #define XDP_UMEM_COMPLETION_RING	5
@@ -47,6 +48,7 @@ struct xdp_umem_reg {
 
 /* Pgoff for mmaping the rings */
 #define XDP_PGOFF_RX_RING			  0
+#define XDP_PGOFF_TX_RING		 0x80000000
 #define XDP_UMEM_PGOFF_FILL_RING	0x100000000
 #define XDP_UMEM_PGOFF_COMPLETION_RING	0x180000000
 
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index f4a2c5bc6da9..2d7b0c90d996 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -206,7 +206,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 		goto out_release;
 	}
 
-	if (!xs->rx) {
+	if (!xs->rx && !xs->tx) {
 		err = -EINVAL;
 		goto out_unlock;
 	}
@@ -291,6 +291,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 
 	switch (optname) {
 	case XDP_RX_RING:
+	case XDP_TX_RING:
 	{
 		struct xsk_queue **q;
 		int entries;
@@ -301,7 +302,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 			return -EFAULT;
 
 		mutex_lock(&xs->mutex);
-		q = &xs->rx;
+		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
 		err = xsk_init_queue(entries, q, false);
 		mutex_unlock(&xs->mutex);
 		return err;
@@ -372,6 +373,8 @@ static int xsk_mmap(struct file *file, struct socket *sock,
 
 	if (offset == XDP_PGOFF_RX_RING) {
 		q = xs->rx;
+	} else if (offset == XDP_PGOFF_TX_RING) {
+		q = xs->tx;
 	} else {
 		if (!xs->umem)
 			return -EINVAL;
@@ -431,6 +434,7 @@ static void xsk_destruct(struct sock *sk)
 		return;
 
 	xskq_destroy(xs->rx);
+	xskq_destroy(xs->tx);
 	xdp_put_umem(xs->umem);
 
 	sk_refcnt_debug_dec(sk);
-- 
cgit v1.2.3-59-g8ed1b


From 865b03f21162e4edfda51fc08693c864b1d4fdaf Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 2 May 2018 13:01:33 +0200
Subject: dev: packet: make packet_direct_xmit a common function

The new dev_direct_xmit will be used by AF_XDP in later commits.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/netdevice.h |  1 +
 net/core/dev.c            | 38 ++++++++++++++++++++++++++++++++++++++
 net/packet/af_packet.c    | 42 +++++-------------------------------------
 3 files changed, 44 insertions(+), 37 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 366c32891158..a30435118530 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2486,6 +2486,7 @@ void dev_disable_lro(struct net_device *dev);
 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb);
 int dev_queue_xmit(struct sk_buff *skb);
 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv);
+int dev_direct_xmit(struct sk_buff *skb, u16 queue_id);
 int register_netdevice(struct net_device *dev);
 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head);
 void unregister_netdevice_many(struct list_head *head);
diff --git a/net/core/dev.c b/net/core/dev.c
index aea36b5a2fed..d3fdc86516e8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3625,6 +3625,44 @@ int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
 }
 EXPORT_SYMBOL(dev_queue_xmit_accel);
 
+int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
+{
+	struct net_device *dev = skb->dev;
+	struct sk_buff *orig_skb = skb;
+	struct netdev_queue *txq;
+	int ret = NETDEV_TX_BUSY;
+	bool again = false;
+
+	if (unlikely(!netif_running(dev) ||
+		     !netif_carrier_ok(dev)))
+		goto drop;
+
+	skb = validate_xmit_skb_list(skb, dev, &again);
+	if (skb != orig_skb)
+		goto drop;
+
+	skb_set_queue_mapping(skb, queue_id);
+	txq = skb_get_tx_queue(dev, skb);
+
+	local_bh_disable();
+
+	HARD_TX_LOCK(dev, txq, smp_processor_id());
+	if (!netif_xmit_frozen_or_drv_stopped(txq))
+		ret = netdev_start_xmit(skb, dev, txq, false);
+	HARD_TX_UNLOCK(dev, txq);
+
+	local_bh_enable();
+
+	if (!dev_xmit_complete(ret))
+		kfree_skb(skb);
+
+	return ret;
+drop:
+	atomic_long_inc(&dev->tx_dropped);
+	kfree_skb_list(skb);
+	return NET_XMIT_DROP;
+}
+EXPORT_SYMBOL(dev_direct_xmit);
 
 /*************************************************************************
  *			Receiver routines
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 01f3515cada0..611a26d5235c 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -209,7 +209,7 @@ static void prb_clear_rxhash(struct tpacket_kbdq_core *,
 static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
 		struct tpacket3_hdr *);
 static void packet_flush_mclist(struct sock *sk);
-static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb);
+static u16 packet_pick_tx_queue(struct sk_buff *skb);
 
 struct packet_skb_cb {
 	union {
@@ -243,40 +243,7 @@ static void __fanout_link(struct sock *sk, struct packet_sock *po);
 
 static int packet_direct_xmit(struct sk_buff *skb)
 {
-	struct net_device *dev = skb->dev;
-	struct sk_buff *orig_skb = skb;
-	struct netdev_queue *txq;
-	int ret = NETDEV_TX_BUSY;
-	bool again = false;
-
-	if (unlikely(!netif_running(dev) ||
-		     !netif_carrier_ok(dev)))
-		goto drop;
-
-	skb = validate_xmit_skb_list(skb, dev, &again);
-	if (skb != orig_skb)
-		goto drop;
-
-	packet_pick_tx_queue(dev, skb);
-	txq = skb_get_tx_queue(dev, skb);
-
-	local_bh_disable();
-
-	HARD_TX_LOCK(dev, txq, smp_processor_id());
-	if (!netif_xmit_frozen_or_drv_stopped(txq))
-		ret = netdev_start_xmit(skb, dev, txq, false);
-	HARD_TX_UNLOCK(dev, txq);
-
-	local_bh_enable();
-
-	if (!dev_xmit_complete(ret))
-		kfree_skb(skb);
-
-	return ret;
-drop:
-	atomic_long_inc(&dev->tx_dropped);
-	kfree_skb_list(skb);
-	return NET_XMIT_DROP;
+	return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
 }
 
 static struct net_device *packet_cached_dev_get(struct packet_sock *po)
@@ -313,8 +280,9 @@ static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
 	return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
 }
 
-static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
+static u16 packet_pick_tx_queue(struct sk_buff *skb)
 {
+	struct net_device *dev = skb->dev;
 	const struct net_device_ops *ops = dev->netdev_ops;
 	u16 queue_index;
 
@@ -326,7 +294,7 @@ static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
 		queue_index = __packet_pick_tx_queue(dev, skb);
 	}
 
-	skb_set_queue_mapping(skb, queue_index);
+	return queue_index;
 }
 
 /* __register_prot_hook must be invoked through register_prot_hook
-- 
cgit v1.2.3-59-g8ed1b


From 35fcde7f8deb51b707b161bf19cbd22363aef2df Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 2 May 2018 13:01:34 +0200
Subject: xsk: support for Tx

Here, Tx support is added. The user fills the Tx queue with frames to
be sent by the kernel, and let's the kernel know using the sendmsg
syscall.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/xdp/xsk.c       | 111 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 net/xdp/xsk_queue.h |  93 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 200 insertions(+), 4 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 2d7b0c90d996..b33c535c7996 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -36,6 +36,8 @@
 #include "xsk_queue.h"
 #include "xdp_umem.h"
 
+#define TX_BATCH_SIZE 16
+
 static struct xdp_sock *xdp_sk(struct sock *sk)
 {
 	return (struct xdp_sock *)sk;
@@ -101,6 +103,108 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 	return err;
 }
 
+static void xsk_destruct_skb(struct sk_buff *skb)
+{
+	u32 id = (u32)(long)skb_shinfo(skb)->destructor_arg;
+	struct xdp_sock *xs = xdp_sk(skb->sk);
+
+	WARN_ON_ONCE(xskq_produce_id(xs->umem->cq, id));
+
+	sock_wfree(skb);
+}
+
+static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
+			    size_t total_len)
+{
+	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
+	u32 max_batch = TX_BATCH_SIZE;
+	struct xdp_sock *xs = xdp_sk(sk);
+	bool sent_frame = false;
+	struct xdp_desc desc;
+	struct sk_buff *skb;
+	int err = 0;
+
+	if (unlikely(!xs->tx))
+		return -ENOBUFS;
+	if (need_wait)
+		return -EOPNOTSUPP;
+
+	mutex_lock(&xs->mutex);
+
+	while (xskq_peek_desc(xs->tx, &desc)) {
+		char *buffer;
+		u32 id, len;
+
+		if (max_batch-- == 0) {
+			err = -EAGAIN;
+			goto out;
+		}
+
+		if (xskq_reserve_id(xs->umem->cq)) {
+			err = -EAGAIN;
+			goto out;
+		}
+
+		len = desc.len;
+		if (unlikely(len > xs->dev->mtu)) {
+			err = -EMSGSIZE;
+			goto out;
+		}
+
+		skb = sock_alloc_send_skb(sk, len, !need_wait, &err);
+		if (unlikely(!skb)) {
+			err = -EAGAIN;
+			goto out;
+		}
+
+		skb_put(skb, len);
+		id = desc.idx;
+		buffer = xdp_umem_get_data(xs->umem, id) + desc.offset;
+		err = skb_store_bits(skb, 0, buffer, len);
+		if (unlikely(err)) {
+			kfree_skb(skb);
+			goto out;
+		}
+
+		skb->dev = xs->dev;
+		skb->priority = sk->sk_priority;
+		skb->mark = sk->sk_mark;
+		skb_shinfo(skb)->destructor_arg = (void *)(long)id;
+		skb->destructor = xsk_destruct_skb;
+
+		err = dev_direct_xmit(skb, xs->queue_id);
+		/* Ignore NET_XMIT_CN as packet might have been sent */
+		if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
+			err = -EAGAIN;
+			/* SKB consumed by dev_direct_xmit() */
+			goto out;
+		}
+
+		sent_frame = true;
+		xskq_discard_desc(xs->tx);
+	}
+
+out:
+	if (sent_frame)
+		sk->sk_write_space(sk);
+
+	mutex_unlock(&xs->mutex);
+	return err;
+}
+
+static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
+{
+	struct sock *sk = sock->sk;
+	struct xdp_sock *xs = xdp_sk(sk);
+
+	if (unlikely(!xs->dev))
+		return -ENXIO;
+	if (unlikely(!(xs->dev->flags & IFF_UP)))
+		return -ENETDOWN;
+
+	return xsk_generic_xmit(sk, m, total_len);
+}
+
 static unsigned int xsk_poll(struct file *file, struct socket *sock,
 			     struct poll_table_struct *wait)
 {
@@ -110,6 +214,8 @@ static unsigned int xsk_poll(struct file *file, struct socket *sock,
 
 	if (xs->rx && !xskq_empty_desc(xs->rx))
 		mask |= POLLIN | POLLRDNORM;
+	if (xs->tx && !xskq_full_desc(xs->tx))
+		mask |= POLLOUT | POLLWRNORM;
 
 	return mask;
 }
@@ -270,6 +376,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 	xs->queue_id = sxdp->sxdp_queue_id;
 
 	xskq_set_umem(xs->rx, &xs->umem->props);
+	xskq_set_umem(xs->tx, &xs->umem->props);
 
 out_unlock:
 	if (err)
@@ -383,8 +490,6 @@ static int xsk_mmap(struct file *file, struct socket *sock,
 			q = xs->umem->fq;
 		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
 			q = xs->umem->cq;
-		else
-			return -EINVAL;
 	}
 
 	if (!q)
@@ -420,7 +525,7 @@ static const struct proto_ops xsk_proto_ops = {
 	.shutdown =	sock_no_shutdown,
 	.setsockopt =	xsk_setsockopt,
 	.getsockopt =	sock_no_getsockopt,
-	.sendmsg =	sock_no_sendmsg,
+	.sendmsg =	xsk_sendmsg,
 	.recvmsg =	sock_no_recvmsg,
 	.mmap =		xsk_mmap,
 	.sendpage =	sock_no_sendpage,
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 0a9b92b4f93a..3497e8808608 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -111,7 +111,93 @@ static inline void xskq_discard_id(struct xsk_queue *q)
 	(void)xskq_validate_id(q);
 }
 
-/* Rx queue */
+static inline int xskq_produce_id(struct xsk_queue *q, u32 id)
+{
+	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+
+	ring->desc[q->prod_tail++ & q->ring_mask] = id;
+
+	/* Order producer and data */
+	smp_wmb();
+
+	WRITE_ONCE(q->ring->producer, q->prod_tail);
+	return 0;
+}
+
+static inline int xskq_reserve_id(struct xsk_queue *q)
+{
+	if (xskq_nb_free(q, q->prod_head, 1) == 0)
+		return -ENOSPC;
+
+	q->prod_head++;
+	return 0;
+}
+
+/* Rx/Tx queue */
+
+static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d)
+{
+	u32 buff_len;
+
+	if (unlikely(d->idx >= q->umem_props.nframes)) {
+		q->invalid_descs++;
+		return false;
+	}
+
+	buff_len = q->umem_props.frame_size;
+	if (unlikely(d->len > buff_len || d->len == 0 ||
+		     d->offset > buff_len || d->offset + d->len > buff_len)) {
+		q->invalid_descs++;
+		return false;
+	}
+
+	return true;
+}
+
+static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q,
+						  struct xdp_desc *desc)
+{
+	while (q->cons_tail != q->cons_head) {
+		struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
+		unsigned int idx = q->cons_tail & q->ring_mask;
+
+		if (xskq_is_valid_desc(q, &ring->desc[idx])) {
+			if (desc)
+				*desc = ring->desc[idx];
+			return desc;
+		}
+
+		q->cons_tail++;
+	}
+
+	return NULL;
+}
+
+static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q,
+					      struct xdp_desc *desc)
+{
+	struct xdp_rxtx_ring *ring;
+
+	if (q->cons_tail == q->cons_head) {
+		WRITE_ONCE(q->ring->consumer, q->cons_tail);
+		q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE);
+
+		/* Order consumer and data */
+		smp_rmb();
+
+		return xskq_validate_desc(q, desc);
+	}
+
+	ring = (struct xdp_rxtx_ring *)q->ring;
+	*desc = ring->desc[q->cons_tail & q->ring_mask];
+	return desc;
+}
+
+static inline void xskq_discard_desc(struct xsk_queue *q)
+{
+	q->cons_tail++;
+	(void)xskq_validate_desc(q, NULL);
+}
 
 static inline int xskq_produce_batch_desc(struct xsk_queue *q,
 					  u32 id, u32 len, u16 offset)
@@ -139,6 +225,11 @@ static inline void xskq_produce_flush_desc(struct xsk_queue *q)
 	WRITE_ONCE(q->ring->producer, q->prod_tail);
 }
 
+static inline bool xskq_full_desc(struct xsk_queue *q)
+{
+	return (xskq_nb_avail(q, q->nentries) == q->nentries);
+}
+
 static inline bool xskq_empty_desc(struct xsk_queue *q)
 {
 	return (xskq_nb_free(q, q->prod_tail, 1) == q->nentries);
-- 
cgit v1.2.3-59-g8ed1b


From af75d9e02d08dc55ce6a1e42e485465c630d7349 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 2 May 2018 13:01:35 +0200
Subject: xsk: statistics support

In this commit, a new getsockopt is added: XDP_STATISTICS. This is
used to obtain stats from the sockets.

v2: getsockopt now returns size of stats structure.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/if_xdp.h |  7 +++++++
 net/xdp/xsk.c               | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 net/xdp/xsk_queue.h         |  5 +++++
 3 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index e2ea878d025c..77b88c4efe98 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -38,6 +38,7 @@ struct sockaddr_xdp {
 #define XDP_UMEM_REG			3
 #define XDP_UMEM_FILL_RING		4
 #define XDP_UMEM_COMPLETION_RING	5
+#define XDP_STATISTICS			6
 
 struct xdp_umem_reg {
 	__u64 addr; /* Start of packet data area */
@@ -46,6 +47,12 @@ struct xdp_umem_reg {
 	__u32 frame_headroom; /* Frame head room */
 };
 
+struct xdp_statistics {
+	__u64 rx_dropped; /* Dropped for reasons other than invalid desc */
+	__u64 rx_invalid_descs; /* Dropped due to invalid descriptor */
+	__u64 tx_invalid_descs; /* Dropped due to invalid descriptor */
+};
+
 /* Pgoff for mmaping the rings */
 #define XDP_PGOFF_RX_RING			  0
 #define XDP_PGOFF_TX_RING		 0x80000000
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index b33c535c7996..009c5af5bba5 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -468,6 +468,49 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 	return -ENOPROTOOPT;
 }
 
+static int xsk_getsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+	struct xdp_sock *xs = xdp_sk(sk);
+	int len;
+
+	if (level != SOL_XDP)
+		return -ENOPROTOOPT;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+	if (len < 0)
+		return -EINVAL;
+
+	switch (optname) {
+	case XDP_STATISTICS:
+	{
+		struct xdp_statistics stats;
+
+		if (len < sizeof(stats))
+			return -EINVAL;
+
+		mutex_lock(&xs->mutex);
+		stats.rx_dropped = xs->rx_dropped;
+		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
+		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
+		mutex_unlock(&xs->mutex);
+
+		if (copy_to_user(optval, &stats, sizeof(stats)))
+			return -EFAULT;
+		if (put_user(sizeof(stats), optlen))
+			return -EFAULT;
+
+		return 0;
+	}
+	default:
+		break;
+	}
+
+	return -EOPNOTSUPP;
+}
+
 static int xsk_mmap(struct file *file, struct socket *sock,
 		    struct vm_area_struct *vma)
 {
@@ -524,7 +567,7 @@ static const struct proto_ops xsk_proto_ops = {
 	.listen =	sock_no_listen,
 	.shutdown =	sock_no_shutdown,
 	.setsockopt =	xsk_setsockopt,
-	.getsockopt =	sock_no_getsockopt,
+	.getsockopt =	xsk_getsockopt,
 	.sendmsg =	xsk_sendmsg,
 	.recvmsg =	sock_no_recvmsg,
 	.mmap =		xsk_mmap,
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 3497e8808608..7aa9a535db0e 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -36,6 +36,11 @@ struct xsk_queue {
 
 /* Common functions operating for both RXTX and umem queues */
 
+static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q)
+{
+	return q ? q->invalid_descs : 0;
+}
+
 static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt)
 {
 	u32 entries = q->prod_tail - q->cons_tail;
-- 
cgit v1.2.3-59-g8ed1b


From b4b8faa1ded7a3bb34db374c692a51cea29f9080 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Wed, 2 May 2018 13:01:36 +0200
Subject: samples/bpf: sample application and documentation for AF_XDP sockets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is a sample application for AF_XDP sockets. The application
supports three different modes of operation: rxdrop, txonly and l2fwd.

To show-case a simple round-robin load-balancing between a set of
sockets in an xskmap, set the RR_LB compile time define option to 1 in
"xdpsock.h".

v2: The entries variable was calculated twice in {umem,xq}_nb_avail.

Co-authored-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/networking/af_xdp.rst | 297 +++++++++++
 Documentation/networking/index.rst  |   1 +
 samples/bpf/Makefile                |   4 +
 samples/bpf/xdpsock.h               |  11 +
 samples/bpf/xdpsock_kern.c          |  56 +++
 samples/bpf/xdpsock_user.c          | 948 ++++++++++++++++++++++++++++++++++++
 6 files changed, 1317 insertions(+)
 create mode 100644 Documentation/networking/af_xdp.rst
 create mode 100644 samples/bpf/xdpsock.h
 create mode 100644 samples/bpf/xdpsock_kern.c
 create mode 100644 samples/bpf/xdpsock_user.c

diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst
new file mode 100644
index 000000000000..91928d9ee4bf
--- /dev/null
+++ b/Documentation/networking/af_xdp.rst
@@ -0,0 +1,297 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======
+AF_XDP
+======
+
+Overview
+========
+
+AF_XDP is an address family that is optimized for high performance
+packet processing.
+
+This document assumes that the reader is familiar with BPF and XDP. If
+not, the Cilium project has an excellent reference guide at
+http://cilium.readthedocs.io/en/doc-1.0/bpf/.
+
+Using the XDP_REDIRECT action from an XDP program, the program can
+redirect ingress frames to other XDP enabled netdevs, using the
+bpf_redirect_map() function. AF_XDP sockets enable the possibility for
+XDP programs to redirect frames to a memory buffer in a user-space
+application.
+
+An AF_XDP socket (XSK) is created with the normal socket()
+syscall. Associated with each XSK are two rings: the RX ring and the
+TX ring. A socket can receive packets on the RX ring and it can send
+packets on the TX ring. These rings are registered and sized with the
+setsockopts XDP_RX_RING and XDP_TX_RING, respectively. It is mandatory
+to have at least one of these rings for each socket. An RX or TX
+descriptor ring points to a data buffer in a memory area called a
+UMEM. RX and TX can share the same UMEM so that a packet does not have
+to be copied between RX and TX. Moreover, if a packet needs to be kept
+for a while due to a possible retransmit, the descriptor that points
+to that packet can be changed to point to another and reused right
+away. This again avoids copying data.
+
+The UMEM consists of a number of equally size frames and each frame
+has a unique frame id. A descriptor in one of the rings references a
+frame by referencing its frame id. The user space allocates memory for
+this UMEM using whatever means it feels is most appropriate (malloc,
+mmap, huge pages, etc). This memory area is then registered with the
+kernel using the new setsockopt XDP_UMEM_REG. The UMEM also has two
+rings: the FILL ring and the COMPLETION ring. The fill ring is used by
+the application to send down frame ids for the kernel to fill in with
+RX packet data. References to these frames will then appear in the RX
+ring once each packet has been received. The completion ring, on the
+other hand, contains frame ids that the kernel has transmitted
+completely and can now be used again by user space, for either TX or
+RX. Thus, the frame ids appearing in the completion ring are ids that
+were previously transmitted using the TX ring. In summary, the RX and
+FILL rings are used for the RX path and the TX and COMPLETION rings
+are used for the TX path.
+
+The socket is then finally bound with a bind() call to a device and a
+specific queue id on that device, and it is not until bind is
+completed that traffic starts to flow.
+
+The UMEM can be shared between processes, if desired. If a process
+wants to do this, it simply skips the registration of the UMEM and its
+corresponding two rings, sets the XDP_SHARED_UMEM flag in the bind
+call and submits the XSK of the process it would like to share UMEM
+with as well as its own newly created XSK socket. The new process will
+then receive frame id references in its own RX ring that point to this
+shared UMEM. Note that since the ring structures are single-consumer /
+single-producer (for performance reasons), the new process has to
+create its own socket with associated RX and TX rings, since it cannot
+share this with the other process. This is also the reason that there
+is only one set of FILL and COMPLETION rings per UMEM. It is the
+responsibility of a single process to handle the UMEM.
+
+How is then packets distributed from an XDP program to the XSKs? There
+is a BPF map called XSKMAP (or BPF_MAP_TYPE_XSKMAP in full). The
+user-space application can place an XSK at an arbitrary place in this
+map. The XDP program can then redirect a packet to a specific index in
+this map and at this point XDP validates that the XSK in that map was
+indeed bound to that device and ring number. If not, the packet is
+dropped. If the map is empty at that index, the packet is also
+dropped. This also means that it is currently mandatory to have an XDP
+program loaded (and one XSK in the XSKMAP) to be able to get any
+traffic to user space through the XSK.
+
+AF_XDP can operate in two different modes: XDP_SKB and XDP_DRV. If the
+driver does not have support for XDP, or XDP_SKB is explicitly chosen
+when loading the XDP program, XDP_SKB mode is employed that uses SKBs
+together with the generic XDP support and copies out the data to user
+space. A fallback mode that works for any network device. On the other
+hand, if the driver has support for XDP, it will be used by the AF_XDP
+code to provide better performance, but there is still a copy of the
+data into user space.
+
+Concepts
+========
+
+In order to use an AF_XDP socket, a number of associated objects need
+to be setup.
+
+Jonathan Corbet has also written an excellent article on LWN,
+"Accelerating networking with AF_XDP". It can be found at
+https://lwn.net/Articles/750845/.
+
+UMEM
+----
+
+UMEM is a region of virtual contiguous memory, divided into
+equal-sized frames. An UMEM is associated to a netdev and a specific
+queue id of that netdev. It is created and configured (frame size,
+frame headroom, start address and size) by using the XDP_UMEM_REG
+setsockopt system call. A UMEM is bound to a netdev and queue id, via
+the bind() system call.
+
+An AF_XDP is socket linked to a single UMEM, but one UMEM can have
+multiple AF_XDP sockets. To share an UMEM created via one socket A,
+the next socket B can do this by setting the XDP_SHARED_UMEM flag in
+struct sockaddr_xdp member sxdp_flags, and passing the file descriptor
+of A to struct sockaddr_xdp member sxdp_shared_umem_fd.
+
+The UMEM has two single-producer/single-consumer rings, that are used
+to transfer ownership of UMEM frames between the kernel and the
+user-space application.
+
+Rings
+-----
+
+There are a four different kind of rings: Fill, Completion, RX and
+TX. All rings are single-producer/single-consumer, so the user-space
+application need explicit synchronization of multiple
+processes/threads are reading/writing to them.
+
+The UMEM uses two rings: Fill and Completion. Each socket associated
+with the UMEM must have an RX queue, TX queue or both. Say, that there
+is a setup with four sockets (all doing TX and RX). Then there will be
+one Fill ring, one Completion ring, four TX rings and four RX rings.
+
+The rings are head(producer)/tail(consumer) based rings. A producer
+writes the data ring at the index pointed out by struct xdp_ring
+producer member, and increasing the producer index. A consumer reads
+the data ring at the index pointed out by struct xdp_ring consumer
+member, and increasing the consumer index.
+
+The rings are configured and created via the _RING setsockopt system
+calls and mmapped to user-space using the appropriate offset to mmap()
+(XDP_PGOFF_RX_RING, XDP_PGOFF_TX_RING, XDP_UMEM_PGOFF_FILL_RING and
+XDP_UMEM_PGOFF_COMPLETION_RING).
+
+The size of the rings need to be of size power of two.
+
+UMEM Fill Ring
+~~~~~~~~~~~~~~
+
+The Fill ring is used to transfer ownership of UMEM frames from
+user-space to kernel-space. The UMEM indicies are passed in the
+ring. As an example, if the UMEM is 64k and each frame is 4k, then the
+UMEM has 16 frames and can pass indicies between 0 and 15.
+
+Frames passed to the kernel are used for the ingress path (RX rings).
+
+The user application produces UMEM indicies to this ring.
+
+UMEM Completetion Ring
+~~~~~~~~~~~~~~~~~~~~~~
+
+The Completion Ring is used transfer ownership of UMEM frames from
+kernel-space to user-space. Just like the Fill ring, UMEM indicies are
+used.
+
+Frames passed from the kernel to user-space are frames that has been
+sent (TX ring) and can be used by user-space again.
+
+The user application consumes UMEM indicies from this ring.
+
+
+RX Ring
+~~~~~~~
+
+The RX ring is the receiving side of a socket. Each entry in the ring
+is a struct xdp_desc descriptor. The descriptor contains UMEM index
+(idx), the length of the data (len), the offset into the frame
+(offset).
+
+If no frames have been passed to kernel via the Fill ring, no
+descriptors will (or can) appear on the RX ring.
+
+The user application consumes struct xdp_desc descriptors from this
+ring.
+
+TX Ring
+~~~~~~~
+
+The TX ring is used to send frames. The struct xdp_desc descriptor is
+filled (index, length and offset) and passed into the ring.
+
+To start the transfer a sendmsg() system call is required. This might
+be relaxed in the future.
+
+The user application produces struct xdp_desc descriptors to this
+ring.
+
+XSKMAP / BPF_MAP_TYPE_XSKMAP
+----------------------------
+
+On XDP side there is a BPF map type BPF_MAP_TYPE_XSKMAP (XSKMAP) that
+is used in conjunction with bpf_redirect_map() to pass the ingress
+frame to a socket.
+
+The user application inserts the socket into the map, via the bpf()
+system call.
+
+Note that if an XDP program tries to redirect to a socket that does
+not match the queue configuration and netdev, the frame will be
+dropped. E.g. an AF_XDP socket is bound to netdev eth0 and
+queue 17. Only the XDP program executing for eth0 and queue 17 will
+successfully pass data to the socket. Please refer to the sample
+application (samples/bpf/) in for an example.
+
+Usage
+=====
+
+In order to use AF_XDP sockets there are two parts needed. The
+user-space application and the XDP program. For a complete setup and
+usage example, please refer to the sample application. The user-space
+side is xdpsock_user.c and the XDP side xdpsock_kern.c.
+
+Naive ring dequeue and enqueue could look like this::
+
+    // typedef struct xdp_rxtx_ring RING;
+    // typedef struct xdp_umem_ring RING;
+
+    // typedef struct xdp_desc RING_TYPE;
+    // typedef __u32 RING_TYPE;
+
+    int dequeue_one(RING *ring, RING_TYPE *item)
+    {
+        __u32 entries = ring->ptrs.producer - ring->ptrs.consumer;
+
+        if (entries == 0)
+            return -1;
+
+        // read-barrier!
+
+        *item = ring->desc[ring->ptrs.consumer & (RING_SIZE - 1)];
+        ring->ptrs.consumer++;
+        return 0;
+    }
+
+    int enqueue_one(RING *ring, const RING_TYPE *item)
+    {
+        u32 free_entries = RING_SIZE - (ring->ptrs.producer - ring->ptrs.consumer);
+
+        if (free_entries == 0)
+            return -1;
+
+        ring->desc[ring->ptrs.producer & (RING_SIZE - 1)] = *item;
+
+        // write-barrier!
+
+        ring->ptrs.producer++;
+        return 0;
+    }
+
+
+For a more optimized version, please refer to the sample application.
+
+Sample application
+==================
+
+There is a xdpsock benchmarking/test application included that
+demonstrates how to use AF_XDP sockets with both private and shared
+UMEMs. Say that you would like your UDP traffic from port 4242 to end
+up in queue 16, that we will enable AF_XDP on. Here, we use ethtool
+for this::
+
+      ethtool -N p3p2 rx-flow-hash udp4 fn
+      ethtool -N p3p2 flow-type udp4 src-port 4242 dst-port 4242 \
+          action 16
+
+Running the rxdrop benchmark in XDP_DRV mode can then be done
+using::
+
+      samples/bpf/xdpsock -i p3p2 -q 16 -r -N
+
+For XDP_SKB mode, use the switch "-S" instead of "-N" and all options
+can be displayed with "-h", as usual.
+
+Credits
+=======
+
+- Björn Töpel (AF_XDP core)
+- Magnus Karlsson (AF_XDP core)
+- Alexander Duyck
+- Alexei Starovoitov
+- Daniel Borkmann
+- Jesper Dangaard Brouer
+- John Fastabend
+- Jonathan Corbet (LWN coverage)
+- Michael S. Tsirkin
+- Qi Z Zhang
+- Willem de Bruijn
+
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst
index f204eaff657d..cbd9bdd4a79e 100644
--- a/Documentation/networking/index.rst
+++ b/Documentation/networking/index.rst
@@ -6,6 +6,7 @@ Contents:
 .. toctree::
    :maxdepth: 2
 
+   af_xdp
    batman-adv
    can
    dpaa2/index
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 5e31770ac087..8e0c7fb6d7cc 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -45,6 +45,7 @@ hostprogs-y += xdp_rxq_info
 hostprogs-y += syscall_tp
 hostprogs-y += cpustat
 hostprogs-y += xdp_adjust_tail
+hostprogs-y += xdpsock
 
 # Libbpf dependencies
 LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
@@ -98,6 +99,7 @@ xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o
 syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
 cpustat-objs := bpf_load.o $(LIBBPF) cpustat_user.o
 xdp_adjust_tail-objs := bpf_load.o $(LIBBPF) xdp_adjust_tail_user.o
+xdpsock-objs := bpf_load.o $(LIBBPF) xdpsock_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -151,6 +153,7 @@ always += xdp2skb_meta_kern.o
 always += syscall_tp_kern.o
 always += cpustat_kern.o
 always += xdp_adjust_tail_kern.o
+always += xdpsock_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -197,6 +200,7 @@ HOSTLOADLIBES_xdp_rxq_info += -lelf
 HOSTLOADLIBES_syscall_tp += -lelf
 HOSTLOADLIBES_cpustat += -lelf
 HOSTLOADLIBES_xdp_adjust_tail += -lelf
+HOSTLOADLIBES_xdpsock += -lelf -pthread
 
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
 #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/xdpsock.h b/samples/bpf/xdpsock.h
new file mode 100644
index 000000000000..533ab81adfa1
--- /dev/null
+++ b/samples/bpf/xdpsock.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef XDPSOCK_H_
+#define XDPSOCK_H_
+
+/* Power-of-2 number of sockets */
+#define MAX_SOCKS 4
+
+/* Round-robin receive */
+#define RR_LB 0
+
+#endif /* XDPSOCK_H_ */
diff --git a/samples/bpf/xdpsock_kern.c b/samples/bpf/xdpsock_kern.c
new file mode 100644
index 000000000000..d8806c41362e
--- /dev/null
+++ b/samples/bpf/xdpsock_kern.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+#include "xdpsock.h"
+
+struct bpf_map_def SEC("maps") qidconf_map = {
+	.type		= BPF_MAP_TYPE_ARRAY,
+	.key_size	= sizeof(int),
+	.value_size	= sizeof(int),
+	.max_entries	= 1,
+};
+
+struct bpf_map_def SEC("maps") xsks_map = {
+	.type = BPF_MAP_TYPE_XSKMAP,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 4,
+};
+
+struct bpf_map_def SEC("maps") rr_map = {
+	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(unsigned int),
+	.max_entries = 1,
+};
+
+SEC("xdp_sock")
+int xdp_sock_prog(struct xdp_md *ctx)
+{
+	int *qidconf, key = 0, idx;
+	unsigned int *rr;
+
+	qidconf = bpf_map_lookup_elem(&qidconf_map, &key);
+	if (!qidconf)
+		return XDP_ABORTED;
+
+	if (*qidconf != ctx->rx_queue_index)
+		return XDP_PASS;
+
+#if RR_LB /* NB! RR_LB is configured in xdpsock.h */
+	rr = bpf_map_lookup_elem(&rr_map, &key);
+	if (!rr)
+		return XDP_ABORTED;
+
+	*rr = (*rr + 1) & (MAX_SOCKS - 1);
+	idx = *rr;
+#else
+	idx = 0;
+#endif
+
+	return bpf_redirect_map(&xsks_map, idx, 0);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
new file mode 100644
index 000000000000..4b8a7cf3e63b
--- /dev/null
+++ b/samples/bpf/xdpsock_user.c
@@ -0,0 +1,948 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2017 - 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <getopt.h>
+#include <libgen.h>
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <linux/if_xdp.h>
+#include <linux/if_ether.h>
+#include <net/if.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <net/ethernet.h>
+#include <sys/resource.h>
+#include <sys/socket.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <locale.h>
+#include <sys/types.h>
+#include <poll.h>
+
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include "libbpf.h"
+
+#include "xdpsock.h"
+
+#ifndef SOL_XDP
+#define SOL_XDP 283
+#endif
+
+#ifndef AF_XDP
+#define AF_XDP 44
+#endif
+
+#ifndef PF_XDP
+#define PF_XDP AF_XDP
+#endif
+
+#define NUM_FRAMES 131072
+#define FRAME_HEADROOM 0
+#define FRAME_SIZE 2048
+#define NUM_DESCS 1024
+#define BATCH_SIZE 16
+
+#define FQ_NUM_DESCS 1024
+#define CQ_NUM_DESCS 1024
+
+#define DEBUG_HEXDUMP 0
+
+typedef __u32 u32;
+
+static unsigned long prev_time;
+
+enum benchmark_type {
+	BENCH_RXDROP = 0,
+	BENCH_TXONLY = 1,
+	BENCH_L2FWD = 2,
+};
+
+static enum benchmark_type opt_bench = BENCH_RXDROP;
+static u32 opt_xdp_flags;
+static const char *opt_if = "";
+static int opt_ifindex;
+static int opt_queue;
+static int opt_poll;
+static int opt_shared_packet_buffer;
+static int opt_interval = 1;
+
+struct xdp_umem_uqueue {
+	u32 cached_prod;
+	u32 cached_cons;
+	u32 mask;
+	u32 size;
+	struct xdp_umem_ring *ring;
+};
+
+struct xdp_umem {
+	char (*frames)[FRAME_SIZE];
+	struct xdp_umem_uqueue fq;
+	struct xdp_umem_uqueue cq;
+	int fd;
+};
+
+struct xdp_uqueue {
+	u32 cached_prod;
+	u32 cached_cons;
+	u32 mask;
+	u32 size;
+	struct xdp_rxtx_ring *ring;
+};
+
+struct xdpsock {
+	struct xdp_uqueue rx;
+	struct xdp_uqueue tx;
+	int sfd;
+	struct xdp_umem *umem;
+	u32 outstanding_tx;
+	unsigned long rx_npkts;
+	unsigned long tx_npkts;
+	unsigned long prev_rx_npkts;
+	unsigned long prev_tx_npkts;
+};
+
+#define MAX_SOCKS 4
+static int num_socks;
+struct xdpsock *xsks[MAX_SOCKS];
+
+static unsigned long get_nsecs(void)
+{
+	struct timespec ts;
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	return ts.tv_sec * 1000000000UL + ts.tv_nsec;
+}
+
+static void dump_stats(void);
+
+#define lassert(expr)							\
+	do {								\
+		if (!(expr)) {						\
+			fprintf(stderr, "%s:%s:%i: Assertion failed: "	\
+				#expr ": errno: %d/\"%s\"\n",		\
+				__FILE__, __func__, __LINE__,		\
+				errno, strerror(errno));		\
+			dump_stats();					\
+			exit(EXIT_FAILURE);				\
+		}							\
+	} while (0)
+
+#define barrier() __asm__ __volatile__("": : :"memory")
+#define u_smp_rmb() barrier()
+#define u_smp_wmb() barrier()
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
+static const char pkt_data[] =
+	"\x3c\xfd\xfe\x9e\x7f\x71\xec\xb1\xd7\x98\x3a\xc0\x08\x00\x45\x00"
+	"\x00\x2e\x00\x00\x00\x00\x40\x11\x88\x97\x05\x08\x07\x08\xc8\x14"
+	"\x1e\x04\x10\x92\x10\x92\x00\x1a\x6d\xa3\x34\x33\x1f\x69\x40\x6b"
+	"\x54\x59\xb6\x14\x2d\x11\x44\xbf\xaf\xd9\xbe\xaa";
+
+static inline u32 umem_nb_free(struct xdp_umem_uqueue *q, u32 nb)
+{
+	u32 free_entries = q->size - (q->cached_prod - q->cached_cons);
+
+	if (free_entries >= nb)
+		return free_entries;
+
+	/* Refresh the local tail pointer */
+	q->cached_cons = q->ring->ptrs.consumer;
+
+	return q->size - (q->cached_prod - q->cached_cons);
+}
+
+static inline u32 xq_nb_free(struct xdp_uqueue *q, u32 ndescs)
+{
+	u32 free_entries = q->cached_cons - q->cached_prod;
+
+	if (free_entries >= ndescs)
+		return free_entries;
+
+	/* Refresh the local tail pointer */
+	q->cached_cons = q->ring->ptrs.consumer + q->size;
+	return q->cached_cons - q->cached_prod;
+}
+
+static inline u32 umem_nb_avail(struct xdp_umem_uqueue *q, u32 nb)
+{
+	u32 entries = q->cached_prod - q->cached_cons;
+
+	if (entries == 0) {
+		q->cached_prod = q->ring->ptrs.producer;
+		entries = q->cached_prod - q->cached_cons;
+	}
+
+	return (entries > nb) ? nb : entries;
+}
+
+static inline u32 xq_nb_avail(struct xdp_uqueue *q, u32 ndescs)
+{
+	u32 entries = q->cached_prod - q->cached_cons;
+
+	if (entries == 0) {
+		q->cached_prod = q->ring->ptrs.producer;
+		entries = q->cached_prod - q->cached_cons;
+	}
+
+	return (entries > ndescs) ? ndescs : entries;
+}
+
+static inline int umem_fill_to_kernel_ex(struct xdp_umem_uqueue *fq,
+					 struct xdp_desc *d,
+					 size_t nb)
+{
+	u32 i;
+
+	if (umem_nb_free(fq, nb) < nb)
+		return -ENOSPC;
+
+	for (i = 0; i < nb; i++) {
+		u32 idx = fq->cached_prod++ & fq->mask;
+
+		fq->ring->desc[idx] = d[i].idx;
+	}
+
+	u_smp_wmb();
+
+	fq->ring->ptrs.producer = fq->cached_prod;
+
+	return 0;
+}
+
+static inline int umem_fill_to_kernel(struct xdp_umem_uqueue *fq, u32 *d,
+				      size_t nb)
+{
+	u32 i;
+
+	if (umem_nb_free(fq, nb) < nb)
+		return -ENOSPC;
+
+	for (i = 0; i < nb; i++) {
+		u32 idx = fq->cached_prod++ & fq->mask;
+
+		fq->ring->desc[idx] = d[i];
+	}
+
+	u_smp_wmb();
+
+	fq->ring->ptrs.producer = fq->cached_prod;
+
+	return 0;
+}
+
+static inline size_t umem_complete_from_kernel(struct xdp_umem_uqueue *cq,
+					       u32 *d, size_t nb)
+{
+	u32 idx, i, entries = umem_nb_avail(cq, nb);
+
+	u_smp_rmb();
+
+	for (i = 0; i < entries; i++) {
+		idx = cq->cached_cons++ & cq->mask;
+		d[i] = cq->ring->desc[idx];
+	}
+
+	if (entries > 0) {
+		u_smp_wmb();
+
+		cq->ring->ptrs.consumer = cq->cached_cons;
+	}
+
+	return entries;
+}
+
+static inline void *xq_get_data(struct xdpsock *xsk, __u32 idx, __u32 off)
+{
+	lassert(idx < NUM_FRAMES);
+	return &xsk->umem->frames[idx][off];
+}
+
+static inline int xq_enq(struct xdp_uqueue *uq,
+			 const struct xdp_desc *descs,
+			 unsigned int ndescs)
+{
+	struct xdp_rxtx_ring *r = uq->ring;
+	unsigned int i;
+
+	if (xq_nb_free(uq, ndescs) < ndescs)
+		return -ENOSPC;
+
+	for (i = 0; i < ndescs; i++) {
+		u32 idx = uq->cached_prod++ & uq->mask;
+
+		r->desc[idx].idx = descs[i].idx;
+		r->desc[idx].len = descs[i].len;
+		r->desc[idx].offset = descs[i].offset;
+	}
+
+	u_smp_wmb();
+
+	r->ptrs.producer = uq->cached_prod;
+	return 0;
+}
+
+static inline int xq_enq_tx_only(struct xdp_uqueue *uq,
+				 __u32 idx, unsigned int ndescs)
+{
+	struct xdp_rxtx_ring *q = uq->ring;
+	unsigned int i;
+
+	if (xq_nb_free(uq, ndescs) < ndescs)
+		return -ENOSPC;
+
+	for (i = 0; i < ndescs; i++) {
+		u32 idx = uq->cached_prod++ & uq->mask;
+
+		q->desc[idx].idx	= idx + i;
+		q->desc[idx].len	= sizeof(pkt_data) - 1;
+		q->desc[idx].offset	= 0;
+	}
+
+	u_smp_wmb();
+
+	q->ptrs.producer = uq->cached_prod;
+	return 0;
+}
+
+static inline int xq_deq(struct xdp_uqueue *uq,
+			 struct xdp_desc *descs,
+			 int ndescs)
+{
+	struct xdp_rxtx_ring *r = uq->ring;
+	unsigned int idx;
+	int i, entries;
+
+	entries = xq_nb_avail(uq, ndescs);
+
+	u_smp_rmb();
+
+	for (i = 0; i < entries; i++) {
+		idx = uq->cached_cons++ & uq->mask;
+		descs[i] = r->desc[idx];
+	}
+
+	if (entries > 0) {
+		u_smp_wmb();
+
+		r->ptrs.consumer = uq->cached_cons;
+	}
+
+	return entries;
+}
+
+static void swap_mac_addresses(void *data)
+{
+	struct ether_header *eth = (struct ether_header *)data;
+	struct ether_addr *src_addr = (struct ether_addr *)&eth->ether_shost;
+	struct ether_addr *dst_addr = (struct ether_addr *)&eth->ether_dhost;
+	struct ether_addr tmp;
+
+	tmp = *src_addr;
+	*src_addr = *dst_addr;
+	*dst_addr = tmp;
+}
+
+#if DEBUG_HEXDUMP
+static void hex_dump(void *pkt, size_t length, const char *prefix)
+{
+	int i = 0;
+	const unsigned char *address = (unsigned char *)pkt;
+	const unsigned char *line = address;
+	size_t line_size = 32;
+	unsigned char c;
+
+	printf("length = %zu\n", length);
+	printf("%s | ", prefix);
+	while (length-- > 0) {
+		printf("%02X ", *address++);
+		if (!(++i % line_size) || (length == 0 && i % line_size)) {
+			if (length == 0) {
+				while (i++ % line_size)
+					printf("__ ");
+			}
+			printf(" | ");	/* right close */
+			while (line < address) {
+				c = *line++;
+				printf("%c", (c < 33 || c == 255) ? 0x2E : c);
+			}
+			printf("\n");
+			if (length > 0)
+				printf("%s | ", prefix);
+		}
+	}
+	printf("\n");
+}
+#endif
+
+static size_t gen_eth_frame(char *frame)
+{
+	memcpy(frame, pkt_data, sizeof(pkt_data) - 1);
+	return sizeof(pkt_data) - 1;
+}
+
+static struct xdp_umem *xdp_umem_configure(int sfd)
+{
+	int fq_size = FQ_NUM_DESCS, cq_size = CQ_NUM_DESCS;
+	struct xdp_umem_reg mr;
+	struct xdp_umem *umem;
+	void *bufs;
+
+	umem = calloc(1, sizeof(*umem));
+	lassert(umem);
+
+	lassert(posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */
+			       NUM_FRAMES * FRAME_SIZE) == 0);
+
+	mr.addr = (__u64)bufs;
+	mr.len = NUM_FRAMES * FRAME_SIZE;
+	mr.frame_size = FRAME_SIZE;
+	mr.frame_headroom = FRAME_HEADROOM;
+
+	lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)) == 0);
+	lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_FILL_RING, &fq_size,
+			   sizeof(int)) == 0);
+	lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &cq_size,
+			   sizeof(int)) == 0);
+
+	umem->fq.ring = mmap(0, sizeof(struct xdp_umem_ring) +
+			     FQ_NUM_DESCS * sizeof(u32),
+			     PROT_READ | PROT_WRITE,
+			     MAP_SHARED | MAP_POPULATE, sfd,
+			     XDP_UMEM_PGOFF_FILL_RING);
+	lassert(umem->fq.ring != MAP_FAILED);
+
+	umem->fq.mask = FQ_NUM_DESCS - 1;
+	umem->fq.size = FQ_NUM_DESCS;
+
+	umem->cq.ring = mmap(0, sizeof(struct xdp_umem_ring) +
+			     CQ_NUM_DESCS * sizeof(u32),
+			     PROT_READ | PROT_WRITE,
+			     MAP_SHARED | MAP_POPULATE, sfd,
+			     XDP_UMEM_PGOFF_COMPLETION_RING);
+	lassert(umem->cq.ring != MAP_FAILED);
+
+	umem->cq.mask = CQ_NUM_DESCS - 1;
+	umem->cq.size = CQ_NUM_DESCS;
+
+	umem->frames = (char (*)[FRAME_SIZE])bufs;
+	umem->fd = sfd;
+
+	if (opt_bench == BENCH_TXONLY) {
+		int i;
+
+		for (i = 0; i < NUM_FRAMES; i++)
+			(void)gen_eth_frame(&umem->frames[i][0]);
+	}
+
+	return umem;
+}
+
+static struct xdpsock *xsk_configure(struct xdp_umem *umem)
+{
+	struct sockaddr_xdp sxdp = {};
+	int sfd, ndescs = NUM_DESCS;
+	struct xdpsock *xsk;
+	bool shared = true;
+	u32 i;
+
+	sfd = socket(PF_XDP, SOCK_RAW, 0);
+	lassert(sfd >= 0);
+
+	xsk = calloc(1, sizeof(*xsk));
+	lassert(xsk);
+
+	xsk->sfd = sfd;
+	xsk->outstanding_tx = 0;
+
+	if (!umem) {
+		shared = false;
+		xsk->umem = xdp_umem_configure(sfd);
+	} else {
+		xsk->umem = umem;
+	}
+
+	lassert(setsockopt(sfd, SOL_XDP, XDP_RX_RING,
+			   &ndescs, sizeof(int)) == 0);
+	lassert(setsockopt(sfd, SOL_XDP, XDP_TX_RING,
+			   &ndescs, sizeof(int)) == 0);
+
+	/* Rx */
+	xsk->rx.ring = mmap(NULL,
+			    sizeof(struct xdp_ring) +
+			    NUM_DESCS * sizeof(struct xdp_desc),
+			    PROT_READ | PROT_WRITE,
+			    MAP_SHARED | MAP_POPULATE, sfd,
+			    XDP_PGOFF_RX_RING);
+	lassert(xsk->rx.ring != MAP_FAILED);
+
+	if (!shared) {
+		for (i = 0; i < NUM_DESCS / 2; i++)
+			lassert(umem_fill_to_kernel(&xsk->umem->fq, &i, 1)
+				== 0);
+	}
+
+	/* Tx */
+	xsk->tx.ring = mmap(NULL,
+			 sizeof(struct xdp_ring) +
+			 NUM_DESCS * sizeof(struct xdp_desc),
+			 PROT_READ | PROT_WRITE,
+			 MAP_SHARED | MAP_POPULATE, sfd,
+			 XDP_PGOFF_TX_RING);
+	lassert(xsk->tx.ring != MAP_FAILED);
+
+	xsk->rx.mask = NUM_DESCS - 1;
+	xsk->rx.size = NUM_DESCS;
+
+	xsk->tx.mask = NUM_DESCS - 1;
+	xsk->tx.size = NUM_DESCS;
+
+	sxdp.sxdp_family = PF_XDP;
+	sxdp.sxdp_ifindex = opt_ifindex;
+	sxdp.sxdp_queue_id = opt_queue;
+	if (shared) {
+		sxdp.sxdp_flags = XDP_SHARED_UMEM;
+		sxdp.sxdp_shared_umem_fd = umem->fd;
+	}
+
+	lassert(bind(sfd, (struct sockaddr *)&sxdp, sizeof(sxdp)) == 0);
+
+	return xsk;
+}
+
+static void print_benchmark(bool running)
+{
+	const char *bench_str = "INVALID";
+
+	if (opt_bench == BENCH_RXDROP)
+		bench_str = "rxdrop";
+	else if (opt_bench == BENCH_TXONLY)
+		bench_str = "txonly";
+	else if (opt_bench == BENCH_L2FWD)
+		bench_str = "l2fwd";
+
+	printf("%s:%d %s ", opt_if, opt_queue, bench_str);
+	if (opt_xdp_flags & XDP_FLAGS_SKB_MODE)
+		printf("xdp-skb ");
+	else if (opt_xdp_flags & XDP_FLAGS_DRV_MODE)
+		printf("xdp-drv ");
+	else
+		printf("	");
+
+	if (opt_poll)
+		printf("poll() ");
+
+	if (running) {
+		printf("running...");
+		fflush(stdout);
+	}
+}
+
+static void dump_stats(void)
+{
+	unsigned long now = get_nsecs();
+	long dt = now - prev_time;
+	int i;
+
+	prev_time = now;
+
+	for (i = 0; i < num_socks; i++) {
+		char *fmt = "%-15s %'-11.0f %'-11lu\n";
+		double rx_pps, tx_pps;
+
+		rx_pps = (xsks[i]->rx_npkts - xsks[i]->prev_rx_npkts) *
+			 1000000000. / dt;
+		tx_pps = (xsks[i]->tx_npkts - xsks[i]->prev_tx_npkts) *
+			 1000000000. / dt;
+
+		printf("\n sock%d@", i);
+		print_benchmark(false);
+		printf("\n");
+
+		printf("%-15s %-11s %-11s %-11.2f\n", "", "pps", "pkts",
+		       dt / 1000000000.);
+		printf(fmt, "rx", rx_pps, xsks[i]->rx_npkts);
+		printf(fmt, "tx", tx_pps, xsks[i]->tx_npkts);
+
+		xsks[i]->prev_rx_npkts = xsks[i]->rx_npkts;
+		xsks[i]->prev_tx_npkts = xsks[i]->tx_npkts;
+	}
+}
+
+static void *poller(void *arg)
+{
+	(void)arg;
+	for (;;) {
+		sleep(opt_interval);
+		dump_stats();
+	}
+
+	return NULL;
+}
+
+static void int_exit(int sig)
+{
+	(void)sig;
+	dump_stats();
+	bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags);
+	exit(EXIT_SUCCESS);
+}
+
+static struct option long_options[] = {
+	{"rxdrop", no_argument, 0, 'r'},
+	{"txonly", no_argument, 0, 't'},
+	{"l2fwd", no_argument, 0, 'l'},
+	{"interface", required_argument, 0, 'i'},
+	{"queue", required_argument, 0, 'q'},
+	{"poll", no_argument, 0, 'p'},
+	{"shared-buffer", no_argument, 0, 's'},
+	{"xdp-skb", no_argument, 0, 'S'},
+	{"xdp-native", no_argument, 0, 'N'},
+	{"interval", required_argument, 0, 'n'},
+	{0, 0, 0, 0}
+};
+
+static void usage(const char *prog)
+{
+	const char *str =
+		"  Usage: %s [OPTIONS]\n"
+		"  Options:\n"
+		"  -r, --rxdrop		Discard all incoming packets (default)\n"
+		"  -t, --txonly		Only send packets\n"
+		"  -l, --l2fwd		MAC swap L2 forwarding\n"
+		"  -i, --interface=n	Run on interface n\n"
+		"  -q, --queue=n	Use queue n (default 0)\n"
+		"  -p, --poll		Use poll syscall\n"
+		"  -s, --shared-buffer	Use shared packet buffer\n"
+		"  -S, --xdp-skb=n	Use XDP skb-mod\n"
+		"  -N, --xdp-native=n	Enfore XDP native mode\n"
+		"  -n, --interval=n	Specify statistics update interval (default 1 sec).\n"
+		"\n";
+	fprintf(stderr, str, prog);
+	exit(EXIT_FAILURE);
+}
+
+static void parse_command_line(int argc, char **argv)
+{
+	int option_index, c;
+
+	opterr = 0;
+
+	for (;;) {
+		c = getopt_long(argc, argv, "rtli:q:psSNn:", long_options,
+				&option_index);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'r':
+			opt_bench = BENCH_RXDROP;
+			break;
+		case 't':
+			opt_bench = BENCH_TXONLY;
+			break;
+		case 'l':
+			opt_bench = BENCH_L2FWD;
+			break;
+		case 'i':
+			opt_if = optarg;
+			break;
+		case 'q':
+			opt_queue = atoi(optarg);
+			break;
+		case 's':
+			opt_shared_packet_buffer = 1;
+			break;
+		case 'p':
+			opt_poll = 1;
+			break;
+		case 'S':
+			opt_xdp_flags |= XDP_FLAGS_SKB_MODE;
+			break;
+		case 'N':
+			opt_xdp_flags |= XDP_FLAGS_DRV_MODE;
+			break;
+		case 'n':
+			opt_interval = atoi(optarg);
+			break;
+		default:
+			usage(basename(argv[0]));
+		}
+	}
+
+	opt_ifindex = if_nametoindex(opt_if);
+	if (!opt_ifindex) {
+		fprintf(stderr, "ERROR: interface \"%s\" does not exist\n",
+			opt_if);
+		usage(basename(argv[0]));
+	}
+}
+
+static void kick_tx(int fd)
+{
+	int ret;
+
+	ret = sendto(fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
+	if (ret >= 0 || errno == ENOBUFS || errno == EAGAIN)
+		return;
+	lassert(0);
+}
+
+static inline void complete_tx_l2fwd(struct xdpsock *xsk)
+{
+	u32 descs[BATCH_SIZE];
+	unsigned int rcvd;
+	size_t ndescs;
+
+	if (!xsk->outstanding_tx)
+		return;
+
+	kick_tx(xsk->sfd);
+	ndescs = (xsk->outstanding_tx > BATCH_SIZE) ? BATCH_SIZE :
+		 xsk->outstanding_tx;
+
+	/* re-add completed Tx buffers */
+	rcvd = umem_complete_from_kernel(&xsk->umem->cq, descs, ndescs);
+	if (rcvd > 0) {
+		umem_fill_to_kernel(&xsk->umem->fq, descs, rcvd);
+		xsk->outstanding_tx -= rcvd;
+		xsk->tx_npkts += rcvd;
+	}
+}
+
+static inline void complete_tx_only(struct xdpsock *xsk)
+{
+	u32 descs[BATCH_SIZE];
+	unsigned int rcvd;
+
+	if (!xsk->outstanding_tx)
+		return;
+
+	kick_tx(xsk->sfd);
+
+	rcvd = umem_complete_from_kernel(&xsk->umem->cq, descs, BATCH_SIZE);
+	if (rcvd > 0) {
+		xsk->outstanding_tx -= rcvd;
+		xsk->tx_npkts += rcvd;
+	}
+}
+
+static void rx_drop(struct xdpsock *xsk)
+{
+	struct xdp_desc descs[BATCH_SIZE];
+	unsigned int rcvd, i;
+
+	rcvd = xq_deq(&xsk->rx, descs, BATCH_SIZE);
+	if (!rcvd)
+		return;
+
+	for (i = 0; i < rcvd; i++) {
+		u32 idx = descs[i].idx;
+
+		lassert(idx < NUM_FRAMES);
+#if DEBUG_HEXDUMP
+		char *pkt;
+		char buf[32];
+
+		pkt = xq_get_data(xsk, idx, descs[i].offset);
+		sprintf(buf, "idx=%d", idx);
+		hex_dump(pkt, descs[i].len, buf);
+#endif
+	}
+
+	xsk->rx_npkts += rcvd;
+
+	umem_fill_to_kernel_ex(&xsk->umem->fq, descs, rcvd);
+}
+
+static void rx_drop_all(void)
+{
+	struct pollfd fds[MAX_SOCKS + 1];
+	int i, ret, timeout, nfds = 1;
+
+	memset(fds, 0, sizeof(fds));
+
+	for (i = 0; i < num_socks; i++) {
+		fds[i].fd = xsks[i]->sfd;
+		fds[i].events = POLLIN;
+		timeout = 1000; /* 1sn */
+	}
+
+	for (;;) {
+		if (opt_poll) {
+			ret = poll(fds, nfds, timeout);
+			if (ret <= 0)
+				continue;
+		}
+
+		for (i = 0; i < num_socks; i++)
+			rx_drop(xsks[i]);
+	}
+}
+
+static void tx_only(struct xdpsock *xsk)
+{
+	int timeout, ret, nfds = 1;
+	struct pollfd fds[nfds + 1];
+	unsigned int idx = 0;
+
+	memset(fds, 0, sizeof(fds));
+	fds[0].fd = xsk->sfd;
+	fds[0].events = POLLOUT;
+	timeout = 1000; /* 1sn */
+
+	for (;;) {
+		if (opt_poll) {
+			ret = poll(fds, nfds, timeout);
+			if (ret <= 0)
+				continue;
+
+			if (fds[0].fd != xsk->sfd ||
+			    !(fds[0].revents & POLLOUT))
+				continue;
+		}
+
+		if (xq_nb_free(&xsk->tx, BATCH_SIZE) >= BATCH_SIZE) {
+			lassert(xq_enq_tx_only(&xsk->tx, idx, BATCH_SIZE) == 0);
+
+			xsk->outstanding_tx += BATCH_SIZE;
+			idx += BATCH_SIZE;
+			idx %= NUM_FRAMES;
+		}
+
+		complete_tx_only(xsk);
+	}
+}
+
+static void l2fwd(struct xdpsock *xsk)
+{
+	for (;;) {
+		struct xdp_desc descs[BATCH_SIZE];
+		unsigned int rcvd, i;
+		int ret;
+
+		for (;;) {
+			complete_tx_l2fwd(xsk);
+
+			rcvd = xq_deq(&xsk->rx, descs, BATCH_SIZE);
+			if (rcvd > 0)
+				break;
+		}
+
+		for (i = 0; i < rcvd; i++) {
+			char *pkt = xq_get_data(xsk, descs[i].idx,
+						descs[i].offset);
+
+			swap_mac_addresses(pkt);
+#if DEBUG_HEXDUMP
+			char buf[32];
+			u32 idx = descs[i].idx;
+
+			sprintf(buf, "idx=%d", idx);
+			hex_dump(pkt, descs[i].len, buf);
+#endif
+		}
+
+		xsk->rx_npkts += rcvd;
+
+		ret = xq_enq(&xsk->tx, descs, rcvd);
+		lassert(ret == 0);
+		xsk->outstanding_tx += rcvd;
+	}
+}
+
+int main(int argc, char **argv)
+{
+	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+	char xdp_filename[256];
+	int i, ret, key = 0;
+	pthread_t pt;
+
+	parse_command_line(argc, argv);
+
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		fprintf(stderr, "ERROR: setrlimit(RLIMIT_MEMLOCK) \"%s\"\n",
+			strerror(errno));
+		exit(EXIT_FAILURE);
+	}
+
+	snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(xdp_filename)) {
+		fprintf(stderr, "ERROR: load_bpf_file %s\n", bpf_log_buf);
+		exit(EXIT_FAILURE);
+	}
+
+	if (!prog_fd[0]) {
+		fprintf(stderr, "ERROR: load_bpf_file: \"%s\"\n",
+			strerror(errno));
+		exit(EXIT_FAILURE);
+	}
+
+	if (bpf_set_link_xdp_fd(opt_ifindex, prog_fd[0], opt_xdp_flags) < 0) {
+		fprintf(stderr, "ERROR: link set xdp fd failed\n");
+		exit(EXIT_FAILURE);
+	}
+
+	ret = bpf_map_update_elem(map_fd[0], &key, &opt_queue, 0);
+	if (ret) {
+		fprintf(stderr, "ERROR: bpf_map_update_elem qidconf\n");
+		exit(EXIT_FAILURE);
+	}
+
+	/* Create sockets... */
+	xsks[num_socks++] = xsk_configure(NULL);
+
+#if RR_LB
+	for (i = 0; i < MAX_SOCKS - 1; i++)
+		xsks[num_socks++] = xsk_configure(xsks[0]->umem);
+#endif
+
+	/* ...and insert them into the map. */
+	for (i = 0; i < num_socks; i++) {
+		key = i;
+		ret = bpf_map_update_elem(map_fd[1], &key, &xsks[i]->sfd, 0);
+		if (ret) {
+			fprintf(stderr, "ERROR: bpf_map_update_elem %d\n", i);
+			exit(EXIT_FAILURE);
+		}
+	}
+
+	signal(SIGINT, int_exit);
+	signal(SIGTERM, int_exit);
+	signal(SIGABRT, int_exit);
+
+	setlocale(LC_ALL, "");
+
+	ret = pthread_create(&pt, NULL, poller, NULL);
+	lassert(ret == 0);
+
+	prev_time = get_nsecs();
+
+	if (opt_bench == BENCH_RXDROP)
+		rx_drop_all();
+	else if (opt_bench == BENCH_TXONLY)
+		tx_only(xsks[0]);
+	else
+		l2fwd(xsks[0]);
+
+	return 0;
+}
-- 
cgit v1.2.3-59-g8ed1b


From b390134c2423c557cac844ee3b7cb52dc1cfab02 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 May 2018 01:08:12 +0200
Subject: bpf: prefix cbpf internal helpers with bpf_

No change in functionality, just remove the '__' prefix and replace it
with a 'bpf_' prefix instead. We later on add a couple of more helpers
for cBPF and keeping the scheme with '__' is suboptimal there.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 120bc8a202d9..c33595a8d604 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -113,12 +113,12 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
 }
 EXPORT_SYMBOL(sk_filter_trim_cap);
 
-BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb)
+BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb)
 {
 	return skb_get_poff(skb);
 }
 
-BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
+BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
 {
 	struct nlattr *nla;
 
@@ -138,7 +138,7 @@ BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
 	return 0;
 }
 
-BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
+BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
 {
 	struct nlattr *nla;
 
@@ -162,13 +162,13 @@ BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
 	return 0;
 }
 
-BPF_CALL_0(__get_raw_cpu_id)
+BPF_CALL_0(bpf_get_raw_cpu_id)
 {
 	return raw_smp_processor_id();
 }
 
 static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
-	.func		= __get_raw_cpu_id,
+	.func		= bpf_get_raw_cpu_id,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 };
@@ -318,16 +318,16 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
 		/* Emit call(arg1=CTX, arg2=A, arg3=X) */
 		switch (fp->k) {
 		case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
-			*insn = BPF_EMIT_CALL(__skb_get_pay_offset);
+			*insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset);
 			break;
 		case SKF_AD_OFF + SKF_AD_NLATTR:
-			*insn = BPF_EMIT_CALL(__skb_get_nlattr);
+			*insn = BPF_EMIT_CALL(bpf_skb_get_nlattr);
 			break;
 		case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
-			*insn = BPF_EMIT_CALL(__skb_get_nlattr_nest);
+			*insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest);
 			break;
 		case SKF_AD_OFF + SKF_AD_CPU:
-			*insn = BPF_EMIT_CALL(__get_raw_cpu_id);
+			*insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id);
 			break;
 		case SKF_AD_OFF + SKF_AD_RANDOM:
 			*insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
-- 
cgit v1.2.3-59-g8ed1b


From 93731ef086cee90af594e62874bb98ae6d6eee91 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 May 2018 01:08:13 +0200
Subject: bpf: migrate ebpf ld_abs/ld_ind tests to test_verifier

Remove all eBPF tests involving LD_ABS/LD_IND from test_bpf.ko. Reason
is that the eBPF tests from test_bpf module do not go via BPF verifier
and therefore any instruction rewrites from verifier cannot take place.

Therefore, move them into test_verifier which runs out of user space,
so that verfier can rewrite LD_ABS/LD_IND internally in upcoming patches.
It will have the same effect since runtime tests are also performed from
there. This also allows to finally unexport bpf_skb_vlan_{push,pop}_proto
and keep it internal to core kernel.

Additionally, also add further cBPF LD_ABS/LD_IND test coverage into
test_bpf.ko suite.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h                         |   2 -
 lib/test_bpf.c                              | 570 +++++++++++++++++-----------
 net/core/filter.c                           |   6 +-
 tools/testing/selftests/bpf/test_verifier.c | 266 ++++++++++++-
 4 files changed, 619 insertions(+), 225 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 68ecdb4eea09..d0e3d7ef36a8 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -714,8 +714,6 @@ extern const struct bpf_func_proto bpf_ktime_get_ns_proto;
 extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto;
 extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
 extern const struct bpf_func_proto bpf_get_current_comm_proto;
-extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
-extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
 extern const struct bpf_func_proto bpf_get_stack_proto;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index 8e157806df7a..317f231462d4 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -386,116 +386,6 @@ static int bpf_fill_ld_abs_get_processor_id(struct bpf_test *self)
 	return 0;
 }
 
-#define PUSH_CNT 68
-/* test: {skb->data[0], vlan_push} x 68 + {skb->data[0], vlan_pop} x 68 */
-static int bpf_fill_ld_abs_vlan_push_pop(struct bpf_test *self)
-{
-	unsigned int len = BPF_MAXINSNS;
-	struct bpf_insn *insn;
-	int i = 0, j, k = 0;
-
-	insn = kmalloc_array(len, sizeof(*insn), GFP_KERNEL);
-	if (!insn)
-		return -ENOMEM;
-
-	insn[i++] = BPF_MOV64_REG(R6, R1);
-loop:
-	for (j = 0; j < PUSH_CNT; j++) {
-		insn[i++] = BPF_LD_ABS(BPF_B, 0);
-		insn[i] = BPF_JMP_IMM(BPF_JNE, R0, 0x34, len - i - 2);
-		i++;
-		insn[i++] = BPF_MOV64_REG(R1, R6);
-		insn[i++] = BPF_MOV64_IMM(R2, 1);
-		insn[i++] = BPF_MOV64_IMM(R3, 2);
-		insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
-					 bpf_skb_vlan_push_proto.func - __bpf_call_base);
-		insn[i] = BPF_JMP_IMM(BPF_JNE, R0, 0, len - i - 2);
-		i++;
-	}
-
-	for (j = 0; j < PUSH_CNT; j++) {
-		insn[i++] = BPF_LD_ABS(BPF_B, 0);
-		insn[i] = BPF_JMP_IMM(BPF_JNE, R0, 0x34, len - i - 2);
-		i++;
-		insn[i++] = BPF_MOV64_REG(R1, R6);
-		insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
-					 bpf_skb_vlan_pop_proto.func - __bpf_call_base);
-		insn[i] = BPF_JMP_IMM(BPF_JNE, R0, 0, len - i - 2);
-		i++;
-	}
-	if (++k < 5)
-		goto loop;
-
-	for (; i < len - 1; i++)
-		insn[i] = BPF_ALU32_IMM(BPF_MOV, R0, 0xbef);
-
-	insn[len - 1] = BPF_EXIT_INSN();
-
-	self->u.ptr.insns = insn;
-	self->u.ptr.len = len;
-
-	return 0;
-}
-
-static int bpf_fill_ld_abs_vlan_push_pop2(struct bpf_test *self)
-{
-	struct bpf_insn *insn;
-
-	insn = kmalloc_array(16, sizeof(*insn), GFP_KERNEL);
-	if (!insn)
-		return -ENOMEM;
-
-	/* Due to func address being non-const, we need to
-	 * assemble this here.
-	 */
-	insn[0] = BPF_MOV64_REG(R6, R1);
-	insn[1] = BPF_LD_ABS(BPF_B, 0);
-	insn[2] = BPF_LD_ABS(BPF_H, 0);
-	insn[3] = BPF_LD_ABS(BPF_W, 0);
-	insn[4] = BPF_MOV64_REG(R7, R6);
-	insn[5] = BPF_MOV64_IMM(R6, 0);
-	insn[6] = BPF_MOV64_REG(R1, R7);
-	insn[7] = BPF_MOV64_IMM(R2, 1);
-	insn[8] = BPF_MOV64_IMM(R3, 2);
-	insn[9] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
-			       bpf_skb_vlan_push_proto.func - __bpf_call_base);
-	insn[10] = BPF_MOV64_REG(R6, R7);
-	insn[11] = BPF_LD_ABS(BPF_B, 0);
-	insn[12] = BPF_LD_ABS(BPF_H, 0);
-	insn[13] = BPF_LD_ABS(BPF_W, 0);
-	insn[14] = BPF_MOV64_IMM(R0, 42);
-	insn[15] = BPF_EXIT_INSN();
-
-	self->u.ptr.insns = insn;
-	self->u.ptr.len = 16;
-
-	return 0;
-}
-
-static int bpf_fill_jump_around_ld_abs(struct bpf_test *self)
-{
-	unsigned int len = BPF_MAXINSNS;
-	struct bpf_insn *insn;
-	int i = 0;
-
-	insn = kmalloc_array(len, sizeof(*insn), GFP_KERNEL);
-	if (!insn)
-		return -ENOMEM;
-
-	insn[i++] = BPF_MOV64_REG(R6, R1);
-	insn[i++] = BPF_LD_ABS(BPF_B, 0);
-	insn[i] = BPF_JMP_IMM(BPF_JEQ, R0, 10, len - i - 2);
-	i++;
-	while (i < len - 1)
-		insn[i++] = BPF_LD_ABS(BPF_B, 1);
-	insn[i] = BPF_EXIT_INSN();
-
-	self->u.ptr.insns = insn;
-	self->u.ptr.len = len;
-
-	return 0;
-}
-
 static int __bpf_fill_stxdw(struct bpf_test *self, int size)
 {
 	unsigned int len = BPF_MAXINSNS;
@@ -1987,40 +1877,6 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, -1 } }
 	},
-	{
-		"INT: DIV + ABS",
-		.u.insns_int = {
-			BPF_ALU64_REG(BPF_MOV, R6, R1),
-			BPF_LD_ABS(BPF_B, 3),
-			BPF_ALU64_IMM(BPF_MOV, R2, 2),
-			BPF_ALU32_REG(BPF_DIV, R0, R2),
-			BPF_ALU64_REG(BPF_MOV, R8, R0),
-			BPF_LD_ABS(BPF_B, 4),
-			BPF_ALU64_REG(BPF_ADD, R8, R0),
-			BPF_LD_IND(BPF_B, R8, -70),
-			BPF_EXIT_INSN(),
-		},
-		INTERNAL,
-		{ 10, 20, 30, 40, 50 },
-		{ { 4, 0 }, { 5, 10 } }
-	},
-	{
-		/* This one doesn't go through verifier, but is just raw insn
-		 * as opposed to cBPF tests from here. Thus div by 0 tests are
-		 * done in test_verifier in BPF kselftests.
-		 */
-		"INT: DIV by -1",
-		.u.insns_int = {
-			BPF_ALU64_REG(BPF_MOV, R6, R1),
-			BPF_ALU64_IMM(BPF_MOV, R7, -1),
-			BPF_LD_ABS(BPF_B, 3),
-			BPF_ALU32_REG(BPF_DIV, R0, R7),
-			BPF_EXIT_INSN(),
-		},
-		INTERNAL,
-		{ 10, 20, 30, 40, 50 },
-		{ { 3, 0 }, { 4, 0 } }
-	},
 	{
 		"check: missing ret",
 		.u.insns = {
@@ -2383,50 +2239,6 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 1 } }
 	},
-	{
-		"nmap reduced",
-		.u.insns_int = {
-			BPF_MOV64_REG(R6, R1),
-			BPF_LD_ABS(BPF_H, 12),
-			BPF_JMP_IMM(BPF_JNE, R0, 0x806, 28),
-			BPF_LD_ABS(BPF_H, 12),
-			BPF_JMP_IMM(BPF_JNE, R0, 0x806, 26),
-			BPF_MOV32_IMM(R0, 18),
-			BPF_STX_MEM(BPF_W, R10, R0, -64),
-			BPF_LDX_MEM(BPF_W, R7, R10, -64),
-			BPF_LD_IND(BPF_W, R7, 14),
-			BPF_STX_MEM(BPF_W, R10, R0, -60),
-			BPF_MOV32_IMM(R0, 280971478),
-			BPF_STX_MEM(BPF_W, R10, R0, -56),
-			BPF_LDX_MEM(BPF_W, R7, R10, -56),
-			BPF_LDX_MEM(BPF_W, R0, R10, -60),
-			BPF_ALU32_REG(BPF_SUB, R0, R7),
-			BPF_JMP_IMM(BPF_JNE, R0, 0, 15),
-			BPF_LD_ABS(BPF_H, 12),
-			BPF_JMP_IMM(BPF_JNE, R0, 0x806, 13),
-			BPF_MOV32_IMM(R0, 22),
-			BPF_STX_MEM(BPF_W, R10, R0, -56),
-			BPF_LDX_MEM(BPF_W, R7, R10, -56),
-			BPF_LD_IND(BPF_H, R7, 14),
-			BPF_STX_MEM(BPF_W, R10, R0, -52),
-			BPF_MOV32_IMM(R0, 17366),
-			BPF_STX_MEM(BPF_W, R10, R0, -48),
-			BPF_LDX_MEM(BPF_W, R7, R10, -48),
-			BPF_LDX_MEM(BPF_W, R0, R10, -52),
-			BPF_ALU32_REG(BPF_SUB, R0, R7),
-			BPF_JMP_IMM(BPF_JNE, R0, 0, 2),
-			BPF_MOV32_IMM(R0, 256),
-			BPF_EXIT_INSN(),
-			BPF_MOV32_IMM(R0, 0),
-			BPF_EXIT_INSN(),
-		},
-		INTERNAL,
-		{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x06, 0, 0,
-		  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-		  0x10, 0xbf, 0x48, 0xd6, 0x43, 0xd6},
-		{ { 38, 256 } },
-		.stack_depth = 64,
-	},
 	/* BPF_ALU | BPF_MOV | BPF_X */
 	{
 		"ALU_MOV_X: dst = 2",
@@ -5485,22 +5297,6 @@ static struct bpf_test tests[] = {
 		{ { 1, 0xbee } },
 		.fill_helper = bpf_fill_ld_abs_get_processor_id,
 	},
-	{
-		"BPF_MAXINSNS: ld_abs+vlan_push/pop",
-		{ },
-		INTERNAL,
-		{ 0x34 },
-		{ { ETH_HLEN, 0xbef } },
-		.fill_helper = bpf_fill_ld_abs_vlan_push_pop,
-	},
-	{
-		"BPF_MAXINSNS: jump around ld_abs",
-		{ },
-		INTERNAL,
-		{ 10, 11 },
-		{ { 2, 10 } },
-		.fill_helper = bpf_fill_jump_around_ld_abs,
-	},
 	/*
 	 * LD_IND / LD_ABS on fragmented SKBs
 	 */
@@ -5682,6 +5478,53 @@ static struct bpf_test tests[] = {
 		{ [0x3c] = 0x25, [0x3d] = 0x05,  [0x3e] = 0x19, [0x3f] = 0x82 },
 		{ {0x40, 0x05 } },
 	},
+	{
+		"LD_IND byte positive offset, all ff",
+		.u.insns = {
+			BPF_STMT(BPF_LDX | BPF_IMM, 0x3e),
+			BPF_STMT(BPF_LD | BPF_IND | BPF_B, 0x1),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0xff, [0x3d] = 0xff,  [0x3e] = 0xff, [0x3f] = 0xff },
+		{ {0x40, 0xff } },
+	},
+	{
+		"LD_IND byte positive offset, out of bounds",
+		.u.insns = {
+			BPF_STMT(BPF_LDX | BPF_IMM, 0x3e),
+			BPF_STMT(BPF_LD | BPF_IND | BPF_B, 0x1),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0x25, [0x3d] = 0x05,  [0x3e] = 0x19, [0x3f] = 0x82 },
+		{ {0x3f, 0 }, },
+	},
+	{
+		"LD_IND byte negative offset, out of bounds",
+		.u.insns = {
+			BPF_STMT(BPF_LDX | BPF_IMM, 0x3e),
+			BPF_STMT(BPF_LD | BPF_IND | BPF_B, -0x3f),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0x25, [0x3d] = 0x05,  [0x3e] = 0x19, [0x3f] = 0x82 },
+		{ {0x3f, 0 } },
+	},
+	{
+		"LD_IND byte negative offset, multiple calls",
+		.u.insns = {
+			BPF_STMT(BPF_LDX | BPF_IMM, 0x3b),
+			BPF_STMT(BPF_LD | BPF_IND | BPF_B, SKF_LL_OFF + 1),
+			BPF_STMT(BPF_LD | BPF_IND | BPF_B, SKF_LL_OFF + 2),
+			BPF_STMT(BPF_LD | BPF_IND | BPF_B, SKF_LL_OFF + 3),
+			BPF_STMT(BPF_LD | BPF_IND | BPF_B, SKF_LL_OFF + 4),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0x25, [0x3d] = 0x05,  [0x3e] = 0x19, [0x3f] = 0x82 },
+		{ {0x40, 0x82 }, },
+	},
 	{
 		"LD_IND halfword positive offset",
 		.u.insns = {
@@ -5730,6 +5573,39 @@ static struct bpf_test tests[] = {
 		},
 		{ {0x40, 0x66cc } },
 	},
+	{
+		"LD_IND halfword positive offset, all ff",
+		.u.insns = {
+			BPF_STMT(BPF_LDX | BPF_IMM, 0x3d),
+			BPF_STMT(BPF_LD | BPF_IND | BPF_H, 0x1),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0xff, [0x3d] = 0xff,  [0x3e] = 0xff, [0x3f] = 0xff },
+		{ {0x40, 0xffff } },
+	},
+	{
+		"LD_IND halfword positive offset, out of bounds",
+		.u.insns = {
+			BPF_STMT(BPF_LDX | BPF_IMM, 0x3e),
+			BPF_STMT(BPF_LD | BPF_IND | BPF_H, 0x1),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0x25, [0x3d] = 0x05,  [0x3e] = 0x19, [0x3f] = 0x82 },
+		{ {0x3f, 0 }, },
+	},
+	{
+		"LD_IND halfword negative offset, out of bounds",
+		.u.insns = {
+			BPF_STMT(BPF_LDX | BPF_IMM, 0x3e),
+			BPF_STMT(BPF_LD | BPF_IND | BPF_H, -0x3f),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0x25, [0x3d] = 0x05,  [0x3e] = 0x19, [0x3f] = 0x82 },
+		{ {0x3f, 0 } },
+	},
 	{
 		"LD_IND word positive offset",
 		.u.insns = {
@@ -5820,6 +5696,39 @@ static struct bpf_test tests[] = {
 		},
 		{ {0x40, 0x66cc77dd } },
 	},
+	{
+		"LD_IND word positive offset, all ff",
+		.u.insns = {
+			BPF_STMT(BPF_LDX | BPF_IMM, 0x3b),
+			BPF_STMT(BPF_LD | BPF_IND | BPF_W, 0x1),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0xff, [0x3d] = 0xff,  [0x3e] = 0xff, [0x3f] = 0xff },
+		{ {0x40, 0xffffffff } },
+	},
+	{
+		"LD_IND word positive offset, out of bounds",
+		.u.insns = {
+			BPF_STMT(BPF_LDX | BPF_IMM, 0x3e),
+			BPF_STMT(BPF_LD | BPF_IND | BPF_W, 0x1),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0x25, [0x3d] = 0x05,  [0x3e] = 0x19, [0x3f] = 0x82 },
+		{ {0x3f, 0 }, },
+	},
+	{
+		"LD_IND word negative offset, out of bounds",
+		.u.insns = {
+			BPF_STMT(BPF_LDX | BPF_IMM, 0x3e),
+			BPF_STMT(BPF_LD | BPF_IND | BPF_W, -0x3f),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0x25, [0x3d] = 0x05,  [0x3e] = 0x19, [0x3f] = 0x82 },
+		{ {0x3f, 0 } },
+	},
 	{
 		"LD_ABS byte",
 		.u.insns = {
@@ -5837,6 +5746,68 @@ static struct bpf_test tests[] = {
 		},
 		{ {0x40, 0xcc } },
 	},
+	{
+		"LD_ABS byte positive offset, all ff",
+		.u.insns = {
+			BPF_STMT(BPF_LD | BPF_ABS | BPF_B, 0x3f),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0xff, [0x3d] = 0xff,  [0x3e] = 0xff, [0x3f] = 0xff },
+		{ {0x40, 0xff } },
+	},
+	{
+		"LD_ABS byte positive offset, out of bounds",
+		.u.insns = {
+			BPF_STMT(BPF_LD | BPF_ABS | BPF_B, 0x3f),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0x25, [0x3d] = 0x05,  [0x3e] = 0x19, [0x3f] = 0x82 },
+		{ {0x3f, 0 }, },
+	},
+	{
+		"LD_ABS byte negative offset, out of bounds load",
+		.u.insns = {
+			BPF_STMT(BPF_LD | BPF_ABS | BPF_B, -1),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC | FLAG_EXPECTED_FAIL,
+		.expected_errcode = -EINVAL,
+	},
+	{
+		"LD_ABS byte negative offset, in bounds",
+		.u.insns = {
+			BPF_STMT(BPF_LD | BPF_ABS | BPF_B, SKF_LL_OFF + 0x3f),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0x25, [0x3d] = 0x05,  [0x3e] = 0x19, [0x3f] = 0x82 },
+		{ {0x40, 0x82 }, },
+	},
+	{
+		"LD_ABS byte negative offset, out of bounds",
+		.u.insns = {
+			BPF_STMT(BPF_LD | BPF_ABS | BPF_B, SKF_LL_OFF + 0x3f),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0x25, [0x3d] = 0x05,  [0x3e] = 0x19, [0x3f] = 0x82 },
+		{ {0x3f, 0 }, },
+	},
+	{
+		"LD_ABS byte negative offset, multiple calls",
+		.u.insns = {
+			BPF_STMT(BPF_LD | BPF_ABS | BPF_B, SKF_LL_OFF + 0x3c),
+			BPF_STMT(BPF_LD | BPF_ABS | BPF_B, SKF_LL_OFF + 0x3d),
+			BPF_STMT(BPF_LD | BPF_ABS | BPF_B, SKF_LL_OFF + 0x3e),
+			BPF_STMT(BPF_LD | BPF_ABS | BPF_B, SKF_LL_OFF + 0x3f),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0x25, [0x3d] = 0x05,  [0x3e] = 0x19, [0x3f] = 0x82 },
+		{ {0x40, 0x82 }, },
+	},
 	{
 		"LD_ABS halfword",
 		.u.insns = {
@@ -5871,6 +5842,55 @@ static struct bpf_test tests[] = {
 		},
 		{ {0x40, 0x99ff } },
 	},
+	{
+		"LD_ABS halfword positive offset, all ff",
+		.u.insns = {
+			BPF_STMT(BPF_LD | BPF_ABS | BPF_H, 0x3e),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0xff, [0x3d] = 0xff,  [0x3e] = 0xff, [0x3f] = 0xff },
+		{ {0x40, 0xffff } },
+	},
+	{
+		"LD_ABS halfword positive offset, out of bounds",
+		.u.insns = {
+			BPF_STMT(BPF_LD | BPF_ABS | BPF_H, 0x3f),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0x25, [0x3d] = 0x05,  [0x3e] = 0x19, [0x3f] = 0x82 },
+		{ {0x3f, 0 }, },
+	},
+	{
+		"LD_ABS halfword negative offset, out of bounds load",
+		.u.insns = {
+			BPF_STMT(BPF_LD | BPF_ABS | BPF_H, -1),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC | FLAG_EXPECTED_FAIL,
+		.expected_errcode = -EINVAL,
+	},
+	{
+		"LD_ABS halfword negative offset, in bounds",
+		.u.insns = {
+			BPF_STMT(BPF_LD | BPF_ABS | BPF_H, SKF_LL_OFF + 0x3e),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0x25, [0x3d] = 0x05,  [0x3e] = 0x19, [0x3f] = 0x82 },
+		{ {0x40, 0x1982 }, },
+	},
+	{
+		"LD_ABS halfword negative offset, out of bounds",
+		.u.insns = {
+			BPF_STMT(BPF_LD | BPF_ABS | BPF_H, SKF_LL_OFF + 0x3e),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0x25, [0x3d] = 0x05,  [0x3e] = 0x19, [0x3f] = 0x82 },
+		{ {0x3f, 0 }, },
+	},
 	{
 		"LD_ABS word",
 		.u.insns = {
@@ -5939,6 +5959,140 @@ static struct bpf_test tests[] = {
 		},
 		{ {0x40, 0x88ee99ff } },
 	},
+	{
+		"LD_ABS word positive offset, all ff",
+		.u.insns = {
+			BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 0x3c),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0xff, [0x3d] = 0xff,  [0x3e] = 0xff, [0x3f] = 0xff },
+		{ {0x40, 0xffffffff } },
+	},
+	{
+		"LD_ABS word positive offset, out of bounds",
+		.u.insns = {
+			BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 0x3f),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0x25, [0x3d] = 0x05,  [0x3e] = 0x19, [0x3f] = 0x82 },
+		{ {0x3f, 0 }, },
+	},
+	{
+		"LD_ABS word negative offset, out of bounds load",
+		.u.insns = {
+			BPF_STMT(BPF_LD | BPF_ABS | BPF_W, -1),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC | FLAG_EXPECTED_FAIL,
+		.expected_errcode = -EINVAL,
+	},
+	{
+		"LD_ABS word negative offset, in bounds",
+		.u.insns = {
+			BPF_STMT(BPF_LD | BPF_ABS | BPF_W, SKF_LL_OFF + 0x3c),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0x25, [0x3d] = 0x05,  [0x3e] = 0x19, [0x3f] = 0x82 },
+		{ {0x40, 0x25051982 }, },
+	},
+	{
+		"LD_ABS word negative offset, out of bounds",
+		.u.insns = {
+			BPF_STMT(BPF_LD | BPF_ABS | BPF_W, SKF_LL_OFF + 0x3c),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0x25, [0x3d] = 0x05,  [0x3e] = 0x19, [0x3f] = 0x82 },
+		{ {0x3f, 0 }, },
+	},
+	{
+		"LDX_MSH standalone, preserved A",
+		.u.insns = {
+			BPF_STMT(BPF_LD | BPF_IMM, 0xffeebbaa),
+			BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3c),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0x25, [0x3d] = 0x05,  [0x3e] = 0x19, [0x3f] = 0x82 },
+		{ {0x40, 0xffeebbaa }, },
+	},
+	{
+		"LDX_MSH standalone, preserved A 2",
+		.u.insns = {
+			BPF_STMT(BPF_LD | BPF_IMM, 0x175e9d63),
+			BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3c),
+			BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3d),
+			BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3e),
+			BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3f),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0x25, [0x3d] = 0x05,  [0x3e] = 0x19, [0x3f] = 0x82 },
+		{ {0x40, 0x175e9d63 }, },
+	},
+	{
+		"LDX_MSH standalone, test result 1",
+		.u.insns = {
+			BPF_STMT(BPF_LD | BPF_IMM, 0xffeebbaa),
+			BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3c),
+			BPF_STMT(BPF_MISC | BPF_TXA, 0),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0x25, [0x3d] = 0x05,  [0x3e] = 0x19, [0x3f] = 0x82 },
+		{ {0x40, 0x14 }, },
+	},
+	{
+		"LDX_MSH standalone, test result 2",
+		.u.insns = {
+			BPF_STMT(BPF_LD | BPF_IMM, 0xffeebbaa),
+			BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3e),
+			BPF_STMT(BPF_MISC | BPF_TXA, 0),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0x25, [0x3d] = 0x05,  [0x3e] = 0x19, [0x3f] = 0x82 },
+		{ {0x40, 0x24 }, },
+	},
+	{
+		"LDX_MSH standalone, negative offset",
+		.u.insns = {
+			BPF_STMT(BPF_LD | BPF_IMM, 0xffeebbaa),
+			BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, -1),
+			BPF_STMT(BPF_MISC | BPF_TXA, 0),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0x25, [0x3d] = 0x05,  [0x3e] = 0x19, [0x3f] = 0x82 },
+		{ {0x40, 0 }, },
+	},
+	{
+		"LDX_MSH standalone, negative offset 2",
+		.u.insns = {
+			BPF_STMT(BPF_LD | BPF_IMM, 0xffeebbaa),
+			BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, SKF_LL_OFF + 0x3e),
+			BPF_STMT(BPF_MISC | BPF_TXA, 0),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0x25, [0x3d] = 0x05,  [0x3e] = 0x19, [0x3f] = 0x82 },
+		{ {0x40, 0x24 }, },
+	},
+	{
+		"LDX_MSH standalone, out of bounds",
+		.u.insns = {
+			BPF_STMT(BPF_LD | BPF_IMM, 0xffeebbaa),
+			BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x40),
+			BPF_STMT(BPF_MISC | BPF_TXA, 0),
+			BPF_STMT(BPF_RET | BPF_A, 0x0),
+		},
+		CLASSIC,
+		{ [0x3c] = 0x25, [0x3d] = 0x05,  [0x3e] = 0x19, [0x3f] = 0x82 },
+		{ {0x40, 0 }, },
+	},
 	/*
 	 * verify that the interpreter or JIT correctly sets A and X
 	 * to 0.
@@ -6127,14 +6281,6 @@ static struct bpf_test tests[] = {
 		{},
 		{ {0x1, 0x42 } },
 	},
-	{
-		"LD_ABS with helper changing skb data",
-		{ },
-		INTERNAL,
-		{ 0x34 },
-		{ { ETH_HLEN, 42 } },
-		.fill_helper = bpf_fill_ld_abs_vlan_push_pop2,
-	},
 	/* Checking interpreter vs JIT wrt signed extended imms. */
 	{
 		"JNE signed compare, test 1",
diff --git a/net/core/filter.c b/net/core/filter.c
index c33595a8d604..865500f6180d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2181,7 +2181,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
 	return ret;
 }
 
-const struct bpf_func_proto bpf_skb_vlan_push_proto = {
+static const struct bpf_func_proto bpf_skb_vlan_push_proto = {
 	.func           = bpf_skb_vlan_push,
 	.gpl_only       = false,
 	.ret_type       = RET_INTEGER,
@@ -2189,7 +2189,6 @@ const struct bpf_func_proto bpf_skb_vlan_push_proto = {
 	.arg2_type      = ARG_ANYTHING,
 	.arg3_type      = ARG_ANYTHING,
 };
-EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto);
 
 BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
 {
@@ -2203,13 +2202,12 @@ BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
 	return ret;
 }
 
-const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
+static const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
 	.func           = bpf_skb_vlan_pop,
 	.gpl_only       = false,
 	.ret_type       = RET_INTEGER,
 	.arg1_type      = ARG_PTR_TO_CTX,
 };
-EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto);
 
 static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
 {
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 1acafe26498b..275b4570b5b8 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -47,7 +47,7 @@
 # define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 #endif
 
-#define MAX_INSNS	512
+#define MAX_INSNS	BPF_MAXINSNS
 #define MAX_FIXUPS	8
 #define MAX_NR_MAPS	4
 #define POINTER_VALUE	0xcafe4all
@@ -77,6 +77,8 @@ struct bpf_test {
 	} result, result_unpriv;
 	enum bpf_prog_type prog_type;
 	uint8_t flags;
+	__u8 data[TEST_DATA_LEN];
+	void (*fill_helper)(struct bpf_test *self);
 };
 
 /* Note we want this to be 64 bit aligned so that the end of our array is
@@ -94,6 +96,62 @@ struct other_val {
 	long long bar;
 };
 
+static void bpf_fill_ld_abs_vlan_push_pop(struct bpf_test *self)
+{
+	/* test: {skb->data[0], vlan_push} x 68 + {skb->data[0], vlan_pop} x 68 */
+#define PUSH_CNT 51
+	unsigned int len = BPF_MAXINSNS;
+	struct bpf_insn *insn = self->insns;
+	int i = 0, j, k = 0;
+
+	insn[i++] = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
+loop:
+	for (j = 0; j < PUSH_CNT; j++) {
+		insn[i++] = BPF_LD_ABS(BPF_B, 0);
+		insn[i] = BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0x34, len - i - 2);
+		i++;
+		insn[i++] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
+		insn[i++] = BPF_MOV64_IMM(BPF_REG_2, 1);
+		insn[i++] = BPF_MOV64_IMM(BPF_REG_3, 2);
+		insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+					 BPF_FUNC_skb_vlan_push),
+		insn[i] = BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, len - i - 2);
+		i++;
+	}
+
+	for (j = 0; j < PUSH_CNT; j++) {
+		insn[i++] = BPF_LD_ABS(BPF_B, 0);
+		insn[i] = BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0x34, len - i - 2);
+		i++;
+		insn[i++] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
+		insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+					 BPF_FUNC_skb_vlan_pop),
+		insn[i] = BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, len - i - 2);
+		i++;
+	}
+	if (++k < 5)
+		goto loop;
+
+	for (; i < len - 1; i++)
+		insn[i] = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 0xbef);
+	insn[len - 1] = BPF_EXIT_INSN();
+}
+
+static void bpf_fill_jump_around_ld_abs(struct bpf_test *self)
+{
+	struct bpf_insn *insn = self->insns;
+	unsigned int len = BPF_MAXINSNS;
+	int i = 0;
+
+	insn[i++] = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
+	insn[i++] = BPF_LD_ABS(BPF_B, 0);
+	insn[i] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 10, len - i - 2);
+	i++;
+	while (i < len - 1)
+		insn[i++] = BPF_LD_ABS(BPF_B, 1);
+	insn[i] = BPF_EXIT_INSN();
+}
+
 static struct bpf_test tests[] = {
 	{
 		"add+sub+mul",
@@ -11725,6 +11783,197 @@ static struct bpf_test tests[] = {
 		.result = ACCEPT,
 		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
 	},
+	{
+		"ld_abs: invalid op 1",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_LD_ABS(BPF_DW, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = REJECT,
+		.errstr = "unknown opcode",
+	},
+	{
+		"ld_abs: invalid op 2",
+		.insns = {
+			BPF_MOV32_IMM(BPF_REG_0, 256),
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_LD_IND(BPF_DW, BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = REJECT,
+		.errstr = "unknown opcode",
+	},
+	{
+		"ld_abs: nmap reduced",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_LD_ABS(BPF_H, 12),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0x806, 28),
+			BPF_LD_ABS(BPF_H, 12),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0x806, 26),
+			BPF_MOV32_IMM(BPF_REG_0, 18),
+			BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -64),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_10, -64),
+			BPF_LD_IND(BPF_W, BPF_REG_7, 14),
+			BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -60),
+			BPF_MOV32_IMM(BPF_REG_0, 280971478),
+			BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -56),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_10, -56),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -60),
+			BPF_ALU32_REG(BPF_SUB, BPF_REG_0, BPF_REG_7),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 15),
+			BPF_LD_ABS(BPF_H, 12),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0x806, 13),
+			BPF_MOV32_IMM(BPF_REG_0, 22),
+			BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -56),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_10, -56),
+			BPF_LD_IND(BPF_H, BPF_REG_7, 14),
+			BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -52),
+			BPF_MOV32_IMM(BPF_REG_0, 17366),
+			BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -48),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_10, -48),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -52),
+			BPF_ALU32_REG(BPF_SUB, BPF_REG_0, BPF_REG_7),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+			BPF_MOV32_IMM(BPF_REG_0, 256),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.data = {
+			0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x06, 0,
+			0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+			0x10, 0xbf, 0x48, 0xd6, 0x43, 0xd6,
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 256,
+	},
+	{
+		"ld_abs: div + abs, test 1",
+		.insns = {
+			BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1),
+			BPF_LD_ABS(BPF_B, 3),
+			BPF_ALU64_IMM(BPF_MOV, BPF_REG_2, 2),
+			BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_2),
+			BPF_ALU64_REG(BPF_MOV, BPF_REG_8, BPF_REG_0),
+			BPF_LD_ABS(BPF_B, 4),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_8, BPF_REG_0),
+			BPF_LD_IND(BPF_B, BPF_REG_8, -70),
+			BPF_EXIT_INSN(),
+		},
+		.data = {
+			10, 20, 30, 40, 50,
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 10,
+	},
+	{
+		"ld_abs: div + abs, test 2",
+		.insns = {
+			BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1),
+			BPF_LD_ABS(BPF_B, 3),
+			BPF_ALU64_IMM(BPF_MOV, BPF_REG_2, 2),
+			BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_2),
+			BPF_ALU64_REG(BPF_MOV, BPF_REG_8, BPF_REG_0),
+			BPF_LD_ABS(BPF_B, 128),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_8, BPF_REG_0),
+			BPF_LD_IND(BPF_B, BPF_REG_8, -70),
+			BPF_EXIT_INSN(),
+		},
+		.data = {
+			10, 20, 30, 40, 50,
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 0,
+	},
+	{
+		"ld_abs: div + abs, test 3",
+		.insns = {
+			BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1),
+			BPF_ALU64_IMM(BPF_MOV, BPF_REG_7, 0),
+			BPF_LD_ABS(BPF_B, 3),
+			BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_7),
+			BPF_EXIT_INSN(),
+		},
+		.data = {
+			10, 20, 30, 40, 50,
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 0,
+	},
+	{
+		"ld_abs: div + abs, test 4",
+		.insns = {
+			BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1),
+			BPF_ALU64_IMM(BPF_MOV, BPF_REG_7, 0),
+			BPF_LD_ABS(BPF_B, 256),
+			BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_7),
+			BPF_EXIT_INSN(),
+		},
+		.data = {
+			10, 20, 30, 40, 50,
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 0,
+	},
+	{
+		"ld_abs: vlan + abs, test 1",
+		.insns = { },
+		.data = {
+			0x34,
+		},
+		.fill_helper = bpf_fill_ld_abs_vlan_push_pop,
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 0xbef,
+	},
+	{
+		"ld_abs: vlan + abs, test 2",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_LD_ABS(BPF_B, 0),
+			BPF_LD_ABS(BPF_H, 0),
+			BPF_LD_ABS(BPF_W, 0),
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_6),
+			BPF_MOV64_IMM(BPF_REG_6, 0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+			BPF_MOV64_IMM(BPF_REG_2, 1),
+			BPF_MOV64_IMM(BPF_REG_3, 2),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_skb_vlan_push),
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_7),
+			BPF_LD_ABS(BPF_B, 0),
+			BPF_LD_ABS(BPF_H, 0),
+			BPF_LD_ABS(BPF_W, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 42),
+			BPF_EXIT_INSN(),
+		},
+		.data = {
+			0x34,
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 42,
+	},
+	{
+		"ld_abs: jump around ld_abs",
+		.insns = { },
+		.data = {
+			10, 11,
+		},
+		.fill_helper = bpf_fill_jump_around_ld_abs,
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 10,
+	},
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)
@@ -11828,7 +12077,7 @@ static int create_map_in_map(void)
 	return outer_map_fd;
 }
 
-static char bpf_vlog[32768];
+static char bpf_vlog[UINT_MAX >> 8];
 
 static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog,
 			  int *map_fds)
@@ -11839,6 +12088,9 @@ static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog,
 	int *fixup_prog = test->fixup_prog;
 	int *fixup_map_in_map = test->fixup_map_in_map;
 
+	if (test->fill_helper)
+		test->fill_helper(test);
+
 	/* Allocating HTs with 1 elem is fine here, since we only test
 	 * for verifier and not do a runtime lookup, so the only thing
 	 * that really matters is value size in this case.
@@ -11888,10 +12140,8 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
 			   int *passes, int *errors)
 {
 	int fd_prog, expected_ret, reject_from_alignment;
+	int prog_len, prog_type = test->prog_type;
 	struct bpf_insn *prog = test->insns;
-	int prog_len = probe_filter_length(prog);
-	char data_in[TEST_DATA_LEN] = {};
-	int prog_type = test->prog_type;
 	int map_fds[MAX_NR_MAPS];
 	const char *expected_err;
 	uint32_t retval;
@@ -11901,6 +12151,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
 		map_fds[i] = -1;
 
 	do_test_fixup(test, prog, map_fds);
+	prog_len = probe_filter_length(prog);
 
 	fd_prog = bpf_verify_program(prog_type ? : BPF_PROG_TYPE_SOCKET_FILTER,
 				     prog, prog_len, test->flags & F_LOAD_WITH_STRICT_ALIGNMENT,
@@ -11940,8 +12191,9 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
 	}
 
 	if (fd_prog >= 0) {
-		err = bpf_prog_test_run(fd_prog, 1, data_in, sizeof(data_in),
-					NULL, NULL, &retval, NULL);
+		err = bpf_prog_test_run(fd_prog, 1, test->data,
+					sizeof(test->data), NULL, NULL,
+					&retval, NULL);
 		if (err && errno != 524/*ENOTSUPP*/ && errno != EPERM) {
 			printf("Unexpected bpf_prog_test_run error\n");
 			goto fail_log;
-- 
cgit v1.2.3-59-g8ed1b


From e0cea7ce988cf48cc4052235d2ad2550b3bc4fa0 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 May 2018 01:08:14 +0200
Subject: bpf: implement ld_abs/ld_ind in native bpf

The main part of this work is to finally allow removal of LD_ABS
and LD_IND from the BPF core by reimplementing them through native
eBPF instead. Both LD_ABS/LD_IND were carried over from cBPF and
keeping them around in native eBPF caused way more trouble than
actually worth it. To just list some of the security issues in
the past:

  * fdfaf64e7539 ("x86: bpf_jit: support negative offsets")
  * 35607b02dbef ("sparc: bpf_jit: fix loads from negative offsets")
  * e0ee9c12157d ("x86: bpf_jit: fix two bugs in eBPF JIT compiler")
  * 07aee9439454 ("bpf, sparc: fix usage of wrong reg for load_skb_regs after call")
  * 6d59b7dbf72e ("bpf, s390x: do not reload skb pointers in non-skb context")
  * 87338c8e2cbb ("bpf, ppc64: do not reload skb pointers in non-skb context")

For programs in native eBPF, LD_ABS/LD_IND are pretty much legacy
these days due to their limitations and more efficient/flexible
alternatives that have been developed over time such as direct
packet access. LD_ABS/LD_IND only cover 1/2/4 byte loads into a
register, the load happens in host endianness and its exception
handling can yield unexpected behavior. The latter is explained
in depth in f6b1b3bf0d5f ("bpf: fix subprog verifier bypass by
div/mod by 0 exception") with similar cases of exceptions we had.
In native eBPF more recent program types will disable LD_ABS/LD_IND
altogether through may_access_skb() in verifier, and given the
limitations in terms of exception handling, it's also disabled
in programs that use BPF to BPF calls.

In terms of cBPF, the LD_ABS/LD_IND is used in networking programs
to access packet data. It is not used in seccomp-BPF but programs
that use it for socket filtering or reuseport for demuxing with
cBPF. This is mostly relevant for applications that have not yet
migrated to native eBPF.

The main complexity and source of bugs in LD_ABS/LD_IND is coming
from their implementation in the various JITs. Most of them keep
the model around from cBPF times by implementing a fastpath written
in asm. They use typically two from the BPF program hidden CPU
registers for caching the skb's headlen (skb->len - skb->data_len)
and skb->data. Throughout the JIT phase this requires to keep track
whether LD_ABS/LD_IND are used and if so, the two registers need
to be recached each time a BPF helper would change the underlying
packet data in native eBPF case. At least in eBPF case, available
CPU registers are rare and the additional exit path out of the
asm written JIT helper makes it also inflexible since not all
parts of the JITer are in control from plain C. A LD_ABS/LD_IND
implementation in eBPF therefore allows to significantly reduce
the complexity in JITs with comparable performance results for
them, e.g.:

test_bpf             tcpdump port 22             tcpdump complex
x64      - before    15 21 10                    14 19  18
         - after      7 10 10                     7 10  15
arm64    - before    40 91 92                    40 91 151
         - after     51 64 73                    51 62 113

For cBPF we now track any usage of LD_ABS/LD_IND in bpf_convert_filter()
and cache the skb's headlen and data in the cBPF prologue. The
BPF_REG_TMP gets remapped from R8 to R2 since it's mainly just
used as a local temporary variable. This allows to shrink the
image on x86_64 also for seccomp programs slightly since mapping
to %rsi is not an ereg. In callee-saved R8 and R9 we now track
skb data and headlen, respectively. For normal prologue emission
in the JITs this does not add any extra instructions since R8, R9
are pushed to stack in any case from eBPF side. cBPF uses the
convert_bpf_ld_abs() emitter which probes the fast path inline
already and falls back to bpf_skb_load_helper_{8,16,32}() helper
relying on the cached skb data and headlen as well. R8 and R9
never need to be reloaded due to bpf_helper_changes_pkt_data()
since all skb access in cBPF is read-only. Then, for the case
of native eBPF, we use the bpf_gen_ld_abs() emitter, which calls
the bpf_skb_load_helper_{8,16,32}_no_cache() helper unconditionally,
does neither cache skb data and headlen nor has an inlined fast
path. The reason for the latter is that native eBPF does not have
any extra registers available anyway, but even if there were, it
avoids any reload of skb data and headlen in the first place.
Additionally, for the negative offsets, we provide an alternative
bpf_skb_load_bytes_relative() helper in eBPF which operates
similarly as bpf_skb_load_bytes() and allows for more flexibility.
Tested myself on x64, arm64, s390x, from Sandipan on ppc64.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h    |   2 +
 include/linux/filter.h |   4 +-
 kernel/bpf/core.c      |  96 ++------------------
 kernel/bpf/verifier.c  |  24 +++++
 net/core/filter.c      | 236 ++++++++++++++++++++++++++++++++++++++++++++++---
 5 files changed, 262 insertions(+), 100 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d0e3d7ef36a8..0e00a13ff01b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -235,6 +235,8 @@ struct bpf_verifier_ops {
 				struct bpf_insn_access_aux *info);
 	int (*gen_prologue)(struct bpf_insn *insn, bool direct_write,
 			    const struct bpf_prog *prog);
+	int (*gen_ld_abs)(const struct bpf_insn *orig,
+			  struct bpf_insn *insn_buf);
 	u32 (*convert_ctx_access)(enum bpf_access_type type,
 				  const struct bpf_insn *src,
 				  struct bpf_insn *dst,
diff --git a/include/linux/filter.h b/include/linux/filter.h
index b7f81e3a70cb..da7e16523128 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -47,7 +47,9 @@ struct xdp_buff;
 /* Additional register mappings for converted user programs. */
 #define BPF_REG_A	BPF_REG_0
 #define BPF_REG_X	BPF_REG_7
-#define BPF_REG_TMP	BPF_REG_8
+#define BPF_REG_TMP	BPF_REG_2	/* scratch reg */
+#define BPF_REG_D	BPF_REG_8	/* data, callee-saved */
+#define BPF_REG_H	BPF_REG_9	/* hlen, callee-saved */
 
 /* Kernel hidden auxiliary/helper register for hardening step.
  * Only used by eBPF JITs. It's nothing more than a temporary
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 90feeba3a1a1..1127552c8033 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -634,23 +634,6 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
 		*to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off);
 		break;
 
-	case BPF_LD | BPF_ABS | BPF_W:
-	case BPF_LD | BPF_ABS | BPF_H:
-	case BPF_LD | BPF_ABS | BPF_B:
-		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
-		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
-		*to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0);
-		break;
-
-	case BPF_LD | BPF_IND | BPF_W:
-	case BPF_LD | BPF_IND | BPF_H:
-	case BPF_LD | BPF_IND | BPF_B:
-		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
-		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
-		*to++ = BPF_ALU32_REG(BPF_ADD, BPF_REG_AX, from->src_reg);
-		*to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0);
-		break;
-
 	case BPF_LD | BPF_IMM | BPF_DW:
 		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm);
 		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
@@ -891,14 +874,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
 	INSN_3(LDX, MEM, W),			\
 	INSN_3(LDX, MEM, DW),			\
 	/*   Immediate based. */		\
-	INSN_3(LD, IMM, DW),			\
-	/*   Misc (old cBPF carry-over). */	\
-	INSN_3(LD, ABS, B),			\
-	INSN_3(LD, ABS, H),			\
-	INSN_3(LD, ABS, W),			\
-	INSN_3(LD, IND, B),			\
-	INSN_3(LD, IND, H),			\
-	INSN_3(LD, IND, W)
+	INSN_3(LD, IMM, DW)
 
 bool bpf_opcode_in_insntable(u8 code)
 {
@@ -908,6 +884,13 @@ bool bpf_opcode_in_insntable(u8 code)
 		[0 ... 255] = false,
 		/* Now overwrite non-defaults ... */
 		BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL),
+		/* UAPI exposed, but rewritten opcodes. cBPF carry-over. */
+		[BPF_LD | BPF_ABS | BPF_B] = true,
+		[BPF_LD | BPF_ABS | BPF_H] = true,
+		[BPF_LD | BPF_ABS | BPF_W] = true,
+		[BPF_LD | BPF_IND | BPF_B] = true,
+		[BPF_LD | BPF_IND | BPF_H] = true,
+		[BPF_LD | BPF_IND | BPF_W] = true,
 	};
 #undef BPF_INSN_3_TBL
 #undef BPF_INSN_2_TBL
@@ -938,8 +921,6 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
 #undef BPF_INSN_3_LBL
 #undef BPF_INSN_2_LBL
 	u32 tail_call_cnt = 0;
-	void *ptr;
-	int off;
 
 #define CONT	 ({ insn++; goto select_insn; })
 #define CONT_JMP ({ insn++; goto select_insn; })
@@ -1266,67 +1247,6 @@ out:
 		atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
 			     (DST + insn->off));
 		CONT;
-	LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
-		off = IMM;
-load_word:
-		/* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are only
-		 * appearing in the programs where ctx == skb
-		 * (see may_access_skb() in the verifier). All programs
-		 * keep 'ctx' in regs[BPF_REG_CTX] == BPF_R6,
-		 * bpf_convert_filter() saves it in BPF_R6, internal BPF
-		 * verifier will check that BPF_R6 == ctx.
-		 *
-		 * BPF_ABS and BPF_IND are wrappers of function calls,
-		 * so they scratch BPF_R1-BPF_R5 registers, preserve
-		 * BPF_R6-BPF_R9, and store return value into BPF_R0.
-		 *
-		 * Implicit input:
-		 *   ctx == skb == BPF_R6 == CTX
-		 *
-		 * Explicit input:
-		 *   SRC == any register
-		 *   IMM == 32-bit immediate
-		 *
-		 * Output:
-		 *   BPF_R0 - 8/16/32-bit skb data converted to cpu endianness
-		 */
-
-		ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp);
-		if (likely(ptr != NULL)) {
-			BPF_R0 = get_unaligned_be32(ptr);
-			CONT;
-		}
-
-		return 0;
-	LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */
-		off = IMM;
-load_half:
-		ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp);
-		if (likely(ptr != NULL)) {
-			BPF_R0 = get_unaligned_be16(ptr);
-			CONT;
-		}
-
-		return 0;
-	LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */
-		off = IMM;
-load_byte:
-		ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp);
-		if (likely(ptr != NULL)) {
-			BPF_R0 = *(u8 *)ptr;
-			CONT;
-		}
-
-		return 0;
-	LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */
-		off = IMM + SRC;
-		goto load_word;
-	LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */
-		off = IMM + SRC;
-		goto load_half;
-	LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */
-		off = IMM + SRC;
-		goto load_byte;
 
 	default_label:
 		/* If we ever reach this, we have a bug somewhere. Die hard here
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0d91f18b2eb5..6ba10a83909d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3884,6 +3884,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		return -EINVAL;
 	}
 
+	if (!env->ops->gen_ld_abs) {
+		verbose(env, "bpf verifier is misconfigured\n");
+		return -EINVAL;
+	}
+
 	if (env->subprog_cnt) {
 		/* when program has LD_ABS insn JITs and interpreter assume
 		 * that r1 == ctx == skb which is not the case for callees
@@ -5519,6 +5524,25 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			continue;
 		}
 
+		if (BPF_CLASS(insn->code) == BPF_LD &&
+		    (BPF_MODE(insn->code) == BPF_ABS ||
+		     BPF_MODE(insn->code) == BPF_IND)) {
+			cnt = env->ops->gen_ld_abs(insn, insn_buf);
+			if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+				verbose(env, "bpf verifier is misconfigured\n");
+				return -EINVAL;
+			}
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta    += cnt - 1;
+			env->prog = prog = new_prog;
+			insn      = new_prog->insnsi + i + delta;
+			continue;
+		}
+
 		if (insn->code != (BPF_JMP | BPF_CALL))
 			continue;
 		if (insn->src_reg == BPF_PSEUDO_CALL)
diff --git a/net/core/filter.c b/net/core/filter.c
index 865500f6180d..a49729842b3d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -162,6 +162,87 @@ BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
 	return 0;
 }
 
+BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *,
+	   data, int, headlen, int, offset)
+{
+	u8 tmp, *ptr;
+	const int len = sizeof(tmp);
+
+	if (offset >= 0) {
+		if (headlen - offset >= len)
+			return *(u8 *)(data + offset);
+		if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
+			return tmp;
+	} else {
+		ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
+		if (likely(ptr))
+			return *(u8 *)ptr;
+	}
+
+	return -EFAULT;
+}
+
+BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
+	   int, offset)
+{
+	return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len,
+					 offset);
+}
+
+BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *,
+	   data, int, headlen, int, offset)
+{
+	u16 tmp, *ptr;
+	const int len = sizeof(tmp);
+
+	if (offset >= 0) {
+		if (headlen - offset >= len)
+			return get_unaligned_be16(data + offset);
+		if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
+			return be16_to_cpu(tmp);
+	} else {
+		ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
+		if (likely(ptr))
+			return get_unaligned_be16(ptr);
+	}
+
+	return -EFAULT;
+}
+
+BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb,
+	   int, offset)
+{
+	return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len,
+					  offset);
+}
+
+BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *,
+	   data, int, headlen, int, offset)
+{
+	u32 tmp, *ptr;
+	const int len = sizeof(tmp);
+
+	if (likely(offset >= 0)) {
+		if (headlen - offset >= len)
+			return get_unaligned_be32(data + offset);
+		if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
+			return be32_to_cpu(tmp);
+	} else {
+		ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
+		if (likely(ptr))
+			return get_unaligned_be32(ptr);
+	}
+
+	return -EFAULT;
+}
+
+BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb,
+	   int, offset)
+{
+	return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len,
+					  offset);
+}
+
 BPF_CALL_0(bpf_get_raw_cpu_id)
 {
 	return raw_smp_processor_id();
@@ -354,26 +435,87 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
 	return true;
 }
 
+static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp)
+{
+	const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS);
+	int size = bpf_size_to_bytes(BPF_SIZE(fp->code));
+	bool endian = BPF_SIZE(fp->code) == BPF_H ||
+		      BPF_SIZE(fp->code) == BPF_W;
+	bool indirect = BPF_MODE(fp->code) == BPF_IND;
+	const int ip_align = NET_IP_ALIGN;
+	struct bpf_insn *insn = *insnp;
+	int offset = fp->k;
+
+	if (!indirect &&
+	    ((unaligned_ok && offset >= 0) ||
+	     (!unaligned_ok && offset >= 0 &&
+	      offset + ip_align >= 0 &&
+	      offset + ip_align % size == 0))) {
+		*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H);
+		*insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
+		*insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian);
+		*insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_D,
+				      offset);
+		if (endian)
+			*insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8);
+		*insn++ = BPF_JMP_A(8);
+	}
+
+	*insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
+	*insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D);
+	*insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H);
+	if (!indirect) {
+		*insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset);
+	} else {
+		*insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X);
+		if (fp->k)
+			*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset);
+	}
+
+	switch (BPF_SIZE(fp->code)) {
+	case BPF_B:
+		*insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8);
+		break;
+	case BPF_H:
+		*insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16);
+		break;
+	case BPF_W:
+		*insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32);
+		break;
+	default:
+		return false;
+	}
+
+	*insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2);
+	*insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
+	*insn   = BPF_EXIT_INSN();
+
+	*insnp = insn;
+	return true;
+}
+
 /**
  *	bpf_convert_filter - convert filter program
  *	@prog: the user passed filter program
  *	@len: the length of the user passed filter program
  *	@new_prog: allocated 'struct bpf_prog' or NULL
  *	@new_len: pointer to store length of converted program
+ *	@seen_ld_abs: bool whether we've seen ld_abs/ind
  *
  * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn'
  * style extended BPF (eBPF).
  * Conversion workflow:
  *
  * 1) First pass for calculating the new program length:
- *   bpf_convert_filter(old_prog, old_len, NULL, &new_len)
+ *   bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs)
  *
  * 2) 2nd pass to remap in two passes: 1st pass finds new
  *    jump offsets, 2nd pass remapping:
- *   bpf_convert_filter(old_prog, old_len, new_prog, &new_len);
+ *   bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs)
  */
 static int bpf_convert_filter(struct sock_filter *prog, int len,
-			      struct bpf_prog *new_prog, int *new_len)
+			      struct bpf_prog *new_prog, int *new_len,
+			      bool *seen_ld_abs)
 {
 	int new_flen = 0, pass = 0, target, i, stack_off;
 	struct bpf_insn *new_insn, *first_insn = NULL;
@@ -412,12 +554,27 @@ do_pass:
 		 * do this ourself. Initial CTX is present in BPF_REG_ARG1.
 		 */
 		*new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
+		if (*seen_ld_abs) {
+			/* For packet access in classic BPF, cache skb->data
+			 * in callee-saved BPF R8 and skb->len - skb->data_len
+			 * (headlen) in BPF R9. Since classic BPF is read-only
+			 * on CTX, we only need to cache it once.
+			 */
+			*new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
+						  BPF_REG_D, BPF_REG_CTX,
+						  offsetof(struct sk_buff, data));
+			*new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX,
+						  offsetof(struct sk_buff, len));
+			*new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX,
+						  offsetof(struct sk_buff, data_len));
+			*new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP);
+		}
 	} else {
 		new_insn += 3;
 	}
 
 	for (i = 0; i < len; fp++, i++) {
-		struct bpf_insn tmp_insns[6] = { };
+		struct bpf_insn tmp_insns[32] = { };
 		struct bpf_insn *insn = tmp_insns;
 
 		if (addrs)
@@ -460,6 +617,11 @@ do_pass:
 			    BPF_MODE(fp->code) == BPF_ABS &&
 			    convert_bpf_extensions(fp, &insn))
 				break;
+			if (BPF_CLASS(fp->code) == BPF_LD &&
+			    convert_bpf_ld_abs(fp, &insn)) {
+				*seen_ld_abs = true;
+				break;
+			}
 
 			if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) ||
 			    fp->code == (BPF_ALU | BPF_MOD | BPF_X)) {
@@ -562,21 +724,31 @@ jmp_rest:
 			break;
 
 		/* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */
-		case BPF_LDX | BPF_MSH | BPF_B:
-			/* tmp = A */
-			*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A);
+		case BPF_LDX | BPF_MSH | BPF_B: {
+			struct sock_filter tmp = {
+				.code	= BPF_LD | BPF_ABS | BPF_B,
+				.k	= fp->k,
+			};
+
+			*seen_ld_abs = true;
+
+			/* X = A */
+			*insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
 			/* A = BPF_R0 = *(u8 *) (skb->data + K) */
-			*insn++ = BPF_LD_ABS(BPF_B, fp->k);
+			convert_bpf_ld_abs(&tmp, &insn);
+			insn++;
 			/* A &= 0xf */
 			*insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
 			/* A <<= 2 */
 			*insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
+			/* tmp = X */
+			*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X);
 			/* X = A */
 			*insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
 			/* A = tmp */
 			*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
 			break;
-
+		}
 		/* RET_K is remaped into 2 insns. RET_A case doesn't need an
 		 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
 		 */
@@ -658,6 +830,8 @@ jmp_rest:
 	if (!new_prog) {
 		/* Only calculating new length. */
 		*new_len = new_insn - first_insn;
+		if (*seen_ld_abs)
+			*new_len += 4; /* Prologue bits. */
 		return 0;
 	}
 
@@ -1019,6 +1193,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
 	struct sock_filter *old_prog;
 	struct bpf_prog *old_fp;
 	int err, new_len, old_len = fp->len;
+	bool seen_ld_abs = false;
 
 	/* We are free to overwrite insns et al right here as it
 	 * won't be used at this point in time anymore internally
@@ -1040,7 +1215,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
 	}
 
 	/* 1st pass: calculate the new program length. */
-	err = bpf_convert_filter(old_prog, old_len, NULL, &new_len);
+	err = bpf_convert_filter(old_prog, old_len, NULL, &new_len,
+				 &seen_ld_abs);
 	if (err)
 		goto out_err_free;
 
@@ -1059,7 +1235,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
 	fp->len = new_len;
 
 	/* 2nd pass: remap sock_filter insns into bpf_insn insns. */
-	err = bpf_convert_filter(old_prog, old_len, fp, &new_len);
+	err = bpf_convert_filter(old_prog, old_len, fp, &new_len,
+				 &seen_ld_abs);
 	if (err)
 		/* 2nd bpf_convert_filter() can fail only if it fails
 		 * to allocate memory, remapping must succeed. Note,
@@ -4330,6 +4507,41 @@ static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
 	return insn - insn_buf;
 }
 
+static int bpf_gen_ld_abs(const struct bpf_insn *orig,
+			  struct bpf_insn *insn_buf)
+{
+	bool indirect = BPF_MODE(orig->code) == BPF_IND;
+	struct bpf_insn *insn = insn_buf;
+
+	/* We're guaranteed here that CTX is in R6. */
+	*insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX);
+	if (!indirect) {
+		*insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm);
+	} else {
+		*insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg);
+		if (orig->imm)
+			*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm);
+	}
+
+	switch (BPF_SIZE(orig->code)) {
+	case BPF_B:
+		*insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache);
+		break;
+	case BPF_H:
+		*insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache);
+		break;
+	case BPF_W:
+		*insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache);
+		break;
+	}
+
+	*insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2);
+	*insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0);
+	*insn++ = BPF_EXIT_INSN();
+
+	return insn - insn_buf;
+}
+
 static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
 			       const struct bpf_prog *prog)
 {
@@ -5599,6 +5811,7 @@ const struct bpf_verifier_ops sk_filter_verifier_ops = {
 	.get_func_proto		= sk_filter_func_proto,
 	.is_valid_access	= sk_filter_is_valid_access,
 	.convert_ctx_access	= bpf_convert_ctx_access,
+	.gen_ld_abs		= bpf_gen_ld_abs,
 };
 
 const struct bpf_prog_ops sk_filter_prog_ops = {
@@ -5610,6 +5823,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
 	.is_valid_access	= tc_cls_act_is_valid_access,
 	.convert_ctx_access	= tc_cls_act_convert_ctx_access,
 	.gen_prologue		= tc_cls_act_prologue,
+	.gen_ld_abs		= bpf_gen_ld_abs,
 };
 
 const struct bpf_prog_ops tc_cls_act_prog_ops = {
-- 
cgit v1.2.3-59-g8ed1b


From 4e1ec56cdc59746943b2acfab3c171b930187bbe Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 May 2018 01:08:15 +0200
Subject: bpf: add skb_load_bytes_relative helper

This adds a small BPF helper similar to bpf_skb_load_bytes() that
is able to load relative to mac/net header offset from the skb's
linear data. Compared to bpf_skb_load_bytes(), it takes a fifth
argument namely start_header, which is either BPF_HDR_START_MAC
or BPF_HDR_START_NET. This allows for a more flexible alternative
compared to LD_ABS/LD_IND with negative offset. It's enabled for
tc BPF programs as well as sock filter program types where it's
mainly useful in reuseport programs to ease access to lower header
data.

Reference: https://lists.iovisor.org/pipermail/iovisor-dev/2017-March/000698.html
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 33 ++++++++++++++++++++++++++++++++-
 net/core/filter.c        | 45 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a3a495052511..93d5a4eeec2a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1802,6 +1802,30 @@ union bpf_attr {
  * 	Return
  * 		a non-negative value equal to or less than size on success, or
  * 		a negative error in case of failure.
+ *
+ * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header)
+ * 	Description
+ * 		This helper is similar to **bpf_skb_load_bytes**\ () in that
+ * 		it provides an easy way to load *len* bytes from *offset*
+ * 		from the packet associated to *skb*, into the buffer pointed
+ * 		by *to*. The difference to **bpf_skb_load_bytes**\ () is that
+ * 		a fifth argument *start_header* exists in order to select a
+ * 		base offset to start from. *start_header* can be one of:
+ *
+ * 		**BPF_HDR_START_MAC**
+ * 			Base offset to load data from is *skb*'s mac header.
+ * 		**BPF_HDR_START_NET**
+ * 			Base offset to load data from is *skb*'s network header.
+ *
+ * 		In general, "direct packet access" is the preferred method to
+ * 		access packet data, however, this helper is in particular useful
+ * 		in socket filters where *skb*\ **->data** does not always point
+ * 		to the start of the mac header and where "direct packet access"
+ * 		is not available.
+ *
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1871,7 +1895,8 @@ union bpf_attr {
 	FN(bind),			\
 	FN(xdp_adjust_tail),		\
 	FN(skb_get_xfrm_state),		\
-	FN(get_stack),
+	FN(get_stack),			\
+	FN(skb_load_bytes_relative),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -1932,6 +1957,12 @@ enum bpf_adj_room_mode {
 	BPF_ADJ_ROOM_NET,
 };
 
+/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */
+enum bpf_hdr_start_off {
+	BPF_HDR_START_MAC,
+	BPF_HDR_START_NET,
+};
+
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
  */
diff --git a/net/core/filter.c b/net/core/filter.c
index a49729842b3d..6877426c23a6 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1684,6 +1684,47 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
 	.arg4_type	= ARG_CONST_SIZE,
 };
 
+BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb,
+	   u32, offset, void *, to, u32, len, u32, start_header)
+{
+	u8 *ptr;
+
+	if (unlikely(offset > 0xffff || len > skb_headlen(skb)))
+		goto err_clear;
+
+	switch (start_header) {
+	case BPF_HDR_START_MAC:
+		ptr = skb_mac_header(skb) + offset;
+		break;
+	case BPF_HDR_START_NET:
+		ptr = skb_network_header(skb) + offset;
+		break;
+	default:
+		goto err_clear;
+	}
+
+	if (likely(ptr >= skb_mac_header(skb) &&
+		   ptr + len <= skb_tail_pointer(skb))) {
+		memcpy(to, ptr, len);
+		return 0;
+	}
+
+err_clear:
+	memset(to, 0, len);
+	return -EFAULT;
+}
+
+static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = {
+	.func		= bpf_skb_load_bytes_relative,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg4_type	= ARG_CONST_SIZE,
+	.arg5_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
 {
 	/* Idea is the following: should the needed direct read/write
@@ -4061,6 +4102,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	switch (func_id) {
 	case BPF_FUNC_skb_load_bytes:
 		return &bpf_skb_load_bytes_proto;
+	case BPF_FUNC_skb_load_bytes_relative:
+		return &bpf_skb_load_bytes_relative_proto;
 	case BPF_FUNC_get_socket_cookie:
 		return &bpf_get_socket_cookie_proto;
 	case BPF_FUNC_get_socket_uid:
@@ -4078,6 +4121,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_skb_store_bytes_proto;
 	case BPF_FUNC_skb_load_bytes:
 		return &bpf_skb_load_bytes_proto;
+	case BPF_FUNC_skb_load_bytes_relative:
+		return &bpf_skb_load_bytes_relative_proto;
 	case BPF_FUNC_skb_pull_data:
 		return &bpf_skb_pull_data_proto;
 	case BPF_FUNC_csum_diff:
-- 
cgit v1.2.3-59-g8ed1b


From e782bdcf58c5ace7b7d58b2436177de9785a18e8 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 May 2018 01:08:16 +0200
Subject: bpf, x64: remove ld_abs/ld_ind

Since LD_ABS/LD_IND instructions are now removed from the core and
reimplemented through a combination of inlined BPF instructions and
a slow-path helper, we can get rid of the complexity from x64 JIT.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/Makefile       |   3 +-
 arch/x86/net/bpf_jit.S      | 154 --------------------------------------------
 arch/x86/net/bpf_jit_comp.c | 144 ++---------------------------------------
 3 files changed, 5 insertions(+), 296 deletions(-)
 delete mode 100644 arch/x86/net/bpf_jit.S

diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile
index c6b464a261bb..59e123da580c 100644
--- a/arch/x86/net/Makefile
+++ b/arch/x86/net/Makefile
@@ -5,6 +5,5 @@
 ifeq ($(CONFIG_X86_32),y)
         obj-$(CONFIG_BPF_JIT) += bpf_jit_comp32.o
 else
-        OBJECT_FILES_NON_STANDARD_bpf_jit.o += y
-        obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o
+        obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o
 endif
diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S
deleted file mode 100644
index b33093f84528..000000000000
--- a/arch/x86/net/bpf_jit.S
+++ /dev/null
@@ -1,154 +0,0 @@
-/* bpf_jit.S : BPF JIT helper functions
- *
- * Copyright (C) 2011 Eric Dumazet (eric.dumazet@gmail.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-#include <linux/linkage.h>
-#include <asm/frame.h>
-
-/*
- * Calling convention :
- * rbx : skb pointer (callee saved)
- * esi : offset of byte(s) to fetch in skb (can be scratched)
- * r10 : copy of skb->data
- * r9d : hlen = skb->len - skb->data_len
- */
-#define SKBDATA	%r10
-#define SKF_MAX_NEG_OFF    $(-0x200000) /* SKF_LL_OFF from filter.h */
-
-#define FUNC(name) \
-	.globl name; \
-	.type name, @function; \
-	name:
-
-FUNC(sk_load_word)
-	test	%esi,%esi
-	js	bpf_slow_path_word_neg
-
-FUNC(sk_load_word_positive_offset)
-	mov	%r9d,%eax		# hlen
-	sub	%esi,%eax		# hlen - offset
-	cmp	$3,%eax
-	jle	bpf_slow_path_word
-	mov     (SKBDATA,%rsi),%eax
-	bswap   %eax  			/* ntohl() */
-	ret
-
-FUNC(sk_load_half)
-	test	%esi,%esi
-	js	bpf_slow_path_half_neg
-
-FUNC(sk_load_half_positive_offset)
-	mov	%r9d,%eax
-	sub	%esi,%eax		#	hlen - offset
-	cmp	$1,%eax
-	jle	bpf_slow_path_half
-	movzwl	(SKBDATA,%rsi),%eax
-	rol	$8,%ax			# ntohs()
-	ret
-
-FUNC(sk_load_byte)
-	test	%esi,%esi
-	js	bpf_slow_path_byte_neg
-
-FUNC(sk_load_byte_positive_offset)
-	cmp	%esi,%r9d   /* if (offset >= hlen) goto bpf_slow_path_byte */
-	jle	bpf_slow_path_byte
-	movzbl	(SKBDATA,%rsi),%eax
-	ret
-
-/* rsi contains offset and can be scratched */
-#define bpf_slow_path_common(LEN)		\
-	lea	32(%rbp), %rdx;\
-	FRAME_BEGIN;				\
-	mov	%rbx, %rdi; /* arg1 == skb */	\
-	push	%r9;				\
-	push	SKBDATA;			\
-/* rsi already has offset */			\
-	mov	$LEN,%ecx;	/* len */	\
-	call	skb_copy_bits;			\
-	test    %eax,%eax;			\
-	pop	SKBDATA;			\
-	pop	%r9;				\
-	FRAME_END
-
-
-bpf_slow_path_word:
-	bpf_slow_path_common(4)
-	js	bpf_error
-	mov	32(%rbp),%eax
-	bswap	%eax
-	ret
-
-bpf_slow_path_half:
-	bpf_slow_path_common(2)
-	js	bpf_error
-	mov	32(%rbp),%ax
-	rol	$8,%ax
-	movzwl	%ax,%eax
-	ret
-
-bpf_slow_path_byte:
-	bpf_slow_path_common(1)
-	js	bpf_error
-	movzbl	32(%rbp),%eax
-	ret
-
-#define sk_negative_common(SIZE)				\
-	FRAME_BEGIN;						\
-	mov	%rbx, %rdi; /* arg1 == skb */			\
-	push	%r9;						\
-	push	SKBDATA;					\
-/* rsi already has offset */					\
-	mov	$SIZE,%edx;	/* size */			\
-	call	bpf_internal_load_pointer_neg_helper;		\
-	test	%rax,%rax;					\
-	pop	SKBDATA;					\
-	pop	%r9;						\
-	FRAME_END;						\
-	jz	bpf_error
-
-bpf_slow_path_word_neg:
-	cmp	SKF_MAX_NEG_OFF, %esi	/* test range */
-	jl	bpf_error	/* offset lower -> error  */
-
-FUNC(sk_load_word_negative_offset)
-	sk_negative_common(4)
-	mov	(%rax), %eax
-	bswap	%eax
-	ret
-
-bpf_slow_path_half_neg:
-	cmp	SKF_MAX_NEG_OFF, %esi
-	jl	bpf_error
-
-FUNC(sk_load_half_negative_offset)
-	sk_negative_common(2)
-	mov	(%rax),%ax
-	rol	$8,%ax
-	movzwl	%ax,%eax
-	ret
-
-bpf_slow_path_byte_neg:
-	cmp	SKF_MAX_NEG_OFF, %esi
-	jl	bpf_error
-
-FUNC(sk_load_byte_negative_offset)
-	sk_negative_common(1)
-	movzbl	(%rax), %eax
-	ret
-
-bpf_error:
-# force a return 0 from jit handler
-	xor	%eax,%eax
-	mov	(%rbp),%rbx
-	mov	8(%rbp),%r13
-	mov	16(%rbp),%r14
-	mov	24(%rbp),%r15
-	add	$40, %rbp
-	leaveq
-	ret
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 1c3c81ddeb87..ce08b7bb0304 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -17,15 +17,6 @@
 #include <asm/set_memory.h>
 #include <asm/nospec-branch.h>
 
-/*
- * Assembly code in arch/x86/net/bpf_jit.S
- */
-extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
-extern u8 sk_load_word_positive_offset[], sk_load_half_positive_offset[];
-extern u8 sk_load_byte_positive_offset[];
-extern u8 sk_load_word_negative_offset[], sk_load_half_negative_offset[];
-extern u8 sk_load_byte_negative_offset[];
-
 static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
 {
 	if (len == 1)
@@ -107,9 +98,6 @@ static int bpf_size_to_x86_bytes(int bpf_size)
 #define X86_JLE 0x7E
 #define X86_JG  0x7F
 
-#define CHOOSE_LOAD_FUNC(K, func) \
-	((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset)
-
 /* Pick a register outside of BPF range for JIT internal work */
 #define AUX_REG (MAX_BPF_JIT_REG + 1)
 
@@ -120,8 +108,8 @@ static int bpf_size_to_x86_bytes(int bpf_size)
  * register in load/store instructions, it always needs an
  * extra byte of encoding and is callee saved.
  *
- * R9  caches skb->len - skb->data_len
- * R10 caches skb->data, and used for blinding (if enabled)
+ * Also x86-64 register R9 is unused. x86-64 register R10 is
+ * used for blinding (if enabled).
  */
 static const int reg2hex[] = {
 	[BPF_REG_0] = 0,  /* RAX */
@@ -196,19 +184,15 @@ static void jit_fill_hole(void *area, unsigned int size)
 
 struct jit_context {
 	int cleanup_addr; /* Epilogue code offset */
-	bool seen_ld_abs;
-	bool seen_ax_reg;
 };
 
 /* Maximum number of bytes emitted while JITing one eBPF insn */
 #define BPF_MAX_INSN_SIZE	128
 #define BPF_INSN_SAFETY		64
 
-#define AUX_STACK_SPACE \
-	(32 /* Space for RBX, R13, R14, R15 */ + \
-	  8 /* Space for skb_copy_bits() buffer */)
+#define AUX_STACK_SPACE		40 /* Space for RBX, R13, R14, R15, tailcnt */
 
-#define PROLOGUE_SIZE 37
+#define PROLOGUE_SIZE		37
 
 /*
  * Emit x86-64 prologue code for BPF program and check its size.
@@ -232,20 +216,8 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
 	/* sub rbp, AUX_STACK_SPACE */
 	EMIT4(0x48, 0x83, 0xED, AUX_STACK_SPACE);
 
-	/* All classic BPF filters use R6(rbx) save it */
-
 	/* mov qword ptr [rbp+0],rbx */
 	EMIT4(0x48, 0x89, 0x5D, 0);
-
-	/*
-	 * bpf_convert_filter() maps classic BPF register X to R7 and uses R8
-	 * as temporary, so all tcpdump filters need to spill/fill R7(R13) and
-	 * R8(R14). R9(R15) spill could be made conditional, but there is only
-	 * one 'bpf_error' return path out of helper functions inside bpf_jit.S
-	 * The overhead of extra spill is negligible for any filter other
-	 * than synthetic ones. Therefore not worth adding complexity.
-	 */
-
 	/* mov qword ptr [rbp+8],r13 */
 	EMIT4(0x4C, 0x89, 0x6D, 8);
 	/* mov qword ptr [rbp+16],r14 */
@@ -353,27 +325,6 @@ static void emit_bpf_tail_call(u8 **pprog)
 	*pprog = prog;
 }
 
-
-static void emit_load_skb_data_hlen(u8 **pprog)
-{
-	u8 *prog = *pprog;
-	int cnt = 0;
-
-	/*
-	 * r9d = skb->len - skb->data_len (headlen)
-	 * r10 = skb->data
-	 */
-	/* mov %r9d, off32(%rdi) */
-	EMIT3_off32(0x44, 0x8b, 0x8f, offsetof(struct sk_buff, len));
-
-	/* sub %r9d, off32(%rdi) */
-	EMIT3_off32(0x44, 0x2b, 0x8f, offsetof(struct sk_buff, data_len));
-
-	/* mov %r10, off32(%rdi) */
-	EMIT3_off32(0x4c, 0x8b, 0x97, offsetof(struct sk_buff, data));
-	*pprog = prog;
-}
-
 static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
 			   u32 dst_reg, const u32 imm32)
 {
@@ -462,8 +413,6 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 {
 	struct bpf_insn *insn = bpf_prog->insnsi;
 	int insn_cnt = bpf_prog->len;
-	bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0);
-	bool seen_ax_reg = ctx->seen_ax_reg | (oldproglen == 0);
 	bool seen_exit = false;
 	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
 	int i, cnt = 0;
@@ -473,9 +422,6 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 	emit_prologue(&prog, bpf_prog->aux->stack_depth,
 		      bpf_prog_was_classic(bpf_prog));
 
-	if (seen_ld_abs)
-		emit_load_skb_data_hlen(&prog);
-
 	for (i = 0; i < insn_cnt; i++, insn++) {
 		const s32 imm32 = insn->imm;
 		u32 dst_reg = insn->dst_reg;
@@ -483,13 +429,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		u8 b2 = 0, b3 = 0;
 		s64 jmp_offset;
 		u8 jmp_cond;
-		bool reload_skb_data;
 		int ilen;
 		u8 *func;
 
-		if (dst_reg == BPF_REG_AX || src_reg == BPF_REG_AX)
-			ctx->seen_ax_reg = seen_ax_reg = true;
-
 		switch (insn->code) {
 			/* ALU */
 		case BPF_ALU | BPF_ADD | BPF_X:
@@ -916,36 +858,12 @@ xadd:			if (is_imm8(insn->off))
 		case BPF_JMP | BPF_CALL:
 			func = (u8 *) __bpf_call_base + imm32;
 			jmp_offset = func - (image + addrs[i]);
-			if (seen_ld_abs) {
-				reload_skb_data = bpf_helper_changes_pkt_data(func);
-				if (reload_skb_data) {
-					EMIT1(0x57); /* push %rdi */
-					jmp_offset += 22; /* pop, mov, sub, mov */
-				} else {
-					EMIT2(0x41, 0x52); /* push %r10 */
-					EMIT2(0x41, 0x51); /* push %r9 */
-					/*
-					 * We need to adjust jmp offset, since
-					 * pop %r9, pop %r10 take 4 bytes after call insn
-					 */
-					jmp_offset += 4;
-				}
-			}
 			if (!imm32 || !is_simm32(jmp_offset)) {
 				pr_err("unsupported BPF func %d addr %p image %p\n",
 				       imm32, func, image);
 				return -EINVAL;
 			}
 			EMIT1_off32(0xE8, jmp_offset);
-			if (seen_ld_abs) {
-				if (reload_skb_data) {
-					EMIT1(0x5F); /* pop %rdi */
-					emit_load_skb_data_hlen(&prog);
-				} else {
-					EMIT2(0x41, 0x59); /* pop %r9 */
-					EMIT2(0x41, 0x5A); /* pop %r10 */
-				}
-			}
 			break;
 
 		case BPF_JMP | BPF_TAIL_CALL:
@@ -1080,60 +998,6 @@ emit_jmp:
 			}
 			break;
 
-		case BPF_LD | BPF_IND | BPF_W:
-			func = sk_load_word;
-			goto common_load;
-		case BPF_LD | BPF_ABS | BPF_W:
-			func = CHOOSE_LOAD_FUNC(imm32, sk_load_word);
-common_load:
-			ctx->seen_ld_abs = seen_ld_abs = true;
-			jmp_offset = func - (image + addrs[i]);
-			if (!func || !is_simm32(jmp_offset)) {
-				pr_err("unsupported BPF func %d addr %p image %p\n",
-				       imm32, func, image);
-				return -EINVAL;
-			}
-			if (BPF_MODE(insn->code) == BPF_ABS) {
-				/* mov %esi, imm32 */
-				EMIT1_off32(0xBE, imm32);
-			} else {
-				/* mov %rsi, src_reg */
-				EMIT_mov(BPF_REG_2, src_reg);
-				if (imm32) {
-					if (is_imm8(imm32))
-						/* add %esi, imm8 */
-						EMIT3(0x83, 0xC6, imm32);
-					else
-						/* add %esi, imm32 */
-						EMIT2_off32(0x81, 0xC6, imm32);
-				}
-			}
-			/*
-			 * skb pointer is in R6 (%rbx), it will be copied into
-			 * %rdi if skb_copy_bits() call is necessary.
-			 * sk_load_* helpers also use %r10 and %r9d.
-			 * See bpf_jit.S
-			 */
-			if (seen_ax_reg)
-				/* r10 = skb->data, mov %r10, off32(%rbx) */
-				EMIT3_off32(0x4c, 0x8b, 0x93,
-					    offsetof(struct sk_buff, data));
-			EMIT1_off32(0xE8, jmp_offset); /* call */
-			break;
-
-		case BPF_LD | BPF_IND | BPF_H:
-			func = sk_load_half;
-			goto common_load;
-		case BPF_LD | BPF_ABS | BPF_H:
-			func = CHOOSE_LOAD_FUNC(imm32, sk_load_half);
-			goto common_load;
-		case BPF_LD | BPF_IND | BPF_B:
-			func = sk_load_byte;
-			goto common_load;
-		case BPF_LD | BPF_ABS | BPF_B:
-			func = CHOOSE_LOAD_FUNC(imm32, sk_load_byte);
-			goto common_load;
-
 		case BPF_JMP | BPF_EXIT:
 			if (seen_exit) {
 				jmp_offset = ctx->cleanup_addr - addrs[i];
-- 
cgit v1.2.3-59-g8ed1b


From 816d9ef32a8b29174cd5bb8c9f76c5b326c58a9b Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 May 2018 01:08:17 +0200
Subject: bpf, arm64: remove ld_abs/ld_ind

Since LD_ABS/LD_IND instructions are now removed from the core and
reimplemented through a combination of inlined BPF instructions and
a slow-path helper, we can get rid of the complexity from arm64 JIT.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/arm64/net/bpf_jit_comp.c | 65 -------------------------------------------
 1 file changed, 65 deletions(-)

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index a93350451e8e..0b40c8fb0706 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -723,71 +723,6 @@ emit_cond_jmp:
 		emit(A64_CBNZ(0, tmp3, jmp_offset), ctx);
 		break;
 
-	/* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + imm)) */
-	case BPF_LD | BPF_ABS | BPF_W:
-	case BPF_LD | BPF_ABS | BPF_H:
-	case BPF_LD | BPF_ABS | BPF_B:
-	/* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + src + imm)) */
-	case BPF_LD | BPF_IND | BPF_W:
-	case BPF_LD | BPF_IND | BPF_H:
-	case BPF_LD | BPF_IND | BPF_B:
-	{
-		const u8 r0 = bpf2a64[BPF_REG_0]; /* r0 = return value */
-		const u8 r6 = bpf2a64[BPF_REG_6]; /* r6 = pointer to sk_buff */
-		const u8 fp = bpf2a64[BPF_REG_FP];
-		const u8 r1 = bpf2a64[BPF_REG_1]; /* r1: struct sk_buff *skb */
-		const u8 r2 = bpf2a64[BPF_REG_2]; /* r2: int k */
-		const u8 r3 = bpf2a64[BPF_REG_3]; /* r3: unsigned int size */
-		const u8 r4 = bpf2a64[BPF_REG_4]; /* r4: void *buffer */
-		const u8 r5 = bpf2a64[BPF_REG_5]; /* r5: void *(*func)(...) */
-		int size;
-
-		emit(A64_MOV(1, r1, r6), ctx);
-		emit_a64_mov_i(0, r2, imm, ctx);
-		if (BPF_MODE(code) == BPF_IND)
-			emit(A64_ADD(0, r2, r2, src), ctx);
-		switch (BPF_SIZE(code)) {
-		case BPF_W:
-			size = 4;
-			break;
-		case BPF_H:
-			size = 2;
-			break;
-		case BPF_B:
-			size = 1;
-			break;
-		default:
-			return -EINVAL;
-		}
-		emit_a64_mov_i64(r3, size, ctx);
-		emit(A64_SUB_I(1, r4, fp, ctx->stack_size), ctx);
-		emit_a64_mov_i64(r5, (unsigned long)bpf_load_pointer, ctx);
-		emit(A64_BLR(r5), ctx);
-		emit(A64_MOV(1, r0, A64_R(0)), ctx);
-
-		jmp_offset = epilogue_offset(ctx);
-		check_imm19(jmp_offset);
-		emit(A64_CBZ(1, r0, jmp_offset), ctx);
-		emit(A64_MOV(1, r5, r0), ctx);
-		switch (BPF_SIZE(code)) {
-		case BPF_W:
-			emit(A64_LDR32(r0, r5, A64_ZR), ctx);
-#ifndef CONFIG_CPU_BIG_ENDIAN
-			emit(A64_REV32(0, r0, r0), ctx);
-#endif
-			break;
-		case BPF_H:
-			emit(A64_LDRH(r0, r5, A64_ZR), ctx);
-#ifndef CONFIG_CPU_BIG_ENDIAN
-			emit(A64_REV16(0, r0, r0), ctx);
-#endif
-			break;
-		case BPF_B:
-			emit(A64_LDRB(r0, r5, A64_ZR), ctx);
-			break;
-		}
-		break;
-	}
 	default:
 		pr_err_once("unknown opcode %02x\n", code);
 		return -EINVAL;
-- 
cgit v1.2.3-59-g8ed1b


From fe83963b7c38a2b30a26a90880c75a31c283d957 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 May 2018 01:08:18 +0200
Subject: bpf, sparc64: remove ld_abs/ld_ind

Since LD_ABS/LD_IND instructions are now removed from the core and
reimplemented through a combination of inlined BPF instructions and
a slow-path helper, we can get rid of the complexity from sparc64 JIT.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/sparc/net/Makefile          |   5 +-
 arch/sparc/net/bpf_jit_64.h      |  29 -------
 arch/sparc/net/bpf_jit_asm_64.S  | 162 ---------------------------------------
 arch/sparc/net/bpf_jit_comp_64.c |  79 +------------------
 4 files changed, 6 insertions(+), 269 deletions(-)
 delete mode 100644 arch/sparc/net/bpf_jit_asm_64.S

diff --git a/arch/sparc/net/Makefile b/arch/sparc/net/Makefile
index 76fa8e95b721..d32aac3a25b8 100644
--- a/arch/sparc/net/Makefile
+++ b/arch/sparc/net/Makefile
@@ -1,4 +1,7 @@
 #
 # Arch-specific network modules
 #
-obj-$(CONFIG_BPF_JIT) += bpf_jit_asm_$(BITS).o bpf_jit_comp_$(BITS).o
+obj-$(CONFIG_BPF_JIT) += bpf_jit_comp_$(BITS).o
+ifeq ($(BITS),32)
+obj-$(CONFIG_BPF_JIT) += bpf_jit_asm_32.o
+endif
diff --git a/arch/sparc/net/bpf_jit_64.h b/arch/sparc/net/bpf_jit_64.h
index 428f7fd19175..fbc836f1c51c 100644
--- a/arch/sparc/net/bpf_jit_64.h
+++ b/arch/sparc/net/bpf_jit_64.h
@@ -33,35 +33,6 @@
 #define I5		0x1d
 #define FP		0x1e
 #define I7		0x1f
-
-#define r_SKB		L0
-#define r_HEADLEN	L4
-#define r_SKB_DATA	L5
-#define r_TMP		G1
-#define r_TMP2		G3
-
-/* assembly code in arch/sparc/net/bpf_jit_asm_64.S */
-extern u32 bpf_jit_load_word[];
-extern u32 bpf_jit_load_half[];
-extern u32 bpf_jit_load_byte[];
-extern u32 bpf_jit_load_byte_msh[];
-extern u32 bpf_jit_load_word_positive_offset[];
-extern u32 bpf_jit_load_half_positive_offset[];
-extern u32 bpf_jit_load_byte_positive_offset[];
-extern u32 bpf_jit_load_byte_msh_positive_offset[];
-extern u32 bpf_jit_load_word_negative_offset[];
-extern u32 bpf_jit_load_half_negative_offset[];
-extern u32 bpf_jit_load_byte_negative_offset[];
-extern u32 bpf_jit_load_byte_msh_negative_offset[];
-
-#else
-#define r_RESULT	%o0
-#define r_SKB		%o0
-#define r_OFF		%o1
-#define r_HEADLEN	%l4
-#define r_SKB_DATA	%l5
-#define r_TMP		%g1
-#define r_TMP2		%g3
 #endif
 
 #endif /* _BPF_JIT_H */
diff --git a/arch/sparc/net/bpf_jit_asm_64.S b/arch/sparc/net/bpf_jit_asm_64.S
deleted file mode 100644
index 7177867052a1..000000000000
--- a/arch/sparc/net/bpf_jit_asm_64.S
+++ /dev/null
@@ -1,162 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <asm/ptrace.h>
-
-#include "bpf_jit_64.h"
-
-#define SAVE_SZ		176
-#define SCRATCH_OFF	STACK_BIAS + 128
-#define BE_PTR(label)	be,pn %xcc, label
-#define SIGN_EXTEND(reg)	sra reg, 0, reg
-
-#define SKF_MAX_NEG_OFF	(-0x200000) /* SKF_LL_OFF from filter.h */
-
-	.text
-	.globl	bpf_jit_load_word
-bpf_jit_load_word:
-	cmp	r_OFF, 0
-	bl	bpf_slow_path_word_neg
-	 nop
-	.globl	bpf_jit_load_word_positive_offset
-bpf_jit_load_word_positive_offset:
-	sub	r_HEADLEN, r_OFF, r_TMP
-	cmp	r_TMP, 3
-	ble	bpf_slow_path_word
-	 add	r_SKB_DATA, r_OFF, r_TMP
-	andcc	r_TMP, 3, %g0
-	bne	load_word_unaligned
-	 nop
-	retl
-	 ld	[r_TMP], r_RESULT
-load_word_unaligned:
-	ldub	[r_TMP + 0x0], r_OFF
-	ldub	[r_TMP + 0x1], r_TMP2
-	sll	r_OFF, 8, r_OFF
-	or	r_OFF, r_TMP2, r_OFF
-	ldub	[r_TMP + 0x2], r_TMP2
-	sll	r_OFF, 8, r_OFF
-	or	r_OFF, r_TMP2, r_OFF
-	ldub	[r_TMP + 0x3], r_TMP2
-	sll	r_OFF, 8, r_OFF
-	retl
-	 or	r_OFF, r_TMP2, r_RESULT
-
-	.globl	bpf_jit_load_half
-bpf_jit_load_half:
-	cmp	r_OFF, 0
-	bl	bpf_slow_path_half_neg
-	 nop
-	.globl	bpf_jit_load_half_positive_offset
-bpf_jit_load_half_positive_offset:
-	sub	r_HEADLEN, r_OFF, r_TMP
-	cmp	r_TMP, 1
-	ble	bpf_slow_path_half
-	 add	r_SKB_DATA, r_OFF, r_TMP
-	andcc	r_TMP, 1, %g0
-	bne	load_half_unaligned
-	 nop
-	retl
-	 lduh	[r_TMP], r_RESULT
-load_half_unaligned:
-	ldub	[r_TMP + 0x0], r_OFF
-	ldub	[r_TMP + 0x1], r_TMP2
-	sll	r_OFF, 8, r_OFF
-	retl
-	 or	r_OFF, r_TMP2, r_RESULT
-
-	.globl	bpf_jit_load_byte
-bpf_jit_load_byte:
-	cmp	r_OFF, 0
-	bl	bpf_slow_path_byte_neg
-	 nop
-	.globl	bpf_jit_load_byte_positive_offset
-bpf_jit_load_byte_positive_offset:
-	cmp	r_OFF, r_HEADLEN
-	bge	bpf_slow_path_byte
-	 nop
-	retl
-	 ldub	[r_SKB_DATA + r_OFF], r_RESULT
-
-#define bpf_slow_path_common(LEN)	\
-	save	%sp, -SAVE_SZ, %sp;	\
-	mov	%i0, %o0;		\
-	mov	%i1, %o1;		\
-	add	%fp, SCRATCH_OFF, %o2;	\
-	call	skb_copy_bits;		\
-	 mov	(LEN), %o3;		\
-	cmp	%o0, 0;			\
-	restore;
-
-bpf_slow_path_word:
-	bpf_slow_path_common(4)
-	bl	bpf_error
-	 ld	[%sp + SCRATCH_OFF], r_RESULT
-	retl
-	 nop
-bpf_slow_path_half:
-	bpf_slow_path_common(2)
-	bl	bpf_error
-	 lduh	[%sp + SCRATCH_OFF], r_RESULT
-	retl
-	 nop
-bpf_slow_path_byte:
-	bpf_slow_path_common(1)
-	bl	bpf_error
-	 ldub	[%sp + SCRATCH_OFF], r_RESULT
-	retl
-	 nop
-
-#define bpf_negative_common(LEN)			\
-	save	%sp, -SAVE_SZ, %sp;			\
-	mov	%i0, %o0;				\
-	mov	%i1, %o1;				\
-	SIGN_EXTEND(%o1);				\
-	call	bpf_internal_load_pointer_neg_helper;	\
-	 mov	(LEN), %o2;				\
-	mov	%o0, r_TMP;				\
-	cmp	%o0, 0;					\
-	BE_PTR(bpf_error);				\
-	 restore;
-
-bpf_slow_path_word_neg:
-	sethi	%hi(SKF_MAX_NEG_OFF), r_TMP
-	cmp	r_OFF, r_TMP
-	bl	bpf_error
-	 nop
-	.globl	bpf_jit_load_word_negative_offset
-bpf_jit_load_word_negative_offset:
-	bpf_negative_common(4)
-	andcc	r_TMP, 3, %g0
-	bne	load_word_unaligned
-	 nop
-	retl
-	 ld	[r_TMP], r_RESULT
-
-bpf_slow_path_half_neg:
-	sethi	%hi(SKF_MAX_NEG_OFF), r_TMP
-	cmp	r_OFF, r_TMP
-	bl	bpf_error
-	 nop
-	.globl	bpf_jit_load_half_negative_offset
-bpf_jit_load_half_negative_offset:
-	bpf_negative_common(2)
-	andcc	r_TMP, 1, %g0
-	bne	load_half_unaligned
-	 nop
-	retl
-	 lduh	[r_TMP], r_RESULT
-
-bpf_slow_path_byte_neg:
-	sethi	%hi(SKF_MAX_NEG_OFF), r_TMP
-	cmp	r_OFF, r_TMP
-	bl	bpf_error
-	 nop
-	.globl	bpf_jit_load_byte_negative_offset
-bpf_jit_load_byte_negative_offset:
-	bpf_negative_common(1)
-	retl
-	 ldub	[r_TMP], r_RESULT
-
-bpf_error:
-	/* Make the JIT program itself return zero. */
-	ret
-	restore	%g0, %g0, %o0
diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c
index 48a25869349b..9f5918e0693a 100644
--- a/arch/sparc/net/bpf_jit_comp_64.c
+++ b/arch/sparc/net/bpf_jit_comp_64.c
@@ -48,10 +48,6 @@ static void bpf_flush_icache(void *start_, void *end_)
 	}
 }
 
-#define SEEN_DATAREF 1 /* might call external helpers */
-#define SEEN_XREG    2 /* ebx is used */
-#define SEEN_MEM     4 /* use mem[] for temporary storage */
-
 #define S13(X)		((X) & 0x1fff)
 #define S5(X)		((X) & 0x1f)
 #define IMMED		0x00002000
@@ -198,7 +194,6 @@ struct jit_ctx {
 	bool 			tmp_1_used;
 	bool 			tmp_2_used;
 	bool 			tmp_3_used;
-	bool			saw_ld_abs_ind;
 	bool			saw_frame_pointer;
 	bool			saw_call;
 	bool			saw_tail_call;
@@ -207,9 +202,7 @@ struct jit_ctx {
 
 #define TMP_REG_1	(MAX_BPF_JIT_REG + 0)
 #define TMP_REG_2	(MAX_BPF_JIT_REG + 1)
-#define SKB_HLEN_REG	(MAX_BPF_JIT_REG + 2)
-#define SKB_DATA_REG	(MAX_BPF_JIT_REG + 3)
-#define TMP_REG_3	(MAX_BPF_JIT_REG + 4)
+#define TMP_REG_3	(MAX_BPF_JIT_REG + 2)
 
 /* Map BPF registers to SPARC registers */
 static const int bpf2sparc[] = {
@@ -238,9 +231,6 @@ static const int bpf2sparc[] = {
 	[TMP_REG_1] = G1,
 	[TMP_REG_2] = G2,
 	[TMP_REG_3] = G3,
-
-	[SKB_HLEN_REG] = L4,
-	[SKB_DATA_REG] = L5,
 };
 
 static void emit(const u32 insn, struct jit_ctx *ctx)
@@ -800,25 +790,6 @@ static int emit_compare_and_branch(const u8 code, const u8 dst, u8 src,
 	return 0;
 }
 
-static void load_skb_regs(struct jit_ctx *ctx, u8 r_skb)
-{
-	const u8 r_headlen = bpf2sparc[SKB_HLEN_REG];
-	const u8 r_data = bpf2sparc[SKB_DATA_REG];
-	const u8 r_tmp = bpf2sparc[TMP_REG_1];
-	unsigned int off;
-
-	off = offsetof(struct sk_buff, len);
-	emit(LD32I | RS1(r_skb) | S13(off) | RD(r_headlen), ctx);
-
-	off = offsetof(struct sk_buff, data_len);
-	emit(LD32I | RS1(r_skb) | S13(off) | RD(r_tmp), ctx);
-
-	emit(SUB | RS1(r_headlen) | RS2(r_tmp) | RD(r_headlen), ctx);
-
-	off = offsetof(struct sk_buff, data);
-	emit(LDPTRI | RS1(r_skb) | S13(off) | RD(r_data), ctx);
-}
-
 /* Just skip the save instruction and the ctx register move.  */
 #define BPF_TAILCALL_PROLOGUE_SKIP	16
 #define BPF_TAILCALL_CNT_SP_OFF		(STACK_BIAS + 128)
@@ -857,9 +828,6 @@ static void build_prologue(struct jit_ctx *ctx)
 
 	emit_reg_move(I0, O0, ctx);
 	/* If you add anything here, adjust BPF_TAILCALL_PROLOGUE_SKIP above. */
-
-	if (ctx->saw_ld_abs_ind)
-		load_skb_regs(ctx, bpf2sparc[BPF_REG_1]);
 }
 
 static void build_epilogue(struct jit_ctx *ctx)
@@ -1225,16 +1193,11 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
 		u8 *func = ((u8 *)__bpf_call_base) + imm;
 
 		ctx->saw_call = true;
-		if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func))
-			emit_reg_move(bpf2sparc[BPF_REG_1], L7, ctx);
 
 		emit_call((u32 *)func, ctx);
 		emit_nop(ctx);
 
 		emit_reg_move(O0, bpf2sparc[BPF_REG_0], ctx);
-
-		if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func))
-			load_skb_regs(ctx, L7);
 		break;
 	}
 
@@ -1412,43 +1375,6 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
 		emit_nop(ctx);
 		break;
 	}
-#define CHOOSE_LOAD_FUNC(K, func) \
-		((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset)
-
-	/* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + imm)) */
-	case BPF_LD | BPF_ABS | BPF_W:
-		func = CHOOSE_LOAD_FUNC(imm, bpf_jit_load_word);
-		goto common_load;
-	case BPF_LD | BPF_ABS | BPF_H:
-		func = CHOOSE_LOAD_FUNC(imm, bpf_jit_load_half);
-		goto common_load;
-	case BPF_LD | BPF_ABS | BPF_B:
-		func = CHOOSE_LOAD_FUNC(imm, bpf_jit_load_byte);
-		goto common_load;
-	/* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + src + imm)) */
-	case BPF_LD | BPF_IND | BPF_W:
-		func = bpf_jit_load_word;
-		goto common_load;
-	case BPF_LD | BPF_IND | BPF_H:
-		func = bpf_jit_load_half;
-		goto common_load;
-
-	case BPF_LD | BPF_IND | BPF_B:
-		func = bpf_jit_load_byte;
-	common_load:
-		ctx->saw_ld_abs_ind = true;
-
-		emit_reg_move(bpf2sparc[BPF_REG_6], O0, ctx);
-		emit_loadimm(imm, O1, ctx);
-
-		if (BPF_MODE(code) == BPF_IND)
-			emit_alu(ADD, src, O1, ctx);
-
-		emit_call(func, ctx);
-		emit_alu_K(SRA, O1, 0, ctx);
-
-		emit_reg_move(O0, bpf2sparc[BPF_REG_0], ctx);
-		break;
 
 	default:
 		pr_err_once("unknown opcode %02x\n", code);
@@ -1583,12 +1509,11 @@ skip_init_ctx:
 		build_epilogue(&ctx);
 
 		if (bpf_jit_enable > 1)
-			pr_info("Pass %d: shrink = %d, seen = [%c%c%c%c%c%c%c]\n", pass,
+			pr_info("Pass %d: shrink = %d, seen = [%c%c%c%c%c%c]\n", pass,
 				image_size - (ctx.idx * 4),
 				ctx.tmp_1_used ? '1' : ' ',
 				ctx.tmp_2_used ? '2' : ' ',
 				ctx.tmp_3_used ? '3' : ' ',
-				ctx.saw_ld_abs_ind ? 'L' : ' ',
 				ctx.saw_frame_pointer ? 'F' : ' ',
 				ctx.saw_call ? 'C' : ' ',
 				ctx.saw_tail_call ? 'T' : ' ');
-- 
cgit v1.2.3-59-g8ed1b


From 0d2d0cedc0814edaa2b6cb290c78ec333b3eed71 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 May 2018 01:08:19 +0200
Subject: bpf, arm32: remove ld_abs/ld_ind

Since LD_ABS/LD_IND instructions are now removed from the core and
reimplemented through a combination of inlined BPF instructions and
a slow-path helper, we can get rid of the complexity from arm32 JIT.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/arm/net/bpf_jit_32.c | 77 -----------------------------------------------
 1 file changed, 77 deletions(-)

diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index b5030e1a41d8..82689b999257 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -1452,83 +1452,6 @@ exit:
 			emit(ARM_LDR_I(rn, ARM_SP, STACK_VAR(src_lo)), ctx);
 		emit_ldx_r(dst, rn, dstk, off, ctx, BPF_SIZE(code));
 		break;
-	/* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + imm)) */
-	case BPF_LD | BPF_ABS | BPF_W:
-	case BPF_LD | BPF_ABS | BPF_H:
-	case BPF_LD | BPF_ABS | BPF_B:
-	/* R0 = ntohx(*(size *)(((struct sk_buff *)R6)->data + src + imm)) */
-	case BPF_LD | BPF_IND | BPF_W:
-	case BPF_LD | BPF_IND | BPF_H:
-	case BPF_LD | BPF_IND | BPF_B:
-	{
-		const u8 r4 = bpf2a32[BPF_REG_6][1]; /* r4 = ptr to sk_buff */
-		const u8 r0 = bpf2a32[BPF_REG_0][1]; /*r0: struct sk_buff *skb*/
-						     /* rtn value */
-		const u8 r1 = bpf2a32[BPF_REG_0][0]; /* r1: int k */
-		const u8 r2 = bpf2a32[BPF_REG_1][1]; /* r2: unsigned int size */
-		const u8 r3 = bpf2a32[BPF_REG_1][0]; /* r3: void *buffer */
-		const u8 r6 = bpf2a32[TMP_REG_1][1]; /* r6: void *(*func)(..) */
-		int size;
-
-		/* Setting up first argument */
-		emit(ARM_MOV_R(r0, r4), ctx);
-
-		/* Setting up second argument */
-		emit_a32_mov_i(r1, imm, false, ctx);
-		if (BPF_MODE(code) == BPF_IND)
-			emit_a32_alu_r(r1, src_lo, false, sstk, ctx,
-				       false, false, BPF_ADD);
-
-		/* Setting up third argument */
-		switch (BPF_SIZE(code)) {
-		case BPF_W:
-			size = 4;
-			break;
-		case BPF_H:
-			size = 2;
-			break;
-		case BPF_B:
-			size = 1;
-			break;
-		default:
-			return -EINVAL;
-		}
-		emit_a32_mov_i(r2, size, false, ctx);
-
-		/* Setting up fourth argument */
-		emit(ARM_ADD_I(r3, ARM_SP, imm8m(SKB_BUFFER)), ctx);
-
-		/* Setting up function pointer to call */
-		emit_a32_mov_i(r6, (unsigned int)bpf_load_pointer, false, ctx);
-		emit_blx_r(r6, ctx);
-
-		emit(ARM_EOR_R(r1, r1, r1), ctx);
-		/* Check if return address is NULL or not.
-		 * if NULL then jump to epilogue
-		 * else continue to load the value from retn address
-		 */
-		emit(ARM_CMP_I(r0, 0), ctx);
-		jmp_offset = epilogue_offset(ctx);
-		check_imm24(jmp_offset);
-		_emit(ARM_COND_EQ, ARM_B(jmp_offset), ctx);
-
-		/* Load value from the address */
-		switch (BPF_SIZE(code)) {
-		case BPF_W:
-			emit(ARM_LDR_I(r0, r0, 0), ctx);
-			emit_rev32(r0, r0, ctx);
-			break;
-		case BPF_H:
-			emit(ARM_LDRH_I(r0, r0, 0), ctx);
-			emit_rev16(r0, r0, ctx);
-			break;
-		case BPF_B:
-			emit(ARM_LDRB_I(r0, r0, 0), ctx);
-			/* No need to reverse */
-			break;
-		}
-		break;
-	}
 	/* ST: *(size *)(dst + off) = imm */
 	case BPF_ST | BPF_MEM | BPF_W:
 	case BPF_ST | BPF_MEM | BPF_H:
-- 
cgit v1.2.3-59-g8ed1b


From 4db25cc988518239a2b9fd76716a829e6deca66a Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 May 2018 01:08:20 +0200
Subject: bpf, mips64: remove ld_abs/ld_ind

Since LD_ABS/LD_IND instructions are now removed from the core and
reimplemented through a combination of inlined BPF instructions and
a slow-path helper, we can get rid of the complexity from mips64 JIT.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/mips/net/ebpf_jit.c | 104 -----------------------------------------------
 1 file changed, 104 deletions(-)

diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c
index 3e2798bfea4f..7ba7df9c28fc 100644
--- a/arch/mips/net/ebpf_jit.c
+++ b/arch/mips/net/ebpf_jit.c
@@ -1267,110 +1267,6 @@ jeq_common:
 			return -EINVAL;
 		break;
 
-	case BPF_LD | BPF_B | BPF_ABS:
-	case BPF_LD | BPF_H | BPF_ABS:
-	case BPF_LD | BPF_W | BPF_ABS:
-	case BPF_LD | BPF_DW | BPF_ABS:
-		ctx->flags |= EBPF_SAVE_RA;
-
-		gen_imm_to_reg(insn, MIPS_R_A1, ctx);
-		emit_instr(ctx, addiu, MIPS_R_A2, MIPS_R_ZERO, size_to_len(insn));
-
-		if (insn->imm < 0) {
-			emit_const_to_reg(ctx, MIPS_R_T9, (u64)bpf_internal_load_pointer_neg_helper);
-		} else {
-			emit_const_to_reg(ctx, MIPS_R_T9, (u64)ool_skb_header_pointer);
-			emit_instr(ctx, daddiu, MIPS_R_A3, MIPS_R_SP, ctx->tmp_offset);
-		}
-		goto ld_skb_common;
-
-	case BPF_LD | BPF_B | BPF_IND:
-	case BPF_LD | BPF_H | BPF_IND:
-	case BPF_LD | BPF_W | BPF_IND:
-	case BPF_LD | BPF_DW | BPF_IND:
-		ctx->flags |= EBPF_SAVE_RA;
-		src = ebpf_to_mips_reg(ctx, insn, src_reg_no_fp);
-		if (src < 0)
-			return src;
-		ts = get_reg_val_type(ctx, this_idx, insn->src_reg);
-		if (ts == REG_32BIT_ZERO_EX) {
-			/* sign extend */
-			emit_instr(ctx, sll, MIPS_R_A1, src, 0);
-			src = MIPS_R_A1;
-		}
-		if (insn->imm >= S16_MIN && insn->imm <= S16_MAX) {
-			emit_instr(ctx, daddiu, MIPS_R_A1, src, insn->imm);
-		} else {
-			gen_imm_to_reg(insn, MIPS_R_AT, ctx);
-			emit_instr(ctx, daddu, MIPS_R_A1, MIPS_R_AT, src);
-		}
-		/* truncate to 32-bit int */
-		emit_instr(ctx, sll, MIPS_R_A1, MIPS_R_A1, 0);
-		emit_instr(ctx, daddiu, MIPS_R_A3, MIPS_R_SP, ctx->tmp_offset);
-		emit_instr(ctx, slt, MIPS_R_AT, MIPS_R_A1, MIPS_R_ZERO);
-
-		emit_const_to_reg(ctx, MIPS_R_T8, (u64)bpf_internal_load_pointer_neg_helper);
-		emit_const_to_reg(ctx, MIPS_R_T9, (u64)ool_skb_header_pointer);
-		emit_instr(ctx, addiu, MIPS_R_A2, MIPS_R_ZERO, size_to_len(insn));
-		emit_instr(ctx, movn, MIPS_R_T9, MIPS_R_T8, MIPS_R_AT);
-
-ld_skb_common:
-		emit_instr(ctx, jalr, MIPS_R_RA, MIPS_R_T9);
-		/* delay slot move */
-		emit_instr(ctx, daddu, MIPS_R_A0, MIPS_R_S0, MIPS_R_ZERO);
-
-		/* Check the error value */
-		b_off = b_imm(exit_idx, ctx);
-		if (is_bad_offset(b_off)) {
-			target = j_target(ctx, exit_idx);
-			if (target == (unsigned int)-1)
-				return -E2BIG;
-
-			if (!(ctx->offsets[this_idx] & OFFSETS_B_CONV)) {
-				ctx->offsets[this_idx] |= OFFSETS_B_CONV;
-				ctx->long_b_conversion = 1;
-			}
-			emit_instr(ctx, bne, MIPS_R_V0, MIPS_R_ZERO, 4 * 3);
-			emit_instr(ctx, nop);
-			emit_instr(ctx, j, target);
-			emit_instr(ctx, nop);
-		} else {
-			emit_instr(ctx, beq, MIPS_R_V0, MIPS_R_ZERO, b_off);
-			emit_instr(ctx, nop);
-		}
-
-#ifdef __BIG_ENDIAN
-		need_swap = false;
-#else
-		need_swap = true;
-#endif
-		dst = MIPS_R_V0;
-		switch (BPF_SIZE(insn->code)) {
-		case BPF_B:
-			emit_instr(ctx, lbu, dst, 0, MIPS_R_V0);
-			break;
-		case BPF_H:
-			emit_instr(ctx, lhu, dst, 0, MIPS_R_V0);
-			if (need_swap)
-				emit_instr(ctx, wsbh, dst, dst);
-			break;
-		case BPF_W:
-			emit_instr(ctx, lw, dst, 0, MIPS_R_V0);
-			if (need_swap) {
-				emit_instr(ctx, wsbh, dst, dst);
-				emit_instr(ctx, rotr, dst, dst, 16);
-			}
-			break;
-		case BPF_DW:
-			emit_instr(ctx, ld, dst, 0, MIPS_R_V0);
-			if (need_swap) {
-				emit_instr(ctx, dsbh, dst, dst);
-				emit_instr(ctx, dshd, dst, dst);
-			}
-			break;
-		}
-
-		break;
 	case BPF_ALU | BPF_END | BPF_FROM_BE:
 	case BPF_ALU | BPF_END | BPF_FROM_LE:
 		dst = ebpf_to_mips_reg(ctx, insn, dst_reg);
-- 
cgit v1.2.3-59-g8ed1b


From dbf44daf7c88bb0b378e3cb9dc101ae0c5b33832 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 May 2018 01:08:21 +0200
Subject: bpf, ppc64: remove ld_abs/ld_ind

Since LD_ABS/LD_IND instructions are now removed from the core and
reimplemented through a combination of inlined BPF instructions and
a slow-path helper, we can get rid of the complexity from ppc64 JIT.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/powerpc/net/Makefile         |   2 +-
 arch/powerpc/net/bpf_jit64.h      |  37 ++------
 arch/powerpc/net/bpf_jit_asm64.S  | 180 --------------------------------------
 arch/powerpc/net/bpf_jit_comp64.c | 109 +----------------------
 4 files changed, 11 insertions(+), 317 deletions(-)
 delete mode 100644 arch/powerpc/net/bpf_jit_asm64.S

diff --git a/arch/powerpc/net/Makefile b/arch/powerpc/net/Makefile
index 02d369ca6a53..809f019d3cba 100644
--- a/arch/powerpc/net/Makefile
+++ b/arch/powerpc/net/Makefile
@@ -3,7 +3,7 @@
 # Arch-specific network modules
 #
 ifeq ($(CONFIG_PPC64),y)
-obj-$(CONFIG_BPF_JIT) += bpf_jit_asm64.o bpf_jit_comp64.o
+obj-$(CONFIG_BPF_JIT) += bpf_jit_comp64.o
 else
 obj-$(CONFIG_BPF_JIT) += bpf_jit_asm.o bpf_jit_comp.o
 endif
diff --git a/arch/powerpc/net/bpf_jit64.h b/arch/powerpc/net/bpf_jit64.h
index 8bdef7ed28a8..3609be4692b3 100644
--- a/arch/powerpc/net/bpf_jit64.h
+++ b/arch/powerpc/net/bpf_jit64.h
@@ -20,7 +20,7 @@
  * with our redzone usage.
  *
  *		[	prev sp		] <-------------
- *		[   nv gpr save area	] 8*8		|
+ *		[   nv gpr save area	] 6*8		|
  *		[    tail_call_cnt	] 8		|
  *		[    local_tmp_var	] 8		|
  * fp (r31) -->	[   ebpf stack space	] upto 512	|
@@ -28,8 +28,8 @@
  * sp (r1) --->	[    stack pointer	] --------------
  */
 
-/* for gpr non volatile registers BPG_REG_6 to 10, plus skb cache registers */
-#define BPF_PPC_STACK_SAVE	(8*8)
+/* for gpr non volatile registers BPG_REG_6 to 10 */
+#define BPF_PPC_STACK_SAVE	(6*8)
 /* for bpf JIT code internal usage */
 #define BPF_PPC_STACK_LOCALS	16
 /* stack frame excluding BPF stack, ensure this is quadword aligned */
@@ -39,10 +39,8 @@
 #ifndef __ASSEMBLY__
 
 /* BPF register usage */
-#define SKB_HLEN_REG	(MAX_BPF_JIT_REG + 0)
-#define SKB_DATA_REG	(MAX_BPF_JIT_REG + 1)
-#define TMP_REG_1	(MAX_BPF_JIT_REG + 2)
-#define TMP_REG_2	(MAX_BPF_JIT_REG + 3)
+#define TMP_REG_1	(MAX_BPF_JIT_REG + 0)
+#define TMP_REG_2	(MAX_BPF_JIT_REG + 1)
 
 /* BPF to ppc register mappings */
 static const int b2p[] = {
@@ -63,40 +61,23 @@ static const int b2p[] = {
 	[BPF_REG_FP] = 31,
 	/* eBPF jit internal registers */
 	[BPF_REG_AX] = 2,
-	[SKB_HLEN_REG] = 25,
-	[SKB_DATA_REG] = 26,
 	[TMP_REG_1] = 9,
 	[TMP_REG_2] = 10
 };
 
-/* PPC NVR range -- update this if we ever use NVRs below r24 */
-#define BPF_PPC_NVR_MIN		24
-
-/* Assembly helpers */
-#define DECLARE_LOAD_FUNC(func)	u64 func(u64 r3, u64 r4);			\
-				u64 func##_negative_offset(u64 r3, u64 r4);	\
-				u64 func##_positive_offset(u64 r3, u64 r4);
-
-DECLARE_LOAD_FUNC(sk_load_word);
-DECLARE_LOAD_FUNC(sk_load_half);
-DECLARE_LOAD_FUNC(sk_load_byte);
-
-#define CHOOSE_LOAD_FUNC(imm, func)						\
-			(imm < 0 ?						\
-			(imm >= SKF_LL_OFF ? func##_negative_offset : func) :	\
-			func##_positive_offset)
+/* PPC NVR range -- update this if we ever use NVRs below r27 */
+#define BPF_PPC_NVR_MIN		27
 
 #define SEEN_FUNC	0x1000 /* might call external helpers */
 #define SEEN_STACK	0x2000 /* uses BPF stack */
-#define SEEN_SKB	0x4000 /* uses sk_buff */
-#define SEEN_TAILCALL	0x8000 /* uses tail calls */
+#define SEEN_TAILCALL	0x4000 /* uses tail calls */
 
 struct codegen_context {
 	/*
 	 * This is used to track register usage as well
 	 * as calls to external helpers.
 	 * - register usage is tracked with corresponding
-	 *   bits (r3-r10 and r25-r31)
+	 *   bits (r3-r10 and r27-r31)
 	 * - rest of the bits can be used to track other
 	 *   things -- for now, we use bits 16 to 23
 	 *   encoded in SEEN_* macros above
diff --git a/arch/powerpc/net/bpf_jit_asm64.S b/arch/powerpc/net/bpf_jit_asm64.S
deleted file mode 100644
index 7e4c51430b84..000000000000
--- a/arch/powerpc/net/bpf_jit_asm64.S
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * bpf_jit_asm64.S: Packet/header access helper functions
- * for PPC64 BPF compiler.
- *
- * Copyright 2016, Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
- * 		   IBM Corporation
- *
- * Based on bpf_jit_asm.S by Matt Evans
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-
-#include <asm/ppc_asm.h>
-#include <asm/ptrace.h>
-#include "bpf_jit64.h"
-
-/*
- * All of these routines are called directly from generated code,
- * with the below register usage:
- * r27		skb pointer (ctx)
- * r25		skb header length
- * r26		skb->data pointer
- * r4		offset
- *
- * Result is passed back in:
- * r8		data read in host endian format (accumulator)
- *
- * r9 is used as a temporary register
- */
-
-#define r_skb	r27
-#define r_hlen	r25
-#define r_data	r26
-#define r_off	r4
-#define r_val	r8
-#define r_tmp	r9
-
-_GLOBAL_TOC(sk_load_word)
-	cmpdi	r_off, 0
-	blt	bpf_slow_path_word_neg
-	b	sk_load_word_positive_offset
-
-_GLOBAL_TOC(sk_load_word_positive_offset)
-	/* Are we accessing past headlen? */
-	subi	r_tmp, r_hlen, 4
-	cmpd	r_tmp, r_off
-	blt	bpf_slow_path_word
-	/* Nope, just hitting the header.  cr0 here is eq or gt! */
-	LWZX_BE	r_val, r_data, r_off
-	blr	/* Return success, cr0 != LT */
-
-_GLOBAL_TOC(sk_load_half)
-	cmpdi	r_off, 0
-	blt	bpf_slow_path_half_neg
-	b	sk_load_half_positive_offset
-
-_GLOBAL_TOC(sk_load_half_positive_offset)
-	subi	r_tmp, r_hlen, 2
-	cmpd	r_tmp, r_off
-	blt	bpf_slow_path_half
-	LHZX_BE	r_val, r_data, r_off
-	blr
-
-_GLOBAL_TOC(sk_load_byte)
-	cmpdi	r_off, 0
-	blt	bpf_slow_path_byte_neg
-	b	sk_load_byte_positive_offset
-
-_GLOBAL_TOC(sk_load_byte_positive_offset)
-	cmpd	r_hlen, r_off
-	ble	bpf_slow_path_byte
-	lbzx	r_val, r_data, r_off
-	blr
-
-/*
- * Call out to skb_copy_bits:
- * Allocate a new stack frame here to remain ABI-compliant in
- * stashing LR.
- */
-#define bpf_slow_path_common(SIZE)					\
-	mflr	r0;							\
-	std	r0, PPC_LR_STKOFF(r1);					\
-	stdu	r1, -(STACK_FRAME_MIN_SIZE + BPF_PPC_STACK_LOCALS)(r1);	\
-	mr	r3, r_skb;						\
-	/* r4 = r_off as passed */					\
-	addi	r5, r1, STACK_FRAME_MIN_SIZE;				\
-	li	r6, SIZE;						\
-	bl	skb_copy_bits;						\
-	nop;								\
-	/* save r5 */							\
-	addi	r5, r1, STACK_FRAME_MIN_SIZE;				\
-	/* r3 = 0 on success */						\
-	addi	r1, r1, STACK_FRAME_MIN_SIZE + BPF_PPC_STACK_LOCALS;	\
-	ld	r0, PPC_LR_STKOFF(r1);					\
-	mtlr	r0;							\
-	cmpdi	r3, 0;							\
-	blt	bpf_error;	/* cr0 = LT */
-
-bpf_slow_path_word:
-	bpf_slow_path_common(4)
-	/* Data value is on stack, and cr0 != LT */
-	LWZX_BE	r_val, 0, r5
-	blr
-
-bpf_slow_path_half:
-	bpf_slow_path_common(2)
-	LHZX_BE	r_val, 0, r5
-	blr
-
-bpf_slow_path_byte:
-	bpf_slow_path_common(1)
-	lbzx	r_val, 0, r5
-	blr
-
-/*
- * Call out to bpf_internal_load_pointer_neg_helper
- */
-#define sk_negative_common(SIZE)				\
-	mflr	r0;						\
-	std	r0, PPC_LR_STKOFF(r1);				\
-	stdu	r1, -STACK_FRAME_MIN_SIZE(r1);			\
-	mr	r3, r_skb;					\
-	/* r4 = r_off, as passed */				\
-	li	r5, SIZE;					\
-	bl	bpf_internal_load_pointer_neg_helper;		\
-	nop;							\
-	addi	r1, r1, STACK_FRAME_MIN_SIZE;			\
-	ld	r0, PPC_LR_STKOFF(r1);				\
-	mtlr	r0;						\
-	/* R3 != 0 on success */				\
-	cmpldi	r3, 0;						\
-	beq	bpf_error_slow;	/* cr0 = EQ */
-
-bpf_slow_path_word_neg:
-	lis     r_tmp, -32	/* SKF_LL_OFF */
-	cmpd	r_off, r_tmp	/* addr < SKF_* */
-	blt	bpf_error	/* cr0 = LT */
-	b	sk_load_word_negative_offset
-
-_GLOBAL_TOC(sk_load_word_negative_offset)
-	sk_negative_common(4)
-	LWZX_BE	r_val, 0, r3
-	blr
-
-bpf_slow_path_half_neg:
-	lis     r_tmp, -32	/* SKF_LL_OFF */
-	cmpd	r_off, r_tmp	/* addr < SKF_* */
-	blt	bpf_error	/* cr0 = LT */
-	b	sk_load_half_negative_offset
-
-_GLOBAL_TOC(sk_load_half_negative_offset)
-	sk_negative_common(2)
-	LHZX_BE	r_val, 0, r3
-	blr
-
-bpf_slow_path_byte_neg:
-	lis     r_tmp, -32	/* SKF_LL_OFF */
-	cmpd	r_off, r_tmp	/* addr < SKF_* */
-	blt	bpf_error	/* cr0 = LT */
-	b	sk_load_byte_negative_offset
-
-_GLOBAL_TOC(sk_load_byte_negative_offset)
-	sk_negative_common(1)
-	lbzx	r_val, 0, r3
-	blr
-
-bpf_error_slow:
-	/* fabricate a cr0 = lt */
-	li	r_tmp, -1
-	cmpdi	r_tmp, 0
-bpf_error:
-	/*
-	 * Entered with cr0 = lt
-	 * Generated code will 'blt epilogue', returning 0.
-	 */
-	li	r_val, 0
-	blr
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 0ef3d9580e98..1bdb1aff0619 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -59,7 +59,7 @@ static inline bool bpf_has_stack_frame(struct codegen_context *ctx)
  *		[	prev sp		] <-------------
  *		[	  ...       	] 		|
  * sp (r1) --->	[    stack pointer	] --------------
- *		[   nv gpr save area	] 8*8
+ *		[   nv gpr save area	] 6*8
  *		[    tail_call_cnt	] 8
  *		[    local_tmp_var	] 8
  *		[   unused red zone	] 208 bytes protected
@@ -88,21 +88,6 @@ static int bpf_jit_stack_offsetof(struct codegen_context *ctx, int reg)
 	BUG();
 }
 
-static void bpf_jit_emit_skb_loads(u32 *image, struct codegen_context *ctx)
-{
-	/*
-	 * Load skb->len and skb->data_len
-	 * r3 points to skb
-	 */
-	PPC_LWZ(b2p[SKB_HLEN_REG], 3, offsetof(struct sk_buff, len));
-	PPC_LWZ(b2p[TMP_REG_1], 3, offsetof(struct sk_buff, data_len));
-	/* header_len = len - data_len */
-	PPC_SUB(b2p[SKB_HLEN_REG], b2p[SKB_HLEN_REG], b2p[TMP_REG_1]);
-
-	/* skb->data pointer */
-	PPC_BPF_LL(b2p[SKB_DATA_REG], 3, offsetof(struct sk_buff, data));
-}
-
 static void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx)
 {
 	int i;
@@ -145,18 +130,6 @@ static void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx)
 		if (bpf_is_seen_register(ctx, i))
 			PPC_BPF_STL(b2p[i], 1, bpf_jit_stack_offsetof(ctx, b2p[i]));
 
-	/*
-	 * Save additional non-volatile regs if we cache skb
-	 * Also, setup skb data
-	 */
-	if (ctx->seen & SEEN_SKB) {
-		PPC_BPF_STL(b2p[SKB_HLEN_REG], 1,
-				bpf_jit_stack_offsetof(ctx, b2p[SKB_HLEN_REG]));
-		PPC_BPF_STL(b2p[SKB_DATA_REG], 1,
-				bpf_jit_stack_offsetof(ctx, b2p[SKB_DATA_REG]));
-		bpf_jit_emit_skb_loads(image, ctx);
-	}
-
 	/* Setup frame pointer to point to the bpf stack area */
 	if (bpf_is_seen_register(ctx, BPF_REG_FP))
 		PPC_ADDI(b2p[BPF_REG_FP], 1,
@@ -172,14 +145,6 @@ static void bpf_jit_emit_common_epilogue(u32 *image, struct codegen_context *ctx
 		if (bpf_is_seen_register(ctx, i))
 			PPC_BPF_LL(b2p[i], 1, bpf_jit_stack_offsetof(ctx, b2p[i]));
 
-	/* Restore non-volatile registers used for skb cache */
-	if (ctx->seen & SEEN_SKB) {
-		PPC_BPF_LL(b2p[SKB_HLEN_REG], 1,
-				bpf_jit_stack_offsetof(ctx, b2p[SKB_HLEN_REG]));
-		PPC_BPF_LL(b2p[SKB_DATA_REG], 1,
-				bpf_jit_stack_offsetof(ctx, b2p[SKB_DATA_REG]));
-	}
-
 	/* Tear down our stack frame */
 	if (bpf_has_stack_frame(ctx)) {
 		PPC_ADDI(1, 1, BPF_PPC_STACKFRAME + ctx->stack_size);
@@ -753,23 +718,10 @@ emit_clear:
 			ctx->seen |= SEEN_FUNC;
 			func = (u8 *) __bpf_call_base + imm;
 
-			/* Save skb pointer if we need to re-cache skb data */
-			if ((ctx->seen & SEEN_SKB) &&
-			    bpf_helper_changes_pkt_data(func))
-				PPC_BPF_STL(3, 1, bpf_jit_stack_local(ctx));
-
 			bpf_jit_emit_func_call(image, ctx, (u64)func);
 
 			/* move return value from r3 to BPF_REG_0 */
 			PPC_MR(b2p[BPF_REG_0], 3);
-
-			/* refresh skb cache */
-			if ((ctx->seen & SEEN_SKB) &&
-			    bpf_helper_changes_pkt_data(func)) {
-				/* reload skb pointer to r3 */
-				PPC_BPF_LL(3, 1, bpf_jit_stack_local(ctx));
-				bpf_jit_emit_skb_loads(image, ctx);
-			}
 			break;
 
 		/*
@@ -886,65 +838,6 @@ cond_branch:
 			PPC_BCC(true_cond, addrs[i + 1 + off]);
 			break;
 
-		/*
-		 * Loads from packet header/data
-		 * Assume 32-bit input value in imm and X (src_reg)
-		 */
-
-		/* Absolute loads */
-		case BPF_LD | BPF_W | BPF_ABS:
-			func = (u8 *)CHOOSE_LOAD_FUNC(imm, sk_load_word);
-			goto common_load_abs;
-		case BPF_LD | BPF_H | BPF_ABS:
-			func = (u8 *)CHOOSE_LOAD_FUNC(imm, sk_load_half);
-			goto common_load_abs;
-		case BPF_LD | BPF_B | BPF_ABS:
-			func = (u8 *)CHOOSE_LOAD_FUNC(imm, sk_load_byte);
-common_load_abs:
-			/*
-			 * Load from [imm]
-			 * Load into r4, which can just be passed onto
-			 *  skb load helpers as the second parameter
-			 */
-			PPC_LI32(4, imm);
-			goto common_load;
-
-		/* Indirect loads */
-		case BPF_LD | BPF_W | BPF_IND:
-			func = (u8 *)sk_load_word;
-			goto common_load_ind;
-		case BPF_LD | BPF_H | BPF_IND:
-			func = (u8 *)sk_load_half;
-			goto common_load_ind;
-		case BPF_LD | BPF_B | BPF_IND:
-			func = (u8 *)sk_load_byte;
-common_load_ind:
-			/*
-			 * Load from [src_reg + imm]
-			 * Treat src_reg as a 32-bit value
-			 */
-			PPC_EXTSW(4, src_reg);
-			if (imm) {
-				if (imm >= -32768 && imm < 32768)
-					PPC_ADDI(4, 4, IMM_L(imm));
-				else {
-					PPC_LI32(b2p[TMP_REG_1], imm);
-					PPC_ADD(4, 4, b2p[TMP_REG_1]);
-				}
-			}
-
-common_load:
-			ctx->seen |= SEEN_SKB;
-			ctx->seen |= SEEN_FUNC;
-			bpf_jit_emit_func_call(image, ctx, (u64)func);
-
-			/*
-			 * Helper returns 'lt' condition on error, and an
-			 * appropriate return value in BPF_REG_0
-			 */
-			PPC_BCC(COND_LT, exit_addr);
-			break;
-
 		/*
 		 * Tail call
 		 */
-- 
cgit v1.2.3-59-g8ed1b


From e1cf4befa297b149149f633eff746593e400c030 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 May 2018 01:08:22 +0200
Subject: bpf, s390x: remove ld_abs/ld_ind

Since LD_ABS/LD_IND instructions are now removed from the core and
reimplemented through a combination of inlined BPF instructions and
a slow-path helper, we can get rid of the complexity from s390x JIT.
Tested on s390x instance on LinuxONE.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/s390/net/Makefile       |   2 +-
 arch/s390/net/bpf_jit.S      | 116 ---------------------------------------
 arch/s390/net/bpf_jit.h      |  20 +------
 arch/s390/net/bpf_jit_comp.c | 127 ++++---------------------------------------
 4 files changed, 13 insertions(+), 252 deletions(-)
 delete mode 100644 arch/s390/net/bpf_jit.S

diff --git a/arch/s390/net/Makefile b/arch/s390/net/Makefile
index e0d5f245e42b..d4663b4bf509 100644
--- a/arch/s390/net/Makefile
+++ b/arch/s390/net/Makefile
@@ -2,4 +2,4 @@
 #
 # Arch-specific network modules
 #
-obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o
+obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o
diff --git a/arch/s390/net/bpf_jit.S b/arch/s390/net/bpf_jit.S
deleted file mode 100644
index 25bb4643c4f4..000000000000
--- a/arch/s390/net/bpf_jit.S
+++ /dev/null
@@ -1,116 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * BPF Jit compiler for s390, help functions.
- *
- * Copyright IBM Corp. 2012,2015
- *
- * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
- *	      Michael Holzheu <holzheu@linux.vnet.ibm.com>
- */
-
-#include <linux/linkage.h>
-#include "bpf_jit.h"
-
-/*
- * Calling convention:
- * registers %r7-%r10, %r11,%r13, and %r15 are call saved
- *
- * Input (64 bit):
- *   %r3 (%b2) = offset into skb data
- *   %r6 (%b5) = return address
- *   %r7 (%b6) = skb pointer
- *   %r12      = skb data pointer
- *
- * Output:
- *   %r14= %b0 = return value (read skb value)
- *
- * Work registers: %r2,%r4,%r5,%r14
- *
- * skb_copy_bits takes 4 parameters:
- *   %r2 = skb pointer
- *   %r3 = offset into skb data
- *   %r4 = pointer to temp buffer
- *   %r5 = length to copy
- *   Return value in %r2: 0 = ok
- *
- * bpf_internal_load_pointer_neg_helper takes 3 parameters:
- *   %r2 = skb pointer
- *   %r3 = offset into data
- *   %r4 = length to copy
- *   Return value in %r2: Pointer to data
- */
-
-#define SKF_MAX_NEG_OFF	-0x200000	/* SKF_LL_OFF from filter.h */
-
-/*
- * Load SIZE bytes from SKB
- */
-#define sk_load_common(NAME, SIZE, LOAD)				\
-ENTRY(sk_load_##NAME);							\
-	ltgr	%r3,%r3;		/* Is offset negative? */	\
-	jl	sk_load_##NAME##_slow_neg;				\
-ENTRY(sk_load_##NAME##_pos);						\
-	aghi	%r3,SIZE;		/* Offset + SIZE */		\
-	clg	%r3,STK_OFF_HLEN(%r15);	/* Offset + SIZE > hlen? */	\
-	jh	sk_load_##NAME##_slow;					\
-	LOAD	%r14,-SIZE(%r3,%r12);	/* Get data from skb */		\
-	b	OFF_OK(%r6);		/* Return */			\
-									\
-sk_load_##NAME##_slow:;							\
-	lgr	%r2,%r7;		/* Arg1 = skb pointer */	\
-	aghi	%r3,-SIZE;		/* Arg2 = offset */		\
-	la	%r4,STK_OFF_TMP(%r15);	/* Arg3 = temp bufffer */	\
-	lghi	%r5,SIZE;		/* Arg4 = size */		\
-	brasl	%r14,skb_copy_bits;	/* Get data from skb */		\
-	LOAD	%r14,STK_OFF_TMP(%r15);	/* Load from temp bufffer */	\
-	ltgr	%r2,%r2;		/* Set cc to (%r2 != 0) */	\
-	br	%r6;			/* Return */
-
-sk_load_common(word, 4, llgf)	/* r14 = *(u32 *) (skb->data+offset) */
-sk_load_common(half, 2, llgh)	/* r14 = *(u16 *) (skb->data+offset) */
-
-/*
- * Load 1 byte from SKB (optimized version)
- */
-	/* r14 = *(u8 *) (skb->data+offset) */
-ENTRY(sk_load_byte)
-	ltgr	%r3,%r3			# Is offset negative?
-	jl	sk_load_byte_slow_neg
-ENTRY(sk_load_byte_pos)
-	clg	%r3,STK_OFF_HLEN(%r15)	# Offset >= hlen?
-	jnl	sk_load_byte_slow
-	llgc	%r14,0(%r3,%r12)	# Get byte from skb
-	b	OFF_OK(%r6)		# Return OK
-
-sk_load_byte_slow:
-	lgr	%r2,%r7			# Arg1 = skb pointer
-					# Arg2 = offset
-	la	%r4,STK_OFF_TMP(%r15)	# Arg3 = pointer to temp buffer
-	lghi	%r5,1			# Arg4 = size (1 byte)
-	brasl	%r14,skb_copy_bits	# Get data from skb
-	llgc	%r14,STK_OFF_TMP(%r15)	# Load result from temp buffer
-	ltgr	%r2,%r2			# Set cc to (%r2 != 0)
-	br	%r6			# Return cc
-
-#define sk_negative_common(NAME, SIZE, LOAD)				\
-sk_load_##NAME##_slow_neg:;						\
-	cgfi	%r3,SKF_MAX_NEG_OFF;					\
-	jl	bpf_error;						\
-	lgr	%r2,%r7;		/* Arg1 = skb pointer */	\
-					/* Arg2 = offset */		\
-	lghi	%r4,SIZE;		/* Arg3 = size */		\
-	brasl	%r14,bpf_internal_load_pointer_neg_helper;		\
-	ltgr	%r2,%r2;						\
-	jz	bpf_error;						\
-	LOAD	%r14,0(%r2);		/* Get data from pointer */	\
-	xr	%r3,%r3;		/* Set cc to zero */		\
-	br	%r6;			/* Return cc */
-
-sk_negative_common(word, 4, llgf)
-sk_negative_common(half, 2, llgh)
-sk_negative_common(byte, 1, llgc)
-
-bpf_error:
-# force a return 0 from jit handler
-	ltgr	%r15,%r15	# Set condition code
-	br	%r6
diff --git a/arch/s390/net/bpf_jit.h b/arch/s390/net/bpf_jit.h
index 5e1e5133132d..7822ea92e54a 100644
--- a/arch/s390/net/bpf_jit.h
+++ b/arch/s390/net/bpf_jit.h
@@ -16,9 +16,6 @@
 #include <linux/filter.h>
 #include <linux/types.h>
 
-extern u8 sk_load_word_pos[], sk_load_half_pos[], sk_load_byte_pos[];
-extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
-
 #endif /* __ASSEMBLY__ */
 
 /*
@@ -36,15 +33,6 @@ extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
  *	      |		      |     |
  *	      |   BPF stack   |     |
  *	      |		      |     |
- *	      +---------------+     |
- *	      | 8 byte skbp   |     |
- * R15+176 -> +---------------+     |
- *	      | 8 byte hlen   |     |
- * R15+168 -> +---------------+     |
- *	      | 4 byte align  |     |
- *	      +---------------+     |
- *	      | 4 byte temp   |     |
- *	      | for bpf_jit.S |     |
  * R15+160 -> +---------------+     |
  *	      | new backchain |     |
  * R15+152 -> +---------------+     |
@@ -57,17 +45,11 @@ extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
  * The stack size used by the BPF program ("BPF stack" above) is passed
  * via "aux->stack_depth".
  */
-#define STK_SPACE_ADD (8 + 8 + 4 + 4 + 160)
+#define STK_SPACE_ADD	(160)
 #define STK_160_UNUSED	(160 - 12 * 8)
 #define STK_OFF		(STK_SPACE_ADD - STK_160_UNUSED)
-#define STK_OFF_TMP	160	/* Offset of tmp buffer on stack */
-#define STK_OFF_HLEN	168	/* Offset of SKB header length on stack */
-#define STK_OFF_SKBP	176	/* Offset of SKB pointer on stack */
 
 #define STK_OFF_R6	(160 - 11 * 8)	/* Offset of r6 on stack */
 #define STK_OFF_TCCNT	(160 - 12 * 8)	/* Offset of tail_call_cnt on stack */
 
-/* Offset to skip condition code check */
-#define OFF_OK		4
-
 #endif /* __ARCH_S390_NET_BPF_JIT_H */
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 78a19c93b380..b020bea040b7 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -47,23 +47,21 @@ struct bpf_jit {
 
 #define BPF_SIZE_MAX	0xffff	/* Max size for program (16 bit branches) */
 
-#define SEEN_SKB	1	/* skb access */
-#define SEEN_MEM	2	/* use mem[] for temporary storage */
-#define SEEN_RET0	4	/* ret0_ip points to a valid return 0 */
-#define SEEN_LITERAL	8	/* code uses literals */
-#define SEEN_FUNC	16	/* calls C functions */
-#define SEEN_TAIL_CALL	32	/* code uses tail calls */
-#define SEEN_REG_AX	64	/* code uses constant blinding */
-#define SEEN_STACK	(SEEN_FUNC | SEEN_MEM | SEEN_SKB)
+#define SEEN_MEM	(1 << 0)	/* use mem[] for temporary storage */
+#define SEEN_RET0	(1 << 1)	/* ret0_ip points to a valid return 0 */
+#define SEEN_LITERAL	(1 << 2)	/* code uses literals */
+#define SEEN_FUNC	(1 << 3)	/* calls C functions */
+#define SEEN_TAIL_CALL	(1 << 4)	/* code uses tail calls */
+#define SEEN_REG_AX	(1 << 5)	/* code uses constant blinding */
+#define SEEN_STACK	(SEEN_FUNC | SEEN_MEM)
 
 /*
  * s390 registers
  */
 #define REG_W0		(MAX_BPF_JIT_REG + 0)	/* Work register 1 (even) */
 #define REG_W1		(MAX_BPF_JIT_REG + 1)	/* Work register 2 (odd) */
-#define REG_SKB_DATA	(MAX_BPF_JIT_REG + 2)	/* SKB data register */
-#define REG_L		(MAX_BPF_JIT_REG + 3)	/* Literal pool register */
-#define REG_15		(MAX_BPF_JIT_REG + 4)	/* Register 15 */
+#define REG_L		(MAX_BPF_JIT_REG + 2)	/* Literal pool register */
+#define REG_15		(MAX_BPF_JIT_REG + 3)	/* Register 15 */
 #define REG_0		REG_W0			/* Register 0 */
 #define REG_1		REG_W1			/* Register 1 */
 #define REG_2		BPF_REG_1		/* Register 2 */
@@ -88,10 +86,8 @@ static const int reg2hex[] = {
 	[BPF_REG_9]	= 10,
 	/* BPF stack pointer */
 	[BPF_REG_FP]	= 13,
-	/* Register for blinding (shared with REG_SKB_DATA) */
+	/* Register for blinding */
 	[BPF_REG_AX]	= 12,
-	/* SKB data pointer */
-	[REG_SKB_DATA]	= 12,
 	/* Work registers for s390x backend */
 	[REG_W0]	= 0,
 	[REG_W1]	= 1,
@@ -384,27 +380,6 @@ static void save_restore_regs(struct bpf_jit *jit, int op, u32 stack_depth)
 	} while (re <= 15);
 }
 
-/*
- * For SKB access %b1 contains the SKB pointer. For "bpf_jit.S"
- * we store the SKB header length on the stack and the SKB data
- * pointer in REG_SKB_DATA if BPF_REG_AX is not used.
- */
-static void emit_load_skb_data_hlen(struct bpf_jit *jit)
-{
-	/* Header length: llgf %w1,<len>(%b1) */
-	EMIT6_DISP_LH(0xe3000000, 0x0016, REG_W1, REG_0, BPF_REG_1,
-		      offsetof(struct sk_buff, len));
-	/* s %w1,<data_len>(%b1) */
-	EMIT4_DISP(0x5b000000, REG_W1, BPF_REG_1,
-		   offsetof(struct sk_buff, data_len));
-	/* stg %w1,ST_OFF_HLEN(%r0,%r15) */
-	EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0, REG_15, STK_OFF_HLEN);
-	if (!(jit->seen & SEEN_REG_AX))
-		/* lg %skb_data,data_off(%b1) */
-		EMIT6_DISP_LH(0xe3000000, 0x0004, REG_SKB_DATA, REG_0,
-			      BPF_REG_1, offsetof(struct sk_buff, data));
-}
-
 /*
  * Emit function prologue
  *
@@ -445,12 +420,6 @@ static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth)
 			EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0,
 				      REG_15, 152);
 	}
-	if (jit->seen & SEEN_SKB) {
-		emit_load_skb_data_hlen(jit);
-		/* stg %b1,ST_OFF_SKBP(%r0,%r15) */
-		EMIT6_DISP_LH(0xe3000000, 0x0024, BPF_REG_1, REG_0, REG_15,
-			      STK_OFF_SKBP);
-	}
 }
 
 /*
@@ -483,12 +452,12 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
 {
 	struct bpf_insn *insn = &fp->insnsi[i];
 	int jmp_off, last, insn_count = 1;
-	unsigned int func_addr, mask;
 	u32 dst_reg = insn->dst_reg;
 	u32 src_reg = insn->src_reg;
 	u32 *addrs = jit->addrs;
 	s32 imm = insn->imm;
 	s16 off = insn->off;
+	unsigned int mask;
 
 	if (dst_reg == BPF_REG_AX || src_reg == BPF_REG_AX)
 		jit->seen |= SEEN_REG_AX;
@@ -970,13 +939,6 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
 		EMIT2(0x0d00, REG_14, REG_W1);
 		/* lgr %b0,%r2: load return value into %b0 */
 		EMIT4(0xb9040000, BPF_REG_0, REG_2);
-		if ((jit->seen & SEEN_SKB) &&
-		    bpf_helper_changes_pkt_data((void *)func)) {
-			/* lg %b1,ST_OFF_SKBP(%r15) */
-			EMIT6_DISP_LH(0xe3000000, 0x0004, BPF_REG_1, REG_0,
-				      REG_15, STK_OFF_SKBP);
-			emit_load_skb_data_hlen(jit);
-		}
 		break;
 	}
 	case BPF_JMP | BPF_TAIL_CALL:
@@ -1176,73 +1138,6 @@ branch_oc:
 		jmp_off = addrs[i + off + 1] - (addrs[i + 1] - 4);
 		EMIT4_PCREL(0xa7040000 | mask << 8, jmp_off);
 		break;
-	/*
-	 * BPF_LD
-	 */
-	case BPF_LD | BPF_ABS | BPF_B: /* b0 = *(u8 *) (skb->data+imm) */
-	case BPF_LD | BPF_IND | BPF_B: /* b0 = *(u8 *) (skb->data+imm+src) */
-		if ((BPF_MODE(insn->code) == BPF_ABS) && (imm >= 0))
-			func_addr = __pa(sk_load_byte_pos);
-		else
-			func_addr = __pa(sk_load_byte);
-		goto call_fn;
-	case BPF_LD | BPF_ABS | BPF_H: /* b0 = *(u16 *) (skb->data+imm) */
-	case BPF_LD | BPF_IND | BPF_H: /* b0 = *(u16 *) (skb->data+imm+src) */
-		if ((BPF_MODE(insn->code) == BPF_ABS) && (imm >= 0))
-			func_addr = __pa(sk_load_half_pos);
-		else
-			func_addr = __pa(sk_load_half);
-		goto call_fn;
-	case BPF_LD | BPF_ABS | BPF_W: /* b0 = *(u32 *) (skb->data+imm) */
-	case BPF_LD | BPF_IND | BPF_W: /* b0 = *(u32 *) (skb->data+imm+src) */
-		if ((BPF_MODE(insn->code) == BPF_ABS) && (imm >= 0))
-			func_addr = __pa(sk_load_word_pos);
-		else
-			func_addr = __pa(sk_load_word);
-		goto call_fn;
-call_fn:
-		jit->seen |= SEEN_SKB | SEEN_RET0 | SEEN_FUNC;
-		REG_SET_SEEN(REG_14); /* Return address of possible func call */
-
-		/*
-		 * Implicit input:
-		 *  BPF_REG_6	 (R7) : skb pointer
-		 *  REG_SKB_DATA (R12): skb data pointer (if no BPF_REG_AX)
-		 *
-		 * Calculated input:
-		 *  BPF_REG_2	 (R3) : offset of byte(s) to fetch in skb
-		 *  BPF_REG_5	 (R6) : return address
-		 *
-		 * Output:
-		 *  BPF_REG_0	 (R14): data read from skb
-		 *
-		 * Scratch registers (BPF_REG_1-5)
-		 */
-
-		/* Call function: llilf %w1,func_addr  */
-		EMIT6_IMM(0xc00f0000, REG_W1, func_addr);
-
-		/* Offset: lgfi %b2,imm */
-		EMIT6_IMM(0xc0010000, BPF_REG_2, imm);
-		if (BPF_MODE(insn->code) == BPF_IND)
-			/* agfr %b2,%src (%src is s32 here) */
-			EMIT4(0xb9180000, BPF_REG_2, src_reg);
-
-		/* Reload REG_SKB_DATA if BPF_REG_AX is used */
-		if (jit->seen & SEEN_REG_AX)
-			/* lg %skb_data,data_off(%b6) */
-			EMIT6_DISP_LH(0xe3000000, 0x0004, REG_SKB_DATA, REG_0,
-				      BPF_REG_6, offsetof(struct sk_buff, data));
-		/* basr %b5,%w1 (%b5 is call saved) */
-		EMIT2(0x0d00, BPF_REG_5, REG_W1);
-
-		/*
-		 * Note: For fast access we jump directly after the
-		 * jnz instruction from bpf_jit.S
-		 */
-		/* jnz <ret0> */
-		EMIT4_PCREL(0xa7740000, jit->ret0_ip - jit->prg);
-		break;
 	default: /* too complex, give up */
 		pr_err("Unknown opcode %02x\n", insn->code);
 		return -1;
-- 
cgit v1.2.3-59-g8ed1b


From 24dea04767e6e5175f4750770281b0c17ac6a2fb Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 May 2018 01:08:23 +0200
Subject: bpf, x32: remove ld_abs/ld_ind

Since LD_ABS/LD_IND instructions are now removed from the core and
reimplemented through a combination of inlined BPF instructions and
a slow-path helper, we can get rid of the complexity from x32 JIT.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp32.c | 136 +-----------------------------------------
 1 file changed, 1 insertion(+), 135 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index 61e61341b777..0cc04e30adc1 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -175,19 +175,13 @@ static const u8 bpf2ia32[][2] = {
 #define SCRATCH_SIZE 96
 
 /* Total stack size used in JITed code */
-#define _STACK_SIZE \
-	(stack_depth + \
-	 + SCRATCH_SIZE + \
-	 + 4 /* Extra space for skb_copy_bits buffer */)
+#define _STACK_SIZE	(stack_depth + SCRATCH_SIZE)
 
 #define STACK_SIZE ALIGN(_STACK_SIZE, STACK_ALIGNMENT)
 
 /* Get the offset of eBPF REGISTERs stored on scratch space. */
 #define STACK_VAR(off) (off)
 
-/* Offset of skb_copy_bits buffer */
-#define SKB_BUFFER STACK_VAR(SCRATCH_SIZE)
-
 /* Encode 'dst_reg' register into IA32 opcode 'byte' */
 static u8 add_1reg(u8 byte, u32 dst_reg)
 {
@@ -2276,134 +2270,6 @@ emit_jmp:
 				return -EFAULT;
 			}
 			break;
-
-		case BPF_LD | BPF_ABS | BPF_W:
-		case BPF_LD | BPF_ABS | BPF_H:
-		case BPF_LD | BPF_ABS | BPF_B:
-		case BPF_LD | BPF_IND | BPF_W:
-		case BPF_LD | BPF_IND | BPF_H:
-		case BPF_LD | BPF_IND | BPF_B:
-		{
-			int size;
-			const u8 *r6 = bpf2ia32[BPF_REG_6];
-
-			/* Setting up first argument */
-			/* mov eax,dword ptr [ebp+off] */
-			EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
-			      STACK_VAR(r6[0]));
-
-			/* Setting up second argument */
-			if (BPF_MODE(code) == BPF_ABS) {
-				/* mov %edx, imm32 */
-				EMIT1_off32(0xBA, imm32);
-			} else {
-				if (sstk)
-					/* mov edx,dword ptr [ebp+off] */
-					EMIT3(0x8B, add_2reg(0x40, IA32_EBP,
-							     IA32_EDX),
-					      STACK_VAR(src_lo));
-				else
-					/* mov edx,src_lo */
-					EMIT2(0x8B, add_2reg(0xC0, src_lo,
-							     IA32_EDX));
-				if (imm32) {
-					if (is_imm8(imm32))
-						/* add %edx,imm8 */
-						EMIT3(0x83, 0xC2, imm32);
-					else
-						/* add %edx,imm32 */
-						EMIT2_off32(0x81, 0xC2, imm32);
-				}
-			}
-
-			/* Setting up third argument */
-			switch (BPF_SIZE(code)) {
-			case BPF_W:
-				size = 4;
-				break;
-			case BPF_H:
-				size = 2;
-				break;
-			case BPF_B:
-				size = 1;
-				break;
-			default:
-				return -EINVAL;
-			}
-			/* mov ecx,val */
-			EMIT2(0xB1, size);
-			/* movzx ecx,ecx */
-			EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX));
-
-			/* mov ebx,ebp */
-			EMIT2(0x8B, add_2reg(0xC0, IA32_EBP, IA32_EBX));
-			/* add %ebx,imm8 */
-			EMIT3(0x83, add_1reg(0xC0, IA32_EBX), SKB_BUFFER);
-			/* push ebx */
-			EMIT1(0x53);
-
-			/* Setting up function pointer to call */
-			/* mov ebx,imm32*/
-			EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX),
-				    (unsigned int)bpf_load_pointer);
-
-			EMIT2(0xFF, add_1reg(0xD0, IA32_EBX));
-			/* add %esp,4 */
-			EMIT3(0x83, add_1reg(0xC0, IA32_ESP), 4);
-			/* xor edx,edx */
-			EMIT2(0x33, add_2reg(0xC0, IA32_EDX, IA32_EDX));
-
-			/* mov dword ptr [ebp+off],eax */
-			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
-			      STACK_VAR(r0[0]));
-			/* mov dword ptr [ebp+off],edx */
-			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
-			      STACK_VAR(r0[1]));
-
-			/*
-			 * Check if return address is NULL or not.
-			 * If NULL then jump to epilogue else continue
-			 * to load the value from retn address
-			 */
-			EMIT3(0x83, add_1reg(0xF8, IA32_EAX), 0);
-			jmp_offset = ctx->cleanup_addr - addrs[i];
-
-			switch (BPF_SIZE(code)) {
-			case BPF_W:
-				jmp_offset += 7;
-				break;
-			case BPF_H:
-				jmp_offset += 10;
-				break;
-			case BPF_B:
-				jmp_offset += 6;
-				break;
-			}
-
-			EMIT2_off32(0x0F, IA32_JE + 0x10, jmp_offset);
-			/* Load value from the address */
-			switch (BPF_SIZE(code)) {
-			case BPF_W:
-				/* mov eax,[eax] */
-				EMIT2(0x8B, 0x0);
-				/* Emit 'bswap eax' */
-				EMIT2(0x0F, add_1reg(0xC8, IA32_EAX));
-				break;
-			case BPF_H:
-				EMIT3(0x0F, 0xB7, 0x0);
-				EMIT1(0x66);
-				EMIT3(0xC1, add_1reg(0xC8, IA32_EAX), 8);
-				break;
-			case BPF_B:
-				EMIT3(0x0F, 0xB6, 0x0);
-				break;
-			}
-
-			/* mov dword ptr [ebp+off],eax */
-			EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
-			      STACK_VAR(r0[0]));
-			break;
-		}
 		/* STX XADD: lock *(u32 *)(dst + off) += src */
 		case BPF_STX | BPF_XADD | BPF_W:
 		/* STX XADD: lock *(u64 *)(dst + off) += src */
-- 
cgit v1.2.3-59-g8ed1b


From 32b3652c307ef62f624182fac1fd6328ccc8fcbe Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 May 2018 01:08:24 +0200
Subject: bpf: sync tools bpf.h uapi header

Only sync the header from include/uapi/linux/bpf.h.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/include/uapi/linux/bpf.h | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 8daef7326bb7..83a95ae388dd 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1801,6 +1801,30 @@ union bpf_attr {
  * 	Return
  * 		a non-negative value equal to or less than size on success, or
  * 		a negative error in case of failure.
+ *
+ * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header)
+ * 	Description
+ * 		This helper is similar to **bpf_skb_load_bytes**\ () in that
+ * 		it provides an easy way to load *len* bytes from *offset*
+ * 		from the packet associated to *skb*, into the buffer pointed
+ * 		by *to*. The difference to **bpf_skb_load_bytes**\ () is that
+ * 		a fifth argument *start_header* exists in order to select a
+ * 		base offset to start from. *start_header* can be one of:
+ *
+ * 		**BPF_HDR_START_MAC**
+ * 			Base offset to load data from is *skb*'s mac header.
+ * 		**BPF_HDR_START_NET**
+ * 			Base offset to load data from is *skb*'s network header.
+ *
+ * 		In general, "direct packet access" is the preferred method to
+ * 		access packet data, however, this helper is in particular useful
+ * 		in socket filters where *skb*\ **->data** does not always point
+ * 		to the start of the mac header and where "direct packet access"
+ * 		is not available.
+ *
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1870,7 +1894,8 @@ union bpf_attr {
 	FN(bind),			\
 	FN(xdp_adjust_tail),		\
 	FN(skb_get_xfrm_state),		\
-	FN(get_stack),
+	FN(get_stack),			\
+	FN(skb_load_bytes_relative),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -1931,6 +1956,12 @@ enum bpf_adj_room_mode {
 	BPF_ADJ_ROOM_NET,
 };
 
+/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */
+enum bpf_hdr_start_off {
+	BPF_HDR_START_MAC,
+	BPF_HDR_START_NET,
+};
+
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
  */
-- 
cgit v1.2.3-59-g8ed1b


From 565f0fa902b64020d5d147ff1708567e9e0b6e49 Mon Sep 17 00:00:00 2001
From: Mathias Krause <minipli@googlemail.com>
Date: Thu, 3 May 2018 10:55:07 +0200
Subject: xfrm: use a dedicated slab cache for struct xfrm_state

struct xfrm_state is rather large (768 bytes here) and therefore wastes
quite a lot of memory as it falls into the kmalloc-1024 slab cache,
leaving 256 bytes of unused memory per XFRM state object -- a net waste
of 25%.

Using a dedicated slab cache for struct xfrm_state reduces the level of
internal fragmentation to a minimum.

On my configuration SLUB chooses to create a slab cache covering 4
pages holding 21 objects, resulting in an average memory waste of ~13
bytes per object -- a net waste of only 1.6%.

In my tests this led to memory savings of roughly 2.3MB for 10k XFRM
states.

Signed-off-by: Mathias Krause <minipli@googlemail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_state.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index f9d2f2233f09..f595797a20ce 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -42,6 +42,7 @@ static void xfrm_state_gc_task(struct work_struct *work);
 
 static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024;
 static __read_mostly seqcount_t xfrm_state_hash_generation = SEQCNT_ZERO(xfrm_state_hash_generation);
+static struct kmem_cache *xfrm_state_cache __ro_after_init;
 
 static DECLARE_WORK(xfrm_state_gc_work, xfrm_state_gc_task);
 static HLIST_HEAD(xfrm_state_gc_list);
@@ -451,7 +452,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
 	}
 	xfrm_dev_state_free(x);
 	security_xfrm_state_free(x);
-	kfree(x);
+	kmem_cache_free(xfrm_state_cache, x);
 }
 
 static void xfrm_state_gc_task(struct work_struct *work)
@@ -563,7 +564,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net)
 {
 	struct xfrm_state *x;
 
-	x = kzalloc(sizeof(struct xfrm_state), GFP_ATOMIC);
+	x = kmem_cache_alloc(xfrm_state_cache, GFP_ATOMIC | __GFP_ZERO);
 
 	if (x) {
 		write_pnet(&x->xs_net, net);
@@ -2307,6 +2308,10 @@ int __net_init xfrm_state_init(struct net *net)
 {
 	unsigned int sz;
 
+	if (net_eq(net, &init_net))
+		xfrm_state_cache = KMEM_CACHE(xfrm_state,
+					      SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+
 	INIT_LIST_HEAD(&net->xfrm.state_all);
 
 	sz = sizeof(struct hlist_head) * 8;
-- 
cgit v1.2.3-59-g8ed1b


From f910cefa32b6cdabc96b126bcfc46d8940b1dc45 Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Wed, 2 May 2018 16:17:17 -0400
Subject: bpf: unify main prog and subprog

Currently, verifier treat main prog and subprog differently. All subprogs
detected are kept in env->subprog_starts while main prog is not kept there.
Instead, main prog is implicitly defined as the prog start at 0.

There is actually no difference between main prog and subprog, it is better
to unify them, and register all progs detected into env->subprog_starts.

This could also help simplifying some code logic.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_verifier.h |  2 +-
 kernel/bpf/verifier.c        | 57 ++++++++++++++++++++++++--------------------
 2 files changed, 32 insertions(+), 27 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 7e61c395fddf..f655b926e432 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -191,7 +191,7 @@ struct bpf_verifier_env {
 	bool seen_direct_write;
 	struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
 	struct bpf_verifier_log log;
-	u32 subprog_starts[BPF_MAX_SUBPROGS];
+	u32 subprog_starts[BPF_MAX_SUBPROGS + 1];
 	/* computes the stack depth of each bpf function */
 	u16 subprog_stack_depth[BPF_MAX_SUBPROGS + 1];
 	u32 subprog_cnt;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6ba10a83909d..8e8e582a7c03 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -768,7 +768,7 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
 	ret = find_subprog(env, off);
 	if (ret >= 0)
 		return 0;
-	if (env->subprog_cnt >= BPF_MAX_SUBPROGS) {
+	if (env->subprog_cnt > BPF_MAX_SUBPROGS) {
 		verbose(env, "too many subprograms\n");
 		return -E2BIG;
 	}
@@ -784,6 +784,11 @@ static int check_subprogs(struct bpf_verifier_env *env)
 	struct bpf_insn *insn = env->prog->insnsi;
 	int insn_cnt = env->prog->len;
 
+	/* Add entry function. */
+	ret = add_subprog(env, 0);
+	if (ret < 0)
+		return ret;
+
 	/* determine subprog starts. The end is one before the next starts */
 	for (i = 0; i < insn_cnt; i++) {
 		if (insn[i].code != (BPF_JMP | BPF_CALL))
@@ -809,10 +814,10 @@ static int check_subprogs(struct bpf_verifier_env *env)
 
 	/* now check that all jumps are within the same subprog */
 	subprog_start = 0;
-	if (env->subprog_cnt == cur_subprog)
+	if (env->subprog_cnt == cur_subprog + 1)
 		subprog_end = insn_cnt;
 	else
-		subprog_end = env->subprog_starts[cur_subprog++];
+		subprog_end = env->subprog_starts[cur_subprog + 1];
 	for (i = 0; i < insn_cnt; i++) {
 		u8 code = insn[i].code;
 
@@ -836,11 +841,13 @@ next:
 				verbose(env, "last insn is not an exit or jmp\n");
 				return -EINVAL;
 			}
+			cur_subprog++;
 			subprog_start = subprog_end;
-			if (env->subprog_cnt == cur_subprog)
+			if (env->subprog_cnt == cur_subprog + 1)
 				subprog_end = insn_cnt;
 			else
-				subprog_end = env->subprog_starts[cur_subprog++];
+				subprog_end =
+					env->subprog_starts[cur_subprog + 1];
 		}
 	}
 	return 0;
@@ -1508,10 +1515,10 @@ process_func:
 		return -EACCES;
 	}
 continue_func:
-	if (env->subprog_cnt == subprog)
+	if (env->subprog_cnt == subprog + 1)
 		subprog_end = insn_cnt;
 	else
-		subprog_end = env->subprog_starts[subprog];
+		subprog_end = env->subprog_starts[subprog + 1];
 	for (; i < subprog_end; i++) {
 		if (insn[i].code != (BPF_JMP | BPF_CALL))
 			continue;
@@ -1529,7 +1536,6 @@ continue_func:
 				  i);
 			return -EFAULT;
 		}
-		subprog++;
 		frame++;
 		if (frame >= MAX_CALL_FRAMES) {
 			WARN_ONCE(1, "verifier bug. Call stack is too deep\n");
@@ -1561,7 +1567,6 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env,
 			  start);
 		return -EFAULT;
 	}
-	subprog++;
 	return env->subprog_stack_depth[subprog];
 }
 #endif
@@ -2099,7 +2104,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 	case BPF_FUNC_tail_call:
 		if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
 			goto error;
-		if (env->subprog_cnt) {
+		if (env->subprog_cnt > 1) {
 			verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n");
 			return -EINVAL;
 		}
@@ -2272,7 +2277,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			/* remember the callsite, it will be used by bpf_exit */
 			*insn_idx /* callsite */,
 			state->curframe + 1 /* frameno within this callchain */,
-			subprog + 1 /* subprog number within this prog */);
+			subprog /* subprog number within this prog */);
 
 	/* copy r1 - r5 args that callee can access */
 	for (i = BPF_REG_1; i <= BPF_REG_5; i++)
@@ -3889,7 +3894,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		return -EINVAL;
 	}
 
-	if (env->subprog_cnt) {
+	if (env->subprog_cnt > 1) {
 		/* when program has LD_ABS insn JITs and interpreter assume
 		 * that r1 == ctx == skb which is not the case for callees
 		 * that can have arbitrary arguments. It's problematic
@@ -4920,11 +4925,11 @@ process_bpf_exit:
 
 	verbose(env, "processed %d insns (limit %d), stack depth ",
 		insn_processed, BPF_COMPLEXITY_LIMIT_INSNS);
-	for (i = 0; i < env->subprog_cnt + 1; i++) {
+	for (i = 0; i < env->subprog_cnt; i++) {
 		u32 depth = env->subprog_stack_depth[i];
 
 		verbose(env, "%d", depth);
-		if (i + 1 < env->subprog_cnt + 1)
+		if (i + 1 < env->subprog_cnt)
 			verbose(env, "+");
 	}
 	verbose(env, "\n");
@@ -5301,7 +5306,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	void *old_bpf_func;
 	int err = -ENOMEM;
 
-	if (env->subprog_cnt == 0)
+	if (env->subprog_cnt <= 1)
 		return 0;
 
 	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
@@ -5317,7 +5322,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		/* temporarily remember subprog id inside insn instead of
 		 * aux_data, since next loop will split up all insns into funcs
 		 */
-		insn->off = subprog + 1;
+		insn->off = subprog;
 		/* remember original imm in case JIT fails and fallback
 		 * to interpreter will be needed
 		 */
@@ -5326,16 +5331,16 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		insn->imm = 1;
 	}
 
-	func = kzalloc(sizeof(prog) * (env->subprog_cnt + 1), GFP_KERNEL);
+	func = kzalloc(sizeof(prog) * env->subprog_cnt, GFP_KERNEL);
 	if (!func)
 		return -ENOMEM;
 
-	for (i = 0; i <= env->subprog_cnt; i++) {
+	for (i = 0; i < env->subprog_cnt; i++) {
 		subprog_start = subprog_end;
-		if (env->subprog_cnt == i)
+		if (env->subprog_cnt == i + 1)
 			subprog_end = prog->len;
 		else
-			subprog_end = env->subprog_starts[i];
+			subprog_end = env->subprog_starts[i + 1];
 
 		len = subprog_end - subprog_start;
 		func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER);
@@ -5365,7 +5370,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	 * now populate all bpf_calls with correct addresses and
 	 * run last pass of JIT
 	 */
-	for (i = 0; i <= env->subprog_cnt; i++) {
+	for (i = 0; i < env->subprog_cnt; i++) {
 		insn = func[i]->insnsi;
 		for (j = 0; j < func[i]->len; j++, insn++) {
 			if (insn->code != (BPF_JMP | BPF_CALL) ||
@@ -5378,7 +5383,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 				__bpf_call_base;
 		}
 	}
-	for (i = 0; i <= env->subprog_cnt; i++) {
+	for (i = 0; i < env->subprog_cnt; i++) {
 		old_bpf_func = func[i]->bpf_func;
 		tmp = bpf_int_jit_compile(func[i]);
 		if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) {
@@ -5392,7 +5397,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	/* finally lock prog and jit images for all functions and
 	 * populate kallsysm
 	 */
-	for (i = 0; i <= env->subprog_cnt; i++) {
+	for (i = 0; i < env->subprog_cnt; i++) {
 		bpf_prog_lock_ro(func[i]);
 		bpf_prog_kallsyms_add(func[i]);
 	}
@@ -5409,7 +5414,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 			continue;
 		insn->off = env->insn_aux_data[i].call_imm;
 		subprog = find_subprog(env, i + insn->off + 1);
-		addr  = (unsigned long)func[subprog + 1]->bpf_func;
+		addr  = (unsigned long)func[subprog]->bpf_func;
 		addr &= PAGE_MASK;
 		insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
 			    addr - __bpf_call_base;
@@ -5418,10 +5423,10 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	prog->jited = 1;
 	prog->bpf_func = func[0]->bpf_func;
 	prog->aux->func = func;
-	prog->aux->func_cnt = env->subprog_cnt + 1;
+	prog->aux->func_cnt = env->subprog_cnt;
 	return 0;
 out_free:
-	for (i = 0; i <= env->subprog_cnt; i++)
+	for (i = 0; i < env->subprog_cnt; i++)
 		if (func[i])
 			bpf_jit_free(func[i]);
 	kfree(func);
-- 
cgit v1.2.3-59-g8ed1b


From 9c8105bd4402236b1bb0f8f10709c5cec1440a0c Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Wed, 2 May 2018 16:17:18 -0400
Subject: bpf: centre subprog information fields

It is better to centre all subprog information fields into one structure.
This structure could later serve as function node in call graph.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_verifier.h |  9 ++++---
 kernel/bpf/verifier.c        | 62 +++++++++++++++++++++++---------------------
 2 files changed, 38 insertions(+), 33 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index f655b926e432..8f70dc181e23 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -173,6 +173,11 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 
 #define BPF_MAX_SUBPROGS 256
 
+struct bpf_subprog_info {
+	u32 start; /* insn idx of function entry point */
+	u16 stack_depth; /* max. stack depth used by this function */
+};
+
 /* single container for all structs
  * one verifier_env per bpf_check() call
  */
@@ -191,9 +196,7 @@ struct bpf_verifier_env {
 	bool seen_direct_write;
 	struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
 	struct bpf_verifier_log log;
-	u32 subprog_starts[BPF_MAX_SUBPROGS + 1];
-	/* computes the stack depth of each bpf function */
-	u16 subprog_stack_depth[BPF_MAX_SUBPROGS + 1];
+	struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1];
 	u32 subprog_cnt;
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8e8e582a7c03..5b293b4abb70 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -741,18 +741,19 @@ enum reg_arg_type {
 
 static int cmp_subprogs(const void *a, const void *b)
 {
-	return *(int *)a - *(int *)b;
+	return ((struct bpf_subprog_info *)a)->start -
+	       ((struct bpf_subprog_info *)b)->start;
 }
 
 static int find_subprog(struct bpf_verifier_env *env, int off)
 {
-	u32 *p;
+	struct bpf_subprog_info *p;
 
-	p = bsearch(&off, env->subprog_starts, env->subprog_cnt,
-		    sizeof(env->subprog_starts[0]), cmp_subprogs);
+	p = bsearch(&off, env->subprog_info, env->subprog_cnt,
+		    sizeof(env->subprog_info[0]), cmp_subprogs);
 	if (!p)
 		return -ENOENT;
-	return p - env->subprog_starts;
+	return p - env->subprog_info;
 
 }
 
@@ -772,15 +773,16 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
 		verbose(env, "too many subprograms\n");
 		return -E2BIG;
 	}
-	env->subprog_starts[env->subprog_cnt++] = off;
-	sort(env->subprog_starts, env->subprog_cnt,
-	     sizeof(env->subprog_starts[0]), cmp_subprogs, NULL);
+	env->subprog_info[env->subprog_cnt++].start = off;
+	sort(env->subprog_info, env->subprog_cnt,
+	     sizeof(env->subprog_info[0]), cmp_subprogs, NULL);
 	return 0;
 }
 
 static int check_subprogs(struct bpf_verifier_env *env)
 {
 	int i, ret, subprog_start, subprog_end, off, cur_subprog = 0;
+	struct bpf_subprog_info *subprog = env->subprog_info;
 	struct bpf_insn *insn = env->prog->insnsi;
 	int insn_cnt = env->prog->len;
 
@@ -810,14 +812,14 @@ static int check_subprogs(struct bpf_verifier_env *env)
 
 	if (env->log.level > 1)
 		for (i = 0; i < env->subprog_cnt; i++)
-			verbose(env, "func#%d @%d\n", i, env->subprog_starts[i]);
+			verbose(env, "func#%d @%d\n", i, subprog[i].start);
 
 	/* now check that all jumps are within the same subprog */
 	subprog_start = 0;
 	if (env->subprog_cnt == cur_subprog + 1)
 		subprog_end = insn_cnt;
 	else
-		subprog_end = env->subprog_starts[cur_subprog + 1];
+		subprog_end = subprog[cur_subprog + 1].start;
 	for (i = 0; i < insn_cnt; i++) {
 		u8 code = insn[i].code;
 
@@ -846,8 +848,7 @@ next:
 			if (env->subprog_cnt == cur_subprog + 1)
 				subprog_end = insn_cnt;
 			else
-				subprog_end =
-					env->subprog_starts[cur_subprog + 1];
+				subprog_end = subprog[cur_subprog + 1].start;
 		}
 	}
 	return 0;
@@ -1480,13 +1481,13 @@ static int update_stack_depth(struct bpf_verifier_env *env,
 			      const struct bpf_func_state *func,
 			      int off)
 {
-	u16 stack = env->subprog_stack_depth[func->subprogno];
+	u16 stack = env->subprog_info[func->subprogno].stack_depth;
 
 	if (stack >= -off)
 		return 0;
 
 	/* update known max for given subprogram */
-	env->subprog_stack_depth[func->subprogno] = -off;
+	env->subprog_info[func->subprogno].stack_depth = -off;
 	return 0;
 }
 
@@ -1498,7 +1499,8 @@ static int update_stack_depth(struct bpf_verifier_env *env,
  */
 static int check_max_stack_depth(struct bpf_verifier_env *env)
 {
-	int depth = 0, frame = 0, subprog = 0, i = 0, subprog_end;
+	int depth = 0, frame = 0, idx = 0, i = 0, subprog_end;
+	struct bpf_subprog_info *subprog = env->subprog_info;
 	struct bpf_insn *insn = env->prog->insnsi;
 	int insn_cnt = env->prog->len;
 	int ret_insn[MAX_CALL_FRAMES];
@@ -1508,17 +1510,17 @@ process_func:
 	/* round up to 32-bytes, since this is granularity
 	 * of interpreter stack size
 	 */
-	depth += round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32);
+	depth += round_up(max_t(u32, subprog[idx].stack_depth, 1), 32);
 	if (depth > MAX_BPF_STACK) {
 		verbose(env, "combined stack size of %d calls is %d. Too large\n",
 			frame + 1, depth);
 		return -EACCES;
 	}
 continue_func:
-	if (env->subprog_cnt == subprog + 1)
+	if (env->subprog_cnt == idx + 1)
 		subprog_end = insn_cnt;
 	else
-		subprog_end = env->subprog_starts[subprog + 1];
+		subprog_end = subprog[idx + 1].start;
 	for (; i < subprog_end; i++) {
 		if (insn[i].code != (BPF_JMP | BPF_CALL))
 			continue;
@@ -1526,12 +1528,12 @@ continue_func:
 			continue;
 		/* remember insn and function to return to */
 		ret_insn[frame] = i + 1;
-		ret_prog[frame] = subprog;
+		ret_prog[frame] = idx;
 
 		/* find the callee */
 		i = i + insn[i].imm + 1;
-		subprog = find_subprog(env, i);
-		if (subprog < 0) {
+		idx = find_subprog(env, i);
+		if (idx < 0) {
 			WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
 				  i);
 			return -EFAULT;
@@ -1548,10 +1550,10 @@ continue_func:
 	 */
 	if (frame == 0)
 		return 0;
-	depth -= round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32);
+	depth -= round_up(max_t(u32, subprog[idx].stack_depth, 1), 32);
 	frame--;
 	i = ret_insn[frame];
-	subprog = ret_prog[frame];
+	idx = ret_prog[frame];
 	goto continue_func;
 }
 
@@ -1567,7 +1569,7 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env,
 			  start);
 		return -EFAULT;
 	}
-	return env->subprog_stack_depth[subprog];
+	return env->subprog_info[subprog].stack_depth;
 }
 #endif
 
@@ -4926,14 +4928,14 @@ process_bpf_exit:
 	verbose(env, "processed %d insns (limit %d), stack depth ",
 		insn_processed, BPF_COMPLEXITY_LIMIT_INSNS);
 	for (i = 0; i < env->subprog_cnt; i++) {
-		u32 depth = env->subprog_stack_depth[i];
+		u32 depth = env->subprog_info[i].stack_depth;
 
 		verbose(env, "%d", depth);
 		if (i + 1 < env->subprog_cnt)
 			verbose(env, "+");
 	}
 	verbose(env, "\n");
-	env->prog->aux->stack_depth = env->subprog_stack_depth[0];
+	env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
 	return 0;
 }
 
@@ -5140,9 +5142,9 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len
 	if (len == 1)
 		return;
 	for (i = 0; i < env->subprog_cnt; i++) {
-		if (env->subprog_starts[i] < off)
+		if (env->subprog_info[i].start < off)
 			continue;
-		env->subprog_starts[i] += len - 1;
+		env->subprog_info[i].start += len - 1;
 	}
 }
 
@@ -5340,7 +5342,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		if (env->subprog_cnt == i + 1)
 			subprog_end = prog->len;
 		else
-			subprog_end = env->subprog_starts[i + 1];
+			subprog_end = env->subprog_info[i + 1].start;
 
 		len = subprog_end - subprog_start;
 		func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER);
@@ -5357,7 +5359,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		 * Long term would need debug info to populate names
 		 */
 		func[i]->aux->name[0] = 'F';
-		func[i]->aux->stack_depth = env->subprog_stack_depth[i];
+		func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
 		func[i]->jit_requested = 1;
 		func[i] = bpf_int_jit_compile(func[i]);
 		if (!func[i]->jited) {
-- 
cgit v1.2.3-59-g8ed1b


From 4cb3d99c84ccbf728ff0e381c7c9815c3fa2bd5e Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Wed, 2 May 2018 16:17:19 -0400
Subject: bpf: add faked "ending" subprog

There are quite a few code snippet like the following in verifier:

       subprog_start = 0;
       if (env->subprog_cnt == cur_subprog + 1)
               subprog_end = insn_cnt;
       else
               subprog_end = env->subprog_info[cur_subprog + 1].start;

The reason is there is no marker in subprog_info array to tell the end of
it.

We could resolve this issue by introducing a faked "ending" subprog.
The special "ending" subprog is with "insn_cnt" as start offset, so it is
serving as the end mark whenever we iterate over all subprogs.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/verifier.c | 34 ++++++++++++++--------------------
 1 file changed, 14 insertions(+), 20 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5b293b4abb70..37e0affa515e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -769,7 +769,7 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
 	ret = find_subprog(env, off);
 	if (ret >= 0)
 		return 0;
-	if (env->subprog_cnt > BPF_MAX_SUBPROGS) {
+	if (env->subprog_cnt >= BPF_MAX_SUBPROGS) {
 		verbose(env, "too many subprograms\n");
 		return -E2BIG;
 	}
@@ -810,16 +810,18 @@ static int check_subprogs(struct bpf_verifier_env *env)
 			return ret;
 	}
 
+	/* Add a fake 'exit' subprog which could simplify subprog iteration
+	 * logic. 'subprog_cnt' should not be increased.
+	 */
+	subprog[env->subprog_cnt].start = insn_cnt;
+
 	if (env->log.level > 1)
 		for (i = 0; i < env->subprog_cnt; i++)
 			verbose(env, "func#%d @%d\n", i, subprog[i].start);
 
 	/* now check that all jumps are within the same subprog */
-	subprog_start = 0;
-	if (env->subprog_cnt == cur_subprog + 1)
-		subprog_end = insn_cnt;
-	else
-		subprog_end = subprog[cur_subprog + 1].start;
+	subprog_start = subprog[cur_subprog].start;
+	subprog_end = subprog[cur_subprog + 1].start;
 	for (i = 0; i < insn_cnt; i++) {
 		u8 code = insn[i].code;
 
@@ -843,11 +845,9 @@ next:
 				verbose(env, "last insn is not an exit or jmp\n");
 				return -EINVAL;
 			}
-			cur_subprog++;
 			subprog_start = subprog_end;
-			if (env->subprog_cnt == cur_subprog + 1)
-				subprog_end = insn_cnt;
-			else
+			cur_subprog++;
+			if (cur_subprog < env->subprog_cnt)
 				subprog_end = subprog[cur_subprog + 1].start;
 		}
 	}
@@ -1502,7 +1502,6 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
 	int depth = 0, frame = 0, idx = 0, i = 0, subprog_end;
 	struct bpf_subprog_info *subprog = env->subprog_info;
 	struct bpf_insn *insn = env->prog->insnsi;
-	int insn_cnt = env->prog->len;
 	int ret_insn[MAX_CALL_FRAMES];
 	int ret_prog[MAX_CALL_FRAMES];
 
@@ -1517,10 +1516,7 @@ process_func:
 		return -EACCES;
 	}
 continue_func:
-	if (env->subprog_cnt == idx + 1)
-		subprog_end = insn_cnt;
-	else
-		subprog_end = subprog[idx + 1].start;
+	subprog_end = subprog[idx + 1].start;
 	for (; i < subprog_end; i++) {
 		if (insn[i].code != (BPF_JMP | BPF_CALL))
 			continue;
@@ -5141,7 +5137,8 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len
 
 	if (len == 1)
 		return;
-	for (i = 0; i < env->subprog_cnt; i++) {
+	/* NOTE: fake 'exit' subprog should be updated as well. */
+	for (i = 0; i <= env->subprog_cnt; i++) {
 		if (env->subprog_info[i].start < off)
 			continue;
 		env->subprog_info[i].start += len - 1;
@@ -5339,10 +5336,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 
 	for (i = 0; i < env->subprog_cnt; i++) {
 		subprog_start = subprog_end;
-		if (env->subprog_cnt == i + 1)
-			subprog_end = prog->len;
-		else
-			subprog_end = env->subprog_info[i + 1].start;
+		subprog_end = env->subprog_info[i + 1].start;
 
 		len = subprog_end - subprog_start;
 		func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER);
-- 
cgit v1.2.3-59-g8ed1b


From 9c87758cf0893d6d3b51aac34546807b138cb3e7 Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Mon, 30 Apr 2018 15:19:16 +0200
Subject: rt2x00: call sta_add/remove directly in rt2800

Only rt2800 subdriver of rt2x00 implement sta_add() and sta_remove(),
we do not need generic version of those.

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ralink/rt2x00/rt2800lib.c | 11 +++++++----
 drivers/net/wireless/ralink/rt2x00/rt2800lib.h |  5 +++--
 drivers/net/wireless/ralink/rt2x00/rt2800pci.c |  6 ++----
 drivers/net/wireless/ralink/rt2x00/rt2800soc.c |  6 ++----
 drivers/net/wireless/ralink/rt2x00/rt2800usb.c |  6 ++----
 drivers/net/wireless/ralink/rt2x00/rt2x00.h    |  4 ----
 6 files changed, 16 insertions(+), 22 deletions(-)

diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800lib.c b/drivers/net/wireless/ralink/rt2x00/rt2800lib.c
index e827dc522580..a567bc273ffc 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2800lib.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2800lib.c
@@ -1557,12 +1557,13 @@ static void rt2800_set_max_psdu_len(struct rt2x00_dev *rt2x00dev)
 	rt2800_register_write(rt2x00dev, MAX_LEN_CFG, reg);
 }
 
-int rt2800_sta_add(struct rt2x00_dev *rt2x00dev, struct ieee80211_vif *vif,
+int rt2800_sta_add(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 		   struct ieee80211_sta *sta)
 {
-	int wcid;
-	struct rt2x00_sta *sta_priv = sta_to_rt2x00_sta(sta);
+	struct rt2x00_dev *rt2x00dev = hw->priv;
 	struct rt2800_drv_data *drv_data = rt2x00dev->drv_data;
+	struct rt2x00_sta *sta_priv = sta_to_rt2x00_sta(sta);
+	int wcid;
 
 	/*
 	 * Limit global maximum TX AMPDU length to smallest value of all
@@ -1608,8 +1609,10 @@ int rt2800_sta_add(struct rt2x00_dev *rt2x00dev, struct ieee80211_vif *vif,
 }
 EXPORT_SYMBOL_GPL(rt2800_sta_add);
 
-int rt2800_sta_remove(struct rt2x00_dev *rt2x00dev, struct ieee80211_sta *sta)
+int rt2800_sta_remove(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+		      struct ieee80211_sta *sta)
 {
+	struct rt2x00_dev *rt2x00dev = hw->priv;
 	struct rt2800_drv_data *drv_data = rt2x00dev->drv_data;
 	struct rt2x00_sta *sta_priv = sta_to_rt2x00_sta(sta);
 	int wcid = sta_priv->wcid;
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800lib.h b/drivers/net/wireless/ralink/rt2x00/rt2800lib.h
index 275e3969abdd..51d9c2a932cc 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2800lib.h
+++ b/drivers/net/wireless/ralink/rt2x00/rt2800lib.h
@@ -208,9 +208,10 @@ int rt2800_config_shared_key(struct rt2x00_dev *rt2x00dev,
 int rt2800_config_pairwise_key(struct rt2x00_dev *rt2x00dev,
 			       struct rt2x00lib_crypto *crypto,
 			       struct ieee80211_key_conf *key);
-int rt2800_sta_add(struct rt2x00_dev *rt2x00dev, struct ieee80211_vif *vif,
+int rt2800_sta_add(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 		   struct ieee80211_sta *sta);
-int rt2800_sta_remove(struct rt2x00_dev *rt2x00dev, struct ieee80211_sta *sta);
+int rt2800_sta_remove(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+		      struct ieee80211_sta *sta);
 void rt2800_config_filter(struct rt2x00_dev *rt2x00dev,
 			  const unsigned int filter_flags);
 void rt2800_config_intf(struct rt2x00_dev *rt2x00dev, struct rt2x00_intf *intf,
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800pci.c b/drivers/net/wireless/ralink/rt2x00/rt2800pci.c
index 1172eefd1c1a..71b1affc3885 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2800pci.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2800pci.c
@@ -311,8 +311,8 @@ static const struct ieee80211_ops rt2800pci_mac80211_ops = {
 	.get_stats		= rt2x00mac_get_stats,
 	.get_key_seq		= rt2800_get_key_seq,
 	.set_rts_threshold	= rt2800_set_rts_threshold,
-	.sta_add		= rt2x00mac_sta_add,
-	.sta_remove		= rt2x00mac_sta_remove,
+	.sta_add		= rt2800_sta_add,
+	.sta_remove		= rt2800_sta_remove,
 	.bss_info_changed	= rt2x00mac_bss_info_changed,
 	.conf_tx		= rt2800_conf_tx,
 	.get_tsf		= rt2800_get_tsf,
@@ -377,8 +377,6 @@ static const struct rt2x00lib_ops rt2800pci_rt2x00_ops = {
 	.config_erp		= rt2800_config_erp,
 	.config_ant		= rt2800_config_ant,
 	.config			= rt2800_config,
-	.sta_add		= rt2800_sta_add,
-	.sta_remove		= rt2800_sta_remove,
 };
 
 static const struct rt2x00_ops rt2800pci_ops = {
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800soc.c b/drivers/net/wireless/ralink/rt2x00/rt2800soc.c
index 6848ebc83534..a502816214ab 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2800soc.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2800soc.c
@@ -150,8 +150,8 @@ static const struct ieee80211_ops rt2800soc_mac80211_ops = {
 	.get_stats		= rt2x00mac_get_stats,
 	.get_key_seq		= rt2800_get_key_seq,
 	.set_rts_threshold	= rt2800_set_rts_threshold,
-	.sta_add		= rt2x00mac_sta_add,
-	.sta_remove		= rt2x00mac_sta_remove,
+	.sta_add		= rt2800_sta_add,
+	.sta_remove		= rt2800_sta_remove,
 	.bss_info_changed	= rt2x00mac_bss_info_changed,
 	.conf_tx		= rt2800_conf_tx,
 	.get_tsf		= rt2800_get_tsf,
@@ -216,8 +216,6 @@ static const struct rt2x00lib_ops rt2800soc_rt2x00_ops = {
 	.config_erp		= rt2800_config_erp,
 	.config_ant		= rt2800_config_ant,
 	.config			= rt2800_config,
-	.sta_add		= rt2800_sta_add,
-	.sta_remove		= rt2800_sta_remove,
 };
 
 static const struct rt2x00_ops rt2800soc_ops = {
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800usb.c b/drivers/net/wireless/ralink/rt2x00/rt2800usb.c
index d901a41d36e4..98a7313fea4a 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2800usb.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2800usb.c
@@ -797,8 +797,8 @@ static const struct ieee80211_ops rt2800usb_mac80211_ops = {
 	.get_stats		= rt2x00mac_get_stats,
 	.get_key_seq		= rt2800_get_key_seq,
 	.set_rts_threshold	= rt2800_set_rts_threshold,
-	.sta_add		= rt2x00mac_sta_add,
-	.sta_remove		= rt2x00mac_sta_remove,
+	.sta_add		= rt2800_sta_add,
+	.sta_remove		= rt2800_sta_remove,
 	.bss_info_changed	= rt2x00mac_bss_info_changed,
 	.conf_tx		= rt2800_conf_tx,
 	.get_tsf		= rt2800_get_tsf,
@@ -858,8 +858,6 @@ static const struct rt2x00lib_ops rt2800usb_rt2x00_ops = {
 	.config_erp		= rt2800_config_erp,
 	.config_ant		= rt2800_config_ant,
 	.config			= rt2800_config,
-	.sta_add		= rt2800_sta_add,
-	.sta_remove		= rt2800_sta_remove,
 };
 
 static void rt2800usb_queue_init(struct data_queue *queue)
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00.h b/drivers/net/wireless/ralink/rt2x00/rt2x00.h
index 1f38c338ca7a..a279a4363bc1 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2x00.h
+++ b/drivers/net/wireless/ralink/rt2x00/rt2x00.h
@@ -1457,10 +1457,6 @@ int rt2x00mac_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd,
 #else
 #define rt2x00mac_set_key	NULL
 #endif /* CONFIG_RT2X00_LIB_CRYPTO */
-int rt2x00mac_sta_add(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-		      struct ieee80211_sta *sta);
-int rt2x00mac_sta_remove(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-			 struct ieee80211_sta *sta);
 void rt2x00mac_sw_scan_start(struct ieee80211_hw *hw,
 			     struct ieee80211_vif *vif,
 			     const u8 *mac_addr);
-- 
cgit v1.2.3-59-g8ed1b


From 811a3991510735566b66069fdd4ce3ce33a2ec18 Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Mon, 30 Apr 2018 15:19:17 +0200
Subject: rt2x00: check against flushing empty queue

We have check if queue is not empty when start flushing queues on
by mac80211 callback, but we also can start flushing queues by internal
driver calls. So move check into rt2x00queue_flush_queue() to assure
we do not flush empty queue anytime.

Additionally add warning if we start to kick empty queue as in such
situation we set wrong index in the HW queue, what can confuse the HW
and have various negative consequences.

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ralink/rt2x00/rt2800mmio.c  | 1 +
 drivers/net/wireless/ralink/rt2x00/rt2x00mac.c   | 3 +--
 drivers/net/wireless/ralink/rt2x00/rt2x00queue.c | 2 ++
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800mmio.c b/drivers/net/wireless/ralink/rt2x00/rt2800mmio.c
index 1123e2bed803..e1a7ed7e4892 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2800mmio.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2800mmio.c
@@ -600,6 +600,7 @@ void rt2800mmio_kick_queue(struct data_queue *queue)
 	case QID_AC_VI:
 	case QID_AC_BE:
 	case QID_AC_BK:
+		WARN_ON_ONCE(rt2x00queue_empty(queue));
 		entry = rt2x00queue_get_entry(queue, Q_INDEX);
 		rt2x00mmio_register_write(rt2x00dev, TX_CTX_IDX(queue->qid),
 					  entry->entry_idx);
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c b/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c
index a971bc7a6b63..c380c1f56ba6 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c
@@ -739,8 +739,7 @@ void rt2x00mac_flush(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 		return;
 
 	tx_queue_for_each(rt2x00dev, queue)
-		if (!rt2x00queue_empty(queue))
-			rt2x00queue_flush_queue(queue, drop);
+		rt2x00queue_flush_queue(queue, drop);
 }
 EXPORT_SYMBOL_GPL(rt2x00mac_flush);
 
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00queue.c b/drivers/net/wireless/ralink/rt2x00/rt2x00queue.c
index a6884e73d2ab..7c1f8f561d4a 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2x00queue.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2x00queue.c
@@ -1000,6 +1000,8 @@ void rt2x00queue_flush_queue(struct data_queue *queue, bool drop)
 		(queue->qid == QID_AC_BE) ||
 		(queue->qid == QID_AC_BK);
 
+	if (rt2x00queue_empty(queue))
+		return;
 
 	/*
 	 * If we are not supposed to drop any pending
-- 
cgit v1.2.3-59-g8ed1b


From 070fc356e21addb579edcd7a1f2ef692b29a78e5 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Mon, 30 Apr 2018 14:41:02 +0100
Subject: rtlwifi: fix spelling mistake: "dismatch" -> "mismatch"

Trivial fix to spelling mistake in RT_TRACE message text.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8192e2ant.c | 2 +-
 drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8723b2ant.c | 2 +-
 drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8821a2ant.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8192e2ant.c b/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8192e2ant.c
index 8fce371749d3..f22fec093f1d 100644
--- a/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8192e2ant.c
+++ b/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8192e2ant.c
@@ -1783,7 +1783,7 @@ static void btc8192e2ant_tdma_duration_adjust(struct btc_coexist *btcoexist,
 		bool scan = false, link = false, roam = false;
 
 		RT_TRACE(rtlpriv, COMP_BT_COEXIST, DBG_LOUD,
-			 "[BTCoex], PsTdma type dismatch!!!, ");
+			 "[BTCoex], PsTdma type mismatch!!!, ");
 		RT_TRACE(rtlpriv, COMP_BT_COEXIST, DBG_LOUD,
 			 "curPsTdma=%d, recordPsTdma=%d\n",
 			 coex_dm->cur_ps_tdma, coex_dm->tdma_adj_type);
diff --git a/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8723b2ant.c b/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8723b2ant.c
index 73ec31972944..279fe01bb55e 100644
--- a/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8723b2ant.c
+++ b/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8723b2ant.c
@@ -2766,7 +2766,7 @@ static void btc8723b2ant_tdma_duration_adjust(struct btc_coexist *btcoexist,
 	if (coex_dm->cur_ps_tdma != coex_dm->ps_tdma_du_adj_type) {
 		bool scan = false, link = false, roam = false;
 		RT_TRACE(rtlpriv, COMP_BT_COEXIST, DBG_LOUD,
-			 "[BTCoex], PsTdma type dismatch!!!, curPsTdma=%d, recordPsTdma=%d\n",
+			 "[BTCoex], PsTdma type mismatch!!!, curPsTdma=%d, recordPsTdma=%d\n",
 			 coex_dm->cur_ps_tdma, coex_dm->ps_tdma_du_adj_type);
 
 		btcoexist->btc_get(btcoexist, BTC_GET_BL_WIFI_SCAN, &scan);
diff --git a/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8821a2ant.c b/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8821a2ant.c
index 2202d5e18977..01a9d303603b 100644
--- a/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8821a2ant.c
+++ b/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8821a2ant.c
@@ -2614,7 +2614,7 @@ static void btc8821a2ant_tdma_duration_adjust(struct btc_coexist *btcoexist,
 		bool scan = false, link = false, roam = false;
 
 		RT_TRACE(rtlpriv, COMP_BT_COEXIST, DBG_LOUD,
-			 "[BTCoex], PsTdma type dismatch!!!, cur_ps_tdma = %d, recordPsTdma = %d\n",
+			 "[BTCoex], PsTdma type mismatch!!!, cur_ps_tdma = %d, recordPsTdma = %d\n",
 			 coex_dm->cur_ps_tdma, coex_dm->ps_tdma_du_adj_type);
 
 		btcoexist->btc_get(btcoexist, BTC_GET_BL_WIFI_SCAN, &scan);
-- 
cgit v1.2.3-59-g8ed1b


From 8efb868566db107d6d0130ea61a653a29f851661 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Tue, 1 May 2018 10:33:54 +0200
Subject: mt76x2: remove unnecessary break in mt76x2_mac_process_tx_rate()

Remove unnecessary break statement in the default case of bw switch
block in mt76x2_mac_process_tx_rate routine

Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/mt76x2_mac.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_mac.c b/drivers/net/wireless/mediatek/mt76/mt76x2_mac.c
index d18315652583..dab713756004 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_mac.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_mac.c
@@ -410,7 +410,6 @@ mt76x2_mac_process_tx_rate(struct ieee80211_tx_rate *txrate, u16 rate,
 		break;
 	default:
 		return -EINVAL;
-		break;
 	}
 
 	if (rate & MT_RXWI_RATE_SGI)
-- 
cgit v1.2.3-59-g8ed1b


From 7e7939e80e3cda3e5677c3905d20c066bfb22ef2 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Tue, 1 May 2018 10:36:43 +0100
Subject: ipw2200: fix spelling mistake: "functionalitis" -> "functionalities"

Trivial fix to spelling mistake in module parameter description text

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/intel/ipw2x00/ipw2200.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2200.c b/drivers/net/wireless/intel/ipw2x00/ipw2200.c
index 87a5e414c2f7..ba3fb1d2ddb4 100644
--- a/drivers/net/wireless/intel/ipw2x00/ipw2200.c
+++ b/drivers/net/wireless/intel/ipw2x00/ipw2200.c
@@ -12012,7 +12012,7 @@ MODULE_PARM_DESC(rtap_iface, "create the rtap interface (1 - create, default 0)"
 
 #ifdef CONFIG_IPW2200_QOS
 module_param(qos_enable, int, 0444);
-MODULE_PARM_DESC(qos_enable, "enable all QoS functionalitis");
+MODULE_PARM_DESC(qos_enable, "enable all QoS functionalities");
 
 module_param(qos_burst_enable, int, 0444);
 MODULE_PARM_DESC(qos_burst_enable, "enable QoS burst mode");
-- 
cgit v1.2.3-59-g8ed1b


From c990affd5abce1f338ac52539e092dad04647fb6 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Wed, 2 May 2018 11:46:36 +0200
Subject: mt76x2: fix avg_rssi estimation

Add leftover filter coefficients in IIR rssi estimation

Fixes: 7bc04215a66b ("mt76: add driver code for MT76x2e")
Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Acked-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/mt76x2_phy.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_phy.c b/drivers/net/wireless/mediatek/mt76/mt76x2_phy.c
index 038d1fe6c726..c1c38ca3330a 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_phy.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_phy.c
@@ -508,8 +508,10 @@ mt76x2_phy_update_channel_gain(struct mt76x2_dev *dev)
 	u8 gain_delta;
 	int low_gain;
 
-	dev->cal.avg_rssi[0] = (dev->cal.avg_rssi[0] * 15) / 16 + (rssi0 << 8);
-	dev->cal.avg_rssi[1] = (dev->cal.avg_rssi[1] * 15) / 16 + (rssi1 << 8);
+	dev->cal.avg_rssi[0] = (dev->cal.avg_rssi[0] * 15) / 16 +
+			       (rssi0 << 8) / 16;
+	dev->cal.avg_rssi[1] = (dev->cal.avg_rssi[1] * 15) / 16 +
+			       (rssi1 << 8) / 16;
 	dev->cal.avg_rssi_all = (dev->cal.avg_rssi[0] +
 				 dev->cal.avg_rssi[1]) / 512;
 
-- 
cgit v1.2.3-59-g8ed1b


From 77cb065fde0a179f252caabfad5466cf9c522572 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Date: Wed, 2 May 2018 22:54:48 +0300
Subject: sh_eth: use TSU register accessors for TSU_POST<n>

There's no particularly good reason TSU_POST<n> registers get accessed
circumventing sh_eth_tsu_{read|write}() -- start using those, removing
(badly named) sh_eth_tsu_get_post_reg_offset(),  while at it...

Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/renesas/sh_eth.c | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index b6b90a6314e3..e2e8838fa981 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -2610,12 +2610,6 @@ static int sh_eth_change_mtu(struct net_device *ndev, int new_mtu)
 }
 
 /* For TSU_POSTn. Please refer to the manual about this (strange) bitfields */
-static void *sh_eth_tsu_get_post_reg_offset(struct sh_eth_private *mdp,
-					    int entry)
-{
-	return sh_eth_tsu_get_offset(mdp, TSU_POST1) + (entry / 8 * 4);
-}
-
 static u32 sh_eth_tsu_get_post_mask(int entry)
 {
 	return 0x0f << (28 - ((entry % 8) * 4));
@@ -2630,27 +2624,25 @@ static void sh_eth_tsu_enable_cam_entry_post(struct net_device *ndev,
 					     int entry)
 {
 	struct sh_eth_private *mdp = netdev_priv(ndev);
+	int reg = TSU_POST1 + entry / 8;
 	u32 tmp;
-	void *reg_offset;
 
-	reg_offset = sh_eth_tsu_get_post_reg_offset(mdp, entry);
-	tmp = ioread32(reg_offset);
-	iowrite32(tmp | sh_eth_tsu_get_post_bit(mdp, entry), reg_offset);
+	tmp = sh_eth_tsu_read(mdp, reg);
+	sh_eth_tsu_write(mdp, tmp | sh_eth_tsu_get_post_bit(mdp, entry), reg);
 }
 
 static bool sh_eth_tsu_disable_cam_entry_post(struct net_device *ndev,
 					      int entry)
 {
 	struct sh_eth_private *mdp = netdev_priv(ndev);
+	int reg = TSU_POST1 + entry / 8;
 	u32 post_mask, ref_mask, tmp;
-	void *reg_offset;
 
-	reg_offset = sh_eth_tsu_get_post_reg_offset(mdp, entry);
 	post_mask = sh_eth_tsu_get_post_mask(entry);
 	ref_mask = sh_eth_tsu_get_post_bit(mdp, entry) & ~post_mask;
 
-	tmp = ioread32(reg_offset);
-	iowrite32(tmp & ~post_mask, reg_offset);
+	tmp = sh_eth_tsu_read(mdp, reg);
+	sh_eth_tsu_write(mdp, tmp & ~post_mask, reg);
 
 	/* If other port enables, the function returns "true" */
 	return tmp & ref_mask;
-- 
cgit v1.2.3-59-g8ed1b


From 627a0d20fa7b9e05ea4f42f47ea84d450dbbc96f Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Date: Wed, 2 May 2018 22:55:52 +0300
Subject: sh_eth: WARN_ON() access to unimplemented TSU register

Commit 3365711df024 ("sh_eth: WARN on access to a register not implemented
in a particular chip") added  WARN_ON() to sh_eth_{read|write}() but not
to sh_eth_tsu_{read|write}(). Now that we've routed almost all TSU register
accesses  (except TSU_ADR{H|L}<n> -- which are special) thru the latter
pair of accessors, it makes sense to check for the unimplemented TSU
registers as well...

Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/renesas/sh_eth.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index e2e8838fa981..5970d9e5ddf1 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -442,12 +442,22 @@ static void sh_eth_modify(struct net_device *ndev, int enum_index, u32 clear,
 static void sh_eth_tsu_write(struct sh_eth_private *mdp, u32 data,
 			     int enum_index)
 {
-	iowrite32(data, mdp->tsu_addr + mdp->reg_offset[enum_index]);
+	u16 offset = mdp->reg_offset[enum_index];
+
+	if (WARN_ON(offset == SH_ETH_OFFSET_INVALID))
+		return;
+
+	iowrite32(data, mdp->tsu_addr + offset);
 }
 
 static u32 sh_eth_tsu_read(struct sh_eth_private *mdp, int enum_index)
 {
-	return ioread32(mdp->tsu_addr + mdp->reg_offset[enum_index]);
+	u16 offset = mdp->reg_offset[enum_index];
+
+	if (WARN_ON(offset == SH_ETH_OFFSET_INVALID))
+		return ~0U;
+
+	return ioread32(mdp->tsu_addr + offset);
 }
 
 static void sh_eth_select_mii(struct net_device *ndev)
-- 
cgit v1.2.3-59-g8ed1b


From c8b8ec8e0d605872c62909e7af54fb0bcb174d0c Mon Sep 17 00:00:00 2001
From: Stefan Raspl <stefan.raspl@linux.ibm.com>
Date: Thu, 3 May 2018 18:12:36 +0200
Subject: smc: simplify abort logic

Some of the conditions to exit recv() are common in two pathes - cleaning up
code by moving the check up so we have it only once.

Signed-off-by: Stefan Raspl <raspl@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com><
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_rx.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
index af851d8df1f8..def33fb29ac9 100644
--- a/net/smc/smc_rx.c
+++ b/net/smc/smc_rx.c
@@ -112,26 +112,22 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
 		if (atomic_read(&conn->bytes_to_rcv))
 			goto copy;
 
+		if (sk->sk_shutdown & RCV_SHUTDOWN ||
+		    smc_cdc_rxed_any_close_or_senddone(conn) ||
+		    conn->local_tx_ctrl.conn_state_flags.peer_conn_abort)
+			break;
+
 		if (read_done) {
 			if (sk->sk_err ||
 			    sk->sk_state == SMC_CLOSED ||
-			    sk->sk_shutdown & RCV_SHUTDOWN ||
 			    !timeo ||
-			    signal_pending(current) ||
-			    smc_cdc_rxed_any_close_or_senddone(conn) ||
-			    conn->local_tx_ctrl.conn_state_flags.
-			    peer_conn_abort)
+			    signal_pending(current))
 				break;
 		} else {
 			if (sk->sk_err) {
 				read_done = sock_error(sk);
 				break;
 			}
-			if (sk->sk_shutdown & RCV_SHUTDOWN ||
-			    smc_cdc_rxed_any_close_or_senddone(conn) ||
-			    conn->local_tx_ctrl.conn_state_flags.
-			    peer_conn_abort)
-				break;
 			if (sk->sk_state == SMC_CLOSED) {
 				if (!sock_flag(sk, SOCK_DONE)) {
 					/* This occurs when user tries to read
-- 
cgit v1.2.3-59-g8ed1b


From b51fa1b135fbfc89e34d90b5bab563745d94186f Mon Sep 17 00:00:00 2001
From: Stefan Raspl <stefan.raspl@linux.ibm.com>
Date: Thu, 3 May 2018 18:12:37 +0200
Subject: smc: make smc_rx_wait_data() generic

Turn smc_rx_wait_data into a generic function that can be used at various
instances to wait on traffic to complete with varying criteria.

Signed-off-by: Stefan Raspl <raspl@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com><
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c |  2 +-
 net/smc/smc_rx.c | 21 +++++++++++----------
 net/smc/smc_rx.h |  8 +++++++-
 3 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 95b00e4c3396..a7495c84ae15 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1089,7 +1089,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock,
 			release_sock(clcsk);
 		} else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
 			lock_sock(nsk);
-			smc_rx_wait_data(smc_sk(nsk), &timeo);
+			smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
 			release_sock(nsk);
 		}
 	}
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
index def33fb29ac9..7b64bee656e8 100644
--- a/net/smc/smc_rx.c
+++ b/net/smc/smc_rx.c
@@ -22,11 +22,10 @@
 #include "smc_tx.h" /* smc_tx_consumer_update() */
 #include "smc_rx.h"
 
-/* callback implementation for sk.sk_data_ready()
- * to wakeup rcvbuf consumers that blocked with smc_rx_wait_data().
+/* callback implementation to wakeup consumers blocked with smc_rx_wait().
  * indirectly called by smc_cdc_msg_recv_action().
  */
-static void smc_rx_data_ready(struct sock *sk)
+static void smc_rx_wake_up(struct sock *sk)
 {
 	struct socket_wq *wq;
 
@@ -47,25 +46,27 @@ static void smc_rx_data_ready(struct sock *sk)
 /* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted
  *   @smc    smc socket
  *   @timeo  pointer to max seconds to wait, pointer to value 0 for no timeout
+ *   @fcrit  add'l criterion to evaluate as function pointer
  * Returns:
  * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown.
  * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted).
  */
-int smc_rx_wait_data(struct smc_sock *smc, long *timeo)
+int smc_rx_wait(struct smc_sock *smc, long *timeo,
+		int (*fcrit)(struct smc_connection *conn))
 {
 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 	struct smc_connection *conn = &smc->conn;
 	struct sock *sk = &smc->sk;
 	int rc;
 
-	if (atomic_read(&conn->bytes_to_rcv))
+	if (fcrit(conn))
 		return 1;
 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
 	add_wait_queue(sk_sleep(sk), &wait);
 	rc = sk_wait_event(sk, timeo,
 			   sk->sk_err ||
 			   sk->sk_shutdown & RCV_SHUTDOWN ||
-			   atomic_read(&conn->bytes_to_rcv) ||
+			   fcrit(conn) ||
 			   smc_cdc_rxed_any_close_or_senddone(conn),
 			   &wait);
 	remove_wait_queue(sk_sleep(sk), &wait);
@@ -146,14 +147,14 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
 				return -EAGAIN;
 		}
 
-		if (!atomic_read(&conn->bytes_to_rcv)) {
-			smc_rx_wait_data(smc, &timeo);
+		if (!smc_rx_data_available(conn)) {
+			smc_rx_wait(smc, &timeo, smc_rx_data_available);
 			continue;
 		}
 
 copy:
 		/* initialize variables for 1st iteration of subsequent loop */
-		/* could be just 1 byte, even after smc_rx_wait_data above */
+		/* could be just 1 byte, even after waiting on data above */
 		readable = atomic_read(&conn->bytes_to_rcv);
 		/* not more than what user space asked for */
 		copylen = min_t(size_t, read_remaining, readable);
@@ -213,5 +214,5 @@ out:
 /* Initialize receive properties on connection establishment. NB: not __init! */
 void smc_rx_init(struct smc_sock *smc)
 {
-	smc->sk.sk_data_ready = smc_rx_data_ready;
+	smc->sk.sk_data_ready = smc_rx_wake_up;
 }
diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h
index 0b75a6b470e6..8f9f00997641 100644
--- a/net/smc/smc_rx.h
+++ b/net/smc/smc_rx.h
@@ -20,6 +20,12 @@
 void smc_rx_init(struct smc_sock *smc);
 int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
 		   int flags);
-int smc_rx_wait_data(struct smc_sock *smc, long *timeo);
+int smc_rx_wait(struct smc_sock *smc, long *timeo,
+		int (*fcrit)(struct smc_connection *conn));
+static inline int smc_rx_data_available(struct smc_connection *conn)
+{
+	return atomic_read(&conn->bytes_to_rcv);
+}
+
 
 #endif /* SMC_RX_H */
-- 
cgit v1.2.3-59-g8ed1b


From 2ef4f27ad00dc9d36cadb81fce9d07366d579451 Mon Sep 17 00:00:00 2001
From: Stefan Raspl <stefan.raspl@linux.ibm.com>
Date: Thu, 3 May 2018 18:12:38 +0200
Subject: smc: allocate RMBs as compound pages

Preparatory work for splice() support.

Signed-off-by: Stefan Raspl <raspl@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com><
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_core.c | 18 +++++++++---------
 net/smc/smc_core.h |  1 +
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 79aa7d312d6b..9c74844e2008 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -290,8 +290,8 @@ static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk,
 				    DMA_TO_DEVICE);
 	}
 	sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]);
-	if (buf_desc->cpu_addr)
-		free_pages((unsigned long)buf_desc->cpu_addr, buf_desc->order);
+	if (buf_desc->pages)
+		__free_pages(buf_desc->pages, buf_desc->order);
 	kfree(buf_desc);
 }
 
@@ -566,16 +566,16 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr,
 	if (!buf_desc)
 		return ERR_PTR(-ENOMEM);
 
-	buf_desc->cpu_addr =
-		(void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN |
-					 __GFP_NOMEMALLOC |
-					 __GFP_NORETRY | __GFP_ZERO,
-					 get_order(bufsize));
-	if (!buf_desc->cpu_addr) {
+	buf_desc->order = get_order(bufsize);
+	buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN |
+				      __GFP_NOMEMALLOC | __GFP_COMP |
+				      __GFP_NORETRY | __GFP_ZERO,
+				      buf_desc->order);
+	if (!buf_desc->pages) {
 		kfree(buf_desc);
 		return ERR_PTR(-EAGAIN);
 	}
-	buf_desc->order = get_order(bufsize);
+	buf_desc->cpu_addr = (void *)page_address(buf_desc->pages);
 
 	/* build the sg table from the pages */
 	lnk = &lgr->lnk[SMC_SINGLE_LINK];
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 70ef4f47dc8d..fca8624e5e71 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -120,6 +120,7 @@ struct smc_link {
 struct smc_buf_desc {
 	struct list_head	list;
 	void			*cpu_addr;	/* virtual address of buffer */
+	struct page		*pages;
 	struct sg_table		sgt[SMC_LINKS_PER_LGR_MAX];/* virtual buffer */
 	struct ib_mr		*mr_rx[SMC_LINKS_PER_LGR_MAX];
 						/* for rmb only: memory region
-- 
cgit v1.2.3-59-g8ed1b


From 9014db202cb764b8e14c53e7bacc81f9a1a2ba7f Mon Sep 17 00:00:00 2001
From: Stefan Raspl <stefan.raspl@linux.ibm.com>
Date: Thu, 3 May 2018 18:12:39 +0200
Subject: smc: add support for splice()

Provide an implementation for splice() when we are using SMC. See
smc_splice_read() for further details.

Signed-off-by: Stefan Raspl <raspl@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com><
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c |  38 +++++++++++--
 net/smc/smc.h    |   3 +
 net/smc/smc_rx.c | 163 +++++++++++++++++++++++++++++++++++++++++++++++++------
 net/smc/smc_rx.h |   6 +-
 4 files changed, 185 insertions(+), 25 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index a7495c84ae15..17688a02035b 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1163,10 +1163,12 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 		goto out;
 	}
 
-	if (smc->use_fallback)
+	if (smc->use_fallback) {
 		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
-	else
-		rc = smc_rx_recvmsg(smc, msg, len, flags);
+	} else {
+		msg->msg_namelen = 0;
+		rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
+	}
 
 out:
 	release_sock(sk);
@@ -1447,9 +1449,15 @@ out:
 	return rc;
 }
 
+/* Map the affected portions of the rmbe into an spd, note the number of bytes
+ * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor
+ * updates till whenever a respective page has been fully processed.
+ * Note that subsequent recv() calls have to wait till all splice() processing
+ * completed.
+ */
 static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
 			       struct pipe_inode_info *pipe, size_t len,
-				    unsigned int flags)
+			       unsigned int flags)
 {
 	struct sock *sk = sock->sk;
 	struct smc_sock *smc;
@@ -1457,16 +1465,34 @@ static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
 
 	smc = smc_sk(sk);
 	lock_sock(sk);
-	if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
+
+	if (sk->sk_state == SMC_INIT ||
+	    sk->sk_state == SMC_LISTEN ||
+	    sk->sk_state == SMC_CLOSED)
+		goto out;
+
+	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
+		rc = 0;
 		goto out;
+	}
+
 	if (smc->use_fallback) {
 		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
 						    pipe, len, flags);
 	} else {
-		rc = -EOPNOTSUPP;
+		if (*ppos) {
+			rc = -ESPIPE;
+			goto out;
+		}
+		if (flags & SPLICE_F_NONBLOCK)
+			flags = MSG_DONTWAIT;
+		else
+			flags = 0;
+		rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
 	}
 out:
 	release_sock(sk);
+
 	return rc;
 }
 
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 2405e889b93d..ec209cd48d42 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -164,6 +164,9 @@ struct smc_connection {
 	atomic_t		bytes_to_rcv;	/* arrived data,
 						 * not yet received
 						 */
+	atomic_t		splice_pending;	/* number of spliced bytes
+						 * pending processing
+						 */
 #ifndef KERNEL_HAS_ATOMIC64
 	spinlock_t		acurs_lock;	/* protect cursors */
 #endif
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
index 7b64bee656e8..ed45569289f5 100644
--- a/net/smc/smc_rx.c
+++ b/net/smc/smc_rx.c
@@ -43,6 +43,116 @@ static void smc_rx_wake_up(struct sock *sk)
 	rcu_read_unlock();
 }
 
+/* Update consumer cursor
+ *   @conn   connection to update
+ *   @cons   consumer cursor
+ *   @len    number of Bytes consumed
+ */
+static void smc_rx_update_consumer(struct smc_connection *conn,
+				   union smc_host_cursor cons, size_t len)
+{
+	smc_curs_add(conn->rmbe_size, &cons, len);
+	smc_curs_write(&conn->local_tx_ctrl.cons, smc_curs_read(&cons, conn),
+		       conn);
+	/* send consumer cursor update if required */
+	/* similar to advertising new TCP rcv_wnd if required */
+	smc_tx_consumer_update(conn);
+}
+
+struct smc_spd_priv {
+	struct smc_sock *smc;
+	size_t		 len;
+};
+
+static void smc_rx_pipe_buf_release(struct pipe_inode_info *pipe,
+				    struct pipe_buffer *buf)
+{
+	struct smc_spd_priv *priv = (struct smc_spd_priv *)buf->private;
+	struct smc_sock *smc = priv->smc;
+	struct smc_connection *conn;
+	union smc_host_cursor cons;
+	struct sock *sk = &smc->sk;
+
+	if (sk->sk_state == SMC_CLOSED ||
+	    sk->sk_state == SMC_PEERFINCLOSEWAIT ||
+	    sk->sk_state == SMC_APPFINCLOSEWAIT)
+		goto out;
+	conn = &smc->conn;
+	lock_sock(sk);
+	smc_curs_write(&cons, smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+		       conn);
+	smc_rx_update_consumer(conn, cons, priv->len);
+	release_sock(sk);
+	if (atomic_sub_and_test(priv->len, &conn->splice_pending))
+		smc_rx_wake_up(sk);
+out:
+	kfree(priv);
+	put_page(buf->page);
+	sock_put(sk);
+}
+
+static int smc_rx_pipe_buf_nosteal(struct pipe_inode_info *pipe,
+				   struct pipe_buffer *buf)
+{
+	return 1;
+}
+
+static const struct pipe_buf_operations smc_pipe_ops = {
+	.can_merge = 0,
+	.confirm = generic_pipe_buf_confirm,
+	.release = smc_rx_pipe_buf_release,
+	.steal = smc_rx_pipe_buf_nosteal,
+	.get = generic_pipe_buf_get
+};
+
+static void smc_rx_spd_release(struct splice_pipe_desc *spd,
+			       unsigned int i)
+{
+	put_page(spd->pages[i]);
+}
+
+static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len,
+			 struct smc_sock *smc)
+{
+	struct splice_pipe_desc spd;
+	struct partial_page partial;
+	struct smc_spd_priv *priv;
+	struct page *page;
+	int bytes;
+
+	page = virt_to_page(smc->conn.rmb_desc->cpu_addr);
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+	priv->len = len;
+	priv->smc = smc;
+	partial.offset = src - (char *)smc->conn.rmb_desc->cpu_addr;
+	partial.len = len;
+	partial.private = (unsigned long)priv;
+
+	spd.nr_pages_max = 1;
+	spd.nr_pages = 1;
+	spd.pages = &page;
+	spd.partial = &partial;
+	spd.ops = &smc_pipe_ops;
+	spd.spd_release = smc_rx_spd_release;
+
+	bytes = splice_to_pipe(pipe, &spd);
+	if (bytes > 0) {
+		sock_hold(&smc->sk);
+		get_page(smc->conn.rmb_desc->pages);
+		atomic_add(bytes, &smc->conn.splice_pending);
+	}
+
+	return bytes;
+}
+
+static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn)
+{
+	return atomic_read(&conn->bytes_to_rcv) &&
+	       !atomic_read(&conn->splice_pending);
+}
+
 /* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted
  *   @smc    smc socket
  *   @timeo  pointer to max seconds to wait, pointer to value 0 for no timeout
@@ -74,19 +184,25 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo,
 	return rc;
 }
 
-/* rcvbuf consumer: main API called by socket layer.
- * called under sk lock.
+/* smc_rx_recvmsg - receive data from RMBE
+ * @msg:	copy data to receive buffer
+ * @pipe:	copy data to pipe if set - indicates splice() call
+ *
+ * rcvbuf consumer: main API called by socket layer.
+ * Called under sk lock.
  */
-int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
-		   int flags)
+int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
+		   struct pipe_inode_info *pipe, size_t len, int flags)
 {
 	size_t copylen, read_done = 0, read_remaining = len;
 	size_t chunk_len, chunk_off, chunk_len_sum;
 	struct smc_connection *conn = &smc->conn;
+	int (*func)(struct smc_connection *conn);
 	union smc_host_cursor cons;
 	int readable, chunk;
 	char *rcvbuf_base;
 	struct sock *sk;
+	int splbytes;
 	long timeo;
 	int target;		/* Read at least these many bytes */
 	int rc;
@@ -102,12 +218,11 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
 
-	msg->msg_namelen = 0;
 	/* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */
 	rcvbuf_base = conn->rmb_desc->cpu_addr;
 
 	do { /* while (read_remaining) */
-		if (read_done >= target)
+		if (read_done >= target || (pipe && read_done))
 			break;
 
 		if (atomic_read(&conn->bytes_to_rcv))
@@ -156,11 +271,24 @@ copy:
 		/* initialize variables for 1st iteration of subsequent loop */
 		/* could be just 1 byte, even after waiting on data above */
 		readable = atomic_read(&conn->bytes_to_rcv);
+		splbytes = atomic_read(&conn->splice_pending);
+		if (!readable || (msg && splbytes)) {
+			if (splbytes)
+				func = smc_rx_data_available_and_no_splice_pend;
+			else
+				func = smc_rx_data_available;
+			smc_rx_wait(smc, &timeo, func);
+			continue;
+		}
+
 		/* not more than what user space asked for */
 		copylen = min_t(size_t, read_remaining, readable);
 		smc_curs_write(&cons,
 			       smc_curs_read(&conn->local_tx_ctrl.cons, conn),
 			       conn);
+		/* subsequent splice() calls pick up where previous left */
+		if (splbytes)
+			smc_curs_add(conn->rmbe_size, &cons, splbytes);
 		/* determine chunks where to read from rcvbuf */
 		/* either unwrapped case, or 1st chunk of wrapped case */
 		chunk_len = min_t(size_t,
@@ -170,9 +298,16 @@ copy:
 		smc_rmb_sync_sg_for_cpu(conn);
 		for (chunk = 0; chunk < 2; chunk++) {
 			if (!(flags & MSG_TRUNC)) {
-				rc = memcpy_to_msg(msg, rcvbuf_base + chunk_off,
-						   chunk_len);
-				if (rc) {
+				if (msg) {
+					rc = memcpy_to_msg(msg, rcvbuf_base +
+							   chunk_off,
+							   chunk_len);
+				} else {
+					rc = smc_rx_splice(pipe, rcvbuf_base +
+							chunk_off, chunk_len,
+							smc);
+				}
+				if (rc < 0) {
 					if (!read_done)
 						read_done = -EFAULT;
 					smc_rmb_sync_sg_for_device(conn);
@@ -193,18 +328,13 @@ copy:
 
 		/* update cursors */
 		if (!(flags & MSG_PEEK)) {
-			smc_curs_add(conn->rmbe_size, &cons, copylen);
 			/* increased in recv tasklet smc_cdc_msg_rcv() */
 			smp_mb__before_atomic();
 			atomic_sub(copylen, &conn->bytes_to_rcv);
 			/* guarantee 0 <= bytes_to_rcv <= rmbe_size */
 			smp_mb__after_atomic();
-			smc_curs_write(&conn->local_tx_ctrl.cons,
-				       smc_curs_read(&cons, conn),
-				       conn);
-			/* send consumer cursor update if required */
-			/* similar to advertising new TCP rcv_wnd if required */
-			smc_tx_consumer_update(conn);
+			if (msg)
+				smc_rx_update_consumer(conn, cons, copylen);
 		}
 	} while (read_remaining);
 out:
@@ -215,4 +345,5 @@ out:
 void smc_rx_init(struct smc_sock *smc)
 {
 	smc->sk.sk_data_ready = smc_rx_wake_up;
+	atomic_set(&smc->conn.splice_pending, 0);
 }
diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h
index 8f9f00997641..db823c97d824 100644
--- a/net/smc/smc_rx.h
+++ b/net/smc/smc_rx.h
@@ -18,8 +18,9 @@
 #include "smc.h"
 
 void smc_rx_init(struct smc_sock *smc);
-int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
-		   int flags);
+
+int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
+		   struct pipe_inode_info *pipe, size_t len, int flags);
 int smc_rx_wait(struct smc_sock *smc, long *timeo,
 		int (*fcrit)(struct smc_connection *conn));
 static inline int smc_rx_data_available(struct smc_connection *conn)
@@ -27,5 +28,4 @@ static inline int smc_rx_data_available(struct smc_connection *conn)
 	return atomic_read(&conn->bytes_to_rcv);
 }
 
-
 #endif /* SMC_RX_H */
-- 
cgit v1.2.3-59-g8ed1b


From 2d943adf860302a4c7bfee498e64ded9fe7d9dba Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 19 Apr 2018 08:49:29 -0700
Subject: net/mlx4_en: optimizes get_fixed_ipv6_csum()

While trying to support CHECKSUM_COMPLETE for IPV6 fragments,
I had to experiments various hacks in get_fixed_ipv6_csum().
I must admit I could not find how to implement this :/

However, get_fixed_ipv6_csum() does a lot of redundant operations,
calling csum_partial() twice.

First csum_partial() computes the checksum of saddr and daddr,
put in @csum_pseudo_hdr. Undone later in the second csum_partial()
computed on whole ipv6 header.

Then nexthdr is added once, added a second time, then substracted.

payload_len is added once, then substracted.

Really all this can be reduced to two add_csum(), to add back 6 bytes
that were removed by mlx4 when providing hw_checksum in RX descriptor.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Saeed Mahameed <saeedm@mellanox.com>
Cc: Tariq Toukan <tariqt@mellanox.com>
Reviewed-by: Saeed Mahameed <saeedm@mellanox.com>
Acked-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index efc55feddc5c..9f54ccbddea7 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -593,30 +593,25 @@ static int get_fixed_ipv4_csum(__wsum hw_checksum, struct sk_buff *skb,
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
-/* In IPv6 packets, besides subtracting the pseudo header checksum,
- * we also compute/add the IP header checksum which
- * is not added by the HW.
+/* In IPv6 packets, hw_checksum lacks 6 bytes from IPv6 header:
+ * 4 first bytes : priority, version, flow_lbl
+ * and 2 additional bytes : nexthdr, hop_limit.
  */
 static int get_fixed_ipv6_csum(__wsum hw_checksum, struct sk_buff *skb,
 			       struct ipv6hdr *ipv6h)
 {
 	__u8 nexthdr = ipv6h->nexthdr;
-	__wsum csum_pseudo_hdr = 0;
+	__wsum temp;
 
 	if (unlikely(nexthdr == IPPROTO_FRAGMENT ||
 		     nexthdr == IPPROTO_HOPOPTS ||
 		     nexthdr == IPPROTO_SCTP))
 		return -1;
-	hw_checksum = csum_add(hw_checksum, (__force __wsum)htons(nexthdr));
 
-	csum_pseudo_hdr = csum_partial(&ipv6h->saddr,
-				       sizeof(ipv6h->saddr) + sizeof(ipv6h->daddr), 0);
-	csum_pseudo_hdr = csum_add(csum_pseudo_hdr, (__force __wsum)ipv6h->payload_len);
-	csum_pseudo_hdr = csum_add(csum_pseudo_hdr,
-				   (__force __wsum)htons(nexthdr));
-
-	skb->csum = csum_sub(hw_checksum, csum_pseudo_hdr);
-	skb->csum = csum_add(skb->csum, csum_partial(ipv6h, sizeof(struct ipv6hdr), 0));
+	/* priority, version, flow_lbl */
+	temp = csum_add(hw_checksum, *(__wsum *)ipv6h);
+	/* nexthdr and hop_limit */
+	skb->csum = csum_add(temp, (__force __wsum)*(__be16 *)&ipv6h->nexthdr);
 	return 0;
 }
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From d034294e3c5bec1d5b5c6338991b29d19c820ea0 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Tue, 3 Apr 2018 14:03:52 +0300
Subject: net/mlx5: Decrease level of prints about non-existent MKEY

User-controlled application can cause multiple prints as below to flood
dmesg. Since knowledge of failed MKey release is important for debug,
let's decrease its level to debug.

mlx5_core 0000:00:04.0: mlx5_core_destroy_mkey:127:(pid 2352): failed
radix tree delete of mkey 0x1ed700

Reported-by: Noa Osherovich <noaos@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/mr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c
index b9736f505bdf..f4f02f775c93 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mr.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c
@@ -123,8 +123,8 @@ int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev,
 	deleted_mkey = radix_tree_delete(&table->tree, mlx5_base_mkey(mkey->key));
 	write_unlock_irqrestore(&table->lock, flags);
 	if (!deleted_mkey) {
-		mlx5_core_warn(dev, "failed radix tree delete of mkey 0x%x\n",
-			       mlx5_base_mkey(mkey->key));
+		mlx5_core_dbg(dev, "failed radix tree delete of mkey 0x%x\n",
+			      mlx5_base_mkey(mkey->key));
 		return -ENOENT;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From ed644fab19db05902243f3ebf9e1245c4c79bebb Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Sat, 4 Nov 2017 07:18:28 +0200
Subject: net/mlx5: Refactor num of blocks in mailbox calculation

Get the logic that calculates the number of blocks in a command mailbox
into a dedicated function.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 21cd1703a862..0f7062104ad9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -135,6 +135,14 @@ static struct mlx5_cmd_layout *get_inst(struct mlx5_cmd *cmd, int idx)
 	return cmd->cmd_buf + (idx << cmd->log_stride);
 }
 
+static int mlx5_calc_cmd_blocks(struct mlx5_cmd_msg *msg)
+{
+	int size = msg->len;
+	int blen = size - min_t(int, sizeof(msg->first.data), size);
+
+	return DIV_ROUND_UP(blen, MLX5_CMD_DATA_BLOCK_SIZE);
+}
+
 static u8 xor8_buf(void *buf, size_t offset, int len)
 {
 	u8 *ptr = buf;
@@ -174,10 +182,7 @@ static void calc_block_sig(struct mlx5_cmd_prot_block *block)
 static void calc_chain_sig(struct mlx5_cmd_msg *msg)
 {
 	struct mlx5_cmd_mailbox *next = msg->next;
-	int size = msg->len;
-	int blen = size - min_t(int, sizeof(msg->first.data), size);
-	int n = (blen + MLX5_CMD_DATA_BLOCK_SIZE - 1)
-		/ MLX5_CMD_DATA_BLOCK_SIZE;
+	int n = mlx5_calc_cmd_blocks(msg);
 	int i = 0;
 
 	for (i = 0; i < n && next; i++)  {
@@ -220,12 +225,9 @@ static void free_cmd(struct mlx5_cmd_work_ent *ent)
 static int verify_signature(struct mlx5_cmd_work_ent *ent)
 {
 	struct mlx5_cmd_mailbox *next = ent->out->next;
+	int n = mlx5_calc_cmd_blocks(ent->out);
 	int err;
 	u8 sig;
-	int size = ent->out->len;
-	int blen = size - min_t(int, sizeof(ent->out->first.data), size);
-	int n = (blen + MLX5_CMD_DATA_BLOCK_SIZE - 1)
-		/ MLX5_CMD_DATA_BLOCK_SIZE;
 	int i = 0;
 
 	sig = xor8_buf(ent->lay, 0, sizeof(*ent->lay));
@@ -1137,7 +1139,6 @@ static struct mlx5_cmd_msg *mlx5_alloc_cmd_msg(struct mlx5_core_dev *dev,
 	struct mlx5_cmd_mailbox *tmp, *head = NULL;
 	struct mlx5_cmd_prot_block *block;
 	struct mlx5_cmd_msg *msg;
-	int blen;
 	int err;
 	int n;
 	int i;
@@ -1146,8 +1147,8 @@ static struct mlx5_cmd_msg *mlx5_alloc_cmd_msg(struct mlx5_core_dev *dev,
 	if (!msg)
 		return ERR_PTR(-ENOMEM);
 
-	blen = size - min_t(int, sizeof(msg->first.data), size);
-	n = (blen + MLX5_CMD_DATA_BLOCK_SIZE - 1) / MLX5_CMD_DATA_BLOCK_SIZE;
+	msg->len = size;
+	n = mlx5_calc_cmd_blocks(msg);
 
 	for (i = 0; i < n; i++) {
 		tmp = alloc_cmd_box(dev, flags);
@@ -1165,7 +1166,6 @@ static struct mlx5_cmd_msg *mlx5_alloc_cmd_msg(struct mlx5_core_dev *dev,
 		head = tmp;
 	}
 	msg->next = head;
-	msg->len = size;
 	return msg;
 
 err_alloc:
-- 
cgit v1.2.3-59-g8ed1b


From 12b996d16b83414fbbdde37b661cfd0a622f40cf Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Sun, 8 Apr 2018 14:16:48 +0300
Subject: net/mlx5: Fix dump_command mailbox length printed

Dump command mailbox length printed was correct only if data_only flag
was set. For the case that data_only flag was clear the offset to stop
printing at was wrong and so the buffer printed was too short.
Changed the print loop to stop according to number of buffers in
mailbox.

Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters")
Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 0f7062104ad9..487388aed98f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -722,9 +722,11 @@ static void dump_command(struct mlx5_core_dev *dev,
 	struct mlx5_cmd_msg *msg = input ? ent->in : ent->out;
 	u16 op = MLX5_GET(mbox_in, ent->lay->in, opcode);
 	struct mlx5_cmd_mailbox *next = msg->next;
+	int n = mlx5_calc_cmd_blocks(msg);
 	int data_only;
 	u32 offset = 0;
 	int dump_len;
+	int i;
 
 	data_only = !!(mlx5_core_debug_mask & (1 << MLX5_CMD_DATA));
 
@@ -751,7 +753,7 @@ static void dump_command(struct mlx5_core_dev *dev,
 		offset += sizeof(*ent->lay);
 	}
 
-	while (next && offset < msg->len) {
+	for (i = 0; i < n && next; i++)  {
 		if (data_only) {
 			dump_len = min_t(int, MLX5_CMD_DATA_BLOCK_SIZE, msg->len - offset);
 			dump_buf(next->buf, dump_len, 1, offset);
-- 
cgit v1.2.3-59-g8ed1b


From 6fa242af73244b6a23dc0aa58c5c410693e407e5 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Sun, 18 Feb 2018 10:35:25 +0200
Subject: net/mlx5: Cleanup unused field in Work Queue parameters

Remove the 'linear' field from struct mlx5_wq_param.
It is redundant, set but never read.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 1 -
 drivers/net/ethernet/mellanox/mlx5/core/wq.h      | 1 -
 2 files changed, 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index b29c1d93f058..f60905648797 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1896,7 +1896,6 @@ static void mlx5e_build_rq_param(struct mlx5e_priv *priv,
 	MLX5_SET(rqc, rqc, scatter_fcs,    params->scatter_fcs_en);
 
 	param->wq.buf_numa_node = dev_to_node(&mdev->pdev->dev);
-	param->wq.linear = 1;
 }
 
 static void mlx5e_build_drop_rq_param(struct mlx5e_priv *priv,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.h b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
index fca90b94596d..f3dfa0ca3c5d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/wq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
@@ -38,7 +38,6 @@
 #include <linux/mlx5/qp.h>
 
 struct mlx5_wq_param {
-	int		linear;
 	int		buf_numa_node;
 	int		db_numa_node;
 };
-- 
cgit v1.2.3-59-g8ed1b


From a8408f4e6db775e245f20edf12b13fd58cc03a1c Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Thu, 3 May 2018 14:35:03 +0100
Subject: net/mlx5: fix spelling mistake: "modfiy" -> "modify"

Trivial fix to spelling mistake in netdev_warn warning message

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
index 610d485c4b03..f64b5e78519b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
@@ -565,7 +565,7 @@ static void arfs_modify_rule_rq(struct mlx5e_priv *priv,
 	err =  mlx5_modify_rule_destination(rule, &dst, NULL);
 	if (err)
 		netdev_warn(priv->netdev,
-			    "Failed to modfiy aRFS rule destination to rq=%d\n", rxq);
+			    "Failed to modify aRFS rule destination to rq=%d\n", rxq);
 }
 
 static void arfs_handle_work(struct work_struct *work)
-- 
cgit v1.2.3-59-g8ed1b


From 0cd3cbed3caf6eae3bc0fa4afa4f26a9babfe55a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 3 May 2018 18:37:08 -0700
Subject: bpf: offload: allow offloaded programs to use perf event arrays

BPF_MAP_TYPE_PERF_EVENT_ARRAY is special as far as offload goes.
The map only holds glue to perf ring, not actual data.  Allow
non-offloaded perf event arrays to be used in offloaded programs.
Offload driver can extract the events from HW and put them in
the map for user space to retrieve.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h  | 5 +++++
 kernel/bpf/offload.c | 6 ++++--
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0e00a13ff01b..321969da67b7 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -110,6 +110,11 @@ static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map)
 	return container_of(map, struct bpf_offloaded_map, map);
 }
 
+static inline bool bpf_map_offload_neutral(const struct bpf_map *map)
+{
+	return map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY;
+}
+
 static inline bool bpf_map_support_seq_show(const struct bpf_map *map)
 {
 	return map->ops->map_seq_show_elem && map->ops->map_check_btf;
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index c9401075b58c..ac747d5cf7c6 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017 Netronome Systems, Inc.
+ * Copyright (C) 2017-2018 Netronome Systems, Inc.
  *
  * This software is licensed under the GNU General License Version 2,
  * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -474,8 +474,10 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map)
 	struct bpf_prog_offload *offload;
 	bool ret;
 
-	if (!bpf_prog_is_dev_bound(prog->aux) || !bpf_map_is_dev_bound(map))
+	if (!bpf_prog_is_dev_bound(prog->aux))
 		return false;
+	if (!bpf_map_is_dev_bound(map))
+		return bpf_map_offload_neutral(map);
 
 	down_read(&bpf_devs_lock);
 	offload = prog->aux->offload;
-- 
cgit v1.2.3-59-g8ed1b


From 630a4d3874a06aa9f9481cbfc688981aad7a834c Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 3 May 2018 18:37:09 -0700
Subject: nfp: bpf: record offload neutral maps in the driver

For asynchronous events originating from the device, like perf event
output, we need to be able to make sure that objects being referred
to by the FW message are valid on the host.  FW events can get queued
and reordered.  Even if we had a FW message "barrier" we should still
protect ourselves from bogus FW output.

Add a reverse-mapping hash table and record in it all raw map pointers
FW may refer to.  Only record neutral maps, i.e. perf event arrays.
These are currently the only objects FW can refer to.  Use RCU protection
on the read side, update side is under RTNL.

Since program vs map destruction order is slightly painful for offload
simply take an extra reference on all the recorded maps to make sure
they don't disappear.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/main.c    |  25 ++++-
 drivers/net/ethernet/netronome/nfp/bpf/main.h    |  20 +++-
 drivers/net/ethernet/netronome/nfp/bpf/offload.c | 125 ++++++++++++++++++++++-
 drivers/net/ethernet/netronome/nfp/nfp_app.c     |   2 +-
 kernel/bpf/syscall.c                             |   2 +
 5 files changed, 168 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index 1dc424685f4e..f65df341c557 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017 Netronome Systems, Inc.
+ * Copyright (C) 2017-2018 Netronome Systems, Inc.
  *
  * This software is dual licensed under the GNU General License Version 2,
  * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -43,6 +43,14 @@
 #include "fw.h"
 #include "main.h"
 
+const struct rhashtable_params nfp_bpf_maps_neutral_params = {
+	.nelem_hint		= 4,
+	.key_len		= FIELD_SIZEOF(struct nfp_bpf_neutral_map, ptr),
+	.key_offset		= offsetof(struct nfp_bpf_neutral_map, ptr),
+	.head_offset		= offsetof(struct nfp_bpf_neutral_map, l),
+	.automatic_shrinking	= true,
+};
+
 static bool nfp_net_ebpf_capable(struct nfp_net *nn)
 {
 #ifdef __LITTLE_ENDIAN
@@ -401,17 +409,28 @@ static int nfp_bpf_init(struct nfp_app *app)
 	init_waitqueue_head(&bpf->cmsg_wq);
 	INIT_LIST_HEAD(&bpf->map_list);
 
-	err = nfp_bpf_parse_capabilities(app);
+	err = rhashtable_init(&bpf->maps_neutral, &nfp_bpf_maps_neutral_params);
 	if (err)
 		goto err_free_bpf;
 
+	err = nfp_bpf_parse_capabilities(app);
+	if (err)
+		goto err_free_neutral_maps;
+
 	return 0;
 
+err_free_neutral_maps:
+	rhashtable_destroy(&bpf->maps_neutral);
 err_free_bpf:
 	kfree(bpf);
 	return err;
 }
 
+static void nfp_check_rhashtable_empty(void *ptr, void *arg)
+{
+	WARN_ON_ONCE(1);
+}
+
 static void nfp_bpf_clean(struct nfp_app *app)
 {
 	struct nfp_app_bpf *bpf = app->priv;
@@ -419,6 +438,8 @@ static void nfp_bpf_clean(struct nfp_app *app)
 	WARN_ON(!skb_queue_empty(&bpf->cmsg_replies));
 	WARN_ON(!list_empty(&bpf->map_list));
 	WARN_ON(bpf->maps_in_use || bpf->map_elems_in_use);
+	rhashtable_free_and_destroy(&bpf->maps_neutral,
+				    nfp_check_rhashtable_empty, NULL);
 	kfree(bpf);
 }
 
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 68b5d326483d..5db6284a44ba 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2016-2017 Netronome Systems, Inc.
+ * Copyright (C) 2016-2018 Netronome Systems, Inc.
  *
  * This software is dual licensed under the GNU General License Version 2,
  * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -39,6 +39,7 @@
 #include <linux/bpf_verifier.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
+#include <linux/rhashtable.h>
 #include <linux/skbuff.h>
 #include <linux/types.h>
 #include <linux/wait.h>
@@ -114,6 +115,8 @@ enum pkt_vec {
  * @maps_in_use:	number of currently offloaded maps
  * @map_elems_in_use:	number of elements allocated to offloaded maps
  *
+ * @maps_neutral:	hash table of offload-neutral maps (on pointer)
+ *
  * @adjust_head:	adjust head capability
  * @adjust_head.flags:		extra flags for adjust head
  * @adjust_head.off_min:	minimal packet offset within buffer required
@@ -150,6 +153,8 @@ struct nfp_app_bpf {
 	unsigned int maps_in_use;
 	unsigned int map_elems_in_use;
 
+	struct rhashtable maps_neutral;
+
 	struct nfp_bpf_cap_adjust_head {
 		u32 flags;
 		int off_min;
@@ -199,6 +204,14 @@ struct nfp_bpf_map {
 	enum nfp_bpf_map_use use_map[];
 };
 
+struct nfp_bpf_neutral_map {
+	struct rhash_head l;
+	struct bpf_map *ptr;
+	u32 count;
+};
+
+extern const struct rhashtable_params nfp_bpf_maps_neutral_params;
+
 struct nfp_prog;
 struct nfp_insn_meta;
 typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *);
@@ -367,6 +380,8 @@ static inline bool is_mbpf_xadd(const struct nfp_insn_meta *meta)
  * @error: error code if something went wrong
  * @stack_depth: max stack depth from the verifier
  * @adjust_head_location: if program has single adjust head call - the insn no.
+ * @map_records_cnt: the number of map pointers recorded for this prog
+ * @map_records: the map record pointers from bpf->maps_neutral
  * @insns: list of BPF instruction wrappers (struct nfp_insn_meta)
  */
 struct nfp_prog {
@@ -390,6 +405,9 @@ struct nfp_prog {
 	unsigned int stack_depth;
 	unsigned int adjust_head_location;
 
+	unsigned int map_records_cnt;
+	struct nfp_bpf_neutral_map **map_records;
+
 	struct list_head insns;
 };
 
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index 42d98792bd25..e3ead906cc60 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2016-2017 Netronome Systems, Inc.
+ * Copyright (C) 2016-2018 Netronome Systems, Inc.
  *
  * This software is dual licensed under the GNU General License Version 2,
  * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -56,6 +56,126 @@
 #include "../nfp_net_ctrl.h"
 #include "../nfp_net.h"
 
+static int
+nfp_map_ptr_record(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog,
+		   struct bpf_map *map)
+{
+	struct nfp_bpf_neutral_map *record;
+	int err;
+
+	/* Map record paths are entered via ndo, update side is protected. */
+	ASSERT_RTNL();
+
+	/* Reuse path - other offloaded program is already tracking this map. */
+	record = rhashtable_lookup_fast(&bpf->maps_neutral, &map,
+					nfp_bpf_maps_neutral_params);
+	if (record) {
+		nfp_prog->map_records[nfp_prog->map_records_cnt++] = record;
+		record->count++;
+		return 0;
+	}
+
+	/* Grab a single ref to the map for our record.  The prog destroy ndo
+	 * happens after free_used_maps().
+	 */
+	map = bpf_map_inc(map, false);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	record = kmalloc(sizeof(*record), GFP_KERNEL);
+	if (!record) {
+		err = -ENOMEM;
+		goto err_map_put;
+	}
+
+	record->ptr = map;
+	record->count = 1;
+
+	err = rhashtable_insert_fast(&bpf->maps_neutral, &record->l,
+				     nfp_bpf_maps_neutral_params);
+	if (err)
+		goto err_free_rec;
+
+	nfp_prog->map_records[nfp_prog->map_records_cnt++] = record;
+
+	return 0;
+
+err_free_rec:
+	kfree(record);
+err_map_put:
+	bpf_map_put(map);
+	return err;
+}
+
+static void
+nfp_map_ptrs_forget(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog)
+{
+	bool freed = false;
+	int i;
+
+	ASSERT_RTNL();
+
+	for (i = 0; i < nfp_prog->map_records_cnt; i++) {
+		if (--nfp_prog->map_records[i]->count) {
+			nfp_prog->map_records[i] = NULL;
+			continue;
+		}
+
+		WARN_ON(rhashtable_remove_fast(&bpf->maps_neutral,
+					       &nfp_prog->map_records[i]->l,
+					       nfp_bpf_maps_neutral_params));
+		freed = true;
+	}
+
+	if (freed) {
+		synchronize_rcu();
+
+		for (i = 0; i < nfp_prog->map_records_cnt; i++)
+			if (nfp_prog->map_records[i]) {
+				bpf_map_put(nfp_prog->map_records[i]->ptr);
+				kfree(nfp_prog->map_records[i]);
+			}
+	}
+
+	kfree(nfp_prog->map_records);
+	nfp_prog->map_records = NULL;
+	nfp_prog->map_records_cnt = 0;
+}
+
+static int
+nfp_map_ptrs_record(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog,
+		    struct bpf_prog *prog)
+{
+	int i, cnt, err;
+
+	/* Quickly count the maps we will have to remember */
+	cnt = 0;
+	for (i = 0; i < prog->aux->used_map_cnt; i++)
+		if (bpf_map_offload_neutral(prog->aux->used_maps[i]))
+			cnt++;
+	if (!cnt)
+		return 0;
+
+	nfp_prog->map_records = kmalloc_array(cnt,
+					      sizeof(nfp_prog->map_records[0]),
+					      GFP_KERNEL);
+	if (!nfp_prog->map_records)
+		return -ENOMEM;
+
+	for (i = 0; i < prog->aux->used_map_cnt; i++)
+		if (bpf_map_offload_neutral(prog->aux->used_maps[i])) {
+			err = nfp_map_ptr_record(bpf, nfp_prog,
+						 prog->aux->used_maps[i]);
+			if (err) {
+				nfp_map_ptrs_forget(bpf, nfp_prog);
+				return err;
+			}
+		}
+	WARN_ON(cnt != nfp_prog->map_records_cnt);
+
+	return 0;
+}
+
 static int
 nfp_prog_prepare(struct nfp_prog *nfp_prog, const struct bpf_insn *prog,
 		 unsigned int cnt)
@@ -151,7 +271,7 @@ static int nfp_bpf_translate(struct nfp_net *nn, struct bpf_prog *prog)
 	prog->aux->offload->jited_len = nfp_prog->prog_len * sizeof(u64);
 	prog->aux->offload->jited_image = nfp_prog->prog;
 
-	return 0;
+	return nfp_map_ptrs_record(nfp_prog->bpf, nfp_prog, prog);
 }
 
 static int nfp_bpf_destroy(struct nfp_net *nn, struct bpf_prog *prog)
@@ -159,6 +279,7 @@ static int nfp_bpf_destroy(struct nfp_net *nn, struct bpf_prog *prog)
 	struct nfp_prog *nfp_prog = prog->aux->offload->dev_priv;
 
 	kvfree(nfp_prog->prog);
+	nfp_map_ptrs_forget(nfp_prog->bpf, nfp_prog);
 	nfp_prog_free(nfp_prog);
 
 	return 0;
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.c b/drivers/net/ethernet/netronome/nfp/nfp_app.c
index 6aedef0ad433..0e0253c7e17b 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_app.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_app.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017 Netronome Systems, Inc.
+ * Copyright (C) 2017-2018 Netronome Systems, Inc.
  *
  * This software is dual licensed under the GNU General License Version 2,
  * June 1991 as shown in the file COPYING in the top-level directory of this
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 263e13ede029..9b87198deea2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -282,6 +282,7 @@ void bpf_map_put(struct bpf_map *map)
 {
 	__bpf_map_put(map, true);
 }
+EXPORT_SYMBOL_GPL(bpf_map_put);
 
 void bpf_map_put_with_uref(struct bpf_map *map)
 {
@@ -543,6 +544,7 @@ struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref)
 		atomic_inc(&map->usercnt);
 	return map;
 }
+EXPORT_SYMBOL_GPL(bpf_map_inc);
 
 struct bpf_map *bpf_map_get_with_uref(u32 ufd)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 6cb5fb3891db9d3f6feb0f7669946550c9abc420 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 3 May 2018 18:37:10 -0700
Subject: bpf: export bpf_event_output()

bpf_event_output() is useful for offloads to add events to BPF
event rings, export it.  Note that export is placed near the stub
since tracing is optional and kernel/bpf/core.c is always going
to be built.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 1127552c8033..d0d7d9462368 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1719,6 +1719,7 @@ bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 {
 	return -ENOTSUPP;
 }
+EXPORT_SYMBOL_GPL(bpf_event_output);
 
 /* Always built-in helper functions. */
 const struct bpf_func_proto bpf_tail_call_proto = {
-- 
cgit v1.2.3-59-g8ed1b


From f4e3ec0d573e238f383b3da365127002579a07d6 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 3 May 2018 18:37:11 -0700
Subject: bpf: replace map pointer loads before calling into offloads

Offloads may find host map pointers more useful than map fds.
Map pointers can be used to identify the map, while fds are
only valid within the context of loading process.

Jump to skip_full_check on error in case verifier log overflow
has to be handled (replace_map_fd_with_map_ptr() prints to the
log, driver prep may do that too in the future).

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/verifier.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 37e0affa515e..ccac552ac861 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5741,16 +5741,16 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
 		env->strict_alignment = true;
 
+	ret = replace_map_fd_with_map_ptr(env);
+	if (ret < 0)
+		goto skip_full_check;
+
 	if (bpf_prog_is_dev_bound(env->prog->aux)) {
 		ret = bpf_prog_offload_verifier_prep(env);
 		if (ret)
-			goto err_unlock;
+			goto skip_full_check;
 	}
 
-	ret = replace_map_fd_with_map_ptr(env);
-	if (ret < 0)
-		goto skip_full_check;
-
 	env->explored_states = kcalloc(env->prog->len,
 				       sizeof(struct bpf_verifier_state_list *),
 				       GFP_USER);
-- 
cgit v1.2.3-59-g8ed1b


From 9816dd35ececc095f3e3be29d30d3adc755908d9 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 3 May 2018 18:37:12 -0700
Subject: nfp: bpf: perf event output helpers support

Add support for the perf_event_output family of helpers.

The implementation on the NFP will not match the host code exactly.
The state of the host map and rings is unknown to the device, hence
device can't return errors when rings are not installed.  The device
simply packs the data into a firmware notification message and sends
it over to the host, returning success to the program.

There is no notion of a host CPU on the device when packets are being
processed.  Device will only offload programs which set BPF_F_CURRENT_CPU.
Still, if map index doesn't match CPU no error will be returned (see
above).

Dropped/lost firmware notification messages will not cause "lost
events" event on the perf ring, they are only visible via device
error counters.

Firmware notification messages may also get reordered in respect
to the packets which caused their generation.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/cmsg.c     | 16 +++++-
 drivers/net/ethernet/netronome/nfp/bpf/fw.h       | 20 ++++++-
 drivers/net/ethernet/netronome/nfp/bpf/jit.c      | 32 ++++++++++-
 drivers/net/ethernet/netronome/nfp/bpf/main.c     |  3 +
 drivers/net/ethernet/netronome/nfp/bpf/main.h     |  4 ++
 drivers/net/ethernet/netronome/nfp/bpf/offload.c  | 47 +++++++++++++++
 drivers/net/ethernet/netronome/nfp/bpf/verifier.c | 69 ++++++++++++++++++++++-
 7 files changed, 187 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c
index 7e298148ca26..cb87fccb9f6a 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017 Netronome Systems, Inc.
+ * Copyright (C) 2017-2018 Netronome Systems, Inc.
  *
  * This software is dual licensed under the GNU General License Version 2,
  * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -102,6 +102,15 @@ nfp_bpf_cmsg_map_req_alloc(struct nfp_app_bpf *bpf, unsigned int n)
 	return nfp_bpf_cmsg_alloc(bpf, size);
 }
 
+static u8 nfp_bpf_cmsg_get_type(struct sk_buff *skb)
+{
+	struct cmsg_hdr *hdr;
+
+	hdr = (struct cmsg_hdr *)skb->data;
+
+	return hdr->type;
+}
+
 static unsigned int nfp_bpf_cmsg_get_tag(struct sk_buff *skb)
 {
 	struct cmsg_hdr *hdr;
@@ -431,6 +440,11 @@ void nfp_bpf_ctrl_msg_rx(struct nfp_app *app, struct sk_buff *skb)
 		goto err_free;
 	}
 
+	if (nfp_bpf_cmsg_get_type(skb) == CMSG_TYPE_BPF_EVENT) {
+		nfp_bpf_event_output(bpf, skb);
+		return;
+	}
+
 	nfp_ctrl_lock(bpf->app->ctrl);
 
 	tag = nfp_bpf_cmsg_get_tag(skb);
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/fw.h b/drivers/net/ethernet/netronome/nfp/bpf/fw.h
index 39639ac28b01..3dbc21653ce5 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/fw.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/fw.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017 Netronome Systems, Inc.
+ * Copyright (C) 2017-2018 Netronome Systems, Inc.
  *
  * This software is dual licensed under the GNU General License Version 2,
  * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -37,6 +37,14 @@
 #include <linux/bitops.h>
 #include <linux/types.h>
 
+/* Kernel's enum bpf_reg_type is not uABI so people may change it breaking
+ * our FW ABI.  In that case we will do translation in the driver.
+ */
+#define NFP_BPF_SCALAR_VALUE		1
+#define NFP_BPF_MAP_VALUE		4
+#define NFP_BPF_STACK			6
+#define NFP_BPF_PACKET_DATA		8
+
 enum bpf_cap_tlv_type {
 	NFP_BPF_CAP_TYPE_FUNC		= 1,
 	NFP_BPF_CAP_TYPE_ADJUST_HEAD	= 2,
@@ -81,6 +89,7 @@ enum nfp_bpf_cmsg_type {
 	CMSG_TYPE_MAP_DELETE	= 5,
 	CMSG_TYPE_MAP_GETNEXT	= 6,
 	CMSG_TYPE_MAP_GETFIRST	= 7,
+	CMSG_TYPE_BPF_EVENT	= 8,
 	__CMSG_TYPE_MAP_MAX,
 };
 
@@ -155,4 +164,13 @@ struct cmsg_reply_map_op {
 	__be32 resv;
 	struct cmsg_key_value_pair elem[0];
 };
+
+struct cmsg_bpf_event {
+	struct cmsg_hdr hdr;
+	__be32 cpu_id;
+	__be64 map_ptr;
+	__be32 data_size;
+	__be32 pkt_size;
+	u8 data[0];
+};
 #endif
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 65f0791cae0c..2127cf1548dd 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2016-2017 Netronome Systems, Inc.
+ * Copyright (C) 2016-2018 Netronome Systems, Inc.
  *
  * This software is dual licensed under the GNU General License Version 2,
  * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -1456,6 +1456,31 @@ nfp_get_prandom_u32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	return 0;
 }
 
+static int
+nfp_perf_event_output(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	swreg ptr_type;
+	u32 ret_tgt;
+
+	ptr_type = ur_load_imm_any(nfp_prog, meta->arg1.type, imm_a(nfp_prog));
+
+	ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
+
+	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id,
+		     2, RELO_BR_HELPER);
+
+	/* Load ptr type into A1 */
+	wrp_mov(nfp_prog, reg_a(1), ptr_type);
+
+	/* Load the return address into B0 */
+	wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL);
+
+	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
+		return -EINVAL;
+
+	return 0;
+}
+
 /* --- Callbacks --- */
 static int mov_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
@@ -2411,6 +2436,8 @@ static int call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 		return map_call_stack_common(nfp_prog, meta);
 	case BPF_FUNC_get_prandom_u32:
 		return nfp_get_prandom_u32(nfp_prog, meta);
+	case BPF_FUNC_perf_event_output:
+		return nfp_perf_event_output(nfp_prog, meta);
 	default:
 		WARN_ONCE(1, "verifier allowed unsupported function\n");
 		return -EOPNOTSUPP;
@@ -3353,6 +3380,9 @@ void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv)
 			case BPF_FUNC_map_delete_elem:
 				val = nfp_prog->bpf->helpers.map_delete;
 				break;
+			case BPF_FUNC_perf_event_output:
+				val = nfp_prog->bpf->helpers.perf_event_output;
+				break;
 			default:
 				pr_err("relocation of unknown helper %d\n",
 				       val);
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index f65df341c557..d72f9e7f42da 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -298,6 +298,9 @@ nfp_bpf_parse_cap_func(struct nfp_app_bpf *bpf, void __iomem *value, u32 length)
 	case BPF_FUNC_map_delete_elem:
 		bpf->helpers.map_delete = readl(&cap->func_addr);
 		break;
+	case BPF_FUNC_perf_event_output:
+		bpf->helpers.perf_event_output = readl(&cap->func_addr);
+		break;
 	}
 
 	return 0;
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 5db6284a44ba..82682378d57f 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -136,6 +136,7 @@ enum pkt_vec {
  * @helpers.map_lookup:		map lookup helper address
  * @helpers.map_update:		map update helper address
  * @helpers.map_delete:		map delete helper address
+ * @helpers.perf_event_output:	output perf event to a ring buffer
  *
  * @pseudo_random:	FW initialized the pseudo-random machinery (CSRs)
  */
@@ -176,6 +177,7 @@ struct nfp_app_bpf {
 		u32 map_lookup;
 		u32 map_update;
 		u32 map_delete;
+		u32 perf_event_output;
 	} helpers;
 
 	bool pseudo_random;
@@ -458,5 +460,7 @@ int nfp_bpf_ctrl_lookup_entry(struct bpf_offloaded_map *offmap,
 int nfp_bpf_ctrl_getnext_entry(struct bpf_offloaded_map *offmap,
 			       void *key, void *next_key);
 
+int nfp_bpf_event_output(struct nfp_app_bpf *bpf, struct sk_buff *skb);
+
 void nfp_bpf_ctrl_msg_rx(struct nfp_app *app, struct sk_buff *skb);
 #endif
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index e3ead906cc60..4db0ac1e42a8 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -441,6 +441,53 @@ int nfp_ndo_bpf(struct nfp_app *app, struct nfp_net *nn, struct netdev_bpf *bpf)
 	}
 }
 
+static unsigned long
+nfp_bpf_perf_event_copy(void *dst, const void *src,
+			unsigned long off, unsigned long len)
+{
+	memcpy(dst, src + off, len);
+	return 0;
+}
+
+int nfp_bpf_event_output(struct nfp_app_bpf *bpf, struct sk_buff *skb)
+{
+	struct cmsg_bpf_event *cbe = (void *)skb->data;
+	u32 pkt_size, data_size;
+	struct bpf_map *map;
+
+	if (skb->len < sizeof(struct cmsg_bpf_event))
+		goto err_drop;
+
+	pkt_size = be32_to_cpu(cbe->pkt_size);
+	data_size = be32_to_cpu(cbe->data_size);
+	map = (void *)(unsigned long)be64_to_cpu(cbe->map_ptr);
+
+	if (skb->len < sizeof(struct cmsg_bpf_event) + pkt_size + data_size)
+		goto err_drop;
+	if (cbe->hdr.ver != CMSG_MAP_ABI_VERSION)
+		goto err_drop;
+
+	rcu_read_lock();
+	if (!rhashtable_lookup_fast(&bpf->maps_neutral, &map,
+				    nfp_bpf_maps_neutral_params)) {
+		rcu_read_unlock();
+		pr_warn("perf event: dest map pointer %px not recognized, dropping event\n",
+			map);
+		goto err_drop;
+	}
+
+	bpf_event_output(map, be32_to_cpu(cbe->cpu_id),
+			 &cbe->data[round_up(pkt_size, 4)], data_size,
+			 cbe->data, pkt_size, nfp_bpf_perf_event_copy);
+	rcu_read_unlock();
+
+	dev_consume_skb_any(skb);
+	return 0;
+err_drop:
+	dev_kfree_skb_any(skb);
+	return -EINVAL;
+}
+
 static int
 nfp_net_bpf_load(struct nfp_net *nn, struct bpf_prog *prog,
 		 struct netlink_ext_ack *extack)
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index 06ad53ce4ad9..c76b622743ae 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2016-2017 Netronome Systems, Inc.
+ * Copyright (C) 2016-2018 Netronome Systems, Inc.
  *
  * This software is dual licensed under the GNU General License Version 2,
  * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -36,6 +36,8 @@
 #include <linux/kernel.h>
 #include <linux/pkt_cls.h>
 
+#include "../nfp_app.h"
+#include "../nfp_main.h"
 #include "fw.h"
 #include "main.h"
 
@@ -216,6 +218,71 @@ nfp_bpf_check_call(struct nfp_prog *nfp_prog, struct bpf_verifier_env *env,
 		pr_vlog(env, "bpf_get_prandom_u32(): FW doesn't support random number generation\n");
 		return -EOPNOTSUPP;
 
+	case BPF_FUNC_perf_event_output:
+		BUILD_BUG_ON(NFP_BPF_SCALAR_VALUE != SCALAR_VALUE ||
+			     NFP_BPF_MAP_VALUE != PTR_TO_MAP_VALUE ||
+			     NFP_BPF_STACK != PTR_TO_STACK ||
+			     NFP_BPF_PACKET_DATA != PTR_TO_PACKET);
+
+		if (!bpf->helpers.perf_event_output) {
+			pr_vlog(env, "event_output: not supported by FW\n");
+			return -EOPNOTSUPP;
+		}
+
+		/* Force current CPU to make sure we can report the event
+		 * wherever we get the control message from FW.
+		 */
+		if (reg3->var_off.mask & BPF_F_INDEX_MASK ||
+		    (reg3->var_off.value & BPF_F_INDEX_MASK) !=
+		    BPF_F_CURRENT_CPU) {
+			char tn_buf[48];
+
+			tnum_strn(tn_buf, sizeof(tn_buf), reg3->var_off);
+			pr_vlog(env, "event_output: must use BPF_F_CURRENT_CPU, var_off: %s\n",
+				tn_buf);
+			return -EOPNOTSUPP;
+		}
+
+		/* Save space in meta, we don't care about arguments other
+		 * than 4th meta, shove it into arg1.
+		 */
+		reg1 = cur_regs(env) + BPF_REG_4;
+
+		if (reg1->type != SCALAR_VALUE /* NULL ptr */ &&
+		    reg1->type != PTR_TO_STACK &&
+		    reg1->type != PTR_TO_MAP_VALUE &&
+		    reg1->type != PTR_TO_PACKET) {
+			pr_vlog(env, "event_output: unsupported ptr type: %d\n",
+				reg1->type);
+			return -EOPNOTSUPP;
+		}
+
+		if (reg1->type == PTR_TO_STACK &&
+		    !nfp_bpf_stack_arg_ok("event_output", env, reg1, NULL))
+			return -EOPNOTSUPP;
+
+		/* Warn user that on offload NFP may return success even if map
+		 * is not going to accept the event, since the event output is
+		 * fully async and device won't know the state of the map.
+		 * There is also FW limitation on the event length.
+		 *
+		 * Lost events will not show up on the perf ring, driver
+		 * won't see them at all.  Events may also get reordered.
+		 */
+		dev_warn_once(&nfp_prog->bpf->app->pf->pdev->dev,
+			      "bpf: note: return codes and behavior of bpf_event_output() helper differs for offloaded programs!\n");
+		pr_vlog(env, "warning: return codes and behavior of event_output helper differ for offload!\n");
+
+		if (!meta->func_id)
+			break;
+
+		if (reg1->type != meta->arg1.type) {
+			pr_vlog(env, "event_output: ptr type changed: %d %d\n",
+				meta->arg1.type, reg1->type);
+			return -EINVAL;
+		}
+		break;
+
 	default:
 		pr_vlog(env, "unsupported function id: %d\n", func_id);
 		return -EOPNOTSUPP;
-- 
cgit v1.2.3-59-g8ed1b


From b4264c96b5cbc00c4c07deb9fbab928d43dffcf9 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 3 May 2018 18:37:13 -0700
Subject: nfp: bpf: rewrite map pointers with NFP TIDs

Kernel will now replace map fds with actual pointer before
calling the offload prepare.  We can identify those pointers
and replace them with NFP table IDs instead of loading the
table ID in code generated for CALL instruction.

This allows us to support having the same CALL being used with
different maps.

Since we don't want to change the FW ABI we still need to
move the TID from R1 to portion of R0 before the jump.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c      | 44 ++++++++++++++++-------
 drivers/net/ethernet/netronome/nfp/bpf/verifier.c |  9 -----
 2 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 2127cf1548dd..326a2085d650 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -1395,15 +1395,9 @@ static int adjust_head(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 static int
 map_call_stack_common(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
-	struct bpf_offloaded_map *offmap;
-	struct nfp_bpf_map *nfp_map;
 	bool load_lm_ptr;
 	u32 ret_tgt;
 	s64 lm_off;
-	swreg tid;
-
-	offmap = (struct bpf_offloaded_map *)meta->arg1.map_ptr;
-	nfp_map = offmap->dev_priv;
 
 	/* We only have to reload LM0 if the key is not at start of stack */
 	lm_off = nfp_prog->stack_depth;
@@ -1416,17 +1410,12 @@ map_call_stack_common(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	if (meta->func_id == BPF_FUNC_map_update_elem)
 		emit_csr_wr(nfp_prog, reg_b(3 * 2), NFP_CSR_ACT_LM_ADDR2);
 
-	/* Load map ID into a register, it should actually fit as an immediate
-	 * but in case it doesn't deal with it here, not in the delay slots.
-	 */
-	tid = ur_load_imm_any(nfp_prog, nfp_map->tid, imm_a(nfp_prog));
-
 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id,
 		     2, RELO_BR_HELPER);
 	ret_tgt = nfp_prog_current_offset(nfp_prog) + 2;
 
 	/* Load map ID into A0 */
-	wrp_mov(nfp_prog, reg_a(0), tid);
+	wrp_mov(nfp_prog, reg_a(0), reg_a(2));
 
 	/* Load the return address into B0 */
 	wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL);
@@ -3254,6 +3243,33 @@ static int nfp_bpf_optimize(struct nfp_prog *nfp_prog)
 	return 0;
 }
 
+static int nfp_bpf_replace_map_ptrs(struct nfp_prog *nfp_prog)
+{
+	struct nfp_insn_meta *meta1, *meta2;
+	struct nfp_bpf_map *nfp_map;
+	struct bpf_map *map;
+
+	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
+		if (meta1->skip || meta2->skip)
+			continue;
+
+		if (meta1->insn.code != (BPF_LD | BPF_IMM | BPF_DW) ||
+		    meta1->insn.src_reg != BPF_PSEUDO_MAP_FD)
+			continue;
+
+		map = (void *)(unsigned long)((u32)meta1->insn.imm |
+					      (u64)meta2->insn.imm << 32);
+		if (bpf_map_offload_neutral(map))
+			continue;
+		nfp_map = map_to_offmap(map)->dev_priv;
+
+		meta1->insn.imm = nfp_map->tid;
+		meta2->insn.imm = 0;
+	}
+
+	return 0;
+}
+
 static int nfp_bpf_ustore_calc(u64 *prog, unsigned int len)
 {
 	__le64 *ustore = (__force __le64 *)prog;
@@ -3290,6 +3306,10 @@ int nfp_bpf_jit(struct nfp_prog *nfp_prog)
 {
 	int ret;
 
+	ret = nfp_bpf_replace_map_ptrs(nfp_prog);
+	if (ret)
+		return ret;
+
 	ret = nfp_bpf_optimize(nfp_prog);
 	if (ret)
 		return ret;
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index c76b622743ae..e163f3cfa47d 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -151,15 +151,6 @@ nfp_bpf_map_call_ok(const char *fname, struct bpf_verifier_env *env,
 		return false;
 	}
 
-	/* Rest of the checks is only if we re-parse the same insn */
-	if (!meta->func_id)
-		return true;
-
-	if (meta->arg1.map_ptr != reg1->map_ptr) {
-		pr_vlog(env, "%s: called for different map\n", fname);
-		return false;
-	}
-
 	return true;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From c642ea265445873901041fa0b892a45ea7c6ff33 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 3 May 2018 18:37:14 -0700
Subject: tools: bpftool: fold hex keyword in command help

Instead of spelling [hex] BYTES everywhere use DATA as keyword
for generalized value.  This will help us keep the messages
concise when longer command are added in the future.  It will
also be useful once BTF support comes.  We will only have to
change the definition of DATA.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/Documentation/bpftool-map.rst | 19 ++++++++++---------
 tools/bpf/bpftool/map.c                         | 13 +++++++------
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst
index 5f512b14bff9..c3eef8c972cd 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
@@ -23,16 +23,17 @@ MAP COMMANDS
 
 |	**bpftool** **map { show | list }**   [*MAP*]
 |	**bpftool** **map dump**    *MAP*
-|	**bpftool** **map update**  *MAP*  **key** [**hex**] *BYTES*   **value** [**hex**] *VALUE* [*UPDATE_FLAGS*]
-|	**bpftool** **map lookup**  *MAP*  **key** [**hex**] *BYTES*
-|	**bpftool** **map getnext** *MAP* [**key** [**hex**] *BYTES*]
-|	**bpftool** **map delete**  *MAP*  **key** [**hex**] *BYTES*
+|	**bpftool** **map update**  *MAP*  **key** *DATA*   **value** *VALUE* [*UPDATE_FLAGS*]
+|	**bpftool** **map lookup**  *MAP*  **key** *DATA*
+|	**bpftool** **map getnext** *MAP* [**key** *DATA*]
+|	**bpftool** **map delete**  *MAP*  **key** *DATA*
 |	**bpftool** **map pin**     *MAP*  *FILE*
 |	**bpftool** **map help**
 |
 |	*MAP* := { **id** *MAP_ID* | **pinned** *FILE* }
+|	*DATA* := { [**hex**] *BYTES* }
 |	*PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
-|	*VALUE* := { *BYTES* | *MAP* | *PROG* }
+|	*VALUE* := { *DATA* | *MAP* | *PROG* }
 |	*UPDATE_FLAGS* := { **any** | **exist** | **noexist** }
 
 DESCRIPTION
@@ -48,7 +49,7 @@ DESCRIPTION
 	**bpftool map dump**    *MAP*
 		  Dump all entries in a given *MAP*.
 
-	**bpftool map update**  *MAP*  **key** [**hex**] *BYTES*   **value** [**hex**] *VALUE* [*UPDATE_FLAGS*]
+	**bpftool map update**  *MAP*  **key** *DATA*   **value** *VALUE* [*UPDATE_FLAGS*]
 		  Update map entry for a given *KEY*.
 
 		  *UPDATE_FLAGS* can be one of: **any** update existing entry
@@ -61,13 +62,13 @@ DESCRIPTION
 		  the bytes are parsed as decimal values, unless a "0x" prefix
 		  (for hexadecimal) or a "0" prefix (for octal) is provided.
 
-	**bpftool map lookup**  *MAP*  **key** [**hex**] *BYTES*
+	**bpftool map lookup**  *MAP*  **key** *DATA*
 		  Lookup **key** in the map.
 
-	**bpftool map getnext** *MAP* [**key** [**hex**] *BYTES*]
+	**bpftool map getnext** *MAP* [**key** *DATA*]
 		  Get next key.  If *key* is not specified, get first key.
 
-	**bpftool map delete**  *MAP*  **key** [**hex**] *BYTES*
+	**bpftool map delete**  *MAP*  **key** *DATA*
 		  Remove entry from the map.
 
 	**bpftool map pin**     *MAP*  *FILE*
diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index a6cdb640a0d7..7da77e4166ec 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017 Netronome Systems, Inc.
+ * Copyright (C) 2017-2018 Netronome Systems, Inc.
  *
  * This software is dual licensed under the GNU General License Version 2,
  * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -874,16 +874,17 @@ static int do_help(int argc, char **argv)
 	fprintf(stderr,
 		"Usage: %s %s { show | list }   [MAP]\n"
 		"       %s %s dump    MAP\n"
-		"       %s %s update  MAP  key [hex] BYTES value [hex] VALUE [UPDATE_FLAGS]\n"
-		"       %s %s lookup  MAP  key [hex] BYTES\n"
-		"       %s %s getnext MAP [key [hex] BYTES]\n"
-		"       %s %s delete  MAP  key [hex] BYTES\n"
+		"       %s %s update  MAP  key DATA value VALUE [UPDATE_FLAGS]\n"
+		"       %s %s lookup  MAP  key DATA\n"
+		"       %s %s getnext MAP [key DATA]\n"
+		"       %s %s delete  MAP  key DATA\n"
 		"       %s %s pin     MAP  FILE\n"
 		"       %s %s help\n"
 		"\n"
 		"       MAP := { id MAP_ID | pinned FILE }\n"
+		"       DATA := { [hex] BYTES }\n"
 		"       " HELP_SPEC_PROGRAM "\n"
-		"       VALUE := { BYTES | MAP | PROG }\n"
+		"       VALUE := { DATA | MAP | PROG }\n"
 		"       UPDATE_FLAGS := { any | exist | noexist }\n"
 		"       " HELP_SPEC_OPTIONS "\n"
 		"",
-- 
cgit v1.2.3-59-g8ed1b


From e64d52569f6e847495091db40ab58d2d379748ef Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 3 May 2018 18:37:15 -0700
Subject: tools: bpftool: move get_possible_cpus() to common code

Move the get_possible_cpus() function to shared code.  No functional
changes.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/common.c | 58 +++++++++++++++++++++++++++++++++++++++++++++-
 tools/bpf/bpftool/main.h   |  3 ++-
 tools/bpf/bpftool/map.c    | 56 --------------------------------------------
 3 files changed, 59 insertions(+), 58 deletions(-)

diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index 465995281dcd..9c620770c6ed 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017 Netronome Systems, Inc.
+ * Copyright (C) 2017-2018 Netronome Systems, Inc.
  *
  * This software is dual licensed under the GNU General License Version 2,
  * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -33,6 +33,7 @@
 
 /* Author: Jakub Kicinski <kubakici@wp.pl> */
 
+#include <ctype.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <fts.h>
@@ -420,6 +421,61 @@ void delete_pinned_obj_table(struct pinned_obj_table *tab)
 	}
 }
 
+unsigned int get_possible_cpus(void)
+{
+	static unsigned int result;
+	char buf[128];
+	long int n;
+	char *ptr;
+	int fd;
+
+	if (result)
+		return result;
+
+	fd = open("/sys/devices/system/cpu/possible", O_RDONLY);
+	if (fd < 0) {
+		p_err("can't open sysfs possible cpus");
+		exit(-1);
+	}
+
+	n = read(fd, buf, sizeof(buf));
+	if (n < 2) {
+		p_err("can't read sysfs possible cpus");
+		exit(-1);
+	}
+	close(fd);
+
+	if (n == sizeof(buf)) {
+		p_err("read sysfs possible cpus overflow");
+		exit(-1);
+	}
+
+	ptr = buf;
+	n = 0;
+	while (*ptr && *ptr != '\n') {
+		unsigned int a, b;
+
+		if (sscanf(ptr, "%u-%u", &a, &b) == 2) {
+			n += b - a + 1;
+
+			ptr = strchr(ptr, '-') + 1;
+		} else if (sscanf(ptr, "%u", &a) == 1) {
+			n++;
+		} else {
+			assert(0);
+		}
+
+		while (isdigit(*ptr))
+			ptr++;
+		if (*ptr == ',')
+			ptr++;
+	}
+
+	result = n;
+
+	return result;
+}
+
 static char *
 ifindex_to_name_ns(__u32 ifindex, __u32 ns_dev, __u32 ns_ino, char *buf)
 {
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index b8e9584d6246..cbf8985da362 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017 Netronome Systems, Inc.
+ * Copyright (C) 2017-2018 Netronome Systems, Inc.
  *
  * This software is dual licensed under the GNU General License Version 2,
  * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -125,6 +125,7 @@ void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes,
 		       const char *arch);
 void print_hex_data_json(uint8_t *data, size_t len);
 
+unsigned int get_possible_cpus(void);
 const char *ifindex_to_bfd_name_ns(__u32 ifindex, __u64 ns_dev, __u64 ns_ino);
 
 #endif
diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index 7da77e4166ec..5efefde5f578 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -34,7 +34,6 @@
 /* Author: Jakub Kicinski <kubakici@wp.pl> */
 
 #include <assert.h>
-#include <ctype.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <stdbool.h>
@@ -69,61 +68,6 @@ static const char * const map_type_name[] = {
 	[BPF_MAP_TYPE_CPUMAP]		= "cpumap",
 };
 
-static unsigned int get_possible_cpus(void)
-{
-	static unsigned int result;
-	char buf[128];
-	long int n;
-	char *ptr;
-	int fd;
-
-	if (result)
-		return result;
-
-	fd = open("/sys/devices/system/cpu/possible", O_RDONLY);
-	if (fd < 0) {
-		p_err("can't open sysfs possible cpus");
-		exit(-1);
-	}
-
-	n = read(fd, buf, sizeof(buf));
-	if (n < 2) {
-		p_err("can't read sysfs possible cpus");
-		exit(-1);
-	}
-	close(fd);
-
-	if (n == sizeof(buf)) {
-		p_err("read sysfs possible cpus overflow");
-		exit(-1);
-	}
-
-	ptr = buf;
-	n = 0;
-	while (*ptr && *ptr != '\n') {
-		unsigned int a, b;
-
-		if (sscanf(ptr, "%u-%u", &a, &b) == 2) {
-			n += b - a + 1;
-
-			ptr = strchr(ptr, '-') + 1;
-		} else if (sscanf(ptr, "%u", &a) == 1) {
-			n++;
-		} else {
-			assert(0);
-		}
-
-		while (isdigit(*ptr))
-			ptr++;
-		if (*ptr == ',')
-			ptr++;
-	}
-
-	result = n;
-
-	return result;
-}
-
 static bool map_is_per_cpu(__u32 type)
 {
 	return type == BPF_MAP_TYPE_PERCPU_HASH ||
-- 
cgit v1.2.3-59-g8ed1b


From f412eed9dfdeeb6becd7de2ffe8b5d0a8b3f81ca Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 3 May 2018 18:37:16 -0700
Subject: tools: bpftool: add simple perf event output reader

Users of BPF sooner or later discover perf_event_output() helpers
and BPF_MAP_TYPE_PERF_EVENT_ARRAY.  Dumping this array type is
not possible, however, we can add simple reading of perf events.
Create a new event_pipe subcommand for maps, this sub command
will only work with BPF_MAP_TYPE_PERF_EVENT_ARRAY maps.

Parts of the code from samples/bpf/trace_output_user.c.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/Documentation/bpftool-map.rst |  29 +-
 tools/bpf/bpftool/Documentation/bpftool.rst     |   2 +-
 tools/bpf/bpftool/Makefile                      |   7 +-
 tools/bpf/bpftool/bash-completion/bpftool       |  36 ++-
 tools/bpf/bpftool/common.c                      |  19 ++
 tools/bpf/bpftool/main.h                        |   4 +
 tools/bpf/bpftool/map.c                         |  19 +-
 tools/bpf/bpftool/map_perf_ring.c               | 347 ++++++++++++++++++++++++
 8 files changed, 444 insertions(+), 19 deletions(-)
 create mode 100644 tools/bpf/bpftool/map_perf_ring.c

diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst
index c3eef8c972cd..a6258bc8ec4f 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
@@ -22,12 +22,13 @@ MAP COMMANDS
 =============
 
 |	**bpftool** **map { show | list }**   [*MAP*]
-|	**bpftool** **map dump**    *MAP*
-|	**bpftool** **map update**  *MAP*  **key** *DATA*   **value** *VALUE* [*UPDATE_FLAGS*]
-|	**bpftool** **map lookup**  *MAP*  **key** *DATA*
-|	**bpftool** **map getnext** *MAP* [**key** *DATA*]
-|	**bpftool** **map delete**  *MAP*  **key** *DATA*
-|	**bpftool** **map pin**     *MAP*  *FILE*
+|	**bpftool** **map dump**       *MAP*
+|	**bpftool** **map update**     *MAP*  **key** *DATA*   **value** *VALUE* [*UPDATE_FLAGS*]
+|	**bpftool** **map lookup**     *MAP*  **key** *DATA*
+|	**bpftool** **map getnext**    *MAP* [**key** *DATA*]
+|	**bpftool** **map delete**     *MAP*  **key** *DATA*
+|	**bpftool** **map pin**        *MAP*  *FILE*
+|	**bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*]
 |	**bpftool** **map help**
 |
 |	*MAP* := { **id** *MAP_ID* | **pinned** *FILE* }
@@ -76,6 +77,22 @@ DESCRIPTION
 
 		  Note: *FILE* must be located in *bpffs* mount.
 
+	**bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*]
+		  Read events from a BPF_MAP_TYPE_PERF_EVENT_ARRAY map.
+
+		  Install perf rings into a perf event array map and dump
+		  output of any bpf_perf_event_output() call in the kernel.
+		  By default read the number of CPUs on the system and
+		  install perf ring for each CPU in the corresponding index
+		  in the array.
+
+		  If **cpu** and **index** are specified, install perf ring
+		  for given **cpu** at **index** in the array (single ring).
+
+		  Note that installing a perf ring into an array will silently
+		  replace any existing ring.  Any other application will stop
+		  receiving events if it installed its rings earlier.
+
 	**bpftool map help**
 		  Print short help message.
 
diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst
index 20689a321ffe..564cb0d9692b 100644
--- a/tools/bpf/bpftool/Documentation/bpftool.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool.rst
@@ -23,7 +23,7 @@ SYNOPSIS
 
 	*MAP-COMMANDS* :=
 	{ **show** | **list** | **dump** | **update** | **lookup** | **getnext** | **delete**
-	| **pin** | **help** }
+	| **pin** | **event_pipe** | **help** }
 
 	*PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin**
 	| **load** | **help** }
diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index 4e69782c4a79..892dbf095bff 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -39,7 +39,12 @@ CC = gcc
 
 CFLAGS += -O2
 CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wshadow -Wno-missing-field-initializers
-CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi -I$(srctree)/tools/include -I$(srctree)/tools/lib/bpf -I$(srctree)/kernel/bpf/
+CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ \
+	-I$(srctree)/kernel/bpf/ \
+	-I$(srctree)/tools/include \
+	-I$(srctree)/tools/include/uapi \
+	-I$(srctree)/tools/lib/bpf \
+	-I$(srctree)/tools/perf
 CFLAGS += -DBPFTOOL_VERSION='"$(BPFTOOL_VERSION)"'
 LIBS = -lelf -lbfd -lopcodes $(LIBBPF)
 
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index 852d84a98acd..b301c9b315f1 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -1,6 +1,6 @@
 # bpftool(8) bash completion                               -*- shell-script -*-
 #
-# Copyright (C) 2017 Netronome Systems, Inc.
+# Copyright (C) 2017-2018 Netronome Systems, Inc.
 #
 # This software is dual licensed under the GNU General License
 # Version 2, June 1991 as shown in the file COPYING in the top-level
@@ -79,6 +79,14 @@ _bpftool_get_map_ids()
         command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) )
 }
 
+_bpftool_get_perf_map_ids()
+{
+    COMPREPLY+=( $( compgen -W "$( bpftool -jp map  2>&1 | \
+        command grep -C2 perf_event_array | \
+        command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) )
+}
+
+
 _bpftool_get_prog_ids()
 {
     COMPREPLY+=( $( compgen -W "$( bpftool -jp prog 2>&1 | \
@@ -359,10 +367,34 @@ _bpftool()
                     fi
                     return 0
                     ;;
+                event_pipe)
+                    case $prev in
+                        $command)
+                            COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) )
+                            return 0
+                            ;;
+                        id)
+                            _bpftool_get_perf_map_ids
+                            return 0
+                            ;;
+                        cpu)
+                            return 0
+                            ;;
+                        index)
+                            return 0
+                            ;;
+                        *)
+                            _bpftool_once_attr 'cpu'
+                            _bpftool_once_attr 'index'
+                            return 0
+                            ;;
+                    esac
+                    ;;
                 *)
                     [[ $prev == $object ]] && \
                         COMPREPLY=( $( compgen -W 'delete dump getnext help \
-                            lookup pin show list update' -- "$cur" ) )
+                            lookup pin event_pipe show list update' -- \
+                            "$cur" ) )
                     ;;
             esac
             ;;
diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index 9c620770c6ed..32f9e397a6c0 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -331,6 +331,16 @@ char *get_fdinfo(int fd, const char *key)
 	return NULL;
 }
 
+void print_data_json(uint8_t *data, size_t len)
+{
+	unsigned int i;
+
+	jsonw_start_array(json_wtr);
+	for (i = 0; i < len; i++)
+		jsonw_printf(json_wtr, "%d", data[i]);
+	jsonw_end_array(json_wtr);
+}
+
 void print_hex_data_json(uint8_t *data, size_t len)
 {
 	unsigned int i;
@@ -421,6 +431,15 @@ void delete_pinned_obj_table(struct pinned_obj_table *tab)
 	}
 }
 
+unsigned int get_page_size(void)
+{
+	static int result;
+
+	if (!result)
+		result = getpagesize();
+	return result;
+}
+
 unsigned int get_possible_cpus(void)
 {
 	static unsigned int result;
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index cbf8985da362..6173cd997e7a 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -117,14 +117,18 @@ int do_pin_fd(int fd, const char *name);
 
 int do_prog(int argc, char **arg);
 int do_map(int argc, char **arg);
+int do_event_pipe(int argc, char **argv);
 int do_cgroup(int argc, char **arg);
 
 int prog_parse_fd(int *argc, char ***argv);
+int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len);
 
 void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes,
 		       const char *arch);
+void print_data_json(uint8_t *data, size_t len);
 void print_hex_data_json(uint8_t *data, size_t len);
 
+unsigned int get_page_size(void);
 unsigned int get_possible_cpus(void);
 const char *ifindex_to_bfd_name_ns(__u32 ifindex, __u64 ns_dev, __u64 ns_ino);
 
diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index 5efefde5f578..af6766e956ba 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -130,8 +130,7 @@ static int map_parse_fd(int *argc, char ***argv)
 	return -1;
 }
 
-static int
-map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len)
+int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len)
 {
 	int err;
 	int fd;
@@ -817,12 +816,13 @@ static int do_help(int argc, char **argv)
 
 	fprintf(stderr,
 		"Usage: %s %s { show | list }   [MAP]\n"
-		"       %s %s dump    MAP\n"
-		"       %s %s update  MAP  key DATA value VALUE [UPDATE_FLAGS]\n"
-		"       %s %s lookup  MAP  key DATA\n"
-		"       %s %s getnext MAP [key DATA]\n"
-		"       %s %s delete  MAP  key DATA\n"
-		"       %s %s pin     MAP  FILE\n"
+		"       %s %s dump       MAP\n"
+		"       %s %s update     MAP  key DATA value VALUE [UPDATE_FLAGS]\n"
+		"       %s %s lookup     MAP  key DATA\n"
+		"       %s %s getnext    MAP [key DATA]\n"
+		"       %s %s delete     MAP  key DATA\n"
+		"       %s %s pin        MAP  FILE\n"
+		"       %s %s event_pipe MAP [cpu N index M]\n"
 		"       %s %s help\n"
 		"\n"
 		"       MAP := { id MAP_ID | pinned FILE }\n"
@@ -834,7 +834,7 @@ static int do_help(int argc, char **argv)
 		"",
 		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
 		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
-		bin_name, argv[-2], bin_name, argv[-2]);
+		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2]);
 
 	return 0;
 }
@@ -849,6 +849,7 @@ static const struct cmd cmds[] = {
 	{ "getnext",	do_getnext },
 	{ "delete",	do_delete },
 	{ "pin",	do_pin },
+	{ "event_pipe",	do_event_pipe },
 	{ 0 }
 };
 
diff --git a/tools/bpf/bpftool/map_perf_ring.c b/tools/bpf/bpftool/map_perf_ring.c
new file mode 100644
index 000000000000..c5a2ced8552d
--- /dev/null
+++ b/tools/bpf/bpftool/map_perf_ring.c
@@ -0,0 +1,347 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2018 Netronome Systems, Inc. */
+/* This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <errno.h>
+#include <fcntl.h>
+#include <libbpf.h>
+#include <poll.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include <linux/perf_event.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+
+#include <bpf.h>
+#include <perf-sys.h>
+
+#include "main.h"
+
+#define MMAP_PAGE_CNT	16
+
+static bool stop;
+
+struct event_ring_info {
+	int fd;
+	int key;
+	unsigned int cpu;
+	void *mem;
+};
+
+struct perf_event_sample {
+	struct perf_event_header header;
+	__u32 size;
+	unsigned char data[];
+};
+
+static void int_exit(int signo)
+{
+	fprintf(stderr, "Stopping...\n");
+	stop = true;
+}
+
+static void
+print_bpf_output(struct event_ring_info *ring, struct perf_event_sample *e)
+{
+	struct {
+		struct perf_event_header header;
+		__u64 id;
+		__u64 lost;
+	} *lost = (void *)e;
+	struct timespec ts;
+
+	if (clock_gettime(CLOCK_MONOTONIC, &ts)) {
+		perror("Can't read clock for timestamp");
+		return;
+	}
+
+	if (json_output) {
+		jsonw_start_object(json_wtr);
+		jsonw_name(json_wtr, "timestamp");
+		jsonw_uint(json_wtr, ts.tv_sec * 1000000000ull + ts.tv_nsec);
+		jsonw_name(json_wtr, "type");
+		jsonw_uint(json_wtr, e->header.type);
+		jsonw_name(json_wtr, "cpu");
+		jsonw_uint(json_wtr, ring->cpu);
+		jsonw_name(json_wtr, "index");
+		jsonw_uint(json_wtr, ring->key);
+		if (e->header.type == PERF_RECORD_SAMPLE) {
+			jsonw_name(json_wtr, "data");
+			print_data_json(e->data, e->size);
+		} else if (e->header.type == PERF_RECORD_LOST) {
+			jsonw_name(json_wtr, "lost");
+			jsonw_start_object(json_wtr);
+			jsonw_name(json_wtr, "id");
+			jsonw_uint(json_wtr, lost->id);
+			jsonw_name(json_wtr, "count");
+			jsonw_uint(json_wtr, lost->lost);
+			jsonw_end_object(json_wtr);
+		}
+		jsonw_end_object(json_wtr);
+	} else {
+		if (e->header.type == PERF_RECORD_SAMPLE) {
+			printf("== @%ld.%ld CPU: %d index: %d =====\n",
+			       (long)ts.tv_sec, ts.tv_nsec,
+			       ring->cpu, ring->key);
+			fprint_hex(stdout, e->data, e->size, " ");
+			printf("\n");
+		} else if (e->header.type == PERF_RECORD_LOST) {
+			printf("lost %lld events\n", lost->lost);
+		} else {
+			printf("unknown event type=%d size=%d\n",
+			       e->header.type, e->header.size);
+		}
+	}
+}
+
+static void
+perf_event_read(struct event_ring_info *ring, void **buf, size_t *buf_len)
+{
+	volatile struct perf_event_mmap_page *header = ring->mem;
+	__u64 buffer_size = MMAP_PAGE_CNT * get_page_size();
+	__u64 data_tail = header->data_tail;
+	__u64 data_head = header->data_head;
+	void *base, *begin, *end;
+
+	asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
+	if (data_head == data_tail)
+		return;
+
+	base = ((char *)header) + get_page_size();
+
+	begin = base + data_tail % buffer_size;
+	end = base + data_head % buffer_size;
+
+	while (begin != end) {
+		struct perf_event_sample *e;
+
+		e = begin;
+		if (begin + e->header.size > base + buffer_size) {
+			long len = base + buffer_size - begin;
+
+			if (*buf_len < e->header.size) {
+				free(*buf);
+				*buf = malloc(e->header.size);
+				if (!*buf) {
+					fprintf(stderr,
+						"can't allocate memory");
+					stop = true;
+					return;
+				}
+				*buf_len = e->header.size;
+			}
+
+			memcpy(*buf, begin, len);
+			memcpy(*buf + len, base, e->header.size - len);
+			e = (void *)*buf;
+			begin = base + e->header.size - len;
+		} else if (begin + e->header.size == base + buffer_size) {
+			begin = base;
+		} else {
+			begin += e->header.size;
+		}
+
+		print_bpf_output(ring, e);
+	}
+
+	__sync_synchronize(); /* smp_mb() */
+	header->data_tail = data_head;
+}
+
+static int perf_mmap_size(void)
+{
+	return get_page_size() * (MMAP_PAGE_CNT + 1);
+}
+
+static void *perf_event_mmap(int fd)
+{
+	int mmap_size = perf_mmap_size();
+	void *base;
+
+	base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if (base == MAP_FAILED) {
+		p_err("event mmap failed: %s\n", strerror(errno));
+		return NULL;
+	}
+
+	return base;
+}
+
+static void perf_event_unmap(void *mem)
+{
+	if (munmap(mem, perf_mmap_size()))
+		fprintf(stderr, "Can't unmap ring memory!\n");
+}
+
+static int bpf_perf_event_open(int map_fd, int key, int cpu)
+{
+	struct perf_event_attr attr = {
+		.sample_type = PERF_SAMPLE_RAW,
+		.type = PERF_TYPE_SOFTWARE,
+		.config = PERF_COUNT_SW_BPF_OUTPUT,
+	};
+	int pmu_fd;
+
+	pmu_fd = sys_perf_event_open(&attr, -1, cpu, -1, 0);
+	if (pmu_fd < 0) {
+		p_err("failed to open perf event %d for CPU %d", key, cpu);
+		return -1;
+	}
+
+	if (bpf_map_update_elem(map_fd, &key, &pmu_fd, BPF_ANY)) {
+		p_err("failed to update map for event %d for CPU %d", key, cpu);
+		goto err_close;
+	}
+	if (ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0)) {
+		p_err("failed to enable event %d for CPU %d", key, cpu);
+		goto err_close;
+	}
+
+	return pmu_fd;
+
+err_close:
+	close(pmu_fd);
+	return -1;
+}
+
+int do_event_pipe(int argc, char **argv)
+{
+	int i, nfds, map_fd, index = -1, cpu = -1;
+	struct bpf_map_info map_info = {};
+	struct event_ring_info *rings;
+	size_t tmp_buf_sz = 0;
+	void *tmp_buf = NULL;
+	struct pollfd *pfds;
+	__u32 map_info_len;
+	bool do_all = true;
+
+	map_info_len = sizeof(map_info);
+	map_fd = map_parse_fd_and_info(&argc, &argv, &map_info, &map_info_len);
+	if (map_fd < 0)
+		return -1;
+
+	if (map_info.type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) {
+		p_err("map is not a perf event array");
+		goto err_close_map;
+	}
+
+	while (argc) {
+		if (argc < 2)
+			BAD_ARG();
+
+		if (is_prefix(*argv, "cpu")) {
+			char *endptr;
+
+			NEXT_ARG();
+			cpu = strtoul(*argv, &endptr, 0);
+			if (*endptr) {
+				p_err("can't parse %s as CPU ID", **argv);
+				goto err_close_map;
+			}
+
+			NEXT_ARG();
+		} else if (is_prefix(*argv, "index")) {
+			char *endptr;
+
+			NEXT_ARG();
+			index = strtoul(*argv, &endptr, 0);
+			if (*endptr) {
+				p_err("can't parse %s as index", **argv);
+				goto err_close_map;
+			}
+
+			NEXT_ARG();
+		} else {
+			BAD_ARG();
+		}
+
+		do_all = false;
+	}
+
+	if (!do_all) {
+		if (index == -1 || cpu == -1) {
+			p_err("cpu and index must be specified together");
+			goto err_close_map;
+		}
+
+		nfds = 1;
+	} else {
+		nfds = min(get_possible_cpus(), map_info.max_entries);
+		cpu = 0;
+		index = 0;
+	}
+
+	rings = calloc(nfds, sizeof(rings[0]));
+	if (!rings)
+		goto err_close_map;
+
+	pfds = calloc(nfds, sizeof(pfds[0]));
+	if (!pfds)
+		goto err_free_rings;
+
+	for (i = 0; i < nfds; i++) {
+		rings[i].cpu = cpu + i;
+		rings[i].key = index + i;
+
+		rings[i].fd = bpf_perf_event_open(map_fd, rings[i].key,
+						  rings[i].cpu);
+		if (rings[i].fd < 0)
+			goto err_close_fds_prev;
+
+		rings[i].mem = perf_event_mmap(rings[i].fd);
+		if (!rings[i].mem)
+			goto err_close_fds_current;
+
+		pfds[i].fd = rings[i].fd;
+		pfds[i].events = POLLIN;
+	}
+
+	signal(SIGINT, int_exit);
+	signal(SIGHUP, int_exit);
+	signal(SIGTERM, int_exit);
+
+	if (json_output)
+		jsonw_start_array(json_wtr);
+
+	while (!stop) {
+		poll(pfds, nfds, 200);
+		for (i = 0; i < nfds; i++)
+			perf_event_read(&rings[i], &tmp_buf, &tmp_buf_sz);
+	}
+	free(tmp_buf);
+
+	if (json_output)
+		jsonw_end_array(json_wtr);
+
+	for (i = 0; i < nfds; i++) {
+		perf_event_unmap(rings[i].mem);
+		close(rings[i].fd);
+	}
+	free(pfds);
+	free(rings);
+	close(map_fd);
+
+	return 0;
+
+err_close_fds_prev:
+	while (i--) {
+		perf_event_unmap(rings[i].mem);
+err_close_fds_current:
+		close(rings[i].fd);
+	}
+	free(pfds);
+err_free_rings:
+	free(rings);
+err_close_map:
+	close(map_fd);
+	return -1;
+}
-- 
cgit v1.2.3-59-g8ed1b


From ab7f5bf0928be2f148d000a6eaa6c0a36e74750e Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 3 May 2018 18:37:17 -0700
Subject: bpf: fix references to free_bpf_prog_info() in comments

Comments in the verifier refer to free_bpf_prog_info() which
seems to have never existed in tree.  Replace it with
free_used_maps().

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/verifier.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ccac552ac861..d5e1a6c4165d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5055,7 +5055,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
 			/* hold the map. If the program is rejected by verifier,
 			 * the map will be released by release_maps() or it
 			 * will be used by the valid program until it's unloaded
-			 * and all maps are released in free_bpf_prog_info()
+			 * and all maps are released in free_used_maps()
 			 */
 			map = bpf_map_inc(map, false);
 			if (IS_ERR(map)) {
@@ -5821,7 +5821,7 @@ skip_full_check:
 err_release_maps:
 	if (!env->prog->aux->used_maps)
 		/* if we didn't copy map pointers into bpf_prog_info, release
-		 * them now. Otherwise free_bpf_prog_info() will release them.
+		 * them now. Otherwise free_used_maps() will release them.
 		 */
 		release_maps(env);
 	*prog = env->prog;
-- 
cgit v1.2.3-59-g8ed1b


From e94fa1d93117e7f1eb783dc9cae6c70650944449 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 May 2018 16:27:53 +0200
Subject: bpf, xskmap: fix crash in xsk_map_alloc error path handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If bpf_map_precharge_memlock() did not fail, then we set err to zero.
However, any subsequent failure from either alloc_percpu() or the
bpf_map_area_alloc() will return ERR_PTR(0) which in find_and_alloc_map()
will cause NULL pointer deref.

In devmap we have the convention that we return -EINVAL on page count
overflow, so keep the same logic here and just set err to -ENOMEM
after successful bpf_map_precharge_memlock().

Fixes: fbfc504a24f5 ("bpf: introduce new bpf AF_XDP map type BPF_MAP_TYPE_XSKMAP")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Björn Töpel <bjorn.topel@intel.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/xskmap.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 869dbb11b612..cb3a12137404 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -56,6 +56,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 	if (err)
 		goto free_m;
 
+	err = -ENOMEM;
+
 	m->flush_list = alloc_percpu(struct list_head);
 	if (!m->flush_list)
 		goto free_m;
-- 
cgit v1.2.3-59-g8ed1b


From 8fb11a9a8d51df9a314a6d970436963c127ff1bd Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 4 May 2018 13:54:24 -0700
Subject: net/ipv6: rename rt6_next to fib6_next

This slipped through the cracks in the followup set to the fib6_info flip.
Rename rt6_next to fib6_next.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  6 +++---
 net/ipv6/ip6_fib.c    | 26 +++++++++++++-------------
 net/ipv6/route.c      | 12 ++++++------
 3 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 1af450d4e923..a3ec08d05756 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -135,7 +135,7 @@ struct fib6_nh {
 
 struct fib6_info {
 	struct fib6_table		*fib6_table;
-	struct fib6_info __rcu		*rt6_next;
+	struct fib6_info __rcu		*fib6_next;
 	struct fib6_node __rcu		*fib6_node;
 
 	/* Multipath routes:
@@ -192,11 +192,11 @@ struct rt6_info {
 
 #define for_each_fib6_node_rt_rcu(fn)					\
 	for (rt = rcu_dereference((fn)->leaf); rt;			\
-	     rt = rcu_dereference(rt->rt6_next))
+	     rt = rcu_dereference(rt->fib6_next))
 
 #define for_each_fib6_walker_rt(w)					\
 	for (rt = (w)->leaf; rt;					\
-	     rt = rcu_dereference_protected(rt->rt6_next, 1))
+	     rt = rcu_dereference_protected(rt->fib6_next, 1))
 
 static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst)
 {
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 6421c893466e..f0a4262a4789 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -945,7 +945,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
 	ins = &fn->leaf;
 
 	for (iter = leaf; iter;
-	     iter = rcu_dereference_protected(iter->rt6_next,
+	     iter = rcu_dereference_protected(iter->fib6_next,
 				lockdep_is_held(&rt->fib6_table->tb6_lock))) {
 		/*
 		 *	Search for duplicates
@@ -1002,7 +1002,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
 			break;
 
 next_iter:
-		ins = &iter->rt6_next;
+		ins = &iter->fib6_next;
 	}
 
 	if (fallback_ins && !found) {
@@ -1031,7 +1031,7 @@ next_iter:
 					      &sibling->fib6_siblings);
 				break;
 			}
-			sibling = rcu_dereference_protected(sibling->rt6_next,
+			sibling = rcu_dereference_protected(sibling->fib6_next,
 				    lockdep_is_held(&rt->fib6_table->tb6_lock));
 		}
 		/* For each sibling in the list, increment the counter of
@@ -1065,7 +1065,7 @@ add:
 		if (err)
 			return err;
 
-		rcu_assign_pointer(rt->rt6_next, iter);
+		rcu_assign_pointer(rt->fib6_next, iter);
 		atomic_inc(&rt->fib6_ref);
 		rcu_assign_pointer(rt->fib6_node, fn);
 		rcu_assign_pointer(*ins, rt);
@@ -1096,7 +1096,7 @@ add:
 
 		atomic_inc(&rt->fib6_ref);
 		rcu_assign_pointer(rt->fib6_node, fn);
-		rt->rt6_next = iter->rt6_next;
+		rt->fib6_next = iter->fib6_next;
 		rcu_assign_pointer(*ins, rt);
 		if (!info->skip_notify)
 			inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
@@ -1113,14 +1113,14 @@ add:
 
 		if (nsiblings) {
 			/* Replacing an ECMP route, remove all siblings */
-			ins = &rt->rt6_next;
+			ins = &rt->fib6_next;
 			iter = rcu_dereference_protected(*ins,
 				    lockdep_is_held(&rt->fib6_table->tb6_lock));
 			while (iter) {
 				if (iter->fib6_metric > rt->fib6_metric)
 					break;
 				if (rt6_qualify_for_ecmp(iter)) {
-					*ins = iter->rt6_next;
+					*ins = iter->fib6_next;
 					iter->fib6_node = NULL;
 					fib6_purge_rt(iter, fn, info->nl_net);
 					if (rcu_access_pointer(fn->rr_ptr) == iter)
@@ -1129,7 +1129,7 @@ add:
 					nsiblings--;
 					info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
 				} else {
-					ins = &iter->rt6_next;
+					ins = &iter->fib6_next;
 				}
 				iter = rcu_dereference_protected(*ins,
 					lockdep_is_held(&rt->fib6_table->tb6_lock));
@@ -1712,7 +1712,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
 	RT6_TRACE("fib6_del_route\n");
 
 	/* Unlink it */
-	*rtp = rt->rt6_next;
+	*rtp = rt->fib6_next;
 	rt->fib6_node = NULL;
 	net->ipv6.rt6_stats->fib_rt_entries--;
 	net->ipv6.rt6_stats->fib_discarded_routes++;
@@ -1741,7 +1741,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
 	FOR_WALKERS(net, w) {
 		if (w->state == FWS_C && w->leaf == rt) {
 			RT6_TRACE("walker %p adjusted by delroute\n", w);
-			w->leaf = rcu_dereference_protected(rt->rt6_next,
+			w->leaf = rcu_dereference_protected(rt->fib6_next,
 					    lockdep_is_held(&table->tb6_lock));
 			if (!w->leaf)
 				w->state = FWS_U;
@@ -1795,7 +1795,7 @@ int fib6_del(struct fib6_info *rt, struct nl_info *info)
 			fib6_del_route(table, fn, rtp, info);
 			return 0;
 		}
-		rtp_next = &cur->rt6_next;
+		rtp_next = &cur->fib6_next;
 	}
 	return -ENOENT;
 }
@@ -2279,7 +2279,7 @@ static int ipv6_route_yield(struct fib6_walker *w)
 
 	do {
 		iter->w.leaf = rcu_dereference_protected(
-				iter->w.leaf->rt6_next,
+				iter->w.leaf->fib6_next,
 				lockdep_is_held(&iter->tbl->tb6_lock));
 		iter->skip--;
 		if (!iter->skip && iter->w.leaf)
@@ -2345,7 +2345,7 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	if (!v)
 		goto iter_table;
 
-	n = rcu_dereference_bh(((struct fib6_info *)v)->rt6_next);
+	n = rcu_dereference_bh(((struct fib6_info *)v)->fib6_next);
 	if (n) {
 		++*pos;
 		return n;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 8ed1b519c2b0..daa3662da0ee 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -468,7 +468,7 @@ static inline struct fib6_info *rt6_device_match(struct net *net,
 	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
 		return rt;
 
-	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
+	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
 		const struct net_device *dev = sprt->fib6_nh.nh_dev;
 
 		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
@@ -696,7 +696,7 @@ static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
 
 	match = NULL;
 	cont = NULL;
-	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
+	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
 		if (rt->fib6_metric != metric) {
 			cont = rt;
 			break;
@@ -706,7 +706,7 @@ static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
 	}
 
 	for (rt = leaf; rt && rt != rr_head;
-	     rt = rcu_dereference(rt->rt6_next)) {
+	     rt = rcu_dereference(rt->fib6_next)) {
 		if (rt->fib6_metric != metric) {
 			cont = rt;
 			break;
@@ -718,7 +718,7 @@ static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
 	if (match || !cont)
 		return match;
 
-	for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
+	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
 
 	return match;
@@ -756,7 +756,7 @@ static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
 			     &do_rr);
 
 	if (do_rr) {
-		struct fib6_info *next = rcu_dereference(rt0->rt6_next);
+		struct fib6_info *next = rcu_dereference(rt0->fib6_next);
 
 		/* no entries matched; do round-robin */
 		if (!next || next->fib6_metric != rt0->fib6_metric)
@@ -3781,7 +3781,7 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
 		if (iter->fib6_metric == rt->fib6_metric &&
 		    rt6_qualify_for_ecmp(iter))
 			return iter;
-		iter = rcu_dereference_protected(iter->rt6_next,
+		iter = rcu_dereference_protected(iter->fib6_next,
 				lockdep_is_held(&rt->fib6_table->tb6_lock));
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From d734a2888922efa521f9eb8dfe6790ced8f62bb7 Mon Sep 17 00:00:00 2001
From: Laura Garcia Liebana <nevola@gmail.com>
Date: Sun, 22 Apr 2018 11:03:23 +0200
Subject: netfilter: nft_numgen: add map lookups for numgen statements

This patch includes a new attribute in the numgen structure to allow
the lookup of an element based on the number generator as a key.

For this purpose, different ops have been included to extend the
current numgen inc functions.

Currently, only supported for numgen incremental operations, but
it will be supported for random in a follow-up patch.

Signed-off-by: Laura Garcia Liebana <nevola@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  4 ++
 net/netfilter/nft_numgen.c               | 85 ++++++++++++++++++++++++++++++--
 2 files changed, 84 insertions(+), 5 deletions(-)

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 6a3d653d5b27..5a5551a580f7 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1450,6 +1450,8 @@ enum nft_trace_types {
  * @NFTA_NG_MODULUS: maximum counter value (NLA_U32)
  * @NFTA_NG_TYPE: operation type (NLA_U32)
  * @NFTA_NG_OFFSET: offset to be added to the counter (NLA_U32)
+ * @NFTA_NG_SET_NAME: name of the map to lookup (NLA_STRING)
+ * @NFTA_NG_SET_ID: id of the map (NLA_U32)
  */
 enum nft_ng_attributes {
 	NFTA_NG_UNSPEC,
@@ -1457,6 +1459,8 @@ enum nft_ng_attributes {
 	NFTA_NG_MODULUS,
 	NFTA_NG_TYPE,
 	NFTA_NG_OFFSET,
+	NFTA_NG_SET_NAME,
+	NFTA_NG_SET_ID,
 	__NFTA_NG_MAX
 };
 #define NFTA_NG_MAX	(__NFTA_NG_MAX - 1)
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index 5a3a52c71545..8a64db8f2e69 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -24,13 +24,11 @@ struct nft_ng_inc {
 	u32			modulus;
 	atomic_t		counter;
 	u32			offset;
+	struct nft_set		*map;
 };
 
-static void nft_ng_inc_eval(const struct nft_expr *expr,
-			    struct nft_regs *regs,
-			    const struct nft_pktinfo *pkt)
+static u32 nft_ng_inc_gen(struct nft_ng_inc *priv)
 {
-	struct nft_ng_inc *priv = nft_expr_priv(expr);
 	u32 nval, oval;
 
 	do {
@@ -38,7 +36,36 @@ static void nft_ng_inc_eval(const struct nft_expr *expr,
 		nval = (oval + 1 < priv->modulus) ? oval + 1 : 0;
 	} while (atomic_cmpxchg(&priv->counter, oval, nval) != oval);
 
-	regs->data[priv->dreg] = nval + priv->offset;
+	return nval + priv->offset;
+}
+
+static void nft_ng_inc_eval(const struct nft_expr *expr,
+			    struct nft_regs *regs,
+			    const struct nft_pktinfo *pkt)
+{
+	struct nft_ng_inc *priv = nft_expr_priv(expr);
+
+	regs->data[priv->dreg] = nft_ng_inc_gen(priv);
+}
+
+static void nft_ng_inc_map_eval(const struct nft_expr *expr,
+				struct nft_regs *regs,
+				const struct nft_pktinfo *pkt)
+{
+	struct nft_ng_inc *priv = nft_expr_priv(expr);
+	const struct nft_set *map = priv->map;
+	const struct nft_set_ext *ext;
+	u32 result;
+	bool found;
+
+	result = nft_ng_inc_gen(priv);
+	found = map->ops->lookup(nft_net(pkt), map, &result, &ext);
+
+	if (!found)
+		return;
+
+	nft_data_copy(&regs->data[priv->dreg],
+		      nft_set_ext_data(ext), map->dlen);
 }
 
 static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = {
@@ -46,6 +73,9 @@ static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = {
 	[NFTA_NG_MODULUS]	= { .type = NLA_U32 },
 	[NFTA_NG_TYPE]		= { .type = NLA_U32 },
 	[NFTA_NG_OFFSET]	= { .type = NLA_U32 },
+	[NFTA_NG_SET_NAME]	= { .type = NLA_STRING,
+				    .len = NFT_SET_MAXNAMELEN - 1 },
+	[NFTA_NG_SET_ID]	= { .type = NLA_U32 },
 };
 
 static int nft_ng_inc_init(const struct nft_ctx *ctx,
@@ -71,6 +101,25 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
 					   NFT_DATA_VALUE, sizeof(u32));
 }
 
+static int nft_ng_inc_map_init(const struct nft_ctx *ctx,
+			       const struct nft_expr *expr,
+			       const struct nlattr * const tb[])
+{
+	struct nft_ng_inc *priv = nft_expr_priv(expr);
+	u8 genmask = nft_genmask_next(ctx->net);
+
+	nft_ng_inc_init(ctx, expr, tb);
+
+	priv->map = nft_set_lookup_global(ctx->net, ctx->table,
+					  tb[NFTA_NG_SET_NAME],
+					  tb[NFTA_NG_SET_ID], genmask);
+
+	if (IS_ERR(priv->map))
+		return PTR_ERR(priv->map);
+
+	return 0;
+}
+
 static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg,
 		       u32 modulus, enum nft_ng_types type, u32 offset)
 {
@@ -97,6 +146,22 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr)
 			   priv->offset);
 }
 
+static int nft_ng_inc_map_dump(struct sk_buff *skb,
+			       const struct nft_expr *expr)
+{
+	const struct nft_ng_inc *priv = nft_expr_priv(expr);
+
+	if (nft_ng_dump(skb, priv->dreg, priv->modulus,
+			NFT_NG_INCREMENTAL, priv->offset) ||
+	    nla_put_string(skb, NFTA_NG_SET_NAME, priv->map->name))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
 struct nft_ng_random {
 	enum nft_registers      dreg:8;
 	u32			modulus;
@@ -156,6 +221,14 @@ static const struct nft_expr_ops nft_ng_inc_ops = {
 	.dump		= nft_ng_inc_dump,
 };
 
+static const struct nft_expr_ops nft_ng_inc_map_ops = {
+	.type		= &nft_ng_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_ng_inc)),
+	.eval		= nft_ng_inc_map_eval,
+	.init		= nft_ng_inc_map_init,
+	.dump		= nft_ng_inc_map_dump,
+};
+
 static const struct nft_expr_ops nft_ng_random_ops = {
 	.type		= &nft_ng_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_ng_random)),
@@ -178,6 +251,8 @@ nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
 
 	switch (type) {
 	case NFT_NG_INCREMENTAL:
+		if (tb[NFTA_NG_SET_NAME])
+			return &nft_ng_inc_map_ops;
 		return &nft_ng_inc_ops;
 	case NFT_NG_RANDOM:
 		return &nft_ng_random_ops;
-- 
cgit v1.2.3-59-g8ed1b


From 75e72f05418afbeac0a707d2e0cc9451ceb47e43 Mon Sep 17 00:00:00 2001
From: Laura Garcia Liebana <nevola@gmail.com>
Date: Mon, 23 Apr 2018 12:48:07 +0200
Subject: netfilter: nft_numgen: enable hashing of one element

The modulus in the hash function was limited to > 1 as initially
there was no sense to create a hashing of just one element.

Nevertheless, there are certain cases specially for load balancing
where this case needs to be addressed.

This patch fixes the following error.

Error: Could not process rule: Numerical result out of range
add rule ip nftlb lb01 dnat to jhash ip saddr mod 1 map { 0: 192.168.0.10 }
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The solution comes to force the hash to 0 when the modulus is 1.

Signed-off-by: Laura Garcia Liebana <nevola@gmail.com>
---
 net/netfilter/nft_hash.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index 24f2f7567ddb..e235c17f1b8b 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -97,7 +97,7 @@ static int nft_jhash_init(const struct nft_ctx *ctx,
 	priv->len = len;
 
 	priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS]));
-	if (priv->modulus <= 1)
+	if (priv->modulus < 1)
 		return -ERANGE;
 
 	if (priv->offset + priv->modulus - 1 < priv->offset)
-- 
cgit v1.2.3-59-g8ed1b


From c1c7e44b4f62e9c07fea8aaa58ed46cfae48ba3d Mon Sep 17 00:00:00 2001
From: Ahmed Abdelsalam <amsalam20@gmail.com>
Date: Wed, 25 Apr 2018 05:30:24 -0500
Subject: netfilter: ip6t_srh: extend SRH matching for previous, next and last
 SID

IPv6 Segment Routing Header (SRH) contains a list of SIDs to be crossed
by SR encapsulated packet. Each SID is encoded as an IPv6 prefix.

When a Firewall receives an SR encapsulated packet, it should be able
to identify which node previously processed the packet (previous SID),
which node is going to process the packet next (next SID), and which
node is the last to process the packet (last SID) which represent the
final destination of the packet in case of inline SR mode.

An example use-case of using these features could be SID list that
includes two firewalls. When the second firewall receives a packet,
it can check whether the packet has been processed by the first firewall
or not. Based on that check, it decides to apply all rules, apply just
subset of the rules, or totally skip all rules and forward the packet to
the next SID.

This patch extends SRH match to support matching previous SID, next SID,
and last SID.

Signed-off-by: Ahmed Abdelsalam <amsalam20@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter_ipv6/ip6t_srh.h |  43 ++++++-
 net/ipv6/netfilter/ip6t_srh.c                | 173 +++++++++++++++++++++++++--
 2 files changed, 205 insertions(+), 11 deletions(-)

diff --git a/include/uapi/linux/netfilter_ipv6/ip6t_srh.h b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h
index f3cc0ef514a7..54ed83360dac 100644
--- a/include/uapi/linux/netfilter_ipv6/ip6t_srh.h
+++ b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h
@@ -17,7 +17,10 @@
 #define IP6T_SRH_LAST_GT        0x0100
 #define IP6T_SRH_LAST_LT        0x0200
 #define IP6T_SRH_TAG            0x0400
-#define IP6T_SRH_MASK           0x07FF
+#define IP6T_SRH_PSID           0x0800
+#define IP6T_SRH_NSID           0x1000
+#define IP6T_SRH_LSID           0x2000
+#define IP6T_SRH_MASK           0x3FFF
 
 /* Values for "mt_invflags" field in struct ip6t_srh */
 #define IP6T_SRH_INV_NEXTHDR    0x0001
@@ -31,7 +34,10 @@
 #define IP6T_SRH_INV_LAST_GT    0x0100
 #define IP6T_SRH_INV_LAST_LT    0x0200
 #define IP6T_SRH_INV_TAG        0x0400
-#define IP6T_SRH_INV_MASK       0x07FF
+#define IP6T_SRH_INV_PSID       0x0800
+#define IP6T_SRH_INV_NSID       0x1000
+#define IP6T_SRH_INV_LSID       0x2000
+#define IP6T_SRH_INV_MASK       0x3FFF
 
 /**
  *      struct ip6t_srh - SRH match options
@@ -54,4 +60,37 @@ struct ip6t_srh {
 	__u16                   mt_invflags;
 };
 
+/**
+ *      struct ip6t_srh1 - SRH match options (revision 1)
+ *      @ next_hdr: Next header field of SRH
+ *      @ hdr_len: Extension header length field of SRH
+ *      @ segs_left: Segments left field of SRH
+ *      @ last_entry: Last entry field of SRH
+ *      @ tag: Tag field of SRH
+ *      @ psid_addr: Address of previous SID in SRH SID list
+ *      @ nsid_addr: Address of NEXT SID in SRH SID list
+ *      @ lsid_addr: Address of LAST SID in SRH SID list
+ *      @ psid_msk: Mask of previous SID in SRH SID list
+ *      @ nsid_msk: Mask of next SID in SRH SID list
+ *      @ lsid_msk: MAsk of last SID in SRH SID list
+ *      @ mt_flags: match options
+ *      @ mt_invflags: Invert the sense of match options
+ */
+
+struct ip6t_srh1 {
+	__u8                    next_hdr;
+	__u8                    hdr_len;
+	__u8                    segs_left;
+	__u8                    last_entry;
+	__u16                   tag;
+	struct in6_addr         psid_addr;
+	struct in6_addr         nsid_addr;
+	struct in6_addr         lsid_addr;
+	struct in6_addr         psid_msk;
+	struct in6_addr         nsid_msk;
+	struct in6_addr         lsid_msk;
+	__u16                   mt_flags;
+	__u16                   mt_invflags;
+};
+
 #endif /*_IP6T_SRH_H*/
diff --git a/net/ipv6/netfilter/ip6t_srh.c b/net/ipv6/netfilter/ip6t_srh.c
index 33719d5560c8..1059894a6f4c 100644
--- a/net/ipv6/netfilter/ip6t_srh.c
+++ b/net/ipv6/netfilter/ip6t_srh.c
@@ -117,6 +117,130 @@ static bool srh_mt6(const struct sk_buff *skb, struct xt_action_param *par)
 	return true;
 }
 
+static bool srh1_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	int hdrlen, psidoff, nsidoff, lsidoff, srhoff = 0;
+	const struct ip6t_srh1 *srhinfo = par->matchinfo;
+	struct in6_addr *psid, *nsid, *lsid;
+	struct in6_addr _psid, _nsid, _lsid;
+	struct ipv6_sr_hdr *srh;
+	struct ipv6_sr_hdr _srh;
+
+	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+		return false;
+	srh = skb_header_pointer(skb, srhoff, sizeof(_srh), &_srh);
+	if (!srh)
+		return false;
+
+	hdrlen = ipv6_optlen(srh);
+	if (skb->len - srhoff < hdrlen)
+		return false;
+
+	if (srh->type != IPV6_SRCRT_TYPE_4)
+		return false;
+
+	if (srh->segments_left > srh->first_segment)
+		return false;
+
+	/* Next Header matching */
+	if (srhinfo->mt_flags & IP6T_SRH_NEXTHDR)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NEXTHDR,
+				!(srh->nexthdr == srhinfo->next_hdr)))
+			return false;
+
+	/* Header Extension Length matching */
+	if (srhinfo->mt_flags & IP6T_SRH_LEN_EQ)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_EQ,
+				!(srh->hdrlen == srhinfo->hdr_len)))
+			return false;
+	if (srhinfo->mt_flags & IP6T_SRH_LEN_GT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_GT,
+				!(srh->hdrlen > srhinfo->hdr_len)))
+			return false;
+	if (srhinfo->mt_flags & IP6T_SRH_LEN_LT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_LT,
+				!(srh->hdrlen < srhinfo->hdr_len)))
+			return false;
+
+	/* Segments Left matching */
+	if (srhinfo->mt_flags & IP6T_SRH_SEGS_EQ)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_EQ,
+				!(srh->segments_left == srhinfo->segs_left)))
+			return false;
+	if (srhinfo->mt_flags & IP6T_SRH_SEGS_GT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_GT,
+				!(srh->segments_left > srhinfo->segs_left)))
+			return false;
+	if (srhinfo->mt_flags & IP6T_SRH_SEGS_LT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_LT,
+				!(srh->segments_left < srhinfo->segs_left)))
+			return false;
+
+	/**
+	 * Last Entry matching
+	 * Last_Entry field was introduced in revision 6 of the SRH draft.
+	 * It was called First_Segment in the previous revision
+	 */
+	if (srhinfo->mt_flags & IP6T_SRH_LAST_EQ)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_EQ,
+				!(srh->first_segment == srhinfo->last_entry)))
+			return false;
+	if (srhinfo->mt_flags & IP6T_SRH_LAST_GT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_GT,
+				!(srh->first_segment > srhinfo->last_entry)))
+			return false;
+	if (srhinfo->mt_flags & IP6T_SRH_LAST_LT)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_LT,
+				!(srh->first_segment < srhinfo->last_entry)))
+			return false;
+
+	/**
+	 * Tag matchig
+	 * Tag field was introduced in revision 6 of the SRH draft
+	 */
+	if (srhinfo->mt_flags & IP6T_SRH_TAG)
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_TAG,
+				!(srh->tag == srhinfo->tag)))
+			return false;
+
+	/* Previous SID matching */
+	if (srhinfo->mt_flags & IP6T_SRH_PSID) {
+		if (srh->segments_left == srh->first_segment)
+			return false;
+		psidoff = srhoff + sizeof(struct ipv6_sr_hdr) +
+			  ((srh->segments_left + 1) * sizeof(struct in6_addr));
+		psid = skb_header_pointer(skb, psidoff, sizeof(_psid), &_psid);
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_PSID,
+				ipv6_masked_addr_cmp(psid, &srhinfo->psid_msk,
+						     &srhinfo->psid_addr)))
+			return false;
+	}
+
+	/* Next SID matching */
+	if (srhinfo->mt_flags & IP6T_SRH_NSID) {
+		if (srh->segments_left == 0)
+			return false;
+		nsidoff = srhoff + sizeof(struct ipv6_sr_hdr) +
+			  ((srh->segments_left - 1) * sizeof(struct in6_addr));
+		nsid = skb_header_pointer(skb, nsidoff, sizeof(_nsid), &_nsid);
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NSID,
+				ipv6_masked_addr_cmp(nsid, &srhinfo->nsid_msk,
+						     &srhinfo->nsid_addr)))
+			return false;
+	}
+
+	/* Last SID matching */
+	if (srhinfo->mt_flags & IP6T_SRH_LSID) {
+		lsidoff = srhoff + sizeof(struct ipv6_sr_hdr);
+		lsid = skb_header_pointer(skb, lsidoff, sizeof(_lsid), &_lsid);
+		if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LSID,
+				ipv6_masked_addr_cmp(lsid, &srhinfo->lsid_msk,
+						     &srhinfo->lsid_addr)))
+			return false;
+	}
+	return true;
+}
+
 static int srh_mt6_check(const struct xt_mtchk_param *par)
 {
 	const struct ip6t_srh *srhinfo = par->matchinfo;
@@ -136,23 +260,54 @@ static int srh_mt6_check(const struct xt_mtchk_param *par)
 	return 0;
 }
 
-static struct xt_match srh_mt6_reg __read_mostly = {
-	.name		= "srh",
-	.family		= NFPROTO_IPV6,
-	.match		= srh_mt6,
-	.matchsize	= sizeof(struct ip6t_srh),
-	.checkentry	= srh_mt6_check,
-	.me		= THIS_MODULE,
+static int srh1_mt6_check(const struct xt_mtchk_param *par)
+{
+	const struct ip6t_srh1 *srhinfo = par->matchinfo;
+
+	if (srhinfo->mt_flags & ~IP6T_SRH_MASK) {
+		pr_info_ratelimited("unknown srh match flags  %X\n",
+				    srhinfo->mt_flags);
+		return -EINVAL;
+	}
+
+	if (srhinfo->mt_invflags & ~IP6T_SRH_INV_MASK) {
+		pr_info_ratelimited("unknown srh invflags %X\n",
+				    srhinfo->mt_invflags);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct xt_match srh_mt6_reg[] __read_mostly = {
+	{
+		.name		= "srh",
+		.revision	= 0,
+		.family		= NFPROTO_IPV6,
+		.match		= srh_mt6,
+		.matchsize	= sizeof(struct ip6t_srh),
+		.checkentry	= srh_mt6_check,
+		.me		= THIS_MODULE,
+	},
+	{
+		.name           = "srh",
+		.revision       = 1,
+		.family         = NFPROTO_IPV6,
+		.match          = srh1_mt6,
+		.matchsize      = sizeof(struct ip6t_srh1),
+		.checkentry     = srh1_mt6_check,
+		.me             = THIS_MODULE,
+	}
 };
 
 static int __init srh_mt6_init(void)
 {
-	return xt_register_match(&srh_mt6_reg);
+	return xt_register_matches(srh_mt6_reg, ARRAY_SIZE(srh_mt6_reg));
 }
 
 static void __exit srh_mt6_exit(void)
 {
-	xt_unregister_match(&srh_mt6_reg);
+	xt_unregister_matches(srh_mt6_reg, ARRAY_SIZE(srh_mt6_reg));
 }
 
 module_init(srh_mt6_init);
-- 
cgit v1.2.3-59-g8ed1b


From 3a2e86f64577be9a6e2c13ee06834e769cd3824f Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 26 Apr 2018 17:42:15 +0200
Subject: netfilter: nf_nat: remove unused ct arg from lookup functions

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_nat_l3proto.h   | 24 ++++++++----------------
 net/ipv4/netfilter/iptable_nat.c         |  3 +--
 net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 14 +++++---------
 net/ipv4/netfilter/nft_chain_nat_ipv4.c  |  3 +--
 net/ipv6/netfilter/ip6table_nat.c        |  3 +--
 net/ipv6/netfilter/nf_nat_l3proto_ipv6.c | 14 +++++---------
 net/ipv6/netfilter/nft_chain_nat_ipv6.c  |  3 +--
 7 files changed, 22 insertions(+), 42 deletions(-)

diff --git a/include/net/netfilter/nf_nat_l3proto.h b/include/net/netfilter/nf_nat_l3proto.h
index ac47098a61dc..8bad2560576f 100644
--- a/include/net/netfilter/nf_nat_l3proto.h
+++ b/include/net/netfilter/nf_nat_l3proto.h
@@ -48,30 +48,26 @@ unsigned int nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
 			    const struct nf_hook_state *state,
 			    unsigned int (*do_chain)(void *priv,
 						     struct sk_buff *skb,
-						     const struct nf_hook_state *state,
-						     struct nf_conn *ct));
+						     const struct nf_hook_state *state));
 
 unsigned int nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
 			     const struct nf_hook_state *state,
 			     unsigned int (*do_chain)(void *priv,
 						      struct sk_buff *skb,
-						      const struct nf_hook_state *state,
-						      struct nf_conn *ct));
+						      const struct nf_hook_state *state));
 
 unsigned int nf_nat_ipv4_local_fn(void *priv,
 				  struct sk_buff *skb,
 				  const struct nf_hook_state *state,
 				  unsigned int (*do_chain)(void *priv,
 							   struct sk_buff *skb,
-							   const struct nf_hook_state *state,
-							   struct nf_conn *ct));
+							   const struct nf_hook_state *state));
 
 unsigned int nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
 			    const struct nf_hook_state *state,
 			    unsigned int (*do_chain)(void *priv,
 						     struct sk_buff *skb,
-						     const struct nf_hook_state *state,
-						     struct nf_conn *ct));
+						     const struct nf_hook_state *state));
 
 int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, struct nf_conn *ct,
 				    enum ip_conntrack_info ctinfo,
@@ -81,29 +77,25 @@ unsigned int nf_nat_ipv6_in(void *priv, struct sk_buff *skb,
 			    const struct nf_hook_state *state,
 			    unsigned int (*do_chain)(void *priv,
 						     struct sk_buff *skb,
-						     const struct nf_hook_state *state,
-						     struct nf_conn *ct));
+						     const struct nf_hook_state *state));
 
 unsigned int nf_nat_ipv6_out(void *priv, struct sk_buff *skb,
 			     const struct nf_hook_state *state,
 			     unsigned int (*do_chain)(void *priv,
 						      struct sk_buff *skb,
-						      const struct nf_hook_state *state,
-						      struct nf_conn *ct));
+						      const struct nf_hook_state *state));
 
 unsigned int nf_nat_ipv6_local_fn(void *priv,
 				  struct sk_buff *skb,
 				  const struct nf_hook_state *state,
 				  unsigned int (*do_chain)(void *priv,
 							   struct sk_buff *skb,
-							   const struct nf_hook_state *state,
-							   struct nf_conn *ct));
+							   const struct nf_hook_state *state));
 
 unsigned int nf_nat_ipv6_fn(void *priv, struct sk_buff *skb,
 			    const struct nf_hook_state *state,
 			    unsigned int (*do_chain)(void *priv,
 						     struct sk_buff *skb,
-						     const struct nf_hook_state *state,
-						     struct nf_conn *ct));
+						     const struct nf_hook_state *state));
 
 #endif /* _NF_NAT_L3PROTO_H */
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index 0f7255cc65ee..529d89ec31e8 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -33,8 +33,7 @@ static const struct xt_table nf_nat_ipv4_table = {
 
 static unsigned int iptable_nat_do_chain(void *priv,
 					 struct sk_buff *skb,
-					 const struct nf_hook_state *state,
-					 struct nf_conn *ct)
+					 const struct nf_hook_state *state)
 {
 	return ipt_do_table(skb, state, state->net->ipv4.nat_table);
 }
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index 4346336cee4c..325e02956bf5 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -246,8 +246,7 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
 	       const struct nf_hook_state *state,
 	       unsigned int (*do_chain)(void *priv,
 					struct sk_buff *skb,
-					const struct nf_hook_state *state,
-					struct nf_conn *ct))
+					const struct nf_hook_state *state))
 {
 	struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
@@ -285,7 +284,7 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
 		if (!nf_nat_initialized(ct, maniptype)) {
 			unsigned int ret;
 
-			ret = do_chain(priv, skb, state, ct);
+			ret = do_chain(priv, skb, state);
 			if (ret != NF_ACCEPT)
 				return ret;
 
@@ -326,8 +325,7 @@ nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
 	       const struct nf_hook_state *state,
 	       unsigned int (*do_chain)(void *priv,
 					 struct sk_buff *skb,
-					 const struct nf_hook_state *state,
-					 struct nf_conn *ct))
+					 const struct nf_hook_state *state))
 {
 	unsigned int ret;
 	__be32 daddr = ip_hdr(skb)->daddr;
@@ -346,8 +344,7 @@ nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
 		const struct nf_hook_state *state,
 		unsigned int (*do_chain)(void *priv,
 					  struct sk_buff *skb,
-					  const struct nf_hook_state *state,
-					  struct nf_conn *ct))
+					  const struct nf_hook_state *state))
 {
 #ifdef CONFIG_XFRM
 	const struct nf_conn *ct;
@@ -383,8 +380,7 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
 		     const struct nf_hook_state *state,
 		     unsigned int (*do_chain)(void *priv,
 					       struct sk_buff *skb,
-					       const struct nf_hook_state *state,
-					       struct nf_conn *ct))
+					       const struct nf_hook_state *state))
 {
 	const struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
index b5464a3f253b..285baccfbdea 100644
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -28,8 +28,7 @@
 
 static unsigned int nft_nat_do_chain(void *priv,
 				      struct sk_buff *skb,
-				      const struct nf_hook_state *state,
-				      struct nf_conn *ct)
+				      const struct nf_hook_state *state)
 {
 	struct nft_pktinfo pkt;
 
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index 47306e45a80a..2bf554e18af8 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -35,8 +35,7 @@ static const struct xt_table nf_nat_ipv6_table = {
 
 static unsigned int ip6table_nat_do_chain(void *priv,
 					  struct sk_buff *skb,
-					  const struct nf_hook_state *state,
-					  struct nf_conn *ct)
+					  const struct nf_hook_state *state)
 {
 	return ip6t_do_table(skb, state, state->net->ipv6.ip6table_nat);
 }
diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
index 56d75eb5448f..f1582b6f9588 100644
--- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
@@ -257,8 +257,7 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb,
 	       const struct nf_hook_state *state,
 	       unsigned int (*do_chain)(void *priv,
 					struct sk_buff *skb,
-					const struct nf_hook_state *state,
-					struct nf_conn *ct))
+					const struct nf_hook_state *state))
 {
 	struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
@@ -303,7 +302,7 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb,
 		if (!nf_nat_initialized(ct, maniptype)) {
 			unsigned int ret;
 
-			ret = do_chain(priv, skb, state, ct);
+			ret = do_chain(priv, skb, state);
 			if (ret != NF_ACCEPT)
 				return ret;
 
@@ -343,8 +342,7 @@ nf_nat_ipv6_in(void *priv, struct sk_buff *skb,
 	       const struct nf_hook_state *state,
 	       unsigned int (*do_chain)(void *priv,
 					struct sk_buff *skb,
-					const struct nf_hook_state *state,
-					struct nf_conn *ct))
+					const struct nf_hook_state *state))
 {
 	unsigned int ret;
 	struct in6_addr daddr = ipv6_hdr(skb)->daddr;
@@ -363,8 +361,7 @@ nf_nat_ipv6_out(void *priv, struct sk_buff *skb,
 		const struct nf_hook_state *state,
 		unsigned int (*do_chain)(void *priv,
 					 struct sk_buff *skb,
-					 const struct nf_hook_state *state,
-					 struct nf_conn *ct))
+					 const struct nf_hook_state *state))
 {
 #ifdef CONFIG_XFRM
 	const struct nf_conn *ct;
@@ -400,8 +397,7 @@ nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb,
 		     const struct nf_hook_state *state,
 		     unsigned int (*do_chain)(void *priv,
 					      struct sk_buff *skb,
-					      const struct nf_hook_state *state,
-					      struct nf_conn *ct))
+					      const struct nf_hook_state *state))
 {
 	const struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
index 3557b114446c..100a6bd1046a 100644
--- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c
+++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
@@ -26,8 +26,7 @@
 
 static unsigned int nft_nat_do_chain(void *priv,
 				     struct sk_buff *skb,
-				     const struct nf_hook_state *state,
-				     struct nf_conn *ct)
+				     const struct nf_hook_state *state)
 {
 	struct nft_pktinfo pkt;
 
-- 
cgit v1.2.3-59-g8ed1b


From 3f9c56a581b96d8117922c4fd8221687fd649f9b Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Fri, 27 Apr 2018 12:47:01 +0200
Subject: netfilter: nf_tables: Provide NFT_{RT,CT}_MAX for userspace

These macros allow conveniently declaring arrays which use NFT_{RT,CT}_*
values as indexes.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 5a5551a580f7..ce031cf72288 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -831,7 +831,9 @@ enum nft_rt_keys {
 	NFT_RT_NEXTHOP4,
 	NFT_RT_NEXTHOP6,
 	NFT_RT_TCPMSS,
+	__NFT_RT_MAX
 };
+#define NFT_RT_MAX		(__NFT_RT_MAX - 1)
 
 /**
  * enum nft_hash_types - nf_tables hash expression types
@@ -949,7 +951,9 @@ enum nft_ct_keys {
 	NFT_CT_DST_IP,
 	NFT_CT_SRC_IP6,
 	NFT_CT_DST_IP6,
+	__NFT_CT_MAX
 };
+#define NFT_CT_MAX		(__NFT_CT_MAX - 1)
 
 /**
  * enum nft_ct_attributes - nf_tables ct expression netlink attributes
-- 
cgit v1.2.3-59-g8ed1b


From bfb15f2a95cbbc548b59abf8007d0fdb35fdfee5 Mon Sep 17 00:00:00 2001
From: Fernando Fernandez Mancera <ffmancera@riseup.net>
Date: Thu, 3 May 2018 14:05:40 +0200
Subject: netfilter: extract Passive OS fingerprint infrastructure from xt_osf

Add nf_osf_ttl() and nf_osf_match() into nf_osf.c to prepare for
nf_tables support.

Signed-off-by: Fernando Fernandez Mancera <ffmancera@riseup.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nf_osf.h      |  27 +++++
 include/uapi/linux/netfilter/nf_osf.h |  90 ++++++++++++++
 include/uapi/linux/netfilter/xt_osf.h | 106 +++--------------
 net/netfilter/Kconfig                 |   4 +
 net/netfilter/Makefile                |   1 +
 net/netfilter/nf_osf.c                | 218 ++++++++++++++++++++++++++++++++++
 net/netfilter/xt_osf.c                | 202 +------------------------------
 7 files changed, 359 insertions(+), 289 deletions(-)
 create mode 100644 include/linux/netfilter/nf_osf.h
 create mode 100644 include/uapi/linux/netfilter/nf_osf.h
 create mode 100644 net/netfilter/nf_osf.c

diff --git a/include/linux/netfilter/nf_osf.h b/include/linux/netfilter/nf_osf.h
new file mode 100644
index 000000000000..a2b39602e87d
--- /dev/null
+++ b/include/linux/netfilter/nf_osf.h
@@ -0,0 +1,27 @@
+#include <uapi/linux/netfilter/nf_osf.h>
+
+/* Initial window size option state machine: multiple of mss, mtu or
+ * plain numeric value. Can also be made as plain numeric value which
+ * is not a multiple of specified value.
+ */
+enum nf_osf_window_size_options {
+	OSF_WSS_PLAIN   = 0,
+	OSF_WSS_MSS,
+	OSF_WSS_MTU,
+	OSF_WSS_MODULO,
+	OSF_WSS_MAX,
+};
+
+enum osf_fmatch_states {
+	/* Packet does not match the fingerprint */
+	FMATCH_WRONG = 0,
+	/* Packet matches the fingerprint */
+	FMATCH_OK,
+	/* Options do not match the fingerprint, but header does */
+	FMATCH_OPT_WRONG,
+};
+
+bool nf_osf_match(const struct sk_buff *skb, u_int8_t family,
+		  int hooknum, struct net_device *in, struct net_device *out,
+		  const struct nf_osf_info *info, struct net *net,
+		  const struct list_head *nf_osf_fingers);
diff --git a/include/uapi/linux/netfilter/nf_osf.h b/include/uapi/linux/netfilter/nf_osf.h
new file mode 100644
index 000000000000..45376eae31ef
--- /dev/null
+++ b/include/uapi/linux/netfilter/nf_osf.h
@@ -0,0 +1,90 @@
+#ifndef _NF_OSF_H
+#define _NF_OSF_H
+
+#define MAXGENRELEN	32
+
+#define NF_OSF_GENRE	(1 << 0)
+#define NF_OSF_TTL	(1 << 1)
+#define NF_OSF_LOG	(1 << 2)
+#define NF_OSF_INVERT	(1 << 3)
+
+#define NF_OSF_LOGLEVEL_ALL		0	/* log all matched fingerprints */
+#define NF_OSF_LOGLEVEL_FIRST		1	/* log only the first matced fingerprint */
+#define NF_OSF_LOGLEVEL_ALL_KNOWN	2	/* do not log unknown packets */
+
+#define NF_OSF_TTL_TRUE			0	/* True ip and fingerprint TTL comparison */
+
+/* Do not compare ip and fingerprint TTL at all */
+#define NF_OSF_TTL_NOCHECK		2
+
+/* Wildcard MSS (kind of).
+ * It is used to implement a state machine for the different wildcard values
+ * of the MSS and window sizes.
+ */
+struct nf_osf_wc {
+	__u32	wc;
+	__u32	val;
+};
+
+/* This struct represents IANA options
+ * http://www.iana.org/assignments/tcp-parameters
+ */
+struct nf_osf_opt {
+	__u16			kind, length;
+	struct nf_osf_wc	wc;
+};
+
+struct nf_osf_info {
+	char	genre[MAXGENRELEN];
+	__u32	len;
+	__u32	flags;
+	__u32	loglevel;
+	__u32	ttl;
+};
+
+struct nf_osf_user_finger {
+	struct nf_osf_wc	wss;
+
+	__u8	ttl, df;
+	__u16	ss, mss;
+	__u16	opt_num;
+
+	char	genre[MAXGENRELEN];
+	char	version[MAXGENRELEN];
+	char	subtype[MAXGENRELEN];
+
+	/* MAX_IPOPTLEN is maximum if all options are NOPs or EOLs */
+	struct nf_osf_opt	opt[MAX_IPOPTLEN];
+};
+
+struct nf_osf_finger {
+	struct rcu_head			rcu_head;
+	struct list_head		finger_entry;
+	struct nf_osf_user_finger	finger;
+};
+
+struct nf_osf_nlmsg {
+	struct nf_osf_user_finger	f;
+	struct iphdr			ip;
+	struct tcphdr			tcp;
+};
+
+/* Defines for IANA option kinds */
+enum iana_options {
+	OSFOPT_EOL = 0,		/* End of options */
+	OSFOPT_NOP,		/* NOP */
+	OSFOPT_MSS,		/* Maximum segment size */
+	OSFOPT_WSO,		/* Window scale option */
+	OSFOPT_SACKP,		/* SACK permitted */
+	OSFOPT_SACK,		/* SACK */
+	OSFOPT_ECHO,
+	OSFOPT_ECHOREPLY,
+	OSFOPT_TS,		/* Timestamp option */
+	OSFOPT_POCP,		/* Partial Order Connection Permitted */
+	OSFOPT_POSP,		/* Partial Order Service Profile */
+
+	/* Others are not used in the current OSF */
+	OSFOPT_EMPTY = 255,
+};
+
+#endif /* _NF_OSF_H */
diff --git a/include/uapi/linux/netfilter/xt_osf.h b/include/uapi/linux/netfilter/xt_osf.h
index dad197e2ab99..72956eceeb09 100644
--- a/include/uapi/linux/netfilter/xt_osf.h
+++ b/include/uapi/linux/netfilter/xt_osf.h
@@ -23,101 +23,29 @@
 #include <linux/types.h>
 #include <linux/ip.h>
 #include <linux/tcp.h>
+#include <linux/netfilter/nf_osf.h>
 
-#define MAXGENRELEN		32
+#define XT_OSF_GENRE		NF_OSF_GENRE
+#define XT_OSF_INVERT		NF_OSF_INVERT
 
-#define XT_OSF_GENRE		(1<<0)
-#define	XT_OSF_TTL		(1<<1)
-#define XT_OSF_LOG		(1<<2)
-#define XT_OSF_INVERT		(1<<3)
+#define XT_OSF_TTL		NF_OSF_TTL
+#define XT_OSF_LOG		NF_OSF_LOG
 
-#define XT_OSF_LOGLEVEL_ALL	0	/* log all matched fingerprints */
-#define XT_OSF_LOGLEVEL_FIRST	1	/* log only the first matced fingerprint */
-#define XT_OSF_LOGLEVEL_ALL_KNOWN	2 /* do not log unknown packets */
+#define XT_OSF_LOGLEVEL_ALL		NF_OSF_LOGLEVEL_ALL
+#define XT_OSF_LOGLEVEL_FIRST		NF_OSF_LOGLEVEL_FIRST
+#define XT_OSF_LOGLEVEL_ALL_KNOWN	NF_OSF_LOGLEVEL_ALL_KNOWN
 
-#define XT_OSF_TTL_TRUE		0	/* True ip and fingerprint TTL comparison */
-#define XT_OSF_TTL_LESS		1	/* Check if ip TTL is less than fingerprint one */
-#define XT_OSF_TTL_NOCHECK	2	/* Do not compare ip and fingerprint TTL at all */
+#define XT_OSF_TTL_TRUE		NF_OSF_TTL_TRUE
+#define XT_OSF_TTL_NOCHECK	NF_OSF_TTL_NOCHECK
 
-struct xt_osf_info {
-	char			genre[MAXGENRELEN];
-	__u32			len;
-	__u32			flags;
-	__u32			loglevel;
-	__u32			ttl;
-};
-
-/*
- * Wildcard MSS (kind of).
- * It is used to implement a state machine for the different wildcard values
- * of the MSS and window sizes.
- */
-struct xt_osf_wc {
-	__u32			wc;
-	__u32			val;
-};
-
-/*
- * This struct represents IANA options
- * http://www.iana.org/assignments/tcp-parameters
- */
-struct xt_osf_opt {
-	__u16			kind, length;
-	struct xt_osf_wc	wc;
-};
-
-struct xt_osf_user_finger {
-	struct xt_osf_wc	wss;
-
-	__u8			ttl, df;
-	__u16			ss, mss;
-	__u16			opt_num;
-
-	char			genre[MAXGENRELEN];
-	char			version[MAXGENRELEN];
-	char			subtype[MAXGENRELEN];
+#define XT_OSF_TTL_LESS	1	/* Check if ip TTL is less than fingerprint one */
 
-	/* MAX_IPOPTLEN is maximum if all options are NOPs or EOLs */
-	struct xt_osf_opt	opt[MAX_IPOPTLEN];
-};
-
-struct xt_osf_nlmsg {
-	struct xt_osf_user_finger	f;
-	struct iphdr		ip;
-	struct tcphdr		tcp;
-};
-
-/* Defines for IANA option kinds */
-
-enum iana_options {
-	OSFOPT_EOL = 0,		/* End of options */
-	OSFOPT_NOP, 		/* NOP */
-	OSFOPT_MSS, 		/* Maximum segment size */
-	OSFOPT_WSO, 		/* Window scale option */
-	OSFOPT_SACKP,		/* SACK permitted */
-	OSFOPT_SACK,		/* SACK */
-	OSFOPT_ECHO,
-	OSFOPT_ECHOREPLY,
-	OSFOPT_TS,		/* Timestamp option */
-	OSFOPT_POCP,		/* Partial Order Connection Permitted */
-	OSFOPT_POSP,		/* Partial Order Service Profile */
-
-	/* Others are not used in the current OSF */
-	OSFOPT_EMPTY = 255,
-};
-
-/*
- * Initial window size option state machine: multiple of mss, mtu or
- * plain numeric value. Can also be made as plain numeric value which
- * is not a multiple of specified value.
- */
-enum xt_osf_window_size_options {
-	OSF_WSS_PLAIN	= 0,
-	OSF_WSS_MSS,
-	OSF_WSS_MTU,
-	OSF_WSS_MODULO,
-	OSF_WSS_MAX,
-};
+#define xt_osf_wc		nf_osf_wc
+#define xt_osf_opt		nf_osf_opt
+#define xt_osf_info		nf_osf_info
+#define xt_osf_user_finger	nf_osf_user_finger
+#define xt_osf_finger		nf_osf_finger
+#define xt_osf_nlmsg		nf_osf_nlmsg
 
 /*
  * Add/remove fingerprint from the kernel.
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index f66586fb41cd..e25cdb3d51cb 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -444,6 +444,9 @@ config NETFILTER_SYNPROXY
 
 endif # NF_CONNTRACK
 
+config NF_OSF
+	tristate 'Passive OS fingerprint infrastructure'
+
 config NF_TABLES
 	select NETFILTER_NETLINK
 	tristate "Netfilter nf_tables support"
@@ -1358,6 +1361,7 @@ config NETFILTER_XT_MATCH_NFACCT
 config NETFILTER_XT_MATCH_OSF
 	tristate '"osf" Passive OS fingerprint match'
 	depends on NETFILTER_ADVANCED && NETFILTER_NETLINK
+	select NF_OSF
 	help
 	  This option selects the Passive OS Fingerprinting match module
 	  that allows to passively match the remote operating system by
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index b37ce0bc9ab7..1aa710b5d384 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -101,6 +101,7 @@ obj-$(CONFIG_NFT_HASH)		+= nft_hash.o
 obj-$(CONFIG_NFT_FIB)		+= nft_fib.o
 obj-$(CONFIG_NFT_FIB_INET)	+= nft_fib_inet.o
 obj-$(CONFIG_NFT_FIB_NETDEV)	+= nft_fib_netdev.o
+obj-$(CONFIG_NF_OSF)		+= nf_osf.o
 
 # nf_tables netdev
 obj-$(CONFIG_NFT_DUP_NETDEV)	+= nft_dup_netdev.o
diff --git a/net/netfilter/nf_osf.c b/net/netfilter/nf_osf.c
new file mode 100644
index 000000000000..5ba5c7bef2f9
--- /dev/null
+++ b/net/netfilter/nf_osf.c
@@ -0,0 +1,218 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <linux/capability.h>
+#include <linux/if.h>
+#include <linux/inetdevice.h>
+#include <linux/ip.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/tcp.h>
+
+#include <net/ip.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_log.h>
+#include <linux/netfilter/nf_osf.h>
+
+static inline int nf_osf_ttl(const struct sk_buff *skb,
+			     const struct nf_osf_info *info,
+			     unsigned char f_ttl)
+{
+	const struct iphdr *ip = ip_hdr(skb);
+
+	if (info->flags & NF_OSF_TTL) {
+		if (info->ttl == NF_OSF_TTL_TRUE)
+			return ip->ttl == f_ttl;
+		if (info->ttl == NF_OSF_TTL_NOCHECK)
+			return 1;
+		else if (ip->ttl <= f_ttl)
+			return 1;
+		else {
+			struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+			int ret = 0;
+
+			for_ifa(in_dev) {
+				if (inet_ifa_match(ip->saddr, ifa)) {
+					ret = (ip->ttl == f_ttl);
+					break;
+				}
+			}
+			endfor_ifa(in_dev);
+
+			return ret;
+		}
+	}
+
+	return ip->ttl == f_ttl;
+}
+
+bool
+nf_osf_match(const struct sk_buff *skb, u_int8_t family,
+	     int hooknum, struct net_device *in, struct net_device *out,
+	     const struct nf_osf_info *info, struct net *net,
+	     const struct list_head *nf_osf_fingers)
+{
+	const unsigned char *optp = NULL, *_optp = NULL;
+	unsigned int optsize = 0, check_WSS = 0;
+	int fmatch = FMATCH_WRONG, fcount = 0;
+	const struct iphdr *ip = ip_hdr(skb);
+	const struct nf_osf_user_finger *f;
+	unsigned char opts[MAX_IPOPTLEN];
+	const struct nf_osf_finger *kf;
+	u16 window, totlen, mss = 0;
+	const struct tcphdr *tcp;
+	struct tcphdr _tcph;
+	bool df;
+
+	tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph);
+	if (!tcp)
+		return false;
+
+	if (!tcp->syn)
+		return false;
+
+	totlen = ntohs(ip->tot_len);
+	df = ntohs(ip->frag_off) & IP_DF;
+	window = ntohs(tcp->window);
+
+	if (tcp->doff * 4 > sizeof(struct tcphdr)) {
+		optsize = tcp->doff * 4 - sizeof(struct tcphdr);
+
+		_optp = optp = skb_header_pointer(skb, ip_hdrlen(skb) +
+				sizeof(struct tcphdr), optsize, opts);
+	}
+
+	list_for_each_entry_rcu(kf, &nf_osf_fingers[df], finger_entry) {
+		int foptsize, optnum;
+
+		f = &kf->finger;
+
+		if (!(info->flags & NF_OSF_LOG) && strcmp(info->genre, f->genre))
+			continue;
+
+		optp = _optp;
+		fmatch = FMATCH_WRONG;
+
+		if (totlen != f->ss || !nf_osf_ttl(skb, info, f->ttl))
+			continue;
+
+		/*
+		 * Should not happen if userspace parser was written correctly.
+		 */
+		if (f->wss.wc >= OSF_WSS_MAX)
+			continue;
+
+		/* Check options */
+
+		foptsize = 0;
+		for (optnum = 0; optnum < f->opt_num; ++optnum)
+			foptsize += f->opt[optnum].length;
+
+		if (foptsize > MAX_IPOPTLEN ||
+		    optsize > MAX_IPOPTLEN ||
+		    optsize != foptsize)
+			continue;
+
+		check_WSS = f->wss.wc;
+
+		for (optnum = 0; optnum < f->opt_num; ++optnum) {
+			if (f->opt[optnum].kind == (*optp)) {
+				__u32 len = f->opt[optnum].length;
+				const __u8 *optend = optp + len;
+
+				fmatch = FMATCH_OK;
+
+				switch (*optp) {
+				case OSFOPT_MSS:
+					mss = optp[3];
+					mss <<= 8;
+					mss |= optp[2];
+
+					mss = ntohs((__force __be16)mss);
+					break;
+				case OSFOPT_TS:
+					break;
+				}
+
+				optp = optend;
+			} else
+				fmatch = FMATCH_OPT_WRONG;
+
+			if (fmatch != FMATCH_OK)
+				break;
+		}
+
+		if (fmatch != FMATCH_OPT_WRONG) {
+			fmatch = FMATCH_WRONG;
+
+			switch (check_WSS) {
+			case OSF_WSS_PLAIN:
+				if (f->wss.val == 0 || window == f->wss.val)
+					fmatch = FMATCH_OK;
+				break;
+			case OSF_WSS_MSS:
+				/*
+				 * Some smart modems decrease mangle MSS to
+				 * SMART_MSS_2, so we check standard, decreased
+				 * and the one provided in the fingerprint MSS
+				 * values.
+				 */
+#define SMART_MSS_1	1460
+#define SMART_MSS_2	1448
+				if (window == f->wss.val * mss ||
+				    window == f->wss.val * SMART_MSS_1 ||
+				    window == f->wss.val * SMART_MSS_2)
+					fmatch = FMATCH_OK;
+				break;
+			case OSF_WSS_MTU:
+				if (window == f->wss.val * (mss + 40) ||
+				    window == f->wss.val * (SMART_MSS_1 + 40) ||
+				    window == f->wss.val * (SMART_MSS_2 + 40))
+					fmatch = FMATCH_OK;
+				break;
+			case OSF_WSS_MODULO:
+				if ((window % f->wss.val) == 0)
+					fmatch = FMATCH_OK;
+				break;
+			}
+		}
+
+		if (fmatch != FMATCH_OK)
+			continue;
+
+		fcount++;
+
+		if (info->flags & NF_OSF_LOG)
+			nf_log_packet(net, family, hooknum, skb,
+				      in, out, NULL,
+				      "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n",
+				      f->genre, f->version, f->subtype,
+				      &ip->saddr, ntohs(tcp->source),
+				      &ip->daddr, ntohs(tcp->dest),
+				      f->ttl - ip->ttl);
+
+		if ((info->flags & NF_OSF_LOG) &&
+		    info->loglevel == NF_OSF_LOGLEVEL_FIRST)
+			break;
+	}
+
+	if (!fcount && (info->flags & NF_OSF_LOG))
+		nf_log_packet(net, family, hooknum, skb, in, out, NULL,
+			      "Remote OS is not known: %pI4:%u -> %pI4:%u\n",
+			      &ip->saddr, ntohs(tcp->source),
+			      &ip->daddr, ntohs(tcp->dest));
+
+	if (fcount)
+		fmatch = FMATCH_OK;
+
+	return fmatch == FMATCH_OK;
+}
+EXPORT_SYMBOL_GPL(nf_osf_match);
+
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index a34f314a8c23..9cfef73b4107 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -37,21 +37,6 @@
 #include <net/netfilter/nf_log.h>
 #include <linux/netfilter/xt_osf.h>
 
-struct xt_osf_finger {
-	struct rcu_head			rcu_head;
-	struct list_head		finger_entry;
-	struct xt_osf_user_finger	finger;
-};
-
-enum osf_fmatch_states {
-	/* Packet does not match the fingerprint */
-	FMATCH_WRONG = 0,
-	/* Packet matches the fingerprint */
-	FMATCH_OK,
-	/* Options do not match the fingerprint, but header does */
-	FMATCH_OPT_WRONG,
-};
-
 /*
  * Indexed by dont-fragment bit.
  * It is the only constant value in the fingerprint.
@@ -164,200 +149,17 @@ static const struct nfnetlink_subsystem xt_osf_nfnetlink = {
 	.cb			= xt_osf_nfnetlink_callbacks,
 };
 
-static inline int xt_osf_ttl(const struct sk_buff *skb, const struct xt_osf_info *info,
-			    unsigned char f_ttl)
-{
-	const struct iphdr *ip = ip_hdr(skb);
-
-	if (info->flags & XT_OSF_TTL) {
-		if (info->ttl == XT_OSF_TTL_TRUE)
-			return ip->ttl == f_ttl;
-		if (info->ttl == XT_OSF_TTL_NOCHECK)
-			return 1;
-		else if (ip->ttl <= f_ttl)
-			return 1;
-		else {
-			struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
-			int ret = 0;
-
-			for_ifa(in_dev) {
-				if (inet_ifa_match(ip->saddr, ifa)) {
-					ret = (ip->ttl == f_ttl);
-					break;
-				}
-			}
-			endfor_ifa(in_dev);
-
-			return ret;
-		}
-	}
-
-	return ip->ttl == f_ttl;
-}
-
 static bool
 xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
 {
 	const struct xt_osf_info *info = p->matchinfo;
-	const struct iphdr *ip = ip_hdr(skb);
-	const struct tcphdr *tcp;
-	struct tcphdr _tcph;
-	int fmatch = FMATCH_WRONG, fcount = 0;
-	unsigned int optsize = 0, check_WSS = 0;
-	u16 window, totlen, mss = 0;
-	bool df;
-	const unsigned char *optp = NULL, *_optp = NULL;
-	unsigned char opts[MAX_IPOPTLEN];
-	const struct xt_osf_finger *kf;
-	const struct xt_osf_user_finger *f;
 	struct net *net = xt_net(p);
 
 	if (!info)
 		return false;
 
-	tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph);
-	if (!tcp)
-		return false;
-
-	if (!tcp->syn)
-		return false;
-
-	totlen = ntohs(ip->tot_len);
-	df = ntohs(ip->frag_off) & IP_DF;
-	window = ntohs(tcp->window);
-
-	if (tcp->doff * 4 > sizeof(struct tcphdr)) {
-		optsize = tcp->doff * 4 - sizeof(struct tcphdr);
-
-		_optp = optp = skb_header_pointer(skb, ip_hdrlen(skb) +
-				sizeof(struct tcphdr), optsize, opts);
-	}
-
-	list_for_each_entry_rcu(kf, &xt_osf_fingers[df], finger_entry) {
-		int foptsize, optnum;
-
-		f = &kf->finger;
-
-		if (!(info->flags & XT_OSF_LOG) && strcmp(info->genre, f->genre))
-			continue;
-
-		optp = _optp;
-		fmatch = FMATCH_WRONG;
-
-		if (totlen != f->ss || !xt_osf_ttl(skb, info, f->ttl))
-			continue;
-
-		/*
-		 * Should not happen if userspace parser was written correctly.
-		 */
-		if (f->wss.wc >= OSF_WSS_MAX)
-			continue;
-
-		/* Check options */
-
-		foptsize = 0;
-		for (optnum = 0; optnum < f->opt_num; ++optnum)
-			foptsize += f->opt[optnum].length;
-
-		if (foptsize > MAX_IPOPTLEN ||
-		    optsize > MAX_IPOPTLEN ||
-		    optsize != foptsize)
-			continue;
-
-		check_WSS = f->wss.wc;
-
-		for (optnum = 0; optnum < f->opt_num; ++optnum) {
-			if (f->opt[optnum].kind == (*optp)) {
-				__u32 len = f->opt[optnum].length;
-				const __u8 *optend = optp + len;
-
-				fmatch = FMATCH_OK;
-
-				switch (*optp) {
-				case OSFOPT_MSS:
-					mss = optp[3];
-					mss <<= 8;
-					mss |= optp[2];
-
-					mss = ntohs((__force __be16)mss);
-					break;
-				case OSFOPT_TS:
-					break;
-				}
-
-				optp = optend;
-			} else
-				fmatch = FMATCH_OPT_WRONG;
-
-			if (fmatch != FMATCH_OK)
-				break;
-		}
-
-		if (fmatch != FMATCH_OPT_WRONG) {
-			fmatch = FMATCH_WRONG;
-
-			switch (check_WSS) {
-			case OSF_WSS_PLAIN:
-				if (f->wss.val == 0 || window == f->wss.val)
-					fmatch = FMATCH_OK;
-				break;
-			case OSF_WSS_MSS:
-				/*
-				 * Some smart modems decrease mangle MSS to
-				 * SMART_MSS_2, so we check standard, decreased
-				 * and the one provided in the fingerprint MSS
-				 * values.
-				 */
-#define SMART_MSS_1	1460
-#define SMART_MSS_2	1448
-				if (window == f->wss.val * mss ||
-				    window == f->wss.val * SMART_MSS_1 ||
-				    window == f->wss.val * SMART_MSS_2)
-					fmatch = FMATCH_OK;
-				break;
-			case OSF_WSS_MTU:
-				if (window == f->wss.val * (mss + 40) ||
-				    window == f->wss.val * (SMART_MSS_1 + 40) ||
-				    window == f->wss.val * (SMART_MSS_2 + 40))
-					fmatch = FMATCH_OK;
-				break;
-			case OSF_WSS_MODULO:
-				if ((window % f->wss.val) == 0)
-					fmatch = FMATCH_OK;
-				break;
-			}
-		}
-
-		if (fmatch != FMATCH_OK)
-			continue;
-
-		fcount++;
-
-		if (info->flags & XT_OSF_LOG)
-			nf_log_packet(net, xt_family(p), xt_hooknum(p), skb,
-				      xt_in(p), xt_out(p), NULL,
-				      "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n",
-				      f->genre, f->version, f->subtype,
-				      &ip->saddr, ntohs(tcp->source),
-				      &ip->daddr, ntohs(tcp->dest),
-				      f->ttl - ip->ttl);
-
-		if ((info->flags & XT_OSF_LOG) &&
-		    info->loglevel == XT_OSF_LOGLEVEL_FIRST)
-			break;
-	}
-
-	if (!fcount && (info->flags & XT_OSF_LOG))
-		nf_log_packet(net, xt_family(p), xt_hooknum(p), skb, xt_in(p),
-			      xt_out(p), NULL,
-			"Remote OS is not known: %pI4:%u -> %pI4:%u\n",
-				&ip->saddr, ntohs(tcp->source),
-				&ip->daddr, ntohs(tcp->dest));
-
-	if (fcount)
-		fmatch = FMATCH_OK;
-
-	return fmatch == FMATCH_OK;
+	return nf_osf_match(skb, xt_family(p), xt_hooknum(p), xt_in(p),
+			    xt_out(p), info, net, xt_osf_fingers);
 }
 
 static struct xt_match xt_osf_match = {
-- 
cgit v1.2.3-59-g8ed1b


From 538c5672be6d67b7b10c15701588e20617374973 Mon Sep 17 00:00:00 2001
From: Florent Fourcot <florent.fourcot@wifirst.fr>
Date: Sun, 6 May 2018 16:30:14 +0200
Subject: netfilter: ctnetlink: export nf_conntrack_max

IPCTNL_MSG_CT_GET_STATS netlink command allow to monitor current number
of conntrack entries. However, if one wants to compare it with the
maximum (and detect exhaustion), the only solution is currently to read
sysctl value.

This patch add nf_conntrack_max value in netlink message, and simplify
monitoring for application built on netlink API.

Signed-off-by: Florent Fourcot <florent.fourcot@wifirst.fr>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nfnetlink_conntrack.h | 1 +
 net/netfilter/nf_conntrack_core.c                  | 1 +
 net/netfilter/nf_conntrack_netlink.c               | 3 +++
 3 files changed, 5 insertions(+)

diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
index 77987111cab0..1d41810d17e2 100644
--- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
@@ -262,6 +262,7 @@ enum ctattr_stats_cpu {
 enum ctattr_stats_global {
 	CTA_STATS_GLOBAL_UNSPEC,
 	CTA_STATS_GLOBAL_ENTRIES,
+	CTA_STATS_GLOBAL_MAX_ENTRIES,
 	__CTA_STATS_GLOBAL_MAX,
 };
 #define CTA_STATS_GLOBAL_MAX (__CTA_STATS_GLOBAL_MAX - 1)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 41ff04ee2554..605441727008 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -186,6 +186,7 @@ unsigned int nf_conntrack_htable_size __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 
 unsigned int nf_conntrack_max __read_mostly;
+EXPORT_SYMBOL_GPL(nf_conntrack_max);
 seqcount_t nf_conntrack_generation __read_mostly;
 static unsigned int nf_conntrack_hash_rnd __read_mostly;
 
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 4c1d0c5bc268..d807b8770be3 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2205,6 +2205,9 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
 	if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
 		goto nla_put_failure;
 
+	if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max)))
+		goto nla_put_failure;
+
 	nlmsg_end(skb, nlh);
 	return skb->len;
 
-- 
cgit v1.2.3-59-g8ed1b


From b13468dc577498002cf4e62978359ff97ffcd187 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 27 Apr 2018 22:37:43 +0200
Subject: netfilter: nft_dynset: fix timeout updates on 32bit

This must now use a 64bit jiffies value, else we set
a bogus timeout on 32bit.

Fixes: 8e1102d5a1596 ("netfilter: nf_tables: support timeouts larger than 23 days")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_dynset.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 5cc3509659c6..b07a3fd9eeea 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -81,7 +81,7 @@ static void nft_dynset_eval(const struct nft_expr *expr,
 		if (priv->op == NFT_DYNSET_OP_UPDATE &&
 		    nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
 			timeout = priv->timeout ? : set->timeout;
-			*nft_set_ext_expiration(ext) = jiffies + timeout;
+			*nft_set_ext_expiration(ext) = get_jiffies_64() + timeout;
 		}
 
 		if (sexpr != NULL)
-- 
cgit v1.2.3-59-g8ed1b


From c096b92aa79e603995982c4d324a52ee02775b85 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 20 Apr 2018 13:49:18 +0300
Subject: mac80211: rename rtap_vendor_space to rtap_space

Since all the HE data won't fit into struct ieee80211_rx_status,
we'll (have to) move that into the SKB proper. This means we'll
need to skip over more things in the future, so rename this to
remove the vendor-only notion from it.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rx.c | 40 +++++++++++++++++++---------------------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 03102aff0953..0a38cc1cbebc 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -5,6 +5,7 @@
  * Copyright 2007-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright(c) 2015 - 2017 Intel Deutschland GmbH
+ * Copyright (C) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -97,27 +98,27 @@ static u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len,
  */
 static void remove_monitor_info(struct sk_buff *skb,
 				unsigned int present_fcs_len,
-				unsigned int rtap_vendor_space)
+				unsigned int rtap_space)
 {
 	if (present_fcs_len)
 		__pskb_trim(skb, skb->len - present_fcs_len);
-	__pskb_pull(skb, rtap_vendor_space);
+	__pskb_pull(skb, rtap_space);
 }
 
 static inline bool should_drop_frame(struct sk_buff *skb, int present_fcs_len,
-				     unsigned int rtap_vendor_space)
+				     unsigned int rtap_space)
 {
 	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
 	struct ieee80211_hdr *hdr;
 
-	hdr = (void *)(skb->data + rtap_vendor_space);
+	hdr = (void *)(skb->data + rtap_space);
 
 	if (status->flag & (RX_FLAG_FAILED_FCS_CRC |
 			    RX_FLAG_FAILED_PLCP_CRC |
 			    RX_FLAG_ONLY_MONITOR))
 		return true;
 
-	if (unlikely(skb->len < 16 + present_fcs_len + rtap_vendor_space))
+	if (unlikely(skb->len < 16 + present_fcs_len + rtap_space))
 		return true;
 
 	if (ieee80211_is_ctl(hdr->frame_control) &&
@@ -199,7 +200,7 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local,
 
 static void ieee80211_handle_mu_mimo_mon(struct ieee80211_sub_if_data *sdata,
 					 struct sk_buff *skb,
-					 int rtap_vendor_space)
+					 int rtap_space)
 {
 	struct {
 		struct ieee80211_hdr_3addr hdr;
@@ -212,14 +213,14 @@ static void ieee80211_handle_mu_mimo_mon(struct ieee80211_sub_if_data *sdata,
 
 	BUILD_BUG_ON(sizeof(action) != IEEE80211_MIN_ACTION_SIZE + 1);
 
-	if (skb->len < rtap_vendor_space + sizeof(action) +
+	if (skb->len < rtap_space + sizeof(action) +
 		       VHT_MUMIMO_GROUPS_DATA_LEN)
 		return;
 
 	if (!is_valid_ether_addr(sdata->u.mntr.mu_follow_addr))
 		return;
 
-	skb_copy_bits(skb, rtap_vendor_space, &action, sizeof(action));
+	skb_copy_bits(skb, rtap_space, &action, sizeof(action));
 
 	if (!ieee80211_is_action(action.hdr.frame_control))
 		return;
@@ -545,7 +546,7 @@ static struct sk_buff *
 ieee80211_make_monitor_skb(struct ieee80211_local *local,
 			   struct sk_buff **origskb,
 			   struct ieee80211_rate *rate,
-			   int rtap_vendor_space, bool use_origskb)
+			   int rtap_space, bool use_origskb)
 {
 	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(*origskb);
 	int rt_hdrlen, needed_headroom;
@@ -553,7 +554,7 @@ ieee80211_make_monitor_skb(struct ieee80211_local *local,
 
 	/* room for the radiotap header based on driver features */
 	rt_hdrlen = ieee80211_rx_radiotap_hdrlen(local, status, *origskb);
-	needed_headroom = rt_hdrlen - rtap_vendor_space;
+	needed_headroom = rt_hdrlen - rtap_space;
 
 	if (use_origskb) {
 		/* only need to expand headroom if necessary */
@@ -607,7 +608,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
 	struct ieee80211_sub_if_data *sdata;
 	struct sk_buff *monskb = NULL;
 	int present_fcs_len = 0;
-	unsigned int rtap_vendor_space = 0;
+	unsigned int rtap_space = 0;
 	struct ieee80211_sub_if_data *monitor_sdata =
 		rcu_dereference(local->monitor_sdata);
 	bool only_monitor = false;
@@ -615,7 +616,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
 	if (unlikely(status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)) {
 		struct ieee80211_vendor_radiotap *rtap = (void *)origskb->data;
 
-		rtap_vendor_space = sizeof(*rtap) + rtap->len + rtap->pad;
+		rtap_space += sizeof(*rtap) + rtap->len + rtap->pad;
 	}
 
 	/*
@@ -638,13 +639,12 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
 	}
 
 	/* ensure hdr->frame_control and vendor radiotap data are in skb head */
-	if (!pskb_may_pull(origskb, 2 + rtap_vendor_space)) {
+	if (!pskb_may_pull(origskb, 2 + rtap_space)) {
 		dev_kfree_skb(origskb);
 		return NULL;
 	}
 
-	only_monitor = should_drop_frame(origskb, present_fcs_len,
-					 rtap_vendor_space);
+	only_monitor = should_drop_frame(origskb, present_fcs_len, rtap_space);
 
 	if (!local->monitors || (status->flag & RX_FLAG_SKIP_MONITOR)) {
 		if (only_monitor) {
@@ -652,12 +652,11 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
 			return NULL;
 		}
 
-		remove_monitor_info(origskb, present_fcs_len,
-				    rtap_vendor_space);
+		remove_monitor_info(origskb, present_fcs_len, rtap_space);
 		return origskb;
 	}
 
-	ieee80211_handle_mu_mimo_mon(monitor_sdata, origskb, rtap_vendor_space);
+	ieee80211_handle_mu_mimo_mon(monitor_sdata, origskb, rtap_space);
 
 	list_for_each_entry_rcu(sdata, &local->mon_list, u.mntr.list) {
 		bool last_monitor = list_is_last(&sdata->u.mntr.list,
@@ -665,8 +664,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
 
 		if (!monskb)
 			monskb = ieee80211_make_monitor_skb(local, &origskb,
-							    rate,
-							    rtap_vendor_space,
+							    rate, rtap_space,
 							    only_monitor &&
 							    last_monitor);
 
@@ -698,7 +696,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
 	if (!origskb)
 		return NULL;
 
-	remove_monitor_info(origskb, present_fcs_len, rtap_vendor_space);
+	remove_monitor_info(origskb, present_fcs_len, rtap_space);
 	return origskb;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 35f4962c01757db6cc1dcdf5526ec6e5afcb6245 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 20 Apr 2018 13:49:21 +0300
Subject: mac80211: clean up rate info bandwidth setting

There's no need to do the same thing three times in
the different switch cases, pull that out to a single
place.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/util.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 11f9cfc016d9..2d82c88efd0b 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -2793,12 +2793,13 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
 
 	memset(&ri, 0, sizeof(ri));
 
+	ri.bw = status->bw;
+
 	/* Fill cfg80211 rate info */
 	switch (status->encoding) {
 	case RX_ENC_HT:
 		ri.mcs = status->rate_idx;
 		ri.flags |= RATE_INFO_FLAGS_MCS;
-		ri.bw = status->bw;
 		if (status->enc_flags & RX_ENC_FLAG_SHORT_GI)
 			ri.flags |= RATE_INFO_FLAGS_SHORT_GI;
 		break;
@@ -2806,7 +2807,6 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
 		ri.flags |= RATE_INFO_FLAGS_VHT_MCS;
 		ri.mcs = status->rate_idx;
 		ri.nss = status->nss;
-		ri.bw = status->bw;
 		if (status->enc_flags & RX_ENC_FLAG_SHORT_GI)
 			ri.flags |= RATE_INFO_FLAGS_SHORT_GI;
 		break;
@@ -2818,8 +2818,6 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
 		int shift = 0;
 		int bitrate;
 
-		ri.bw = status->bw;
-
 		switch (status->bw) {
 		case RATE_INFO_BW_10:
 			shift = 1;
-- 
cgit v1.2.3-59-g8ed1b


From bfb27814e3d89a6abf362542bc4b26d84debc5b6 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 20 Apr 2018 13:49:22 +0300
Subject: mac80211: ethtool: memset the whole sinfo struct to 0

Rather than just setting the valid flags to 0 set the
whole struct to 0 since other places might rely on it.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/ethtool.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c
index 9cc986deda61..08408520c3f8 100644
--- a/net/mac80211/ethtool.c
+++ b/net/mac80211/ethtool.c
@@ -4,6 +4,7 @@
  * Copied from cfg.c - originally
  * Copyright 2006-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2014	Intel Corporation (Author: Johannes Berg)
+ * Copyright (C) 2018 Intel Corporation
  *
  * This file is GPLv2 as found in COPYING.
  */
@@ -106,7 +107,7 @@ static void ieee80211_get_stats(struct net_device *dev,
 		if (!(sta && !WARN_ON(sta->sdata->dev != dev)))
 			goto do_survey;
 
-		sinfo.filled = 0;
+		memset(&sinfo, 0, sizeof(sinfo));
 		sta_set_sinfo(sta, &sinfo);
 
 		i = 0;
@@ -133,7 +134,7 @@ static void ieee80211_get_stats(struct net_device *dev,
 			if (sta->sdata->dev != dev)
 				continue;
 
-			sinfo.filled = 0;
+			memset(&sinfo, 0, sizeof(sinfo));
 			sta_set_sinfo(sta, &sinfo);
 			i = 0;
 			ADD_STA_STATS(sta);
-- 
cgit v1.2.3-59-g8ed1b


From a403f3bf6390f085b7864814fb17567718c5a22c Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 20 Apr 2018 13:49:23 +0300
Subject: mac80211: remove pointless flags=0 assignment

The data structure is initialized to all zeroes, and
we already rely on that in other places, so remove the
pointless assignment to 0.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/sta_info.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 655c3d8b0d80..2d5ffb0a16f6 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -3,6 +3,7 @@
  * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright (C) 2015 - 2017 Intel Deutschland GmbH
+ * Copyright (C) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -1992,7 +1993,6 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate,
 		int band = STA_STATS_GET(LEGACY_BAND, rate);
 		int rate_idx = STA_STATS_GET(LEGACY_IDX, rate);
 
-		rinfo->flags = 0;
 		sband = local->hw.wiphy->bands[band];
 		brate = sband->bitrates[rate_idx].bitrate;
 		if (rinfo->bw == RATE_INFO_BW_5)
-- 
cgit v1.2.3-59-g8ed1b


From 03737001e463d458c13510fcb64179b3368aaeee Mon Sep 17 00:00:00 2001
From: Gregory Greenman <gregory.greenman@intel.com>
Date: Fri, 20 Apr 2018 13:49:24 +0300
Subject: mac80211: add api to set CSA counter in mac80211

Sometimes the most updated CSA counter values are known only
to the device. Add an API to pass this data to mac80211.

Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 13 +++++++++++++
 net/mac80211/tx.c      | 25 +++++++++++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index d2279b2d61aa..52f36c43f35f 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4449,6 +4449,19 @@ static inline struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw,
  */
 u8 ieee80211_csa_update_counter(struct ieee80211_vif *vif);
 
+/**
+ * ieee80211_csa_set_counter - request mac80211 to set csa counter
+ * @vif: &struct ieee80211_vif pointer from the add_interface callback.
+ * @counter: the new value for the counter
+ *
+ * The csa counter can be changed by the device, this API should be
+ * used by the device driver to update csa counter in mac80211.
+ *
+ * It should never be used together with ieee80211_csa_update_counter(),
+ * as it will cause a race condition around the counter value.
+ */
+void ieee80211_csa_set_counter(struct ieee80211_vif *vif, u8 counter);
+
 /**
  * ieee80211_csa_finish - notify mac80211 about channel switch
  * @vif: &struct ieee80211_vif pointer from the add_interface callback.
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 535de3161a78..062e125a324c 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -4084,6 +4084,31 @@ unlock:
 }
 EXPORT_SYMBOL(ieee80211_csa_update_counter);
 
+void ieee80211_csa_set_counter(struct ieee80211_vif *vif, u8 counter)
+{
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct beacon_data *beacon = NULL;
+
+	rcu_read_lock();
+
+	if (sdata->vif.type == NL80211_IFTYPE_AP)
+		beacon = rcu_dereference(sdata->u.ap.beacon);
+	else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
+		beacon = rcu_dereference(sdata->u.ibss.presp);
+	else if (ieee80211_vif_is_mesh(&sdata->vif))
+		beacon = rcu_dereference(sdata->u.mesh.beacon);
+
+	if (!beacon)
+		goto unlock;
+
+	if (counter < beacon->csa_current_counter)
+		beacon->csa_current_counter = counter;
+
+unlock:
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL(ieee80211_csa_set_counter);
+
 bool ieee80211_csa_is_complete(struct ieee80211_vif *vif)
 {
 	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
-- 
cgit v1.2.3-59-g8ed1b


From 50f32718e125c3be5b0528bfa3868e88d677d8ce Mon Sep 17 00:00:00 2001
From: Haim Dreyfuss <haim.dreyfuss@intel.com>
Date: Fri, 20 Apr 2018 13:49:26 +0300
Subject: nl80211: Add wmm rule attribute to NL80211_CMD_GET_WIPHY dump command

This will serve userspace entity to maintain its regulatory limitation.
More specifcally APs can use this data to calculate the WMM IE when
building: beacons, probe responses, assoc responses etc...

Signed-off-by: Haim Dreyfuss <haim.dreyfuss@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 28 ++++++++++++++++++++++
 net/wireless/nl80211.c       | 57 ++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 81 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 15daf5e2638d..04c9b97aa5fc 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -11,6 +11,7 @@
  * Copyright 2008 Jouni Malinen <jouni.malinen@atheros.com>
  * Copyright 2008 Colin McCabe <colin@cozybit.com>
  * Copyright 2015-2017	Intel Deutschland GmbH
+ * Copyright (C) 2018 Intel Corporation
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -3141,6 +3142,29 @@ enum nl80211_band_attr {
 
 #define NL80211_BAND_ATTR_HT_CAPA NL80211_BAND_ATTR_HT_CAPA
 
+/**
+ * enum nl80211_wmm_rule - regulatory wmm rule
+ *
+ * @__NL80211_WMMR_INVALID: attribute number 0 is reserved
+ * @NL80211_WMMR_CW_MIN: Minimum contention window slot.
+ * @NL80211_WMMR_CW_MAX: Maximum contention window slot.
+ * @NL80211_WMMR_AIFSN: Arbitration Inter Frame Space.
+ * @NL80211_WMMR_TXOP: Maximum allowed tx operation time.
+ * @nl80211_WMMR_MAX: highest possible wmm rule.
+ * @__NL80211_WMMR_LAST: Internal use.
+ */
+enum nl80211_wmm_rule {
+	__NL80211_WMMR_INVALID,
+	NL80211_WMMR_CW_MIN,
+	NL80211_WMMR_CW_MAX,
+	NL80211_WMMR_AIFSN,
+	NL80211_WMMR_TXOP,
+
+	/* keep last */
+	__NL80211_WMMR_LAST,
+	NL80211_WMMR_MAX = __NL80211_WMMR_LAST - 1
+};
+
 /**
  * enum nl80211_frequency_attr - frequency attributes
  * @__NL80211_FREQUENCY_ATTR_INVALID: attribute number 0 is reserved
@@ -3190,6 +3214,9 @@ enum nl80211_band_attr {
  *	on this channel in current regulatory domain.
  * @NL80211_FREQUENCY_ATTR_NO_10MHZ: 10 MHz operation is not allowed
  *	on this channel in current regulatory domain.
+ * @NL80211_FREQUENCY_ATTR_WMM: this channel has wmm limitations.
+ *	This is a nested attribute that contains the wmm limitation per AC.
+ *	(see &enum nl80211_wmm_rule)
  * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number
  *	currently defined
  * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use
@@ -3218,6 +3245,7 @@ enum nl80211_frequency_attr {
 	NL80211_FREQUENCY_ATTR_IR_CONCURRENT,
 	NL80211_FREQUENCY_ATTR_NO_20MHZ,
 	NL80211_FREQUENCY_ATTR_NO_10MHZ,
+	NL80211_FREQUENCY_ATTR_WMM,
 
 	/* keep last */
 	__NL80211_FREQUENCY_ATTR_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index ff28f8feeb09..016d0a1de576 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4,6 +4,7 @@
  * Copyright 2006-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright 2015-2017	Intel Deutschland GmbH
+ * Copyright (C) 2018 Intel Corporation
  */
 
 #include <linux/if.h>
@@ -645,7 +646,43 @@ static inline void *nl80211hdr_put(struct sk_buff *skb, u32 portid, u32 seq,
 	return genlmsg_put(skb, portid, seq, &nl80211_fam, flags, cmd);
 }
 
-static int nl80211_msg_put_channel(struct sk_buff *msg,
+static int nl80211_msg_put_wmm_rules(struct sk_buff *msg,
+				     const struct ieee80211_reg_rule *rule)
+{
+	int j;
+	struct nlattr *nl_wmm_rules =
+		nla_nest_start(msg, NL80211_FREQUENCY_ATTR_WMM);
+
+	if (!nl_wmm_rules)
+		goto nla_put_failure;
+
+	for (j = 0; j < IEEE80211_NUM_ACS; j++) {
+		struct nlattr *nl_wmm_rule = nla_nest_start(msg, j);
+
+		if (!nl_wmm_rule)
+			goto nla_put_failure;
+
+		if (nla_put_u16(msg, NL80211_WMMR_CW_MIN,
+				rule->wmm_rule->client[j].cw_min) ||
+		    nla_put_u16(msg, NL80211_WMMR_CW_MAX,
+				rule->wmm_rule->client[j].cw_max) ||
+		    nla_put_u8(msg, NL80211_WMMR_AIFSN,
+			       rule->wmm_rule->client[j].aifsn) ||
+		    nla_put_u8(msg, NL80211_WMMR_TXOP,
+			       rule->wmm_rule->client[j].cot))
+			goto nla_put_failure;
+
+		nla_nest_end(msg, nl_wmm_rule);
+	}
+	nla_nest_end(msg, nl_wmm_rules);
+
+	return 0;
+
+nla_put_failure:
+	return -ENOBUFS;
+}
+
+static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy,
 				   struct ieee80211_channel *chan,
 				   bool large)
 {
@@ -721,6 +758,16 @@ static int nl80211_msg_put_channel(struct sk_buff *msg,
 			DBM_TO_MBM(chan->max_power)))
 		goto nla_put_failure;
 
+	if (large) {
+		const struct ieee80211_reg_rule *rule =
+			freq_reg_info(wiphy, chan->center_freq);
+
+		if (!IS_ERR(rule) && rule->wmm_rule) {
+			if (nl80211_msg_put_wmm_rules(msg, rule))
+				goto nla_put_failure;
+		}
+	}
+
 	return 0;
 
  nla_put_failure:
@@ -1631,7 +1678,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
 					chan = &sband->channels[i];
 
 					if (nl80211_msg_put_channel(
-							msg, chan,
+							msg, &rdev->wiphy, chan,
 							state->split))
 						goto nla_put_failure;
 
@@ -14320,7 +14367,8 @@ void nl80211_send_beacon_hint_event(struct wiphy *wiphy,
 	nl_freq = nla_nest_start(msg, NL80211_ATTR_FREQ_BEFORE);
 	if (!nl_freq)
 		goto nla_put_failure;
-	if (nl80211_msg_put_channel(msg, channel_before, false))
+
+	if (nl80211_msg_put_channel(msg, wiphy, channel_before, false))
 		goto nla_put_failure;
 	nla_nest_end(msg, nl_freq);
 
@@ -14328,7 +14376,8 @@ void nl80211_send_beacon_hint_event(struct wiphy *wiphy,
 	nl_freq = nla_nest_start(msg, NL80211_ATTR_FREQ_AFTER);
 	if (!nl_freq)
 		goto nla_put_failure;
-	if (nl80211_msg_put_channel(msg, channel_after, false))
+
+	if (nl80211_msg_put_channel(msg, wiphy, channel_after, false))
 		goto nla_put_failure;
 	nla_nest_end(msg, nl_freq);
 
-- 
cgit v1.2.3-59-g8ed1b


From aced43ce780dc5e683b3de00ce9fb3db7d28e1d3 Mon Sep 17 00:00:00 2001
From: Amar Singhal <asinghal@codeaurora.org>
Date: Thu, 26 Apr 2018 20:13:07 +0300
Subject: cfg80211: Call reg_notifier for self managed hints conditionally

Currently the regulatory core does not call the regulatory callback
reg_notifier for self managed wiphys, but regulatory_hint_user() call is
independent of wiphy and is meant for all wiphys in the system. Even a
self managed wiphy may be interested in regulatory_hint_user() to know
the country code from a trusted regulatory domain change like a cellular
base station. Therefore, for the regulatory source
NL80211_REGDOM_SET_BY_USER and the user hint type
NL80211_USER_REG_HINT_CELL_BASE, call the regulatory notifier.

No current wlan driver uses the REGULATORY_WIPHY_SELF_MANAGED flag while
also registering the reg_notifier regulatory callback, therefore there
will be no impact on existing drivers without them being explicitly
modified to take advantage of this new possibility.

Signed-off-by: Amar Singhal <asinghal@codeaurora.org>
Signed-off-by: Jouni Malinen <jouni@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/reg.c | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index ecfee5f06c76..69cf79165d43 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -2768,6 +2768,21 @@ out_free:
 	reg_free_request(reg_request);
 }
 
+static void notify_self_managed_wiphys(struct regulatory_request *request)
+{
+	struct cfg80211_registered_device *rdev;
+	struct wiphy *wiphy;
+
+	list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
+		wiphy = &rdev->wiphy;
+		if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED &&
+		    request->initiator == NL80211_REGDOM_SET_BY_USER &&
+		    request->user_reg_hint_type ==
+				NL80211_USER_REG_HINT_CELL_BASE)
+			reg_call_notifier(wiphy, request);
+	}
+}
+
 static bool reg_only_self_managed_wiphys(void)
 {
 	struct cfg80211_registered_device *rdev;
@@ -2819,6 +2834,7 @@ static void reg_process_pending_hints(void)
 
 	spin_unlock(&reg_requests_lock);
 
+	notify_self_managed_wiphys(reg_request);
 	if (reg_only_self_managed_wiphys()) {
 		reg_free_request(reg_request);
 		return;
@@ -3698,17 +3714,26 @@ EXPORT_SYMBOL(regulatory_set_wiphy_regd_sync_rtnl);
 
 void wiphy_regulatory_register(struct wiphy *wiphy)
 {
-	struct regulatory_request *lr;
+	struct regulatory_request *lr = get_last_request();
 
-	/* self-managed devices ignore external hints */
-	if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED)
+	/* self-managed devices ignore beacon hints and country IE */
+	if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) {
 		wiphy->regulatory_flags |= REGULATORY_DISABLE_BEACON_HINTS |
 					   REGULATORY_COUNTRY_IE_IGNORE;
 
+		/*
+		 * The last request may have been received before this
+		 * registration call. Call the driver notifier if
+		 * initiator is USER and user type is CELL_BASE.
+		 */
+		if (lr->initiator == NL80211_REGDOM_SET_BY_USER &&
+		    lr->user_reg_hint_type == NL80211_USER_REG_HINT_CELL_BASE)
+			reg_call_notifier(wiphy, lr);
+	}
+
 	if (!reg_dev_ignore_cell_hint(wiphy))
 		reg_num_devs_support_basehint++;
 
-	lr = get_last_request();
 	wiphy_update_regulatory(wiphy, lr->initiator);
 	wiphy_all_share_dfs_chan_state(wiphy);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 81d5439da84419ee35bea54309a9f2c3871b6605 Mon Sep 17 00:00:00 2001
From: Balaji Pothunoori <bpothuno@codeaurora.org>
Date: Mon, 16 Apr 2018 20:18:40 +0530
Subject: cfg80211: average ack rssi support for data frames
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Average ack rssi will be given to userspace via NL80211 interface
if firmware is capable. Userspace tool ‘iw’ can process this
information and give the output as one of the fields in
‘iw dev wlanX station dump’.

Example output :

localhost ~ #iw dev wlan-5000mhz station dump Station
34:f3:9a:aa:3b:29 (on wlan-5000mhz)
        inactive time:  5370 ms
        rx bytes:       85321
        rx packets:     576
        tx bytes:       14225
        tx packets:     71
        tx retries:     0
        tx failed:      2
        beacon loss:    0
        rx drop misc:   0
        signal:         -54 dBm
        signal avg:     -53 dBm
        tx bitrate:     866.7 MBit/s VHT-MCS 9 80MHz short GI VHT-NSS 2
        rx bitrate:     866.7 MBit/s VHT-MCS 9 80MHz short GI VHT-NSS 2
        avg ack signal: -56 dBm
        authorized:     yes
        authenticated:  yes
        associated:     yes
        preamble:       short
        WMM/WME:        yes
        MFP:            no
        TDLS peer:      no
        DTIM period:    2
        beacon interval:100
       short preamble: yes
       short slot time:yes
       connected time: 203 seconds

Main use case is to measure the signal strength of a connected station
to AP. Data packet transmit rates and bandwidth used by station can vary
a lot even if the station is at fixed location, especially if the rates
used are multi stream(2stream, 3stream) rates with different bandwidth(20/40/80 Mhz).
These multi stream rates are sensitive and station can use different transmit power
for each of the rate and bandwidth combinations. RSSI measured from these RX packets
on AP will be not stable and can vary a lot with in a short time.
Whereas 802.11 ack frames from station are sent relatively at a constant
rate (6/12/24 Mbps) with constant bandwidth(20 Mhz).
So average rssi of the ack packets is good and more accurate.

Signed-off-by: Balaji Pothunoori <bpothuno@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 3 +++
 include/uapi/linux/nl80211.h | 7 +++++++
 net/wireless/nl80211.c       | 3 +++
 3 files changed, 13 insertions(+)

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 250dac390806..5e888ec62811 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1152,6 +1152,8 @@ struct cfg80211_tid_stats {
  * @pertid: per-TID statistics, see &struct cfg80211_tid_stats, using the last
  *	(IEEE80211_NUM_TIDS) index for MSDUs not encapsulated in QoS-MPDUs.
  * @ack_signal: signal strength (in dBm) of the last ACK frame.
+ * @avg_ack_signal: average rssi value of ack packet for the no of msdu's has
+ *	been sent.
  */
 struct station_info {
 	u64 filled;
@@ -1197,6 +1199,7 @@ struct station_info {
 	u8 rx_beacon_signal_avg;
 	struct cfg80211_tid_stats pertid[IEEE80211_NUM_TIDS + 1];
 	s8 ack_signal;
+	s8 avg_ack_signal;
 };
 
 #if IS_ENABLED(CONFIG_CFG80211)
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 04c9b97aa5fc..8e55b63ed3ff 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2981,6 +2981,8 @@ enum nl80211_sta_bss_param {
  *	received from the station (u64, usec)
  * @NL80211_STA_INFO_PAD: attribute used for padding for 64-bit alignment
  * @NL80211_STA_INFO_ACK_SIGNAL: signal strength of the last ACK frame(u8, dBm)
+ * @NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG: avg signal strength of (data)
+ *	ACK frame (s8, dBm)
  * @__NL80211_STA_INFO_AFTER_LAST: internal
  * @NL80211_STA_INFO_MAX: highest possible station info attribute
  */
@@ -3020,6 +3022,7 @@ enum nl80211_sta_info {
 	NL80211_STA_INFO_RX_DURATION,
 	NL80211_STA_INFO_PAD,
 	NL80211_STA_INFO_ACK_SIGNAL,
+	NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG,
 
 	/* keep last */
 	__NL80211_STA_INFO_AFTER_LAST,
@@ -5066,6 +5069,9 @@ enum nl80211_feature_flags {
  *	"radar detected" event.
  * @NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211: Driver supports sending and
  *	receiving control port frames over nl80211 instead of the netdevice.
+ * @NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT: This Driver support data ack
+ *	rssi if firmware support, this flag is to intimate about ack rssi
+ *	support to nl80211.
  *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
@@ -5098,6 +5104,7 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN,
 	NL80211_EXT_FEATURE_DFS_OFFLOAD,
 	NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211,
+	NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 016d0a1de576..6b942a68d1c8 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4541,6 +4541,9 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
 	PUT_SINFO_U64(BEACON_RX, rx_beacon);
 	PUT_SINFO(BEACON_SIGNAL_AVG, rx_beacon_signal_avg, u8);
 	PUT_SINFO(ACK_SIGNAL, ack_signal, u8);
+	if (wiphy_ext_feature_isset(&rdev->wiphy,
+				    NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT))
+		PUT_SINFO(DATA_ACK_SIGNAL_AVG, avg_ack_signal, s8);
 
 #undef PUT_SINFO
 #undef PUT_SINFO_U64
-- 
cgit v1.2.3-59-g8ed1b


From cc60dbbfed8ff0bd4c530ee48e9e915333a35470 Mon Sep 17 00:00:00 2001
From: Balaji Pothunoori <bpothuno@codeaurora.org>
Date: Mon, 16 Apr 2018 20:18:41 +0530
Subject: mac80211: average ack rssi support for data frames

The driver will process the RSSI if available and send it to mac80211.
mac80211 will compute the weighted average of ack RSSI for stations.

Signed-off-by: Balaji Pothunoori <bpothuno@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/sta_info.c | 10 ++++++++++
 net/mac80211/sta_info.h |  2 ++
 net/mac80211/status.c   |  2 ++
 3 files changed, 14 insertions(+)

diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 2d5ffb0a16f6..f83e6e2ad5ab 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -358,6 +358,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
 
 	sta->last_connected = ktime_get_seconds();
 	ewma_signal_init(&sta->rx_stats_avg.signal);
+	ewma_avg_signal_init(&sta->status_stats.avg_ack_signal);
 	for (i = 0; i < ARRAY_SIZE(sta->rx_stats_avg.chain_signal); i++)
 		ewma_signal_init(&sta->rx_stats_avg.chain_signal[i]);
 
@@ -2294,6 +2295,15 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
 		sinfo->ack_signal = sta->status_stats.last_ack_signal;
 		sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL);
 	}
+
+	if (ieee80211_hw_check(&sta->local->hw, REPORTS_TX_ACK_STATUS) &&
+	    !(sinfo->filled & BIT_ULL(NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG))) {
+		sinfo->avg_ack_signal =
+			-(s8)ewma_avg_signal_read(
+				&sta->status_stats.avg_ack_signal);
+		sinfo->filled |=
+			BIT_ULL(NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG);
+	}
 }
 
 u32 sta_get_expected_throughput(struct sta_info *sta)
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index f64eb86ca64b..d79bd6eeb549 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -119,6 +119,7 @@ enum ieee80211_sta_info_flags {
 #define HT_AGG_STATE_START_CB		6
 #define HT_AGG_STATE_STOP_CB		7
 
+DECLARE_EWMA(avg_signal, 10, 8)
 enum ieee80211_agg_stop_reason {
 	AGG_STOP_DECLINED,
 	AGG_STOP_LOCAL_REQUEST,
@@ -550,6 +551,7 @@ struct sta_info {
 		unsigned long last_ack;
 		s8 last_ack_signal;
 		bool ack_signal_filled;
+		struct ewma_avg_signal avg_ack_signal;
 	} status_stats;
 
 	/* Updated from TX path only, no locking requirements */
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 743e89c5926c..9a6d7208bf4f 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -195,6 +195,8 @@ static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb)
 			sta->status_stats.last_ack_signal =
 					 (s8)txinfo->status.ack_signal;
 			sta->status_stats.ack_signal_filled = true;
+			ewma_avg_signal_add(&sta->status_stats.avg_ack_signal,
+					    -txinfo->status.ack_signal);
 		}
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 6328c95ce7cd70f8a5b99e1d04050a061dd27223 Mon Sep 17 00:00:00 2001
From: Anders Roxell <anders.roxell@linaro.org>
Date: Fri, 4 May 2018 11:17:25 +0200
Subject: selftests: net: add udpgso* to TEST_GEN_FILES

The generated files udpgso* shouldn't be part of TEST_PROGS, they are
used by udpgso.sh and udpgsp_bench.sh. They should be added to the
TEST_GEN_FILES to get installed without being added to the main
run_kselftest.sh script.

Fixes: 3a687bef148d ("selftests: udp gso benchmark")
Signed-off-by: Anders Roxell <anders.roxell@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 902820d3e848..73af45773938 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -11,9 +11,9 @@ TEST_GEN_PROGS_EXTENDED := in_netns.sh
 TEST_GEN_FILES =  socket
 TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy
 TEST_GEN_FILES += tcp_mmap tcp_inq
+TEST_GEN_FILES += udpgso udpgso_bench_tx udpgso_bench_rx
 TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
 TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict
-TEST_GEN_PROGS += udpgso udpgso_bench_tx udpgso_bench_rx
 
 include ../lib.mk
 
-- 
cgit v1.2.3-59-g8ed1b


From c36a68fab1c3ddf71918cd1e5937a647cd80ba2b Mon Sep 17 00:00:00 2001
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Fri, 4 May 2018 17:17:46 +0200
Subject: net: u64_stats_sync: Remove functions without user

Commit 67db3e4bfbc9 ("tcp: no longer hold ehash lock while calling
tcp_get_info()") removes the only users of u64_stats_update_end/begin_raw()
without removing the function in header file.

Remove no longer used functions.

Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/u64_stats_sync.h | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h
index 07ee0f84a46c..a27604f99ed0 100644
--- a/include/linux/u64_stats_sync.h
+++ b/include/linux/u64_stats_sync.h
@@ -112,20 +112,6 @@ u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp,
 #endif
 }
 
-static inline void u64_stats_update_begin_raw(struct u64_stats_sync *syncp)
-{
-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
-	raw_write_seqcount_begin(&syncp->seq);
-#endif
-}
-
-static inline void u64_stats_update_end_raw(struct u64_stats_sync *syncp)
-{
-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
-	raw_write_seqcount_end(&syncp->seq);
-#endif
-}
-
 static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
 {
 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
-- 
cgit v1.2.3-59-g8ed1b


From 577b995be5f5235e7bfe48536c72a5d1f0084f6b Mon Sep 17 00:00:00 2001
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Fri, 4 May 2018 17:17:47 +0200
Subject: net: 3com: 3c59x: Move boomerang/vortex conditional into function

If vp->full_bus_master_tx is set, vp->full_bus_master_rx is set as well
(see vortex_probe1()). Therefore the conditionals for the decision if
boomerang or vortex ISR is executed have the same result. Instead of
repeating the explicit conditional execution of the boomerang/vortex ISR,
move it into an own function.

No functional change.

Cc: Steffen Klassert <klassert@mathematik.tu-chemnitz.de>
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/3com/3c59x.c | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c
index 36c8950dbd2d..0cfdb07f3e59 100644
--- a/drivers/net/ethernet/3com/3c59x.c
+++ b/drivers/net/ethernet/3com/3c59x.c
@@ -765,8 +765,9 @@ static netdev_tx_t boomerang_start_xmit(struct sk_buff *skb,
 					struct net_device *dev);
 static int vortex_rx(struct net_device *dev);
 static int boomerang_rx(struct net_device *dev);
-static irqreturn_t vortex_interrupt(int irq, void *dev_id);
-static irqreturn_t boomerang_interrupt(int irq, void *dev_id);
+static irqreturn_t vortex_boomerang_interrupt(int irq, void *dev_id);
+static irqreturn_t _vortex_interrupt(int irq, struct net_device *dev);
+static irqreturn_t _boomerang_interrupt(int irq, struct net_device *dev);
 static int vortex_close(struct net_device *dev);
 static void dump_tx_ring(struct net_device *dev);
 static void update_stats(void __iomem *ioaddr, struct net_device *dev);
@@ -838,10 +839,9 @@ MODULE_PARM_DESC(use_mmio, "3c59x: use memory-mapped PCI I/O resource (0-1)");
 #ifdef CONFIG_NET_POLL_CONTROLLER
 static void poll_vortex(struct net_device *dev)
 {
-	struct vortex_private *vp = netdev_priv(dev);
 	unsigned long flags;
 	local_irq_save(flags);
-	(vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev);
+	vortex_boomerang_interrupt(dev->irq, dev);
 	local_irq_restore(flags);
 }
 #endif
@@ -1729,8 +1729,7 @@ vortex_open(struct net_device *dev)
 	dma_addr_t dma;
 
 	/* Use the now-standard shared IRQ implementation. */
-	if ((retval = request_irq(dev->irq, vp->full_bus_master_rx ?
-				boomerang_interrupt : vortex_interrupt, IRQF_SHARED, dev->name, dev))) {
+	if ((retval = request_irq(dev->irq, vortex_boomerang_interrupt, IRQF_SHARED, dev->name, dev))) {
 		pr_err("%s: Could not reserve IRQ %d\n", dev->name, dev->irq);
 		goto err;
 	}
@@ -1911,10 +1910,7 @@ static void vortex_tx_timeout(struct net_device *dev)
 			 */
 			unsigned long flags;
 			local_irq_save(flags);
-			if (vp->full_bus_master_tx)
-				boomerang_interrupt(dev->irq, dev);
-			else
-				vortex_interrupt(dev->irq, dev);
+			vortex_boomerang_interrupt(dev->irq, dev);
 			local_irq_restore(flags);
 		}
 	}
@@ -2267,9 +2263,8 @@ out_dma_err:
  */
 
 static irqreturn_t
-vortex_interrupt(int irq, void *dev_id)
+_vortex_interrupt(int irq, struct net_device *dev)
 {
-	struct net_device *dev = dev_id;
 	struct vortex_private *vp = netdev_priv(dev);
 	void __iomem *ioaddr;
 	int status;
@@ -2386,9 +2381,8 @@ handler_exit:
  */
 
 static irqreturn_t
-boomerang_interrupt(int irq, void *dev_id)
+_boomerang_interrupt(int irq, struct net_device *dev)
 {
-	struct net_device *dev = dev_id;
 	struct vortex_private *vp = netdev_priv(dev);
 	void __iomem *ioaddr;
 	int status;
@@ -2526,6 +2520,18 @@ handler_exit:
 	return IRQ_RETVAL(handled);
 }
 
+static irqreturn_t
+vortex_boomerang_interrupt(int irq, void *dev_id)
+{
+	struct net_device *dev = dev_id;
+	struct vortex_private *vp = netdev_priv(dev);
+
+	if (vp->full_bus_master_rx)
+		return _boomerang_interrupt(dev->irq, dev);
+	else
+		return _vortex_interrupt(dev->irq, dev);
+}
+
 static int vortex_rx(struct net_device *dev)
 {
 	struct vortex_private *vp = netdev_priv(dev);
-- 
cgit v1.2.3-59-g8ed1b


From 47f66c766c8d91fe0a20b85359ce13e467863697 Mon Sep 17 00:00:00 2001
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Fri, 4 May 2018 17:17:48 +0200
Subject: net: 3com: 3c59x: Pull locking out of ISR

Locking is done in the same way in _vortex_interrupt() and
_boomerang_interrupt(). To prevent duplication, move the locking into the
calling vortex_boomerang_interrupt() function.

No functional change.

Cc: Steffen Klassert <klassert@mathematik.tu-chemnitz.de>
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/3com/3c59x.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c
index 0cfdb07f3e59..fdafe25da153 100644
--- a/drivers/net/ethernet/3com/3c59x.c
+++ b/drivers/net/ethernet/3com/3c59x.c
@@ -2273,7 +2273,6 @@ _vortex_interrupt(int irq, struct net_device *dev)
 	unsigned int bytes_compl = 0, pkts_compl = 0;
 
 	ioaddr = vp->ioaddr;
-	spin_lock(&vp->lock);
 
 	status = ioread16(ioaddr + EL3_STATUS);
 
@@ -2371,7 +2370,6 @@ _vortex_interrupt(int irq, struct net_device *dev)
 		pr_debug("%s: exiting interrupt, status %4.4x.\n",
 			   dev->name, status);
 handler_exit:
-	spin_unlock(&vp->lock);
 	return IRQ_RETVAL(handled);
 }
 
@@ -2392,12 +2390,6 @@ _boomerang_interrupt(int irq, struct net_device *dev)
 
 	ioaddr = vp->ioaddr;
 
-
-	/*
-	 * It seems dopey to put the spinlock this early, but we could race against vortex_tx_timeout
-	 * and boomerang_start_xmit
-	 */
-	spin_lock(&vp->lock);
 	vp->handling_irq = 1;
 
 	status = ioread16(ioaddr + EL3_STATUS);
@@ -2516,7 +2508,6 @@ _boomerang_interrupt(int irq, struct net_device *dev)
 			   dev->name, status);
 handler_exit:
 	vp->handling_irq = 0;
-	spin_unlock(&vp->lock);
 	return IRQ_RETVAL(handled);
 }
 
@@ -2525,11 +2516,18 @@ vortex_boomerang_interrupt(int irq, void *dev_id)
 {
 	struct net_device *dev = dev_id;
 	struct vortex_private *vp = netdev_priv(dev);
+	irqreturn_t ret;
+
+	spin_lock(&vp->lock);
 
 	if (vp->full_bus_master_rx)
-		return _boomerang_interrupt(dev->irq, dev);
+		ret = _boomerang_interrupt(dev->irq, dev);
 	else
-		return _vortex_interrupt(dev->irq, dev);
+		ret = _vortex_interrupt(dev->irq, dev);
+
+	spin_unlock(&vp->lock);
+
+	return ret;
 }
 
 static int vortex_rx(struct net_device *dev)
-- 
cgit v1.2.3-59-g8ed1b


From dea4c96e65b3c5520f1e9d9e038e79d59f5688f1 Mon Sep 17 00:00:00 2001
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Fri, 4 May 2018 17:17:49 +0200
Subject: net: 3com: 3c59x: irq save variant of ISR

When vortex_boomerang_interrupt() is invoked from vortex_tx_timeout() or
poll_vortex() interrupts must be disabled. This detaches the interrupt
disable logic from locking which requires patching for PREEMPT_RT.

The advantage of avoiding spin_lock_irqsave() in the interrupt handler is
minimal, but converting it removes all the extra code for callers which
come not from interrupt context.

Cc: Steffen Klassert <klassert@mathematik.tu-chemnitz.de>
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/3com/3c59x.c | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c
index fdafe25da153..cabbe227bb98 100644
--- a/drivers/net/ethernet/3com/3c59x.c
+++ b/drivers/net/ethernet/3com/3c59x.c
@@ -839,10 +839,7 @@ MODULE_PARM_DESC(use_mmio, "3c59x: use memory-mapped PCI I/O resource (0-1)");
 #ifdef CONFIG_NET_POLL_CONTROLLER
 static void poll_vortex(struct net_device *dev)
 {
-	unsigned long flags;
-	local_irq_save(flags);
 	vortex_boomerang_interrupt(dev->irq, dev);
-	local_irq_restore(flags);
 }
 #endif
 
@@ -1904,15 +1901,7 @@ static void vortex_tx_timeout(struct net_device *dev)
 		pr_err("%s: Interrupt posted but not delivered --"
 			   " IRQ blocked by another device?\n", dev->name);
 		/* Bad idea here.. but we might as well handle a few events. */
-		{
-			/*
-			 * Block interrupts because vortex_interrupt does a bare spin_lock()
-			 */
-			unsigned long flags;
-			local_irq_save(flags);
-			vortex_boomerang_interrupt(dev->irq, dev);
-			local_irq_restore(flags);
-		}
+		vortex_boomerang_interrupt(dev->irq, dev);
 	}
 
 	if (vortex_debug > 0)
@@ -2516,16 +2505,17 @@ vortex_boomerang_interrupt(int irq, void *dev_id)
 {
 	struct net_device *dev = dev_id;
 	struct vortex_private *vp = netdev_priv(dev);
+	unsigned long flags;
 	irqreturn_t ret;
 
-	spin_lock(&vp->lock);
+	spin_lock_irqsave(&vp->lock, flags);
 
 	if (vp->full_bus_master_rx)
 		ret = _boomerang_interrupt(dev->irq, dev);
 	else
 		ret = _vortex_interrupt(dev->irq, dev);
 
-	spin_unlock(&vp->lock);
+	spin_unlock_irqrestore(&vp->lock, flags);
 
 	return ret;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 18b338f5f9539512e76fd9ebd4c6ca1a0e159e2b Mon Sep 17 00:00:00 2001
From: Weilin Chang <weilin.chang@cavium.com>
Date: Fri, 4 May 2018 11:07:19 -0700
Subject: liquidio: support use of ethtool to set link speed of CN23XX-225
 cards

Support setting the link speed of CN23XX-225 cards (which can do 25Gbps or
10Gbps) via ethtool_ops.set_link_ksettings.

Also fix the function assigned to ethtool_ops.get_link_ksettings to use the
new link_ksettings api completely (instead of partially via
ethtool_convert_legacy_u32_to_link_mode).

Signed-off-by: Weilin Chang <weilin.chang@cavium.com>
Acked-by: Raghu Vatsavayi <raghu.vatsavayi@cavium.com>
Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/cavium/liquidio/lio_core.c    | 196 +++++++++++++++++++++
 drivers/net/ethernet/cavium/liquidio/lio_ethtool.c | 195 +++++++++++++++++---
 drivers/net/ethernet/cavium/liquidio/lio_main.c    |  20 +++
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c |   5 +
 .../net/ethernet/cavium/liquidio/liquidio_common.h |   4 +
 .../net/ethernet/cavium/liquidio/octeon_device.h   |  14 ++
 .../net/ethernet/cavium/liquidio/octeon_network.h  |  15 ++
 7 files changed, 425 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_core.c b/drivers/net/ethernet/cavium/liquidio/lio_core.c
index 6821afcdc365..8093c5eafea2 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_core.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_core.c
@@ -1481,3 +1481,199 @@ int octnet_get_link_stats(struct net_device *netdev)
 
 	return 0;
 }
+
+static void liquidio_nic_seapi_ctl_callback(struct octeon_device *oct,
+					    u32 status,
+					    void *buf)
+{
+	struct liquidio_nic_seapi_ctl_context *ctx;
+	struct octeon_soft_command *sc = buf;
+
+	ctx = sc->ctxptr;
+
+	oct = lio_get_device(ctx->octeon_id);
+	if (status) {
+		dev_err(&oct->pci_dev->dev, "%s: instruction failed. Status: %llx\n",
+			__func__,
+			CVM_CAST64(status));
+	}
+	ctx->status = status;
+	complete(&ctx->complete);
+}
+
+int liquidio_set_speed(struct lio *lio, int speed)
+{
+	struct liquidio_nic_seapi_ctl_context *ctx;
+	struct octeon_device *oct = lio->oct_dev;
+	struct oct_nic_seapi_resp *resp;
+	struct octeon_soft_command *sc;
+	union octnet_cmd *ncmd;
+	u32 ctx_size;
+	int retval;
+	u32 var;
+
+	if (oct->speed_setting == speed)
+		return 0;
+
+	if (!OCTEON_CN23XX_PF(oct)) {
+		dev_err(&oct->pci_dev->dev, "%s: SET SPEED only for PF\n",
+			__func__);
+		return -EOPNOTSUPP;
+	}
+
+	ctx_size = sizeof(struct liquidio_nic_seapi_ctl_context);
+	sc = octeon_alloc_soft_command(oct, OCTNET_CMD_SIZE,
+				       sizeof(struct oct_nic_seapi_resp),
+				       ctx_size);
+	if (!sc)
+		return -ENOMEM;
+
+	ncmd = sc->virtdptr;
+	ctx  = sc->ctxptr;
+	resp = sc->virtrptr;
+	memset(resp, 0, sizeof(struct oct_nic_seapi_resp));
+
+	ctx->octeon_id = lio_get_device_id(oct);
+	ctx->status = 0;
+	init_completion(&ctx->complete);
+
+	ncmd->u64 = 0;
+	ncmd->s.cmd = SEAPI_CMD_SPEED_SET;
+	ncmd->s.param1 = speed;
+
+	octeon_swap_8B_data((u64 *)ncmd, (OCTNET_CMD_SIZE >> 3));
+
+	sc->iq_no = lio->linfo.txpciq[0].s.q_no;
+
+	octeon_prepare_soft_command(oct, sc, OPCODE_NIC,
+				    OPCODE_NIC_UBOOT_CTL, 0, 0, 0);
+
+	sc->callback = liquidio_nic_seapi_ctl_callback;
+	sc->callback_arg = sc;
+	sc->wait_time = 5000;
+
+	retval = octeon_send_soft_command(oct, sc);
+	if (retval == IQ_SEND_FAILED) {
+		dev_info(&oct->pci_dev->dev, "Failed to send soft command\n");
+		retval = -EBUSY;
+	} else {
+		/* Wait for response or timeout */
+		if (wait_for_completion_timeout(&ctx->complete,
+						msecs_to_jiffies(10000)) == 0) {
+			dev_err(&oct->pci_dev->dev, "%s: sc timeout\n",
+				__func__);
+			octeon_free_soft_command(oct, sc);
+			return -EINTR;
+		}
+
+		retval = resp->status;
+
+		if (retval) {
+			dev_err(&oct->pci_dev->dev, "%s failed, retval=%d\n",
+				__func__, retval);
+			octeon_free_soft_command(oct, sc);
+			return -EIO;
+		}
+
+		var = be32_to_cpu((__force __be32)resp->speed);
+		if (var != speed) {
+			dev_err(&oct->pci_dev->dev,
+				"%s: setting failed speed= %x, expect %x\n",
+				__func__, var, speed);
+		}
+
+		oct->speed_setting = var;
+	}
+
+	octeon_free_soft_command(oct, sc);
+
+	return retval;
+}
+
+int liquidio_get_speed(struct lio *lio)
+{
+	struct liquidio_nic_seapi_ctl_context *ctx;
+	struct octeon_device *oct = lio->oct_dev;
+	struct oct_nic_seapi_resp *resp;
+	struct octeon_soft_command *sc;
+	union octnet_cmd *ncmd;
+	u32 ctx_size;
+	int retval;
+
+	ctx_size = sizeof(struct liquidio_nic_seapi_ctl_context);
+	sc = octeon_alloc_soft_command(oct, OCTNET_CMD_SIZE,
+				       sizeof(struct oct_nic_seapi_resp),
+				       ctx_size);
+	if (!sc)
+		return -ENOMEM;
+
+	ncmd = sc->virtdptr;
+	ctx  = sc->ctxptr;
+	resp = sc->virtrptr;
+	memset(resp, 0, sizeof(struct oct_nic_seapi_resp));
+
+	ctx->octeon_id = lio_get_device_id(oct);
+	ctx->status = 0;
+	init_completion(&ctx->complete);
+
+	ncmd->u64 = 0;
+	ncmd->s.cmd = SEAPI_CMD_SPEED_GET;
+
+	octeon_swap_8B_data((u64 *)ncmd, (OCTNET_CMD_SIZE >> 3));
+
+	sc->iq_no = lio->linfo.txpciq[0].s.q_no;
+
+	octeon_prepare_soft_command(oct, sc, OPCODE_NIC,
+				    OPCODE_NIC_UBOOT_CTL, 0, 0, 0);
+
+	sc->callback = liquidio_nic_seapi_ctl_callback;
+	sc->callback_arg = sc;
+	sc->wait_time = 5000;
+
+	retval = octeon_send_soft_command(oct, sc);
+	if (retval == IQ_SEND_FAILED) {
+		dev_info(&oct->pci_dev->dev, "Failed to send soft command\n");
+		oct->no_speed_setting = 1;
+		oct->speed_setting = 25;
+
+		retval = -EBUSY;
+	} else {
+		if (wait_for_completion_timeout(&ctx->complete,
+						msecs_to_jiffies(10000)) == 0) {
+			dev_err(&oct->pci_dev->dev, "%s: sc timeout\n",
+				__func__);
+
+			oct->speed_setting = 25;
+			oct->no_speed_setting = 1;
+
+			octeon_free_soft_command(oct, sc);
+
+			return -EINTR;
+		}
+		retval = resp->status;
+		if (retval) {
+			dev_err(&oct->pci_dev->dev,
+				"%s failed retval=%d\n", __func__, retval);
+			oct->no_speed_setting = 1;
+			oct->speed_setting = 25;
+			octeon_free_soft_command(oct, sc);
+			retval = -EIO;
+		} else {
+			u32 var;
+
+			var = be32_to_cpu((__force __be32)resp->speed);
+			oct->speed_setting = var;
+			if (var == 0xffff) {
+				oct->no_speed_setting = 1;
+				/* unable to access boot variables
+				 * get the default value based on the NIC type
+				 */
+				oct->speed_setting = 25;
+			}
+		}
+	}
+
+	octeon_free_soft_command(oct, sc);
+
+	return retval;
+}
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
index a1d84304a608..06f7449c569d 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
@@ -230,46 +230,147 @@ static int lio_get_link_ksettings(struct net_device *netdev,
 	struct lio *lio = GET_LIO(netdev);
 	struct octeon_device *oct = lio->oct_dev;
 	struct oct_link_info *linfo;
-	u32 supported = 0, advertising = 0;
 
 	linfo = &lio->linfo;
 
+	ethtool_link_ksettings_zero_link_mode(ecmd, supported);
+	ethtool_link_ksettings_zero_link_mode(ecmd, advertising);
+
 	switch (linfo->link.s.phy_type) {
 	case LIO_PHY_PORT_TP:
 		ecmd->base.port = PORT_TP;
-		supported = (SUPPORTED_10000baseT_Full |
-			     SUPPORTED_TP | SUPPORTED_Pause);
-		advertising = (ADVERTISED_10000baseT_Full | ADVERTISED_Pause);
 		ecmd->base.autoneg = AUTONEG_DISABLE;
+		ethtool_link_ksettings_add_link_mode(ecmd, supported, TP);
+		ethtool_link_ksettings_add_link_mode(ecmd, supported, Pause);
+		ethtool_link_ksettings_add_link_mode(ecmd, supported,
+						     10000baseT_Full);
+
+		ethtool_link_ksettings_add_link_mode(ecmd, advertising, Pause);
+		ethtool_link_ksettings_add_link_mode(ecmd, advertising,
+						     10000baseT_Full);
+
 		break;
 
 	case LIO_PHY_PORT_FIBRE:
-		ecmd->base.port = PORT_FIBRE;
-
-		if (linfo->link.s.speed == SPEED_10000) {
-			supported = SUPPORTED_10000baseT_Full;
-			advertising = ADVERTISED_10000baseT_Full;
+		if (linfo->link.s.if_mode == INTERFACE_MODE_XAUI ||
+		    linfo->link.s.if_mode == INTERFACE_MODE_RXAUI ||
+		    linfo->link.s.if_mode == INTERFACE_MODE_XLAUI ||
+		    linfo->link.s.if_mode == INTERFACE_MODE_XFI) {
+			dev_dbg(&oct->pci_dev->dev, "ecmd->base.transceiver is XCVR_EXTERNAL\n");
+		} else {
+			dev_err(&oct->pci_dev->dev, "Unknown link interface mode: %d\n",
+				linfo->link.s.if_mode);
 		}
 
-		supported |= SUPPORTED_FIBRE | SUPPORTED_Pause;
-		advertising |= ADVERTISED_Pause;
+		ecmd->base.port = PORT_FIBRE;
 		ecmd->base.autoneg = AUTONEG_DISABLE;
+		ethtool_link_ksettings_add_link_mode(ecmd, supported, FIBRE);
+
+		ethtool_link_ksettings_add_link_mode(ecmd, supported, Pause);
+		ethtool_link_ksettings_add_link_mode(ecmd, advertising, Pause);
+		if (oct->subsystem_id == OCTEON_CN2350_25GB_SUBSYS_ID ||
+		    oct->subsystem_id == OCTEON_CN2360_25GB_SUBSYS_ID) {
+			if (OCTEON_CN23XX_PF(oct)) {
+				ethtool_link_ksettings_add_link_mode
+					(ecmd, supported, 25000baseSR_Full);
+				ethtool_link_ksettings_add_link_mode
+					(ecmd, supported, 25000baseKR_Full);
+				ethtool_link_ksettings_add_link_mode
+					(ecmd, supported, 25000baseCR_Full);
+
+				if (oct->no_speed_setting == 0)  {
+					ethtool_link_ksettings_add_link_mode
+						(ecmd, supported,
+						 10000baseSR_Full);
+					ethtool_link_ksettings_add_link_mode
+						(ecmd, supported,
+						 10000baseKR_Full);
+					ethtool_link_ksettings_add_link_mode
+						(ecmd, supported,
+						 10000baseCR_Full);
+				}
+
+				if (oct->no_speed_setting == 0)
+					liquidio_get_speed(lio);
+				else
+					oct->speed_setting = 25;
+
+				if (oct->speed_setting == 10) {
+					ethtool_link_ksettings_add_link_mode
+						(ecmd, advertising,
+						 10000baseSR_Full);
+					ethtool_link_ksettings_add_link_mode
+						(ecmd, advertising,
+						 10000baseKR_Full);
+					ethtool_link_ksettings_add_link_mode
+						(ecmd, advertising,
+						 10000baseCR_Full);
+				}
+				if (oct->speed_setting == 25) {
+					ethtool_link_ksettings_add_link_mode
+						(ecmd, advertising,
+						 25000baseSR_Full);
+					ethtool_link_ksettings_add_link_mode
+						(ecmd, advertising,
+						 25000baseKR_Full);
+					ethtool_link_ksettings_add_link_mode
+						(ecmd, advertising,
+						 25000baseCR_Full);
+				}
+			} else { /* VF */
+				if (linfo->link.s.speed == 10000) {
+					ethtool_link_ksettings_add_link_mode
+						(ecmd, supported,
+						 10000baseSR_Full);
+					ethtool_link_ksettings_add_link_mode
+						(ecmd, supported,
+						 10000baseKR_Full);
+					ethtool_link_ksettings_add_link_mode
+						(ecmd, supported,
+						 10000baseCR_Full);
+
+					ethtool_link_ksettings_add_link_mode
+						(ecmd, advertising,
+						 10000baseSR_Full);
+					ethtool_link_ksettings_add_link_mode
+						(ecmd, advertising,
+						 10000baseKR_Full);
+					ethtool_link_ksettings_add_link_mode
+						(ecmd, advertising,
+						 10000baseCR_Full);
+				}
+
+				if (linfo->link.s.speed == 25000) {
+					ethtool_link_ksettings_add_link_mode
+						(ecmd, supported,
+						 25000baseSR_Full);
+					ethtool_link_ksettings_add_link_mode
+						(ecmd, supported,
+						 25000baseKR_Full);
+					ethtool_link_ksettings_add_link_mode
+						(ecmd, supported,
+						 25000baseCR_Full);
+
+					ethtool_link_ksettings_add_link_mode
+						(ecmd, advertising,
+						 25000baseSR_Full);
+					ethtool_link_ksettings_add_link_mode
+						(ecmd, advertising,
+						 25000baseKR_Full);
+					ethtool_link_ksettings_add_link_mode
+						(ecmd, advertising,
+						 25000baseCR_Full);
+				}
+			}
+		} else {
+			ethtool_link_ksettings_add_link_mode(ecmd, supported,
+							     10000baseT_Full);
+			ethtool_link_ksettings_add_link_mode(ecmd, advertising,
+							     10000baseT_Full);
+		}
 		break;
 	}
 
-	if (linfo->link.s.if_mode == INTERFACE_MODE_XAUI ||
-	    linfo->link.s.if_mode == INTERFACE_MODE_RXAUI ||
-	    linfo->link.s.if_mode == INTERFACE_MODE_XLAUI ||
-	    linfo->link.s.if_mode == INTERFACE_MODE_XFI) {
-		ethtool_convert_legacy_u32_to_link_mode(
-			ecmd->link_modes.supported, supported);
-		ethtool_convert_legacy_u32_to_link_mode(
-			ecmd->link_modes.advertising, advertising);
-	} else {
-		dev_err(&oct->pci_dev->dev, "Unknown link interface reported %d\n",
-			linfo->link.s.if_mode);
-	}
-
 	if (linfo->link.s.link_up) {
 		ecmd->base.speed = linfo->link.s.speed;
 		ecmd->base.duplex = linfo->link.s.duplex;
@@ -281,6 +382,51 @@ static int lio_get_link_ksettings(struct net_device *netdev,
 	return 0;
 }
 
+static int lio_set_link_ksettings(struct net_device *netdev,
+				  const struct ethtool_link_ksettings *ecmd)
+{
+	const int speed = ecmd->base.speed;
+	struct lio *lio = GET_LIO(netdev);
+	struct oct_link_info *linfo;
+	struct octeon_device *oct;
+	u32 is25G = 0;
+
+	oct = lio->oct_dev;
+
+	linfo = &lio->linfo;
+
+	if (oct->subsystem_id == OCTEON_CN2350_25GB_SUBSYS_ID ||
+	    oct->subsystem_id == OCTEON_CN2360_25GB_SUBSYS_ID) {
+		is25G = 1;
+	} else {
+		return -EOPNOTSUPP;
+	}
+
+	if (oct->no_speed_setting) {
+		dev_err(&oct->pci_dev->dev, "%s: Changing speed is not supported\n",
+			__func__);
+		return -EOPNOTSUPP;
+	}
+
+	if ((ecmd->base.duplex != DUPLEX_UNKNOWN &&
+	     ecmd->base.duplex != linfo->link.s.duplex) ||
+	     ecmd->base.autoneg != AUTONEG_DISABLE ||
+	    (ecmd->base.speed != 10000 && ecmd->base.speed != 25000 &&
+	     ecmd->base.speed != SPEED_UNKNOWN))
+		return -EOPNOTSUPP;
+
+	if ((oct->speed_boot == speed / 1000) &&
+	    oct->speed_boot == oct->speed_setting)
+		return 0;
+
+	liquidio_set_speed(lio, speed / 1000);
+
+	dev_dbg(&oct->pci_dev->dev, "Port speed is set to %dG\n",
+		oct->speed_setting);
+
+	return 0;
+}
+
 static void
 lio_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo)
 {
@@ -2966,6 +3112,7 @@ static int lio_set_priv_flags(struct net_device *netdev, u32 flags)
 
 static const struct ethtool_ops lio_ethtool_ops = {
 	.get_link_ksettings	= lio_get_link_ksettings,
+	.set_link_ksettings	= lio_set_link_ksettings,
 	.get_link		= ethtool_op_get_link,
 	.get_drvinfo		= lio_get_drvinfo,
 	.get_ringparam		= lio_ethtool_get_ringparam,
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index ee75048b3937..e500528ad751 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -912,6 +912,9 @@ liquidio_probe(struct pci_dev *pdev,
 	/* set linux specific device pointer */
 	oct_dev->pci_dev = (void *)pdev;
 
+	oct_dev->subsystem_id = pdev->subsystem_vendor |
+		(pdev->subsystem_device << 16);
+
 	hs = &handshake[oct_dev->octeon_id];
 	init_completion(&hs->init);
 	init_completion(&hs->started);
@@ -3664,6 +3667,23 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
 			"NIC ifidx:%d Setup successful\n", i);
 
 		octeon_free_soft_command(octeon_dev, sc);
+
+		if (octeon_dev->subsystem_id ==
+			OCTEON_CN2350_25GB_SUBSYS_ID ||
+		    octeon_dev->subsystem_id ==
+			OCTEON_CN2360_25GB_SUBSYS_ID) {
+			liquidio_get_speed(lio);
+
+			if (octeon_dev->speed_setting == 0) {
+				octeon_dev->speed_setting = 25;
+				octeon_dev->no_speed_setting = 1;
+			}
+		} else {
+			octeon_dev->no_speed_setting = 1;
+			octeon_dev->speed_setting = 10;
+		}
+		octeon_dev->speed_boot = octeon_dev->speed_setting;
+
 	}
 
 	devlink = devlink_alloc(&liquidio_devlink_ops,
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index 6295eeec795c..7fa0212873ac 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -411,6 +411,9 @@ liquidio_vf_probe(struct pci_dev *pdev,
 	/* set linux specific device pointer */
 	oct_dev->pci_dev = pdev;
 
+	oct_dev->subsystem_id = pdev->subsystem_vendor |
+		(pdev->subsystem_device << 16);
+
 	if (octeon_device_init(oct_dev)) {
 		liquidio_vf_remove(pdev);
 		return -ENOMEM;
@@ -2199,6 +2202,8 @@ static int setup_nic_devices(struct octeon_device *octeon_dev)
 			"NIC ifidx:%d Setup successful\n", i);
 
 		octeon_free_soft_command(octeon_dev, sc);
+
+		octeon_dev->no_speed_setting = 1;
 	}
 
 	return 0;
diff --git a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
index 026570499dcf..285b24836974 100644
--- a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
+++ b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
@@ -93,6 +93,7 @@ enum octeon_tag_type {
 
 #define OPCODE_NIC_VF_REP_PKT          0x15
 #define OPCODE_NIC_VF_REP_CMD          0x16
+#define OPCODE_NIC_UBOOT_CTL           0x17
 
 #define CORE_DRV_TEST_SCATTER_OP    0xFFF5
 
@@ -249,6 +250,9 @@ static inline void add_sg_size(struct octeon_sg_entry *sg_entry,
 #define   OCTNET_CMD_VLAN_FILTER_ENABLE 0x1
 #define   OCTNET_CMD_VLAN_FILTER_DISABLE 0x0
 
+#define   SEAPI_CMD_SPEED_SET           0x2
+#define   SEAPI_CMD_SPEED_GET           0x3
+
 #define   LIO_CMD_WAIT_TM 100
 
 /* RX(packets coming from wire) Checksum verification flags */
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.h b/drivers/net/ethernet/cavium/liquidio/octeon_device.h
index 9430c0a0bb8c..94a4ed88d618 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_device.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.h
@@ -43,6 +43,13 @@
 #define  OCTEON_CN23XX_REV_1_1        0x01
 #define  OCTEON_CN23XX_REV_2_0        0x80
 
+/**SubsystemId for the chips */
+#define	 OCTEON_CN2350_10GB_SUBSYS_ID_1	0X3177d
+#define	 OCTEON_CN2350_10GB_SUBSYS_ID_2	0X4177d
+#define	 OCTEON_CN2360_10GB_SUBSYS_ID	0X5177d
+#define	 OCTEON_CN2350_25GB_SUBSYS_ID	0X7177d
+#define	 OCTEON_CN2360_25GB_SUBSYS_ID	0X6177d
+
 /** Endian-swap modes supported by Octeon. */
 enum octeon_pci_swap_mode {
 	OCTEON_PCI_PASSTHROUGH = 0,
@@ -430,6 +437,8 @@ struct octeon_device {
 
 	u16 rev_id;
 
+	u32 subsystem_id;
+
 	u16 pf_num;
 
 	u16 vf_num;
@@ -584,6 +593,11 @@ struct octeon_device {
 	struct lio_vf_rep_list vf_rep_list;
 	struct devlink *devlink;
 	enum devlink_eswitch_mode eswitch_mode;
+
+	/* for 25G NIC speed change */
+	u8  speed_boot;
+	u8  speed_setting;
+	u8  no_speed_setting;
 };
 
 #define  OCT_DRV_ONLINE 1
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_network.h b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
index 8571f11e3c8f..dd3177a526d2 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_network.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
@@ -81,6 +81,18 @@ struct oct_nic_stats_ctrl {
 	struct net_device *netdev;
 };
 
+struct oct_nic_seapi_resp {
+	u64 rh;
+	u32 speed;
+	u64 status;
+};
+
+struct liquidio_nic_seapi_ctl_context {
+	int octeon_id;
+	u32 status;
+	struct completion complete;
+};
+
 /** LiquidIO per-interface network private data */
 struct lio {
 	/** State of the interface. Rx/Tx happens only in the RUNNING state.  */
@@ -230,6 +242,9 @@ void lio_delete_glists(struct lio *lio);
 
 int lio_setup_glists(struct octeon_device *oct, struct lio *lio, int num_qs);
 
+int liquidio_get_speed(struct lio *lio);
+int liquidio_set_speed(struct lio *lio, int speed);
+
 /**
  * \brief Net device change_mtu
  * @param netdev network device
-- 
cgit v1.2.3-59-g8ed1b


From 84a2fba2cb5e90b3b6f6e72676d8e3918725d9e5 Mon Sep 17 00:00:00 2001
From: Sun Lianwen <sunlw.fnst@cn.fujitsu.com>
Date: Sat, 5 May 2018 09:08:18 +0800
Subject: vlan: correct the file path in vlan_dev_change_flags() comment

The vlan_flags enum is defined in include/uapi/linux/if_vlan.h file.
not in include/linux/if_vlan.h file.

Signed-off-by: Sun Lianwen <sunlw.fnst@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlan_dev.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 236452ebbd9e..546af0e73ac3 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -215,7 +215,9 @@ int vlan_dev_set_egress_priority(const struct net_device *dev,
 	return 0;
 }
 
-/* Flags are defined in the vlan_flags enum in include/linux/if_vlan.h file. */
+/* Flags are defined in the vlan_flags enum in
+ * include/uapi/linux/if_vlan.h file.
+ */
 int vlan_dev_change_flags(const struct net_device *dev, u32 flags, u32 mask)
 {
 	struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
-- 
cgit v1.2.3-59-g8ed1b


From 3a443bd6dd7c43bf5763779309514bf3e7c1c3eb Mon Sep 17 00:00:00 2001
From: Sun Lianwen <sunlw.fnst@cn.fujitsu.com>
Date: Sat, 5 May 2018 11:29:16 +0800
Subject: net/9p: correct the variable name in v9fs_get_trans_by_name() comment

The v9fs_get_trans_by_name(char *s) variable name is not "name" but "s".

Signed-off-by: Sun Lianwen <sunlw.fnst@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/9p/mod.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/9p/mod.c b/net/9p/mod.c
index 6ab36aea7727..eb9777f05755 100644
--- a/net/9p/mod.c
+++ b/net/9p/mod.c
@@ -104,7 +104,7 @@ EXPORT_SYMBOL(v9fs_unregister_trans);
 
 /**
  * v9fs_get_trans_by_name - get transport with the matching name
- * @name: string identifying transport
+ * @s: string identifying transport
  *
  */
 struct p9_trans_module *v9fs_get_trans_by_name(char *s)
-- 
cgit v1.2.3-59-g8ed1b


From 0bc5fe857274133ca028ebb15ff2e8549a369916 Mon Sep 17 00:00:00 2001
From: Sudarsana Reddy Kalluru <sudarsana.kalluru@cavium.com>
Date: Sat, 5 May 2018 18:42:59 -0700
Subject: qed*: Refactor mf_mode to consist of bits.

`mf_mode' field indicates the multi-partitioning mode the device is
configured to. This method doesn't scale very well, adding a new MF mode
requires going over all the existing conditions, and deciding whether those
are needed for the new mode or not.
The patch defines a set of bit-fields for modes which are derived according
to the mode info shared by the MFW and all the configuration would be made
according to those. To add a new mode, there would be a single place where
we'll need to go and choose which bits apply and which don't.

Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed.h             | 41 ++++++++++++++++++++---
 drivers/net/ethernet/qlogic/qed/qed_dev.c         | 39 +++++++++++----------
 drivers/net/ethernet/qlogic/qed/qed_ll2.c         |  6 ++--
 drivers/net/ethernet/qlogic/qed/qed_main.c        |  6 ++--
 drivers/net/ethernet/qlogic/qed/qed_sp.h          |  3 +-
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c | 16 +++------
 drivers/net/ethernet/qlogic/qede/qede_main.c      |  4 +--
 include/linux/qed/qed_if.h                        |  2 +-
 8 files changed, 71 insertions(+), 46 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index e07460a68d30..c8f3507d1151 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -439,6 +439,41 @@ struct qed_fw_data {
 	u32			init_ops_size;
 };
 
+enum qed_mf_mode_bit {
+	/* Supports PF-classification based on tag */
+	QED_MF_OVLAN_CLSS,
+
+	/* Supports PF-classification based on MAC */
+	QED_MF_LLH_MAC_CLSS,
+
+	/* Supports PF-classification based on protocol type */
+	QED_MF_LLH_PROTO_CLSS,
+
+	/* Requires a default PF to be set */
+	QED_MF_NEED_DEF_PF,
+
+	/* Allow LL2 to multicast/broadcast */
+	QED_MF_LL2_NON_UNICAST,
+
+	/* Allow Cross-PF [& child VFs] Tx-switching */
+	QED_MF_INTER_PF_SWITCH,
+
+	/* Unified Fabtic Port support enabled */
+	QED_MF_UFP_SPECIFIC,
+
+	/* Disable Accelerated Receive Flow Steering (aRFS) */
+	QED_MF_DISABLE_ARFS,
+
+	/* Use vlan for steering */
+	QED_MF_8021Q_TAGGING,
+
+	/* Use stag for steering */
+	QED_MF_8021AD_TAGGING,
+
+	/* Allow DSCP to TC mapping */
+	QED_MF_DSCP_TO_TC_MAP,
+};
+
 enum BAR_ID {
 	BAR_ID_0,		/* used for GRC */
 	BAR_ID_1		/* Used for doorbells */
@@ -669,10 +704,8 @@ struct qed_dev {
 	u8				num_funcs_in_port;
 
 	u8				path_id;
-	enum qed_mf_mode		mf_mode;
-#define IS_MF_DEFAULT(_p_hwfn)  (((_p_hwfn)->cdev)->mf_mode == QED_MF_DEFAULT)
-#define IS_MF_SI(_p_hwfn)       (((_p_hwfn)->cdev)->mf_mode == QED_MF_NPAR)
-#define IS_MF_SD(_p_hwfn)       (((_p_hwfn)->cdev)->mf_mode == QED_MF_OVLAN)
+
+	unsigned long			mf_bits;
 
 	int				pcie_width;
 	int				pcie_speed;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index d2ad5e92c74f..9b07d7f25042 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -1149,18 +1149,10 @@ static int qed_calc_hw_mode(struct qed_hwfn *p_hwfn)
 		return -EINVAL;
 	}
 
-	switch (p_hwfn->cdev->mf_mode) {
-	case QED_MF_DEFAULT:
-	case QED_MF_NPAR:
-		hw_mode |= 1 << MODE_MF_SI;
-		break;
-	case QED_MF_OVLAN:
+	if (test_bit(QED_MF_OVLAN_CLSS, &p_hwfn->cdev->mf_bits))
 		hw_mode |= 1 << MODE_MF_SD;
-		break;
-	default:
-		DP_NOTICE(p_hwfn, "Unsupported MF mode, init as DEFAULT\n");
+	else
 		hw_mode |= 1 << MODE_MF_SI;
-	}
 
 	hw_mode |= 1 << MODE_ASIC;
 
@@ -1557,7 +1549,6 @@ static int qed_hw_init_pf(struct qed_hwfn *p_hwfn,
 
 		/* send function start command */
 		rc = qed_sp_pf_start(p_hwfn, p_ptt, p_tunn,
-				     p_hwfn->cdev->mf_mode,
 				     allow_npar_tx_switch);
 		if (rc) {
 			DP_NOTICE(p_hwfn, "Function start ramrod failed\n");
@@ -2651,17 +2642,25 @@ static int qed_hw_get_nvm_info(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 
 	switch (mf_mode) {
 	case NVM_CFG1_GLOB_MF_MODE_MF_ALLOWED:
-		p_hwfn->cdev->mf_mode = QED_MF_OVLAN;
+		p_hwfn->cdev->mf_bits = BIT(QED_MF_OVLAN_CLSS);
 		break;
 	case NVM_CFG1_GLOB_MF_MODE_NPAR1_0:
-		p_hwfn->cdev->mf_mode = QED_MF_NPAR;
+		p_hwfn->cdev->mf_bits = BIT(QED_MF_LLH_MAC_CLSS) |
+					BIT(QED_MF_LLH_PROTO_CLSS) |
+					BIT(QED_MF_LL2_NON_UNICAST) |
+					BIT(QED_MF_INTER_PF_SWITCH);
 		break;
 	case NVM_CFG1_GLOB_MF_MODE_DEFAULT:
-		p_hwfn->cdev->mf_mode = QED_MF_DEFAULT;
+		p_hwfn->cdev->mf_bits = BIT(QED_MF_LLH_MAC_CLSS) |
+					BIT(QED_MF_LLH_PROTO_CLSS) |
+					BIT(QED_MF_LL2_NON_UNICAST);
+		if (QED_IS_BB(p_hwfn->cdev))
+			p_hwfn->cdev->mf_bits |= BIT(QED_MF_NEED_DEF_PF);
 		break;
 	}
-	DP_INFO(p_hwfn, "Multi function mode is %08x\n",
-		p_hwfn->cdev->mf_mode);
+
+	DP_INFO(p_hwfn, "Multi function mode is 0x%lx\n",
+		p_hwfn->cdev->mf_bits);
 
 	/* Read Multi-function information from shmem */
 	addr = MCP_REG_SCRATCH + nvm_cfg1_offset +
@@ -3462,7 +3461,7 @@ int qed_llh_add_mac_filter(struct qed_hwfn *p_hwfn,
 	u32 high = 0, low = 0, en;
 	int i;
 
-	if (!(IS_MF_SI(p_hwfn) || IS_MF_DEFAULT(p_hwfn)))
+	if (!test_bit(QED_MF_LLH_MAC_CLSS, &p_hwfn->cdev->mf_bits))
 		return 0;
 
 	qed_llh_mac_to_filter(&high, &low, p_filter);
@@ -3507,7 +3506,7 @@ void qed_llh_remove_mac_filter(struct qed_hwfn *p_hwfn,
 	u32 high = 0, low = 0;
 	int i;
 
-	if (!(IS_MF_SI(p_hwfn) || IS_MF_DEFAULT(p_hwfn)))
+	if (!test_bit(QED_MF_LLH_MAC_CLSS, &p_hwfn->cdev->mf_bits))
 		return;
 
 	qed_llh_mac_to_filter(&high, &low, p_filter);
@@ -3549,7 +3548,7 @@ qed_llh_add_protocol_filter(struct qed_hwfn *p_hwfn,
 	u32 high = 0, low = 0, en;
 	int i;
 
-	if (!(IS_MF_SI(p_hwfn) || IS_MF_DEFAULT(p_hwfn)))
+	if (!test_bit(QED_MF_LLH_PROTO_CLSS, &p_hwfn->cdev->mf_bits))
 		return 0;
 
 	switch (type) {
@@ -3647,7 +3646,7 @@ qed_llh_remove_protocol_filter(struct qed_hwfn *p_hwfn,
 	u32 high = 0, low = 0;
 	int i;
 
-	if (!(IS_MF_SI(p_hwfn) || IS_MF_DEFAULT(p_hwfn)))
+	if (!test_bit(QED_MF_LLH_PROTO_CLSS, &p_hwfn->cdev->mf_bits))
 		return;
 
 	switch (type) {
diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
index 38502815d681..6c942c133b2c 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@@ -922,9 +922,9 @@ static int qed_sp_ll2_rx_queue_start(struct qed_hwfn *p_hwfn,
 	p_ramrod->queue_id = p_ll2_conn->queue_id;
 	p_ramrod->main_func_queue = p_ll2_conn->main_func_queue ? 1 : 0;
 
-	if ((IS_MF_DEFAULT(p_hwfn) || IS_MF_SI(p_hwfn)) &&
-	    p_ramrod->main_func_queue && (conn_type != QED_LL2_TYPE_ROCE) &&
-	    (conn_type != QED_LL2_TYPE_IWARP)) {
+	if (test_bit(QED_MF_LL2_NON_UNICAST, &p_hwfn->cdev->mf_bits) &&
+	    p_ramrod->main_func_queue && conn_type != QED_LL2_TYPE_ROCE &&
+	    conn_type != QED_LL2_TYPE_IWARP) {
 		p_ramrod->mf_si_bcast_accept_all = 1;
 		p_ramrod->mf_si_mcast_accept_all = 1;
 	} else {
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index d1d3787affe8..307fe33d75c8 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -264,7 +264,8 @@ int qed_fill_dev_info(struct qed_dev *cdev,
 	dev_info->pci_mem_end = cdev->pci_params.mem_end;
 	dev_info->pci_irq = cdev->pci_params.irq;
 	dev_info->rdma_supported = QED_IS_RDMA_PERSONALITY(p_hwfn);
-	dev_info->is_mf_default = IS_MF_DEFAULT(&cdev->hwfns[0]);
+	dev_info->is_mf_default = !test_bit(QED_MF_LLH_MAC_CLSS,
+					    &cdev->mf_bits);
 	dev_info->dev_type = cdev->type;
 	ether_addr_copy(dev_info->hw_mac, hw_info->hw_mac_addr);
 
@@ -273,7 +274,8 @@ int qed_fill_dev_info(struct qed_dev *cdev,
 		dev_info->fw_minor = FW_MINOR_VERSION;
 		dev_info->fw_rev = FW_REVISION_VERSION;
 		dev_info->fw_eng = FW_ENGINEERING_VERSION;
-		dev_info->mf_mode = cdev->mf_mode;
+		dev_info->b_inter_pf_switch = test_bit(QED_MF_INTER_PF_SWITCH,
+						       &cdev->mf_bits);
 		dev_info->tx_switching = true;
 
 		if (hw_info->b_wol_support == QED_WOL_SUPPORT_PME)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp.h b/drivers/net/ethernet/qlogic/qed/qed_sp.h
index ab4ad8a1e2a5..768022235451 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp.h
@@ -416,7 +416,6 @@ int qed_sp_init_request(struct qed_hwfn *p_hwfn,
  * @param p_hwfn
  * @param p_ptt
  * @param p_tunn
- * @param mode
  * @param allow_npar_tx_switch
  *
  * @return int
@@ -425,7 +424,7 @@ int qed_sp_init_request(struct qed_hwfn *p_hwfn,
 int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 		    struct qed_ptt *p_ptt,
 		    struct qed_tunnel_info *p_tunn,
-		    enum qed_mf_mode mode, bool allow_npar_tx_switch);
+		    bool allow_npar_tx_switch);
 
 /**
  * @brief qed_sp_pf_update - PF Function Update Ramrod
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index 5e927b6cac22..fbb317257629 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -306,7 +306,7 @@ qed_tunn_set_pf_start_params(struct qed_hwfn *p_hwfn,
 int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 		    struct qed_ptt *p_ptt,
 		    struct qed_tunnel_info *p_tunn,
-		    enum qed_mf_mode mode, bool allow_npar_tx_switch)
+		    bool allow_npar_tx_switch)
 {
 	struct pf_start_ramrod_data *p_ramrod = NULL;
 	u16 sb = qed_int_get_sp_sb_id(p_hwfn);
@@ -339,18 +339,10 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 	p_ramrod->dont_log_ramrods	= 0;
 	p_ramrod->log_type_mask		= cpu_to_le16(0xf);
 
-	switch (mode) {
-	case QED_MF_DEFAULT:
-	case QED_MF_NPAR:
-		p_ramrod->mf_mode = MF_NPAR;
-		break;
-	case QED_MF_OVLAN:
+	if (test_bit(QED_MF_OVLAN_CLSS, &p_hwfn->cdev->mf_bits))
 		p_ramrod->mf_mode = MF_OVLAN;
-		break;
-	default:
-		DP_NOTICE(p_hwfn, "Unsupported MF mode, init as DEFAULT\n");
+	else
 		p_ramrod->mf_mode = MF_NPAR;
-	}
 
 	p_ramrod->outer_tag_config.outer_tag.tci =
 		cpu_to_le16(p_hwfn->hw_info.ovlan);
@@ -365,7 +357,7 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 
 	qed_tunn_set_pf_start_params(p_hwfn, p_tunn, &p_ramrod->tunnel_config);
 
-	if (IS_MF_SI(p_hwfn))
+	if (test_bit(QED_MF_INTER_PF_SWITCH, &p_hwfn->cdev->mf_bits))
 		p_ramrod->allow_npar_tx_switching = allow_npar_tx_switch;
 
 	switch (p_hwfn->hw_info.personality) {
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index a01e7d6e5442..89c581c3c21a 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -199,7 +199,7 @@ static int qede_sriov_configure(struct pci_dev *pdev, int num_vfs_param)
 
 	/* Enable/Disable Tx switching for PF */
 	if ((rc == num_vfs_param) && netif_running(edev->ndev) &&
-	    qed_info->mf_mode != QED_MF_NPAR && qed_info->tx_switching) {
+	    !qed_info->b_inter_pf_switch && qed_info->tx_switching) {
 		vport_params->vport_id = 0;
 		vport_params->update_tx_switching_flg = 1;
 		vport_params->tx_switching_flg = num_vfs_param ? 1 : 0;
@@ -1928,7 +1928,7 @@ static int qede_start_queues(struct qede_dev *edev, bool clear_stats)
 	vport_update_params->update_vport_active_flg = 1;
 	vport_update_params->vport_active_flg = 1;
 
-	if ((qed_info->mf_mode == QED_MF_NPAR || pci_num_vf(edev->pdev)) &&
+	if ((qed_info->b_inter_pf_switch || pci_num_vf(edev->pdev)) &&
 	    qed_info->tx_switching) {
 		vport_update_params->update_tx_switching_flg = 1;
 		vport_update_params->tx_switching_flg = 1;
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index e53f9c7c2809..5dac56147a07 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -359,7 +359,7 @@ struct qed_dev_info {
 #define QED_MFW_VERSION_3_OFFSET	24
 
 	u32		flash_size;
-	u8		mf_mode;
+	bool		b_inter_pf_switch;
 	bool		tx_switching;
 	bool		rdma_supported;
 	u16		mtu;
-- 
cgit v1.2.3-59-g8ed1b


From 27bf96e32c92599dc7523b36d6c761fc8312c8c0 Mon Sep 17 00:00:00 2001
From: Sudarsana Reddy Kalluru <sudarsana.kalluru@cavium.com>
Date: Sat, 5 May 2018 18:43:00 -0700
Subject: qed: Remove unused data member 'is_mf_default'.

The data member 'is_mf_default' is not used by the qed/qede drivers,
removing the same.

Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_main.c | 2 --
 include/linux/qed/qed_if.h                 | 1 -
 2 files changed, 3 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 307fe33d75c8..70bc5634675c 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -264,8 +264,6 @@ int qed_fill_dev_info(struct qed_dev *cdev,
 	dev_info->pci_mem_end = cdev->pci_params.mem_end;
 	dev_info->pci_irq = cdev->pci_params.irq;
 	dev_info->rdma_supported = QED_IS_RDMA_PERSONALITY(p_hwfn);
-	dev_info->is_mf_default = !test_bit(QED_MF_LLH_MAC_CLSS,
-					    &cdev->mf_bits);
 	dev_info->dev_type = cdev->type;
 	ether_addr_copy(dev_info->hw_mac, hw_info->hw_mac_addr);
 
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 5dac56147a07..907976fd56f7 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -339,7 +339,6 @@ struct qed_dev_info {
 	u8		num_hwfns;
 
 	u8		hw_mac[ETH_ALEN];
-	bool		is_mf_default;
 
 	/* FW version */
 	u16		fw_major;
-- 
cgit v1.2.3-59-g8ed1b


From b51bdfb9cbe2ecf99a4c45c48c6286963344786c Mon Sep 17 00:00:00 2001
From: Sudarsana Reddy Kalluru <sudarsana.kalluru@cavium.com>
Date: Sat, 5 May 2018 18:43:01 -0700
Subject: qed: Add support for multi function mode with 802.1ad tagging.

The patch adds support for new Multi function mode wherein the traffic
classification is done based on the 802.1ad tagging and the outer vlan tag
provided by the management firmware.

Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_dev.c         | 64 ++++++++++++++++-------
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c |  5 ++
 2 files changed, 49 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 9b07d7f25042..95d00cbe1a7f 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -1668,6 +1668,18 @@ int qed_hw_init(struct qed_dev *cdev, struct qed_hw_init_params *p_params)
 		if (rc)
 			return rc;
 
+		if (IS_PF(cdev) && test_bit(QED_MF_8021AD_TAGGING,
+					    &cdev->mf_bits)) {
+			STORE_RT_REG(p_hwfn, PRS_REG_TAG_ETHERTYPE_0_RT_OFFSET,
+				     ETH_P_8021AD);
+			STORE_RT_REG(p_hwfn, NIG_REG_TAG_ETHERTYPE_0_RT_OFFSET,
+				     ETH_P_8021AD);
+			STORE_RT_REG(p_hwfn, PBF_REG_TAG_ETHERTYPE_0_RT_OFFSET,
+				     ETH_P_8021AD);
+			STORE_RT_REG(p_hwfn, DORQ_REG_TAG1_ETHERTYPE_RT_OFFSET,
+				     ETH_P_8021AD);
+		}
+
 		qed_fill_load_req_params(&load_req_params,
 					 p_params->p_drv_load_params);
 		rc = qed_mcp_load_req(p_hwfn, p_hwfn->p_main_ptt,
@@ -2630,39 +2642,51 @@ static int qed_hw_get_nvm_info(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 		   link->pause.autoneg,
 		   p_caps->default_eee, p_caps->eee_lpi_timer);
 
-	/* Read Multi-function information from shmem */
-	addr = MCP_REG_SCRATCH + nvm_cfg1_offset +
-	       offsetof(struct nvm_cfg1, glob) +
-	       offsetof(struct nvm_cfg1_glob, generic_cont0);
+	if (IS_LEAD_HWFN(p_hwfn)) {
+		struct qed_dev *cdev = p_hwfn->cdev;
 
-	generic_cont0 = qed_rd(p_hwfn, p_ptt, addr);
+		/* Read Multi-function information from shmem */
+		addr = MCP_REG_SCRATCH + nvm_cfg1_offset +
+		       offsetof(struct nvm_cfg1, glob) +
+		       offsetof(struct nvm_cfg1_glob, generic_cont0);
 
-	mf_mode = (generic_cont0 & NVM_CFG1_GLOB_MF_MODE_MASK) >>
-		  NVM_CFG1_GLOB_MF_MODE_OFFSET;
+		generic_cont0 = qed_rd(p_hwfn, p_ptt, addr);
 
-	switch (mf_mode) {
-	case NVM_CFG1_GLOB_MF_MODE_MF_ALLOWED:
-		p_hwfn->cdev->mf_bits = BIT(QED_MF_OVLAN_CLSS);
-		break;
-	case NVM_CFG1_GLOB_MF_MODE_NPAR1_0:
-		p_hwfn->cdev->mf_bits = BIT(QED_MF_LLH_MAC_CLSS) |
+		mf_mode = (generic_cont0 & NVM_CFG1_GLOB_MF_MODE_MASK) >>
+			  NVM_CFG1_GLOB_MF_MODE_OFFSET;
+
+		switch (mf_mode) {
+		case NVM_CFG1_GLOB_MF_MODE_MF_ALLOWED:
+			cdev->mf_bits = BIT(QED_MF_OVLAN_CLSS);
+			break;
+		case NVM_CFG1_GLOB_MF_MODE_BD:
+			cdev->mf_bits = BIT(QED_MF_OVLAN_CLSS) |
+					BIT(QED_MF_LLH_PROTO_CLSS) |
+					BIT(QED_MF_8021AD_TAGGING);
+			break;
+		case NVM_CFG1_GLOB_MF_MODE_NPAR1_0:
+			cdev->mf_bits = BIT(QED_MF_LLH_MAC_CLSS) |
 					BIT(QED_MF_LLH_PROTO_CLSS) |
 					BIT(QED_MF_LL2_NON_UNICAST) |
 					BIT(QED_MF_INTER_PF_SWITCH);
-		break;
-	case NVM_CFG1_GLOB_MF_MODE_DEFAULT:
-		p_hwfn->cdev->mf_bits = BIT(QED_MF_LLH_MAC_CLSS) |
+			break;
+		case NVM_CFG1_GLOB_MF_MODE_DEFAULT:
+			cdev->mf_bits = BIT(QED_MF_LLH_MAC_CLSS) |
 					BIT(QED_MF_LLH_PROTO_CLSS) |
 					BIT(QED_MF_LL2_NON_UNICAST);
-		if (QED_IS_BB(p_hwfn->cdev))
-			p_hwfn->cdev->mf_bits |= BIT(QED_MF_NEED_DEF_PF);
-		break;
+			if (QED_IS_BB(p_hwfn->cdev))
+				cdev->mf_bits |= BIT(QED_MF_NEED_DEF_PF);
+			break;
+		}
+
+		DP_INFO(p_hwfn, "Multi function mode is 0x%lx\n",
+			cdev->mf_bits);
 	}
 
 	DP_INFO(p_hwfn, "Multi function mode is 0x%lx\n",
 		p_hwfn->cdev->mf_bits);
 
-	/* Read Multi-function information from shmem */
+	/* Read device capabilities information from shmem */
 	addr = MCP_REG_SCRATCH + nvm_cfg1_offset +
 		offsetof(struct nvm_cfg1, glob) +
 		offsetof(struct nvm_cfg1_glob, device_capabilities);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index fbb317257629..26bed26b9071 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -346,6 +346,11 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 
 	p_ramrod->outer_tag_config.outer_tag.tci =
 		cpu_to_le16(p_hwfn->hw_info.ovlan);
+	if (test_bit(QED_MF_8021AD_TAGGING, &p_hwfn->cdev->mf_bits)) {
+		p_ramrod->outer_tag_config.outer_tag.tpid = ETH_P_8021AD;
+		p_ramrod->outer_tag_config.enable_stag_pri_change = 1;
+	}
+
 
 	/* Place EQ address in RAMROD */
 	DMA_REGPAIR_LE(p_ramrod->event_ring_pbl_addr,
-- 
cgit v1.2.3-59-g8ed1b


From cac6f691546b9efd50c31c0db97fe50d0357104a Mon Sep 17 00:00:00 2001
From: Sudarsana Reddy Kalluru <sudarsana.kalluru@cavium.com>
Date: Sat, 5 May 2018 18:43:02 -0700
Subject: qed: Add support for Unified Fabric Port.

This patch adds driver changes for supporting the Unified Fabric Port
(UFP). This is a new paritioning mode wherein MFW provides the set of
parameters to be used by the device such as traffic class, outer-vlan
tag value, priority type etc. Drivers receives this info via notifications
from mfw and configures the hardware accordingly.

Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed.h             | 20 ++++++
 drivers/net/ethernet/qlogic/qed/qed_dcbx.c        | 14 +++-
 drivers/net/ethernet/qlogic/qed/qed_dev.c         | 32 ++++++++--
 drivers/net/ethernet/qlogic/qed/qed_fcoe.c        |  3 +
 drivers/net/ethernet/qlogic/qed/qed_hsi.h         | 28 ++++++++
 drivers/net/ethernet/qlogic/qed/qed_ll2.c         | 40 ++++++++----
 drivers/net/ethernet/qlogic/qed/qed_mcp.c         | 78 +++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_mcp.h         |  8 +++
 drivers/net/ethernet/qlogic/qed/qed_sp.h          |  9 +++
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c | 57 ++++++++++++++++-
 drivers/scsi/qedf/qedf_fip.c                      |  7 +-
 drivers/scsi/qedf/qedf_main.c                     |  2 +-
 drivers/scsi/qedi/qedi_iscsi.c                    |  2 +-
 include/linux/qed/qed_ll2_if.h                    | 10 ++-
 14 files changed, 283 insertions(+), 27 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index c8f3507d1151..adcff495466e 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -474,6 +474,24 @@ enum qed_mf_mode_bit {
 	QED_MF_DSCP_TO_TC_MAP,
 };
 
+enum qed_ufp_mode {
+	QED_UFP_MODE_ETS,
+	QED_UFP_MODE_VNIC_BW,
+	QED_UFP_MODE_UNKNOWN
+};
+
+enum qed_ufp_pri_type {
+	QED_UFP_PRI_OS,
+	QED_UFP_PRI_VNIC,
+	QED_UFP_PRI_UNKNOWN
+};
+
+struct qed_ufp_info {
+	enum qed_ufp_pri_type pri_type;
+	enum qed_ufp_mode mode;
+	u8 tc;
+};
+
 enum BAR_ID {
 	BAR_ID_0,		/* used for GRC */
 	BAR_ID_1		/* Used for doorbells */
@@ -582,6 +600,8 @@ struct qed_hwfn {
 
 	struct qed_dcbx_info		*p_dcbx_info;
 
+	struct qed_ufp_info		ufp_info;
+
 	struct qed_dmae_info		dmae_info;
 
 	/* QM init */
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
index 449777f21237..8f31406ec894 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
@@ -274,8 +274,8 @@ qed_dcbx_process_tlv(struct qed_hwfn *p_hwfn,
 		     u32 pri_tc_tbl, int count, u8 dcbx_version)
 {
 	enum dcbx_protocol_type type;
+	bool enable, ieee, eth_tlv;
 	u8 tc, priority_map;
-	bool enable, ieee;
 	u16 protocol_id;
 	int priority;
 	int i;
@@ -283,6 +283,7 @@ qed_dcbx_process_tlv(struct qed_hwfn *p_hwfn,
 	DP_VERBOSE(p_hwfn, QED_MSG_DCB, "Num APP entries = %d\n", count);
 
 	ieee = (dcbx_version == DCBX_CONFIG_VERSION_IEEE);
+	eth_tlv = false;
 	/* Parse APP TLV */
 	for (i = 0; i < count; i++) {
 		protocol_id = QED_MFW_GET_FIELD(p_tbl[i].entry,
@@ -304,13 +305,22 @@ qed_dcbx_process_tlv(struct qed_hwfn *p_hwfn,
 			 * indication, but we only got here if there was an
 			 * app tlv for the protocol, so dcbx must be enabled.
 			 */
-			enable = !(type == DCBX_PROTOCOL_ETH);
+			if (type == DCBX_PROTOCOL_ETH) {
+				enable = false;
+				eth_tlv = true;
+			} else {
+				enable = true;
+			}
 
 			qed_dcbx_update_app_info(p_data, p_hwfn, enable,
 						 priority, tc, type);
 		}
 	}
 
+	/* If Eth TLV is not detected, use UFP TC as default TC */
+	if (test_bit(QED_MF_UFP_SPECIFIC, &p_hwfn->cdev->mf_bits) && !eth_tlv)
+		p_data->arr[DCBX_PROTOCOL_ETH].tc = p_hwfn->ufp_info.tc;
+
 	/* Update ramrod protocol data and hw_info fields
 	 * with default info when corresponding APP TLV's are not detected.
 	 * The enabled field has a different logic for ethernet as only for
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 95d00cbe1a7f..560528962658 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -1499,6 +1499,11 @@ static int qed_hw_init_pf(struct qed_hwfn *p_hwfn,
 		STORE_RT_REG(p_hwfn, NIG_REG_LLH_FUNC_TAG_EN_RT_OFFSET, 1);
 		STORE_RT_REG(p_hwfn, NIG_REG_LLH_FUNC_TAG_VALUE_RT_OFFSET,
 			     p_hwfn->hw_info.ovlan);
+
+		DP_VERBOSE(p_hwfn, NETIF_MSG_HW,
+			   "Configuring LLH_FUNC_FILTER_HDR_SEL\n");
+		STORE_RT_REG(p_hwfn, NIG_REG_LLH_FUNC_FILTER_HDR_SEL_RT_OFFSET,
+			     1);
 	}
 
 	/* Enable classification by MAC if needed */
@@ -1635,6 +1640,7 @@ int qed_hw_init(struct qed_dev *cdev, struct qed_hw_init_params *p_params)
 	bool b_default_mtu = true;
 	struct qed_hwfn *p_hwfn;
 	int rc = 0, mfw_rc, i;
+	u16 ether_type;
 
 	if ((p_params->int_mode == QED_INT_MODE_MSI) && (cdev->num_hwfns > 1)) {
 		DP_NOTICE(cdev, "MSI mode is not supported for CMT devices\n");
@@ -1668,16 +1674,22 @@ int qed_hw_init(struct qed_dev *cdev, struct qed_hw_init_params *p_params)
 		if (rc)
 			return rc;
 
-		if (IS_PF(cdev) && test_bit(QED_MF_8021AD_TAGGING,
-					    &cdev->mf_bits)) {
+		if (IS_PF(cdev) && (test_bit(QED_MF_8021Q_TAGGING,
+					     &cdev->mf_bits) ||
+				    test_bit(QED_MF_8021AD_TAGGING,
+					     &cdev->mf_bits))) {
+			if (test_bit(QED_MF_8021Q_TAGGING, &cdev->mf_bits))
+				ether_type = ETH_P_8021Q;
+			else
+				ether_type = ETH_P_8021AD;
 			STORE_RT_REG(p_hwfn, PRS_REG_TAG_ETHERTYPE_0_RT_OFFSET,
-				     ETH_P_8021AD);
+				     ether_type);
 			STORE_RT_REG(p_hwfn, NIG_REG_TAG_ETHERTYPE_0_RT_OFFSET,
-				     ETH_P_8021AD);
+				     ether_type);
 			STORE_RT_REG(p_hwfn, PBF_REG_TAG_ETHERTYPE_0_RT_OFFSET,
-				     ETH_P_8021AD);
+				     ether_type);
 			STORE_RT_REG(p_hwfn, DORQ_REG_TAG1_ETHERTYPE_RT_OFFSET,
-				     ETH_P_8021AD);
+				     ether_type);
 		}
 
 		qed_fill_load_req_params(&load_req_params,
@@ -2659,6 +2671,12 @@ static int qed_hw_get_nvm_info(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 		case NVM_CFG1_GLOB_MF_MODE_MF_ALLOWED:
 			cdev->mf_bits = BIT(QED_MF_OVLAN_CLSS);
 			break;
+		case NVM_CFG1_GLOB_MF_MODE_UFP:
+			cdev->mf_bits = BIT(QED_MF_OVLAN_CLSS) |
+					BIT(QED_MF_LLH_PROTO_CLSS) |
+					BIT(QED_MF_UFP_SPECIFIC) |
+					BIT(QED_MF_8021Q_TAGGING);
+			break;
 		case NVM_CFG1_GLOB_MF_MODE_BD:
 			cdev->mf_bits = BIT(QED_MF_OVLAN_CLSS) |
 					BIT(QED_MF_LLH_PROTO_CLSS) |
@@ -2879,6 +2897,8 @@ qed_get_hw_info(struct qed_hwfn *p_hwfn,
 		qed_mcp_cmd_port_init(p_hwfn, p_ptt);
 
 		qed_get_eee_caps(p_hwfn, p_ptt);
+
+		qed_mcp_read_ufp_config(p_hwfn, p_ptt);
 	}
 
 	if (qed_mcp_is_init(p_hwfn)) {
diff --git a/drivers/net/ethernet/qlogic/qed/qed_fcoe.c b/drivers/net/ethernet/qlogic/qed/qed_fcoe.c
index 2dc9b312a795..cc1b373c0ace 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_fcoe.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_fcoe.c
@@ -313,6 +313,9 @@ qed_sp_fcoe_conn_offload(struct qed_hwfn *p_hwfn,
 	p_data->d_id.addr_mid = p_conn->d_id.addr_mid;
 	p_data->d_id.addr_lo = p_conn->d_id.addr_lo;
 	p_data->flags = p_conn->flags;
+	if (test_bit(QED_MF_UFP_SPECIFIC, &p_hwfn->cdev->mf_bits))
+		SET_FIELD(p_data->flags,
+			  FCOE_CONN_OFFLOAD_RAMROD_DATA_B_SINGLE_VLAN, 1);
 	p_data->def_q_idx = p_conn->def_q_idx;
 
 	return qed_spq_post(p_hwfn, p_ent, NULL);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index 7f5ec42dde48..b5f70eff0182 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -11993,6 +11993,16 @@ struct public_port {
 #define EEE_REMOTE_TW_TX_OFFSET 0
 #define EEE_REMOTE_TW_RX_MASK   0xffff0000
 #define EEE_REMOTE_TW_RX_OFFSET 16
+
+	u32 oem_cfg_port;
+#define OEM_CFG_CHANNEL_TYPE_MASK                       0x00000003
+#define OEM_CFG_CHANNEL_TYPE_OFFSET                     0
+#define OEM_CFG_CHANNEL_TYPE_VLAN_PARTITION             0x1
+#define OEM_CFG_CHANNEL_TYPE_STAGGED                    0x2
+#define OEM_CFG_SCHED_TYPE_MASK                         0x0000000C
+#define OEM_CFG_SCHED_TYPE_OFFSET                       2
+#define OEM_CFG_SCHED_TYPE_ETS                          0x1
+#define OEM_CFG_SCHED_TYPE_VNIC_BW                      0x2
 };
 
 struct public_func {
@@ -12069,6 +12079,23 @@ struct public_func {
 #define DRV_ID_DRV_INIT_HW_MASK		0x80000000
 #define DRV_ID_DRV_INIT_HW_SHIFT	31
 #define DRV_ID_DRV_INIT_HW_FLAG		(1 << DRV_ID_DRV_INIT_HW_SHIFT)
+
+	u32 oem_cfg_func;
+#define OEM_CFG_FUNC_TC_MASK                    0x0000000F
+#define OEM_CFG_FUNC_TC_OFFSET                  0
+#define OEM_CFG_FUNC_TC_0                       0x0
+#define OEM_CFG_FUNC_TC_1                       0x1
+#define OEM_CFG_FUNC_TC_2                       0x2
+#define OEM_CFG_FUNC_TC_3                       0x3
+#define OEM_CFG_FUNC_TC_4                       0x4
+#define OEM_CFG_FUNC_TC_5                       0x5
+#define OEM_CFG_FUNC_TC_6                       0x6
+#define OEM_CFG_FUNC_TC_7                       0x7
+
+#define OEM_CFG_FUNC_HOST_PRI_CTRL_MASK         0x00000030
+#define OEM_CFG_FUNC_HOST_PRI_CTRL_OFFSET       4
+#define OEM_CFG_FUNC_HOST_PRI_CTRL_VNIC         0x1
+#define OEM_CFG_FUNC_HOST_PRI_CTRL_OS           0x2
 };
 
 struct mcp_mac {
@@ -12495,6 +12522,7 @@ enum MFW_DRV_MSG_TYPE {
 	MFW_DRV_MSG_BW_UPDATE10,
 	MFW_DRV_MSG_TRANSCEIVER_STATE_CHANGE,
 	MFW_DRV_MSG_BW_UPDATE11,
+	MFW_DRV_MSG_OEM_CFG_UPDATE,
 	MFW_DRV_MSG_MAX
 };
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
index 6c942c133b2c..c3c1a99a0252 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@@ -919,6 +919,10 @@ static int qed_sp_ll2_rx_queue_start(struct qed_hwfn *p_hwfn,
 	p_ramrod->drop_ttl0_flg = p_ll2_conn->input.rx_drop_ttl0_flg;
 	p_ramrod->inner_vlan_stripping_en =
 		p_ll2_conn->input.rx_vlan_removal_en;
+
+	if (test_bit(QED_MF_UFP_SPECIFIC, &p_hwfn->cdev->mf_bits) &&
+	    p_ll2_conn->input.conn_type == QED_LL2_TYPE_FCOE)
+		p_ramrod->report_outer_vlan = 1;
 	p_ramrod->queue_id = p_ll2_conn->queue_id;
 	p_ramrod->main_func_queue = p_ll2_conn->main_func_queue ? 1 : 0;
 
@@ -1493,11 +1497,12 @@ int qed_ll2_establish_connection(void *cxt, u8 connection_handle)
 	qed_ll2_establish_connection_ooo(p_hwfn, p_ll2_conn);
 
 	if (p_ll2_conn->input.conn_type == QED_LL2_TYPE_FCOE) {
+		if (!test_bit(QED_MF_UFP_SPECIFIC, &p_hwfn->cdev->mf_bits))
+			qed_llh_add_protocol_filter(p_hwfn, p_ptt,
+						    ETH_P_FCOE, 0,
+						    QED_LLH_FILTER_ETHERTYPE);
 		qed_llh_add_protocol_filter(p_hwfn, p_ptt,
-					    0x8906, 0,
-					    QED_LLH_FILTER_ETHERTYPE);
-		qed_llh_add_protocol_filter(p_hwfn, p_ptt,
-					    0x8914, 0,
+					    ETH_P_FIP, 0,
 					    QED_LLH_FILTER_ETHERTYPE);
 	}
 
@@ -1653,11 +1658,16 @@ qed_ll2_prepare_tx_packet_set_bd(struct qed_hwfn *p_hwfn,
 
 	start_bd = (struct core_tx_bd *)qed_chain_produce(p_tx_chain);
 	if (QED_IS_IWARP_PERSONALITY(p_hwfn) &&
-	    p_ll2->input.conn_type == QED_LL2_TYPE_OOO)
+	    p_ll2->input.conn_type == QED_LL2_TYPE_OOO) {
 		start_bd->nw_vlan_or_lb_echo =
 		    cpu_to_le16(IWARP_LL2_IN_ORDER_TX_QUEUE);
-	else
+	} else {
 		start_bd->nw_vlan_or_lb_echo = cpu_to_le16(pkt->vlan);
+		if (test_bit(QED_MF_UFP_SPECIFIC, &p_hwfn->cdev->mf_bits) &&
+		    p_ll2->input.conn_type == QED_LL2_TYPE_FCOE)
+			pkt->remove_stag = true;
+	}
+
 	SET_FIELD(start_bd->bitfield1, CORE_TX_BD_L4_HDR_OFFSET_W,
 		  cpu_to_le16(pkt->l4_hdr_offset_w));
 	SET_FIELD(start_bd->bitfield1, CORE_TX_BD_TX_DST, tx_dest);
@@ -1668,6 +1678,9 @@ qed_ll2_prepare_tx_packet_set_bd(struct qed_hwfn *p_hwfn,
 	SET_FIELD(bd_data, CORE_TX_BD_DATA_IP_CSUM, !!(pkt->enable_ip_cksum));
 	SET_FIELD(bd_data, CORE_TX_BD_DATA_L4_CSUM, !!(pkt->enable_l4_cksum));
 	SET_FIELD(bd_data, CORE_TX_BD_DATA_IP_LEN, !!(pkt->calc_ip_len));
+	SET_FIELD(bd_data, CORE_TX_BD_DATA_DISABLE_STAG_INSERTION,
+		  !!(pkt->remove_stag));
+
 	start_bd->bd_data.as_bitfield = cpu_to_le16(bd_data);
 	DMA_REGPAIR_LE(start_bd->addr, pkt->first_frag);
 	start_bd->nbytes = cpu_to_le16(pkt->first_frag_len);
@@ -1884,11 +1897,12 @@ int qed_ll2_terminate_connection(void *cxt, u8 connection_handle)
 		qed_ooo_release_all_isles(p_hwfn, p_hwfn->p_ooo_info);
 
 	if (p_ll2_conn->input.conn_type == QED_LL2_TYPE_FCOE) {
+		if (!test_bit(QED_MF_UFP_SPECIFIC, &p_hwfn->cdev->mf_bits))
+			qed_llh_remove_protocol_filter(p_hwfn, p_ptt,
+						       ETH_P_FCOE, 0,
+						      QED_LLH_FILTER_ETHERTYPE);
 		qed_llh_remove_protocol_filter(p_hwfn, p_ptt,
-					       0x8906, 0,
-					       QED_LLH_FILTER_ETHERTYPE);
-		qed_llh_remove_protocol_filter(p_hwfn, p_ptt,
-					       0x8914, 0,
+					       ETH_P_FIP, 0,
 					       QED_LLH_FILTER_ETHERTYPE);
 	}
 
@@ -2360,7 +2374,8 @@ fail:
 	return -EINVAL;
 }
 
-static int qed_ll2_start_xmit(struct qed_dev *cdev, struct sk_buff *skb)
+static int qed_ll2_start_xmit(struct qed_dev *cdev, struct sk_buff *skb,
+			      unsigned long xmit_flags)
 {
 	struct qed_ll2_tx_pkt_info pkt;
 	const skb_frag_t *frag;
@@ -2405,6 +2420,9 @@ static int qed_ll2_start_xmit(struct qed_dev *cdev, struct sk_buff *skb)
 	pkt.first_frag = mapping;
 	pkt.first_frag_len = skb->len;
 	pkt.cookie = skb;
+	if (test_bit(QED_MF_UFP_SPECIFIC, &cdev->mf_bits) &&
+	    test_bit(QED_LL2_XMIT_FLAGS_FIP_DISCOVERY, &xmit_flags))
+		pkt.remove_stag = true;
 
 	rc = qed_ll2_prepare_tx_packet(&cdev->hwfns[0], cdev->ll2->handle,
 				       &pkt, 1);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index 0550f0ee11b3..e80f5e7c7992 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -40,6 +40,7 @@
 #include <linux/string.h>
 #include <linux/etherdevice.h>
 #include "qed.h"
+#include "qed_cxt.h"
 #include "qed_dcbx.h"
 #include "qed_hsi.h"
 #include "qed_hw.h"
@@ -1486,6 +1487,80 @@ static void qed_mcp_update_stag(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 		    &resp, &param);
 }
 
+void qed_mcp_read_ufp_config(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
+{
+	struct public_func shmem_info;
+	u32 port_cfg, val;
+
+	if (!test_bit(QED_MF_UFP_SPECIFIC, &p_hwfn->cdev->mf_bits))
+		return;
+
+	memset(&p_hwfn->ufp_info, 0, sizeof(p_hwfn->ufp_info));
+	port_cfg = qed_rd(p_hwfn, p_ptt, p_hwfn->mcp_info->port_addr +
+			  offsetof(struct public_port, oem_cfg_port));
+	val = (port_cfg & OEM_CFG_CHANNEL_TYPE_MASK) >>
+		OEM_CFG_CHANNEL_TYPE_OFFSET;
+	if (val != OEM_CFG_CHANNEL_TYPE_STAGGED)
+		DP_NOTICE(p_hwfn, "Incorrect UFP Channel type  %d\n", val);
+
+	val = (port_cfg & OEM_CFG_SCHED_TYPE_MASK) >> OEM_CFG_SCHED_TYPE_OFFSET;
+	if (val == OEM_CFG_SCHED_TYPE_ETS) {
+		p_hwfn->ufp_info.mode = QED_UFP_MODE_ETS;
+	} else if (val == OEM_CFG_SCHED_TYPE_VNIC_BW) {
+		p_hwfn->ufp_info.mode = QED_UFP_MODE_VNIC_BW;
+	} else {
+		p_hwfn->ufp_info.mode = QED_UFP_MODE_UNKNOWN;
+		DP_NOTICE(p_hwfn, "Unknown UFP scheduling mode %d\n", val);
+	}
+
+	qed_mcp_get_shmem_func(p_hwfn, p_ptt, &shmem_info, MCP_PF_ID(p_hwfn));
+	val = (port_cfg & OEM_CFG_FUNC_TC_MASK) >> OEM_CFG_FUNC_TC_OFFSET;
+	p_hwfn->ufp_info.tc = (u8)val;
+	val = (port_cfg & OEM_CFG_FUNC_HOST_PRI_CTRL_MASK) >>
+		OEM_CFG_FUNC_HOST_PRI_CTRL_OFFSET;
+	if (val == OEM_CFG_FUNC_HOST_PRI_CTRL_VNIC) {
+		p_hwfn->ufp_info.pri_type = QED_UFP_PRI_VNIC;
+	} else if (val == OEM_CFG_FUNC_HOST_PRI_CTRL_OS) {
+		p_hwfn->ufp_info.pri_type = QED_UFP_PRI_OS;
+	} else {
+		p_hwfn->ufp_info.pri_type = QED_UFP_PRI_UNKNOWN;
+		DP_NOTICE(p_hwfn, "Unknown Host priority control %d\n", val);
+	}
+
+	DP_NOTICE(p_hwfn,
+		  "UFP shmem config: mode = %d tc = %d pri_type = %d\n",
+		  p_hwfn->ufp_info.mode,
+		  p_hwfn->ufp_info.tc, p_hwfn->ufp_info.pri_type);
+}
+
+static int
+qed_mcp_handle_ufp_event(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
+{
+	qed_mcp_read_ufp_config(p_hwfn, p_ptt);
+
+	if (p_hwfn->ufp_info.mode == QED_UFP_MODE_VNIC_BW) {
+		p_hwfn->qm_info.ooo_tc = p_hwfn->ufp_info.tc;
+		p_hwfn->hw_info.offload_tc = p_hwfn->ufp_info.tc;
+
+		qed_qm_reconf(p_hwfn, p_ptt);
+	} else if (p_hwfn->ufp_info.mode == QED_UFP_MODE_ETS) {
+		/* Merge UFP TC with the dcbx TC data */
+		qed_dcbx_mib_update_event(p_hwfn, p_ptt,
+					  QED_DCBX_OPERATIONAL_MIB);
+	} else {
+		DP_ERR(p_hwfn, "Invalid sched type, discard the UFP config\n");
+		return -EINVAL;
+	}
+
+	/* update storm FW with negotiation results */
+	qed_sp_pf_update_ufp(p_hwfn);
+
+	/* update stag pcp value */
+	qed_sp_pf_update_stag(p_hwfn);
+
+	return 0;
+}
+
 int qed_mcp_handle_events(struct qed_hwfn *p_hwfn,
 			  struct qed_ptt *p_ptt)
 {
@@ -1529,6 +1604,9 @@ int qed_mcp_handle_events(struct qed_hwfn *p_hwfn,
 			qed_dcbx_mib_update_event(p_hwfn, p_ptt,
 						  QED_DCBX_OPERATIONAL_MIB);
 			break;
+		case MFW_DRV_MSG_OEM_CFG_UPDATE:
+			qed_mcp_handle_ufp_event(p_hwfn, p_ptt);
+			break;
 		case MFW_DRV_MSG_TRANSCEIVER_STATE_CHANGE:
 			qed_mcp_handle_transceiver_change(p_hwfn, p_ptt);
 			break;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
index 3af3896420b9..250579ba632b 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
@@ -1004,6 +1004,14 @@ int qed_mcp_get_capabilities(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt);
  */
 int qed_mcp_set_capabilities(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt);
 
+/**
+ * @brief Read ufp config from the shared memory.
+ *
+ * @param p_hwfn
+ * @param p_ptt
+ */
+void qed_mcp_read_ufp_config(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt);
+
 /**
  * @brief Populate the nvm info shadow in the given hardware function
  *
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp.h b/drivers/net/ethernet/qlogic/qed/qed_sp.h
index 768022235451..e95431f6acd4 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp.h
@@ -462,6 +462,15 @@ int qed_sp_pf_update_stag(struct qed_hwfn *p_hwfn);
  * @return int
  */
 
+/**
+ * @brief qed_sp_pf_update_ufp - PF ufp update Ramrod
+ *
+ * @param p_hwfn
+ *
+ * @return int
+ */
+int qed_sp_pf_update_ufp(struct qed_hwfn *p_hwfn);
+
 int qed_sp_pf_stop(struct qed_hwfn *p_hwfn);
 
 int qed_sp_pf_update_tunn_cfg(struct qed_hwfn *p_hwfn,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index 26bed26b9071..8de644b4721e 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -314,7 +314,7 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 	struct qed_spq_entry *p_ent = NULL;
 	struct qed_sp_init_data init_data;
 	int rc = -EINVAL;
-	u8 page_cnt;
+	u8 page_cnt, i;
 
 	/* update initial eq producer */
 	qed_eq_prod_update(p_hwfn,
@@ -345,12 +345,30 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 		p_ramrod->mf_mode = MF_NPAR;
 
 	p_ramrod->outer_tag_config.outer_tag.tci =
-		cpu_to_le16(p_hwfn->hw_info.ovlan);
-	if (test_bit(QED_MF_8021AD_TAGGING, &p_hwfn->cdev->mf_bits)) {
+				cpu_to_le16(p_hwfn->hw_info.ovlan);
+	if (test_bit(QED_MF_8021Q_TAGGING, &p_hwfn->cdev->mf_bits)) {
+		p_ramrod->outer_tag_config.outer_tag.tpid = ETH_P_8021Q;
+	} else if (test_bit(QED_MF_8021AD_TAGGING, &p_hwfn->cdev->mf_bits)) {
 		p_ramrod->outer_tag_config.outer_tag.tpid = ETH_P_8021AD;
 		p_ramrod->outer_tag_config.enable_stag_pri_change = 1;
 	}
 
+	p_ramrod->outer_tag_config.pri_map_valid = 1;
+	for (i = 0; i < QED_MAX_PFC_PRIORITIES; i++)
+		p_ramrod->outer_tag_config.inner_to_outer_pri_map[i] = i;
+
+	/* enable_stag_pri_change should be set if port is in BD mode or,
+	 * UFP with Host Control mode.
+	 */
+	if (test_bit(QED_MF_UFP_SPECIFIC, &p_hwfn->cdev->mf_bits)) {
+		if (p_hwfn->ufp_info.pri_type == QED_UFP_PRI_OS)
+			p_ramrod->outer_tag_config.enable_stag_pri_change = 1;
+		else
+			p_ramrod->outer_tag_config.enable_stag_pri_change = 0;
+
+		p_ramrod->outer_tag_config.outer_tag.tci |=
+		    cpu_to_le16(((u16)p_hwfn->ufp_info.tc << 13));
+	}
 
 	/* Place EQ address in RAMROD */
 	DMA_REGPAIR_LE(p_ramrod->event_ring_pbl_addr,
@@ -431,6 +449,39 @@ int qed_sp_pf_update(struct qed_hwfn *p_hwfn)
 	return qed_spq_post(p_hwfn, p_ent, NULL);
 }
 
+int qed_sp_pf_update_ufp(struct qed_hwfn *p_hwfn)
+{
+	struct qed_spq_entry *p_ent = NULL;
+	struct qed_sp_init_data init_data;
+	int rc = -EOPNOTSUPP;
+
+	if (p_hwfn->ufp_info.pri_type == QED_UFP_PRI_UNKNOWN) {
+		DP_INFO(p_hwfn, "Invalid priority type %d\n",
+			p_hwfn->ufp_info.pri_type);
+		return -EINVAL;
+	}
+
+	/* Get SPQ entry */
+	memset(&init_data, 0, sizeof(init_data));
+	init_data.cid = qed_spq_get_cid(p_hwfn);
+	init_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
+	init_data.comp_mode = QED_SPQ_MODE_CB;
+
+	rc = qed_sp_init_request(p_hwfn, &p_ent,
+				 COMMON_RAMROD_PF_UPDATE, PROTOCOLID_COMMON,
+				 &init_data);
+	if (rc)
+		return rc;
+
+	p_ent->ramrod.pf_update.update_enable_stag_pri_change = true;
+	if (p_hwfn->ufp_info.pri_type == QED_UFP_PRI_OS)
+		p_ent->ramrod.pf_update.enable_stag_pri_change = 1;
+	else
+		p_ent->ramrod.pf_update.enable_stag_pri_change = 0;
+
+	return qed_spq_post(p_hwfn, p_ent, NULL);
+}
+
 /* Set pf update ramrod command params */
 int qed_sp_pf_update_tunn_cfg(struct qed_hwfn *p_hwfn,
 			      struct qed_ptt *p_ptt,
diff --git a/drivers/scsi/qedf/qedf_fip.c b/drivers/scsi/qedf/qedf_fip.c
index 773558fc0697..16d1a21cdff9 100644
--- a/drivers/scsi/qedf/qedf_fip.c
+++ b/drivers/scsi/qedf/qedf_fip.c
@@ -23,6 +23,7 @@ void qedf_fcoe_send_vlan_req(struct qedf_ctx *qedf)
 	struct fip_vlan *vlan;
 #define MY_FIP_ALL_FCF_MACS        ((__u8[6]) { 1, 0x10, 0x18, 1, 0, 2 })
 	static u8 my_fcoe_all_fcfs[ETH_ALEN] = MY_FIP_ALL_FCF_MACS;
+	unsigned long flags = 0;
 
 	skb = dev_alloc_skb(sizeof(struct fip_vlan));
 	if (!skb)
@@ -65,7 +66,9 @@ void qedf_fcoe_send_vlan_req(struct qedf_ctx *qedf)
 		kfree_skb(skb);
 		return;
 	}
-	qed_ops->ll2->start_xmit(qedf->cdev, skb);
+
+	set_bit(QED_LL2_XMIT_FLAGS_FIP_DISCOVERY, &flags);
+	qed_ops->ll2->start_xmit(qedf->cdev, skb, flags);
 }
 
 static void qedf_fcoe_process_vlan_resp(struct qedf_ctx *qedf,
@@ -139,7 +142,7 @@ void qedf_fip_send(struct fcoe_ctlr *fip, struct sk_buff *skb)
 		print_hex_dump(KERN_WARNING, "fip ", DUMP_PREFIX_OFFSET, 16, 1,
 		    skb->data, skb->len, false);
 
-	qed_ops->ll2->start_xmit(qedf->cdev, skb);
+	qed_ops->ll2->start_xmit(qedf->cdev, skb, 0);
 }
 
 /* Process incoming FIP frames. */
diff --git a/drivers/scsi/qedf/qedf_main.c b/drivers/scsi/qedf/qedf_main.c
index 284ccb566b19..6c19015975a8 100644
--- a/drivers/scsi/qedf/qedf_main.c
+++ b/drivers/scsi/qedf/qedf_main.c
@@ -994,7 +994,7 @@ static int qedf_xmit(struct fc_lport *lport, struct fc_frame *fp)
 	if (qedf_dump_frames)
 		print_hex_dump(KERN_WARNING, "fcoe: ", DUMP_PREFIX_OFFSET, 16,
 		    1, skb->data, skb->len, false);
-	qed_ops->ll2->start_xmit(qedf->cdev, skb);
+	qed_ops->ll2->start_xmit(qedf->cdev, skb, 0);
 
 	return 0;
 }
diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c
index 7ec7f6e00fb8..ff21aa1fd939 100644
--- a/drivers/scsi/qedi/qedi_iscsi.c
+++ b/drivers/scsi/qedi/qedi_iscsi.c
@@ -1150,7 +1150,7 @@ static int qedi_data_avail(struct qedi_ctx *qedi, u16 vlanid)
 	if (vlanid)
 		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlanid);
 
-	rc = qedi_ops->ll2->start_xmit(cdev, skb);
+	rc = qedi_ops->ll2->start_xmit(cdev, skb, 0);
 	if (rc) {
 		QEDI_ERR(&qedi->dbg_ctx, "ll2 start_xmit returned %d\n",
 			 rc);
diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h
index 266c1fb45387..5eb022953aca 100644
--- a/include/linux/qed/qed_ll2_if.h
+++ b/include/linux/qed/qed_ll2_if.h
@@ -202,6 +202,7 @@ struct qed_ll2_tx_pkt_info {
 	bool enable_ip_cksum;
 	bool enable_l4_cksum;
 	bool calc_ip_len;
+	bool remove_stag;
 };
 
 #define QED_LL2_UNUSED_HANDLE   (0xff)
@@ -220,6 +221,11 @@ struct qed_ll2_params {
 	u8 ll2_mac_address[ETH_ALEN];
 };
 
+enum qed_ll2_xmit_flags {
+	/* FIP discovery packet */
+	QED_LL2_XMIT_FLAGS_FIP_DISCOVERY
+};
+
 struct qed_ll2_ops {
 /**
  * @brief start - initializes ll2
@@ -245,10 +251,12 @@ struct qed_ll2_ops {
  *
  * @param cdev
  * @param skb
+ * @param xmit_flags - Transmit options defined by the enum qed_ll2_xmit_flags.
  *
  * @return 0 on success, otherwise error value.
  */
-	int (*start_xmit)(struct qed_dev *cdev, struct sk_buff *skb);
+	int (*start_xmit)(struct qed_dev *cdev, struct sk_buff *skb,
+			  unsigned long xmit_flags);
 
 /**
  * @brief register_cb_ops - protocol driver register the callback for Rx/Tx
-- 
cgit v1.2.3-59-g8ed1b


From 6f2f8212bfdf3ec6dd60f1f0725dc84c574b768b Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Mon, 7 May 2018 10:45:26 +0300
Subject: net: ipv6: Fix typo in ipv6_find_hdr() documentation

Fix 'an' into 'and', and use a comma instead of a period.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/exthdrs_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index b643f5ce6c80..ae365df8abf7 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
@@ -161,7 +161,7 @@ EXPORT_SYMBOL_GPL(ipv6_find_tlv);
  * if target < 0. "last header" is transport protocol header, ESP, or
  * "No next header".
  *
- * Note that *offset is used as input/output parameter. an if it is not zero,
+ * Note that *offset is used as input/output parameter, and if it is not zero,
  * then it must be a valid offset to an inner IPv6 header. This can be used
  * to explore inner IPv6 header, eg. ICMPv6 error messages.
  *
-- 
cgit v1.2.3-59-g8ed1b


From 0c1dd2a162e02b6bdcbae8e493215a945acbf73b Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Mon, 7 May 2018 10:45:27 +0300
Subject: net: ipv6/gre: Add GRO support

Add GRO capability for IPv6 GRE tunnel and ip6erspan tap, via gro_cells
infrastructure.

Performance testing: 55% higher badwidth.
Measuring bandwidth of 1 thread IPv4 TCP traffic over IPv6 GRE tunnel
while GRO on the physical interface is disabled.
CPU: Intel Xeon E312xx (Sandy Bridge)
NIC: Mellanox Technologies MT27700 Family [ConnectX-4]
Before (GRO not working in tunnel) : 2.47 Gbits/sec
After  (GRO working in tunnel)     : 3.85 Gbits/sec

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
CC: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 37 +++++++++++++++++++++++++++----------
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 04c69e0c84b3..b511818b268c 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1314,6 +1314,7 @@ static void ip6gre_dev_free(struct net_device *dev)
 {
 	struct ip6_tnl *t = netdev_priv(dev);
 
+	gro_cells_destroy(&t->gro_cells);
 	dst_cache_destroy(&t->dst_cache);
 	free_percpu(dev->tstats);
 }
@@ -1381,11 +1382,12 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
 		return -ENOMEM;
 
 	ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
-	if (ret) {
-		free_percpu(dev->tstats);
-		dev->tstats = NULL;
-		return ret;
-	}
+	if (ret)
+		goto cleanup_alloc_pcpu_stats;
+
+	ret = gro_cells_init(&tunnel->gro_cells, dev);
+	if (ret)
+		goto cleanup_dst_cache_init;
 
 	tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
 	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
@@ -1405,6 +1407,13 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
 	ip6gre_tnl_init_features(dev);
 
 	return 0;
+
+cleanup_dst_cache_init:
+	dst_cache_destroy(&tunnel->dst_cache);
+cleanup_alloc_pcpu_stats:
+	free_percpu(dev->tstats);
+	dev->tstats = NULL;
+	return ret;
 }
 
 static int ip6gre_tunnel_init(struct net_device *dev)
@@ -1751,11 +1760,12 @@ static int ip6erspan_tap_init(struct net_device *dev)
 		return -ENOMEM;
 
 	ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
-	if (ret) {
-		free_percpu(dev->tstats);
-		dev->tstats = NULL;
-		return ret;
-	}
+	if (ret)
+		goto cleanup_alloc_pcpu_stats;
+
+	ret = gro_cells_init(&tunnel->gro_cells, dev);
+	if (ret)
+		goto cleanup_dst_cache_init;
 
 	tunnel->tun_hlen = 8;
 	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
@@ -1773,6 +1783,13 @@ static int ip6erspan_tap_init(struct net_device *dev)
 	ip6gre_tnl_link_config(tunnel, 1);
 
 	return 0;
+
+cleanup_dst_cache_init:
+	dst_cache_destroy(&tunnel->dst_cache);
+cleanup_alloc_pcpu_stats:
+	free_percpu(dev->tstats);
+	dev->tstats = NULL;
+	return ret;
 }
 
 static const struct net_device_ops ip6erspan_netdev_ops = {
-- 
cgit v1.2.3-59-g8ed1b


From 72a338bcc6ae51e01c95d687e5d775e3fe52eff1 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 4 May 2018 11:32:59 +0200
Subject: net: core: rework basic flow dissection helper

When the core networking needs to detect the transport offset in a given
packet and parse it explicitly, a full-blown flow_keys struct is used for
storage.
This patch introduces a smaller keys store, rework the basic flow dissect
helper to use it, and apply this new helper where possible - namely in
skb_probe_transport_header(). The used flow dissector data structures
are renamed to match more closely the new role.

The above gives ~50% performance improvement in micro benchmarking around
skb_probe_transport_header() and ~30% around eth_get_headlen(), mostly due
to the smaller memset. Small, but measurable improvement is measured also
in macro benchmarking.

v1 -> v2: use the new helper in eth_get_headlen() and skb_get_poff(),
  as per DaveM suggestion

Suggested-by: David Miller <davem@davemloft.net>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h       | 18 ++++++++++--------
 include/net/flow_dissector.h |  7 ++++++-
 net/core/flow_dissector.c    | 17 +++++++++--------
 net/ethernet/eth.c           |  6 +++---
 4 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 908d66e55b14..693564a9a979 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1171,7 +1171,7 @@ void __skb_get_hash(struct sk_buff *skb);
 u32 __skb_get_hash_symmetric(const struct sk_buff *skb);
 u32 skb_get_poff(const struct sk_buff *skb);
 u32 __skb_get_poff(const struct sk_buff *skb, void *data,
-		   const struct flow_keys *keys, int hlen);
+		   const struct flow_keys_basic *keys, int hlen);
 __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
 			    void *data, int hlen_proto);
 
@@ -1208,13 +1208,14 @@ static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
 				  NULL, 0, 0, 0, flags);
 }
 
-static inline bool skb_flow_dissect_flow_keys_buf(struct flow_keys *flow,
-						  void *data, __be16 proto,
-						  int nhoff, int hlen,
-						  unsigned int flags)
+static inline bool
+skb_flow_dissect_flow_keys_basic(const struct sk_buff *skb,
+				 struct flow_keys_basic *flow, void *data,
+				 __be16 proto, int nhoff, int hlen,
+				 unsigned int flags)
 {
 	memset(flow, 0, sizeof(*flow));
-	return __skb_flow_dissect(NULL, &flow_keys_buf_dissector, flow,
+	return __skb_flow_dissect(skb, &flow_keys_basic_dissector, flow,
 				  data, proto, nhoff, hlen, flags);
 }
 
@@ -2350,11 +2351,12 @@ static inline void skb_pop_mac_header(struct sk_buff *skb)
 static inline void skb_probe_transport_header(struct sk_buff *skb,
 					      const int offset_hint)
 {
-	struct flow_keys keys;
+	struct flow_keys_basic keys;
 
 	if (skb_transport_header_was_set(skb))
 		return;
-	else if (skb_flow_dissect_flow_keys(skb, &keys, 0))
+
+	if (skb_flow_dissect_flow_keys_basic(skb, &keys, 0, 0, 0, 0, 0))
 		skb_set_transport_header(skb, keys.control.thoff);
 	else
 		skb_set_transport_header(skb, offset_hint);
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 9a074776f70b..e2f6e5c928bb 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -226,6 +226,11 @@ struct flow_dissector {
 	unsigned short int offset[FLOW_DISSECTOR_KEY_MAX];
 };
 
+struct flow_keys_basic {
+	struct flow_dissector_key_control control;
+	struct flow_dissector_key_basic basic;
+};
+
 struct flow_keys {
 	struct flow_dissector_key_control control;
 #define FLOW_KEYS_HASH_START_FIELD basic
@@ -244,7 +249,7 @@ __be32 flow_get_u32_src(const struct flow_keys *flow);
 __be32 flow_get_u32_dst(const struct flow_keys *flow);
 
 extern struct flow_dissector flow_keys_dissector;
-extern struct flow_dissector flow_keys_buf_dissector;
+extern struct flow_dissector flow_keys_basic_dissector;
 
 /* struct flow_keys_digest:
  *
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index d29f09bc5ff9..030d4ca177fb 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1253,7 +1253,7 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)
 EXPORT_SYMBOL(skb_get_hash_perturb);
 
 u32 __skb_get_poff(const struct sk_buff *skb, void *data,
-		   const struct flow_keys *keys, int hlen)
+		   const struct flow_keys_basic *keys, int hlen)
 {
 	u32 poff = keys->control.thoff;
 
@@ -1314,9 +1314,9 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data,
  */
 u32 skb_get_poff(const struct sk_buff *skb)
 {
-	struct flow_keys keys;
+	struct flow_keys_basic keys;
 
-	if (!skb_flow_dissect_flow_keys(skb, &keys, 0))
+	if (!skb_flow_dissect_flow_keys_basic(skb, &keys, 0, 0, 0, 0, 0))
 		return 0;
 
 	return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
@@ -1403,7 +1403,7 @@ static const struct flow_dissector_key flow_keys_dissector_symmetric_keys[] = {
 	},
 };
 
-static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = {
+static const struct flow_dissector_key flow_keys_basic_dissector_keys[] = {
 	{
 		.key_id = FLOW_DISSECTOR_KEY_CONTROL,
 		.offset = offsetof(struct flow_keys, control),
@@ -1417,7 +1417,8 @@ static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = {
 struct flow_dissector flow_keys_dissector __read_mostly;
 EXPORT_SYMBOL(flow_keys_dissector);
 
-struct flow_dissector flow_keys_buf_dissector __read_mostly;
+struct flow_dissector flow_keys_basic_dissector __read_mostly;
+EXPORT_SYMBOL(flow_keys_basic_dissector);
 
 static int __init init_default_flow_dissectors(void)
 {
@@ -1427,9 +1428,9 @@ static int __init init_default_flow_dissectors(void)
 	skb_flow_dissector_init(&flow_keys_dissector_symmetric,
 				flow_keys_dissector_symmetric_keys,
 				ARRAY_SIZE(flow_keys_dissector_symmetric_keys));
-	skb_flow_dissector_init(&flow_keys_buf_dissector,
-				flow_keys_buf_dissector_keys,
-				ARRAY_SIZE(flow_keys_buf_dissector_keys));
+	skb_flow_dissector_init(&flow_keys_basic_dissector,
+				flow_keys_basic_dissector_keys,
+				ARRAY_SIZE(flow_keys_basic_dissector_keys));
 	return 0;
 }
 
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index eaeba9b99a73..ee28440f57c5 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -128,15 +128,15 @@ u32 eth_get_headlen(void *data, unsigned int len)
 {
 	const unsigned int flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG;
 	const struct ethhdr *eth = (const struct ethhdr *)data;
-	struct flow_keys keys;
+	struct flow_keys_basic keys;
 
 	/* this should never happen, but better safe than sorry */
 	if (unlikely(len < sizeof(*eth)))
 		return len;
 
 	/* parse any remaining L2/L3 headers, check for L4 */
-	if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto,
-					    sizeof(*eth), len, flags))
+	if (!skb_flow_dissect_flow_keys_basic(NULL, &keys, data, eth->h_proto,
+					      sizeof(*eth), len, flags))
 		return max_t(u32, keys.control.thoff, sizeof(*eth));
 
 	/* parse for any L4 headers */
-- 
cgit v1.2.3-59-g8ed1b


From d869dea664e662a4380ccf59cb9be9888a7af53a Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Mon, 7 May 2018 12:06:03 +0200
Subject: flow_dissector: do not rely on implicit casts

This change fixes a couple of type mismatch reported by the sparse
tool, explicitly using the requested type for the offending arguments.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tipc.h        | 4 ++--
 net/core/flow_dissector.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/net/tipc.h b/include/net/tipc.h
index 07670ec022a7..f0e7e6bc1bef 100644
--- a/include/net/tipc.h
+++ b/include/net/tipc.h
@@ -44,11 +44,11 @@ struct tipc_basic_hdr {
 	__be32 w[4];
 };
 
-static inline u32 tipc_hdr_rps_key(struct tipc_basic_hdr *hdr)
+static inline __be32 tipc_hdr_rps_key(struct tipc_basic_hdr *hdr)
 {
 	u32 w0 = ntohl(hdr->w[0]);
 	bool keepalive_msg = (w0 & KEEPALIVE_MSG_MASK) == KEEPALIVE_MSG_MASK;
-	int key;
+	__be32 key;
 
 	/* Return source node identity as key */
 	if (likely(!keepalive_msg))
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 030d4ca177fb..4fc1e84d77ec 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1316,7 +1316,7 @@ u32 skb_get_poff(const struct sk_buff *skb)
 {
 	struct flow_keys_basic keys;
 
-	if (!skb_flow_dissect_flow_keys_basic(skb, &keys, 0, 0, 0, 0, 0))
+	if (!skb_flow_dissect_flow_keys_basic(skb, &keys, NULL, 0, 0, 0, 0))
 		return 0;
 
 	return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
-- 
cgit v1.2.3-59-g8ed1b


From 724e47a149f504cbc9c799804bb46ea81a911909 Mon Sep 17 00:00:00 2001
From: Zhao Chen <zhaochen6@huawei.com>
Date: Mon, 7 May 2018 09:21:57 -0400
Subject: net-next/hinic: add pci device ids for 25ge and 100ge card

This patch adds PCI device IDs to support 25GE and 100GE card:

1. Add device id 0x0201 for HINIC 100GE dual port card.
2. Add device id 0x0200 for HINIC 25GE dual port card.
3. Macro of device id 0x1822 is modified for HINIC 25GE quad port card.

Signed-off-by: Zhao Chen <zhaochen6@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/huawei/hinic/hinic_main.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/huawei/hinic/hinic_main.c b/drivers/net/ethernet/huawei/hinic/hinic_main.c
index eb53bd93065e..5b122728dcb4 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_main.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_main.c
@@ -51,7 +51,9 @@ static unsigned int rx_weight = 64;
 module_param(rx_weight, uint, 0644);
 MODULE_PARM_DESC(rx_weight, "Number Rx packets for NAPI budget (default=64)");
 
-#define PCI_DEVICE_ID_HI1822_PF         0x1822
+#define HINIC_DEV_ID_QUAD_PORT_25GE     0x1822
+#define HINIC_DEV_ID_DUAL_PORT_25GE     0x0200
+#define HINIC_DEV_ID_DUAL_PORT_100GE    0x0201
 
 #define HINIC_WQ_NAME                   "hinic_dev"
 
@@ -1097,7 +1099,9 @@ static void hinic_remove(struct pci_dev *pdev)
 }
 
 static const struct pci_device_id hinic_pci_table[] = {
-	{ PCI_VDEVICE(HUAWEI, PCI_DEVICE_ID_HI1822_PF), 0},
+	{ PCI_VDEVICE(HUAWEI, HINIC_DEV_ID_QUAD_PORT_25GE), 0},
+	{ PCI_VDEVICE(HUAWEI, HINIC_DEV_ID_DUAL_PORT_25GE), 0},
+	{ PCI_VDEVICE(HUAWEI, HINIC_DEV_ID_DUAL_PORT_100GE), 0},
 	{ 0, 0}
 };
 MODULE_DEVICE_TABLE(pci, hinic_pci_table);
-- 
cgit v1.2.3-59-g8ed1b


From 52539ca89f365d3db530535fbffa88a3cca4d2ec Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen <toke@toke.dk>
Date: Tue, 8 May 2018 13:03:50 +0200
Subject: cfg80211: Expose TXQ stats and parameters to userspace
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds support for exporting the mac80211 TXQ stats via nl80211 by
way of a nested TXQ stats attribute, as well as for configuring the
quantum and limits that were previously only changeable through debugfs.

This commit adds just the nl80211 API, a subsequent commit adds support to
mac80211 itself.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  50 +++++++++++
 include/uapi/linux/nl80211.h |  58 +++++++++++++
 net/mac80211/ethtool.c       |  32 ++++---
 net/wireless/nl80211.c       | 202 +++++++++++++++++++++++++++++++++++++------
 net/wireless/rdev-ops.h      |  12 +++
 net/wireless/trace.h         |  14 +++
 net/wireless/wext-compat.c   |  23 +++--
 7 files changed, 343 insertions(+), 48 deletions(-)

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 5e888ec62811..8db6071b6063 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1079,6 +1079,37 @@ struct sta_bss_parameters {
 	u16 beacon_interval;
 };
 
+/**
+ * struct cfg80211_txq_stats - TXQ statistics for this TID
+ * @filled: bitmap of flags using the bits of &enum nl80211_txq_stats to
+ *	indicate the relevant values in this struct are filled
+ * @backlog_bytes: total number of bytes currently backlogged
+ * @backlog_packets: total number of packets currently backlogged
+ * @flows: number of new flows seen
+ * @drops: total number of packets dropped
+ * @ecn_marks: total number of packets marked with ECN CE
+ * @overlimit: number of drops due to queue space overflow
+ * @overmemory: number of drops due to memory limit overflow
+ * @collisions: number of hash collisions
+ * @tx_bytes: total number of bytes dequeued
+ * @tx_packets: total number of packets dequeued
+ * @max_flows: maximum number of flows supported
+ */
+struct cfg80211_txq_stats {
+	u32 filled;
+	u32 backlog_bytes;
+	u32 backlog_packets;
+	u32 flows;
+	u32 drops;
+	u32 ecn_marks;
+	u32 overlimit;
+	u32 overmemory;
+	u32 collisions;
+	u32 tx_bytes;
+	u32 tx_packets;
+	u32 max_flows;
+};
+
 /**
  * struct cfg80211_tid_stats - per-TID statistics
  * @filled: bitmap of flags using the bits of &enum nl80211_tid_stats to
@@ -1088,6 +1119,7 @@ struct sta_bss_parameters {
  * @tx_msdu_retries: number of retries (not counting the first) for
  *	transmitted MSDUs
  * @tx_msdu_failed: number of failed transmitted MSDUs
+ * @txq_stats: TXQ statistics
  */
 struct cfg80211_tid_stats {
 	u32 filled;
@@ -1095,6 +1127,7 @@ struct cfg80211_tid_stats {
 	u64 tx_msdu;
 	u64 tx_msdu_retries;
 	u64 tx_msdu_failed;
+	struct cfg80211_txq_stats txq_stats;
 };
 
 #define IEEE80211_MAX_CHAINS	4
@@ -2204,6 +2237,9 @@ enum cfg80211_connect_params_changed {
  * @WIPHY_PARAM_RTS_THRESHOLD: wiphy->rts_threshold has changed
  * @WIPHY_PARAM_COVERAGE_CLASS: coverage class changed
  * @WIPHY_PARAM_DYN_ACK: dynack has been enabled
+ * @WIPHY_PARAM_TXQ_LIMIT: TXQ packet limit has been changed
+ * @WIPHY_PARAM_TXQ_MEMORY_LIMIT: TXQ memory limit has been changed
+ * @WIPHY_PARAM_TXQ_QUANTUM: TXQ scheduler quantum
  */
 enum wiphy_params_flags {
 	WIPHY_PARAM_RETRY_SHORT		= 1 << 0,
@@ -2212,6 +2248,9 @@ enum wiphy_params_flags {
 	WIPHY_PARAM_RTS_THRESHOLD	= 1 << 3,
 	WIPHY_PARAM_COVERAGE_CLASS	= 1 << 4,
 	WIPHY_PARAM_DYN_ACK		= 1 << 5,
+	WIPHY_PARAM_TXQ_LIMIT		= 1 << 6,
+	WIPHY_PARAM_TXQ_MEMORY_LIMIT	= 1 << 7,
+	WIPHY_PARAM_TXQ_QUANTUM		= 1 << 8,
 };
 
 /**
@@ -2964,6 +3003,9 @@ struct cfg80211_external_auth_params {
  *
  * @set_multicast_to_unicast: configure multicast to unicast conversion for BSS
  *
+ * @get_txq_stats: Get TXQ stats for interface or phy. If wdev is %NULL, this
+ *      function should return phy stats, and interface stats otherwise.
+ *
  * @set_pmk: configure the PMK to be used for offloaded 802.1X 4-Way handshake.
  *	If not deleted through @del_pmk the PMK remains valid until disconnect
  *	upon which the driver should clear it.
@@ -3265,6 +3307,10 @@ struct cfg80211_ops {
 					    struct net_device *dev,
 					    const bool enabled);
 
+	int	(*get_txq_stats)(struct wiphy *wiphy,
+				 struct wireless_dev *wdev,
+				 struct cfg80211_txq_stats *txqstats);
+
 	int	(*set_pmk)(struct wiphy *wiphy, struct net_device *dev,
 			   const struct cfg80211_pmk_conf *conf);
 	int	(*del_pmk)(struct wiphy *wiphy, struct net_device *dev,
@@ -3943,6 +3989,10 @@ struct wiphy {
 
 	u8 nan_supported_bands;
 
+	u32 txq_limit;
+	u32 txq_memory_limit;
+	u32 txq_quantum;
+
 	char priv[0] __aligned(NETDEV_ALIGN);
 };
 
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 8e55b63ed3ff..5e67e3444aba 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2226,6 +2226,16 @@ enum nl80211_commands {
  * @NL80211_ATTR_NSS: Station's New/updated  RX_NSS value notified using this
  *	u8 attribute. This is used with %NL80211_CMD_STA_OPMODE_CHANGED.
  *
+ * @NL80211_ATTR_TXQ_STATS: TXQ statistics (nested attribute, see &enum
+ *      nl80211_txq_stats)
+ * @NL80211_ATTR_TXQ_LIMIT: Total packet limit for the TXQ queues for this phy.
+ *      The smaller of this and the memory limit is enforced.
+ * @NL80211_ATTR_TXQ_MEMORY_LIMIT: Total memory memory limit (in bytes) for the
+ *      TXQ queues for this phy. The smaller of this and the packet limit is
+ *      enforced.
+ * @NL80211_ATTR_TXQ_QUANTUM: TXQ scheduler quantum (bytes). Number of bytes
+ *      a flow is assigned on each round of the DRR scheduler.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -2660,6 +2670,11 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_CONTROL_PORT_OVER_NL80211,
 
+	NL80211_ATTR_TXQ_STATS,
+	NL80211_ATTR_TXQ_LIMIT,
+	NL80211_ATTR_TXQ_MEMORY_LIMIT,
+	NL80211_ATTR_TXQ_QUANTUM,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -3040,6 +3055,7 @@ enum nl80211_sta_info {
  * @NL80211_TID_STATS_TX_MSDU_FAILED: number of failed transmitted
  *	MSDUs (u64)
  * @NL80211_TID_STATS_PAD: attribute used for padding for 64-bit alignment
+ * @NL80211_TID_STATS_TXQ_STATS: TXQ stats (nested attribute)
  * @NUM_NL80211_TID_STATS: number of attributes here
  * @NL80211_TID_STATS_MAX: highest numbered attribute here
  */
@@ -3050,12 +3066,51 @@ enum nl80211_tid_stats {
 	NL80211_TID_STATS_TX_MSDU_RETRIES,
 	NL80211_TID_STATS_TX_MSDU_FAILED,
 	NL80211_TID_STATS_PAD,
+	NL80211_TID_STATS_TXQ_STATS,
 
 	/* keep last */
 	NUM_NL80211_TID_STATS,
 	NL80211_TID_STATS_MAX = NUM_NL80211_TID_STATS - 1
 };
 
+/**
+ * enum nl80211_txq_stats - per TXQ statistics attributes
+ * @__NL80211_TXQ_STATS_INVALID: attribute number 0 is reserved
+ * @NUM_NL80211_TXQ_STATS: number of attributes here
+ * @NL80211_TXQ_STATS_BACKLOG_BYTES: number of bytes currently backlogged
+ * @NL80211_TXQ_STATS_BACKLOG_PACKETS: number of packets currently
+ *      backlogged
+ * @NL80211_TXQ_STATS_FLOWS: total number of new flows seen
+ * @NL80211_TXQ_STATS_DROPS: total number of packet drops
+ * @NL80211_TXQ_STATS_ECN_MARKS: total number of packet ECN marks
+ * @NL80211_TXQ_STATS_OVERLIMIT: number of drops due to queue space overflow
+ * @NL80211_TXQ_STATS_OVERMEMORY: number of drops due to memory limit overflow
+ *      (only for per-phy stats)
+ * @NL80211_TXQ_STATS_COLLISIONS: number of hash collisions
+ * @NL80211_TXQ_STATS_TX_BYTES: total number of bytes dequeued from TXQ
+ * @NL80211_TXQ_STATS_TX_PACKETS: total number of packets dequeued from TXQ
+ * @NL80211_TXQ_STATS_MAX_FLOWS: number of flow buckets for PHY
+ * @NL80211_TXQ_STATS_MAX: highest numbered attribute here
+ */
+enum nl80211_txq_stats {
+	__NL80211_TXQ_STATS_INVALID,
+	NL80211_TXQ_STATS_BACKLOG_BYTES,
+	NL80211_TXQ_STATS_BACKLOG_PACKETS,
+	NL80211_TXQ_STATS_FLOWS,
+	NL80211_TXQ_STATS_DROPS,
+	NL80211_TXQ_STATS_ECN_MARKS,
+	NL80211_TXQ_STATS_OVERLIMIT,
+	NL80211_TXQ_STATS_OVERMEMORY,
+	NL80211_TXQ_STATS_COLLISIONS,
+	NL80211_TXQ_STATS_TX_BYTES,
+	NL80211_TXQ_STATS_TX_PACKETS,
+	NL80211_TXQ_STATS_MAX_FLOWS,
+
+	/* keep last */
+	NUM_NL80211_TXQ_STATS,
+	NL80211_TXQ_STATS_MAX = NUM_NL80211_TXQ_STATS - 1
+};
+
 /**
  * enum nl80211_mpath_flags - nl80211 mesh path flags
  *
@@ -5072,6 +5127,8 @@ enum nl80211_feature_flags {
  * @NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT: This Driver support data ack
  *	rssi if firmware support, this flag is to intimate about ack rssi
  *	support to nl80211.
+ * @NL80211_EXT_FEATURE_TXQS: Driver supports FQ-CoDel-enabled intermediate
+ *      TXQs.
  *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
@@ -5105,6 +5162,7 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_DFS_OFFLOAD,
 	NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211,
 	NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT,
+	NL80211_EXT_FEATURE_TXQS,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c
index 08408520c3f8..1afeff94af8b 100644
--- a/net/mac80211/ethtool.c
+++ b/net/mac80211/ethtool.c
@@ -71,11 +71,15 @@ static void ieee80211_get_stats(struct net_device *dev,
 	struct ieee80211_channel *channel;
 	struct sta_info *sta;
 	struct ieee80211_local *local = sdata->local;
-	struct station_info sinfo;
+	struct station_info *sinfo;
 	struct survey_info survey;
 	int i, q;
 #define STA_STATS_SURVEY_LEN 7
 
+	sinfo = kmalloc(sizeof(*sinfo), GFP_KERNEL);
+	if (!sinfo)
+		return;
+
 	memset(data, 0, sizeof(u64) * STA_STATS_LEN);
 
 #define ADD_STA_STATS(sta)					\
@@ -86,8 +90,8 @@ static void ieee80211_get_stats(struct net_device *dev,
 		data[i++] += sta->rx_stats.fragments;		\
 		data[i++] += sta->rx_stats.dropped;		\
 								\
-		data[i++] += sinfo.tx_packets;			\
-		data[i++] += sinfo.tx_bytes;			\
+		data[i++] += sinfo->tx_packets;			\
+		data[i++] += sinfo->tx_bytes;			\
 		data[i++] += sta->status_stats.filtered;	\
 		data[i++] += sta->status_stats.retry_failed;	\
 		data[i++] += sta->status_stats.retry_count;	\
@@ -107,8 +111,8 @@ static void ieee80211_get_stats(struct net_device *dev,
 		if (!(sta && !WARN_ON(sta->sdata->dev != dev)))
 			goto do_survey;
 
-		memset(&sinfo, 0, sizeof(sinfo));
-		sta_set_sinfo(sta, &sinfo);
+		memset(sinfo, 0, sizeof(*sinfo));
+		sta_set_sinfo(sta, sinfo);
 
 		i = 0;
 		ADD_STA_STATS(sta);
@@ -116,17 +120,17 @@ static void ieee80211_get_stats(struct net_device *dev,
 		data[i++] = sta->sta_state;
 
 
-		if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE))
+		if (sinfo->filled & BIT(NL80211_STA_INFO_TX_BITRATE))
 			data[i] = 100000 *
-				cfg80211_calculate_bitrate(&sinfo.txrate);
+				cfg80211_calculate_bitrate(&sinfo->txrate);
 		i++;
-		if (sinfo.filled & BIT(NL80211_STA_INFO_RX_BITRATE))
+		if (sinfo->filled & BIT(NL80211_STA_INFO_RX_BITRATE))
 			data[i] = 100000 *
-				cfg80211_calculate_bitrate(&sinfo.rxrate);
+				cfg80211_calculate_bitrate(&sinfo->rxrate);
 		i++;
 
-		if (sinfo.filled & BIT(NL80211_STA_INFO_SIGNAL_AVG))
-			data[i] = (u8)sinfo.signal_avg;
+		if (sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL_AVG))
+			data[i] = (u8)sinfo->signal_avg;
 		i++;
 	} else {
 		list_for_each_entry(sta, &local->sta_list, list) {
@@ -134,14 +138,16 @@ static void ieee80211_get_stats(struct net_device *dev,
 			if (sta->sdata->dev != dev)
 				continue;
 
-			memset(&sinfo, 0, sizeof(sinfo));
-			sta_set_sinfo(sta, &sinfo);
+			memset(sinfo, 0, sizeof(*sinfo));
+			sta_set_sinfo(sta, sinfo);
 			i = 0;
 			ADD_STA_STATS(sta);
 		}
 	}
 
 do_survey:
+	kfree(sinfo);
+
 	i = STA_STATS_LEN - STA_STATS_SURVEY_LEN;
 	/* Get survey stats for current channel */
 	survey.filled = 0;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 6b942a68d1c8..f7715b85fd2b 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -424,6 +424,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_PMK] = { .type = NLA_BINARY, .len = PMK_MAX_LEN },
 	[NL80211_ATTR_SCHED_SCAN_MULTI] = { .type = NLA_FLAG },
 	[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT] = { .type = NLA_FLAG },
+
+	[NL80211_ATTR_TXQ_LIMIT] = { .type = NLA_U32 },
+	[NL80211_ATTR_TXQ_MEMORY_LIMIT] = { .type = NLA_U32 },
+	[NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 },
 };
 
 /* policy for the key attributes */
@@ -774,6 +778,39 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy,
 	return -ENOBUFS;
 }
 
+static bool nl80211_put_txq_stats(struct sk_buff *msg,
+				  struct cfg80211_txq_stats *txqstats,
+				  int attrtype)
+{
+	struct nlattr *txqattr;
+
+#define PUT_TXQVAL_U32(attr, memb) do {					  \
+	if (txqstats->filled & BIT(NL80211_TXQ_STATS_ ## attr) &&	  \
+	    nla_put_u32(msg, NL80211_TXQ_STATS_ ## attr, txqstats->memb)) \
+		return false;						  \
+	} while (0)
+
+	txqattr = nla_nest_start(msg, attrtype);
+	if (!txqattr)
+		return false;
+
+	PUT_TXQVAL_U32(BACKLOG_BYTES, backlog_bytes);
+	PUT_TXQVAL_U32(BACKLOG_PACKETS, backlog_packets);
+	PUT_TXQVAL_U32(FLOWS, flows);
+	PUT_TXQVAL_U32(DROPS, drops);
+	PUT_TXQVAL_U32(ECN_MARKS, ecn_marks);
+	PUT_TXQVAL_U32(OVERLIMIT, overlimit);
+	PUT_TXQVAL_U32(OVERMEMORY, overmemory);
+	PUT_TXQVAL_U32(COLLISIONS, collisions);
+	PUT_TXQVAL_U32(TX_BYTES, tx_bytes);
+	PUT_TXQVAL_U32(TX_PACKETS, tx_packets);
+	PUT_TXQVAL_U32(MAX_FLOWS, max_flows);
+	nla_nest_end(msg, txqattr);
+
+#undef PUT_TXQVAL_U32
+	return true;
+}
+
 /* netlink command implementations */
 
 struct key_parse {
@@ -1973,6 +2010,28 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
 				rdev->wiphy.nan_supported_bands))
 			goto nla_put_failure;
 
+		if (wiphy_ext_feature_isset(&rdev->wiphy,
+					    NL80211_EXT_FEATURE_TXQS)) {
+			struct cfg80211_txq_stats txqstats = {};
+			int res;
+
+			res = rdev_get_txq_stats(rdev, NULL, &txqstats);
+			if (!res &&
+			    !nl80211_put_txq_stats(msg, &txqstats,
+						   NL80211_ATTR_TXQ_STATS))
+				goto nla_put_failure;
+
+			if (nla_put_u32(msg, NL80211_ATTR_TXQ_LIMIT,
+					rdev->wiphy.txq_limit))
+				goto nla_put_failure;
+			if (nla_put_u32(msg, NL80211_ATTR_TXQ_MEMORY_LIMIT,
+					rdev->wiphy.txq_memory_limit))
+				goto nla_put_failure;
+			if (nla_put_u32(msg, NL80211_ATTR_TXQ_QUANTUM,
+					rdev->wiphy.txq_quantum))
+				goto nla_put_failure;
+		}
+
 		/* done */
 		state->split_start = 0;
 		break;
@@ -2350,6 +2409,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 	u8 retry_short = 0, retry_long = 0;
 	u32 frag_threshold = 0, rts_threshold = 0;
 	u8 coverage_class = 0;
+	u32 txq_limit = 0, txq_memory_limit = 0, txq_quantum = 0;
 
 	ASSERT_RTNL();
 
@@ -2556,10 +2616,38 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 		changed |= WIPHY_PARAM_DYN_ACK;
 	}
 
+	if (info->attrs[NL80211_ATTR_TXQ_LIMIT]) {
+		if (!wiphy_ext_feature_isset(&rdev->wiphy,
+					     NL80211_EXT_FEATURE_TXQS))
+			return -EOPNOTSUPP;
+		txq_limit = nla_get_u32(
+			info->attrs[NL80211_ATTR_TXQ_LIMIT]);
+		changed |= WIPHY_PARAM_TXQ_LIMIT;
+	}
+
+	if (info->attrs[NL80211_ATTR_TXQ_MEMORY_LIMIT]) {
+		if (!wiphy_ext_feature_isset(&rdev->wiphy,
+					     NL80211_EXT_FEATURE_TXQS))
+			return -EOPNOTSUPP;
+		txq_memory_limit = nla_get_u32(
+			info->attrs[NL80211_ATTR_TXQ_MEMORY_LIMIT]);
+		changed |= WIPHY_PARAM_TXQ_MEMORY_LIMIT;
+	}
+
+	if (info->attrs[NL80211_ATTR_TXQ_QUANTUM]) {
+		if (!wiphy_ext_feature_isset(&rdev->wiphy,
+					     NL80211_EXT_FEATURE_TXQS))
+			return -EOPNOTSUPP;
+		txq_quantum = nla_get_u32(
+			info->attrs[NL80211_ATTR_TXQ_QUANTUM]);
+		changed |= WIPHY_PARAM_TXQ_QUANTUM;
+	}
+
 	if (changed) {
 		u8 old_retry_short, old_retry_long;
 		u32 old_frag_threshold, old_rts_threshold;
 		u8 old_coverage_class;
+		u32 old_txq_limit, old_txq_memory_limit, old_txq_quantum;
 
 		if (!rdev->ops->set_wiphy_params)
 			return -EOPNOTSUPP;
@@ -2569,6 +2657,9 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 		old_frag_threshold = rdev->wiphy.frag_threshold;
 		old_rts_threshold = rdev->wiphy.rts_threshold;
 		old_coverage_class = rdev->wiphy.coverage_class;
+		old_txq_limit = rdev->wiphy.txq_limit;
+		old_txq_memory_limit = rdev->wiphy.txq_memory_limit;
+		old_txq_quantum = rdev->wiphy.txq_quantum;
 
 		if (changed & WIPHY_PARAM_RETRY_SHORT)
 			rdev->wiphy.retry_short = retry_short;
@@ -2580,6 +2671,12 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 			rdev->wiphy.rts_threshold = rts_threshold;
 		if (changed & WIPHY_PARAM_COVERAGE_CLASS)
 			rdev->wiphy.coverage_class = coverage_class;
+		if (changed & WIPHY_PARAM_TXQ_LIMIT)
+			rdev->wiphy.txq_limit = txq_limit;
+		if (changed & WIPHY_PARAM_TXQ_MEMORY_LIMIT)
+			rdev->wiphy.txq_memory_limit = txq_memory_limit;
+		if (changed & WIPHY_PARAM_TXQ_QUANTUM)
+			rdev->wiphy.txq_quantum = txq_quantum;
 
 		result = rdev_set_wiphy_params(rdev, changed);
 		if (result) {
@@ -2588,6 +2685,9 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 			rdev->wiphy.frag_threshold = old_frag_threshold;
 			rdev->wiphy.rts_threshold = old_rts_threshold;
 			rdev->wiphy.coverage_class = old_coverage_class;
+			rdev->wiphy.txq_limit = old_txq_limit;
+			rdev->wiphy.txq_memory_limit = old_txq_memory_limit;
+			rdev->wiphy.txq_quantum = old_txq_quantum;
 			return result;
 		}
 	}
@@ -2709,6 +2809,16 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag
 	}
 	wdev_unlock(wdev);
 
+	if (rdev->ops->get_txq_stats) {
+		struct cfg80211_txq_stats txqstats = {};
+		int ret = rdev_get_txq_stats(rdev, wdev, &txqstats);
+
+		if (ret == 0 &&
+		    !nl80211_put_txq_stats(msg, &txqstats,
+					   NL80211_ATTR_TXQ_STATS))
+			goto nla_put_failure;
+	}
+
 	genlmsg_end(msg, hdr);
 	return 0;
 
@@ -4582,6 +4692,12 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
 			PUT_TIDVAL_U64(TX_MSDU_FAILED, tx_msdu_failed);
 
 #undef PUT_TIDVAL_U64
+			if ((tidstats->filled &
+			     BIT(NL80211_TID_STATS_TXQ_STATS)) &&
+			    !nl80211_put_txq_stats(msg, &tidstats->txq_stats,
+						   NL80211_TID_STATS_TXQ_STATS))
+				goto nla_put_failure;
+
 			nla_nest_end(msg, tidattr);
 		}
 
@@ -4606,13 +4722,17 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
 static int nl80211_dump_station(struct sk_buff *skb,
 				struct netlink_callback *cb)
 {
-	struct station_info sinfo;
+	struct station_info *sinfo;
 	struct cfg80211_registered_device *rdev;
 	struct wireless_dev *wdev;
 	u8 mac_addr[ETH_ALEN];
 	int sta_idx = cb->args[2];
 	int err;
 
+	sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL);
+	if (!sinfo)
+		return -ENOMEM;
+
 	rtnl_lock();
 	err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
 	if (err)
@@ -4629,9 +4749,9 @@ static int nl80211_dump_station(struct sk_buff *skb,
 	}
 
 	while (1) {
-		memset(&sinfo, 0, sizeof(sinfo));
+		memset(sinfo, 0, sizeof(*sinfo));
 		err = rdev_dump_station(rdev, wdev->netdev, sta_idx,
-					mac_addr, &sinfo);
+					mac_addr, sinfo);
 		if (err == -ENOENT)
 			break;
 		if (err)
@@ -4641,7 +4761,7 @@ static int nl80211_dump_station(struct sk_buff *skb,
 				NETLINK_CB(cb->skb).portid,
 				cb->nlh->nlmsg_seq, NLM_F_MULTI,
 				rdev, wdev->netdev, mac_addr,
-				&sinfo) < 0)
+				sinfo) < 0)
 			goto out;
 
 		sta_idx++;
@@ -4652,6 +4772,7 @@ static int nl80211_dump_station(struct sk_buff *skb,
 	err = skb->len;
  out_err:
 	rtnl_unlock();
+	kfree(sinfo);
 
 	return err;
 }
@@ -4660,37 +4781,49 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct net_device *dev = info->user_ptr[1];
-	struct station_info sinfo;
+	struct station_info *sinfo;
 	struct sk_buff *msg;
 	u8 *mac_addr = NULL;
 	int err;
 
-	memset(&sinfo, 0, sizeof(sinfo));
+	sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL);
+	if (!sinfo)
+		return -ENOMEM;
 
-	if (!info->attrs[NL80211_ATTR_MAC])
-		return -EINVAL;
+	if (!info->attrs[NL80211_ATTR_MAC]) {
+		err = -EINVAL;
+		goto out;
+	}
 
 	mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
 
-	if (!rdev->ops->get_station)
-		return -EOPNOTSUPP;
+	if (!rdev->ops->get_station) {
+		err = -EOPNOTSUPP;
+		goto out;
+	}
 
-	err = rdev_get_station(rdev, dev, mac_addr, &sinfo);
+	err = rdev_get_station(rdev, dev, mac_addr, sinfo);
 	if (err)
-		return err;
+		goto out;
 
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!msg)
-		return -ENOMEM;
+	if (!msg) {
+		err = -ENOMEM;
+		goto out;
+	}
 
 	if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION,
 				 info->snd_portid, info->snd_seq, 0,
-				 rdev, dev, mac_addr, &sinfo) < 0) {
+				 rdev, dev, mac_addr, sinfo) < 0) {
 		nlmsg_free(msg);
-		return -ENOBUFS;
+		err = -ENOBUFS;
+		goto out;
 	}
 
-	return genlmsg_reply(msg, info);
+	err = genlmsg_reply(msg, info);
+out:
+	kfree(sinfo);
+	return err;
 }
 
 int cfg80211_check_station_change(struct wiphy *wiphy,
@@ -9954,18 +10087,26 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev,
 	 */
 	if (!wdev->cqm_config->last_rssi_event_value && wdev->current_bss &&
 	    rdev->ops->get_station) {
-		struct station_info sinfo = {};
+		struct station_info *sinfo;
 		u8 *mac_addr;
 
+		sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL);
+		if (!sinfo)
+			return -ENOMEM;
+
 		mac_addr = wdev->current_bss->pub.bssid;
 
-		err = rdev_get_station(rdev, dev, mac_addr, &sinfo);
-		if (err)
+		err = rdev_get_station(rdev, dev, mac_addr, sinfo);
+		if (err) {
+			kfree(sinfo);
 			return err;
+		}
 
-		if (sinfo.filled & BIT(NL80211_STA_INFO_BEACON_SIGNAL_AVG))
+		if (sinfo->filled & BIT(NL80211_STA_INFO_BEACON_SIGNAL_AVG))
 			wdev->cqm_config->last_rssi_event_value =
-				(s8) sinfo.rx_beacon_signal_avg;
+				(s8)sinfo->rx_beacon_signal_avg;
+
+		kfree(sinfo);
 	}
 
 	last = wdev->cqm_config->last_rssi_event_value;
@@ -14499,25 +14640,32 @@ void cfg80211_del_sta_sinfo(struct net_device *dev, const u8 *mac_addr,
 	struct wiphy *wiphy = dev->ieee80211_ptr->wiphy;
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
 	struct sk_buff *msg;
-	struct station_info empty_sinfo = {};
+	struct station_info *empty_sinfo = NULL;
 
-	if (!sinfo)
-		sinfo = &empty_sinfo;
+	if (!sinfo) {
+		empty_sinfo = kzalloc(sizeof(*empty_sinfo), GFP_KERNEL);
+		if (!empty_sinfo)
+			return;
+		sinfo = empty_sinfo;
+	}
 
 	trace_cfg80211_del_sta(dev, mac_addr);
 
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
 	if (!msg)
-		return;
+		goto out;
 
 	if (nl80211_send_station(msg, NL80211_CMD_DEL_STATION, 0, 0, 0,
 				 rdev, dev, mac_addr, sinfo) < 0) {
 		nlmsg_free(msg);
-		return;
+		goto out;
 	}
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				NL80211_MCGRP_MLME, gfp);
+
+out:
+	kfree(empty_sinfo);
 }
 EXPORT_SYMBOL(cfg80211_del_sta_sinfo);
 
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 87479a53411b..364f5d67f05b 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -586,6 +586,18 @@ rdev_set_multicast_to_unicast(struct cfg80211_registered_device *rdev,
 	return ret;
 }
 
+static inline int
+rdev_get_txq_stats(struct cfg80211_registered_device *rdev,
+		   struct wireless_dev *wdev,
+		   struct cfg80211_txq_stats *txqstats)
+{
+	int ret;
+	trace_rdev_get_txq_stats(&rdev->wiphy, wdev);
+	ret = rdev->ops->get_txq_stats(&rdev->wiphy, wdev, txqstats);
+	trace_rdev_return_int(&rdev->wiphy, ret);
+	return ret;
+}
+
 static inline void rdev_rfkill_poll(struct cfg80211_registered_device *rdev)
 {
 	trace_rdev_rfkill_poll(&rdev->wiphy);
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 55fb279a5196..2b417a2fe63f 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -3243,6 +3243,20 @@ TRACE_EVENT(rdev_set_multicast_to_unicast,
 		  WIPHY_PR_ARG, NETDEV_PR_ARG,
 		  BOOL_TO_STR(__entry->enabled))
 );
+
+TRACE_EVENT(rdev_get_txq_stats,
+	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
+	TP_ARGS(wiphy, wdev),
+	TP_STRUCT__entry(
+		WIPHY_ENTRY
+		WDEV_ENTRY
+	),
+	TP_fast_assign(
+		WIPHY_ASSIGN;
+		WDEV_ASSIGN;
+	),
+	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG)
+);
 #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index 05186a47878f..9e002df0f8d8 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -1254,7 +1254,7 @@ static int cfg80211_wext_giwrate(struct net_device *dev,
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
-	struct station_info sinfo = {};
+	struct station_info *sinfo;
 	u8 addr[ETH_ALEN];
 	int err;
 
@@ -1274,16 +1274,23 @@ static int cfg80211_wext_giwrate(struct net_device *dev,
 	if (err)
 		return err;
 
-	err = rdev_get_station(rdev, dev, addr, &sinfo);
-	if (err)
-		return err;
+	sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL);
+	if (!sinfo)
+		return -ENOMEM;
 
-	if (!(sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)))
-		return -EOPNOTSUPP;
+	err = rdev_get_station(rdev, dev, addr, sinfo);
+	if (err)
+		goto out;
 
-	rate->value = 100000 * cfg80211_calculate_bitrate(&sinfo.txrate);
+	if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_BITRATE))) {
+		err = -EOPNOTSUPP;
+		goto out;
+	}
 
-	return 0;
+	rate->value = 100000 * cfg80211_calculate_bitrate(&sinfo->txrate);
+out:
+	kfree(sinfo);
+	return err;
 }
 
 /* Get wireless statistics.  Called by /proc/net/wireless and by SIOCGIWSTATS */
-- 
cgit v1.2.3-59-g8ed1b


From 2fe4a29a452a68ffa8a501000d0ef8095c242eba Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen <toke@toke.dk>
Date: Tue, 8 May 2018 13:03:50 +0200
Subject: mac80211: Support the new cfg80211 TXQ stats API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds support to mac80211 to export TXQ stats via the newly added
cfg80211 API.

Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/cfg.c         | 99 ++++++++++++++++++++++++++++++++++++++++++++++
 net/mac80211/ieee80211_i.h |  3 ++
 net/mac80211/main.c        |  3 ++
 net/mac80211/sta_info.c    | 12 ++++++
 net/mac80211/tx.c          | 20 ++++++++++
 5 files changed, 137 insertions(+)

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 85dbaa891059..5ce9d121af2b 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -2376,6 +2376,11 @@ static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed)
 	    (WIPHY_PARAM_RETRY_SHORT | WIPHY_PARAM_RETRY_LONG))
 		ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_RETRY_LIMITS);
 
+	if (changed & (WIPHY_PARAM_TXQ_LIMIT |
+		       WIPHY_PARAM_TXQ_MEMORY_LIMIT |
+		       WIPHY_PARAM_TXQ_QUANTUM))
+		ieee80211_txq_set_params(local);
+
 	return 0;
 }
 
@@ -3705,6 +3710,99 @@ static int ieee80211_set_multicast_to_unicast(struct wiphy *wiphy,
 	return 0;
 }
 
+void ieee80211_fill_txq_stats(struct cfg80211_txq_stats *txqstats,
+			      struct txq_info *txqi)
+{
+	if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_BACKLOG_BYTES))) {
+		txqstats->filled |= BIT(NL80211_TXQ_STATS_BACKLOG_BYTES);
+		txqstats->backlog_bytes = txqi->tin.backlog_bytes;
+	}
+
+	if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_BACKLOG_PACKETS))) {
+		txqstats->filled |= BIT(NL80211_TXQ_STATS_BACKLOG_PACKETS);
+		txqstats->backlog_packets = txqi->tin.backlog_packets;
+	}
+
+	if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_FLOWS))) {
+		txqstats->filled |= BIT(NL80211_TXQ_STATS_FLOWS);
+		txqstats->flows = txqi->tin.flows;
+	}
+
+	if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_DROPS))) {
+		txqstats->filled |= BIT(NL80211_TXQ_STATS_DROPS);
+		txqstats->drops = txqi->cstats.drop_count;
+	}
+
+	if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_ECN_MARKS))) {
+		txqstats->filled |= BIT(NL80211_TXQ_STATS_ECN_MARKS);
+		txqstats->ecn_marks = txqi->cstats.ecn_mark;
+	}
+
+	if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_OVERLIMIT))) {
+		txqstats->filled |= BIT(NL80211_TXQ_STATS_OVERLIMIT);
+		txqstats->overlimit = txqi->tin.overlimit;
+	}
+
+	if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_COLLISIONS))) {
+		txqstats->filled |= BIT(NL80211_TXQ_STATS_COLLISIONS);
+		txqstats->collisions = txqi->tin.collisions;
+	}
+
+	if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_TX_BYTES))) {
+		txqstats->filled |= BIT(NL80211_TXQ_STATS_TX_BYTES);
+		txqstats->tx_bytes = txqi->tin.tx_bytes;
+	}
+
+	if (!(txqstats->filled & BIT(NL80211_TXQ_STATS_TX_PACKETS))) {
+		txqstats->filled |= BIT(NL80211_TXQ_STATS_TX_PACKETS);
+		txqstats->tx_packets = txqi->tin.tx_packets;
+	}
+}
+
+static int ieee80211_get_txq_stats(struct wiphy *wiphy,
+				   struct wireless_dev *wdev,
+				   struct cfg80211_txq_stats *txqstats)
+{
+	struct ieee80211_local *local = wiphy_priv(wiphy);
+	struct ieee80211_sub_if_data *sdata;
+	int ret = 0;
+
+	if (!local->ops->wake_tx_queue)
+		return 1;
+
+	spin_lock_bh(&local->fq.lock);
+	rcu_read_lock();
+
+	if (wdev) {
+		sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+		if (!sdata->vif.txq) {
+			ret = 1;
+			goto out;
+		}
+		ieee80211_fill_txq_stats(txqstats, to_txq_info(sdata->vif.txq));
+	} else {
+		/* phy stats */
+		txqstats->filled |= BIT(NL80211_TXQ_STATS_BACKLOG_PACKETS) |
+				    BIT(NL80211_TXQ_STATS_BACKLOG_BYTES) |
+				    BIT(NL80211_TXQ_STATS_OVERLIMIT) |
+				    BIT(NL80211_TXQ_STATS_OVERMEMORY) |
+				    BIT(NL80211_TXQ_STATS_COLLISIONS) |
+				    BIT(NL80211_TXQ_STATS_MAX_FLOWS);
+		txqstats->backlog_packets = local->fq.backlog;
+		txqstats->backlog_bytes = local->fq.memory_usage;
+		txqstats->overlimit = local->fq.overlimit;
+		txqstats->overmemory = local->fq.overmemory;
+		txqstats->collisions = local->fq.collisions;
+		txqstats->max_flows = local->fq.flows_cnt;
+	}
+
+out:
+	rcu_read_unlock();
+	spin_unlock_bh(&local->fq.lock);
+
+	return ret;
+}
+
 const struct cfg80211_ops mac80211_config_ops = {
 	.add_virtual_intf = ieee80211_add_iface,
 	.del_virtual_intf = ieee80211_del_iface,
@@ -3798,4 +3896,5 @@ const struct cfg80211_ops mac80211_config_ops = {
 	.del_nan_func = ieee80211_del_nan_func,
 	.set_multicast_to_unicast = ieee80211_set_multicast_to_unicast,
 	.tx_control_port = ieee80211_tx_control_port,
+	.get_txq_stats = ieee80211_get_txq_stats,
 };
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 6372dbdadf53..d1978aa1c15d 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -2012,6 +2012,7 @@ static inline bool ieee80211_can_run_worker(struct ieee80211_local *local)
 }
 
 int ieee80211_txq_setup_flows(struct ieee80211_local *local);
+void ieee80211_txq_set_params(struct ieee80211_local *local);
 void ieee80211_txq_teardown_flows(struct ieee80211_local *local);
 void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
 			struct sta_info *sta,
@@ -2020,6 +2021,8 @@ void ieee80211_txq_purge(struct ieee80211_local *local,
 			 struct txq_info *txqi);
 void ieee80211_txq_remove_vlan(struct ieee80211_local *local,
 			       struct ieee80211_sub_if_data *sdata);
+void ieee80211_fill_txq_stats(struct cfg80211_txq_stats *txqstats,
+			      struct txq_info *txqi);
 void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
 			 u16 transaction, u16 auth_alg, u16 status,
 			 const u8 *extra, size_t extra_len, const u8 *bssid,
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 9ea17afaa237..4d2e797e3f16 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -565,6 +565,9 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
 	if (!ops->set_key)
 		wiphy->flags |= WIPHY_FLAG_IBSS_RSN;
 
+	if (ops->wake_tx_queue)
+		wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_TXQS);
+
 	wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_RRM);
 
 	wiphy->bss_priv_size = sizeof(struct ieee80211_bss);
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index f83e6e2ad5ab..43f34aa873bc 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -2052,6 +2052,18 @@ static void sta_set_tidstats(struct sta_info *sta,
 		tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_FAILED);
 		tidstats->tx_msdu_failed = sta->status_stats.msdu_failed[tid];
 	}
+
+	if (local->ops->wake_tx_queue && tid < IEEE80211_NUM_TIDS) {
+		spin_lock_bh(&local->fq.lock);
+		rcu_read_lock();
+
+		tidstats->filled |= BIT(NL80211_TID_STATS_TXQ_STATS);
+		ieee80211_fill_txq_stats(&tidstats->txq_stats,
+					 to_txq_info(sta->sta.txq[tid]));
+
+		rcu_read_unlock();
+		spin_unlock_bh(&local->fq.lock);
+	}
 }
 
 static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats)
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 062e125a324c..8275a58450b2 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1459,6 +1459,24 @@ void ieee80211_txq_purge(struct ieee80211_local *local,
 	ieee80211_purge_tx_queue(&local->hw, &txqi->frags);
 }
 
+void ieee80211_txq_set_params(struct ieee80211_local *local)
+{
+	if (local->hw.wiphy->txq_limit)
+		local->fq.limit = local->hw.wiphy->txq_limit;
+	else
+		local->hw.wiphy->txq_limit = local->fq.limit;
+
+	if (local->hw.wiphy->txq_memory_limit)
+		local->fq.memory_limit = local->hw.wiphy->txq_memory_limit;
+	else
+		local->hw.wiphy->txq_memory_limit = local->fq.memory_limit;
+
+	if (local->hw.wiphy->txq_quantum)
+		local->fq.quantum = local->hw.wiphy->txq_quantum;
+	else
+		local->hw.wiphy->txq_quantum = local->fq.quantum;
+}
+
 int ieee80211_txq_setup_flows(struct ieee80211_local *local)
 {
 	struct fq *fq = &local->fq;
@@ -1508,6 +1526,8 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local)
 	for (i = 0; i < fq->flows_cnt; i++)
 		codel_vars_init(&local->cvars[i]);
 
+	ieee80211_txq_set_params(local);
+
 	return 0;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 57c6cb81717f957fb741f2e4c79bd0e2f4f55910 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Tue, 8 May 2018 13:57:32 +0100
Subject: mac80211: ethtool: avoid 32 bit multiplication overflow

The multiplication of 100000 * cfg80211_calculate_bitrate() is a 32 bit
operation and can overflow if cfg80211_calculate_bitrate is greater
than 42949. Although I don't believe this is occurring at present, it
would be safer to avoid the potential overflow by making the constant
100000 an ULL to ensure a 64 multiplication occurs.

Detected by CoverityScan, CID#1468643 ("Unintentional integer overflow")

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/ethtool.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c
index 1afeff94af8b..09210aa8ea9a 100644
--- a/net/mac80211/ethtool.c
+++ b/net/mac80211/ethtool.c
@@ -121,11 +121,11 @@ static void ieee80211_get_stats(struct net_device *dev,
 
 
 		if (sinfo->filled & BIT(NL80211_STA_INFO_TX_BITRATE))
-			data[i] = 100000 *
+			data[i] = 100000ULL *
 				cfg80211_calculate_bitrate(&sinfo->txrate);
 		i++;
 		if (sinfo->filled & BIT(NL80211_STA_INFO_RX_BITRATE))
-			data[i] = 100000 *
+			data[i] = 100000ULL *
 				cfg80211_calculate_bitrate(&sinfo->rxrate);
 		i++;
 
-- 
cgit v1.2.3-59-g8ed1b


From cc559c1ac250a6025bd4a9528e424b8da250655b Mon Sep 17 00:00:00 2001
From: Michael Chan <michael.chan@broadcom.com>
Date: Tue, 8 May 2018 03:18:38 -0400
Subject: bnxt_en: Fix firmware message delay loop regression.

A recent change to reduce delay granularity waiting for firmware
reponse has caused a regression.  With a tighter delay loop,
the driver may see the beginning part of the response faster.
The original 5 usec delay to wait for the rest of the message
is not long enough and some messages are detected as invalid.

Increase the maximum wait time from 5 usec to 20 usec.  Also, fix
the debug message that shows the total delay time for the response
when the message times out.  With the new logic, the delay time
is not fixed per iteration of the loop, so we define a macro to
show the total delay time.

Fixes: 9751e8e71487 ("bnxt_en: reduce timeout on initial HWRM calls")
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 12 ++++++++----
 drivers/net/ethernet/broadcom/bnxt/bnxt.h |  7 +++++++
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index efe5c7203f60..168342ac35c5 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -3530,6 +3530,8 @@ static int bnxt_hwrm_do_send_msg(struct bnxt *bp, void *msg, u32 msg_len,
 		      HWRM_RESP_LEN_SFT;
 		valid = bp->hwrm_cmd_resp_addr + len - 1;
 	} else {
+		int j;
+
 		/* Check if response len is updated */
 		for (i = 0; i < tmo_count; i++) {
 			len = (le32_to_cpu(*resp_len) & HWRM_RESP_LEN_MASK) >>
@@ -3547,14 +3549,15 @@ static int bnxt_hwrm_do_send_msg(struct bnxt *bp, void *msg, u32 msg_len,
 
 		if (i >= tmo_count) {
 			netdev_err(bp->dev, "Error (timeout: %d) msg {0x%x 0x%x} len:%d\n",
-				   timeout, le16_to_cpu(req->req_type),
+				   HWRM_TOTAL_TIMEOUT(i),
+				   le16_to_cpu(req->req_type),
 				   le16_to_cpu(req->seq_id), len);
 			return -1;
 		}
 
 		/* Last byte of resp contains valid bit */
 		valid = bp->hwrm_cmd_resp_addr + len - 1;
-		for (i = 0; i < 5; i++) {
+		for (j = 0; j < HWRM_VALID_BIT_DELAY_USEC; j++) {
 			/* make sure we read from updated DMA memory */
 			dma_rmb();
 			if (*valid)
@@ -3562,9 +3565,10 @@ static int bnxt_hwrm_do_send_msg(struct bnxt *bp, void *msg, u32 msg_len,
 			udelay(1);
 		}
 
-		if (i >= 5) {
+		if (j >= HWRM_VALID_BIT_DELAY_USEC) {
 			netdev_err(bp->dev, "Error (timeout: %d) msg {0x%x 0x%x} len:%d v:%d\n",
-				   timeout, le16_to_cpu(req->req_type),
+				   HWRM_TOTAL_TIMEOUT(i),
+				   le16_to_cpu(req->req_type),
 				   le16_to_cpu(req->seq_id), len, *valid);
 			return -1;
 		}
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 8df1d8b9d2e3..a9c210e70868 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -539,6 +539,13 @@ struct rx_tpa_end_cmp_ext {
 #define HWRM_MIN_TIMEOUT		25
 #define HWRM_MAX_TIMEOUT		40
 
+#define HWRM_TOTAL_TIMEOUT(n)	(((n) <= HWRM_SHORT_TIMEOUT_COUNTER) ?	\
+	((n) * HWRM_SHORT_MIN_TIMEOUT) :				\
+	(HWRM_SHORT_TIMEOUT_COUNTER * HWRM_SHORT_MIN_TIMEOUT +		\
+	 ((n) - HWRM_SHORT_TIMEOUT_COUNTER) * HWRM_MIN_TIMEOUT))
+
+#define HWRM_VALID_BIT_DELAY_USEC	20
+
 #define BNXT_RX_EVENT	1
 #define BNXT_AGG_EVENT	2
 #define BNXT_TX_EVENT	4
-- 
cgit v1.2.3-59-g8ed1b


From dac0490718bd17df5e3995ffca14255e5f9ed22d Mon Sep 17 00:00:00 2001
From: Michael Chan <michael.chan@broadcom.com>
Date: Tue, 8 May 2018 03:18:39 -0400
Subject: bnxt_en: Check unsupported speeds in bnxt_update_link() on PF only.

Only non-NPAR PFs need to actively check and manage unsupported link
speeds.  NPAR functions and VFs do not control the link speed and
should skip the unsupported speed detection logic, to avoid warning
messages from firmware rejecting the unsupported firmware calls.

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 168342ac35c5..cd3ab788936f 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -6462,6 +6462,9 @@ static int bnxt_update_link(struct bnxt *bp, bool chng_link_state)
 	}
 	mutex_unlock(&bp->hwrm_cmd_lock);
 
+	if (!BNXT_SINGLE_PF(bp))
+		return 0;
+
 	diff = link_info->support_auto_speeds ^ link_info->advertising;
 	if ((link_info->support_auto_speeds | diff) !=
 	    link_info->support_auto_speeds) {
-- 
cgit v1.2.3-59-g8ed1b


From 7328a23c063a9ecf56314fb9631889c1820bd0ce Mon Sep 17 00:00:00 2001
From: Vasundhara Volam <vasundhara-v.volam@broadcom.com>
Date: Tue, 8 May 2018 03:18:40 -0400
Subject: bnxt_en: Read phy eeprom A2h address only when optical diagnostics is
 supported.

For SFP+ modules, 0xA2 page is available only when Diagnostic Monitoring
Type [Address A0h, Byte 92] is implemented. Extend bnxt_get_module_info(),
to read optical diagnostics support at offset 92(0x5c) and set eeprom_len
length to ETH_MODULE_SFF_8436_LEN (to exclude A2 page), if dianostics is
not supported.

Also in bnxt_get_module_info(), module id is read from offset 0x5e which
is not correct. It was working by accident, as offset was not effective
without setting enables flag in the firmware request. SFP module id is
present at location 0. Fix this by removing the offset and read it
from location 0.

Signed-off-by: Vasundhara Volam <vasundhara-v.volam@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.h         |  3 +--
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 20 ++++++++------------
 2 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index a9c210e70868..9b14eb610b9f 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -1414,8 +1414,7 @@ struct bnxt {
 
 #define I2C_DEV_ADDR_A0				0xa0
 #define I2C_DEV_ADDR_A2				0xa2
-#define SFP_EEPROM_SFF_8472_COMP_ADDR		0x5e
-#define SFP_EEPROM_SFF_8472_COMP_SIZE		1
+#define SFF_DIAG_SUPPORT_OFFSET			0x5c
 #define SFF_MODULE_ID_SFP			0x3
 #define SFF_MODULE_ID_QSFP			0xc
 #define SFF_MODULE_ID_QSFP_PLUS			0xd
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index ad98b78f5aa1..7270c8b0cef3 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -2184,9 +2184,8 @@ static int bnxt_read_sfp_module_eeprom_info(struct bnxt *bp, u16 i2c_addr,
 static int bnxt_get_module_info(struct net_device *dev,
 				struct ethtool_modinfo *modinfo)
 {
+	u8 data[SFF_DIAG_SUPPORT_OFFSET + 1];
 	struct bnxt *bp = netdev_priv(dev);
-	struct hwrm_port_phy_i2c_read_input req = {0};
-	struct hwrm_port_phy_i2c_read_output *output = bp->hwrm_cmd_resp_addr;
 	int rc;
 
 	/* No point in going further if phy status indicates
@@ -2201,21 +2200,19 @@ static int bnxt_get_module_info(struct net_device *dev,
 	if (bp->hwrm_spec_code < 0x10202)
 		return -EOPNOTSUPP;
 
-	bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_PORT_PHY_I2C_READ, -1, -1);
-	req.i2c_slave_addr = I2C_DEV_ADDR_A0;
-	req.page_number = 0;
-	req.page_offset = cpu_to_le16(SFP_EEPROM_SFF_8472_COMP_ADDR);
-	req.data_length = SFP_EEPROM_SFF_8472_COMP_SIZE;
-	req.port_id = cpu_to_le16(bp->pf.port_id);
-	mutex_lock(&bp->hwrm_cmd_lock);
-	rc = _hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
+	rc = bnxt_read_sfp_module_eeprom_info(bp, I2C_DEV_ADDR_A0, 0, 0,
+					      SFF_DIAG_SUPPORT_OFFSET + 1,
+					      data);
 	if (!rc) {
-		u32 module_id = le32_to_cpu(output->data[0]);
+		u8 module_id = data[0];
+		u8 diag_supported = data[SFF_DIAG_SUPPORT_OFFSET];
 
 		switch (module_id) {
 		case SFF_MODULE_ID_SFP:
 			modinfo->type = ETH_MODULE_SFF_8472;
 			modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN;
+			if (!diag_supported)
+				modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN;
 			break;
 		case SFF_MODULE_ID_QSFP:
 		case SFF_MODULE_ID_QSFP_PLUS:
@@ -2231,7 +2228,6 @@ static int bnxt_get_module_info(struct net_device *dev,
 			break;
 		}
 	}
-	mutex_unlock(&bp->hwrm_cmd_lock);
 	return rc;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 707e7e96602675beb5e09bb994195663da6eb56d Mon Sep 17 00:00:00 2001
From: Michael Chan <michael.chan@broadcom.com>
Date: Tue, 8 May 2018 03:18:41 -0400
Subject: bnxt_en: Always forward VF MAC address to the PF.

The current code already forwards the VF MAC address to the PF, except
in one case.  If the VF driver gets a valid MAC address from the firmware
during probe time, it will not forward the MAC address to the PF,
incorrectly assuming that the PF already knows the MAC address.  This
causes "ip link show" to show zero VF MAC addresses for this case.

This assumption is not correct.  Newer firmware remembers the VF MAC
address last used by the VF and provides it to the VF driver during
probe.  So we need to always forward the VF MAC address to the PF.

The forwarded MAC address may now be the PF assigned MAC address and so we
need to make sure we approve it for this case.

Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c       | 2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index cd3ab788936f..dfa0839f6656 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -8678,8 +8678,8 @@ static int bnxt_init_mac_addr(struct bnxt *bp)
 			memcpy(bp->dev->dev_addr, vf->mac_addr, ETH_ALEN);
 		} else {
 			eth_hw_addr_random(bp->dev);
-			rc = bnxt_approve_mac(bp, bp->dev->dev_addr);
 		}
+		rc = bnxt_approve_mac(bp, bp->dev->dev_addr);
 #endif
 	}
 	return rc;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
index cc21d874eb6e..a64910892c25 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
@@ -923,7 +923,8 @@ static int bnxt_vf_configure_mac(struct bnxt *bp, struct bnxt_vf_info *vf)
 	if (req->enables & cpu_to_le32(FUNC_VF_CFG_REQ_ENABLES_DFLT_MAC_ADDR)) {
 		if (is_valid_ether_addr(req->dflt_mac_addr) &&
 		    ((vf->flags & BNXT_VF_TRUST) ||
-		     (!is_valid_ether_addr(vf->mac_addr)))) {
+		     !is_valid_ether_addr(vf->mac_addr) ||
+		     ether_addr_equal(req->dflt_mac_addr, vf->mac_addr))) {
 			ether_addr_copy(vf->vf_mac_addr, req->dflt_mac_addr);
 			return bnxt_hwrm_exec_fwd_resp(bp, vf, msg_size);
 		}
-- 
cgit v1.2.3-59-g8ed1b


From 2b999ba899059eed00a03d029894a62486e7e2bc Mon Sep 17 00:00:00 2001
From: Antoine Tenart <antoine.tenart@bootlin.com>
Date: Fri, 4 May 2018 17:21:03 +0200
Subject: net: phy: sfp: handle cases where neither BR, min nor BR, max is
 given

When computing the bitrate using values read from an SFP module EEPROM,
we use the nominal BR plus BR,min and BR,max to determine the
boundaries. But in some cases BR,min and BR,max aren't provided, which
led the SFP code to end up having the nominal value for both the minimum
and maximum bitrate values. When using a passive cable, the nominal
value should be used as the maximum one, and there is no minimum one
so we should use 0.

Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
Acked-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/sfp-bus.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c
index 0381da78d228..b2f4519dfb84 100644
--- a/drivers/net/phy/sfp-bus.c
+++ b/drivers/net/phy/sfp-bus.c
@@ -132,6 +132,13 @@ void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id,
 			br_max = br_nom + br_nom * id->ext.br_min / 100;
 			br_min = br_nom - br_nom * id->ext.br_min / 100;
 		}
+
+		/* When using passive cables, in case neither BR,min nor BR,max
+		 * are specified, set br_min to 0 as the nominal value is then
+		 * used as the maximum.
+		 */
+		if (br_min == br_max && id->base.sfp_ct_passive)
+			br_min = 0;
 	}
 
 	/* Set ethtool support from the compliance fields. */
-- 
cgit v1.2.3-59-g8ed1b


From 53a7bdfb2a2756cce8003b90817f8a6fb4d830d9 Mon Sep 17 00:00:00 2001
From: Fabio Estevam <fabio.estevam@nxp.com>
Date: Mon, 7 May 2018 09:17:51 -0300
Subject: dt-bindings: dsa: Remove unnecessary #address/#size-cells

If the example binding is used on a real dts file, the following DTC
warning is seen with W=1:

arch/arm/boot/dts/imx6q-b450v3.dtb: Warning (avoid_unnecessary_addr_size): /mdio-gpio/switch@0: unnecessary #address-cells/#size-cells without "ranges" or child "reg" property

Remove unnecessary #address-cells/#size-cells to improve the binding
document examples.

Signed-off-by: Fabio Estevam <fabio.estevam@nxp.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/devicetree/bindings/net/dsa/dsa.txt | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/dsa/dsa.txt b/Documentation/devicetree/bindings/net/dsa/dsa.txt
index cfe8f64eca4f..3ceeb8de1196 100644
--- a/Documentation/devicetree/bindings/net/dsa/dsa.txt
+++ b/Documentation/devicetree/bindings/net/dsa/dsa.txt
@@ -82,8 +82,6 @@ linked into one DSA cluster.
 
 	switch0: switch0@0 {
 		compatible = "marvell,mv88e6085";
-		#address-cells = <1>;
-		#size-cells = <0>;
 		reg = <0>;
 
 		dsa,member = <0 0>;
@@ -135,8 +133,6 @@ linked into one DSA cluster.
 
 	switch1: switch1@0 {
 		compatible = "marvell,mv88e6085";
-		#address-cells = <1>;
-		#size-cells = <0>;
 		reg = <0>;
 
 		dsa,member = <0 1>;
@@ -204,8 +200,6 @@ linked into one DSA cluster.
 
 	switch2: switch2@0 {
 		compatible = "marvell,mv88e6085";
-		#address-cells = <1>;
-		#size-cells = <0>;
 		reg = <0>;
 
 		dsa,member = <0 2>;
-- 
cgit v1.2.3-59-g8ed1b


From dfec0ee22c0a4488e4f4c764b2720d3096920383 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Mon, 7 May 2018 11:08:22 -0700
Subject: udp: Record gso_segs when supporting UDP segmentation offload

We need to record the number of segments that will be generated when this
frame is segmented. The expectation is that if gso_size is set then
gso_segs is set as well. Without this some drivers such as ixgbe get
confused if they attempt to offload this as they record 0 segments for the
entire packet instead of the correct value.

Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index dd3102a37ef9..e07db83b311e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -793,6 +793,8 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
 
 		skb_shinfo(skb)->gso_size = cork->gso_size;
 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(len - sizeof(uh),
+							 cork->gso_size);
 		goto csum_partial;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From b21c034b3df833b5d9db1cfdc3938dbb0d7995c6 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Mon, 7 May 2018 11:08:28 -0700
Subject: udp: Do not pass MSS as parameter to GSO segmentation

There is no point in passing MSS as a parameter for for the GSO
segmentation call as it is already available via the shared info for the
skb itself.

Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/udp.h      | 2 +-
 net/ipv4/udp_offload.c | 6 ++++--
 net/ipv6/udp_offload.c | 2 +-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/net/udp.h b/include/net/udp.h
index 05990746810e..8bd83b044ecd 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -176,7 +176,7 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup);
 
 struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 				  netdev_features_t features,
-				  unsigned int mss, __sum16 check);
+				  __sum16 check);
 
 static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb)
 {
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 006257092f06..c1afcd2f1a76 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -189,14 +189,16 @@ EXPORT_SYMBOL(skb_udp_tunnel_segment);
 
 struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 				  netdev_features_t features,
-				  unsigned int mss, __sum16 check)
+				  __sum16 check)
 {
 	struct sock *sk = gso_skb->sk;
 	unsigned int sum_truesize = 0;
 	struct sk_buff *segs, *seg;
 	unsigned int hdrlen;
 	struct udphdr *uh;
+	unsigned int mss;
 
+	mss = skb_shinfo(gso_skb)->gso_size;
 	if (gso_skb->len <= sizeof(*uh) + mss)
 		return ERR_PTR(-EINVAL);
 
@@ -244,7 +246,7 @@ static struct sk_buff *__udp4_gso_segment(struct sk_buff *gso_skb,
 	if (!can_checksum_protocol(features, htons(ETH_P_IP)))
 		return ERR_PTR(-EIO);
 
-	return __udp_gso_segment(gso_skb, features, mss,
+	return __udp_gso_segment(gso_skb, features,
 				 udp_v4_check(sizeof(struct udphdr) + mss,
 					      iph->saddr, iph->daddr, 0));
 }
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index f7b85b1e6b3e..dea03ec09715 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -26,7 +26,7 @@ static struct sk_buff *__udp6_gso_segment(struct sk_buff *gso_skb,
 	if (!can_checksum_protocol(features, htons(ETH_P_IPV6)))
 		return ERR_PTR(-EIO);
 
-	return __udp_gso_segment(gso_skb, features, mss,
+	return __udp_gso_segment(gso_skb, features,
 				 udp_v6_check(sizeof(struct udphdr) + mss,
 					      &ip6h->saddr, &ip6h->daddr, 0));
 }
-- 
cgit v1.2.3-59-g8ed1b


From 9a0d41b3598ff62ecb26661bbfb1d523586cdea3 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Mon, 7 May 2018 11:08:34 -0700
Subject: udp: Do not pass checksum as a parameter to GSO segmentation

This patch is meant to allow us to avoid having to recompute the checksum
from scratch and have it passed as a parameter.

Instead of taking that approach we can take advantage of the fact that the
length that was used to compute the existing checksum is included in the
UDP header.

Finally to avoid the need to invert the result we can just call csum16_add
and csum16_sub directly. By doing this we can avoid a number of
instructions in the loop that is handling segmentation.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/udp.h      |  3 +--
 net/ipv4/udp_offload.c | 32 ++++++++++++++++++--------------
 net/ipv6/udp_offload.c |  7 +------
 3 files changed, 20 insertions(+), 22 deletions(-)

diff --git a/include/net/udp.h b/include/net/udp.h
index 8bd83b044ecd..9289b6425032 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -175,8 +175,7 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
 int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup);
 
 struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
-				  netdev_features_t features,
-				  __sum16 check);
+				  netdev_features_t features);
 
 static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb)
 {
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index c1afcd2f1a76..92c182e99ddc 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -188,8 +188,7 @@ out_unlock:
 EXPORT_SYMBOL(skb_udp_tunnel_segment);
 
 struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
-				  netdev_features_t features,
-				  __sum16 check)
+				  netdev_features_t features)
 {
 	struct sock *sk = gso_skb->sk;
 	unsigned int sum_truesize = 0;
@@ -197,6 +196,8 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 	unsigned int hdrlen;
 	struct udphdr *uh;
 	unsigned int mss;
+	__sum16 check;
+	__be16 newlen;
 
 	mss = skb_shinfo(gso_skb)->gso_size;
 	if (gso_skb->len <= sizeof(*uh) + mss)
@@ -215,17 +216,25 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 		return segs;
 	}
 
+	uh = udp_hdr(segs);
+
+	/* compute checksum adjustment based on old length versus new */
+	newlen = htons(sizeof(*uh) + mss);
+	check = csum16_add(csum16_sub(uh->check, uh->len), newlen);
+
 	for (seg = segs; seg; seg = seg->next) {
 		uh = udp_hdr(seg);
-		uh->len = htons(seg->len - hdrlen);
-		uh->check = check;
 
 		/* last packet can be partial gso_size */
-		if (!seg->next)
-			csum_replace2(&uh->check, htons(mss),
-				      htons(seg->len - hdrlen - sizeof(*uh)));
+		if (!seg->next) {
+			newlen = htons(seg->len - hdrlen);
+			check = csum16_add(csum16_sub(uh->check, uh->len),
+					   newlen);
+		}
+
+		uh->len = newlen;
+		uh->check = check;
 
-		uh->check = ~uh->check;
 		seg->destructor = sock_wfree;
 		seg->sk = sk;
 		sum_truesize += seg->truesize;
@@ -240,15 +249,10 @@ EXPORT_SYMBOL_GPL(__udp_gso_segment);
 static struct sk_buff *__udp4_gso_segment(struct sk_buff *gso_skb,
 					  netdev_features_t features)
 {
-	const struct iphdr *iph = ip_hdr(gso_skb);
-	unsigned int mss = skb_shinfo(gso_skb)->gso_size;
-
 	if (!can_checksum_protocol(features, htons(ETH_P_IP)))
 		return ERR_PTR(-EIO);
 
-	return __udp_gso_segment(gso_skb, features,
-				 udp_v4_check(sizeof(struct udphdr) + mss,
-					      iph->saddr, iph->daddr, 0));
+	return __udp_gso_segment(gso_skb, features);
 }
 
 static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index dea03ec09715..61e34f1d2fa2 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -20,15 +20,10 @@
 static struct sk_buff *__udp6_gso_segment(struct sk_buff *gso_skb,
 					  netdev_features_t features)
 {
-	const struct ipv6hdr *ip6h = ipv6_hdr(gso_skb);
-	unsigned int mss = skb_shinfo(gso_skb)->gso_size;
-
 	if (!can_checksum_protocol(features, htons(ETH_P_IPV6)))
 		return ERR_PTR(-EIO);
 
-	return __udp_gso_segment(gso_skb, features,
-				 udp_v6_check(sizeof(struct udphdr) + mss,
-					      &ip6h->saddr, &ip6h->daddr, 0));
+	return __udp_gso_segment(gso_skb, features);
 }
 
 static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
-- 
cgit v1.2.3-59-g8ed1b


From 0ad6509571e06b302d519f2f05e616ac8c1a10d7 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Mon, 7 May 2018 11:08:40 -0700
Subject: udp: Partially unroll handling of first segment and last segment

This patch allows us to take care of unrolling the first segment and the
last segment of the loop for processing the segmented skb. Part of the
motivation for this is that it makes it easier to process the fact that the
first fame and all of the frames in between should be mostly identical
in terms of header data, and the last frame has differences in the length
and partial checksum.

In addition I am dropping the header length calculation since we don't
really need it for anything but the last frame and it can be easily
obtained by just pulling the data_len and offset of tail from the transport
header.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp_offload.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 92c182e99ddc..b15c78ac3f23 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -193,7 +193,6 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 	struct sock *sk = gso_skb->sk;
 	unsigned int sum_truesize = 0;
 	struct sk_buff *segs, *seg;
-	unsigned int hdrlen;
 	struct udphdr *uh;
 	unsigned int mss;
 	__sum16 check;
@@ -203,7 +202,6 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 	if (gso_skb->len <= sizeof(*uh) + mss)
 		return ERR_PTR(-EINVAL);
 
-	hdrlen = gso_skb->data - skb_mac_header(gso_skb);
 	skb_pull(gso_skb, sizeof(*uh));
 
 	/* clear destructor to avoid skb_segment assigning it to tail */
@@ -216,30 +214,37 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 		return segs;
 	}
 
-	uh = udp_hdr(segs);
+	seg = segs;
+	uh = udp_hdr(seg);
 
 	/* compute checksum adjustment based on old length versus new */
 	newlen = htons(sizeof(*uh) + mss);
 	check = csum16_add(csum16_sub(uh->check, uh->len), newlen);
 
-	for (seg = segs; seg; seg = seg->next) {
-		uh = udp_hdr(seg);
+	for (;;) {
+		seg->destructor = sock_wfree;
+		seg->sk = sk;
+		sum_truesize += seg->truesize;
 
-		/* last packet can be partial gso_size */
-		if (!seg->next) {
-			newlen = htons(seg->len - hdrlen);
-			check = csum16_add(csum16_sub(uh->check, uh->len),
-					   newlen);
-		}
+		if (!seg->next)
+			break;
 
 		uh->len = newlen;
 		uh->check = check;
 
-		seg->destructor = sock_wfree;
-		seg->sk = sk;
-		sum_truesize += seg->truesize;
+		seg = seg->next;
+		uh = udp_hdr(seg);
 	}
 
+	/* last packet can be partial gso_size, account for that in checksum */
+	newlen = htons(skb_tail_pointer(seg) - skb_transport_header(seg) +
+		       seg->data_len);
+	check = csum16_add(csum16_sub(uh->check, uh->len), newlen);
+
+	uh->len = newlen;
+	uh->check = check;
+
+	/* update refcount for the packet */
 	refcount_add(sum_truesize - gso_skb->truesize, &sk->sk_wmem_alloc);
 
 	return segs;
-- 
cgit v1.2.3-59-g8ed1b


From 6053d0f189064302420930f9ef9022e24a04946a Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Mon, 7 May 2018 11:08:46 -0700
Subject: udp: Add support for software checksum and GSO_PARTIAL with GSO
 offload

This patch adds support for a software provided checksum and GSO_PARTIAL
segmentation support. With this we can offload UDP segmentation on devices
that only have partial support for tunnels.

Since we are no longer needing the hardware checksum we can drop the checks
in the segmentation code that were verifying if it was present.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp_offload.c | 29 +++++++++++++++++++----------
 net/ipv6/udp_offload.c | 11 +----------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index b15c78ac3f23..d4f2daca0c33 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -214,6 +214,13 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 		return segs;
 	}
 
+	/* GSO partial and frag_list segmentation only requires splitting
+	 * the frame into an MSS multiple and possibly a remainder, both
+	 * cases return a GSO skb. So update the mss now.
+	 */
+	if (skb_is_gso(segs))
+		mss *= skb_shinfo(segs)->gso_segs;
+
 	seg = segs;
 	uh = udp_hdr(seg);
 
@@ -232,6 +239,12 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 		uh->len = newlen;
 		uh->check = check;
 
+		if (seg->ip_summed == CHECKSUM_PARTIAL)
+			gso_reset_checksum(seg, ~check);
+		else
+			uh->check = gso_make_checksum(seg, ~check) ? :
+				    CSUM_MANGLED_0;
+
 		seg = seg->next;
 		uh = udp_hdr(seg);
 	}
@@ -244,6 +257,11 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 	uh->len = newlen;
 	uh->check = check;
 
+	if (seg->ip_summed == CHECKSUM_PARTIAL)
+		gso_reset_checksum(seg, ~check);
+	else
+		uh->check = gso_make_checksum(seg, ~check) ? : CSUM_MANGLED_0;
+
 	/* update refcount for the packet */
 	refcount_add(sum_truesize - gso_skb->truesize, &sk->sk_wmem_alloc);
 
@@ -251,15 +269,6 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 }
 EXPORT_SYMBOL_GPL(__udp_gso_segment);
 
-static struct sk_buff *__udp4_gso_segment(struct sk_buff *gso_skb,
-					  netdev_features_t features)
-{
-	if (!can_checksum_protocol(features, htons(ETH_P_IP)))
-		return ERR_PTR(-EIO);
-
-	return __udp_gso_segment(gso_skb, features);
-}
-
 static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
 					 netdev_features_t features)
 {
@@ -283,7 +292,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
 		goto out;
 
 	if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
-		return __udp4_gso_segment(skb, features);
+		return __udp_gso_segment(skb, features);
 
 	mss = skb_shinfo(skb)->gso_size;
 	if (unlikely(skb->len <= mss))
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 61e34f1d2fa2..03a2ff3fe1e6 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -17,15 +17,6 @@
 #include <net/ip6_checksum.h>
 #include "ip6_offload.h"
 
-static struct sk_buff *__udp6_gso_segment(struct sk_buff *gso_skb,
-					  netdev_features_t features)
-{
-	if (!can_checksum_protocol(features, htons(ETH_P_IPV6)))
-		return ERR_PTR(-EIO);
-
-	return __udp_gso_segment(gso_skb, features);
-}
-
 static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 					 netdev_features_t features)
 {
@@ -58,7 +49,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 			goto out;
 
 		if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
-			return __udp6_gso_segment(skb, features);
+			return __udp_gso_segment(skb, features);
 
 		/* Do software UFO. Complete and fill in the UDP checksum as HW cannot
 		 * do checksum of UDP packets sent as multiple IP fragments.
-- 
cgit v1.2.3-59-g8ed1b


From 04d55b257cbb9eb6c722410251d3b7df00835c6c Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Mon, 7 May 2018 11:08:52 -0700
Subject: udp: Do not copy destructor if one is not present

This patch makes it so that if a destructor is not present we avoid trying
to update the skb socket or any reference counting that would be associated
with the NULL socket and/or descriptor. By doing this we can support
traffic coming from another namespace without any issues.

Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp_offload.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index d4f2daca0c33..ede2a7305b90 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -195,6 +195,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 	struct sk_buff *segs, *seg;
 	struct udphdr *uh;
 	unsigned int mss;
+	bool copy_dtor;
 	__sum16 check;
 	__be16 newlen;
 
@@ -205,12 +206,14 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 	skb_pull(gso_skb, sizeof(*uh));
 
 	/* clear destructor to avoid skb_segment assigning it to tail */
-	WARN_ON_ONCE(gso_skb->destructor != sock_wfree);
-	gso_skb->destructor = NULL;
+	copy_dtor = gso_skb->destructor == sock_wfree;
+	if (copy_dtor)
+		gso_skb->destructor = NULL;
 
 	segs = skb_segment(gso_skb, features);
 	if (unlikely(IS_ERR_OR_NULL(segs))) {
-		gso_skb->destructor = sock_wfree;
+		if (copy_dtor)
+			gso_skb->destructor = sock_wfree;
 		return segs;
 	}
 
@@ -229,9 +232,11 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 	check = csum16_add(csum16_sub(uh->check, uh->len), newlen);
 
 	for (;;) {
-		seg->destructor = sock_wfree;
-		seg->sk = sk;
-		sum_truesize += seg->truesize;
+		if (copy_dtor) {
+			seg->destructor = sock_wfree;
+			seg->sk = sk;
+			sum_truesize += seg->truesize;
+		}
 
 		if (!seg->next)
 			break;
@@ -263,8 +268,9 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 		uh->check = gso_make_checksum(seg, ~check) ? : CSUM_MANGLED_0;
 
 	/* update refcount for the packet */
-	refcount_add(sum_truesize - gso_skb->truesize, &sk->sk_wmem_alloc);
-
+	if (copy_dtor)
+		refcount_add(sum_truesize - gso_skb->truesize,
+			     &sk->sk_wmem_alloc);
 	return segs;
 }
 EXPORT_SYMBOL_GPL(__udp_gso_segment);
-- 
cgit v1.2.3-59-g8ed1b


From bf516e7d8b1c1ff7706d56961306fa7e19f352b6 Mon Sep 17 00:00:00 2001
From: Ping-Ke Shih <pkshih@realtek.com>
Date: Fri, 4 May 2018 10:42:04 +0800
Subject: rtlwifi: remove duplicate definition of antenna number for btcoex

Two enumerations bt_total_ant_num and bt_ant_num are identical, so one
can be removed.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/realtek/rtlwifi/rtl8192ee/hw.c | 4 ++--
 drivers/net/wireless/realtek/rtlwifi/wifi.h         | 5 -----
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/hw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/hw.c
index fd7928fdbd1a..3f2295fdb02d 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/hw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/hw.c
@@ -2574,11 +2574,11 @@ void rtl92ee_read_bt_coexist_info_from_hwpg(struct ieee80211_hw *hw,
 			rtlpriv->btcoexist.btc_info.btcoexist = 0;
 
 		rtlpriv->btcoexist.btc_info.bt_type = BT_RTL8192E;
-		rtlpriv->btcoexist.btc_info.ant_num = ANT_TOTAL_X2;
+		rtlpriv->btcoexist.btc_info.ant_num = ANT_X2;
 	} else {
 		rtlpriv->btcoexist.btc_info.btcoexist = 1;
 		rtlpriv->btcoexist.btc_info.bt_type = BT_RTL8192E;
-		rtlpriv->btcoexist.btc_info.ant_num = ANT_TOTAL_X1;
+		rtlpriv->btcoexist.btc_info.ant_num = ANT_X1;
 	}
 }
 
diff --git a/drivers/net/wireless/realtek/rtlwifi/wifi.h b/drivers/net/wireless/realtek/rtlwifi/wifi.h
index ce1754054a07..208010fcde21 100644
--- a/drivers/net/wireless/realtek/rtlwifi/wifi.h
+++ b/drivers/net/wireless/realtek/rtlwifi/wifi.h
@@ -2842,11 +2842,6 @@ enum bt_co_type {
 	BT_RTL8812A = 11,
 };
 
-enum bt_total_ant_num {
-	ANT_TOTAL_X2 = 0,
-	ANT_TOTAL_X1 = 1
-};
-
 enum bt_cur_state {
 	BT_OFF = 0,
 	BT_ON = 1,
-- 
cgit v1.2.3-59-g8ed1b


From 9c4a121e82634aa000a702c98cd6f05b27d6e186 Mon Sep 17 00:00:00 2001
From: Sean Lanigan <sean@lano.id.au>
Date: Fri, 4 May 2018 16:48:23 +1000
Subject: brcmfmac: Add support for bcm43364 wireless chipset

Add support for the BCM43364 chipset via an SDIO interface, as used in
e.g. the Murata 1FX module.

The BCM43364 uses the same firmware as the BCM43430 (which is already
included), the only difference is the omission of Bluetooth.

However, the SDIO_ID for the BCM43364 is 02D0:A9A4, giving it a MODALIAS
of sdio:c00v02D0dA9A4, which doesn't get recognised and hence doesn't
load the brcmfmac module. Adding the 'A9A4' ID in the appropriate place
triggers the brcmfmac driver to load, and then correctly use the
firmware file 'brcmfmac43430-sdio.bin'.

Signed-off-by: Sean Lanigan <sean@lano.id.au>
Acked-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c | 1 +
 include/linux/mmc/sdio_ids.h                              | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
index 0b68240ec7b4..a1915411c280 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
@@ -963,6 +963,7 @@ static const struct sdio_device_id brcmf_sdmmc_ids[] = {
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43340),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43341),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43362),
+ 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43364),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4335_4339),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4339),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43430),
diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h
index cdd66a5fbd5e..0a7abe8a407f 100644
--- a/include/linux/mmc/sdio_ids.h
+++ b/include/linux/mmc/sdio_ids.h
@@ -35,6 +35,7 @@
 #define SDIO_DEVICE_ID_BROADCOM_4335_4339	0x4335
 #define SDIO_DEVICE_ID_BROADCOM_4339		0x4339
 #define SDIO_DEVICE_ID_BROADCOM_43362		0xa962
+#define SDIO_DEVICE_ID_BROADCOM_43364		0xa9a4
 #define SDIO_DEVICE_ID_BROADCOM_43430		0xa9a6
 #define SDIO_DEVICE_ID_BROADCOM_4345		0x4345
 #define SDIO_DEVICE_ID_BROADCOM_43455		0xa9bf
-- 
cgit v1.2.3-59-g8ed1b


From 4793e5a954faf2fa5e88726332bf851fbebcbf1c Mon Sep 17 00:00:00 2001
From: Julia Lawall <Julia.Lawall@lip6.fr>
Date: Sun, 6 May 2018 09:53:53 +0200
Subject: mwifiex: delete unneeded include

Nothing that is defined in 11ac.h is referenced in cmdevt.c.

Signed-off-by: Julia Lawall <Julia.Lawall@lip6.fr>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/marvell/mwifiex/cmdevt.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/wireless/marvell/mwifiex/cmdevt.c b/drivers/net/wireless/marvell/mwifiex/cmdevt.c
index 7014f440e6f8..9cfcdf6bec52 100644
--- a/drivers/net/wireless/marvell/mwifiex/cmdevt.c
+++ b/drivers/net/wireless/marvell/mwifiex/cmdevt.c
@@ -25,7 +25,6 @@
 #include "main.h"
 #include "wmm.h"
 #include "11n.h"
-#include "11ac.h"
 
 static void mwifiex_cancel_pending_ioctl(struct mwifiex_adapter *adapter);
 
-- 
cgit v1.2.3-59-g8ed1b


From 4f9fb990013c03c5d56266fb4c439816cd84e1a6 Mon Sep 17 00:00:00 2001
From: Ganapathi Bhat <gbhat@marvell.com>
Date: Tue, 8 May 2018 00:10:58 +0530
Subject: mwifiex: increase TX threashold to avoid TX timeout during ED MAC
 test

While carrying energy detection(ED) tests, the chip will stop
transmission upon detecting an energy in the connected channel.
As a feedback, driver will stop dequeuing TX packets and due to
which wmm_tx_pending keep incremeting. Once wmm_tx_pending
reaches 100, driver calls netif_tx_stop_queue(). If TX ques is
not restarted within 5(watchdog_timeo) seconds, it will result in
TX timeout.

The ED test is carried out for one minute and the current
threshold of 100 is easily overcome by the traffic, cuasing TX
timeouts. To fix this increase the threshold to 400.

Signed-off-by: Cathy Luo <cluo@marvell.com>
Signed-off-by: Ganapathi Bhat <gbhat@marvell.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/marvell/mwifiex/main.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/marvell/mwifiex/main.h b/drivers/net/wireless/marvell/mwifiex/main.h
index 7c95c1279548..8ae74ed78805 100644
--- a/drivers/net/wireless/marvell/mwifiex/main.h
+++ b/drivers/net/wireless/marvell/mwifiex/main.h
@@ -84,8 +84,8 @@ enum {
 #define MWIFIEX_TIMER_10S			10000
 #define MWIFIEX_TIMER_1S			1000
 
-#define MAX_TX_PENDING      100
-#define LOW_TX_PENDING      80
+#define MAX_TX_PENDING      400
+#define LOW_TX_PENDING      380
 
 #define HIGH_RX_PENDING     50
 #define LOW_RX_PENDING      20
-- 
cgit v1.2.3-59-g8ed1b


From 3c6a67dd6697e25bfec0b85a008ec4a3c482d993 Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Thu, 12 Apr 2018 11:15:54 -0700
Subject: fm10k: setup VLANs for l2 accelerated macvlan interfaces

We have support for accelerating macvlan devices via the
.ndo_dfwd_add_station() netdev op. These accelerated macvlan MAC
addresses are stored in the l2_accel structure, separate from the
unicast or multicast address lists.

If a VLAN is added on top of the macvlan device by the stack, traffic
will not properly flow to the macvlan. This occurs because we fail to
setup the VLANs for l2_accel MAC addresses.

In the non-offloaded case the MAC address is added to the unicast
address list, and thus the normal setup for enabling VLANs works as
expected.

We also need to add VLANs marked from .ndo_vlan_rx_add_vid() into the
l2_accel MAC addresses. Otherwise, VLAN traffic will not properly be
received by the VLAN devices attached to the offloaded macvlan devices.

Fix this by adding necessary logic to setup VLANs not only for the
unicast and multicast addresses, but also the l2_accel list. We need
similar logic in dfwd_add_station, dfwd_del_station, fm10k_update_vid,
and fm10k_restore_rx_state.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Krishneil Singh <krishneil.k.singh@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k_netdev.c | 50 ++++++++++++++++++++++++-
 1 file changed, 48 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
index c879af72bbf5..0dc9f2dbc1ad 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
@@ -907,7 +907,9 @@ static int fm10k_mc_vlan_unsync(struct net_device *netdev,
 static int fm10k_update_vid(struct net_device *netdev, u16 vid, bool set)
 {
 	struct fm10k_intfc *interface = netdev_priv(netdev);
+	struct fm10k_l2_accel *l2_accel = interface->l2_accel;
 	struct fm10k_hw *hw = &interface->hw;
+	u16 glort;
 	s32 err;
 	int i;
 
@@ -975,6 +977,22 @@ static int fm10k_update_vid(struct net_device *netdev, u16 vid, bool set)
 	if (err)
 		goto err_out;
 
+	/* Update L2 accelerated macvlan addresses */
+	if (l2_accel) {
+		for (i = 0; i < l2_accel->size; i++) {
+			struct net_device *sdev = l2_accel->macvlan[i];
+
+			if (!sdev)
+				continue;
+
+			glort = l2_accel->dglort + 1 + i;
+
+			fm10k_queue_mac_request(interface, glort,
+						sdev->dev_addr,
+						vid, set);
+		}
+	}
+
 	/* set VLAN ID prior to syncing/unsyncing the VLAN */
 	interface->vid = vid + (set ? VLAN_N_VID : 0);
 
@@ -1214,6 +1232,22 @@ void fm10k_restore_rx_state(struct fm10k_intfc *interface)
 
 		fm10k_queue_mac_request(interface, glort,
 					hw->mac.addr, vid, true);
+
+		/* synchronize macvlan addresses */
+		if (l2_accel) {
+			for (i = 0; i < l2_accel->size; i++) {
+				struct net_device *sdev = l2_accel->macvlan[i];
+
+				if (!sdev)
+					continue;
+
+				glort = l2_accel->dglort + 1 + i;
+
+				fm10k_queue_mac_request(interface, glort,
+							sdev->dev_addr,
+							vid, true);
+			}
+		}
 	}
 
 	/* update xcast mode before synchronizing addresses if host's mailbox
@@ -1430,7 +1464,7 @@ static void *fm10k_dfwd_add_station(struct net_device *dev,
 	struct fm10k_dglort_cfg dglort = { 0 };
 	struct fm10k_hw *hw = &interface->hw;
 	int size = 0, i;
-	u16 glort;
+	u16 vid, glort;
 
 	/* The hardware supported by fm10k only filters on the destination MAC
 	 * address. In order to avoid issues we only support offloading modes
@@ -1510,6 +1544,12 @@ static void *fm10k_dfwd_add_station(struct net_device *dev,
 					hw->mac.default_vid, true);
 	}
 
+	for (vid = fm10k_find_next_vlan(interface, 0);
+	     vid < VLAN_N_VID;
+	     vid = fm10k_find_next_vlan(interface, vid))
+		fm10k_queue_mac_request(interface, glort, sdev->dev_addr,
+					vid, true);
+
 	fm10k_mbx_unlock(interface);
 
 	return sdev;
@@ -1522,8 +1562,8 @@ static void fm10k_dfwd_del_station(struct net_device *dev, void *priv)
 	struct fm10k_dglort_cfg dglort = { 0 };
 	struct fm10k_hw *hw = &interface->hw;
 	struct net_device *sdev = priv;
+	u16 vid, glort;
 	int i;
-	u16 glort;
 
 	if (!l2_accel)
 		return;
@@ -1550,6 +1590,12 @@ static void fm10k_dfwd_del_station(struct net_device *dev, void *priv)
 					hw->mac.default_vid, false);
 	}
 
+	for (vid = fm10k_find_next_vlan(interface, 0);
+	     vid < VLAN_N_VID;
+	     vid = fm10k_find_next_vlan(interface, vid))
+		fm10k_queue_mac_request(interface, glort, sdev->dev_addr,
+					vid, false);
+
 	fm10k_mbx_unlock(interface);
 
 	/* record removal */
-- 
cgit v1.2.3-59-g8ed1b


From 82e9697250977f3f87cd42e71e8daa8810e64520 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 4 May 2018 14:49:50 -0700
Subject: bpf: btf: Avoid WARN_ON when CONFIG_REFCOUNT_FULL=y

If CONFIG_REFCOUNT_FULL=y, refcount_inc() WARN when refcount is 0.
When creating a new btf, the initial btf->refcnt is 0 and
triggered the following:

[   34.855452] refcount_t: increment on 0; use-after-free.
[   34.856252] WARNING: CPU: 6 PID: 1857 at lib/refcount.c:153 refcount_inc+0x26/0x30
....
[   34.868809] Call Trace:
[   34.869168]  btf_new_fd+0x1af6/0x24d0
[   34.869645]  ? btf_type_seq_show+0x200/0x200
[   34.870212]  ? lock_acquire+0x3b0/0x3b0
[   34.870726]  ? security_capable+0x54/0x90
[   34.871247]  __x64_sys_bpf+0x1b2/0x310
[   34.871761]  ? __ia32_sys_bpf+0x310/0x310
[   34.872285]  ? bad_area_access_error+0x310/0x310
[   34.872894]  do_syscall_64+0x95/0x3f0

This patch uses refcount_set() instead.

Reported-by: Yonghong Song <yhs@fb.com>
Tested-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/btf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 22e1046a1a86..fa0dce0452e7 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1977,7 +1977,7 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
 
 	if (!err) {
 		btf_verifier_env_free(env);
-		btf_get(btf);
+		refcount_set(&btf->refcnt, 1);
 		return btf;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 78958fca7ead2f81b60a6827881c4866d1ed0c52 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 4 May 2018 14:49:51 -0700
Subject: bpf: btf: Introduce BTF ID

This patch gives an ID to each loaded BTF.  The ID is allocated by
the idr like the existing prog-id and map-id.

The bpf_put(map->btf) is moved to __bpf_map_put() so that the
userspace can stop seeing the BTF ID ASAP when the last BTF
refcnt is gone.

It also makes BTF accessible from userspace through the
1. new BPF_BTF_GET_FD_BY_ID command.  It is limited to CAP_SYS_ADMIN
   which is inline with the BPF_BTF_LOAD cmd and the existing
   BPF_[MAP|PROG]_GET_FD_BY_ID cmd.
2. new btf_id (and btf_key_id + btf_value_id) in "struct bpf_map_info"

Once the BTF ID handler is accessible from userspace, freeing a BTF
object has to go through a rcu period.  The BPF_BTF_GET_FD_BY_ID cmd
can then be done under a rcu_read_lock() instead of taking
spin_lock.
[Note: A similar rcu usage can be done to the existing
       bpf_prog_get_fd_by_id() in a follow up patch]

When processing the BPF_BTF_GET_FD_BY_ID cmd,
refcount_inc_not_zero() is needed because the BTF object
could be already in the rcu dead row .  btf_get() is
removed since its usage is currently limited to btf.c
alone.  refcount_inc() is used directly instead.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/btf.h      |   2 +
 include/uapi/linux/bpf.h |   5 +++
 kernel/bpf/btf.c         | 108 ++++++++++++++++++++++++++++++++++++++++++-----
 kernel/bpf/syscall.c     |  24 ++++++++++-
 4 files changed, 128 insertions(+), 11 deletions(-)

diff --git a/include/linux/btf.h b/include/linux/btf.h
index a966dc6d61ee..e076c4697049 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -44,5 +44,7 @@ const struct btf_type *btf_type_id_size(const struct btf *btf,
 					u32 *ret_size);
 void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
 		       struct seq_file *m);
+int btf_get_fd_by_id(u32 id);
+u32 btf_id(const struct btf *btf);
 
 #endif
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 93d5a4eeec2a..6106f23a9a8a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -96,6 +96,7 @@ enum bpf_cmd {
 	BPF_PROG_QUERY,
 	BPF_RAW_TRACEPOINT_OPEN,
 	BPF_BTF_LOAD,
+	BPF_BTF_GET_FD_BY_ID,
 };
 
 enum bpf_map_type {
@@ -344,6 +345,7 @@ union bpf_attr {
 			__u32		start_id;
 			__u32		prog_id;
 			__u32		map_id;
+			__u32		btf_id;
 		};
 		__u32		next_id;
 		__u32		open_flags;
@@ -2130,6 +2132,9 @@ struct bpf_map_info {
 	__u32 ifindex;
 	__u64 netns_dev;
 	__u64 netns_ino;
+	__u32 btf_id;
+	__u32 btf_key_id;
+	__u32 btf_value_id;
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index fa0dce0452e7..40950b6bf395 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -11,6 +11,7 @@
 #include <linux/file.h>
 #include <linux/uaccess.h>
 #include <linux/kernel.h>
+#include <linux/idr.h>
 #include <linux/bpf_verifier.h>
 #include <linux/btf.h>
 
@@ -179,6 +180,9 @@
 	     i < btf_type_vlen(struct_type);				\
 	     i++, member++)
 
+static DEFINE_IDR(btf_idr);
+static DEFINE_SPINLOCK(btf_idr_lock);
+
 struct btf {
 	union {
 		struct btf_header *hdr;
@@ -193,6 +197,8 @@ struct btf {
 	u32 types_size;
 	u32 data_size;
 	refcount_t refcnt;
+	u32 id;
+	struct rcu_head rcu;
 };
 
 enum verifier_phase {
@@ -598,6 +604,42 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
 	return 0;
 }
 
+static int btf_alloc_id(struct btf *btf)
+{
+	int id;
+
+	idr_preload(GFP_KERNEL);
+	spin_lock_bh(&btf_idr_lock);
+	id = idr_alloc_cyclic(&btf_idr, btf, 1, INT_MAX, GFP_ATOMIC);
+	if (id > 0)
+		btf->id = id;
+	spin_unlock_bh(&btf_idr_lock);
+	idr_preload_end();
+
+	if (WARN_ON_ONCE(!id))
+		return -ENOSPC;
+
+	return id > 0 ? 0 : id;
+}
+
+static void btf_free_id(struct btf *btf)
+{
+	unsigned long flags;
+
+	/*
+	 * In map-in-map, calling map_delete_elem() on outer
+	 * map will call bpf_map_put on the inner map.
+	 * It will then eventually call btf_free_id()
+	 * on the inner map.  Some of the map_delete_elem()
+	 * implementation may have irq disabled, so
+	 * we need to use the _irqsave() version instead
+	 * of the _bh() version.
+	 */
+	spin_lock_irqsave(&btf_idr_lock, flags);
+	idr_remove(&btf_idr, btf->id);
+	spin_unlock_irqrestore(&btf_idr_lock, flags);
+}
+
 static void btf_free(struct btf *btf)
 {
 	kvfree(btf->types);
@@ -607,15 +649,19 @@ static void btf_free(struct btf *btf)
 	kfree(btf);
 }
 
-static void btf_get(struct btf *btf)
+static void btf_free_rcu(struct rcu_head *rcu)
 {
-	refcount_inc(&btf->refcnt);
+	struct btf *btf = container_of(rcu, struct btf, rcu);
+
+	btf_free(btf);
 }
 
 void btf_put(struct btf *btf)
 {
-	if (btf && refcount_dec_and_test(&btf->refcnt))
-		btf_free(btf);
+	if (btf && refcount_dec_and_test(&btf->refcnt)) {
+		btf_free_id(btf);
+		call_rcu(&btf->rcu, btf_free_rcu);
+	}
 }
 
 static int env_resolve_init(struct btf_verifier_env *env)
@@ -2006,10 +2052,15 @@ const struct file_operations btf_fops = {
 	.release	= btf_release,
 };
 
+static int __btf_new_fd(struct btf *btf)
+{
+	return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC);
+}
+
 int btf_new_fd(const union bpf_attr *attr)
 {
 	struct btf *btf;
-	int fd;
+	int ret;
 
 	btf = btf_parse(u64_to_user_ptr(attr->btf),
 			attr->btf_size, attr->btf_log_level,
@@ -2018,12 +2069,23 @@ int btf_new_fd(const union bpf_attr *attr)
 	if (IS_ERR(btf))
 		return PTR_ERR(btf);
 
-	fd = anon_inode_getfd("btf", &btf_fops, btf,
-			      O_RDONLY | O_CLOEXEC);
-	if (fd < 0)
+	ret = btf_alloc_id(btf);
+	if (ret) {
+		btf_free(btf);
+		return ret;
+	}
+
+	/*
+	 * The BTF ID is published to the userspace.
+	 * All BTF free must go through call_rcu() from
+	 * now on (i.e. free by calling btf_put()).
+	 */
+
+	ret = __btf_new_fd(btf);
+	if (ret < 0)
 		btf_put(btf);
 
-	return fd;
+	return ret;
 }
 
 struct btf *btf_get_by_fd(int fd)
@@ -2042,7 +2104,7 @@ struct btf *btf_get_by_fd(int fd)
 	}
 
 	btf = f.file->private_data;
-	btf_get(btf);
+	refcount_inc(&btf->refcnt);
 	fdput(f);
 
 	return btf;
@@ -2062,3 +2124,29 @@ int btf_get_info_by_fd(const struct btf *btf,
 
 	return 0;
 }
+
+int btf_get_fd_by_id(u32 id)
+{
+	struct btf *btf;
+	int fd;
+
+	rcu_read_lock();
+	btf = idr_find(&btf_idr, id);
+	if (!btf || !refcount_inc_not_zero(&btf->refcnt))
+		btf = ERR_PTR(-ENOENT);
+	rcu_read_unlock();
+
+	if (IS_ERR(btf))
+		return PTR_ERR(btf);
+
+	fd = __btf_new_fd(btf);
+	if (fd < 0)
+		btf_put(btf);
+
+	return fd;
+}
+
+u32 btf_id(const struct btf *btf)
+{
+	return btf->id;
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 9b87198deea2..31c4092da277 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -252,7 +252,6 @@ static void bpf_map_free_deferred(struct work_struct *work)
 
 	bpf_map_uncharge_memlock(map);
 	security_bpf_map_free(map);
-	btf_put(map->btf);
 	/* implementation dependent freeing */
 	map->ops->map_free(map);
 }
@@ -273,6 +272,7 @@ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
 	if (atomic_dec_and_test(&map->refcnt)) {
 		/* bpf_map_free_id() must be called first */
 		bpf_map_free_id(map, do_idr_lock);
+		btf_put(map->btf);
 		INIT_WORK(&map->work, bpf_map_free_deferred);
 		schedule_work(&map->work);
 	}
@@ -2002,6 +2002,12 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
 	info.map_flags = map->map_flags;
 	memcpy(info.name, map->name, sizeof(map->name));
 
+	if (map->btf) {
+		info.btf_id = btf_id(map->btf);
+		info.btf_key_id = map->btf_key_id;
+		info.btf_value_id = map->btf_value_id;
+	}
+
 	if (bpf_map_is_dev_bound(map)) {
 		err = bpf_map_offload_info_fill(&info, map);
 		if (err)
@@ -2059,6 +2065,19 @@ static int bpf_btf_load(const union bpf_attr *attr)
 	return btf_new_fd(attr);
 }
 
+#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
+
+static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
+{
+	if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return btf_get_fd_by_id(attr->btf_id);
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -2142,6 +2161,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_BTF_LOAD:
 		err = bpf_btf_load(&attr);
 		break;
+	case BPF_BTF_GET_FD_BY_ID:
+		err = bpf_btf_get_fd_by_id(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From 62dab84c81a487d946a5fc37c6df541dd95cca38 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 4 May 2018 14:49:52 -0700
Subject: bpf: btf: Add struct bpf_btf_info

During BPF_OBJ_GET_INFO_BY_FD on a btf_fd, the current bpf_attr's
info.info is directly filled with the BTF binary data.  It is
not extensible.  In this case, we want to add BTF ID.

This patch adds "struct bpf_btf_info" which has the BTF ID as
one of its member.  The BTF binary data itself is exposed through
the "btf" and "btf_size" members.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h |  6 ++++++
 kernel/bpf/btf.c         | 26 +++++++++++++++++++++-----
 kernel/bpf/syscall.c     | 17 ++++++++++++++++-
 3 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6106f23a9a8a..d615c777b573 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2137,6 +2137,12 @@ struct bpf_map_info {
 	__u32 btf_value_id;
 } __attribute__((aligned(8)));
 
+struct bpf_btf_info {
+	__aligned_u64 btf;
+	__u32 btf_size;
+	__u32 id;
+} __attribute__((aligned(8)));
+
 /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
  * by user and intended to be used by socket (e.g. to bind to, depends on
  * attach attach type).
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 40950b6bf395..ded10ab47b8a 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -2114,12 +2114,28 @@ int btf_get_info_by_fd(const struct btf *btf,
 		       const union bpf_attr *attr,
 		       union bpf_attr __user *uattr)
 {
-	void __user *udata = u64_to_user_ptr(attr->info.info);
-	u32 copy_len = min_t(u32, btf->data_size,
-			     attr->info.info_len);
+	struct bpf_btf_info __user *uinfo;
+	struct bpf_btf_info info = {};
+	u32 info_copy, btf_copy;
+	void __user *ubtf;
+	u32 uinfo_len;
 
-	if (copy_to_user(udata, btf->data, copy_len) ||
-	    put_user(btf->data_size, &uattr->info.info_len))
+	uinfo = u64_to_user_ptr(attr->info.info);
+	uinfo_len = attr->info.info_len;
+
+	info_copy = min_t(u32, uinfo_len, sizeof(info));
+	if (copy_from_user(&info, uinfo, info_copy))
+		return -EFAULT;
+
+	info.id = btf->id;
+	ubtf = u64_to_user_ptr(info.btf);
+	btf_copy = min_t(u32, btf->data_size, info.btf_size);
+	if (copy_to_user(ubtf, btf->data, btf_copy))
+		return -EFAULT;
+	info.btf_size = btf->data_size;
+
+	if (copy_to_user(uinfo, &info, info_copy) ||
+	    put_user(info_copy, &uattr->info.info_len))
 		return -EFAULT;
 
 	return 0;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 31c4092da277..e2aeb5e89f44 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2021,6 +2021,21 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
 	return 0;
 }
 
+static int bpf_btf_get_info_by_fd(struct btf *btf,
+				  const union bpf_attr *attr,
+				  union bpf_attr __user *uattr)
+{
+	struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+	u32 info_len = attr->info.info_len;
+	int err;
+
+	err = check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len);
+	if (err)
+		return err;
+
+	return btf_get_info_by_fd(btf, attr, uattr);
+}
+
 #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
 
 static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
@@ -2044,7 +2059,7 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
 		err = bpf_map_get_info_by_fd(f.file->private_data, attr,
 					     uattr);
 	else if (f.file->f_op == &btf_fops)
-		err = btf_get_info_by_fd(f.file->private_data, attr, uattr);
+		err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr);
 	else
 		err = -EINVAL;
 
-- 
cgit v1.2.3-59-g8ed1b


From e34d98d3b4e2e3edcaebb68b1fad564af68f2b73 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 4 May 2018 14:49:53 -0700
Subject: bpf: btf: Some test_btf clean up

This patch adds a CHECK() macro for condition checking
and error report purpose.  Something similar to test_progs.c

It also counts the number of tests passed/skipped/failed and
print them at the end of the test run.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_btf.c | 201 ++++++++++++++++-----------------
 1 file changed, 99 insertions(+), 102 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c
index 7b39b1f712a1..b7880a20fad1 100644
--- a/tools/testing/selftests/bpf/test_btf.c
+++ b/tools/testing/selftests/bpf/test_btf.c
@@ -20,6 +20,30 @@
 
 #include "bpf_rlimit.h"
 
+static uint32_t pass_cnt;
+static uint32_t error_cnt;
+static uint32_t skip_cnt;
+
+#define CHECK(condition, format...) ({					\
+	int __ret = !!(condition);					\
+	if (__ret) {							\
+		fprintf(stderr, "%s:%d:FAIL ", __func__, __LINE__);	\
+		fprintf(stderr, format);				\
+	}								\
+	__ret;								\
+})
+
+static int count_result(int err)
+{
+	if (err)
+		error_cnt++;
+	else
+		pass_cnt++;
+
+	fprintf(stderr, "\n");
+	return err;
+}
+
 #define min(a, b) ((a) < (b) ? (a) : (b))
 #define __printf(a, b)	__attribute__((format(printf, a, b)))
 
@@ -894,17 +918,13 @@ static void *btf_raw_create(const struct btf_header *hdr,
 	void *raw_btf;
 
 	type_sec_size = get_type_sec_size(raw_types);
-	if (type_sec_size < 0) {
-		fprintf(stderr, "Cannot get nr_raw_types\n");
+	if (CHECK(type_sec_size < 0, "Cannot get nr_raw_types"))
 		return NULL;
-	}
 
 	size_needed = sizeof(*hdr) + type_sec_size + str_sec_size;
 	raw_btf = malloc(size_needed);
-	if (!raw_btf) {
-		fprintf(stderr, "Cannot allocate memory for raw_btf\n");
+	if (CHECK(!raw_btf, "Cannot allocate memory for raw_btf"))
 		return NULL;
-	}
 
 	/* Copy header */
 	memcpy(raw_btf, hdr, sizeof(*hdr));
@@ -915,8 +935,7 @@ static void *btf_raw_create(const struct btf_header *hdr,
 	for (i = 0; i < type_sec_size / sizeof(raw_types[0]); i++) {
 		if (raw_types[i] == NAME_TBD) {
 			next_str = get_next_str(next_str, end_str);
-			if (!next_str) {
-				fprintf(stderr, "Error in getting next_str\n");
+			if (CHECK(!next_str, "Error in getting next_str")) {
 				free(raw_btf);
 				return NULL;
 			}
@@ -973,9 +992,8 @@ static int do_test_raw(unsigned int test_num)
 	free(raw_btf);
 
 	err = ((btf_fd == -1) != test->btf_load_err);
-	if (err)
-		fprintf(stderr, "btf_load_err:%d btf_fd:%d\n",
-			test->btf_load_err, btf_fd);
+	CHECK(err, "btf_fd:%d test->btf_load_err:%u",
+	      btf_fd, test->btf_load_err);
 
 	if (err || btf_fd == -1)
 		goto done;
@@ -992,16 +1010,15 @@ static int do_test_raw(unsigned int test_num)
 	map_fd = bpf_create_map_xattr(&create_attr);
 
 	err = ((map_fd == -1) != test->map_create_err);
-	if (err)
-		fprintf(stderr, "map_create_err:%d map_fd:%d\n",
-			test->map_create_err, map_fd);
+	CHECK(err, "map_fd:%d test->map_create_err:%u",
+	      map_fd, test->map_create_err);
 
 done:
 	if (!err)
-		fprintf(stderr, "OK\n");
+		fprintf(stderr, "OK");
 
 	if (*btf_log_buf && (err || args.always_log))
-		fprintf(stderr, "%s\n", btf_log_buf);
+		fprintf(stderr, "\n%s", btf_log_buf);
 
 	if (btf_fd != -1)
 		close(btf_fd);
@@ -1017,10 +1034,10 @@ static int test_raw(void)
 	int err = 0;
 
 	if (args.raw_test_num)
-		return do_test_raw(args.raw_test_num);
+		return count_result(do_test_raw(args.raw_test_num));
 
 	for (i = 1; i <= ARRAY_SIZE(raw_tests); i++)
-		err |= do_test_raw(i);
+		err |= count_result(do_test_raw(i));
 
 	return err;
 }
@@ -1080,8 +1097,7 @@ static int do_test_get_info(unsigned int test_num)
 	*btf_log_buf = '\0';
 
 	user_btf = malloc(raw_btf_size);
-	if (!user_btf) {
-		fprintf(stderr, "Cannot allocate memory for user_btf\n");
+	if (CHECK(!user_btf, "!user_btf")) {
 		err = -1;
 		goto done;
 	}
@@ -1089,9 +1105,7 @@ static int do_test_get_info(unsigned int test_num)
 	btf_fd = bpf_load_btf(raw_btf, raw_btf_size,
 			      btf_log_buf, BTF_LOG_BUF_SIZE,
 			      args.always_log);
-	if (btf_fd == -1) {
-		fprintf(stderr, "bpf_load_btf:%s(%d)\n",
-			strerror(errno), errno);
+	if (CHECK(btf_fd == -1, "errno:%d", errno)) {
 		err = -1;
 		goto done;
 	}
@@ -1103,31 +1117,31 @@ static int do_test_get_info(unsigned int test_num)
 		       raw_btf_size - expected_nbytes);
 
 	err = bpf_obj_get_info_by_fd(btf_fd, user_btf, &user_btf_size);
-	if (err || user_btf_size != raw_btf_size ||
-	    memcmp(raw_btf, user_btf, expected_nbytes)) {
-		fprintf(stderr,
-			"err:%d(errno:%d) raw_btf_size:%u user_btf_size:%u expected_nbytes:%u memcmp:%d\n",
-			err, errno,
-			raw_btf_size, user_btf_size, expected_nbytes,
-			memcmp(raw_btf, user_btf, expected_nbytes));
+	if (CHECK(err || user_btf_size != raw_btf_size ||
+		  memcmp(raw_btf, user_btf, expected_nbytes),
+		  "err:%d(errno:%d) raw_btf_size:%u user_btf_size:%u expected_nbytes:%u memcmp:%d",
+		  err, errno,
+		  raw_btf_size, user_btf_size, expected_nbytes,
+		  memcmp(raw_btf, user_btf, expected_nbytes))) {
 		err = -1;
 		goto done;
 	}
 
 	while (expected_nbytes < raw_btf_size) {
 		fprintf(stderr, "%u...", expected_nbytes);
-		if (user_btf[expected_nbytes++] != 0xff) {
-			fprintf(stderr, "!= 0xff\n");
+		if (CHECK(user_btf[expected_nbytes++] != 0xff,
+			  "user_btf[%u]:%x != 0xff", expected_nbytes - 1,
+			  user_btf[expected_nbytes - 1])) {
 			err = -1;
 			goto done;
 		}
 	}
 
-	fprintf(stderr, "OK\n");
+	fprintf(stderr, "OK");
 
 done:
 	if (*btf_log_buf && (err || args.always_log))
-		fprintf(stderr, "%s\n", btf_log_buf);
+		fprintf(stderr, "\n%s", btf_log_buf);
 
 	free(raw_btf);
 	free(user_btf);
@@ -1144,10 +1158,10 @@ static int test_get_info(void)
 	int err = 0;
 
 	if (args.get_info_test_num)
-		return do_test_get_info(args.get_info_test_num);
+		return count_result(do_test_get_info(args.get_info_test_num));
 
 	for (i = 1; i <= ARRAY_SIZE(get_info_tests); i++)
-		err |= do_test_get_info(i);
+		err |= count_result(do_test_get_info(i));
 
 	return err;
 }
@@ -1175,28 +1189,21 @@ static int file_has_btf_elf(const char *fn)
 	Elf *elf;
 	int ret;
 
-	if (elf_version(EV_CURRENT) == EV_NONE) {
-		fprintf(stderr, "Failed to init libelf\n");
+	if (CHECK(elf_version(EV_CURRENT) == EV_NONE,
+		  "elf_version(EV_CURRENT) == EV_NONE"))
 		return -1;
-	}
 
 	elf_fd = open(fn, O_RDONLY);
-	if (elf_fd == -1) {
-		fprintf(stderr, "Cannot open file %s: %s(%d)\n",
-			fn, strerror(errno), errno);
+	if (CHECK(elf_fd == -1, "open(%s): errno:%d", fn, errno))
 		return -1;
-	}
 
 	elf = elf_begin(elf_fd, ELF_C_READ, NULL);
-	if (!elf) {
-		fprintf(stderr, "Failed to read ELF from %s. %s\n", fn,
-			elf_errmsg(elf_errno()));
+	if (CHECK(!elf, "elf_begin(%s): %s", fn, elf_errmsg(elf_errno()))) {
 		ret = -1;
 		goto done;
 	}
 
-	if (!gelf_getehdr(elf, &ehdr)) {
-		fprintf(stderr, "Failed to get EHDR from %s\n", fn);
+	if (CHECK(!gelf_getehdr(elf, &ehdr), "!gelf_getehdr(%s)", fn)) {
 		ret = -1;
 		goto done;
 	}
@@ -1205,9 +1212,8 @@ static int file_has_btf_elf(const char *fn)
 		const char *sh_name;
 		GElf_Shdr sh;
 
-		if (gelf_getshdr(scn, &sh) != &sh) {
-			fprintf(stderr,
-				"Failed to get section header from %s\n", fn);
+		if (CHECK(gelf_getshdr(scn, &sh) != &sh,
+			  "file:%s gelf_getshdr != &sh", fn)) {
 			ret = -1;
 			goto done;
 		}
@@ -1243,53 +1249,44 @@ static int do_test_file(unsigned int test_num)
 		return err;
 
 	if (err == 0) {
-		fprintf(stderr, "SKIP. No ELF %s found\n", BTF_ELF_SEC);
+		fprintf(stderr, "SKIP. No ELF %s found", BTF_ELF_SEC);
+		skip_cnt++;
 		return 0;
 	}
 
 	obj = bpf_object__open(test->file);
-	if (IS_ERR(obj))
+	if (CHECK(IS_ERR(obj), "obj: %ld", PTR_ERR(obj)))
 		return PTR_ERR(obj);
 
 	err = bpf_object__btf_fd(obj);
-	if (err == -1) {
-		fprintf(stderr, "bpf_object__btf_fd: -1\n");
+	if (CHECK(err == -1, "bpf_object__btf_fd: -1"))
 		goto done;
-	}
 
 	prog = bpf_program__next(NULL, obj);
-	if (!prog) {
-		fprintf(stderr, "Cannot find bpf_prog\n");
+	if (CHECK(!prog, "Cannot find bpf_prog")) {
 		err = -1;
 		goto done;
 	}
 
 	bpf_program__set_type(prog, BPF_PROG_TYPE_TRACEPOINT);
 	err = bpf_object__load(obj);
-	if (err < 0) {
-		fprintf(stderr, "bpf_object__load: %d\n", err);
+	if (CHECK(err < 0, "bpf_object__load: %d", err))
 		goto done;
-	}
 
 	map = bpf_object__find_map_by_name(obj, "btf_map");
-	if (!map) {
-		fprintf(stderr, "btf_map not found\n");
+	if (CHECK(!map, "btf_map not found")) {
 		err = -1;
 		goto done;
 	}
 
 	err = (bpf_map__btf_key_id(map) == 0 || bpf_map__btf_value_id(map) == 0)
 		!= test->btf_kv_notfound;
-	if (err) {
-		fprintf(stderr,
-			"btf_kv_notfound:%u btf_key_id:%u btf_value_id:%u\n",
-			test->btf_kv_notfound,
-			bpf_map__btf_key_id(map),
-			bpf_map__btf_value_id(map));
+	if (CHECK(err, "btf_key_id:%u btf_value_id:%u test->btf_kv_notfound:%u",
+		  bpf_map__btf_key_id(map), bpf_map__btf_value_id(map),
+		  test->btf_kv_notfound))
 		goto done;
-	}
 
-	fprintf(stderr, "OK\n");
+	fprintf(stderr, "OK");
 
 done:
 	bpf_object__close(obj);
@@ -1302,10 +1299,10 @@ static int test_file(void)
 	int err = 0;
 
 	if (args.file_test_num)
-		return do_test_file(args.file_test_num);
+		return count_result(do_test_file(args.file_test_num));
 
 	for (i = 1; i <= ARRAY_SIZE(file_tests); i++)
-		err |= do_test_file(i);
+		err |= count_result(do_test_file(i));
 
 	return err;
 }
@@ -1425,7 +1422,7 @@ static int test_pprint(void)
 	unsigned int key;
 	uint8_t *raw_btf;
 	ssize_t nread;
-	int err;
+	int err, ret;
 
 	fprintf(stderr, "%s......", test->descr);
 	raw_btf = btf_raw_create(&hdr_tmpl, test->raw_types,
@@ -1441,10 +1438,8 @@ static int test_pprint(void)
 			      args.always_log);
 	free(raw_btf);
 
-	if (btf_fd == -1) {
+	if (CHECK(btf_fd == -1, "errno:%d", errno)) {
 		err = -1;
-		fprintf(stderr, "bpf_load_btf: %s(%d)\n",
-			strerror(errno), errno);
 		goto done;
 	}
 
@@ -1458,26 +1453,23 @@ static int test_pprint(void)
 	create_attr.btf_value_id = test->value_id;
 
 	map_fd = bpf_create_map_xattr(&create_attr);
-	if (map_fd == -1) {
+	if (CHECK(map_fd == -1, "errno:%d", errno)) {
 		err = -1;
-		fprintf(stderr, "bpf_creat_map_btf: %s(%d)\n",
-			strerror(errno), errno);
 		goto done;
 	}
 
-	if (snprintf(pin_path, sizeof(pin_path), "%s/%s",
-		     "/sys/fs/bpf", test->map_name) == sizeof(pin_path)) {
+	ret = snprintf(pin_path, sizeof(pin_path), "%s/%s",
+		       "/sys/fs/bpf", test->map_name);
+
+	if (CHECK(ret == sizeof(pin_path), "pin_path %s/%s is too long",
+		  "/sys/fs/bpf", test->map_name)) {
 		err = -1;
-		fprintf(stderr, "pin_path is too long\n");
 		goto done;
 	}
 
 	err = bpf_obj_pin(map_fd, pin_path);
-	if (err) {
-		fprintf(stderr, "Cannot pin to %s. %s(%d).\n", pin_path,
-			strerror(errno), errno);
+	if (CHECK(err, "bpf_obj_pin(%s): errno:%d.", pin_path, errno))
 		goto done;
-	}
 
 	for (key = 0; key < test->max_entries; key++) {
 		set_pprint_mapv(&mapv, key);
@@ -1485,10 +1477,8 @@ static int test_pprint(void)
 	}
 
 	pin_file = fopen(pin_path, "r");
-	if (!pin_file) {
+	if (CHECK(!pin_file, "fopen(%s): errno:%d", pin_path, errno)) {
 		err = -1;
-		fprintf(stderr, "fopen(%s): %s(%d)\n", pin_path,
-			strerror(errno), errno);
 		goto done;
 	}
 
@@ -1497,9 +1487,8 @@ static int test_pprint(void)
 	       *line == '#')
 		;
 
-	if (nread <= 0) {
+	if (CHECK(nread <= 0, "Unexpected EOF")) {
 		err = -1;
-		fprintf(stderr, "Unexpected EOF\n");
 		goto done;
 	}
 
@@ -1518,9 +1507,9 @@ static int test_pprint(void)
 					  mapv.ui8a[4], mapv.ui8a[5], mapv.ui8a[6], mapv.ui8a[7],
 					  pprint_enum_str[mapv.aenum]);
 
-		if (nexpected_line == sizeof(expected_line)) {
+		if (CHECK(nexpected_line == sizeof(expected_line),
+			  "expected_line is too long")) {
 			err = -1;
-			fprintf(stderr, "expected_line is too long\n");
 			goto done;
 		}
 
@@ -1535,15 +1524,15 @@ static int test_pprint(void)
 		nread = getline(&line, &line_len, pin_file);
 	} while (++key < test->max_entries && nread > 0);
 
-	if (key < test->max_entries) {
+	if (CHECK(key < test->max_entries,
+		  "Unexpected EOF. key:%u test->max_entries:%u",
+		  key, test->max_entries)) {
 		err = -1;
-		fprintf(stderr, "Unexpected EOF\n");
 		goto done;
 	}
 
-	if (nread > 0) {
+	if (CHECK(nread > 0, "Unexpected extra pprint output: %s", line)) {
 		err = -1;
-		fprintf(stderr, "Unexpected extra pprint output: %s\n", line);
 		goto done;
 	}
 
@@ -1551,9 +1540,9 @@ static int test_pprint(void)
 
 done:
 	if (!err)
-		fprintf(stderr, "OK\n");
+		fprintf(stderr, "OK");
 	if (*btf_log_buf && (err || args.always_log))
-		fprintf(stderr, "%s\n", btf_log_buf);
+		fprintf(stderr, "\n%s", btf_log_buf);
 	if (btf_fd != -1)
 		close(btf_fd);
 	if (map_fd != -1)
@@ -1634,6 +1623,12 @@ static int parse_args(int argc, char **argv)
 	return 0;
 }
 
+static void print_summary(void)
+{
+	fprintf(stderr, "PASS:%u SKIP:%u FAIL:%u\n",
+		pass_cnt - skip_cnt, skip_cnt, error_cnt);
+}
+
 int main(int argc, char **argv)
 {
 	int err = 0;
@@ -1655,15 +1650,17 @@ int main(int argc, char **argv)
 		err |= test_file();
 
 	if (args.pprint_test)
-		err |= test_pprint();
+		err |= count_result(test_pprint());
 
 	if (args.raw_test || args.get_info_test || args.file_test ||
 	    args.pprint_test)
-		return err;
+		goto done;
 
 	err |= test_raw();
 	err |= test_get_info();
 	err |= test_file();
 
+done:
+	print_summary();
 	return err;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 7a01f6a3bd695d6d0f993a6df242a73a8b7c28c8 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 4 May 2018 14:49:54 -0700
Subject: bpf: btf: Update tools/include/uapi/linux/btf.h with BTF ID

This patch sync the tools/include/uapi/linux/btf.h with
the newly introduced BTF ID support.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/include/uapi/linux/bpf.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 83a95ae388dd..fff51c187d1e 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -96,6 +96,7 @@ enum bpf_cmd {
 	BPF_PROG_QUERY,
 	BPF_RAW_TRACEPOINT_OPEN,
 	BPF_BTF_LOAD,
+	BPF_BTF_GET_FD_BY_ID,
 };
 
 enum bpf_map_type {
@@ -343,6 +344,7 @@ union bpf_attr {
 			__u32		start_id;
 			__u32		prog_id;
 			__u32		map_id;
+			__u32		btf_id;
 		};
 		__u32		next_id;
 		__u32		open_flags;
@@ -2129,6 +2131,15 @@ struct bpf_map_info {
 	__u32 ifindex;
 	__u64 netns_dev;
 	__u64 netns_ino;
+	__u32 btf_id;
+	__u32 btf_key_id;
+	__u32 btf_value_id;
+} __attribute__((aligned(8)));
+
+struct bpf_btf_info {
+	__aligned_u64 btf;
+	__u32 btf_size;
+	__u32 id;
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
-- 
cgit v1.2.3-59-g8ed1b


From cd8b89280c1c843756a2c95ed50a1a6446b42b52 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 4 May 2018 14:49:55 -0700
Subject: bpf: btf: Tests for BPF_OBJ_GET_INFO_BY_FD and BPF_BTF_GET_FD_BY_ID

This patch adds test for BPF_BTF_GET_FD_BY_ID and the new
btf_id/btf_key_id/btf_value_id in the "struct bpf_map_info".

It also modifies the existing BPF_OBJ_GET_INFO_BY_FD test
to reflect the new "struct bpf_btf_info".

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/bpf.c                    |  10 ++
 tools/lib/bpf/bpf.h                    |   1 +
 tools/testing/selftests/bpf/test_btf.c | 289 +++++++++++++++++++++++++++++++--
 3 files changed, 287 insertions(+), 13 deletions(-)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 76b36cc16e7f..a3a8fb2ac697 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -458,6 +458,16 @@ int bpf_map_get_fd_by_id(__u32 id)
 	return sys_bpf(BPF_MAP_GET_FD_BY_ID, &attr, sizeof(attr));
 }
 
+int bpf_btf_get_fd_by_id(__u32 id)
+{
+	union bpf_attr attr;
+
+	bzero(&attr, sizeof(attr));
+	attr.btf_id = id;
+
+	return sys_bpf(BPF_BTF_GET_FD_BY_ID, &attr, sizeof(attr));
+}
+
 int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len)
 {
 	union bpf_attr attr;
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 553b11ad52b3..fb3a146d92ff 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -98,6 +98,7 @@ int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id);
 int bpf_map_get_next_id(__u32 start_id, __u32 *next_id);
 int bpf_prog_get_fd_by_id(__u32 id);
 int bpf_map_get_fd_by_id(__u32 id);
+int bpf_btf_get_fd_by_id(__u32 id);
 int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len);
 int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags,
 		   __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt);
diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c
index b7880a20fad1..c8bceae7ec02 100644
--- a/tools/testing/selftests/bpf/test_btf.c
+++ b/tools/testing/selftests/bpf/test_btf.c
@@ -1047,9 +1047,13 @@ struct btf_get_info_test {
 	const char *str_sec;
 	__u32 raw_types[MAX_NR_RAW_TYPES];
 	__u32 str_sec_size;
-	int info_size_delta;
+	int btf_size_delta;
+	int (*special_test)(unsigned int test_num);
 };
 
+static int test_big_btf_info(unsigned int test_num);
+static int test_btf_id(unsigned int test_num);
+
 const struct btf_get_info_test get_info_tests[] = {
 {
 	.descr = "== raw_btf_size+1",
@@ -1060,7 +1064,7 @@ const struct btf_get_info_test get_info_tests[] = {
 	},
 	.str_sec = "",
 	.str_sec_size = sizeof(""),
-	.info_size_delta = 1,
+	.btf_size_delta = 1,
 },
 {
 	.descr = "== raw_btf_size-3",
@@ -1071,20 +1075,274 @@ const struct btf_get_info_test get_info_tests[] = {
 	},
 	.str_sec = "",
 	.str_sec_size = sizeof(""),
-	.info_size_delta = -3,
+	.btf_size_delta = -3,
+},
+{
+	.descr = "Large bpf_btf_info",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "",
+	.str_sec_size = sizeof(""),
+	.special_test = test_big_btf_info,
+},
+{
+	.descr = "BTF ID",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* unsigned int */			/* [2] */
+		BTF_TYPE_INT_ENC(0, 0, 0, 32, 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "",
+	.str_sec_size = sizeof(""),
+	.special_test = test_btf_id,
 },
 };
 
+static inline __u64 ptr_to_u64(const void *ptr)
+{
+	return (__u64)(unsigned long)ptr;
+}
+
+static int test_big_btf_info(unsigned int test_num)
+{
+	const struct btf_get_info_test *test = &get_info_tests[test_num - 1];
+	uint8_t *raw_btf = NULL, *user_btf = NULL;
+	unsigned int raw_btf_size;
+	struct {
+		struct bpf_btf_info info;
+		uint64_t garbage;
+	} info_garbage;
+	struct bpf_btf_info *info;
+	int btf_fd = -1, err;
+	uint32_t info_len;
+
+	raw_btf = btf_raw_create(&hdr_tmpl,
+				 test->raw_types,
+				 test->str_sec,
+				 test->str_sec_size,
+				 &raw_btf_size);
+
+	if (!raw_btf)
+		return -1;
+
+	*btf_log_buf = '\0';
+
+	user_btf = malloc(raw_btf_size);
+	if (CHECK(!user_btf, "!user_btf")) {
+		err = -1;
+		goto done;
+	}
+
+	btf_fd = bpf_load_btf(raw_btf, raw_btf_size,
+			      btf_log_buf, BTF_LOG_BUF_SIZE,
+			      args.always_log);
+	if (CHECK(btf_fd == -1, "errno:%d", errno)) {
+		err = -1;
+		goto done;
+	}
+
+	/*
+	 * GET_INFO should error out if the userspace info
+	 * has non zero tailing bytes.
+	 */
+	info = &info_garbage.info;
+	memset(info, 0, sizeof(*info));
+	info_garbage.garbage = 0xdeadbeef;
+	info_len = sizeof(info_garbage);
+	info->btf = ptr_to_u64(user_btf);
+	info->btf_size = raw_btf_size;
+
+	err = bpf_obj_get_info_by_fd(btf_fd, info, &info_len);
+	if (CHECK(!err, "!err")) {
+		err = -1;
+		goto done;
+	}
+
+	/*
+	 * GET_INFO should succeed even info_len is larger than
+	 * the kernel supported as long as tailing bytes are zero.
+	 * The kernel supported info len should also be returned
+	 * to userspace.
+	 */
+	info_garbage.garbage = 0;
+	err = bpf_obj_get_info_by_fd(btf_fd, info, &info_len);
+	if (CHECK(err || info_len != sizeof(*info),
+		  "err:%d errno:%d info_len:%u sizeof(*info):%lu",
+		  err, errno, info_len, sizeof(*info))) {
+		err = -1;
+		goto done;
+	}
+
+	fprintf(stderr, "OK");
+
+done:
+	if (*btf_log_buf && (err || args.always_log))
+		fprintf(stderr, "\n%s", btf_log_buf);
+
+	free(raw_btf);
+	free(user_btf);
+
+	if (btf_fd != -1)
+		close(btf_fd);
+
+	return err;
+}
+
+static int test_btf_id(unsigned int test_num)
+{
+	const struct btf_get_info_test *test = &get_info_tests[test_num - 1];
+	struct bpf_create_map_attr create_attr = {};
+	uint8_t *raw_btf = NULL, *user_btf[2] = {};
+	int btf_fd[2] = {-1, -1}, map_fd = -1;
+	struct bpf_map_info map_info = {};
+	struct bpf_btf_info info[2] = {};
+	unsigned int raw_btf_size;
+	uint32_t info_len;
+	int err, i, ret;
+
+	raw_btf = btf_raw_create(&hdr_tmpl,
+				 test->raw_types,
+				 test->str_sec,
+				 test->str_sec_size,
+				 &raw_btf_size);
+
+	if (!raw_btf)
+		return -1;
+
+	*btf_log_buf = '\0';
+
+	for (i = 0; i < 2; i++) {
+		user_btf[i] = malloc(raw_btf_size);
+		if (CHECK(!user_btf[i], "!user_btf[%d]", i)) {
+			err = -1;
+			goto done;
+		}
+		info[i].btf = ptr_to_u64(user_btf[i]);
+		info[i].btf_size = raw_btf_size;
+	}
+
+	btf_fd[0] = bpf_load_btf(raw_btf, raw_btf_size,
+				 btf_log_buf, BTF_LOG_BUF_SIZE,
+				 args.always_log);
+	if (CHECK(btf_fd[0] == -1, "errno:%d", errno)) {
+		err = -1;
+		goto done;
+	}
+
+	/* Test BPF_OBJ_GET_INFO_BY_ID on btf_id */
+	info_len = sizeof(info[0]);
+	err = bpf_obj_get_info_by_fd(btf_fd[0], &info[0], &info_len);
+	if (CHECK(err, "errno:%d", errno)) {
+		err = -1;
+		goto done;
+	}
+
+	btf_fd[1] = bpf_btf_get_fd_by_id(info[0].id);
+	if (CHECK(btf_fd[1] == -1, "errno:%d", errno)) {
+		err = -1;
+		goto done;
+	}
+
+	ret = 0;
+	err = bpf_obj_get_info_by_fd(btf_fd[1], &info[1], &info_len);
+	if (CHECK(err || info[0].id != info[1].id ||
+		  info[0].btf_size != info[1].btf_size ||
+		  (ret = memcmp(user_btf[0], user_btf[1], info[0].btf_size)),
+		  "err:%d errno:%d id0:%u id1:%u btf_size0:%u btf_size1:%u memcmp:%d",
+		  err, errno, info[0].id, info[1].id,
+		  info[0].btf_size, info[1].btf_size, ret)) {
+		err = -1;
+		goto done;
+	}
+
+	/* Test btf members in struct bpf_map_info */
+	create_attr.name = "test_btf_id";
+	create_attr.map_type = BPF_MAP_TYPE_ARRAY;
+	create_attr.key_size = sizeof(int);
+	create_attr.value_size = sizeof(unsigned int);
+	create_attr.max_entries = 4;
+	create_attr.btf_fd = btf_fd[0];
+	create_attr.btf_key_id = 1;
+	create_attr.btf_value_id = 2;
+
+	map_fd = bpf_create_map_xattr(&create_attr);
+	if (CHECK(map_fd == -1, "errno:%d", errno)) {
+		err = -1;
+		goto done;
+	}
+
+	info_len = sizeof(map_info);
+	err = bpf_obj_get_info_by_fd(map_fd, &map_info, &info_len);
+	if (CHECK(err || map_info.btf_id != info[0].id ||
+		  map_info.btf_key_id != 1 || map_info.btf_value_id != 2,
+		  "err:%d errno:%d info.id:%u btf_id:%u btf_key_id:%u btf_value_id:%u",
+		  err, errno, info[0].id, map_info.btf_id, map_info.btf_key_id,
+		  map_info.btf_value_id)) {
+		err = -1;
+		goto done;
+	}
+
+	for (i = 0; i < 2; i++) {
+		close(btf_fd[i]);
+		btf_fd[i] = -1;
+	}
+
+	/* Test BTF ID is removed from the kernel */
+	btf_fd[0] = bpf_btf_get_fd_by_id(map_info.btf_id);
+	if (CHECK(btf_fd[0] == -1, "errno:%d", errno)) {
+		err = -1;
+		goto done;
+	}
+	close(btf_fd[0]);
+	btf_fd[0] = -1;
+
+	/* The map holds the last ref to BTF and its btf_id */
+	close(map_fd);
+	map_fd = -1;
+	btf_fd[0] = bpf_btf_get_fd_by_id(map_info.btf_id);
+	if (CHECK(btf_fd[0] != -1, "BTF lingers")) {
+		err = -1;
+		goto done;
+	}
+
+	fprintf(stderr, "OK");
+
+done:
+	if (*btf_log_buf && (err || args.always_log))
+		fprintf(stderr, "\n%s", btf_log_buf);
+
+	free(raw_btf);
+	if (map_fd != -1)
+		close(map_fd);
+	for (i = 0; i < 2; i++) {
+		free(user_btf[i]);
+		if (btf_fd[i] != -1)
+			close(btf_fd[i]);
+	}
+
+	return err;
+}
+
 static int do_test_get_info(unsigned int test_num)
 {
 	const struct btf_get_info_test *test = &get_info_tests[test_num - 1];
 	unsigned int raw_btf_size, user_btf_size, expected_nbytes;
 	uint8_t *raw_btf = NULL, *user_btf = NULL;
-	int btf_fd = -1, err;
+	struct bpf_btf_info info = {};
+	int btf_fd = -1, err, ret;
+	uint32_t info_len;
 
-	fprintf(stderr, "BTF GET_INFO_BY_ID test[%u] (%s): ",
+	fprintf(stderr, "BTF GET_INFO test[%u] (%s): ",
 		test_num, test->descr);
 
+	if (test->special_test)
+		return test->special_test(test_num);
+
 	raw_btf = btf_raw_create(&hdr_tmpl,
 				 test->raw_types,
 				 test->str_sec,
@@ -1110,19 +1368,24 @@ static int do_test_get_info(unsigned int test_num)
 		goto done;
 	}
 
-	user_btf_size = (int)raw_btf_size + test->info_size_delta;
+	user_btf_size = (int)raw_btf_size + test->btf_size_delta;
 	expected_nbytes = min(raw_btf_size, user_btf_size);
 	if (raw_btf_size > expected_nbytes)
 		memset(user_btf + expected_nbytes, 0xff,
 		       raw_btf_size - expected_nbytes);
 
-	err = bpf_obj_get_info_by_fd(btf_fd, user_btf, &user_btf_size);
-	if (CHECK(err || user_btf_size != raw_btf_size ||
-		  memcmp(raw_btf, user_btf, expected_nbytes),
-		  "err:%d(errno:%d) raw_btf_size:%u user_btf_size:%u expected_nbytes:%u memcmp:%d",
-		  err, errno,
-		  raw_btf_size, user_btf_size, expected_nbytes,
-		  memcmp(raw_btf, user_btf, expected_nbytes))) {
+	info_len = sizeof(info);
+	info.btf = ptr_to_u64(user_btf);
+	info.btf_size = user_btf_size;
+
+	ret = 0;
+	err = bpf_obj_get_info_by_fd(btf_fd, &info, &info_len);
+	if (CHECK(err || !info.id || info_len != sizeof(info) ||
+		  info.btf_size != raw_btf_size ||
+		  (ret = memcmp(raw_btf, user_btf, expected_nbytes)),
+		  "err:%d errno:%d info.id:%u info_len:%u sizeof(info):%lu raw_btf_size:%u info.btf_size:%u expected_nbytes:%u memcmp:%d",
+		  err, errno, info.id, info_len, sizeof(info),
+		  raw_btf_size, info.btf_size, expected_nbytes, ret)) {
 		err = -1;
 		goto done;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 2ead8ae110c6b62fe4d1d1bf04855e86582b96f5 Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Thu, 12 Apr 2018 11:15:55 -0700
Subject: fm10k: reduce duplicate fm10k_stat macro code

Share some of the code for setting up fm10k_stat macros by implementing
an FM10K_STAT_FIELDS macro which we can use when setting up the type
specific macros.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Krishneil Singh <krishneil.k.singh@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c | 29 ++++++++++++------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
index eeac2b75a195..471dfd92d41b 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
@@ -11,12 +11,17 @@ struct fm10k_stats {
 	int stat_offset;
 };
 
-#define FM10K_NETDEV_STAT(_net_stat) { \
-	.stat_string = #_net_stat, \
-	.sizeof_stat = FIELD_SIZEOF(struct net_device_stats, _net_stat), \
-	.stat_offset = offsetof(struct net_device_stats, _net_stat) \
+#define FM10K_STAT_FIELDS(_type, _name, _stat) { \
+	.stat_string = _name, \
+	.sizeof_stat = FIELD_SIZEOF(_type, _stat), \
+	.stat_offset = offsetof(_type, _stat) \
 }
 
+/* netdevice statistics */
+#define FM10K_NETDEV_STAT(_net_stat) \
+	FM10K_STAT_FIELDS(struct net_device_stats, __stringify(_net_stat), \
+			  _net_stat)
+
 static const struct fm10k_stats fm10k_gstrings_net_stats[] = {
 	FM10K_NETDEV_STAT(tx_packets),
 	FM10K_NETDEV_STAT(tx_bytes),
@@ -34,11 +39,9 @@ static const struct fm10k_stats fm10k_gstrings_net_stats[] = {
 
 #define FM10K_NETDEV_STATS_LEN	ARRAY_SIZE(fm10k_gstrings_net_stats)
 
-#define FM10K_STAT(_name, _stat) { \
-	.stat_string = _name, \
-	.sizeof_stat = FIELD_SIZEOF(struct fm10k_intfc, _stat), \
-	.stat_offset = offsetof(struct fm10k_intfc, _stat) \
-}
+/* General interface statistics */
+#define FM10K_STAT(_name, _stat) \
+	FM10K_STAT_FIELDS(struct fm10k_intfc, _name, _stat)
 
 static const struct fm10k_stats fm10k_gstrings_global_stats[] = {
 	FM10K_STAT("tx_restart_queue", restart_queue),
@@ -75,11 +78,9 @@ static const struct fm10k_stats fm10k_gstrings_pf_stats[] = {
 	FM10K_STAT("nodesc_drop", stats.nodesc_drop.count),
 };
 
-#define FM10K_MBX_STAT(_name, _stat) { \
-	.stat_string = _name, \
-	.sizeof_stat = FIELD_SIZEOF(struct fm10k_mbx_info, _stat), \
-	.stat_offset = offsetof(struct fm10k_mbx_info, _stat) \
-}
+/* mailbox statistics */
+#define FM10K_MBX_STAT(_name, _stat) \
+	FM10K_STAT_FIELDS(struct fm10k_mbx_info, _name, _stat)
 
 static const struct fm10k_stats fm10k_gstrings_mbx_stats[] = {
 	FM10K_MBX_STAT("mbx_tx_busy", tx_busy),
-- 
cgit v1.2.3-59-g8ed1b


From d63bb21a7e722fcaa6cc6a217f21fe25a9e2c89e Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Thu, 12 Apr 2018 11:15:56 -0700
Subject: fm10k: use variadic arguments to fm10k_add_stat_strings

Instead of using a fixed prefix string we setup before each call to
fm10k_add_stat_strings, modify the helper to take variadic arguments and
pass them to vsnprintf. This requires changing the fm10k_stat strings to
take % format specifiers where necessary, but the resulting code is much
simpler.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Krishneil Singh <krishneil.k.singh@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c | 53 ++++++++++++------------
 1 file changed, 27 insertions(+), 26 deletions(-)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
index 471dfd92d41b..17d2388e71a2 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
@@ -6,6 +6,11 @@
 #include "fm10k.h"
 
 struct fm10k_stats {
+	/* The stat_string is expected to be a format string formatted using
+	 * vsnprintf by fm10k_add_stat_strings. Every member of a stats array
+	 * should use the same format specifiers as they will be formatted
+	 * using the same variadic arguments.
+	 */
 	char stat_string[ETH_GSTRING_LEN];
 	int sizeof_stat;
 	int stat_offset;
@@ -94,15 +99,13 @@ static const struct fm10k_stats fm10k_gstrings_mbx_stats[] = {
 	FM10K_MBX_STAT("mbx_rx_mbmem_pushed", rx_mbmem_pushed),
 };
 
-#define FM10K_QUEUE_STAT(_name, _stat) { \
-	.stat_string = _name, \
-	.sizeof_stat = FIELD_SIZEOF(struct fm10k_ring, _stat), \
-	.stat_offset = offsetof(struct fm10k_ring, _stat) \
-}
+/* per-queue ring statistics */
+#define FM10K_QUEUE_STAT(_name, _stat) \
+	FM10K_STAT_FIELDS(struct fm10k_ring, _name, _stat)
 
 static const struct fm10k_stats fm10k_gstrings_queue_stats[] = {
-	FM10K_QUEUE_STAT("packets", stats.packets),
-	FM10K_QUEUE_STAT("bytes", stats.bytes),
+	FM10K_QUEUE_STAT("%s_queue_%u_packets", stats.packets),
+	FM10K_QUEUE_STAT("%s_queue_%u_bytes", stats.bytes),
 };
 
 #define FM10K_GLOBAL_STATS_LEN ARRAY_SIZE(fm10k_gstrings_global_stats)
@@ -132,16 +135,18 @@ enum {
 static const char fm10k_prv_flags[FM10K_PRV_FLAG_LEN][ETH_GSTRING_LEN] = {
 };
 
-static void fm10k_add_stat_strings(u8 **p, const char *prefix,
-				   const struct fm10k_stats stats[],
-				   const unsigned int size)
+static void fm10k_add_stat_strings(u8 **p, const struct fm10k_stats stats[],
+				   const unsigned int size, ...)
 {
 	unsigned int i;
 
 	for (i = 0; i < size; i++) {
-		snprintf(*p, ETH_GSTRING_LEN, "%s%s",
-			 prefix, stats[i].stat_string);
+		va_list args;
+
+		va_start(args, size);
+		vsnprintf(*p, ETH_GSTRING_LEN, stats[i].stat_string, args);
 		*p += ETH_GSTRING_LEN;
+		va_end(args);
 	}
 }
 
@@ -150,31 +155,27 @@ static void fm10k_get_stat_strings(struct net_device *dev, u8 *data)
 	struct fm10k_intfc *interface = netdev_priv(dev);
 	unsigned int i;
 
-	fm10k_add_stat_strings(&data, "", fm10k_gstrings_net_stats,
+	fm10k_add_stat_strings(&data, fm10k_gstrings_net_stats,
 			       FM10K_NETDEV_STATS_LEN);
 
-	fm10k_add_stat_strings(&data, "", fm10k_gstrings_global_stats,
+	fm10k_add_stat_strings(&data, fm10k_gstrings_global_stats,
 			       FM10K_GLOBAL_STATS_LEN);
 
-	fm10k_add_stat_strings(&data, "", fm10k_gstrings_mbx_stats,
+	fm10k_add_stat_strings(&data, fm10k_gstrings_mbx_stats,
 			       FM10K_MBX_STATS_LEN);
 
 	if (interface->hw.mac.type != fm10k_mac_vf)
-		fm10k_add_stat_strings(&data, "", fm10k_gstrings_pf_stats,
+		fm10k_add_stat_strings(&data, fm10k_gstrings_pf_stats,
 				       FM10K_PF_STATS_LEN);
 
 	for (i = 0; i < interface->hw.mac.max_queues; i++) {
-		char prefix[ETH_GSTRING_LEN];
-
-		snprintf(prefix, ETH_GSTRING_LEN, "tx_queue_%u_", i);
-		fm10k_add_stat_strings(&data, prefix,
-				       fm10k_gstrings_queue_stats,
-				       FM10K_QUEUE_STATS_LEN);
+		fm10k_add_stat_strings(&data, fm10k_gstrings_queue_stats,
+				       FM10K_QUEUE_STATS_LEN,
+				       "tx", i);
 
-		snprintf(prefix, ETH_GSTRING_LEN, "rx_queue_%u_", i);
-		fm10k_add_stat_strings(&data, prefix,
-				       fm10k_gstrings_queue_stats,
-				       FM10K_QUEUE_STATS_LEN);
+		fm10k_add_stat_strings(&data, fm10k_gstrings_queue_stats,
+				       FM10K_QUEUE_STATS_LEN,
+				       "rx", i);
 	}
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 36592d6ce8d38590894fb34329b0786386ee75bc Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Thu, 12 Apr 2018 11:15:57 -0700
Subject: fm10k: use macro to avoid passing the array and size separately

Avoid potential bugs with fm10k_add_stat_strings and
fm10k_add_ethtool_stats by using a macro to calculate the ARRAY_SIZE
when passing. This helps ensure that the size is always correct.

Note that it assumes we only pass static const fm10k_stat arrays, and
that evaluation of the argument won't have side effects.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Krishneil Singh <krishneil.k.singh@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c | 48 +++++++++++-------------
 1 file changed, 21 insertions(+), 27 deletions(-)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
index 17d2388e71a2..09fa1a30ee3e 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
@@ -135,8 +135,8 @@ enum {
 static const char fm10k_prv_flags[FM10K_PRV_FLAG_LEN][ETH_GSTRING_LEN] = {
 };
 
-static void fm10k_add_stat_strings(u8 **p, const struct fm10k_stats stats[],
-				   const unsigned int size, ...)
+static void __fm10k_add_stat_strings(u8 **p, const struct fm10k_stats stats[],
+				     const unsigned int size, ...)
 {
 	unsigned int i;
 
@@ -150,31 +150,28 @@ static void fm10k_add_stat_strings(u8 **p, const struct fm10k_stats stats[],
 	}
 }
 
+#define fm10k_add_stat_strings(p, stats, ...) \
+	__fm10k_add_stat_strings(p, stats, ARRAY_SIZE(stats), ## __VA_ARGS__)
+
 static void fm10k_get_stat_strings(struct net_device *dev, u8 *data)
 {
 	struct fm10k_intfc *interface = netdev_priv(dev);
 	unsigned int i;
 
-	fm10k_add_stat_strings(&data, fm10k_gstrings_net_stats,
-			       FM10K_NETDEV_STATS_LEN);
+	fm10k_add_stat_strings(&data, fm10k_gstrings_net_stats);
 
-	fm10k_add_stat_strings(&data, fm10k_gstrings_global_stats,
-			       FM10K_GLOBAL_STATS_LEN);
+	fm10k_add_stat_strings(&data, fm10k_gstrings_global_stats);
 
-	fm10k_add_stat_strings(&data, fm10k_gstrings_mbx_stats,
-			       FM10K_MBX_STATS_LEN);
+	fm10k_add_stat_strings(&data, fm10k_gstrings_mbx_stats);
 
 	if (interface->hw.mac.type != fm10k_mac_vf)
-		fm10k_add_stat_strings(&data, fm10k_gstrings_pf_stats,
-				       FM10K_PF_STATS_LEN);
+		fm10k_add_stat_strings(&data, fm10k_gstrings_pf_stats);
 
 	for (i = 0; i < interface->hw.mac.max_queues; i++) {
 		fm10k_add_stat_strings(&data, fm10k_gstrings_queue_stats,
-				       FM10K_QUEUE_STATS_LEN,
 				       "tx", i);
 
 		fm10k_add_stat_strings(&data, fm10k_gstrings_queue_stats,
-				       FM10K_QUEUE_STATS_LEN,
 				       "rx", i);
 	}
 }
@@ -220,9 +217,9 @@ static int fm10k_get_sset_count(struct net_device *dev, int sset)
 	}
 }
 
-static void fm10k_add_ethtool_stats(u64 **data, void *pointer,
-				    const struct fm10k_stats stats[],
-				    const unsigned int size)
+static void __fm10k_add_ethtool_stats(u64 **data, void *pointer,
+				      const struct fm10k_stats stats[],
+				      const unsigned int size)
 {
 	unsigned int i;
 	char *p;
@@ -256,6 +253,9 @@ static void fm10k_add_ethtool_stats(u64 **data, void *pointer,
 	}
 }
 
+#define fm10k_add_ethtool_stats(data, pointer, stats) \
+	__fm10k_add_ethtool_stats(data, pointer, stats, ARRAY_SIZE(stats))
+
 static void fm10k_get_ethtool_stats(struct net_device *netdev,
 				    struct ethtool_stats __always_unused *stats,
 				    u64 *data)
@@ -266,20 +266,16 @@ static void fm10k_get_ethtool_stats(struct net_device *netdev,
 
 	fm10k_update_stats(interface);
 
-	fm10k_add_ethtool_stats(&data, net_stats, fm10k_gstrings_net_stats,
-				FM10K_NETDEV_STATS_LEN);
+	fm10k_add_ethtool_stats(&data, net_stats, fm10k_gstrings_net_stats);
 
-	fm10k_add_ethtool_stats(&data, interface, fm10k_gstrings_global_stats,
-				FM10K_GLOBAL_STATS_LEN);
+	fm10k_add_ethtool_stats(&data, interface, fm10k_gstrings_global_stats);
 
 	fm10k_add_ethtool_stats(&data, &interface->hw.mbx,
-				fm10k_gstrings_mbx_stats,
-				FM10K_MBX_STATS_LEN);
+				fm10k_gstrings_mbx_stats);
 
 	if (interface->hw.mac.type != fm10k_mac_vf) {
 		fm10k_add_ethtool_stats(&data, interface,
-					fm10k_gstrings_pf_stats,
-					FM10K_PF_STATS_LEN);
+					fm10k_gstrings_pf_stats);
 	}
 
 	for (i = 0; i < interface->hw.mac.max_queues; i++) {
@@ -287,13 +283,11 @@ static void fm10k_get_ethtool_stats(struct net_device *netdev,
 
 		ring = interface->tx_ring[i];
 		fm10k_add_ethtool_stats(&data, ring,
-					fm10k_gstrings_queue_stats,
-					FM10K_QUEUE_STATS_LEN);
+					fm10k_gstrings_queue_stats);
 
 		ring = interface->rx_ring[i];
 		fm10k_add_ethtool_stats(&data, ring,
-					fm10k_gstrings_queue_stats,
-					FM10K_QUEUE_STATS_LEN);
+					fm10k_gstrings_queue_stats);
 	}
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 454ca380ce3cac79bfd295f5a7ae15ec26ff1b67 Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Thu, 12 Apr 2018 11:15:58 -0700
Subject: fm10k: warn if the stat size is unknown

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Krishneil Singh <krishneil.k.singh@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
index 09fa1a30ee3e..7657daa27298 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c
@@ -248,6 +248,8 @@ static void __fm10k_add_ethtool_stats(u64 **data, void *pointer,
 			*((*data)++) = *(u8 *)p;
 			break;
 		default:
+			WARN_ONCE(1, "unexpected stat size for %s",
+				  stats[i].stat_string);
 			*((*data)++) = 0;
 		}
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 0a3e92de1be42b5dbed4b81f835740d3f58c5430 Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Thu, 12 Apr 2018 11:15:59 -0700
Subject: fm10k: don't protect fm10k_queue_mac_request by fm10k_host_mbx_ready

We don't actually need to check if the host mbx is ready when queuing
MAC requests, because these are not handled by a special queue which
queues up requests until the mailbox is capable of handling them.

Pull these requests outside the fm10k_host_mbx_ready() check, as it is
not necessary.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Krishneil Singh <krishneil.k.singh@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/fm10k/fm10k_netdev.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
index 0dc9f2dbc1ad..929f538d28bc 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
@@ -1537,12 +1537,12 @@ static void *fm10k_dfwd_add_station(struct net_device *dev,
 
 	glort = l2_accel->dglort + 1 + i;
 
-	if (fm10k_host_mbx_ready(interface)) {
+	if (fm10k_host_mbx_ready(interface))
 		hw->mac.ops.update_xcast_mode(hw, glort,
 					      FM10K_XCAST_MODE_NONE);
-		fm10k_queue_mac_request(interface, glort, sdev->dev_addr,
-					hw->mac.default_vid, true);
-	}
+
+	fm10k_queue_mac_request(interface, glort, sdev->dev_addr,
+				hw->mac.default_vid, true);
 
 	for (vid = fm10k_find_next_vlan(interface, 0);
 	     vid < VLAN_N_VID;
@@ -1583,12 +1583,12 @@ static void fm10k_dfwd_del_station(struct net_device *dev, void *priv)
 
 	glort = l2_accel->dglort + 1 + i;
 
-	if (fm10k_host_mbx_ready(interface)) {
+	if (fm10k_host_mbx_ready(interface))
 		hw->mac.ops.update_xcast_mode(hw, glort,
 					      FM10K_XCAST_MODE_NONE);
-		fm10k_queue_mac_request(interface, glort, sdev->dev_addr,
-					hw->mac.default_vid, false);
-	}
+
+	fm10k_queue_mac_request(interface, glort, sdev->dev_addr,
+				hw->mac.default_vid, false);
 
 	for (vid = fm10k_find_next_vlan(interface, 0);
 	     vid < VLAN_N_VID;
-- 
cgit v1.2.3-59-g8ed1b


From 0d8300325660f81787892a1c58dc1f9428a67143 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 8 May 2018 19:37:06 -0700
Subject: bpf: xdp: allow offloads to store into rx_queue_index

It's fairly easy for offloaded XDP programs to select the RX queue
packets go to.  We need a way of expressing this in the software.
Allow write to the rx_queue_index field of struct xdp_md for
device-bound programs.

Skip convert_ctx_access callback entirely for offloads.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h   | 2 +-
 kernel/bpf/verifier.c | 2 +-
 net/core/filter.c     | 9 ++++++++-
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 321969da67b7..a38e474bf7ee 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -627,7 +627,7 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map);
 #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
 int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
 
-static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux)
+static inline bool bpf_prog_is_dev_bound(const struct bpf_prog_aux *aux)
 {
 	return aux->offload_requested;
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d5e1a6c4165d..d92d9c37affd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5215,7 +5215,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		}
 	}
 
-	if (!ops->convert_ctx_access)
+	if (!ops->convert_ctx_access || bpf_prog_is_dev_bound(env->prog->aux))
 		return 0;
 
 	insn = env->prog->insnsi + delta;
diff --git a/net/core/filter.c b/net/core/filter.c
index 6877426c23a6..0baa715e4699 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4645,8 +4645,15 @@ static bool xdp_is_valid_access(int off, int size,
 				const struct bpf_prog *prog,
 				struct bpf_insn_access_aux *info)
 {
-	if (type == BPF_WRITE)
+	if (type == BPF_WRITE) {
+		if (bpf_prog_is_dev_bound(prog->aux)) {
+			switch (off) {
+			case offsetof(struct xdp_md, rx_queue_index):
+				return __is_valid_xdp_access(off, size);
+			}
+		}
 		return false;
+	}
 
 	switch (off) {
 	case offsetof(struct xdp_md, data):
-- 
cgit v1.2.3-59-g8ed1b


From d985888faae6588c8ce9e45ad1e4a3ab5f0376b4 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue, 8 May 2018 19:37:07 -0700
Subject: nfp: bpf: support setting the RX queue index

BPF has access to all internal FW datapath structures.  Including
the structure containing RX queue selection.  With little coordination
with the datapath we can let the offloaded BPF select the RX queue.
We just need a way to tell the datapath that queue selection has already
been done and it shouldn't overwrite it.  Define a bit to tell datapath
BPF already selected a queue (QSEL_SET), if the selected queue is not
enabled (>= number of enabled queues) datapath will perform normal RSS.

BPF queue selection on the NIC can be used to replace standard
datapath RSS with fully programmable BPF/XDP RSS.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/fw.h       |  1 +
 drivers/net/ethernet/netronome/nfp/bpf/jit.c      | 47 +++++++++++++++++++++++
 drivers/net/ethernet/netronome/nfp/bpf/main.c     | 11 ++++++
 drivers/net/ethernet/netronome/nfp/bpf/main.h     |  8 ++++
 drivers/net/ethernet/netronome/nfp/bpf/verifier.c | 28 +++++++++++++-
 drivers/net/ethernet/netronome/nfp/nfp_asm.h      | 22 ++++++-----
 6 files changed, 105 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/fw.h b/drivers/net/ethernet/netronome/nfp/bpf/fw.h
index 3dbc21653ce5..4c7972e3db63 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/fw.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/fw.h
@@ -50,6 +50,7 @@ enum bpf_cap_tlv_type {
 	NFP_BPF_CAP_TYPE_ADJUST_HEAD	= 2,
 	NFP_BPF_CAP_TYPE_MAPS		= 3,
 	NFP_BPF_CAP_TYPE_RANDOM		= 4,
+	NFP_BPF_CAP_TYPE_QUEUE_SELECT	= 5,
 };
 
 struct nfp_bpf_cap_tlv_func {
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 326a2085d650..a4d3da215863 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -42,6 +42,7 @@
 
 #include "main.h"
 #include "../nfp_asm.h"
+#include "../nfp_net_ctrl.h"
 
 /* --- NFP prog --- */
 /* Foreach "multiple" entries macros provide pos and next<n> pointers.
@@ -1470,6 +1471,38 @@ nfp_perf_event_output(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	return 0;
 }
 
+static int
+nfp_queue_select(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	u32 jmp_tgt;
+
+	jmp_tgt = nfp_prog_current_offset(nfp_prog) + 5;
+
+	/* Make sure the queue id fits into FW field */
+	emit_alu(nfp_prog, reg_none(), reg_a(meta->insn.src_reg * 2),
+		 ALU_OP_AND_NOT_B, reg_imm(0xff));
+	emit_br(nfp_prog, BR_BEQ, jmp_tgt, 2);
+
+	/* Set the 'queue selected' bit and the queue value */
+	emit_shf(nfp_prog, pv_qsel_set(nfp_prog),
+		 pv_qsel_set(nfp_prog), SHF_OP_OR, reg_imm(1),
+		 SHF_SC_L_SHF, PKT_VEL_QSEL_SET_BIT);
+	emit_ld_field(nfp_prog,
+		      pv_qsel_val(nfp_prog), 0x1, reg_b(meta->insn.src_reg * 2),
+		      SHF_SC_NONE, 0);
+	/* Delay slots end here, we will jump over next instruction if queue
+	 * value fits into the field.
+	 */
+	emit_ld_field(nfp_prog,
+		      pv_qsel_val(nfp_prog), 0x1, reg_imm(NFP_NET_RXR_MAX),
+		      SHF_SC_NONE, 0);
+
+	if (!nfp_prog_confirm_current_offset(nfp_prog, jmp_tgt))
+		return -EINVAL;
+
+	return 0;
+}
+
 /* --- Callbacks --- */
 static int mov_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
@@ -2160,6 +2193,17 @@ mem_stx_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 			    false, wrp_lmem_store);
 }
 
+static int mem_stx_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	switch (meta->insn.off) {
+	case offsetof(struct xdp_md, rx_queue_index):
+		return nfp_queue_select(nfp_prog, meta);
+	}
+
+	WARN_ON_ONCE(1); /* verifier should have rejected bad accesses */
+	return -EOPNOTSUPP;
+}
+
 static int
 mem_stx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 	unsigned int size)
@@ -2186,6 +2230,9 @@ static int mem_stx2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 
 static int mem_stx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
+	if (meta->ptr.type == PTR_TO_CTX)
+		if (nfp_prog->type == BPF_PROG_TYPE_XDP)
+			return mem_stx_xdp(nfp_prog, meta);
 	return mem_stx(nfp_prog, meta, 4);
 }
 
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index d72f9e7f42da..f1846d8f59cc 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -334,6 +334,13 @@ nfp_bpf_parse_cap_random(struct nfp_app_bpf *bpf, void __iomem *value,
 	return 0;
 }
 
+static int
+nfp_bpf_parse_cap_qsel(struct nfp_app_bpf *bpf, void __iomem *value, u32 length)
+{
+	bpf->queue_select = true;
+	return 0;
+}
+
 static int nfp_bpf_parse_capabilities(struct nfp_app *app)
 {
 	struct nfp_cpp *cpp = app->pf->cpp;
@@ -376,6 +383,10 @@ static int nfp_bpf_parse_capabilities(struct nfp_app *app)
 			if (nfp_bpf_parse_cap_random(app->priv, value, length))
 				goto err_release_free;
 			break;
+		case NFP_BPF_CAP_TYPE_QUEUE_SELECT:
+			if (nfp_bpf_parse_cap_qsel(app->priv, value, length))
+				goto err_release_free;
+			break;
 		default:
 			nfp_dbg(cpp, "unknown BPF capability: %d\n", type);
 			break;
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 82682378d57f..8b143546ae85 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -82,10 +82,16 @@ enum static_regs {
 enum pkt_vec {
 	PKT_VEC_PKT_LEN		= 0,
 	PKT_VEC_PKT_PTR		= 2,
+	PKT_VEC_QSEL_SET	= 4,
+	PKT_VEC_QSEL_VAL	= 6,
 };
 
+#define PKT_VEL_QSEL_SET_BIT	4
+
 #define pv_len(np)	reg_lm(1, PKT_VEC_PKT_LEN)
 #define pv_ctm_ptr(np)	reg_lm(1, PKT_VEC_PKT_PTR)
+#define pv_qsel_set(np)	reg_lm(1, PKT_VEC_QSEL_SET)
+#define pv_qsel_val(np)	reg_lm(1, PKT_VEC_QSEL_VAL)
 
 #define stack_reg(np)	reg_a(STATIC_REG_STACK)
 #define stack_imm(np)	imm_b(np)
@@ -139,6 +145,7 @@ enum pkt_vec {
  * @helpers.perf_event_output:	output perf event to a ring buffer
  *
  * @pseudo_random:	FW initialized the pseudo-random machinery (CSRs)
+ * @queue_select:	BPF can set the RX queue ID in packet vector
  */
 struct nfp_app_bpf {
 	struct nfp_app *app;
@@ -181,6 +188,7 @@ struct nfp_app_bpf {
 	} helpers;
 
 	bool pseudo_random;
+	bool queue_select;
 };
 
 enum nfp_bpf_map_use {
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index e163f3cfa47d..844a9be6e55a 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -467,6 +467,30 @@ nfp_bpf_check_ptr(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 	return 0;
 }
 
+static int
+nfp_bpf_check_store(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
+		    struct bpf_verifier_env *env)
+{
+	const struct bpf_reg_state *reg = cur_regs(env) + meta->insn.dst_reg;
+
+	if (reg->type == PTR_TO_CTX) {
+		if (nfp_prog->type == BPF_PROG_TYPE_XDP) {
+			/* XDP ctx accesses must be 4B in size */
+			switch (meta->insn.off) {
+			case offsetof(struct xdp_md, rx_queue_index):
+				if (nfp_prog->bpf->queue_select)
+					goto exit_check_ptr;
+				pr_vlog(env, "queue selection not supported by FW\n");
+				return -EOPNOTSUPP;
+			}
+		}
+		pr_vlog(env, "unsupported store to context field\n");
+		return -EOPNOTSUPP;
+	}
+exit_check_ptr:
+	return nfp_bpf_check_ptr(nfp_prog, meta, env, meta->insn.dst_reg);
+}
+
 static int
 nfp_bpf_check_xadd(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 		   struct bpf_verifier_env *env)
@@ -522,8 +546,8 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx)
 		return nfp_bpf_check_ptr(nfp_prog, meta, env,
 					 meta->insn.src_reg);
 	if (is_mbpf_store(meta))
-		return nfp_bpf_check_ptr(nfp_prog, meta, env,
-					 meta->insn.dst_reg);
+		return nfp_bpf_check_store(nfp_prog, meta, env);
+
 	if (is_mbpf_xadd(meta))
 		return nfp_bpf_check_xadd(nfp_prog, meta, env);
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.h b/drivers/net/ethernet/netronome/nfp/nfp_asm.h
index 5f2b2f24f4fa..faa4e131c136 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_asm.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.h
@@ -183,16 +183,18 @@ enum shf_sc {
 #define OP_ALU_DST_LMEXTN	0x80000000000ULL
 
 enum alu_op {
-	ALU_OP_NONE	= 0x00,
-	ALU_OP_ADD	= 0x01,
-	ALU_OP_NOT	= 0x04,
-	ALU_OP_ADD_2B	= 0x05,
-	ALU_OP_AND	= 0x08,
-	ALU_OP_SUB_C	= 0x0d,
-	ALU_OP_ADD_C	= 0x11,
-	ALU_OP_OR	= 0x14,
-	ALU_OP_SUB	= 0x15,
-	ALU_OP_XOR	= 0x18,
+	ALU_OP_NONE		= 0x00,
+	ALU_OP_ADD		= 0x01,
+	ALU_OP_NOT		= 0x04,
+	ALU_OP_ADD_2B		= 0x05,
+	ALU_OP_AND		= 0x08,
+	ALU_OP_AND_NOT_A	= 0x0c,
+	ALU_OP_SUB_C		= 0x0d,
+	ALU_OP_AND_NOT_B	= 0x10,
+	ALU_OP_ADD_C		= 0x11,
+	ALU_OP_OR		= 0x14,
+	ALU_OP_SUB		= 0x15,
+	ALU_OP_XOR		= 0x18,
 };
 
 enum alu_dst_ab {
-- 
cgit v1.2.3-59-g8ed1b


From ea7e3435297c6ba3c8ae51db6ea48f1ed657dc5c Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Mon, 7 May 2018 19:43:50 +0200
Subject: xsk: fix 64-bit division
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

i386 builds report:
  net/xdp/xdp_umem.o: In function `xdp_umem_reg':
  xdp_umem.c:(.text+0x47e): undefined reference to `__udivdi3'

This fix uses div_u64 instead of the GCC built-in.

Fixes: c0c77d8fb787 ("xsk: add user memory registration support sockopt")
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Reported-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/xdp/xdp_umem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 881dfdefe235..2b47a1dd7c6c 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -209,7 +209,7 @@ int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 	if ((addr + size) < addr)
 		return -EINVAL;
 
-	nframes = size / frame_size;
+	nframes = (unsigned int)div_u64(size, frame_size);
 	if (nframes == 0 || nframes > UINT_MAX)
 		return -EINVAL;
 
-- 
cgit v1.2.3-59-g8ed1b


From cd65cd95128781ca59d06611270fcbd9b4a7cf8d Mon Sep 17 00:00:00 2001
From: Sirio Balmelli <sirio@b-ad.ch>
Date: Tue, 8 May 2018 15:36:12 +0200
Subject: selftests/bpf: add architecture-agnostic headers

The BPF selftests fail to build with missing headers
'asm/bitsperlong.h' and 'asm/errno.h'.

These already exist in 'tools/arch/[arch]/include';
add architecture-agnostic header files in 'tools/include/uapi'
to reference them.

Signed-off-by: Sirio Balmelli <sirio@b-ad.ch>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/include/uapi/asm/bitsperlong.h | 18 ++++++++++++++++++
 tools/include/uapi/asm/errno.h       | 18 ++++++++++++++++++
 2 files changed, 36 insertions(+)
 create mode 100644 tools/include/uapi/asm/bitsperlong.h
 create mode 100644 tools/include/uapi/asm/errno.h

diff --git a/tools/include/uapi/asm/bitsperlong.h b/tools/include/uapi/asm/bitsperlong.h
new file mode 100644
index 000000000000..8dd6aefdafa4
--- /dev/null
+++ b/tools/include/uapi/asm/bitsperlong.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if defined(__i386__) || defined(__x86_64__)
+#include "../../arch/x86/include/uapi/asm/bitsperlong.h"
+#elif defined(__aarch64__)
+#include "../../arch/arm64/include/uapi/asm/bitsperlong.h"
+#elif defined(__powerpc__)
+#include "../../arch/powerpc/include/uapi/asm/bitsperlong.h"
+#elif defined(__s390__)
+#include "../../arch/s390/include/uapi/asm/bitsperlong.h"
+#elif defined(__sparc__)
+#include "../../arch/sparc/include/uapi/asm/bitsperlong.h"
+#elif defined(__mips__)
+#include "../../arch/mips/include/uapi/asm/bitsperlong.h"
+#elif defined(__ia64__)
+#include "../../arch/ia64/include/uapi/asm/bitsperlong.h"
+#else
+#include <asm-generic/bitsperlong.h>
+#endif
diff --git a/tools/include/uapi/asm/errno.h b/tools/include/uapi/asm/errno.h
new file mode 100644
index 000000000000..ce3c5945a1c4
--- /dev/null
+++ b/tools/include/uapi/asm/errno.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if defined(__i386__) || defined(__x86_64__)
+#include "../../arch/x86/include/uapi/asm/errno.h"
+#elif defined(__powerpc__)
+#include "../../arch/powerpc/include/uapi/asm/errno.h"
+#elif defined(__sparc__)
+#include "../../arch/sparc/include/uapi/asm/errno.h"
+#elif defined(__alpha__)
+#include "../../arch/alpha/include/uapi/asm/errno.h"
+#elif defined(__mips__)
+#include "../../arch/mips/include/uapi/asm/errno.h"
+#elif defined(__ia64__)
+#include "../../arch/ia64/include/uapi/asm/errno.h"
+#elif defined(__xtensa__)
+#include "../../arch/xtensa/include/uapi/asm/errno.h"
+#else
+#include <asm-generic/errno.h>
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From 96112e936350ae14454cb53be8d8c0b2c6f239f4 Mon Sep 17 00:00:00 2001
From: Sirio Balmelli <sirio@b-ad.ch>
Date: Tue, 8 May 2018 15:36:37 +0200
Subject: selftests/bpf: ignore build products

Update .gitignore files.

Signed-off-by: Sirio Balmelli <sirio@b-ad.ch>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/.gitignore           | 3 +++
 tools/testing/selftests/bpf/.gitignore | 1 +
 2 files changed, 4 insertions(+)
 create mode 100644 tools/bpf/bpftool/.gitignore

diff --git a/tools/bpf/bpftool/.gitignore b/tools/bpf/bpftool/.gitignore
new file mode 100644
index 000000000000..d7e678c2d396
--- /dev/null
+++ b/tools/bpf/bpftool/.gitignore
@@ -0,0 +1,3 @@
+*.d
+bpftool
+FEATURE-DUMP.bpftool
diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 3e3b3ced3f7c..adc8e5474b66 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -16,3 +16,4 @@ test_sock
 test_sock_addr
 urandom_read
 test_btf
+test_sockmap
-- 
cgit v1.2.3-59-g8ed1b


From 3e50d2da5850dd126b3e6a6e4387620d55b71db4 Mon Sep 17 00:00:00 2001
From: Nisar Sayed <Nisar.Sayed@microchip.com>
Date: Wed, 2 May 2018 21:09:17 +0530
Subject: microchip_t1: Add driver for Microchip LAN87XX T1 PHYs

Add driver for Microchip LAN87XX T1 PHYs

This patch support driver for Microchp T1 PHYs.
There will be followup patches to this driver to support T1 PHY
features such as cable diagnostics, signal quality indicator(SQI),
sleep and wakeup (TC10) support.

Signed-off-by: Nisar Sayed <Nisar.Sayed@microchip.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/Kconfig        |  5 +++
 drivers/net/phy/Makefile       |  1 +
 drivers/net/phy/microchip_t1.c | 74 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 80 insertions(+)
 create mode 100644 drivers/net/phy/microchip_t1.c

diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index edb8b9ab827f..ae2d69170dc3 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -360,6 +360,11 @@ config MICROCHIP_PHY
 	help
 	  Supports the LAN88XX PHYs.
 
+config MICROCHIP_T1_PHY
+	tristate "Microchip T1 PHYs"
+	---help---
+	  Supports the LAN87XX PHYs.
+
 config MICROSEMI_PHY
 	tristate "Microsemi PHYs"
 	---help---
diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile
index 701ca0b8717e..22183b9d7e08 100644
--- a/drivers/net/phy/Makefile
+++ b/drivers/net/phy/Makefile
@@ -71,6 +71,7 @@ obj-$(CONFIG_MESON_GXL_PHY)	+= meson-gxl.o
 obj-$(CONFIG_MICREL_KS8995MA)	+= spi_ks8995.o
 obj-$(CONFIG_MICREL_PHY)	+= micrel.o
 obj-$(CONFIG_MICROCHIP_PHY)	+= microchip.o
+obj-$(CONFIG_MICROCHIP_T1_PHY)	+= microchip_t1.o
 obj-$(CONFIG_MICROSEMI_PHY)	+= mscc.o
 obj-$(CONFIG_NATIONAL_PHY)	+= national.o
 obj-$(CONFIG_QSEMI_PHY)		+= qsemi.o
diff --git a/drivers/net/phy/microchip_t1.c b/drivers/net/phy/microchip_t1.c
new file mode 100644
index 000000000000..b1917dd1978a
--- /dev/null
+++ b/drivers/net/phy/microchip_t1.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2018 Microchip Technology
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mii.h>
+#include <linux/phy.h>
+
+/* Interrupt Source Register */
+#define LAN87XX_INTERRUPT_SOURCE                (0x18)
+
+/* Interrupt Mask Register */
+#define LAN87XX_INTERRUPT_MASK                  (0x19)
+#define LAN87XX_MASK_LINK_UP                    (0x0004)
+#define LAN87XX_MASK_LINK_DOWN                  (0x0002)
+
+#define DRIVER_AUTHOR	"Nisar Sayed <nisar.sayed@microchip.com>"
+#define DRIVER_DESC	"Microchip LAN87XX T1 PHY driver"
+
+static int lan87xx_phy_config_intr(struct phy_device *phydev)
+{
+	int rc, val = 0;
+
+	if (phydev->interrupts == PHY_INTERRUPT_ENABLED) {
+		/* unmask all source and clear them before enable */
+		rc = phy_write(phydev, LAN87XX_INTERRUPT_MASK, 0x7FFF);
+		rc = phy_read(phydev, LAN87XX_INTERRUPT_SOURCE);
+		val = LAN87XX_MASK_LINK_UP | LAN87XX_MASK_LINK_DOWN;
+	}
+
+	rc = phy_write(phydev, LAN87XX_INTERRUPT_MASK, val);
+
+	return rc < 0 ? rc : 0;
+}
+
+static int lan87xx_phy_ack_interrupt(struct phy_device *phydev)
+{
+	int rc = phy_read(phydev, LAN87XX_INTERRUPT_SOURCE);
+
+	return rc < 0 ? rc : 0;
+}
+
+static struct phy_driver microchip_t1_phy_driver[] = {
+	{
+		.phy_id         = 0x0007c150,
+		.phy_id_mask    = 0xfffffff0,
+		.name           = "Microchip LAN87xx T1",
+
+		.features       = SUPPORTED_100baseT_Full,
+		.flags          = PHY_HAS_INTERRUPT,
+
+		.config_init    = genphy_config_init,
+		.config_aneg    = genphy_config_aneg,
+
+		.ack_interrupt  = lan87xx_phy_ack_interrupt,
+		.config_intr    = lan87xx_phy_config_intr,
+
+		.suspend        = genphy_suspend,
+		.resume         = genphy_resume,
+	}
+};
+
+module_phy_driver(microchip_t1_phy_driver);
+
+static struct mdio_device_id __maybe_unused microchip_t1_tbl[] = {
+	{ 0x0007c150, 0xfffffff0 },
+	{ }
+};
+
+MODULE_DEVICE_TABLE(mdio, microchip_t1_tbl);
+
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3-59-g8ed1b


From b2641e2ad456459a655da2433e3150d41640a6de Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Thu, 3 May 2018 17:28:11 +0100
Subject: net: hns3: Add support of hardware rx-vlan-offload to HNS3 VF driver

This patch adds support of hardware rx-vlan-offload to VF driver.
VF uses mailbox to convey PF to configure the hardware.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c           |  8 +-------
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c   |  2 +-
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h   |  1 +
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c    |  5 +++++
 drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 12 ++++++++++++
 5 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index c4e295094da7..729bcab947df 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -1626,8 +1626,6 @@ static struct pci_driver hns3_driver = {
 /* set default feature to hns3 */
 static void hns3_set_default_feature(struct net_device *netdev)
 {
-	struct hnae3_handle *h = hns3_get_handle(netdev);
-
 	netdev->priv_flags |= IFF_UNICAST_FLT;
 
 	netdev->hw_enc_features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
@@ -1656,15 +1654,11 @@ static void hns3_set_default_feature(struct net_device *netdev)
 		NETIF_F_GSO_UDP_TUNNEL_CSUM;
 
 	netdev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
-		NETIF_F_HW_VLAN_CTAG_TX |
+		NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX |
 		NETIF_F_RXCSUM | NETIF_F_SG | NETIF_F_GSO |
 		NETIF_F_GRO | NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_GSO_GRE |
 		NETIF_F_GSO_GRE_CSUM | NETIF_F_GSO_UDP_TUNNEL |
 		NETIF_F_GSO_UDP_TUNNEL_CSUM;
-
-	if (!(h->flags & HNAE3_SUPPORT_VF))
-		netdev->hw_features |=
-			NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_CTAG_RX;
 }
 
 static int hns3_alloc_buffer(struct hns3_enet_ring *ring,
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index dd5d65c9cca6..084b90437e23 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -4862,7 +4862,7 @@ static int hclge_init_vlan_config(struct hclge_dev *hdev)
 	return hclge_set_vlan_filter(handle, htons(ETH_P_8021Q), 0, false);
 }
 
-static int hclge_en_hw_strip_rxvtag(struct hnae3_handle *handle, bool enable)
+int hclge_en_hw_strip_rxvtag(struct hnae3_handle *handle, bool enable)
 {
 	struct hclge_vport *vport = hclge_get_vport(handle);
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
index b7ee91daea0c..af736a44607c 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
@@ -652,6 +652,7 @@ static inline int hclge_get_queue_id(struct hnae3_queue *queue)
 int hclge_cfg_mac_speed_dup(struct hclge_dev *hdev, int speed, u8 duplex);
 int hclge_set_vlan_filter(struct hnae3_handle *handle, __be16 proto,
 			  u16 vlan_id, bool is_kill);
+int hclge_en_hw_strip_rxvtag(struct hnae3_handle *handle, bool enable);
 
 int hclge_buffer_alloc(struct hclge_dev *hdev);
 int hclge_rss_init_hw(struct hclge_dev *hdev);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
index 7563335b0c7f..b6ae26ba0a46 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
@@ -276,6 +276,11 @@ static int hclge_set_vf_vlan_cfg(struct hclge_vport *vport,
 		memcpy(&proto, &mbx_req->msg[5], sizeof(proto));
 		status = hclge_set_vlan_filter(handle, cpu_to_be16(proto),
 					       vlan, is_kill);
+	} else if (mbx_req->msg[1] == HCLGE_MBX_VLAN_RX_OFF_CFG) {
+		struct hnae3_handle *handle = &vport->nic;
+		bool en = mbx_req->msg[2] ? true : false;
+
+		status = hclge_en_hw_strip_rxvtag(handle, en);
 	}
 
 	if (gen_resp)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
index 2b8426412cc9..2dbffcef7109 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
@@ -830,6 +830,17 @@ static int hclgevf_set_vlan_filter(struct hnae3_handle *handle,
 				    HCLGEVF_VLAN_MBX_MSG_LEN, false, NULL, 0);
 }
 
+static int hclgevf_en_hw_strip_rxvtag(struct hnae3_handle *handle, bool enable)
+{
+	struct hclgevf_dev *hdev = hclgevf_ae_get_hdev(handle);
+	u8 msg_data;
+
+	msg_data = enable ? 1 : 0;
+	return hclgevf_send_mbx_msg(hdev, HCLGE_MBX_SET_VLAN,
+				    HCLGE_MBX_VLAN_RX_OFF_CFG, &msg_data,
+				    1, false, NULL, 0);
+}
+
 static void hclgevf_reset_tqp(struct hnae3_handle *handle, u16 queue_id)
 {
 	struct hclgevf_dev *hdev = hclgevf_ae_get_hdev(handle);
@@ -1825,6 +1836,7 @@ static const struct hnae3_ae_ops hclgevf_ops = {
 	.get_tc_size = hclgevf_get_tc_size,
 	.get_fw_version = hclgevf_get_fw_version,
 	.set_vlan_filter = hclgevf_set_vlan_filter,
+	.enable_hw_strip_rxvtag = hclgevf_en_hw_strip_rxvtag,
 	.reset_event = hclgevf_reset_event,
 	.get_channels = hclgevf_get_channels,
 	.get_tqps_and_rss_info = hclgevf_get_tqps_and_rss_info,
-- 
cgit v1.2.3-59-g8ed1b


From 4dbbe8dde8485b89bce8bbbe7564337fd7eed69f Mon Sep 17 00:00:00 2001
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Fri, 4 May 2018 10:01:38 +0100
Subject: net: stmmac: Add support for U32 TC filter using Flexible RX Parser

This adds support for U32 filter by using an HW only feature called
Flexible RX Parser. This allow us to match any given packet field with a
pattern and accept/reject or even route the packet to a specific DMA
channel.

Right now we only support acception or rejection of frame and we only
support simple rules. Though, the Parser has the flexibility of jumping to
specific rules as an if condition so complex rules can be established.

This is only supported in GMAC5.10+.

The following commands can be used to test this code:

	1) Setup an ingress qdisk:
	# tc qdisc add dev eth0 handle ffff: ingress

	2) Setup a filter (e.g. filter by IP):
	# tc filter add dev eth0 parent ffff: protocol ip u32 match ip \
		src 192.168.0.3 skip_sw action drop

In every tests performed we always used the "skip_sw" flag to make sure
only the RX Parser was involved.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Vitor Soares <soares@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Cc: Jakub Kicinski <kubakici@wp.pl>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/Makefile      |   2 +-
 drivers/net/ethernet/stmicro/stmmac/common.h      |   5 +
 drivers/net/ethernet/stmicro/stmmac/dwmac4.h      |   4 +
 drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c |   1 +
 drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c  |   3 +
 drivers/net/ethernet/stmicro/stmmac/dwmac5.c      | 195 ++++++++++++++
 drivers/net/ethernet/stmicro/stmmac/dwmac5.h      |  13 +
 drivers/net/ethernet/stmicro/stmmac/hwif.c        |   8 +
 drivers/net/ethernet/stmicro/stmmac/hwif.h        |  25 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac.h      |  29 +++
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c |  59 +++++
 drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c   | 295 ++++++++++++++++++++++
 12 files changed, 636 insertions(+), 3 deletions(-)
 create mode 100644 drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c

diff --git a/drivers/net/ethernet/stmicro/stmmac/Makefile b/drivers/net/ethernet/stmicro/stmmac/Makefile
index e3b578b4f7cb..68e9e2640c62 100644
--- a/drivers/net/ethernet/stmicro/stmmac/Makefile
+++ b/drivers/net/ethernet/stmicro/stmmac/Makefile
@@ -5,7 +5,7 @@ stmmac-objs:= stmmac_main.o stmmac_ethtool.o stmmac_mdio.o ring_mode.o	\
 	      dwmac100_core.o dwmac100_dma.o enh_desc.o norm_desc.o	\
 	      mmc_core.o stmmac_hwtstamp.o stmmac_ptp.o dwmac4_descs.o	\
 	      dwmac4_dma.o dwmac4_lib.o dwmac4_core.o dwmac5.o hwif.o \
-	      $(stmmac-y)
+	      stmmac_tc.o $(stmmac-y)
 
 # Ordering matters. Generic driver must be last.
 obj-$(CONFIG_STMMAC_PLATFORM)	+= stmmac-platform.o
diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index 627e905b6d76..a679cb729d1d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -353,6 +353,10 @@ struct dma_features {
 	unsigned int rx_fifo_size;
 	/* Automotive Safety Package */
 	unsigned int asp;
+	/* RX Parser */
+	unsigned int frpsel;
+	unsigned int frpbs;
+	unsigned int frpes;
 };
 
 /* GMAC TX FIFO is 8K, Rx FIFO is 16K */
@@ -412,6 +416,7 @@ struct mac_device_info {
 	const struct stmmac_dma_ops *dma;
 	const struct stmmac_mode_ops *mode;
 	const struct stmmac_hwtimestamp *ptp;
+	const struct stmmac_tc_ops *tc;
 	struct mii_regs mii;	/* MII register Addresses */
 	struct mac_link link;
 	void __iomem *pcsr;     /* vpointer to device CSRs */
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
index 03eab9077c1c..6330a55953df 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
@@ -194,6 +194,9 @@ enum power_event {
 
 /* MAC HW features3 bitmap */
 #define GMAC_HW_FEAT_ASP		GENMASK(29, 28)
+#define GMAC_HW_FEAT_FRPES		GENMASK(14, 13)
+#define GMAC_HW_FEAT_FRPBS		GENMASK(12, 11)
+#define GMAC_HW_FEAT_FRPSEL		BIT(10)
 
 /* MAC HW ADDR regs */
 #define GMAC_HI_DCS			GENMASK(18, 16)
@@ -202,6 +205,7 @@ enum power_event {
 
 /*  MTL registers */
 #define MTL_OPERATION_MODE		0x00000c00
+#define MTL_FRPE			BIT(15)
 #define MTL_OPERATION_SCHALG_MASK	GENMASK(6, 5)
 #define MTL_OPERATION_SCHALG_WRR	(0x0 << 5)
 #define MTL_OPERATION_SCHALG_WFQ	(0x1 << 5)
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
index 7289b3b47d8e..a7121a7d9391 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
@@ -795,6 +795,7 @@ const struct stmmac_ops dwmac510_ops = {
 	.safety_feat_config = dwmac5_safety_feat_config,
 	.safety_feat_irq_status = dwmac5_safety_feat_irq_status,
 	.safety_feat_dump = dwmac5_safety_feat_dump,
+	.rxp_config = dwmac5_rxp_config,
 };
 
 int dwmac4_setup(struct stmmac_priv *priv)
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
index d37d457306d1..117c3a5288f0 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
@@ -379,6 +379,9 @@ static void dwmac4_get_hw_feature(void __iomem *ioaddr,
 
 	/* 5.10 Features */
 	dma_cap->asp = (hw_cap & GMAC_HW_FEAT_ASP) >> 28;
+	dma_cap->frpes = (hw_cap & GMAC_HW_FEAT_FRPES) >> 13;
+	dma_cap->frpbs = (hw_cap & GMAC_HW_FEAT_FRPBS) >> 11;
+	dma_cap->frpsel = (hw_cap & GMAC_HW_FEAT_FRPSEL) >> 10;
 }
 
 /* Enable/disable TSO feature and set MSS */
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c
index 2978550bb7f6..b2becb80a697 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c
@@ -7,6 +7,7 @@
 #include "common.h"
 #include "dwmac4.h"
 #include "dwmac5.h"
+#include "stmmac.h"
 
 struct dwmac5_error_desc {
 	bool valid;
@@ -299,3 +300,197 @@ int dwmac5_safety_feat_dump(struct stmmac_safety_stats *stats,
 		*desc = dwmac5_all_errors[module].desc[offset].desc;
 	return 0;
 }
+
+static int dwmac5_rxp_disable(void __iomem *ioaddr)
+{
+	u32 val;
+	int ret;
+
+	val = readl(ioaddr + MTL_OPERATION_MODE);
+	val &= ~MTL_FRPE;
+	writel(val, ioaddr + MTL_OPERATION_MODE);
+
+	ret = readl_poll_timeout(ioaddr + MTL_RXP_CONTROL_STATUS, val,
+			val & RXPI, 1, 10000);
+	if (ret)
+		return ret;
+	return 0;
+}
+
+static void dwmac5_rxp_enable(void __iomem *ioaddr)
+{
+	u32 val;
+
+	val = readl(ioaddr + MTL_OPERATION_MODE);
+	val |= MTL_FRPE;
+	writel(val, ioaddr + MTL_OPERATION_MODE);
+}
+
+static int dwmac5_rxp_update_single_entry(void __iomem *ioaddr,
+					  struct stmmac_tc_entry *entry,
+					  int pos)
+{
+	int ret, i;
+
+	for (i = 0; i < (sizeof(entry->val) / sizeof(u32)); i++) {
+		int real_pos = pos * (sizeof(entry->val) / sizeof(u32)) + i;
+		u32 val;
+
+		/* Wait for ready */
+		ret = readl_poll_timeout(ioaddr + MTL_RXP_IACC_CTRL_STATUS,
+				val, !(val & STARTBUSY), 1, 10000);
+		if (ret)
+			return ret;
+
+		/* Write data */
+		val = *((u32 *)&entry->val + i);
+		writel(val, ioaddr + MTL_RXP_IACC_DATA);
+
+		/* Write pos */
+		val = real_pos & ADDR;
+		writel(val, ioaddr + MTL_RXP_IACC_CTRL_STATUS);
+
+		/* Write OP */
+		val |= WRRDN;
+		writel(val, ioaddr + MTL_RXP_IACC_CTRL_STATUS);
+
+		/* Start Write */
+		val |= STARTBUSY;
+		writel(val, ioaddr + MTL_RXP_IACC_CTRL_STATUS);
+
+		/* Wait for done */
+		ret = readl_poll_timeout(ioaddr + MTL_RXP_IACC_CTRL_STATUS,
+				val, !(val & STARTBUSY), 1, 10000);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static struct stmmac_tc_entry *
+dwmac5_rxp_get_next_entry(struct stmmac_tc_entry *entries, unsigned int count,
+			  u32 curr_prio)
+{
+	struct stmmac_tc_entry *entry;
+	u32 min_prio = ~0x0;
+	int i, min_prio_idx;
+	bool found = false;
+
+	for (i = count - 1; i >= 0; i--) {
+		entry = &entries[i];
+
+		/* Do not update unused entries */
+		if (!entry->in_use)
+			continue;
+		/* Do not update already updated entries (i.e. fragments) */
+		if (entry->in_hw)
+			continue;
+		/* Let last entry be updated last */
+		if (entry->is_last)
+			continue;
+		/* Do not return fragments */
+		if (entry->is_frag)
+			continue;
+		/* Check if we already checked this prio */
+		if (entry->prio < curr_prio)
+			continue;
+		/* Check if this is the minimum prio */
+		if (entry->prio < min_prio) {
+			min_prio = entry->prio;
+			min_prio_idx = i;
+			found = true;
+		}
+	}
+
+	if (found)
+		return &entries[min_prio_idx];
+	return NULL;
+}
+
+int dwmac5_rxp_config(void __iomem *ioaddr, struct stmmac_tc_entry *entries,
+		      unsigned int count)
+{
+	struct stmmac_tc_entry *entry, *frag;
+	int i, ret, nve = 0;
+	u32 curr_prio = 0;
+	u32 old_val, val;
+
+	/* Force disable RX */
+	old_val = readl(ioaddr + GMAC_CONFIG);
+	val = old_val & ~GMAC_CONFIG_RE;
+	writel(val, ioaddr + GMAC_CONFIG);
+
+	/* Disable RX Parser */
+	ret = dwmac5_rxp_disable(ioaddr);
+	if (ret)
+		goto re_enable;
+
+	/* Set all entries as NOT in HW */
+	for (i = 0; i < count; i++) {
+		entry = &entries[i];
+		entry->in_hw = false;
+	}
+
+	/* Update entries by reverse order */
+	while (1) {
+		entry = dwmac5_rxp_get_next_entry(entries, count, curr_prio);
+		if (!entry)
+			break;
+
+		curr_prio = entry->prio;
+		frag = entry->frag_ptr;
+
+		/* Set special fragment requirements */
+		if (frag) {
+			entry->val.af = 0;
+			entry->val.rf = 0;
+			entry->val.nc = 1;
+			entry->val.ok_index = nve + 2;
+		}
+
+		ret = dwmac5_rxp_update_single_entry(ioaddr, entry, nve);
+		if (ret)
+			goto re_enable;
+
+		entry->table_pos = nve++;
+		entry->in_hw = true;
+
+		if (frag && !frag->in_hw) {
+			ret = dwmac5_rxp_update_single_entry(ioaddr, frag, nve);
+			if (ret)
+				goto re_enable;
+			frag->table_pos = nve++;
+			frag->in_hw = true;
+		}
+	}
+
+	if (!nve)
+		goto re_enable;
+
+	/* Update all pass entry */
+	for (i = 0; i < count; i++) {
+		entry = &entries[i];
+		if (!entry->is_last)
+			continue;
+
+		ret = dwmac5_rxp_update_single_entry(ioaddr, entry, nve);
+		if (ret)
+			goto re_enable;
+
+		entry->table_pos = nve++;
+	}
+
+	/* Assume n. of parsable entries == n. of valid entries */
+	val = (nve << 16) & NPE;
+	val |= nve & NVE;
+	writel(val, ioaddr + MTL_RXP_CONTROL_STATUS);
+
+	/* Enable RX Parser */
+	dwmac5_rxp_enable(ioaddr);
+
+re_enable:
+	/* Re-enable RX */
+	writel(old_val, ioaddr + GMAC_CONFIG);
+	return ret;
+}
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac5.h b/drivers/net/ethernet/stmicro/stmmac/dwmac5.h
index bd4c466d303e..cc810aff7100 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac5.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac5.h
@@ -11,6 +11,17 @@
 #define PRTYEN				BIT(1)
 #define TMOUTEN				BIT(0)
 
+#define MTL_RXP_CONTROL_STATUS		0x00000ca0
+#define RXPI				BIT(31)
+#define NPE				GENMASK(23, 16)
+#define NVE				GENMASK(7, 0)
+#define MTL_RXP_IACC_CTRL_STATUS	0x00000cb0
+#define STARTBUSY			BIT(31)
+#define RXPEIEC				GENMASK(22, 21)
+#define RXPEIEE				BIT(20)
+#define WRRDN				BIT(16)
+#define ADDR				GENMASK(15, 0)
+#define MTL_RXP_IACC_DATA		0x00000cb4
 #define MTL_ECC_CONTROL			0x00000cc0
 #define TSOEE				BIT(4)
 #define MRXPEE				BIT(3)
@@ -48,5 +59,7 @@ int dwmac5_safety_feat_irq_status(struct net_device *ndev,
 		struct stmmac_safety_stats *stats);
 int dwmac5_safety_feat_dump(struct stmmac_safety_stats *stats,
 			int index, unsigned long *count, const char **desc);
+int dwmac5_rxp_config(void __iomem *ioaddr, struct stmmac_tc_entry *entries,
+		      unsigned int count);
 
 #endif /* __DWMAC5_H__ */
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.c b/drivers/net/ethernet/stmicro/stmmac/hwif.c
index 2b0a7e79de00..9acc8d2f1039 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.c
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.c
@@ -77,6 +77,7 @@ static const struct stmmac_hwif_entry {
 	const void *mac;
 	const void *hwtimestamp;
 	const void *mode;
+	const void *tc;
 	int (*setup)(struct stmmac_priv *priv);
 	int (*quirks)(struct stmmac_priv *priv);
 } stmmac_hw[] = {
@@ -90,6 +91,7 @@ static const struct stmmac_hwif_entry {
 		.mac = &dwmac100_ops,
 		.hwtimestamp = &stmmac_ptp,
 		.mode = NULL,
+		.tc = NULL,
 		.setup = dwmac100_setup,
 		.quirks = stmmac_dwmac1_quirks,
 	}, {
@@ -101,6 +103,7 @@ static const struct stmmac_hwif_entry {
 		.mac = &dwmac1000_ops,
 		.hwtimestamp = &stmmac_ptp,
 		.mode = NULL,
+		.tc = NULL,
 		.setup = dwmac1000_setup,
 		.quirks = stmmac_dwmac1_quirks,
 	}, {
@@ -112,6 +115,7 @@ static const struct stmmac_hwif_entry {
 		.mac = &dwmac4_ops,
 		.hwtimestamp = &stmmac_ptp,
 		.mode = NULL,
+		.tc = NULL,
 		.setup = dwmac4_setup,
 		.quirks = stmmac_dwmac4_quirks,
 	}, {
@@ -123,6 +127,7 @@ static const struct stmmac_hwif_entry {
 		.mac = &dwmac410_ops,
 		.hwtimestamp = &stmmac_ptp,
 		.mode = &dwmac4_ring_mode_ops,
+		.tc = NULL,
 		.setup = dwmac4_setup,
 		.quirks = NULL,
 	}, {
@@ -134,6 +139,7 @@ static const struct stmmac_hwif_entry {
 		.mac = &dwmac410_ops,
 		.hwtimestamp = &stmmac_ptp,
 		.mode = &dwmac4_ring_mode_ops,
+		.tc = NULL,
 		.setup = dwmac4_setup,
 		.quirks = NULL,
 	}, {
@@ -145,6 +151,7 @@ static const struct stmmac_hwif_entry {
 		.mac = &dwmac510_ops,
 		.hwtimestamp = &stmmac_ptp,
 		.mode = &dwmac4_ring_mode_ops,
+		.tc = &dwmac510_tc_ops,
 		.setup = dwmac4_setup,
 		.quirks = NULL,
 	}
@@ -196,6 +203,7 @@ int stmmac_hwif_init(struct stmmac_priv *priv)
 		mac->mac = entry->mac;
 		mac->ptp = entry->hwtimestamp;
 		mac->mode = entry->mode;
+		mac->tc = entry->tc;
 
 		priv->hw = mac;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index bfad61607f07..b7539a14e6ad 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -5,10 +5,12 @@
 #ifndef __STMMAC_HWIF_H__
 #define __STMMAC_HWIF_H__
 
+#include <linux/netdevice.h>
+
 #define stmmac_do_void_callback(__priv, __module, __cname,  __arg0, __args...) \
 ({ \
 	int __result = -EINVAL; \
-	if ((__priv)->hw->__module->__cname) { \
+	if ((__priv)->hw->__module && (__priv)->hw->__module->__cname) { \
 		(__priv)->hw->__module->__cname((__arg0), ##__args); \
 		__result = 0; \
 	} \
@@ -17,7 +19,7 @@
 #define stmmac_do_callback(__priv, __module, __cname,  __arg0, __args...) \
 ({ \
 	int __result = -EINVAL; \
-	if ((__priv)->hw->__module->__cname) \
+	if ((__priv)->hw->__module && (__priv)->hw->__module->__cname) \
 		__result = (__priv)->hw->__module->__cname((__arg0), ##__args); \
 	__result; \
 })
@@ -232,6 +234,7 @@ struct mac_device_info;
 struct net_device;
 struct rgmii_adv;
 struct stmmac_safety_stats;
+struct stmmac_tc_entry;
 
 /* Helpers to program the MAC core */
 struct stmmac_ops {
@@ -301,6 +304,9 @@ struct stmmac_ops {
 			struct stmmac_safety_stats *stats);
 	int (*safety_feat_dump)(struct stmmac_safety_stats *stats,
 			int index, unsigned long *count, const char **desc);
+	/* Flexible RX Parser */
+	int (*rxp_config)(void __iomem *ioaddr, struct stmmac_tc_entry *entries,
+			  unsigned int count);
 };
 
 #define stmmac_core_init(__priv, __args...) \
@@ -365,6 +371,8 @@ struct stmmac_ops {
 	stmmac_do_callback(__priv, mac, safety_feat_irq_status, __args)
 #define stmmac_safety_feat_dump(__priv, __args...) \
 	stmmac_do_callback(__priv, mac, safety_feat_dump, __args)
+#define stmmac_rxp_config(__priv, __args...) \
+	stmmac_do_callback(__priv, mac, rxp_config, __args)
 
 /* PTP and HW Timer helpers */
 struct stmmac_hwtimestamp {
@@ -419,6 +427,18 @@ struct stmmac_mode_ops {
 	stmmac_do_void_callback(__priv, mode, clean_desc3, __args)
 
 struct stmmac_priv;
+struct tc_cls_u32_offload;
+
+struct stmmac_tc_ops {
+	int (*init)(struct stmmac_priv *priv);
+	int (*setup_cls_u32)(struct stmmac_priv *priv,
+			     struct tc_cls_u32_offload *cls);
+};
+
+#define stmmac_tc_init(__priv, __args...) \
+	stmmac_do_callback(__priv, tc, init, __args)
+#define stmmac_tc_setup_cls_u32(__priv, __args...) \
+	stmmac_do_callback(__priv, tc, setup_cls_u32, __args)
 
 extern const struct stmmac_ops dwmac100_ops;
 extern const struct stmmac_dma_ops dwmac100_dma_ops;
@@ -429,6 +449,7 @@ extern const struct stmmac_dma_ops dwmac4_dma_ops;
 extern const struct stmmac_ops dwmac410_ops;
 extern const struct stmmac_dma_ops dwmac410_dma_ops;
 extern const struct stmmac_ops dwmac510_ops;
+extern const struct stmmac_tc_ops dwmac510_tc_ops;
 
 #define GMAC_VERSION		0x00000020	/* GMAC CORE Version */
 #define GMAC4_VERSION		0x00000110	/* GMAC4+ CORE Version */
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index 2443f20e07bf..42fc76e76bf9 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -76,6 +76,30 @@ struct stmmac_rx_queue {
 	struct napi_struct napi ____cacheline_aligned_in_smp;
 };
 
+struct stmmac_tc_entry {
+	bool in_use;
+	bool in_hw;
+	bool is_last;
+	bool is_frag;
+	void *frag_ptr;
+	unsigned int table_pos;
+	u32 handle;
+	u32 prio;
+	struct {
+		u32 match_data;
+		u32 match_en;
+		u8 af:1;
+		u8 rf:1;
+		u8 im:1;
+		u8 nc:1;
+		u8 res1:4;
+		u8 frame_offset;
+		u8 ok_index;
+		u8 dma_ch_no;
+		u32 res2;
+	} __packed val;
+};
+
 struct stmmac_priv {
 	/* Frequently used values are kept adjacent for cache effect */
 	u32 tx_count_frames;
@@ -151,6 +175,11 @@ struct stmmac_priv {
 	unsigned long state;
 	struct workqueue_struct *wq;
 	struct work_struct service_task;
+
+	/* TC Handling */
+	unsigned int tc_entries_max;
+	unsigned int tc_off_max;
+	struct stmmac_tc_entry *tc_entries;
 };
 
 enum stmmac_state {
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index b935478df55f..d9dbe1355896 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -45,6 +45,7 @@
 #include <linux/seq_file.h>
 #endif /* CONFIG_DEBUG_FS */
 #include <linux/net_tstamp.h>
+#include <net/pkt_cls.h>
 #include "stmmac_ptp.h"
 #include "stmmac.h"
 #include <linux/reset.h>
@@ -3790,6 +3791,58 @@ static int stmmac_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 	return ret;
 }
 
+static int stmmac_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
+				    void *cb_priv)
+{
+	struct stmmac_priv *priv = cb_priv;
+	int ret = -EOPNOTSUPP;
+
+	stmmac_disable_all_queues(priv);
+
+	switch (type) {
+	case TC_SETUP_CLSU32:
+		if (tc_cls_can_offload_and_chain0(priv->dev, type_data))
+			ret = stmmac_tc_setup_cls_u32(priv, priv, type_data);
+		break;
+	default:
+		break;
+	}
+
+	stmmac_enable_all_queues(priv);
+	return ret;
+}
+
+static int stmmac_setup_tc_block(struct stmmac_priv *priv,
+				 struct tc_block_offload *f)
+{
+	if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
+		return -EOPNOTSUPP;
+
+	switch (f->command) {
+	case TC_BLOCK_BIND:
+		return tcf_block_cb_register(f->block, stmmac_setup_tc_block_cb,
+				priv, priv);
+	case TC_BLOCK_UNBIND:
+		tcf_block_cb_unregister(f->block, stmmac_setup_tc_block_cb, priv);
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int stmmac_setup_tc(struct net_device *ndev, enum tc_setup_type type,
+			   void *type_data)
+{
+	struct stmmac_priv *priv = netdev_priv(ndev);
+
+	switch (type) {
+	case TC_SETUP_BLOCK:
+		return stmmac_setup_tc_block(priv, type_data);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
 static int stmmac_set_mac_address(struct net_device *ndev, void *addr)
 {
 	struct stmmac_priv *priv = netdev_priv(ndev);
@@ -4028,6 +4081,7 @@ static const struct net_device_ops stmmac_netdev_ops = {
 	.ndo_set_rx_mode = stmmac_set_rx_mode,
 	.ndo_tx_timeout = stmmac_tx_timeout,
 	.ndo_do_ioctl = stmmac_ioctl,
+	.ndo_setup_tc = stmmac_setup_tc,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller = stmmac_poll_controller,
 #endif
@@ -4227,6 +4281,11 @@ int stmmac_dvr_probe(struct device *device,
 	ndev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
 			    NETIF_F_RXCSUM;
 
+	ret = stmmac_tc_init(priv, priv);
+	if (!ret) {
+		ndev->hw_features |= NETIF_F_HW_TC;
+	}
+
 	if ((priv->plat->tso_en) && (priv->dma_cap.tsoen)) {
 		ndev->hw_features |= NETIF_F_TSO | NETIF_F_TSO6;
 		priv->tso = true;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
new file mode 100644
index 000000000000..881c94b73e2f
--- /dev/null
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
@@ -0,0 +1,295 @@
+// SPDX-License-Identifier: (GPL-2.0 OR MIT)
+/*
+ * Copyright (c) 2018 Synopsys, Inc. and/or its affiliates.
+ * stmmac TC Handling (HW only)
+ */
+
+#include <net/pkt_cls.h>
+#include <net/tc_act/tc_gact.h>
+#include "common.h"
+#include "dwmac4.h"
+#include "dwmac5.h"
+#include "stmmac.h"
+
+static void tc_fill_all_pass_entry(struct stmmac_tc_entry *entry)
+{
+	memset(entry, 0, sizeof(*entry));
+	entry->in_use = true;
+	entry->is_last = true;
+	entry->is_frag = false;
+	entry->prio = ~0x0;
+	entry->handle = 0;
+	entry->val.match_data = 0x0;
+	entry->val.match_en = 0x0;
+	entry->val.af = 1;
+	entry->val.dma_ch_no = 0x0;
+}
+
+static struct stmmac_tc_entry *tc_find_entry(struct stmmac_priv *priv,
+					     struct tc_cls_u32_offload *cls,
+					     bool free)
+{
+	struct stmmac_tc_entry *entry, *first = NULL, *dup = NULL;
+	u32 loc = cls->knode.handle;
+	int i;
+
+	for (i = 0; i < priv->tc_entries_max; i++) {
+		entry = &priv->tc_entries[i];
+		if (!entry->in_use && !first && free)
+			first = entry;
+		if (entry->handle == loc && !free)
+			dup = entry;
+	}
+
+	if (dup)
+		return dup;
+	if (first) {
+		first->handle = loc;
+		first->in_use = true;
+
+		/* Reset HW values */
+		memset(&first->val, 0, sizeof(first->val));
+	}
+
+	return first;
+}
+
+static int tc_fill_actions(struct stmmac_tc_entry *entry,
+			   struct stmmac_tc_entry *frag,
+			   struct tc_cls_u32_offload *cls)
+{
+	struct stmmac_tc_entry *action_entry = entry;
+	const struct tc_action *act;
+	struct tcf_exts *exts;
+	LIST_HEAD(actions);
+
+	exts = cls->knode.exts;
+	if (!tcf_exts_has_actions(exts))
+		return -EINVAL;
+	if (frag)
+		action_entry = frag;
+
+	tcf_exts_to_list(exts, &actions);
+	list_for_each_entry(act, &actions, list) {
+		/* Accept */
+		if (is_tcf_gact_ok(act)) {
+			action_entry->val.af = 1;
+			break;
+		}
+		/* Drop */
+		if (is_tcf_gact_shot(act)) {
+			action_entry->val.rf = 1;
+			break;
+		}
+
+		/* Unsupported */
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int tc_fill_entry(struct stmmac_priv *priv,
+			 struct tc_cls_u32_offload *cls)
+{
+	struct stmmac_tc_entry *entry, *frag = NULL;
+	struct tc_u32_sel *sel = cls->knode.sel;
+	u32 off, data, mask, real_off, rem;
+	u32 prio = cls->common.prio;
+	int ret;
+
+	/* Only 1 match per entry */
+	if (sel->nkeys <= 0 || sel->nkeys > 1)
+		return -EINVAL;
+
+	off = sel->keys[0].off << sel->offshift;
+	data = sel->keys[0].val;
+	mask = sel->keys[0].mask;
+
+	switch (ntohs(cls->common.protocol)) {
+	case ETH_P_ALL:
+		break;
+	case ETH_P_IP:
+		off += ETH_HLEN;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (off > priv->tc_off_max)
+		return -EINVAL;
+
+	real_off = off / 4;
+	rem = off % 4;
+
+	entry = tc_find_entry(priv, cls, true);
+	if (!entry)
+		return -EINVAL;
+
+	if (rem) {
+		frag = tc_find_entry(priv, cls, true);
+		if (!frag) {
+			ret = -EINVAL;
+			goto err_unuse;
+		}
+
+		entry->frag_ptr = frag;
+		entry->val.match_en = (mask << (rem * 8)) &
+			GENMASK(31, rem * 8);
+		entry->val.match_data = (data << (rem * 8)) &
+			GENMASK(31, rem * 8);
+		entry->val.frame_offset = real_off;
+		entry->prio = prio;
+
+		frag->val.match_en = (mask >> (rem * 8)) &
+			GENMASK(rem * 8 - 1, 0);
+		frag->val.match_data = (data >> (rem * 8)) &
+			GENMASK(rem * 8 - 1, 0);
+		frag->val.frame_offset = real_off + 1;
+		frag->prio = prio;
+		frag->is_frag = true;
+	} else {
+		entry->frag_ptr = NULL;
+		entry->val.match_en = mask;
+		entry->val.match_data = data;
+		entry->val.frame_offset = real_off;
+		entry->prio = prio;
+	}
+
+	ret = tc_fill_actions(entry, frag, cls);
+	if (ret)
+		goto err_unuse;
+
+	return 0;
+
+err_unuse:
+	if (frag)
+		frag->in_use = false;
+	entry->in_use = false;
+	return ret;
+}
+
+static void tc_unfill_entry(struct stmmac_priv *priv,
+			    struct tc_cls_u32_offload *cls)
+{
+	struct stmmac_tc_entry *entry;
+
+	entry = tc_find_entry(priv, cls, false);
+	if (!entry)
+		return;
+
+	entry->in_use = false;
+	if (entry->frag_ptr) {
+		entry = entry->frag_ptr;
+		entry->is_frag = false;
+		entry->in_use = false;
+	}
+}
+
+static int tc_config_knode(struct stmmac_priv *priv,
+			   struct tc_cls_u32_offload *cls)
+{
+	int ret;
+
+	ret = tc_fill_entry(priv, cls);
+	if (ret)
+		return ret;
+
+	ret = stmmac_rxp_config(priv, priv->hw->pcsr, priv->tc_entries,
+			priv->tc_entries_max);
+	if (ret)
+		goto err_unfill;
+
+	return 0;
+
+err_unfill:
+	tc_unfill_entry(priv, cls);
+	return ret;
+}
+
+static int tc_delete_knode(struct stmmac_priv *priv,
+			   struct tc_cls_u32_offload *cls)
+{
+	int ret;
+
+	/* Set entry and fragments as not used */
+	tc_unfill_entry(priv, cls);
+
+	ret = stmmac_rxp_config(priv, priv->hw->pcsr, priv->tc_entries,
+			priv->tc_entries_max);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int tc_setup_cls_u32(struct stmmac_priv *priv,
+			    struct tc_cls_u32_offload *cls)
+{
+	switch (cls->command) {
+	case TC_CLSU32_REPLACE_KNODE:
+		tc_unfill_entry(priv, cls);
+		/* Fall through */
+	case TC_CLSU32_NEW_KNODE:
+		return tc_config_knode(priv, cls);
+	case TC_CLSU32_DELETE_KNODE:
+		return tc_delete_knode(priv, cls);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int tc_init(struct stmmac_priv *priv)
+{
+	struct dma_features *dma_cap = &priv->dma_cap;
+	unsigned int count;
+
+	if (!dma_cap->frpsel)
+		return -EINVAL;
+
+	switch (dma_cap->frpbs) {
+	case 0x0:
+		priv->tc_off_max = 64;
+		break;
+	case 0x1:
+		priv->tc_off_max = 128;
+		break;
+	case 0x2:
+		priv->tc_off_max = 256;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	switch (dma_cap->frpes) {
+	case 0x0:
+		count = 64;
+		break;
+	case 0x1:
+		count = 128;
+		break;
+	case 0x2:
+		count = 256;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* Reserve one last filter which lets all pass */
+	priv->tc_entries_max = count;
+	priv->tc_entries = devm_kzalloc(priv->device,
+			sizeof(*priv->tc_entries) * count, GFP_KERNEL);
+	if (!priv->tc_entries)
+		return -ENOMEM;
+
+	tc_fill_all_pass_entry(&priv->tc_entries[count - 1]);
+
+	dev_info(priv->device, "Enabling HW TC (entries=%d, max_off=%d)\n",
+			priv->tc_entries_max, priv->tc_off_max);
+	return 0;
+}
+
+const struct stmmac_tc_ops dwmac510_tc_ops = {
+	.init = tc_init,
+	.setup_cls_u32 = tc_setup_cls_u32,
+};
-- 
cgit v1.2.3-59-g8ed1b


From 5bafeb6e7e878ce41c834806239f8c629a59d40a Mon Sep 17 00:00:00 2001
From: Marek Behún <marek.behun@nic.cz>
Date: Fri, 4 May 2018 19:26:10 +0200
Subject: net: dsa: mv88e6xxx: 88E6141/6341 SERDES support

The 88E6141/6341 switches (also known as Topaz) have 1 SGMII lane,
which can be configured the same way as the SERDES lane on 88E6390.

Signed-off-by: Marek Behun <marek.behun@nic.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/chip.c   |  2 ++
 drivers/net/dsa/mv88e6xxx/serdes.c | 20 ++++++++++++++++++++
 drivers/net/dsa/mv88e6xxx/serdes.h |  3 +++
 3 files changed, 25 insertions(+)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 9d62e4acc01b..e7e079b1888c 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -2529,6 +2529,7 @@ static const struct mv88e6xxx_ops mv88e6085_ops = {
 	.reset = mv88e6185_g1_reset,
 	.vtu_getnext = mv88e6352_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6352_g1_vtu_loadpurge,
+	.serdes_power = mv88e6341_serdes_power,
 };
 
 static const struct mv88e6xxx_ops mv88e6095_ops = {
@@ -2848,6 +2849,7 @@ static const struct mv88e6xxx_ops mv88e6175_ops = {
 	.reset = mv88e6352_g1_reset,
 	.vtu_getnext = mv88e6352_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6352_g1_vtu_loadpurge,
+	.serdes_power = mv88e6341_serdes_power,
 };
 
 static const struct mv88e6xxx_ops mv88e6176_ops = {
diff --git a/drivers/net/dsa/mv88e6xxx/serdes.c b/drivers/net/dsa/mv88e6xxx/serdes.c
index fb058fd35c0d..880b2cf0a530 100644
--- a/drivers/net/dsa/mv88e6xxx/serdes.c
+++ b/drivers/net/dsa/mv88e6xxx/serdes.c
@@ -326,3 +326,23 @@ int mv88e6390_serdes_power(struct mv88e6xxx_chip *chip, int port, bool on)
 
 	return 0;
 }
+
+int mv88e6341_serdes_power(struct mv88e6xxx_chip *chip, int port, bool on)
+{
+	int err;
+	u8 cmode;
+
+	if (port != 5)
+		return 0;
+
+	err = mv88e6xxx_port_get_cmode(chip, port, &cmode);
+	if (err)
+		return err;
+
+	if (cmode == MV88E6XXX_PORT_STS_CMODE_1000BASE_X ||
+	    cmode == MV88E6XXX_PORT_STS_CMODE_SGMII ||
+	    cmode == MV88E6XXX_PORT_STS_CMODE_2500BASEX)
+		return mv88e6390_serdes_sgmii(chip, MV88E6341_ADDR_SERDES, on);
+
+	return 0;
+}
diff --git a/drivers/net/dsa/mv88e6xxx/serdes.h b/drivers/net/dsa/mv88e6xxx/serdes.h
index 1897c01c6e19..b6e5fbd46b5e 100644
--- a/drivers/net/dsa/mv88e6xxx/serdes.h
+++ b/drivers/net/dsa/mv88e6xxx/serdes.h
@@ -19,6 +19,8 @@
 #define MV88E6352_ADDR_SERDES		0x0f
 #define MV88E6352_SERDES_PAGE_FIBER	0x01
 
+#define MV88E6341_ADDR_SERDES		0x15
+
 #define MV88E6390_PORT9_LANE0		0x09
 #define MV88E6390_PORT9_LANE1		0x12
 #define MV88E6390_PORT9_LANE2		0x13
@@ -42,6 +44,7 @@
 #define MV88E6390_SGMII_CONTROL_LOOPBACK	BIT(14)
 #define MV88E6390_SGMII_CONTROL_PDOWN		BIT(11)
 
+int mv88e6341_serdes_power(struct mv88e6xxx_chip *chip, int port, bool on);
 int mv88e6352_serdes_power(struct mv88e6xxx_chip *chip, int port, bool on);
 int mv88e6390_serdes_power(struct mv88e6xxx_chip *chip, int port, bool on);
 int mv88e6352_serdes_get_sset_count(struct mv88e6xxx_chip *chip, int port);
-- 
cgit v1.2.3-59-g8ed1b


From 5263a98f162f7a46e1292632c87f1e3444eb8fbf Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 8 May 2018 09:06:58 -0700
Subject: net/ipv4: Update ip_tunnel_metadata_cnt static key to modern api

No changes in refcount semantics -- key init is false; replace

static_key_slow_inc|dec   with   static_branch_inc|dec
static_key_false          with   static_branch_unlikely

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_tunnels.h  | 4 ++--
 net/ipv4/ip_tunnel_core.c | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 751646adc769..90ff430f5e9d 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -477,12 +477,12 @@ static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstat
 	return (struct ip_tunnel_info *)lwtstate->data;
 }
 
-extern struct static_key ip_tunnel_metadata_cnt;
+DECLARE_STATIC_KEY_FALSE(ip_tunnel_metadata_cnt);
 
 /* Returns > 0 if metadata should be collected */
 static inline int ip_tunnel_collect_metadata(void)
 {
-	return static_key_false(&ip_tunnel_metadata_cnt);
+	return static_branch_unlikely(&ip_tunnel_metadata_cnt);
 }
 
 void __init ip_tunnel_core_init(void);
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 2f39479be92f..dde671e97829 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -423,17 +423,17 @@ void __init ip_tunnel_core_init(void)
 	lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6);
 }
 
-struct static_key ip_tunnel_metadata_cnt = STATIC_KEY_INIT_FALSE;
+DEFINE_STATIC_KEY_FALSE(ip_tunnel_metadata_cnt);
 EXPORT_SYMBOL(ip_tunnel_metadata_cnt);
 
 void ip_tunnel_need_metadata(void)
 {
-	static_key_slow_inc(&ip_tunnel_metadata_cnt);
+	static_branch_inc(&ip_tunnel_metadata_cnt);
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_need_metadata);
 
 void ip_tunnel_unneed_metadata(void)
 {
-	static_key_slow_dec(&ip_tunnel_metadata_cnt);
+	static_branch_dec(&ip_tunnel_metadata_cnt);
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata);
-- 
cgit v1.2.3-59-g8ed1b


From a7950ae8213cf38343fd27ad1fb58f3f04e3130f Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 8 May 2018 09:06:59 -0700
Subject: net/sock: Update memalloc_socks static key to modern api

No changes in refcount semantics -- key init is false; replace

static_key_slow_inc|dec   with   static_branch_inc|dec
static_key_false          with   static_branch_unlikely

Added a '_key' suffix to memalloc_socks, for better self
documentation.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 4 ++--
 net/core/sock.c    | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 3c568b36ee36..4f7c584e9765 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -808,10 +808,10 @@ static inline bool sock_flag(const struct sock *sk, enum sock_flags flag)
 }
 
 #ifdef CONFIG_NET
-extern struct static_key memalloc_socks;
+DECLARE_STATIC_KEY_FALSE(memalloc_socks_key);
 static inline int sk_memalloc_socks(void)
 {
-	return static_key_false(&memalloc_socks);
+	return static_branch_unlikely(&memalloc_socks_key);
 }
 #else
 
diff --git a/net/core/sock.c b/net/core/sock.c
index e7d8b6c955c6..042cfc612660 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -327,8 +327,8 @@ EXPORT_SYMBOL(sysctl_optmem_max);
 
 int sysctl_tstamp_allow_data __read_mostly = 1;
 
-struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
-EXPORT_SYMBOL_GPL(memalloc_socks);
+DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
+EXPORT_SYMBOL_GPL(memalloc_socks_key);
 
 /**
  * sk_set_memalloc - sets %SOCK_MEMALLOC
@@ -342,7 +342,7 @@ void sk_set_memalloc(struct sock *sk)
 {
 	sock_set_flag(sk, SOCK_MEMALLOC);
 	sk->sk_allocation |= __GFP_MEMALLOC;
-	static_key_slow_inc(&memalloc_socks);
+	static_branch_inc(&memalloc_socks_key);
 }
 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 
@@ -350,7 +350,7 @@ void sk_clear_memalloc(struct sock *sk)
 {
 	sock_reset_flag(sk, SOCK_MEMALLOC);
 	sk->sk_allocation &= ~__GFP_MEMALLOC;
-	static_key_slow_dec(&memalloc_socks);
+	static_branch_dec(&memalloc_socks_key);
 
 	/*
 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
-- 
cgit v1.2.3-59-g8ed1b


From aabf6772cc745f90d1e15b0054eca734ba787784 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 8 May 2018 09:07:00 -0700
Subject: net: Update [e/in]gress_needed static key to modern api

No changes in semantics -- key init is false; replace

static_key_slow_inc|dec   with   static_branch_inc|dec
static_key_false          with   static_branch_unlikely

Added a '_key' suffix to both ingress_needed and egress_needed,
for better self documentation.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 29bf39174900..ad4288f6e105 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1755,33 +1755,33 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
 EXPORT_SYMBOL(call_netdevice_notifiers);
 
 #ifdef CONFIG_NET_INGRESS
-static struct static_key ingress_needed __read_mostly;
+static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
 
 void net_inc_ingress_queue(void)
 {
-	static_key_slow_inc(&ingress_needed);
+	static_branch_inc(&ingress_needed_key);
 }
 EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
 
 void net_dec_ingress_queue(void)
 {
-	static_key_slow_dec(&ingress_needed);
+	static_branch_dec(&ingress_needed_key);
 }
 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
 #endif
 
 #ifdef CONFIG_NET_EGRESS
-static struct static_key egress_needed __read_mostly;
+static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
 
 void net_inc_egress_queue(void)
 {
-	static_key_slow_inc(&egress_needed);
+	static_branch_inc(&egress_needed_key);
 }
 EXPORT_SYMBOL_GPL(net_inc_egress_queue);
 
 void net_dec_egress_queue(void)
 {
-	static_key_slow_dec(&egress_needed);
+	static_branch_dec(&egress_needed_key);
 }
 EXPORT_SYMBOL_GPL(net_dec_egress_queue);
 #endif
@@ -3532,7 +3532,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
 #ifdef CONFIG_NET_CLS_ACT
 	skb->tc_at_ingress = 0;
 # ifdef CONFIG_NET_EGRESS
-	if (static_key_false(&egress_needed)) {
+	if (static_branch_unlikely(&egress_needed_key)) {
 		skb = sch_handle_egress(skb, &rc, dev);
 		if (!skb)
 			goto out;
@@ -4566,7 +4566,7 @@ another_round:
 
 skip_taps:
 #ifdef CONFIG_NET_INGRESS
-	if (static_key_false(&ingress_needed)) {
+	if (static_branch_unlikely(&ingress_needed_key)) {
 		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
 		if (!skb)
 			goto out;
-- 
cgit v1.2.3-59-g8ed1b


From 39e8392201dcd02b7d70f2149e678035b20cc26a Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 8 May 2018 09:07:01 -0700
Subject: net: Update netstamp_needed static key to modern api

No changes in refcount semantics -- key init is false; replace

static_key_slow_inc|dec   with   static_branch_inc|dec
static_key_false          with   static_branch_unlikely

Added a '_key' suffix to netstamp_needed, for better self
documentation.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index ad4288f6e105..d6fd1578f770 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1786,7 +1786,7 @@ void net_dec_egress_queue(void)
 EXPORT_SYMBOL_GPL(net_dec_egress_queue);
 #endif
 
-static struct static_key netstamp_needed __read_mostly;
+static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
 #ifdef HAVE_JUMP_LABEL
 static atomic_t netstamp_needed_deferred;
 static atomic_t netstamp_wanted;
@@ -1797,9 +1797,9 @@ static void netstamp_clear(struct work_struct *work)
 
 	wanted = atomic_add_return(deferred, &netstamp_wanted);
 	if (wanted > 0)
-		static_key_enable(&netstamp_needed);
+		static_branch_enable(&netstamp_needed_key);
 	else
-		static_key_disable(&netstamp_needed);
+		static_branch_disable(&netstamp_needed_key);
 }
 static DECLARE_WORK(netstamp_work, netstamp_clear);
 #endif
@@ -1819,7 +1819,7 @@ void net_enable_timestamp(void)
 	atomic_inc(&netstamp_needed_deferred);
 	schedule_work(&netstamp_work);
 #else
-	static_key_slow_inc(&netstamp_needed);
+	static_branch_inc(&netstamp_needed_key);
 #endif
 }
 EXPORT_SYMBOL(net_enable_timestamp);
@@ -1839,7 +1839,7 @@ void net_disable_timestamp(void)
 	atomic_dec(&netstamp_needed_deferred);
 	schedule_work(&netstamp_work);
 #else
-	static_key_slow_dec(&netstamp_needed);
+	static_branch_dec(&netstamp_needed_key);
 #endif
 }
 EXPORT_SYMBOL(net_disable_timestamp);
@@ -1847,15 +1847,15 @@ EXPORT_SYMBOL(net_disable_timestamp);
 static inline void net_timestamp_set(struct sk_buff *skb)
 {
 	skb->tstamp = 0;
-	if (static_key_false(&netstamp_needed))
+	if (static_branch_unlikely(&netstamp_needed_key))
 		__net_timestamp(skb);
 }
 
-#define net_timestamp_check(COND, SKB)			\
-	if (static_key_false(&netstamp_needed)) {		\
-		if ((COND) && !(SKB)->tstamp)	\
-			__net_timestamp(SKB);		\
-	}						\
+#define net_timestamp_check(COND, SKB)				\
+	if (static_branch_unlikely(&netstamp_needed_key)) {	\
+		if ((COND) && !(SKB)->tstamp)			\
+			__net_timestamp(SKB);			\
+	}							\
 
 bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 02786475c74c7d0721b0142fac74108112cb456c Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 8 May 2018 09:07:02 -0700
Subject: net: Update generic_xdp_needed static key to modern api

No changes in refcount semantics -- key init is false; replace

static_key_slow_inc|dec   with   static_branch_inc|dec
static_key_false          with   static_branch_unlikely

Added a '_key' suffix to generic_xdp_needed, for better self
documentation.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index d6fd1578f770..9f4390182384 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4154,7 +4154,7 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
 }
 EXPORT_SYMBOL_GPL(generic_xdp_tx);
 
-static struct static_key generic_xdp_needed __read_mostly;
+static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
 
 int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
 {
@@ -4194,7 +4194,7 @@ static int netif_rx_internal(struct sk_buff *skb)
 
 	trace_netif_rx(skb);
 
-	if (static_key_false(&generic_xdp_needed)) {
+	if (static_branch_unlikely(&generic_xdp_needed_key)) {
 		int ret;
 
 		preempt_disable();
@@ -4726,9 +4726,9 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
 			bpf_prog_put(old);
 
 		if (old && !new) {
-			static_key_slow_dec(&generic_xdp_needed);
+			static_branch_dec(&generic_xdp_needed_key);
 		} else if (new && !old) {
-			static_key_slow_inc(&generic_xdp_needed);
+			static_branch_inc(&generic_xdp_needed_key);
 			dev_disable_lro(dev);
 			dev_disable_gro_hw(dev);
 		}
@@ -4756,7 +4756,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
 	if (skb_defer_rx_timestamp(skb))
 		return NET_RX_SUCCESS;
 
-	if (static_key_false(&generic_xdp_needed)) {
+	if (static_branch_unlikely(&generic_xdp_needed_key)) {
 		int ret;
 
 		preempt_disable();
-- 
cgit v1.2.3-59-g8ed1b


From 88ab31081b8c8d4d7698711b80fbfd3f52d27fd9 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 8 May 2018 09:07:03 -0700
Subject: net/udp: Update udp_encap_needed static key to modern api

No changes in refcount semantics -- key init is false; replace

static_key_enable         with   static_branch_enable
static_key_slow_inc|dec   with   static_branch_inc|dec
static_key_false          with   static_branch_unlikely

Added a '_key' suffix to udp and udpv6 encap_needed, for better
self documentation.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 8 ++++----
 net/ipv6/udp.c | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e07db83b311e..229e616fafc5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1875,10 +1875,10 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	return 0;
 }
 
-static struct static_key udp_encap_needed __read_mostly;
+static DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
 void udp_encap_enable(void)
 {
-	static_key_enable(&udp_encap_needed);
+	static_branch_enable(&udp_encap_needed_key);
 }
 EXPORT_SYMBOL(udp_encap_enable);
 
@@ -1902,7 +1902,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		goto drop;
 	nf_reset(skb);
 
-	if (static_key_false(&udp_encap_needed) && up->encap_type) {
+	if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
 		int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
 
 		/*
@@ -2365,7 +2365,7 @@ void udp_destroy_sock(struct sock *sk)
 	bool slow = lock_sock_fast(sk);
 	udp_flush_pending_frames(sk);
 	unlock_sock_fast(sk, slow);
-	if (static_key_false(&udp_encap_needed) && up->encap_type) {
+	if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
 		void (*encap_destroy)(struct sock *sk);
 		encap_destroy = READ_ONCE(up->encap_destroy);
 		if (encap_destroy)
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index a34e28ac03a7..0056ae766d93 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -546,10 +546,10 @@ static __inline__ void udpv6_err(struct sk_buff *skb,
 	__udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
 }
 
-static struct static_key udpv6_encap_needed __read_mostly;
+static DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
 void udpv6_encap_enable(void)
 {
-	static_key_enable(&udpv6_encap_needed);
+	static_branch_enable(&udpv6_encap_needed_key);
 }
 EXPORT_SYMBOL(udpv6_encap_enable);
 
@@ -561,7 +561,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
 		goto drop;
 
-	if (static_key_false(&udpv6_encap_needed) && up->encap_type) {
+	if (static_branch_unlikely(&udpv6_encap_needed_key) && up->encap_type) {
 		int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
 
 		/*
@@ -1427,7 +1427,7 @@ void udpv6_destroy_sock(struct sock *sk)
 	udp_v6_flush_pending_frames(sk);
 	release_sock(sk);
 
-	if (static_key_false(&udpv6_encap_needed) && up->encap_type) {
+	if (static_branch_unlikely(&udpv6_encap_needed_key) && up->encap_type) {
 		void (*encap_destroy)(struct sock *sk);
 		encap_destroy = READ_ONCE(up->encap_destroy);
 		if (encap_destroy)
-- 
cgit v1.2.3-59-g8ed1b


From f663706a33ce32df69a2b3e85a80cb5a690f2234 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 8 May 2018 19:21:34 +0300
Subject: tun: Do SIOCGSKNS out of rtnl_lock()

Since net ns of tun device is assigned on the device creation,
and it never changes, we do not need to use any lock to get it
from alive tun.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index d3c04ab9752a..44d4f3d25350 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2850,10 +2850,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 			    unsigned long arg, int ifreq_len)
 {
 	struct tun_file *tfile = file->private_data;
+	struct net *net = sock_net(&tfile->sk);
 	struct tun_struct *tun;
 	void __user* argp = (void __user*)arg;
 	struct ifreq ifr;
-	struct net *net;
 	kuid_t owner;
 	kgid_t group;
 	int sndbuf;
@@ -2877,14 +2877,18 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 		 */
 		return put_user(IFF_TUN | IFF_TAP | TUN_FEATURES,
 				(unsigned int __user*)argp);
-	} else if (cmd == TUNSETQUEUE)
+	} else if (cmd == TUNSETQUEUE) {
 		return tun_set_queue(file, &ifr);
+	} else if (cmd == SIOCGSKNS) {
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+			return -EPERM;
+		return open_related_ns(&net->ns, get_net_ns);
+	}
 
 	ret = 0;
 	rtnl_lock();
 
 	tun = tun_get(tfile);
-	net = sock_net(&tfile->sk);
 	if (cmd == TUNSETIFF) {
 		ret = -EEXIST;
 		if (tun)
@@ -2914,14 +2918,6 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 		tfile->ifindex = ifindex;
 		goto unlock;
 	}
-	if (cmd == SIOCGSKNS) {
-		ret = -EPERM;
-		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
-			goto unlock;
-
-		ret = open_related_ns(&net->ns, get_net_ns);
-		goto unlock;
-	}
 
 	ret = -EBADFD;
 	if (!tun)
-- 
cgit v1.2.3-59-g8ed1b


From 5f30721c517a9c8512da3bc8d909ff5e810c2b44 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Wed, 9 May 2018 02:59:41 +0200
Subject: tipc: clean up removal of binding table items

In commit be47e41d77fb ("tipc: fix use-after-free in tipc_nametbl_stop")
we fixed a problem caused by premature release of service range items.

That fix is correct, and solved the problem. However, it doesn't address
the root of the problem, which is that we don't lookup the tipc_service
 -> service_range -> publication items in the correct hierarchical
order.

In this commit we try to make this right, and as a side effect obtain
some code simplification.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/name_table.c | 103 ++++++++++++++++++++++++++------------------------
 1 file changed, 53 insertions(+), 50 deletions(-)

diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index dd1c4fa2eb78..bebe88cae07b 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -136,12 +136,12 @@ static struct tipc_service *tipc_service_create(u32 type, struct hlist_head *hd)
 }
 
 /**
- * tipc_service_find_range - find service range matching a service instance
+ * tipc_service_first_range - find first service range in tree matching instance
  *
  * Very time-critical, so binary search through range rb tree
  */
-static struct service_range *tipc_service_find_range(struct tipc_service *sc,
-						     u32 instance)
+static struct service_range *tipc_service_first_range(struct tipc_service *sc,
+						      u32 instance)
 {
 	struct rb_node *n = sc->ranges.rb_node;
 	struct service_range *sr;
@@ -158,6 +158,30 @@ static struct service_range *tipc_service_find_range(struct tipc_service *sc,
 	return NULL;
 }
 
+/*  tipc_service_find_range - find service range matching publication parameters
+ */
+static struct service_range *tipc_service_find_range(struct tipc_service *sc,
+						     u32 lower, u32 upper)
+{
+	struct rb_node *n = sc->ranges.rb_node;
+	struct service_range *sr;
+
+	sr = tipc_service_first_range(sc, lower);
+	if (!sr)
+		return NULL;
+
+	/* Look for exact match */
+	for (n = &sr->tree_node; n; n = rb_next(n)) {
+		sr = container_of(n, struct service_range, tree_node);
+		if (sr->upper == upper)
+			break;
+	}
+	if (!n || sr->lower != lower || sr->upper != upper)
+		return NULL;
+
+	return sr;
+}
+
 static struct service_range *tipc_service_create_range(struct tipc_service *sc,
 						       u32 lower, u32 upper)
 {
@@ -238,54 +262,19 @@ err:
 /**
  * tipc_service_remove_publ - remove a publication from a service
  */
-static struct publication *tipc_service_remove_publ(struct net *net,
-						    struct tipc_service *sc,
-						    u32 lower, u32 upper,
-						    u32 node, u32 key,
-						    struct service_range **rng)
+static struct publication *tipc_service_remove_publ(struct service_range *sr,
+						    u32 node, u32 key)
 {
-	struct tipc_subscription *sub, *tmp;
-	struct service_range *sr;
 	struct publication *p;
-	bool found = false;
-	bool last = false;
-	struct rb_node *n;
-
-	sr = tipc_service_find_range(sc, lower);
-	if (!sr)
-		return NULL;
 
-	/* Find exact matching service range */
-	for (n = &sr->tree_node; n; n = rb_next(n)) {
-		sr = container_of(n, struct service_range, tree_node);
-		if (sr->upper == upper)
-			break;
-	}
-	if (!n || sr->lower != lower || sr->upper != upper)
-		return NULL;
-
-	/* Find publication, if it exists */
 	list_for_each_entry(p, &sr->all_publ, all_publ) {
 		if (p->key != key || (node && node != p->node))
 			continue;
-		found = true;
-		break;
+		list_del(&p->all_publ);
+		list_del(&p->local_publ);
+		return p;
 	}
-	if (!found)
-		return NULL;
-
-	list_del(&p->all_publ);
-	list_del(&p->local_publ);
-	if (list_empty(&sr->all_publ))
-		last = true;
-
-	/* Notify any waiting subscriptions */
-	list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) {
-		tipc_sub_report_overlap(sub, p->lower, p->upper, TIPC_WITHDRAWN,
-					p->port, p->node, p->scope, last);
-	}
-	*rng = sr;
-	return p;
+	return NULL;
 }
 
 /**
@@ -376,17 +365,31 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
 					     u32 node, u32 key)
 {
 	struct tipc_service *sc = tipc_service_find(net, type);
+	struct tipc_subscription *sub, *tmp;
 	struct service_range *sr = NULL;
 	struct publication *p = NULL;
+	bool last;
 
 	if (!sc)
 		return NULL;
 
 	spin_lock_bh(&sc->lock);
-	p = tipc_service_remove_publ(net, sc, lower, upper, node, key, &sr);
+	sr = tipc_service_find_range(sc, lower, upper);
+	if (!sr)
+		goto exit;
+	p = tipc_service_remove_publ(sr, node, key);
+	if (!p)
+		goto exit;
+
+	/* Notify any waiting subscriptions */
+	last = list_empty(&sr->all_publ);
+	list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) {
+		tipc_sub_report_overlap(sub, lower, upper, TIPC_WITHDRAWN,
+					p->port, node, p->scope, last);
+	}
 
 	/* Remove service range item if this was its last publication */
-	if (sr && list_empty(&sr->all_publ)) {
+	if (list_empty(&sr->all_publ)) {
 		rb_erase(&sr->tree_node, &sc->ranges);
 		kfree(sr);
 	}
@@ -396,6 +399,7 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
 		hlist_del_init_rcu(&sc->service_list);
 		kfree_rcu(sc, rcu);
 	}
+exit:
 	spin_unlock_bh(&sc->lock);
 	return p;
 }
@@ -437,7 +441,7 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *dnode)
 		goto not_found;
 
 	spin_lock_bh(&sc->lock);
-	sr = tipc_service_find_range(sc, instance);
+	sr = tipc_service_first_range(sc, instance);
 	if (unlikely(!sr))
 		goto no_match;
 
@@ -484,7 +488,7 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 scope,
 
 	spin_lock_bh(&sc->lock);
 
-	sr = tipc_service_find_range(sc, instance);
+	sr = tipc_service_first_range(sc, instance);
 	if (!sr)
 		goto no_match;
 
@@ -756,8 +760,7 @@ static void tipc_service_delete(struct net *net, struct tipc_service *sc)
 	spin_lock_bh(&sc->lock);
 	rbtree_postorder_for_each_entry_safe(sr, tmpr, &sc->ranges, tree_node) {
 		list_for_each_entry_safe(p, tmp, &sr->all_publ, all_publ) {
-			tipc_service_remove_publ(net, sc, p->lower, p->upper,
-						 p->node, p->key, &sr);
+			tipc_service_remove_publ(sr, p->node, p->key);
 			kfree_rcu(p, rcu);
 		}
 		rb_erase(&sr->tree_node, &sc->ranges);
-- 
cgit v1.2.3-59-g8ed1b


From a37fb855f6e8e9eafac046721393f68f48eb5f91 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Tue, 8 May 2018 23:03:12 -0400
Subject: net: dsa: fix added_by_user switchdev notification

Commit 161d82de1ff8 ("net: bridge: Notify about !added_by_user FDB
entries") causes the below oops when bringing up a slave interface,
because dsa_port_fdb_add is still scheduled, but with a NULL address.

To fix this, keep the dsa_slave_switchdev_event function agnostic of the
notified info structure and handle the added_by_user flag in the
specific dsa_slave_switchdev_event_work function.

    [   75.512263] Unable to handle kernel NULL pointer dereference at virtual address 00000000
    [   75.519063] pgd = (ptrval)
    [   75.520545] [00000000] *pgd=00000000
    [   75.522839] Internal error: Oops: 17 [#1] ARM
    [   75.525898] Modules linked in:
    [   75.527673] CPU: 0 PID: 9 Comm: kworker/u2:1 Not tainted 4.17.0-rc2 #78
    [   75.532988] Hardware name: Freescale Vybrid VF5xx/VF6xx (Device Tree)
    [   75.538153] Workqueue: dsa_ordered dsa_slave_switchdev_event_work
    [   75.542970] PC is at mv88e6xxx_port_db_load_purge+0x60/0x1b0
    [   75.547341] LR is at mdiobus_read_nested+0x6c/0x78
    [   75.550833] pc : [<804cd5c0>]    lr : [<804bba84>]    psr: 60070013
    [   75.555796] sp : 9f54bd78  ip : 9f54bd87  fp : 9f54bddc
    [   75.559719] r10: 00000000  r9 : 0000000e  r8 : 9f6a6010
    [   75.563643] r7 : 00000000  r6 : 81203048  r5 : 9f6a6010  r4 : 9f6a601c
    [   75.568867] r3 : 00000000  r2 : 00000000  r1 : 0000000d  r0 : 00000000
    [   75.574094] Flags: nZCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment none
    [   75.579933] Control: 10c53c7d  Table: 9de20059  DAC: 00000051
    [   75.584384] Process kworker/u2:1 (pid: 9, stack limit = 0x(ptrval))
    [   75.589349] Stack: (0x9f54bd78 to 0x9f54c000)
    [   75.592406] bd60:                                                       00000000 00000000
    [   75.599295] bd80: 00000391 9f299d10 9f299d68 8014317c 9f7f0000 8120af00 00006dc2 00000000
    [   75.606186] bda0: 8120af00 00000000 9f54bdec 1c9f5d92 8014317c 9f6a601c 9f6a6010 00000000
    [   75.613076] bdc0: 00000000 00000000 9dd1141c 8125a0b4 9f54be0c 9f54bde0 804cd8a8 804cd56c
    [   75.619966] bde0: 0000000e 80143680 00000001 9dce9c1c 81203048 9dce9c10 00000003 00000000
    [   75.626858] be00: 9f54be5c 9f54be10 806abcac 804cd864 9f54be54 80143664 8014317c 80143054
    [   75.633748] be20: ffcaa81d 00000000 812030b0 1c9f5d92 00000000 81203048 9f54beb4 00000003
    [   75.640639] be40: ffffffff 00000000 9dd1141c 8125a0b4 9f54be84 9f54be60 80138e98 806abb18
    [   75.647529] be60: 81203048 9ddc4000 9dce9c54 9f72a300 00000000 00000000 9f54be9c 9f54be88
    [   75.654420] be80: 801390bc 80138e50 00000000 9dce9c54 9f54beac 9f54bea0 806a9524 801390a0
    [   75.661310] bea0: 9f54bedc 9f54beb0 806a9c7c 806a950c 9f54becc 00000000 00000000 00000000
    [   75.668201] bec0: 9f540000 1c9f5d92 805fe604 9ddffc00 9f54befc 9f54bee0 806ab228 806a9c38
    [   75.675092] bee0: 806ab178 9ddffc00 9f4c1900 9f40d200 9f54bf34 9f54bf00 80131e30 806ab184
    [   75.681983] bf00: 9f40d214 9f54a038 9f40d200 9f40d200 9f4c1918 812119a0 9f40d214 9f54a038
    [   75.688873] bf20: 9f40d200 9f4c1900 9f54bf7c 9f54bf38 80132124 80131d1c 9f5f2dd8 00000000
    [   75.695764] bf40: 812119a0 9f54a038 812119a0 81259c5b 9f5f2dd8 9f5f2dc0 9f53dbc0 00000000
    [   75.702655] bf60: 9f4c1900 801320b4 9f5f2dd8 9f4f7e88 9f54bfac 9f54bf80 80137ad0 801320c0
    [   75.709544] bf80: 9f54a000 9f53dbc0 801379a0 00000000 00000000 00000000 00000000 00000000
    [   75.716434] bfa0: 00000000 9f54bfb0 801010e8 801379ac 00000000 00000000 00000000 00000000
    [   75.723324] bfc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
    [   75.730206] bfe0: 00000000 00000000 00000000 00000000 00000013 00000000 00000000 00000000
    [   75.737083] Backtrace:
    [   75.738252] [<804cd560>] (mv88e6xxx_port_db_load_purge) from [<804cd8a8>] (mv88e6xxx_port_fdb_add+0x50/0x68)
    [   75.746795]  r10:8125a0b4 r9:9dd1141c r8:00000000 r7:00000000 r6:00000000 r5:9f6a6010
    [   75.753323]  r4:9f6a601c
    [   75.754570] [<804cd858>] (mv88e6xxx_port_fdb_add) from [<806abcac>] (dsa_switch_event+0x1a0/0x660)
    [   75.762238]  r8:00000000 r7:00000003 r6:9dce9c10 r5:81203048 r4:9dce9c1c
    [   75.767655] [<806abb0c>] (dsa_switch_event) from [<80138e98>] (notifier_call_chain+0x54/0x94)
    [   75.774893]  r10:8125a0b4 r9:9dd1141c r8:00000000 r7:ffffffff r6:00000003 r5:9f54beb4
    [   75.781423]  r4:81203048
    [   75.782672] [<80138e44>] (notifier_call_chain) from [<801390bc>] (raw_notifier_call_chain+0x28/0x30)
    [   75.790514]  r9:00000000 r8:00000000 r7:9f72a300 r6:9dce9c54 r5:9ddc4000 r4:81203048
    [   75.796982] [<80139094>] (raw_notifier_call_chain) from [<806a9524>] (dsa_port_notify+0x24/0x38)
    [   75.804483] [<806a9500>] (dsa_port_notify) from [<806a9c7c>] (dsa_port_fdb_add+0x50/0x6c)
    [   75.811371] [<806a9c2c>] (dsa_port_fdb_add) from [<806ab228>] (dsa_slave_switchdev_event_work+0xb0/0x10c)
    [   75.819635]  r4:9ddffc00
    [   75.820885] [<806ab178>] (dsa_slave_switchdev_event_work) from [<80131e30>] (process_one_work+0x120/0x3a4)
    [   75.829241]  r6:9f40d200 r5:9f4c1900 r4:9ddffc00 r3:806ab178
    [   75.833612] [<80131d10>] (process_one_work) from [<80132124>] (worker_thread+0x70/0x574)
    [   75.840415]  r10:9f4c1900 r9:9f40d200 r8:9f54a038 r7:9f40d214 r6:812119a0 r5:9f4c1918
    [   75.846945]  r4:9f40d200
    [   75.848191] [<801320b4>] (worker_thread) from [<80137ad0>] (kthread+0x130/0x160)
    [   75.854300]  r10:9f4f7e88 r9:9f5f2dd8 r8:801320b4 r7:9f4c1900 r6:00000000 r5:9f53dbc0
    [   75.860830]  r4:9f5f2dc0
    [   75.862076] [<801379a0>] (kthread) from [<801010e8>] (ret_from_fork+0x14/0x2c)
    [   75.867999] Exception stack(0x9f54bfb0 to 0x9f54bff8)
    [   75.871753] bfa0:                                     00000000 00000000 00000000 00000000
    [   75.878640] bfc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
    [   75.885519] bfe0: 00000000 00000000 00000000 00000000 00000013 00000000
    [   75.890844]  r10:00000000 r9:00000000 r8:00000000 r7:00000000 r6:00000000 r5:801379a0
    [   75.897377]  r4:9f53dbc0 r3:9f54a000
    [   75.899663] Code: e3a02000 e3a03000 e14b26f4 e24bc055 (e5973000)
    [   75.904575] ---[ end trace fbca818a124dbf0d ]---

Fixes: 816a3bed9549 ("switchdev: Add fdb.added_by_user to switchdev notifications")
Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/slave.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index c287f1ef964c..746ab428a17a 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1395,6 +1395,9 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work)
 	switch (switchdev_work->event) {
 	case SWITCHDEV_FDB_ADD_TO_DEVICE:
 		fdb_info = &switchdev_work->fdb_info;
+		if (!fdb_info->added_by_user)
+			break;
+
 		err = dsa_port_fdb_add(dp, fdb_info->addr, fdb_info->vid);
 		if (err) {
 			netdev_dbg(dev, "fdb add failed err=%d\n", err);
@@ -1406,6 +1409,9 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work)
 
 	case SWITCHDEV_FDB_DEL_TO_DEVICE:
 		fdb_info = &switchdev_work->fdb_info;
+		if (!fdb_info->added_by_user)
+			break;
+
 		err = dsa_port_fdb_del(dp, fdb_info->addr, fdb_info->vid);
 		if (err) {
 			netdev_dbg(dev, "fdb del failed err=%d\n", err);
@@ -1441,7 +1447,6 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused,
 				     unsigned long event, void *ptr)
 {
 	struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
-	struct switchdev_notifier_fdb_info *fdb_info = ptr;
 	struct dsa_switchdev_event_work *switchdev_work;
 
 	if (!dsa_slave_dev_check(dev))
@@ -1459,10 +1464,7 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused,
 	switch (event) {
 	case SWITCHDEV_FDB_ADD_TO_DEVICE: /* fall through */
 	case SWITCHDEV_FDB_DEL_TO_DEVICE:
-		if (!fdb_info->added_by_user)
-			break;
-		if (dsa_slave_switchdev_fdb_work_init(switchdev_work,
-						      fdb_info))
+		if (dsa_slave_switchdev_fdb_work_init(switchdev_work, ptr))
 			goto err_fdb_work_init;
 		dev_hold(dev);
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From e5c9a705452acf59efe00cb84f086624fda010ab Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Wed, 9 May 2018 18:29:02 +0300
Subject: net/mlx4_core: Report driver version to FW

If supported, write a driver version string to FW as part of the
INIT_HCA command.

Example of driver version: "Linux,mlx4_core,4.0-0"

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/fw.c   | 12 ++++++++++++
 drivers/net/ethernet/mellanox/mlx4/mlx4.h |  2 +-
 include/linux/mlx4/device.h               |  1 +
 3 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index de6b3d416148..46dcbfbe4c5e 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -165,6 +165,7 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags)
 		[36] = "QinQ VST mode support",
 		[37] = "sl to vl mapping table change event support",
 		[38] = "user MAC support",
+		[39] = "Report driver version to FW support",
 	};
 	int i;
 
@@ -1038,6 +1039,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP;
 	if (field32 & (1 << 7))
 		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT;
+	if (field32 & (1 << 8))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_DRIVER_VERSION_TO_FW;
 	MLX4_GET(field32, outbox, QUERY_DEV_CAP_DIAG_RPRT_PER_PORT);
 	if (field32 & (1 << 17))
 		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT;
@@ -1860,6 +1863,8 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
 #define  INIT_HCA_UC_STEERING_OFFSET	 (INIT_HCA_MCAST_OFFSET + 0x18)
 #define	 INIT_HCA_LOG_MC_TABLE_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x1b)
 #define  INIT_HCA_DEVICE_MANAGED_FLOW_STEERING_EN	0x6
+#define  INIT_HCA_DRIVER_VERSION_OFFSET   0x140
+#define  INIT_HCA_DRIVER_VERSION_SZ       0x40
 #define  INIT_HCA_FS_PARAM_OFFSET         0x1d0
 #define  INIT_HCA_FS_BASE_OFFSET          (INIT_HCA_FS_PARAM_OFFSET + 0x00)
 #define  INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET  (INIT_HCA_FS_PARAM_OFFSET + 0x12)
@@ -1950,6 +1955,13 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
 	if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT)
 		*(inbox + INIT_HCA_RECOVERABLE_ERROR_EVENT_OFFSET / 4) |= cpu_to_be32(1 << 31);
 
+	if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DRIVER_VERSION_TO_FW) {
+		u8 *dst = (u8 *)(inbox + INIT_HCA_DRIVER_VERSION_OFFSET / 4);
+
+		strncpy(dst, DRV_NAME_FOR_FW, INIT_HCA_DRIVER_VERSION_SZ - 1);
+		mlx4_dbg(dev, "Reporting Driver Version to FW: %s\n", dst);
+	}
+
 	/* QPC/EEC/CQC/EQC/RDMARC attributes */
 
 	MLX4_PUT(inbox, param->qpc_base,      INIT_HCA_QPC_BASE_OFFSET);
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index c68da1986e51..cb9e923e8399 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -55,8 +55,8 @@
 #include "fw_qos.h"
 
 #define DRV_NAME	"mlx4_core"
-#define PFX		DRV_NAME ": "
 #define DRV_VERSION	"4.0-0"
+#define DRV_NAME_FOR_FW		"Linux," DRV_NAME "," DRV_VERSION
 
 #define MLX4_FS_UDP_UC_EN		(1 << 1)
 #define MLX4_FS_TCP_UC_EN		(1 << 2)
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 81d0799b6091..122e7e9d3091 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -225,6 +225,7 @@ enum {
 	MLX4_DEV_CAP_FLAG2_SVLAN_BY_QP          = 1ULL <<  36,
 	MLX4_DEV_CAP_FLAG2_SL_TO_VL_CHANGE_EVENT = 1ULL << 37,
 	MLX4_DEV_CAP_FLAG2_USER_MAC_EN		= 1ULL << 38,
+	MLX4_DEV_CAP_FLAG2_DRIVER_VERSION_TO_FW = 1ULL << 39,
 };
 
 enum {
-- 
cgit v1.2.3-59-g8ed1b


From 86a3e5d02c2006aaf085c940f037d2453ac44957 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Wed, 9 May 2018 18:29:03 +0300
Subject: net/mlx4_core: Add PCI calls for suspend/resume

Implement suspend/resume callbacks in struct pci_driver.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/main.c | 46 +++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 211578ffc70d..b6aaf34d6648 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -4125,12 +4125,58 @@ static const struct pci_error_handlers mlx4_err_handler = {
 	.resume		= mlx4_pci_resume,
 };
 
+static int mlx4_suspend(struct pci_dev *pdev, pm_message_t state)
+{
+	struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev);
+	struct mlx4_dev	*dev = persist->dev;
+
+	mlx4_err(dev, "suspend was called\n");
+	mutex_lock(&persist->interface_state_mutex);
+	if (persist->interface_state & MLX4_INTERFACE_STATE_UP)
+		mlx4_unload_one(pdev);
+	mutex_unlock(&persist->interface_state_mutex);
+
+	return 0;
+}
+
+static int mlx4_resume(struct pci_dev *pdev)
+{
+	struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev);
+	struct mlx4_dev	*dev = persist->dev;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0};
+	int total_vfs;
+	int ret = 0;
+
+	mlx4_err(dev, "resume was called\n");
+	total_vfs = dev->persist->num_vfs;
+	memcpy(nvfs, dev->persist->nvfs, sizeof(dev->persist->nvfs));
+
+	mutex_lock(&persist->interface_state_mutex);
+	if (!(persist->interface_state & MLX4_INTERFACE_STATE_UP)) {
+		ret = mlx4_load_one(pdev, priv->pci_dev_data, total_vfs,
+				    nvfs, priv, 1);
+		if (!ret) {
+			ret = restore_current_port_types(dev,
+					dev->persist->curr_port_type,
+					dev->persist->curr_port_poss_type);
+			if (ret)
+				mlx4_err(dev, "resume: could not restore original port types (%d)\n", ret);
+		}
+	}
+	mutex_unlock(&persist->interface_state_mutex);
+
+	return ret;
+}
+
 static struct pci_driver mlx4_driver = {
 	.name		= DRV_NAME,
 	.id_table	= mlx4_pci_table,
 	.probe		= mlx4_init_one,
 	.shutdown	= mlx4_shutdown,
 	.remove		= mlx4_remove_one,
+	.suspend	= mlx4_suspend,
+	.resume		= mlx4_resume,
 	.err_handler    = &mlx4_err_handler,
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From e57328384b6a9148df653216203408de3be3dda1 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Wed, 9 May 2018 18:29:04 +0300
Subject: net/mlx4_core: Use msi_x module param to limit num of MSI-X irqs

Extend the boolean interpretation of msi_x module parameter
to numerical, as follows:

0   - Don't use MSI-X.
1   - Use MSI-X, driver decides the num of MSI-X irqs.
>=2 - Use MSI-X, limit number of MSI-X irqs to msi_x.
      In SRIOV, this limits the number of MSI-X irqs per VF.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Cc: Ajaykumar Hotchandani <ajaykumar.hotchandani@oracle.com>
Reviewed-by: Ajaykumar Hotchandani <ajaykumar.hotchandani@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/main.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index b6aaf34d6648..80a75c80a463 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -73,7 +73,7 @@ MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
 
 static int msi_x = 1;
 module_param(msi_x, int, 0444);
-MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero");
+MODULE_PARM_DESC(msi_x, "0 - don't use MSI-X, 1 - use MSI-X, >1 - limit number of MSI-X irqs to msi_x");
 
 #else /* CONFIG_PCI_MSI */
 
@@ -2815,6 +2815,9 @@ static void mlx4_enable_msi_x(struct mlx4_dev *dev)
 				dev->caps.num_eqs - dev->caps.reserved_eqs,
 				MAX_MSIX);
 
+		if (msi_x > 1)
+			nreq = min_t(int, nreq, msi_x);
+
 		entries = kcalloc(nreq, sizeof(*entries), GFP_KERNEL);
 		if (!entries)
 			goto no_msi;
@@ -4182,6 +4185,11 @@ static struct pci_driver mlx4_driver = {
 
 static int __init mlx4_verify_params(void)
 {
+	if (msi_x < 0) {
+		pr_warn("mlx4_core: bad msi_x: %d\n", msi_x);
+		return -1;
+	}
+
 	if ((log_num_mac < 0) || (log_num_mac > 7)) {
 		pr_warn("mlx4_core: bad num_mac: %d\n", log_num_mac);
 		return -1;
-- 
cgit v1.2.3-59-g8ed1b


From 02317e68369b0feecc18e791016d38a8e4df35f7 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Wed, 9 May 2018 11:38:49 -0400
Subject: net: dsa: mv88e6xxx: add a cascade port op

Only the 88E6185 family has bits 15:12 Cascade Port bits in the Global
Control 2 register. Hence inconsistent values are actually written in
this register for other families.

Add a .set_cascade_port operation to isolate the 88E6185 case, and call
it from the device mapping setup function.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/chip.c    | 10 +++++++++-
 drivers/net/dsa/mv88e6xxx/chip.h    |  6 ++++++
 drivers/net/dsa/mv88e6xxx/global1.c | 23 +++++++++++++++++++++++
 drivers/net/dsa/mv88e6xxx/global1.h |  7 +++++--
 4 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index e7e079b1888c..9b61d5b65b0f 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -1046,6 +1046,13 @@ static int mv88e6xxx_devmap_setup(struct mv88e6xxx_chip *chip)
 			return err;
 	}
 
+	if (chip->info->ops->set_cascade_port) {
+		port = MV88E6XXX_CASCADE_PORT_MULTIPLE;
+		err = chip->info->ops->set_cascade_port(chip, port);
+		if (err)
+			return err;
+	}
+
 	return 0;
 }
 
@@ -2158,7 +2165,6 @@ static int mv88e6xxx_g1_setup(struct mv88e6xxx_chip *chip)
 
 	/* Disable remote management, and set the switch's DSA device number. */
 	err = mv88e6xxx_g1_write(chip, MV88E6XXX_G1_CTL2,
-				 MV88E6XXX_G1_CTL2_MULTIPLE_CASCADE |
 				 (ds->index & 0x1f));
 	if (err)
 		return err;
@@ -2643,6 +2649,7 @@ static const struct mv88e6xxx_ops mv88e6131_ops = {
 	.watchdog_ops = &mv88e6097_watchdog_ops,
 	.mgmt_rsvd2cpu = mv88e6185_g2_mgmt_rsvd2cpu,
 	.ppu_enable = mv88e6185_g1_ppu_enable,
+	.set_cascade_port = mv88e6185_g1_set_cascade_port,
 	.ppu_disable = mv88e6185_g1_ppu_disable,
 	.reset = mv88e6185_g1_reset,
 	.vtu_getnext = mv88e6185_g1_vtu_getnext,
@@ -2911,6 +2918,7 @@ static const struct mv88e6xxx_ops mv88e6185_ops = {
 	.set_egress_port = mv88e6095_g1_set_egress_port,
 	.watchdog_ops = &mv88e6097_watchdog_ops,
 	.mgmt_rsvd2cpu = mv88e6185_g2_mgmt_rsvd2cpu,
+	.set_cascade_port = mv88e6185_g1_set_cascade_port,
 	.ppu_enable = mv88e6185_g1_ppu_enable,
 	.ppu_disable = mv88e6185_g1_ppu_disable,
 	.reset = mv88e6185_g1_reset,
diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h
index 4163c8099d0b..62234c2287a2 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.h
+++ b/drivers/net/dsa/mv88e6xxx/chip.h
@@ -401,6 +401,12 @@ struct mv88e6xxx_ops {
 			       uint64_t *data);
 	int (*set_cpu_port)(struct mv88e6xxx_chip *chip, int port);
 	int (*set_egress_port)(struct mv88e6xxx_chip *chip, int port);
+
+#define MV88E6XXX_CASCADE_PORT_NONE		0xe
+#define MV88E6XXX_CASCADE_PORT_MULTIPLE		0xf
+
+	int (*set_cascade_port)(struct mv88e6xxx_chip *chip, int port);
+
 	const struct mv88e6xxx_irq_ops *watchdog_ops;
 
 	int (*mgmt_rsvd2cpu)(struct mv88e6xxx_chip *chip);
diff --git a/drivers/net/dsa/mv88e6xxx/global1.c b/drivers/net/dsa/mv88e6xxx/global1.c
index b43bd6476632..6eb4eca7ca5b 100644
--- a/drivers/net/dsa/mv88e6xxx/global1.c
+++ b/drivers/net/dsa/mv88e6xxx/global1.c
@@ -350,6 +350,29 @@ int mv88e6390_g1_mgmt_rsvd2cpu(struct mv88e6xxx_chip *chip)
 
 /* Offset 0x1c: Global Control 2 */
 
+static int mv88e6xxx_g1_ctl2_mask(struct mv88e6xxx_chip *chip, u16 mask,
+				  u16 val)
+{
+	u16 reg;
+	int err;
+
+	err = mv88e6xxx_g1_read(chip, MV88E6XXX_G1_CTL2, &reg);
+	if (err)
+		return err;
+
+	reg &= ~mask;
+	reg |= val & mask;
+
+	return mv88e6xxx_g1_write(chip, MV88E6XXX_G1_CTL2, reg);
+}
+
+int mv88e6185_g1_set_cascade_port(struct mv88e6xxx_chip *chip, int port)
+{
+	const u16 mask = MV88E6185_G1_CTL2_CASCADE_PORT_MASK;
+
+	return mv88e6xxx_g1_ctl2_mask(chip, mask, port << __bf_shf(mask));
+}
+
 int mv88e6390_g1_stats_set_histogram(struct mv88e6xxx_chip *chip)
 {
 	u16 val;
diff --git a/drivers/net/dsa/mv88e6xxx/global1.h b/drivers/net/dsa/mv88e6xxx/global1.h
index 6aee7316fea6..bcbb8046ad63 100644
--- a/drivers/net/dsa/mv88e6xxx/global1.h
+++ b/drivers/net/dsa/mv88e6xxx/global1.h
@@ -201,11 +201,12 @@
 
 /* Offset 0x1C: Global Control 2 */
 #define MV88E6XXX_G1_CTL2			0x1c
-#define MV88E6XXX_G1_CTL2_NO_CASCADE		0xe000
-#define MV88E6XXX_G1_CTL2_MULTIPLE_CASCADE	0xf000
 #define MV88E6XXX_G1_CTL2_HIST_RX		0x0040
 #define MV88E6XXX_G1_CTL2_HIST_TX		0x0080
 #define MV88E6XXX_G1_CTL2_HIST_RX_TX		0x00c0
+#define MV88E6185_G1_CTL2_CASCADE_PORT_MASK	0xf000
+#define MV88E6185_G1_CTL2_CASCADE_PORT_NONE	0xe000
+#define MV88E6185_G1_CTL2_CASCADE_PORT_MULTI	0xf000
 
 /* Offset 0x1D: Stats Operation Register */
 #define MV88E6XXX_G1_STATS_OP			0x1d
@@ -253,6 +254,8 @@ int mv88e6095_g1_set_cpu_port(struct mv88e6xxx_chip *chip, int port);
 int mv88e6390_g1_set_cpu_port(struct mv88e6xxx_chip *chip, int port);
 int mv88e6390_g1_mgmt_rsvd2cpu(struct mv88e6xxx_chip *chip);
 
+int mv88e6185_g1_set_cascade_port(struct mv88e6xxx_chip *chip, int port);
+
 int mv88e6xxx_g1_atu_set_learn2all(struct mv88e6xxx_chip *chip, bool learn2all);
 int mv88e6xxx_g1_atu_set_age_time(struct mv88e6xxx_chip *chip,
 				  unsigned int msecs);
-- 
cgit v1.2.3-59-g8ed1b


From 23c9891996959dbde59333b8ed997d66b44fb3ff Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Wed, 9 May 2018 11:38:50 -0400
Subject: net: dsa: mv88e6xxx: set device number

All Marvell switches supported by mv88e6xxx have to set their device
number in the Global Control 2 register. Extract this in a read then
write function, called from the device mapping setup code.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/chip.c    | 11 ++++-------
 drivers/net/dsa/mv88e6xxx/global1.c |  7 +++++++
 drivers/net/dsa/mv88e6xxx/global1.h |  3 +++
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 9b61d5b65b0f..ef753ed89b30 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -1053,6 +1053,10 @@ static int mv88e6xxx_devmap_setup(struct mv88e6xxx_chip *chip)
 			return err;
 	}
 
+	err = mv88e6xxx_g1_set_device_number(chip, chip->ds->index);
+	if (err)
+		return err;
+
 	return 0;
 }
 
@@ -2160,15 +2164,8 @@ static int mv88e6xxx_set_ageing_time(struct dsa_switch *ds,
 
 static int mv88e6xxx_g1_setup(struct mv88e6xxx_chip *chip)
 {
-	struct dsa_switch *ds = chip->ds;
 	int err;
 
-	/* Disable remote management, and set the switch's DSA device number. */
-	err = mv88e6xxx_g1_write(chip, MV88E6XXX_G1_CTL2,
-				 (ds->index & 0x1f));
-	if (err)
-		return err;
-
 	/* Configure the IP ToS mapping registers. */
 	err = mv88e6xxx_g1_write(chip, MV88E6XXX_G1_IP_PRI_0, 0x0000);
 	if (err)
diff --git a/drivers/net/dsa/mv88e6xxx/global1.c b/drivers/net/dsa/mv88e6xxx/global1.c
index 6eb4eca7ca5b..89c6330c53eb 100644
--- a/drivers/net/dsa/mv88e6xxx/global1.c
+++ b/drivers/net/dsa/mv88e6xxx/global1.c
@@ -389,6 +389,13 @@ int mv88e6390_g1_stats_set_histogram(struct mv88e6xxx_chip *chip)
 	return err;
 }
 
+int mv88e6xxx_g1_set_device_number(struct mv88e6xxx_chip *chip, int index)
+{
+	return mv88e6xxx_g1_ctl2_mask(chip,
+				      MV88E6XXX_G1_CTL2_DEVICE_NUMBER_MASK,
+				      index);
+}
+
 /* Offset 0x1d: Statistics Operation 2 */
 
 int mv88e6xxx_g1_stats_wait(struct mv88e6xxx_chip *chip)
diff --git a/drivers/net/dsa/mv88e6xxx/global1.h b/drivers/net/dsa/mv88e6xxx/global1.h
index bcbb8046ad63..8b043e813761 100644
--- a/drivers/net/dsa/mv88e6xxx/global1.h
+++ b/drivers/net/dsa/mv88e6xxx/global1.h
@@ -207,6 +207,7 @@
 #define MV88E6185_G1_CTL2_CASCADE_PORT_MASK	0xf000
 #define MV88E6185_G1_CTL2_CASCADE_PORT_NONE	0xe000
 #define MV88E6185_G1_CTL2_CASCADE_PORT_MULTI	0xf000
+#define MV88E6XXX_G1_CTL2_DEVICE_NUMBER_MASK	0x001f
 
 /* Offset 0x1D: Stats Operation Register */
 #define MV88E6XXX_G1_STATS_OP			0x1d
@@ -256,6 +257,8 @@ int mv88e6390_g1_mgmt_rsvd2cpu(struct mv88e6xxx_chip *chip);
 
 int mv88e6185_g1_set_cascade_port(struct mv88e6xxx_chip *chip, int port);
 
+int mv88e6xxx_g1_set_device_number(struct mv88e6xxx_chip *chip, int index);
+
 int mv88e6xxx_g1_atu_set_learn2all(struct mv88e6xxx_chip *chip, bool learn2all);
 int mv88e6xxx_g1_atu_set_age_time(struct mv88e6xxx_chip *chip,
 				  unsigned int msecs);
-- 
cgit v1.2.3-59-g8ed1b


From 9e5baf9b363673a8f78508c99a9d815da6ea7133 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Wed, 9 May 2018 11:38:51 -0400
Subject: net: dsa: mv88e6xxx: add RMU disable op

The RMU mode bits moved a lot within the Global Control 2 register of
the Marvell switch families. Add an .rmu_disable op to support at least
3 known alternatives.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/chip.c    | 24 ++++++++++++++++++++++++
 drivers/net/dsa/mv88e6xxx/chip.h    |  3 +++
 drivers/net/dsa/mv88e6xxx/global1.c | 18 ++++++++++++++++++
 drivers/net/dsa/mv88e6xxx/global1.h | 20 ++++++++++++++++++++
 4 files changed, 65 insertions(+)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index ef753ed89b30..5d933549dc49 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -1069,6 +1069,14 @@ static int mv88e6xxx_trunk_setup(struct mv88e6xxx_chip *chip)
 	return 0;
 }
 
+static int mv88e6xxx_rmu_setup(struct mv88e6xxx_chip *chip)
+{
+	if (chip->info->ops->rmu_disable)
+		return chip->info->ops->rmu_disable(chip);
+
+	return 0;
+}
+
 static int mv88e6xxx_pot_setup(struct mv88e6xxx_chip *chip)
 {
 	if (chip->info->ops->pot_clear)
@@ -2263,6 +2271,10 @@ static int mv88e6xxx_setup(struct dsa_switch *ds)
 	if (err)
 		goto unlock;
 
+	err = mv88e6xxx_rmu_setup(chip);
+	if (err)
+		goto unlock;
+
 	err = mv88e6xxx_rsvd2cpu_setup(chip);
 	if (err)
 		goto unlock;
@@ -2530,6 +2542,7 @@ static const struct mv88e6xxx_ops mv88e6085_ops = {
 	.ppu_enable = mv88e6185_g1_ppu_enable,
 	.ppu_disable = mv88e6185_g1_ppu_disable,
 	.reset = mv88e6185_g1_reset,
+	.rmu_disable = mv88e6085_g1_rmu_disable,
 	.vtu_getnext = mv88e6352_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6352_g1_vtu_loadpurge,
 	.serdes_power = mv88e6341_serdes_power,
@@ -2588,6 +2601,7 @@ static const struct mv88e6xxx_ops mv88e6097_ops = {
 	.mgmt_rsvd2cpu = mv88e6352_g2_mgmt_rsvd2cpu,
 	.pot_clear = mv88e6xxx_g2_pot_clear,
 	.reset = mv88e6352_g1_reset,
+	.rmu_disable = mv88e6085_g1_rmu_disable,
 	.vtu_getnext = mv88e6352_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6352_g1_vtu_loadpurge,
 };
@@ -2815,6 +2829,7 @@ static const struct mv88e6xxx_ops mv88e6172_ops = {
 	.mgmt_rsvd2cpu = mv88e6352_g2_mgmt_rsvd2cpu,
 	.pot_clear = mv88e6xxx_g2_pot_clear,
 	.reset = mv88e6352_g1_reset,
+	.rmu_disable = mv88e6352_g1_rmu_disable,
 	.vtu_getnext = mv88e6352_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6352_g1_vtu_loadpurge,
 	.serdes_power = mv88e6352_serdes_power,
@@ -2888,6 +2903,7 @@ static const struct mv88e6xxx_ops mv88e6176_ops = {
 	.mgmt_rsvd2cpu = mv88e6352_g2_mgmt_rsvd2cpu,
 	.pot_clear = mv88e6xxx_g2_pot_clear,
 	.reset = mv88e6352_g1_reset,
+	.rmu_disable = mv88e6352_g1_rmu_disable,
 	.vtu_getnext = mv88e6352_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6352_g1_vtu_loadpurge,
 	.serdes_power = mv88e6352_serdes_power,
@@ -2953,6 +2969,7 @@ static const struct mv88e6xxx_ops mv88e6190_ops = {
 	.mgmt_rsvd2cpu = mv88e6390_g1_mgmt_rsvd2cpu,
 	.pot_clear = mv88e6xxx_g2_pot_clear,
 	.reset = mv88e6352_g1_reset,
+	.rmu_disable = mv88e6390_g1_rmu_disable,
 	.vtu_getnext = mv88e6390_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6390_g1_vtu_loadpurge,
 	.serdes_power = mv88e6390_serdes_power,
@@ -2989,6 +3006,7 @@ static const struct mv88e6xxx_ops mv88e6190x_ops = {
 	.mgmt_rsvd2cpu = mv88e6390_g1_mgmt_rsvd2cpu,
 	.pot_clear = mv88e6xxx_g2_pot_clear,
 	.reset = mv88e6352_g1_reset,
+	.rmu_disable = mv88e6390_g1_rmu_disable,
 	.vtu_getnext = mv88e6390_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6390_g1_vtu_loadpurge,
 	.serdes_power = mv88e6390_serdes_power,
@@ -3025,6 +3043,7 @@ static const struct mv88e6xxx_ops mv88e6191_ops = {
 	.mgmt_rsvd2cpu = mv88e6390_g1_mgmt_rsvd2cpu,
 	.pot_clear = mv88e6xxx_g2_pot_clear,
 	.reset = mv88e6352_g1_reset,
+	.rmu_disable = mv88e6390_g1_rmu_disable,
 	.vtu_getnext = mv88e6390_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6390_g1_vtu_loadpurge,
 	.serdes_power = mv88e6390_serdes_power,
@@ -3062,6 +3081,7 @@ static const struct mv88e6xxx_ops mv88e6240_ops = {
 	.mgmt_rsvd2cpu = mv88e6352_g2_mgmt_rsvd2cpu,
 	.pot_clear = mv88e6xxx_g2_pot_clear,
 	.reset = mv88e6352_g1_reset,
+	.rmu_disable = mv88e6352_g1_rmu_disable,
 	.vtu_getnext = mv88e6352_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6352_g1_vtu_loadpurge,
 	.serdes_power = mv88e6352_serdes_power,
@@ -3100,6 +3120,7 @@ static const struct mv88e6xxx_ops mv88e6290_ops = {
 	.mgmt_rsvd2cpu = mv88e6390_g1_mgmt_rsvd2cpu,
 	.pot_clear = mv88e6xxx_g2_pot_clear,
 	.reset = mv88e6352_g1_reset,
+	.rmu_disable = mv88e6390_g1_rmu_disable,
 	.vtu_getnext = mv88e6390_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6390_g1_vtu_loadpurge,
 	.serdes_power = mv88e6390_serdes_power,
@@ -3316,6 +3337,7 @@ static const struct mv88e6xxx_ops mv88e6352_ops = {
 	.mgmt_rsvd2cpu = mv88e6352_g2_mgmt_rsvd2cpu,
 	.pot_clear = mv88e6xxx_g2_pot_clear,
 	.reset = mv88e6352_g1_reset,
+	.rmu_disable = mv88e6352_g1_rmu_disable,
 	.vtu_getnext = mv88e6352_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6352_g1_vtu_loadpurge,
 	.serdes_power = mv88e6352_serdes_power,
@@ -3359,6 +3381,7 @@ static const struct mv88e6xxx_ops mv88e6390_ops = {
 	.mgmt_rsvd2cpu = mv88e6390_g1_mgmt_rsvd2cpu,
 	.pot_clear = mv88e6xxx_g2_pot_clear,
 	.reset = mv88e6352_g1_reset,
+	.rmu_disable = mv88e6390_g1_rmu_disable,
 	.vtu_getnext = mv88e6390_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6390_g1_vtu_loadpurge,
 	.serdes_power = mv88e6390_serdes_power,
@@ -3399,6 +3422,7 @@ static const struct mv88e6xxx_ops mv88e6390x_ops = {
 	.mgmt_rsvd2cpu = mv88e6390_g1_mgmt_rsvd2cpu,
 	.pot_clear = mv88e6xxx_g2_pot_clear,
 	.reset = mv88e6352_g1_reset,
+	.rmu_disable = mv88e6390_g1_rmu_disable,
 	.vtu_getnext = mv88e6390_g1_vtu_getnext,
 	.vtu_loadpurge = mv88e6390_g1_vtu_loadpurge,
 	.serdes_power = mv88e6390_serdes_power,
diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h
index 62234c2287a2..a1bedb0a888b 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.h
+++ b/drivers/net/dsa/mv88e6xxx/chip.h
@@ -432,6 +432,9 @@ struct mv88e6xxx_ops {
 
 	/* Interface to the AVB/PTP registers */
 	const struct mv88e6xxx_avb_ops *avb_ops;
+
+	/* Remote Management Unit operations */
+	int (*rmu_disable)(struct mv88e6xxx_chip *chip);
 };
 
 struct mv88e6xxx_irq_ops {
diff --git a/drivers/net/dsa/mv88e6xxx/global1.c b/drivers/net/dsa/mv88e6xxx/global1.c
index 89c6330c53eb..244ee1ff9edc 100644
--- a/drivers/net/dsa/mv88e6xxx/global1.c
+++ b/drivers/net/dsa/mv88e6xxx/global1.c
@@ -373,6 +373,24 @@ int mv88e6185_g1_set_cascade_port(struct mv88e6xxx_chip *chip, int port)
 	return mv88e6xxx_g1_ctl2_mask(chip, mask, port << __bf_shf(mask));
 }
 
+int mv88e6085_g1_rmu_disable(struct mv88e6xxx_chip *chip)
+{
+	return mv88e6xxx_g1_ctl2_mask(chip, MV88E6085_G1_CTL2_P10RM |
+				      MV88E6085_G1_CTL2_RM_ENABLE, 0);
+}
+
+int mv88e6352_g1_rmu_disable(struct mv88e6xxx_chip *chip)
+{
+	return mv88e6xxx_g1_ctl2_mask(chip, MV88E6352_G1_CTL2_RMU_MODE_MASK,
+				      MV88E6352_G1_CTL2_RMU_MODE_DISABLED);
+}
+
+int mv88e6390_g1_rmu_disable(struct mv88e6xxx_chip *chip)
+{
+	return mv88e6xxx_g1_ctl2_mask(chip, MV88E6390_G1_CTL2_RMU_MODE_MASK,
+				      MV88E6390_G1_CTL2_RMU_MODE_DISABLED);
+}
+
 int mv88e6390_g1_stats_set_histogram(struct mv88e6xxx_chip *chip)
 {
 	u16 val;
diff --git a/drivers/net/dsa/mv88e6xxx/global1.h b/drivers/net/dsa/mv88e6xxx/global1.h
index 8b043e813761..e186a026e1b1 100644
--- a/drivers/net/dsa/mv88e6xxx/global1.h
+++ b/drivers/net/dsa/mv88e6xxx/global1.h
@@ -207,6 +207,22 @@
 #define MV88E6185_G1_CTL2_CASCADE_PORT_MASK	0xf000
 #define MV88E6185_G1_CTL2_CASCADE_PORT_NONE	0xe000
 #define MV88E6185_G1_CTL2_CASCADE_PORT_MULTI	0xf000
+#define MV88E6352_G1_CTL2_RMU_MODE_MASK		0x3000
+#define MV88E6352_G1_CTL2_RMU_MODE_DISABLED	0x0000
+#define MV88E6352_G1_CTL2_RMU_MODE_PORT_4	0x1000
+#define MV88E6352_G1_CTL2_RMU_MODE_PORT_5	0x2000
+#define MV88E6352_G1_CTL2_RMU_MODE_PORT_6	0x3000
+#define MV88E6085_G1_CTL2_DA_CHECK		0x4000
+#define MV88E6085_G1_CTL2_P10RM			0x2000
+#define MV88E6085_G1_CTL2_RM_ENABLE		0x1000
+#define MV88E6352_G1_CTL2_DA_CHECK		0x0800
+#define MV88E6390_G1_CTL2_RMU_MODE_MASK		0x0700
+#define MV88E6390_G1_CTL2_RMU_MODE_PORT_0	0x0000
+#define MV88E6390_G1_CTL2_RMU_MODE_PORT_1	0x0100
+#define MV88E6390_G1_CTL2_RMU_MODE_PORT_9	0x0200
+#define MV88E6390_G1_CTL2_RMU_MODE_PORT_10	0x0300
+#define MV88E6390_G1_CTL2_RMU_MODE_ALL_DSA	0x0600
+#define MV88E6390_G1_CTL2_RMU_MODE_DISABLED	0x0700
 #define MV88E6XXX_G1_CTL2_DEVICE_NUMBER_MASK	0x001f
 
 /* Offset 0x1D: Stats Operation Register */
@@ -257,6 +273,10 @@ int mv88e6390_g1_mgmt_rsvd2cpu(struct mv88e6xxx_chip *chip);
 
 int mv88e6185_g1_set_cascade_port(struct mv88e6xxx_chip *chip, int port);
 
+int mv88e6085_g1_rmu_disable(struct mv88e6xxx_chip *chip);
+int mv88e6352_g1_rmu_disable(struct mv88e6xxx_chip *chip);
+int mv88e6390_g1_rmu_disable(struct mv88e6xxx_chip *chip);
+
 int mv88e6xxx_g1_set_device_number(struct mv88e6xxx_chip *chip, int index);
 
 int mv88e6xxx_g1_atu_set_learn2all(struct mv88e6xxx_chip *chip, bool learn2all);
-- 
cgit v1.2.3-59-g8ed1b


From 54472edff0afa09e35828b2b81b00695a9da1c17 Mon Sep 17 00:00:00 2001
From: Sekhar Nori <nsekhar@ti.com>
Date: Wed, 9 May 2018 21:15:15 +0530
Subject: drivers: net: davinci_mdio: prevent spurious timeout

A well timed kernel preemption in the time_after() loop
in wait_for_idle() can result in a spurious timeout
error to be returned.

Fix it by using readl_poll_timeout() which takes care of
this issue.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
Reviewed-by: Grygorii Strashko <grygorii.strashko@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/davinci_mdio.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/ti/davinci_mdio.c b/drivers/net/ethernet/ti/davinci_mdio.c
index 3c33f4504d8e..98a1c97fb95e 100644
--- a/drivers/net/ethernet/ti/davinci_mdio.c
+++ b/drivers/net/ethernet/ti/davinci_mdio.c
@@ -34,6 +34,7 @@
 #include <linux/clk.h>
 #include <linux/err.h>
 #include <linux/io.h>
+#include <linux/iopoll.h>
 #include <linux/pm_runtime.h>
 #include <linux/davinci_emac.h>
 #include <linux/of.h>
@@ -227,14 +228,14 @@ static inline int wait_for_user_access(struct davinci_mdio_data *data)
 static inline int wait_for_idle(struct davinci_mdio_data *data)
 {
 	struct davinci_mdio_regs __iomem *regs = data->regs;
-	unsigned long timeout = jiffies + msecs_to_jiffies(MDIO_TIMEOUT);
+	u32 val, ret;
 
-	while (time_after(timeout, jiffies)) {
-		if (__raw_readl(&regs->control) & CONTROL_IDLE)
-			return 0;
-	}
-	dev_err(data->dev, "timed out waiting for idle\n");
-	return -ETIMEDOUT;
+	ret = readl_poll_timeout(&regs->control, val, val & CONTROL_IDLE,
+				 0, MDIO_TIMEOUT * 1000);
+	if (ret)
+		dev_err(data->dev, "timed out waiting for idle\n");
+
+	return ret;
 }
 
 static int davinci_mdio_read(struct mii_bus *bus, int phy_id, int phy_reg)
-- 
cgit v1.2.3-59-g8ed1b


From bfcbcb67e9611cd27da84810f09c57cbe89aa687 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Wed, 9 May 2018 09:00:07 -0700
Subject: hv_netvsc: typo in NDIS RSS parameters structure

Fix simple misspelling kashkey_offset should be hashkey_offset.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/hyperv_net.h   | 2 +-
 drivers/net/hyperv/rndis_filter.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 6ebe39a3dde6..1be34d2e3563 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -110,7 +110,7 @@ struct ndis_recv_scale_param { /* NDIS_RECEIVE_SCALE_PARAMETERS */
 	u16 hashkey_size;
 
 	/* The offset of the secret key from the beginning of this structure */
-	u32 kashkey_offset;
+	u32 hashkey_offset;
 
 	u32 processor_masks_offset;
 	u32 num_processor_masks;
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index 3b6dbacaf77d..3cb2c4666b2c 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -752,7 +752,7 @@ int rndis_filter_set_rss_param(struct rndis_device *rdev,
 	rssp->indirect_tabsize = 4*ITAB_NUM;
 	rssp->indirect_taboffset = sizeof(struct ndis_recv_scale_param);
 	rssp->hashkey_size = NETVSC_HASH_KEYLEN;
-	rssp->kashkey_offset = rssp->indirect_taboffset +
+	rssp->hashkey_offset = rssp->indirect_taboffset +
 			       rssp->indirect_tabsize;
 
 	/* Set indirection table entries */
@@ -761,7 +761,7 @@ int rndis_filter_set_rss_param(struct rndis_device *rdev,
 		itab[i] = rdev->rx_table[i];
 
 	/* Set hask key values */
-	keyp = (u8 *)((unsigned long)rssp + rssp->kashkey_offset);
+	keyp = (u8 *)((unsigned long)rssp + rssp->hashkey_offset);
 	memcpy(keyp, rss_key, NETVSC_HASH_KEYLEN);
 
 	ret = rndis_filter_send_request(rdev, request);
-- 
cgit v1.2.3-59-g8ed1b


From f09555ffe379aeef7fa83c459bbe29e8bce8ec50 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Wed, 9 May 2018 17:24:38 +0100
Subject: net: hns3: Fix for setting mac address when resetting

When hns3_init_mac_addr is called during reset process, it will
get the mac address from NCL_CONFIG and set it to hardware. If
user has changed the mac address, then the mac address set by
user is lost during resetting.

This patch fixes it by not getting the mac address from NCL_CONFIG
when resetting.

Fixes: 424eb834a9be ("net: hns3: Unified HNS3 {VF|PF} Ethernet Driver for hip08 SoC")
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 729bcab947df..a55c8f515e99 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -3046,13 +3046,13 @@ int hns3_uninit_all_ring(struct hns3_nic_priv *priv)
 }
 
 /* Set mac addr if it is configured. or leave it to the AE driver */
-static void hns3_init_mac_addr(struct net_device *netdev)
+static void hns3_init_mac_addr(struct net_device *netdev, bool init)
 {
 	struct hns3_nic_priv *priv = netdev_priv(netdev);
 	struct hnae3_handle *h = priv->ae_handle;
 	u8 mac_addr_temp[ETH_ALEN];
 
-	if (h->ae_algo->ops->get_mac_addr) {
+	if (h->ae_algo->ops->get_mac_addr && init) {
 		h->ae_algo->ops->get_mac_addr(h, mac_addr_temp);
 		ether_addr_copy(netdev->dev_addr, mac_addr_temp);
 	}
@@ -3106,7 +3106,7 @@ static int hns3_client_init(struct hnae3_handle *handle)
 	handle->kinfo.netdev = netdev;
 	handle->priv = (void *)priv;
 
-	hns3_init_mac_addr(netdev);
+	hns3_init_mac_addr(netdev, true);
 
 	hns3_set_default_feature(netdev);
 
@@ -3353,7 +3353,7 @@ static int hns3_reset_notify_init_enet(struct hnae3_handle *handle)
 	struct hns3_nic_priv *priv = netdev_priv(netdev);
 	int ret;
 
-	hns3_init_mac_addr(netdev);
+	hns3_init_mac_addr(netdev, false);
 	hns3_nic_set_rx_mode(netdev);
 	hns3_recover_hw_addr(netdev);
 
-- 
cgit v1.2.3-59-g8ed1b


From 0a78a1dfd17a05d377115d6352d12d5507a27994 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Wed, 9 May 2018 17:24:39 +0100
Subject: net: hns3: remove add/del_tunnel_udp in hns3_enet module

The add/del_tunnel_udp is not implemented in hclge_main moulde,
the NETIF_F_RX_UDP_TUNNEL_PORT feature bit is added automatically
by stack when ndo_udp_tunnel_add is not null in dev->netdev_ops.

This patch removes the add/del_tunnel_udp related function, for
we do not support this feature now.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hnae3.h     |  7 --
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 89 -------------------------
 2 files changed, 96 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
index 37ec1b3286c6..804ea83e1713 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -273,10 +273,6 @@ struct hnae3_ae_dev {
  *   Map rings to vector
  * unmap_ring_from_vector()
  *   Unmap rings from vector
- * add_tunnel_udp()
- *   Add tunnel information to hardware
- * del_tunnel_udp()
- *   Delete tunnel information from hardware
  * reset_queue()
  *   Reset queue
  * get_fw_version()
@@ -388,9 +384,6 @@ struct hnae3_ae_ops {
 				      int vector_num,
 				      struct hnae3_ring_chain_node *vr_chain);
 
-	int (*add_tunnel_udp)(struct hnae3_handle *handle, u16 port_num);
-	int (*del_tunnel_udp)(struct hnae3_handle *handle, u16 port_num);
-
 	void (*reset_queue)(struct hnae3_handle *handle, u16 queue_id);
 	u32 (*get_fw_version)(struct hnae3_handle *handle);
 	void (*get_mdix_mode)(struct hnae3_handle *handle,
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index a55c8f515e99..bd8e14b53889 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -1244,93 +1244,6 @@ static void hns3_nic_get_stats64(struct net_device *netdev,
 	stats->tx_compressed = netdev->stats.tx_compressed;
 }
 
-static void hns3_add_tunnel_port(struct net_device *netdev, u16 port,
-				 enum hns3_udp_tnl_type type)
-{
-	struct hns3_nic_priv *priv = netdev_priv(netdev);
-	struct hns3_udp_tunnel *udp_tnl = &priv->udp_tnl[type];
-	struct hnae3_handle *h = priv->ae_handle;
-
-	if (udp_tnl->used && udp_tnl->dst_port == port) {
-		udp_tnl->used++;
-		return;
-	}
-
-	if (udp_tnl->used) {
-		netdev_warn(netdev,
-			    "UDP tunnel [%d], port [%d] offload\n", type, port);
-		return;
-	}
-
-	udp_tnl->dst_port = port;
-	udp_tnl->used = 1;
-	/* TBD send command to hardware to add port */
-	if (h->ae_algo->ops->add_tunnel_udp)
-		h->ae_algo->ops->add_tunnel_udp(h, port);
-}
-
-static void hns3_del_tunnel_port(struct net_device *netdev, u16 port,
-				 enum hns3_udp_tnl_type type)
-{
-	struct hns3_nic_priv *priv = netdev_priv(netdev);
-	struct hns3_udp_tunnel *udp_tnl = &priv->udp_tnl[type];
-	struct hnae3_handle *h = priv->ae_handle;
-
-	if (!udp_tnl->used || udp_tnl->dst_port != port) {
-		netdev_warn(netdev,
-			    "Invalid UDP tunnel port %d\n", port);
-		return;
-	}
-
-	udp_tnl->used--;
-	if (udp_tnl->used)
-		return;
-
-	udp_tnl->dst_port = 0;
-	/* TBD send command to hardware to del port  */
-	if (h->ae_algo->ops->del_tunnel_udp)
-		h->ae_algo->ops->del_tunnel_udp(h, port);
-}
-
-/* hns3_nic_udp_tunnel_add - Get notifiacetion about UDP tunnel ports
- * @netdev: This physical ports's netdev
- * @ti: Tunnel information
- */
-static void hns3_nic_udp_tunnel_add(struct net_device *netdev,
-				    struct udp_tunnel_info *ti)
-{
-	u16 port_n = ntohs(ti->port);
-
-	switch (ti->type) {
-	case UDP_TUNNEL_TYPE_VXLAN:
-		hns3_add_tunnel_port(netdev, port_n, HNS3_UDP_TNL_VXLAN);
-		break;
-	case UDP_TUNNEL_TYPE_GENEVE:
-		hns3_add_tunnel_port(netdev, port_n, HNS3_UDP_TNL_GENEVE);
-		break;
-	default:
-		netdev_err(netdev, "unsupported tunnel type %d\n", ti->type);
-		break;
-	}
-}
-
-static void hns3_nic_udp_tunnel_del(struct net_device *netdev,
-				    struct udp_tunnel_info *ti)
-{
-	u16 port_n = ntohs(ti->port);
-
-	switch (ti->type) {
-	case UDP_TUNNEL_TYPE_VXLAN:
-		hns3_del_tunnel_port(netdev, port_n, HNS3_UDP_TNL_VXLAN);
-		break;
-	case UDP_TUNNEL_TYPE_GENEVE:
-		hns3_del_tunnel_port(netdev, port_n, HNS3_UDP_TNL_GENEVE);
-		break;
-	default:
-		break;
-	}
-}
-
 static int hns3_setup_tc(struct net_device *netdev, void *type_data)
 {
 	struct tc_mqprio_qopt_offload *mqprio_qopt = type_data;
@@ -1569,8 +1482,6 @@ static const struct net_device_ops hns3_nic_netdev_ops = {
 	.ndo_get_stats64	= hns3_nic_get_stats64,
 	.ndo_setup_tc		= hns3_nic_setup_tc,
 	.ndo_set_rx_mode	= hns3_nic_set_rx_mode,
-	.ndo_udp_tunnel_add	= hns3_nic_udp_tunnel_add,
-	.ndo_udp_tunnel_del	= hns3_nic_udp_tunnel_del,
 	.ndo_vlan_rx_add_vid	= hns3_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid	= hns3_vlan_rx_kill_vid,
 	.ndo_set_vf_vlan	= hns3_ndo_set_vf_vlan,
-- 
cgit v1.2.3-59-g8ed1b


From beebca3a911051a5f4d359ed64b0375a83d42490 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Wed, 9 May 2018 17:24:40 +0100
Subject: net: hns3: fix for cleaning ring problem

The head or tail in hardware is not longer valid when resetting,
current hns3_clear_all_ring use them to clean the ring, which
will cause problem during resetting.

This patch fixes it by using next_to_use and next_to_clean in
the ring struct.

Fixes: 76ad4f0ee747 ("net: hns3: Add support of HNS3 Ethernet Driver for hip08 SoC")
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 34 ++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index bd8e14b53889..4031174181a0 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -3203,9 +3203,35 @@ static void hns3_recover_hw_addr(struct net_device *ndev)
 		hns3_nic_mc_sync(ndev, ha->addr);
 }
 
-static void hns3_drop_skb_data(struct hns3_enet_ring *ring, struct sk_buff *skb)
+static void hns3_clear_tx_ring(struct hns3_enet_ring *ring)
 {
-	dev_kfree_skb_any(skb);
+	if (!HNAE3_IS_TX_RING(ring))
+		return;
+
+	while (ring->next_to_clean != ring->next_to_use) {
+		hns3_free_buffer_detach(ring, ring->next_to_clean);
+		ring_ptr_move_fw(ring, next_to_clean);
+	}
+}
+
+static void hns3_clear_rx_ring(struct hns3_enet_ring *ring)
+{
+	if (HNAE3_IS_TX_RING(ring))
+		return;
+
+	while (ring->next_to_use != ring->next_to_clean) {
+		/* When a buffer is not reused, it's memory has been
+		 * freed in hns3_handle_rx_bd or will be freed by
+		 * stack, so only need to unmap the buffer here.
+		 */
+		if (!ring->desc_cb[ring->next_to_use].reuse_flag) {
+			hns3_unmap_buffer(ring,
+					  &ring->desc_cb[ring->next_to_use]);
+			ring->desc_cb[ring->next_to_use].dma = 0;
+		}
+
+		ring_ptr_move_fw(ring, next_to_use);
+	}
 }
 
 static void hns3_clear_all_ring(struct hnae3_handle *h)
@@ -3219,13 +3245,13 @@ static void hns3_clear_all_ring(struct hnae3_handle *h)
 		struct hns3_enet_ring *ring;
 
 		ring = priv->ring_data[i].ring;
-		hns3_clean_tx_ring(ring, ring->desc_num);
+		hns3_clear_tx_ring(ring);
 		dev_queue = netdev_get_tx_queue(ndev,
 						priv->ring_data[i].queue_index);
 		netdev_tx_reset_queue(dev_queue);
 
 		ring = priv->ring_data[i + h->kinfo.num_tqps].ring;
-		hns3_clean_rx_ring(ring, ring->desc_num, hns3_drop_skb_data);
+		hns3_clear_rx_ring(ring);
 	}
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From e4d68dae43fbed0132472061d9f1ab01ef4e3efe Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Wed, 9 May 2018 17:24:41 +0100
Subject: net: hns3: refactor the loopback related function

This patch refactors the loopback related function in order
to support the serdes loopback.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 21 +++----
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    | 68 +++++++++++-----------
 2 files changed, 42 insertions(+), 47 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
index eb3c34f3cf87..c16bb6cb0564 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
@@ -74,7 +74,7 @@ struct hns3_link_mode_mapping {
 	u32 ethtool_link_mode;
 };
 
-static int hns3_lp_setup(struct net_device *ndev, enum hnae3_loop loop)
+static int hns3_lp_setup(struct net_device *ndev, enum hnae3_loop loop, bool en)
 {
 	struct hnae3_handle *h = hns3_get_handle(ndev);
 	int ret;
@@ -85,11 +85,7 @@ static int hns3_lp_setup(struct net_device *ndev, enum hnae3_loop loop)
 
 	switch (loop) {
 	case HNAE3_MAC_INTER_LOOP_MAC:
-		ret = h->ae_algo->ops->set_loopback(h, loop, true);
-		break;
-	case HNAE3_MAC_LOOP_NONE:
-		ret = h->ae_algo->ops->set_loopback(h,
-			HNAE3_MAC_INTER_LOOP_MAC, false);
+		ret = h->ae_algo->ops->set_loopback(h, loop, en);
 		break;
 	default:
 		ret = -ENOTSUPP;
@@ -99,10 +95,7 @@ static int hns3_lp_setup(struct net_device *ndev, enum hnae3_loop loop)
 	if (ret)
 		return ret;
 
-	if (loop == HNAE3_MAC_LOOP_NONE)
-		h->ae_algo->ops->set_promisc_mode(h, ndev->flags & IFF_PROMISC);
-	else
-		h->ae_algo->ops->set_promisc_mode(h, 1);
+	h->ae_algo->ops->set_promisc_mode(h, en);
 
 	return ret;
 }
@@ -122,13 +115,13 @@ static int hns3_lp_up(struct net_device *ndev, enum hnae3_loop loop_mode)
 		return ret;
 	}
 
-	ret = hns3_lp_setup(ndev, loop_mode);
+	ret = hns3_lp_setup(ndev, loop_mode, true);
 	usleep_range(10000, 20000);
 
 	return ret;
 }
 
-static int hns3_lp_down(struct net_device *ndev)
+static int hns3_lp_down(struct net_device *ndev, enum hnae3_loop loop_mode)
 {
 	struct hnae3_handle *h = hns3_get_handle(ndev);
 	int ret;
@@ -136,7 +129,7 @@ static int hns3_lp_down(struct net_device *ndev)
 	if (!h->ae_algo->ops->stop)
 		return -EOPNOTSUPP;
 
-	ret = hns3_lp_setup(ndev, HNAE3_MAC_LOOP_NONE);
+	ret = hns3_lp_setup(ndev, loop_mode, false);
 	if (ret) {
 		netdev_err(ndev, "lb_setup return error: %d\n", ret);
 		return ret;
@@ -332,7 +325,7 @@ static void hns3_self_test(struct net_device *ndev,
 		data[test_index] = hns3_lp_up(ndev, loop_type);
 		if (!data[test_index]) {
 			data[test_index] = hns3_lp_run_test(ndev, loop_type);
-			hns3_lp_down(ndev);
+			hns3_lp_down(ndev, loop_type);
 		}
 
 		if (data[test_index])
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 084b90437e23..316ec8427891 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -3682,48 +3682,50 @@ static void hclge_cfg_mac_mode(struct hclge_dev *hdev, bool enable)
 			"mac enable fail, ret =%d.\n", ret);
 }
 
-static int hclge_set_loopback(struct hnae3_handle *handle,
-			      enum hnae3_loop loop_mode, bool en)
+static int hclge_set_mac_loopback(struct hclge_dev *hdev, bool en)
 {
-	struct hclge_vport *vport = hclge_get_vport(handle);
 	struct hclge_config_mac_mode_cmd *req;
-	struct hclge_dev *hdev = vport->back;
 	struct hclge_desc desc;
 	u32 loop_en;
 	int ret;
 
-	switch (loop_mode) {
-	case HNAE3_MAC_INTER_LOOP_MAC:
-		req = (struct hclge_config_mac_mode_cmd *)&desc.data[0];
-		/* 1 Read out the MAC mode config at first */
-		hclge_cmd_setup_basic_desc(&desc,
-					   HCLGE_OPC_CONFIG_MAC_MODE,
-					   true);
-		ret = hclge_cmd_send(&hdev->hw, &desc, 1);
-		if (ret) {
-			dev_err(&hdev->pdev->dev,
-				"mac loopback get fail, ret =%d.\n",
-				ret);
-			return ret;
-		}
+	req = (struct hclge_config_mac_mode_cmd *)&desc.data[0];
+	/* 1 Read out the MAC mode config at first */
+	hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CONFIG_MAC_MODE, true);
+	ret = hclge_cmd_send(&hdev->hw, &desc, 1);
+	if (ret) {
+		dev_err(&hdev->pdev->dev,
+			"mac loopback get fail, ret =%d.\n", ret);
+		return ret;
+	}
 
-		/* 2 Then setup the loopback flag */
-		loop_en = le32_to_cpu(req->txrx_pad_fcs_loop_en);
-		if (en)
-			hnae_set_bit(loop_en, HCLGE_MAC_APP_LP_B, 1);
-		else
-			hnae_set_bit(loop_en, HCLGE_MAC_APP_LP_B, 0);
+	/* 2 Then setup the loopback flag */
+	loop_en = le32_to_cpu(req->txrx_pad_fcs_loop_en);
+	hnae_set_bit(loop_en, HCLGE_MAC_APP_LP_B, en ? 1 : 0);
 
-		req->txrx_pad_fcs_loop_en = cpu_to_le32(loop_en);
+	req->txrx_pad_fcs_loop_en = cpu_to_le32(loop_en);
 
-		/* 3 Config mac work mode with loopback flag
-		 * and its original configure parameters
-		 */
-		hclge_cmd_reuse_desc(&desc, false);
-		ret = hclge_cmd_send(&hdev->hw, &desc, 1);
-		if (ret)
-			dev_err(&hdev->pdev->dev,
-				"mac loopback set fail, ret =%d.\n", ret);
+	/* 3 Config mac work mode with loopback flag
+	 * and its original configure parameters
+	 */
+	hclge_cmd_reuse_desc(&desc, false);
+	ret = hclge_cmd_send(&hdev->hw, &desc, 1);
+	if (ret)
+		dev_err(&hdev->pdev->dev,
+			"mac loopback set fail, ret =%d.\n", ret);
+	return ret;
+}
+
+static int hclge_set_loopback(struct hnae3_handle *handle,
+			      enum hnae3_loop loop_mode, bool en)
+{
+	struct hclge_vport *vport = hclge_get_vport(handle);
+	struct hclge_dev *hdev = vport->back;
+	int ret;
+
+	switch (loop_mode) {
+	case HNAE3_MAC_INTER_LOOP_MAC:
+		ret = hclge_set_mac_loopback(hdev, en);
 		break;
 	default:
 		ret = -ENOTSUPP;
-- 
cgit v1.2.3-59-g8ed1b


From 9e5750106630113ab112e8044ff7089d3e9c2906 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 9 May 2018 10:05:46 -0700
Subject: net/ipv6: fix lock imbalance in ip6_route_del()

WARNING: lock held when returning to user space!
4.17.0-rc3+ #37 Not tainted

syz-executor1/27662 is leaving the kernel with locks still held!
1 lock held by syz-executor1/27662:
 #0: 00000000f661aee7 (rcu_read_lock){....}, at: ip6_route_del+0xea/0x13f0 net/ipv6/route.c:3206
BUG: scheduling while atomic: syz-executor1/27662/0x00000002
INFO: lockdep is turned off.
Modules linked in:
Kernel panic - not syncing: scheduling while atomic

CPU: 1 PID: 27662 Comm: syz-executor1 Not tainted 4.17.0-rc3+ #37
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1b9/0x294 lib/dump_stack.c:113
 panic+0x22f/0x4de kernel/panic.c:184
 __schedule_bug.cold.85+0xdf/0xdf kernel/sched/core.c:3290
 schedule_debug kernel/sched/core.c:3307 [inline]
 __schedule+0x139e/0x1e30 kernel/sched/core.c:3412
 schedule+0xef/0x430 kernel/sched/core.c:3549
 exit_to_usermode_loop+0x220/0x310 arch/x86/entry/common.c:152
 prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline]
 syscall_return_slowpath arch/x86/entry/common.c:265 [inline]
 do_syscall_64+0x6ac/0x800 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x455979
RSP: 002b:00007fbf4051dc68 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
RAX: 0000000000000000 RBX: 00007fbf4051e6d4 RCX: 0000000000455979
RDX: 00000000200001c0 RSI: 000000000000890c RDI: 0000000000000013
RBP: 000000000072bea0 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff
R13: 00000000000003c8 R14: 00000000006f9b60 R15: 0000000000000000
Dumping ftrace buffer:
   (ftrace buffer empty)
Kernel Offset: disabled
Rebooting in 86400 seconds..

Fixes: 23fb93a4d3f1 ("net/ipv6: Cleanup exception and cache route handling")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: David Ahern <dsahern@gmail.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index daa3662da0ee..af0416701fb2 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3224,8 +3224,10 @@ static int ip6_route_del(struct fib6_config *cfg,
 							      &cfg->fc_src);
 				if (rt_cache) {
 					rc = ip6_del_cached_rt(rt_cache, cfg);
-					if (rc != -ESRCH)
+					if (rc != -ESRCH) {
+						rcu_read_unlock();
 						return rc;
+					}
 				}
 				continue;
 			}
-- 
cgit v1.2.3-59-g8ed1b


From f60475761838478ececbc3d47ab385808441a0a0 Mon Sep 17 00:00:00 2001
From: Felix Manlunas <felix.manlunas@cavium.com>
Date: Wed, 9 May 2018 11:31:31 -0700
Subject: liquidio: monitor all of Octeon's cores in watchdog thread

The liquidio_watchdog kernel thread is watching over only 12 cores of the
Octeon CN23XX; it's neglecting the other 4 cores that are present in the
CN2360.  Fix it by defining LIO_MAX_CORES as 16.

Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/cavium/liquidio/octeon_network.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_network.h b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
index dd3177a526d2..d7a3916fe877 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_network.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
@@ -192,7 +192,7 @@ struct lio {
 #define LIO_SIZE         (sizeof(struct lio))
 #define GET_LIO(netdev)  ((struct lio *)netdev_priv(netdev))
 
-#define LIO_MAX_CORES                12
+#define LIO_MAX_CORES                16
 
 /**
  * \brief Enable or disable feature
-- 
cgit v1.2.3-59-g8ed1b


From 7d0870f661e11b9248c87919bd2ce61ba6a67a82 Mon Sep 17 00:00:00 2001
From: Felix Manlunas <felix.manlunas@cavium.com>
Date: Wed, 9 May 2018 11:49:38 -0700
Subject: liquidio: bump up driver version to 1.7.2 to match newer NIC firmware

Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/cavium/liquidio/liquidio_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
index 285b24836974..690424b6781a 100644
--- a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
+++ b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
@@ -28,7 +28,7 @@
 #define LIQUIDIO_PACKAGE ""
 #define LIQUIDIO_BASE_MAJOR_VERSION 1
 #define LIQUIDIO_BASE_MINOR_VERSION 7
-#define LIQUIDIO_BASE_MICRO_VERSION 0
+#define LIQUIDIO_BASE_MICRO_VERSION 2
 #define LIQUIDIO_BASE_VERSION   __stringify(LIQUIDIO_BASE_MAJOR_VERSION) "." \
 				__stringify(LIQUIDIO_BASE_MINOR_VERSION)
 #define LIQUIDIO_MICRO_VERSION  "." __stringify(LIQUIDIO_BASE_MICRO_VERSION)
-- 
cgit v1.2.3-59-g8ed1b


From 03bdfc001c951cb04ad3d28aecee4ec0e18e9664 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Wed, 9 May 2018 23:24:07 -0700
Subject: net: ipv4: remove define INET_CSK_DEBUG and unnecessary EXPORT_SYMBOL

INET_CSK_DEBUG is always set and only is used for 2 pr_debug calls.

EXPORT_SYMBOL(inet_csk_timer_bug_msg) is only used by these 2
pr_debug calls and is also unnecessary as the exported string can
be used directly by these calls.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_connection_sock.h | 22 ++++------------------
 net/ipv4/inet_connection_sock.c    |  5 -----
 2 files changed, 4 insertions(+), 23 deletions(-)

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 2ab6667275df..0a6c9e0f2b5a 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -23,8 +23,6 @@
 #include <net/inet_sock.h>
 #include <net/request_sock.h>
 
-#define INET_CSK_DEBUG 1
-
 /* Cancel timers, when they are not required. */
 #undef INET_CSK_CLEAR_TIMERS
 
@@ -196,10 +194,6 @@ static inline void inet_csk_delack_init(struct sock *sk)
 void inet_csk_delete_keepalive_timer(struct sock *sk);
 void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long timeout);
 
-#ifdef INET_CSK_DEBUG
-extern const char inet_csk_timer_bug_msg[];
-#endif
-
 static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
@@ -214,12 +208,9 @@ static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what)
 #ifdef INET_CSK_CLEAR_TIMERS
 		sk_stop_timer(sk, &icsk->icsk_delack_timer);
 #endif
+	} else {
+		pr_debug("inet_csk BUG: unknown timer value\n");
 	}
-#ifdef INET_CSK_DEBUG
-	else {
-		pr_debug("%s", inet_csk_timer_bug_msg);
-	}
-#endif
 }
 
 /*
@@ -232,10 +223,8 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
 	if (when > max_when) {
-#ifdef INET_CSK_DEBUG
 		pr_debug("reset_xmit_timer: sk=%p %d when=0x%lx, caller=%p\n",
 			 sk, what, when, current_text_addr());
-#endif
 		when = max_when;
 	}
 
@@ -249,12 +238,9 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
 		icsk->icsk_ack.pending |= ICSK_ACK_TIMER;
 		icsk->icsk_ack.timeout = jiffies + when;
 		sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
+	} else {
+		pr_debug("inet_csk BUG: unknown timer value\n");
 	}
-#ifdef INET_CSK_DEBUG
-	else {
-		pr_debug("%s", inet_csk_timer_bug_msg);
-	}
-#endif
 }
 
 static inline unsigned long
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 881ac6d046f2..33a88e045efd 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -27,11 +27,6 @@
 #include <net/sock_reuseport.h>
 #include <net/addrconf.h>
 
-#ifdef INET_CSK_DEBUG
-const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
-EXPORT_SYMBOL(inet_csk_timer_bug_msg);
-#endif
-
 #if IS_ENABLED(CONFIG_IPV6)
 /* match_wildcard == true:  IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
  *                          only, and any IPv4 addresses if not IPv6 only
-- 
cgit v1.2.3-59-g8ed1b


From 00483690552c5fb6aa30bf3acb75b0ee89b4c0fd Mon Sep 17 00:00:00 2001
From: Jon Maxwell <jmaxwell37@gmail.com>
Date: Thu, 10 May 2018 16:53:51 +1000
Subject: tcp: Add mark for TIMEWAIT sockets

This version has some suggestions by Eric Dumazet:

- Use a local variable for the mark in IPv6 instead of ctl_sk to avoid SMP
races.
- Use the more elegant "IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark"
statement.
- Factorize code as sk_fullsock() check is not necessary.

Aidan McGurn from Openwave Mobility systems reported the following bug:

"Marked routing is broken on customer deployment. Its effects are large
increase in Uplink retransmissions caused by the client never receiving
the final ACK to their FINACK - this ACK misses the mark and routes out
of the incorrect route."

Currently marks are added to sk_buffs for replies when the "fwmark_reflect"
sysctl is enabled. But not for TW sockets that had sk->sk_mark set via
setsockopt(SO_MARK..).

Fix this in IPv4/v6 by adding tw->tw_mark for TIME_WAIT sockets. Copy the the
original sk->sk_mark in __inet_twsk_hashdance() to the new tw->tw_mark location.
Then progate this so that the skb gets sent with the correct mark. Do the same
for resets. Give the "fwmark_reflect" sysctl precedence over sk->sk_mark so that
netfilter rules are still honored.

Signed-off-by: Jon Maxwell <jmaxwell37@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_timewait_sock.h |  1 +
 net/ipv4/ip_output.c             |  2 +-
 net/ipv4/tcp_ipv4.c              | 16 ++++++++++++++--
 net/ipv4/tcp_minisocks.c         |  1 +
 net/ipv6/tcp_ipv6.c              |  6 +++++-
 5 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index c7be1ca8e562..659d8ed5a3bc 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -62,6 +62,7 @@ struct inet_timewait_sock {
 #define tw_dr			__tw_common.skc_tw_dr
 
 	int			tw_timeout;
+	__u32			tw_mark;
 	volatile unsigned char	tw_substate;
 	unsigned char		tw_rcv_wscale;
 
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 95adb171f852..b5e21eb198d8 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1561,7 +1561,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
 		oif = skb->skb_iif;
 
 	flowi4_init_output(&fl4, oif,
-			   IP4_REPLY_MARK(net, skb->mark),
+			   IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark,
 			   RT_TOS(arg->tos),
 			   RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
 			   ip_reply_arg_flowi_flags(arg),
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f70586b50838..caf23de88f8a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -621,6 +621,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 	struct sock *sk1 = NULL;
 #endif
 	struct net *net;
+	struct sock *ctl_sk;
 
 	/* Never send a reset in response to a reset. */
 	if (th->rst)
@@ -723,11 +724,16 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 	arg.tos = ip_hdr(skb)->tos;
 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 	local_bh_disable();
-	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
+	if (sk)
+		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
+				   inet_twsk(sk)->tw_mark : sk->sk_mark;
+	ip_send_unicast_reply(ctl_sk,
 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 			      &arg, arg.iov[0].iov_len);
 
+	ctl_sk->sk_mark = 0;
 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 	local_bh_enable();
@@ -759,6 +765,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
 	} rep;
 	struct net *net = sock_net(sk);
 	struct ip_reply_arg arg;
+	struct sock *ctl_sk;
 
 	memset(&rep.th, 0, sizeof(struct tcphdr));
 	memset(&arg, 0, sizeof(arg));
@@ -809,11 +816,16 @@ static void tcp_v4_send_ack(const struct sock *sk,
 	arg.tos = tos;
 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 	local_bh_disable();
-	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
+	if (sk)
+		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
+				   inet_twsk(sk)->tw_mark : sk->sk_mark;
+	ip_send_unicast_reply(ctl_sk,
 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 			      &arg, arg.iov[0].iov_len);
 
+	ctl_sk->sk_mark = 0;
 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 	local_bh_enable();
 }
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 57b5468b5139..f867658b4b30 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -263,6 +263,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 		struct inet_sock *inet = inet_sk(sk);
 
 		tw->tw_transparent	= inet->transparent;
+		tw->tw_mark		= sk->sk_mark;
 		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
 		tcptw->tw_rcv_nxt	= tp->rcv_nxt;
 		tcptw->tw_snd_nxt	= tp->snd_nxt;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6d664d83cd16..7d47c2b550a9 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -803,6 +803,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
 	unsigned int tot_len = sizeof(struct tcphdr);
 	struct dst_entry *dst;
 	__be32 *topt;
+	__u32 mark = 0;
 
 	if (tsecr)
 		tot_len += TCPOLEN_TSTAMP_ALIGNED;
@@ -871,7 +872,10 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
 		fl6.flowi6_oif = oif;
 	}
 
-	fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark);
+	if (sk)
+		mark = (sk->sk_state == TCP_TIME_WAIT) ?
+			inet_twsk(sk)->tw_mark : sk->sk_mark;
+	fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark;
 	fl6.fl6_dport = t1->dest;
 	fl6.fl6_sport = t1->source;
 	fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
-- 
cgit v1.2.3-59-g8ed1b


From 2b18d79e733fd727378aea7927536fb270c7ccbc Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 10 May 2018 13:13:03 +0300
Subject: net: bridge: Allow bridge master in br_vlan_get_info()

Mirroring offload in mlxsw needs to check that a given VLAN is allowed
to ingress the bridge device. br_vlan_get_info() is the function that is
used for this, however currently it only supports bridge port devices.
Extend it to support bridge masters as well.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_vlan.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index df37a5137c25..dc832c0934c6 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -1176,6 +1176,8 @@ int br_vlan_get_info(const struct net_device *dev, u16 vid,
 	p = br_port_get_check_rtnl(dev);
 	if (p)
 		vg = nbp_vlan_group(p);
+	else if (netif_is_bridge_master(dev))
+		vg = br_vlan_group(netdev_priv(dev));
 	else
 		return -EINVAL;
 
-- 
cgit v1.2.3-59-g8ed1b


From 419476624c89905977aa945aea2f94f7d942caf5 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 10 May 2018 13:13:04 +0300
Subject: mlxsw: reg: Add MLXSW_REG_MPAT_SPAN_TYPE_REMOTE_ETH

Add MLXSW_REG_MPAT_SPAN_TYPE_REMOTE_ETH to support VLAN-encapsulated
port mirroring.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/reg.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h
index 6218231e379e..3f4d7e22cece 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/reg.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h
@@ -6833,6 +6833,12 @@ enum mlxsw_reg_mpat_span_type {
 	 */
 	MLXSW_REG_MPAT_SPAN_TYPE_LOCAL_ETH = 0x0,
 
+	/* Remote SPAN Ethernet VLAN.
+	 * The packet is forwarded to the monitoring port on the monitoring
+	 * VLAN.
+	 */
+	MLXSW_REG_MPAT_SPAN_TYPE_REMOTE_ETH = 0x1,
+
 	/* Encapsulated Remote SPAN Ethernet L3 GRE.
 	 * The packet is encapsulated with GRE header.
 	 */
-- 
cgit v1.2.3-59-g8ed1b


From e00698d1d7957669002a39c039423921d8694033 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 10 May 2018 13:13:05 +0300
Subject: mlxsw: spectrum_span: Support mirror-to-VLAN

Offload "tc action mirred mirror" to a device that is a vlan device on
top of a front-panel port device. The hardware encapsulates the mirrored
packets in a VLAN tag. That includes the case that the mirrored traffic
is already VLAN-tagged--in that case the monitor traffic will be
double-tagged, just like in the software path.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_span.c    | 64 ++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
index cd9071ee19ad..d90582ee478f 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
@@ -235,6 +235,14 @@ mlxsw_sp_span_entry_bridge(const struct net_device *br_dev,
 	return dev;
 }
 
+static struct net_device *
+mlxsw_sp_span_entry_vlan(const struct net_device *vlan_dev,
+			 u16 *p_vid)
+{
+	*p_vid = vlan_dev_vlan_id(vlan_dev);
+	return vlan_dev_real_dev(vlan_dev);
+}
+
 static __maybe_unused int
 mlxsw_sp_span_entry_tunnel_parms_common(struct net_device *l3edev,
 					union mlxsw_sp_l3addr saddr,
@@ -477,6 +485,61 @@ struct mlxsw_sp_span_entry_ops mlxsw_sp_span_entry_ops_gretap6 = {
 };
 #endif
 
+static bool
+mlxsw_sp_span_vlan_can_handle(const struct net_device *dev)
+{
+	return is_vlan_dev(dev) &&
+	       mlxsw_sp_port_dev_check(vlan_dev_real_dev(dev));
+}
+
+static int
+mlxsw_sp_span_entry_vlan_parms(const struct net_device *to_dev,
+			       struct mlxsw_sp_span_parms *sparmsp)
+{
+	struct net_device *real_dev;
+	u16 vid;
+
+	if (!(to_dev->flags & IFF_UP))
+		return mlxsw_sp_span_entry_unoffloadable(sparmsp);
+
+	real_dev = mlxsw_sp_span_entry_vlan(to_dev, &vid);
+	sparmsp->dest_port = netdev_priv(real_dev);
+	sparmsp->vid = vid;
+	return 0;
+}
+
+static int
+mlxsw_sp_span_entry_vlan_configure(struct mlxsw_sp_span_entry *span_entry,
+				   struct mlxsw_sp_span_parms sparms)
+{
+	struct mlxsw_sp_port *dest_port = sparms.dest_port;
+	struct mlxsw_sp *mlxsw_sp = dest_port->mlxsw_sp;
+	u8 local_port = dest_port->local_port;
+	char mpat_pl[MLXSW_REG_MPAT_LEN];
+	int pa_id = span_entry->id;
+
+	mlxsw_reg_mpat_pack(mpat_pl, pa_id, local_port, true,
+			    MLXSW_REG_MPAT_SPAN_TYPE_REMOTE_ETH);
+	mlxsw_reg_mpat_eth_rspan_pack(mpat_pl, sparms.vid);
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mpat), mpat_pl);
+}
+
+static void
+mlxsw_sp_span_entry_vlan_deconfigure(struct mlxsw_sp_span_entry *span_entry)
+{
+	mlxsw_sp_span_entry_deconfigure_common(span_entry,
+					MLXSW_REG_MPAT_SPAN_TYPE_REMOTE_ETH);
+}
+
+static const
+struct mlxsw_sp_span_entry_ops mlxsw_sp_span_entry_ops_vlan = {
+	.can_handle = mlxsw_sp_span_vlan_can_handle,
+	.parms = mlxsw_sp_span_entry_vlan_parms,
+	.configure = mlxsw_sp_span_entry_vlan_configure,
+	.deconfigure = mlxsw_sp_span_entry_vlan_deconfigure,
+};
+
 static const
 struct mlxsw_sp_span_entry_ops *const mlxsw_sp_span_entry_types[] = {
 	&mlxsw_sp_span_entry_ops_phys,
@@ -486,6 +549,7 @@ struct mlxsw_sp_span_entry_ops *const mlxsw_sp_span_entry_types[] = {
 #if IS_ENABLED(CONFIG_IPV6_GRE)
 	&mlxsw_sp_span_entry_ops_gretap6,
 #endif
+	&mlxsw_sp_span_entry_ops_vlan,
 };
 
 static int
-- 
cgit v1.2.3-59-g8ed1b


From 03c44132393e821fb4fbd75b9c9798f13c46eca6 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 10 May 2018 13:13:06 +0300
Subject: mlxsw: spectrum_span: Support VLAN under mirror-to-gretap

When mirroring to a gretap or ip6gretap device, allow the underlay
packet path to include VLAN devices. The following configurations are
supported in underlay:

- vlan over phys
- vlan-unaware bridge where the egress device is vlan over phys
- vlan over vlan-aware bridge where the egress device is phys

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_span.c    | 27 +++++++++++++++-------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
index d90582ee478f..3b77990df599 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
@@ -176,21 +176,23 @@ mlxsw_sp_span_entry_bridge_8021q(const struct net_device *br_dev,
 {
 	struct bridge_vlan_info vinfo;
 	struct net_device *edev;
-	u16 pvid;
+	u16 vid = *p_vid;
 
-	if (WARN_ON(br_vlan_get_pvid(br_dev, &pvid)))
+	if (!vid && WARN_ON(br_vlan_get_pvid(br_dev, &vid)))
 		return NULL;
-	if (!pvid)
+	if (!vid ||
+	    br_vlan_get_info(br_dev, vid, &vinfo) ||
+	    !(vinfo.flags & BRIDGE_VLAN_INFO_BRENTRY))
 		return NULL;
 
-	edev = br_fdb_find_port(br_dev, dmac, pvid);
+	edev = br_fdb_find_port(br_dev, dmac, vid);
 	if (!edev)
 		return NULL;
 
-	if (br_vlan_get_info(edev, pvid, &vinfo))
+	if (br_vlan_get_info(edev, vid, &vinfo))
 		return NULL;
 	if (!(vinfo.flags & BRIDGE_VLAN_INFO_UNTAGGED))
-		*p_vid = pvid;
+		*p_vid = vid;
 	return edev;
 }
 
@@ -208,13 +210,13 @@ mlxsw_sp_span_entry_bridge(const struct net_device *br_dev,
 {
 	struct mlxsw_sp_bridge_port *bridge_port;
 	enum mlxsw_reg_spms_state spms_state;
+	struct net_device *dev = NULL;
 	struct mlxsw_sp_port *port;
-	struct net_device *dev;
 	u8 stp_state;
 
 	if (br_vlan_enabled(br_dev))
 		dev = mlxsw_sp_span_entry_bridge_8021q(br_dev, dmac, p_vid);
-	else
+	else if (!*p_vid)
 		dev = mlxsw_sp_span_entry_bridge_8021d(br_dev, dmac);
 	if (!dev)
 		return NULL;
@@ -261,12 +263,21 @@ mlxsw_sp_span_entry_tunnel_parms_common(struct net_device *l3edev,
 	if (!l3edev || mlxsw_sp_span_dmac(tbl, &gw, l3edev, dmac))
 		goto unoffloadable;
 
+	if (is_vlan_dev(l3edev))
+		l3edev = mlxsw_sp_span_entry_vlan(l3edev, &vid);
+
 	if (netif_is_bridge_master(l3edev)) {
 		l3edev = mlxsw_sp_span_entry_bridge(l3edev, dmac, &vid);
 		if (!l3edev)
 			goto unoffloadable;
 	}
 
+	if (is_vlan_dev(l3edev)) {
+		if (vid || !(l3edev->flags & IFF_UP))
+			goto unoffloadable;
+		l3edev = mlxsw_sp_span_entry_vlan(l3edev, &vid);
+	}
+
 	if (!mlxsw_sp_port_dev_check(l3edev))
 		goto unoffloadable;
 
-- 
cgit v1.2.3-59-g8ed1b


From b3c594ab6fcf7a91453442755deb1f3941eeed64 Mon Sep 17 00:00:00 2001
From: Ganesh Goudar <ganeshgr@chelsio.com>
Date: Thu, 10 May 2018 16:07:23 +0530
Subject: cxgb4: fix the wrong conversion of Mbps to Kbps

fix the wrong conversion where 1 Mbps was converted to
1024 Kbps.

Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 24d2865b8806..1efcc85692f7 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -2886,13 +2886,13 @@ static int cxgb_set_tx_maxrate(struct net_device *dev, int index, u32 rate)
 	}
 
 	/* Convert from Mbps to Kbps */
-	req_rate = rate << 10;
+	req_rate = rate * 1000;
 
 	/* Max rate is 100 Gbps */
-	if (req_rate >= SCHED_MAX_RATE_KBPS) {
+	if (req_rate > SCHED_MAX_RATE_KBPS) {
 		dev_err(adap->pdev_dev,
 			"Invalid rate %u Mbps, Max rate is %u Mbps\n",
-			rate, SCHED_MAX_RATE_KBPS >> 10);
+			rate, SCHED_MAX_RATE_KBPS / 1000);
 		return -ERANGE;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 895262d85799fd4eaee84a89d024611b40330969 Mon Sep 17 00:00:00 2001
From: Boris Pismenny <borisp@mellanox.com>
Date: Thu, 10 May 2018 16:27:25 +0300
Subject: tls: Fix tls_device initialization

Add sg table initialization to fix a BUG_ON encountered when enabling
CONFIG_DEBUG_SG.

Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_device.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index ac45d6224e41..a7a8f8e20ff3 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -604,6 +604,8 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
 	INIT_LIST_HEAD(&offload_ctx->records_list);
 	list_add_tail(&start_marker_record->list, &offload_ctx->records_list);
 	spin_lock_init(&offload_ctx->lock);
+	sg_init_table(offload_ctx->sg_tx_data,
+		      ARRAY_SIZE(offload_ctx->sg_tx_data));
 
 	clean_acked_data_enable(inet_csk(sk), &tls_icsk_clean_acked);
 	ctx->push_pending_record = tls_device_push_pending_record;
-- 
cgit v1.2.3-59-g8ed1b


From ec9efb523cb8daf7b9d2e5c9cb80b255b716a777 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 10 May 2018 15:29:46 +0200
Subject: rocker: Postpone filtering of !added_by_user FDB

Breaking out of the switch in rocker_switchdev_event() still ends up
scheduling work, except an ill-defined one. This leads to an OOPS cited
below. Fix by postponing the check until rocker_switchdev_event_work().

[   23.148476] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
[   23.148810] PGD 0 P4D 0
[   23.148982] Oops: 0000 [#1] PREEMPT SMP PTI
[   23.149190] Modules linked in: bridge stp llc iptable_nat nf_nat_ipv4 nf_nat e1000 rocker
[   23.149768] CPU: 0 PID: 239 Comm: kworker/u2:4 Not tainted 4.17.0-rc3-net_next_queue-custom #6
[   23.150298] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-2.fc27 04/01/2014
[   23.150868] Workqueue: rocker rocker_switchdev_event_work [rocker]
[   23.151258] RIP: 0010:ofdpa_port_fdb+0x7b/0x230 [rocker]
[   23.151597] RSP: 0018:ffffc900004b3e18 EFLAGS: 00010246
[   23.151952] RAX: 00000000fffbc68c RBX: 0000000000000000 RCX: 0000000000000000
[   23.152363] RDX: 0000000000000010 RSI: ffff88003b4471e0 RDI: 00000000ffffffff
[   23.152768] RBP: ffff88003b4471c0 R08: ffff88003b4471e0 R09: ffff88003b4471c0
[   23.153141] R10: 0000000000000000 R11: 0000000000000000 R12: ffff880036caf000
[   23.153515] R13: 0000000000000000 R14: 0000000000000000 R15: ffff88003bc00000
[   23.153919] FS:  0000000000000000(0000) GS:ffff88003fc00000(0000) knlGS:0000000000000000
[   23.154444] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   23.154806] CR2: 0000000000000000 CR3: 0000000036eb6000 CR4: 00000000000006f0
[   23.155194] Call Trace:
[   23.155472]  rocker_switchdev_event_work+0x9b/0xd0 [rocker]
[   23.155850]  ? __schedule+0x231/0x700
[   23.156175]  process_one_work+0x1cf/0x3e0
[   23.156490]  worker_thread+0x26/0x3d0
[   23.156795]  ? trace_event_raw_event_workqueue_execute_start+0x80/0x80
[   23.157181]  kthread+0x10e/0x130
[   23.157485]  ? kthread_create_worker_on_cpu+0x40/0x40
[   23.157824]  ret_from_fork+0x35/0x40
[   23.158174] Code: 00 00 c1 e8 02 4c 8d 45 20 bf ff ff ff ff 83 e0 01 4c 89 65 20 88 45 14 48 8b 05 21 da 1f e2 4c 89 c6 4c 89 44 24 10 48 89 45 18 <41> 8b 45 00 89 45 28 41 0f b7 45 04 66 89 45 2c 0f b7 44 24 04
[   23.159346] RIP: ofdpa_port_fdb+0x7b/0x230 [rocker] RSP: ffffc900004b3e18
[   23.159742] CR2: 0000000000000000
[   23.160088] ---[ end trace f9b16d4cb6df0629 ]---

Fixes: 816a3bed9549 ("switchdev: Add fdb.added_by_user to switchdev notifications")
Suggested-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/rocker/rocker_main.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c
index 152d6948611c..e73e4febeedb 100644
--- a/drivers/net/ethernet/rocker/rocker_main.c
+++ b/drivers/net/ethernet/rocker/rocker_main.c
@@ -2738,6 +2738,8 @@ static void rocker_switchdev_event_work(struct work_struct *work)
 	switch (switchdev_work->event) {
 	case SWITCHDEV_FDB_ADD_TO_DEVICE:
 		fdb_info = &switchdev_work->fdb_info;
+		if (!fdb_info->added_by_user)
+			break;
 		err = rocker_world_port_fdb_add(rocker_port, fdb_info);
 		if (err) {
 			netdev_dbg(rocker_port->dev, "fdb add failed err=%d\n", err);
@@ -2747,6 +2749,8 @@ static void rocker_switchdev_event_work(struct work_struct *work)
 		break;
 	case SWITCHDEV_FDB_DEL_TO_DEVICE:
 		fdb_info = &switchdev_work->fdb_info;
+		if (!fdb_info->added_by_user)
+			break;
 		err = rocker_world_port_fdb_del(rocker_port, fdb_info);
 		if (err)
 			netdev_dbg(rocker_port->dev, "fdb add failed err=%d\n", err);
@@ -2783,8 +2787,6 @@ static int rocker_switchdev_event(struct notifier_block *unused,
 	switch (event) {
 	case SWITCHDEV_FDB_ADD_TO_DEVICE: /* fall through */
 	case SWITCHDEV_FDB_DEL_TO_DEVICE:
-		if (!fdb_info->added_by_user)
-			break;
 		memcpy(&switchdev_work->fdb_info, ptr,
 		       sizeof(switchdev_work->fdb_info));
 		switchdev_work->fdb_info.addr = kzalloc(ETH_ALEN, GFP_ATOMIC);
-- 
cgit v1.2.3-59-g8ed1b


From 68625b7631e040707b24197451a475a3e9197e2a Mon Sep 17 00:00:00 2001
From: Wang YanQing <udknight@gmail.com>
Date: Thu, 10 May 2018 11:09:21 +0800
Subject: bpf, doc: clarification for the meaning of 'id'

For me, as a reader whose mother language isn't English, the
old words bring a little difficulty to catch the meaning, this
patch rewords the subsection in a more clarificatory way.

This patch also add blank lines as separator at two places
to improve readability.

Signed-off-by: Wang YanQing <udknight@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 Documentation/networking/filter.txt | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
index 5032e1263bc9..e6b4ebb2b243 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -1142,6 +1142,7 @@ into a register from memory, the register's top 56 bits are known zero, while
 the low 8 are unknown - which is represented as the tnum (0x0; 0xff).  If we
 then OR this with 0x40, we get (0x40; 0xbf), then if we add 1 we get (0x0;
 0x1ff), because of potential carries.
+
 Besides arithmetic, the register state can also be updated by conditional
 branches.  For instance, if a SCALAR_VALUE is compared > 8, in the 'true' branch
 it will have a umin_value (unsigned minimum value) of 9, whereas in the 'false'
@@ -1150,14 +1151,16 @@ BPF_JSGE) would instead update the signed minimum/maximum values.  Information
 from the signed and unsigned bounds can be combined; for instance if a value is
 first tested < 8 and then tested s> 4, the verifier will conclude that the value
 is also > 4 and s< 8, since the bounds prevent crossing the sign boundary.
+
 PTR_TO_PACKETs with a variable offset part have an 'id', which is common to all
 pointers sharing that same variable offset.  This is important for packet range
-checks: after adding some variable to a packet pointer, if you then copy it to
-another register and (say) add a constant 4, both registers will share the same
-'id' but one will have a fixed offset of +4.  Then if it is bounds-checked and
-found to be less than a PTR_TO_PACKET_END, the other register is now known to
-have a safe range of at least 4 bytes.  See 'Direct packet access', below, for
-more on PTR_TO_PACKET ranges.
+checks: after adding a variable to a packet pointer register A, if you then copy
+it to another register B and then add a constant 4 to A, both registers will
+share the same 'id' but the A will have a fixed offset of +4.  Then if A is
+bounds-checked and found to be less than a PTR_TO_PACKET_END, the register B is
+now known to have a safe range of at least 4 bytes.  See 'Direct packet access',
+below, for more on PTR_TO_PACKET ranges.
+
 The 'id' field is also used on PTR_TO_MAP_VALUE_OR_NULL, common to all copies of
 the pointer returned from a map lookup.  This means that when one copy is
 checked and found to be non-NULL, all copies can become PTR_TO_MAP_VALUEs.
-- 
cgit v1.2.3-59-g8ed1b


From 6454743bc13e7dfd4f2720758ca3fcdea76b82a4 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 9 May 2018 20:34:19 -0700
Subject: net/ipv6: Rename fib6_lookup to fib6_node_lookup

Rename fib6_lookup to fib6_node_lookup to better reflect what it
returns. The fib6_lookup name will be used in a later patch for
an IPv6 equivalent to IPv4's fib_lookup.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/ip6_fib.h |  6 +++---
 net/ipv6/ip6_fib.c    | 14 ++++++++------
 net/ipv6/route.c      |  8 ++++----
 3 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index a3ec08d05756..43ab545e64ea 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,9 +376,9 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup);
 
-struct fib6_node *fib6_lookup(struct fib6_node *root,
-			      const struct in6_addr *daddr,
-			      const struct in6_addr *saddr);
+struct fib6_node *fib6_node_lookup(struct fib6_node *root,
+				   const struct in6_addr *daddr,
+				   const struct in6_addr *saddr);
 
 struct fib6_node *fib6_locate(struct fib6_node *root,
 			      const struct in6_addr *daddr, int dst_len,
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index f0a4262a4789..487faffeae28 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1354,8 +1354,8 @@ struct lookup_args {
 	const struct in6_addr	*addr;		/* search key			*/
 };
 
-static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
-				       struct lookup_args *args)
+static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root,
+					    struct lookup_args *args)
 {
 	struct fib6_node *fn;
 	__be32 dir;
@@ -1400,7 +1400,8 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
 #ifdef CONFIG_IPV6_SUBTREES
 				if (subtree) {
 					struct fib6_node *sfn;
-					sfn = fib6_lookup_1(subtree, args + 1);
+					sfn = fib6_node_lookup_1(subtree,
+								 args + 1);
 					if (!sfn)
 						goto backtrack;
 					fn = sfn;
@@ -1422,8 +1423,9 @@ backtrack:
 
 /* called with rcu_read_lock() held
  */
-struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr,
-			      const struct in6_addr *saddr)
+struct fib6_node *fib6_node_lookup(struct fib6_node *root,
+				   const struct in6_addr *daddr,
+				   const struct in6_addr *saddr)
 {
 	struct fib6_node *fn;
 	struct lookup_args args[] = {
@@ -1442,7 +1444,7 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad
 		}
 	};
 
-	fn = fib6_lookup_1(root, daddr ? args : args + 1);
+	fn = fib6_node_lookup_1(root, daddr ? args : args + 1);
 	if (!fn || fn->fn_flags & RTN_TL_ROOT)
 		fn = root;
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index daa3662da0ee..443d2a0bc150 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1006,7 +1006,7 @@ static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
 		pn = rcu_dereference(fn->parent);
 		sn = FIB6_SUBTREE(pn);
 		if (sn && sn != fn)
-			fn = fib6_lookup(sn, NULL, saddr);
+			fn = fib6_node_lookup(sn, NULL, saddr);
 		else
 			fn = pn;
 		if (fn->fn_flags & RTN_RTINFO)
@@ -1059,7 +1059,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 		flags &= ~RT6_LOOKUP_F_IFACE;
 
 	rcu_read_lock();
-	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 restart:
 	f6i = rcu_dereference(fn->leaf);
 	if (!f6i) {
@@ -1815,7 +1815,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 
 	rcu_read_lock();
 
-	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 	saved_fn = fn;
 
 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
@@ -2425,7 +2425,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
 	 */
 
 	rcu_read_lock();
-	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 restart:
 	for_each_fib6_node_rt_rcu(fn) {
 		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
-- 
cgit v1.2.3-59-g8ed1b


From 3b290a31bbc5969f9193f73d547a6dc8a25c6f9e Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 9 May 2018 20:34:20 -0700
Subject: net/ipv6: Rename rt6_multipath_select

Rename rt6_multipath_select to fib6_multipath_select and export it.
A later patch wants access to it similar to IPv4's fib_select_path.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/ip6_fib.h |  5 +++++
 net/ipv6/route.c      | 17 +++++++++--------
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 43ab545e64ea..2597d8fdd92f 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,6 +376,11 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup);
 
+struct fib6_info *fib6_multipath_select(const struct net *net,
+					struct fib6_info *match,
+					struct flowi6 *fl6, int oif,
+					const struct sk_buff *skb, int strict);
+
 struct fib6_node *fib6_node_lookup(struct fib6_node *root,
 				   const struct in6_addr *daddr,
 				   const struct in6_addr *saddr);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 443d2a0bc150..6a10608d9025 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -419,11 +419,11 @@ static bool rt6_check_expired(const struct rt6_info *rt)
 	return false;
 }
 
-static struct fib6_info *rt6_multipath_select(const struct net *net,
-					      struct fib6_info *match,
-					     struct flowi6 *fl6, int oif,
-					     const struct sk_buff *skb,
-					     int strict)
+struct fib6_info *fib6_multipath_select(const struct net *net,
+					struct fib6_info *match,
+					struct flowi6 *fl6, int oif,
+					const struct sk_buff *skb,
+					int strict)
 {
 	struct fib6_info *sibling, *next_sibling;
 
@@ -1068,8 +1068,9 @@ restart:
 		f6i = rt6_device_match(net, f6i, &fl6->saddr,
 				      fl6->flowi6_oif, flags);
 		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
-			f6i = rt6_multipath_select(net, f6i, fl6,
-						   fl6->flowi6_oif, skb, flags);
+			f6i = fib6_multipath_select(net, f6i, fl6,
+						    fl6->flowi6_oif, skb,
+						    flags);
 	}
 	if (f6i == net->ipv6.fib6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
@@ -1824,7 +1825,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 redo_rt6_select:
 	f6i = rt6_select(net, fn, oif, strict);
 	if (f6i->fib6_nsiblings)
-		f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict);
+		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
 	if (f6i == net->ipv6.fib6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
 		if (fn)
-- 
cgit v1.2.3-59-g8ed1b


From 1d053da910947afccec96d90892c0f5488c7a9cf Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 9 May 2018 20:34:21 -0700
Subject: net/ipv6: Extract table lookup from ip6_pol_route

ip6_pol_route is used for ingress and egress FIB lookups. Refactor it
moving the table lookup into a separate fib6_table_lookup that can be
invoked separately and export the new function.

ip6_pol_route now calls fib6_table_lookup and uses the result to generate
a dst based rt6_info.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/ip6_fib.h |  4 ++++
 net/ipv6/route.c      | 39 +++++++++++++++++++++++++--------------
 2 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 2597d8fdd92f..c70705f2647a 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,6 +376,10 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup);
 
+/* called with rcu lock held; caller needs to select path */
+struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
+				    int oif, struct flowi6 *fl6, int strict);
+
 struct fib6_info *fib6_multipath_select(const struct net *net,
 					struct fib6_info *match,
 					struct flowi6 *fl6, int oif,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 6a10608d9025..019d8ba9021e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1800,21 +1800,12 @@ void rt6_age_exceptions(struct fib6_info *rt,
 	rcu_read_unlock_bh();
 }
 
-struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
-			       int oif, struct flowi6 *fl6,
-			       const struct sk_buff *skb, int flags)
+/* must be called with rcu lock held */
+struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
+				    int oif, struct flowi6 *fl6, int strict)
 {
 	struct fib6_node *fn, *saved_fn;
 	struct fib6_info *f6i;
-	struct rt6_info *rt;
-	int strict = 0;
-
-	strict |= flags & RT6_LOOKUP_F_IFACE;
-	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
-	if (net->ipv6.devconf_all->forwarding == 0)
-		strict |= RT6_LOOKUP_F_REACHABLE;
-
-	rcu_read_lock();
 
 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 	saved_fn = fn;
@@ -1824,8 +1815,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 
 redo_rt6_select:
 	f6i = rt6_select(net, fn, oif, strict);
-	if (f6i->fib6_nsiblings)
-		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
 	if (f6i == net->ipv6.fib6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
 		if (fn)
@@ -1838,6 +1827,28 @@ redo_rt6_select:
 		}
 	}
 
+	return f6i;
+}
+
+struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
+			       int oif, struct flowi6 *fl6,
+			       const struct sk_buff *skb, int flags)
+{
+	struct fib6_info *f6i;
+	struct rt6_info *rt;
+	int strict = 0;
+
+	strict |= flags & RT6_LOOKUP_F_IFACE;
+	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
+	if (net->ipv6.devconf_all->forwarding == 0)
+		strict |= RT6_LOOKUP_F_REACHABLE;
+
+	rcu_read_lock();
+
+	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
+	if (f6i->fib6_nsiblings)
+		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
+
 	if (f6i == net->ipv6.fib6_null_entry) {
 		rt = net->ipv6.ip6_null_entry;
 		rcu_read_unlock();
-- 
cgit v1.2.3-59-g8ed1b


From cc065a9eb96f7f2a29a04ca49331a9ccb1cfcfa2 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 9 May 2018 20:34:22 -0700
Subject: net/ipv6: Refactor fib6_rule_action

Move source address lookup from fib6_rule_action to a helper. It will be
used in a later patch by a second variant for fib6_rule_action.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/ipv6/fib6_rules.c | 52 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 31 insertions(+), 21 deletions(-)

diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 6547fc6491a6..d040c4bff3a0 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -96,6 +96,31 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 	return &net->ipv6.ip6_null_entry->dst;
 }
 
+static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags,
+			   struct flowi6 *flp6, const struct net_device *dev)
+{
+	struct fib6_rule *r = (struct fib6_rule *)rule;
+
+	/* If we need to find a source address for this traffic,
+	 * we check the result if it meets requirement of the rule.
+	 */
+	if ((rule->flags & FIB_RULE_FIND_SADDR) &&
+	    r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
+		struct in6_addr saddr;
+
+		if (ipv6_dev_get_saddr(net, dev, &flp6->daddr,
+				       rt6_flags2srcprefs(flags), &saddr))
+			return -EAGAIN;
+
+		if (!ipv6_prefix_equal(&saddr, &r->src.addr, r->src.plen))
+			return -EAGAIN;
+
+		flp6->saddr = saddr;
+	}
+
+	return 0;
+}
+
 static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
 			    int flags, struct fib_lookup_arg *arg)
 {
@@ -134,27 +159,12 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
 
 	rt = lookup(net, table, flp6, arg->lookup_data, flags);
 	if (rt != net->ipv6.ip6_null_entry) {
-		struct fib6_rule *r = (struct fib6_rule *)rule;
-
-		/*
-		 * If we need to find a source address for this traffic,
-		 * we check the result if it meets requirement of the rule.
-		 */
-		if ((rule->flags & FIB_RULE_FIND_SADDR) &&
-		    r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
-			struct in6_addr saddr;
-
-			if (ipv6_dev_get_saddr(net,
-					       ip6_dst_idev(&rt->dst)->dev,
-					       &flp6->daddr,
-					       rt6_flags2srcprefs(flags),
-					       &saddr))
-				goto again;
-			if (!ipv6_prefix_equal(&saddr, &r->src.addr,
-					       r->src.plen))
-				goto again;
-			flp6->saddr = saddr;
-		}
+		err = fib6_rule_saddr(net, rule, flags, flp6,
+				      ip6_dst_idev(&rt->dst)->dev);
+
+		if (err == -EAGAIN)
+			goto again;
+
 		err = rt->dst.error;
 		if (err != -EAGAIN)
 			goto out;
-- 
cgit v1.2.3-59-g8ed1b


From 138118ec96cbfc303c1d7cc05fbb2caf8382c95b Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 9 May 2018 20:34:23 -0700
Subject: net/ipv6: Add fib6_lookup

Add IPv6 equivalent to fib_lookup. Does a fib lookup, including rules,
but returns a FIB entry, fib6_info, rather than a dst based rt6_info.
fib6_lookup is any where from 140% (MULTIPLE_TABLES config disabled)
to 60% faster than any of the dst based lookup methods (without custom
rules) and 25% faster with custom rules (e.g., l3mdev rule).

Since the lookup function has a completely different signature,
fib6_rule_action is split into 2 paths: the existing one is
renamed __fib6_rule_action and a new one for the fib6_info path
is added. fib6_rule_action decides which to call based on the
lookup_ptr. If it is fib6_table_lookup then the new path is taken.

Caller must hold rcu lock as no reference is taken on the returned
fib entry.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/ip6_fib.h |  6 ++++
 net/ipv6/fib6_rules.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
 net/ipv6/ip6_fib.c    |  7 +++++
 3 files changed, 97 insertions(+), 2 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index c70705f2647a..cc70f6da8462 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,6 +376,12 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup);
 
+/* called with rcu lock held; can return error pointer
+ * caller needs to select path
+ */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+			      int flags);
+
 /* called with rcu lock held; caller needs to select path */
 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
 				    int oif, struct flowi6 *fl6, int strict);
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index d040c4bff3a0..f590446595d8 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -60,6 +60,39 @@ unsigned int fib6_rules_seq_read(struct net *net)
 	return fib_rules_seq_read(net, AF_INET6);
 }
 
+/* called with rcu lock held; no reference taken on fib6_info */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+			      int flags)
+{
+	struct fib6_info *f6i;
+	int err;
+
+	if (net->ipv6.fib6_has_custom_rules) {
+		struct fib_lookup_arg arg = {
+			.lookup_ptr = fib6_table_lookup,
+			.lookup_data = &oif,
+			.flags = FIB_LOOKUP_NOREF,
+		};
+
+		l3mdev_update_flow(net, flowi6_to_flowi(fl6));
+
+		err = fib_rules_lookup(net->ipv6.fib6_rules_ops,
+				       flowi6_to_flowi(fl6), flags, &arg);
+		if (err)
+			return ERR_PTR(err);
+
+		f6i = arg.result ? : net->ipv6.fib6_null_entry;
+	} else {
+		f6i = fib6_table_lookup(net, net->ipv6.fib6_local_tbl,
+					oif, fl6, flags);
+		if (!f6i || f6i == net->ipv6.fib6_null_entry)
+			f6i = fib6_table_lookup(net, net->ipv6.fib6_main_tbl,
+						oif, fl6, flags);
+	}
+
+	return f6i;
+}
+
 struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup)
@@ -121,8 +154,48 @@ static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags,
 	return 0;
 }
 
-static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
-			    int flags, struct fib_lookup_arg *arg)
+static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp,
+				int flags, struct fib_lookup_arg *arg)
+{
+	struct flowi6 *flp6 = &flp->u.ip6;
+	struct net *net = rule->fr_net;
+	struct fib6_table *table;
+	struct fib6_info *f6i;
+	int err = -EAGAIN, *oif;
+	u32 tb_id;
+
+	switch (rule->action) {
+	case FR_ACT_TO_TBL:
+		break;
+	case FR_ACT_UNREACHABLE:
+		return -ENETUNREACH;
+	case FR_ACT_PROHIBIT:
+		return -EACCES;
+	case FR_ACT_BLACKHOLE:
+	default:
+		return -EINVAL;
+	}
+
+	tb_id = fib_rule_get_table(rule, arg);
+	table = fib6_get_table(net, tb_id);
+	if (!table)
+		return -EAGAIN;
+
+	oif = (int *)arg->lookup_data;
+	f6i = fib6_table_lookup(net, table, *oif, flp6, flags);
+	if (f6i != net->ipv6.fib6_null_entry) {
+		err = fib6_rule_saddr(net, rule, flags, flp6,
+				      fib6_info_nh_dev(f6i));
+
+		if (likely(!err))
+			arg->result = f6i;
+	}
+
+	return err;
+}
+
+static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
+			      int flags, struct fib_lookup_arg *arg)
 {
 	struct flowi6 *flp6 = &flp->u.ip6;
 	struct rt6_info *rt = NULL;
@@ -182,6 +255,15 @@ out:
 	return err;
 }
 
+static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
+			    int flags, struct fib_lookup_arg *arg)
+{
+	if (arg->lookup_ptr == fib6_table_lookup)
+		return fib6_rule_action_alt(rule, flp, flags, arg);
+
+	return __fib6_rule_action(rule, flp, flags, arg);
+}
+
 static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
 {
 	struct rt6_info *rt = (struct rt6_info *) arg->result;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 487faffeae28..d1dc6017f5a6 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -354,6 +354,13 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 	return &rt->dst;
 }
 
+/* called with rcu lock held; no reference taken on fib6_info */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+			      int flags)
+{
+	return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, flags);
+}
+
 static void __net_init fib6_tables_init(struct net *net)
 {
 	fib6_link_table(net, net->ipv6.fib6_main_tbl);
-- 
cgit v1.2.3-59-g8ed1b


From d4bea421f7322400d804c2284739e42e61f78349 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 9 May 2018 20:34:24 -0700
Subject: net/ipv6: Update fib6 tracepoint to take fib6_info

Similar to IPv4, IPv6 should use the FIB lookup result in the
tracepoint.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/trace/events/fib6.h | 14 +++++++-------
 net/ipv6/route.c            | 14 ++++++--------
 2 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h
index 7e8d48a81b91..1b8d951e3c12 100644
--- a/include/trace/events/fib6.h
+++ b/include/trace/events/fib6.h
@@ -12,10 +12,10 @@
 
 TRACE_EVENT(fib6_table_lookup,
 
-	TP_PROTO(const struct net *net, const struct rt6_info *rt,
+	TP_PROTO(const struct net *net, const struct fib6_info *f6i,
 		 struct fib6_table *table, const struct flowi6 *flp),
 
-	TP_ARGS(net, rt, table, flp),
+	TP_ARGS(net, f6i, table, flp),
 
 	TP_STRUCT__entry(
 		__field(	u32,	tb_id		)
@@ -48,20 +48,20 @@ TRACE_EVENT(fib6_table_lookup,
 		in6 = (struct in6_addr *)__entry->dst;
 		*in6 = flp->daddr;
 
-		if (rt->rt6i_idev) {
-			__assign_str(name, rt->rt6i_idev->dev->name);
+		if (f6i->fib6_nh.nh_dev) {
+			__assign_str(name, f6i->fib6_nh.nh_dev);
 		} else {
 			__assign_str(name, "");
 		}
-		if (rt == net->ipv6.ip6_null_entry) {
+		if (f6i == net->ipv6.fib6_null_entry) {
 			struct in6_addr in6_zero = {};
 
 			in6 = (struct in6_addr *)__entry->gw;
 			*in6 = in6_zero;
 
-		} else if (rt) {
+		} else if (f6i) {
 			in6 = (struct in6_addr *)__entry->gw;
-			*in6 = rt->rt6i_gateway;
+			*in6 = f6i->fib6_nh.nh_gw;
 		}
 	),
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 019d8ba9021e..73f9c29a5878 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1078,6 +1078,8 @@ restart:
 			goto restart;
 	}
 
+	trace_fib6_table_lookup(net, f6i, table, fl6);
+
 	/* Search through exception table */
 	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
 	if (rt) {
@@ -1096,8 +1098,6 @@ restart:
 
 	rcu_read_unlock();
 
-	trace_fib6_table_lookup(net, rt, table, fl6);
-
 	return rt;
 }
 
@@ -1827,6 +1827,8 @@ redo_rt6_select:
 		}
 	}
 
+	trace_fib6_table_lookup(net, f6i, table, fl6);
+
 	return f6i;
 }
 
@@ -1853,7 +1855,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 		rt = net->ipv6.ip6_null_entry;
 		rcu_read_unlock();
 		dst_hold(&rt->dst);
-		trace_fib6_table_lookup(net, rt, table, fl6);
 		return rt;
 	}
 
@@ -1864,7 +1865,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 			dst_use_noref(&rt->dst, jiffies);
 
 		rcu_read_unlock();
-		trace_fib6_table_lookup(net, rt, table, fl6);
 		return rt;
 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
 			    !(f6i->fib6_flags & RTF_GATEWAY))) {
@@ -1890,9 +1890,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 			dst_hold(&uncached_rt->dst);
 		}
 
-		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
 		return uncached_rt;
-
 	} else {
 		/* Get a percpu copy */
 
@@ -1906,7 +1904,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 
 		local_bh_enable();
 		rcu_read_unlock();
-		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
+
 		return pcpu_rt;
 	}
 }
@@ -2491,7 +2489,7 @@ out:
 
 	rcu_read_unlock();
 
-	trace_fib6_table_lookup(net, ret, table, fl6);
+	trace_fib6_table_lookup(net, rt, table, fl6);
 	return ret;
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 65a2022e89a4760f9702837e2d9d15a39a9c68a3 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 9 May 2018 20:34:25 -0700
Subject: net/ipv6: Add fib lookup stubs for use in bpf helper

Add stubs to retrieve a handle to an IPv6 FIB table, fib6_get_table,
a stub to do a lookup in a specific table, fib6_table_lookup, and
a stub for a full route lookup.

The stubs are needed for core bpf code to handle the case when the
IPv6 module is not builtin.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/addrconf.h   | 14 ++++++++++++++
 net/ipv6/addrconf_core.c | 33 ++++++++++++++++++++++++++++++++-
 net/ipv6/af_inet6.c      |  6 +++++-
 3 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 8312cc25a3af..ff766ab207e0 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -223,6 +223,20 @@ struct ipv6_stub {
 				 const struct in6_addr *addr);
 	int (*ipv6_dst_lookup)(struct net *net, struct sock *sk,
 			       struct dst_entry **dst, struct flowi6 *fl6);
+
+	struct fib6_table *(*fib6_get_table)(struct net *net, u32 id);
+	struct fib6_info *(*fib6_lookup)(struct net *net, int oif,
+					 struct flowi6 *fl6, int flags);
+	struct fib6_info *(*fib6_table_lookup)(struct net *net,
+					      struct fib6_table *table,
+					      int oif, struct flowi6 *fl6,
+					      int flags);
+	struct fib6_info *(*fib6_multipath_select)(const struct net *net,
+						   struct fib6_info *f6i,
+						   struct flowi6 *fl6, int oif,
+						   const struct sk_buff *skb,
+						   int strict);
+
 	void (*udpv6_encap_enable)(void);
 	void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr,
 			      const struct in6_addr *solicited_addr,
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index 32b564dfd02a..2fe754fd4f5e 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -134,8 +134,39 @@ static int eafnosupport_ipv6_dst_lookup(struct net *net, struct sock *u1,
 	return -EAFNOSUPPORT;
 }
 
+static struct fib6_table *eafnosupport_fib6_get_table(struct net *net, u32 id)
+{
+	return NULL;
+}
+
+static struct fib6_info *
+eafnosupport_fib6_table_lookup(struct net *net, struct fib6_table *table,
+			       int oif, struct flowi6 *fl6, int flags)
+{
+	return NULL;
+}
+
+static struct fib6_info *
+eafnosupport_fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+			 int flags)
+{
+	return NULL;
+}
+
+static struct fib6_info *
+eafnosupport_fib6_multipath_select(const struct net *net, struct fib6_info *f6i,
+				   struct flowi6 *fl6, int oif,
+				   const struct sk_buff *skb, int strict)
+{
+	return f6i;
+}
+
 const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
-	.ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup,
+	.ipv6_dst_lookup   = eafnosupport_ipv6_dst_lookup,
+	.fib6_get_table    = eafnosupport_fib6_get_table,
+	.fib6_table_lookup = eafnosupport_fib6_table_lookup,
+	.fib6_lookup       = eafnosupport_fib6_lookup,
+	.fib6_multipath_select = eafnosupport_fib6_multipath_select,
 };
 EXPORT_SYMBOL_GPL(ipv6_stub);
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index d0af96e0d109..50de8b0d4f70 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -889,7 +889,11 @@ static struct pernet_operations inet6_net_ops = {
 static const struct ipv6_stub ipv6_stub_impl = {
 	.ipv6_sock_mc_join = ipv6_sock_mc_join,
 	.ipv6_sock_mc_drop = ipv6_sock_mc_drop,
-	.ipv6_dst_lookup = ip6_dst_lookup,
+	.ipv6_dst_lookup   = ip6_dst_lookup,
+	.fib6_get_table	   = fib6_get_table,
+	.fib6_table_lookup = fib6_table_lookup,
+	.fib6_lookup       = fib6_lookup,
+	.fib6_multipath_select = fib6_multipath_select,
 	.udpv6_encap_enable = udpv6_encap_enable,
 	.ndisc_send_na = ndisc_send_na,
 	.nd_tbl	= &nd_tbl,
-- 
cgit v1.2.3-59-g8ed1b


From 87f5fc7e48dd3175b30dd03b41564e1a8e136323 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 9 May 2018 20:34:26 -0700
Subject: bpf: Provide helper to do forwarding lookups in kernel FIB table

Provide a helper for doing a FIB and neighbor lookup in the kernel
tables from an XDP program. The helper provides a fastpath for forwarding
packets. If the packet is a local delivery or for any reason is not a
simple lookup and forward, the packet continues up the stack.

If it is to be forwarded, the forwarding can be done directly if the
neighbor is already known. If the neighbor does not exist, the first
few packets go up the stack for neighbor resolution. Once resolved, the
xdp program provides the fast path.

On successful lookup the nexthop dmac, current device smac and egress
device index are returned.

The API supports IPv4, IPv6 and MPLS protocols, but only IPv4 and IPv6
are implemented in this patch. The API includes layer 4 parameters if
the XDP program chooses to do deep packet inspection to allow compare
against ACLs implemented as FIB rules.

Header rewrite is left to the XDP program.

The lookup takes 2 flags:
- BPF_FIB_LOOKUP_DIRECT to do a lookup that bypasses FIB rules and goes
  straight to the table associated with the device (expert setting for
  those looking to maximize throughput)

- BPF_FIB_LOOKUP_OUTPUT to do a lookup from the egress perspective.
  Default is an ingress lookup.

Initial performance numbers collected by Jesper, forwarded packets/sec:

       Full stack    XDP FIB lookup    XDP Direct lookup
IPv4   1,947,969       7,074,156          7,415,333
IPv6   1,728,000       6,165,504          7,262,720

These number are single CPU core forwarding on a Broadwell
E5-1650 v4 @ 3.60GHz.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h |  81 +++++++++++++-
 net/core/filter.c        | 267 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 347 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d615c777b573..02e4112510f8 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1828,6 +1828,33 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ *
+ * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags)
+ *	Description
+ *		Do FIB lookup in kernel tables using parameters in *params*.
+ *		If lookup is successful and result shows packet is to be
+ *		forwarded, the neighbor tables are searched for the nexthop.
+ *		If successful (ie., FIB lookup shows forwarding and nexthop
+ *		is resolved), the nexthop address is returned in ipv4_dst,
+ *		ipv6_dst or mpls_out based on family, smac is set to mac
+ *		address of egress device, dmac is set to nexthop mac address,
+ *		rt_metric is set to metric from route.
+ *
+ *             *plen* argument is the size of the passed in struct.
+ *             *flags* argument can be one or more BPF_FIB_LOOKUP_ flags:
+ *
+ *             **BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs
+ *             full lookup using FIB rules
+ *             **BPF_FIB_LOOKUP_OUTPUT** means do lookup from an egress
+ *             perspective (default is ingress)
+ *
+ *             *ctx* is either **struct xdp_md** for XDP programs or
+ *             **struct sk_buff** tc cls_act programs.
+ *
+ *     Return
+ *             Egress device index on success, 0 if packet needs to continue
+ *             up the stack for further processing or a negative error in case
+ *             of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1898,7 +1925,8 @@ union bpf_attr {
 	FN(xdp_adjust_tail),		\
 	FN(skb_get_xfrm_state),		\
 	FN(get_stack),			\
-	FN(skb_load_bytes_relative),
+	FN(skb_load_bytes_relative),	\
+	FN(fib_lookup),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2321,4 +2349,55 @@ struct bpf_raw_tracepoint_args {
 	__u64 args[0];
 };
 
+/* DIRECT:  Skip the FIB rules and go to FIB table associated with device
+ * OUTPUT:  Do lookup from egress perspective; default is ingress
+ */
+#define BPF_FIB_LOOKUP_DIRECT  BIT(0)
+#define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
+
+struct bpf_fib_lookup {
+	/* input */
+	__u8	family;   /* network family, AF_INET, AF_INET6, AF_MPLS */
+
+	/* set if lookup is to consider L4 data - e.g., FIB rules */
+	__u8	l4_protocol;
+	__be16	sport;
+	__be16	dport;
+
+	/* total length of packet from network header - used for MTU check */
+	__u16	tot_len;
+	__u32	ifindex;  /* L3 device index for lookup */
+
+	union {
+		/* inputs to lookup */
+		__u8	tos;		/* AF_INET  */
+		__be32	flowlabel;	/* AF_INET6 */
+
+		/* output: metric of fib result */
+		__u32 rt_metric;
+	};
+
+	union {
+		__be32		mpls_in;
+		__be32		ipv4_src;
+		__u32		ipv6_src[4];  /* in6_addr; network order */
+	};
+
+	/* input to bpf_fib_lookup, *dst is destination address.
+	 * output: bpf_fib_lookup sets to gateway address
+	 */
+	union {
+		/* return for MPLS lookups */
+		__be32		mpls_out[4];  /* support up to 4 labels */
+		__be32		ipv4_dst;
+		__u32		ipv6_dst[4];  /* in6_addr; network order */
+	};
+
+	/* output */
+	__be16	h_vlan_proto;
+	__be16	h_vlan_TCI;
+	__u8	smac[6];     /* ETH_ALEN */
+	__u8	dmac[6];     /* ETH_ALEN */
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/core/filter.c b/net/core/filter.c
index 0baa715e4699..ca60d2872da4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -60,6 +60,10 @@
 #include <net/xfrm.h>
 #include <linux/bpf_trace.h>
 #include <net/xdp_sock.h>
+#include <linux/inetdevice.h>
+#include <net/ip_fib.h>
+#include <net/flow.h>
+#include <net/arp.h>
 
 /**
  *	sk_filter_trim_cap - run a packet through a socket filter
@@ -4032,6 +4036,265 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
 };
 #endif
 
+#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
+static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
+				  const struct neighbour *neigh,
+				  const struct net_device *dev)
+{
+	memcpy(params->dmac, neigh->ha, ETH_ALEN);
+	memcpy(params->smac, dev->dev_addr, ETH_ALEN);
+	params->h_vlan_TCI = 0;
+	params->h_vlan_proto = 0;
+
+	return dev->ifindex;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_INET)
+static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
+			       u32 flags)
+{
+	struct in_device *in_dev;
+	struct neighbour *neigh;
+	struct net_device *dev;
+	struct fib_result res;
+	struct fib_nh *nh;
+	struct flowi4 fl4;
+	int err;
+
+	dev = dev_get_by_index_rcu(net, params->ifindex);
+	if (unlikely(!dev))
+		return -ENODEV;
+
+	/* verify forwarding is enabled on this interface */
+	in_dev = __in_dev_get_rcu(dev);
+	if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
+		return 0;
+
+	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
+		fl4.flowi4_iif = 1;
+		fl4.flowi4_oif = params->ifindex;
+	} else {
+		fl4.flowi4_iif = params->ifindex;
+		fl4.flowi4_oif = 0;
+	}
+	fl4.flowi4_tos = params->tos & IPTOS_RT_MASK;
+	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+	fl4.flowi4_flags = 0;
+
+	fl4.flowi4_proto = params->l4_protocol;
+	fl4.daddr = params->ipv4_dst;
+	fl4.saddr = params->ipv4_src;
+	fl4.fl4_sport = params->sport;
+	fl4.fl4_dport = params->dport;
+
+	if (flags & BPF_FIB_LOOKUP_DIRECT) {
+		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
+		struct fib_table *tb;
+
+		tb = fib_get_table(net, tbid);
+		if (unlikely(!tb))
+			return 0;
+
+		err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
+	} else {
+		fl4.flowi4_mark = 0;
+		fl4.flowi4_secid = 0;
+		fl4.flowi4_tun_key.tun_id = 0;
+		fl4.flowi4_uid = sock_net_uid(net, NULL);
+
+		err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
+	}
+
+	if (err || res.type != RTN_UNICAST)
+		return 0;
+
+	if (res.fi->fib_nhs > 1)
+		fib_select_path(net, &res, &fl4, NULL);
+
+	nh = &res.fi->fib_nh[res.nh_sel];
+
+	/* do not handle lwt encaps right now */
+	if (nh->nh_lwtstate)
+		return 0;
+
+	dev = nh->nh_dev;
+	if (unlikely(!dev))
+		return 0;
+
+	if (nh->nh_gw)
+		params->ipv4_dst = nh->nh_gw;
+
+	params->rt_metric = res.fi->fib_priority;
+
+	/* xdp and cls_bpf programs are run in RCU-bh so
+	 * rcu_read_lock_bh is not needed here
+	 */
+	neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
+	if (neigh)
+		return bpf_fib_set_fwd_params(params, neigh, dev);
+
+	return 0;
+}
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
+			       u32 flags)
+{
+	struct in6_addr *src = (struct in6_addr *) params->ipv6_src;
+	struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst;
+	struct neighbour *neigh;
+	struct net_device *dev;
+	struct inet6_dev *idev;
+	struct fib6_info *f6i;
+	struct flowi6 fl6;
+	int strict = 0;
+	int oif;
+
+	/* link local addresses are never forwarded */
+	if (rt6_need_strict(dst) || rt6_need_strict(src))
+		return 0;
+
+	dev = dev_get_by_index_rcu(net, params->ifindex);
+	if (unlikely(!dev))
+		return -ENODEV;
+
+	idev = __in6_dev_get_safely(dev);
+	if (unlikely(!idev || !net->ipv6.devconf_all->forwarding))
+		return 0;
+
+	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
+		fl6.flowi6_iif = 1;
+		oif = fl6.flowi6_oif = params->ifindex;
+	} else {
+		oif = fl6.flowi6_iif = params->ifindex;
+		fl6.flowi6_oif = 0;
+		strict = RT6_LOOKUP_F_HAS_SADDR;
+	}
+	fl6.flowlabel = params->flowlabel;
+	fl6.flowi6_scope = 0;
+	fl6.flowi6_flags = 0;
+	fl6.mp_hash = 0;
+
+	fl6.flowi6_proto = params->l4_protocol;
+	fl6.daddr = *dst;
+	fl6.saddr = *src;
+	fl6.fl6_sport = params->sport;
+	fl6.fl6_dport = params->dport;
+
+	if (flags & BPF_FIB_LOOKUP_DIRECT) {
+		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
+		struct fib6_table *tb;
+
+		tb = ipv6_stub->fib6_get_table(net, tbid);
+		if (unlikely(!tb))
+			return 0;
+
+		f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
+	} else {
+		fl6.flowi6_mark = 0;
+		fl6.flowi6_secid = 0;
+		fl6.flowi6_tun_key.tun_id = 0;
+		fl6.flowi6_uid = sock_net_uid(net, NULL);
+
+		f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict);
+	}
+
+	if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
+		return 0;
+
+	if (unlikely(f6i->fib6_flags & RTF_REJECT ||
+	    f6i->fib6_type != RTN_UNICAST))
+		return 0;
+
+	if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
+		f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
+						       fl6.flowi6_oif, NULL,
+						       strict);
+
+	if (f6i->fib6_nh.nh_lwtstate)
+		return 0;
+
+	if (f6i->fib6_flags & RTF_GATEWAY)
+		*dst = f6i->fib6_nh.nh_gw;
+
+	dev = f6i->fib6_nh.nh_dev;
+	params->rt_metric = f6i->fib6_metric;
+
+	/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
+	 * not needed here. Can not use __ipv6_neigh_lookup_noref here
+	 * because we need to get nd_tbl via the stub
+	 */
+	neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
+				      ndisc_hashfn, dst, dev);
+	if (neigh)
+		return bpf_fib_set_fwd_params(params, neigh, dev);
+
+	return 0;
+}
+#endif
+
+BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
+	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
+{
+	if (plen < sizeof(*params))
+		return -EINVAL;
+
+	switch (params->family) {
+#if IS_ENABLED(CONFIG_INET)
+	case AF_INET:
+		return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params,
+					   flags);
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params,
+					   flags);
+#endif
+	}
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
+	.func		= bpf_xdp_fib_lookup,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_PTR_TO_MEM,
+	.arg3_type      = ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
+	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
+{
+	if (plen < sizeof(*params))
+		return -EINVAL;
+
+	switch (params->family) {
+#if IS_ENABLED(CONFIG_INET)
+	case AF_INET:
+		return bpf_ipv4_fib_lookup(dev_net(skb->dev), params, flags);
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		return bpf_ipv6_fib_lookup(dev_net(skb->dev), params, flags);
+#endif
+	}
+	return -ENOTSUPP;
+}
+
+static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
+	.func		= bpf_skb_fib_lookup,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_PTR_TO_MEM,
+	.arg3_type      = ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
@@ -4181,6 +4444,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_skb_get_xfrm_state:
 		return &bpf_skb_get_xfrm_state_proto;
 #endif
+	case BPF_FUNC_fib_lookup:
+		return &bpf_skb_fib_lookup_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -4206,6 +4471,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_xdp_redirect_map_proto;
 	case BPF_FUNC_xdp_adjust_tail:
 		return &bpf_xdp_adjust_tail_proto;
+	case BPF_FUNC_fib_lookup:
+		return &bpf_xdp_fib_lookup_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From fe616055f78457a0b78e0d3693d1ae26f2d7dab3 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 9 May 2018 20:34:27 -0700
Subject: samples/bpf: Add example of ipv4 and ipv6 forwarding in XDP

Simple example of fast-path forwarding. It has a serious flaw
in not verifying the egress device index supports XDP forwarding.
If the egress device does not packets are dropped.

Take this only as a simple example of fast-path forwarding.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/bpf/Makefile                      |   4 +
 samples/bpf/xdp_fwd_kern.c                | 115 +++++++++++++++++++++++++
 samples/bpf/xdp_fwd_user.c                | 136 ++++++++++++++++++++++++++++++
 tools/testing/selftests/bpf/bpf_helpers.h |   3 +
 4 files changed, 258 insertions(+)
 create mode 100644 samples/bpf/xdp_fwd_kern.c
 create mode 100644 samples/bpf/xdp_fwd_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 8e0c7fb6d7cc..28513d6be1bf 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -46,6 +46,7 @@ hostprogs-y += syscall_tp
 hostprogs-y += cpustat
 hostprogs-y += xdp_adjust_tail
 hostprogs-y += xdpsock
+hostprogs-y += xdp_fwd
 
 # Libbpf dependencies
 LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
@@ -100,6 +101,7 @@ syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
 cpustat-objs := bpf_load.o $(LIBBPF) cpustat_user.o
 xdp_adjust_tail-objs := bpf_load.o $(LIBBPF) xdp_adjust_tail_user.o
 xdpsock-objs := bpf_load.o $(LIBBPF) xdpsock_user.o
+xdp_fwd-objs := bpf_load.o $(LIBBPF) xdp_fwd_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -154,6 +156,7 @@ always += syscall_tp_kern.o
 always += cpustat_kern.o
 always += xdp_adjust_tail_kern.o
 always += xdpsock_kern.o
+always += xdp_fwd_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -201,6 +204,7 @@ HOSTLOADLIBES_syscall_tp += -lelf
 HOSTLOADLIBES_cpustat += -lelf
 HOSTLOADLIBES_xdp_adjust_tail += -lelf
 HOSTLOADLIBES_xdpsock += -lelf -pthread
+HOSTLOADLIBES_xdp_fwd += -lelf
 
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
 #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c
new file mode 100644
index 000000000000..cdf4fc383cc9
--- /dev/null
+++ b/samples/bpf/xdp_fwd_kern.c
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017-18 David Ahern <dsahern@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+
+#include "bpf_helpers.h"
+
+#define IPV6_FLOWINFO_MASK              cpu_to_be32(0x0FFFFFFF)
+
+struct bpf_map_def SEC("maps") tx_port = {
+	.type = BPF_MAP_TYPE_DEVMAP,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 64,
+};
+
+static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
+{
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data = (void *)(long)ctx->data;
+	struct bpf_fib_lookup fib_params;
+	struct ethhdr *eth = data;
+	int out_index;
+	u16 h_proto;
+	u64 nh_off;
+
+	nh_off = sizeof(*eth);
+	if (data + nh_off > data_end)
+		return XDP_DROP;
+
+	__builtin_memset(&fib_params, 0, sizeof(fib_params));
+
+	h_proto = eth->h_proto;
+	if (h_proto == htons(ETH_P_IP)) {
+		struct iphdr *iph = data + nh_off;
+
+		if (iph + 1 > data_end)
+			return XDP_DROP;
+
+		fib_params.family	= AF_INET;
+		fib_params.tos		= iph->tos;
+		fib_params.l4_protocol	= iph->protocol;
+		fib_params.sport	= 0;
+		fib_params.dport	= 0;
+		fib_params.tot_len	= ntohs(iph->tot_len);
+		fib_params.ipv4_src	= iph->saddr;
+		fib_params.ipv4_dst	= iph->daddr;
+	} else if (h_proto == htons(ETH_P_IPV6)) {
+		struct in6_addr *src = (struct in6_addr *) fib_params.ipv6_src;
+		struct in6_addr *dst = (struct in6_addr *) fib_params.ipv6_dst;
+		struct ipv6hdr *iph = data + nh_off;
+
+		if (iph + 1 > data_end)
+			return XDP_DROP;
+
+		fib_params.family	= AF_INET6;
+		fib_params.flowlabel	= *(__be32 *)iph & IPV6_FLOWINFO_MASK;
+		fib_params.l4_protocol	= iph->nexthdr;
+		fib_params.sport	= 0;
+		fib_params.dport	= 0;
+		fib_params.tot_len	= ntohs(iph->payload_len);
+		*src			= iph->saddr;
+		*dst			= iph->daddr;
+	} else {
+		return XDP_PASS;
+	}
+
+	fib_params.ifindex = ctx->ingress_ifindex;
+
+	out_index = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
+
+	/* verify egress index has xdp support
+	 * TO-DO bpf_map_lookup_elem(&tx_port, &key) fails with
+	 *       cannot pass map_type 14 into func bpf_map_lookup_elem#1:
+	 * NOTE: without verification that egress index supports XDP
+	 *       forwarding packets are dropped.
+	 */
+	if (out_index > 0) {
+		memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN);
+		memcpy(eth->h_source, fib_params.smac, ETH_ALEN);
+		return bpf_redirect_map(&tx_port, out_index, 0);
+	}
+
+	return XDP_PASS;
+}
+
+SEC("xdp_fwd")
+int xdp_fwd_prog(struct xdp_md *ctx)
+{
+	return xdp_fwd_flags(ctx, 0);
+}
+
+SEC("xdp_fwd_direct")
+int xdp_fwd_direct_prog(struct xdp_md *ctx)
+{
+	return xdp_fwd_flags(ctx, BPF_FIB_LOOKUP_DIRECT);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_fwd_user.c b/samples/bpf/xdp_fwd_user.c
new file mode 100644
index 000000000000..9c6606f57126
--- /dev/null
+++ b/samples/bpf/xdp_fwd_user.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017-18 David Ahern <dsahern@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <linux/limits.h>
+#include <net/if.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <libgen.h>
+
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include "libbpf.h"
+
+
+static int do_attach(int idx, int fd, const char *name)
+{
+	int err;
+
+	err = bpf_set_link_xdp_fd(idx, fd, 0);
+	if (err < 0)
+		printf("ERROR: failed to attach program to %s\n", name);
+
+	return err;
+}
+
+static int do_detach(int idx, const char *name)
+{
+	int err;
+
+	err = bpf_set_link_xdp_fd(idx, -1, 0);
+	if (err < 0)
+		printf("ERROR: failed to detach program from %s\n", name);
+
+	return err;
+}
+
+static void usage(const char *prog)
+{
+	fprintf(stderr,
+		"usage: %s [OPTS] interface-list\n"
+		"\nOPTS:\n"
+		"    -d    detach program\n"
+		"    -D    direct table lookups (skip fib rules)\n",
+		prog);
+}
+
+int main(int argc, char **argv)
+{
+	char filename[PATH_MAX];
+	int opt, i, idx, err;
+	int prog_id = 0;
+	int attach = 1;
+	int ret = 0;
+
+	while ((opt = getopt(argc, argv, ":dD")) != -1) {
+		switch (opt) {
+		case 'd':
+			attach = 0;
+			break;
+		case 'D':
+			prog_id = 1;
+			break;
+		default:
+			usage(basename(argv[0]));
+			return 1;
+		}
+	}
+
+	if (optind == argc) {
+		usage(basename(argv[0]));
+		return 1;
+	}
+
+	if (attach) {
+		snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+		if (access(filename, O_RDONLY) < 0) {
+			printf("error accessing file %s: %s\n",
+				filename, strerror(errno));
+			return 1;
+		}
+
+		if (load_bpf_file(filename)) {
+			printf("%s", bpf_log_buf);
+			return 1;
+		}
+
+		if (!prog_fd[prog_id]) {
+			printf("load_bpf_file: %s\n", strerror(errno));
+			return 1;
+		}
+	}
+	if (attach) {
+		for (i = 1; i < 64; ++i)
+			bpf_map_update_elem(map_fd[0], &i, &i, 0);
+	}
+
+	for (i = optind; i < argc; ++i) {
+		idx = if_nametoindex(argv[i]);
+		if (!idx)
+			idx = strtoul(argv[i], NULL, 0);
+
+		if (!idx) {
+			fprintf(stderr, "Invalid arg\n");
+			return 1;
+		}
+		if (!attach) {
+			err = do_detach(idx, argv[i]);
+			if (err)
+				ret = err;
+		} else {
+			err = do_attach(idx, prog_fd[prog_id], argv[i]);
+			if (err)
+				ret = err;
+		}
+	}
+
+	return ret;
+}
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 265f8e0e8ada..2375d06c706b 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -103,6 +103,9 @@ static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state,
 	(void *) BPF_FUNC_skb_get_xfrm_state;
 static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) =
 	(void *) BPF_FUNC_get_stack;
+static int (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params,
+			     int plen, __u32 flags) =
+	(void *) BPF_FUNC_fib_lookup;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
-- 
cgit v1.2.3-59-g8ed1b


From 91bc07c9e8a0d553250275088fad337806f5a3cf Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@wand.net.nz>
Date: Thu, 10 May 2018 15:26:51 -0700
Subject: selftests/bpf: Fix bash reference in Makefile

'|& ...' is a bash 4.0+ construct which is not guaranteed to be available
when using '$(shell ...)' in a Makefile. Fall back to the more portable
'2>&1 | ...'.

Fixes the following warning during compilation:

	/bin/sh: 1: Syntax error: "&" unexpected

Signed-off-by: Joe Stringer <joe@wand.net.nz>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 9d762184b805..79d29d6cc719 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -90,9 +90,9 @@ CLANG_FLAGS = -I. -I./include/uapi -I../../../include/uapi \
 $(OUTPUT)/test_l4lb_noinline.o: CLANG_FLAGS += -fno-inline
 $(OUTPUT)/test_xdp_noinline.o: CLANG_FLAGS += -fno-inline
 
-BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help |& grep dwarfris)
-BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help |& grep BTF)
-BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --version |& grep LLVM)
+BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help 2>&1 | grep dwarfris)
+BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help 2>&1 | grep BTF)
+BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --version 2>&1 | grep LLVM)
 
 ifneq ($(BTF_LLC_PROBE),)
 ifneq ($(BTF_PAHOLE_PROBE),)
-- 
cgit v1.2.3-59-g8ed1b


From cb9c28ef579debbc51e48b37f2534390f5da8467 Mon Sep 17 00:00:00 2001
From: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Date: Wed, 9 May 2018 11:04:59 +0900
Subject: bpf: sync tools bpf.h uapi header

Sync the header from include/uapi/linux/bpf.h which was updated to add
fib lookup helper function. This fixes selftests/bpf build failure.

Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/include/uapi/linux/bpf.h | 82 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 81 insertions(+), 1 deletion(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index fff51c187d1e..02e4112510f8 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -117,6 +117,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_DEVMAP,
 	BPF_MAP_TYPE_SOCKMAP,
 	BPF_MAP_TYPE_CPUMAP,
+	BPF_MAP_TYPE_XSKMAP,
 };
 
 enum bpf_prog_type {
@@ -1827,6 +1828,33 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ *
+ * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags)
+ *	Description
+ *		Do FIB lookup in kernel tables using parameters in *params*.
+ *		If lookup is successful and result shows packet is to be
+ *		forwarded, the neighbor tables are searched for the nexthop.
+ *		If successful (ie., FIB lookup shows forwarding and nexthop
+ *		is resolved), the nexthop address is returned in ipv4_dst,
+ *		ipv6_dst or mpls_out based on family, smac is set to mac
+ *		address of egress device, dmac is set to nexthop mac address,
+ *		rt_metric is set to metric from route.
+ *
+ *             *plen* argument is the size of the passed in struct.
+ *             *flags* argument can be one or more BPF_FIB_LOOKUP_ flags:
+ *
+ *             **BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs
+ *             full lookup using FIB rules
+ *             **BPF_FIB_LOOKUP_OUTPUT** means do lookup from an egress
+ *             perspective (default is ingress)
+ *
+ *             *ctx* is either **struct xdp_md** for XDP programs or
+ *             **struct sk_buff** tc cls_act programs.
+ *
+ *     Return
+ *             Egress device index on success, 0 if packet needs to continue
+ *             up the stack for further processing or a negative error in case
+ *             of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1897,7 +1925,8 @@ union bpf_attr {
 	FN(xdp_adjust_tail),		\
 	FN(skb_get_xfrm_state),		\
 	FN(get_stack),			\
-	FN(skb_load_bytes_relative),
+	FN(skb_load_bytes_relative),	\
+	FN(fib_lookup),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2320,4 +2349,55 @@ struct bpf_raw_tracepoint_args {
 	__u64 args[0];
 };
 
+/* DIRECT:  Skip the FIB rules and go to FIB table associated with device
+ * OUTPUT:  Do lookup from egress perspective; default is ingress
+ */
+#define BPF_FIB_LOOKUP_DIRECT  BIT(0)
+#define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
+
+struct bpf_fib_lookup {
+	/* input */
+	__u8	family;   /* network family, AF_INET, AF_INET6, AF_MPLS */
+
+	/* set if lookup is to consider L4 data - e.g., FIB rules */
+	__u8	l4_protocol;
+	__be16	sport;
+	__be16	dport;
+
+	/* total length of packet from network header - used for MTU check */
+	__u16	tot_len;
+	__u32	ifindex;  /* L3 device index for lookup */
+
+	union {
+		/* inputs to lookup */
+		__u8	tos;		/* AF_INET  */
+		__be32	flowlabel;	/* AF_INET6 */
+
+		/* output: metric of fib result */
+		__u32 rt_metric;
+	};
+
+	union {
+		__be32		mpls_in;
+		__be32		ipv4_src;
+		__u32		ipv6_src[4];  /* in6_addr; network order */
+	};
+
+	/* input to bpf_fib_lookup, *dst is destination address.
+	 * output: bpf_fib_lookup sets to gateway address
+	 */
+	union {
+		/* return for MPLS lookups */
+		__be32		mpls_out[4];  /* support up to 4 labels */
+		__be32		ipv4_dst;
+		__u32		ipv6_dst[4];  /* in6_addr; network order */
+	};
+
+	/* output */
+	__be16	h_vlan_proto;
+	__be16	h_vlan_TCI;
+	__u8	smac[6];     /* ETH_ALEN */
+	__u8	dmac[6];     /* ETH_ALEN */
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
-- 
cgit v1.2.3-59-g8ed1b


From e3687510fce2a5b039133be8979b2c8e8cbffa67 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 10 May 2018 10:24:37 -0700
Subject: tools: bpftool: use PERF_SAMPLE_TIME instead of reading the clock

Ask the kernel to include sample time in each even instead of
reading the clock.  This is also more accurate because our
clock reading was done when user space would dump the buffer,
not when sample was produced.

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/map_perf_ring.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/tools/bpf/bpftool/map_perf_ring.c b/tools/bpf/bpftool/map_perf_ring.c
index c5a2ced8552d..9ae4bb8a2cad 100644
--- a/tools/bpf/bpftool/map_perf_ring.c
+++ b/tools/bpf/bpftool/map_perf_ring.c
@@ -39,6 +39,7 @@ struct event_ring_info {
 
 struct perf_event_sample {
 	struct perf_event_header header;
+	u64 time;
 	__u32 size;
 	unsigned char data[];
 };
@@ -57,17 +58,9 @@ print_bpf_output(struct event_ring_info *ring, struct perf_event_sample *e)
 		__u64 id;
 		__u64 lost;
 	} *lost = (void *)e;
-	struct timespec ts;
-
-	if (clock_gettime(CLOCK_MONOTONIC, &ts)) {
-		perror("Can't read clock for timestamp");
-		return;
-	}
 
 	if (json_output) {
 		jsonw_start_object(json_wtr);
-		jsonw_name(json_wtr, "timestamp");
-		jsonw_uint(json_wtr, ts.tv_sec * 1000000000ull + ts.tv_nsec);
 		jsonw_name(json_wtr, "type");
 		jsonw_uint(json_wtr, e->header.type);
 		jsonw_name(json_wtr, "cpu");
@@ -75,6 +68,8 @@ print_bpf_output(struct event_ring_info *ring, struct perf_event_sample *e)
 		jsonw_name(json_wtr, "index");
 		jsonw_uint(json_wtr, ring->key);
 		if (e->header.type == PERF_RECORD_SAMPLE) {
+			jsonw_name(json_wtr, "timestamp");
+			jsonw_uint(json_wtr, e->time);
 			jsonw_name(json_wtr, "data");
 			print_data_json(e->data, e->size);
 		} else if (e->header.type == PERF_RECORD_LOST) {
@@ -89,8 +84,8 @@ print_bpf_output(struct event_ring_info *ring, struct perf_event_sample *e)
 		jsonw_end_object(json_wtr);
 	} else {
 		if (e->header.type == PERF_RECORD_SAMPLE) {
-			printf("== @%ld.%ld CPU: %d index: %d =====\n",
-			       (long)ts.tv_sec, ts.tv_nsec,
+			printf("== @%lld.%09lld CPU: %d index: %d =====\n",
+			       e->time / 1000000000ULL, e->time % 1000000000ULL,
 			       ring->cpu, ring->key);
 			fprint_hex(stdout, e->data, e->size, " ");
 			printf("\n");
@@ -185,7 +180,7 @@ static void perf_event_unmap(void *mem)
 static int bpf_perf_event_open(int map_fd, int key, int cpu)
 {
 	struct perf_event_attr attr = {
-		.sample_type = PERF_SAMPLE_RAW,
+		.sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_TIME,
 		.type = PERF_TYPE_SOFTWARE,
 		.config = PERF_COUNT_SW_BPF_OUTPUT,
 	};
-- 
cgit v1.2.3-59-g8ed1b


From 74662ea5d41683e7ff723c35649b0192a8e6ba8f Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 10 May 2018 10:24:38 -0700
Subject: samples: bpf: rename struct bpf_map_def to avoid conflict with libbpf

Both tools/lib/bpf/libbpf.h and samples/bpf/bpf_load.h define their
own version of struct bpf_map_def.  The version in bpf_load.h has
more fields.  libbpf does not support inner maps and its definition
of struct bpf_map_def lacks the related fields.  Rename the definition
in bpf_load.h (samples/bpf) to avoid conflicts.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/bpf/bpf_load.c | 10 +++++-----
 samples/bpf/bpf_load.h |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index da9bccfaf391..a6b290de5632 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -420,7 +420,7 @@ static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx,
 
 	/* Keeping compatible with ELF maps section changes
 	 * ------------------------------------------------
-	 * The program size of struct bpf_map_def is known by loader
+	 * The program size of struct bpf_load_map_def is known by loader
 	 * code, but struct stored in ELF file can be different.
 	 *
 	 * Unfortunately sym[i].st_size is zero.  To calculate the
@@ -429,7 +429,7 @@ static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx,
 	 * symbols.
 	 */
 	map_sz_elf = data_maps->d_size / nr_maps;
-	map_sz_copy = sizeof(struct bpf_map_def);
+	map_sz_copy = sizeof(struct bpf_load_map_def);
 	if (map_sz_elf < map_sz_copy) {
 		/*
 		 * Backward compat, loading older ELF file with
@@ -448,8 +448,8 @@ static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx,
 
 	/* Memcpy relevant part of ELF maps data to loader maps */
 	for (i = 0; i < nr_maps; i++) {
+		struct bpf_load_map_def *def;
 		unsigned char *addr, *end;
-		struct bpf_map_def *def;
 		const char *map_name;
 		size_t offset;
 
@@ -464,9 +464,9 @@ static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx,
 
 		/* Symbol value is offset into ELF maps section data area */
 		offset = sym[i].st_value;
-		def = (struct bpf_map_def *)(data_maps->d_buf + offset);
+		def = (struct bpf_load_map_def *)(data_maps->d_buf + offset);
 		maps[i].elf_offset = offset;
-		memset(&maps[i].def, 0, sizeof(struct bpf_map_def));
+		memset(&maps[i].def, 0, sizeof(struct bpf_load_map_def));
 		memcpy(&maps[i].def, def, map_sz_copy);
 
 		/* Verify no newer features were requested */
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index 2c3d0b448632..f9da59bca0cc 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -7,7 +7,7 @@
 #define MAX_MAPS 32
 #define MAX_PROGS 32
 
-struct bpf_map_def {
+struct bpf_load_map_def {
 	unsigned int type;
 	unsigned int key_size;
 	unsigned int value_size;
@@ -21,7 +21,7 @@ struct bpf_map_data {
 	int fd;
 	char *name;
 	size_t elf_offset;
-	struct bpf_map_def def;
+	struct bpf_load_map_def def;
 };
 
 typedef void (*fixup_map_cb)(struct bpf_map_data *map, int idx);
-- 
cgit v1.2.3-59-g8ed1b


From 5f9380572b4bb24f60cd492b17331db6ee34a516 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 10 May 2018 10:24:39 -0700
Subject: samples: bpf: compile and link against full libbpf

samples/bpf currently cherry-picks object files from tools/lib/bpf
to link against.  Just compile the full library and link statically
against it.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/bpf/Makefile | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 28513d6be1bf..79cdb66a5ea7 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -49,7 +49,7 @@ hostprogs-y += xdpsock
 hostprogs-y += xdp_fwd
 
 # Libbpf dependencies
-LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
+LIBBPF := ../../tools/lib/bpf/libbpf.a
 CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o
 TRACE_HELPERS := ../../tools/testing/selftests/bpf/trace_helpers.o
 
@@ -74,10 +74,10 @@ offwaketime-objs := bpf_load.o $(LIBBPF) offwaketime_user.o $(TRACE_HELPERS)
 spintest-objs := bpf_load.o $(LIBBPF) spintest_user.o $(TRACE_HELPERS)
 map_perf_test-objs := bpf_load.o $(LIBBPF) map_perf_test_user.o
 test_overhead-objs := bpf_load.o $(LIBBPF) test_overhead_user.o
-test_cgrp2_array_pin-objs := $(LIBBPF) test_cgrp2_array_pin.o
-test_cgrp2_attach-objs := $(LIBBPF) test_cgrp2_attach.o
-test_cgrp2_attach2-objs := $(LIBBPF) test_cgrp2_attach2.o $(CGROUP_HELPERS)
-test_cgrp2_sock-objs := $(LIBBPF) test_cgrp2_sock.o
+test_cgrp2_array_pin-objs := test_cgrp2_array_pin.o $(LIBBPF)
+test_cgrp2_attach-objs := test_cgrp2_attach.o $(LIBBPF)
+test_cgrp2_attach2-objs := test_cgrp2_attach2.o $(LIBBPF) $(CGROUP_HELPERS)
+test_cgrp2_sock-objs := test_cgrp2_sock.o $(LIBBPF)
 test_cgrp2_sock2-objs := bpf_load.o $(LIBBPF) test_cgrp2_sock2.o
 xdp1-objs := bpf_load.o $(LIBBPF) xdp1_user.o
 # reuse xdp1 source intentionally
@@ -91,7 +91,7 @@ tc_l2_redirect-objs := bpf_load.o $(LIBBPF) tc_l2_redirect_user.o
 lwt_len_hist-objs := bpf_load.o $(LIBBPF) lwt_len_hist_user.o
 xdp_tx_iptunnel-objs := bpf_load.o $(LIBBPF) xdp_tx_iptunnel_user.o
 test_map_in_map-objs := bpf_load.o $(LIBBPF) test_map_in_map_user.o
-per_socket_stats_example-objs := $(LIBBPF) cookie_uid_helper_example.o
+per_socket_stats_example-objs := cookie_uid_helper_example.o $(LIBBPF)
 xdp_redirect-objs := bpf_load.o $(LIBBPF) xdp_redirect_user.o
 xdp_redirect_map-objs := bpf_load.o $(LIBBPF) xdp_redirect_map_user.o
 xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) xdp_redirect_cpu_user.o
@@ -165,6 +165,8 @@ HOSTCFLAGS += -I$(srctree)/tools/lib/ -I$(srctree)/tools/include
 HOSTCFLAGS += -I$(srctree)/tools/perf
 
 HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
+HOSTLOADLIBES_test_lru_dist += -lelf
+HOSTLOADLIBES_sock_example += -lelf
 HOSTLOADLIBES_fds_example += -lelf
 HOSTLOADLIBES_sockex1 += -lelf
 HOSTLOADLIBES_sockex2 += -lelf
@@ -176,6 +178,10 @@ HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
 HOSTLOADLIBES_tracex6 += -lelf
 HOSTLOADLIBES_tracex7 += -lelf
+HOSTLOADLIBES_test_cgrp2_array_pin += -lelf
+HOSTLOADLIBES_test_cgrp2_attach += -lelf
+HOSTLOADLIBES_test_cgrp2_attach2 += -lelf
+HOSTLOADLIBES_test_cgrp2_sock += -lelf
 HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
 HOSTLOADLIBES_load_sock_ops += -lelf
 HOSTLOADLIBES_test_probe_write_user += -lelf
@@ -195,6 +201,7 @@ HOSTLOADLIBES_tc_l2_redirect += -l elf
 HOSTLOADLIBES_lwt_len_hist += -l elf
 HOSTLOADLIBES_xdp_tx_iptunnel += -lelf
 HOSTLOADLIBES_test_map_in_map += -lelf
+HOSTLOADLIBES_per_socket_stats_example += -lelf
 HOSTLOADLIBES_xdp_redirect += -lelf
 HOSTLOADLIBES_xdp_redirect_map += -lelf
 HOSTLOADLIBES_xdp_redirect_cpu += -lelf
@@ -226,7 +233,7 @@ clean:
 	@rm -f *~
 
 $(LIBBPF): FORCE
-	$(MAKE) -C $(dir $@) $(notdir $@)
+	$(MAKE) -C $(dir $@)
 
 $(obj)/syscall_nrs.s:	$(src)/syscall_nrs.c
 	$(call if_changed_dep,cc_s_c)
-- 
cgit v1.2.3-59-g8ed1b


From d0cabbb021bee5c4b831a0235af9534ad07f8d3d Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 10 May 2018 10:24:40 -0700
Subject: tools: bpf: move the event reading loop to libbpf

There are two copies of event reading loop - in bpftool and
trace_helpers "library".  Consolidate them and move the code
to libbpf.  Return codes from trace_helpers are kept, but
renamed to include LIBBPF prefix.

Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/bpf/Makefile                        |  8 +++
 samples/bpf/trace_output_user.c             |  6 +-
 tools/bpf/bpftool/map_perf_ring.c           | 66 +++++-----------------
 tools/lib/bpf/Makefile                      |  2 +-
 tools/lib/bpf/libbpf.c                      | 61 ++++++++++++++++++++
 tools/lib/bpf/libbpf.h                      | 13 +++++
 tools/testing/selftests/bpf/Makefile        |  2 +-
 tools/testing/selftests/bpf/test_progs.c    |  6 +-
 tools/testing/selftests/bpf/trace_helpers.c | 87 ++++++++++-------------------
 tools/testing/selftests/bpf/trace_helpers.h | 11 ++--
 10 files changed, 139 insertions(+), 123 deletions(-)

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 79cdb66a5ea7..8ce72d211c3e 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -165,6 +165,14 @@ HOSTCFLAGS += -I$(srctree)/tools/lib/ -I$(srctree)/tools/include
 HOSTCFLAGS += -I$(srctree)/tools/perf
 
 HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
+HOSTCFLAGS_trace_helpers.o += -I$(srctree)/tools/lib/bpf/
+
+HOSTCFLAGS_trace_output_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_offwaketime_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_spintest_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_trace_event_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_sampleip_user.o += -I$(srctree)/tools/lib/bpf/
+
 HOSTLOADLIBES_test_lru_dist += -lelf
 HOSTLOADLIBES_sock_example += -lelf
 HOSTLOADLIBES_fds_example += -lelf
diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c
index 5e78c2ecd08d..da98be721001 100644
--- a/samples/bpf/trace_output_user.c
+++ b/samples/bpf/trace_output_user.c
@@ -48,7 +48,7 @@ static int print_bpf_output(void *data, int size)
 	if (e->cookie != 0x12345678) {
 		printf("BUG pid %llx cookie %llx sized %d\n",
 		       e->pid, e->cookie, size);
-		return PERF_EVENT_ERROR;
+		return LIBBPF_PERF_EVENT_ERROR;
 	}
 
 	cnt++;
@@ -56,10 +56,10 @@ static int print_bpf_output(void *data, int size)
 	if (cnt == MAX_CNT) {
 		printf("recv %lld events per sec\n",
 		       MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
-		return PERF_EVENT_DONE;
+		return LIBBPF_PERF_EVENT_DONE;
 	}
 
-	return PERF_EVENT_CONT;
+	return LIBBPF_PERF_EVENT_CONT;
 }
 
 static void test_bpf_perf_event(void)
diff --git a/tools/bpf/bpftool/map_perf_ring.c b/tools/bpf/bpftool/map_perf_ring.c
index 9ae4bb8a2cad..1832100d1b27 100644
--- a/tools/bpf/bpftool/map_perf_ring.c
+++ b/tools/bpf/bpftool/map_perf_ring.c
@@ -50,14 +50,15 @@ static void int_exit(int signo)
 	stop = true;
 }
 
-static void
-print_bpf_output(struct event_ring_info *ring, struct perf_event_sample *e)
+static enum bpf_perf_event_ret print_bpf_output(void *event, void *priv)
 {
+	struct event_ring_info *ring = priv;
+	struct perf_event_sample *e = event;
 	struct {
 		struct perf_event_header header;
 		__u64 id;
 		__u64 lost;
-	} *lost = (void *)e;
+	} *lost = event;
 
 	if (json_output) {
 		jsonw_start_object(json_wtr);
@@ -96,60 +97,23 @@ print_bpf_output(struct event_ring_info *ring, struct perf_event_sample *e)
 			       e->header.type, e->header.size);
 		}
 	}
+
+	return LIBBPF_PERF_EVENT_CONT;
 }
 
 static void
 perf_event_read(struct event_ring_info *ring, void **buf, size_t *buf_len)
 {
-	volatile struct perf_event_mmap_page *header = ring->mem;
-	__u64 buffer_size = MMAP_PAGE_CNT * get_page_size();
-	__u64 data_tail = header->data_tail;
-	__u64 data_head = header->data_head;
-	void *base, *begin, *end;
-
-	asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
-	if (data_head == data_tail)
-		return;
-
-	base = ((char *)header) + get_page_size();
-
-	begin = base + data_tail % buffer_size;
-	end = base + data_head % buffer_size;
-
-	while (begin != end) {
-		struct perf_event_sample *e;
-
-		e = begin;
-		if (begin + e->header.size > base + buffer_size) {
-			long len = base + buffer_size - begin;
-
-			if (*buf_len < e->header.size) {
-				free(*buf);
-				*buf = malloc(e->header.size);
-				if (!*buf) {
-					fprintf(stderr,
-						"can't allocate memory");
-					stop = true;
-					return;
-				}
-				*buf_len = e->header.size;
-			}
-
-			memcpy(*buf, begin, len);
-			memcpy(*buf + len, base, e->header.size - len);
-			e = (void *)*buf;
-			begin = base + e->header.size - len;
-		} else if (begin + e->header.size == base + buffer_size) {
-			begin = base;
-		} else {
-			begin += e->header.size;
-		}
-
-		print_bpf_output(ring, e);
+	enum bpf_perf_event_ret ret;
+
+	ret = bpf_perf_event_read_simple(ring->mem,
+					 MMAP_PAGE_CNT * get_page_size(),
+					 get_page_size(), buf, buf_len,
+					 print_bpf_output, ring);
+	if (ret != LIBBPF_PERF_EVENT_CONT) {
+		fprintf(stderr, "perf read loop failed with %d\n", ret);
+		stop = true;
 	}
-
-	__sync_synchronize(); /* smp_mb() */
-	header->data_tail = data_head;
 }
 
 static int perf_mmap_size(void)
diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
index e6d5f8d1477f..f3fab4af4260 100644
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@@ -69,7 +69,7 @@ FEATURE_USER = .libbpf
 FEATURE_TESTS = libelf libelf-getphdrnum libelf-mmap bpf
 FEATURE_DISPLAY = libelf bpf
 
-INCLUDES = -I. -I$(srctree)/tools/include -I$(srctree)/tools/arch/$(ARCH)/include/uapi -I$(srctree)/tools/include/uapi
+INCLUDES = -I. -I$(srctree)/tools/include -I$(srctree)/tools/arch/$(ARCH)/include/uapi -I$(srctree)/tools/include/uapi -I$(srctree)/tools/perf
 FEATURE_CHECK_CFLAGS-bpf = $(INCLUDES)
 
 check_feat := 1
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 7bcdca13083a..ce96f1fe3f37 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -31,6 +31,7 @@
 #include <unistd.h>
 #include <fcntl.h>
 #include <errno.h>
+#include <perf-sys.h>
 #include <asm/unistd.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
@@ -2210,3 +2211,63 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
 	*prog_fd = bpf_program__fd(first_prog);
 	return 0;
 }
+
+enum bpf_perf_event_ret
+bpf_perf_event_read_simple(void *mem, unsigned long size,
+			   unsigned long page_size, void **buf, size_t *buf_len,
+			   bpf_perf_event_print_t fn, void *priv)
+{
+	volatile struct perf_event_mmap_page *header = mem;
+	__u64 data_tail = header->data_tail;
+	__u64 data_head = header->data_head;
+	void *base, *begin, *end;
+	int ret;
+
+	asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
+	if (data_head == data_tail)
+		return LIBBPF_PERF_EVENT_CONT;
+
+	base = ((char *)header) + page_size;
+
+	begin = base + data_tail % size;
+	end = base + data_head % size;
+
+	while (begin != end) {
+		struct perf_event_header *ehdr;
+
+		ehdr = begin;
+		if (begin + ehdr->size > base + size) {
+			long len = base + size - begin;
+
+			if (*buf_len < ehdr->size) {
+				free(*buf);
+				*buf = malloc(ehdr->size);
+				if (!*buf) {
+					ret = LIBBPF_PERF_EVENT_ERROR;
+					break;
+				}
+				*buf_len = ehdr->size;
+			}
+
+			memcpy(*buf, begin, len);
+			memcpy(*buf + len, base, ehdr->size - len);
+			ehdr = (void *)*buf;
+			begin = base + ehdr->size - len;
+		} else if (begin + ehdr->size == base + size) {
+			begin = base;
+		} else {
+			begin += ehdr->size;
+		}
+
+		ret = fn(ehdr, priv);
+		if (ret != LIBBPF_PERF_EVENT_CONT)
+			break;
+
+		data_tail += ehdr->size;
+	}
+
+	__sync_synchronize(); /* smp_mb() */
+	header->data_tail = data_tail;
+
+	return ret;
+}
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 197f9ce2248c..ce681097584e 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -267,4 +267,17 @@ int bpf_prog_load(const char *file, enum bpf_prog_type type,
 		  struct bpf_object **pobj, int *prog_fd);
 
 int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags);
+
+enum bpf_perf_event_ret {
+	LIBBPF_PERF_EVENT_DONE	= 0,
+	LIBBPF_PERF_EVENT_ERROR	= -1,
+	LIBBPF_PERF_EVENT_CONT	= -2,
+};
+
+typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(void *event,
+							  void *priv);
+int bpf_perf_event_read_simple(void *mem, unsigned long size,
+			       unsigned long page_size,
+			       void **buf, size_t *buf_len,
+			       bpf_perf_event_print_t fn, void *priv);
 #endif
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 79d29d6cc719..438d4f93875b 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -10,7 +10,7 @@ ifneq ($(wildcard $(GENHDR)),)
   GENFLAGS := -DHAVE_GENHDR
 endif
 
-CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) -I../../../include
+CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(BPFDIR) -I$(GENDIR) $(GENFLAGS) -I../../../include
 LDLIBS += -lcap -lelf -lrt -lpthread
 
 TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index ed197eef1cfc..f7731973ec68 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -1337,12 +1337,12 @@ static int get_stack_print_output(void *data, int size)
 			good_user_stack = true;
 	}
 	if (!good_kern_stack || !good_user_stack)
-		return PERF_EVENT_ERROR;
+		return LIBBPF_PERF_EVENT_ERROR;
 
 	if (cnt == MAX_CNT_RAWTP)
-		return PERF_EVENT_DONE;
+		return LIBBPF_PERF_EVENT_DONE;
 
-	return PERF_EVENT_CONT;
+	return LIBBPF_PERF_EVENT_CONT;
 }
 
 static void test_get_stack_raw_tp(void)
diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
index ad025bd75f1c..8fb4fe8686e4 100644
--- a/tools/testing/selftests/bpf/trace_helpers.c
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -74,7 +74,7 @@ struct ksym *ksym_search(long key)
 
 static int page_size;
 static int page_cnt = 8;
-static volatile struct perf_event_mmap_page *header;
+static struct perf_event_mmap_page *header;
 
 int perf_event_mmap(int fd)
 {
@@ -107,74 +107,47 @@ struct perf_event_sample {
 	char data[];
 };
 
-static int perf_event_read(perf_event_print_fn fn)
+static enum bpf_perf_event_ret bpf_perf_event_print(void *event, void *priv)
 {
-	__u64 data_tail = header->data_tail;
-	__u64 data_head = header->data_head;
-	__u64 buffer_size = page_cnt * page_size;
-	void *base, *begin, *end;
-	char buf[256];
+	struct perf_event_sample *e = event;
+	perf_event_print_fn fn = priv;
 	int ret;
 
-	asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
-	if (data_head == data_tail)
-		return PERF_EVENT_CONT;
-
-	base = ((char *)header) + page_size;
-
-	begin = base + data_tail % buffer_size;
-	end = base + data_head % buffer_size;
-
-	while (begin != end) {
-		struct perf_event_sample *e;
-
-		e = begin;
-		if (begin + e->header.size > base + buffer_size) {
-			long len = base + buffer_size - begin;
-
-			assert(len < e->header.size);
-			memcpy(buf, begin, len);
-			memcpy(buf + len, base, e->header.size - len);
-			e = (void *) buf;
-			begin = base + e->header.size - len;
-		} else if (begin + e->header.size == base + buffer_size) {
-			begin = base;
-		} else {
-			begin += e->header.size;
-		}
-
-		if (e->header.type == PERF_RECORD_SAMPLE) {
-			ret = fn(e->data, e->size);
-			if (ret != PERF_EVENT_CONT)
-				return ret;
-		} else if (e->header.type == PERF_RECORD_LOST) {
-			struct {
-				struct perf_event_header header;
-				__u64 id;
-				__u64 lost;
-			} *lost = (void *) e;
-			printf("lost %lld events\n", lost->lost);
-		} else {
-			printf("unknown event type=%d size=%d\n",
-			       e->header.type, e->header.size);
-		}
+	if (e->header.type == PERF_RECORD_SAMPLE) {
+		ret = fn(e->data, e->size);
+		if (ret != LIBBPF_PERF_EVENT_CONT)
+			return ret;
+	} else if (e->header.type == PERF_RECORD_LOST) {
+		struct {
+			struct perf_event_header header;
+			__u64 id;
+			__u64 lost;
+		} *lost = (void *) e;
+		printf("lost %lld events\n", lost->lost);
+	} else {
+		printf("unknown event type=%d size=%d\n",
+		       e->header.type, e->header.size);
 	}
 
-	__sync_synchronize(); /* smp_mb() */
-	header->data_tail = data_head;
-	return PERF_EVENT_CONT;
+	return LIBBPF_PERF_EVENT_CONT;
 }
 
 int perf_event_poller(int fd, perf_event_print_fn output_fn)
 {
-	int ret;
+	enum bpf_perf_event_ret ret;
+	void *buf = NULL;
+	size_t len = 0;
 
 	for (;;) {
 		perf_event_poll(fd);
-		ret = perf_event_read(output_fn);
-		if (ret != PERF_EVENT_CONT)
-			return ret;
+		ret = bpf_perf_event_read_simple(header, page_cnt * page_size,
+						 page_size, &buf, &len,
+						 bpf_perf_event_print,
+						 output_fn);
+		if (ret != LIBBPF_PERF_EVENT_CONT)
+			break;
 	}
+	free(buf);
 
-	return PERF_EVENT_DONE;
+	return ret;
 }
diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h
index fe3eefd21e86..36d90e3b1ea9 100644
--- a/tools/testing/selftests/bpf/trace_helpers.h
+++ b/tools/testing/selftests/bpf/trace_helpers.h
@@ -2,6 +2,8 @@
 #ifndef __TRACE_HELPER_H
 #define __TRACE_HELPER_H
 
+#include <libbpf.h>
+
 struct ksym {
 	long addr;
 	char *name;
@@ -10,14 +12,9 @@ struct ksym {
 int load_kallsyms(void);
 struct ksym *ksym_search(long key);
 
-typedef int (*perf_event_print_fn)(void *data, int size);
-
-/* return code for perf_event_print_fn */
-#define PERF_EVENT_DONE		0
-#define PERF_EVENT_ERROR	-1
-#define PERF_EVENT_CONT		-2
+typedef enum bpf_perf_event_ret (*perf_event_print_fn)(void *data, int size);
 
 int perf_event_mmap(int fd);
-/* return PERF_EVENT_DONE or PERF_EVENT_ERROR */
+/* return LIBBPF_PERF_EVENT_DONE or LIBBPF_PERF_EVENT_ERROR */
 int perf_event_poller(int fd, perf_event_print_fn output_fn);
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 2eb57bb8f6769fe94f8f9865342ffa0f6c257e0a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 10 May 2018 10:24:41 -0700
Subject: tools: bpf: improve comments in libbpf.h

Fix spelling mistakes, improve and clarify the language of comments
in libbpf.h.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/libbpf.h | 48 ++++++++++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index ce681097584e..4574b9563278 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -52,8 +52,8 @@ enum libbpf_errno {
 int libbpf_strerror(int err, char *buf, size_t size);
 
 /*
- * In include/linux/compiler-gcc.h, __printf is defined. However
- * it should be better if libbpf.h doesn't depend on Linux header file.
+ * __printf is defined in include/linux/compiler-gcc.h. However,
+ * it would be better if libbpf.h didn't depend on Linux header files.
  * So instead of __printf, here we use gcc attribute directly.
  */
 typedef int (*libbpf_print_fn_t)(const char *, ...)
@@ -92,7 +92,7 @@ int bpf_object__set_priv(struct bpf_object *obj, void *priv,
 			 bpf_object_clear_priv_t clear_priv);
 void *bpf_object__priv(struct bpf_object *prog);
 
-/* Accessors of bpf_program. */
+/* Accessors of bpf_program */
 struct bpf_program;
 struct bpf_program *bpf_program__next(struct bpf_program *prog,
 				      struct bpf_object *obj);
@@ -121,28 +121,28 @@ struct bpf_insn;
 
 /*
  * Libbpf allows callers to adjust BPF programs before being loaded
- * into kernel. One program in an object file can be transform into
- * multiple variants to be attached to different code.
+ * into kernel. One program in an object file can be transformed into
+ * multiple variants to be attached to different hooks.
  *
  * bpf_program_prep_t, bpf_program__set_prep and bpf_program__nth_fd
- * are APIs for this propose.
+ * form an API for this purpose.
  *
  * - bpf_program_prep_t:
- *   It defines 'preprocessor', which is a caller defined function
+ *   Defines a 'preprocessor', which is a caller defined function
  *   passed to libbpf through bpf_program__set_prep(), and will be
  *   called before program is loaded. The processor should adjust
- *   the program one time for each instances according to the number
+ *   the program one time for each instance according to the instance id
  *   passed to it.
  *
  * - bpf_program__set_prep:
- *   Attachs a preprocessor to a BPF program. The number of instances
- *   whould be created is also passed through this function.
+ *   Attaches a preprocessor to a BPF program. The number of instances
+ *   that should be created is also passed through this function.
  *
  * - bpf_program__nth_fd:
- *   After the program is loaded, get resuling fds from bpf program for
- *   each instances.
+ *   After the program is loaded, get resulting FD of a given instance
+ *   of the BPF program.
  *
- * If bpf_program__set_prep() is not used, the program whould be loaded
+ * If bpf_program__set_prep() is not used, the program would be loaded
  * without adjustment during bpf_object__load(). The program has only
  * one instance. In this case bpf_program__fd(prog) is equal to
  * bpf_program__nth_fd(prog, 0).
@@ -156,7 +156,7 @@ struct bpf_prog_prep_result {
 	struct bpf_insn *new_insn_ptr;
 	int new_insn_cnt;
 
-	/* If not NULL, result fd is set to it */
+	/* If not NULL, result FD is written to it. */
 	int *pfd;
 };
 
@@ -169,8 +169,8 @@ struct bpf_prog_prep_result {
  *  - res:	Output parameter, result of transformation.
  *
  * Return value:
- *  - Zero: pre-processing success.
- *  - Non-zero: pre-processing, stop loading.
+ *  - Zero:	pre-processing success.
+ *  - Non-zero:	pre-processing error, stop loading.
  */
 typedef int (*bpf_program_prep_t)(struct bpf_program *prog, int n,
 				  struct bpf_insn *insns, int insns_cnt,
@@ -182,7 +182,7 @@ int bpf_program__set_prep(struct bpf_program *prog, int nr_instance,
 int bpf_program__nth_fd(struct bpf_program *prog, int n);
 
 /*
- * Adjust type of bpf program. Default is kprobe.
+ * Adjust type of BPF program. Default is kprobe.
  */
 int bpf_program__set_socket_filter(struct bpf_program *prog);
 int bpf_program__set_tracepoint(struct bpf_program *prog);
@@ -206,10 +206,10 @@ bool bpf_program__is_xdp(struct bpf_program *prog);
 bool bpf_program__is_perf_event(struct bpf_program *prog);
 
 /*
- * We don't need __attribute__((packed)) now since it is
- * unnecessary for 'bpf_map_def' because they are all aligned.
- * In addition, using it will trigger -Wpacked warning message,
- * and will be treated as an error due to -Werror.
+ * No need for __attribute__((packed)), all members of 'bpf_map_def'
+ * are all aligned.  In addition, using __attribute__((packed))
+ * would trigger a -Wpacked warning message, and lead to an error
+ * if -Werror is set.
  */
 struct bpf_map_def {
 	unsigned int type;
@@ -220,8 +220,8 @@ struct bpf_map_def {
 };
 
 /*
- * There is another 'struct bpf_map' in include/linux/map.h. However,
- * it is not a uapi header so no need to consider name clash.
+ * The 'struct bpf_map' in include/linux/bpf.h is internal to the kernel,
+ * so no need to worry about a name clash.
  */
 struct bpf_map;
 struct bpf_map *
@@ -229,7 +229,7 @@ bpf_object__find_map_by_name(struct bpf_object *obj, const char *name);
 
 /*
  * Get bpf_map through the offset of corresponding struct bpf_map_def
- * in the bpf object file.
+ * in the BPF object file.
  */
 struct bpf_map *
 bpf_object__find_map_by_offset(struct bpf_object *obj, size_t offset);
-- 
cgit v1.2.3-59-g8ed1b


From 17387dd5ac2c9c4e5f7d38b5affcd70e059f0a8c Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 10 May 2018 10:24:42 -0700
Subject: tools: bpf: don't complain about no kernel version for networking
 code

BPF programs only have to specify the target kernel version for
tracing related hooks, in networking world that requirement does
not really apply.  Loosen the checks in libbpf to reflect that.

bpf_object__open() users will continue to see the error for backward
compatibility (and because prog_type is not available there).

Error code for NULL file name is changed from ENOENT to EINVAL,
as it seems more appropriate, hopefully, that's an OK change.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/libbpf.c | 46 +++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 39 insertions(+), 7 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index ce96f1fe3f37..df54c4c9e48a 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1438,9 +1438,37 @@ bpf_object__load_progs(struct bpf_object *obj)
 	return 0;
 }
 
-static int bpf_object__validate(struct bpf_object *obj)
+static bool bpf_prog_type__needs_kver(enum bpf_prog_type type)
+{
+	switch (type) {
+	case BPF_PROG_TYPE_SOCKET_FILTER:
+	case BPF_PROG_TYPE_SCHED_CLS:
+	case BPF_PROG_TYPE_SCHED_ACT:
+	case BPF_PROG_TYPE_XDP:
+	case BPF_PROG_TYPE_CGROUP_SKB:
+	case BPF_PROG_TYPE_CGROUP_SOCK:
+	case BPF_PROG_TYPE_LWT_IN:
+	case BPF_PROG_TYPE_LWT_OUT:
+	case BPF_PROG_TYPE_LWT_XMIT:
+	case BPF_PROG_TYPE_SOCK_OPS:
+	case BPF_PROG_TYPE_SK_SKB:
+	case BPF_PROG_TYPE_CGROUP_DEVICE:
+	case BPF_PROG_TYPE_SK_MSG:
+	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+		return false;
+	case BPF_PROG_TYPE_UNSPEC:
+	case BPF_PROG_TYPE_KPROBE:
+	case BPF_PROG_TYPE_TRACEPOINT:
+	case BPF_PROG_TYPE_PERF_EVENT:
+	case BPF_PROG_TYPE_RAW_TRACEPOINT:
+	default:
+		return true;
+	}
+}
+
+static int bpf_object__validate(struct bpf_object *obj, bool needs_kver)
 {
-	if (obj->kern_version == 0) {
+	if (needs_kver && obj->kern_version == 0) {
 		pr_warning("%s doesn't provide kernel version\n",
 			   obj->path);
 		return -LIBBPF_ERRNO__KVERSION;
@@ -1449,7 +1477,8 @@ static int bpf_object__validate(struct bpf_object *obj)
 }
 
 static struct bpf_object *
-__bpf_object__open(const char *path, void *obj_buf, size_t obj_buf_sz)
+__bpf_object__open(const char *path, void *obj_buf, size_t obj_buf_sz,
+		   bool needs_kver)
 {
 	struct bpf_object *obj;
 	int err;
@@ -1467,7 +1496,7 @@ __bpf_object__open(const char *path, void *obj_buf, size_t obj_buf_sz)
 	CHECK_ERR(bpf_object__check_endianness(obj), err, out);
 	CHECK_ERR(bpf_object__elf_collect(obj), err, out);
 	CHECK_ERR(bpf_object__collect_reloc(obj), err, out);
-	CHECK_ERR(bpf_object__validate(obj), err, out);
+	CHECK_ERR(bpf_object__validate(obj, needs_kver), err, out);
 
 	bpf_object__elf_finish(obj);
 	return obj;
@@ -1484,7 +1513,7 @@ struct bpf_object *bpf_object__open(const char *path)
 
 	pr_debug("loading %s\n", path);
 
-	return __bpf_object__open(path, NULL, 0);
+	return __bpf_object__open(path, NULL, 0, true);
 }
 
 struct bpf_object *bpf_object__open_buffer(void *obj_buf,
@@ -1507,7 +1536,7 @@ struct bpf_object *bpf_object__open_buffer(void *obj_buf,
 	pr_debug("loading object '%s' from buffer\n",
 		 name);
 
-	return __bpf_object__open(name, obj_buf, obj_buf_sz);
+	return __bpf_object__open(name, obj_buf, obj_buf_sz, true);
 }
 
 int bpf_object__unload(struct bpf_object *obj)
@@ -2164,8 +2193,11 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
 
 	if (!attr)
 		return -EINVAL;
+	if (!attr->file)
+		return -EINVAL;
 
-	obj = bpf_object__open(attr->file);
+	obj = __bpf_object__open(attr->file, NULL, 0,
+				 bpf_prog_type__needs_kver(attr->prog_type));
 	if (IS_ERR(obj))
 		return -ENOENT;
 
-- 
cgit v1.2.3-59-g8ed1b


From be5bca44aa6b37b88e900d5f5f155911d6984d86 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 10 May 2018 10:24:43 -0700
Subject: samples: bpf: convert some XDP samples from bpf_load to libbpf

Now that we can use full powers of libbpf in BPF samples, we
should perhaps make the simplest XDP programs not depend on
bpf_load helpers.  This way newcomers will be exposed to the
recommended library from the start.

Use of bpf_prog_load_xattr() will also make it trivial to later
on request offload of the programs by simply adding ifindex to
the xattr.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/bpf/Makefile               |  8 +++----
 samples/bpf/xdp1_user.c            | 31 ++++++++++++++++---------
 samples/bpf/xdp_adjust_tail_user.c | 36 +++++++++++++++++------------
 samples/bpf/xdp_rxq_info_user.c    | 46 +++++++++++++++++++++++++-------------
 4 files changed, 78 insertions(+), 43 deletions(-)

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 8ce72d211c3e..9e255ca4059a 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -79,9 +79,9 @@ test_cgrp2_attach-objs := test_cgrp2_attach.o $(LIBBPF)
 test_cgrp2_attach2-objs := test_cgrp2_attach2.o $(LIBBPF) $(CGROUP_HELPERS)
 test_cgrp2_sock-objs := test_cgrp2_sock.o $(LIBBPF)
 test_cgrp2_sock2-objs := bpf_load.o $(LIBBPF) test_cgrp2_sock2.o
-xdp1-objs := bpf_load.o $(LIBBPF) xdp1_user.o
+xdp1-objs := xdp1_user.o $(LIBBPF)
 # reuse xdp1 source intentionally
-xdp2-objs := bpf_load.o $(LIBBPF) xdp1_user.o
+xdp2-objs := xdp1_user.o $(LIBBPF)
 xdp_router_ipv4-objs := bpf_load.o $(LIBBPF) xdp_router_ipv4_user.o
 test_current_task_under_cgroup-objs := bpf_load.o $(LIBBPF) $(CGROUP_HELPERS) \
 				       test_current_task_under_cgroup_user.o
@@ -96,10 +96,10 @@ xdp_redirect-objs := bpf_load.o $(LIBBPF) xdp_redirect_user.o
 xdp_redirect_map-objs := bpf_load.o $(LIBBPF) xdp_redirect_map_user.o
 xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) xdp_redirect_cpu_user.o
 xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o
-xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o
+xdp_rxq_info-objs := xdp_rxq_info_user.o $(LIBBPF)
 syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
 cpustat-objs := bpf_load.o $(LIBBPF) cpustat_user.o
-xdp_adjust_tail-objs := bpf_load.o $(LIBBPF) xdp_adjust_tail_user.o
+xdp_adjust_tail-objs := xdp_adjust_tail_user.o $(LIBBPF)
 xdpsock-objs := bpf_load.o $(LIBBPF) xdpsock_user.o
 xdp_fwd-objs := bpf_load.o $(LIBBPF) xdp_fwd_user.o
 
diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c
index b901ee2b3336..b02c531510ed 100644
--- a/samples/bpf/xdp1_user.c
+++ b/samples/bpf/xdp1_user.c
@@ -16,9 +16,9 @@
 #include <libgen.h>
 #include <sys/resource.h>
 
-#include "bpf_load.h"
 #include "bpf_util.h"
-#include "libbpf.h"
+#include "bpf/bpf.h"
+#include "bpf/libbpf.h"
 
 static int ifindex;
 static __u32 xdp_flags;
@@ -31,7 +31,7 @@ static void int_exit(int sig)
 
 /* simple per-protocol drop counter
  */
-static void poll_stats(int interval)
+static void poll_stats(int map_fd, int interval)
 {
 	unsigned int nr_cpus = bpf_num_possible_cpus();
 	const unsigned int nr_keys = 256;
@@ -47,7 +47,7 @@ static void poll_stats(int interval)
 		for (key = 0; key < nr_keys; key++) {
 			__u64 sum = 0;
 
-			assert(bpf_map_lookup_elem(map_fd[0], &key, values) == 0);
+			assert(bpf_map_lookup_elem(map_fd, &key, values) == 0);
 			for (i = 0; i < nr_cpus; i++)
 				sum += (values[i] - prev[key][i]);
 			if (sum)
@@ -71,9 +71,14 @@ static void usage(const char *prog)
 int main(int argc, char **argv)
 {
 	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+	struct bpf_prog_load_attr prog_load_attr = {
+		.prog_type	= BPF_PROG_TYPE_XDP,
+	};
 	const char *optstr = "SN";
+	int prog_fd, map_fd, opt;
+	struct bpf_object *obj;
+	struct bpf_map *map;
 	char filename[256];
-	int opt;
 
 	while ((opt = getopt(argc, argv, optstr)) != -1) {
 		switch (opt) {
@@ -102,13 +107,19 @@ int main(int argc, char **argv)
 	ifindex = strtoul(argv[optind], NULL, 0);
 
 	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	prog_load_attr.file = filename;
 
-	if (load_bpf_file(filename)) {
-		printf("%s", bpf_log_buf);
+	if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
+		return 1;
+
+	map = bpf_map__next(NULL, obj);
+	if (!map) {
+		printf("finding a map in obj file failed\n");
 		return 1;
 	}
+	map_fd = bpf_map__fd(map);
 
-	if (!prog_fd[0]) {
+	if (!prog_fd) {
 		printf("load_bpf_file: %s\n", strerror(errno));
 		return 1;
 	}
@@ -116,12 +127,12 @@ int main(int argc, char **argv)
 	signal(SIGINT, int_exit);
 	signal(SIGTERM, int_exit);
 
-	if (bpf_set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) {
+	if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) {
 		printf("link set xdp fd failed\n");
 		return 1;
 	}
 
-	poll_stats(2);
+	poll_stats(map_fd, 2);
 
 	return 0;
 }
diff --git a/samples/bpf/xdp_adjust_tail_user.c b/samples/bpf/xdp_adjust_tail_user.c
index f621a541b574..3042ce37dae8 100644
--- a/samples/bpf/xdp_adjust_tail_user.c
+++ b/samples/bpf/xdp_adjust_tail_user.c
@@ -18,9 +18,8 @@
 #include <netinet/ether.h>
 #include <unistd.h>
 #include <time.h>
-#include "bpf_load.h"
-#include "libbpf.h"
-#include "bpf_util.h"
+#include "bpf/bpf.h"
+#include "bpf/libbpf.h"
 
 #define STATS_INTERVAL_S 2U
 
@@ -36,7 +35,7 @@ static void int_exit(int sig)
 
 /* simple "icmp packet too big sent" counter
  */
-static void poll_stats(unsigned int kill_after_s)
+static void poll_stats(unsigned int map_fd, unsigned int kill_after_s)
 {
 	time_t started_at = time(NULL);
 	__u64 value = 0;
@@ -46,7 +45,7 @@ static void poll_stats(unsigned int kill_after_s)
 	while (!kill_after_s || time(NULL) - started_at <= kill_after_s) {
 		sleep(STATS_INTERVAL_S);
 
-		assert(bpf_map_lookup_elem(map_fd[0], &key, &value) == 0);
+		assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0);
 
 		printf("icmp \"packet too big\" sent: %10llu pkts\n", value);
 	}
@@ -66,14 +65,17 @@ static void usage(const char *cmd)
 
 int main(int argc, char **argv)
 {
+	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+	struct bpf_prog_load_attr prog_load_attr = {
+		.prog_type	= BPF_PROG_TYPE_XDP,
+	};
 	unsigned char opt_flags[256] = {};
 	unsigned int kill_after_s = 0;
 	const char *optstr = "i:T:SNh";
-	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+	int i, prog_fd, map_fd, opt;
+	struct bpf_object *obj;
+	struct bpf_map *map;
 	char filename[256];
-	int opt;
-	int i;
-
 
 	for (i = 0; i < strlen(optstr); i++)
 		if (optstr[i] != 'h' && 'a' <= optstr[i] && optstr[i] <= 'z')
@@ -115,13 +117,19 @@ int main(int argc, char **argv)
 	}
 
 	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	prog_load_attr.file = filename;
+
+	if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
+		return 1;
 
-	if (load_bpf_file(filename)) {
-		printf("%s", bpf_log_buf);
+	map = bpf_map__next(NULL, obj);
+	if (!map) {
+		printf("finding a map in obj file failed\n");
 		return 1;
 	}
+	map_fd = bpf_map__fd(map);
 
-	if (!prog_fd[0]) {
+	if (!prog_fd) {
 		printf("load_bpf_file: %s\n", strerror(errno));
 		return 1;
 	}
@@ -129,12 +137,12 @@ int main(int argc, char **argv)
 	signal(SIGINT, int_exit);
 	signal(SIGTERM, int_exit);
 
-	if (bpf_set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) {
+	if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) {
 		printf("link set xdp fd failed\n");
 		return 1;
 	}
 
-	poll_stats(kill_after_s);
+	poll_stats(map_fd, kill_after_s);
 
 	bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
 
diff --git a/samples/bpf/xdp_rxq_info_user.c b/samples/bpf/xdp_rxq_info_user.c
index 478d95412de4..e4e9ba52bff0 100644
--- a/samples/bpf/xdp_rxq_info_user.c
+++ b/samples/bpf/xdp_rxq_info_user.c
@@ -22,8 +22,8 @@ static const char *__doc__ = " XDP RX-queue info extract example\n\n"
 #include <arpa/inet.h>
 #include <linux/if_link.h>
 
-#include "libbpf.h"
-#include "bpf_load.h"
+#include "bpf/bpf.h"
+#include "bpf/libbpf.h"
 #include "bpf_util.h"
 
 static int ifindex = -1;
@@ -32,6 +32,9 @@ static char *ifname;
 
 static __u32 xdp_flags;
 
+static struct bpf_map *stats_global_map;
+static struct bpf_map *rx_queue_index_map;
+
 /* Exit return codes */
 #define EXIT_OK		0
 #define EXIT_FAIL		1
@@ -174,7 +177,7 @@ static struct datarec *alloc_record_per_cpu(void)
 
 static struct record *alloc_record_per_rxq(void)
 {
-	unsigned int nr_rxqs = map_data[2].def.max_entries;
+	unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries;
 	struct record *array;
 	size_t size;
 
@@ -190,7 +193,7 @@ static struct record *alloc_record_per_rxq(void)
 
 static struct stats_record *alloc_stats_record(void)
 {
-	unsigned int nr_rxqs = map_data[2].def.max_entries;
+	unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries;
 	struct stats_record *rec;
 	int i;
 
@@ -210,7 +213,7 @@ static struct stats_record *alloc_stats_record(void)
 
 static void free_stats_record(struct stats_record *r)
 {
-	unsigned int nr_rxqs = map_data[2].def.max_entries;
+	unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries;
 	int i;
 
 	for (i = 0; i < nr_rxqs; i++)
@@ -254,11 +257,11 @@ static void stats_collect(struct stats_record *rec)
 {
 	int fd, i, max_rxqs;
 
-	fd = map_data[1].fd; /* map: stats_global_map */
+	fd = bpf_map__fd(stats_global_map);
 	map_collect_percpu(fd, 0, &rec->stats);
 
-	fd = map_data[2].fd; /* map: rx_queue_index_map */
-	max_rxqs = map_data[2].def.max_entries;
+	fd = bpf_map__fd(rx_queue_index_map);
+	max_rxqs = bpf_map__def(rx_queue_index_map)->max_entries;
 	for (i = 0; i < max_rxqs; i++)
 		map_collect_percpu(fd, i, &rec->rxq[i]);
 }
@@ -304,8 +307,8 @@ static void stats_print(struct stats_record *stats_rec,
 			struct stats_record *stats_prev,
 			int action)
 {
+	unsigned int nr_rxqs = bpf_map__def(rx_queue_index_map)->max_entries;
 	unsigned int nr_cpus = bpf_num_possible_cpus();
-	unsigned int nr_rxqs = map_data[2].def.max_entries;
 	double pps = 0, err = 0;
 	struct record *rec, *prev;
 	double t;
@@ -419,31 +422,44 @@ static void stats_poll(int interval, int action)
 int main(int argc, char **argv)
 {
 	struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
+	struct bpf_prog_load_attr prog_load_attr = {
+		.prog_type	= BPF_PROG_TYPE_XDP,
+	};
+	int prog_fd, map_fd, opt, err;
 	bool use_separators = true;
 	struct config cfg = { 0 };
+	struct bpf_object *obj;
+	struct bpf_map *map;
 	char filename[256];
 	int longindex = 0;
 	int interval = 2;
 	__u32 key = 0;
-	int opt, err;
 
 	char action_str_buf[XDP_ACTION_MAX_STRLEN + 1 /* for \0 */] = { 0 };
 	int action = XDP_PASS; /* Default action */
 	char *action_str = NULL;
 
 	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	prog_load_attr.file = filename;
 
 	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
 		perror("setrlimit(RLIMIT_MEMLOCK)");
 		return 1;
 	}
 
-	if (load_bpf_file(filename)) {
-		fprintf(stderr, "ERR in load_bpf_file(): %s", bpf_log_buf);
+	if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
+		return EXIT_FAIL;
+
+	map = bpf_map__next(NULL, obj);
+	stats_global_map = bpf_map__next(map, obj);
+	rx_queue_index_map = bpf_map__next(stats_global_map, obj);
+	if (!map || !stats_global_map || !rx_queue_index_map) {
+		printf("finding a map in obj file failed\n");
 		return EXIT_FAIL;
 	}
+	map_fd = bpf_map__fd(map);
 
-	if (!prog_fd[0]) {
+	if (!prog_fd) {
 		fprintf(stderr, "ERR: load_bpf_file: %s\n", strerror(errno));
 		return EXIT_FAIL;
 	}
@@ -512,7 +528,7 @@ int main(int argc, char **argv)
 		setlocale(LC_NUMERIC, "en_US");
 
 	/* User-side setup ifindex in config_map */
-	err = bpf_map_update_elem(map_fd[0], &key, &cfg, 0);
+	err = bpf_map_update_elem(map_fd, &key, &cfg, 0);
 	if (err) {
 		fprintf(stderr, "Store config failed (err:%d)\n", err);
 		exit(EXIT_FAIL_BPF);
@@ -521,7 +537,7 @@ int main(int argc, char **argv)
 	/* Remove XDP program when program is interrupted */
 	signal(SIGINT, int_exit);
 
-	if (bpf_set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) {
+	if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) {
 		fprintf(stderr, "link set xdp fd failed\n");
 		return EXIT_FAIL_XDP;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From bb322a90383603aaf71ad882a7c5a48b39a02603 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 10 May 2018 13:17:29 -0700
Subject: net: phy: phylink: Use gpiod_get_value_cansleep()

The GPIO provider for the link GPIO line might require the use of the
_cansleep() API, utilize that. This is safe to do since we run in workqueue
context.

Fixes: 9525ae83959b ("phylink: add phylink infrastructure")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index c582b2d7546c..412d1cf4fa66 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -360,7 +360,7 @@ static void phylink_get_fixed_state(struct phylink *pl, struct phylink_link_stat
 	if (pl->get_fixed_state)
 		pl->get_fixed_state(pl->netdev, state);
 	else if (pl->link_gpio)
-		state->link = !!gpiod_get_value(pl->link_gpio);
+		state->link = !!gpiod_get_value_cansleep(pl->link_gpio);
 }
 
 /* Flow control is resolved according to our and the link partners
-- 
cgit v1.2.3-59-g8ed1b


From daab3349ad1a69663ccad278ed71d55974d104b4 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 10 May 2018 13:17:30 -0700
Subject: net: phy: phylink: Release link GPIO

We are not releasing the link GPIO descriptor with gpiod_put() which results in
subsequent probing to get -EBUSY when calling fwnode_get_named_gpiod(). Fix this
by doing the release in phylink_destroy().

Fixes: 9525ae83959b ("phylink: add phylink infrastructure")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 412d1cf4fa66..6392b5248cf5 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -612,6 +612,8 @@ void phylink_destroy(struct phylink *pl)
 {
 	if (pl->sfp_bus)
 		sfp_unregister_upstream(pl->sfp_bus);
+	if (!IS_ERR(pl->link_gpio))
+		gpiod_put(pl->link_gpio);
 
 	cancel_work_sync(&pl->resolve);
 	kfree(pl);
-- 
cgit v1.2.3-59-g8ed1b


From 9cd00a8aa42e44a5a3f555e4999803856a7871f5 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Thu, 10 May 2018 13:17:31 -0700
Subject: net: phy: phylink: Poll link GPIOs

When using a fixed link with a link GPIO, we need to poll that GPIO to
determine link state changes. This is consistent with what fixed_phy.c does.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 6392b5248cf5..581ce93ecaf9 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -19,6 +19,7 @@
 #include <linux/phylink.h>
 #include <linux/rtnetlink.h>
 #include <linux/spinlock.h>
+#include <linux/timer.h>
 #include <linux/workqueue.h>
 
 #include "sfp.h"
@@ -54,6 +55,7 @@ struct phylink {
 	/* The link configuration settings */
 	struct phylink_link_state link_config;
 	struct gpio_desc *link_gpio;
+	struct timer_list link_poll;
 	void (*get_fixed_state)(struct net_device *dev,
 				struct phylink_link_state *s);
 
@@ -500,6 +502,15 @@ static void phylink_run_resolve(struct phylink *pl)
 		queue_work(system_power_efficient_wq, &pl->resolve);
 }
 
+static void phylink_fixed_poll(struct timer_list *t)
+{
+	struct phylink *pl = container_of(t, struct phylink, link_poll);
+
+	mod_timer(t, jiffies + HZ);
+
+	phylink_run_resolve(pl);
+}
+
 static const struct sfp_upstream_ops sfp_phylink_ops;
 
 static int phylink_register_sfp(struct phylink *pl,
@@ -572,6 +583,7 @@ struct phylink *phylink_create(struct net_device *ndev,
 	pl->link_config.an_enabled = true;
 	pl->ops = ops;
 	__set_bit(PHYLINK_DISABLE_STOPPED, &pl->phylink_disable_state);
+	timer_setup(&pl->link_poll, phylink_fixed_poll, 0);
 
 	bitmap_fill(pl->supported, __ETHTOOL_LINK_MODE_MASK_NBITS);
 	linkmode_copy(pl->link_config.advertising, pl->supported);
@@ -905,6 +917,8 @@ void phylink_start(struct phylink *pl)
 	clear_bit(PHYLINK_DISABLE_STOPPED, &pl->phylink_disable_state);
 	phylink_run_resolve(pl);
 
+	if (pl->link_an_mode == MLO_AN_FIXED && !IS_ERR(pl->link_gpio))
+		mod_timer(&pl->link_poll, jiffies + HZ);
 	if (pl->sfp_bus)
 		sfp_upstream_start(pl->sfp_bus);
 	if (pl->phydev)
@@ -929,6 +943,8 @@ void phylink_stop(struct phylink *pl)
 		phy_stop(pl->phydev);
 	if (pl->sfp_bus)
 		sfp_upstream_stop(pl->sfp_bus);
+	if (pl->link_an_mode == MLO_AN_FIXED && !IS_ERR(pl->link_gpio))
+		del_timer_sync(&pl->link_poll);
 
 	set_bit(PHYLINK_DISABLE_STOPPED, &pl->phylink_disable_state);
 	queue_work(system_power_efficient_wq, &pl->resolve);
-- 
cgit v1.2.3-59-g8ed1b


From 11d8f3ddab1e2b0f148def287859d0903b7f8ac5 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 10 May 2018 13:17:32 -0700
Subject: net: dsa: Add PHYLINK switch operations

In preparation for adding support for PHYLINK within DSA, define a number of
operations that we will need and that switch drivers can start implementing.
Proper integration with PHYLINK will follow in subsequent patches.

We start selecting PHYLINK (which implies PHYLIB) in net/dsa/Kconfig
such that drivers can be guaranteed that this dependency is properly
taken care of and can start referencing PHYLINK helper functions without
requiring stubs or anything.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 24 ++++++++++++++++++++++++
 net/dsa/Kconfig   |  2 +-
 net/dsa/slave.c   |  5 +++++
 3 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 462e9741b210..ed64c1f3f117 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -20,12 +20,14 @@
 #include <linux/of.h>
 #include <linux/ethtool.h>
 #include <linux/net_tstamp.h>
+#include <linux/phy.h>
 #include <net/devlink.h>
 #include <net/switchdev.h>
 
 struct tc_action;
 struct phy_device;
 struct fixed_phy_status;
+struct phylink_link_state;
 
 enum dsa_tag_protocol {
 	DSA_TAG_PROTO_NONE = 0,
@@ -353,6 +355,27 @@ struct dsa_switch_ops {
 	void	(*fixed_link_update)(struct dsa_switch *ds, int port,
 				struct fixed_phy_status *st);
 
+	/*
+	 * PHYLINK integration
+	 */
+	void	(*phylink_validate)(struct dsa_switch *ds, int port,
+				    unsigned long *supported,
+				    struct phylink_link_state *state);
+	int	(*phylink_mac_link_state)(struct dsa_switch *ds, int port,
+					  struct phylink_link_state *state);
+	void	(*phylink_mac_config)(struct dsa_switch *ds, int port,
+				      unsigned int mode,
+				      const struct phylink_link_state *state);
+	void	(*phylink_mac_an_restart)(struct dsa_switch *ds, int port);
+	void	(*phylink_mac_link_down)(struct dsa_switch *ds, int port,
+					 unsigned int mode,
+					 phy_interface_t interface);
+	void	(*phylink_mac_link_up)(struct dsa_switch *ds, int port,
+				       unsigned int mode,
+				       phy_interface_t interface,
+				       struct phy_device *phydev);
+	void	(*phylink_fixed_state)(struct dsa_switch *ds, int port,
+				       struct phylink_link_state *state);
 	/*
 	 * ethtool hardware statistics.
 	 */
@@ -595,5 +618,6 @@ static inline int call_dsa_notifiers(unsigned long val, struct net_device *dev,
 int dsa_port_get_phy_strings(struct dsa_port *dp, uint8_t *data);
 int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data);
 int dsa_port_get_phy_sset_count(struct dsa_port *dp);
+void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up);
 
 #endif
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index bbf2c82cf7b2..4183e4ba27a5 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -9,7 +9,7 @@ config NET_DSA
 	depends on HAVE_NET_DSA && MAY_USE_DEVLINK
 	depends on BRIDGE || BRIDGE=n
 	select NET_SWITCHDEV
-	select PHYLIB
+	select PHYLINK
 	---help---
 	  Say Y if you want to enable support for the hardware switches supported
 	  by the Distributed Switch Architecture.
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 746ab428a17a..6c2f042e3c29 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1119,6 +1119,11 @@ static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr)
 				  dsa_slave_adjust_link, p->phy_interface);
 }
 
+void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up)
+{
+}
+EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change);
+
 static int dsa_slave_phy_setup(struct net_device *slave_dev)
 {
 	struct dsa_port *dp = dsa_slave_to_port(slave_dev);
-- 
cgit v1.2.3-59-g8ed1b


From bc0cb653a3f06b5a1ee9f12cc1c367a8736aca28 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 10 May 2018 13:17:33 -0700
Subject: net: dsa: bcm_sf2: Implement phylink_mac_ops

Make the bcm_sf2 driver implement phylink_mac_ops since it needs to
support a wide variety of network interfaces: internal & external MDIO
PHYs, fixed PHYs, MoCA with MMIO link status.

A large amount of what needs to be done already exists under
bcm_sf2_sw_adjust_link() so we are essentially breaking this down into
the necessary operation for PHYLINK to work: mac_config, mac_link_up,
mac_link_down and validate. We can now entirely get rid of most of what
fixed_link_update() provided because only the link information is actually
necessary. We still have to force DUPLEX_FULL for legacy Device Tree bindings
that did not specify that before.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/bcm_sf2.c | 214 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 206 insertions(+), 8 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 97236cfcbae4..a20608b0329e 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -16,6 +16,7 @@
 #include <linux/platform_device.h>
 #include <linux/phy.h>
 #include <linux/phy_fixed.h>
+#include <linux/phylink.h>
 #include <linux/mii.h>
 #include <linux/of.h>
 #include <linux/of_irq.h>
@@ -306,7 +307,8 @@ static int bcm_sf2_sw_mdio_write(struct mii_bus *bus, int addr, int regnum,
 
 static irqreturn_t bcm_sf2_switch_0_isr(int irq, void *dev_id)
 {
-	struct bcm_sf2_priv *priv = dev_id;
+	struct dsa_switch *ds = dev_id;
+	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
 
 	priv->irq0_stat = intrl2_0_readl(priv, INTRL2_CPU_STATUS) &
 				~priv->irq0_mask;
@@ -317,16 +319,21 @@ static irqreturn_t bcm_sf2_switch_0_isr(int irq, void *dev_id)
 
 static irqreturn_t bcm_sf2_switch_1_isr(int irq, void *dev_id)
 {
-	struct bcm_sf2_priv *priv = dev_id;
+	struct dsa_switch *ds = dev_id;
+	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
 
 	priv->irq1_stat = intrl2_1_readl(priv, INTRL2_CPU_STATUS) &
 				~priv->irq1_mask;
 	intrl2_1_writel(priv, priv->irq1_stat, INTRL2_CPU_CLEAR);
 
-	if (priv->irq1_stat & P_LINK_UP_IRQ(P7_IRQ_OFF))
-		priv->port_sts[7].link = 1;
-	if (priv->irq1_stat & P_LINK_DOWN_IRQ(P7_IRQ_OFF))
-		priv->port_sts[7].link = 0;
+	if (priv->irq1_stat & P_LINK_UP_IRQ(P7_IRQ_OFF)) {
+		priv->port_sts[7].link = true;
+		dsa_port_phylink_mac_change(ds, 7, true);
+	}
+	if (priv->irq1_stat & P_LINK_DOWN_IRQ(P7_IRQ_OFF)) {
+		priv->port_sts[7].link = false;
+		dsa_port_phylink_mac_change(ds, 7, false);
+	}
 
 	return IRQ_HANDLED;
 }
@@ -620,6 +627,192 @@ static void bcm_sf2_sw_fixed_link_update(struct dsa_switch *ds, int port,
 		status->pause = 1;
 }
 
+static void bcm_sf2_sw_validate(struct dsa_switch *ds, int port,
+				unsigned long *supported,
+				struct phylink_link_state *state)
+{
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
+
+	if (!phy_interface_mode_is_rgmii(state->interface) &&
+	    state->interface != PHY_INTERFACE_MODE_MII &&
+	    state->interface != PHY_INTERFACE_MODE_REVMII &&
+	    state->interface != PHY_INTERFACE_MODE_GMII &&
+	    state->interface != PHY_INTERFACE_MODE_INTERNAL &&
+	    state->interface != PHY_INTERFACE_MODE_MOCA) {
+		bitmap_zero(supported, __ETHTOOL_LINK_MODE_MASK_NBITS);
+		dev_err(ds->dev,
+			"Unsupported interface: %d\n", state->interface);
+		return;
+	}
+
+	/* Allow all the expected bits */
+	phylink_set(mask, Autoneg);
+	phylink_set_port_modes(mask);
+	phylink_set(mask, Pause);
+	phylink_set(mask, Asym_Pause);
+
+	/* With the exclusion of MII and Reverse MII, we support Gigabit,
+	 * including Half duplex
+	 */
+	if (state->interface != PHY_INTERFACE_MODE_MII &&
+	    state->interface != PHY_INTERFACE_MODE_REVMII) {
+		phylink_set(mask, 1000baseT_Full);
+		phylink_set(mask, 1000baseT_Half);
+	}
+
+	phylink_set(mask, 10baseT_Half);
+	phylink_set(mask, 10baseT_Full);
+	phylink_set(mask, 100baseT_Half);
+	phylink_set(mask, 100baseT_Full);
+
+	bitmap_and(supported, supported, mask,
+		   __ETHTOOL_LINK_MODE_MASK_NBITS);
+	bitmap_and(state->advertising, state->advertising, mask,
+		   __ETHTOOL_LINK_MODE_MASK_NBITS);
+}
+
+static void bcm_sf2_sw_mac_config(struct dsa_switch *ds, int port,
+				  unsigned int mode,
+				  const struct phylink_link_state *state)
+{
+	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
+	u32 id_mode_dis = 0, port_mode;
+	u32 reg, offset;
+
+	if (priv->type == BCM7445_DEVICE_ID)
+		offset = CORE_STS_OVERRIDE_GMIIP_PORT(port);
+	else
+		offset = CORE_STS_OVERRIDE_GMIIP2_PORT(port);
+
+	switch (state->interface) {
+	case PHY_INTERFACE_MODE_RGMII:
+		id_mode_dis = 1;
+		/* fallthrough */
+	case PHY_INTERFACE_MODE_RGMII_TXID:
+		port_mode = EXT_GPHY;
+		break;
+	case PHY_INTERFACE_MODE_MII:
+		port_mode = EXT_EPHY;
+		break;
+	case PHY_INTERFACE_MODE_REVMII:
+		port_mode = EXT_REVMII;
+		break;
+	default:
+		/* all other PHYs: internal and MoCA */
+		goto force_link;
+	}
+
+	/* Clear id_mode_dis bit, and the existing port mode, let
+	 * RGMII_MODE_EN bet set by mac_link_{up,down}
+	 */
+	reg = reg_readl(priv, REG_RGMII_CNTRL_P(port));
+	reg &= ~ID_MODE_DIS;
+	reg &= ~(PORT_MODE_MASK << PORT_MODE_SHIFT);
+	reg &= ~(RX_PAUSE_EN | TX_PAUSE_EN);
+
+	reg |= port_mode;
+	if (id_mode_dis)
+		reg |= ID_MODE_DIS;
+
+	if (state->pause & MLO_PAUSE_TXRX_MASK) {
+		if (state->pause & MLO_PAUSE_TX)
+			reg |= TX_PAUSE_EN;
+		reg |= RX_PAUSE_EN;
+	}
+
+	reg_writel(priv, reg, REG_RGMII_CNTRL_P(port));
+
+force_link:
+	/* Force link settings detected from the PHY */
+	reg = SW_OVERRIDE;
+	switch (state->speed) {
+	case SPEED_1000:
+		reg |= SPDSTS_1000 << SPEED_SHIFT;
+		break;
+	case SPEED_100:
+		reg |= SPDSTS_100 << SPEED_SHIFT;
+		break;
+	}
+
+	if (state->link)
+		reg |= LINK_STS;
+	if (state->duplex == DUPLEX_FULL)
+		reg |= DUPLX_MODE;
+
+	core_writel(priv, reg, offset);
+}
+
+static void bcm_sf2_sw_mac_link_set(struct dsa_switch *ds, int port,
+				    phy_interface_t interface, bool link)
+{
+	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
+	u32 reg;
+
+	if (!phy_interface_mode_is_rgmii(interface) &&
+	    interface != PHY_INTERFACE_MODE_MII &&
+	    interface != PHY_INTERFACE_MODE_REVMII)
+		return;
+
+	/* If the link is down, just disable the interface to conserve power */
+	reg = reg_readl(priv, REG_RGMII_CNTRL_P(port));
+	if (link)
+		reg |= RGMII_MODE_EN;
+	else
+		reg &= ~RGMII_MODE_EN;
+	reg_writel(priv, reg, REG_RGMII_CNTRL_P(port));
+}
+
+static void bcm_sf2_sw_mac_link_down(struct dsa_switch *ds, int port,
+				     unsigned int mode,
+				     phy_interface_t interface)
+{
+	bcm_sf2_sw_mac_link_set(ds, port, interface, false);
+}
+
+static void bcm_sf2_sw_mac_link_up(struct dsa_switch *ds, int port,
+				   unsigned int mode,
+				   phy_interface_t interface,
+				   struct phy_device *phydev)
+{
+	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
+	struct ethtool_eee *p = &priv->dev->ports[port].eee;
+
+	bcm_sf2_sw_mac_link_set(ds, port, interface, true);
+
+	if (mode == MLO_AN_PHY && phydev)
+		p->eee_enabled = b53_eee_init(ds, port, phydev);
+}
+
+static void bcm_sf2_sw_fixed_state(struct dsa_switch *ds, int port,
+				   struct phylink_link_state *status)
+{
+	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
+
+	status->link = false;
+
+	/* MoCA port is special as we do not get link status from CORE_LNKSTS,
+	 * which means that we need to force the link at the port override
+	 * level to get the data to flow. We do use what the interrupt handler
+	 * did determine before.
+	 *
+	 * For the other ports, we just force the link status, since this is
+	 * a fixed PHY device.
+	 */
+	if (port == priv->moca_port) {
+		status->link = priv->port_sts[port].link;
+		/* For MoCA interfaces, also force a link down notification
+		 * since some version of the user-space daemon (mocad) use
+		 * cmd->autoneg to force the link, which messes up the PHY
+		 * state machine and make it go in PHY_FORCING state instead.
+		 */
+		if (!status->link)
+			netif_carrier_off(ds->ports[port].slave);
+		status->duplex = DUPLEX_FULL;
+	} else {
+		status->link = true;
+	}
+}
+
 static void bcm_sf2_enable_acb(struct dsa_switch *ds)
 {
 	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
@@ -863,6 +1056,11 @@ static const struct dsa_switch_ops bcm_sf2_ops = {
 	.get_phy_flags		= bcm_sf2_sw_get_phy_flags,
 	.adjust_link		= bcm_sf2_sw_adjust_link,
 	.fixed_link_update	= bcm_sf2_sw_fixed_link_update,
+	.phylink_validate	= bcm_sf2_sw_validate,
+	.phylink_mac_config	= bcm_sf2_sw_mac_config,
+	.phylink_mac_link_down	= bcm_sf2_sw_mac_link_down,
+	.phylink_mac_link_up	= bcm_sf2_sw_mac_link_up,
+	.phylink_fixed_state	= bcm_sf2_sw_fixed_state,
 	.suspend		= bcm_sf2_sw_suspend,
 	.resume			= bcm_sf2_sw_resume,
 	.get_wol		= bcm_sf2_sw_get_wol,
@@ -1065,14 +1263,14 @@ static int bcm_sf2_sw_probe(struct platform_device *pdev)
 	bcm_sf2_intr_disable(priv);
 
 	ret = devm_request_irq(&pdev->dev, priv->irq0, bcm_sf2_switch_0_isr, 0,
-			       "switch_0", priv);
+			       "switch_0", ds);
 	if (ret < 0) {
 		pr_err("failed to request switch_0 IRQ\n");
 		goto out_mdio;
 	}
 
 	ret = devm_request_irq(&pdev->dev, priv->irq1, bcm_sf2_switch_1_isr, 0,
-			       "switch_1", priv);
+			       "switch_1", ds);
 	if (ret < 0) {
 		pr_err("failed to request switch_1 IRQ\n");
 		goto out_mdio;
-- 
cgit v1.2.3-59-g8ed1b


From c4aef9fc0d4b1d497c2f2973e3f123bcb96d6bbc Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 10 May 2018 13:17:34 -0700
Subject: net: dsa: Eliminate dsa_slave_get_link()

Since we use PHYLIB to manage the per-port link indication, this will
also be reflected correctly in the network device's carrier state, so we
can use ethtool_op_get_link() instead.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/slave.c | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 6c2f042e3c29..729f18d23bdd 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -498,16 +498,6 @@ dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p)
 		ds->ops->get_regs(ds, dp->index, regs, _p);
 }
 
-static u32 dsa_slave_get_link(struct net_device *dev)
-{
-	if (!dev->phydev)
-		return -ENODEV;
-
-	genphy_update_link(dev->phydev);
-
-	return dev->phydev->link;
-}
-
 static int dsa_slave_get_eeprom_len(struct net_device *dev)
 {
 	struct dsa_port *dp = dsa_slave_to_port(dev);
@@ -982,7 +972,7 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
 	.get_regs_len		= dsa_slave_get_regs_len,
 	.get_regs		= dsa_slave_get_regs,
 	.nway_reset		= phy_ethtool_nway_reset,
-	.get_link		= dsa_slave_get_link,
+	.get_link		= ethtool_op_get_link,
 	.get_eeprom_len		= dsa_slave_get_eeprom_len,
 	.get_eeprom		= dsa_slave_get_eeprom,
 	.set_eeprom		= dsa_slave_set_eeprom,
-- 
cgit v1.2.3-59-g8ed1b


From c9a2356f35409a667429254bc326b10f92c7ecce Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Thu, 10 May 2018 13:17:35 -0700
Subject: net: dsa: mv88e6xxx: add PHYLINK support

Add rudimentary phylink support to mv88e6xxx. This allows the driver
using user ports with fixed links to keep operating normally. User ports
with normal PHYs are not affected since the switch automatically manages
their link parameters. User facing ports which use a SFP/SFF with a
non-fixed link mode might require a call to phylink_mac_change() to
operate properly.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
[Andrew: fixed link setting after adding link polling]
Signed-off-by: Andrew Lunn <andrew@lunn.ch>
[florian: expand commit message]
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/chip.c | 83 ++++++++++++++++++++++++++++++++++++++++
 drivers/net/dsa/mv88e6xxx/port.c | 39 +++++++++++++++++++
 drivers/net/dsa/mv88e6xxx/port.h |  3 ++
 3 files changed, 125 insertions(+)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 5d933549dc49..1cebde80b101 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -31,6 +31,7 @@
 #include <linux/netdevice.h>
 #include <linux/gpio/consumer.h>
 #include <linux/phy.h>
+#include <linux/phylink.h>
 #include <net/dsa.h>
 
 #include "chip.h"
@@ -580,6 +581,83 @@ static void mv88e6xxx_adjust_link(struct dsa_switch *ds, int port,
 		dev_err(ds->dev, "p%d: failed to configure MAC\n", port);
 }
 
+static void mv88e6xxx_validate(struct dsa_switch *ds, int port,
+			       unsigned long *supported,
+			       struct phylink_link_state *state)
+{
+}
+
+static int mv88e6xxx_link_state(struct dsa_switch *ds, int port,
+				struct phylink_link_state *state)
+{
+	struct mv88e6xxx_chip *chip = ds->priv;
+	int err;
+
+	mutex_lock(&chip->reg_lock);
+	err = mv88e6xxx_port_link_state(chip, port, state);
+	mutex_unlock(&chip->reg_lock);
+
+	return err;
+}
+
+static void mv88e6xxx_mac_config(struct dsa_switch *ds, int port,
+				 unsigned int mode,
+				 const struct phylink_link_state *state)
+{
+	struct mv88e6xxx_chip *chip = ds->priv;
+	int speed, duplex, link, err;
+
+	if (mode == MLO_AN_PHY)
+		return;
+
+	if (mode == MLO_AN_FIXED) {
+		link = LINK_FORCED_UP;
+		speed = state->speed;
+		duplex = state->duplex;
+	} else {
+		speed = SPEED_UNFORCED;
+		duplex = DUPLEX_UNFORCED;
+		link = LINK_UNFORCED;
+	}
+
+	mutex_lock(&chip->reg_lock);
+	err = mv88e6xxx_port_setup_mac(chip, port, link, speed, duplex,
+				       state->interface);
+	mutex_unlock(&chip->reg_lock);
+
+	if (err && err != -EOPNOTSUPP)
+		dev_err(ds->dev, "p%d: failed to configure MAC\n", port);
+}
+
+static void mv88e6xxx_mac_link_force(struct dsa_switch *ds, int port, int link)
+{
+	struct mv88e6xxx_chip *chip = ds->priv;
+	int err;
+
+	mutex_lock(&chip->reg_lock);
+	err = chip->info->ops->port_set_link(chip, port, link);
+	mutex_unlock(&chip->reg_lock);
+
+	if (err)
+		dev_err(chip->dev, "p%d: failed to force MAC link\n", port);
+}
+
+static void mv88e6xxx_mac_link_down(struct dsa_switch *ds, int port,
+				    unsigned int mode,
+				    phy_interface_t interface)
+{
+	if (mode == MLO_AN_FIXED)
+		mv88e6xxx_mac_link_force(ds, port, LINK_FORCED_DOWN);
+}
+
+static void mv88e6xxx_mac_link_up(struct dsa_switch *ds, int port,
+				  unsigned int mode, phy_interface_t interface,
+				  struct phy_device *phydev)
+{
+	if (mode == MLO_AN_FIXED)
+		mv88e6xxx_mac_link_force(ds, port, LINK_FORCED_UP);
+}
+
 static int mv88e6xxx_stats_snapshot(struct mv88e6xxx_chip *chip, int port)
 {
 	if (!chip->info->ops->stats_snapshot)
@@ -4169,6 +4247,11 @@ static const struct dsa_switch_ops mv88e6xxx_switch_ops = {
 	.get_tag_protocol	= mv88e6xxx_get_tag_protocol,
 	.setup			= mv88e6xxx_setup,
 	.adjust_link		= mv88e6xxx_adjust_link,
+	.phylink_validate	= mv88e6xxx_validate,
+	.phylink_mac_link_state	= mv88e6xxx_link_state,
+	.phylink_mac_config	= mv88e6xxx_mac_config,
+	.phylink_mac_link_down	= mv88e6xxx_mac_link_down,
+	.phylink_mac_link_up	= mv88e6xxx_mac_link_up,
 	.get_strings		= mv88e6xxx_get_strings,
 	.get_ethtool_stats	= mv88e6xxx_get_ethtool_stats,
 	.get_sset_count		= mv88e6xxx_get_sset_count,
diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c
index 6315774d72b3..429d0ebcd5b1 100644
--- a/drivers/net/dsa/mv88e6xxx/port.c
+++ b/drivers/net/dsa/mv88e6xxx/port.c
@@ -15,6 +15,7 @@
 #include <linux/bitfield.h>
 #include <linux/if_bridge.h>
 #include <linux/phy.h>
+#include <linux/phylink.h>
 
 #include "chip.h"
 #include "port.h"
@@ -378,6 +379,44 @@ int mv88e6xxx_port_get_cmode(struct mv88e6xxx_chip *chip, int port, u8 *cmode)
 	return 0;
 }
 
+int mv88e6xxx_port_link_state(struct mv88e6xxx_chip *chip, int port,
+			      struct phylink_link_state *state)
+{
+	int err;
+	u16 reg;
+
+	err = mv88e6xxx_port_read(chip, port, MV88E6XXX_PORT_STS, &reg);
+	if (err)
+		return err;
+
+	switch (reg & MV88E6XXX_PORT_STS_SPEED_MASK) {
+	case MV88E6XXX_PORT_STS_SPEED_10:
+		state->speed = SPEED_10;
+		break;
+	case MV88E6XXX_PORT_STS_SPEED_100:
+		state->speed = SPEED_100;
+		break;
+	case MV88E6XXX_PORT_STS_SPEED_1000:
+		state->speed = SPEED_1000;
+		break;
+	case MV88E6XXX_PORT_STS_SPEED_10000:
+		if ((reg &MV88E6XXX_PORT_STS_CMODE_MASK) ==
+		    MV88E6XXX_PORT_STS_CMODE_2500BASEX)
+			state->speed = SPEED_2500;
+		else
+			state->speed = SPEED_10000;
+		break;
+	}
+
+	state->duplex = reg & MV88E6XXX_PORT_STS_DUPLEX ?
+			DUPLEX_FULL : DUPLEX_HALF;
+	state->link = !!(reg & MV88E6XXX_PORT_STS_LINK);
+	state->an_enabled = 1;
+	state->an_complete = state->link;
+
+	return 0;
+}
+
 /* Offset 0x02: Jamming Control
  *
  * Do not limit the period of time that this port can be paused for by
diff --git a/drivers/net/dsa/mv88e6xxx/port.h b/drivers/net/dsa/mv88e6xxx/port.h
index b16d5f0e6e9c..5e1db1b221ca 100644
--- a/drivers/net/dsa/mv88e6xxx/port.h
+++ b/drivers/net/dsa/mv88e6xxx/port.h
@@ -29,6 +29,7 @@
 #define MV88E6XXX_PORT_STS_SPEED_10		0x0000
 #define MV88E6XXX_PORT_STS_SPEED_100		0x0100
 #define MV88E6XXX_PORT_STS_SPEED_1000		0x0200
+#define MV88E6XXX_PORT_STS_SPEED_10000		0x0300
 #define MV88E6352_PORT_STS_EEE			0x0040
 #define MV88E6165_PORT_STS_AM_DIS		0x0040
 #define MV88E6185_PORT_STS_MGMII		0x0040
@@ -295,6 +296,8 @@ int mv88e6390_port_pause_limit(struct mv88e6xxx_chip *chip, int port, u8 in,
 int mv88e6390x_port_set_cmode(struct mv88e6xxx_chip *chip, int port,
 			      phy_interface_t mode);
 int mv88e6xxx_port_get_cmode(struct mv88e6xxx_chip *chip, int port, u8 *cmode);
+int mv88e6xxx_port_link_state(struct mv88e6xxx_chip *chip, int port,
+			      struct phylink_link_state *state);
 int mv88e6xxx_port_set_map_da(struct mv88e6xxx_chip *chip, int port);
 int mv88e6095_port_set_upstream_port(struct mv88e6xxx_chip *chip, int port,
 				     int upstream_port);
-- 
cgit v1.2.3-59-g8ed1b


From aab9c4067d2389d0adfc9c53806437df7b0fe3d5 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 10 May 2018 13:17:36 -0700
Subject: net: dsa: Plug in PHYLINK support

Add support for PHYLINK within the DSA subsystem in order to support more
complex devices such as pluggable (SFP) and non-pluggable (SFF) modules, 10G
PHYs, and traditional PHYs. Using PHYLINK allows us to drop some amount of
complexity we had while probing fixed and non-fixed PHYs using Device Tree.

Because PHYLINK separates the Ethernet MAC/port configuration into different
stages, we let switch drivers implement those, and for now, we maintain
functionality by calling dsa_slave_adjust_link() during
phylink_mac_link_{up,down} which provides semantically equivalent steps.

Drivers willing to take advantage of PHYLINK should implement the phylink_mac_*
operations that DSA wraps.

We cannot quite remove the adjust_link() callback just yet, because a number of
drivers rely on that for configuring their "CPU" and "DSA" ports, this is done
dsa_port_setup_phy_of() and dsa_port_fixed_link_register_of() still.

Drivers that utilize fixed links for user-facing ports (e.g: bcm_sf2) will need
to implement phylink_mac_ops from now on to preserve functionality, since PHYLINK
*does not* create a phy_device instance for fixed links.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h  |   1 +
 net/dsa/dsa_priv.h |   9 --
 net/dsa/slave.c    | 294 +++++++++++++++++++++++++++++++----------------------
 3 files changed, 172 insertions(+), 132 deletions(-)

diff --git a/include/net/dsa.h b/include/net/dsa.h
index ed64c1f3f117..fdbd6082945d 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -201,6 +201,7 @@ struct dsa_port {
 	u8			stp_state;
 	struct net_device	*bridge_dev;
 	struct devlink_port	devlink_port;
+	struct phylink		*pl;
 	/*
 	 * Original copy of the master netdev ethtool_ops
 	 */
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 053731473c99..3964c6f7a7c0 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -75,15 +75,6 @@ struct dsa_slave_priv {
 	/* DSA port data, such as switch, port index, etc. */
 	struct dsa_port		*dp;
 
-	/*
-	 * The phylib phy_device pointer for the PHY connected
-	 * to this port.
-	 */
-	phy_interface_t		phy_interface;
-	int			old_link;
-	int			old_pause;
-	int			old_duplex;
-
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	struct netpoll		*netpoll;
 #endif
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 729f18d23bdd..1e3b6a6d8a40 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -13,6 +13,7 @@
 #include <linux/netdevice.h>
 #include <linux/phy.h>
 #include <linux/phy_fixed.h>
+#include <linux/phylink.h>
 #include <linux/of_net.h>
 #include <linux/of_mdio.h>
 #include <linux/mdio.h>
@@ -97,8 +98,7 @@ static int dsa_slave_open(struct net_device *dev)
 	if (err)
 		goto clear_promisc;
 
-	if (dev->phydev)
-		phy_start(dev->phydev);
+	phylink_start(dp->pl);
 
 	return 0;
 
@@ -120,8 +120,7 @@ static int dsa_slave_close(struct net_device *dev)
 	struct net_device *master = dsa_slave_to_master(dev);
 	struct dsa_port *dp = dsa_slave_to_port(dev);
 
-	if (dev->phydev)
-		phy_stop(dev->phydev);
+	phylink_stop(dp->pl);
 
 	dsa_port_disable(dp, dev->phydev);
 
@@ -272,10 +271,7 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		break;
 	}
 
-	if (!dev->phydev)
-		return -ENODEV;
-
-	return phy_mii_ioctl(dev->phydev, ifr, cmd);
+	return phylink_mii_ioctl(p->dp->pl, ifr, cmd);
 }
 
 static int dsa_slave_port_attr_set(struct net_device *dev,
@@ -498,6 +494,13 @@ dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p)
 		ds->ops->get_regs(ds, dp->index, regs, _p);
 }
 
+static int dsa_slave_nway_reset(struct net_device *dev)
+{
+	struct dsa_port *dp = dsa_slave_to_port(dev);
+
+	return phylink_ethtool_nway_reset(dp->pl);
+}
+
 static int dsa_slave_get_eeprom_len(struct net_device *dev)
 {
 	struct dsa_port *dp = dsa_slave_to_port(dev);
@@ -609,6 +612,8 @@ static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w)
 	struct dsa_port *dp = dsa_slave_to_port(dev);
 	struct dsa_switch *ds = dp->ds;
 
+	phylink_ethtool_get_wol(dp->pl, w);
+
 	if (ds->ops->get_wol)
 		ds->ops->get_wol(ds, dp->index, w);
 }
@@ -619,6 +624,8 @@ static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
 	struct dsa_switch *ds = dp->ds;
 	int ret = -EOPNOTSUPP;
 
+	phylink_ethtool_set_wol(dp->pl, w);
+
 	if (ds->ops->set_wol)
 		ret = ds->ops->set_wol(ds, dp->index, w);
 
@@ -642,13 +649,7 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e)
 	if (ret)
 		return ret;
 
-	if (e->eee_enabled) {
-		ret = phy_init_eee(dev->phydev, 0);
-		if (ret)
-			return ret;
-	}
-
-	return phy_ethtool_set_eee(dev->phydev, e);
+	return phylink_ethtool_set_eee(dp->pl, e);
 }
 
 static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e)
@@ -668,7 +669,23 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e)
 	if (ret)
 		return ret;
 
-	return phy_ethtool_get_eee(dev->phydev, e);
+	return phylink_ethtool_get_eee(dp->pl, e);
+}
+
+static int dsa_slave_get_link_ksettings(struct net_device *dev,
+					struct ethtool_link_ksettings *cmd)
+{
+	struct dsa_port *dp = dsa_slave_to_port(dev);
+
+	return phylink_ethtool_ksettings_get(dp->pl, cmd);
+}
+
+static int dsa_slave_set_link_ksettings(struct net_device *dev,
+					const struct ethtool_link_ksettings *cmd)
+{
+	struct dsa_port *dp = dsa_slave_to_port(dev);
+
+	return phylink_ethtool_ksettings_set(dp->pl, cmd);
 }
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
@@ -971,7 +988,7 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
 	.get_drvinfo		= dsa_slave_get_drvinfo,
 	.get_regs_len		= dsa_slave_get_regs_len,
 	.get_regs		= dsa_slave_get_regs,
-	.nway_reset		= phy_ethtool_nway_reset,
+	.nway_reset		= dsa_slave_nway_reset,
 	.get_link		= ethtool_op_get_link,
 	.get_eeprom_len		= dsa_slave_get_eeprom_len,
 	.get_eeprom		= dsa_slave_get_eeprom,
@@ -983,8 +1000,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
 	.get_wol		= dsa_slave_get_wol,
 	.set_eee		= dsa_slave_set_eee,
 	.get_eee		= dsa_slave_get_eee,
-	.get_link_ksettings	= phy_ethtool_get_link_ksettings,
-	.set_link_ksettings	= phy_ethtool_set_link_ksettings,
+	.get_link_ksettings	= dsa_slave_get_link_ksettings,
+	.set_link_ksettings	= dsa_slave_set_link_ksettings,
 	.get_rxnfc		= dsa_slave_get_rxnfc,
 	.set_rxnfc		= dsa_slave_set_rxnfc,
 	.get_ts_info		= dsa_slave_get_ts_info,
@@ -1043,56 +1060,122 @@ static struct device_type dsa_type = {
 	.name	= "dsa",
 };
 
-static void dsa_slave_adjust_link(struct net_device *dev)
+static void dsa_slave_phylink_validate(struct net_device *dev,
+				       unsigned long *supported,
+				       struct phylink_link_state *state)
 {
 	struct dsa_port *dp = dsa_slave_to_port(dev);
-	struct dsa_slave_priv *p = netdev_priv(dev);
 	struct dsa_switch *ds = dp->ds;
-	unsigned int status_changed = 0;
 
-	if (p->old_link != dev->phydev->link) {
-		status_changed = 1;
-		p->old_link = dev->phydev->link;
-	}
+	if (!ds->ops->phylink_validate)
+		return;
 
-	if (p->old_duplex != dev->phydev->duplex) {
-		status_changed = 1;
-		p->old_duplex = dev->phydev->duplex;
-	}
+	ds->ops->phylink_validate(ds, dp->index, supported, state);
+}
 
-	if (p->old_pause != dev->phydev->pause) {
-		status_changed = 1;
-		p->old_pause = dev->phydev->pause;
-	}
+static int dsa_slave_phylink_mac_link_state(struct net_device *dev,
+					    struct phylink_link_state *state)
+{
+	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
 
-	if (ds->ops->adjust_link && status_changed)
-		ds->ops->adjust_link(ds, dp->index, dev->phydev);
+	/* Only called for SGMII and 802.3z */
+	if (!ds->ops->phylink_mac_link_state)
+		return -EOPNOTSUPP;
 
-	if (status_changed)
-		phy_print_status(dev->phydev);
+	return ds->ops->phylink_mac_link_state(ds, dp->index, state);
 }
 
-static int dsa_slave_fixed_link_update(struct net_device *dev,
-				       struct fixed_phy_status *status)
+static void dsa_slave_phylink_mac_config(struct net_device *dev,
+					 unsigned int mode,
+					 const struct phylink_link_state *state)
 {
-	struct dsa_switch *ds;
-	struct dsa_port *dp;
+	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (!ds->ops->phylink_mac_config)
+		return;
 
-	if (dev) {
-		dp = dsa_slave_to_port(dev);
-		ds = dp->ds;
-		if (ds->ops->fixed_link_update)
-			ds->ops->fixed_link_update(ds, dp->index, status);
+	ds->ops->phylink_mac_config(ds, dp->index, mode, state);
+}
+
+static void dsa_slave_phylink_mac_an_restart(struct net_device *dev)
+{
+	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (!ds->ops->phylink_mac_an_restart)
+		return;
+
+	ds->ops->phylink_mac_an_restart(ds, dp->index);
+}
+
+static void dsa_slave_phylink_mac_link_down(struct net_device *dev,
+					    unsigned int mode,
+					    phy_interface_t interface)
+{
+	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (!ds->ops->phylink_mac_link_down) {
+		if (ds->ops->adjust_link && dev->phydev)
+			ds->ops->adjust_link(ds, dp->index, dev->phydev);
+		return;
 	}
 
-	return 0;
+	ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface);
+}
+
+static void dsa_slave_phylink_mac_link_up(struct net_device *dev,
+					  unsigned int mode,
+					  phy_interface_t interface,
+					  struct phy_device *phydev)
+{
+	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (!ds->ops->phylink_mac_link_up) {
+		if (ds->ops->adjust_link && dev->phydev)
+			ds->ops->adjust_link(ds, dp->index, dev->phydev);
+		return;
+	}
+
+	ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev);
+}
+
+static const struct phylink_mac_ops dsa_slave_phylink_mac_ops = {
+	.validate = dsa_slave_phylink_validate,
+	.mac_link_state = dsa_slave_phylink_mac_link_state,
+	.mac_config = dsa_slave_phylink_mac_config,
+	.mac_an_restart = dsa_slave_phylink_mac_an_restart,
+	.mac_link_down = dsa_slave_phylink_mac_link_down,
+	.mac_link_up = dsa_slave_phylink_mac_link_up,
+};
+
+void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up)
+{
+	const struct dsa_port *dp = dsa_to_port(ds, port);
+
+	phylink_mac_change(dp->pl, up);
+}
+EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change);
+
+static void dsa_slave_phylink_fixed_state(struct net_device *dev,
+					  struct phylink_link_state *state)
+{
+	struct dsa_port *dp = dsa_slave_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	/* No need to check that this operation is valid, the callback would
+	 * not be called if it was not.
+	 */
+	ds->ops->phylink_fixed_state(ds, dp->index, state);
 }
 
 /* slave device setup *******************************************************/
 static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr)
 {
 	struct dsa_port *dp = dsa_slave_to_port(slave_dev);
-	struct dsa_slave_priv *p = netdev_priv(slave_dev);
 	struct dsa_switch *ds = dp->ds;
 
 	slave_dev->phydev = mdiobus_get_phy(ds->slave_mii_bus, addr);
@@ -1101,80 +1184,54 @@ static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr)
 		return -ENODEV;
 	}
 
-	/* Use already configured phy mode */
-	if (p->phy_interface == PHY_INTERFACE_MODE_NA)
-		p->phy_interface = slave_dev->phydev->interface;
-
-	return phy_connect_direct(slave_dev, slave_dev->phydev,
-				  dsa_slave_adjust_link, p->phy_interface);
-}
-
-void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up)
-{
+	return phylink_connect_phy(dp->pl, slave_dev->phydev);
 }
-EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change);
 
 static int dsa_slave_phy_setup(struct net_device *slave_dev)
 {
 	struct dsa_port *dp = dsa_slave_to_port(slave_dev);
-	struct dsa_slave_priv *p = netdev_priv(slave_dev);
 	struct device_node *port_dn = dp->dn;
 	struct dsa_switch *ds = dp->ds;
-	struct device_node *phy_dn;
-	bool phy_is_fixed = false;
 	u32 phy_flags = 0;
 	int mode, ret;
 
 	mode = of_get_phy_mode(port_dn);
 	if (mode < 0)
 		mode = PHY_INTERFACE_MODE_NA;
-	p->phy_interface = mode;
 
-	phy_dn = of_parse_phandle(port_dn, "phy-handle", 0);
-	if (!phy_dn && of_phy_is_fixed_link(port_dn)) {
-		/* In the case of a fixed PHY, the DT node associated
-		 * to the fixed PHY is the Port DT node
-		 */
-		ret = of_phy_register_fixed_link(port_dn);
-		if (ret) {
-			netdev_err(slave_dev, "failed to register fixed PHY: %d\n", ret);
-			return ret;
-		}
-		phy_is_fixed = true;
-		phy_dn = of_node_get(port_dn);
+	dp->pl = phylink_create(slave_dev, of_fwnode_handle(port_dn), mode,
+				&dsa_slave_phylink_mac_ops);
+	if (IS_ERR(dp->pl)) {
+		netdev_err(slave_dev,
+			   "error creating PHYLINK: %ld\n", PTR_ERR(dp->pl));
+		return PTR_ERR(dp->pl);
 	}
 
+	/* Register only if the switch provides such a callback, since this
+	 * callback takes precedence over polling the link GPIO in PHYLINK
+	 * (see phylink_get_fixed_state).
+	 */
+	if (ds->ops->phylink_fixed_state)
+		phylink_fixed_state_cb(dp->pl, dsa_slave_phylink_fixed_state);
+
 	if (ds->ops->get_phy_flags)
 		phy_flags = ds->ops->get_phy_flags(ds, dp->index);
 
-	if (phy_dn) {
-		slave_dev->phydev = of_phy_connect(slave_dev, phy_dn,
-						   dsa_slave_adjust_link,
-						   phy_flags,
-						   p->phy_interface);
-		of_node_put(phy_dn);
-	}
-
-	if (slave_dev->phydev && phy_is_fixed)
-		fixed_phy_set_link_update(slave_dev->phydev,
-					  dsa_slave_fixed_link_update);
-
-	/* We could not connect to a designated PHY, so use the switch internal
-	 * MDIO bus instead
-	 */
-	if (!slave_dev->phydev) {
+	ret = phylink_of_phy_connect(dp->pl, port_dn, phy_flags);
+	if (ret == -ENODEV) {
+		/* We could not connect to a designated PHY or SFP, so use the
+		 * switch internal MDIO bus instead
+		 */
 		ret = dsa_slave_phy_connect(slave_dev, dp->index);
 		if (ret) {
-			netdev_err(slave_dev, "failed to connect to port %d: %d\n",
+			netdev_err(slave_dev,
+				   "failed to connect to port %d: %d\n",
 				   dp->index, ret);
-			if (phy_is_fixed)
-				of_phy_deregister_fixed_link(port_dn);
+			phylink_destroy(dp->pl);
 			return ret;
 		}
 	}
 
-	phy_attached_info(slave_dev->phydev);
-
 	return 0;
 }
 
@@ -1189,29 +1246,26 @@ static void dsa_slave_set_lockdep_class_one(struct net_device *dev,
 
 int dsa_slave_suspend(struct net_device *slave_dev)
 {
-	struct dsa_slave_priv *p = netdev_priv(slave_dev);
+	struct dsa_port *dp = dsa_slave_to_port(slave_dev);
 
 	netif_device_detach(slave_dev);
 
-	if (slave_dev->phydev) {
-		phy_stop(slave_dev->phydev);
-		p->old_pause = -1;
-		p->old_link = -1;
-		p->old_duplex = -1;
-		phy_suspend(slave_dev->phydev);
-	}
+	rtnl_lock();
+	phylink_stop(dp->pl);
+	rtnl_unlock();
 
 	return 0;
 }
 
 int dsa_slave_resume(struct net_device *slave_dev)
 {
+	struct dsa_port *dp = dsa_slave_to_port(slave_dev);
+
 	netif_device_attach(slave_dev);
 
-	if (slave_dev->phydev) {
-		phy_resume(slave_dev->phydev);
-		phy_start(slave_dev->phydev);
-	}
+	rtnl_lock();
+	phylink_start(dp->pl);
+	rtnl_unlock();
 
 	return 0;
 }
@@ -1276,11 +1330,6 @@ int dsa_slave_create(struct dsa_port *port)
 	p->dp = port;
 	INIT_LIST_HEAD(&p->mall_tc_list);
 	p->xmit = cpu_dp->tag_ops->xmit;
-
-	p->old_pause = -1;
-	p->old_link = -1;
-	p->old_duplex = -1;
-
 	port->slave = slave_dev;
 
 	netif_carrier_off(slave_dev);
@@ -1303,9 +1352,10 @@ int dsa_slave_create(struct dsa_port *port)
 	return 0;
 
 out_phy:
-	phy_disconnect(slave_dev->phydev);
-	if (of_phy_is_fixed_link(port->dn))
-		of_phy_deregister_fixed_link(port->dn);
+	rtnl_lock();
+	phylink_disconnect_phy(p->dp->pl);
+	rtnl_unlock();
+	phylink_destroy(p->dp->pl);
 out_free:
 	free_percpu(p->stats64);
 	free_netdev(slave_dev);
@@ -1317,17 +1367,15 @@ void dsa_slave_destroy(struct net_device *slave_dev)
 {
 	struct dsa_port *dp = dsa_slave_to_port(slave_dev);
 	struct dsa_slave_priv *p = netdev_priv(slave_dev);
-	struct device_node *port_dn = dp->dn;
 
 	netif_carrier_off(slave_dev);
-	if (slave_dev->phydev) {
-		phy_disconnect(slave_dev->phydev);
+	rtnl_lock();
+	phylink_disconnect_phy(dp->pl);
+	rtnl_unlock();
 
-		if (of_phy_is_fixed_link(port_dn))
-			of_phy_deregister_fixed_link(port_dn);
-	}
 	dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER);
 	unregister_netdev(slave_dev);
+	phylink_destroy(dp->pl);
 	free_percpu(p->stats64);
 	free_netdev(slave_dev);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 58d56fcc3964f9be0a9ca42fd126bcd9dc7afc90 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 10 May 2018 13:17:37 -0700
Subject: net: dsa: bcm_sf2: Get rid of PHYLIB functions

Now that we have converted the bcm_sf2 driver to implement PHYLINK MAC
operations, we can remove the PHYLIB callbacks: adjust_link() and
fixed_link_update() which are no longer called by DSA.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/bcm_sf2.c | 149 ----------------------------------------------
 1 file changed, 149 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index a20608b0329e..ac621f44237a 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -480,153 +480,6 @@ static u32 bcm_sf2_sw_get_phy_flags(struct dsa_switch *ds, int port)
 	return priv->hw_params.gphy_rev;
 }
 
-static void bcm_sf2_sw_adjust_link(struct dsa_switch *ds, int port,
-				   struct phy_device *phydev)
-{
-	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
-	struct ethtool_eee *p = &priv->dev->ports[port].eee;
-	u32 id_mode_dis = 0, port_mode;
-	const char *str = NULL;
-	u32 reg, offset;
-
-	if (priv->type == BCM7445_DEVICE_ID)
-		offset = CORE_STS_OVERRIDE_GMIIP_PORT(port);
-	else
-		offset = CORE_STS_OVERRIDE_GMIIP2_PORT(port);
-
-	switch (phydev->interface) {
-	case PHY_INTERFACE_MODE_RGMII:
-		str = "RGMII (no delay)";
-		id_mode_dis = 1;
-	case PHY_INTERFACE_MODE_RGMII_TXID:
-		if (!str)
-			str = "RGMII (TX delay)";
-		port_mode = EXT_GPHY;
-		break;
-	case PHY_INTERFACE_MODE_MII:
-		str = "MII";
-		port_mode = EXT_EPHY;
-		break;
-	case PHY_INTERFACE_MODE_REVMII:
-		str = "Reverse MII";
-		port_mode = EXT_REVMII;
-		break;
-	default:
-		/* All other PHYs: internal and MoCA */
-		goto force_link;
-	}
-
-	/* If the link is down, just disable the interface to conserve power */
-	if (!phydev->link) {
-		reg = reg_readl(priv, REG_RGMII_CNTRL_P(port));
-		reg &= ~RGMII_MODE_EN;
-		reg_writel(priv, reg, REG_RGMII_CNTRL_P(port));
-		goto force_link;
-	}
-
-	/* Clear id_mode_dis bit, and the existing port mode, but
-	 * make sure we enable the RGMII block for data to pass
-	 */
-	reg = reg_readl(priv, REG_RGMII_CNTRL_P(port));
-	reg &= ~ID_MODE_DIS;
-	reg &= ~(PORT_MODE_MASK << PORT_MODE_SHIFT);
-	reg &= ~(RX_PAUSE_EN | TX_PAUSE_EN);
-
-	reg |= port_mode | RGMII_MODE_EN;
-	if (id_mode_dis)
-		reg |= ID_MODE_DIS;
-
-	if (phydev->pause) {
-		if (phydev->asym_pause)
-			reg |= TX_PAUSE_EN;
-		reg |= RX_PAUSE_EN;
-	}
-
-	reg_writel(priv, reg, REG_RGMII_CNTRL_P(port));
-
-	pr_info("Port %d configured for %s\n", port, str);
-
-force_link:
-	/* Force link settings detected from the PHY */
-	reg = SW_OVERRIDE;
-	switch (phydev->speed) {
-	case SPEED_1000:
-		reg |= SPDSTS_1000 << SPEED_SHIFT;
-		break;
-	case SPEED_100:
-		reg |= SPDSTS_100 << SPEED_SHIFT;
-		break;
-	}
-
-	if (phydev->link)
-		reg |= LINK_STS;
-	if (phydev->duplex == DUPLEX_FULL)
-		reg |= DUPLX_MODE;
-
-	core_writel(priv, reg, offset);
-
-	if (!phydev->is_pseudo_fixed_link)
-		p->eee_enabled = b53_eee_init(ds, port, phydev);
-}
-
-static void bcm_sf2_sw_fixed_link_update(struct dsa_switch *ds, int port,
-					 struct fixed_phy_status *status)
-{
-	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
-	u32 duplex, pause, offset;
-	u32 reg;
-
-	if (priv->type == BCM7445_DEVICE_ID)
-		offset = CORE_STS_OVERRIDE_GMIIP_PORT(port);
-	else
-		offset = CORE_STS_OVERRIDE_GMIIP2_PORT(port);
-
-	duplex = core_readl(priv, CORE_DUPSTS);
-	pause = core_readl(priv, CORE_PAUSESTS);
-
-	status->link = 0;
-
-	/* MoCA port is special as we do not get link status from CORE_LNKSTS,
-	 * which means that we need to force the link at the port override
-	 * level to get the data to flow. We do use what the interrupt handler
-	 * did determine before.
-	 *
-	 * For the other ports, we just force the link status, since this is
-	 * a fixed PHY device.
-	 */
-	if (port == priv->moca_port) {
-		status->link = priv->port_sts[port].link;
-		/* For MoCA interfaces, also force a link down notification
-		 * since some version of the user-space daemon (mocad) use
-		 * cmd->autoneg to force the link, which messes up the PHY
-		 * state machine and make it go in PHY_FORCING state instead.
-		 */
-		if (!status->link)
-			netif_carrier_off(ds->ports[port].slave);
-		status->duplex = 1;
-	} else {
-		status->link = 1;
-		status->duplex = !!(duplex & (1 << port));
-	}
-
-	reg = core_readl(priv, offset);
-	reg |= SW_OVERRIDE;
-	if (status->link)
-		reg |= LINK_STS;
-	else
-		reg &= ~LINK_STS;
-	core_writel(priv, reg, offset);
-
-	if ((pause & (1 << port)) &&
-	    (pause & (1 << (port + PAUSESTS_TX_PAUSE_SHIFT)))) {
-		status->asym_pause = 1;
-		status->pause = 1;
-	}
-
-	if (pause & (1 << port))
-		status->pause = 1;
-}
-
 static void bcm_sf2_sw_validate(struct dsa_switch *ds, int port,
 				unsigned long *supported,
 				struct phylink_link_state *state)
@@ -1054,8 +907,6 @@ static const struct dsa_switch_ops bcm_sf2_ops = {
 	.get_sset_count		= b53_get_sset_count,
 	.get_ethtool_phy_stats	= b53_get_ethtool_phy_stats,
 	.get_phy_flags		= bcm_sf2_sw_get_phy_flags,
-	.adjust_link		= bcm_sf2_sw_adjust_link,
-	.fixed_link_update	= bcm_sf2_sw_fixed_link_update,
 	.phylink_validate	= bcm_sf2_sw_validate,
 	.phylink_mac_config	= bcm_sf2_sw_mac_config,
 	.phylink_mac_link_down	= bcm_sf2_sw_mac_link_down,
-- 
cgit v1.2.3-59-g8ed1b


From 73a6bab5aa2a83cb7df85805e08bc03b4065aea7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 10 May 2018 14:59:43 -0700
Subject: tcp: switch pacing timer to softirq based hrtimer

linux-4.16 got support for softirq based hrtimers.
TCP can switch its pacing hrtimer to this variant, since this
avoids going through a tasklet and some atomic operations.

pacing timer logic looks like other (jiffies based) tcp timers.

v2: use hrtimer_try_to_cancel() in tcp_clear_xmit_timers()
    to correctly release reference on socket if needed.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     |  4 ++-
 net/ipv4/tcp_output.c | 69 +++++++++++++++++++--------------------------------
 net/ipv4/tcp_timer.c  |  2 +-
 3 files changed, 29 insertions(+), 46 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index cf803fe0fb86..3b1d617b0110 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -557,7 +557,9 @@ void tcp_fin(struct sock *sk);
 void tcp_init_xmit_timers(struct sock *);
 static inline void tcp_clear_xmit_timers(struct sock *sk)
 {
-	hrtimer_cancel(&tcp_sk(sk)->pacing_timer);
+	if (hrtimer_try_to_cancel(&tcp_sk(sk)->pacing_timer) == 1)
+		sock_put(sk);
+
 	inet_csk_clear_xmit_timers(sk);
 }
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d07c0dcc99aa..0d8f950a9006 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -772,7 +772,7 @@ struct tsq_tasklet {
 };
 static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
 
-static void tcp_tsq_handler(struct sock *sk)
+static void tcp_tsq_write(struct sock *sk)
 {
 	if ((1 << sk->sk_state) &
 	    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
@@ -789,6 +789,16 @@ static void tcp_tsq_handler(struct sock *sk)
 			       0, GFP_ATOMIC);
 	}
 }
+
+static void tcp_tsq_handler(struct sock *sk)
+{
+	bh_lock_sock(sk);
+	if (!sock_owned_by_user(sk))
+		tcp_tsq_write(sk);
+	else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
+		sock_hold(sk);
+	bh_unlock_sock(sk);
+}
 /*
  * One tasklet per cpu tries to send more skbs.
  * We run in tasklet context but need to disable irqs when
@@ -816,16 +826,7 @@ static void tcp_tasklet_func(unsigned long data)
 		smp_mb__before_atomic();
 		clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
 
-		if (!sk->sk_lock.owned &&
-		    test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) {
-			bh_lock_sock(sk);
-			if (!sock_owned_by_user(sk)) {
-				clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
-				tcp_tsq_handler(sk);
-			}
-			bh_unlock_sock(sk);
-		}
-
+		tcp_tsq_handler(sk);
 		sk_free(sk);
 	}
 }
@@ -853,9 +854,10 @@ void tcp_release_cb(struct sock *sk)
 		nflags = flags & ~TCP_DEFERRED_ALL;
 	} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
 
-	if (flags & TCPF_TSQ_DEFERRED)
-		tcp_tsq_handler(sk);
-
+	if (flags & TCPF_TSQ_DEFERRED) {
+		tcp_tsq_write(sk);
+		__sock_put(sk);
+	}
 	/* Here begins the tricky part :
 	 * We are called from release_sock() with :
 	 * 1) BH disabled
@@ -929,7 +931,7 @@ void tcp_wfree(struct sk_buff *skb)
 		if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
 			goto out;
 
-		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
+		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
 		nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
 		if (nval != oval)
 			continue;
@@ -948,37 +950,17 @@ out:
 	sk_free(sk);
 }
 
-/* Note: Called under hard irq.
- * We can not call TCP stack right away.
+/* Note: Called under soft irq.
+ * We can call TCP stack right away, unless socket is owned by user.
  */
 enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
 {
 	struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
 	struct sock *sk = (struct sock *)tp;
-	unsigned long nval, oval;
 
-	for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
-		struct tsq_tasklet *tsq;
-		bool empty;
+	tcp_tsq_handler(sk);
+	sock_put(sk);
 
-		if (oval & TSQF_QUEUED)
-			break;
-
-		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
-		nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
-		if (nval != oval)
-			continue;
-
-		if (!refcount_inc_not_zero(&sk->sk_wmem_alloc))
-			break;
-		/* queue this socket to tasklet queue */
-		tsq = this_cpu_ptr(&tsq_tasklet);
-		empty = list_empty(&tsq->head);
-		list_add(&tp->tsq_node, &tsq->head);
-		if (empty)
-			tasklet_schedule(&tsq->tasklet);
-		break;
-	}
 	return HRTIMER_NORESTART;
 }
 
@@ -1011,7 +993,8 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
 	do_div(len_ns, rate);
 	hrtimer_start(&tcp_sk(sk)->pacing_timer,
 		      ktime_add_ns(ktime_get(), len_ns),
-		      HRTIMER_MODE_ABS_PINNED);
+		      HRTIMER_MODE_ABS_PINNED_SOFT);
+	sock_hold(sk);
 }
 
 static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
@@ -1078,7 +1061,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	/* if no packet is in qdisc/device queue, then allow XPS to select
 	 * another queue. We can be called from tcp_tsq_handler()
-	 * which holds one reference to sk_wmem_alloc.
+	 * which holds one reference to sk.
 	 *
 	 * TODO: Ideally, in-flight pure ACK packets should not matter here.
 	 * One way to get this would be to set skb->truesize = 2 on them.
@@ -2185,7 +2168,7 @@ static int tcp_mtu_probe(struct sock *sk)
 static bool tcp_pacing_check(const struct sock *sk)
 {
 	return tcp_needs_internal_pacing(sk) &&
-	       hrtimer_active(&tcp_sk(sk)->pacing_timer);
+	       hrtimer_is_queued(&tcp_sk(sk)->pacing_timer);
 }
 
 /* TCP Small Queues :
@@ -2365,8 +2348,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 					  skb, limit, mss_now, gfp)))
 			break;
 
-		if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
-			clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
 		if (tcp_small_queue_check(sk, skb, 0))
 			break;
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index f7d944855f8e..92bdf64fffae 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -713,6 +713,6 @@ void tcp_init_xmit_timers(struct sock *sk)
 	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
 				  &tcp_keepalive_timer);
 	hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC,
-		     HRTIMER_MODE_ABS_PINNED);
+		     HRTIMER_MODE_ABS_PINNED_SOFT);
 	tcp_sk(sk)->pacing_timer.function = tcp_pace_kick;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 575b65bc5bff37ec502d5eab9fc91d0a8d5ec40e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 10 May 2018 19:07:13 -0700
Subject: udp: avoid refcount_t saturation in __udp_gso_segment()

For some reason, Willem thought that the issue we fixed for TCP
in commit 7ec318feeed1 ("tcp: gso: avoid refcount_t warning from
tcp_gso_segment()") was not relevant for UDP GSO.

But syzbot found its way.

refcount_t: saturated; leaking memory.
WARNING: CPU: 0 PID: 10261 at lib/refcount.c:78 refcount_add_not_zero+0x2d4/0x320 lib/refcount.c:78
Kernel panic - not syncing: panic_on_warn set ...

CPU: 0 PID: 10261 Comm: syz-executor5 Not tainted 4.17.0-rc3+ #38
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1b9/0x294 lib/dump_stack.c:113
 panic+0x22f/0x4de kernel/panic.c:184
 __warn.cold.8+0x163/0x1b3 kernel/panic.c:536
 report_bug+0x252/0x2d0 lib/bug.c:186
 fixup_bug arch/x86/kernel/traps.c:178 [inline]
 do_error_trap+0x1de/0x490 arch/x86/kernel/traps.c:296
 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:315
 invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:992
RIP: 0010:refcount_add_not_zero+0x2d4/0x320 lib/refcount.c:78
RSP: 0018:ffff880196db6b90 EFLAGS: 00010282
RAX: 0000000000000026 RBX: 00000000ffffff01 RCX: ffffc900040d9000
RDX: 0000000000004a29 RSI: ffffffff8160f6f1 RDI: ffff880196db66f0
RBP: ffff880196db6c78 R08: ffff8801b33d6740 R09: 0000000000000002
R10: ffff8801b33d6740 R11: 0000000000000000 R12: 0000000000000000
R13: 00000000ffffffff R14: ffff880196db6c50 R15: 0000000000020101
 refcount_add+0x1b/0x70 lib/refcount.c:102
 __udp_gso_segment+0xaa5/0xee0 net/ipv4/udp_offload.c:272
 udp4_ufo_fragment+0x592/0x7a0 net/ipv4/udp_offload.c:301
 inet_gso_segment+0x639/0x12b0 net/ipv4/af_inet.c:1342
 skb_mac_gso_segment+0x3ad/0x720 net/core/dev.c:2792
 __skb_gso_segment+0x3bb/0x870 net/core/dev.c:2865
 skb_gso_segment include/linux/netdevice.h:4050 [inline]
 validate_xmit_skb+0x54d/0xd90 net/core/dev.c:3122
 __dev_queue_xmit+0xbf8/0x34c0 net/core/dev.c:3579
 dev_queue_xmit+0x17/0x20 net/core/dev.c:3620
 neigh_direct_output+0x15/0x20 net/core/neighbour.c:1401
 neigh_output include/net/neighbour.h:483 [inline]
 ip_finish_output2+0xa5f/0x1840 net/ipv4/ip_output.c:229
 ip_finish_output+0x828/0xf80 net/ipv4/ip_output.c:317
 NF_HOOK_COND include/linux/netfilter.h:277 [inline]
 ip_output+0x21b/0x850 net/ipv4/ip_output.c:405
 dst_output include/net/dst.h:444 [inline]
 ip_local_out+0xc5/0x1b0 net/ipv4/ip_output.c:124
 ip_send_skb+0x40/0xe0 net/ipv4/ip_output.c:1434
 udp_send_skb.isra.37+0x5eb/0x1000 net/ipv4/udp.c:825
 udp_push_pending_frames+0x5c/0xf0 net/ipv4/udp.c:853
 udp_v6_push_pending_frames+0x380/0x3e0 net/ipv6/udp.c:1105
 udp_lib_setsockopt+0x59a/0x600 net/ipv4/udp.c:2403
 udpv6_setsockopt+0x95/0xa0 net/ipv6/udp.c:1447
 sock_common_setsockopt+0x9a/0xe0 net/core/sock.c:3046
 __sys_setsockopt+0x1bd/0x390 net/socket.c:1903
 __do_sys_setsockopt net/socket.c:1914 [inline]
 __se_sys_setsockopt net/socket.c:1911 [inline]
 __x64_sys_setsockopt+0xbe/0x150 net/socket.c:1911
 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Fixes: ad405857b174 ("udp: better wmem accounting on gso")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp_offload.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index ede2a7305b90..92dc9e5a7ff3 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -268,9 +268,17 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 		uh->check = gso_make_checksum(seg, ~check) ? : CSUM_MANGLED_0;
 
 	/* update refcount for the packet */
-	if (copy_dtor)
-		refcount_add(sum_truesize - gso_skb->truesize,
-			     &sk->sk_wmem_alloc);
+	if (copy_dtor) {
+		int delta = sum_truesize - gso_skb->truesize;
+
+		/* In some pathological cases, delta can be negative.
+		 * We need to either use refcount_add() or refcount_sub_and_test()
+		 */
+		if (likely(delta >= 0))
+			refcount_add(delta, &sk->sk_wmem_alloc);
+		else
+			WARN_ON_ONCE(refcount_sub_and_test(-delta, &sk->sk_wmem_alloc));
+	}
 	return segs;
 }
 EXPORT_SYMBOL_GPL(__udp_gso_segment);
-- 
cgit v1.2.3-59-g8ed1b


From fc74ecbc293cba27fd8adee7a18fbda2f8d9e285 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Fri, 11 May 2018 11:57:30 +0300
Subject: mlxsw: spectrum_span: Rename misnamed variable l3edev

Calling the variable l3edev was relevant when neighbor lookup was the
last stage in the simulated pipeline. Now that mlxsw handles bridges and
vlan devices as well, calling it "L3" is a misnomer.

Thus in mlxsw_sp_span_dmac(), rename to "dev", because that function is
just a service routine where the distinction between tunnel and egress
device isn't necessary.

In mlxsw_sp_span_entry_tunnel_parms_common(), rename to "edev" to
emphasize that the routine traces packet egress.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_span.c    | 32 +++++++++++-----------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
index 3b77990df599..adf1f789b166 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
@@ -137,14 +137,14 @@ struct mlxsw_sp_span_entry_ops mlxsw_sp_span_entry_ops_phys = {
 
 static int mlxsw_sp_span_dmac(struct neigh_table *tbl,
 			      const void *pkey,
-			      struct net_device *l3edev,
+			      struct net_device *dev,
 			      unsigned char dmac[ETH_ALEN])
 {
-	struct neighbour *neigh = neigh_lookup(tbl, pkey, l3edev);
+	struct neighbour *neigh = neigh_lookup(tbl, pkey, dev);
 	int err = 0;
 
 	if (!neigh) {
-		neigh = neigh_create(tbl, pkey, l3edev);
+		neigh = neigh_create(tbl, pkey, dev);
 		if (IS_ERR(neigh))
 			return PTR_ERR(neigh);
 	}
@@ -246,7 +246,7 @@ mlxsw_sp_span_entry_vlan(const struct net_device *vlan_dev,
 }
 
 static __maybe_unused int
-mlxsw_sp_span_entry_tunnel_parms_common(struct net_device *l3edev,
+mlxsw_sp_span_entry_tunnel_parms_common(struct net_device *edev,
 					union mlxsw_sp_l3addr saddr,
 					union mlxsw_sp_l3addr daddr,
 					union mlxsw_sp_l3addr gw,
@@ -260,31 +260,31 @@ mlxsw_sp_span_entry_tunnel_parms_common(struct net_device *l3edev,
 	if (mlxsw_sp_l3addr_is_zero(gw))
 		gw = daddr;
 
-	if (!l3edev || mlxsw_sp_span_dmac(tbl, &gw, l3edev, dmac))
+	if (!edev || mlxsw_sp_span_dmac(tbl, &gw, edev, dmac))
 		goto unoffloadable;
 
-	if (is_vlan_dev(l3edev))
-		l3edev = mlxsw_sp_span_entry_vlan(l3edev, &vid);
+	if (is_vlan_dev(edev))
+		edev = mlxsw_sp_span_entry_vlan(edev, &vid);
 
-	if (netif_is_bridge_master(l3edev)) {
-		l3edev = mlxsw_sp_span_entry_bridge(l3edev, dmac, &vid);
-		if (!l3edev)
+	if (netif_is_bridge_master(edev)) {
+		edev = mlxsw_sp_span_entry_bridge(edev, dmac, &vid);
+		if (!edev)
 			goto unoffloadable;
 	}
 
-	if (is_vlan_dev(l3edev)) {
-		if (vid || !(l3edev->flags & IFF_UP))
+	if (is_vlan_dev(edev)) {
+		if (vid || !(edev->flags & IFF_UP))
 			goto unoffloadable;
-		l3edev = mlxsw_sp_span_entry_vlan(l3edev, &vid);
+		edev = mlxsw_sp_span_entry_vlan(edev, &vid);
 	}
 
-	if (!mlxsw_sp_port_dev_check(l3edev))
+	if (!mlxsw_sp_port_dev_check(edev))
 		goto unoffloadable;
 
-	sparmsp->dest_port = netdev_priv(l3edev);
+	sparmsp->dest_port = netdev_priv(edev);
 	sparmsp->ttl = ttl;
 	memcpy(sparmsp->dmac, dmac, ETH_ALEN);
-	memcpy(sparmsp->smac, l3edev->dev_addr, ETH_ALEN);
+	memcpy(sparmsp->smac, edev->dev_addr, ETH_ALEN);
 	sparmsp->saddr = saddr;
 	sparmsp->daddr = daddr;
 	sparmsp->vid = vid;
-- 
cgit v1.2.3-59-g8ed1b


From c41c0dd7a641cb28f7239e7086419741af78c448 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Fri, 11 May 2018 11:57:31 +0300
Subject: mlxsw: spectrum_span: Use a more fitting error code

ENOENT is suitable when an item is looked for in a collection and can't
be found. The failure here is actually a depletion of a resource, where
ENOBUFS is the more fitting error code.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
index adf1f789b166..e5f4f7620ab7 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
@@ -924,7 +924,7 @@ int mlxsw_sp_span_mirror_add(struct mlxsw_sp_port *from,
 
 	span_entry = mlxsw_sp_span_entry_get(mlxsw_sp, to_dev, ops, sparms);
 	if (!span_entry)
-		return -ENOENT;
+		return -ENOBUFS;
 
 	netdev_dbg(from->dev, "Adding inspected port to SPAN entry %d\n",
 		   span_entry->id);
-- 
cgit v1.2.3-59-g8ed1b


From d5db21a3e6977dcb42cee3d16cd69901fa66510a Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Fri, 11 May 2018 05:49:47 -0700
Subject: erspan: auto detect truncated ipv6 packets.

Currently the truncated bit is set only when 1) the mirrored packet
is larger than mtu and 2) the ipv4 packet tot_len is larger than
the actual skb->len.  This patch adds another case for detecting
whether ipv6 packet is truncated or not, by checking the ipv6 header
payload_len and the skb->len.

Reported-by: Xiaoyan Jin <xiaoyanj@vmware.com>
Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c  | 6 ++++++
 net/ipv6/ip6_gre.c | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index dfe5b22f6ed4..2409e648454d 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -579,6 +579,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 	int version;
 	__be16 df;
 	int nhoff;
+	int thoff;
 
 	tun_info = skb_tunnel_info(skb);
 	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
@@ -611,6 +612,11 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 	    (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
 		truncate = true;
 
+	thoff = skb_transport_header(skb) - skb_mac_header(skb);
+	if (skb->protocol == htons(ETH_P_IPV6) &&
+	    (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff))
+		truncate = true;
+
 	if (version == 1) {
 		erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)),
 				    ntohl(md->u.index), truncate, true);
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index b511818b268c..bede77f24784 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -897,6 +897,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 	int err = -EINVAL;
 	__u32 mtu;
 	int nhoff;
+	int thoff;
 
 	if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr))
 		goto tx_err;
@@ -914,6 +915,11 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 	    (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
 		truncate = true;
 
+	thoff = skb_transport_header(skb) - skb_mac_header(skb);
+	if (skb->protocol == htons(ETH_P_IPV6) &&
+	    (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff))
+		truncate = true;
+
 	if (skb_cow_head(skb, dev->needed_headroom))
 		goto tx_err;
 
-- 
cgit v1.2.3-59-g8ed1b


From dbdc8a21759959af2c4a702cacbb94f9b1864e73 Mon Sep 17 00:00:00 2001
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Date: Fri, 11 May 2018 02:53:10 -0700
Subject: bonding: replace the return value type

The method ndo_start_xmit is defined as returning a
netdev_tx_t, which is a typedef for an enum type,
but the implementation in this driver returns an int.

Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_alb.c  |  8 ++++----
 drivers/net/bonding/bond_main.c | 12 ++++++++----
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 1ed9529e7bd1..601f67872e58 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1316,8 +1316,8 @@ void bond_alb_deinitialize(struct bonding *bond)
 		rlb_deinitialize(bond);
 }
 
-static int bond_do_alb_xmit(struct sk_buff *skb, struct bonding *bond,
-			    struct slave *tx_slave)
+static netdev_tx_t bond_do_alb_xmit(struct sk_buff *skb, struct bonding *bond,
+				    struct slave *tx_slave)
 {
 	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
 	struct ethhdr *eth_data = eth_hdr(skb);
@@ -1351,7 +1351,7 @@ out:
 	return NETDEV_TX_OK;
 }
 
-int bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
+netdev_tx_t bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct ethhdr *eth_data;
@@ -1389,7 +1389,7 @@ int bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 	return bond_do_alb_xmit(skb, bond, tx_slave);
 }
 
-int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
+netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct ethhdr *eth_data;
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 718e4914e3a0..01bca863e7d8 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3805,7 +3805,8 @@ static u32 bond_rr_gen_slave_id(struct bonding *bond)
 	return slave_id;
 }
 
-static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev)
+static netdev_tx_t bond_xmit_roundrobin(struct sk_buff *skb,
+					struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct iphdr *iph = ip_hdr(skb);
@@ -3841,7 +3842,8 @@ static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev
 /* In active-backup mode, we know that bond->curr_active_slave is always valid if
  * the bond has a usable interface.
  */
-static int bond_xmit_activebackup(struct sk_buff *skb, struct net_device *bond_dev)
+static netdev_tx_t bond_xmit_activebackup(struct sk_buff *skb,
+					  struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct slave *slave;
@@ -3979,7 +3981,8 @@ out:
  * usable slave array is formed in the control path. The xmit function
  * just calculates hash and sends the packet out.
  */
-static int bond_3ad_xor_xmit(struct sk_buff *skb, struct net_device *dev)
+static netdev_tx_t bond_3ad_xor_xmit(struct sk_buff *skb,
+				     struct net_device *dev)
 {
 	struct bonding *bond = netdev_priv(dev);
 	struct slave *slave;
@@ -3999,7 +4002,8 @@ static int bond_3ad_xor_xmit(struct sk_buff *skb, struct net_device *dev)
 }
 
 /* in broadcast mode, we send everything to all usable interfaces. */
-static int bond_xmit_broadcast(struct sk_buff *skb, struct net_device *bond_dev)
+static netdev_tx_t bond_xmit_broadcast(struct sk_buff *skb,
+				       struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct slave *slave = NULL;
-- 
cgit v1.2.3-59-g8ed1b


From ae35c6f7a5e78c3bcf1b468a3d352fa0021a22bb Mon Sep 17 00:00:00 2001
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Date: Fri, 11 May 2018 02:53:11 -0700
Subject: bonding: use the skb_get/set_queue_mapping

Use the skb_get_queue_mapping, skb_set_queue_mapping
and skb_rx_queue_recorded for skb queue_mapping in bonding
driver, but not use it directly.

Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 01bca863e7d8..966d09183744 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -247,7 +247,7 @@ void bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb,
 
 	BUILD_BUG_ON(sizeof(skb->queue_mapping) !=
 		     sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping));
-	skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
+	skb_set_queue_mapping(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping);
 
 	if (unlikely(netpoll_tx_running(bond->dev)))
 		bond_netpoll_send_skb(bond_get_slave_by_dev(bond, slave_dev), skb);
@@ -4040,12 +4040,12 @@ static inline int bond_slave_override(struct bonding *bond,
 	struct slave *slave = NULL;
 	struct list_head *iter;
 
-	if (!skb->queue_mapping)
+	if (!skb_rx_queue_recorded(skb))
 		return 1;
 
 	/* Find out if any slaves have the same mapping as this skb. */
 	bond_for_each_slave_rcu(bond, slave, iter) {
-		if (slave->queue_id == skb->queue_mapping) {
+		if (slave->queue_id == skb_get_queue_mapping(skb)) {
 			if (bond_slave_is_up(slave) &&
 			    slave->link == BOND_LINK_UP) {
 				bond_dev_queue_xmit(bond, skb, slave->dev);
@@ -4071,7 +4071,7 @@ static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb,
 	u16 txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0;
 
 	/* Save the original txq to restore before passing to the driver */
-	qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping;
+	qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb_get_queue_mapping(skb);
 
 	if (unlikely(txq >= dev->real_num_tx_queues)) {
 		do {
-- 
cgit v1.2.3-59-g8ed1b


From 804c1466c76717f48ee2e0f2e78ed24810a9818e Mon Sep 17 00:00:00 2001
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Date: Fri, 11 May 2018 02:53:12 -0700
Subject: net: doc: fix spelling mistake: "modrobe.d" -> "modprobe.d"

Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/bonding.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index 9ba04c0bab8d..c13214d073a4 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -140,7 +140,7 @@ bonding module at load time, or are specified via sysfs.
 
 	Module options may be given as command line arguments to the
 insmod or modprobe command, but are usually specified in either the
-/etc/modrobe.d/*.conf configuration files, or in a distro-specific
+/etc/modprobe.d/*.conf configuration files, or in a distro-specific
 configuration file (some of which are detailed in the next section).
 
 	Details on bonding support for sysfs is provided in the
-- 
cgit v1.2.3-59-g8ed1b


From f4a313b9713933af5c7928dbb6bedc8077ade572 Mon Sep 17 00:00:00 2001
From: Ganesh Goudar <ganeshgr@chelsio.com>
Date: Fri, 11 May 2018 18:37:34 +0530
Subject: cxgb4: Add new T5 device id

Add 0x50ad device id for new T5 card.

Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h
index 90b5274a8ba1..adacc6399131 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h
@@ -187,6 +187,7 @@ CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN
 	CH_PCI_ID_TABLE_FENTRY(0x50aa), /* Custom T580-CR */
 	CH_PCI_ID_TABLE_FENTRY(0x50ab), /* Custom T520-CR */
 	CH_PCI_ID_TABLE_FENTRY(0x50ac), /* Custom T540-BT */
+	CH_PCI_ID_TABLE_FENTRY(0x50ad), /* Custom T520-CR */
 
 	/* T6 adapters:
 	 */
-- 
cgit v1.2.3-59-g8ed1b


From 0e249898cac811f833bdb4701b32385c1c272da5 Mon Sep 17 00:00:00 2001
From: Arjun Vynipadath <arjun@chelsio.com>
Date: Fri, 11 May 2018 18:34:43 +0530
Subject: cxgb4: Fix {vxlan/geneve}_port initialization

adapter->rawf_cnt was not initialized, thereby
ndo_udp_tunnel_{add/del} was returning immediately
without initializing {vxlan/geneve}_port.
Also initializes mps_encap_entry refcnt.

Fixes: 846eac3fccec ("cxgb4: implement udp tunnel callbacks")
Signed-off-by: Arjun Vynipadath <arjun@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 21 +++++++++++++++++++++
 drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h   |  2 ++
 2 files changed, 23 insertions(+)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 1efcc85692f7..03767c0bfd5c 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -4276,6 +4276,20 @@ static int adap_init0(struct adapter *adap)
 	adap->tids.nftids = val[4] - val[3] + 1;
 	adap->sge.ingr_start = val[5];
 
+	if (CHELSIO_CHIP_VERSION(adap->params.chip) > CHELSIO_T5) {
+		/* Read the raw mps entries. In T6, the last 2 tcam entries
+		 * are reserved for raw mac addresses (rawf = 2, one per port).
+		 */
+		params[0] = FW_PARAM_PFVF(RAWF_START);
+		params[1] = FW_PARAM_PFVF(RAWF_END);
+		ret = t4_query_params(adap, adap->mbox, adap->pf, 0, 2,
+				      params, val);
+		if (ret == 0) {
+			adap->rawf_start = val[0];
+			adap->rawf_cnt = val[1] - val[0] + 1;
+		}
+	}
+
 	/* qids (ingress/egress) returned from firmware can be anywhere
 	 * in the range from EQ(IQFLINT)_START to EQ(IQFLINT)_END.
 	 * Hence driver needs to allocate memory for this range to
@@ -5181,6 +5195,7 @@ static void free_some_resources(struct adapter *adapter)
 {
 	unsigned int i;
 
+	kvfree(adapter->mps_encap);
 	kvfree(adapter->smt);
 	kvfree(adapter->l2t);
 	kvfree(adapter->srq);
@@ -5677,6 +5692,12 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 		adapter->params.offload = 0;
 	}
 
+	adapter->mps_encap = kvzalloc(sizeof(struct mps_encap_entry) *
+					  adapter->params.arch.mps_tcam_size,
+				      GFP_KERNEL);
+	if (!adapter->mps_encap)
+		dev_warn(&pdev->dev, "could not allocate MPS Encap entries, continuing\n");
+
 #if IS_ENABLED(CONFIG_IPV6)
 	if ((CHELSIO_CHIP_VERSION(adapter->params.chip) <= CHELSIO_T5) &&
 	    (!(t4_read_reg(adapter, LE_DB_CONFIG_A) & ASLIPCOMPEN_F))) {
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
index e3d4751f21ac..0e007ee72318 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
@@ -1305,6 +1305,8 @@ enum fw_params_param_pfvf {
 	FW_PARAMS_PARAM_PFVF_HPFILTER_END = 0x33,
 	FW_PARAMS_PARAM_PFVF_TLS_START = 0x34,
 	FW_PARAMS_PARAM_PFVF_TLS_END = 0x35,
+	FW_PARAMS_PARAM_PFVF_RAWF_START = 0x36,
+	FW_PARAMS_PARAM_PFVF_RAWF_END = 0x37,
 	FW_PARAMS_PARAM_PFVF_NCRYPTO_LOOKASIDE = 0x39,
 	FW_PARAMS_PARAM_PFVF_PORT_CAPS32 = 0x3A,
 };
-- 
cgit v1.2.3-59-g8ed1b


From c50ae55e41ee226b96cf204cd1409bc0057b484e Mon Sep 17 00:00:00 2001
From: Ganesh Goudar <ganeshgr@chelsio.com>
Date: Fri, 11 May 2018 18:35:33 +0530
Subject: cxgb4: enable inner header checksum calculation

set cntrl bits to indicate whether inner header checksum
needs to be calculated whenever the packet is an encapsulated
packet and enable supported encap features.

Fixes: d0a1299c6bf7 ("cxgb4: add support for vxlan segmentation offload")
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c |  9 ++-
 drivers/net/ethernet/chelsio/cxgb4/sge.c        | 91 +++++++++++++++++++------
 drivers/net/ethernet/chelsio/cxgb4/t4_msg.h     |  5 ++
 3 files changed, 83 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 03767c0bfd5c..1e31b9dfffee 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -5626,8 +5626,15 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 			NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX |
 			NETIF_F_HW_TC;
 
-		if (CHELSIO_CHIP_VERSION(chip) > CHELSIO_T5)
+		if (CHELSIO_CHIP_VERSION(chip) > CHELSIO_T5) {
+			netdev->hw_enc_features |= NETIF_F_IP_CSUM |
+						   NETIF_F_IPV6_CSUM |
+						   NETIF_F_RXCSUM |
+						   NETIF_F_GSO_UDP_TUNNEL |
+						   NETIF_F_TSO | NETIF_F_TSO6;
+
 			netdev->hw_features |= NETIF_F_GSO_UDP_TUNNEL;
+		}
 
 		if (highdma)
 			netdev->hw_features |= NETIF_F_HIGHDMA;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c
index 1a28df137e1f..0f87e973a158 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c
@@ -1072,12 +1072,27 @@ static void *inline_tx_skb_header(const struct sk_buff *skb,
 static u64 hwcsum(enum chip_type chip, const struct sk_buff *skb)
 {
 	int csum_type;
-	const struct iphdr *iph = ip_hdr(skb);
+	bool inner_hdr_csum = false;
+	u16 proto, ver;
 
-	if (iph->version == 4) {
-		if (iph->protocol == IPPROTO_TCP)
+	if (skb->encapsulation &&
+	    (CHELSIO_CHIP_VERSION(chip) > CHELSIO_T5))
+		inner_hdr_csum = true;
+
+	if (inner_hdr_csum) {
+		ver = inner_ip_hdr(skb)->version;
+		proto = (ver == 4) ? inner_ip_hdr(skb)->protocol :
+			inner_ipv6_hdr(skb)->nexthdr;
+	} else {
+		ver = ip_hdr(skb)->version;
+		proto = (ver == 4) ? ip_hdr(skb)->protocol :
+			ipv6_hdr(skb)->nexthdr;
+	}
+
+	if (ver == 4) {
+		if (proto == IPPROTO_TCP)
 			csum_type = TX_CSUM_TCPIP;
-		else if (iph->protocol == IPPROTO_UDP)
+		else if (proto == IPPROTO_UDP)
 			csum_type = TX_CSUM_UDPIP;
 		else {
 nocsum:			/*
@@ -1090,19 +1105,29 @@ nocsum:			/*
 		/*
 		 * this doesn't work with extension headers
 		 */
-		const struct ipv6hdr *ip6h = (const struct ipv6hdr *)iph;
-
-		if (ip6h->nexthdr == IPPROTO_TCP)
+		if (proto == IPPROTO_TCP)
 			csum_type = TX_CSUM_TCPIP6;
-		else if (ip6h->nexthdr == IPPROTO_UDP)
+		else if (proto == IPPROTO_UDP)
 			csum_type = TX_CSUM_UDPIP6;
 		else
 			goto nocsum;
 	}
 
 	if (likely(csum_type >= TX_CSUM_TCPIP)) {
-		u64 hdr_len = TXPKT_IPHDR_LEN_V(skb_network_header_len(skb));
-		int eth_hdr_len = skb_network_offset(skb) - ETH_HLEN;
+		int eth_hdr_len, l4_len;
+		u64 hdr_len;
+
+		if (inner_hdr_csum) {
+			/* This allows checksum offload for all encapsulated
+			 * packets like GRE etc..
+			 */
+			l4_len = skb_inner_network_header_len(skb);
+			eth_hdr_len = skb_inner_network_offset(skb) - ETH_HLEN;
+		} else {
+			l4_len = skb_network_header_len(skb);
+			eth_hdr_len = skb_network_offset(skb) - ETH_HLEN;
+		}
+		hdr_len = TXPKT_IPHDR_LEN_V(l4_len);
 
 		if (CHELSIO_CHIP_VERSION(chip) <= CHELSIO_T5)
 			hdr_len |= TXPKT_ETHHDR_LEN_V(eth_hdr_len);
@@ -1273,7 +1298,7 @@ static inline void t6_fill_tnl_lso(struct sk_buff *skb,
 netdev_tx_t t4_eth_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	u32 wr_mid, ctrl0, op;
-	u64 cntrl, *end;
+	u64 cntrl, *end, *sgl;
 	int qidx, credits;
 	unsigned int flits, ndesc;
 	struct adapter *adap;
@@ -1443,6 +1468,19 @@ out_free:	dev_kfree_skb_any(skb);
 				 TX_CSUM_TCPIP6 : TX_CSUM_TCPIP) |
 				 TXPKT_IPHDR_LEN_V(l3hdr_len);
 		}
+		sgl = (u64 *)(cpl + 1); /* sgl start here */
+		if (unlikely((u8 *)sgl >= (u8 *)q->q.stat)) {
+			/* If current position is already at the end of the
+			 * txq, reset the current to point to start of the queue
+			 * and update the end ptr as well.
+			 */
+			if (sgl == (u64 *)q->q.stat) {
+				int left = (u8 *)end - (u8 *)q->q.stat;
+
+				end = (void *)q->q.desc + left;
+				sgl = (void *)q->q.desc;
+			}
+		}
 		q->tso++;
 		q->tx_cso += ssi->gso_segs;
 	} else {
@@ -1454,6 +1492,7 @@ out_free:	dev_kfree_skb_any(skb);
 		wr->op_immdlen = htonl(FW_WR_OP_V(op) |
 				       FW_WR_IMMDLEN_V(len));
 		cpl = (void *)(wr + 1);
+		sgl = (u64 *)(cpl + 1);
 		if (skb->ip_summed == CHECKSUM_PARTIAL) {
 			cntrl = hwcsum(adap->params.chip, skb) |
 				TXPKT_IPCSUM_DIS_F;
@@ -1487,13 +1526,12 @@ out_free:	dev_kfree_skb_any(skb);
 	cpl->ctrl1 = cpu_to_be64(cntrl);
 
 	if (immediate) {
-		cxgb4_inline_tx_skb(skb, &q->q, cpl + 1);
+		cxgb4_inline_tx_skb(skb, &q->q, sgl);
 		dev_consume_skb_any(skb);
 	} else {
 		int last_desc;
 
-		cxgb4_write_sgl(skb, &q->q, (struct ulptx_sgl *)(cpl + 1),
-				end, 0, addr);
+		cxgb4_write_sgl(skb, &q->q, (void *)sgl, end, 0, addr);
 		skb_orphan(skb);
 
 		last_desc = q->q.pidx + ndesc - 1;
@@ -2259,7 +2297,7 @@ static void cxgb4_sgetim_to_hwtstamp(struct adapter *adap,
 }
 
 static void do_gro(struct sge_eth_rxq *rxq, const struct pkt_gl *gl,
-		   const struct cpl_rx_pkt *pkt)
+		   const struct cpl_rx_pkt *pkt, unsigned long tnl_hdr_len)
 {
 	struct adapter *adapter = rxq->rspq.adap;
 	struct sge *s = &adapter->sge;
@@ -2275,6 +2313,8 @@ static void do_gro(struct sge_eth_rxq *rxq, const struct pkt_gl *gl,
 	}
 
 	copy_frags(skb, gl, s->pktshift);
+	if (tnl_hdr_len)
+		skb->csum_level = 1;
 	skb->len = gl->tot_len - s->pktshift;
 	skb->data_len = skb->len;
 	skb->truesize += skb->data_len;
@@ -2406,7 +2446,7 @@ int t4_ethrx_handler(struct sge_rspq *q, const __be64 *rsp,
 	struct sge *s = &q->adap->sge;
 	int cpl_trace_pkt = is_t4(q->adap->params.chip) ?
 			    CPL_TRACE_PKT : CPL_TRACE_PKT_T5;
-	u16 err_vec;
+	u16 err_vec, tnl_hdr_len = 0;
 	struct port_info *pi;
 	int ret = 0;
 
@@ -2415,16 +2455,19 @@ int t4_ethrx_handler(struct sge_rspq *q, const __be64 *rsp,
 
 	pkt = (const struct cpl_rx_pkt *)rsp;
 	/* Compressed error vector is enabled for T6 only */
-	if (q->adap->params.tp.rx_pkt_encap)
+	if (q->adap->params.tp.rx_pkt_encap) {
 		err_vec = T6_COMPR_RXERR_VEC_G(be16_to_cpu(pkt->err_vec));
-	else
+		tnl_hdr_len = T6_RX_TNLHDR_LEN_G(ntohs(pkt->err_vec));
+	} else {
 		err_vec = be16_to_cpu(pkt->err_vec);
+	}
 
 	csum_ok = pkt->csum_calc && !err_vec &&
 		  (q->netdev->features & NETIF_F_RXCSUM);
-	if ((pkt->l2info & htonl(RXF_TCP_F)) &&
+	if (((pkt->l2info & htonl(RXF_TCP_F)) ||
+	     tnl_hdr_len) &&
 	    (q->netdev->features & NETIF_F_GRO) && csum_ok && !pkt->ip_frag) {
-		do_gro(rxq, si, pkt);
+		do_gro(rxq, si, pkt, tnl_hdr_len);
 		return 0;
 	}
 
@@ -2471,7 +2514,13 @@ int t4_ethrx_handler(struct sge_rspq *q, const __be64 *rsp,
 		} else if (pkt->l2info & htonl(RXF_IP_F)) {
 			__sum16 c = (__force __sum16)pkt->csum;
 			skb->csum = csum_unfold(c);
-			skb->ip_summed = CHECKSUM_COMPLETE;
+
+			if (tnl_hdr_len) {
+				skb->ip_summed = CHECKSUM_UNNECESSARY;
+				skb->csum_level = 1;
+			} else {
+				skb->ip_summed = CHECKSUM_COMPLETE;
+			}
 			rxq->stats.rx_cso++;
 		}
 	} else {
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
index fe2029e993a2..09e38f0733bd 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
@@ -1233,6 +1233,11 @@ struct cpl_rx_pkt {
 #define T6_COMPR_RXERR_SUM_V(x) ((x) << T6_COMPR_RXERR_SUM_S)
 #define T6_COMPR_RXERR_SUM_F    T6_COMPR_RXERR_SUM_V(1U)
 
+#define T6_RX_TNLHDR_LEN_S    8
+#define T6_RX_TNLHDR_LEN_M    0xFF
+#define T6_RX_TNLHDR_LEN_V(x) ((x) << T6_RX_TNLHDR_LEN_S)
+#define T6_RX_TNLHDR_LEN_G(x) (((x) >> T6_RX_TNLHDR_LEN_S) & T6_RX_TNLHDR_LEN_M)
+
 struct cpl_trace_pkt {
 	u8 opcode;
 	u8 intf;
-- 
cgit v1.2.3-59-g8ed1b


From 443e2dab32a59b99730b999909718ed83fee4725 Mon Sep 17 00:00:00 2001
From: Ganesh Goudar <ganeshgr@chelsio.com>
Date: Fri, 11 May 2018 18:36:16 +0530
Subject: cxgb4: avoid schedule while atomic

do not sleep while adding or deleting udp tunnel.

Fixes: 846eac3fccec ("cxgb4: implement udp tunnel callbacks")
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 1e31b9dfffee..3e4c533c1622 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -3081,7 +3081,7 @@ static void cxgb_del_udp_tunnel(struct net_device *netdev,
 					   match_all_mac, match_all_mac,
 					   adapter->rawf_start +
 					    pi->port_id,
-					   1, pi->port_id, true);
+					   1, pi->port_id, false);
 		if (ret < 0) {
 			netdev_info(netdev, "Failed to free mac filter entry, for port %d\n",
 				    i);
@@ -3169,7 +3169,7 @@ static void cxgb_add_udp_tunnel(struct net_device *netdev,
 					    match_all_mac,
 					    adapter->rawf_start +
 					    pi->port_id,
-					    1, pi->port_id, true);
+					    1, pi->port_id, false);
 		if (ret < 0) {
 			netdev_info(netdev, "Failed to allocate a mac filter entry, not adding port %d\n",
 				    be16_to_cpu(ti->port));
-- 
cgit v1.2.3-59-g8ed1b


From b753a9faaf9aef1338c28ebd9ace6d749428788b Mon Sep 17 00:00:00 2001
From: Dan Murphy <dmurphy@ti.com>
Date: Fri, 11 May 2018 13:08:19 -0500
Subject: net: phy: DP83TC811: Introduce support for the DP83TC811 phy

Add support for the DP83811 phy.

The DP83811 supports both rgmii and sgmii interfaces.
There are 2 part numbers for this the DP83TC811R does not
reliably support the SGMII interface but the DP83TC811S will.

There is not a way to differentiate these parts from the
hardware or register set.  So this is controlled via the DT
to indicate which phy mode is required.  Or the part can be
strapped to a certain interface.

Data sheet can be found here:
http://www.ti.com/product/DP83TC811S-Q1/description
http://www.ti.com/product/DP83TC811R-Q1/description

Signed-off-by: Dan Murphy <dmurphy@ti.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/Kconfig     |   5 +
 drivers/net/phy/Makefile    |   1 +
 drivers/net/phy/dp83tc811.c | 347 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 353 insertions(+)
 create mode 100644 drivers/net/phy/dp83tc811.c

diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index ae2d69170dc3..19a5778a7ec7 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -291,6 +291,11 @@ config DP83822_PHY
 	---help---
 	  Supports the DP83822 PHY.
 
+config DP83TC811_PHY
+	tristate "Texas Instruments DP83TC822 PHY"
+	---help---
+	  Supports the DP83TC822 PHY.
+
 config DP83848_PHY
 	tristate "Texas Instruments DP83848 PHY"
 	---help---
diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile
index 22183b9d7e08..738246960e3b 100644
--- a/drivers/net/phy/Makefile
+++ b/drivers/net/phy/Makefile
@@ -58,6 +58,7 @@ obj-$(CONFIG_CORTINA_PHY)	+= cortina.o
 obj-$(CONFIG_DAVICOM_PHY)	+= davicom.o
 obj-$(CONFIG_DP83640_PHY)	+= dp83640.o
 obj-$(CONFIG_DP83822_PHY)	+= dp83822.o
+obj-$(CONFIG_DP83TC811_PHY)	+= dp83tc811.o
 obj-$(CONFIG_DP83848_PHY)	+= dp83848.o
 obj-$(CONFIG_DP83867_PHY)	+= dp83867.o
 obj-$(CONFIG_FIXED_PHY)		+= fixed_phy.o
diff --git a/drivers/net/phy/dp83tc811.c b/drivers/net/phy/dp83tc811.c
new file mode 100644
index 000000000000..081d99aa3985
--- /dev/null
+++ b/drivers/net/phy/dp83tc811.c
@@ -0,0 +1,347 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Driver for the Texas Instruments DP83TC811 PHY
+ *
+ * Copyright (C) 2018 Texas Instruments Incorporated - http://www.ti.com/
+ *
+ */
+
+#include <linux/ethtool.h>
+#include <linux/etherdevice.h>
+#include <linux/kernel.h>
+#include <linux/mii.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/phy.h>
+#include <linux/netdevice.h>
+
+#define DP83TC811_PHY_ID	0x2000a253
+#define DP83811_DEVADDR		0x1f
+
+#define MII_DP83811_SGMII_CTRL	0x09
+#define MII_DP83811_INT_STAT1	0x12
+#define MII_DP83811_INT_STAT2	0x13
+#define MII_DP83811_RESET_CTRL	0x1f
+
+#define DP83811_HW_RESET	BIT(15)
+#define DP83811_SW_RESET	BIT(14)
+
+/* INT_STAT1 bits */
+#define DP83811_RX_ERR_HF_INT_EN	BIT(0)
+#define DP83811_MS_TRAINING_INT_EN	BIT(1)
+#define DP83811_ANEG_COMPLETE_INT_EN	BIT(2)
+#define DP83811_ESD_EVENT_INT_EN	BIT(3)
+#define DP83811_WOL_INT_EN		BIT(4)
+#define DP83811_LINK_STAT_INT_EN	BIT(5)
+#define DP83811_ENERGY_DET_INT_EN	BIT(6)
+#define DP83811_LINK_QUAL_INT_EN	BIT(7)
+
+/* INT_STAT2 bits */
+#define DP83811_JABBER_DET_INT_EN	BIT(0)
+#define DP83811_POLARITY_INT_EN		BIT(1)
+#define DP83811_SLEEP_MODE_INT_EN	BIT(2)
+#define DP83811_OVERTEMP_INT_EN		BIT(3)
+#define DP83811_OVERVOLTAGE_INT_EN	BIT(6)
+#define DP83811_UNDERVOLTAGE_INT_EN	BIT(7)
+
+#define MII_DP83811_RXSOP1	0x04a5
+#define MII_DP83811_RXSOP2	0x04a6
+#define MII_DP83811_RXSOP3	0x04a7
+
+/* WoL Registers */
+#define MII_DP83811_WOL_CFG	0x04a0
+#define MII_DP83811_WOL_STAT	0x04a1
+#define MII_DP83811_WOL_DA1	0x04a2
+#define MII_DP83811_WOL_DA2	0x04a3
+#define MII_DP83811_WOL_DA3	0x04a4
+
+/* WoL bits */
+#define DP83811_WOL_MAGIC_EN	BIT(0)
+#define DP83811_WOL_SECURE_ON	BIT(5)
+#define DP83811_WOL_EN		BIT(7)
+#define DP83811_WOL_INDICATION_SEL BIT(8)
+#define DP83811_WOL_CLR_INDICATION BIT(11)
+
+/* SGMII CTRL bits */
+#define DP83811_TDR_AUTO		BIT(8)
+#define DP83811_SGMII_EN		BIT(12)
+#define DP83811_SGMII_AUTO_NEG_EN	BIT(13)
+#define DP83811_SGMII_TX_ERR_DIS	BIT(14)
+#define DP83811_SGMII_SOFT_RESET	BIT(15)
+
+static int dp83811_ack_interrupt(struct phy_device *phydev)
+{
+	int err;
+
+	err = phy_read(phydev, MII_DP83811_INT_STAT1);
+	if (err < 0)
+		return err;
+
+	err = phy_read(phydev, MII_DP83811_INT_STAT2);
+	if (err < 0)
+		return err;
+
+	return 0;
+}
+
+static int dp83811_set_wol(struct phy_device *phydev,
+			   struct ethtool_wolinfo *wol)
+{
+	struct net_device *ndev = phydev->attached_dev;
+	const u8 *mac;
+	u16 value;
+
+	if (wol->wolopts & (WAKE_MAGIC | WAKE_MAGICSECURE)) {
+		mac = (const u8 *)ndev->dev_addr;
+
+		if (!is_valid_ether_addr(mac))
+			return -EINVAL;
+
+		/* MAC addresses start with byte 5, but stored in mac[0].
+		 * 811 PHYs store bytes 4|5, 2|3, 0|1
+		 */
+		phy_write_mmd(phydev, DP83811_DEVADDR, MII_DP83811_WOL_DA1,
+			      (mac[1] << 8) | mac[0]);
+		phy_write_mmd(phydev, DP83811_DEVADDR, MII_DP83811_WOL_DA2,
+			      (mac[3] << 8) | mac[2]);
+		phy_write_mmd(phydev, DP83811_DEVADDR, MII_DP83811_WOL_DA3,
+			      (mac[5] << 8) | mac[4]);
+
+		value = phy_read_mmd(phydev, DP83811_DEVADDR,
+				     MII_DP83811_WOL_CFG);
+		if (wol->wolopts & WAKE_MAGIC)
+			value |= DP83811_WOL_MAGIC_EN;
+		else
+			value &= ~DP83811_WOL_MAGIC_EN;
+
+		if (wol->wolopts & WAKE_MAGICSECURE) {
+			phy_write_mmd(phydev, DP83811_DEVADDR,
+				      MII_DP83811_RXSOP1,
+				      (wol->sopass[1] << 8) | wol->sopass[0]);
+			phy_write_mmd(phydev, DP83811_DEVADDR,
+				      MII_DP83811_RXSOP2,
+				      (wol->sopass[3] << 8) | wol->sopass[2]);
+			phy_write_mmd(phydev, DP83811_DEVADDR,
+				      MII_DP83811_RXSOP3,
+				      (wol->sopass[5] << 8) | wol->sopass[4]);
+			value |= DP83811_WOL_SECURE_ON;
+		} else {
+			value &= ~DP83811_WOL_SECURE_ON;
+		}
+
+		value |= (DP83811_WOL_EN | DP83811_WOL_INDICATION_SEL |
+			  DP83811_WOL_CLR_INDICATION);
+		phy_write_mmd(phydev, DP83811_DEVADDR, MII_DP83811_WOL_CFG,
+			      value);
+	} else {
+		value = phy_read_mmd(phydev, DP83811_DEVADDR,
+				     MII_DP83811_WOL_CFG);
+		value &= ~DP83811_WOL_EN;
+		phy_write_mmd(phydev, DP83811_DEVADDR, MII_DP83811_WOL_CFG,
+			      value);
+	}
+
+	return 0;
+}
+
+static void dp83811_get_wol(struct phy_device *phydev,
+			    struct ethtool_wolinfo *wol)
+{
+	u16 sopass_val;
+	int value;
+
+	wol->supported = (WAKE_MAGIC | WAKE_MAGICSECURE);
+	wol->wolopts = 0;
+
+	value = phy_read_mmd(phydev, DP83811_DEVADDR, MII_DP83811_WOL_CFG);
+
+	if (value & DP83811_WOL_MAGIC_EN)
+		wol->wolopts |= WAKE_MAGIC;
+
+	if (value & DP83811_WOL_SECURE_ON) {
+		sopass_val = phy_read_mmd(phydev, DP83811_DEVADDR,
+					  MII_DP83811_RXSOP1);
+		wol->sopass[0] = (sopass_val & 0xff);
+		wol->sopass[1] = (sopass_val >> 8);
+
+		sopass_val = phy_read_mmd(phydev, DP83811_DEVADDR,
+					  MII_DP83811_RXSOP2);
+		wol->sopass[2] = (sopass_val & 0xff);
+		wol->sopass[3] = (sopass_val >> 8);
+
+		sopass_val = phy_read_mmd(phydev, DP83811_DEVADDR,
+					  MII_DP83811_RXSOP3);
+		wol->sopass[4] = (sopass_val & 0xff);
+		wol->sopass[5] = (sopass_val >> 8);
+
+		wol->wolopts |= WAKE_MAGICSECURE;
+	}
+
+	/* WoL is not enabled so set wolopts to 0 */
+	if (!(value & DP83811_WOL_EN))
+		wol->wolopts = 0;
+}
+
+static int dp83811_config_intr(struct phy_device *phydev)
+{
+	int misr_status, err;
+
+	if (phydev->interrupts == PHY_INTERRUPT_ENABLED) {
+		misr_status = phy_read(phydev, MII_DP83811_INT_STAT1);
+		if (misr_status < 0)
+			return misr_status;
+
+		misr_status |= (DP83811_RX_ERR_HF_INT_EN |
+				DP83811_MS_TRAINING_INT_EN |
+				DP83811_ANEG_COMPLETE_INT_EN |
+				DP83811_ESD_EVENT_INT_EN |
+				DP83811_WOL_INT_EN |
+				DP83811_LINK_STAT_INT_EN |
+				DP83811_ENERGY_DET_INT_EN |
+				DP83811_LINK_QUAL_INT_EN);
+
+		err = phy_write(phydev, MII_DP83811_INT_STAT1, misr_status);
+		if (err < 0)
+			return err;
+
+		misr_status = phy_read(phydev, MII_DP83811_INT_STAT2);
+		if (misr_status < 0)
+			return misr_status;
+
+		misr_status |= (DP83811_JABBER_DET_INT_EN |
+				DP83811_POLARITY_INT_EN |
+				DP83811_SLEEP_MODE_INT_EN |
+				DP83811_OVERTEMP_INT_EN |
+				DP83811_OVERVOLTAGE_INT_EN |
+				DP83811_UNDERVOLTAGE_INT_EN);
+
+		err = phy_write(phydev, MII_DP83811_INT_STAT2, misr_status);
+
+	} else {
+		err = phy_write(phydev, MII_DP83811_INT_STAT1, 0);
+		if (err < 0)
+			return err;
+
+		err = phy_write(phydev, MII_DP83811_INT_STAT1, 0);
+	}
+
+	return err;
+}
+
+static int dp83811_config_aneg(struct phy_device *phydev)
+{
+	int value, err;
+
+	if (phydev->interface == PHY_INTERFACE_MODE_SGMII) {
+		value = phy_read(phydev, MII_DP83811_SGMII_CTRL);
+		if (phydev->autoneg == AUTONEG_ENABLE) {
+			err = phy_write(phydev, MII_DP83811_SGMII_CTRL,
+					(DP83811_SGMII_AUTO_NEG_EN | value));
+			if (err < 0)
+				return err;
+		} else {
+			err = phy_write(phydev, MII_DP83811_SGMII_CTRL,
+					(~DP83811_SGMII_AUTO_NEG_EN & value));
+			if (err < 0)
+				return err;
+		}
+	}
+
+	return genphy_config_aneg(phydev);
+}
+
+static int dp83811_config_init(struct phy_device *phydev)
+{
+	int value, err;
+
+	err = genphy_config_init(phydev);
+	if (err < 0)
+		return err;
+
+	if (phydev->interface == PHY_INTERFACE_MODE_SGMII) {
+		value = phy_read(phydev, MII_DP83811_SGMII_CTRL);
+		if (!(value & DP83811_SGMII_EN)) {
+			err = phy_write(phydev, MII_DP83811_SGMII_CTRL,
+					(DP83811_SGMII_EN | value));
+			if (err < 0)
+				return err;
+		} else {
+			err = phy_write(phydev, MII_DP83811_SGMII_CTRL,
+					(~DP83811_SGMII_EN & value));
+			if (err < 0)
+				return err;
+		}
+	}
+
+	value = DP83811_WOL_MAGIC_EN | DP83811_WOL_SECURE_ON | DP83811_WOL_EN;
+
+	return phy_write_mmd(phydev, DP83811_DEVADDR, MII_DP83811_WOL_CFG,
+	      value);
+}
+
+static int dp83811_phy_reset(struct phy_device *phydev)
+{
+	int err;
+
+	err = phy_write(phydev, MII_DP83811_RESET_CTRL, DP83811_HW_RESET);
+	if (err < 0)
+		return err;
+
+	return 0;
+}
+
+static int dp83811_suspend(struct phy_device *phydev)
+{
+	int value;
+
+	value = phy_read_mmd(phydev, DP83811_DEVADDR, MII_DP83811_WOL_CFG);
+
+	if (!(value & DP83811_WOL_EN))
+		genphy_suspend(phydev);
+
+	return 0;
+}
+
+static int dp83811_resume(struct phy_device *phydev)
+{
+	int value;
+
+	genphy_resume(phydev);
+
+	value = phy_read_mmd(phydev, DP83811_DEVADDR, MII_DP83811_WOL_CFG);
+
+	phy_write_mmd(phydev, DP83811_DEVADDR, MII_DP83811_WOL_CFG, value |
+		      DP83811_WOL_CLR_INDICATION);
+
+	return 0;
+}
+
+static struct phy_driver dp83811_driver[] = {
+	{
+		.phy_id = DP83TC811_PHY_ID,
+		.phy_id_mask = 0xfffffff0,
+		.name = "TI DP83TC811",
+		.features = PHY_BASIC_FEATURES,
+		.flags = PHY_HAS_INTERRUPT,
+		.config_init = dp83811_config_init,
+		.config_aneg = dp83811_config_aneg,
+		.soft_reset = dp83811_phy_reset,
+		.get_wol = dp83811_get_wol,
+		.set_wol = dp83811_set_wol,
+		.ack_interrupt = dp83811_ack_interrupt,
+		.config_intr = dp83811_config_intr,
+		.suspend = dp83811_suspend,
+		.resume = dp83811_resume,
+	 },
+};
+module_phy_driver(dp83811_driver);
+
+static struct mdio_device_id __maybe_unused dp83811_tbl[] = {
+	{ DP83TC811_PHY_ID, 0xfffffff0 },
+	{ },
+};
+MODULE_DEVICE_TABLE(mdio, dp83811_tbl);
+
+MODULE_DESCRIPTION("Texas Instruments DP83TC811 PHY driver");
+MODULE_AUTHOR("Dan Murphy <dmurphy@ti.com");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3-59-g8ed1b


From 41bd3d585da2fa480c3ded58965d9ccd2c9221e1 Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen <toke@toke.dk>
Date: Thu, 10 May 2018 14:57:35 +0200
Subject: wireless-drivers: Dynamically allocate struct station_info
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since the addition of the TXQ stats to cfg80211, the station_info struct
has grown to be quite large, which results in warnings when allocated on
the stack. Fix the affected places to do dynamic allocations instead.

Fixes: 52539ca89f36 ("cfg80211: Expose TXQ stats and parameters to userspace")
Reviewed-by: Sergey Matyukevich <sergey.matyukevich.os@quantenna.com>
Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath6kl/main.c             | 14 +++++---
 drivers/net/wireless/ath/wil6210/debugfs.c         | 22 +++++++-----
 drivers/net/wireless/ath/wil6210/wmi.c             | 19 ++++++----
 .../broadcom/brcm80211/brcmfmac/cfg80211.c         | 18 ++++++----
 drivers/net/wireless/marvell/mwifiex/uap_event.c   | 25 ++++++++-----
 drivers/net/wireless/quantenna/qtnfmac/event.c     | 41 ++++++++++++++--------
 6 files changed, 91 insertions(+), 48 deletions(-)

diff --git a/drivers/net/wireless/ath/ath6kl/main.c b/drivers/net/wireless/ath/ath6kl/main.c
index db95f85751e3..808fb30be9ad 100644
--- a/drivers/net/wireless/ath/ath6kl/main.c
+++ b/drivers/net/wireless/ath/ath6kl/main.c
@@ -426,7 +426,7 @@ void ath6kl_connect_ap_mode_sta(struct ath6kl_vif *vif, u16 aid, u8 *mac_addr,
 {
 	u8 *ies = NULL, *wpa_ie = NULL, *pos;
 	size_t ies_len = 0;
-	struct station_info sinfo;
+	struct station_info *sinfo;
 
 	ath6kl_dbg(ATH6KL_DBG_TRC, "new station %pM aid=%d\n", mac_addr, aid);
 
@@ -482,16 +482,20 @@ void ath6kl_connect_ap_mode_sta(struct ath6kl_vif *vif, u16 aid, u8 *mac_addr,
 			   keymgmt, ucipher, auth, apsd_info);
 
 	/* send event to application */
-	memset(&sinfo, 0, sizeof(sinfo));
+	sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL);
+	if (!sinfo)
+		return;
 
 	/* TODO: sinfo.generation */
 
-	sinfo.assoc_req_ies = ies;
-	sinfo.assoc_req_ies_len = ies_len;
+	sinfo->assoc_req_ies = ies;
+	sinfo->assoc_req_ies_len = ies_len;
 
-	cfg80211_new_sta(vif->ndev, mac_addr, &sinfo, GFP_KERNEL);
+	cfg80211_new_sta(vif->ndev, mac_addr, sinfo, GFP_KERNEL);
 
 	netif_wake_queue(vif->ndev);
+
+	kfree(sinfo);
 }
 
 void disconnect_timer_handler(struct timer_list *t)
diff --git a/drivers/net/wireless/ath/wil6210/debugfs.c b/drivers/net/wireless/ath/wil6210/debugfs.c
index 8c90b3111f0b..11e46e44381e 100644
--- a/drivers/net/wireless/ath/wil6210/debugfs.c
+++ b/drivers/net/wireless/ath/wil6210/debugfs.c
@@ -1200,8 +1200,12 @@ static const struct file_operations fops_freq = {
 static int wil_link_debugfs_show(struct seq_file *s, void *data)
 {
 	struct wil6210_priv *wil = s->private;
-	struct station_info sinfo;
-	int i, rc;
+	struct station_info *sinfo;
+	int i, rc = 0;
+
+	sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL);
+	if (!sinfo)
+		return -ENOMEM;
 
 	for (i = 0; i < ARRAY_SIZE(wil->sta); i++) {
 		struct wil_sta_info *p = &wil->sta[i];
@@ -1229,19 +1233,21 @@ static int wil_link_debugfs_show(struct seq_file *s, void *data)
 
 		vif = (mid < wil->max_vifs) ? wil->vifs[mid] : NULL;
 		if (vif) {
-			rc = wil_cid_fill_sinfo(vif, i, &sinfo);
+			rc = wil_cid_fill_sinfo(vif, i, sinfo);
 			if (rc)
-				return rc;
+				goto out;
 
-			seq_printf(s, "  Tx_mcs = %d\n", sinfo.txrate.mcs);
-			seq_printf(s, "  Rx_mcs = %d\n", sinfo.rxrate.mcs);
-			seq_printf(s, "  SQ     = %d\n", sinfo.signal);
+			seq_printf(s, "  Tx_mcs = %d\n", sinfo->txrate.mcs);
+			seq_printf(s, "  Rx_mcs = %d\n", sinfo->rxrate.mcs);
+			seq_printf(s, "  SQ     = %d\n", sinfo->signal);
 		} else {
 			seq_puts(s, "  INVALID MID\n");
 		}
 	}
 
-	return 0;
+out:
+	kfree(sinfo);
+	return rc;
 }
 
 static int wil_link_seq_open(struct inode *inode, struct file *file)
diff --git a/drivers/net/wireless/ath/wil6210/wmi.c b/drivers/net/wireless/ath/wil6210/wmi.c
index a3dda9a97c1f..90de9a916241 100644
--- a/drivers/net/wireless/ath/wil6210/wmi.c
+++ b/drivers/net/wireless/ath/wil6210/wmi.c
@@ -824,7 +824,7 @@ static void wmi_evt_connect(struct wil6210_vif *vif, int id, void *d, int len)
 	struct wireless_dev *wdev = vif_to_wdev(vif);
 	struct wmi_connect_event *evt = d;
 	int ch; /* channel number */
-	struct station_info sinfo;
+	struct station_info *sinfo;
 	u8 *assoc_req_ie, *assoc_resp_ie;
 	size_t assoc_req_ielen, assoc_resp_ielen;
 	/* capinfo(u16) + listen_interval(u16) + IEs */
@@ -940,6 +940,7 @@ static void wmi_evt_connect(struct wil6210_vif *vif, int id, void *d, int len)
 		vif->bss = NULL;
 	} else if ((wdev->iftype == NL80211_IFTYPE_AP) ||
 		   (wdev->iftype == NL80211_IFTYPE_P2P_GO)) {
+
 		if (rc) {
 			if (disable_ap_sme)
 				/* notify new_sta has failed */
@@ -947,16 +948,22 @@ static void wmi_evt_connect(struct wil6210_vif *vif, int id, void *d, int len)
 			goto out;
 		}
 
-		memset(&sinfo, 0, sizeof(sinfo));
+		sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL);
+		if (!sinfo) {
+			rc = -ENOMEM;
+			goto out;
+		}
 
-		sinfo.generation = wil->sinfo_gen++;
+		sinfo->generation = wil->sinfo_gen++;
 
 		if (assoc_req_ie) {
-			sinfo.assoc_req_ies = assoc_req_ie;
-			sinfo.assoc_req_ies_len = assoc_req_ielen;
+			sinfo->assoc_req_ies = assoc_req_ie;
+			sinfo->assoc_req_ies_len = assoc_req_ielen;
 		}
 
-		cfg80211_new_sta(ndev, evt->bssid, &sinfo, GFP_KERNEL);
+		cfg80211_new_sta(ndev, evt->bssid, sinfo, GFP_KERNEL);
+
+		kfree(sinfo);
 	} else {
 		wil_err(wil, "unhandled iftype %d for CID %d\n", wdev->iftype,
 			evt->cid);
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
index ff6b3514c501..79a124d1aa23 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
@@ -5498,7 +5498,7 @@ brcmf_notify_connect_status_ap(struct brcmf_cfg80211_info *cfg,
 	static int generation;
 	u32 event = e->event_code;
 	u32 reason = e->reason;
-	struct station_info sinfo;
+	struct station_info *sinfo;
 
 	brcmf_dbg(CONN, "event %s (%u), reason %d\n",
 		  brcmf_fweh_event_name(event), event, reason);
@@ -5511,16 +5511,22 @@ brcmf_notify_connect_status_ap(struct brcmf_cfg80211_info *cfg,
 
 	if (((event == BRCMF_E_ASSOC_IND) || (event == BRCMF_E_REASSOC_IND)) &&
 	    (reason == BRCMF_E_STATUS_SUCCESS)) {
-		memset(&sinfo, 0, sizeof(sinfo));
 		if (!data) {
 			brcmf_err("No IEs present in ASSOC/REASSOC_IND");
 			return -EINVAL;
 		}
-		sinfo.assoc_req_ies = data;
-		sinfo.assoc_req_ies_len = e->datalen;
+
+		sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL);
+		if (!sinfo)
+			return -ENOMEM;
+
+		sinfo->assoc_req_ies = data;
+		sinfo->assoc_req_ies_len = e->datalen;
 		generation++;
-		sinfo.generation = generation;
-		cfg80211_new_sta(ndev, e->addr, &sinfo, GFP_KERNEL);
+		sinfo->generation = generation;
+		cfg80211_new_sta(ndev, e->addr, sinfo, GFP_KERNEL);
+
+		kfree(sinfo);
 	} else if ((event == BRCMF_E_DISASSOC_IND) ||
 		   (event == BRCMF_E_DEAUTH_IND) ||
 		   (event == BRCMF_E_DEAUTH)) {
diff --git a/drivers/net/wireless/marvell/mwifiex/uap_event.c b/drivers/net/wireless/marvell/mwifiex/uap_event.c
index e8c8728db15a..e86217a6b9ca 100644
--- a/drivers/net/wireless/marvell/mwifiex/uap_event.c
+++ b/drivers/net/wireless/marvell/mwifiex/uap_event.c
@@ -108,7 +108,7 @@ int mwifiex_process_uap_event(struct mwifiex_private *priv)
 	struct mwifiex_adapter *adapter = priv->adapter;
 	int len, i;
 	u32 eventcause = adapter->event_cause;
-	struct station_info sinfo;
+	struct station_info *sinfo;
 	struct mwifiex_assoc_event *event;
 	struct mwifiex_sta_node *node;
 	u8 *deauth_mac;
@@ -117,7 +117,10 @@ int mwifiex_process_uap_event(struct mwifiex_private *priv)
 
 	switch (eventcause) {
 	case EVENT_UAP_STA_ASSOC:
-		memset(&sinfo, 0, sizeof(sinfo));
+		sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL);
+		if (!sinfo)
+			return -ENOMEM;
+
 		event = (struct mwifiex_assoc_event *)
 			(adapter->event_body + MWIFIEX_UAP_EVENT_EXTRA_HEADER);
 		if (le16_to_cpu(event->type) == TLV_TYPE_UAP_MGMT_FRAME) {
@@ -132,28 +135,31 @@ int mwifiex_process_uap_event(struct mwifiex_private *priv)
 				len = ETH_ALEN;
 
 			if (len != -1) {
-				sinfo.assoc_req_ies = &event->data[len];
-				len = (u8 *)sinfo.assoc_req_ies -
+				sinfo->assoc_req_ies = &event->data[len];
+				len = (u8 *)sinfo->assoc_req_ies -
 				      (u8 *)&event->frame_control;
-				sinfo.assoc_req_ies_len =
+				sinfo->assoc_req_ies_len =
 					le16_to_cpu(event->len) - (u16)len;
 			}
 		}
-		cfg80211_new_sta(priv->netdev, event->sta_addr, &sinfo,
+		cfg80211_new_sta(priv->netdev, event->sta_addr, sinfo,
 				 GFP_KERNEL);
 
 		node = mwifiex_add_sta_entry(priv, event->sta_addr);
 		if (!node) {
 			mwifiex_dbg(adapter, ERROR,
 				    "could not create station entry!\n");
+			kfree(sinfo);
 			return -1;
 		}
 
-		if (!priv->ap_11n_enabled)
+		if (!priv->ap_11n_enabled) {
+			kfree(sinfo);
 			break;
+		}
 
-		mwifiex_set_sta_ht_cap(priv, sinfo.assoc_req_ies,
-				       sinfo.assoc_req_ies_len, node);
+		mwifiex_set_sta_ht_cap(priv, sinfo->assoc_req_ies,
+				       sinfo->assoc_req_ies_len, node);
 
 		for (i = 0; i < MAX_NUM_TID; i++) {
 			if (node->is_11n_enabled)
@@ -163,6 +169,7 @@ int mwifiex_process_uap_event(struct mwifiex_private *priv)
 				node->ampdu_sta[i] = BA_STREAM_NOT_ALLOWED;
 		}
 		memset(node->rx_seq, 0xff, sizeof(node->rx_seq));
+		kfree(sinfo);
 		break;
 	case EVENT_UAP_STA_DEAUTH:
 		deauth_mac = adapter->event_body +
diff --git a/drivers/net/wireless/quantenna/qtnfmac/event.c b/drivers/net/wireless/quantenna/qtnfmac/event.c
index cb2a6c12f870..16617c44f81b 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/event.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/event.c
@@ -34,12 +34,13 @@ qtnf_event_handle_sta_assoc(struct qtnf_wmac *mac, struct qtnf_vif *vif,
 {
 	const u8 *sta_addr;
 	u16 frame_control;
-	struct station_info sinfo = { 0 };
+	struct station_info *sinfo;
 	size_t payload_len;
 	u16 tlv_type;
 	u16 tlv_value_len;
 	size_t tlv_full_len;
 	const struct qlink_tlv_hdr *tlv;
+	int ret = 0;
 
 	if (unlikely(len < sizeof(*sta_assoc))) {
 		pr_err("VIF%u.%u: payload is too short (%u < %zu)\n",
@@ -53,6 +54,10 @@ qtnf_event_handle_sta_assoc(struct qtnf_wmac *mac, struct qtnf_vif *vif,
 		return -EPROTO;
 	}
 
+	sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL);
+	if (!sinfo)
+		return -ENOMEM;
+
 	sta_addr = sta_assoc->sta_addr;
 	frame_control = le16_to_cpu(sta_assoc->frame_control);
 
@@ -61,9 +66,9 @@ qtnf_event_handle_sta_assoc(struct qtnf_wmac *mac, struct qtnf_vif *vif,
 
 	qtnf_sta_list_add(vif, sta_addr);
 
-	sinfo.assoc_req_ies = NULL;
-	sinfo.assoc_req_ies_len = 0;
-	sinfo.generation = vif->generation;
+	sinfo->assoc_req_ies = NULL;
+	sinfo->assoc_req_ies_len = 0;
+	sinfo->generation = vif->generation;
 
 	payload_len = len - sizeof(*sta_assoc);
 	tlv = (const struct qlink_tlv_hdr *)sta_assoc->ies;
@@ -73,23 +78,27 @@ qtnf_event_handle_sta_assoc(struct qtnf_wmac *mac, struct qtnf_vif *vif,
 		tlv_value_len = le16_to_cpu(tlv->len);
 		tlv_full_len = tlv_value_len + sizeof(struct qlink_tlv_hdr);
 
-		if (tlv_full_len > payload_len)
-			return -EINVAL;
+		if (tlv_full_len > payload_len) {
+			ret = -EINVAL;
+			goto out;
+		}
 
 		if (tlv_type == QTN_TLV_ID_IE_SET) {
 			const struct qlink_tlv_ie_set *ie_set;
 			unsigned int ie_len;
 
-			if (payload_len < sizeof(*ie_set))
-				return -EINVAL;
+			if (payload_len < sizeof(*ie_set)) {
+				ret = -EINVAL;
+				goto out;
+			}
 
 			ie_set = (const struct qlink_tlv_ie_set *)tlv;
 			ie_len = tlv_value_len -
 				(sizeof(*ie_set) - sizeof(ie_set->hdr));
 
 			if (ie_set->type == QLINK_IE_SET_ASSOC_REQ && ie_len) {
-				sinfo.assoc_req_ies = ie_set->ie_data;
-				sinfo.assoc_req_ies_len = ie_len;
+				sinfo->assoc_req_ies = ie_set->ie_data;
+				sinfo->assoc_req_ies_len = ie_len;
 			}
 		}
 
@@ -97,13 +106,17 @@ qtnf_event_handle_sta_assoc(struct qtnf_wmac *mac, struct qtnf_vif *vif,
 		tlv = (struct qlink_tlv_hdr *)(tlv->val + tlv_value_len);
 	}
 
-	if (payload_len)
-		return -EINVAL;
+	if (payload_len) {
+		ret = -EINVAL;
+		goto out;
+	}
 
-	cfg80211_new_sta(vif->netdev, sta_assoc->sta_addr, &sinfo,
+	cfg80211_new_sta(vif->netdev, sta_assoc->sta_addr, sinfo,
 			 GFP_KERNEL);
 
-	return 0;
+out:
+	kfree(sinfo);
+	return ret;
 }
 
 static int
-- 
cgit v1.2.3-59-g8ed1b


From 6823dc0d91e5d238ca14a252228a5121d41eb517 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Thu, 10 May 2018 16:06:09 +0200
Subject: mt76x2: add a polling delay in mt76x2_mac_stop routine

Add a usleep_range in mt76x2_mac_stop routine in order to add
a polling delay checking values of MT_MAC_STATUS and IBI_R12 registers

Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Acked-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/mt76x2_init.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_init.c b/drivers/net/wireless/mediatek/mt76/mt76x2_init.c
index d21e4a7c1bb9..dd4c1127797e 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_init.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_init.c
@@ -370,12 +370,12 @@ void mt76x2_mac_stop(struct mt76x2_dev *dev, bool force)
 
 	/* Wait for MAC to become idle */
 	for (i = 0; i < 300; i++) {
-		if (mt76_rr(dev, MT_MAC_STATUS) &
-		    (MT_MAC_STATUS_RX | MT_MAC_STATUS_TX))
-			continue;
-
-		if (mt76_rr(dev, MT_BBP(IBI, 12)))
+		if ((mt76_rr(dev, MT_MAC_STATUS) &
+		     (MT_MAC_STATUS_RX | MT_MAC_STATUS_TX)) ||
+		    mt76_rr(dev, MT_BBP(IBI, 12))) {
+			usleep_range(10, 20);
 			continue;
+		}
 
 		stopped = true;
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From b41c393676696c20456d5fd8cbe71453560ee06e Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Thu, 10 May 2018 15:32:17 +0100
Subject: rsi: fix spelling mistake: "thead" -> "thread"

Trivial fix to spelling mistake in rsi_dbg debug message text

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/rsi/rsi_91x_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/rsi/rsi_91x_core.c b/drivers/net/wireless/rsi/rsi_91x_core.c
index 1f1b97220d43..3644d7d99463 100644
--- a/drivers/net/wireless/rsi/rsi_91x_core.c
+++ b/drivers/net/wireless/rsi/rsi_91x_core.c
@@ -485,7 +485,7 @@ void rsi_core_xmit(struct rsi_common *common, struct sk_buff *skb)
 	}
 
 	rsi_core_queue_pkt(common, skb);
-	rsi_dbg(DATA_TX_ZONE, "%s: ===> Scheduling TX thead <===\n", __func__);
+	rsi_dbg(DATA_TX_ZONE, "%s: ===> Scheduling TX thread <===\n", __func__);
 	rsi_set_event(&common->tx_thread.event);
 
 	return;
-- 
cgit v1.2.3-59-g8ed1b


From cfb353c0dc058bc1619cc226d3cbbda1f360bdd3 Mon Sep 17 00:00:00 2001
From: Yu Wang <yyuwang@codeaurora.org>
Date: Thu, 3 May 2018 14:01:25 +0800
Subject: ath10k: add quiet mode support for QCA6174/QCA9377

To enable thermal throttling for QCA6174 and QCA9377,
implement gen_pdev_set_quiet_mode for them.
With this change, quiet period for QCA6174/QCA9377 can
be also set/get via debugfs.
Usage:
To set quiet period:
 echo <period> > /sys/kernel/debug/ieee80211/phyX/ath10k/quiet_period
To get the current quiet period:
 cat /sys/kernel/debug/ieee80211/phyX/ath10k/quiet_period

This change has been verified with
QCA6174 hw3.2(fw: WLAN_RM.4.4.1-00102-QCARMSWP-1).

Signed-off-by: Yu Wang <yyuwang@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/wmi-tlv.c | 33 ++++++++++++++++++++++++++++++-
 drivers/net/wireless/ath/ath10k/wmi-tlv.h | 16 +++++++++++++++
 2 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath10k/wmi-tlv.c b/drivers/net/wireless/ath/ath10k/wmi-tlv.c
index 01f4eb201330..2e34a1fc5ba6 100644
--- a/drivers/net/wireless/ath/ath10k/wmi-tlv.c
+++ b/drivers/net/wireless/ath/ath10k/wmi-tlv.c
@@ -3135,6 +3135,37 @@ ath10k_wmi_tlv_op_gen_tdls_peer_update(struct ath10k *ar,
 	return skb;
 }
 
+static struct sk_buff *
+ath10k_wmi_tlv_op_gen_pdev_set_quiet_mode(struct ath10k *ar, u32 period,
+					  u32 duration, u32 next_offset,
+					  u32 enabled)
+{
+	struct wmi_tlv_set_quiet_cmd *cmd;
+	struct wmi_tlv *tlv;
+	struct sk_buff *skb;
+
+	skb = ath10k_wmi_alloc_skb(ar, sizeof(*tlv) + sizeof(*cmd));
+	if (!skb)
+		return ERR_PTR(-ENOMEM);
+
+	tlv = (void *)skb->data;
+	tlv->tag = __cpu_to_le16(WMI_TLV_TAG_STRUCT_PDEV_SET_QUIET_CMD);
+	tlv->len = __cpu_to_le16(sizeof(*cmd));
+	cmd = (void *)tlv->value;
+
+	/* vdev_id is not in use, set to 0 */
+	cmd->vdev_id = __cpu_to_le32(0);
+	cmd->period = __cpu_to_le32(period);
+	cmd->duration = __cpu_to_le32(duration);
+	cmd->next_start = __cpu_to_le32(next_offset);
+	cmd->enabled = __cpu_to_le32(enabled);
+
+	ath10k_dbg(ar, ATH10K_DBG_WMI,
+		   "wmi tlv quiet param: period %u duration %u enabled %d\n",
+		   period, duration, enabled);
+	return skb;
+}
+
 static struct sk_buff *
 ath10k_wmi_tlv_op_gen_wow_enable(struct ath10k *ar)
 {
@@ -3854,7 +3885,7 @@ static const struct wmi_ops wmi_tlv_ops = {
 	.gen_dbglog_cfg = ath10k_wmi_tlv_op_gen_dbglog_cfg,
 	.gen_pktlog_enable = ath10k_wmi_tlv_op_gen_pktlog_enable,
 	.gen_pktlog_disable = ath10k_wmi_tlv_op_gen_pktlog_disable,
-	/* .gen_pdev_set_quiet_mode not implemented */
+	.gen_pdev_set_quiet_mode = ath10k_wmi_tlv_op_gen_pdev_set_quiet_mode,
 	.gen_pdev_get_temperature = ath10k_wmi_tlv_op_gen_pdev_get_temperature,
 	/* .gen_addba_clear_resp not implemented */
 	/* .gen_addba_send not implemented */
diff --git a/drivers/net/wireless/ath/ath10k/wmi-tlv.h b/drivers/net/wireless/ath/ath10k/wmi-tlv.h
index 954c50bb3f66..3e1e340cd834 100644
--- a/drivers/net/wireless/ath/ath10k/wmi-tlv.h
+++ b/drivers/net/wireless/ath/ath10k/wmi-tlv.h
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2005-2011 Atheros Communications Inc.
  * Copyright (c) 2011-2017 Qualcomm Atheros, Inc.
+ * Copyright (c) 2018, The Linux Foundation. All rights reserved.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -1952,6 +1953,21 @@ struct wmi_tlv_wow_add_del_event_cmd {
 	__le32 event_bitmap;
 } __packed;
 
+/* Command to set/unset chip in quiet mode */
+struct wmi_tlv_set_quiet_cmd {
+	__le32 vdev_id;
+
+	/* in TUs */
+	__le32 period;
+
+	/* in TUs */
+	__le32 duration;
+
+	/* offset in TUs */
+	__le32 next_start;
+	__le32 enabled;
+} __packed;
+
 struct wmi_tlv_wow_enable_cmd {
 	__le32 enable;
 } __packed;
-- 
cgit v1.2.3-59-g8ed1b


From 469bd5eab6224c966253e62c50af1084a7cdad64 Mon Sep 17 00:00:00 2001
From: Fabio Estevam <fabio.estevam@nxp.com>
Date: Sat, 5 May 2018 00:04:08 -0300
Subject: ath10k: snoc: Remove owner assignment from platform_driver

Structure platform_driver does not need to set the owner field, as this
will be populated by the driver core.

Generated by scripts/coccinelle/api/platform_no_drv_owner.cocci.

Signed-off-by: Fabio Estevam <fabio.estevam@nxp.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/snoc.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath10k/snoc.c b/drivers/net/wireless/ath/ath10k/snoc.c
index 47a4d2a5bd4c..a3a7042fe13a 100644
--- a/drivers/net/wireless/ath/ath10k/snoc.c
+++ b/drivers/net/wireless/ath/ath10k/snoc.c
@@ -1385,7 +1385,6 @@ static struct platform_driver ath10k_snoc_driver = {
 		.remove = ath10k_snoc_remove,
 		.driver = {
 			.name   = "ath10k_snoc",
-			.owner = THIS_MODULE,
 			.of_match_table = ath10k_snoc_dt_match,
 		},
 };
-- 
cgit v1.2.3-59-g8ed1b


From e3148cc5fecf60dcbd07e5c9cae987976d25cb17 Mon Sep 17 00:00:00 2001
From: Erik Stromdahl <erik.stromdahl@gmail.com>
Date: Sun, 6 May 2018 15:25:00 +0200
Subject: ath10k: fix return value check in wake_tx_q op

ath10k_mac_tx_push_txq returns either a postive integer (length) on
success or a negative error code on error.

The "if (ret) break;" statement will thus always break out of the loop
immediately after ath10k_mac_tx_push_txq has returned (making the loop
pointless).

A side effect of this fix is that we will iterate the queue until
ath10k_mac_tx_push_txq returns -ENOENT. This will make sure the queue is
not added back to ar->txqs when it is empty. This could potentially
improve performance somewhat (I have seen a small improvement with SDIO
devices).

Signed-off-by: Erik Stromdahl <erik.stromdahl@gmail.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/mac.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index 3d7119ad7c7a..487a7a7380fd 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -4290,7 +4290,7 @@ static void ath10k_mac_op_wake_tx_queue(struct ieee80211_hw *hw,
 
 	while (ath10k_mac_tx_can_push(hw, f_txq) && max--) {
 		ret = ath10k_mac_tx_push_txq(hw, f_txq);
-		if (ret)
+		if (ret < 0)
 			break;
 	}
 	if (ret != -ENOENT)
-- 
cgit v1.2.3-59-g8ed1b


From 3e2740cda22473cca2abd117baeabf20d9024fba Mon Sep 17 00:00:00 2001
From: Kenneth Lu <kuohsianglu@gmail.com>
Date: Mon, 7 May 2018 17:39:48 +0800
Subject: ath10k: remove variables which set but not used

Variable buf_len and num_vdev_stats are being assigned but never read.
These are redundant and can be remove.

Signed-off-by: Kenneth Lu <kuohsianglu@gmail.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/wmi.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c
index bb8e9e259a56..46fb96ee5852 100644
--- a/drivers/net/wireless/ath/ath10k/wmi.c
+++ b/drivers/net/wireless/ath/ath10k/wmi.c
@@ -2318,7 +2318,6 @@ int ath10k_wmi_event_mgmt_rx(struct ath10k *ar, struct sk_buff *skb)
 	u32 phy_mode;
 	u32 snr;
 	u32 rate;
-	u32 buf_len;
 	u16 fc;
 	int ret;
 
@@ -2330,7 +2329,6 @@ int ath10k_wmi_event_mgmt_rx(struct ath10k *ar, struct sk_buff *skb)
 	}
 
 	channel = __le32_to_cpu(arg.channel);
-	buf_len = __le32_to_cpu(arg.buf_len);
 	rx_status = __le32_to_cpu(arg.status);
 	snr = __le32_to_cpu(arg.snr);
 	phy_mode = __le32_to_cpu(arg.phy_mode);
@@ -2740,14 +2738,13 @@ static int ath10k_wmi_main_op_pull_fw_stats(struct ath10k *ar,
 					    struct ath10k_fw_stats *stats)
 {
 	const struct wmi_stats_event *ev = (void *)skb->data;
-	u32 num_pdev_stats, num_vdev_stats, num_peer_stats;
+	u32 num_pdev_stats, num_peer_stats;
 	int i;
 
 	if (!skb_pull(skb, sizeof(*ev)))
 		return -EPROTO;
 
 	num_pdev_stats = __le32_to_cpu(ev->num_pdev_stats);
-	num_vdev_stats = __le32_to_cpu(ev->num_vdev_stats);
 	num_peer_stats = __le32_to_cpu(ev->num_peer_stats);
 
 	for (i = 0; i < num_pdev_stats; i++) {
@@ -2795,14 +2792,13 @@ static int ath10k_wmi_10x_op_pull_fw_stats(struct ath10k *ar,
 					   struct ath10k_fw_stats *stats)
 {
 	const struct wmi_stats_event *ev = (void *)skb->data;
-	u32 num_pdev_stats, num_vdev_stats, num_peer_stats;
+	u32 num_pdev_stats, num_peer_stats;
 	int i;
 
 	if (!skb_pull(skb, sizeof(*ev)))
 		return -EPROTO;
 
 	num_pdev_stats = __le32_to_cpu(ev->num_pdev_stats);
-	num_vdev_stats = __le32_to_cpu(ev->num_vdev_stats);
 	num_peer_stats = __le32_to_cpu(ev->num_peer_stats);
 
 	for (i = 0; i < num_pdev_stats; i++) {
@@ -2856,7 +2852,6 @@ static int ath10k_wmi_10_2_op_pull_fw_stats(struct ath10k *ar,
 	const struct wmi_10_2_stats_event *ev = (void *)skb->data;
 	u32 num_pdev_stats;
 	u32 num_pdev_ext_stats;
-	u32 num_vdev_stats;
 	u32 num_peer_stats;
 	int i;
 
@@ -2865,7 +2860,6 @@ static int ath10k_wmi_10_2_op_pull_fw_stats(struct ath10k *ar,
 
 	num_pdev_stats = __le32_to_cpu(ev->num_pdev_stats);
 	num_pdev_ext_stats = __le32_to_cpu(ev->num_pdev_ext_stats);
-	num_vdev_stats = __le32_to_cpu(ev->num_vdev_stats);
 	num_peer_stats = __le32_to_cpu(ev->num_peer_stats);
 
 	for (i = 0; i < num_pdev_stats; i++) {
@@ -2935,7 +2929,6 @@ static int ath10k_wmi_10_2_4_op_pull_fw_stats(struct ath10k *ar,
 	const struct wmi_10_2_stats_event *ev = (void *)skb->data;
 	u32 num_pdev_stats;
 	u32 num_pdev_ext_stats;
-	u32 num_vdev_stats;
 	u32 num_peer_stats;
 	int i;
 
@@ -2944,7 +2937,6 @@ static int ath10k_wmi_10_2_4_op_pull_fw_stats(struct ath10k *ar,
 
 	num_pdev_stats = __le32_to_cpu(ev->num_pdev_stats);
 	num_pdev_ext_stats = __le32_to_cpu(ev->num_pdev_ext_stats);
-	num_vdev_stats = __le32_to_cpu(ev->num_vdev_stats);
 	num_peer_stats = __le32_to_cpu(ev->num_peer_stats);
 
 	for (i = 0; i < num_pdev_stats; i++) {
-- 
cgit v1.2.3-59-g8ed1b


From 0be928850dac4dc3f9f4e0a9966b6c809557a494 Mon Sep 17 00:00:00 2001
From: Marcus Folkesson <marcus.folkesson@gmail.com>
Date: Wed, 9 May 2018 09:56:33 +0200
Subject: ath10k: hw: make consistent usage of ATH10K_FW_DIR in paths

For some reason not all entries used ATH10K_FW_DIR so fix that. No functional
changes.

Signed-off-by: Marcus Folkesson <marcus.folkesson@gmail.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/hw.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/hw.h b/drivers/net/wireless/ath/ath10k/hw.h
index b8bdabe73073..23467e9fefeb 100644
--- a/drivers/net/wireless/ath/ath10k/hw.h
+++ b/drivers/net/wireless/ath/ath10k/hw.h
@@ -85,11 +85,11 @@ enum qca9377_chip_id_rev {
 	QCA9377_HW_1_1_CHIP_ID_REV = 0x1,
 };
 
-#define QCA6174_HW_2_1_FW_DIR		"ath10k/QCA6174/hw2.1"
+#define QCA6174_HW_2_1_FW_DIR		ATH10K_FW_DIR "/QCA6174/hw2.1"
 #define QCA6174_HW_2_1_BOARD_DATA_FILE	"board.bin"
 #define QCA6174_HW_2_1_PATCH_LOAD_ADDR	0x1234
 
-#define QCA6174_HW_3_0_FW_DIR		"ath10k/QCA6174/hw3.0"
+#define QCA6174_HW_3_0_FW_DIR		ATH10K_FW_DIR "/QCA6174/hw3.0"
 #define QCA6174_HW_3_0_BOARD_DATA_FILE	"board.bin"
 #define QCA6174_HW_3_0_PATCH_LOAD_ADDR	0x1234
 
-- 
cgit v1.2.3-59-g8ed1b


From c3f7f31efe39209722a6fd19d7b7a4cceb7ca75c Mon Sep 17 00:00:00 2001
From: Govind Singh <govinds@codeaurora.org>
Date: Wed, 9 May 2018 15:03:21 +0530
Subject: ath10k: replace bit shifts with the BIT() macro for rx desc bits

Use the BIT() macro from 'linux/bitops.h' to define the rx desc
bit flags to have consistency with new definitions.

Signed-off-by: Govind Singh <govinds@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/rx_desc.h | 136 +++++++++++++++---------------
 1 file changed, 69 insertions(+), 67 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/rx_desc.h b/drivers/net/wireless/ath/ath10k/rx_desc.h
index 545deb6d7af1..ea4075d456fa 100644
--- a/drivers/net/wireless/ath/ath10k/rx_desc.h
+++ b/drivers/net/wireless/ath/ath10k/rx_desc.h
@@ -18,39 +18,41 @@
 #ifndef _RX_DESC_H_
 #define _RX_DESC_H_
 
+#include <linux/bitops.h>
+
 enum rx_attention_flags {
-	RX_ATTENTION_FLAGS_FIRST_MPDU          = 1 << 0,
-	RX_ATTENTION_FLAGS_LAST_MPDU           = 1 << 1,
-	RX_ATTENTION_FLAGS_MCAST_BCAST         = 1 << 2,
-	RX_ATTENTION_FLAGS_PEER_IDX_INVALID    = 1 << 3,
-	RX_ATTENTION_FLAGS_PEER_IDX_TIMEOUT    = 1 << 4,
-	RX_ATTENTION_FLAGS_POWER_MGMT          = 1 << 5,
-	RX_ATTENTION_FLAGS_NON_QOS             = 1 << 6,
-	RX_ATTENTION_FLAGS_NULL_DATA           = 1 << 7,
-	RX_ATTENTION_FLAGS_MGMT_TYPE           = 1 << 8,
-	RX_ATTENTION_FLAGS_CTRL_TYPE           = 1 << 9,
-	RX_ATTENTION_FLAGS_MORE_DATA           = 1 << 10,
-	RX_ATTENTION_FLAGS_EOSP                = 1 << 11,
-	RX_ATTENTION_FLAGS_U_APSD_TRIGGER      = 1 << 12,
-	RX_ATTENTION_FLAGS_FRAGMENT            = 1 << 13,
-	RX_ATTENTION_FLAGS_ORDER               = 1 << 14,
-	RX_ATTENTION_FLAGS_CLASSIFICATION      = 1 << 15,
-	RX_ATTENTION_FLAGS_OVERFLOW_ERR        = 1 << 16,
-	RX_ATTENTION_FLAGS_MSDU_LENGTH_ERR     = 1 << 17,
-	RX_ATTENTION_FLAGS_TCP_UDP_CHKSUM_FAIL = 1 << 18,
-	RX_ATTENTION_FLAGS_IP_CHKSUM_FAIL      = 1 << 19,
-	RX_ATTENTION_FLAGS_SA_IDX_INVALID      = 1 << 20,
-	RX_ATTENTION_FLAGS_DA_IDX_INVALID      = 1 << 21,
-	RX_ATTENTION_FLAGS_SA_IDX_TIMEOUT      = 1 << 22,
-	RX_ATTENTION_FLAGS_DA_IDX_TIMEOUT      = 1 << 23,
-	RX_ATTENTION_FLAGS_ENCRYPT_REQUIRED    = 1 << 24,
-	RX_ATTENTION_FLAGS_DIRECTED            = 1 << 25,
-	RX_ATTENTION_FLAGS_BUFFER_FRAGMENT     = 1 << 26,
-	RX_ATTENTION_FLAGS_MPDU_LENGTH_ERR     = 1 << 27,
-	RX_ATTENTION_FLAGS_TKIP_MIC_ERR        = 1 << 28,
-	RX_ATTENTION_FLAGS_DECRYPT_ERR         = 1 << 29,
-	RX_ATTENTION_FLAGS_FCS_ERR             = 1 << 30,
-	RX_ATTENTION_FLAGS_MSDU_DONE           = 1 << 31,
+	RX_ATTENTION_FLAGS_FIRST_MPDU          = BIT(0),
+	RX_ATTENTION_FLAGS_LAST_MPDU           = BIT(1),
+	RX_ATTENTION_FLAGS_MCAST_BCAST         = BIT(2),
+	RX_ATTENTION_FLAGS_PEER_IDX_INVALID    = BIT(3),
+	RX_ATTENTION_FLAGS_PEER_IDX_TIMEOUT    = BIT(4),
+	RX_ATTENTION_FLAGS_POWER_MGMT          = BIT(5),
+	RX_ATTENTION_FLAGS_NON_QOS             = BIT(6),
+	RX_ATTENTION_FLAGS_NULL_DATA           = BIT(7),
+	RX_ATTENTION_FLAGS_MGMT_TYPE           = BIT(8),
+	RX_ATTENTION_FLAGS_CTRL_TYPE           = BIT(9),
+	RX_ATTENTION_FLAGS_MORE_DATA           = BIT(10),
+	RX_ATTENTION_FLAGS_EOSP                = BIT(11),
+	RX_ATTENTION_FLAGS_U_APSD_TRIGGER      = BIT(12),
+	RX_ATTENTION_FLAGS_FRAGMENT            = BIT(13),
+	RX_ATTENTION_FLAGS_ORDER               = BIT(14),
+	RX_ATTENTION_FLAGS_CLASSIFICATION      = BIT(15),
+	RX_ATTENTION_FLAGS_OVERFLOW_ERR        = BIT(16),
+	RX_ATTENTION_FLAGS_MSDU_LENGTH_ERR     = BIT(17),
+	RX_ATTENTION_FLAGS_TCP_UDP_CHKSUM_FAIL = BIT(18),
+	RX_ATTENTION_FLAGS_IP_CHKSUM_FAIL      = BIT(19),
+	RX_ATTENTION_FLAGS_SA_IDX_INVALID      = BIT(20),
+	RX_ATTENTION_FLAGS_DA_IDX_INVALID      = BIT(21),
+	RX_ATTENTION_FLAGS_SA_IDX_TIMEOUT      = BIT(22),
+	RX_ATTENTION_FLAGS_DA_IDX_TIMEOUT      = BIT(23),
+	RX_ATTENTION_FLAGS_ENCRYPT_REQUIRED    = BIT(24),
+	RX_ATTENTION_FLAGS_DIRECTED            = BIT(25),
+	RX_ATTENTION_FLAGS_BUFFER_FRAGMENT     = BIT(26),
+	RX_ATTENTION_FLAGS_MPDU_LENGTH_ERR     = BIT(27),
+	RX_ATTENTION_FLAGS_TKIP_MIC_ERR        = BIT(28),
+	RX_ATTENTION_FLAGS_DECRYPT_ERR         = BIT(29),
+	RX_ATTENTION_FLAGS_FCS_ERR             = BIT(30),
+	RX_ATTENTION_FLAGS_MSDU_DONE           = BIT(31),
 };
 
 struct rx_attention {
@@ -254,15 +256,15 @@ enum htt_rx_mpdu_encrypt_type {
 #define RX_MPDU_START_INFO0_SEQ_NUM_LSB       16
 #define RX_MPDU_START_INFO0_ENCRYPT_TYPE_MASK 0xf0000000
 #define RX_MPDU_START_INFO0_ENCRYPT_TYPE_LSB  28
-#define RX_MPDU_START_INFO0_FROM_DS           (1 << 11)
-#define RX_MPDU_START_INFO0_TO_DS             (1 << 12)
-#define RX_MPDU_START_INFO0_ENCRYPTED         (1 << 13)
-#define RX_MPDU_START_INFO0_RETRY             (1 << 14)
-#define RX_MPDU_START_INFO0_TXBF_H_INFO       (1 << 15)
+#define RX_MPDU_START_INFO0_FROM_DS           BIT(11)
+#define RX_MPDU_START_INFO0_TO_DS             BIT(12)
+#define RX_MPDU_START_INFO0_ENCRYPTED         BIT(13)
+#define RX_MPDU_START_INFO0_RETRY             BIT(14)
+#define RX_MPDU_START_INFO0_TXBF_H_INFO       BIT(15)
 
 #define RX_MPDU_START_INFO1_TID_MASK 0xf0000000
 #define RX_MPDU_START_INFO1_TID_LSB  28
-#define RX_MPDU_START_INFO1_DIRECTED (1 << 16)
+#define RX_MPDU_START_INFO1_DIRECTED BIT(16)
 
 struct rx_mpdu_start {
 	__le32 info0;
@@ -357,13 +359,13 @@ struct rx_mpdu_start {
 #define RX_MPDU_END_INFO0_RESERVED_0_LSB      0
 #define RX_MPDU_END_INFO0_POST_DELIM_CNT_MASK 0x0fff0000
 #define RX_MPDU_END_INFO0_POST_DELIM_CNT_LSB  16
-#define RX_MPDU_END_INFO0_OVERFLOW_ERR        (1 << 13)
-#define RX_MPDU_END_INFO0_LAST_MPDU           (1 << 14)
-#define RX_MPDU_END_INFO0_POST_DELIM_ERR      (1 << 15)
-#define RX_MPDU_END_INFO0_MPDU_LENGTH_ERR     (1 << 28)
-#define RX_MPDU_END_INFO0_TKIP_MIC_ERR        (1 << 29)
-#define RX_MPDU_END_INFO0_DECRYPT_ERR         (1 << 30)
-#define RX_MPDU_END_INFO0_FCS_ERR             (1 << 31)
+#define RX_MPDU_END_INFO0_OVERFLOW_ERR        BIT(13)
+#define RX_MPDU_END_INFO0_LAST_MPDU           BIT(14)
+#define RX_MPDU_END_INFO0_POST_DELIM_ERR      BIT(15)
+#define RX_MPDU_END_INFO0_MPDU_LENGTH_ERR     BIT(28)
+#define RX_MPDU_END_INFO0_TKIP_MIC_ERR        BIT(29)
+#define RX_MPDU_END_INFO0_DECRYPT_ERR         BIT(30)
+#define RX_MPDU_END_INFO0_FCS_ERR             BIT(31)
 
 struct rx_mpdu_end {
 	__le32 info0;
@@ -422,12 +424,12 @@ struct rx_mpdu_end {
 #define RX_MSDU_START_INFO1_DECAP_FORMAT_LSB    8
 #define RX_MSDU_START_INFO1_SA_IDX_MASK         0x07ff0000
 #define RX_MSDU_START_INFO1_SA_IDX_LSB          16
-#define RX_MSDU_START_INFO1_IPV4_PROTO          (1 << 10)
-#define RX_MSDU_START_INFO1_IPV6_PROTO          (1 << 11)
-#define RX_MSDU_START_INFO1_TCP_PROTO           (1 << 12)
-#define RX_MSDU_START_INFO1_UDP_PROTO           (1 << 13)
-#define RX_MSDU_START_INFO1_IP_FRAG             (1 << 14)
-#define RX_MSDU_START_INFO1_TCP_ONLY_ACK        (1 << 15)
+#define RX_MSDU_START_INFO1_IPV4_PROTO          BIT(10)
+#define RX_MSDU_START_INFO1_IPV6_PROTO          BIT(11)
+#define RX_MSDU_START_INFO1_TCP_PROTO           BIT(12)
+#define RX_MSDU_START_INFO1_UDP_PROTO           BIT(13)
+#define RX_MSDU_START_INFO1_IP_FRAG             BIT(14)
+#define RX_MSDU_START_INFO1_TCP_ONLY_ACK        BIT(15)
 
 #define RX_MSDU_START_INFO2_DA_IDX_MASK         0x000007ff
 #define RX_MSDU_START_INFO2_DA_IDX_LSB          0
@@ -568,10 +570,10 @@ struct rx_msdu_start {
 
 #define RX_MSDU_END_INFO0_REPORTED_MPDU_LENGTH_MASK 0x00003fff
 #define RX_MSDU_END_INFO0_REPORTED_MPDU_LENGTH_LSB  0
-#define RX_MSDU_END_INFO0_FIRST_MSDU                (1 << 14)
-#define RX_MSDU_END_INFO0_LAST_MSDU                 (1 << 15)
-#define RX_MSDU_END_INFO0_PRE_DELIM_ERR             (1 << 30)
-#define RX_MSDU_END_INFO0_RESERVED_3B               (1 << 31)
+#define RX_MSDU_END_INFO0_FIRST_MSDU                BIT(14)
+#define RX_MSDU_END_INFO0_LAST_MSDU                 BIT(15)
+#define RX_MSDU_END_INFO0_PRE_DELIM_ERR             BIT(30)
+#define RX_MSDU_END_INFO0_RESERVED_3B               BIT(31)
 
 struct rx_msdu_end_common {
 	__le16 ip_hdr_cksum;
@@ -691,7 +693,7 @@ struct rx_msdu_end {
 #define HTT_RX_PPDU_START_PREAMBLE_VHT           0x0C
 #define HTT_RX_PPDU_START_PREAMBLE_VHT_WITH_TXBF 0x0D
 
-#define RX_PPDU_START_INFO0_IS_GREENFIELD (1 << 0)
+#define RX_PPDU_START_INFO0_IS_GREENFIELD BIT(0)
 
 #define RX_PPDU_START_INFO1_L_SIG_RATE_MASK    0x0000000f
 #define RX_PPDU_START_INFO1_L_SIG_RATE_LSB     0
@@ -701,15 +703,15 @@ struct rx_msdu_end {
 #define RX_PPDU_START_INFO1_L_SIG_TAIL_LSB     18
 #define RX_PPDU_START_INFO1_PREAMBLE_TYPE_MASK 0xff000000
 #define RX_PPDU_START_INFO1_PREAMBLE_TYPE_LSB  24
-#define RX_PPDU_START_INFO1_L_SIG_RATE_SELECT  (1 << 4)
-#define RX_PPDU_START_INFO1_L_SIG_PARITY       (1 << 17)
+#define RX_PPDU_START_INFO1_L_SIG_RATE_SELECT  BIT(4)
+#define RX_PPDU_START_INFO1_L_SIG_PARITY       BIT(17)
 
 #define RX_PPDU_START_INFO2_HT_SIG_VHT_SIG_A_1_MASK 0x00ffffff
 #define RX_PPDU_START_INFO2_HT_SIG_VHT_SIG_A_1_LSB  0
 
 #define RX_PPDU_START_INFO3_HT_SIG_VHT_SIG_A_2_MASK 0x00ffffff
 #define RX_PPDU_START_INFO3_HT_SIG_VHT_SIG_A_2_LSB  0
-#define RX_PPDU_START_INFO3_TXBF_H_INFO             (1 << 24)
+#define RX_PPDU_START_INFO3_TXBF_H_INFO             BIT(24)
 
 #define RX_PPDU_START_INFO4_VHT_SIG_B_MASK 0x1fffffff
 #define RX_PPDU_START_INFO4_VHT_SIG_B_LSB  0
@@ -898,14 +900,14 @@ struct rx_ppdu_start {
  *		Reserved: HW should fill with 0, FW should ignore.
  */
 
-#define RX_PPDU_END_FLAGS_PHY_ERR             (1 << 0)
-#define RX_PPDU_END_FLAGS_RX_LOCATION         (1 << 1)
-#define RX_PPDU_END_FLAGS_TXBF_H_INFO         (1 << 2)
+#define RX_PPDU_END_FLAGS_PHY_ERR             BIT(0)
+#define RX_PPDU_END_FLAGS_RX_LOCATION         BIT(1)
+#define RX_PPDU_END_FLAGS_TXBF_H_INFO         BIT(2)
 
 #define RX_PPDU_END_INFO0_RX_ANTENNA_MASK     0x00ffffff
 #define RX_PPDU_END_INFO0_RX_ANTENNA_LSB      0
-#define RX_PPDU_END_INFO0_FLAGS_TX_HT_VHT_ACK (1 << 24)
-#define RX_PPDU_END_INFO0_BB_CAPTURED_CHANNEL (1 << 25)
+#define RX_PPDU_END_INFO0_FLAGS_TX_HT_VHT_ACK BIT(24)
+#define RX_PPDU_END_INFO0_BB_CAPTURED_CHANNEL BIT(25)
 
 #define RX_PPDU_END_INFO1_PEER_IDX_MASK       0x1ffc
 #define RX_PPDU_END_INFO1_PEER_IDX_LSB        2
@@ -1265,9 +1267,9 @@ struct rx_ppdu_end {
  *		to 0.
  */
 
-#define FW_RX_DESC_INFO0_DISCARD  (1 << 0)
-#define FW_RX_DESC_INFO0_FORWARD  (1 << 1)
-#define FW_RX_DESC_INFO0_INSPECT  (1 << 5)
+#define FW_RX_DESC_INFO0_DISCARD  BIT(0)
+#define FW_RX_DESC_INFO0_FORWARD  BIT(1)
+#define FW_RX_DESC_INFO0_INSPECT  BIT(5)
 #define FW_RX_DESC_INFO0_EXT_MASK 0xC0
 #define FW_RX_DESC_INFO0_EXT_LSB  6
 
-- 
cgit v1.2.3-59-g8ed1b


From 777b4690fc4a39a753da97541de2d4fab3dc65b3 Mon Sep 17 00:00:00 2001
From: Alexei Avshalom Lazar <ailizaro@codeaurora.org>
Date: Wed, 9 May 2018 13:06:52 +0300
Subject: wil6210: disable tracing config option

Disable WIL6210_TRACING by default to avoid its performance overhead.

Signed-off-by: Alexei Avshalom Lazar <ailizaro@codeaurora.org>
Signed-off-by: Maya Erez <merez@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wil6210/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/wil6210/Kconfig b/drivers/net/wireless/ath/wil6210/Kconfig
index b448926b0c0f..3548e8d5e18e 100644
--- a/drivers/net/wireless/ath/wil6210/Kconfig
+++ b/drivers/net/wireless/ath/wil6210/Kconfig
@@ -33,7 +33,7 @@ config WIL6210_TRACING
 	bool "wil6210 tracing support"
 	depends on WIL6210
 	depends on EVENT_TRACING
-	default y
+	default n
 	---help---
 	  Say Y here to enable tracepoints for the wil6210 driver
 	  using the kernel tracing infrastructure.  Select this
-- 
cgit v1.2.3-59-g8ed1b


From 8a4fa21438e38ed2db8c01a282de3995a6c0d75f Mon Sep 17 00:00:00 2001
From: Ahmad Masri <amasri@codeaurora.org>
Date: Wed, 9 May 2018 13:06:53 +0300
Subject: wil6210: align to latest auto generated wmi.h

Align to latest version of the auto generated wmi file
describing the interface with FW

Signed-off-by: Ahmad Masri <amasri@codeaurora.org>
Signed-off-by: Maya Erez <merez@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wil6210/wmi.c |   6 +-
 drivers/net/wireless/ath/wil6210/wmi.h | 387 +++++++++++++++++++++++++++++++--
 2 files changed, 371 insertions(+), 22 deletions(-)

diff --git a/drivers/net/wireless/ath/wil6210/wmi.c b/drivers/net/wireless/ath/wil6210/wmi.c
index a3dda9a97c1f..73efa13bc742 100644
--- a/drivers/net/wireless/ath/wil6210/wmi.c
+++ b/drivers/net/wireless/ath/wil6210/wmi.c
@@ -1554,7 +1554,9 @@ int wmi_pcp_start(struct wil6210_vif *vif,
 		.pcp_max_assoc_sta = max_assoc_sta,
 		.hidden_ssid = hidden_ssid,
 		.is_go = is_go,
-		.disable_ap_sme = disable_ap_sme,
+		.ap_sme_offload_mode = disable_ap_sme ?
+				       WMI_AP_SME_OFFLOAD_PARTIAL :
+				       WMI_AP_SME_OFFLOAD_FULL,
 		.abft_len = wil->abft_len,
 	};
 	struct {
@@ -1574,7 +1576,7 @@ int wmi_pcp_start(struct wil6210_vif *vif,
 	}
 
 	if (disable_ap_sme &&
-	    !test_bit(WMI_FW_CAPABILITY_DISABLE_AP_SME,
+	    !test_bit(WMI_FW_CAPABILITY_AP_SME_OFFLOAD_PARTIAL,
 		      wil->fw_capabilities)) {
 		wil_err(wil, "disable_ap_sme not supported by FW\n");
 		return -EOPNOTSUPP;
diff --git a/drivers/net/wireless/ath/wil6210/wmi.h b/drivers/net/wireless/ath/wil6210/wmi.h
index d3e75f0ff245..dc503d903786 100644
--- a/drivers/net/wireless/ath/wil6210/wmi.h
+++ b/drivers/net/wireless/ath/wil6210/wmi.h
@@ -1,4 +1,5 @@
 /*
+ * Copyright (c) 2018, The Linux Foundation. All rights reserved.
  * Copyright (c) 2012-2017 Qualcomm Atheros, Inc.
  * Copyright (c) 2006-2012 Wilocity
  *
@@ -29,8 +30,6 @@
 #ifndef __WILOCITY_WMI_H__
 #define __WILOCITY_WMI_H__
 
-/* General */
-#define WMI_MAX_ASSOC_STA		(8)
 #define WMI_DEFAULT_ASSOC_STA		(1)
 #define WMI_MAC_LEN			(6)
 #define WMI_PROX_RANGE_NUM		(3)
@@ -41,6 +40,19 @@
 #define WMI_RF_ETYPE_LENGTH		(3)
 #define WMI_RF_RX2TX_LENGTH		(3)
 #define WMI_RF_ETYPE_VAL_PER_RANGE	(5)
+/* DTYPE configuration array size
+ * must always be kept equal to (WMI_RF_DTYPE_LENGTH+1)
+ */
+#define WMI_RF_DTYPE_CONF_LENGTH	(4)
+/* ETYPE configuration array size
+ * must always be kept equal to
+ * (WMI_RF_ETYPE_LENGTH+WMI_RF_ETYPE_VAL_PER_RANGE)
+ */
+#define WMI_RF_ETYPE_CONF_LENGTH	(8)
+/* RX2TX configuration array size
+ * must always be kept equal to (WMI_RF_RX2TX_LENGTH+1)
+ */
+#define WMI_RF_RX2TX_CONF_LENGTH	(4)
 
 /* Mailbox interface
  * used for commands and events
@@ -61,7 +73,7 @@ enum wmi_fw_capability {
 	WMI_FW_CAPABILITY_PS_CONFIG			= 1,
 	WMI_FW_CAPABILITY_RF_SECTORS			= 2,
 	WMI_FW_CAPABILITY_MGMT_RETRY_LIMIT		= 3,
-	WMI_FW_CAPABILITY_DISABLE_AP_SME		= 4,
+	WMI_FW_CAPABILITY_AP_SME_OFFLOAD_PARTIAL	= 4,
 	WMI_FW_CAPABILITY_WMI_ONLY			= 5,
 	WMI_FW_CAPABILITY_THERMAL_THROTTLING		= 7,
 	WMI_FW_CAPABILITY_D3_SUSPEND			= 8,
@@ -73,6 +85,7 @@ enum wmi_fw_capability {
 	WMI_FW_CAPABILITY_LO_POWER_CALIB_FROM_OTP	= 14,
 	WMI_FW_CAPABILITY_PNO				= 15,
 	WMI_FW_CAPABILITY_REF_CLOCK_CONTROL		= 18,
+	WMI_FW_CAPABILITY_AP_SME_OFFLOAD_NONE		= 19,
 	WMI_FW_CAPABILITY_MAX,
 };
 
@@ -164,12 +177,14 @@ enum wmi_command_id {
 	WMI_SET_ACTIVE_SILENT_RSSI_TABLE_CMDID		= 0x85C,
 	WMI_RF_PWR_ON_DELAY_CMDID			= 0x85D,
 	WMI_SET_HIGH_POWER_TABLE_PARAMS_CMDID		= 0x85E,
+	WMI_FIXED_SCHEDULING_UL_CONFIG_CMDID		= 0x85F,
 	/* Performance monitoring commands */
 	WMI_BF_CTRL_CMDID				= 0x862,
 	WMI_NOTIFY_REQ_CMDID				= 0x863,
 	WMI_GET_STATUS_CMDID				= 0x864,
 	WMI_GET_RF_STATUS_CMDID				= 0x866,
 	WMI_GET_BASEBAND_TYPE_CMDID			= 0x867,
+	WMI_VRING_SWITCH_TIMING_CONFIG_CMDID		= 0x868,
 	WMI_UNIT_TEST_CMDID				= 0x900,
 	WMI_FLASH_READ_CMDID				= 0x902,
 	WMI_FLASH_WRITE_CMDID				= 0x903,
@@ -202,6 +217,7 @@ enum wmi_command_id {
 	WMI_GET_THERMAL_THROTTLING_CFG_CMDID		= 0x941,
 	/* Read Power Save profile type */
 	WMI_PS_DEV_PROFILE_CFG_READ_CMDID		= 0x942,
+	WMI_TSF_SYNC_CMDID				= 0x973,
 	WMI_TOF_SESSION_START_CMDID			= 0x991,
 	WMI_TOF_GET_CAPABILITIES_CMDID			= 0x992,
 	WMI_TOF_SET_LCR_CMDID				= 0x993,
@@ -218,11 +234,16 @@ enum wmi_command_id {
 	WMI_PRIO_TX_SECTORS_ORDER_CMDID			= 0x9A5,
 	WMI_PRIO_TX_SECTORS_NUMBER_CMDID		= 0x9A6,
 	WMI_PRIO_TX_SECTORS_SET_DEFAULT_CFG_CMDID	= 0x9A7,
+	WMI_BF_CONTROL_CMDID				= 0x9AA,
 	WMI_SCHEDULING_SCHEME_CMDID			= 0xA01,
 	WMI_FIXED_SCHEDULING_CONFIG_CMDID		= 0xA02,
 	WMI_ENABLE_FIXED_SCHEDULING_CMDID		= 0xA03,
 	WMI_SET_MULTI_DIRECTED_OMNIS_CONFIG_CMDID	= 0xA04,
 	WMI_SET_LONG_RANGE_CONFIG_CMDID			= 0xA05,
+	WMI_GET_ASSOC_LIST_CMDID			= 0xA06,
+	WMI_GET_CCA_INDICATIONS_CMDID			= 0xA07,
+	WMI_SET_CCA_INDICATIONS_BI_AVG_NUM_CMDID	= 0xA08,
+	WMI_INTERNAL_FW_IOCTL_CMDID			= 0xA0B,
 	WMI_SET_MAC_ADDRESS_CMDID			= 0xF003,
 	WMI_ABORT_SCAN_CMDID				= 0xF007,
 	WMI_SET_PROMISCUOUS_MODE_CMDID			= 0xF041,
@@ -484,6 +505,18 @@ enum wmi_rf_mgmt_type {
 	WMI_RF_MGMT_GET_STATUS	= 0x02,
 };
 
+/* WMI_BF_CONTROL_CMDID */
+enum wmi_bf_triggers {
+	WMI_BF_TRIGGER_RS_MCS1_TH_FAILURE		= 0x01,
+	WMI_BF_TRIGGER_RS_MCS1_NO_BACK_FAILURE		= 0x02,
+	WMI_BF_TRIGGER_MAX_CTS_FAILURE_IN_TXOP		= 0x04,
+	WMI_BF_TRIGGER_MAX_BACK_FAILURE			= 0x08,
+	WMI_BF_TRIGGER_FW				= 0x10,
+	WMI_BF_TRIGGER_MAX_CTS_FAILURE_IN_KEEP_ALIVE	= 0x20,
+	WMI_BF_TRIGGER_AOA				= 0x40,
+	WMI_BF_TRIGGER_MAX_CTS_FAILURE_IN_UPM		= 0x80,
+};
+
 /* WMI_RF_MGMT_CMDID */
 struct wmi_rf_mgmt_cmd {
 	__le32 rf_mgmt_type;
@@ -519,7 +552,9 @@ struct wmi_bcon_ctrl_cmd {
 	u8 disable_sec;
 	u8 hidden_ssid;
 	u8 is_go;
-	u8 reserved[2];
+	/* A-BFT length override if non-0 */
+	u8 abft_len;
+	u8 reserved;
 } __packed;
 
 /* WMI_PORT_ALLOCATE_CMDID */
@@ -584,6 +619,16 @@ struct wmi_power_mgmt_cfg_cmd {
 	u8 reserved[3];
 } __packed;
 
+/* WMI_PCP_START_CMDID */
+enum wmi_ap_sme_offload_mode {
+	/* Full AP SME in FW */
+	WMI_AP_SME_OFFLOAD_FULL		= 0x00,
+	/* Probe AP SME in FW */
+	WMI_AP_SME_OFFLOAD_PARTIAL	= 0x01,
+	/* AP SME in host */
+	WMI_AP_SME_OFFLOAD_NONE		= 0x02,
+};
+
 /* WMI_PCP_START_CMDID */
 struct wmi_pcp_start_cmd {
 	__le16 bcon_interval;
@@ -593,7 +638,8 @@ struct wmi_pcp_start_cmd {
 	u8 reserved0[5];
 	/* A-BFT length override if non-0 */
 	u8 abft_len;
-	u8 disable_ap_sme;
+	/* enum wmi_ap_sme_offload_mode_e */
+	u8 ap_sme_offload_mode;
 	u8 network_type;
 	u8 channel;
 	u8 disable_sec_offload;
@@ -607,6 +653,17 @@ struct wmi_sw_tx_req_cmd {
 	u8 payload[0];
 } __packed;
 
+/* WMI_VRING_SWITCH_TIMING_CONFIG_CMDID */
+struct wmi_vring_switch_timing_config_cmd {
+	/* Set vring timing configuration:
+	 *
+	 * defined interval for vring switch
+	 */
+	__le32 interval_usec;
+	/* vring inactivity threshold */
+	__le32 idle_th_usec;
+} __packed;
+
 struct wmi_sw_ring_cfg {
 	__le64 ring_mem_base;
 	__le16 ring_size;
@@ -642,6 +699,7 @@ enum wmi_vring_cfg_schd_params_priority {
 	WMI_SCH_PRIO_HIGH	= 0x01,
 };
 
+#define CIDXTID_EXTENDED_CID_TID		(0xFF)
 #define CIDXTID_CID_POS				(0)
 #define CIDXTID_CID_LEN				(4)
 #define CIDXTID_CID_MSK				(0xF)
@@ -662,6 +720,9 @@ struct wmi_vring_cfg {
 	struct wmi_sw_ring_cfg tx_sw_ring;
 	/* 0-23 vrings */
 	u8 ringid;
+	/* Used for cid less than 8. For higher cid set
+	 * CIDXTID_EXTENDED_CID_TID here and use cid and tid members instead
+	 */
 	u8 cidxtid;
 	u8 encap_trans_type;
 	/* 802.3 DS cfg */
@@ -671,6 +732,11 @@ struct wmi_vring_cfg {
 	u8 to_resolution;
 	u8 agg_max_wsize;
 	struct wmi_vring_cfg_schd schd_params;
+	/* Used when cidxtid = CIDXTID_EXTENDED_CID_TID */
+	u8 cid;
+	/* Used when cidxtid = CIDXTID_EXTENDED_CID_TID */
+	u8 tid;
+	u8 reserved[2];
 } __packed;
 
 enum wmi_vring_cfg_cmd_action {
@@ -868,23 +934,42 @@ struct wmi_cfg_rx_chain_cmd {
 
 /* WMI_RCP_ADDBA_RESP_CMDID */
 struct wmi_rcp_addba_resp_cmd {
+	/* Used for cid less than 8. For higher cid set
+	 * CIDXTID_EXTENDED_CID_TID here and use cid and tid members instead
+	 */
 	u8 cidxtid;
 	u8 dialog_token;
 	__le16 status_code;
 	/* ieee80211_ba_parameterset field to send */
 	__le16 ba_param_set;
 	__le16 ba_timeout;
+	/* Used when cidxtid = CIDXTID_EXTENDED_CID_TID */
+	u8 cid;
+	/* Used when cidxtid = CIDXTID_EXTENDED_CID_TID */
+	u8 tid;
+	u8 reserved[2];
 } __packed;
 
 /* WMI_RCP_DELBA_CMDID */
 struct wmi_rcp_delba_cmd {
+	/* Used for cid less than 8. For higher cid set
+	 * CIDXTID_EXTENDED_CID_TID here and use cid and tid members instead
+	 */
 	u8 cidxtid;
 	u8 reserved;
 	__le16 reason;
+	/* Used when cidxtid = CIDXTID_EXTENDED_CID_TID */
+	u8 cid;
+	/* Used when cidxtid = CIDXTID_EXTENDED_CID_TID */
+	u8 tid;
+	u8 reserved2[2];
 } __packed;
 
 /* WMI_RCP_ADDBA_REQ_CMDID */
 struct wmi_rcp_addba_req_cmd {
+	/* Used for cid less than 8. For higher cid set
+	 * CIDXTID_EXTENDED_CID_TID here and use cid and tid members instead
+	 */
 	u8 cidxtid;
 	u8 dialog_token;
 	/* ieee80211_ba_parameterset field as it received */
@@ -892,6 +977,11 @@ struct wmi_rcp_addba_req_cmd {
 	__le16 ba_timeout;
 	/* ieee80211_ba_seqstrl field as it received */
 	__le16 ba_seq_ctrl;
+	/* Used when cidxtid = CIDXTID_EXTENDED_CID_TID */
+	u8 cid;
+	/* Used when cidxtid = CIDXTID_EXTENDED_CID_TID */
+	u8 tid;
+	u8 reserved[2];
 } __packed;
 
 /* WMI_SET_MAC_ADDRESS_CMDID */
@@ -902,15 +992,20 @@ struct wmi_set_mac_address_cmd {
 
 /* WMI_ECHO_CMDID
  * Check FW is alive
- * WMI_DEEP_ECHO_CMDID
- * Check FW and ucode are alive
  * Returned event: WMI_ECHO_RSP_EVENTID
- * same event for both commands
  */
 struct wmi_echo_cmd {
 	__le32 value;
 } __packed;
 
+/* WMI_DEEP_ECHO_CMDID
+ * Check FW and ucode are alive
+ * Returned event: WMI_ECHO_RSP_EVENTID
+ */
+struct wmi_deep_echo_cmd {
+	__le32 value;
+} __packed;
+
 /* WMI_RF_PWR_ON_DELAY_CMDID
  * set FW time parameters used through RF resetting
  *  RF reset consists of bringing its power down for a period of time, then
@@ -928,7 +1023,7 @@ struct wmi_rf_pwr_on_delay_cmd {
 	__le16 up_delay_usec;
 } __packed;
 
-/* \WMI_SET_HIGH_POWER_TABLE_PARAMS_CMDID
+/* WMI_SET_HIGH_POWER_TABLE_PARAMS_CMDID
  * This API controls the Tx and Rx gain over temperature.
  * It controls the Tx D-type, Rx D-type and Rx E-type amplifiers.
  * It also controls the Tx gain index, by controlling the Rx to Tx gain index
@@ -942,25 +1037,46 @@ struct wmi_set_high_power_table_params_cmd {
 	u8 tx_dtype_temp[WMI_RF_DTYPE_LENGTH];
 	u8 reserved0;
 	/* Tx D-type values to be used for each temperature range */
-	__le32 tx_dtype_conf[WMI_RF_DTYPE_LENGTH + 1];
+	__le32 tx_dtype_conf[WMI_RF_DTYPE_CONF_LENGTH];
+	/* Temperature range for Tx E-type parameters */
+	u8 tx_etype_temp[WMI_RF_ETYPE_LENGTH];
+	u8 reserved1;
+	/* Tx E-type values to be used for each temperature range.
+	 * The last 4 values of any range are the first 4 values of the next
+	 * range and so on
+	 */
+	__le32 tx_etype_conf[WMI_RF_ETYPE_CONF_LENGTH];
 	/* Temperature range for Rx D-type parameters */
 	u8 rx_dtype_temp[WMI_RF_DTYPE_LENGTH];
-	u8 reserved1;
+	u8 reserved2;
 	/* Rx D-type values to be used for each temperature range */
-	__le32 rx_dtype_conf[WMI_RF_DTYPE_LENGTH + 1];
+	__le32 rx_dtype_conf[WMI_RF_DTYPE_CONF_LENGTH];
 	/* Temperature range for Rx E-type parameters */
 	u8 rx_etype_temp[WMI_RF_ETYPE_LENGTH];
-	u8 reserved2;
+	u8 reserved3;
 	/* Rx E-type values to be used for each temperature range.
 	 * The last 4 values of any range are the first 4 values of the next
 	 * range and so on
 	 */
-	__le32 rx_etype_conf[WMI_RF_ETYPE_VAL_PER_RANGE + WMI_RF_ETYPE_LENGTH];
+	__le32 rx_etype_conf[WMI_RF_ETYPE_CONF_LENGTH];
 	/* Temperature range for rx_2_tx_offs parameters */
 	u8 rx_2_tx_temp[WMI_RF_RX2TX_LENGTH];
-	u8 reserved3;
+	u8 reserved4;
 	/* Rx to Tx gain index offset */
-	s8 rx_2_tx_offs[WMI_RF_RX2TX_LENGTH + 1];
+	s8 rx_2_tx_offs[WMI_RF_RX2TX_CONF_LENGTH];
+} __packed;
+
+/* WMI_FIXED_SCHEDULING_UL_CONFIG_CMDID
+ * This API sets rd parameter per mcs.
+ * Relevant only in Fixed Scheduling mode.
+ * Returned event: WMI_FIXED_SCHEDULING_UL_CONFIG_EVENTID
+ */
+struct wmi_fixed_scheduling_ul_config_cmd {
+	/* Use mcs -1 to set for every mcs */
+	s8 mcs;
+	/* Number of frames with rd bit set in a single virtual slot */
+	u8 rd_count_per_slot;
+	u8 reserved[2];
 } __packed;
 
 /* CMD: WMI_RF_XPM_READ_CMDID */
@@ -1267,6 +1383,93 @@ struct wmi_set_long_range_config_complete_event {
 	u8 reserved[3];
 } __packed;
 
+/* payload max size is 236 bytes: max event buffer size (256) - WMI headers
+ * (16) - prev struct field size (4)
+ */
+#define WMI_MAX_IOCTL_PAYLOAD_SIZE		(236)
+#define WMI_MAX_IOCTL_REPLY_PAYLOAD_SIZE	(236)
+#define WMI_MAX_INTERNAL_EVENT_PAYLOAD_SIZE	(236)
+
+enum wmi_internal_fw_ioctl_code {
+	WMI_INTERNAL_FW_CODE_NONE	= 0x0,
+	WMI_INTERNAL_FW_CODE_QCOM	= 0x1,
+};
+
+/* WMI_INTERNAL_FW_IOCTL_CMDID */
+struct wmi_internal_fw_ioctl_cmd {
+	/* enum wmi_internal_fw_ioctl_code */
+	__le16 code;
+	__le16 length;
+	/* payload max size is WMI_MAX_IOCTL_PAYLOAD_SIZE
+	 * Must be the last member of the struct
+	 */
+	__le32 payload[0];
+} __packed;
+
+/* WMI_INTERNAL_FW_IOCTL_EVENTID */
+struct wmi_internal_fw_ioctl_event {
+	/* wmi_fw_status */
+	u8 status;
+	u8 reserved;
+	__le16 length;
+	/* payload max size is WMI_MAX_IOCTL_REPLY_PAYLOAD_SIZE
+	 * Must be the last member of the struct
+	 */
+	__le32 payload[0];
+} __packed;
+
+/* WMI_INTERNAL_FW_EVENT_EVENTID */
+struct wmi_internal_fw_event_event {
+	__le16 id;
+	__le16 length;
+	/* payload max size is WMI_MAX_INTERNAL_EVENT_PAYLOAD_SIZE
+	 * Must be the last member of the struct
+	 */
+	__le32 payload[0];
+} __packed;
+
+/* WMI_BF_CONTROL_CMDID */
+struct wmi_bf_control_cmd {
+	/* wmi_bf_triggers */
+	__le32 triggers;
+	u8 cid;
+	/* DISABLED = 0, ENABLED = 1 , DRY_RUN = 2 */
+	u8 txss_mode;
+	/* DISABLED = 0, ENABLED = 1, DRY_RUN = 2 */
+	u8 brp_mode;
+	/* Max cts threshold (correspond to
+	 * WMI_BF_TRIGGER_MAX_CTS_FAILURE_IN_TXOP)
+	 */
+	u8 bf_trigger_max_cts_failure_thr;
+	/* Max cts threshold in dense (correspond to
+	 * WMI_BF_TRIGGER_MAX_CTS_FAILURE_IN_TXOP)
+	 */
+	u8 bf_trigger_max_cts_failure_dense_thr;
+	/* Max b-ack threshold (correspond to
+	 * WMI_BF_TRIGGER_MAX_BACK_FAILURE)
+	 */
+	u8 bf_trigger_max_back_failure_thr;
+	/* Max b-ack threshold in dense (correspond to
+	 * WMI_BF_TRIGGER_MAX_BACK_FAILURE)
+	 */
+	u8 bf_trigger_max_back_failure_dense_thr;
+	u8 reserved0;
+	/* Wrong sectors threshold */
+	__le32 wrong_sector_bis_thr;
+	/* BOOL to enable/disable long term trigger */
+	u8 long_term_enable;
+	/* 1 = Update long term thresholds from the long_term_mbps_th_tbl and
+	 * long_term_trig_timeout_per_mcs arrays, 0 = Ignore
+	 */
+	u8 long_term_update_thr;
+	/* Long term throughput threshold [Mbps] */
+	u8 long_term_mbps_th_tbl[WMI_NUM_MCS];
+	u8 reserved1;
+	/* Long term timeout threshold table [msec] */
+	__le16 long_term_trig_timeout_per_mcs[WMI_NUM_MCS];
+	u8 reserved2[2];
+} __packed;
+
 /* WMI Events
  * List of Events (target to host)
  */
@@ -1325,6 +1528,7 @@ enum wmi_event_id {
 	WMI_SET_SILENT_RSSI_TABLE_DONE_EVENTID		= 0x185C,
 	WMI_RF_PWR_ON_DELAY_RSP_EVENTID			= 0x185D,
 	WMI_SET_HIGH_POWER_TABLE_PARAMS_EVENTID		= 0x185E,
+	WMI_FIXED_SCHEDULING_UL_CONFIG_EVENTID		= 0x185F,
 	/* Performance monitoring events */
 	WMI_DATA_PORT_OPEN_EVENTID			= 0x1860,
 	WMI_WBE_LINK_DOWN_EVENTID			= 0x1861,
@@ -1334,6 +1538,7 @@ enum wmi_event_id {
 	WMI_VRING_EN_EVENTID				= 0x1865,
 	WMI_GET_RF_STATUS_EVENTID			= 0x1866,
 	WMI_GET_BASEBAND_TYPE_EVENTID			= 0x1867,
+	WMI_VRING_SWITCH_TIMING_CONFIG_EVENTID		= 0x1868,
 	WMI_UNIT_TEST_EVENTID				= 0x1900,
 	WMI_FLASH_READ_DONE_EVENTID			= 0x1902,
 	WMI_FLASH_WRITE_DONE_EVENTID			= 0x1903,
@@ -1363,6 +1568,7 @@ enum wmi_event_id {
 	WMI_GET_THERMAL_THROTTLING_CFG_EVENTID		= 0x1941,
 	/* return the Power Save profile */
 	WMI_PS_DEV_PROFILE_CFG_READ_EVENTID		= 0x1942,
+	WMI_TSF_SYNC_STATUS_EVENTID			= 0x1973,
 	WMI_TOF_SESSION_END_EVENTID			= 0x1991,
 	WMI_TOF_GET_CAPABILITIES_EVENTID		= 0x1992,
 	WMI_TOF_SET_LCR_EVENTID				= 0x1993,
@@ -1380,17 +1586,24 @@ enum wmi_event_id {
 	WMI_PRIO_TX_SECTORS_ORDER_EVENTID		= 0x19A5,
 	WMI_PRIO_TX_SECTORS_NUMBER_EVENTID		= 0x19A6,
 	WMI_PRIO_TX_SECTORS_SET_DEFAULT_CFG_EVENTID	= 0x19A7,
+	WMI_BF_CONTROL_EVENTID				= 0x19AA,
 	WMI_SCHEDULING_SCHEME_EVENTID			= 0x1A01,
 	WMI_FIXED_SCHEDULING_CONFIG_COMPLETE_EVENTID	= 0x1A02,
 	WMI_ENABLE_FIXED_SCHEDULING_COMPLETE_EVENTID	= 0x1A03,
 	WMI_SET_MULTI_DIRECTED_OMNIS_CONFIG_EVENTID	= 0x1A04,
 	WMI_SET_LONG_RANGE_CONFIG_COMPLETE_EVENTID	= 0x1A05,
+	WMI_GET_ASSOC_LIST_RES_EVENTID			= 0x1A06,
+	WMI_GET_CCA_INDICATIONS_EVENTID			= 0x1A07,
+	WMI_SET_CCA_INDICATIONS_BI_AVG_NUM_EVENTID	= 0x1A08,
+	WMI_INTERNAL_FW_EVENT_EVENTID			= 0x1A0A,
+	WMI_INTERNAL_FW_IOCTL_EVENTID			= 0x1A0B,
 	WMI_SET_CHANNEL_EVENTID				= 0x9000,
 	WMI_ASSOC_REQ_EVENTID				= 0x9001,
 	WMI_EAPOL_RX_EVENTID				= 0x9002,
 	WMI_MAC_ADDR_RESP_EVENTID			= 0x9003,
 	WMI_FW_VER_EVENTID				= 0x9004,
 	WMI_ACS_PASSIVE_SCAN_COMPLETE_EVENTID		= 0x9005,
+	WMI_INTERNAL_FW_SET_CHANNEL			= 0x9006,
 	WMI_COMMAND_NOT_SUPPORTED_EVENTID		= 0xFFFF,
 };
 
@@ -1462,12 +1675,16 @@ enum rf_type {
 	RF_UNKNOWN	= 0x00,
 	RF_MARLON	= 0x01,
 	RF_SPARROW	= 0x02,
+	RF_TALYNA1	= 0x03,
+	RF_TALYNA2	= 0x04,
 };
 
 /* WMI_GET_RF_STATUS_EVENTID */
 enum board_file_rf_type {
 	BF_RF_MARLON	= 0x00,
 	BF_RF_SPARROW	= 0x01,
+	BF_RF_TALYNA1	= 0x02,
+	BF_RF_TALYNA2	= 0x03,
 };
 
 /* WMI_GET_RF_STATUS_EVENTID */
@@ -1507,6 +1724,7 @@ enum baseband_type {
 	BASEBAND_SPARROW_M_C0	= 0x06,
 	BASEBAND_SPARROW_M_D0	= 0x07,
 	BASEBAND_TALYN_M_A0	= 0x08,
+	BASEBAND_TALYN_M_B0	= 0x09,
 };
 
 /* WMI_GET_BASEBAND_TYPE_EVENTID */
@@ -1551,7 +1769,11 @@ struct wmi_ready_event {
 	u8 numof_additional_mids;
 	/* rfc read calibration result. 5..15 */
 	u8 rfc_read_calib_result;
-	u8 reserved[3];
+	/* Max associated STAs supported by FW in AP mode (default 0 means 8
+	 * STA)
+	 */
+	u8 max_assoc_sta;
+	u8 reserved[2];
 } __packed;
 
 /* WMI_NOTIFY_REQ_DONE_EVENTID */
@@ -1666,13 +1888,13 @@ enum wmi_pno_result {
 };
 
 struct wmi_start_sched_scan_event {
-	/* pno_result */
+	/* wmi_pno_result */
 	u8 result;
 	u8 reserved[3];
 } __packed;
 
 struct wmi_stop_sched_scan_event {
-	/* pno_result */
+	/* wmi_pno_result */
 	u8 result;
 	u8 reserved[3];
 } __packed;
@@ -1739,9 +1961,17 @@ struct wmi_ba_status_event {
 
 /* WMI_DELBA_EVENTID */
 struct wmi_delba_event {
+	/* Used for cid less than 8. For higher cid set
+	 * CIDXTID_EXTENDED_CID_TID here and use cid and tid members instead
+	 */
 	u8 cidxtid;
 	u8 from_initiator;
 	__le16 reason;
+	/* Used when cidxtid = CIDXTID_EXTENDED_CID_TID */
+	u8 cid;
+	/* Used when cidxtid = CIDXTID_EXTENDED_CID_TID */
+	u8 tid;
+	u8 reserved[2];
 } __packed;
 
 /* WMI_VRING_CFG_DONE_EVENTID */
@@ -1754,13 +1984,24 @@ struct wmi_vring_cfg_done_event {
 
 /* WMI_RCP_ADDBA_RESP_SENT_EVENTID */
 struct wmi_rcp_addba_resp_sent_event {
+	/* Used for cid less than 8. For higher cid set
+	 * CIDXTID_EXTENDED_CID_TID here and use cid and tid members instead
+	 */
 	u8 cidxtid;
 	u8 reserved;
 	__le16 status;
+	/* Used when cidxtid = CIDXTID_EXTENDED_CID_TID */
+	u8 cid;
+	/* Used when cidxtid = CIDXTID_EXTENDED_CID_TID */
+	u8 tid;
+	u8 reserved2[2];
 } __packed;
 
 /* WMI_RCP_ADDBA_REQ_EVENTID */
 struct wmi_rcp_addba_req_event {
+	/* Used for cid less than 8. For higher cid set
+	 * CIDXTID_EXTENDED_CID_TID here and use cid and tid members instead
+	 */
 	u8 cidxtid;
 	u8 dialog_token;
 	/* ieee80211_ba_parameterset as it received */
@@ -1768,6 +2009,11 @@ struct wmi_rcp_addba_req_event {
 	__le16 ba_timeout;
 	/* ieee80211_ba_seqstrl field as it received */
 	__le16 ba_seq_ctrl;
+	/* Used when cidxtid = CIDXTID_EXTENDED_CID_TID */
+	u8 cid;
+	/* Used when cidxtid = CIDXTID_EXTENDED_CID_TID */
+	u8 tid;
+	u8 reserved[2];
 } __packed;
 
 /* WMI_CFG_RX_CHAIN_DONE_EVENTID */
@@ -1942,6 +2188,13 @@ struct wmi_set_high_power_table_params_event {
 	u8 reserved[3];
 } __packed;
 
+/* WMI_FIXED_SCHEDULING_UL_CONFIG_EVENTID */
+struct wmi_fixed_scheduling_ul_config_event {
+	/* wmi_fw_status */
+	u8 status;
+	u8 reserved[3];
+} __packed;
+
 /* WMI_TEMP_SENSE_DONE_EVENTID
  *
  * Measure MAC and radio temperatures
@@ -2290,6 +2543,8 @@ struct wmi_link_maintain_cfg {
 	__le32 bad_beacons_num_threshold;
 	/* SNR limit for bad_beacons_detector */
 	__le32 bad_beacons_snr_threshold_db;
+	/* timeout for disassoc response frame in uSec */
+	__le32 disconnect_timeout;
 } __packed;
 
 /* WMI_LINK_MAINTAIN_CFG_WRITE_CMDID */
@@ -2519,6 +2774,7 @@ enum wmi_tof_session_end_status {
 	WMI_TOF_SESSION_END_FAIL		= 0x01,
 	WMI_TOF_SESSION_END_PARAMS_ERROR	= 0x02,
 	WMI_TOF_SESSION_END_ABORTED		= 0x03,
+	WMI_TOF_SESSION_END_BUSY		= 0x04,
 };
 
 /* WMI_TOF_SESSION_END_EVENTID */
@@ -2925,7 +3181,40 @@ struct wmi_set_silent_rssi_table_done_event {
 	__le32 table;
 } __packed;
 
-/* \WMI_COMMAND_NOT_SUPPORTED_EVENTID */
+/* WMI_VRING_SWITCH_TIMING_CONFIG_EVENTID */
+struct wmi_vring_switch_timing_config_event {
+	/* enum wmi_fw_status */
+	u8 status;
+	u8 reserved[3];
+} __packed;
+
+/* WMI_GET_ASSOC_LIST_RES_EVENTID */
+struct wmi_assoc_sta_info {
+	u8 mac[WMI_MAC_LEN];
+	u8 omni_index_address;
+	u8 reserved;
+} __packed;
+
+#define WMI_GET_ASSOC_LIST_SIZE	(8)
+
+/* WMI_GET_ASSOC_LIST_RES_EVENTID
+ * Returns up to MAX_ASSOC_STA_LIST_SIZE associated STAs
+ */
+struct wmi_get_assoc_list_res_event {
+	struct wmi_assoc_sta_info assoc_sta_list[WMI_GET_ASSOC_LIST_SIZE];
+	/* STA count */
+	u8 count;
+	u8 reserved[3];
+} __packed;
+
+/* WMI_BF_CONTROL_EVENTID */
+struct wmi_bf_control_event {
+	/* wmi_fw_status */
+	u8 status;
+	u8 reserved[3];
+} __packed;
+
+/* WMI_COMMAND_NOT_SUPPORTED_EVENTID */
 struct wmi_command_not_supported_event {
 	/* device id */
 	u8 mid;
@@ -2936,4 +3225,62 @@ struct wmi_command_not_supported_event {
 	__le16 reserved1;
 } __packed;
 
+/* WMI_TSF_SYNC_CMDID */
+struct wmi_tsf_sync_cmd {
+	/* The time interval to send announce frame in one BI */
+	u8 interval_ms;
+	/* The mcs to send announce frame */
+	u8 mcs;
+	u8 reserved[6];
+} __packed;
+
+/* WMI_TSF_SYNC_STATUS_EVENTID */
+enum wmi_tsf_sync_status {
+	WMI_TSF_SYNC_SUCCESS	= 0x00,
+	WMI_TSF_SYNC_FAILED	= 0x01,
+	WMI_TSF_SYNC_REJECTED	= 0x02,
+};
+
+/* WMI_TSF_SYNC_STATUS_EVENTID */
+struct wmi_tsf_sync_status_event {
+	/* enum wmi_tsf_sync_status */
+	u8 status;
+	u8 reserved[3];
+} __packed;
+
+/* WMI_GET_CCA_INDICATIONS_EVENTID */
+struct wmi_get_cca_indications_event {
+	/* wmi_fw_status */
+	u8 status;
+	/* CCA-Energy Detect in percentage over last BI (0..100) */
+	u8 cca_ed_percent;
+	/* Averaged CCA-Energy Detect in percent over number of BIs (0..100) */
+	u8 cca_ed_avg_percent;
+	/* NAV percent over last BI (0..100) */
+	u8 nav_percent;
+	/* Averaged NAV percent over number of BIs (0..100) */
+	u8 nav_avg_percent;
+	u8 reserved[3];
+} __packed;
+
+/* WMI_SET_CCA_INDICATIONS_BI_AVG_NUM_CMDID */
+struct wmi_set_cca_indications_bi_avg_num_cmd {
+	/* set the number of bis to average cca_ed (0..255) */
+	u8 bi_number;
+	u8 reserved[3];
+} __packed;
+
+/* WMI_SET_CCA_INDICATIONS_BI_AVG_NUM_EVENTID */
+struct wmi_set_cca_indications_bi_avg_num_event {
+	/* wmi_fw_status */
+	u8 status;
+	u8 reserved[3];
+} __packed;
+
+/* WMI_INTERNAL_FW_SET_CHANNEL */
+struct wmi_internal_fw_set_channel_event {
+	u8 channel_num;
+	u8 reserved[3];
+} __packed;
+
 #endif /* __WILOCITY_WMI_H__ */
-- 
cgit v1.2.3-59-g8ed1b


From 9861bf3b818fbe810442b89230b80c0385ef9e04 Mon Sep 17 00:00:00 2001
From: Lior David <liord@codeaurora.org>
Date: Wed, 9 May 2018 13:06:54 +0300
Subject: wil6210: fix call to wil6210_disconnect during unload

Move the call to wil6210_disconnect so it will be called
before unregister_netdevice. This is because it calls
netif_carrier_off which is forbidden to call on an
unregistered net device. Calling netif_carrier_off can
add a link watch event which might be handled after
net device was freed, causing a kernel oops.

Signed-off-by: Lior David <liord@codeaurora.org>
Signed-off-by: Maya Erez <merez@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wil6210/netdev.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wireless/ath/wil6210/netdev.c b/drivers/net/wireless/ath/wil6210/netdev.c
index 05e9408e7ea3..eb6c14ed65a4 100644
--- a/drivers/net/wireless/ath/wil6210/netdev.c
+++ b/drivers/net/wireless/ath/wil6210/netdev.c
@@ -457,16 +457,16 @@ void wil_vif_remove(struct wil6210_priv *wil, u8 mid)
 		return;
 	}
 
+	mutex_lock(&wil->mutex);
+	wil6210_disconnect(vif, NULL, WLAN_REASON_DEAUTH_LEAVING, false);
+	mutex_unlock(&wil->mutex);
+
 	ndev = vif_to_ndev(vif);
 	/* during unregister_netdevice cfg80211_leave may perform operations
 	 * such as stop AP, disconnect, so we only clear the VIF afterwards
 	 */
 	unregister_netdevice(ndev);
 
-	mutex_lock(&wil->mutex);
-	wil6210_disconnect(vif, NULL, WLAN_REASON_DEAUTH_LEAVING, false);
-	mutex_unlock(&wil->mutex);
-
 	if (any_active && vif->mid != 0)
 		wmi_port_delete(wil, vif->mid);
 
-- 
cgit v1.2.3-59-g8ed1b


From 5f85c7e702d2185d3d553982f916b5ac96ae77a9 Mon Sep 17 00:00:00 2001
From: Lior David <liord@codeaurora.org>
Date: Wed, 9 May 2018 13:06:55 +0300
Subject: wil6210: change reply_size arg to u16 in wmi_call

Change the type of the argument reply_size from u8 to
u16 in order to support reply sizes > 255 bytes.
The driver already supports u16 reply size in all
other places, this was the only missing change.

Signed-off-by: Lior David <liord@codeaurora.org>
Signed-off-by: Maya Erez <merez@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wil6210/wil6210.h | 2 +-
 drivers/net/wireless/ath/wil6210/wmi.c     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/ath/wil6210/wil6210.h b/drivers/net/wireless/ath/wil6210/wil6210.h
index f9c5155025bc..9ac92921d47c 100644
--- a/drivers/net/wireless/ath/wil6210/wil6210.h
+++ b/drivers/net/wireless/ath/wil6210/wil6210.h
@@ -986,7 +986,7 @@ int wmi_read_hdr(struct wil6210_priv *wil, __le32 ptr,
 int wmi_send(struct wil6210_priv *wil, u16 cmdid, u8 mid, void *buf, u16 len);
 void wmi_recv_cmd(struct wil6210_priv *wil);
 int wmi_call(struct wil6210_priv *wil, u16 cmdid, u8 mid, void *buf, u16 len,
-	     u16 reply_id, void *reply, u8 reply_size, int to_msec);
+	     u16 reply_id, void *reply, u16 reply_size, int to_msec);
 void wmi_event_worker(struct work_struct *work);
 void wmi_event_flush(struct wil6210_priv *wil);
 int wmi_set_ssid(struct wil6210_vif *vif, u8 ssid_len, const void *ssid);
diff --git a/drivers/net/wireless/ath/wil6210/wmi.c b/drivers/net/wireless/ath/wil6210/wmi.c
index 73efa13bc742..fcd95299eb4f 100644
--- a/drivers/net/wireless/ath/wil6210/wmi.c
+++ b/drivers/net/wireless/ath/wil6210/wmi.c
@@ -1416,7 +1416,7 @@ void wmi_recv_cmd(struct wil6210_priv *wil)
 }
 
 int wmi_call(struct wil6210_priv *wil, u16 cmdid, u8 mid, void *buf, u16 len,
-	     u16 reply_id, void *reply, u8 reply_size, int to_msec)
+	     u16 reply_id, void *reply, u16 reply_size, int to_msec)
 {
 	int rc;
 	unsigned long remain;
-- 
cgit v1.2.3-59-g8ed1b


From 1c21cc5fc4df4997eddc6e0b5c5bad225c3f1ecc Mon Sep 17 00:00:00 2001
From: Dedy Lansky <dlansky@codeaurora.org>
Date: Wed, 9 May 2018 13:06:56 +0300
Subject: wil6210: move WMI functionality out of wil_cfg80211_mgmt_tx

Rearrange the code by having new function wmi_mgmt_tx() to take care
of the WMI part of wil_cfg80211_mgmt_tx().

Signed-off-by: Dedy Lansky <dlansky@codeaurora.org>
Signed-off-by: Maya Erez <merez@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wil6210/cfg80211.c | 39 +++---------------------
 drivers/net/wireless/ath/wil6210/wil6210.h  |  1 +
 drivers/net/wireless/ath/wil6210/wmi.c      | 47 +++++++++++++++++++++++++++++
 3 files changed, 52 insertions(+), 35 deletions(-)

diff --git a/drivers/net/wireless/ath/wil6210/cfg80211.c b/drivers/net/wireless/ath/wil6210/cfg80211.c
index cdbb393863f3..109bae1436db 100644
--- a/drivers/net/wireless/ath/wil6210/cfg80211.c
+++ b/drivers/net/wireless/ath/wil6210/cfg80211.c
@@ -1081,17 +1081,11 @@ int wil_cfg80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
 			 u64 *cookie)
 {
 	const u8 *buf = params->buf;
-	size_t len = params->len, total;
+	size_t len = params->len;
 	struct wil6210_priv *wil = wiphy_to_wil(wiphy);
 	struct wil6210_vif *vif = wdev_to_vif(wil, wdev);
 	int rc;
-	bool tx_status = false;
-	struct ieee80211_mgmt *mgmt_frame = (void *)buf;
-	struct wmi_sw_tx_req_cmd *cmd;
-	struct {
-		struct wmi_cmd_hdr wmi;
-		struct wmi_sw_tx_complete_event evt;
-	} __packed evt;
+	bool tx_status;
 
 	/* Note, currently we do not support the "wait" parameter, user-space
 	 * must call remain_on_channel before mgmt_tx or listen on a channel
@@ -1100,34 +1094,9 @@ int wil_cfg80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
 	 * different from currently "listened" channel and fail if it is.
 	 */
 
-	wil_dbg_misc(wil, "mgmt_tx mid %d\n", vif->mid);
-	wil_hex_dump_misc("mgmt tx frame ", DUMP_PREFIX_OFFSET, 16, 1, buf,
-			  len, true);
-
-	if (len < sizeof(struct ieee80211_hdr_3addr))
-		return -EINVAL;
-
-	total = sizeof(*cmd) + len;
-	if (total < len)
-		return -EINVAL;
-
-	cmd = kmalloc(total, GFP_KERNEL);
-	if (!cmd) {
-		rc = -ENOMEM;
-		goto out;
-	}
-
-	memcpy(cmd->dst_mac, mgmt_frame->da, WMI_MAC_LEN);
-	cmd->len = cpu_to_le16(len);
-	memcpy(cmd->payload, buf, len);
-
-	rc = wmi_call(wil, WMI_SW_TX_REQ_CMDID, vif->mid, cmd, total,
-		      WMI_SW_TX_COMPLETE_EVENTID, &evt, sizeof(evt), 2000);
-	if (rc == 0)
-		tx_status = !evt.evt.status;
+	rc = wmi_mgmt_tx(vif, buf, len);
+	tx_status = (rc == 0);
 
-	kfree(cmd);
- out:
 	cfg80211_mgmt_tx_status(wdev, cookie ? *cookie : 0, buf, len,
 				tx_status, GFP_KERNEL);
 	return rc;
diff --git a/drivers/net/wireless/ath/wil6210/wil6210.h b/drivers/net/wireless/ath/wil6210/wil6210.h
index 9ac92921d47c..c9120dc8a435 100644
--- a/drivers/net/wireless/ath/wil6210/wil6210.h
+++ b/drivers/net/wireless/ath/wil6210/wil6210.h
@@ -1150,5 +1150,6 @@ void wil6210_clear_halp(struct wil6210_priv *wil);
 int wmi_start_sched_scan(struct wil6210_priv *wil,
 			 struct cfg80211_sched_scan_request *request);
 int wmi_stop_sched_scan(struct wil6210_priv *wil);
+int wmi_mgmt_tx(struct wil6210_vif *vif, const u8 *buf, size_t len);
 
 #endif /* __WIL6210_H__ */
diff --git a/drivers/net/wireless/ath/wil6210/wmi.c b/drivers/net/wireless/ath/wil6210/wmi.c
index fcd95299eb4f..bf110e3b4477 100644
--- a/drivers/net/wireless/ath/wil6210/wmi.c
+++ b/drivers/net/wireless/ath/wil6210/wmi.c
@@ -2773,3 +2773,50 @@ int wmi_stop_sched_scan(struct wil6210_priv *wil)
 
 	return 0;
 }
+
+int wmi_mgmt_tx(struct wil6210_vif *vif, const u8 *buf, size_t len)
+{
+	size_t total;
+	struct wil6210_priv *wil = vif_to_wil(vif);
+	struct ieee80211_mgmt *mgmt_frame = (void *)buf;
+	struct wmi_sw_tx_req_cmd *cmd;
+	struct {
+		struct wmi_cmd_hdr wmi;
+		struct wmi_sw_tx_complete_event evt;
+	} __packed evt = {
+		.evt = {.status = WMI_FW_STATUS_FAILURE},
+	};
+	int rc;
+
+	wil_dbg_misc(wil, "mgmt_tx mid %d\n", vif->mid);
+	wil_hex_dump_misc("mgmt tx frame ", DUMP_PREFIX_OFFSET, 16, 1, buf,
+			  len, true);
+
+	if (len < sizeof(struct ieee80211_hdr_3addr))
+		return -EINVAL;
+
+	total = sizeof(*cmd) + len;
+	if (total < len) {
+		wil_err(wil, "mgmt_tx invalid len %zu\n", len);
+		return -EINVAL;
+	}
+
+	cmd = kmalloc(total, GFP_KERNEL);
+	if (!cmd)
+		return -ENOMEM;
+
+	memcpy(cmd->dst_mac, mgmt_frame->da, WMI_MAC_LEN);
+	cmd->len = cpu_to_le16(len);
+	memcpy(cmd->payload, buf, len);
+
+	rc = wmi_call(wil, WMI_SW_TX_REQ_CMDID, vif->mid, cmd, total,
+		      WMI_SW_TX_COMPLETE_EVENTID, &evt, sizeof(evt), 2000);
+	if (!rc && evt.evt.status != WMI_FW_STATUS_SUCCESS) {
+		wil_err(wil, "mgmt_tx failed with status %d\n", evt.evt.status);
+		rc = -EINVAL;
+	}
+
+	kfree(cmd);
+
+	return rc;
+}
-- 
cgit v1.2.3-59-g8ed1b


From 807b086053df9bdbc9bf732130e9acda1c161aa5 Mon Sep 17 00:00:00 2001
From: Alexei Avshalom Lazar <ailizaro@codeaurora.org>
Date: Wed, 9 May 2018 13:06:57 +0300
Subject: wil6210: Initialize reply struct of the WMI commands

WMI command reply saved in uninitialized struct.
In order to avoid accessing unset values from FW initialize
the reply struct.

Signed-off-by: Alexei Avshalom Lazar <ailizaro@codeaurora.org>
Signed-off-by: Maya Erez <merez@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wil6210/cfg80211.c | 22 ++++---
 drivers/net/wireless/ath/wil6210/debugfs.c  |  2 +
 drivers/net/wireless/ath/wil6210/main.c     |  2 +
 drivers/net/wireless/ath/wil6210/txrx.c     |  8 ++-
 drivers/net/wireless/ath/wil6210/wmi.c      | 97 ++++++++++++++++++-----------
 5 files changed, 85 insertions(+), 46 deletions(-)

diff --git a/drivers/net/wireless/ath/wil6210/cfg80211.c b/drivers/net/wireless/ath/wil6210/cfg80211.c
index 109bae1436db..78946f28d0c7 100644
--- a/drivers/net/wireless/ath/wil6210/cfg80211.c
+++ b/drivers/net/wireless/ath/wil6210/cfg80211.c
@@ -276,6 +276,8 @@ int wil_cid_fill_sinfo(struct wil6210_vif *vif, int cid,
 	struct wil_net_stats *stats = &wil->sta[cid].stats;
 	int rc;
 
+	memset(&reply, 0, sizeof(reply));
+
 	rc = wmi_call(wil, WMI_NOTIFY_REQ_CMDID, vif->mid, &cmd, sizeof(cmd),
 		      WMI_NOTIFY_REQ_DONE_EVENTID, &reply, sizeof(reply), 20);
 	if (rc)
@@ -2246,7 +2248,9 @@ static int wil_rf_sector_get_cfg(struct wiphy *wiphy,
 	struct {
 		struct wmi_cmd_hdr wmi;
 		struct wmi_get_rf_sector_params_done_event evt;
-	} __packed reply;
+	} __packed reply = {
+		.evt = {.status = WMI_RF_SECTOR_STATUS_NOT_SUPPORTED_ERROR},
+	};
 	struct sk_buff *msg;
 	struct nlattr *nl_cfgs, *nl_cfg;
 	u32 i;
@@ -2292,7 +2296,6 @@ static int wil_rf_sector_get_cfg(struct wiphy *wiphy,
 	cmd.sector_idx = cpu_to_le16(sector_index);
 	cmd.sector_type = sector_type;
 	cmd.rf_modules_vec = rf_modules_vec & 0xFF;
-	memset(&reply, 0, sizeof(reply));
 	rc = wmi_call(wil, WMI_GET_RF_SECTOR_PARAMS_CMDID, vif->mid,
 		      &cmd, sizeof(cmd), WMI_GET_RF_SECTOR_PARAMS_DONE_EVENTID,
 		      &reply, sizeof(reply),
@@ -2367,7 +2370,9 @@ static int wil_rf_sector_set_cfg(struct wiphy *wiphy,
 	struct {
 		struct wmi_cmd_hdr wmi;
 		struct wmi_set_rf_sector_params_done_event evt;
-	} __packed reply;
+	} __packed reply = {
+		.evt = {.status = WMI_RF_SECTOR_STATUS_NOT_SUPPORTED_ERROR},
+	};
 	struct nlattr *nl_cfg;
 	struct wmi_rf_sector_info *si;
 
@@ -2450,7 +2455,6 @@ static int wil_rf_sector_set_cfg(struct wiphy *wiphy,
 	}
 
 	cmd.rf_modules_vec = rf_modules_vec & 0xFF;
-	memset(&reply, 0, sizeof(reply));
 	rc = wmi_call(wil, WMI_SET_RF_SECTOR_PARAMS_CMDID, vif->mid,
 		      &cmd, sizeof(cmd), WMI_SET_RF_SECTOR_PARAMS_DONE_EVENTID,
 		      &reply, sizeof(reply),
@@ -2474,7 +2478,9 @@ static int wil_rf_sector_get_selected(struct wiphy *wiphy,
 	struct {
 		struct wmi_cmd_hdr wmi;
 		struct wmi_get_selected_rf_sector_index_done_event evt;
-	} __packed reply;
+	} __packed reply = {
+		.evt = {.status = WMI_RF_SECTOR_STATUS_NOT_SUPPORTED_ERROR},
+	};
 	struct sk_buff *msg;
 
 	if (!test_bit(WMI_FW_CAPABILITY_RF_SECTORS, wil->fw_capabilities))
@@ -2514,7 +2520,6 @@ static int wil_rf_sector_get_selected(struct wiphy *wiphy,
 	memset(&cmd, 0, sizeof(cmd));
 	cmd.cid = (u8)cid;
 	cmd.sector_type = sector_type;
-	memset(&reply, 0, sizeof(reply));
 	rc = wmi_call(wil, WMI_GET_SELECTED_RF_SECTOR_INDEX_CMDID, vif->mid,
 		      &cmd, sizeof(cmd),
 		      WMI_GET_SELECTED_RF_SECTOR_INDEX_DONE_EVENTID,
@@ -2555,14 +2560,15 @@ static int wil_rf_sector_wmi_set_selected(struct wil6210_priv *wil,
 	struct {
 		struct wmi_cmd_hdr wmi;
 		struct wmi_set_selected_rf_sector_index_done_event evt;
-	} __packed reply;
+	} __packed reply = {
+		.evt = {.status = WMI_RF_SECTOR_STATUS_NOT_SUPPORTED_ERROR},
+	};
 	int rc;
 
 	memset(&cmd, 0, sizeof(cmd));
 	cmd.sector_idx = cpu_to_le16(sector_index);
 	cmd.sector_type = sector_type;
 	cmd.cid = (u8)cid;
-	memset(&reply, 0, sizeof(reply));
 	rc = wmi_call(wil, WMI_SET_SELECTED_RF_SECTOR_INDEX_CMDID, mid,
 		      &cmd, sizeof(cmd),
 		      WMI_SET_SELECTED_RF_SECTOR_INDEX_DONE_EVENTID,
diff --git a/drivers/net/wireless/ath/wil6210/debugfs.c b/drivers/net/wireless/ath/wil6210/debugfs.c
index 8c90b3111f0b..d3b1069ebf7a 100644
--- a/drivers/net/wireless/ath/wil6210/debugfs.c
+++ b/drivers/net/wireless/ath/wil6210/debugfs.c
@@ -1078,6 +1078,8 @@ static int wil_bf_debugfs_show(struct seq_file *s, void *data)
 		struct wmi_notify_req_done_event evt;
 	} __packed reply;
 
+	memset(&reply, 0, sizeof(reply));
+
 	for (i = 0; i < ARRAY_SIZE(wil->sta); i++) {
 		u32 status;
 
diff --git a/drivers/net/wireless/ath/wil6210/main.c b/drivers/net/wireless/ath/wil6210/main.c
index 82aec6b06d09..e7006c2428a0 100644
--- a/drivers/net/wireless/ath/wil6210/main.c
+++ b/drivers/net/wireless/ath/wil6210/main.c
@@ -342,6 +342,8 @@ void wil_disconnect_worker(struct work_struct *work)
 		/* already disconnected */
 		return;
 
+	memset(&reply, 0, sizeof(reply));
+
 	rc = wmi_call(wil, WMI_DISCONNECT_CMDID, vif->mid, NULL, 0,
 		      WMI_DISCONNECT_EVENTID, &reply, sizeof(reply),
 		      WIL6210_DISCONNECT_TO_MS);
diff --git a/drivers/net/wireless/ath/wil6210/txrx.c b/drivers/net/wireless/ath/wil6210/txrx.c
index b60b9fcaaebd..411130a4f2ed 100644
--- a/drivers/net/wireless/ath/wil6210/txrx.c
+++ b/drivers/net/wireless/ath/wil6210/txrx.c
@@ -963,7 +963,9 @@ int wil_vring_init_tx(struct wil6210_vif *vif, int id, int size,
 	struct {
 		struct wmi_cmd_hdr wmi;
 		struct wmi_vring_cfg_done_event cmd;
-	} __packed reply;
+	} __packed reply = {
+		.cmd = {.status = WMI_FW_STATUS_FAILURE},
+	};
 	struct vring *vring = &wil->vring_tx[id];
 	struct vring_tx_data *txdata = &wil->vring_tx_data[id];
 
@@ -1045,7 +1047,9 @@ int wil_vring_init_bcast(struct wil6210_vif *vif, int id, int size)
 	struct {
 		struct wmi_cmd_hdr wmi;
 		struct wmi_vring_cfg_done_event cmd;
-	} __packed reply;
+	} __packed reply = {
+		.cmd = {.status = WMI_FW_STATUS_FAILURE},
+	};
 	struct vring *vring = &wil->vring_tx[id];
 	struct vring_tx_data *txdata = &wil->vring_tx_data[id];
 
diff --git a/drivers/net/wireless/ath/wil6210/wmi.c b/drivers/net/wireless/ath/wil6210/wmi.c
index bf110e3b4477..8a9bbd6bcea8 100644
--- a/drivers/net/wireless/ath/wil6210/wmi.c
+++ b/drivers/net/wireless/ath/wil6210/wmi.c
@@ -1509,7 +1509,9 @@ int wmi_led_cfg(struct wil6210_priv *wil, bool enable)
 	struct {
 		struct wmi_cmd_hdr wmi;
 		struct wmi_led_cfg_done_event evt;
-	} __packed reply;
+	} __packed reply = {
+		.evt = {.status = cpu_to_le32(WMI_FW_STATUS_FAILURE)},
+	};
 
 	if (led_id == WIL_LED_INVALID_ID)
 		goto out;
@@ -1562,7 +1564,9 @@ int wmi_pcp_start(struct wil6210_vif *vif,
 	struct {
 		struct wmi_cmd_hdr wmi;
 		struct wmi_pcp_started_event evt;
-	} __packed reply;
+	} __packed reply = {
+		.evt = {.status = WMI_FW_STATUS_FAILURE},
+	};
 
 	if (!vif->privacy)
 		cmd.disable_sec = 1;
@@ -1639,6 +1643,8 @@ int wmi_get_ssid(struct wil6210_vif *vif, u8 *ssid_len, void *ssid)
 	} __packed reply;
 	int len; /* reply.cmd.ssid_len in CPU order */
 
+	memset(&reply, 0, sizeof(reply));
+
 	rc = wmi_call(wil, WMI_GET_SSID_CMDID, vif->mid, NULL, 0,
 		      WMI_GET_SSID_EVENTID, &reply, sizeof(reply), 20);
 	if (rc)
@@ -1674,6 +1680,8 @@ int wmi_get_channel(struct wil6210_priv *wil, int *channel)
 		struct wmi_set_pcp_channel_cmd cmd;
 	} __packed reply;
 
+	memset(&reply, 0, sizeof(reply));
+
 	rc = wmi_call(wil, WMI_GET_PCP_CHANNEL_CMDID, vif->mid, NULL, 0,
 		      WMI_GET_PCP_CHANNEL_EVENTID, &reply, sizeof(reply), 20);
 	if (rc)
@@ -1699,7 +1707,9 @@ int wmi_p2p_cfg(struct wil6210_vif *vif, int channel, int bi)
 	struct {
 		struct wmi_cmd_hdr wmi;
 		struct wmi_p2p_cfg_done_event evt;
-	} __packed reply;
+	} __packed reply = {
+		.evt = {.status = WMI_FW_STATUS_FAILURE},
+	};
 
 	wil_dbg_wmi(wil, "sending WMI_P2P_CFG_CMDID\n");
 
@@ -1720,7 +1730,9 @@ int wmi_start_listen(struct wil6210_vif *vif)
 	struct {
 		struct wmi_cmd_hdr wmi;
 		struct wmi_listen_started_event evt;
-	} __packed reply;
+	} __packed reply = {
+		.evt = {.status = WMI_FW_STATUS_FAILURE},
+	};
 
 	wil_dbg_wmi(wil, "sending WMI_START_LISTEN_CMDID\n");
 
@@ -1742,7 +1754,9 @@ int wmi_start_search(struct wil6210_vif *vif)
 	struct {
 		struct wmi_cmd_hdr wmi;
 		struct wmi_search_started_event evt;
-	} __packed reply;
+	} __packed reply = {
+		.evt = {.status = WMI_FW_STATUS_FAILURE},
+	};
 
 	wil_dbg_wmi(wil, "sending WMI_START_SEARCH_CMDID\n");
 
@@ -1868,7 +1882,9 @@ int wmi_rxon(struct wil6210_priv *wil, bool on)
 	struct {
 		struct wmi_cmd_hdr wmi;
 		struct wmi_listen_started_event evt;
-	} __packed reply;
+	} __packed reply = {
+		.evt = {.status = WMI_FW_STATUS_FAILURE},
+	};
 
 	wil_info(wil, "(%s)\n", on ? "on" : "off");
 
@@ -1910,6 +1926,8 @@ int wmi_rx_chain_add(struct wil6210_priv *wil, struct vring *vring)
 	} __packed evt;
 	int rc;
 
+	memset(&evt, 0, sizeof(evt));
+
 	if (wdev->iftype == NL80211_IFTYPE_MONITOR) {
 		struct ieee80211_channel *ch = wil->monitor_chandef.chan;
 
@@ -1939,14 +1957,14 @@ int wmi_rx_chain_add(struct wil6210_priv *wil, struct vring *vring)
 	if (rc)
 		return rc;
 
+	if (le32_to_cpu(evt.evt.status) != WMI_CFG_RX_CHAIN_SUCCESS)
+		rc = -EINVAL;
+
 	vring->hwtail = le32_to_cpu(evt.evt.rx_ring_tail_ptr);
 
 	wil_dbg_misc(wil, "Rx init: status %d tail 0x%08x\n",
 		     le32_to_cpu(evt.evt.status), vring->hwtail);
 
-	if (le32_to_cpu(evt.evt.status) != WMI_CFG_RX_CHAIN_SUCCESS)
-		rc = -EINVAL;
-
 	return rc;
 }
 
@@ -1964,6 +1982,8 @@ int wmi_get_temperature(struct wil6210_priv *wil, u32 *t_bb, u32 *t_rf)
 		struct wmi_temp_sense_done_event evt;
 	} __packed reply;
 
+	memset(&reply, 0, sizeof(reply));
+
 	rc = wmi_call(wil, WMI_TEMP_SENSE_CMDID, vif->mid, &cmd, sizeof(cmd),
 		      WMI_TEMP_SENSE_DONE_EVENTID, &reply, sizeof(reply), 100);
 	if (rc)
@@ -1996,6 +2016,7 @@ int wmi_disconnect_sta(struct wil6210_vif *vif, const u8 *mac,
 
 	wil_dbg_wmi(wil, "disconnect_sta: (%pM, reason %d)\n", mac, reason);
 
+	memset(&reply, 0, sizeof(reply));
 	vif->locally_generated_disc = true;
 	if (del_sta) {
 		ether_addr_copy(del_sta_cmd.dst_mac, mac);
@@ -2094,7 +2115,9 @@ int wmi_addba_rx_resp(struct wil6210_priv *wil,
 	struct {
 		struct wmi_cmd_hdr wmi;
 		struct wmi_rcp_addba_resp_sent_event evt;
-	} __packed reply;
+	} __packed reply = {
+		.evt = {.status = cpu_to_le16(WMI_FW_STATUS_FAILURE)},
+	};
 
 	wil_dbg_wmi(wil,
 		    "ADDBA response for MID %d CID %d TID %d size %d timeout %d status %d AMSDU%s\n",
@@ -2127,13 +2150,13 @@ int wmi_ps_dev_profile_cfg(struct wil6210_priv *wil,
 	struct {
 		struct wmi_cmd_hdr wmi;
 		struct wmi_ps_dev_profile_cfg_event evt;
-	} __packed reply;
+	} __packed reply = {
+		.evt = {.status = cpu_to_le32(WMI_PS_CFG_CMD_STATUS_ERROR)},
+	};
 	u32 status;
 
 	wil_dbg_wmi(wil, "Setting ps dev profile %d\n", ps_profile);
 
-	reply.evt.status = cpu_to_le32(WMI_PS_CFG_CMD_STATUS_ERROR);
-
 	rc = wmi_call(wil, WMI_PS_DEV_PROFILE_CFG_CMDID, vif->mid,
 		      &cmd, sizeof(cmd),
 		      WMI_PS_DEV_PROFILE_CFG_EVENTID, &reply, sizeof(reply),
@@ -2162,15 +2185,15 @@ int wmi_set_mgmt_retry(struct wil6210_priv *wil, u8 retry_short)
 	struct {
 		struct wmi_cmd_hdr wmi;
 		struct wmi_set_mgmt_retry_limit_event evt;
-	} __packed reply;
+	} __packed reply = {
+		.evt = {.status = WMI_FW_STATUS_FAILURE},
+	};
 
 	wil_dbg_wmi(wil, "Setting mgmt retry short %d\n", retry_short);
 
 	if (!test_bit(WMI_FW_CAPABILITY_MGMT_RETRY_LIMIT, wil->fw_capabilities))
 		return -ENOTSUPP;
 
-	reply.evt.status = WMI_FW_STATUS_FAILURE;
-
 	rc = wmi_call(wil, WMI_SET_MGMT_RETRY_LIMIT_CMDID, vif->mid,
 		      &cmd, sizeof(cmd),
 		      WMI_SET_MGMT_RETRY_LIMIT_EVENTID, &reply, sizeof(reply),
@@ -2201,7 +2224,7 @@ int wmi_get_mgmt_retry(struct wil6210_priv *wil, u8 *retry_short)
 	if (!test_bit(WMI_FW_CAPABILITY_MGMT_RETRY_LIMIT, wil->fw_capabilities))
 		return -ENOTSUPP;
 
-	reply.evt.mgmt_retry_limit = 0;
+	memset(&reply, 0, sizeof(reply));
 	rc = wmi_call(wil, WMI_GET_MGMT_RETRY_LIMIT_CMDID, vif->mid, NULL, 0,
 		      WMI_GET_MGMT_RETRY_LIMIT_EVENTID, &reply, sizeof(reply),
 		      100);
@@ -2284,14 +2307,15 @@ int wmi_suspend(struct wil6210_priv *wil)
 	struct {
 		struct wmi_cmd_hdr wmi;
 		struct wmi_traffic_suspend_event evt;
-	} __packed reply;
+	} __packed reply = {
+		.evt = {.status = WMI_TRAFFIC_SUSPEND_REJECTED_LINK_NOT_IDLE},
+	};
+
 	u32 suspend_to = WIL_WAIT_FOR_SUSPEND_RESUME_COMP;
 
 	wil->suspend_resp_rcvd = false;
 	wil->suspend_resp_comp = false;
 
-	reply.evt.status = WMI_TRAFFIC_SUSPEND_REJECTED_LINK_NOT_IDLE;
-
 	rc = wmi_call(wil, WMI_TRAFFIC_SUSPEND_CMDID, vif->mid,
 		      &cmd, sizeof(cmd),
 		      WMI_TRAFFIC_SUSPEND_EVENTID, &reply, sizeof(reply),
@@ -2367,10 +2391,11 @@ int wmi_resume(struct wil6210_priv *wil)
 	struct {
 		struct wmi_cmd_hdr wmi;
 		struct wmi_traffic_resume_event evt;
-	} __packed reply;
-
-	reply.evt.status = WMI_TRAFFIC_RESUME_FAILED;
-	reply.evt.resume_triggers = WMI_RESUME_TRIGGER_UNKNOWN;
+	} __packed reply = {
+		.evt = {.status = WMI_TRAFFIC_RESUME_FAILED,
+			.resume_triggers =
+				cpu_to_le32(WMI_RESUME_TRIGGER_UNKNOWN)},
+	};
 
 	rc = wmi_call(wil, WMI_TRAFFIC_RESUME_CMDID, vif->mid, NULL, 0,
 		      WMI_TRAFFIC_RESUME_EVENTID, &reply, sizeof(reply),
@@ -2396,7 +2421,9 @@ int wmi_port_allocate(struct wil6210_priv *wil, u8 mid,
 	struct {
 		struct wmi_cmd_hdr wmi;
 		struct wmi_port_allocated_event evt;
-	} __packed reply;
+	} __packed reply = {
+		.evt = {.status = WMI_FW_STATUS_FAILURE},
+	};
 
 	wil_dbg_misc(wil, "port allocate, mid %d iftype %d, mac %pM\n",
 		     mid, iftype, mac);
@@ -2421,8 +2448,6 @@ int wmi_port_allocate(struct wil6210_priv *wil, u8 mid,
 		return -EINVAL;
 	}
 
-	reply.evt.status = WMI_FW_STATUS_FAILURE;
-
 	rc = wmi_call(wil, WMI_PORT_ALLOCATE_CMDID, mid,
 		      &cmd, sizeof(cmd),
 		      WMI_PORT_ALLOCATED_EVENTID, &reply,
@@ -2449,12 +2474,12 @@ int wmi_port_delete(struct wil6210_priv *wil, u8 mid)
 	struct {
 		struct wmi_cmd_hdr wmi;
 		struct wmi_port_deleted_event evt;
-	} __packed reply;
+	} __packed reply = {
+		.evt = {.status = WMI_FW_STATUS_FAILURE},
+	};
 
 	wil_dbg_misc(wil, "port delete, mid %d\n", mid);
 
-	reply.evt.status = WMI_FW_STATUS_FAILURE;
-
 	rc = wmi_call(wil, WMI_PORT_DELETE_CMDID, mid,
 		      &cmd, sizeof(cmd),
 		      WMI_PORT_DELETED_EVENTID, &reply,
@@ -2711,7 +2736,9 @@ int wmi_start_sched_scan(struct wil6210_priv *wil,
 	struct {
 		struct wmi_cmd_hdr wmi;
 		struct wmi_start_sched_scan_event evt;
-	} __packed reply;
+	} __packed reply = {
+		.evt = {.result = WMI_PNO_REJECT},
+	};
 
 	if (!test_bit(WMI_FW_CAPABILITY_PNO, wil->fw_capabilities))
 		return -ENOTSUPP;
@@ -2727,8 +2754,6 @@ int wmi_start_sched_scan(struct wil6210_priv *wil,
 	wmi_sched_scan_set_plans(wil, &cmd,
 				 request->scan_plans, request->n_scan_plans);
 
-	reply.evt.result = WMI_PNO_REJECT;
-
 	rc = wmi_call(wil, WMI_START_SCHED_SCAN_CMDID, vif->mid,
 		      &cmd, sizeof(cmd),
 		      WMI_START_SCHED_SCAN_EVENTID, &reply, sizeof(reply),
@@ -2752,13 +2777,13 @@ int wmi_stop_sched_scan(struct wil6210_priv *wil)
 	struct {
 		struct wmi_cmd_hdr wmi;
 		struct wmi_stop_sched_scan_event evt;
-	} __packed reply;
+	} __packed reply = {
+		.evt = {.result = WMI_PNO_REJECT},
+	};
 
 	if (!test_bit(WMI_FW_CAPABILITY_PNO, wil->fw_capabilities))
 		return -ENOTSUPP;
 
-	reply.evt.result = WMI_PNO_REJECT;
-
 	rc = wmi_call(wil, WMI_STOP_SCHED_SCAN_CMDID, vif->mid, NULL, 0,
 		      WMI_STOP_SCHED_SCAN_EVENTID, &reply, sizeof(reply),
 		      WIL_WMI_CALL_GENERAL_TO_MS);
-- 
cgit v1.2.3-59-g8ed1b


From 37f8d26d8347f659e4677b4e438708ce492262bf Mon Sep 17 00:00:00 2001
From: Dedy Lansky <dlansky@codeaurora.org>
Date: Wed, 9 May 2018 13:06:58 +0300
Subject: wil6210: remove unused rx_reorder members

Remove unused members from struct wil_tid_ampdu_rx

Signed-off-by: Dedy Lansky <dlansky@codeaurora.org>
Signed-off-by: Maya Erez <merez@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wil6210/debugfs.c    |  3 +--
 drivers/net/wireless/ath/wil6210/rx_reorder.c |  7 +------
 drivers/net/wireless/ath/wil6210/wil6210.h    | 10 ----------
 3 files changed, 2 insertions(+), 18 deletions(-)

diff --git a/drivers/net/wireless/ath/wil6210/debugfs.c b/drivers/net/wireless/ath/wil6210/debugfs.c
index d3b1069ebf7a..524a7d689833 100644
--- a/drivers/net/wireless/ath/wil6210/debugfs.c
+++ b/drivers/net/wireless/ath/wil6210/debugfs.c
@@ -1379,8 +1379,7 @@ static void wil_print_rxtid(struct seq_file *s, struct wil_tid_ampdu_rx *r)
 	u16 index = ((r->head_seq_num - r->ssn) & 0xfff) % r->buf_size;
 	unsigned long long drop_dup = r->drop_dup, drop_old = r->drop_old;
 
-	seq_printf(s, "([%2d] %3d TU) 0x%03x [", r->buf_size, r->timeout,
-		   r->head_seq_num);
+	seq_printf(s, "([%2d]) 0x%03x [", r->buf_size, r->head_seq_num);
 	for (i = 0; i < r->buf_size; i++) {
 		if (i == index)
 			seq_printf(s, "%c", r->reorder_buf[i] ? 'O' : '|');
diff --git a/drivers/net/wireless/ath/wil6210/rx_reorder.c b/drivers/net/wireless/ath/wil6210/rx_reorder.c
index 14dcb0698dee..76f8084c1fd8 100644
--- a/drivers/net/wireless/ath/wil6210/rx_reorder.c
+++ b/drivers/net/wireless/ath/wil6210/rx_reorder.c
@@ -206,7 +206,6 @@ __acquires(&sta->tid_rx_lock) __releases(&sta->tid_rx_lock)
 
 	/* put the frame in the reordering buffer */
 	r->reorder_buf[index] = skb;
-	r->reorder_time[index] = jiffies;
 	r->stored_mpdu_num++;
 	wil_reorder_release(ndev, r);
 
@@ -252,11 +251,8 @@ struct wil_tid_ampdu_rx *wil_tid_ampdu_rx_alloc(struct wil6210_priv *wil,
 
 	r->reorder_buf =
 		kcalloc(size, sizeof(struct sk_buff *), GFP_KERNEL);
-	r->reorder_time =
-		kcalloc(size, sizeof(unsigned long), GFP_KERNEL);
-	if (!r->reorder_buf || !r->reorder_time) {
+	if (!r->reorder_buf) {
 		kfree(r->reorder_buf);
-		kfree(r->reorder_time);
 		kfree(r);
 		return NULL;
 	}
@@ -286,7 +282,6 @@ void wil_tid_ampdu_rx_free(struct wil6210_priv *wil,
 		kfree_skb(r->reorder_buf[i]);
 
 	kfree(r->reorder_buf);
-	kfree(r->reorder_time);
 	kfree(r);
 }
 
diff --git a/drivers/net/wireless/ath/wil6210/wil6210.h b/drivers/net/wireless/ath/wil6210/wil6210.h
index c9120dc8a435..b623510c6f6c 100644
--- a/drivers/net/wireless/ath/wil6210/wil6210.h
+++ b/drivers/net/wireless/ath/wil6210/wil6210.h
@@ -493,38 +493,28 @@ struct pci_dev;
  * struct tid_ampdu_rx - TID aggregation information (Rx).
  *
  * @reorder_buf: buffer to reorder incoming aggregated MPDUs
- * @reorder_time: jiffies when skb was added
- * @session_timer: check if peer keeps Tx-ing on the TID (by timeout value)
- * @reorder_timer: releases expired frames from the reorder buffer.
  * @last_rx: jiffies of last rx activity
  * @head_seq_num: head sequence number in reordering buffer.
  * @stored_mpdu_num: number of MPDUs in reordering buffer
  * @ssn: Starting Sequence Number expected to be aggregated.
  * @buf_size: buffer size for incoming A-MPDUs
- * @timeout: reset timer value (in TUs).
  * @ssn_last_drop: SSN of the last dropped frame
  * @total: total number of processed incoming frames
  * @drop_dup: duplicate frames dropped for this reorder buffer
  * @drop_old: old frames dropped for this reorder buffer
- * @dialog_token: dialog token for aggregation session
  * @first_time: true when this buffer used 1-st time
  */
 struct wil_tid_ampdu_rx {
 	struct sk_buff **reorder_buf;
-	unsigned long *reorder_time;
-	struct timer_list session_timer;
-	struct timer_list reorder_timer;
 	unsigned long last_rx;
 	u16 head_seq_num;
 	u16 stored_mpdu_num;
 	u16 ssn;
 	u16 buf_size;
-	u16 timeout;
 	u16 ssn_last_drop;
 	unsigned long long total; /* frames processed */
 	unsigned long long drop_dup;
 	unsigned long long drop_old;
-	u8 dialog_token;
 	bool first_time; /* is it 1-st time this buffer used? */
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 3d6b72729cc2933906de8d2c602ae05e920b2122 Mon Sep 17 00:00:00 2001
From: Dedy Lansky <dlansky@codeaurora.org>
Date: Wed, 9 May 2018 13:06:59 +0300
Subject: wil6210: rate limit wil_rx_refill error

wil_err inside wil_rx_refill can flood the log buffer.
Replace it with wil_err_ratelimited.

Signed-off-by: Dedy Lansky <dlansky@codeaurora.org>
Signed-off-by: Maya Erez <merez@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wil6210/txrx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/ath/wil6210/txrx.c b/drivers/net/wireless/ath/wil6210/txrx.c
index 411130a4f2ed..b9a9fa828961 100644
--- a/drivers/net/wireless/ath/wil6210/txrx.c
+++ b/drivers/net/wireless/ath/wil6210/txrx.c
@@ -652,8 +652,8 @@ static int wil_rx_refill(struct wil6210_priv *wil, int count)
 			v->swtail = next_tail) {
 		rc = wil_vring_alloc_skb(wil, v, v->swtail, headroom);
 		if (unlikely(rc)) {
-			wil_err(wil, "Error %d in wil_rx_refill[%d]\n",
-				rc, v->swtail);
+			wil_err_ratelimited(wil, "Error %d in rx refill[%d]\n",
+					    rc, v->swtail);
 			break;
 		}
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 289e1f4e9e4a09c73a1c0152bb93855ea351ccda Mon Sep 17 00:00:00 2001
From: Anders Roxell <anders.roxell@linaro.org>
Date: Sun, 13 May 2018 21:48:30 +0200
Subject: net: ipv4: ipconfig: fix unused variable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When CONFIG_PROC_FS isn't set, variable ipconfig_dir isn't used.
net/ipv4/ipconfig.c:167:31: warning: ‘ipconfig_dir’ defined but not used [-Wunused-variable]
 static struct proc_dir_entry *ipconfig_dir;
                               ^~~~~~~~~~~~
Move the declaration of ipconfig_dir inside the CONFIG_PROC_FS ifdef to
fix the warning.

Fixes: c04d2cb2009f ("ipconfig: Write NTP server IPs to /proc/net/ipconfig/ntp_servers")
Signed-off-by: Anders Roxell <anders.roxell@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipconfig.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index d839d74853fc..86c9f755de3d 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -163,9 +163,6 @@ static u8 ic_domain[64];		/* DNS (not NIS) domain name */
  * Private state.
  */
 
-/* proc_dir_entry for /proc/net/ipconfig */
-static struct proc_dir_entry *ipconfig_dir;
-
 /* Name of user-selected boot device */
 static char user_dev_name[IFNAMSIZ] __initdata = { 0, };
 
@@ -1292,6 +1289,8 @@ static int __init ic_dynamic(void)
 #endif /* IPCONFIG_DYNAMIC */
 
 #ifdef CONFIG_PROC_FS
+/* proc_dir_entry for /proc/net/ipconfig */
+static struct proc_dir_entry *ipconfig_dir;
 
 /* Name servers: */
 static int pnp_seq_show(struct seq_file *seq, void *v)
-- 
cgit v1.2.3-59-g8ed1b


From 8c46fcd783082118e72ebbe1937b93bb7df8a63c Mon Sep 17 00:00:00 2001
From: Marek Lindner <mareklindner@neomailbox.ch>
Date: Mon, 14 May 2018 06:38:49 +0800
Subject: batman-adv: disable ethtool link speed detection when auto
 negotiation off

Virtual interface drivers such as tun / tap interfaces, VLAN, etc tend
to initialize the interface throughput with some value for the sake of
having a throughput number to export via ethtool. This exported
throughput leaves batman-adv to conclude the interface throughput is
genuine (reflecting reality), thus no measurements are necessary.

Based on the observation that those interface types also tend to set
the link auto-negotiation to 'off', batman-adv shall check this
setting to differentiate between genuine link throughput information
and placeholders installed by virtual interfaces.

The "default throughput" setting exported via sysfs still allows to
configure the batman-adv throughput for the interface, thus disabling
the measurements.

Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Acked-by: Antonio Quartulli <a@unstable.cc>
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/bat_v_elp.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index 28687493599f..71c20c1d4002 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -127,7 +127,20 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh)
 	rtnl_lock();
 	ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings);
 	rtnl_unlock();
-	if (ret == 0) {
+
+	/* Virtual interface drivers such as tun / tap interfaces, VLAN, etc
+	 * tend to initialize the interface throughput with some value for the
+	 * sake of having a throughput number to export via ethtool. This
+	 * exported throughput leaves batman-adv to conclude the interface
+	 * throughput is genuine (reflecting reality), thus no measurements
+	 * are necessary.
+	 *
+	 * Based on the observation that those interface types also tend to set
+	 * the link auto-negotiation to 'off', batman-adv shall check this
+	 * setting to differentiate between genuine link throughput information
+	 * and placeholders installed by virtual interfaces.
+	 */
+	if (ret == 0 && link_settings.base.autoneg == AUTONEG_ENABLE) {
 		/* link characteristics might change over time */
 		if (link_settings.base.duplex == DUPLEX_FULL)
 			hard_iface->bat_v.flags |= BATADV_FULL_DUPLEX;
-- 
cgit v1.2.3-59-g8ed1b


From 18cfb44cf1845d11d50f4733e039acb8b377c8eb Mon Sep 17 00:00:00 2001
From: Marek Lindner <mareklindner@neomailbox.ch>
Date: Mon, 14 May 2018 06:42:59 +0800
Subject: batman-adv: enable B.A.T.M.A.N. V compilation by default

Signed-off-by: Marek Lindner <mareklindner@neomailbox.ch>
Acked-by: Antonio Quartulli <a@unstable.cc>
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index bee034a95208..de8034d80623 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -35,7 +35,7 @@ config BATMAN_ADV
 config BATMAN_ADV_BATMAN_V
 	bool "B.A.T.M.A.N. V protocol (experimental)"
 	depends on BATMAN_ADV && !(CFG80211=m && BATMAN_ADV=y)
-	default n
+	default y
 	help
 	  This option enables the B.A.T.M.A.N. V protocol, the successor
 	  of the currently used B.A.T.M.A.N. IV protocol. The main
-- 
cgit v1.2.3-59-g8ed1b


From 0fccb85ad21d65c6ee1ea912acc3f0ae4df31876 Mon Sep 17 00:00:00 2001
From: Bruce Allan <bruce.w.allan@intel.com>
Date: Thu, 10 May 2018 05:59:39 -0700
Subject: virtchnl: Whitespace and parenthesis cleanup

Clean up existing instances of unnecessary parentheses in if
statement and change order of conditionals to make it easier to read

The opening /* should be followed by a single space and the closing */
should be preceded with a single space.

Signed-off-by: Bruce Allan <bruce.w.allan@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 include/linux/avf/virtchnl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index b0a7f315bfbe..212b3822d180 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -485,7 +485,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(6, virtchnl_rss_key);
 struct virtchnl_rss_lut {
 	u16 vsi_id;
 	u16 lut_entries;
-	u8 lut[1];        /* RSS lookup table*/
+	u8 lut[1];        /* RSS lookup table */
 };
 
 VIRTCHNL_CHECK_STRUCT_LEN(6, virtchnl_rss_lut);
@@ -819,7 +819,7 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode,
 		return VIRTCHNL_ERR_PARAM;
 	}
 	/* few more checks */
-	if ((valid_len != msglen) || (err_msg_format))
+	if (err_msg_format || valid_len != msglen)
 		return VIRTCHNL_STATUS_ERR_OPCODE_MISMATCH;
 
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From e691b771aa7f660c42697c15b86ada2180e68280 Mon Sep 17 00:00:00 2001
From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Thu, 10 May 2018 05:59:40 -0700
Subject: i40evf: Fix client header define

Fix up the VF client header define, since it is the same as the PF
client header.

Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
---
 drivers/net/ethernet/intel/i40evf/i40evf_client.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_client.h b/drivers/net/ethernet/intel/i40evf/i40evf_client.h
index fc6592c3de9c..5585f362048a 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_client.h
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_client.h
@@ -1,8 +1,8 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright(c) 2013 - 2018 Intel Corporation. */
 
-#ifndef _I40E_CLIENT_H_
-#define _I40E_CLIENT_H_
+#ifndef _I40EVF_CLIENT_H_
+#define _I40EVF_CLIENT_H_
 
 #define I40EVF_CLIENT_STR_LENGTH 10
 
@@ -166,4 +166,4 @@ struct i40e_client {
 /* used by clients */
 int i40evf_register_client(struct i40e_client *client);
 int i40evf_unregister_client(struct i40e_client *client);
-#endif /* _I40E_CLIENT_H_ */
+#endif /* _I40EVF_CLIENT_H_ */
-- 
cgit v1.2.3-59-g8ed1b


From 0ded9c61c13746991cbcc098cf00715526627e9a Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Thu, 10 May 2018 05:59:41 -0700
Subject: i40e: calculate ethtool stats size in a separate function

Use a separate function to calculate the number of stats for
a particular device. This helps reduce the clutter in
i40e_get_sset_count().

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 28 +++++++++++++++++---------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index fc6a5eef141c..a62142f033d2 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -1658,6 +1658,23 @@ done:
 	return err;
 }
 
+static int i40e_get_stats_count(struct net_device *netdev)
+{
+	struct i40e_netdev_priv *np = netdev_priv(netdev);
+	struct i40e_vsi *vsi = np->vsi;
+	struct i40e_pf *pf = vsi->back;
+
+	if (vsi == pf->vsi[pf->lan_vsi] && pf->hw.partition_id == 1) {
+		if (pf->lan_veb != I40E_NO_VEB &&
+		    pf->flags & I40E_FLAG_VEB_STATS_ENABLED)
+			return I40E_PF_STATS_LEN(netdev) + I40E_VEB_STATS_TOTAL;
+		else
+			return I40E_PF_STATS_LEN(netdev);
+	} else {
+		return I40E_VSI_STATS_LEN(netdev);
+	}
+}
+
 static int i40e_get_sset_count(struct net_device *netdev, int sset)
 {
 	struct i40e_netdev_priv *np = netdev_priv(netdev);
@@ -1668,16 +1685,7 @@ static int i40e_get_sset_count(struct net_device *netdev, int sset)
 	case ETH_SS_TEST:
 		return I40E_TEST_LEN;
 	case ETH_SS_STATS:
-		if (vsi == pf->vsi[pf->lan_vsi] && pf->hw.partition_id == 1) {
-			int len = I40E_PF_STATS_LEN(netdev);
-
-			if ((pf->lan_veb != I40E_NO_VEB) &&
-			    (pf->flags & I40E_FLAG_VEB_STATS_ENABLED))
-				len += I40E_VEB_STATS_TOTAL;
-			return len;
-		} else {
-			return I40E_VSI_STATS_LEN(netdev);
-		}
+		return i40e_get_stats_count(netdev);
 	case ETH_SS_PRIV_FLAGS:
 		return I40E_PRIV_FLAGS_STR_LEN +
 			(pf->hw.pf_id == 0 ? I40E_GL_PRIV_FLAGS_STR_LEN : 0);
-- 
cgit v1.2.3-59-g8ed1b


From 7e20188176604bf10ac396efe31ddf39d3eb8cf4 Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Thu, 10 May 2018 05:59:42 -0700
Subject: i40e: remove duplicate pfc stats

The pfc related priority stats are already handled separately as these
stats are actually arrays of length I40E_MAX_USER_PRIORITY. Thus,
including them within i40e_gstrings_stats will just duplicate data.

Worse, the sizeof will be incorrect, as it will be the total size of the
stat arrays, which in this case is 8 * sizeof(u64), so we will only copy
the stat contents as if they were a u32.

Since we already correctly handle these stats else where, remove them
from the i40e_gstrings_stats.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index a62142f033d2..0f237397f52b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -103,10 +103,6 @@ static const struct i40e_stats i40e_gstrings_stats[] = {
 	I40E_PF_STAT("link_xoff_rx", stats.link_xoff_rx),
 	I40E_PF_STAT("link_xon_tx", stats.link_xon_tx),
 	I40E_PF_STAT("link_xoff_tx", stats.link_xoff_tx),
-	I40E_PF_STAT("priority_xon_rx", stats.priority_xon_rx),
-	I40E_PF_STAT("priority_xoff_rx", stats.priority_xoff_rx),
-	I40E_PF_STAT("priority_xon_tx", stats.priority_xon_tx),
-	I40E_PF_STAT("priority_xoff_tx", stats.priority_xoff_tx),
 	I40E_PF_STAT("rx_size_64", stats.rx_size_64),
 	I40E_PF_STAT("rx_size_127", stats.rx_size_127),
 	I40E_PF_STAT("rx_size_255", stats.rx_size_255),
-- 
cgit v1.2.3-59-g8ed1b


From 132ee00eed22e540aadbd111c2f9619c9bcc6413 Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Thu, 10 May 2018 05:59:43 -0700
Subject: i40e: cleanup whitespace for some ethtool stat definitions

A future patch is going to refactor some of the ethtool statistic code.
To keep the patches easy to review, cleanup some of the indentation used
for macro definitions first.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 0f237397f52b..e6e58e8404c5 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -19,13 +19,13 @@ struct i40e_stats {
 }
 
 #define I40E_NETDEV_STAT(_net_stat) \
-		I40E_STAT(struct rtnl_link_stats64, #_net_stat, _net_stat)
+	I40E_STAT(struct rtnl_link_stats64, #_net_stat, _net_stat)
 #define I40E_PF_STAT(_name, _stat) \
-		I40E_STAT(struct i40e_pf, _name, _stat)
+	I40E_STAT(struct i40e_pf, _name, _stat)
 #define I40E_VSI_STAT(_name, _stat) \
-		I40E_STAT(struct i40e_vsi, _name, _stat)
+	I40E_STAT(struct i40e_vsi, _name, _stat)
 #define I40E_VEB_STAT(_name, _stat) \
-		I40E_STAT(struct i40e_veb, _name, _stat)
+	I40E_STAT(struct i40e_veb, _name, _stat)
 
 static const struct i40e_stats i40e_gstrings_net_stats[] = {
 	I40E_NETDEV_STAT(rx_packets),
@@ -144,9 +144,9 @@ static const struct i40e_stats i40e_gstrings_stats[] = {
 	    * 2 /* Tx and Rx together */                                     \
 	    * (sizeof(struct i40e_queue_stats) / sizeof(u64)))
 #define I40E_GLOBAL_STATS_LEN	ARRAY_SIZE(i40e_gstrings_stats)
-#define I40E_NETDEV_STATS_LEN   ARRAY_SIZE(i40e_gstrings_net_stats)
+#define I40E_NETDEV_STATS_LEN	ARRAY_SIZE(i40e_gstrings_net_stats)
 #define I40E_MISC_STATS_LEN	ARRAY_SIZE(i40e_gstrings_misc_stats)
-#define I40E_VSI_STATS_LEN(n)   (I40E_NETDEV_STATS_LEN + \
+#define I40E_VSI_STATS_LEN(n)	(I40E_NETDEV_STATS_LEN + \
 				 I40E_MISC_STATS_LEN + \
 				 I40E_QUEUE_STATS_LEN((n)))
 #define I40E_PFC_STATS_LEN ( \
-- 
cgit v1.2.3-59-g8ed1b


From ca12c9d4213fe8203d58632cd8ad25b5abcc7ef2 Mon Sep 17 00:00:00 2001
From: Patryk Małek <patryk.malek@intel.com>
Date: Thu, 10 May 2018 05:59:44 -0700
Subject: i40e: Fix recalculation of MSI-X vectors for VMDq
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds a recalculation of number of MSI-X
vectors for VMDq in the case where we have less
vectors available than we would want to reserve for
VMDq.

It fixes the issue where we recalculate vectors left
and vectors wanted but we didn't take into account
the reduced number of queue pairs per VSI.

Signed-off-by: Patryk Małek <patryk.malek@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index c8659fbd7111..f17867ab9a90 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -10309,21 +10309,28 @@ static int i40e_init_msix(struct i40e_pf *pf)
 
 	/* any vectors left over go for VMDq support */
 	if (pf->flags & I40E_FLAG_VMDQ_ENABLED) {
-		int vmdq_vecs_wanted = pf->num_vmdq_vsis * pf->num_vmdq_qps;
-		int vmdq_vecs = min_t(int, vectors_left, vmdq_vecs_wanted);
-
 		if (!vectors_left) {
 			pf->num_vmdq_msix = 0;
 			pf->num_vmdq_qps = 0;
 		} else {
+			int vmdq_vecs_wanted =
+				pf->num_vmdq_vsis * pf->num_vmdq_qps;
+			int vmdq_vecs =
+				min_t(int, vectors_left, vmdq_vecs_wanted);
+
 			/* if we're short on vectors for what's desired, we limit
 			 * the queues per vmdq.  If this is still more than are
 			 * available, the user will need to change the number of
 			 * queues/vectors used by the PF later with the ethtool
 			 * channels command
 			 */
-			if (vmdq_vecs < vmdq_vecs_wanted)
+			if (vectors_left < vmdq_vecs_wanted) {
 				pf->num_vmdq_qps = 1;
+				vmdq_vecs_wanted = pf->num_vmdq_vsis;
+				vmdq_vecs = min_t(int,
+						  vectors_left,
+						  vmdq_vecs_wanted);
+			}
 			pf->num_vmdq_msix = pf->num_vmdq_qps;
 
 			v_budget += vmdq_vecs;
-- 
cgit v1.2.3-59-g8ed1b


From 3f76d01f3efeeafe2e9f3c74074d6eecff201466 Mon Sep 17 00:00:00 2001
From: Harshitha Ramamurthy <harshitha.ramamurthy@intel.com>
Date: Thu, 10 May 2018 05:59:45 -0700
Subject: i40e: add tx_busy to ethtool stats

This patch adds the tx_busy stat to the ethtool stats. The tx_busy
stat tracks the number of times we return NETDEV_TX_BUSY to the stack
during transmit.

Signed-off-by: Harshitha Ramamurthy <harshitha.ramamurthy@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index e6e58e8404c5..329e59eae4a1 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -66,6 +66,7 @@ static const struct i40e_stats i40e_gstrings_misc_stats[] = {
 	I40E_VSI_STAT("rx_unknown_protocol", eth_stats.rx_unknown_protocol),
 	I40E_VSI_STAT("tx_linearize", tx_linearize),
 	I40E_VSI_STAT("tx_force_wb", tx_force_wb),
+	I40E_VSI_STAT("tx_busy", tx_busy),
 	I40E_VSI_STAT("rx_alloc_fail", rx_buf_failed),
 	I40E_VSI_STAT("rx_pg_alloc_fail", rx_page_failed),
 };
-- 
cgit v1.2.3-59-g8ed1b


From aa4a0654035beaf8c64eddd0b254c299af548f4d Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Thu, 10 May 2018 05:59:46 -0700
Subject: i40evf: remove MAX_QUEUES and just use I40EVF_MAX_REQ_QUEUES

We don't really need to have separate definitions for MAX_QUEUES and
I40EVF_MAX_REQ_QUEUES, since we'll always be limited by how many queues
we request anyways. If we haven't enabled requesting the maximum number
of queues, there's no reason to have our call to alloc_etherdev_mq
actually pass the higher value, since we'd never enable those queues
anyways.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40evf/i40evf.h      | 1 -
 drivers/net/ethernet/intel/i40evf/i40evf_main.c | 5 +++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40evf/i40evf.h b/drivers/net/ethernet/intel/i40evf/i40evf.h
index 98b834932dd3..96e537a35000 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf.h
+++ b/drivers/net/ethernet/intel/i40evf/i40evf.h
@@ -81,7 +81,6 @@ struct i40e_vsi {
 #define I40E_TX_DESC(R, i) (&(((struct i40e_tx_desc *)((R)->desc))[i]))
 #define I40E_TX_CTXTDESC(R, i) \
 	(&(((struct i40e_tx_context_desc *)((R)->desc))[i]))
-#define MAX_QUEUES 16
 #define I40EVF_MAX_REQ_QUEUES 4
 
 #define I40EVF_HKEY_ARRAY_SIZE ((I40E_VFQF_HKEY_MAX_INDEX + 1) * 4)
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
index 3f04a182903d..95a222d7ae43 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
@@ -2331,7 +2331,7 @@ static int i40evf_validate_ch_config(struct i40evf_adapter *adapter,
 		total_max_rate += tx_rate;
 		num_qps += mqprio_qopt->qopt.count[i];
 	}
-	if (num_qps > MAX_QUEUES)
+	if (num_qps > I40EVF_MAX_REQ_QUEUES)
 		return -EINVAL;
 
 	ret = i40evf_validate_tx_bandwidth(adapter, total_max_rate);
@@ -3689,7 +3689,8 @@ static int i40evf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	pci_set_master(pdev);
 
-	netdev = alloc_etherdev_mq(sizeof(struct i40evf_adapter), MAX_QUEUES);
+	netdev = alloc_etherdev_mq(sizeof(struct i40evf_adapter),
+				   I40EVF_MAX_REQ_QUEUES);
 	if (!netdev) {
 		err = -ENOMEM;
 		goto err_alloc_etherdev;
-- 
cgit v1.2.3-59-g8ed1b


From 9c0c3b83d32ed9678bcde50743497571b61759da Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Thu, 10 May 2018 05:59:47 -0700
Subject: i40e: cleanup wording in a header comment

Fix up the English in the header comment for i40e_ptp_tx_hang.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_ptp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ptp.c b/drivers/net/ethernet/intel/i40e/i40e_ptp.c
index aa3daec2049d..6706141a5ccd 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ptp.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ptp.c
@@ -317,7 +317,7 @@ void i40e_ptp_rx_hang(struct i40e_pf *pf)
  * This watchdog task is run periodically to make sure that we clear the Tx
  * timestamp logic if we don't obtain a timestamp in a reasonable amount of
  * time. It is unexpected in the normal case but if it occurs it results in
- * permanently prevent timestamps of future packets
+ * permanently preventing timestamps of future packets.
  **/
 void i40e_ptp_tx_hang(struct i40e_pf *pf)
 {
-- 
cgit v1.2.3-59-g8ed1b


From c79756cb5f084736b138da9319a02f7c72644548 Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Thu, 10 May 2018 05:59:48 -0700
Subject: i40e: free the skb after clearing the bitlock

In commit bbc4e7d273b5 ("i40e: fix race condition with PTP_TX_IN_PROGRESS
bits") we modified the code which handles Tx timestamps so that we would
clear the progress bit as soon as possible.

A later commit 0bc0706b46cd ("i40e: check for Tx timestamp timeouts during
watchdog") introduced similar code for detecting and handling cleanup of
a blocked Tx timestamp. This code did not use the same pattern for cleaning
up the skb.

Update this code to wait to free the skb until after the bit lock is
free, by first setting the ptp_tx_skb to NULL and clearing the lock.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_ptp.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ptp.c b/drivers/net/ethernet/intel/i40e/i40e_ptp.c
index 6706141a5ccd..d50d84927e6b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ptp.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ptp.c
@@ -321,6 +321,8 @@ void i40e_ptp_rx_hang(struct i40e_pf *pf)
  **/
 void i40e_ptp_tx_hang(struct i40e_pf *pf)
 {
+	struct sk_buff *skb;
+
 	if (!(pf->flags & I40E_FLAG_PTP) || !pf->ptp_tx)
 		return;
 
@@ -333,9 +335,12 @@ void i40e_ptp_tx_hang(struct i40e_pf *pf)
 	 * within a second it is reasonable to assume that we never will.
 	 */
 	if (time_is_before_jiffies(pf->ptp_tx_start + HZ)) {
-		dev_kfree_skb_any(pf->ptp_tx_skb);
+		skb = pf->ptp_tx_skb;
 		pf->ptp_tx_skb = NULL;
 		clear_bit_unlock(__I40E_PTP_TX_IN_PROGRESS, pf->state);
+
+		/* Free the skb after we clear the bitlock */
+		dev_kfree_skb_any(skb);
 		pf->tx_hwtstamp_timeouts++;
 	}
 }
-- 
cgit v1.2.3-59-g8ed1b


From 27392e57189e6e6f77e78fe9aeb3a2c7e2ccbdf4 Mon Sep 17 00:00:00 2001
From: Paweł Jabłoński <pawel.jablonski@intel.com>
Date: Thu, 10 May 2018 05:59:49 -0700
Subject: i40evf: Fix a hardware reset support in VF driver
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch fixes a hardware reset support in VF driver.
It is needed because when a hardware reset is detected
adapter->state is in __I40EVF_RESETTING state before
i40evf_reset_task is called. Without this patch
unloading VF driver after a hardware reset ends
with a system crash.

Signed-off-by: Paweł Jabłoński <pawel.jablonski@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c     | 21 +++++++++++++++++++--
 drivers/net/ethernet/intel/i40evf/i40evf_main.c |  3 ++-
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index f17867ab9a90..b5daa5c9c7de 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -2840,6 +2840,23 @@ static int i40e_vlan_rx_add_vid(struct net_device *netdev,
 	return ret;
 }
 
+/**
+ * i40e_vlan_rx_add_vid_up - Add a vlan id filter to HW offload in UP path
+ * @netdev: network interface to be adjusted
+ * @proto: unused protocol value
+ * @vid: vlan id to be added
+ **/
+static void i40e_vlan_rx_add_vid_up(struct net_device *netdev,
+				    __always_unused __be16 proto, u16 vid)
+{
+	struct i40e_netdev_priv *np = netdev_priv(netdev);
+	struct i40e_vsi *vsi = np->vsi;
+
+	if (vid >= VLAN_N_VID)
+		return;
+	set_bit(vid, vsi->active_vlans);
+}
+
 /**
  * i40e_vlan_rx_kill_vid - Remove a vlan id filter from HW offload
  * @netdev: network interface to be adjusted
@@ -2882,8 +2899,8 @@ static void i40e_restore_vlan(struct i40e_vsi *vsi)
 		i40e_vlan_stripping_disable(vsi);
 
 	for_each_set_bit(vid, vsi->active_vlans, VLAN_N_VID)
-		i40e_vlan_rx_add_vid(vsi->netdev, htons(ETH_P_8021Q),
-				     vid);
+		i40e_vlan_rx_add_vid_up(vsi->netdev, htons(ETH_P_8021Q),
+					vid);
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
index 95a222d7ae43..a7b87f935411 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
@@ -1925,7 +1925,8 @@ continue_reset:
 	 * ndo_open() returning, so we can't assume it means all our open
 	 * tasks have finished, since we're not holding the rtnl_lock here.
 	 */
-	running = (adapter->state == __I40EVF_RUNNING);
+	running = ((adapter->state == __I40EVF_RUNNING) ||
+		   (adapter->state == __I40EVF_RESETTING));
 
 	if (running) {
 		netif_carrier_off(netdev);
-- 
cgit v1.2.3-59-g8ed1b


From 2724273e8fd00b512596a77ee063f49b25f36507 Mon Sep 17 00:00:00 2001
From: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Date: Wed, 2 May 2018 15:17:17 +0530
Subject: vmcore: add API to collect hardware dump in second kernel

The sequence of actions done by device drivers to append their device
specific hardware/firmware logs to /proc/vmcore are as follows:

1. During probe (before hardware is initialized), device drivers
register to the vmcore module (via vmcore_add_device_dump()), with
callback function, along with buffer size and log name needed for
firmware/hardware log collection.

2. vmcore module allocates the buffer with requested size. It adds
an Elf note and invokes the device driver's registered callback
function.

3. Device driver collects all hardware/firmware logs into the buffer
and returns control back to vmcore module.

Ensure that the device dump buffer size is always aligned to page size
so that it can be mmaped.

Also, rename alloc_elfnotes_buf() to vmcore_alloc_buf() to make it more
generic and reserve NT_VMCOREDD note type to indicate vmcore device
dump.

Suggested-by: Eric Biederman <ebiederm@xmission.com>.
Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/proc/Kconfig             |  15 +++++
 fs/proc/vmcore.c            | 135 +++++++++++++++++++++++++++++++++++++++++---
 include/linux/crash_dump.h  |  18 ++++++
 include/linux/kcore.h       |   6 ++
 include/uapi/linux/elf.h    |   1 +
 include/uapi/linux/vmcore.h |  18 ++++++
 6 files changed, 184 insertions(+), 9 deletions(-)
 create mode 100644 include/uapi/linux/vmcore.h

diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 1ade1206bb89..0eaeb41453f5 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -43,6 +43,21 @@ config PROC_VMCORE
         help
         Exports the dump image of crashed kernel in ELF format.
 
+config PROC_VMCORE_DEVICE_DUMP
+	bool "Device Hardware/Firmware Log Collection"
+	depends on PROC_VMCORE
+	default n
+	help
+	  After kernel panic, device drivers can collect the device
+	  specific snapshot of their hardware or firmware before the
+	  underlying devices are initialized in crash recovery kernel.
+	  Note that the device driver must be present in the crash
+	  recovery kernel's initramfs to collect its underlying device
+	  snapshot.
+
+	  If you say Y here, the collected device dumps will be added
+	  as ELF notes to /proc/vmcore.
+
 config PROC_SYSCTL
 	bool "Sysctl support (/proc/sys)" if EXPERT
 	depends on PROC_FS
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index a45f0af22a60..abb3dba0fa49 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -20,6 +20,7 @@
 #include <linux/init.h>
 #include <linux/crash_dump.h>
 #include <linux/list.h>
+#include <linux/mutex.h>
 #include <linux/vmalloc.h>
 #include <linux/pagemap.h>
 #include <linux/uaccess.h>
@@ -44,6 +45,12 @@ static u64 vmcore_size;
 
 static struct proc_dir_entry *proc_vmcore;
 
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+/* Device Dump list and mutex to synchronize access to list */
+static LIST_HEAD(vmcoredd_list);
+static DEFINE_MUTEX(vmcoredd_mutex);
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
 /*
  * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
  * The called function has to take care of module refcounting.
@@ -302,10 +309,8 @@ static const struct vm_operations_struct vmcore_mmap_ops = {
 };
 
 /**
- * alloc_elfnotes_buf - allocate buffer for ELF note segment in
- *                      vmalloc memory
- *
- * @notes_sz: size of buffer
+ * vmcore_alloc_buf - allocate buffer in vmalloc memory
+ * @sizez: size of buffer
  *
  * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
  * the buffer to user-space by means of remap_vmalloc_range().
@@ -313,12 +318,12 @@ static const struct vm_operations_struct vmcore_mmap_ops = {
  * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is
  * disabled and there's no need to allow users to mmap the buffer.
  */
-static inline char *alloc_elfnotes_buf(size_t notes_sz)
+static inline char *vmcore_alloc_buf(size_t size)
 {
 #ifdef CONFIG_MMU
-	return vmalloc_user(notes_sz);
+	return vmalloc_user(size);
 #else
-	return vzalloc(notes_sz);
+	return vzalloc(size);
 #endif
 }
 
@@ -665,7 +670,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
 		return rc;
 
 	*notes_sz = roundup(phdr_sz, PAGE_SIZE);
-	*notes_buf = alloc_elfnotes_buf(*notes_sz);
+	*notes_buf = vmcore_alloc_buf(*notes_sz);
 	if (!*notes_buf)
 		return -ENOMEM;
 
@@ -851,7 +856,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
 		return rc;
 
 	*notes_sz = roundup(phdr_sz, PAGE_SIZE);
-	*notes_buf = alloc_elfnotes_buf(*notes_sz);
+	*notes_buf = vmcore_alloc_buf(*notes_sz);
 	if (!*notes_buf)
 		return -ENOMEM;
 
@@ -1145,6 +1150,115 @@ static int __init parse_crash_elf_headers(void)
 	return 0;
 }
 
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+/**
+ * vmcoredd_write_header - Write vmcore device dump header at the
+ * beginning of the dump's buffer.
+ * @buf: Output buffer where the note is written
+ * @data: Dump info
+ * @size: Size of the dump
+ *
+ * Fills beginning of the dump's buffer with vmcore device dump header.
+ */
+static void vmcoredd_write_header(void *buf, struct vmcoredd_data *data,
+				  u32 size)
+{
+	struct vmcoredd_header *vdd_hdr = (struct vmcoredd_header *)buf;
+
+	vdd_hdr->n_namesz = sizeof(vdd_hdr->name);
+	vdd_hdr->n_descsz = size + sizeof(vdd_hdr->dump_name);
+	vdd_hdr->n_type = NT_VMCOREDD;
+
+	strncpy((char *)vdd_hdr->name, VMCOREDD_NOTE_NAME,
+		sizeof(vdd_hdr->name));
+	memcpy(vdd_hdr->dump_name, data->dump_name, sizeof(vdd_hdr->dump_name));
+}
+
+/**
+ * vmcore_add_device_dump - Add a buffer containing device dump to vmcore
+ * @data: dump info.
+ *
+ * Allocate a buffer and invoke the calling driver's dump collect routine.
+ * Write Elf note at the beginning of the buffer to indicate vmcore device
+ * dump and add the dump to global list.
+ */
+int vmcore_add_device_dump(struct vmcoredd_data *data)
+{
+	struct vmcoredd_node *dump;
+	void *buf = NULL;
+	size_t data_size;
+	int ret;
+
+	if (!data || !strlen(data->dump_name) ||
+	    !data->vmcoredd_callback || !data->size)
+		return -EINVAL;
+
+	dump = vzalloc(sizeof(*dump));
+	if (!dump) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	/* Keep size of the buffer page aligned so that it can be mmaped */
+	data_size = roundup(sizeof(struct vmcoredd_header) + data->size,
+			    PAGE_SIZE);
+
+	/* Allocate buffer for driver's to write their dumps */
+	buf = vmcore_alloc_buf(data_size);
+	if (!buf) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	vmcoredd_write_header(buf, data, data_size -
+			      sizeof(struct vmcoredd_header));
+
+	/* Invoke the driver's dump collection routing */
+	ret = data->vmcoredd_callback(data, buf +
+				      sizeof(struct vmcoredd_header));
+	if (ret)
+		goto out_err;
+
+	dump->buf = buf;
+	dump->size = data_size;
+
+	/* Add the dump to driver sysfs list */
+	mutex_lock(&vmcoredd_mutex);
+	list_add_tail(&dump->list, &vmcoredd_list);
+	mutex_unlock(&vmcoredd_mutex);
+
+	return 0;
+
+out_err:
+	if (buf)
+		vfree(buf);
+
+	if (dump)
+		vfree(dump);
+
+	return ret;
+}
+EXPORT_SYMBOL(vmcore_add_device_dump);
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
+/* Free all dumps in vmcore device dump list */
+static void vmcore_free_device_dumps(void)
+{
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+	mutex_lock(&vmcoredd_mutex);
+	while (!list_empty(&vmcoredd_list)) {
+		struct vmcoredd_node *dump;
+
+		dump = list_first_entry(&vmcoredd_list, struct vmcoredd_node,
+					list);
+		list_del(&dump->list);
+		vfree(dump->buf);
+		vfree(dump);
+	}
+	mutex_unlock(&vmcoredd_mutex);
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+}
+
 /* Init function for vmcore module. */
 static int __init vmcore_init(void)
 {
@@ -1192,4 +1306,7 @@ void vmcore_cleanup(void)
 		kfree(m);
 	}
 	free_elfcorebuf();
+
+	/* clear vmcore device dump list */
+	vmcore_free_device_dumps();
 }
diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index f7ac2aa93269..3e4ba9d753c8 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -5,6 +5,7 @@
 #include <linux/kexec.h>
 #include <linux/proc_fs.h>
 #include <linux/elf.h>
+#include <uapi/linux/vmcore.h>
 
 #include <asm/pgtable.h> /* for pgprot_t */
 
@@ -93,4 +94,21 @@ static inline bool is_kdump_kernel(void) { return 0; }
 #endif /* CONFIG_CRASH_DUMP */
 
 extern unsigned long saved_max_pfn;
+
+/* Device Dump information to be filled by drivers */
+struct vmcoredd_data {
+	char dump_name[VMCOREDD_MAX_NAME_BYTES]; /* Unique name of the dump */
+	unsigned int size;                       /* Size of the dump */
+	/* Driver's registered callback to be invoked to collect dump */
+	int (*vmcoredd_callback)(struct vmcoredd_data *data, void *buf);
+};
+
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+int vmcore_add_device_dump(struct vmcoredd_data *data);
+#else
+static inline int vmcore_add_device_dump(struct vmcoredd_data *data)
+{
+	return -EOPNOTSUPP;
+}
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
 #endif /* LINUX_CRASHDUMP_H */
diff --git a/include/linux/kcore.h b/include/linux/kcore.h
index 80db19d3a505..8de55e4b5ee9 100644
--- a/include/linux/kcore.h
+++ b/include/linux/kcore.h
@@ -28,6 +28,12 @@ struct vmcore {
 	loff_t offset;
 };
 
+struct vmcoredd_node {
+	struct list_head list;	/* List of dumps */
+	void *buf;		/* Buffer containing device's dump */
+	unsigned int size;	/* Size of the buffer */
+};
+
 #ifdef CONFIG_PROC_KCORE
 extern void kclist_add(struct kcore_list *, void *, size_t, int type);
 #else
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index e2535d6dcec7..4e12c423b9fe 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -421,6 +421,7 @@ typedef struct elf64_shdr {
 #define NT_ARM_SYSTEM_CALL	0x404	/* ARM system call number */
 #define NT_ARM_SVE	0x405		/* ARM Scalable Vector Extension registers */
 #define NT_ARC_V2	0x600		/* ARCv2 accumulator/extra registers */
+#define NT_VMCOREDD	0x700		/* Vmcore Device Dump Note */
 
 /* Note header in a PT_NOTE section */
 typedef struct elf32_note {
diff --git a/include/uapi/linux/vmcore.h b/include/uapi/linux/vmcore.h
new file mode 100644
index 000000000000..022619668e0e
--- /dev/null
+++ b/include/uapi/linux/vmcore.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _UAPI_VMCORE_H
+#define _UAPI_VMCORE_H
+
+#include <linux/types.h>
+
+#define VMCOREDD_NOTE_NAME "LINUX"
+#define VMCOREDD_MAX_NAME_BYTES 44
+
+struct vmcoredd_header {
+	__u32 n_namesz; /* Name size */
+	__u32 n_descsz; /* Content size */
+	__u32 n_type;   /* NT_VMCOREDD */
+	__u8 name[8];   /* LINUX\0\0\0 */
+	__u8 dump_name[VMCOREDD_MAX_NAME_BYTES]; /* Device dump's name */
+};
+
+#endif /* _UAPI_VMCORE_H */
-- 
cgit v1.2.3-59-g8ed1b


From 7efe48df8a3df6d089d2e9a1d429c82cc5dcf2de Mon Sep 17 00:00:00 2001
From: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Date: Wed, 2 May 2018 15:17:18 +0530
Subject: vmcore: append device dumps to vmcore as elf notes

Update read and mmap logic to append device dumps as additional notes
before the other elf notes. We add device dumps before other elf notes
because the other elf notes may not fill the elf notes buffer
completely and we will end up with zero-filled data between the elf
notes and the device dumps. Tools will then try to decode this
zero-filled data as valid notes and we don't want that. Hence, adding
device dumps before the other elf notes ensure that zero-filled data
can be avoided. This also ensures that the device dumps and the
other elf notes can be properly mmaped at page aligned address.

Incorporate device dump size into the total vmcore size. Also update
offsets for other program headers after the device dumps are added.

Suggested-by: Eric Biederman <ebiederm@xmission.com>.
Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/proc/vmcore.c | 247 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 243 insertions(+), 4 deletions(-)

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index abb3dba0fa49..247c3499e5bd 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -39,6 +39,8 @@ static size_t elfcorebuf_sz_orig;
 
 static char *elfnotes_buf;
 static size_t elfnotes_sz;
+/* Size of all notes minus the device dump notes */
+static size_t elfnotes_orig_sz;
 
 /* Total size of vmcore file. */
 static u64 vmcore_size;
@@ -51,6 +53,9 @@ static LIST_HEAD(vmcoredd_list);
 static DEFINE_MUTEX(vmcoredd_mutex);
 #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
 
+/* Device Dump Size */
+static size_t vmcoredd_orig_sz;
+
 /*
  * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
  * The called function has to take care of module refcounting.
@@ -185,6 +190,77 @@ static int copy_to(void *target, void *src, size_t size, int userbuf)
 	return 0;
 }
 
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+static int vmcoredd_copy_dumps(void *dst, u64 start, size_t size, int userbuf)
+{
+	struct vmcoredd_node *dump;
+	u64 offset = 0;
+	int ret = 0;
+	size_t tsz;
+	char *buf;
+
+	mutex_lock(&vmcoredd_mutex);
+	list_for_each_entry(dump, &vmcoredd_list, list) {
+		if (start < offset + dump->size) {
+			tsz = min(offset + (u64)dump->size - start, (u64)size);
+			buf = dump->buf + start - offset;
+			if (copy_to(dst, buf, tsz, userbuf)) {
+				ret = -EFAULT;
+				goto out_unlock;
+			}
+
+			size -= tsz;
+			start += tsz;
+			dst += tsz;
+
+			/* Leave now if buffer filled already */
+			if (!size)
+				goto out_unlock;
+		}
+		offset += dump->size;
+	}
+
+out_unlock:
+	mutex_unlock(&vmcoredd_mutex);
+	return ret;
+}
+
+static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst,
+			       u64 start, size_t size)
+{
+	struct vmcoredd_node *dump;
+	u64 offset = 0;
+	int ret = 0;
+	size_t tsz;
+	char *buf;
+
+	mutex_lock(&vmcoredd_mutex);
+	list_for_each_entry(dump, &vmcoredd_list, list) {
+		if (start < offset + dump->size) {
+			tsz = min(offset + (u64)dump->size - start, (u64)size);
+			buf = dump->buf + start - offset;
+			if (remap_vmalloc_range_partial(vma, dst, buf, tsz)) {
+				ret = -EFAULT;
+				goto out_unlock;
+			}
+
+			size -= tsz;
+			start += tsz;
+			dst += tsz;
+
+			/* Leave now if buffer filled already */
+			if (!size)
+				goto out_unlock;
+		}
+		offset += dump->size;
+	}
+
+out_unlock:
+	mutex_unlock(&vmcoredd_mutex);
+	return ret;
+}
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
 /* Read from the ELF header and then the crash dump. On error, negative value is
  * returned otherwise number of bytes read are returned.
  */
@@ -222,10 +298,41 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
 	if (*fpos < elfcorebuf_sz + elfnotes_sz) {
 		void *kaddr;
 
+		/* We add device dumps before other elf notes because the
+		 * other elf notes may not fill the elf notes buffer
+		 * completely and we will end up with zero-filled data
+		 * between the elf notes and the device dumps. Tools will
+		 * then try to decode this zero-filled data as valid notes
+		 * and we don't want that. Hence, adding device dumps before
+		 * the other elf notes ensure that zero-filled data can be
+		 * avoided.
+		 */
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+		/* Read device dumps */
+		if (*fpos < elfcorebuf_sz + vmcoredd_orig_sz) {
+			tsz = min(elfcorebuf_sz + vmcoredd_orig_sz -
+				  (size_t)*fpos, buflen);
+			start = *fpos - elfcorebuf_sz;
+			if (vmcoredd_copy_dumps(buffer, start, tsz, userbuf))
+				return -EFAULT;
+
+			buflen -= tsz;
+			*fpos += tsz;
+			buffer += tsz;
+			acc += tsz;
+
+			/* leave now if filled buffer already */
+			if (!buflen)
+				return acc;
+		}
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
+		/* Read remaining elf notes */
 		tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen);
-		kaddr = elfnotes_buf + *fpos - elfcorebuf_sz;
+		kaddr = elfnotes_buf + *fpos - elfcorebuf_sz - vmcoredd_orig_sz;
 		if (copy_to(buffer, kaddr, tsz, userbuf))
 			return -EFAULT;
+
 		buflen -= tsz;
 		*fpos += tsz;
 		buffer += tsz;
@@ -451,11 +558,46 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
 	if (start < elfcorebuf_sz + elfnotes_sz) {
 		void *kaddr;
 
+		/* We add device dumps before other elf notes because the
+		 * other elf notes may not fill the elf notes buffer
+		 * completely and we will end up with zero-filled data
+		 * between the elf notes and the device dumps. Tools will
+		 * then try to decode this zero-filled data as valid notes
+		 * and we don't want that. Hence, adding device dumps before
+		 * the other elf notes ensure that zero-filled data can be
+		 * avoided. This also ensures that the device dumps and
+		 * other elf notes can be properly mmaped at page aligned
+		 * address.
+		 */
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+		/* Read device dumps */
+		if (start < elfcorebuf_sz + vmcoredd_orig_sz) {
+			u64 start_off;
+
+			tsz = min(elfcorebuf_sz + vmcoredd_orig_sz -
+				  (size_t)start, size);
+			start_off = start - elfcorebuf_sz;
+			if (vmcoredd_mmap_dumps(vma, vma->vm_start + len,
+						start_off, tsz))
+				goto fail;
+
+			size -= tsz;
+			start += tsz;
+			len += tsz;
+
+			/* leave now if filled buffer already */
+			if (!size)
+				return 0;
+		}
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
+		/* Read remaining elf notes */
 		tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size);
-		kaddr = elfnotes_buf + start - elfcorebuf_sz;
+		kaddr = elfnotes_buf + start - elfcorebuf_sz - vmcoredd_orig_sz;
 		if (remap_vmalloc_range_partial(vma, vma->vm_start + len,
 						kaddr, tsz))
 			goto fail;
+
 		size -= tsz;
 		start += tsz;
 		len += tsz;
@@ -703,6 +845,11 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
 	/* Modify e_phnum to reflect merged headers. */
 	ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
 
+	/* Store the size of all notes.  We need this to update the note
+	 * header when the device dumps will be added.
+	 */
+	elfnotes_orig_sz = phdr.p_memsz;
+
 	return 0;
 }
 
@@ -889,6 +1036,11 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
 	/* Modify e_phnum to reflect merged headers. */
 	ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
 
+	/* Store the size of all notes.  We need this to update the note
+	 * header when the device dumps will be added.
+	 */
+	elfnotes_orig_sz = phdr.p_memsz;
+
 	return 0;
 }
 
@@ -981,8 +1133,8 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
 }
 
 /* Sets offset fields of vmcore elements. */
-static void __init set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz,
-					   struct list_head *vc_list)
+static void set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz,
+				    struct list_head *vc_list)
 {
 	loff_t vmcore_off;
 	struct vmcore *m;
@@ -1174,6 +1326,92 @@ static void vmcoredd_write_header(void *buf, struct vmcoredd_data *data,
 	memcpy(vdd_hdr->dump_name, data->dump_name, sizeof(vdd_hdr->dump_name));
 }
 
+/**
+ * vmcoredd_update_program_headers - Update all Elf program headers
+ * @elfptr: Pointer to elf header
+ * @elfnotesz: Size of elf notes aligned to page size
+ * @vmcoreddsz: Size of device dumps to be added to elf note header
+ *
+ * Determine type of Elf header (Elf64 or Elf32) and update the elf note size.
+ * Also update the offsets of all the program headers after the elf note header.
+ */
+static void vmcoredd_update_program_headers(char *elfptr, size_t elfnotesz,
+					    size_t vmcoreddsz)
+{
+	unsigned char *e_ident = (unsigned char *)elfptr;
+	u64 start, end, size;
+	loff_t vmcore_off;
+	u32 i;
+
+	vmcore_off = elfcorebuf_sz + elfnotesz;
+
+	if (e_ident[EI_CLASS] == ELFCLASS64) {
+		Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfptr;
+		Elf64_Phdr *phdr = (Elf64_Phdr *)(elfptr + sizeof(Elf64_Ehdr));
+
+		/* Update all program headers */
+		for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
+			if (phdr->p_type == PT_NOTE) {
+				/* Update note size */
+				phdr->p_memsz = elfnotes_orig_sz + vmcoreddsz;
+				phdr->p_filesz = phdr->p_memsz;
+				continue;
+			}
+
+			start = rounddown(phdr->p_offset, PAGE_SIZE);
+			end = roundup(phdr->p_offset + phdr->p_memsz,
+				      PAGE_SIZE);
+			size = end - start;
+			phdr->p_offset = vmcore_off + (phdr->p_offset - start);
+			vmcore_off += size;
+		}
+	} else {
+		Elf32_Ehdr *ehdr = (Elf32_Ehdr *)elfptr;
+		Elf32_Phdr *phdr = (Elf32_Phdr *)(elfptr + sizeof(Elf32_Ehdr));
+
+		/* Update all program headers */
+		for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
+			if (phdr->p_type == PT_NOTE) {
+				/* Update note size */
+				phdr->p_memsz = elfnotes_orig_sz + vmcoreddsz;
+				phdr->p_filesz = phdr->p_memsz;
+				continue;
+			}
+
+			start = rounddown(phdr->p_offset, PAGE_SIZE);
+			end = roundup(phdr->p_offset + phdr->p_memsz,
+				      PAGE_SIZE);
+			size = end - start;
+			phdr->p_offset = vmcore_off + (phdr->p_offset - start);
+			vmcore_off += size;
+		}
+	}
+}
+
+/**
+ * vmcoredd_update_size - Update the total size of the device dumps and update
+ * Elf header
+ * @dump_size: Size of the current device dump to be added to total size
+ *
+ * Update the total size of all the device dumps and update the Elf program
+ * headers. Calculate the new offsets for the vmcore list and update the
+ * total vmcore size.
+ */
+static void vmcoredd_update_size(size_t dump_size)
+{
+	vmcoredd_orig_sz += dump_size;
+	elfnotes_sz = roundup(elfnotes_orig_sz, PAGE_SIZE) + vmcoredd_orig_sz;
+	vmcoredd_update_program_headers(elfcorebuf, elfnotes_sz,
+					vmcoredd_orig_sz);
+
+	/* Update vmcore list offsets */
+	set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
+
+	vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz,
+				      &vmcore_list);
+	proc_vmcore->size = vmcore_size;
+}
+
 /**
  * vmcore_add_device_dump - Add a buffer containing device dump to vmcore
  * @data: dump info.
@@ -1227,6 +1465,7 @@ int vmcore_add_device_dump(struct vmcoredd_data *data)
 	list_add_tail(&dump->list, &vmcoredd_list);
 	mutex_unlock(&vmcoredd_mutex);
 
+	vmcoredd_update_size(data_size);
 	return 0;
 
 out_err:
-- 
cgit v1.2.3-59-g8ed1b


From 1dde532dd0520a948fbc82f4522183104ee4808b Mon Sep 17 00:00:00 2001
From: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Date: Wed, 2 May 2018 15:17:19 +0530
Subject: cxgb4: collect hardware dump in second kernel

Register callback to collect hardware/firmware dumps in second kernel
before hardware/firmware is initialized. The dumps for each device
will be available as elf notes in /proc/vmcore in second kernel.

Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h       |  4 ++++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c | 25 ++++++++++++++++++++++++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h |  3 +++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c  | 10 ++++++++++
 4 files changed, 42 insertions(+)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 688f95440af2..01e7aad4ce5b 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -50,6 +50,7 @@
 #include <linux/net_tstamp.h>
 #include <linux/ptp_clock_kernel.h>
 #include <linux/ptp_classify.h>
+#include <linux/crash_dump.h>
 #include <asm/io.h>
 #include "t4_chip_type.h"
 #include "cxgb4_uld.h"
@@ -964,6 +965,9 @@ struct adapter {
 	struct hma_data hma;
 
 	struct srq_data *srq;
+
+	/* Dump buffer for collecting logs in kdump kernel */
+	struct vmcoredd_data vmcoredd;
 };
 
 /* Support for "sched-class" command to allow a TX Scheduling Class to be
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c
index 143686c60234..085691eb2b95 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c
@@ -488,3 +488,28 @@ void cxgb4_init_ethtool_dump(struct adapter *adapter)
 	adapter->eth_dump.version = adapter->params.fw_vers;
 	adapter->eth_dump.len = 0;
 }
+
+static int cxgb4_cudbg_vmcoredd_collect(struct vmcoredd_data *data, void *buf)
+{
+	struct adapter *adap = container_of(data, struct adapter, vmcoredd);
+	u32 len = data->size;
+
+	return cxgb4_cudbg_collect(adap, buf, &len, CXGB4_ETH_DUMP_ALL);
+}
+
+int cxgb4_cudbg_vmcore_add_dump(struct adapter *adap)
+{
+	struct vmcoredd_data *data = &adap->vmcoredd;
+	u32 len;
+
+	len = sizeof(struct cudbg_hdr) +
+	      sizeof(struct cudbg_entity_hdr) * CUDBG_MAX_ENTITY;
+	len += CUDBG_DUMP_BUFF_SIZE;
+
+	data->size = len;
+	snprintf(data->dump_name, sizeof(data->dump_name), "%s_%s",
+		 cxgb4_driver_name, adap->name);
+	data->vmcoredd_callback = cxgb4_cudbg_vmcoredd_collect;
+
+	return vmcore_add_device_dump(data);
+}
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h
index ce1ac9a1c878..ef59ba1ed968 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h
@@ -41,8 +41,11 @@ enum CXGB4_ETHTOOL_DUMP_FLAGS {
 	CXGB4_ETH_DUMP_HW = (1 << 1), /* various FW and HW dumps */
 };
 
+#define CXGB4_ETH_DUMP_ALL (CXGB4_ETH_DUMP_MEM | CXGB4_ETH_DUMP_HW)
+
 u32 cxgb4_get_dump_length(struct adapter *adap, u32 flag);
 int cxgb4_cudbg_collect(struct adapter *adap, void *buf, u32 *buf_size,
 			u32 flag);
 void cxgb4_init_ethtool_dump(struct adapter *adapter);
+int cxgb4_cudbg_vmcore_add_dump(struct adapter *adap);
 #endif /* __CXGB4_CUDBG_H__ */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index c54fd189d835..07baa5e1a8c5 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -5558,6 +5558,16 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (err)
 		goto out_free_adapter;
 
+	if (is_kdump_kernel()) {
+		/* Collect hardware state and append to /proc/vmcore */
+		err = cxgb4_cudbg_vmcore_add_dump(adapter);
+		if (err) {
+			dev_warn(adapter->pdev_dev,
+				 "Fail collecting vmcore device dump, err: %d. Continuing\n",
+				 err);
+			err = 0;
+		}
+	}
 
 	if (!is_t4(adapter->params.chip)) {
 		s_qpp = (QUEUESPERPAGEPF0_S +
-- 
cgit v1.2.3-59-g8ed1b


From 408d2debb03e15f3dd3063a5a14befd3390c6eab Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Fri, 11 May 2018 17:16:34 -0400
Subject: net: dsa: mv88e6xxx: use helper for 6390 histogram

The Marvell 88E6390 model has its histogram mode bits moved in the
Global 1 Control 2 register. Use the previously introduced
mv88e6xxx_g1_ctl2_mask helper to set them.

At the same time complete the documentation of the said register.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/global1.c | 15 +++------------
 drivers/net/dsa/mv88e6xxx/global1.h | 12 +++++++++---
 2 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/global1.c b/drivers/net/dsa/mv88e6xxx/global1.c
index 244ee1ff9edc..0f2b05342c18 100644
--- a/drivers/net/dsa/mv88e6xxx/global1.c
+++ b/drivers/net/dsa/mv88e6xxx/global1.c
@@ -393,18 +393,9 @@ int mv88e6390_g1_rmu_disable(struct mv88e6xxx_chip *chip)
 
 int mv88e6390_g1_stats_set_histogram(struct mv88e6xxx_chip *chip)
 {
-	u16 val;
-	int err;
-
-	err = mv88e6xxx_g1_read(chip, MV88E6XXX_G1_CTL2, &val);
-	if (err)
-		return err;
-
-	val |= MV88E6XXX_G1_CTL2_HIST_RX_TX;
-
-	err = mv88e6xxx_g1_write(chip, MV88E6XXX_G1_CTL2, val);
-
-	return err;
+	return mv88e6xxx_g1_ctl2_mask(chip, MV88E6390_G1_CTL2_HIST_MODE_MASK,
+				      MV88E6390_G1_CTL2_HIST_MODE_RX |
+				      MV88E6390_G1_CTL2_HIST_MODE_TX);
 }
 
 int mv88e6xxx_g1_set_device_number(struct mv88e6xxx_chip *chip, int index)
diff --git a/drivers/net/dsa/mv88e6xxx/global1.h b/drivers/net/dsa/mv88e6xxx/global1.h
index e186a026e1b1..c357b3ca9a09 100644
--- a/drivers/net/dsa/mv88e6xxx/global1.h
+++ b/drivers/net/dsa/mv88e6xxx/global1.h
@@ -201,12 +201,13 @@
 
 /* Offset 0x1C: Global Control 2 */
 #define MV88E6XXX_G1_CTL2			0x1c
-#define MV88E6XXX_G1_CTL2_HIST_RX		0x0040
-#define MV88E6XXX_G1_CTL2_HIST_TX		0x0080
-#define MV88E6XXX_G1_CTL2_HIST_RX_TX		0x00c0
 #define MV88E6185_G1_CTL2_CASCADE_PORT_MASK	0xf000
 #define MV88E6185_G1_CTL2_CASCADE_PORT_NONE	0xe000
 #define MV88E6185_G1_CTL2_CASCADE_PORT_MULTI	0xf000
+#define MV88E6352_G1_CTL2_HEADER_TYPE_MASK	0xc000
+#define MV88E6352_G1_CTL2_HEADER_TYPE_ORIG	0x0000
+#define MV88E6352_G1_CTL2_HEADER_TYPE_MGMT	0x4000
+#define MV88E6390_G1_CTL2_HEADER_TYPE_LAG	0x8000
 #define MV88E6352_G1_CTL2_RMU_MODE_MASK		0x3000
 #define MV88E6352_G1_CTL2_RMU_MODE_DISABLED	0x0000
 #define MV88E6352_G1_CTL2_RMU_MODE_PORT_4	0x1000
@@ -223,6 +224,11 @@
 #define MV88E6390_G1_CTL2_RMU_MODE_PORT_10	0x0300
 #define MV88E6390_G1_CTL2_RMU_MODE_ALL_DSA	0x0600
 #define MV88E6390_G1_CTL2_RMU_MODE_DISABLED	0x0700
+#define MV88E6390_G1_CTL2_HIST_MODE_MASK	0x00c0
+#define MV88E6390_G1_CTL2_HIST_MODE_RX		0x0040
+#define MV88E6390_G1_CTL2_HIST_MODE_TX		0x0080
+#define MV88E6352_G1_CTL2_CTR_MODE_MASK		0x0060
+#define MV88E6390_G1_CTL2_CTR_MODE		0x0020
 #define MV88E6XXX_G1_CTL2_DEVICE_NUMBER_MASK	0x001f
 
 /* Offset 0x1D: Stats Operation Register */
-- 
cgit v1.2.3-59-g8ed1b


From 93e18d61bfa950aaffff5fccde0f974e1e038f83 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Fri, 11 May 2018 17:16:35 -0400
Subject: net: dsa: mv88e6xxx: add IEEE and IP mapping ops

All Marvell switch families except 88E6390 have direct registers in
Global 1 for IEEE and IP priorities override mapping. The 88E6390 uses
indirect tables instead.

Add .ieee_pri_map and .ip_pri_map ops to distinct that and call them
from a mv88e6xxx_pri_setup helper. Only non-6390 are concerned ATM.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/chip.c    | 94 +++++++++++++++++++++++++------------
 drivers/net/dsa/mv88e6xxx/chip.h    |  3 ++
 drivers/net/dsa/mv88e6xxx/global1.c | 58 +++++++++++++++++++++++
 drivers/net/dsa/mv88e6xxx/global1.h |  3 ++
 4 files changed, 127 insertions(+), 31 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 2910f68c0e5c..ad456d18ce8a 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -1104,6 +1104,25 @@ static void mv88e6xxx_port_stp_state_set(struct dsa_switch *ds, int port,
 		dev_err(ds->dev, "p%d: failed to update state\n", port);
 }
 
+static int mv88e6xxx_pri_setup(struct mv88e6xxx_chip *chip)
+{
+	int err;
+
+	if (chip->info->ops->ieee_pri_map) {
+		err = chip->info->ops->ieee_pri_map(chip);
+		if (err)
+			return err;
+	}
+
+	if (chip->info->ops->ip_pri_map) {
+		err = chip->info->ops->ip_pri_map(chip);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 static int mv88e6xxx_devmap_setup(struct mv88e6xxx_chip *chip)
 {
 	int target, port;
@@ -2252,37 +2271,6 @@ static int mv88e6xxx_g1_setup(struct mv88e6xxx_chip *chip)
 {
 	int err;
 
-	/* Configure the IP ToS mapping registers. */
-	err = mv88e6xxx_g1_write(chip, MV88E6XXX_G1_IP_PRI_0, 0x0000);
-	if (err)
-		return err;
-	err = mv88e6xxx_g1_write(chip, MV88E6XXX_G1_IP_PRI_1, 0x0000);
-	if (err)
-		return err;
-	err = mv88e6xxx_g1_write(chip, MV88E6XXX_G1_IP_PRI_2, 0x5555);
-	if (err)
-		return err;
-	err = mv88e6xxx_g1_write(chip, MV88E6XXX_G1_IP_PRI_3, 0x5555);
-	if (err)
-		return err;
-	err = mv88e6xxx_g1_write(chip, MV88E6XXX_G1_IP_PRI_4, 0xaaaa);
-	if (err)
-		return err;
-	err = mv88e6xxx_g1_write(chip, MV88E6XXX_G1_IP_PRI_5, 0xaaaa);
-	if (err)
-		return err;
-	err = mv88e6xxx_g1_write(chip, MV88E6XXX_G1_IP_PRI_6, 0xffff);
-	if (err)
-		return err;
-	err = mv88e6xxx_g1_write(chip, MV88E6XXX_G1_IP_PRI_7, 0xffff);
-	if (err)
-		return err;
-
-	/* Configure the IEEE 802.1p priority mapping register. */
-	err = mv88e6xxx_g1_write(chip, MV88E6XXX_G1_IEEE_PRI, 0xfa41);
-	if (err)
-		return err;
-
 	/* Initialize the statistics unit */
 	err = mv88e6xxx_stats_set_histogram(chip);
 	if (err)
@@ -2365,6 +2353,10 @@ static int mv88e6xxx_setup(struct dsa_switch *ds)
 	if (err)
 		goto unlock;
 
+	err = mv88e6xxx_pri_setup(chip);
+	if (err)
+		goto unlock;
+
 	/* Setup PTP Hardware Clock and timestamping */
 	if (chip->info->ptp_support) {
 		err = mv88e6xxx_ptp_setup(chip);
@@ -2592,6 +2584,8 @@ static int mv88e6xxx_set_eeprom(struct dsa_switch *ds,
 
 static const struct mv88e6xxx_ops mv88e6085_ops = {
 	/* MV88E6XXX_FAMILY_6097 */
+	.ieee_pri_map = mv88e6085_g1_ieee_pri_map,
+	.ip_pri_map = mv88e6085_g1_ip_pri_map,
 	.irl_init_all = mv88e6352_g2_irl_init_all,
 	.set_switch_mac = mv88e6xxx_g1_set_switch_mac,
 	.phy_read = mv88e6185_phy_ppu_read,
@@ -2628,6 +2622,8 @@ static const struct mv88e6xxx_ops mv88e6085_ops = {
 
 static const struct mv88e6xxx_ops mv88e6095_ops = {
 	/* MV88E6XXX_FAMILY_6095 */
+	.ieee_pri_map = mv88e6085_g1_ieee_pri_map,
+	.ip_pri_map = mv88e6085_g1_ip_pri_map,
 	.set_switch_mac = mv88e6xxx_g1_set_switch_mac,
 	.phy_read = mv88e6185_phy_ppu_read,
 	.phy_write = mv88e6185_phy_ppu_write,
@@ -2652,6 +2648,8 @@ static const struct mv88e6xxx_ops mv88e6095_ops = {
 
 static const struct mv88e6xxx_ops mv88e6097_ops = {
 	/* MV88E6XXX_FAMILY_6097 */
+	.ieee_pri_map = mv88e6085_g1_ieee_pri_map,
+	.ip_pri_map = mv88e6085_g1_ip_pri_map,
 	.irl_init_all = mv88e6352_g2_irl_init_all,
 	.set_switch_mac = mv88e6xxx_g2_set_switch_mac,
 	.phy_read = mv88e6xxx_g2_smi_phy_read,
@@ -2686,6 +2684,8 @@ static const struct mv88e6xxx_ops mv88e6097_ops = {
 
 static const struct mv88e6xxx_ops mv88e6123_ops = {
 	/* MV88E6XXX_FAMILY_6165 */
+	.ieee_pri_map = mv88e6085_g1_ieee_pri_map,
+	.ip_pri_map = mv88e6085_g1_ip_pri_map,
 	.irl_init_all = mv88e6352_g2_irl_init_all,
 	.set_switch_mac = mv88e6xxx_g2_set_switch_mac,
 	.phy_read = mv88e6xxx_g2_smi_phy_read,
@@ -2714,6 +2714,8 @@ static const struct mv88e6xxx_ops mv88e6123_ops = {
 
 static const struct mv88e6xxx_ops mv88e6131_ops = {
 	/* MV88E6XXX_FAMILY_6185 */
+	.ieee_pri_map = mv88e6085_g1_ieee_pri_map,
+	.ip_pri_map = mv88e6085_g1_ip_pri_map,
 	.set_switch_mac = mv88e6xxx_g1_set_switch_mac,
 	.phy_read = mv88e6185_phy_ppu_read,
 	.phy_write = mv88e6185_phy_ppu_write,
@@ -2747,6 +2749,8 @@ static const struct mv88e6xxx_ops mv88e6131_ops = {
 
 static const struct mv88e6xxx_ops mv88e6141_ops = {
 	/* MV88E6XXX_FAMILY_6341 */
+	.ieee_pri_map = mv88e6085_g1_ieee_pri_map,
+	.ip_pri_map = mv88e6085_g1_ip_pri_map,
 	.irl_init_all = mv88e6352_g2_irl_init_all,
 	.get_eeprom = mv88e6xxx_g2_get_eeprom8,
 	.set_eeprom = mv88e6xxx_g2_set_eeprom8,
@@ -2784,6 +2788,8 @@ static const struct mv88e6xxx_ops mv88e6141_ops = {
 
 static const struct mv88e6xxx_ops mv88e6161_ops = {
 	/* MV88E6XXX_FAMILY_6165 */
+	.ieee_pri_map = mv88e6085_g1_ieee_pri_map,
+	.ip_pri_map = mv88e6085_g1_ip_pri_map,
 	.irl_init_all = mv88e6352_g2_irl_init_all,
 	.set_switch_mac = mv88e6xxx_g2_set_switch_mac,
 	.phy_read = mv88e6xxx_g2_smi_phy_read,
@@ -2817,6 +2823,8 @@ static const struct mv88e6xxx_ops mv88e6161_ops = {
 
 static const struct mv88e6xxx_ops mv88e6165_ops = {
 	/* MV88E6XXX_FAMILY_6165 */
+	.ieee_pri_map = mv88e6085_g1_ieee_pri_map,
+	.ip_pri_map = mv88e6085_g1_ip_pri_map,
 	.irl_init_all = mv88e6352_g2_irl_init_all,
 	.set_switch_mac = mv88e6xxx_g2_set_switch_mac,
 	.phy_read = mv88e6165_phy_read,
@@ -2843,6 +2851,8 @@ static const struct mv88e6xxx_ops mv88e6165_ops = {
 
 static const struct mv88e6xxx_ops mv88e6171_ops = {
 	/* MV88E6XXX_FAMILY_6351 */
+	.ieee_pri_map = mv88e6085_g1_ieee_pri_map,
+	.ip_pri_map = mv88e6085_g1_ip_pri_map,
 	.irl_init_all = mv88e6352_g2_irl_init_all,
 	.set_switch_mac = mv88e6xxx_g2_set_switch_mac,
 	.phy_read = mv88e6xxx_g2_smi_phy_read,
@@ -2877,6 +2887,8 @@ static const struct mv88e6xxx_ops mv88e6171_ops = {
 
 static const struct mv88e6xxx_ops mv88e6172_ops = {
 	/* MV88E6XXX_FAMILY_6352 */
+	.ieee_pri_map = mv88e6085_g1_ieee_pri_map,
+	.ip_pri_map = mv88e6085_g1_ip_pri_map,
 	.irl_init_all = mv88e6352_g2_irl_init_all,
 	.get_eeprom = mv88e6xxx_g2_get_eeprom16,
 	.set_eeprom = mv88e6xxx_g2_set_eeprom16,
@@ -2916,6 +2928,8 @@ static const struct mv88e6xxx_ops mv88e6172_ops = {
 
 static const struct mv88e6xxx_ops mv88e6175_ops = {
 	/* MV88E6XXX_FAMILY_6351 */
+	.ieee_pri_map = mv88e6085_g1_ieee_pri_map,
+	.ip_pri_map = mv88e6085_g1_ip_pri_map,
 	.irl_init_all = mv88e6352_g2_irl_init_all,
 	.set_switch_mac = mv88e6xxx_g2_set_switch_mac,
 	.phy_read = mv88e6xxx_g2_smi_phy_read,
@@ -2951,6 +2965,8 @@ static const struct mv88e6xxx_ops mv88e6175_ops = {
 
 static const struct mv88e6xxx_ops mv88e6176_ops = {
 	/* MV88E6XXX_FAMILY_6352 */
+	.ieee_pri_map = mv88e6085_g1_ieee_pri_map,
+	.ip_pri_map = mv88e6085_g1_ip_pri_map,
 	.irl_init_all = mv88e6352_g2_irl_init_all,
 	.get_eeprom = mv88e6xxx_g2_get_eeprom16,
 	.set_eeprom = mv88e6xxx_g2_set_eeprom16,
@@ -2990,6 +3006,8 @@ static const struct mv88e6xxx_ops mv88e6176_ops = {
 
 static const struct mv88e6xxx_ops mv88e6185_ops = {
 	/* MV88E6XXX_FAMILY_6185 */
+	.ieee_pri_map = mv88e6085_g1_ieee_pri_map,
+	.ip_pri_map = mv88e6085_g1_ip_pri_map,
 	.set_switch_mac = mv88e6xxx_g1_set_switch_mac,
 	.phy_read = mv88e6185_phy_ppu_read,
 	.phy_write = mv88e6185_phy_ppu_write,
@@ -3129,6 +3147,8 @@ static const struct mv88e6xxx_ops mv88e6191_ops = {
 
 static const struct mv88e6xxx_ops mv88e6240_ops = {
 	/* MV88E6XXX_FAMILY_6352 */
+	.ieee_pri_map = mv88e6085_g1_ieee_pri_map,
+	.ip_pri_map = mv88e6085_g1_ip_pri_map,
 	.irl_init_all = mv88e6352_g2_irl_init_all,
 	.get_eeprom = mv88e6xxx_g2_get_eeprom16,
 	.set_eeprom = mv88e6xxx_g2_set_eeprom16,
@@ -3208,6 +3228,8 @@ static const struct mv88e6xxx_ops mv88e6290_ops = {
 
 static const struct mv88e6xxx_ops mv88e6320_ops = {
 	/* MV88E6XXX_FAMILY_6320 */
+	.ieee_pri_map = mv88e6085_g1_ieee_pri_map,
+	.ip_pri_map = mv88e6085_g1_ip_pri_map,
 	.irl_init_all = mv88e6352_g2_irl_init_all,
 	.get_eeprom = mv88e6xxx_g2_get_eeprom16,
 	.set_eeprom = mv88e6xxx_g2_set_eeprom16,
@@ -3244,6 +3266,8 @@ static const struct mv88e6xxx_ops mv88e6320_ops = {
 
 static const struct mv88e6xxx_ops mv88e6321_ops = {
 	/* MV88E6XXX_FAMILY_6320 */
+	.ieee_pri_map = mv88e6085_g1_ieee_pri_map,
+	.ip_pri_map = mv88e6085_g1_ip_pri_map,
 	.irl_init_all = mv88e6352_g2_irl_init_all,
 	.get_eeprom = mv88e6xxx_g2_get_eeprom16,
 	.set_eeprom = mv88e6xxx_g2_set_eeprom16,
@@ -3278,6 +3302,8 @@ static const struct mv88e6xxx_ops mv88e6321_ops = {
 
 static const struct mv88e6xxx_ops mv88e6341_ops = {
 	/* MV88E6XXX_FAMILY_6341 */
+	.ieee_pri_map = mv88e6085_g1_ieee_pri_map,
+	.ip_pri_map = mv88e6085_g1_ip_pri_map,
 	.irl_init_all = mv88e6352_g2_irl_init_all,
 	.get_eeprom = mv88e6xxx_g2_get_eeprom8,
 	.set_eeprom = mv88e6xxx_g2_set_eeprom8,
@@ -3316,6 +3342,8 @@ static const struct mv88e6xxx_ops mv88e6341_ops = {
 
 static const struct mv88e6xxx_ops mv88e6350_ops = {
 	/* MV88E6XXX_FAMILY_6351 */
+	.ieee_pri_map = mv88e6085_g1_ieee_pri_map,
+	.ip_pri_map = mv88e6085_g1_ip_pri_map,
 	.irl_init_all = mv88e6352_g2_irl_init_all,
 	.set_switch_mac = mv88e6xxx_g2_set_switch_mac,
 	.phy_read = mv88e6xxx_g2_smi_phy_read,
@@ -3350,6 +3378,8 @@ static const struct mv88e6xxx_ops mv88e6350_ops = {
 
 static const struct mv88e6xxx_ops mv88e6351_ops = {
 	/* MV88E6XXX_FAMILY_6351 */
+	.ieee_pri_map = mv88e6085_g1_ieee_pri_map,
+	.ip_pri_map = mv88e6085_g1_ip_pri_map,
 	.irl_init_all = mv88e6352_g2_irl_init_all,
 	.set_switch_mac = mv88e6xxx_g2_set_switch_mac,
 	.phy_read = mv88e6xxx_g2_smi_phy_read,
@@ -3385,6 +3415,8 @@ static const struct mv88e6xxx_ops mv88e6351_ops = {
 
 static const struct mv88e6xxx_ops mv88e6352_ops = {
 	/* MV88E6XXX_FAMILY_6352 */
+	.ieee_pri_map = mv88e6085_g1_ieee_pri_map,
+	.ip_pri_map = mv88e6085_g1_ip_pri_map,
 	.irl_init_all = mv88e6352_g2_irl_init_all,
 	.get_eeprom = mv88e6xxx_g2_get_eeprom16,
 	.set_eeprom = mv88e6xxx_g2_set_eeprom16,
diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h
index 8eb5dfe17f5c..012268046442 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.h
+++ b/drivers/net/dsa/mv88e6xxx/chip.h
@@ -294,6 +294,9 @@ struct mv88e6xxx_mdio_bus {
 };
 
 struct mv88e6xxx_ops {
+	int (*ieee_pri_map)(struct mv88e6xxx_chip *chip);
+	int (*ip_pri_map)(struct mv88e6xxx_chip *chip);
+
 	/* Ingress Rate Limit unit (IRL) operations */
 	int (*irl_init_all)(struct mv88e6xxx_chip *chip, int port);
 
diff --git a/drivers/net/dsa/mv88e6xxx/global1.c b/drivers/net/dsa/mv88e6xxx/global1.c
index 0f2b05342c18..d721ccf7d8be 100644
--- a/drivers/net/dsa/mv88e6xxx/global1.c
+++ b/drivers/net/dsa/mv88e6xxx/global1.c
@@ -241,6 +241,64 @@ int mv88e6185_g1_ppu_disable(struct mv88e6xxx_chip *chip)
 	return mv88e6185_g1_wait_ppu_disabled(chip);
 }
 
+/* Offset 0x10: IP-PRI Mapping Register 0
+ * Offset 0x11: IP-PRI Mapping Register 1
+ * Offset 0x12: IP-PRI Mapping Register 2
+ * Offset 0x13: IP-PRI Mapping Register 3
+ * Offset 0x14: IP-PRI Mapping Register 4
+ * Offset 0x15: IP-PRI Mapping Register 5
+ * Offset 0x16: IP-PRI Mapping Register 6
+ * Offset 0x17: IP-PRI Mapping Register 7
+ */
+
+int mv88e6085_g1_ip_pri_map(struct mv88e6xxx_chip *chip)
+{
+	int err;
+
+	/* Reset the IP TOS/DiffServ/Traffic priorities to defaults */
+	err = mv88e6xxx_g1_write(chip, MV88E6XXX_G1_IP_PRI_0, 0x0000);
+	if (err)
+		return err;
+
+	err = mv88e6xxx_g1_write(chip, MV88E6XXX_G1_IP_PRI_1, 0x0000);
+	if (err)
+		return err;
+
+	err = mv88e6xxx_g1_write(chip, MV88E6XXX_G1_IP_PRI_2, 0x5555);
+	if (err)
+		return err;
+
+	err = mv88e6xxx_g1_write(chip, MV88E6XXX_G1_IP_PRI_3, 0x5555);
+	if (err)
+		return err;
+
+	err = mv88e6xxx_g1_write(chip, MV88E6XXX_G1_IP_PRI_4, 0xaaaa);
+	if (err)
+		return err;
+
+	err = mv88e6xxx_g1_write(chip, MV88E6XXX_G1_IP_PRI_5, 0xaaaa);
+	if (err)
+		return err;
+
+	err = mv88e6xxx_g1_write(chip, MV88E6XXX_G1_IP_PRI_6, 0xffff);
+	if (err)
+		return err;
+
+	err = mv88e6xxx_g1_write(chip, MV88E6XXX_G1_IP_PRI_7, 0xffff);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+/* Offset 0x18: IEEE-PRI Register */
+
+int mv88e6085_g1_ieee_pri_map(struct mv88e6xxx_chip *chip)
+{
+	/* Reset the IEEE Tag priorities to defaults */
+	return mv88e6xxx_g1_write(chip, MV88E6XXX_G1_IEEE_PRI, 0xfa41);
+}
+
 /* Offset 0x1a: Monitor Control */
 /* Offset 0x1a: Monitor & MGMT Control on some devices */
 
diff --git a/drivers/net/dsa/mv88e6xxx/global1.h b/drivers/net/dsa/mv88e6xxx/global1.h
index c357b3ca9a09..7c791c1da4b9 100644
--- a/drivers/net/dsa/mv88e6xxx/global1.h
+++ b/drivers/net/dsa/mv88e6xxx/global1.h
@@ -277,6 +277,9 @@ int mv88e6095_g1_set_cpu_port(struct mv88e6xxx_chip *chip, int port);
 int mv88e6390_g1_set_cpu_port(struct mv88e6xxx_chip *chip, int port);
 int mv88e6390_g1_mgmt_rsvd2cpu(struct mv88e6xxx_chip *chip);
 
+int mv88e6085_g1_ip_pri_map(struct mv88e6xxx_chip *chip);
+int mv88e6085_g1_ieee_pri_map(struct mv88e6xxx_chip *chip);
+
 int mv88e6185_g1_set_cascade_port(struct mv88e6xxx_chip *chip, int port);
 
 int mv88e6085_g1_rmu_disable(struct mv88e6xxx_chip *chip);
-- 
cgit v1.2.3-59-g8ed1b


From 447b1bb84bb51603f57ae426973a620a7a66a0db Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Fri, 11 May 2018 17:16:36 -0400
Subject: net: dsa: mv88e6xxx: add a stats setup function

Now that the Global 1 specific setup function only setup the statistics
unit, kill it in favor of a mv88e6xxx_stats_setup function.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/chip.c | 27 ++++++++++-----------------
 1 file changed, 10 insertions(+), 17 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index ad456d18ce8a..b23c11d9f4b2 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -995,14 +995,6 @@ static void mv88e6xxx_get_ethtool_stats(struct dsa_switch *ds, int port,
 
 }
 
-static int mv88e6xxx_stats_set_histogram(struct mv88e6xxx_chip *chip)
-{
-	if (chip->info->ops->stats_set_histogram)
-		return chip->info->ops->stats_set_histogram(chip);
-
-	return 0;
-}
-
 static int mv88e6xxx_get_regs_len(struct dsa_switch *ds, int port)
 {
 	return 32 * sizeof(u16);
@@ -2267,14 +2259,16 @@ static int mv88e6xxx_set_ageing_time(struct dsa_switch *ds,
 	return err;
 }
 
-static int mv88e6xxx_g1_setup(struct mv88e6xxx_chip *chip)
+static int mv88e6xxx_stats_setup(struct mv88e6xxx_chip *chip)
 {
 	int err;
 
 	/* Initialize the statistics unit */
-	err = mv88e6xxx_stats_set_histogram(chip);
-	if (err)
-		return err;
+	if (chip->info->ops->stats_set_histogram) {
+		err = chip->info->ops->stats_set_histogram(chip);
+		if (err)
+			return err;
+	}
 
 	return mv88e6xxx_g1_stats_clear(chip);
 }
@@ -2300,11 +2294,6 @@ static int mv88e6xxx_setup(struct dsa_switch *ds)
 			goto unlock;
 	}
 
-	/* Setup Switch Global 1 Registers */
-	err = mv88e6xxx_g1_setup(chip);
-	if (err)
-		goto unlock;
-
 	err = mv88e6xxx_irl_setup(chip);
 	if (err)
 		goto unlock;
@@ -2368,6 +2357,10 @@ static int mv88e6xxx_setup(struct dsa_switch *ds)
 			goto unlock;
 	}
 
+	err = mv88e6xxx_stats_setup(chip);
+	if (err)
+		goto unlock;
+
 unlock:
 	mutex_unlock(&chip->reg_lock);
 
-- 
cgit v1.2.3-59-g8ed1b


From a4a78a97ee4bccb865006015340905c90b38cd8f Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Mon, 14 May 2018 03:14:18 +0800
Subject: dt-bindings: net: dwmac-sun8i: Clean up clock delay chain
 descriptions

The clock delay chains found in the glue layer for dwmac-sun8i are only
used with RGMII PHYs. They are not intended for non-RGMII PHYs, such as
MII external PHYs or the internal PHY. Also, a recent SoC has a smaller
range of possible values for the delay chain.

This patch reformats the delay chain section of the device tree binding
to make it clear that the delay chains only apply to RGMII PHYs, and
make it easier to add the R40-specific bits later.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
Reviewed-by: Rob Herring <robh@kernel.org>
Acked-by: Maxime Ripard <maxime.ripard@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/devicetree/bindings/net/dwmac-sun8i.txt | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/dwmac-sun8i.txt b/Documentation/devicetree/bindings/net/dwmac-sun8i.txt
index 3d6d5fa0c4d5..e04ce75e24a3 100644
--- a/Documentation/devicetree/bindings/net/dwmac-sun8i.txt
+++ b/Documentation/devicetree/bindings/net/dwmac-sun8i.txt
@@ -28,10 +28,13 @@ Required properties:
   - allwinner,sun8i-a83t-system-controller
 
 Optional properties:
-- allwinner,tx-delay-ps: TX clock delay chain value in ps. Range value is 0-700. Default is 0)
-- allwinner,rx-delay-ps: RX clock delay chain value in ps. Range value is 0-3100. Default is 0)
-Both delay properties need to be a multiple of 100. They control the delay for
-external PHY.
+- allwinner,tx-delay-ps: TX clock delay chain value in ps.
+			 Range is 0-700. Default is 0.
+- allwinner,rx-delay-ps: RX clock delay chain value in ps.
+			 Range is 0-3100. Default is 0.
+Both delay properties need to be a multiple of 100. They control the
+clock delay for external RGMII PHY. They do not apply to the internal
+PHY or external non-RGMII PHYs.
 
 Optional properties for the following compatibles:
   - "allwinner,sun8i-h3-emac",
-- 
cgit v1.2.3-59-g8ed1b


From 9ed3fec3c336b71d532aaeda8d3239246aa43d61 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Mon, 14 May 2018 03:14:19 +0800
Subject: dt-bindings: net: dwmac-sun8i: Sort syscon compatibles by
 alphabetical order

The A83T syscon compatible was appended to the syscon compatibles list,
instead of inserted in to preserve the ordering.

Move it to the proper place to keep the list sorted.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
Reviewed-by: Rob Herring <robh@kernel.org>
Acked-by: Maxime Ripard <maxime.ripard@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/devicetree/bindings/net/dwmac-sun8i.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/net/dwmac-sun8i.txt b/Documentation/devicetree/bindings/net/dwmac-sun8i.txt
index e04ce75e24a3..1b8e33e71651 100644
--- a/Documentation/devicetree/bindings/net/dwmac-sun8i.txt
+++ b/Documentation/devicetree/bindings/net/dwmac-sun8i.txt
@@ -22,10 +22,10 @@ Required properties:
 - #size-cells: shall be 0
 - syscon: A phandle to the syscon of the SoC with one of the following
  compatible string:
+  - allwinner,sun8i-a83t-system-controller
   - allwinner,sun8i-h3-system-controller
   - allwinner,sun8i-v3s-system-controller
   - allwinner,sun50i-a64-system-controller
-  - allwinner,sun8i-a83t-system-controller
 
 Optional properties:
 - allwinner,tx-delay-ps: TX clock delay chain value in ps.
-- 
cgit v1.2.3-59-g8ed1b


From a6fe692e6eb554eb6f9e097142c7b7099edd203f Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Mon, 14 May 2018 03:14:20 +0800
Subject: dt-bindings: net: dwmac-sun8i: simplify description of syscon
 property

The syscon property is used to point to the device that holds the glue
layer control register known as the "EMAC (or GMAC) clock register".

We do not need to explicitly list what compatible strings are needed, as
this information is readily available in the user manuals. Also the
"syscon" device type is more of an implementation detail. There are many
ways to access a register not in a device's address range, the syscon
interface being the most generic and unrestricted one.

Simplify the description so that it says what it is supposed to
describe.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
Reviewed-by: Rob Herring <robh@kernel.org>
Acked-by: Maxime Ripard <maxime.ripard@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/devicetree/bindings/net/dwmac-sun8i.txt | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/dwmac-sun8i.txt b/Documentation/devicetree/bindings/net/dwmac-sun8i.txt
index 1b8e33e71651..1c0906a5c02b 100644
--- a/Documentation/devicetree/bindings/net/dwmac-sun8i.txt
+++ b/Documentation/devicetree/bindings/net/dwmac-sun8i.txt
@@ -20,12 +20,7 @@ Required properties:
 - phy-handle: See ethernet.txt
 - #address-cells: shall be 1
 - #size-cells: shall be 0
-- syscon: A phandle to the syscon of the SoC with one of the following
- compatible string:
-  - allwinner,sun8i-a83t-system-controller
-  - allwinner,sun8i-h3-system-controller
-  - allwinner,sun8i-v3s-system-controller
-  - allwinner,sun50i-a64-system-controller
+- syscon: A phandle to the device containing the EMAC or GMAC clock register
 
 Optional properties:
 - allwinner,tx-delay-ps: TX clock delay chain value in ps.
-- 
cgit v1.2.3-59-g8ed1b


From eef8811d9219d197d158cda9233ce2f78ea0a790 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Mon, 14 May 2018 03:14:21 +0800
Subject: dt-bindings: net: dwmac-sun8i: Add binding for GMAC on Allwinner R40
 SoC

The Allwinner R40 SoC has the EMAC controller supported by dwmac-sun8i.
It is named "GMAC", while EMAC refers to the 10/100 Mbps Ethernet
controller supported by sun4i-emac. The controller is the same, but
the R40 has the glue layer controls in the clock control unit (CCU),
with a reduced RX delay chain, and no TX delay chain.

This patch adds the R40 specific bits to the dwmac-sun8i binding.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
Reviewed-by: Rob Herring <robh@kernel.org>
Acked-by: Maxime Ripard <maxime.ripard@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/devicetree/bindings/net/dwmac-sun8i.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Documentation/devicetree/bindings/net/dwmac-sun8i.txt b/Documentation/devicetree/bindings/net/dwmac-sun8i.txt
index 1c0906a5c02b..cfe724398a12 100644
--- a/Documentation/devicetree/bindings/net/dwmac-sun8i.txt
+++ b/Documentation/devicetree/bindings/net/dwmac-sun8i.txt
@@ -7,6 +7,7 @@ Required properties:
 - compatible: must be one of the following string:
 		"allwinner,sun8i-a83t-emac"
 		"allwinner,sun8i-h3-emac"
+		"allwinner,sun8i-r40-gmac"
 		"allwinner,sun8i-v3s-emac"
 		"allwinner,sun50i-a64-emac"
 - reg: address and length of the register for the device.
@@ -25,8 +26,10 @@ Required properties:
 Optional properties:
 - allwinner,tx-delay-ps: TX clock delay chain value in ps.
 			 Range is 0-700. Default is 0.
+			 Unavailable for allwinner,sun8i-r40-gmac
 - allwinner,rx-delay-ps: RX clock delay chain value in ps.
 			 Range is 0-3100. Default is 0.
+			 Range is 0-700 for allwinner,sun8i-r40-gmac
 Both delay properties need to be a multiple of 100. They control the
 clock delay for external RGMII PHY. They do not apply to the internal
 PHY or external non-RGMII PHYs.
-- 
cgit v1.2.3-59-g8ed1b


From 25ae15fb0ecda594306937ebf9ea65b7d0c4f592 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Mon, 14 May 2018 03:14:22 +0800
Subject: net: stmmac: dwmac-sun8i: Use regmap_field for syscon register access

On the Allwinner R40, the "GMAC clock" register is located in the CCU
block, at a different register address than the other SoCs that have
it in the "system control" block.

This patch converts the use of regmap to regmap_field for mapping and
accessing the syscon register, so we can have the register address in
the variants data, and not in the actual register manipulation code.

This patch only converts regmap_read() and regmap_write() calls to
regmap_field_read() and regmap_field_write() calls. There are some
places where it might make sense to switch to regmap_field_update_bits(),
but this is not done here to keep the patch simple.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
Acked-by: Maxime Ripard <maxime.ripard@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 42 +++++++++++++++++------
 1 file changed, 31 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
index a3fa65b1ca8e..bbc051474806 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
@@ -42,6 +42,7 @@
  *				This value is used for disabling properly EMAC
  *				and used as a good starting value in case of the
  *				boot process(uboot) leave some stuff.
+ * @syscon_field		reg_field for the syscon's gmac register
  * @soc_has_internal_phy:	Does the MAC embed an internal PHY
  * @support_mii:		Does the MAC handle MII
  * @support_rmii:		Does the MAC handle RMII
@@ -49,6 +50,7 @@
  */
 struct emac_variant {
 	u32 default_syscon_value;
+	const struct reg_field *syscon_field;
 	bool soc_has_internal_phy;
 	bool support_mii;
 	bool support_rmii;
@@ -71,13 +73,21 @@ struct sunxi_priv_data {
 	struct regulator *regulator;
 	struct reset_control *rst_ephy;
 	const struct emac_variant *variant;
-	struct regmap *regmap;
+	struct regmap_field *regmap_field;
 	bool internal_phy_powered;
 	void *mux_handle;
 };
 
+/* EMAC clock register @ 0x30 in the "system control" address range */
+static const struct reg_field sun8i_syscon_reg_field = {
+	.reg = 0x30,
+	.lsb = 0,
+	.msb = 31,
+};
+
 static const struct emac_variant emac_variant_h3 = {
 	.default_syscon_value = 0x58000,
+	.syscon_field = &sun8i_syscon_reg_field,
 	.soc_has_internal_phy = true,
 	.support_mii = true,
 	.support_rmii = true,
@@ -86,12 +96,14 @@ static const struct emac_variant emac_variant_h3 = {
 
 static const struct emac_variant emac_variant_v3s = {
 	.default_syscon_value = 0x38000,
+	.syscon_field = &sun8i_syscon_reg_field,
 	.soc_has_internal_phy = true,
 	.support_mii = true
 };
 
 static const struct emac_variant emac_variant_a83t = {
 	.default_syscon_value = 0,
+	.syscon_field = &sun8i_syscon_reg_field,
 	.soc_has_internal_phy = false,
 	.support_mii = true,
 	.support_rgmii = true
@@ -99,6 +111,7 @@ static const struct emac_variant emac_variant_a83t = {
 
 static const struct emac_variant emac_variant_a64 = {
 	.default_syscon_value = 0,
+	.syscon_field = &sun8i_syscon_reg_field,
 	.soc_has_internal_phy = false,
 	.support_mii = true,
 	.support_rmii = true,
@@ -216,7 +229,6 @@ static const struct emac_variant emac_variant_a64 = {
 #define SYSCON_ETCS_MII		0x0
 #define SYSCON_ETCS_EXT_GMII	0x1
 #define SYSCON_ETCS_INT_GMII	0x2
-#define SYSCON_EMAC_REG		0x30
 
 /* sun8i_dwmac_dma_reset() - reset the EMAC
  * Called from stmmac via stmmac_dma_ops->reset
@@ -745,7 +757,7 @@ static int mdio_mux_syscon_switch_fn(int current_child, int desired_child,
 	bool need_power_ephy = false;
 
 	if (current_child ^ desired_child) {
-		regmap_read(gmac->regmap, SYSCON_EMAC_REG, &reg);
+		regmap_field_read(gmac->regmap_field, &reg);
 		switch (desired_child) {
 		case DWMAC_SUN8I_MDIO_MUX_INTERNAL_ID:
 			dev_info(priv->device, "Switch mux to internal PHY");
@@ -763,7 +775,7 @@ static int mdio_mux_syscon_switch_fn(int current_child, int desired_child,
 				desired_child);
 			return -EINVAL;
 		}
-		regmap_write(gmac->regmap, SYSCON_EMAC_REG, val);
+		regmap_field_write(gmac->regmap_field, val);
 		if (need_power_ephy) {
 			ret = sun8i_dwmac_power_internal_phy(priv);
 			if (ret)
@@ -801,7 +813,7 @@ static int sun8i_dwmac_set_syscon(struct stmmac_priv *priv)
 	int ret;
 	u32 reg, val;
 
-	regmap_read(gmac->regmap, SYSCON_EMAC_REG, &val);
+	regmap_field_read(gmac->regmap_field, &val);
 	reg = gmac->variant->default_syscon_value;
 	if (reg != val)
 		dev_warn(priv->device,
@@ -883,7 +895,7 @@ static int sun8i_dwmac_set_syscon(struct stmmac_priv *priv)
 		return -EINVAL;
 	}
 
-	regmap_write(gmac->regmap, SYSCON_EMAC_REG, reg);
+	regmap_field_write(gmac->regmap_field, reg);
 
 	return 0;
 }
@@ -892,7 +904,7 @@ static void sun8i_dwmac_unset_syscon(struct sunxi_priv_data *gmac)
 {
 	u32 reg = gmac->variant->default_syscon_value;
 
-	regmap_write(gmac->regmap, SYSCON_EMAC_REG, reg);
+	regmap_field_write(gmac->regmap_field, reg);
 }
 
 static void sun8i_dwmac_exit(struct platform_device *pdev, void *priv)
@@ -980,6 +992,7 @@ static int sun8i_dwmac_probe(struct platform_device *pdev)
 	int ret;
 	struct stmmac_priv *priv;
 	struct net_device *ndev;
+	struct regmap *regmap;
 
 	ret = stmmac_get_platform_resources(pdev, &stmmac_res);
 	if (ret)
@@ -1014,14 +1027,21 @@ static int sun8i_dwmac_probe(struct platform_device *pdev)
 		gmac->regulator = NULL;
 	}
 
-	gmac->regmap = syscon_regmap_lookup_by_phandle(pdev->dev.of_node,
-						       "syscon");
-	if (IS_ERR(gmac->regmap)) {
-		ret = PTR_ERR(gmac->regmap);
+	regmap = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, "syscon");
+	if (IS_ERR(regmap)) {
+		ret = PTR_ERR(regmap);
 		dev_err(&pdev->dev, "Unable to map syscon: %d\n", ret);
 		return ret;
 	}
 
+	gmac->regmap_field = devm_regmap_field_alloc(dev, regmap,
+						     *gmac->variant->syscon_field);
+	if (IS_ERR(gmac->regmap_field)) {
+		ret = PTR_ERR(gmac->regmap_field);
+		dev_err(dev, "Unable to map syscon register: %d\n", ret);
+		return ret;
+	}
+
 	plat_dat->interface = of_get_phy_mode(dev->of_node);
 
 	/* platform data specifying hardware features and callbacks.
-- 
cgit v1.2.3-59-g8ed1b


From 49a06cae6e7ceb77f17914c4617309e038c24c4d Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Mon, 14 May 2018 03:14:23 +0800
Subject: net: stmmac: dwmac-sun8i: Allow getting syscon regmap from external
 device

On the Allwinner R40 SoC, the "GMAC clock" register is in the CCU
address space. Using a standard syscon to access it provides no
coordination with the CCU driver for register access. Neither does
it prevent this and other drivers from accessing other, maybe critical,
clock control registers. On other SoCs, the register is in the "system
control" address space, which might also contain controls for mapping
SRAM to devices or the CPU. This hardware has the same issues.

Instead, for these types of setups, we let the device containing the
control register create a regmap tied to it. We can then get the device
from the existing syscon phandle, and retrieve the regmap with
dev_get_regmap().

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
Acked-by: Maxime Ripard <maxime.ripard@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 50 ++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
index bbc051474806..79e104a20e20 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
@@ -983,6 +983,34 @@ static struct mac_device_info *sun8i_dwmac_setup(void *ppriv)
 	return mac;
 }
 
+static struct regmap *sun8i_dwmac_get_syscon_from_dev(struct device_node *node)
+{
+	struct device_node *syscon_node;
+	struct platform_device *syscon_pdev;
+	struct regmap *regmap = NULL;
+
+	syscon_node = of_parse_phandle(node, "syscon", 0);
+	if (!syscon_node)
+		return ERR_PTR(-ENODEV);
+
+	syscon_pdev = of_find_device_by_node(syscon_node);
+	if (!syscon_pdev) {
+		/* platform device might not be probed yet */
+		regmap = ERR_PTR(-EPROBE_DEFER);
+		goto out_put_node;
+	}
+
+	/* If no regmap is found then the other device driver is at fault */
+	regmap = dev_get_regmap(&syscon_pdev->dev, NULL);
+	if (!regmap)
+		regmap = ERR_PTR(-EINVAL);
+
+	platform_device_put(syscon_pdev);
+out_put_node:
+	of_node_put(syscon_node);
+	return regmap;
+}
+
 static int sun8i_dwmac_probe(struct platform_device *pdev)
 {
 	struct plat_stmmacenet_data *plat_dat;
@@ -1027,7 +1055,27 @@ static int sun8i_dwmac_probe(struct platform_device *pdev)
 		gmac->regulator = NULL;
 	}
 
-	regmap = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, "syscon");
+	/* The "GMAC clock control" register might be located in the
+	 * CCU address range (on the R40), or the system control address
+	 * range (on most other sun8i and later SoCs).
+	 *
+	 * The former controls most if not all clocks in the SoC. The
+	 * latter has an SoC identification register, and on some SoCs,
+	 * controls to map device specific SRAM to either the intended
+	 * peripheral, or the CPU address space.
+	 *
+	 * In either case, there should be a coordinated and restricted
+	 * method of accessing the register needed here. This is done by
+	 * having the device export a custom regmap, instead of a generic
+	 * syscon, which grants all access to all registers.
+	 *
+	 * To support old device trees, we fall back to using the syscon
+	 * interface if possible.
+	 */
+	regmap = sun8i_dwmac_get_syscon_from_dev(pdev->dev.of_node);
+	if (IS_ERR(regmap))
+		regmap = syscon_regmap_lookup_by_phandle(pdev->dev.of_node,
+							 "syscon");
 	if (IS_ERR(regmap)) {
 		ret = PTR_ERR(regmap);
 		dev_err(&pdev->dev, "Unable to map syscon: %d\n", ret);
-- 
cgit v1.2.3-59-g8ed1b


From 7b270b725a867587957676a8dae862caad558011 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Mon, 14 May 2018 03:14:24 +0800
Subject: net: stmmac: dwmac-sun8i: Support different ranges for TX/RX delay
 chains

On the R40 SoC, the RX delay chain only has a range of 0~7 (hundred
picoseconds), instead of 0~31. Also the TX delay chain is completely
absent.

This patch adds support for different ranges by adding per-compatible
maximum values in the variant data. A maximum of 0 indicates that the
delay chain is not supported or absent.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
Acked-by: Maxime Ripard <maxime.ripard@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 32 ++++++++++++++++-------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
index 79e104a20e20..4f5612a3c855 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
@@ -47,6 +47,12 @@
  * @support_mii:		Does the MAC handle MII
  * @support_rmii:		Does the MAC handle RMII
  * @support_rgmii:		Does the MAC handle RGMII
+ *
+ * @rx_delay_max:		Maximum raw value for RX delay chain
+ * @tx_delay_max:		Maximum raw value for TX delay chain
+ *				These two also indicate the bitmask for
+ *				the RX and TX delay chain registers. A
+ *				value of zero indicates this is not supported.
  */
 struct emac_variant {
 	u32 default_syscon_value;
@@ -55,6 +61,8 @@ struct emac_variant {
 	bool support_mii;
 	bool support_rmii;
 	bool support_rgmii;
+	u8 rx_delay_max;
+	u8 tx_delay_max;
 };
 
 /* struct sunxi_priv_data - hold all sunxi private data
@@ -91,7 +99,9 @@ static const struct emac_variant emac_variant_h3 = {
 	.soc_has_internal_phy = true,
 	.support_mii = true,
 	.support_rmii = true,
-	.support_rgmii = true
+	.support_rgmii = true,
+	.rx_delay_max = 31,
+	.tx_delay_max = 7,
 };
 
 static const struct emac_variant emac_variant_v3s = {
@@ -106,7 +116,9 @@ static const struct emac_variant emac_variant_a83t = {
 	.syscon_field = &sun8i_syscon_reg_field,
 	.soc_has_internal_phy = false,
 	.support_mii = true,
-	.support_rgmii = true
+	.support_rgmii = true,
+	.rx_delay_max = 31,
+	.tx_delay_max = 7,
 };
 
 static const struct emac_variant emac_variant_a64 = {
@@ -115,7 +127,9 @@ static const struct emac_variant emac_variant_a64 = {
 	.soc_has_internal_phy = false,
 	.support_mii = true,
 	.support_rmii = true,
-	.support_rgmii = true
+	.support_rgmii = true,
+	.rx_delay_max = 31,
+	.tx_delay_max = 7,
 };
 
 #define EMAC_BASIC_CTL0 0x00
@@ -219,9 +233,7 @@ static const struct emac_variant emac_variant_a64 = {
 #define SYSCON_RMII_EN		BIT(13) /* 1: enable RMII (overrides EPIT) */
 
 /* Generic system control EMAC_CLK bits */
-#define SYSCON_ETXDC_MASK		GENMASK(2, 0)
 #define SYSCON_ETXDC_SHIFT		10
-#define SYSCON_ERXDC_MASK		GENMASK(4, 0)
 #define SYSCON_ERXDC_SHIFT		5
 /* EMAC PHY Interface Type */
 #define SYSCON_EPIT			BIT(2) /* 1: RGMII, 0: MII */
@@ -847,8 +859,9 @@ static int sun8i_dwmac_set_syscon(struct stmmac_priv *priv)
 		}
 		val /= 100;
 		dev_dbg(priv->device, "set tx-delay to %x\n", val);
-		if (val <= SYSCON_ETXDC_MASK) {
-			reg &= ~(SYSCON_ETXDC_MASK << SYSCON_ETXDC_SHIFT);
+		if (val <= gmac->variant->tx_delay_max) {
+			reg &= ~(gmac->variant->tx_delay_max <<
+				 SYSCON_ETXDC_SHIFT);
 			reg |= (val << SYSCON_ETXDC_SHIFT);
 		} else {
 			dev_err(priv->device, "Invalid TX clock delay: %d\n",
@@ -864,8 +877,9 @@ static int sun8i_dwmac_set_syscon(struct stmmac_priv *priv)
 		}
 		val /= 100;
 		dev_dbg(priv->device, "set rx-delay to %x\n", val);
-		if (val <= SYSCON_ERXDC_MASK) {
-			reg &= ~(SYSCON_ERXDC_MASK << SYSCON_ERXDC_SHIFT);
+		if (val <= gmac->variant->rx_delay_max) {
+			reg &= ~(gmac->variant->rx_delay_max <<
+				 SYSCON_ERXDC_SHIFT);
 			reg |= (val << SYSCON_ERXDC_SHIFT);
 		} else {
 			dev_err(priv->device, "Invalid RX clock delay: %d\n",
-- 
cgit v1.2.3-59-g8ed1b


From 9bf5085aaaadd6488a0ea6121e0cb7dadaf0652a Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Mon, 14 May 2018 03:14:25 +0800
Subject: net: stmmac: dwmac-sun8i: Add support for GMAC on Allwinner R40 SoC

The Allwinner R40 SoC has the EMAC controller supported by dwmac-sun8i.
It is named "GMAC", while EMAC refers to the 10/100 Mbps Ethernet
controller supported by sun4i-emac. The controller is the same, but
the R40 has the glue layer controls in the clock control unit (CCU),
with a reduced RX delay chain, and no TX delay chain.

This patch adds support for it using the framework laid out by previous
patches to map the differences.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
Acked-by: Maxime Ripard <maxime.ripard@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
index 4f5612a3c855..2f7f0915f071 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
@@ -93,6 +93,13 @@ static const struct reg_field sun8i_syscon_reg_field = {
 	.msb = 31,
 };
 
+/* EMAC clock register @ 0x164 in the CCU address range */
+static const struct reg_field sun8i_ccu_reg_field = {
+	.reg = 0x164,
+	.lsb = 0,
+	.msb = 31,
+};
+
 static const struct emac_variant emac_variant_h3 = {
 	.default_syscon_value = 0x58000,
 	.syscon_field = &sun8i_syscon_reg_field,
@@ -121,6 +128,14 @@ static const struct emac_variant emac_variant_a83t = {
 	.tx_delay_max = 7,
 };
 
+static const struct emac_variant emac_variant_r40 = {
+	.default_syscon_value = 0,
+	.syscon_field = &sun8i_ccu_reg_field,
+	.support_mii = true,
+	.support_rgmii = true,
+	.rx_delay_max = 7,
+};
+
 static const struct emac_variant emac_variant_a64 = {
 	.default_syscon_value = 0,
 	.syscon_field = &sun8i_syscon_reg_field,
@@ -1160,6 +1175,8 @@ static const struct of_device_id sun8i_dwmac_match[] = {
 		.data = &emac_variant_v3s },
 	{ .compatible = "allwinner,sun8i-a83t-emac",
 		.data = &emac_variant_a83t },
+	{ .compatible = "allwinner,sun8i-r40-gmac",
+		.data = &emac_variant_r40 },
 	{ .compatible = "allwinner,sun50i-a64-emac",
 		.data = &emac_variant_a64 },
 	{ }
-- 
cgit v1.2.3-59-g8ed1b


From 81c7288b170a59d01427fa33d6591da1dbf59277 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Sun, 13 May 2018 17:44:27 -0300
Subject: sched: cls: enable verbose logging

Currently, when the rule is not to be exclusively executed by the
hardware, extack is not passed along and offloading failures don't
get logged. The idea was that hardware failures are okay because the
rule will get executed in software then and this way it doesn't confuse
unware users.

But this is not helpful in case one needs to understand why a certain
rule failed to get offloaded. Considering it may have been a temporary
failure, like resources exceeded or so, reproducing it later and knowing
that it is triggering the same reason may be challenging.

The ultimate goal is to improve Open vSwitch debuggability when using
flower offloading.

This patch adds a new flag to enable verbose logging. With the flag set,
extack will be passed to the driver, which will be able to log the
error. As the operation itself probably won't fail (not because of this,
at least), current iproute will already log it as a Warning.

The flag is generic, so it can be reused later. No need to restrict it
just for HW offloading. The command line will follow the syntax that
tc-ebpf already uses, tc ... [ verbose ] ... , and extend its meaning.

For example:
# ./tc qdisc add dev p7p1 ingress
# ./tc filter add dev p7p1 parent ffff: protocol ip prio 1 \
	flower verbose \
	src_mac ed:13:db:00:00:00 dst_mac 01:80:c2:00:00:d0 \
	src_ip 56.0.0.0 dst_ip 55.0.0.0 action drop
Warning: TC offload is disabled on net device.
# echo $?
0
# ./tc filter add dev p7p1 parent ffff: protocol ip prio 1 \
	flower \
	src_mac ff:13:db:00:00:00 dst_mac 01:80:c2:00:00:d0 \
	src_ip 56.0.0.0 dst_ip 55.0.0.0 action drop
# echo $?
0

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h        | 6 ++++--
 include/uapi/linux/pkt_cls.h | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index e828d31be5da..0005f0b40fe9 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -683,9 +683,11 @@ static inline bool tc_skip_sw(u32 flags)
 /* SKIP_HW and SKIP_SW are mutually exclusive flags. */
 static inline bool tc_flags_valid(u32 flags)
 {
-	if (flags & ~(TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW))
+	if (flags & ~(TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW |
+		      TCA_CLS_FLAGS_VERBOSE))
 		return false;
 
+	flags &= TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW;
 	if (!(flags ^ (TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW)))
 		return false;
 
@@ -705,7 +707,7 @@ tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common,
 	cls_common->chain_index = tp->chain->index;
 	cls_common->protocol = tp->protocol;
 	cls_common->prio = tp->prio;
-	if (tc_skip_sw(flags))
+	if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_VERBOSE)
 		cls_common->extack = extack;
 }
 
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index be05e66c167b..84e4c1d0f874 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -129,6 +129,7 @@ enum {
 #define TCA_CLS_FLAGS_SKIP_SW	(1 << 1) /* don't use filter in SW */
 #define TCA_CLS_FLAGS_IN_HW	(1 << 2) /* filter is offloaded to HW */
 #define TCA_CLS_FLAGS_NOT_IN_HW (1 << 3) /* filter isn't offloaded to HW */
+#define TCA_CLS_FLAGS_VERBOSE	(1 << 4) /* verbose logging */
 
 /* U32 filters */
 
-- 
cgit v1.2.3-59-g8ed1b


From bde4c563a93f643cee9854b0d354710e1ff69284 Mon Sep 17 00:00:00 2001
From: Hernán Gonzalez <hernan@vanguardiasur.com.ar>
Date: Sun, 13 May 2018 20:33:49 -0300
Subject: net: ethernet: ti: Use ERR_CAST instead of ERR_PTR(PTR_ERR())
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use ERR_CAST inlined function instead of ERR_PTR(PTR_ERR(...)).

drivers/net/ethernet/ti/cpts.c:567:9-16: WARNING: ERR_CAST can be used with cpts->refclk
Generated by: scripts/coccinelle/api/err_cast.cocci

Signed-off-by: Hernán Gonzalez <hernan@vanguardiasur.com.ar>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/cpts.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c
index e7b76f6b4f67..7842f094f2ef 100644
--- a/drivers/net/ethernet/ti/cpts.c
+++ b/drivers/net/ethernet/ti/cpts.c
@@ -564,7 +564,7 @@ struct cpts *cpts_create(struct device *dev, void __iomem *regs,
 	cpts->refclk = devm_clk_get(dev, "cpts");
 	if (IS_ERR(cpts->refclk)) {
 		dev_err(dev, "Failed to get cpts refclk\n");
-		return ERR_PTR(PTR_ERR(cpts->refclk));
+		return ERR_CAST(cpts->refclk);
 	}
 
 	clk_prepare(cpts->refclk);
-- 
cgit v1.2.3-59-g8ed1b


From 55c0211dcb5d33955ed977de81373cef91de665b Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Mon, 14 May 2018 09:40:44 +0300
Subject: mlxsw: spectrum_span: Support LAG under mirror-to-gretap

When resolving a path that the packet will take after being encapsulated
in mirror-to-gretap scenarios, one of the devices en route could be a
LAG. In that case, mirror to first up slave that corresponds to a front
panel port.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
index e5f4f7620ab7..da3f7f527360 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
@@ -245,6 +245,19 @@ mlxsw_sp_span_entry_vlan(const struct net_device *vlan_dev,
 	return vlan_dev_real_dev(vlan_dev);
 }
 
+static struct net_device *
+mlxsw_sp_span_entry_lag(struct net_device *lag_dev)
+{
+	struct net_device *dev;
+	struct list_head *iter;
+
+	netdev_for_each_lower_dev(lag_dev, dev, iter)
+		if ((dev->flags & IFF_UP) && mlxsw_sp_port_dev_check(dev))
+			return dev;
+
+	return NULL;
+}
+
 static __maybe_unused int
 mlxsw_sp_span_entry_tunnel_parms_common(struct net_device *edev,
 					union mlxsw_sp_l3addr saddr,
@@ -278,6 +291,14 @@ mlxsw_sp_span_entry_tunnel_parms_common(struct net_device *edev,
 		edev = mlxsw_sp_span_entry_vlan(edev, &vid);
 	}
 
+	if (netif_is_lag_master(edev)) {
+		if (!(edev->flags & IFF_UP))
+			goto unoffloadable;
+		edev = mlxsw_sp_span_entry_lag(edev);
+		if (!edev)
+			goto unoffloadable;
+	}
+
 	if (!mlxsw_sp_port_dev_check(edev))
 		goto unoffloadable;
 
-- 
cgit v1.2.3-59-g8ed1b


From 7cfac881660ac7b7229950e0a942201be63c15d1 Mon Sep 17 00:00:00 2001
From: Arjun Vynipadath <arjun@chelsio.com>
Date: Mon, 14 May 2018 13:24:43 +0530
Subject: cxgb4: do not fail vf instatiation in slave mode

We no longer require a check for cxgb4 to be MASTER
when configuring SRIOV, It was required when we had
module parameter to instantiate vf.

Signed-off-by: Arjun Vynipadath <arjun@chelsio.com>
Signed-off-by: Casey Leedom <leedom@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 07baa5e1a8c5..130d1eed7993 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -5276,13 +5276,9 @@ static int cxgb4_iov_configure(struct pci_dev *pdev, int num_vfs)
 	u32 pcie_fw;
 
 	pcie_fw = readl(adap->regs + PCIE_FW_A);
-	/* Check if cxgb4 is the MASTER and fw is initialized */
-	if (num_vfs &&
-	    (!(pcie_fw & PCIE_FW_INIT_F) ||
-	    !(pcie_fw & PCIE_FW_MASTER_VLD_F) ||
-	    PCIE_FW_MASTER_G(pcie_fw) != CXGB4_UNIFIED_PF)) {
-		dev_warn(&pdev->dev,
-			 "cxgb4 driver needs to be MASTER to support SRIOV\n");
+	/* Check if fw is initialized */
+	if (!(pcie_fw & PCIE_FW_INIT_F)) {
+		dev_warn(&pdev->dev, "Device not initialized\n");
 		return -EOPNOTSUPP;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From bae77c5eb5b2107e300fb02da2311f2aa0d8ee3c Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Mon, 7 May 2018 10:50:48 -0700
Subject: bpf: enable stackmap with build_id in nmi context

Currently, we cannot parse build_id in nmi context because of
up_read(&current->mm->mmap_sem), this makes stackmap with build_id
less useful. This patch enables parsing build_id in nmi by putting
the up_read() call in irq_work. To avoid memory allocation in nmi
context, we use per cpu variable for the irq_work. As a result, only
one irq_work per cpu is allowed. If the irq_work is in-use, we
fallback to only report ips.

Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 init/Kconfig          |  1 +
 kernel/bpf/stackmap.c | 59 +++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 54 insertions(+), 6 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index f013afc74b11..480a4f2713d9 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1391,6 +1391,7 @@ config BPF_SYSCALL
 	bool "Enable bpf() system call"
 	select ANON_INODES
 	select BPF
+	select IRQ_WORK
 	default n
 	help
 	  Enable the bpf() system call that allows to manipulate eBPF
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 3ba102b41512..b59ace0f0f09 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -11,6 +11,7 @@
 #include <linux/perf_event.h>
 #include <linux/elf.h>
 #include <linux/pagemap.h>
+#include <linux/irq_work.h>
 #include "percpu_freelist.h"
 
 #define STACK_CREATE_FLAG_MASK					\
@@ -32,6 +33,23 @@ struct bpf_stack_map {
 	struct stack_map_bucket *buckets[];
 };
 
+/* irq_work to run up_read() for build_id lookup in nmi context */
+struct stack_map_irq_work {
+	struct irq_work irq_work;
+	struct rw_semaphore *sem;
+};
+
+static void do_up_read(struct irq_work *entry)
+{
+	struct stack_map_irq_work *work;
+
+	work = container_of(entry, struct stack_map_irq_work, irq_work);
+	up_read(work->sem);
+	work->sem = NULL;
+}
+
+static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);
+
 static inline bool stack_map_use_build_id(struct bpf_map *map)
 {
 	return (map->map_flags & BPF_F_STACK_BUILD_ID);
@@ -267,17 +285,27 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 {
 	int i;
 	struct vm_area_struct *vma;
+	bool in_nmi_ctx = in_nmi();
+	bool irq_work_busy = false;
+	struct stack_map_irq_work *work;
+
+	if (in_nmi_ctx) {
+		work = this_cpu_ptr(&up_read_work);
+		if (work->irq_work.flags & IRQ_WORK_BUSY)
+			/* cannot queue more up_read, fallback */
+			irq_work_busy = true;
+	}
 
 	/*
-	 * We cannot do up_read() in nmi context, so build_id lookup is
-	 * only supported for non-nmi events. If at some point, it is
-	 * possible to run find_vma() without taking the semaphore, we
-	 * would like to allow build_id lookup in nmi context.
+	 * We cannot do up_read() in nmi context. To do build_id lookup
+	 * in nmi context, we need to run up_read() in irq_work. We use
+	 * a percpu variable to do the irq_work. If the irq_work is
+	 * already used by another lookup, we fall back to report ips.
 	 *
 	 * Same fallback is used for kernel stack (!user) on a stackmap
 	 * with build_id.
 	 */
-	if (!user || !current || !current->mm || in_nmi() ||
+	if (!user || !current || !current->mm || irq_work_busy ||
 	    down_read_trylock(&current->mm->mmap_sem) == 0) {
 		/* cannot access current->mm, fall back to ips */
 		for (i = 0; i < trace_nr; i++) {
@@ -299,7 +327,13 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 			- vma->vm_start;
 		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
 	}
-	up_read(&current->mm->mmap_sem);
+
+	if (!in_nmi_ctx) {
+		up_read(&current->mm->mmap_sem);
+	} else {
+		work->sem = &current->mm->mmap_sem;
+		irq_work_queue(&work->irq_work);
+	}
 }
 
 BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
@@ -575,3 +609,16 @@ const struct bpf_map_ops stack_map_ops = {
 	.map_update_elem = stack_map_update_elem,
 	.map_delete_elem = stack_map_delete_elem,
 };
+
+static int __init stack_map_init(void)
+{
+	int cpu;
+	struct stack_map_irq_work *work;
+
+	for_each_possible_cpu(cpu) {
+		work = per_cpu_ptr(&up_read_work, cpu);
+		init_irq_work(&work->irq_work, do_up_read);
+	}
+	return 0;
+}
+subsys_initcall(stack_map_init);
-- 
cgit v1.2.3-59-g8ed1b


From 13790d1cc72c22f6e7a1248e5f6d20d720797e95 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Mon, 7 May 2018 10:50:49 -0700
Subject: bpf: add selftest for stackmap with build_id in NMI context

This new test captures stackmap with build_id with hardware event
PERF_COUNT_HW_CPU_CYCLES.

Because we only support one ips-to-build_id lookup per cpu in NMI
context, stack_amap will not be able to do the lookup in this test.
Therefore, we didn't do compare_stack_ips(), as it will alwasy fail.

urandom_read.c is extended to run configurable cycles so that it can be
caught by the perf event.

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_progs.c   | 134 +++++++++++++++++++++++++++++
 tools/testing/selftests/bpf/urandom_read.c |  10 ++-
 2 files changed, 142 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index f7731973ec68..3ecf733330c1 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -1272,6 +1272,139 @@ out:
 	return;
 }
 
+static void test_stacktrace_build_id_nmi(void)
+{
+	int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd;
+	const char *file = "./test_stacktrace_build_id.o";
+	int err, pmu_fd, prog_fd;
+	struct perf_event_attr attr = {
+		.sample_freq = 5000,
+		.freq = 1,
+		.type = PERF_TYPE_HARDWARE,
+		.config = PERF_COUNT_HW_CPU_CYCLES,
+	};
+	__u32 key, previous_key, val, duration = 0;
+	struct bpf_object *obj;
+	char buf[256];
+	int i, j;
+	struct bpf_stack_build_id id_offs[PERF_MAX_STACK_DEPTH];
+	int build_id_matches = 0;
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_PERF_EVENT, &obj, &prog_fd);
+	if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno))
+		return;
+
+	pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
+			 0 /* cpu 0 */, -1 /* group id */,
+			 0 /* flags */);
+	if (CHECK(pmu_fd < 0, "perf_event_open",
+		  "err %d errno %d. Does the test host support PERF_COUNT_HW_CPU_CYCLES?\n",
+		  pmu_fd, errno))
+		goto close_prog;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+	if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n",
+		  err, errno))
+		goto close_pmu;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
+	if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n",
+		  err, errno))
+		goto disable_pmu;
+
+	/* find map fds */
+	control_map_fd = bpf_find_map(__func__, obj, "control_map");
+	if (CHECK(control_map_fd < 0, "bpf_find_map control_map",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap");
+	if (CHECK(stackid_hmap_fd < 0, "bpf_find_map stackid_hmap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	stackmap_fd = bpf_find_map(__func__, obj, "stackmap");
+	if (CHECK(stackmap_fd < 0, "bpf_find_map stackmap", "err %d errno %d\n",
+		  err, errno))
+		goto disable_pmu;
+
+	stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap");
+	if (CHECK(stack_amap_fd < 0, "bpf_find_map stack_amap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	assert(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null")
+	       == 0);
+	assert(system("taskset 0x1 ./urandom_read 100000") == 0);
+	/* disable stack trace collection */
+	key = 0;
+	val = 1;
+	bpf_map_update_elem(control_map_fd, &key, &val, 0);
+
+	/* for every element in stackid_hmap, we can find a corresponding one
+	 * in stackmap, and vise versa.
+	 */
+	err = compare_map_keys(stackid_hmap_fd, stackmap_fd);
+	if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	err = compare_map_keys(stackmap_fd, stackid_hmap_fd);
+	if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	err = extract_build_id(buf, 256);
+
+	if (CHECK(err, "get build_id with readelf",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	err = bpf_map_get_next_key(stackmap_fd, NULL, &key);
+	if (CHECK(err, "get_next_key from stackmap",
+		  "err %d, errno %d\n", err, errno))
+		goto disable_pmu;
+
+	do {
+		char build_id[64];
+
+		err = bpf_map_lookup_elem(stackmap_fd, &key, id_offs);
+		if (CHECK(err, "lookup_elem from stackmap",
+			  "err %d, errno %d\n", err, errno))
+			goto disable_pmu;
+		for (i = 0; i < PERF_MAX_STACK_DEPTH; ++i)
+			if (id_offs[i].status == BPF_STACK_BUILD_ID_VALID &&
+			    id_offs[i].offset != 0) {
+				for (j = 0; j < 20; ++j)
+					sprintf(build_id + 2 * j, "%02x",
+						id_offs[i].build_id[j] & 0xff);
+				if (strstr(buf, build_id) != NULL)
+					build_id_matches = 1;
+			}
+		previous_key = key;
+	} while (bpf_map_get_next_key(stackmap_fd, &previous_key, &key) == 0);
+
+	if (CHECK(build_id_matches < 1, "build id match",
+		  "Didn't find expected build ID from the map\n"))
+		goto disable_pmu;
+
+	/*
+	 * We intentionally skip compare_stack_ips(). This is because we
+	 * only support one in_nmi() ips-to-build_id translation per cpu
+	 * at any time, thus stack_amap here will always fallback to
+	 * BPF_STACK_BUILD_ID_IP;
+	 */
+
+disable_pmu:
+	ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE);
+
+close_pmu:
+	close(pmu_fd);
+
+close_prog:
+	bpf_object__close(obj);
+}
+
 #define MAX_CNT_RAWTP	10ull
 #define MAX_STACK_RAWTP	100
 struct get_stack_trace_t {
@@ -1425,6 +1558,7 @@ int main(void)
 	test_tp_attach_query();
 	test_stacktrace_map();
 	test_stacktrace_build_id();
+	test_stacktrace_build_id_nmi();
 	test_stacktrace_map_raw_tp();
 	test_get_stack_raw_tp();
 
diff --git a/tools/testing/selftests/bpf/urandom_read.c b/tools/testing/selftests/bpf/urandom_read.c
index 4acfdebf36fa..9de8b7cb4e6d 100644
--- a/tools/testing/selftests/bpf/urandom_read.c
+++ b/tools/testing/selftests/bpf/urandom_read.c
@@ -6,15 +6,21 @@
 #include <stdlib.h>
 
 #define BUF_SIZE 256
-int main(void)
+
+int main(int argc, char *argv[])
 {
 	int fd = open("/dev/urandom", O_RDONLY);
 	int i;
 	char buf[BUF_SIZE];
+	int count = 4;
 
 	if (fd < 0)
 		return 1;
-	for (i = 0; i < 4; ++i)
+
+	if (argc == 2)
+		count = atoi(argv[1]);
+
+	for (i = 0; i < count; ++i)
 		read(fd, buf, BUF_SIZE);
 
 	close(fd);
-- 
cgit v1.2.3-59-g8ed1b


From 53ea24c20cea32b1dc70673402b496c4a5291d2d Mon Sep 17 00:00:00 2001
From: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Date: Mon, 14 May 2018 17:29:15 +0900
Subject: samples/bpf: xdp_monitor, accept short options

Updated optstring parameter for getopt_long() to accept short options.
Also updated usage() function.

Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/bpf/xdp_monitor_user.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c
index 894bc64c2cac..05ad3f590c91 100644
--- a/samples/bpf/xdp_monitor_user.c
+++ b/samples/bpf/xdp_monitor_user.c
@@ -58,7 +58,7 @@ static void usage(char *argv[])
 			printf(" flag (internal value:%d)",
 			       *long_options[i].flag);
 		else
-			printf("(internal short-option: -%c)",
+			printf("short-option: -%c",
 			       long_options[i].val);
 		printf("\n");
 	}
@@ -594,7 +594,7 @@ int main(int argc, char **argv)
 	snprintf(bpf_obj_file, sizeof(bpf_obj_file), "%s_kern.o", argv[0]);
 
 	/* Parse commands line args */
-	while ((opt = getopt_long(argc, argv, "h",
+	while ((opt = getopt_long(argc, argv, "hDSs:",
 				  long_options, &longindex)) != -1) {
 		switch (opt) {
 		case 'D':
-- 
cgit v1.2.3-59-g8ed1b


From b8c931ba3c739b49bd536d35851712d838857757 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Thu, 7 Sep 2017 22:30:58 +0300
Subject: net/mlx5e: Remove redundant vport context vlan update

In delete vlan flow an extra call to mlx5e_vport_context_update_vlans
was added by mistake, remove it.

Fixes: 86d722ad2c3b ("net/mlx5: Use flow steering infrastructure for mlx5_en")
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Reviewed-by: Gal Pressman <galp@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_fs.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
index f64dda2bed31..76cc10e44080 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
@@ -277,7 +277,6 @@ static void mlx5e_del_vlan_rule(struct mlx5e_priv *priv,
 		}
 		break;
 	case MLX5E_VLAN_RULE_TYPE_MATCH_CTAG_VID:
-		mlx5e_vport_context_update_vlans(priv);
 		if (priv->fs.vlan.active_cvlans_rule[vid]) {
 			mlx5_del_flow_rules(priv->fs.vlan.active_cvlans_rule[vid]);
 			priv->fs.vlan.active_cvlans_rule[vid] = NULL;
-- 
cgit v1.2.3-59-g8ed1b


From e36d4810f2e98110e2b0de11ffc60f71a0a26805 Mon Sep 17 00:00:00 2001
From: Roi Dayan <roid@mellanox.com>
Date: Mon, 13 Nov 2017 18:07:50 +0200
Subject: net/mlx5e: Skip redundant checks when providing NUD lastuse feedback

It's redundant to continue the loop if we found one flow whose lastuse value
being newer than the last one we reported, since this is enough for us to
trigger a NUD update (neigh_event_send()).

Signed-off-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index b94276db3ce9..9b0e3ccffa66 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -982,6 +982,8 @@ void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe)
 				}
 			}
 		}
+		if (neigh_used)
+			break;
 	}
 
 	if (neigh_used) {
-- 
cgit v1.2.3-59-g8ed1b


From bd206fd52e039870027d23d6de9f432272fc61d5 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Thu, 23 Nov 2017 14:29:25 +0200
Subject: net/mlx5e: Use u8 instead of int for LRO number of segments

Range of LRO number of segments fits in u8.
Also, bring initialization and declaration together to
save code.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 7bbf0db27a01..e10422b845ca 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -681,11 +681,10 @@ static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe,
 				      struct mlx5e_rq *rq,
 				      struct sk_buff *skb)
 {
+	u8 lro_num_seg = be32_to_cpu(cqe->srqn) >> 24;
 	struct net_device *netdev = rq->netdev;
-	int lro_num_seg;
 
 	skb->mac_len = ETH_HLEN;
-	lro_num_seg = be32_to_cpu(cqe->srqn) >> 24;
 	if (lro_num_seg > 1) {
 		mlx5e_lro_update_hdr(skb, cqe, cqe_bcnt);
 		skb_shinfo(skb)->gso_size = DIV_ROUND_UP(cqe_bcnt, lro_num_seg);
-- 
cgit v1.2.3-59-g8ed1b


From efb6d7a20c856e8714d17f5a4ce509df893c4f01 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Thu, 23 Nov 2017 14:39:22 +0200
Subject: net/mlx5e: Use bool as return type for mlx5e_xdp_handle

Function returns boolean values, use bool instead of int.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index e10422b845ca..abc6ca8168d2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -807,9 +807,9 @@ static inline bool mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq,
 }
 
 /* returns true if packet was consumed by xdp */
-static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq,
-				   struct mlx5e_dma_info *di,
-				   void *va, u16 *rx_headroom, u32 *len)
+static inline bool mlx5e_xdp_handle(struct mlx5e_rq *rq,
+				    struct mlx5e_dma_info *di,
+				    void *va, u16 *rx_headroom, u32 *len)
 {
 	struct bpf_prog *prog = READ_ONCE(rq->xdp_prog);
 	struct xdp_buff xdp;
-- 
cgit v1.2.3-59-g8ed1b


From 4601266095501d2af4a877f6ee0ecf18af8cdcd8 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Sun, 8 Apr 2018 09:45:32 +0300
Subject: net/mlx5e: Remove double defined DMAC header re-write element

The firmware DMAC_47_16 header re-write token was defined twice,
clean it up.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Reported-by: Jianbo Liu <jianbol@mellanox.com>
Reviewed-by: Jianbo Liu <jianbol@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 9b0e3ccffa66..969a1768da71 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -1579,7 +1579,6 @@ struct mlx5_fields {
 		{MLX5_ACTION_IN_FIELD_OUT_ ## fw_field, size, offsetof(struct pedit_headers, field) + (off)}
 
 static struct mlx5_fields fields[] = {
-	OFFLOAD(DMAC_47_16, 4, eth.h_dest[0], 0),
 	OFFLOAD(DMAC_47_16, 4, eth.h_dest[0], 0),
 	OFFLOAD(DMAC_15_0,  2, eth.h_dest[4], 0),
 	OFFLOAD(SMAC_47_16, 4, eth.h_source[0], 0),
-- 
cgit v1.2.3-59-g8ed1b


From b3a433de7b421d700de99ac281d8384494020cd2 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Sun, 8 Apr 2018 09:59:22 +0300
Subject: net/mlx5e: Clean static checker complaints on TC offload and VF reps
 code

Clean warning/check complaints made by checkpatch on en_{tc,rep}.c

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Jianbo Liu <jianbol@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c |  8 ++++----
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c  | 11 +++++------
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 876c3e4c6193..a689f4c90fe3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -681,8 +681,8 @@ static int mlx5e_rep_open(struct net_device *dev)
 		goto unlock;
 
 	if (!mlx5_modify_vport_admin_state(priv->mdev,
-			MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT,
-			rep->vport, MLX5_ESW_VPORT_ADMIN_STATE_UP))
+					   MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT,
+					   rep->vport, MLX5_ESW_VPORT_ADMIN_STATE_UP))
 		netif_carrier_on(dev);
 
 unlock:
@@ -699,8 +699,8 @@ static int mlx5e_rep_close(struct net_device *dev)
 
 	mutex_lock(&priv->state_lock);
 	mlx5_modify_vport_admin_state(priv->mdev,
-			MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT,
-			rep->vport, MLX5_ESW_VPORT_ADMIN_STATE_DOWN);
+				      MLX5_QUERY_VPORT_STATE_IN_OP_MOD_ESW_VPORT,
+				      rep->vport, MLX5_ESW_VPORT_ADMIN_STATE_DOWN);
 	ret = mlx5e_close_locked(dev);
 	mutex_unlock(&priv->state_lock);
 	return ret;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 969a1768da71..85ad0948951c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -97,7 +97,7 @@ enum {
 };
 
 #define MLX5E_TC_TABLE_NUM_GROUPS 4
-#define MLX5E_TC_TABLE_MAX_GROUP_SIZE (1 << 16)
+#define MLX5E_TC_TABLE_MAX_GROUP_SIZE BIT(16)
 
 struct mlx5e_hairpin {
 	struct mlx5_hairpin *pair;
@@ -789,7 +789,7 @@ static void mlx5e_tc_del_nic_flow(struct mlx5e_priv *priv,
 	mlx5_del_flow_rules(flow->rule);
 	mlx5_fc_destroy(priv->mdev, counter);
 
-	if (!mlx5e_tc_num_filters(priv) && (priv->fs.tc.t)) {
+	if (!mlx5e_tc_num_filters(priv) && priv->fs.tc.t) {
 		mlx5_destroy_flow_table(priv->fs.tc.t);
 		priv->fs.tc.t = NULL;
 	}
@@ -1765,12 +1765,12 @@ static int parse_tc_pedit_action(struct mlx5e_priv *priv,
 		err = -EOPNOTSUPP; /* can't be all optimistic */
 
 		if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK) {
-			printk(KERN_WARNING "mlx5: legacy pedit isn't offloaded\n");
+			netdev_warn(priv->netdev, "legacy pedit isn't offloaded\n");
 			goto out_err;
 		}
 
 		if (cmd != TCA_PEDIT_KEY_EX_CMD_SET && cmd != TCA_PEDIT_KEY_EX_CMD_ADD) {
-			printk(KERN_WARNING "mlx5: pedit cmd %d isn't offloaded\n", cmd);
+			netdev_warn(priv->netdev, "pedit cmd %d isn't offloaded\n", cmd);
 			goto out_err;
 		}
 
@@ -1794,8 +1794,7 @@ static int parse_tc_pedit_action(struct mlx5e_priv *priv,
 	for (cmd = 0; cmd < __PEDIT_CMD_MAX; cmd++) {
 		cmd_masks = &masks[cmd];
 		if (memcmp(cmd_masks, &zero_masks, sizeof(zero_masks))) {
-			printk(KERN_WARNING "mlx5: attempt to offload an unsupported field (cmd %d)\n",
-			       cmd);
+			netdev_warn(priv->netdev, "attempt to offload an unsupported field (cmd %d)\n", cmd);
 			print_hex_dump(KERN_WARNING, "mask: ", DUMP_PREFIX_ADDRESS,
 				       16, 1, cmd_masks, sizeof(zero_masks), true);
 			err = -EOPNOTSUPP;
-- 
cgit v1.2.3-59-g8ed1b


From c180f675349de832030eaa03badbac6f347f487a Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Thu, 5 Apr 2018 14:46:02 +0300
Subject: net/mlx5e: Avoid redundant zeroing of offloaded TC flow attributes

This is not needed as the attributes are zeroed out on allocation.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Jianbo Liu <jianbol@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 85ad0948951c..7d88e813bad1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -1923,7 +1923,6 @@ static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 		return -EINVAL;
 
 	attr->flow_tag = MLX5_FS_DEFAULT_FLOW_TAG;
-	attr->action = 0;
 
 	tcf_exts_to_list(exts, &actions);
 	list_for_each_entry(a, &actions, list) {
@@ -2464,7 +2463,6 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 	if (!tcf_exts_has_actions(exts))
 		return -EINVAL;
 
-	memset(attr, 0, sizeof(*attr));
 	attr->in_rep = rpriv->rep;
 
 	tcf_exts_to_list(exts, &actions);
-- 
cgit v1.2.3-59-g8ed1b


From 31c8eba5e860f5f35918b2f72c45e689805d1c32 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Thu, 5 Apr 2018 14:55:39 +0300
Subject: net/mlx5e: Return success when TC offloaded fdb actions parsed ok

Reaching here, means we didn't err anywhere, so lets just
return success.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Jianbo Liu <jianbol@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 7d88e813bad1..c15bd7b6a840 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -2458,7 +2458,6 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 	const struct tc_action *a;
 	LIST_HEAD(actions);
 	bool encap = false;
-	int err = 0;
 
 	if (!tcf_exts_has_actions(exts))
 		return -EINVAL;
@@ -2474,6 +2473,8 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 		}
 
 		if (is_tcf_pedit(a)) {
+			int err;
+
 			err = parse_tc_pedit_action(priv, a, MLX5_FLOW_NAMESPACE_FDB,
 						    parse_attr);
 			if (err)
@@ -2561,7 +2562,7 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 	if (!actions_match_supported(priv, exts, parse_attr, flow))
 		return -EOPNOTSUPP;
 
-	return err;
+	return 0;
 }
 
 int mlx5e_configure_flower(struct mlx5e_priv *priv,
-- 
cgit v1.2.3-59-g8ed1b


From 1cab1cd74bee68aea786e075e979921f686f88a2 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Thu, 5 Apr 2018 14:58:25 +0300
Subject: net/mlx5e: Use local actions var while processing offloaded TC flow
 actions

Use local actions variable while parsing the actions of offloaded TC flow.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 44 ++++++++++++++-----------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index c15bd7b6a840..24b6ca205c74 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -1917,6 +1917,7 @@ static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 	struct mlx5_nic_flow_attr *attr = flow->nic_attr;
 	const struct tc_action *a;
 	LIST_HEAD(actions);
+	u32 action = 0;
 	int err;
 
 	if (!tcf_exts_has_actions(exts))
@@ -1927,10 +1928,10 @@ static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 	tcf_exts_to_list(exts, &actions);
 	list_for_each_entry(a, &actions, list) {
 		if (is_tcf_gact_shot(a)) {
-			attr->action |= MLX5_FLOW_CONTEXT_ACTION_DROP;
+			action |= MLX5_FLOW_CONTEXT_ACTION_DROP;
 			if (MLX5_CAP_FLOWTABLE(priv->mdev,
 					       flow_table_properties_nic_receive.flow_counter))
-				attr->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
+				action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
 			continue;
 		}
 
@@ -1940,13 +1941,13 @@ static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 			if (err)
 				return err;
 
-			attr->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR |
-					MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+			action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR |
+				  MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 			continue;
 		}
 
 		if (is_tcf_csum(a)) {
-			if (csum_offload_supported(priv, attr->action,
+			if (csum_offload_supported(priv, action,
 						   tcf_csum_update_flags(a)))
 				continue;
 
@@ -1960,8 +1961,8 @@ static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 			    same_hw_devs(priv, netdev_priv(peer_dev))) {
 				parse_attr->mirred_ifindex = peer_dev->ifindex;
 				flow->flags |= MLX5E_TC_FLOW_HAIRPIN;
-				attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST |
-						MLX5_FLOW_CONTEXT_ACTION_COUNT;
+				action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST |
+					  MLX5_FLOW_CONTEXT_ACTION_COUNT;
 			} else {
 				netdev_warn(priv->netdev, "device %s not on same HW, can't offload\n",
 					    peer_dev->name);
@@ -1980,13 +1981,14 @@ static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 			}
 
 			attr->flow_tag = mark;
-			attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+			action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 			continue;
 		}
 
 		return -EINVAL;
 	}
 
+	attr->action = action;
 	if (!actions_match_supported(priv, exts, parse_attr, flow))
 		return -EOPNOTSUPP;
 
@@ -2458,6 +2460,7 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 	const struct tc_action *a;
 	LIST_HEAD(actions);
 	bool encap = false;
+	u32 action = 0;
 
 	if (!tcf_exts_has_actions(exts))
 		return -EINVAL;
@@ -2467,8 +2470,8 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 	tcf_exts_to_list(exts, &actions);
 	list_for_each_entry(a, &actions, list) {
 		if (is_tcf_gact_shot(a)) {
-			attr->action |= MLX5_FLOW_CONTEXT_ACTION_DROP |
-					MLX5_FLOW_CONTEXT_ACTION_COUNT;
+			action |= MLX5_FLOW_CONTEXT_ACTION_DROP |
+				  MLX5_FLOW_CONTEXT_ACTION_COUNT;
 			continue;
 		}
 
@@ -2480,12 +2483,12 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 			if (err)
 				return err;
 
-			attr->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
+			action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
 			continue;
 		}
 
 		if (is_tcf_csum(a)) {
-			if (csum_offload_supported(priv, attr->action,
+			if (csum_offload_supported(priv, action,
 						   tcf_csum_update_flags(a)))
 				continue;
 
@@ -2500,8 +2503,8 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 
 			if (switchdev_port_same_parent_id(priv->netdev,
 							  out_dev)) {
-				attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST |
-					MLX5_FLOW_CONTEXT_ACTION_COUNT;
+				action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST |
+					  MLX5_FLOW_CONTEXT_ACTION_COUNT;
 				out_priv = netdev_priv(out_dev);
 				rpriv = out_priv->ppriv;
 				attr->out_rep = rpriv->rep;
@@ -2509,9 +2512,9 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 				parse_attr->mirred_ifindex = out_dev->ifindex;
 				parse_attr->tun_info = *info;
 				attr->parse_attr = parse_attr;
-				attr->action |= MLX5_FLOW_CONTEXT_ACTION_ENCAP |
-					MLX5_FLOW_CONTEXT_ACTION_FWD_DEST |
-					MLX5_FLOW_CONTEXT_ACTION_COUNT;
+				action |= MLX5_FLOW_CONTEXT_ACTION_ENCAP |
+					  MLX5_FLOW_CONTEXT_ACTION_FWD_DEST |
+					  MLX5_FLOW_CONTEXT_ACTION_COUNT;
 				/* attr->out_rep is resolved when we handle encap */
 			} else {
 				pr_err("devices %s %s not on same switch HW, can't offload forwarding\n",
@@ -2532,9 +2535,9 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 
 		if (is_tcf_vlan(a)) {
 			if (tcf_vlan_action(a) == TCA_VLAN_ACT_POP) {
-				attr->action |= MLX5_FLOW_CONTEXT_ACTION_VLAN_POP;
+				action |= MLX5_FLOW_CONTEXT_ACTION_VLAN_POP;
 			} else if (tcf_vlan_action(a) == TCA_VLAN_ACT_PUSH) {
-				attr->action |= MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH;
+				action |= MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH;
 				attr->vlan_vid = tcf_vlan_push_vid(a);
 				if (mlx5_eswitch_vlan_actions_supported(priv->mdev)) {
 					attr->vlan_prio = tcf_vlan_push_prio(a);
@@ -2552,13 +2555,14 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 		}
 
 		if (is_tcf_tunnel_release(a)) {
-			attr->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP;
+			action |= MLX5_FLOW_CONTEXT_ACTION_DECAP;
 			continue;
 		}
 
 		return -EINVAL;
 	}
 
+	attr->action = action;
 	if (!actions_match_supported(priv, exts, parse_attr, flow))
 		return -EOPNOTSUPP;
 
-- 
cgit v1.2.3-59-g8ed1b


From 547829004c98941f73d010c87c2111e29a6c03ae Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Thu, 5 Apr 2018 15:33:52 +0300
Subject: net/mlx5e: Properly order min inline mode setup while parsing TC
 matches

Set the initial value to none instead of L2, and set to L2
where the previous initial value was assumed. Make sure to
parse L2 matches before L3 matches and L3 before L4.

This is a pre-step to get the match level for more purposes
other than the validating the needed vs. actual inline level.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 129 ++++++++++++++----------
 1 file changed, 76 insertions(+), 53 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 24b6ca205c74..78f5c47affb4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -1201,7 +1201,7 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
 	u16 addr_type = 0;
 	u8 ip_proto = 0;
 
-	*min_inline = MLX5_INLINE_MODE_L2;
+	*min_inline = MLX5_INLINE_MODE_NONE;
 
 	if (f->dissector->used_keys &
 	    ~(BIT(FLOW_DISSECTOR_KEY_CONTROL) |
@@ -1251,58 +1251,6 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
 					 inner_headers);
 	}
 
-	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_CONTROL)) {
-		struct flow_dissector_key_control *key =
-			skb_flow_dissector_target(f->dissector,
-						  FLOW_DISSECTOR_KEY_CONTROL,
-						  f->key);
-
-		struct flow_dissector_key_control *mask =
-			skb_flow_dissector_target(f->dissector,
-						  FLOW_DISSECTOR_KEY_CONTROL,
-						  f->mask);
-		addr_type = key->addr_type;
-
-		/* the HW doesn't support frag first/later */
-		if (mask->flags & FLOW_DIS_FIRST_FRAG)
-			return -EOPNOTSUPP;
-
-		if (mask->flags & FLOW_DIS_IS_FRAGMENT) {
-			MLX5_SET(fte_match_set_lyr_2_4, headers_c, frag, 1);
-			MLX5_SET(fte_match_set_lyr_2_4, headers_v, frag,
-				 key->flags & FLOW_DIS_IS_FRAGMENT);
-
-			/* the HW doesn't need L3 inline to match on frag=no */
-			if (key->flags & FLOW_DIS_IS_FRAGMENT)
-				*min_inline = MLX5_INLINE_MODE_IP;
-		}
-	}
-
-	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_BASIC)) {
-		struct flow_dissector_key_basic *key =
-			skb_flow_dissector_target(f->dissector,
-						  FLOW_DISSECTOR_KEY_BASIC,
-						  f->key);
-		struct flow_dissector_key_basic *mask =
-			skb_flow_dissector_target(f->dissector,
-						  FLOW_DISSECTOR_KEY_BASIC,
-						  f->mask);
-		ip_proto = key->ip_proto;
-
-		MLX5_SET(fte_match_set_lyr_2_4, headers_c, ethertype,
-			 ntohs(mask->n_proto));
-		MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype,
-			 ntohs(key->n_proto));
-
-		MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
-			 mask->ip_proto);
-		MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
-			 key->ip_proto);
-
-		if (mask->ip_proto)
-			*min_inline = MLX5_INLINE_MODE_IP;
-	}
-
 	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
 		struct flow_dissector_key_eth_addrs *key =
 			skb_flow_dissector_target(f->dissector,
@@ -1326,6 +1274,9 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
 		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
 					     smac_47_16),
 				key->src);
+
+		if (!is_zero_ether_addr(mask->src) || !is_zero_ether_addr(mask->dst))
+			*min_inline = MLX5_INLINE_MODE_L2;
 	}
 
 	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_VLAN)) {
@@ -1346,9 +1297,79 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
 
 			MLX5_SET(fte_match_set_lyr_2_4, headers_c, first_prio, mask->vlan_priority);
 			MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_prio, key->vlan_priority);
+
+			*min_inline = MLX5_INLINE_MODE_L2;
 		}
 	}
 
+	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_BASIC)) {
+		struct flow_dissector_key_basic *key =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_BASIC,
+						  f->key);
+		struct flow_dissector_key_basic *mask =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_BASIC,
+						  f->mask);
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c, ethertype,
+			 ntohs(mask->n_proto));
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype,
+			 ntohs(key->n_proto));
+
+		if (mask->n_proto)
+			*min_inline = MLX5_INLINE_MODE_L2;
+	}
+
+	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_CONTROL)) {
+		struct flow_dissector_key_control *key =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_CONTROL,
+						  f->key);
+
+		struct flow_dissector_key_control *mask =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_CONTROL,
+						  f->mask);
+		addr_type = key->addr_type;
+
+		/* the HW doesn't support frag first/later */
+		if (mask->flags & FLOW_DIS_FIRST_FRAG)
+			return -EOPNOTSUPP;
+
+		if (mask->flags & FLOW_DIS_IS_FRAGMENT) {
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c, frag, 1);
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v, frag,
+				 key->flags & FLOW_DIS_IS_FRAGMENT);
+
+			/* the HW doesn't need L3 inline to match on frag=no */
+			if (!(key->flags & FLOW_DIS_IS_FRAGMENT))
+				*min_inline = MLX5_INLINE_MODE_L2;
+	/* ***  L2 attributes parsing up to here *** */
+			else
+				*min_inline = MLX5_INLINE_MODE_IP;
+		}
+	}
+
+	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_BASIC)) {
+		struct flow_dissector_key_basic *key =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_BASIC,
+						  f->key);
+		struct flow_dissector_key_basic *mask =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_BASIC,
+						  f->mask);
+		ip_proto = key->ip_proto;
+
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
+			 mask->ip_proto);
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
+			 key->ip_proto);
+
+		if (mask->ip_proto)
+			*min_inline = MLX5_INLINE_MODE_IP;
+	}
+
 	if (addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
 		struct flow_dissector_key_ipv4_addrs *key =
 			skb_flow_dissector_target(f->dissector,
@@ -1433,6 +1454,8 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
 			*min_inline = MLX5_INLINE_MODE_IP;
 	}
 
+	/* ***  L3 attributes parsing up to here *** */
+
 	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_PORTS)) {
 		struct flow_dissector_key_ports *key =
 			skb_flow_dissector_target(f->dissector,
-- 
cgit v1.2.3-59-g8ed1b


From d708f902989b844907c5f7720abe67812a256c33 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Thu, 5 Apr 2018 15:42:06 +0300
Subject: net/mlx5e: Get the required HW match level while parsing TC flow
 matches

Introduce levels of matching on headers of offloaded flows
(none, L2, L3, L4) that follow the inline mode levels.

This is pre-step for us to offload flows without any
matches on headers.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c   | 34 +++++++++++------------
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h |  7 +++++
 2 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 78f5c47affb4..e84bcea8b071 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -1192,7 +1192,7 @@ vxlan_match_offload_err:
 static int __parse_cls_flower(struct mlx5e_priv *priv,
 			      struct mlx5_flow_spec *spec,
 			      struct tc_cls_flower_offload *f,
-			      u8 *min_inline)
+			      u8 *match_level)
 {
 	void *headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
 				       outer_headers);
@@ -1201,7 +1201,7 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
 	u16 addr_type = 0;
 	u8 ip_proto = 0;
 
-	*min_inline = MLX5_INLINE_MODE_NONE;
+	*match_level = MLX5_MATCH_NONE;
 
 	if (f->dissector->used_keys &
 	    ~(BIT(FLOW_DISSECTOR_KEY_CONTROL) |
@@ -1276,7 +1276,7 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
 				key->src);
 
 		if (!is_zero_ether_addr(mask->src) || !is_zero_ether_addr(mask->dst))
-			*min_inline = MLX5_INLINE_MODE_L2;
+			*match_level = MLX5_MATCH_L2;
 	}
 
 	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_VLAN)) {
@@ -1298,7 +1298,7 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
 			MLX5_SET(fte_match_set_lyr_2_4, headers_c, first_prio, mask->vlan_priority);
 			MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_prio, key->vlan_priority);
 
-			*min_inline = MLX5_INLINE_MODE_L2;
+			*match_level = MLX5_MATCH_L2;
 		}
 	}
 
@@ -1317,7 +1317,7 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
 			 ntohs(key->n_proto));
 
 		if (mask->n_proto)
-			*min_inline = MLX5_INLINE_MODE_L2;
+			*match_level = MLX5_MATCH_L2;
 	}
 
 	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_CONTROL)) {
@@ -1343,10 +1343,10 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
 
 			/* the HW doesn't need L3 inline to match on frag=no */
 			if (!(key->flags & FLOW_DIS_IS_FRAGMENT))
-				*min_inline = MLX5_INLINE_MODE_L2;
+				*match_level = MLX5_INLINE_MODE_L2;
 	/* ***  L2 attributes parsing up to here *** */
 			else
-				*min_inline = MLX5_INLINE_MODE_IP;
+				*match_level = MLX5_INLINE_MODE_IP;
 		}
 	}
 
@@ -1367,7 +1367,7 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
 			 key->ip_proto);
 
 		if (mask->ip_proto)
-			*min_inline = MLX5_INLINE_MODE_IP;
+			*match_level = MLX5_MATCH_L3;
 	}
 
 	if (addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
@@ -1394,7 +1394,7 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
 		       &key->dst, sizeof(key->dst));
 
 		if (mask->src || mask->dst)
-			*min_inline = MLX5_INLINE_MODE_IP;
+			*match_level = MLX5_MATCH_L3;
 	}
 
 	if (addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
@@ -1423,7 +1423,7 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
 
 		if (ipv6_addr_type(&mask->src) != IPV6_ADDR_ANY ||
 		    ipv6_addr_type(&mask->dst) != IPV6_ADDR_ANY)
-			*min_inline = MLX5_INLINE_MODE_IP;
+			*match_level = MLX5_MATCH_L3;
 	}
 
 	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_IP)) {
@@ -1451,7 +1451,7 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
 			return -EOPNOTSUPP;
 
 		if (mask->tos || mask->ttl)
-			*min_inline = MLX5_INLINE_MODE_IP;
+			*match_level = MLX5_MATCH_L3;
 	}
 
 	/* ***  L3 attributes parsing up to here *** */
@@ -1496,7 +1496,7 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
 		}
 
 		if (mask->src || mask->dst)
-			*min_inline = MLX5_INLINE_MODE_TCP_UDP;
+			*match_level = MLX5_MATCH_L4;
 	}
 
 	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_TCP)) {
@@ -1515,7 +1515,7 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
 			 ntohs(key->flags));
 
 		if (mask->flags)
-			*min_inline = MLX5_INLINE_MODE_TCP_UDP;
+			*match_level = MLX5_MATCH_L4;
 	}
 
 	return 0;
@@ -1530,19 +1530,19 @@ static int parse_cls_flower(struct mlx5e_priv *priv,
 	struct mlx5_eswitch *esw = dev->priv.eswitch;
 	struct mlx5e_rep_priv *rpriv = priv->ppriv;
 	struct mlx5_eswitch_rep *rep;
-	u8 min_inline;
+	u8 match_level;
 	int err;
 
-	err = __parse_cls_flower(priv, spec, f, &min_inline);
+	err = __parse_cls_flower(priv, spec, f, &match_level);
 
 	if (!err && (flow->flags & MLX5E_TC_FLOW_ESWITCH)) {
 		rep = rpriv->rep;
 		if (rep->vport != FDB_UPLINK_VPORT &&
 		    (esw->offloads.inline_mode != MLX5_INLINE_MODE_NONE &&
-		    esw->offloads.inline_mode < min_inline)) {
+		    esw->offloads.inline_mode < match_level)) {
 			netdev_warn(priv->netdev,
 				    "Flow is not offloaded due to min inline setting, required %d actual %d\n",
-				    min_inline, esw->offloads.inline_mode);
+				    match_level, esw->offloads.inline_mode);
 			return -EOPNOTSUPP;
 		}
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 4cd773fa55e3..efae77dd1e35 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -227,6 +227,13 @@ enum {
 	SET_VLAN_INSERT	= BIT(1)
 };
 
+enum mlx5_flow_match_level {
+	MLX5_MATCH_NONE	= MLX5_INLINE_MODE_NONE,
+	MLX5_MATCH_L2	= MLX5_INLINE_MODE_L2,
+	MLX5_MATCH_L3	= MLX5_INLINE_MODE_IP,
+	MLX5_MATCH_L4	= MLX5_INLINE_MODE_TCP_UDP,
+};
+
 struct mlx5_esw_flow_attr {
 	struct mlx5_eswitch_rep *in_rep;
 	struct mlx5_eswitch_rep *out_rep;
-- 
cgit v1.2.3-59-g8ed1b


From 38aa51c134b56b7ea61bea79b428c5fbcd95f285 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Thu, 5 Apr 2018 15:50:36 +0300
Subject: net/mlx5e: Support offloaded TC flows with no matches on headers

For example:
    tc filter add dev ens2f0_0 parent ffff: flower skip_sw action drop

Note that for eswitch flows, we still always match on the source port.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c            | 10 +++++++++-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h          |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c |  8 ++++++--
 3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index e84bcea8b071..1dc24e3a0841 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -58,6 +58,7 @@ struct mlx5_nic_flow_attr {
 	u32 flow_tag;
 	u32 mod_hdr_id;
 	u32 hairpin_tirn;
+	u8 match_level;
 	struct mlx5_flow_table	*hairpin_ft;
 };
 
@@ -753,7 +754,9 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
 		table_created = true;
 	}
 
-	parse_attr->spec.match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+	if (attr->match_level != MLX5_MATCH_NONE)
+		parse_attr->spec.match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+
 	rule = mlx5_add_flow_rules(priv->fs.tc.t, &parse_attr->spec,
 				   &flow_act, dest, dest_ix);
 
@@ -1547,6 +1550,11 @@ static int parse_cls_flower(struct mlx5e_priv *priv,
 		}
 	}
 
+	if (flow->flags & MLX5E_TC_FLOW_ESWITCH)
+		flow->esw_attr->match_level = match_level;
+	else
+		flow->nic_attr->match_level = match_level;
+
 	return err;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index efae77dd1e35..edf47a4d549e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -245,6 +245,7 @@ struct mlx5_esw_flow_attr {
 	bool	vlan_handled;
 	u32	encap_id;
 	u32	mod_hdr_id;
+	u8	match_level;
 	struct mlx5e_tc_flow_parse_attr *parse_attr;
 };
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 35e256eb2f6e..8dd0eca03202 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -91,8 +91,12 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 	misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters);
 	MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port);
 
-	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS |
-				      MLX5_MATCH_MISC_PARAMETERS;
+	if (attr->match_level == MLX5_MATCH_NONE)
+		spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS;
+	else
+		spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS |
+					      MLX5_MATCH_MISC_PARAMETERS;
+
 	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DECAP)
 		spec->match_criteria_enable |= MLX5_MATCH_INNER_HEADERS;
 
-- 
cgit v1.2.3-59-g8ed1b


From 1e7477ae8bbeadfa90064d6bedc421b12bc2d43d Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Wed, 28 Mar 2018 13:26:50 +0300
Subject: net/mlx5e: Report all channels with min RX WQEs timeout

Report all channels which got timeout on posting the minimal number of
RX WQEs and not only the first one. Avoid busy wait on every channel,
when one of the RQs check got timeout, poll once for the remaining RQs.

In addition, add channel index to log when failed to get min RX WQEs
This info is needed in order to debug in case of dysfunctional channel.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 0f2b66b41899..b0d6dde2754f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -747,23 +747,24 @@ static void mlx5e_destroy_rq(struct mlx5e_rq *rq)
 	mlx5_core_destroy_rq(rq->mdev, rq->rqn);
 }
 
-static int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq)
+static int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq, int wait_time)
 {
-	unsigned long exp_time = jiffies + msecs_to_jiffies(20000);
+	unsigned long exp_time = jiffies + msecs_to_jiffies(wait_time);
 	struct mlx5e_channel *c = rq->channel;
 
 	struct mlx5_wq_ll *wq = &rq->wq;
 	u16 min_wqes = mlx5_min_rx_wqes(rq->wq_type, mlx5_wq_ll_get_size(wq));
 
-	while (time_before(jiffies, exp_time)) {
+	do {
 		if (wq->cur_sz >= min_wqes)
 			return 0;
 
 		msleep(20);
-	}
+	} while (time_before(jiffies, exp_time));
+
+	netdev_warn(c->netdev, "Failed to get min RX wqes on Channel[%d] RQN[0x%x] wq cur_sz(%d) min_rx_wqes(%d)\n",
+		    c->ix, rq->rqn, wq->cur_sz, min_wqes);
 
-	netdev_warn(c->netdev, "Failed to get min RX wqes on RQN[0x%x] wq cur_sz(%d) min_rx_wqes(%d)\n",
-		    rq->rqn, wq->cur_sz, min_wqes);
 	return -ETIMEDOUT;
 }
 
@@ -2128,13 +2129,11 @@ static int mlx5e_wait_channels_min_rx_wqes(struct mlx5e_channels *chs)
 	int err = 0;
 	int i;
 
-	for (i = 0; i < chs->num; i++) {
-		err = mlx5e_wait_for_min_rx_wqes(&chs->c[i]->rq);
-		if (err)
-			break;
-	}
+	for (i = 0; i < chs->num; i++)
+		err |= mlx5e_wait_for_min_rx_wqes(&chs->c[i]->rq,
+						  err ? 0 : 20000);
 
-	return err;
+	return err ? -ETIMEDOUT : 0;
 }
 
 static void mlx5e_deactivate_channels(struct mlx5e_channels *chs)
-- 
cgit v1.2.3-59-g8ed1b


From af5a6c93656eec9a44617c77fae62af54d5a5061 Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Tue, 23 Jan 2018 12:27:11 +0200
Subject: net/mlx5e: Use __set_bit for adaptive-moderation bit in RQ state

Make the code more clear by replacing the existing code with __set_bit.

Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index b0d6dde2754f..417bf2e8ab85 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -820,7 +820,7 @@ static int mlx5e_open_rq(struct mlx5e_channel *c,
 		goto err_destroy_rq;
 
 	if (params->rx_dim_enabled)
-		c->rq.state |= BIT(MLX5E_RQ_STATE_AM);
+		__set_bit(MLX5E_RQ_STATE_AM, &c->rq.state);
 
 	return 0;
 
-- 
cgit v1.2.3-59-g8ed1b


From bfbe2057531026ac20bba9f45d3a21a98f3c592a Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Tue, 23 Jan 2018 12:28:12 +0200
Subject: net/mlx5e: Use test bit in en accel xmit flow

Replace (mask & bit) check with test_bit.

Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h
index 68fcb40a2847..f20074dbef32 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h
@@ -49,7 +49,7 @@ static inline struct sk_buff *mlx5e_accel_handle_tx(struct sk_buff *skb,
 						    u16 *pi)
 {
 #ifdef CONFIG_MLX5_EN_TLS
-	if (sq->state & BIT(MLX5E_SQ_STATE_TLS)) {
+	if (test_bit(MLX5E_SQ_STATE_TLS, &sq->state)) {
 		skb = mlx5e_tls_handle_tx_skb(dev, sq, skb, wqe, pi);
 		if (unlikely(!skb))
 			return NULL;
@@ -57,7 +57,7 @@ static inline struct sk_buff *mlx5e_accel_handle_tx(struct sk_buff *skb,
 #endif
 
 #ifdef CONFIG_MLX5_EN_IPSEC
-	if (sq->state & BIT(MLX5E_SQ_STATE_IPSEC)) {
+	if (test_bit(MLX5E_SQ_STATE_IPSEC, &sq->state)) {
 		skb = mlx5e_ipsec_handle_tx_skb(dev, *wqe, skb);
 		if (unlikely(!skb))
 			return NULL;
-- 
cgit v1.2.3-59-g8ed1b


From 0e5c04f6b52b248cfdb7e791fd5702f12270df7b Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Tue, 23 Jan 2018 12:23:26 +0200
Subject: net/mlx5e: Remove MLX5E_TEST_BIT macro

MLX5E_TEST_BIT macro is the same as the already existent test_bit,
remove it and replace all usages.

Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |  2 --
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   | 10 +++++-----
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c   |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c |  4 ++--
 4 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 9cc07da09b70..7c930088e96e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -288,8 +288,6 @@ enum {
 	MLX5E_RQ_STATE_AM,
 };
 
-#define MLX5E_TEST_BIT(state, nr) (state & BIT(nr))
-
 struct mlx5e_cq {
 	/* data path - accessed per cqe */
 	struct mlx5_cqwq           wq;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index abc6ca8168d2..53f72923b164 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -450,7 +450,7 @@ bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 	struct mlx5_wq_ll *wq = &rq->wq;
 	int err;
 
-	if (unlikely(!MLX5E_TEST_BIT(rq->state, MLX5E_RQ_STATE_ENABLED)))
+	if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state)))
 		return false;
 
 	if (mlx5_wq_ll_is_full(wq))
@@ -508,7 +508,7 @@ static void mlx5e_poll_ico_cq(struct mlx5e_cq *cq, struct mlx5e_rq *rq)
 	struct mlx5e_icosq *sq = container_of(cq, struct mlx5e_icosq, cq);
 	struct mlx5_cqe64 *cqe;
 
-	if (unlikely(!MLX5E_TEST_BIT(sq->state, MLX5E_SQ_STATE_ENABLED)))
+	if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state)))
 		return;
 
 	cqe = mlx5_cqwq_get_cqe(&cq->wq);
@@ -525,7 +525,7 @@ bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq)
 {
 	struct mlx5_wq_ll *wq = &rq->wq;
 
-	if (unlikely(!MLX5E_TEST_BIT(rq->state, MLX5E_RQ_STATE_ENABLED)))
+	if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state)))
 		return false;
 
 	mlx5e_poll_ico_cq(&rq->channel->icosq.cq, rq);
@@ -1132,7 +1132,7 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
 	struct mlx5_cqe64 *cqe;
 	int work_done = 0;
 
-	if (unlikely(!MLX5E_TEST_BIT(rq->state, MLX5E_RQ_STATE_ENABLED)))
+	if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state)))
 		return 0;
 
 	if (cq->decmprs_left)
@@ -1185,7 +1185,7 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq)
 
 	sq = container_of(cq, struct mlx5e_xdpsq, cq);
 
-	if (unlikely(!MLX5E_TEST_BIT(sq->state, MLX5E_SQ_STATE_ENABLED)))
+	if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state)))
 		return false;
 
 	cqe = mlx5_cqwq_get_cqe(&cq->wq);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 047614d2eda2..2d3f17da5f5c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -450,7 +450,7 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget)
 
 	sq = container_of(cq, struct mlx5e_txqsq, cq);
 
-	if (unlikely(!MLX5E_TEST_BIT(sq->state, MLX5E_SQ_STATE_ENABLED)))
+	if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state)))
 		return false;
 
 	cqe = mlx5_cqwq_get_cqe(&cq->wq);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index 900661d45c8a..5d6f9ce2bf80 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -48,7 +48,7 @@ static void mlx5e_handle_tx_dim(struct mlx5e_txqsq *sq)
 {
 	struct net_dim_sample dim_sample;
 
-	if (unlikely(!MLX5E_TEST_BIT(sq->state, MLX5E_SQ_STATE_AM)))
+	if (unlikely(!test_bit(MLX5E_SQ_STATE_AM, &sq->state)))
 		return;
 
 	net_dim_sample(sq->cq.event_ctr, sq->stats.packets, sq->stats.bytes,
@@ -60,7 +60,7 @@ static void mlx5e_handle_rx_dim(struct mlx5e_rq *rq)
 {
 	struct net_dim_sample dim_sample;
 
-	if (unlikely(!MLX5E_TEST_BIT(rq->state, MLX5E_RQ_STATE_AM)))
+	if (unlikely(!test_bit(MLX5E_RQ_STATE_AM, &rq->state)))
 		return;
 
 	net_dim_sample(rq->cq.event_ctr, rq->stats.packets, rq->stats.bytes,
-- 
cgit v1.2.3-59-g8ed1b


From 0631b6583f66cef6f8af3387e7d320cb1f652b75 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 14 May 2018 23:22:27 +0200
Subject: bpf, mips: remove unused function

The ool_skb_header_pointer() and size_to_len() is unused same as
tmp_offset, therefore remove all of them.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/mips/net/ebpf_jit.c | 26 --------------------------
 1 file changed, 26 deletions(-)

diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c
index 7ba7df9c28fc..aeb7b1b0f202 100644
--- a/arch/mips/net/ebpf_jit.c
+++ b/arch/mips/net/ebpf_jit.c
@@ -95,7 +95,6 @@ enum reg_val_type {
  * struct jit_ctx - JIT context
  * @skf:		The sk_filter
  * @stack_size:		eBPF stack size
- * @tmp_offset:		eBPF $sp offset to 8-byte temporary memory
  * @idx:		Instruction index
  * @flags:		JIT flags
  * @offsets:		Instruction offsets
@@ -105,7 +104,6 @@ enum reg_val_type {
 struct jit_ctx {
 	const struct bpf_prog *skf;
 	int stack_size;
-	int tmp_offset;
 	u32 idx;
 	u32 flags;
 	u32 *offsets;
@@ -293,7 +291,6 @@ static int gen_int_prologue(struct jit_ctx *ctx)
 	locals_size = (ctx->flags & EBPF_SEEN_FP) ? MAX_BPF_STACK : 0;
 
 	stack_adjust += locals_size;
-	ctx->tmp_offset = locals_size;
 
 	ctx->stack_size = stack_adjust;
 
@@ -399,7 +396,6 @@ static void gen_imm_to_reg(const struct bpf_insn *insn, int reg,
 		emit_instr(ctx, lui, reg, upper >> 16);
 		emit_instr(ctx, addiu, reg, reg, lower);
 	}
-
 }
 
 static int gen_imm_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
@@ -547,28 +543,6 @@ static int gen_imm_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
 	return 0;
 }
 
-static void * __must_check
-ool_skb_header_pointer(const struct sk_buff *skb, int offset,
-		       int len, void *buffer)
-{
-	return skb_header_pointer(skb, offset, len, buffer);
-}
-
-static int size_to_len(const struct bpf_insn *insn)
-{
-	switch (BPF_SIZE(insn->code)) {
-	case BPF_B:
-		return 1;
-	case BPF_H:
-		return 2;
-	case BPF_W:
-		return 4;
-	case BPF_DW:
-		return 8;
-	}
-	return 0;
-}
-
 static void emit_const_to_reg(struct jit_ctx *ctx, int dst, u64 value)
 {
 	if (value >= 0xffffffffffff8000ull || value < 0x8000ull) {
-- 
cgit v1.2.3-59-g8ed1b


From 631b1e3b838ec25f509852247d1d5d10b8021444 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 14 May 2018 23:22:28 +0200
Subject: bpf, sparc: remove unused variable

Since fe83963b7c38 ("bpf, sparc64: remove ld_abs/ld_ind") it's not
used anymore therefore remove it.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/sparc/net/bpf_jit_comp_64.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c
index 9f5918e0693a..222785af550b 100644
--- a/arch/sparc/net/bpf_jit_comp_64.c
+++ b/arch/sparc/net/bpf_jit_comp_64.c
@@ -894,7 +894,6 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
 	const int i = insn - ctx->prog->insnsi;
 	const s16 off = insn->off;
 	const s32 imm = insn->imm;
-	u32 *func;
 
 	if (insn->src_reg == BPF_REG_FP)
 		ctx->saw_frame_pointer = true;
-- 
cgit v1.2.3-59-g8ed1b


From 36256009b2d532621a5ee4b304da0e4b54e48acb Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 14 May 2018 23:22:29 +0200
Subject: bpf, x64: clean up retpoline emission slightly

Make the RETPOLINE_{RA,ED}X_BPF_JIT() a bit more readable by
cleaning up the macro, aligning comments and spacing.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/include/asm/nospec-branch.h | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 2cd344d1a6e5..2f700a1db851 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -301,9 +301,9 @@ do {									\
  *    jmp *%edx for x86_32
  */
 #ifdef CONFIG_RETPOLINE
-#ifdef CONFIG_X86_64
-# define RETPOLINE_RAX_BPF_JIT_SIZE	17
-# define RETPOLINE_RAX_BPF_JIT()				\
+# ifdef CONFIG_X86_64
+#  define RETPOLINE_RAX_BPF_JIT_SIZE	17
+#  define RETPOLINE_RAX_BPF_JIT()				\
 do {								\
 	EMIT1_off32(0xE8, 7);	 /* callq do_rop */		\
 	/* spec_trap: */					\
@@ -314,8 +314,8 @@ do {								\
 	EMIT4(0x48, 0x89, 0x04, 0x24); /* mov %rax,(%rsp) */	\
 	EMIT1(0xC3);             /* retq */			\
 } while (0)
-#else
-# define RETPOLINE_EDX_BPF_JIT()				\
+# else /* !CONFIG_X86_64 */
+#  define RETPOLINE_EDX_BPF_JIT()				\
 do {								\
 	EMIT1_off32(0xE8, 7);	 /* call do_rop */		\
 	/* spec_trap: */					\
@@ -326,17 +326,16 @@ do {								\
 	EMIT3(0x89, 0x14, 0x24); /* mov %edx,(%esp) */		\
 	EMIT1(0xC3);             /* ret */			\
 } while (0)
-#endif
+# endif
 #else /* !CONFIG_RETPOLINE */
-
-#ifdef CONFIG_X86_64
-# define RETPOLINE_RAX_BPF_JIT_SIZE	2
-# define RETPOLINE_RAX_BPF_JIT()				\
-	EMIT2(0xFF, 0xE0);	 /* jmp *%rax */
-#else
-# define RETPOLINE_EDX_BPF_JIT()				\
-	EMIT2(0xFF, 0xE2) /* jmp *%edx */
-#endif
+# ifdef CONFIG_X86_64
+#  define RETPOLINE_RAX_BPF_JIT_SIZE	2
+#  define RETPOLINE_RAX_BPF_JIT()				\
+	EMIT2(0xFF, 0xE0);       /* jmp *%rax */
+# else /* !CONFIG_X86_64 */
+#  define RETPOLINE_EDX_BPF_JIT()				\
+	EMIT2(0xFF, 0xE2)        /* jmp *%edx */
+# endif
 #endif
 
 #endif /* _ASM_X86_NOSPEC_BRANCH_H_ */
-- 
cgit v1.2.3-59-g8ed1b


From 38ca930601631585fbd7826f8f6908055a8dbc9e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 14 May 2018 23:22:30 +0200
Subject: bpf, arm32: save 4 bytes of unneeded stack space

The extra skb_copy_bits() buffer is not used anymore, therefore
remove the extra 4 byte stack space requirement.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/arm/net/bpf_jit_32.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index 82689b999257..d3ea6454e775 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -234,18 +234,11 @@ static void jit_fill_hole(void *area, unsigned int size)
 #define SCRATCH_SIZE 80
 
 /* total stack size used in JITed code */
-#define _STACK_SIZE \
-	(ctx->prog->aux->stack_depth + \
-	 + SCRATCH_SIZE + \
-	 + 4 /* extra for skb_copy_bits buffer */)
-
-#define STACK_SIZE ALIGN(_STACK_SIZE, STACK_ALIGNMENT)
+#define _STACK_SIZE	(ctx->prog->aux->stack_depth + SCRATCH_SIZE)
+#define STACK_SIZE	ALIGN(_STACK_SIZE, STACK_ALIGNMENT)
 
 /* Get the offset of eBPF REGISTERs stored on scratch space. */
-#define STACK_VAR(off) (STACK_SIZE-off-4)
-
-/* Offset of skb_copy_bits buffer */
-#define SKB_BUFFER STACK_VAR(SCRATCH_SIZE)
+#define STACK_VAR(off) (STACK_SIZE - off)
 
 #if __LINUX_ARM_ARCH__ < 7
 
-- 
cgit v1.2.3-59-g8ed1b


From 09ece3d0f2ee524b8d1d3c775d9eeb50c40558d8 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 14 May 2018 23:22:31 +0200
Subject: bpf, arm64: save 4 bytes of unneeded stack space

Follow-up to 816d9ef32a8b ("bpf, arm64: remove ld_abs/ld_ind") in
that the extra 4 byte JIT scratchpad is not needed anymore since it
was in ld_abs/ld_ind as stack buffer for bpf_load_pointer().

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/arm64/net/bpf_jit_comp.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 0b40c8fb0706..85113cab2046 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -21,7 +21,6 @@
 #include <linux/bpf.h>
 #include <linux/filter.h>
 #include <linux/printk.h>
-#include <linux/skbuff.h>
 #include <linux/slab.h>
 
 #include <asm/byteorder.h>
@@ -188,7 +187,7 @@ static int build_prologue(struct jit_ctx *ctx)
 	 *                        | ... | BPF prog stack
 	 *                        |     |
 	 *                        +-----+ <= (BPF_FP - prog->aux->stack_depth)
-	 *                        |RSVD | JIT scratchpad
+	 *                        |RSVD | padding
 	 * current A64_SP =>      +-----+ <= (BPF_FP - ctx->stack_size)
 	 *                        |     |
 	 *                        | ... | Function call stack
@@ -220,9 +219,7 @@ static int build_prologue(struct jit_ctx *ctx)
 		return -1;
 	}
 
-	/* 4 byte extra for skb_copy_bits buffer */
-	ctx->stack_size = prog->aux->stack_depth + 4;
-	ctx->stack_size = STACK_ALIGN(ctx->stack_size);
+	ctx->stack_size = STACK_ALIGN(prog->aux->stack_depth);
 
 	/* Set up function call stack */
 	emit(A64_SUB_I(1, A64_SP, A64_SP, ctx->stack_size), ctx);
-- 
cgit v1.2.3-59-g8ed1b


From 6d2eea6fb0791bf191043a68937c9aa59c5ea678 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 14 May 2018 23:22:32 +0200
Subject: bpf, arm64: optimize 32/64 immediate emission

Improve the JIT to emit 64 and 32 bit immediates, the current
algorithm is not optimal and we often emit more instructions
than actually needed. arm64 has movz, movn, movk variants but
for the current 64 bit immediates we only use movz with a
series of movk when needed.

For example loading ffffffffffffabab emits the following 4
instructions in the JIT today:

  * movz: abab, shift:  0, result: 000000000000abab
  * movk: ffff, shift: 16, result: 00000000ffffabab
  * movk: ffff, shift: 32, result: 0000ffffffffabab
  * movk: ffff, shift: 48, result: ffffffffffffabab

Whereas after the patch the same load only needs a single
instruction:

  * movn: 5454, shift:  0, result: ffffffffffffabab

Another example where two extra instructions can be saved:

  * movz: abab, shift:  0, result: 000000000000abab
  * movk: 1f2f, shift: 16, result: 000000001f2fabab
  * movk: ffff, shift: 32, result: 0000ffff1f2fabab
  * movk: ffff, shift: 48, result: ffffffff1f2fabab

After the patch:

  * movn: e0d0, shift: 16, result: ffffffff1f2fffff
  * movk: abab, shift:  0, result: ffffffff1f2fabab

Another example with movz, before:

  * movz: 0000, shift:  0, result: 0000000000000000
  * movk: fea0, shift: 32, result: 0000fea000000000

After:

  * movz: fea0, shift: 32, result: 0000fea000000000

Moreover, reuse emit_a64_mov_i() for 32 bit immediates that
are loaded via emit_a64_mov_i64() which is a similar optimization
as done in 6fe8b9c1f41d ("bpf, x64: save several bytes by using
mov over movabsq when possible"). On arm64, the latter allows to
use a single instruction with movn due to zero extension where
otherwise two would be needed. And last but not least add a
missing optimization in emit_a64_mov_i() where movn is used but
the subsequent movk not needed. With some of the Cilium programs
in use, this shrinks the needed instructions by about three
percent. Tested on Cavium ThunderX CN8890.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/arm64/net/bpf_jit_comp.c | 85 +++++++++++++++++++++++++++----------------
 1 file changed, 54 insertions(+), 31 deletions(-)

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 85113cab2046..c8a2620312c0 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -79,23 +79,66 @@ static inline void emit(const u32 insn, struct jit_ctx *ctx)
 	ctx->idx++;
 }
 
+static inline void emit_a64_mov_i(const int is64, const int reg,
+				  const s32 val, struct jit_ctx *ctx)
+{
+	u16 hi = val >> 16;
+	u16 lo = val & 0xffff;
+
+	if (hi & 0x8000) {
+		if (hi == 0xffff) {
+			emit(A64_MOVN(is64, reg, (u16)~lo, 0), ctx);
+		} else {
+			emit(A64_MOVN(is64, reg, (u16)~hi, 16), ctx);
+			if (lo != 0xffff)
+				emit(A64_MOVK(is64, reg, lo, 0), ctx);
+		}
+	} else {
+		emit(A64_MOVZ(is64, reg, lo, 0), ctx);
+		if (hi)
+			emit(A64_MOVK(is64, reg, hi, 16), ctx);
+	}
+}
+
+static int i64_i16_blocks(const u64 val, bool inverse)
+{
+	return (((val >>  0) & 0xffff) != (inverse ? 0xffff : 0x0000)) +
+	       (((val >> 16) & 0xffff) != (inverse ? 0xffff : 0x0000)) +
+	       (((val >> 32) & 0xffff) != (inverse ? 0xffff : 0x0000)) +
+	       (((val >> 48) & 0xffff) != (inverse ? 0xffff : 0x0000));
+}
+
 static inline void emit_a64_mov_i64(const int reg, const u64 val,
 				    struct jit_ctx *ctx)
 {
-	u64 tmp = val;
-	int shift = 0;
-
-	emit(A64_MOVZ(1, reg, tmp & 0xffff, shift), ctx);
-	tmp >>= 16;
-	shift += 16;
-	while (tmp) {
-		if (tmp & 0xffff)
-			emit(A64_MOVK(1, reg, tmp & 0xffff, shift), ctx);
-		tmp >>= 16;
-		shift += 16;
+	u64 nrm_tmp = val, rev_tmp = ~val;
+	bool inverse;
+	int shift;
+
+	if (!(nrm_tmp >> 32))
+		return emit_a64_mov_i(0, reg, (u32)val, ctx);
+
+	inverse = i64_i16_blocks(nrm_tmp, true) < i64_i16_blocks(nrm_tmp, false);
+	shift = max(round_down((inverse ? (fls64(rev_tmp) - 1) :
+					  (fls64(nrm_tmp) - 1)), 16), 0);
+	if (inverse)
+		emit(A64_MOVN(1, reg, (rev_tmp >> shift) & 0xffff, shift), ctx);
+	else
+		emit(A64_MOVZ(1, reg, (nrm_tmp >> shift) & 0xffff, shift), ctx);
+	shift -= 16;
+	while (shift >= 0) {
+		if (((nrm_tmp >> shift) & 0xffff) != (inverse ? 0xffff : 0x0000))
+			emit(A64_MOVK(1, reg, (nrm_tmp >> shift) & 0xffff, shift), ctx);
+		shift -= 16;
 	}
 }
 
+/*
+ * This is an unoptimized 64 immediate emission used for BPF to BPF call
+ * addresses. It will always do a full 64 bit decomposition as otherwise
+ * more complexity in the last extra pass is required since we previously
+ * reserved 4 instructions for the address.
+ */
 static inline void emit_addr_mov_i64(const int reg, const u64 val,
 				     struct jit_ctx *ctx)
 {
@@ -110,26 +153,6 @@ static inline void emit_addr_mov_i64(const int reg, const u64 val,
 	}
 }
 
-static inline void emit_a64_mov_i(const int is64, const int reg,
-				  const s32 val, struct jit_ctx *ctx)
-{
-	u16 hi = val >> 16;
-	u16 lo = val & 0xffff;
-
-	if (hi & 0x8000) {
-		if (hi == 0xffff) {
-			emit(A64_MOVN(is64, reg, (u16)~lo, 0), ctx);
-		} else {
-			emit(A64_MOVN(is64, reg, (u16)~hi, 16), ctx);
-			emit(A64_MOVK(is64, reg, lo, 0), ctx);
-		}
-	} else {
-		emit(A64_MOVZ(is64, reg, lo, 0), ctx);
-		if (hi)
-			emit(A64_MOVK(is64, reg, hi, 16), ctx);
-	}
-}
-
 static inline int bpf2a64_offset(int bpf_to, int bpf_from,
 				 const struct jit_ctx *ctx)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 56ea6a8b4949c66d550d05e92da8ef4351b90027 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 14 May 2018 23:22:33 +0200
Subject: bpf, arm64: save 4 bytes in prologue when ebpf insns came from cbpf

We can trivially save 4 bytes in prologue for cBPF since tail calls
can never be used from there. The register push/pop is pairwise,
here, x25 (fp) and x26 (tcc), so no point in changing that, only
reset to zero is not needed.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/arm64/net/bpf_jit_comp.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index c8a2620312c0..a6fdaea07c63 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -185,7 +185,7 @@ static inline int epilogue_offset(const struct jit_ctx *ctx)
 /* Tail call offset to jump into */
 #define PROLOGUE_OFFSET 7
 
-static int build_prologue(struct jit_ctx *ctx)
+static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf)
 {
 	const struct bpf_prog *prog = ctx->prog;
 	const u8 r6 = bpf2a64[BPF_REG_6];
@@ -232,14 +232,16 @@ static int build_prologue(struct jit_ctx *ctx)
 	/* Set up BPF prog stack base register */
 	emit(A64_MOV(1, fp, A64_SP), ctx);
 
-	/* Initialize tail_call_cnt */
-	emit(A64_MOVZ(1, tcc, 0, 0), ctx);
+	if (!ebpf_from_cbpf) {
+		/* Initialize tail_call_cnt */
+		emit(A64_MOVZ(1, tcc, 0, 0), ctx);
 
-	cur_offset = ctx->idx - idx0;
-	if (cur_offset != PROLOGUE_OFFSET) {
-		pr_err_once("PROLOGUE_OFFSET = %d, expected %d!\n",
-			    cur_offset, PROLOGUE_OFFSET);
-		return -1;
+		cur_offset = ctx->idx - idx0;
+		if (cur_offset != PROLOGUE_OFFSET) {
+			pr_err_once("PROLOGUE_OFFSET = %d, expected %d!\n",
+				    cur_offset, PROLOGUE_OFFSET);
+			return -1;
+		}
 	}
 
 	ctx->stack_size = STACK_ALIGN(prog->aux->stack_depth);
@@ -806,6 +808,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 	struct bpf_prog *tmp, *orig_prog = prog;
 	struct bpf_binary_header *header;
 	struct arm64_jit_data *jit_data;
+	bool was_classic = bpf_prog_was_classic(prog);
 	bool tmp_blinded = false;
 	bool extra_pass = false;
 	struct jit_ctx ctx;
@@ -860,7 +863,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 		goto out_off;
 	}
 
-	if (build_prologue(&ctx)) {
+	if (build_prologue(&ctx, was_classic)) {
 		prog = orig_prog;
 		goto out_off;
 	}
@@ -883,7 +886,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 skip_init_ctx:
 	ctx.idx = 0;
 
-	build_prologue(&ctx);
+	build_prologue(&ctx, was_classic);
 
 	if (build_body(&ctx)) {
 		bpf_jit_binary_free(header);
-- 
cgit v1.2.3-59-g8ed1b


From a82d8cd398716b41f0fbe3882e3fe3f5ccf9f9cf Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 14 May 2018 23:22:34 +0200
Subject: bpf: add ld64 imm test cases

Add test cases where we combine semi-random imm values, mainly for testing
JITs when they have different encoding options for 64 bit immediates in
order to reduce resulting image size.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/bpf_rand.h      | 80 +++++++++++++++++++++++++++++
 tools/testing/selftests/bpf/test_verifier.c | 62 ++++++++++++++++++++++
 2 files changed, 142 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/bpf_rand.h

diff --git a/tools/testing/selftests/bpf/bpf_rand.h b/tools/testing/selftests/bpf/bpf_rand.h
new file mode 100644
index 000000000000..59bf3e1a9371
--- /dev/null
+++ b/tools/testing/selftests/bpf/bpf_rand.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __BPF_RAND__
+#define __BPF_RAND__
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <time.h>
+
+static inline uint64_t bpf_rand_mask(uint64_t mask)
+{
+	return (((uint64_t)(uint32_t)rand()) |
+	        ((uint64_t)(uint32_t)rand() << 32)) & mask;
+}
+
+#define bpf_rand_ux(x, m)			\
+static inline uint64_t bpf_rand_u##x(int shift)	\
+{						\
+	return bpf_rand_mask((m)) << shift;	\
+}
+
+bpf_rand_ux( 8,               0xffULL)
+bpf_rand_ux(16,             0xffffULL)
+bpf_rand_ux(24,           0xffffffULL)
+bpf_rand_ux(32,         0xffffffffULL)
+bpf_rand_ux(40,       0xffffffffffULL)
+bpf_rand_ux(48,     0xffffffffffffULL)
+bpf_rand_ux(56,   0xffffffffffffffULL)
+bpf_rand_ux(64, 0xffffffffffffffffULL)
+
+static inline void bpf_semi_rand_init(void)
+{
+	srand(time(NULL));
+}
+
+static inline uint64_t bpf_semi_rand_get(void)
+{
+	switch (rand() % 39) {
+	case  0: return 0x000000ff00000000ULL | bpf_rand_u8(0);
+	case  1: return 0xffffffff00000000ULL | bpf_rand_u16(0);
+	case  2: return 0x00000000ffff0000ULL | bpf_rand_u16(0);
+	case  3: return 0x8000000000000000ULL | bpf_rand_u32(0);
+	case  4: return 0x00000000f0000000ULL | bpf_rand_u32(0);
+	case  5: return 0x0000000100000000ULL | bpf_rand_u24(0);
+	case  6: return 0x800ff00000000000ULL | bpf_rand_u32(0);
+	case  7: return 0x7fffffff00000000ULL | bpf_rand_u32(0);
+	case  8: return 0xffffffffffffff00ULL ^ bpf_rand_u32(24);
+	case  9: return 0xffffffffffffff00ULL | bpf_rand_u8(0);
+	case 10: return 0x0000000010000000ULL | bpf_rand_u32(0);
+	case 11: return 0xf000000000000000ULL | bpf_rand_u8(0);
+	case 12: return 0x0000f00000000000ULL | bpf_rand_u8(8);
+	case 13: return 0x000000000f000000ULL | bpf_rand_u8(16);
+	case 14: return 0x0000000000000f00ULL | bpf_rand_u8(32);
+	case 15: return 0x00fff00000000f00ULL | bpf_rand_u8(48);
+	case 16: return 0x00007fffffffffffULL ^ bpf_rand_u32(1);
+	case 17: return 0xffff800000000000ULL | bpf_rand_u8(4);
+	case 18: return 0xffff800000000000ULL | bpf_rand_u8(20);
+	case 19: return (0xffffffc000000000ULL + 0x80000ULL) | bpf_rand_u32(0);
+	case 20: return (0xffffffc000000000ULL - 0x04000000ULL) | bpf_rand_u32(0);
+	case 21: return 0x0000000000000000ULL | bpf_rand_u8(55) | bpf_rand_u32(20);
+	case 22: return 0xffffffffffffffffULL ^ bpf_rand_u8(3) ^ bpf_rand_u32(40);
+	case 23: return 0x0000000000000000ULL | bpf_rand_u8(bpf_rand_u8(0) % 64);
+	case 24: return 0x0000000000000000ULL | bpf_rand_u16(bpf_rand_u8(0) % 64);
+	case 25: return 0xffffffffffffffffULL ^ bpf_rand_u8(bpf_rand_u8(0) % 64);
+	case 26: return 0xffffffffffffffffULL ^ bpf_rand_u40(bpf_rand_u8(0) % 64);
+	case 27: return 0x0000800000000000ULL;
+	case 28: return 0x8000000000000000ULL;
+	case 29: return 0x0000000000000000ULL;
+	case 30: return 0xffffffffffffffffULL;
+	case 31: return bpf_rand_u16(bpf_rand_u8(0) % 64);
+	case 32: return bpf_rand_u24(bpf_rand_u8(0) % 64);
+	case 33: return bpf_rand_u32(bpf_rand_u8(0) % 64);
+	case 34: return bpf_rand_u40(bpf_rand_u8(0) % 64);
+	case 35: return bpf_rand_u48(bpf_rand_u8(0) % 64);
+	case 36: return bpf_rand_u56(bpf_rand_u8(0) % 64);
+	case 37: return bpf_rand_u64(bpf_rand_u8(0) % 64);
+	default: return bpf_rand_u64(0);
+	}
+}
+
+#endif /* __BPF_RAND__ */
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 275b4570b5b8..a877af00605d 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -41,6 +41,7 @@
 # endif
 #endif
 #include "bpf_rlimit.h"
+#include "bpf_rand.h"
 #include "../../../include/linux/filter.h"
 
 #ifndef ARRAY_SIZE
@@ -152,6 +153,30 @@ static void bpf_fill_jump_around_ld_abs(struct bpf_test *self)
 	insn[i] = BPF_EXIT_INSN();
 }
 
+static void bpf_fill_rand_ld_dw(struct bpf_test *self)
+{
+	struct bpf_insn *insn = self->insns;
+	uint64_t res = 0;
+	int i = 0;
+
+	insn[i++] = BPF_MOV32_IMM(BPF_REG_0, 0);
+	while (i < self->retval) {
+		uint64_t val = bpf_semi_rand_get();
+		struct bpf_insn tmp[2] = { BPF_LD_IMM64(BPF_REG_1, val) };
+
+		res ^= val;
+		insn[i++] = tmp[0];
+		insn[i++] = tmp[1];
+		insn[i++] = BPF_ALU64_REG(BPF_XOR, BPF_REG_0, BPF_REG_1);
+	}
+	insn[i++] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_0);
+	insn[i++] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 32);
+	insn[i++] = BPF_ALU64_REG(BPF_XOR, BPF_REG_0, BPF_REG_1);
+	insn[i] = BPF_EXIT_INSN();
+	res ^= (res >> 32);
+	self->retval = (uint32_t)res;
+}
+
 static struct bpf_test tests[] = {
 	{
 		"add+sub+mul",
@@ -11974,6 +11999,42 @@ static struct bpf_test tests[] = {
 		.result = ACCEPT,
 		.retval = 10,
 	},
+	{
+		"ld_dw: xor semi-random 64 bit imms, test 1",
+		.insns = { },
+		.data = { },
+		.fill_helper = bpf_fill_rand_ld_dw,
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 4090,
+	},
+	{
+		"ld_dw: xor semi-random 64 bit imms, test 2",
+		.insns = { },
+		.data = { },
+		.fill_helper = bpf_fill_rand_ld_dw,
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 2047,
+	},
+	{
+		"ld_dw: xor semi-random 64 bit imms, test 3",
+		.insns = { },
+		.data = { },
+		.fill_helper = bpf_fill_rand_ld_dw,
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 511,
+	},
+	{
+		"ld_dw: xor semi-random 64 bit imms, test 4",
+		.insns = { },
+		.data = { },
+		.fill_helper = bpf_fill_rand_ld_dw,
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+		.retval = 5,
+	},
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)
@@ -12346,5 +12407,6 @@ int main(int argc, char **argv)
 		return EXIT_FAILURE;
 	}
 
+	bpf_semi_rand_init();
 	return do_test(unpriv, from, to);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 96faf246d07cb35e7cc2c926b4e32fefab0dd8a6 Mon Sep 17 00:00:00 2001
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Mon, 14 May 2018 10:29:56 +0100
Subject: net: stmmac: Add Jose Abreu as co-maintainer

I'm offering to be a co-maintainer for stmmac driver.

As per discussion with Alexandre, I will arrange to get STM32 boards to
test patches in GMAC version 3.x and 4.1. I also have HW to test GMAC
version 5.

Looking forward to contribute to net-dev!

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Acked-by: Alexandre TORGUE <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index cecf461678b5..c6989d04afef 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13385,6 +13385,7 @@ F:	drivers/media/usb/stk1160/
 STMMAC ETHERNET DRIVER
 M:	Giuseppe Cavallaro <peppe.cavallaro@st.com>
 M:	Alexandre Torgue <alexandre.torgue@st.com>
+M:	Jose Abreu <joabreu@synopsys.com>
 L:	netdev@vger.kernel.org
 W:	http://www.stlinux.com
 S:	Supported
-- 
cgit v1.2.3-59-g8ed1b


From 98f3697f8d4129366b5215e47719581db2d54df2 Mon Sep 17 00:00:00 2001
From: Kumar Sanghvi <kumaras@chelsio.com>
Date: Mon, 14 May 2018 17:51:21 +0530
Subject: cxgb4: add tc flower match support for tunnel VNI

Adds support for matching flows based on tunnel VNI value.
Introduces fw APIs for allocating/removing MPS entries related
to encapsulation. And uses the same while adding/deleting filters
for offloading flows based on tunnel VNI match.

Signed-off-by: Kumar Sanghvi <kumaras@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h         |  9 +++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c  | 91 ++++++++++++++++++++--
 .../net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c   | 18 +++++
 drivers/net/ethernet/chelsio/cxgb4/l2t.c           |  2 +-
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c         | 86 ++++++++++++++++++++
 drivers/net/ethernet/chelsio/cxgb4/t4_regs.h       |  4 +
 drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h      | 34 ++++++++
 7 files changed, 237 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 01e7aad4ce5b..211086bdbe20 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -1038,6 +1038,7 @@ struct ch_sched_queue {
 #define VF_BITWIDTH 8
 #define IVLAN_BITWIDTH 16
 #define OVLAN_BITWIDTH 16
+#define ENCAP_VNI_BITWIDTH 24
 
 /* Filter matching rules.  These consist of a set of ingress packet field
  * (value, mask) tuples.  The associated ingress packet field matches the
@@ -1068,6 +1069,7 @@ struct ch_filter_tuple {
 	uint32_t ivlan_vld:1;                   /* inner VLAN valid */
 	uint32_t ovlan_vld:1;                   /* outer VLAN valid */
 	uint32_t pfvf_vld:1;                    /* PF/VF valid */
+	uint32_t encap_vld:1;			/* Encapsulation valid */
 	uint32_t macidx:MACIDX_BITWIDTH;        /* exact match MAC index */
 	uint32_t fcoe:FCOE_BITWIDTH;            /* FCoE packet */
 	uint32_t iport:IPORT_BITWIDTH;          /* ingress port */
@@ -1078,6 +1080,7 @@ struct ch_filter_tuple {
 	uint32_t vf:VF_BITWIDTH;                /* PCI-E VF ID */
 	uint32_t ivlan:IVLAN_BITWIDTH;          /* inner VLAN */
 	uint32_t ovlan:OVLAN_BITWIDTH;          /* outer VLAN */
+	uint32_t vni:ENCAP_VNI_BITWIDTH;	/* VNI of tunnel */
 
 	/* Uncompressed header matching field rules.  These are always
 	 * available for field rules.
@@ -1694,6 +1697,12 @@ int t4_set_rxmode(struct adapter *adap, unsigned int mbox, unsigned int viid,
 int t4_free_raw_mac_filt(struct adapter *adap, unsigned int viid,
 			 const u8 *addr, const u8 *mask, unsigned int idx,
 			 u8 lookup_type, u8 port_id, bool sleep_ok);
+int t4_free_encap_mac_filt(struct adapter *adap, unsigned int viid, int idx,
+			   bool sleep_ok);
+int t4_alloc_encap_mac_filt(struct adapter *adap, unsigned int viid,
+			    const u8 *addr, const u8 *mask, unsigned int vni,
+			    unsigned int vni_mask, u8 dip_hit, u8 lookup_type,
+			    bool sleep_ok);
 int t4_alloc_raw_mac_filt(struct adapter *adap, unsigned int viid,
 			  const u8 *addr, const u8 *mask, unsigned int idx,
 			  u8 lookup_type, u8 port_id, bool sleep_ok);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
index aae980205ece..ac4d7f75fdc2 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
@@ -265,6 +265,8 @@ static int validate_filter(struct net_device *dev,
 			fs->mask.pfvf_vld) ||
 	    unsupported(fconf, VNIC_ID_F, fs->val.ovlan_vld,
 			fs->mask.ovlan_vld) ||
+	    unsupported(fconf, VNIC_ID_F, fs->val.encap_vld,
+			fs->mask.encap_vld) ||
 	    unsupported(fconf, VLAN_F, fs->val.ivlan_vld, fs->mask.ivlan_vld))
 		return -EOPNOTSUPP;
 
@@ -275,8 +277,12 @@ static int validate_filter(struct net_device *dev,
 	 * carries that overlap, we need to translate any PF/VF
 	 * specification into that internal format below.
 	 */
-	if (is_field_set(fs->val.pfvf_vld, fs->mask.pfvf_vld) &&
-	    is_field_set(fs->val.ovlan_vld, fs->mask.ovlan_vld))
+	if ((is_field_set(fs->val.pfvf_vld, fs->mask.pfvf_vld) &&
+	     is_field_set(fs->val.ovlan_vld, fs->mask.ovlan_vld)) ||
+	    (is_field_set(fs->val.pfvf_vld, fs->mask.pfvf_vld) &&
+	     is_field_set(fs->val.encap_vld, fs->mask.encap_vld)) ||
+	    (is_field_set(fs->val.ovlan_vld, fs->mask.ovlan_vld) &&
+	     is_field_set(fs->val.encap_vld, fs->mask.encap_vld)))
 		return -EOPNOTSUPP;
 	if (unsupported(iconf, VNIC_F, fs->val.pfvf_vld, fs->mask.pfvf_vld) ||
 	    (is_field_set(fs->val.ovlan_vld, fs->mask.ovlan_vld) &&
@@ -306,6 +312,9 @@ static int validate_filter(struct net_device *dev,
 	     fs->newvlan == VLAN_REWRITE))
 		return -EOPNOTSUPP;
 
+	if (fs->val.encap_vld &&
+	    CHELSIO_CHIP_VERSION(adapter->params.chip) < CHELSIO_T6)
+		return -EOPNOTSUPP;
 	return 0;
 }
 
@@ -705,6 +714,8 @@ int delete_filter(struct adapter *adapter, unsigned int fidx)
  */
 void clear_filter(struct adapter *adap, struct filter_entry *f)
 {
+	struct port_info *pi = netdev_priv(f->dev);
+
 	/* If the new or old filter have loopback rewriteing rules then we'll
 	 * need to free any existing L2T, SMT, CLIP entries of filter
 	 * rule.
@@ -715,6 +726,12 @@ void clear_filter(struct adapter *adap, struct filter_entry *f)
 	if (f->smt)
 		cxgb4_smt_release(f->smt);
 
+	if (f->fs.val.encap_vld && f->fs.val.ovlan_vld)
+		if (atomic_dec_and_test(&adap->mps_encap[f->fs.val.ovlan &
+							 0x1ff].refcnt))
+			t4_free_encap_mac_filt(adap, pi->viid,
+					       f->fs.val.ovlan & 0x1ff, 0);
+
 	if ((f->fs.hash || is_t6(adap->params.chip)) && f->fs.type)
 		cxgb4_clip_release(f->dev, (const u32 *)&f->fs.val.lip, 1);
 
@@ -840,6 +857,10 @@ bool is_filter_exact_match(struct adapter *adap,
 	if (!is_hashfilter(adap))
 		return false;
 
+	 /* Keep tunnel VNI match disabled for hash-filters for now */
+	if (fs->mask.encap_vld)
+		return false;
+
 	if (fs->type) {
 		if (is_inaddr_any(fs->val.fip, AF_INET6) ||
 		    !is_addr_all_mask(fs->mask.fip, AF_INET6))
@@ -961,8 +982,12 @@ static u64 hash_filter_ntuple(struct ch_filter_specification *fs,
 		ntuple |= (u64)(fs->val.tos) << tp->tos_shift;
 
 	if (tp->vnic_shift >= 0) {
-		if ((adap->params.tp.ingress_config & VNIC_F) &&
-		    fs->mask.pfvf_vld)
+		if ((adap->params.tp.ingress_config & USE_ENC_IDX_F) &&
+		    fs->mask.encap_vld)
+			ntuple |= (u64)((fs->val.encap_vld << 16) |
+					(fs->val.ovlan)) << tp->vnic_shift;
+		else if ((adap->params.tp.ingress_config & VNIC_F) &&
+			 fs->mask.pfvf_vld)
 			ntuple |= (u64)((fs->val.pfvf_vld << 16) |
 					(fs->val.pf << 13) |
 					(fs->val.vf)) << tp->vnic_shift;
@@ -1076,6 +1101,7 @@ static int cxgb4_set_hash_filter(struct net_device *dev,
 				 struct filter_ctx *ctx)
 {
 	struct adapter *adapter = netdev2adap(dev);
+	struct port_info *pi = netdev_priv(dev);
 	struct tid_info *t = &adapter->tids;
 	struct filter_entry *f;
 	struct sk_buff *skb;
@@ -1142,13 +1168,34 @@ static int cxgb4_set_hash_filter(struct net_device *dev,
 		f->fs.mask.ovlan = (fs->mask.pf << 13) | fs->mask.vf;
 		f->fs.val.ovlan_vld = fs->val.pfvf_vld;
 		f->fs.mask.ovlan_vld = fs->mask.pfvf_vld;
+	} else if (iconf & USE_ENC_IDX_F) {
+		if (f->fs.val.encap_vld) {
+			struct port_info *pi = netdev_priv(f->dev);
+			u8 match_all_mac[] = { 0, 0, 0, 0, 0, 0 };
+
+			/* allocate MPS TCAM entry */
+			ret = t4_alloc_encap_mac_filt(adapter, pi->viid,
+						      match_all_mac,
+						      match_all_mac,
+						      f->fs.val.vni,
+						      f->fs.mask.vni,
+						      0, 1, 1);
+			if (ret < 0)
+				goto free_atid;
+
+			atomic_inc(&adapter->mps_encap[ret].refcnt);
+			f->fs.val.ovlan = ret;
+			f->fs.mask.ovlan = 0xffff;
+			f->fs.val.ovlan_vld = 1;
+			f->fs.mask.ovlan_vld = 1;
+		}
 	}
 
 	size = sizeof(struct cpl_t6_act_open_req);
 	if (f->fs.type) {
 		ret = cxgb4_clip_get(f->dev, (const u32 *)&f->fs.val.lip, 1);
 		if (ret)
-			goto free_atid;
+			goto free_mps;
 
 		skb = alloc_skb(size, GFP_KERNEL);
 		if (!skb) {
@@ -1163,7 +1210,7 @@ static int cxgb4_set_hash_filter(struct net_device *dev,
 		skb = alloc_skb(size, GFP_KERNEL);
 		if (!skb) {
 			ret = -ENOMEM;
-			goto free_atid;
+			goto free_mps;
 		}
 
 		mk_act_open_req(f, skb,
@@ -1179,6 +1226,10 @@ static int cxgb4_set_hash_filter(struct net_device *dev,
 free_clip:
 	cxgb4_clip_release(f->dev, (const u32 *)&f->fs.val.lip, 1);
 
+free_mps:
+	if (f->fs.val.encap_vld && f->fs.val.ovlan_vld)
+		t4_free_encap_mac_filt(adapter, pi->viid, f->fs.val.ovlan, 1);
+
 free_atid:
 	cxgb4_free_atid(t, atid);
 
@@ -1360,6 +1411,27 @@ int __cxgb4_set_filter(struct net_device *dev, int filter_id,
 		f->fs.mask.ovlan = (fs->mask.pf << 13) | fs->mask.vf;
 		f->fs.val.ovlan_vld = fs->val.pfvf_vld;
 		f->fs.mask.ovlan_vld = fs->mask.pfvf_vld;
+	} else if (iconf & USE_ENC_IDX_F) {
+		if (f->fs.val.encap_vld) {
+			struct port_info *pi = netdev_priv(f->dev);
+			u8 match_all_mac[] = { 0, 0, 0, 0, 0, 0 };
+
+			/* allocate MPS TCAM entry */
+			ret = t4_alloc_encap_mac_filt(adapter, pi->viid,
+						      match_all_mac,
+						      match_all_mac,
+						      f->fs.val.vni,
+						      f->fs.mask.vni,
+						      0, 1, 1);
+			if (ret < 0)
+				goto free_clip;
+
+			atomic_inc(&adapter->mps_encap[ret].refcnt);
+			f->fs.val.ovlan = ret;
+			f->fs.mask.ovlan = 0x1ff;
+			f->fs.val.ovlan_vld = 1;
+			f->fs.mask.ovlan_vld = 1;
+		}
 	}
 
 	/* Attempt to set the filter.  If we don't succeed, we clear
@@ -1376,6 +1448,13 @@ int __cxgb4_set_filter(struct net_device *dev, int filter_id,
 	}
 
 	return ret;
+
+free_clip:
+	if (is_t6(adapter->params.chip) && f->fs.type)
+		cxgb4_clip_release(f->dev, (const u32 *)&f->fs.val.lip, 1);
+	cxgb4_clear_ftid(&adapter->tids, filter_id,
+			 fs->type ? PF_INET6 : PF_INET, chip_ver);
+	return ret;
 }
 
 static int cxgb4_del_hash_filter(struct net_device *dev, int filter_id,
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
index 36563364bae7..3ddd2c4acf68 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
@@ -194,6 +194,23 @@ static void cxgb4_process_flow_match(struct net_device *dev,
 		fs->mask.tos = mask->tos;
 	}
 
+	if (dissector_uses_key(cls->dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) {
+		struct flow_dissector_key_keyid *key, *mask;
+
+		key = skb_flow_dissector_target(cls->dissector,
+						FLOW_DISSECTOR_KEY_ENC_KEYID,
+						cls->key);
+		mask = skb_flow_dissector_target(cls->dissector,
+						 FLOW_DISSECTOR_KEY_ENC_KEYID,
+						 cls->mask);
+		fs->val.vni = be32_to_cpu(key->keyid);
+		fs->mask.vni = be32_to_cpu(mask->keyid);
+		if (fs->mask.vni) {
+			fs->val.encap_vld = 1;
+			fs->mask.encap_vld = 1;
+		}
+	}
+
 	if (dissector_uses_key(cls->dissector, FLOW_DISSECTOR_KEY_VLAN)) {
 		struct flow_dissector_key_vlan *key, *mask;
 		u16 vlan_tci, vlan_tci_mask;
@@ -247,6 +264,7 @@ static int cxgb4_validate_flow_match(struct net_device *dev,
 	      BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) |
 	      BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) |
 	      BIT(FLOW_DISSECTOR_KEY_PORTS) |
+	      BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) |
 	      BIT(FLOW_DISSECTOR_KEY_VLAN) |
 	      BIT(FLOW_DISSECTOR_KEY_IP))) {
 		netdev_warn(dev, "Unsupported key used: 0x%x\n",
diff --git a/drivers/net/ethernet/chelsio/cxgb4/l2t.c b/drivers/net/ethernet/chelsio/cxgb4/l2t.c
index 1817a0307d26..77c2c538b1fd 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/l2t.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/l2t.c
@@ -491,7 +491,7 @@ u64 cxgb4_select_ntuple(struct net_device *dev,
 	if (tp->protocol_shift >= 0)
 		ntuple |= (u64)IPPROTO_TCP << tp->protocol_shift;
 
-	if (tp->vnic_shift >= 0) {
+	if (tp->vnic_shift >= 0 && (tp->ingress_config & VNIC_F)) {
 		u32 viid = cxgb4_port_viid(dev);
 		u32 vf = FW_VIID_VIN_G(viid);
 		u32 pf = FW_VIID_PFN_G(viid);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index 7cb3ef466cc7..df5e7c79223b 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -7512,6 +7512,43 @@ int t4_set_rxmode(struct adapter *adap, unsigned int mbox, unsigned int viid,
 	return t4_wr_mbox_meat(adap, mbox, &c, sizeof(c), NULL, sleep_ok);
 }
 
+/**
+ *      t4_free_encap_mac_filt - frees MPS entry at given index
+ *      @adap: the adapter
+ *      @viid: the VI id
+ *      @idx: index of MPS entry to be freed
+ *      @sleep_ok: call is allowed to sleep
+ *
+ *      Frees the MPS entry at supplied index
+ *
+ *      Returns a negative error number or zero on success
+ */
+int t4_free_encap_mac_filt(struct adapter *adap, unsigned int viid,
+			   int idx, bool sleep_ok)
+{
+	struct fw_vi_mac_exact *p;
+	u8 addr[] = {0, 0, 0, 0, 0, 0};
+	struct fw_vi_mac_cmd c;
+	int ret = 0;
+	u32 exact;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_viid = cpu_to_be32(FW_CMD_OP_V(FW_VI_MAC_CMD) |
+				   FW_CMD_REQUEST_F | FW_CMD_WRITE_F |
+				   FW_CMD_EXEC_V(0) |
+				   FW_VI_MAC_CMD_VIID_V(viid));
+	exact = FW_VI_MAC_CMD_ENTRY_TYPE_V(FW_VI_MAC_TYPE_EXACTMAC);
+	c.freemacs_to_len16 = cpu_to_be32(FW_VI_MAC_CMD_FREEMACS_V(0) |
+					  exact |
+					  FW_CMD_LEN16_V(1));
+	p = c.u.exact;
+	p->valid_to_idx = cpu_to_be16(FW_VI_MAC_CMD_VALID_F |
+				      FW_VI_MAC_CMD_IDX_V(idx));
+	memcpy(p->macaddr, addr, sizeof(p->macaddr));
+	ret = t4_wr_mbox_meat(adap, adap->mbox, &c, sizeof(c), &c, sleep_ok);
+	return ret;
+}
+
 /**
  *	t4_free_raw_mac_filt - Frees a raw mac entry in mps tcam
  *	@adap: the adapter
@@ -7562,6 +7599,55 @@ int t4_free_raw_mac_filt(struct adapter *adap, unsigned int viid,
 	return t4_wr_mbox_meat(adap, adap->mbox, &c, sizeof(c), &c, sleep_ok);
 }
 
+/**
+ *      t4_alloc_encap_mac_filt - Adds a mac entry in mps tcam with VNI support
+ *      @adap: the adapter
+ *      @viid: the VI id
+ *      @mac: the MAC address
+ *      @mask: the mask
+ *      @vni: the VNI id for the tunnel protocol
+ *      @vni_mask: mask for the VNI id
+ *      @dip_hit: to enable DIP match for the MPS entry
+ *      @lookup_type: MAC address for inner (1) or outer (0) header
+ *      @sleep_ok: call is allowed to sleep
+ *
+ *      Allocates an MPS entry with specified MAC address and VNI value.
+ *
+ *      Returns a negative error number or the allocated index for this mac.
+ */
+int t4_alloc_encap_mac_filt(struct adapter *adap, unsigned int viid,
+			    const u8 *addr, const u8 *mask, unsigned int vni,
+			    unsigned int vni_mask, u8 dip_hit, u8 lookup_type,
+			    bool sleep_ok)
+{
+	struct fw_vi_mac_cmd c;
+	struct fw_vi_mac_vni *p = c.u.exact_vni;
+	int ret = 0;
+	u32 val;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_viid = cpu_to_be32(FW_CMD_OP_V(FW_VI_MAC_CMD) |
+				   FW_CMD_REQUEST_F | FW_CMD_WRITE_F |
+				   FW_VI_MAC_CMD_VIID_V(viid));
+	val = FW_CMD_LEN16_V(1) |
+	      FW_VI_MAC_CMD_ENTRY_TYPE_V(FW_VI_MAC_TYPE_EXACTMAC_VNI);
+	c.freemacs_to_len16 = cpu_to_be32(val);
+	p->valid_to_idx = cpu_to_be16(FW_VI_MAC_CMD_VALID_F |
+				      FW_VI_MAC_CMD_IDX_V(FW_VI_MAC_ADD_MAC));
+	memcpy(p->macaddr, addr, sizeof(p->macaddr));
+	memcpy(p->macaddr_mask, mask, sizeof(p->macaddr_mask));
+
+	p->lookup_type_to_vni =
+		cpu_to_be32(FW_VI_MAC_CMD_VNI_V(vni) |
+			    FW_VI_MAC_CMD_DIP_HIT_V(dip_hit) |
+			    FW_VI_MAC_CMD_LOOKUP_TYPE_V(lookup_type));
+	p->vni_mask_pkd = cpu_to_be32(FW_VI_MAC_CMD_VNI_MASK_V(vni_mask));
+	ret = t4_wr_mbox_meat(adap, adap->mbox, &c, sizeof(c), &c, sleep_ok);
+	if (ret == 0)
+		ret = FW_VI_MAC_CMD_IDX_G(be16_to_cpu(p->valid_to_idx));
+	return ret;
+}
+
 /**
  *	t4_alloc_raw_mac_filt - Adds a mac entry in mps tcam
  *	@adap: the adapter
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
index 276fdf214b75..843f8cada1de 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
@@ -1598,6 +1598,10 @@
 #define VNIC_V(x) ((x) << VNIC_S)
 #define VNIC_F    VNIC_V(1U)
 
+#define USE_ENC_IDX_S		13
+#define USE_ENC_IDX_V(x)	((x) << USE_ENC_IDX_S)
+#define USE_ENC_IDX_F		USE_ENC_IDX_V(1U)
+
 #define CSUM_HAS_PSEUDO_HDR_S    10
 #define CSUM_HAS_PSEUDO_HDR_V(x) ((x) << CSUM_HAS_PSEUDO_HDR_S)
 #define CSUM_HAS_PSEUDO_HDR_F    CSUM_HAS_PSEUDO_HDR_V(1U)
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
index 0e007ee72318..e6b2e9549d56 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
@@ -2158,6 +2158,14 @@ struct fw_vi_mac_cmd {
 			__be64 data0m_pkd;
 			__be32 data1m[2];
 		} raw;
+		struct fw_vi_mac_vni {
+			__be16 valid_to_idx;
+			__u8 macaddr[6];
+			__be16 r7;
+			__u8 macaddr_mask[6];
+			__be32 lookup_type_to_vni;
+			__be32 vni_mask_pkd;
+		} exact_vni[2];
 	} u;
 };
 
@@ -2205,6 +2213,32 @@ struct fw_vi_mac_cmd {
 #define FW_VI_MAC_CMD_RAW_IDX_G(x)      \
 	(((x) >> FW_VI_MAC_CMD_RAW_IDX_S) & FW_VI_MAC_CMD_RAW_IDX_M)
 
+#define FW_VI_MAC_CMD_LOOKUP_TYPE_S	31
+#define FW_VI_MAC_CMD_LOOKUP_TYPE_M	0x1
+#define FW_VI_MAC_CMD_LOOKUP_TYPE_V(x)	((x) << FW_VI_MAC_CMD_LOOKUP_TYPE_S)
+#define FW_VI_MAC_CMD_LOOKUP_TYPE_G(x)	\
+	(((x) >> FW_VI_MAC_CMD_LOOKUP_TYPE_S) & FW_VI_MAC_CMD_LOOKUP_TYPE_M)
+#define FW_VI_MAC_CMD_LOOKUP_TYPE_F	FW_VI_MAC_CMD_LOOKUP_TYPE_V(1U)
+
+#define FW_VI_MAC_CMD_DIP_HIT_S		30
+#define FW_VI_MAC_CMD_DIP_HIT_M		0x1
+#define FW_VI_MAC_CMD_DIP_HIT_V(x)	((x) << FW_VI_MAC_CMD_DIP_HIT_S)
+#define FW_VI_MAC_CMD_DIP_HIT_G(x)	\
+	(((x) >> FW_VI_MAC_CMD_DIP_HIT_S) & FW_VI_MAC_CMD_DIP_HIT_M)
+#define FW_VI_MAC_CMD_DIP_HIT_F		FW_VI_MAC_CMD_DIP_HIT_V(1U)
+
+#define FW_VI_MAC_CMD_VNI_S		0
+#define FW_VI_MAC_CMD_VNI_M		0xffffff
+#define FW_VI_MAC_CMD_VNI_V(x)		((x) << FW_VI_MAC_CMD_VNI_S)
+#define FW_VI_MAC_CMD_VNI_G(x)		\
+	(((x) >> FW_VI_MAC_CMD_VNI_S) & FW_VI_MAC_CMD_VNI_M)
+
+#define FW_VI_MAC_CMD_VNI_MASK_S	0
+#define FW_VI_MAC_CMD_VNI_MASK_M	0xffffff
+#define FW_VI_MAC_CMD_VNI_MASK_V(x)	((x) << FW_VI_MAC_CMD_VNI_MASK_S)
+#define FW_VI_MAC_CMD_VNI_MASK_G(x)	\
+	(((x) >> FW_VI_MAC_CMD_VNI_MASK_S) & FW_VI_MAC_CMD_VNI_MASK_M)
+
 #define FW_RXMODE_MTU_NO_CHG	65535
 
 struct fw_vi_rxmode_cmd {
-- 
cgit v1.2.3-59-g8ed1b


From b9fd683982c9d190cbccd8a32d885bf84bb4a12d Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Mon, 14 May 2018 14:34:36 -0300
Subject: sctp: add sctp_packet_singleton

Factor out the code for generating singletons. It's used only once, but
helps to keep the context contained.

The const variables are to ease the reading of subsequent calls in there.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/outqueue.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index dee7cbd54831..300bd0dfc7c1 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -776,6 +776,20 @@ void sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp)
 	sctp_outq_flush(q, 0, gfp);
 }
 
+static int sctp_packet_singleton(struct sctp_transport *transport,
+				 struct sctp_chunk *chunk, gfp_t gfp)
+{
+	const struct sctp_association *asoc = transport->asoc;
+	const __u16 sport = asoc->base.bind_addr.port;
+	const __u16 dport = asoc->peer.port;
+	const __u32 vtag = asoc->peer.i.init_tag;
+	struct sctp_packet singleton;
+
+	sctp_packet_init(&singleton, transport, sport, dport);
+	sctp_packet_config(&singleton, vtag, 0);
+	sctp_packet_append_chunk(&singleton, chunk);
+	return sctp_packet_transmit(&singleton, gfp);
+}
 
 /*
  * Try to flush an outqueue.
@@ -789,10 +803,7 @@ void sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp)
 static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 {
 	struct sctp_packet *packet;
-	struct sctp_packet singleton;
 	struct sctp_association *asoc = q->asoc;
-	__u16 sport = asoc->base.bind_addr.port;
-	__u16 dport = asoc->peer.port;
 	__u32 vtag = asoc->peer.i.init_tag;
 	struct sctp_transport *transport = NULL;
 	struct sctp_transport *new_transport;
@@ -905,10 +916,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 		case SCTP_CID_INIT:
 		case SCTP_CID_INIT_ACK:
 		case SCTP_CID_SHUTDOWN_COMPLETE:
-			sctp_packet_init(&singleton, transport, sport, dport);
-			sctp_packet_config(&singleton, vtag, 0);
-			sctp_packet_append_chunk(&singleton, chunk);
-			error = sctp_packet_transmit(&singleton, gfp);
+			error = sctp_packet_singleton(transport, chunk, gfp);
 			if (error < 0) {
 				asoc->base.sk->sk_err = -error;
 				return;
-- 
cgit v1.2.3-59-g8ed1b


From 0d634b0c94c9d8298f24a58e4af56c85dc06bb35 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Mon, 14 May 2018 14:34:37 -0300
Subject: sctp: factor out sctp_outq_select_transport

We had two spots doing such complex operation and they were very close to
each other, a bit more tailored to here or there.

This patch unifies these under the same function,
sctp_outq_select_transport, which knows how to handle control chunks and
original transmissions (but not retransmissions).

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/outqueue.c | 187 +++++++++++++++++++++++++---------------------------
 1 file changed, 90 insertions(+), 97 deletions(-)

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 300bd0dfc7c1..bda50596d4bf 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -791,6 +791,90 @@ static int sctp_packet_singleton(struct sctp_transport *transport,
 	return sctp_packet_transmit(&singleton, gfp);
 }
 
+static bool sctp_outq_select_transport(struct sctp_chunk *chunk,
+				       struct sctp_association *asoc,
+				       struct sctp_transport **transport,
+				       struct list_head *transport_list)
+{
+	struct sctp_transport *new_transport = chunk->transport;
+	struct sctp_transport *curr = *transport;
+	bool changed = false;
+
+	if (!new_transport) {
+		if (!sctp_chunk_is_data(chunk)) {
+			/*
+			 * If we have a prior transport pointer, see if
+			 * the destination address of the chunk
+			 * matches the destination address of the
+			 * current transport.  If not a match, then
+			 * try to look up the transport with a given
+			 * destination address.  We do this because
+			 * after processing ASCONFs, we may have new
+			 * transports created.
+			 */
+			if (curr && sctp_cmp_addr_exact(&chunk->dest,
+							&curr->ipaddr))
+				new_transport = curr;
+			else
+				new_transport = sctp_assoc_lookup_paddr(asoc,
+								  &chunk->dest);
+		}
+
+		/* if we still don't have a new transport, then
+		 * use the current active path.
+		 */
+		if (!new_transport)
+			new_transport = asoc->peer.active_path;
+	} else {
+		__u8 type;
+
+		switch (new_transport->state) {
+		case SCTP_INACTIVE:
+		case SCTP_UNCONFIRMED:
+		case SCTP_PF:
+			/* If the chunk is Heartbeat or Heartbeat Ack,
+			 * send it to chunk->transport, even if it's
+			 * inactive.
+			 *
+			 * 3.3.6 Heartbeat Acknowledgement:
+			 * ...
+			 * A HEARTBEAT ACK is always sent to the source IP
+			 * address of the IP datagram containing the
+			 * HEARTBEAT chunk to which this ack is responding.
+			 * ...
+			 *
+			 * ASCONF_ACKs also must be sent to the source.
+			 */
+			type = chunk->chunk_hdr->type;
+			if (type != SCTP_CID_HEARTBEAT &&
+			    type != SCTP_CID_HEARTBEAT_ACK &&
+			    type != SCTP_CID_ASCONF_ACK)
+				new_transport = asoc->peer.active_path;
+			break;
+		default:
+			break;
+		}
+	}
+
+	/* Are we switching transports? Take care of transport locks. */
+	if (new_transport != curr) {
+		changed = true;
+		curr = new_transport;
+		*transport = curr;
+		if (list_empty(&curr->send_ready))
+			list_add_tail(&curr->send_ready, transport_list);
+
+		sctp_packet_config(&curr->packet, asoc->peer.i.init_tag,
+				   asoc->peer.ecn_capable);
+		/* We've switched transports, so apply the
+		 * Burst limit to the new transport.
+		 */
+		sctp_transport_burst_limited(curr);
+	}
+
+	return changed;
+}
+
 /*
  * Try to flush an outqueue.
  *
@@ -806,7 +890,6 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 	struct sctp_association *asoc = q->asoc;
 	__u32 vtag = asoc->peer.i.init_tag;
 	struct sctp_transport *transport = NULL;
-	struct sctp_transport *new_transport;
 	struct sctp_chunk *chunk, *tmp;
 	enum sctp_xmit status;
 	int error = 0;
@@ -843,68 +926,12 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 
 		list_del_init(&chunk->list);
 
-		/* Pick the right transport to use. */
-		new_transport = chunk->transport;
-
-		if (!new_transport) {
-			/*
-			 * If we have a prior transport pointer, see if
-			 * the destination address of the chunk
-			 * matches the destination address of the
-			 * current transport.  If not a match, then
-			 * try to look up the transport with a given
-			 * destination address.  We do this because
-			 * after processing ASCONFs, we may have new
-			 * transports created.
-			 */
-			if (transport &&
-			    sctp_cmp_addr_exact(&chunk->dest,
-						&transport->ipaddr))
-					new_transport = transport;
-			else
-				new_transport = sctp_assoc_lookup_paddr(asoc,
-								&chunk->dest);
-
-			/* if we still don't have a new transport, then
-			 * use the current active path.
-			 */
-			if (!new_transport)
-				new_transport = asoc->peer.active_path;
-		} else if ((new_transport->state == SCTP_INACTIVE) ||
-			   (new_transport->state == SCTP_UNCONFIRMED) ||
-			   (new_transport->state == SCTP_PF)) {
-			/* If the chunk is Heartbeat or Heartbeat Ack,
-			 * send it to chunk->transport, even if it's
-			 * inactive.
-			 *
-			 * 3.3.6 Heartbeat Acknowledgement:
-			 * ...
-			 * A HEARTBEAT ACK is always sent to the source IP
-			 * address of the IP datagram containing the
-			 * HEARTBEAT chunk to which this ack is responding.
-			 * ...
-			 *
-			 * ASCONF_ACKs also must be sent to the source.
-			 */
-			if (chunk->chunk_hdr->type != SCTP_CID_HEARTBEAT &&
-			    chunk->chunk_hdr->type != SCTP_CID_HEARTBEAT_ACK &&
-			    chunk->chunk_hdr->type != SCTP_CID_ASCONF_ACK)
-				new_transport = asoc->peer.active_path;
-		}
-
-		/* Are we switching transports?
-		 * Take care of transport locks.
+		/* Pick the right transport to use. Should always be true for
+		 * the first chunk as we don't have a transport by then.
 		 */
-		if (new_transport != transport) {
-			transport = new_transport;
-			if (list_empty(&transport->send_ready)) {
-				list_add_tail(&transport->send_ready,
-					      &transport_list);
-			}
+		if (sctp_outq_select_transport(chunk, asoc, &transport,
+					       &transport_list))
 			packet = &transport->packet;
-			sctp_packet_config(packet, vtag,
-					   asoc->peer.ecn_capable);
-		}
 
 		switch (chunk->chunk_hdr->type) {
 		/*
@@ -1072,43 +1099,9 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 				goto sctp_flush_out;
 			}
 
-			/* If there is a specified transport, use it.
-			 * Otherwise, we want to use the active path.
-			 */
-			new_transport = chunk->transport;
-			if (!new_transport ||
-			    ((new_transport->state == SCTP_INACTIVE) ||
-			     (new_transport->state == SCTP_UNCONFIRMED) ||
-			     (new_transport->state == SCTP_PF)))
-				new_transport = asoc->peer.active_path;
-			if (new_transport->state == SCTP_UNCONFIRMED) {
-				WARN_ONCE(1, "Attempt to send packet on unconfirmed path.");
-				sctp_sched_dequeue_done(q, chunk);
-				sctp_chunk_fail(chunk, 0);
-				sctp_chunk_free(chunk);
-				continue;
-			}
-
-			/* Change packets if necessary.  */
-			if (new_transport != transport) {
-				transport = new_transport;
-
-				/* Schedule to have this transport's
-				 * packet flushed.
-				 */
-				if (list_empty(&transport->send_ready)) {
-					list_add_tail(&transport->send_ready,
-						      &transport_list);
-				}
-
+			if (sctp_outq_select_transport(chunk, asoc, &transport,
+						       &transport_list))
 				packet = &transport->packet;
-				sctp_packet_config(packet, vtag,
-						   asoc->peer.ecn_capable);
-				/* We've switched transports, so apply the
-				 * Burst limit to the new transport.
-				 */
-				sctp_transport_burst_limited(transport);
-			}
 
 			pr_debug("%s: outq:%p, chunk:%p[%s], tx-tsn:0x%x skb->head:%p "
 				 "skb->users:%d\n",
-- 
cgit v1.2.3-59-g8ed1b


From 7a0b9df65a9e291b614850891af485d1121ef54c Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Mon, 14 May 2018 14:34:38 -0300
Subject: sctp: move the flush of ctrl chunks into its own function

Named sctp_outq_flush_ctrl and, with that, keep the contexts contained.

One small fix embedded is the reset of one_packet at every iteration.
This allows bundling of some control chunks in case they were preceeded by
another control chunk that cannot be bundled.

Other than this, it has the same behavior.

Changes since v2:
- Fixed panic reported by kbuild test robot if building with
  only up to this patch applied, due to bad parameter to
  sctp_outq_select_transport and by not initializing packet after
  calling sctp_outq_flush_ctrl.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/outqueue.c | 92 ++++++++++++++++++++++++++++++++---------------------
 1 file changed, 56 insertions(+), 36 deletions(-)

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index bda50596d4bf..92f14f51edf2 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -875,45 +875,21 @@ static bool sctp_outq_select_transport(struct sctp_chunk *chunk,
 	return changed;
 }
 
-/*
- * Try to flush an outqueue.
- *
- * Description: Send everything in q which we legally can, subject to
- * congestion limitations.
- * * Note: This function can be called from multiple contexts so appropriate
- * locking concerns must be made.  Today we use the sock lock to protect
- * this function.
- */
-static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
+static void sctp_outq_flush_ctrl(struct sctp_outq *q,
+				 struct sctp_transport **_transport,
+				 struct list_head *transport_list,
+				 gfp_t gfp)
 {
-	struct sctp_packet *packet;
+	struct sctp_transport *transport = *_transport;
 	struct sctp_association *asoc = q->asoc;
-	__u32 vtag = asoc->peer.i.init_tag;
-	struct sctp_transport *transport = NULL;
+	struct sctp_packet *packet = NULL;
 	struct sctp_chunk *chunk, *tmp;
 	enum sctp_xmit status;
-	int error = 0;
-	int start_timer = 0;
-	int one_packet = 0;
-
-	/* These transports have chunks to send. */
-	struct list_head transport_list;
-	struct list_head *ltransport;
-
-	INIT_LIST_HEAD(&transport_list);
-	packet = NULL;
-
-	/*
-	 * 6.10 Bundling
-	 *   ...
-	 *   When bundling control chunks with DATA chunks, an
-	 *   endpoint MUST place control chunks first in the outbound
-	 *   SCTP packet.  The transmitter MUST transmit DATA chunks
-	 *   within a SCTP packet in increasing order of TSN.
-	 *   ...
-	 */
+	int one_packet, error;
 
 	list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) {
+		one_packet = 0;
+
 		/* RFC 5061, 5.3
 		 * F1) This means that until such time as the ASCONF
 		 * containing the add is acknowledged, the sender MUST
@@ -929,9 +905,11 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 		/* Pick the right transport to use. Should always be true for
 		 * the first chunk as we don't have a transport by then.
 		 */
-		if (sctp_outq_select_transport(chunk, asoc, &transport,
-					       &transport_list))
+		if (sctp_outq_select_transport(chunk, asoc, _transport,
+					       transport_list)) {
+			transport = *_transport;
 			packet = &transport->packet;
+		}
 
 		switch (chunk->chunk_hdr->type) {
 		/*
@@ -954,6 +932,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 			if (sctp_test_T_bit(chunk))
 				packet->vtag = asoc->c.my_vtag;
 			/* fallthru */
+
 		/* The following chunks are "response" chunks, i.e.
 		 * they are generated in response to something we
 		 * received.  If we are sending these, then we can
@@ -979,7 +958,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 		case SCTP_CID_RECONF:
 			status = sctp_packet_transmit_chunk(packet, chunk,
 							    one_packet, gfp);
-			if (status  != SCTP_XMIT_OK) {
+			if (status != SCTP_XMIT_OK) {
 				/* put the chunk back */
 				list_add(&chunk->list, &q->control_chunk_list);
 				break;
@@ -1006,6 +985,47 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 			BUG();
 		}
 	}
+}
+
+/*
+ * Try to flush an outqueue.
+ *
+ * Description: Send everything in q which we legally can, subject to
+ * congestion limitations.
+ * * Note: This function can be called from multiple contexts so appropriate
+ * locking concerns must be made.  Today we use the sock lock to protect
+ * this function.
+ */
+static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
+{
+	struct sctp_packet *packet;
+	struct sctp_association *asoc = q->asoc;
+	__u32 vtag = asoc->peer.i.init_tag;
+	struct sctp_transport *transport = NULL;
+	struct sctp_chunk *chunk;
+	enum sctp_xmit status;
+	int error = 0;
+	int start_timer = 0;
+
+	/* These transports have chunks to send. */
+	struct list_head transport_list;
+	struct list_head *ltransport;
+
+	INIT_LIST_HEAD(&transport_list);
+	packet = NULL;
+
+	/*
+	 * 6.10 Bundling
+	 *   ...
+	 *   When bundling control chunks with DATA chunks, an
+	 *   endpoint MUST place control chunks first in the outbound
+	 *   SCTP packet.  The transmitter MUST transmit DATA chunks
+	 *   within a SCTP packet in increasing order of TSN.
+	 *   ...
+	 */
+
+	sctp_outq_flush_ctrl(q, &transport, &transport_list, gfp);
+	packet = &transport->packet;
 
 	if (q->asoc->src_out_of_asoc_ok)
 		goto sctp_flush_out;
-- 
cgit v1.2.3-59-g8ed1b


From 96e0418e812e9c30211b55ffcddc5f03bfd96919 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Mon, 14 May 2018 14:34:39 -0300
Subject: sctp: move outq data rtx code out of sctp_outq_flush

This patch renames current sctp_outq_flush_rtx to __sctp_outq_flush_rtx
and create a new sctp_outq_flush_rtx, with the code that was on
sctp_outq_flush. Again, the idea is to have functions with small and
defined objectives.

Yes, there is an open-coded path selection in the now sctp_outq_flush_rtx.
That is kept as is for now because it may be very different when we
implement retransmission path selection algorithms for CMT-SCTP.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/outqueue.c | 101 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 58 insertions(+), 43 deletions(-)

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 92f14f51edf2..49e80bf2ade7 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -601,14 +601,14 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
 
 /*
  * Transmit DATA chunks on the retransmit queue.  Upon return from
- * sctp_outq_flush_rtx() the packet 'pkt' may contain chunks which
+ * __sctp_outq_flush_rtx() the packet 'pkt' may contain chunks which
  * need to be transmitted by the caller.
  * We assume that pkt->transport has already been set.
  *
  * The return value is a normal kernel error return value.
  */
-static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
-			       int rtx_timeout, int *start_timer)
+static int __sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
+				 int rtx_timeout, int *start_timer)
 {
 	struct sctp_transport *transport = pkt->transport;
 	struct sctp_chunk *chunk, *chunk1;
@@ -987,6 +987,57 @@ static void sctp_outq_flush_ctrl(struct sctp_outq *q,
 	}
 }
 
+/* Returns false if new data shouldn't be sent */
+static bool sctp_outq_flush_rtx(struct sctp_outq *q,
+				struct sctp_transport **_transport,
+				struct list_head *transport_list,
+				int rtx_timeout)
+{
+	struct sctp_transport *transport = *_transport;
+	struct sctp_packet *packet = transport ? &transport->packet : NULL;
+	struct sctp_association *asoc = q->asoc;
+	int error, start_timer = 0;
+
+	if (asoc->peer.retran_path->state == SCTP_UNCONFIRMED)
+		return false;
+
+	if (transport != asoc->peer.retran_path) {
+		/* Switch transports & prepare the packet.  */
+		transport = asoc->peer.retran_path;
+		*_transport = transport;
+
+		if (list_empty(&transport->send_ready))
+			list_add_tail(&transport->send_ready,
+				      transport_list);
+
+		packet = &transport->packet;
+		sctp_packet_config(packet, asoc->peer.i.init_tag,
+				   asoc->peer.ecn_capable);
+	}
+
+	error = __sctp_outq_flush_rtx(q, packet, rtx_timeout, &start_timer);
+	if (error < 0)
+		asoc->base.sk->sk_err = -error;
+
+	if (start_timer) {
+		sctp_transport_reset_t3_rtx(transport);
+		transport->last_time_sent = jiffies;
+	}
+
+	/* This can happen on COOKIE-ECHO resend.  Only
+	 * one chunk can get bundled with a COOKIE-ECHO.
+	 */
+	if (packet->has_cookie_echo)
+		return false;
+
+	/* Don't send new data if there is still data
+	 * waiting to retransmit.
+	 */
+	if (!list_empty(&q->retransmit))
+		return false;
+
+	return true;
+}
 /*
  * Try to flush an outqueue.
  *
@@ -1000,12 +1051,10 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 {
 	struct sctp_packet *packet;
 	struct sctp_association *asoc = q->asoc;
-	__u32 vtag = asoc->peer.i.init_tag;
 	struct sctp_transport *transport = NULL;
 	struct sctp_chunk *chunk;
 	enum sctp_xmit status;
 	int error = 0;
-	int start_timer = 0;
 
 	/* These transports have chunks to send. */
 	struct list_head transport_list;
@@ -1053,45 +1102,11 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 		 * current cwnd).
 		 */
 		if (!list_empty(&q->retransmit)) {
-			if (asoc->peer.retran_path->state == SCTP_UNCONFIRMED)
-				goto sctp_flush_out;
-			if (transport == asoc->peer.retran_path)
-				goto retran;
-
-			/* Switch transports & prepare the packet.  */
-
-			transport = asoc->peer.retran_path;
-
-			if (list_empty(&transport->send_ready)) {
-				list_add_tail(&transport->send_ready,
-					      &transport_list);
-			}
-
+			if (!sctp_outq_flush_rtx(q, &transport, &transport_list,
+						 rtx_timeout))
+				break;
+			/* We may have switched current transport */
 			packet = &transport->packet;
-			sctp_packet_config(packet, vtag,
-					   asoc->peer.ecn_capable);
-		retran:
-			error = sctp_outq_flush_rtx(q, packet,
-						    rtx_timeout, &start_timer);
-			if (error < 0)
-				asoc->base.sk->sk_err = -error;
-
-			if (start_timer) {
-				sctp_transport_reset_t3_rtx(transport);
-				transport->last_time_sent = jiffies;
-			}
-
-			/* This can happen on COOKIE-ECHO resend.  Only
-			 * one chunk can get bundled with a COOKIE-ECHO.
-			 */
-			if (packet->has_cookie_echo)
-				goto sctp_flush_out;
-
-			/* Don't send new data if there is still data
-			 * waiting to retransmit.
-			 */
-			if (!list_empty(&q->retransmit))
-				goto sctp_flush_out;
 		}
 
 		/* Apply Max.Burst limitation to the current transport in
-- 
cgit v1.2.3-59-g8ed1b


From cb93cc5d06d9b1016326376a4980d11a9040afd2 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Mon, 14 May 2018 14:34:40 -0300
Subject: sctp: move flushing of data chunks out of sctp_outq_flush

To the new sctp_outq_flush_data. Again, smaller functions and with well
defined objectives.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/outqueue.c | 149 ++++++++++++++++++++++++++--------------------------
 1 file changed, 75 insertions(+), 74 deletions(-)

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 49e80bf2ade7..bfa2e43dfd31 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -1038,46 +1038,17 @@ static bool sctp_outq_flush_rtx(struct sctp_outq *q,
 
 	return true;
 }
-/*
- * Try to flush an outqueue.
- *
- * Description: Send everything in q which we legally can, subject to
- * congestion limitations.
- * * Note: This function can be called from multiple contexts so appropriate
- * locking concerns must be made.  Today we use the sock lock to protect
- * this function.
- */
-static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
+
+static void sctp_outq_flush_data(struct sctp_outq *q,
+				 struct sctp_transport **_transport,
+				 struct list_head *transport_list,
+				 int rtx_timeout, gfp_t gfp)
 {
-	struct sctp_packet *packet;
+	struct sctp_transport *transport = *_transport;
+	struct sctp_packet *packet = transport ? &transport->packet : NULL;
 	struct sctp_association *asoc = q->asoc;
-	struct sctp_transport *transport = NULL;
 	struct sctp_chunk *chunk;
 	enum sctp_xmit status;
-	int error = 0;
-
-	/* These transports have chunks to send. */
-	struct list_head transport_list;
-	struct list_head *ltransport;
-
-	INIT_LIST_HEAD(&transport_list);
-	packet = NULL;
-
-	/*
-	 * 6.10 Bundling
-	 *   ...
-	 *   When bundling control chunks with DATA chunks, an
-	 *   endpoint MUST place control chunks first in the outbound
-	 *   SCTP packet.  The transmitter MUST transmit DATA chunks
-	 *   within a SCTP packet in increasing order of TSN.
-	 *   ...
-	 */
-
-	sctp_outq_flush_ctrl(q, &transport, &transport_list, gfp);
-	packet = &transport->packet;
-
-	if (q->asoc->src_out_of_asoc_ok)
-		goto sctp_flush_out;
 
 	/* Is it OK to send data chunks?  */
 	switch (asoc->state) {
@@ -1102,10 +1073,11 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 		 * current cwnd).
 		 */
 		if (!list_empty(&q->retransmit)) {
-			if (!sctp_outq_flush_rtx(q, &transport, &transport_list,
+			if (!sctp_outq_flush_rtx(q, _transport, transport_list,
 						 rtx_timeout))
 				break;
 			/* We may have switched current transport */
+			transport = *_transport;
 			packet = &transport->packet;
 		}
 
@@ -1131,12 +1103,14 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 
 			if (asoc->stream.out[sid].state == SCTP_STREAM_CLOSED) {
 				sctp_outq_head_data(q, chunk);
-				goto sctp_flush_out;
+				break;
 			}
 
-			if (sctp_outq_select_transport(chunk, asoc, &transport,
-						       &transport_list))
+			if (sctp_outq_select_transport(chunk, asoc, _transport,
+						       transport_list)) {
+				transport = *_transport;
 				packet = &transport->packet;
+			}
 
 			pr_debug("%s: outq:%p, chunk:%p[%s], tx-tsn:0x%x skb->head:%p "
 				 "skb->users:%d\n",
@@ -1148,8 +1122,10 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 
 			/* Add the chunk to the packet.  */
 			status = sctp_packet_transmit_chunk(packet, chunk, 0, gfp);
-
 			switch (status) {
+			case SCTP_XMIT_OK:
+				break;
+
 			case SCTP_XMIT_PMTU_FULL:
 			case SCTP_XMIT_RWND_FULL:
 			case SCTP_XMIT_DELAY:
@@ -1161,41 +1137,25 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 					 status);
 
 				sctp_outq_head_data(q, chunk);
-				goto sctp_flush_out;
-
-			case SCTP_XMIT_OK:
-				/* The sender is in the SHUTDOWN-PENDING state,
-				 * The sender MAY set the I-bit in the DATA
-				 * chunk header.
-				 */
-				if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING)
-					chunk->chunk_hdr->flags |= SCTP_DATA_SACK_IMM;
-				if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
-					asoc->stats.ouodchunks++;
-				else
-					asoc->stats.oodchunks++;
-
-				/* Only now it's safe to consider this
-				 * chunk as sent, sched-wise.
-				 */
-				sctp_sched_dequeue_done(q, chunk);
-
-				break;
-
-			default:
-				BUG();
+				return;
 			}
 
-			/* BUG: We assume that the sctp_packet_transmit()
-			 * call below will succeed all the time and add the
-			 * chunk to the transmitted list and restart the
-			 * timers.
-			 * It is possible that the call can fail under OOM
-			 * conditions.
-			 *
-			 * Is this really a problem?  Won't this behave
-			 * like a lost TSN?
+			/* The sender is in the SHUTDOWN-PENDING state,
+			 * The sender MAY set the I-bit in the DATA
+			 * chunk header.
 			 */
+			if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING)
+				chunk->chunk_hdr->flags |= SCTP_DATA_SACK_IMM;
+			if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
+				asoc->stats.ouodchunks++;
+			else
+				asoc->stats.oodchunks++;
+
+			/* Only now it's safe to consider this
+			 * chunk as sent, sched-wise.
+			 */
+			sctp_sched_dequeue_done(q, chunk);
+
 			list_add_tail(&chunk->transmitted_list,
 				      &transport->transmitted);
 
@@ -1206,7 +1166,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 			 * COOKIE-ECHO chunk.
 			 */
 			if (packet->has_cookie_echo)
-				goto sctp_flush_out;
+				break;
 		}
 		break;
 
@@ -1214,6 +1174,47 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 		/* Do nothing.  */
 		break;
 	}
+}
+
+/*
+ * Try to flush an outqueue.
+ *
+ * Description: Send everything in q which we legally can, subject to
+ * congestion limitations.
+ * * Note: This function can be called from multiple contexts so appropriate
+ * locking concerns must be made.  Today we use the sock lock to protect
+ * this function.
+ */
+static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
+{
+	struct sctp_packet *packet;
+	struct sctp_association *asoc = q->asoc;
+	struct sctp_transport *transport = NULL;
+	int error = 0;
+
+	/* These transports have chunks to send. */
+	struct list_head transport_list;
+	struct list_head *ltransport;
+
+	INIT_LIST_HEAD(&transport_list);
+	packet = NULL;
+
+	/*
+	 * 6.10 Bundling
+	 *   ...
+	 *   When bundling control chunks with DATA chunks, an
+	 *   endpoint MUST place control chunks first in the outbound
+	 *   SCTP packet.  The transmitter MUST transmit DATA chunks
+	 *   within a SCTP packet in increasing order of TSN.
+	 *   ...
+	 */
+
+	sctp_outq_flush_ctrl(q, &transport, &transport_list, gfp);
+
+	if (q->asoc->src_out_of_asoc_ok)
+		goto sctp_flush_out;
+
+	sctp_outq_flush_data(q, &transport, &transport_list, rtx_timeout, gfp);
 
 sctp_flush_out:
 
-- 
cgit v1.2.3-59-g8ed1b


From 4bf21b61f2bb3611ddb4fb75170c83905dd5c29e Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Mon, 14 May 2018 14:34:41 -0300
Subject: sctp: move transport flush code out of sctp_outq_flush

To the new sctp_outq_flush_transports.

Comment on Nagle is outdated and removed. Nagle is performed earlier, while
checking if the chunk fits the packet: if the outq length is not enough to
fill the packet, it returns SCTP_XMIT_DELAY.

So by when it gets to sctp_outq_flush_transports, it has to go through all
enlisted transports.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/outqueue.c | 56 +++++++++++++++++++++++++----------------------------
 1 file changed, 26 insertions(+), 30 deletions(-)

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index bfa2e43dfd31..44465e64857b 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -1176,6 +1176,29 @@ static void sctp_outq_flush_data(struct sctp_outq *q,
 	}
 }
 
+static void sctp_outq_flush_transports(struct sctp_outq *q,
+				       struct list_head *transport_list,
+				       gfp_t gfp)
+{
+	struct list_head *ltransport;
+	struct sctp_packet *packet;
+	struct sctp_transport *t;
+	int error = 0;
+
+	while ((ltransport = sctp_list_dequeue(transport_list)) != NULL) {
+		t = list_entry(ltransport, struct sctp_transport, send_ready);
+		packet = &t->packet;
+		if (!sctp_packet_empty(packet)) {
+			error = sctp_packet_transmit(packet, gfp);
+			if (error < 0)
+				q->asoc->base.sk->sk_err = -error;
+		}
+
+		/* Clear the burst limited state, if any */
+		sctp_transport_burst_reset(t);
+	}
+}
+
 /*
  * Try to flush an outqueue.
  *
@@ -1187,17 +1210,10 @@ static void sctp_outq_flush_data(struct sctp_outq *q,
  */
 static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 {
-	struct sctp_packet *packet;
-	struct sctp_association *asoc = q->asoc;
+	/* Current transport being used. It's NOT the same as curr active one */
 	struct sctp_transport *transport = NULL;
-	int error = 0;
-
 	/* These transports have chunks to send. */
-	struct list_head transport_list;
-	struct list_head *ltransport;
-
-	INIT_LIST_HEAD(&transport_list);
-	packet = NULL;
+	LIST_HEAD(transport_list);
 
 	/*
 	 * 6.10 Bundling
@@ -1218,27 +1234,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 
 sctp_flush_out:
 
-	/* Before returning, examine all the transports touched in
-	 * this call.  Right now, we bluntly force clear all the
-	 * transports.  Things might change after we implement Nagle.
-	 * But such an examination is still required.
-	 *
-	 * --xguo
-	 */
-	while ((ltransport = sctp_list_dequeue(&transport_list)) != NULL) {
-		struct sctp_transport *t = list_entry(ltransport,
-						      struct sctp_transport,
-						      send_ready);
-		packet = &t->packet;
-		if (!sctp_packet_empty(packet)) {
-			error = sctp_packet_transmit(packet, gfp);
-			if (error < 0)
-				asoc->base.sk->sk_err = -error;
-		}
-
-		/* Clear the burst limited state, if any */
-		sctp_transport_burst_reset(t);
-	}
+	sctp_outq_flush_transports(q, &transport_list, gfp);
 }
 
 /* Update unack_data based on the incoming SACK chunk */
-- 
cgit v1.2.3-59-g8ed1b


From 6605f694823965fe53ce761dad3aaf965861dc7e Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Mon, 14 May 2018 14:34:42 -0300
Subject: sctp: make use of gfp on retransmissions

Retransmissions may be triggered when in user context, so lets make use
of gfp.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/outqueue.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 44465e64857b..e1632b8e2900 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -608,7 +608,7 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
  * The return value is a normal kernel error return value.
  */
 static int __sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt,
-				 int rtx_timeout, int *start_timer)
+				 int rtx_timeout, int *start_timer, gfp_t gfp)
 {
 	struct sctp_transport *transport = pkt->transport;
 	struct sctp_chunk *chunk, *chunk1;
@@ -684,12 +684,12 @@ redo:
 				 * control chunks are already freed so there
 				 * is nothing we can do.
 				 */
-				sctp_packet_transmit(pkt, GFP_ATOMIC);
+				sctp_packet_transmit(pkt, gfp);
 				goto redo;
 			}
 
 			/* Send this packet.  */
-			error = sctp_packet_transmit(pkt, GFP_ATOMIC);
+			error = sctp_packet_transmit(pkt, gfp);
 
 			/* If we are retransmitting, we should only
 			 * send a single packet.
@@ -705,7 +705,7 @@ redo:
 
 		case SCTP_XMIT_RWND_FULL:
 			/* Send this packet. */
-			error = sctp_packet_transmit(pkt, GFP_ATOMIC);
+			error = sctp_packet_transmit(pkt, gfp);
 
 			/* Stop sending DATA as there is no more room
 			 * at the receiver.
@@ -715,7 +715,7 @@ redo:
 
 		case SCTP_XMIT_DELAY:
 			/* Send this packet. */
-			error = sctp_packet_transmit(pkt, GFP_ATOMIC);
+			error = sctp_packet_transmit(pkt, gfp);
 
 			/* Stop sending DATA because of nagle delay. */
 			done = 1;
@@ -991,7 +991,7 @@ static void sctp_outq_flush_ctrl(struct sctp_outq *q,
 static bool sctp_outq_flush_rtx(struct sctp_outq *q,
 				struct sctp_transport **_transport,
 				struct list_head *transport_list,
-				int rtx_timeout)
+				int rtx_timeout, gfp_t gfp)
 {
 	struct sctp_transport *transport = *_transport;
 	struct sctp_packet *packet = transport ? &transport->packet : NULL;
@@ -1015,7 +1015,8 @@ static bool sctp_outq_flush_rtx(struct sctp_outq *q,
 				   asoc->peer.ecn_capable);
 	}
 
-	error = __sctp_outq_flush_rtx(q, packet, rtx_timeout, &start_timer);
+	error = __sctp_outq_flush_rtx(q, packet, rtx_timeout, &start_timer,
+				      gfp);
 	if (error < 0)
 		asoc->base.sk->sk_err = -error;
 
@@ -1074,7 +1075,7 @@ static void sctp_outq_flush_data(struct sctp_outq *q,
 		 */
 		if (!list_empty(&q->retransmit)) {
 			if (!sctp_outq_flush_rtx(q, _transport, transport_list,
-						 rtx_timeout))
+						 rtx_timeout, gfp))
 				break;
 			/* We may have switched current transport */
 			transport = *_transport;
-- 
cgit v1.2.3-59-g8ed1b


From 4fdbb0efb9de51295071fdf968d476c1f0605d21 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Mon, 14 May 2018 14:34:43 -0300
Subject: sctp: rework switch cases in sctp_outq_flush_data

Remove an inner one, which tended to be error prone due to the cascading
and it can be replaced by a simple if ().

Rework the outer one so that the actual flush code is not inside it. Now
we first validate if we can or cannot send data, return if not, and then
the flush code.

Suggested-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/outqueue.c | 191 +++++++++++++++++++++++++---------------------------
 1 file changed, 93 insertions(+), 98 deletions(-)

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index e1632b8e2900..e9c22b3db11c 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -1058,122 +1058,117 @@ static void sctp_outq_flush_data(struct sctp_outq *q,
 		 * chunk.
 		 */
 		if (!packet || !packet->has_cookie_echo)
-			break;
+			return;
 
 		/* fallthru */
 	case SCTP_STATE_ESTABLISHED:
 	case SCTP_STATE_SHUTDOWN_PENDING:
 	case SCTP_STATE_SHUTDOWN_RECEIVED:
-		/*
-		 * RFC 2960 6.1  Transmission of DATA Chunks
-		 *
-		 * C) When the time comes for the sender to transmit,
-		 * before sending new DATA chunks, the sender MUST
-		 * first transmit any outstanding DATA chunks which
-		 * are marked for retransmission (limited by the
-		 * current cwnd).
-		 */
-		if (!list_empty(&q->retransmit)) {
-			if (!sctp_outq_flush_rtx(q, _transport, transport_list,
-						 rtx_timeout, gfp))
-				break;
-			/* We may have switched current transport */
-			transport = *_transport;
-			packet = &transport->packet;
-		}
+		break;
 
-		/* Apply Max.Burst limitation to the current transport in
-		 * case it will be used for new data.  We are going to
-		 * rest it before we return, but we want to apply the limit
-		 * to the currently queued data.
-		 */
-		if (transport)
-			sctp_transport_burst_limited(transport);
-
-		/* Finally, transmit new packets.  */
-		while ((chunk = sctp_outq_dequeue_data(q)) != NULL) {
-			__u32 sid = ntohs(chunk->subh.data_hdr->stream);
-
-			/* Has this chunk expired? */
-			if (sctp_chunk_abandoned(chunk)) {
-				sctp_sched_dequeue_done(q, chunk);
-				sctp_chunk_fail(chunk, 0);
-				sctp_chunk_free(chunk);
-				continue;
-			}
+	default:
+		/* Do nothing. */
+		return;
+	}
 
-			if (asoc->stream.out[sid].state == SCTP_STREAM_CLOSED) {
-				sctp_outq_head_data(q, chunk);
-				break;
-			}
+	/*
+	 * RFC 2960 6.1  Transmission of DATA Chunks
+	 *
+	 * C) When the time comes for the sender to transmit,
+	 * before sending new DATA chunks, the sender MUST
+	 * first transmit any outstanding DATA chunks which
+	 * are marked for retransmission (limited by the
+	 * current cwnd).
+	 */
+	if (!list_empty(&q->retransmit)) {
+		if (!sctp_outq_flush_rtx(q, _transport, transport_list,
+					 rtx_timeout, gfp))
+			return;
+		/* We may have switched current transport */
+		transport = *_transport;
+		packet = &transport->packet;
+	}
 
-			if (sctp_outq_select_transport(chunk, asoc, _transport,
-						       transport_list)) {
-				transport = *_transport;
-				packet = &transport->packet;
-			}
+	/* Apply Max.Burst limitation to the current transport in
+	 * case it will be used for new data.  We are going to
+	 * rest it before we return, but we want to apply the limit
+	 * to the currently queued data.
+	 */
+	if (transport)
+		sctp_transport_burst_limited(transport);
 
-			pr_debug("%s: outq:%p, chunk:%p[%s], tx-tsn:0x%x skb->head:%p "
-				 "skb->users:%d\n",
-				 __func__, q, chunk, chunk && chunk->chunk_hdr ?
-				 sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) :
-				 "illegal chunk", ntohl(chunk->subh.data_hdr->tsn),
-				 chunk->skb ? chunk->skb->head : NULL, chunk->skb ?
-				 refcount_read(&chunk->skb->users) : -1);
-
-			/* Add the chunk to the packet.  */
-			status = sctp_packet_transmit_chunk(packet, chunk, 0, gfp);
-			switch (status) {
-			case SCTP_XMIT_OK:
-				break;
+	/* Finally, transmit new packets.  */
+	while ((chunk = sctp_outq_dequeue_data(q)) != NULL) {
+		__u32 sid = ntohs(chunk->subh.data_hdr->stream);
 
-			case SCTP_XMIT_PMTU_FULL:
-			case SCTP_XMIT_RWND_FULL:
-			case SCTP_XMIT_DELAY:
-				/* We could not append this chunk, so put
-				 * the chunk back on the output queue.
-				 */
-				pr_debug("%s: could not transmit tsn:0x%x, status:%d\n",
-					 __func__, ntohl(chunk->subh.data_hdr->tsn),
-					 status);
+		/* Has this chunk expired? */
+		if (sctp_chunk_abandoned(chunk)) {
+			sctp_sched_dequeue_done(q, chunk);
+			sctp_chunk_fail(chunk, 0);
+			sctp_chunk_free(chunk);
+			continue;
+		}
 
-				sctp_outq_head_data(q, chunk);
-				return;
-			}
+		if (asoc->stream.out[sid].state == SCTP_STREAM_CLOSED) {
+			sctp_outq_head_data(q, chunk);
+			break;
+		}
 
-			/* The sender is in the SHUTDOWN-PENDING state,
-			 * The sender MAY set the I-bit in the DATA
-			 * chunk header.
-			 */
-			if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING)
-				chunk->chunk_hdr->flags |= SCTP_DATA_SACK_IMM;
-			if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
-				asoc->stats.ouodchunks++;
-			else
-				asoc->stats.oodchunks++;
+		if (sctp_outq_select_transport(chunk, asoc, _transport,
+					       transport_list)) {
+			transport = *_transport;
+			packet = &transport->packet;
+		}
 
-			/* Only now it's safe to consider this
-			 * chunk as sent, sched-wise.
+		pr_debug("%s: outq:%p, chunk:%p[%s], tx-tsn:0x%x skb->head:%p "
+			 "skb->users:%d\n",
+			 __func__, q, chunk, chunk && chunk->chunk_hdr ?
+			 sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) :
+			 "illegal chunk", ntohl(chunk->subh.data_hdr->tsn),
+			 chunk->skb ? chunk->skb->head : NULL, chunk->skb ?
+			 refcount_read(&chunk->skb->users) : -1);
+
+		/* Add the chunk to the packet.  */
+		status = sctp_packet_transmit_chunk(packet, chunk, 0, gfp);
+		if (status != SCTP_XMIT_OK) {
+			/* We could not append this chunk, so put
+			 * the chunk back on the output queue.
 			 */
-			sctp_sched_dequeue_done(q, chunk);
+			pr_debug("%s: could not transmit tsn:0x%x, status:%d\n",
+				 __func__, ntohl(chunk->subh.data_hdr->tsn),
+				 status);
 
-			list_add_tail(&chunk->transmitted_list,
-				      &transport->transmitted);
+			sctp_outq_head_data(q, chunk);
+			break;
+		}
 
-			sctp_transport_reset_t3_rtx(transport);
-			transport->last_time_sent = jiffies;
+		/* The sender is in the SHUTDOWN-PENDING state,
+		 * The sender MAY set the I-bit in the DATA
+		 * chunk header.
+		 */
+		if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING)
+			chunk->chunk_hdr->flags |= SCTP_DATA_SACK_IMM;
+		if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
+			asoc->stats.ouodchunks++;
+		else
+			asoc->stats.oodchunks++;
 
-			/* Only let one DATA chunk get bundled with a
-			 * COOKIE-ECHO chunk.
-			 */
-			if (packet->has_cookie_echo)
-				break;
-		}
-		break;
+		/* Only now it's safe to consider this
+		 * chunk as sent, sched-wise.
+		 */
+		sctp_sched_dequeue_done(q, chunk);
 
-	default:
-		/* Do nothing.  */
-		break;
+		list_add_tail(&chunk->transmitted_list,
+			      &transport->transmitted);
+
+		sctp_transport_reset_t3_rtx(transport);
+		transport->last_time_sent = jiffies;
+
+		/* Only let one DATA chunk get bundled with a
+		 * COOKIE-ECHO chunk.
+		 */
+		if (packet->has_cookie_echo)
+			break;
 	}
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From bb543847a9c1b3904180b22add5e522f1f6c11c7 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Mon, 14 May 2018 14:35:18 -0300
Subject: sctp: add sctp_flush_ctx, a context struct on outq_flush routines

With this struct we avoid passing lots of variables around and taking care
of updating the current transport/packet.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/outqueue.c | 182 +++++++++++++++++++++++++---------------------------
 1 file changed, 88 insertions(+), 94 deletions(-)

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index e9c22b3db11c..db94a2513dd8 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -791,13 +791,22 @@ static int sctp_packet_singleton(struct sctp_transport *transport,
 	return sctp_packet_transmit(&singleton, gfp);
 }
 
-static bool sctp_outq_select_transport(struct sctp_chunk *chunk,
-				       struct sctp_association *asoc,
-				       struct sctp_transport **transport,
-				       struct list_head *transport_list)
+/* Struct to hold the context during sctp outq flush */
+struct sctp_flush_ctx {
+	struct sctp_outq *q;
+	/* Current transport being used. It's NOT the same as curr active one */
+	struct sctp_transport *transport;
+	/* These transports have chunks to send. */
+	struct list_head transport_list;
+	gfp_t gfp;
+};
+
+/* transport: current transport */
+static bool sctp_outq_select_transport(struct sctp_flush_ctx *ctx,
+				       struct sctp_chunk *chunk)
 {
 	struct sctp_transport *new_transport = chunk->transport;
-	struct sctp_transport *curr = *transport;
+	struct sctp_association *asoc = ctx->q->asoc;
 	bool changed = false;
 
 	if (!new_transport) {
@@ -812,9 +821,9 @@ static bool sctp_outq_select_transport(struct sctp_chunk *chunk,
 			 * after processing ASCONFs, we may have new
 			 * transports created.
 			 */
-			if (curr && sctp_cmp_addr_exact(&chunk->dest,
-							&curr->ipaddr))
-				new_transport = curr;
+			if (ctx->transport && sctp_cmp_addr_exact(&chunk->dest,
+							&ctx->transport->ipaddr))
+				new_transport = ctx->transport;
 			else
 				new_transport = sctp_assoc_lookup_paddr(asoc,
 								  &chunk->dest);
@@ -857,37 +866,33 @@ static bool sctp_outq_select_transport(struct sctp_chunk *chunk,
 	}
 
 	/* Are we switching transports? Take care of transport locks. */
-	if (new_transport != curr) {
+	if (new_transport != ctx->transport) {
 		changed = true;
-		curr = new_transport;
-		*transport = curr;
-		if (list_empty(&curr->send_ready))
-			list_add_tail(&curr->send_ready, transport_list);
+		ctx->transport = new_transport;
+		if (list_empty(&ctx->transport->send_ready))
+			list_add_tail(&ctx->transport->send_ready,
+				      &ctx->transport_list);
 
-		sctp_packet_config(&curr->packet, asoc->peer.i.init_tag,
+		sctp_packet_config(&ctx->transport->packet, asoc->peer.i.init_tag,
 				   asoc->peer.ecn_capable);
 		/* We've switched transports, so apply the
 		 * Burst limit to the new transport.
 		 */
-		sctp_transport_burst_limited(curr);
+		sctp_transport_burst_limited(ctx->transport);
 	}
 
 	return changed;
 }
 
-static void sctp_outq_flush_ctrl(struct sctp_outq *q,
-				 struct sctp_transport **_transport,
-				 struct list_head *transport_list,
-				 gfp_t gfp)
+static void sctp_outq_flush_ctrl(struct sctp_flush_ctx *ctx)
 {
-	struct sctp_transport *transport = *_transport;
-	struct sctp_association *asoc = q->asoc;
+	struct sctp_association *asoc = ctx->q->asoc;
 	struct sctp_packet *packet = NULL;
 	struct sctp_chunk *chunk, *tmp;
 	enum sctp_xmit status;
 	int one_packet, error;
 
-	list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) {
+	list_for_each_entry_safe(chunk, tmp, &ctx->q->control_chunk_list, list) {
 		one_packet = 0;
 
 		/* RFC 5061, 5.3
@@ -905,11 +910,8 @@ static void sctp_outq_flush_ctrl(struct sctp_outq *q,
 		/* Pick the right transport to use. Should always be true for
 		 * the first chunk as we don't have a transport by then.
 		 */
-		if (sctp_outq_select_transport(chunk, asoc, _transport,
-					       transport_list)) {
-			transport = *_transport;
-			packet = &transport->packet;
-		}
+		if (sctp_outq_select_transport(ctx, chunk))
+			packet = &ctx->transport->packet;
 
 		switch (chunk->chunk_hdr->type) {
 		/*
@@ -921,7 +923,8 @@ static void sctp_outq_flush_ctrl(struct sctp_outq *q,
 		case SCTP_CID_INIT:
 		case SCTP_CID_INIT_ACK:
 		case SCTP_CID_SHUTDOWN_COMPLETE:
-			error = sctp_packet_singleton(transport, chunk, gfp);
+			error = sctp_packet_singleton(ctx->transport, chunk,
+						      ctx->gfp);
 			if (error < 0) {
 				asoc->base.sk->sk_err = -error;
 				return;
@@ -957,10 +960,10 @@ static void sctp_outq_flush_ctrl(struct sctp_outq *q,
 		case SCTP_CID_I_FWD_TSN:
 		case SCTP_CID_RECONF:
 			status = sctp_packet_transmit_chunk(packet, chunk,
-							    one_packet, gfp);
+							    one_packet, ctx->gfp);
 			if (status != SCTP_XMIT_OK) {
 				/* put the chunk back */
-				list_add(&chunk->list, &q->control_chunk_list);
+				list_add(&chunk->list, &ctx->q->control_chunk_list);
 				break;
 			}
 
@@ -971,12 +974,12 @@ static void sctp_outq_flush_ctrl(struct sctp_outq *q,
 			 */
 			if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN ||
 			    chunk->chunk_hdr->type == SCTP_CID_I_FWD_TSN) {
-				sctp_transport_reset_t3_rtx(transport);
-				transport->last_time_sent = jiffies;
+				sctp_transport_reset_t3_rtx(ctx->transport);
+				ctx->transport->last_time_sent = jiffies;
 			}
 
 			if (chunk == asoc->strreset_chunk)
-				sctp_transport_reset_reconf_timer(transport);
+				sctp_transport_reset_reconf_timer(ctx->transport);
 
 			break;
 
@@ -988,41 +991,38 @@ static void sctp_outq_flush_ctrl(struct sctp_outq *q,
 }
 
 /* Returns false if new data shouldn't be sent */
-static bool sctp_outq_flush_rtx(struct sctp_outq *q,
-				struct sctp_transport **_transport,
-				struct list_head *transport_list,
-				int rtx_timeout, gfp_t gfp)
+static bool sctp_outq_flush_rtx(struct sctp_flush_ctx *ctx,
+				int rtx_timeout)
 {
-	struct sctp_transport *transport = *_transport;
-	struct sctp_packet *packet = transport ? &transport->packet : NULL;
-	struct sctp_association *asoc = q->asoc;
+	struct sctp_packet *packet = ctx->transport ? &ctx->transport->packet :
+				     NULL;
+	struct sctp_association *asoc = ctx->q->asoc;
 	int error, start_timer = 0;
 
 	if (asoc->peer.retran_path->state == SCTP_UNCONFIRMED)
 		return false;
 
-	if (transport != asoc->peer.retran_path) {
+	if (ctx->transport != asoc->peer.retran_path) {
 		/* Switch transports & prepare the packet.  */
-		transport = asoc->peer.retran_path;
-		*_transport = transport;
+		ctx->transport = asoc->peer.retran_path;
 
-		if (list_empty(&transport->send_ready))
-			list_add_tail(&transport->send_ready,
-				      transport_list);
+		if (list_empty(&ctx->transport->send_ready))
+			list_add_tail(&ctx->transport->send_ready,
+				      &ctx->transport_list);
 
-		packet = &transport->packet;
+		packet = &ctx->transport->packet;
 		sctp_packet_config(packet, asoc->peer.i.init_tag,
 				   asoc->peer.ecn_capable);
 	}
 
-	error = __sctp_outq_flush_rtx(q, packet, rtx_timeout, &start_timer,
-				      gfp);
+	error = __sctp_outq_flush_rtx(ctx->q, packet, rtx_timeout, &start_timer,
+				      ctx->gfp);
 	if (error < 0)
 		asoc->base.sk->sk_err = -error;
 
 	if (start_timer) {
-		sctp_transport_reset_t3_rtx(transport);
-		transport->last_time_sent = jiffies;
+		sctp_transport_reset_t3_rtx(ctx->transport);
+		ctx->transport->last_time_sent = jiffies;
 	}
 
 	/* This can happen on COOKIE-ECHO resend.  Only
@@ -1034,20 +1034,18 @@ static bool sctp_outq_flush_rtx(struct sctp_outq *q,
 	/* Don't send new data if there is still data
 	 * waiting to retransmit.
 	 */
-	if (!list_empty(&q->retransmit))
+	if (!list_empty(&ctx->q->retransmit))
 		return false;
 
 	return true;
 }
 
-static void sctp_outq_flush_data(struct sctp_outq *q,
-				 struct sctp_transport **_transport,
-				 struct list_head *transport_list,
-				 int rtx_timeout, gfp_t gfp)
+static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx,
+				 int rtx_timeout)
 {
-	struct sctp_transport *transport = *_transport;
-	struct sctp_packet *packet = transport ? &transport->packet : NULL;
-	struct sctp_association *asoc = q->asoc;
+	struct sctp_packet *packet = ctx->transport ? &ctx->transport->packet :
+				     NULL;
+	struct sctp_association *asoc = ctx->q->asoc;
 	struct sctp_chunk *chunk;
 	enum sctp_xmit status;
 
@@ -1080,13 +1078,11 @@ static void sctp_outq_flush_data(struct sctp_outq *q,
 	 * are marked for retransmission (limited by the
 	 * current cwnd).
 	 */
-	if (!list_empty(&q->retransmit)) {
-		if (!sctp_outq_flush_rtx(q, _transport, transport_list,
-					 rtx_timeout, gfp))
+	if (!list_empty(&ctx->q->retransmit)) {
+		if (!sctp_outq_flush_rtx(ctx, rtx_timeout))
 			return;
 		/* We may have switched current transport */
-		transport = *_transport;
-		packet = &transport->packet;
+		packet = &ctx->transport->packet;
 	}
 
 	/* Apply Max.Burst limitation to the current transport in
@@ -1094,42 +1090,39 @@ static void sctp_outq_flush_data(struct sctp_outq *q,
 	 * rest it before we return, but we want to apply the limit
 	 * to the currently queued data.
 	 */
-	if (transport)
-		sctp_transport_burst_limited(transport);
+	if (ctx->transport)
+		sctp_transport_burst_limited(ctx->transport);
 
 	/* Finally, transmit new packets.  */
-	while ((chunk = sctp_outq_dequeue_data(q)) != NULL) {
+	while ((chunk = sctp_outq_dequeue_data(ctx->q)) != NULL) {
 		__u32 sid = ntohs(chunk->subh.data_hdr->stream);
 
 		/* Has this chunk expired? */
 		if (sctp_chunk_abandoned(chunk)) {
-			sctp_sched_dequeue_done(q, chunk);
+			sctp_sched_dequeue_done(ctx->q, chunk);
 			sctp_chunk_fail(chunk, 0);
 			sctp_chunk_free(chunk);
 			continue;
 		}
 
 		if (asoc->stream.out[sid].state == SCTP_STREAM_CLOSED) {
-			sctp_outq_head_data(q, chunk);
+			sctp_outq_head_data(ctx->q, chunk);
 			break;
 		}
 
-		if (sctp_outq_select_transport(chunk, asoc, _transport,
-					       transport_list)) {
-			transport = *_transport;
-			packet = &transport->packet;
-		}
+		if (sctp_outq_select_transport(ctx, chunk))
+			packet = &ctx->transport->packet;
 
 		pr_debug("%s: outq:%p, chunk:%p[%s], tx-tsn:0x%x skb->head:%p "
 			 "skb->users:%d\n",
-			 __func__, q, chunk, chunk && chunk->chunk_hdr ?
+			 __func__, ctx->q, chunk, chunk && chunk->chunk_hdr ?
 			 sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) :
 			 "illegal chunk", ntohl(chunk->subh.data_hdr->tsn),
 			 chunk->skb ? chunk->skb->head : NULL, chunk->skb ?
 			 refcount_read(&chunk->skb->users) : -1);
 
 		/* Add the chunk to the packet.  */
-		status = sctp_packet_transmit_chunk(packet, chunk, 0, gfp);
+		status = sctp_packet_transmit_chunk(packet, chunk, 0, ctx->gfp);
 		if (status != SCTP_XMIT_OK) {
 			/* We could not append this chunk, so put
 			 * the chunk back on the output queue.
@@ -1138,7 +1131,7 @@ static void sctp_outq_flush_data(struct sctp_outq *q,
 				 __func__, ntohl(chunk->subh.data_hdr->tsn),
 				 status);
 
-			sctp_outq_head_data(q, chunk);
+			sctp_outq_head_data(ctx->q, chunk);
 			break;
 		}
 
@@ -1156,13 +1149,13 @@ static void sctp_outq_flush_data(struct sctp_outq *q,
 		/* Only now it's safe to consider this
 		 * chunk as sent, sched-wise.
 		 */
-		sctp_sched_dequeue_done(q, chunk);
+		sctp_sched_dequeue_done(ctx->q, chunk);
 
 		list_add_tail(&chunk->transmitted_list,
-			      &transport->transmitted);
+			      &ctx->transport->transmitted);
 
-		sctp_transport_reset_t3_rtx(transport);
-		transport->last_time_sent = jiffies;
+		sctp_transport_reset_t3_rtx(ctx->transport);
+		ctx->transport->last_time_sent = jiffies;
 
 		/* Only let one DATA chunk get bundled with a
 		 * COOKIE-ECHO chunk.
@@ -1172,22 +1165,20 @@ static void sctp_outq_flush_data(struct sctp_outq *q,
 	}
 }
 
-static void sctp_outq_flush_transports(struct sctp_outq *q,
-				       struct list_head *transport_list,
-				       gfp_t gfp)
+static void sctp_outq_flush_transports(struct sctp_flush_ctx *ctx)
 {
 	struct list_head *ltransport;
 	struct sctp_packet *packet;
 	struct sctp_transport *t;
 	int error = 0;
 
-	while ((ltransport = sctp_list_dequeue(transport_list)) != NULL) {
+	while ((ltransport = sctp_list_dequeue(&ctx->transport_list)) != NULL) {
 		t = list_entry(ltransport, struct sctp_transport, send_ready);
 		packet = &t->packet;
 		if (!sctp_packet_empty(packet)) {
-			error = sctp_packet_transmit(packet, gfp);
+			error = sctp_packet_transmit(packet, ctx->gfp);
 			if (error < 0)
-				q->asoc->base.sk->sk_err = -error;
+				ctx->q->asoc->base.sk->sk_err = -error;
 		}
 
 		/* Clear the burst limited state, if any */
@@ -1204,12 +1195,15 @@ static void sctp_outq_flush_transports(struct sctp_outq *q,
  * locking concerns must be made.  Today we use the sock lock to protect
  * this function.
  */
+
 static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 {
-	/* Current transport being used. It's NOT the same as curr active one */
-	struct sctp_transport *transport = NULL;
-	/* These transports have chunks to send. */
-	LIST_HEAD(transport_list);
+	struct sctp_flush_ctx ctx = {
+		.q = q,
+		.transport = NULL,
+		.transport_list = LIST_HEAD_INIT(ctx.transport_list),
+		.gfp = gfp,
+	};
 
 	/*
 	 * 6.10 Bundling
@@ -1221,16 +1215,16 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 	 *   ...
 	 */
 
-	sctp_outq_flush_ctrl(q, &transport, &transport_list, gfp);
+	sctp_outq_flush_ctrl(&ctx);
 
 	if (q->asoc->src_out_of_asoc_ok)
 		goto sctp_flush_out;
 
-	sctp_outq_flush_data(q, &transport, &transport_list, rtx_timeout, gfp);
+	sctp_outq_flush_data(&ctx, rtx_timeout);
 
 sctp_flush_out:
 
-	sctp_outq_flush_transports(q, &transport_list, gfp);
+	sctp_outq_flush_transports(&ctx);
 }
 
 /* Update unack_data based on the incoming SACK chunk */
-- 
cgit v1.2.3-59-g8ed1b


From e136e965df596d8e4fffa4ae0b202fd4c388568f Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Mon, 14 May 2018 14:35:19 -0300
Subject: sctp: add asoc and packet to sctp_flush_ctx

Pre-compute these so the compiler won't reload them (due to
no-strict-aliasing).

Changes since v2:
- Do not replace a return with a break in sctp_outq_flush_data

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/outqueue.c | 97 ++++++++++++++++++++++++-----------------------------
 1 file changed, 44 insertions(+), 53 deletions(-)

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index db94a2513dd8..68b7baea3fea 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -798,16 +798,17 @@ struct sctp_flush_ctx {
 	struct sctp_transport *transport;
 	/* These transports have chunks to send. */
 	struct list_head transport_list;
+	struct sctp_association *asoc;
+	/* Packet on the current transport above */
+	struct sctp_packet *packet;
 	gfp_t gfp;
 };
 
 /* transport: current transport */
-static bool sctp_outq_select_transport(struct sctp_flush_ctx *ctx,
+static void sctp_outq_select_transport(struct sctp_flush_ctx *ctx,
 				       struct sctp_chunk *chunk)
 {
 	struct sctp_transport *new_transport = chunk->transport;
-	struct sctp_association *asoc = ctx->q->asoc;
-	bool changed = false;
 
 	if (!new_transport) {
 		if (!sctp_chunk_is_data(chunk)) {
@@ -825,7 +826,7 @@ static bool sctp_outq_select_transport(struct sctp_flush_ctx *ctx,
 							&ctx->transport->ipaddr))
 				new_transport = ctx->transport;
 			else
-				new_transport = sctp_assoc_lookup_paddr(asoc,
+				new_transport = sctp_assoc_lookup_paddr(ctx->asoc,
 								  &chunk->dest);
 		}
 
@@ -833,7 +834,7 @@ static bool sctp_outq_select_transport(struct sctp_flush_ctx *ctx,
 		 * use the current active path.
 		 */
 		if (!new_transport)
-			new_transport = asoc->peer.active_path;
+			new_transport = ctx->asoc->peer.active_path;
 	} else {
 		__u8 type;
 
@@ -858,7 +859,7 @@ static bool sctp_outq_select_transport(struct sctp_flush_ctx *ctx,
 			if (type != SCTP_CID_HEARTBEAT &&
 			    type != SCTP_CID_HEARTBEAT_ACK &&
 			    type != SCTP_CID_ASCONF_ACK)
-				new_transport = asoc->peer.active_path;
+				new_transport = ctx->asoc->peer.active_path;
 			break;
 		default:
 			break;
@@ -867,27 +868,25 @@ static bool sctp_outq_select_transport(struct sctp_flush_ctx *ctx,
 
 	/* Are we switching transports? Take care of transport locks. */
 	if (new_transport != ctx->transport) {
-		changed = true;
 		ctx->transport = new_transport;
+		ctx->packet = &ctx->transport->packet;
+
 		if (list_empty(&ctx->transport->send_ready))
 			list_add_tail(&ctx->transport->send_ready,
 				      &ctx->transport_list);
 
-		sctp_packet_config(&ctx->transport->packet, asoc->peer.i.init_tag,
-				   asoc->peer.ecn_capable);
+		sctp_packet_config(ctx->packet,
+				   ctx->asoc->peer.i.init_tag,
+				   ctx->asoc->peer.ecn_capable);
 		/* We've switched transports, so apply the
 		 * Burst limit to the new transport.
 		 */
 		sctp_transport_burst_limited(ctx->transport);
 	}
-
-	return changed;
 }
 
 static void sctp_outq_flush_ctrl(struct sctp_flush_ctx *ctx)
 {
-	struct sctp_association *asoc = ctx->q->asoc;
-	struct sctp_packet *packet = NULL;
 	struct sctp_chunk *chunk, *tmp;
 	enum sctp_xmit status;
 	int one_packet, error;
@@ -901,7 +900,7 @@ static void sctp_outq_flush_ctrl(struct sctp_flush_ctx *ctx)
 		 * NOT use the new IP address as a source for ANY SCTP
 		 * packet except on carrying an ASCONF Chunk.
 		 */
-		if (asoc->src_out_of_asoc_ok &&
+		if (ctx->asoc->src_out_of_asoc_ok &&
 		    chunk->chunk_hdr->type != SCTP_CID_ASCONF)
 			continue;
 
@@ -910,8 +909,7 @@ static void sctp_outq_flush_ctrl(struct sctp_flush_ctx *ctx)
 		/* Pick the right transport to use. Should always be true for
 		 * the first chunk as we don't have a transport by then.
 		 */
-		if (sctp_outq_select_transport(ctx, chunk))
-			packet = &ctx->transport->packet;
+		sctp_outq_select_transport(ctx, chunk);
 
 		switch (chunk->chunk_hdr->type) {
 		/*
@@ -926,14 +924,14 @@ static void sctp_outq_flush_ctrl(struct sctp_flush_ctx *ctx)
 			error = sctp_packet_singleton(ctx->transport, chunk,
 						      ctx->gfp);
 			if (error < 0) {
-				asoc->base.sk->sk_err = -error;
+				ctx->asoc->base.sk->sk_err = -error;
 				return;
 			}
 			break;
 
 		case SCTP_CID_ABORT:
 			if (sctp_test_T_bit(chunk))
-				packet->vtag = asoc->c.my_vtag;
+				ctx->packet->vtag = ctx->asoc->c.my_vtag;
 			/* fallthru */
 
 		/* The following chunks are "response" chunks, i.e.
@@ -959,7 +957,7 @@ static void sctp_outq_flush_ctrl(struct sctp_flush_ctx *ctx)
 		case SCTP_CID_FWD_TSN:
 		case SCTP_CID_I_FWD_TSN:
 		case SCTP_CID_RECONF:
-			status = sctp_packet_transmit_chunk(packet, chunk,
+			status = sctp_packet_transmit_chunk(ctx->packet, chunk,
 							    one_packet, ctx->gfp);
 			if (status != SCTP_XMIT_OK) {
 				/* put the chunk back */
@@ -967,7 +965,7 @@ static void sctp_outq_flush_ctrl(struct sctp_flush_ctx *ctx)
 				break;
 			}
 
-			asoc->stats.octrlchunks++;
+			ctx->asoc->stats.octrlchunks++;
 			/* PR-SCTP C5) If a FORWARD TSN is sent, the
 			 * sender MUST assure that at least one T3-rtx
 			 * timer is running.
@@ -978,7 +976,7 @@ static void sctp_outq_flush_ctrl(struct sctp_flush_ctx *ctx)
 				ctx->transport->last_time_sent = jiffies;
 			}
 
-			if (chunk == asoc->strreset_chunk)
+			if (chunk == ctx->asoc->strreset_chunk)
 				sctp_transport_reset_reconf_timer(ctx->transport);
 
 			break;
@@ -994,31 +992,28 @@ static void sctp_outq_flush_ctrl(struct sctp_flush_ctx *ctx)
 static bool sctp_outq_flush_rtx(struct sctp_flush_ctx *ctx,
 				int rtx_timeout)
 {
-	struct sctp_packet *packet = ctx->transport ? &ctx->transport->packet :
-				     NULL;
-	struct sctp_association *asoc = ctx->q->asoc;
 	int error, start_timer = 0;
 
-	if (asoc->peer.retran_path->state == SCTP_UNCONFIRMED)
+	if (ctx->asoc->peer.retran_path->state == SCTP_UNCONFIRMED)
 		return false;
 
-	if (ctx->transport != asoc->peer.retran_path) {
+	if (ctx->transport != ctx->asoc->peer.retran_path) {
 		/* Switch transports & prepare the packet.  */
-		ctx->transport = asoc->peer.retran_path;
+		ctx->transport = ctx->asoc->peer.retran_path;
+		ctx->packet = &ctx->transport->packet;
 
 		if (list_empty(&ctx->transport->send_ready))
 			list_add_tail(&ctx->transport->send_ready,
 				      &ctx->transport_list);
 
-		packet = &ctx->transport->packet;
-		sctp_packet_config(packet, asoc->peer.i.init_tag,
-				   asoc->peer.ecn_capable);
+		sctp_packet_config(ctx->packet, ctx->asoc->peer.i.init_tag,
+				   ctx->asoc->peer.ecn_capable);
 	}
 
-	error = __sctp_outq_flush_rtx(ctx->q, packet, rtx_timeout, &start_timer,
-				      ctx->gfp);
+	error = __sctp_outq_flush_rtx(ctx->q, ctx->packet, rtx_timeout,
+				      &start_timer, ctx->gfp);
 	if (error < 0)
-		asoc->base.sk->sk_err = -error;
+		ctx->asoc->base.sk->sk_err = -error;
 
 	if (start_timer) {
 		sctp_transport_reset_t3_rtx(ctx->transport);
@@ -1028,7 +1023,7 @@ static bool sctp_outq_flush_rtx(struct sctp_flush_ctx *ctx,
 	/* This can happen on COOKIE-ECHO resend.  Only
 	 * one chunk can get bundled with a COOKIE-ECHO.
 	 */
-	if (packet->has_cookie_echo)
+	if (ctx->packet->has_cookie_echo)
 		return false;
 
 	/* Don't send new data if there is still data
@@ -1043,19 +1038,16 @@ static bool sctp_outq_flush_rtx(struct sctp_flush_ctx *ctx,
 static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx,
 				 int rtx_timeout)
 {
-	struct sctp_packet *packet = ctx->transport ? &ctx->transport->packet :
-				     NULL;
-	struct sctp_association *asoc = ctx->q->asoc;
 	struct sctp_chunk *chunk;
 	enum sctp_xmit status;
 
 	/* Is it OK to send data chunks?  */
-	switch (asoc->state) {
+	switch (ctx->asoc->state) {
 	case SCTP_STATE_COOKIE_ECHOED:
 		/* Only allow bundling when this packet has a COOKIE-ECHO
 		 * chunk.
 		 */
-		if (!packet || !packet->has_cookie_echo)
+		if (!ctx->packet || !ctx->packet->has_cookie_echo)
 			return;
 
 		/* fallthru */
@@ -1078,12 +1070,9 @@ static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx,
 	 * are marked for retransmission (limited by the
 	 * current cwnd).
 	 */
-	if (!list_empty(&ctx->q->retransmit)) {
-		if (!sctp_outq_flush_rtx(ctx, rtx_timeout))
-			return;
-		/* We may have switched current transport */
-		packet = &ctx->transport->packet;
-	}
+	if (!list_empty(&ctx->q->retransmit) &&
+	    !sctp_outq_flush_rtx(ctx, rtx_timeout))
+		return;
 
 	/* Apply Max.Burst limitation to the current transport in
 	 * case it will be used for new data.  We are going to
@@ -1105,13 +1094,12 @@ static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx,
 			continue;
 		}
 
-		if (asoc->stream.out[sid].state == SCTP_STREAM_CLOSED) {
+		if (ctx->asoc->stream.out[sid].state == SCTP_STREAM_CLOSED) {
 			sctp_outq_head_data(ctx->q, chunk);
 			break;
 		}
 
-		if (sctp_outq_select_transport(ctx, chunk))
-			packet = &ctx->transport->packet;
+		sctp_outq_select_transport(ctx, chunk);
 
 		pr_debug("%s: outq:%p, chunk:%p[%s], tx-tsn:0x%x skb->head:%p "
 			 "skb->users:%d\n",
@@ -1122,7 +1110,8 @@ static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx,
 			 refcount_read(&chunk->skb->users) : -1);
 
 		/* Add the chunk to the packet.  */
-		status = sctp_packet_transmit_chunk(packet, chunk, 0, ctx->gfp);
+		status = sctp_packet_transmit_chunk(ctx->packet, chunk, 0,
+						    ctx->gfp);
 		if (status != SCTP_XMIT_OK) {
 			/* We could not append this chunk, so put
 			 * the chunk back on the output queue.
@@ -1139,12 +1128,12 @@ static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx,
 		 * The sender MAY set the I-bit in the DATA
 		 * chunk header.
 		 */
-		if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING)
+		if (ctx->asoc->state == SCTP_STATE_SHUTDOWN_PENDING)
 			chunk->chunk_hdr->flags |= SCTP_DATA_SACK_IMM;
 		if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
-			asoc->stats.ouodchunks++;
+			ctx->asoc->stats.ouodchunks++;
 		else
-			asoc->stats.oodchunks++;
+			ctx->asoc->stats.oodchunks++;
 
 		/* Only now it's safe to consider this
 		 * chunk as sent, sched-wise.
@@ -1160,7 +1149,7 @@ static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx,
 		/* Only let one DATA chunk get bundled with a
 		 * COOKIE-ECHO chunk.
 		 */
-		if (packet->has_cookie_echo)
+		if (ctx->packet->has_cookie_echo)
 			break;
 	}
 }
@@ -1202,6 +1191,8 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 		.q = q,
 		.transport = NULL,
 		.transport_list = LIST_HEAD_INIT(ctx.transport_list),
+		.asoc = q->asoc,
+		.packet = NULL,
 		.gfp = gfp,
 	};
 
-- 
cgit v1.2.3-59-g8ed1b


From 5884f35f0d8924a1937c040e420255800c45ef0e Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Mon, 14 May 2018 14:35:20 -0300
Subject: sctp: checkpatch fixups

A collection of fixups from previous patches, left for later to not
introduce unnecessary changes while moving code around.

Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/outqueue.c | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 68b7baea3fea..d68aa33485a9 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -812,8 +812,7 @@ static void sctp_outq_select_transport(struct sctp_flush_ctx *ctx,
 
 	if (!new_transport) {
 		if (!sctp_chunk_is_data(chunk)) {
-			/*
-			 * If we have a prior transport pointer, see if
+			/* If we have a prior transport pointer, see if
 			 * the destination address of the chunk
 			 * matches the destination address of the
 			 * current transport.  If not a match, then
@@ -912,8 +911,7 @@ static void sctp_outq_flush_ctrl(struct sctp_flush_ctx *ctx)
 		sctp_outq_select_transport(ctx, chunk);
 
 		switch (chunk->chunk_hdr->type) {
-		/*
-		 * 6.10 Bundling
+		/* 6.10 Bundling
 		 *   ...
 		 *   An endpoint MUST NOT bundle INIT, INIT ACK or SHUTDOWN
 		 *   COMPLETE with any other chunks.  [Send them immediately.]
@@ -1061,8 +1059,7 @@ static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx,
 		return;
 	}
 
-	/*
-	 * RFC 2960 6.1  Transmission of DATA Chunks
+	/* RFC 2960 6.1  Transmission of DATA Chunks
 	 *
 	 * C) When the time comes for the sender to transmit,
 	 * before sending new DATA chunks, the sender MUST
@@ -1101,8 +1098,7 @@ static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx,
 
 		sctp_outq_select_transport(ctx, chunk);
 
-		pr_debug("%s: outq:%p, chunk:%p[%s], tx-tsn:0x%x skb->head:%p "
-			 "skb->users:%d\n",
+		pr_debug("%s: outq:%p, chunk:%p[%s], tx-tsn:0x%x skb->head:%p skb->users:%d\n",
 			 __func__, ctx->q, chunk, chunk && chunk->chunk_hdr ?
 			 sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) :
 			 "illegal chunk", ntohl(chunk->subh.data_hdr->tsn),
@@ -1175,8 +1171,7 @@ static void sctp_outq_flush_transports(struct sctp_flush_ctx *ctx)
 	}
 }
 
-/*
- * Try to flush an outqueue.
+/* Try to flush an outqueue.
  *
  * Description: Send everything in q which we legally can, subject to
  * congestion limitations.
@@ -1196,8 +1191,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 		.gfp = gfp,
 	};
 
-	/*
-	 * 6.10 Bundling
+	/* 6.10 Bundling
 	 *   ...
 	 *   When bundling control chunks with DATA chunks, an
 	 *   endpoint MUST place control chunks first in the outbound
@@ -1768,7 +1762,7 @@ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn)
 	if (TSN_lte(tsn, ctsn))
 		goto pass;
 
-	/* 3.3.4 Selective Acknowledgement (SACK) (3):
+	/* 3.3.4 Selective Acknowledgment (SACK) (3):
 	 *
 	 * Gap Ack Blocks:
 	 *  These fields contain the Gap Ack Blocks. They are repeated
-- 
cgit v1.2.3-59-g8ed1b


From 1204aa17f3b4f63e67ac9b7c9afa9496485969c5 Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <rafal@milecki.pl>
Date: Thu, 10 May 2018 15:21:39 +0200
Subject: brcmfmac: set WIPHY_FLAG_HAVE_AP_SME flag
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

brcmfmac is a FullMAC driver and it implements/uses cfg80211 interface
for stations management. At the same time it doesn't receive or pass up
management frames.

This flag indicates that authenticator doesn't have to subscribe to or
handle management frames. Some authenticators (e.g. hostapd) were
working with brcmfmac thanks to some extra assumptions. This commit
clears up the situation.

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Acked-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
index 79a124d1aa23..84135dac49b3 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
@@ -6518,6 +6518,7 @@ static int brcmf_setup_wiphy(struct wiphy *wiphy, struct brcmf_if *ifp)
 
 	wiphy->flags |= WIPHY_FLAG_NETNS_OK |
 			WIPHY_FLAG_PS_ON_BY_DEFAULT |
+			WIPHY_FLAG_HAVE_AP_SME |
 			WIPHY_FLAG_OFFCHAN_TX |
 			WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL;
 	if (brcmf_feat_is_enabled(ifp, BRCMF_FEAT_TDLS))
-- 
cgit v1.2.3-59-g8ed1b


From 54b5172087aeae61150835c91e68f084a9644f1c Mon Sep 17 00:00:00 2001
From: Sanjay Kumar Konduri <sanjay.konduri@redpinesignals.com>
Date: Fri, 11 May 2018 20:27:51 +0530
Subject: rsi: Add null check for virtual interfaces in wowlan config

When the "poweroff" command is executed after wowlan enabled, we have
observed a system crash. In the system "poweroff" sequence, network-manager
is sent to inactive state by cleaning up the network interfaces, using
rsi_mac80211_remove_interface() and when driver tries to access those
network interfaces in rsi_wowlan_config() which was invoked by SDIO
shutdown, results in a crash. Added a NULL check before accessing the
network interfaces in rsi_wowlan_config().

Signed-off-by: Sanjay Kumar Konduri <sanjay.konduri@redpinesignals.com>
Signed-off-by: Siva Rebbagondla <siva.rebbagondla@redpinesignals.com>
Signed-off-by: Sushant Kumar Mishra <sushant.mishra@redpinesignals.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/rsi/rsi_91x_mac80211.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/rsi/rsi_91x_mac80211.c b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
index 80e7f4f4f188..743706eeffa2 100644
--- a/drivers/net/wireless/rsi/rsi_91x_mac80211.c
+++ b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
@@ -1804,10 +1804,15 @@ int rsi_config_wowlan(struct rsi_hw *adapter, struct cfg80211_wowlan *wowlan)
 	struct rsi_common *common = adapter->priv;
 	u16 triggers = 0;
 	u16 rx_filter_word = 0;
-	struct ieee80211_bss_conf *bss = &adapter->vifs[0]->bss_conf;
+	struct ieee80211_bss_conf *bss = NULL;
 
 	rsi_dbg(INFO_ZONE, "Config WoWLAN to device\n");
 
+	if (!adapter->vifs[0])
+		return -EINVAL;
+
+	bss = &adapter->vifs[0]->bss_conf;
+
 	if (WARN_ON(!wowlan)) {
 		rsi_dbg(ERR_ZONE, "WoW triggers not enabled\n");
 		return -EINVAL;
-- 
cgit v1.2.3-59-g8ed1b


From d76f8513fa9050b6fff477beb453828e5024ef4b Mon Sep 17 00:00:00 2001
From: Siva Rebbagondla <siva.rebbagondla@redpinesignals.com>
Date: Fri, 11 May 2018 20:27:52 +0530
Subject: rsi: reset hibernate_resume flag to work hibernate resume in coex
 mode.

In coex mode, observed hibernate resume is not working properly, as the
hibernate_resume flag is not getting reset in rsi_coex_recv_pkt(),
when common card ready indication received from firmware. Hence resetting
hibernate_resume flag in this function.

Signed-off-by: Siva Rebbagondla <siva.rebbagondla@redpinesignals.com>
Signed-off-by: Sushant Kumar Mishra <sushant.mishra@redpinesignals.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/rsi/rsi_91x_coex.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wireless/rsi/rsi_91x_coex.c b/drivers/net/wireless/rsi/rsi_91x_coex.c
index d055099dadf1..c8ba148f8c6c 100644
--- a/drivers/net/wireless/rsi/rsi_91x_coex.c
+++ b/drivers/net/wireless/rsi/rsi_91x_coex.c
@@ -73,6 +73,7 @@ int rsi_coex_recv_pkt(struct rsi_common *common, u8 *msg)
 	switch (msg_type) {
 	case COMMON_CARD_READY_IND:
 		rsi_dbg(INFO_ZONE, "common card ready received\n");
+		common->hibernate_resume = false;
 		rsi_handle_card_ready(common, msg);
 		break;
 	case SLEEP_NOTIFY_IND:
-- 
cgit v1.2.3-59-g8ed1b


From 1d18c5584c05bb059a0ce2fca5bf3c523d6fa4d2 Mon Sep 17 00:00:00 2001
From: Siva Rebbagondla <siva.rebbagondla@redpinesignals.com>
Date: Fri, 11 May 2018 20:27:53 +0530
Subject: rsi: Set wowlan flag while writing wowlan config parameters

As wowlan enable flag did not set, while writing wowlan parameters to
card using rsi_send_vap_dynamic_update, which results firmware is unable to
set wowlan configurations. Hence, setting wowlan flag before sending
parameters.

Signed-off-by: Siva Rebbagondla <siva.rebbagondla@redpinesignals.com>
Signed-off-by: Sushant Kumar Mishra <sushant.mishra@redpinesignals.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/rsi/rsi_91x_mac80211.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/rsi/rsi_91x_mac80211.c b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
index 743706eeffa2..3faa0449a5ef 100644
--- a/drivers/net/wireless/rsi/rsi_91x_mac80211.c
+++ b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
@@ -1818,6 +1818,7 @@ int rsi_config_wowlan(struct rsi_hw *adapter, struct cfg80211_wowlan *wowlan)
 		return -EINVAL;
 	}
 
+	common->wow_flags |= RSI_WOW_ENABLED;
 	triggers = rsi_wow_map_triggers(common, wowlan);
 	if (!triggers) {
 		rsi_dbg(ERR_ZONE, "%s:No valid WoW triggers\n", __func__);
@@ -1840,7 +1841,6 @@ int rsi_config_wowlan(struct rsi_hw *adapter, struct cfg80211_wowlan *wowlan)
 
 	rx_filter_word = (ALLOW_DATA_ASSOC_PEER | DISALLOW_BEACONS);
 	rsi_send_rx_filter_frame(common, rx_filter_word);
-	common->wow_flags |= RSI_WOW_ENABLED;
 
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 66cffd6daab76caebab26eb803b92182414fc182 Mon Sep 17 00:00:00 2001
From: Taketo Kabe <kabe@sra-tohoku.co.jp>
Date: Sun, 13 May 2018 18:16:40 +0900
Subject: b43: fix transmit failure when VT is switched

Setup:
Using BCM4306 rev.03 chip based CardBus wireless card.
IRQ is shared with yenta (cardbus bridge) and i915 (display) driver.
For firmware, installed latest but dated openfwwf 5.2
(http://netweb.ing.unibs.it/~openfwwf/)

How-to-reproduce:
Do "ssh <NetBSD-remotehost>", then "ls -lR /" to generate traffic, then
repeatedly switch VTs by Alt-F1<>Alt-F2.
Eventually (within a minute) the card stops working.
You can receive traffic but no transmission.
For unknown reason it doesn't occur when just generating traffic by
"ssh <remotehost> ls -lR /".

With CONFIG_B43_DEBUG=y kernel config, when it stops,
the debug message shows
    kernel: b43-phy1 debug: Out of order TX status report on DMA ring 1. Expected 148, but got 180
The slot offset I observed so far was always 32.

When err_out2 is not set to make error messages successive,
the debug output will be like this:
kernel: b43-phy1 debug: Out of order TX status report on DMA ring 1. Expected 116, but got 148
kernel: b43-phy1 debug: Out of order TX status report on DMA ring 1. Expected 116, but got 150
kernel: b43-phy1 debug: Out of order TX status report on DMA ring 1. Expected 116, but got 120
kernel: b43-phy1 debug: Out of order TX status report on DMA ring 1. Expected 116, but got 152
kernel: b43-phy1 debug: Out of order TX status report on DMA ring 1. Expected 116, but got 122
kernel: b43-phy1 debug: Out of order TX status report on DMA ring 1. Expected 116, but got 154
kernel: b43-phy1 debug: Out of order TX status report on DMA ring 1. Expected 116, but got 124
kernel: b43-phy1 debug: Out of order TX status report on DMA ring 1. Expected 116, but got 156
kernel: b43-phy1 debug: Out of order TX status report on DMA ring 1. Expected 116, but got 126
The TX ring alternates between 2 sequences; the ring seems
to be completely confused. Controller restart is needed.

Workaround(1):
This problem doesn't occur when using propriatory firmware
you will extract by b43-fwcutter, so it may be a bug in
openfwwf firmware, as the comment in the b43_dma_handle_txstatus() suggests.
I wasn't able to find a bug in the terse openfwwf code though.

Workaround(2):
Using "pio=1" option to not use DMA makes this problem to
not occur.

Description of the patch:
This patch will forcibly reset the controller to make it
work again. Very kludgy and doesn't look right, but
the traffic will continue to flow.

Signed-off-by: Taketo Kabe <kabe@sra-tohoku.co.jp>
Reviewed-by: Michael Buesch <m@bues.ch>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/broadcom/b43/dma.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/net/wireless/broadcom/b43/dma.c b/drivers/net/wireless/broadcom/b43/dma.c
index 6837064908be..6b0e1ec346cb 100644
--- a/drivers/net/wireless/broadcom/b43/dma.c
+++ b/drivers/net/wireless/broadcom/b43/dma.c
@@ -1484,7 +1484,7 @@ void b43_dma_handle_txstatus(struct b43_wldev *dev,
 	int slot, firstused;
 	bool frame_succeed;
 	int skip;
-	static u8 err_out1, err_out2;
+	static u8 err_out1;
 
 	ring = parse_cookie(dev, status->cookie, &slot);
 	if (unlikely(!ring))
@@ -1518,13 +1518,13 @@ void b43_dma_handle_txstatus(struct b43_wldev *dev,
 			}
 		} else {
 			/* More than a single header/data pair were missed.
-			 * Report this error once.
+			 * Report this error, and reset the controller to
+			 * revive operation.
 			 */
-			if (!err_out2)
-				b43dbg(dev->wl,
-				       "Out of order TX status report on DMA ring %d. Expected %d, but got %d\n",
-				       ring->index, firstused, slot);
-			err_out2 = 1;
+			b43dbg(dev->wl,
+			       "Out of order TX status report on DMA ring %d. Expected %d, but got %d\n",
+			       ring->index, firstused, slot);
+			b43_controller_restart(dev, "Out of order TX");
 			return;
 		}
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 2bf3e2ef425bc2a164f10b554b7db6a8b4090ef4 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 14 May 2018 22:35:02 -0700
Subject: samples: bpf: include bpf/bpf.h instead of local libbpf.h

There are two files in the tree called libbpf.h which is becoming
problematic.  Most samples don't actually need the local libbpf.h
they simply include it to get to bpf/bpf.h.  Include bpf/bpf.h
directly instead.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/bpf_load.c                            | 2 +-
 samples/bpf/bpf_load.h                            | 2 +-
 samples/bpf/cpustat_user.c                        | 2 +-
 samples/bpf/lathist_user.c                        | 2 +-
 samples/bpf/load_sock_ops.c                       | 2 +-
 samples/bpf/lwt_len_hist_user.c                   | 2 +-
 samples/bpf/map_perf_test_user.c                  | 2 +-
 samples/bpf/sock_example.h                        | 1 -
 samples/bpf/sockex1_user.c                        | 2 +-
 samples/bpf/sockex2_user.c                        | 2 +-
 samples/bpf/sockex3_user.c                        | 2 +-
 samples/bpf/syscall_tp_user.c                     | 2 +-
 samples/bpf/tc_l2_redirect_user.c                 | 2 +-
 samples/bpf/test_cgrp2_array_pin.c                | 2 +-
 samples/bpf/test_current_task_under_cgroup_user.c | 2 +-
 samples/bpf/test_lru_dist.c                       | 2 +-
 samples/bpf/test_map_in_map_user.c                | 2 +-
 samples/bpf/test_overhead_user.c                  | 2 +-
 samples/bpf/test_probe_write_user_user.c          | 2 +-
 samples/bpf/trace_output_user.c                   | 2 +-
 samples/bpf/tracex1_user.c                        | 2 +-
 samples/bpf/tracex2_user.c                        | 2 +-
 samples/bpf/tracex3_user.c                        | 2 +-
 samples/bpf/tracex4_user.c                        | 2 +-
 samples/bpf/tracex5_user.c                        | 2 +-
 samples/bpf/tracex6_user.c                        | 2 +-
 samples/bpf/tracex7_user.c                        | 2 +-
 samples/bpf/xdp_fwd_user.c                        | 2 +-
 samples/bpf/xdp_monitor_user.c                    | 2 +-
 samples/bpf/xdp_redirect_cpu_user.c               | 2 +-
 samples/bpf/xdp_redirect_map_user.c               | 2 +-
 samples/bpf/xdp_redirect_user.c                   | 2 +-
 samples/bpf/xdp_router_ipv4_user.c                | 2 +-
 samples/bpf/xdp_tx_iptunnel_user.c                | 2 +-
 samples/bpf/xdpsock_user.c                        | 2 +-
 35 files changed, 34 insertions(+), 35 deletions(-)

diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index a6b290de5632..89161c9ed466 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -24,7 +24,7 @@
 #include <poll.h>
 #include <ctype.h>
 #include <assert.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
 #include "bpf_load.h"
 #include "perf-sys.h"
 
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index f9da59bca0cc..814894a12974 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -2,7 +2,7 @@
 #ifndef __BPF_LOAD_H
 #define __BPF_LOAD_H
 
-#include "libbpf.h"
+#include <bpf/bpf.h>
 
 #define MAX_MAPS 32
 #define MAX_PROGS 32
diff --git a/samples/bpf/cpustat_user.c b/samples/bpf/cpustat_user.c
index 2b4cd1ae57c5..869a99406dbf 100644
--- a/samples/bpf/cpustat_user.c
+++ b/samples/bpf/cpustat_user.c
@@ -17,7 +17,7 @@
 #include <sys/resource.h>
 #include <sys/wait.h>
 
-#include "libbpf.h"
+#include <bpf/bpf.h>
 #include "bpf_load.h"
 
 #define MAX_CPU			8
diff --git a/samples/bpf/lathist_user.c b/samples/bpf/lathist_user.c
index 6477bad5b4e2..c8e88cc84e61 100644
--- a/samples/bpf/lathist_user.c
+++ b/samples/bpf/lathist_user.c
@@ -10,7 +10,7 @@
 #include <stdlib.h>
 #include <signal.h>
 #include <linux/bpf.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
 #include "bpf_load.h"
 
 #define MAX_ENTRIES	20
diff --git a/samples/bpf/load_sock_ops.c b/samples/bpf/load_sock_ops.c
index e5da6cf71a3e..8ecb41ea0c03 100644
--- a/samples/bpf/load_sock_ops.c
+++ b/samples/bpf/load_sock_ops.c
@@ -8,7 +8,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <linux/bpf.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
 #include "bpf_load.h"
 #include <unistd.h>
 #include <errno.h>
diff --git a/samples/bpf/lwt_len_hist_user.c b/samples/bpf/lwt_len_hist_user.c
index 7fcb94c09112..587b68b1f8dd 100644
--- a/samples/bpf/lwt_len_hist_user.c
+++ b/samples/bpf/lwt_len_hist_user.c
@@ -9,7 +9,7 @@
 #include <errno.h>
 #include <arpa/inet.h>
 
-#include "libbpf.h"
+#include <bpf/bpf.h>
 #include "bpf_util.h"
 
 #define MAX_INDEX 64
diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c
index 519d9af4b04a..38b7b1a96cc2 100644
--- a/samples/bpf/map_perf_test_user.c
+++ b/samples/bpf/map_perf_test_user.c
@@ -21,7 +21,7 @@
 #include <arpa/inet.h>
 #include <errno.h>
 
-#include "libbpf.h"
+#include <bpf/bpf.h>
 #include "bpf_load.h"
 
 #define TEST_BIT(t) (1U << (t))
diff --git a/samples/bpf/sock_example.h b/samples/bpf/sock_example.h
index 772d5dad8465..a27d7579bc73 100644
--- a/samples/bpf/sock_example.h
+++ b/samples/bpf/sock_example.h
@@ -9,7 +9,6 @@
 #include <net/if.h>
 #include <linux/if_packet.h>
 #include <arpa/inet.h>
-#include "libbpf.h"
 
 static inline int open_raw_sock(const char *name)
 {
diff --git a/samples/bpf/sockex1_user.c b/samples/bpf/sockex1_user.c
index 2be935c2627d..93ec01c56104 100644
--- a/samples/bpf/sockex1_user.c
+++ b/samples/bpf/sockex1_user.c
@@ -2,7 +2,7 @@
 #include <stdio.h>
 #include <assert.h>
 #include <linux/bpf.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
 #include "bpf_load.h"
 #include "sock_example.h"
 #include <unistd.h>
diff --git a/samples/bpf/sockex2_user.c b/samples/bpf/sockex2_user.c
index 44fe0805b087..1d5c6e9a6d27 100644
--- a/samples/bpf/sockex2_user.c
+++ b/samples/bpf/sockex2_user.c
@@ -2,7 +2,7 @@
 #include <stdio.h>
 #include <assert.h>
 #include <linux/bpf.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
 #include "bpf_load.h"
 #include "sock_example.h"
 #include <unistd.h>
diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c
index 495ee02e2fb7..5ba3ae9d180b 100644
--- a/samples/bpf/sockex3_user.c
+++ b/samples/bpf/sockex3_user.c
@@ -2,7 +2,7 @@
 #include <stdio.h>
 #include <assert.h>
 #include <linux/bpf.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
 #include "bpf_load.h"
 #include "sock_example.h"
 #include <unistd.h>
diff --git a/samples/bpf/syscall_tp_user.c b/samples/bpf/syscall_tp_user.c
index 9169d3207f18..1a1d0059a277 100644
--- a/samples/bpf/syscall_tp_user.c
+++ b/samples/bpf/syscall_tp_user.c
@@ -16,7 +16,7 @@
 #include <assert.h>
 #include <stdbool.h>
 #include <sys/resource.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
 #include "bpf_load.h"
 
 /* This program verifies bpf attachment to tracepoint sys_enter_* and sys_exit_*.
diff --git a/samples/bpf/tc_l2_redirect_user.c b/samples/bpf/tc_l2_redirect_user.c
index 28995a776560..7ec45c3e8f56 100644
--- a/samples/bpf/tc_l2_redirect_user.c
+++ b/samples/bpf/tc_l2_redirect_user.c
@@ -13,7 +13,7 @@
 #include <string.h>
 #include <errno.h>
 
-#include "libbpf.h"
+#include <bpf/bpf.h>
 
 static void usage(void)
 {
diff --git a/samples/bpf/test_cgrp2_array_pin.c b/samples/bpf/test_cgrp2_array_pin.c
index 8a1b8b5d8def..242184292f59 100644
--- a/samples/bpf/test_cgrp2_array_pin.c
+++ b/samples/bpf/test_cgrp2_array_pin.c
@@ -14,7 +14,7 @@
 #include <errno.h>
 #include <fcntl.h>
 
-#include "libbpf.h"
+#include <bpf/bpf.h>
 
 static void usage(void)
 {
diff --git a/samples/bpf/test_current_task_under_cgroup_user.c b/samples/bpf/test_current_task_under_cgroup_user.c
index 65b5fb51c1db..4be4874ca2bc 100644
--- a/samples/bpf/test_current_task_under_cgroup_user.c
+++ b/samples/bpf/test_current_task_under_cgroup_user.c
@@ -9,7 +9,7 @@
 #include <stdio.h>
 #include <linux/bpf.h>
 #include <unistd.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
 #include "bpf_load.h"
 #include <linux/bpf.h>
 #include "cgroup_helpers.h"
diff --git a/samples/bpf/test_lru_dist.c b/samples/bpf/test_lru_dist.c
index 73c357142268..eec3e2509ce8 100644
--- a/samples/bpf/test_lru_dist.c
+++ b/samples/bpf/test_lru_dist.c
@@ -21,7 +21,7 @@
 #include <stdlib.h>
 #include <time.h>
 
-#include "libbpf.h"
+#include <bpf/bpf.h>
 #include "bpf_util.h"
 
 #define min(a, b) ((a) < (b) ? (a) : (b))
diff --git a/samples/bpf/test_map_in_map_user.c b/samples/bpf/test_map_in_map_user.c
index 1aca18539d8d..e308858f7bcf 100644
--- a/samples/bpf/test_map_in_map_user.c
+++ b/samples/bpf/test_map_in_map_user.c
@@ -13,7 +13,7 @@
 #include <errno.h>
 #include <stdlib.h>
 #include <stdio.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
 #include "bpf_load.h"
 
 #define PORT_A		(map_fd[0])
diff --git a/samples/bpf/test_overhead_user.c b/samples/bpf/test_overhead_user.c
index e1d35e07a10e..6caf47afa635 100644
--- a/samples/bpf/test_overhead_user.c
+++ b/samples/bpf/test_overhead_user.c
@@ -19,7 +19,7 @@
 #include <string.h>
 #include <time.h>
 #include <sys/resource.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
 #include "bpf_load.h"
 
 #define MAX_CNT 1000000
diff --git a/samples/bpf/test_probe_write_user_user.c b/samples/bpf/test_probe_write_user_user.c
index bf8e3a9f3067..045eb5e30f54 100644
--- a/samples/bpf/test_probe_write_user_user.c
+++ b/samples/bpf/test_probe_write_user_user.c
@@ -3,7 +3,7 @@
 #include <assert.h>
 #include <linux/bpf.h>
 #include <unistd.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
 #include "bpf_load.h"
 #include <sys/socket.h>
 #include <string.h>
diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c
index da98be721001..4837d73edefe 100644
--- a/samples/bpf/trace_output_user.c
+++ b/samples/bpf/trace_output_user.c
@@ -18,7 +18,7 @@
 #include <sys/mman.h>
 #include <time.h>
 #include <signal.h>
-#include "libbpf.h"
+#include <libbpf.h>
 #include "bpf_load.h"
 #include "perf-sys.h"
 #include "trace_helpers.h"
diff --git a/samples/bpf/tracex1_user.c b/samples/bpf/tracex1_user.c
index 3dcb475fb135..af8c20608ab5 100644
--- a/samples/bpf/tracex1_user.c
+++ b/samples/bpf/tracex1_user.c
@@ -2,7 +2,7 @@
 #include <stdio.h>
 #include <linux/bpf.h>
 #include <unistd.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
 #include "bpf_load.h"
 
 int main(int ac, char **argv)
diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c
index efb5e61918df..1a81e6a5c2ea 100644
--- a/samples/bpf/tracex2_user.c
+++ b/samples/bpf/tracex2_user.c
@@ -7,7 +7,7 @@
 #include <string.h>
 #include <sys/resource.h>
 
-#include "libbpf.h"
+#include <bpf/bpf.h>
 #include "bpf_load.h"
 #include "bpf_util.h"
 
diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c
index fe372239d505..6c6b10f4c3ee 100644
--- a/samples/bpf/tracex3_user.c
+++ b/samples/bpf/tracex3_user.c
@@ -13,7 +13,7 @@
 #include <linux/bpf.h>
 #include <sys/resource.h>
 
-#include "libbpf.h"
+#include <bpf/bpf.h>
 #include "bpf_load.h"
 #include "bpf_util.h"
 
diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c
index 22c644f1f4c3..14625c898e43 100644
--- a/samples/bpf/tracex4_user.c
+++ b/samples/bpf/tracex4_user.c
@@ -14,7 +14,7 @@
 #include <linux/bpf.h>
 #include <sys/resource.h>
 
-#include "libbpf.h"
+#include <bpf/bpf.h>
 #include "bpf_load.h"
 
 struct pair {
diff --git a/samples/bpf/tracex5_user.c b/samples/bpf/tracex5_user.c
index 4e2774b731f0..c4ab91c89494 100644
--- a/samples/bpf/tracex5_user.c
+++ b/samples/bpf/tracex5_user.c
@@ -5,7 +5,7 @@
 #include <linux/filter.h>
 #include <linux/seccomp.h>
 #include <sys/prctl.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
 #include "bpf_load.h"
 #include <sys/resource.h>
 
diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c
index 89ab8d408474..4bb3c830adb2 100644
--- a/samples/bpf/tracex6_user.c
+++ b/samples/bpf/tracex6_user.c
@@ -16,7 +16,7 @@
 #include <unistd.h>
 
 #include "bpf_load.h"
-#include "libbpf.h"
+#include <bpf/bpf.h>
 #include "perf-sys.h"
 
 #define SAMPLE_PERIOD  0x7fffffffffffffffULL
diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c
index 8a52ac492e8b..ea6dae78f0df 100644
--- a/samples/bpf/tracex7_user.c
+++ b/samples/bpf/tracex7_user.c
@@ -3,7 +3,7 @@
 #include <stdio.h>
 #include <linux/bpf.h>
 #include <unistd.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
 #include "bpf_load.h"
 
 int main(int argc, char **argv)
diff --git a/samples/bpf/xdp_fwd_user.c b/samples/bpf/xdp_fwd_user.c
index 9c6606f57126..a87a2048ed32 100644
--- a/samples/bpf/xdp_fwd_user.c
+++ b/samples/bpf/xdp_fwd_user.c
@@ -26,7 +26,7 @@
 
 #include "bpf_load.h"
 #include "bpf_util.h"
-#include "libbpf.h"
+#include <bpf/bpf.h>
 
 
 static int do_attach(int idx, int fd, const char *name)
diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c
index 05ad3f590c91..bf09b5188acd 100644
--- a/samples/bpf/xdp_monitor_user.c
+++ b/samples/bpf/xdp_monitor_user.c
@@ -26,7 +26,7 @@ static const char *__doc_err_only__=
 #include <net/if.h>
 #include <time.h>
 
-#include "libbpf.h"
+#include <bpf/bpf.h>
 #include "bpf_load.h"
 #include "bpf_util.h"
 
diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c
index 23744a8aaf21..f6efaefd485b 100644
--- a/samples/bpf/xdp_redirect_cpu_user.c
+++ b/samples/bpf/xdp_redirect_cpu_user.c
@@ -28,7 +28,7 @@ static const char *__doc__ =
  * use bpf/libbpf.h), but cannot as (currently) needed for XDP
  * attaching to a device via bpf_set_link_xdp_fd()
  */
-#include "libbpf.h"
+#include <bpf/bpf.h>
 #include "bpf_load.h"
 
 #include "bpf_util.h"
diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c
index 7eae07d7293e..4445e76854b5 100644
--- a/samples/bpf/xdp_redirect_map_user.c
+++ b/samples/bpf/xdp_redirect_map_user.c
@@ -24,7 +24,7 @@
 
 #include "bpf_load.h"
 #include "bpf_util.h"
-#include "libbpf.h"
+#include <bpf/bpf.h>
 
 static int ifindex_in;
 static int ifindex_out;
diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c
index b701b5c21342..81a69e36cb78 100644
--- a/samples/bpf/xdp_redirect_user.c
+++ b/samples/bpf/xdp_redirect_user.c
@@ -24,7 +24,7 @@
 
 #include "bpf_load.h"
 #include "bpf_util.h"
-#include "libbpf.h"
+#include <bpf/bpf.h>
 
 static int ifindex_in;
 static int ifindex_out;
diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c
index 6296741c1fbd..b2b4dfa776c8 100644
--- a/samples/bpf/xdp_router_ipv4_user.c
+++ b/samples/bpf/xdp_router_ipv4_user.c
@@ -16,7 +16,7 @@
 #include <sys/socket.h>
 #include <unistd.h>
 #include "bpf_load.h"
-#include "libbpf.h"
+#include <bpf/bpf.h>
 #include <arpa/inet.h>
 #include <fcntl.h>
 #include <poll.h>
diff --git a/samples/bpf/xdp_tx_iptunnel_user.c b/samples/bpf/xdp_tx_iptunnel_user.c
index f0a787268a87..a4ccc33adac0 100644
--- a/samples/bpf/xdp_tx_iptunnel_user.c
+++ b/samples/bpf/xdp_tx_iptunnel_user.c
@@ -18,7 +18,7 @@
 #include <unistd.h>
 #include <time.h>
 #include "bpf_load.h"
-#include "libbpf.h"
+#include <bpf/bpf.h>
 #include "bpf_util.h"
 #include "xdp_tx_iptunnel_common.h"
 
diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
index 4b8a7cf3e63b..7fe60f6f7d53 100644
--- a/samples/bpf/xdpsock_user.c
+++ b/samples/bpf/xdpsock_user.c
@@ -38,7 +38,7 @@
 
 #include "bpf_load.h"
 #include "bpf_util.h"
-#include "libbpf.h"
+#include <bpf/bpf.h>
 
 #include "xdpsock.h"
 
-- 
cgit v1.2.3-59-g8ed1b


From 8d93045077aeede62127b6d6663bfdd31f6240da Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 14 May 2018 22:35:03 -0700
Subject: samples: bpf: rename libbpf.h to bpf_insn.h

The libbpf.h file in samples is clashing with libbpf's header.
Since it only includes a subset of filter.h instruction helpers
rename it to bpf_insn.h.  Drop the unnecessary include of bpf/bpf.h.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/bpf_insn.h                  | 197 +++++++++++++++++++++++++++++++
 samples/bpf/cookie_uid_helper_example.c |   2 +-
 samples/bpf/fds_example.c               |   4 +-
 samples/bpf/libbpf.h                    | 199 --------------------------------
 samples/bpf/sock_example.c              |   3 +-
 samples/bpf/test_cgrp2_attach.c         |   3 +-
 samples/bpf/test_cgrp2_attach2.c        |   3 +-
 samples/bpf/test_cgrp2_sock.c           |   3 +-
 samples/bpf/test_cgrp2_sock2.c          |   3 +-
 9 files changed, 211 insertions(+), 206 deletions(-)
 create mode 100644 samples/bpf/bpf_insn.h
 delete mode 100644 samples/bpf/libbpf.h

diff --git a/samples/bpf/bpf_insn.h b/samples/bpf/bpf_insn.h
new file mode 100644
index 000000000000..20dc5cefec84
--- /dev/null
+++ b/samples/bpf/bpf_insn.h
@@ -0,0 +1,197 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* eBPF instruction mini library */
+#ifndef __BPF_INSN_H
+#define __BPF_INSN_H
+
+struct bpf_insn;
+
+/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */
+
+#define BPF_ALU64_REG(OP, DST, SRC)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU64 | BPF_OP(OP) | BPF_X,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = 0,					\
+		.imm   = 0 })
+
+#define BPF_ALU32_REG(OP, DST, SRC)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU | BPF_OP(OP) | BPF_X,		\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = 0,					\
+		.imm   = 0 })
+
+/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */
+
+#define BPF_ALU64_IMM(OP, DST, IMM)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU64 | BPF_OP(OP) | BPF_K,	\
+		.dst_reg = DST,					\
+		.src_reg = 0,					\
+		.off   = 0,					\
+		.imm   = IMM })
+
+#define BPF_ALU32_IMM(OP, DST, IMM)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU | BPF_OP(OP) | BPF_K,		\
+		.dst_reg = DST,					\
+		.src_reg = 0,					\
+		.off   = 0,					\
+		.imm   = IMM })
+
+/* Short form of mov, dst_reg = src_reg */
+
+#define BPF_MOV64_REG(DST, SRC)					\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU64 | BPF_MOV | BPF_X,		\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = 0,					\
+		.imm   = 0 })
+
+#define BPF_MOV32_REG(DST, SRC)					\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU | BPF_MOV | BPF_X,		\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = 0,					\
+		.imm   = 0 })
+
+/* Short form of mov, dst_reg = imm32 */
+
+#define BPF_MOV64_IMM(DST, IMM)					\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU64 | BPF_MOV | BPF_K,		\
+		.dst_reg = DST,					\
+		.src_reg = 0,					\
+		.off   = 0,					\
+		.imm   = IMM })
+
+#define BPF_MOV32_IMM(DST, IMM)					\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU | BPF_MOV | BPF_K,		\
+		.dst_reg = DST,					\
+		.src_reg = 0,					\
+		.off   = 0,					\
+		.imm   = IMM })
+
+/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */
+#define BPF_LD_IMM64(DST, IMM)					\
+	BPF_LD_IMM64_RAW(DST, 0, IMM)
+
+#define BPF_LD_IMM64_RAW(DST, SRC, IMM)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_LD | BPF_DW | BPF_IMM,		\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = 0,					\
+		.imm   = (__u32) (IMM) }),			\
+	((struct bpf_insn) {					\
+		.code  = 0, /* zero is reserved opcode */	\
+		.dst_reg = 0,					\
+		.src_reg = 0,					\
+		.off   = 0,					\
+		.imm   = ((__u64) (IMM)) >> 32 })
+
+#ifndef BPF_PSEUDO_MAP_FD
+# define BPF_PSEUDO_MAP_FD	1
+#endif
+
+/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */
+#define BPF_LD_MAP_FD(DST, MAP_FD)				\
+	BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)
+
+
+/* Direct packet access, R0 = *(uint *) (skb->data + imm32) */
+
+#define BPF_LD_ABS(SIZE, IMM)					\
+	((struct bpf_insn) {					\
+		.code  = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS,	\
+		.dst_reg = 0,					\
+		.src_reg = 0,					\
+		.off   = 0,					\
+		.imm   = IMM })
+
+/* Memory load, dst_reg = *(uint *) (src_reg + off16) */
+
+#define BPF_LDX_MEM(SIZE, DST, SRC, OFF)			\
+	((struct bpf_insn) {					\
+		.code  = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = 0 })
+
+/* Memory store, *(uint *) (dst_reg + off16) = src_reg */
+
+#define BPF_STX_MEM(SIZE, DST, SRC, OFF)			\
+	((struct bpf_insn) {					\
+		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = 0 })
+
+/* Atomic memory add, *(uint *)(dst_reg + off16) += src_reg */
+
+#define BPF_STX_XADD(SIZE, DST, SRC, OFF)			\
+	((struct bpf_insn) {					\
+		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = 0 })
+
+/* Memory store, *(uint *) (dst_reg + off16) = imm32 */
+
+#define BPF_ST_MEM(SIZE, DST, OFF, IMM)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM,	\
+		.dst_reg = DST,					\
+		.src_reg = 0,					\
+		.off   = OFF,					\
+		.imm   = IMM })
+
+/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */
+
+#define BPF_JMP_REG(OP, DST, SRC, OFF)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_JMP | BPF_OP(OP) | BPF_X,		\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = 0 })
+
+/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */
+
+#define BPF_JMP_IMM(OP, DST, IMM, OFF)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_JMP | BPF_OP(OP) | BPF_K,		\
+		.dst_reg = DST,					\
+		.src_reg = 0,					\
+		.off   = OFF,					\
+		.imm   = IMM })
+
+/* Raw code statement block */
+
+#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM)			\
+	((struct bpf_insn) {					\
+		.code  = CODE,					\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = IMM })
+
+/* Program exit */
+
+#define BPF_EXIT_INSN()						\
+	((struct bpf_insn) {					\
+		.code  = BPF_JMP | BPF_EXIT,			\
+		.dst_reg = 0,					\
+		.src_reg = 0,					\
+		.off   = 0,					\
+		.imm   = 0 })
+
+#endif
diff --git a/samples/bpf/cookie_uid_helper_example.c b/samples/bpf/cookie_uid_helper_example.c
index 8eca27e595ae..deb0e3e0324d 100644
--- a/samples/bpf/cookie_uid_helper_example.c
+++ b/samples/bpf/cookie_uid_helper_example.c
@@ -51,7 +51,7 @@
 #include <sys/types.h>
 #include <unistd.h>
 #include <bpf/bpf.h>
-#include "libbpf.h"
+#include "bpf_insn.h"
 
 #define PORT 8888
 
diff --git a/samples/bpf/fds_example.c b/samples/bpf/fds_example.c
index e29bd52ff9e8..9854854f05d1 100644
--- a/samples/bpf/fds_example.c
+++ b/samples/bpf/fds_example.c
@@ -12,8 +12,10 @@
 #include <sys/types.h>
 #include <sys/socket.h>
 
+#include <bpf/bpf.h>
+
+#include "bpf_insn.h"
 #include "bpf_load.h"
-#include "libbpf.h"
 #include "sock_example.h"
 
 #define BPF_F_PIN	(1 << 0)
diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h
deleted file mode 100644
index 18bfee5aab6b..000000000000
--- a/samples/bpf/libbpf.h
+++ /dev/null
@@ -1,199 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* eBPF mini library */
-#ifndef __LIBBPF_H
-#define __LIBBPF_H
-
-#include <bpf/bpf.h>
-
-struct bpf_insn;
-
-/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */
-
-#define BPF_ALU64_REG(OP, DST, SRC)				\
-	((struct bpf_insn) {					\
-		.code  = BPF_ALU64 | BPF_OP(OP) | BPF_X,	\
-		.dst_reg = DST,					\
-		.src_reg = SRC,					\
-		.off   = 0,					\
-		.imm   = 0 })
-
-#define BPF_ALU32_REG(OP, DST, SRC)				\
-	((struct bpf_insn) {					\
-		.code  = BPF_ALU | BPF_OP(OP) | BPF_X,		\
-		.dst_reg = DST,					\
-		.src_reg = SRC,					\
-		.off   = 0,					\
-		.imm   = 0 })
-
-/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */
-
-#define BPF_ALU64_IMM(OP, DST, IMM)				\
-	((struct bpf_insn) {					\
-		.code  = BPF_ALU64 | BPF_OP(OP) | BPF_K,	\
-		.dst_reg = DST,					\
-		.src_reg = 0,					\
-		.off   = 0,					\
-		.imm   = IMM })
-
-#define BPF_ALU32_IMM(OP, DST, IMM)				\
-	((struct bpf_insn) {					\
-		.code  = BPF_ALU | BPF_OP(OP) | BPF_K,		\
-		.dst_reg = DST,					\
-		.src_reg = 0,					\
-		.off   = 0,					\
-		.imm   = IMM })
-
-/* Short form of mov, dst_reg = src_reg */
-
-#define BPF_MOV64_REG(DST, SRC)					\
-	((struct bpf_insn) {					\
-		.code  = BPF_ALU64 | BPF_MOV | BPF_X,		\
-		.dst_reg = DST,					\
-		.src_reg = SRC,					\
-		.off   = 0,					\
-		.imm   = 0 })
-
-#define BPF_MOV32_REG(DST, SRC)					\
-	((struct bpf_insn) {					\
-		.code  = BPF_ALU | BPF_MOV | BPF_X,		\
-		.dst_reg = DST,					\
-		.src_reg = SRC,					\
-		.off   = 0,					\
-		.imm   = 0 })
-
-/* Short form of mov, dst_reg = imm32 */
-
-#define BPF_MOV64_IMM(DST, IMM)					\
-	((struct bpf_insn) {					\
-		.code  = BPF_ALU64 | BPF_MOV | BPF_K,		\
-		.dst_reg = DST,					\
-		.src_reg = 0,					\
-		.off   = 0,					\
-		.imm   = IMM })
-
-#define BPF_MOV32_IMM(DST, IMM)					\
-	((struct bpf_insn) {					\
-		.code  = BPF_ALU | BPF_MOV | BPF_K,		\
-		.dst_reg = DST,					\
-		.src_reg = 0,					\
-		.off   = 0,					\
-		.imm   = IMM })
-
-/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */
-#define BPF_LD_IMM64(DST, IMM)					\
-	BPF_LD_IMM64_RAW(DST, 0, IMM)
-
-#define BPF_LD_IMM64_RAW(DST, SRC, IMM)				\
-	((struct bpf_insn) {					\
-		.code  = BPF_LD | BPF_DW | BPF_IMM,		\
-		.dst_reg = DST,					\
-		.src_reg = SRC,					\
-		.off   = 0,					\
-		.imm   = (__u32) (IMM) }),			\
-	((struct bpf_insn) {					\
-		.code  = 0, /* zero is reserved opcode */	\
-		.dst_reg = 0,					\
-		.src_reg = 0,					\
-		.off   = 0,					\
-		.imm   = ((__u64) (IMM)) >> 32 })
-
-#ifndef BPF_PSEUDO_MAP_FD
-# define BPF_PSEUDO_MAP_FD	1
-#endif
-
-/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */
-#define BPF_LD_MAP_FD(DST, MAP_FD)				\
-	BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)
-
-
-/* Direct packet access, R0 = *(uint *) (skb->data + imm32) */
-
-#define BPF_LD_ABS(SIZE, IMM)					\
-	((struct bpf_insn) {					\
-		.code  = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS,	\
-		.dst_reg = 0,					\
-		.src_reg = 0,					\
-		.off   = 0,					\
-		.imm   = IMM })
-
-/* Memory load, dst_reg = *(uint *) (src_reg + off16) */
-
-#define BPF_LDX_MEM(SIZE, DST, SRC, OFF)			\
-	((struct bpf_insn) {					\
-		.code  = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM,	\
-		.dst_reg = DST,					\
-		.src_reg = SRC,					\
-		.off   = OFF,					\
-		.imm   = 0 })
-
-/* Memory store, *(uint *) (dst_reg + off16) = src_reg */
-
-#define BPF_STX_MEM(SIZE, DST, SRC, OFF)			\
-	((struct bpf_insn) {					\
-		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM,	\
-		.dst_reg = DST,					\
-		.src_reg = SRC,					\
-		.off   = OFF,					\
-		.imm   = 0 })
-
-/* Atomic memory add, *(uint *)(dst_reg + off16) += src_reg */
-
-#define BPF_STX_XADD(SIZE, DST, SRC, OFF)			\
-	((struct bpf_insn) {					\
-		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD,	\
-		.dst_reg = DST,					\
-		.src_reg = SRC,					\
-		.off   = OFF,					\
-		.imm   = 0 })
-
-/* Memory store, *(uint *) (dst_reg + off16) = imm32 */
-
-#define BPF_ST_MEM(SIZE, DST, OFF, IMM)				\
-	((struct bpf_insn) {					\
-		.code  = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM,	\
-		.dst_reg = DST,					\
-		.src_reg = 0,					\
-		.off   = OFF,					\
-		.imm   = IMM })
-
-/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */
-
-#define BPF_JMP_REG(OP, DST, SRC, OFF)				\
-	((struct bpf_insn) {					\
-		.code  = BPF_JMP | BPF_OP(OP) | BPF_X,		\
-		.dst_reg = DST,					\
-		.src_reg = SRC,					\
-		.off   = OFF,					\
-		.imm   = 0 })
-
-/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */
-
-#define BPF_JMP_IMM(OP, DST, IMM, OFF)				\
-	((struct bpf_insn) {					\
-		.code  = BPF_JMP | BPF_OP(OP) | BPF_K,		\
-		.dst_reg = DST,					\
-		.src_reg = 0,					\
-		.off   = OFF,					\
-		.imm   = IMM })
-
-/* Raw code statement block */
-
-#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM)			\
-	((struct bpf_insn) {					\
-		.code  = CODE,					\
-		.dst_reg = DST,					\
-		.src_reg = SRC,					\
-		.off   = OFF,					\
-		.imm   = IMM })
-
-/* Program exit */
-
-#define BPF_EXIT_INSN()						\
-	((struct bpf_insn) {					\
-		.code  = BPF_JMP | BPF_EXIT,			\
-		.dst_reg = 0,					\
-		.src_reg = 0,					\
-		.off   = 0,					\
-		.imm   = 0 })
-
-#endif
diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c
index 33a637507c00..60ec467c78ab 100644
--- a/samples/bpf/sock_example.c
+++ b/samples/bpf/sock_example.c
@@ -26,7 +26,8 @@
 #include <linux/if_ether.h>
 #include <linux/ip.h>
 #include <stddef.h>
-#include "libbpf.h"
+#include <bpf/bpf.h>
+#include "bpf_insn.h"
 #include "sock_example.h"
 
 char bpf_log_buf[BPF_LOG_BUF_SIZE];
diff --git a/samples/bpf/test_cgrp2_attach.c b/samples/bpf/test_cgrp2_attach.c
index 4bfcaf93fcf3..20fbd1241db3 100644
--- a/samples/bpf/test_cgrp2_attach.c
+++ b/samples/bpf/test_cgrp2_attach.c
@@ -28,8 +28,9 @@
 #include <fcntl.h>
 
 #include <linux/bpf.h>
+#include <bpf/bpf.h>
 
-#include "libbpf.h"
+#include "bpf_insn.h"
 
 enum {
 	MAP_KEY_PACKETS,
diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c
index 1af412ec6007..b453e6a161be 100644
--- a/samples/bpf/test_cgrp2_attach2.c
+++ b/samples/bpf/test_cgrp2_attach2.c
@@ -24,8 +24,9 @@
 #include <unistd.h>
 
 #include <linux/bpf.h>
+#include <bpf/bpf.h>
 
-#include "libbpf.h"
+#include "bpf_insn.h"
 #include "cgroup_helpers.h"
 
 #define FOO		"/foo"
diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c
index e79594dd629b..b0811da5a00f 100644
--- a/samples/bpf/test_cgrp2_sock.c
+++ b/samples/bpf/test_cgrp2_sock.c
@@ -21,8 +21,9 @@
 #include <net/if.h>
 #include <inttypes.h>
 #include <linux/bpf.h>
+#include <bpf/bpf.h>
 
-#include "libbpf.h"
+#include "bpf_insn.h"
 
 char bpf_log_buf[BPF_LOG_BUF_SIZE];
 
diff --git a/samples/bpf/test_cgrp2_sock2.c b/samples/bpf/test_cgrp2_sock2.c
index e53f1f6f0867..3b5be2364975 100644
--- a/samples/bpf/test_cgrp2_sock2.c
+++ b/samples/bpf/test_cgrp2_sock2.c
@@ -19,8 +19,9 @@
 #include <fcntl.h>
 #include <net/if.h>
 #include <linux/bpf.h>
+#include <bpf/bpf.h>
 
-#include "libbpf.h"
+#include "bpf_insn.h"
 #include "bpf_load.h"
 
 static int usage(const char *argv0)
-- 
cgit v1.2.3-59-g8ed1b


From 787360f8c2b87d4ae4858bb8736a19c289904885 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 14 May 2018 22:35:04 -0700
Subject: samples: bpf: fix build after move to compiling full libbpf.a
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are many ways users may compile samples, some of them got
broken by commit 5f9380572b4b ("samples: bpf: compile and link
against full libbpf").  Improve path resolution and make libbpf
building a dependency of source files to force its build.

Samples should now again build with any of:
 cd samples/bpf; make
 make samples/bpf/
 make -C samples/bpf
 cd samples/bpf; make O=builddir
 make samples/bpf/ O=builddir
 make -C samples/bpf O=builddir
 export KBUILD_OUTPUT=builddir
 make samples/bpf/
 make -C samples/bpf

Fixes: 5f9380572b4b ("samples: bpf: compile and link against full libbpf")
Reported-by: Björn Töpel <bjorn.topel@gmail.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/Makefile | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 9e255ca4059a..0dae77c88d2e 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -1,4 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0
+
+BPF_SAMPLES_PATH ?= $(abspath $(srctree)/$(src))
+TOOLS_PATH := $(BPF_SAMPLES_PATH)/../../tools
+
 # List of programs to build
 hostprogs-y := test_lru_dist
 hostprogs-y += sock_example
@@ -49,7 +53,8 @@ hostprogs-y += xdpsock
 hostprogs-y += xdp_fwd
 
 # Libbpf dependencies
-LIBBPF := ../../tools/lib/bpf/libbpf.a
+LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
+
 CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o
 TRACE_HELPERS := ../../tools/testing/selftests/bpf/trace_helpers.o
 
@@ -233,15 +238,16 @@ CLANG_ARCH_ARGS = -target $(ARCH)
 endif
 
 # Trick to allow make to be run from this directory
-all: $(LIBBPF)
-	$(MAKE) -C ../../ $(CURDIR)/
+all:
+	$(MAKE) -C ../../ $(CURDIR)/ BPF_SAMPLES_PATH=$(CURDIR)
 
 clean:
 	$(MAKE) -C ../../ M=$(CURDIR) clean
 	@rm -f *~
 
 $(LIBBPF): FORCE
-	$(MAKE) -C $(dir $@)
+# Fix up variables inherited from Kbuild that tools/ build system won't like
+	$(MAKE) -C $(dir $@) RM='rm -rf' LDFLAGS= srctree=$(BPF_SAMPLES_PATH)/../../ O=
 
 $(obj)/syscall_nrs.s:	$(src)/syscall_nrs.c
 	$(call if_changed_dep,cc_s_c)
@@ -272,7 +278,8 @@ verify_target_bpf: verify_cmds
 		exit 2; \
 	else true; fi
 
-$(src)/*.c: verify_target_bpf
+$(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF)
+$(src)/*.c: verify_target_bpf $(LIBBPF)
 
 $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h
 
-- 
cgit v1.2.3-59-g8ed1b


From 0cc54db1818ad38f400be9f24871f3b7bf09e911 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 14 May 2018 22:35:05 -0700
Subject: samples: bpf: move libbpf from object dependencies to libs

Make complains that it doesn't know how to make libbpf.a:

scripts/Makefile.host:106: target 'samples/bpf/../../tools/lib/bpf/libbpf.a' doesn't match the target pattern

Now that we have it as a dependency of the sources simply add libbpf.a
to libraries not objects.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/Makefile | 145 ++++++++++++++++++---------------------------------
 1 file changed, 51 insertions(+), 94 deletions(-)

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 0dae77c88d2e..0036a77c2d97 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -58,55 +58,53 @@ LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
 CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o
 TRACE_HELPERS := ../../tools/testing/selftests/bpf/trace_helpers.o
 
-test_lru_dist-objs := test_lru_dist.o $(LIBBPF)
-sock_example-objs := sock_example.o $(LIBBPF)
-fds_example-objs := bpf_load.o $(LIBBPF) fds_example.o
-sockex1-objs := bpf_load.o $(LIBBPF) sockex1_user.o
-sockex2-objs := bpf_load.o $(LIBBPF) sockex2_user.o
-sockex3-objs := bpf_load.o $(LIBBPF) sockex3_user.o
-tracex1-objs := bpf_load.o $(LIBBPF) tracex1_user.o
-tracex2-objs := bpf_load.o $(LIBBPF) tracex2_user.o
-tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o
-tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o
-tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o
-tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o
-tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
-load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
-test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
-trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o $(TRACE_HELPERS)
-lathist-objs := bpf_load.o $(LIBBPF) lathist_user.o
-offwaketime-objs := bpf_load.o $(LIBBPF) offwaketime_user.o $(TRACE_HELPERS)
-spintest-objs := bpf_load.o $(LIBBPF) spintest_user.o $(TRACE_HELPERS)
-map_perf_test-objs := bpf_load.o $(LIBBPF) map_perf_test_user.o
-test_overhead-objs := bpf_load.o $(LIBBPF) test_overhead_user.o
-test_cgrp2_array_pin-objs := test_cgrp2_array_pin.o $(LIBBPF)
-test_cgrp2_attach-objs := test_cgrp2_attach.o $(LIBBPF)
-test_cgrp2_attach2-objs := test_cgrp2_attach2.o $(LIBBPF) $(CGROUP_HELPERS)
-test_cgrp2_sock-objs := test_cgrp2_sock.o $(LIBBPF)
-test_cgrp2_sock2-objs := bpf_load.o $(LIBBPF) test_cgrp2_sock2.o
-xdp1-objs := xdp1_user.o $(LIBBPF)
+fds_example-objs := bpf_load.o fds_example.o
+sockex1-objs := bpf_load.o sockex1_user.o
+sockex2-objs := bpf_load.o sockex2_user.o
+sockex3-objs := bpf_load.o sockex3_user.o
+tracex1-objs := bpf_load.o tracex1_user.o
+tracex2-objs := bpf_load.o tracex2_user.o
+tracex3-objs := bpf_load.o tracex3_user.o
+tracex4-objs := bpf_load.o tracex4_user.o
+tracex5-objs := bpf_load.o tracex5_user.o
+tracex6-objs := bpf_load.o tracex6_user.o
+tracex7-objs := bpf_load.o tracex7_user.o
+load_sock_ops-objs := bpf_load.o load_sock_ops.o
+test_probe_write_user-objs := bpf_load.o test_probe_write_user_user.o
+trace_output-objs := bpf_load.o trace_output_user.o $(TRACE_HELPERS)
+lathist-objs := bpf_load.o lathist_user.o
+offwaketime-objs := bpf_load.o offwaketime_user.o $(TRACE_HELPERS)
+spintest-objs := bpf_load.o spintest_user.o $(TRACE_HELPERS)
+map_perf_test-objs := bpf_load.o map_perf_test_user.o
+test_overhead-objs := bpf_load.o test_overhead_user.o
+test_cgrp2_array_pin-objs := test_cgrp2_array_pin.o
+test_cgrp2_attach-objs := test_cgrp2_attach.o
+test_cgrp2_attach2-objs := test_cgrp2_attach2.o $(CGROUP_HELPERS)
+test_cgrp2_sock-objs := test_cgrp2_sock.o
+test_cgrp2_sock2-objs := bpf_load.o test_cgrp2_sock2.o
+xdp1-objs := xdp1_user.o
 # reuse xdp1 source intentionally
-xdp2-objs := xdp1_user.o $(LIBBPF)
-xdp_router_ipv4-objs := bpf_load.o $(LIBBPF) xdp_router_ipv4_user.o
-test_current_task_under_cgroup-objs := bpf_load.o $(LIBBPF) $(CGROUP_HELPERS) \
+xdp2-objs := xdp1_user.o
+xdp_router_ipv4-objs := bpf_load.o xdp_router_ipv4_user.o
+test_current_task_under_cgroup-objs := bpf_load.o $(CGROUP_HELPERS) \
 				       test_current_task_under_cgroup_user.o
-trace_event-objs := bpf_load.o $(LIBBPF) trace_event_user.o $(TRACE_HELPERS)
-sampleip-objs := bpf_load.o $(LIBBPF) sampleip_user.o $(TRACE_HELPERS)
-tc_l2_redirect-objs := bpf_load.o $(LIBBPF) tc_l2_redirect_user.o
-lwt_len_hist-objs := bpf_load.o $(LIBBPF) lwt_len_hist_user.o
-xdp_tx_iptunnel-objs := bpf_load.o $(LIBBPF) xdp_tx_iptunnel_user.o
-test_map_in_map-objs := bpf_load.o $(LIBBPF) test_map_in_map_user.o
-per_socket_stats_example-objs := cookie_uid_helper_example.o $(LIBBPF)
-xdp_redirect-objs := bpf_load.o $(LIBBPF) xdp_redirect_user.o
-xdp_redirect_map-objs := bpf_load.o $(LIBBPF) xdp_redirect_map_user.o
-xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) xdp_redirect_cpu_user.o
-xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o
-xdp_rxq_info-objs := xdp_rxq_info_user.o $(LIBBPF)
-syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
-cpustat-objs := bpf_load.o $(LIBBPF) cpustat_user.o
-xdp_adjust_tail-objs := xdp_adjust_tail_user.o $(LIBBPF)
-xdpsock-objs := bpf_load.o $(LIBBPF) xdpsock_user.o
-xdp_fwd-objs := bpf_load.o $(LIBBPF) xdp_fwd_user.o
+trace_event-objs := bpf_load.o trace_event_user.o $(TRACE_HELPERS)
+sampleip-objs := bpf_load.o sampleip_user.o $(TRACE_HELPERS)
+tc_l2_redirect-objs := bpf_load.o tc_l2_redirect_user.o
+lwt_len_hist-objs := bpf_load.o lwt_len_hist_user.o
+xdp_tx_iptunnel-objs := bpf_load.o xdp_tx_iptunnel_user.o
+test_map_in_map-objs := bpf_load.o test_map_in_map_user.o
+per_socket_stats_example-objs := cookie_uid_helper_example.o
+xdp_redirect-objs := bpf_load.o xdp_redirect_user.o
+xdp_redirect_map-objs := bpf_load.o xdp_redirect_map_user.o
+xdp_redirect_cpu-objs := bpf_load.o xdp_redirect_cpu_user.o
+xdp_monitor-objs := bpf_load.o xdp_monitor_user.o
+xdp_rxq_info-objs := xdp_rxq_info_user.o
+syscall_tp-objs := bpf_load.o syscall_tp_user.o
+cpustat-objs := bpf_load.o cpustat_user.o
+xdp_adjust_tail-objs := xdp_adjust_tail_user.o
+xdpsock-objs := bpf_load.o xdpsock_user.o
+xdp_fwd-objs := bpf_load.o xdp_fwd_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -178,53 +176,12 @@ HOSTCFLAGS_spintest_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_trace_event_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_sampleip_user.o += -I$(srctree)/tools/lib/bpf/
 
-HOSTLOADLIBES_test_lru_dist += -lelf
-HOSTLOADLIBES_sock_example += -lelf
-HOSTLOADLIBES_fds_example += -lelf
-HOSTLOADLIBES_sockex1 += -lelf
-HOSTLOADLIBES_sockex2 += -lelf
-HOSTLOADLIBES_sockex3 += -lelf
-HOSTLOADLIBES_tracex1 += -lelf
-HOSTLOADLIBES_tracex2 += -lelf
-HOSTLOADLIBES_tracex3 += -lelf
-HOSTLOADLIBES_tracex4 += -lelf -lrt
-HOSTLOADLIBES_tracex5 += -lelf
-HOSTLOADLIBES_tracex6 += -lelf
-HOSTLOADLIBES_tracex7 += -lelf
-HOSTLOADLIBES_test_cgrp2_array_pin += -lelf
-HOSTLOADLIBES_test_cgrp2_attach += -lelf
-HOSTLOADLIBES_test_cgrp2_attach2 += -lelf
-HOSTLOADLIBES_test_cgrp2_sock += -lelf
-HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
-HOSTLOADLIBES_load_sock_ops += -lelf
-HOSTLOADLIBES_test_probe_write_user += -lelf
-HOSTLOADLIBES_trace_output += -lelf -lrt
-HOSTLOADLIBES_lathist += -lelf
-HOSTLOADLIBES_offwaketime += -lelf
-HOSTLOADLIBES_spintest += -lelf
-HOSTLOADLIBES_map_perf_test += -lelf -lrt
-HOSTLOADLIBES_test_overhead += -lelf -lrt
-HOSTLOADLIBES_xdp1 += -lelf
-HOSTLOADLIBES_xdp2 += -lelf
-HOSTLOADLIBES_xdp_router_ipv4 += -lelf
-HOSTLOADLIBES_test_current_task_under_cgroup += -lelf
-HOSTLOADLIBES_trace_event += -lelf
-HOSTLOADLIBES_sampleip += -lelf
-HOSTLOADLIBES_tc_l2_redirect += -l elf
-HOSTLOADLIBES_lwt_len_hist += -l elf
-HOSTLOADLIBES_xdp_tx_iptunnel += -lelf
-HOSTLOADLIBES_test_map_in_map += -lelf
-HOSTLOADLIBES_per_socket_stats_example += -lelf
-HOSTLOADLIBES_xdp_redirect += -lelf
-HOSTLOADLIBES_xdp_redirect_map += -lelf
-HOSTLOADLIBES_xdp_redirect_cpu += -lelf
-HOSTLOADLIBES_xdp_monitor += -lelf
-HOSTLOADLIBES_xdp_rxq_info += -lelf
-HOSTLOADLIBES_syscall_tp += -lelf
-HOSTLOADLIBES_cpustat += -lelf
-HOSTLOADLIBES_xdp_adjust_tail += -lelf
-HOSTLOADLIBES_xdpsock += -lelf -pthread
-HOSTLOADLIBES_xdp_fwd += -lelf
+HOST_LOADLIBES		+= $(LIBBPF) -lelf
+HOSTLOADLIBES_tracex4		+= -lrt
+HOSTLOADLIBES_trace_output	+= -lrt
+HOSTLOADLIBES_map_perf_test	+= -lrt
+HOSTLOADLIBES_test_overhead	+= -lrt
+HOSTLOADLIBES_xdpsock		+= -pthread
 
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
 #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
-- 
cgit v1.2.3-59-g8ed1b


From 768759edb9a1bd1b3fc38313b6578e5c8b252aee Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 14 May 2018 22:35:06 -0700
Subject: samples: bpf: make the build less noisy

Building samples with clang ignores the $(Q) setting, always
printing full command to the output.  Make it less verbose.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 0036a77c2d97..62d1aa1a4cf3 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -244,7 +244,8 @@ $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h
 # But, there is no easy way to fix it, so just exclude it since it is
 # useless for BPF samples.
 $(obj)/%.o: $(src)/%.c
-	$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) -I$(obj) \
+	@echo "  CLANG-bpf " $@
+	$(Q)$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) -I$(obj) \
 		-I$(srctree)/tools/testing/selftests/bpf/ \
 		-D__KERNEL__ -Wno-unused-value -Wno-pointer-sign \
 		-D__TARGET_ARCH_$(ARCH) -Wno-compare-distinct-pointer-types \
-- 
cgit v1.2.3-59-g8ed1b


From 4712c1b203dd04d9b0f3139000bc4bfbb94b0c05 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Mon, 14 May 2018 15:42:12 +0200
Subject: bpf, doc: add basic README.rst file

A README.rst file in a directory have special meaning for sites like
github, which auto renders the contents.  Plus search engines like
Google also index these README.rst files.

Auto rendering allow us to use links, for (re)directing eBPF users to
other places where docs live.  The end-goal would be to direct users
towards https://www.kernel.org/doc/html/latest but we haven't written
the full docs yet, so we start out small and take this incrementally.

This directory itself contains some useful docs, which can be linked
to from the README.rst file (verified this works for github).

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/bpf/README.rst | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 Documentation/bpf/README.rst

diff --git a/Documentation/bpf/README.rst b/Documentation/bpf/README.rst
new file mode 100644
index 000000000000..329469c33db8
--- /dev/null
+++ b/Documentation/bpf/README.rst
@@ -0,0 +1,36 @@
+=================
+BPF documentation
+=================
+
+This directory contains documentation for the BPF (Berkeley Packet
+Filter) facility, with a focus on the extended BPF version (eBPF).
+
+This kernel side documentation is still work in progress.  The main
+textual documentation is (for historical reasons) described in
+`Documentation/networking/filter.txt`_, which describe both classical
+and extended BPF instruction-set.
+The Cilium project also maintains a `BPF and XDP Reference Guide`_
+that goes into great technical depth about the BPF Architecture.
+
+The primary info for the bpf syscall is available in the `man-pages`_
+for `bpf(2)`_.
+
+
+
+Frequently asked questions (FAQ)
+================================
+
+Two sets of Questions and Answers (Q&A) are maintained.
+
+* QA for common questions about BPF see: bpf_design_QA_
+
+* QA for developers interacting with BPF subsystem: bpf_devel_QA_
+
+
+.. Links:
+.. _bpf_design_QA: bpf_design_QA.txt
+.. _bpf_devel_QA:  bpf_devel_QA.txt
+.. _Documentation/networking/filter.txt: ../networking/filter.txt
+.. _man-pages: https://www.kernel.org/doc/man-pages/
+.. _bpf(2): http://man7.org/linux/man-pages/man2/bpf.2.html
+.. _BPF and XDP Reference Guide: http://cilium.readthedocs.io/en/latest/bpf/
-- 
cgit v1.2.3-59-g8ed1b


From 192092faa02dd5e5d1ff875d7512a5d803db95a0 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Mon, 14 May 2018 15:42:17 +0200
Subject: bpf, doc: rename txt files to rst files

This will cause them to get auto rendered, e.g. when viewing them on GitHub.
Followup patches will correct the content to be RST compliant.

Also adjust README.rst to point to the renamed files.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/bpf/README.rst        |   4 +-
 Documentation/bpf/bpf_design_QA.rst | 156 ++++++++++
 Documentation/bpf/bpf_design_QA.txt | 156 ----------
 Documentation/bpf/bpf_devel_QA.rst  | 570 ++++++++++++++++++++++++++++++++++++
 Documentation/bpf/bpf_devel_QA.txt  | 570 ------------------------------------
 5 files changed, 728 insertions(+), 728 deletions(-)
 create mode 100644 Documentation/bpf/bpf_design_QA.rst
 delete mode 100644 Documentation/bpf/bpf_design_QA.txt
 create mode 100644 Documentation/bpf/bpf_devel_QA.rst
 delete mode 100644 Documentation/bpf/bpf_devel_QA.txt

diff --git a/Documentation/bpf/README.rst b/Documentation/bpf/README.rst
index 329469c33db8..b9a80c9e9392 100644
--- a/Documentation/bpf/README.rst
+++ b/Documentation/bpf/README.rst
@@ -28,8 +28,8 @@ Two sets of Questions and Answers (Q&A) are maintained.
 
 
 .. Links:
-.. _bpf_design_QA: bpf_design_QA.txt
-.. _bpf_devel_QA:  bpf_devel_QA.txt
+.. _bpf_design_QA: bpf_design_QA.rst
+.. _bpf_devel_QA:  bpf_devel_QA.rst
 .. _Documentation/networking/filter.txt: ../networking/filter.txt
 .. _man-pages: https://www.kernel.org/doc/man-pages/
 .. _bpf(2): http://man7.org/linux/man-pages/man2/bpf.2.html
diff --git a/Documentation/bpf/bpf_design_QA.rst b/Documentation/bpf/bpf_design_QA.rst
new file mode 100644
index 000000000000..f3e458a0bb2f
--- /dev/null
+++ b/Documentation/bpf/bpf_design_QA.rst
@@ -0,0 +1,156 @@
+BPF extensibility and applicability to networking, tracing, security
+in the linux kernel and several user space implementations of BPF
+virtual machine led to a number of misunderstanding on what BPF actually is.
+This short QA is an attempt to address that and outline a direction
+of where BPF is heading long term.
+
+Q: Is BPF a generic instruction set similar to x64 and arm64?
+A: NO.
+
+Q: Is BPF a generic virtual machine ?
+A: NO.
+
+BPF is generic instruction set _with_ C calling convention.
+
+Q: Why C calling convention was chosen?
+A: Because BPF programs are designed to run in the linux kernel
+   which is written in C, hence BPF defines instruction set compatible
+   with two most used architectures x64 and arm64 (and takes into
+   consideration important quirks of other architectures) and
+   defines calling convention that is compatible with C calling
+   convention of the linux kernel on those architectures.
+
+Q: can multiple return values be supported in the future?
+A: NO. BPF allows only register R0 to be used as return value.
+
+Q: can more than 5 function arguments be supported in the future?
+A: NO. BPF calling convention only allows registers R1-R5 to be used
+   as arguments. BPF is not a standalone instruction set.
+   (unlike x64 ISA that allows msft, cdecl and other conventions)
+
+Q: can BPF programs access instruction pointer or return address?
+A: NO.
+
+Q: can BPF programs access stack pointer ?
+A: NO. Only frame pointer (register R10) is accessible.
+   From compiler point of view it's necessary to have stack pointer.
+   For example LLVM defines register R11 as stack pointer in its
+   BPF backend, but it makes sure that generated code never uses it.
+
+Q: Does C-calling convention diminishes possible use cases?
+A: YES. BPF design forces addition of major functionality in the form
+   of kernel helper functions and kernel objects like BPF maps with
+   seamless interoperability between them. It lets kernel call into
+   BPF programs and programs call kernel helpers with zero overhead.
+   As all of them were native C code. That is particularly the case
+   for JITed BPF programs that are indistinguishable from
+   native kernel C code.
+
+Q: Does it mean that 'innovative' extensions to BPF code are disallowed?
+A: Soft yes. At least for now until BPF core has support for
+   bpf-to-bpf calls, indirect calls, loops, global variables,
+   jump tables, read only sections and all other normal constructs
+   that C code can produce.
+
+Q: Can loops be supported in a safe way?
+A: It's not clear yet. BPF developers are trying to find a way to
+   support bounded loops where the verifier can guarantee that
+   the program terminates in less than 4096 instructions.
+
+Q: How come LD_ABS and LD_IND instruction are present in BPF whereas
+   C code cannot express them and has to use builtin intrinsics?
+A: This is artifact of compatibility with classic BPF. Modern
+   networking code in BPF performs better without them.
+   See 'direct packet access'.
+
+Q: It seems not all BPF instructions are one-to-one to native CPU.
+   For example why BPF_JNE and other compare and jumps are not cpu-like?
+A: This was necessary to avoid introducing flags into ISA which are
+   impossible to make generic and efficient across CPU architectures.
+
+Q: why BPF_DIV instruction doesn't map to x64 div?
+A: Because if we picked one-to-one relationship to x64 it would have made
+   it more complicated to support on arm64 and other archs. Also it
+   needs div-by-zero runtime check.
+
+Q: why there is no BPF_SDIV for signed divide operation?
+A: Because it would be rarely used. llvm errors in such case and
+   prints a suggestion to use unsigned divide instead
+
+Q: Why BPF has implicit prologue and epilogue?
+A: Because architectures like sparc have register windows and in general
+   there are enough subtle differences between architectures, so naive
+   store return address into stack won't work. Another reason is BPF has
+   to be safe from division by zero (and legacy exception path
+   of LD_ABS insn). Those instructions need to invoke epilogue and
+   return implicitly.
+
+Q: Why BPF_JLT and BPF_JLE instructions were not introduced in the beginning?
+A: Because classic BPF didn't have them and BPF authors felt that compiler
+   workaround would be acceptable. Turned out that programs lose performance
+   due to lack of these compare instructions and they were added.
+   These two instructions is a perfect example what kind of new BPF
+   instructions are acceptable and can be added in the future.
+   These two already had equivalent instructions in native CPUs.
+   New instructions that don't have one-to-one mapping to HW instructions
+   will not be accepted.
+
+Q: BPF 32-bit subregisters have a requirement to zero upper 32-bits of BPF
+   registers which makes BPF inefficient virtual machine for 32-bit
+   CPU architectures and 32-bit HW accelerators. Can true 32-bit registers
+   be added to BPF in the future?
+A: NO. The first thing to improve performance on 32-bit archs is to teach
+   LLVM to generate code that uses 32-bit subregisters. Then second step
+   is to teach verifier to mark operations where zero-ing upper bits
+   is unnecessary. Then JITs can take advantage of those markings and
+   drastically reduce size of generated code and improve performance.
+
+Q: Does BPF have a stable ABI?
+A: YES. BPF instructions, arguments to BPF programs, set of helper
+   functions and their arguments, recognized return codes are all part
+   of ABI. However when tracing programs are using bpf_probe_read() helper
+   to walk kernel internal datastructures and compile with kernel
+   internal headers these accesses can and will break with newer
+   kernels. The union bpf_attr -> kern_version is checked at load time
+   to prevent accidentally loading kprobe-based bpf programs written
+   for a different kernel. Networking programs don't do kern_version check.
+
+Q: How much stack space a BPF program uses?
+A: Currently all program types are limited to 512 bytes of stack
+   space, but the verifier computes the actual amount of stack used
+   and both interpreter and most JITed code consume necessary amount.
+
+Q: Can BPF be offloaded to HW?
+A: YES. BPF HW offload is supported by NFP driver.
+
+Q: Does classic BPF interpreter still exist?
+A: NO. Classic BPF programs are converted into extend BPF instructions.
+
+Q: Can BPF call arbitrary kernel functions?
+A: NO. BPF programs can only call a set of helper functions which
+   is defined for every program type.
+
+Q: Can BPF overwrite arbitrary kernel memory?
+A: NO. Tracing bpf programs can _read_ arbitrary memory with bpf_probe_read()
+   and bpf_probe_read_str() helpers. Networking programs cannot read
+   arbitrary memory, since they don't have access to these helpers.
+   Programs can never read or write arbitrary memory directly.
+
+Q: Can BPF overwrite arbitrary user memory?
+A: Sort-of. Tracing BPF programs can overwrite the user memory
+   of the current task with bpf_probe_write_user(). Every time such
+   program is loaded the kernel will print warning message, so
+   this helper is only useful for experiments and prototypes.
+   Tracing BPF programs are root only.
+
+Q: When bpf_trace_printk() helper is used the kernel prints nasty
+   warning message. Why is that?
+A: This is done to nudge program authors into better interfaces when
+   programs need to pass data to user space. Like bpf_perf_event_output()
+   can be used to efficiently stream data via perf ring buffer.
+   BPF maps can be used for asynchronous data sharing between kernel
+   and user space. bpf_trace_printk() should only be used for debugging.
+
+Q: Can BPF functionality such as new program or map types, new
+   helpers, etc be added out of kernel module code?
+A: NO.
diff --git a/Documentation/bpf/bpf_design_QA.txt b/Documentation/bpf/bpf_design_QA.txt
deleted file mode 100644
index f3e458a0bb2f..000000000000
--- a/Documentation/bpf/bpf_design_QA.txt
+++ /dev/null
@@ -1,156 +0,0 @@
-BPF extensibility and applicability to networking, tracing, security
-in the linux kernel and several user space implementations of BPF
-virtual machine led to a number of misunderstanding on what BPF actually is.
-This short QA is an attempt to address that and outline a direction
-of where BPF is heading long term.
-
-Q: Is BPF a generic instruction set similar to x64 and arm64?
-A: NO.
-
-Q: Is BPF a generic virtual machine ?
-A: NO.
-
-BPF is generic instruction set _with_ C calling convention.
-
-Q: Why C calling convention was chosen?
-A: Because BPF programs are designed to run in the linux kernel
-   which is written in C, hence BPF defines instruction set compatible
-   with two most used architectures x64 and arm64 (and takes into
-   consideration important quirks of other architectures) and
-   defines calling convention that is compatible with C calling
-   convention of the linux kernel on those architectures.
-
-Q: can multiple return values be supported in the future?
-A: NO. BPF allows only register R0 to be used as return value.
-
-Q: can more than 5 function arguments be supported in the future?
-A: NO. BPF calling convention only allows registers R1-R5 to be used
-   as arguments. BPF is not a standalone instruction set.
-   (unlike x64 ISA that allows msft, cdecl and other conventions)
-
-Q: can BPF programs access instruction pointer or return address?
-A: NO.
-
-Q: can BPF programs access stack pointer ?
-A: NO. Only frame pointer (register R10) is accessible.
-   From compiler point of view it's necessary to have stack pointer.
-   For example LLVM defines register R11 as stack pointer in its
-   BPF backend, but it makes sure that generated code never uses it.
-
-Q: Does C-calling convention diminishes possible use cases?
-A: YES. BPF design forces addition of major functionality in the form
-   of kernel helper functions and kernel objects like BPF maps with
-   seamless interoperability between them. It lets kernel call into
-   BPF programs and programs call kernel helpers with zero overhead.
-   As all of them were native C code. That is particularly the case
-   for JITed BPF programs that are indistinguishable from
-   native kernel C code.
-
-Q: Does it mean that 'innovative' extensions to BPF code are disallowed?
-A: Soft yes. At least for now until BPF core has support for
-   bpf-to-bpf calls, indirect calls, loops, global variables,
-   jump tables, read only sections and all other normal constructs
-   that C code can produce.
-
-Q: Can loops be supported in a safe way?
-A: It's not clear yet. BPF developers are trying to find a way to
-   support bounded loops where the verifier can guarantee that
-   the program terminates in less than 4096 instructions.
-
-Q: How come LD_ABS and LD_IND instruction are present in BPF whereas
-   C code cannot express them and has to use builtin intrinsics?
-A: This is artifact of compatibility with classic BPF. Modern
-   networking code in BPF performs better without them.
-   See 'direct packet access'.
-
-Q: It seems not all BPF instructions are one-to-one to native CPU.
-   For example why BPF_JNE and other compare and jumps are not cpu-like?
-A: This was necessary to avoid introducing flags into ISA which are
-   impossible to make generic and efficient across CPU architectures.
-
-Q: why BPF_DIV instruction doesn't map to x64 div?
-A: Because if we picked one-to-one relationship to x64 it would have made
-   it more complicated to support on arm64 and other archs. Also it
-   needs div-by-zero runtime check.
-
-Q: why there is no BPF_SDIV for signed divide operation?
-A: Because it would be rarely used. llvm errors in such case and
-   prints a suggestion to use unsigned divide instead
-
-Q: Why BPF has implicit prologue and epilogue?
-A: Because architectures like sparc have register windows and in general
-   there are enough subtle differences between architectures, so naive
-   store return address into stack won't work. Another reason is BPF has
-   to be safe from division by zero (and legacy exception path
-   of LD_ABS insn). Those instructions need to invoke epilogue and
-   return implicitly.
-
-Q: Why BPF_JLT and BPF_JLE instructions were not introduced in the beginning?
-A: Because classic BPF didn't have them and BPF authors felt that compiler
-   workaround would be acceptable. Turned out that programs lose performance
-   due to lack of these compare instructions and they were added.
-   These two instructions is a perfect example what kind of new BPF
-   instructions are acceptable and can be added in the future.
-   These two already had equivalent instructions in native CPUs.
-   New instructions that don't have one-to-one mapping to HW instructions
-   will not be accepted.
-
-Q: BPF 32-bit subregisters have a requirement to zero upper 32-bits of BPF
-   registers which makes BPF inefficient virtual machine for 32-bit
-   CPU architectures and 32-bit HW accelerators. Can true 32-bit registers
-   be added to BPF in the future?
-A: NO. The first thing to improve performance on 32-bit archs is to teach
-   LLVM to generate code that uses 32-bit subregisters. Then second step
-   is to teach verifier to mark operations where zero-ing upper bits
-   is unnecessary. Then JITs can take advantage of those markings and
-   drastically reduce size of generated code and improve performance.
-
-Q: Does BPF have a stable ABI?
-A: YES. BPF instructions, arguments to BPF programs, set of helper
-   functions and their arguments, recognized return codes are all part
-   of ABI. However when tracing programs are using bpf_probe_read() helper
-   to walk kernel internal datastructures and compile with kernel
-   internal headers these accesses can and will break with newer
-   kernels. The union bpf_attr -> kern_version is checked at load time
-   to prevent accidentally loading kprobe-based bpf programs written
-   for a different kernel. Networking programs don't do kern_version check.
-
-Q: How much stack space a BPF program uses?
-A: Currently all program types are limited to 512 bytes of stack
-   space, but the verifier computes the actual amount of stack used
-   and both interpreter and most JITed code consume necessary amount.
-
-Q: Can BPF be offloaded to HW?
-A: YES. BPF HW offload is supported by NFP driver.
-
-Q: Does classic BPF interpreter still exist?
-A: NO. Classic BPF programs are converted into extend BPF instructions.
-
-Q: Can BPF call arbitrary kernel functions?
-A: NO. BPF programs can only call a set of helper functions which
-   is defined for every program type.
-
-Q: Can BPF overwrite arbitrary kernel memory?
-A: NO. Tracing bpf programs can _read_ arbitrary memory with bpf_probe_read()
-   and bpf_probe_read_str() helpers. Networking programs cannot read
-   arbitrary memory, since they don't have access to these helpers.
-   Programs can never read or write arbitrary memory directly.
-
-Q: Can BPF overwrite arbitrary user memory?
-A: Sort-of. Tracing BPF programs can overwrite the user memory
-   of the current task with bpf_probe_write_user(). Every time such
-   program is loaded the kernel will print warning message, so
-   this helper is only useful for experiments and prototypes.
-   Tracing BPF programs are root only.
-
-Q: When bpf_trace_printk() helper is used the kernel prints nasty
-   warning message. Why is that?
-A: This is done to nudge program authors into better interfaces when
-   programs need to pass data to user space. Like bpf_perf_event_output()
-   can be used to efficiently stream data via perf ring buffer.
-   BPF maps can be used for asynchronous data sharing between kernel
-   and user space. bpf_trace_printk() should only be used for debugging.
-
-Q: Can BPF functionality such as new program or map types, new
-   helpers, etc be added out of kernel module code?
-A: NO.
diff --git a/Documentation/bpf/bpf_devel_QA.rst b/Documentation/bpf/bpf_devel_QA.rst
new file mode 100644
index 000000000000..da57601153a0
--- /dev/null
+++ b/Documentation/bpf/bpf_devel_QA.rst
@@ -0,0 +1,570 @@
+This document provides information for the BPF subsystem about various
+workflows related to reporting bugs, submitting patches, and queueing
+patches for stable kernels.
+
+For general information about submitting patches, please refer to
+Documentation/process/. This document only describes additional specifics
+related to BPF.
+
+Reporting bugs:
+---------------
+
+Q: How do I report bugs for BPF kernel code?
+
+A: Since all BPF kernel development as well as bpftool and iproute2 BPF
+   loader development happens through the netdev kernel mailing list,
+   please report any found issues around BPF to the following mailing
+   list:
+
+     netdev@vger.kernel.org
+
+   This may also include issues related to XDP, BPF tracing, etc.
+
+   Given netdev has a high volume of traffic, please also add the BPF
+   maintainers to Cc (from kernel MAINTAINERS file):
+
+     Alexei Starovoitov <ast@kernel.org>
+     Daniel Borkmann <daniel@iogearbox.net>
+
+   In case a buggy commit has already been identified, make sure to keep
+   the actual commit authors in Cc as well for the report. They can
+   typically be identified through the kernel's git tree.
+
+   Please do *not* report BPF issues to bugzilla.kernel.org since it
+   is a guarantee that the reported issue will be overlooked.
+
+Submitting patches:
+-------------------
+
+Q: To which mailing list do I need to submit my BPF patches?
+
+A: Please submit your BPF patches to the netdev kernel mailing list:
+
+     netdev@vger.kernel.org
+
+   Historically, BPF came out of networking and has always been maintained
+   by the kernel networking community. Although these days BPF touches
+   many other subsystems as well, the patches are still routed mainly
+   through the networking community.
+
+   In case your patch has changes in various different subsystems (e.g.
+   tracing, security, etc), make sure to Cc the related kernel mailing
+   lists and maintainers from there as well, so they are able to review
+   the changes and provide their Acked-by's to the patches.
+
+Q: Where can I find patches currently under discussion for BPF subsystem?
+
+A: All patches that are Cc'ed to netdev are queued for review under netdev
+   patchwork project:
+
+     http://patchwork.ozlabs.org/project/netdev/list/
+
+   Those patches which target BPF, are assigned to a 'bpf' delegate for
+   further processing from BPF maintainers. The current queue with
+   patches under review can be found at:
+
+     https://patchwork.ozlabs.org/project/netdev/list/?delegate=77147
+
+   Once the patches have been reviewed by the BPF community as a whole
+   and approved by the BPF maintainers, their status in patchwork will be
+   changed to 'Accepted' and the submitter will be notified by mail. This
+   means that the patches look good from a BPF perspective and have been
+   applied to one of the two BPF kernel trees.
+
+   In case feedback from the community requires a respin of the patches,
+   their status in patchwork will be set to 'Changes Requested', and purged
+   from the current review queue. Likewise for cases where patches would
+   get rejected or are not applicable to the BPF trees (but assigned to
+   the 'bpf' delegate).
+
+Q: How do the changes make their way into Linux?
+
+A: There are two BPF kernel trees (git repositories). Once patches have
+   been accepted by the BPF maintainers, they will be applied to one
+   of the two BPF trees:
+
+     https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git/
+     https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/
+
+   The bpf tree itself is for fixes only, whereas bpf-next for features,
+   cleanups or other kind of improvements ("next-like" content). This is
+   analogous to net and net-next trees for networking. Both bpf and
+   bpf-next will only have a master branch in order to simplify against
+   which branch patches should get rebased to.
+
+   Accumulated BPF patches in the bpf tree will regularly get pulled
+   into the net kernel tree. Likewise, accumulated BPF patches accepted
+   into the bpf-next tree will make their way into net-next tree. net and
+   net-next are both run by David S. Miller. From there, they will go
+   into the kernel mainline tree run by Linus Torvalds. To read up on the
+   process of net and net-next being merged into the mainline tree, see
+   the netdev FAQ under:
+
+     Documentation/networking/netdev-FAQ.txt
+
+   Occasionally, to prevent merge conflicts, we might send pull requests
+   to other trees (e.g. tracing) with a small subset of the patches, but
+   net and net-next are always the main trees targeted for integration.
+
+   The pull requests will contain a high-level summary of the accumulated
+   patches and can be searched on netdev kernel mailing list through the
+   following subject lines (yyyy-mm-dd is the date of the pull request):
+
+     pull-request: bpf yyyy-mm-dd
+     pull-request: bpf-next yyyy-mm-dd
+
+Q: How do I indicate which tree (bpf vs. bpf-next) my patch should be
+   applied to?
+
+A: The process is the very same as described in the netdev FAQ, so
+   please read up on it. The subject line must indicate whether the
+   patch is a fix or rather "next-like" content in order to let the
+   maintainers know whether it is targeted at bpf or bpf-next.
+
+   For fixes eventually landing in bpf -> net tree, the subject must
+   look like:
+
+     git format-patch --subject-prefix='PATCH bpf' start..finish
+
+   For features/improvements/etc that should eventually land in
+   bpf-next -> net-next, the subject must look like:
+
+     git format-patch --subject-prefix='PATCH bpf-next' start..finish
+
+   If unsure whether the patch or patch series should go into bpf
+   or net directly, or bpf-next or net-next directly, it is not a
+   problem either if the subject line says net or net-next as target.
+   It is eventually up to the maintainers to do the delegation of
+   the patches.
+
+   If it is clear that patches should go into bpf or bpf-next tree,
+   please make sure to rebase the patches against those trees in
+   order to reduce potential conflicts.
+
+   In case the patch or patch series has to be reworked and sent out
+   again in a second or later revision, it is also required to add a
+   version number (v2, v3, ...) into the subject prefix:
+
+     git format-patch --subject-prefix='PATCH net-next v2' start..finish
+
+   When changes have been requested to the patch series, always send the
+   whole patch series again with the feedback incorporated (never send
+   individual diffs on top of the old series).
+
+Q: What does it mean when a patch gets applied to bpf or bpf-next tree?
+
+A: It means that the patch looks good for mainline inclusion from
+   a BPF point of view.
+
+   Be aware that this is not a final verdict that the patch will
+   automatically get accepted into net or net-next trees eventually:
+
+   On the netdev kernel mailing list reviews can come in at any point
+   in time. If discussions around a patch conclude that they cannot
+   get included as-is, we will either apply a follow-up fix or drop
+   them from the trees entirely. Therefore, we also reserve to rebase
+   the trees when deemed necessary. After all, the purpose of the tree
+   is to i) accumulate and stage BPF patches for integration into trees
+   like net and net-next, and ii) run extensive BPF test suite and
+   workloads on the patches before they make their way any further.
+
+   Once the BPF pull request was accepted by David S. Miller, then
+   the patches end up in net or net-next tree, respectively, and
+   make their way from there further into mainline. Again, see the
+   netdev FAQ for additional information e.g. on how often they are
+   merged to mainline.
+
+Q: How long do I need to wait for feedback on my BPF patches?
+
+A: We try to keep the latency low. The usual time to feedback will
+   be around 2 or 3 business days. It may vary depending on the
+   complexity of changes and current patch load.
+
+Q: How often do you send pull requests to major kernel trees like
+   net or net-next?
+
+A: Pull requests will be sent out rather often in order to not
+   accumulate too many patches in bpf or bpf-next.
+
+   As a rule of thumb, expect pull requests for each tree regularly
+   at the end of the week. In some cases pull requests could additionally
+   come also in the middle of the week depending on the current patch
+   load or urgency.
+
+Q: Are patches applied to bpf-next when the merge window is open?
+
+A: For the time when the merge window is open, bpf-next will not be
+   processed. This is roughly analogous to net-next patch processing,
+   so feel free to read up on the netdev FAQ about further details.
+
+   During those two weeks of merge window, we might ask you to resend
+   your patch series once bpf-next is open again. Once Linus released
+   a v*-rc1 after the merge window, we continue processing of bpf-next.
+
+   For non-subscribers to kernel mailing lists, there is also a status
+   page run by David S. Miller on net-next that provides guidance:
+
+     http://vger.kernel.org/~davem/net-next.html
+
+Q: I made a BPF verifier change, do I need to add test cases for
+   BPF kernel selftests?
+
+A: If the patch has changes to the behavior of the verifier, then yes,
+   it is absolutely necessary to add test cases to the BPF kernel
+   selftests suite. If they are not present and we think they are
+   needed, then we might ask for them before accepting any changes.
+
+   In particular, test_verifier.c is tracking a high number of BPF test
+   cases, including a lot of corner cases that LLVM BPF back end may
+   generate out of the restricted C code. Thus, adding test cases is
+   absolutely crucial to make sure future changes do not accidentally
+   affect prior use-cases. Thus, treat those test cases as: verifier
+   behavior that is not tracked in test_verifier.c could potentially
+   be subject to change.
+
+Q: When should I add code to samples/bpf/ and when to BPF kernel
+   selftests?
+
+A: In general, we prefer additions to BPF kernel selftests rather than
+   samples/bpf/. The rationale is very simple: kernel selftests are
+   regularly run by various bots to test for kernel regressions.
+
+   The more test cases we add to BPF selftests, the better the coverage
+   and the less likely it is that those could accidentally break. It is
+   not that BPF kernel selftests cannot demo how a specific feature can
+   be used.
+
+   That said, samples/bpf/ may be a good place for people to get started,
+   so it might be advisable that simple demos of features could go into
+   samples/bpf/, but advanced functional and corner-case testing rather
+   into kernel selftests.
+
+   If your sample looks like a test case, then go for BPF kernel selftests
+   instead!
+
+Q: When should I add code to the bpftool?
+
+A: The main purpose of bpftool (under tools/bpf/bpftool/) is to provide
+   a central user space tool for debugging and introspection of BPF programs
+   and maps that are active in the kernel. If UAPI changes related to BPF
+   enable for dumping additional information of programs or maps, then
+   bpftool should be extended as well to support dumping them.
+
+Q: When should I add code to iproute2's BPF loader?
+
+A: For UAPI changes related to the XDP or tc layer (e.g. cls_bpf), the
+   convention is that those control-path related changes are added to
+   iproute2's BPF loader as well from user space side. This is not only
+   useful to have UAPI changes properly designed to be usable, but also
+   to make those changes available to a wider user base of major
+   downstream distributions.
+
+Q: Do you accept patches as well for iproute2's BPF loader?
+
+A: Patches for the iproute2's BPF loader have to be sent to:
+
+     netdev@vger.kernel.org
+
+   While those patches are not processed by the BPF kernel maintainers,
+   please keep them in Cc as well, so they can be reviewed.
+
+   The official git repository for iproute2 is run by Stephen Hemminger
+   and can be found at:
+
+     https://git.kernel.org/pub/scm/linux/kernel/git/shemminger/iproute2.git/
+
+   The patches need to have a subject prefix of '[PATCH iproute2 master]'
+   or '[PATCH iproute2 net-next]'. 'master' or 'net-next' describes the
+   target branch where the patch should be applied to. Meaning, if kernel
+   changes went into the net-next kernel tree, then the related iproute2
+   changes need to go into the iproute2 net-next branch, otherwise they
+   can be targeted at master branch. The iproute2 net-next branch will get
+   merged into the master branch after the current iproute2 version from
+   master has been released.
+
+   Like BPF, the patches end up in patchwork under the netdev project and
+   are delegated to 'shemminger' for further processing:
+
+     http://patchwork.ozlabs.org/project/netdev/list/?delegate=389
+
+Q: What is the minimum requirement before I submit my BPF patches?
+
+A: When submitting patches, always take the time and properly test your
+   patches *prior* to submission. Never rush them! If maintainers find
+   that your patches have not been properly tested, it is a good way to
+   get them grumpy. Testing patch submissions is a hard requirement!
+
+   Note, fixes that go to bpf tree *must* have a Fixes: tag included. The
+   same applies to fixes that target bpf-next, where the affected commit
+   is in net-next (or in some cases bpf-next). The Fixes: tag is crucial
+   in order to identify follow-up commits and tremendously helps for people
+   having to do backporting, so it is a must have!
+
+   We also don't accept patches with an empty commit message. Take your
+   time and properly write up a high quality commit message, it is
+   essential!
+
+   Think about it this way: other developers looking at your code a month
+   from now need to understand *why* a certain change has been done that
+   way, and whether there have been flaws in the analysis or assumptions
+   that the original author did. Thus providing a proper rationale and
+   describing the use-case for the changes is a must.
+
+   Patch submissions with >1 patch must have a cover letter which includes
+   a high level description of the series. This high level summary will
+   then be placed into the merge commit by the BPF maintainers such that
+   it is also accessible from the git log for future reference.
+
+Q: What do I need to consider when adding a new instruction or feature
+   that would require BPF JIT and/or LLVM integration as well?
+
+A: We try hard to keep all BPF JITs up to date such that the same user
+   experience can be guaranteed when running BPF programs on different
+   architectures without having the program punt to the less efficient
+   interpreter in case the in-kernel BPF JIT is enabled.
+
+   If you are unable to implement or test the required JIT changes for
+   certain architectures, please work together with the related BPF JIT
+   developers in order to get the feature implemented in a timely manner.
+   Please refer to the git log (arch/*/net/) to locate the necessary
+   people for helping out.
+
+   Also always make sure to add BPF test cases (e.g. test_bpf.c and
+   test_verifier.c) for new instructions, so that they can receive
+   broad test coverage and help run-time testing the various BPF JITs.
+
+   In case of new BPF instructions, once the changes have been accepted
+   into the Linux kernel, please implement support into LLVM's BPF back
+   end. See LLVM section below for further information.
+
+Stable submission:
+------------------
+
+Q: I need a specific BPF commit in stable kernels. What should I do?
+
+A: In case you need a specific fix in stable kernels, first check whether
+   the commit has already been applied in the related linux-*.y branches:
+
+     https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/
+
+   If not the case, then drop an email to the BPF maintainers with the
+   netdev kernel mailing list in Cc and ask for the fix to be queued up:
+
+     netdev@vger.kernel.org
+
+   The process in general is the same as on netdev itself, see also the
+   netdev FAQ document.
+
+Q: Do you also backport to kernels not currently maintained as stable?
+
+A: No. If you need a specific BPF commit in kernels that are currently not
+   maintained by the stable maintainers, then you are on your own.
+
+   The current stable and longterm stable kernels are all listed here:
+
+     https://www.kernel.org/
+
+Q: The BPF patch I am about to submit needs to go to stable as well. What
+   should I do?
+
+A: The same rules apply as with netdev patch submissions in general, see
+   netdev FAQ under:
+
+     Documentation/networking/netdev-FAQ.txt
+
+   Never add "Cc: stable@vger.kernel.org" to the patch description, but
+   ask the BPF maintainers to queue the patches instead. This can be done
+   with a note, for example, under the "---" part of the patch which does
+   not go into the git log. Alternatively, this can be done as a simple
+   request by mail instead.
+
+Q: Where do I find currently queued BPF patches that will be submitted
+   to stable?
+
+A: Once patches that fix critical bugs got applied into the bpf tree, they
+   are queued up for stable submission under:
+
+     http://patchwork.ozlabs.org/bundle/bpf/stable/?state=*
+
+   They will be on hold there at minimum until the related commit made its
+   way into the mainline kernel tree.
+
+   After having been under broader exposure, the queued patches will be
+   submitted by the BPF maintainers to the stable maintainers.
+
+Testing patches:
+----------------
+
+Q: Which BPF kernel selftests version should I run my kernel against?
+
+A: If you run a kernel xyz, then always run the BPF kernel selftests from
+   that kernel xyz as well. Do not expect that the BPF selftest from the
+   latest mainline tree will pass all the time.
+
+   In particular, test_bpf.c and test_verifier.c have a large number of
+   test cases and are constantly updated with new BPF test sequences, or
+   existing ones are adapted to verifier changes e.g. due to verifier
+   becoming smarter and being able to better track certain things.
+
+LLVM:
+-----
+
+Q: Where do I find LLVM with BPF support?
+
+A: The BPF back end for LLVM is upstream in LLVM since version 3.7.1.
+
+   All major distributions these days ship LLVM with BPF back end enabled,
+   so for the majority of use-cases it is not required to compile LLVM by
+   hand anymore, just install the distribution provided package.
+
+   LLVM's static compiler lists the supported targets through 'llc --version',
+   make sure BPF targets are listed. Example:
+
+     $ llc --version
+     LLVM (http://llvm.org/):
+       LLVM version 6.0.0svn
+       Optimized build.
+       Default target: x86_64-unknown-linux-gnu
+       Host CPU: skylake
+
+       Registered Targets:
+         bpf    - BPF (host endian)
+         bpfeb  - BPF (big endian)
+         bpfel  - BPF (little endian)
+         x86    - 32-bit X86: Pentium-Pro and above
+         x86-64 - 64-bit X86: EM64T and AMD64
+
+   For developers in order to utilize the latest features added to LLVM's
+   BPF back end, it is advisable to run the latest LLVM releases. Support
+   for new BPF kernel features such as additions to the BPF instruction
+   set are often developed together.
+
+   All LLVM releases can be found at: http://releases.llvm.org/
+
+Q: Got it, so how do I build LLVM manually anyway?
+
+A: You need cmake and gcc-c++ as build requisites for LLVM. Once you have
+   that set up, proceed with building the latest LLVM and clang version
+   from the git repositories:
+
+     $ git clone http://llvm.org/git/llvm.git
+     $ cd llvm/tools
+     $ git clone --depth 1 http://llvm.org/git/clang.git
+     $ cd ..; mkdir build; cd build
+     $ cmake .. -DLLVM_TARGETS_TO_BUILD="BPF;X86" \
+                -DBUILD_SHARED_LIBS=OFF           \
+                -DCMAKE_BUILD_TYPE=Release        \
+                -DLLVM_BUILD_RUNTIME=OFF
+     $ make -j $(getconf _NPROCESSORS_ONLN)
+
+   The built binaries can then be found in the build/bin/ directory, where
+   you can point the PATH variable to.
+
+Q: Should I notify BPF kernel maintainers about issues in LLVM's BPF code
+   generation back end or about LLVM generated code that the verifier
+   refuses to accept?
+
+A: Yes, please do! LLVM's BPF back end is a key piece of the whole BPF
+   infrastructure and it ties deeply into verification of programs from the
+   kernel side. Therefore, any issues on either side need to be investigated
+   and fixed whenever necessary.
+
+   Therefore, please make sure to bring them up at netdev kernel mailing
+   list and Cc BPF maintainers for LLVM and kernel bits:
+
+     Yonghong Song <yhs@fb.com>
+     Alexei Starovoitov <ast@kernel.org>
+     Daniel Borkmann <daniel@iogearbox.net>
+
+   LLVM also has an issue tracker where BPF related bugs can be found:
+
+     https://bugs.llvm.org/buglist.cgi?quicksearch=bpf
+
+   However, it is better to reach out through mailing lists with having
+   maintainers in Cc.
+
+Q: I have added a new BPF instruction to the kernel, how can I integrate
+   it into LLVM?
+
+A: LLVM has a -mcpu selector for the BPF back end in order to allow the
+   selection of BPF instruction set extensions. By default the 'generic'
+   processor target is used, which is the base instruction set (v1) of BPF.
+
+   LLVM has an option to select -mcpu=probe where it will probe the host
+   kernel for supported BPF instruction set extensions and selects the
+   optimal set automatically.
+
+   For cross-compilation, a specific version can be select manually as well.
+
+     $ llc -march bpf -mcpu=help
+     Available CPUs for this target:
+
+       generic - Select the generic processor.
+       probe   - Select the probe processor.
+       v1      - Select the v1 processor.
+       v2      - Select the v2 processor.
+     [...]
+
+   Newly added BPF instructions to the Linux kernel need to follow the same
+   scheme, bump the instruction set version and implement probing for the
+   extensions such that -mcpu=probe users can benefit from the optimization
+   transparently when upgrading their kernels.
+
+   If you are unable to implement support for the newly added BPF instruction
+   please reach out to BPF developers for help.
+
+   By the way, the BPF kernel selftests run with -mcpu=probe for better
+   test coverage.
+
+Q: In some cases clang flag "-target bpf" is used but in other cases the
+   default clang target, which matches the underlying architecture, is used.
+   What is the difference and when I should use which?
+
+A: Although LLVM IR generation and optimization try to stay architecture
+   independent, "-target <arch>" still has some impact on generated code:
+
+     - BPF program may recursively include header file(s) with file scope
+       inline assembly codes. The default target can handle this well,
+       while bpf target may fail if bpf backend assembler does not
+       understand these assembly codes, which is true in most cases.
+
+     - When compiled without -g, additional elf sections, e.g.,
+       .eh_frame and .rela.eh_frame, may be present in the object file
+       with default target, but not with bpf target.
+
+     - The default target may turn a C switch statement into a switch table
+       lookup and jump operation. Since the switch table is placed
+       in the global readonly section, the bpf program will fail to load.
+       The bpf target does not support switch table optimization.
+       The clang option "-fno-jump-tables" can be used to disable
+       switch table generation.
+
+     - For clang -target bpf, it is guaranteed that pointer or long /
+       unsigned long types will always have a width of 64 bit, no matter
+       whether underlying clang binary or default target (or kernel) is
+       32 bit. However, when native clang target is used, then it will
+       compile these types based on the underlying architecture's conventions,
+       meaning in case of 32 bit architecture, pointer or long / unsigned
+       long types e.g. in BPF context structure will have width of 32 bit
+       while the BPF LLVM back end still operates in 64 bit. The native
+       target is mostly needed in tracing for the case of walking pt_regs
+       or other kernel structures where CPU's register width matters.
+       Otherwise, clang -target bpf is generally recommended.
+
+   You should use default target when:
+
+     - Your program includes a header file, e.g., ptrace.h, which eventually
+       pulls in some header files containing file scope host assembly codes.
+     - You can add "-fno-jump-tables" to work around the switch table issue.
+
+   Otherwise, you can use bpf target. Additionally, you _must_ use bpf target
+   when:
+
+     - Your program uses data structures with pointer or long / unsigned long
+       types that interface with BPF helpers or context data structures. Access
+       into these structures is verified by the BPF verifier and may result
+       in verification failures if the native architecture is not aligned with
+       the BPF architecture, e.g. 64-bit. An example of this is
+       BPF_PROG_TYPE_SK_MSG require '-target bpf'
+
+Happy BPF hacking!
diff --git a/Documentation/bpf/bpf_devel_QA.txt b/Documentation/bpf/bpf_devel_QA.txt
deleted file mode 100644
index da57601153a0..000000000000
--- a/Documentation/bpf/bpf_devel_QA.txt
+++ /dev/null
@@ -1,570 +0,0 @@
-This document provides information for the BPF subsystem about various
-workflows related to reporting bugs, submitting patches, and queueing
-patches for stable kernels.
-
-For general information about submitting patches, please refer to
-Documentation/process/. This document only describes additional specifics
-related to BPF.
-
-Reporting bugs:
----------------
-
-Q: How do I report bugs for BPF kernel code?
-
-A: Since all BPF kernel development as well as bpftool and iproute2 BPF
-   loader development happens through the netdev kernel mailing list,
-   please report any found issues around BPF to the following mailing
-   list:
-
-     netdev@vger.kernel.org
-
-   This may also include issues related to XDP, BPF tracing, etc.
-
-   Given netdev has a high volume of traffic, please also add the BPF
-   maintainers to Cc (from kernel MAINTAINERS file):
-
-     Alexei Starovoitov <ast@kernel.org>
-     Daniel Borkmann <daniel@iogearbox.net>
-
-   In case a buggy commit has already been identified, make sure to keep
-   the actual commit authors in Cc as well for the report. They can
-   typically be identified through the kernel's git tree.
-
-   Please do *not* report BPF issues to bugzilla.kernel.org since it
-   is a guarantee that the reported issue will be overlooked.
-
-Submitting patches:
--------------------
-
-Q: To which mailing list do I need to submit my BPF patches?
-
-A: Please submit your BPF patches to the netdev kernel mailing list:
-
-     netdev@vger.kernel.org
-
-   Historically, BPF came out of networking and has always been maintained
-   by the kernel networking community. Although these days BPF touches
-   many other subsystems as well, the patches are still routed mainly
-   through the networking community.
-
-   In case your patch has changes in various different subsystems (e.g.
-   tracing, security, etc), make sure to Cc the related kernel mailing
-   lists and maintainers from there as well, so they are able to review
-   the changes and provide their Acked-by's to the patches.
-
-Q: Where can I find patches currently under discussion for BPF subsystem?
-
-A: All patches that are Cc'ed to netdev are queued for review under netdev
-   patchwork project:
-
-     http://patchwork.ozlabs.org/project/netdev/list/
-
-   Those patches which target BPF, are assigned to a 'bpf' delegate for
-   further processing from BPF maintainers. The current queue with
-   patches under review can be found at:
-
-     https://patchwork.ozlabs.org/project/netdev/list/?delegate=77147
-
-   Once the patches have been reviewed by the BPF community as a whole
-   and approved by the BPF maintainers, their status in patchwork will be
-   changed to 'Accepted' and the submitter will be notified by mail. This
-   means that the patches look good from a BPF perspective and have been
-   applied to one of the two BPF kernel trees.
-
-   In case feedback from the community requires a respin of the patches,
-   their status in patchwork will be set to 'Changes Requested', and purged
-   from the current review queue. Likewise for cases where patches would
-   get rejected or are not applicable to the BPF trees (but assigned to
-   the 'bpf' delegate).
-
-Q: How do the changes make their way into Linux?
-
-A: There are two BPF kernel trees (git repositories). Once patches have
-   been accepted by the BPF maintainers, they will be applied to one
-   of the two BPF trees:
-
-     https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git/
-     https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/
-
-   The bpf tree itself is for fixes only, whereas bpf-next for features,
-   cleanups or other kind of improvements ("next-like" content). This is
-   analogous to net and net-next trees for networking. Both bpf and
-   bpf-next will only have a master branch in order to simplify against
-   which branch patches should get rebased to.
-
-   Accumulated BPF patches in the bpf tree will regularly get pulled
-   into the net kernel tree. Likewise, accumulated BPF patches accepted
-   into the bpf-next tree will make their way into net-next tree. net and
-   net-next are both run by David S. Miller. From there, they will go
-   into the kernel mainline tree run by Linus Torvalds. To read up on the
-   process of net and net-next being merged into the mainline tree, see
-   the netdev FAQ under:
-
-     Documentation/networking/netdev-FAQ.txt
-
-   Occasionally, to prevent merge conflicts, we might send pull requests
-   to other trees (e.g. tracing) with a small subset of the patches, but
-   net and net-next are always the main trees targeted for integration.
-
-   The pull requests will contain a high-level summary of the accumulated
-   patches and can be searched on netdev kernel mailing list through the
-   following subject lines (yyyy-mm-dd is the date of the pull request):
-
-     pull-request: bpf yyyy-mm-dd
-     pull-request: bpf-next yyyy-mm-dd
-
-Q: How do I indicate which tree (bpf vs. bpf-next) my patch should be
-   applied to?
-
-A: The process is the very same as described in the netdev FAQ, so
-   please read up on it. The subject line must indicate whether the
-   patch is a fix or rather "next-like" content in order to let the
-   maintainers know whether it is targeted at bpf or bpf-next.
-
-   For fixes eventually landing in bpf -> net tree, the subject must
-   look like:
-
-     git format-patch --subject-prefix='PATCH bpf' start..finish
-
-   For features/improvements/etc that should eventually land in
-   bpf-next -> net-next, the subject must look like:
-
-     git format-patch --subject-prefix='PATCH bpf-next' start..finish
-
-   If unsure whether the patch or patch series should go into bpf
-   or net directly, or bpf-next or net-next directly, it is not a
-   problem either if the subject line says net or net-next as target.
-   It is eventually up to the maintainers to do the delegation of
-   the patches.
-
-   If it is clear that patches should go into bpf or bpf-next tree,
-   please make sure to rebase the patches against those trees in
-   order to reduce potential conflicts.
-
-   In case the patch or patch series has to be reworked and sent out
-   again in a second or later revision, it is also required to add a
-   version number (v2, v3, ...) into the subject prefix:
-
-     git format-patch --subject-prefix='PATCH net-next v2' start..finish
-
-   When changes have been requested to the patch series, always send the
-   whole patch series again with the feedback incorporated (never send
-   individual diffs on top of the old series).
-
-Q: What does it mean when a patch gets applied to bpf or bpf-next tree?
-
-A: It means that the patch looks good for mainline inclusion from
-   a BPF point of view.
-
-   Be aware that this is not a final verdict that the patch will
-   automatically get accepted into net or net-next trees eventually:
-
-   On the netdev kernel mailing list reviews can come in at any point
-   in time. If discussions around a patch conclude that they cannot
-   get included as-is, we will either apply a follow-up fix or drop
-   them from the trees entirely. Therefore, we also reserve to rebase
-   the trees when deemed necessary. After all, the purpose of the tree
-   is to i) accumulate and stage BPF patches for integration into trees
-   like net and net-next, and ii) run extensive BPF test suite and
-   workloads on the patches before they make their way any further.
-
-   Once the BPF pull request was accepted by David S. Miller, then
-   the patches end up in net or net-next tree, respectively, and
-   make their way from there further into mainline. Again, see the
-   netdev FAQ for additional information e.g. on how often they are
-   merged to mainline.
-
-Q: How long do I need to wait for feedback on my BPF patches?
-
-A: We try to keep the latency low. The usual time to feedback will
-   be around 2 or 3 business days. It may vary depending on the
-   complexity of changes and current patch load.
-
-Q: How often do you send pull requests to major kernel trees like
-   net or net-next?
-
-A: Pull requests will be sent out rather often in order to not
-   accumulate too many patches in bpf or bpf-next.
-
-   As a rule of thumb, expect pull requests for each tree regularly
-   at the end of the week. In some cases pull requests could additionally
-   come also in the middle of the week depending on the current patch
-   load or urgency.
-
-Q: Are patches applied to bpf-next when the merge window is open?
-
-A: For the time when the merge window is open, bpf-next will not be
-   processed. This is roughly analogous to net-next patch processing,
-   so feel free to read up on the netdev FAQ about further details.
-
-   During those two weeks of merge window, we might ask you to resend
-   your patch series once bpf-next is open again. Once Linus released
-   a v*-rc1 after the merge window, we continue processing of bpf-next.
-
-   For non-subscribers to kernel mailing lists, there is also a status
-   page run by David S. Miller on net-next that provides guidance:
-
-     http://vger.kernel.org/~davem/net-next.html
-
-Q: I made a BPF verifier change, do I need to add test cases for
-   BPF kernel selftests?
-
-A: If the patch has changes to the behavior of the verifier, then yes,
-   it is absolutely necessary to add test cases to the BPF kernel
-   selftests suite. If they are not present and we think they are
-   needed, then we might ask for them before accepting any changes.
-
-   In particular, test_verifier.c is tracking a high number of BPF test
-   cases, including a lot of corner cases that LLVM BPF back end may
-   generate out of the restricted C code. Thus, adding test cases is
-   absolutely crucial to make sure future changes do not accidentally
-   affect prior use-cases. Thus, treat those test cases as: verifier
-   behavior that is not tracked in test_verifier.c could potentially
-   be subject to change.
-
-Q: When should I add code to samples/bpf/ and when to BPF kernel
-   selftests?
-
-A: In general, we prefer additions to BPF kernel selftests rather than
-   samples/bpf/. The rationale is very simple: kernel selftests are
-   regularly run by various bots to test for kernel regressions.
-
-   The more test cases we add to BPF selftests, the better the coverage
-   and the less likely it is that those could accidentally break. It is
-   not that BPF kernel selftests cannot demo how a specific feature can
-   be used.
-
-   That said, samples/bpf/ may be a good place for people to get started,
-   so it might be advisable that simple demos of features could go into
-   samples/bpf/, but advanced functional and corner-case testing rather
-   into kernel selftests.
-
-   If your sample looks like a test case, then go for BPF kernel selftests
-   instead!
-
-Q: When should I add code to the bpftool?
-
-A: The main purpose of bpftool (under tools/bpf/bpftool/) is to provide
-   a central user space tool for debugging and introspection of BPF programs
-   and maps that are active in the kernel. If UAPI changes related to BPF
-   enable for dumping additional information of programs or maps, then
-   bpftool should be extended as well to support dumping them.
-
-Q: When should I add code to iproute2's BPF loader?
-
-A: For UAPI changes related to the XDP or tc layer (e.g. cls_bpf), the
-   convention is that those control-path related changes are added to
-   iproute2's BPF loader as well from user space side. This is not only
-   useful to have UAPI changes properly designed to be usable, but also
-   to make those changes available to a wider user base of major
-   downstream distributions.
-
-Q: Do you accept patches as well for iproute2's BPF loader?
-
-A: Patches for the iproute2's BPF loader have to be sent to:
-
-     netdev@vger.kernel.org
-
-   While those patches are not processed by the BPF kernel maintainers,
-   please keep them in Cc as well, so they can be reviewed.
-
-   The official git repository for iproute2 is run by Stephen Hemminger
-   and can be found at:
-
-     https://git.kernel.org/pub/scm/linux/kernel/git/shemminger/iproute2.git/
-
-   The patches need to have a subject prefix of '[PATCH iproute2 master]'
-   or '[PATCH iproute2 net-next]'. 'master' or 'net-next' describes the
-   target branch where the patch should be applied to. Meaning, if kernel
-   changes went into the net-next kernel tree, then the related iproute2
-   changes need to go into the iproute2 net-next branch, otherwise they
-   can be targeted at master branch. The iproute2 net-next branch will get
-   merged into the master branch after the current iproute2 version from
-   master has been released.
-
-   Like BPF, the patches end up in patchwork under the netdev project and
-   are delegated to 'shemminger' for further processing:
-
-     http://patchwork.ozlabs.org/project/netdev/list/?delegate=389
-
-Q: What is the minimum requirement before I submit my BPF patches?
-
-A: When submitting patches, always take the time and properly test your
-   patches *prior* to submission. Never rush them! If maintainers find
-   that your patches have not been properly tested, it is a good way to
-   get them grumpy. Testing patch submissions is a hard requirement!
-
-   Note, fixes that go to bpf tree *must* have a Fixes: tag included. The
-   same applies to fixes that target bpf-next, where the affected commit
-   is in net-next (or in some cases bpf-next). The Fixes: tag is crucial
-   in order to identify follow-up commits and tremendously helps for people
-   having to do backporting, so it is a must have!
-
-   We also don't accept patches with an empty commit message. Take your
-   time and properly write up a high quality commit message, it is
-   essential!
-
-   Think about it this way: other developers looking at your code a month
-   from now need to understand *why* a certain change has been done that
-   way, and whether there have been flaws in the analysis or assumptions
-   that the original author did. Thus providing a proper rationale and
-   describing the use-case for the changes is a must.
-
-   Patch submissions with >1 patch must have a cover letter which includes
-   a high level description of the series. This high level summary will
-   then be placed into the merge commit by the BPF maintainers such that
-   it is also accessible from the git log for future reference.
-
-Q: What do I need to consider when adding a new instruction or feature
-   that would require BPF JIT and/or LLVM integration as well?
-
-A: We try hard to keep all BPF JITs up to date such that the same user
-   experience can be guaranteed when running BPF programs on different
-   architectures without having the program punt to the less efficient
-   interpreter in case the in-kernel BPF JIT is enabled.
-
-   If you are unable to implement or test the required JIT changes for
-   certain architectures, please work together with the related BPF JIT
-   developers in order to get the feature implemented in a timely manner.
-   Please refer to the git log (arch/*/net/) to locate the necessary
-   people for helping out.
-
-   Also always make sure to add BPF test cases (e.g. test_bpf.c and
-   test_verifier.c) for new instructions, so that they can receive
-   broad test coverage and help run-time testing the various BPF JITs.
-
-   In case of new BPF instructions, once the changes have been accepted
-   into the Linux kernel, please implement support into LLVM's BPF back
-   end. See LLVM section below for further information.
-
-Stable submission:
-------------------
-
-Q: I need a specific BPF commit in stable kernels. What should I do?
-
-A: In case you need a specific fix in stable kernels, first check whether
-   the commit has already been applied in the related linux-*.y branches:
-
-     https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/
-
-   If not the case, then drop an email to the BPF maintainers with the
-   netdev kernel mailing list in Cc and ask for the fix to be queued up:
-
-     netdev@vger.kernel.org
-
-   The process in general is the same as on netdev itself, see also the
-   netdev FAQ document.
-
-Q: Do you also backport to kernels not currently maintained as stable?
-
-A: No. If you need a specific BPF commit in kernels that are currently not
-   maintained by the stable maintainers, then you are on your own.
-
-   The current stable and longterm stable kernels are all listed here:
-
-     https://www.kernel.org/
-
-Q: The BPF patch I am about to submit needs to go to stable as well. What
-   should I do?
-
-A: The same rules apply as with netdev patch submissions in general, see
-   netdev FAQ under:
-
-     Documentation/networking/netdev-FAQ.txt
-
-   Never add "Cc: stable@vger.kernel.org" to the patch description, but
-   ask the BPF maintainers to queue the patches instead. This can be done
-   with a note, for example, under the "---" part of the patch which does
-   not go into the git log. Alternatively, this can be done as a simple
-   request by mail instead.
-
-Q: Where do I find currently queued BPF patches that will be submitted
-   to stable?
-
-A: Once patches that fix critical bugs got applied into the bpf tree, they
-   are queued up for stable submission under:
-
-     http://patchwork.ozlabs.org/bundle/bpf/stable/?state=*
-
-   They will be on hold there at minimum until the related commit made its
-   way into the mainline kernel tree.
-
-   After having been under broader exposure, the queued patches will be
-   submitted by the BPF maintainers to the stable maintainers.
-
-Testing patches:
-----------------
-
-Q: Which BPF kernel selftests version should I run my kernel against?
-
-A: If you run a kernel xyz, then always run the BPF kernel selftests from
-   that kernel xyz as well. Do not expect that the BPF selftest from the
-   latest mainline tree will pass all the time.
-
-   In particular, test_bpf.c and test_verifier.c have a large number of
-   test cases and are constantly updated with new BPF test sequences, or
-   existing ones are adapted to verifier changes e.g. due to verifier
-   becoming smarter and being able to better track certain things.
-
-LLVM:
------
-
-Q: Where do I find LLVM with BPF support?
-
-A: The BPF back end for LLVM is upstream in LLVM since version 3.7.1.
-
-   All major distributions these days ship LLVM with BPF back end enabled,
-   so for the majority of use-cases it is not required to compile LLVM by
-   hand anymore, just install the distribution provided package.
-
-   LLVM's static compiler lists the supported targets through 'llc --version',
-   make sure BPF targets are listed. Example:
-
-     $ llc --version
-     LLVM (http://llvm.org/):
-       LLVM version 6.0.0svn
-       Optimized build.
-       Default target: x86_64-unknown-linux-gnu
-       Host CPU: skylake
-
-       Registered Targets:
-         bpf    - BPF (host endian)
-         bpfeb  - BPF (big endian)
-         bpfel  - BPF (little endian)
-         x86    - 32-bit X86: Pentium-Pro and above
-         x86-64 - 64-bit X86: EM64T and AMD64
-
-   For developers in order to utilize the latest features added to LLVM's
-   BPF back end, it is advisable to run the latest LLVM releases. Support
-   for new BPF kernel features such as additions to the BPF instruction
-   set are often developed together.
-
-   All LLVM releases can be found at: http://releases.llvm.org/
-
-Q: Got it, so how do I build LLVM manually anyway?
-
-A: You need cmake and gcc-c++ as build requisites for LLVM. Once you have
-   that set up, proceed with building the latest LLVM and clang version
-   from the git repositories:
-
-     $ git clone http://llvm.org/git/llvm.git
-     $ cd llvm/tools
-     $ git clone --depth 1 http://llvm.org/git/clang.git
-     $ cd ..; mkdir build; cd build
-     $ cmake .. -DLLVM_TARGETS_TO_BUILD="BPF;X86" \
-                -DBUILD_SHARED_LIBS=OFF           \
-                -DCMAKE_BUILD_TYPE=Release        \
-                -DLLVM_BUILD_RUNTIME=OFF
-     $ make -j $(getconf _NPROCESSORS_ONLN)
-
-   The built binaries can then be found in the build/bin/ directory, where
-   you can point the PATH variable to.
-
-Q: Should I notify BPF kernel maintainers about issues in LLVM's BPF code
-   generation back end or about LLVM generated code that the verifier
-   refuses to accept?
-
-A: Yes, please do! LLVM's BPF back end is a key piece of the whole BPF
-   infrastructure and it ties deeply into verification of programs from the
-   kernel side. Therefore, any issues on either side need to be investigated
-   and fixed whenever necessary.
-
-   Therefore, please make sure to bring them up at netdev kernel mailing
-   list and Cc BPF maintainers for LLVM and kernel bits:
-
-     Yonghong Song <yhs@fb.com>
-     Alexei Starovoitov <ast@kernel.org>
-     Daniel Borkmann <daniel@iogearbox.net>
-
-   LLVM also has an issue tracker where BPF related bugs can be found:
-
-     https://bugs.llvm.org/buglist.cgi?quicksearch=bpf
-
-   However, it is better to reach out through mailing lists with having
-   maintainers in Cc.
-
-Q: I have added a new BPF instruction to the kernel, how can I integrate
-   it into LLVM?
-
-A: LLVM has a -mcpu selector for the BPF back end in order to allow the
-   selection of BPF instruction set extensions. By default the 'generic'
-   processor target is used, which is the base instruction set (v1) of BPF.
-
-   LLVM has an option to select -mcpu=probe where it will probe the host
-   kernel for supported BPF instruction set extensions and selects the
-   optimal set automatically.
-
-   For cross-compilation, a specific version can be select manually as well.
-
-     $ llc -march bpf -mcpu=help
-     Available CPUs for this target:
-
-       generic - Select the generic processor.
-       probe   - Select the probe processor.
-       v1      - Select the v1 processor.
-       v2      - Select the v2 processor.
-     [...]
-
-   Newly added BPF instructions to the Linux kernel need to follow the same
-   scheme, bump the instruction set version and implement probing for the
-   extensions such that -mcpu=probe users can benefit from the optimization
-   transparently when upgrading their kernels.
-
-   If you are unable to implement support for the newly added BPF instruction
-   please reach out to BPF developers for help.
-
-   By the way, the BPF kernel selftests run with -mcpu=probe for better
-   test coverage.
-
-Q: In some cases clang flag "-target bpf" is used but in other cases the
-   default clang target, which matches the underlying architecture, is used.
-   What is the difference and when I should use which?
-
-A: Although LLVM IR generation and optimization try to stay architecture
-   independent, "-target <arch>" still has some impact on generated code:
-
-     - BPF program may recursively include header file(s) with file scope
-       inline assembly codes. The default target can handle this well,
-       while bpf target may fail if bpf backend assembler does not
-       understand these assembly codes, which is true in most cases.
-
-     - When compiled without -g, additional elf sections, e.g.,
-       .eh_frame and .rela.eh_frame, may be present in the object file
-       with default target, but not with bpf target.
-
-     - The default target may turn a C switch statement into a switch table
-       lookup and jump operation. Since the switch table is placed
-       in the global readonly section, the bpf program will fail to load.
-       The bpf target does not support switch table optimization.
-       The clang option "-fno-jump-tables" can be used to disable
-       switch table generation.
-
-     - For clang -target bpf, it is guaranteed that pointer or long /
-       unsigned long types will always have a width of 64 bit, no matter
-       whether underlying clang binary or default target (or kernel) is
-       32 bit. However, when native clang target is used, then it will
-       compile these types based on the underlying architecture's conventions,
-       meaning in case of 32 bit architecture, pointer or long / unsigned
-       long types e.g. in BPF context structure will have width of 32 bit
-       while the BPF LLVM back end still operates in 64 bit. The native
-       target is mostly needed in tracing for the case of walking pt_regs
-       or other kernel structures where CPU's register width matters.
-       Otherwise, clang -target bpf is generally recommended.
-
-   You should use default target when:
-
-     - Your program includes a header file, e.g., ptrace.h, which eventually
-       pulls in some header files containing file scope host assembly codes.
-     - You can add "-fno-jump-tables" to work around the switch table issue.
-
-   Otherwise, you can use bpf target. Additionally, you _must_ use bpf target
-   when:
-
-     - Your program uses data structures with pointer or long / unsigned long
-       types that interface with BPF helpers or context data structures. Access
-       into these structures is verified by the BPF verifier and may result
-       in verification failures if the native architecture is not aligned with
-       the BPF architecture, e.g. 64-bit. An example of this is
-       BPF_PROG_TYPE_SK_MSG require '-target bpf'
-
-Happy BPF hacking!
-- 
cgit v1.2.3-59-g8ed1b


From 1a6ac1d59dc3b4077c643c3be70f9e650e267afe Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Mon, 14 May 2018 15:42:22 +0200
Subject: bpf, doc: convert bpf_design_QA.rst to use RST formatting

The RST formatting is done such that that when rendered or converted
to different formats, an automatic index with links are created to the
subsections.

Thus, the questions are created as sections (or subsections), in-order
to get the wanted auto-generated FAQ/QA index.

Special thanks to Quentin Monnet <quentin.monnet@netronome.com> who
have reviewed and corrected both RST formatting and GitHub rendering
issues in this file.  Those commits have been squashed.

I've manually tested that this also renders nicely if included as part
of the kernel 'make htmldocs'.  As the end-goal is for this to become
more integrated with kernel-doc project/movement.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/bpf/bpf_design_QA.rst | 223 +++++++++++++++++++++++-------------
 1 file changed, 144 insertions(+), 79 deletions(-)

diff --git a/Documentation/bpf/bpf_design_QA.rst b/Documentation/bpf/bpf_design_QA.rst
index f3e458a0bb2f..6780a6d81745 100644
--- a/Documentation/bpf/bpf_design_QA.rst
+++ b/Documentation/bpf/bpf_design_QA.rst
@@ -1,156 +1,221 @@
+==============
+BPF Design Q&A
+==============
+
 BPF extensibility and applicability to networking, tracing, security
 in the linux kernel and several user space implementations of BPF
 virtual machine led to a number of misunderstanding on what BPF actually is.
 This short QA is an attempt to address that and outline a direction
 of where BPF is heading long term.
 
+.. contents::
+    :local:
+    :depth: 3
+
+Questions and Answers
+=====================
+
 Q: Is BPF a generic instruction set similar to x64 and arm64?
+-------------------------------------------------------------
 A: NO.
 
 Q: Is BPF a generic virtual machine ?
+-------------------------------------
 A: NO.
 
-BPF is generic instruction set _with_ C calling convention.
+BPF is generic instruction set *with* C calling convention.
+-----------------------------------------------------------
 
 Q: Why C calling convention was chosen?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
 A: Because BPF programs are designed to run in the linux kernel
-   which is written in C, hence BPF defines instruction set compatible
-   with two most used architectures x64 and arm64 (and takes into
-   consideration important quirks of other architectures) and
-   defines calling convention that is compatible with C calling
-   convention of the linux kernel on those architectures.
+which is written in C, hence BPF defines instruction set compatible
+with two most used architectures x64 and arm64 (and takes into
+consideration important quirks of other architectures) and
+defines calling convention that is compatible with C calling
+convention of the linux kernel on those architectures.
 
 Q: can multiple return values be supported in the future?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 A: NO. BPF allows only register R0 to be used as return value.
 
 Q: can more than 5 function arguments be supported in the future?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 A: NO. BPF calling convention only allows registers R1-R5 to be used
-   as arguments. BPF is not a standalone instruction set.
-   (unlike x64 ISA that allows msft, cdecl and other conventions)
+as arguments. BPF is not a standalone instruction set.
+(unlike x64 ISA that allows msft, cdecl and other conventions)
 
 Q: can BPF programs access instruction pointer or return address?
+-----------------------------------------------------------------
 A: NO.
 
 Q: can BPF programs access stack pointer ?
-A: NO. Only frame pointer (register R10) is accessible.
-   From compiler point of view it's necessary to have stack pointer.
-   For example LLVM defines register R11 as stack pointer in its
-   BPF backend, but it makes sure that generated code never uses it.
+------------------------------------------
+A: NO.
+
+Only frame pointer (register R10) is accessible.
+From compiler point of view it's necessary to have stack pointer.
+For example LLVM defines register R11 as stack pointer in its
+BPF backend, but it makes sure that generated code never uses it.
 
 Q: Does C-calling convention diminishes possible use cases?
-A: YES. BPF design forces addition of major functionality in the form
-   of kernel helper functions and kernel objects like BPF maps with
-   seamless interoperability between them. It lets kernel call into
-   BPF programs and programs call kernel helpers with zero overhead.
-   As all of them were native C code. That is particularly the case
-   for JITed BPF programs that are indistinguishable from
-   native kernel C code.
+-----------------------------------------------------------
+A: YES.
+
+BPF design forces addition of major functionality in the form
+of kernel helper functions and kernel objects like BPF maps with
+seamless interoperability between them. It lets kernel call into
+BPF programs and programs call kernel helpers with zero overhead.
+As all of them were native C code. That is particularly the case
+for JITed BPF programs that are indistinguishable from
+native kernel C code.
 
 Q: Does it mean that 'innovative' extensions to BPF code are disallowed?
-A: Soft yes. At least for now until BPF core has support for
-   bpf-to-bpf calls, indirect calls, loops, global variables,
-   jump tables, read only sections and all other normal constructs
-   that C code can produce.
+------------------------------------------------------------------------
+A: Soft yes.
+
+At least for now until BPF core has support for
+bpf-to-bpf calls, indirect calls, loops, global variables,
+jump tables, read only sections and all other normal constructs
+that C code can produce.
 
 Q: Can loops be supported in a safe way?
-A: It's not clear yet. BPF developers are trying to find a way to
-   support bounded loops where the verifier can guarantee that
-   the program terminates in less than 4096 instructions.
+----------------------------------------
+A: It's not clear yet.
+
+BPF developers are trying to find a way to
+support bounded loops where the verifier can guarantee that
+the program terminates in less than 4096 instructions.
+
+Instruction level questions
+---------------------------
+
+Q: LD_ABS and LD_IND instructions vs C code
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Q: How come LD_ABS and LD_IND instruction are present in BPF whereas
-   C code cannot express them and has to use builtin intrinsics?
+C code cannot express them and has to use builtin intrinsics?
+
 A: This is artifact of compatibility with classic BPF. Modern
-   networking code in BPF performs better without them.
-   See 'direct packet access'.
+networking code in BPF performs better without them.
+See 'direct packet access'.
 
+Q: BPF instructions mapping not one-to-one to native CPU
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Q: It seems not all BPF instructions are one-to-one to native CPU.
-   For example why BPF_JNE and other compare and jumps are not cpu-like?
+For example why BPF_JNE and other compare and jumps are not cpu-like?
+
 A: This was necessary to avoid introducing flags into ISA which are
-   impossible to make generic and efficient across CPU architectures.
+impossible to make generic and efficient across CPU architectures.
 
 Q: why BPF_DIV instruction doesn't map to x64 div?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 A: Because if we picked one-to-one relationship to x64 it would have made
-   it more complicated to support on arm64 and other archs. Also it
-   needs div-by-zero runtime check.
+it more complicated to support on arm64 and other archs. Also it
+needs div-by-zero runtime check.
 
 Q: why there is no BPF_SDIV for signed divide operation?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 A: Because it would be rarely used. llvm errors in such case and
-   prints a suggestion to use unsigned divide instead
+prints a suggestion to use unsigned divide instead
 
 Q: Why BPF has implicit prologue and epilogue?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 A: Because architectures like sparc have register windows and in general
-   there are enough subtle differences between architectures, so naive
-   store return address into stack won't work. Another reason is BPF has
-   to be safe from division by zero (and legacy exception path
-   of LD_ABS insn). Those instructions need to invoke epilogue and
-   return implicitly.
+there are enough subtle differences between architectures, so naive
+store return address into stack won't work. Another reason is BPF has
+to be safe from division by zero (and legacy exception path
+of LD_ABS insn). Those instructions need to invoke epilogue and
+return implicitly.
 
 Q: Why BPF_JLT and BPF_JLE instructions were not introduced in the beginning?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 A: Because classic BPF didn't have them and BPF authors felt that compiler
-   workaround would be acceptable. Turned out that programs lose performance
-   due to lack of these compare instructions and they were added.
-   These two instructions is a perfect example what kind of new BPF
-   instructions are acceptable and can be added in the future.
-   These two already had equivalent instructions in native CPUs.
-   New instructions that don't have one-to-one mapping to HW instructions
-   will not be accepted.
-
+workaround would be acceptable. Turned out that programs lose performance
+due to lack of these compare instructions and they were added.
+These two instructions is a perfect example what kind of new BPF
+instructions are acceptable and can be added in the future.
+These two already had equivalent instructions in native CPUs.
+New instructions that don't have one-to-one mapping to HW instructions
+will not be accepted.
+
+Q: BPF 32-bit subregister requirements
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Q: BPF 32-bit subregisters have a requirement to zero upper 32-bits of BPF
-   registers which makes BPF inefficient virtual machine for 32-bit
-   CPU architectures and 32-bit HW accelerators. Can true 32-bit registers
-   be added to BPF in the future?
+registers which makes BPF inefficient virtual machine for 32-bit
+CPU architectures and 32-bit HW accelerators. Can true 32-bit registers
+be added to BPF in the future?
+
 A: NO. The first thing to improve performance on 32-bit archs is to teach
-   LLVM to generate code that uses 32-bit subregisters. Then second step
-   is to teach verifier to mark operations where zero-ing upper bits
-   is unnecessary. Then JITs can take advantage of those markings and
-   drastically reduce size of generated code and improve performance.
+LLVM to generate code that uses 32-bit subregisters. Then second step
+is to teach verifier to mark operations where zero-ing upper bits
+is unnecessary. Then JITs can take advantage of those markings and
+drastically reduce size of generated code and improve performance.
 
 Q: Does BPF have a stable ABI?
+------------------------------
 A: YES. BPF instructions, arguments to BPF programs, set of helper
-   functions and their arguments, recognized return codes are all part
-   of ABI. However when tracing programs are using bpf_probe_read() helper
-   to walk kernel internal datastructures and compile with kernel
-   internal headers these accesses can and will break with newer
-   kernels. The union bpf_attr -> kern_version is checked at load time
-   to prevent accidentally loading kprobe-based bpf programs written
-   for a different kernel. Networking programs don't do kern_version check.
+functions and their arguments, recognized return codes are all part
+of ABI. However when tracing programs are using bpf_probe_read() helper
+to walk kernel internal datastructures and compile with kernel
+internal headers these accesses can and will break with newer
+kernels. The union bpf_attr -> kern_version is checked at load time
+to prevent accidentally loading kprobe-based bpf programs written
+for a different kernel. Networking programs don't do kern_version check.
 
 Q: How much stack space a BPF program uses?
+-------------------------------------------
 A: Currently all program types are limited to 512 bytes of stack
-   space, but the verifier computes the actual amount of stack used
-   and both interpreter and most JITed code consume necessary amount.
+space, but the verifier computes the actual amount of stack used
+and both interpreter and most JITed code consume necessary amount.
 
 Q: Can BPF be offloaded to HW?
+------------------------------
 A: YES. BPF HW offload is supported by NFP driver.
 
 Q: Does classic BPF interpreter still exist?
+--------------------------------------------
 A: NO. Classic BPF programs are converted into extend BPF instructions.
 
 Q: Can BPF call arbitrary kernel functions?
+-------------------------------------------
 A: NO. BPF programs can only call a set of helper functions which
-   is defined for every program type.
+is defined for every program type.
 
 Q: Can BPF overwrite arbitrary kernel memory?
-A: NO. Tracing bpf programs can _read_ arbitrary memory with bpf_probe_read()
-   and bpf_probe_read_str() helpers. Networking programs cannot read
-   arbitrary memory, since they don't have access to these helpers.
-   Programs can never read or write arbitrary memory directly.
+---------------------------------------------
+A: NO.
+
+Tracing bpf programs can *read* arbitrary memory with bpf_probe_read()
+and bpf_probe_read_str() helpers. Networking programs cannot read
+arbitrary memory, since they don't have access to these helpers.
+Programs can never read or write arbitrary memory directly.
 
 Q: Can BPF overwrite arbitrary user memory?
-A: Sort-of. Tracing BPF programs can overwrite the user memory
-   of the current task with bpf_probe_write_user(). Every time such
-   program is loaded the kernel will print warning message, so
-   this helper is only useful for experiments and prototypes.
-   Tracing BPF programs are root only.
+-------------------------------------------
+A: Sort-of.
+
+Tracing BPF programs can overwrite the user memory
+of the current task with bpf_probe_write_user(). Every time such
+program is loaded the kernel will print warning message, so
+this helper is only useful for experiments and prototypes.
+Tracing BPF programs are root only.
 
+Q: bpf_trace_printk() helper warning
+------------------------------------
 Q: When bpf_trace_printk() helper is used the kernel prints nasty
-   warning message. Why is that?
+warning message. Why is that?
+
 A: This is done to nudge program authors into better interfaces when
-   programs need to pass data to user space. Like bpf_perf_event_output()
-   can be used to efficiently stream data via perf ring buffer.
-   BPF maps can be used for asynchronous data sharing between kernel
-   and user space. bpf_trace_printk() should only be used for debugging.
+programs need to pass data to user space. Like bpf_perf_event_output()
+can be used to efficiently stream data via perf ring buffer.
+BPF maps can be used for asynchronous data sharing between kernel
+and user space. bpf_trace_printk() should only be used for debugging.
 
+Q: New functionality via kernel modules?
+----------------------------------------
 Q: Can BPF functionality such as new program or map types, new
-   helpers, etc be added out of kernel module code?
+helpers, etc be added out of kernel module code?
+
 A: NO.
-- 
cgit v1.2.3-59-g8ed1b


From 542228384888f5ad11fa6ffd59947a29a1f4452e Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Mon, 14 May 2018 15:42:27 +0200
Subject: bpf, doc: convert bpf_devel_QA.rst to use RST formatting

Same story as bpf_design_QA.rst RST format conversion.

Again thanks to Quentin Monnet <quentin.monnet@netronome.com> for
fixes and patches that have been squashed.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/bpf/bpf_devel_QA.rst | 799 +++++++++++++++++++------------------
 1 file changed, 420 insertions(+), 379 deletions(-)

diff --git a/Documentation/bpf/bpf_devel_QA.rst b/Documentation/bpf/bpf_devel_QA.rst
index da57601153a0..2254bdeae990 100644
--- a/Documentation/bpf/bpf_devel_QA.rst
+++ b/Documentation/bpf/bpf_devel_QA.rst
@@ -1,424 +1,446 @@
+=================================
+HOWTO interact with BPF subsystem
+=================================
+
 This document provides information for the BPF subsystem about various
 workflows related to reporting bugs, submitting patches, and queueing
 patches for stable kernels.
 
 For general information about submitting patches, please refer to
-Documentation/process/. This document only describes additional specifics
+`Documentation/process/`_. This document only describes additional specifics
 related to BPF.
 
-Reporting bugs:
----------------
+.. contents::
+    :local:
+    :depth: 2
 
-Q: How do I report bugs for BPF kernel code?
+Reporting bugs
+==============
 
+Q: How do I report bugs for BPF kernel code?
+--------------------------------------------
 A: Since all BPF kernel development as well as bpftool and iproute2 BPF
-   loader development happens through the netdev kernel mailing list,
-   please report any found issues around BPF to the following mailing
-   list:
+loader development happens through the netdev kernel mailing list,
+please report any found issues around BPF to the following mailing
+list:
 
-     netdev@vger.kernel.org
+ netdev@vger.kernel.org
 
-   This may also include issues related to XDP, BPF tracing, etc.
+This may also include issues related to XDP, BPF tracing, etc.
 
-   Given netdev has a high volume of traffic, please also add the BPF
-   maintainers to Cc (from kernel MAINTAINERS file):
+Given netdev has a high volume of traffic, please also add the BPF
+maintainers to Cc (from kernel MAINTAINERS_ file):
 
-     Alexei Starovoitov <ast@kernel.org>
-     Daniel Borkmann <daniel@iogearbox.net>
+* Alexei Starovoitov <ast@kernel.org>
+* Daniel Borkmann <daniel@iogearbox.net>
 
-   In case a buggy commit has already been identified, make sure to keep
-   the actual commit authors in Cc as well for the report. They can
-   typically be identified through the kernel's git tree.
+In case a buggy commit has already been identified, make sure to keep
+the actual commit authors in Cc as well for the report. They can
+typically be identified through the kernel's git tree.
 
-   Please do *not* report BPF issues to bugzilla.kernel.org since it
-   is a guarantee that the reported issue will be overlooked.
+**Please do NOT report BPF issues to bugzilla.kernel.org since it
+is a guarantee that the reported issue will be overlooked.**
 
-Submitting patches:
--------------------
+Submitting patches
+==================
 
 Q: To which mailing list do I need to submit my BPF patches?
-
+------------------------------------------------------------
 A: Please submit your BPF patches to the netdev kernel mailing list:
 
-     netdev@vger.kernel.org
+ netdev@vger.kernel.org
 
-   Historically, BPF came out of networking and has always been maintained
-   by the kernel networking community. Although these days BPF touches
-   many other subsystems as well, the patches are still routed mainly
-   through the networking community.
+Historically, BPF came out of networking and has always been maintained
+by the kernel networking community. Although these days BPF touches
+many other subsystems as well, the patches are still routed mainly
+through the networking community.
 
-   In case your patch has changes in various different subsystems (e.g.
-   tracing, security, etc), make sure to Cc the related kernel mailing
-   lists and maintainers from there as well, so they are able to review
-   the changes and provide their Acked-by's to the patches.
+In case your patch has changes in various different subsystems (e.g.
+tracing, security, etc), make sure to Cc the related kernel mailing
+lists and maintainers from there as well, so they are able to review
+the changes and provide their Acked-by's to the patches.
 
 Q: Where can I find patches currently under discussion for BPF subsystem?
-
+-------------------------------------------------------------------------
 A: All patches that are Cc'ed to netdev are queued for review under netdev
-   patchwork project:
+patchwork project:
 
-     http://patchwork.ozlabs.org/project/netdev/list/
+  http://patchwork.ozlabs.org/project/netdev/list/
 
-   Those patches which target BPF, are assigned to a 'bpf' delegate for
-   further processing from BPF maintainers. The current queue with
-   patches under review can be found at:
+Those patches which target BPF, are assigned to a 'bpf' delegate for
+further processing from BPF maintainers. The current queue with
+patches under review can be found at:
 
-     https://patchwork.ozlabs.org/project/netdev/list/?delegate=77147
+  https://patchwork.ozlabs.org/project/netdev/list/?delegate=77147
 
-   Once the patches have been reviewed by the BPF community as a whole
-   and approved by the BPF maintainers, their status in patchwork will be
-   changed to 'Accepted' and the submitter will be notified by mail. This
-   means that the patches look good from a BPF perspective and have been
-   applied to one of the two BPF kernel trees.
+Once the patches have been reviewed by the BPF community as a whole
+and approved by the BPF maintainers, their status in patchwork will be
+changed to 'Accepted' and the submitter will be notified by mail. This
+means that the patches look good from a BPF perspective and have been
+applied to one of the two BPF kernel trees.
 
-   In case feedback from the community requires a respin of the patches,
-   their status in patchwork will be set to 'Changes Requested', and purged
-   from the current review queue. Likewise for cases where patches would
-   get rejected or are not applicable to the BPF trees (but assigned to
-   the 'bpf' delegate).
+In case feedback from the community requires a respin of the patches,
+their status in patchwork will be set to 'Changes Requested', and purged
+from the current review queue. Likewise for cases where patches would
+get rejected or are not applicable to the BPF trees (but assigned to
+the 'bpf' delegate).
 
 Q: How do the changes make their way into Linux?
-
+------------------------------------------------
 A: There are two BPF kernel trees (git repositories). Once patches have
-   been accepted by the BPF maintainers, they will be applied to one
-   of the two BPF trees:
+been accepted by the BPF maintainers, they will be applied to one
+of the two BPF trees:
 
-     https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git/
-     https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/
+ * https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git/
+ * https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/
 
-   The bpf tree itself is for fixes only, whereas bpf-next for features,
-   cleanups or other kind of improvements ("next-like" content). This is
-   analogous to net and net-next trees for networking. Both bpf and
-   bpf-next will only have a master branch in order to simplify against
-   which branch patches should get rebased to.
+The bpf tree itself is for fixes only, whereas bpf-next for features,
+cleanups or other kind of improvements ("next-like" content). This is
+analogous to net and net-next trees for networking. Both bpf and
+bpf-next will only have a master branch in order to simplify against
+which branch patches should get rebased to.
 
-   Accumulated BPF patches in the bpf tree will regularly get pulled
-   into the net kernel tree. Likewise, accumulated BPF patches accepted
-   into the bpf-next tree will make their way into net-next tree. net and
-   net-next are both run by David S. Miller. From there, they will go
-   into the kernel mainline tree run by Linus Torvalds. To read up on the
-   process of net and net-next being merged into the mainline tree, see
-   the netdev FAQ under:
+Accumulated BPF patches in the bpf tree will regularly get pulled
+into the net kernel tree. Likewise, accumulated BPF patches accepted
+into the bpf-next tree will make their way into net-next tree. net and
+net-next are both run by David S. Miller. From there, they will go
+into the kernel mainline tree run by Linus Torvalds. To read up on the
+process of net and net-next being merged into the mainline tree, see
+the `netdev FAQ`_ under:
 
-     Documentation/networking/netdev-FAQ.txt
+ `Documentation/networking/netdev-FAQ.txt`_
 
-   Occasionally, to prevent merge conflicts, we might send pull requests
-   to other trees (e.g. tracing) with a small subset of the patches, but
-   net and net-next are always the main trees targeted for integration.
+Occasionally, to prevent merge conflicts, we might send pull requests
+to other trees (e.g. tracing) with a small subset of the patches, but
+net and net-next are always the main trees targeted for integration.
 
-   The pull requests will contain a high-level summary of the accumulated
-   patches and can be searched on netdev kernel mailing list through the
-   following subject lines (yyyy-mm-dd is the date of the pull request):
+The pull requests will contain a high-level summary of the accumulated
+patches and can be searched on netdev kernel mailing list through the
+following subject lines (``yyyy-mm-dd`` is the date of the pull
+request)::
 
-     pull-request: bpf yyyy-mm-dd
-     pull-request: bpf-next yyyy-mm-dd
+  pull-request: bpf yyyy-mm-dd
+  pull-request: bpf-next yyyy-mm-dd
 
-Q: How do I indicate which tree (bpf vs. bpf-next) my patch should be
-   applied to?
+Q: How do I indicate which tree (bpf vs. bpf-next) my patch should be applied to?
+---------------------------------------------------------------------------------
 
-A: The process is the very same as described in the netdev FAQ, so
-   please read up on it. The subject line must indicate whether the
-   patch is a fix or rather "next-like" content in order to let the
-   maintainers know whether it is targeted at bpf or bpf-next.
+A: The process is the very same as described in the `netdev FAQ`_, so
+please read up on it. The subject line must indicate whether the
+patch is a fix or rather "next-like" content in order to let the
+maintainers know whether it is targeted at bpf or bpf-next.
 
-   For fixes eventually landing in bpf -> net tree, the subject must
-   look like:
+For fixes eventually landing in bpf -> net tree, the subject must
+look like::
 
-     git format-patch --subject-prefix='PATCH bpf' start..finish
+  git format-patch --subject-prefix='PATCH bpf' start..finish
 
-   For features/improvements/etc that should eventually land in
-   bpf-next -> net-next, the subject must look like:
+For features/improvements/etc that should eventually land in
+bpf-next -> net-next, the subject must look like::
 
-     git format-patch --subject-prefix='PATCH bpf-next' start..finish
+  git format-patch --subject-prefix='PATCH bpf-next' start..finish
 
-   If unsure whether the patch or patch series should go into bpf
-   or net directly, or bpf-next or net-next directly, it is not a
-   problem either if the subject line says net or net-next as target.
-   It is eventually up to the maintainers to do the delegation of
-   the patches.
+If unsure whether the patch or patch series should go into bpf
+or net directly, or bpf-next or net-next directly, it is not a
+problem either if the subject line says net or net-next as target.
+It is eventually up to the maintainers to do the delegation of
+the patches.
 
-   If it is clear that patches should go into bpf or bpf-next tree,
-   please make sure to rebase the patches against those trees in
-   order to reduce potential conflicts.
+If it is clear that patches should go into bpf or bpf-next tree,
+please make sure to rebase the patches against those trees in
+order to reduce potential conflicts.
 
-   In case the patch or patch series has to be reworked and sent out
-   again in a second or later revision, it is also required to add a
-   version number (v2, v3, ...) into the subject prefix:
+In case the patch or patch series has to be reworked and sent out
+again in a second or later revision, it is also required to add a
+version number (``v2``, ``v3``, ...) into the subject prefix::
 
-     git format-patch --subject-prefix='PATCH net-next v2' start..finish
+  git format-patch --subject-prefix='PATCH net-next v2' start..finish
 
-   When changes have been requested to the patch series, always send the
-   whole patch series again with the feedback incorporated (never send
-   individual diffs on top of the old series).
+When changes have been requested to the patch series, always send the
+whole patch series again with the feedback incorporated (never send
+individual diffs on top of the old series).
 
 Q: What does it mean when a patch gets applied to bpf or bpf-next tree?
-
+-----------------------------------------------------------------------
 A: It means that the patch looks good for mainline inclusion from
-   a BPF point of view.
-
-   Be aware that this is not a final verdict that the patch will
-   automatically get accepted into net or net-next trees eventually:
-
-   On the netdev kernel mailing list reviews can come in at any point
-   in time. If discussions around a patch conclude that they cannot
-   get included as-is, we will either apply a follow-up fix or drop
-   them from the trees entirely. Therefore, we also reserve to rebase
-   the trees when deemed necessary. After all, the purpose of the tree
-   is to i) accumulate and stage BPF patches for integration into trees
-   like net and net-next, and ii) run extensive BPF test suite and
-   workloads on the patches before they make their way any further.
-
-   Once the BPF pull request was accepted by David S. Miller, then
-   the patches end up in net or net-next tree, respectively, and
-   make their way from there further into mainline. Again, see the
-   netdev FAQ for additional information e.g. on how often they are
-   merged to mainline.
+a BPF point of view.
 
-Q: How long do I need to wait for feedback on my BPF patches?
+Be aware that this is not a final verdict that the patch will
+automatically get accepted into net or net-next trees eventually:
+
+On the netdev kernel mailing list reviews can come in at any point
+in time. If discussions around a patch conclude that they cannot
+get included as-is, we will either apply a follow-up fix or drop
+them from the trees entirely. Therefore, we also reserve to rebase
+the trees when deemed necessary. After all, the purpose of the tree
+is to:
+
+i) accumulate and stage BPF patches for integration into trees
+   like net and net-next, and
 
+ii) run extensive BPF test suite and
+    workloads on the patches before they make their way any further.
+
+Once the BPF pull request was accepted by David S. Miller, then
+the patches end up in net or net-next tree, respectively, and
+make their way from there further into mainline. Again, see the
+`netdev FAQ`_ for additional information e.g. on how often they are
+merged to mainline.
+
+Q: How long do I need to wait for feedback on my BPF patches?
+-------------------------------------------------------------
 A: We try to keep the latency low. The usual time to feedback will
-   be around 2 or 3 business days. It may vary depending on the
-   complexity of changes and current patch load.
+be around 2 or 3 business days. It may vary depending on the
+complexity of changes and current patch load.
 
-Q: How often do you send pull requests to major kernel trees like
-   net or net-next?
+Q: How often do you send pull requests to major kernel trees like net or net-next?
+----------------------------------------------------------------------------------
 
 A: Pull requests will be sent out rather often in order to not
-   accumulate too many patches in bpf or bpf-next.
+accumulate too many patches in bpf or bpf-next.
 
-   As a rule of thumb, expect pull requests for each tree regularly
-   at the end of the week. In some cases pull requests could additionally
-   come also in the middle of the week depending on the current patch
-   load or urgency.
+As a rule of thumb, expect pull requests for each tree regularly
+at the end of the week. In some cases pull requests could additionally
+come also in the middle of the week depending on the current patch
+load or urgency.
 
 Q: Are patches applied to bpf-next when the merge window is open?
-
+-----------------------------------------------------------------
 A: For the time when the merge window is open, bpf-next will not be
-   processed. This is roughly analogous to net-next patch processing,
-   so feel free to read up on the netdev FAQ about further details.
+processed. This is roughly analogous to net-next patch processing,
+so feel free to read up on the `netdev FAQ`_ about further details.
 
-   During those two weeks of merge window, we might ask you to resend
-   your patch series once bpf-next is open again. Once Linus released
-   a v*-rc1 after the merge window, we continue processing of bpf-next.
+During those two weeks of merge window, we might ask you to resend
+your patch series once bpf-next is open again. Once Linus released
+a ``v*-rc1`` after the merge window, we continue processing of bpf-next.
 
-   For non-subscribers to kernel mailing lists, there is also a status
-   page run by David S. Miller on net-next that provides guidance:
+For non-subscribers to kernel mailing lists, there is also a status
+page run by David S. Miller on net-next that provides guidance:
 
-     http://vger.kernel.org/~davem/net-next.html
+  http://vger.kernel.org/~davem/net-next.html
 
+Q: Verifier changes and test cases
+----------------------------------
 Q: I made a BPF verifier change, do I need to add test cases for
-   BPF kernel selftests?
+BPF kernel selftests_?
 
 A: If the patch has changes to the behavior of the verifier, then yes,
-   it is absolutely necessary to add test cases to the BPF kernel
-   selftests suite. If they are not present and we think they are
-   needed, then we might ask for them before accepting any changes.
-
-   In particular, test_verifier.c is tracking a high number of BPF test
-   cases, including a lot of corner cases that LLVM BPF back end may
-   generate out of the restricted C code. Thus, adding test cases is
-   absolutely crucial to make sure future changes do not accidentally
-   affect prior use-cases. Thus, treat those test cases as: verifier
-   behavior that is not tracked in test_verifier.c could potentially
-   be subject to change.
-
-Q: When should I add code to samples/bpf/ and when to BPF kernel
-   selftests?
-
-A: In general, we prefer additions to BPF kernel selftests rather than
-   samples/bpf/. The rationale is very simple: kernel selftests are
-   regularly run by various bots to test for kernel regressions.
-
-   The more test cases we add to BPF selftests, the better the coverage
-   and the less likely it is that those could accidentally break. It is
-   not that BPF kernel selftests cannot demo how a specific feature can
-   be used.
-
-   That said, samples/bpf/ may be a good place for people to get started,
-   so it might be advisable that simple demos of features could go into
-   samples/bpf/, but advanced functional and corner-case testing rather
-   into kernel selftests.
-
-   If your sample looks like a test case, then go for BPF kernel selftests
-   instead!
+it is absolutely necessary to add test cases to the BPF kernel
+selftests_ suite. If they are not present and we think they are
+needed, then we might ask for them before accepting any changes.
+
+In particular, test_verifier.c is tracking a high number of BPF test
+cases, including a lot of corner cases that LLVM BPF back end may
+generate out of the restricted C code. Thus, adding test cases is
+absolutely crucial to make sure future changes do not accidentally
+affect prior use-cases. Thus, treat those test cases as: verifier
+behavior that is not tracked in test_verifier.c could potentially
+be subject to change.
+
+Q: samples/bpf preference vs selftests?
+---------------------------------------
+Q: When should I add code to `samples/bpf/`_ and when to BPF kernel
+selftests_ ?
+
+A: In general, we prefer additions to BPF kernel selftests_ rather than
+`samples/bpf/`_. The rationale is very simple: kernel selftests are
+regularly run by various bots to test for kernel regressions.
+
+The more test cases we add to BPF selftests, the better the coverage
+and the less likely it is that those could accidentally break. It is
+not that BPF kernel selftests cannot demo how a specific feature can
+be used.
+
+That said, `samples/bpf/`_ may be a good place for people to get started,
+so it might be advisable that simple demos of features could go into
+`samples/bpf/`_, but advanced functional and corner-case testing rather
+into kernel selftests.
+
+If your sample looks like a test case, then go for BPF kernel selftests
+instead!
 
 Q: When should I add code to the bpftool?
-
+-----------------------------------------
 A: The main purpose of bpftool (under tools/bpf/bpftool/) is to provide
-   a central user space tool for debugging and introspection of BPF programs
-   and maps that are active in the kernel. If UAPI changes related to BPF
-   enable for dumping additional information of programs or maps, then
-   bpftool should be extended as well to support dumping them.
+a central user space tool for debugging and introspection of BPF programs
+and maps that are active in the kernel. If UAPI changes related to BPF
+enable for dumping additional information of programs or maps, then
+bpftool should be extended as well to support dumping them.
 
 Q: When should I add code to iproute2's BPF loader?
-
-A: For UAPI changes related to the XDP or tc layer (e.g. cls_bpf), the
-   convention is that those control-path related changes are added to
-   iproute2's BPF loader as well from user space side. This is not only
-   useful to have UAPI changes properly designed to be usable, but also
-   to make those changes available to a wider user base of major
-   downstream distributions.
+---------------------------------------------------
+A: For UAPI changes related to the XDP or tc layer (e.g. ``cls_bpf``),
+the convention is that those control-path related changes are added to
+iproute2's BPF loader as well from user space side. This is not only
+useful to have UAPI changes properly designed to be usable, but also
+to make those changes available to a wider user base of major
+downstream distributions.
 
 Q: Do you accept patches as well for iproute2's BPF loader?
-
+-----------------------------------------------------------
 A: Patches for the iproute2's BPF loader have to be sent to:
 
-     netdev@vger.kernel.org
+  netdev@vger.kernel.org
 
-   While those patches are not processed by the BPF kernel maintainers,
-   please keep them in Cc as well, so they can be reviewed.
+While those patches are not processed by the BPF kernel maintainers,
+please keep them in Cc as well, so they can be reviewed.
 
-   The official git repository for iproute2 is run by Stephen Hemminger
-   and can be found at:
+The official git repository for iproute2 is run by Stephen Hemminger
+and can be found at:
 
-     https://git.kernel.org/pub/scm/linux/kernel/git/shemminger/iproute2.git/
+  https://git.kernel.org/pub/scm/linux/kernel/git/shemminger/iproute2.git/
 
-   The patches need to have a subject prefix of '[PATCH iproute2 master]'
-   or '[PATCH iproute2 net-next]'. 'master' or 'net-next' describes the
-   target branch where the patch should be applied to. Meaning, if kernel
-   changes went into the net-next kernel tree, then the related iproute2
-   changes need to go into the iproute2 net-next branch, otherwise they
-   can be targeted at master branch. The iproute2 net-next branch will get
-   merged into the master branch after the current iproute2 version from
-   master has been released.
+The patches need to have a subject prefix of '``[PATCH iproute2
+master]``' or '``[PATCH iproute2 net-next]``'. '``master``' or
+'``net-next``' describes the target branch where the patch should be
+applied to. Meaning, if kernel changes went into the net-next kernel
+tree, then the related iproute2 changes need to go into the iproute2
+net-next branch, otherwise they can be targeted at master branch. The
+iproute2 net-next branch will get merged into the master branch after
+the current iproute2 version from master has been released.
 
-   Like BPF, the patches end up in patchwork under the netdev project and
-   are delegated to 'shemminger' for further processing:
+Like BPF, the patches end up in patchwork under the netdev project and
+are delegated to 'shemminger' for further processing:
 
-     http://patchwork.ozlabs.org/project/netdev/list/?delegate=389
+  http://patchwork.ozlabs.org/project/netdev/list/?delegate=389
 
 Q: What is the minimum requirement before I submit my BPF patches?
-
+------------------------------------------------------------------
 A: When submitting patches, always take the time and properly test your
-   patches *prior* to submission. Never rush them! If maintainers find
-   that your patches have not been properly tested, it is a good way to
-   get them grumpy. Testing patch submissions is a hard requirement!
-
-   Note, fixes that go to bpf tree *must* have a Fixes: tag included. The
-   same applies to fixes that target bpf-next, where the affected commit
-   is in net-next (or in some cases bpf-next). The Fixes: tag is crucial
-   in order to identify follow-up commits and tremendously helps for people
-   having to do backporting, so it is a must have!
-
-   We also don't accept patches with an empty commit message. Take your
-   time and properly write up a high quality commit message, it is
-   essential!
-
-   Think about it this way: other developers looking at your code a month
-   from now need to understand *why* a certain change has been done that
-   way, and whether there have been flaws in the analysis or assumptions
-   that the original author did. Thus providing a proper rationale and
-   describing the use-case for the changes is a must.
-
-   Patch submissions with >1 patch must have a cover letter which includes
-   a high level description of the series. This high level summary will
-   then be placed into the merge commit by the BPF maintainers such that
-   it is also accessible from the git log for future reference.
-
+patches *prior* to submission. Never rush them! If maintainers find
+that your patches have not been properly tested, it is a good way to
+get them grumpy. Testing patch submissions is a hard requirement!
+
+Note, fixes that go to bpf tree *must* have a ``Fixes:`` tag included.
+The same applies to fixes that target bpf-next, where the affected
+commit is in net-next (or in some cases bpf-next). The ``Fixes:`` tag is
+crucial in order to identify follow-up commits and tremendously helps
+for people having to do backporting, so it is a must have!
+
+We also don't accept patches with an empty commit message. Take your
+time and properly write up a high quality commit message, it is
+essential!
+
+Think about it this way: other developers looking at your code a month
+from now need to understand *why* a certain change has been done that
+way, and whether there have been flaws in the analysis or assumptions
+that the original author did. Thus providing a proper rationale and
+describing the use-case for the changes is a must.
+
+Patch submissions with >1 patch must have a cover letter which includes
+a high level description of the series. This high level summary will
+then be placed into the merge commit by the BPF maintainers such that
+it is also accessible from the git log for future reference.
+
+Q: Features changing BPF JIT and/or LLVM
+----------------------------------------
 Q: What do I need to consider when adding a new instruction or feature
-   that would require BPF JIT and/or LLVM integration as well?
+that would require BPF JIT and/or LLVM integration as well?
 
 A: We try hard to keep all BPF JITs up to date such that the same user
-   experience can be guaranteed when running BPF programs on different
-   architectures without having the program punt to the less efficient
-   interpreter in case the in-kernel BPF JIT is enabled.
+experience can be guaranteed when running BPF programs on different
+architectures without having the program punt to the less efficient
+interpreter in case the in-kernel BPF JIT is enabled.
 
-   If you are unable to implement or test the required JIT changes for
-   certain architectures, please work together with the related BPF JIT
-   developers in order to get the feature implemented in a timely manner.
-   Please refer to the git log (arch/*/net/) to locate the necessary
-   people for helping out.
+If you are unable to implement or test the required JIT changes for
+certain architectures, please work together with the related BPF JIT
+developers in order to get the feature implemented in a timely manner.
+Please refer to the git log (``arch/*/net/``) to locate the necessary
+people for helping out.
 
-   Also always make sure to add BPF test cases (e.g. test_bpf.c and
-   test_verifier.c) for new instructions, so that they can receive
-   broad test coverage and help run-time testing the various BPF JITs.
+Also always make sure to add BPF test cases (e.g. test_bpf.c and
+test_verifier.c) for new instructions, so that they can receive
+broad test coverage and help run-time testing the various BPF JITs.
 
-   In case of new BPF instructions, once the changes have been accepted
-   into the Linux kernel, please implement support into LLVM's BPF back
-   end. See LLVM section below for further information.
+In case of new BPF instructions, once the changes have been accepted
+into the Linux kernel, please implement support into LLVM's BPF back
+end. See LLVM_ section below for further information.
 
-Stable submission:
-------------------
+Stable submission
+=================
 
 Q: I need a specific BPF commit in stable kernels. What should I do?
-
+--------------------------------------------------------------------
 A: In case you need a specific fix in stable kernels, first check whether
-   the commit has already been applied in the related linux-*.y branches:
+the commit has already been applied in the related ``linux-*.y`` branches:
 
-     https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/
+  https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/
 
-   If not the case, then drop an email to the BPF maintainers with the
-   netdev kernel mailing list in Cc and ask for the fix to be queued up:
+If not the case, then drop an email to the BPF maintainers with the
+netdev kernel mailing list in Cc and ask for the fix to be queued up:
 
-     netdev@vger.kernel.org
+  netdev@vger.kernel.org
 
-   The process in general is the same as on netdev itself, see also the
-   netdev FAQ document.
+The process in general is the same as on netdev itself, see also the
+`netdev FAQ`_ document.
 
 Q: Do you also backport to kernels not currently maintained as stable?
-
+----------------------------------------------------------------------
 A: No. If you need a specific BPF commit in kernels that are currently not
-   maintained by the stable maintainers, then you are on your own.
+maintained by the stable maintainers, then you are on your own.
 
-   The current stable and longterm stable kernels are all listed here:
+The current stable and longterm stable kernels are all listed here:
 
-     https://www.kernel.org/
+  https://www.kernel.org/
 
-Q: The BPF patch I am about to submit needs to go to stable as well. What
-   should I do?
+Q: The BPF patch I am about to submit needs to go to stable as well
+-------------------------------------------------------------------
+What should I do?
 
 A: The same rules apply as with netdev patch submissions in general, see
-   netdev FAQ under:
+`netdev FAQ`_ under:
 
-     Documentation/networking/netdev-FAQ.txt
+  `Documentation/networking/netdev-FAQ.txt`_
 
-   Never add "Cc: stable@vger.kernel.org" to the patch description, but
-   ask the BPF maintainers to queue the patches instead. This can be done
-   with a note, for example, under the "---" part of the patch which does
-   not go into the git log. Alternatively, this can be done as a simple
-   request by mail instead.
+Never add "``Cc: stable@vger.kernel.org``" to the patch description, but
+ask the BPF maintainers to queue the patches instead. This can be done
+with a note, for example, under the ``---`` part of the patch which does
+not go into the git log. Alternatively, this can be done as a simple
+request by mail instead.
 
+Q: Queue stable patches
+-----------------------
 Q: Where do I find currently queued BPF patches that will be submitted
-   to stable?
+to stable?
 
 A: Once patches that fix critical bugs got applied into the bpf tree, they
-   are queued up for stable submission under:
+are queued up for stable submission under:
 
-     http://patchwork.ozlabs.org/bundle/bpf/stable/?state=*
+  http://patchwork.ozlabs.org/bundle/bpf/stable/?state=*
 
-   They will be on hold there at minimum until the related commit made its
-   way into the mainline kernel tree.
+They will be on hold there at minimum until the related commit made its
+way into the mainline kernel tree.
 
-   After having been under broader exposure, the queued patches will be
-   submitted by the BPF maintainers to the stable maintainers.
+After having been under broader exposure, the queued patches will be
+submitted by the BPF maintainers to the stable maintainers.
 
-Testing patches:
-----------------
+Testing patches
+===============
 
 Q: Which BPF kernel selftests version should I run my kernel against?
+---------------------------------------------------------------------
+A: If you run a kernel ``xyz``, then always run the BPF kernel selftests
+from that kernel ``xyz`` as well. Do not expect that the BPF selftest
+from the latest mainline tree will pass all the time.
 
-A: If you run a kernel xyz, then always run the BPF kernel selftests from
-   that kernel xyz as well. Do not expect that the BPF selftest from the
-   latest mainline tree will pass all the time.
-
-   In particular, test_bpf.c and test_verifier.c have a large number of
-   test cases and are constantly updated with new BPF test sequences, or
-   existing ones are adapted to verifier changes e.g. due to verifier
-   becoming smarter and being able to better track certain things.
+In particular, test_bpf.c and test_verifier.c have a large number of
+test cases and are constantly updated with new BPF test sequences, or
+existing ones are adapted to verifier changes e.g. due to verifier
+becoming smarter and being able to better track certain things.
 
-LLVM:
------
+LLVM
+====
 
 Q: Where do I find LLVM with BPF support?
-
+-----------------------------------------
 A: The BPF back end for LLVM is upstream in LLVM since version 3.7.1.
 
-   All major distributions these days ship LLVM with BPF back end enabled,
-   so for the majority of use-cases it is not required to compile LLVM by
-   hand anymore, just install the distribution provided package.
+All major distributions these days ship LLVM with BPF back end enabled,
+so for the majority of use-cases it is not required to compile LLVM by
+hand anymore, just install the distribution provided package.
 
-   LLVM's static compiler lists the supported targets through 'llc --version',
-   make sure BPF targets are listed. Example:
+LLVM's static compiler lists the supported targets through
+``llc --version``, make sure BPF targets are listed. Example::
 
      $ llc --version
      LLVM (http://llvm.org/):
@@ -434,18 +456,18 @@ A: The BPF back end for LLVM is upstream in LLVM since version 3.7.1.
          x86    - 32-bit X86: Pentium-Pro and above
          x86-64 - 64-bit X86: EM64T and AMD64
 
-   For developers in order to utilize the latest features added to LLVM's
-   BPF back end, it is advisable to run the latest LLVM releases. Support
-   for new BPF kernel features such as additions to the BPF instruction
-   set are often developed together.
+For developers in order to utilize the latest features added to LLVM's
+BPF back end, it is advisable to run the latest LLVM releases. Support
+for new BPF kernel features such as additions to the BPF instruction
+set are often developed together.
 
-   All LLVM releases can be found at: http://releases.llvm.org/
+All LLVM releases can be found at: http://releases.llvm.org/
 
 Q: Got it, so how do I build LLVM manually anyway?
-
+--------------------------------------------------
 A: You need cmake and gcc-c++ as build requisites for LLVM. Once you have
-   that set up, proceed with building the latest LLVM and clang version
-   from the git repositories:
+that set up, proceed with building the latest LLVM and clang version
+from the git repositories::
 
      $ git clone http://llvm.org/git/llvm.git
      $ cd llvm/tools
@@ -457,44 +479,51 @@ A: You need cmake and gcc-c++ as build requisites for LLVM. Once you have
                 -DLLVM_BUILD_RUNTIME=OFF
      $ make -j $(getconf _NPROCESSORS_ONLN)
 
-   The built binaries can then be found in the build/bin/ directory, where
-   you can point the PATH variable to.
+The built binaries can then be found in the build/bin/ directory, where
+you can point the PATH variable to.
 
+Q: Reporting LLVM BPF issues
+----------------------------
 Q: Should I notify BPF kernel maintainers about issues in LLVM's BPF code
-   generation back end or about LLVM generated code that the verifier
-   refuses to accept?
+generation back end or about LLVM generated code that the verifier
+refuses to accept?
+
+A: Yes, please do!
 
-A: Yes, please do! LLVM's BPF back end is a key piece of the whole BPF
-   infrastructure and it ties deeply into verification of programs from the
-   kernel side. Therefore, any issues on either side need to be investigated
-   and fixed whenever necessary.
+LLVM's BPF back end is a key piece of the whole BPF
+infrastructure and it ties deeply into verification of programs from the
+kernel side. Therefore, any issues on either side need to be investigated
+and fixed whenever necessary.
 
-   Therefore, please make sure to bring them up at netdev kernel mailing
-   list and Cc BPF maintainers for LLVM and kernel bits:
+Therefore, please make sure to bring them up at netdev kernel mailing
+list and Cc BPF maintainers for LLVM and kernel bits:
 
-     Yonghong Song <yhs@fb.com>
-     Alexei Starovoitov <ast@kernel.org>
-     Daniel Borkmann <daniel@iogearbox.net>
+* Yonghong Song <yhs@fb.com>
+* Alexei Starovoitov <ast@kernel.org>
+* Daniel Borkmann <daniel@iogearbox.net>
 
-   LLVM also has an issue tracker where BPF related bugs can be found:
+LLVM also has an issue tracker where BPF related bugs can be found:
 
-     https://bugs.llvm.org/buglist.cgi?quicksearch=bpf
+  https://bugs.llvm.org/buglist.cgi?quicksearch=bpf
 
-   However, it is better to reach out through mailing lists with having
-   maintainers in Cc.
+However, it is better to reach out through mailing lists with having
+maintainers in Cc.
 
+Q: New BPF instruction for kernel and LLVM
+------------------------------------------
 Q: I have added a new BPF instruction to the kernel, how can I integrate
-   it into LLVM?
+it into LLVM?
 
-A: LLVM has a -mcpu selector for the BPF back end in order to allow the
-   selection of BPF instruction set extensions. By default the 'generic'
-   processor target is used, which is the base instruction set (v1) of BPF.
+A: LLVM has a ``-mcpu`` selector for the BPF back end in order to allow
+the selection of BPF instruction set extensions. By default the
+``generic`` processor target is used, which is the base instruction set
+(v1) of BPF.
 
-   LLVM has an option to select -mcpu=probe where it will probe the host
-   kernel for supported BPF instruction set extensions and selects the
-   optimal set automatically.
+LLVM has an option to select ``-mcpu=probe`` where it will probe the host
+kernel for supported BPF instruction set extensions and selects the
+optimal set automatically.
 
-   For cross-compilation, a specific version can be select manually as well.
+For cross-compilation, a specific version can be select manually as well ::
 
      $ llc -march bpf -mcpu=help
      Available CPUs for this target:
@@ -505,66 +534,78 @@ A: LLVM has a -mcpu selector for the BPF back end in order to allow the
        v2      - Select the v2 processor.
      [...]
 
-   Newly added BPF instructions to the Linux kernel need to follow the same
-   scheme, bump the instruction set version and implement probing for the
-   extensions such that -mcpu=probe users can benefit from the optimization
-   transparently when upgrading their kernels.
+Newly added BPF instructions to the Linux kernel need to follow the same
+scheme, bump the instruction set version and implement probing for the
+extensions such that ``-mcpu=probe`` users can benefit from the
+optimization transparently when upgrading their kernels.
 
-   If you are unable to implement support for the newly added BPF instruction
-   please reach out to BPF developers for help.
+If you are unable to implement support for the newly added BPF instruction
+please reach out to BPF developers for help.
 
-   By the way, the BPF kernel selftests run with -mcpu=probe for better
-   test coverage.
+By the way, the BPF kernel selftests run with ``-mcpu=probe`` for better
+test coverage.
 
-Q: In some cases clang flag "-target bpf" is used but in other cases the
-   default clang target, which matches the underlying architecture, is used.
-   What is the difference and when I should use which?
+Q: clang flag for target bpf?
+-----------------------------
+Q: In some cases clang flag ``-target bpf`` is used but in other cases the
+default clang target, which matches the underlying architecture, is used.
+What is the difference and when I should use which?
 
 A: Although LLVM IR generation and optimization try to stay architecture
-   independent, "-target <arch>" still has some impact on generated code:
-
-     - BPF program may recursively include header file(s) with file scope
-       inline assembly codes. The default target can handle this well,
-       while bpf target may fail if bpf backend assembler does not
-       understand these assembly codes, which is true in most cases.
-
-     - When compiled without -g, additional elf sections, e.g.,
-       .eh_frame and .rela.eh_frame, may be present in the object file
-       with default target, but not with bpf target.
-
-     - The default target may turn a C switch statement into a switch table
-       lookup and jump operation. Since the switch table is placed
-       in the global readonly section, the bpf program will fail to load.
-       The bpf target does not support switch table optimization.
-       The clang option "-fno-jump-tables" can be used to disable
-       switch table generation.
-
-     - For clang -target bpf, it is guaranteed that pointer or long /
-       unsigned long types will always have a width of 64 bit, no matter
-       whether underlying clang binary or default target (or kernel) is
-       32 bit. However, when native clang target is used, then it will
-       compile these types based on the underlying architecture's conventions,
-       meaning in case of 32 bit architecture, pointer or long / unsigned
-       long types e.g. in BPF context structure will have width of 32 bit
-       while the BPF LLVM back end still operates in 64 bit. The native
-       target is mostly needed in tracing for the case of walking pt_regs
-       or other kernel structures where CPU's register width matters.
-       Otherwise, clang -target bpf is generally recommended.
-
-   You should use default target when:
-
-     - Your program includes a header file, e.g., ptrace.h, which eventually
-       pulls in some header files containing file scope host assembly codes.
-     - You can add "-fno-jump-tables" to work around the switch table issue.
-
-   Otherwise, you can use bpf target. Additionally, you _must_ use bpf target
-   when:
-
-     - Your program uses data structures with pointer or long / unsigned long
-       types that interface with BPF helpers or context data structures. Access
-       into these structures is verified by the BPF verifier and may result
-       in verification failures if the native architecture is not aligned with
-       the BPF architecture, e.g. 64-bit. An example of this is
-       BPF_PROG_TYPE_SK_MSG require '-target bpf'
+independent, ``-target <arch>`` still has some impact on generated code:
+
+- BPF program may recursively include header file(s) with file scope
+  inline assembly codes. The default target can handle this well,
+  while ``bpf`` target may fail if bpf backend assembler does not
+  understand these assembly codes, which is true in most cases.
+
+- When compiled without ``-g``, additional elf sections, e.g.,
+  .eh_frame and .rela.eh_frame, may be present in the object file
+  with default target, but not with ``bpf`` target.
+
+- The default target may turn a C switch statement into a switch table
+  lookup and jump operation. Since the switch table is placed
+  in the global readonly section, the bpf program will fail to load.
+  The bpf target does not support switch table optimization.
+  The clang option ``-fno-jump-tables`` can be used to disable
+  switch table generation.
+
+- For clang ``-target bpf``, it is guaranteed that pointer or long /
+  unsigned long types will always have a width of 64 bit, no matter
+  whether underlying clang binary or default target (or kernel) is
+  32 bit. However, when native clang target is used, then it will
+  compile these types based on the underlying architecture's conventions,
+  meaning in case of 32 bit architecture, pointer or long / unsigned
+  long types e.g. in BPF context structure will have width of 32 bit
+  while the BPF LLVM back end still operates in 64 bit. The native
+  target is mostly needed in tracing for the case of walking ``pt_regs``
+  or other kernel structures where CPU's register width matters.
+  Otherwise, ``clang -target bpf`` is generally recommended.
+
+You should use default target when:
+
+- Your program includes a header file, e.g., ptrace.h, which eventually
+  pulls in some header files containing file scope host assembly codes.
+
+- You can add ``-fno-jump-tables`` to work around the switch table issue.
+
+Otherwise, you can use ``bpf`` target. Additionally, you *must* use bpf target
+when:
+
+- Your program uses data structures with pointer or long / unsigned long
+  types that interface with BPF helpers or context data structures. Access
+  into these structures is verified by the BPF verifier and may result
+  in verification failures if the native architecture is not aligned with
+  the BPF architecture, e.g. 64-bit. An example of this is
+  BPF_PROG_TYPE_SK_MSG require ``-target bpf``
+
+
+.. Links
+.. _Documentation/process/: https://www.kernel.org/doc/html/latest/process/
+.. _MAINTAINERS: ../../MAINTAINERS
+.. _Documentation/networking/netdev-FAQ.txt: ../networking/netdev-FAQ.txt
+.. _netdev FAQ: ../networking/netdev-FAQ.txt
+.. _samples/bpf/: ../../samples/bpf/
+.. _selftests: ../../tools/testing/selftests/bpf/
 
 Happy BPF hacking!
-- 
cgit v1.2.3-59-g8ed1b


From b7a27c3aafa252d1e416c223cb97c123de4ed28a Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Mon, 14 May 2018 15:42:32 +0200
Subject: bpf, doc: howto use/run the BPF selftests

I always forget howto run the BPF selftests. Thus, lets add that info
to the QA document.

Documentation was based on Cilium's documentation:
 http://cilium.readthedocs.io/en/latest/bpf/#verifying-the-setup

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/bpf/bpf_devel_QA.rst | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/Documentation/bpf/bpf_devel_QA.rst b/Documentation/bpf/bpf_devel_QA.rst
index 2254bdeae990..0e7c1d946e83 100644
--- a/Documentation/bpf/bpf_devel_QA.rst
+++ b/Documentation/bpf/bpf_devel_QA.rst
@@ -417,6 +417,33 @@ submitted by the BPF maintainers to the stable maintainers.
 Testing patches
 ===============
 
+Q: How to run BPF selftests
+---------------------------
+A: After you have booted into the newly compiled kernel, navigate to
+the BPF selftests_ suite in order to test BPF functionality (current
+working directory points to the root of the cloned git tree)::
+
+  $ cd tools/testing/selftests/bpf/
+  $ make
+
+To run the verifier tests::
+
+  $ sudo ./test_verifier
+
+The verifier tests print out all the current checks being
+performed. The summary at the end of running all tests will dump
+information of test successes and failures::
+
+  Summary: 418 PASSED, 0 FAILED
+
+In order to run through all BPF selftests, the following command is
+needed::
+
+  $ sudo make run_tests
+
+See the kernels selftest `Documentation/dev-tools/kselftest.rst`_
+document for further documentation.
+
 Q: Which BPF kernel selftests version should I run my kernel against?
 ---------------------------------------------------------------------
 A: If you run a kernel ``xyz``, then always run the BPF kernel selftests
@@ -607,5 +634,7 @@ when:
 .. _netdev FAQ: ../networking/netdev-FAQ.txt
 .. _samples/bpf/: ../../samples/bpf/
 .. _selftests: ../../tools/testing/selftests/bpf/
+.. _Documentation/dev-tools/kselftest.rst:
+   https://www.kernel.org/doc/html/latest/dev-tools/kselftest.html
 
 Happy BPF hacking!
-- 
cgit v1.2.3-59-g8ed1b


From f2467c2dbc019548052f3a64dc1efd01c0ae27aa Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 14 May 2018 17:11:29 -0700
Subject: selftests/bpf: make sure build-id is on

--build-id may not be a default linker config.
Make sure it's used when linking urandom_read test program.
Otherwise test_stacktrace_build_id[_nmi] tests will be failling.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 438d4f93875b..133ebc68cbe4 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -19,7 +19,7 @@ all: $(TEST_CUSTOM_PROGS)
 $(TEST_CUSTOM_PROGS): urandom_read
 
 urandom_read: urandom_read.c
-	$(CC) -o $(TEST_CUSTOM_PROGS) -static $<
+	$(CC) -o $(TEST_CUSTOM_PROGS) -static $< -Wl,--build-id
 
 # Order correspond to 'make run_tests' order
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
-- 
cgit v1.2.3-59-g8ed1b


From 763ece85f45a6b93268e25a0abf02922f911dab4 Mon Sep 17 00:00:00 2001
From: Franky Lin <franky.lin@broadcom.com>
Date: Tue, 15 May 2018 11:14:44 +0200
Subject: brcmfmac: fix initialization of struct cfg80211_inform_bss variable

This patch fixes a sparse warning "Using plain integer as NULL pointer"
about cfg80211_inform_bss structure initialization.

Reported-by: kbuild test robot <lkp@intel.com>
Reviewed-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Franky Lin <franky.lin@broadcom.com>
Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
index 84135dac49b3..f5b405c98047 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
@@ -2737,7 +2737,7 @@ static s32 brcmf_inform_single_bss(struct brcmf_cfg80211_info *cfg,
 	u16 notify_interval;
 	u8 *notify_ie;
 	size_t notify_ielen;
-	struct cfg80211_inform_bss bss_data = { 0 };
+	struct cfg80211_inform_bss bss_data = {};
 
 	if (le32_to_cpu(bi->length) > WL_BSS_INFO_MAX) {
 		brcmf_err("Bss info is larger than buffer. Discarding\n");
-- 
cgit v1.2.3-59-g8ed1b


From e5cd3abcb31a48d4ea91bd32f0618802ca5f3592 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 14 May 2018 10:00:16 -0700
Subject: bpf: sockmap, refactor sockmap routines to work with hashmap

This patch only refactors the existing sockmap code. This will allow
much of the psock initialization code path and bpf helper codes to
work for both sockmap bpf map types that are backed by an array, the
currently supported type, and the new hash backed bpf map type
sockhash.

Most the fallout comes from three changes,

  - Pushing bpf programs into an independent structure so we
    can use it from the htab struct in the next patch.
  - Generalizing helpers to use void *key instead of the hardcoded
    u32.
  - Instead of passing map/key through the metadata we now do
    the lookup inline. This avoids storing the key in the metadata
    which will be useful when keys can be longer than 4 bytes. We
    rename the sk pointers to sk_redir at this point as well to
    avoid any confusion between the current sk pointer and the
    redirect pointer sk_redir.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/filter.h |   3 +-
 include/net/tcp.h      |   3 +-
 kernel/bpf/sockmap.c   | 148 +++++++++++++++++++++++++++++--------------------
 net/core/filter.c      |  31 +++--------
 4 files changed, 98 insertions(+), 87 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index da7e16523128..9dbcb9d55921 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -515,9 +515,8 @@ struct sk_msg_buff {
 	int sg_end;
 	struct scatterlist sg_data[MAX_SKB_FRAGS];
 	bool sg_copy[MAX_SKB_FRAGS];
-	__u32 key;
 	__u32 flags;
-	struct bpf_map *map;
+	struct sock *sk_redir;
 	struct sk_buff *skb;
 	struct list_head list;
 };
diff --git a/include/net/tcp.h b/include/net/tcp.h
index cf803fe0fb86..059287374ba0 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -814,9 +814,8 @@ struct tcp_skb_cb {
 #endif
 		} header;	/* For incoming skbs */
 		struct {
-			__u32 key;
 			__u32 flags;
-			struct bpf_map *map;
+			struct sock *sk_redir;
 			void *data_end;
 		} bpf;
 	};
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 098eca568c2b..beab9ec9b023 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -48,14 +48,18 @@
 #define SOCK_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
 
-struct bpf_stab {
-	struct bpf_map map;
-	struct sock **sock_map;
+struct bpf_sock_progs {
 	struct bpf_prog *bpf_tx_msg;
 	struct bpf_prog *bpf_parse;
 	struct bpf_prog *bpf_verdict;
 };
 
+struct bpf_stab {
+	struct bpf_map map;
+	struct sock **sock_map;
+	struct bpf_sock_progs progs;
+};
+
 enum smap_psock_state {
 	SMAP_TX_RUNNING,
 };
@@ -461,7 +465,7 @@ static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md)
 static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md)
 {
 	return ((_rc == SK_PASS) ?
-	       (md->map ? __SK_REDIRECT : __SK_PASS) :
+	       (md->sk_redir ? __SK_REDIRECT : __SK_PASS) :
 	       __SK_DROP);
 }
 
@@ -1092,7 +1096,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 	 * when we orphan the skb so that we don't have the possibility
 	 * to reference a stale map.
 	 */
-	TCP_SKB_CB(skb)->bpf.map = NULL;
+	TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
 	skb->sk = psock->sock;
 	bpf_compute_data_pointers(skb);
 	preempt_disable();
@@ -1102,7 +1106,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 
 	/* Moving return codes from UAPI namespace into internal namespace */
 	return rc == SK_PASS ?
-		(TCP_SKB_CB(skb)->bpf.map ? __SK_REDIRECT : __SK_PASS) :
+		(TCP_SKB_CB(skb)->bpf.sk_redir ? __SK_REDIRECT : __SK_PASS) :
 		__SK_DROP;
 }
 
@@ -1372,7 +1376,6 @@ static int smap_init_sock(struct smap_psock *psock,
 }
 
 static void smap_init_progs(struct smap_psock *psock,
-			    struct bpf_stab *stab,
 			    struct bpf_prog *verdict,
 			    struct bpf_prog *parse)
 {
@@ -1450,14 +1453,13 @@ static void smap_gc_work(struct work_struct *w)
 	kfree(psock);
 }
 
-static struct smap_psock *smap_init_psock(struct sock *sock,
-					  struct bpf_stab *stab)
+static struct smap_psock *smap_init_psock(struct sock *sock, int node)
 {
 	struct smap_psock *psock;
 
 	psock = kzalloc_node(sizeof(struct smap_psock),
 			     GFP_ATOMIC | __GFP_NOWARN,
-			     stab->map.numa_node);
+			     node);
 	if (!psock)
 		return ERR_PTR(-ENOMEM);
 
@@ -1662,40 +1664,26 @@ out:
  *  - sock_map must use READ_ONCE and (cmp)xchg operations
  *  - BPF verdict/parse programs must use READ_ONCE and xchg operations
  */
-static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
-				    struct bpf_map *map,
-				    void *key, u64 flags)
+
+static int __sock_map_ctx_update_elem(struct bpf_map *map,
+				      struct bpf_sock_progs *progs,
+				      struct sock *sock,
+				      struct sock **map_link,
+				      void *key)
 {
-	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-	struct smap_psock_map_entry *e = NULL;
 	struct bpf_prog *verdict, *parse, *tx_msg;
-	struct sock *osock, *sock;
+	struct smap_psock_map_entry *e = NULL;
 	struct smap_psock *psock;
-	u32 i = *(u32 *)key;
 	bool new = false;
 	int err;
 
-	if (unlikely(flags > BPF_EXIST))
-		return -EINVAL;
-
-	if (unlikely(i >= stab->map.max_entries))
-		return -E2BIG;
-
-	sock = READ_ONCE(stab->sock_map[i]);
-	if (flags == BPF_EXIST && !sock)
-		return -ENOENT;
-	else if (flags == BPF_NOEXIST && sock)
-		return -EEXIST;
-
-	sock = skops->sk;
-
 	/* 1. If sock map has BPF programs those will be inherited by the
 	 * sock being added. If the sock is already attached to BPF programs
 	 * this results in an error.
 	 */
-	verdict = READ_ONCE(stab->bpf_verdict);
-	parse = READ_ONCE(stab->bpf_parse);
-	tx_msg = READ_ONCE(stab->bpf_tx_msg);
+	verdict = READ_ONCE(progs->bpf_verdict);
+	parse = READ_ONCE(progs->bpf_parse);
+	tx_msg = READ_ONCE(progs->bpf_tx_msg);
 
 	if (parse && verdict) {
 		/* bpf prog refcnt may be zero if a concurrent attach operation
@@ -1703,11 +1691,11 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 		 * we increment the refcnt. If this is the case abort with an
 		 * error.
 		 */
-		verdict = bpf_prog_inc_not_zero(stab->bpf_verdict);
+		verdict = bpf_prog_inc_not_zero(progs->bpf_verdict);
 		if (IS_ERR(verdict))
 			return PTR_ERR(verdict);
 
-		parse = bpf_prog_inc_not_zero(stab->bpf_parse);
+		parse = bpf_prog_inc_not_zero(progs->bpf_parse);
 		if (IS_ERR(parse)) {
 			bpf_prog_put(verdict);
 			return PTR_ERR(parse);
@@ -1715,7 +1703,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 	}
 
 	if (tx_msg) {
-		tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg);
+		tx_msg = bpf_prog_inc_not_zero(progs->bpf_tx_msg);
 		if (IS_ERR(tx_msg)) {
 			if (verdict)
 				bpf_prog_put(verdict);
@@ -1748,7 +1736,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 			goto out_progs;
 		}
 	} else {
-		psock = smap_init_psock(sock, stab);
+		psock = smap_init_psock(sock, map->numa_node);
 		if (IS_ERR(psock)) {
 			err = PTR_ERR(psock);
 			goto out_progs;
@@ -1763,7 +1751,6 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 		err = -ENOMEM;
 		goto out_progs;
 	}
-	e->entry = &stab->sock_map[i];
 
 	/* 3. At this point we have a reference to a valid psock that is
 	 * running. Attach any BPF programs needed.
@@ -1780,7 +1767,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 		err = smap_init_sock(psock, sock);
 		if (err)
 			goto out_free;
-		smap_init_progs(psock, stab, verdict, parse);
+		smap_init_progs(psock, verdict, parse);
 		smap_start_sock(psock, sock);
 	}
 
@@ -1789,19 +1776,12 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 	 * it with. Because we can only have a single set of programs if
 	 * old_sock has a strp we can stop it.
 	 */
-	list_add_tail(&e->list, &psock->maps);
-	write_unlock_bh(&sock->sk_callback_lock);
-
-	osock = xchg(&stab->sock_map[i], sock);
-	if (osock) {
-		struct smap_psock *opsock = smap_psock_sk(osock);
-
-		write_lock_bh(&osock->sk_callback_lock);
-		smap_list_remove(opsock, &stab->sock_map[i]);
-		smap_release_sock(opsock, osock);
-		write_unlock_bh(&osock->sk_callback_lock);
+	if (map_link) {
+		e->entry = map_link;
+		list_add_tail(&e->list, &psock->maps);
 	}
-	return 0;
+	write_unlock_bh(&sock->sk_callback_lock);
+	return err;
 out_free:
 	smap_release_sock(psock, sock);
 out_progs:
@@ -1816,23 +1796,69 @@ out_progs:
 	return err;
 }
 
-int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
+static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
+				    struct bpf_map *map,
+				    void *key, u64 flags)
 {
 	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	struct bpf_sock_progs *progs = &stab->progs;
+	struct sock *osock, *sock;
+	u32 i = *(u32 *)key;
+	int err;
+
+	if (unlikely(flags > BPF_EXIST))
+		return -EINVAL;
+
+	if (unlikely(i >= stab->map.max_entries))
+		return -E2BIG;
+
+	sock = READ_ONCE(stab->sock_map[i]);
+	if (flags == BPF_EXIST && !sock)
+		return -ENOENT;
+	else if (flags == BPF_NOEXIST && sock)
+		return -EEXIST;
+
+	sock = skops->sk;
+	err = __sock_map_ctx_update_elem(map, progs, sock, &stab->sock_map[i],
+					 key);
+	if (err)
+		goto out;
+
+	osock = xchg(&stab->sock_map[i], sock);
+	if (osock) {
+		struct smap_psock *opsock = smap_psock_sk(osock);
+
+		write_lock_bh(&osock->sk_callback_lock);
+		smap_list_remove(opsock, &stab->sock_map[i]);
+		smap_release_sock(opsock, osock);
+		write_unlock_bh(&osock->sk_callback_lock);
+	}
+out:
+	return 0;
+}
+
+int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
+{
+	struct bpf_sock_progs *progs;
 	struct bpf_prog *orig;
 
-	if (unlikely(map->map_type != BPF_MAP_TYPE_SOCKMAP))
+	if (map->map_type == BPF_MAP_TYPE_SOCKMAP) {
+		struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+		progs = &stab->progs;
+	} else {
 		return -EINVAL;
+	}
 
 	switch (type) {
 	case BPF_SK_MSG_VERDICT:
-		orig = xchg(&stab->bpf_tx_msg, prog);
+		orig = xchg(&progs->bpf_tx_msg, prog);
 		break;
 	case BPF_SK_SKB_STREAM_PARSER:
-		orig = xchg(&stab->bpf_parse, prog);
+		orig = xchg(&progs->bpf_parse, prog);
 		break;
 	case BPF_SK_SKB_STREAM_VERDICT:
-		orig = xchg(&stab->bpf_verdict, prog);
+		orig = xchg(&progs->bpf_verdict, prog);
 		break;
 	default:
 		return -EOPNOTSUPP;
@@ -1881,16 +1907,18 @@ static int sock_map_update_elem(struct bpf_map *map,
 static void sock_map_release(struct bpf_map *map)
 {
 	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	struct bpf_sock_progs *progs;
 	struct bpf_prog *orig;
 
-	orig = xchg(&stab->bpf_parse, NULL);
+	progs = &stab->progs;
+	orig = xchg(&progs->bpf_parse, NULL);
 	if (orig)
 		bpf_prog_put(orig);
-	orig = xchg(&stab->bpf_verdict, NULL);
+	orig = xchg(&progs->bpf_verdict, NULL);
 	if (orig)
 		bpf_prog_put(orig);
 
-	orig = xchg(&stab->bpf_tx_msg, NULL);
+	orig = xchg(&progs->bpf_tx_msg, NULL);
 	if (orig)
 		bpf_prog_put(orig);
 }
diff --git a/net/core/filter.c b/net/core/filter.c
index ca60d2872da4..61a3ed6bac25 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2083,9 +2083,10 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
 	if (unlikely(flags & ~(BPF_F_INGRESS)))
 		return SK_DROP;
 
-	tcb->bpf.key = key;
 	tcb->bpf.flags = flags;
-	tcb->bpf.map = map;
+	tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key);
+	if (!tcb->bpf.sk_redir)
+		return SK_DROP;
 
 	return SK_PASS;
 }
@@ -2093,16 +2094,8 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
 struct sock *do_sk_redirect_map(struct sk_buff *skb)
 {
 	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
-	struct sock *sk = NULL;
-
-	if (tcb->bpf.map) {
-		sk = __sock_map_lookup_elem(tcb->bpf.map, tcb->bpf.key);
 
-		tcb->bpf.key = 0;
-		tcb->bpf.map = NULL;
-	}
-
-	return sk;
+	return tcb->bpf.sk_redir;
 }
 
 static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
@@ -2122,25 +2115,17 @@ BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
 	if (unlikely(flags & ~(BPF_F_INGRESS)))
 		return SK_DROP;
 
-	msg->key = key;
 	msg->flags = flags;
-	msg->map = map;
+	msg->sk_redir = __sock_map_lookup_elem(map, key);
+	if (!msg->sk_redir)
+		return SK_DROP;
 
 	return SK_PASS;
 }
 
 struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
 {
-	struct sock *sk = NULL;
-
-	if (msg->map) {
-		sk = __sock_map_lookup_elem(msg->map, msg->key);
-
-		msg->key = 0;
-		msg->map = NULL;
-	}
-
-	return sk;
+	return msg->sk_redir;
 }
 
 static const struct bpf_func_proto bpf_msg_redirect_map_proto = {
-- 
cgit v1.2.3-59-g8ed1b


From 81110384441a59cff47430f20f049e69b98c17f4 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 14 May 2018 10:00:17 -0700
Subject: bpf: sockmap, add hash map support

Sockmap is currently backed by an array and enforces keys to be
four bytes. This works well for many use cases and was originally
modeled after devmap which also uses four bytes keys. However,
this has become limiting in larger use cases where a hash would
be more appropriate. For example users may want to use the 5-tuple
of the socket as the lookup key.

To support this add hash support.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h       |   8 +
 include/linux/bpf_types.h |   1 +
 include/uapi/linux/bpf.h  |  54 ++++-
 kernel/bpf/core.c         |   1 +
 kernel/bpf/sockmap.c      | 494 ++++++++++++++++++++++++++++++++++++++++++++--
 kernel/bpf/verifier.c     |  14 +-
 net/core/filter.c         |  58 ++++++
 7 files changed, 611 insertions(+), 19 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a38e474bf7ee..ed0122b45b63 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -668,6 +668,7 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map)
 
 #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET)
 struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key);
+struct sock  *__sock_hash_lookup_elem(struct bpf_map *map, void *key);
 int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type);
 #else
 static inline struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
@@ -675,6 +676,12 @@ static inline struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
 	return NULL;
 }
 
+static inline struct sock  *__sock_hash_lookup_elem(struct bpf_map *map,
+						    void *key)
+{
+	return NULL;
+}
+
 static inline int sock_map_prog(struct bpf_map *map,
 				struct bpf_prog *prog,
 				u32 type)
@@ -724,6 +731,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
 extern const struct bpf_func_proto bpf_get_stack_proto;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;
+extern const struct bpf_func_proto bpf_sock_hash_update_proto;
 
 /* Shared helpers among cBPF and eBPF. */
 void bpf_user_rnd_init_once(void);
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index d7df1b323082..b67f8793de0d 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -47,6 +47,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
 #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_INET)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
 #if defined(CONFIG_XDP_SOCKETS)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 02e4112510f8..d94d333a8225 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -118,6 +118,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_SOCKMAP,
 	BPF_MAP_TYPE_CPUMAP,
 	BPF_MAP_TYPE_XSKMAP,
+	BPF_MAP_TYPE_SOCKHASH,
 };
 
 enum bpf_prog_type {
@@ -1828,7 +1829,6 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- *
  * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags)
  *	Description
  *		Do FIB lookup in kernel tables using parameters in *params*.
@@ -1855,6 +1855,53 @@ union bpf_attr {
  *             Egress device index on success, 0 if packet needs to continue
  *             up the stack for further processing or a negative error in case
  *             of failure.
+ *
+ * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
+ *	Description
+ *		Add an entry to, or update a sockhash *map* referencing sockets.
+ *		The *skops* is used as a new value for the entry associated to
+ *		*key*. *flags* is one of:
+ *
+ *		**BPF_NOEXIST**
+ *			The entry for *key* must not exist in the map.
+ *		**BPF_EXIST**
+ *			The entry for *key* must already exist in the map.
+ *		**BPF_ANY**
+ *			No condition on the existence of the entry for *key*.
+ *
+ *		If the *map* has eBPF programs (parser and verdict), those will
+ *		be inherited by the socket being added. If the socket is
+ *		already attached to eBPF programs, this results in an error.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags)
+ *	Description
+ *		This helper is used in programs implementing policies at the
+ *		socket level. If the message *msg* is allowed to pass (i.e. if
+ *		the verdict eBPF program returns **SK_PASS**), redirect it to
+ *		the socket referenced by *map* (of type
+ *		**BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
+ *		egress interfaces can be used for redirection. The
+ *		**BPF_F_INGRESS** value in *flags* is used to make the
+ *		distinction (ingress path is selected if the flag is present,
+ *		egress path otherwise). This is the only flag supported for now.
+ *	Return
+ *		**SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags)
+ *	Description
+ *		This helper is used in programs implementing policies at the
+ *		skb socket level. If the sk_buff *skb* is allowed to pass (i.e.
+ *		if the verdeict eBPF program returns **SK_PASS**), redirect it
+ *		to the socket referenced by *map* (of type
+ *		**BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
+ *		egress interfaces can be used for redirection. The
+ *		**BPF_F_INGRESS** value in *flags* is used to make the
+ *		distinction (ingress path is selected if the flag is present,
+ *		egress otherwise). This is the only flag supported for now.
+ *	Return
+ *		**SK_PASS** on success, or **SK_DROP** on error.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1926,7 +1973,10 @@ union bpf_attr {
 	FN(skb_get_xfrm_state),		\
 	FN(get_stack),			\
 	FN(skb_load_bytes_relative),	\
-	FN(fib_lookup),
+	FN(fib_lookup),			\
+	FN(sock_hash_update),		\
+	FN(msg_redirect_hash),		\
+	FN(sk_redirect_hash),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d0d7d9462368..2194c6a9df42 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1707,6 +1707,7 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
 const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
 const struct bpf_func_proto bpf_get_current_comm_proto __weak;
 const struct bpf_func_proto bpf_sock_map_update_proto __weak;
+const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
 
 const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
 {
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index beab9ec9b023..56879c9fd3a4 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -60,6 +60,28 @@ struct bpf_stab {
 	struct bpf_sock_progs progs;
 };
 
+struct bucket {
+	struct hlist_head head;
+	raw_spinlock_t lock;
+};
+
+struct bpf_htab {
+	struct bpf_map map;
+	struct bucket *buckets;
+	atomic_t count;
+	u32 n_buckets;
+	u32 elem_size;
+	struct bpf_sock_progs progs;
+};
+
+struct htab_elem {
+	struct rcu_head rcu;
+	struct hlist_node hash_node;
+	u32 hash;
+	struct sock *sk;
+	char key[0];
+};
+
 enum smap_psock_state {
 	SMAP_TX_RUNNING,
 };
@@ -67,6 +89,8 @@ enum smap_psock_state {
 struct smap_psock_map_entry {
 	struct list_head list;
 	struct sock **entry;
+	struct htab_elem *hash_link;
+	struct bpf_htab *htab;
 };
 
 struct smap_psock {
@@ -195,6 +219,12 @@ out:
 	rcu_read_unlock();
 }
 
+static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
+{
+	atomic_dec(&htab->count);
+	kfree_rcu(l, rcu);
+}
+
 static void bpf_tcp_close(struct sock *sk, long timeout)
 {
 	void (*close_fun)(struct sock *sk, long timeout);
@@ -231,10 +261,16 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
 	}
 
 	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-		osk = cmpxchg(e->entry, sk, NULL);
-		if (osk == sk) {
-			list_del(&e->list);
-			smap_release_sock(psock, sk);
+		if (e->entry) {
+			osk = cmpxchg(e->entry, sk, NULL);
+			if (osk == sk) {
+				list_del(&e->list);
+				smap_release_sock(psock, sk);
+			}
+		} else {
+			hlist_del_rcu(&e->hash_link->hash_node);
+			smap_release_sock(psock, e->hash_link->sk);
+			free_htab_elem(e->htab, e->hash_link);
 		}
 	}
 	write_unlock_bh(&sk->sk_callback_lock);
@@ -1527,12 +1563,14 @@ free_stab:
 	return ERR_PTR(err);
 }
 
-static void smap_list_remove(struct smap_psock *psock, struct sock **entry)
+static void smap_list_remove(struct smap_psock *psock,
+			     struct sock **entry,
+			     struct htab_elem *hash_link)
 {
 	struct smap_psock_map_entry *e, *tmp;
 
 	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-		if (e->entry == entry) {
+		if (e->entry == entry || e->hash_link == hash_link) {
 			list_del(&e->list);
 			break;
 		}
@@ -1570,7 +1608,7 @@ static void sock_map_free(struct bpf_map *map)
 		 * to be null and queued for garbage collection.
 		 */
 		if (likely(psock)) {
-			smap_list_remove(psock, &stab->sock_map[i]);
+			smap_list_remove(psock, &stab->sock_map[i], NULL);
 			smap_release_sock(psock, sock);
 		}
 		write_unlock_bh(&sock->sk_callback_lock);
@@ -1629,7 +1667,7 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
 
 	if (psock->bpf_parse)
 		smap_stop_sock(psock, sock);
-	smap_list_remove(psock, &stab->sock_map[k]);
+	smap_list_remove(psock, &stab->sock_map[k], NULL);
 	smap_release_sock(psock, sock);
 out:
 	write_unlock_bh(&sock->sk_callback_lock);
@@ -1746,10 +1784,12 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
 		new = true;
 	}
 
-	e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
-	if (!e) {
-		err = -ENOMEM;
-		goto out_progs;
+	if (map_link) {
+		e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
+		if (!e) {
+			err = -ENOMEM;
+			goto out_progs;
+		}
 	}
 
 	/* 3. At this point we have a reference to a valid psock that is
@@ -1783,6 +1823,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
 	write_unlock_bh(&sock->sk_callback_lock);
 	return err;
 out_free:
+	kfree(e);
 	smap_release_sock(psock, sock);
 out_progs:
 	if (verdict)
@@ -1829,7 +1870,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 		struct smap_psock *opsock = smap_psock_sk(osock);
 
 		write_lock_bh(&osock->sk_callback_lock);
-		smap_list_remove(opsock, &stab->sock_map[i]);
+		smap_list_remove(opsock, &stab->sock_map[i], NULL);
 		smap_release_sock(opsock, osock);
 		write_unlock_bh(&osock->sk_callback_lock);
 	}
@@ -1846,6 +1887,10 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
 		struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
 
 		progs = &stab->progs;
+	} else if (map->map_type == BPF_MAP_TYPE_SOCKHASH) {
+		struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+		progs = &htab->progs;
 	} else {
 		return -EINVAL;
 	}
@@ -1906,11 +1951,19 @@ static int sock_map_update_elem(struct bpf_map *map,
 
 static void sock_map_release(struct bpf_map *map)
 {
-	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
 	struct bpf_sock_progs *progs;
 	struct bpf_prog *orig;
 
-	progs = &stab->progs;
+	if (map->map_type == BPF_MAP_TYPE_SOCKMAP) {
+		struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+		progs = &stab->progs;
+	} else {
+		struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+		progs = &htab->progs;
+	}
+
 	orig = xchg(&progs->bpf_parse, NULL);
 	if (orig)
 		bpf_prog_put(orig);
@@ -1923,6 +1976,390 @@ static void sock_map_release(struct bpf_map *map)
 		bpf_prog_put(orig);
 }
 
+static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
+{
+	struct bpf_htab *htab;
+	int i, err;
+	u64 cost;
+
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->value_size != 4 ||
+	    attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
+		return ERR_PTR(-EINVAL);
+
+	err = bpf_tcp_ulp_register();
+	if (err && err != -EEXIST)
+		return ERR_PTR(err);
+
+	htab = kzalloc(sizeof(*htab), GFP_USER);
+	if (!htab)
+		return ERR_PTR(-ENOMEM);
+
+	bpf_map_init_from_attr(&htab->map, attr);
+
+	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
+	htab->elem_size = sizeof(struct htab_elem) +
+			  round_up(htab->map.key_size, 8);
+	err = -EINVAL;
+	if (htab->n_buckets == 0 ||
+	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
+		goto free_htab;
+
+	cost = (u64) htab->n_buckets * sizeof(struct bucket) +
+	       (u64) htab->elem_size * htab->map.max_entries;
+
+	if (cost >= U32_MAX - PAGE_SIZE)
+		goto free_htab;
+
+	htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+	err = bpf_map_precharge_memlock(htab->map.pages);
+	if (err)
+		goto free_htab;
+
+	err = -ENOMEM;
+	htab->buckets = bpf_map_area_alloc(
+				htab->n_buckets * sizeof(struct bucket),
+				htab->map.numa_node);
+	if (!htab->buckets)
+		goto free_htab;
+
+	for (i = 0; i < htab->n_buckets; i++) {
+		INIT_HLIST_HEAD(&htab->buckets[i].head);
+		raw_spin_lock_init(&htab->buckets[i].lock);
+	}
+
+	return &htab->map;
+free_htab:
+	kfree(htab);
+	return ERR_PTR(err);
+}
+
+static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
+{
+	return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+	return &__select_bucket(htab, hash)->head;
+}
+
+static void sock_hash_free(struct bpf_map *map)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	int i;
+
+	synchronize_rcu();
+
+	/* At this point no update, lookup or delete operations can happen.
+	 * However, be aware we can still get a socket state event updates,
+	 * and data ready callabacks that reference the psock from sk_user_data
+	 * Also psock worker threads are still in-flight. So smap_release_sock
+	 * will only free the psock after cancel_sync on the worker threads
+	 * and a grace period expire to ensure psock is really safe to remove.
+	 */
+	rcu_read_lock();
+	for (i = 0; i < htab->n_buckets; i++) {
+		struct hlist_head *head = select_bucket(htab, i);
+		struct hlist_node *n;
+		struct htab_elem *l;
+
+		hlist_for_each_entry_safe(l, n, head, hash_node) {
+			struct sock *sock = l->sk;
+			struct smap_psock *psock;
+
+			hlist_del_rcu(&l->hash_node);
+			write_lock_bh(&sock->sk_callback_lock);
+			psock = smap_psock_sk(sock);
+			/* This check handles a racing sock event that can get
+			 * the sk_callback_lock before this case but after xchg
+			 * causing the refcnt to hit zero and sock user data
+			 * (psock) to be null and queued for garbage collection.
+			 */
+			if (likely(psock)) {
+				smap_list_remove(psock, NULL, l);
+				smap_release_sock(psock, sock);
+			}
+			write_unlock_bh(&sock->sk_callback_lock);
+			kfree(l);
+		}
+	}
+	rcu_read_unlock();
+	bpf_map_area_free(htab->buckets);
+	kfree(htab);
+}
+
+static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
+					      void *key, u32 key_size, u32 hash,
+					      struct sock *sk,
+					      struct htab_elem *old_elem)
+{
+	struct htab_elem *l_new;
+
+	if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
+		if (!old_elem) {
+			atomic_dec(&htab->count);
+			return ERR_PTR(-E2BIG);
+		}
+	}
+	l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
+			     htab->map.numa_node);
+	if (!l_new)
+		return ERR_PTR(-ENOMEM);
+
+	memcpy(l_new->key, key, key_size);
+	l_new->sk = sk;
+	l_new->hash = hash;
+	return l_new;
+}
+
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
+					 u32 hash, void *key, u32 key_size)
+{
+	struct htab_elem *l;
+
+	hlist_for_each_entry_rcu(l, head, hash_node) {
+		if (l->hash == hash && !memcmp(&l->key, key, key_size))
+			return l;
+	}
+
+	return NULL;
+}
+
+static inline u32 htab_map_hash(const void *key, u32 key_len)
+{
+	return jhash(key, key_len, 0);
+}
+
+static int sock_hash_get_next_key(struct bpf_map *map,
+				  void *key, void *next_key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct htab_elem *l, *next_l;
+	struct hlist_head *h;
+	u32 hash, key_size;
+	int i = 0;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	key_size = map->key_size;
+	if (!key)
+		goto find_first_elem;
+	hash = htab_map_hash(key, key_size);
+	h = select_bucket(htab, hash);
+
+	l = lookup_elem_raw(h, hash, key, key_size);
+	if (!l)
+		goto find_first_elem;
+	next_l = hlist_entry_safe(
+		     rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+		     struct htab_elem, hash_node);
+	if (next_l) {
+		memcpy(next_key, next_l->key, key_size);
+		return 0;
+	}
+
+	/* no more elements in this hash list, go to the next bucket */
+	i = hash & (htab->n_buckets - 1);
+	i++;
+
+find_first_elem:
+	/* iterate over buckets */
+	for (; i < htab->n_buckets; i++) {
+		h = select_bucket(htab, i);
+
+		/* pick first element in the bucket */
+		next_l = hlist_entry_safe(
+				rcu_dereference_raw(hlist_first_rcu(h)),
+				struct htab_elem, hash_node);
+		if (next_l) {
+			/* if it's not empty, just return it */
+			memcpy(next_key, next_l->key, key_size);
+			return 0;
+		}
+	}
+
+	/* iterated over all buckets and all elements */
+	return -ENOENT;
+}
+
+static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
+				     struct bpf_map *map,
+				     void *key, u64 map_flags)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct bpf_sock_progs *progs = &htab->progs;
+	struct htab_elem *l_new = NULL, *l_old;
+	struct smap_psock_map_entry *e = NULL;
+	struct hlist_head *head;
+	struct smap_psock *psock;
+	u32 key_size, hash;
+	struct sock *sock;
+	struct bucket *b;
+	int err;
+
+	sock = skops->sk;
+
+	if (sock->sk_type != SOCK_STREAM ||
+	    sock->sk_protocol != IPPROTO_TCP)
+		return -EOPNOTSUPP;
+
+	if (unlikely(map_flags > BPF_EXIST))
+		return -EINVAL;
+
+	e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
+	if (!e)
+		return -ENOMEM;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	key_size = map->key_size;
+	hash = htab_map_hash(key, key_size);
+	b = __select_bucket(htab, hash);
+	head = &b->head;
+
+	err = __sock_map_ctx_update_elem(map, progs, sock, NULL, key);
+	if (err)
+		goto err;
+
+	/* bpf_map_update_elem() can be called in_irq() */
+	raw_spin_lock_bh(&b->lock);
+	l_old = lookup_elem_raw(head, hash, key, key_size);
+	if (l_old && map_flags == BPF_NOEXIST) {
+		err = -EEXIST;
+		goto bucket_err;
+	}
+	if (!l_old && map_flags == BPF_EXIST) {
+		err = -ENOENT;
+		goto bucket_err;
+	}
+
+	l_new = alloc_sock_hash_elem(htab, key, key_size, hash, sock, l_old);
+	if (IS_ERR(l_new)) {
+		err = PTR_ERR(l_new);
+		goto bucket_err;
+	}
+
+	psock = smap_psock_sk(sock);
+	if (unlikely(!psock)) {
+		err = -EINVAL;
+		goto bucket_err;
+	}
+
+	e->hash_link = l_new;
+	e->htab = container_of(map, struct bpf_htab, map);
+	list_add_tail(&e->list, &psock->maps);
+
+	/* add new element to the head of the list, so that
+	 * concurrent search will find it before old elem
+	 */
+	hlist_add_head_rcu(&l_new->hash_node, head);
+	if (l_old) {
+		psock = smap_psock_sk(l_old->sk);
+
+		hlist_del_rcu(&l_old->hash_node);
+		smap_list_remove(psock, NULL, l_old);
+		smap_release_sock(psock, l_old->sk);
+		free_htab_elem(htab, l_old);
+	}
+	raw_spin_unlock_bh(&b->lock);
+	return 0;
+bucket_err:
+	raw_spin_unlock_bh(&b->lock);
+err:
+	kfree(e);
+	psock = smap_psock_sk(sock);
+	if (psock)
+		smap_release_sock(psock, sock);
+	return err;
+}
+
+static int sock_hash_update_elem(struct bpf_map *map,
+				void *key, void *value, u64 flags)
+{
+	struct bpf_sock_ops_kern skops;
+	u32 fd = *(u32 *)value;
+	struct socket *socket;
+	int err;
+
+	socket = sockfd_lookup(fd, &err);
+	if (!socket)
+		return err;
+
+	skops.sk = socket->sk;
+	if (!skops.sk) {
+		fput(socket->file);
+		return -EINVAL;
+	}
+
+	err = sock_hash_ctx_update_elem(&skops, map, key, flags);
+	fput(socket->file);
+	return err;
+}
+
+static int sock_hash_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_head *head;
+	struct bucket *b;
+	struct htab_elem *l;
+	u32 hash, key_size;
+	int ret = -ENOENT;
+
+	key_size = map->key_size;
+	hash = htab_map_hash(key, key_size);
+	b = __select_bucket(htab, hash);
+	head = &b->head;
+
+	raw_spin_lock_bh(&b->lock);
+	l = lookup_elem_raw(head, hash, key, key_size);
+	if (l) {
+		struct sock *sock = l->sk;
+		struct smap_psock *psock;
+
+		hlist_del_rcu(&l->hash_node);
+		write_lock_bh(&sock->sk_callback_lock);
+		psock = smap_psock_sk(sock);
+		/* This check handles a racing sock event that can get the
+		 * sk_callback_lock before this case but after xchg happens
+		 * causing the refcnt to hit zero and sock user data (psock)
+		 * to be null and queued for garbage collection.
+		 */
+		if (likely(psock)) {
+			smap_list_remove(psock, NULL, l);
+			smap_release_sock(psock, sock);
+		}
+		write_unlock_bh(&sock->sk_callback_lock);
+		free_htab_elem(htab, l);
+		ret = 0;
+	}
+	raw_spin_unlock_bh(&b->lock);
+	return ret;
+}
+
+struct sock  *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_head *head;
+	struct htab_elem *l;
+	u32 key_size, hash;
+	struct bucket *b;
+	struct sock *sk;
+
+	key_size = map->key_size;
+	hash = htab_map_hash(key, key_size);
+	b = __select_bucket(htab, hash);
+	head = &b->head;
+
+	raw_spin_lock_bh(&b->lock);
+	l = lookup_elem_raw(head, hash, key, key_size);
+	sk = l ? l->sk : NULL;
+	raw_spin_unlock_bh(&b->lock);
+	return sk;
+}
+
 const struct bpf_map_ops sock_map_ops = {
 	.map_alloc = sock_map_alloc,
 	.map_free = sock_map_free,
@@ -1933,6 +2370,15 @@ const struct bpf_map_ops sock_map_ops = {
 	.map_release_uref = sock_map_release,
 };
 
+const struct bpf_map_ops sock_hash_ops = {
+	.map_alloc = sock_hash_alloc,
+	.map_free = sock_hash_free,
+	.map_lookup_elem = sock_map_lookup,
+	.map_get_next_key = sock_hash_get_next_key,
+	.map_update_elem = sock_hash_update_elem,
+	.map_delete_elem = sock_hash_delete_elem,
+};
+
 BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
 	   struct bpf_map *, map, void *, key, u64, flags)
 {
@@ -1950,3 +2396,21 @@ const struct bpf_func_proto bpf_sock_map_update_proto = {
 	.arg3_type	= ARG_PTR_TO_MAP_KEY,
 	.arg4_type	= ARG_ANYTHING,
 };
+
+BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock,
+	   struct bpf_map *, map, void *, key, u64, flags)
+{
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	return sock_hash_ctx_update_elem(bpf_sock, map, key, flags);
+}
+
+const struct bpf_func_proto bpf_sock_hash_update_proto = {
+	.func		= bpf_sock_hash_update,
+	.gpl_only	= false,
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_PTR_TO_MAP_KEY,
+	.arg4_type	= ARG_ANYTHING,
+};
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d92d9c37affd..a9e4b1372da6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2093,6 +2093,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		    func_id != BPF_FUNC_msg_redirect_map)
 			goto error;
 		break;
+	case BPF_MAP_TYPE_SOCKHASH:
+		if (func_id != BPF_FUNC_sk_redirect_hash &&
+		    func_id != BPF_FUNC_sock_hash_update &&
+		    func_id != BPF_FUNC_map_delete_elem &&
+		    func_id != BPF_FUNC_msg_redirect_hash)
+			goto error;
+		break;
 	default:
 		break;
 	}
@@ -2130,11 +2137,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		break;
 	case BPF_FUNC_sk_redirect_map:
 	case BPF_FUNC_msg_redirect_map:
+	case BPF_FUNC_sock_map_update:
 		if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
 			goto error;
 		break;
-	case BPF_FUNC_sock_map_update:
-		if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
+	case BPF_FUNC_sk_redirect_hash:
+	case BPF_FUNC_msg_redirect_hash:
+	case BPF_FUNC_sock_hash_update:
+		if (map->map_type != BPF_MAP_TYPE_SOCKHASH)
 			goto error;
 		break;
 	default:
diff --git a/net/core/filter.c b/net/core/filter.c
index 61a3ed6bac25..6d0d1560bd70 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2074,6 +2074,33 @@ static const struct bpf_func_proto bpf_redirect_proto = {
 	.arg2_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
+	   struct bpf_map *, map, void *, key, u64, flags)
+{
+	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+	/* If user passes invalid input drop the packet. */
+	if (unlikely(flags & ~(BPF_F_INGRESS)))
+		return SK_DROP;
+
+	tcb->bpf.flags = flags;
+	tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key);
+	if (!tcb->bpf.sk_redir)
+		return SK_DROP;
+
+	return SK_PASS;
+}
+
+static const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
+	.func           = bpf_sk_redirect_hash,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_CONST_MAP_PTR,
+	.arg3_type      = ARG_PTR_TO_MAP_KEY,
+	.arg4_type      = ARG_ANYTHING,
+};
+
 BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
 	   struct bpf_map *, map, u32, key, u64, flags)
 {
@@ -2108,6 +2135,31 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
 	.arg4_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg,
+	   struct bpf_map *, map, void *, key, u64, flags)
+{
+	/* If user passes invalid input drop the packet. */
+	if (unlikely(flags & ~(BPF_F_INGRESS)))
+		return SK_DROP;
+
+	msg->flags = flags;
+	msg->sk_redir = __sock_hash_lookup_elem(map, key);
+	if (!msg->sk_redir)
+		return SK_DROP;
+
+	return SK_PASS;
+}
+
+static const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
+	.func           = bpf_msg_redirect_hash,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_CONST_MAP_PTR,
+	.arg3_type      = ARG_PTR_TO_MAP_KEY,
+	.arg4_type      = ARG_ANYTHING,
+};
+
 BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
 	   struct bpf_map *, map, u32, key, u64, flags)
 {
@@ -4502,6 +4554,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_sock_ops_cb_flags_set_proto;
 	case BPF_FUNC_sock_map_update:
 		return &bpf_sock_map_update_proto;
+	case BPF_FUNC_sock_hash_update:
+		return &bpf_sock_hash_update_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -4513,6 +4567,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	switch (func_id) {
 	case BPF_FUNC_msg_redirect_map:
 		return &bpf_msg_redirect_map_proto;
+	case BPF_FUNC_msg_redirect_hash:
+		return &bpf_msg_redirect_hash_proto;
 	case BPF_FUNC_msg_apply_bytes:
 		return &bpf_msg_apply_bytes_proto;
 	case BPF_FUNC_msg_cork_bytes:
@@ -4544,6 +4600,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_socket_uid_proto;
 	case BPF_FUNC_sk_redirect_map:
 		return &bpf_sk_redirect_map_proto;
+	case BPF_FUNC_sk_redirect_hash:
+		return &bpf_sk_redirect_hash_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From cd1436a26718b2c33a290e5db24d1507887626e6 Mon Sep 17 00:00:00 2001
From: Alexandre Belloni <alexandre.belloni@bootlin.com>
Date: Mon, 14 May 2018 22:04:54 +0200
Subject: dt-bindings: net: add DT bindings for Microsemi MIIM

DT bindings for the Microsemi MII Management Controller found on Microsemi
SoCs

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../devicetree/bindings/net/mscc-miim.txt          | 26 ++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/mscc-miim.txt

diff --git a/Documentation/devicetree/bindings/net/mscc-miim.txt b/Documentation/devicetree/bindings/net/mscc-miim.txt
new file mode 100644
index 000000000000..7104679cf59d
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/mscc-miim.txt
@@ -0,0 +1,26 @@
+Microsemi MII Management Controller (MIIM) / MDIO
+=================================================
+
+Properties:
+- compatible: must be "mscc,ocelot-miim"
+- reg: The base address of the MDIO bus controller register bank. Optionally, a
+  second register bank can be defined if there is an associated reset register
+  for internal PHYs
+- #address-cells: Must be <1>.
+- #size-cells: Must be <0>.  MDIO addresses have no size component.
+- interrupts: interrupt specifier (refer to the interrupt binding)
+
+Typically an MDIO bus might have several children.
+
+Example:
+	mdio@107009c {
+		#address-cells = <1>;
+		#size-cells = <0>;
+		compatible = "mscc,ocelot-miim";
+		reg = <0x107009c 0x36>, <0x10700f0 0x8>;
+		interrupts = <14>;
+
+		phy0: ethernet-phy@0 {
+			reg = <0>;
+		};
+	};
-- 
cgit v1.2.3-59-g8ed1b


From 542671fe4d86ad42b132d8814b6847ee1414aba6 Mon Sep 17 00:00:00 2001
From: Alexandre Belloni <alexandre.belloni@bootlin.com>
Date: Mon, 14 May 2018 22:04:55 +0200
Subject: net: phy: mscc-miim: Add MDIO driver

Add a driver for the Microsemi MII Management controller (MIIM) found on
Microsemi SoCs.
On Ocelot, there are two controllers, one is connected to the internal
PHYs, the other one can communicate with external PHYs.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/Kconfig          |   7 ++
 drivers/net/phy/Makefile         |   1 +
 drivers/net/phy/mdio-mscc-miim.c | 197 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 205 insertions(+)
 create mode 100644 drivers/net/phy/mdio-mscc-miim.c

diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index 19a5778a7ec7..0e2305ccc91f 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -123,6 +123,13 @@ config MDIO_MOXART
 	  This driver supports the MDIO interface found in the network
 	  interface units of the MOXA ART SoC
 
+config MDIO_MSCC_MIIM
+	tristate "Microsemi MIIM interface support"
+	depends on HAS_IOMEM
+	help
+	  This driver supports the MIIM (MDIO) interface found in the network
+	  switches of the Microsemi SoCs
+
 config MDIO_OCTEON
 	tristate "Octeon and some ThunderX SOCs MDIO buses"
 	depends on 64BIT
diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile
index 738246960e3b..5805c0b7d60e 100644
--- a/drivers/net/phy/Makefile
+++ b/drivers/net/phy/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_MDIO_GPIO)		+= mdio-gpio.o
 obj-$(CONFIG_MDIO_HISI_FEMAC)	+= mdio-hisi-femac.o
 obj-$(CONFIG_MDIO_I2C)		+= mdio-i2c.o
 obj-$(CONFIG_MDIO_MOXART)	+= mdio-moxart.o
+obj-$(CONFIG_MDIO_MSCC_MIIM)	+= mdio-mscc-miim.o
 obj-$(CONFIG_MDIO_OCTEON)	+= mdio-octeon.o
 obj-$(CONFIG_MDIO_SUN4I)	+= mdio-sun4i.o
 obj-$(CONFIG_MDIO_THUNDER)	+= mdio-thunder.o
diff --git a/drivers/net/phy/mdio-mscc-miim.c b/drivers/net/phy/mdio-mscc-miim.c
new file mode 100644
index 000000000000..8c689ccfdbca
--- /dev/null
+++ b/drivers/net/phy/mdio-mscc-miim.c
@@ -0,0 +1,197 @@
+// SPDX-License-Identifier: (GPL-2.0 OR MIT)
+/*
+ * Driver for the MDIO interface of Microsemi network switches.
+ *
+ * Author: Alexandre Belloni <alexandre.belloni@bootlin.com>
+ * Copyright (c) 2017 Microsemi Corporation
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/phy.h>
+#include <linux/platform_device.h>
+#include <linux/bitops.h>
+#include <linux/io.h>
+#include <linux/iopoll.h>
+#include <linux/of_mdio.h>
+
+#define MSCC_MIIM_REG_STATUS		0x0
+#define		MSCC_MIIM_STATUS_STAT_BUSY	BIT(3)
+#define MSCC_MIIM_REG_CMD		0x8
+#define		MSCC_MIIM_CMD_OPR_WRITE		BIT(1)
+#define		MSCC_MIIM_CMD_OPR_READ		BIT(2)
+#define		MSCC_MIIM_CMD_WRDATA_SHIFT	4
+#define		MSCC_MIIM_CMD_REGAD_SHIFT	20
+#define		MSCC_MIIM_CMD_PHYAD_SHIFT	25
+#define		MSCC_MIIM_CMD_VLD		BIT(31)
+#define MSCC_MIIM_REG_DATA		0xC
+#define		MSCC_MIIM_DATA_ERROR		(BIT(16) | BIT(17))
+
+#define MSCC_PHY_REG_PHY_CFG	0x0
+#define		PHY_CFG_PHY_ENA		(BIT(0) | BIT(1) | BIT(2) | BIT(3))
+#define		PHY_CFG_PHY_COMMON_RESET BIT(4)
+#define		PHY_CFG_PHY_RESET	(BIT(5) | BIT(6) | BIT(7) | BIT(8))
+#define MSCC_PHY_REG_PHY_STATUS	0x4
+
+struct mscc_miim_dev {
+	void __iomem *regs;
+	void __iomem *phy_regs;
+};
+
+static int mscc_miim_wait_ready(struct mii_bus *bus)
+{
+	struct mscc_miim_dev *miim = bus->priv;
+	u32 val;
+
+	readl_poll_timeout(miim->regs + MSCC_MIIM_REG_STATUS, val,
+			   !(val & MSCC_MIIM_STATUS_STAT_BUSY), 100, 250000);
+	if (val & MSCC_MIIM_STATUS_STAT_BUSY)
+		return -ETIMEDOUT;
+
+	return 0;
+}
+
+static int mscc_miim_read(struct mii_bus *bus, int mii_id, int regnum)
+{
+	struct mscc_miim_dev *miim = bus->priv;
+	u32 val;
+	int ret;
+
+	ret = mscc_miim_wait_ready(bus);
+	if (ret)
+		goto out;
+
+	writel(MSCC_MIIM_CMD_VLD | (mii_id << MSCC_MIIM_CMD_PHYAD_SHIFT) |
+	       (regnum << MSCC_MIIM_CMD_REGAD_SHIFT) | MSCC_MIIM_CMD_OPR_READ,
+	       miim->regs + MSCC_MIIM_REG_CMD);
+
+	ret = mscc_miim_wait_ready(bus);
+	if (ret)
+		goto out;
+
+	val = readl(miim->regs + MSCC_MIIM_REG_DATA);
+	if (val & MSCC_MIIM_DATA_ERROR) {
+		ret = -EIO;
+		goto out;
+	}
+
+	ret = val & 0xFFFF;
+out:
+	return ret;
+}
+
+static int mscc_miim_write(struct mii_bus *bus, int mii_id,
+			   int regnum, u16 value)
+{
+	struct mscc_miim_dev *miim = bus->priv;
+	int ret;
+
+	ret = mscc_miim_wait_ready(bus);
+	if (ret < 0)
+		goto out;
+
+	writel(MSCC_MIIM_CMD_VLD | (mii_id << MSCC_MIIM_CMD_PHYAD_SHIFT) |
+	       (regnum << MSCC_MIIM_CMD_REGAD_SHIFT) |
+	       (value << MSCC_MIIM_CMD_WRDATA_SHIFT) |
+	       MSCC_MIIM_CMD_OPR_WRITE,
+	       miim->regs + MSCC_MIIM_REG_CMD);
+
+out:
+	return ret;
+}
+
+static int mscc_miim_reset(struct mii_bus *bus)
+{
+	struct mscc_miim_dev *miim = bus->priv;
+
+	if (miim->phy_regs) {
+		writel(0, miim->phy_regs + MSCC_PHY_REG_PHY_CFG);
+		writel(0x1ff, miim->phy_regs + MSCC_PHY_REG_PHY_CFG);
+		mdelay(500);
+	}
+
+	return 0;
+}
+
+static int mscc_miim_probe(struct platform_device *pdev)
+{
+	struct resource *res;
+	struct mii_bus *bus;
+	struct mscc_miim_dev *dev;
+	int ret;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENODEV;
+
+	bus = devm_mdiobus_alloc_size(&pdev->dev, sizeof(*dev));
+	if (!bus)
+		return -ENOMEM;
+
+	bus->name = "mscc_miim";
+	bus->read = mscc_miim_read;
+	bus->write = mscc_miim_write;
+	bus->reset = mscc_miim_reset;
+	snprintf(bus->id, MII_BUS_ID_SIZE, "%s-mii", dev_name(&pdev->dev));
+	bus->parent = &pdev->dev;
+
+	dev = bus->priv;
+	dev->regs = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(dev->regs)) {
+		dev_err(&pdev->dev, "Unable to map MIIM registers\n");
+		return PTR_ERR(dev->regs);
+	}
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+	if (res) {
+		dev->phy_regs = devm_ioremap_resource(&pdev->dev, res);
+		if (IS_ERR(dev->phy_regs)) {
+			dev_err(&pdev->dev, "Unable to map internal phy registers\n");
+			return PTR_ERR(dev->phy_regs);
+		}
+	}
+
+	if (pdev->dev.of_node)
+		ret = of_mdiobus_register(bus, pdev->dev.of_node);
+	else
+		ret = mdiobus_register(bus);
+
+	if (ret < 0) {
+		dev_err(&pdev->dev, "Cannot register MDIO bus (%d)\n", ret);
+		return ret;
+	}
+
+	platform_set_drvdata(pdev, bus);
+
+	return 0;
+}
+
+static int mscc_miim_remove(struct platform_device *pdev)
+{
+	struct mii_bus *bus = platform_get_drvdata(pdev);
+
+	mdiobus_unregister(bus);
+
+	return 0;
+}
+
+static const struct of_device_id mscc_miim_match[] = {
+	{ .compatible = "mscc,ocelot-miim" },
+	{ }
+};
+MODULE_DEVICE_TABLE(of, mscc_miim_match);
+
+static struct platform_driver mscc_miim_driver = {
+	.probe = mscc_miim_probe,
+	.remove = mscc_miim_remove,
+	.driver = {
+		.name = "mscc-miim",
+		.of_match_table = mscc_miim_match,
+	},
+};
+
+module_platform_driver(mscc_miim_driver);
+
+MODULE_DESCRIPTION("Microsemi MIIM driver");
+MODULE_AUTHOR("Alexandre Belloni <alexandre.belloni@bootlin.com>");
+MODULE_LICENSE("Dual MIT/GPL");
-- 
cgit v1.2.3-59-g8ed1b


From 44b801e0f0970cd8ce124cb0e292108a455fac30 Mon Sep 17 00:00:00 2001
From: Alexandre Belloni <alexandre.belloni@bootlin.com>
Date: Mon, 14 May 2018 22:04:56 +0200
Subject: dt-bindings: net: add DT bindings for Microsemi Ocelot Switch

DT bindings for the Ethernet switch found on Microsemi Ocelot platforms.

Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../devicetree/bindings/net/mscc-ocelot.txt        | 82 ++++++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/mscc-ocelot.txt

diff --git a/Documentation/devicetree/bindings/net/mscc-ocelot.txt b/Documentation/devicetree/bindings/net/mscc-ocelot.txt
new file mode 100644
index 000000000000..0a84711abece
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/mscc-ocelot.txt
@@ -0,0 +1,82 @@
+Microsemi Ocelot network Switch
+===============================
+
+The Microsemi Ocelot network switch can be found on Microsemi SoCs (VSC7513,
+VSC7514)
+
+Required properties:
+- compatible: Should be "mscc,vsc7514-switch"
+- reg: Must contain an (offset, length) pair of the register set for each
+  entry in reg-names.
+- reg-names: Must include the following entries:
+  - "sys"
+  - "rew"
+  - "qs"
+  - "hsio"
+  - "qsys"
+  - "ana"
+  - "portX" with X from 0 to the number of last port index available on that
+    switch
+- interrupts: Should contain the switch interrupts for frame extraction and
+  frame injection
+- interrupt-names: should contain the interrupt names: "xtr", "inj"
+- ethernet-ports: A container for child nodes representing switch ports.
+
+The ethernet-ports container has the following properties
+
+Required properties:
+
+- #address-cells: Must be 1
+- #size-cells: Must be 0
+
+Each port node must have the following mandatory properties:
+- reg: Describes the port address in the switch
+
+Port nodes may also contain the following optional standardised
+properties, described in binding documents:
+
+- phy-handle: Phandle to a PHY on an MDIO bus. See
+  Documentation/devicetree/bindings/net/ethernet.txt for details.
+
+Example:
+
+	switch@1010000 {
+		compatible = "mscc,vsc7514-switch";
+		reg = <0x1010000 0x10000>,
+		      <0x1030000 0x10000>,
+		      <0x1080000 0x100>,
+		      <0x10d0000 0x10000>,
+		      <0x11e0000 0x100>,
+		      <0x11f0000 0x100>,
+		      <0x1200000 0x100>,
+		      <0x1210000 0x100>,
+		      <0x1220000 0x100>,
+		      <0x1230000 0x100>,
+		      <0x1240000 0x100>,
+		      <0x1250000 0x100>,
+		      <0x1260000 0x100>,
+		      <0x1270000 0x100>,
+		      <0x1280000 0x100>,
+		      <0x1800000 0x80000>,
+		      <0x1880000 0x10000>;
+		reg-names = "sys", "rew", "qs", "hsio", "port0",
+			    "port1", "port2", "port3", "port4", "port5",
+			    "port6", "port7", "port8", "port9", "port10",
+			    "qsys", "ana";
+		interrupts = <21 22>;
+		interrupt-names = "xtr", "inj";
+
+		ethernet-ports {
+			#address-cells = <1>;
+			#size-cells = <0>;
+
+			port0: port@0 {
+				reg = <0>;
+				phy-handle = <&phy0>;
+			};
+			port1: port@1 {
+				reg = <1>;
+				phy-handle = <&phy1>;
+			};
+		};
+	};
-- 
cgit v1.2.3-59-g8ed1b


From a556c76adc052c979ef9e80f0cd3fa1379ff4943 Mon Sep 17 00:00:00 2001
From: Alexandre Belloni <alexandre.belloni@bootlin.com>
Date: Mon, 14 May 2018 22:04:57 +0200
Subject: net: mscc: Add initial Ocelot switch support

Add a driver for Microsemi Ocelot Ethernet switch support.

This makes two modules:
mscc_ocelot_common handles all the common features that doesn't depend on
how the switch is integrated in the SoC. Currently, it handles offloading
bridging to the hardware. ocelot_io.c handles register accesses. This is
unfortunately needed because the register layout is packed and then depends
on the number of ports available on the switch. The register definition
files are automatically generated.

ocelot_board handles the switch integration on the SoC and on the board.

Frame injection and extraction to/from the CPU port is currently done using
register accesses which is quite slow. DMA is possible but the port is not
able to absorb the whole switch bandwidth.

Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/Kconfig                |    1 +
 drivers/net/ethernet/Makefile               |    1 +
 drivers/net/ethernet/mscc/Kconfig           |   30 +
 drivers/net/ethernet/mscc/Makefile          |    5 +
 drivers/net/ethernet/mscc/ocelot.c          | 1333 +++++++++++++++++++++++++++
 drivers/net/ethernet/mscc/ocelot.h          |  572 ++++++++++++
 drivers/net/ethernet/mscc/ocelot_ana.h      |  625 +++++++++++++
 drivers/net/ethernet/mscc/ocelot_board.c    |  316 +++++++
 drivers/net/ethernet/mscc/ocelot_dev.h      |  275 ++++++
 drivers/net/ethernet/mscc/ocelot_dev_gmii.h |  154 ++++
 drivers/net/ethernet/mscc/ocelot_hsio.h     |  785 ++++++++++++++++
 drivers/net/ethernet/mscc/ocelot_io.c       |  116 +++
 drivers/net/ethernet/mscc/ocelot_qs.h       |   78 ++
 drivers/net/ethernet/mscc/ocelot_qsys.h     |  270 ++++++
 drivers/net/ethernet/mscc/ocelot_regs.c     |  497 ++++++++++
 drivers/net/ethernet/mscc/ocelot_rew.h      |   81 ++
 drivers/net/ethernet/mscc/ocelot_sys.h      |  144 +++
 17 files changed, 5283 insertions(+)
 create mode 100644 drivers/net/ethernet/mscc/Kconfig
 create mode 100644 drivers/net/ethernet/mscc/Makefile
 create mode 100644 drivers/net/ethernet/mscc/ocelot.c
 create mode 100644 drivers/net/ethernet/mscc/ocelot.h
 create mode 100644 drivers/net/ethernet/mscc/ocelot_ana.h
 create mode 100644 drivers/net/ethernet/mscc/ocelot_board.c
 create mode 100644 drivers/net/ethernet/mscc/ocelot_dev.h
 create mode 100644 drivers/net/ethernet/mscc/ocelot_dev_gmii.h
 create mode 100644 drivers/net/ethernet/mscc/ocelot_hsio.h
 create mode 100644 drivers/net/ethernet/mscc/ocelot_io.c
 create mode 100644 drivers/net/ethernet/mscc/ocelot_qs.h
 create mode 100644 drivers/net/ethernet/mscc/ocelot_qsys.h
 create mode 100644 drivers/net/ethernet/mscc/ocelot_regs.c
 create mode 100644 drivers/net/ethernet/mscc/ocelot_rew.h
 create mode 100644 drivers/net/ethernet/mscc/ocelot_sys.h

diff --git a/drivers/net/ethernet/Kconfig b/drivers/net/ethernet/Kconfig
index 603a5704dab8..54d71e1c48d5 100644
--- a/drivers/net/ethernet/Kconfig
+++ b/drivers/net/ethernet/Kconfig
@@ -114,6 +114,7 @@ source "drivers/net/ethernet/mediatek/Kconfig"
 source "drivers/net/ethernet/mellanox/Kconfig"
 source "drivers/net/ethernet/micrel/Kconfig"
 source "drivers/net/ethernet/microchip/Kconfig"
+source "drivers/net/ethernet/mscc/Kconfig"
 source "drivers/net/ethernet/moxa/Kconfig"
 source "drivers/net/ethernet/myricom/Kconfig"
 
diff --git a/drivers/net/ethernet/Makefile b/drivers/net/ethernet/Makefile
index 2bfd2eea50bf..8fbfe9ce2fa5 100644
--- a/drivers/net/ethernet/Makefile
+++ b/drivers/net/ethernet/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_NET_VENDOR_MEDIATEK) += mediatek/
 obj-$(CONFIG_NET_VENDOR_MELLANOX) += mellanox/
 obj-$(CONFIG_NET_VENDOR_MICREL) += micrel/
 obj-$(CONFIG_NET_VENDOR_MICROCHIP) += microchip/
+obj-$(CONFIG_NET_VENDOR_MICROSEMI) += mscc/
 obj-$(CONFIG_NET_VENDOR_MOXART) += moxa/
 obj-$(CONFIG_NET_VENDOR_MYRI) += myricom/
 obj-$(CONFIG_FEALNX) += fealnx.o
diff --git a/drivers/net/ethernet/mscc/Kconfig b/drivers/net/ethernet/mscc/Kconfig
new file mode 100644
index 000000000000..36c84625d54e
--- /dev/null
+++ b/drivers/net/ethernet/mscc/Kconfig
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: (GPL-2.0 OR MIT)
+config NET_VENDOR_MICROSEMI
+	bool "Microsemi devices"
+	default y
+	help
+	  If you have a network (Ethernet) card belonging to this class, say Y.
+
+	  Note that the answer to this question doesn't directly affect the
+	  kernel: saying N will just cause the configurator to skip all
+	  the questions about Microsemi devices.
+
+if NET_VENDOR_MICROSEMI
+
+config MSCC_OCELOT_SWITCH
+	tristate "Ocelot switch driver"
+	depends on NET_SWITCHDEV
+	depends on HAS_IOMEM
+	select PHYLIB
+	select REGMAP_MMIO
+	help
+	  This driver supports the Ocelot network switch device.
+
+config MSCC_OCELOT_SWITCH_OCELOT
+	tristate "Ocelot switch driver on Ocelot"
+	depends on MSCC_OCELOT_SWITCH
+	help
+	  This driver supports the Ocelot network switch device as present on
+	  the Ocelot SoCs.
+
+endif # NET_VENDOR_MICROSEMI
diff --git a/drivers/net/ethernet/mscc/Makefile b/drivers/net/ethernet/mscc/Makefile
new file mode 100644
index 000000000000..cb52a3b128ae
--- /dev/null
+++ b/drivers/net/ethernet/mscc/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: (GPL-2.0 OR MIT)
+obj-$(CONFIG_MSCC_OCELOT_SWITCH) += mscc_ocelot_common.o
+mscc_ocelot_common-y := ocelot.o ocelot_io.o
+mscc_ocelot_common-y += ocelot_regs.o
+obj-$(CONFIG_MSCC_OCELOT_SWITCH_OCELOT) += ocelot_board.o
diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
new file mode 100644
index 000000000000..c8c74aa548d9
--- /dev/null
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -0,0 +1,1333 @@
+// SPDX-License-Identifier: (GPL-2.0 OR MIT)
+/*
+ * Microsemi Ocelot Switch driver
+ *
+ * Copyright (c) 2017 Microsemi Corporation
+ */
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/if_bridge.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/phy.h>
+#include <linux/skbuff.h>
+#include <net/arp.h>
+#include <net/netevent.h>
+#include <net/rtnetlink.h>
+#include <net/switchdev.h>
+
+#include "ocelot.h"
+
+/* MAC table entry types.
+ * ENTRYTYPE_NORMAL is subject to aging.
+ * ENTRYTYPE_LOCKED is not subject to aging.
+ * ENTRYTYPE_MACv4 is not subject to aging. For IPv4 multicast.
+ * ENTRYTYPE_MACv6 is not subject to aging. For IPv6 multicast.
+ */
+enum macaccess_entry_type {
+	ENTRYTYPE_NORMAL = 0,
+	ENTRYTYPE_LOCKED,
+	ENTRYTYPE_MACv4,
+	ENTRYTYPE_MACv6,
+};
+
+struct ocelot_mact_entry {
+	u8 mac[ETH_ALEN];
+	u16 vid;
+	enum macaccess_entry_type type;
+};
+
+static inline int ocelot_mact_wait_for_completion(struct ocelot *ocelot)
+{
+	unsigned int val, timeout = 10;
+
+	/* Wait for the issued mac table command to be completed, or timeout.
+	 * When the command read from  ANA_TABLES_MACACCESS is
+	 * MACACCESS_CMD_IDLE, the issued command completed successfully.
+	 */
+	do {
+		val = ocelot_read(ocelot, ANA_TABLES_MACACCESS);
+		val &= ANA_TABLES_MACACCESS_MAC_TABLE_CMD_M;
+	} while (val != MACACCESS_CMD_IDLE && timeout--);
+
+	if (!timeout)
+		return -ETIMEDOUT;
+
+	return 0;
+}
+
+static void ocelot_mact_select(struct ocelot *ocelot,
+			       const unsigned char mac[ETH_ALEN],
+			       unsigned int vid)
+{
+	u32 macl = 0, mach = 0;
+
+	/* Set the MAC address to handle and the vlan associated in a format
+	 * understood by the hardware.
+	 */
+	mach |= vid    << 16;
+	mach |= mac[0] << 8;
+	mach |= mac[1] << 0;
+	macl |= mac[2] << 24;
+	macl |= mac[3] << 16;
+	macl |= mac[4] << 8;
+	macl |= mac[5] << 0;
+
+	ocelot_write(ocelot, macl, ANA_TABLES_MACLDATA);
+	ocelot_write(ocelot, mach, ANA_TABLES_MACHDATA);
+
+}
+
+static int ocelot_mact_learn(struct ocelot *ocelot, int port,
+			     const unsigned char mac[ETH_ALEN],
+			     unsigned int vid,
+			     enum macaccess_entry_type type)
+{
+	ocelot_mact_select(ocelot, mac, vid);
+
+	/* Issue a write command */
+	ocelot_write(ocelot, ANA_TABLES_MACACCESS_VALID |
+			     ANA_TABLES_MACACCESS_DEST_IDX(port) |
+			     ANA_TABLES_MACACCESS_ENTRYTYPE(type) |
+			     ANA_TABLES_MACACCESS_MAC_TABLE_CMD(MACACCESS_CMD_LEARN),
+			     ANA_TABLES_MACACCESS);
+
+	return ocelot_mact_wait_for_completion(ocelot);
+}
+
+static int ocelot_mact_forget(struct ocelot *ocelot,
+			      const unsigned char mac[ETH_ALEN],
+			      unsigned int vid)
+{
+	ocelot_mact_select(ocelot, mac, vid);
+
+	/* Issue a forget command */
+	ocelot_write(ocelot,
+		     ANA_TABLES_MACACCESS_MAC_TABLE_CMD(MACACCESS_CMD_FORGET),
+		     ANA_TABLES_MACACCESS);
+
+	return ocelot_mact_wait_for_completion(ocelot);
+}
+
+static void ocelot_mact_init(struct ocelot *ocelot)
+{
+	/* Configure the learning mode entries attributes:
+	 * - Do not copy the frame to the CPU extraction queues.
+	 * - Use the vlan and mac_cpoy for dmac lookup.
+	 */
+	ocelot_rmw(ocelot, 0,
+		   ANA_AGENCTRL_LEARN_CPU_COPY | ANA_AGENCTRL_IGNORE_DMAC_FLAGS
+		   | ANA_AGENCTRL_LEARN_FWD_KILL
+		   | ANA_AGENCTRL_LEARN_IGNORE_VLAN,
+		   ANA_AGENCTRL);
+
+	/* Clear the MAC table */
+	ocelot_write(ocelot, MACACCESS_CMD_INIT, ANA_TABLES_MACACCESS);
+}
+
+static inline int ocelot_vlant_wait_for_completion(struct ocelot *ocelot)
+{
+	unsigned int val, timeout = 10;
+
+	/* Wait for the issued mac table command to be completed, or timeout.
+	 * When the command read from ANA_TABLES_MACACCESS is
+	 * MACACCESS_CMD_IDLE, the issued command completed successfully.
+	 */
+	do {
+		val = ocelot_read(ocelot, ANA_TABLES_VLANACCESS);
+		val &= ANA_TABLES_VLANACCESS_VLAN_TBL_CMD_M;
+	} while (val != ANA_TABLES_VLANACCESS_CMD_IDLE && timeout--);
+
+	if (!timeout)
+		return -ETIMEDOUT;
+
+	return 0;
+}
+
+static void ocelot_vlan_init(struct ocelot *ocelot)
+{
+	/* Clear VLAN table, by default all ports are members of all VLANs */
+	ocelot_write(ocelot, ANA_TABLES_VLANACCESS_CMD_INIT,
+		     ANA_TABLES_VLANACCESS);
+	ocelot_vlant_wait_for_completion(ocelot);
+}
+
+/* Watermark encode
+ * Bit 8:   Unit; 0:1, 1:16
+ * Bit 7-0: Value to be multiplied with unit
+ */
+static u16 ocelot_wm_enc(u16 value)
+{
+	if (value >= BIT(8))
+		return BIT(8) | (value / 16);
+
+	return value;
+}
+
+static void ocelot_port_adjust_link(struct net_device *dev)
+{
+	struct ocelot_port *port = netdev_priv(dev);
+	struct ocelot *ocelot = port->ocelot;
+	u8 p = port->chip_port;
+	int speed, atop_wm, mode = 0;
+
+	switch (dev->phydev->speed) {
+	case SPEED_10:
+		speed = OCELOT_SPEED_10;
+		break;
+	case SPEED_100:
+		speed = OCELOT_SPEED_100;
+		break;
+	case SPEED_1000:
+		speed = OCELOT_SPEED_1000;
+		mode = DEV_MAC_MODE_CFG_GIGA_MODE_ENA;
+		break;
+	case SPEED_2500:
+		speed = OCELOT_SPEED_2500;
+		mode = DEV_MAC_MODE_CFG_GIGA_MODE_ENA;
+		break;
+	default:
+		netdev_err(dev, "Unsupported PHY speed: %d\n",
+			   dev->phydev->speed);
+		return;
+	}
+
+	phy_print_status(dev->phydev);
+
+	if (!dev->phydev->link)
+		return;
+
+	/* Only full duplex supported for now */
+	ocelot_port_writel(port, DEV_MAC_MODE_CFG_FDX_ENA |
+			   mode, DEV_MAC_MODE_CFG);
+
+	/* Set MAC IFG Gaps
+	 * FDX: TX_IFG = 5, RX_IFG1 = RX_IFG2 = 0
+	 * !FDX: TX_IFG = 5, RX_IFG1 = RX_IFG2 = 5
+	 */
+	ocelot_port_writel(port, DEV_MAC_IFG_CFG_TX_IFG(5), DEV_MAC_IFG_CFG);
+
+	/* Load seed (0) and set MAC HDX late collision  */
+	ocelot_port_writel(port, DEV_MAC_HDX_CFG_LATE_COL_POS(67) |
+			   DEV_MAC_HDX_CFG_SEED_LOAD,
+			   DEV_MAC_HDX_CFG);
+	mdelay(1);
+	ocelot_port_writel(port, DEV_MAC_HDX_CFG_LATE_COL_POS(67),
+			   DEV_MAC_HDX_CFG);
+
+	/* Disable HDX fast control */
+	ocelot_port_writel(port, DEV_PORT_MISC_HDX_FAST_DIS, DEV_PORT_MISC);
+
+	/* SGMII only for now */
+	ocelot_port_writel(port, PCS1G_MODE_CFG_SGMII_MODE_ENA, PCS1G_MODE_CFG);
+	ocelot_port_writel(port, PCS1G_SD_CFG_SD_SEL, PCS1G_SD_CFG);
+
+	/* Enable PCS */
+	ocelot_port_writel(port, PCS1G_CFG_PCS_ENA, PCS1G_CFG);
+
+	/* No aneg on SGMII */
+	ocelot_port_writel(port, 0, PCS1G_ANEG_CFG);
+
+	/* No loopback */
+	ocelot_port_writel(port, 0, PCS1G_LB_CFG);
+
+	/* Set Max Length and maximum tags allowed */
+	ocelot_port_writel(port, VLAN_ETH_FRAME_LEN, DEV_MAC_MAXLEN_CFG);
+	ocelot_port_writel(port, DEV_MAC_TAGS_CFG_TAG_ID(ETH_P_8021AD) |
+			   DEV_MAC_TAGS_CFG_VLAN_AWR_ENA |
+			   DEV_MAC_TAGS_CFG_VLAN_LEN_AWR_ENA,
+			   DEV_MAC_TAGS_CFG);
+
+	/* Enable MAC module */
+	ocelot_port_writel(port, DEV_MAC_ENA_CFG_RX_ENA |
+			   DEV_MAC_ENA_CFG_TX_ENA, DEV_MAC_ENA_CFG);
+
+	/* Take MAC, Port, Phy (intern) and PCS (SGMII/Serdes) clock out of
+	 * reset */
+	ocelot_port_writel(port, DEV_CLOCK_CFG_LINK_SPEED(speed),
+			   DEV_CLOCK_CFG);
+
+	/* Set SMAC of Pause frame (00:00:00:00:00:00) */
+	ocelot_port_writel(port, 0, DEV_MAC_FC_MAC_HIGH_CFG);
+	ocelot_port_writel(port, 0, DEV_MAC_FC_MAC_LOW_CFG);
+
+	/* No PFC */
+	ocelot_write_gix(ocelot, ANA_PFC_PFC_CFG_FC_LINK_SPEED(speed),
+			 ANA_PFC_PFC_CFG, p);
+
+	/* Set Pause WM hysteresis
+	 * 152 = 6 * VLAN_ETH_FRAME_LEN / OCELOT_BUFFER_CELL_SZ
+	 * 101 = 4 * VLAN_ETH_FRAME_LEN / OCELOT_BUFFER_CELL_SZ
+	 */
+	ocelot_write_rix(ocelot, SYS_PAUSE_CFG_PAUSE_ENA |
+			 SYS_PAUSE_CFG_PAUSE_STOP(101) |
+			 SYS_PAUSE_CFG_PAUSE_START(152), SYS_PAUSE_CFG, p);
+
+	/* Core: Enable port for frame transfer */
+	ocelot_write_rix(ocelot, QSYS_SWITCH_PORT_MODE_INGRESS_DROP_MODE |
+			 QSYS_SWITCH_PORT_MODE_SCH_NEXT_CFG(1) |
+			 QSYS_SWITCH_PORT_MODE_PORT_ENA,
+			 QSYS_SWITCH_PORT_MODE, p);
+
+	/* Flow control */
+	ocelot_write_rix(ocelot, SYS_MAC_FC_CFG_PAUSE_VAL_CFG(0xffff) |
+			 SYS_MAC_FC_CFG_RX_FC_ENA | SYS_MAC_FC_CFG_TX_FC_ENA |
+			 SYS_MAC_FC_CFG_ZERO_PAUSE_ENA |
+			 SYS_MAC_FC_CFG_FC_LATENCY_CFG(0x7) |
+			 SYS_MAC_FC_CFG_FC_LINK_SPEED(speed),
+			 SYS_MAC_FC_CFG, p);
+	ocelot_write_rix(ocelot, 0, ANA_POL_FLOWC, p);
+
+	/* Tail dropping watermark */
+	atop_wm = (ocelot->shared_queue_sz - 9 * VLAN_ETH_FRAME_LEN) / OCELOT_BUFFER_CELL_SZ;
+	ocelot_write_rix(ocelot, ocelot_wm_enc(9 * VLAN_ETH_FRAME_LEN),
+			 SYS_ATOP, p);
+	ocelot_write(ocelot, ocelot_wm_enc(atop_wm), SYS_ATOP_TOT_CFG);
+}
+
+static int ocelot_port_open(struct net_device *dev)
+{
+	struct ocelot_port *port = netdev_priv(dev);
+	struct ocelot *ocelot = port->ocelot;
+	int err;
+
+	/* Enable receiving frames on the port, and activate auto-learning of
+	 * MAC addresses.
+	 */
+	ocelot_write_gix(ocelot, ANA_PORT_PORT_CFG_LEARNAUTO |
+			 ANA_PORT_PORT_CFG_RECV_ENA |
+			 ANA_PORT_PORT_CFG_PORTID_VAL(port->chip_port),
+			 ANA_PORT_PORT_CFG, port->chip_port);
+
+	err = phy_connect_direct(dev, port->phy, &ocelot_port_adjust_link,
+				 PHY_INTERFACE_MODE_NA);
+	if (err) {
+		netdev_err(dev, "Could not attach to PHY\n");
+		return err;
+	}
+
+	dev->phydev = port->phy;
+
+	phy_attached_info(port->phy);
+	phy_start(port->phy);
+	return 0;
+}
+
+static int ocelot_port_stop(struct net_device *dev)
+{
+	struct ocelot_port *port = netdev_priv(dev);
+
+	phy_disconnect(port->phy);
+
+	dev->phydev = NULL;
+
+	ocelot_port_writel(port, 0, DEV_MAC_ENA_CFG);
+	ocelot_rmw_rix(port->ocelot, 0, QSYS_SWITCH_PORT_MODE_PORT_ENA,
+			 QSYS_SWITCH_PORT_MODE, port->chip_port);
+	return 0;
+}
+
+/* Generate the IFH for frame injection
+ *
+ * The IFH is a 128bit-value
+ * bit 127: bypass the analyzer processing
+ * bit 56-67: destination mask
+ * bit 28-29: pop_cnt: 3 disables all rewriting of the frame
+ * bit 20-27: cpu extraction queue mask
+ * bit 16: tag type 0: C-tag, 1: S-tag
+ * bit 0-11: VID
+ */
+static int ocelot_gen_ifh(u32 *ifh, struct frame_info *info)
+{
+	ifh[0] = IFH_INJ_BYPASS;
+	ifh[1] = (0xff00 & info->port) >> 8;
+	ifh[2] = (0xff & info->port) << 24;
+	ifh[3] = IFH_INJ_POP_CNT_DISABLE | (info->cpuq << 20) |
+		 (info->tag_type << 16) | info->vid;
+
+	return 0;
+}
+
+static int ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct ocelot_port *port = netdev_priv(dev);
+	struct ocelot *ocelot = port->ocelot;
+	u32 val, ifh[IFH_LEN];
+	struct frame_info info = {};
+	u8 grp = 0; /* Send everything on CPU group 0 */
+	unsigned int i, count, last;
+
+	val = ocelot_read(ocelot, QS_INJ_STATUS);
+	if (!(val & QS_INJ_STATUS_FIFO_RDY(BIT(grp))) ||
+	    (val & QS_INJ_STATUS_WMARK_REACHED(BIT(grp))))
+		return NETDEV_TX_BUSY;
+
+	ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(1) |
+			 QS_INJ_CTRL_SOF, QS_INJ_CTRL, grp);
+
+	info.port = BIT(port->chip_port);
+	info.cpuq = 0xff;
+	ocelot_gen_ifh(ifh, &info);
+
+	for (i = 0; i < IFH_LEN; i++)
+		ocelot_write_rix(ocelot, ifh[i], QS_INJ_WR, grp);
+
+	count = (skb->len + 3) / 4;
+	last = skb->len % 4;
+	for (i = 0; i < count; i++) {
+		ocelot_write_rix(ocelot, ((u32 *)skb->data)[i], QS_INJ_WR, grp);
+	}
+
+	/* Add padding */
+	while (i < (OCELOT_BUFFER_CELL_SZ / 4)) {
+		ocelot_write_rix(ocelot, 0, QS_INJ_WR, grp);
+		i++;
+	}
+
+	/* Indicate EOF and valid bytes in last word */
+	ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(1) |
+			 QS_INJ_CTRL_VLD_BYTES(skb->len < OCELOT_BUFFER_CELL_SZ ? 0 : last) |
+			 QS_INJ_CTRL_EOF,
+			 QS_INJ_CTRL, grp);
+
+	/* Add dummy CRC */
+	ocelot_write_rix(ocelot, 0, QS_INJ_WR, grp);
+	skb_tx_timestamp(skb);
+
+	dev->stats.tx_packets++;
+	dev->stats.tx_bytes += skb->len;
+	dev_kfree_skb_any(skb);
+
+	return NETDEV_TX_OK;
+}
+
+static void ocelot_mact_mc_reset(struct ocelot_port *port)
+{
+	struct ocelot *ocelot = port->ocelot;
+	struct netdev_hw_addr *ha, *n;
+
+	/* Free and forget all the MAC addresses stored in the port private mc
+	 * list. These are mc addresses that were previously added by calling
+	 * ocelot_mact_mc_add().
+	 */
+	list_for_each_entry_safe(ha, n, &port->mc, list) {
+		ocelot_mact_forget(ocelot, ha->addr, port->pvid);
+		list_del(&ha->list);
+		kfree(ha);
+	}
+}
+
+static int ocelot_mact_mc_add(struct ocelot_port *port,
+			      struct netdev_hw_addr *hw_addr)
+{
+	struct ocelot *ocelot = port->ocelot;
+	struct netdev_hw_addr *ha = kzalloc(sizeof(*ha), GFP_KERNEL);
+
+	if (!ha)
+		return -ENOMEM;
+
+	memcpy(ha, hw_addr, sizeof(*ha));
+	list_add_tail(&ha->list, &port->mc);
+
+	ocelot_mact_learn(ocelot, PGID_CPU, ha->addr, port->pvid,
+			  ENTRYTYPE_LOCKED);
+
+	return 0;
+}
+
+static void ocelot_set_rx_mode(struct net_device *dev)
+{
+	struct ocelot_port *port = netdev_priv(dev);
+	struct ocelot *ocelot = port->ocelot;
+	struct netdev_hw_addr *ha;
+	int i;
+	u32 val;
+
+	/* This doesn't handle promiscuous mode because the bridge core is
+	 * setting IFF_PROMISC on all slave interfaces and all frames would be
+	 * forwarded to the CPU port.
+	 */
+	val = GENMASK(ocelot->num_phys_ports - 1, 0);
+	for (i = ocelot->num_phys_ports + 1; i < PGID_CPU; i++)
+		ocelot_write_rix(ocelot, val, ANA_PGID_PGID, i);
+
+	/* Handle the device multicast addresses. First remove all the
+	 * previously installed addresses and then add the latest ones to the
+	 * mac table.
+	 */
+	ocelot_mact_mc_reset(port);
+	netdev_for_each_mc_addr(ha, dev)
+		ocelot_mact_mc_add(port, ha);
+}
+
+static int ocelot_port_get_phys_port_name(struct net_device *dev,
+					  char *buf, size_t len)
+{
+	struct ocelot_port *port = netdev_priv(dev);
+	int ret;
+
+	ret = snprintf(buf, len, "p%d", port->chip_port);
+	if (ret >= len)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int ocelot_port_set_mac_address(struct net_device *dev, void *p)
+{
+	struct ocelot_port *port = netdev_priv(dev);
+	struct ocelot *ocelot = port->ocelot;
+	const struct sockaddr *addr = p;
+
+	/* Learn the new net device MAC address in the mac table. */
+	ocelot_mact_learn(ocelot, PGID_CPU, addr->sa_data, port->pvid,
+			  ENTRYTYPE_LOCKED);
+	/* Then forget the previous one. */
+	ocelot_mact_forget(ocelot, dev->dev_addr, port->pvid);
+
+	ether_addr_copy(dev->dev_addr, addr->sa_data);
+	return 0;
+}
+
+static void ocelot_get_stats64(struct net_device *dev,
+			       struct rtnl_link_stats64 *stats)
+{
+	struct ocelot_port *port = netdev_priv(dev);
+	struct ocelot *ocelot = port->ocelot;
+
+	/* Configure the port to read the stats from */
+	ocelot_write(ocelot, SYS_STAT_CFG_STAT_VIEW(port->chip_port),
+		     SYS_STAT_CFG);
+
+	/* Get Rx stats */
+	stats->rx_bytes = ocelot_read(ocelot, SYS_COUNT_RX_OCTETS);
+	stats->rx_packets = ocelot_read(ocelot, SYS_COUNT_RX_SHORTS) +
+			    ocelot_read(ocelot, SYS_COUNT_RX_FRAGMENTS) +
+			    ocelot_read(ocelot, SYS_COUNT_RX_JABBERS) +
+			    ocelot_read(ocelot, SYS_COUNT_RX_LONGS) +
+			    ocelot_read(ocelot, SYS_COUNT_RX_64) +
+			    ocelot_read(ocelot, SYS_COUNT_RX_65_127) +
+			    ocelot_read(ocelot, SYS_COUNT_RX_128_255) +
+			    ocelot_read(ocelot, SYS_COUNT_RX_256_1023) +
+			    ocelot_read(ocelot, SYS_COUNT_RX_1024_1526) +
+			    ocelot_read(ocelot, SYS_COUNT_RX_1527_MAX);
+	stats->multicast = ocelot_read(ocelot, SYS_COUNT_RX_MULTICAST);
+	stats->rx_dropped = dev->stats.rx_dropped;
+
+	/* Get Tx stats */
+	stats->tx_bytes = ocelot_read(ocelot, SYS_COUNT_TX_OCTETS);
+	stats->tx_packets = ocelot_read(ocelot, SYS_COUNT_TX_64) +
+			    ocelot_read(ocelot, SYS_COUNT_TX_65_127) +
+			    ocelot_read(ocelot, SYS_COUNT_TX_128_511) +
+			    ocelot_read(ocelot, SYS_COUNT_TX_512_1023) +
+			    ocelot_read(ocelot, SYS_COUNT_TX_1024_1526) +
+			    ocelot_read(ocelot, SYS_COUNT_TX_1527_MAX);
+	stats->tx_dropped = ocelot_read(ocelot, SYS_COUNT_TX_DROPS) +
+			    ocelot_read(ocelot, SYS_COUNT_TX_AGING);
+	stats->collisions = ocelot_read(ocelot, SYS_COUNT_TX_COLLISION);
+}
+
+static int ocelot_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
+			  struct net_device *dev, const unsigned char *addr,
+			  u16 vid, u16 flags)
+{
+	struct ocelot_port *port = netdev_priv(dev);
+	struct ocelot *ocelot = port->ocelot;
+
+	return ocelot_mact_learn(ocelot, port->chip_port, addr, vid,
+				 ENTRYTYPE_NORMAL);
+}
+
+static int ocelot_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
+			  struct net_device *dev,
+			  const unsigned char *addr, u16 vid)
+{
+	struct ocelot_port *port = netdev_priv(dev);
+	struct ocelot *ocelot = port->ocelot;
+
+	return ocelot_mact_forget(ocelot, addr, vid);
+}
+
+struct ocelot_dump_ctx {
+	struct net_device *dev;
+	struct sk_buff *skb;
+	struct netlink_callback *cb;
+	int idx;
+};
+
+static int ocelot_fdb_do_dump(struct ocelot_mact_entry *entry,
+			      struct ocelot_dump_ctx *dump)
+{
+	u32 portid = NETLINK_CB(dump->cb->skb).portid;
+	u32 seq = dump->cb->nlh->nlmsg_seq;
+	struct nlmsghdr *nlh;
+	struct ndmsg *ndm;
+
+	if (dump->idx < dump->cb->args[2])
+		goto skip;
+
+	nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH,
+			sizeof(*ndm), NLM_F_MULTI);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	ndm = nlmsg_data(nlh);
+	ndm->ndm_family  = AF_BRIDGE;
+	ndm->ndm_pad1    = 0;
+	ndm->ndm_pad2    = 0;
+	ndm->ndm_flags   = NTF_SELF;
+	ndm->ndm_type    = 0;
+	ndm->ndm_ifindex = dump->dev->ifindex;
+	ndm->ndm_state   = NUD_REACHABLE;
+
+	if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, entry->mac))
+		goto nla_put_failure;
+
+	if (entry->vid && nla_put_u16(dump->skb, NDA_VLAN, entry->vid))
+		goto nla_put_failure;
+
+	nlmsg_end(dump->skb, nlh);
+
+skip:
+	dump->idx++;
+	return 0;
+
+nla_put_failure:
+	nlmsg_cancel(dump->skb, nlh);
+	return -EMSGSIZE;
+}
+
+static inline int ocelot_mact_read(struct ocelot_port *port, int row, int col,
+				   struct ocelot_mact_entry *entry)
+{
+	struct ocelot *ocelot = port->ocelot;
+	char mac[ETH_ALEN];
+	u32 val, dst, macl, mach;
+
+	/* Set row and column to read from */
+	ocelot_field_write(ocelot, ANA_TABLES_MACTINDX_M_INDEX, row);
+	ocelot_field_write(ocelot, ANA_TABLES_MACTINDX_BUCKET, col);
+
+	/* Issue a read command */
+	ocelot_write(ocelot,
+		     ANA_TABLES_MACACCESS_MAC_TABLE_CMD(MACACCESS_CMD_READ),
+		     ANA_TABLES_MACACCESS);
+
+	if (ocelot_mact_wait_for_completion(ocelot))
+		return -ETIMEDOUT;
+
+	/* Read the entry flags */
+	val = ocelot_read(ocelot, ANA_TABLES_MACACCESS);
+	if (!(val & ANA_TABLES_MACACCESS_VALID))
+		return -EINVAL;
+
+	/* If the entry read has another port configured as its destination,
+	 * do not report it.
+	 */
+	dst = (val & ANA_TABLES_MACACCESS_DEST_IDX_M) >> 3;
+	if (dst != port->chip_port)
+		return -EINVAL;
+
+	/* Get the entry's MAC address and VLAN id */
+	macl = ocelot_read(ocelot, ANA_TABLES_MACLDATA);
+	mach = ocelot_read(ocelot, ANA_TABLES_MACHDATA);
+
+	mac[0] = (mach >> 8)  & 0xff;
+	mac[1] = (mach >> 0)  & 0xff;
+	mac[2] = (macl >> 24) & 0xff;
+	mac[3] = (macl >> 16) & 0xff;
+	mac[4] = (macl >> 8)  & 0xff;
+	mac[5] = (macl >> 0)  & 0xff;
+
+	entry->vid = (mach >> 16) & 0xfff;
+	ether_addr_copy(entry->mac, mac);
+
+	return 0;
+}
+
+static int ocelot_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
+			   struct net_device *dev,
+			   struct net_device *filter_dev, int *idx)
+{
+	struct ocelot_port *port = netdev_priv(dev);
+	int i, j, ret = 0;
+	struct ocelot_dump_ctx dump = {
+		.dev = dev,
+		.skb = skb,
+		.cb = cb,
+		.idx = *idx,
+	};
+
+	struct ocelot_mact_entry entry;
+
+	/* Loop through all the mac tables entries. There are 1024 rows of 4
+	 * entries.
+	 */
+	for (i = 0; i < 1024; i++) {
+		for (j = 0; j < 4; j++) {
+			ret = ocelot_mact_read(port, i, j, &entry);
+			/* If the entry is invalid (wrong port, invalid...),
+			 * skip it.
+			 */
+			if (ret == -EINVAL)
+				continue;
+			else if (ret)
+				goto end;
+
+			ret = ocelot_fdb_do_dump(&entry, &dump);
+			if (ret)
+				goto end;
+		}
+	}
+
+end:
+	*idx = dump.idx;
+	return ret;
+}
+
+static const struct net_device_ops ocelot_port_netdev_ops = {
+	.ndo_open			= ocelot_port_open,
+	.ndo_stop			= ocelot_port_stop,
+	.ndo_start_xmit			= ocelot_port_xmit,
+	.ndo_set_rx_mode		= ocelot_set_rx_mode,
+	.ndo_get_phys_port_name		= ocelot_port_get_phys_port_name,
+	.ndo_set_mac_address		= ocelot_port_set_mac_address,
+	.ndo_get_stats64		= ocelot_get_stats64,
+	.ndo_fdb_add			= ocelot_fdb_add,
+	.ndo_fdb_del			= ocelot_fdb_del,
+	.ndo_fdb_dump			= ocelot_fdb_dump,
+};
+
+static void ocelot_get_strings(struct net_device *netdev, u32 sset, u8 *data)
+{
+	struct ocelot_port *port = netdev_priv(netdev);
+	struct ocelot *ocelot = port->ocelot;
+	int i;
+
+	if (sset != ETH_SS_STATS)
+		return;
+
+	for (i = 0; i < ocelot->num_stats; i++)
+		memcpy(data + i * ETH_GSTRING_LEN, ocelot->stats_layout[i].name,
+		       ETH_GSTRING_LEN);
+}
+
+static void ocelot_check_stats(struct work_struct *work)
+{
+	struct delayed_work *del_work = to_delayed_work(work);
+	struct ocelot *ocelot = container_of(del_work, struct ocelot, stats_work);
+	int i, j;
+
+	mutex_lock(&ocelot->stats_lock);
+
+	for (i = 0; i < ocelot->num_phys_ports; i++) {
+		/* Configure the port to read the stats from */
+		ocelot_write(ocelot, SYS_STAT_CFG_STAT_VIEW(i), SYS_STAT_CFG);
+
+		for (j = 0; j < ocelot->num_stats; j++) {
+			u32 val;
+			unsigned int idx = i * ocelot->num_stats + j;
+
+			val = ocelot_read_rix(ocelot, SYS_COUNT_RX_OCTETS,
+					      ocelot->stats_layout[j].offset);
+
+			if (val < (ocelot->stats[idx] & U32_MAX))
+				ocelot->stats[idx] += (u64)1 << 32;
+
+			ocelot->stats[idx] = (ocelot->stats[idx] &
+					      ~(u64)U32_MAX) + val;
+		}
+	}
+
+	cancel_delayed_work(&ocelot->stats_work);
+	queue_delayed_work(ocelot->stats_queue, &ocelot->stats_work,
+			   OCELOT_STATS_CHECK_DELAY);
+
+	mutex_unlock(&ocelot->stats_lock);
+}
+
+static void ocelot_get_ethtool_stats(struct net_device *dev,
+				     struct ethtool_stats *stats, u64 *data)
+{
+	struct ocelot_port *port = netdev_priv(dev);
+	struct ocelot *ocelot = port->ocelot;
+	int i;
+
+	/* check and update now */
+	ocelot_check_stats(&ocelot->stats_work.work);
+
+	/* Copy all counters */
+	for (i = 0; i < ocelot->num_stats; i++)
+		*data++ = ocelot->stats[port->chip_port * ocelot->num_stats + i];
+}
+
+static int ocelot_get_sset_count(struct net_device *dev, int sset)
+{
+	struct ocelot_port *port = netdev_priv(dev);
+	struct ocelot *ocelot = port->ocelot;
+
+	if (sset != ETH_SS_STATS)
+		return -EOPNOTSUPP;
+	return ocelot->num_stats;
+}
+
+static const struct ethtool_ops ocelot_ethtool_ops = {
+	.get_strings		= ocelot_get_strings,
+	.get_ethtool_stats	= ocelot_get_ethtool_stats,
+	.get_sset_count		= ocelot_get_sset_count,
+};
+
+static int ocelot_port_attr_get(struct net_device *dev,
+				struct switchdev_attr *attr)
+{
+	struct ocelot_port *ocelot_port = netdev_priv(dev);
+	struct ocelot *ocelot = ocelot_port->ocelot;
+
+	switch (attr->id) {
+	case SWITCHDEV_ATTR_ID_PORT_PARENT_ID:
+		attr->u.ppid.id_len = sizeof(ocelot->base_mac);
+		memcpy(&attr->u.ppid.id, &ocelot->base_mac,
+		       attr->u.ppid.id_len);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static int ocelot_port_attr_stp_state_set(struct ocelot_port *ocelot_port,
+					  struct switchdev_trans *trans,
+					  u8 state)
+{
+	struct ocelot *ocelot = ocelot_port->ocelot;
+	u32 port_cfg;
+	int port, i;
+
+	if (switchdev_trans_ph_prepare(trans))
+		return 0;
+
+	if (!(BIT(ocelot_port->chip_port) & ocelot->bridge_mask))
+		return 0;
+
+	port_cfg = ocelot_read_gix(ocelot, ANA_PORT_PORT_CFG,
+				   ocelot_port->chip_port);
+
+	switch (state) {
+	case BR_STATE_FORWARDING:
+		ocelot->bridge_fwd_mask |= BIT(ocelot_port->chip_port);
+		/* Fallthrough */
+	case BR_STATE_LEARNING:
+		port_cfg |= ANA_PORT_PORT_CFG_LEARN_ENA;
+		break;
+
+	default:
+		port_cfg &= ~ANA_PORT_PORT_CFG_LEARN_ENA;
+		ocelot->bridge_fwd_mask &= ~BIT(ocelot_port->chip_port);
+		break;
+	}
+
+	ocelot_write_gix(ocelot, port_cfg, ANA_PORT_PORT_CFG,
+			 ocelot_port->chip_port);
+
+	/* Apply FWD mask. The loop is needed to add/remove the current port as
+	 * a source for the other ports.
+	 */
+	for (port = 0; port < ocelot->num_phys_ports; port++) {
+		if (ocelot->bridge_fwd_mask & BIT(port)) {
+			unsigned long mask = ocelot->bridge_fwd_mask & ~BIT(port);
+
+			for (i = 0; i < ocelot->num_phys_ports; i++) {
+				unsigned long bond_mask = ocelot->lags[i];
+
+				if (!bond_mask)
+					continue;
+
+				if (bond_mask & BIT(port)) {
+					mask &= ~bond_mask;
+					break;
+				}
+			}
+
+			ocelot_write_rix(ocelot,
+					 BIT(ocelot->num_phys_ports) | mask,
+					 ANA_PGID_PGID, PGID_SRC + port);
+		} else {
+			/* Only the CPU port, this is compatible with link
+			 * aggregation.
+			 */
+			ocelot_write_rix(ocelot,
+					 BIT(ocelot->num_phys_ports),
+					 ANA_PGID_PGID, PGID_SRC + port);
+		}
+	}
+
+	return 0;
+}
+
+static void ocelot_port_attr_ageing_set(struct ocelot_port *ocelot_port,
+					unsigned long ageing_clock_t)
+{
+	struct ocelot *ocelot = ocelot_port->ocelot;
+	unsigned long ageing_jiffies = clock_t_to_jiffies(ageing_clock_t);
+	u32 ageing_time = jiffies_to_msecs(ageing_jiffies) / 1000;
+
+	ocelot_write(ocelot, ANA_AUTOAGE_AGE_PERIOD(ageing_time / 2),
+		     ANA_AUTOAGE);
+}
+
+static void ocelot_port_attr_mc_set(struct ocelot_port *port, bool mc)
+{
+	struct ocelot *ocelot = port->ocelot;
+	u32 val = ocelot_read_gix(ocelot, ANA_PORT_CPU_FWD_CFG,
+				  port->chip_port);
+
+	if (mc)
+		val |= ANA_PORT_CPU_FWD_CFG_CPU_IGMP_REDIR_ENA |
+		       ANA_PORT_CPU_FWD_CFG_CPU_MLD_REDIR_ENA |
+		       ANA_PORT_CPU_FWD_CFG_CPU_IPMC_CTRL_COPY_ENA;
+	else
+		val &= ~(ANA_PORT_CPU_FWD_CFG_CPU_IGMP_REDIR_ENA |
+			 ANA_PORT_CPU_FWD_CFG_CPU_MLD_REDIR_ENA |
+			 ANA_PORT_CPU_FWD_CFG_CPU_IPMC_CTRL_COPY_ENA);
+
+	ocelot_write_gix(ocelot, val, ANA_PORT_CPU_FWD_CFG, port->chip_port);
+}
+
+static int ocelot_port_attr_set(struct net_device *dev,
+				const struct switchdev_attr *attr,
+				struct switchdev_trans *trans)
+{
+	struct ocelot_port *ocelot_port = netdev_priv(dev);
+	int err = 0;
+
+	switch (attr->id) {
+	case SWITCHDEV_ATTR_ID_PORT_STP_STATE:
+		ocelot_port_attr_stp_state_set(ocelot_port, trans,
+					       attr->u.stp_state);
+		break;
+	case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME:
+		ocelot_port_attr_ageing_set(ocelot_port, attr->u.ageing_time);
+		break;
+	case SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED:
+		ocelot_port_attr_mc_set(ocelot_port, !attr->u.mc_disabled);
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+
+	return err;
+}
+
+static struct ocelot_multicast *ocelot_multicast_get(struct ocelot *ocelot,
+						     const unsigned char *addr,
+						     u16 vid)
+{
+	struct ocelot_multicast *mc;
+
+	list_for_each_entry(mc, &ocelot->multicast, list) {
+		if (ether_addr_equal(mc->addr, addr) && mc->vid == vid)
+			return mc;
+	}
+
+	return NULL;
+}
+
+static int ocelot_port_obj_add_mdb(struct net_device *dev,
+				   const struct switchdev_obj_port_mdb *mdb,
+				   struct switchdev_trans *trans)
+{
+	struct ocelot_port *port = netdev_priv(dev);
+	struct ocelot *ocelot = port->ocelot;
+	struct ocelot_multicast *mc;
+	unsigned char addr[ETH_ALEN];
+	u16 vid = mdb->vid;
+	bool new = false;
+
+	if (!vid)
+		vid = 1;
+
+	mc = ocelot_multicast_get(ocelot, mdb->addr, vid);
+	if (!mc) {
+		mc = devm_kzalloc(ocelot->dev, sizeof(*mc), GFP_KERNEL);
+		if (!mc)
+			return -ENOMEM;
+
+		memcpy(mc->addr, mdb->addr, ETH_ALEN);
+		mc->vid = vid;
+
+		list_add_tail(&mc->list, &ocelot->multicast);
+		new = true;
+	}
+
+	memcpy(addr, mc->addr, ETH_ALEN);
+	addr[0] = 0;
+
+	if (!new) {
+		addr[2] = mc->ports << 0;
+		addr[1] = mc->ports << 8;
+		ocelot_mact_forget(ocelot, addr, vid);
+	}
+
+	mc->ports |= BIT(port->chip_port);
+	addr[2] = mc->ports << 0;
+	addr[1] = mc->ports << 8;
+
+	return ocelot_mact_learn(ocelot, 0, addr, vid, ENTRYTYPE_MACv4);
+}
+
+static int ocelot_port_obj_del_mdb(struct net_device *dev,
+				   const struct switchdev_obj_port_mdb *mdb)
+{
+	struct ocelot_port *port = netdev_priv(dev);
+	struct ocelot *ocelot = port->ocelot;
+	struct ocelot_multicast *mc;
+	unsigned char addr[ETH_ALEN];
+	u16 vid = mdb->vid;
+
+	if (!vid)
+		vid = 1;
+
+	mc = ocelot_multicast_get(ocelot, mdb->addr, vid);
+	if (!mc)
+		return -ENOENT;
+
+	memcpy(addr, mc->addr, ETH_ALEN);
+	addr[2] = mc->ports << 0;
+	addr[1] = mc->ports << 8;
+	addr[0] = 0;
+	ocelot_mact_forget(ocelot, addr, vid);
+
+	mc->ports &= ~BIT(port->chip_port);
+	if (!mc->ports) {
+		list_del(&mc->list);
+		devm_kfree(ocelot->dev, mc);
+		return 0;
+	}
+
+	addr[2] = mc->ports << 0;
+	addr[1] = mc->ports << 8;
+
+	return ocelot_mact_learn(ocelot, 0, addr, vid, ENTRYTYPE_MACv4);
+}
+
+static int ocelot_port_obj_add(struct net_device *dev,
+			       const struct switchdev_obj *obj,
+			       struct switchdev_trans *trans)
+{
+	int ret = 0;
+
+	switch (obj->id) {
+	case SWITCHDEV_OBJ_ID_PORT_MDB:
+		ret = ocelot_port_obj_add_mdb(dev, SWITCHDEV_OBJ_PORT_MDB(obj),
+					      trans);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return ret;
+}
+
+static int ocelot_port_obj_del(struct net_device *dev,
+			       const struct switchdev_obj *obj)
+{
+	int ret = 0;
+
+	switch (obj->id) {
+	case SWITCHDEV_OBJ_ID_PORT_MDB:
+		ret = ocelot_port_obj_del_mdb(dev, SWITCHDEV_OBJ_PORT_MDB(obj));
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return ret;
+}
+
+static const struct switchdev_ops ocelot_port_switchdev_ops = {
+	.switchdev_port_attr_get	= ocelot_port_attr_get,
+	.switchdev_port_attr_set	= ocelot_port_attr_set,
+	.switchdev_port_obj_add		= ocelot_port_obj_add,
+	.switchdev_port_obj_del		= ocelot_port_obj_del,
+};
+
+static int ocelot_port_bridge_join(struct ocelot_port *ocelot_port,
+				   struct net_device *bridge)
+{
+	struct ocelot *ocelot = ocelot_port->ocelot;
+
+	if (!ocelot->bridge_mask) {
+		ocelot->hw_bridge_dev = bridge;
+	} else {
+		if (ocelot->hw_bridge_dev != bridge)
+			/* This is adding the port to a second bridge, this is
+			 * unsupported */
+			return -ENODEV;
+	}
+
+	ocelot->bridge_mask |= BIT(ocelot_port->chip_port);
+
+	return 0;
+}
+
+static void ocelot_port_bridge_leave(struct ocelot_port *ocelot_port,
+				     struct net_device *bridge)
+{
+	struct ocelot *ocelot = ocelot_port->ocelot;
+
+	ocelot->bridge_mask &= ~BIT(ocelot_port->chip_port);
+
+	if (!ocelot->bridge_mask)
+		ocelot->hw_bridge_dev = NULL;
+}
+
+/* Checks if the net_device instance given to us originate from our driver. */
+static bool ocelot_netdevice_dev_check(const struct net_device *dev)
+{
+	return dev->netdev_ops == &ocelot_port_netdev_ops;
+}
+
+static int ocelot_netdevice_port_event(struct net_device *dev,
+				       unsigned long event,
+				       struct netdev_notifier_changeupper_info *info)
+{
+	struct ocelot_port *ocelot_port = netdev_priv(dev);
+	int err = 0;
+
+	if (!ocelot_netdevice_dev_check(dev))
+		return 0;
+
+	switch (event) {
+	case NETDEV_CHANGEUPPER:
+		if (netif_is_bridge_master(info->upper_dev)) {
+			if (info->linking)
+				err = ocelot_port_bridge_join(ocelot_port,
+							      info->upper_dev);
+			else
+				ocelot_port_bridge_leave(ocelot_port,
+							 info->upper_dev);
+		}
+		break;
+	default:
+		break;
+	}
+
+	return err;
+}
+
+static int ocelot_netdevice_event(struct notifier_block *unused,
+				  unsigned long event, void *ptr)
+{
+	struct netdev_notifier_changeupper_info *info = ptr;
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	int ret;
+
+	if (netif_is_lag_master(dev)) {
+		struct net_device *slave;
+		struct list_head *iter;
+
+		netdev_for_each_lower_dev(dev, slave, iter) {
+			ret = ocelot_netdevice_port_event(slave, event, info);
+			if (ret)
+				goto notify;
+		}
+	} else {
+		ret = ocelot_netdevice_port_event(dev, event, info);
+	}
+
+notify:
+	return notifier_from_errno(ret);
+}
+
+struct notifier_block ocelot_netdevice_nb __read_mostly = {
+	.notifier_call = ocelot_netdevice_event,
+};
+EXPORT_SYMBOL(ocelot_netdevice_nb);
+
+int ocelot_probe_port(struct ocelot *ocelot, u8 port,
+		      void __iomem *regs,
+		      struct phy_device *phy)
+{
+	struct ocelot_port *ocelot_port;
+	struct net_device *dev;
+	int err;
+
+	dev = alloc_etherdev(sizeof(struct ocelot_port));
+	if (!dev)
+		return -ENOMEM;
+	SET_NETDEV_DEV(dev, ocelot->dev);
+	ocelot_port = netdev_priv(dev);
+	ocelot_port->dev = dev;
+	ocelot_port->ocelot = ocelot;
+	ocelot_port->regs = regs;
+	ocelot_port->chip_port = port;
+	ocelot_port->phy = phy;
+	INIT_LIST_HEAD(&ocelot_port->mc);
+	ocelot->ports[port] = ocelot_port;
+
+	dev->netdev_ops = &ocelot_port_netdev_ops;
+	dev->ethtool_ops = &ocelot_ethtool_ops;
+	dev->switchdev_ops = &ocelot_port_switchdev_ops;
+
+	memcpy(dev->dev_addr, ocelot->base_mac, ETH_ALEN);
+	dev->dev_addr[ETH_ALEN - 1] += port;
+	ocelot_mact_learn(ocelot, PGID_CPU, dev->dev_addr, ocelot_port->pvid,
+			  ENTRYTYPE_LOCKED);
+
+	err = register_netdev(dev);
+	if (err) {
+		dev_err(ocelot->dev, "register_netdev failed\n");
+		goto err_register_netdev;
+	}
+
+	return 0;
+
+err_register_netdev:
+	free_netdev(dev);
+	return err;
+}
+EXPORT_SYMBOL(ocelot_probe_port);
+
+int ocelot_init(struct ocelot *ocelot)
+{
+	u32 port;
+	int i, cpu = ocelot->num_phys_ports;
+	char queue_name[32];
+
+	ocelot->stats = devm_kcalloc(ocelot->dev,
+				     ocelot->num_phys_ports * ocelot->num_stats,
+				     sizeof(u64), GFP_KERNEL);
+	if (!ocelot->stats)
+		return -ENOMEM;
+
+	mutex_init(&ocelot->stats_lock);
+	snprintf(queue_name, sizeof(queue_name), "%s-stats",
+		 dev_name(ocelot->dev));
+	ocelot->stats_queue = create_singlethread_workqueue(queue_name);
+	if (!ocelot->stats_queue)
+		return -ENOMEM;
+
+	ocelot_mact_init(ocelot);
+	ocelot_vlan_init(ocelot);
+
+	for (port = 0; port < ocelot->num_phys_ports; port++) {
+		/* Clear all counters (5 groups) */
+		ocelot_write(ocelot, SYS_STAT_CFG_STAT_VIEW(port) |
+				     SYS_STAT_CFG_STAT_CLEAR_SHOT(0x7f),
+			     SYS_STAT_CFG);
+	}
+
+	/* Only use S-Tag */
+	ocelot_write(ocelot, ETH_P_8021AD, SYS_VLAN_ETYPE_CFG);
+
+	/* Aggregation mode */
+	ocelot_write(ocelot, ANA_AGGR_CFG_AC_SMAC_ENA |
+			     ANA_AGGR_CFG_AC_DMAC_ENA |
+			     ANA_AGGR_CFG_AC_IP4_SIPDIP_ENA |
+			     ANA_AGGR_CFG_AC_IP4_TCPUDP_ENA, ANA_AGGR_CFG);
+
+	/* Set MAC age time to default value. The entry is aged after
+	 * 2*AGE_PERIOD
+	 */
+	ocelot_write(ocelot,
+		     ANA_AUTOAGE_AGE_PERIOD(BR_DEFAULT_AGEING_TIME / 2 / HZ),
+		     ANA_AUTOAGE);
+
+	/* Disable learning for frames discarded by VLAN ingress filtering */
+	regmap_field_write(ocelot->regfields[ANA_ADVLEARN_VLAN_CHK], 1);
+
+	/* Setup frame ageing - fixed value "2 sec" - in 6.5 us units */
+	ocelot_write(ocelot, SYS_FRM_AGING_AGE_TX_ENA |
+		     SYS_FRM_AGING_MAX_AGE(307692), SYS_FRM_AGING);
+
+	/* Setup flooding PGIDs */
+	ocelot_write_rix(ocelot, ANA_FLOODING_FLD_MULTICAST(PGID_MC) |
+			 ANA_FLOODING_FLD_BROADCAST(PGID_MC) |
+			 ANA_FLOODING_FLD_UNICAST(PGID_UC),
+			 ANA_FLOODING, 0);
+	ocelot_write(ocelot, ANA_FLOODING_IPMC_FLD_MC6_DATA(PGID_MCIPV6) |
+		     ANA_FLOODING_IPMC_FLD_MC6_CTRL(PGID_MC) |
+		     ANA_FLOODING_IPMC_FLD_MC4_DATA(PGID_MCIPV4) |
+		     ANA_FLOODING_IPMC_FLD_MC4_CTRL(PGID_MC),
+		     ANA_FLOODING_IPMC);
+
+	for (port = 0; port < ocelot->num_phys_ports; port++) {
+		/* Transmit the frame to the local port. */
+		ocelot_write_rix(ocelot, BIT(port), ANA_PGID_PGID, port);
+		/* Do not forward BPDU frames to the front ports. */
+		ocelot_write_gix(ocelot,
+				 ANA_PORT_CPU_FWD_BPDU_CFG_BPDU_REDIR_ENA(0xffff),
+				 ANA_PORT_CPU_FWD_BPDU_CFG,
+				 port);
+		/* Ensure bridging is disabled */
+		ocelot_write_rix(ocelot, 0, ANA_PGID_PGID, PGID_SRC + port);
+	}
+
+	/* Configure and enable the CPU port. */
+	ocelot_write_rix(ocelot, 0, ANA_PGID_PGID, cpu);
+	ocelot_write_rix(ocelot, BIT(cpu), ANA_PGID_PGID, PGID_CPU);
+	ocelot_write_gix(ocelot, ANA_PORT_PORT_CFG_RECV_ENA |
+			 ANA_PORT_PORT_CFG_PORTID_VAL(cpu),
+			 ANA_PORT_PORT_CFG, cpu);
+
+	/* Allow broadcast MAC frames. */
+	for (i = ocelot->num_phys_ports + 1; i < PGID_CPU; i++) {
+		u32 val = ANA_PGID_PGID_PGID(GENMASK(ocelot->num_phys_ports - 1, 0));
+
+		ocelot_write_rix(ocelot, val, ANA_PGID_PGID, i);
+	}
+	ocelot_write_rix(ocelot,
+			 ANA_PGID_PGID_PGID(GENMASK(ocelot->num_phys_ports, 0)),
+			 ANA_PGID_PGID, PGID_MC);
+	ocelot_write_rix(ocelot, 0, ANA_PGID_PGID, PGID_MCIPV4);
+	ocelot_write_rix(ocelot, 0, ANA_PGID_PGID, PGID_MCIPV6);
+
+	/* CPU port Injection/Extraction configuration */
+	ocelot_write_rix(ocelot, QSYS_SWITCH_PORT_MODE_INGRESS_DROP_MODE |
+			 QSYS_SWITCH_PORT_MODE_SCH_NEXT_CFG(1) |
+			 QSYS_SWITCH_PORT_MODE_PORT_ENA,
+			 QSYS_SWITCH_PORT_MODE, cpu);
+	ocelot_write_rix(ocelot, SYS_PORT_MODE_INCL_XTR_HDR(1) |
+			 SYS_PORT_MODE_INCL_INJ_HDR(1), SYS_PORT_MODE, cpu);
+	/* Allow manual injection via DEVCPU_QS registers, and byte swap these
+	 * registers endianness.
+	 */
+	ocelot_write_rix(ocelot, QS_INJ_GRP_CFG_BYTE_SWAP |
+			 QS_INJ_GRP_CFG_MODE(1), QS_INJ_GRP_CFG, 0);
+	ocelot_write_rix(ocelot, QS_XTR_GRP_CFG_BYTE_SWAP |
+			 QS_XTR_GRP_CFG_MODE(1), QS_XTR_GRP_CFG, 0);
+	ocelot_write(ocelot, ANA_CPUQ_CFG_CPUQ_MIRROR(2) |
+		     ANA_CPUQ_CFG_CPUQ_LRN(2) |
+		     ANA_CPUQ_CFG_CPUQ_MAC_COPY(2) |
+		     ANA_CPUQ_CFG_CPUQ_SRC_COPY(2) |
+		     ANA_CPUQ_CFG_CPUQ_LOCKED_PORTMOVE(2) |
+		     ANA_CPUQ_CFG_CPUQ_ALLBRIDGE(6) |
+		     ANA_CPUQ_CFG_CPUQ_IPMC_CTRL(6) |
+		     ANA_CPUQ_CFG_CPUQ_IGMP(6) |
+		     ANA_CPUQ_CFG_CPUQ_MLD(6), ANA_CPUQ_CFG);
+	for (i = 0; i < 16; i++)
+		ocelot_write_rix(ocelot, ANA_CPUQ_8021_CFG_CPUQ_GARP_VAL(6) |
+				 ANA_CPUQ_8021_CFG_CPUQ_BPDU_VAL(6),
+				 ANA_CPUQ_8021_CFG, i);
+
+	INIT_DELAYED_WORK(&ocelot->stats_work, ocelot_check_stats);
+	queue_delayed_work(ocelot->stats_queue, &ocelot->stats_work,
+			   OCELOT_STATS_CHECK_DELAY);
+	return 0;
+}
+EXPORT_SYMBOL(ocelot_init);
+
+void ocelot_deinit(struct ocelot *ocelot)
+{
+	destroy_workqueue(ocelot->stats_queue);
+	mutex_destroy(&ocelot->stats_lock);
+}
+EXPORT_SYMBOL(ocelot_deinit);
+
+MODULE_LICENSE("Dual MIT/GPL");
diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h
new file mode 100644
index 000000000000..097bd12a10d4
--- /dev/null
+++ b/drivers/net/ethernet/mscc/ocelot.h
@@ -0,0 +1,572 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */
+/*
+ * Microsemi Ocelot Switch driver
+ *
+ * Copyright (c) 2017 Microsemi Corporation
+ */
+
+#ifndef _MSCC_OCELOT_H_
+#define _MSCC_OCELOT_H_
+
+#include <linux/bitops.h>
+#include <linux/etherdevice.h>
+#include <linux/if_vlan.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+
+#include "ocelot_ana.h"
+#include "ocelot_dev.h"
+#include "ocelot_hsio.h"
+#include "ocelot_qsys.h"
+#include "ocelot_rew.h"
+#include "ocelot_sys.h"
+#include "ocelot_qs.h"
+
+#define PGID_AGGR    64
+#define PGID_SRC     80
+
+/* Reserved PGIDs */
+#define PGID_CPU     (PGID_AGGR - 5)
+#define PGID_UC      (PGID_AGGR - 4)
+#define PGID_MC      (PGID_AGGR - 3)
+#define PGID_MCIPV4  (PGID_AGGR - 2)
+#define PGID_MCIPV6  (PGID_AGGR - 1)
+
+#define OCELOT_BUFFER_CELL_SZ 60
+
+#define OCELOT_STATS_CHECK_DELAY (2 * HZ)
+
+#define IFH_LEN 4
+
+struct frame_info {
+	u32 len;
+	u16 port;
+	u16 vid;
+	u8 cpuq;
+	u8 tag_type;
+};
+
+#define IFH_INJ_BYPASS	BIT(31)
+#define IFH_INJ_POP_CNT_DISABLE (3 << 28)
+
+#define IFH_TAG_TYPE_C 0
+#define IFH_TAG_TYPE_S 1
+
+#define OCELOT_SPEED_2500 0
+#define OCELOT_SPEED_1000 1
+#define OCELOT_SPEED_100  2
+#define OCELOT_SPEED_10   3
+
+#define TARGET_OFFSET 24
+#define REG_MASK GENMASK(TARGET_OFFSET - 1, 0)
+#define REG(reg, offset) [reg & REG_MASK] = offset
+
+enum ocelot_target {
+	ANA = 1,
+	QS,
+	QSYS,
+	REW,
+	SYS,
+	HSIO,
+	TARGET_MAX,
+};
+
+enum ocelot_reg {
+	ANA_ADVLEARN = ANA << TARGET_OFFSET,
+	ANA_VLANMASK,
+	ANA_PORT_B_DOMAIN,
+	ANA_ANAGEFIL,
+	ANA_ANEVENTS,
+	ANA_STORMLIMIT_BURST,
+	ANA_STORMLIMIT_CFG,
+	ANA_ISOLATED_PORTS,
+	ANA_COMMUNITY_PORTS,
+	ANA_AUTOAGE,
+	ANA_MACTOPTIONS,
+	ANA_LEARNDISC,
+	ANA_AGENCTRL,
+	ANA_MIRRORPORTS,
+	ANA_EMIRRORPORTS,
+	ANA_FLOODING,
+	ANA_FLOODING_IPMC,
+	ANA_SFLOW_CFG,
+	ANA_PORT_MODE,
+	ANA_CUT_THRU_CFG,
+	ANA_PGID_PGID,
+	ANA_TABLES_ANMOVED,
+	ANA_TABLES_MACHDATA,
+	ANA_TABLES_MACLDATA,
+	ANA_TABLES_STREAMDATA,
+	ANA_TABLES_MACACCESS,
+	ANA_TABLES_MACTINDX,
+	ANA_TABLES_VLANACCESS,
+	ANA_TABLES_VLANTIDX,
+	ANA_TABLES_ISDXACCESS,
+	ANA_TABLES_ISDXTIDX,
+	ANA_TABLES_ENTRYLIM,
+	ANA_TABLES_PTP_ID_HIGH,
+	ANA_TABLES_PTP_ID_LOW,
+	ANA_TABLES_STREAMACCESS,
+	ANA_TABLES_STREAMTIDX,
+	ANA_TABLES_SEQ_HISTORY,
+	ANA_TABLES_SEQ_MASK,
+	ANA_TABLES_SFID_MASK,
+	ANA_TABLES_SFIDACCESS,
+	ANA_TABLES_SFIDTIDX,
+	ANA_MSTI_STATE,
+	ANA_OAM_UPM_LM_CNT,
+	ANA_SG_ACCESS_CTRL,
+	ANA_SG_CONFIG_REG_1,
+	ANA_SG_CONFIG_REG_2,
+	ANA_SG_CONFIG_REG_3,
+	ANA_SG_CONFIG_REG_4,
+	ANA_SG_CONFIG_REG_5,
+	ANA_SG_GCL_GS_CONFIG,
+	ANA_SG_GCL_TI_CONFIG,
+	ANA_SG_STATUS_REG_1,
+	ANA_SG_STATUS_REG_2,
+	ANA_SG_STATUS_REG_3,
+	ANA_PORT_VLAN_CFG,
+	ANA_PORT_DROP_CFG,
+	ANA_PORT_QOS_CFG,
+	ANA_PORT_VCAP_CFG,
+	ANA_PORT_VCAP_S1_KEY_CFG,
+	ANA_PORT_VCAP_S2_CFG,
+	ANA_PORT_PCP_DEI_MAP,
+	ANA_PORT_CPU_FWD_CFG,
+	ANA_PORT_CPU_FWD_BPDU_CFG,
+	ANA_PORT_CPU_FWD_GARP_CFG,
+	ANA_PORT_CPU_FWD_CCM_CFG,
+	ANA_PORT_PORT_CFG,
+	ANA_PORT_POL_CFG,
+	ANA_PORT_PTP_CFG,
+	ANA_PORT_PTP_DLY1_CFG,
+	ANA_PORT_PTP_DLY2_CFG,
+	ANA_PORT_SFID_CFG,
+	ANA_PFC_PFC_CFG,
+	ANA_PFC_PFC_TIMER,
+	ANA_IPT_OAM_MEP_CFG,
+	ANA_IPT_IPT,
+	ANA_PPT_PPT,
+	ANA_FID_MAP_FID_MAP,
+	ANA_AGGR_CFG,
+	ANA_CPUQ_CFG,
+	ANA_CPUQ_CFG2,
+	ANA_CPUQ_8021_CFG,
+	ANA_DSCP_CFG,
+	ANA_DSCP_REWR_CFG,
+	ANA_VCAP_RNG_TYPE_CFG,
+	ANA_VCAP_RNG_VAL_CFG,
+	ANA_VRAP_CFG,
+	ANA_VRAP_HDR_DATA,
+	ANA_VRAP_HDR_MASK,
+	ANA_DISCARD_CFG,
+	ANA_FID_CFG,
+	ANA_POL_PIR_CFG,
+	ANA_POL_CIR_CFG,
+	ANA_POL_MODE_CFG,
+	ANA_POL_PIR_STATE,
+	ANA_POL_CIR_STATE,
+	ANA_POL_STATE,
+	ANA_POL_FLOWC,
+	ANA_POL_HYST,
+	ANA_POL_MISC_CFG,
+	QS_XTR_GRP_CFG = QS << TARGET_OFFSET,
+	QS_XTR_RD,
+	QS_XTR_FRM_PRUNING,
+	QS_XTR_FLUSH,
+	QS_XTR_DATA_PRESENT,
+	QS_XTR_CFG,
+	QS_INJ_GRP_CFG,
+	QS_INJ_WR,
+	QS_INJ_CTRL,
+	QS_INJ_STATUS,
+	QS_INJ_ERR,
+	QS_INH_DBG,
+	QSYS_PORT_MODE = QSYS << TARGET_OFFSET,
+	QSYS_SWITCH_PORT_MODE,
+	QSYS_STAT_CNT_CFG,
+	QSYS_EEE_CFG,
+	QSYS_EEE_THRES,
+	QSYS_IGR_NO_SHARING,
+	QSYS_EGR_NO_SHARING,
+	QSYS_SW_STATUS,
+	QSYS_EXT_CPU_CFG,
+	QSYS_PAD_CFG,
+	QSYS_CPU_GROUP_MAP,
+	QSYS_QMAP,
+	QSYS_ISDX_SGRP,
+	QSYS_TIMED_FRAME_ENTRY,
+	QSYS_TFRM_MISC,
+	QSYS_TFRM_PORT_DLY,
+	QSYS_TFRM_TIMER_CFG_1,
+	QSYS_TFRM_TIMER_CFG_2,
+	QSYS_TFRM_TIMER_CFG_3,
+	QSYS_TFRM_TIMER_CFG_4,
+	QSYS_TFRM_TIMER_CFG_5,
+	QSYS_TFRM_TIMER_CFG_6,
+	QSYS_TFRM_TIMER_CFG_7,
+	QSYS_TFRM_TIMER_CFG_8,
+	QSYS_RED_PROFILE,
+	QSYS_RES_QOS_MODE,
+	QSYS_RES_CFG,
+	QSYS_RES_STAT,
+	QSYS_EGR_DROP_MODE,
+	QSYS_EQ_CTRL,
+	QSYS_EVENTS_CORE,
+	QSYS_QMAXSDU_CFG_0,
+	QSYS_QMAXSDU_CFG_1,
+	QSYS_QMAXSDU_CFG_2,
+	QSYS_QMAXSDU_CFG_3,
+	QSYS_QMAXSDU_CFG_4,
+	QSYS_QMAXSDU_CFG_5,
+	QSYS_QMAXSDU_CFG_6,
+	QSYS_QMAXSDU_CFG_7,
+	QSYS_PREEMPTION_CFG,
+	QSYS_CIR_CFG,
+	QSYS_EIR_CFG,
+	QSYS_SE_CFG,
+	QSYS_SE_DWRR_CFG,
+	QSYS_SE_CONNECT,
+	QSYS_SE_DLB_SENSE,
+	QSYS_CIR_STATE,
+	QSYS_EIR_STATE,
+	QSYS_SE_STATE,
+	QSYS_HSCH_MISC_CFG,
+	QSYS_TAG_CONFIG,
+	QSYS_TAS_PARAM_CFG_CTRL,
+	QSYS_PORT_MAX_SDU,
+	QSYS_PARAM_CFG_REG_1,
+	QSYS_PARAM_CFG_REG_2,
+	QSYS_PARAM_CFG_REG_3,
+	QSYS_PARAM_CFG_REG_4,
+	QSYS_PARAM_CFG_REG_5,
+	QSYS_GCL_CFG_REG_1,
+	QSYS_GCL_CFG_REG_2,
+	QSYS_PARAM_STATUS_REG_1,
+	QSYS_PARAM_STATUS_REG_2,
+	QSYS_PARAM_STATUS_REG_3,
+	QSYS_PARAM_STATUS_REG_4,
+	QSYS_PARAM_STATUS_REG_5,
+	QSYS_PARAM_STATUS_REG_6,
+	QSYS_PARAM_STATUS_REG_7,
+	QSYS_PARAM_STATUS_REG_8,
+	QSYS_PARAM_STATUS_REG_9,
+	QSYS_GCL_STATUS_REG_1,
+	QSYS_GCL_STATUS_REG_2,
+	REW_PORT_VLAN_CFG = REW << TARGET_OFFSET,
+	REW_TAG_CFG,
+	REW_PORT_CFG,
+	REW_DSCP_CFG,
+	REW_PCP_DEI_QOS_MAP_CFG,
+	REW_PTP_CFG,
+	REW_PTP_DLY1_CFG,
+	REW_RED_TAG_CFG,
+	REW_DSCP_REMAP_DP1_CFG,
+	REW_DSCP_REMAP_CFG,
+	REW_STAT_CFG,
+	REW_REW_STICKY,
+	REW_PPT,
+	SYS_COUNT_RX_OCTETS = SYS << TARGET_OFFSET,
+	SYS_COUNT_RX_UNICAST,
+	SYS_COUNT_RX_MULTICAST,
+	SYS_COUNT_RX_BROADCAST,
+	SYS_COUNT_RX_SHORTS,
+	SYS_COUNT_RX_FRAGMENTS,
+	SYS_COUNT_RX_JABBERS,
+	SYS_COUNT_RX_CRC_ALIGN_ERRS,
+	SYS_COUNT_RX_SYM_ERRS,
+	SYS_COUNT_RX_64,
+	SYS_COUNT_RX_65_127,
+	SYS_COUNT_RX_128_255,
+	SYS_COUNT_RX_256_1023,
+	SYS_COUNT_RX_1024_1526,
+	SYS_COUNT_RX_1527_MAX,
+	SYS_COUNT_RX_PAUSE,
+	SYS_COUNT_RX_CONTROL,
+	SYS_COUNT_RX_LONGS,
+	SYS_COUNT_RX_CLASSIFIED_DROPS,
+	SYS_COUNT_TX_OCTETS,
+	SYS_COUNT_TX_UNICAST,
+	SYS_COUNT_TX_MULTICAST,
+	SYS_COUNT_TX_BROADCAST,
+	SYS_COUNT_TX_COLLISION,
+	SYS_COUNT_TX_DROPS,
+	SYS_COUNT_TX_PAUSE,
+	SYS_COUNT_TX_64,
+	SYS_COUNT_TX_65_127,
+	SYS_COUNT_TX_128_511,
+	SYS_COUNT_TX_512_1023,
+	SYS_COUNT_TX_1024_1526,
+	SYS_COUNT_TX_1527_MAX,
+	SYS_COUNT_TX_AGING,
+	SYS_RESET_CFG,
+	SYS_SR_ETYPE_CFG,
+	SYS_VLAN_ETYPE_CFG,
+	SYS_PORT_MODE,
+	SYS_FRONT_PORT_MODE,
+	SYS_FRM_AGING,
+	SYS_STAT_CFG,
+	SYS_SW_STATUS,
+	SYS_MISC_CFG,
+	SYS_REW_MAC_HIGH_CFG,
+	SYS_REW_MAC_LOW_CFG,
+	SYS_TIMESTAMP_OFFSET,
+	SYS_CMID,
+	SYS_PAUSE_CFG,
+	SYS_PAUSE_TOT_CFG,
+	SYS_ATOP,
+	SYS_ATOP_TOT_CFG,
+	SYS_MAC_FC_CFG,
+	SYS_MMGT,
+	SYS_MMGT_FAST,
+	SYS_EVENTS_DIF,
+	SYS_EVENTS_CORE,
+	SYS_CNT,
+	SYS_PTP_STATUS,
+	SYS_PTP_TXSTAMP,
+	SYS_PTP_NXT,
+	SYS_PTP_CFG,
+	SYS_RAM_INIT,
+	SYS_CM_ADDR,
+	SYS_CM_DATA_WR,
+	SYS_CM_DATA_RD,
+	SYS_CM_OP,
+	SYS_CM_DATA,
+	HSIO_PLL5G_CFG0 = HSIO << TARGET_OFFSET,
+	HSIO_PLL5G_CFG1,
+	HSIO_PLL5G_CFG2,
+	HSIO_PLL5G_CFG3,
+	HSIO_PLL5G_CFG4,
+	HSIO_PLL5G_CFG5,
+	HSIO_PLL5G_CFG6,
+	HSIO_PLL5G_STATUS0,
+	HSIO_PLL5G_STATUS1,
+	HSIO_PLL5G_BIST_CFG0,
+	HSIO_PLL5G_BIST_CFG1,
+	HSIO_PLL5G_BIST_CFG2,
+	HSIO_PLL5G_BIST_STAT0,
+	HSIO_PLL5G_BIST_STAT1,
+	HSIO_RCOMP_CFG0,
+	HSIO_RCOMP_STATUS,
+	HSIO_SYNC_ETH_CFG,
+	HSIO_SYNC_ETH_PLL_CFG,
+	HSIO_S1G_DES_CFG,
+	HSIO_S1G_IB_CFG,
+	HSIO_S1G_OB_CFG,
+	HSIO_S1G_SER_CFG,
+	HSIO_S1G_COMMON_CFG,
+	HSIO_S1G_PLL_CFG,
+	HSIO_S1G_PLL_STATUS,
+	HSIO_S1G_DFT_CFG0,
+	HSIO_S1G_DFT_CFG1,
+	HSIO_S1G_DFT_CFG2,
+	HSIO_S1G_TP_CFG,
+	HSIO_S1G_RC_PLL_BIST_CFG,
+	HSIO_S1G_MISC_CFG,
+	HSIO_S1G_DFT_STATUS,
+	HSIO_S1G_MISC_STATUS,
+	HSIO_MCB_S1G_ADDR_CFG,
+	HSIO_S6G_DIG_CFG,
+	HSIO_S6G_DFT_CFG0,
+	HSIO_S6G_DFT_CFG1,
+	HSIO_S6G_DFT_CFG2,
+	HSIO_S6G_TP_CFG0,
+	HSIO_S6G_TP_CFG1,
+	HSIO_S6G_RC_PLL_BIST_CFG,
+	HSIO_S6G_MISC_CFG,
+	HSIO_S6G_OB_ANEG_CFG,
+	HSIO_S6G_DFT_STATUS,
+	HSIO_S6G_ERR_CNT,
+	HSIO_S6G_MISC_STATUS,
+	HSIO_S6G_DES_CFG,
+	HSIO_S6G_IB_CFG,
+	HSIO_S6G_IB_CFG1,
+	HSIO_S6G_IB_CFG2,
+	HSIO_S6G_IB_CFG3,
+	HSIO_S6G_IB_CFG4,
+	HSIO_S6G_IB_CFG5,
+	HSIO_S6G_OB_CFG,
+	HSIO_S6G_OB_CFG1,
+	HSIO_S6G_SER_CFG,
+	HSIO_S6G_COMMON_CFG,
+	HSIO_S6G_PLL_CFG,
+	HSIO_S6G_ACJTAG_CFG,
+	HSIO_S6G_GP_CFG,
+	HSIO_S6G_IB_STATUS0,
+	HSIO_S6G_IB_STATUS1,
+	HSIO_S6G_ACJTAG_STATUS,
+	HSIO_S6G_PLL_STATUS,
+	HSIO_S6G_REVID,
+	HSIO_MCB_S6G_ADDR_CFG,
+	HSIO_HW_CFG,
+	HSIO_HW_QSGMII_CFG,
+	HSIO_HW_QSGMII_STAT,
+	HSIO_CLK_CFG,
+	HSIO_TEMP_SENSOR_CTRL,
+	HSIO_TEMP_SENSOR_CFG,
+	HSIO_TEMP_SENSOR_STAT,
+};
+
+enum ocelot_regfield {
+	ANA_ADVLEARN_VLAN_CHK,
+	ANA_ADVLEARN_LEARN_MIRROR,
+	ANA_ANEVENTS_FLOOD_DISCARD,
+	ANA_ANEVENTS_MSTI_DROP,
+	ANA_ANEVENTS_ACLKILL,
+	ANA_ANEVENTS_ACLUSED,
+	ANA_ANEVENTS_AUTOAGE,
+	ANA_ANEVENTS_VS2TTL1,
+	ANA_ANEVENTS_STORM_DROP,
+	ANA_ANEVENTS_LEARN_DROP,
+	ANA_ANEVENTS_AGED_ENTRY,
+	ANA_ANEVENTS_CPU_LEARN_FAILED,
+	ANA_ANEVENTS_AUTO_LEARN_FAILED,
+	ANA_ANEVENTS_LEARN_REMOVE,
+	ANA_ANEVENTS_AUTO_LEARNED,
+	ANA_ANEVENTS_AUTO_MOVED,
+	ANA_ANEVENTS_DROPPED,
+	ANA_ANEVENTS_CLASSIFIED_DROP,
+	ANA_ANEVENTS_CLASSIFIED_COPY,
+	ANA_ANEVENTS_VLAN_DISCARD,
+	ANA_ANEVENTS_FWD_DISCARD,
+	ANA_ANEVENTS_MULTICAST_FLOOD,
+	ANA_ANEVENTS_UNICAST_FLOOD,
+	ANA_ANEVENTS_DEST_KNOWN,
+	ANA_ANEVENTS_BUCKET3_MATCH,
+	ANA_ANEVENTS_BUCKET2_MATCH,
+	ANA_ANEVENTS_BUCKET1_MATCH,
+	ANA_ANEVENTS_BUCKET0_MATCH,
+	ANA_ANEVENTS_CPU_OPERATION,
+	ANA_ANEVENTS_DMAC_LOOKUP,
+	ANA_ANEVENTS_SMAC_LOOKUP,
+	ANA_ANEVENTS_SEQ_GEN_ERR_0,
+	ANA_ANEVENTS_SEQ_GEN_ERR_1,
+	ANA_TABLES_MACACCESS_B_DOM,
+	ANA_TABLES_MACTINDX_BUCKET,
+	ANA_TABLES_MACTINDX_M_INDEX,
+	QSYS_TIMED_FRAME_ENTRY_TFRM_VLD,
+	QSYS_TIMED_FRAME_ENTRY_TFRM_FP,
+	QSYS_TIMED_FRAME_ENTRY_TFRM_PORTNO,
+	QSYS_TIMED_FRAME_ENTRY_TFRM_TM_SEL,
+	QSYS_TIMED_FRAME_ENTRY_TFRM_TM_T,
+	SYS_RESET_CFG_CORE_ENA,
+	SYS_RESET_CFG_MEM_ENA,
+	SYS_RESET_CFG_MEM_INIT,
+	REGFIELD_MAX
+};
+
+struct ocelot_multicast {
+	struct list_head list;
+	unsigned char addr[ETH_ALEN];
+	u16 vid;
+	u16 ports;
+};
+
+struct ocelot_port;
+
+struct ocelot_stat_layout {
+	u32 offset;
+	char name[ETH_GSTRING_LEN];
+};
+
+struct ocelot {
+	struct device *dev;
+
+	struct regmap *targets[TARGET_MAX];
+	struct regmap_field *regfields[REGFIELD_MAX];
+	const u32 *const *map;
+	const struct ocelot_stat_layout *stats_layout;
+	unsigned int num_stats;
+
+	u8 base_mac[ETH_ALEN];
+
+	struct net_device *hw_bridge_dev;
+	u16 bridge_mask;
+	u16 bridge_fwd_mask;
+
+	struct workqueue_struct *ocelot_owq;
+
+	int shared_queue_sz;
+
+	u8 num_phys_ports;
+	u8 num_cpu_ports;
+	struct ocelot_port **ports;
+
+	u16 lags[16];
+
+	/* Keep track of the vlan port masks */
+	u32 vlan_mask[VLAN_N_VID];
+
+	struct list_head multicast;
+
+	/* Workqueue to check statistics for overflow with its lock */
+	struct mutex stats_lock;
+	u64 *stats;
+	struct delayed_work stats_work;
+	struct workqueue_struct *stats_queue;
+};
+
+struct ocelot_port {
+	struct net_device *dev;
+	struct ocelot *ocelot;
+	struct phy_device *phy;
+	void __iomem *regs;
+	u8 chip_port;
+	/* Keep a track of the mc addresses added to the mac table, so that they
+	 * can be removed when needed.
+	 */
+	struct list_head mc;
+
+	/* Ingress default VLAN (pvid) */
+	u16 pvid;
+
+	/* Egress default VLAN (vid) */
+	u16 vid;
+
+	u8 vlan_aware;
+
+	u64 *stats;
+};
+
+u32 __ocelot_read_ix(struct ocelot *ocelot, u32 reg, u32 offset);
+#define ocelot_read_ix(ocelot, reg, gi, ri) __ocelot_read_ix(ocelot, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri))
+#define ocelot_read_gix(ocelot, reg, gi) __ocelot_read_ix(ocelot, reg, reg##_GSZ * (gi))
+#define ocelot_read_rix(ocelot, reg, ri) __ocelot_read_ix(ocelot, reg, reg##_RSZ * (ri))
+#define ocelot_read(ocelot, reg) __ocelot_read_ix(ocelot, reg, 0)
+
+void __ocelot_write_ix(struct ocelot *ocelot, u32 val, u32 reg, u32 offset);
+#define ocelot_write_ix(ocelot, val, reg, gi, ri) __ocelot_write_ix(ocelot, val, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri))
+#define ocelot_write_gix(ocelot, val, reg, gi) __ocelot_write_ix(ocelot, val, reg, reg##_GSZ * (gi))
+#define ocelot_write_rix(ocelot, val, reg, ri) __ocelot_write_ix(ocelot, val, reg, reg##_RSZ * (ri))
+#define ocelot_write(ocelot, val, reg) __ocelot_write_ix(ocelot, val, reg, 0)
+
+void __ocelot_rmw_ix(struct ocelot *ocelot, u32 val, u32 reg, u32 mask,
+		     u32 offset);
+#define ocelot_rmw_ix(ocelot, val, m, reg, gi, ri) __ocelot_rmw_ix(ocelot, val, m, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri))
+#define ocelot_rmw_gix(ocelot, val, m, reg, gi) __ocelot_rmw_ix(ocelot, val, m, reg, reg##_GSZ * (gi))
+#define ocelot_rmw_rix(ocelot, val, m, reg, ri) __ocelot_rmw_ix(ocelot, val, m, reg, reg##_RSZ * (ri))
+#define ocelot_rmw(ocelot, val, m, reg) __ocelot_rmw_ix(ocelot, val, m, reg, 0)
+
+u32 ocelot_port_readl(struct ocelot_port *port, u32 reg);
+void ocelot_port_writel(struct ocelot_port *port, u32 val, u32 reg);
+
+int ocelot_regfields_init(struct ocelot *ocelot,
+			  const struct reg_field *const regfields);
+struct regmap *ocelot_io_platform_init(struct ocelot *ocelot,
+				       struct platform_device *pdev,
+				       const char *name);
+
+#define ocelot_field_write(ocelot, reg, val) regmap_field_write((ocelot)->regfields[(reg)], (val))
+#define ocelot_field_read(ocelot, reg, val) regmap_field_read((ocelot)->regfields[(reg)], (val))
+
+int ocelot_init(struct ocelot *ocelot);
+void ocelot_deinit(struct ocelot *ocelot);
+int ocelot_chip_init(struct ocelot *ocelot);
+int ocelot_probe_port(struct ocelot *ocelot, u8 port,
+		      void __iomem *regs,
+		      struct phy_device *phy);
+
+extern struct notifier_block ocelot_netdevice_nb;
+
+#endif
diff --git a/drivers/net/ethernet/mscc/ocelot_ana.h b/drivers/net/ethernet/mscc/ocelot_ana.h
new file mode 100644
index 000000000000..841c6ec22b64
--- /dev/null
+++ b/drivers/net/ethernet/mscc/ocelot_ana.h
@@ -0,0 +1,625 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */
+/*
+ * Microsemi Ocelot Switch driver
+ *
+ * Copyright (c) 2017 Microsemi Corporation
+ */
+
+#ifndef _MSCC_OCELOT_ANA_H_
+#define _MSCC_OCELOT_ANA_H_
+
+#define ANA_ANAGEFIL_B_DOM_EN                             BIT(22)
+#define ANA_ANAGEFIL_B_DOM_VAL                            BIT(21)
+#define ANA_ANAGEFIL_AGE_LOCKED                           BIT(20)
+#define ANA_ANAGEFIL_PID_EN                               BIT(19)
+#define ANA_ANAGEFIL_PID_VAL(x)                           (((x) << 14) & GENMASK(18, 14))
+#define ANA_ANAGEFIL_PID_VAL_M                            GENMASK(18, 14)
+#define ANA_ANAGEFIL_PID_VAL_X(x)                         (((x) & GENMASK(18, 14)) >> 14)
+#define ANA_ANAGEFIL_VID_EN                               BIT(13)
+#define ANA_ANAGEFIL_VID_VAL(x)                           ((x) & GENMASK(12, 0))
+#define ANA_ANAGEFIL_VID_VAL_M                            GENMASK(12, 0)
+
+#define ANA_STORMLIMIT_CFG_RSZ                            0x4
+
+#define ANA_STORMLIMIT_CFG_STORM_RATE(x)                  (((x) << 3) & GENMASK(6, 3))
+#define ANA_STORMLIMIT_CFG_STORM_RATE_M                   GENMASK(6, 3)
+#define ANA_STORMLIMIT_CFG_STORM_RATE_X(x)                (((x) & GENMASK(6, 3)) >> 3)
+#define ANA_STORMLIMIT_CFG_STORM_UNIT                     BIT(2)
+#define ANA_STORMLIMIT_CFG_STORM_MODE(x)                  ((x) & GENMASK(1, 0))
+#define ANA_STORMLIMIT_CFG_STORM_MODE_M                   GENMASK(1, 0)
+
+#define ANA_AUTOAGE_AGE_FAST                              BIT(21)
+#define ANA_AUTOAGE_AGE_PERIOD(x)                         (((x) << 1) & GENMASK(20, 1))
+#define ANA_AUTOAGE_AGE_PERIOD_M                          GENMASK(20, 1)
+#define ANA_AUTOAGE_AGE_PERIOD_X(x)                       (((x) & GENMASK(20, 1)) >> 1)
+#define ANA_AUTOAGE_AUTOAGE_LOCKED                        BIT(0)
+
+#define ANA_MACTOPTIONS_REDUCED_TABLE                     BIT(1)
+#define ANA_MACTOPTIONS_SHADOW                            BIT(0)
+
+#define ANA_AGENCTRL_FID_MASK(x)                          (((x) << 12) & GENMASK(23, 12))
+#define ANA_AGENCTRL_FID_MASK_M                           GENMASK(23, 12)
+#define ANA_AGENCTRL_FID_MASK_X(x)                        (((x) & GENMASK(23, 12)) >> 12)
+#define ANA_AGENCTRL_IGNORE_DMAC_FLAGS                    BIT(11)
+#define ANA_AGENCTRL_IGNORE_SMAC_FLAGS                    BIT(10)
+#define ANA_AGENCTRL_FLOOD_SPECIAL                        BIT(9)
+#define ANA_AGENCTRL_FLOOD_IGNORE_VLAN                    BIT(8)
+#define ANA_AGENCTRL_MIRROR_CPU                           BIT(7)
+#define ANA_AGENCTRL_LEARN_CPU_COPY                       BIT(6)
+#define ANA_AGENCTRL_LEARN_FWD_KILL                       BIT(5)
+#define ANA_AGENCTRL_LEARN_IGNORE_VLAN                    BIT(4)
+#define ANA_AGENCTRL_CPU_CPU_KILL_ENA                     BIT(3)
+#define ANA_AGENCTRL_GREEN_COUNT_MODE                     BIT(2)
+#define ANA_AGENCTRL_YELLOW_COUNT_MODE                    BIT(1)
+#define ANA_AGENCTRL_RED_COUNT_MODE                       BIT(0)
+
+#define ANA_FLOODING_RSZ                                  0x4
+
+#define ANA_FLOODING_FLD_UNICAST(x)                       (((x) << 12) & GENMASK(17, 12))
+#define ANA_FLOODING_FLD_UNICAST_M                        GENMASK(17, 12)
+#define ANA_FLOODING_FLD_UNICAST_X(x)                     (((x) & GENMASK(17, 12)) >> 12)
+#define ANA_FLOODING_FLD_BROADCAST(x)                     (((x) << 6) & GENMASK(11, 6))
+#define ANA_FLOODING_FLD_BROADCAST_M                      GENMASK(11, 6)
+#define ANA_FLOODING_FLD_BROADCAST_X(x)                   (((x) & GENMASK(11, 6)) >> 6)
+#define ANA_FLOODING_FLD_MULTICAST(x)                     ((x) & GENMASK(5, 0))
+#define ANA_FLOODING_FLD_MULTICAST_M                      GENMASK(5, 0)
+
+#define ANA_FLOODING_IPMC_FLD_MC4_CTRL(x)                 (((x) << 18) & GENMASK(23, 18))
+#define ANA_FLOODING_IPMC_FLD_MC4_CTRL_M                  GENMASK(23, 18)
+#define ANA_FLOODING_IPMC_FLD_MC4_CTRL_X(x)               (((x) & GENMASK(23, 18)) >> 18)
+#define ANA_FLOODING_IPMC_FLD_MC4_DATA(x)                 (((x) << 12) & GENMASK(17, 12))
+#define ANA_FLOODING_IPMC_FLD_MC4_DATA_M                  GENMASK(17, 12)
+#define ANA_FLOODING_IPMC_FLD_MC4_DATA_X(x)               (((x) & GENMASK(17, 12)) >> 12)
+#define ANA_FLOODING_IPMC_FLD_MC6_CTRL(x)                 (((x) << 6) & GENMASK(11, 6))
+#define ANA_FLOODING_IPMC_FLD_MC6_CTRL_M                  GENMASK(11, 6)
+#define ANA_FLOODING_IPMC_FLD_MC6_CTRL_X(x)               (((x) & GENMASK(11, 6)) >> 6)
+#define ANA_FLOODING_IPMC_FLD_MC6_DATA(x)                 ((x) & GENMASK(5, 0))
+#define ANA_FLOODING_IPMC_FLD_MC6_DATA_M                  GENMASK(5, 0)
+
+#define ANA_SFLOW_CFG_RSZ                                 0x4
+
+#define ANA_SFLOW_CFG_SF_RATE(x)                          (((x) << 2) & GENMASK(13, 2))
+#define ANA_SFLOW_CFG_SF_RATE_M                           GENMASK(13, 2)
+#define ANA_SFLOW_CFG_SF_RATE_X(x)                        (((x) & GENMASK(13, 2)) >> 2)
+#define ANA_SFLOW_CFG_SF_SAMPLE_RX                        BIT(1)
+#define ANA_SFLOW_CFG_SF_SAMPLE_TX                        BIT(0)
+
+#define ANA_PORT_MODE_RSZ                                 0x4
+
+#define ANA_PORT_MODE_REDTAG_PARSE_CFG                    BIT(3)
+#define ANA_PORT_MODE_VLAN_PARSE_CFG(x)                   (((x) << 1) & GENMASK(2, 1))
+#define ANA_PORT_MODE_VLAN_PARSE_CFG_M                    GENMASK(2, 1)
+#define ANA_PORT_MODE_VLAN_PARSE_CFG_X(x)                 (((x) & GENMASK(2, 1)) >> 1)
+#define ANA_PORT_MODE_L3_PARSE_CFG                        BIT(0)
+
+#define ANA_CUT_THRU_CFG_RSZ                              0x4
+
+#define ANA_PGID_PGID_RSZ                                 0x4
+
+#define ANA_PGID_PGID_PGID(x)                             ((x) & GENMASK(11, 0))
+#define ANA_PGID_PGID_PGID_M                              GENMASK(11, 0)
+#define ANA_PGID_PGID_CPUQ_DST_PGID(x)                    (((x) << 27) & GENMASK(29, 27))
+#define ANA_PGID_PGID_CPUQ_DST_PGID_M                     GENMASK(29, 27)
+#define ANA_PGID_PGID_CPUQ_DST_PGID_X(x)                  (((x) & GENMASK(29, 27)) >> 27)
+
+#define ANA_TABLES_MACHDATA_VID(x)                        (((x) << 16) & GENMASK(28, 16))
+#define ANA_TABLES_MACHDATA_VID_M                         GENMASK(28, 16)
+#define ANA_TABLES_MACHDATA_VID_X(x)                      (((x) & GENMASK(28, 16)) >> 16)
+#define ANA_TABLES_MACHDATA_MACHDATA(x)                   ((x) & GENMASK(15, 0))
+#define ANA_TABLES_MACHDATA_MACHDATA_M                    GENMASK(15, 0)
+
+#define ANA_TABLES_STREAMDATA_SSID_VALID                  BIT(16)
+#define ANA_TABLES_STREAMDATA_SSID(x)                     (((x) << 9) & GENMASK(15, 9))
+#define ANA_TABLES_STREAMDATA_SSID_M                      GENMASK(15, 9)
+#define ANA_TABLES_STREAMDATA_SSID_X(x)                   (((x) & GENMASK(15, 9)) >> 9)
+#define ANA_TABLES_STREAMDATA_SFID_VALID                  BIT(8)
+#define ANA_TABLES_STREAMDATA_SFID(x)                     ((x) & GENMASK(7, 0))
+#define ANA_TABLES_STREAMDATA_SFID_M                      GENMASK(7, 0)
+
+#define ANA_TABLES_MACACCESS_MAC_CPU_COPY                 BIT(15)
+#define ANA_TABLES_MACACCESS_SRC_KILL                     BIT(14)
+#define ANA_TABLES_MACACCESS_IGNORE_VLAN                  BIT(13)
+#define ANA_TABLES_MACACCESS_AGED_FLAG                    BIT(12)
+#define ANA_TABLES_MACACCESS_VALID                        BIT(11)
+#define ANA_TABLES_MACACCESS_ENTRYTYPE(x)                 (((x) << 9) & GENMASK(10, 9))
+#define ANA_TABLES_MACACCESS_ENTRYTYPE_M                  GENMASK(10, 9)
+#define ANA_TABLES_MACACCESS_ENTRYTYPE_X(x)               (((x) & GENMASK(10, 9)) >> 9)
+#define ANA_TABLES_MACACCESS_DEST_IDX(x)                  (((x) << 3) & GENMASK(8, 3))
+#define ANA_TABLES_MACACCESS_DEST_IDX_M                   GENMASK(8, 3)
+#define ANA_TABLES_MACACCESS_DEST_IDX_X(x)                (((x) & GENMASK(8, 3)) >> 3)
+#define ANA_TABLES_MACACCESS_MAC_TABLE_CMD(x)             ((x) & GENMASK(2, 0))
+#define ANA_TABLES_MACACCESS_MAC_TABLE_CMD_M              GENMASK(2, 0)
+#define MACACCESS_CMD_IDLE                     0
+#define MACACCESS_CMD_LEARN                    1
+#define MACACCESS_CMD_FORGET                   2
+#define MACACCESS_CMD_AGE                      3
+#define MACACCESS_CMD_GET_NEXT                 4
+#define MACACCESS_CMD_INIT                     5
+#define MACACCESS_CMD_READ                     6
+#define MACACCESS_CMD_WRITE                    7
+
+#define ANA_TABLES_VLANACCESS_VLAN_PORT_MASK(x)           (((x) << 2) & GENMASK(13, 2))
+#define ANA_TABLES_VLANACCESS_VLAN_PORT_MASK_M            GENMASK(13, 2)
+#define ANA_TABLES_VLANACCESS_VLAN_PORT_MASK_X(x)         (((x) & GENMASK(13, 2)) >> 2)
+#define ANA_TABLES_VLANACCESS_VLAN_TBL_CMD(x)             ((x) & GENMASK(1, 0))
+#define ANA_TABLES_VLANACCESS_VLAN_TBL_CMD_M              GENMASK(1, 0)
+#define ANA_TABLES_VLANACCESS_CMD_IDLE                    0x0
+#define ANA_TABLES_VLANACCESS_CMD_WRITE                   0x2
+#define ANA_TABLES_VLANACCESS_CMD_INIT                    0x3
+
+#define ANA_TABLES_VLANTIDX_VLAN_SEC_FWD_ENA              BIT(17)
+#define ANA_TABLES_VLANTIDX_VLAN_FLOOD_DIS                BIT(16)
+#define ANA_TABLES_VLANTIDX_VLAN_PRIV_VLAN                BIT(15)
+#define ANA_TABLES_VLANTIDX_VLAN_LEARN_DISABLED           BIT(14)
+#define ANA_TABLES_VLANTIDX_VLAN_MIRROR                   BIT(13)
+#define ANA_TABLES_VLANTIDX_VLAN_SRC_CHK                  BIT(12)
+#define ANA_TABLES_VLANTIDX_V_INDEX(x)                    ((x) & GENMASK(11, 0))
+#define ANA_TABLES_VLANTIDX_V_INDEX_M                     GENMASK(11, 0)
+
+#define ANA_TABLES_ISDXACCESS_ISDX_PORT_MASK(x)           (((x) << 2) & GENMASK(8, 2))
+#define ANA_TABLES_ISDXACCESS_ISDX_PORT_MASK_M            GENMASK(8, 2)
+#define ANA_TABLES_ISDXACCESS_ISDX_PORT_MASK_X(x)         (((x) & GENMASK(8, 2)) >> 2)
+#define ANA_TABLES_ISDXACCESS_ISDX_TBL_CMD(x)             ((x) & GENMASK(1, 0))
+#define ANA_TABLES_ISDXACCESS_ISDX_TBL_CMD_M              GENMASK(1, 0)
+
+#define ANA_TABLES_ISDXTIDX_ISDX_SDLBI(x)                 (((x) << 21) & GENMASK(28, 21))
+#define ANA_TABLES_ISDXTIDX_ISDX_SDLBI_M                  GENMASK(28, 21)
+#define ANA_TABLES_ISDXTIDX_ISDX_SDLBI_X(x)               (((x) & GENMASK(28, 21)) >> 21)
+#define ANA_TABLES_ISDXTIDX_ISDX_MSTI(x)                  (((x) << 15) & GENMASK(20, 15))
+#define ANA_TABLES_ISDXTIDX_ISDX_MSTI_M                   GENMASK(20, 15)
+#define ANA_TABLES_ISDXTIDX_ISDX_MSTI_X(x)                (((x) & GENMASK(20, 15)) >> 15)
+#define ANA_TABLES_ISDXTIDX_ISDX_ES0_KEY_ENA              BIT(14)
+#define ANA_TABLES_ISDXTIDX_ISDX_FORCE_ENA                BIT(10)
+#define ANA_TABLES_ISDXTIDX_ISDX_INDEX(x)                 ((x) & GENMASK(7, 0))
+#define ANA_TABLES_ISDXTIDX_ISDX_INDEX_M                  GENMASK(7, 0)
+
+#define ANA_TABLES_ENTRYLIM_RSZ                           0x4
+
+#define ANA_TABLES_ENTRYLIM_ENTRYLIM(x)                   (((x) << 14) & GENMASK(17, 14))
+#define ANA_TABLES_ENTRYLIM_ENTRYLIM_M                    GENMASK(17, 14)
+#define ANA_TABLES_ENTRYLIM_ENTRYLIM_X(x)                 (((x) & GENMASK(17, 14)) >> 14)
+#define ANA_TABLES_ENTRYLIM_ENTRYSTAT(x)                  ((x) & GENMASK(13, 0))
+#define ANA_TABLES_ENTRYLIM_ENTRYSTAT_M                   GENMASK(13, 0)
+
+#define ANA_TABLES_STREAMACCESS_GEN_REC_SEQ_NUM(x)        (((x) << 4) & GENMASK(31, 4))
+#define ANA_TABLES_STREAMACCESS_GEN_REC_SEQ_NUM_M         GENMASK(31, 4)
+#define ANA_TABLES_STREAMACCESS_GEN_REC_SEQ_NUM_X(x)      (((x) & GENMASK(31, 4)) >> 4)
+#define ANA_TABLES_STREAMACCESS_SEQ_GEN_REC_ENA           BIT(3)
+#define ANA_TABLES_STREAMACCESS_GEN_REC_TYPE              BIT(2)
+#define ANA_TABLES_STREAMACCESS_STREAM_TBL_CMD(x)         ((x) & GENMASK(1, 0))
+#define ANA_TABLES_STREAMACCESS_STREAM_TBL_CMD_M          GENMASK(1, 0)
+
+#define ANA_TABLES_STREAMTIDX_SEQ_GEN_ERR_STATUS(x)       (((x) << 30) & GENMASK(31, 30))
+#define ANA_TABLES_STREAMTIDX_SEQ_GEN_ERR_STATUS_M        GENMASK(31, 30)
+#define ANA_TABLES_STREAMTIDX_SEQ_GEN_ERR_STATUS_X(x)     (((x) & GENMASK(31, 30)) >> 30)
+#define ANA_TABLES_STREAMTIDX_S_INDEX(x)                  (((x) << 16) & GENMASK(22, 16))
+#define ANA_TABLES_STREAMTIDX_S_INDEX_M                   GENMASK(22, 16)
+#define ANA_TABLES_STREAMTIDX_S_INDEX_X(x)                (((x) & GENMASK(22, 16)) >> 16)
+#define ANA_TABLES_STREAMTIDX_FORCE_SF_BEHAVIOUR          BIT(14)
+#define ANA_TABLES_STREAMTIDX_SEQ_HISTORY_LEN(x)          (((x) << 8) & GENMASK(13, 8))
+#define ANA_TABLES_STREAMTIDX_SEQ_HISTORY_LEN_M           GENMASK(13, 8)
+#define ANA_TABLES_STREAMTIDX_SEQ_HISTORY_LEN_X(x)        (((x) & GENMASK(13, 8)) >> 8)
+#define ANA_TABLES_STREAMTIDX_RESET_ON_ROGUE              BIT(7)
+#define ANA_TABLES_STREAMTIDX_REDTAG_POP                  BIT(6)
+#define ANA_TABLES_STREAMTIDX_STREAM_SPLIT                BIT(5)
+#define ANA_TABLES_STREAMTIDX_SEQ_SPACE_LOG2(x)           ((x) & GENMASK(4, 0))
+#define ANA_TABLES_STREAMTIDX_SEQ_SPACE_LOG2_M            GENMASK(4, 0)
+
+#define ANA_TABLES_SEQ_MASK_SPLIT_MASK(x)                 (((x) << 16) & GENMASK(22, 16))
+#define ANA_TABLES_SEQ_MASK_SPLIT_MASK_M                  GENMASK(22, 16)
+#define ANA_TABLES_SEQ_MASK_SPLIT_MASK_X(x)               (((x) & GENMASK(22, 16)) >> 16)
+#define ANA_TABLES_SEQ_MASK_INPUT_PORT_MASK(x)            ((x) & GENMASK(6, 0))
+#define ANA_TABLES_SEQ_MASK_INPUT_PORT_MASK_M             GENMASK(6, 0)
+
+#define ANA_TABLES_SFID_MASK_IGR_PORT_MASK(x)             (((x) << 1) & GENMASK(7, 1))
+#define ANA_TABLES_SFID_MASK_IGR_PORT_MASK_M              GENMASK(7, 1)
+#define ANA_TABLES_SFID_MASK_IGR_PORT_MASK_X(x)           (((x) & GENMASK(7, 1)) >> 1)
+#define ANA_TABLES_SFID_MASK_IGR_SRCPORT_MATCH_ENA        BIT(0)
+
+#define ANA_TABLES_SFIDACCESS_IGR_PRIO_MATCH_ENA          BIT(22)
+#define ANA_TABLES_SFIDACCESS_IGR_PRIO(x)                 (((x) << 19) & GENMASK(21, 19))
+#define ANA_TABLES_SFIDACCESS_IGR_PRIO_M                  GENMASK(21, 19)
+#define ANA_TABLES_SFIDACCESS_IGR_PRIO_X(x)               (((x) & GENMASK(21, 19)) >> 19)
+#define ANA_TABLES_SFIDACCESS_FORCE_BLOCK                 BIT(18)
+#define ANA_TABLES_SFIDACCESS_MAX_SDU_LEN(x)              (((x) << 2) & GENMASK(17, 2))
+#define ANA_TABLES_SFIDACCESS_MAX_SDU_LEN_M               GENMASK(17, 2)
+#define ANA_TABLES_SFIDACCESS_MAX_SDU_LEN_X(x)            (((x) & GENMASK(17, 2)) >> 2)
+#define ANA_TABLES_SFIDACCESS_SFID_TBL_CMD(x)             ((x) & GENMASK(1, 0))
+#define ANA_TABLES_SFIDACCESS_SFID_TBL_CMD_M              GENMASK(1, 0)
+
+#define ANA_TABLES_SFIDTIDX_SGID_VALID                    BIT(26)
+#define ANA_TABLES_SFIDTIDX_SGID(x)                       (((x) << 18) & GENMASK(25, 18))
+#define ANA_TABLES_SFIDTIDX_SGID_M                        GENMASK(25, 18)
+#define ANA_TABLES_SFIDTIDX_SGID_X(x)                     (((x) & GENMASK(25, 18)) >> 18)
+#define ANA_TABLES_SFIDTIDX_POL_ENA                       BIT(17)
+#define ANA_TABLES_SFIDTIDX_POL_IDX(x)                    (((x) << 8) & GENMASK(16, 8))
+#define ANA_TABLES_SFIDTIDX_POL_IDX_M                     GENMASK(16, 8)
+#define ANA_TABLES_SFIDTIDX_POL_IDX_X(x)                  (((x) & GENMASK(16, 8)) >> 8)
+#define ANA_TABLES_SFIDTIDX_SFID_INDEX(x)                 ((x) & GENMASK(7, 0))
+#define ANA_TABLES_SFIDTIDX_SFID_INDEX_M                  GENMASK(7, 0)
+
+#define ANA_MSTI_STATE_RSZ                                0x4
+
+#define ANA_OAM_UPM_LM_CNT_RSZ                            0x4
+
+#define ANA_SG_ACCESS_CTRL_SGID(x)                        ((x) & GENMASK(7, 0))
+#define ANA_SG_ACCESS_CTRL_SGID_M                         GENMASK(7, 0)
+#define ANA_SG_ACCESS_CTRL_CONFIG_CHANGE                  BIT(28)
+
+#define ANA_SG_CONFIG_REG_3_BASE_TIME_SEC_MSB(x)          ((x) & GENMASK(15, 0))
+#define ANA_SG_CONFIG_REG_3_BASE_TIME_SEC_MSB_M           GENMASK(15, 0)
+#define ANA_SG_CONFIG_REG_3_LIST_LENGTH(x)                (((x) << 16) & GENMASK(18, 16))
+#define ANA_SG_CONFIG_REG_3_LIST_LENGTH_M                 GENMASK(18, 16)
+#define ANA_SG_CONFIG_REG_3_LIST_LENGTH_X(x)              (((x) & GENMASK(18, 16)) >> 16)
+#define ANA_SG_CONFIG_REG_3_GATE_ENABLE                   BIT(20)
+#define ANA_SG_CONFIG_REG_3_INIT_IPS(x)                   (((x) << 24) & GENMASK(27, 24))
+#define ANA_SG_CONFIG_REG_3_INIT_IPS_M                    GENMASK(27, 24)
+#define ANA_SG_CONFIG_REG_3_INIT_IPS_X(x)                 (((x) & GENMASK(27, 24)) >> 24)
+#define ANA_SG_CONFIG_REG_3_INIT_GATE_STATE               BIT(28)
+
+#define ANA_SG_GCL_GS_CONFIG_RSZ                          0x4
+
+#define ANA_SG_GCL_GS_CONFIG_IPS(x)                       ((x) & GENMASK(3, 0))
+#define ANA_SG_GCL_GS_CONFIG_IPS_M                        GENMASK(3, 0)
+#define ANA_SG_GCL_GS_CONFIG_GATE_STATE                   BIT(4)
+
+#define ANA_SG_GCL_TI_CONFIG_RSZ                          0x4
+
+#define ANA_SG_STATUS_REG_3_CFG_CHG_TIME_SEC_MSB(x)       ((x) & GENMASK(15, 0))
+#define ANA_SG_STATUS_REG_3_CFG_CHG_TIME_SEC_MSB_M        GENMASK(15, 0)
+#define ANA_SG_STATUS_REG_3_GATE_STATE                    BIT(16)
+#define ANA_SG_STATUS_REG_3_IPS(x)                        (((x) << 20) & GENMASK(23, 20))
+#define ANA_SG_STATUS_REG_3_IPS_M                         GENMASK(23, 20)
+#define ANA_SG_STATUS_REG_3_IPS_X(x)                      (((x) & GENMASK(23, 20)) >> 20)
+#define ANA_SG_STATUS_REG_3_CONFIG_PENDING                BIT(24)
+
+#define ANA_PORT_VLAN_CFG_GSZ                             0x100
+
+#define ANA_PORT_VLAN_CFG_VLAN_VID_AS_ISDX                BIT(21)
+#define ANA_PORT_VLAN_CFG_VLAN_AWARE_ENA                  BIT(20)
+#define ANA_PORT_VLAN_CFG_VLAN_POP_CNT(x)                 (((x) << 18) & GENMASK(19, 18))
+#define ANA_PORT_VLAN_CFG_VLAN_POP_CNT_M                  GENMASK(19, 18)
+#define ANA_PORT_VLAN_CFG_VLAN_POP_CNT_X(x)               (((x) & GENMASK(19, 18)) >> 18)
+#define ANA_PORT_VLAN_CFG_VLAN_INNER_TAG_ENA              BIT(17)
+#define ANA_PORT_VLAN_CFG_VLAN_TAG_TYPE                   BIT(16)
+#define ANA_PORT_VLAN_CFG_VLAN_DEI                        BIT(15)
+#define ANA_PORT_VLAN_CFG_VLAN_PCP(x)                     (((x) << 12) & GENMASK(14, 12))
+#define ANA_PORT_VLAN_CFG_VLAN_PCP_M                      GENMASK(14, 12)
+#define ANA_PORT_VLAN_CFG_VLAN_PCP_X(x)                   (((x) & GENMASK(14, 12)) >> 12)
+#define ANA_PORT_VLAN_CFG_VLAN_VID(x)                     ((x) & GENMASK(11, 0))
+#define ANA_PORT_VLAN_CFG_VLAN_VID_M                      GENMASK(11, 0)
+
+#define ANA_PORT_DROP_CFG_GSZ                             0x100
+
+#define ANA_PORT_DROP_CFG_DROP_UNTAGGED_ENA               BIT(6)
+#define ANA_PORT_DROP_CFG_DROP_S_TAGGED_ENA               BIT(5)
+#define ANA_PORT_DROP_CFG_DROP_C_TAGGED_ENA               BIT(4)
+#define ANA_PORT_DROP_CFG_DROP_PRIO_S_TAGGED_ENA          BIT(3)
+#define ANA_PORT_DROP_CFG_DROP_PRIO_C_TAGGED_ENA          BIT(2)
+#define ANA_PORT_DROP_CFG_DROP_NULL_MAC_ENA               BIT(1)
+#define ANA_PORT_DROP_CFG_DROP_MC_SMAC_ENA                BIT(0)
+
+#define ANA_PORT_QOS_CFG_GSZ                              0x100
+
+#define ANA_PORT_QOS_CFG_DP_DEFAULT_VAL                   BIT(8)
+#define ANA_PORT_QOS_CFG_QOS_DEFAULT_VAL(x)               (((x) << 5) & GENMASK(7, 5))
+#define ANA_PORT_QOS_CFG_QOS_DEFAULT_VAL_M                GENMASK(7, 5)
+#define ANA_PORT_QOS_CFG_QOS_DEFAULT_VAL_X(x)             (((x) & GENMASK(7, 5)) >> 5)
+#define ANA_PORT_QOS_CFG_QOS_DSCP_ENA                     BIT(4)
+#define ANA_PORT_QOS_CFG_QOS_PCP_ENA                      BIT(3)
+#define ANA_PORT_QOS_CFG_DSCP_TRANSLATE_ENA               BIT(2)
+#define ANA_PORT_QOS_CFG_DSCP_REWR_CFG(x)                 ((x) & GENMASK(1, 0))
+#define ANA_PORT_QOS_CFG_DSCP_REWR_CFG_M                  GENMASK(1, 0)
+
+#define ANA_PORT_VCAP_CFG_GSZ                             0x100
+
+#define ANA_PORT_VCAP_CFG_S1_ENA                          BIT(14)
+#define ANA_PORT_VCAP_CFG_S1_DMAC_DIP_ENA(x)              (((x) << 11) & GENMASK(13, 11))
+#define ANA_PORT_VCAP_CFG_S1_DMAC_DIP_ENA_M               GENMASK(13, 11)
+#define ANA_PORT_VCAP_CFG_S1_DMAC_DIP_ENA_X(x)            (((x) & GENMASK(13, 11)) >> 11)
+#define ANA_PORT_VCAP_CFG_S1_VLAN_INNER_TAG_ENA(x)        (((x) << 8) & GENMASK(10, 8))
+#define ANA_PORT_VCAP_CFG_S1_VLAN_INNER_TAG_ENA_M         GENMASK(10, 8)
+#define ANA_PORT_VCAP_CFG_S1_VLAN_INNER_TAG_ENA_X(x)      (((x) & GENMASK(10, 8)) >> 8)
+#define ANA_PORT_VCAP_CFG_PAG_VAL(x)                      ((x) & GENMASK(7, 0))
+#define ANA_PORT_VCAP_CFG_PAG_VAL_M                       GENMASK(7, 0)
+
+#define ANA_PORT_VCAP_S1_KEY_CFG_GSZ                      0x100
+#define ANA_PORT_VCAP_S1_KEY_CFG_RSZ                      0x4
+
+#define ANA_PORT_VCAP_S1_KEY_CFG_S1_KEY_IP6_CFG(x)        (((x) << 4) & GENMASK(6, 4))
+#define ANA_PORT_VCAP_S1_KEY_CFG_S1_KEY_IP6_CFG_M         GENMASK(6, 4)
+#define ANA_PORT_VCAP_S1_KEY_CFG_S1_KEY_IP6_CFG_X(x)      (((x) & GENMASK(6, 4)) >> 4)
+#define ANA_PORT_VCAP_S1_KEY_CFG_S1_KEY_IP4_CFG(x)        (((x) << 2) & GENMASK(3, 2))
+#define ANA_PORT_VCAP_S1_KEY_CFG_S1_KEY_IP4_CFG_M         GENMASK(3, 2)
+#define ANA_PORT_VCAP_S1_KEY_CFG_S1_KEY_IP4_CFG_X(x)      (((x) & GENMASK(3, 2)) >> 2)
+#define ANA_PORT_VCAP_S1_KEY_CFG_S1_KEY_OTHER_CFG(x)      ((x) & GENMASK(1, 0))
+#define ANA_PORT_VCAP_S1_KEY_CFG_S1_KEY_OTHER_CFG_M       GENMASK(1, 0)
+
+#define ANA_PORT_VCAP_S2_CFG_GSZ                          0x100
+
+#define ANA_PORT_VCAP_S2_CFG_S2_UDP_PAYLOAD_ENA(x)        (((x) << 17) & GENMASK(18, 17))
+#define ANA_PORT_VCAP_S2_CFG_S2_UDP_PAYLOAD_ENA_M         GENMASK(18, 17)
+#define ANA_PORT_VCAP_S2_CFG_S2_UDP_PAYLOAD_ENA_X(x)      (((x) & GENMASK(18, 17)) >> 17)
+#define ANA_PORT_VCAP_S2_CFG_S2_ETYPE_PAYLOAD_ENA(x)      (((x) << 15) & GENMASK(16, 15))
+#define ANA_PORT_VCAP_S2_CFG_S2_ETYPE_PAYLOAD_ENA_M       GENMASK(16, 15)
+#define ANA_PORT_VCAP_S2_CFG_S2_ETYPE_PAYLOAD_ENA_X(x)    (((x) & GENMASK(16, 15)) >> 15)
+#define ANA_PORT_VCAP_S2_CFG_S2_ENA                       BIT(14)
+#define ANA_PORT_VCAP_S2_CFG_S2_SNAP_DIS(x)               (((x) << 12) & GENMASK(13, 12))
+#define ANA_PORT_VCAP_S2_CFG_S2_SNAP_DIS_M                GENMASK(13, 12)
+#define ANA_PORT_VCAP_S2_CFG_S2_SNAP_DIS_X(x)             (((x) & GENMASK(13, 12)) >> 12)
+#define ANA_PORT_VCAP_S2_CFG_S2_ARP_DIS(x)                (((x) << 10) & GENMASK(11, 10))
+#define ANA_PORT_VCAP_S2_CFG_S2_ARP_DIS_M                 GENMASK(11, 10)
+#define ANA_PORT_VCAP_S2_CFG_S2_ARP_DIS_X(x)              (((x) & GENMASK(11, 10)) >> 10)
+#define ANA_PORT_VCAP_S2_CFG_S2_IP_TCPUDP_DIS(x)          (((x) << 8) & GENMASK(9, 8))
+#define ANA_PORT_VCAP_S2_CFG_S2_IP_TCPUDP_DIS_M           GENMASK(9, 8)
+#define ANA_PORT_VCAP_S2_CFG_S2_IP_TCPUDP_DIS_X(x)        (((x) & GENMASK(9, 8)) >> 8)
+#define ANA_PORT_VCAP_S2_CFG_S2_IP_OTHER_DIS(x)           (((x) << 6) & GENMASK(7, 6))
+#define ANA_PORT_VCAP_S2_CFG_S2_IP_OTHER_DIS_M            GENMASK(7, 6)
+#define ANA_PORT_VCAP_S2_CFG_S2_IP_OTHER_DIS_X(x)         (((x) & GENMASK(7, 6)) >> 6)
+#define ANA_PORT_VCAP_S2_CFG_S2_IP6_CFG(x)                (((x) << 2) & GENMASK(5, 2))
+#define ANA_PORT_VCAP_S2_CFG_S2_IP6_CFG_M                 GENMASK(5, 2)
+#define ANA_PORT_VCAP_S2_CFG_S2_IP6_CFG_X(x)              (((x) & GENMASK(5, 2)) >> 2)
+#define ANA_PORT_VCAP_S2_CFG_S2_OAM_DIS(x)                ((x) & GENMASK(1, 0))
+#define ANA_PORT_VCAP_S2_CFG_S2_OAM_DIS_M                 GENMASK(1, 0)
+
+#define ANA_PORT_PCP_DEI_MAP_GSZ                          0x100
+#define ANA_PORT_PCP_DEI_MAP_RSZ                          0x4
+
+#define ANA_PORT_PCP_DEI_MAP_DP_PCP_DEI_VAL               BIT(3)
+#define ANA_PORT_PCP_DEI_MAP_QOS_PCP_DEI_VAL(x)           ((x) & GENMASK(2, 0))
+#define ANA_PORT_PCP_DEI_MAP_QOS_PCP_DEI_VAL_M            GENMASK(2, 0)
+
+#define ANA_PORT_CPU_FWD_CFG_GSZ                          0x100
+
+#define ANA_PORT_CPU_FWD_CFG_CPU_VRAP_REDIR_ENA           BIT(7)
+#define ANA_PORT_CPU_FWD_CFG_CPU_MLD_REDIR_ENA            BIT(6)
+#define ANA_PORT_CPU_FWD_CFG_CPU_IGMP_REDIR_ENA           BIT(5)
+#define ANA_PORT_CPU_FWD_CFG_CPU_IPMC_CTRL_COPY_ENA       BIT(4)
+#define ANA_PORT_CPU_FWD_CFG_CPU_SRC_COPY_ENA             BIT(3)
+#define ANA_PORT_CPU_FWD_CFG_CPU_ALLBRIDGE_DROP_ENA       BIT(2)
+#define ANA_PORT_CPU_FWD_CFG_CPU_ALLBRIDGE_REDIR_ENA      BIT(1)
+#define ANA_PORT_CPU_FWD_CFG_CPU_OAM_ENA                  BIT(0)
+
+#define ANA_PORT_CPU_FWD_BPDU_CFG_GSZ                     0x100
+
+#define ANA_PORT_CPU_FWD_BPDU_CFG_BPDU_DROP_ENA(x)        (((x) << 16) & GENMASK(31, 16))
+#define ANA_PORT_CPU_FWD_BPDU_CFG_BPDU_DROP_ENA_M         GENMASK(31, 16)
+#define ANA_PORT_CPU_FWD_BPDU_CFG_BPDU_DROP_ENA_X(x)      (((x) & GENMASK(31, 16)) >> 16)
+#define ANA_PORT_CPU_FWD_BPDU_CFG_BPDU_REDIR_ENA(x)       ((x) & GENMASK(15, 0))
+#define ANA_PORT_CPU_FWD_BPDU_CFG_BPDU_REDIR_ENA_M        GENMASK(15, 0)
+
+#define ANA_PORT_CPU_FWD_GARP_CFG_GSZ                     0x100
+
+#define ANA_PORT_CPU_FWD_GARP_CFG_GARP_DROP_ENA(x)        (((x) << 16) & GENMASK(31, 16))
+#define ANA_PORT_CPU_FWD_GARP_CFG_GARP_DROP_ENA_M         GENMASK(31, 16)
+#define ANA_PORT_CPU_FWD_GARP_CFG_GARP_DROP_ENA_X(x)      (((x) & GENMASK(31, 16)) >> 16)
+#define ANA_PORT_CPU_FWD_GARP_CFG_GARP_REDIR_ENA(x)       ((x) & GENMASK(15, 0))
+#define ANA_PORT_CPU_FWD_GARP_CFG_GARP_REDIR_ENA_M        GENMASK(15, 0)
+
+#define ANA_PORT_CPU_FWD_CCM_CFG_GSZ                      0x100
+
+#define ANA_PORT_CPU_FWD_CCM_CFG_CCM_DROP_ENA(x)          (((x) << 16) & GENMASK(31, 16))
+#define ANA_PORT_CPU_FWD_CCM_CFG_CCM_DROP_ENA_M           GENMASK(31, 16)
+#define ANA_PORT_CPU_FWD_CCM_CFG_CCM_DROP_ENA_X(x)        (((x) & GENMASK(31, 16)) >> 16)
+#define ANA_PORT_CPU_FWD_CCM_CFG_CCM_REDIR_ENA(x)         ((x) & GENMASK(15, 0))
+#define ANA_PORT_CPU_FWD_CCM_CFG_CCM_REDIR_ENA_M          GENMASK(15, 0)
+
+#define ANA_PORT_PORT_CFG_GSZ                             0x100
+
+#define ANA_PORT_PORT_CFG_SRC_MIRROR_ENA                  BIT(15)
+#define ANA_PORT_PORT_CFG_LIMIT_DROP                      BIT(14)
+#define ANA_PORT_PORT_CFG_LIMIT_CPU                       BIT(13)
+#define ANA_PORT_PORT_CFG_LOCKED_PORTMOVE_DROP            BIT(12)
+#define ANA_PORT_PORT_CFG_LOCKED_PORTMOVE_CPU             BIT(11)
+#define ANA_PORT_PORT_CFG_LEARNDROP                       BIT(10)
+#define ANA_PORT_PORT_CFG_LEARNCPU                        BIT(9)
+#define ANA_PORT_PORT_CFG_LEARNAUTO                       BIT(8)
+#define ANA_PORT_PORT_CFG_LEARN_ENA                       BIT(7)
+#define ANA_PORT_PORT_CFG_RECV_ENA                        BIT(6)
+#define ANA_PORT_PORT_CFG_PORTID_VAL(x)                   (((x) << 2) & GENMASK(5, 2))
+#define ANA_PORT_PORT_CFG_PORTID_VAL_M                    GENMASK(5, 2)
+#define ANA_PORT_PORT_CFG_PORTID_VAL_X(x)                 (((x) & GENMASK(5, 2)) >> 2)
+#define ANA_PORT_PORT_CFG_USE_B_DOM_TBL                   BIT(1)
+#define ANA_PORT_PORT_CFG_LSR_MODE                        BIT(0)
+
+#define ANA_PORT_POL_CFG_GSZ                              0x100
+
+#define ANA_PORT_POL_CFG_POL_CPU_REDIR_8021               BIT(19)
+#define ANA_PORT_POL_CFG_POL_CPU_REDIR_IP                 BIT(18)
+#define ANA_PORT_POL_CFG_PORT_POL_ENA                     BIT(17)
+#define ANA_PORT_POL_CFG_QUEUE_POL_ENA(x)                 (((x) << 9) & GENMASK(16, 9))
+#define ANA_PORT_POL_CFG_QUEUE_POL_ENA_M                  GENMASK(16, 9)
+#define ANA_PORT_POL_CFG_QUEUE_POL_ENA_X(x)               (((x) & GENMASK(16, 9)) >> 9)
+#define ANA_PORT_POL_CFG_POL_ORDER(x)                     ((x) & GENMASK(8, 0))
+#define ANA_PORT_POL_CFG_POL_ORDER_M                      GENMASK(8, 0)
+
+#define ANA_PORT_PTP_CFG_GSZ                              0x100
+
+#define ANA_PORT_PTP_CFG_PTP_BACKPLANE_MODE               BIT(0)
+
+#define ANA_PORT_PTP_DLY1_CFG_GSZ                         0x100
+
+#define ANA_PORT_PTP_DLY2_CFG_GSZ                         0x100
+
+#define ANA_PORT_SFID_CFG_GSZ                             0x100
+#define ANA_PORT_SFID_CFG_RSZ                             0x4
+
+#define ANA_PORT_SFID_CFG_SFID_VALID                      BIT(8)
+#define ANA_PORT_SFID_CFG_SFID(x)                         ((x) & GENMASK(7, 0))
+#define ANA_PORT_SFID_CFG_SFID_M                          GENMASK(7, 0)
+
+#define ANA_PFC_PFC_CFG_GSZ                               0x40
+
+#define ANA_PFC_PFC_CFG_RX_PFC_ENA(x)                     (((x) << 2) & GENMASK(9, 2))
+#define ANA_PFC_PFC_CFG_RX_PFC_ENA_M                      GENMASK(9, 2)
+#define ANA_PFC_PFC_CFG_RX_PFC_ENA_X(x)                   (((x) & GENMASK(9, 2)) >> 2)
+#define ANA_PFC_PFC_CFG_FC_LINK_SPEED(x)                  ((x) & GENMASK(1, 0))
+#define ANA_PFC_PFC_CFG_FC_LINK_SPEED_M                   GENMASK(1, 0)
+
+#define ANA_PFC_PFC_TIMER_GSZ                             0x40
+#define ANA_PFC_PFC_TIMER_RSZ                             0x4
+
+#define ANA_IPT_OAM_MEP_CFG_GSZ                           0x8
+
+#define ANA_IPT_OAM_MEP_CFG_MEP_IDX_P(x)                  (((x) << 6) & GENMASK(10, 6))
+#define ANA_IPT_OAM_MEP_CFG_MEP_IDX_P_M                   GENMASK(10, 6)
+#define ANA_IPT_OAM_MEP_CFG_MEP_IDX_P_X(x)                (((x) & GENMASK(10, 6)) >> 6)
+#define ANA_IPT_OAM_MEP_CFG_MEP_IDX(x)                    (((x) << 1) & GENMASK(5, 1))
+#define ANA_IPT_OAM_MEP_CFG_MEP_IDX_M                     GENMASK(5, 1)
+#define ANA_IPT_OAM_MEP_CFG_MEP_IDX_X(x)                  (((x) & GENMASK(5, 1)) >> 1)
+#define ANA_IPT_OAM_MEP_CFG_MEP_IDX_ENA                   BIT(0)
+
+#define ANA_IPT_IPT_GSZ                                   0x8
+
+#define ANA_IPT_IPT_IPT_CFG(x)                            (((x) << 15) & GENMASK(16, 15))
+#define ANA_IPT_IPT_IPT_CFG_M                             GENMASK(16, 15)
+#define ANA_IPT_IPT_IPT_CFG_X(x)                          (((x) & GENMASK(16, 15)) >> 15)
+#define ANA_IPT_IPT_ISDX_P(x)                             (((x) << 7) & GENMASK(14, 7))
+#define ANA_IPT_IPT_ISDX_P_M                              GENMASK(14, 7)
+#define ANA_IPT_IPT_ISDX_P_X(x)                           (((x) & GENMASK(14, 7)) >> 7)
+#define ANA_IPT_IPT_PPT_IDX(x)                            ((x) & GENMASK(6, 0))
+#define ANA_IPT_IPT_PPT_IDX_M                             GENMASK(6, 0)
+
+#define ANA_PPT_PPT_RSZ                                   0x4
+
+#define ANA_FID_MAP_FID_MAP_RSZ                           0x4
+
+#define ANA_FID_MAP_FID_MAP_FID_C_VAL(x)                  (((x) << 6) & GENMASK(11, 6))
+#define ANA_FID_MAP_FID_MAP_FID_C_VAL_M                   GENMASK(11, 6)
+#define ANA_FID_MAP_FID_MAP_FID_C_VAL_X(x)                (((x) & GENMASK(11, 6)) >> 6)
+#define ANA_FID_MAP_FID_MAP_FID_B_VAL(x)                  ((x) & GENMASK(5, 0))
+#define ANA_FID_MAP_FID_MAP_FID_B_VAL_M                   GENMASK(5, 0)
+
+#define ANA_AGGR_CFG_AC_RND_ENA                           BIT(7)
+#define ANA_AGGR_CFG_AC_DMAC_ENA                          BIT(6)
+#define ANA_AGGR_CFG_AC_SMAC_ENA                          BIT(5)
+#define ANA_AGGR_CFG_AC_IP6_FLOW_LBL_ENA                  BIT(4)
+#define ANA_AGGR_CFG_AC_IP6_TCPUDP_ENA                    BIT(3)
+#define ANA_AGGR_CFG_AC_IP4_SIPDIP_ENA                    BIT(2)
+#define ANA_AGGR_CFG_AC_IP4_TCPUDP_ENA                    BIT(1)
+#define ANA_AGGR_CFG_AC_ISDX_ENA                          BIT(0)
+
+#define ANA_CPUQ_CFG_CPUQ_MLD(x)                          (((x) << 27) & GENMASK(29, 27))
+#define ANA_CPUQ_CFG_CPUQ_MLD_M                           GENMASK(29, 27)
+#define ANA_CPUQ_CFG_CPUQ_MLD_X(x)                        (((x) & GENMASK(29, 27)) >> 27)
+#define ANA_CPUQ_CFG_CPUQ_IGMP(x)                         (((x) << 24) & GENMASK(26, 24))
+#define ANA_CPUQ_CFG_CPUQ_IGMP_M                          GENMASK(26, 24)
+#define ANA_CPUQ_CFG_CPUQ_IGMP_X(x)                       (((x) & GENMASK(26, 24)) >> 24)
+#define ANA_CPUQ_CFG_CPUQ_IPMC_CTRL(x)                    (((x) << 21) & GENMASK(23, 21))
+#define ANA_CPUQ_CFG_CPUQ_IPMC_CTRL_M                     GENMASK(23, 21)
+#define ANA_CPUQ_CFG_CPUQ_IPMC_CTRL_X(x)                  (((x) & GENMASK(23, 21)) >> 21)
+#define ANA_CPUQ_CFG_CPUQ_ALLBRIDGE(x)                    (((x) << 18) & GENMASK(20, 18))
+#define ANA_CPUQ_CFG_CPUQ_ALLBRIDGE_M                     GENMASK(20, 18)
+#define ANA_CPUQ_CFG_CPUQ_ALLBRIDGE_X(x)                  (((x) & GENMASK(20, 18)) >> 18)
+#define ANA_CPUQ_CFG_CPUQ_LOCKED_PORTMOVE(x)              (((x) << 15) & GENMASK(17, 15))
+#define ANA_CPUQ_CFG_CPUQ_LOCKED_PORTMOVE_M               GENMASK(17, 15)
+#define ANA_CPUQ_CFG_CPUQ_LOCKED_PORTMOVE_X(x)            (((x) & GENMASK(17, 15)) >> 15)
+#define ANA_CPUQ_CFG_CPUQ_SRC_COPY(x)                     (((x) << 12) & GENMASK(14, 12))
+#define ANA_CPUQ_CFG_CPUQ_SRC_COPY_M                      GENMASK(14, 12)
+#define ANA_CPUQ_CFG_CPUQ_SRC_COPY_X(x)                   (((x) & GENMASK(14, 12)) >> 12)
+#define ANA_CPUQ_CFG_CPUQ_MAC_COPY(x)                     (((x) << 9) & GENMASK(11, 9))
+#define ANA_CPUQ_CFG_CPUQ_MAC_COPY_M                      GENMASK(11, 9)
+#define ANA_CPUQ_CFG_CPUQ_MAC_COPY_X(x)                   (((x) & GENMASK(11, 9)) >> 9)
+#define ANA_CPUQ_CFG_CPUQ_LRN(x)                          (((x) << 6) & GENMASK(8, 6))
+#define ANA_CPUQ_CFG_CPUQ_LRN_M                           GENMASK(8, 6)
+#define ANA_CPUQ_CFG_CPUQ_LRN_X(x)                        (((x) & GENMASK(8, 6)) >> 6)
+#define ANA_CPUQ_CFG_CPUQ_MIRROR(x)                       (((x) << 3) & GENMASK(5, 3))
+#define ANA_CPUQ_CFG_CPUQ_MIRROR_M                        GENMASK(5, 3)
+#define ANA_CPUQ_CFG_CPUQ_MIRROR_X(x)                     (((x) & GENMASK(5, 3)) >> 3)
+#define ANA_CPUQ_CFG_CPUQ_SFLOW(x)                        ((x) & GENMASK(2, 0))
+#define ANA_CPUQ_CFG_CPUQ_SFLOW_M                         GENMASK(2, 0)
+
+#define ANA_CPUQ_8021_CFG_RSZ                             0x4
+
+#define ANA_CPUQ_8021_CFG_CPUQ_BPDU_VAL(x)                (((x) << 6) & GENMASK(8, 6))
+#define ANA_CPUQ_8021_CFG_CPUQ_BPDU_VAL_M                 GENMASK(8, 6)
+#define ANA_CPUQ_8021_CFG_CPUQ_BPDU_VAL_X(x)              (((x) & GENMASK(8, 6)) >> 6)
+#define ANA_CPUQ_8021_CFG_CPUQ_GARP_VAL(x)                (((x) << 3) & GENMASK(5, 3))
+#define ANA_CPUQ_8021_CFG_CPUQ_GARP_VAL_M                 GENMASK(5, 3)
+#define ANA_CPUQ_8021_CFG_CPUQ_GARP_VAL_X(x)              (((x) & GENMASK(5, 3)) >> 3)
+#define ANA_CPUQ_8021_CFG_CPUQ_CCM_VAL(x)                 ((x) & GENMASK(2, 0))
+#define ANA_CPUQ_8021_CFG_CPUQ_CCM_VAL_M                  GENMASK(2, 0)
+
+#define ANA_DSCP_CFG_RSZ                                  0x4
+
+#define ANA_DSCP_CFG_DP_DSCP_VAL                          BIT(11)
+#define ANA_DSCP_CFG_QOS_DSCP_VAL(x)                      (((x) << 8) & GENMASK(10, 8))
+#define ANA_DSCP_CFG_QOS_DSCP_VAL_M                       GENMASK(10, 8)
+#define ANA_DSCP_CFG_QOS_DSCP_VAL_X(x)                    (((x) & GENMASK(10, 8)) >> 8)
+#define ANA_DSCP_CFG_DSCP_TRANSLATE_VAL(x)                (((x) << 2) & GENMASK(7, 2))
+#define ANA_DSCP_CFG_DSCP_TRANSLATE_VAL_M                 GENMASK(7, 2)
+#define ANA_DSCP_CFG_DSCP_TRANSLATE_VAL_X(x)              (((x) & GENMASK(7, 2)) >> 2)
+#define ANA_DSCP_CFG_DSCP_TRUST_ENA                       BIT(1)
+#define ANA_DSCP_CFG_DSCP_REWR_ENA                        BIT(0)
+
+#define ANA_DSCP_REWR_CFG_RSZ                             0x4
+
+#define ANA_VCAP_RNG_TYPE_CFG_RSZ                         0x4
+
+#define ANA_VCAP_RNG_VAL_CFG_RSZ                          0x4
+
+#define ANA_VCAP_RNG_VAL_CFG_VCAP_RNG_MIN_VAL(x)          (((x) << 16) & GENMASK(31, 16))
+#define ANA_VCAP_RNG_VAL_CFG_VCAP_RNG_MIN_VAL_M           GENMASK(31, 16)
+#define ANA_VCAP_RNG_VAL_CFG_VCAP_RNG_MIN_VAL_X(x)        (((x) & GENMASK(31, 16)) >> 16)
+#define ANA_VCAP_RNG_VAL_CFG_VCAP_RNG_MAX_VAL(x)          ((x) & GENMASK(15, 0))
+#define ANA_VCAP_RNG_VAL_CFG_VCAP_RNG_MAX_VAL_M           GENMASK(15, 0)
+
+#define ANA_VRAP_CFG_VRAP_VLAN_AWARE_ENA                  BIT(12)
+#define ANA_VRAP_CFG_VRAP_VID(x)                          ((x) & GENMASK(11, 0))
+#define ANA_VRAP_CFG_VRAP_VID_M                           GENMASK(11, 0)
+
+#define ANA_DISCARD_CFG_DROP_TAGGING_ISDX0                BIT(3)
+#define ANA_DISCARD_CFG_DROP_CTRLPROT_ISDX0               BIT(2)
+#define ANA_DISCARD_CFG_DROP_TAGGING_S2_ENA               BIT(1)
+#define ANA_DISCARD_CFG_DROP_CTRLPROT_S2_ENA              BIT(0)
+
+#define ANA_FID_CFG_VID_MC_ENA                            BIT(0)
+
+#define ANA_POL_PIR_CFG_GSZ                               0x20
+
+#define ANA_POL_PIR_CFG_PIR_RATE(x)                       (((x) << 6) & GENMASK(20, 6))
+#define ANA_POL_PIR_CFG_PIR_RATE_M                        GENMASK(20, 6)
+#define ANA_POL_PIR_CFG_PIR_RATE_X(x)                     (((x) & GENMASK(20, 6)) >> 6)
+#define ANA_POL_PIR_CFG_PIR_BURST(x)                      ((x) & GENMASK(5, 0))
+#define ANA_POL_PIR_CFG_PIR_BURST_M                       GENMASK(5, 0)
+
+#define ANA_POL_CIR_CFG_GSZ                               0x20
+
+#define ANA_POL_CIR_CFG_CIR_RATE(x)                       (((x) << 6) & GENMASK(20, 6))
+#define ANA_POL_CIR_CFG_CIR_RATE_M                        GENMASK(20, 6)
+#define ANA_POL_CIR_CFG_CIR_RATE_X(x)                     (((x) & GENMASK(20, 6)) >> 6)
+#define ANA_POL_CIR_CFG_CIR_BURST(x)                      ((x) & GENMASK(5, 0))
+#define ANA_POL_CIR_CFG_CIR_BURST_M                       GENMASK(5, 0)
+
+#define ANA_POL_MODE_CFG_GSZ                              0x20
+
+#define ANA_POL_MODE_CFG_IPG_SIZE(x)                      (((x) << 5) & GENMASK(9, 5))
+#define ANA_POL_MODE_CFG_IPG_SIZE_M                       GENMASK(9, 5)
+#define ANA_POL_MODE_CFG_IPG_SIZE_X(x)                    (((x) & GENMASK(9, 5)) >> 5)
+#define ANA_POL_MODE_CFG_FRM_MODE(x)                      (((x) << 3) & GENMASK(4, 3))
+#define ANA_POL_MODE_CFG_FRM_MODE_M                       GENMASK(4, 3)
+#define ANA_POL_MODE_CFG_FRM_MODE_X(x)                    (((x) & GENMASK(4, 3)) >> 3)
+#define ANA_POL_MODE_CFG_DLB_COUPLED                      BIT(2)
+#define ANA_POL_MODE_CFG_CIR_ENA                          BIT(1)
+#define ANA_POL_MODE_CFG_OVERSHOOT_ENA                    BIT(0)
+
+#define ANA_POL_PIR_STATE_GSZ                             0x20
+
+#define ANA_POL_CIR_STATE_GSZ                             0x20
+
+#define ANA_POL_STATE_GSZ                                 0x20
+
+#define ANA_POL_FLOWC_RSZ                                 0x4
+
+#define ANA_POL_FLOWC_POL_FLOWC                           BIT(0)
+
+#define ANA_POL_HYST_POL_FC_HYST(x)                       (((x) << 4) & GENMASK(9, 4))
+#define ANA_POL_HYST_POL_FC_HYST_M                        GENMASK(9, 4)
+#define ANA_POL_HYST_POL_FC_HYST_X(x)                     (((x) & GENMASK(9, 4)) >> 4)
+#define ANA_POL_HYST_POL_STOP_HYST(x)                     ((x) & GENMASK(3, 0))
+#define ANA_POL_HYST_POL_STOP_HYST_M                      GENMASK(3, 0)
+
+#define ANA_POL_MISC_CFG_POL_CLOSE_ALL                    BIT(1)
+#define ANA_POL_MISC_CFG_POL_LEAK_DIS                     BIT(0)
+
+#endif
diff --git a/drivers/net/ethernet/mscc/ocelot_board.c b/drivers/net/ethernet/mscc/ocelot_board.c
new file mode 100644
index 000000000000..18df7d934e81
--- /dev/null
+++ b/drivers/net/ethernet/mscc/ocelot_board.c
@@ -0,0 +1,316 @@
+// SPDX-License-Identifier: (GPL-2.0 OR MIT)
+/*
+ * Microsemi Ocelot Switch driver
+ *
+ * Copyright (c) 2017 Microsemi Corporation
+ */
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/of_mdio.h>
+#include <linux/of_platform.h>
+#include <linux/skbuff.h>
+
+#include "ocelot.h"
+
+static int ocelot_parse_ifh(u32 *ifh, struct frame_info *info)
+{
+	int i;
+	u8 llen, wlen;
+
+	/* The IFH is in network order, switch to CPU order */
+	for (i = 0; i < IFH_LEN; i++)
+		ifh[i] = ntohl((__force __be32)ifh[i]);
+
+	wlen = (ifh[1] >> 7) & 0xff;
+	llen = (ifh[1] >> 15) & 0x3f;
+	info->len = OCELOT_BUFFER_CELL_SZ * wlen + llen - 80;
+
+	info->port = (ifh[2] & GENMASK(14, 11)) >> 11;
+
+	info->cpuq = (ifh[3] & GENMASK(27, 20)) >> 20;
+	info->tag_type = (ifh[3] & GENMASK(16, 16)) >> 16;
+	info->vid = ifh[3] & GENMASK(11, 0);
+
+	return 0;
+}
+
+static int ocelot_rx_frame_word(struct ocelot *ocelot, u8 grp, bool ifh,
+				u32 *rval)
+{
+	u32 val;
+	u32 bytes_valid;
+
+	val = ocelot_read_rix(ocelot, QS_XTR_RD, grp);
+	if (val == XTR_NOT_READY) {
+		if (ifh)
+			return -EIO;
+
+		do {
+			val = ocelot_read_rix(ocelot, QS_XTR_RD, grp);
+		} while (val == XTR_NOT_READY);
+	}
+
+	switch (val) {
+	case XTR_ABORT:
+		return -EIO;
+	case XTR_EOF_0:
+	case XTR_EOF_1:
+	case XTR_EOF_2:
+	case XTR_EOF_3:
+	case XTR_PRUNED:
+		bytes_valid = XTR_VALID_BYTES(val);
+		val = ocelot_read_rix(ocelot, QS_XTR_RD, grp);
+		if (val == XTR_ESCAPE)
+			*rval = ocelot_read_rix(ocelot, QS_XTR_RD, grp);
+		else
+			*rval = val;
+
+		return bytes_valid;
+	case XTR_ESCAPE:
+		*rval = ocelot_read_rix(ocelot, QS_XTR_RD, grp);
+
+		return 4;
+	default:
+		*rval = val;
+
+		return 4;
+	}
+}
+
+static irqreturn_t ocelot_xtr_irq_handler(int irq, void *arg)
+{
+	struct ocelot *ocelot = arg;
+	int i = 0, grp = 0;
+	int err = 0;
+
+	if (!(ocelot_read(ocelot, QS_XTR_DATA_PRESENT) & BIT(grp)))
+		return IRQ_NONE;
+
+	do {
+		struct sk_buff *skb;
+		struct net_device *dev;
+		u32 *buf;
+		int sz, len;
+		u32 ifh[4];
+		u32 val;
+		struct frame_info info;
+
+		for (i = 0; i < IFH_LEN; i++) {
+			err = ocelot_rx_frame_word(ocelot, grp, true, &ifh[i]);
+			if (err != 4)
+				break;
+		}
+
+		if (err != 4)
+			break;
+
+		ocelot_parse_ifh(ifh, &info);
+
+		dev = ocelot->ports[info.port]->dev;
+
+		skb = netdev_alloc_skb(dev, info.len);
+
+		if (unlikely(!skb)) {
+			netdev_err(dev, "Unable to allocate sk_buff\n");
+			err = -ENOMEM;
+			break;
+		}
+		buf = (u32 *)skb_put(skb, info.len);
+
+		len = 0;
+		do {
+			sz = ocelot_rx_frame_word(ocelot, grp, false, &val);
+			*buf++ = val;
+			len += sz;
+		} while ((sz == 4) && (len < info.len));
+
+		if (sz < 0) {
+			err = sz;
+			break;
+		}
+
+		/* Everything we see on an interface that is in the HW bridge
+		 * has already been forwarded.
+		 */
+		if (ocelot->bridge_mask & BIT(info.port))
+			skb->offload_fwd_mark = 1;
+
+		skb->protocol = eth_type_trans(skb, dev);
+		netif_rx(skb);
+		dev->stats.rx_bytes += len;
+		dev->stats.rx_packets++;
+	} while (ocelot_read(ocelot, QS_XTR_DATA_PRESENT) & BIT(grp));
+
+	if (err)
+		while (ocelot_read(ocelot, QS_XTR_DATA_PRESENT) & BIT(grp))
+			ocelot_read_rix(ocelot, QS_XTR_RD, grp);
+
+	return IRQ_HANDLED;
+}
+
+static const struct of_device_id mscc_ocelot_match[] = {
+	{ .compatible = "mscc,vsc7514-switch" },
+	{ }
+};
+MODULE_DEVICE_TABLE(of, mscc_ocelot_match);
+
+static int mscc_ocelot_probe(struct platform_device *pdev)
+{
+	int err, irq;
+	unsigned int i;
+	struct device_node *np = pdev->dev.of_node;
+	struct device_node *ports, *portnp;
+	struct ocelot *ocelot;
+	u32 val;
+
+	struct {
+		enum ocelot_target id;
+		char *name;
+	} res[] = {
+		{ SYS, "sys" },
+		{ REW, "rew" },
+		{ QSYS, "qsys" },
+		{ ANA, "ana" },
+		{ QS, "qs" },
+		{ HSIO, "hsio" },
+	};
+
+	if (!np && !pdev->dev.platform_data)
+		return -ENODEV;
+
+	ocelot = devm_kzalloc(&pdev->dev, sizeof(*ocelot), GFP_KERNEL);
+	if (!ocelot)
+		return -ENOMEM;
+
+	platform_set_drvdata(pdev, ocelot);
+	ocelot->dev = &pdev->dev;
+
+	for (i = 0; i < ARRAY_SIZE(res); i++) {
+		struct regmap *target;
+
+		target = ocelot_io_platform_init(ocelot, pdev, res[i].name);
+		if (IS_ERR(target))
+			return PTR_ERR(target);
+
+		ocelot->targets[res[i].id] = target;
+	}
+
+	err = ocelot_chip_init(ocelot);
+	if (err)
+		return err;
+
+	irq = platform_get_irq_byname(pdev, "xtr");
+	if (irq < 0)
+		return -ENODEV;
+
+	err = devm_request_threaded_irq(&pdev->dev, irq, NULL,
+					ocelot_xtr_irq_handler, IRQF_ONESHOT,
+					"frame extraction", ocelot);
+	if (err)
+		return err;
+
+	regmap_field_write(ocelot->regfields[SYS_RESET_CFG_MEM_INIT], 1);
+	regmap_field_write(ocelot->regfields[SYS_RESET_CFG_MEM_ENA], 1);
+
+	do {
+		msleep(1);
+		regmap_field_read(ocelot->regfields[SYS_RESET_CFG_MEM_INIT],
+				  &val);
+	} while (val);
+
+	regmap_field_write(ocelot->regfields[SYS_RESET_CFG_MEM_ENA], 1);
+	regmap_field_write(ocelot->regfields[SYS_RESET_CFG_CORE_ENA], 1);
+
+	ocelot->num_cpu_ports = 1; /* 1 port on the switch, two groups */
+
+	ports = of_get_child_by_name(np, "ethernet-ports");
+	if (!ports) {
+		dev_err(&pdev->dev, "no ethernet-ports child node found\n");
+		return -ENODEV;
+	}
+
+	ocelot->num_phys_ports = of_get_child_count(ports);
+
+	ocelot->ports = devm_kcalloc(&pdev->dev, ocelot->num_phys_ports,
+				     sizeof(struct ocelot_port *), GFP_KERNEL);
+
+	INIT_LIST_HEAD(&ocelot->multicast);
+	ocelot_init(ocelot);
+
+	ocelot_rmw(ocelot, HSIO_HW_CFG_DEV1G_4_MODE |
+		     HSIO_HW_CFG_DEV1G_6_MODE |
+		     HSIO_HW_CFG_DEV1G_9_MODE,
+		     HSIO_HW_CFG_DEV1G_4_MODE |
+		     HSIO_HW_CFG_DEV1G_6_MODE |
+		     HSIO_HW_CFG_DEV1G_9_MODE,
+		     HSIO_HW_CFG);
+
+	for_each_available_child_of_node(ports, portnp) {
+		struct device_node *phy_node;
+		struct phy_device *phy;
+		struct resource *res;
+		void __iomem *regs;
+		char res_name[8];
+		u32 port;
+
+		if (of_property_read_u32(portnp, "reg", &port))
+			continue;
+
+		snprintf(res_name, sizeof(res_name), "port%d", port);
+
+		res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
+						   res_name);
+		regs = devm_ioremap_resource(&pdev->dev, res);
+		if (IS_ERR(regs))
+			continue;
+
+		phy_node = of_parse_phandle(portnp, "phy-handle", 0);
+		if (!phy_node)
+			continue;
+
+		phy = of_phy_find_device(phy_node);
+		if (!phy)
+			continue;
+
+		err = ocelot_probe_port(ocelot, port, regs, phy);
+		if (err) {
+			dev_err(&pdev->dev, "failed to probe ports\n");
+			goto err_probe_ports;
+		}
+	}
+
+	register_netdevice_notifier(&ocelot_netdevice_nb);
+
+	dev_info(&pdev->dev, "Ocelot switch probed\n");
+
+	return 0;
+
+err_probe_ports:
+	return err;
+}
+
+static int mscc_ocelot_remove(struct platform_device *pdev)
+{
+	struct ocelot *ocelot = platform_get_drvdata(pdev);
+
+	ocelot_deinit(ocelot);
+	unregister_netdevice_notifier(&ocelot_netdevice_nb);
+
+	return 0;
+}
+
+static struct platform_driver mscc_ocelot_driver = {
+	.probe = mscc_ocelot_probe,
+	.remove = mscc_ocelot_remove,
+	.driver = {
+		.name = "ocelot-switch",
+		.of_match_table = mscc_ocelot_match,
+	},
+};
+
+module_platform_driver(mscc_ocelot_driver);
+
+MODULE_DESCRIPTION("Microsemi Ocelot switch driver");
+MODULE_AUTHOR("Alexandre Belloni <alexandre.belloni@bootlin.com>");
+MODULE_LICENSE("Dual MIT/GPL");
diff --git a/drivers/net/ethernet/mscc/ocelot_dev.h b/drivers/net/ethernet/mscc/ocelot_dev.h
new file mode 100644
index 000000000000..0a50d53bbd3f
--- /dev/null
+++ b/drivers/net/ethernet/mscc/ocelot_dev.h
@@ -0,0 +1,275 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */
+/*
+ * Microsemi Ocelot Switch driver
+ *
+ * Copyright (c) 2017 Microsemi Corporation
+ */
+
+#ifndef _MSCC_OCELOT_DEV_H_
+#define _MSCC_OCELOT_DEV_H_
+
+#define DEV_CLOCK_CFG                                     0x0
+
+#define DEV_CLOCK_CFG_MAC_TX_RST                          BIT(7)
+#define DEV_CLOCK_CFG_MAC_RX_RST                          BIT(6)
+#define DEV_CLOCK_CFG_PCS_TX_RST                          BIT(5)
+#define DEV_CLOCK_CFG_PCS_RX_RST                          BIT(4)
+#define DEV_CLOCK_CFG_PORT_RST                            BIT(3)
+#define DEV_CLOCK_CFG_PHY_RST                             BIT(2)
+#define DEV_CLOCK_CFG_LINK_SPEED(x)                       ((x) & GENMASK(1, 0))
+#define DEV_CLOCK_CFG_LINK_SPEED_M                        GENMASK(1, 0)
+
+#define DEV_PORT_MISC                                     0x4
+
+#define DEV_PORT_MISC_FWD_ERROR_ENA                       BIT(4)
+#define DEV_PORT_MISC_FWD_PAUSE_ENA                       BIT(3)
+#define DEV_PORT_MISC_FWD_CTRL_ENA                        BIT(2)
+#define DEV_PORT_MISC_DEV_LOOP_ENA                        BIT(1)
+#define DEV_PORT_MISC_HDX_FAST_DIS                        BIT(0)
+
+#define DEV_EVENTS                                        0x8
+
+#define DEV_EEE_CFG                                       0xc
+
+#define DEV_EEE_CFG_EEE_ENA                               BIT(22)
+#define DEV_EEE_CFG_EEE_TIMER_AGE(x)                      (((x) << 15) & GENMASK(21, 15))
+#define DEV_EEE_CFG_EEE_TIMER_AGE_M                       GENMASK(21, 15)
+#define DEV_EEE_CFG_EEE_TIMER_AGE_X(x)                    (((x) & GENMASK(21, 15)) >> 15)
+#define DEV_EEE_CFG_EEE_TIMER_WAKEUP(x)                   (((x) << 8) & GENMASK(14, 8))
+#define DEV_EEE_CFG_EEE_TIMER_WAKEUP_M                    GENMASK(14, 8)
+#define DEV_EEE_CFG_EEE_TIMER_WAKEUP_X(x)                 (((x) & GENMASK(14, 8)) >> 8)
+#define DEV_EEE_CFG_EEE_TIMER_HOLDOFF(x)                  (((x) << 1) & GENMASK(7, 1))
+#define DEV_EEE_CFG_EEE_TIMER_HOLDOFF_M                   GENMASK(7, 1)
+#define DEV_EEE_CFG_EEE_TIMER_HOLDOFF_X(x)                (((x) & GENMASK(7, 1)) >> 1)
+#define DEV_EEE_CFG_PORT_LPI                              BIT(0)
+
+#define DEV_RX_PATH_DELAY                                 0x10
+
+#define DEV_TX_PATH_DELAY                                 0x14
+
+#define DEV_PTP_PREDICT_CFG                               0x18
+
+#define DEV_PTP_PREDICT_CFG_PTP_PHY_PREDICT_CFG(x)        (((x) << 4) & GENMASK(11, 4))
+#define DEV_PTP_PREDICT_CFG_PTP_PHY_PREDICT_CFG_M         GENMASK(11, 4)
+#define DEV_PTP_PREDICT_CFG_PTP_PHY_PREDICT_CFG_X(x)      (((x) & GENMASK(11, 4)) >> 4)
+#define DEV_PTP_PREDICT_CFG_PTP_PHASE_PREDICT_CFG(x)      ((x) & GENMASK(3, 0))
+#define DEV_PTP_PREDICT_CFG_PTP_PHASE_PREDICT_CFG_M       GENMASK(3, 0)
+
+#define DEV_MAC_ENA_CFG                                   0x1c
+
+#define DEV_MAC_ENA_CFG_RX_ENA                            BIT(4)
+#define DEV_MAC_ENA_CFG_TX_ENA                            BIT(0)
+
+#define DEV_MAC_MODE_CFG                                  0x20
+
+#define DEV_MAC_MODE_CFG_FC_WORD_SYNC_ENA                 BIT(8)
+#define DEV_MAC_MODE_CFG_GIGA_MODE_ENA                    BIT(4)
+#define DEV_MAC_MODE_CFG_FDX_ENA                          BIT(0)
+
+#define DEV_MAC_MAXLEN_CFG                                0x24
+
+#define DEV_MAC_TAGS_CFG                                  0x28
+
+#define DEV_MAC_TAGS_CFG_TAG_ID(x)                        (((x) << 16) & GENMASK(31, 16))
+#define DEV_MAC_TAGS_CFG_TAG_ID_M                         GENMASK(31, 16)
+#define DEV_MAC_TAGS_CFG_TAG_ID_X(x)                      (((x) & GENMASK(31, 16)) >> 16)
+#define DEV_MAC_TAGS_CFG_VLAN_LEN_AWR_ENA                 BIT(2)
+#define DEV_MAC_TAGS_CFG_PB_ENA                           BIT(1)
+#define DEV_MAC_TAGS_CFG_VLAN_AWR_ENA                     BIT(0)
+
+#define DEV_MAC_ADV_CHK_CFG                               0x2c
+
+#define DEV_MAC_ADV_CHK_CFG_LEN_DROP_ENA                  BIT(0)
+
+#define DEV_MAC_IFG_CFG                                   0x30
+
+#define DEV_MAC_IFG_CFG_RESTORE_OLD_IPG_CHECK             BIT(17)
+#define DEV_MAC_IFG_CFG_REDUCED_TX_IFG                    BIT(16)
+#define DEV_MAC_IFG_CFG_TX_IFG(x)                         (((x) << 8) & GENMASK(12, 8))
+#define DEV_MAC_IFG_CFG_TX_IFG_M                          GENMASK(12, 8)
+#define DEV_MAC_IFG_CFG_TX_IFG_X(x)                       (((x) & GENMASK(12, 8)) >> 8)
+#define DEV_MAC_IFG_CFG_RX_IFG2(x)                        (((x) << 4) & GENMASK(7, 4))
+#define DEV_MAC_IFG_CFG_RX_IFG2_M                         GENMASK(7, 4)
+#define DEV_MAC_IFG_CFG_RX_IFG2_X(x)                      (((x) & GENMASK(7, 4)) >> 4)
+#define DEV_MAC_IFG_CFG_RX_IFG1(x)                        ((x) & GENMASK(3, 0))
+#define DEV_MAC_IFG_CFG_RX_IFG1_M                         GENMASK(3, 0)
+
+#define DEV_MAC_HDX_CFG                                   0x34
+
+#define DEV_MAC_HDX_CFG_BYPASS_COL_SYNC                   BIT(26)
+#define DEV_MAC_HDX_CFG_OB_ENA                            BIT(25)
+#define DEV_MAC_HDX_CFG_WEXC_DIS                          BIT(24)
+#define DEV_MAC_HDX_CFG_SEED(x)                           (((x) << 16) & GENMASK(23, 16))
+#define DEV_MAC_HDX_CFG_SEED_M                            GENMASK(23, 16)
+#define DEV_MAC_HDX_CFG_SEED_X(x)                         (((x) & GENMASK(23, 16)) >> 16)
+#define DEV_MAC_HDX_CFG_SEED_LOAD                         BIT(12)
+#define DEV_MAC_HDX_CFG_RETRY_AFTER_EXC_COL_ENA           BIT(8)
+#define DEV_MAC_HDX_CFG_LATE_COL_POS(x)                   ((x) & GENMASK(6, 0))
+#define DEV_MAC_HDX_CFG_LATE_COL_POS_M                    GENMASK(6, 0)
+
+#define DEV_MAC_DBG_CFG                                   0x38
+
+#define DEV_MAC_DBG_CFG_TBI_MODE                          BIT(4)
+#define DEV_MAC_DBG_CFG_IFG_CRS_EXT_CHK_ENA               BIT(0)
+
+#define DEV_MAC_FC_MAC_LOW_CFG                            0x3c
+
+#define DEV_MAC_FC_MAC_HIGH_CFG                           0x40
+
+#define DEV_MAC_STICKY                                    0x44
+
+#define DEV_MAC_STICKY_RX_IPG_SHRINK_STICKY               BIT(9)
+#define DEV_MAC_STICKY_RX_PREAM_SHRINK_STICKY             BIT(8)
+#define DEV_MAC_STICKY_RX_CARRIER_EXT_STICKY              BIT(7)
+#define DEV_MAC_STICKY_RX_CARRIER_EXT_ERR_STICKY          BIT(6)
+#define DEV_MAC_STICKY_RX_JUNK_STICKY                     BIT(5)
+#define DEV_MAC_STICKY_TX_RETRANSMIT_STICKY               BIT(4)
+#define DEV_MAC_STICKY_TX_JAM_STICKY                      BIT(3)
+#define DEV_MAC_STICKY_TX_FIFO_OFLW_STICKY                BIT(2)
+#define DEV_MAC_STICKY_TX_FRM_LEN_OVR_STICKY              BIT(1)
+#define DEV_MAC_STICKY_TX_ABORT_STICKY                    BIT(0)
+
+#define PCS1G_CFG                                         0x48
+
+#define PCS1G_CFG_LINK_STATUS_TYPE                        BIT(4)
+#define PCS1G_CFG_AN_LINK_CTRL_ENA                        BIT(1)
+#define PCS1G_CFG_PCS_ENA                                 BIT(0)
+
+#define PCS1G_MODE_CFG                                    0x4c
+
+#define PCS1G_MODE_CFG_UNIDIR_MODE_ENA                    BIT(4)
+#define PCS1G_MODE_CFG_SGMII_MODE_ENA                     BIT(0)
+
+#define PCS1G_SD_CFG                                      0x50
+
+#define PCS1G_SD_CFG_SD_SEL                               BIT(8)
+#define PCS1G_SD_CFG_SD_POL                               BIT(4)
+#define PCS1G_SD_CFG_SD_ENA                               BIT(0)
+
+#define PCS1G_ANEG_CFG                                    0x54
+
+#define PCS1G_ANEG_CFG_ADV_ABILITY(x)                     (((x) << 16) & GENMASK(31, 16))
+#define PCS1G_ANEG_CFG_ADV_ABILITY_M                      GENMASK(31, 16)
+#define PCS1G_ANEG_CFG_ADV_ABILITY_X(x)                   (((x) & GENMASK(31, 16)) >> 16)
+#define PCS1G_ANEG_CFG_SW_RESOLVE_ENA                     BIT(8)
+#define PCS1G_ANEG_CFG_ANEG_RESTART_ONE_SHOT              BIT(1)
+#define PCS1G_ANEG_CFG_ANEG_ENA                           BIT(0)
+
+#define PCS1G_ANEG_NP_CFG                                 0x58
+
+#define PCS1G_ANEG_NP_CFG_NP_TX(x)                        (((x) << 16) & GENMASK(31, 16))
+#define PCS1G_ANEG_NP_CFG_NP_TX_M                         GENMASK(31, 16)
+#define PCS1G_ANEG_NP_CFG_NP_TX_X(x)                      (((x) & GENMASK(31, 16)) >> 16)
+#define PCS1G_ANEG_NP_CFG_NP_LOADED_ONE_SHOT              BIT(0)
+
+#define PCS1G_LB_CFG                                      0x5c
+
+#define PCS1G_LB_CFG_RA_ENA                               BIT(4)
+#define PCS1G_LB_CFG_GMII_PHY_LB_ENA                      BIT(1)
+#define PCS1G_LB_CFG_TBI_HOST_LB_ENA                      BIT(0)
+
+#define PCS1G_DBG_CFG                                     0x60
+
+#define PCS1G_DBG_CFG_UDLT                                BIT(0)
+
+#define PCS1G_CDET_CFG                                    0x64
+
+#define PCS1G_CDET_CFG_CDET_ENA                           BIT(0)
+
+#define PCS1G_ANEG_STATUS                                 0x68
+
+#define PCS1G_ANEG_STATUS_LP_ADV_ABILITY(x)               (((x) << 16) & GENMASK(31, 16))
+#define PCS1G_ANEG_STATUS_LP_ADV_ABILITY_M                GENMASK(31, 16)
+#define PCS1G_ANEG_STATUS_LP_ADV_ABILITY_X(x)             (((x) & GENMASK(31, 16)) >> 16)
+#define PCS1G_ANEG_STATUS_PR                              BIT(4)
+#define PCS1G_ANEG_STATUS_PAGE_RX_STICKY                  BIT(3)
+#define PCS1G_ANEG_STATUS_ANEG_COMPLETE                   BIT(0)
+
+#define PCS1G_ANEG_NP_STATUS                              0x6c
+
+#define PCS1G_LINK_STATUS                                 0x70
+
+#define PCS1G_LINK_STATUS_DELAY_VAR(x)                    (((x) << 12) & GENMASK(15, 12))
+#define PCS1G_LINK_STATUS_DELAY_VAR_M                     GENMASK(15, 12)
+#define PCS1G_LINK_STATUS_DELAY_VAR_X(x)                  (((x) & GENMASK(15, 12)) >> 12)
+#define PCS1G_LINK_STATUS_SIGNAL_DETECT                   BIT(8)
+#define PCS1G_LINK_STATUS_LINK_STATUS                     BIT(4)
+#define PCS1G_LINK_STATUS_SYNC_STATUS                     BIT(0)
+
+#define PCS1G_LINK_DOWN_CNT                               0x74
+
+#define PCS1G_STICKY                                      0x78
+
+#define PCS1G_STICKY_LINK_DOWN_STICKY                     BIT(4)
+#define PCS1G_STICKY_OUT_OF_SYNC_STICKY                   BIT(0)
+
+#define PCS1G_DEBUG_STATUS                                0x7c
+
+#define PCS1G_LPI_CFG                                     0x80
+
+#define PCS1G_LPI_CFG_QSGMII_MS_SEL                       BIT(20)
+#define PCS1G_LPI_CFG_RX_LPI_OUT_DIS                      BIT(17)
+#define PCS1G_LPI_CFG_LPI_TESTMODE                        BIT(16)
+#define PCS1G_LPI_CFG_LPI_RX_WTIM(x)                      (((x) << 4) & GENMASK(5, 4))
+#define PCS1G_LPI_CFG_LPI_RX_WTIM_M                       GENMASK(5, 4)
+#define PCS1G_LPI_CFG_LPI_RX_WTIM_X(x)                    (((x) & GENMASK(5, 4)) >> 4)
+#define PCS1G_LPI_CFG_TX_ASSERT_LPIDLE                    BIT(0)
+
+#define PCS1G_LPI_WAKE_ERROR_CNT                          0x84
+
+#define PCS1G_LPI_STATUS                                  0x88
+
+#define PCS1G_LPI_STATUS_RX_LPI_FAIL                      BIT(16)
+#define PCS1G_LPI_STATUS_RX_LPI_EVENT_STICKY              BIT(12)
+#define PCS1G_LPI_STATUS_RX_QUIET                         BIT(9)
+#define PCS1G_LPI_STATUS_RX_LPI_MODE                      BIT(8)
+#define PCS1G_LPI_STATUS_TX_LPI_EVENT_STICKY              BIT(4)
+#define PCS1G_LPI_STATUS_TX_QUIET                         BIT(1)
+#define PCS1G_LPI_STATUS_TX_LPI_MODE                      BIT(0)
+
+#define PCS1G_TSTPAT_MODE_CFG                             0x8c
+
+#define PCS1G_TSTPAT_STATUS                               0x90
+
+#define PCS1G_TSTPAT_STATUS_JTP_ERR_CNT(x)                (((x) << 8) & GENMASK(15, 8))
+#define PCS1G_TSTPAT_STATUS_JTP_ERR_CNT_M                 GENMASK(15, 8)
+#define PCS1G_TSTPAT_STATUS_JTP_ERR_CNT_X(x)              (((x) & GENMASK(15, 8)) >> 8)
+#define PCS1G_TSTPAT_STATUS_JTP_ERR                       BIT(4)
+#define PCS1G_TSTPAT_STATUS_JTP_LOCK                      BIT(0)
+
+#define DEV_PCS_FX100_CFG                                 0x94
+
+#define DEV_PCS_FX100_CFG_SD_SEL                          BIT(26)
+#define DEV_PCS_FX100_CFG_SD_POL                          BIT(25)
+#define DEV_PCS_FX100_CFG_SD_ENA                          BIT(24)
+#define DEV_PCS_FX100_CFG_LOOPBACK_ENA                    BIT(20)
+#define DEV_PCS_FX100_CFG_SWAP_MII_ENA                    BIT(16)
+#define DEV_PCS_FX100_CFG_RXBITSEL(x)                     (((x) << 12) & GENMASK(15, 12))
+#define DEV_PCS_FX100_CFG_RXBITSEL_M                      GENMASK(15, 12)
+#define DEV_PCS_FX100_CFG_RXBITSEL_X(x)                   (((x) & GENMASK(15, 12)) >> 12)
+#define DEV_PCS_FX100_CFG_SIGDET_CFG(x)                   (((x) << 9) & GENMASK(10, 9))
+#define DEV_PCS_FX100_CFG_SIGDET_CFG_M                    GENMASK(10, 9)
+#define DEV_PCS_FX100_CFG_SIGDET_CFG_X(x)                 (((x) & GENMASK(10, 9)) >> 9)
+#define DEV_PCS_FX100_CFG_LINKHYST_TM_ENA                 BIT(8)
+#define DEV_PCS_FX100_CFG_LINKHYSTTIMER(x)                (((x) << 4) & GENMASK(7, 4))
+#define DEV_PCS_FX100_CFG_LINKHYSTTIMER_M                 GENMASK(7, 4)
+#define DEV_PCS_FX100_CFG_LINKHYSTTIMER_X(x)              (((x) & GENMASK(7, 4)) >> 4)
+#define DEV_PCS_FX100_CFG_UNIDIR_MODE_ENA                 BIT(3)
+#define DEV_PCS_FX100_CFG_FEFCHK_ENA                      BIT(2)
+#define DEV_PCS_FX100_CFG_FEFGEN_ENA                      BIT(1)
+#define DEV_PCS_FX100_CFG_PCS_ENA                         BIT(0)
+
+#define DEV_PCS_FX100_STATUS                              0x98
+
+#define DEV_PCS_FX100_STATUS_EDGE_POS_PTP(x)              (((x) << 8) & GENMASK(11, 8))
+#define DEV_PCS_FX100_STATUS_EDGE_POS_PTP_M               GENMASK(11, 8)
+#define DEV_PCS_FX100_STATUS_EDGE_POS_PTP_X(x)            (((x) & GENMASK(11, 8)) >> 8)
+#define DEV_PCS_FX100_STATUS_PCS_ERROR_STICKY             BIT(7)
+#define DEV_PCS_FX100_STATUS_FEF_FOUND_STICKY             BIT(6)
+#define DEV_PCS_FX100_STATUS_SSD_ERROR_STICKY             BIT(5)
+#define DEV_PCS_FX100_STATUS_SYNC_LOST_STICKY             BIT(4)
+#define DEV_PCS_FX100_STATUS_FEF_STATUS                   BIT(2)
+#define DEV_PCS_FX100_STATUS_SIGNAL_DETECT                BIT(1)
+#define DEV_PCS_FX100_STATUS_SYNC_STATUS                  BIT(0)
+
+#endif
diff --git a/drivers/net/ethernet/mscc/ocelot_dev_gmii.h b/drivers/net/ethernet/mscc/ocelot_dev_gmii.h
new file mode 100644
index 000000000000..6aa40ea223a2
--- /dev/null
+++ b/drivers/net/ethernet/mscc/ocelot_dev_gmii.h
@@ -0,0 +1,154 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */
+/*
+ * Microsemi Ocelot Switch driver
+ *
+ * Copyright (c) 2017 Microsemi Corporation
+ */
+
+#ifndef _MSCC_OCELOT_DEV_GMII_H_
+#define _MSCC_OCELOT_DEV_GMII_H_
+
+#define DEV_GMII_PORT_MODE_CLOCK_CFG                      0x0
+
+#define DEV_GMII_PORT_MODE_CLOCK_CFG_MAC_TX_RST           BIT(5)
+#define DEV_GMII_PORT_MODE_CLOCK_CFG_MAC_RX_RST           BIT(4)
+#define DEV_GMII_PORT_MODE_CLOCK_CFG_PORT_RST             BIT(3)
+#define DEV_GMII_PORT_MODE_CLOCK_CFG_PHY_RST              BIT(2)
+#define DEV_GMII_PORT_MODE_CLOCK_CFG_LINK_SPEED(x)        ((x) & GENMASK(1, 0))
+#define DEV_GMII_PORT_MODE_CLOCK_CFG_LINK_SPEED_M         GENMASK(1, 0)
+
+#define DEV_GMII_PORT_MODE_PORT_MISC                      0x4
+
+#define DEV_GMII_PORT_MODE_PORT_MISC_MPLS_RX_ENA          BIT(5)
+#define DEV_GMII_PORT_MODE_PORT_MISC_FWD_ERROR_ENA        BIT(4)
+#define DEV_GMII_PORT_MODE_PORT_MISC_FWD_PAUSE_ENA        BIT(3)
+#define DEV_GMII_PORT_MODE_PORT_MISC_FWD_CTRL_ENA         BIT(2)
+#define DEV_GMII_PORT_MODE_PORT_MISC_GMII_LOOP_ENA        BIT(1)
+#define DEV_GMII_PORT_MODE_PORT_MISC_DEV_LOOP_ENA         BIT(0)
+
+#define DEV_GMII_PORT_MODE_EVENTS                         0x8
+
+#define DEV_GMII_PORT_MODE_EEE_CFG                        0xc
+
+#define DEV_GMII_PORT_MODE_EEE_CFG_EEE_ENA                BIT(22)
+#define DEV_GMII_PORT_MODE_EEE_CFG_EEE_TIMER_AGE(x)       (((x) << 15) & GENMASK(21, 15))
+#define DEV_GMII_PORT_MODE_EEE_CFG_EEE_TIMER_AGE_M        GENMASK(21, 15)
+#define DEV_GMII_PORT_MODE_EEE_CFG_EEE_TIMER_AGE_X(x)     (((x) & GENMASK(21, 15)) >> 15)
+#define DEV_GMII_PORT_MODE_EEE_CFG_EEE_TIMER_WAKEUP(x)    (((x) << 8) & GENMASK(14, 8))
+#define DEV_GMII_PORT_MODE_EEE_CFG_EEE_TIMER_WAKEUP_M     GENMASK(14, 8)
+#define DEV_GMII_PORT_MODE_EEE_CFG_EEE_TIMER_WAKEUP_X(x)  (((x) & GENMASK(14, 8)) >> 8)
+#define DEV_GMII_PORT_MODE_EEE_CFG_EEE_TIMER_HOLDOFF(x)   (((x) << 1) & GENMASK(7, 1))
+#define DEV_GMII_PORT_MODE_EEE_CFG_EEE_TIMER_HOLDOFF_M    GENMASK(7, 1)
+#define DEV_GMII_PORT_MODE_EEE_CFG_EEE_TIMER_HOLDOFF_X(x) (((x) & GENMASK(7, 1)) >> 1)
+#define DEV_GMII_PORT_MODE_EEE_CFG_PORT_LPI               BIT(0)
+
+#define DEV_GMII_PORT_MODE_RX_PATH_DELAY                  0x10
+
+#define DEV_GMII_PORT_MODE_TX_PATH_DELAY                  0x14
+
+#define DEV_GMII_PORT_MODE_PTP_PREDICT_CFG                0x18
+
+#define DEV_GMII_MAC_CFG_STATUS_MAC_ENA_CFG               0x1c
+
+#define DEV_GMII_MAC_CFG_STATUS_MAC_ENA_CFG_RX_ENA        BIT(4)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_ENA_CFG_TX_ENA        BIT(0)
+
+#define DEV_GMII_MAC_CFG_STATUS_MAC_MODE_CFG              0x20
+
+#define DEV_GMII_MAC_CFG_STATUS_MAC_MODE_CFG_FC_WORD_SYNC_ENA BIT(8)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_MODE_CFG_GIGA_MODE_ENA BIT(4)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_MODE_CFG_FDX_ENA      BIT(0)
+
+#define DEV_GMII_MAC_CFG_STATUS_MAC_MAXLEN_CFG            0x24
+
+#define DEV_GMII_MAC_CFG_STATUS_MAC_TAGS_CFG              0x28
+
+#define DEV_GMII_MAC_CFG_STATUS_MAC_TAGS_CFG_TAG_ID(x)    (((x) << 16) & GENMASK(31, 16))
+#define DEV_GMII_MAC_CFG_STATUS_MAC_TAGS_CFG_TAG_ID_M     GENMASK(31, 16)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_TAGS_CFG_TAG_ID_X(x)  (((x) & GENMASK(31, 16)) >> 16)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_TAGS_CFG_PB_ENA       BIT(1)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_TAGS_CFG_VLAN_AWR_ENA BIT(0)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_TAGS_CFG_VLAN_LEN_AWR_ENA BIT(2)
+
+#define DEV_GMII_MAC_CFG_STATUS_MAC_ADV_CHK_CFG           0x2c
+
+#define DEV_GMII_MAC_CFG_STATUS_MAC_ADV_CHK_CFG_LEN_DROP_ENA BIT(0)
+
+#define DEV_GMII_MAC_CFG_STATUS_MAC_IFG_CFG               0x30
+
+#define DEV_GMII_MAC_CFG_STATUS_MAC_IFG_CFG_RESTORE_OLD_IPG_CHECK BIT(17)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_IFG_CFG_REDUCED_TX_IFG BIT(16)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_IFG_CFG_TX_IFG(x)     (((x) << 8) & GENMASK(12, 8))
+#define DEV_GMII_MAC_CFG_STATUS_MAC_IFG_CFG_TX_IFG_M      GENMASK(12, 8)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_IFG_CFG_TX_IFG_X(x)   (((x) & GENMASK(12, 8)) >> 8)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_IFG_CFG_RX_IFG2(x)    (((x) << 4) & GENMASK(7, 4))
+#define DEV_GMII_MAC_CFG_STATUS_MAC_IFG_CFG_RX_IFG2_M     GENMASK(7, 4)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_IFG_CFG_RX_IFG2_X(x)  (((x) & GENMASK(7, 4)) >> 4)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_IFG_CFG_RX_IFG1(x)    ((x) & GENMASK(3, 0))
+#define DEV_GMII_MAC_CFG_STATUS_MAC_IFG_CFG_RX_IFG1_M     GENMASK(3, 0)
+
+#define DEV_GMII_MAC_CFG_STATUS_MAC_HDX_CFG               0x34
+
+#define DEV_GMII_MAC_CFG_STATUS_MAC_HDX_CFG_BYPASS_COL_SYNC BIT(26)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_HDX_CFG_OB_ENA        BIT(25)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_HDX_CFG_WEXC_DIS      BIT(24)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_HDX_CFG_SEED(x)       (((x) << 16) & GENMASK(23, 16))
+#define DEV_GMII_MAC_CFG_STATUS_MAC_HDX_CFG_SEED_M        GENMASK(23, 16)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_HDX_CFG_SEED_X(x)     (((x) & GENMASK(23, 16)) >> 16)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_HDX_CFG_SEED_LOAD     BIT(12)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_HDX_CFG_RETRY_AFTER_EXC_COL_ENA BIT(8)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_HDX_CFG_LATE_COL_POS(x) ((x) & GENMASK(6, 0))
+#define DEV_GMII_MAC_CFG_STATUS_MAC_HDX_CFG_LATE_COL_POS_M GENMASK(6, 0)
+
+#define DEV_GMII_MAC_CFG_STATUS_MAC_DBG_CFG               0x38
+
+#define DEV_GMII_MAC_CFG_STATUS_MAC_DBG_CFG_TBI_MODE      BIT(4)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_DBG_CFG_IFG_CRS_EXT_CHK_ENA BIT(0)
+
+#define DEV_GMII_MAC_CFG_STATUS_MAC_FC_MAC_LOW_CFG        0x3c
+
+#define DEV_GMII_MAC_CFG_STATUS_MAC_FC_MAC_HIGH_CFG       0x40
+
+#define DEV_GMII_MAC_CFG_STATUS_MAC_STICKY                0x44
+
+#define DEV_GMII_MAC_CFG_STATUS_MAC_STICKY_RX_IPG_SHRINK_STICKY BIT(9)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_STICKY_RX_PREAM_SHRINK_STICKY BIT(8)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_STICKY_RX_CARRIER_EXT_STICKY BIT(7)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_STICKY_RX_CARRIER_EXT_ERR_STICKY BIT(6)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_STICKY_RX_JUNK_STICKY BIT(5)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_STICKY_TX_RETRANSMIT_STICKY BIT(4)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_STICKY_TX_JAM_STICKY  BIT(3)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_STICKY_TX_FIFO_OFLW_STICKY BIT(2)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_STICKY_TX_FRM_LEN_OVR_STICKY BIT(1)
+#define DEV_GMII_MAC_CFG_STATUS_MAC_STICKY_TX_ABORT_STICKY BIT(0)
+
+#define DEV_GMII_MM_CONFIG_ENABLE_CONFIG                  0x48
+
+#define DEV_GMII_MM_CONFIG_ENABLE_CONFIG_MM_RX_ENA        BIT(0)
+#define DEV_GMII_MM_CONFIG_ENABLE_CONFIG_MM_TX_ENA        BIT(4)
+#define DEV_GMII_MM_CONFIG_ENABLE_CONFIG_KEEP_S_AFTER_D   BIT(8)
+
+#define DEV_GMII_MM_CONFIG_VERIF_CONFIG                   0x4c
+
+#define DEV_GMII_MM_CONFIG_VERIF_CONFIG_PRM_VERIFY_DIS    BIT(0)
+#define DEV_GMII_MM_CONFIG_VERIF_CONFIG_PRM_VERIFY_TIME(x) (((x) << 4) & GENMASK(11, 4))
+#define DEV_GMII_MM_CONFIG_VERIF_CONFIG_PRM_VERIFY_TIME_M GENMASK(11, 4)
+#define DEV_GMII_MM_CONFIG_VERIF_CONFIG_PRM_VERIFY_TIME_X(x) (((x) & GENMASK(11, 4)) >> 4)
+#define DEV_GMII_MM_CONFIG_VERIF_CONFIG_VERIF_TIMER_UNITS(x) (((x) << 12) & GENMASK(13, 12))
+#define DEV_GMII_MM_CONFIG_VERIF_CONFIG_VERIF_TIMER_UNITS_M GENMASK(13, 12)
+#define DEV_GMII_MM_CONFIG_VERIF_CONFIG_VERIF_TIMER_UNITS_X(x) (((x) & GENMASK(13, 12)) >> 12)
+
+#define DEV_GMII_MM_STATISTICS_MM_STATUS                  0x50
+
+#define DEV_GMII_MM_STATISTICS_MM_STATUS_PRMPT_ACTIVE_STATUS BIT(0)
+#define DEV_GMII_MM_STATISTICS_MM_STATUS_PRMPT_ACTIVE_STICKY BIT(4)
+#define DEV_GMII_MM_STATISTICS_MM_STATUS_PRMPT_VERIFY_STATE(x) (((x) << 8) & GENMASK(10, 8))
+#define DEV_GMII_MM_STATISTICS_MM_STATUS_PRMPT_VERIFY_STATE_M GENMASK(10, 8)
+#define DEV_GMII_MM_STATISTICS_MM_STATUS_PRMPT_VERIFY_STATE_X(x) (((x) & GENMASK(10, 8)) >> 8)
+#define DEV_GMII_MM_STATISTICS_MM_STATUS_UNEXP_RX_PFRM_STICKY BIT(12)
+#define DEV_GMII_MM_STATISTICS_MM_STATUS_UNEXP_TX_PFRM_STICKY BIT(16)
+#define DEV_GMII_MM_STATISTICS_MM_STATUS_MM_RX_FRAME_STATUS BIT(20)
+#define DEV_GMII_MM_STATISTICS_MM_STATUS_MM_TX_FRAME_STATUS BIT(24)
+#define DEV_GMII_MM_STATISTICS_MM_STATUS_MM_TX_PRMPT_STATUS BIT(28)
+
+#endif
diff --git a/drivers/net/ethernet/mscc/ocelot_hsio.h b/drivers/net/ethernet/mscc/ocelot_hsio.h
new file mode 100644
index 000000000000..d93ddec3931b
--- /dev/null
+++ b/drivers/net/ethernet/mscc/ocelot_hsio.h
@@ -0,0 +1,785 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */
+/*
+ * Microsemi Ocelot Switch driver
+ *
+ * Copyright (c) 2017 Microsemi Corporation
+ */
+
+#ifndef _MSCC_OCELOT_HSIO_H_
+#define _MSCC_OCELOT_HSIO_H_
+
+#define HSIO_PLL5G_CFG0_ENA_ROT                           BIT(31)
+#define HSIO_PLL5G_CFG0_ENA_LANE                          BIT(30)
+#define HSIO_PLL5G_CFG0_ENA_CLKTREE                       BIT(29)
+#define HSIO_PLL5G_CFG0_DIV4                              BIT(28)
+#define HSIO_PLL5G_CFG0_ENA_LOCK_FINE                     BIT(27)
+#define HSIO_PLL5G_CFG0_SELBGV820(x)                      (((x) << 23) & GENMASK(26, 23))
+#define HSIO_PLL5G_CFG0_SELBGV820_M                       GENMASK(26, 23)
+#define HSIO_PLL5G_CFG0_SELBGV820_X(x)                    (((x) & GENMASK(26, 23)) >> 23)
+#define HSIO_PLL5G_CFG0_LOOP_BW_RES(x)                    (((x) << 18) & GENMASK(22, 18))
+#define HSIO_PLL5G_CFG0_LOOP_BW_RES_M                     GENMASK(22, 18)
+#define HSIO_PLL5G_CFG0_LOOP_BW_RES_X(x)                  (((x) & GENMASK(22, 18)) >> 18)
+#define HSIO_PLL5G_CFG0_SELCPI(x)                         (((x) << 16) & GENMASK(17, 16))
+#define HSIO_PLL5G_CFG0_SELCPI_M                          GENMASK(17, 16)
+#define HSIO_PLL5G_CFG0_SELCPI_X(x)                       (((x) & GENMASK(17, 16)) >> 16)
+#define HSIO_PLL5G_CFG0_ENA_VCO_CONTRH                    BIT(15)
+#define HSIO_PLL5G_CFG0_ENA_CP1                           BIT(14)
+#define HSIO_PLL5G_CFG0_ENA_VCO_BUF                       BIT(13)
+#define HSIO_PLL5G_CFG0_ENA_BIAS                          BIT(12)
+#define HSIO_PLL5G_CFG0_CPU_CLK_DIV(x)                    (((x) << 6) & GENMASK(11, 6))
+#define HSIO_PLL5G_CFG0_CPU_CLK_DIV_M                     GENMASK(11, 6)
+#define HSIO_PLL5G_CFG0_CPU_CLK_DIV_X(x)                  (((x) & GENMASK(11, 6)) >> 6)
+#define HSIO_PLL5G_CFG0_CORE_CLK_DIV(x)                   ((x) & GENMASK(5, 0))
+#define HSIO_PLL5G_CFG0_CORE_CLK_DIV_M                    GENMASK(5, 0)
+
+#define HSIO_PLL5G_CFG1_ENA_DIRECT                        BIT(18)
+#define HSIO_PLL5G_CFG1_ROT_SPEED                         BIT(17)
+#define HSIO_PLL5G_CFG1_ROT_DIR                           BIT(16)
+#define HSIO_PLL5G_CFG1_READBACK_DATA_SEL                 BIT(15)
+#define HSIO_PLL5G_CFG1_RC_ENABLE                         BIT(14)
+#define HSIO_PLL5G_CFG1_RC_CTRL_DATA(x)                   (((x) << 6) & GENMASK(13, 6))
+#define HSIO_PLL5G_CFG1_RC_CTRL_DATA_M                    GENMASK(13, 6)
+#define HSIO_PLL5G_CFG1_RC_CTRL_DATA_X(x)                 (((x) & GENMASK(13, 6)) >> 6)
+#define HSIO_PLL5G_CFG1_QUARTER_RATE                      BIT(5)
+#define HSIO_PLL5G_CFG1_PWD_TX                            BIT(4)
+#define HSIO_PLL5G_CFG1_PWD_RX                            BIT(3)
+#define HSIO_PLL5G_CFG1_OUT_OF_RANGE_RECAL_ENA            BIT(2)
+#define HSIO_PLL5G_CFG1_HALF_RATE                         BIT(1)
+#define HSIO_PLL5G_CFG1_FORCE_SET_ENA                     BIT(0)
+
+#define HSIO_PLL5G_CFG2_ENA_TEST_MODE                     BIT(30)
+#define HSIO_PLL5G_CFG2_ENA_PFD_IN_FLIP                   BIT(29)
+#define HSIO_PLL5G_CFG2_ENA_VCO_NREF_TESTOUT              BIT(28)
+#define HSIO_PLL5G_CFG2_ENA_FBTESTOUT                     BIT(27)
+#define HSIO_PLL5G_CFG2_ENA_RCPLL                         BIT(26)
+#define HSIO_PLL5G_CFG2_ENA_CP2                           BIT(25)
+#define HSIO_PLL5G_CFG2_ENA_CLK_BYPASS1                   BIT(24)
+#define HSIO_PLL5G_CFG2_AMPC_SEL(x)                       (((x) << 16) & GENMASK(23, 16))
+#define HSIO_PLL5G_CFG2_AMPC_SEL_M                        GENMASK(23, 16)
+#define HSIO_PLL5G_CFG2_AMPC_SEL_X(x)                     (((x) & GENMASK(23, 16)) >> 16)
+#define HSIO_PLL5G_CFG2_ENA_CLK_BYPASS                    BIT(15)
+#define HSIO_PLL5G_CFG2_PWD_AMPCTRL_N                     BIT(14)
+#define HSIO_PLL5G_CFG2_ENA_AMPCTRL                       BIT(13)
+#define HSIO_PLL5G_CFG2_ENA_AMP_CTRL_FORCE                BIT(12)
+#define HSIO_PLL5G_CFG2_FRC_FSM_POR                       BIT(11)
+#define HSIO_PLL5G_CFG2_DISABLE_FSM_POR                   BIT(10)
+#define HSIO_PLL5G_CFG2_GAIN_TEST(x)                      (((x) << 5) & GENMASK(9, 5))
+#define HSIO_PLL5G_CFG2_GAIN_TEST_M                       GENMASK(9, 5)
+#define HSIO_PLL5G_CFG2_GAIN_TEST_X(x)                    (((x) & GENMASK(9, 5)) >> 5)
+#define HSIO_PLL5G_CFG2_EN_RESET_OVERRUN                  BIT(4)
+#define HSIO_PLL5G_CFG2_EN_RESET_LIM_DET                  BIT(3)
+#define HSIO_PLL5G_CFG2_EN_RESET_FRQ_DET                  BIT(2)
+#define HSIO_PLL5G_CFG2_DISABLE_FSM                       BIT(1)
+#define HSIO_PLL5G_CFG2_ENA_GAIN_TEST                     BIT(0)
+
+#define HSIO_PLL5G_CFG3_TEST_ANA_OUT_SEL(x)               (((x) << 22) & GENMASK(23, 22))
+#define HSIO_PLL5G_CFG3_TEST_ANA_OUT_SEL_M                GENMASK(23, 22)
+#define HSIO_PLL5G_CFG3_TEST_ANA_OUT_SEL_X(x)             (((x) & GENMASK(23, 22)) >> 22)
+#define HSIO_PLL5G_CFG3_TESTOUT_SEL(x)                    (((x) << 19) & GENMASK(21, 19))
+#define HSIO_PLL5G_CFG3_TESTOUT_SEL_M                     GENMASK(21, 19)
+#define HSIO_PLL5G_CFG3_TESTOUT_SEL_X(x)                  (((x) & GENMASK(21, 19)) >> 19)
+#define HSIO_PLL5G_CFG3_ENA_ANA_TEST_OUT                  BIT(18)
+#define HSIO_PLL5G_CFG3_ENA_TEST_OUT                      BIT(17)
+#define HSIO_PLL5G_CFG3_SEL_FBDCLK                        BIT(16)
+#define HSIO_PLL5G_CFG3_SEL_CML_CMOS_PFD                  BIT(15)
+#define HSIO_PLL5G_CFG3_RST_FB_N                          BIT(14)
+#define HSIO_PLL5G_CFG3_FORCE_VCO_CONTRH                  BIT(13)
+#define HSIO_PLL5G_CFG3_FORCE_LO                          BIT(12)
+#define HSIO_PLL5G_CFG3_FORCE_HI                          BIT(11)
+#define HSIO_PLL5G_CFG3_FORCE_ENA                         BIT(10)
+#define HSIO_PLL5G_CFG3_FORCE_CP                          BIT(9)
+#define HSIO_PLL5G_CFG3_FBDIVSEL_TST_ENA                  BIT(8)
+#define HSIO_PLL5G_CFG3_FBDIVSEL(x)                       ((x) & GENMASK(7, 0))
+#define HSIO_PLL5G_CFG3_FBDIVSEL_M                        GENMASK(7, 0)
+
+#define HSIO_PLL5G_CFG4_IB_BIAS_CTRL(x)                   (((x) << 16) & GENMASK(23, 16))
+#define HSIO_PLL5G_CFG4_IB_BIAS_CTRL_M                    GENMASK(23, 16)
+#define HSIO_PLL5G_CFG4_IB_BIAS_CTRL_X(x)                 (((x) & GENMASK(23, 16)) >> 16)
+#define HSIO_PLL5G_CFG4_IB_CTRL(x)                        ((x) & GENMASK(15, 0))
+#define HSIO_PLL5G_CFG4_IB_CTRL_M                         GENMASK(15, 0)
+
+#define HSIO_PLL5G_CFG5_OB_BIAS_CTRL(x)                   (((x) << 16) & GENMASK(23, 16))
+#define HSIO_PLL5G_CFG5_OB_BIAS_CTRL_M                    GENMASK(23, 16)
+#define HSIO_PLL5G_CFG5_OB_BIAS_CTRL_X(x)                 (((x) & GENMASK(23, 16)) >> 16)
+#define HSIO_PLL5G_CFG5_OB_CTRL(x)                        ((x) & GENMASK(15, 0))
+#define HSIO_PLL5G_CFG5_OB_CTRL_M                         GENMASK(15, 0)
+
+#define HSIO_PLL5G_CFG6_REFCLK_SEL_SRC                    BIT(23)
+#define HSIO_PLL5G_CFG6_REFCLK_SEL(x)                     (((x) << 20) & GENMASK(22, 20))
+#define HSIO_PLL5G_CFG6_REFCLK_SEL_M                      GENMASK(22, 20)
+#define HSIO_PLL5G_CFG6_REFCLK_SEL_X(x)                   (((x) & GENMASK(22, 20)) >> 20)
+#define HSIO_PLL5G_CFG6_REFCLK_SRC                        BIT(19)
+#define HSIO_PLL5G_CFG6_POR_DEL_SEL(x)                    (((x) << 16) & GENMASK(17, 16))
+#define HSIO_PLL5G_CFG6_POR_DEL_SEL_M                     GENMASK(17, 16)
+#define HSIO_PLL5G_CFG6_POR_DEL_SEL_X(x)                  (((x) & GENMASK(17, 16)) >> 16)
+#define HSIO_PLL5G_CFG6_DIV125REF_SEL(x)                  (((x) << 8) & GENMASK(15, 8))
+#define HSIO_PLL5G_CFG6_DIV125REF_SEL_M                   GENMASK(15, 8)
+#define HSIO_PLL5G_CFG6_DIV125REF_SEL_X(x)                (((x) & GENMASK(15, 8)) >> 8)
+#define HSIO_PLL5G_CFG6_ENA_REFCLKC2                      BIT(7)
+#define HSIO_PLL5G_CFG6_ENA_FBCLKC2                       BIT(6)
+#define HSIO_PLL5G_CFG6_DDR_CLK_DIV(x)                    ((x) & GENMASK(5, 0))
+#define HSIO_PLL5G_CFG6_DDR_CLK_DIV_M                     GENMASK(5, 0)
+
+#define HSIO_PLL5G_STATUS0_RANGE_LIM                      BIT(12)
+#define HSIO_PLL5G_STATUS0_OUT_OF_RANGE_ERR               BIT(11)
+#define HSIO_PLL5G_STATUS0_CALIBRATION_ERR                BIT(10)
+#define HSIO_PLL5G_STATUS0_CALIBRATION_DONE               BIT(9)
+#define HSIO_PLL5G_STATUS0_READBACK_DATA(x)               (((x) << 1) & GENMASK(8, 1))
+#define HSIO_PLL5G_STATUS0_READBACK_DATA_M                GENMASK(8, 1)
+#define HSIO_PLL5G_STATUS0_READBACK_DATA_X(x)             (((x) & GENMASK(8, 1)) >> 1)
+#define HSIO_PLL5G_STATUS0_LOCK_STATUS                    BIT(0)
+
+#define HSIO_PLL5G_STATUS1_SIG_DEL(x)                     (((x) << 21) & GENMASK(28, 21))
+#define HSIO_PLL5G_STATUS1_SIG_DEL_M                      GENMASK(28, 21)
+#define HSIO_PLL5G_STATUS1_SIG_DEL_X(x)                   (((x) & GENMASK(28, 21)) >> 21)
+#define HSIO_PLL5G_STATUS1_GAIN_STAT(x)                   (((x) << 16) & GENMASK(20, 16))
+#define HSIO_PLL5G_STATUS1_GAIN_STAT_M                    GENMASK(20, 16)
+#define HSIO_PLL5G_STATUS1_GAIN_STAT_X(x)                 (((x) & GENMASK(20, 16)) >> 16)
+#define HSIO_PLL5G_STATUS1_FBCNT_DIF(x)                   (((x) << 4) & GENMASK(13, 4))
+#define HSIO_PLL5G_STATUS1_FBCNT_DIF_M                    GENMASK(13, 4)
+#define HSIO_PLL5G_STATUS1_FBCNT_DIF_X(x)                 (((x) & GENMASK(13, 4)) >> 4)
+#define HSIO_PLL5G_STATUS1_FSM_STAT(x)                    (((x) << 1) & GENMASK(3, 1))
+#define HSIO_PLL5G_STATUS1_FSM_STAT_M                     GENMASK(3, 1)
+#define HSIO_PLL5G_STATUS1_FSM_STAT_X(x)                  (((x) & GENMASK(3, 1)) >> 1)
+#define HSIO_PLL5G_STATUS1_FSM_LOCK                       BIT(0)
+
+#define HSIO_PLL5G_BIST_CFG0_PLLB_START_BIST              BIT(31)
+#define HSIO_PLL5G_BIST_CFG0_PLLB_MEAS_MODE               BIT(30)
+#define HSIO_PLL5G_BIST_CFG0_PLLB_LOCK_REPEAT(x)          (((x) << 20) & GENMASK(23, 20))
+#define HSIO_PLL5G_BIST_CFG0_PLLB_LOCK_REPEAT_M           GENMASK(23, 20)
+#define HSIO_PLL5G_BIST_CFG0_PLLB_LOCK_REPEAT_X(x)        (((x) & GENMASK(23, 20)) >> 20)
+#define HSIO_PLL5G_BIST_CFG0_PLLB_LOCK_UNCERT(x)          (((x) << 16) & GENMASK(19, 16))
+#define HSIO_PLL5G_BIST_CFG0_PLLB_LOCK_UNCERT_M           GENMASK(19, 16)
+#define HSIO_PLL5G_BIST_CFG0_PLLB_LOCK_UNCERT_X(x)        (((x) & GENMASK(19, 16)) >> 16)
+#define HSIO_PLL5G_BIST_CFG0_PLLB_DIV_FACTOR_PRE(x)       ((x) & GENMASK(15, 0))
+#define HSIO_PLL5G_BIST_CFG0_PLLB_DIV_FACTOR_PRE_M        GENMASK(15, 0)
+
+#define HSIO_PLL5G_BIST_STAT0_PLLB_FSM_STAT(x)            (((x) << 4) & GENMASK(7, 4))
+#define HSIO_PLL5G_BIST_STAT0_PLLB_FSM_STAT_M             GENMASK(7, 4)
+#define HSIO_PLL5G_BIST_STAT0_PLLB_FSM_STAT_X(x)          (((x) & GENMASK(7, 4)) >> 4)
+#define HSIO_PLL5G_BIST_STAT0_PLLB_BUSY                   BIT(2)
+#define HSIO_PLL5G_BIST_STAT0_PLLB_DONE_N                 BIT(1)
+#define HSIO_PLL5G_BIST_STAT0_PLLB_FAIL                   BIT(0)
+
+#define HSIO_PLL5G_BIST_STAT1_PLLB_CNT_OUT(x)             (((x) << 16) & GENMASK(31, 16))
+#define HSIO_PLL5G_BIST_STAT1_PLLB_CNT_OUT_M              GENMASK(31, 16)
+#define HSIO_PLL5G_BIST_STAT1_PLLB_CNT_OUT_X(x)           (((x) & GENMASK(31, 16)) >> 16)
+#define HSIO_PLL5G_BIST_STAT1_PLLB_CNT_REF_DIFF(x)        ((x) & GENMASK(15, 0))
+#define HSIO_PLL5G_BIST_STAT1_PLLB_CNT_REF_DIFF_M         GENMASK(15, 0)
+
+#define HSIO_RCOMP_CFG0_PWD_ENA                           BIT(13)
+#define HSIO_RCOMP_CFG0_RUN_CAL                           BIT(12)
+#define HSIO_RCOMP_CFG0_SPEED_SEL(x)                      (((x) << 10) & GENMASK(11, 10))
+#define HSIO_RCOMP_CFG0_SPEED_SEL_M                       GENMASK(11, 10)
+#define HSIO_RCOMP_CFG0_SPEED_SEL_X(x)                    (((x) & GENMASK(11, 10)) >> 10)
+#define HSIO_RCOMP_CFG0_MODE_SEL(x)                       (((x) << 8) & GENMASK(9, 8))
+#define HSIO_RCOMP_CFG0_MODE_SEL_M                        GENMASK(9, 8)
+#define HSIO_RCOMP_CFG0_MODE_SEL_X(x)                     (((x) & GENMASK(9, 8)) >> 8)
+#define HSIO_RCOMP_CFG0_FORCE_ENA                         BIT(4)
+#define HSIO_RCOMP_CFG0_RCOMP_VAL(x)                      ((x) & GENMASK(3, 0))
+#define HSIO_RCOMP_CFG0_RCOMP_VAL_M                       GENMASK(3, 0)
+
+#define HSIO_RCOMP_STATUS_BUSY                            BIT(12)
+#define HSIO_RCOMP_STATUS_DELTA_ALERT                     BIT(7)
+#define HSIO_RCOMP_STATUS_RCOMP(x)                        ((x) & GENMASK(3, 0))
+#define HSIO_RCOMP_STATUS_RCOMP_M                         GENMASK(3, 0)
+
+#define HSIO_SYNC_ETH_CFG_RSZ                             0x4
+
+#define HSIO_SYNC_ETH_CFG_SEL_RECO_CLK_SRC(x)             (((x) << 4) & GENMASK(7, 4))
+#define HSIO_SYNC_ETH_CFG_SEL_RECO_CLK_SRC_M              GENMASK(7, 4)
+#define HSIO_SYNC_ETH_CFG_SEL_RECO_CLK_SRC_X(x)           (((x) & GENMASK(7, 4)) >> 4)
+#define HSIO_SYNC_ETH_CFG_SEL_RECO_CLK_DIV(x)             (((x) << 1) & GENMASK(3, 1))
+#define HSIO_SYNC_ETH_CFG_SEL_RECO_CLK_DIV_M              GENMASK(3, 1)
+#define HSIO_SYNC_ETH_CFG_SEL_RECO_CLK_DIV_X(x)           (((x) & GENMASK(3, 1)) >> 1)
+#define HSIO_SYNC_ETH_CFG_RECO_CLK_ENA                    BIT(0)
+
+#define HSIO_SYNC_ETH_PLL_CFG_PLL_AUTO_SQUELCH_ENA        BIT(0)
+
+#define HSIO_S1G_DES_CFG_DES_PHS_CTRL(x)                  (((x) << 13) & GENMASK(16, 13))
+#define HSIO_S1G_DES_CFG_DES_PHS_CTRL_M                   GENMASK(16, 13)
+#define HSIO_S1G_DES_CFG_DES_PHS_CTRL_X(x)                (((x) & GENMASK(16, 13)) >> 13)
+#define HSIO_S1G_DES_CFG_DES_CPMD_SEL(x)                  (((x) << 11) & GENMASK(12, 11))
+#define HSIO_S1G_DES_CFG_DES_CPMD_SEL_M                   GENMASK(12, 11)
+#define HSIO_S1G_DES_CFG_DES_CPMD_SEL_X(x)                (((x) & GENMASK(12, 11)) >> 11)
+#define HSIO_S1G_DES_CFG_DES_MBTR_CTRL(x)                 (((x) << 8) & GENMASK(10, 8))
+#define HSIO_S1G_DES_CFG_DES_MBTR_CTRL_M                  GENMASK(10, 8)
+#define HSIO_S1G_DES_CFG_DES_MBTR_CTRL_X(x)               (((x) & GENMASK(10, 8)) >> 8)
+#define HSIO_S1G_DES_CFG_DES_BW_ANA(x)                    (((x) << 5) & GENMASK(7, 5))
+#define HSIO_S1G_DES_CFG_DES_BW_ANA_M                     GENMASK(7, 5)
+#define HSIO_S1G_DES_CFG_DES_BW_ANA_X(x)                  (((x) & GENMASK(7, 5)) >> 5)
+#define HSIO_S1G_DES_CFG_DES_SWAP_ANA                     BIT(4)
+#define HSIO_S1G_DES_CFG_DES_BW_HYST(x)                   (((x) << 1) & GENMASK(3, 1))
+#define HSIO_S1G_DES_CFG_DES_BW_HYST_M                    GENMASK(3, 1)
+#define HSIO_S1G_DES_CFG_DES_BW_HYST_X(x)                 (((x) & GENMASK(3, 1)) >> 1)
+#define HSIO_S1G_DES_CFG_DES_SWAP_HYST                    BIT(0)
+
+#define HSIO_S1G_IB_CFG_IB_FX100_ENA                      BIT(27)
+#define HSIO_S1G_IB_CFG_ACJTAG_HYST(x)                    (((x) << 24) & GENMASK(26, 24))
+#define HSIO_S1G_IB_CFG_ACJTAG_HYST_M                     GENMASK(26, 24)
+#define HSIO_S1G_IB_CFG_ACJTAG_HYST_X(x)                  (((x) & GENMASK(26, 24)) >> 24)
+#define HSIO_S1G_IB_CFG_IB_DET_LEV(x)                     (((x) << 19) & GENMASK(21, 19))
+#define HSIO_S1G_IB_CFG_IB_DET_LEV_M                      GENMASK(21, 19)
+#define HSIO_S1G_IB_CFG_IB_DET_LEV_X(x)                   (((x) & GENMASK(21, 19)) >> 19)
+#define HSIO_S1G_IB_CFG_IB_HYST_LEV                       BIT(14)
+#define HSIO_S1G_IB_CFG_IB_ENA_CMV_TERM                   BIT(13)
+#define HSIO_S1G_IB_CFG_IB_ENA_DC_COUPLING                BIT(12)
+#define HSIO_S1G_IB_CFG_IB_ENA_DETLEV                     BIT(11)
+#define HSIO_S1G_IB_CFG_IB_ENA_HYST                       BIT(10)
+#define HSIO_S1G_IB_CFG_IB_ENA_OFFSET_COMP                BIT(9)
+#define HSIO_S1G_IB_CFG_IB_EQ_GAIN(x)                     (((x) << 6) & GENMASK(8, 6))
+#define HSIO_S1G_IB_CFG_IB_EQ_GAIN_M                      GENMASK(8, 6)
+#define HSIO_S1G_IB_CFG_IB_EQ_GAIN_X(x)                   (((x) & GENMASK(8, 6)) >> 6)
+#define HSIO_S1G_IB_CFG_IB_SEL_CORNER_FREQ(x)             (((x) << 4) & GENMASK(5, 4))
+#define HSIO_S1G_IB_CFG_IB_SEL_CORNER_FREQ_M              GENMASK(5, 4)
+#define HSIO_S1G_IB_CFG_IB_SEL_CORNER_FREQ_X(x)           (((x) & GENMASK(5, 4)) >> 4)
+#define HSIO_S1G_IB_CFG_IB_RESISTOR_CTRL(x)               ((x) & GENMASK(3, 0))
+#define HSIO_S1G_IB_CFG_IB_RESISTOR_CTRL_M                GENMASK(3, 0)
+
+#define HSIO_S1G_OB_CFG_OB_SLP(x)                         (((x) << 17) & GENMASK(18, 17))
+#define HSIO_S1G_OB_CFG_OB_SLP_M                          GENMASK(18, 17)
+#define HSIO_S1G_OB_CFG_OB_SLP_X(x)                       (((x) & GENMASK(18, 17)) >> 17)
+#define HSIO_S1G_OB_CFG_OB_AMP_CTRL(x)                    (((x) << 13) & GENMASK(16, 13))
+#define HSIO_S1G_OB_CFG_OB_AMP_CTRL_M                     GENMASK(16, 13)
+#define HSIO_S1G_OB_CFG_OB_AMP_CTRL_X(x)                  (((x) & GENMASK(16, 13)) >> 13)
+#define HSIO_S1G_OB_CFG_OB_CMM_BIAS_CTRL(x)               (((x) << 10) & GENMASK(12, 10))
+#define HSIO_S1G_OB_CFG_OB_CMM_BIAS_CTRL_M                GENMASK(12, 10)
+#define HSIO_S1G_OB_CFG_OB_CMM_BIAS_CTRL_X(x)             (((x) & GENMASK(12, 10)) >> 10)
+#define HSIO_S1G_OB_CFG_OB_DIS_VCM_CTRL                   BIT(9)
+#define HSIO_S1G_OB_CFG_OB_EN_MEAS_VREG                   BIT(8)
+#define HSIO_S1G_OB_CFG_OB_VCM_CTRL(x)                    (((x) << 4) & GENMASK(7, 4))
+#define HSIO_S1G_OB_CFG_OB_VCM_CTRL_M                     GENMASK(7, 4)
+#define HSIO_S1G_OB_CFG_OB_VCM_CTRL_X(x)                  (((x) & GENMASK(7, 4)) >> 4)
+#define HSIO_S1G_OB_CFG_OB_RESISTOR_CTRL(x)               ((x) & GENMASK(3, 0))
+#define HSIO_S1G_OB_CFG_OB_RESISTOR_CTRL_M                GENMASK(3, 0)
+
+#define HSIO_S1G_SER_CFG_SER_IDLE                         BIT(9)
+#define HSIO_S1G_SER_CFG_SER_DEEMPH                       BIT(8)
+#define HSIO_S1G_SER_CFG_SER_CPMD_SEL                     BIT(7)
+#define HSIO_S1G_SER_CFG_SER_SWAP_CPMD                    BIT(6)
+#define HSIO_S1G_SER_CFG_SER_ALISEL(x)                    (((x) << 4) & GENMASK(5, 4))
+#define HSIO_S1G_SER_CFG_SER_ALISEL_M                     GENMASK(5, 4)
+#define HSIO_S1G_SER_CFG_SER_ALISEL_X(x)                  (((x) & GENMASK(5, 4)) >> 4)
+#define HSIO_S1G_SER_CFG_SER_ENHYS                        BIT(3)
+#define HSIO_S1G_SER_CFG_SER_BIG_WIN                      BIT(2)
+#define HSIO_S1G_SER_CFG_SER_EN_WIN                       BIT(1)
+#define HSIO_S1G_SER_CFG_SER_ENALI                        BIT(0)
+
+#define HSIO_S1G_COMMON_CFG_SYS_RST                       BIT(31)
+#define HSIO_S1G_COMMON_CFG_SE_AUTO_SQUELCH_ENA           BIT(21)
+#define HSIO_S1G_COMMON_CFG_ENA_LANE                      BIT(18)
+#define HSIO_S1G_COMMON_CFG_PWD_RX                        BIT(17)
+#define HSIO_S1G_COMMON_CFG_PWD_TX                        BIT(16)
+#define HSIO_S1G_COMMON_CFG_LANE_CTRL(x)                  (((x) << 13) & GENMASK(15, 13))
+#define HSIO_S1G_COMMON_CFG_LANE_CTRL_M                   GENMASK(15, 13)
+#define HSIO_S1G_COMMON_CFG_LANE_CTRL_X(x)                (((x) & GENMASK(15, 13)) >> 13)
+#define HSIO_S1G_COMMON_CFG_ENA_DIRECT                    BIT(12)
+#define HSIO_S1G_COMMON_CFG_ENA_ELOOP                     BIT(11)
+#define HSIO_S1G_COMMON_CFG_ENA_FLOOP                     BIT(10)
+#define HSIO_S1G_COMMON_CFG_ENA_ILOOP                     BIT(9)
+#define HSIO_S1G_COMMON_CFG_ENA_PLOOP                     BIT(8)
+#define HSIO_S1G_COMMON_CFG_HRATE                         BIT(7)
+#define HSIO_S1G_COMMON_CFG_IF_MODE                       BIT(0)
+
+#define HSIO_S1G_PLL_CFG_PLL_ENA_FB_DIV2                  BIT(22)
+#define HSIO_S1G_PLL_CFG_PLL_ENA_RC_DIV2                  BIT(21)
+#define HSIO_S1G_PLL_CFG_PLL_FSM_CTRL_DATA(x)             (((x) << 8) & GENMASK(15, 8))
+#define HSIO_S1G_PLL_CFG_PLL_FSM_CTRL_DATA_M              GENMASK(15, 8)
+#define HSIO_S1G_PLL_CFG_PLL_FSM_CTRL_DATA_X(x)           (((x) & GENMASK(15, 8)) >> 8)
+#define HSIO_S1G_PLL_CFG_PLL_FSM_ENA                      BIT(7)
+#define HSIO_S1G_PLL_CFG_PLL_FSM_FORCE_SET_ENA            BIT(6)
+#define HSIO_S1G_PLL_CFG_PLL_FSM_OOR_RECAL_ENA            BIT(5)
+#define HSIO_S1G_PLL_CFG_PLL_RB_DATA_SEL                  BIT(3)
+
+#define HSIO_S1G_PLL_STATUS_PLL_CAL_NOT_DONE              BIT(12)
+#define HSIO_S1G_PLL_STATUS_PLL_CAL_ERR                   BIT(11)
+#define HSIO_S1G_PLL_STATUS_PLL_OUT_OF_RANGE_ERR          BIT(10)
+#define HSIO_S1G_PLL_STATUS_PLL_RB_DATA(x)                ((x) & GENMASK(7, 0))
+#define HSIO_S1G_PLL_STATUS_PLL_RB_DATA_M                 GENMASK(7, 0)
+
+#define HSIO_S1G_DFT_CFG0_LAZYBIT                         BIT(31)
+#define HSIO_S1G_DFT_CFG0_INV_DIS                         BIT(23)
+#define HSIO_S1G_DFT_CFG0_PRBS_SEL(x)                     (((x) << 20) & GENMASK(21, 20))
+#define HSIO_S1G_DFT_CFG0_PRBS_SEL_M                      GENMASK(21, 20)
+#define HSIO_S1G_DFT_CFG0_PRBS_SEL_X(x)                   (((x) & GENMASK(21, 20)) >> 20)
+#define HSIO_S1G_DFT_CFG0_TEST_MODE(x)                    (((x) << 16) & GENMASK(18, 16))
+#define HSIO_S1G_DFT_CFG0_TEST_MODE_M                     GENMASK(18, 16)
+#define HSIO_S1G_DFT_CFG0_TEST_MODE_X(x)                  (((x) & GENMASK(18, 16)) >> 16)
+#define HSIO_S1G_DFT_CFG0_RX_PHS_CORR_DIS                 BIT(4)
+#define HSIO_S1G_DFT_CFG0_RX_PDSENS_ENA                   BIT(3)
+#define HSIO_S1G_DFT_CFG0_RX_DFT_ENA                      BIT(2)
+#define HSIO_S1G_DFT_CFG0_TX_DFT_ENA                      BIT(0)
+
+#define HSIO_S1G_DFT_CFG1_TX_JITTER_AMPL(x)               (((x) << 8) & GENMASK(17, 8))
+#define HSIO_S1G_DFT_CFG1_TX_JITTER_AMPL_M                GENMASK(17, 8)
+#define HSIO_S1G_DFT_CFG1_TX_JITTER_AMPL_X(x)             (((x) & GENMASK(17, 8)) >> 8)
+#define HSIO_S1G_DFT_CFG1_TX_STEP_FREQ(x)                 (((x) << 4) & GENMASK(7, 4))
+#define HSIO_S1G_DFT_CFG1_TX_STEP_FREQ_M                  GENMASK(7, 4)
+#define HSIO_S1G_DFT_CFG1_TX_STEP_FREQ_X(x)               (((x) & GENMASK(7, 4)) >> 4)
+#define HSIO_S1G_DFT_CFG1_TX_JI_ENA                       BIT(3)
+#define HSIO_S1G_DFT_CFG1_TX_WAVEFORM_SEL                 BIT(2)
+#define HSIO_S1G_DFT_CFG1_TX_FREQOFF_DIR                  BIT(1)
+#define HSIO_S1G_DFT_CFG1_TX_FREQOFF_ENA                  BIT(0)
+
+#define HSIO_S1G_DFT_CFG2_RX_JITTER_AMPL(x)               (((x) << 8) & GENMASK(17, 8))
+#define HSIO_S1G_DFT_CFG2_RX_JITTER_AMPL_M                GENMASK(17, 8)
+#define HSIO_S1G_DFT_CFG2_RX_JITTER_AMPL_X(x)             (((x) & GENMASK(17, 8)) >> 8)
+#define HSIO_S1G_DFT_CFG2_RX_STEP_FREQ(x)                 (((x) << 4) & GENMASK(7, 4))
+#define HSIO_S1G_DFT_CFG2_RX_STEP_FREQ_M                  GENMASK(7, 4)
+#define HSIO_S1G_DFT_CFG2_RX_STEP_FREQ_X(x)               (((x) & GENMASK(7, 4)) >> 4)
+#define HSIO_S1G_DFT_CFG2_RX_JI_ENA                       BIT(3)
+#define HSIO_S1G_DFT_CFG2_RX_WAVEFORM_SEL                 BIT(2)
+#define HSIO_S1G_DFT_CFG2_RX_FREQOFF_DIR                  BIT(1)
+#define HSIO_S1G_DFT_CFG2_RX_FREQOFF_ENA                  BIT(0)
+
+#define HSIO_S1G_RC_PLL_BIST_CFG_PLL_BIST_ENA             BIT(20)
+#define HSIO_S1G_RC_PLL_BIST_CFG_PLL_BIST_FBS_HIGH(x)     (((x) << 16) & GENMASK(17, 16))
+#define HSIO_S1G_RC_PLL_BIST_CFG_PLL_BIST_FBS_HIGH_M      GENMASK(17, 16)
+#define HSIO_S1G_RC_PLL_BIST_CFG_PLL_BIST_FBS_HIGH_X(x)   (((x) & GENMASK(17, 16)) >> 16)
+#define HSIO_S1G_RC_PLL_BIST_CFG_PLL_BIST_HIGH(x)         (((x) << 8) & GENMASK(15, 8))
+#define HSIO_S1G_RC_PLL_BIST_CFG_PLL_BIST_HIGH_M          GENMASK(15, 8)
+#define HSIO_S1G_RC_PLL_BIST_CFG_PLL_BIST_HIGH_X(x)       (((x) & GENMASK(15, 8)) >> 8)
+#define HSIO_S1G_RC_PLL_BIST_CFG_PLL_BIST_LOW(x)          ((x) & GENMASK(7, 0))
+#define HSIO_S1G_RC_PLL_BIST_CFG_PLL_BIST_LOW_M           GENMASK(7, 0)
+
+#define HSIO_S1G_MISC_CFG_DES_100FX_KICK_MODE(x)          (((x) << 11) & GENMASK(12, 11))
+#define HSIO_S1G_MISC_CFG_DES_100FX_KICK_MODE_M           GENMASK(12, 11)
+#define HSIO_S1G_MISC_CFG_DES_100FX_KICK_MODE_X(x)        (((x) & GENMASK(12, 11)) >> 11)
+#define HSIO_S1G_MISC_CFG_DES_100FX_CPMD_SWAP             BIT(10)
+#define HSIO_S1G_MISC_CFG_DES_100FX_CPMD_MODE             BIT(9)
+#define HSIO_S1G_MISC_CFG_DES_100FX_CPMD_ENA              BIT(8)
+#define HSIO_S1G_MISC_CFG_RX_LPI_MODE_ENA                 BIT(5)
+#define HSIO_S1G_MISC_CFG_TX_LPI_MODE_ENA                 BIT(4)
+#define HSIO_S1G_MISC_CFG_RX_DATA_INV_ENA                 BIT(3)
+#define HSIO_S1G_MISC_CFG_TX_DATA_INV_ENA                 BIT(2)
+#define HSIO_S1G_MISC_CFG_LANE_RST                        BIT(0)
+
+#define HSIO_S1G_DFT_STATUS_PLL_BIST_NOT_DONE             BIT(7)
+#define HSIO_S1G_DFT_STATUS_PLL_BIST_FAILED               BIT(6)
+#define HSIO_S1G_DFT_STATUS_PLL_BIST_TIMEOUT_ERR          BIT(5)
+#define HSIO_S1G_DFT_STATUS_BIST_ACTIVE                   BIT(3)
+#define HSIO_S1G_DFT_STATUS_BIST_NOSYNC                   BIT(2)
+#define HSIO_S1G_DFT_STATUS_BIST_COMPLETE_N               BIT(1)
+#define HSIO_S1G_DFT_STATUS_BIST_ERROR                    BIT(0)
+
+#define HSIO_S1G_MISC_STATUS_DES_100FX_PHASE_SEL          BIT(0)
+
+#define HSIO_MCB_S1G_ADDR_CFG_SERDES1G_WR_ONE_SHOT        BIT(31)
+#define HSIO_MCB_S1G_ADDR_CFG_SERDES1G_RD_ONE_SHOT        BIT(30)
+#define HSIO_MCB_S1G_ADDR_CFG_SERDES1G_ADDR(x)            ((x) & GENMASK(8, 0))
+#define HSIO_MCB_S1G_ADDR_CFG_SERDES1G_ADDR_M             GENMASK(8, 0)
+
+#define HSIO_S6G_DIG_CFG_GP(x)                            (((x) << 16) & GENMASK(18, 16))
+#define HSIO_S6G_DIG_CFG_GP_M                             GENMASK(18, 16)
+#define HSIO_S6G_DIG_CFG_GP_X(x)                          (((x) & GENMASK(18, 16)) >> 16)
+#define HSIO_S6G_DIG_CFG_TX_BIT_DOUBLING_MODE_ENA         BIT(7)
+#define HSIO_S6G_DIG_CFG_SIGDET_TESTMODE                  BIT(6)
+#define HSIO_S6G_DIG_CFG_SIGDET_AST(x)                    (((x) << 3) & GENMASK(5, 3))
+#define HSIO_S6G_DIG_CFG_SIGDET_AST_M                     GENMASK(5, 3)
+#define HSIO_S6G_DIG_CFG_SIGDET_AST_X(x)                  (((x) & GENMASK(5, 3)) >> 3)
+#define HSIO_S6G_DIG_CFG_SIGDET_DST(x)                    ((x) & GENMASK(2, 0))
+#define HSIO_S6G_DIG_CFG_SIGDET_DST_M                     GENMASK(2, 0)
+
+#define HSIO_S6G_DFT_CFG0_LAZYBIT                         BIT(31)
+#define HSIO_S6G_DFT_CFG0_INV_DIS                         BIT(23)
+#define HSIO_S6G_DFT_CFG0_PRBS_SEL(x)                     (((x) << 20) & GENMASK(21, 20))
+#define HSIO_S6G_DFT_CFG0_PRBS_SEL_M                      GENMASK(21, 20)
+#define HSIO_S6G_DFT_CFG0_PRBS_SEL_X(x)                   (((x) & GENMASK(21, 20)) >> 20)
+#define HSIO_S6G_DFT_CFG0_TEST_MODE(x)                    (((x) << 16) & GENMASK(18, 16))
+#define HSIO_S6G_DFT_CFG0_TEST_MODE_M                     GENMASK(18, 16)
+#define HSIO_S6G_DFT_CFG0_TEST_MODE_X(x)                  (((x) & GENMASK(18, 16)) >> 16)
+#define HSIO_S6G_DFT_CFG0_RX_PHS_CORR_DIS                 BIT(4)
+#define HSIO_S6G_DFT_CFG0_RX_PDSENS_ENA                   BIT(3)
+#define HSIO_S6G_DFT_CFG0_RX_DFT_ENA                      BIT(2)
+#define HSIO_S6G_DFT_CFG0_TX_DFT_ENA                      BIT(0)
+
+#define HSIO_S6G_DFT_CFG1_TX_JITTER_AMPL(x)               (((x) << 8) & GENMASK(17, 8))
+#define HSIO_S6G_DFT_CFG1_TX_JITTER_AMPL_M                GENMASK(17, 8)
+#define HSIO_S6G_DFT_CFG1_TX_JITTER_AMPL_X(x)             (((x) & GENMASK(17, 8)) >> 8)
+#define HSIO_S6G_DFT_CFG1_TX_STEP_FREQ(x)                 (((x) << 4) & GENMASK(7, 4))
+#define HSIO_S6G_DFT_CFG1_TX_STEP_FREQ_M                  GENMASK(7, 4)
+#define HSIO_S6G_DFT_CFG1_TX_STEP_FREQ_X(x)               (((x) & GENMASK(7, 4)) >> 4)
+#define HSIO_S6G_DFT_CFG1_TX_JI_ENA                       BIT(3)
+#define HSIO_S6G_DFT_CFG1_TX_WAVEFORM_SEL                 BIT(2)
+#define HSIO_S6G_DFT_CFG1_TX_FREQOFF_DIR                  BIT(1)
+#define HSIO_S6G_DFT_CFG1_TX_FREQOFF_ENA                  BIT(0)
+
+#define HSIO_S6G_DFT_CFG2_RX_JITTER_AMPL(x)               (((x) << 8) & GENMASK(17, 8))
+#define HSIO_S6G_DFT_CFG2_RX_JITTER_AMPL_M                GENMASK(17, 8)
+#define HSIO_S6G_DFT_CFG2_RX_JITTER_AMPL_X(x)             (((x) & GENMASK(17, 8)) >> 8)
+#define HSIO_S6G_DFT_CFG2_RX_STEP_FREQ(x)                 (((x) << 4) & GENMASK(7, 4))
+#define HSIO_S6G_DFT_CFG2_RX_STEP_FREQ_M                  GENMASK(7, 4)
+#define HSIO_S6G_DFT_CFG2_RX_STEP_FREQ_X(x)               (((x) & GENMASK(7, 4)) >> 4)
+#define HSIO_S6G_DFT_CFG2_RX_JI_ENA                       BIT(3)
+#define HSIO_S6G_DFT_CFG2_RX_WAVEFORM_SEL                 BIT(2)
+#define HSIO_S6G_DFT_CFG2_RX_FREQOFF_DIR                  BIT(1)
+#define HSIO_S6G_DFT_CFG2_RX_FREQOFF_ENA                  BIT(0)
+
+#define HSIO_S6G_RC_PLL_BIST_CFG_PLL_BIST_ENA             BIT(20)
+#define HSIO_S6G_RC_PLL_BIST_CFG_PLL_BIST_FBS_HIGH(x)     (((x) << 16) & GENMASK(19, 16))
+#define HSIO_S6G_RC_PLL_BIST_CFG_PLL_BIST_FBS_HIGH_M      GENMASK(19, 16)
+#define HSIO_S6G_RC_PLL_BIST_CFG_PLL_BIST_FBS_HIGH_X(x)   (((x) & GENMASK(19, 16)) >> 16)
+#define HSIO_S6G_RC_PLL_BIST_CFG_PLL_BIST_HIGH(x)         (((x) << 8) & GENMASK(15, 8))
+#define HSIO_S6G_RC_PLL_BIST_CFG_PLL_BIST_HIGH_M          GENMASK(15, 8)
+#define HSIO_S6G_RC_PLL_BIST_CFG_PLL_BIST_HIGH_X(x)       (((x) & GENMASK(15, 8)) >> 8)
+#define HSIO_S6G_RC_PLL_BIST_CFG_PLL_BIST_LOW(x)          ((x) & GENMASK(7, 0))
+#define HSIO_S6G_RC_PLL_BIST_CFG_PLL_BIST_LOW_M           GENMASK(7, 0)
+
+#define HSIO_S6G_MISC_CFG_SEL_RECO_CLK(x)                 (((x) << 13) & GENMASK(14, 13))
+#define HSIO_S6G_MISC_CFG_SEL_RECO_CLK_M                  GENMASK(14, 13)
+#define HSIO_S6G_MISC_CFG_SEL_RECO_CLK_X(x)               (((x) & GENMASK(14, 13)) >> 13)
+#define HSIO_S6G_MISC_CFG_DES_100FX_KICK_MODE(x)          (((x) << 11) & GENMASK(12, 11))
+#define HSIO_S6G_MISC_CFG_DES_100FX_KICK_MODE_M           GENMASK(12, 11)
+#define HSIO_S6G_MISC_CFG_DES_100FX_KICK_MODE_X(x)        (((x) & GENMASK(12, 11)) >> 11)
+#define HSIO_S6G_MISC_CFG_DES_100FX_CPMD_SWAP             BIT(10)
+#define HSIO_S6G_MISC_CFG_DES_100FX_CPMD_MODE             BIT(9)
+#define HSIO_S6G_MISC_CFG_DES_100FX_CPMD_ENA              BIT(8)
+#define HSIO_S6G_MISC_CFG_RX_BUS_FLIP_ENA                 BIT(7)
+#define HSIO_S6G_MISC_CFG_TX_BUS_FLIP_ENA                 BIT(6)
+#define HSIO_S6G_MISC_CFG_RX_LPI_MODE_ENA                 BIT(5)
+#define HSIO_S6G_MISC_CFG_TX_LPI_MODE_ENA                 BIT(4)
+#define HSIO_S6G_MISC_CFG_RX_DATA_INV_ENA                 BIT(3)
+#define HSIO_S6G_MISC_CFG_TX_DATA_INV_ENA                 BIT(2)
+#define HSIO_S6G_MISC_CFG_LANE_RST                        BIT(0)
+
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_POST0(x)               (((x) << 23) & GENMASK(28, 23))
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_POST0_M                GENMASK(28, 23)
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_POST0_X(x)             (((x) & GENMASK(28, 23)) >> 23)
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_POST1(x)               (((x) << 18) & GENMASK(22, 18))
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_POST1_M                GENMASK(22, 18)
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_POST1_X(x)             (((x) & GENMASK(22, 18)) >> 18)
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_PREC(x)                (((x) << 13) & GENMASK(17, 13))
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_PREC_M                 GENMASK(17, 13)
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_PREC_X(x)              (((x) & GENMASK(17, 13)) >> 13)
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_ENA_CAS(x)             (((x) << 6) & GENMASK(8, 6))
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_ENA_CAS_M              GENMASK(8, 6)
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_ENA_CAS_X(x)           (((x) & GENMASK(8, 6)) >> 6)
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_LEV(x)                 ((x) & GENMASK(5, 0))
+#define HSIO_S6G_OB_ANEG_CFG_AN_OB_LEV_M                  GENMASK(5, 0)
+
+#define HSIO_S6G_DFT_STATUS_PRBS_SYNC_STAT                BIT(8)
+#define HSIO_S6G_DFT_STATUS_PLL_BIST_NOT_DONE             BIT(7)
+#define HSIO_S6G_DFT_STATUS_PLL_BIST_FAILED               BIT(6)
+#define HSIO_S6G_DFT_STATUS_PLL_BIST_TIMEOUT_ERR          BIT(5)
+#define HSIO_S6G_DFT_STATUS_BIST_ACTIVE                   BIT(3)
+#define HSIO_S6G_DFT_STATUS_BIST_NOSYNC                   BIT(2)
+#define HSIO_S6G_DFT_STATUS_BIST_COMPLETE_N               BIT(1)
+#define HSIO_S6G_DFT_STATUS_BIST_ERROR                    BIT(0)
+
+#define HSIO_S6G_MISC_STATUS_DES_100FX_PHASE_SEL          BIT(0)
+
+#define HSIO_S6G_DES_CFG_DES_PHS_CTRL(x)                  (((x) << 13) & GENMASK(16, 13))
+#define HSIO_S6G_DES_CFG_DES_PHS_CTRL_M                   GENMASK(16, 13)
+#define HSIO_S6G_DES_CFG_DES_PHS_CTRL_X(x)                (((x) & GENMASK(16, 13)) >> 13)
+#define HSIO_S6G_DES_CFG_DES_MBTR_CTRL(x)                 (((x) << 10) & GENMASK(12, 10))
+#define HSIO_S6G_DES_CFG_DES_MBTR_CTRL_M                  GENMASK(12, 10)
+#define HSIO_S6G_DES_CFG_DES_MBTR_CTRL_X(x)               (((x) & GENMASK(12, 10)) >> 10)
+#define HSIO_S6G_DES_CFG_DES_CPMD_SEL(x)                  (((x) << 8) & GENMASK(9, 8))
+#define HSIO_S6G_DES_CFG_DES_CPMD_SEL_M                   GENMASK(9, 8)
+#define HSIO_S6G_DES_CFG_DES_CPMD_SEL_X(x)                (((x) & GENMASK(9, 8)) >> 8)
+#define HSIO_S6G_DES_CFG_DES_BW_HYST(x)                   (((x) << 5) & GENMASK(7, 5))
+#define HSIO_S6G_DES_CFG_DES_BW_HYST_M                    GENMASK(7, 5)
+#define HSIO_S6G_DES_CFG_DES_BW_HYST_X(x)                 (((x) & GENMASK(7, 5)) >> 5)
+#define HSIO_S6G_DES_CFG_DES_SWAP_HYST                    BIT(4)
+#define HSIO_S6G_DES_CFG_DES_BW_ANA(x)                    (((x) << 1) & GENMASK(3, 1))
+#define HSIO_S6G_DES_CFG_DES_BW_ANA_M                     GENMASK(3, 1)
+#define HSIO_S6G_DES_CFG_DES_BW_ANA_X(x)                  (((x) & GENMASK(3, 1)) >> 1)
+#define HSIO_S6G_DES_CFG_DES_SWAP_ANA                     BIT(0)
+
+#define HSIO_S6G_IB_CFG_IB_SOFSI(x)                       (((x) << 29) & GENMASK(30, 29))
+#define HSIO_S6G_IB_CFG_IB_SOFSI_M                        GENMASK(30, 29)
+#define HSIO_S6G_IB_CFG_IB_SOFSI_X(x)                     (((x) & GENMASK(30, 29)) >> 29)
+#define HSIO_S6G_IB_CFG_IB_VBULK_SEL                      BIT(28)
+#define HSIO_S6G_IB_CFG_IB_RTRM_ADJ(x)                    (((x) << 24) & GENMASK(27, 24))
+#define HSIO_S6G_IB_CFG_IB_RTRM_ADJ_M                     GENMASK(27, 24)
+#define HSIO_S6G_IB_CFG_IB_RTRM_ADJ_X(x)                  (((x) & GENMASK(27, 24)) >> 24)
+#define HSIO_S6G_IB_CFG_IB_ICML_ADJ(x)                    (((x) << 20) & GENMASK(23, 20))
+#define HSIO_S6G_IB_CFG_IB_ICML_ADJ_M                     GENMASK(23, 20)
+#define HSIO_S6G_IB_CFG_IB_ICML_ADJ_X(x)                  (((x) & GENMASK(23, 20)) >> 20)
+#define HSIO_S6G_IB_CFG_IB_TERM_MODE_SEL(x)               (((x) << 18) & GENMASK(19, 18))
+#define HSIO_S6G_IB_CFG_IB_TERM_MODE_SEL_M                GENMASK(19, 18)
+#define HSIO_S6G_IB_CFG_IB_TERM_MODE_SEL_X(x)             (((x) & GENMASK(19, 18)) >> 18)
+#define HSIO_S6G_IB_CFG_IB_SIG_DET_CLK_SEL(x)             (((x) << 15) & GENMASK(17, 15))
+#define HSIO_S6G_IB_CFG_IB_SIG_DET_CLK_SEL_M              GENMASK(17, 15)
+#define HSIO_S6G_IB_CFG_IB_SIG_DET_CLK_SEL_X(x)           (((x) & GENMASK(17, 15)) >> 15)
+#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_HP(x)              (((x) << 13) & GENMASK(14, 13))
+#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_HP_M               GENMASK(14, 13)
+#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_HP_X(x)            (((x) & GENMASK(14, 13)) >> 13)
+#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_MID(x)             (((x) << 11) & GENMASK(12, 11))
+#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_MID_M              GENMASK(12, 11)
+#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_MID_X(x)           (((x) & GENMASK(12, 11)) >> 11)
+#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_LP(x)              (((x) << 9) & GENMASK(10, 9))
+#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_LP_M               GENMASK(10, 9)
+#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_LP_X(x)            (((x) & GENMASK(10, 9)) >> 9)
+#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_OFFSET(x)          (((x) << 7) & GENMASK(8, 7))
+#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_OFFSET_M           GENMASK(8, 7)
+#define HSIO_S6G_IB_CFG_IB_REG_PAT_SEL_OFFSET_X(x)        (((x) & GENMASK(8, 7)) >> 7)
+#define HSIO_S6G_IB_CFG_IB_ANA_TEST_ENA                   BIT(6)
+#define HSIO_S6G_IB_CFG_IB_SIG_DET_ENA                    BIT(5)
+#define HSIO_S6G_IB_CFG_IB_CONCUR                         BIT(4)
+#define HSIO_S6G_IB_CFG_IB_CAL_ENA                        BIT(3)
+#define HSIO_S6G_IB_CFG_IB_SAM_ENA                        BIT(2)
+#define HSIO_S6G_IB_CFG_IB_EQZ_ENA                        BIT(1)
+#define HSIO_S6G_IB_CFG_IB_REG_ENA                        BIT(0)
+
+#define HSIO_S6G_IB_CFG1_IB_TJTAG(x)                      (((x) << 17) & GENMASK(21, 17))
+#define HSIO_S6G_IB_CFG1_IB_TJTAG_M                       GENMASK(21, 17)
+#define HSIO_S6G_IB_CFG1_IB_TJTAG_X(x)                    (((x) & GENMASK(21, 17)) >> 17)
+#define HSIO_S6G_IB_CFG1_IB_TSDET(x)                      (((x) << 12) & GENMASK(16, 12))
+#define HSIO_S6G_IB_CFG1_IB_TSDET_M                       GENMASK(16, 12)
+#define HSIO_S6G_IB_CFG1_IB_TSDET_X(x)                    (((x) & GENMASK(16, 12)) >> 12)
+#define HSIO_S6G_IB_CFG1_IB_SCALY(x)                      (((x) << 8) & GENMASK(11, 8))
+#define HSIO_S6G_IB_CFG1_IB_SCALY_M                       GENMASK(11, 8)
+#define HSIO_S6G_IB_CFG1_IB_SCALY_X(x)                    (((x) & GENMASK(11, 8)) >> 8)
+#define HSIO_S6G_IB_CFG1_IB_FILT_HP                       BIT(7)
+#define HSIO_S6G_IB_CFG1_IB_FILT_MID                      BIT(6)
+#define HSIO_S6G_IB_CFG1_IB_FILT_LP                       BIT(5)
+#define HSIO_S6G_IB_CFG1_IB_FILT_OFFSET                   BIT(4)
+#define HSIO_S6G_IB_CFG1_IB_FRC_HP                        BIT(3)
+#define HSIO_S6G_IB_CFG1_IB_FRC_MID                       BIT(2)
+#define HSIO_S6G_IB_CFG1_IB_FRC_LP                        BIT(1)
+#define HSIO_S6G_IB_CFG1_IB_FRC_OFFSET                    BIT(0)
+
+#define HSIO_S6G_IB_CFG2_IB_TINFV(x)                      (((x) << 27) & GENMASK(29, 27))
+#define HSIO_S6G_IB_CFG2_IB_TINFV_M                       GENMASK(29, 27)
+#define HSIO_S6G_IB_CFG2_IB_TINFV_X(x)                    (((x) & GENMASK(29, 27)) >> 27)
+#define HSIO_S6G_IB_CFG2_IB_OINFI(x)                      (((x) << 22) & GENMASK(26, 22))
+#define HSIO_S6G_IB_CFG2_IB_OINFI_M                       GENMASK(26, 22)
+#define HSIO_S6G_IB_CFG2_IB_OINFI_X(x)                    (((x) & GENMASK(26, 22)) >> 22)
+#define HSIO_S6G_IB_CFG2_IB_TAUX(x)                       (((x) << 19) & GENMASK(21, 19))
+#define HSIO_S6G_IB_CFG2_IB_TAUX_M                        GENMASK(21, 19)
+#define HSIO_S6G_IB_CFG2_IB_TAUX_X(x)                     (((x) & GENMASK(21, 19)) >> 19)
+#define HSIO_S6G_IB_CFG2_IB_OINFS(x)                      (((x) << 16) & GENMASK(18, 16))
+#define HSIO_S6G_IB_CFG2_IB_OINFS_M                       GENMASK(18, 16)
+#define HSIO_S6G_IB_CFG2_IB_OINFS_X(x)                    (((x) & GENMASK(18, 16)) >> 16)
+#define HSIO_S6G_IB_CFG2_IB_OCALS(x)                      (((x) << 10) & GENMASK(15, 10))
+#define HSIO_S6G_IB_CFG2_IB_OCALS_M                       GENMASK(15, 10)
+#define HSIO_S6G_IB_CFG2_IB_OCALS_X(x)                    (((x) & GENMASK(15, 10)) >> 10)
+#define HSIO_S6G_IB_CFG2_IB_TCALV(x)                      (((x) << 5) & GENMASK(9, 5))
+#define HSIO_S6G_IB_CFG2_IB_TCALV_M                       GENMASK(9, 5)
+#define HSIO_S6G_IB_CFG2_IB_TCALV_X(x)                    (((x) & GENMASK(9, 5)) >> 5)
+#define HSIO_S6G_IB_CFG2_IB_UMAX(x)                       (((x) << 3) & GENMASK(4, 3))
+#define HSIO_S6G_IB_CFG2_IB_UMAX_M                        GENMASK(4, 3)
+#define HSIO_S6G_IB_CFG2_IB_UMAX_X(x)                     (((x) & GENMASK(4, 3)) >> 3)
+#define HSIO_S6G_IB_CFG2_IB_UREG(x)                       ((x) & GENMASK(2, 0))
+#define HSIO_S6G_IB_CFG2_IB_UREG_M                        GENMASK(2, 0)
+
+#define HSIO_S6G_IB_CFG3_IB_INI_HP(x)                     (((x) << 18) & GENMASK(23, 18))
+#define HSIO_S6G_IB_CFG3_IB_INI_HP_M                      GENMASK(23, 18)
+#define HSIO_S6G_IB_CFG3_IB_INI_HP_X(x)                   (((x) & GENMASK(23, 18)) >> 18)
+#define HSIO_S6G_IB_CFG3_IB_INI_MID(x)                    (((x) << 12) & GENMASK(17, 12))
+#define HSIO_S6G_IB_CFG3_IB_INI_MID_M                     GENMASK(17, 12)
+#define HSIO_S6G_IB_CFG3_IB_INI_MID_X(x)                  (((x) & GENMASK(17, 12)) >> 12)
+#define HSIO_S6G_IB_CFG3_IB_INI_LP(x)                     (((x) << 6) & GENMASK(11, 6))
+#define HSIO_S6G_IB_CFG3_IB_INI_LP_M                      GENMASK(11, 6)
+#define HSIO_S6G_IB_CFG3_IB_INI_LP_X(x)                   (((x) & GENMASK(11, 6)) >> 6)
+#define HSIO_S6G_IB_CFG3_IB_INI_OFFSET(x)                 ((x) & GENMASK(5, 0))
+#define HSIO_S6G_IB_CFG3_IB_INI_OFFSET_M                  GENMASK(5, 0)
+
+#define HSIO_S6G_IB_CFG4_IB_MAX_HP(x)                     (((x) << 18) & GENMASK(23, 18))
+#define HSIO_S6G_IB_CFG4_IB_MAX_HP_M                      GENMASK(23, 18)
+#define HSIO_S6G_IB_CFG4_IB_MAX_HP_X(x)                   (((x) & GENMASK(23, 18)) >> 18)
+#define HSIO_S6G_IB_CFG4_IB_MAX_MID(x)                    (((x) << 12) & GENMASK(17, 12))
+#define HSIO_S6G_IB_CFG4_IB_MAX_MID_M                     GENMASK(17, 12)
+#define HSIO_S6G_IB_CFG4_IB_MAX_MID_X(x)                  (((x) & GENMASK(17, 12)) >> 12)
+#define HSIO_S6G_IB_CFG4_IB_MAX_LP(x)                     (((x) << 6) & GENMASK(11, 6))
+#define HSIO_S6G_IB_CFG4_IB_MAX_LP_M                      GENMASK(11, 6)
+#define HSIO_S6G_IB_CFG4_IB_MAX_LP_X(x)                   (((x) & GENMASK(11, 6)) >> 6)
+#define HSIO_S6G_IB_CFG4_IB_MAX_OFFSET(x)                 ((x) & GENMASK(5, 0))
+#define HSIO_S6G_IB_CFG4_IB_MAX_OFFSET_M                  GENMASK(5, 0)
+
+#define HSIO_S6G_IB_CFG5_IB_MIN_HP(x)                     (((x) << 18) & GENMASK(23, 18))
+#define HSIO_S6G_IB_CFG5_IB_MIN_HP_M                      GENMASK(23, 18)
+#define HSIO_S6G_IB_CFG5_IB_MIN_HP_X(x)                   (((x) & GENMASK(23, 18)) >> 18)
+#define HSIO_S6G_IB_CFG5_IB_MIN_MID(x)                    (((x) << 12) & GENMASK(17, 12))
+#define HSIO_S6G_IB_CFG5_IB_MIN_MID_M                     GENMASK(17, 12)
+#define HSIO_S6G_IB_CFG5_IB_MIN_MID_X(x)                  (((x) & GENMASK(17, 12)) >> 12)
+#define HSIO_S6G_IB_CFG5_IB_MIN_LP(x)                     (((x) << 6) & GENMASK(11, 6))
+#define HSIO_S6G_IB_CFG5_IB_MIN_LP_M                      GENMASK(11, 6)
+#define HSIO_S6G_IB_CFG5_IB_MIN_LP_X(x)                   (((x) & GENMASK(11, 6)) >> 6)
+#define HSIO_S6G_IB_CFG5_IB_MIN_OFFSET(x)                 ((x) & GENMASK(5, 0))
+#define HSIO_S6G_IB_CFG5_IB_MIN_OFFSET_M                  GENMASK(5, 0)
+
+#define HSIO_S6G_OB_CFG_OB_IDLE                           BIT(31)
+#define HSIO_S6G_OB_CFG_OB_ENA1V_MODE                     BIT(30)
+#define HSIO_S6G_OB_CFG_OB_POL                            BIT(29)
+#define HSIO_S6G_OB_CFG_OB_POST0(x)                       (((x) << 23) & GENMASK(28, 23))
+#define HSIO_S6G_OB_CFG_OB_POST0_M                        GENMASK(28, 23)
+#define HSIO_S6G_OB_CFG_OB_POST0_X(x)                     (((x) & GENMASK(28, 23)) >> 23)
+#define HSIO_S6G_OB_CFG_OB_PREC(x)                        (((x) << 18) & GENMASK(22, 18))
+#define HSIO_S6G_OB_CFG_OB_PREC_M                         GENMASK(22, 18)
+#define HSIO_S6G_OB_CFG_OB_PREC_X(x)                      (((x) & GENMASK(22, 18)) >> 18)
+#define HSIO_S6G_OB_CFG_OB_R_ADJ_MUX                      BIT(17)
+#define HSIO_S6G_OB_CFG_OB_R_ADJ_PDR                      BIT(16)
+#define HSIO_S6G_OB_CFG_OB_POST1(x)                       (((x) << 11) & GENMASK(15, 11))
+#define HSIO_S6G_OB_CFG_OB_POST1_M                        GENMASK(15, 11)
+#define HSIO_S6G_OB_CFG_OB_POST1_X(x)                     (((x) & GENMASK(15, 11)) >> 11)
+#define HSIO_S6G_OB_CFG_OB_R_COR                          BIT(10)
+#define HSIO_S6G_OB_CFG_OB_SEL_RCTRL                      BIT(9)
+#define HSIO_S6G_OB_CFG_OB_SR_H                           BIT(8)
+#define HSIO_S6G_OB_CFG_OB_SR(x)                          (((x) << 4) & GENMASK(7, 4))
+#define HSIO_S6G_OB_CFG_OB_SR_M                           GENMASK(7, 4)
+#define HSIO_S6G_OB_CFG_OB_SR_X(x)                        (((x) & GENMASK(7, 4)) >> 4)
+#define HSIO_S6G_OB_CFG_OB_RESISTOR_CTRL(x)               ((x) & GENMASK(3, 0))
+#define HSIO_S6G_OB_CFG_OB_RESISTOR_CTRL_M                GENMASK(3, 0)
+
+#define HSIO_S6G_OB_CFG1_OB_ENA_CAS(x)                    (((x) << 6) & GENMASK(8, 6))
+#define HSIO_S6G_OB_CFG1_OB_ENA_CAS_M                     GENMASK(8, 6)
+#define HSIO_S6G_OB_CFG1_OB_ENA_CAS_X(x)                  (((x) & GENMASK(8, 6)) >> 6)
+#define HSIO_S6G_OB_CFG1_OB_LEV(x)                        ((x) & GENMASK(5, 0))
+#define HSIO_S6G_OB_CFG1_OB_LEV_M                         GENMASK(5, 0)
+
+#define HSIO_S6G_SER_CFG_SER_4TAP_ENA                     BIT(8)
+#define HSIO_S6G_SER_CFG_SER_CPMD_SEL                     BIT(7)
+#define HSIO_S6G_SER_CFG_SER_SWAP_CPMD                    BIT(6)
+#define HSIO_S6G_SER_CFG_SER_ALISEL(x)                    (((x) << 4) & GENMASK(5, 4))
+#define HSIO_S6G_SER_CFG_SER_ALISEL_M                     GENMASK(5, 4)
+#define HSIO_S6G_SER_CFG_SER_ALISEL_X(x)                  (((x) & GENMASK(5, 4)) >> 4)
+#define HSIO_S6G_SER_CFG_SER_ENHYS                        BIT(3)
+#define HSIO_S6G_SER_CFG_SER_BIG_WIN                      BIT(2)
+#define HSIO_S6G_SER_CFG_SER_EN_WIN                       BIT(1)
+#define HSIO_S6G_SER_CFG_SER_ENALI                        BIT(0)
+
+#define HSIO_S6G_COMMON_CFG_SYS_RST                       BIT(17)
+#define HSIO_S6G_COMMON_CFG_SE_DIV2_ENA                   BIT(16)
+#define HSIO_S6G_COMMON_CFG_SE_AUTO_SQUELCH_ENA           BIT(15)
+#define HSIO_S6G_COMMON_CFG_ENA_LANE                      BIT(14)
+#define HSIO_S6G_COMMON_CFG_PWD_RX                        BIT(13)
+#define HSIO_S6G_COMMON_CFG_PWD_TX                        BIT(12)
+#define HSIO_S6G_COMMON_CFG_LANE_CTRL(x)                  (((x) << 9) & GENMASK(11, 9))
+#define HSIO_S6G_COMMON_CFG_LANE_CTRL_M                   GENMASK(11, 9)
+#define HSIO_S6G_COMMON_CFG_LANE_CTRL_X(x)                (((x) & GENMASK(11, 9)) >> 9)
+#define HSIO_S6G_COMMON_CFG_ENA_DIRECT                    BIT(8)
+#define HSIO_S6G_COMMON_CFG_ENA_ELOOP                     BIT(7)
+#define HSIO_S6G_COMMON_CFG_ENA_FLOOP                     BIT(6)
+#define HSIO_S6G_COMMON_CFG_ENA_ILOOP                     BIT(5)
+#define HSIO_S6G_COMMON_CFG_ENA_PLOOP                     BIT(4)
+#define HSIO_S6G_COMMON_CFG_HRATE                         BIT(3)
+#define HSIO_S6G_COMMON_CFG_QRATE                         BIT(2)
+#define HSIO_S6G_COMMON_CFG_IF_MODE(x)                    ((x) & GENMASK(1, 0))
+#define HSIO_S6G_COMMON_CFG_IF_MODE_M                     GENMASK(1, 0)
+
+#define HSIO_S6G_PLL_CFG_PLL_ENA_OFFS(x)                  (((x) << 16) & GENMASK(17, 16))
+#define HSIO_S6G_PLL_CFG_PLL_ENA_OFFS_M                   GENMASK(17, 16)
+#define HSIO_S6G_PLL_CFG_PLL_ENA_OFFS_X(x)                (((x) & GENMASK(17, 16)) >> 16)
+#define HSIO_S6G_PLL_CFG_PLL_DIV4                         BIT(15)
+#define HSIO_S6G_PLL_CFG_PLL_ENA_ROT                      BIT(14)
+#define HSIO_S6G_PLL_CFG_PLL_FSM_CTRL_DATA(x)             (((x) << 6) & GENMASK(13, 6))
+#define HSIO_S6G_PLL_CFG_PLL_FSM_CTRL_DATA_M              GENMASK(13, 6)
+#define HSIO_S6G_PLL_CFG_PLL_FSM_CTRL_DATA_X(x)           (((x) & GENMASK(13, 6)) >> 6)
+#define HSIO_S6G_PLL_CFG_PLL_FSM_ENA                      BIT(5)
+#define HSIO_S6G_PLL_CFG_PLL_FSM_FORCE_SET_ENA            BIT(4)
+#define HSIO_S6G_PLL_CFG_PLL_FSM_OOR_RECAL_ENA            BIT(3)
+#define HSIO_S6G_PLL_CFG_PLL_RB_DATA_SEL                  BIT(2)
+#define HSIO_S6G_PLL_CFG_PLL_ROT_DIR                      BIT(1)
+#define HSIO_S6G_PLL_CFG_PLL_ROT_FRQ                      BIT(0)
+
+#define HSIO_S6G_ACJTAG_CFG_ACJTAG_INIT_DATA_N            BIT(5)
+#define HSIO_S6G_ACJTAG_CFG_ACJTAG_INIT_DATA_P            BIT(4)
+#define HSIO_S6G_ACJTAG_CFG_ACJTAG_INIT_CLK               BIT(3)
+#define HSIO_S6G_ACJTAG_CFG_OB_DIRECT                     BIT(2)
+#define HSIO_S6G_ACJTAG_CFG_ACJTAG_ENA                    BIT(1)
+#define HSIO_S6G_ACJTAG_CFG_JTAG_CTRL_ENA                 BIT(0)
+
+#define HSIO_S6G_GP_CFG_GP_MSB(x)                         (((x) << 16) & GENMASK(31, 16))
+#define HSIO_S6G_GP_CFG_GP_MSB_M                          GENMASK(31, 16)
+#define HSIO_S6G_GP_CFG_GP_MSB_X(x)                       (((x) & GENMASK(31, 16)) >> 16)
+#define HSIO_S6G_GP_CFG_GP_LSB(x)                         ((x) & GENMASK(15, 0))
+#define HSIO_S6G_GP_CFG_GP_LSB_M                          GENMASK(15, 0)
+
+#define HSIO_S6G_IB_STATUS0_IB_CAL_DONE                   BIT(8)
+#define HSIO_S6G_IB_STATUS0_IB_HP_GAIN_ACT                BIT(7)
+#define HSIO_S6G_IB_STATUS0_IB_MID_GAIN_ACT               BIT(6)
+#define HSIO_S6G_IB_STATUS0_IB_LP_GAIN_ACT                BIT(5)
+#define HSIO_S6G_IB_STATUS0_IB_OFFSET_ACT                 BIT(4)
+#define HSIO_S6G_IB_STATUS0_IB_OFFSET_VLD                 BIT(3)
+#define HSIO_S6G_IB_STATUS0_IB_OFFSET_ERR                 BIT(2)
+#define HSIO_S6G_IB_STATUS0_IB_OFFSDIR                    BIT(1)
+#define HSIO_S6G_IB_STATUS0_IB_SIG_DET                    BIT(0)
+
+#define HSIO_S6G_IB_STATUS1_IB_HP_GAIN_STAT(x)            (((x) << 18) & GENMASK(23, 18))
+#define HSIO_S6G_IB_STATUS1_IB_HP_GAIN_STAT_M             GENMASK(23, 18)
+#define HSIO_S6G_IB_STATUS1_IB_HP_GAIN_STAT_X(x)          (((x) & GENMASK(23, 18)) >> 18)
+#define HSIO_S6G_IB_STATUS1_IB_MID_GAIN_STAT(x)           (((x) << 12) & GENMASK(17, 12))
+#define HSIO_S6G_IB_STATUS1_IB_MID_GAIN_STAT_M            GENMASK(17, 12)
+#define HSIO_S6G_IB_STATUS1_IB_MID_GAIN_STAT_X(x)         (((x) & GENMASK(17, 12)) >> 12)
+#define HSIO_S6G_IB_STATUS1_IB_LP_GAIN_STAT(x)            (((x) << 6) & GENMASK(11, 6))
+#define HSIO_S6G_IB_STATUS1_IB_LP_GAIN_STAT_M             GENMASK(11, 6)
+#define HSIO_S6G_IB_STATUS1_IB_LP_GAIN_STAT_X(x)          (((x) & GENMASK(11, 6)) >> 6)
+#define HSIO_S6G_IB_STATUS1_IB_OFFSET_STAT(x)             ((x) & GENMASK(5, 0))
+#define HSIO_S6G_IB_STATUS1_IB_OFFSET_STAT_M              GENMASK(5, 0)
+
+#define HSIO_S6G_ACJTAG_STATUS_ACJTAG_CAPT_DATA_N         BIT(2)
+#define HSIO_S6G_ACJTAG_STATUS_ACJTAG_CAPT_DATA_P         BIT(1)
+#define HSIO_S6G_ACJTAG_STATUS_IB_DIRECT                  BIT(0)
+
+#define HSIO_S6G_PLL_STATUS_PLL_CAL_NOT_DONE              BIT(10)
+#define HSIO_S6G_PLL_STATUS_PLL_CAL_ERR                   BIT(9)
+#define HSIO_S6G_PLL_STATUS_PLL_OUT_OF_RANGE_ERR          BIT(8)
+#define HSIO_S6G_PLL_STATUS_PLL_RB_DATA(x)                ((x) & GENMASK(7, 0))
+#define HSIO_S6G_PLL_STATUS_PLL_RB_DATA_M                 GENMASK(7, 0)
+
+#define HSIO_S6G_REVID_SERDES_REV(x)                      (((x) << 26) & GENMASK(31, 26))
+#define HSIO_S6G_REVID_SERDES_REV_M                       GENMASK(31, 26)
+#define HSIO_S6G_REVID_SERDES_REV_X(x)                    (((x) & GENMASK(31, 26)) >> 26)
+#define HSIO_S6G_REVID_RCPLL_REV(x)                       (((x) << 21) & GENMASK(25, 21))
+#define HSIO_S6G_REVID_RCPLL_REV_M                        GENMASK(25, 21)
+#define HSIO_S6G_REVID_RCPLL_REV_X(x)                     (((x) & GENMASK(25, 21)) >> 21)
+#define HSIO_S6G_REVID_SER_REV(x)                         (((x) << 16) & GENMASK(20, 16))
+#define HSIO_S6G_REVID_SER_REV_M                          GENMASK(20, 16)
+#define HSIO_S6G_REVID_SER_REV_X(x)                       (((x) & GENMASK(20, 16)) >> 16)
+#define HSIO_S6G_REVID_DES_REV(x)                         (((x) << 10) & GENMASK(15, 10))
+#define HSIO_S6G_REVID_DES_REV_M                          GENMASK(15, 10)
+#define HSIO_S6G_REVID_DES_REV_X(x)                       (((x) & GENMASK(15, 10)) >> 10)
+#define HSIO_S6G_REVID_OB_REV(x)                          (((x) << 5) & GENMASK(9, 5))
+#define HSIO_S6G_REVID_OB_REV_M                           GENMASK(9, 5)
+#define HSIO_S6G_REVID_OB_REV_X(x)                        (((x) & GENMASK(9, 5)) >> 5)
+#define HSIO_S6G_REVID_IB_REV(x)                          ((x) & GENMASK(4, 0))
+#define HSIO_S6G_REVID_IB_REV_M                           GENMASK(4, 0)
+
+#define HSIO_MCB_S6G_ADDR_CFG_SERDES6G_WR_ONE_SHOT        BIT(31)
+#define HSIO_MCB_S6G_ADDR_CFG_SERDES6G_RD_ONE_SHOT        BIT(30)
+#define HSIO_MCB_S6G_ADDR_CFG_SERDES6G_ADDR(x)            ((x) & GENMASK(24, 0))
+#define HSIO_MCB_S6G_ADDR_CFG_SERDES6G_ADDR_M             GENMASK(24, 0)
+
+#define HSIO_HW_CFG_DEV2G5_10_MODE                        BIT(6)
+#define HSIO_HW_CFG_DEV1G_9_MODE                          BIT(5)
+#define HSIO_HW_CFG_DEV1G_6_MODE                          BIT(4)
+#define HSIO_HW_CFG_DEV1G_5_MODE                          BIT(3)
+#define HSIO_HW_CFG_DEV1G_4_MODE                          BIT(2)
+#define HSIO_HW_CFG_PCIE_ENA                              BIT(1)
+#define HSIO_HW_CFG_QSGMII_ENA                            BIT(0)
+
+#define HSIO_HW_QSGMII_CFG_SHYST_DIS                      BIT(3)
+#define HSIO_HW_QSGMII_CFG_E_DET_ENA                      BIT(2)
+#define HSIO_HW_QSGMII_CFG_USE_I1_ENA                     BIT(1)
+#define HSIO_HW_QSGMII_CFG_FLIP_LANES                     BIT(0)
+
+#define HSIO_HW_QSGMII_STAT_DELAY_VAR_X200PS(x)           (((x) << 1) & GENMASK(6, 1))
+#define HSIO_HW_QSGMII_STAT_DELAY_VAR_X200PS_M            GENMASK(6, 1)
+#define HSIO_HW_QSGMII_STAT_DELAY_VAR_X200PS_X(x)         (((x) & GENMASK(6, 1)) >> 1)
+#define HSIO_HW_QSGMII_STAT_SYNC                          BIT(0)
+
+#define HSIO_CLK_CFG_CLKDIV_PHY(x)                        (((x) << 1) & GENMASK(8, 1))
+#define HSIO_CLK_CFG_CLKDIV_PHY_M                         GENMASK(8, 1)
+#define HSIO_CLK_CFG_CLKDIV_PHY_X(x)                      (((x) & GENMASK(8, 1)) >> 1)
+#define HSIO_CLK_CFG_CLKDIV_PHY_DIS                       BIT(0)
+
+#define HSIO_TEMP_SENSOR_CTRL_FORCE_TEMP_RD               BIT(5)
+#define HSIO_TEMP_SENSOR_CTRL_FORCE_RUN                   BIT(4)
+#define HSIO_TEMP_SENSOR_CTRL_FORCE_NO_RST                BIT(3)
+#define HSIO_TEMP_SENSOR_CTRL_FORCE_POWER_UP              BIT(2)
+#define HSIO_TEMP_SENSOR_CTRL_FORCE_CLK                   BIT(1)
+#define HSIO_TEMP_SENSOR_CTRL_SAMPLE_ENA                  BIT(0)
+
+#define HSIO_TEMP_SENSOR_CFG_RUN_WID(x)                   (((x) << 8) & GENMASK(15, 8))
+#define HSIO_TEMP_SENSOR_CFG_RUN_WID_M                    GENMASK(15, 8)
+#define HSIO_TEMP_SENSOR_CFG_RUN_WID_X(x)                 (((x) & GENMASK(15, 8)) >> 8)
+#define HSIO_TEMP_SENSOR_CFG_SAMPLE_PER(x)                ((x) & GENMASK(7, 0))
+#define HSIO_TEMP_SENSOR_CFG_SAMPLE_PER_M                 GENMASK(7, 0)
+
+#define HSIO_TEMP_SENSOR_STAT_TEMP_VALID                  BIT(8)
+#define HSIO_TEMP_SENSOR_STAT_TEMP(x)                     ((x) & GENMASK(7, 0))
+#define HSIO_TEMP_SENSOR_STAT_TEMP_M                      GENMASK(7, 0)
+
+#endif
diff --git a/drivers/net/ethernet/mscc/ocelot_io.c b/drivers/net/ethernet/mscc/ocelot_io.c
new file mode 100644
index 000000000000..c6db8ad31fdf
--- /dev/null
+++ b/drivers/net/ethernet/mscc/ocelot_io.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: (GPL-2.0 OR MIT)
+/*
+ * Microsemi Ocelot Switch driver
+ *
+ * Copyright (c) 2017 Microsemi Corporation
+ */
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/platform_device.h>
+
+#include "ocelot.h"
+
+u32 __ocelot_read_ix(struct ocelot *ocelot, u32 reg, u32 offset)
+{
+	u16 target = reg >> TARGET_OFFSET;
+	u32 val;
+
+	WARN_ON(!target);
+
+	regmap_read(ocelot->targets[target],
+		    ocelot->map[target][reg & REG_MASK] + offset, &val);
+	return val;
+}
+EXPORT_SYMBOL(__ocelot_read_ix);
+
+void __ocelot_write_ix(struct ocelot *ocelot, u32 val, u32 reg, u32 offset)
+{
+	u16 target = reg >> TARGET_OFFSET;
+
+	WARN_ON(!target);
+
+	regmap_write(ocelot->targets[target],
+		     ocelot->map[target][reg & REG_MASK] + offset, val);
+}
+EXPORT_SYMBOL(__ocelot_write_ix);
+
+void __ocelot_rmw_ix(struct ocelot *ocelot, u32 val, u32 mask, u32 reg,
+		     u32 offset)
+{
+	u16 target = reg >> TARGET_OFFSET;
+
+	WARN_ON(!target);
+
+	regmap_update_bits(ocelot->targets[target],
+			   ocelot->map[target][reg & REG_MASK] + offset,
+			   mask, val);
+}
+EXPORT_SYMBOL(__ocelot_rmw_ix);
+
+u32 ocelot_port_readl(struct ocelot_port *port, u32 reg)
+{
+	return readl(port->regs + reg);
+}
+EXPORT_SYMBOL(ocelot_port_readl);
+
+void ocelot_port_writel(struct ocelot_port *port, u32 val, u32 reg)
+{
+	writel(val, port->regs + reg);
+}
+EXPORT_SYMBOL(ocelot_port_writel);
+
+int ocelot_regfields_init(struct ocelot *ocelot,
+			  const struct reg_field *const regfields)
+{
+	unsigned int i;
+	u16 target;
+
+	for (i = 0; i < REGFIELD_MAX; i++) {
+		struct reg_field regfield = {};
+		u32 reg = regfields[i].reg;
+
+		if (!reg)
+			continue;
+
+		target = regfields[i].reg >> TARGET_OFFSET;
+
+		regfield.reg = ocelot->map[target][reg & REG_MASK];
+		regfield.lsb = regfields[i].lsb;
+		regfield.msb = regfields[i].msb;
+
+		ocelot->regfields[i] =
+		devm_regmap_field_alloc(ocelot->dev,
+					ocelot->targets[target],
+					regfield);
+
+		if (IS_ERR(ocelot->regfields[i]))
+			return PTR_ERR(ocelot->regfields[i]);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(ocelot_regfields_init);
+
+static struct regmap_config ocelot_regmap_config = {
+	.reg_bits	= 32,
+	.val_bits	= 32,
+	.reg_stride	= 4,
+};
+
+struct regmap *ocelot_io_platform_init(struct ocelot *ocelot,
+				       struct platform_device *pdev,
+				       const char *name)
+{
+	struct resource *res;
+	void __iomem *regs;
+
+	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, name);
+	regs = devm_ioremap_resource(ocelot->dev, res);
+	if (IS_ERR(regs))
+		return ERR_CAST(regs);
+
+	ocelot_regmap_config.name = name;
+	return devm_regmap_init_mmio(ocelot->dev, regs,
+				     &ocelot_regmap_config);
+}
+EXPORT_SYMBOL(ocelot_io_platform_init);
diff --git a/drivers/net/ethernet/mscc/ocelot_qs.h b/drivers/net/ethernet/mscc/ocelot_qs.h
new file mode 100644
index 000000000000..d18ae726c01d
--- /dev/null
+++ b/drivers/net/ethernet/mscc/ocelot_qs.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */
+/*
+ * Microsemi Ocelot Switch driver
+ *
+ * Copyright (c) 2017 Microsemi Corporation
+ */
+
+#ifndef _MSCC_OCELOT_QS_H_
+#define _MSCC_OCELOT_QS_H_
+
+/* TODO handle BE */
+#define XTR_EOF_0          0x00000080U
+#define XTR_EOF_1          0x01000080U
+#define XTR_EOF_2          0x02000080U
+#define XTR_EOF_3          0x03000080U
+#define XTR_PRUNED         0x04000080U
+#define XTR_ABORT          0x05000080U
+#define XTR_ESCAPE         0x06000080U
+#define XTR_NOT_READY      0x07000080U
+#define XTR_VALID_BYTES(x) (4 - (((x) >> 24) & 3))
+
+#define QS_XTR_GRP_CFG_RSZ                                0x4
+
+#define QS_XTR_GRP_CFG_MODE(x)                            (((x) << 2) & GENMASK(3, 2))
+#define QS_XTR_GRP_CFG_MODE_M                             GENMASK(3, 2)
+#define QS_XTR_GRP_CFG_MODE_X(x)                          (((x) & GENMASK(3, 2)) >> 2)
+#define QS_XTR_GRP_CFG_STATUS_WORD_POS                    BIT(1)
+#define QS_XTR_GRP_CFG_BYTE_SWAP                          BIT(0)
+
+#define QS_XTR_RD_RSZ                                     0x4
+
+#define QS_XTR_FRM_PRUNING_RSZ                            0x4
+
+#define QS_XTR_CFG_DP_WM(x)                               (((x) << 5) & GENMASK(7, 5))
+#define QS_XTR_CFG_DP_WM_M                                GENMASK(7, 5)
+#define QS_XTR_CFG_DP_WM_X(x)                             (((x) & GENMASK(7, 5)) >> 5)
+#define QS_XTR_CFG_SCH_WM(x)                              (((x) << 2) & GENMASK(4, 2))
+#define QS_XTR_CFG_SCH_WM_M                               GENMASK(4, 2)
+#define QS_XTR_CFG_SCH_WM_X(x)                            (((x) & GENMASK(4, 2)) >> 2)
+#define QS_XTR_CFG_OFLW_ERR_STICKY(x)                     ((x) & GENMASK(1, 0))
+#define QS_XTR_CFG_OFLW_ERR_STICKY_M                      GENMASK(1, 0)
+
+#define QS_INJ_GRP_CFG_RSZ                                0x4
+
+#define QS_INJ_GRP_CFG_MODE(x)                            (((x) << 2) & GENMASK(3, 2))
+#define QS_INJ_GRP_CFG_MODE_M                             GENMASK(3, 2)
+#define QS_INJ_GRP_CFG_MODE_X(x)                          (((x) & GENMASK(3, 2)) >> 2)
+#define QS_INJ_GRP_CFG_BYTE_SWAP                          BIT(0)
+
+#define QS_INJ_WR_RSZ                                     0x4
+
+#define QS_INJ_CTRL_RSZ                                   0x4
+
+#define QS_INJ_CTRL_GAP_SIZE(x)                           (((x) << 21) & GENMASK(24, 21))
+#define QS_INJ_CTRL_GAP_SIZE_M                            GENMASK(24, 21)
+#define QS_INJ_CTRL_GAP_SIZE_X(x)                         (((x) & GENMASK(24, 21)) >> 21)
+#define QS_INJ_CTRL_ABORT                                 BIT(20)
+#define QS_INJ_CTRL_EOF                                   BIT(19)
+#define QS_INJ_CTRL_SOF                                   BIT(18)
+#define QS_INJ_CTRL_VLD_BYTES(x)                          (((x) << 16) & GENMASK(17, 16))
+#define QS_INJ_CTRL_VLD_BYTES_M                           GENMASK(17, 16)
+#define QS_INJ_CTRL_VLD_BYTES_X(x)                        (((x) & GENMASK(17, 16)) >> 16)
+
+#define QS_INJ_STATUS_WMARK_REACHED(x)                    (((x) << 4) & GENMASK(5, 4))
+#define QS_INJ_STATUS_WMARK_REACHED_M                     GENMASK(5, 4)
+#define QS_INJ_STATUS_WMARK_REACHED_X(x)                  (((x) & GENMASK(5, 4)) >> 4)
+#define QS_INJ_STATUS_FIFO_RDY(x)                         (((x) << 2) & GENMASK(3, 2))
+#define QS_INJ_STATUS_FIFO_RDY_M                          GENMASK(3, 2)
+#define QS_INJ_STATUS_FIFO_RDY_X(x)                       (((x) & GENMASK(3, 2)) >> 2)
+#define QS_INJ_STATUS_INJ_IN_PROGRESS(x)                  ((x) & GENMASK(1, 0))
+#define QS_INJ_STATUS_INJ_IN_PROGRESS_M                   GENMASK(1, 0)
+
+#define QS_INJ_ERR_RSZ                                    0x4
+
+#define QS_INJ_ERR_ABORT_ERR_STICKY                       BIT(1)
+#define QS_INJ_ERR_WR_ERR_STICKY                          BIT(0)
+
+#endif
diff --git a/drivers/net/ethernet/mscc/ocelot_qsys.h b/drivers/net/ethernet/mscc/ocelot_qsys.h
new file mode 100644
index 000000000000..aa7267d5ca77
--- /dev/null
+++ b/drivers/net/ethernet/mscc/ocelot_qsys.h
@@ -0,0 +1,270 @@
+/*
+ * Microsemi Ocelot Switch driver
+ *
+ * License: Dual MIT/GPL
+ * Copyright (c) 2017 Microsemi Corporation
+ */
+
+#ifndef _MSCC_OCELOT_QSYS_H_
+#define _MSCC_OCELOT_QSYS_H_
+
+#define QSYS_PORT_MODE_RSZ                                0x4
+
+#define QSYS_PORT_MODE_DEQUEUE_DIS                        BIT(1)
+#define QSYS_PORT_MODE_DEQUEUE_LATE                       BIT(0)
+
+#define QSYS_SWITCH_PORT_MODE_RSZ                         0x4
+
+#define QSYS_SWITCH_PORT_MODE_PORT_ENA                    BIT(14)
+#define QSYS_SWITCH_PORT_MODE_SCH_NEXT_CFG(x)             (((x) << 11) & GENMASK(13, 11))
+#define QSYS_SWITCH_PORT_MODE_SCH_NEXT_CFG_M              GENMASK(13, 11)
+#define QSYS_SWITCH_PORT_MODE_SCH_NEXT_CFG_X(x)           (((x) & GENMASK(13, 11)) >> 11)
+#define QSYS_SWITCH_PORT_MODE_YEL_RSRVD                   BIT(10)
+#define QSYS_SWITCH_PORT_MODE_INGRESS_DROP_MODE           BIT(9)
+#define QSYS_SWITCH_PORT_MODE_TX_PFC_ENA(x)               (((x) << 1) & GENMASK(8, 1))
+#define QSYS_SWITCH_PORT_MODE_TX_PFC_ENA_M                GENMASK(8, 1)
+#define QSYS_SWITCH_PORT_MODE_TX_PFC_ENA_X(x)             (((x) & GENMASK(8, 1)) >> 1)
+#define QSYS_SWITCH_PORT_MODE_TX_PFC_MODE                 BIT(0)
+
+#define QSYS_STAT_CNT_CFG_TX_GREEN_CNT_MODE               BIT(5)
+#define QSYS_STAT_CNT_CFG_TX_YELLOW_CNT_MODE              BIT(4)
+#define QSYS_STAT_CNT_CFG_DROP_GREEN_CNT_MODE             BIT(3)
+#define QSYS_STAT_CNT_CFG_DROP_YELLOW_CNT_MODE            BIT(2)
+#define QSYS_STAT_CNT_CFG_DROP_COUNT_ONCE                 BIT(1)
+#define QSYS_STAT_CNT_CFG_DROP_COUNT_EGRESS               BIT(0)
+
+#define QSYS_EEE_CFG_RSZ                                  0x4
+
+#define QSYS_EEE_THRES_EEE_HIGH_BYTES(x)                  (((x) << 8) & GENMASK(15, 8))
+#define QSYS_EEE_THRES_EEE_HIGH_BYTES_M                   GENMASK(15, 8)
+#define QSYS_EEE_THRES_EEE_HIGH_BYTES_X(x)                (((x) & GENMASK(15, 8)) >> 8)
+#define QSYS_EEE_THRES_EEE_HIGH_FRAMES(x)                 ((x) & GENMASK(7, 0))
+#define QSYS_EEE_THRES_EEE_HIGH_FRAMES_M                  GENMASK(7, 0)
+
+#define QSYS_SW_STATUS_RSZ                                0x4
+
+#define QSYS_EXT_CPU_CFG_EXT_CPU_PORT(x)                  (((x) << 8) & GENMASK(12, 8))
+#define QSYS_EXT_CPU_CFG_EXT_CPU_PORT_M                   GENMASK(12, 8)
+#define QSYS_EXT_CPU_CFG_EXT_CPU_PORT_X(x)                (((x) & GENMASK(12, 8)) >> 8)
+#define QSYS_EXT_CPU_CFG_EXT_CPUQ_MSK(x)                  ((x) & GENMASK(7, 0))
+#define QSYS_EXT_CPU_CFG_EXT_CPUQ_MSK_M                   GENMASK(7, 0)
+
+#define QSYS_QMAP_GSZ                                     0x4
+
+#define QSYS_QMAP_SE_BASE(x)                              (((x) << 5) & GENMASK(12, 5))
+#define QSYS_QMAP_SE_BASE_M                               GENMASK(12, 5)
+#define QSYS_QMAP_SE_BASE_X(x)                            (((x) & GENMASK(12, 5)) >> 5)
+#define QSYS_QMAP_SE_IDX_SEL(x)                           (((x) << 2) & GENMASK(4, 2))
+#define QSYS_QMAP_SE_IDX_SEL_M                            GENMASK(4, 2)
+#define QSYS_QMAP_SE_IDX_SEL_X(x)                         (((x) & GENMASK(4, 2)) >> 2)
+#define QSYS_QMAP_SE_INP_SEL(x)                           ((x) & GENMASK(1, 0))
+#define QSYS_QMAP_SE_INP_SEL_M                            GENMASK(1, 0)
+
+#define QSYS_ISDX_SGRP_GSZ                                0x4
+
+#define QSYS_TIMED_FRAME_ENTRY_GSZ                        0x4
+
+#define QSYS_TFRM_MISC_TIMED_CANCEL_SLOT(x)               (((x) << 9) & GENMASK(18, 9))
+#define QSYS_TFRM_MISC_TIMED_CANCEL_SLOT_M                GENMASK(18, 9)
+#define QSYS_TFRM_MISC_TIMED_CANCEL_SLOT_X(x)             (((x) & GENMASK(18, 9)) >> 9)
+#define QSYS_TFRM_MISC_TIMED_CANCEL_1SHOT                 BIT(8)
+#define QSYS_TFRM_MISC_TIMED_SLOT_MODE_MC                 BIT(7)
+#define QSYS_TFRM_MISC_TIMED_ENTRY_FAST_CNT(x)            ((x) & GENMASK(6, 0))
+#define QSYS_TFRM_MISC_TIMED_ENTRY_FAST_CNT_M             GENMASK(6, 0)
+
+#define QSYS_RED_PROFILE_RSZ                              0x4
+
+#define QSYS_RED_PROFILE_WM_RED_LOW(x)                    (((x) << 8) & GENMASK(15, 8))
+#define QSYS_RED_PROFILE_WM_RED_LOW_M                     GENMASK(15, 8)
+#define QSYS_RED_PROFILE_WM_RED_LOW_X(x)                  (((x) & GENMASK(15, 8)) >> 8)
+#define QSYS_RED_PROFILE_WM_RED_HIGH(x)                   ((x) & GENMASK(7, 0))
+#define QSYS_RED_PROFILE_WM_RED_HIGH_M                    GENMASK(7, 0)
+
+#define QSYS_RES_CFG_GSZ                                  0x8
+
+#define QSYS_RES_STAT_GSZ                                 0x8
+
+#define QSYS_RES_STAT_INUSE(x)                            (((x) << 12) & GENMASK(23, 12))
+#define QSYS_RES_STAT_INUSE_M                             GENMASK(23, 12)
+#define QSYS_RES_STAT_INUSE_X(x)                          (((x) & GENMASK(23, 12)) >> 12)
+#define QSYS_RES_STAT_MAXUSE(x)                           ((x) & GENMASK(11, 0))
+#define QSYS_RES_STAT_MAXUSE_M                            GENMASK(11, 0)
+
+#define QSYS_EVENTS_CORE_EV_FDC(x)                        (((x) << 2) & GENMASK(4, 2))
+#define QSYS_EVENTS_CORE_EV_FDC_M                         GENMASK(4, 2)
+#define QSYS_EVENTS_CORE_EV_FDC_X(x)                      (((x) & GENMASK(4, 2)) >> 2)
+#define QSYS_EVENTS_CORE_EV_FRD(x)                        ((x) & GENMASK(1, 0))
+#define QSYS_EVENTS_CORE_EV_FRD_M                         GENMASK(1, 0)
+
+#define QSYS_QMAXSDU_CFG_0_RSZ                            0x4
+
+#define QSYS_QMAXSDU_CFG_1_RSZ                            0x4
+
+#define QSYS_QMAXSDU_CFG_2_RSZ                            0x4
+
+#define QSYS_QMAXSDU_CFG_3_RSZ                            0x4
+
+#define QSYS_QMAXSDU_CFG_4_RSZ                            0x4
+
+#define QSYS_QMAXSDU_CFG_5_RSZ                            0x4
+
+#define QSYS_QMAXSDU_CFG_6_RSZ                            0x4
+
+#define QSYS_QMAXSDU_CFG_7_RSZ                            0x4
+
+#define QSYS_PREEMPTION_CFG_RSZ                           0x4
+
+#define QSYS_PREEMPTION_CFG_P_QUEUES(x)                   ((x) & GENMASK(7, 0))
+#define QSYS_PREEMPTION_CFG_P_QUEUES_M                    GENMASK(7, 0)
+#define QSYS_PREEMPTION_CFG_MM_ADD_FRAG_SIZE(x)           (((x) << 8) & GENMASK(9, 8))
+#define QSYS_PREEMPTION_CFG_MM_ADD_FRAG_SIZE_M            GENMASK(9, 8)
+#define QSYS_PREEMPTION_CFG_MM_ADD_FRAG_SIZE_X(x)         (((x) & GENMASK(9, 8)) >> 8)
+#define QSYS_PREEMPTION_CFG_STRICT_IPG(x)                 (((x) << 12) & GENMASK(13, 12))
+#define QSYS_PREEMPTION_CFG_STRICT_IPG_M                  GENMASK(13, 12)
+#define QSYS_PREEMPTION_CFG_STRICT_IPG_X(x)               (((x) & GENMASK(13, 12)) >> 12)
+#define QSYS_PREEMPTION_CFG_HOLD_ADVANCE(x)               (((x) << 16) & GENMASK(31, 16))
+#define QSYS_PREEMPTION_CFG_HOLD_ADVANCE_M                GENMASK(31, 16)
+#define QSYS_PREEMPTION_CFG_HOLD_ADVANCE_X(x)             (((x) & GENMASK(31, 16)) >> 16)
+
+#define QSYS_CIR_CFG_GSZ                                  0x80
+
+#define QSYS_CIR_CFG_CIR_RATE(x)                          (((x) << 6) & GENMASK(20, 6))
+#define QSYS_CIR_CFG_CIR_RATE_M                           GENMASK(20, 6)
+#define QSYS_CIR_CFG_CIR_RATE_X(x)                        (((x) & GENMASK(20, 6)) >> 6)
+#define QSYS_CIR_CFG_CIR_BURST(x)                         ((x) & GENMASK(5, 0))
+#define QSYS_CIR_CFG_CIR_BURST_M                          GENMASK(5, 0)
+
+#define QSYS_EIR_CFG_GSZ                                  0x80
+
+#define QSYS_EIR_CFG_EIR_RATE(x)                          (((x) << 7) & GENMASK(21, 7))
+#define QSYS_EIR_CFG_EIR_RATE_M                           GENMASK(21, 7)
+#define QSYS_EIR_CFG_EIR_RATE_X(x)                        (((x) & GENMASK(21, 7)) >> 7)
+#define QSYS_EIR_CFG_EIR_BURST(x)                         (((x) << 1) & GENMASK(6, 1))
+#define QSYS_EIR_CFG_EIR_BURST_M                          GENMASK(6, 1)
+#define QSYS_EIR_CFG_EIR_BURST_X(x)                       (((x) & GENMASK(6, 1)) >> 1)
+#define QSYS_EIR_CFG_EIR_MARK_ENA                         BIT(0)
+
+#define QSYS_SE_CFG_GSZ                                   0x80
+
+#define QSYS_SE_CFG_SE_DWRR_CNT(x)                        (((x) << 6) & GENMASK(9, 6))
+#define QSYS_SE_CFG_SE_DWRR_CNT_M                         GENMASK(9, 6)
+#define QSYS_SE_CFG_SE_DWRR_CNT_X(x)                      (((x) & GENMASK(9, 6)) >> 6)
+#define QSYS_SE_CFG_SE_RR_ENA                             BIT(5)
+#define QSYS_SE_CFG_SE_AVB_ENA                            BIT(4)
+#define QSYS_SE_CFG_SE_FRM_MODE(x)                        (((x) << 2) & GENMASK(3, 2))
+#define QSYS_SE_CFG_SE_FRM_MODE_M                         GENMASK(3, 2)
+#define QSYS_SE_CFG_SE_FRM_MODE_X(x)                      (((x) & GENMASK(3, 2)) >> 2)
+#define QSYS_SE_CFG_SE_EXC_ENA                            BIT(1)
+#define QSYS_SE_CFG_SE_EXC_FWD                            BIT(0)
+
+#define QSYS_SE_DWRR_CFG_GSZ                              0x80
+#define QSYS_SE_DWRR_CFG_RSZ                              0x4
+
+#define QSYS_SE_CONNECT_GSZ                               0x80
+
+#define QSYS_SE_CONNECT_SE_OUTP_IDX(x)                    (((x) << 17) & GENMASK(24, 17))
+#define QSYS_SE_CONNECT_SE_OUTP_IDX_M                     GENMASK(24, 17)
+#define QSYS_SE_CONNECT_SE_OUTP_IDX_X(x)                  (((x) & GENMASK(24, 17)) >> 17)
+#define QSYS_SE_CONNECT_SE_INP_IDX(x)                     (((x) << 9) & GENMASK(16, 9))
+#define QSYS_SE_CONNECT_SE_INP_IDX_M                      GENMASK(16, 9)
+#define QSYS_SE_CONNECT_SE_INP_IDX_X(x)                   (((x) & GENMASK(16, 9)) >> 9)
+#define QSYS_SE_CONNECT_SE_OUTP_CON(x)                    (((x) << 5) & GENMASK(8, 5))
+#define QSYS_SE_CONNECT_SE_OUTP_CON_M                     GENMASK(8, 5)
+#define QSYS_SE_CONNECT_SE_OUTP_CON_X(x)                  (((x) & GENMASK(8, 5)) >> 5)
+#define QSYS_SE_CONNECT_SE_INP_CNT(x)                     (((x) << 1) & GENMASK(4, 1))
+#define QSYS_SE_CONNECT_SE_INP_CNT_M                      GENMASK(4, 1)
+#define QSYS_SE_CONNECT_SE_INP_CNT_X(x)                   (((x) & GENMASK(4, 1)) >> 1)
+#define QSYS_SE_CONNECT_SE_TERMINAL                       BIT(0)
+
+#define QSYS_SE_DLB_SENSE_GSZ                             0x80
+
+#define QSYS_SE_DLB_SENSE_SE_DLB_PRIO(x)                  (((x) << 11) & GENMASK(13, 11))
+#define QSYS_SE_DLB_SENSE_SE_DLB_PRIO_M                   GENMASK(13, 11)
+#define QSYS_SE_DLB_SENSE_SE_DLB_PRIO_X(x)                (((x) & GENMASK(13, 11)) >> 11)
+#define QSYS_SE_DLB_SENSE_SE_DLB_SPORT(x)                 (((x) << 7) & GENMASK(10, 7))
+#define QSYS_SE_DLB_SENSE_SE_DLB_SPORT_M                  GENMASK(10, 7)
+#define QSYS_SE_DLB_SENSE_SE_DLB_SPORT_X(x)               (((x) & GENMASK(10, 7)) >> 7)
+#define QSYS_SE_DLB_SENSE_SE_DLB_DPORT(x)                 (((x) << 3) & GENMASK(6, 3))
+#define QSYS_SE_DLB_SENSE_SE_DLB_DPORT_M                  GENMASK(6, 3)
+#define QSYS_SE_DLB_SENSE_SE_DLB_DPORT_X(x)               (((x) & GENMASK(6, 3)) >> 3)
+#define QSYS_SE_DLB_SENSE_SE_DLB_PRIO_ENA                 BIT(2)
+#define QSYS_SE_DLB_SENSE_SE_DLB_SPORT_ENA                BIT(1)
+#define QSYS_SE_DLB_SENSE_SE_DLB_DPORT_ENA                BIT(0)
+
+#define QSYS_CIR_STATE_GSZ                                0x80
+
+#define QSYS_CIR_STATE_CIR_LVL(x)                         (((x) << 4) & GENMASK(25, 4))
+#define QSYS_CIR_STATE_CIR_LVL_M                          GENMASK(25, 4)
+#define QSYS_CIR_STATE_CIR_LVL_X(x)                       (((x) & GENMASK(25, 4)) >> 4)
+#define QSYS_CIR_STATE_SHP_TIME(x)                        ((x) & GENMASK(3, 0))
+#define QSYS_CIR_STATE_SHP_TIME_M                         GENMASK(3, 0)
+
+#define QSYS_EIR_STATE_GSZ                                0x80
+
+#define QSYS_SE_STATE_GSZ                                 0x80
+
+#define QSYS_SE_STATE_SE_OUTP_LVL(x)                      (((x) << 1) & GENMASK(2, 1))
+#define QSYS_SE_STATE_SE_OUTP_LVL_M                       GENMASK(2, 1)
+#define QSYS_SE_STATE_SE_OUTP_LVL_X(x)                    (((x) & GENMASK(2, 1)) >> 1)
+#define QSYS_SE_STATE_SE_WAS_YEL                          BIT(0)
+
+#define QSYS_HSCH_MISC_CFG_SE_CONNECT_VLD                 BIT(8)
+#define QSYS_HSCH_MISC_CFG_FRM_ADJ(x)                     (((x) << 3) & GENMASK(7, 3))
+#define QSYS_HSCH_MISC_CFG_FRM_ADJ_M                      GENMASK(7, 3)
+#define QSYS_HSCH_MISC_CFG_FRM_ADJ_X(x)                   (((x) & GENMASK(7, 3)) >> 3)
+#define QSYS_HSCH_MISC_CFG_LEAK_DIS                       BIT(2)
+#define QSYS_HSCH_MISC_CFG_QSHP_EXC_ENA                   BIT(1)
+#define QSYS_HSCH_MISC_CFG_PFC_BYP_UPD                    BIT(0)
+
+#define QSYS_TAG_CONFIG_RSZ                               0x4
+
+#define QSYS_TAG_CONFIG_ENABLE                            BIT(0)
+#define QSYS_TAG_CONFIG_LINK_SPEED(x)                     (((x) << 4) & GENMASK(5, 4))
+#define QSYS_TAG_CONFIG_LINK_SPEED_M                      GENMASK(5, 4)
+#define QSYS_TAG_CONFIG_LINK_SPEED_X(x)                   (((x) & GENMASK(5, 4)) >> 4)
+#define QSYS_TAG_CONFIG_INIT_GATE_STATE(x)                (((x) << 8) & GENMASK(15, 8))
+#define QSYS_TAG_CONFIG_INIT_GATE_STATE_M                 GENMASK(15, 8)
+#define QSYS_TAG_CONFIG_INIT_GATE_STATE_X(x)              (((x) & GENMASK(15, 8)) >> 8)
+#define QSYS_TAG_CONFIG_SCH_TRAFFIC_QUEUES(x)             (((x) << 16) & GENMASK(23, 16))
+#define QSYS_TAG_CONFIG_SCH_TRAFFIC_QUEUES_M              GENMASK(23, 16)
+#define QSYS_TAG_CONFIG_SCH_TRAFFIC_QUEUES_X(x)           (((x) & GENMASK(23, 16)) >> 16)
+
+#define QSYS_TAS_PARAM_CFG_CTRL_PORT_NUM(x)               ((x) & GENMASK(7, 0))
+#define QSYS_TAS_PARAM_CFG_CTRL_PORT_NUM_M                GENMASK(7, 0)
+#define QSYS_TAS_PARAM_CFG_CTRL_ALWAYS_GUARD_BAND_SCH_Q   BIT(8)
+#define QSYS_TAS_PARAM_CFG_CTRL_CONFIG_CHANGE             BIT(16)
+
+#define QSYS_PORT_MAX_SDU_RSZ                             0x4
+
+#define QSYS_PARAM_CFG_REG_3_BASE_TIME_SEC_MSB(x)         ((x) & GENMASK(15, 0))
+#define QSYS_PARAM_CFG_REG_3_BASE_TIME_SEC_MSB_M          GENMASK(15, 0)
+#define QSYS_PARAM_CFG_REG_3_LIST_LENGTH(x)               (((x) << 16) & GENMASK(31, 16))
+#define QSYS_PARAM_CFG_REG_3_LIST_LENGTH_M                GENMASK(31, 16)
+#define QSYS_PARAM_CFG_REG_3_LIST_LENGTH_X(x)             (((x) & GENMASK(31, 16)) >> 16)
+
+#define QSYS_GCL_CFG_REG_1_GCL_ENTRY_NUM(x)               ((x) & GENMASK(5, 0))
+#define QSYS_GCL_CFG_REG_1_GCL_ENTRY_NUM_M                GENMASK(5, 0)
+#define QSYS_GCL_CFG_REG_1_GATE_STATE(x)                  (((x) << 8) & GENMASK(15, 8))
+#define QSYS_GCL_CFG_REG_1_GATE_STATE_M                   GENMASK(15, 8)
+#define QSYS_GCL_CFG_REG_1_GATE_STATE_X(x)                (((x) & GENMASK(15, 8)) >> 8)
+
+#define QSYS_PARAM_STATUS_REG_3_BASE_TIME_SEC_MSB(x)      ((x) & GENMASK(15, 0))
+#define QSYS_PARAM_STATUS_REG_3_BASE_TIME_SEC_MSB_M       GENMASK(15, 0)
+#define QSYS_PARAM_STATUS_REG_3_LIST_LENGTH(x)            (((x) << 16) & GENMASK(31, 16))
+#define QSYS_PARAM_STATUS_REG_3_LIST_LENGTH_M             GENMASK(31, 16)
+#define QSYS_PARAM_STATUS_REG_3_LIST_LENGTH_X(x)          (((x) & GENMASK(31, 16)) >> 16)
+
+#define QSYS_PARAM_STATUS_REG_8_CFG_CHG_TIME_SEC_MSB(x)   ((x) & GENMASK(15, 0))
+#define QSYS_PARAM_STATUS_REG_8_CFG_CHG_TIME_SEC_MSB_M    GENMASK(15, 0)
+#define QSYS_PARAM_STATUS_REG_8_OPER_GATE_STATE(x)        (((x) << 16) & GENMASK(23, 16))
+#define QSYS_PARAM_STATUS_REG_8_OPER_GATE_STATE_M         GENMASK(23, 16)
+#define QSYS_PARAM_STATUS_REG_8_OPER_GATE_STATE_X(x)      (((x) & GENMASK(23, 16)) >> 16)
+#define QSYS_PARAM_STATUS_REG_8_CONFIG_PENDING            BIT(24)
+
+#define QSYS_GCL_STATUS_REG_1_GCL_ENTRY_NUM(x)            ((x) & GENMASK(5, 0))
+#define QSYS_GCL_STATUS_REG_1_GCL_ENTRY_NUM_M             GENMASK(5, 0)
+#define QSYS_GCL_STATUS_REG_1_GATE_STATE(x)               (((x) << 8) & GENMASK(15, 8))
+#define QSYS_GCL_STATUS_REG_1_GATE_STATE_M                GENMASK(15, 8)
+#define QSYS_GCL_STATUS_REG_1_GATE_STATE_X(x)             (((x) & GENMASK(15, 8)) >> 8)
+
+#endif
diff --git a/drivers/net/ethernet/mscc/ocelot_regs.c b/drivers/net/ethernet/mscc/ocelot_regs.c
new file mode 100644
index 000000000000..e334b406c40c
--- /dev/null
+++ b/drivers/net/ethernet/mscc/ocelot_regs.c
@@ -0,0 +1,497 @@
+// SPDX-License-Identifier: (GPL-2.0 OR MIT)
+/*
+ * Microsemi Ocelot Switch driver
+ *
+ * Copyright (c) 2017 Microsemi Corporation
+ */
+#include "ocelot.h"
+
+static const u32 ocelot_ana_regmap[] = {
+	REG(ANA_ADVLEARN,                  0x009000),
+	REG(ANA_VLANMASK,                  0x009004),
+	REG(ANA_PORT_B_DOMAIN,             0x009008),
+	REG(ANA_ANAGEFIL,                  0x00900c),
+	REG(ANA_ANEVENTS,                  0x009010),
+	REG(ANA_STORMLIMIT_BURST,          0x009014),
+	REG(ANA_STORMLIMIT_CFG,            0x009018),
+	REG(ANA_ISOLATED_PORTS,            0x009028),
+	REG(ANA_COMMUNITY_PORTS,           0x00902c),
+	REG(ANA_AUTOAGE,                   0x009030),
+	REG(ANA_MACTOPTIONS,               0x009034),
+	REG(ANA_LEARNDISC,                 0x009038),
+	REG(ANA_AGENCTRL,                  0x00903c),
+	REG(ANA_MIRRORPORTS,               0x009040),
+	REG(ANA_EMIRRORPORTS,              0x009044),
+	REG(ANA_FLOODING,                  0x009048),
+	REG(ANA_FLOODING_IPMC,             0x00904c),
+	REG(ANA_SFLOW_CFG,                 0x009050),
+	REG(ANA_PORT_MODE,                 0x009080),
+	REG(ANA_PGID_PGID,                 0x008c00),
+	REG(ANA_TABLES_ANMOVED,            0x008b30),
+	REG(ANA_TABLES_MACHDATA,           0x008b34),
+	REG(ANA_TABLES_MACLDATA,           0x008b38),
+	REG(ANA_TABLES_MACACCESS,          0x008b3c),
+	REG(ANA_TABLES_MACTINDX,           0x008b40),
+	REG(ANA_TABLES_VLANACCESS,         0x008b44),
+	REG(ANA_TABLES_VLANTIDX,           0x008b48),
+	REG(ANA_TABLES_ISDXACCESS,         0x008b4c),
+	REG(ANA_TABLES_ISDXTIDX,           0x008b50),
+	REG(ANA_TABLES_ENTRYLIM,           0x008b00),
+	REG(ANA_TABLES_PTP_ID_HIGH,        0x008b54),
+	REG(ANA_TABLES_PTP_ID_LOW,         0x008b58),
+	REG(ANA_MSTI_STATE,                0x008e00),
+	REG(ANA_PORT_VLAN_CFG,             0x007000),
+	REG(ANA_PORT_DROP_CFG,             0x007004),
+	REG(ANA_PORT_QOS_CFG,              0x007008),
+	REG(ANA_PORT_VCAP_CFG,             0x00700c),
+	REG(ANA_PORT_VCAP_S1_KEY_CFG,      0x007010),
+	REG(ANA_PORT_VCAP_S2_CFG,          0x00701c),
+	REG(ANA_PORT_PCP_DEI_MAP,          0x007020),
+	REG(ANA_PORT_CPU_FWD_CFG,          0x007060),
+	REG(ANA_PORT_CPU_FWD_BPDU_CFG,     0x007064),
+	REG(ANA_PORT_CPU_FWD_GARP_CFG,     0x007068),
+	REG(ANA_PORT_CPU_FWD_CCM_CFG,      0x00706c),
+	REG(ANA_PORT_PORT_CFG,             0x007070),
+	REG(ANA_PORT_POL_CFG,              0x007074),
+	REG(ANA_PORT_PTP_CFG,              0x007078),
+	REG(ANA_PORT_PTP_DLY1_CFG,         0x00707c),
+	REG(ANA_OAM_UPM_LM_CNT,            0x007c00),
+	REG(ANA_PORT_PTP_DLY2_CFG,         0x007080),
+	REG(ANA_PFC_PFC_CFG,               0x008800),
+	REG(ANA_PFC_PFC_TIMER,             0x008804),
+	REG(ANA_IPT_OAM_MEP_CFG,           0x008000),
+	REG(ANA_IPT_IPT,                   0x008004),
+	REG(ANA_PPT_PPT,                   0x008ac0),
+	REG(ANA_FID_MAP_FID_MAP,           0x000000),
+	REG(ANA_AGGR_CFG,                  0x0090b4),
+	REG(ANA_CPUQ_CFG,                  0x0090b8),
+	REG(ANA_CPUQ_CFG2,                 0x0090bc),
+	REG(ANA_CPUQ_8021_CFG,             0x0090c0),
+	REG(ANA_DSCP_CFG,                  0x009100),
+	REG(ANA_DSCP_REWR_CFG,             0x009200),
+	REG(ANA_VCAP_RNG_TYPE_CFG,         0x009240),
+	REG(ANA_VCAP_RNG_VAL_CFG,          0x009260),
+	REG(ANA_VRAP_CFG,                  0x009280),
+	REG(ANA_VRAP_HDR_DATA,             0x009284),
+	REG(ANA_VRAP_HDR_MASK,             0x009288),
+	REG(ANA_DISCARD_CFG,               0x00928c),
+	REG(ANA_FID_CFG,                   0x009290),
+	REG(ANA_POL_PIR_CFG,               0x004000),
+	REG(ANA_POL_CIR_CFG,               0x004004),
+	REG(ANA_POL_MODE_CFG,              0x004008),
+	REG(ANA_POL_PIR_STATE,             0x00400c),
+	REG(ANA_POL_CIR_STATE,             0x004010),
+	REG(ANA_POL_STATE,                 0x004014),
+	REG(ANA_POL_FLOWC,                 0x008b80),
+	REG(ANA_POL_HYST,                  0x008bec),
+	REG(ANA_POL_MISC_CFG,              0x008bf0),
+};
+
+static const u32 ocelot_qs_regmap[] = {
+	REG(QS_XTR_GRP_CFG,                0x000000),
+	REG(QS_XTR_RD,                     0x000008),
+	REG(QS_XTR_FRM_PRUNING,            0x000010),
+	REG(QS_XTR_FLUSH,                  0x000018),
+	REG(QS_XTR_DATA_PRESENT,           0x00001c),
+	REG(QS_XTR_CFG,                    0x000020),
+	REG(QS_INJ_GRP_CFG,                0x000024),
+	REG(QS_INJ_WR,                     0x00002c),
+	REG(QS_INJ_CTRL,                   0x000034),
+	REG(QS_INJ_STATUS,                 0x00003c),
+	REG(QS_INJ_ERR,                    0x000040),
+	REG(QS_INH_DBG,                    0x000048),
+};
+
+static const u32 ocelot_hsio_regmap[] = {
+	REG(HSIO_PLL5G_CFG0,               0x000000),
+	REG(HSIO_PLL5G_CFG1,               0x000004),
+	REG(HSIO_PLL5G_CFG2,               0x000008),
+	REG(HSIO_PLL5G_CFG3,               0x00000c),
+	REG(HSIO_PLL5G_CFG4,               0x000010),
+	REG(HSIO_PLL5G_CFG5,               0x000014),
+	REG(HSIO_PLL5G_CFG6,               0x000018),
+	REG(HSIO_PLL5G_STATUS0,            0x00001c),
+	REG(HSIO_PLL5G_STATUS1,            0x000020),
+	REG(HSIO_PLL5G_BIST_CFG0,          0x000024),
+	REG(HSIO_PLL5G_BIST_CFG1,          0x000028),
+	REG(HSIO_PLL5G_BIST_CFG2,          0x00002c),
+	REG(HSIO_PLL5G_BIST_STAT0,         0x000030),
+	REG(HSIO_PLL5G_BIST_STAT1,         0x000034),
+	REG(HSIO_RCOMP_CFG0,               0x000038),
+	REG(HSIO_RCOMP_STATUS,             0x00003c),
+	REG(HSIO_SYNC_ETH_CFG,             0x000040),
+	REG(HSIO_SYNC_ETH_PLL_CFG,         0x000048),
+	REG(HSIO_S1G_DES_CFG,              0x00004c),
+	REG(HSIO_S1G_IB_CFG,               0x000050),
+	REG(HSIO_S1G_OB_CFG,               0x000054),
+	REG(HSIO_S1G_SER_CFG,              0x000058),
+	REG(HSIO_S1G_COMMON_CFG,           0x00005c),
+	REG(HSIO_S1G_PLL_CFG,              0x000060),
+	REG(HSIO_S1G_PLL_STATUS,           0x000064),
+	REG(HSIO_S1G_DFT_CFG0,             0x000068),
+	REG(HSIO_S1G_DFT_CFG1,             0x00006c),
+	REG(HSIO_S1G_DFT_CFG2,             0x000070),
+	REG(HSIO_S1G_TP_CFG,               0x000074),
+	REG(HSIO_S1G_RC_PLL_BIST_CFG,      0x000078),
+	REG(HSIO_S1G_MISC_CFG,             0x00007c),
+	REG(HSIO_S1G_DFT_STATUS,           0x000080),
+	REG(HSIO_S1G_MISC_STATUS,          0x000084),
+	REG(HSIO_MCB_S1G_ADDR_CFG,         0x000088),
+	REG(HSIO_S6G_DIG_CFG,              0x00008c),
+	REG(HSIO_S6G_DFT_CFG0,             0x000090),
+	REG(HSIO_S6G_DFT_CFG1,             0x000094),
+	REG(HSIO_S6G_DFT_CFG2,             0x000098),
+	REG(HSIO_S6G_TP_CFG0,              0x00009c),
+	REG(HSIO_S6G_TP_CFG1,              0x0000a0),
+	REG(HSIO_S6G_RC_PLL_BIST_CFG,      0x0000a4),
+	REG(HSIO_S6G_MISC_CFG,             0x0000a8),
+	REG(HSIO_S6G_OB_ANEG_CFG,          0x0000ac),
+	REG(HSIO_S6G_DFT_STATUS,           0x0000b0),
+	REG(HSIO_S6G_ERR_CNT,              0x0000b4),
+	REG(HSIO_S6G_MISC_STATUS,          0x0000b8),
+	REG(HSIO_S6G_DES_CFG,              0x0000bc),
+	REG(HSIO_S6G_IB_CFG,               0x0000c0),
+	REG(HSIO_S6G_IB_CFG1,              0x0000c4),
+	REG(HSIO_S6G_IB_CFG2,              0x0000c8),
+	REG(HSIO_S6G_IB_CFG3,              0x0000cc),
+	REG(HSIO_S6G_IB_CFG4,              0x0000d0),
+	REG(HSIO_S6G_IB_CFG5,              0x0000d4),
+	REG(HSIO_S6G_OB_CFG,               0x0000d8),
+	REG(HSIO_S6G_OB_CFG1,              0x0000dc),
+	REG(HSIO_S6G_SER_CFG,              0x0000e0),
+	REG(HSIO_S6G_COMMON_CFG,           0x0000e4),
+	REG(HSIO_S6G_PLL_CFG,              0x0000e8),
+	REG(HSIO_S6G_ACJTAG_CFG,           0x0000ec),
+	REG(HSIO_S6G_GP_CFG,               0x0000f0),
+	REG(HSIO_S6G_IB_STATUS0,           0x0000f4),
+	REG(HSIO_S6G_IB_STATUS1,           0x0000f8),
+	REG(HSIO_S6G_ACJTAG_STATUS,        0x0000fc),
+	REG(HSIO_S6G_PLL_STATUS,           0x000100),
+	REG(HSIO_S6G_REVID,                0x000104),
+	REG(HSIO_MCB_S6G_ADDR_CFG,         0x000108),
+	REG(HSIO_HW_CFG,                   0x00010c),
+	REG(HSIO_HW_QSGMII_CFG,            0x000110),
+	REG(HSIO_HW_QSGMII_STAT,           0x000114),
+	REG(HSIO_CLK_CFG,                  0x000118),
+	REG(HSIO_TEMP_SENSOR_CTRL,         0x00011c),
+	REG(HSIO_TEMP_SENSOR_CFG,          0x000120),
+	REG(HSIO_TEMP_SENSOR_STAT,         0x000124),
+};
+
+static const u32 ocelot_qsys_regmap[] = {
+	REG(QSYS_PORT_MODE,                0x011200),
+	REG(QSYS_SWITCH_PORT_MODE,         0x011234),
+	REG(QSYS_STAT_CNT_CFG,             0x011264),
+	REG(QSYS_EEE_CFG,                  0x011268),
+	REG(QSYS_EEE_THRES,                0x011294),
+	REG(QSYS_IGR_NO_SHARING,           0x011298),
+	REG(QSYS_EGR_NO_SHARING,           0x01129c),
+	REG(QSYS_SW_STATUS,                0x0112a0),
+	REG(QSYS_EXT_CPU_CFG,              0x0112d0),
+	REG(QSYS_PAD_CFG,                  0x0112d4),
+	REG(QSYS_CPU_GROUP_MAP,            0x0112d8),
+	REG(QSYS_QMAP,                     0x0112dc),
+	REG(QSYS_ISDX_SGRP,                0x011400),
+	REG(QSYS_TIMED_FRAME_ENTRY,        0x014000),
+	REG(QSYS_TFRM_MISC,                0x011310),
+	REG(QSYS_TFRM_PORT_DLY,            0x011314),
+	REG(QSYS_TFRM_TIMER_CFG_1,         0x011318),
+	REG(QSYS_TFRM_TIMER_CFG_2,         0x01131c),
+	REG(QSYS_TFRM_TIMER_CFG_3,         0x011320),
+	REG(QSYS_TFRM_TIMER_CFG_4,         0x011324),
+	REG(QSYS_TFRM_TIMER_CFG_5,         0x011328),
+	REG(QSYS_TFRM_TIMER_CFG_6,         0x01132c),
+	REG(QSYS_TFRM_TIMER_CFG_7,         0x011330),
+	REG(QSYS_TFRM_TIMER_CFG_8,         0x011334),
+	REG(QSYS_RED_PROFILE,              0x011338),
+	REG(QSYS_RES_QOS_MODE,             0x011378),
+	REG(QSYS_RES_CFG,                  0x012000),
+	REG(QSYS_RES_STAT,                 0x012004),
+	REG(QSYS_EGR_DROP_MODE,            0x01137c),
+	REG(QSYS_EQ_CTRL,                  0x011380),
+	REG(QSYS_EVENTS_CORE,              0x011384),
+	REG(QSYS_CIR_CFG,                  0x000000),
+	REG(QSYS_EIR_CFG,                  0x000004),
+	REG(QSYS_SE_CFG,                   0x000008),
+	REG(QSYS_SE_DWRR_CFG,              0x00000c),
+	REG(QSYS_SE_CONNECT,               0x00003c),
+	REG(QSYS_SE_DLB_SENSE,             0x000040),
+	REG(QSYS_CIR_STATE,                0x000044),
+	REG(QSYS_EIR_STATE,                0x000048),
+	REG(QSYS_SE_STATE,                 0x00004c),
+	REG(QSYS_HSCH_MISC_CFG,            0x011388),
+};
+
+static const u32 ocelot_rew_regmap[] = {
+	REG(REW_PORT_VLAN_CFG,             0x000000),
+	REG(REW_TAG_CFG,                   0x000004),
+	REG(REW_PORT_CFG,                  0x000008),
+	REG(REW_DSCP_CFG,                  0x00000c),
+	REG(REW_PCP_DEI_QOS_MAP_CFG,       0x000010),
+	REG(REW_PTP_CFG,                   0x000050),
+	REG(REW_PTP_DLY1_CFG,              0x000054),
+	REG(REW_DSCP_REMAP_DP1_CFG,        0x000690),
+	REG(REW_DSCP_REMAP_CFG,            0x000790),
+	REG(REW_STAT_CFG,                  0x000890),
+	REG(REW_PPT,                       0x000680),
+};
+
+static const u32 ocelot_sys_regmap[] = {
+	REG(SYS_COUNT_RX_OCTETS,           0x000000),
+	REG(SYS_COUNT_RX_UNICAST,          0x000004),
+	REG(SYS_COUNT_RX_MULTICAST,        0x000008),
+	REG(SYS_COUNT_RX_BROADCAST,        0x00000c),
+	REG(SYS_COUNT_RX_SHORTS,           0x000010),
+	REG(SYS_COUNT_RX_FRAGMENTS,        0x000014),
+	REG(SYS_COUNT_RX_JABBERS,          0x000018),
+	REG(SYS_COUNT_RX_CRC_ALIGN_ERRS,   0x00001c),
+	REG(SYS_COUNT_RX_SYM_ERRS,         0x000020),
+	REG(SYS_COUNT_RX_64,               0x000024),
+	REG(SYS_COUNT_RX_65_127,           0x000028),
+	REG(SYS_COUNT_RX_128_255,          0x00002c),
+	REG(SYS_COUNT_RX_256_1023,         0x000030),
+	REG(SYS_COUNT_RX_1024_1526,        0x000034),
+	REG(SYS_COUNT_RX_1527_MAX,         0x000038),
+	REG(SYS_COUNT_RX_PAUSE,            0x00003c),
+	REG(SYS_COUNT_RX_CONTROL,          0x000040),
+	REG(SYS_COUNT_RX_LONGS,            0x000044),
+	REG(SYS_COUNT_RX_CLASSIFIED_DROPS, 0x000048),
+	REG(SYS_COUNT_TX_OCTETS,           0x000100),
+	REG(SYS_COUNT_TX_UNICAST,          0x000104),
+	REG(SYS_COUNT_TX_MULTICAST,        0x000108),
+	REG(SYS_COUNT_TX_BROADCAST,        0x00010c),
+	REG(SYS_COUNT_TX_COLLISION,        0x000110),
+	REG(SYS_COUNT_TX_DROPS,            0x000114),
+	REG(SYS_COUNT_TX_PAUSE,            0x000118),
+	REG(SYS_COUNT_TX_64,               0x00011c),
+	REG(SYS_COUNT_TX_65_127,           0x000120),
+	REG(SYS_COUNT_TX_128_511,          0x000124),
+	REG(SYS_COUNT_TX_512_1023,         0x000128),
+	REG(SYS_COUNT_TX_1024_1526,        0x00012c),
+	REG(SYS_COUNT_TX_1527_MAX,         0x000130),
+	REG(SYS_COUNT_TX_AGING,            0x000170),
+	REG(SYS_RESET_CFG,                 0x000508),
+	REG(SYS_CMID,                      0x00050c),
+	REG(SYS_VLAN_ETYPE_CFG,            0x000510),
+	REG(SYS_PORT_MODE,                 0x000514),
+	REG(SYS_FRONT_PORT_MODE,           0x000548),
+	REG(SYS_FRM_AGING,                 0x000574),
+	REG(SYS_STAT_CFG,                  0x000578),
+	REG(SYS_SW_STATUS,                 0x00057c),
+	REG(SYS_MISC_CFG,                  0x0005ac),
+	REG(SYS_REW_MAC_HIGH_CFG,          0x0005b0),
+	REG(SYS_REW_MAC_LOW_CFG,           0x0005dc),
+	REG(SYS_CM_ADDR,                   0x000500),
+	REG(SYS_CM_DATA,                   0x000504),
+	REG(SYS_PAUSE_CFG,                 0x000608),
+	REG(SYS_PAUSE_TOT_CFG,             0x000638),
+	REG(SYS_ATOP,                      0x00063c),
+	REG(SYS_ATOP_TOT_CFG,              0x00066c),
+	REG(SYS_MAC_FC_CFG,                0x000670),
+	REG(SYS_MMGT,                      0x00069c),
+	REG(SYS_MMGT_FAST,                 0x0006a0),
+	REG(SYS_EVENTS_DIF,                0x0006a4),
+	REG(SYS_EVENTS_CORE,               0x0006b4),
+	REG(SYS_CNT,                       0x000000),
+	REG(SYS_PTP_STATUS,                0x0006b8),
+	REG(SYS_PTP_TXSTAMP,               0x0006bc),
+	REG(SYS_PTP_NXT,                   0x0006c0),
+	REG(SYS_PTP_CFG,                   0x0006c4),
+};
+
+static const u32 *ocelot_regmap[] = {
+	[ANA] = ocelot_ana_regmap,
+	[QS] = ocelot_qs_regmap,
+	[HSIO] = ocelot_hsio_regmap,
+	[QSYS] = ocelot_qsys_regmap,
+	[REW] = ocelot_rew_regmap,
+	[SYS] = ocelot_sys_regmap,
+};
+
+static const struct reg_field ocelot_regfields[] = {
+	[ANA_ADVLEARN_VLAN_CHK] = REG_FIELD(ANA_ADVLEARN, 11, 11),
+	[ANA_ADVLEARN_LEARN_MIRROR] = REG_FIELD(ANA_ADVLEARN, 0, 10),
+	[ANA_ANEVENTS_MSTI_DROP] = REG_FIELD(ANA_ANEVENTS, 27, 27),
+	[ANA_ANEVENTS_ACLKILL] = REG_FIELD(ANA_ANEVENTS, 26, 26),
+	[ANA_ANEVENTS_ACLUSED] = REG_FIELD(ANA_ANEVENTS, 25, 25),
+	[ANA_ANEVENTS_AUTOAGE] = REG_FIELD(ANA_ANEVENTS, 24, 24),
+	[ANA_ANEVENTS_VS2TTL1] = REG_FIELD(ANA_ANEVENTS, 23, 23),
+	[ANA_ANEVENTS_STORM_DROP] = REG_FIELD(ANA_ANEVENTS, 22, 22),
+	[ANA_ANEVENTS_LEARN_DROP] = REG_FIELD(ANA_ANEVENTS, 21, 21),
+	[ANA_ANEVENTS_AGED_ENTRY] = REG_FIELD(ANA_ANEVENTS, 20, 20),
+	[ANA_ANEVENTS_CPU_LEARN_FAILED] = REG_FIELD(ANA_ANEVENTS, 19, 19),
+	[ANA_ANEVENTS_AUTO_LEARN_FAILED] = REG_FIELD(ANA_ANEVENTS, 18, 18),
+	[ANA_ANEVENTS_LEARN_REMOVE] = REG_FIELD(ANA_ANEVENTS, 17, 17),
+	[ANA_ANEVENTS_AUTO_LEARNED] = REG_FIELD(ANA_ANEVENTS, 16, 16),
+	[ANA_ANEVENTS_AUTO_MOVED] = REG_FIELD(ANA_ANEVENTS, 15, 15),
+	[ANA_ANEVENTS_DROPPED] = REG_FIELD(ANA_ANEVENTS, 14, 14),
+	[ANA_ANEVENTS_CLASSIFIED_DROP] = REG_FIELD(ANA_ANEVENTS, 13, 13),
+	[ANA_ANEVENTS_CLASSIFIED_COPY] = REG_FIELD(ANA_ANEVENTS, 12, 12),
+	[ANA_ANEVENTS_VLAN_DISCARD] = REG_FIELD(ANA_ANEVENTS, 11, 11),
+	[ANA_ANEVENTS_FWD_DISCARD] = REG_FIELD(ANA_ANEVENTS, 10, 10),
+	[ANA_ANEVENTS_MULTICAST_FLOOD] = REG_FIELD(ANA_ANEVENTS, 9, 9),
+	[ANA_ANEVENTS_UNICAST_FLOOD] = REG_FIELD(ANA_ANEVENTS, 8, 8),
+	[ANA_ANEVENTS_DEST_KNOWN] = REG_FIELD(ANA_ANEVENTS, 7, 7),
+	[ANA_ANEVENTS_BUCKET3_MATCH] = REG_FIELD(ANA_ANEVENTS, 6, 6),
+	[ANA_ANEVENTS_BUCKET2_MATCH] = REG_FIELD(ANA_ANEVENTS, 5, 5),
+	[ANA_ANEVENTS_BUCKET1_MATCH] = REG_FIELD(ANA_ANEVENTS, 4, 4),
+	[ANA_ANEVENTS_BUCKET0_MATCH] = REG_FIELD(ANA_ANEVENTS, 3, 3),
+	[ANA_ANEVENTS_CPU_OPERATION] = REG_FIELD(ANA_ANEVENTS, 2, 2),
+	[ANA_ANEVENTS_DMAC_LOOKUP] = REG_FIELD(ANA_ANEVENTS, 1, 1),
+	[ANA_ANEVENTS_SMAC_LOOKUP] = REG_FIELD(ANA_ANEVENTS, 0, 0),
+	[ANA_TABLES_MACACCESS_B_DOM] = REG_FIELD(ANA_TABLES_MACACCESS, 18, 18),
+	[ANA_TABLES_MACTINDX_BUCKET] = REG_FIELD(ANA_TABLES_MACTINDX, 10, 11),
+	[ANA_TABLES_MACTINDX_M_INDEX] = REG_FIELD(ANA_TABLES_MACTINDX, 0, 9),
+	[QSYS_TIMED_FRAME_ENTRY_TFRM_VLD] = REG_FIELD(QSYS_TIMED_FRAME_ENTRY, 20, 20),
+	[QSYS_TIMED_FRAME_ENTRY_TFRM_FP] = REG_FIELD(QSYS_TIMED_FRAME_ENTRY, 8, 19),
+	[QSYS_TIMED_FRAME_ENTRY_TFRM_PORTNO] = REG_FIELD(QSYS_TIMED_FRAME_ENTRY, 4, 7),
+	[QSYS_TIMED_FRAME_ENTRY_TFRM_TM_SEL] = REG_FIELD(QSYS_TIMED_FRAME_ENTRY, 1, 3),
+	[QSYS_TIMED_FRAME_ENTRY_TFRM_TM_T] = REG_FIELD(QSYS_TIMED_FRAME_ENTRY, 0, 0),
+	[SYS_RESET_CFG_CORE_ENA] = REG_FIELD(SYS_RESET_CFG, 2, 2),
+	[SYS_RESET_CFG_MEM_ENA] = REG_FIELD(SYS_RESET_CFG, 1, 1),
+	[SYS_RESET_CFG_MEM_INIT] = REG_FIELD(SYS_RESET_CFG, 0, 0),
+};
+
+static const struct ocelot_stat_layout ocelot_stats_layout[] = {
+	{ .name = "rx_octets", .offset = 0x00, },
+	{ .name = "rx_unicast", .offset = 0x01, },
+	{ .name = "rx_multicast", .offset = 0x02, },
+	{ .name = "rx_broadcast", .offset = 0x03, },
+	{ .name = "rx_shorts", .offset = 0x04, },
+	{ .name = "rx_fragments", .offset = 0x05, },
+	{ .name = "rx_jabbers", .offset = 0x06, },
+	{ .name = "rx_crc_align_errs", .offset = 0x07, },
+	{ .name = "rx_sym_errs", .offset = 0x08, },
+	{ .name = "rx_frames_below_65_octets", .offset = 0x09, },
+	{ .name = "rx_frames_65_to_127_octets", .offset = 0x0A, },
+	{ .name = "rx_frames_128_to_255_octets", .offset = 0x0B, },
+	{ .name = "rx_frames_256_to_511_octets", .offset = 0x0C, },
+	{ .name = "rx_frames_512_to_1023_octets", .offset = 0x0D, },
+	{ .name = "rx_frames_1024_to_1526_octets", .offset = 0x0E, },
+	{ .name = "rx_frames_over_1526_octets", .offset = 0x0F, },
+	{ .name = "rx_pause", .offset = 0x10, },
+	{ .name = "rx_control", .offset = 0x11, },
+	{ .name = "rx_longs", .offset = 0x12, },
+	{ .name = "rx_classified_drops", .offset = 0x13, },
+	{ .name = "rx_red_prio_0", .offset = 0x14, },
+	{ .name = "rx_red_prio_1", .offset = 0x15, },
+	{ .name = "rx_red_prio_2", .offset = 0x16, },
+	{ .name = "rx_red_prio_3", .offset = 0x17, },
+	{ .name = "rx_red_prio_4", .offset = 0x18, },
+	{ .name = "rx_red_prio_5", .offset = 0x19, },
+	{ .name = "rx_red_prio_6", .offset = 0x1A, },
+	{ .name = "rx_red_prio_7", .offset = 0x1B, },
+	{ .name = "rx_yellow_prio_0", .offset = 0x1C, },
+	{ .name = "rx_yellow_prio_1", .offset = 0x1D, },
+	{ .name = "rx_yellow_prio_2", .offset = 0x1E, },
+	{ .name = "rx_yellow_prio_3", .offset = 0x1F, },
+	{ .name = "rx_yellow_prio_4", .offset = 0x20, },
+	{ .name = "rx_yellow_prio_5", .offset = 0x21, },
+	{ .name = "rx_yellow_prio_6", .offset = 0x22, },
+	{ .name = "rx_yellow_prio_7", .offset = 0x23, },
+	{ .name = "rx_green_prio_0", .offset = 0x24, },
+	{ .name = "rx_green_prio_1", .offset = 0x25, },
+	{ .name = "rx_green_prio_2", .offset = 0x26, },
+	{ .name = "rx_green_prio_3", .offset = 0x27, },
+	{ .name = "rx_green_prio_4", .offset = 0x28, },
+	{ .name = "rx_green_prio_5", .offset = 0x29, },
+	{ .name = "rx_green_prio_6", .offset = 0x2A, },
+	{ .name = "rx_green_prio_7", .offset = 0x2B, },
+	{ .name = "tx_octets", .offset = 0x40, },
+	{ .name = "tx_unicast", .offset = 0x41, },
+	{ .name = "tx_multicast", .offset = 0x42, },
+	{ .name = "tx_broadcast", .offset = 0x43, },
+	{ .name = "tx_collision", .offset = 0x44, },
+	{ .name = "tx_drops", .offset = 0x45, },
+	{ .name = "tx_pause", .offset = 0x46, },
+	{ .name = "tx_frames_below_65_octets", .offset = 0x47, },
+	{ .name = "tx_frames_65_to_127_octets", .offset = 0x48, },
+	{ .name = "tx_frames_128_255_octets", .offset = 0x49, },
+	{ .name = "tx_frames_256_511_octets", .offset = 0x4A, },
+	{ .name = "tx_frames_512_1023_octets", .offset = 0x4B, },
+	{ .name = "tx_frames_1024_1526_octets", .offset = 0x4C, },
+	{ .name = "tx_frames_over_1526_octets", .offset = 0x4D, },
+	{ .name = "tx_yellow_prio_0", .offset = 0x4E, },
+	{ .name = "tx_yellow_prio_1", .offset = 0x4F, },
+	{ .name = "tx_yellow_prio_2", .offset = 0x50, },
+	{ .name = "tx_yellow_prio_3", .offset = 0x51, },
+	{ .name = "tx_yellow_prio_4", .offset = 0x52, },
+	{ .name = "tx_yellow_prio_5", .offset = 0x53, },
+	{ .name = "tx_yellow_prio_6", .offset = 0x54, },
+	{ .name = "tx_yellow_prio_7", .offset = 0x55, },
+	{ .name = "tx_green_prio_0", .offset = 0x56, },
+	{ .name = "tx_green_prio_1", .offset = 0x57, },
+	{ .name = "tx_green_prio_2", .offset = 0x58, },
+	{ .name = "tx_green_prio_3", .offset = 0x59, },
+	{ .name = "tx_green_prio_4", .offset = 0x5A, },
+	{ .name = "tx_green_prio_5", .offset = 0x5B, },
+	{ .name = "tx_green_prio_6", .offset = 0x5C, },
+	{ .name = "tx_green_prio_7", .offset = 0x5D, },
+	{ .name = "tx_aged", .offset = 0x5E, },
+	{ .name = "drop_local", .offset = 0x80, },
+	{ .name = "drop_tail", .offset = 0x81, },
+	{ .name = "drop_yellow_prio_0", .offset = 0x82, },
+	{ .name = "drop_yellow_prio_1", .offset = 0x83, },
+	{ .name = "drop_yellow_prio_2", .offset = 0x84, },
+	{ .name = "drop_yellow_prio_3", .offset = 0x85, },
+	{ .name = "drop_yellow_prio_4", .offset = 0x86, },
+	{ .name = "drop_yellow_prio_5", .offset = 0x87, },
+	{ .name = "drop_yellow_prio_6", .offset = 0x88, },
+	{ .name = "drop_yellow_prio_7", .offset = 0x89, },
+	{ .name = "drop_green_prio_0", .offset = 0x8A, },
+	{ .name = "drop_green_prio_1", .offset = 0x8B, },
+	{ .name = "drop_green_prio_2", .offset = 0x8C, },
+	{ .name = "drop_green_prio_3", .offset = 0x8D, },
+	{ .name = "drop_green_prio_4", .offset = 0x8E, },
+	{ .name = "drop_green_prio_5", .offset = 0x8F, },
+	{ .name = "drop_green_prio_6", .offset = 0x90, },
+	{ .name = "drop_green_prio_7", .offset = 0x91, },
+};
+
+static void ocelot_pll5_init(struct ocelot *ocelot)
+{
+	/* Configure PLL5. This will need a proper CCF driver
+	 * The values are coming from the VTSS API for Ocelot
+	 */
+	ocelot_write(ocelot, HSIO_PLL5G_CFG4_IB_CTRL(0x7600) |
+		     HSIO_PLL5G_CFG4_IB_BIAS_CTRL(0x8), HSIO_PLL5G_CFG4);
+	ocelot_write(ocelot, HSIO_PLL5G_CFG0_CORE_CLK_DIV(0x11) |
+		     HSIO_PLL5G_CFG0_CPU_CLK_DIV(2) |
+		     HSIO_PLL5G_CFG0_ENA_BIAS |
+		     HSIO_PLL5G_CFG0_ENA_VCO_BUF |
+		     HSIO_PLL5G_CFG0_ENA_CP1 |
+		     HSIO_PLL5G_CFG0_SELCPI(2) |
+		     HSIO_PLL5G_CFG0_LOOP_BW_RES(0xe) |
+		     HSIO_PLL5G_CFG0_SELBGV820(4) |
+		     HSIO_PLL5G_CFG0_DIV4 |
+		     HSIO_PLL5G_CFG0_ENA_CLKTREE |
+		     HSIO_PLL5G_CFG0_ENA_LANE, HSIO_PLL5G_CFG0);
+	ocelot_write(ocelot, HSIO_PLL5G_CFG2_EN_RESET_FRQ_DET |
+		     HSIO_PLL5G_CFG2_EN_RESET_OVERRUN |
+		     HSIO_PLL5G_CFG2_GAIN_TEST(0x8) |
+		     HSIO_PLL5G_CFG2_ENA_AMPCTRL |
+		     HSIO_PLL5G_CFG2_PWD_AMPCTRL_N |
+		     HSIO_PLL5G_CFG2_AMPC_SEL(0x10), HSIO_PLL5G_CFG2);
+}
+
+int ocelot_chip_init(struct ocelot *ocelot)
+{
+	int ret;
+
+	ocelot->map = ocelot_regmap;
+	ocelot->stats_layout = ocelot_stats_layout;
+	ocelot->num_stats = ARRAY_SIZE(ocelot_stats_layout);
+	ocelot->shared_queue_sz = 224 * 1024;
+
+	ret = ocelot_regfields_init(ocelot, ocelot_regfields);
+	if (ret)
+		return ret;
+
+	ocelot_pll5_init(ocelot);
+
+	eth_random_addr(ocelot->base_mac);
+	ocelot->base_mac[5] &= 0xf0;
+
+	return 0;
+}
+EXPORT_SYMBOL(ocelot_chip_init);
diff --git a/drivers/net/ethernet/mscc/ocelot_rew.h b/drivers/net/ethernet/mscc/ocelot_rew.h
new file mode 100644
index 000000000000..210914b7e20f
--- /dev/null
+++ b/drivers/net/ethernet/mscc/ocelot_rew.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */
+/*
+ * Microsemi Ocelot Switch driver
+ *
+ * Copyright (c) 2017 Microsemi Corporation
+ */
+
+#ifndef _MSCC_OCELOT_REW_H_
+#define _MSCC_OCELOT_REW_H_
+
+#define REW_PORT_VLAN_CFG_GSZ                             0x80
+
+#define REW_PORT_VLAN_CFG_PORT_TPID(x)                    (((x) << 16) & GENMASK(31, 16))
+#define REW_PORT_VLAN_CFG_PORT_TPID_M                     GENMASK(31, 16)
+#define REW_PORT_VLAN_CFG_PORT_TPID_X(x)                  (((x) & GENMASK(31, 16)) >> 16)
+#define REW_PORT_VLAN_CFG_PORT_DEI                        BIT(15)
+#define REW_PORT_VLAN_CFG_PORT_PCP(x)                     (((x) << 12) & GENMASK(14, 12))
+#define REW_PORT_VLAN_CFG_PORT_PCP_M                      GENMASK(14, 12)
+#define REW_PORT_VLAN_CFG_PORT_PCP_X(x)                   (((x) & GENMASK(14, 12)) >> 12)
+#define REW_PORT_VLAN_CFG_PORT_VID(x)                     ((x) & GENMASK(11, 0))
+#define REW_PORT_VLAN_CFG_PORT_VID_M                      GENMASK(11, 0)
+
+#define REW_TAG_CFG_GSZ                                   0x80
+
+#define REW_TAG_CFG_TAG_CFG(x)                            (((x) << 7) & GENMASK(8, 7))
+#define REW_TAG_CFG_TAG_CFG_M                             GENMASK(8, 7)
+#define REW_TAG_CFG_TAG_CFG_X(x)                          (((x) & GENMASK(8, 7)) >> 7)
+#define REW_TAG_CFG_TAG_TPID_CFG(x)                       (((x) << 5) & GENMASK(6, 5))
+#define REW_TAG_CFG_TAG_TPID_CFG_M                        GENMASK(6, 5)
+#define REW_TAG_CFG_TAG_TPID_CFG_X(x)                     (((x) & GENMASK(6, 5)) >> 5)
+#define REW_TAG_CFG_TAG_VID_CFG                           BIT(4)
+#define REW_TAG_CFG_TAG_PCP_CFG(x)                        (((x) << 2) & GENMASK(3, 2))
+#define REW_TAG_CFG_TAG_PCP_CFG_M                         GENMASK(3, 2)
+#define REW_TAG_CFG_TAG_PCP_CFG_X(x)                      (((x) & GENMASK(3, 2)) >> 2)
+#define REW_TAG_CFG_TAG_DEI_CFG(x)                        ((x) & GENMASK(1, 0))
+#define REW_TAG_CFG_TAG_DEI_CFG_M                         GENMASK(1, 0)
+
+#define REW_PORT_CFG_GSZ                                  0x80
+
+#define REW_PORT_CFG_ES0_EN                               BIT(5)
+#define REW_PORT_CFG_FCS_UPDATE_NONCPU_CFG(x)             (((x) << 3) & GENMASK(4, 3))
+#define REW_PORT_CFG_FCS_UPDATE_NONCPU_CFG_M              GENMASK(4, 3)
+#define REW_PORT_CFG_FCS_UPDATE_NONCPU_CFG_X(x)           (((x) & GENMASK(4, 3)) >> 3)
+#define REW_PORT_CFG_FCS_UPDATE_CPU_ENA                   BIT(2)
+#define REW_PORT_CFG_FLUSH_ENA                            BIT(1)
+#define REW_PORT_CFG_AGE_DIS                              BIT(0)
+
+#define REW_DSCP_CFG_GSZ                                  0x80
+
+#define REW_PCP_DEI_QOS_MAP_CFG_GSZ                       0x80
+#define REW_PCP_DEI_QOS_MAP_CFG_RSZ                       0x4
+
+#define REW_PCP_DEI_QOS_MAP_CFG_DEI_QOS_VAL               BIT(3)
+#define REW_PCP_DEI_QOS_MAP_CFG_PCP_QOS_VAL(x)            ((x) & GENMASK(2, 0))
+#define REW_PCP_DEI_QOS_MAP_CFG_PCP_QOS_VAL_M             GENMASK(2, 0)
+
+#define REW_PTP_CFG_GSZ                                   0x80
+
+#define REW_PTP_CFG_PTP_BACKPLANE_MODE                    BIT(7)
+#define REW_PTP_CFG_GP_CFG_UNUSED(x)                      (((x) << 3) & GENMASK(6, 3))
+#define REW_PTP_CFG_GP_CFG_UNUSED_M                       GENMASK(6, 3)
+#define REW_PTP_CFG_GP_CFG_UNUSED_X(x)                    (((x) & GENMASK(6, 3)) >> 3)
+#define REW_PTP_CFG_PTP_1STEP_DIS                         BIT(2)
+#define REW_PTP_CFG_PTP_2STEP_DIS                         BIT(1)
+#define REW_PTP_CFG_PTP_UDP_KEEP                          BIT(0)
+
+#define REW_PTP_DLY1_CFG_GSZ                              0x80
+
+#define REW_RED_TAG_CFG_GSZ                               0x80
+
+#define REW_RED_TAG_CFG_RED_TAG_CFG                       BIT(0)
+
+#define REW_DSCP_REMAP_DP1_CFG_RSZ                        0x4
+
+#define REW_DSCP_REMAP_CFG_RSZ                            0x4
+
+#define REW_REW_STICKY_ES0_TAGB_PUSH_FAILED               BIT(0)
+
+#define REW_PPT_RSZ                                       0x4
+
+#endif
diff --git a/drivers/net/ethernet/mscc/ocelot_sys.h b/drivers/net/ethernet/mscc/ocelot_sys.h
new file mode 100644
index 000000000000..16f91e172bcb
--- /dev/null
+++ b/drivers/net/ethernet/mscc/ocelot_sys.h
@@ -0,0 +1,144 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */
+/*
+ * Microsemi Ocelot Switch driver
+ *
+ * Copyright (c) 2017 Microsemi Corporation
+ */
+
+#ifndef _MSCC_OCELOT_SYS_H_
+#define _MSCC_OCELOT_SYS_H_
+
+#define SYS_COUNT_RX_OCTETS_RSZ                           0x4
+
+#define SYS_COUNT_TX_OCTETS_RSZ                           0x4
+
+#define SYS_PORT_MODE_RSZ                                 0x4
+
+#define SYS_PORT_MODE_DATA_WO_TS(x)                       (((x) << 5) & GENMASK(6, 5))
+#define SYS_PORT_MODE_DATA_WO_TS_M                        GENMASK(6, 5)
+#define SYS_PORT_MODE_DATA_WO_TS_X(x)                     (((x) & GENMASK(6, 5)) >> 5)
+#define SYS_PORT_MODE_INCL_INJ_HDR(x)                     (((x) << 3) & GENMASK(4, 3))
+#define SYS_PORT_MODE_INCL_INJ_HDR_M                      GENMASK(4, 3)
+#define SYS_PORT_MODE_INCL_INJ_HDR_X(x)                   (((x) & GENMASK(4, 3)) >> 3)
+#define SYS_PORT_MODE_INCL_XTR_HDR(x)                     (((x) << 1) & GENMASK(2, 1))
+#define SYS_PORT_MODE_INCL_XTR_HDR_M                      GENMASK(2, 1)
+#define SYS_PORT_MODE_INCL_XTR_HDR_X(x)                   (((x) & GENMASK(2, 1)) >> 1)
+#define SYS_PORT_MODE_INJ_HDR_ERR                         BIT(0)
+
+#define SYS_FRONT_PORT_MODE_RSZ                           0x4
+
+#define SYS_FRONT_PORT_MODE_HDX_MODE                      BIT(0)
+
+#define SYS_FRM_AGING_AGE_TX_ENA                          BIT(20)
+#define SYS_FRM_AGING_MAX_AGE(x)                          ((x) & GENMASK(19, 0))
+#define SYS_FRM_AGING_MAX_AGE_M                           GENMASK(19, 0)
+
+#define SYS_STAT_CFG_STAT_CLEAR_SHOT(x)                   (((x) << 10) & GENMASK(16, 10))
+#define SYS_STAT_CFG_STAT_CLEAR_SHOT_M                    GENMASK(16, 10)
+#define SYS_STAT_CFG_STAT_CLEAR_SHOT_X(x)                 (((x) & GENMASK(16, 10)) >> 10)
+#define SYS_STAT_CFG_STAT_VIEW(x)                         ((x) & GENMASK(9, 0))
+#define SYS_STAT_CFG_STAT_VIEW_M                          GENMASK(9, 0)
+
+#define SYS_SW_STATUS_RSZ                                 0x4
+
+#define SYS_SW_STATUS_PORT_RX_PAUSED                      BIT(0)
+
+#define SYS_MISC_CFG_PTP_RSRV_CLR                         BIT(1)
+#define SYS_MISC_CFG_PTP_DIS_NEG_RO                       BIT(0)
+
+#define SYS_REW_MAC_HIGH_CFG_RSZ                          0x4
+
+#define SYS_REW_MAC_LOW_CFG_RSZ                           0x4
+
+#define SYS_TIMESTAMP_OFFSET_ETH_TYPE_CFG(x)              (((x) << 6) & GENMASK(21, 6))
+#define SYS_TIMESTAMP_OFFSET_ETH_TYPE_CFG_M               GENMASK(21, 6)
+#define SYS_TIMESTAMP_OFFSET_ETH_TYPE_CFG_X(x)            (((x) & GENMASK(21, 6)) >> 6)
+#define SYS_TIMESTAMP_OFFSET_TIMESTAMP_OFFSET(x)          ((x) & GENMASK(5, 0))
+#define SYS_TIMESTAMP_OFFSET_TIMESTAMP_OFFSET_M           GENMASK(5, 0)
+
+#define SYS_PAUSE_CFG_RSZ                                 0x4
+
+#define SYS_PAUSE_CFG_PAUSE_START(x)                      (((x) << 10) & GENMASK(18, 10))
+#define SYS_PAUSE_CFG_PAUSE_START_M                       GENMASK(18, 10)
+#define SYS_PAUSE_CFG_PAUSE_START_X(x)                    (((x) & GENMASK(18, 10)) >> 10)
+#define SYS_PAUSE_CFG_PAUSE_STOP(x)                       (((x) << 1) & GENMASK(9, 1))
+#define SYS_PAUSE_CFG_PAUSE_STOP_M                        GENMASK(9, 1)
+#define SYS_PAUSE_CFG_PAUSE_STOP_X(x)                     (((x) & GENMASK(9, 1)) >> 1)
+#define SYS_PAUSE_CFG_PAUSE_ENA                           BIT(0)
+
+#define SYS_PAUSE_TOT_CFG_PAUSE_TOT_START(x)              (((x) << 9) & GENMASK(17, 9))
+#define SYS_PAUSE_TOT_CFG_PAUSE_TOT_START_M               GENMASK(17, 9)
+#define SYS_PAUSE_TOT_CFG_PAUSE_TOT_START_X(x)            (((x) & GENMASK(17, 9)) >> 9)
+#define SYS_PAUSE_TOT_CFG_PAUSE_TOT_STOP(x)               ((x) & GENMASK(8, 0))
+#define SYS_PAUSE_TOT_CFG_PAUSE_TOT_STOP_M                GENMASK(8, 0)
+
+#define SYS_ATOP_RSZ                                      0x4
+
+#define SYS_MAC_FC_CFG_RSZ                                0x4
+
+#define SYS_MAC_FC_CFG_FC_LINK_SPEED(x)                   (((x) << 26) & GENMASK(27, 26))
+#define SYS_MAC_FC_CFG_FC_LINK_SPEED_M                    GENMASK(27, 26)
+#define SYS_MAC_FC_CFG_FC_LINK_SPEED_X(x)                 (((x) & GENMASK(27, 26)) >> 26)
+#define SYS_MAC_FC_CFG_FC_LATENCY_CFG(x)                  (((x) << 20) & GENMASK(25, 20))
+#define SYS_MAC_FC_CFG_FC_LATENCY_CFG_M                   GENMASK(25, 20)
+#define SYS_MAC_FC_CFG_FC_LATENCY_CFG_X(x)                (((x) & GENMASK(25, 20)) >> 20)
+#define SYS_MAC_FC_CFG_ZERO_PAUSE_ENA                     BIT(18)
+#define SYS_MAC_FC_CFG_TX_FC_ENA                          BIT(17)
+#define SYS_MAC_FC_CFG_RX_FC_ENA                          BIT(16)
+#define SYS_MAC_FC_CFG_PAUSE_VAL_CFG(x)                   ((x) & GENMASK(15, 0))
+#define SYS_MAC_FC_CFG_PAUSE_VAL_CFG_M                    GENMASK(15, 0)
+
+#define SYS_MMGT_RELCNT(x)                                (((x) << 16) & GENMASK(31, 16))
+#define SYS_MMGT_RELCNT_M                                 GENMASK(31, 16)
+#define SYS_MMGT_RELCNT_X(x)                              (((x) & GENMASK(31, 16)) >> 16)
+#define SYS_MMGT_FREECNT(x)                               ((x) & GENMASK(15, 0))
+#define SYS_MMGT_FREECNT_M                                GENMASK(15, 0)
+
+#define SYS_MMGT_FAST_FREEVLD(x)                          (((x) << 4) & GENMASK(7, 4))
+#define SYS_MMGT_FAST_FREEVLD_M                           GENMASK(7, 4)
+#define SYS_MMGT_FAST_FREEVLD_X(x)                        (((x) & GENMASK(7, 4)) >> 4)
+#define SYS_MMGT_FAST_RELVLD(x)                           ((x) & GENMASK(3, 0))
+#define SYS_MMGT_FAST_RELVLD_M                            GENMASK(3, 0)
+
+#define SYS_EVENTS_DIF_RSZ                                0x4
+
+#define SYS_EVENTS_DIF_EV_DRX(x)                          (((x) << 6) & GENMASK(8, 6))
+#define SYS_EVENTS_DIF_EV_DRX_M                           GENMASK(8, 6)
+#define SYS_EVENTS_DIF_EV_DRX_X(x)                        (((x) & GENMASK(8, 6)) >> 6)
+#define SYS_EVENTS_DIF_EV_DTX(x)                          ((x) & GENMASK(5, 0))
+#define SYS_EVENTS_DIF_EV_DTX_M                           GENMASK(5, 0)
+
+#define SYS_EVENTS_CORE_EV_FWR                            BIT(2)
+#define SYS_EVENTS_CORE_EV_ANA(x)                         ((x) & GENMASK(1, 0))
+#define SYS_EVENTS_CORE_EV_ANA_M                          GENMASK(1, 0)
+
+#define SYS_CNT_GSZ                                       0x4
+
+#define SYS_PTP_STATUS_PTP_TXSTAMP_OAM                    BIT(29)
+#define SYS_PTP_STATUS_PTP_OVFL                           BIT(28)
+#define SYS_PTP_STATUS_PTP_MESS_VLD                       BIT(27)
+#define SYS_PTP_STATUS_PTP_MESS_ID(x)                     (((x) << 21) & GENMASK(26, 21))
+#define SYS_PTP_STATUS_PTP_MESS_ID_M                      GENMASK(26, 21)
+#define SYS_PTP_STATUS_PTP_MESS_ID_X(x)                   (((x) & GENMASK(26, 21)) >> 21)
+#define SYS_PTP_STATUS_PTP_MESS_TXPORT(x)                 (((x) << 16) & GENMASK(20, 16))
+#define SYS_PTP_STATUS_PTP_MESS_TXPORT_M                  GENMASK(20, 16)
+#define SYS_PTP_STATUS_PTP_MESS_TXPORT_X(x)               (((x) & GENMASK(20, 16)) >> 16)
+#define SYS_PTP_STATUS_PTP_MESS_SEQ_ID(x)                 ((x) & GENMASK(15, 0))
+#define SYS_PTP_STATUS_PTP_MESS_SEQ_ID_M                  GENMASK(15, 0)
+
+#define SYS_PTP_TXSTAMP_PTP_TXSTAMP(x)                    ((x) & GENMASK(29, 0))
+#define SYS_PTP_TXSTAMP_PTP_TXSTAMP_M                     GENMASK(29, 0)
+#define SYS_PTP_TXSTAMP_PTP_TXSTAMP_SEC                   BIT(31)
+
+#define SYS_PTP_NXT_PTP_NXT                               BIT(0)
+
+#define SYS_PTP_CFG_PTP_STAMP_WID(x)                      (((x) << 2) & GENMASK(7, 2))
+#define SYS_PTP_CFG_PTP_STAMP_WID_M                       GENMASK(7, 2)
+#define SYS_PTP_CFG_PTP_STAMP_WID_X(x)                    (((x) & GENMASK(7, 2)) >> 2)
+#define SYS_PTP_CFG_PTP_CF_ROLL_MODE(x)                   ((x) & GENMASK(1, 0))
+#define SYS_PTP_CFG_PTP_CF_ROLL_MODE_M                    GENMASK(1, 0)
+
+#define SYS_RAM_INIT_RAM_INIT                             BIT(1)
+#define SYS_RAM_INIT_RAM_CFG_HOOK                         BIT(0)
+
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From 0ce60edd78d3efa8570362846ea76ce1beb3c2b8 Mon Sep 17 00:00:00 2001
From: Alexandre Belloni <alexandre.belloni@bootlin.com>
Date: Mon, 14 May 2018 22:05:00 +0200
Subject: MAINTAINERS: Add entry for Microsemi Ethernet switches

Add myself as a maintainer for the Microsemi Ethernet switches.

Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index c6989d04afef..658880464b9d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9279,6 +9279,12 @@ F:	include/linux/cciss*.h
 F:	include/uapi/linux/cciss*.h
 F:	Documentation/scsi/smartpqi.txt
 
+MICROSEMI ETHERNET SWITCH DRIVER
+M:	Alexandre Belloni <alexandre.belloni@bootlin.com>
+L:	netdev@vger.kernel.org
+S:	Supported
+F:	drivers/net/ethernet/mscc/
+
 MICROSOFT SURFACE PRO 3 BUTTON DRIVER
 M:	Chen Yu <yu.c.chen@intel.com>
 L:	platform-driver-x86@vger.kernel.org
-- 
cgit v1.2.3-59-g8ed1b


From 2312e050f42b0fcdc8a49bd11df1d3015859f2ab Mon Sep 17 00:00:00 2001
From: Fuyun Liang <liangfuyun1@huawei.com>
Date: Tue, 15 May 2018 19:20:05 +0100
Subject: net: hns3: Fix for deadlock problem occurring when unregistering
 ae_algo

When hnae3_unregister_ae_algo is called by PF, pci_disable_sriov is
called. And then, hns3_remove is called by VF. We get deadlocked in
this case.

Since VF pci device is dependent on PF pci device, When PF pci device
is removed, VF pci device must be removed. Also, To solve the deadlock
problem, VF pci device should be removed before PF pci device is removed.

This patch moves pci_enable/disable_sriov from hclge to hns3 to solve
the deadlock problem.

Also, we do not need to return EPROBE_DEFER in hnae3_register_ae_dev,
because SRIOV is no longer enabled in the context calling
hnae3_register_ae_dev. Mutex_trylock can be replaced with mutex_lock.

Fixes: 424eb834a9be ("net: hns3: Unified HNS3 {VF|PF} Ethernet Driver for hip08 SoC")
Signed-off-by: Fuyun Liang <liangfuyun1@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hnae3.c        | 12 +---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c    | 74 +++++++++++++++++++++-
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    | 42 ++----------
 3 files changed, 79 insertions(+), 49 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.c b/drivers/net/ethernet/hisilicon/hns3/hnae3.c
index 02145f2de820..1686cebead96 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.c
@@ -196,17 +196,9 @@ int hnae3_register_ae_dev(struct hnae3_ae_dev *ae_dev)
 	const struct pci_device_id *id;
 	struct hnae3_ae_algo *ae_algo;
 	struct hnae3_client *client;
-	int ret = 0, lock_acquired;
+	int ret = 0;
 
-	/* we can get deadlocked if SRIOV is being enabled in context to probe
-	 * and probe gets called again in same context. This can happen when
-	 * pci_enable_sriov() is called to create VFs from PF probes context.
-	 * Therefore, for simplicity uniformly defering further probing in all
-	 * cases where we detect contention.
-	 */
-	lock_acquired = mutex_trylock(&hnae3_common_lock);
-	if (!lock_acquired)
-		return -EPROBE_DEFER;
+	mutex_lock(&hnae3_common_lock);
 
 	list_add_tail(&ae_dev->node, &hnae3_ae_dev_list);
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 4031174181a0..af9e90f87109 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -1487,6 +1487,68 @@ static const struct net_device_ops hns3_nic_netdev_ops = {
 	.ndo_set_vf_vlan	= hns3_ndo_set_vf_vlan,
 };
 
+static bool hns3_is_phys_func(struct pci_dev *pdev)
+{
+	u32 dev_id = pdev->device;
+
+	switch (dev_id) {
+	case HNAE3_DEV_ID_GE:
+	case HNAE3_DEV_ID_25GE:
+	case HNAE3_DEV_ID_25GE_RDMA:
+	case HNAE3_DEV_ID_25GE_RDMA_MACSEC:
+	case HNAE3_DEV_ID_50GE_RDMA:
+	case HNAE3_DEV_ID_50GE_RDMA_MACSEC:
+	case HNAE3_DEV_ID_100G_RDMA_MACSEC:
+		return true;
+	case HNAE3_DEV_ID_100G_VF:
+	case HNAE3_DEV_ID_100G_RDMA_DCB_PFC_VF:
+		return false;
+	default:
+		dev_warn(&pdev->dev, "un-recognized pci device-id %d",
+			 dev_id);
+	}
+
+	return false;
+}
+
+static int get_num_req_vfs(struct pci_dev *pdev)
+{
+	/* a variable vf num will be supported later */
+	return pci_sriov_get_totalvfs(pdev);
+}
+
+static void hns3_enable_sriov(struct pci_dev *pdev)
+{
+	int num_req_vfs = get_num_req_vfs(pdev);
+	int ret;
+
+	/* Enable SRIOV */
+	if (!num_req_vfs)
+		return;
+
+	dev_info(&pdev->dev, "active VFs(%d) found, enabling SRIOV\n",
+		 num_req_vfs);
+
+	ret = pci_enable_sriov(pdev, num_req_vfs);
+	if (ret)
+		dev_err(&pdev->dev, "SRIOV enable failed %d\n", ret);
+}
+
+static void hns3_disable_sriov(struct pci_dev *pdev)
+{
+	/* If our VFs are assigned we cannot shut down SR-IOV
+	 * without causing issues, so just leave the hardware
+	 * available but disabled
+	 */
+	if (pci_vfs_assigned(pdev)) {
+		dev_warn(&pdev->dev,
+			 "disabling driver while VFs are assigned\n");
+		return;
+	}
+
+	pci_disable_sriov(pdev);
+}
+
 /* hns3_probe - Device initialization routine
  * @pdev: PCI device information struct
  * @ent: entry in hns3_pci_tbl
@@ -1514,7 +1576,14 @@ static int hns3_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	ae_dev->dev_type = HNAE3_DEV_KNIC;
 	pci_set_drvdata(pdev, ae_dev);
 
-	return hnae3_register_ae_dev(ae_dev);
+	ret = hnae3_register_ae_dev(ae_dev);
+	if (ret)
+		return ret;
+
+	if (hns3_is_phys_func(pdev) && IS_ENABLED(CONFIG_PCI_IOV))
+		hns3_enable_sriov(pdev);
+
+	return 0;
 }
 
 /* hns3_remove - Device removal routine
@@ -1524,6 +1593,9 @@ static void hns3_remove(struct pci_dev *pdev)
 {
 	struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev);
 
+	if (hns3_is_phys_func(pdev) && IS_ENABLED(CONFIG_PCI_IOV))
+		hns3_disable_sriov(pdev);
+
 	hnae3_unregister_ae_dev(ae_dev);
 }
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 316ec8427891..343197a57240 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -1473,21 +1473,8 @@ static int hclge_alloc_vport(struct hclge_dev *hdev)
 	hdev->vport = vport;
 	hdev->num_alloc_vport = num_vport;
 
-#ifdef CONFIG_PCI_IOV
-	/* Enable SRIOV */
-	if (hdev->num_req_vfs) {
-		dev_info(&pdev->dev, "active VFs(%d) found, enabling SRIOV\n",
-			 hdev->num_req_vfs);
-		ret = pci_enable_sriov(hdev->pdev, hdev->num_req_vfs);
-		if (ret) {
-			hdev->num_alloc_vfs = 0;
-			dev_err(&pdev->dev, "SRIOV enable failed %d\n",
-				ret);
-			return ret;
-		}
-	}
-	hdev->num_alloc_vfs = hdev->num_req_vfs;
-#endif
+	if (IS_ENABLED(CONFIG_PCI_IOV))
+		hdev->num_alloc_vfs = hdev->num_req_vfs;
 
 	for (i = 0; i < num_vport; i++) {
 		vport->back = hdev;
@@ -2946,21 +2933,6 @@ static void hclge_service_task(struct work_struct *work)
 	hclge_service_complete(hdev);
 }
 
-static void hclge_disable_sriov(struct hclge_dev *hdev)
-{
-	/* If our VFs are assigned we cannot shut down SR-IOV
-	 * without causing issues, so just leave the hardware
-	 * available but disabled
-	 */
-	if (pci_vfs_assigned(hdev->pdev)) {
-		dev_warn(&hdev->pdev->dev,
-			 "disabling driver while VFs are assigned\n");
-		return;
-	}
-
-	pci_disable_sriov(hdev->pdev);
-}
-
 struct hclge_vport *hclge_get_vport(struct hnae3_handle *handle)
 {
 	/* VF handle has no client */
@@ -5540,7 +5512,7 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
 	ret = hclge_map_tqp(hdev);
 	if (ret) {
 		dev_err(&pdev->dev, "Map tqp error, ret = %d.\n", ret);
-		goto err_sriov_disable;
+		goto err_msi_irq_uninit;
 	}
 
 	if (hdev->hw.mac.media_type == HNAE3_MEDIA_TYPE_COPPER) {
@@ -5548,7 +5520,7 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
 		if (ret) {
 			dev_err(&hdev->pdev->dev,
 				"mdio config fail ret=%d\n", ret);
-			goto err_sriov_disable;
+			goto err_msi_irq_uninit;
 		}
 	}
 
@@ -5612,9 +5584,6 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
 err_mdiobus_unreg:
 	if (hdev->hw.mac.phydev)
 		mdiobus_unregister(hdev->hw.mac.mdio_bus);
-err_sriov_disable:
-	if (IS_ENABLED(CONFIG_PCI_IOV))
-		hclge_disable_sriov(hdev);
 err_msi_irq_uninit:
 	hclge_misc_irq_uninit(hdev);
 err_msi_uninit:
@@ -5717,9 +5686,6 @@ static void hclge_uninit_ae_dev(struct hnae3_ae_dev *ae_dev)
 
 	set_bit(HCLGE_STATE_DOWN, &hdev->state);
 
-	if (IS_ENABLED(CONFIG_PCI_IOV))
-		hclge_disable_sriov(hdev);
-
 	if (hdev->service_timer.function)
 		del_timer_sync(&hdev->service_timer);
 	if (hdev->service_task.func)
-- 
cgit v1.2.3-59-g8ed1b


From 3e249d3bed677f18a8a9b87b2b9cf1beaeef886c Mon Sep 17 00:00:00 2001
From: Fuyun Liang <liangfuyun1@huawei.com>
Date: Tue, 15 May 2018 19:20:06 +0100
Subject: net: hns3: Fix for the null pointer problem occurring when
 initializing ae_dev failed

When initializing ae_dev failed during loading hclge.ko, the drvdata will
be set to null. When removing hns3.ko, we get a null ae_dev. It causes the
null pointer problem.

This patch removes pci_set_drvdata from error handle of hclge_init_ae_dev
to fix the bug, since pci_set_drvdata has been called in hns3_remove.
Also, we do not need to uninit the ae_dev which is not initialized. And
it may be the one which is initialized failed.

Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support")
Signed-off-by: Fuyun Liang <liangfuyun1@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hnae3.c               | 6 ++++++
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c   | 5 +----
 drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 6 ++----
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.c b/drivers/net/ethernet/hisilicon/hns3/hnae3.c
index 1686cebead96..ab2e72ccceb1 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.c
@@ -168,6 +168,9 @@ void hnae3_unregister_ae_algo(struct hnae3_ae_algo *ae_algo)
 	mutex_lock(&hnae3_common_lock);
 	/* Check if there are matched ae_dev */
 	list_for_each_entry(ae_dev, &hnae3_ae_dev_list, node) {
+		if (!hnae_get_bit(ae_dev->flag, HNAE3_DEV_INITED_B))
+			continue;
+
 		id = pci_match_id(ae_algo->pdev_id_table, ae_dev->pdev);
 		if (!id)
 			continue;
@@ -256,6 +259,9 @@ void hnae3_unregister_ae_dev(struct hnae3_ae_dev *ae_dev)
 	mutex_lock(&hnae3_common_lock);
 	/* Check if there are matched ae_algo */
 	list_for_each_entry(ae_algo, &hnae3_ae_algo_list, node) {
+		if (!hnae_get_bit(ae_dev->flag, HNAE3_DEV_INITED_B))
+			continue;
+
 		id = pci_match_id(ae_algo->pdev_id_table, ae_dev->pdev);
 		if (!id)
 			continue;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 343197a57240..c2501b1ec819 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -5379,7 +5379,7 @@ static int hclge_pci_init(struct hclge_dev *hdev)
 	ret = pci_enable_device(pdev);
 	if (ret) {
 		dev_err(&pdev->dev, "failed to enable PCI device\n");
-		goto err_no_drvdata;
+		return ret;
 	}
 
 	ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
@@ -5417,8 +5417,6 @@ err_clr_master:
 	pci_release_regions(pdev);
 err_disable_device:
 	pci_disable_device(pdev);
-err_no_drvdata:
-	pci_set_drvdata(pdev, NULL);
 
 	return ret;
 }
@@ -5594,7 +5592,6 @@ err_pci_uninit:
 	pci_clear_master(pdev);
 	pci_release_regions(pdev);
 	pci_disable_device(pdev);
-	pci_set_drvdata(pdev, NULL);
 out:
 	return ret;
 }
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
index 2dbffcef7109..b7578c695d1c 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
@@ -1563,7 +1563,7 @@ static int hclgevf_pci_init(struct hclgevf_dev *hdev)
 	ret = pci_enable_device(pdev);
 	if (ret) {
 		dev_err(&pdev->dev, "failed to enable PCI device\n");
-		goto err_no_drvdata;
+		return ret;
 	}
 
 	ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
@@ -1595,8 +1595,7 @@ err_clr_master:
 	pci_release_regions(pdev);
 err_disable_device:
 	pci_disable_device(pdev);
-err_no_drvdata:
-	pci_set_drvdata(pdev, NULL);
+
 	return ret;
 }
 
@@ -1608,7 +1607,6 @@ static void hclgevf_pci_uninit(struct hclgevf_dev *hdev)
 	pci_clear_master(pdev);
 	pci_release_regions(pdev);
 	pci_disable_device(pdev);
-	pci_set_drvdata(pdev, NULL);
 }
 
 static int hclgevf_init_hdev(struct hclgevf_dev *hdev)
-- 
cgit v1.2.3-59-g8ed1b


From e3afa96365c916319a1068ac50b838b2256cc6df Mon Sep 17 00:00:00 2001
From: Fuyun Liang <liangfuyun1@huawei.com>
Date: Tue, 15 May 2018 19:20:07 +0100
Subject: net: hns3: Add a check for client instance init state

If the client instance is initializd failed, we do not need to uninit it.
This patch adds a state check to check init state of client instance.

Fixes: 38caee9d3ee8 ("net: hns3: Add support of the HNAE3 framework")
Signed-off-by: Fuyun Liang <liangfuyun1@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hnae3.c | 15 ++++++++++++---
 drivers/net/ethernet/hisilicon/hns3/hnae3.h |  1 +
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.c b/drivers/net/ethernet/hisilicon/hns3/hnae3.c
index ab2e72ccceb1..21cb0c5e5b31 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.c
@@ -50,13 +50,22 @@ static int hnae3_match_n_instantiate(struct hnae3_client *client,
 	/* now, (un-)instantiate client by calling lower layer */
 	if (is_reg) {
 		ret = ae_dev->ops->init_client_instance(client, ae_dev);
-		if (ret)
+		if (ret) {
 			dev_err(&ae_dev->pdev->dev,
 				"fail to instantiate client\n");
-		return ret;
+			return ret;
+		}
+
+		hnae_set_bit(ae_dev->flag, HNAE3_CLIENT_INITED_B, 1);
+		return 0;
+	}
+
+	if (hnae_get_bit(ae_dev->flag, HNAE3_CLIENT_INITED_B)) {
+		ae_dev->ops->uninit_client_instance(client, ae_dev);
+
+		hnae_set_bit(ae_dev->flag, HNAE3_CLIENT_INITED_B, 0);
 	}
 
-	ae_dev->ops->uninit_client_instance(client, ae_dev);
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
index 804ea83e1713..ffec2318cf50 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -52,6 +52,7 @@
 #define HNAE3_DEV_INITED_B			0x0
 #define HNAE3_DEV_SUPPORT_ROCE_B		0x1
 #define HNAE3_DEV_SUPPORT_DCB_B			0x2
+#define HNAE3_CLIENT_INITED_B			0x3
 
 #define HNAE3_DEV_SUPPORT_ROCE_DCB_BITS (BIT(HNAE3_DEV_SUPPORT_DCB_B) |\
 		BIT(HNAE3_DEV_SUPPORT_ROCE_B))
-- 
cgit v1.2.3-59-g8ed1b


From 50fbc237b75720ca08d1fd9c6408cfe5c2217bbf Mon Sep 17 00:00:00 2001
From: Fuyun Liang <liangfuyun1@huawei.com>
Date: Tue, 15 May 2018 19:20:08 +0100
Subject: net: hns3: Change return type of hnae3_register_ae_dev

If hclge.ko has not been inserted, the value of ret always is zero
in hnae3_register_ae_dev. If hclge.ko has been inserted, the value
of ret is zero or non zero. Different execution ways have different
results. It is confusing.

The ae_dev which is initialized failed can be reinitialized when we
remove hclge.ko and insert it again. For the case initializing client
instance, it is just like the case initializing ae_dev. The main function
of hnae3_register_ae_dev is adding the ae_dev to ad_dev list. Because
adding ae_dev is always ok, we does not need to return any in this
function.

This patch changes the return type of hnae3_register_ae_dev from int
to void.

Signed-off-by: Fuyun Liang <liangfuyun1@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hnae3.c     | 5 +----
 drivers/net/ethernet/hisilicon/hns3/hnae3.h     | 2 +-
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 4 +---
 3 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.c b/drivers/net/ethernet/hisilicon/hns3/hnae3.c
index 21cb0c5e5b31..cb93295c9f62 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.c
@@ -203,7 +203,7 @@ EXPORT_SYMBOL(hnae3_unregister_ae_algo);
  * @ae_dev: the AE device
  * NOTE: the duplicated name will not be checked
  */
-int hnae3_register_ae_dev(struct hnae3_ae_dev *ae_dev)
+void hnae3_register_ae_dev(struct hnae3_ae_dev *ae_dev)
 {
 	const struct pci_device_id *id;
 	struct hnae3_ae_algo *ae_algo;
@@ -224,7 +224,6 @@ int hnae3_register_ae_dev(struct hnae3_ae_dev *ae_dev)
 
 		if (!ae_dev->ops) {
 			dev_err(&ae_dev->pdev->dev, "ae_dev ops are null\n");
-			ret = -EOPNOTSUPP;
 			goto out_err;
 		}
 
@@ -251,8 +250,6 @@ int hnae3_register_ae_dev(struct hnae3_ae_dev *ae_dev)
 
 out_err:
 	mutex_unlock(&hnae3_common_lock);
-
-	return ret;
 }
 EXPORT_SYMBOL(hnae3_register_ae_dev);
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
index ffec2318cf50..ea6e6ead28a1 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -515,7 +515,7 @@ struct hnae3_handle {
 #define hnae_get_bit(origin, shift) \
 	hnae_get_field((origin), (0x1 << (shift)), (shift))
 
-int hnae3_register_ae_dev(struct hnae3_ae_dev *ae_dev);
+void hnae3_register_ae_dev(struct hnae3_ae_dev *ae_dev);
 void hnae3_unregister_ae_dev(struct hnae3_ae_dev *ae_dev);
 
 void hnae3_unregister_ae_algo(struct hnae3_ae_algo *ae_algo);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index af9e90f87109..ac75b5d6a252 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -1576,9 +1576,7 @@ static int hns3_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	ae_dev->dev_type = HNAE3_DEV_KNIC;
 	pci_set_drvdata(pdev, ae_dev);
 
-	ret = hnae3_register_ae_dev(ae_dev);
-	if (ret)
-		return ret;
+	hnae3_register_ae_dev(ae_dev);
 
 	if (hns3_is_phys_func(pdev) && IS_ENABLED(CONFIG_PCI_IOV))
 		hns3_enable_sriov(pdev);
-- 
cgit v1.2.3-59-g8ed1b


From 854cf33a63106667fea7265b9c222566dbb5edc6 Mon Sep 17 00:00:00 2001
From: Fuyun Liang <liangfuyun1@huawei.com>
Date: Tue, 15 May 2018 19:20:09 +0100
Subject: net: hns3: Change return type of hnae3_register_ae_algo

The ae_algo is used by many ae_devs. It is not only belong to just a
ae_dev. Initializing ae_dev failed does not represent registering ae_algo
failed. Because the action of registering ae_algo just is adding ae_algo
to the ae_algo list and it is always is true, it make no sense to define
return type as int.

This patch changes the return type of hnae3_register_ae_algo from int to
void.

Signed-off-by: Fuyun Liang <liangfuyun1@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hnae3.c               | 4 +---
 drivers/net/ethernet/hisilicon/hns3/hnae3.h               | 2 +-
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c   | 4 +++-
 drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 4 +++-
 4 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.c b/drivers/net/ethernet/hisilicon/hns3/hnae3.c
index cb93295c9f62..3b1c3966b7d4 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.c
@@ -121,7 +121,7 @@ EXPORT_SYMBOL(hnae3_unregister_client);
  * @ae_algo: AE algorithm
  * NOTE: the duplicated name will not be checked
  */
-int hnae3_register_ae_algo(struct hnae3_ae_algo *ae_algo)
+void hnae3_register_ae_algo(struct hnae3_ae_algo *ae_algo)
 {
 	const struct pci_device_id *id;
 	struct hnae3_ae_dev *ae_dev;
@@ -160,8 +160,6 @@ int hnae3_register_ae_algo(struct hnae3_ae_algo *ae_algo)
 	}
 
 	mutex_unlock(&hnae3_common_lock);
-
-	return ret;
 }
 EXPORT_SYMBOL(hnae3_register_ae_algo);
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
index ea6e6ead28a1..2f266ef96582 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -519,7 +519,7 @@ void hnae3_register_ae_dev(struct hnae3_ae_dev *ae_dev);
 void hnae3_unregister_ae_dev(struct hnae3_ae_dev *ae_dev);
 
 void hnae3_unregister_ae_algo(struct hnae3_ae_algo *ae_algo);
-int hnae3_register_ae_algo(struct hnae3_ae_algo *ae_algo);
+void hnae3_register_ae_algo(struct hnae3_ae_algo *ae_algo);
 
 void hnae3_unregister_client(struct hnae3_client *client);
 int hnae3_register_client(struct hnae3_client *client);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index c2501b1ec819..d060903f47e0 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -6250,7 +6250,9 @@ static int hclge_init(void)
 {
 	pr_info("%s is initializing\n", HCLGE_NAME);
 
-	return hnae3_register_ae_algo(&ae_algo);
+	hnae3_register_ae_algo(&ae_algo);
+
+	return 0;
 }
 
 static void hclge_exit(void)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
index b7578c695d1c..f1f4a17340a5 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
@@ -1852,7 +1852,9 @@ static int hclgevf_init(void)
 {
 	pr_info("%s is initializing\n", HCLGEVF_NAME);
 
-	return hnae3_register_ae_algo(&ae_algovf);
+	hnae3_register_ae_algo(&ae_algovf);
+
+	return 0;
 }
 
 static void hclgevf_exit(void)
-- 
cgit v1.2.3-59-g8ed1b


From 0c698257c7befa8d1ec1b8d767758c3d73a2686a Mon Sep 17 00:00:00 2001
From: Fuyun Liang <liangfuyun1@huawei.com>
Date: Tue, 15 May 2018 19:20:10 +0100
Subject: net: hns3: Change return value in hnae3_register_client

A client includes many client instance. Just like ae_algo, Initializing
client instance failed does not represent registering client failed.
The action of registering client just is adding client to the client
list and the result always is true. This patch changes the return
value of hnae3_register_client form a variable value to a fixed value,
makes the function always return ok.

Signed-off-by: Fuyun Liang <liangfuyun1@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hnae3.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.c b/drivers/net/ethernet/hisilicon/hns3/hnae3.c
index 3b1c3966b7d4..bd3c232234d7 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.c
@@ -98,7 +98,7 @@ int hnae3_register_client(struct hnae3_client *client)
 exit:
 	mutex_unlock(&hnae3_common_lock);
 
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL(hnae3_register_client);
 
-- 
cgit v1.2.3-59-g8ed1b


From 67bf2541f4b9a091e928d75eca544bca4c5db142 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Tue, 15 May 2018 19:20:11 +0100
Subject: net: hns3: Fixes the back pressure setting when sriov is enabled

When sriov is enabled, the Qset and tc mapping is not longer one
to one relation.

This patch fixes it by mapping all pf and vf's Qset to tc.

Fixes: 848440544b41 ("net: hns3: Add support of TX Scheduler & Shaper to HNS3 driver")
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c  | 45 +++++++++++++++++++---
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h  |  5 +++
 2 files changed, 45 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
index c69ecab460f9..262c125f8137 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
@@ -500,7 +500,8 @@ static int hclge_tm_qs_schd_mode_cfg(struct hclge_dev *hdev, u16 qs_id, u8 mode)
 	return hclge_cmd_send(&hdev->hw, &desc, 1);
 }
 
-static int hclge_tm_qs_bp_cfg(struct hclge_dev *hdev, u8 tc)
+static int hclge_tm_qs_bp_cfg(struct hclge_dev *hdev, u8 tc, u8 grp_id,
+			      u32 bit_map)
 {
 	struct hclge_bp_to_qs_map_cmd *bp_to_qs_map_cmd;
 	struct hclge_desc desc;
@@ -511,9 +512,8 @@ static int hclge_tm_qs_bp_cfg(struct hclge_dev *hdev, u8 tc)
 	bp_to_qs_map_cmd = (struct hclge_bp_to_qs_map_cmd *)desc.data;
 
 	bp_to_qs_map_cmd->tc_id = tc;
-
-	/* Qset and tc is one by one mapping */
-	bp_to_qs_map_cmd->qs_bit_map = cpu_to_le32(1 << tc);
+	bp_to_qs_map_cmd->qs_group_id = grp_id;
+	bp_to_qs_map_cmd->qs_bit_map = cpu_to_le32(bit_map);
 
 	return hclge_cmd_send(&hdev->hw, &desc, 1);
 }
@@ -1167,6 +1167,41 @@ static int hclge_pfc_setup_hw(struct hclge_dev *hdev)
 				      hdev->tm_info.hw_pfc_map);
 }
 
+/* Each Tc has a 1024 queue sets to backpress, it divides to
+ * 32 group, each group contains 32 queue sets, which can be
+ * represented by u32 bitmap.
+ */
+static int hclge_bp_setup_hw(struct hclge_dev *hdev, u8 tc)
+{
+	struct hclge_vport *vport = hdev->vport;
+	u32 i, k, qs_bitmap;
+	int ret;
+
+	for (i = 0; i < HCLGE_BP_GRP_NUM; i++) {
+		qs_bitmap = 0;
+
+		for (k = 0; k < hdev->num_alloc_vport; k++) {
+			u16 qs_id = vport->qs_offset + tc;
+			u8 grp, sub_grp;
+
+			grp = hnae_get_field(qs_id, HCLGE_BP_GRP_ID_M,
+					     HCLGE_BP_GRP_ID_S);
+			sub_grp = hnae_get_field(qs_id, HCLGE_BP_SUB_GRP_ID_M,
+						 HCLGE_BP_SUB_GRP_ID_S);
+			if (i == grp)
+				qs_bitmap |= (1 << sub_grp);
+
+			vport++;
+		}
+
+		ret = hclge_tm_qs_bp_cfg(hdev, tc, i, qs_bitmap);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
 static int hclge_mac_pause_setup_hw(struct hclge_dev *hdev)
 {
 	bool tx_en, rx_en;
@@ -1218,7 +1253,7 @@ int hclge_pause_setup_hw(struct hclge_dev *hdev)
 		dev_warn(&hdev->pdev->dev, "set pfc pause failed:%d\n", ret);
 
 	for (i = 0; i < hdev->tm_info.num_tc; i++) {
-		ret = hclge_tm_qs_bp_cfg(hdev, i);
+		ret = hclge_bp_setup_hw(hdev, i);
 		if (ret)
 			return ret;
 	}
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h
index 2dbe177581e9..c2b6e8a6700f 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.h
@@ -89,6 +89,11 @@ struct hclge_pg_shapping_cmd {
 	__le32 pg_shapping_para;
 };
 
+#define HCLGE_BP_GRP_NUM		32
+#define HCLGE_BP_SUB_GRP_ID_S		0
+#define HCLGE_BP_SUB_GRP_ID_M		GENMASK(4, 0)
+#define HCLGE_BP_GRP_ID_S		5
+#define HCLGE_BP_GRP_ID_M		GENMASK(9, 5)
 struct hclge_bp_to_qs_map_cmd {
 	u8 tc_id;
 	u8 rsvd[2];
-- 
cgit v1.2.3-59-g8ed1b


From be8d8cdb8ebf3afd841c109dd035fd789a0c7d53 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Tue, 15 May 2018 19:20:12 +0100
Subject: net: hns3: Fix for fiber link up problem

When hclge_ae_start is called, hdev->hw.mac.link may be set
to one after up/down multi-times, which does not correspond to
the link state of netdev when the netdev is up.

This fixes it by setting hdev->hw.mac.link to zero when
hclge_ae_start is called.

Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support")
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index d060903f47e0..75d9b8c5793a 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -3756,6 +3756,7 @@ static int hclge_ae_start(struct hnae3_handle *handle)
 	hclge_cfg_mac_mode(hdev, true);
 	clear_bit(HCLGE_STATE_DOWN, &hdev->state);
 	mod_timer(&hdev->service_timer, jiffies + HZ);
+	hdev->hw.mac.link = 0;
 
 	/* reset tqp stats */
 	hclge_reset_tqp_stats(handle);
@@ -3792,7 +3793,6 @@ static void hclge_ae_stop(struct hnae3_handle *handle)
 
 	/* reset tqp stats */
 	hclge_reset_tqp_stats(handle);
-	hclge_update_link_status(hdev);
 }
 
 static int hclge_get_mac_vlan_cmd_status(struct hclge_vport *vport,
-- 
cgit v1.2.3-59-g8ed1b


From fa8d82e853e84de998e91e95d7a09aa04a7f79ee Mon Sep 17 00:00:00 2001
From: Peng Li <lipeng321@huawei.com>
Date: Tue, 15 May 2018 19:20:13 +0100
Subject: net: hns3: Add support of .sriov_configure in HNS3 driver

As HNS3 driver will enable SRIOV default and enable all VFs the
HW support, if PF and VF driver compiled to kernel, VF driver
will work on host default, it is not right.

This patch adds support for hns3_driver.sriov_configure to support
user configs the VF_num, and do not enable sriov default.

Signed-off-by: Peng Li <lipeng321@huawei.com>
Suggested-by: Zhou Wang <wangzhou1@hisilicon.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 58 ++++++++++++++-----------
 1 file changed, 32 insertions(+), 26 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index ac75b5d6a252..e85ff3805c96 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -1511,29 +1511,6 @@ static bool hns3_is_phys_func(struct pci_dev *pdev)
 	return false;
 }
 
-static int get_num_req_vfs(struct pci_dev *pdev)
-{
-	/* a variable vf num will be supported later */
-	return pci_sriov_get_totalvfs(pdev);
-}
-
-static void hns3_enable_sriov(struct pci_dev *pdev)
-{
-	int num_req_vfs = get_num_req_vfs(pdev);
-	int ret;
-
-	/* Enable SRIOV */
-	if (!num_req_vfs)
-		return;
-
-	dev_info(&pdev->dev, "active VFs(%d) found, enabling SRIOV\n",
-		 num_req_vfs);
-
-	ret = pci_enable_sriov(pdev, num_req_vfs);
-	if (ret)
-		dev_err(&pdev->dev, "SRIOV enable failed %d\n", ret);
-}
-
 static void hns3_disable_sriov(struct pci_dev *pdev)
 {
 	/* If our VFs are assigned we cannot shut down SR-IOV
@@ -1578,9 +1555,6 @@ static int hns3_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	hnae3_register_ae_dev(ae_dev);
 
-	if (hns3_is_phys_func(pdev) && IS_ENABLED(CONFIG_PCI_IOV))
-		hns3_enable_sriov(pdev);
-
 	return 0;
 }
 
@@ -1597,11 +1571,43 @@ static void hns3_remove(struct pci_dev *pdev)
 	hnae3_unregister_ae_dev(ae_dev);
 }
 
+/**
+ * hns3_pci_sriov_configure
+ * @pdev: pointer to a pci_dev structure
+ * @num_vfs: number of VFs to allocate
+ *
+ * Enable or change the number of VFs. Called when the user updates the number
+ * of VFs in sysfs.
+ **/
+int hns3_pci_sriov_configure(struct pci_dev *pdev, int num_vfs)
+{
+	int ret;
+
+	if (!(hns3_is_phys_func(pdev) && IS_ENABLED(CONFIG_PCI_IOV))) {
+		dev_warn(&pdev->dev, "Can not config SRIOV\n");
+		return -EINVAL;
+	}
+
+	if (num_vfs) {
+		ret = pci_enable_sriov(pdev, num_vfs);
+		if (ret)
+			dev_err(&pdev->dev, "SRIOV enable failed %d\n", ret);
+	} else if (!pci_vfs_assigned(pdev)) {
+		pci_disable_sriov(pdev);
+	} else {
+		dev_warn(&pdev->dev,
+			 "Unable to free VFs because some are assigned to VMs.\n");
+	}
+
+	return 0;
+}
+
 static struct pci_driver hns3_driver = {
 	.name     = hns3_driver_name,
 	.id_table = hns3_pci_tbl,
 	.probe    = hns3_probe,
 	.remove   = hns3_remove,
+	.sriov_configure = hns3_pci_sriov_configure,
 };
 
 /* set default feature to hns3 */
-- 
cgit v1.2.3-59-g8ed1b


From 6a814413eb9eb76d61de154d4f7b46e2a1558d7a Mon Sep 17 00:00:00 2001
From: Fuyun Liang <liangfuyun1@huawei.com>
Date: Tue, 15 May 2018 19:20:14 +0100
Subject: net: hns3: Fixes the missing PCI iounmap for various legs

We call pcim_iomap in hclge_pci_init, pcim_iounmap should be called
in error handle of hclge_init_ae_dev.

We call pcim_iomap in hclge_pci_init, but do not call pcim_iounmap in
hclge_pci_uninit. When we remove the hclge.ko and insert it again, a
problem that pci can not map will happen. pcim_iounmap need to be called
in hclge_pci_uninit.

Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support")
Signed-off-by: Fuyun Liang <liangfuyun1@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 75d9b8c5793a..46435c85b160 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -5425,6 +5425,7 @@ static void hclge_pci_uninit(struct hclge_dev *hdev)
 {
 	struct pci_dev *pdev = hdev->pdev;
 
+	pcim_iounmap(pdev, hdev->hw.io_base);
 	pci_free_irq_vectors(pdev);
 	pci_clear_master(pdev);
 	pci_release_mem_regions(pdev);
@@ -5589,6 +5590,7 @@ err_msi_uninit:
 err_cmd_uninit:
 	hclge_destroy_cmd_queue(&hdev->hw);
 err_pci_uninit:
+	pcim_iounmap(pdev, hdev->hw.io_base);
 	pci_clear_master(pdev);
 	pci_release_regions(pdev);
 	pci_disable_device(pdev);
-- 
cgit v1.2.3-59-g8ed1b


From 569bc643656826d5305aebdc5d5500c99881b2e5 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Tue, 15 May 2018 17:04:54 +0200
Subject: net/smc: no tx work trigger for fallback sockets

If TCP_NODELAY is set or TCP_CORK is reset, setsockopt triggers the
tx worker. This does not make sense, if the SMC socket switched to
the TCP fallback when the connection is created. This patch adds
the additional check for the fallback case.

Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 17688a02035b..83403be46a4a 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1353,14 +1353,14 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
 		break;
 	case TCP_NODELAY:
 		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
-			if (val)
+			if (val && !smc->use_fallback)
 				mod_delayed_work(system_wq, &smc->conn.tx_work,
 						 0);
 		}
 		break;
 	case TCP_CORK:
 		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
-			if (!val)
+			if (!val && !smc->use_fallback)
 				mod_delayed_work(system_wq, &smc->conn.tx_work,
 						 0);
 		}
-- 
cgit v1.2.3-59-g8ed1b


From 44aa81ce953aadd683765e6416ad53cc61d0624a Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Tue, 15 May 2018 17:04:55 +0200
Subject: net/smc: register new rmbs with the peer

Register new rmb buffers with the remote peer by exchanging a
confirm_rkey llc message.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c   | 20 ++++++++++++++------
 net/smc/smc_core.c |  1 +
 net/smc/smc_core.h |  2 ++
 net/smc/smc_llc.c  | 47 +++++++++++++++++++++++++++++++++++++++++++++--
 net/smc/smc_llc.h  |  2 ++
 5 files changed, 64 insertions(+), 8 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 83403be46a4a..397ba2182453 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -293,14 +293,22 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
 	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
 }
 
-/* register a new rmb */
-static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc)
+/* register a new rmb, optionally send confirm_rkey msg to register with peer */
+static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc,
+		       bool conf_rkey)
 {
 	/* register memory region for new rmb */
 	if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
 		rmb_desc->regerr = 1;
 		return -EFAULT;
 	}
+	if (!conf_rkey)
+		return 0;
+	/* exchange confirm_rkey msg with peer */
+	if (smc_llc_do_confirm_rkey(link, rmb_desc)) {
+		rmb_desc->regerr = 1;
+		return -EFAULT;
+	}
 	return 0;
 }
 
@@ -334,7 +342,7 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
 
 	smc_wr_remember_qp_attr(link);
 
-	if (smc_reg_rmb(link, smc->conn.rmb_desc))
+	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
 		return SMC_CLC_DECL_INTERR;
 
 	/* send CONFIRM LINK response over RoCE fabric */
@@ -488,7 +496,7 @@ static int smc_connect_rdma(struct smc_sock *smc)
 		}
 	} else {
 		if (!smc->conn.rmb_desc->reused) {
-			if (smc_reg_rmb(link, smc->conn.rmb_desc)) {
+			if (smc_reg_rmb(link, smc->conn.rmb_desc, true)) {
 				reason_code = SMC_CLC_DECL_INTERR;
 				goto decline_rdma_unlock;
 			}
@@ -729,7 +737,7 @@ static int smc_serv_conf_first_link(struct smc_sock *smc)
 
 	link = &lgr->lnk[SMC_SINGLE_LINK];
 
-	if (smc_reg_rmb(link, smc->conn.rmb_desc))
+	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
 		return SMC_CLC_DECL_INTERR;
 
 	/* send CONFIRM LINK request to client over the RoCE fabric */
@@ -866,7 +874,7 @@ static void smc_listen_work(struct work_struct *work)
 
 	if (local_contact != SMC_FIRST_CONTACT) {
 		if (!new_smc->conn.rmb_desc->reused) {
-			if (smc_reg_rmb(link, new_smc->conn.rmb_desc)) {
+			if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true)) {
 				reason_code = SMC_CLC_DECL_INTERR;
 				goto decline_rdma_unlock;
 			}
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 9c74844e2008..29ae077b0ec9 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -210,6 +210,7 @@ static int smc_lgr_create(struct smc_sock *smc,
 	init_completion(&lnk->llc_confirm_resp);
 	init_completion(&lnk->llc_add);
 	init_completion(&lnk->llc_add_resp);
+	init_completion(&lnk->llc_confirm_rkey);
 
 	smc->conn.lgr = lgr;
 	rwlock_init(&lgr->conns_lock);
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index fca8624e5e71..9398e235d74b 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -105,6 +105,8 @@ struct smc_link {
 	struct delayed_work	llc_testlink_wrk; /* testlink worker */
 	struct completion	llc_testlink_resp; /* wait for rx of testlink */
 	int			llc_testlink_time; /* testlink interval */
+	struct completion	llc_confirm_rkey; /* wait 4 rx of cnf rkey */
+	int			llc_confirm_rkey_rc; /* rc from cnf rkey msg */
 };
 
 /* For now we just allow one parallel link per link group. The SMC protocol
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index 33b4d856f4c6..b118f80b9c41 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -214,6 +214,31 @@ int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[],
 	return rc;
 }
 
+/* send LLC confirm rkey request */
+static int smc_llc_send_confirm_rkey(struct smc_link *link,
+				     struct smc_buf_desc *rmb_desc)
+{
+	struct smc_llc_msg_confirm_rkey *rkeyllc;
+	struct smc_wr_tx_pend_priv *pend;
+	struct smc_wr_buf *wr_buf;
+	int rc;
+
+	rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+	if (rc)
+		return rc;
+	rkeyllc = (struct smc_llc_msg_confirm_rkey *)wr_buf;
+	memset(rkeyllc, 0, sizeof(*rkeyllc));
+	rkeyllc->hd.common.type = SMC_LLC_CONFIRM_RKEY;
+	rkeyllc->hd.length = sizeof(struct smc_llc_msg_confirm_rkey);
+	rkeyllc->rtoken[0].rmb_key =
+		htonl(rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
+	rkeyllc->rtoken[0].rmb_vaddr = cpu_to_be64(
+		(u64)sg_dma_address(rmb_desc->sgt[SMC_SINGLE_LINK].sgl));
+	/* send llc message */
+	rc = smc_wr_tx_send(link, pend);
+	return rc;
+}
+
 /* send ADD LINK request or response */
 int smc_llc_send_add_link(struct smc_link *link, u8 mac[],
 			  union ib_gid *gid,
@@ -413,7 +438,9 @@ static void smc_llc_rx_confirm_rkey(struct smc_link *link,
 	lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
 
 	if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
-		/* unused as long as we don't send this type of msg */
+		link->llc_confirm_rkey_rc = llc->hd.flags &
+					    SMC_LLC_FLAG_RKEY_NEG;
+		complete(&link->llc_confirm_rkey);
 	} else {
 		rc = smc_rtoken_add(lgr,
 				    llc->rtoken[0].rmb_vaddr,
@@ -503,7 +530,7 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf)
 	}
 }
 
-/***************************** worker ****************************************/
+/***************************** worker, utils *********************************/
 
 static void smc_llc_testlink_work(struct work_struct *work)
 {
@@ -562,6 +589,22 @@ void smc_llc_link_flush(struct smc_link *link)
 	cancel_delayed_work_sync(&link->llc_testlink_wrk);
 }
 
+/* register a new rtoken at the remote peer */
+int smc_llc_do_confirm_rkey(struct smc_link *link,
+			    struct smc_buf_desc *rmb_desc)
+{
+	int rc;
+
+	reinit_completion(&link->llc_confirm_rkey);
+	smc_llc_send_confirm_rkey(link, rmb_desc);
+	/* receive CONFIRM RKEY response from server over RoCE fabric */
+	rc = wait_for_completion_interruptible_timeout(&link->llc_confirm_rkey,
+						       SMC_LLC_WAIT_TIME);
+	if (rc <= 0 || link->llc_confirm_rkey_rc)
+		return -EFAULT;
+	return 0;
+}
+
 /***************************** init, exit, misc ******************************/
 
 static struct smc_wr_rx_handler smc_llc_rx_handlers[] = {
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
index d6e42116485e..728823dd8ae5 100644
--- a/net/smc/smc_llc.h
+++ b/net/smc/smc_llc.h
@@ -47,6 +47,8 @@ int smc_llc_send_test_link(struct smc_link *lnk, u8 user_data[16],
 void smc_llc_link_active(struct smc_link *link, int testlink_time);
 void smc_llc_link_inactive(struct smc_link *link);
 void smc_llc_link_flush(struct smc_link *link);
+int smc_llc_do_confirm_rkey(struct smc_link *link,
+			    struct smc_buf_desc *rmb_desc);
 int smc_llc_init(void) __init;
 
 #endif /* SMC_LLC_H */
-- 
cgit v1.2.3-59-g8ed1b


From 9fcdf8e983b5c8367064067cd8e74b6cfb90eef7 Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Tue, 15 May 2018 17:04:56 +0200
Subject: net/smc: remove unnecessary cast

Remove an unneeded (void *) cast from the calls to
smc_llc_send_message(). No functional changes.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_llc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index b118f80b9c41..29f59b47b559 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -451,7 +451,7 @@ static void smc_llc_rx_confirm_rkey(struct smc_link *link,
 		llc->hd.flags |= SMC_LLC_FLAG_RESP;
 		if (rc < 0)
 			llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG;
-		smc_llc_send_message(link, (void *)llc, sizeof(*llc));
+		smc_llc_send_message(link, llc, sizeof(*llc));
 	}
 }
 
@@ -463,7 +463,7 @@ static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link,
 	} else {
 		/* ignore rtokens for other links, we have only one link */
 		llc->hd.flags |= SMC_LLC_FLAG_RESP;
-		smc_llc_send_message(link, (void *)llc, sizeof(*llc));
+		smc_llc_send_message(link, llc, sizeof(*llc));
 	}
 }
 
@@ -491,7 +491,7 @@ static void smc_llc_rx_delete_rkey(struct smc_link *link,
 		}
 
 		llc->hd.flags |= SMC_LLC_FLAG_RESP;
-		smc_llc_send_message(link, (void *)llc, sizeof(*llc));
+		smc_llc_send_message(link, llc, sizeof(*llc));
 	}
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From d97935faee4ba2cc37676afd955e9f519a5affbf Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Tue, 15 May 2018 17:04:57 +0200
Subject: net/smc: simplify test_link function usage

Make smc_llc_send_test_link() static and remove it from the header file.
And to send a test_link response set the response flag and send the
message back as-is, without using smc_llc_send_test_link(). And because
smc_llc_send_test_link() must no longer send responses, remove the
response flag handling from the function.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_llc.c | 12 +++++-------
 net/smc/smc_llc.h |  2 --
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index 29f59b47b559..229e967c7940 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -296,9 +296,8 @@ int smc_llc_send_delete_link(struct smc_link *link,
 	return rc;
 }
 
-/* send LLC test link request or response */
-int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16],
-			   enum smc_llc_reqresp reqresp)
+/* send LLC test link request */
+static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16])
 {
 	struct smc_llc_msg_test_link *testllc;
 	struct smc_wr_tx_pend_priv *pend;
@@ -312,8 +311,6 @@ int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16],
 	memset(testllc, 0, sizeof(*testllc));
 	testllc->hd.common.type = SMC_LLC_TEST_LINK;
 	testllc->hd.length = sizeof(struct smc_llc_msg_test_link);
-	if (reqresp == SMC_LLC_RESP)
-		testllc->hd.flags |= SMC_LLC_FLAG_RESP;
 	memcpy(testllc->user_data, user_data, sizeof(testllc->user_data));
 	/* send llc message */
 	rc = smc_wr_tx_send(link, pend);
@@ -425,7 +422,8 @@ static void smc_llc_rx_test_link(struct smc_link *link,
 		if (link->state == SMC_LNK_ACTIVE)
 			complete(&link->llc_testlink_resp);
 	} else {
-		smc_llc_send_test_link(link, llc->user_data, SMC_LLC_RESP);
+		llc->hd.flags |= SMC_LLC_FLAG_RESP;
+		smc_llc_send_message(link, llc, sizeof(*llc));
 	}
 }
 
@@ -551,7 +549,7 @@ static void smc_llc_testlink_work(struct work_struct *work)
 		goto out;
 	}
 	reinit_completion(&link->llc_testlink_resp);
-	smc_llc_send_test_link(link, user_data, SMC_LLC_REQ);
+	smc_llc_send_test_link(link, user_data);
 	/* receive TEST LINK response over RoCE fabric */
 	rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp,
 						       SMC_LLC_WAIT_TIME);
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
index 728823dd8ae5..fb137f24aa6b 100644
--- a/net/smc/smc_llc.h
+++ b/net/smc/smc_llc.h
@@ -42,8 +42,6 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], union ib_gid *gid,
 			  enum smc_llc_reqresp reqresp);
 int smc_llc_send_delete_link(struct smc_link *link,
 			     enum smc_llc_reqresp reqresp);
-int smc_llc_send_test_link(struct smc_link *lnk, u8 user_data[16],
-			   enum smc_llc_reqresp reqresp);
 void smc_llc_link_active(struct smc_link *link, int testlink_time);
 void smc_llc_link_inactive(struct smc_link *link);
 void smc_llc_link_flush(struct smc_link *link);
-- 
cgit v1.2.3-59-g8ed1b


From b32cf4ab68c06619bacefc7157857a08e7fee695 Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Tue, 15 May 2018 17:04:58 +0200
Subject: net/smc: move link llc initialization to llc layer

Move the llc layer specific initialization and cleanup out of smc_core.c
into smc_llc.c (smc_llc_link_init and smc_llc_link_clear). Move all
initialization of a link into the new init function.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_core.c |  6 +-----
 net/smc/smc_llc.c  | 11 ++++++++++-
 net/smc/smc_llc.h  |  1 +
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 29ae077b0ec9..ba7dae819ac8 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -194,6 +194,7 @@ static int smc_lgr_create(struct smc_sock *smc,
 		smc_ib_setup_per_ibdev(smcibdev);
 	get_random_bytes(rndvec, sizeof(rndvec));
 	lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16);
+	smc_llc_link_init(lnk);
 	rc = smc_wr_alloc_link_mem(lnk);
 	if (rc)
 		goto free_lgr;
@@ -206,11 +207,6 @@ static int smc_lgr_create(struct smc_sock *smc,
 	rc = smc_wr_create_link(lnk);
 	if (rc)
 		goto destroy_qp;
-	init_completion(&lnk->llc_confirm);
-	init_completion(&lnk->llc_confirm_resp);
-	init_completion(&lnk->llc_add);
-	init_completion(&lnk->llc_add_resp);
-	init_completion(&lnk->llc_confirm_rkey);
 
 	smc->conn.lgr = lgr;
 	rwlock_init(&lgr->conns_lock);
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index 229e967c7940..5dbeced6cea5 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -562,10 +562,19 @@ out:
 	schedule_delayed_work(&link->llc_testlink_wrk, next_interval);
 }
 
-void smc_llc_link_active(struct smc_link *link, int testlink_time)
+void smc_llc_link_init(struct smc_link *link)
 {
+	init_completion(&link->llc_confirm);
+	init_completion(&link->llc_confirm_resp);
+	init_completion(&link->llc_add);
+	init_completion(&link->llc_add_resp);
+	init_completion(&link->llc_confirm_rkey);
 	init_completion(&link->llc_testlink_resp);
 	INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work);
+}
+
+void smc_llc_link_active(struct smc_link *link, int testlink_time)
+{
 	link->state = SMC_LNK_ACTIVE;
 	if (testlink_time) {
 		link->llc_testlink_time = testlink_time * HZ;
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
index fb137f24aa6b..d7f52a60c8d6 100644
--- a/net/smc/smc_llc.h
+++ b/net/smc/smc_llc.h
@@ -42,6 +42,7 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], union ib_gid *gid,
 			  enum smc_llc_reqresp reqresp);
 int smc_llc_send_delete_link(struct smc_link *link,
 			     enum smc_llc_reqresp reqresp);
+void smc_llc_link_init(struct smc_link *link);
 void smc_llc_link_active(struct smc_link *link, int testlink_time);
 void smc_llc_link_inactive(struct smc_link *link);
 void smc_llc_link_flush(struct smc_link *link);
-- 
cgit v1.2.3-59-g8ed1b


From 2a4c57a9e731fa7bd20b028ca078a0ab0ec2485e Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Tue, 15 May 2018 17:04:59 +0200
Subject: net/smc: use a workqueue to defer llc send

SMC handles deferred work in tasklets. As tasklets cannot sleep this
can result in rare EBUSY conditions, so defer this work in a work queue.
The high level api functions do not defer work because they can sleep
until the llc send is actually completed.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_core.c |  10 ++--
 net/smc/smc_core.h |   1 +
 net/smc/smc_llc.c  | 132 ++++++++++++++++++++++++++++++++++++++---------------
 net/smc/smc_llc.h  |   4 +-
 4 files changed, 104 insertions(+), 43 deletions(-)

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index ba7dae819ac8..4523bbad8a15 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -194,10 +194,12 @@ static int smc_lgr_create(struct smc_sock *smc,
 		smc_ib_setup_per_ibdev(smcibdev);
 	get_random_bytes(rndvec, sizeof(rndvec));
 	lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16);
-	smc_llc_link_init(lnk);
-	rc = smc_wr_alloc_link_mem(lnk);
+	rc = smc_llc_link_init(lnk);
 	if (rc)
 		goto free_lgr;
+	rc = smc_wr_alloc_link_mem(lnk);
+	if (rc)
+		goto clear_llc_lnk;
 	rc = smc_ib_create_protection_domain(lnk);
 	if (rc)
 		goto free_link_mem;
@@ -221,6 +223,8 @@ dealloc_pd:
 	smc_ib_dealloc_protection_domain(lnk);
 free_link_mem:
 	smc_wr_free_link_mem(lnk);
+clear_llc_lnk:
+	smc_llc_link_clear(lnk);
 free_lgr:
 	kfree(lgr);
 out:
@@ -266,6 +270,7 @@ void smc_conn_free(struct smc_connection *conn)
 static void smc_link_clear(struct smc_link *lnk)
 {
 	lnk->peer_qpn = 0;
+	smc_llc_link_clear(lnk);
 	smc_ib_modify_qp_reset(lnk);
 	smc_wr_free_link(lnk);
 	smc_ib_destroy_queue_pair(lnk);
@@ -323,7 +328,6 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr)
 /* remove a link group */
 void smc_lgr_free(struct smc_link_group *lgr)
 {
-	smc_llc_link_flush(&lgr->lnk[SMC_SINGLE_LINK]);
 	smc_lgr_free_bufs(lgr);
 	smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
 	kfree(lgr);
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 9398e235d74b..d1753850684b 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -96,6 +96,7 @@ struct smc_link {
 	u8			link_id;	/* unique # within link group */
 
 	enum smc_link_state	state;		/* state of link */
+	struct workqueue_struct *llc_wq;	/* single thread work queue */
 	struct completion	llc_confirm;	/* wait for rx of conf link */
 	struct completion	llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */
 	int			llc_confirm_rc; /* rc from confirm link msg */
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index 5dbeced6cea5..5d7926cc472c 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -239,6 +239,25 @@ static int smc_llc_send_confirm_rkey(struct smc_link *link,
 	return rc;
 }
 
+/* prepare an add link message */
+static void smc_llc_prep_add_link(struct smc_llc_msg_add_link *addllc,
+				  struct smc_link *link, u8 mac[],
+				  union ib_gid *gid,
+				  enum smc_llc_reqresp reqresp)
+{
+	memset(addllc, 0, sizeof(*addllc));
+	addllc->hd.common.type = SMC_LLC_ADD_LINK;
+	addllc->hd.length = sizeof(struct smc_llc_msg_add_link);
+	if (reqresp == SMC_LLC_RESP) {
+		addllc->hd.flags |= SMC_LLC_FLAG_RESP;
+		/* always reject more links for now */
+		addllc->hd.flags |= SMC_LLC_FLAG_ADD_LNK_REJ;
+		addllc->hd.add_link_rej_rsn = SMC_LLC_REJ_RSN_NO_ALT_PATH;
+	}
+	memcpy(addllc->sender_mac, mac, ETH_ALEN);
+	memcpy(addllc->sender_gid, gid, SMC_GID_SIZE);
+}
+
 /* send ADD LINK request or response */
 int smc_llc_send_add_link(struct smc_link *link, u8 mac[],
 			  union ib_gid *gid,
@@ -253,22 +272,28 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[],
 	if (rc)
 		return rc;
 	addllc = (struct smc_llc_msg_add_link *)wr_buf;
-	memset(addllc, 0, sizeof(*addllc));
-	addllc->hd.common.type = SMC_LLC_ADD_LINK;
-	addllc->hd.length = sizeof(struct smc_llc_msg_add_link);
-	if (reqresp == SMC_LLC_RESP) {
-		addllc->hd.flags |= SMC_LLC_FLAG_RESP;
-		/* always reject more links for now */
-		addllc->hd.flags |= SMC_LLC_FLAG_ADD_LNK_REJ;
-		addllc->hd.add_link_rej_rsn = SMC_LLC_REJ_RSN_NO_ALT_PATH;
-	}
-	memcpy(addllc->sender_mac, mac, ETH_ALEN);
-	memcpy(addllc->sender_gid, gid, SMC_GID_SIZE);
+	smc_llc_prep_add_link(addllc, link, mac, gid, reqresp);
 	/* send llc message */
 	rc = smc_wr_tx_send(link, pend);
 	return rc;
 }
 
+/* prepare a delete link message */
+static void smc_llc_prep_delete_link(struct smc_llc_msg_del_link *delllc,
+				     struct smc_link *link,
+				     enum smc_llc_reqresp reqresp)
+{
+	memset(delllc, 0, sizeof(*delllc));
+	delllc->hd.common.type = SMC_LLC_DELETE_LINK;
+	delllc->hd.length = sizeof(struct smc_llc_msg_add_link);
+	if (reqresp == SMC_LLC_RESP)
+		delllc->hd.flags |= SMC_LLC_FLAG_RESP;
+	/* DEL_LINK_ALL because only 1 link supported */
+	delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL;
+	delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY;
+	delllc->link_num = link->link_id;
+}
+
 /* send DELETE LINK request or response */
 int smc_llc_send_delete_link(struct smc_link *link,
 			     enum smc_llc_reqresp reqresp)
@@ -282,15 +307,7 @@ int smc_llc_send_delete_link(struct smc_link *link,
 	if (rc)
 		return rc;
 	delllc = (struct smc_llc_msg_del_link *)wr_buf;
-	memset(delllc, 0, sizeof(*delllc));
-	delllc->hd.common.type = SMC_LLC_DELETE_LINK;
-	delllc->hd.length = sizeof(struct smc_llc_msg_add_link);
-	if (reqresp == SMC_LLC_RESP)
-		delllc->hd.flags |= SMC_LLC_FLAG_RESP;
-	/* DEL_LINK_ALL because only 1 link supported */
-	delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL;
-	delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY;
-	delllc->link_num = link->link_id;
+	smc_llc_prep_delete_link(delllc, link, reqresp);
 	/* send llc message */
 	rc = smc_wr_tx_send(link, pend);
 	return rc;
@@ -317,20 +334,46 @@ static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16])
 	return rc;
 }
 
-/* send a prepared message */
-static int smc_llc_send_message(struct smc_link *link, void *llcbuf, int llclen)
+struct smc_llc_send_work {
+	struct work_struct work;
+	struct smc_link *link;
+	int llclen;
+	union smc_llc_msg llcbuf;
+};
+
+/* worker that sends a prepared message */
+static void smc_llc_send_message_work(struct work_struct *work)
 {
+	struct smc_llc_send_work *llcwrk = container_of(work,
+						struct smc_llc_send_work, work);
 	struct smc_wr_tx_pend_priv *pend;
 	struct smc_wr_buf *wr_buf;
 	int rc;
 
-	rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+	if (llcwrk->link->state == SMC_LNK_INACTIVE)
+		goto out;
+	rc = smc_llc_add_pending_send(llcwrk->link, &wr_buf, &pend);
 	if (rc)
-		return rc;
-	memcpy(wr_buf, llcbuf, llclen);
-	/* send llc message */
-	rc = smc_wr_tx_send(link, pend);
-	return rc;
+		goto out;
+	memcpy(wr_buf, &llcwrk->llcbuf, llcwrk->llclen);
+	smc_wr_tx_send(llcwrk->link, pend);
+out:
+	kfree(llcwrk);
+}
+
+/* copy llcbuf and schedule an llc send on link */
+static int smc_llc_send_message(struct smc_link *link, void *llcbuf, int llclen)
+{
+	struct smc_llc_send_work *wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC);
+
+	if (!wrk)
+		return -ENOMEM;
+	INIT_WORK(&wrk->work, smc_llc_send_message_work);
+	wrk->link = link;
+	wrk->llclen = llclen;
+	memcpy(&wrk->llcbuf, llcbuf, llclen);
+	queue_work(link->llc_wq, &wrk->work);
+	return 0;
 }
 
 /********************************* receive ***********************************/
@@ -381,17 +424,18 @@ static void smc_llc_rx_add_link(struct smc_link *link,
 		}
 
 		if (lgr->role == SMC_SERV) {
-			smc_llc_send_add_link(link,
+			smc_llc_prep_add_link(llc, link,
 					link->smcibdev->mac[link->ibport - 1],
 					&link->smcibdev->gid[link->ibport - 1],
 					SMC_LLC_REQ);
 
 		} else {
-			smc_llc_send_add_link(link,
+			smc_llc_prep_add_link(llc, link,
 					link->smcibdev->mac[link->ibport - 1],
 					&link->smcibdev->gid[link->ibport - 1],
 					SMC_LLC_RESP);
 		}
+		smc_llc_send_message(link, llc, sizeof(*llc));
 	}
 }
 
@@ -407,9 +451,11 @@ static void smc_llc_rx_delete_link(struct smc_link *link,
 	} else {
 		if (lgr->role == SMC_SERV) {
 			smc_lgr_forget(lgr);
-			smc_llc_send_delete_link(link, SMC_LLC_REQ);
+			smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ);
+			smc_llc_send_message(link, llc, sizeof(*llc));
 		} else {
-			smc_llc_send_delete_link(link, SMC_LLC_RESP);
+			smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP);
+			smc_llc_send_message(link, llc, sizeof(*llc));
 			smc_lgr_terminate(lgr);
 		}
 	}
@@ -559,11 +605,19 @@ static void smc_llc_testlink_work(struct work_struct *work)
 	}
 	next_interval = link->llc_testlink_time;
 out:
-	schedule_delayed_work(&link->llc_testlink_wrk, next_interval);
+	queue_delayed_work(link->llc_wq, &link->llc_testlink_wrk,
+			   next_interval);
 }
 
-void smc_llc_link_init(struct smc_link *link)
+int smc_llc_link_init(struct smc_link *link)
 {
+	struct smc_link_group *lgr = container_of(link, struct smc_link_group,
+						  lnk[SMC_SINGLE_LINK]);
+	link->llc_wq = alloc_ordered_workqueue("llc_wq-%x:%x)", WQ_MEM_RECLAIM,
+					       *((u32 *)lgr->id),
+					       link->link_id);
+	if (!link->llc_wq)
+		return -ENOMEM;
 	init_completion(&link->llc_confirm);
 	init_completion(&link->llc_confirm_resp);
 	init_completion(&link->llc_add);
@@ -571,6 +625,7 @@ void smc_llc_link_init(struct smc_link *link)
 	init_completion(&link->llc_confirm_rkey);
 	init_completion(&link->llc_testlink_resp);
 	INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work);
+	return 0;
 }
 
 void smc_llc_link_active(struct smc_link *link, int testlink_time)
@@ -578,8 +633,8 @@ void smc_llc_link_active(struct smc_link *link, int testlink_time)
 	link->state = SMC_LNK_ACTIVE;
 	if (testlink_time) {
 		link->llc_testlink_time = testlink_time * HZ;
-		schedule_delayed_work(&link->llc_testlink_wrk,
-				      link->llc_testlink_time);
+		queue_delayed_work(link->llc_wq, &link->llc_testlink_wrk,
+				   link->llc_testlink_time);
 	}
 }
 
@@ -591,9 +646,10 @@ void smc_llc_link_inactive(struct smc_link *link)
 }
 
 /* called in worker context */
-void smc_llc_link_flush(struct smc_link *link)
+void smc_llc_link_clear(struct smc_link *link)
 {
-	cancel_delayed_work_sync(&link->llc_testlink_wrk);
+	flush_workqueue(link->llc_wq);
+	destroy_workqueue(link->llc_wq);
 }
 
 /* register a new rtoken at the remote peer */
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
index d7f52a60c8d6..65c8645e96a1 100644
--- a/net/smc/smc_llc.h
+++ b/net/smc/smc_llc.h
@@ -42,10 +42,10 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], union ib_gid *gid,
 			  enum smc_llc_reqresp reqresp);
 int smc_llc_send_delete_link(struct smc_link *link,
 			     enum smc_llc_reqresp reqresp);
-void smc_llc_link_init(struct smc_link *link);
+int smc_llc_link_init(struct smc_link *link);
 void smc_llc_link_active(struct smc_link *link, int testlink_time);
 void smc_llc_link_inactive(struct smc_link *link);
-void smc_llc_link_flush(struct smc_link *link);
+void smc_llc_link_clear(struct smc_link *link);
 int smc_llc_do_confirm_rkey(struct smc_link *link,
 			    struct smc_buf_desc *rmb_desc);
 int smc_llc_init(void) __init;
-- 
cgit v1.2.3-59-g8ed1b


From 1401ea045b96ae1112bf384e6f03f909643f04be Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Tue, 15 May 2018 17:05:00 +0200
Subject: net/smc: handle all error codes from smc_conn_create()

Always set a reason_code when smc_conn_create() returns an error code.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 397ba2182453..ecf9ba68501b 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -463,6 +463,8 @@ static int smc_connect_rdma(struct smc_sock *smc)
 			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
 		else if (rc == -ENOLINK)
 			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
+		else
+			reason_code = SMC_CLC_DECL_INTERR; /* other error */
 		goto decline_rdma_unlock;
 	}
 	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
-- 
cgit v1.2.3-59-g8ed1b


From 3cf52eb11fa00b8dc8a773893fd808e3a09b3961 Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Tue, 15 May 2018 17:05:01 +0200
Subject: net/smc: set link inactive before calling smc_lgr_free()

Before smc_lgr_free() is called the link must be set inactive by calling
smc_llc_link_inactive().

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c   | 1 +
 net/smc/smc_core.c | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index ecf9ba68501b..d15762b057c0 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1644,6 +1644,7 @@ static void __exit smc_exit(void)
 	spin_unlock_bh(&smc_lgr_list.lock);
 	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
 		list_del_init(&lgr->list);
+		smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
 		cancel_delayed_work_sync(&lgr->free_work);
 		smc_lgr_free(lgr); /* free link group */
 	}
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 4523bbad8a15..a48ee00b65ff 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -148,8 +148,11 @@ static void smc_lgr_free_work(struct work_struct *work)
 	list_del_init(&lgr->list); /* remove from smc_lgr_list */
 free:
 	spin_unlock_bh(&smc_lgr_list.lock);
-	if (!delayed_work_pending(&lgr->free_work))
+	if (!delayed_work_pending(&lgr->free_work)) {
+		if (lgr->lnk[SMC_SINGLE_LINK].state != SMC_LNK_INACTIVE)
+			smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
 		smc_lgr_free(lgr);
+	}
 }
 
 /* create a new SMC link group */
-- 
cgit v1.2.3-59-g8ed1b


From 8f332a743cd1943c7d1a1fb806bb0261e35d1b65 Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Tue, 15 May 2018 17:05:02 +0200
Subject: net/smc: drop messages when link state is inactive

Drop incoming messages when the link is flagged as inactive.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_llc.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index 5d7926cc472c..5800a6b43d83 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -548,6 +548,8 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf)
 		return; /* short message */
 	if (llc->raw.hdr.length != sizeof(*llc))
 		return; /* invalid message */
+	if (link->state == SMC_LNK_INACTIVE)
+		return; /* link not active, drop msg */
 
 	switch (llc->raw.hdr.common.type) {
 	case SMC_LLC_TEST_LINK:
-- 
cgit v1.2.3-59-g8ed1b


From 517c300e8f041ca9a32a5b3204960d77ed1393b6 Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Tue, 15 May 2018 17:05:03 +0200
Subject: net/smc: check for pending termination

Avoid to run the processing in smc_lgr_terminate() more than once,
remember when the link group termination is triggered.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_clc.c  | 2 +-
 net/smc/smc_core.c | 5 ++++-
 net/smc/smc_core.h | 3 ++-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 3a988c22f627..236cb3f12c71 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -316,7 +316,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
 	if (clcm->type == SMC_CLC_DECLINE) {
 		reason_code = SMC_CLC_DECL_REPLY;
 		if (((struct smc_clc_msg_decline *)buf)->hdr.flag) {
-			smc->conn.lgr->sync_err = true;
+			smc->conn.lgr->sync_err = 1;
 			smc_lgr_terminate(smc->conn.lgr);
 		}
 	}
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index a48ee00b65ff..08c05cd0bbae 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -172,7 +172,7 @@ static int smc_lgr_create(struct smc_sock *smc,
 		goto out;
 	}
 	lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
-	lgr->sync_err = false;
+	lgr->sync_err = 0;
 	memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);
 	lgr->vlan_id = vlan_id;
 	rwlock_init(&lgr->sndbufs_lock);
@@ -352,6 +352,9 @@ void smc_lgr_terminate(struct smc_link_group *lgr)
 	struct smc_sock *smc;
 	struct rb_node *node;
 
+	if (lgr->terminating)
+		return;	/* lgr already terminating */
+	lgr->terminating = 1;
 	smc_lgr_forget(lgr);
 	smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
 
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index d1753850684b..845dc073de13 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -166,7 +166,8 @@ struct smc_link_group {
 
 	u8			id[SMC_LGR_ID_SIZE];	/* unique lgr id */
 	struct delayed_work	free_work;	/* delayed freeing of an lgr */
-	bool			sync_err;	/* lgr no longer fits to peer */
+	u8			sync_err : 1;	/* lgr no longer fits to peer */
+	u8			terminating : 1;/* lgr is terminating */
 };
 
 /* Find the connection associated with the given alert token in the link group.
-- 
cgit v1.2.3-59-g8ed1b


From 2578041011c43c38f3742e161cdb99b710d5a9f8 Mon Sep 17 00:00:00 2001
From: Debabrata Banerjee <dbanerje@akamai.com>
Date: Mon, 14 May 2018 14:48:07 -0400
Subject: bonding: don't queue up extraneous rlb updates

arps for incomplete entries can't be sent anyway.

Signed-off-by: Debabrata Banerjee <dbanerje@akamai.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_alb.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index a8f60982d483..9f7031ec56d5 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -421,7 +421,8 @@ static void rlb_clear_slave(struct bonding *bond, struct slave *slave)
 			if (assigned_slave) {
 				rx_hash_table[index].slave = assigned_slave;
 				if (!ether_addr_equal_64bits(rx_hash_table[index].mac_dst,
-							     mac_bcast)) {
+							     mac_bcast) &&
+				    !is_zero_ether_addr(rx_hash_table[index].mac_dst)) {
 					bond_info->rx_hashtbl[index].ntt = 1;
 					bond_info->rx_ntt = 1;
 					/* A slave has been removed from the
@@ -524,7 +525,8 @@ static void rlb_req_update_slave_clients(struct bonding *bond, struct slave *sla
 		client_info = &(bond_info->rx_hashtbl[hash_index]);
 
 		if ((client_info->slave == slave) &&
-		    !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast)) {
+		    !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
+		    !is_zero_ether_addr(client_info->mac_dst)) {
 			client_info->ntt = 1;
 			ntt = 1;
 		}
@@ -565,7 +567,8 @@ static void rlb_req_update_subnet_clients(struct bonding *bond, __be32 src_ip)
 		if ((client_info->ip_src == src_ip) &&
 		    !ether_addr_equal_64bits(client_info->slave->dev->dev_addr,
 					     bond->dev->dev_addr) &&
-		    !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast)) {
+		    !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
+		    !is_zero_ether_addr(client_info->mac_dst)) {
 			client_info->ntt = 1;
 			bond_info->rx_ntt = 1;
 		}
@@ -641,7 +644,8 @@ static struct slave *rlb_choose_channel(struct sk_buff *skb, struct bonding *bon
 		ether_addr_copy(client_info->mac_src, arp->mac_src);
 		client_info->slave = assigned_slave;
 
-		if (!ether_addr_equal_64bits(client_info->mac_dst, mac_bcast)) {
+		if (!ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
+		    !is_zero_ether_addr(client_info->mac_dst)) {
 			client_info->ntt = 1;
 			bond->alb_info.rx_ntt = 1;
 		} else {
@@ -733,8 +737,10 @@ static void rlb_rebalance(struct bonding *bond)
 		assigned_slave = __rlb_next_rx_slave(bond);
 		if (assigned_slave && (client_info->slave != assigned_slave)) {
 			client_info->slave = assigned_slave;
-			client_info->ntt = 1;
-			ntt = 1;
+			if (!is_zero_ether_addr(client_info->mac_dst)) {
+				client_info->ntt = 1;
+				ntt = 1;
+			}
 		}
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From cbeeea70de457ce4de89197d72943ab455b4172c Mon Sep 17 00:00:00 2001
From: Debabrata Banerjee <dbanerje@akamai.com>
Date: Mon, 14 May 2018 14:48:08 -0400
Subject: bonding: use common mac addr checks

Replace homegrown mac addr checks with faster defs from etherdevice.h

Note that this will also prevent any rlb arp updates for multicast
addresses, however this should have been forbidden anyway.

Signed-off-by: Debabrata Banerjee <dbanerje@akamai.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_alb.c | 28 +++++++++-------------------
 1 file changed, 9 insertions(+), 19 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 9f7031ec56d5..67fd1af1d1de 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -40,11 +40,6 @@
 #include <net/bonding.h>
 #include <net/bond_alb.h>
 
-
-
-static const u8 mac_bcast[ETH_ALEN + 2] __long_aligned = {
-	0xff, 0xff, 0xff, 0xff, 0xff, 0xff
-};
 static const u8 mac_v6_allmcast[ETH_ALEN + 2] __long_aligned = {
 	0x33, 0x33, 0x00, 0x00, 0x00, 0x01
 };
@@ -420,9 +415,7 @@ static void rlb_clear_slave(struct bonding *bond, struct slave *slave)
 
 			if (assigned_slave) {
 				rx_hash_table[index].slave = assigned_slave;
-				if (!ether_addr_equal_64bits(rx_hash_table[index].mac_dst,
-							     mac_bcast) &&
-				    !is_zero_ether_addr(rx_hash_table[index].mac_dst)) {
+				if (is_valid_ether_addr(rx_hash_table[index].mac_dst)) {
 					bond_info->rx_hashtbl[index].ntt = 1;
 					bond_info->rx_ntt = 1;
 					/* A slave has been removed from the
@@ -525,8 +518,7 @@ static void rlb_req_update_slave_clients(struct bonding *bond, struct slave *sla
 		client_info = &(bond_info->rx_hashtbl[hash_index]);
 
 		if ((client_info->slave == slave) &&
-		    !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
-		    !is_zero_ether_addr(client_info->mac_dst)) {
+		    is_valid_ether_addr(client_info->mac_dst)) {
 			client_info->ntt = 1;
 			ntt = 1;
 		}
@@ -567,8 +559,7 @@ static void rlb_req_update_subnet_clients(struct bonding *bond, __be32 src_ip)
 		if ((client_info->ip_src == src_ip) &&
 		    !ether_addr_equal_64bits(client_info->slave->dev->dev_addr,
 					     bond->dev->dev_addr) &&
-		    !ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
-		    !is_zero_ether_addr(client_info->mac_dst)) {
+		    is_valid_ether_addr(client_info->mac_dst)) {
 			client_info->ntt = 1;
 			bond_info->rx_ntt = 1;
 		}
@@ -596,7 +587,7 @@ static struct slave *rlb_choose_channel(struct sk_buff *skb, struct bonding *bon
 		if ((client_info->ip_src == arp->ip_src) &&
 		    (client_info->ip_dst == arp->ip_dst)) {
 			/* the entry is already assigned to this client */
-			if (!ether_addr_equal_64bits(arp->mac_dst, mac_bcast)) {
+			if (!is_broadcast_ether_addr(arp->mac_dst)) {
 				/* update mac address from arp */
 				ether_addr_copy(client_info->mac_dst, arp->mac_dst);
 			}
@@ -644,8 +635,7 @@ static struct slave *rlb_choose_channel(struct sk_buff *skb, struct bonding *bon
 		ether_addr_copy(client_info->mac_src, arp->mac_src);
 		client_info->slave = assigned_slave;
 
-		if (!ether_addr_equal_64bits(client_info->mac_dst, mac_bcast) &&
-		    !is_zero_ether_addr(client_info->mac_dst)) {
+		if (is_valid_ether_addr(client_info->mac_dst)) {
 			client_info->ntt = 1;
 			bond->alb_info.rx_ntt = 1;
 		} else {
@@ -1418,9 +1408,9 @@ netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 	case ETH_P_IP: {
 		const struct iphdr *iph = ip_hdr(skb);
 
-		if (ether_addr_equal_64bits(eth_data->h_dest, mac_bcast) ||
-		    (iph->daddr == ip_bcast) ||
-		    (iph->protocol == IPPROTO_IGMP)) {
+		if (is_broadcast_ether_addr(eth_data->h_dest) ||
+		    iph->daddr == ip_bcast ||
+		    iph->protocol == IPPROTO_IGMP) {
 			do_tx_balance = false;
 			break;
 		}
@@ -1432,7 +1422,7 @@ netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 		/* IPv6 doesn't really use broadcast mac address, but leave
 		 * that here just in case.
 		 */
-		if (ether_addr_equal_64bits(eth_data->h_dest, mac_bcast)) {
+		if (is_broadcast_ether_addr(eth_data->h_dest)) {
 			do_tx_balance = false;
 			break;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From e79c1055749e3183a2beee04a24da378623329c5 Mon Sep 17 00:00:00 2001
From: Debabrata Banerjee <dbanerje@akamai.com>
Date: Mon, 14 May 2018 14:48:09 -0400
Subject: bonding: allow use of tx hashing in balance-alb

The rx load balancing provided by balance-alb is not mutually
exclusive with using hashing for tx selection, and should provide a decent
speed increase because this eliminates spinlocks and cache contention.

Signed-off-by: Debabrata Banerjee <dbanerje@akamai.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_alb.c     | 20 ++++++++++++++++++--
 drivers/net/bonding/bond_main.c    | 25 +++++++++++++++----------
 drivers/net/bonding/bond_options.c |  2 +-
 include/net/bonding.h              | 11 +++++++++--
 4 files changed, 43 insertions(+), 15 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 67fd1af1d1de..e82108c917a6 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1478,8 +1478,24 @@ netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 	}
 
 	if (do_tx_balance) {
-		hash_index = _simple_hash(hash_start, hash_size);
-		tx_slave = tlb_choose_channel(bond, hash_index, skb->len);
+		if (bond->params.tlb_dynamic_lb) {
+			hash_index = _simple_hash(hash_start, hash_size);
+			tx_slave = tlb_choose_channel(bond, hash_index, skb->len);
+		} else {
+			/*
+			 * do_tx_balance means we are free to select the tx_slave
+			 * So we do exactly what tlb would do for hash selection
+			 */
+
+			struct bond_up_slave *slaves;
+			unsigned int count;
+
+			slaves = rcu_dereference(bond->slave_arr);
+			count = slaves ? READ_ONCE(slaves->count) : 0;
+			if (likely(count))
+				tx_slave = slaves->arr[bond_xmit_hash(bond, skb) %
+						       count];
+		}
 	}
 
 	return bond_do_alb_xmit(skb, bond, tx_slave);
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 4176e1d95f47..a4cd7f6bfd4d 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -159,7 +159,7 @@ module_param(min_links, int, 0);
 MODULE_PARM_DESC(min_links, "Minimum number of available links before turning on carrier");
 
 module_param(xmit_hash_policy, charp, 0);
-MODULE_PARM_DESC(xmit_hash_policy, "balance-xor and 802.3ad hashing method; "
+MODULE_PARM_DESC(xmit_hash_policy, "balance-alb, balance-tlb, balance-xor, 802.3ad hashing method; "
 				   "0 for layer 2 (default), 1 for layer 3+4, "
 				   "2 for layer 2+3, 3 for encap layer 2+3, "
 				   "4 for encap layer 3+4");
@@ -1735,7 +1735,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
 		unblock_netpoll_tx();
 	}
 
-	if (bond_mode_uses_xmit_hash(bond))
+	if (bond_mode_can_use_xmit_hash(bond))
 		bond_update_slave_arr(bond, NULL);
 
 	bond->nest_level = dev_get_nest_level(bond_dev);
@@ -1870,7 +1870,7 @@ static int __bond_release_one(struct net_device *bond_dev,
 	if (BOND_MODE(bond) == BOND_MODE_8023AD)
 		bond_3ad_unbind_slave(slave);
 
-	if (bond_mode_uses_xmit_hash(bond))
+	if (bond_mode_can_use_xmit_hash(bond))
 		bond_update_slave_arr(bond, slave);
 
 	netdev_info(bond_dev, "Releasing %s interface %s\n",
@@ -3102,7 +3102,7 @@ static int bond_slave_netdev_event(unsigned long event,
 		 * events. If these (miimon/arpmon) parameters are configured
 		 * then array gets refreshed twice and that should be fine!
 		 */
-		if (bond_mode_uses_xmit_hash(bond))
+		if (bond_mode_can_use_xmit_hash(bond))
 			bond_update_slave_arr(bond, NULL);
 		break;
 	case NETDEV_CHANGEMTU:
@@ -3322,7 +3322,7 @@ static int bond_open(struct net_device *bond_dev)
 		 */
 		if (bond_alb_initialize(bond, (BOND_MODE(bond) == BOND_MODE_ALB)))
 			return -ENOMEM;
-		if (bond->params.tlb_dynamic_lb)
+		if (bond->params.tlb_dynamic_lb || BOND_MODE(bond) == BOND_MODE_ALB)
 			queue_delayed_work(bond->wq, &bond->alb_work, 0);
 	}
 
@@ -3341,7 +3341,7 @@ static int bond_open(struct net_device *bond_dev)
 		bond_3ad_initiate_agg_selection(bond, 1);
 	}
 
-	if (bond_mode_uses_xmit_hash(bond))
+	if (bond_mode_can_use_xmit_hash(bond))
 		bond_update_slave_arr(bond, NULL);
 
 	return 0;
@@ -3894,7 +3894,7 @@ err:
  * to determine the slave interface -
  * (a) BOND_MODE_8023AD
  * (b) BOND_MODE_XOR
- * (c) BOND_MODE_TLB && tlb_dynamic_lb == 0
+ * (c) (BOND_MODE_TLB || BOND_MODE_ALB) && tlb_dynamic_lb == 0
  *
  * The caller is expected to hold RTNL only and NO other lock!
  */
@@ -3947,6 +3947,11 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave)
 			continue;
 		if (skipslave == slave)
 			continue;
+
+		netdev_dbg(bond->dev,
+			   "Adding slave dev %s to tx hash array[%d]\n",
+			   slave->dev->name, new_arr->count);
+
 		new_arr->arr[new_arr->count++] = slave;
 	}
 
@@ -4324,9 +4329,9 @@ static int bond_check_params(struct bond_params *params)
 	}
 
 	if (xmit_hash_policy) {
-		if ((bond_mode != BOND_MODE_XOR) &&
-		    (bond_mode != BOND_MODE_8023AD) &&
-		    (bond_mode != BOND_MODE_TLB)) {
+		if (bond_mode == BOND_MODE_ROUNDROBIN ||
+		    bond_mode == BOND_MODE_ACTIVEBACKUP ||
+		    bond_mode == BOND_MODE_BROADCAST) {
 			pr_info("xmit_hash_policy param is irrelevant in mode %s\n",
 				bond_mode_name(bond_mode));
 		} else {
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index 58c705f24f96..8a945c9341d6 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -395,7 +395,7 @@ static const struct bond_option bond_opts[BOND_OPT_LAST] = {
 		.id = BOND_OPT_TLB_DYNAMIC_LB,
 		.name = "tlb_dynamic_lb",
 		.desc = "Enable dynamic flow shuffling",
-		.unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_TLB)),
+		.unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_TLB) | BIT(BOND_MODE_ALB)),
 		.values = bond_tlb_dynamic_lb_tbl,
 		.flags = BOND_OPTFLAG_IFDOWN,
 		.set = bond_option_tlb_dynamic_lb_set,
diff --git a/include/net/bonding.h b/include/net/bonding.h
index b52235158836..808f1d167349 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -285,8 +285,15 @@ static inline bool bond_needs_speed_duplex(const struct bonding *bond)
 
 static inline bool bond_is_nondyn_tlb(const struct bonding *bond)
 {
-	return (BOND_MODE(bond) == BOND_MODE_TLB)  &&
-	       (bond->params.tlb_dynamic_lb == 0);
+	return (bond_is_lb(bond) && bond->params.tlb_dynamic_lb == 0);
+}
+
+static inline bool bond_mode_can_use_xmit_hash(const struct bonding *bond)
+{
+	return (BOND_MODE(bond) == BOND_MODE_8023AD ||
+		BOND_MODE(bond) == BOND_MODE_XOR ||
+		BOND_MODE(bond) == BOND_MODE_TLB ||
+		BOND_MODE(bond) == BOND_MODE_ALB);
 }
 
 static inline bool bond_mode_uses_xmit_hash(const struct bonding *bond)
-- 
cgit v1.2.3-59-g8ed1b


From 1386c36b30388f46a95100924bfcae75160db715 Mon Sep 17 00:00:00 2001
From: Debabrata Banerjee <dbanerje@akamai.com>
Date: Mon, 14 May 2018 14:48:10 -0400
Subject: bonding: allow carrier and link status to determine link state

In a mixed environment it may be difficult to tell if your hardware
support carrier, if it does not it can always report true. With a new
use_carrier option of 2, we can check both carrier and link status
sequentially, instead of one or the other

Signed-off-by: Debabrata Banerjee <dbanerje@akamai.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/bonding.txt |  4 ++--
 drivers/net/bonding/bond_main.c      | 12 ++++++++----
 drivers/net/bonding/bond_options.c   |  7 ++++---
 3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index c13214d073a4..86d07fbb592d 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -828,8 +828,8 @@ use_carrier
 	MII / ETHTOOL ioctl method to determine the link state.
 
 	A value of 1 enables the use of netif_carrier_ok(), a value of
-	0 will use the deprecated MII / ETHTOOL ioctls.  The default
-	value is 1.
+	0 will use the deprecated MII / ETHTOOL ioctls. A value of 2
+	will check both.  The default value is 1.
 
 xmit_hash_policy
 
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index a4cd7f6bfd4d..e4c253dc7dfb 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -132,7 +132,7 @@ MODULE_PARM_DESC(downdelay, "Delay before considering link down, "
 			    "in milliseconds");
 module_param(use_carrier, int, 0);
 MODULE_PARM_DESC(use_carrier, "Use netif_carrier_ok (vs MII ioctls) in miimon; "
-			      "0 for off, 1 for on (default)");
+			      "0 for off, 1 for on (default), 2 for carrier then legacy checks");
 module_param(mode, charp, 0);
 MODULE_PARM_DESC(mode, "Mode of operation; 0 for balance-rr, "
 		       "1 for active-backup, 2 for balance-xor, "
@@ -434,12 +434,16 @@ static int bond_check_dev_link(struct bonding *bond,
 	int (*ioctl)(struct net_device *, struct ifreq *, int);
 	struct ifreq ifr;
 	struct mii_ioctl_data *mii;
+	bool carrier = true;
 
 	if (!reporting && !netif_running(slave_dev))
 		return 0;
 
 	if (bond->params.use_carrier)
-		return netif_carrier_ok(slave_dev) ? BMSR_LSTATUS : 0;
+		carrier = netif_carrier_ok(slave_dev) ? BMSR_LSTATUS : 0;
+
+	if (!carrier)
+		return carrier;
 
 	/* Try to get link status using Ethtool first. */
 	if (slave_dev->ethtool_ops->get_link)
@@ -4403,8 +4407,8 @@ static int bond_check_params(struct bond_params *params)
 		downdelay = 0;
 	}
 
-	if ((use_carrier != 0) && (use_carrier != 1)) {
-		pr_warn("Warning: use_carrier module parameter (%d), not of valid value (0/1), so it was set to 1\n",
+	if (use_carrier < 0 || use_carrier > 2) {
+		pr_warn("Warning: use_carrier module parameter (%d), not of valid value (0-2), so it was set to 1\n",
 			use_carrier);
 		use_carrier = 1;
 	}
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index 8a945c9341d6..dba6cef05134 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -164,9 +164,10 @@ static const struct bond_opt_value bond_primary_reselect_tbl[] = {
 };
 
 static const struct bond_opt_value bond_use_carrier_tbl[] = {
-	{ "off", 0,  0},
-	{ "on",  1,  BOND_VALFLAG_DEFAULT},
-	{ NULL,  -1, 0}
+	{ "off",  0,  0},
+	{ "on",   1,  BOND_VALFLAG_DEFAULT},
+	{ "both", 2,  0},
+	{ NULL,  -1,  0}
 };
 
 static const struct bond_opt_value bond_all_slaves_active_tbl[] = {
-- 
cgit v1.2.3-59-g8ed1b


From 32f7b44d0f5661044fcfa84e9ad402ed9d759107 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 15 May 2018 10:50:31 +0200
Subject: sched: manipulate __QDISC_STATE_RUNNING in qdisc_run_* helpers

Currently NOLOCK qdiscs pay a measurable overhead to atomically
manipulate the __QDISC_STATE_RUNNING. Such bit is flipped twice per
packet in the uncontended scenario with packet rate below the
line rate: on packed dequeue and on the next, failing dequeue attempt.

This changeset moves the bit manipulation into the qdisc_run_{begin,end}
helpers, so that the bit is now flipped only once per packet, with
measurable performance improvement in the uncontended scenario.

This also allows simplifying the qdisc teardown code path - since
qdisc_is_running() is now effective for each qdisc type - and avoid a
possible race between qdisc_run() and dev_deactivate_many(), as now
the some_qdisc_is_busy() can properly detect NOLOCK qdiscs being busy
dequeuing packets.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 10 +++++++++-
 net/core/dev.c            |  2 +-
 net/sched/sch_generic.c   | 31 +++++++++----------------------
 3 files changed, 19 insertions(+), 24 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 5154c8300262..4d2b37226e75 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -113,13 +113,19 @@ static inline void qdisc_refcount_inc(struct Qdisc *qdisc)
 
 static inline bool qdisc_is_running(const struct Qdisc *qdisc)
 {
+	if (qdisc->flags & TCQ_F_NOLOCK)
+		return test_bit(__QDISC_STATE_RUNNING, &qdisc->state);
 	return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
 }
 
 static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 {
-	if (qdisc_is_running(qdisc))
+	if (qdisc->flags & TCQ_F_NOLOCK) {
+		if (test_and_set_bit(__QDISC_STATE_RUNNING, &qdisc->state))
+			return false;
+	} else if (qdisc_is_running(qdisc)) {
 		return false;
+	}
 	/* Variant of write_seqcount_begin() telling lockdep a trylock
 	 * was attempted.
 	 */
@@ -131,6 +137,8 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 static inline void qdisc_run_end(struct Qdisc *qdisc)
 {
 	write_seqcount_end(&qdisc->running);
+	if (qdisc->flags & TCQ_F_NOLOCK)
+		clear_bit(__QDISC_STATE_RUNNING, &qdisc->state);
 }
 
 static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
diff --git a/net/core/dev.c b/net/core/dev.c
index 9f4390182384..7ca19f47a92a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3244,7 +3244,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 			rc = NET_XMIT_DROP;
 		} else {
 			rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
-			__qdisc_run(q);
+			qdisc_run(q);
 		}
 
 		if (unlikely(to_free))
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 39c144b6ff98..ff3ce71aec93 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -373,33 +373,24 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
  */
 static inline bool qdisc_restart(struct Qdisc *q, int *packets)
 {
-	bool more, validate, nolock = q->flags & TCQ_F_NOLOCK;
 	spinlock_t *root_lock = NULL;
 	struct netdev_queue *txq;
 	struct net_device *dev;
 	struct sk_buff *skb;
+	bool validate;
 
 	/* Dequeue packet */
-	if (nolock && test_and_set_bit(__QDISC_STATE_RUNNING, &q->state))
-		return false;
-
 	skb = dequeue_skb(q, &validate, packets);
-	if (unlikely(!skb)) {
-		if (nolock)
-			clear_bit(__QDISC_STATE_RUNNING, &q->state);
+	if (unlikely(!skb))
 		return false;
-	}
 
-	if (!nolock)
+	if (!(q->flags & TCQ_F_NOLOCK))
 		root_lock = qdisc_lock(q);
 
 	dev = qdisc_dev(q);
 	txq = skb_get_tx_queue(dev, skb);
 
-	more = sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
-	if (nolock)
-		clear_bit(__QDISC_STATE_RUNNING, &q->state);
-	return more;
+	return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
 }
 
 void __qdisc_run(struct Qdisc *q)
@@ -1131,17 +1122,13 @@ static bool some_qdisc_is_busy(struct net_device *dev)
 		dev_queue = netdev_get_tx_queue(dev, i);
 		q = dev_queue->qdisc_sleeping;
 
-		if (q->flags & TCQ_F_NOLOCK) {
-			val = test_bit(__QDISC_STATE_SCHED, &q->state);
-		} else {
-			root_lock = qdisc_lock(q);
-			spin_lock_bh(root_lock);
+		root_lock = qdisc_lock(q);
+		spin_lock_bh(root_lock);
 
-			val = (qdisc_is_running(q) ||
-			       test_bit(__QDISC_STATE_SCHED, &q->state));
+		val = (qdisc_is_running(q) ||
+		       test_bit(__QDISC_STATE_SCHED, &q->state));
 
-			spin_unlock_bh(root_lock);
-		}
+		spin_unlock_bh(root_lock);
 
 		if (val)
 			return true;
-- 
cgit v1.2.3-59-g8ed1b


From 2f42a12832813519974ac814f5308f7a706b7ed0 Mon Sep 17 00:00:00 2001
From: Roman Mashak <mrv@mojatatu.com>
Date: Tue, 15 May 2018 10:14:40 -0400
Subject: tc-testing: fixed copy-pasting error in police tests

Signed-off-by: Roman Mashak <mrv@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/tc-testing/tc-tests/actions/police.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/police.json b/tools/testing/selftests/tc-testing/tc-tests/actions/police.json
index 38d85a1d7492..f03763d81617 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/actions/police.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/police.json
@@ -401,11 +401,11 @@
         ],
         "cmdUnderTest": "$TC actions add action police rate 10mbit burst 10k index 4294967295",
         "expExitCode": "0",
-        "verifyCmd": "$TC actions get action mirred index 4294967295",
+        "verifyCmd": "$TC actions get action police index 4294967295",
         "matchPattern": "action order [0-9]*:  police 0xffffffff rate 10Mbit burst 10Kb mtu 2Kb",
         "matchCount": "1",
         "teardown": [
-            "$TC actions flush action mirred"
+            "$TC actions flush action police"
         ]
     },
     {
-- 
cgit v1.2.3-59-g8ed1b


From 55df3e9754531060b07eb32a2ca5a981c0fd4b3d Mon Sep 17 00:00:00 2001
From: Roman Mashak <mrv@mojatatu.com>
Date: Tue, 15 May 2018 14:31:14 -0400
Subject: tc-testing: updated mirred and vlan with more tests

Added extra test cases for different control actions (reclassify, pipe
etc.), cookies, max values & exceeding maximum, and replace existing
actions unit tests.

Signed-off-by: Roman Mashak <mrv@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../tc-testing/tc-tests/actions/mirred.json        |  24 +-
 .../tc-testing/tc-tests/actions/vlan.json          | 320 +++++++++++++++++++--
 2 files changed, 324 insertions(+), 20 deletions(-)

diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json b/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json
index 443c9b3c8664..6e4edfae1799 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json
@@ -340,7 +340,7 @@
     },
     {
         "id": "8b69",
-        "name": "Add mirred mirror action with maximum index",
+        "name": "Add mirred mirror action with index at 32-bit maximum",
         "category": [
             "actions",
             "mirred"
@@ -362,6 +362,28 @@
             "$TC actions flush action mirred"
         ]
     },
+    {
+        "id": "3f66",
+        "name": "Add mirred mirror action with index exceeding 32-bit maximum",
+        "category": [
+            "actions",
+            "mirred"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action mirred",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action mirred ingress mirror dev lo pipe index 429496729555",
+        "expExitCode": "255",
+        "verifyCmd": "$TC actions get action mirred index 429496729555",
+        "matchPattern": "action order [0-9]*: mirred \\(Ingress Mirror to device lo\\) pipe.*index 429496729555",
+        "matchCount": "0",
+        "teardown": []
+    },
     {
         "id": "a70e",
         "name": "Delete mirred mirror action",
diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/vlan.json b/tools/testing/selftests/tc-testing/tc-tests/actions/vlan.json
index 4510ddfa6e54..69ea09eefffc 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/actions/vlan.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/vlan.json
@@ -1,7 +1,7 @@
 [
     {
         "id": "6f5a",
-        "name": "Add vlan pop action",
+        "name": "Add vlan pop action with pipe opcode",
         "category": [
             "actions",
             "vlan"
@@ -14,18 +14,18 @@
                 255
             ]
         ],
-        "cmdUnderTest": "$TC actions add action vlan pop index 8",
+        "cmdUnderTest": "$TC actions add action vlan pop pipe index 8",
         "expExitCode": "0",
         "verifyCmd": "$TC actions list action vlan",
-        "matchPattern": "action order [0-9]+: vlan.*pop.*index 8 ref",
+        "matchPattern": "action order [0-9]+: vlan.*pop.*pipe.*index 8 ref",
         "matchCount": "1",
         "teardown": [
             "$TC actions flush action vlan"
         ]
     },
     {
-        "id": "ee6f",
-        "name": "Add vlan pop action with large index",
+        "id": "df35",
+        "name": "Add vlan pop action with pass opcode",
         "category": [
             "actions",
             "vlan"
@@ -38,10 +38,82 @@
                 255
             ]
         ],
-        "cmdUnderTest": "$TC actions add action vlan pop index 4294967295",
+        "cmdUnderTest": "$TC actions add action vlan pop pass index 8",
         "expExitCode": "0",
-        "verifyCmd": "$TC actions list action vlan",
-        "matchPattern": "action order [0-9]+: vlan.*pop.*index 4294967295 ref",
+        "verifyCmd": "$TC actions get action vlan index 8",
+        "matchPattern": "action order [0-9]+: vlan.*pop.*pass.*index 8 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action vlan"
+        ]
+    },
+    {
+        "id": "b0d4",
+        "name": "Add vlan pop action with drop opcode",
+        "category": [
+            "actions",
+            "vlan"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action vlan",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action vlan pop drop index 8",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action vlan index 8",
+        "matchPattern": "action order [0-9]+: vlan.*pop.*drop.*index 8 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action vlan"
+        ]
+    },
+    {
+        "id": "95ee",
+        "name": "Add vlan pop action with reclassify opcode",
+        "category": [
+            "actions",
+            "vlan"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action vlan",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action vlan pop reclassify index 8",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action vlan index 8",
+        "matchPattern": "action order [0-9]+: vlan.*pop.*reclassify.*index 8 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action vlan"
+        ]
+    },
+    {
+        "id": "0283",
+        "name": "Add vlan pop action with continue opcode",
+        "category": [
+            "actions",
+            "vlan"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action vlan",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action vlan pop continue index 8",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action vlan index 8",
+        "matchPattern": "action order [0-9]+: vlan.*pop.*continue.*index 8 ref",
         "matchCount": "1",
         "teardown": [
             "$TC actions flush action vlan"
@@ -95,6 +167,74 @@
             "$TC actions flush action vlan"
         ]
     },
+    {
+        "id": "a178",
+        "name": "Add vlan pop action with invalid opcode",
+        "category": [
+            "actions",
+            "vlan"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action vlan",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action vlan pop foo index 8",
+        "expExitCode": "255",
+        "verifyCmd": "$TC actions list action vlan",
+        "matchPattern": "action order [0-9]+: vlan.*pop.*foo.*index 8 ref",
+        "matchCount": "0",
+        "teardown": []
+    },
+    {
+        "id": "ee6f",
+        "name": "Add vlan pop action with index at 32-bit maximum",
+        "category": [
+            "actions",
+            "vlan"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action vlan",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action vlan pop index 4294967295",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions list action vlan",
+        "matchPattern": "action order [0-9]+: vlan.*pop.*index 4294967295 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action vlan"
+        ]
+    },
+    {
+        "id": "0dfa",
+        "name": "Add vlan pop action with index exceeding 32-bit maximum",
+        "category": [
+            "actions",
+            "vlan"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action vlan",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action vlan pop reclassify index 429496729599",
+        "expExitCode": "255",
+        "verifyCmd": "$TC actions get action vlan index 429496729599",
+        "matchPattern": "action order [0-9]+: vlan.*pop.reclassify.*index 429496729599",
+        "matchCount": "0",
+        "teardown": []
+    },
     {
         "id": "2b91",
         "name": "Add vlan invalid action",
@@ -115,13 +255,11 @@
         "verifyCmd": "$TC actions list action vlan",
         "matchPattern": "action order [0-9]+: vlan.*bad_mode",
         "matchCount": "0",
-        "teardown": [
-            "$TC actions flush action vlan"
-        ]
+        "teardown": []
     },
     {
         "id": "57fc",
-        "name": "Add vlan action with invalid protocol type",
+        "name": "Add vlan push action with invalid protocol type",
         "category": [
             "actions",
             "vlan"
@@ -139,9 +277,7 @@
         "verifyCmd": "$TC actions list action vlan",
         "matchPattern": "action order [0-9]+: vlan.*push",
         "matchCount": "0",
-        "teardown": [
-            "$TC actions flush action vlan"
-        ]
+        "teardown": []
     },
     {
         "id": "3989",
@@ -215,6 +351,30 @@
             "$TC actions flush action vlan"
         ]
     },
+    {
+        "id": "1f4b",
+        "name": "Add vlan push action with maximum 12-bit vlan ID",
+        "category": [
+            "actions",
+            "vlan"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action vlan",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action vlan push id 4094 index 1",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action vlan index 1",
+        "matchPattern": "action order [0-9]+: vlan.*push id 4094.*protocol 802.1Q.*priority 0.*index 1 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action vlan"
+        ]
+    },
     {
         "id": "1f7b",
         "name": "Add vlan push action with invalid vlan ID",
@@ -239,6 +399,30 @@
             "$TC actions flush action vlan"
         ]
     },
+    {
+        "id": "fe40",
+        "name": "Add vlan push action with maximum 3-bit IEEE 802.1p priority",
+        "category": [
+            "actions",
+            "vlan"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action vlan",
+                0,
+                1,
+                255
+            ]
+        ],
+        "cmdUnderTest": "$TC actions add action vlan push id 4 priority 7 reclassify index 1",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action vlan index 1",
+        "matchPattern": "action order [0-9]+: vlan.*push id 4.*protocol 802.1Q.*priority 7.*reclassify.*index 1 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action vlan"
+        ]
+    },
     {
         "id": "5d02",
         "name": "Add vlan push action with invalid IEEE 802.1p priority",
@@ -259,9 +443,7 @@
         "verifyCmd": "$TC actions list action vlan",
         "matchPattern": "action order [0-9]+: vlan.*push id 5.*index 1 ref",
         "matchCount": "0",
-        "teardown": [
-            "$TC actions flush action vlan"
-        ]
+        "teardown": []
     },
     {
         "id": "6812",
@@ -311,6 +493,106 @@
             "$TC actions flush action vlan"
         ]
     },
+    {
+        "id": "3deb",
+        "name": "Replace existing vlan push action with new ID",
+        "category": [
+            "actions",
+            "vlan"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action vlan",
+                0,
+                1,
+                255
+            ],
+            "$TC actions add action vlan push id 500 pipe index 12"
+        ],
+        "cmdUnderTest": "$TC actions replace action vlan push id 700 pipe index 12",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action vlan index 12",
+        "matchPattern": "action order [0-9]+: vlan.*push id 700 protocol 802.1Q priority 0 pipe.*index 12 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action vlan"
+        ]
+    },
+    {
+        "id": "9e76",
+        "name": "Replace existing vlan push action with new protocol",
+        "category": [
+            "actions",
+            "vlan"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action vlan",
+                0,
+                1,
+                255
+            ],
+            "$TC actions add action vlan push id 1 protocol 802.1Q pipe index 1"
+        ],
+        "cmdUnderTest": "$TC actions replace action vlan push id 1 protocol 802.1ad pipe index 1",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action vlan index 1",
+        "matchPattern": "action order [0-9]+: vlan.*push id 1 protocol 802.1ad priority 0 pipe.*index 1 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action vlan"
+        ]
+    },
+    {
+        "id": "ede4",
+        "name": "Replace existing vlan push action with new priority",
+        "category": [
+            "actions",
+            "vlan"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action vlan",
+                0,
+                1,
+                255
+            ],
+            "$TC actions add action vlan push id 1 protocol 802.1Q priority 3 reclassify index 1"
+        ],
+        "cmdUnderTest": "$TC actions replace action vlan push id 1 priority 4 reclassify index 1",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action vlan index 1",
+        "matchPattern": "action order [0-9]+: vlan.*push id 1 protocol 802.1Q priority 4 reclassify.*index 1 ref",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action vlan"
+        ]
+    },
+    {
+        "id": "d413",
+        "name": "Replace existing vlan pop action with new cookie",
+        "category": [
+            "actions",
+            "vlan"
+        ],
+        "setup": [
+            [
+                "$TC actions flush action vlan",
+                0,
+                1,
+                255
+            ],
+            "$TC actions add action vlan pop continue index 1 cookie 22334455"
+        ],
+        "cmdUnderTest": "$TC actions replace action vlan pop continue index 1 cookie a1b1c2d1",
+        "expExitCode": "0",
+        "verifyCmd": "$TC actions get action vlan index 1",
+        "matchPattern": "action order [0-9]+: vlan.*pop continue.*index 1 ref.*cookie a1b1c2d1",
+        "matchCount": "1",
+        "teardown": [
+            "$TC actions flush action vlan"
+        ]
+    },
     {
         "id": "83a4",
         "name": "Delete vlan pop action",
@@ -385,7 +667,7 @@
     },
     {
         "id": "1d78",
-        "name": "Add vlan action with cookie",
+        "name": "Add vlan push action with cookie",
         "category": [
             "actions",
             "vlan"
-- 
cgit v1.2.3-59-g8ed1b


From b3c898e20b1881b0876c3e811c58b039b37dd5fd Mon Sep 17 00:00:00 2001
From: Debabrata Banerjee <dbanerje@akamai.com>
Date: Wed, 16 May 2018 14:02:13 -0400
Subject: Revert "bonding: allow carrier and link status to determine link
 state"

This reverts commit 1386c36b30388f46a95100924bfcae75160db715.

We don't want to encourage drivers to not report carrier status
correctly, therefore remove this commit.

Signed-off-by: Debabrata Banerjee <dbanerje@akamai.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/bonding.txt |  4 ++--
 drivers/net/bonding/bond_main.c      | 12 ++++--------
 drivers/net/bonding/bond_options.c   |  7 +++----
 3 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index 86d07fbb592d..c13214d073a4 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -828,8 +828,8 @@ use_carrier
 	MII / ETHTOOL ioctl method to determine the link state.
 
 	A value of 1 enables the use of netif_carrier_ok(), a value of
-	0 will use the deprecated MII / ETHTOOL ioctls. A value of 2
-	will check both.  The default value is 1.
+	0 will use the deprecated MII / ETHTOOL ioctls.  The default
+	value is 1.
 
 xmit_hash_policy
 
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index e4c253dc7dfb..a4cd7f6bfd4d 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -132,7 +132,7 @@ MODULE_PARM_DESC(downdelay, "Delay before considering link down, "
 			    "in milliseconds");
 module_param(use_carrier, int, 0);
 MODULE_PARM_DESC(use_carrier, "Use netif_carrier_ok (vs MII ioctls) in miimon; "
-			      "0 for off, 1 for on (default), 2 for carrier then legacy checks");
+			      "0 for off, 1 for on (default)");
 module_param(mode, charp, 0);
 MODULE_PARM_DESC(mode, "Mode of operation; 0 for balance-rr, "
 		       "1 for active-backup, 2 for balance-xor, "
@@ -434,16 +434,12 @@ static int bond_check_dev_link(struct bonding *bond,
 	int (*ioctl)(struct net_device *, struct ifreq *, int);
 	struct ifreq ifr;
 	struct mii_ioctl_data *mii;
-	bool carrier = true;
 
 	if (!reporting && !netif_running(slave_dev))
 		return 0;
 
 	if (bond->params.use_carrier)
-		carrier = netif_carrier_ok(slave_dev) ? BMSR_LSTATUS : 0;
-
-	if (!carrier)
-		return carrier;
+		return netif_carrier_ok(slave_dev) ? BMSR_LSTATUS : 0;
 
 	/* Try to get link status using Ethtool first. */
 	if (slave_dev->ethtool_ops->get_link)
@@ -4407,8 +4403,8 @@ static int bond_check_params(struct bond_params *params)
 		downdelay = 0;
 	}
 
-	if (use_carrier < 0 || use_carrier > 2) {
-		pr_warn("Warning: use_carrier module parameter (%d), not of valid value (0-2), so it was set to 1\n",
+	if ((use_carrier != 0) && (use_carrier != 1)) {
+		pr_warn("Warning: use_carrier module parameter (%d), not of valid value (0/1), so it was set to 1\n",
 			use_carrier);
 		use_carrier = 1;
 	}
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index dba6cef05134..8a945c9341d6 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -164,10 +164,9 @@ static const struct bond_opt_value bond_primary_reselect_tbl[] = {
 };
 
 static const struct bond_opt_value bond_use_carrier_tbl[] = {
-	{ "off",  0,  0},
-	{ "on",   1,  BOND_VALFLAG_DEFAULT},
-	{ "both", 2,  0},
-	{ NULL,  -1,  0}
+	{ "off", 0,  0},
+	{ "on",  1,  BOND_VALFLAG_DEFAULT},
+	{ NULL,  -1, 0}
 };
 
 static const struct bond_opt_value bond_all_slaves_active_tbl[] = {
-- 
cgit v1.2.3-59-g8ed1b


From c6213eb1aee308e67377fd1890d84f7284caf531 Mon Sep 17 00:00:00 2001
From: Grygorii Strashko <grygorii.strashko@ti.com>
Date: Tue, 15 May 2018 18:37:25 -0500
Subject: net: ethernet: ti: cpsw-phy-sel: check bus_find_device() ret value

This fixes klockworks warnings: Pointer 'dev' returned from call to
function 'bus_find_device' at line 179 may be NULL and will be dereferenced
at line 181.

    cpsw-phy-sel.c:179: 'dev' is assigned the return value from function 'bus_find_device'.
    bus.c:342: 'bus_find_device' explicitly returns a NULL value.
    cpsw-phy-sel.c:181: 'dev' is dereferenced by passing argument 1 to function 'dev_get_drvdata'.
    device.h:1024: 'dev' is passed to function 'dev_get_drvdata'.
    device.h:1026: 'dev' is explicitly dereferenced.

Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
[nsekhar@ti.com: add an error message, fix return path]
Signed-off-by: Sekhar Nori <nsekhar@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/cpsw-phy-sel.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ti/cpsw-phy-sel.c b/drivers/net/ethernet/ti/cpsw-phy-sel.c
index 18013645e76c..0c1adad7415d 100644
--- a/drivers/net/ethernet/ti/cpsw-phy-sel.c
+++ b/drivers/net/ethernet/ti/cpsw-phy-sel.c
@@ -177,12 +177,18 @@ void cpsw_phy_sel(struct device *dev, phy_interface_t phy_mode, int slave)
 	}
 
 	dev = bus_find_device(&platform_bus_type, NULL, node, match);
-	of_node_put(node);
+	if (!dev) {
+		dev_err(dev, "unable to find platform device for %pOF\n", node);
+		goto out;
+	}
+
 	priv = dev_get_drvdata(dev);
 
 	priv->cpsw_phy_sel(priv, phy_mode, slave);
 
 	put_device(dev);
+out:
+	of_node_put(node);
 }
 EXPORT_SYMBOL_GPL(cpsw_phy_sel);
 
-- 
cgit v1.2.3-59-g8ed1b


From 6d07a68a59274fa168a3ddfbcfe5df029ba0afed Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Tue, 15 May 2018 16:56:18 -0700
Subject: of: mdio: Fall back to mdiobus_register() with NULL device_node

When the device_node specified is NULL, fall back to mdiobus_register().
We have a number of drivers having a similar pattern which is:

if (np)
	of_mdiobus_register()
else
	mdiobus_register()

so incorporate that behavior within the core of_mdiobus_register()
function. This is also consistent with the stub version that we defined
when CONFIG_OF=n.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/of/of_mdio.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c
index 8c0c92712fc9..d963baf8e53a 100644
--- a/drivers/of/of_mdio.c
+++ b/drivers/of/of_mdio.c
@@ -204,6 +204,9 @@ int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np)
 	bool scanphys = false;
 	int addr, rc;
 
+	if (!np)
+		return mdiobus_register(mdio);
+
 	/* Do not continue if the node is disabled */
 	if (!of_device_is_available(np))
 		return -ENODEV;
-- 
cgit v1.2.3-59-g8ed1b


From 00e798c7d1ea5c4514401f17db8300db934291cb Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Tue, 15 May 2018 16:56:19 -0700
Subject: drivers: net: Remove device_node checks with of_mdiobus_register()

A number of drivers have the following pattern:

if (np)
	of_mdiobus_register()
else
	mdiobus_register()

which the implementation of of_mdiobus_register() now takes care of.
Remove that pattern in drivers that strictly adhere to it.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Grygorii Strashko <grygorii.strashko@ti.com>
Reviewed-by: Fugang Duan <fugang.duan@nxp.com>
Reviewed-by: Antoine Tenart <antoine.tenart@bootlin.com>
Reviewed-by: Jose Abreu <joabreu@synopsys.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/bcm_sf2.c                         |  8 ++------
 drivers/net/dsa/mv88e6xxx/chip.c                  |  5 +----
 drivers/net/ethernet/cadence/macb_main.c          | 12 +++---------
 drivers/net/ethernet/freescale/fec_main.c         |  8 ++------
 drivers/net/ethernet/marvell/mvmdio.c             |  5 +----
 drivers/net/ethernet/renesas/sh_eth.c             | 11 +++--------
 drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c |  5 +----
 drivers/net/ethernet/ti/davinci_mdio.c            |  8 +++-----
 drivers/net/phy/mdio-gpio.c                       |  6 +-----
 drivers/net/phy/mdio-mscc-miim.c                  |  6 +-----
 drivers/net/usb/lan78xx.c                         |  7 ++-----
 11 files changed, 20 insertions(+), 61 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index ac621f44237a..02e8982519ce 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -450,12 +450,8 @@ static int bcm_sf2_mdio_register(struct dsa_switch *ds)
 	priv->slave_mii_bus->parent = ds->dev->parent;
 	priv->slave_mii_bus->phy_mask = ~priv->indir_phy_mask;
 
-	if (dn)
-		err = of_mdiobus_register(priv->slave_mii_bus, dn);
-	else
-		err = mdiobus_register(priv->slave_mii_bus);
-
-	if (err)
+	err = of_mdiobus_register(priv->slave_mii_bus, dn);
+	if (err && dn)
 		of_node_put(dn);
 
 	return err;
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index b23c11d9f4b2..2bb3f03ee1cb 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -2454,10 +2454,7 @@ static int mv88e6xxx_mdio_register(struct mv88e6xxx_chip *chip,
 			return err;
 	}
 
-	if (np)
-		err = of_mdiobus_register(bus, np);
-	else
-		err = mdiobus_register(bus);
+	err = of_mdiobus_register(bus, np);
 	if (err) {
 		dev_err(chip->dev, "Cannot register MDIO bus (%d)\n", err);
 		mv88e6xxx_g2_irq_mdio_free(chip, bus);
diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index b4c9268100bb..3e93df5d4e3b 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -591,16 +591,10 @@ static int macb_mii_init(struct macb *bp)
 	dev_set_drvdata(&bp->dev->dev, bp->mii_bus);
 
 	np = bp->pdev->dev.of_node;
+	if (pdata)
+		bp->mii_bus->phy_mask = pdata->phy_mask;
 
-	if (np) {
-		err = of_mdiobus_register(bp->mii_bus, np);
-	} else {
-		if (pdata)
-			bp->mii_bus->phy_mask = pdata->phy_mask;
-
-		err = mdiobus_register(bp->mii_bus);
-	}
-
+	err = of_mdiobus_register(bp->mii_bus, np);
 	if (err)
 		goto err_out_free_mdiobus;
 
diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index d4604bc8eb5b..f3e43db0d6cb 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -2052,13 +2052,9 @@ static int fec_enet_mii_init(struct platform_device *pdev)
 	fep->mii_bus->parent = &pdev->dev;
 
 	node = of_get_child_by_name(pdev->dev.of_node, "mdio");
-	if (node) {
-		err = of_mdiobus_register(fep->mii_bus, node);
+	err = of_mdiobus_register(fep->mii_bus, node);
+	if (node)
 		of_node_put(node);
-	} else {
-		err = mdiobus_register(fep->mii_bus);
-	}
-
 	if (err)
 		goto err_out_free_mdiobus;
 
diff --git a/drivers/net/ethernet/marvell/mvmdio.c b/drivers/net/ethernet/marvell/mvmdio.c
index 0495487f7b42..c5dac6bd2be4 100644
--- a/drivers/net/ethernet/marvell/mvmdio.c
+++ b/drivers/net/ethernet/marvell/mvmdio.c
@@ -348,10 +348,7 @@ static int orion_mdio_probe(struct platform_device *pdev)
 		goto out_mdio;
 	}
 
-	if (pdev->dev.of_node)
-		ret = of_mdiobus_register(bus, pdev->dev.of_node);
-	else
-		ret = mdiobus_register(bus);
+	ret = of_mdiobus_register(bus, pdev->dev.of_node);
 	if (ret < 0) {
 		dev_err(&pdev->dev, "Cannot register MDIO bus (%d)\n", ret);
 		goto out_mdio;
diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index 5970d9e5ddf1..8dd41e08a6c6 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -3025,15 +3025,10 @@ static int sh_mdio_init(struct sh_eth_private *mdp,
 		 pdev->name, pdev->id);
 
 	/* register MDIO bus */
-	if (dev->of_node) {
-		ret = of_mdiobus_register(mdp->mii_bus, dev->of_node);
-	} else {
-		if (pd->phy_irq > 0)
-			mdp->mii_bus->irq[pd->phy] = pd->phy_irq;
-
-		ret = mdiobus_register(mdp->mii_bus);
-	}
+	if (pd->phy_irq > 0)
+		mdp->mii_bus->irq[pd->phy] = pd->phy_irq;
 
+	ret = of_mdiobus_register(mdp->mii_bus, dev->of_node);
 	if (ret)
 		goto out_free_bus;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
index f5f37bfa1d58..5df1a608e566 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
@@ -233,10 +233,7 @@ int stmmac_mdio_register(struct net_device *ndev)
 	new_bus->phy_mask = mdio_bus_data->phy_mask;
 	new_bus->parent = priv->device;
 
-	if (mdio_node)
-		err = of_mdiobus_register(new_bus, mdio_node);
-	else
-		err = mdiobus_register(new_bus);
+	err = of_mdiobus_register(new_bus, mdio_node);
 	if (err != 0) {
 		dev_err(dev, "Cannot register the MDIO bus\n");
 		goto bus_register_fail;
diff --git a/drivers/net/ethernet/ti/davinci_mdio.c b/drivers/net/ethernet/ti/davinci_mdio.c
index 98a1c97fb95e..8ac72831af05 100644
--- a/drivers/net/ethernet/ti/davinci_mdio.c
+++ b/drivers/net/ethernet/ti/davinci_mdio.c
@@ -429,12 +429,10 @@ static int davinci_mdio_probe(struct platform_device *pdev)
 	 * defined to support backward compatibility with DTs which assume that
 	 * Davinci MDIO will always scan the bus for PHYs detection.
 	 */
-	if (dev->of_node && of_get_child_count(dev->of_node)) {
+	if (dev->of_node && of_get_child_count(dev->of_node))
 		data->skip_scan = true;
-		ret = of_mdiobus_register(data->bus, dev->of_node);
-	} else {
-		ret = mdiobus_register(data->bus);
-	}
+
+	ret = of_mdiobus_register(data->bus, dev->of_node);
 	if (ret)
 		goto bail_out;
 
diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c
index b501221819e1..4e4c8daf44c3 100644
--- a/drivers/net/phy/mdio-gpio.c
+++ b/drivers/net/phy/mdio-gpio.c
@@ -179,11 +179,7 @@ static int mdio_gpio_probe(struct platform_device *pdev)
 	if (!new_bus)
 		return -ENODEV;
 
-	if (pdev->dev.of_node)
-		ret = of_mdiobus_register(new_bus, pdev->dev.of_node);
-	else
-		ret = mdiobus_register(new_bus);
-
+	ret = of_mdiobus_register(new_bus, pdev->dev.of_node);
 	if (ret)
 		mdio_gpio_bus_deinit(&pdev->dev);
 
diff --git a/drivers/net/phy/mdio-mscc-miim.c b/drivers/net/phy/mdio-mscc-miim.c
index 8c689ccfdbca..badbc99bedd3 100644
--- a/drivers/net/phy/mdio-mscc-miim.c
+++ b/drivers/net/phy/mdio-mscc-miim.c
@@ -151,11 +151,7 @@ static int mscc_miim_probe(struct platform_device *pdev)
 		}
 	}
 
-	if (pdev->dev.of_node)
-		ret = of_mdiobus_register(bus, pdev->dev.of_node);
-	else
-		ret = mdiobus_register(bus);
-
+	ret = of_mdiobus_register(bus, pdev->dev.of_node);
 	if (ret < 0) {
 		dev_err(&pdev->dev, "Cannot register MDIO bus (%d)\n", ret);
 		return ret;
diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index 91761436709a..8dff87ec6d99 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -1843,12 +1843,9 @@ static int lan78xx_mdio_init(struct lan78xx_net *dev)
 	}
 
 	node = of_get_child_by_name(dev->udev->dev.of_node, "mdio");
-	if (node) {
-		ret = of_mdiobus_register(dev->mdiobus, node);
+	ret = of_mdiobus_register(dev->mdiobus, node);
+	if (node)
 		of_node_put(node);
-	} else {
-		ret = mdiobus_register(dev->mdiobus);
-	}
 	if (ret) {
 		netdev_err(dev->net, "can't register MDIO bus\n");
 		goto exit1;
-- 
cgit v1.2.3-59-g8ed1b


From 1eece799d3f611a28a25319aabbccd4ac948098f Mon Sep 17 00:00:00 2001
From: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Date: Tue, 15 May 2018 18:52:00 -0600
Subject: net: qualcomm: rmnet: Capture all drops in transmit path

Packets in transmit path could potentially be dropped if there were
errors while adding the MAP header or the checksum header.
Increment the tx_drops stats in these cases.

Additionally, refactor the code to free the packet and increment
the tx_drops stat under a single label.

Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/qualcomm/rmnet/rmnet_handlers.c    | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
index 6fcd586e9804..7fd86d40a337 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
@@ -148,7 +148,7 @@ static int rmnet_map_egress_handler(struct sk_buff *skb,
 
 	if (skb_headroom(skb) < required_headroom) {
 		if (pskb_expand_head(skb, required_headroom, 0, GFP_KERNEL))
-			goto fail;
+			return -ENOMEM;
 	}
 
 	if (port->data_format & RMNET_FLAGS_EGRESS_MAP_CKSUMV4)
@@ -156,17 +156,13 @@ static int rmnet_map_egress_handler(struct sk_buff *skb,
 
 	map_header = rmnet_map_add_map_header(skb, additional_header_len, 0);
 	if (!map_header)
-		goto fail;
+		return -ENOMEM;
 
 	map_header->mux_id = mux_id;
 
 	skb->protocol = htons(ETH_P_MAP);
 
 	return 0;
-
-fail:
-	kfree_skb(skb);
-	return -ENOMEM;
 }
 
 static void
@@ -228,15 +224,18 @@ void rmnet_egress_handler(struct sk_buff *skb)
 	mux_id = priv->mux_id;
 
 	port = rmnet_get_port(skb->dev);
-	if (!port) {
-		kfree_skb(skb);
-		return;
-	}
+	if (!port)
+		goto drop;
 
 	if (rmnet_map_egress_handler(skb, port, mux_id, orig_dev))
-		return;
+		goto drop;
 
 	rmnet_vnd_tx_fixup(skb, orig_dev);
 
 	dev_queue_xmit(skb);
+	return;
+
+drop:
+	this_cpu_inc(priv->pcpu_stats->stats.tx_drops);
+	kfree_skb(skb);
 }
-- 
cgit v1.2.3-59-g8ed1b


From bbde32d38bfbbc4a6970498c7470a8a817122735 Mon Sep 17 00:00:00 2001
From: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Date: Tue, 15 May 2018 18:52:01 -0600
Subject: net: qualcomm: rmnet: Add support for ethtool private stats

Add ethtool private stats handler to debug the handling of packets
with checksum offload header / trailer. This allows to keep track of
the number of packets for which hardware computes the checksum and
counts and reasons where checksum computation was skipped in hardware
and was done in the network stack.

Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h | 13 +++++
 .../net/ethernet/qualcomm/rmnet/rmnet_map_data.c   | 64 ++++++++++++++++------
 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c    | 51 +++++++++++++++++
 3 files changed, 112 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
index 0b5b5da80198..34ac45a774e7 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
@@ -54,11 +54,24 @@ struct rmnet_pcpu_stats {
 	struct u64_stats_sync syncp;
 };
 
+struct rmnet_priv_stats {
+	u64 csum_ok;
+	u64 csum_valid_unset;
+	u64 csum_validation_failed;
+	u64 csum_err_bad_buffer;
+	u64 csum_err_invalid_ip_version;
+	u64 csum_err_invalid_transport;
+	u64 csum_fragmented_pkt;
+	u64 csum_skipped;
+	u64 csum_sw;
+};
+
 struct rmnet_priv {
 	u8 mux_id;
 	struct net_device *real_dev;
 	struct rmnet_pcpu_stats __percpu *pcpu_stats;
 	struct gro_cells gro_cells;
+	struct rmnet_priv_stats stats;
 };
 
 struct rmnet_port *rmnet_get_port(struct net_device *real_dev);
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
index a6ea09416f8d..57a9c314a665 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
@@ -48,7 +48,8 @@ static __sum16 *rmnet_map_get_csum_field(unsigned char protocol,
 
 static int
 rmnet_map_ipv4_dl_csum_trailer(struct sk_buff *skb,
-			       struct rmnet_map_dl_csum_trailer *csum_trailer)
+			       struct rmnet_map_dl_csum_trailer *csum_trailer,
+			       struct rmnet_priv *priv)
 {
 	__sum16 *csum_field, csum_temp, pseudo_csum, hdr_csum, ip_payload_csum;
 	u16 csum_value, csum_value_final;
@@ -58,19 +59,25 @@ rmnet_map_ipv4_dl_csum_trailer(struct sk_buff *skb,
 
 	ip4h = (struct iphdr *)(skb->data);
 	if ((ntohs(ip4h->frag_off) & IP_MF) ||
-	    ((ntohs(ip4h->frag_off) & IP_OFFSET) > 0))
+	    ((ntohs(ip4h->frag_off) & IP_OFFSET) > 0)) {
+		priv->stats.csum_fragmented_pkt++;
 		return -EOPNOTSUPP;
+	}
 
 	txporthdr = skb->data + ip4h->ihl * 4;
 
 	csum_field = rmnet_map_get_csum_field(ip4h->protocol, txporthdr);
 
-	if (!csum_field)
+	if (!csum_field) {
+		priv->stats.csum_err_invalid_transport++;
 		return -EPROTONOSUPPORT;
+	}
 
 	/* RFC 768 - Skip IPv4 UDP packets where sender checksum field is 0 */
-	if (*csum_field == 0 && ip4h->protocol == IPPROTO_UDP)
+	if (*csum_field == 0 && ip4h->protocol == IPPROTO_UDP) {
+		priv->stats.csum_skipped++;
 		return 0;
+	}
 
 	csum_value = ~ntohs(csum_trailer->csum_value);
 	hdr_csum = ~ip_fast_csum(ip4h, (int)ip4h->ihl);
@@ -102,16 +109,20 @@ rmnet_map_ipv4_dl_csum_trailer(struct sk_buff *skb,
 		}
 	}
 
-	if (csum_value_final == ntohs((__force __be16)*csum_field))
+	if (csum_value_final == ntohs((__force __be16)*csum_field)) {
+		priv->stats.csum_ok++;
 		return 0;
-	else
+	} else {
+		priv->stats.csum_validation_failed++;
 		return -EINVAL;
+	}
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
 static int
 rmnet_map_ipv6_dl_csum_trailer(struct sk_buff *skb,
-			       struct rmnet_map_dl_csum_trailer *csum_trailer)
+			       struct rmnet_map_dl_csum_trailer *csum_trailer,
+			       struct rmnet_priv *priv)
 {
 	__sum16 *csum_field, ip6_payload_csum, pseudo_csum, csum_temp;
 	u16 csum_value, csum_value_final;
@@ -125,8 +136,10 @@ rmnet_map_ipv6_dl_csum_trailer(struct sk_buff *skb,
 	txporthdr = skb->data + sizeof(struct ipv6hdr);
 	csum_field = rmnet_map_get_csum_field(ip6h->nexthdr, txporthdr);
 
-	if (!csum_field)
+	if (!csum_field) {
+		priv->stats.csum_err_invalid_transport++;
 		return -EPROTONOSUPPORT;
+	}
 
 	csum_value = ~ntohs(csum_trailer->csum_value);
 	ip6_hdr_csum = (__force __be16)
@@ -164,10 +177,13 @@ rmnet_map_ipv6_dl_csum_trailer(struct sk_buff *skb,
 		}
 	}
 
-	if (csum_value_final == ntohs((__force __be16)*csum_field))
+	if (csum_value_final == ntohs((__force __be16)*csum_field)) {
+		priv->stats.csum_ok++;
 		return 0;
-	else
+	} else {
+		priv->stats.csum_validation_failed++;
 		return -EINVAL;
+	}
 }
 #endif
 
@@ -339,24 +355,34 @@ struct sk_buff *rmnet_map_deaggregate(struct sk_buff *skb,
  */
 int rmnet_map_checksum_downlink_packet(struct sk_buff *skb, u16 len)
 {
+	struct rmnet_priv *priv = netdev_priv(skb->dev);
 	struct rmnet_map_dl_csum_trailer *csum_trailer;
 
-	if (unlikely(!(skb->dev->features & NETIF_F_RXCSUM)))
+	if (unlikely(!(skb->dev->features & NETIF_F_RXCSUM))) {
+		priv->stats.csum_sw++;
 		return -EOPNOTSUPP;
+	}
 
 	csum_trailer = (struct rmnet_map_dl_csum_trailer *)(skb->data + len);
 
-	if (!csum_trailer->valid)
+	if (!csum_trailer->valid) {
+		priv->stats.csum_valid_unset++;
 		return -EINVAL;
+	}
 
-	if (skb->protocol == htons(ETH_P_IP))
-		return rmnet_map_ipv4_dl_csum_trailer(skb, csum_trailer);
-	else if (skb->protocol == htons(ETH_P_IPV6))
+	if (skb->protocol == htons(ETH_P_IP)) {
+		return rmnet_map_ipv4_dl_csum_trailer(skb, csum_trailer, priv);
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
 #if IS_ENABLED(CONFIG_IPV6)
-		return rmnet_map_ipv6_dl_csum_trailer(skb, csum_trailer);
+		return rmnet_map_ipv6_dl_csum_trailer(skb, csum_trailer, priv);
 #else
+		priv->stats.csum_err_invalid_ip_version++;
 		return -EPROTONOSUPPORT;
 #endif
+	} else {
+		priv->stats.csum_err_invalid_ip_version++;
+		return -EPROTONOSUPPORT;
+	}
 
 	return 0;
 }
@@ -367,6 +393,7 @@ int rmnet_map_checksum_downlink_packet(struct sk_buff *skb, u16 len)
 void rmnet_map_checksum_uplink_packet(struct sk_buff *skb,
 				      struct net_device *orig_dev)
 {
+	struct rmnet_priv *priv = netdev_priv(orig_dev);
 	struct rmnet_map_ul_csum_header *ul_header;
 	void *iphdr;
 
@@ -389,8 +416,11 @@ void rmnet_map_checksum_uplink_packet(struct sk_buff *skb,
 			rmnet_map_ipv6_ul_csum_header(iphdr, ul_header, skb);
 			return;
 #else
+			priv->stats.csum_err_invalid_ip_version++;
 			goto sw_csum;
 #endif
+		} else {
+			priv->stats.csum_err_invalid_ip_version++;
 		}
 	}
 
@@ -399,4 +429,6 @@ sw_csum:
 	ul_header->csum_insert_offset = 0;
 	ul_header->csum_enabled = 0;
 	ul_header->udp_ip4_ind = 0;
+
+	priv->stats.csum_sw++;
 }
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c
index 2ea16a088de8..cb02e1a015c1 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c
@@ -152,6 +152,56 @@ static const struct net_device_ops rmnet_vnd_ops = {
 	.ndo_get_stats64 = rmnet_get_stats64,
 };
 
+static const char rmnet_gstrings_stats[][ETH_GSTRING_LEN] = {
+	"Checksum ok",
+	"Checksum valid bit not set",
+	"Checksum validation failed",
+	"Checksum error bad buffer",
+	"Checksum error bad ip version",
+	"Checksum error bad transport",
+	"Checksum skipped on ip fragment",
+	"Checksum skipped",
+	"Checksum computed in software",
+};
+
+static void rmnet_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
+{
+	switch (stringset) {
+	case ETH_SS_STATS:
+		memcpy(buf, &rmnet_gstrings_stats,
+		       sizeof(rmnet_gstrings_stats));
+		break;
+	}
+}
+
+static int rmnet_get_sset_count(struct net_device *dev, int sset)
+{
+	switch (sset) {
+	case ETH_SS_STATS:
+		return ARRAY_SIZE(rmnet_gstrings_stats);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static void rmnet_get_ethtool_stats(struct net_device *dev,
+				    struct ethtool_stats *stats, u64 *data)
+{
+	struct rmnet_priv *priv = netdev_priv(dev);
+	struct rmnet_priv_stats *st = &priv->stats;
+
+	if (!data)
+		return;
+
+	memcpy(data, st, ARRAY_SIZE(rmnet_gstrings_stats) * sizeof(u64));
+}
+
+static const struct ethtool_ops rmnet_ethtool_ops = {
+	.get_ethtool_stats = rmnet_get_ethtool_stats,
+	.get_strings = rmnet_get_strings,
+	.get_sset_count = rmnet_get_sset_count,
+};
+
 /* Called by kernel whenever a new rmnet<n> device is created. Sets MTU,
  * flags, ARP type, needed headroom, etc...
  */
@@ -170,6 +220,7 @@ void rmnet_vnd_setup(struct net_device *rmnet_dev)
 	rmnet_dev->flags &= ~(IFF_BROADCAST | IFF_MULTICAST);
 
 	rmnet_dev->needs_free_netdev = true;
+	rmnet_dev->ethtool_ops = &rmnet_ethtool_ops;
 }
 
 /* Exposed API */
-- 
cgit v1.2.3-59-g8ed1b


From 721ce0f644401c923a266cb084c40ebc58b18f93 Mon Sep 17 00:00:00 2001
From: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Date: Tue, 15 May 2018 18:52:02 -0600
Subject: net: qualcomm: rmnet: Remove redundant command check

The command packet size is already checked once in
rmnet_map_deaggregate() for the header, packet and trailer size, so
this additional check is not needed.

Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
index 78fdad0c6f76..56a93df962e6 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
@@ -69,17 +69,9 @@ static void rmnet_map_send_ack(struct sk_buff *skb,
 	struct rmnet_map_control_command *cmd;
 	int xmit_status;
 
-	if (port->data_format & RMNET_FLAGS_INGRESS_MAP_CKSUMV4) {
-		if (skb->len < sizeof(struct rmnet_map_header) +
-		    RMNET_MAP_GET_LENGTH(skb) +
-		    sizeof(struct rmnet_map_dl_csum_trailer)) {
-			kfree_skb(skb);
-			return;
-		}
-
-		skb_trim(skb, skb->len -
-			 sizeof(struct rmnet_map_dl_csum_trailer));
-	}
+	if (port->data_format & RMNET_FLAGS_INGRESS_MAP_CKSUMV4)
+		skb_trim(skb,
+			 skb->len - sizeof(struct rmnet_map_dl_csum_trailer));
 
 	skb->protocol = htons(ETH_P_MAP);
 
-- 
cgit v1.2.3-59-g8ed1b


From 9e6881d3665688d14b2ad4860f4e28af4ee02b63 Mon Sep 17 00:00:00 2001
From: Hemanth Puranik <hpuranik@codeaurora.org>
Date: Wed, 16 May 2018 06:40:53 +0530
Subject: net: qcom/emac: Encapsulate sgmii ops under one structure

This patch introduces ops structure for sgmii, This by ensures that
we do not need dummy functions in case of emulation platforms.

Signed-off-by: Hemanth Puranik <hpuranik@codeaurora.org>
Acked-by: Timur Tabi <timur@codeaurora.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qualcomm/emac/emac-mac.c   |   5 +-
 drivers/net/ethernet/qualcomm/emac/emac-sgmii.c | 128 ++++++++++++++----------
 drivers/net/ethernet/qualcomm/emac/emac-sgmii.h |  32 +++---
 drivers/net/ethernet/qualcomm/emac/emac.c       |   9 +-
 4 files changed, 103 insertions(+), 71 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/emac/emac-mac.c b/drivers/net/ethernet/qualcomm/emac/emac-mac.c
index d5a32b7c7dc5..092718a03786 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac-mac.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac-mac.c
@@ -920,14 +920,13 @@ static void emac_mac_rx_descs_refill(struct emac_adapter *adpt,
 static void emac_adjust_link(struct net_device *netdev)
 {
 	struct emac_adapter *adpt = netdev_priv(netdev);
-	struct emac_sgmii *sgmii = &adpt->phy;
 	struct phy_device *phydev = netdev->phydev;
 
 	if (phydev->link) {
 		emac_mac_start(adpt);
-		sgmii->link_up(adpt);
+		emac_sgmii_link_change(adpt, true);
 	} else {
-		sgmii->link_down(adpt);
+		emac_sgmii_link_change(adpt, false);
 		emac_mac_stop(adpt);
 	}
 
diff --git a/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c b/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c
index e8ab512ee7e3..562420b834df 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c
@@ -53,6 +53,46 @@
 
 #define SERDES_START_WAIT_TIMES			100
 
+int emac_sgmii_init(struct emac_adapter *adpt)
+{
+	if (!(adpt->phy.sgmii_ops && adpt->phy.sgmii_ops->init))
+		return 0;
+
+	return adpt->phy.sgmii_ops->init(adpt);
+}
+
+int emac_sgmii_open(struct emac_adapter *adpt)
+{
+	if (!(adpt->phy.sgmii_ops && adpt->phy.sgmii_ops->open))
+		return 0;
+
+	return adpt->phy.sgmii_ops->open(adpt);
+}
+
+void emac_sgmii_close(struct emac_adapter *adpt)
+{
+	if (!(adpt->phy.sgmii_ops && adpt->phy.sgmii_ops->close))
+		return;
+
+	adpt->phy.sgmii_ops->close(adpt);
+}
+
+int emac_sgmii_link_change(struct emac_adapter *adpt, bool link_state)
+{
+	if (!(adpt->phy.sgmii_ops && adpt->phy.sgmii_ops->link_change))
+		return 0;
+
+	return adpt->phy.sgmii_ops->link_change(adpt, link_state);
+}
+
+void emac_sgmii_reset(struct emac_adapter *adpt)
+{
+	if (!(adpt->phy.sgmii_ops && adpt->phy.sgmii_ops->reset))
+		return;
+
+	adpt->phy.sgmii_ops->reset(adpt);
+}
+
 /* Initialize the SGMII link between the internal and external PHYs. */
 static void emac_sgmii_link_init(struct emac_adapter *adpt)
 {
@@ -163,21 +203,21 @@ static void emac_sgmii_reset_prepare(struct emac_adapter *adpt)
 	msleep(50);
 }
 
-void emac_sgmii_reset(struct emac_adapter *adpt)
+static void emac_sgmii_common_reset(struct emac_adapter *adpt)
 {
 	int ret;
 
 	emac_sgmii_reset_prepare(adpt);
 	emac_sgmii_link_init(adpt);
 
-	ret = adpt->phy.initialize(adpt);
+	ret = emac_sgmii_init(adpt);
 	if (ret)
 		netdev_err(adpt->netdev,
 			   "could not reinitialize internal PHY (error=%i)\n",
 			   ret);
 }
 
-static int emac_sgmii_open(struct emac_adapter *adpt)
+static int emac_sgmii_common_open(struct emac_adapter *adpt)
 {
 	struct emac_sgmii *sgmii = &adpt->phy;
 	int ret;
@@ -201,43 +241,53 @@ static int emac_sgmii_open(struct emac_adapter *adpt)
 	return 0;
 }
 
-static int emac_sgmii_close(struct emac_adapter *adpt)
+static void emac_sgmii_common_close(struct emac_adapter *adpt)
 {
 	struct emac_sgmii *sgmii = &adpt->phy;
 
 	/* Make sure interrupts are disabled */
 	writel(0, sgmii->base + EMAC_SGMII_PHY_INTERRUPT_MASK);
 	free_irq(sgmii->irq, adpt);
-
-	return 0;
 }
 
 /* The error interrupts are only valid after the link is up */
-static int emac_sgmii_link_up(struct emac_adapter *adpt)
+static int emac_sgmii_common_link_change(struct emac_adapter *adpt, bool linkup)
 {
 	struct emac_sgmii *sgmii = &adpt->phy;
 	int ret;
 
-	/* Clear and enable interrupts */
-	ret = emac_sgmii_irq_clear(adpt, 0xff);
-	if (ret)
-		return ret;
+	if (linkup) {
+		/* Clear and enable interrupts */
+		ret = emac_sgmii_irq_clear(adpt, 0xff);
+		if (ret)
+			return ret;
 
-	writel(SGMII_ISR_MASK, sgmii->base + EMAC_SGMII_PHY_INTERRUPT_MASK);
+		writel(SGMII_ISR_MASK,
+		       sgmii->base + EMAC_SGMII_PHY_INTERRUPT_MASK);
+	} else {
+		/* Disable interrupts */
+		writel(0, sgmii->base + EMAC_SGMII_PHY_INTERRUPT_MASK);
+		synchronize_irq(sgmii->irq);
+	}
 
 	return 0;
 }
 
-static int emac_sgmii_link_down(struct emac_adapter *adpt)
-{
-	struct emac_sgmii *sgmii = &adpt->phy;
-
-	/* Disable interrupts */
-	writel(0, sgmii->base + EMAC_SGMII_PHY_INTERRUPT_MASK);
-	synchronize_irq(sgmii->irq);
+static struct sgmii_ops qdf2432_ops = {
+	.init = emac_sgmii_init_qdf2432,
+	.open = emac_sgmii_common_open,
+	.close = emac_sgmii_common_close,
+	.link_change = emac_sgmii_common_link_change,
+	.reset = emac_sgmii_common_reset,
+};
 
-	return 0;
-}
+static struct sgmii_ops qdf2400_ops = {
+	.init = emac_sgmii_init_qdf2400,
+	.open = emac_sgmii_common_open,
+	.close = emac_sgmii_common_close,
+	.link_change = emac_sgmii_common_link_change,
+	.reset = emac_sgmii_common_reset,
+};
 
 static int emac_sgmii_acpi_match(struct device *dev, void *data)
 {
@@ -249,7 +299,7 @@ static int emac_sgmii_acpi_match(struct device *dev, void *data)
 		{}
 	};
 	const struct acpi_device_id *id = acpi_match_device(match_table, dev);
-	emac_sgmii_function *initialize = data;
+	struct sgmii_ops **ops = data;
 
 	if (id) {
 		acpi_handle handle = ACPI_HANDLE(dev);
@@ -270,10 +320,10 @@ static int emac_sgmii_acpi_match(struct device *dev, void *data)
 
 		switch (hrv) {
 		case 1:
-			*initialize = emac_sgmii_init_qdf2432;
+			*ops = &qdf2432_ops;
 			return 1;
 		case 2:
-			*initialize = emac_sgmii_init_qdf2400;
+			*ops = &qdf2400_ops;
 			return 1;
 		}
 	}
@@ -294,14 +344,6 @@ static const struct of_device_id emac_sgmii_dt_match[] = {
 	{}
 };
 
-/* Dummy function for systems without an internal PHY. This avoids having
- * to check for NULL pointers before calling the functions.
- */
-static int emac_sgmii_dummy(struct emac_adapter *adpt)
-{
-	return 0;
-}
-
 int emac_sgmii_config(struct platform_device *pdev, struct emac_adapter *adpt)
 {
 	struct platform_device *sgmii_pdev = NULL;
@@ -312,22 +354,11 @@ int emac_sgmii_config(struct platform_device *pdev, struct emac_adapter *adpt)
 	if (has_acpi_companion(&pdev->dev)) {
 		struct device *dev;
 
-		dev = device_find_child(&pdev->dev, &phy->initialize,
+		dev = device_find_child(&pdev->dev, &phy->sgmii_ops,
 					emac_sgmii_acpi_match);
 
 		if (!dev) {
 			dev_warn(&pdev->dev, "cannot find internal phy node\n");
-			/* There is typically no internal PHY on emulation
-			 * systems, so if we can't find the node, assume
-			 * we are on an emulation system and stub-out
-			 * support for the internal PHY.  These systems only
-			 * use ACPI.
-			 */
-			phy->open = emac_sgmii_dummy;
-			phy->close = emac_sgmii_dummy;
-			phy->link_up = emac_sgmii_dummy;
-			phy->link_down = emac_sgmii_dummy;
-
 			return 0;
 		}
 
@@ -355,14 +386,9 @@ int emac_sgmii_config(struct platform_device *pdev, struct emac_adapter *adpt)
 			goto error_put_device;
 		}
 
-		phy->initialize = (emac_sgmii_function)match->data;
+		phy->sgmii_ops->init = match->data;
 	}
 
-	phy->open = emac_sgmii_open;
-	phy->close = emac_sgmii_close;
-	phy->link_up = emac_sgmii_link_up;
-	phy->link_down = emac_sgmii_link_down;
-
 	/* Base address is the first address */
 	res = platform_get_resource(sgmii_pdev, IORESOURCE_MEM, 0);
 	if (!res) {
@@ -386,7 +412,7 @@ int emac_sgmii_config(struct platform_device *pdev, struct emac_adapter *adpt)
 		}
 	}
 
-	ret = phy->initialize(adpt);
+	ret = emac_sgmii_init(adpt);
 	if (ret)
 		goto error;
 
diff --git a/drivers/net/ethernet/qualcomm/emac/emac-sgmii.h b/drivers/net/ethernet/qualcomm/emac/emac-sgmii.h
index e7c0c3b2baa4..31ba21eb61d2 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac-sgmii.h
+++ b/drivers/net/ethernet/qualcomm/emac/emac-sgmii.h
@@ -16,36 +16,44 @@
 struct emac_adapter;
 struct platform_device;
 
-typedef int (*emac_sgmii_function)(struct emac_adapter *adpt);
+/** emac_sgmii - internal emac phy
+ * @init initialization function
+ * @open called when the driver is opened
+ * @close called when the driver is closed
+ * @link_change called when the link state changes
+ */
+struct sgmii_ops {
+	int (*init)(struct emac_adapter *adpt);
+	int (*open)(struct emac_adapter *adpt);
+	void (*close)(struct emac_adapter *adpt);
+	int (*link_change)(struct emac_adapter *adpt, bool link_state);
+	void (*reset)(struct emac_adapter *adpt);
+};
 
 /** emac_sgmii - internal emac phy
  * @base base address
  * @digital per-lane digital block
  * @irq the interrupt number
  * @decode_error_count reference count of consecutive decode errors
- * @initialize initialization function
- * @open called when the driver is opened
- * @close called when the driver is closed
- * @link_up called when the link comes up
- * @link_down called when the link comes down
+ * @sgmii_ops sgmii ops
  */
 struct emac_sgmii {
 	void __iomem		*base;
 	void __iomem		*digital;
 	unsigned int		irq;
 	atomic_t		decode_error_count;
-	emac_sgmii_function	initialize;
-	emac_sgmii_function	open;
-	emac_sgmii_function	close;
-	emac_sgmii_function	link_up;
-	emac_sgmii_function	link_down;
+	struct	sgmii_ops	*sgmii_ops;
 };
 
 int emac_sgmii_config(struct platform_device *pdev, struct emac_adapter *adpt);
-void emac_sgmii_reset(struct emac_adapter *adpt);
 
 int emac_sgmii_init_fsm9900(struct emac_adapter *adpt);
 int emac_sgmii_init_qdf2432(struct emac_adapter *adpt);
 int emac_sgmii_init_qdf2400(struct emac_adapter *adpt);
 
+int emac_sgmii_init(struct emac_adapter *adpt);
+int emac_sgmii_open(struct emac_adapter *adpt);
+void emac_sgmii_close(struct emac_adapter *adpt);
+int emac_sgmii_link_change(struct emac_adapter *adpt, bool link_state);
+void emac_sgmii_reset(struct emac_adapter *adpt);
 #endif
diff --git a/drivers/net/ethernet/qualcomm/emac/emac.c b/drivers/net/ethernet/qualcomm/emac/emac.c
index 13235baf4766..2a0cbc535a2e 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac.c
@@ -253,7 +253,7 @@ static int emac_open(struct net_device *netdev)
 		return ret;
 	}
 
-	ret = adpt->phy.open(adpt);
+	ret = emac_sgmii_open(adpt);
 	if (ret) {
 		emac_mac_rx_tx_rings_free_all(adpt);
 		free_irq(irq->irq, irq);
@@ -264,7 +264,7 @@ static int emac_open(struct net_device *netdev)
 	if (ret) {
 		emac_mac_rx_tx_rings_free_all(adpt);
 		free_irq(irq->irq, irq);
-		adpt->phy.close(adpt);
+		emac_sgmii_close(adpt);
 		return ret;
 	}
 
@@ -278,7 +278,7 @@ static int emac_close(struct net_device *netdev)
 
 	mutex_lock(&adpt->reset_lock);
 
-	adpt->phy.close(adpt);
+	emac_sgmii_close(adpt);
 	emac_mac_down(adpt);
 	emac_mac_rx_tx_rings_free_all(adpt);
 
@@ -761,11 +761,10 @@ static void emac_shutdown(struct platform_device *pdev)
 {
 	struct net_device *netdev = dev_get_drvdata(&pdev->dev);
 	struct emac_adapter *adpt = netdev_priv(netdev);
-	struct emac_sgmii *sgmii = &adpt->phy;
 
 	if (netdev->flags & IFF_UP) {
 		/* Closing the SGMII turns off its interrupts */
-		sgmii->close(adpt);
+		emac_sgmii_close(adpt);
 
 		/* Resetting the MAC turns off all DMA and its interrupts */
 		emac_mac_reset(adpt);
-- 
cgit v1.2.3-59-g8ed1b


From 93120eba4890e0fcf1e42059d495a7d852db31ed Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Wed, 16 May 2018 18:59:19 +0800
Subject: net: stmmac: Remove useless test before clk_disable_unprepare

clk_disable_unprepare() already checks that the clock pointer is valid.
No need to test it before calling it.

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c | 24 +++++++-----------------
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
index 13133b30b575..f08625a02cea 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
@@ -1104,30 +1104,20 @@ static int gmac_clk_enable(struct rk_priv_data *bsp_priv, bool enable)
 	} else {
 		if (bsp_priv->clk_enabled) {
 			if (phy_iface == PHY_INTERFACE_MODE_RMII) {
-				if (!IS_ERR(bsp_priv->mac_clk_rx))
-					clk_disable_unprepare(
-						bsp_priv->mac_clk_rx);
+				clk_disable_unprepare(bsp_priv->mac_clk_rx);
 
-				if (!IS_ERR(bsp_priv->clk_mac_ref))
-					clk_disable_unprepare(
-						bsp_priv->clk_mac_ref);
+				clk_disable_unprepare(bsp_priv->clk_mac_ref);
 
-				if (!IS_ERR(bsp_priv->clk_mac_refout))
-					clk_disable_unprepare(
-						bsp_priv->clk_mac_refout);
+				clk_disable_unprepare(bsp_priv->clk_mac_refout);
 			}
 
-			if (!IS_ERR(bsp_priv->clk_phy))
-				clk_disable_unprepare(bsp_priv->clk_phy);
+			clk_disable_unprepare(bsp_priv->clk_phy);
 
-			if (!IS_ERR(bsp_priv->aclk_mac))
-				clk_disable_unprepare(bsp_priv->aclk_mac);
+			clk_disable_unprepare(bsp_priv->aclk_mac);
 
-			if (!IS_ERR(bsp_priv->pclk_mac))
-				clk_disable_unprepare(bsp_priv->pclk_mac);
+			clk_disable_unprepare(bsp_priv->pclk_mac);
 
-			if (!IS_ERR(bsp_priv->mac_clk_tx))
-				clk_disable_unprepare(bsp_priv->mac_clk_tx);
+			clk_disable_unprepare(bsp_priv->mac_clk_tx);
 			/**
 			 * if (!IS_ERR(bsp_priv->clk_mac))
 			 *	clk_disable_unprepare(bsp_priv->clk_mac);
-- 
cgit v1.2.3-59-g8ed1b


From 76e597eb15297d402685ee1442405d3eea70136b Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Wed, 16 May 2018 19:18:22 +0800
Subject: net: ethoc: Remove useless test before clk_disable_unprepare

clk_disable_unprepare() already checks that the clock pointer is valid.
No need to test it before calling it.

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Reviewed-by: Tobias Klauser <tklauser@distanz.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ethoc.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/ethoc.c b/drivers/net/ethernet/ethoc.c
index 8bb0db990c8f..00a57273b753 100644
--- a/drivers/net/ethernet/ethoc.c
+++ b/drivers/net/ethernet/ethoc.c
@@ -1246,8 +1246,7 @@ error:
 	mdiobus_unregister(priv->mdio);
 	mdiobus_free(priv->mdio);
 free2:
-	if (priv->clk)
-		clk_disable_unprepare(priv->clk);
+	clk_disable_unprepare(priv->clk);
 free:
 	free_netdev(netdev);
 out:
@@ -1271,8 +1270,7 @@ static int ethoc_remove(struct platform_device *pdev)
 			mdiobus_unregister(priv->mdio);
 			mdiobus_free(priv->mdio);
 		}
-		if (priv->clk)
-			clk_disable_unprepare(priv->clk);
+		clk_disable_unprepare(priv->clk);
 		unregister_netdev(netdev);
 		free_netdev(netdev);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 974f6c046ad90d071f96c76ff8ab29355a798c41 Mon Sep 17 00:00:00 2001
From: Michal Kalderon <Michal.Kalderon@cavium.com>
Date: Wed, 16 May 2018 14:44:38 +0300
Subject: qed: LL2 flush isles when connection is closed

Driver should free all pending isles once it gets a FLUSH cqe from FW.
Part of iSCSI out of order flow.

Fixes: 1d6cff4fca4366 ("qed: Add iSCSI out of order packet handling")
Signed-off-by: Ariel Elior <Ariel.Elior@cavium.com>
Signed-off-by: Michal Kalderon <Michal.Kalderon@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_ll2.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
index c3c1a99a0252..be63f0a2705d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@@ -591,6 +591,27 @@ static void qed_ll2_rxq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle)
 	}
 }
 
+static bool
+qed_ll2_lb_rxq_handler_slowpath(struct qed_hwfn *p_hwfn,
+				struct core_rx_slow_path_cqe *p_cqe)
+{
+	struct ooo_opaque *iscsi_ooo;
+	u32 cid;
+
+	if (p_cqe->ramrod_cmd_id != CORE_RAMROD_RX_QUEUE_FLUSH)
+		return false;
+
+	iscsi_ooo = (struct ooo_opaque *)&p_cqe->opaque_data;
+	if (iscsi_ooo->ooo_opcode != TCP_EVENT_DELETE_ISLES)
+		return false;
+
+	/* Need to make a flush */
+	cid = le32_to_cpu(iscsi_ooo->cid);
+	qed_ooo_release_connection_isles(p_hwfn, p_hwfn->p_ooo_info, cid);
+
+	return true;
+}
+
 static int qed_ll2_lb_rxq_handler(struct qed_hwfn *p_hwfn,
 				  struct qed_ll2_info *p_ll2_conn)
 {
@@ -617,6 +638,11 @@ static int qed_ll2_lb_rxq_handler(struct qed_hwfn *p_hwfn,
 		cq_old_idx = qed_chain_get_cons_idx(&p_rx->rcq_chain);
 		cqe_type = cqe->rx_cqe_sp.type;
 
+		if (cqe_type == CORE_RX_CQE_TYPE_SLOW_PATH)
+			if (qed_ll2_lb_rxq_handler_slowpath(p_hwfn,
+							    &cqe->rx_cqe_sp))
+				continue;
+
 		if (cqe_type != CORE_RX_CQE_TYPE_REGULAR) {
 			DP_NOTICE(p_hwfn,
 				  "Got a non-regular LB LL2 completion [type 0x%02x]\n",
-- 
cgit v1.2.3-59-g8ed1b


From 6291c608a908a9d62be650373bce7f734311d07c Mon Sep 17 00:00:00 2001
From: Michal Kalderon <Michal.Kalderon@cavium.com>
Date: Wed, 16 May 2018 14:44:39 +0300
Subject: qed: Fix possibility of list corruption during rmmod flows

The ll2 flows of flushing the txq/rxq need to be synchronized with the
regular fp processing. Caused list corruption during load/unload stress
tests.

Fixes: 0a7fb11c23c0f ("qed: Add Light L2 support")
Signed-off-by: Ariel Elior <Ariel.Elior@cavium.com>
Signed-off-by: Michal Kalderon <Michal.Kalderon@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_ll2.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
index be63f0a2705d..4a78294d6481 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@@ -292,6 +292,7 @@ static void qed_ll2_txq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle)
 	struct qed_ll2_tx_packet *p_pkt = NULL;
 	struct qed_ll2_info *p_ll2_conn;
 	struct qed_ll2_tx_queue *p_tx;
+	unsigned long flags = 0;
 	dma_addr_t tx_frag;
 
 	p_ll2_conn = qed_ll2_handle_sanity_inactive(p_hwfn, connection_handle);
@@ -300,6 +301,7 @@ static void qed_ll2_txq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle)
 
 	p_tx = &p_ll2_conn->tx_queue;
 
+	spin_lock_irqsave(&p_tx->lock, flags);
 	while (!list_empty(&p_tx->active_descq)) {
 		p_pkt = list_first_entry(&p_tx->active_descq,
 					 struct qed_ll2_tx_packet, list_entry);
@@ -309,6 +311,7 @@ static void qed_ll2_txq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle)
 		list_del(&p_pkt->list_entry);
 		b_last_packet = list_empty(&p_tx->active_descq);
 		list_add_tail(&p_pkt->list_entry, &p_tx->free_descq);
+		spin_unlock_irqrestore(&p_tx->lock, flags);
 		if (p_ll2_conn->input.conn_type == QED_LL2_TYPE_OOO) {
 			struct qed_ooo_buffer *p_buffer;
 
@@ -328,7 +331,9 @@ static void qed_ll2_txq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle)
 						      b_last_frag,
 						      b_last_packet);
 		}
+		spin_lock_irqsave(&p_tx->lock, flags);
 	}
+	spin_unlock_irqrestore(&p_tx->lock, flags);
 }
 
 static int qed_ll2_txq_completion(struct qed_hwfn *p_hwfn, void *p_cookie)
@@ -556,6 +561,7 @@ static void qed_ll2_rxq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle)
 	struct qed_ll2_info *p_ll2_conn = NULL;
 	struct qed_ll2_rx_packet *p_pkt = NULL;
 	struct qed_ll2_rx_queue *p_rx;
+	unsigned long flags = 0;
 
 	p_ll2_conn = qed_ll2_handle_sanity_inactive(p_hwfn, connection_handle);
 	if (!p_ll2_conn)
@@ -563,13 +569,14 @@ static void qed_ll2_rxq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle)
 
 	p_rx = &p_ll2_conn->rx_queue;
 
+	spin_lock_irqsave(&p_rx->lock, flags);
 	while (!list_empty(&p_rx->active_descq)) {
 		p_pkt = list_first_entry(&p_rx->active_descq,
 					 struct qed_ll2_rx_packet, list_entry);
 		if (!p_pkt)
 			break;
-
 		list_move_tail(&p_pkt->list_entry, &p_rx->free_descq);
+		spin_unlock_irqrestore(&p_rx->lock, flags);
 
 		if (p_ll2_conn->input.conn_type == QED_LL2_TYPE_OOO) {
 			struct qed_ooo_buffer *p_buffer;
@@ -588,7 +595,9 @@ static void qed_ll2_rxq_flush(struct qed_hwfn *p_hwfn, u8 connection_handle)
 						      cookie,
 						      rx_buf_addr, b_last);
 		}
+		spin_lock_irqsave(&p_rx->lock, flags);
 	}
+	spin_unlock_irqrestore(&p_rx->lock, flags);
 }
 
 static bool
-- 
cgit v1.2.3-59-g8ed1b


From fc16f56b7bfd5bfe812ba2cd3a4c8943977e8306 Mon Sep 17 00:00:00 2001
From: Michal Kalderon <Michal.Kalderon@cavium.com>
Date: Wed, 16 May 2018 14:44:40 +0300
Subject: qed: Fix LL2 race during connection terminate

Stress on qedi/qedr load unload lead to list_del corruption.
This is due to ll2 connection terminate freeing resources without
verifying that no more ll2 processing will occur.

This patch unregisters the ll2 status block before terminating
the connection to assure this race does not occur.

Fixes: 1d6cff4fca4366 ("qed: Add iSCSI out of order packet handling")
Signed-off-by: Ariel Elior <Ariel.Elior@cavium.com>
Signed-off-by: Michal Kalderon <Michal.Kalderon@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_ll2.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
index 4a78294d6481..c97ebd681c47 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@@ -829,6 +829,9 @@ static int qed_ll2_lb_rxq_completion(struct qed_hwfn *p_hwfn, void *p_cookie)
 	struct qed_ll2_info *p_ll2_conn = (struct qed_ll2_info *)p_cookie;
 	int rc;
 
+	if (!QED_LL2_RX_REGISTERED(p_ll2_conn))
+		return 0;
+
 	rc = qed_ll2_lb_rxq_handler(p_hwfn, p_ll2_conn);
 	if (rc)
 		return rc;
@@ -849,6 +852,9 @@ static int qed_ll2_lb_txq_completion(struct qed_hwfn *p_hwfn, void *p_cookie)
 	u16 new_idx = 0, num_bds = 0;
 	int rc;
 
+	if (!QED_LL2_TX_REGISTERED(p_ll2_conn))
+		return 0;
+
 	new_idx = le16_to_cpu(*p_tx->p_fw_cons);
 	num_bds = ((s16)new_idx - (s16)p_tx->bds_idx);
 
@@ -1915,17 +1921,25 @@ int qed_ll2_terminate_connection(void *cxt, u8 connection_handle)
 
 	/* Stop Tx & Rx of connection, if needed */
 	if (QED_LL2_TX_REGISTERED(p_ll2_conn)) {
+		p_ll2_conn->tx_queue.b_cb_registred = false;
+		smp_wmb(); /* Make sure this is seen by ll2_lb_rxq_completion */
 		rc = qed_sp_ll2_tx_queue_stop(p_hwfn, p_ll2_conn);
 		if (rc)
 			goto out;
+
 		qed_ll2_txq_flush(p_hwfn, connection_handle);
+		qed_int_unregister_cb(p_hwfn, p_ll2_conn->tx_queue.tx_sb_index);
 	}
 
 	if (QED_LL2_RX_REGISTERED(p_ll2_conn)) {
+		p_ll2_conn->rx_queue.b_cb_registred = false;
+		smp_wmb(); /* Make sure this is seen by ll2_lb_rxq_completion */
 		rc = qed_sp_ll2_rx_queue_stop(p_hwfn, p_ll2_conn);
 		if (rc)
 			goto out;
+
 		qed_ll2_rxq_flush(p_hwfn, connection_handle);
+		qed_int_unregister_cb(p_hwfn, p_ll2_conn->rx_queue.rx_sb_index);
 	}
 
 	if (p_ll2_conn->input.conn_type == QED_LL2_TYPE_OOO)
@@ -1974,16 +1988,6 @@ void qed_ll2_release_connection(void *cxt, u8 connection_handle)
 	if (!p_ll2_conn)
 		return;
 
-	if (QED_LL2_RX_REGISTERED(p_ll2_conn)) {
-		p_ll2_conn->rx_queue.b_cb_registred = false;
-		qed_int_unregister_cb(p_hwfn, p_ll2_conn->rx_queue.rx_sb_index);
-	}
-
-	if (QED_LL2_TX_REGISTERED(p_ll2_conn)) {
-		p_ll2_conn->tx_queue.b_cb_registred = false;
-		qed_int_unregister_cb(p_hwfn, p_ll2_conn->tx_queue.tx_sb_index);
-	}
-
 	kfree(p_ll2_conn->tx_queue.descq_mem);
 	qed_chain_free(p_hwfn->cdev, &p_ll2_conn->tx_queue.txq_chain);
 
-- 
cgit v1.2.3-59-g8ed1b


From 8e725f7caafb8e820e05707fe9853023375438cf Mon Sep 17 00:00:00 2001
From: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Date: Wed, 16 May 2018 19:51:15 +0530
Subject: cxgb4: update LE-TCAM collection for T6

For T6, clip table is separated from main TCAM. So, update LE-TCAM
collection logic to collect clip table TCAM as well. IPv6 takes
4 entries in clip table TCAM compared to 2 entries in main TCAM.

Also, in case of errors, keep LE-TCAM collected so far and set the
status to partial dump.

Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h |  3 ++
 drivers/net/ethernet/chelsio/cxgb4/cudbg_if.h     |  1 +
 drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c    | 40 ++++++++++++++++++-----
 drivers/net/ethernet/chelsio/cxgb4/t4_regs.h      |  1 +
 4 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h b/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h
index b57acb8dc35b..740a18ba4229 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h
@@ -235,6 +235,9 @@ struct cudbg_vpd_data {
 };
 
 #define CUDBG_MAX_TCAM_TID 0x800
+#define CUDBG_T6_CLIP 1536
+#define CUDBG_MAX_TID_COMP_EN 6144
+#define CUDBG_MAX_TID_COMP_DIS 3072
 
 enum cudbg_le_entry_types {
 	LE_ET_UNKNOWN = 0,
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_if.h b/drivers/net/ethernet/chelsio/cxgb4/cudbg_if.h
index 8568a51f6414..215fe6260fd7 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cudbg_if.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_if.h
@@ -24,6 +24,7 @@
 #define CUDBG_STATUS_NOT_IMPLEMENTED -28
 #define CUDBG_SYSTEM_ERROR -29
 #define CUDBG_STATUS_CCLK_NOT_DEFINED -32
+#define CUDBG_STATUS_PARTIAL_DATA -41
 
 #define CUDBG_MAJOR_VERSION 1
 #define CUDBG_MINOR_VERSION 14
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c
index 9da6f57901a9..4feb7eca0acf 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c
@@ -2366,8 +2366,11 @@ void cudbg_fill_le_tcam_info(struct adapter *padap,
 	value = t4_read_reg(padap, LE_DB_ROUTING_TABLE_INDEX_A);
 	tcam_region->routing_start = value;
 
-	/*Get clip table index */
-	value = t4_read_reg(padap, LE_DB_CLIP_TABLE_INDEX_A);
+	/* Get clip table index. For T6 there is separate CLIP TCAM */
+	if (is_t6(padap->params.chip))
+		value = t4_read_reg(padap, LE_DB_CLCAM_TID_BASE_A);
+	else
+		value = t4_read_reg(padap, LE_DB_CLIP_TABLE_INDEX_A);
 	tcam_region->clip_start = value;
 
 	/* Get filter table index */
@@ -2392,8 +2395,16 @@ void cudbg_fill_le_tcam_info(struct adapter *padap,
 					       tcam_region->tid_hash_base;
 		}
 	} else { /* hash not enabled */
-		tcam_region->max_tid = CUDBG_MAX_TCAM_TID;
+		if (is_t6(padap->params.chip))
+			tcam_region->max_tid = (value & ASLIPCOMPEN_F) ?
+					       CUDBG_MAX_TID_COMP_EN :
+					       CUDBG_MAX_TID_COMP_DIS;
+		else
+			tcam_region->max_tid = CUDBG_MAX_TCAM_TID;
 	}
+
+	if (is_t6(padap->params.chip))
+		tcam_region->max_tid += CUDBG_T6_CLIP;
 }
 
 int cudbg_collect_le_tcam(struct cudbg_init *pdbg_init,
@@ -2423,18 +2434,31 @@ int cudbg_collect_le_tcam(struct cudbg_init *pdbg_init,
 	for (i = 0; i < tcam_region.max_tid; ) {
 		rc = cudbg_read_tid(pdbg_init, i, tid_data);
 		if (rc) {
-			cudbg_err->sys_err = rc;
-			cudbg_put_buff(pdbg_init, &temp_buff);
-			return rc;
+			cudbg_err->sys_warn = CUDBG_STATUS_PARTIAL_DATA;
+			/* Update tcam header and exit */
+			tcam_region.max_tid = i;
+			memcpy(temp_buff.data, &tcam_region,
+			       sizeof(struct cudbg_tcam));
+			goto out;
 		}
 
-		/* ipv6 takes two tids */
-		cudbg_is_ipv6_entry(tid_data, tcam_region) ? i += 2 : i++;
+		if (cudbg_is_ipv6_entry(tid_data, tcam_region)) {
+			/* T6 CLIP TCAM: ipv6 takes 4 entries */
+			if (is_t6(padap->params.chip) &&
+			    i >= tcam_region.clip_start &&
+			    i < tcam_region.clip_start + CUDBG_T6_CLIP)
+				i += 4;
+			else /* Main TCAM: ipv6 takes two tids */
+				i += 2;
+		} else {
+			i++;
+		}
 
 		tid_data++;
 		bytes += sizeof(struct cudbg_tid_data);
 	}
 
+out:
 	return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff);
 }
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
index 843f8cada1de..6b55aa2eb2a5 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
@@ -2999,6 +2999,7 @@
 #define LE_DB_HASH_TID_BASE_A 0x19c30
 #define LE_DB_HASH_TBL_BASE_ADDR_A 0x19c30
 #define LE_DB_INT_CAUSE_A 0x19c3c
+#define LE_DB_CLCAM_TID_BASE_A 0x19df4
 #define LE_DB_TID_HASHBASE_A 0x19df8
 #define T6_LE_DB_HASH_TID_BASE_A 0x19df8
 
-- 
cgit v1.2.3-59-g8ed1b


From b8b394faa92418521829bbd4ae9cb22fb6c504ed Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 14 May 2018 10:00:18 -0700
Subject: bpf: selftest additions for SOCKHASH

This runs existing SOCKMAP tests with SOCKHASH map type. To do this
we push programs into include file and build two BPF programs. One
for SOCKHASH and one for SOCKMAP.

We then run the entire test suite with each type.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/include/uapi/linux/bpf.h                   |  54 +++-
 tools/testing/selftests/bpf/Makefile             |   2 +-
 tools/testing/selftests/bpf/bpf_helpers.h        |   8 +
 tools/testing/selftests/bpf/test_sockhash_kern.c |   5 +
 tools/testing/selftests/bpf/test_sockmap.c       |  27 +-
 tools/testing/selftests/bpf/test_sockmap_kern.c  | 343 +--------------------
 tools/testing/selftests/bpf/test_sockmap_kern.h  | 363 +++++++++++++++++++++++
 7 files changed, 453 insertions(+), 349 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_sockhash_kern.c
 create mode 100644 tools/testing/selftests/bpf/test_sockmap_kern.h

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 02e4112510f8..d94d333a8225 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -118,6 +118,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_SOCKMAP,
 	BPF_MAP_TYPE_CPUMAP,
 	BPF_MAP_TYPE_XSKMAP,
+	BPF_MAP_TYPE_SOCKHASH,
 };
 
 enum bpf_prog_type {
@@ -1828,7 +1829,6 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- *
  * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags)
  *	Description
  *		Do FIB lookup in kernel tables using parameters in *params*.
@@ -1855,6 +1855,53 @@ union bpf_attr {
  *             Egress device index on success, 0 if packet needs to continue
  *             up the stack for further processing or a negative error in case
  *             of failure.
+ *
+ * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
+ *	Description
+ *		Add an entry to, or update a sockhash *map* referencing sockets.
+ *		The *skops* is used as a new value for the entry associated to
+ *		*key*. *flags* is one of:
+ *
+ *		**BPF_NOEXIST**
+ *			The entry for *key* must not exist in the map.
+ *		**BPF_EXIST**
+ *			The entry for *key* must already exist in the map.
+ *		**BPF_ANY**
+ *			No condition on the existence of the entry for *key*.
+ *
+ *		If the *map* has eBPF programs (parser and verdict), those will
+ *		be inherited by the socket being added. If the socket is
+ *		already attached to eBPF programs, this results in an error.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags)
+ *	Description
+ *		This helper is used in programs implementing policies at the
+ *		socket level. If the message *msg* is allowed to pass (i.e. if
+ *		the verdict eBPF program returns **SK_PASS**), redirect it to
+ *		the socket referenced by *map* (of type
+ *		**BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
+ *		egress interfaces can be used for redirection. The
+ *		**BPF_F_INGRESS** value in *flags* is used to make the
+ *		distinction (ingress path is selected if the flag is present,
+ *		egress path otherwise). This is the only flag supported for now.
+ *	Return
+ *		**SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags)
+ *	Description
+ *		This helper is used in programs implementing policies at the
+ *		skb socket level. If the sk_buff *skb* is allowed to pass (i.e.
+ *		if the verdeict eBPF program returns **SK_PASS**), redirect it
+ *		to the socket referenced by *map* (of type
+ *		**BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
+ *		egress interfaces can be used for redirection. The
+ *		**BPF_F_INGRESS** value in *flags* is used to make the
+ *		distinction (ingress path is selected if the flag is present,
+ *		egress otherwise). This is the only flag supported for now.
+ *	Return
+ *		**SK_PASS** on success, or **SK_DROP** on error.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1926,7 +1973,10 @@ union bpf_attr {
 	FN(skb_get_xfrm_state),		\
 	FN(get_stack),			\
 	FN(skb_load_bytes_relative),	\
-	FN(fib_lookup),
+	FN(fib_lookup),			\
+	FN(sock_hash_update),		\
+	FN(msg_redirect_hash),		\
+	FN(sk_redirect_hash),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 133ebc68cbe4..1eb0fa2aba92 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -33,7 +33,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \
 	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \
 	test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \
-	test_get_stack_rawtp.o
+	test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 2375d06c706b..8f143dfb3700 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -75,9 +75,14 @@ static int (*bpf_sock_ops_cb_flags_set)(void *ctx, int flags) =
 	(void *) BPF_FUNC_sock_ops_cb_flags_set;
 static int (*bpf_sk_redirect_map)(void *ctx, void *map, int key, int flags) =
 	(void *) BPF_FUNC_sk_redirect_map;
+static int (*bpf_sk_redirect_hash)(void *ctx, void *map, void *key, int flags) =
+	(void *) BPF_FUNC_sk_redirect_hash;
 static int (*bpf_sock_map_update)(void *map, void *key, void *value,
 				  unsigned long long flags) =
 	(void *) BPF_FUNC_sock_map_update;
+static int (*bpf_sock_hash_update)(void *map, void *key, void *value,
+				   unsigned long long flags) =
+	(void *) BPF_FUNC_sock_hash_update;
 static int (*bpf_perf_event_read_value)(void *map, unsigned long long flags,
 					void *buf, unsigned int buf_size) =
 	(void *) BPF_FUNC_perf_event_read_value;
@@ -88,6 +93,9 @@ static int (*bpf_override_return)(void *ctx, unsigned long rc) =
 	(void *) BPF_FUNC_override_return;
 static int (*bpf_msg_redirect_map)(void *ctx, void *map, int key, int flags) =
 	(void *) BPF_FUNC_msg_redirect_map;
+static int (*bpf_msg_redirect_hash)(void *ctx,
+				    void *map, void *key, int flags) =
+	(void *) BPF_FUNC_msg_redirect_hash;
 static int (*bpf_msg_apply_bytes)(void *ctx, int len) =
 	(void *) BPF_FUNC_msg_apply_bytes;
 static int (*bpf_msg_cork_bytes)(void *ctx, int len) =
diff --git a/tools/testing/selftests/bpf/test_sockhash_kern.c b/tools/testing/selftests/bpf/test_sockhash_kern.c
new file mode 100644
index 000000000000..e6755916442a
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_sockhash_kern.c
@@ -0,0 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
+#undef SOCKMAP
+#define TEST_MAP_TYPE BPF_MAP_TYPE_SOCKHASH
+#include "./test_sockmap_kern.h"
diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 29c022d23f4e..eb17fae458e6 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -47,7 +47,8 @@ static void running_handler(int a);
 #define S1_PORT 10000
 #define S2_PORT 10001
 
-#define BPF_FILENAME "test_sockmap_kern.o"
+#define BPF_SOCKMAP_FILENAME "test_sockmap_kern.o"
+#define BPF_SOCKHASH_FILENAME "test_sockhash_kern.o"
 #define CG_PATH "/sockmap"
 
 /* global sockets */
@@ -1260,9 +1261,8 @@ int prog_type[] = {
 	BPF_PROG_TYPE_SK_MSG,
 };
 
-static int populate_progs(void)
+static int populate_progs(char *bpf_file)
 {
-	char *bpf_file = BPF_FILENAME;
 	struct bpf_program *prog;
 	struct bpf_object *obj;
 	int i = 0;
@@ -1306,11 +1306,11 @@ static int populate_progs(void)
 	return 0;
 }
 
-static int test_suite(void)
+static int __test_suite(char *bpf_file)
 {
 	int cg_fd, err;
 
-	err = populate_progs();
+	err = populate_progs(bpf_file);
 	if (err < 0) {
 		fprintf(stderr, "ERROR: (%i) load bpf failed\n", err);
 		return err;
@@ -1347,17 +1347,30 @@ static int test_suite(void)
 
 out:
 	printf("Summary: %i PASSED %i FAILED\n", passed, failed);
+	cleanup_cgroup_environment();
 	close(cg_fd);
 	return err;
 }
 
+static int test_suite(void)
+{
+	int err;
+
+	err = __test_suite(BPF_SOCKMAP_FILENAME);
+	if (err)
+		goto out;
+	err = __test_suite(BPF_SOCKHASH_FILENAME);
+out:
+	return err;
+}
+
 int main(int argc, char **argv)
 {
 	struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
 	int iov_count = 1, length = 1024, rate = 1;
 	struct sockmap_options options = {0};
 	int opt, longindex, err, cg_fd = 0;
-	char *bpf_file = BPF_FILENAME;
+	char *bpf_file = BPF_SOCKMAP_FILENAME;
 	int test = PING_PONG;
 
 	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
@@ -1438,7 +1451,7 @@ int main(int argc, char **argv)
 		return -1;
 	}
 
-	err = populate_progs();
+	err = populate_progs(bpf_file);
 	if (err) {
 		fprintf(stderr, "populate program: (%s) %s\n",
 			bpf_file, strerror(errno));
diff --git a/tools/testing/selftests/bpf/test_sockmap_kern.c b/tools/testing/selftests/bpf/test_sockmap_kern.c
index 33de97e2b6b6..677b2ed1cc1e 100644
--- a/tools/testing/selftests/bpf/test_sockmap_kern.c
+++ b/tools/testing/selftests/bpf/test_sockmap_kern.c
@@ -1,340 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-// Copyright (c) 2017-2018 Covalent IO, Inc. http://covalent.io
-#include <stddef.h>
-#include <string.h>
-#include <linux/bpf.h>
-#include <linux/if_ether.h>
-#include <linux/if_packet.h>
-#include <linux/ip.h>
-#include <linux/ipv6.h>
-#include <linux/in.h>
-#include <linux/udp.h>
-#include <linux/tcp.h>
-#include <linux/pkt_cls.h>
-#include <sys/socket.h>
-#include "bpf_helpers.h"
-#include "bpf_endian.h"
-
-/* Sockmap sample program connects a client and a backend together
- * using cgroups.
- *
- *    client:X <---> frontend:80 client:X <---> backend:80
- *
- * For simplicity we hard code values here and bind 1:1. The hard
- * coded values are part of the setup in sockmap.sh script that
- * is associated with this BPF program.
- *
- * The bpf_printk is verbose and prints information as connections
- * are established and verdicts are decided.
- */
-
-#define bpf_printk(fmt, ...)					\
-({								\
-	       char ____fmt[] = fmt;				\
-	       bpf_trace_printk(____fmt, sizeof(____fmt),	\
-				##__VA_ARGS__);			\
-})
-
-struct bpf_map_def SEC("maps") sock_map = {
-	.type = BPF_MAP_TYPE_SOCKMAP,
-	.key_size = sizeof(int),
-	.value_size = sizeof(int),
-	.max_entries = 20,
-};
-
-struct bpf_map_def SEC("maps") sock_map_txmsg = {
-	.type = BPF_MAP_TYPE_SOCKMAP,
-	.key_size = sizeof(int),
-	.value_size = sizeof(int),
-	.max_entries = 20,
-};
-
-struct bpf_map_def SEC("maps") sock_map_redir = {
-	.type = BPF_MAP_TYPE_SOCKMAP,
-	.key_size = sizeof(int),
-	.value_size = sizeof(int),
-	.max_entries = 20,
-};
-
-struct bpf_map_def SEC("maps") sock_apply_bytes = {
-	.type = BPF_MAP_TYPE_ARRAY,
-	.key_size = sizeof(int),
-	.value_size = sizeof(int),
-	.max_entries = 1
-};
-
-struct bpf_map_def SEC("maps") sock_cork_bytes = {
-	.type = BPF_MAP_TYPE_ARRAY,
-	.key_size = sizeof(int),
-	.value_size = sizeof(int),
-	.max_entries = 1
-};
-
-struct bpf_map_def SEC("maps") sock_pull_bytes = {
-	.type = BPF_MAP_TYPE_ARRAY,
-	.key_size = sizeof(int),
-	.value_size = sizeof(int),
-	.max_entries = 2
-};
-
-struct bpf_map_def SEC("maps") sock_redir_flags = {
-	.type = BPF_MAP_TYPE_ARRAY,
-	.key_size = sizeof(int),
-	.value_size = sizeof(int),
-	.max_entries = 1
-};
-
-struct bpf_map_def SEC("maps") sock_skb_opts = {
-	.type = BPF_MAP_TYPE_ARRAY,
-	.key_size = sizeof(int),
-	.value_size = sizeof(int),
-	.max_entries = 1
-};
-
-SEC("sk_skb1")
-int bpf_prog1(struct __sk_buff *skb)
-{
-	return skb->len;
-}
-
-SEC("sk_skb2")
-int bpf_prog2(struct __sk_buff *skb)
-{
-	__u32 lport = skb->local_port;
-	__u32 rport = skb->remote_port;
-	int len, *f, ret, zero = 0;
-	__u64 flags = 0;
-
-	if (lport == 10000)
-		ret = 10;
-	else
-		ret = 1;
-
-	len = (__u32)skb->data_end - (__u32)skb->data;
-	f = bpf_map_lookup_elem(&sock_skb_opts, &zero);
-	if (f && *f) {
-		ret = 3;
-		flags = *f;
-	}
-
-	bpf_printk("sk_skb2: redirect(%iB) flags=%i\n",
-		   len, flags);
-	return bpf_sk_redirect_map(skb, &sock_map, ret, flags);
-}
-
-SEC("sockops")
-int bpf_sockmap(struct bpf_sock_ops *skops)
-{
-	__u32 lport, rport;
-	int op, err = 0, index, key, ret;
-
-
-	op = (int) skops->op;
-
-	switch (op) {
-	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
-		lport = skops->local_port;
-		rport = skops->remote_port;
-
-		if (lport == 10000) {
-			ret = 1;
-			err = bpf_sock_map_update(skops, &sock_map, &ret,
-						  BPF_NOEXIST);
-			bpf_printk("passive(%i -> %i) map ctx update err: %d\n",
-				   lport, bpf_ntohl(rport), err);
-		}
-		break;
-	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
-		lport = skops->local_port;
-		rport = skops->remote_port;
-
-		if (bpf_ntohl(rport) == 10001) {
-			ret = 10;
-			err = bpf_sock_map_update(skops, &sock_map, &ret,
-						  BPF_NOEXIST);
-			bpf_printk("active(%i -> %i) map ctx update err: %d\n",
-				   lport, bpf_ntohl(rport), err);
-		}
-		break;
-	default:
-		break;
-	}
-
-	return 0;
-}
-
-SEC("sk_msg1")
-int bpf_prog4(struct sk_msg_md *msg)
-{
-	int *bytes, zero = 0, one = 1;
-	int *start, *end;
-
-	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
-	if (bytes)
-		bpf_msg_apply_bytes(msg, *bytes);
-	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
-	if (bytes)
-		bpf_msg_cork_bytes(msg, *bytes);
-	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
-	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
-	if (start && end)
-		bpf_msg_pull_data(msg, *start, *end, 0);
-	return SK_PASS;
-}
-
-SEC("sk_msg2")
-int bpf_prog5(struct sk_msg_md *msg)
-{
-	int err1 = -1, err2 = -1, zero = 0, one = 1;
-	int *bytes, *start, *end, len1, len2;
-
-	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
-	if (bytes)
-		err1 = bpf_msg_apply_bytes(msg, *bytes);
-	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
-	if (bytes)
-		err2 = bpf_msg_cork_bytes(msg, *bytes);
-	len1 = (__u64)msg->data_end - (__u64)msg->data;
-	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
-	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
-	if (start && end) {
-		int err;
-
-		bpf_printk("sk_msg2: pull(%i:%i)\n",
-			   start ? *start : 0, end ? *end : 0);
-		err = bpf_msg_pull_data(msg, *start, *end, 0);
-		if (err)
-			bpf_printk("sk_msg2: pull_data err %i\n",
-				   err);
-		len2 = (__u64)msg->data_end - (__u64)msg->data;
-		bpf_printk("sk_msg2: length update %i->%i\n",
-			   len1, len2);
-	}
-	bpf_printk("sk_msg2: data length %i err1 %i err2 %i\n",
-		   len1, err1, err2);
-	return SK_PASS;
-}
-
-SEC("sk_msg3")
-int bpf_prog6(struct sk_msg_md *msg)
-{
-	int *bytes, zero = 0, one = 1, key = 0;
-	int *start, *end, *f;
-	__u64 flags = 0;
-
-	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
-	if (bytes)
-		bpf_msg_apply_bytes(msg, *bytes);
-	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
-	if (bytes)
-		bpf_msg_cork_bytes(msg, *bytes);
-	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
-	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
-	if (start && end)
-		bpf_msg_pull_data(msg, *start, *end, 0);
-	f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
-	if (f && *f) {
-		key = 2;
-		flags = *f;
-	}
-	return bpf_msg_redirect_map(msg, &sock_map_redir, key, flags);
-}
-
-SEC("sk_msg4")
-int bpf_prog7(struct sk_msg_md *msg)
-{
-	int err1 = 0, err2 = 0, zero = 0, one = 1, key = 0;
-	int *f, *bytes, *start, *end, len1, len2;
-	__u64 flags = 0;
-
-		int err;
-	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
-	if (bytes)
-		err1 = bpf_msg_apply_bytes(msg, *bytes);
-	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
-	if (bytes)
-		err2 = bpf_msg_cork_bytes(msg, *bytes);
-	len1 = (__u64)msg->data_end - (__u64)msg->data;
-	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
-	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
-	if (start && end) {
-
-		bpf_printk("sk_msg2: pull(%i:%i)\n",
-			   start ? *start : 0, end ? *end : 0);
-		err = bpf_msg_pull_data(msg, *start, *end, 0);
-		if (err)
-			bpf_printk("sk_msg2: pull_data err %i\n",
-				   err);
-		len2 = (__u64)msg->data_end - (__u64)msg->data;
-		bpf_printk("sk_msg2: length update %i->%i\n",
-			   len1, len2);
-	}
-	f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
-	if (f && *f) {
-		key = 2;
-		flags = *f;
-	}
-	bpf_printk("sk_msg3: redirect(%iB) flags=%i err=%i\n",
-		   len1, flags, err1 ? err1 : err2);
-	err = bpf_msg_redirect_map(msg, &sock_map_redir, key, flags);
-	bpf_printk("sk_msg3: err %i\n", err);
-	return err;
-}
-
-SEC("sk_msg5")
-int bpf_prog8(struct sk_msg_md *msg)
-{
-	void *data_end = (void *)(long) msg->data_end;
-	void *data = (void *)(long) msg->data;
-	int ret = 0, *bytes, zero = 0;
-
-	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
-	if (bytes) {
-		ret = bpf_msg_apply_bytes(msg, *bytes);
-		if (ret)
-			return SK_DROP;
-	} else {
-		return SK_DROP;
-	}
-	return SK_PASS;
-}
-SEC("sk_msg6")
-int bpf_prog9(struct sk_msg_md *msg)
-{
-	void *data_end = (void *)(long) msg->data_end;
-	void *data = (void *)(long) msg->data;
-	int ret = 0, *bytes, zero = 0;
-
-	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
-	if (bytes) {
-		if (((__u64)data_end - (__u64)data) >= *bytes)
-			return SK_PASS;
-		ret = bpf_msg_cork_bytes(msg, *bytes);
-		if (ret)
-			return SK_DROP;
-	}
-	return SK_PASS;
-}
-
-SEC("sk_msg7")
-int bpf_prog10(struct sk_msg_md *msg)
-{
-	int *bytes, zero = 0, one = 1;
-	int *start, *end;
-
-	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
-	if (bytes)
-		bpf_msg_apply_bytes(msg, *bytes);
-	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
-	if (bytes)
-		bpf_msg_cork_bytes(msg, *bytes);
-	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
-	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
-	if (start && end)
-		bpf_msg_pull_data(msg, *start, *end, 0);
-
-	return SK_DROP;
-}
-
-int _version SEC("version") = 1;
-char _license[] SEC("license") = "GPL";
+// Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
+#define SOCKMAP
+#define TEST_MAP_TYPE BPF_MAP_TYPE_SOCKMAP
+#include "./test_sockmap_kern.h"
diff --git a/tools/testing/selftests/bpf/test_sockmap_kern.h b/tools/testing/selftests/bpf/test_sockmap_kern.h
new file mode 100644
index 000000000000..8e8e41780bb9
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_sockmap_kern.h
@@ -0,0 +1,363 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2017-2018 Covalent IO, Inc. http://covalent.io */
+#include <stddef.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/pkt_cls.h>
+#include <sys/socket.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+/* Sockmap sample program connects a client and a backend together
+ * using cgroups.
+ *
+ *    client:X <---> frontend:80 client:X <---> backend:80
+ *
+ * For simplicity we hard code values here and bind 1:1. The hard
+ * coded values are part of the setup in sockmap.sh script that
+ * is associated with this BPF program.
+ *
+ * The bpf_printk is verbose and prints information as connections
+ * are established and verdicts are decided.
+ */
+
+#define bpf_printk(fmt, ...)					\
+({								\
+	       char ____fmt[] = fmt;				\
+	       bpf_trace_printk(____fmt, sizeof(____fmt),	\
+				##__VA_ARGS__);			\
+})
+
+struct bpf_map_def SEC("maps") sock_map = {
+	.type = TEST_MAP_TYPE,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 20,
+};
+
+struct bpf_map_def SEC("maps") sock_map_txmsg = {
+	.type = TEST_MAP_TYPE,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 20,
+};
+
+struct bpf_map_def SEC("maps") sock_map_redir = {
+	.type = TEST_MAP_TYPE,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 20,
+};
+
+struct bpf_map_def SEC("maps") sock_apply_bytes = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 1
+};
+
+struct bpf_map_def SEC("maps") sock_cork_bytes = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 1
+};
+
+struct bpf_map_def SEC("maps") sock_pull_bytes = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 2
+};
+
+struct bpf_map_def SEC("maps") sock_redir_flags = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 1
+};
+
+struct bpf_map_def SEC("maps") sock_skb_opts = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 1
+};
+
+SEC("sk_skb1")
+int bpf_prog1(struct __sk_buff *skb)
+{
+	return skb->len;
+}
+
+SEC("sk_skb2")
+int bpf_prog2(struct __sk_buff *skb)
+{
+	__u32 lport = skb->local_port;
+	__u32 rport = skb->remote_port;
+	int len, *f, ret, zero = 0;
+	__u64 flags = 0;
+
+	if (lport == 10000)
+		ret = 10;
+	else
+		ret = 1;
+
+	len = (__u32)skb->data_end - (__u32)skb->data;
+	f = bpf_map_lookup_elem(&sock_skb_opts, &zero);
+	if (f && *f) {
+		ret = 3;
+		flags = *f;
+	}
+
+	bpf_printk("sk_skb2: redirect(%iB) flags=%i\n",
+		   len, flags);
+#ifdef SOCKMAP
+	return bpf_sk_redirect_map(skb, &sock_map, ret, flags);
+#else
+	return bpf_sk_redirect_hash(skb, &sock_map, &ret, flags);
+#endif
+
+}
+
+SEC("sockops")
+int bpf_sockmap(struct bpf_sock_ops *skops)
+{
+	__u32 lport, rport;
+	int op, err = 0, index, key, ret;
+
+
+	op = (int) skops->op;
+
+	switch (op) {
+	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+		lport = skops->local_port;
+		rport = skops->remote_port;
+
+		if (lport == 10000) {
+			ret = 1;
+#ifdef SOCKMAP
+			err = bpf_sock_map_update(skops, &sock_map, &ret,
+						  BPF_NOEXIST);
+#else
+			err = bpf_sock_hash_update(skops, &sock_map, &ret,
+						   BPF_NOEXIST);
+#endif
+			bpf_printk("passive(%i -> %i) map ctx update err: %d\n",
+				   lport, bpf_ntohl(rport), err);
+		}
+		break;
+	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+		lport = skops->local_port;
+		rport = skops->remote_port;
+
+		if (bpf_ntohl(rport) == 10001) {
+			ret = 10;
+#ifdef SOCKMAP
+			err = bpf_sock_map_update(skops, &sock_map, &ret,
+						  BPF_NOEXIST);
+#else
+			err = bpf_sock_hash_update(skops, &sock_map, &ret,
+						   BPF_NOEXIST);
+#endif
+			bpf_printk("active(%i -> %i) map ctx update err: %d\n",
+				   lport, bpf_ntohl(rport), err);
+		}
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+SEC("sk_msg1")
+int bpf_prog4(struct sk_msg_md *msg)
+{
+	int *bytes, zero = 0, one = 1;
+	int *start, *end;
+
+	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+	if (bytes)
+		bpf_msg_apply_bytes(msg, *bytes);
+	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+	if (bytes)
+		bpf_msg_cork_bytes(msg, *bytes);
+	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
+	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
+	if (start && end)
+		bpf_msg_pull_data(msg, *start, *end, 0);
+	return SK_PASS;
+}
+
+SEC("sk_msg2")
+int bpf_prog5(struct sk_msg_md *msg)
+{
+	int err1 = -1, err2 = -1, zero = 0, one = 1;
+	int *bytes, *start, *end, len1, len2;
+
+	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+	if (bytes)
+		err1 = bpf_msg_apply_bytes(msg, *bytes);
+	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+	if (bytes)
+		err2 = bpf_msg_cork_bytes(msg, *bytes);
+	len1 = (__u64)msg->data_end - (__u64)msg->data;
+	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
+	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
+	if (start && end) {
+		int err;
+
+		bpf_printk("sk_msg2: pull(%i:%i)\n",
+			   start ? *start : 0, end ? *end : 0);
+		err = bpf_msg_pull_data(msg, *start, *end, 0);
+		if (err)
+			bpf_printk("sk_msg2: pull_data err %i\n",
+				   err);
+		len2 = (__u64)msg->data_end - (__u64)msg->data;
+		bpf_printk("sk_msg2: length update %i->%i\n",
+			   len1, len2);
+	}
+	bpf_printk("sk_msg2: data length %i err1 %i err2 %i\n",
+		   len1, err1, err2);
+	return SK_PASS;
+}
+
+SEC("sk_msg3")
+int bpf_prog6(struct sk_msg_md *msg)
+{
+	int *bytes, zero = 0, one = 1, key = 0;
+	int *start, *end, *f;
+	__u64 flags = 0;
+
+	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+	if (bytes)
+		bpf_msg_apply_bytes(msg, *bytes);
+	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+	if (bytes)
+		bpf_msg_cork_bytes(msg, *bytes);
+	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
+	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
+	if (start && end)
+		bpf_msg_pull_data(msg, *start, *end, 0);
+	f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
+	if (f && *f) {
+		key = 2;
+		flags = *f;
+	}
+#ifdef SOCKMAP
+	return bpf_msg_redirect_map(msg, &sock_map_redir, key, flags);
+#else
+	return bpf_msg_redirect_hash(msg, &sock_map_redir, &key, flags);
+#endif
+}
+
+SEC("sk_msg4")
+int bpf_prog7(struct sk_msg_md *msg)
+{
+	int err1 = 0, err2 = 0, zero = 0, one = 1, key = 0;
+	int *f, *bytes, *start, *end, len1, len2;
+	__u64 flags = 0;
+
+		int err;
+	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+	if (bytes)
+		err1 = bpf_msg_apply_bytes(msg, *bytes);
+	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+	if (bytes)
+		err2 = bpf_msg_cork_bytes(msg, *bytes);
+	len1 = (__u64)msg->data_end - (__u64)msg->data;
+	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
+	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
+	if (start && end) {
+
+		bpf_printk("sk_msg2: pull(%i:%i)\n",
+			   start ? *start : 0, end ? *end : 0);
+		err = bpf_msg_pull_data(msg, *start, *end, 0);
+		if (err)
+			bpf_printk("sk_msg2: pull_data err %i\n",
+				   err);
+		len2 = (__u64)msg->data_end - (__u64)msg->data;
+		bpf_printk("sk_msg2: length update %i->%i\n",
+			   len1, len2);
+	}
+	f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
+	if (f && *f) {
+		key = 2;
+		flags = *f;
+	}
+	bpf_printk("sk_msg3: redirect(%iB) flags=%i err=%i\n",
+		   len1, flags, err1 ? err1 : err2);
+#ifdef SOCKMAP
+	err = bpf_msg_redirect_map(msg, &sock_map_redir, key, flags);
+#else
+	err = bpf_msg_redirect_hash(msg, &sock_map_redir, &key, flags);
+#endif
+	bpf_printk("sk_msg3: err %i\n", err);
+	return err;
+}
+
+SEC("sk_msg5")
+int bpf_prog8(struct sk_msg_md *msg)
+{
+	void *data_end = (void *)(long) msg->data_end;
+	void *data = (void *)(long) msg->data;
+	int ret = 0, *bytes, zero = 0;
+
+	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+	if (bytes) {
+		ret = bpf_msg_apply_bytes(msg, *bytes);
+		if (ret)
+			return SK_DROP;
+	} else {
+		return SK_DROP;
+	}
+	return SK_PASS;
+}
+SEC("sk_msg6")
+int bpf_prog9(struct sk_msg_md *msg)
+{
+	void *data_end = (void *)(long) msg->data_end;
+	void *data = (void *)(long) msg->data;
+	int ret = 0, *bytes, zero = 0;
+
+	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+	if (bytes) {
+		if (((__u64)data_end - (__u64)data) >= *bytes)
+			return SK_PASS;
+		ret = bpf_msg_cork_bytes(msg, *bytes);
+		if (ret)
+			return SK_DROP;
+	}
+	return SK_PASS;
+}
+
+SEC("sk_msg7")
+int bpf_prog10(struct sk_msg_md *msg)
+{
+	int *bytes, zero = 0, one = 1;
+	int *start, *end;
+
+	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+	if (bytes)
+		bpf_msg_apply_bytes(msg, *bytes);
+	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+	if (bytes)
+		bpf_msg_cork_bytes(msg, *bytes);
+	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
+	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
+	if (start && end)
+		bpf_msg_pull_data(msg, *start, *end, 0);
+
+	return SK_DROP;
+}
+
+int _version SEC("version") = 1;
+char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3-59-g8ed1b


From 62c52d1fddb5ef201e3a25d7bd1b79fcb0ca42b8 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 14 May 2018 10:00:19 -0700
Subject: bpf: bpftool, support for sockhash

This adds the SOCKHASH map type to bpftools so that we get correct
pretty printing.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/map.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index af6766e956ba..097b1a5e046b 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -66,6 +66,7 @@ static const char * const map_type_name[] = {
 	[BPF_MAP_TYPE_DEVMAP]		= "devmap",
 	[BPF_MAP_TYPE_SOCKMAP]		= "sockmap",
 	[BPF_MAP_TYPE_CPUMAP]		= "cpumap",
+	[BPF_MAP_TYPE_SOCKHASH]		= "sockhash",
 };
 
 static bool map_is_per_cpu(__u32 type)
-- 
cgit v1.2.3-59-g8ed1b


From 44edef77bd92730e1520b07f5ae2c9f4628738a8 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 15 May 2018 16:20:52 -0700
Subject: samples/bpf: Decrement ttl in fib forwarding example

Only consider forwarding packets if ttl in received packet is > 1 and
decrement ttl before handing off to bpf_redirect_map.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/bpf/xdp_fwd_kern.c | 39 +++++++++++++++++++++++++++++++--------
 1 file changed, 31 insertions(+), 8 deletions(-)

diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c
index cdf4fc383cc9..4a6be0f87505 100644
--- a/samples/bpf/xdp_fwd_kern.c
+++ b/samples/bpf/xdp_fwd_kern.c
@@ -30,12 +30,24 @@ struct bpf_map_def SEC("maps") tx_port = {
 	.max_entries = 64,
 };
 
+/* from include/net/ip.h */
+static __always_inline int ip_decrease_ttl(struct iphdr *iph)
+{
+	u32 check = (__force u32)iph->check;
+
+	check += (__force u32)htons(0x0100);
+	iph->check = (__force __sum16)(check + (check >= 0xFFFF));
+	return --iph->ttl;
+}
+
 static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
 {
 	void *data_end = (void *)(long)ctx->data_end;
 	void *data = (void *)(long)ctx->data;
 	struct bpf_fib_lookup fib_params;
 	struct ethhdr *eth = data;
+	struct ipv6hdr *ip6h;
+	struct iphdr *iph;
 	int out_index;
 	u16 h_proto;
 	u64 nh_off;
@@ -48,11 +60,14 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
 
 	h_proto = eth->h_proto;
 	if (h_proto == htons(ETH_P_IP)) {
-		struct iphdr *iph = data + nh_off;
+		iph = data + nh_off;
 
 		if (iph + 1 > data_end)
 			return XDP_DROP;
 
+		if (iph->ttl <= 1)
+			return XDP_PASS;
+
 		fib_params.family	= AF_INET;
 		fib_params.tos		= iph->tos;
 		fib_params.l4_protocol	= iph->protocol;
@@ -64,19 +79,22 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
 	} else if (h_proto == htons(ETH_P_IPV6)) {
 		struct in6_addr *src = (struct in6_addr *) fib_params.ipv6_src;
 		struct in6_addr *dst = (struct in6_addr *) fib_params.ipv6_dst;
-		struct ipv6hdr *iph = data + nh_off;
 
-		if (iph + 1 > data_end)
+		ip6h = data + nh_off;
+		if (ip6h + 1 > data_end)
 			return XDP_DROP;
 
+		if (ip6h->hop_limit <= 1)
+			return XDP_PASS;
+
 		fib_params.family	= AF_INET6;
-		fib_params.flowlabel	= *(__be32 *)iph & IPV6_FLOWINFO_MASK;
-		fib_params.l4_protocol	= iph->nexthdr;
+		fib_params.flowlabel	= *(__be32 *)ip6h & IPV6_FLOWINFO_MASK;
+		fib_params.l4_protocol	= ip6h->nexthdr;
 		fib_params.sport	= 0;
 		fib_params.dport	= 0;
-		fib_params.tot_len	= ntohs(iph->payload_len);
-		*src			= iph->saddr;
-		*dst			= iph->daddr;
+		fib_params.tot_len	= ntohs(ip6h->payload_len);
+		*src			= ip6h->saddr;
+		*dst			= ip6h->daddr;
 	} else {
 		return XDP_PASS;
 	}
@@ -92,6 +110,11 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
 	 *       forwarding packets are dropped.
 	 */
 	if (out_index > 0) {
+		if (h_proto == htons(ETH_P_IP))
+			ip_decrease_ttl(iph);
+		else if (h_proto == htons(ETH_P_IPV6))
+			ip6h->hop_limit--;
+
 		memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN);
 		memcpy(eth->h_source, fib_params.smac, ETH_ALEN);
 		return bpf_redirect_map(&tx_port, out_index, 0);
-- 
cgit v1.2.3-59-g8ed1b


From be2d04d11fd33bd46622f94619aae1596d9f9303 Mon Sep 17 00:00:00 2001
From: Mathieu Malaterre <malat@debian.org>
Date: Wed, 16 May 2018 22:27:41 +0200
Subject: bpf: add __printf verification to bpf_verifier_vlog
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

__printf is useful to verify format and arguments. ‘bpf_verifier_vlog’
function is used twice in verifier.c in both cases the caller function
already uses the __printf gcc attribute.

Remove the following warning, triggered with W=1:

  kernel/bpf/verifier.c:176:2: warning: function might be possible candidate for ‘gnu_printf’ format attribute [-Wsuggest-attribute=format]

Signed-off-by: Mathieu Malaterre <malat@debian.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_verifier.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 8f70dc181e23..c286813deaeb 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -200,8 +200,8 @@ struct bpf_verifier_env {
 	u32 subprog_cnt;
 };
 
-void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
-		       va_list args);
+__printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log,
+				      const char *fmt, va_list args);
 __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
 					   const char *fmt, ...);
 
-- 
cgit v1.2.3-59-g8ed1b


From f0307a7ed17fa8925321a4f58f5ca56eeedd4fa3 Mon Sep 17 00:00:00 2001
From: David Beckett <david.beckett@netronome.com>
Date: Wed, 16 May 2018 14:02:49 -0700
Subject: libbpf: add ifindex to enable offload support

BPF programs currently can only be offloaded using iproute2. This
patch will allow programs to be offloaded using libbpf calls.

Signed-off-by: David Beckett <david.beckett@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/bpf.c    |  2 ++
 tools/lib/bpf/bpf.h    |  2 ++
 tools/lib/bpf/libbpf.c | 18 +++++++++++++++---
 tools/lib/bpf/libbpf.h |  1 +
 4 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index a3a8fb2ac697..6a8a00097fd8 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -91,6 +91,7 @@ int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr)
 	attr.btf_fd = create_attr->btf_fd;
 	attr.btf_key_id = create_attr->btf_key_id;
 	attr.btf_value_id = create_attr->btf_value_id;
+	attr.map_ifindex = create_attr->map_ifindex;
 
 	return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
 }
@@ -201,6 +202,7 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
 	attr.log_size = 0;
 	attr.log_level = 0;
 	attr.kern_version = load_attr->kern_version;
+	attr.prog_ifindex = load_attr->prog_ifindex;
 	memcpy(attr.prog_name, load_attr->name,
 	       min(name_len, BPF_OBJ_NAME_LEN - 1));
 
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index fb3a146d92ff..15bff7728cf1 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -38,6 +38,7 @@ struct bpf_create_map_attr {
 	__u32 btf_fd;
 	__u32 btf_key_id;
 	__u32 btf_value_id;
+	__u32 map_ifindex;
 };
 
 int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr);
@@ -64,6 +65,7 @@ struct bpf_load_program_attr {
 	size_t insns_cnt;
 	const char *license;
 	__u32 kern_version;
+	__u32 prog_ifindex;
 };
 
 /* Recommend log buffer size */
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index df54c4c9e48a..3dbe217bf23e 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -178,6 +178,7 @@ struct bpf_program {
 	/* Index in elf obj file, for relocation use. */
 	int idx;
 	char *name;
+	int prog_ifindex;
 	char *section_name;
 	struct bpf_insn *insns;
 	size_t insns_cnt, main_prog_cnt;
@@ -213,6 +214,7 @@ struct bpf_map {
 	int fd;
 	char *name;
 	size_t offset;
+	int map_ifindex;
 	struct bpf_map_def def;
 	uint32_t btf_key_id;
 	uint32_t btf_value_id;
@@ -1091,6 +1093,7 @@ bpf_object__create_maps(struct bpf_object *obj)
 		int *pfd = &map->fd;
 
 		create_attr.name = map->name;
+		create_attr.map_ifindex = map->map_ifindex;
 		create_attr.map_type = def->type;
 		create_attr.map_flags = def->map_flags;
 		create_attr.key_size = def->key_size;
@@ -1273,7 +1276,7 @@ static int bpf_object__collect_reloc(struct bpf_object *obj)
 static int
 load_program(enum bpf_prog_type type, enum bpf_attach_type expected_attach_type,
 	     const char *name, struct bpf_insn *insns, int insns_cnt,
-	     char *license, u32 kern_version, int *pfd)
+	     char *license, u32 kern_version, int *pfd, int prog_ifindex)
 {
 	struct bpf_load_program_attr load_attr;
 	char *log_buf;
@@ -1287,6 +1290,7 @@ load_program(enum bpf_prog_type type, enum bpf_attach_type expected_attach_type,
 	load_attr.insns_cnt = insns_cnt;
 	load_attr.license = license;
 	load_attr.kern_version = kern_version;
+	load_attr.prog_ifindex = prog_ifindex;
 
 	if (!load_attr.insns || !load_attr.insns_cnt)
 		return -EINVAL;
@@ -1368,7 +1372,8 @@ bpf_program__load(struct bpf_program *prog,
 		}
 		err = load_program(prog->type, prog->expected_attach_type,
 				   prog->name, prog->insns, prog->insns_cnt,
-				   license, kern_version, &fd);
+				   license, kern_version, &fd,
+				   prog->prog_ifindex);
 		if (!err)
 			prog->instances.fds[0] = fd;
 		goto out;
@@ -1399,7 +1404,8 @@ bpf_program__load(struct bpf_program *prog,
 		err = load_program(prog->type, prog->expected_attach_type,
 				   prog->name, result.new_insn_ptr,
 				   result.new_insn_cnt,
-				   license, kern_version, &fd);
+				   license, kern_version, &fd,
+				   prog->prog_ifindex);
 
 		if (err) {
 			pr_warning("Loading the %dth instance of program '%s' failed\n",
@@ -2188,6 +2194,7 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
 	enum bpf_attach_type expected_attach_type;
 	enum bpf_prog_type prog_type;
 	struct bpf_object *obj;
+	struct bpf_map *map;
 	int section_idx;
 	int err;
 
@@ -2207,6 +2214,7 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
 		 * section name.
 		 */
 		prog_type = attr->prog_type;
+		prog->prog_ifindex = attr->ifindex;
 		expected_attach_type = attr->expected_attach_type;
 		if (prog_type == BPF_PROG_TYPE_UNSPEC) {
 			section_idx = bpf_program__identify_section(prog);
@@ -2227,6 +2235,10 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
 			first_prog = prog;
 	}
 
+	bpf_map__for_each(map, obj) {
+		map->map_ifindex = attr->ifindex;
+	}
+
 	if (!first_prog) {
 		pr_warning("object file doesn't contain bpf program\n");
 		bpf_object__close(obj);
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 4574b9563278..cd3fd8d782c7 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -259,6 +259,7 @@ struct bpf_prog_load_attr {
 	const char *file;
 	enum bpf_prog_type prog_type;
 	enum bpf_attach_type expected_attach_type;
+	int ifindex;
 };
 
 int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
-- 
cgit v1.2.3-59-g8ed1b


From 683d2ac3904c74a625ea62d7fcc6c1efd84be087 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Wed, 16 May 2018 14:06:26 -0700
Subject: bpf: fix sock hashmap kmalloc warning

syzbot reported a kernel warning below:
  WARNING: CPU: 0 PID: 4499 at mm/slab_common.c:996 kmalloc_slab+0x56/0x70 mm/slab_common.c:996
  Kernel panic - not syncing: panic_on_warn set ...

  CPU: 0 PID: 4499 Comm: syz-executor050 Not tainted 4.17.0-rc3+ #9
  Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
  Call Trace:
   __dump_stack lib/dump_stack.c:77 [inline]
   dump_stack+0x1b9/0x294 lib/dump_stack.c:113
   panic+0x22f/0x4de kernel/panic.c:184
   __warn.cold.8+0x163/0x1b3 kernel/panic.c:536
   report_bug+0x252/0x2d0 lib/bug.c:186
   fixup_bug arch/x86/kernel/traps.c:178 [inline]
   do_error_trap+0x1de/0x490 arch/x86/kernel/traps.c:296
   do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:315
   invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:992
  RIP: 0010:kmalloc_slab+0x56/0x70 mm/slab_common.c:996
  RSP: 0018:ffff8801d907fc58 EFLAGS: 00010246
  RAX: 0000000000000000 RBX: ffff8801aeecb280 RCX: ffffffff8185ebd7
  RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00000000ffffffe1
  RBP: ffff8801d907fc58 R08: ffff8801adb5e1c0 R09: ffffed0035a84700
  R10: ffffed0035a84700 R11: ffff8801ad423803 R12: ffff8801aeecb280
  R13: 00000000fffffff4 R14: ffff8801ad891a00 R15: 00000000014200c0
   __do_kmalloc mm/slab.c:3713 [inline]
   __kmalloc+0x25/0x760 mm/slab.c:3727
   kmalloc include/linux/slab.h:517 [inline]
   map_get_next_key+0x24a/0x640 kernel/bpf/syscall.c:858
   __do_sys_bpf kernel/bpf/syscall.c:2131 [inline]
   __se_sys_bpf kernel/bpf/syscall.c:2096 [inline]
   __x64_sys_bpf+0x354/0x4f0 kernel/bpf/syscall.c:2096
   do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
   entry_SYSCALL_64_after_hwframe+0x49/0xbe

The test case is against sock hashmap with a key size 0xffffffe1.
Such a large key size will cause the below code in function
sock_hash_alloc() overflowing and produces a smaller elem_size,
hence map creation will be successful.
    htab->elem_size = sizeof(struct htab_elem) +
                      round_up(htab->map.key_size, 8);

Later, when map_get_next_key is called and kernel tries
to allocate the key unsuccessfully, it will issue
the above warning.

Similar to hashtab, ensure the key size is at most
MAX_BPF_STACK for a successful map creation.

Fixes: 81110384441a ("bpf: sockmap, add hash map support")
Reported-by: syzbot+e4566d29080e7f3460ff@syzkaller.appspotmail.com
Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/sockmap.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 56879c9fd3a4..79f5e8988889 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1990,6 +1990,12 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
 	    attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
 		return ERR_PTR(-EINVAL);
 
+	if (attr->key_size > MAX_BPF_STACK)
+		/* eBPF programs initialize keys on stack, so they cannot be
+		 * larger than max stack size
+		 */
+		return ERR_PTR(-E2BIG);
+
 	err = bpf_tcp_ulp_register();
 	if (err && err != -EEXIST)
 		return ERR_PTR(err);
-- 
cgit v1.2.3-59-g8ed1b


From e23afe5e7cba89cd0744c5218eda1b3553455c17 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 16 May 2018 16:38:14 -0700
Subject: bpf: sockmap, on update propagate errors back to userspace

When an error happens in the update sockmap element logic also pass
the err up to the user.

Fixes: e5cd3abcb31a ("bpf: sockmap, refactor sockmap routines to work with hashmap")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/sockmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 79f5e8988889..c6de1393df63 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1875,7 +1875,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 		write_unlock_bh(&osock->sk_callback_lock);
 	}
 out:
-	return 0;
+	return err;
 }
 
 int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
-- 
cgit v1.2.3-59-g8ed1b


From a5898e97f01acfbcafe9e90196b1c811a46ebf44 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Wed, 16 May 2018 17:46:45 -0700
Subject: net/mlx5: Vport, Use 'kvfree()' for memory allocated by 'kvzalloc()'

When 'kvzalloc()' is used to allocate memory, 'kvfree()' must be used to
free it.

Fixes: 9efa75254593d ("net/mlx5_core: Introduce access functions to query vport RoCE fields")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/vport.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index 177e076b8d17..719cecb182c6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -511,7 +511,7 @@ int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev,
 	*system_image_guid = MLX5_GET64(query_nic_vport_context_out, out,
 					nic_vport_context.system_image_guid);
 
-	kfree(out);
+	kvfree(out);
 
 	return 0;
 }
@@ -531,7 +531,7 @@ int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid)
 	*node_guid = MLX5_GET64(query_nic_vport_context_out, out,
 				nic_vport_context.node_guid);
 
-	kfree(out);
+	kvfree(out);
 
 	return 0;
 }
@@ -587,7 +587,7 @@ int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev,
 	*qkey_viol_cntr = MLX5_GET(query_nic_vport_context_out, out,
 				   nic_vport_context.qkey_violation_counter);
 
-	kfree(out);
+	kvfree(out);
 
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From e574978ae52a3fb1346bc37de2f2221dd878c065 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Wed, 16 May 2018 17:49:01 -0700
Subject: net/mlx5: Eswitch, Use 'kvfree()' for memory allocated by
 'kvzalloc()'

When 'kvzalloc()' is used to allocate memory, 'kvfree()' must be used to
free it.

Fixes: fed9ce22bf8ae ("net/mlx5: E-Switch, Add API to create vport rx rules")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 35e256eb2f6e..b123f8a52ad8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -663,7 +663,7 @@ static int esw_create_vport_rx_group(struct mlx5_eswitch *esw)
 
 	esw->offloads.vport_rx_group = g;
 out:
-	kfree(flow_group_in);
+	kvfree(flow_group_in);
 	return err;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 909d434406f9e7887ffb1ddaff1a7d0da50075b1 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Wed, 16 May 2018 17:50:19 -0700
Subject: IB/mlx5: Use 'kvfree()' for memory allocated by 'kvzalloc()'

When 'kvzalloc()' is used to allocate memory, 'kvfree()' must be used to
free it.

Fixes: 1cbe6fc86ccfe ("IB/mlx5: Add support for CQE compressing")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Acked-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/cq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 77d257ec899b..6d52ea03574e 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -849,7 +849,7 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
 	return 0;
 
 err_cqb:
-	kfree(*cqb);
+	kvfree(*cqb);
 
 err_db:
 	mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db);
-- 
cgit v1.2.3-59-g8ed1b


From 01cd267bff52619a53fa05c930ea5ed53493d21a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 8 May 2018 10:05:38 +0200
Subject: netfilter: fix fallout from xt/nf osf separation

Stephen Rothwell says:
  today's linux-next build (x86_64 allmodconfig) produced this warning:
  ./usr/include/linux/netfilter/nf_osf.h:25: found __[us]{8,16,32,64} type without #include <linux/types.h>

Fix that up and also move kernel-private struct out of uapi (it was not
exposed in any released kernel version).

tested via allmodconfig build + make headers_check.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Fixes: bfb15f2a95cb ("netfilter: extract Passive OS fingerprint infrastructure from xt_osf")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nf_osf.h      | 6 ++++++
 include/uapi/linux/netfilter/nf_osf.h | 8 ++------
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/include/linux/netfilter/nf_osf.h b/include/linux/netfilter/nf_osf.h
index a2b39602e87d..0e114c492fb8 100644
--- a/include/linux/netfilter/nf_osf.h
+++ b/include/linux/netfilter/nf_osf.h
@@ -21,6 +21,12 @@ enum osf_fmatch_states {
 	FMATCH_OPT_WRONG,
 };
 
+struct nf_osf_finger {
+	struct rcu_head			rcu_head;
+	struct list_head		finger_entry;
+	struct nf_osf_user_finger	finger;
+};
+
 bool nf_osf_match(const struct sk_buff *skb, u_int8_t family,
 		  int hooknum, struct net_device *in, struct net_device *out,
 		  const struct nf_osf_info *info, struct net *net,
diff --git a/include/uapi/linux/netfilter/nf_osf.h b/include/uapi/linux/netfilter/nf_osf.h
index 45376eae31ef..8f2f2f403183 100644
--- a/include/uapi/linux/netfilter/nf_osf.h
+++ b/include/uapi/linux/netfilter/nf_osf.h
@@ -1,6 +1,8 @@
 #ifndef _NF_OSF_H
 #define _NF_OSF_H
 
+#include <linux/types.h>
+
 #define MAXGENRELEN	32
 
 #define NF_OSF_GENRE	(1 << 0)
@@ -57,12 +59,6 @@ struct nf_osf_user_finger {
 	struct nf_osf_opt	opt[MAX_IPOPTLEN];
 };
 
-struct nf_osf_finger {
-	struct rcu_head			rcu_head;
-	struct list_head		finger_entry;
-	struct nf_osf_user_finger	finger;
-};
-
 struct nf_osf_nlmsg {
 	struct nf_osf_user_finger	f;
 	struct iphdr			ip;
-- 
cgit v1.2.3-59-g8ed1b


From e65eebec9c67dfd973d8b79844b992ff828d00df Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 11 May 2018 21:55:39 +0200
Subject: netfilter: nf_tables: remove old nf_log based tracing

nfnetlink tracing is available since nft 0.6 (June 2016).
Remove old nf_log based tracing to avoid rule counter in main loop.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_core.c | 29 +++++++----------------------
 1 file changed, 7 insertions(+), 22 deletions(-)

diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 9cf47c4cb9d5..d457d854fcae 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -41,7 +41,7 @@ static const struct nf_loginfo trace_loginfo = {
 
 static noinline void __nft_trace_packet(struct nft_traceinfo *info,
 					const struct nft_chain *chain,
-					int rulenum, enum nft_trace_types type)
+					enum nft_trace_types type)
 {
 	const struct nft_pktinfo *pkt = info->pkt;
 
@@ -52,22 +52,16 @@ static noinline void __nft_trace_packet(struct nft_traceinfo *info,
 	info->type = type;
 
 	nft_trace_notify(info);
-
-	nf_log_trace(nft_net(pkt), nft_pf(pkt), nft_hook(pkt), pkt->skb,
-		     nft_in(pkt), nft_out(pkt), &trace_loginfo,
-		     "TRACE: %s:%s:%s:%u ",
-		     chain->table->name, chain->name, comments[type], rulenum);
 }
 
 static inline void nft_trace_packet(struct nft_traceinfo *info,
 				    const struct nft_chain *chain,
 				    const struct nft_rule *rule,
-				    int rulenum,
 				    enum nft_trace_types type)
 {
 	if (static_branch_unlikely(&nft_trace_enabled)) {
 		info->rule = rule;
-		__nft_trace_packet(info, chain, rulenum, type);
+		__nft_trace_packet(info, chain, type);
 	}
 }
 
@@ -133,7 +127,6 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain,
 struct nft_jumpstack {
 	const struct nft_chain	*chain;
 	const struct nft_rule	*rule;
-	int			rulenum;
 };
 
 unsigned int
@@ -146,7 +139,6 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv)
 	struct nft_regs regs;
 	unsigned int stackptr = 0;
 	struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE];
-	int rulenum;
 	unsigned int gencursor = nft_genmask_cur(net);
 	struct nft_traceinfo info;
 
@@ -154,7 +146,6 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv)
 	if (static_branch_unlikely(&nft_trace_enabled))
 		nft_trace_init(&info, pkt, &regs.verdict, basechain);
 do_chain:
-	rulenum = 0;
 	rule = list_entry(&chain->rules, struct nft_rule, list);
 next_rule:
 	regs.verdict.code = NFT_CONTINUE;
@@ -164,8 +155,6 @@ next_rule:
 		if (unlikely(rule->genmask & gencursor))
 			continue;
 
-		rulenum++;
-
 		nft_rule_for_each_expr(expr, last, rule) {
 			if (expr->ops == &nft_cmp_fast_ops)
 				nft_cmp_fast_eval(expr, &regs);
@@ -183,7 +172,7 @@ next_rule:
 			continue;
 		case NFT_CONTINUE:
 			nft_trace_packet(&info, chain, rule,
-					 rulenum, NFT_TRACETYPE_RULE);
+					 NFT_TRACETYPE_RULE);
 			continue;
 		}
 		break;
@@ -195,7 +184,7 @@ next_rule:
 	case NF_QUEUE:
 	case NF_STOLEN:
 		nft_trace_packet(&info, chain, rule,
-				 rulenum, NFT_TRACETYPE_RULE);
+				 NFT_TRACETYPE_RULE);
 		return regs.verdict.code;
 	}
 
@@ -204,21 +193,19 @@ next_rule:
 		BUG_ON(stackptr >= NFT_JUMP_STACK_SIZE);
 		jumpstack[stackptr].chain = chain;
 		jumpstack[stackptr].rule  = rule;
-		jumpstack[stackptr].rulenum = rulenum;
 		stackptr++;
 		/* fall through */
 	case NFT_GOTO:
 		nft_trace_packet(&info, chain, rule,
-				 rulenum, NFT_TRACETYPE_RULE);
+				 NFT_TRACETYPE_RULE);
 
 		chain = regs.verdict.chain;
 		goto do_chain;
 	case NFT_CONTINUE:
-		rulenum++;
 		/* fall through */
 	case NFT_RETURN:
 		nft_trace_packet(&info, chain, rule,
-				 rulenum, NFT_TRACETYPE_RETURN);
+				 NFT_TRACETYPE_RETURN);
 		break;
 	default:
 		WARN_ON(1);
@@ -228,12 +215,10 @@ next_rule:
 		stackptr--;
 		chain = jumpstack[stackptr].chain;
 		rule  = jumpstack[stackptr].rule;
-		rulenum = jumpstack[stackptr].rulenum;
 		goto next_rule;
 	}
 
-	nft_trace_packet(&info, basechain, NULL, -1,
-			 NFT_TRACETYPE_POLICY);
+	nft_trace_packet(&info, basechain, NULL, NFT_TRACETYPE_POLICY);
 
 	if (static_branch_unlikely(&nft_counters_enabled))
 		nft_update_chain_stats(basechain, pkt);
-- 
cgit v1.2.3-59-g8ed1b


From 978d8f9055c3a7c35db2ac99cd2580b993396e33 Mon Sep 17 00:00:00 2001
From: Laura Garcia Liebana <nevola@gmail.com>
Date: Fri, 11 May 2018 00:14:17 +0200
Subject: netfilter: nft_numgen: add map lookups for numgen random operations

This patch uses the map lookup already included to be applied
for random number generation.

Signed-off-by: Laura Garcia Liebana <nevola@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_numgen.c | 76 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 72 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index 8a64db8f2e69..cdbc62a53933 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -166,18 +166,43 @@ struct nft_ng_random {
 	enum nft_registers      dreg:8;
 	u32			modulus;
 	u32			offset;
+	struct nft_set		*map;
 };
 
+static u32 nft_ng_random_gen(struct nft_ng_random *priv)
+{
+	struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state);
+
+	return reciprocal_scale(prandom_u32_state(state), priv->modulus) +
+	       priv->offset;
+}
+
 static void nft_ng_random_eval(const struct nft_expr *expr,
 			       struct nft_regs *regs,
 			       const struct nft_pktinfo *pkt)
 {
 	struct nft_ng_random *priv = nft_expr_priv(expr);
-	struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state);
-	u32 val;
 
-	val = reciprocal_scale(prandom_u32_state(state), priv->modulus);
-	regs->data[priv->dreg] = val + priv->offset;
+	regs->data[priv->dreg] = nft_ng_random_gen(priv);
+}
+
+static void nft_ng_random_map_eval(const struct nft_expr *expr,
+				   struct nft_regs *regs,
+				   const struct nft_pktinfo *pkt)
+{
+	struct nft_ng_random *priv = nft_expr_priv(expr);
+	const struct nft_set *map = priv->map;
+	const struct nft_set_ext *ext;
+	u32 result;
+	bool found;
+
+	result = nft_ng_random_gen(priv);
+	found = map->ops->lookup(nft_net(pkt), map, &result, &ext);
+	if (!found)
+		return;
+
+	nft_data_copy(&regs->data[priv->dreg],
+		      nft_set_ext_data(ext), map->dlen);
 }
 
 static int nft_ng_random_init(const struct nft_ctx *ctx,
@@ -204,6 +229,23 @@ static int nft_ng_random_init(const struct nft_ctx *ctx,
 					   NFT_DATA_VALUE, sizeof(u32));
 }
 
+static int nft_ng_random_map_init(const struct nft_ctx *ctx,
+				  const struct nft_expr *expr,
+				  const struct nlattr * const tb[])
+{
+	struct nft_ng_random *priv = nft_expr_priv(expr);
+	u8 genmask = nft_genmask_next(ctx->net);
+
+	nft_ng_random_init(ctx, expr, tb);
+	priv->map = nft_set_lookup_global(ctx->net, ctx->table,
+					  tb[NFTA_NG_SET_NAME],
+					  tb[NFTA_NG_SET_ID], genmask);
+	if (IS_ERR(priv->map))
+		return PTR_ERR(priv->map);
+
+	return 0;
+}
+
 static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
 {
 	const struct nft_ng_random *priv = nft_expr_priv(expr);
@@ -212,6 +254,22 @@ static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
 			   priv->offset);
 }
 
+static int nft_ng_random_map_dump(struct sk_buff *skb,
+				  const struct nft_expr *expr)
+{
+	const struct nft_ng_random *priv = nft_expr_priv(expr);
+
+	if (nft_ng_dump(skb, priv->dreg, priv->modulus,
+			NFT_NG_RANDOM, priv->offset) ||
+	    nla_put_string(skb, NFTA_NG_SET_NAME, priv->map->name))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
 static struct nft_expr_type nft_ng_type;
 static const struct nft_expr_ops nft_ng_inc_ops = {
 	.type		= &nft_ng_type,
@@ -237,6 +295,14 @@ static const struct nft_expr_ops nft_ng_random_ops = {
 	.dump		= nft_ng_random_dump,
 };
 
+static const struct nft_expr_ops nft_ng_random_map_ops = {
+	.type		= &nft_ng_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_ng_random)),
+	.eval		= nft_ng_random_map_eval,
+	.init		= nft_ng_random_map_init,
+	.dump		= nft_ng_random_map_dump,
+};
+
 static const struct nft_expr_ops *
 nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
 {
@@ -255,6 +321,8 @@ nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
 			return &nft_ng_inc_map_ops;
 		return &nft_ng_inc_ops;
 	case NFT_NG_RANDOM:
+		if (tb[NFTA_NG_SET_NAME])
+			return &nft_ng_random_map_ops;
 		return &nft_ng_random_ops;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From b9ccc07e3f31ad8073697982bac014fbceef7ecb Mon Sep 17 00:00:00 2001
From: Laura Garcia Liebana <nevola@gmail.com>
Date: Fri, 11 May 2018 00:14:48 +0200
Subject: netfilter: nft_hash: add map lookups for hashing operations

This patch creates new attributes to accept a map as argument and
then perform the lookup with the generated hash accordingly.

Both current hash functions are supported: Jenkins and Symmetric Hash.

Signed-off-by: Laura Garcia Liebana <nevola@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |   4 +
 net/netfilter/nft_hash.c                 | 131 ++++++++++++++++++++++++++++++-
 2 files changed, 134 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index ce031cf72288..9c71f024f9cc 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -856,6 +856,8 @@ enum nft_hash_types {
  * @NFTA_HASH_SEED: seed value (NLA_U32)
  * @NFTA_HASH_OFFSET: add this offset value to hash result (NLA_U32)
  * @NFTA_HASH_TYPE: hash operation (NLA_U32: nft_hash_types)
+ * @NFTA_HASH_SET_NAME: name of the map to lookup (NLA_STRING)
+ * @NFTA_HASH_SET_ID: id of the map (NLA_U32)
  */
 enum nft_hash_attributes {
 	NFTA_HASH_UNSPEC,
@@ -866,6 +868,8 @@ enum nft_hash_attributes {
 	NFTA_HASH_SEED,
 	NFTA_HASH_OFFSET,
 	NFTA_HASH_TYPE,
+	NFTA_HASH_SET_NAME,
+	NFTA_HASH_SET_ID,
 	__NFTA_HASH_MAX,
 };
 #define NFTA_HASH_MAX	(__NFTA_HASH_MAX - 1)
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index e235c17f1b8b..f0fc21f88775 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -25,6 +25,7 @@ struct nft_jhash {
 	u32			modulus;
 	u32			seed;
 	u32			offset;
+	struct nft_set		*map;
 };
 
 static void nft_jhash_eval(const struct nft_expr *expr,
@@ -35,14 +36,39 @@ static void nft_jhash_eval(const struct nft_expr *expr,
 	const void *data = &regs->data[priv->sreg];
 	u32 h;
 
-	h = reciprocal_scale(jhash(data, priv->len, priv->seed), priv->modulus);
+	h = reciprocal_scale(jhash(data, priv->len, priv->seed),
+			     priv->modulus);
+
 	regs->data[priv->dreg] = h + priv->offset;
 }
 
+static void nft_jhash_map_eval(const struct nft_expr *expr,
+			       struct nft_regs *regs,
+			       const struct nft_pktinfo *pkt)
+{
+	struct nft_jhash *priv = nft_expr_priv(expr);
+	const void *data = &regs->data[priv->sreg];
+	const struct nft_set *map = priv->map;
+	const struct nft_set_ext *ext;
+	u32 result;
+	bool found;
+
+	result = reciprocal_scale(jhash(data, priv->len, priv->seed),
+					priv->modulus) + priv->offset;
+
+	found = map->ops->lookup(nft_net(pkt), map, &result, &ext);
+	if (!found)
+		return;
+
+	nft_data_copy(&regs->data[priv->dreg],
+		      nft_set_ext_data(ext), map->dlen);
+}
+
 struct nft_symhash {
 	enum nft_registers      dreg:8;
 	u32			modulus;
 	u32			offset;
+	struct nft_set		*map;
 };
 
 static void nft_symhash_eval(const struct nft_expr *expr,
@@ -58,6 +84,28 @@ static void nft_symhash_eval(const struct nft_expr *expr,
 	regs->data[priv->dreg] = h + priv->offset;
 }
 
+static void nft_symhash_map_eval(const struct nft_expr *expr,
+				 struct nft_regs *regs,
+				 const struct nft_pktinfo *pkt)
+{
+	struct nft_symhash *priv = nft_expr_priv(expr);
+	struct sk_buff *skb = pkt->skb;
+	const struct nft_set *map = priv->map;
+	const struct nft_set_ext *ext;
+	u32 result;
+	bool found;
+
+	result = reciprocal_scale(__skb_get_hash_symmetric(skb),
+				  priv->modulus) + priv->offset;
+
+	found = map->ops->lookup(nft_net(pkt), map, &result, &ext);
+	if (!found)
+		return;
+
+	nft_data_copy(&regs->data[priv->dreg],
+		      nft_set_ext_data(ext), map->dlen);
+}
+
 static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = {
 	[NFTA_HASH_SREG]	= { .type = NLA_U32 },
 	[NFTA_HASH_DREG]	= { .type = NLA_U32 },
@@ -66,6 +114,9 @@ static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = {
 	[NFTA_HASH_SEED]	= { .type = NLA_U32 },
 	[NFTA_HASH_OFFSET]	= { .type = NLA_U32 },
 	[NFTA_HASH_TYPE]	= { .type = NLA_U32 },
+	[NFTA_HASH_SET_NAME]	= { .type = NLA_STRING,
+				    .len = NFT_SET_MAXNAMELEN - 1 },
+	[NFTA_HASH_SET_ID]	= { .type = NLA_U32 },
 };
 
 static int nft_jhash_init(const struct nft_ctx *ctx,
@@ -115,6 +166,23 @@ static int nft_jhash_init(const struct nft_ctx *ctx,
 					   NFT_DATA_VALUE, sizeof(u32));
 }
 
+static int nft_jhash_map_init(const struct nft_ctx *ctx,
+			      const struct nft_expr *expr,
+			      const struct nlattr * const tb[])
+{
+	struct nft_jhash *priv = nft_expr_priv(expr);
+	u8 genmask = nft_genmask_next(ctx->net);
+
+	nft_jhash_init(ctx, expr, tb);
+	priv->map = nft_set_lookup_global(ctx->net, ctx->table,
+					  tb[NFTA_HASH_SET_NAME],
+					  tb[NFTA_HASH_SET_ID], genmask);
+	if (IS_ERR(priv->map))
+		return PTR_ERR(priv->map);
+
+	return 0;
+}
+
 static int nft_symhash_init(const struct nft_ctx *ctx,
 			    const struct nft_expr *expr,
 			    const struct nlattr * const tb[])
@@ -141,6 +209,23 @@ static int nft_symhash_init(const struct nft_ctx *ctx,
 					   NFT_DATA_VALUE, sizeof(u32));
 }
 
+static int nft_symhash_map_init(const struct nft_ctx *ctx,
+				const struct nft_expr *expr,
+				const struct nlattr * const tb[])
+{
+	struct nft_jhash *priv = nft_expr_priv(expr);
+	u8 genmask = nft_genmask_next(ctx->net);
+
+	nft_symhash_init(ctx, expr, tb);
+	priv->map = nft_set_lookup_global(ctx->net, ctx->table,
+					  tb[NFTA_HASH_SET_NAME],
+					  tb[NFTA_HASH_SET_ID], genmask);
+	if (IS_ERR(priv->map))
+		return PTR_ERR(priv->map);
+
+	return 0;
+}
+
 static int nft_jhash_dump(struct sk_buff *skb,
 			  const struct nft_expr *expr)
 {
@@ -168,6 +253,18 @@ nla_put_failure:
 	return -1;
 }
 
+static int nft_jhash_map_dump(struct sk_buff *skb,
+			       const struct nft_expr *expr)
+{
+	const struct nft_jhash *priv = nft_expr_priv(expr);
+
+	if (nft_jhash_dump(skb, expr) ||
+	    nla_put_string(skb, NFTA_HASH_SET_NAME, priv->map->name))
+		return -1;
+
+	return 0;
+}
+
 static int nft_symhash_dump(struct sk_buff *skb,
 			    const struct nft_expr *expr)
 {
@@ -188,6 +285,18 @@ nla_put_failure:
 	return -1;
 }
 
+static int nft_symhash_map_dump(struct sk_buff *skb,
+				const struct nft_expr *expr)
+{
+	const struct nft_symhash *priv = nft_expr_priv(expr);
+
+	if (nft_symhash_dump(skb, expr) ||
+	    nla_put_string(skb, NFTA_HASH_SET_NAME, priv->map->name))
+		return -1;
+
+	return 0;
+}
+
 static struct nft_expr_type nft_hash_type;
 static const struct nft_expr_ops nft_jhash_ops = {
 	.type		= &nft_hash_type,
@@ -197,6 +306,14 @@ static const struct nft_expr_ops nft_jhash_ops = {
 	.dump		= nft_jhash_dump,
 };
 
+static const struct nft_expr_ops nft_jhash_map_ops = {
+	.type		= &nft_hash_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_jhash)),
+	.eval		= nft_jhash_map_eval,
+	.init		= nft_jhash_map_init,
+	.dump		= nft_jhash_map_dump,
+};
+
 static const struct nft_expr_ops nft_symhash_ops = {
 	.type		= &nft_hash_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_symhash)),
@@ -205,6 +322,14 @@ static const struct nft_expr_ops nft_symhash_ops = {
 	.dump		= nft_symhash_dump,
 };
 
+static const struct nft_expr_ops nft_symhash_map_ops = {
+	.type		= &nft_hash_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_symhash)),
+	.eval		= nft_symhash_map_eval,
+	.init		= nft_symhash_map_init,
+	.dump		= nft_symhash_map_dump,
+};
+
 static const struct nft_expr_ops *
 nft_hash_select_ops(const struct nft_ctx *ctx,
 		    const struct nlattr * const tb[])
@@ -217,8 +342,12 @@ nft_hash_select_ops(const struct nft_ctx *ctx,
 	type = ntohl(nla_get_be32(tb[NFTA_HASH_TYPE]));
 	switch (type) {
 	case NFT_HASH_SYM:
+		if (tb[NFTA_HASH_SET_NAME])
+			return &nft_symhash_map_ops;
 		return &nft_symhash_ops;
 	case NFT_HASH_JENKINS:
+		if (tb[NFTA_HASH_SET_NAME])
+			return &nft_jhash_map_ops;
 		return &nft_jhash_ops;
 	default:
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From 9cfbfa701b55868cda4d638164887d5c74c7bfdd Mon Sep 17 00:00:00 2001
From: Cathy Zhou <cathy.zhou@oracle.com>
Date: Fri, 13 Apr 2018 11:28:37 -0700
Subject: ixgbe: cleanup sparse warnings

Sparse complains valid conversions between restricted types, force
attribute is used to avoid those warnings.

Signed-off-by: Cathy Zhou <cathy.zhou@oracle.com>
Reviewed-by: Shannon Nelson <shannon.nelson@oracle.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c  | 13 ++++++-----
 drivers/net/ethernet/intel/ixgbe/ixgbe_common.c |  2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c   |  2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c  | 25 +++++++++++++--------
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c   | 29 +++++++++++++++----------
 drivers/net/ethernet/intel/ixgbe/ixgbe_model.h  | 16 +++++++-------
 drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c   |  9 ++++----
 7 files changed, 55 insertions(+), 41 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c
index 42f63b943ea0..1e49716f52bc 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c
@@ -1436,7 +1436,8 @@ void ixgbe_atr_compute_perfect_hash_82599(union ixgbe_atr_input *input,
 {
 
 	u32 hi_hash_dword, lo_hash_dword, flow_vm_vlan;
-	u32 bucket_hash = 0, hi_dword = 0;
+	u32 bucket_hash = 0;
+	__be32 hi_dword = 0;
 	int i;
 
 	/* Apply masks to input data */
@@ -1475,7 +1476,7 @@ void ixgbe_atr_compute_perfect_hash_82599(union ixgbe_atr_input *input,
 	 * Limit hash to 13 bits since max bucket count is 8K.
 	 * Store result at the end of the input stream.
 	 */
-	input->formatted.bkt_hash = bucket_hash & 0x1FFF;
+	input->formatted.bkt_hash = (__force __be16)(bucket_hash & 0x1FFF);
 }
 
 /**
@@ -1584,7 +1585,7 @@ s32 ixgbe_fdir_set_input_mask_82599(struct ixgbe_hw *hw,
 		return IXGBE_ERR_CONFIG;
 	}
 
-	switch (input_mask->formatted.flex_bytes & 0xFFFF) {
+	switch ((__force u16)input_mask->formatted.flex_bytes & 0xFFFF) {
 	case 0x0000:
 		/* Mask Flex Bytes */
 		fdirm |= IXGBE_FDIRM_FLEX;
@@ -1654,13 +1655,13 @@ s32 ixgbe_fdir_write_perfect_filter_82599(struct ixgbe_hw *hw,
 	IXGBE_WRITE_REG(hw, IXGBE_FDIRPORT, fdirport);
 
 	/* record vlan (little-endian) and flex_bytes(big-endian) */
-	fdirvlan = IXGBE_STORE_AS_BE16(input->formatted.flex_bytes);
+	fdirvlan = IXGBE_STORE_AS_BE16((__force u16)input->formatted.flex_bytes);
 	fdirvlan <<= IXGBE_FDIRVLAN_FLEX_SHIFT;
 	fdirvlan |= ntohs(input->formatted.vlan_id);
 	IXGBE_WRITE_REG(hw, IXGBE_FDIRVLAN, fdirvlan);
 
 	/* configure FDIRHASH register */
-	fdirhash = input->formatted.bkt_hash;
+	fdirhash = (__force u32)input->formatted.bkt_hash;
 	fdirhash |= soft_id << IXGBE_FDIRHASH_SIG_SW_INDEX_SHIFT;
 	IXGBE_WRITE_REG(hw, IXGBE_FDIRHASH, fdirhash);
 
@@ -1698,7 +1699,7 @@ s32 ixgbe_fdir_erase_perfect_filter_82599(struct ixgbe_hw *hw,
 	s32 err;
 
 	/* configure FDIRHASH register */
-	fdirhash = input->formatted.bkt_hash;
+	fdirhash = (__force u32)input->formatted.bkt_hash;
 	fdirhash |= soft_id << IXGBE_FDIRHASH_SIG_SW_INDEX_SHIFT;
 	IXGBE_WRITE_REG(hw, IXGBE_FDIRHASH, fdirhash);
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
index 8d038837f72b..3f5c350716bb 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
@@ -3626,7 +3626,7 @@ s32 ixgbe_hic_unlocked(struct ixgbe_hw *hw, u32 *buffer, u32 length,
 	 */
 	for (i = 0; i < dword_len; i++)
 		IXGBE_WRITE_REG_ARRAY(hw, IXGBE_FLEX_MNG,
-				      i, cpu_to_le32(buffer[i]));
+				      i, (__force u32)cpu_to_le32(buffer[i]));
 
 	/* Setting this bit tells the ARC that a new command is pending. */
 	IXGBE_WRITE_REG(hw, IXGBE_HICR, hicr | IXGBE_HICR_C);
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c
index b5219e024f70..94b3165ff543 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c
@@ -440,7 +440,7 @@ int ixgbe_fcoe_ddp(struct ixgbe_adapter *adapter,
 	case cpu_to_le32(IXGBE_RXDADV_STAT_FCSTAT_FCPRSP):
 		dma_unmap_sg(&adapter->pdev->dev, ddp->sgl,
 			     ddp->sgc, DMA_FROM_DEVICE);
-		ddp->err = ddp_err;
+		ddp->err = (__force u32)ddp_err;
 		ddp->sgl = NULL;
 		ddp->sgc = 0;
 		/* fall through */
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 195c0b65eee2..99b170f1efd1 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -19,8 +19,9 @@ static void ixgbe_ipsec_set_tx_sa(struct ixgbe_hw *hw, u16 idx,
 	int i;
 
 	for (i = 0; i < 4; i++)
-		IXGBE_WRITE_REG(hw, IXGBE_IPSTXKEY(i), cpu_to_be32(key[3 - i]));
-	IXGBE_WRITE_REG(hw, IXGBE_IPSTXSALT, cpu_to_be32(salt));
+		IXGBE_WRITE_REG(hw, IXGBE_IPSTXKEY(i),
+				(__force u32)cpu_to_be32(key[3 - i]));
+	IXGBE_WRITE_REG(hw, IXGBE_IPSTXSALT, (__force u32)cpu_to_be32(salt));
 	IXGBE_WRITE_FLUSH(hw);
 
 	reg = IXGBE_READ_REG(hw, IXGBE_IPSTXIDX);
@@ -69,7 +70,8 @@ static void ixgbe_ipsec_set_rx_sa(struct ixgbe_hw *hw, u16 idx, __be32 spi,
 	int i;
 
 	/* store the SPI (in bigendian) and IPidx */
-	IXGBE_WRITE_REG(hw, IXGBE_IPSRXSPI, cpu_to_le32(spi));
+	IXGBE_WRITE_REG(hw, IXGBE_IPSRXSPI,
+			(__force u32)cpu_to_le32((__force u32)spi));
 	IXGBE_WRITE_REG(hw, IXGBE_IPSRXIPIDX, ip_idx);
 	IXGBE_WRITE_FLUSH(hw);
 
@@ -77,8 +79,9 @@ static void ixgbe_ipsec_set_rx_sa(struct ixgbe_hw *hw, u16 idx, __be32 spi,
 
 	/* store the key, salt, and mode */
 	for (i = 0; i < 4; i++)
-		IXGBE_WRITE_REG(hw, IXGBE_IPSRXKEY(i), cpu_to_be32(key[3 - i]));
-	IXGBE_WRITE_REG(hw, IXGBE_IPSRXSALT, cpu_to_be32(salt));
+		IXGBE_WRITE_REG(hw, IXGBE_IPSRXKEY(i),
+				(__force u32)cpu_to_be32(key[3 - i]));
+	IXGBE_WRITE_REG(hw, IXGBE_IPSRXSALT, (__force u32)cpu_to_be32(salt));
 	IXGBE_WRITE_REG(hw, IXGBE_IPSRXMOD, mode);
 	IXGBE_WRITE_FLUSH(hw);
 
@@ -97,7 +100,8 @@ static void ixgbe_ipsec_set_rx_ip(struct ixgbe_hw *hw, u16 idx, __be32 addr[])
 
 	/* store the ip address */
 	for (i = 0; i < 4; i++)
-		IXGBE_WRITE_REG(hw, IXGBE_IPSRXIPADDR(i), cpu_to_le32(addr[i]));
+		IXGBE_WRITE_REG(hw, IXGBE_IPSRXIPADDR(i),
+				(__force u32)cpu_to_le32((__force u32)addr[i]));
 	IXGBE_WRITE_FLUSH(hw);
 
 	ixgbe_ipsec_set_rx_item(hw, idx, ips_rx_ip_tbl);
@@ -367,7 +371,8 @@ static struct xfrm_state *ixgbe_ipsec_find_rx_state(struct ixgbe_ipsec *ipsec,
 	struct xfrm_state *ret = NULL;
 
 	rcu_read_lock();
-	hash_for_each_possible_rcu(ipsec->rx_sa_list, rsa, hlist, spi)
+	hash_for_each_possible_rcu(ipsec->rx_sa_list, rsa, hlist,
+				   (__force u32)spi) {
 		if (spi == rsa->xs->id.spi &&
 		    ((ip4 && *daddr == rsa->xs->id.daddr.a4) ||
 		      (!ip4 && !memcmp(daddr, &rsa->xs->id.daddr.a6,
@@ -377,6 +382,7 @@ static struct xfrm_state *ixgbe_ipsec_find_rx_state(struct ixgbe_ipsec *ipsec,
 			xfrm_state_hold(ret);
 			break;
 		}
+	}
 	rcu_read_unlock();
 	return ret;
 }
@@ -569,7 +575,7 @@ static int ixgbe_ipsec_add_sa(struct xfrm_state *xs)
 
 		/* hash the new entry for faster search in Rx path */
 		hash_add_rcu(ipsec->rx_sa_list, &ipsec->rx_tbl[sa_idx].hlist,
-			     rsa.xs->id.spi);
+			     (__force u64)rsa.xs->id.spi);
 	} else {
 		struct tx_sa tsa;
 
@@ -653,7 +659,8 @@ static void ixgbe_ipsec_del_sa(struct xfrm_state *xs)
 			if (!ipsec->ip_tbl[ipi].ref_cnt) {
 				memset(&ipsec->ip_tbl[ipi], 0,
 				       sizeof(struct rx_ip_sa));
-				ixgbe_ipsec_set_rx_ip(hw, ipi, zerobuf);
+				ixgbe_ipsec_set_rx_ip(hw, ipi,
+						      (__force __be32 *)zerobuf);
 			}
 		}
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 6652b201df5b..163b34a9572d 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -727,8 +727,8 @@ static void ixgbe_dump(struct ixgbe_adapter *adapter)
 					ring_desc = "";
 				pr_info("T [0x%03X]    %016llX %016llX %016llX %08X %p %016llX %p%s",
 					i,
-					le64_to_cpu(u0->a),
-					le64_to_cpu(u0->b),
+					le64_to_cpu((__force __le64)u0->a),
+					le64_to_cpu((__force __le64)u0->b),
 					(u64)dma_unmap_addr(tx_buffer, dma),
 					dma_unmap_len(tx_buffer, len),
 					tx_buffer->next_to_watch,
@@ -839,15 +839,15 @@ rx_ring_summary:
 				/* Descriptor Done */
 				pr_info("RWB[0x%03X]     %016llX %016llX ---------------- %p%s\n",
 					i,
-					le64_to_cpu(u0->a),
-					le64_to_cpu(u0->b),
+					le64_to_cpu((__force __le64)u0->a),
+					le64_to_cpu((__force __le64)u0->b),
 					rx_buffer_info->skb,
 					ring_desc);
 			} else {
 				pr_info("R  [0x%03X]     %016llX %016llX %016llX %p%s\n",
 					i,
-					le64_to_cpu(u0->a),
-					le64_to_cpu(u0->b),
+					le64_to_cpu((__force __le64)u0->a),
+					le64_to_cpu((__force __le64)u0->b),
 					(u64)rx_buffer_info->dma,
 					rx_buffer_info->skb,
 					ring_desc);
@@ -7751,7 +7751,7 @@ static int ixgbe_tso(struct ixgbe_ring *tx_ring,
 
 	/* remove payload length from inner checksum */
 	paylen = skb->len - l4_offset;
-	csum_replace_by_diff(&l4.tcp->check, htonl(paylen));
+	csum_replace_by_diff(&l4.tcp->check, (__force __wsum)htonl(paylen));
 
 	/* update gso size and bytecount with header size */
 	first->gso_segs = skb_shinfo(skb)->gso_segs;
@@ -9104,7 +9104,8 @@ static int ixgbe_clsu32_build_input(struct ixgbe_fdir_filter *input,
 
 		for (j = 0; field_ptr[j].val; j++) {
 			if (field_ptr[j].off == off) {
-				field_ptr[j].val(input, mask, val, m);
+				field_ptr[j].val(input, mask, (__force u32)val,
+						 (__force u32)m);
 				input->filter.formatted.flow_type |=
 					field_ptr[j].type;
 				found_entry = true;
@@ -9113,8 +9114,10 @@ static int ixgbe_clsu32_build_input(struct ixgbe_fdir_filter *input,
 		}
 		if (nexthdr) {
 			if (nexthdr->off == cls->knode.sel->keys[i].off &&
-			    nexthdr->val == cls->knode.sel->keys[i].val &&
-			    nexthdr->mask == cls->knode.sel->keys[i].mask)
+			    nexthdr->val ==
+			    (__force u32)cls->knode.sel->keys[i].val &&
+			    nexthdr->mask ==
+			    (__force u32)cls->knode.sel->keys[i].mask)
 				found_jump_field = true;
 			else
 				continue;
@@ -9218,7 +9221,8 @@ static int ixgbe_configure_clsu32(struct ixgbe_adapter *adapter,
 		for (i = 0; nexthdr[i].jump; i++) {
 			if (nexthdr[i].o != cls->knode.sel->offoff ||
 			    nexthdr[i].s != cls->knode.sel->offshift ||
-			    nexthdr[i].m != cls->knode.sel->offmask)
+			    nexthdr[i].m !=
+			    (__force u32)cls->knode.sel->offmask)
 				return err;
 
 			jump = kzalloc(sizeof(*jump), GFP_KERNEL);
@@ -9991,7 +9995,8 @@ static int ixgbe_xdp_setup(struct net_device *dev, struct bpf_prog *prog)
 		}
 	} else {
 		for (i = 0; i < adapter->num_rx_queues; i++)
-			xchg(&adapter->rx_ring[i]->xdp_prog, adapter->xdp_prog);
+			(void)xchg(&adapter->rx_ring[i]->xdp_prog,
+			    adapter->xdp_prog);
 	}
 
 	if (old_prog)
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_model.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_model.h
index fcae520647ce..1e6cf220f543 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_model.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_model.h
@@ -29,8 +29,8 @@ static inline int ixgbe_mat_prgm_sip(struct ixgbe_fdir_filter *input,
 				     union ixgbe_atr_input *mask,
 				     u32 val, u32 m)
 {
-	input->filter.formatted.src_ip[0] = val;
-	mask->formatted.src_ip[0] = m;
+	input->filter.formatted.src_ip[0] = (__force __be32)val;
+	mask->formatted.src_ip[0] = (__force __be32)m;
 	return 0;
 }
 
@@ -38,8 +38,8 @@ static inline int ixgbe_mat_prgm_dip(struct ixgbe_fdir_filter *input,
 				     union ixgbe_atr_input *mask,
 				     u32 val, u32 m)
 {
-	input->filter.formatted.dst_ip[0] = val;
-	mask->formatted.dst_ip[0] = m;
+	input->filter.formatted.dst_ip[0] = (__force __be32)val;
+	mask->formatted.dst_ip[0] = (__force __be32)m;
 	return 0;
 }
 
@@ -55,10 +55,10 @@ static inline int ixgbe_mat_prgm_ports(struct ixgbe_fdir_filter *input,
 				       union ixgbe_atr_input *mask,
 				       u32 val, u32 m)
 {
-	input->filter.formatted.src_port = val & 0xffff;
-	mask->formatted.src_port = m & 0xffff;
-	input->filter.formatted.dst_port = val >> 16;
-	mask->formatted.dst_port = m >> 16;
+	input->filter.formatted.src_port = (__force __be16)(val & 0xffff);
+	mask->formatted.src_port = (__force __be16)(m & 0xffff);
+	input->filter.formatted.dst_port = (__force __be16)(val >> 16);
+	mask->formatted.dst_port = (__force __be16)(m >> 16);
 
 	return 0;
 };
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
index ac71ed76a54b..a8148c7126e5 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
@@ -878,8 +878,9 @@ static s32 ixgbe_read_ee_hostif_buffer_X550(struct ixgbe_hw *hw,
 		buffer.hdr.req.checksum = FW_DEFAULT_CHECKSUM;
 
 		/* convert offset from words to bytes */
-		buffer.address = cpu_to_be32((offset + current_word) * 2);
-		buffer.length = cpu_to_be16(words_to_read * 2);
+		buffer.address = (__force u32)cpu_to_be32((offset +
+							   current_word) * 2);
+		buffer.length = (__force u16)cpu_to_be16(words_to_read * 2);
 		buffer.pad2 = 0;
 		buffer.pad3 = 0;
 
@@ -1089,9 +1090,9 @@ static s32 ixgbe_read_ee_hostif_X550(struct ixgbe_hw *hw, u16 offset, u16 *data)
 	buffer.hdr.req.checksum = FW_DEFAULT_CHECKSUM;
 
 	/* convert offset from words to bytes */
-	buffer.address = cpu_to_be32(offset * 2);
+	buffer.address = (__force u32)cpu_to_be32(offset * 2);
 	/* one word */
-	buffer.length = cpu_to_be16(sizeof(u16));
+	buffer.length = (__force u16)cpu_to_be16(sizeof(u16));
 
 	status = hw->mac.ops.acquire_swfw_sync(hw, mask);
 	if (status)
-- 
cgit v1.2.3-59-g8ed1b


From eeacb7166df6ed1cffa923ea04c70043285783b8 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Thu, 17 May 2018 13:43:56 +0100
Subject: bpf: change eBPF helper doc parsing script to allow for smaller
 indent

Documentation for eBPF helpers can be parsed from bpf.h and eventually
turned into a man page. Commit 6f96674dbd8c ("bpf: relax constraints on
formatting for eBPF helper documentation") changed the script used to
parse it, in order to allow for different indent style and to ease the
work for writing documentation for future helpers.

The script currently considers that the first tab can be replaced by 6
to 8 spaces. But the documentation for bpf_fib_lookup() uses a mix of
tabs (for the "Description" part) and of spaces ("Return" part), and
only has 5 space long indent for the latter.

We probably do not want to change the values accepted by the script each
time a new helper gets a new indent style. However, it is worth noting
that with those 5 spaces, the "Description" and "Return" part *look*
aligned in the generated patch and in `git show`, so it is likely other
helper authors will use the same length. Therefore, allow for helper
documentation to use 5 spaces only for the first indent level.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 scripts/bpf_helpers_doc.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
index 8f59897fbda1..5010a4d5bfba 100755
--- a/scripts/bpf_helpers_doc.py
+++ b/scripts/bpf_helpers_doc.py
@@ -95,7 +95,7 @@ class HeaderParser(object):
         return capture.group(1)
 
     def parse_desc(self):
-        p = re.compile(' \* ?(?:\t| {6,8})Description$')
+        p = re.compile(' \* ?(?:\t| {5,8})Description$')
         capture = p.match(self.line)
         if not capture:
             # Helper can have empty description and we might be parsing another
@@ -109,7 +109,7 @@ class HeaderParser(object):
             if self.line == ' *\n':
                 desc += '\n'
             else:
-                p = re.compile(' \* ?(?:\t| {6,8})(?:\t| {8})(.*)')
+                p = re.compile(' \* ?(?:\t| {5,8})(?:\t| {8})(.*)')
                 capture = p.match(self.line)
                 if capture:
                     desc += capture.group(1) + '\n'
@@ -118,7 +118,7 @@ class HeaderParser(object):
         return desc
 
     def parse_ret(self):
-        p = re.compile(' \* ?(?:\t| {6,8})Return$')
+        p = re.compile(' \* ?(?:\t| {5,8})Return$')
         capture = p.match(self.line)
         if not capture:
             # Helper can have empty retval and we might be parsing another
@@ -132,7 +132,7 @@ class HeaderParser(object):
             if self.line == ' *\n':
                 ret += '\n'
             else:
-                p = re.compile(' \* ?(?:\t| {6,8})(?:\t| {8})(.*)')
+                p = re.compile(' \* ?(?:\t| {5,8})(?:\t| {8})(.*)')
                 capture = p.match(self.line)
                 if capture:
                     ret += capture.group(1) + '\n'
-- 
cgit v1.2.3-59-g8ed1b


From b212d815e77c72be921979119c715166cc8987b1 Mon Sep 17 00:00:00 2001
From: Mauro S M Rodrigues <maurosr@linux.vnet.ibm.com>
Date: Wed, 2 May 2018 17:26:28 -0300
Subject: ixgbe/ixgbevf: Free IRQ when PCI error recovery removes the device

Since commit f7f37e7ff2b9 ("ixgbe: handle close/suspend race with
netif_device_detach/present") ixgbe_close_suspend is called, from
ixgbe_close, only if the device is present, i.e. if it isn't detached.
That exposed a situation where IRQs weren't freed if a PCI error
recovery system opts to remove the device. For such case the pci channel
state is set to pci_channel_io_perm_failure and ixgbe_io_error_detected
was returning PCI_ERS_RESULT_DISCONNECT before calling
ixgbe_close_suspend consequentially not freeing IRQ and crashing when
the remove handler calls pci_disable_device, hitting a BUG_ON at
free_msi_irqs, which asserts that there is no non-free IRQ associated
with the device to be removed:

BUG_ON(irq_has_action(entry->irq + i));

The issue is fixed by calling the ixgbe_close_suspend before evaluate
the pci channel state.

Reported-by: Naresh Bannoth <nbannoth@in.ibm.com>
Reported-by: Abdul Haleem <abdhalee@in.ibm.com>
Signed-off-by: Mauro S M Rodrigues <maurosr@linux.vnet.ibm.com>
Reviewed-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     | 6 +++---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 163b34a9572d..a52d92e182ee 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -10935,14 +10935,14 @@ skip_bad_vf_detection:
 	rtnl_lock();
 	netif_device_detach(netdev);
 
+	if (netif_running(netdev))
+		ixgbe_close_suspend(adapter);
+
 	if (state == pci_channel_io_perm_failure) {
 		rtnl_unlock();
 		return PCI_ERS_RESULT_DISCONNECT;
 	}
 
-	if (netif_running(netdev))
-		ixgbe_close_suspend(adapter);
-
 	if (!test_and_set_bit(__IXGBE_DISABLED, &adapter->state))
 		pci_disable_device(pdev);
 	rtnl_unlock();
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 1ccce6cd51fc..9a939dcaf727 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -4747,14 +4747,14 @@ static pci_ers_result_t ixgbevf_io_error_detected(struct pci_dev *pdev,
 	rtnl_lock();
 	netif_device_detach(netdev);
 
+	if (netif_running(netdev))
+		ixgbevf_close_suspend(adapter);
+
 	if (state == pci_channel_io_perm_failure) {
 		rtnl_unlock();
 		return PCI_ERS_RESULT_DISCONNECT;
 	}
 
-	if (netif_running(netdev))
-		ixgbevf_close_suspend(adapter);
-
 	if (!test_and_set_bit(__IXGBEVF_DISABLED, &adapter->state))
 		pci_disable_device(pdev);
 	rtnl_unlock();
-- 
cgit v1.2.3-59-g8ed1b


From 6710f970d9979d8f03f6e292bb729b2ee1526d0e Mon Sep 17 00:00:00 2001
From: Pavel Tatashin <pasha.tatashin@oracle.com>
Date: Wed, 2 May 2018 23:59:30 -0400
Subject: ixgbe: release lock for the duration of ixgbe_suspend_close()

Currently, during device_shutdown() ixgbe holds rtnl_lock for the duration
of lengthy ixgbe_close_suspend(). On machines with multiple ixgbe cards
this lock prevents scaling if device_shutdown() function is multi-threaded.

It is not necessary to hold this lock during ixgbe_close_suspend()
as it is not held when ixgbe_close() is called also during shutdown but for
kexec case.

Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index a52d92e182ee..5ddfb93ed491 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -6698,8 +6698,15 @@ static int __ixgbe_shutdown(struct pci_dev *pdev, bool *enable_wake)
 	rtnl_lock();
 	netif_device_detach(netdev);
 
-	if (netif_running(netdev))
+	if (netif_running(netdev)) {
+		/* Suspend takes a long time, device_shutdown may be
+		 * parallelized this function, so drop lock for the
+		 * duration of this call.
+		 */
+		rtnl_unlock();
 		ixgbe_close_suspend(adapter);
+		rtnl_lock();
+	}
 
 	ixgbe_clear_interrupt_scheme(adapter);
 	rtnl_unlock();
-- 
cgit v1.2.3-59-g8ed1b


From a8d9bb3d448f3c9a26a885a3104738098a6625c7 Mon Sep 17 00:00:00 2001
From: Emil Tantilov <emil.s.tantilov@intel.com>
Date: Mon, 14 May 2018 11:16:11 -0700
Subject: ixgbe: force VF to grab new MAC on driver reload

Do not validate the MAC address during a reset, unless the MAC was set on
the host. This way the VF will get a new MAC address every time it reloads.

Remove the "no MAC address assigned" message since it will get spammed on
reset and it doesn't help much as the MAC on the VF is randomly generated.

Signed-off-by: Emil Tantilov <emil.s.tantilov@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
index 2649c06d877b..6f59933cdff7 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
@@ -854,14 +854,11 @@ static int ixgbe_vf_reset_msg(struct ixgbe_adapter *adapter, u32 vf)
 
 	/* reply to reset with ack and vf mac address */
 	msgbuf[0] = IXGBE_VF_RESET;
-	if (!is_zero_ether_addr(vf_mac)) {
+	if (!is_zero_ether_addr(vf_mac) && adapter->vfinfo[vf].pf_set_mac) {
 		msgbuf[0] |= IXGBE_VT_MSGTYPE_ACK;
 		memcpy(addr, vf_mac, ETH_ALEN);
 	} else {
 		msgbuf[0] |= IXGBE_VT_MSGTYPE_NACK;
-		dev_warn(&adapter->pdev->dev,
-			 "VF %d has no MAC address assigned, you may have to assign one manually\n",
-			 vf);
 	}
 
 	/*
-- 
cgit v1.2.3-59-g8ed1b


From 6e7d0ba1e59b1a306761a731e67634c0f2efea2a Mon Sep 17 00:00:00 2001
From: Emil Tantilov <emil.s.tantilov@intel.com>
Date: Mon, 14 May 2018 11:16:16 -0700
Subject: ixgbevf: fix MAC address changes through ixgbevf_set_mac()

Set hw->mac.perm_addr in ixgbevf_set_mac() in order to avoid losing the
custom MAC on reset. This can happen in the following case:

>ip link set $vf address $mac
>ethtool -r $vf

Signed-off-by: Emil Tantilov <emil.s.tantilov@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 9a939dcaf727..083041129539 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -4164,6 +4164,7 @@ static int ixgbevf_set_mac(struct net_device *netdev, void *p)
 		return -EPERM;
 
 	ether_addr_copy(hw->mac.addr, addr->sa_data);
+	ether_addr_copy(hw->mac.perm_addr, addr->sa_data);
 	ether_addr_copy(netdev->dev_addr, addr->sa_data);
 
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From 43c89b16427f97607cdc9a64fe2a84935568af64 Mon Sep 17 00:00:00 2001
From: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
Date: Mon, 16 Apr 2018 09:55:36 -0700
Subject: ice: Update NVM AQ command functions

This patch updates the NVM read/erase/update AQ commands to align with
the latest specification.

Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ice/ice_adminq_cmd.h | 13 +++++++------
 drivers/net/ethernet/intel/ice/ice_nvm.c        |  7 ++++---
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
index 7dc5f045e969..7541ec2270b3 100644
--- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
@@ -1049,7 +1049,9 @@ struct ice_aqc_set_event_mask {
  * NVM Update commands (indirect 0x0703)
  */
 struct ice_aqc_nvm {
-	u8	cmd_flags;
+	__le16 offset_low;
+	u8 offset_high;
+	u8 cmd_flags;
 #define ICE_AQC_NVM_LAST_CMD		BIT(0)
 #define ICE_AQC_NVM_PCIR_REQ		BIT(0)	/* Used by NVM Update reply */
 #define ICE_AQC_NVM_PRESERVATION_S	1
@@ -1058,12 +1060,11 @@ struct ice_aqc_nvm {
 #define ICE_AQC_NVM_PRESERVE_ALL	BIT(1)
 #define ICE_AQC_NVM_PRESERVE_SELECTED	(3 << CSR_AQ_NVM_PRESERVATION_S)
 #define ICE_AQC_NVM_FLASH_ONLY		BIT(7)
-	u8	module_typeid;
-	__le16	length;
+	__le16 module_typeid;
+	__le16 length;
 #define ICE_AQC_NVM_ERASE_LEN	0xFFFF
-	__le32	offset;
-	__le32	addr_high;
-	__le32	addr_low;
+	__le32 addr_high;
+	__le32 addr_low;
 };
 
 /* Get/Set RSS key (indirect 0x0B04/0x0B02) */
diff --git a/drivers/net/ethernet/intel/ice/ice_nvm.c b/drivers/net/ethernet/intel/ice/ice_nvm.c
index fa7a69ac92b0..92da0a626ce0 100644
--- a/drivers/net/ethernet/intel/ice/ice_nvm.c
+++ b/drivers/net/ethernet/intel/ice/ice_nvm.c
@@ -16,7 +16,7 @@
  * Read the NVM using the admin queue commands (0x0701)
  */
 static enum ice_status
-ice_aq_read_nvm(struct ice_hw *hw, u8 module_typeid, u32 offset, u16 length,
+ice_aq_read_nvm(struct ice_hw *hw, u16 module_typeid, u32 offset, u16 length,
 		void *data, bool last_command, struct ice_sq_cd *cd)
 {
 	struct ice_aq_desc desc;
@@ -33,8 +33,9 @@ ice_aq_read_nvm(struct ice_hw *hw, u8 module_typeid, u32 offset, u16 length,
 	/* If this is the last command in a series, set the proper flag. */
 	if (last_command)
 		cmd->cmd_flags |= ICE_AQC_NVM_LAST_CMD;
-	cmd->module_typeid = module_typeid;
-	cmd->offset = cpu_to_le32(offset);
+	cmd->module_typeid = cpu_to_le16(module_typeid);
+	cmd->offset_low = cpu_to_le16(offset & 0xFFFF);
+	cmd->offset_high = (offset >> 16) & 0xFF;
 	cmd->length = cpu_to_le16(length);
 
 	return ice_aq_send_cmd(hw, &desc, data, length, cd);
-- 
cgit v1.2.3-59-g8ed1b


From 96009c7d500efdd5534e83b2e3eb2c58d4b137ae Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 15 May 2018 16:24:36 +0200
Subject: sched: replace __QDISC_STATE_RUNNING bit with a spin lock

So that we can use lockdep on it.
The newly introduced sequence lock has the same scope of busylock,
so it shares the same lockdep annotation, but it's only used for
NOLOCK qdiscs.

With this changeset we acquire such lock in the control path around
flushing operation (qdisc reset), to allow more NOLOCK qdisc perf
improvement in the next patch.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 10 +++++-----
 net/sched/sch_generic.c   | 11 +++++++++++
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 4d2b37226e75..98c10a28cd01 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -30,7 +30,6 @@ struct qdisc_rate_table {
 enum qdisc_state_t {
 	__QDISC_STATE_SCHED,
 	__QDISC_STATE_DEACTIVATED,
-	__QDISC_STATE_RUNNING,
 };
 
 struct qdisc_size_table {
@@ -102,6 +101,7 @@ struct Qdisc {
 	refcount_t		refcnt;
 
 	spinlock_t		busylock ____cacheline_aligned_in_smp;
+	spinlock_t		seqlock;
 };
 
 static inline void qdisc_refcount_inc(struct Qdisc *qdisc)
@@ -111,17 +111,17 @@ static inline void qdisc_refcount_inc(struct Qdisc *qdisc)
 	refcount_inc(&qdisc->refcnt);
 }
 
-static inline bool qdisc_is_running(const struct Qdisc *qdisc)
+static inline bool qdisc_is_running(struct Qdisc *qdisc)
 {
 	if (qdisc->flags & TCQ_F_NOLOCK)
-		return test_bit(__QDISC_STATE_RUNNING, &qdisc->state);
+		return spin_is_locked(&qdisc->seqlock);
 	return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
 }
 
 static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 {
 	if (qdisc->flags & TCQ_F_NOLOCK) {
-		if (test_and_set_bit(__QDISC_STATE_RUNNING, &qdisc->state))
+		if (!spin_trylock(&qdisc->seqlock))
 			return false;
 	} else if (qdisc_is_running(qdisc)) {
 		return false;
@@ -138,7 +138,7 @@ static inline void qdisc_run_end(struct Qdisc *qdisc)
 {
 	write_seqcount_end(&qdisc->running);
 	if (qdisc->flags & TCQ_F_NOLOCK)
-		clear_bit(__QDISC_STATE_RUNNING, &qdisc->state);
+		spin_unlock(&qdisc->seqlock);
 }
 
 static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index ff3ce71aec93..a126f16bc30b 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -858,6 +858,11 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 	lockdep_set_class(&sch->busylock,
 			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
 
+	/* seqlock has the same scope of busylock, for NOLOCK qdisc */
+	spin_lock_init(&sch->seqlock);
+	lockdep_set_class(&sch->busylock,
+			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
+
 	seqcount_init(&sch->running);
 	lockdep_set_class(&sch->running,
 			  dev->qdisc_running_key ?: &qdisc_running_key);
@@ -1097,6 +1102,10 @@ static void dev_deactivate_queue(struct net_device *dev,
 
 	qdisc = rtnl_dereference(dev_queue->qdisc);
 	if (qdisc) {
+		bool nolock = qdisc->flags & TCQ_F_NOLOCK;
+
+		if (nolock)
+			spin_lock_bh(&qdisc->seqlock);
 		spin_lock_bh(qdisc_lock(qdisc));
 
 		if (!(qdisc->flags & TCQ_F_BUILTIN))
@@ -1106,6 +1115,8 @@ static void dev_deactivate_queue(struct net_device *dev,
 		qdisc_reset(qdisc);
 
 		spin_unlock_bh(qdisc_lock(qdisc));
+		if (nolock)
+			spin_unlock_bh(&qdisc->seqlock);
 	}
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 021a17ed796b62383f7623f4fea73787abddad77 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 15 May 2018 16:24:37 +0200
Subject: pfifo_fast: drop unneeded additional lock on dequeue

After the previous patch, for NOLOCK qdiscs, q->seqlock is
always held when the dequeue() is invoked, we can drop
any additional locking to protect such operation.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skb_array.h | 5 +++++
 net/sched/sch_generic.c   | 4 ++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h
index a6b6e8bb3d7b..62d9b0a6329f 100644
--- a/include/linux/skb_array.h
+++ b/include/linux/skb_array.h
@@ -97,6 +97,11 @@ static inline bool skb_array_empty_any(struct skb_array *a)
 	return ptr_ring_empty_any(&a->ring);
 }
 
+static inline struct sk_buff *__skb_array_consume(struct skb_array *a)
+{
+	return __ptr_ring_consume(&a->ring);
+}
+
 static inline struct sk_buff *skb_array_consume(struct skb_array *a)
 {
 	return ptr_ring_consume(&a->ring);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index a126f16bc30b..760ab1b09f8b 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -656,7 +656,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
 		if (__skb_array_empty(q))
 			continue;
 
-		skb = skb_array_consume_bh(q);
+		skb = __skb_array_consume(q);
 	}
 	if (likely(skb)) {
 		qdisc_qstats_cpu_backlog_dec(qdisc, skb);
@@ -697,7 +697,7 @@ static void pfifo_fast_reset(struct Qdisc *qdisc)
 		if (!q->ring.queue)
 			continue;
 
-		while ((skb = skb_array_consume_bh(q)) != NULL)
+		while ((skb = __skb_array_consume(q)) != NULL)
 			kfree_skb(skb);
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 9611d6d6e24cd40ff887bdbb4dfe36a2ee88d488 Mon Sep 17 00:00:00 2001
From: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
Date: Thu, 17 May 2018 01:21:45 +0300
Subject: net: ethernet: ti: cpsw: disable mq feature for "AM33xx ES1.0"
 devices

The early versions of am33xx devices, related to ES1.0 SoC revision
have errata limiting mq support. That's the same errata as
commit 7da1160002f1 ("drivers: net: cpsw: add am335x errata workarround for
interrutps")

AM33xx Errata [1] Advisory 1.0.9
http://www.ti.com/lit/er/sprz360f/sprz360f.pdf

After additional investigation were found that drivers w/a is
propagated on all AM33xx SoCs and on DM814x. But the errata exists
only for ES1.0 of AM33xx family, limiting mq support for revisions
after ES1.0. So, disable mq support only for related SoCs and use
separate polls for revisions allowing mq.

Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/cpsw.c | 109 +++++++++++++++++++++++------------------
 1 file changed, 60 insertions(+), 49 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 28d893b93d30..a7285dddfd29 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -36,6 +36,7 @@
 #include <linux/of_device.h>
 #include <linux/if_vlan.h>
 #include <linux/kmemleak.h>
+#include <linux/sys_soc.h>
 
 #include <linux/pinctrl/consumer.h>
 
@@ -957,7 +958,7 @@ static irqreturn_t cpsw_rx_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
-static int cpsw_tx_poll(struct napi_struct *napi_tx, int budget)
+static int cpsw_tx_mq_poll(struct napi_struct *napi_tx, int budget)
 {
 	u32			ch_map;
 	int			num_tx, cur_budget, ch;
@@ -984,7 +985,21 @@ static int cpsw_tx_poll(struct napi_struct *napi_tx, int budget)
 	if (num_tx < budget) {
 		napi_complete(napi_tx);
 		writel(0xff, &cpsw->wr_regs->tx_en);
-		if (cpsw->quirk_irq && cpsw->tx_irq_disabled) {
+	}
+
+	return num_tx;
+}
+
+static int cpsw_tx_poll(struct napi_struct *napi_tx, int budget)
+{
+	struct cpsw_common *cpsw = napi_to_cpsw(napi_tx);
+	int num_tx;
+
+	num_tx = cpdma_chan_process(cpsw->txv[0].ch, budget);
+	if (num_tx < budget) {
+		napi_complete(napi_tx);
+		writel(0xff, &cpsw->wr_regs->tx_en);
+		if (cpsw->tx_irq_disabled) {
 			cpsw->tx_irq_disabled = false;
 			enable_irq(cpsw->irqs_table[1]);
 		}
@@ -993,7 +1008,7 @@ static int cpsw_tx_poll(struct napi_struct *napi_tx, int budget)
 	return num_tx;
 }
 
-static int cpsw_rx_poll(struct napi_struct *napi_rx, int budget)
+static int cpsw_rx_mq_poll(struct napi_struct *napi_rx, int budget)
 {
 	u32			ch_map;
 	int			num_rx, cur_budget, ch;
@@ -1020,7 +1035,21 @@ static int cpsw_rx_poll(struct napi_struct *napi_rx, int budget)
 	if (num_rx < budget) {
 		napi_complete_done(napi_rx, num_rx);
 		writel(0xff, &cpsw->wr_regs->rx_en);
-		if (cpsw->quirk_irq && cpsw->rx_irq_disabled) {
+	}
+
+	return num_rx;
+}
+
+static int cpsw_rx_poll(struct napi_struct *napi_rx, int budget)
+{
+	struct cpsw_common *cpsw = napi_to_cpsw(napi_rx);
+	int num_rx;
+
+	num_rx = cpdma_chan_process(cpsw->rxv[0].ch, budget);
+	if (num_rx < budget) {
+		napi_complete_done(napi_rx, num_rx);
+		writel(0xff, &cpsw->wr_regs->rx_en);
+		if (cpsw->rx_irq_disabled) {
 			cpsw->rx_irq_disabled = false;
 			enable_irq(cpsw->irqs_table[0]);
 		}
@@ -2364,9 +2393,9 @@ static void cpsw_get_channels(struct net_device *ndev,
 {
 	struct cpsw_common *cpsw = ndev_to_cpsw(ndev);
 
+	ch->max_rx = cpsw->quirk_irq ? 1 : CPSW_MAX_QUEUES;
+	ch->max_tx = cpsw->quirk_irq ? 1 : CPSW_MAX_QUEUES;
 	ch->max_combined = 0;
-	ch->max_rx = CPSW_MAX_QUEUES;
-	ch->max_tx = CPSW_MAX_QUEUES;
 	ch->max_other = 0;
 	ch->other_count = 0;
 	ch->rx_count = cpsw->rx_ch_num;
@@ -2377,6 +2406,11 @@ static void cpsw_get_channels(struct net_device *ndev,
 static int cpsw_check_ch_settings(struct cpsw_common *cpsw,
 				  struct ethtool_channels *ch)
 {
+	if (cpsw->quirk_irq) {
+		dev_err(cpsw->dev, "Maximum one tx/rx queue is allowed");
+		return -EOPNOTSUPP;
+	}
+
 	if (ch->combined_count)
 		return -EINVAL;
 
@@ -2917,44 +2951,20 @@ static int cpsw_probe_dual_emac(struct cpsw_priv *priv)
 	return ret;
 }
 
-#define CPSW_QUIRK_IRQ		BIT(0)
-
-static const struct platform_device_id cpsw_devtype[] = {
-	{
-		/* keep it for existing comaptibles */
-		.name = "cpsw",
-		.driver_data = CPSW_QUIRK_IRQ,
-	}, {
-		.name = "am335x-cpsw",
-		.driver_data = CPSW_QUIRK_IRQ,
-	}, {
-		.name = "am4372-cpsw",
-		.driver_data = 0,
-	}, {
-		.name = "dra7-cpsw",
-		.driver_data = 0,
-	}, {
-		/* sentinel */
-	}
-};
-MODULE_DEVICE_TABLE(platform, cpsw_devtype);
-
-enum ti_cpsw_type {
-	CPSW = 0,
-	AM335X_CPSW,
-	AM4372_CPSW,
-	DRA7_CPSW,
-};
-
 static const struct of_device_id cpsw_of_mtable[] = {
-	{ .compatible = "ti,cpsw", .data = &cpsw_devtype[CPSW], },
-	{ .compatible = "ti,am335x-cpsw", .data = &cpsw_devtype[AM335X_CPSW], },
-	{ .compatible = "ti,am4372-cpsw", .data = &cpsw_devtype[AM4372_CPSW], },
-	{ .compatible = "ti,dra7-cpsw", .data = &cpsw_devtype[DRA7_CPSW], },
+	{ .compatible = "ti,cpsw"},
+	{ .compatible = "ti,am335x-cpsw"},
+	{ .compatible = "ti,am4372-cpsw"},
+	{ .compatible = "ti,dra7-cpsw"},
 	{ /* sentinel */ },
 };
 MODULE_DEVICE_TABLE(of, cpsw_of_mtable);
 
+static const struct soc_device_attribute cpsw_soc_devices[] = {
+	{ .family = "AM33xx", .revision = "ES1.0"},
+	{ /* sentinel */ }
+};
+
 static int cpsw_probe(struct platform_device *pdev)
 {
 	struct clk			*clk;
@@ -2966,9 +2976,9 @@ static int cpsw_probe(struct platform_device *pdev)
 	void __iomem			*ss_regs;
 	void __iomem			*cpts_regs;
 	struct resource			*res, *ss_res;
-	const struct of_device_id	*of_id;
 	struct gpio_descs		*mode;
 	u32 slave_offset, sliver_offset, slave_size;
+	const struct soc_device_attribute *soc;
 	struct cpsw_common		*cpsw;
 	int ret = 0, i;
 	int irq;
@@ -3141,6 +3151,10 @@ static int cpsw_probe(struct platform_device *pdev)
 		goto clean_dt_ret;
 	}
 
+	soc = soc_device_match(cpsw_soc_devices);
+	if (soc)
+		cpsw->quirk_irq = 1;
+
 	cpsw->txv[0].ch = cpdma_chan_create(cpsw->dma, 0, cpsw_tx_handler, 0);
 	if (IS_ERR(cpsw->txv[0].ch)) {
 		dev_err(priv->dev, "error initializing tx dma channel\n");
@@ -3180,19 +3194,16 @@ static int cpsw_probe(struct platform_device *pdev)
 		goto clean_dma_ret;
 	}
 
-	of_id = of_match_device(cpsw_of_mtable, &pdev->dev);
-	if (of_id) {
-		pdev->id_entry = of_id->data;
-		if (pdev->id_entry->driver_data)
-			cpsw->quirk_irq = true;
-	}
-
 	ndev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_CTAG_RX;
 
 	ndev->netdev_ops = &cpsw_netdev_ops;
 	ndev->ethtool_ops = &cpsw_ethtool_ops;
-	netif_napi_add(ndev, &cpsw->napi_rx, cpsw_rx_poll, CPSW_POLL_WEIGHT);
-	netif_tx_napi_add(ndev, &cpsw->napi_tx, cpsw_tx_poll, CPSW_POLL_WEIGHT);
+	netif_napi_add(ndev, &cpsw->napi_rx,
+		       cpsw->quirk_irq ? cpsw_rx_poll : cpsw_rx_mq_poll,
+		       CPSW_POLL_WEIGHT);
+	netif_tx_napi_add(ndev, &cpsw->napi_tx,
+			  cpsw->quirk_irq ? cpsw_tx_poll : cpsw_tx_mq_poll,
+			  CPSW_POLL_WEIGHT);
 	cpsw_split_res(ndev);
 
 	/* register the network device */
-- 
cgit v1.2.3-59-g8ed1b


From 20b654dfe1beaca60ab51894ff405a049248433d Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 16 May 2018 16:40:10 -0700
Subject: tcp: support DUPACK threshold in RACK

This patch adds support for the classic DUPACK threshold rule
(#DupThresh) in RACK.

When the number of packets SACKed is greater or equal to the
threshold, RACK sets the reordering window to zero which would
immediately mark all the unsacked packets below the highest SACKed
sequence lost. Since this approach is known to not work well with
reordering, RACK only uses it if no reordering has been observed.

The DUPACK threshold rule is a particularly useful extension to the
fast recoveries triggered by RACK reordering timer. For example
data-center transfers where the RTT is much smaller than a timer
tick, or high RTT path where the default RTT/4 may take too long.

Note that this patch differs slightly from RFC6675. RFC6675
considers a packet lost when at least #DupThresh higher-sequence
packets are SACKed.

With RACK, for connections that have seen reordering, RACK
continues to use a dynamically-adaptive time-based reordering
window to detect losses. But for connections on which we have not
yet seen reordering, this patch considers a packet lost when at
least one higher sequence packet is SACKed and the total number
of SACKed packets is at least DupThresh. For example, suppose a
connection has not seen reordering, and sends 10 packets, and
packets 3, 5, 7 are SACKed. RFC6675 considers packets 1 and 2
lost. RACK considers packets 1, 2, 4, 6 lost.

There is some small risk of spurious retransmits here due to
reordering. However, this is mostly limited to the first flight of
a connection on which the sender receives SACKs from reordering.
And RFC 6675 and FACK loss detection have a similar risk on the
first flight with reordering (it's just that the risk of spurious
retransmits from reordering was slightly narrower for those older
algorithms due to the margin of 3*MSS).

Also the minimum reordering window is reduced from 1 msec to 0
to recover quicker on short RTT transfers. Therefore RACK is more
aggressive in marking packets lost during recovery to reduce the
reordering window timeouts.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: Priyaranjan Jha <priyarjha@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  1 +
 include/net/tcp.h                      |  1 +
 net/ipv4/tcp_recovery.c                | 40 +++++++++++++++++++++++-----------
 3 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 59afc9a10b4f..13bbac50dc8b 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -451,6 +451,7 @@ tcp_recovery - INTEGER
 	RACK: 0x1 enables the RACK loss detection for fast detection of lost
 	      retransmissions and tail drops.
 	RACK: 0x2 makes RACK's reordering window static (min_rtt/4).
+	RACK: 0x4 disables RACK's DUPACK threshold heuristic
 
 	Default: 0x1
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index a08eab58ef70..994f869d793c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -245,6 +245,7 @@ extern long sysctl_tcp_mem[3];
 
 #define TCP_RACK_LOSS_DETECTION  0x1 /* Use RACK to detect losses */
 #define TCP_RACK_STATIC_REO_WND  0x2 /* Use static RACK reo wnd */
+#define TCP_RACK_NO_DUPTHRESH    0x4 /* Do not use DUPACK threshold in RACK */
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 3a81720ac0c4..1c1bdf12a96f 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -21,6 +21,32 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
 	return t1 > t2 || (t1 == t2 && after(seq1, seq2));
 }
 
+u32 tcp_rack_reo_wnd(const struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tp->rack.reord) {
+		/* If reordering has not been observed, be aggressive during
+		 * the recovery or starting the recovery by DUPACK threshold.
+		 */
+		if (inet_csk(sk)->icsk_ca_state >= TCP_CA_Recovery)
+			return 0;
+
+		if (tp->sacked_out >= tp->reordering &&
+		    !(sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_NO_DUPTHRESH))
+			return 0;
+	}
+
+	/* To be more reordering resilient, allow min_rtt/4 settling delay.
+	 * Use min_rtt instead of the smoothed RTT because reordering is
+	 * often a path property and less related to queuing or delayed ACKs.
+	 * Upon receiving DSACKs, linearly increase the window up to the
+	 * smoothed RTT.
+	 */
+	return min((tcp_min_rtt(tp) >> 2) * tp->rack.reo_wnd_steps,
+		   tp->srtt_us >> 3);
+}
+
 /* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
  *
  * Marks a packet lost, if some packet sent later has been (s)acked.
@@ -44,23 +70,11 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
 static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	u32 min_rtt = tcp_min_rtt(tp);
 	struct sk_buff *skb, *n;
 	u32 reo_wnd;
 
 	*reo_timeout = 0;
-	/* To be more reordering resilient, allow min_rtt/4 settling delay
-	 * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
-	 * RTT because reordering is often a path property and less related
-	 * to queuing or delayed ACKs.
-	 */
-	reo_wnd = 1000;
-	if ((tp->rack.reord || inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery) &&
-	    min_rtt != ~0U) {
-		reo_wnd = max((min_rtt >> 2) * tp->rack.reo_wnd_steps, reo_wnd);
-		reo_wnd = min(reo_wnd, tp->srtt_us >> 3);
-	}
-
+	reo_wnd = tcp_rack_reo_wnd(sk);
 	list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue,
 				 tcp_tsorted_anchor) {
 		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
-- 
cgit v1.2.3-59-g8ed1b


From b38a51fec1c1f693f03b1aa19d0622123634d4b7 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 16 May 2018 16:40:11 -0700
Subject: tcp: disable RFC6675 loss detection

This patch disables RFC6675 loss detection and make sysctl
net.ipv4.tcp_recovery = 1 controls a binary choice between RACK
(1) or RFC6675 (0).

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: Priyaranjan Jha <priyarjha@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  3 ++-
 net/ipv4/tcp_input.c                   | 12 ++++++++----
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 13bbac50dc8b..ea304a23c8d7 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -449,7 +449,8 @@ tcp_recovery - INTEGER
 	features.
 
 	RACK: 0x1 enables the RACK loss detection for fast detection of lost
-	      retransmissions and tail drops.
+	      retransmissions and tail drops. It also subsumes and disables
+	      RFC6675 recovery for SACK connections.
 	RACK: 0x2 makes RACK's reordering window static (min_rtt/4).
 	RACK: 0x4 disables RACK's DUPACK threshold heuristic
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b188e0d75edd..ccbe04f80040 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2035,6 +2035,11 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
 	return tp->sacked_out + 1;
 }
 
+static bool tcp_is_rack(const struct sock *sk)
+{
+	return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION;
+}
+
 /* Linux NewReno/SACK/ECN state machine.
  * --------------------------------------
  *
@@ -2141,7 +2146,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
 		return true;
 
 	/* Not-A-Trick#2 : Classic rule... */
-	if (tcp_dupack_heuristics(tp) > tp->reordering)
+	if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering)
 		return true;
 
 	return false;
@@ -2722,8 +2727,7 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	/* Use RACK to detect loss */
-	if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
+	if (tcp_is_rack(sk)) {
 		u32 prior_retrans = tp->retrans_out;
 
 		tcp_rack_mark_lost(sk);
@@ -2862,7 +2866,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 		fast_rexmit = 1;
 	}
 
-	if (do_lost)
+	if (!tcp_is_rack(sk) && do_lost)
 		tcp_update_scoreboard(sk, fast_rexmit);
 	*rexmit = REXMIT_LOST;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 6ac06ecd3a5d1dd1aaea5c2a8f6d6e4c81d5de6a Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 16 May 2018 16:40:12 -0700
Subject: tcp: simpler NewReno implementation

This is a rewrite of NewReno loss recovery implementation that is
simpler and standalone for readability and better performance by
using less states.

Note that NewReno refers to RFC6582 as a modification to the fast
recovery algorithm. It is used only if the connection does not
support SACK in Linux. It should not to be confused with the Reno
(AIMD) congestion control.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: Priyaranjan Jha <priyarjha@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h       |  1 +
 net/ipv4/tcp_input.c    | 19 +++++++++++--------
 net/ipv4/tcp_recovery.c | 27 +++++++++++++++++++++++++++
 3 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 994f869d793c..9a3ce379b994 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1877,6 +1877,7 @@ void tcp_v4_init(void);
 void tcp_init(void);
 
 /* tcp_recovery.c */
+void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced);
 extern void tcp_rack_mark_lost(struct sock *sk);
 extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
 			     u64 xmit_time);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ccbe04f80040..076206873e3e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2223,9 +2223,7 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (tcp_is_reno(tp)) {
-		tcp_mark_head_lost(sk, 1, 1);
-	} else {
+	if (tcp_is_sack(tp)) {
 		int sacked_upto = tp->sacked_out - tp->reordering;
 		if (sacked_upto >= 0)
 			tcp_mark_head_lost(sk, sacked_upto, 0);
@@ -2723,11 +2721,16 @@ static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
 	return false;
 }
 
-static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag)
+static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (tcp_is_rack(sk)) {
+	if (tcp_rtx_queue_empty(sk))
+		return;
+
+	if (unlikely(tcp_is_reno(tp))) {
+		tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED);
+	} else if (tcp_is_rack(sk)) {
 		u32 prior_retrans = tp->retrans_out;
 
 		tcp_rack_mark_lost(sk);
@@ -2823,11 +2826,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 			tcp_try_keep_open(sk);
 			return;
 		}
-		tcp_rack_identify_loss(sk, ack_flag);
+		tcp_identify_packet_loss(sk, ack_flag);
 		break;
 	case TCP_CA_Loss:
 		tcp_process_loss(sk, flag, is_dupack, rexmit);
-		tcp_rack_identify_loss(sk, ack_flag);
+		tcp_identify_packet_loss(sk, ack_flag);
 		if (!(icsk->icsk_ca_state == TCP_CA_Open ||
 		      (*ack_flag & FLAG_LOST_RETRANS)))
 			return;
@@ -2844,7 +2847,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 		if (icsk->icsk_ca_state <= TCP_CA_Disorder)
 			tcp_try_undo_dsack(sk);
 
-		tcp_rack_identify_loss(sk, ack_flag);
+		tcp_identify_packet_loss(sk, ack_flag);
 		if (!tcp_time_to_recover(sk, flag)) {
 			tcp_try_to_open(sk, flag);
 			return;
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 1c1bdf12a96f..299b0e38aa9a 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -216,3 +216,30 @@ void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs)
 		tp->rack.reo_wnd_steps = 1;
 	}
 }
+
+/* RFC6582 NewReno recovery for non-SACK connection. It simply retransmits
+ * the next unacked packet upon receiving
+ * a) three or more DUPACKs to start the fast recovery
+ * b) an ACK acknowledging new data during the fast recovery.
+ */
+void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced)
+{
+	const u8 state = inet_csk(sk)->icsk_ca_state;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if ((state < TCP_CA_Recovery && tp->sacked_out >= tp->reordering) ||
+	    (state == TCP_CA_Recovery && snd_una_advanced)) {
+		struct sk_buff *skb = tcp_rtx_queue_head(sk);
+		u32 mss;
+
+		if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
+			return;
+
+		mss = tcp_skb_mss(skb);
+		if (tcp_skb_pcount(skb) > 1 && skb->len > mss)
+			tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+				     mss, mss, GFP_ATOMIC);
+
+		tcp_skb_mark_lost_uncond_verify(tp, skb);
+	}
+}
-- 
cgit v1.2.3-59-g8ed1b


From d716bfdb10b4250617783c94253e48b0e85adcb1 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 16 May 2018 16:40:13 -0700
Subject: tcp: account lost retransmit after timeout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous approach for the lost and retransmit bits was to
wipe the slate clean: zero all the lost and retransmit bits,
correspondingly zero the lost_out and retrans_out counters, and
then add back the lost bits (and correspondingly increment lost_out).

The new approach is to treat this very much like marking packets
lost in fast recovery. We don’t wipe the slate clean. We just say
that for all packets that were not yet marked sacked or lost, we now
mark them as lost in exactly the same way we do for fast recovery.

This fixes the lost retransmit accounting at RTO time and greatly
simplifies the RTO code by sharing much of the logic with Fast
Recovery.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: Priyaranjan Jha <priyarjha@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h       |  1 +
 net/ipv4/tcp_input.c    | 18 +++---------------
 net/ipv4/tcp_recovery.c |  4 ++--
 3 files changed, 6 insertions(+), 17 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 9a3ce379b994..2b5372ef2e0e 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1877,6 +1877,7 @@ void tcp_v4_init(void);
 void tcp_init(void);
 
 /* tcp_recovery.c */
+void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb);
 void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced);
 extern void tcp_rack_mark_lost(struct sock *sk);
 extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 076206873e3e..6fb0a28977a0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1929,7 +1929,6 @@ void tcp_enter_loss(struct sock *sk)
 	struct sk_buff *skb;
 	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
 	bool is_reneg;			/* is receiver reneging on SACKs? */
-	bool mark_lost;
 
 	/* Reduce ssthresh if it has not yet been made inside this window. */
 	if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
@@ -1945,9 +1944,6 @@ void tcp_enter_loss(struct sock *sk)
 	tp->snd_cwnd_cnt   = 0;
 	tp->snd_cwnd_stamp = tcp_jiffies32;
 
-	tp->retrans_out = 0;
-	tp->lost_out = 0;
-
 	if (tcp_is_reno(tp))
 		tcp_reset_reno_sack(tp);
 
@@ -1959,21 +1955,13 @@ void tcp_enter_loss(struct sock *sk)
 		/* Mark SACK reneging until we recover from this loss event. */
 		tp->is_sack_reneg = 1;
 	}
-	tcp_clear_all_retrans_hints(tp);
-
 	skb_rbtree_walk_from(skb) {
-		mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
-			     is_reneg);
-		if (mark_lost)
-			tcp_sum_lost(tp, skb);
-		TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
-		if (mark_lost) {
+		if (is_reneg)
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
-			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
-			tp->lost_out += tcp_skb_pcount(skb);
-		}
+		tcp_mark_skb_lost(sk, skb);
 	}
 	tcp_verify_left_out(tp);
+	tcp_clear_all_retrans_hints(tp);
 
 	/* Timeout in disordered state after receiving substantial DUPACKs
 	 * suggests that the degree of reordering is over-estimated.
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 299b0e38aa9a..b2f9be388bf3 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -2,7 +2,7 @@
 #include <linux/tcp.h>
 #include <net/tcp.h>
 
-static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
+void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -95,7 +95,7 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
 		remaining = tp->rack.rtt_us + reo_wnd -
 			    tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
 		if (remaining <= 0) {
-			tcp_rack_mark_skb_lost(sk, skb);
+			tcp_mark_skb_lost(sk, skb);
 			list_del_init(&skb->tcp_tsorted_anchor);
 		} else {
 			/* Record maximum wait time */
-- 
cgit v1.2.3-59-g8ed1b


From 2ad55f5660158d8a12c06486d32a7c47fa5f116b Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 16 May 2018 16:40:14 -0700
Subject: tcp: new helper tcp_timeout_mark_lost

Refactor using a new helper, tcp_timeout_mark_loss(), that marks packets
lost upon RTO.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: Priyaranjan Jha <priyarjha@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 50 +++++++++++++++++++++++++++++---------------------
 1 file changed, 29 insertions(+), 21 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 6fb0a28977a0..af32accda2a9 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1917,18 +1917,43 @@ static inline void tcp_init_undo(struct tcp_sock *tp)
 	tp->undo_retrans = tp->retrans_out ? : -1;
 }
 
-/* Enter Loss state. If we detect SACK reneging, forget all SACK information
+/* If we detect SACK reneging, forget all SACK information
  * and reset tags completely, otherwise preserve SACKs. If receiver
  * dropped its ofo queue, we will know this due to reneging detection.
  */
+static void tcp_timeout_mark_lost(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	bool is_reneg;			/* is receiver reneging on SACKs? */
+
+	skb = tcp_rtx_queue_head(sk);
+	is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
+	if (is_reneg) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
+		tp->sacked_out = 0;
+		/* Mark SACK reneging until we recover from this loss event. */
+		tp->is_sack_reneg = 1;
+	} else if (tcp_is_reno(tp)) {
+		tcp_reset_reno_sack(tp);
+	}
+
+	skb_rbtree_walk_from(skb) {
+		if (is_reneg)
+			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
+		tcp_mark_skb_lost(sk, skb);
+	}
+	tcp_verify_left_out(tp);
+	tcp_clear_all_retrans_hints(tp);
+}
+
+/* Enter Loss state. */
 void tcp_enter_loss(struct sock *sk)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct net *net = sock_net(sk);
-	struct sk_buff *skb;
 	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
-	bool is_reneg;			/* is receiver reneging on SACKs? */
 
 	/* Reduce ssthresh if it has not yet been made inside this window. */
 	if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
@@ -1944,24 +1969,7 @@ void tcp_enter_loss(struct sock *sk)
 	tp->snd_cwnd_cnt   = 0;
 	tp->snd_cwnd_stamp = tcp_jiffies32;
 
-	if (tcp_is_reno(tp))
-		tcp_reset_reno_sack(tp);
-
-	skb = tcp_rtx_queue_head(sk);
-	is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
-	if (is_reneg) {
-		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
-		tp->sacked_out = 0;
-		/* Mark SACK reneging until we recover from this loss event. */
-		tp->is_sack_reneg = 1;
-	}
-	skb_rbtree_walk_from(skb) {
-		if (is_reneg)
-			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
-		tcp_mark_skb_lost(sk, skb);
-	}
-	tcp_verify_left_out(tp);
-	tcp_clear_all_retrans_hints(tp);
+	tcp_timeout_mark_lost(sk);
 
 	/* Timeout in disordered state after receiving substantial DUPACKs
 	 * suggests that the degree of reordering is over-estimated.
-- 
cgit v1.2.3-59-g8ed1b


From c77d62ffae377de331a19020c0b598a6faceb0ef Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 16 May 2018 16:40:15 -0700
Subject: tcp: separate loss marking and state update on RTO

Previously when TCP times out, it first updates cwnd and ssthresh,
marks packets lost, and then updates congestion state again. This
was fine because everything not yet delivered is marked lost,
so the inflight is always 0 and cwnd can be safely set to 1 to
retransmit one packet on timeout.

But the inflight may not always be 0 on timeout if TCP changes to
mark packets lost based on packet sent time. Therefore we must
first mark the packet lost, then set the cwnd based on the
(updated) inflight.

This is not a pure refactor. Congestion control may potentially
break if it uses (not yet updated) inflight to compute ssthresh.
Fortunately all existing congestion control modules does not do that.
Also it changes the inflight when CA_LOSS_EVENT is called, and only
westwood processes such an event but does not use inflight.

This change has two other minor side benefits:
1) consistent with Fast Recovery s.t. the inflight is updated
   first before tcp_enter_recovery flips state to CA_Recovery.

2) avoid intertwining loss marking with state update, making the
   code more readable.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: Priyaranjan Jha <priyarjha@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index af32accda2a9..1ccc97b368c7 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1955,6 +1955,8 @@ void tcp_enter_loss(struct sock *sk)
 	struct net *net = sock_net(sk);
 	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
 
+	tcp_timeout_mark_lost(sk);
+
 	/* Reduce ssthresh if it has not yet been made inside this window. */
 	if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
 	    !after(tp->high_seq, tp->snd_una) ||
@@ -1969,8 +1971,6 @@ void tcp_enter_loss(struct sock *sk)
 	tp->snd_cwnd_cnt   = 0;
 	tp->snd_cwnd_stamp = tcp_jiffies32;
 
-	tcp_timeout_mark_lost(sk);
-
 	/* Timeout in disordered state after receiving substantial DUPACKs
 	 * suggests that the degree of reordering is over-estimated.
 	 */
-- 
cgit v1.2.3-59-g8ed1b


From b8fef65a8a76c2f887c56bf8e9684ff04e8d3b9f Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 16 May 2018 16:40:16 -0700
Subject: tcp: new helper tcp_rack_skb_timeout

Create and export a new helper tcp_rack_skb_timeout and move tcp_is_rack
to prepare the final RTO change.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: Priyaranjan Jha <priyarjha@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h       |  2 ++
 net/ipv4/tcp_input.c    | 10 +++++-----
 net/ipv4/tcp_recovery.c |  9 +++++++--
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2b5372ef2e0e..6deb540297cc 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1879,6 +1879,8 @@ void tcp_init(void);
 /* tcp_recovery.c */
 void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb);
 void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced);
+extern s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb,
+				u32 reo_wnd);
 extern void tcp_rack_mark_lost(struct sock *sk);
 extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
 			     u64 xmit_time);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1ccc97b368c7..ba8a8e3464aa 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1917,6 +1917,11 @@ static inline void tcp_init_undo(struct tcp_sock *tp)
 	tp->undo_retrans = tp->retrans_out ? : -1;
 }
 
+static bool tcp_is_rack(const struct sock *sk)
+{
+	return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION;
+}
+
 /* If we detect SACK reneging, forget all SACK information
  * and reset tags completely, otherwise preserve SACKs. If receiver
  * dropped its ofo queue, we will know this due to reneging detection.
@@ -2031,11 +2036,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
 	return tp->sacked_out + 1;
 }
 
-static bool tcp_is_rack(const struct sock *sk)
-{
-	return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION;
-}
-
 /* Linux NewReno/SACK/ECN state machine.
  * --------------------------------------
  *
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index b2f9be388bf3..30cbfb69b1de 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -47,6 +47,12 @@ u32 tcp_rack_reo_wnd(const struct sock *sk)
 		   tp->srtt_us >> 3);
 }
 
+s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd)
+{
+	return tp->rack.rtt_us + reo_wnd -
+	       tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
+}
+
 /* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
  *
  * Marks a packet lost, if some packet sent later has been (s)acked.
@@ -92,8 +98,7 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
 		/* A packet is lost if it has not been s/acked beyond
 		 * the recent RTT plus the reordering window.
 		 */
-		remaining = tp->rack.rtt_us + reo_wnd -
-			    tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
+		remaining = tcp_rack_skb_timeout(tp, skb, reo_wnd);
 		if (remaining <= 0) {
 			tcp_mark_skb_lost(sk, skb);
 			list_del_init(&skb->tcp_tsorted_anchor);
-- 
cgit v1.2.3-59-g8ed1b


From 56f8c5d78f5746f48abf6d1f0bc2945dc59277ee Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 16 May 2018 16:40:17 -0700
Subject: tcp: don't mark recently sent packets lost on RTO

An RTO event indicates the head has not been acked for a long time
after its last (re)transmission. But the other packets are not
necessarily lost if they have been only sent recently (for example
due to application limit). This patch would prohibit marking packets
sent within an RTT to be lost on RTO event, using similar logic in
TCP RACK detection.

Normally the head (SND.UNA) would be marked lost since RTO should
fire strictly after the head was sent. An exception is when the
most recent RACK RTT measurement is larger than the (previous)
RTO. To address this exception the head is always marked lost.

Congestion control interaction: since we may not mark every packet
lost, the congestion window may be more than 1 (inflight plus 1).
But only one packet will be retransmitted after RTO, since
tcp_retransmit_timer() calls tcp_retransmit_skb(...,segs=1). The
connection still performs slow start from one packet (with Cubic
congestion control).

This commit was tested in an A/B test with Google web servers,
and showed a reduction of 2% in (spurious) retransmits post
timeout (SlowStartRetrans), and correspondingly reduced DSACKs
(DSACKIgnoredOld) by 7%.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: Priyaranjan Jha <priyarjha@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ba8a8e3464aa..0bf032839548 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1929,11 +1929,11 @@ static bool tcp_is_rack(const struct sock *sk)
 static void tcp_timeout_mark_lost(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *skb;
+	struct sk_buff *skb, *head;
 	bool is_reneg;			/* is receiver reneging on SACKs? */
 
-	skb = tcp_rtx_queue_head(sk);
-	is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
+	head = tcp_rtx_queue_head(sk);
+	is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED);
 	if (is_reneg) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
 		tp->sacked_out = 0;
@@ -1943,9 +1943,13 @@ static void tcp_timeout_mark_lost(struct sock *sk)
 		tcp_reset_reno_sack(tp);
 	}
 
+	skb = head;
 	skb_rbtree_walk_from(skb) {
 		if (is_reneg)
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
+		else if (tcp_is_rack(sk) && skb != head &&
+			 tcp_rack_skb_timeout(tp, skb, 0) > 0)
+			continue; /* Don't mark recently sent ones lost yet */
 		tcp_mark_skb_lost(sk, skb);
 	}
 	tcp_verify_left_out(tp);
@@ -1972,7 +1976,7 @@ void tcp_enter_loss(struct sock *sk)
 		tcp_ca_event(sk, CA_EVENT_LOSS);
 		tcp_init_undo(tp);
 	}
-	tp->snd_cwnd	   = 1;
+	tp->snd_cwnd	   = tcp_packets_in_flight(tp) + 1;
 	tp->snd_cwnd_cnt   = 0;
 	tp->snd_cwnd_stamp = tcp_jiffies32;
 
-- 
cgit v1.2.3-59-g8ed1b


From 7e878b605f05bf0c09aa0d81e05fb7878bf8c7d4 Mon Sep 17 00:00:00 2001
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Date: Wed, 16 May 2018 19:09:23 -0700
Subject: bonding: introduce link change helper

Introduce an new common helper to avoid redundancy.

Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index a4cd7f6bfd4d..06efdf6a762b 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2137,6 +2137,24 @@ static int bond_miimon_inspect(struct bonding *bond)
 	return commit;
 }
 
+static void bond_miimon_link_change(struct bonding *bond,
+				    struct slave *slave,
+				    char link)
+{
+	switch (BOND_MODE(bond)) {
+	case BOND_MODE_8023AD:
+		bond_3ad_handle_link_change(slave, link);
+		break;
+	case BOND_MODE_TLB:
+	case BOND_MODE_ALB:
+		bond_alb_handle_link_change(bond, slave, link);
+		break;
+	case BOND_MODE_XOR:
+		bond_update_slave_arr(bond, NULL);
+		break;
+	}
+}
+
 static void bond_miimon_commit(struct bonding *bond)
 {
 	struct list_head *iter;
@@ -2178,16 +2196,7 @@ static void bond_miimon_commit(struct bonding *bond)
 				    slave->speed == SPEED_UNKNOWN ? 0 : slave->speed,
 				    slave->duplex ? "full" : "half");
 
-			/* notify ad that the link status has changed */
-			if (BOND_MODE(bond) == BOND_MODE_8023AD)
-				bond_3ad_handle_link_change(slave, BOND_LINK_UP);
-
-			if (bond_is_lb(bond))
-				bond_alb_handle_link_change(bond, slave,
-							    BOND_LINK_UP);
-
-			if (BOND_MODE(bond) == BOND_MODE_XOR)
-				bond_update_slave_arr(bond, NULL);
+			bond_miimon_link_change(bond, slave, BOND_LINK_UP);
 
 			if (!bond->curr_active_slave || slave == primary)
 				goto do_failover;
@@ -2209,16 +2218,7 @@ static void bond_miimon_commit(struct bonding *bond)
 			netdev_info(bond->dev, "link status definitely down for interface %s, disabling it\n",
 				    slave->dev->name);
 
-			if (BOND_MODE(bond) == BOND_MODE_8023AD)
-				bond_3ad_handle_link_change(slave,
-							    BOND_LINK_DOWN);
-
-			if (bond_is_lb(bond))
-				bond_alb_handle_link_change(bond, slave,
-							    BOND_LINK_DOWN);
-
-			if (BOND_MODE(bond) == BOND_MODE_XOR)
-				bond_update_slave_arr(bond, NULL);
+			bond_miimon_link_change(bond, slave, BOND_LINK_DOWN);
 
 			if (slave == rcu_access_pointer(bond->curr_active_slave))
 				goto do_failover;
-- 
cgit v1.2.3-59-g8ed1b


From 93c65d13d8a0b7c272868d4a9779f96fc973df26 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Thu, 17 May 2018 11:46:41 +0800
Subject: vmxnet3: Replace msleep(1) with usleep_range()

As documented in Documentation/timers/timers-howto.txt,
replace msleep(1) with usleep_range().

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vmxnet3/vmxnet3_drv.c     | 6 +++---
 drivers/net/vmxnet3/vmxnet3_ethtool.c | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index 9ebe2a689966..2234a334e96e 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -2945,7 +2945,7 @@ vmxnet3_close(struct net_device *netdev)
 	 * completion.
 	 */
 	while (test_and_set_bit(VMXNET3_STATE_BIT_RESETTING, &adapter->state))
-		msleep(1);
+		usleep_range(1000, 2000);
 
 	vmxnet3_quiesce_dev(adapter);
 
@@ -2995,7 +2995,7 @@ vmxnet3_change_mtu(struct net_device *netdev, int new_mtu)
 	 * completion.
 	 */
 	while (test_and_set_bit(VMXNET3_STATE_BIT_RESETTING, &adapter->state))
-		msleep(1);
+		usleep_range(1000, 2000);
 
 	if (netif_running(netdev)) {
 		vmxnet3_quiesce_dev(adapter);
@@ -3567,7 +3567,7 @@ static void vmxnet3_shutdown_device(struct pci_dev *pdev)
 	 * completion.
 	 */
 	while (test_and_set_bit(VMXNET3_STATE_BIT_RESETTING, &adapter->state))
-		msleep(1);
+		usleep_range(1000, 2000);
 
 	if (test_and_set_bit(VMXNET3_STATE_BIT_QUIESCED,
 			     &adapter->state)) {
diff --git a/drivers/net/vmxnet3/vmxnet3_ethtool.c b/drivers/net/vmxnet3/vmxnet3_ethtool.c
index 2ff27314e047..559db051a500 100644
--- a/drivers/net/vmxnet3/vmxnet3_ethtool.c
+++ b/drivers/net/vmxnet3/vmxnet3_ethtool.c
@@ -600,7 +600,7 @@ vmxnet3_set_ringparam(struct net_device *netdev,
 	 * completion.
 	 */
 	while (test_and_set_bit(VMXNET3_STATE_BIT_RESETTING, &adapter->state))
-		msleep(1);
+		usleep_range(1000, 2000);
 
 	if (netif_running(netdev)) {
 		vmxnet3_quiesce_dev(adapter);
-- 
cgit v1.2.3-59-g8ed1b


From dcd3e73ae7bc82e303969e415d8c3d3147c1af6a Mon Sep 17 00:00:00 2001
From: Antoine Tenart <antoine.tenart@bootlin.com>
Date: Thu, 17 May 2018 10:29:30 +0200
Subject: net: mvpp2: align the ethtool ops definition

Cosmetic patch to align the ethtool functions to ops definitions. This
patch does not change in any way the driver's behaviour.

Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mvpp2.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
index 6f410235987c..77dd91e3d962 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -7859,18 +7859,18 @@ static const struct net_device_ops mvpp2_netdev_ops = {
 };
 
 static const struct ethtool_ops mvpp2_eth_tool_ops = {
-	.nway_reset	= phy_ethtool_nway_reset,
-	.get_link	= ethtool_op_get_link,
-	.set_coalesce	= mvpp2_ethtool_set_coalesce,
-	.get_coalesce	= mvpp2_ethtool_get_coalesce,
-	.get_drvinfo	= mvpp2_ethtool_get_drvinfo,
-	.get_ringparam	= mvpp2_ethtool_get_ringparam,
-	.set_ringparam	= mvpp2_ethtool_set_ringparam,
-	.get_strings	= mvpp2_ethtool_get_strings,
-	.get_ethtool_stats = mvpp2_ethtool_get_stats,
-	.get_sset_count	= mvpp2_ethtool_get_sset_count,
-	.get_link_ksettings = phy_ethtool_get_link_ksettings,
-	.set_link_ksettings = phy_ethtool_set_link_ksettings,
+	.nway_reset		= phy_ethtool_nway_reset,
+	.get_link		= ethtool_op_get_link,
+	.set_coalesce		= mvpp2_ethtool_set_coalesce,
+	.get_coalesce		= mvpp2_ethtool_get_coalesce,
+	.get_drvinfo		= mvpp2_ethtool_get_drvinfo,
+	.get_ringparam		= mvpp2_ethtool_get_ringparam,
+	.set_ringparam		= mvpp2_ethtool_set_ringparam,
+	.get_strings		= mvpp2_ethtool_get_strings,
+	.get_ethtool_stats	= mvpp2_ethtool_get_stats,
+	.get_sset_count		= mvpp2_ethtool_get_sset_count,
+	.get_link_ksettings	= phy_ethtool_get_link_ksettings,
+	.set_link_ksettings	= phy_ethtool_set_link_ksettings,
 };
 
 /* Used for PPv2.1, or PPv2.2 with the old Device Tree binding that
-- 
cgit v1.2.3-59-g8ed1b


From 4bb043262878c9fc9b51ad67fc229bb973d957c8 Mon Sep 17 00:00:00 2001
From: Antoine Tenart <antoine.tenart@bootlin.com>
Date: Thu, 17 May 2018 10:29:31 +0200
Subject: net: mvpp2: phylink support

Convert the PPv2 driver to implement phylink helpers, and use phylink in
DT mode. The other mode supported is ACPI, which will need further work
in order to be entirely compatible with phylink.

The MAC and GoP configuration functions were completely moved to fit
into the phylink helpers. When a PHY is always present between the MAC
and the physical port, phylink only is used, but when this is not the
case (the MAC directly is connected to the physical port) the link IRQ
is used to detect changes in the link state and call phylink_mac_change.

The ACPI mode do not uses phylink as of now, and the changes shouldn't
impact its use.

Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/Kconfig |   1 +
 drivers/net/ethernet/marvell/mvpp2.c | 846 +++++++++++++++++++++--------------
 2 files changed, 509 insertions(+), 338 deletions(-)

diff --git a/drivers/net/ethernet/marvell/Kconfig b/drivers/net/ethernet/marvell/Kconfig
index ebe5c9148935..cc2f7701e71e 100644
--- a/drivers/net/ethernet/marvell/Kconfig
+++ b/drivers/net/ethernet/marvell/Kconfig
@@ -86,6 +86,7 @@ config MVPP2
 	depends on ARCH_MVEBU || COMPILE_TEST
 	depends on HAS_DMA
 	select MVMDIO
+	select PHYLINK
 	---help---
 	  This driver supports the network interface units in the
 	  Marvell ARMADA 375, 7K and 8K SoCs.
diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
index 77dd91e3d962..60093f1e6297 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -29,6 +29,7 @@
 #include <linux/of_address.h>
 #include <linux/of_device.h>
 #include <linux/phy.h>
+#include <linux/phylink.h>
 #include <linux/phy/phy.h>
 #include <linux/clk.h>
 #include <linux/hrtimer.h>
@@ -359,15 +360,23 @@
 #define     MVPP2_GMAC_FORCE_LINK_PASS		BIT(1)
 #define     MVPP2_GMAC_IN_BAND_AUTONEG		BIT(2)
 #define     MVPP2_GMAC_IN_BAND_AUTONEG_BYPASS	BIT(3)
+#define     MVPP2_GMAC_IN_BAND_RESTART_AN	BIT(4)
 #define     MVPP2_GMAC_CONFIG_MII_SPEED	BIT(5)
 #define     MVPP2_GMAC_CONFIG_GMII_SPEED	BIT(6)
 #define     MVPP2_GMAC_AN_SPEED_EN		BIT(7)
 #define     MVPP2_GMAC_FC_ADV_EN		BIT(9)
+#define     MVPP2_GMAC_FC_ADV_ASM_EN		BIT(10)
 #define     MVPP2_GMAC_FLOW_CTRL_AUTONEG	BIT(11)
 #define     MVPP2_GMAC_CONFIG_FULL_DUPLEX	BIT(12)
 #define     MVPP2_GMAC_AN_DUPLEX_EN		BIT(13)
 #define MVPP2_GMAC_STATUS0			0x10
 #define     MVPP2_GMAC_STATUS0_LINK_UP		BIT(0)
+#define     MVPP2_GMAC_STATUS0_GMII_SPEED	BIT(1)
+#define     MVPP2_GMAC_STATUS0_MII_SPEED	BIT(2)
+#define     MVPP2_GMAC_STATUS0_FULL_DUPLEX	BIT(3)
+#define     MVPP2_GMAC_STATUS0_RX_PAUSE		BIT(6)
+#define     MVPP2_GMAC_STATUS0_TX_PAUSE		BIT(7)
+#define     MVPP2_GMAC_STATUS0_AN_COMPLETE	BIT(11)
 #define MVPP2_GMAC_PORT_FIFO_CFG_1_REG		0x1c
 #define     MVPP2_GMAC_TX_FIFO_MIN_TH_OFFS	6
 #define     MVPP2_GMAC_TX_FIFO_MIN_TH_ALL_MASK	0x1fc0
@@ -379,6 +388,8 @@
 #define     MVPP22_GMAC_INT_MASK_LINK_STAT	BIT(1)
 #define MVPP22_GMAC_CTRL_4_REG			0x90
 #define     MVPP22_CTRL4_EXT_PIN_GMII_SEL	BIT(0)
+#define     MVPP22_CTRL4_RX_FC_EN		BIT(3)
+#define     MVPP22_CTRL4_TX_FC_EN		BIT(4)
 #define     MVPP22_CTRL4_DP_CLK_SEL		BIT(5)
 #define     MVPP22_CTRL4_SYNC_BYPASS_DIS	BIT(6)
 #define     MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE	BIT(7)
@@ -392,6 +403,7 @@
 #define     MVPP22_XLG_CTRL0_PORT_EN		BIT(0)
 #define     MVPP22_XLG_CTRL0_MAC_RESET_DIS	BIT(1)
 #define     MVPP22_XLG_CTRL0_RX_FLOW_CTRL_EN	BIT(7)
+#define     MVPP22_XLG_CTRL0_TX_FLOW_CTRL_EN	BIT(8)
 #define     MVPP22_XLG_CTRL0_MIB_CNT_DIS	BIT(14)
 #define MVPP22_XLG_CTRL1_REG			0x104
 #define     MVPP22_XLG_CTRL1_FRAMESIZELIMIT_OFFS	0
@@ -413,6 +425,7 @@
 #define     MVPP22_XLG_CTRL4_FWD_FC		BIT(5)
 #define     MVPP22_XLG_CTRL4_FWD_PFC		BIT(6)
 #define     MVPP22_XLG_CTRL4_MACMODSELECT_GMAC	BIT(12)
+#define     MVPP22_XLG_CTRL4_EN_IDLE_CHECK	BIT(14)
 
 /* SMI registers. PPv2.2 only, relative to priv->iface_base. */
 #define MVPP22_SMI_MISC_CFG_REG			0x1204
@@ -1017,6 +1030,9 @@ struct mvpp2_port {
 	/* Firmware node associated to the port */
 	struct fwnode_handle *fwnode;
 
+	/* Is a PHY always connected to the port */
+	bool has_phy;
+
 	/* Per-port registers' base address */
 	void __iomem *base;
 	void __iomem *stats_base;
@@ -1044,12 +1060,11 @@ struct mvpp2_port {
 	struct mutex gather_stats_lock;
 	struct delayed_work stats_work;
 
+	struct device_node *of_node;
+
 	phy_interface_t phy_interface;
-	struct device_node *phy_node;
+	struct phylink *phylink;
 	struct phy *comphy;
-	unsigned int link;
-	unsigned int duplex;
-	unsigned int speed;
 
 	struct mvpp2_bm_pool *pool_long;
 	struct mvpp2_bm_pool *pool_short;
@@ -1338,6 +1353,12 @@ struct mvpp2_bm_pool {
 	 (addr) < (txq_pcpu)->tso_headers_dma + \
 	 (txq_pcpu)->size * TSO_HEADER_SIZE)
 
+/* The prototype is added here to be used in start_dev when using ACPI. This
+ * will be removed once phylink is used for all modes (dt+ACPI).
+ */
+static void mvpp2_mac_config(struct net_device *dev, unsigned int mode,
+			     const struct phylink_link_state *state);
+
 /* Queue modes */
 #define MVPP2_QDIST_SINGLE_MODE	0
 #define MVPP2_QDIST_MULTI_MODE	1
@@ -4969,133 +4990,6 @@ static int mvpp22_comphy_init(struct mvpp2_port *port)
 	return phy_power_on(port->comphy);
 }
 
-static void mvpp2_port_mii_gmac_configure_mode(struct mvpp2_port *port)
-{
-	u32 val;
-
-	if (port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
-		val = readl(port->base + MVPP22_GMAC_CTRL_4_REG);
-		val |= MVPP22_CTRL4_SYNC_BYPASS_DIS | MVPP22_CTRL4_DP_CLK_SEL |
-		       MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE;
-		val &= ~MVPP22_CTRL4_EXT_PIN_GMII_SEL;
-		writel(val, port->base + MVPP22_GMAC_CTRL_4_REG);
-	} else if (phy_interface_mode_is_rgmii(port->phy_interface)) {
-		val = readl(port->base + MVPP22_GMAC_CTRL_4_REG);
-		val |= MVPP22_CTRL4_EXT_PIN_GMII_SEL |
-		       MVPP22_CTRL4_SYNC_BYPASS_DIS |
-		       MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE;
-		val &= ~MVPP22_CTRL4_DP_CLK_SEL;
-		writel(val, port->base + MVPP22_GMAC_CTRL_4_REG);
-	}
-
-	/* The port is connected to a copper PHY */
-	val = readl(port->base + MVPP2_GMAC_CTRL_0_REG);
-	val &= ~MVPP2_GMAC_PORT_TYPE_MASK;
-	writel(val, port->base + MVPP2_GMAC_CTRL_0_REG);
-
-	val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
-	val |= MVPP2_GMAC_IN_BAND_AUTONEG_BYPASS |
-	       MVPP2_GMAC_AN_SPEED_EN | MVPP2_GMAC_FLOW_CTRL_AUTONEG |
-	       MVPP2_GMAC_AN_DUPLEX_EN;
-	if (port->phy_interface == PHY_INTERFACE_MODE_SGMII)
-		val |= MVPP2_GMAC_IN_BAND_AUTONEG;
-	writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
-}
-
-static void mvpp2_port_mii_gmac_configure(struct mvpp2_port *port)
-{
-	u32 val;
-
-	/* Force link down */
-	val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
-	val &= ~MVPP2_GMAC_FORCE_LINK_PASS;
-	val |= MVPP2_GMAC_FORCE_LINK_DOWN;
-	writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
-
-	/* Set the GMAC in a reset state */
-	val = readl(port->base + MVPP2_GMAC_CTRL_2_REG);
-	val |= MVPP2_GMAC_PORT_RESET_MASK;
-	writel(val, port->base + MVPP2_GMAC_CTRL_2_REG);
-
-	/* Configure the PCS and in-band AN */
-	val = readl(port->base + MVPP2_GMAC_CTRL_2_REG);
-	if (port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
-	        val |= MVPP2_GMAC_INBAND_AN_MASK | MVPP2_GMAC_PCS_ENABLE_MASK;
-	} else if (phy_interface_mode_is_rgmii(port->phy_interface)) {
-		val &= ~MVPP2_GMAC_PCS_ENABLE_MASK;
-	}
-	writel(val, port->base + MVPP2_GMAC_CTRL_2_REG);
-
-	mvpp2_port_mii_gmac_configure_mode(port);
-
-	/* Unset the GMAC reset state */
-	val = readl(port->base + MVPP2_GMAC_CTRL_2_REG);
-	val &= ~MVPP2_GMAC_PORT_RESET_MASK;
-	writel(val, port->base + MVPP2_GMAC_CTRL_2_REG);
-
-	/* Stop forcing link down */
-	val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
-	val &= ~MVPP2_GMAC_FORCE_LINK_DOWN;
-	writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
-}
-
-static void mvpp2_port_mii_xlg_configure(struct mvpp2_port *port)
-{
-	u32 val;
-
-	if (port->gop_id != 0)
-		return;
-
-	val = readl(port->base + MVPP22_XLG_CTRL0_REG);
-	val |= MVPP22_XLG_CTRL0_RX_FLOW_CTRL_EN;
-	writel(val, port->base + MVPP22_XLG_CTRL0_REG);
-
-	val = readl(port->base + MVPP22_XLG_CTRL4_REG);
-	val &= ~MVPP22_XLG_CTRL4_MACMODSELECT_GMAC;
-	val |= MVPP22_XLG_CTRL4_FWD_FC | MVPP22_XLG_CTRL4_FWD_PFC;
-	writel(val, port->base + MVPP22_XLG_CTRL4_REG);
-}
-
-static void mvpp22_port_mii_set(struct mvpp2_port *port)
-{
-	u32 val;
-
-	/* Only GOP port 0 has an XLG MAC */
-	if (port->gop_id == 0) {
-		val = readl(port->base + MVPP22_XLG_CTRL3_REG);
-		val &= ~MVPP22_XLG_CTRL3_MACMODESELECT_MASK;
-
-		if (port->phy_interface == PHY_INTERFACE_MODE_XAUI ||
-		    port->phy_interface == PHY_INTERFACE_MODE_10GKR)
-			val |= MVPP22_XLG_CTRL3_MACMODESELECT_10G;
-		else
-			val |= MVPP22_XLG_CTRL3_MACMODESELECT_GMAC;
-
-		writel(val, port->base + MVPP22_XLG_CTRL3_REG);
-	}
-}
-
-static void mvpp2_port_mii_set(struct mvpp2_port *port)
-{
-	if (port->priv->hw_version == MVPP22)
-		mvpp22_port_mii_set(port);
-
-	if (phy_interface_mode_is_rgmii(port->phy_interface) ||
-	    port->phy_interface == PHY_INTERFACE_MODE_SGMII)
-		mvpp2_port_mii_gmac_configure(port);
-	else if (port->phy_interface == PHY_INTERFACE_MODE_10GKR)
-		mvpp2_port_mii_xlg_configure(port);
-}
-
-static void mvpp2_port_fc_adv_enable(struct mvpp2_port *port)
-{
-	u32 val;
-
-	val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
-	val |= MVPP2_GMAC_FC_ADV_EN;
-	writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
-}
-
 static void mvpp2_port_enable(struct mvpp2_port *port)
 {
 	u32 val;
@@ -5126,8 +5020,11 @@ static void mvpp2_port_disable(struct mvpp2_port *port)
 	    (port->phy_interface == PHY_INTERFACE_MODE_XAUI ||
 	     port->phy_interface == PHY_INTERFACE_MODE_10GKR)) {
 		val = readl(port->base + MVPP22_XLG_CTRL0_REG);
-		val &= ~(MVPP22_XLG_CTRL0_PORT_EN |
-			 MVPP22_XLG_CTRL0_MAC_RESET_DIS);
+		val &= ~MVPP22_XLG_CTRL0_PORT_EN;
+		writel(val, port->base + MVPP22_XLG_CTRL0_REG);
+
+		/* Disable & reset should be done separately */
+		val &= ~MVPP22_XLG_CTRL0_MAC_RESET_DIS;
 		writel(val, port->base + MVPP22_XLG_CTRL0_REG);
 	} else {
 		val = readl(port->base + MVPP2_GMAC_CTRL_0_REG);
@@ -5147,13 +5044,14 @@ static void mvpp2_port_periodic_xon_disable(struct mvpp2_port *port)
 }
 
 /* Configure loopback port */
-static void mvpp2_port_loopback_set(struct mvpp2_port *port)
+static void mvpp2_port_loopback_set(struct mvpp2_port *port,
+				    const struct phylink_link_state *state)
 {
 	u32 val;
 
 	val = readl(port->base + MVPP2_GMAC_CTRL_1_REG);
 
-	if (port->speed == 1000)
+	if (state->speed == 1000)
 		val |= MVPP2_GMAC_GMII_LB_EN_MASK;
 	else
 		val &= ~MVPP2_GMAC_GMII_LB_EN_MASK;
@@ -5331,10 +5229,6 @@ static void mvpp2_defaults_set(struct mvpp2_port *port)
 	int tx_port_num, val, queue, ptxq, lrxq;
 
 	if (port->priv->hw_version == MVPP21) {
-		/* Configure port to loopback if needed */
-		if (port->flags & MVPP2_F_LOOPBACK)
-			mvpp2_port_loopback_set(port);
-
 		/* Update TX FIFO MIN Threshold */
 		val = readl(port->base + MVPP2_GMAC_PORT_FIFO_CFG_1_REG);
 		val &= ~MVPP2_GMAC_TX_FIFO_MIN_TH_ALL_MASK;
@@ -6382,6 +6276,11 @@ static irqreturn_t mvpp2_link_status_isr(int irq, void *dev_id)
 		}
 	}
 
+	if (port->phylink) {
+		phylink_mac_change(port->phylink, link);
+		goto handled;
+	}
+
 	if (!netif_running(dev) || !event)
 		goto handled;
 
@@ -6406,111 +6305,6 @@ handled:
 	return IRQ_HANDLED;
 }
 
-static void mvpp2_gmac_set_autoneg(struct mvpp2_port *port,
-				   struct phy_device *phydev)
-{
-	u32 val;
-
-	if (port->phy_interface != PHY_INTERFACE_MODE_RGMII &&
-	    port->phy_interface != PHY_INTERFACE_MODE_RGMII_ID &&
-	    port->phy_interface != PHY_INTERFACE_MODE_RGMII_RXID &&
-	    port->phy_interface != PHY_INTERFACE_MODE_RGMII_TXID &&
-	    port->phy_interface != PHY_INTERFACE_MODE_SGMII)
-		return;
-
-	val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
-	val &= ~(MVPP2_GMAC_CONFIG_MII_SPEED |
-		 MVPP2_GMAC_CONFIG_GMII_SPEED |
-		 MVPP2_GMAC_CONFIG_FULL_DUPLEX |
-		 MVPP2_GMAC_AN_SPEED_EN |
-		 MVPP2_GMAC_AN_DUPLEX_EN);
-
-	if (phydev->duplex)
-		val |= MVPP2_GMAC_CONFIG_FULL_DUPLEX;
-
-	if (phydev->speed == SPEED_1000)
-		val |= MVPP2_GMAC_CONFIG_GMII_SPEED;
-	else if (phydev->speed == SPEED_100)
-		val |= MVPP2_GMAC_CONFIG_MII_SPEED;
-
-	writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
-}
-
-/* Adjust link */
-static void mvpp2_link_event(struct net_device *dev)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-	struct phy_device *phydev = dev->phydev;
-	bool link_reconfigured = false;
-	u32 val;
-
-	if (phydev->link) {
-		if (port->phy_interface != phydev->interface && port->comphy) {
-	                /* disable current port for reconfiguration */
-	                mvpp2_interrupts_disable(port);
-	                netif_carrier_off(port->dev);
-	                mvpp2_port_disable(port);
-			phy_power_off(port->comphy);
-
-	                /* comphy reconfiguration */
-	                port->phy_interface = phydev->interface;
-	                mvpp22_comphy_init(port);
-
-	                /* gop/mac reconfiguration */
-	                mvpp22_gop_init(port);
-	                mvpp2_port_mii_set(port);
-
-	                link_reconfigured = true;
-		}
-
-		if ((port->speed != phydev->speed) ||
-		    (port->duplex != phydev->duplex)) {
-			mvpp2_gmac_set_autoneg(port, phydev);
-
-			port->duplex = phydev->duplex;
-			port->speed  = phydev->speed;
-		}
-	}
-
-	if (phydev->link != port->link || link_reconfigured) {
-		port->link = phydev->link;
-
-		if (phydev->link) {
-			if (port->phy_interface == PHY_INTERFACE_MODE_RGMII ||
-			    port->phy_interface == PHY_INTERFACE_MODE_RGMII_ID ||
-			    port->phy_interface == PHY_INTERFACE_MODE_RGMII_RXID ||
-			    port->phy_interface == PHY_INTERFACE_MODE_RGMII_TXID ||
-			    port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
-				val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
-				val |= (MVPP2_GMAC_FORCE_LINK_PASS |
-					MVPP2_GMAC_FORCE_LINK_DOWN);
-				writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
-			}
-
-			mvpp2_interrupts_enable(port);
-			mvpp2_port_enable(port);
-
-			mvpp2_egress_enable(port);
-			mvpp2_ingress_enable(port);
-			netif_carrier_on(dev);
-			netif_tx_wake_all_queues(dev);
-		} else {
-			port->duplex = -1;
-			port->speed = 0;
-
-			netif_tx_stop_all_queues(dev);
-			netif_carrier_off(dev);
-			mvpp2_ingress_disable(port);
-			mvpp2_egress_disable(port);
-
-			mvpp2_port_disable(port);
-			mvpp2_interrupts_disable(port);
-		}
-
-		phy_print_status(phydev);
-	}
-}
-
 static void mvpp2_timer_set(struct mvpp2_port_pcpu *port_pcpu)
 {
 	ktime_t interval;
@@ -7118,11 +6912,29 @@ static int mvpp2_poll(struct napi_struct *napi, int budget)
 	return rx_done;
 }
 
-/* Set hw internals when starting port */
-static void mvpp2_start_dev(struct mvpp2_port *port)
+static void mvpp22_mode_reconfigure(struct mvpp2_port *port)
 {
-	struct net_device *ndev = port->dev;
-	int i;
+	u32 ctrl3;
+
+	/* comphy reconfiguration */
+	mvpp22_comphy_init(port);
+
+	/* gop reconfiguration */
+	mvpp22_gop_init(port);
+
+	/* Only GOP port 0 has an XLG MAC */
+	if (port->gop_id == 0) {
+		ctrl3 = readl(port->base + MVPP22_XLG_CTRL3_REG);
+		ctrl3 &= ~MVPP22_XLG_CTRL3_MACMODESELECT_MASK;
+
+		if (port->phy_interface == PHY_INTERFACE_MODE_XAUI ||
+		    port->phy_interface == PHY_INTERFACE_MODE_10GKR)
+			ctrl3 |= MVPP22_XLG_CTRL3_MACMODESELECT_10G;
+		else
+			ctrl3 |= MVPP22_XLG_CTRL3_MACMODESELECT_GMAC;
+
+		writel(ctrl3, port->base + MVPP22_XLG_CTRL3_REG);
+	}
 
 	if (port->gop_id == 0 &&
 	    (port->phy_interface == PHY_INTERFACE_MODE_XAUI ||
@@ -7130,6 +6942,12 @@ static void mvpp2_start_dev(struct mvpp2_port *port)
 		mvpp2_xlg_max_rx_size_set(port);
 	else
 		mvpp2_gmac_max_rx_size_set(port);
+}
+
+/* Set hw internals when starting port */
+static void mvpp2_start_dev(struct mvpp2_port *port)
+{
+	int i;
 
 	mvpp2_txp_max_tx_size_set(port);
 
@@ -7139,42 +6957,39 @@ static void mvpp2_start_dev(struct mvpp2_port *port)
 	/* Enable interrupts on all CPUs */
 	mvpp2_interrupts_enable(port);
 
-	if (port->priv->hw_version == MVPP22) {
-		mvpp22_comphy_init(port);
-		mvpp22_gop_init(port);
+	if (port->priv->hw_version == MVPP22)
+		mvpp22_mode_reconfigure(port);
+
+	if (port->phylink) {
+		phylink_start(port->phylink);
+	} else {
+		/* Phylink isn't used as of now for ACPI, so the MAC has to be
+		 * configured manually when the interface is started. This will
+		 * be removed as soon as the phylink ACPI support lands in.
+		 */
+		struct phylink_link_state state = {
+			.interface = port->phy_interface,
+			.link = 1,
+		};
+		mvpp2_mac_config(port->dev, MLO_AN_INBAND, &state);
 	}
 
-	mvpp2_port_mii_set(port);
-	mvpp2_port_enable(port);
-	if (ndev->phydev)
-		phy_start(ndev->phydev);
 	netif_tx_start_all_queues(port->dev);
 }
 
 /* Set hw internals when stopping port */
 static void mvpp2_stop_dev(struct mvpp2_port *port)
 {
-	struct net_device *ndev = port->dev;
 	int i;
 
-	/* Stop new packets from arriving to RXQs */
-	mvpp2_ingress_disable(port);
-
-	mdelay(10);
-
 	/* Disable interrupts on all CPUs */
 	mvpp2_interrupts_disable(port);
 
 	for (i = 0; i < port->nqvecs; i++)
 		napi_disable(&port->qvecs[i].napi);
 
-	netif_carrier_off(port->dev);
-	netif_tx_stop_all_queues(port->dev);
-
-	mvpp2_egress_disable(port);
-	mvpp2_port_disable(port);
-	if (ndev->phydev)
-		phy_stop(ndev->phydev);
+	if (port->phylink)
+		phylink_stop(port->phylink);
 	phy_power_off(port->comphy);
 }
 
@@ -7233,40 +7048,6 @@ static void mvpp21_get_mac_address(struct mvpp2_port *port, unsigned char *addr)
 	addr[5] = (mac_addr_l >> MVPP2_GMAC_SA_LOW_OFFS) & 0xFF;
 }
 
-static int mvpp2_phy_connect(struct mvpp2_port *port)
-{
-	struct phy_device *phy_dev;
-
-	/* No PHY is attached */
-	if (!port->phy_node)
-		return 0;
-
-	phy_dev = of_phy_connect(port->dev, port->phy_node, mvpp2_link_event, 0,
-				 port->phy_interface);
-	if (!phy_dev) {
-		netdev_err(port->dev, "cannot connect to phy\n");
-		return -ENODEV;
-	}
-	phy_dev->supported &= PHY_GBIT_FEATURES;
-	phy_dev->advertising = phy_dev->supported;
-
-	port->link    = 0;
-	port->duplex  = 0;
-	port->speed   = 0;
-
-	return 0;
-}
-
-static void mvpp2_phy_disconnect(struct mvpp2_port *port)
-{
-	struct net_device *ndev = port->dev;
-
-	if (!ndev->phydev)
-		return;
-
-	phy_disconnect(ndev->phydev);
-}
-
 static int mvpp2_irqs_init(struct mvpp2_port *port)
 {
 	int err, i;
@@ -7350,6 +7131,7 @@ static int mvpp2_open(struct net_device *dev)
 	struct mvpp2 *priv = port->priv;
 	unsigned char mac_bcast[ETH_ALEN] = {
 			0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+	bool valid = false;
 	int err;
 
 	err = mvpp2_prs_mac_da_accept(port, mac_bcast, true);
@@ -7392,7 +7174,19 @@ static int mvpp2_open(struct net_device *dev)
 		goto err_cleanup_txqs;
 	}
 
-	if (priv->hw_version == MVPP22 && !port->phy_node && port->link_irq) {
+	/* Phylink isn't supported yet in ACPI mode */
+	if (port->of_node) {
+		err = phylink_of_phy_connect(port->phylink, port->of_node, 0);
+		if (err) {
+			netdev_err(port->dev, "could not attach PHY (%d)\n",
+				   err);
+			goto err_free_irq;
+		}
+
+		valid = true;
+	}
+
+	if (priv->hw_version == MVPP22 && port->link_irq && !port->phylink) {
 		err = request_irq(port->link_irq, mvpp2_link_status_isr, 0,
 				  dev->name, port);
 		if (err) {
@@ -7402,14 +7196,20 @@ static int mvpp2_open(struct net_device *dev)
 		}
 
 		mvpp22_gop_setup_irq(port);
-	}
 
-	/* In default link is down */
-	netif_carrier_off(port->dev);
+		/* In default link is down */
+		netif_carrier_off(port->dev);
 
-	err = mvpp2_phy_connect(port);
-	if (err < 0)
-		goto err_free_link_irq;
+		valid = true;
+	} else {
+		port->link_irq = 0;
+	}
+
+	if (!valid) {
+		netdev_err(port->dev,
+			   "invalid configuration: no dt or link IRQ");
+		goto err_free_irq;
+	}
 
 	/* Unmask interrupts on all CPUs */
 	on_each_cpu(mvpp2_interrupts_unmask, port, 1);
@@ -7426,9 +7226,6 @@ static int mvpp2_open(struct net_device *dev)
 
 	return 0;
 
-err_free_link_irq:
-	if (priv->hw_version == MVPP22 && !port->phy_node && port->link_irq)
-		free_irq(port->link_irq, port);
 err_free_irq:
 	mvpp2_irqs_deinit(port);
 err_cleanup_txqs:
@@ -7442,17 +7239,17 @@ static int mvpp2_stop(struct net_device *dev)
 {
 	struct mvpp2_port *port = netdev_priv(dev);
 	struct mvpp2_port_pcpu *port_pcpu;
-	struct mvpp2 *priv = port->priv;
 	int cpu;
 
 	mvpp2_stop_dev(port);
-	mvpp2_phy_disconnect(port);
 
 	/* Mask interrupts on all CPUs */
 	on_each_cpu(mvpp2_interrupts_mask, port, 1);
 	mvpp2_shared_interrupt_mask_unmask(port, true);
 
-	if (priv->hw_version == MVPP22 && !port->phy_node && port->link_irq)
+	if (port->phylink)
+		phylink_disconnect_phy(port->phylink);
+	if (port->link_irq)
 		free_irq(port->link_irq, port);
 
 	mvpp2_irqs_deinit(port);
@@ -7658,16 +7455,12 @@ mvpp2_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 
 static int mvpp2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 {
-	int ret;
+	struct mvpp2_port *port = netdev_priv(dev);
 
-	if (!dev->phydev)
+	if (!port->phylink)
 		return -ENOTSUPP;
 
-	ret = phy_mii_ioctl(dev->phydev, ifr, cmd);
-	if (!ret)
-		mvpp2_link_event(dev);
-
-	return ret;
+	return phylink_mii_ioctl(port->phylink, ifr, cmd);
 }
 
 static int mvpp2_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
@@ -7714,6 +7507,16 @@ static int mvpp2_set_features(struct net_device *dev,
 
 /* Ethtool methods */
 
+static int mvpp2_ethtool_nway_reset(struct net_device *dev)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+
+	if (!port->phylink)
+		return -ENOTSUPP;
+
+	return phylink_ethtool_nway_reset(port->phylink);
+}
+
 /* Set interrupt coalescing for ethtools */
 static int mvpp2_ethtool_set_coalesce(struct net_device *dev,
 				      struct ethtool_coalesce *c)
@@ -7842,6 +7645,50 @@ err_out:
 	return err;
 }
 
+static void mvpp2_ethtool_get_pause_param(struct net_device *dev,
+					  struct ethtool_pauseparam *pause)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+
+	if (!port->phylink)
+		return;
+
+	phylink_ethtool_get_pauseparam(port->phylink, pause);
+}
+
+static int mvpp2_ethtool_set_pause_param(struct net_device *dev,
+					 struct ethtool_pauseparam *pause)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+
+	if (!port->phylink)
+		return -ENOTSUPP;
+
+	return phylink_ethtool_set_pauseparam(port->phylink, pause);
+}
+
+static int mvpp2_ethtool_get_link_ksettings(struct net_device *dev,
+					    struct ethtool_link_ksettings *cmd)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+
+	if (!port->phylink)
+		return -ENOTSUPP;
+
+	return phylink_ethtool_ksettings_get(port->phylink, cmd);
+}
+
+static int mvpp2_ethtool_set_link_ksettings(struct net_device *dev,
+					    const struct ethtool_link_ksettings *cmd)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+
+	if (!port->phylink)
+		return -ENOTSUPP;
+
+	return phylink_ethtool_ksettings_set(port->phylink, cmd);
+}
+
 /* Device ops */
 
 static const struct net_device_ops mvpp2_netdev_ops = {
@@ -7859,7 +7706,7 @@ static const struct net_device_ops mvpp2_netdev_ops = {
 };
 
 static const struct ethtool_ops mvpp2_eth_tool_ops = {
-	.nway_reset		= phy_ethtool_nway_reset,
+	.nway_reset		= mvpp2_ethtool_nway_reset,
 	.get_link		= ethtool_op_get_link,
 	.set_coalesce		= mvpp2_ethtool_set_coalesce,
 	.get_coalesce		= mvpp2_ethtool_get_coalesce,
@@ -7869,8 +7716,10 @@ static const struct ethtool_ops mvpp2_eth_tool_ops = {
 	.get_strings		= mvpp2_ethtool_get_strings,
 	.get_ethtool_stats	= mvpp2_ethtool_get_stats,
 	.get_sset_count		= mvpp2_ethtool_get_sset_count,
-	.get_link_ksettings	= phy_ethtool_get_link_ksettings,
-	.set_link_ksettings	= phy_ethtool_set_link_ksettings,
+	.get_pauseparam		= mvpp2_ethtool_get_pause_param,
+	.set_pauseparam		= mvpp2_ethtool_set_pause_param,
+	.get_link_ksettings	= mvpp2_ethtool_get_link_ksettings,
+	.set_link_ksettings	= mvpp2_ethtool_set_link_ksettings,
 };
 
 /* Used for PPv2.1, or PPv2.2 with the old Device Tree binding that
@@ -8172,18 +8021,330 @@ static void mvpp2_port_copy_mac_addr(struct net_device *dev, struct mvpp2 *priv,
 	eth_hw_addr_random(dev);
 }
 
+static void mvpp2_phylink_validate(struct net_device *dev,
+				   unsigned long *supported,
+				   struct phylink_link_state *state)
+{
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
+
+	phylink_set(mask, Autoneg);
+	phylink_set_port_modes(mask);
+	phylink_set(mask, Pause);
+	phylink_set(mask, Asym_Pause);
+
+	phylink_set(mask, 10baseT_Half);
+	phylink_set(mask, 10baseT_Full);
+	phylink_set(mask, 100baseT_Half);
+	phylink_set(mask, 100baseT_Full);
+	phylink_set(mask, 1000baseT_Full);
+	phylink_set(mask, 10000baseT_Full);
+
+	if (state->interface == PHY_INTERFACE_MODE_10GKR) {
+		phylink_set(mask, 10000baseCR_Full);
+		phylink_set(mask, 10000baseSR_Full);
+		phylink_set(mask, 10000baseLR_Full);
+		phylink_set(mask, 10000baseLRM_Full);
+		phylink_set(mask, 10000baseER_Full);
+		phylink_set(mask, 10000baseKR_Full);
+	}
+
+	bitmap_and(supported, supported, mask, __ETHTOOL_LINK_MODE_MASK_NBITS);
+	bitmap_and(state->advertising, state->advertising, mask,
+		   __ETHTOOL_LINK_MODE_MASK_NBITS);
+}
+
+static void mvpp22_xlg_link_state(struct mvpp2_port *port,
+				  struct phylink_link_state *state)
+{
+	u32 val;
+
+	state->speed = SPEED_10000;
+	state->duplex = 1;
+	state->an_complete = 1;
+
+	val = readl(port->base + MVPP22_XLG_STATUS);
+	state->link = !!(val & MVPP22_XLG_STATUS_LINK_UP);
+
+	state->pause = 0;
+	val = readl(port->base + MVPP22_XLG_CTRL0_REG);
+	if (val & MVPP22_XLG_CTRL0_TX_FLOW_CTRL_EN)
+		state->pause |= MLO_PAUSE_TX;
+	if (val & MVPP22_XLG_CTRL0_RX_FLOW_CTRL_EN)
+		state->pause |= MLO_PAUSE_RX;
+}
+
+static void mvpp2_gmac_link_state(struct mvpp2_port *port,
+				  struct phylink_link_state *state)
+{
+	u32 val;
+
+	val = readl(port->base + MVPP2_GMAC_STATUS0);
+
+	state->an_complete = !!(val & MVPP2_GMAC_STATUS0_AN_COMPLETE);
+	state->link = !!(val & MVPP2_GMAC_STATUS0_LINK_UP);
+	state->duplex = !!(val & MVPP2_GMAC_STATUS0_FULL_DUPLEX);
+
+	if (val & MVPP2_GMAC_STATUS0_GMII_SPEED)
+		state->speed = SPEED_1000;
+	else if (val & MVPP2_GMAC_STATUS0_MII_SPEED)
+		state->speed = SPEED_100;
+	else
+		state->speed = SPEED_10;
+
+	state->pause = 0;
+	if (val & MVPP2_GMAC_STATUS0_RX_PAUSE)
+		state->pause |= MLO_PAUSE_RX;
+	if (val & MVPP2_GMAC_STATUS0_TX_PAUSE)
+		state->pause |= MLO_PAUSE_TX;
+}
+
+static int mvpp2_phylink_mac_link_state(struct net_device *dev,
+					struct phylink_link_state *state)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+
+	if (port->priv->hw_version == MVPP22 && port->gop_id == 0) {
+		u32 mode = readl(port->base + MVPP22_XLG_CTRL3_REG);
+		mode &= MVPP22_XLG_CTRL3_MACMODESELECT_MASK;
+
+		if (mode == MVPP22_XLG_CTRL3_MACMODESELECT_10G) {
+			mvpp22_xlg_link_state(port, state);
+			return 1;
+		}
+	}
+
+	mvpp2_gmac_link_state(port, state);
+	return 1;
+}
+
+static void mvpp2_mac_an_restart(struct net_device *dev)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+	u32 val;
+
+	if (port->phy_interface != PHY_INTERFACE_MODE_SGMII)
+		return;
+
+	val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+	/* The RESTART_AN bit is cleared by the h/w after restarting the AN
+	 * process.
+	 */
+	val |= MVPP2_GMAC_IN_BAND_RESTART_AN | MVPP2_GMAC_IN_BAND_AUTONEG;
+	writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+}
+
+static void mvpp2_xlg_config(struct mvpp2_port *port, unsigned int mode,
+			     const struct phylink_link_state *state)
+{
+	u32 ctrl0, ctrl4;
+
+	ctrl0 = readl(port->base + MVPP22_XLG_CTRL0_REG);
+	ctrl4 = readl(port->base + MVPP22_XLG_CTRL4_REG);
+
+	if (state->pause & MLO_PAUSE_TX)
+		ctrl0 |= MVPP22_XLG_CTRL0_TX_FLOW_CTRL_EN;
+	if (state->pause & MLO_PAUSE_RX)
+		ctrl0 |= MVPP22_XLG_CTRL0_RX_FLOW_CTRL_EN;
+
+	ctrl4 &= ~MVPP22_XLG_CTRL4_MACMODSELECT_GMAC;
+	ctrl4 |= MVPP22_XLG_CTRL4_FWD_FC | MVPP22_XLG_CTRL4_FWD_PFC |
+		 MVPP22_XLG_CTRL4_EN_IDLE_CHECK;
+
+	writel(ctrl0, port->base + MVPP22_XLG_CTRL0_REG);
+	writel(ctrl4, port->base + MVPP22_XLG_CTRL4_REG);
+}
+
+static void mvpp2_gmac_config(struct mvpp2_port *port, unsigned int mode,
+			      const struct phylink_link_state *state)
+{
+	u32 an, ctrl0, ctrl2, ctrl4;
+
+	an = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+	ctrl0 = readl(port->base + MVPP2_GMAC_CTRL_0_REG);
+	ctrl2 = readl(port->base + MVPP2_GMAC_CTRL_2_REG);
+	ctrl4 = readl(port->base + MVPP22_GMAC_CTRL_4_REG);
+
+	/* Force link down */
+	an &= ~MVPP2_GMAC_FORCE_LINK_PASS;
+	an |= MVPP2_GMAC_FORCE_LINK_DOWN;
+	writel(an, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+
+	/* Set the GMAC in a reset state */
+	ctrl2 |= MVPP2_GMAC_PORT_RESET_MASK;
+	writel(ctrl2, port->base + MVPP2_GMAC_CTRL_2_REG);
+
+	an &= ~(MVPP2_GMAC_CONFIG_MII_SPEED | MVPP2_GMAC_CONFIG_GMII_SPEED |
+		MVPP2_GMAC_AN_SPEED_EN | MVPP2_GMAC_FC_ADV_EN |
+		MVPP2_GMAC_FC_ADV_ASM_EN | MVPP2_GMAC_FLOW_CTRL_AUTONEG |
+		MVPP2_GMAC_CONFIG_FULL_DUPLEX | MVPP2_GMAC_AN_DUPLEX_EN |
+		MVPP2_GMAC_FORCE_LINK_DOWN);
+	ctrl0 &= ~MVPP2_GMAC_PORT_TYPE_MASK;
+	ctrl2 &= ~(MVPP2_GMAC_PORT_RESET_MASK | MVPP2_GMAC_PCS_ENABLE_MASK);
+
+	if (!phy_interface_mode_is_rgmii(state->interface))
+		an |= MVPP2_GMAC_AN_SPEED_EN | MVPP2_GMAC_FLOW_CTRL_AUTONEG;
+
+	if (state->duplex)
+		an |= MVPP2_GMAC_CONFIG_FULL_DUPLEX;
+	if (phylink_test(state->advertising, Pause))
+		an |= MVPP2_GMAC_FC_ADV_EN;
+	if (phylink_test(state->advertising, Asym_Pause))
+		an |= MVPP2_GMAC_FC_ADV_ASM_EN;
+
+	if (state->interface == PHY_INTERFACE_MODE_SGMII) {
+		an |= MVPP2_GMAC_IN_BAND_AUTONEG;
+		ctrl2 |= MVPP2_GMAC_INBAND_AN_MASK | MVPP2_GMAC_PCS_ENABLE_MASK;
+
+		ctrl4 &= ~(MVPP22_CTRL4_EXT_PIN_GMII_SEL |
+			   MVPP22_CTRL4_RX_FC_EN | MVPP22_CTRL4_TX_FC_EN);
+		ctrl4 |= MVPP22_CTRL4_SYNC_BYPASS_DIS |
+			 MVPP22_CTRL4_DP_CLK_SEL |
+			 MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE;
+
+		if (state->pause & MLO_PAUSE_TX)
+			ctrl4 |= MVPP22_CTRL4_TX_FC_EN;
+		if (state->pause & MLO_PAUSE_RX)
+			ctrl4 |= MVPP22_CTRL4_RX_FC_EN;
+	} else if (phy_interface_mode_is_rgmii(state->interface)) {
+		an |= MVPP2_GMAC_IN_BAND_AUTONEG_BYPASS;
+
+		if (state->speed == SPEED_1000)
+			an |= MVPP2_GMAC_CONFIG_GMII_SPEED;
+		else if (state->speed == SPEED_100)
+			an |= MVPP2_GMAC_CONFIG_MII_SPEED;
+
+		ctrl4 &= ~MVPP22_CTRL4_DP_CLK_SEL;
+		ctrl4 |= MVPP22_CTRL4_EXT_PIN_GMII_SEL |
+			 MVPP22_CTRL4_SYNC_BYPASS_DIS |
+			 MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE;
+	}
+
+	writel(ctrl0, port->base + MVPP2_GMAC_CTRL_0_REG);
+	writel(ctrl2, port->base + MVPP2_GMAC_CTRL_2_REG);
+	writel(ctrl4, port->base + MVPP22_GMAC_CTRL_4_REG);
+	writel(an, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+}
+
+static void mvpp2_mac_config(struct net_device *dev, unsigned int mode,
+			     const struct phylink_link_state *state)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+
+	/* Check for invalid configuration */
+	if (state->interface == PHY_INTERFACE_MODE_10GKR && port->gop_id != 0) {
+		netdev_err(dev, "Invalid mode on %s\n", dev->name);
+		return;
+	}
+
+	netif_tx_stop_all_queues(port->dev);
+	if (!port->has_phy)
+		netif_carrier_off(port->dev);
+
+	/* Make sure the port is disabled when reconfiguring the mode */
+	mvpp2_port_disable(port);
+
+	if (port->priv->hw_version == MVPP22 &&
+	    port->phy_interface != state->interface) {
+		port->phy_interface = state->interface;
+
+		/* Reconfigure the serdes lanes */
+		phy_power_off(port->comphy);
+		mvpp22_mode_reconfigure(port);
+	}
+
+	/* mac (re)configuration */
+	if (state->interface == PHY_INTERFACE_MODE_10GKR)
+		mvpp2_xlg_config(port, mode, state);
+	else if (phy_interface_mode_is_rgmii(state->interface) ||
+		 state->interface == PHY_INTERFACE_MODE_SGMII)
+		mvpp2_gmac_config(port, mode, state);
+
+	if (port->priv->hw_version == MVPP21 && port->flags & MVPP2_F_LOOPBACK)
+		mvpp2_port_loopback_set(port, state);
+
+	/* If the port already was up, make sure it's still in the same state */
+	if (state->link || !port->has_phy) {
+		mvpp2_port_enable(port);
+
+		mvpp2_egress_enable(port);
+		mvpp2_ingress_enable(port);
+		if (!port->has_phy)
+			netif_carrier_on(dev);
+		netif_tx_wake_all_queues(dev);
+	}
+}
+
+static void mvpp2_mac_link_up(struct net_device *dev, unsigned int mode,
+			      phy_interface_t interface, struct phy_device *phy)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+	u32 val;
+
+	if (!phylink_autoneg_inband(mode) &&
+	    interface != PHY_INTERFACE_MODE_10GKR) {
+		val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+		val &= ~MVPP2_GMAC_FORCE_LINK_DOWN;
+		if (phy_interface_mode_is_rgmii(interface))
+			val |= MVPP2_GMAC_FORCE_LINK_PASS;
+		writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+	}
+
+	mvpp2_port_enable(port);
+
+	mvpp2_egress_enable(port);
+	mvpp2_ingress_enable(port);
+	netif_tx_wake_all_queues(dev);
+}
+
+static void mvpp2_mac_link_down(struct net_device *dev, unsigned int mode,
+				phy_interface_t interface)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+	u32 val;
+
+	if (!phylink_autoneg_inband(mode) &&
+	    interface != PHY_INTERFACE_MODE_10GKR) {
+		val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+		val &= ~MVPP2_GMAC_FORCE_LINK_PASS;
+		val |= MVPP2_GMAC_FORCE_LINK_DOWN;
+		writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+	}
+
+	netif_tx_stop_all_queues(dev);
+	mvpp2_egress_disable(port);
+	mvpp2_ingress_disable(port);
+
+	/* When using link interrupts to notify phylink of a MAC state change,
+	 * we do not want the port to be disabled (we want to receive further
+	 * interrupts, to be notified when the port will have a link later).
+	 */
+	if (!port->has_phy)
+		return;
+
+	mvpp2_port_disable(port);
+}
+
+static const struct phylink_mac_ops mvpp2_phylink_ops = {
+	.validate = mvpp2_phylink_validate,
+	.mac_link_state = mvpp2_phylink_mac_link_state,
+	.mac_an_restart = mvpp2_mac_an_restart,
+	.mac_config = mvpp2_mac_config,
+	.mac_link_up = mvpp2_mac_link_up,
+	.mac_link_down = mvpp2_mac_link_down,
+};
+
 /* Ports initialization */
 static int mvpp2_port_probe(struct platform_device *pdev,
 			    struct fwnode_handle *port_fwnode,
 			    struct mvpp2 *priv)
 {
-	struct device_node *phy_node;
 	struct phy *comphy = NULL;
 	struct mvpp2_port *port;
 	struct mvpp2_port_pcpu *port_pcpu;
 	struct device_node *port_node = to_of_node(port_fwnode);
 	struct net_device *dev;
 	struct resource *res;
+	struct phylink *phylink;
 	char *mac_from = "";
 	unsigned int ntxqs, nrxqs;
 	bool has_tx_irqs;
@@ -8212,11 +8373,6 @@ static int mvpp2_port_probe(struct platform_device *pdev,
 	if (!dev)
 		return -ENOMEM;
 
-	if (port_node)
-		phy_node = of_parse_phandle(port_node, "phy", 0);
-	else
-		phy_node = NULL;
-
 	phy_mode = fwnode_get_phy_mode(port_fwnode);
 	if (phy_mode < 0) {
 		dev_err(&pdev->dev, "incorrect phy mode\n");
@@ -8249,6 +8405,7 @@ static int mvpp2_port_probe(struct platform_device *pdev,
 	port = netdev_priv(dev);
 	port->dev = dev;
 	port->fwnode = port_fwnode;
+	port->has_phy = !!of_find_property(port_node, "phy", NULL);
 	port->ntxqs = ntxqs;
 	port->nrxqs = nrxqs;
 	port->priv = priv;
@@ -8279,7 +8436,7 @@ static int mvpp2_port_probe(struct platform_device *pdev,
 	else
 		port->first_rxq = port->id * priv->max_port_rxqs;
 
-	port->phy_node = phy_node;
+	port->of_node = port_node;
 	port->phy_interface = phy_mode;
 	port->comphy = comphy;
 
@@ -8340,9 +8497,6 @@ static int mvpp2_port_probe(struct platform_device *pdev,
 
 	mvpp2_port_periodic_xon_disable(port);
 
-	if (priv->hw_version == MVPP21)
-		mvpp2_port_fc_adv_enable(port);
-
 	mvpp2_port_reset(port);
 
 	port->pcpu = alloc_percpu(struct mvpp2_port_pcpu);
@@ -8386,10 +8540,23 @@ static int mvpp2_port_probe(struct platform_device *pdev,
 	/* 9704 == 9728 - 20 and rounding to 8 */
 	dev->max_mtu = MVPP2_BM_JUMBO_PKT_SIZE;
 
+	/* Phylink isn't used w/ ACPI as of now */
+	if (port_node) {
+		phylink = phylink_create(dev, port_fwnode, phy_mode,
+					 &mvpp2_phylink_ops);
+		if (IS_ERR(phylink)) {
+			err = PTR_ERR(phylink);
+			goto err_free_port_pcpu;
+		}
+		port->phylink = phylink;
+	} else {
+		port->phylink = NULL;
+	}
+
 	err = register_netdev(dev);
 	if (err < 0) {
 		dev_err(&pdev->dev, "failed to register netdev\n");
-		goto err_free_port_pcpu;
+		goto err_phylink;
 	}
 	netdev_info(dev, "Using %s mac address %pM\n", mac_from, dev->dev_addr);
 
@@ -8397,6 +8564,9 @@ static int mvpp2_port_probe(struct platform_device *pdev,
 
 	return 0;
 
+err_phylink:
+	if (port->phylink)
+		phylink_destroy(port->phylink);
 err_free_port_pcpu:
 	free_percpu(port->pcpu);
 err_free_txq_pcpu:
@@ -8410,7 +8580,6 @@ err_free_irq:
 err_deinit_qvecs:
 	mvpp2_queue_vectors_deinit(port);
 err_free_netdev:
-	of_node_put(phy_node);
 	free_netdev(dev);
 	return err;
 }
@@ -8421,7 +8590,8 @@ static void mvpp2_port_remove(struct mvpp2_port *port)
 	int i;
 
 	unregister_netdev(port->dev);
-	of_node_put(port->phy_node);
+	if (port->phylink)
+		phylink_destroy(port->phylink);
 	free_percpu(port->pcpu);
 	free_percpu(port->stats);
 	for (i = 0; i < port->ntxqs; i++)
-- 
cgit v1.2.3-59-g8ed1b


From 5490b8725d80cdfd5266a3732e23d61d0e9f7d2f Mon Sep 17 00:00:00 2001
From: Antoine Tenart <antoine.tenart@bootlin.com>
Date: Thu, 17 May 2018 10:29:32 +0200
Subject: phy: add 2.5G SGMII mode to the phy_mode enum

This patch adds one more generic PHY mode to the phy_mode enum, to allow
configuring generic PHYs to the 2.5G SGMII mode by using the set_mode
callback.

Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
Acked-by: Kishon Vijay Abraham I <kishon@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy/phy.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h
index c9d14eeee7f5..9713aebdd348 100644
--- a/include/linux/phy/phy.h
+++ b/include/linux/phy/phy.h
@@ -36,6 +36,7 @@ enum phy_mode {
 	PHY_MODE_USB_DEVICE_SS,
 	PHY_MODE_USB_OTG,
 	PHY_MODE_SGMII,
+	PHY_MODE_2500SGMII,
 	PHY_MODE_10GKR,
 	PHY_MODE_UFS_HS_A,
 	PHY_MODE_UFS_HS_B,
-- 
cgit v1.2.3-59-g8ed1b


From 9ad8bd819b9a94b614576cb82641197e6d828aeb Mon Sep 17 00:00:00 2001
From: Antoine Tenart <antoine.tenart@bootlin.com>
Date: Thu, 17 May 2018 10:29:33 +0200
Subject: phy: cp110-comphy: 2.5G SGMII mode

This patch allow the CP110 comphy to configure some lanes in the
2.5G SGMII mode. This mode is quite close to SGMII and uses nearly the
same code path.

Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/phy/marvell/phy-mvebu-cp110-comphy.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/drivers/phy/marvell/phy-mvebu-cp110-comphy.c b/drivers/phy/marvell/phy-mvebu-cp110-comphy.c
index a0d522154cdf..4ef429250d7b 100644
--- a/drivers/phy/marvell/phy-mvebu-cp110-comphy.c
+++ b/drivers/phy/marvell/phy-mvebu-cp110-comphy.c
@@ -135,19 +135,25 @@ struct mvebu_comhy_conf {
 static const struct mvebu_comhy_conf mvebu_comphy_cp110_modes[] = {
 	/* lane 0 */
 	MVEBU_COMPHY_CONF(0, 1, PHY_MODE_SGMII, 0x1),
+	MVEBU_COMPHY_CONF(0, 1, PHY_MODE_2500SGMII, 0x1),
 	/* lane 1 */
 	MVEBU_COMPHY_CONF(1, 2, PHY_MODE_SGMII, 0x1),
+	MVEBU_COMPHY_CONF(1, 2, PHY_MODE_2500SGMII, 0x1),
 	/* lane 2 */
 	MVEBU_COMPHY_CONF(2, 0, PHY_MODE_SGMII, 0x1),
+	MVEBU_COMPHY_CONF(2, 0, PHY_MODE_2500SGMII, 0x1),
 	MVEBU_COMPHY_CONF(2, 0, PHY_MODE_10GKR, 0x1),
 	/* lane 3 */
 	MVEBU_COMPHY_CONF(3, 1, PHY_MODE_SGMII, 0x2),
+	MVEBU_COMPHY_CONF(3, 1, PHY_MODE_2500SGMII, 0x2),
 	/* lane 4 */
 	MVEBU_COMPHY_CONF(4, 0, PHY_MODE_SGMII, 0x2),
+	MVEBU_COMPHY_CONF(4, 0, PHY_MODE_2500SGMII, 0x2),
 	MVEBU_COMPHY_CONF(4, 0, PHY_MODE_10GKR, 0x2),
 	MVEBU_COMPHY_CONF(4, 1, PHY_MODE_SGMII, 0x1),
 	/* lane 5 */
 	MVEBU_COMPHY_CONF(5, 2, PHY_MODE_SGMII, 0x1),
+	MVEBU_COMPHY_CONF(5, 2, PHY_MODE_2500SGMII, 0x1),
 };
 
 struct mvebu_comphy_priv {
@@ -206,6 +212,10 @@ static void mvebu_comphy_ethernet_init_reset(struct mvebu_comphy_lane *lane,
 	if (mode == PHY_MODE_10GKR)
 		val |= MVEBU_COMPHY_SERDES_CFG0_GEN_RX(0xe) |
 		       MVEBU_COMPHY_SERDES_CFG0_GEN_TX(0xe);
+	else if (mode == PHY_MODE_2500SGMII)
+		val |= MVEBU_COMPHY_SERDES_CFG0_GEN_RX(0x8) |
+		       MVEBU_COMPHY_SERDES_CFG0_GEN_TX(0x8) |
+		       MVEBU_COMPHY_SERDES_CFG0_HALF_BUS;
 	else if (mode == PHY_MODE_SGMII)
 		val |= MVEBU_COMPHY_SERDES_CFG0_GEN_RX(0x6) |
 		       MVEBU_COMPHY_SERDES_CFG0_GEN_TX(0x6) |
@@ -296,13 +306,13 @@ static int mvebu_comphy_init_plls(struct mvebu_comphy_lane *lane,
 	return 0;
 }
 
-static int mvebu_comphy_set_mode_sgmii(struct phy *phy)
+static int mvebu_comphy_set_mode_sgmii(struct phy *phy, enum phy_mode mode)
 {
 	struct mvebu_comphy_lane *lane = phy_get_drvdata(phy);
 	struct mvebu_comphy_priv *priv = lane->priv;
 	u32 val;
 
-	mvebu_comphy_ethernet_init_reset(lane, PHY_MODE_SGMII);
+	mvebu_comphy_ethernet_init_reset(lane, mode);
 
 	val = readl(priv->base + MVEBU_COMPHY_RX_CTRL1(lane->id));
 	val &= ~MVEBU_COMPHY_RX_CTRL1_CLK8T_EN;
@@ -487,7 +497,8 @@ static int mvebu_comphy_power_on(struct phy *phy)
 
 	switch (lane->mode) {
 	case PHY_MODE_SGMII:
-		ret = mvebu_comphy_set_mode_sgmii(phy);
+	case PHY_MODE_2500SGMII:
+		ret = mvebu_comphy_set_mode_sgmii(phy, lane->mode);
 		break;
 	case PHY_MODE_10GKR:
 		ret = mvebu_comphy_set_mode_10gkr(phy);
-- 
cgit v1.2.3-59-g8ed1b


From d97c9f4ab000bd7bc0bbca78e65add9081096002 Mon Sep 17 00:00:00 2001
From: Antoine Tenart <antoine.tenart@bootlin.com>
Date: Thu, 17 May 2018 10:29:34 +0200
Subject: net: mvpp2: 1000baseX support

This patch adds the 1000Base-X PHY mode support in the Marvell PPv2
driver. 1000Base-X is quite close the SGMII and uses nearly the same
code path.

Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mvpp2.c | 72 +++++++++++++++++++++++++-----------
 1 file changed, 51 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
index 60093f1e6297..ece61f1727e4 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -4870,6 +4870,7 @@ static int mvpp22_gop_init(struct mvpp2_port *port)
 		mvpp22_gop_init_rgmii(port);
 		break;
 	case PHY_INTERFACE_MODE_SGMII:
+	case PHY_INTERFACE_MODE_1000BASEX:
 		mvpp22_gop_init_sgmii(port);
 		break;
 	case PHY_INTERFACE_MODE_10GKR:
@@ -4907,7 +4908,8 @@ static void mvpp22_gop_unmask_irq(struct mvpp2_port *port)
 	u32 val;
 
 	if (phy_interface_mode_is_rgmii(port->phy_interface) ||
-	    port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
+	    port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
+	    port->phy_interface == PHY_INTERFACE_MODE_1000BASEX) {
 		/* Enable the GMAC link status irq for this port */
 		val = readl(port->base + MVPP22_GMAC_INT_SUM_MASK);
 		val |= MVPP22_GMAC_INT_SUM_MASK_LINK_STAT;
@@ -4937,7 +4939,8 @@ static void mvpp22_gop_mask_irq(struct mvpp2_port *port)
 	}
 
 	if (phy_interface_mode_is_rgmii(port->phy_interface) ||
-	    port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
+	    port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
+	    port->phy_interface == PHY_INTERFACE_MODE_1000BASEX) {
 		val = readl(port->base + MVPP22_GMAC_INT_SUM_MASK);
 		val &= ~MVPP22_GMAC_INT_SUM_MASK_LINK_STAT;
 		writel(val, port->base + MVPP22_GMAC_INT_SUM_MASK);
@@ -4949,7 +4952,8 @@ static void mvpp22_gop_setup_irq(struct mvpp2_port *port)
 	u32 val;
 
 	if (phy_interface_mode_is_rgmii(port->phy_interface) ||
-	    port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
+	    port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
+	    port->phy_interface == PHY_INTERFACE_MODE_1000BASEX) {
 		val = readl(port->base + MVPP22_GMAC_INT_MASK);
 		val |= MVPP22_GMAC_INT_MASK_LINK_STAT;
 		writel(val, port->base + MVPP22_GMAC_INT_MASK);
@@ -4974,6 +4978,7 @@ static int mvpp22_comphy_init(struct mvpp2_port *port)
 
 	switch (port->phy_interface) {
 	case PHY_INTERFACE_MODE_SGMII:
+	case PHY_INTERFACE_MODE_1000BASEX:
 		mode = PHY_MODE_SGMII;
 		break;
 	case PHY_INTERFACE_MODE_10GKR:
@@ -5056,7 +5061,8 @@ static void mvpp2_port_loopback_set(struct mvpp2_port *port,
 	else
 		val &= ~MVPP2_GMAC_GMII_LB_EN_MASK;
 
-	if (port->phy_interface == PHY_INTERFACE_MODE_SGMII)
+	if (port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
+	    port->phy_interface == PHY_INTERFACE_MODE_1000BASEX)
 		val |= MVPP2_GMAC_PCS_LB_EN_MASK;
 	else
 		val &= ~MVPP2_GMAC_PCS_LB_EN_MASK;
@@ -6266,7 +6272,8 @@ static irqreturn_t mvpp2_link_status_isr(int irq, void *dev_id)
 				link = true;
 		}
 	} else if (phy_interface_mode_is_rgmii(port->phy_interface) ||
-		   port->phy_interface == PHY_INTERFACE_MODE_SGMII) {
+		   port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
+		   port->phy_interface == PHY_INTERFACE_MODE_1000BASEX) {
 		val = readl(port->base + MVPP22_GMAC_INT_STAT);
 		if (val & MVPP22_GMAC_INT_STAT_LINK) {
 			event = true;
@@ -8032,20 +8039,25 @@ static void mvpp2_phylink_validate(struct net_device *dev,
 	phylink_set(mask, Pause);
 	phylink_set(mask, Asym_Pause);
 
-	phylink_set(mask, 10baseT_Half);
-	phylink_set(mask, 10baseT_Full);
-	phylink_set(mask, 100baseT_Half);
-	phylink_set(mask, 100baseT_Full);
-	phylink_set(mask, 1000baseT_Full);
-	phylink_set(mask, 10000baseT_Full);
-
-	if (state->interface == PHY_INTERFACE_MODE_10GKR) {
+	switch (state->interface) {
+	case PHY_INTERFACE_MODE_10GKR:
 		phylink_set(mask, 10000baseCR_Full);
 		phylink_set(mask, 10000baseSR_Full);
 		phylink_set(mask, 10000baseLR_Full);
 		phylink_set(mask, 10000baseLRM_Full);
 		phylink_set(mask, 10000baseER_Full);
 		phylink_set(mask, 10000baseKR_Full);
+		/* Fall-through */
+	default:
+		phylink_set(mask, 10baseT_Half);
+		phylink_set(mask, 10baseT_Full);
+		phylink_set(mask, 100baseT_Half);
+		phylink_set(mask, 100baseT_Full);
+		phylink_set(mask, 10000baseT_Full);
+		/* Fall-through */
+	case PHY_INTERFACE_MODE_1000BASEX:
+		phylink_set(mask, 1000baseT_Full);
+		phylink_set(mask, 1000baseX_Full);
 	}
 
 	bitmap_and(supported, supported, mask, __ETHTOOL_LINK_MODE_MASK_NBITS);
@@ -8084,12 +8096,18 @@ static void mvpp2_gmac_link_state(struct mvpp2_port *port,
 	state->link = !!(val & MVPP2_GMAC_STATUS0_LINK_UP);
 	state->duplex = !!(val & MVPP2_GMAC_STATUS0_FULL_DUPLEX);
 
-	if (val & MVPP2_GMAC_STATUS0_GMII_SPEED)
+	switch (port->phy_interface) {
+	case PHY_INTERFACE_MODE_1000BASEX:
 		state->speed = SPEED_1000;
-	else if (val & MVPP2_GMAC_STATUS0_MII_SPEED)
-		state->speed = SPEED_100;
-	else
-		state->speed = SPEED_10;
+		break;
+	default:
+		if (val & MVPP2_GMAC_STATUS0_GMII_SPEED)
+			state->speed = SPEED_1000;
+		else if (val & MVPP2_GMAC_STATUS0_MII_SPEED)
+			state->speed = SPEED_100;
+		else
+			state->speed = SPEED_10;
+	}
 
 	state->pause = 0;
 	if (val & MVPP2_GMAC_STATUS0_RX_PAUSE)
@@ -8181,8 +8199,18 @@ static void mvpp2_gmac_config(struct mvpp2_port *port, unsigned int mode,
 	ctrl0 &= ~MVPP2_GMAC_PORT_TYPE_MASK;
 	ctrl2 &= ~(MVPP2_GMAC_PORT_RESET_MASK | MVPP2_GMAC_PCS_ENABLE_MASK);
 
-	if (!phy_interface_mode_is_rgmii(state->interface))
+	if (state->interface == PHY_INTERFACE_MODE_1000BASEX) {
+		/* 1000BaseX port cannot negotiate speed nor can it negotiate
+		 * duplex: they are always operating with a fixed speed of
+		 * 1000Mbps in full duplex, so force 1000 speed and full duplex
+		 * here.
+		 */
+		ctrl0 |= MVPP2_GMAC_PORT_TYPE_MASK;
+		an |= MVPP2_GMAC_CONFIG_GMII_SPEED |
+		      MVPP2_GMAC_CONFIG_FULL_DUPLEX;
+	} else if (!phy_interface_mode_is_rgmii(state->interface)) {
 		an |= MVPP2_GMAC_AN_SPEED_EN | MVPP2_GMAC_FLOW_CTRL_AUTONEG;
+	}
 
 	if (state->duplex)
 		an |= MVPP2_GMAC_CONFIG_FULL_DUPLEX;
@@ -8191,7 +8219,8 @@ static void mvpp2_gmac_config(struct mvpp2_port *port, unsigned int mode,
 	if (phylink_test(state->advertising, Asym_Pause))
 		an |= MVPP2_GMAC_FC_ADV_ASM_EN;
 
-	if (state->interface == PHY_INTERFACE_MODE_SGMII) {
+	if (state->interface == PHY_INTERFACE_MODE_SGMII ||
+	    state->interface == PHY_INTERFACE_MODE_1000BASEX) {
 		an |= MVPP2_GMAC_IN_BAND_AUTONEG;
 		ctrl2 |= MVPP2_GMAC_INBAND_AN_MASK | MVPP2_GMAC_PCS_ENABLE_MASK;
 
@@ -8256,7 +8285,8 @@ static void mvpp2_mac_config(struct net_device *dev, unsigned int mode,
 	if (state->interface == PHY_INTERFACE_MODE_10GKR)
 		mvpp2_xlg_config(port, mode, state);
 	else if (phy_interface_mode_is_rgmii(state->interface) ||
-		 state->interface == PHY_INTERFACE_MODE_SGMII)
+		 state->interface == PHY_INTERFACE_MODE_SGMII ||
+		 state->interface == PHY_INTERFACE_MODE_1000BASEX)
 		mvpp2_gmac_config(port, mode, state);
 
 	if (port->priv->hw_version == MVPP21 && port->flags & MVPP2_F_LOOPBACK)
-- 
cgit v1.2.3-59-g8ed1b


From a6fe31de866f35d2d926a0ee13125fead935846b Mon Sep 17 00:00:00 2001
From: Antoine Tenart <antoine.tenart@bootlin.com>
Date: Thu, 17 May 2018 10:29:35 +0200
Subject: net: mvpp2: 2500baseX support

This patch adds the 2500Base-X PHY mode support in the Marvell PPv2
driver. 2500Base-X is quite close to 1000Base-X and SGMII modes and uses
nearly the same code path.

Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mvpp2.c | 51 +++++++++++++++++++++++++++---------
 1 file changed, 39 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
index ece61f1727e4..5e580482769e 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -4871,6 +4871,7 @@ static int mvpp22_gop_init(struct mvpp2_port *port)
 		break;
 	case PHY_INTERFACE_MODE_SGMII:
 	case PHY_INTERFACE_MODE_1000BASEX:
+	case PHY_INTERFACE_MODE_2500BASEX:
 		mvpp22_gop_init_sgmii(port);
 		break;
 	case PHY_INTERFACE_MODE_10GKR:
@@ -4909,7 +4910,8 @@ static void mvpp22_gop_unmask_irq(struct mvpp2_port *port)
 
 	if (phy_interface_mode_is_rgmii(port->phy_interface) ||
 	    port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
-	    port->phy_interface == PHY_INTERFACE_MODE_1000BASEX) {
+	    port->phy_interface == PHY_INTERFACE_MODE_1000BASEX ||
+	    port->phy_interface == PHY_INTERFACE_MODE_2500BASEX) {
 		/* Enable the GMAC link status irq for this port */
 		val = readl(port->base + MVPP22_GMAC_INT_SUM_MASK);
 		val |= MVPP22_GMAC_INT_SUM_MASK_LINK_STAT;
@@ -4940,7 +4942,8 @@ static void mvpp22_gop_mask_irq(struct mvpp2_port *port)
 
 	if (phy_interface_mode_is_rgmii(port->phy_interface) ||
 	    port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
-	    port->phy_interface == PHY_INTERFACE_MODE_1000BASEX) {
+	    port->phy_interface == PHY_INTERFACE_MODE_1000BASEX ||
+	    port->phy_interface == PHY_INTERFACE_MODE_2500BASEX) {
 		val = readl(port->base + MVPP22_GMAC_INT_SUM_MASK);
 		val &= ~MVPP22_GMAC_INT_SUM_MASK_LINK_STAT;
 		writel(val, port->base + MVPP22_GMAC_INT_SUM_MASK);
@@ -4953,7 +4956,8 @@ static void mvpp22_gop_setup_irq(struct mvpp2_port *port)
 
 	if (phy_interface_mode_is_rgmii(port->phy_interface) ||
 	    port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
-	    port->phy_interface == PHY_INTERFACE_MODE_1000BASEX) {
+	    port->phy_interface == PHY_INTERFACE_MODE_1000BASEX ||
+	    port->phy_interface == PHY_INTERFACE_MODE_2500BASEX) {
 		val = readl(port->base + MVPP22_GMAC_INT_MASK);
 		val |= MVPP22_GMAC_INT_MASK_LINK_STAT;
 		writel(val, port->base + MVPP22_GMAC_INT_MASK);
@@ -4968,6 +4972,16 @@ static void mvpp22_gop_setup_irq(struct mvpp2_port *port)
 	mvpp22_gop_unmask_irq(port);
 }
 
+/* Sets the PHY mode of the COMPHY (which configures the serdes lanes).
+ *
+ * The PHY mode used by the PPv2 driver comes from the network subsystem, while
+ * the one given to the COMPHY comes from the generic PHY subsystem. Hence they
+ * differ.
+ *
+ * The COMPHY configures the serdes lanes regardless of the actual use of the
+ * lanes by the physical layer. This is why configurations like
+ * "PPv2 (2500BaseX) - COMPHY (2500SGMII)" are valid.
+ */
 static int mvpp22_comphy_init(struct mvpp2_port *port)
 {
 	enum phy_mode mode;
@@ -4981,6 +4995,9 @@ static int mvpp22_comphy_init(struct mvpp2_port *port)
 	case PHY_INTERFACE_MODE_1000BASEX:
 		mode = PHY_MODE_SGMII;
 		break;
+	case PHY_INTERFACE_MODE_2500BASEX:
+		mode = PHY_MODE_2500SGMII;
+		break;
 	case PHY_INTERFACE_MODE_10GKR:
 		mode = PHY_MODE_10GKR;
 		break;
@@ -5062,7 +5079,8 @@ static void mvpp2_port_loopback_set(struct mvpp2_port *port,
 		val &= ~MVPP2_GMAC_GMII_LB_EN_MASK;
 
 	if (port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
-	    port->phy_interface == PHY_INTERFACE_MODE_1000BASEX)
+	    port->phy_interface == PHY_INTERFACE_MODE_1000BASEX ||
+	    port->phy_interface == PHY_INTERFACE_MODE_2500BASEX)
 		val |= MVPP2_GMAC_PCS_LB_EN_MASK;
 	else
 		val &= ~MVPP2_GMAC_PCS_LB_EN_MASK;
@@ -6273,7 +6291,8 @@ static irqreturn_t mvpp2_link_status_isr(int irq, void *dev_id)
 		}
 	} else if (phy_interface_mode_is_rgmii(port->phy_interface) ||
 		   port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
-		   port->phy_interface == PHY_INTERFACE_MODE_1000BASEX) {
+		   port->phy_interface == PHY_INTERFACE_MODE_1000BASEX ||
+		   port->phy_interface == PHY_INTERFACE_MODE_2500BASEX) {
 		val = readl(port->base + MVPP22_GMAC_INT_STAT);
 		if (val & MVPP22_GMAC_INT_STAT_LINK) {
 			event = true;
@@ -8056,8 +8075,10 @@ static void mvpp2_phylink_validate(struct net_device *dev,
 		phylink_set(mask, 10000baseT_Full);
 		/* Fall-through */
 	case PHY_INTERFACE_MODE_1000BASEX:
+	case PHY_INTERFACE_MODE_2500BASEX:
 		phylink_set(mask, 1000baseT_Full);
 		phylink_set(mask, 1000baseX_Full);
+		phylink_set(mask, 2500baseX_Full);
 	}
 
 	bitmap_and(supported, supported, mask, __ETHTOOL_LINK_MODE_MASK_NBITS);
@@ -8100,6 +8121,9 @@ static void mvpp2_gmac_link_state(struct mvpp2_port *port,
 	case PHY_INTERFACE_MODE_1000BASEX:
 		state->speed = SPEED_1000;
 		break;
+	case PHY_INTERFACE_MODE_2500BASEX:
+		state->speed = SPEED_2500;
+		break;
 	default:
 		if (val & MVPP2_GMAC_STATUS0_GMII_SPEED)
 			state->speed = SPEED_1000;
@@ -8199,11 +8223,12 @@ static void mvpp2_gmac_config(struct mvpp2_port *port, unsigned int mode,
 	ctrl0 &= ~MVPP2_GMAC_PORT_TYPE_MASK;
 	ctrl2 &= ~(MVPP2_GMAC_PORT_RESET_MASK | MVPP2_GMAC_PCS_ENABLE_MASK);
 
-	if (state->interface == PHY_INTERFACE_MODE_1000BASEX) {
-		/* 1000BaseX port cannot negotiate speed nor can it negotiate
-		 * duplex: they are always operating with a fixed speed of
-		 * 1000Mbps in full duplex, so force 1000 speed and full duplex
-		 * here.
+	if (state->interface == PHY_INTERFACE_MODE_1000BASEX ||
+	    state->interface == PHY_INTERFACE_MODE_2500BASEX) {
+		/* 1000BaseX and 2500BaseX ports cannot negotiate speed nor can
+		 * they negotiate duplex: they are always operating with a fixed
+		 * speed of 1000/2500Mbps in full duplex, so force 1000/2500
+		 * speed and full duplex here.
 		 */
 		ctrl0 |= MVPP2_GMAC_PORT_TYPE_MASK;
 		an |= MVPP2_GMAC_CONFIG_GMII_SPEED |
@@ -8220,7 +8245,8 @@ static void mvpp2_gmac_config(struct mvpp2_port *port, unsigned int mode,
 		an |= MVPP2_GMAC_FC_ADV_ASM_EN;
 
 	if (state->interface == PHY_INTERFACE_MODE_SGMII ||
-	    state->interface == PHY_INTERFACE_MODE_1000BASEX) {
+	    state->interface == PHY_INTERFACE_MODE_1000BASEX ||
+	    state->interface == PHY_INTERFACE_MODE_2500BASEX) {
 		an |= MVPP2_GMAC_IN_BAND_AUTONEG;
 		ctrl2 |= MVPP2_GMAC_INBAND_AN_MASK | MVPP2_GMAC_PCS_ENABLE_MASK;
 
@@ -8286,7 +8312,8 @@ static void mvpp2_mac_config(struct net_device *dev, unsigned int mode,
 		mvpp2_xlg_config(port, mode, state);
 	else if (phy_interface_mode_is_rgmii(state->interface) ||
 		 state->interface == PHY_INTERFACE_MODE_SGMII ||
-		 state->interface == PHY_INTERFACE_MODE_1000BASEX)
+		 state->interface == PHY_INTERFACE_MODE_1000BASEX ||
+		 state->interface == PHY_INTERFACE_MODE_2500BASEX)
 		mvpp2_gmac_config(port, mode, state);
 
 	if (port->priv->hw_version == MVPP21 && port->flags & MVPP2_F_LOOPBACK)
-- 
cgit v1.2.3-59-g8ed1b


From 914365f1c97d3aa1881fd6fc477c4799a39c00f9 Mon Sep 17 00:00:00 2001
From: Yan Markman <ymarkman@marvell.com>
Date: Thu, 17 May 2018 10:34:25 +0200
Subject: net: mvpp2: avoid checking for free aggregated descriptors twice

Avoid repeating the check for free aggregated descriptors when it
already failed at the beginning of the function.

Signed-off-by: Yan Markman <ymarkman@marvell.com>
[Antoine: commit message]
Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mvpp2.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
index 5e580482769e..de664b3f45f2 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -5487,11 +5487,10 @@ static int mvpp2_aggr_desc_num_check(struct mvpp2 *priv,
 					     MVPP2_AGGR_TXQ_STATUS_REG(cpu));
 
 		aggr_txq->count = val & MVPP2_AGGR_TXQ_PENDING_MASK;
-	}
-
-	if ((aggr_txq->count + num) > MVPP2_AGGR_TXQ_SIZE)
-		return -ENOMEM;
 
+		if ((aggr_txq->count + num) > MVPP2_AGGR_TXQ_SIZE)
+			return -ENOMEM;
+	}
 	return 0;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 5b0ab2f41d7593d87762dce1cbaf548f988a2917 Mon Sep 17 00:00:00 2001
From: Yan Markman <ymarkman@marvell.com>
Date: Thu, 17 May 2018 10:34:26 +0200
Subject: net: mvpp2: set mac address does not require the stop/start sequence

Remove special stop/start handling from the set_mac_address callback.
All this special care is not needed, and can be removed. It also
simplifies the up/down status in the driver and helps avoiding possible
link status mismatch issues.

Signed-off-by: Yan Markman <ymarkman@marvell.com>
[Antoine: commit message]
Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mvpp2.c | 38 +++++++-----------------------------
 1 file changed, 7 insertions(+), 31 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
index de664b3f45f2..8c9496002bd7 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -7357,42 +7357,18 @@ static void mvpp2_set_rx_mode(struct net_device *dev)
 
 static int mvpp2_set_mac_address(struct net_device *dev, void *p)
 {
-	struct mvpp2_port *port = netdev_priv(dev);
 	const struct sockaddr *addr = p;
 	int err;
 
-	if (!is_valid_ether_addr(addr->sa_data)) {
-		err = -EADDRNOTAVAIL;
-		goto log_error;
-	}
-
-	if (!netif_running(dev)) {
-		err = mvpp2_prs_update_mac_da(dev, addr->sa_data);
-		if (!err)
-			return 0;
-		/* Reconfigure parser to accept the original MAC address */
-		err = mvpp2_prs_update_mac_da(dev, dev->dev_addr);
-		if (err)
-			goto log_error;
-	}
-
-	mvpp2_stop_dev(port);
+	if (!is_valid_ether_addr(addr->sa_data))
+		return -EADDRNOTAVAIL;
 
 	err = mvpp2_prs_update_mac_da(dev, addr->sa_data);
-	if (!err)
-		goto out_start;
-
-	/* Reconfigure parser accept the original MAC address */
-	err = mvpp2_prs_update_mac_da(dev, dev->dev_addr);
-	if (err)
-		goto log_error;
-out_start:
-	mvpp2_start_dev(port);
-	mvpp2_egress_enable(port);
-	mvpp2_ingress_enable(port);
-	return 0;
-log_error:
-	netdev_err(dev, "failed to change MAC address\n");
+	if (err) {
+		/* Reconfigure parser accept the original MAC address */
+		mvpp2_prs_update_mac_da(dev, dev->dev_addr);
+		netdev_err(dev, "failed to change MAC address\n");
+	}
 	return err;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 934e0f8330a483a582805c640ef9885b4e69aa63 Mon Sep 17 00:00:00 2001
From: Yan Markman <ymarkman@marvell.com>
Date: Thu, 17 May 2018 10:34:27 +0200
Subject: net: mvpp2: print rx error with rate-limit

Prevent flood of RX error prints during heavy traffic with weak signal
in link by checking net_ratelimit() before using netdev_err().

Signed-off-by: Yan Markman <ymarkman@marvell.com>
[Antoine: small rework, commit message]
Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mvpp2.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
index 8c9496002bd7..7ba7e5938c2e 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -6381,21 +6381,23 @@ static void mvpp2_rx_error(struct mvpp2_port *port,
 {
 	u32 status = mvpp2_rxdesc_status_get(port, rx_desc);
 	size_t sz = mvpp2_rxdesc_size_get(port, rx_desc);
+	char *err_str = NULL;
 
 	switch (status & MVPP2_RXD_ERR_CODE_MASK) {
 	case MVPP2_RXD_ERR_CRC:
-		netdev_err(port->dev, "bad rx status %08x (crc error), size=%zu\n",
-			   status, sz);
+		err_str = "crc";
 		break;
 	case MVPP2_RXD_ERR_OVERRUN:
-		netdev_err(port->dev, "bad rx status %08x (overrun error), size=%zu\n",
-			   status, sz);
+		err_str = "overrun";
 		break;
 	case MVPP2_RXD_ERR_RESOURCE:
-		netdev_err(port->dev, "bad rx status %08x (resource error), size=%zu\n",
-			   status, sz);
+		err_str = "resource";
 		break;
 	}
+	if (err_str && net_ratelimit())
+		netdev_err(port->dev,
+			   "bad rx status %08x (%s error), size=%zu\n",
+			   status, err_str, sz);
 }
 
 /* Handle RX checksum offload */
-- 
cgit v1.2.3-59-g8ed1b


From 3b734ff604d32e3e1d7db877d801818967ad325f Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 17 May 2018 12:06:43 +0200
Subject: nfp: flower: fix error path during representor creation

Don't store repr pointer to reprs array until the representor is
successfully created. This avoids message about "representor
destruction" even when it was never created. Also it cleans-up the flow.
Also, check return value after port alloc.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/flower/main.c  | 13 +++++++++++--
 drivers/net/ethernet/netronome/nfp/nfp_net_repr.c |  9 +++++++--
 drivers/net/ethernet/netronome/nfp/nfp_net_repr.h |  1 +
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c
index 84e3b9f5abb1..4e67c0cbf9f0 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.c
@@ -247,12 +247,16 @@ nfp_flower_spawn_vnic_reprs(struct nfp_app *app,
 			err = -ENOMEM;
 			goto err_reprs_clean;
 		}
-		RCU_INIT_POINTER(reprs->reprs[i], repr);
 
 		/* For now we only support 1 PF */
 		WARN_ON(repr_type == NFP_REPR_TYPE_PF && i);
 
 		port = nfp_port_alloc(app, port_type, repr);
+		if (IS_ERR(port)) {
+			err = PTR_ERR(port);
+			nfp_repr_free(repr);
+			goto err_reprs_clean;
+		}
 		if (repr_type == NFP_REPR_TYPE_PF) {
 			port->pf_id = i;
 			port->vnic = priv->nn->dp.ctrl_bar;
@@ -271,9 +275,11 @@ nfp_flower_spawn_vnic_reprs(struct nfp_app *app,
 				    port_id, port, priv->nn->dp.netdev);
 		if (err) {
 			nfp_port_free(port);
+			nfp_repr_free(repr);
 			goto err_reprs_clean;
 		}
 
+		RCU_INIT_POINTER(reprs->reprs[i], repr);
 		nfp_info(app->cpp, "%s%d Representor(%s) created\n",
 			 repr_type == NFP_REPR_TYPE_PF ? "PF" : "VF", i,
 			 repr->name);
@@ -344,16 +350,17 @@ nfp_flower_spawn_phy_reprs(struct nfp_app *app, struct nfp_flower_priv *priv)
 			err = -ENOMEM;
 			goto err_reprs_clean;
 		}
-		RCU_INIT_POINTER(reprs->reprs[phys_port], repr);
 
 		port = nfp_port_alloc(app, NFP_PORT_PHYS_PORT, repr);
 		if (IS_ERR(port)) {
 			err = PTR_ERR(port);
+			nfp_repr_free(repr);
 			goto err_reprs_clean;
 		}
 		err = nfp_port_init_phy_port(app->pf, app, port, i);
 		if (err) {
 			nfp_port_free(port);
+			nfp_repr_free(repr);
 			goto err_reprs_clean;
 		}
 
@@ -365,6 +372,7 @@ nfp_flower_spawn_phy_reprs(struct nfp_app *app, struct nfp_flower_priv *priv)
 				    cmsg_port_id, port, priv->nn->dp.netdev);
 		if (err) {
 			nfp_port_free(port);
+			nfp_repr_free(repr);
 			goto err_reprs_clean;
 		}
 
@@ -373,6 +381,7 @@ nfp_flower_spawn_phy_reprs(struct nfp_app *app, struct nfp_flower_priv *priv)
 					     eth_tbl->ports[i].base,
 					     phys_port);
 
+		RCU_INIT_POINTER(reprs->reprs[phys_port], repr);
 		nfp_info(app->cpp, "Phys Port %d Representor(%s) created\n",
 			 phys_port, repr->name);
 	}
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
index 0cd077addb26..6e79da91e475 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
@@ -348,12 +348,17 @@ err_clean:
 	return err;
 }
 
-static void nfp_repr_free(struct nfp_repr *repr)
+static void __nfp_repr_free(struct nfp_repr *repr)
 {
 	free_percpu(repr->stats);
 	free_netdev(repr->netdev);
 }
 
+void nfp_repr_free(struct net_device *netdev)
+{
+	__nfp_repr_free(netdev_priv(netdev));
+}
+
 struct net_device *nfp_repr_alloc(struct nfp_app *app)
 {
 	struct net_device *netdev;
@@ -385,7 +390,7 @@ static void nfp_repr_clean_and_free(struct nfp_repr *repr)
 	nfp_info(repr->app->cpp, "Destroying Representor(%s)\n",
 		 repr->netdev->name);
 	nfp_repr_clean(repr);
-	nfp_repr_free(repr);
+	__nfp_repr_free(repr);
 }
 
 void nfp_reprs_clean_and_free(struct nfp_app *app, struct nfp_reprs *reprs)
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h
index a621e8ff528e..cd756a15445f 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h
@@ -123,6 +123,7 @@ void nfp_repr_inc_rx_stats(struct net_device *netdev, unsigned int len);
 int nfp_repr_init(struct nfp_app *app, struct net_device *netdev,
 		  u32 cmsg_port_id, struct nfp_port *port,
 		  struct net_device *pf_netdev);
+void nfp_repr_free(struct net_device *netdev);
 struct net_device *nfp_repr_alloc(struct nfp_app *app);
 void nfp_reprs_clean_and_free(struct nfp_app *app, struct nfp_reprs *reprs);
 void nfp_reprs_clean_and_free_by_type(struct nfp_app *app,
-- 
cgit v1.2.3-59-g8ed1b


From be7f3e59997b7744e8be153b76fca28ac5b71354 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 17 May 2018 03:54:21 -0700
Subject: net/smc: init conn.tx_work & conn.send_lock sooner

syzkaller found that following program crashes the host :

{
  int fd = socket(AF_SMC, SOCK_STREAM, 0);
  int val = 1;

  listen(fd, 0);
  shutdown(fd, SHUT_RDWR);
  setsockopt(fd, 6, TCP_NODELAY, &val, 4);
}

Simply initialize conn.tx_work & conn.send_lock at socket creation,
rather than deeper in the stack.

ODEBUG: assert_init not available (active state 0) object type: timer_list hint:           (null)
WARNING: CPU: 1 PID: 13988 at lib/debugobjects.c:329 debug_print_object+0x16a/0x210 lib/debugobjects.c:326
Kernel panic - not syncing: panic_on_warn set ...

CPU: 1 PID: 13988 Comm: syz-executor0 Not tainted 4.17.0-rc4+ #46
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1b9/0x294 lib/dump_stack.c:113
 panic+0x22f/0x4de kernel/panic.c:184
 __warn.cold.8+0x163/0x1b3 kernel/panic.c:536
 report_bug+0x252/0x2d0 lib/bug.c:186
 fixup_bug arch/x86/kernel/traps.c:178 [inline]
 do_error_trap+0x1de/0x490 arch/x86/kernel/traps.c:296
 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:315
 invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:992
RIP: 0010:debug_print_object+0x16a/0x210 lib/debugobjects.c:326
RSP: 0018:ffff880197a37880 EFLAGS: 00010086
RAX: 0000000000000061 RBX: 0000000000000005 RCX: ffffc90001ed0000
RDX: 0000000000004aaf RSI: ffffffff8160f6f1 RDI: 0000000000000001
RBP: ffff880197a378c0 R08: ffff8801aa7a0080 R09: ffffed003b5e3eb2
R10: ffffed003b5e3eb2 R11: ffff8801daf1f597 R12: 0000000000000001
R13: ffffffff88d96980 R14: ffffffff87fa19a0 R15: ffffffff81666ec0
 debug_object_assert_init+0x309/0x500 lib/debugobjects.c:692
 debug_timer_assert_init kernel/time/timer.c:724 [inline]
 debug_assert_init kernel/time/timer.c:776 [inline]
 del_timer+0x74/0x140 kernel/time/timer.c:1198
 try_to_grab_pending+0x439/0x9a0 kernel/workqueue.c:1223
 mod_delayed_work_on+0x91/0x250 kernel/workqueue.c:1592
 mod_delayed_work include/linux/workqueue.h:541 [inline]
 smc_setsockopt+0x387/0x6d0 net/smc/af_smc.c:1367
 __sys_setsockopt+0x1bd/0x390 net/socket.c:1903
 __do_sys_setsockopt net/socket.c:1914 [inline]
 __se_sys_setsockopt net/socket.c:1911 [inline]
 __x64_sys_setsockopt+0xbe/0x150 net/socket.c:1911
 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Fixes: 01d2f7e2cdd3 ("net/smc: sockopts TCP_NODELAY and TCP_CORK")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ursula Braun <ubraun@linux.ibm.com>
Cc: linux-s390@vger.kernel.org
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c | 2 ++
 net/smc/smc_tx.c | 4 +---
 net/smc/smc_tx.h | 1 +
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index d15762b057c0..6ad4f6c771c3 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -193,8 +193,10 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
 	sk->sk_protocol = protocol;
 	smc = smc_sk(sk);
 	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
+	INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
 	INIT_LIST_HEAD(&smc->accept_q);
 	spin_lock_init(&smc->accept_q_lock);
+	spin_lock_init(&smc->conn.send_lock);
 	sk->sk_prot->hash(sk);
 	sk_refcnt_debug_inc(sk);
 
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 58dfe0bd9d60..08a7de98bb03 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -450,7 +450,7 @@ out_unlock:
 /* Wakeup sndbuf consumers from process context
  * since there is more data to transmit
  */
-static void smc_tx_work(struct work_struct *work)
+void smc_tx_work(struct work_struct *work)
 {
 	struct smc_connection *conn = container_of(to_delayed_work(work),
 						   struct smc_connection,
@@ -512,6 +512,4 @@ void smc_tx_consumer_update(struct smc_connection *conn)
 void smc_tx_init(struct smc_sock *smc)
 {
 	smc->sk.sk_write_space = smc_tx_write_space;
-	INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
-	spin_lock_init(&smc->conn.send_lock);
 }
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
index 78255964fa4d..8f64b12bf03c 100644
--- a/net/smc/smc_tx.h
+++ b/net/smc/smc_tx.h
@@ -27,6 +27,7 @@ static inline int smc_tx_prepared_sends(struct smc_connection *conn)
 	return smc_curs_diff(conn->sndbuf_size, &sent, &prep);
 }
 
+void smc_tx_work(struct work_struct *work);
 void smc_tx_init(struct smc_sock *smc);
 int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
 int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
-- 
cgit v1.2.3-59-g8ed1b


From 990a9d4975c974b05a8abaaeca5bbcfc55d31dcb Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 17 May 2018 15:33:36 +0300
Subject: net/ncsi: prevent a couple array underflows

We recently refactored this code and introduced a static checker
warning.  Smatch complains that if cmd->index is zero then we would
underflow the arrays.  That's obviously true.

The question is whether we prevent cmd->index from being zero at a
different level.  I've looked at the code and I don't immediately see
a check for that.

Fixes: 062b3e1b6d4f ("net/ncsi: Refactor MAC, VLAN filters")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ncsi/ncsi-rsp.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
index ce9497966ebe..a6b7c7d5c829 100644
--- a/net/ncsi/ncsi-rsp.c
+++ b/net/ncsi/ncsi-rsp.c
@@ -347,7 +347,7 @@ static int ncsi_rsp_handler_svf(struct ncsi_request *nr)
 
 	cmd = (struct ncsi_cmd_svf_pkt *)skb_network_header(nr->cmd);
 	ncf = &nc->vlan_filter;
-	if (cmd->index > ncf->n_vids)
+	if (cmd->index == 0 || cmd->index > ncf->n_vids)
 		return -ERANGE;
 
 	/* Add or remove the VLAN filter. Remember HW indexes from 1 */
@@ -445,7 +445,8 @@ static int ncsi_rsp_handler_sma(struct ncsi_request *nr)
 	ncf = &nc->mac_filter;
 	bitmap = &ncf->bitmap;
 
-	if (cmd->index > ncf->n_uc + ncf->n_mc + ncf->n_mixed)
+	if (cmd->index == 0 ||
+	    cmd->index > ncf->n_uc + ncf->n_mc + ncf->n_mixed)
 		return -ERANGE;
 
 	index = (cmd->index - 1) * ETH_ALEN;
-- 
cgit v1.2.3-59-g8ed1b


From 7c5995b33d6e40316d366205a7926d122c5366a7 Mon Sep 17 00:00:00 2001
From: Roman Mashak <mrv@mojatatu.com>
Date: Thu, 17 May 2018 09:12:49 -0400
Subject: tc-testing: fixed copy-pasting error in ife tests

Reported-by: Vlad Buslov <vladbu@mellanox.com>
Reported-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: Roman Mashak <mrv@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../selftests/tc-testing/tc-tests/actions/ife.json | 28 +++++++++++-----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json b/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json
index 03777730ef29..de97e4ff705c 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json
@@ -20,7 +20,7 @@
         "matchPattern": "action order [0-9]*: ife encode action pass.*type 0xED3E.*allow mark.*index 2",
         "matchCount": "1",
         "teardown": [
-            "$TC actions flush action skbedit"
+            "$TC actions flush action ife"
         ]
     },
     {
@@ -44,7 +44,7 @@
         "matchPattern": "action order [0-9]*: ife encode action pipe.*type 0xED3E.*use mark.*index 2",
         "matchCount": "1",
         "teardown": [
-            "$TC actions flush action skbedit"
+            "$TC actions flush action ife"
         ]
     },
     {
@@ -68,7 +68,7 @@
         "matchPattern": "action order [0-9]*: ife encode action continue.*type 0xED3E.*allow mark.*index 2",
         "matchCount": "1",
         "teardown": [
-            "$TC actions flush action skbedit"
+            "$TC actions flush action ife"
         ]
     },
     {
@@ -92,7 +92,7 @@
         "matchPattern": "action order [0-9]*: ife encode action drop.*type 0xED3E.*use mark 789.*index 2",
         "matchCount": "1",
         "teardown": [
-            "$TC actions flush action skbedit"
+            "$TC actions flush action ife"
         ]
     },
     {
@@ -116,7 +116,7 @@
         "matchPattern": "action order [0-9]*: ife encode action reclassify.*type 0xED3E.*use mark 656768.*index 2",
         "matchCount": "1",
         "teardown": [
-            "$TC actions flush action skbedit"
+            "$TC actions flush action ife"
         ]
     },
     {
@@ -140,7 +140,7 @@
         "matchPattern": "action order [0-9]*: ife encode action jump 1.*type 0xED3E.*use mark 65.*index 2",
         "matchCount": "1",
         "teardown": [
-            "$TC actions flush action skbedit"
+            "$TC actions flush action ife"
         ]
     },
     {
@@ -164,7 +164,7 @@
         "matchPattern": "action order [0-9]*: ife encode action reclassify.*type 0xED3E.*use mark 4294967295.*index 90",
         "matchCount": "1",
         "teardown": [
-            "$TC actions flush action skbedit"
+            "$TC actions flush action ife"
         ]
     },
     {
@@ -210,7 +210,7 @@
         "matchPattern": "action order [0-9]*: ife encode action pass.*type 0xED3E.*allow prio.*index 9",
         "matchCount": "1",
         "teardown": [
-            "$TC actions flush action skbedit"
+            "$TC actions flush action ife"
         ]
     },
     {
@@ -234,7 +234,7 @@
         "matchPattern": "action order [0-9]*: ife encode action pipe.*type 0xED3E.*use prio 7.*index 9",
         "matchCount": "1",
         "teardown": [
-            "$TC actions flush action skbedit"
+            "$TC actions flush action ife"
         ]
     },
     {
@@ -258,7 +258,7 @@
         "matchPattern": "action order [0-9]*: ife encode action continue.*type 0xED3E.*use prio 3.*index 9",
         "matchCount": "1",
         "teardown": [
-            "$TC actions flush action skbedit"
+            "$TC actions flush action ife"
         ]
     },
     {
@@ -282,7 +282,7 @@
         "matchPattern": "action order [0-9]*: ife encode action drop.*type 0xED3E.*allow prio.*index 9",
         "matchCount": "1",
         "teardown": [
-            "$TC actions flush action skbedit"
+            "$TC actions flush action ife"
         ]
     },
     {
@@ -306,7 +306,7 @@
         "matchPattern": "action order [0-9]*: ife encode action reclassify.*type 0xED3E.*use prio 998877.*index 9",
         "matchCount": "1",
         "teardown": [
-            "$TC actions flush action skbedit"
+            "$TC actions flush action ife"
         ]
     },
     {
@@ -330,7 +330,7 @@
         "matchPattern": "action order [0-9]*: ife encode action jump 10.*type 0xED3E.*use prio 998877.*index 9",
         "matchCount": "1",
         "teardown": [
-            "$TC actions flush action skbedit"
+            "$TC actions flush action ife"
         ]
     },
     {
@@ -354,7 +354,7 @@
         "matchPattern": "action order [0-9]*: ife encode action reclassify.*type 0xED3E.*use prio 4294967295.*index 99",
         "matchCount": "1",
         "teardown": [
-            "$TC actions flush action skbedit"
+            "$TC actions flush action ife"
         ]
     },
     {
-- 
cgit v1.2.3-59-g8ed1b


From 0e4364560361d57e8cd873a8990327f3471d7d8a Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Thu, 17 May 2018 09:08:43 -0500
Subject: bpf: sockmap, fix uninitialized variable

There is a potential execution path in which variable err is
returned without being properly initialized previously.

Fix this by initializing variable err to 0.

Addresses-Coverity-ID: 1468964 ("Uninitialized scalar variable")
Fixes: e5cd3abcb31a ("bpf: sockmap, refactor sockmap routines to work with hashmap")
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/sockmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index c6de1393df63..41b41fcc0466 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1713,7 +1713,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
 	struct smap_psock_map_entry *e = NULL;
 	struct smap_psock *psock;
 	bool new = false;
-	int err;
+	int err = 0;
 
 	/* 1. If sock map has BPF programs those will be inherited by the
 	 * sock being added. If the sock is already attached to BPF programs
-- 
cgit v1.2.3-59-g8ed1b


From a78622932c27e8ec33e5ba180f3d2e87fb806b28 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Thu, 17 May 2018 09:11:02 -0500
Subject: bpf: sockmap, fix double-free

`e' is being freed twice.

Fix this by removing one of the kfree() calls.

Addresses-Coverity-ID: 1468983 ("Double free")
Fixes: 81110384441a ("bpf: sockmap, add hash map support")
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/sockmap.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 41b41fcc0466..c682669570dc 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1823,7 +1823,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
 	write_unlock_bh(&sock->sk_callback_lock);
 	return err;
 out_free:
-	kfree(e);
 	smap_release_sock(psock, sock);
 out_progs:
 	if (verdict)
-- 
cgit v1.2.3-59-g8ed1b


From 8a8633978b842c88fbcfe00d4e5dde96048f630e Mon Sep 17 00:00:00 2001
From: Manish Chopra <manish.chopra@cavium.com>
Date: Thu, 17 May 2018 12:05:00 -0700
Subject: qede: Add build_skb() support.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch makes use of build_skb() throughout in driver's receieve
data path [HW gro flow and non HW gro flow]. With this, driver can
build skb directly from the page segments which are already mapped
to the hardware instead of allocating new SKB via netdev_alloc_skb()
and memcpy the data which is quite costly.

This really improves performance (keeping same or slight gain in rx
throughput) in terms of CPU utilization which is significantly reduced
[almost half] in non HW gro flow where for every incoming MTU sized
packet driver had to allocate skb, memcpy headers etc. Additionally
in that flow, it also gets rid of bunch of additional overheads
[eth_get_headlen() etc.] to split headers and data in the skb.

Tested with:
system: 2 sockets, 4 cores per socket, hyperthreading, 2x4x2=16 cores
iperf [server]: iperf -s
iperf [client]: iperf -c <server_ip> -t 500 -i 10 -P 32

HW GRO off – w/o build_skb(), throughput: 36.8 Gbits/sec

Average:     CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest   %idle
Average:     all    0.59    0.00   32.93    0.00    0.00   43.07    0.00    0.00   23.42

HW GRO off - with build_skb(), throughput: 36.9 Gbits/sec

Average:     CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest   %idle
Average:     all    0.70    0.00   31.70    0.00    0.00   25.68    0.00    0.00   41.92

HW GRO on - w/o build_skb(), throughput: 36.9 Gbits/sec

Average:     CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest   %idle
Average:     all    0.86    0.00   24.14    0.00    0.00    6.59    0.00    0.00   68.41

HW GRO on - with build_skb(), throughput: 37.5 Gbits/sec

Average:     CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest   %idle
Average:     all    0.87    0.00   23.75    0.00    0.00    6.19    0.00    0.00   69.19

Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: Manish Chopra <manish.chopra@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qede/qede.h         |   5 +-
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c |   3 +-
 drivers/net/ethernet/qlogic/qede/qede_fp.c      | 227 +++++++++++++-----------
 drivers/net/ethernet/qlogic/qede/qede_main.c    |  76 ++------
 4 files changed, 137 insertions(+), 174 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index 9935978c5542..2d3f09ed413b 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -290,15 +290,12 @@ struct qede_agg_info {
 	 * aggregation.
 	 */
 	struct sw_rx_data buffer;
-	dma_addr_t buffer_mapping;
-
 	struct sk_buff *skb;
 
 	/* We need some structs from the start cookie until termination */
 	u16 vlan_tag;
-	u16 start_cqe_bd_len;
-	u8 start_cqe_placement_offset;
 
+	bool tpa_start_fail;
 	u8 state;
 	u8 frag_id;
 
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index ecbf1ded7a39..8c6fdad91986 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -1508,7 +1508,8 @@ static int qede_selftest_receive_traffic(struct qede_dev *edev)
 		len =  le16_to_cpu(fp_cqe->len_on_first_bd);
 		data_ptr = (u8 *)(page_address(sw_rx_data->data) +
 				  fp_cqe->placement_offset +
-				  sw_rx_data->page_offset);
+				  sw_rx_data->page_offset +
+				  rxq->rx_headroom);
 		if (ether_addr_equal(data_ptr,  edev->ndev->dev_addr) &&
 		    ether_addr_equal(data_ptr + ETH_ALEN,
 				     edev->ndev->dev_addr)) {
diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c
index 14941303189d..6c702399b801 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_fp.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c
@@ -660,7 +660,8 @@ static int qede_fill_frag_skb(struct qede_dev *edev,
 
 	/* Add one frag and update the appropriate fields in the skb */
 	skb_fill_page_desc(skb, tpa_info->frag_id++,
-			   current_bd->data, current_bd->page_offset,
+			   current_bd->data,
+			   current_bd->page_offset + rxq->rx_headroom,
 			   len_on_bd);
 
 	if (unlikely(qede_realloc_rx_buffer(rxq, current_bd))) {
@@ -671,8 +672,7 @@ static int qede_fill_frag_skb(struct qede_dev *edev,
 		goto out;
 	}
 
-	qed_chain_consume(&rxq->rx_bd_ring);
-	rxq->sw_rx_cons++;
+	qede_rx_bd_ring_consume(rxq);
 
 	skb->data_len += len_on_bd;
 	skb->truesize += rxq->rx_buf_seg_size;
@@ -721,64 +721,129 @@ static u8 qede_check_tunn_csum(u16 flag)
 	return QEDE_CSUM_UNNECESSARY | tcsum;
 }
 
+static inline struct sk_buff *
+qede_build_skb(struct qede_rx_queue *rxq,
+	       struct sw_rx_data *bd, u16 len, u16 pad)
+{
+	struct sk_buff *skb;
+	void *buf;
+
+	buf = page_address(bd->data) + bd->page_offset;
+	skb = build_skb(buf, rxq->rx_buf_seg_size);
+
+	skb_reserve(skb, pad);
+	skb_put(skb, len);
+
+	return skb;
+}
+
+static struct sk_buff *
+qede_tpa_rx_build_skb(struct qede_dev *edev,
+		      struct qede_rx_queue *rxq,
+		      struct sw_rx_data *bd, u16 len, u16 pad,
+		      bool alloc_skb)
+{
+	struct sk_buff *skb;
+
+	skb = qede_build_skb(rxq, bd, len, pad);
+	bd->page_offset += rxq->rx_buf_seg_size;
+
+	if (bd->page_offset == PAGE_SIZE) {
+		if (unlikely(qede_alloc_rx_buffer(rxq, true))) {
+			DP_NOTICE(edev,
+				  "Failed to allocate RX buffer for tpa start\n");
+			bd->page_offset -= rxq->rx_buf_seg_size;
+			page_ref_inc(bd->data);
+			dev_kfree_skb_any(skb);
+			return NULL;
+		}
+	} else {
+		page_ref_inc(bd->data);
+		qede_reuse_page(rxq, bd);
+	}
+
+	/* We've consumed the first BD and prepared an SKB */
+	qede_rx_bd_ring_consume(rxq);
+
+	return skb;
+}
+
+static struct sk_buff *
+qede_rx_build_skb(struct qede_dev *edev,
+		  struct qede_rx_queue *rxq,
+		  struct sw_rx_data *bd, u16 len, u16 pad)
+{
+	struct sk_buff *skb = NULL;
+
+	/* For smaller frames still need to allocate skb, memcpy
+	 * data and benefit in reusing the page segment instead of
+	 * un-mapping it.
+	 */
+	if ((len + pad <= edev->rx_copybreak)) {
+		unsigned int offset = bd->page_offset + pad;
+
+		skb = netdev_alloc_skb(edev->ndev, QEDE_RX_HDR_SIZE);
+		if (unlikely(!skb))
+			return NULL;
+
+		skb_reserve(skb, pad);
+		memcpy(skb_put(skb, len),
+		       page_address(bd->data) + offset, len);
+		qede_reuse_page(rxq, bd);
+		goto out;
+	}
+
+	skb = qede_build_skb(rxq, bd, len, pad);
+
+	if (unlikely(qede_realloc_rx_buffer(rxq, bd))) {
+		/* Incr page ref count to reuse on allocation failure so
+		 * that it doesn't get freed while freeing SKB [as its
+		 * already mapped there].
+		 */
+		page_ref_inc(bd->data);
+		dev_kfree_skb_any(skb);
+		return NULL;
+	}
+out:
+	/* We've consumed the first BD and prepared an SKB */
+	qede_rx_bd_ring_consume(rxq);
+
+	return skb;
+}
+
 static void qede_tpa_start(struct qede_dev *edev,
 			   struct qede_rx_queue *rxq,
 			   struct eth_fast_path_rx_tpa_start_cqe *cqe)
 {
 	struct qede_agg_info *tpa_info = &rxq->tpa_info[cqe->tpa_agg_index];
-	struct eth_rx_bd *rx_bd_cons = qed_chain_consume(&rxq->rx_bd_ring);
-	struct eth_rx_bd *rx_bd_prod = qed_chain_produce(&rxq->rx_bd_ring);
-	struct sw_rx_data *replace_buf = &tpa_info->buffer;
-	dma_addr_t mapping = tpa_info->buffer_mapping;
 	struct sw_rx_data *sw_rx_data_cons;
-	struct sw_rx_data *sw_rx_data_prod;
+	u16 pad;
 
 	sw_rx_data_cons = &rxq->sw_rx_ring[rxq->sw_rx_cons & NUM_RX_BDS_MAX];
-	sw_rx_data_prod = &rxq->sw_rx_ring[rxq->sw_rx_prod & NUM_RX_BDS_MAX];
+	pad = cqe->placement_offset + rxq->rx_headroom;
 
-	/* Use pre-allocated replacement buffer - we can't release the agg.
-	 * start until its over and we don't want to risk allocation failing
-	 * here, so re-allocate when aggregation will be over.
-	 */
-	sw_rx_data_prod->mapping = replace_buf->mapping;
-
-	sw_rx_data_prod->data = replace_buf->data;
-	rx_bd_prod->addr.hi = cpu_to_le32(upper_32_bits(mapping));
-	rx_bd_prod->addr.lo = cpu_to_le32(lower_32_bits(mapping));
-	sw_rx_data_prod->page_offset = replace_buf->page_offset;
-
-	rxq->sw_rx_prod++;
+	tpa_info->skb = qede_tpa_rx_build_skb(edev, rxq, sw_rx_data_cons,
+					      le16_to_cpu(cqe->len_on_first_bd),
+					      pad, false);
+	tpa_info->buffer.page_offset = sw_rx_data_cons->page_offset;
+	tpa_info->buffer.mapping = sw_rx_data_cons->mapping;
 
-	/* move partial skb from cons to pool (don't unmap yet)
-	 * save mapping, incase we drop the packet later on.
-	 */
-	tpa_info->buffer = *sw_rx_data_cons;
-	mapping = HILO_U64(le32_to_cpu(rx_bd_cons->addr.hi),
-			   le32_to_cpu(rx_bd_cons->addr.lo));
-
-	tpa_info->buffer_mapping = mapping;
-	rxq->sw_rx_cons++;
-
-	/* set tpa state to start only if we are able to allocate skb
-	 * for this aggregation, otherwise mark as error and aggregation will
-	 * be dropped
-	 */
-	tpa_info->skb = netdev_alloc_skb(edev->ndev,
-					 le16_to_cpu(cqe->len_on_first_bd));
 	if (unlikely(!tpa_info->skb)) {
 		DP_NOTICE(edev, "Failed to allocate SKB for gro\n");
+
+		/* Consume from ring but do not produce since
+		 * this might be used by FW still, it will be re-used
+		 * at TPA end.
+		 */
+		tpa_info->tpa_start_fail = true;
+		qede_rx_bd_ring_consume(rxq);
 		tpa_info->state = QEDE_AGG_STATE_ERROR;
 		goto cons_buf;
 	}
 
-	/* Start filling in the aggregation info */
-	skb_put(tpa_info->skb, le16_to_cpu(cqe->len_on_first_bd));
 	tpa_info->frag_id = 0;
 	tpa_info->state = QEDE_AGG_STATE_START;
 
-	/* Store some information from first CQE */
-	tpa_info->start_cqe_placement_offset = cqe->placement_offset;
-	tpa_info->start_cqe_bd_len = le16_to_cpu(cqe->len_on_first_bd);
 	if ((le16_to_cpu(cqe->pars_flags.flags) >>
 	     PARSING_AND_ERR_FLAGS_TAG8021QEXIST_SHIFT) &
 	    PARSING_AND_ERR_FLAGS_TAG8021QEXIST_MASK)
@@ -899,6 +964,10 @@ static int qede_tpa_end(struct qede_dev *edev,
 	tpa_info = &rxq->tpa_info[cqe->tpa_agg_index];
 	skb = tpa_info->skb;
 
+	if (tpa_info->buffer.page_offset == PAGE_SIZE)
+		dma_unmap_page(rxq->dev, tpa_info->buffer.mapping,
+			       PAGE_SIZE, rxq->data_direction);
+
 	for (i = 0; cqe->len_list[i]; i++)
 		qede_fill_frag_skb(edev, rxq, cqe->tpa_agg_index,
 				   le16_to_cpu(cqe->len_list[i]));
@@ -919,11 +988,6 @@ static int qede_tpa_end(struct qede_dev *edev,
 		       "Strange - total packet len [cqe] is %4x but SKB has len %04x\n",
 		       le16_to_cpu(cqe->total_packet_len), skb->len);
 
-	memcpy(skb->data,
-	       page_address(tpa_info->buffer.data) +
-	       tpa_info->start_cqe_placement_offset +
-	       tpa_info->buffer.page_offset, tpa_info->start_cqe_bd_len);
-
 	/* Finalize the SKB */
 	skb->protocol = eth_type_trans(skb, edev->ndev);
 	skb->ip_summed = CHECKSUM_UNNECESSARY;
@@ -940,6 +1004,12 @@ static int qede_tpa_end(struct qede_dev *edev,
 	return 1;
 err:
 	tpa_info->state = QEDE_AGG_STATE_NONE;
+
+	if (tpa_info->tpa_start_fail) {
+		qede_reuse_page(rxq, &tpa_info->buffer);
+		tpa_info->tpa_start_fail = false;
+	}
+
 	dev_kfree_skb_any(tpa_info->skb);
 	tpa_info->skb = NULL;
 	return 0;
@@ -1058,65 +1128,6 @@ static bool qede_rx_xdp(struct qede_dev *edev,
 	return false;
 }
 
-static struct sk_buff *qede_rx_allocate_skb(struct qede_dev *edev,
-					    struct qede_rx_queue *rxq,
-					    struct sw_rx_data *bd, u16 len,
-					    u16 pad)
-{
-	unsigned int offset = bd->page_offset + pad;
-	struct skb_frag_struct *frag;
-	struct page *page = bd->data;
-	unsigned int pull_len;
-	struct sk_buff *skb;
-	unsigned char *va;
-
-	/* Allocate a new SKB with a sufficient large header len */
-	skb = netdev_alloc_skb(edev->ndev, QEDE_RX_HDR_SIZE);
-	if (unlikely(!skb))
-		return NULL;
-
-	/* Copy data into SKB - if it's small, we can simply copy it and
-	 * re-use the already allcoated & mapped memory.
-	 */
-	if (len + pad <= edev->rx_copybreak) {
-		skb_put_data(skb, page_address(page) + offset, len);
-		qede_reuse_page(rxq, bd);
-		goto out;
-	}
-
-	frag = &skb_shinfo(skb)->frags[0];
-
-	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
-			page, offset, len, rxq->rx_buf_seg_size);
-
-	va = skb_frag_address(frag);
-	pull_len = eth_get_headlen(va, QEDE_RX_HDR_SIZE);
-
-	/* Align the pull_len to optimize memcpy */
-	memcpy(skb->data, va, ALIGN(pull_len, sizeof(long)));
-
-	/* Correct the skb & frag sizes offset after the pull */
-	skb_frag_size_sub(frag, pull_len);
-	frag->page_offset += pull_len;
-	skb->data_len -= pull_len;
-	skb->tail += pull_len;
-
-	if (unlikely(qede_realloc_rx_buffer(rxq, bd))) {
-		/* Incr page ref count to reuse on allocation failure so
-		 * that it doesn't get freed while freeing SKB [as its
-		 * already mapped there].
-		 */
-		page_ref_inc(page);
-		dev_kfree_skb_any(skb);
-		return NULL;
-	}
-
-out:
-	/* We've consumed the first BD and prepared an SKB */
-	qede_rx_bd_ring_consume(rxq);
-	return skb;
-}
-
 static int qede_rx_build_jumbo(struct qede_dev *edev,
 			       struct qede_rx_queue *rxq,
 			       struct sk_buff *skb,
@@ -1157,7 +1168,7 @@ static int qede_rx_build_jumbo(struct qede_dev *edev,
 			       PAGE_SIZE, DMA_FROM_DEVICE);
 
 		skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags++,
-				   bd->data, 0, cur_size);
+				   bd->data, rxq->rx_headroom, cur_size);
 
 		skb->truesize += PAGE_SIZE;
 		skb->data_len += cur_size;
@@ -1256,7 +1267,7 @@ static int qede_rx_process_cqe(struct qede_dev *edev,
 	/* Basic validation passed; Need to prepare an SKB. This would also
 	 * guarantee to finally consume the first BD upon success.
 	 */
-	skb = qede_rx_allocate_skb(edev, rxq, bd, len, pad);
+	skb = qede_rx_build_skb(edev, rxq, bd, len, pad);
 	if (!skb) {
 		rxq->rx_alloc_errors++;
 		qede_recycle_rx_bd_ring(rxq, fp_cqe->bd_num);
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 89c581c3c21a..40e2b923af39 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1197,30 +1197,8 @@ static void qede_free_rx_buffers(struct qede_dev *edev,
 	}
 }
 
-static void qede_free_sge_mem(struct qede_dev *edev, struct qede_rx_queue *rxq)
-{
-	int i;
-
-	if (edev->gro_disable)
-		return;
-
-	for (i = 0; i < ETH_TPA_MAX_AGGS_NUM; i++) {
-		struct qede_agg_info *tpa_info = &rxq->tpa_info[i];
-		struct sw_rx_data *replace_buf = &tpa_info->buffer;
-
-		if (replace_buf->data) {
-			dma_unmap_page(&edev->pdev->dev,
-				       replace_buf->mapping,
-				       PAGE_SIZE, DMA_FROM_DEVICE);
-			__free_page(replace_buf->data);
-		}
-	}
-}
-
 static void qede_free_mem_rxq(struct qede_dev *edev, struct qede_rx_queue *rxq)
 {
-	qede_free_sge_mem(edev, rxq);
-
 	/* Free rx buffers */
 	qede_free_rx_buffers(edev, rxq);
 
@@ -1232,45 +1210,15 @@ static void qede_free_mem_rxq(struct qede_dev *edev, struct qede_rx_queue *rxq)
 	edev->ops->common->chain_free(edev->cdev, &rxq->rx_comp_ring);
 }
 
-static int qede_alloc_sge_mem(struct qede_dev *edev, struct qede_rx_queue *rxq)
+static void qede_set_tpa_param(struct qede_rx_queue *rxq)
 {
-	dma_addr_t mapping;
 	int i;
 
-	if (edev->gro_disable)
-		return 0;
-
 	for (i = 0; i < ETH_TPA_MAX_AGGS_NUM; i++) {
 		struct qede_agg_info *tpa_info = &rxq->tpa_info[i];
-		struct sw_rx_data *replace_buf = &tpa_info->buffer;
-
-		replace_buf->data = alloc_pages(GFP_ATOMIC, 0);
-		if (unlikely(!replace_buf->data)) {
-			DP_NOTICE(edev,
-				  "Failed to allocate TPA skb pool [replacement buffer]\n");
-			goto err;
-		}
-
-		mapping = dma_map_page(&edev->pdev->dev, replace_buf->data, 0,
-				       PAGE_SIZE, DMA_FROM_DEVICE);
-		if (unlikely(dma_mapping_error(&edev->pdev->dev, mapping))) {
-			DP_NOTICE(edev,
-				  "Failed to map TPA replacement buffer\n");
-			goto err;
-		}
 
-		replace_buf->mapping = mapping;
-		tpa_info->buffer.page_offset = 0;
-		tpa_info->buffer_mapping = mapping;
 		tpa_info->state = QEDE_AGG_STATE_NONE;
 	}
-
-	return 0;
-err:
-	qede_free_sge_mem(edev, rxq);
-	edev->gro_disable = 1;
-	edev->ndev->features &= ~NETIF_F_GRO_HW;
-	return -ENOMEM;
 }
 
 /* This function allocates all memory needed per Rx queue */
@@ -1281,19 +1229,24 @@ static int qede_alloc_mem_rxq(struct qede_dev *edev, struct qede_rx_queue *rxq)
 	rxq->num_rx_buffers = edev->q_num_rx_buffers;
 
 	rxq->rx_buf_size = NET_IP_ALIGN + ETH_OVERHEAD + edev->ndev->mtu;
-	rxq->rx_headroom = edev->xdp_prog ? XDP_PACKET_HEADROOM : 0;
+
+	rxq->rx_headroom = edev->xdp_prog ? XDP_PACKET_HEADROOM : NET_SKB_PAD;
+	size = rxq->rx_headroom +
+	       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 
 	/* Make sure that the headroom and  payload fit in a single page */
-	if (rxq->rx_buf_size + rxq->rx_headroom > PAGE_SIZE)
-		rxq->rx_buf_size = PAGE_SIZE - rxq->rx_headroom;
+	if (rxq->rx_buf_size + size > PAGE_SIZE)
+		rxq->rx_buf_size = PAGE_SIZE - size;
 
-	/* Segment size to spilt a page in multiple equal parts,
+	/* Segment size to spilt a page in multiple equal parts ,
 	 * unless XDP is used in which case we'd use the entire page.
 	 */
-	if (!edev->xdp_prog)
-		rxq->rx_buf_seg_size = roundup_pow_of_two(rxq->rx_buf_size);
-	else
+	if (!edev->xdp_prog) {
+		size = size + rxq->rx_buf_size;
+		rxq->rx_buf_seg_size = roundup_pow_of_two(size);
+	} else {
 		rxq->rx_buf_seg_size = PAGE_SIZE;
+	}
 
 	/* Allocate the parallel driver ring for Rx buffers */
 	size = sizeof(*rxq->sw_rx_ring) * RX_RING_SIZE;
@@ -1337,7 +1290,8 @@ static int qede_alloc_mem_rxq(struct qede_dev *edev, struct qede_rx_queue *rxq)
 		}
 	}
 
-	rc = qede_alloc_sge_mem(edev, rxq);
+	if (!edev->gro_disable)
+		qede_set_tpa_param(rxq);
 err:
 	return rc;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 33fa382324ebd1513cdf25ceb7ee4995b304ddac Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Thu, 17 May 2018 12:29:47 -0700
Subject: vlan: Add extack messages for link create

Add informative messages for error paths related to adding a
VLAN to a device.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlan.c         | 11 ++++++++---
 net/8021q/vlan.h         |  3 ++-
 net/8021q/vlan_netlink.c | 45 ++++++++++++++++++++++++++++++++++-----------
 3 files changed, 44 insertions(+), 15 deletions(-)

diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 5505ee6ebdbe..73a65789271b 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -118,17 +118,21 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
 }
 
 int vlan_check_real_dev(struct net_device *real_dev,
-			__be16 protocol, u16 vlan_id)
+			__be16 protocol, u16 vlan_id,
+			struct netlink_ext_ack *extack)
 {
 	const char *name = real_dev->name;
 
 	if (real_dev->features & NETIF_F_VLAN_CHALLENGED) {
 		pr_info("VLANs not supported on %s\n", name);
+		NL_SET_ERR_MSG_MOD(extack, "VLANs not supported on device");
 		return -EOPNOTSUPP;
 	}
 
-	if (vlan_find_dev(real_dev, protocol, vlan_id) != NULL)
+	if (vlan_find_dev(real_dev, protocol, vlan_id) != NULL) {
+		NL_SET_ERR_MSG_MOD(extack, "VLAN device already exists");
 		return -EEXIST;
+	}
 
 	return 0;
 }
@@ -215,7 +219,8 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
 	if (vlan_id >= VLAN_VID_MASK)
 		return -ERANGE;
 
-	err = vlan_check_real_dev(real_dev, htons(ETH_P_8021Q), vlan_id);
+	err = vlan_check_real_dev(real_dev, htons(ETH_P_8021Q), vlan_id,
+				  NULL);
 	if (err < 0)
 		return err;
 
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index e23aac3e4d37..44df1c3df02d 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -109,7 +109,8 @@ int vlan_dev_change_flags(const struct net_device *dev, u32 flag, u32 mask);
 void vlan_dev_get_realdev_name(const struct net_device *dev, char *result);
 
 int vlan_check_real_dev(struct net_device *real_dev,
-			__be16 protocol, u16 vlan_id);
+			__be16 protocol, u16 vlan_id,
+			struct netlink_ext_ack *extack);
 void vlan_setup(struct net_device *dev);
 int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack);
 void unregister_vlan_dev(struct net_device *dev, struct list_head *head);
diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
index 6689c0b272a7..9b60c1e399e2 100644
--- a/net/8021q/vlan_netlink.c
+++ b/net/8021q/vlan_netlink.c
@@ -47,14 +47,20 @@ static int vlan_validate(struct nlattr *tb[], struct nlattr *data[],
 	int err;
 
 	if (tb[IFLA_ADDRESS]) {
-		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
+		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
+			NL_SET_ERR_MSG_MOD(extack, "Invalid link address");
 			return -EINVAL;
-		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
+		}
+		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
+			NL_SET_ERR_MSG_MOD(extack, "Invalid link address");
 			return -EADDRNOTAVAIL;
+		}
 	}
 
-	if (!data)
+	if (!data) {
+		NL_SET_ERR_MSG_MOD(extack, "VLAN properties not specified");
 		return -EINVAL;
+	}
 
 	if (data[IFLA_VLAN_PROTOCOL]) {
 		switch (nla_get_be16(data[IFLA_VLAN_PROTOCOL])) {
@@ -62,29 +68,38 @@ static int vlan_validate(struct nlattr *tb[], struct nlattr *data[],
 		case htons(ETH_P_8021AD):
 			break;
 		default:
+			NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN protocol");
 			return -EPROTONOSUPPORT;
 		}
 	}
 
 	if (data[IFLA_VLAN_ID]) {
 		id = nla_get_u16(data[IFLA_VLAN_ID]);
-		if (id >= VLAN_VID_MASK)
+		if (id >= VLAN_VID_MASK) {
+			NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN id");
 			return -ERANGE;
+		}
 	}
 	if (data[IFLA_VLAN_FLAGS]) {
 		flags = nla_data(data[IFLA_VLAN_FLAGS]);
 		if ((flags->flags & flags->mask) &
 		    ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP |
-		      VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP))
+		      VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP)) {
+			NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN flags");
 			return -EINVAL;
+		}
 	}
 
 	err = vlan_validate_qos_map(data[IFLA_VLAN_INGRESS_QOS]);
-	if (err < 0)
+	if (err < 0) {
+		NL_SET_ERR_MSG_MOD(extack, "Invalid ingress QOS map");
 		return err;
+	}
 	err = vlan_validate_qos_map(data[IFLA_VLAN_EGRESS_QOS]);
-	if (err < 0)
+	if (err < 0) {
+		NL_SET_ERR_MSG_MOD(extack, "Invalid egress QOS map");
 		return err;
+	}
 	return 0;
 }
 
@@ -126,14 +141,21 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev,
 	__be16 proto;
 	int err;
 
-	if (!data[IFLA_VLAN_ID])
+	if (!data[IFLA_VLAN_ID]) {
+		NL_SET_ERR_MSG_MOD(extack, "VLAN id not specified");
 		return -EINVAL;
+	}
 
-	if (!tb[IFLA_LINK])
+	if (!tb[IFLA_LINK]) {
+		NL_SET_ERR_MSG_MOD(extack, "link not specified");
 		return -EINVAL;
+	}
+
 	real_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK]));
-	if (!real_dev)
+	if (!real_dev) {
+		NL_SET_ERR_MSG_MOD(extack, "link does not exist");
 		return -ENODEV;
+	}
 
 	if (data[IFLA_VLAN_PROTOCOL])
 		proto = nla_get_be16(data[IFLA_VLAN_PROTOCOL]);
@@ -146,7 +168,8 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev,
 	dev->priv_flags |= (real_dev->priv_flags & IFF_XMIT_DST_RELEASE);
 	vlan->flags	 = VLAN_FLAG_REORDER_HDR;
 
-	err = vlan_check_real_dev(real_dev, vlan->vlan_proto, vlan->vlan_id);
+	err = vlan_check_real_dev(real_dev, vlan->vlan_proto, vlan->vlan_id,
+				  extack);
 	if (err < 0)
 		return err;
 
-- 
cgit v1.2.3-59-g8ed1b


From 2652113ff043ca2ce1cb3be529b5ca9270c421d4 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 17 May 2018 13:07:43 -0700
Subject: net: ethernet: ti: Allow most drivers with COMPILE_TEST

Most of the TI drivers build just fine with COMPILE_TEST, cpmac (AR7) is
the exception because it uses a header file from
arch/mips/include/asm/mach-ar7/ar7.h and keystone netcp which requires
help from drivers/soc/ti/ for queue management helpers.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/Kconfig | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/ti/Kconfig b/drivers/net/ethernet/ti/Kconfig
index 48a541eb0af2..9263d638bd6d 100644
--- a/drivers/net/ethernet/ti/Kconfig
+++ b/drivers/net/ethernet/ti/Kconfig
@@ -18,7 +18,7 @@ if NET_VENDOR_TI
 
 config TI_DAVINCI_EMAC
 	tristate "TI DaVinci EMAC Support"
-	depends on ARM && ( ARCH_DAVINCI || ARCH_OMAP3 )
+	depends on ARM && ( ARCH_DAVINCI || ARCH_OMAP3 ) || COMPILE_TEST
 	select TI_DAVINCI_MDIO
 	select TI_DAVINCI_CPDMA
 	select PHYLIB
@@ -30,7 +30,7 @@ config TI_DAVINCI_EMAC
 
 config TI_DAVINCI_MDIO
 	tristate "TI DaVinci MDIO Support"
-	depends on ARCH_DAVINCI || ARCH_OMAP2PLUS || ARCH_KEYSTONE
+	depends on ARCH_DAVINCI || ARCH_OMAP2PLUS || ARCH_KEYSTONE || COMPILE_TEST
 	select PHYLIB
 	---help---
 	  This driver supports TI's DaVinci MDIO module.
@@ -40,7 +40,7 @@ config TI_DAVINCI_MDIO
 
 config TI_DAVINCI_CPDMA
 	tristate "TI DaVinci CPDMA Support"
-	depends on ARCH_DAVINCI || ARCH_OMAP2PLUS
+	depends on ARCH_DAVINCI || ARCH_OMAP2PLUS || COMPILE_TEST
 	---help---
 	  This driver supports TI's DaVinci CPDMA dma engine.
 
@@ -60,7 +60,7 @@ config TI_CPSW_ALE
 
 config TI_CPSW
 	tristate "TI CPSW Switch Support"
-	depends on ARCH_DAVINCI || ARCH_OMAP2PLUS
+	depends on ARCH_DAVINCI || ARCH_OMAP2PLUS || COMPILE_TEST
 	select TI_DAVINCI_CPDMA
 	select TI_DAVINCI_MDIO
 	select TI_CPSW_PHY_SEL
@@ -75,7 +75,7 @@ config TI_CPSW
 
 config TI_CPTS
 	bool "TI Common Platform Time Sync (CPTS) Support"
-	depends on TI_CPSW || TI_KEYSTONE_NETCP
+	depends on TI_CPSW || TI_KEYSTONE_NETCP || COMPILE_TEST
 	depends on POSIX_TIMERS
 	---help---
 	  This driver supports the Common Platform Time Sync unit of
-- 
cgit v1.2.3-59-g8ed1b


From 78cc6e7ef957037d7562e56d4463cf26da4e9e0e Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 17 May 2018 13:07:44 -0700
Subject: net: ethernet: freescale: Allow FEC with COMPILE_TEST

The Freescale FEC driver builds fine with COMPILE_TEST, so make that
possible.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/freescale/Kconfig    | 2 +-
 drivers/net/ethernet/freescale/fec.h      | 2 +-
 drivers/net/ethernet/freescale/fec_main.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/freescale/Kconfig b/drivers/net/ethernet/freescale/Kconfig
index 6e490fd2345d..a580a3dcbe59 100644
--- a/drivers/net/ethernet/freescale/Kconfig
+++ b/drivers/net/ethernet/freescale/Kconfig
@@ -22,7 +22,7 @@ if NET_VENDOR_FREESCALE
 config FEC
 	tristate "FEC ethernet controller (of ColdFire and some i.MX CPUs)"
 	depends on (M523x || M527x || M5272 || M528x || M520x || M532x || \
-		   ARCH_MXC || SOC_IMX28)
+		   ARCH_MXC || SOC_IMX28 || COMPILE_TEST)
 	default ARCH_MXC || SOC_IMX28 if ARM
 	select PHYLIB
 	imply PTP_1588_CLOCK
diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h
index e7381f8ef89d..4778b663653e 100644
--- a/drivers/net/ethernet/freescale/fec.h
+++ b/drivers/net/ethernet/freescale/fec.h
@@ -21,7 +21,7 @@
 
 #if defined(CONFIG_M523x) || defined(CONFIG_M527x) || defined(CONFIG_M528x) || \
     defined(CONFIG_M520x) || defined(CONFIG_M532x) || defined(CONFIG_ARM) || \
-    defined(CONFIG_ARM64)
+    defined(CONFIG_ARM64) || defined(CONFIG_COMPILE_TEST)
 /*
  *	Just figures, Motorola would have to change the offsets for
  *	registers in the same peripheral device on different models
diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index f3e43db0d6cb..4358f586e28f 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -2107,7 +2107,7 @@ static int fec_enet_get_regs_len(struct net_device *ndev)
 /* List of registers that can be safety be read to dump them with ethtool */
 #if defined(CONFIG_M523x) || defined(CONFIG_M527x) || defined(CONFIG_M528x) || \
 	defined(CONFIG_M520x) || defined(CONFIG_M532x) || defined(CONFIG_ARM) || \
-	defined(CONFIG_ARM64)
+	defined(CONFIG_ARM64) || defined(CONFIG_COMPILE_TEST)
 static u32 fec_enet_register_offset[] = {
 	FEC_IEVENT, FEC_IMASK, FEC_R_DES_ACTIVE_0, FEC_X_DES_ACTIVE_0,
 	FEC_ECNTRL, FEC_MII_DATA, FEC_MII_SPEED, FEC_MIB_CTRLSTAT, FEC_R_CNTRL,
-- 
cgit v1.2.3-59-g8ed1b


From 3c0596f8bec016210e9f470411f70a1a8ea41bcd Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 17 May 2018 13:07:45 -0700
Subject: net: phy: Allow MDIO_MOXART and MDIO_SUN4I with COMPILE_TEST

Those drivers build just fine with COMPILE_TEST, so make that possible.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index 0e2305ccc91f..343989f9f9d9 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -118,7 +118,7 @@ config MDIO_I2C
 
 config MDIO_MOXART
 	tristate "MOXA ART MDIO interface support"
-	depends on ARCH_MOXART
+	depends on ARCH_MOXART || COMPILE_TEST
 	help
 	  This driver supports the MDIO interface found in the network
 	  interface units of the MOXA ART SoC
@@ -142,7 +142,7 @@ config MDIO_OCTEON
 
 config MDIO_SUN4I
 	tristate "Allwinner sun4i MDIO interface support"
-	depends on ARCH_SUNXI
+	depends on ARCH_SUNXI || COMPILE_TEST
 	help
 	  This driver supports the MDIO interface found in the network
 	  interface units of the Allwinner SoC that have an EMAC (A10,
-- 
cgit v1.2.3-59-g8ed1b


From a6d04569120d53ebac1f0e6ffd5497b22c75f037 Mon Sep 17 00:00:00 2001
From: Roi Dayan <roid@mellanox.com>
Date: Tue, 5 Dec 2017 10:38:58 +0200
Subject: net/mlx5: Add merged e-switch cap

When merged e-switch is supported, the per-port e-switch is logically
merged into one e-switch that spans both physical ports and all the VFs.
Under merged eswitch, both the matching on source vport and setting
destination vport can have a 2nd attribute which is the vhca id of the
eswitch owner.

For example:
esw0: {match: <src vport=1 owner=0> action: fwd to <dst vport=7, owner=1>}
is a flow set on eswitch0 matching on source vport=1 from his eswitch
and the action being fwd to dest vport=7 of eswitch1.

Signed-off-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Shahar Klein <shahark@mellanox.com>
Reviewed-by: Or Gerlitz Klein <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 1aad455538f4..ef15f751a984 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -557,7 +557,8 @@ struct mlx5_ifc_e_switch_cap_bits {
 	u8         vport_svlan_insert[0x1];
 	u8         vport_cvlan_insert_if_not_exist[0x1];
 	u8         vport_cvlan_insert_overwrite[0x1];
-	u8         reserved_at_5[0x19];
+	u8         reserved_at_5[0x18];
+	u8         merged_eswitch[0x1];
 	u8         nic_vport_node_guid_modify[0x1];
 	u8         nic_vport_port_guid_modify[0x1];
 
-- 
cgit v1.2.3-59-g8ed1b


From 65360e5451121e493944776a7e33d1345c8483e6 Mon Sep 17 00:00:00 2001
From: Shahar Klein <shahark@mellanox.com>
Date: Thu, 22 Mar 2018 11:56:51 +0200
Subject: net/mlx5: Properly handle a vport destination when setting FTE

When creating FTE, properly distinguish between destination being vport
or tir. The previous code just worked accidentally b/c of both dest being
in the same offset within a union.

Signed-off-by: Shahar Klein <shahark@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index ef5afd7c9325..0bfce6a82c91 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -372,6 +372,9 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
 			if (dst->dest_attr.type ==
 			    MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) {
 				id = dst->dest_attr.ft->id;
+			} else if (dst->dest_attr.type ==
+				   MLX5_FLOW_DESTINATION_TYPE_VPORT) {
+				id = dst->dest_attr.vport_num;
 			} else {
 				id = dst->dest_attr.tir_num;
 			}
-- 
cgit v1.2.3-59-g8ed1b


From b17f7fc10facca9e1b2b9cef39af0a8179796c14 Mon Sep 17 00:00:00 2001
From: Shahar Klein <shahark@mellanox.com>
Date: Thu, 22 Mar 2018 12:32:12 +0200
Subject: net/mlx5: Add destination e-switch owner

The destination e-switch owner allows a rule in namespace of one e-switch
owner to point to a vport that is natively associated with another
e-switch owner.

Signed-off-by: Shahar Klein <shahark@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c            | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c   | 6 +++---
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c             | 8 +++++++-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c            | 2 +-
 include/linux/mlx5/fs.h                                      | 6 +++++-
 include/linux/mlx5/mlx5_ifc.h                                | 5 +++--
 7 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c
index d93ff567b40d..b3820a34e773 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c
@@ -235,7 +235,7 @@ const char *parse_fs_dst(struct trace_seq *p,
 
 	switch (dst->type) {
 	case MLX5_FLOW_DESTINATION_TYPE_VPORT:
-		trace_seq_printf(p, "vport=%u\n", dst->vport_num);
+		trace_seq_printf(p, "vport=%u\n", dst->vport.num);
 		break;
 	case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE:
 		trace_seq_printf(p, "ft=%p\n", dst->ft);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 332bc56306bf..9a24314b817a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -192,7 +192,7 @@ __esw_fdb_set_vport_rule(struct mlx5_eswitch *esw, u32 vport, bool rx_rule,
 	}
 
 	dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
-	dest.vport_num = vport;
+	dest.vport.num = vport;
 
 	esw_debug(esw->dev,
 		  "\tFDB add rule dmac_v(%pM) dmac_c(%pM) -> vport(%d)\n",
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index b123f8a52ad8..90c8cb31e633 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -71,7 +71,7 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 
 	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) {
 		dest[i].type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
-		dest[i].vport_num = attr->out_rep->vport;
+		dest[i].vport.num = attr->out_rep->vport;
 		i++;
 	}
 	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
@@ -343,7 +343,7 @@ mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, int vport, u32 sqn
 
 	spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS;
 	dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
-	dest.vport_num = vport;
+	dest.vport.num = vport;
 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 
 	flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.fdb, spec,
@@ -387,7 +387,7 @@ static int esw_add_fdb_miss_rule(struct mlx5_eswitch *esw)
 	dmac_c[0] = 0x01;
 
 	dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
-	dest.vport_num = 0;
+	dest.vport.num = 0;
 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 
 	flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.fdb, spec,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index 0bfce6a82c91..5a00deff5457 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -374,7 +374,13 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
 				id = dst->dest_attr.ft->id;
 			} else if (dst->dest_attr.type ==
 				   MLX5_FLOW_DESTINATION_TYPE_VPORT) {
-				id = dst->dest_attr.vport_num;
+				id = dst->dest_attr.vport.num;
+				MLX5_SET(dest_format_struct, in_dests,
+					 destination_eswitch_owner_vhca_id_valid,
+					 dst->dest_attr.vport.vhca_id_valid);
+				MLX5_SET(dest_format_struct, in_dests,
+					 destination_eswitch_owner_vhca_id,
+					 dst->dest_attr.vport.vhca_id);
 			} else {
 				id = dst->dest_attr.tir_num;
 			}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index de51e7c39bc8..5a80279b052a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -1404,7 +1404,7 @@ static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1,
 {
 	if (d1->type == d2->type) {
 		if ((d1->type == MLX5_FLOW_DESTINATION_TYPE_VPORT &&
-		     d1->vport_num == d2->vport_num) ||
+		     d1->vport.num == d2->vport.num) ||
 		    (d1->type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE &&
 		     d1->ft == d2->ft) ||
 		    (d1->type == MLX5_FLOW_DESTINATION_TYPE_TIR &&
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 47aecc4fa8c2..9f4d32e41c06 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -90,8 +90,12 @@ struct mlx5_flow_destination {
 	union {
 		u32			tir_num;
 		struct mlx5_flow_table	*ft;
-		u32			vport_num;
 		struct mlx5_fc		*counter;
+		struct {
+			u16		num;
+			u16		vhca_id;
+			bool		vhca_id_valid;
+		} vport;
 	};
 };
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index ef15f751a984..3d17709bc30c 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1148,8 +1148,9 @@ enum mlx5_flow_destination_type {
 struct mlx5_ifc_dest_format_struct_bits {
 	u8         destination_type[0x8];
 	u8         destination_id[0x18];
-
-	u8         reserved_at_20[0x20];
+	u8         destination_eswitch_owner_vhca_id_valid[0x1];
+	u8         reserved_at_21[0xf];
+	u8         destination_eswitch_owner_vhca_id[0x10];
 };
 
 struct mlx5_ifc_flow_counter_list_bits {
-- 
cgit v1.2.3-59-g8ed1b


From 56e858df9f4b917cc49d78831a7f5692075bc78f Mon Sep 17 00:00:00 2001
From: Rabie Loulou <rabiel@mellanox.com>
Date: Sun, 18 Mar 2018 08:29:04 +0200
Subject: net/mlx5e: Explicitly set destination e-switch in FDB rules

Set a specific destination e-switch when setting a destination vport.

Signed-off-by: Rabie Loulou <rabiel@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Shahar Klein <shahark@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c            | 2 ++
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h          | 1 +
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 5 +++++
 3 files changed, 8 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 4197001f9801..880adc810ccc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -836,6 +836,7 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
 		out_priv = netdev_priv(encap_dev);
 		rpriv = out_priv->ppriv;
 		attr->out_rep = rpriv->rep;
+		attr->out_mdev = out_priv->mdev;
 	}
 
 	err = mlx5_eswitch_add_vlan_action(esw, attr);
@@ -2501,6 +2502,7 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 				out_priv = netdev_priv(out_dev);
 				rpriv = out_priv->ppriv;
 				attr->out_rep = rpriv->rep;
+				attr->out_mdev = out_priv->mdev;
 			} else if (encap) {
 				parse_attr->mirred_ifindex = out_dev->ifindex;
 				parse_attr->tun_info = *info;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 4cd773fa55e3..ac5db54823a1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -230,6 +230,7 @@ enum {
 struct mlx5_esw_flow_attr {
 	struct mlx5_eswitch_rep *in_rep;
 	struct mlx5_eswitch_rep *out_rep;
+	struct mlx5_core_dev	*out_mdev;
 
 	int	action;
 	__be16	vlan_proto;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 90c8cb31e633..ea93867d1ab4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -72,6 +72,11 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) {
 		dest[i].type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
 		dest[i].vport.num = attr->out_rep->vport;
+		if (MLX5_CAP_ESW(esw->dev, merged_eswitch)) {
+			dest[i].vport.vhca_id =
+				MLX5_CAP_GEN(attr->out_mdev, vhca_id);
+			dest[i].vport.vhca_id_valid = 1;
+		}
 		i++;
 	}
 	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
-- 
cgit v1.2.3-59-g8ed1b


From 3e99df87722874c669f6c24a79bd1b4ba5fec819 Mon Sep 17 00:00:00 2001
From: Shahar Klein <shahark@mellanox.com>
Date: Sun, 18 Mar 2018 09:02:06 +0200
Subject: net/mlx5: Add source e-switch owner

The source e-switch owner allows a vport on one e-switch port be associated
with a rule defined on the second port e-switch.

The role of the source eswitch owner valid bit in the flow group is to
allow the firmware fail driver attempts to wild card the source eswitch
match field. If this bit is not set, the firmware ignores the source
eswitch owner field totally.

Signed-off-by: Shahar Klein <shahark@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 10 ++++++++++
 include/linux/mlx5/mlx5_ifc.h                     |  6 ++++--
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 5a80279b052a..b1a2ca0ff320 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -1372,6 +1372,8 @@ static int create_auto_flow_group(struct mlx5_flow_table *ft,
 	struct mlx5_core_dev *dev = get_dev(&ft->node);
 	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
 	void *match_criteria_addr;
+	u8 src_esw_owner_mask_on;
+	void *misc;
 	int err;
 	u32 *in;
 
@@ -1384,6 +1386,14 @@ static int create_auto_flow_group(struct mlx5_flow_table *ft,
 	MLX5_SET(create_flow_group_in, in, start_flow_index, fg->start_index);
 	MLX5_SET(create_flow_group_in, in, end_flow_index,   fg->start_index +
 		 fg->max_ftes - 1);
+
+	misc = MLX5_ADDR_OF(fte_match_param, fg->mask.match_criteria,
+			    misc_parameters);
+	src_esw_owner_mask_on = !!MLX5_GET(fte_match_set_misc, misc,
+					 source_eswitch_owner_vhca_id);
+	MLX5_SET(create_flow_group_in, in,
+		 source_eswitch_owner_vhca_id_valid, src_esw_owner_mask_on);
+
 	match_criteria_addr = MLX5_ADDR_OF(create_flow_group_in,
 					   in, match_criteria);
 	memcpy(match_criteria_addr, fg->mask.match_criteria,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 3d17709bc30c..9c3538f1b8b9 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -412,7 +412,7 @@ struct mlx5_ifc_fte_match_set_misc_bits {
 	u8         reserved_at_0[0x8];
 	u8         source_sqn[0x18];
 
-	u8         reserved_at_20[0x10];
+	u8         source_eswitch_owner_vhca_id[0x10];
 	u8         source_port[0x10];
 
 	u8         outer_second_prio[0x3];
@@ -6995,7 +6995,9 @@ struct mlx5_ifc_create_flow_group_in_bits {
 	u8         reserved_at_a0[0x8];
 	u8         table_id[0x18];
 
-	u8         reserved_at_c0[0x20];
+	u8         source_eswitch_owner_vhca_id_valid[0x1];
+
+	u8         reserved_at_c1[0x1f];
 
 	u8         start_flow_index[0x20];
 
-- 
cgit v1.2.3-59-g8ed1b


From 10ff5359f883412728ba816046ee3a696625ca02 Mon Sep 17 00:00:00 2001
From: Shahar Klein <shahark@mellanox.com>
Date: Sun, 18 Mar 2018 09:03:49 +0200
Subject: net/mlx5e: Explicitly set source e-switch in offloaded TC rules

Set a specific source e-switch when setting a rule that matches on the
ingress port.

Signed-off-by: Shahar Klein <shahark@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c            | 1 +
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h          | 1 +
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 8 ++++++++
 3 files changed, 10 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 880adc810ccc..1d2ba687b902 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -2462,6 +2462,7 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 
 	memset(attr, 0, sizeof(*attr));
 	attr->in_rep = rpriv->rep;
+	attr->in_mdev = priv->mdev;
 
 	tcf_exts_to_list(exts, &actions);
 	list_for_each_entry(a, &actions, list) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index ac5db54823a1..98a306e02640 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -231,6 +231,7 @@ struct mlx5_esw_flow_attr {
 	struct mlx5_eswitch_rep *in_rep;
 	struct mlx5_eswitch_rep *out_rep;
 	struct mlx5_core_dev	*out_mdev;
+	struct mlx5_core_dev	*in_mdev;
 
 	int	action;
 	__be16	vlan_proto;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index ea93867d1ab4..6c83eef5141a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -93,8 +93,16 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 	misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters);
 	MLX5_SET(fte_match_set_misc, misc, source_port, attr->in_rep->vport);
 
+	if (MLX5_CAP_ESW(esw->dev, merged_eswitch))
+		MLX5_SET(fte_match_set_misc, misc,
+			 source_eswitch_owner_vhca_id,
+			 MLX5_CAP_GEN(attr->in_mdev, vhca_id));
+
 	misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters);
 	MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port);
+	if (MLX5_CAP_ESW(esw->dev, merged_eswitch))
+		MLX5_SET_TO_ONES(fte_match_set_misc, misc,
+				 source_eswitch_owner_vhca_id);
 
 	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS |
 				      MLX5_MATCH_MISC_PARAMETERS;
-- 
cgit v1.2.3-59-g8ed1b


From b1d90e6bbd149fe37b871d690d68aab4137a8987 Mon Sep 17 00:00:00 2001
From: Rabie Loulou <rabiel@mellanox.com>
Date: Thu, 11 Jan 2018 10:34:32 +0200
Subject: net/mlx5e: Offload TC eswitch rules for VFs belonging to different
 PFs

When the merged eswitch capability is supported, allow offloading rules
between VFs which belong to different PFs (and hence have different
eswitch affinity).

Signed-off-by: Rabie Loulou <rabiel@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Shahar Klein <shahark@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 630dd6dcabb9..77c3f8b8ae96 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -2077,6 +2077,20 @@ static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv,
 	return 0;
 }
 
+static bool is_merged_eswitch_dev(struct mlx5e_priv *priv,
+				  struct net_device *peer_netdev)
+{
+	struct mlx5e_priv *peer_priv;
+
+	peer_priv = netdev_priv(peer_netdev);
+
+	return (MLX5_CAP_ESW(priv->mdev, merged_eswitch) &&
+		(priv->netdev->netdev_ops == peer_netdev->netdev_ops) &&
+		same_hw_devs(priv, peer_priv) &&
+		MLX5_VPORT_MANAGER(peer_priv->mdev) &&
+		(peer_priv->mdev->priv.eswitch->mode == SRIOV_OFFLOADS));
+}
+
 static int mlx5e_route_lookup_ipv6(struct mlx5e_priv *priv,
 				   struct net_device *mirred_dev,
 				   struct net_device **out_dev,
@@ -2535,7 +2549,8 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 			out_dev = tcf_mirred_dev(a);
 
 			if (switchdev_port_same_parent_id(priv->netdev,
-							  out_dev)) {
+							  out_dev) ||
+			    is_merged_eswitch_dev(priv, out_dev)) {
 				action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST |
 					  MLX5_FLOW_CONTEXT_ACTION_COUNT;
 				out_priv = netdev_priv(out_dev);
-- 
cgit v1.2.3-59-g8ed1b


From 60bd4af814fec164c42bdd2efd7984b85d6b1e1e Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Wed, 18 Apr 2018 13:45:11 +0300
Subject: net/mlx5e: Add ingress/egress indication for offloaded TC flows

When an e-switch TC rule is offloaded through the egdev (egress
device) mechanism, we treat this as egress, all other cases (NIC
and e-switch) are considred ingress.

This is preparation step that will allow us to  identify "wrong"
stat/del offload calls made by the TC core on egdev based flows and
ignore them.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Paul Blakey <paulb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |  3 --
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 15 ++++-----
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c  | 32 ++++++++++++++-----
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c   | 38 +++++++++++++++++------
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.h   | 13 ++++++--
 5 files changed, 70 insertions(+), 31 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 7c930088e96e..51a1d36a56c5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -1118,9 +1118,6 @@ int mlx5e_ethtool_get_ts_info(struct mlx5e_priv *priv,
 int mlx5e_ethtool_flash_device(struct mlx5e_priv *priv,
 			       struct ethtool_flash *flash);
 
-int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
-			    void *cb_priv);
-
 /* mlx5e generic netdev management API */
 struct net_device*
 mlx5e_create_netdev(struct mlx5_core_dev *mdev, const struct mlx5e_profile *profile,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 417bf2e8ab85..27e8375a476b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3136,22 +3136,23 @@ out:
 
 #ifdef CONFIG_MLX5_ESWITCH
 static int mlx5e_setup_tc_cls_flower(struct mlx5e_priv *priv,
-				     struct tc_cls_flower_offload *cls_flower)
+				     struct tc_cls_flower_offload *cls_flower,
+				     int flags)
 {
 	switch (cls_flower->command) {
 	case TC_CLSFLOWER_REPLACE:
-		return mlx5e_configure_flower(priv, cls_flower);
+		return mlx5e_configure_flower(priv, cls_flower, flags);
 	case TC_CLSFLOWER_DESTROY:
-		return mlx5e_delete_flower(priv, cls_flower);
+		return mlx5e_delete_flower(priv, cls_flower, flags);
 	case TC_CLSFLOWER_STATS:
-		return mlx5e_stats_flower(priv, cls_flower);
+		return mlx5e_stats_flower(priv, cls_flower, flags);
 	default:
 		return -EOPNOTSUPP;
 	}
 }
 
-int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
-			    void *cb_priv)
+static int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
+				   void *cb_priv)
 {
 	struct mlx5e_priv *priv = cb_priv;
 
@@ -3160,7 +3161,7 @@ int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
 
 	switch (type) {
 	case TC_SETUP_CLSFLOWER:
-		return mlx5e_setup_tc_cls_flower(priv, type_data);
+		return mlx5e_setup_tc_cls_flower(priv, type_data, MLX5E_TC_INGRESS);
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index a689f4c90fe3..182b636552a6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -723,15 +723,31 @@ static int mlx5e_rep_get_phys_port_name(struct net_device *dev,
 
 static int
 mlx5e_rep_setup_tc_cls_flower(struct mlx5e_priv *priv,
-			      struct tc_cls_flower_offload *cls_flower)
+			      struct tc_cls_flower_offload *cls_flower, int flags)
 {
 	switch (cls_flower->command) {
 	case TC_CLSFLOWER_REPLACE:
-		return mlx5e_configure_flower(priv, cls_flower);
+		return mlx5e_configure_flower(priv, cls_flower, flags);
 	case TC_CLSFLOWER_DESTROY:
-		return mlx5e_delete_flower(priv, cls_flower);
+		return mlx5e_delete_flower(priv, cls_flower, flags);
 	case TC_CLSFLOWER_STATS:
-		return mlx5e_stats_flower(priv, cls_flower);
+		return mlx5e_stats_flower(priv, cls_flower, flags);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int mlx5e_rep_setup_tc_cb_egdev(enum tc_setup_type type, void *type_data,
+				       void *cb_priv)
+{
+	struct mlx5e_priv *priv = cb_priv;
+
+	if (!tc_cls_can_offload_and_chain0(priv->netdev, type_data))
+		return -EOPNOTSUPP;
+
+	switch (type) {
+	case TC_SETUP_CLSFLOWER:
+		return mlx5e_rep_setup_tc_cls_flower(priv, type_data, MLX5E_TC_EGRESS);
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -747,7 +763,7 @@ static int mlx5e_rep_setup_tc_cb(enum tc_setup_type type, void *type_data,
 
 	switch (type) {
 	case TC_SETUP_CLSFLOWER:
-		return mlx5e_rep_setup_tc_cls_flower(priv, type_data);
+		return mlx5e_rep_setup_tc_cls_flower(priv, type_data, MLX5E_TC_INGRESS);
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -1107,7 +1123,7 @@ mlx5e_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 
 	uplink_rpriv = mlx5_eswitch_get_uplink_priv(dev->priv.eswitch, REP_ETH);
 	upriv = netdev_priv(uplink_rpriv->netdev);
-	err = tc_setup_cb_egdev_register(netdev, mlx5e_setup_tc_block_cb,
+	err = tc_setup_cb_egdev_register(netdev, mlx5e_rep_setup_tc_cb_egdev,
 					 upriv);
 	if (err)
 		goto err_neigh_cleanup;
@@ -1122,7 +1138,7 @@ mlx5e_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 	return 0;
 
 err_egdev_cleanup:
-	tc_setup_cb_egdev_unregister(netdev, mlx5e_setup_tc_block_cb,
+	tc_setup_cb_egdev_unregister(netdev, mlx5e_rep_setup_tc_cb_egdev,
 				     upriv);
 
 err_neigh_cleanup:
@@ -1151,7 +1167,7 @@ mlx5e_vport_rep_unload(struct mlx5_eswitch_rep *rep)
 	uplink_rpriv = mlx5_eswitch_get_uplink_priv(priv->mdev->priv.eswitch,
 						    REP_ETH);
 	upriv = netdev_priv(uplink_rpriv->netdev);
-	tc_setup_cb_egdev_unregister(netdev, mlx5e_setup_tc_block_cb,
+	tc_setup_cb_egdev_unregister(netdev, mlx5e_rep_setup_tc_cb_egdev,
 				     upriv);
 	mlx5e_rep_neigh_cleanup(rpriv);
 	mlx5e_detach_netdev(priv);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 77c3f8b8ae96..26a1312ec9f8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -62,12 +62,16 @@ struct mlx5_nic_flow_attr {
 	struct mlx5_flow_table	*hairpin_ft;
 };
 
+#define MLX5E_TC_FLOW_BASE (MLX5E_TC_LAST_EXPORTED_BIT + 1)
+
 enum {
-	MLX5E_TC_FLOW_ESWITCH	= BIT(0),
-	MLX5E_TC_FLOW_NIC	= BIT(1),
-	MLX5E_TC_FLOW_OFFLOADED	= BIT(2),
-	MLX5E_TC_FLOW_HAIRPIN	= BIT(3),
-	MLX5E_TC_FLOW_HAIRPIN_RSS = BIT(4),
+	MLX5E_TC_FLOW_INGRESS	= MLX5E_TC_INGRESS,
+	MLX5E_TC_FLOW_EGRESS	= MLX5E_TC_EGRESS,
+	MLX5E_TC_FLOW_ESWITCH	= BIT(MLX5E_TC_FLOW_BASE),
+	MLX5E_TC_FLOW_NIC	= BIT(MLX5E_TC_FLOW_BASE + 1),
+	MLX5E_TC_FLOW_OFFLOADED	= BIT(MLX5E_TC_FLOW_BASE + 2),
+	MLX5E_TC_FLOW_HAIRPIN	= BIT(MLX5E_TC_FLOW_BASE + 3),
+	MLX5E_TC_FLOW_HAIRPIN_RSS = BIT(MLX5E_TC_FLOW_BASE + 4),
 };
 
 struct mlx5e_tc_flow {
@@ -2618,8 +2622,20 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 	return 0;
 }
 
+static void get_flags(int flags, u8 *flow_flags)
+{
+	u8 __flow_flags = 0;
+
+	if (flags & MLX5E_TC_INGRESS)
+		__flow_flags |= MLX5E_TC_FLOW_INGRESS;
+	if (flags & MLX5E_TC_EGRESS)
+		__flow_flags |= MLX5E_TC_FLOW_EGRESS;
+
+	*flow_flags = __flow_flags;
+}
+
 int mlx5e_configure_flower(struct mlx5e_priv *priv,
-			   struct tc_cls_flower_offload *f)
+			   struct tc_cls_flower_offload *f, int flags)
 {
 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
 	struct mlx5e_tc_flow_parse_attr *parse_attr;
@@ -2628,11 +2644,13 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv,
 	int attr_size, err = 0;
 	u8 flow_flags = 0;
 
+	get_flags(flags, &flow_flags);
+
 	if (esw && esw->mode == SRIOV_OFFLOADS) {
-		flow_flags = MLX5E_TC_FLOW_ESWITCH;
+		flow_flags |= MLX5E_TC_FLOW_ESWITCH;
 		attr_size  = sizeof(struct mlx5_esw_flow_attr);
 	} else {
-		flow_flags = MLX5E_TC_FLOW_NIC;
+		flow_flags |= MLX5E_TC_FLOW_NIC;
 		attr_size  = sizeof(struct mlx5_nic_flow_attr);
 	}
 
@@ -2691,7 +2709,7 @@ err_free:
 }
 
 int mlx5e_delete_flower(struct mlx5e_priv *priv,
-			struct tc_cls_flower_offload *f)
+			struct tc_cls_flower_offload *f, int flags)
 {
 	struct mlx5e_tc_flow *flow;
 	struct mlx5e_tc_table *tc = &priv->fs.tc;
@@ -2711,7 +2729,7 @@ int mlx5e_delete_flower(struct mlx5e_priv *priv,
 }
 
 int mlx5e_stats_flower(struct mlx5e_priv *priv,
-		       struct tc_cls_flower_offload *f)
+		       struct tc_cls_flower_offload *f, int flags)
 {
 	struct mlx5e_tc_table *tc = &priv->fs.tc;
 	struct mlx5e_tc_flow *flow;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
index c14c263a739b..2255345c2e18 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
@@ -38,16 +38,23 @@
 #define MLX5E_TC_FLOW_ID_MASK 0x0000ffff
 
 #ifdef CONFIG_MLX5_ESWITCH
+
+enum {
+	MLX5E_TC_INGRESS = BIT(0),
+	MLX5E_TC_EGRESS  = BIT(1),
+	MLX5E_TC_LAST_EXPORTED_BIT = 1,
+};
+
 int mlx5e_tc_init(struct mlx5e_priv *priv);
 void mlx5e_tc_cleanup(struct mlx5e_priv *priv);
 
 int mlx5e_configure_flower(struct mlx5e_priv *priv,
-			   struct tc_cls_flower_offload *f);
+			   struct tc_cls_flower_offload *f, int flags);
 int mlx5e_delete_flower(struct mlx5e_priv *priv,
-			struct tc_cls_flower_offload *f);
+			struct tc_cls_flower_offload *f, int flags);
 
 int mlx5e_stats_flower(struct mlx5e_priv *priv,
-		       struct tc_cls_flower_offload *f);
+		       struct tc_cls_flower_offload *f, int flags);
 
 struct mlx5e_encap_entry;
 void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
-- 
cgit v1.2.3-59-g8ed1b


From 05866c82360aef6005de1920d585ae604309e732 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Tue, 10 Apr 2018 14:27:43 +0300
Subject: net/mlx5e: Prepare for shared table to keep TC eswitch flows

This is a refactoring step to be able and store the hash table which
keeps track of offloaded TC flows in a different location for NIC
vs e-switch rules.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Paul Blakey <paulb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h    |  1 -
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 39 +++++++++++++------------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 51a1d36a56c5..bc91a7335c93 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -634,7 +634,6 @@ struct mlx5e_flow_table {
 struct mlx5e_tc_table {
 	struct mlx5_flow_table		*t;
 
-	struct rhashtable_params        ht_params;
 	struct rhashtable               ht;
 
 	DECLARE_HASHTABLE(mod_hdr_tbl, 8);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 26a1312ec9f8..1c90586d7f58 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -2634,12 +2634,24 @@ static void get_flags(int flags, u8 *flow_flags)
 	*flow_flags = __flow_flags;
 }
 
+static const struct rhashtable_params tc_ht_params = {
+	.head_offset = offsetof(struct mlx5e_tc_flow, node),
+	.key_offset = offsetof(struct mlx5e_tc_flow, cookie),
+	.key_len = sizeof(((struct mlx5e_tc_flow *)0)->cookie),
+	.automatic_shrinking = true,
+};
+
+static struct rhashtable *get_tc_ht(struct mlx5e_priv *priv)
+{
+	return &priv->fs.tc.ht;
+}
+
 int mlx5e_configure_flower(struct mlx5e_priv *priv,
 			   struct tc_cls_flower_offload *f, int flags)
 {
 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
 	struct mlx5e_tc_flow_parse_attr *parse_attr;
-	struct mlx5e_tc_table *tc = &priv->fs.tc;
+	struct rhashtable *tc_ht = get_tc_ht(priv);
 	struct mlx5e_tc_flow *flow;
 	int attr_size, err = 0;
 	u8 flow_flags = 0;
@@ -2693,8 +2705,7 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv,
 	    !(flow->esw_attr->action & MLX5_FLOW_CONTEXT_ACTION_ENCAP))
 		kvfree(parse_attr);
 
-	err = rhashtable_insert_fast(&tc->ht, &flow->node,
-				     tc->ht_params);
+	err = rhashtable_insert_fast(tc_ht, &flow->node, tc_ht_params);
 	if (err) {
 		mlx5e_tc_del_flow(priv, flow);
 		kfree(flow);
@@ -2711,15 +2722,14 @@ err_free:
 int mlx5e_delete_flower(struct mlx5e_priv *priv,
 			struct tc_cls_flower_offload *f, int flags)
 {
+	struct rhashtable *tc_ht = get_tc_ht(priv);
 	struct mlx5e_tc_flow *flow;
-	struct mlx5e_tc_table *tc = &priv->fs.tc;
 
-	flow = rhashtable_lookup_fast(&tc->ht, &f->cookie,
-				      tc->ht_params);
+	flow = rhashtable_lookup_fast(tc_ht, &f->cookie, tc_ht_params);
 	if (!flow)
 		return -EINVAL;
 
-	rhashtable_remove_fast(&tc->ht, &flow->node, tc->ht_params);
+	rhashtable_remove_fast(tc_ht, &flow->node, tc_ht_params);
 
 	mlx5e_tc_del_flow(priv, flow);
 
@@ -2731,15 +2741,14 @@ int mlx5e_delete_flower(struct mlx5e_priv *priv,
 int mlx5e_stats_flower(struct mlx5e_priv *priv,
 		       struct tc_cls_flower_offload *f, int flags)
 {
-	struct mlx5e_tc_table *tc = &priv->fs.tc;
+	struct rhashtable *tc_ht = get_tc_ht(priv);
 	struct mlx5e_tc_flow *flow;
 	struct mlx5_fc *counter;
 	u64 bytes;
 	u64 packets;
 	u64 lastuse;
 
-	flow = rhashtable_lookup_fast(&tc->ht, &f->cookie,
-				      tc->ht_params);
+	flow = rhashtable_lookup_fast(tc_ht, &f->cookie, tc_ht_params);
 	if (!flow)
 		return -EINVAL;
 
@@ -2757,13 +2766,6 @@ int mlx5e_stats_flower(struct mlx5e_priv *priv,
 	return 0;
 }
 
-static const struct rhashtable_params mlx5e_tc_flow_ht_params = {
-	.head_offset = offsetof(struct mlx5e_tc_flow, node),
-	.key_offset = offsetof(struct mlx5e_tc_flow, cookie),
-	.key_len = sizeof(((struct mlx5e_tc_flow *)0)->cookie),
-	.automatic_shrinking = true,
-};
-
 int mlx5e_tc_init(struct mlx5e_priv *priv)
 {
 	struct mlx5e_tc_table *tc = &priv->fs.tc;
@@ -2771,8 +2773,7 @@ int mlx5e_tc_init(struct mlx5e_priv *priv)
 	hash_init(tc->mod_hdr_tbl);
 	hash_init(tc->hairpin_tbl);
 
-	tc->ht_params = mlx5e_tc_flow_ht_params;
-	return rhashtable_init(&tc->ht, &tc->ht_params);
+	return rhashtable_init(&tc->ht, &tc_ht_params);
 }
 
 static void _mlx5e_tc_del_flow(void *ptr, void *arg)
-- 
cgit v1.2.3-59-g8ed1b


From 655dc3d2b91bf241f5baca5eb2bc2b1e22a561ff Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Tue, 10 Apr 2018 18:34:36 +0300
Subject: net/mlx5e: Use shared table for offloaded TC eswitch flows

Currently, each representor netdev use their own hash table to keep
the mapping from TC flow (f->cookie) to the driver offloaded instance.
The table is the one which originally was added for offloading TC NIC
(not eswitch) rules.

This scheme breaks when the core TC code calls us to add the same flow
twice, (e.g under egdev use case) since we don't spot that and offload
a 2nd flow into the HW with the wrong source vport.

As a pre-step to solve that, we move to use a single table which keeps
all offloaded TC eswitch flows. The table is located at the eswitch
uplink representor object.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Paul Blakey <paulb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c  | 19 +++++++--------
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.h  |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c   | 29 +++++++++++++++++++----
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.h   | 11 +++++----
 5 files changed, 43 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 27e8375a476b..b5a7580b12fe 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -4462,7 +4462,7 @@ static int mlx5e_init_nic_rx(struct mlx5e_priv *priv)
 		goto err_destroy_direct_tirs;
 	}
 
-	err = mlx5e_tc_init(priv);
+	err = mlx5e_tc_nic_init(priv);
 	if (err)
 		goto err_destroy_flow_steering;
 
@@ -4483,7 +4483,7 @@ err_destroy_indirect_rqts:
 
 static void mlx5e_cleanup_nic_rx(struct mlx5e_priv *priv)
 {
-	mlx5e_tc_cleanup(priv);
+	mlx5e_tc_nic_cleanup(priv);
 	mlx5e_destroy_flow_steering(priv);
 	mlx5e_destroy_direct_tirs(priv);
 	mlx5e_destroy_indirect_tirs(priv);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 182b636552a6..aa32592a54cb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -981,14 +981,8 @@ static int mlx5e_init_rep_rx(struct mlx5e_priv *priv)
 	}
 	rpriv->vport_rx_rule = flow_rule;
 
-	err = mlx5e_tc_init(priv);
-	if (err)
-		goto err_del_flow_rule;
-
 	return 0;
 
-err_del_flow_rule:
-	mlx5_del_flow_rules(rpriv->vport_rx_rule);
 err_destroy_direct_tirs:
 	mlx5e_destroy_direct_tirs(priv);
 err_destroy_direct_rqts:
@@ -1000,7 +994,6 @@ static void mlx5e_cleanup_rep_rx(struct mlx5e_priv *priv)
 {
 	struct mlx5e_rep_priv *rpriv = priv->ppriv;
 
-	mlx5e_tc_cleanup(priv);
 	mlx5_del_flow_rules(rpriv->vport_rx_rule);
 	mlx5e_destroy_direct_tirs(priv);
 	mlx5e_destroy_direct_rqts(priv);
@@ -1058,8 +1051,15 @@ mlx5e_nic_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 	if (err)
 		goto err_remove_sqs;
 
+	/* init shared tc flow table */
+	err = mlx5e_tc_esw_init(&rpriv->tc_ht);
+	if (err)
+		goto  err_neigh_cleanup;
+
 	return 0;
 
+err_neigh_cleanup:
+	mlx5e_rep_neigh_cleanup(rpriv);
 err_remove_sqs:
 	mlx5e_remove_sqs_fwd_rules(priv);
 	return err;
@@ -1074,9 +1074,8 @@ mlx5e_nic_rep_unload(struct mlx5_eswitch_rep *rep)
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state))
 		mlx5e_remove_sqs_fwd_rules(priv);
 
-	/* clean (and re-init) existing uplink offloaded TC rules */
-	mlx5e_tc_cleanup(priv);
-	mlx5e_tc_init(priv);
+	/* clean uplink offloaded TC rules, delete shared tc flow table */
+	mlx5e_tc_esw_cleanup(&rpriv->tc_ht);
 
 	mlx5e_rep_neigh_cleanup(rpriv);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
index b9b481f2833a..844d32d5c29f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
@@ -59,6 +59,7 @@ struct mlx5e_rep_priv {
 	struct net_device      *netdev;
 	struct mlx5_flow_handle *vport_rx_rule;
 	struct list_head       vport_sqs_list;
+	struct rhashtable      tc_ht; /* valid for uplink rep */
 };
 
 static inline
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 1c90586d7f58..05c90b4f8a31 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -76,6 +76,7 @@ enum {
 
 struct mlx5e_tc_flow {
 	struct rhash_head	node;
+	struct mlx5e_priv	*priv;
 	u64			cookie;
 	u8			flags;
 	struct mlx5_flow_handle *rule;
@@ -2643,7 +2644,14 @@ static const struct rhashtable_params tc_ht_params = {
 
 static struct rhashtable *get_tc_ht(struct mlx5e_priv *priv)
 {
-	return &priv->fs.tc.ht;
+	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+	struct mlx5e_rep_priv *uplink_rpriv;
+
+	if (MLX5_VPORT_MANAGER(priv->mdev) && esw->mode == SRIOV_OFFLOADS) {
+		uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
+		return &uplink_rpriv->tc_ht;
+	} else
+		return &priv->fs.tc.ht;
 }
 
 int mlx5e_configure_flower(struct mlx5e_priv *priv,
@@ -2675,6 +2683,7 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv,
 
 	flow->cookie = f->cookie;
 	flow->flags = flow_flags;
+	flow->priv = priv;
 
 	err = parse_cls_flower(priv, flow, &parse_attr->spec, f);
 	if (err < 0)
@@ -2766,7 +2775,7 @@ int mlx5e_stats_flower(struct mlx5e_priv *priv,
 	return 0;
 }
 
-int mlx5e_tc_init(struct mlx5e_priv *priv)
+int mlx5e_tc_nic_init(struct mlx5e_priv *priv)
 {
 	struct mlx5e_tc_table *tc = &priv->fs.tc;
 
@@ -2779,20 +2788,30 @@ int mlx5e_tc_init(struct mlx5e_priv *priv)
 static void _mlx5e_tc_del_flow(void *ptr, void *arg)
 {
 	struct mlx5e_tc_flow *flow = ptr;
-	struct mlx5e_priv *priv = arg;
+	struct mlx5e_priv *priv = flow->priv;
 
 	mlx5e_tc_del_flow(priv, flow);
 	kfree(flow);
 }
 
-void mlx5e_tc_cleanup(struct mlx5e_priv *priv)
+void mlx5e_tc_nic_cleanup(struct mlx5e_priv *priv)
 {
 	struct mlx5e_tc_table *tc = &priv->fs.tc;
 
-	rhashtable_free_and_destroy(&tc->ht, _mlx5e_tc_del_flow, priv);
+	rhashtable_free_and_destroy(&tc->ht, _mlx5e_tc_del_flow, NULL);
 
 	if (!IS_ERR_OR_NULL(tc->t)) {
 		mlx5_destroy_flow_table(tc->t);
 		tc->t = NULL;
 	}
 }
+
+int mlx5e_tc_esw_init(struct rhashtable *tc_ht)
+{
+	return rhashtable_init(tc_ht, &tc_ht_params);
+}
+
+void mlx5e_tc_esw_cleanup(struct rhashtable *tc_ht)
+{
+	rhashtable_free_and_destroy(tc_ht, _mlx5e_tc_del_flow, NULL);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
index 2255345c2e18..59e52b845beb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
@@ -45,8 +45,11 @@ enum {
 	MLX5E_TC_LAST_EXPORTED_BIT = 1,
 };
 
-int mlx5e_tc_init(struct mlx5e_priv *priv);
-void mlx5e_tc_cleanup(struct mlx5e_priv *priv);
+int mlx5e_tc_nic_init(struct mlx5e_priv *priv);
+void mlx5e_tc_nic_cleanup(struct mlx5e_priv *priv);
+
+int mlx5e_tc_esw_init(struct rhashtable *tc_ht);
+void mlx5e_tc_esw_cleanup(struct rhashtable *tc_ht);
 
 int mlx5e_configure_flower(struct mlx5e_priv *priv,
 			   struct tc_cls_flower_offload *f, int flags);
@@ -71,8 +74,8 @@ static inline int mlx5e_tc_num_filters(struct mlx5e_priv *priv)
 }
 
 #else /* CONFIG_MLX5_ESWITCH */
-static inline int  mlx5e_tc_init(struct mlx5e_priv *priv) { return 0; }
-static inline void mlx5e_tc_cleanup(struct mlx5e_priv *priv) {}
+static inline int  mlx5e_tc_nic_init(struct mlx5e_priv *priv) { return 0; }
+static inline void mlx5e_tc_nic_cleanup(struct mlx5e_priv *priv) {}
 static inline int  mlx5e_tc_num_filters(struct mlx5e_priv *priv) { return 0; }
 #endif
 
-- 
cgit v1.2.3-59-g8ed1b


From 8f8ae8953fb34ac01723f1dae5b231f64a3c526b Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Tue, 10 Apr 2018 19:24:51 +0300
Subject: net/mlx5e: Ignore attempts to offload multiple times a TC flow

For VF->VF and uplink->VF rules, the TC core (cls_api) attempts
to offload the same flow multiple times into the driver, b/c we
registered to the egdev callback.

Use the flow cookie to ignore attempts to add such flows, we can't
reject them (return error), b/c this will fail the offload attempt,
so we ignore that. We indentify wrong stat/del calls using the flow
ingress/egress flags, here we do return error to the core.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Paul Blakey <paulb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 05c90b4f8a31..674f1d7d2737 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -2666,6 +2666,12 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv,
 
 	get_flags(flags, &flow_flags);
 
+	flow = rhashtable_lookup_fast(tc_ht, &f->cookie, tc_ht_params);
+	if (flow) {
+		netdev_warn_once(priv->netdev, "flow cookie %lx already exists, ignoring\n", f->cookie);
+		return 0;
+	}
+
 	if (esw && esw->mode == SRIOV_OFFLOADS) {
 		flow_flags |= MLX5E_TC_FLOW_ESWITCH;
 		attr_size  = sizeof(struct mlx5_esw_flow_attr);
@@ -2728,6 +2734,17 @@ err_free:
 	return err;
 }
 
+#define DIRECTION_MASK (MLX5E_TC_INGRESS | MLX5E_TC_EGRESS)
+#define FLOW_DIRECTION_MASK (MLX5E_TC_FLOW_INGRESS | MLX5E_TC_FLOW_EGRESS)
+
+static bool same_flow_direction(struct mlx5e_tc_flow *flow, int flags)
+{
+	if ((flow->flags & FLOW_DIRECTION_MASK) == (flags & DIRECTION_MASK))
+		return true;
+
+	return false;
+}
+
 int mlx5e_delete_flower(struct mlx5e_priv *priv,
 			struct tc_cls_flower_offload *f, int flags)
 {
@@ -2735,7 +2752,7 @@ int mlx5e_delete_flower(struct mlx5e_priv *priv,
 	struct mlx5e_tc_flow *flow;
 
 	flow = rhashtable_lookup_fast(tc_ht, &f->cookie, tc_ht_params);
-	if (!flow)
+	if (!flow || !same_flow_direction(flow, flags))
 		return -EINVAL;
 
 	rhashtable_remove_fast(tc_ht, &flow->node, tc_ht_params);
@@ -2758,7 +2775,7 @@ int mlx5e_stats_flower(struct mlx5e_priv *priv,
 	u64 lastuse;
 
 	flow = rhashtable_lookup_fast(tc_ht, &f->cookie, tc_ht_params);
-	if (!flow)
+	if (!flow || !same_flow_direction(flow, flags))
 		return -EINVAL;
 
 	if (!(flow->flags & MLX5E_TC_FLOW_OFFLOADED))
-- 
cgit v1.2.3-59-g8ed1b


From a228060a7c9ab88597eeac131e4578595d5d46ae Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Mon, 30 Apr 2018 11:42:26 +0300
Subject: net/mlx5e: Add HW vport counters to representor ethtool stats

Currently the representor only report the SW (slow-path) traffic
counters.

Add packet/bytes reporting of the HW counters, which account for the
total amount of traffic that was handled by the vport, both slow and
fast (offloaded) paths. The newly exposed counters are named
vport_rx/tx_packets/bytes.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Adi Nissim <adin@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 35 ++++++++++++++++++++----
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index aa32592a54cb..c3034f58aa33 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -66,18 +66,36 @@ static const struct counter_desc sw_rep_stats_desc[] = {
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_bytes) },
 };
 
-#define NUM_VPORT_REP_COUNTERS	ARRAY_SIZE(sw_rep_stats_desc)
+struct vport_stats {
+	u64 vport_rx_packets;
+	u64 vport_tx_packets;
+	u64 vport_rx_bytes;
+	u64 vport_tx_bytes;
+};
+
+static const struct counter_desc vport_rep_stats_desc[] = {
+	{ MLX5E_DECLARE_STAT(struct vport_stats, vport_rx_packets) },
+	{ MLX5E_DECLARE_STAT(struct vport_stats, vport_rx_bytes) },
+	{ MLX5E_DECLARE_STAT(struct vport_stats, vport_tx_packets) },
+	{ MLX5E_DECLARE_STAT(struct vport_stats, vport_tx_bytes) },
+};
+
+#define NUM_VPORT_REP_SW_COUNTERS ARRAY_SIZE(sw_rep_stats_desc)
+#define NUM_VPORT_REP_HW_COUNTERS ARRAY_SIZE(vport_rep_stats_desc)
 
 static void mlx5e_rep_get_strings(struct net_device *dev,
 				  u32 stringset, uint8_t *data)
 {
-	int i;
+	int i, j;
 
 	switch (stringset) {
 	case ETH_SS_STATS:
-		for (i = 0; i < NUM_VPORT_REP_COUNTERS; i++)
+		for (i = 0; i < NUM_VPORT_REP_SW_COUNTERS; i++)
 			strcpy(data + (i * ETH_GSTRING_LEN),
 			       sw_rep_stats_desc[i].format);
+		for (j = 0; j < NUM_VPORT_REP_HW_COUNTERS; j++, i++)
+			strcpy(data + (i * ETH_GSTRING_LEN),
+			       vport_rep_stats_desc[j].format);
 		break;
 	}
 }
@@ -140,7 +158,7 @@ static void mlx5e_rep_get_ethtool_stats(struct net_device *dev,
 					struct ethtool_stats *stats, u64 *data)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
-	int i;
+	int i, j;
 
 	if (!data)
 		return;
@@ -148,18 +166,23 @@ static void mlx5e_rep_get_ethtool_stats(struct net_device *dev,
 	mutex_lock(&priv->state_lock);
 	if (test_bit(MLX5E_STATE_OPENED, &priv->state))
 		mlx5e_rep_update_sw_counters(priv);
+	mlx5e_rep_update_hw_counters(priv);
 	mutex_unlock(&priv->state_lock);
 
-	for (i = 0; i < NUM_VPORT_REP_COUNTERS; i++)
+	for (i = 0; i < NUM_VPORT_REP_SW_COUNTERS; i++)
 		data[i] = MLX5E_READ_CTR64_CPU(&priv->stats.sw,
 					       sw_rep_stats_desc, i);
+
+	for (j = 0; j < NUM_VPORT_REP_HW_COUNTERS; j++, i++)
+		data[i] = MLX5E_READ_CTR64_CPU(&priv->stats.vf_vport,
+					       vport_rep_stats_desc, j);
 }
 
 static int mlx5e_rep_get_sset_count(struct net_device *dev, int sset)
 {
 	switch (sset) {
 	case ETH_SS_STATS:
-		return NUM_VPORT_REP_COUNTERS;
+		return NUM_VPORT_REP_SW_COUNTERS + NUM_VPORT_REP_HW_COUNTERS;
 	default:
 		return -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 2b05393b063b53648b55ddba751d1cdcd6462a6f Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Tue, 3 Apr 2018 16:40:20 +0200
Subject: Bluetooth: hci_bcm: Add broken-irq dmi blacklist and add Meegopad T08
 to it

The Meegopad T08 hdmi-stick (think Intel computestick) has a brcm43430
wifi/bt combo chip. The BCM2E90 ACPI device describing the BT part does
contain a valid ActiveLow GpioInt entry, but the GPIO it points to never
goes low, so either the IRQ pin is not connected, or the ACPI resource-
table points to the wrong GPIO.

Eitherway things will not work if we try to use the specified IRQ, this
commits adds a DMI based broken-irq blacklist and disables use of the IRQ
and thus also runtime-pm for devices on this list.

This blacklist starts with the the Meegopad T08, fixing bluetooth not
working on this hdmi-stick. Since this is not a battery powered device
the loss of runtime-pm is not really an issue.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/hci_bcm.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/drivers/bluetooth/hci_bcm.c b/drivers/bluetooth/hci_bcm.c
index 441f5e1deb11..4dccb50089d7 100644
--- a/drivers/bluetooth/hci_bcm.c
+++ b/drivers/bluetooth/hci_bcm.c
@@ -794,6 +794,20 @@ static const struct acpi_gpio_mapping acpi_bcm_int_first_gpios[] = {
 	{ },
 };
 
+/* Some firmware reports an IRQ which does not work (wrong pin in fw table?) */
+static const struct dmi_system_id bcm_broken_irq_dmi_table[] = {
+	{
+		.ident = "Meegopad T08",
+		.matches = {
+			DMI_EXACT_MATCH(DMI_BOARD_VENDOR,
+					"To be filled by OEM."),
+			DMI_EXACT_MATCH(DMI_BOARD_NAME, "T3 MRD"),
+			DMI_EXACT_MATCH(DMI_BOARD_VERSION, "V1.1"),
+		},
+	},
+	{ }
+};
+
 #ifdef CONFIG_ACPI
 /* IRQ polarity of some chipsets are not defined correctly in ACPI table. */
 static const struct dmi_system_id bcm_active_low_irq_dmi_table[] = {
@@ -904,6 +918,8 @@ static int bcm_gpio_set_shutdown(struct bcm_device *dev, bool powered)
 
 static int bcm_get_resources(struct bcm_device *dev)
 {
+	const struct dmi_system_id *dmi_id;
+
 	dev->name = dev_name(dev->dev);
 
 	if (x86_apple_machine && !bcm_apple_get_resources(dev))
@@ -936,6 +952,13 @@ static int bcm_get_resources(struct bcm_device *dev)
 		dev->irq = gpiod_to_irq(gpio);
 	}
 
+	dmi_id = dmi_first_match(bcm_broken_irq_dmi_table);
+	if (dmi_id) {
+		dev_info(dev->dev, "%s: Has a broken IRQ config, disabling IRQ support / runtime-pm\n",
+			 dmi_id->ident);
+		dev->irq = 0;
+	}
+
 	dev_dbg(dev->dev, "BCM irq: %d\n", dev->irq);
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From e6ba8208a4b40c0175c8c5e48462dfbfb189c13d Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Tue, 3 Apr 2018 17:35:21 +0200
Subject: Bluetooth: hci_bcm: Remove irq-active-low DMI quirk for the Thinkpad
 8

Interrupts specified through an "Interrupt" ACPI resource (versus through
a "GpioInt" resource) are now always assumed to be active low.

When this change was originally made the Thinkpad 8 quirk was kept around
because it was uncertain if the Thinkpad 8 uses an "Interrupt" or a
"GpioInt" resource.

Bug https://bugzilla.kernel.org/show_bug.cgi?id=196701 has a DSDT for the
Thinkpad 8 attached and it uses an "Interrupt" resource, so the quirk is
not necessary and the quirk, as well as the irq-active-low quirk handling
code can be removed.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/hci_bcm.c | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/drivers/bluetooth/hci_bcm.c b/drivers/bluetooth/hci_bcm.c
index 4dccb50089d7..04c4b93b4c6f 100644
--- a/drivers/bluetooth/hci_bcm.c
+++ b/drivers/bluetooth/hci_bcm.c
@@ -809,18 +809,6 @@ static const struct dmi_system_id bcm_broken_irq_dmi_table[] = {
 };
 
 #ifdef CONFIG_ACPI
-/* IRQ polarity of some chipsets are not defined correctly in ACPI table. */
-static const struct dmi_system_id bcm_active_low_irq_dmi_table[] = {
-	{	/* Handle ThinkPad 8 tablets with BCM2E55 chipset ACPI ID */
-		.ident = "Lenovo ThinkPad 8",
-		.matches = {
-			DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"),
-			DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "ThinkPad 8"),
-		},
-	},
-	{ }
-};
-
 static int bcm_resource(struct acpi_resource *ares, void *data)
 {
 	struct bcm_device *dev = data;
@@ -967,7 +955,6 @@ static int bcm_get_resources(struct bcm_device *dev)
 static int bcm_acpi_probe(struct bcm_device *dev)
 {
 	LIST_HEAD(resources);
-	const struct dmi_system_id *dmi_id;
 	const struct acpi_gpio_mapping *gpio_mapping = acpi_bcm_int_last_gpios;
 	struct resource_entry *entry;
 	int ret;
@@ -1014,13 +1001,6 @@ static int bcm_acpi_probe(struct bcm_device *dev)
 		dev->irq_active_low = irq_polarity;
 		dev_warn(dev->dev, "Overwriting IRQ polarity to active %s by module-param\n",
 			 dev->irq_active_low ? "low" : "high");
-	} else {
-		dmi_id = dmi_first_match(bcm_active_low_irq_dmi_table);
-		if (dmi_id) {
-			dev_warn(dev->dev, "%s: Overwriting IRQ polarity to active low",
-				 dmi_id->ident);
-			dev->irq_active_low = true;
-		}
 	}
 
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From 61a1ecfc8037e7eba9577a931cb8e1e92b636926 Mon Sep 17 00:00:00 2001
From: Loic Poulain <loic.poulain@linaro.org>
Date: Fri, 6 Apr 2018 11:23:45 +0200
Subject: Bluetooth: btqcomsmd: Fix rx/tx stats

HCI RX/TX byte counters were only incremented when sending ACL packets.
To reflect the real HCI traffic, we need to increment these counters on
HCI events and HCI commands as well.

Increment error counter on rpmsg errors.

Signed-off-by: Loic Poulain <loic.poulain@linaro.org>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/btqcomsmd.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/bluetooth/btqcomsmd.c b/drivers/bluetooth/btqcomsmd.c
index 2c9a5fc9137d..7df3eed1ef5e 100644
--- a/drivers/bluetooth/btqcomsmd.c
+++ b/drivers/bluetooth/btqcomsmd.c
@@ -65,6 +65,7 @@ static int btqcomsmd_cmd_callback(struct rpmsg_device *rpdev, void *data,
 {
 	struct btqcomsmd *btq = priv;
 
+	btq->hdev->stat.byte_rx += count;
 	return btqcomsmd_recv(btq->hdev, HCI_EVENT_PKT, data, count);
 }
 
@@ -76,12 +77,21 @@ static int btqcomsmd_send(struct hci_dev *hdev, struct sk_buff *skb)
 	switch (hci_skb_pkt_type(skb)) {
 	case HCI_ACLDATA_PKT:
 		ret = rpmsg_send(btq->acl_channel, skb->data, skb->len);
+		if (ret) {
+			hdev->stat.err_tx++;
+			break;
+		}
 		hdev->stat.acl_tx++;
 		hdev->stat.byte_tx += skb->len;
 		break;
 	case HCI_COMMAND_PKT:
 		ret = rpmsg_send(btq->cmd_channel, skb->data, skb->len);
+		if (ret) {
+			hdev->stat.err_tx++;
+			break;
+		}
 		hdev->stat.cmd_tx++;
+		hdev->stat.byte_tx += skb->len;
 		break;
 	default:
 		ret = -EILSEQ;
-- 
cgit v1.2.3-59-g8ed1b


From 7dc5fe0814c35ec4e7d2e8fa30abab72e0e6a172 Mon Sep 17 00:00:00 2001
From: Amit Pundir <amit.pundir@linaro.org>
Date: Mon, 16 Apr 2018 12:10:24 +0530
Subject: Bluetooth: hci_qca: Avoid missing rampatch failure with userspace fw
 loader

AOSP use userspace firmware loader to load firmwares, which will
return -EAGAIN in case qca/rampatch_00440302.bin is not found.
Since there is no rampatch for dragonboard820c QCA controller
revision, just make it work as is.

CC: Loic Poulain <loic.poulain@linaro.org>
CC: Nicolas Dechesne <nicolas.dechesne@linaro.org>
CC: Marcel Holtmann <marcel@holtmann.org>
CC: Johan Hedberg <johan.hedberg@gmail.com>
CC: Stable <stable@vger.kernel.org>
Signed-off-by: Amit Pundir <amit.pundir@linaro.org>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/hci_qca.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c
index 05ec530b8a3a..330e9b29e145 100644
--- a/drivers/bluetooth/hci_qca.c
+++ b/drivers/bluetooth/hci_qca.c
@@ -935,6 +935,12 @@ static int qca_setup(struct hci_uart *hu)
 	} else if (ret == -ENOENT) {
 		/* No patch/nvm-config found, run with original fw/config */
 		ret = 0;
+	} else if (ret == -EAGAIN) {
+		/*
+		 * Userspace firmware loader will return -EAGAIN in case no
+		 * patch/nvm-config is found, so run with original fw/config.
+		 */
+		ret = 0;
 	}
 
 	/* Setup bdaddr */
-- 
cgit v1.2.3-59-g8ed1b


From c2e50c3ea55a6c75a807aee010ed3b27635913b1 Mon Sep 17 00:00:00 2001
From: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Date: Thu, 29 Mar 2018 21:15:21 +0200
Subject: arm64: dts: apq8096-db820c: Enable wlan and bt en pins

This patch enables regulators and gpios for the Qualcomm QCA6174 BT/WLAN
combo controller.

Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Signed-off-by: Thierry Escande <thierry.escande@linaro.org>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 .../boot/dts/qcom/apq8096-db820c-pmic-pins.dtsi    | 22 +++++++++++++++
 arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi       | 31 ++++++++++++++++++++++
 2 files changed, 53 insertions(+)

diff --git a/arch/arm64/boot/dts/qcom/apq8096-db820c-pmic-pins.dtsi b/arch/arm64/boot/dts/qcom/apq8096-db820c-pmic-pins.dtsi
index 59b29ddfb6e9..5d50f45c4df7 100644
--- a/arch/arm64/boot/dts/qcom/apq8096-db820c-pmic-pins.dtsi
+++ b/arch/arm64/boot/dts/qcom/apq8096-db820c-pmic-pins.dtsi
@@ -14,6 +14,28 @@
 		};
 	};
 
+	bt_en_gpios: bt_en_gpios {
+		pinconf {
+			pins = "gpio19";
+			function = PMIC_GPIO_FUNC_NORMAL;
+			output-low;
+			power-source = <PM8994_GPIO_S4>; // 1.8V
+			qcom,drive-strength = <PMIC_GPIO_STRENGTH_LOW>;
+			bias-pull-down;
+		};
+	};
+
+	wlan_en_gpios: wlan_en_gpios {
+		pinconf {
+			pins = "gpio8";
+			function = PMIC_GPIO_FUNC_NORMAL;
+			output-low;
+			power-source = <PM8994_GPIO_S4>; // 1.8V
+			qcom,drive-strength = <PMIC_GPIO_STRENGTH_LOW>;
+			bias-pull-down;
+		};
+	};
+
 	volume_up_gpio: pm8996_gpio2 {
 		pinconf {
 			pins = "gpio2";
diff --git a/arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi b/arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi
index 1c8f1b86472d..ec5e6eee0a7a 100644
--- a/arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi
+++ b/arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi
@@ -139,9 +139,40 @@
 			pinctrl-0 = <&usb2_vbus_det_gpio>;
 		};
 
+		bt_en: bt-en-1-8v {
+			pinctrl-names = "default";
+			pinctrl-0 = <&bt_en_gpios>;
+			compatible = "regulator-fixed";
+			regulator-name = "bt-en-regulator";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <1800000>;
+
+			/* WLAN card specific delay */
+			startup-delay-us = <70000>;
+			enable-active-high;
+		};
+
+		wlan_en: wlan-en-1-8v {
+			pinctrl-names = "default";
+			pinctrl-0 = <&wlan_en_gpios>;
+			compatible = "regulator-fixed";
+			regulator-name = "wlan-en-regulator";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <1800000>;
+
+			gpio = <&pm8994_gpios 8 0>;
+
+			/* WLAN card specific delay */
+			startup-delay-us = <70000>;
+			enable-active-high;
+		};
+
 		agnoc@0 {
 			qcom,pcie@600000 {
+				status = "okay";
 				perst-gpio = <&msmgpio 35 GPIO_ACTIVE_LOW>;
+				vddpe-supply = <&wlan_en>;
+				vddpe1-supply = <&bt_en>;
 			};
 
 			qcom,pcie@608000 {
-- 
cgit v1.2.3-59-g8ed1b


From 58abab5607586b0d440a291873f124b9dc4a25d3 Mon Sep 17 00:00:00 2001
From: Thierry Escande <thierry.escande@linaro.org>
Date: Thu, 29 Mar 2018 21:15:22 +0200
Subject: arm64: dts: apq8096-db820c: enable bluetooth node

Add a new serial node for the Qualcomm BT controller QCA6174. This
allows automatic probing and hci registration through the serdev
framework instead of relying on the userspace helpers.

Signed-off-by: Thierry Escande <thierry.escande@linaro.org>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 arch/arm64/boot/dts/qcom/apq8096-db820c-pins.dtsi  | 26 +++++++++++++++++++
 .../boot/dts/qcom/apq8096-db820c-pmic-pins.dtsi    | 10 ++++++++
 arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi       | 30 ++++++++++++++++++++++
 arch/arm64/boot/dts/qcom/msm8996.dtsi              | 10 ++++++++
 4 files changed, 76 insertions(+)

diff --git a/arch/arm64/boot/dts/qcom/apq8096-db820c-pins.dtsi b/arch/arm64/boot/dts/qcom/apq8096-db820c-pins.dtsi
index 24552f19b3fa..6a573875d45a 100644
--- a/arch/arm64/boot/dts/qcom/apq8096-db820c-pins.dtsi
+++ b/arch/arm64/boot/dts/qcom/apq8096-db820c-pins.dtsi
@@ -36,4 +36,30 @@
 			drive-strength = <2>;	/* 2 MA */
 		};
 	};
+
+	blsp1_uart1_default: blsp1_uart1_default {
+		mux {
+			pins = "gpio41", "gpio42", "gpio43", "gpio44";
+			function = "blsp_uart2";
+		};
+
+		config {
+			pins = "gpio41", "gpio42", "gpio43", "gpio44";
+			drive-strength = <16>;
+			bias-disable;
+		};
+	};
+
+	blsp1_uart1_sleep: blsp1_uart1_sleep {
+		mux {
+			pins = "gpio41", "gpio42", "gpio43", "gpio44";
+			function = "gpio";
+		};
+
+		config {
+			pins = "gpio41", "gpio42", "gpio43", "gpio44";
+			drive-strength = <2>;
+			bias-disable;
+		};
+	};
 };
diff --git a/arch/arm64/boot/dts/qcom/apq8096-db820c-pmic-pins.dtsi b/arch/arm64/boot/dts/qcom/apq8096-db820c-pmic-pins.dtsi
index 5d50f45c4df7..6167af955659 100644
--- a/arch/arm64/boot/dts/qcom/apq8096-db820c-pmic-pins.dtsi
+++ b/arch/arm64/boot/dts/qcom/apq8096-db820c-pmic-pins.dtsi
@@ -48,6 +48,16 @@
 		};
 	};
 
+	divclk4_pin_a: divclk4 {
+		pinconf {
+			pins = "gpio18";
+			function = PMIC_GPIO_FUNC_FUNC2;
+
+			bias-disable;
+			power-source = <PM8994_GPIO_S4>;
+		};
+	};
+
 	usb3_vbus_det_gpio: pm8996_gpio22 {
 		pinconf {
 			pins = "gpio22";
diff --git a/arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi b/arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi
index ec5e6eee0a7a..4b8bb026346e 100644
--- a/arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi
+++ b/arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi
@@ -23,6 +23,7 @@
 	aliases {
 		serial0 = &blsp2_uart1;
 		serial1 = &blsp2_uart2;
+		serial2 = &blsp1_uart1;
 		i2c0	= &blsp1_i2c2;
 		i2c1	= &blsp2_i2c1;
 		i2c2	= &blsp2_i2c0;
@@ -34,7 +35,36 @@
 		stdout-path = "serial0:115200n8";
 	};
 
+	clocks {
+		divclk4: divclk4 {
+			compatible = "fixed-clock";
+			#clock-cells = <0>;
+			clock-frequency = <32768>;
+			clock-output-names = "divclk4";
+
+			pinctrl-names = "default";
+			pinctrl-0 = <&divclk4_pin_a>;
+		};
+	};
+
 	soc {
+		serial@7570000 {
+			label = "BT-UART";
+			status = "okay";
+			pinctrl-names = "default", "sleep";
+			pinctrl-0 = <&blsp1_uart1_default>;
+			pinctrl-1 = <&blsp1_uart1_sleep>;
+
+			bluetooth {
+				compatible = "qcom,qca6174-bt";
+
+				/* bt_disable_n gpio */
+				enable-gpios = <&pm8994_gpios 19 GPIO_ACTIVE_HIGH>;
+
+				clocks = <&divclk4>;
+			};
+		};
+
 		serial@75b0000 {
 			label = "LS-UART1";
 			status = "okay";
diff --git a/arch/arm64/boot/dts/qcom/msm8996.dtsi b/arch/arm64/boot/dts/qcom/msm8996.dtsi
index 410ae787ebb4..f8e49d0b4681 100644
--- a/arch/arm64/boot/dts/qcom/msm8996.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8996.dtsi
@@ -419,6 +419,16 @@
 			#clock-cells = <1>;
 		};
 
+		blsp1_uart1: serial@7570000 {
+			compatible = "qcom,msm-uartdm-v1.4", "qcom,msm-uartdm";
+			reg = <0x07570000 0x1000>;
+			interrupts = <GIC_SPI 108 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&gcc GCC_BLSP1_UART2_APPS_CLK>,
+				 <&gcc GCC_BLSP1_AHB_CLK>;
+			clock-names = "core", "iface";
+			status = "disabled";
+		};
+
 		blsp1_spi0: spi@7575000 {
 			compatible = "qcom,spi-qup-v2.2.1";
 			reg = <0x07575000 0x600>;
-- 
cgit v1.2.3-59-g8ed1b


From 52b0900f6fb4dfd92bdf1db3e8e380f572549272 Mon Sep 17 00:00:00 2001
From: Thierry Escande <thierry.escande@linaro.org>
Date: Thu, 29 Mar 2018 21:15:23 +0200
Subject: dt-bindings: net: bluetooth: Add qualcomm-bluetooth

Add binding document for serial bluetooth chips using Qualcomm protocol.

Signed-off-by: Thierry Escande <thierry.escande@linaro.org>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 .../devicetree/bindings/net/qualcomm-bluetooth.txt | 30 ++++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt

diff --git a/Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt b/Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt
new file mode 100644
index 000000000000..0ea18a53cc29
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt
@@ -0,0 +1,30 @@
+Qualcomm Bluetooth Chips
+---------------------
+
+This documents the binding structure and common properties for serial
+attached Qualcomm devices.
+
+Serial attached Qualcomm devices shall be a child node of the host UART
+device the slave device is attached to.
+
+Required properties:
+ - compatible: should contain one of the following:
+   * "qcom,qca6174-bt"
+
+Optional properties:
+ - enable-gpios: gpio specifier used to enable chip
+ - clocks: clock provided to the controller (SUSCLK_32KHZ)
+
+Example:
+
+serial@7570000 {
+	label = "BT-UART";
+	status = "okay";
+
+	bluetooth {
+		compatible = "qcom,qca6174-bt";
+
+		enable-gpios = <&pm8994_gpios 19 GPIO_ACTIVE_HIGH>;
+		clocks = <&divclk4>;
+	};
+};
-- 
cgit v1.2.3-59-g8ed1b


From 05ba533c5c1155839cf1538085b67488f1d3d308 Mon Sep 17 00:00:00 2001
From: Thierry Escande <thierry.escande@linaro.org>
Date: Thu, 29 Mar 2018 21:15:24 +0200
Subject: Bluetooth: hci_qca: Add serdev support

Add support for Qualcomm serial slave devices. Probe the serial device,
retrieve its maximum speed and register a new hci uart device.

Signed-off-by: Thierry Escande <thierry.escande@linaro.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/Kconfig   |   1 +
 drivers/bluetooth/hci_qca.c | 110 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 109 insertions(+), 2 deletions(-)

diff --git a/drivers/bluetooth/Kconfig b/drivers/bluetooth/Kconfig
index 010f5f579e68..f3c643a0473c 100644
--- a/drivers/bluetooth/Kconfig
+++ b/drivers/bluetooth/Kconfig
@@ -197,6 +197,7 @@ config BT_HCIUART_BCM
 config BT_HCIUART_QCA
 	bool "Qualcomm Atheros protocol support"
 	depends on BT_HCIUART
+	depends on BT_HCIUART_SERDEV
 	select BT_HCIUART_H4
 	select BT_QCA
 	help
diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c
index 330e9b29e145..f05382b5a65d 100644
--- a/drivers/bluetooth/hci_qca.c
+++ b/drivers/bluetooth/hci_qca.c
@@ -29,7 +29,12 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/clk.h>
 #include <linux/debugfs.h>
+#include <linux/gpio/consumer.h>
+#include <linux/mod_devicetable.h>
+#include <linux/module.h>
+#include <linux/serdev.h>
 
 #include <net/bluetooth/bluetooth.h>
 #include <net/bluetooth/hci_core.h>
@@ -50,6 +55,9 @@
 #define IBS_TX_IDLE_TIMEOUT_MS		2000
 #define BAUDRATE_SETTLE_TIMEOUT_MS	300
 
+/* susclk rate */
+#define SUSCLK_RATE_32KHZ	32768
+
 /* HCI_IBS transmit side sleep protocol states */
 enum tx_ibs_states {
 	HCI_IBS_TX_ASLEEP,
@@ -111,6 +119,12 @@ struct qca_data {
 	u64 votes_off;
 };
 
+struct qca_serdev {
+	struct hci_uart	 serdev_hu;
+	struct gpio_desc *bt_en;
+	struct clk	 *susclk;
+};
+
 static void __serial_clock_on(struct tty_struct *tty)
 {
 	/* TODO: Some chipset requires to enable UART clock on client
@@ -386,6 +400,7 @@ static void hci_ibs_wake_retrans_timeout(struct timer_list *t)
 /* Initialize protocol */
 static int qca_open(struct hci_uart *hu)
 {
+	struct qca_serdev *qcadev;
 	struct qca_data *qca;
 
 	BT_DBG("hu %p qca_open", hu);
@@ -444,6 +459,13 @@ static int qca_open(struct hci_uart *hu)
 	timer_setup(&qca->tx_idle_timer, hci_ibs_tx_idle_timeout, 0);
 	qca->tx_idle_delay = IBS_TX_IDLE_TIMEOUT_MS;
 
+	if (hu->serdev) {
+		serdev_device_open(hu->serdev);
+
+		qcadev = serdev_device_get_drvdata(hu->serdev);
+		gpiod_set_value_cansleep(qcadev->bt_en, 1);
+	}
+
 	BT_DBG("HCI_UART_QCA open, tx_idle_delay=%u, wake_retrans=%u",
 	       qca->tx_idle_delay, qca->wake_retrans);
 
@@ -512,6 +534,7 @@ static int qca_flush(struct hci_uart *hu)
 /* Close protocol */
 static int qca_close(struct hci_uart *hu)
 {
+	struct qca_serdev *qcadev;
 	struct qca_data *qca = hu->priv;
 
 	BT_DBG("hu %p qca close", hu);
@@ -525,6 +548,13 @@ static int qca_close(struct hci_uart *hu)
 	destroy_workqueue(qca->workqueue);
 	qca->hu = NULL;
 
+	if (hu->serdev) {
+		serdev_device_close(hu->serdev);
+
+		qcadev = serdev_device_get_drvdata(hu->serdev);
+		gpiod_set_value_cansleep(qcadev->bt_en, 0);
+	}
+
 	kfree_skb(qca->rx_skb);
 
 	hu->priv = NULL;
@@ -885,6 +915,14 @@ static int qca_set_baudrate(struct hci_dev *hdev, uint8_t baudrate)
 	return 0;
 }
 
+static inline void host_set_baudrate(struct hci_uart *hu, unsigned int speed)
+{
+	if (hu->serdev)
+		serdev_device_set_baudrate(hu->serdev, speed);
+	else
+		hci_uart_set_baudrate(hu, speed);
+}
+
 static int qca_setup(struct hci_uart *hu)
 {
 	struct hci_dev *hdev = hu->hdev;
@@ -905,7 +943,7 @@ static int qca_setup(struct hci_uart *hu)
 		speed = hu->proto->init_speed;
 
 	if (speed)
-		hci_uart_set_baudrate(hu, speed);
+		host_set_baudrate(hu, speed);
 
 	/* Setup user speed if needed */
 	speed = 0;
@@ -924,7 +962,7 @@ static int qca_setup(struct hci_uart *hu)
 				   ret);
 			return ret;
 		}
-		hci_uart_set_baudrate(hu, speed);
+		host_set_baudrate(hu, speed);
 	}
 
 	/* Setup patch / NVM configurations */
@@ -964,12 +1002,80 @@ static struct hci_uart_proto qca_proto = {
 	.dequeue	= qca_dequeue,
 };
 
+static int qca_serdev_probe(struct serdev_device *serdev)
+{
+	struct qca_serdev *qcadev;
+	int err;
+
+	qcadev = devm_kzalloc(&serdev->dev, sizeof(*qcadev), GFP_KERNEL);
+	if (!qcadev)
+		return -ENOMEM;
+
+	qcadev->serdev_hu.serdev = serdev;
+	serdev_device_set_drvdata(serdev, qcadev);
+
+	qcadev->bt_en = devm_gpiod_get(&serdev->dev, "enable",
+				       GPIOD_OUT_LOW);
+	if (IS_ERR(qcadev->bt_en)) {
+		dev_err(&serdev->dev, "failed to acquire enable gpio\n");
+		return PTR_ERR(qcadev->bt_en);
+	}
+
+	qcadev->susclk = devm_clk_get(&serdev->dev, NULL);
+	if (IS_ERR(qcadev->susclk)) {
+		dev_err(&serdev->dev, "failed to acquire clk\n");
+		return PTR_ERR(qcadev->susclk);
+	}
+
+	err = clk_set_rate(qcadev->susclk, SUSCLK_RATE_32KHZ);
+	if (err)
+		return err;
+
+	err = clk_prepare_enable(qcadev->susclk);
+	if (err)
+		return err;
+
+	err = hci_uart_register_device(&qcadev->serdev_hu, &qca_proto);
+	if (err)
+		clk_disable_unprepare(qcadev->susclk);
+
+	return err;
+}
+
+static void qca_serdev_remove(struct serdev_device *serdev)
+{
+	struct qca_serdev *qcadev = serdev_device_get_drvdata(serdev);
+
+	hci_uart_unregister_device(&qcadev->serdev_hu);
+
+	clk_disable_unprepare(qcadev->susclk);
+}
+
+static const struct of_device_id qca_bluetooth_of_match[] = {
+	{ .compatible = "qcom,qca6174-bt" },
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, qca_bluetooth_of_match);
+
+static struct serdev_device_driver qca_serdev_driver = {
+	.probe = qca_serdev_probe,
+	.remove = qca_serdev_remove,
+	.driver = {
+		.name = "hci_uart_qca",
+		.of_match_table = qca_bluetooth_of_match,
+	},
+};
+
 int __init qca_init(void)
 {
+	serdev_device_driver_register(&qca_serdev_driver);
+
 	return hci_uart_register_proto(&qca_proto);
 }
 
 int __exit qca_deinit(void)
 {
+	serdev_device_driver_unregister(&qca_serdev_driver);
+
 	return hci_uart_unregister_proto(&qca_proto);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 1b2525c05bf9c661f6631e598b3bef5ce810085a Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Fri, 20 Apr 2018 14:44:02 +0200
Subject: Bluetooth: btbcm: Stop using upper nibble of rev to chose between
 uart/USB paths

btbcm_setup_patchram() was using the upper nibble of the revision code to
determine if we are dealing with an uart or USB connected bcm-bt device,
but just as btbcm_initialize() has started accepting 1 and 2 as uart
connected devices, I've now encountered an USB connected device (0a5c:216c)
which has 0 in the upper nibble. So it seems that the upper nibble is not
really a reliable indicator of the bus type.

Instead check hdev->bus which does give us a reliable indication. This
fixes the patchram code trying to load the patchram by the fallback BCM.hcd
filename, now it correctly requests BCM43142A0-0a5c-216c.hcd.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/btbcm.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/bluetooth/btbcm.c b/drivers/bluetooth/btbcm.c
index 6659f113042c..f44ddc0a6b25 100644
--- a/drivers/bluetooth/btbcm.c
+++ b/drivers/bluetooth/btbcm.c
@@ -465,9 +465,11 @@ int btbcm_setup_patchram(struct hci_dev *hdev)
 	if (err)
 		return err;
 
-	switch ((rev & 0xf000) >> 12) {
-	case 0:
-	case 3:
+	/* Upper nibble of rev should be between 0 and 3? */
+	if (((rev & 0xf000) >> 12) > 3)
+		return 0;
+
+	if (hdev->bus != HCI_USB) {
 		for (i = 0; bcm_uart_subver_table[i].name; i++) {
 			if (subver == bcm_uart_subver_table[i].subver) {
 				hw_name = bcm_uart_subver_table[i].name;
@@ -477,9 +479,7 @@ int btbcm_setup_patchram(struct hci_dev *hdev)
 
 		snprintf(fw_name, sizeof(fw_name), "brcm/%s.hcd",
 			 hw_name ? : "BCM");
-		break;
-	case 1:
-	case 2:
+	} else {
 		/* Read USB Product Info */
 		skb = btbcm_read_usb_product(hdev);
 		if (IS_ERR(skb))
@@ -498,9 +498,6 @@ int btbcm_setup_patchram(struct hci_dev *hdev)
 
 		snprintf(fw_name, sizeof(fw_name), "brcm/%s-%4.4x-%4.4x.hcd",
 			 hw_name ? : "BCM", vid, pid);
-		break;
-	default:
-		return 0;
 	}
 
 	bt_dev_info(hdev, "%s (%3.3u.%3.3u.%3.3u) build %4.4u",
-- 
cgit v1.2.3-59-g8ed1b


From f7328381bf64137e958d9fd0e070752d1366e91b Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Fri, 20 Apr 2018 14:44:03 +0200
Subject: Bluetooth: btbcm: Factor out common code to determine subversion

We are using the same loop in both the UART and USB bus cases, refactor
things a bit to share the loop.

This is mostly meant to improve readability.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/btbcm.c | 40 ++++++++++++++++++----------------------
 1 file changed, 18 insertions(+), 22 deletions(-)

diff --git a/drivers/bluetooth/btbcm.c b/drivers/bluetooth/btbcm.c
index f44ddc0a6b25..e3393bceb1a0 100644
--- a/drivers/bluetooth/btbcm.c
+++ b/drivers/bluetooth/btbcm.c
@@ -315,10 +315,12 @@ static int btbcm_read_info(struct hci_dev *hdev)
 	return 0;
 }
 
-static const struct {
+struct bcm_subver_table {
 	u16 subver;
 	const char *name;
-} bcm_uart_subver_table[] = {
+};
+
+static const struct bcm_subver_table bcm_uart_subver_table[] = {
 	{ 0x4103, "BCM4330B1"	},	/* 002.001.003 */
 	{ 0x410e, "BCM43341B0"	},	/* 002.001.014 */
 	{ 0x4406, "BCM4324B3"	},	/* 002.004.006 */
@@ -418,10 +420,7 @@ int btbcm_finalize(struct hci_dev *hdev)
 }
 EXPORT_SYMBOL_GPL(btbcm_finalize);
 
-static const struct {
-	u16 subver;
-	const char *name;
-} bcm_usb_subver_table[] = {
+static const struct bcm_subver_table bcm_usb_subver_table[] = {
 	{ 0x210b, "BCM43142A0"	},	/* 001.001.011 */
 	{ 0x2112, "BCM4314A0"	},	/* 001.001.018 */
 	{ 0x2118, "BCM20702A0"	},	/* 001.001.024 */
@@ -443,6 +442,7 @@ int btbcm_setup_patchram(struct hci_dev *hdev)
 	const char *hw_name = NULL;
 	struct sk_buff *skb;
 	struct hci_rp_read_local_version *ver;
+	const struct bcm_subver_table *bcm_subver_table;
 	int i, err;
 
 	/* Reset */
@@ -469,17 +469,17 @@ int btbcm_setup_patchram(struct hci_dev *hdev)
 	if (((rev & 0xf000) >> 12) > 3)
 		return 0;
 
-	if (hdev->bus != HCI_USB) {
-		for (i = 0; bcm_uart_subver_table[i].name; i++) {
-			if (subver == bcm_uart_subver_table[i].subver) {
-				hw_name = bcm_uart_subver_table[i].name;
-				break;
-			}
+	bcm_subver_table = (hdev->bus == HCI_USB) ? bcm_usb_subver_table :
+						    bcm_uart_subver_table;
+
+	for (i = 0; bcm_subver_table[i].name; i++) {
+		if (subver == bcm_subver_table[i].subver) {
+			hw_name = bcm_subver_table[i].name;
+			break;
 		}
+	}
 
-		snprintf(fw_name, sizeof(fw_name), "brcm/%s.hcd",
-			 hw_name ? : "BCM");
-	} else {
+	if (hdev->bus == HCI_USB) {
 		/* Read USB Product Info */
 		skb = btbcm_read_usb_product(hdev);
 		if (IS_ERR(skb))
@@ -489,15 +489,11 @@ int btbcm_setup_patchram(struct hci_dev *hdev)
 		pid = get_unaligned_le16(skb->data + 3);
 		kfree_skb(skb);
 
-		for (i = 0; bcm_usb_subver_table[i].name; i++) {
-			if (subver == bcm_usb_subver_table[i].subver) {
-				hw_name = bcm_usb_subver_table[i].name;
-				break;
-			}
-		}
-
 		snprintf(fw_name, sizeof(fw_name), "brcm/%s-%4.4x-%4.4x.hcd",
 			 hw_name ? : "BCM", vid, pid);
+	} else {
+		snprintf(fw_name, sizeof(fw_name), "brcm/%s.hcd",
+			 hw_name ? : "BCM");
 	}
 
 	bt_dev_info(hdev, "%s (%3.3u.%3.3u.%3.3u) build %4.4u",
-- 
cgit v1.2.3-59-g8ed1b


From fec0de23ea11429478fa2c24ace5cf4cc9cd2191 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Fri, 20 Apr 2018 14:44:04 +0200
Subject: Bluetooth: btbcm: Make btbcm_initialize() also work for USB connected
 devices

Make btbcm_initialize() also work for USB connected device,
btbcm_initialize() and btbcm_setup_patchram() are quite similar,
this is a preparation patch for making btbcm_setup_patchram() use
btbcm_initialize() to remove the code duplication.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/btbcm.c | 72 ++++++++++++++++++++++++++++-------------------
 1 file changed, 43 insertions(+), 29 deletions(-)

diff --git a/drivers/bluetooth/btbcm.c b/drivers/bluetooth/btbcm.c
index e3393bceb1a0..dd243afe32de 100644
--- a/drivers/bluetooth/btbcm.c
+++ b/drivers/bluetooth/btbcm.c
@@ -332,12 +332,27 @@ static const struct bcm_subver_table bcm_uart_subver_table[] = {
 	{ }
 };
 
+static const struct bcm_subver_table bcm_usb_subver_table[] = {
+	{ 0x210b, "BCM43142A0"	},	/* 001.001.011 */
+	{ 0x2112, "BCM4314A0"	},	/* 001.001.018 */
+	{ 0x2118, "BCM20702A0"	},	/* 001.001.024 */
+	{ 0x2126, "BCM4335A0"	},	/* 001.001.038 */
+	{ 0x220e, "BCM20702A1"	},	/* 001.002.014 */
+	{ 0x230f, "BCM4354A2"	},	/* 001.003.015 */
+	{ 0x4106, "BCM4335B0"	},	/* 002.001.006 */
+	{ 0x410e, "BCM20702B0"	},	/* 002.001.014 */
+	{ 0x6109, "BCM4335C0"	},	/* 003.001.009 */
+	{ 0x610c, "BCM4354"	},	/* 003.001.012 */
+	{ }
+};
+
 int btbcm_initialize(struct hci_dev *hdev, char *fw_name, size_t len)
 {
-	u16 subver, rev;
+	u16 subver, rev, pid, vid;
 	const char *hw_name = NULL;
 	struct sk_buff *skb;
 	struct hci_rp_read_local_version *ver;
+	const struct bcm_subver_table *bcm_subver_table;
 	int i, err;
 
 	/* Reset */
@@ -360,22 +375,35 @@ int btbcm_initialize(struct hci_dev *hdev, char *fw_name, size_t len)
 	if (err)
 		return err;
 
-	switch ((rev & 0xf000) >> 12) {
-	case 0:
-	case 1:
-	case 2:
-	case 3:
-		for (i = 0; bcm_uart_subver_table[i].name; i++) {
-			if (subver == bcm_uart_subver_table[i].subver) {
-				hw_name = bcm_uart_subver_table[i].name;
-				break;
-			}
+	/* Upper nibble of rev should be between 0 and 3? */
+	if (((rev & 0xf000) >> 12) > 3)
+		return 0;
+
+	bcm_subver_table = (hdev->bus == HCI_USB) ? bcm_usb_subver_table :
+						    bcm_uart_subver_table;
+
+	for (i = 0; bcm_subver_table[i].name; i++) {
+		if (subver == bcm_subver_table[i].subver) {
+			hw_name = bcm_subver_table[i].name;
+			break;
 		}
+	}
 
-		snprintf(fw_name, len, "brcm/%s.hcd", hw_name ? : "BCM");
-		break;
-	default:
-		return 0;
+	if (hdev->bus == HCI_USB) {
+		/* Read USB Product Info */
+		skb = btbcm_read_usb_product(hdev);
+		if (IS_ERR(skb))
+			return PTR_ERR(skb);
+
+		vid = get_unaligned_le16(skb->data + 1);
+		pid = get_unaligned_le16(skb->data + 3);
+		kfree_skb(skb);
+
+		snprintf(fw_name, len, "brcm/%s-%4.4x-%4.4x.hcd",
+			 hw_name ? : "BCM", vid, pid);
+	} else {
+		snprintf(fw_name, len, "brcm/%s.hcd",
+			 hw_name ? : "BCM");
 	}
 
 	bt_dev_info(hdev, "%s (%3.3u.%3.3u.%3.3u) build %4.4u",
@@ -420,20 +448,6 @@ int btbcm_finalize(struct hci_dev *hdev)
 }
 EXPORT_SYMBOL_GPL(btbcm_finalize);
 
-static const struct bcm_subver_table bcm_usb_subver_table[] = {
-	{ 0x210b, "BCM43142A0"	},	/* 001.001.011 */
-	{ 0x2112, "BCM4314A0"	},	/* 001.001.018 */
-	{ 0x2118, "BCM20702A0"	},	/* 001.001.024 */
-	{ 0x2126, "BCM4335A0"	},	/* 001.001.038 */
-	{ 0x220e, "BCM20702A1"	},	/* 001.002.014 */
-	{ 0x230f, "BCM4354A2"	},	/* 001.003.015 */
-	{ 0x4106, "BCM4335B0"	},	/* 002.001.006 */
-	{ 0x410e, "BCM20702B0"	},	/* 002.001.014 */
-	{ 0x6109, "BCM4335C0"	},	/* 003.001.009 */
-	{ 0x610c, "BCM4354"	},	/* 003.001.012 */
-	{ }
-};
-
 int btbcm_setup_patchram(struct hci_dev *hdev)
 {
 	char fw_name[64];
-- 
cgit v1.2.3-59-g8ed1b


From 22ac191652cc73a390c60c1e0faafae53031d9f6 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Fri, 20 Apr 2018 14:44:05 +0200
Subject: Bluetooth: btbcm: Allow using btbcm_initialize() for reinit

btbcm_finalize() does a re-init of the controller, which is almost the
same as the initial init. Modify btbcm_initialize() so that it can be
used for this re-init and modify btbcm_finalize() to use it.

As an added bonus this also makes the dev_info from btbcm_finalize()
use the proper hw_name instead of always printing "BCM".

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/btbcm.c   | 33 ++++++++++-----------------------
 drivers/bluetooth/btbcm.h   |  5 +++--
 drivers/bluetooth/hci_bcm.c |  2 +-
 3 files changed, 14 insertions(+), 26 deletions(-)

diff --git a/drivers/bluetooth/btbcm.c b/drivers/bluetooth/btbcm.c
index dd243afe32de..fdcbeb489a2e 100644
--- a/drivers/bluetooth/btbcm.c
+++ b/drivers/bluetooth/btbcm.c
@@ -346,7 +346,8 @@ static const struct bcm_subver_table bcm_usb_subver_table[] = {
 	{ }
 };
 
-int btbcm_initialize(struct hci_dev *hdev, char *fw_name, size_t len)
+int btbcm_initialize(struct hci_dev *hdev, char *fw_name, size_t len,
+		     bool reinit)
 {
 	u16 subver, rev, pid, vid;
 	const char *hw_name = NULL;
@@ -371,9 +372,11 @@ int btbcm_initialize(struct hci_dev *hdev, char *fw_name, size_t len)
 	kfree_skb(skb);
 
 	/* Read controller information */
-	err = btbcm_read_info(hdev);
-	if (err)
-		return err;
+	if (!reinit) {
+		err = btbcm_read_info(hdev);
+		if (err)
+			return err;
+	}
 
 	/* Upper nibble of rev should be between 0 and 3? */
 	if (((rev & 0xf000) >> 12) > 3)
@@ -416,30 +419,14 @@ EXPORT_SYMBOL_GPL(btbcm_initialize);
 
 int btbcm_finalize(struct hci_dev *hdev)
 {
-	struct sk_buff *skb;
-	struct hci_rp_read_local_version *ver;
-	u16 subver, rev;
+	char fw_name[64];
 	int err;
 
-	/* Reset */
-	err = btbcm_reset(hdev);
+	/* Re-initialize */
+	err = btbcm_initialize(hdev, fw_name, sizeof(fw_name), true);
 	if (err)
 		return err;
 
-	/* Read Local Version Info */
-	skb = btbcm_read_local_version(hdev);
-	if (IS_ERR(skb))
-		return PTR_ERR(skb);
-
-	ver = (struct hci_rp_read_local_version *)skb->data;
-	rev = le16_to_cpu(ver->hci_rev);
-	subver = le16_to_cpu(ver->lmp_subver);
-	kfree_skb(skb);
-
-	bt_dev_info(hdev, "BCM (%3.3u.%3.3u.%3.3u) build %4.4u",
-		    (subver & 0xe000) >> 13, (subver & 0x1f00) >> 8,
-		    (subver & 0x00ff), rev & 0x0fff);
-
 	btbcm_check_bdaddr(hdev);
 
 	set_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks);
diff --git a/drivers/bluetooth/btbcm.h b/drivers/bluetooth/btbcm.h
index cfe6ad4cc621..5346515c880c 100644
--- a/drivers/bluetooth/btbcm.h
+++ b/drivers/bluetooth/btbcm.h
@@ -73,7 +73,8 @@ int btbcm_patchram(struct hci_dev *hdev, const struct firmware *fw);
 int btbcm_setup_patchram(struct hci_dev *hdev);
 int btbcm_setup_apple(struct hci_dev *hdev);
 
-int btbcm_initialize(struct hci_dev *hdev, char *fw_name, size_t len);
+int btbcm_initialize(struct hci_dev *hdev, char *fw_name, size_t len,
+		     bool reinit);
 int btbcm_finalize(struct hci_dev *hdev);
 
 #else
@@ -104,7 +105,7 @@ static inline int btbcm_setup_apple(struct hci_dev *hdev)
 }
 
 static inline int btbcm_initialize(struct hci_dev *hdev, char *fw_name,
-				   size_t len)
+				   size_t len, bool reinit)
 {
 	return 0;
 }
diff --git a/drivers/bluetooth/hci_bcm.c b/drivers/bluetooth/hci_bcm.c
index 04c4b93b4c6f..f06f0f1132fb 100644
--- a/drivers/bluetooth/hci_bcm.c
+++ b/drivers/bluetooth/hci_bcm.c
@@ -501,7 +501,7 @@ static int bcm_setup(struct hci_uart *hu)
 	hu->hdev->set_diag = bcm_set_diag;
 	hu->hdev->set_bdaddr = btbcm_set_bdaddr;
 
-	err = btbcm_initialize(hu->hdev, fw_name, sizeof(fw_name));
+	err = btbcm_initialize(hu->hdev, fw_name, sizeof(fw_name), false);
 	if (err)
 		return err;
 
-- 
cgit v1.2.3-59-g8ed1b


From 59ce5699be6d4bc6f24e337c52418b509d64063f Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Fri, 20 Apr 2018 14:44:06 +0200
Subject: Bluetooth: btbcm: Remove duplicate code from btbcm_setup_patchram()

btbcm_setup_patchram() starts with initializing the controller (and
getting the firmware filename) and then after loading the firmware,
does a re-init. This almost entirely duplicates the code in
btbcm_initialize(), use that function instead.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/btbcm.c | 78 +++--------------------------------------------
 1 file changed, 5 insertions(+), 73 deletions(-)

diff --git a/drivers/bluetooth/btbcm.c b/drivers/bluetooth/btbcm.c
index fdcbeb489a2e..d44aa6ff1ed5 100644
--- a/drivers/bluetooth/btbcm.c
+++ b/drivers/bluetooth/btbcm.c
@@ -439,68 +439,14 @@ int btbcm_setup_patchram(struct hci_dev *hdev)
 {
 	char fw_name[64];
 	const struct firmware *fw;
-	u16 subver, rev, pid, vid;
-	const char *hw_name = NULL;
 	struct sk_buff *skb;
-	struct hci_rp_read_local_version *ver;
-	const struct bcm_subver_table *bcm_subver_table;
-	int i, err;
-
-	/* Reset */
-	err = btbcm_reset(hdev);
-	if (err)
-		return err;
-
-	/* Read Local Version Info */
-	skb = btbcm_read_local_version(hdev);
-	if (IS_ERR(skb))
-		return PTR_ERR(skb);
-
-	ver = (struct hci_rp_read_local_version *)skb->data;
-	rev = le16_to_cpu(ver->hci_rev);
-	subver = le16_to_cpu(ver->lmp_subver);
-	kfree_skb(skb);
+	int err;
 
-	/* Read controller information */
-	err = btbcm_read_info(hdev);
+	/* Initialize */
+	err = btbcm_initialize(hdev, fw_name, sizeof(fw_name), false);
 	if (err)
 		return err;
 
-	/* Upper nibble of rev should be between 0 and 3? */
-	if (((rev & 0xf000) >> 12) > 3)
-		return 0;
-
-	bcm_subver_table = (hdev->bus == HCI_USB) ? bcm_usb_subver_table :
-						    bcm_uart_subver_table;
-
-	for (i = 0; bcm_subver_table[i].name; i++) {
-		if (subver == bcm_subver_table[i].subver) {
-			hw_name = bcm_subver_table[i].name;
-			break;
-		}
-	}
-
-	if (hdev->bus == HCI_USB) {
-		/* Read USB Product Info */
-		skb = btbcm_read_usb_product(hdev);
-		if (IS_ERR(skb))
-			return PTR_ERR(skb);
-
-		vid = get_unaligned_le16(skb->data + 1);
-		pid = get_unaligned_le16(skb->data + 3);
-		kfree_skb(skb);
-
-		snprintf(fw_name, sizeof(fw_name), "brcm/%s-%4.4x-%4.4x.hcd",
-			 hw_name ? : "BCM", vid, pid);
-	} else {
-		snprintf(fw_name, sizeof(fw_name), "brcm/%s.hcd",
-			 hw_name ? : "BCM");
-	}
-
-	bt_dev_info(hdev, "%s (%3.3u.%3.3u.%3.3u) build %4.4u",
-		    hw_name ? : "BCM", (subver & 0xe000) >> 13,
-		    (subver & 0x1f00) >> 8, (subver & 0x00ff), rev & 0x0fff);
-
 	err = request_firmware(&fw, fw_name, &hdev->dev);
 	if (err < 0) {
 		bt_dev_info(hdev, "BCM: Patch %s not found", fw_name);
@@ -511,25 +457,11 @@ int btbcm_setup_patchram(struct hci_dev *hdev)
 
 	release_firmware(fw);
 
-	/* Reset */
-	err = btbcm_reset(hdev);
+	/* Re-initialize */
+	err = btbcm_initialize(hdev, fw_name, sizeof(fw_name), true);
 	if (err)
 		return err;
 
-	/* Read Local Version Info */
-	skb = btbcm_read_local_version(hdev);
-	if (IS_ERR(skb))
-		return PTR_ERR(skb);
-
-	ver = (struct hci_rp_read_local_version *)skb->data;
-	rev = le16_to_cpu(ver->hci_rev);
-	subver = le16_to_cpu(ver->lmp_subver);
-	kfree_skb(skb);
-
-	bt_dev_info(hdev, "%s (%3.3u.%3.3u.%3.3u) build %4.4u",
-		    hw_name ? : "BCM", (subver & 0xe000) >> 13,
-		    (subver & 0x1f00) >> 8, (subver & 0x00ff), rev & 0x0fff);
-
 	/* Read Local Name */
 	skb = btbcm_read_local_name(hdev);
 	if (IS_ERR(skb))
-- 
cgit v1.2.3-59-g8ed1b


From 2cc6d0794cbab470b2d82d5a7547c865fd61e0f3 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Fri, 20 Apr 2018 14:44:07 +0200
Subject: Bluetooth: btbcm: btbcm_initialize(): Initialize hw_name to "BCM"

Initialize hw_name to "BCM", this avoids the need for a number of NULL
checks on hw_name later.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/btbcm.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/bluetooth/btbcm.c b/drivers/bluetooth/btbcm.c
index d44aa6ff1ed5..99cde1f9467d 100644
--- a/drivers/bluetooth/btbcm.c
+++ b/drivers/bluetooth/btbcm.c
@@ -350,7 +350,7 @@ int btbcm_initialize(struct hci_dev *hdev, char *fw_name, size_t len,
 		     bool reinit)
 {
 	u16 subver, rev, pid, vid;
-	const char *hw_name = NULL;
+	const char *hw_name = "BCM";
 	struct sk_buff *skb;
 	struct hci_rp_read_local_version *ver;
 	const struct bcm_subver_table *bcm_subver_table;
@@ -403,14 +403,13 @@ int btbcm_initialize(struct hci_dev *hdev, char *fw_name, size_t len,
 		kfree_skb(skb);
 
 		snprintf(fw_name, len, "brcm/%s-%4.4x-%4.4x.hcd",
-			 hw_name ? : "BCM", vid, pid);
+			 hw_name, vid, pid);
 	} else {
-		snprintf(fw_name, len, "brcm/%s.hcd",
-			 hw_name ? : "BCM");
+		snprintf(fw_name, len, "brcm/%s.hcd", hw_name);
 	}
 
 	bt_dev_info(hdev, "%s (%3.3u.%3.3u.%3.3u) build %4.4u",
-		    hw_name ? : "BCM", (subver & 0xe000) >> 13,
+		    hw_name, (subver & 0xe000) >> 13,
 		    (subver & 0x1f00) >> 8, (subver & 0x00ff), rev & 0x0fff);
 
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From ee6493462f74013c6f365429401b716500aff838 Mon Sep 17 00:00:00 2001
From: Chriz Chow <cmcvista@gmail.com>
Date: Fri, 20 Apr 2018 15:46:24 +0800
Subject: Bluetooth: Prevent buffer overflow for large advertisement data

There are some controllers sending out advertising data with illegal
length value which is longer than HCI_MAX_AD_LENGTH, causing the
buffer last_adv_data overflows. To avoid these controllers from
overflowing the buffer, we do not process the advertisement data
if its length is incorrect.

Signed-off-by: Chriz Chow <chriz.chow@aminocom.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/bluetooth/hci_event.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 139707cd9d35..235b5aaab23d 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -4942,10 +4942,14 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb)
 		struct hci_ev_le_advertising_info *ev = ptr;
 		s8 rssi;
 
-		rssi = ev->data[ev->length];
-		process_adv_report(hdev, ev->evt_type, &ev->bdaddr,
-				   ev->bdaddr_type, NULL, 0, rssi,
-				   ev->data, ev->length);
+		if (ev->length <= HCI_MAX_AD_LENGTH) {
+			rssi = ev->data[ev->length];
+			process_adv_report(hdev, ev->evt_type, &ev->bdaddr,
+					   ev->bdaddr_type, NULL, 0, rssi,
+					   ev->data, ev->length);
+		} else {
+			bt_dev_err(hdev, "Dropping invalid advertising data");
+		}
 
 		ptr += sizeof(*ev) + ev->length + 1;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 67d8cee432f5564bba3fe9e0d8b1f07c3e41dda2 Mon Sep 17 00:00:00 2001
From: John Keeping <john@metanate.com>
Date: Thu, 19 Apr 2018 16:29:37 +0100
Subject: Bluetooth: use wait_event API instead of open-coding it

I've seen timeout errors from HCI commands where it looks like
schedule_timeout() has returned immediately; additional logging for the
error case gives:

	req_status=1 req_result=0 remaining=10000 jiffies

so the device is still in state HCI_REQ_PEND and the value returned by
schedule_timeout() is the same as the original timeout (HCI_INIT_TIMEOUT
on a system with HZ=1000).

Use wait_event_interruptible_timeout() instead of open-coding similar
behaviour which is subject to the spurious failure described above.

Signed-off-by: John Keeping <john@metanate.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/bluetooth/hci_request.c | 30 +++++++-----------------------
 1 file changed, 7 insertions(+), 23 deletions(-)

diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index 66c0781773df..e44d34734834 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -122,7 +122,6 @@ void hci_req_sync_cancel(struct hci_dev *hdev, int err)
 struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen,
 				  const void *param, u8 event, u32 timeout)
 {
-	DECLARE_WAITQUEUE(wait, current);
 	struct hci_request req;
 	struct sk_buff *skb;
 	int err = 0;
@@ -135,21 +134,14 @@ struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen,
 
 	hdev->req_status = HCI_REQ_PEND;
 
-	add_wait_queue(&hdev->req_wait_q, &wait);
-	set_current_state(TASK_INTERRUPTIBLE);
-
 	err = hci_req_run_skb(&req, hci_req_sync_complete);
-	if (err < 0) {
-		remove_wait_queue(&hdev->req_wait_q, &wait);
-		set_current_state(TASK_RUNNING);
+	if (err < 0)
 		return ERR_PTR(err);
-	}
 
-	schedule_timeout(timeout);
+	err = wait_event_interruptible_timeout(hdev->req_wait_q,
+			hdev->req_status != HCI_REQ_PEND, timeout);
 
-	remove_wait_queue(&hdev->req_wait_q, &wait);
-
-	if (signal_pending(current))
+	if (err == -ERESTARTSYS)
 		return ERR_PTR(-EINTR);
 
 	switch (hdev->req_status) {
@@ -197,7 +189,6 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req,
 		   unsigned long opt, u32 timeout, u8 *hci_status)
 {
 	struct hci_request req;
-	DECLARE_WAITQUEUE(wait, current);
 	int err = 0;
 
 	BT_DBG("%s start", hdev->name);
@@ -213,16 +204,10 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req,
 		return err;
 	}
 
-	add_wait_queue(&hdev->req_wait_q, &wait);
-	set_current_state(TASK_INTERRUPTIBLE);
-
 	err = hci_req_run_skb(&req, hci_req_sync_complete);
 	if (err < 0) {
 		hdev->req_status = 0;
 
-		remove_wait_queue(&hdev->req_wait_q, &wait);
-		set_current_state(TASK_RUNNING);
-
 		/* ENODATA means the HCI request command queue is empty.
 		 * This can happen when a request with conditionals doesn't
 		 * trigger any commands to be sent. This is normal behavior
@@ -240,11 +225,10 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req,
 		return err;
 	}
 
-	schedule_timeout(timeout);
-
-	remove_wait_queue(&hdev->req_wait_q, &wait);
+	err = wait_event_interruptible_timeout(hdev->req_wait_q,
+			hdev->req_status != HCI_REQ_PEND, timeout);
 
-	if (signal_pending(current))
+	if (err == -ERESTARTSYS)
 		return -EINTR;
 
 	switch (hdev->req_status) {
-- 
cgit v1.2.3-59-g8ed1b


From d666fc5479ad76a1bcbe6476d4997cea714bab2d Mon Sep 17 00:00:00 2001
From: Vic Wei <vwei@codeaurora.org>
Date: Mon, 23 Apr 2018 15:17:07 -0700
Subject: Bluetooth: btusb: add ID for LiteOn 04ca:301a

Contains a QCA6174A chipset, with USB BT. Let's support loading
firmware on it.

From usb-devices:
T:  Bus=02 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#=  2 Spd=12   MxCh= 0
D:  Ver= 2.01 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs=  1
P:  Vendor=04ca ProdID=301a Rev= 0.01
C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=100mA
I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb

Signed-off-by: Vic Wei <vwei@codeaurora.org>
Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/btusb.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index b937cc1e2c07..a470a2313935 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -276,6 +276,7 @@ static const struct usb_device_id blacklist_table[] = {
 	{ USB_DEVICE(0x04ca, 0x3011), .driver_info = BTUSB_QCA_ROME },
 	{ USB_DEVICE(0x04ca, 0x3015), .driver_info = BTUSB_QCA_ROME },
 	{ USB_DEVICE(0x04ca, 0x3016), .driver_info = BTUSB_QCA_ROME },
+	{ USB_DEVICE(0x04ca, 0x301a), .driver_info = BTUSB_QCA_ROME },
 
 	/* Broadcom BCM2035 */
 	{ USB_DEVICE(0x0a5c, 0x2009), .driver_info = BTUSB_BCM92035 },
-- 
cgit v1.2.3-59-g8ed1b


From d36dfb3e9b990b45bb812fd144b19737649a69ba Mon Sep 17 00:00:00 2001
From: Fabio Estevam <fabio.estevam@nxp.com>
Date: Wed, 25 Apr 2018 19:36:29 -0300
Subject: Bluetooth: hci_ldisc: Provide a 'default' switch case

When both CONFIG_BT_HCIUART_INTEL and CONFIG_BT_HCIUART_BCM are not
selected, sparse complains like this:

drivers/bluetooth/hci_ldisc.c:437:9: warning: switch with no cases

Fix the sparse warning by proving a default switch case.

Signed-off-by: Fabio Estevam <fabio.estevam@nxp.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/hci_ldisc.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c
index b6a71705b7d6..954213e5daa5 100644
--- a/drivers/bluetooth/hci_ldisc.c
+++ b/drivers/bluetooth/hci_ldisc.c
@@ -447,6 +447,8 @@ static int hci_uart_setup(struct hci_dev *hdev)
 		btbcm_check_bdaddr(hdev);
 		break;
 #endif
+	default:
+		break;
 	}
 
 done:
-- 
cgit v1.2.3-59-g8ed1b


From d6ee6ad774a986d4faaa794a0980e7c50ed359c6 Mon Sep 17 00:00:00 2001
From: Loic Poulain <loic.poulain@linaro.org>
Date: Thu, 26 Apr 2018 13:13:26 +0200
Subject: Bluetooth: Add __hci_cmd_send function

This function allows to send a HCI command without expecting any
controller event/response in return. This is allowed for vendor-
specific commands only.

Signed-off-by: Loic Poulain <loic.poulain@linaro.org>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/hci_core.h |  2 ++
 net/bluetooth/hci_core.c         | 31 +++++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index b619a190ff12..893bbbb5d2fa 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1393,6 +1393,8 @@ struct sk_buff *__hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen,
 			       const void *param, u32 timeout);
 struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen,
 				  const void *param, u8 event, u32 timeout);
+int __hci_cmd_send(struct hci_dev *hdev, u16 opcode, u32 plen,
+		   const void *param);
 
 int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen,
 		 const void *param);
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 40d260f2bea5..b0ee9edaae35 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -3422,6 +3422,37 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen,
 	return 0;
 }
 
+int __hci_cmd_send(struct hci_dev *hdev, u16 opcode, u32 plen,
+		   const void *param)
+{
+	struct sk_buff *skb;
+
+	if (hci_opcode_ogf(opcode) != 0x3f) {
+		/* A controller receiving a command shall respond with either
+		 * a Command Status Event or a Command Complete Event.
+		 * Therefore, all standard HCI commands must be sent via the
+		 * standard API, using hci_send_cmd or hci_cmd_sync helpers.
+		 * Some vendors do not comply with this rule for vendor-specific
+		 * commands and do not return any event. We want to support
+		 * unresponded commands for such cases only.
+		 */
+		bt_dev_err(hdev, "unresponded command not supported");
+		return -EINVAL;
+	}
+
+	skb = hci_prepare_cmd(hdev, opcode, plen, param);
+	if (!skb) {
+		bt_dev_err(hdev, "no memory for command (opcode 0x%4.4x)",
+			   opcode);
+		return -ENOMEM;
+	}
+
+	hci_send_frame(hdev, skb);
+
+	return 0;
+}
+EXPORT_SYMBOL(__hci_cmd_send);
+
 /* Get data from the previously sent command */
 void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 6e03126adda345c1c04c5029acbc92063867aea2 Mon Sep 17 00:00:00 2001
From: Loic Poulain <loic.poulain@linaro.org>
Date: Thu, 26 Apr 2018 13:13:27 +0200
Subject: Bluetooth: btqca: Add AR3002 rampatch support

This patch adds rampatch download compatibility for ROME >= 3.2.
Starting with ROME 3.2, the 'download mode' field of the rampatch
header indicates if the controller acknowledges (or not) the received
rampatch segments. If not, we need to send all the segments without
expecting any event from the controller (except for the last segment).
Goal is (I assume) to speed-up rampatch download.

This fixes BT on Dragonboard-600c P2 which includes the following BT
controller:

hci0: ROME Patch Version Request
hci0: Product:0x00000008
hci0: Patch  :0x00000111
hci0: ROM    :0x00000302
hci0: SOC    :0x00000023

Signed-off-by: Loic Poulain <loic.poulain@linaro.org>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/btqca.c | 104 ++++++++++++++++++++++------------------------
 drivers/bluetooth/btqca.h |  11 ++++-
 2 files changed, 59 insertions(+), 56 deletions(-)

diff --git a/drivers/bluetooth/btqca.c b/drivers/bluetooth/btqca.c
index 2793d4180d2f..8219816c54a0 100644
--- a/drivers/bluetooth/btqca.c
+++ b/drivers/bluetooth/btqca.c
@@ -127,28 +127,41 @@ static void rome_tlv_check_data(struct rome_config *config,
 	BT_DBG("TLV Type\t\t : 0x%x", type_len & 0x000000ff);
 	BT_DBG("Length\t\t : %d bytes", length);
 
+	config->dnld_mode = ROME_SKIP_EVT_NONE;
+
 	switch (config->type) {
 	case TLV_TYPE_PATCH:
 		tlv_patch = (struct tlv_type_patch *)tlv->data;
-		BT_DBG("Total Length\t\t : %d bytes",
+
+		/* For Rome version 1.1 to 3.1, all segment commands
+		 * are acked by a vendor specific event (VSE).
+		 * For Rome >= 3.2, the download mode field indicates
+		 * if VSE is skipped by the controller.
+		 * In case VSE is skipped, only the last segment is acked.
+		 */
+		config->dnld_mode = tlv_patch->download_mode;
+
+		BT_DBG("Total Length           : %d bytes",
 		       le32_to_cpu(tlv_patch->total_size));
-		BT_DBG("Patch Data Length\t : %d bytes",
+		BT_DBG("Patch Data Length      : %d bytes",
 		       le32_to_cpu(tlv_patch->data_length));
 		BT_DBG("Signing Format Version : 0x%x",
 		       tlv_patch->format_version);
-		BT_DBG("Signature Algorithm\t : 0x%x",
+		BT_DBG("Signature Algorithm    : 0x%x",
 		       tlv_patch->signature);
-		BT_DBG("Reserved\t\t : 0x%x",
-		       le16_to_cpu(tlv_patch->reserved1));
-		BT_DBG("Product ID\t\t : 0x%04x",
+		BT_DBG("Download mode          : 0x%x",
+		       tlv_patch->download_mode);
+		BT_DBG("Reserved               : 0x%x",
+		       tlv_patch->reserved1);
+		BT_DBG("Product ID             : 0x%04x",
 		       le16_to_cpu(tlv_patch->product_id));
-		BT_DBG("Rom Build Version\t : 0x%04x",
+		BT_DBG("Rom Build Version      : 0x%04x",
 		       le16_to_cpu(tlv_patch->rom_build));
-		BT_DBG("Patch Version\t\t : 0x%04x",
+		BT_DBG("Patch Version          : 0x%04x",
 		       le16_to_cpu(tlv_patch->patch_version));
-		BT_DBG("Reserved\t\t : 0x%x",
+		BT_DBG("Reserved               : 0x%x",
 		       le16_to_cpu(tlv_patch->reserved2));
-		BT_DBG("Patch Entry Address\t : 0x%x",
+		BT_DBG("Patch Entry Address    : 0x%x",
 		       le32_to_cpu(tlv_patch->entry));
 		break;
 
@@ -194,8 +207,8 @@ static void rome_tlv_check_data(struct rome_config *config,
 	}
 }
 
-static int rome_tlv_send_segment(struct hci_dev *hdev, int idx, int seg_size,
-				 const u8 *data)
+static int rome_tlv_send_segment(struct hci_dev *hdev, int seg_size,
+				 const u8 *data, enum rome_tlv_dnld_mode mode)
 {
 	struct sk_buff *skb;
 	struct edl_event_hdr *edl;
@@ -203,12 +216,14 @@ static int rome_tlv_send_segment(struct hci_dev *hdev, int idx, int seg_size,
 	u8 cmd[MAX_SIZE_PER_TLV_SEGMENT + 2];
 	int err = 0;
 
-	BT_DBG("%s: Download segment #%d size %d", hdev->name, idx, seg_size);
-
 	cmd[0] = EDL_PATCH_TLV_REQ_CMD;
 	cmd[1] = seg_size;
 	memcpy(cmd + 2, data, seg_size);
 
+	if (mode == ROME_SKIP_EVT_VSE_CC || mode == ROME_SKIP_EVT_VSE)
+		return __hci_cmd_send(hdev, EDL_PATCH_CMD_OPCODE, seg_size + 2,
+				      cmd);
+
 	skb = __hci_cmd_sync_ev(hdev, EDL_PATCH_CMD_OPCODE, seg_size + 2, cmd,
 				HCI_VENDOR_PKT, HCI_INIT_TIMEOUT);
 	if (IS_ERR(skb)) {
@@ -245,47 +260,12 @@ out:
 	return err;
 }
 
-static int rome_tlv_download_request(struct hci_dev *hdev,
-				     const struct firmware *fw)
-{
-	const u8 *buffer, *data;
-	int total_segment, remain_size;
-	int ret, i;
-
-	if (!fw || !fw->data)
-		return -EINVAL;
-
-	total_segment = fw->size / MAX_SIZE_PER_TLV_SEGMENT;
-	remain_size = fw->size % MAX_SIZE_PER_TLV_SEGMENT;
-
-	BT_DBG("%s: Total segment num %d remain size %d total size %zu",
-	       hdev->name, total_segment, remain_size, fw->size);
-
-	data = fw->data;
-	for (i = 0; i < total_segment; i++) {
-		buffer = data + i * MAX_SIZE_PER_TLV_SEGMENT;
-		ret = rome_tlv_send_segment(hdev, i, MAX_SIZE_PER_TLV_SEGMENT,
-					    buffer);
-		if (ret < 0)
-			return -EIO;
-	}
-
-	if (remain_size) {
-		buffer = data + total_segment * MAX_SIZE_PER_TLV_SEGMENT;
-		ret = rome_tlv_send_segment(hdev, total_segment, remain_size,
-					    buffer);
-		if (ret < 0)
-			return -EIO;
-	}
-
-	return 0;
-}
-
 static int rome_download_firmware(struct hci_dev *hdev,
 				  struct rome_config *config)
 {
 	const struct firmware *fw;
-	int ret;
+	const u8 *segment;
+	int ret, remain, i = 0;
 
 	bt_dev_info(hdev, "ROME Downloading %s", config->fwname);
 
@@ -298,10 +278,24 @@ static int rome_download_firmware(struct hci_dev *hdev,
 
 	rome_tlv_check_data(config, fw);
 
-	ret = rome_tlv_download_request(hdev, fw);
-	if (ret) {
-		BT_ERR("%s: Failed to download file: %s (%d)", hdev->name,
-		       config->fwname, ret);
+	segment = fw->data;
+	remain = fw->size;
+	while (remain > 0) {
+		int segsize = min(MAX_SIZE_PER_TLV_SEGMENT, remain);
+
+		bt_dev_dbg(hdev, "Send segment %d, size %d", i++, segsize);
+
+		remain -= segsize;
+		/* The last segment is always acked regardless download mode */
+		if (!remain || segsize < MAX_SIZE_PER_TLV_SEGMENT)
+			config->dnld_mode = ROME_SKIP_EVT_NONE;
+
+		ret = rome_tlv_send_segment(hdev, segsize, segment,
+					    config->dnld_mode);
+		if (ret)
+			break;
+
+		segment += segsize;
 	}
 
 	release_firmware(fw);
diff --git a/drivers/bluetooth/btqca.h b/drivers/bluetooth/btqca.h
index 65e994b96c47..13d77fd873b6 100644
--- a/drivers/bluetooth/btqca.h
+++ b/drivers/bluetooth/btqca.h
@@ -61,6 +61,13 @@ enum qca_bardrate {
 	QCA_BAUDRATE_RESERVED
 };
 
+enum rome_tlv_dnld_mode {
+	ROME_SKIP_EVT_NONE,
+	ROME_SKIP_EVT_VSE,
+	ROME_SKIP_EVT_CC,
+	ROME_SKIP_EVT_VSE_CC
+};
+
 enum rome_tlv_type {
 	TLV_TYPE_PATCH = 1,
 	TLV_TYPE_NVM
@@ -70,6 +77,7 @@ struct rome_config {
 	u8 type;
 	char fwname[64];
 	uint8_t user_baud_rate;
+	enum rome_tlv_dnld_mode dnld_mode;
 };
 
 struct edl_event_hdr {
@@ -94,7 +102,8 @@ struct tlv_type_patch {
 	__le32 data_length;
 	__u8   format_version;
 	__u8   signature;
-	__le16 reserved1;
+	__u8   download_mode;
+	__u8   reserved1;
 	__le16 product_id;
 	__le16 rom_build;
 	__le16 patch_version;
-- 
cgit v1.2.3-59-g8ed1b


From df2445bf77833674ebf790d2e6fcfd1d389b8a7b Mon Sep 17 00:00:00 2001
From: João Paulo Rechi Vita <jprvita@gmail.com>
Date: Fri, 27 Apr 2018 10:09:58 -0700
Subject: Bluetooth: Add a new 13d3:3496 QCA_ROME device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Without this patch we can't establish a SCO connection with this
adapter.

This adapter is named "IMC Networks" under lsusb.

T:  Bus=01 Lev=01 Prnt=01 Port=07 Cnt=02 Dev#=  3 Spd=12   MxCh= 0
D:  Ver= 1.10 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs=  1
P:  Vendor=13d3 ProdID=3496 Rev= 0.01
C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=100mA
I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=81(I) Atr=03(Int.) MxPS=  16 Ivl=1ms
E:  Ad=82(I) Atr=02(Bulk) MxPS=  64 Ivl=0ms
E:  Ad=02(O) Atr=02(Bulk) MxPS=  64 Ivl=0ms
I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=   0 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=   0 Ivl=1ms
I:  If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=   9 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=   9 Ivl=1ms
I:  If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  17 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  17 Ivl=1ms
I:  If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  25 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  25 Ivl=1ms
I:  If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  33 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  33 Ivl=1ms
I:  If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  49 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  49 Ivl=1ms

Signed-off-by: João Paulo Rechi Vita <jprvita@endlessm.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/btusb.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index a470a2313935..91882f54c7bd 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -277,6 +277,7 @@ static const struct usb_device_id blacklist_table[] = {
 	{ USB_DEVICE(0x04ca, 0x3015), .driver_info = BTUSB_QCA_ROME },
 	{ USB_DEVICE(0x04ca, 0x3016), .driver_info = BTUSB_QCA_ROME },
 	{ USB_DEVICE(0x04ca, 0x301a), .driver_info = BTUSB_QCA_ROME },
+	{ USB_DEVICE(0x13d3, 0x3496), .driver_info = BTUSB_QCA_ROME },
 
 	/* Broadcom BCM2035 */
 	{ USB_DEVICE(0x0a5c, 0x2009), .driver_info = BTUSB_BCM92035 },
-- 
cgit v1.2.3-59-g8ed1b


From 8689c051a20195b228e19acb155c7d6e48a86753 Mon Sep 17 00:00:00 2001
From: Arend van Spriel <aspriel@gmail.com>
Date: Thu, 10 May 2018 13:50:12 +0200
Subject: cfg80211: dynamically allocate per-tid stats for station info
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With the addition of TXQ stats in the per-tid statistics the struct
station_info grew significantly. This resulted in stack size warnings
due to the structure itself being above the limit for the warnings.

Add an allocation function that those who want to provide per-tid
stats should use to allocate the tid array, i.e.
struct station_info::pertid.

Cc: Toke Høiland-Jørgensen <toke@toke.dk>
Fixes: 52539ca89f36 ("cfg80211: Expose TXQ stats and parameters to userspace")
Signed-off-by: Arend van Spriel <aspriel@gmail.com>
[johannes: fix missing BIT() and logic by removing]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h  | 10 +++++++++-
 net/mac80211/sta_info.c |  9 +++++----
 net/wireless/nl80211.c  |  3 ++-
 net/wireless/util.c     | 11 +++++++++++
 4 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 8db6071b6063..8984d24d68b7 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1184,6 +1184,7 @@ struct cfg80211_tid_stats {
  * @rx_duration: aggregate PPDU duration(usecs) for all the frames from a peer
  * @pertid: per-TID statistics, see &struct cfg80211_tid_stats, using the last
  *	(IEEE80211_NUM_TIDS) index for MSDUs not encapsulated in QoS-MPDUs.
+ *	Note that this doesn't use the @filled bit, but is used if non-NULL.
  * @ack_signal: signal strength (in dBm) of the last ACK frame.
  * @avg_ack_signal: average rssi value of ack packet for the no of msdu's has
  *	been sent.
@@ -1230,7 +1231,7 @@ struct station_info {
 	u64 rx_beacon;
 	u64 rx_duration;
 	u8 rx_beacon_signal_avg;
-	struct cfg80211_tid_stats pertid[IEEE80211_NUM_TIDS + 1];
+	struct cfg80211_tid_stats *pertid;
 	s8 ack_signal;
 	s8 avg_ack_signal;
 };
@@ -5701,6 +5702,13 @@ void cfg80211_remain_on_channel_expired(struct wireless_dev *wdev, u64 cookie,
 					struct ieee80211_channel *chan,
 					gfp_t gfp);
 
+/**
+ * cfg80211_sinfo_alloc_tid_stats - allocate per-tid statistics.
+ *
+ * @sinfo: the station information
+ * @gfp: allocation flags
+ */
+int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp);
 
 /**
  * cfg80211_new_sta - notify userspace about station
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 43f34aa873bc..04d47689b557 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -2233,11 +2233,12 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
 			sinfo->filled |= BIT(NL80211_STA_INFO_RX_BITRATE);
 	}
 
-	sinfo->filled |= BIT(NL80211_STA_INFO_TID_STATS);
-	for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) {
-		struct cfg80211_tid_stats *tidstats = &sinfo->pertid[i];
+	if (!cfg80211_sinfo_alloc_tid_stats(sinfo, GFP_KERNEL)) {
+		for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) {
+			struct cfg80211_tid_stats *tidstats = &sinfo->pertid[i];
 
-		sta_set_tidstats(sta, tidstats, i);
+			sta_set_tidstats(sta, tidstats, i);
+		}
 	}
 
 	if (ieee80211_vif_is_mesh(&sdata->vif)) {
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index f7715b85fd2b..3d638f11edb5 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4658,7 +4658,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
 #undef PUT_SINFO
 #undef PUT_SINFO_U64
 
-	if (sinfo->filled & BIT(NL80211_STA_INFO_TID_STATS)) {
+	if (sinfo->pertid) {
 		struct nlattr *tidsattr;
 		int tid;
 
@@ -4702,6 +4702,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
 		}
 
 		nla_nest_end(msg, tidsattr);
+		kfree(sinfo->pertid);
 	}
 
 	nla_nest_end(msg, sinfoattr);
diff --git a/net/wireless/util.c b/net/wireless/util.c
index d112e9a89364..b5bb1c309914 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -1787,6 +1787,17 @@ bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range,
 	return false;
 }
 
+int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp)
+{
+	sinfo->pertid = kcalloc(sizeof(*(sinfo->pertid)),
+				IEEE80211_NUM_TIDS + 1, gfp);
+	if (!sinfo->pertid)
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL(cfg80211_sinfo_alloc_tid_stats);
+
 /* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */
 /* Ethernet-II snap header (RFC1042 for most EtherTypes) */
 const unsigned char rfc1042_header[] __aligned(2) =
-- 
cgit v1.2.3-59-g8ed1b


From 73887fd906bb77a974fa02663df9937d0aff053a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 18 May 2018 09:57:55 +0200
Subject: cfg80211/mac80211: revert to stack allocation for sinfo

Arend's previous patch made the sinfo structure smaller
again by to dynamically allocating the per-tid stats
only when needed. Thus, revert to stack allocation for
the struct to simplify the code.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/ethtool.c     | 32 +++++++----------
 net/wireless/nl80211.c     | 86 +++++++++++++++-------------------------------
 net/wireless/wext-compat.c | 23 +++++--------
 3 files changed, 48 insertions(+), 93 deletions(-)

diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c
index 09210aa8ea9a..2ba5686cbcab 100644
--- a/net/mac80211/ethtool.c
+++ b/net/mac80211/ethtool.c
@@ -71,15 +71,11 @@ static void ieee80211_get_stats(struct net_device *dev,
 	struct ieee80211_channel *channel;
 	struct sta_info *sta;
 	struct ieee80211_local *local = sdata->local;
-	struct station_info *sinfo;
+	struct station_info sinfo;
 	struct survey_info survey;
 	int i, q;
 #define STA_STATS_SURVEY_LEN 7
 
-	sinfo = kmalloc(sizeof(*sinfo), GFP_KERNEL);
-	if (!sinfo)
-		return;
-
 	memset(data, 0, sizeof(u64) * STA_STATS_LEN);
 
 #define ADD_STA_STATS(sta)					\
@@ -90,8 +86,8 @@ static void ieee80211_get_stats(struct net_device *dev,
 		data[i++] += sta->rx_stats.fragments;		\
 		data[i++] += sta->rx_stats.dropped;		\
 								\
-		data[i++] += sinfo->tx_packets;			\
-		data[i++] += sinfo->tx_bytes;			\
+		data[i++] += sinfo.tx_packets;			\
+		data[i++] += sinfo.tx_bytes;			\
 		data[i++] += sta->status_stats.filtered;	\
 		data[i++] += sta->status_stats.retry_failed;	\
 		data[i++] += sta->status_stats.retry_count;	\
@@ -111,8 +107,8 @@ static void ieee80211_get_stats(struct net_device *dev,
 		if (!(sta && !WARN_ON(sta->sdata->dev != dev)))
 			goto do_survey;
 
-		memset(sinfo, 0, sizeof(*sinfo));
-		sta_set_sinfo(sta, sinfo);
+		memset(&sinfo, 0, sizeof(sinfo));
+		sta_set_sinfo(sta, &sinfo);
 
 		i = 0;
 		ADD_STA_STATS(sta);
@@ -120,17 +116,17 @@ static void ieee80211_get_stats(struct net_device *dev,
 		data[i++] = sta->sta_state;
 
 
-		if (sinfo->filled & BIT(NL80211_STA_INFO_TX_BITRATE))
+		if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE))
 			data[i] = 100000ULL *
-				cfg80211_calculate_bitrate(&sinfo->txrate);
+				cfg80211_calculate_bitrate(&sinfo.txrate);
 		i++;
-		if (sinfo->filled & BIT(NL80211_STA_INFO_RX_BITRATE))
+		if (sinfo.filled & BIT(NL80211_STA_INFO_RX_BITRATE))
 			data[i] = 100000ULL *
-				cfg80211_calculate_bitrate(&sinfo->rxrate);
+				cfg80211_calculate_bitrate(&sinfo.rxrate);
 		i++;
 
-		if (sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL_AVG))
-			data[i] = (u8)sinfo->signal_avg;
+		if (sinfo.filled & BIT(NL80211_STA_INFO_SIGNAL_AVG))
+			data[i] = (u8)sinfo.signal_avg;
 		i++;
 	} else {
 		list_for_each_entry(sta, &local->sta_list, list) {
@@ -138,16 +134,14 @@ static void ieee80211_get_stats(struct net_device *dev,
 			if (sta->sdata->dev != dev)
 				continue;
 
-			memset(sinfo, 0, sizeof(*sinfo));
-			sta_set_sinfo(sta, sinfo);
+			memset(&sinfo, 0, sizeof(sinfo));
+			sta_set_sinfo(sta, &sinfo);
 			i = 0;
 			ADD_STA_STATS(sta);
 		}
 	}
 
 do_survey:
-	kfree(sinfo);
-
 	i = STA_STATS_LEN - STA_STATS_SURVEY_LEN;
 	/* Get survey stats for current channel */
 	survey.filled = 0;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 3d638f11edb5..7daceb1f253d 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4723,17 +4723,13 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
 static int nl80211_dump_station(struct sk_buff *skb,
 				struct netlink_callback *cb)
 {
-	struct station_info *sinfo;
+	struct station_info sinfo;
 	struct cfg80211_registered_device *rdev;
 	struct wireless_dev *wdev;
 	u8 mac_addr[ETH_ALEN];
 	int sta_idx = cb->args[2];
 	int err;
 
-	sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL);
-	if (!sinfo)
-		return -ENOMEM;
-
 	rtnl_lock();
 	err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev);
 	if (err)
@@ -4750,9 +4746,9 @@ static int nl80211_dump_station(struct sk_buff *skb,
 	}
 
 	while (1) {
-		memset(sinfo, 0, sizeof(*sinfo));
+		memset(&sinfo, 0, sizeof(sinfo));
 		err = rdev_dump_station(rdev, wdev->netdev, sta_idx,
-					mac_addr, sinfo);
+					mac_addr, &sinfo);
 		if (err == -ENOENT)
 			break;
 		if (err)
@@ -4762,7 +4758,7 @@ static int nl80211_dump_station(struct sk_buff *skb,
 				NETLINK_CB(cb->skb).portid,
 				cb->nlh->nlmsg_seq, NLM_F_MULTI,
 				rdev, wdev->netdev, mac_addr,
-				sinfo) < 0)
+				&sinfo) < 0)
 			goto out;
 
 		sta_idx++;
@@ -4773,7 +4769,6 @@ static int nl80211_dump_station(struct sk_buff *skb,
 	err = skb->len;
  out_err:
 	rtnl_unlock();
-	kfree(sinfo);
 
 	return err;
 }
@@ -4782,49 +4777,37 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct net_device *dev = info->user_ptr[1];
-	struct station_info *sinfo;
+	struct station_info sinfo;
 	struct sk_buff *msg;
 	u8 *mac_addr = NULL;
 	int err;
 
-	sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL);
-	if (!sinfo)
-		return -ENOMEM;
+	memset(&sinfo, 0, sizeof(sinfo));
 
-	if (!info->attrs[NL80211_ATTR_MAC]) {
-		err = -EINVAL;
-		goto out;
-	}
+	if (!info->attrs[NL80211_ATTR_MAC])
+		return -EINVAL;
 
 	mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
 
-	if (!rdev->ops->get_station) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
+	if (!rdev->ops->get_station)
+		return -EOPNOTSUPP;
 
-	err = rdev_get_station(rdev, dev, mac_addr, sinfo);
+	err = rdev_get_station(rdev, dev, mac_addr, &sinfo);
 	if (err)
-		goto out;
+		return err;
 
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!msg) {
-		err = -ENOMEM;
-		goto out;
-	}
+	if (!msg)
+		return -ENOMEM;
 
 	if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION,
 				 info->snd_portid, info->snd_seq, 0,
-				 rdev, dev, mac_addr, sinfo) < 0) {
+				 rdev, dev, mac_addr, &sinfo) < 0) {
 		nlmsg_free(msg);
-		err = -ENOBUFS;
-		goto out;
+		return -ENOBUFS;
 	}
 
-	err = genlmsg_reply(msg, info);
-out:
-	kfree(sinfo);
-	return err;
+	return genlmsg_reply(msg, info);
 }
 
 int cfg80211_check_station_change(struct wiphy *wiphy,
@@ -10088,26 +10071,18 @@ static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev,
 	 */
 	if (!wdev->cqm_config->last_rssi_event_value && wdev->current_bss &&
 	    rdev->ops->get_station) {
-		struct station_info *sinfo;
+		struct station_info sinfo = {};
 		u8 *mac_addr;
 
-		sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL);
-		if (!sinfo)
-			return -ENOMEM;
-
 		mac_addr = wdev->current_bss->pub.bssid;
 
-		err = rdev_get_station(rdev, dev, mac_addr, sinfo);
-		if (err) {
-			kfree(sinfo);
+		err = rdev_get_station(rdev, dev, mac_addr, &sinfo);
+		if (err)
 			return err;
-		}
 
-		if (sinfo->filled & BIT(NL80211_STA_INFO_BEACON_SIGNAL_AVG))
+		if (sinfo.filled & BIT(NL80211_STA_INFO_BEACON_SIGNAL_AVG))
 			wdev->cqm_config->last_rssi_event_value =
-				(s8)sinfo->rx_beacon_signal_avg;
-
-		kfree(sinfo);
+				(s8) sinfo.rx_beacon_signal_avg;
 	}
 
 	last = wdev->cqm_config->last_rssi_event_value;
@@ -14641,32 +14616,25 @@ void cfg80211_del_sta_sinfo(struct net_device *dev, const u8 *mac_addr,
 	struct wiphy *wiphy = dev->ieee80211_ptr->wiphy;
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
 	struct sk_buff *msg;
-	struct station_info *empty_sinfo = NULL;
+	struct station_info empty_sinfo = {};
 
-	if (!sinfo) {
-		empty_sinfo = kzalloc(sizeof(*empty_sinfo), GFP_KERNEL);
-		if (!empty_sinfo)
-			return;
-		sinfo = empty_sinfo;
-	}
+	if (!sinfo)
+		sinfo = &empty_sinfo;
 
 	trace_cfg80211_del_sta(dev, mac_addr);
 
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
 	if (!msg)
-		goto out;
+		return;
 
 	if (nl80211_send_station(msg, NL80211_CMD_DEL_STATION, 0, 0, 0,
 				 rdev, dev, mac_addr, sinfo) < 0) {
 		nlmsg_free(msg);
-		goto out;
+		return;
 	}
 
 	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
 				NL80211_MCGRP_MLME, gfp);
-
-out:
-	kfree(empty_sinfo);
 }
 EXPORT_SYMBOL(cfg80211_del_sta_sinfo);
 
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index 9e002df0f8d8..05186a47878f 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -1254,7 +1254,7 @@ static int cfg80211_wext_giwrate(struct net_device *dev,
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
-	struct station_info *sinfo;
+	struct station_info sinfo = {};
 	u8 addr[ETH_ALEN];
 	int err;
 
@@ -1274,23 +1274,16 @@ static int cfg80211_wext_giwrate(struct net_device *dev,
 	if (err)
 		return err;
 
-	sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL);
-	if (!sinfo)
-		return -ENOMEM;
-
-	err = rdev_get_station(rdev, dev, addr, sinfo);
+	err = rdev_get_station(rdev, dev, addr, &sinfo);
 	if (err)
-		goto out;
+		return err;
 
-	if (!(sinfo->filled & BIT(NL80211_STA_INFO_TX_BITRATE))) {
-		err = -EOPNOTSUPP;
-		goto out;
-	}
+	if (!(sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)))
+		return -EOPNOTSUPP;
 
-	rate->value = 100000 * cfg80211_calculate_bitrate(&sinfo->txrate);
-out:
-	kfree(sinfo);
-	return err;
+	rate->value = 100000 * cfg80211_calculate_bitrate(&sinfo.txrate);
+
+	return 0;
 }
 
 /* Get wireless statistics.  Called by /proc/net/wireless and by SIOCGIWSTATS */
-- 
cgit v1.2.3-59-g8ed1b


From 4a22b00b288649f6d697b350ec606574479a2df3 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Fri, 11 May 2018 14:25:42 +0100
Subject: cfg80211: fix spelling mistake: "uknown" -> "unknown"

Trivial fix to spelling mistake in pr_debug message text

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/reg.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 69cf79165d43..9806380ec671 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -3399,7 +3399,7 @@ bool reg_supported_dfs_region(enum nl80211_dfs_regions dfs_region)
 	case NL80211_DFS_JP:
 		return true;
 	default:
-		pr_debug("Ignoring uknown DFS master region: %d\n", dfs_region);
+		pr_debug("Ignoring unknown DFS master region: %d\n", dfs_region);
 		return false;
 	}
 }
-- 
cgit v1.2.3-59-g8ed1b


From 39c1134c66b4552f665da576cb625f184a44a8a3 Mon Sep 17 00:00:00 2001
From: Alexander Wetzel <alexander.wetzel@web.de>
Date: Mon, 14 May 2018 22:33:34 +0200
Subject: mac80211: fix TX aggregation stop race

The mac80211 tear down code is not waiting for the driver call back.
This can bring down the the TX path (TID) till the user manually
reconnects. (Observed with iwldvm and enabled TX aggregation.)

The race can be prevented when the ampdu_mlme worker handles the tear
down.

The race:
 * ieee80211_sta_tear_down_BA_sessions calls
   ___ieee80211_stop_tx_ba_session for all TIDs,

 * then cancels the ampdu_mlme worker

 * and cleanups the TIDs the driver already has called back for.

 * ieee80211_stop_tx_ba_cb will never be called for a TID if the callback
   came after the the check in ieee80211_sta_tear_down_BA_sessions.

Signed-off-by: Alexander Wetzel <Alexander.Wetzel@web.de>
[johannes: "enabled" -> "blocked" and invert logic, simplify init]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/ht.c | 44 +++++++++++++++++++++-----------------------
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index c78036a0ac94..26a7ba3b698f 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -301,26 +301,27 @@ void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta,
 		___ieee80211_stop_tx_ba_session(sta, i, reason);
 	mutex_unlock(&sta->ampdu_mlme.mtx);
 
-	/* stopping might queue the work again - so cancel only afterwards */
-	cancel_work_sync(&sta->ampdu_mlme.work);
-
 	/*
 	 * In case the tear down is part of a reconfigure due to HW restart
 	 * request, it is possible that the low level driver requested to stop
 	 * the BA session, so handle it to properly clean tid_tx data.
 	 */
-	mutex_lock(&sta->ampdu_mlme.mtx);
-	for (i = 0; i < IEEE80211_NUM_TIDS; i++) {
-		struct tid_ampdu_tx *tid_tx =
-			rcu_dereference_protected_tid_tx(sta, i);
+	if(reason == AGG_STOP_DESTROY_STA) {
+		cancel_work_sync(&sta->ampdu_mlme.work);
 
-		if (!tid_tx)
-			continue;
+		mutex_lock(&sta->ampdu_mlme.mtx);
+		for (i = 0; i < IEEE80211_NUM_TIDS; i++) {
+			struct tid_ampdu_tx *tid_tx =
+				rcu_dereference_protected_tid_tx(sta, i);
 
-		if (test_and_clear_bit(HT_AGG_STATE_STOP_CB, &tid_tx->state))
-			ieee80211_stop_tx_ba_cb(sta, i, tid_tx);
+			if (!tid_tx)
+				continue;
+
+			if (test_and_clear_bit(HT_AGG_STATE_STOP_CB, &tid_tx->state))
+				ieee80211_stop_tx_ba_cb(sta, i, tid_tx);
+		}
+		mutex_unlock(&sta->ampdu_mlme.mtx);
 	}
-	mutex_unlock(&sta->ampdu_mlme.mtx);
 }
 
 void ieee80211_ba_session_work(struct work_struct *work)
@@ -328,16 +329,11 @@ void ieee80211_ba_session_work(struct work_struct *work)
 	struct sta_info *sta =
 		container_of(work, struct sta_info, ampdu_mlme.work);
 	struct tid_ampdu_tx *tid_tx;
+	bool blocked;
 	int tid;
 
-	/*
-	 * When this flag is set, new sessions should be
-	 * blocked, and existing sessions will be torn
-	 * down by the code that set the flag, so this
-	 * need not run.
-	 */
-	if (test_sta_flag(sta, WLAN_STA_BLOCK_BA))
-		return;
+	/* When this flag is set, new sessions should be blocked. */
+	blocked = test_sta_flag(sta, WLAN_STA_BLOCK_BA);
 
 	mutex_lock(&sta->ampdu_mlme.mtx);
 	for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) {
@@ -352,7 +348,8 @@ void ieee80211_ba_session_work(struct work_struct *work)
 				sta, tid, WLAN_BACK_RECIPIENT,
 				WLAN_REASON_UNSPECIFIED, true);
 
-		if (test_and_clear_bit(tid,
+		if (!blocked &&
+		    test_and_clear_bit(tid,
 				       sta->ampdu_mlme.tid_rx_manage_offl))
 			___ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid,
 							 IEEE80211_MAX_AMPDU_BUF,
@@ -367,7 +364,7 @@ void ieee80211_ba_session_work(struct work_struct *work)
 		spin_lock_bh(&sta->lock);
 
 		tid_tx = sta->ampdu_mlme.tid_start_tx[tid];
-		if (tid_tx) {
+		if (!blocked && tid_tx) {
 			/*
 			 * Assign it over to the normal tid_tx array
 			 * where it "goes live".
@@ -390,7 +387,8 @@ void ieee80211_ba_session_work(struct work_struct *work)
 		if (!tid_tx)
 			continue;
 
-		if (test_and_clear_bit(HT_AGG_STATE_START_CB, &tid_tx->state))
+		if (!blocked &&
+		    test_and_clear_bit(HT_AGG_STATE_START_CB, &tid_tx->state))
 			ieee80211_start_tx_ba_cb(sta, tid, tid_tx);
 		if (test_and_clear_bit(HT_AGG_STATE_WANT_STOP, &tid_tx->state))
 			___ieee80211_stop_tx_ba_session(sta, tid,
-- 
cgit v1.2.3-59-g8ed1b


From 0fdf1493b41eb64fc7e8c8e1b8830a4bd8c4bbca Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 18 May 2018 11:40:44 +0200
Subject: mac80211: allocate and fill tidstats only when needed

This fixes memory leaks in the case where we just have the
station info on the stack for internal usage without sending
it to cfg80211.

Fixes: 8689c051a201 ("cfg80211: dynamically allocate per-tid stats for station info")
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/cfg.c      | 4 ++--
 net/mac80211/ethtool.c  | 4 ++--
 net/mac80211/sta_info.c | 7 ++++---
 net/mac80211/sta_info.h | 3 ++-
 4 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 5ce9d121af2b..bdf6fa78d0d2 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -695,7 +695,7 @@ static int ieee80211_dump_station(struct wiphy *wiphy, struct net_device *dev,
 	if (sta) {
 		ret = 0;
 		memcpy(mac, sta->sta.addr, ETH_ALEN);
-		sta_set_sinfo(sta, sinfo);
+		sta_set_sinfo(sta, sinfo, true);
 	}
 
 	mutex_unlock(&local->sta_mtx);
@@ -724,7 +724,7 @@ static int ieee80211_get_station(struct wiphy *wiphy, struct net_device *dev,
 	sta = sta_info_get_bss(sdata, mac);
 	if (sta) {
 		ret = 0;
-		sta_set_sinfo(sta, sinfo);
+		sta_set_sinfo(sta, sinfo, true);
 	}
 
 	mutex_unlock(&local->sta_mtx);
diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c
index 2ba5686cbcab..690c142a7a44 100644
--- a/net/mac80211/ethtool.c
+++ b/net/mac80211/ethtool.c
@@ -108,7 +108,7 @@ static void ieee80211_get_stats(struct net_device *dev,
 			goto do_survey;
 
 		memset(&sinfo, 0, sizeof(sinfo));
-		sta_set_sinfo(sta, &sinfo);
+		sta_set_sinfo(sta, &sinfo, false);
 
 		i = 0;
 		ADD_STA_STATS(sta);
@@ -135,7 +135,7 @@ static void ieee80211_get_stats(struct net_device *dev,
 				continue;
 
 			memset(&sinfo, 0, sizeof(sinfo));
-			sta_set_sinfo(sta, &sinfo);
+			sta_set_sinfo(sta, &sinfo, false);
 			i = 0;
 			ADD_STA_STATS(sta);
 		}
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 04d47689b557..6428f1ac37b6 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -1008,7 +1008,7 @@ static void __sta_info_destroy_part2(struct sta_info *sta)
 
 	sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL);
 	if (sinfo)
-		sta_set_sinfo(sta, sinfo);
+		sta_set_sinfo(sta, sinfo, true);
 	cfg80211_del_sta_sinfo(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL);
 	kfree(sinfo);
 
@@ -2079,7 +2079,8 @@ static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats)
 	return value;
 }
 
-void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
+void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
+		   bool tidstats)
 {
 	struct ieee80211_sub_if_data *sdata = sta->sdata;
 	struct ieee80211_local *local = sdata->local;
@@ -2233,7 +2234,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
 			sinfo->filled |= BIT(NL80211_STA_INFO_RX_BITRATE);
 	}
 
-	if (!cfg80211_sinfo_alloc_tid_stats(sinfo, GFP_KERNEL)) {
+	if (tidstats && !cfg80211_sinfo_alloc_tid_stats(sinfo, GFP_KERNEL)) {
 		for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) {
 			struct cfg80211_tid_stats *tidstats = &sinfo->pertid[i];
 
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index d79bd6eeb549..81b35f623792 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -744,7 +744,8 @@ static inline int sta_info_flush(struct ieee80211_sub_if_data *sdata)
 void sta_set_rate_info_tx(struct sta_info *sta,
 			  const struct ieee80211_tx_rate *rate,
 			  struct rate_info *rinfo);
-void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo);
+void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
+		   bool tidstats);
 
 u32 sta_get_expected_throughput(struct sta_info *sta);
 
-- 
cgit v1.2.3-59-g8ed1b


From 7ea3e110f2f8ba23f330c2f702f556acd539bcb8 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 18 May 2018 11:40:44 +0200
Subject: cfg80211: release station info tidstats where needed

This fixes memory leaks in cases where we got the station
info but failed sending it out properly.

Fixes: 8689c051a201 ("cfg80211: dynamically allocate per-tid stats for station info")
Reviewed-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 13 +++++++++++++
 net/wireless/nl80211.c | 11 ++++++++---
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 8984d24d68b7..11a218445448 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5710,6 +5710,19 @@ void cfg80211_remain_on_channel_expired(struct wireless_dev *wdev, u64 cookie,
  */
 int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp);
 
+/**
+ * cfg80211_sinfo_release_content - release contents of station info
+ * @sinfo: the station information
+ *
+ * Releases any potentially allocated sub-information of the station
+ * information, but not the struct itself (since it's typically on
+ * the stack.)
+ */
+static inline void cfg80211_sinfo_release_content(struct station_info *sinfo)
+{
+	kfree(sinfo->pertid);
+}
+
 /**
  * cfg80211_new_sta - notify userspace about station
  *
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 7daceb1f253d..e4a52a2b5e65 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4702,7 +4702,6 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
 		}
 
 		nla_nest_end(msg, tidsattr);
-		kfree(sinfo->pertid);
 	}
 
 	nla_nest_end(msg, sinfoattr);
@@ -4712,10 +4711,12 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
 		    sinfo->assoc_req_ies))
 		goto nla_put_failure;
 
+	cfg80211_sinfo_release_content(sinfo);
 	genlmsg_end(msg, hdr);
 	return 0;
 
  nla_put_failure:
+	cfg80211_sinfo_release_content(sinfo);
 	genlmsg_cancel(msg, hdr);
 	return -EMSGSIZE;
 }
@@ -4797,8 +4798,10 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info)
 		return err;
 
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!msg)
+	if (!msg) {
+		cfg80211_sinfo_release_content(sinfo);
 		return -ENOMEM;
+	}
 
 	if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION,
 				 info->snd_portid, info->snd_seq, 0,
@@ -14624,8 +14627,10 @@ void cfg80211_del_sta_sinfo(struct net_device *dev, const u8 *mac_addr,
 	trace_cfg80211_del_sta(dev, mac_addr);
 
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
-	if (!msg)
+	if (!msg) {
+		cfg80211_sinfo_release_content(sinfo);
 		return;
+	}
 
 	if (nl80211_send_station(msg, NL80211_CMD_DEL_STATION, 0, 0, 0,
 				 rdev, dev, mac_addr, sinfo) < 0) {
-- 
cgit v1.2.3-59-g8ed1b


From dac09149d992995adbef0f472093fbb6940a8653 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Fri, 18 May 2018 14:00:21 +0200
Subject: xsk: clean up SPDX headers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Clean up SPDX-License-Identifier and removing licensing leftovers.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/xdp_sock.h      | 13 ++-----------
 include/uapi/linux/if_xdp.h | 13 ++-----------
 kernel/bpf/xskmap.c         |  9 ---------
 net/xdp/xdp_umem.c          |  9 ---------
 net/xdp/xdp_umem.h          | 13 ++-----------
 net/xdp/xdp_umem_props.h    | 13 ++-----------
 net/xdp/xsk.c               |  9 ---------
 net/xdp/xsk_queue.c         |  9 ---------
 net/xdp/xsk_queue.h         | 13 ++-----------
 samples/bpf/xdpsock_user.c  | 12 +-----------
 10 files changed, 11 insertions(+), 102 deletions(-)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 185f4928fbda..7a647c56ec15 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -1,15 +1,6 @@
-/* SPDX-License-Identifier: GPL-2.0
- * AF_XDP internal functions
+/* SPDX-License-Identifier: GPL-2.0 */
+/* AF_XDP internal functions
  * Copyright(c) 2018 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 
 #ifndef _LINUX_XDP_SOCK_H
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 77b88c4efe98..56db977221d2 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -1,17 +1,8 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
- *
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
  * if_xdp: XDP socket user-space interface
  * Copyright(c) 2018 Intel Corporation.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
  * Author(s): Björn Töpel <bjorn.topel@intel.com>
  *	      Magnus Karlsson <magnus.karlsson@intel.com>
  */
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index cb3a12137404..b3c557476a8d 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -1,15 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* XSKMAP used for AF_XDP sockets
  * Copyright(c) 2018 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 
 #include <linux/bpf.h>
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 2b47a1dd7c6c..df4ea97c433b 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -1,15 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* XDP user-space packet buffer
  * Copyright(c) 2018 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 
 #include <linux/init.h>
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index 7e0b2fab8522..70fe225baa51 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -1,15 +1,6 @@
-/* SPDX-License-Identifier: GPL-2.0
- * XDP user-space packet buffer
+/* SPDX-License-Identifier: GPL-2.0 */
+/* XDP user-space packet buffer
  * Copyright(c) 2018 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 
 #ifndef XDP_UMEM_H_
diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h
index 77fb5daf29f3..2cf8ec485fd2 100644
--- a/net/xdp/xdp_umem_props.h
+++ b/net/xdp/xdp_umem_props.h
@@ -1,15 +1,6 @@
-/* SPDX-License-Identifier: GPL-2.0
- * XDP user-space packet buffer
+/* SPDX-License-Identifier: GPL-2.0 */
+/* XDP user-space packet buffer
  * Copyright(c) 2018 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 
 #ifndef XDP_UMEM_PROPS_H_
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 009c5af5bba5..b8d1cb4d78c0 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -5,15 +5,6 @@
  * applications.
  * Copyright(c) 2018 Intel Corporation.
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
  * Author(s): Björn Töpel <bjorn.topel@intel.com>
  *	      Magnus Karlsson <magnus.karlsson@intel.com>
  */
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
index d012e5e23591..9f605d22dad4 100644
--- a/net/xdp/xsk_queue.c
+++ b/net/xdp/xsk_queue.c
@@ -1,15 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* XDP user-space ring structure
  * Copyright(c) 2018 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 
 #include <linux/slab.h>
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 7aa9a535db0e..928d464e57b9 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -1,15 +1,6 @@
-/* SPDX-License-Identifier: GPL-2.0
- * XDP user-space ring structure
+/* SPDX-License-Identifier: GPL-2.0 */
+/* XDP user-space ring structure
  * Copyright(c) 2018 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
  */
 
 #ifndef _LINUX_XSK_QUEUE_H
diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
index 7fe60f6f7d53..60a882a2296c 100644
--- a/samples/bpf/xdpsock_user.c
+++ b/samples/bpf/xdpsock_user.c
@@ -1,15 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright(c) 2017 - 2018 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- */
+/* Copyright(c) 2017 - 2018 Intel Corporation. */
 
 #include <assert.h>
 #include <errno.h>
-- 
cgit v1.2.3-59-g8ed1b


From 54b85c27fea4ce2be32a69bc2357847fe0367c64 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Fri, 18 May 2018 14:00:22 +0200
Subject: xsk: remove newline at end of file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Minor cleanup, remove newline at end of Makefile.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/xdp/Makefile | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/xdp/Makefile b/net/xdp/Makefile
index 074fb2b2d51c..04f073146256 100644
--- a/net/xdp/Makefile
+++ b/net/xdp/Makefile
@@ -1,2 +1 @@
 obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o
-
-- 
cgit v1.2.3-59-g8ed1b


From da60cf00c1a576a459defed2edfb88c858510b64 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Fri, 18 May 2018 14:00:23 +0200
Subject: xsk: fixed some cases of unnecessary parentheses
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Removed some cases of unnecessary parentheses.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/xdp/xdp_umem.c  | 4 ++--
 net/xdp/xsk_queue.c | 3 +--
 net/xdp/xsk_queue.h | 4 ++--
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index df4ea97c433b..c47909c74899 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -20,7 +20,7 @@ int xdp_umem_create(struct xdp_umem **umem)
 {
 	*umem = kzalloc(sizeof(**umem), GFP_KERNEL);
 
-	if (!(*umem))
+	if (!*umem)
 		return -ENOMEM;
 
 	return 0;
@@ -247,5 +247,5 @@ out:
 
 bool xdp_umem_validate_queues(struct xdp_umem *umem)
 {
-	return (umem->fq && umem->cq);
+	return umem->fq && umem->cq;
 }
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
index 9f605d22dad4..ebe85e59507e 100644
--- a/net/xdp/xsk_queue.c
+++ b/net/xdp/xsk_queue.c
@@ -22,8 +22,7 @@ static u32 xskq_umem_get_ring_size(struct xsk_queue *q)
 
 static u32 xskq_rxtx_get_ring_size(struct xsk_queue *q)
 {
-	return (sizeof(struct xdp_ring) +
-		q->nentries * sizeof(struct xdp_desc));
+	return sizeof(struct xdp_ring) + q->nentries * sizeof(struct xdp_desc);
 }
 
 struct xsk_queue *xskq_create(u32 nentries, bool umem_queue)
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 928d464e57b9..62e43be407d8 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -223,12 +223,12 @@ static inline void xskq_produce_flush_desc(struct xsk_queue *q)
 
 static inline bool xskq_full_desc(struct xsk_queue *q)
 {
-	return (xskq_nb_avail(q, q->nentries) == q->nentries);
+	return xskq_nb_avail(q, q->nentries) == q->nentries;
 }
 
 static inline bool xskq_empty_desc(struct xsk_queue *q)
 {
-	return (xskq_nb_free(q, q->prod_tail, 1) == q->nentries);
+	return xskq_nb_free(q, q->prod_tail, 1) == q->nentries;
 }
 
 void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props);
-- 
cgit v1.2.3-59-g8ed1b


From c2f4374b9661db28090a63de1ccc40552c4de8fa Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Fri, 18 May 2018 14:00:24 +0200
Subject: xsk: proper '=' alignment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Properly align xsk_proto_ops initialization.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/xdp/xsk.c | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index b8d1cb4d78c0..817340f7725d 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -545,24 +545,24 @@ static struct proto xsk_proto = {
 };
 
 static const struct proto_ops xsk_proto_ops = {
-	.family =	PF_XDP,
-	.owner =	THIS_MODULE,
-	.release =	xsk_release,
-	.bind =		xsk_bind,
-	.connect =	sock_no_connect,
-	.socketpair =	sock_no_socketpair,
-	.accept =	sock_no_accept,
-	.getname =	sock_no_getname,
-	.poll =		xsk_poll,
-	.ioctl =	sock_no_ioctl,
-	.listen =	sock_no_listen,
-	.shutdown =	sock_no_shutdown,
-	.setsockopt =	xsk_setsockopt,
-	.getsockopt =	xsk_getsockopt,
-	.sendmsg =	xsk_sendmsg,
-	.recvmsg =	sock_no_recvmsg,
-	.mmap =		xsk_mmap,
-	.sendpage =	sock_no_sendpage,
+	.family		= PF_XDP,
+	.owner		= THIS_MODULE,
+	.release	= xsk_release,
+	.bind		= xsk_bind,
+	.connect	= sock_no_connect,
+	.socketpair	= sock_no_socketpair,
+	.accept		= sock_no_accept,
+	.getname	= sock_no_getname,
+	.poll		= xsk_poll,
+	.ioctl		= sock_no_ioctl,
+	.listen		= sock_no_listen,
+	.shutdown	= sock_no_shutdown,
+	.setsockopt	= xsk_setsockopt,
+	.getsockopt	= xsk_getsockopt,
+	.sendmsg	= xsk_sendmsg,
+	.recvmsg	= sock_no_recvmsg,
+	.mmap		= xsk_mmap,
+	.sendpage	= sock_no_sendpage,
 };
 
 static void xsk_destruct(struct sock *sk)
-- 
cgit v1.2.3-59-g8ed1b


From 67e1c4068d325d2a15a41c4c1509a69598f693f1 Mon Sep 17 00:00:00 2001
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Fri, 18 May 2018 14:55:58 +0100
Subject: net: stmmac: Enable OSP for GMAC4

This enables OSP (Operate on Second Packet) for GMAC4. The feature
allows DMA to fetch second descriptor while its still processing the
first one.

Running iperf, the performance gain is +/- 38%.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Vitor Soares <soares@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
index 117c3a5288f0..9aab5b35e2d8 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
@@ -94,6 +94,10 @@ static void dwmac4_dma_init_tx_chan(void __iomem *ioaddr,
 
 	value = readl(ioaddr + DMA_CHAN_TX_CONTROL(chan));
 	value = value | (txpbl << DMA_BUS_MODE_PBL_SHIFT);
+
+	/* Enable OSP to get best performance */
+	value |= DMA_CONTROL_OSP;
+
 	writel(value, ioaddr + DMA_CHAN_TX_CONTROL(chan));
 
 	writel(dma_tx_phy, ioaddr + DMA_CHAN_TX_BASE_ADDR(chan));
-- 
cgit v1.2.3-59-g8ed1b


From 4ae0169fd1b3c792b66be58995b7e6b629919ecf Mon Sep 17 00:00:00 2001
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Fri, 18 May 2018 14:55:59 +0100
Subject: net: stmmac: Do not keep rearming the coalesce timer in stmmac_xmit

This is cutting down performance. Once the timer is armed it should run
after the time expires for the first packet sent and not the last one.

After this change, running iperf, the performance gain is +/- 24%.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Vitor Soares <soares@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac.h      | 1 +
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index 42fc76e76bf9..4d425b1a0c59 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -105,6 +105,7 @@ struct stmmac_priv {
 	u32 tx_count_frames;
 	u32 tx_coal_frames;
 	u32 tx_coal_timer;
+	bool tx_timer_armed;
 
 	int tx_coalesce;
 	int hwts_tx_en;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index d9dbe1355896..789bc226ba69 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -3158,13 +3158,16 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 	 * element in case of no SG.
 	 */
 	priv->tx_count_frames += nfrags + 1;
-	if (likely(priv->tx_coal_frames > priv->tx_count_frames)) {
+	if (likely(priv->tx_coal_frames > priv->tx_count_frames) &&
+	    !priv->tx_timer_armed) {
 		mod_timer(&priv->txtimer,
 			  STMMAC_COAL_TIMER(priv->tx_coal_timer));
+		priv->tx_timer_armed = true;
 	} else {
 		priv->tx_count_frames = 0;
 		stmmac_set_tx_ic(priv, desc);
 		priv->xstats.tx_set_ic_bit++;
+		priv->tx_timer_armed = false;
 	}
 
 	skb_tx_timestamp(skb);
-- 
cgit v1.2.3-59-g8ed1b


From 6844171d5b0567a18248593176e23ae97d76b346 Mon Sep 17 00:00:00 2001
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Fri, 18 May 2018 14:56:00 +0100
Subject: net: stmmac: Let descriptor code set skbuff address

Stop using if conditions depending on the GMAC version for setting the
the descriptor skbuff address and use instead a helper implemented in
the descriptor files.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Vitor Soares <soares@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c |  7 ++++++
 drivers/net/ethernet/stmicro/stmmac/enh_desc.c     |  6 ++++++
 drivers/net/ethernet/stmicro/stmmac/hwif.h         |  4 ++++
 drivers/net/ethernet/stmicro/stmmac/norm_desc.c    |  6 ++++++
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 25 ++++++----------------
 5 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
index 65ed896c13cb..f67caa14b2f5 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
@@ -424,6 +424,12 @@ static void dwmac4_set_mss_ctxt(struct dma_desc *p, unsigned int mss)
 	p->des3 = cpu_to_le32(TDES3_CONTEXT_TYPE | TDES3_CTXT_TCMSSV);
 }
 
+static void dwmac4_set_addr(struct dma_desc *p, dma_addr_t addr)
+{
+	p->des0 = cpu_to_le32(addr);
+	p->des1 = 0;
+}
+
 const struct stmmac_desc_ops dwmac4_desc_ops = {
 	.tx_status = dwmac4_wrback_get_tx_status,
 	.rx_status = dwmac4_wrback_get_rx_status,
@@ -445,6 +451,7 @@ const struct stmmac_desc_ops dwmac4_desc_ops = {
 	.init_tx_desc = dwmac4_rd_init_tx_desc,
 	.display_ring = dwmac4_display_ring,
 	.set_mss = dwmac4_set_mss_ctxt,
+	.set_addr = dwmac4_set_addr,
 };
 
 const struct stmmac_mode_ops dwmac4_ring_mode_ops = { };
diff --git a/drivers/net/ethernet/stmicro/stmmac/enh_desc.c b/drivers/net/ethernet/stmicro/stmmac/enh_desc.c
index 3bfb3f584be2..02749e4b1a47 100644
--- a/drivers/net/ethernet/stmicro/stmmac/enh_desc.c
+++ b/drivers/net/ethernet/stmicro/stmmac/enh_desc.c
@@ -437,6 +437,11 @@ static void enh_desc_display_ring(void *head, unsigned int size, bool rx)
 	pr_info("\n");
 }
 
+static void enh_desc_set_addr(struct dma_desc *p, dma_addr_t addr)
+{
+	p->des2 = cpu_to_le32(addr);
+}
+
 const struct stmmac_desc_ops enh_desc_ops = {
 	.tx_status = enh_desc_get_tx_status,
 	.rx_status = enh_desc_get_rx_status,
@@ -457,4 +462,5 @@ const struct stmmac_desc_ops enh_desc_ops = {
 	.get_timestamp = enh_desc_get_timestamp,
 	.get_rx_timestamp_status = enh_desc_get_rx_timestamp_status,
 	.display_ring = enh_desc_display_ring,
+	.set_addr = enh_desc_set_addr,
 };
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index b7539a14e6ad..d66d194d5f89 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -79,6 +79,8 @@ struct stmmac_desc_ops {
 	void (*display_ring)(void *head, unsigned int size, bool rx);
 	/* set MSS via context descriptor */
 	void (*set_mss)(struct dma_desc *p, unsigned int mss);
+	/* set descriptor skbuff address */
+	void (*set_addr)(struct dma_desc *p, dma_addr_t addr);
 };
 
 #define stmmac_init_rx_desc(__priv, __args...) \
@@ -123,6 +125,8 @@ struct stmmac_desc_ops {
 	stmmac_do_void_callback(__priv, desc, display_ring, __args)
 #define stmmac_set_mss(__priv, __args...) \
 	stmmac_do_void_callback(__priv, desc, set_mss, __args)
+#define stmmac_set_desc_addr(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, set_addr, __args)
 
 struct stmmac_dma_cfg;
 struct dma_features;
diff --git a/drivers/net/ethernet/stmicro/stmmac/norm_desc.c b/drivers/net/ethernet/stmicro/stmmac/norm_desc.c
index 7b1d901bf5bc..6cf2c7c5208a 100644
--- a/drivers/net/ethernet/stmicro/stmmac/norm_desc.c
+++ b/drivers/net/ethernet/stmicro/stmmac/norm_desc.c
@@ -297,6 +297,11 @@ static void ndesc_display_ring(void *head, unsigned int size, bool rx)
 	pr_info("\n");
 }
 
+static void ndesc_set_addr(struct dma_desc *p, dma_addr_t addr)
+{
+	p->des2 = cpu_to_le32(addr);
+}
+
 const struct stmmac_desc_ops ndesc_ops = {
 	.tx_status = ndesc_get_tx_status,
 	.rx_status = ndesc_get_rx_status,
@@ -316,4 +321,5 @@ const struct stmmac_desc_ops ndesc_ops = {
 	.get_timestamp = ndesc_get_timestamp,
 	.get_rx_timestamp_status = ndesc_get_rx_timestamp_status,
 	.display_ring = ndesc_display_ring,
+	.set_addr = ndesc_set_addr,
 };
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 789bc226ba69..3f559d72b41f 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -1156,10 +1156,7 @@ static int stmmac_init_rx_buffers(struct stmmac_priv *priv, struct dma_desc *p,
 		return -EINVAL;
 	}
 
-	if (priv->synopsys_id >= DWMAC_CORE_4_00)
-		p->des0 = cpu_to_le32(rx_q->rx_skbuff_dma[i]);
-	else
-		p->des2 = cpu_to_le32(rx_q->rx_skbuff_dma[i]);
+	stmmac_set_desc_addr(priv, p, rx_q->rx_skbuff_dma[i]);
 
 	if (priv->dma_buf_sz == BUF_SIZE_16KiB)
 		stmmac_init_desc3(priv, p);
@@ -3100,10 +3097,8 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 			goto dma_map_err; /* should reuse desc w/o issues */
 
 		tx_q->tx_skbuff_dma[entry].buf = des;
-		if (unlikely(priv->synopsys_id >= DWMAC_CORE_4_00))
-			desc->des0 = cpu_to_le32(des);
-		else
-			desc->des2 = cpu_to_le32(des);
+
+		stmmac_set_desc_addr(priv, desc, des);
 
 		tx_q->tx_skbuff_dma[entry].map_as_page = true;
 		tx_q->tx_skbuff_dma[entry].len = len;
@@ -3185,10 +3180,8 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 			goto dma_map_err;
 
 		tx_q->tx_skbuff_dma[first_entry].buf = des;
-		if (unlikely(priv->synopsys_id >= DWMAC_CORE_4_00))
-			first->des0 = cpu_to_le32(des);
-		else
-			first->des2 = cpu_to_le32(des);
+
+		stmmac_set_desc_addr(priv, first, des);
 
 		tx_q->tx_skbuff_dma[first_entry].len = nopaged_len;
 		tx_q->tx_skbuff_dma[first_entry].last_segment = last_segment;
@@ -3302,13 +3295,7 @@ static inline void stmmac_rx_refill(struct stmmac_priv *priv, u32 queue)
 				break;
 			}
 
-			if (unlikely(priv->synopsys_id >= DWMAC_CORE_4_00)) {
-				p->des0 = cpu_to_le32(rx_q->rx_skbuff_dma[entry]);
-				p->des1 = 0;
-			} else {
-				p->des2 = cpu_to_le32(rx_q->rx_skbuff_dma[entry]);
-			}
-
+			stmmac_set_desc_addr(priv, p, rx_q->rx_skbuff_dma[entry]);
 			stmmac_refill_desc3(priv, rx_q, p);
 
 			if (rx_q->rx_zeroc_thresh > 0)
-- 
cgit v1.2.3-59-g8ed1b


From 44c67f8559fc5e2f818cbb11fde1ee1bebc59e7c Mon Sep 17 00:00:00 2001
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Fri, 18 May 2018 14:56:01 +0100
Subject: net: stmmac: Let descriptor code clear the descriptor

Stop using if conditions depending on the GMAC version for clearing the
descriptor and use instead a helper implemented in the descriptor files.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Vitor Soares <soares@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c | 9 +++++++++
 drivers/net/ethernet/stmicro/stmmac/enh_desc.c     | 6 ++++++
 drivers/net/ethernet/stmicro/stmmac/hwif.h         | 4 ++++
 drivers/net/ethernet/stmicro/stmmac/norm_desc.c    | 6 ++++++
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 9 +--------
 5 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
index f67caa14b2f5..119a2f93dd55 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
@@ -430,6 +430,14 @@ static void dwmac4_set_addr(struct dma_desc *p, dma_addr_t addr)
 	p->des1 = 0;
 }
 
+static void dwmac4_clear(struct dma_desc *p)
+{
+	p->des0 = 0;
+	p->des1 = 0;
+	p->des2 = 0;
+	p->des3 = 0;
+}
+
 const struct stmmac_desc_ops dwmac4_desc_ops = {
 	.tx_status = dwmac4_wrback_get_tx_status,
 	.rx_status = dwmac4_wrback_get_rx_status,
@@ -452,6 +460,7 @@ const struct stmmac_desc_ops dwmac4_desc_ops = {
 	.display_ring = dwmac4_display_ring,
 	.set_mss = dwmac4_set_mss_ctxt,
 	.set_addr = dwmac4_set_addr,
+	.clear = dwmac4_clear,
 };
 
 const struct stmmac_mode_ops dwmac4_ring_mode_ops = { };
diff --git a/drivers/net/ethernet/stmicro/stmmac/enh_desc.c b/drivers/net/ethernet/stmicro/stmmac/enh_desc.c
index 02749e4b1a47..17cd26fb2a3d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/enh_desc.c
+++ b/drivers/net/ethernet/stmicro/stmmac/enh_desc.c
@@ -442,6 +442,11 @@ static void enh_desc_set_addr(struct dma_desc *p, dma_addr_t addr)
 	p->des2 = cpu_to_le32(addr);
 }
 
+static void enh_desc_clear(struct dma_desc *p)
+{
+	p->des2 = 0;
+}
+
 const struct stmmac_desc_ops enh_desc_ops = {
 	.tx_status = enh_desc_get_tx_status,
 	.rx_status = enh_desc_get_rx_status,
@@ -463,4 +468,5 @@ const struct stmmac_desc_ops enh_desc_ops = {
 	.get_rx_timestamp_status = enh_desc_get_rx_timestamp_status,
 	.display_ring = enh_desc_display_ring,
 	.set_addr = enh_desc_set_addr,
+	.clear = enh_desc_clear,
 };
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index d66d194d5f89..a6b9c9790d11 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -81,6 +81,8 @@ struct stmmac_desc_ops {
 	void (*set_mss)(struct dma_desc *p, unsigned int mss);
 	/* set descriptor skbuff address */
 	void (*set_addr)(struct dma_desc *p, dma_addr_t addr);
+	/* clear descriptor */
+	void (*clear)(struct dma_desc *p);
 };
 
 #define stmmac_init_rx_desc(__priv, __args...) \
@@ -127,6 +129,8 @@ struct stmmac_desc_ops {
 	stmmac_do_void_callback(__priv, desc, set_mss, __args)
 #define stmmac_set_desc_addr(__priv, __args...) \
 	stmmac_do_void_callback(__priv, desc, set_addr, __args)
+#define stmmac_clear_desc(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, clear, __args)
 
 struct stmmac_dma_cfg;
 struct dma_features;
diff --git a/drivers/net/ethernet/stmicro/stmmac/norm_desc.c b/drivers/net/ethernet/stmicro/stmmac/norm_desc.c
index 6cf2c7c5208a..a7b221b59178 100644
--- a/drivers/net/ethernet/stmicro/stmmac/norm_desc.c
+++ b/drivers/net/ethernet/stmicro/stmmac/norm_desc.c
@@ -302,6 +302,11 @@ static void ndesc_set_addr(struct dma_desc *p, dma_addr_t addr)
 	p->des2 = cpu_to_le32(addr);
 }
 
+static void ndesc_clear(struct dma_desc *p)
+{
+	p->des2 = 0;
+}
+
 const struct stmmac_desc_ops ndesc_ops = {
 	.tx_status = ndesc_get_tx_status,
 	.rx_status = ndesc_get_rx_status,
@@ -322,4 +327,5 @@ const struct stmmac_desc_ops ndesc_ops = {
 	.get_rx_timestamp_status = ndesc_get_rx_timestamp_status,
 	.display_ring = ndesc_display_ring,
 	.set_addr = ndesc_set_addr,
+	.clear = ndesc_clear,
 };
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 3f559d72b41f..0ccee6a779f9 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -1341,14 +1341,7 @@ static int init_dma_tx_desc_rings(struct net_device *dev)
 			else
 				p = tx_q->dma_tx + i;
 
-			if (priv->synopsys_id >= DWMAC_CORE_4_00) {
-				p->des0 = 0;
-				p->des1 = 0;
-				p->des2 = 0;
-				p->des3 = 0;
-			} else {
-				p->des2 = 0;
-			}
+			stmmac_clear_desc(priv, p);
 
 			tx_q->tx_skbuff_dma[i].buf = 0;
 			tx_q->tx_skbuff_dma[i].map_as_page = false;
-- 
cgit v1.2.3-59-g8ed1b


From ab0204e35c2b4462d50f3d4e8df0c965e4f0688a Mon Sep 17 00:00:00 2001
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Fri, 18 May 2018 14:56:02 +0100
Subject: net: stmmac: Uniformize the use of dma_{rx/tx}_mode callbacks

Instead of relying on the GMAC version for choosing if we need to use
dma_{rx/tx}_mode or just dma_mode callback lets uniformize this and
always use the dma_{rx/tx}_mode callbacks.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Vitor Soares <soares@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c  | 57 ++++++++++--------
 .../net/ethernet/stmicro/stmmac/dwmac1000_dma.c    | 67 ++++++++++++----------
 drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c | 10 ++--
 drivers/net/ethernet/stmicro/stmmac/hwif.h         |  6 --
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 32 ++++-------
 5 files changed, 86 insertions(+), 86 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
index 2f7f0915f071..11c287a7ed52 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
@@ -437,13 +437,36 @@ static int sun8i_dwmac_dma_interrupt(void __iomem *ioaddr,
 	return ret;
 }
 
-static void sun8i_dwmac_dma_operation_mode(void __iomem *ioaddr, int txmode,
-					   int rxmode, int rxfifosz)
+static void sun8i_dwmac_dma_operation_mode_rx(void __iomem *ioaddr, int mode,
+					      u32 channel, int fifosz, u8 qmode)
+{
+	u32 v;
+
+	v = readl(ioaddr + EMAC_RX_CTL1);
+	if (mode == SF_DMA_MODE) {
+		v |= EMAC_RX_MD;
+	} else {
+		v &= ~EMAC_RX_MD;
+		v &= ~EMAC_RX_TH_MASK;
+		if (mode < 32)
+			v |= EMAC_RX_TH_32;
+		else if (mode < 64)
+			v |= EMAC_RX_TH_64;
+		else if (mode < 96)
+			v |= EMAC_RX_TH_96;
+		else if (mode < 128)
+			v |= EMAC_RX_TH_128;
+	}
+	writel(v, ioaddr + EMAC_RX_CTL1);
+}
+
+static void sun8i_dwmac_dma_operation_mode_tx(void __iomem *ioaddr, int mode,
+					      u32 channel, int fifosz, u8 qmode)
 {
 	u32 v;
 
 	v = readl(ioaddr + EMAC_TX_CTL1);
-	if (txmode == SF_DMA_MODE) {
+	if (mode == SF_DMA_MODE) {
 		v |= EMAC_TX_MD;
 		/* Undocumented bit (called TX_NEXT_FRM in BSP), the original
 		 * comment is
@@ -454,40 +477,24 @@ static void sun8i_dwmac_dma_operation_mode(void __iomem *ioaddr, int txmode,
 	} else {
 		v &= ~EMAC_TX_MD;
 		v &= ~EMAC_TX_TH_MASK;
-		if (txmode < 64)
+		if (mode < 64)
 			v |= EMAC_TX_TH_64;
-		else if (txmode < 128)
+		else if (mode < 128)
 			v |= EMAC_TX_TH_128;
-		else if (txmode < 192)
+		else if (mode < 192)
 			v |= EMAC_TX_TH_192;
-		else if (txmode < 256)
+		else if (mode < 256)
 			v |= EMAC_TX_TH_256;
 	}
 	writel(v, ioaddr + EMAC_TX_CTL1);
-
-	v = readl(ioaddr + EMAC_RX_CTL1);
-	if (rxmode == SF_DMA_MODE) {
-		v |= EMAC_RX_MD;
-	} else {
-		v &= ~EMAC_RX_MD;
-		v &= ~EMAC_RX_TH_MASK;
-		if (rxmode < 32)
-			v |= EMAC_RX_TH_32;
-		else if (rxmode < 64)
-			v |= EMAC_RX_TH_64;
-		else if (rxmode < 96)
-			v |= EMAC_RX_TH_96;
-		else if (rxmode < 128)
-			v |= EMAC_RX_TH_128;
-	}
-	writel(v, ioaddr + EMAC_RX_CTL1);
 }
 
 static const struct stmmac_dma_ops sun8i_dwmac_dma_ops = {
 	.reset = sun8i_dwmac_dma_reset,
 	.init = sun8i_dwmac_dma_init,
 	.dump_regs = sun8i_dwmac_dump_regs,
-	.dma_mode = sun8i_dwmac_dma_operation_mode,
+	.dma_rx_mode = sun8i_dwmac_dma_operation_mode_rx,
+	.dma_tx_mode = sun8i_dwmac_dma_operation_mode_tx,
 	.enable_dma_transmission = sun8i_dwmac_enable_dma_transmission,
 	.enable_dma_irq = sun8i_dwmac_enable_dma_irq,
 	.disable_dma_irq = sun8i_dwmac_disable_dma_irq,
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
index 7ecf549c7f1c..d7447b05f7a5 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
@@ -148,12 +148,40 @@ static u32 dwmac1000_configure_fc(u32 csr6, int rxfifosz)
 	return csr6;
 }
 
-static void dwmac1000_dma_operation_mode(void __iomem *ioaddr, int txmode,
-					 int rxmode, int rxfifosz)
+static void dwmac1000_dma_operation_mode_rx(void __iomem *ioaddr, int mode,
+					    u32 channel, int fifosz, u8 qmode)
 {
 	u32 csr6 = readl(ioaddr + DMA_CONTROL);
 
-	if (txmode == SF_DMA_MODE) {
+	if (mode == SF_DMA_MODE) {
+		pr_debug("GMAC: enable RX store and forward mode\n");
+		csr6 |= DMA_CONTROL_RSF;
+	} else {
+		pr_debug("GMAC: disable RX SF mode (threshold %d)\n", mode);
+		csr6 &= ~DMA_CONTROL_RSF;
+		csr6 &= DMA_CONTROL_TC_RX_MASK;
+		if (mode <= 32)
+			csr6 |= DMA_CONTROL_RTC_32;
+		else if (mode <= 64)
+			csr6 |= DMA_CONTROL_RTC_64;
+		else if (mode <= 96)
+			csr6 |= DMA_CONTROL_RTC_96;
+		else
+			csr6 |= DMA_CONTROL_RTC_128;
+	}
+
+	/* Configure flow control based on rx fifo size */
+	csr6 = dwmac1000_configure_fc(csr6, fifosz);
+
+	writel(csr6, ioaddr + DMA_CONTROL);
+}
+
+static void dwmac1000_dma_operation_mode_tx(void __iomem *ioaddr, int mode,
+					    u32 channel, int fifosz, u8 qmode)
+{
+	u32 csr6 = readl(ioaddr + DMA_CONTROL);
+
+	if (mode == SF_DMA_MODE) {
 		pr_debug("GMAC: enable TX store and forward mode\n");
 		/* Transmit COE type 2 cannot be done in cut-through mode. */
 		csr6 |= DMA_CONTROL_TSF;
@@ -162,42 +190,22 @@ static void dwmac1000_dma_operation_mode(void __iomem *ioaddr, int txmode,
 		 */
 		csr6 |= DMA_CONTROL_OSF;
 	} else {
-		pr_debug("GMAC: disabling TX SF (threshold %d)\n", txmode);
+		pr_debug("GMAC: disabling TX SF (threshold %d)\n", mode);
 		csr6 &= ~DMA_CONTROL_TSF;
 		csr6 &= DMA_CONTROL_TC_TX_MASK;
 		/* Set the transmit threshold */
-		if (txmode <= 32)
+		if (mode <= 32)
 			csr6 |= DMA_CONTROL_TTC_32;
-		else if (txmode <= 64)
+		else if (mode <= 64)
 			csr6 |= DMA_CONTROL_TTC_64;
-		else if (txmode <= 128)
+		else if (mode <= 128)
 			csr6 |= DMA_CONTROL_TTC_128;
-		else if (txmode <= 192)
+		else if (mode <= 192)
 			csr6 |= DMA_CONTROL_TTC_192;
 		else
 			csr6 |= DMA_CONTROL_TTC_256;
 	}
 
-	if (rxmode == SF_DMA_MODE) {
-		pr_debug("GMAC: enable RX store and forward mode\n");
-		csr6 |= DMA_CONTROL_RSF;
-	} else {
-		pr_debug("GMAC: disable RX SF mode (threshold %d)\n", rxmode);
-		csr6 &= ~DMA_CONTROL_RSF;
-		csr6 &= DMA_CONTROL_TC_RX_MASK;
-		if (rxmode <= 32)
-			csr6 |= DMA_CONTROL_RTC_32;
-		else if (rxmode <= 64)
-			csr6 |= DMA_CONTROL_RTC_64;
-		else if (rxmode <= 96)
-			csr6 |= DMA_CONTROL_RTC_96;
-		else
-			csr6 |= DMA_CONTROL_RTC_128;
-	}
-
-	/* Configure flow control based on rx fifo size */
-	csr6 = dwmac1000_configure_fc(csr6, rxfifosz);
-
 	writel(csr6, ioaddr + DMA_CONTROL);
 }
 
@@ -258,7 +266,8 @@ const struct stmmac_dma_ops dwmac1000_dma_ops = {
 	.init = dwmac1000_dma_init,
 	.axi = dwmac1000_dma_axi,
 	.dump_regs = dwmac1000_dump_dma_regs,
-	.dma_mode = dwmac1000_dma_operation_mode,
+	.dma_rx_mode = dwmac1000_dma_operation_mode_rx,
+	.dma_tx_mode = dwmac1000_dma_operation_mode_tx,
 	.enable_dma_transmission = dwmac_enable_dma_transmission,
 	.enable_dma_irq = dwmac_enable_dma_irq,
 	.disable_dma_irq = dwmac_disable_dma_irq,
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c
index 6502b9aa3bf5..80339d3d71c7 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c
@@ -51,14 +51,14 @@ static void dwmac100_dma_init(void __iomem *ioaddr,
  * The transmit threshold can be programmed by setting the TTC bits in the DMA
  * control register.
  */
-static void dwmac100_dma_operation_mode(void __iomem *ioaddr, int txmode,
-					int rxmode, int rxfifosz)
+static void dwmac100_dma_operation_mode_tx(void __iomem *ioaddr, int mode,
+					   u32 channel, int fifosz, u8 qmode)
 {
 	u32 csr6 = readl(ioaddr + DMA_CONTROL);
 
-	if (txmode <= 32)
+	if (mode <= 32)
 		csr6 |= DMA_CONTROL_TTC_32;
-	else if (txmode <= 64)
+	else if (mode <= 64)
 		csr6 |= DMA_CONTROL_TTC_64;
 	else
 		csr6 |= DMA_CONTROL_TTC_128;
@@ -113,7 +113,7 @@ const struct stmmac_dma_ops dwmac100_dma_ops = {
 	.reset = dwmac_dma_reset,
 	.init = dwmac100_dma_init,
 	.dump_regs = dwmac100_dump_dma_regs,
-	.dma_mode = dwmac100_dma_operation_mode,
+	.dma_tx_mode = dwmac100_dma_operation_mode_tx,
 	.dma_diagnostic_fr = dwmac100_dma_diagnostic_fr,
 	.enable_dma_transmission = dwmac_enable_dma_transmission,
 	.enable_dma_irq = dwmac_enable_dma_irq,
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index a6b9c9790d11..3ff4afe5e452 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -153,10 +153,6 @@ struct stmmac_dma_ops {
 	void (*axi)(void __iomem *ioaddr, struct stmmac_axi *axi);
 	/* Dump DMA registers */
 	void (*dump_regs)(void __iomem *ioaddr, u32 *reg_space);
-	/* Set tx/rx threshold in the csr6 register
-	 * An invalid value enables the store-and-forward mode */
-	void (*dma_mode)(void __iomem *ioaddr, int txmode, int rxmode,
-			 int rxfifosz);
 	void (*dma_rx_mode)(void __iomem *ioaddr, int mode, u32 channel,
 			    int fifosz, u8 qmode);
 	void (*dma_tx_mode)(void __iomem *ioaddr, int mode, u32 channel,
@@ -199,8 +195,6 @@ struct stmmac_dma_ops {
 	stmmac_do_void_callback(__priv, dma, axi, __args)
 #define stmmac_dump_dma_regs(__priv, __args...) \
 	stmmac_do_void_callback(__priv, dma, dump_regs, __args)
-#define stmmac_dma_mode(__priv, __args...) \
-	stmmac_do_void_callback(__priv, dma, dma_mode, __args)
 #define stmmac_dma_rx_mode(__priv, __args...) \
 	stmmac_do_void_callback(__priv, dma, dma_rx_mode, __args)
 #define stmmac_dma_tx_mode(__priv, __args...) \
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 0ccee6a779f9..beb7ec12dfa6 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -1787,22 +1787,18 @@ static void stmmac_dma_operation_mode(struct stmmac_priv *priv)
 	}
 
 	/* configure all channels */
-	if (priv->synopsys_id >= DWMAC_CORE_4_00) {
-		for (chan = 0; chan < rx_channels_count; chan++) {
-			qmode = priv->plat->rx_queues_cfg[chan].mode_to_use;
+	for (chan = 0; chan < rx_channels_count; chan++) {
+		qmode = priv->plat->rx_queues_cfg[chan].mode_to_use;
 
-			stmmac_dma_rx_mode(priv, priv->ioaddr, rxmode, chan,
-					rxfifosz, qmode);
-		}
+		stmmac_dma_rx_mode(priv, priv->ioaddr, rxmode, chan,
+				rxfifosz, qmode);
+	}
 
-		for (chan = 0; chan < tx_channels_count; chan++) {
-			qmode = priv->plat->tx_queues_cfg[chan].mode_to_use;
+	for (chan = 0; chan < tx_channels_count; chan++) {
+		qmode = priv->plat->tx_queues_cfg[chan].mode_to_use;
 
-			stmmac_dma_tx_mode(priv, priv->ioaddr, txmode, chan,
-					txfifosz, qmode);
-		}
-	} else {
-		stmmac_dma_mode(priv, priv->ioaddr, txmode, rxmode, rxfifosz);
+		stmmac_dma_tx_mode(priv, priv->ioaddr, txmode, chan,
+				txfifosz, qmode);
 	}
 }
 
@@ -1971,14 +1967,8 @@ static void stmmac_set_dma_operation_mode(struct stmmac_priv *priv, u32 txmode,
 	rxfifosz /= rx_channels_count;
 	txfifosz /= tx_channels_count;
 
-	if (priv->synopsys_id >= DWMAC_CORE_4_00) {
-		stmmac_dma_rx_mode(priv, priv->ioaddr, rxmode, chan, rxfifosz,
-				rxqmode);
-		stmmac_dma_tx_mode(priv, priv->ioaddr, txmode, chan, txfifosz,
-				txqmode);
-	} else {
-		stmmac_dma_mode(priv, priv->ioaddr, txmode, rxmode, rxfifosz);
-	}
+	stmmac_dma_rx_mode(priv, priv->ioaddr, rxmode, chan, rxfifosz, rxqmode);
+	stmmac_dma_tx_mode(priv, priv->ioaddr, txmode, chan, txfifosz, txqmode);
 }
 
 static bool stmmac_safety_feat_interrupt(struct stmmac_priv *priv)
-- 
cgit v1.2.3-59-g8ed1b


From 63a550fc151c5b20bb6d7c7c21dca8e56a67620f Mon Sep 17 00:00:00 2001
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Fri, 18 May 2018 14:56:03 +0100
Subject: net: stmmac: Remove uneeded checks for GMAC version

With the introducion of callbacks check in hwif.h we only call the
callback if HW supports it so there is no longer need to check for GMAC
version.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Vitor Soares <soares@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index beb7ec12dfa6..ce6f839ccbea 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -1973,11 +1973,8 @@ static void stmmac_set_dma_operation_mode(struct stmmac_priv *priv, u32 txmode,
 
 static bool stmmac_safety_feat_interrupt(struct stmmac_priv *priv)
 {
-	int ret = false;
+	int ret;
 
-	/* Safety features are only available in cores >= 5.10 */
-	if (priv->synopsys_id < DWMAC_CORE_5_10)
-		return ret;
 	ret = stmmac_safety_feat_irq_status(priv, priv->dev,
 			priv->ioaddr, priv->dma_cap.asp, &priv->sstats);
 	if (ret && (ret != -EINVAL)) {
@@ -2495,12 +2492,10 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp)
 	stmmac_core_init(priv, priv->hw, dev);
 
 	/* Initialize MTL*/
-	if (priv->synopsys_id >= DWMAC_CORE_4_00)
-		stmmac_mtl_configuration(priv);
+	stmmac_mtl_configuration(priv);
 
 	/* Initialize Safety Features */
-	if (priv->synopsys_id >= DWMAC_CORE_5_10)
-		stmmac_safety_feat_configuration(priv);
+	stmmac_safety_feat_configuration(priv);
 
 	ret = stmmac_rx_ipc(priv, priv->hw);
 	if (!ret) {
@@ -3054,10 +3049,9 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (enh_desc)
 		is_jumbo = stmmac_is_jumbo_frm(priv, skb->len, enh_desc);
 
-	if (unlikely(is_jumbo) && likely(priv->synopsys_id <
-					 DWMAC_CORE_4_00)) {
+	if (unlikely(is_jumbo)) {
 		entry = stmmac_jumbo_frm(priv, tx_q, skb, csum_insertion);
-		if (unlikely(entry < 0))
+		if (unlikely(entry < 0) && (entry != -EINVAL))
 			goto dma_map_err;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 758d5c73e2a3e9d06708112e1034b4fa3e9df53f Mon Sep 17 00:00:00 2001
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Fri, 18 May 2018 14:56:04 +0100
Subject: net: stmmac: Move PTP and MMC base address calculation to hwif.c

PTP and MMC modules base address can depend on the GMAC version. As this
is HW specific lets move this base address calculation to hwif.c. Also,
add an entry in the HW table so that we can specify the module offset.
This can later be extended to more modules, if deemed necessary.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Vitor Soares <soares@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/hwif.c        | 34 +++++++++++++++++++++++
 drivers/net/ethernet/stmicro/stmmac/hwif.h        |  5 ++++
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c |  8 ------
 3 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.c b/drivers/net/ethernet/stmicro/stmmac/hwif.c
index 9acc8d2f1039..23a12649e247 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.c
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.c
@@ -6,6 +6,7 @@
 
 #include "common.h"
 #include "stmmac.h"
+#include "stmmac_ptp.h"
 
 static u32 stmmac_get_id(struct stmmac_priv *priv, u32 id_reg)
 {
@@ -72,6 +73,7 @@ static const struct stmmac_hwif_entry {
 	bool gmac;
 	bool gmac4;
 	u32 min_id;
+	const struct stmmac_regs_off regs;
 	const void *desc;
 	const void *dma;
 	const void *mac;
@@ -86,6 +88,10 @@ static const struct stmmac_hwif_entry {
 		.gmac = false,
 		.gmac4 = false,
 		.min_id = 0,
+		.regs = {
+			.ptp_off = PTP_GMAC3_X_OFFSET,
+			.mmc_off = MMC_GMAC3_X_OFFSET,
+		},
 		.desc = NULL,
 		.dma = &dwmac100_dma_ops,
 		.mac = &dwmac100_ops,
@@ -98,6 +104,10 @@ static const struct stmmac_hwif_entry {
 		.gmac = true,
 		.gmac4 = false,
 		.min_id = 0,
+		.regs = {
+			.ptp_off = PTP_GMAC3_X_OFFSET,
+			.mmc_off = MMC_GMAC3_X_OFFSET,
+		},
 		.desc = NULL,
 		.dma = &dwmac1000_dma_ops,
 		.mac = &dwmac1000_ops,
@@ -110,6 +120,10 @@ static const struct stmmac_hwif_entry {
 		.gmac = false,
 		.gmac4 = true,
 		.min_id = 0,
+		.regs = {
+			.ptp_off = PTP_GMAC4_OFFSET,
+			.mmc_off = MMC_GMAC4_OFFSET,
+		},
 		.desc = &dwmac4_desc_ops,
 		.dma = &dwmac4_dma_ops,
 		.mac = &dwmac4_ops,
@@ -122,6 +136,10 @@ static const struct stmmac_hwif_entry {
 		.gmac = false,
 		.gmac4 = true,
 		.min_id = DWMAC_CORE_4_00,
+		.regs = {
+			.ptp_off = PTP_GMAC4_OFFSET,
+			.mmc_off = MMC_GMAC4_OFFSET,
+		},
 		.desc = &dwmac4_desc_ops,
 		.dma = &dwmac4_dma_ops,
 		.mac = &dwmac410_ops,
@@ -134,6 +152,10 @@ static const struct stmmac_hwif_entry {
 		.gmac = false,
 		.gmac4 = true,
 		.min_id = DWMAC_CORE_4_10,
+		.regs = {
+			.ptp_off = PTP_GMAC4_OFFSET,
+			.mmc_off = MMC_GMAC4_OFFSET,
+		},
 		.desc = &dwmac4_desc_ops,
 		.dma = &dwmac410_dma_ops,
 		.mac = &dwmac410_ops,
@@ -146,6 +168,10 @@ static const struct stmmac_hwif_entry {
 		.gmac = false,
 		.gmac4 = true,
 		.min_id = DWMAC_CORE_5_10,
+		.regs = {
+			.ptp_off = PTP_GMAC4_OFFSET,
+			.mmc_off = MMC_GMAC4_OFFSET,
+		},
 		.desc = &dwmac4_desc_ops,
 		.dma = &dwmac410_dma_ops,
 		.mac = &dwmac510_ops,
@@ -175,6 +201,12 @@ int stmmac_hwif_init(struct stmmac_priv *priv)
 	/* Save ID for later use */
 	priv->synopsys_id = id;
 
+	/* Lets assume some safe values first */
+	priv->ptpaddr = priv->ioaddr +
+		(needs_gmac4 ? PTP_GMAC4_OFFSET : PTP_GMAC3_X_OFFSET);
+	priv->mmcaddr = priv->ioaddr +
+		(needs_gmac4 ? MMC_GMAC4_OFFSET : MMC_GMAC3_X_OFFSET);
+
 	/* Check for HW specific setup first */
 	if (priv->plat->setup) {
 		priv->hw = priv->plat->setup(priv);
@@ -206,6 +238,8 @@ int stmmac_hwif_init(struct stmmac_priv *priv)
 		mac->tc = entry->tc;
 
 		priv->hw = mac;
+		priv->ptpaddr = priv->ioaddr + entry->regs.ptp_off;
+		priv->mmcaddr = priv->ioaddr + entry->regs.mmc_off;
 
 		/* Entry found */
 		ret = entry->setup(priv);
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index 3ff4afe5e452..06fb20b27abf 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -442,6 +442,11 @@ struct stmmac_tc_ops {
 #define stmmac_tc_setup_cls_u32(__priv, __args...) \
 	stmmac_do_callback(__priv, tc, setup_cls_u32, __args)
 
+struct stmmac_regs_off {
+	u32 ptp_off;
+	u32 mmc_off;
+};
+
 extern const struct stmmac_ops dwmac100_ops;
 extern const struct stmmac_dma_ops dwmac100_dma_ops;
 extern const struct stmmac_ops dwmac1000_ops;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index ce6f839ccbea..a4d6ea7eb051 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -2085,14 +2085,6 @@ static void stmmac_mmc_setup(struct stmmac_priv *priv)
 	unsigned int mode = MMC_CNTRL_RESET_ON_READ | MMC_CNTRL_COUNTER_RESET |
 			    MMC_CNTRL_PRESET | MMC_CNTRL_FULL_HALF_PRESET;
 
-	if (priv->synopsys_id >= DWMAC_CORE_4_00) {
-		priv->ptpaddr = priv->ioaddr + PTP_GMAC4_OFFSET;
-		priv->mmcaddr = priv->ioaddr + MMC_GMAC4_OFFSET;
-	} else {
-		priv->ptpaddr = priv->ioaddr + PTP_GMAC3_X_OFFSET;
-		priv->mmcaddr = priv->ioaddr + MMC_GMAC3_X_OFFSET;
-	}
-
 	dwmac_mmc_intr_all_mask(priv->mmcaddr);
 
 	if (priv->dma_cap.rmon) {
-- 
cgit v1.2.3-59-g8ed1b


From 24aaed0cc0953fce632d539a148025f0e4a82bcf Mon Sep 17 00:00:00 2001
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Fri, 18 May 2018 14:56:05 +0100
Subject: net: stmmac: Uniformize the use of dma_init_* callbacks

Instead of relying on the GMAC version for choosing if we need to use
dma_init or dma_init_{rx/tx}_chan callback, lets uniformize this and
always use the dma_init_{rx/tx}_chan callbacks.

While at it, fix the use of dma_init_chan callback, which shall be
called for as many channels as the max of rx/tx channels.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Vitor Soares <soares@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c  | 25 ++++++--
 .../net/ethernet/stmicro/stmmac/dwmac1000_dma.c    | 25 +++++---
 drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c | 25 +++++---
 drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c   |  3 +-
 drivers/net/ethernet/stmicro/stmmac/hwif.h         |  2 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 66 +++++++++-------------
 6 files changed, 85 insertions(+), 61 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
index 11c287a7ed52..2e6e2a96b4f2 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
@@ -276,17 +276,28 @@ static int sun8i_dwmac_dma_reset(void __iomem *ioaddr)
  * Called from stmmac via stmmac_dma_ops->init
  */
 static void sun8i_dwmac_dma_init(void __iomem *ioaddr,
-				 struct stmmac_dma_cfg *dma_cfg,
-				 u32 dma_tx, u32 dma_rx, int atds)
+				 struct stmmac_dma_cfg *dma_cfg, int atds)
 {
-	/* Write TX and RX descriptors address */
-	writel(dma_rx, ioaddr + EMAC_RX_DESC_LIST);
-	writel(dma_tx, ioaddr + EMAC_TX_DESC_LIST);
-
 	writel(EMAC_RX_INT | EMAC_TX_INT, ioaddr + EMAC_INT_EN);
 	writel(0x1FFFFFF, ioaddr + EMAC_INT_STA);
 }
 
+static void sun8i_dwmac_dma_init_rx(void __iomem *ioaddr,
+				    struct stmmac_dma_cfg *dma_cfg,
+				    u32 dma_rx_phy, u32 chan)
+{
+	/* Write RX descriptors address */
+	writel(dma_rx_phy, ioaddr + EMAC_RX_DESC_LIST);
+}
+
+static void sun8i_dwmac_dma_init_tx(void __iomem *ioaddr,
+				    struct stmmac_dma_cfg *dma_cfg,
+				    u32 dma_tx_phy, u32 chan)
+{
+	/* Write TX descriptors address */
+	writel(dma_tx_phy, ioaddr + EMAC_TX_DESC_LIST);
+}
+
 /* sun8i_dwmac_dump_regs() - Dump EMAC address space
  * Called from stmmac_dma_ops->dump_regs
  * Used for ethtool
@@ -492,6 +503,8 @@ static void sun8i_dwmac_dma_operation_mode_tx(void __iomem *ioaddr, int mode,
 static const struct stmmac_dma_ops sun8i_dwmac_dma_ops = {
 	.reset = sun8i_dwmac_dma_reset,
 	.init = sun8i_dwmac_dma_init,
+	.init_rx_chan = sun8i_dwmac_dma_init_rx,
+	.init_tx_chan = sun8i_dwmac_dma_init_tx,
 	.dump_regs = sun8i_dwmac_dump_regs,
 	.dma_rx_mode = sun8i_dwmac_dma_operation_mode_rx,
 	.dma_tx_mode = sun8i_dwmac_dma_operation_mode_tx,
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
index d7447b05f7a5..aacc4aa80e3c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
@@ -81,8 +81,7 @@ static void dwmac1000_dma_axi(void __iomem *ioaddr, struct stmmac_axi *axi)
 }
 
 static void dwmac1000_dma_init(void __iomem *ioaddr,
-			       struct stmmac_dma_cfg *dma_cfg,
-			       u32 dma_tx, u32 dma_rx, int atds)
+			       struct stmmac_dma_cfg *dma_cfg, int atds)
 {
 	u32 value = readl(ioaddr + DMA_BUS_MODE);
 	int txpbl = dma_cfg->txpbl ?: dma_cfg->pbl;
@@ -119,12 +118,22 @@ static void dwmac1000_dma_init(void __iomem *ioaddr,
 
 	/* Mask interrupts by writing to CSR7 */
 	writel(DMA_INTR_DEFAULT_MASK, ioaddr + DMA_INTR_ENA);
+}
 
-	/* RX/TX descriptor base address lists must be written into
-	 * DMA CSR3 and CSR4, respectively
-	 */
-	writel(dma_tx, ioaddr + DMA_TX_BASE_ADDR);
-	writel(dma_rx, ioaddr + DMA_RCV_BASE_ADDR);
+static void dwmac1000_dma_init_rx(void __iomem *ioaddr,
+				  struct stmmac_dma_cfg *dma_cfg,
+				  u32 dma_rx_phy, u32 chan)
+{
+	/* RX descriptor base address list must be written into DMA CSR3 */
+	writel(dma_rx_phy, ioaddr + DMA_RCV_BASE_ADDR);
+}
+
+static void dwmac1000_dma_init_tx(void __iomem *ioaddr,
+				  struct stmmac_dma_cfg *dma_cfg,
+				  u32 dma_tx_phy, u32 chan)
+{
+	/* TX descriptor base address list must be written into DMA CSR4 */
+	writel(dma_tx_phy, ioaddr + DMA_TX_BASE_ADDR);
 }
 
 static u32 dwmac1000_configure_fc(u32 csr6, int rxfifosz)
@@ -264,6 +273,8 @@ static void dwmac1000_rx_watchdog(void __iomem *ioaddr, u32 riwt,
 const struct stmmac_dma_ops dwmac1000_dma_ops = {
 	.reset = dwmac_dma_reset,
 	.init = dwmac1000_dma_init,
+	.init_rx_chan = dwmac1000_dma_init_rx,
+	.init_tx_chan = dwmac1000_dma_init_tx,
 	.axi = dwmac1000_dma_axi,
 	.dump_regs = dwmac1000_dump_dma_regs,
 	.dma_rx_mode = dwmac1000_dma_operation_mode_rx,
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c
index 80339d3d71c7..21dee25ee570 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c
@@ -29,8 +29,7 @@
 #include "dwmac_dma.h"
 
 static void dwmac100_dma_init(void __iomem *ioaddr,
-			      struct stmmac_dma_cfg *dma_cfg,
-			      u32 dma_tx, u32 dma_rx, int atds)
+			      struct stmmac_dma_cfg *dma_cfg, int atds)
 {
 	/* Enable Application Access by writing to DMA CSR0 */
 	writel(DMA_BUS_MODE_DEFAULT | (dma_cfg->pbl << DMA_BUS_MODE_PBL_SHIFT),
@@ -38,12 +37,22 @@ static void dwmac100_dma_init(void __iomem *ioaddr,
 
 	/* Mask interrupts by writing to CSR7 */
 	writel(DMA_INTR_DEFAULT_MASK, ioaddr + DMA_INTR_ENA);
+}
 
-	/* RX/TX descriptor base addr lists must be written into
-	 * DMA CSR3 and CSR4, respectively
-	 */
-	writel(dma_tx, ioaddr + DMA_TX_BASE_ADDR);
-	writel(dma_rx, ioaddr + DMA_RCV_BASE_ADDR);
+static void dwmac100_dma_init_rx(void __iomem *ioaddr,
+				 struct stmmac_dma_cfg *dma_cfg,
+				 u32 dma_rx_phy, u32 chan)
+{
+	/* RX descriptor base addr lists must be written into DMA CSR3 */
+	writel(dma_rx_phy, ioaddr + DMA_RCV_BASE_ADDR);
+}
+
+static void dwmac100_dma_init_tx(void __iomem *ioaddr,
+				 struct stmmac_dma_cfg *dma_cfg,
+				 u32 dma_tx_phy, u32 chan)
+{
+	/* TX descriptor base addr lists must be written into DMA CSR4 */
+	writel(dma_tx_phy, ioaddr + DMA_TX_BASE_ADDR);
 }
 
 /* Store and Forward capability is not used at all.
@@ -112,6 +121,8 @@ static void dwmac100_dma_diagnostic_fr(void *data, struct stmmac_extra_stats *x,
 const struct stmmac_dma_ops dwmac100_dma_ops = {
 	.reset = dwmac_dma_reset,
 	.init = dwmac100_dma_init,
+	.init_rx_chan = dwmac100_dma_init_rx,
+	.init_tx_chan = dwmac100_dma_init_tx,
 	.dump_regs = dwmac100_dump_dma_regs,
 	.dma_tx_mode = dwmac100_dma_operation_mode_tx,
 	.dma_diagnostic_fr = dwmac100_dma_diagnostic_fr,
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
index 9aab5b35e2d8..bf8e5a16f11c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
@@ -120,8 +120,7 @@ static void dwmac4_dma_init_channel(void __iomem *ioaddr,
 }
 
 static void dwmac4_dma_init(void __iomem *ioaddr,
-			    struct stmmac_dma_cfg *dma_cfg,
-			    u32 dma_tx, u32 dma_rx, int atds)
+			    struct stmmac_dma_cfg *dma_cfg, int atds)
 {
 	u32 value = readl(ioaddr + DMA_SYS_BUS_MODE);
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index 06fb20b27abf..1c674d61060c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -140,7 +140,7 @@ struct stmmac_dma_ops {
 	/* DMA core initialization */
 	int (*reset)(void __iomem *ioaddr);
 	void (*init)(void __iomem *ioaddr, struct stmmac_dma_cfg *dma_cfg,
-		     u32 dma_tx, u32 dma_rx, int atds);
+		     int atds);
 	void (*init_chan)(void __iomem *ioaddr,
 			  struct stmmac_dma_cfg *dma_cfg, u32 chan);
 	void (*init_rx_chan)(void __iomem *ioaddr,
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index a4d6ea7eb051..34c1fcc23fb6 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -2138,10 +2138,9 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv)
 {
 	u32 rx_channels_count = priv->plat->rx_queues_to_use;
 	u32 tx_channels_count = priv->plat->tx_queues_to_use;
+	u32 dma_csr_ch = max(rx_channels_count, tx_channels_count);
 	struct stmmac_rx_queue *rx_q;
 	struct stmmac_tx_queue *tx_q;
-	u32 dummy_dma_rx_phy = 0;
-	u32 dummy_dma_tx_phy = 0;
 	u32 chan = 0;
 	int atds = 0;
 	int ret = 0;
@@ -2160,48 +2159,39 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv)
 		return ret;
 	}
 
-	if (priv->synopsys_id >= DWMAC_CORE_4_00) {
-		/* DMA Configuration */
-		stmmac_dma_init(priv, priv->ioaddr, priv->plat->dma_cfg,
-				dummy_dma_tx_phy, dummy_dma_rx_phy, atds);
-
-		/* DMA RX Channel Configuration */
-		for (chan = 0; chan < rx_channels_count; chan++) {
-			rx_q = &priv->rx_queue[chan];
-
-			stmmac_init_rx_chan(priv, priv->ioaddr,
-					priv->plat->dma_cfg, rx_q->dma_rx_phy,
-					chan);
-
-			rx_q->rx_tail_addr = rx_q->dma_rx_phy +
-				    (DMA_RX_SIZE * sizeof(struct dma_desc));
-			stmmac_set_rx_tail_ptr(priv, priv->ioaddr,
-					rx_q->rx_tail_addr, chan);
-		}
-
-		/* DMA TX Channel Configuration */
-		for (chan = 0; chan < tx_channels_count; chan++) {
-			tx_q = &priv->tx_queue[chan];
+	/* DMA RX Channel Configuration */
+	for (chan = 0; chan < rx_channels_count; chan++) {
+		rx_q = &priv->rx_queue[chan];
 
-			stmmac_init_chan(priv, priv->ioaddr,
-					priv->plat->dma_cfg, chan);
+		stmmac_init_rx_chan(priv, priv->ioaddr, priv->plat->dma_cfg,
+				    rx_q->dma_rx_phy, chan);
 
-			stmmac_init_tx_chan(priv, priv->ioaddr,
-					priv->plat->dma_cfg, tx_q->dma_tx_phy,
-					chan);
+		rx_q->rx_tail_addr = rx_q->dma_rx_phy +
+			    (DMA_RX_SIZE * sizeof(struct dma_desc));
+		stmmac_set_rx_tail_ptr(priv, priv->ioaddr,
+				       rx_q->rx_tail_addr, chan);
+	}
 
-			tx_q->tx_tail_addr = tx_q->dma_tx_phy +
-				    (DMA_TX_SIZE * sizeof(struct dma_desc));
-			stmmac_set_tx_tail_ptr(priv, priv->ioaddr,
-					tx_q->tx_tail_addr, chan);
-		}
-	} else {
-		rx_q = &priv->rx_queue[chan];
+	/* DMA TX Channel Configuration */
+	for (chan = 0; chan < tx_channels_count; chan++) {
 		tx_q = &priv->tx_queue[chan];
-		stmmac_dma_init(priv, priv->ioaddr, priv->plat->dma_cfg,
-				tx_q->dma_tx_phy, rx_q->dma_rx_phy, atds);
+
+		stmmac_init_tx_chan(priv, priv->ioaddr, priv->plat->dma_cfg,
+				    tx_q->dma_tx_phy, chan);
+
+		tx_q->tx_tail_addr = tx_q->dma_tx_phy +
+			    (DMA_TX_SIZE * sizeof(struct dma_desc));
+		stmmac_set_tx_tail_ptr(priv, priv->ioaddr,
+				       tx_q->tx_tail_addr, chan);
 	}
 
+	/* DMA CSR Channel configuration */
+	for (chan = 0; chan < dma_csr_ch; chan++)
+		stmmac_init_chan(priv, priv->ioaddr, priv->plat->dma_cfg, chan);
+
+	/* DMA Configuration */
+	stmmac_dma_init(priv, priv->ioaddr, priv->plat->dma_cfg, atds);
+
 	if (priv->plat->axi)
 		stmmac_axi(priv, priv->ioaddr, priv->plat->axi);
 
-- 
cgit v1.2.3-59-g8ed1b


From f1565c60210f049e068549bfc2a945331e0ee288 Mon Sep 17 00:00:00 2001
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Fri, 18 May 2018 14:56:06 +0100
Subject: net: stmmac: Remove uneeded check for GMAC version in stmmac_xmit

We either have .enable_dma_transmission or .set_tx_tail_ptr in the HW
table callbacks, we can never have both so there is no need to check for
GMAC version.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Vitor Soares <soares@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h  | 1 -
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 7 ++-----
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h
index 8474bf961dd0..c63c1fe3f26b 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h
@@ -184,7 +184,6 @@
 #define DMA_CHAN0_DBG_STAT_RPS_SHIFT	8
 
 int dwmac4_dma_reset(void __iomem *ioaddr);
-void dwmac4_enable_dma_transmission(void __iomem *ioaddr, u32 tail_ptr);
 void dwmac4_enable_dma_irq(void __iomem *ioaddr, u32 chan);
 void dwmac410_enable_dma_irq(void __iomem *ioaddr, u32 chan);
 void dwmac4_disable_dma_irq(void __iomem *ioaddr, u32 chan);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 34c1fcc23fb6..1e7ded6dd7b3 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -3166,11 +3166,8 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	netdev_tx_sent_queue(netdev_get_tx_queue(dev, queue), skb->len);
 
-	if (priv->synopsys_id < DWMAC_CORE_4_00)
-		stmmac_enable_dma_transmission(priv, priv->ioaddr);
-	else
-		stmmac_set_tx_tail_ptr(priv, priv->ioaddr, tx_q->tx_tail_addr,
-				queue);
+	stmmac_enable_dma_transmission(priv, priv->ioaddr);
+	stmmac_set_tx_tail_ptr(priv, priv->ioaddr, tx_q->tx_tail_addr, queue);
 
 	return NETDEV_TX_OK;
 
-- 
cgit v1.2.3-59-g8ed1b


From 357951cdf0dcdcc2bacb00c2e7665217a2fe34bf Mon Sep 17 00:00:00 2001
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Fri, 18 May 2018 14:56:07 +0100
Subject: net: stmmac: Uniformize set_rx_owner()

Currently an if condition is used to select the correct callback to set
rx_onwer in descriptor. Lets keep this simple and always use the same
callback.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Vitor Soares <soares@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c | 12 ++++++------
 drivers/net/ethernet/stmicro/stmmac/enh_desc.c     |  2 +-
 drivers/net/ethernet/stmicro/stmmac/hwif.h         |  2 +-
 drivers/net/ethernet/stmicro/stmmac/norm_desc.c    |  2 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  |  5 +----
 5 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
index 119a2f93dd55..63f869ce28af 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
@@ -189,9 +189,12 @@ static void dwmac4_set_tx_owner(struct dma_desc *p)
 	p->des3 |= cpu_to_le32(TDES3_OWN);
 }
 
-static void dwmac4_set_rx_owner(struct dma_desc *p)
+static void dwmac4_set_rx_owner(struct dma_desc *p, int disable_rx_ic)
 {
-	p->des3 |= cpu_to_le32(RDES3_OWN);
+	p->des3 = cpu_to_le32(RDES3_OWN | RDES3_BUFFER1_VALID_ADDR);
+
+	if (!disable_rx_ic)
+		p->des3 |= cpu_to_le32(RDES3_INT_ON_COMPLETION_EN);
 }
 
 static int dwmac4_get_tx_ls(struct dma_desc *p)
@@ -292,10 +295,7 @@ exit:
 static void dwmac4_rd_init_rx_desc(struct dma_desc *p, int disable_rx_ic,
 				   int mode, int end)
 {
-	p->des3 = cpu_to_le32(RDES3_OWN | RDES3_BUFFER1_VALID_ADDR);
-
-	if (!disable_rx_ic)
-		p->des3 |= cpu_to_le32(RDES3_INT_ON_COMPLETION_EN);
+	dwmac4_set_rx_owner(p, disable_rx_ic);
 }
 
 static void dwmac4_rd_init_tx_desc(struct dma_desc *p, int mode, int end)
diff --git a/drivers/net/ethernet/stmicro/stmmac/enh_desc.c b/drivers/net/ethernet/stmicro/stmmac/enh_desc.c
index 17cd26fb2a3d..743a60f6ba2b 100644
--- a/drivers/net/ethernet/stmicro/stmmac/enh_desc.c
+++ b/drivers/net/ethernet/stmicro/stmmac/enh_desc.c
@@ -292,7 +292,7 @@ static void enh_desc_set_tx_owner(struct dma_desc *p)
 	p->des0 |= cpu_to_le32(ETDES0_OWN);
 }
 
-static void enh_desc_set_rx_owner(struct dma_desc *p)
+static void enh_desc_set_rx_owner(struct dma_desc *p, int disable_rx_ic)
 {
 	p->des0 |= cpu_to_le32(RDES0_OWN);
 }
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index 1c674d61060c..06b5e5b15069 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -59,7 +59,7 @@ struct stmmac_desc_ops {
 	/* Get the buffer size from the descriptor */
 	int (*get_tx_len)(struct dma_desc *p);
 	/* Handle extra events on specific interrupts hw dependent */
-	void (*set_rx_owner)(struct dma_desc *p);
+	void (*set_rx_owner)(struct dma_desc *p, int disable_rx_ic);
 	/* Get the receive frame size */
 	int (*get_rx_frame_len)(struct dma_desc *p, int rx_coe_type);
 	/* Return the reception status looking at the RDES1 */
diff --git a/drivers/net/ethernet/stmicro/stmmac/norm_desc.c b/drivers/net/ethernet/stmicro/stmmac/norm_desc.c
index a7b221b59178..2facdb5d1aa0 100644
--- a/drivers/net/ethernet/stmicro/stmmac/norm_desc.c
+++ b/drivers/net/ethernet/stmicro/stmmac/norm_desc.c
@@ -168,7 +168,7 @@ static void ndesc_set_tx_owner(struct dma_desc *p)
 	p->des0 |= cpu_to_le32(TDES0_OWN);
 }
 
-static void ndesc_set_rx_owner(struct dma_desc *p)
+static void ndesc_set_rx_owner(struct dma_desc *p, int disable_rx_ic)
 {
 	p->des0 |= cpu_to_le32(RDES0_OWN);
 }
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 1e7ded6dd7b3..35ccf3f8a5a7 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -3262,10 +3262,7 @@ static inline void stmmac_rx_refill(struct stmmac_priv *priv, u32 queue)
 		}
 		dma_wmb();
 
-		if (unlikely(priv->synopsys_id >= DWMAC_CORE_4_00))
-			stmmac_init_rx_desc(priv, p, priv->use_riwt, 0, 0);
-		else
-			stmmac_set_rx_owner(priv, p);
+		stmmac_set_rx_owner(priv, p, priv->use_riwt);
 
 		dma_wmb();
 
-- 
cgit v1.2.3-59-g8ed1b


From d2df9ea0ade7ef73710603da97b72394c53a111c Mon Sep 17 00:00:00 2001
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Fri, 18 May 2018 14:56:08 +0100
Subject: net: stmmac: Let descriptor code get skbuff address

Stop using if conditions depending on the GMAC version for getting the
descriptor skbuff address and use instead a helper implemented in the
descriptor files.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Vitor Soares <soares@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c | 6 ++++++
 drivers/net/ethernet/stmicro/stmmac/enh_desc.c     | 6 ++++++
 drivers/net/ethernet/stmicro/stmmac/hwif.h         | 4 ++++
 drivers/net/ethernet/stmicro/stmmac/norm_desc.c    | 6 ++++++
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 6 +-----
 5 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
index 63f869ce28af..20299f6f65fc 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
@@ -424,6 +424,11 @@ static void dwmac4_set_mss_ctxt(struct dma_desc *p, unsigned int mss)
 	p->des3 = cpu_to_le32(TDES3_CONTEXT_TYPE | TDES3_CTXT_TCMSSV);
 }
 
+static void dwmac4_get_addr(struct dma_desc *p, unsigned int *addr)
+{
+	*addr = le32_to_cpu(p->des0);
+}
+
 static void dwmac4_set_addr(struct dma_desc *p, dma_addr_t addr)
 {
 	p->des0 = cpu_to_le32(addr);
@@ -459,6 +464,7 @@ const struct stmmac_desc_ops dwmac4_desc_ops = {
 	.init_tx_desc = dwmac4_rd_init_tx_desc,
 	.display_ring = dwmac4_display_ring,
 	.set_mss = dwmac4_set_mss_ctxt,
+	.get_addr = dwmac4_get_addr,
 	.set_addr = dwmac4_set_addr,
 	.clear = dwmac4_clear,
 };
diff --git a/drivers/net/ethernet/stmicro/stmmac/enh_desc.c b/drivers/net/ethernet/stmicro/stmmac/enh_desc.c
index 743a60f6ba2b..77914c89d749 100644
--- a/drivers/net/ethernet/stmicro/stmmac/enh_desc.c
+++ b/drivers/net/ethernet/stmicro/stmmac/enh_desc.c
@@ -437,6 +437,11 @@ static void enh_desc_display_ring(void *head, unsigned int size, bool rx)
 	pr_info("\n");
 }
 
+static void enh_desc_get_addr(struct dma_desc *p, unsigned int *addr)
+{
+	*addr = le32_to_cpu(p->des2);
+}
+
 static void enh_desc_set_addr(struct dma_desc *p, dma_addr_t addr)
 {
 	p->des2 = cpu_to_le32(addr);
@@ -467,6 +472,7 @@ const struct stmmac_desc_ops enh_desc_ops = {
 	.get_timestamp = enh_desc_get_timestamp,
 	.get_rx_timestamp_status = enh_desc_get_rx_timestamp_status,
 	.display_ring = enh_desc_display_ring,
+	.get_addr = enh_desc_get_addr,
 	.set_addr = enh_desc_set_addr,
 	.clear = enh_desc_clear,
 };
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index 06b5e5b15069..f499a7fad6f0 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -79,6 +79,8 @@ struct stmmac_desc_ops {
 	void (*display_ring)(void *head, unsigned int size, bool rx);
 	/* set MSS via context descriptor */
 	void (*set_mss)(struct dma_desc *p, unsigned int mss);
+	/* get descriptor skbuff address */
+	void (*get_addr)(struct dma_desc *p, unsigned int *addr);
 	/* set descriptor skbuff address */
 	void (*set_addr)(struct dma_desc *p, dma_addr_t addr);
 	/* clear descriptor */
@@ -127,6 +129,8 @@ struct stmmac_desc_ops {
 	stmmac_do_void_callback(__priv, desc, display_ring, __args)
 #define stmmac_set_mss(__priv, __args...) \
 	stmmac_do_void_callback(__priv, desc, set_mss, __args)
+#define stmmac_get_desc_addr(__priv, __args...) \
+	stmmac_do_void_callback(__priv, desc, get_addr, __args)
 #define stmmac_set_desc_addr(__priv, __args...) \
 	stmmac_do_void_callback(__priv, desc, set_addr, __args)
 #define stmmac_clear_desc(__priv, __args...) \
diff --git a/drivers/net/ethernet/stmicro/stmmac/norm_desc.c b/drivers/net/ethernet/stmicro/stmmac/norm_desc.c
index 2facdb5d1aa0..de65bb29feba 100644
--- a/drivers/net/ethernet/stmicro/stmmac/norm_desc.c
+++ b/drivers/net/ethernet/stmicro/stmmac/norm_desc.c
@@ -297,6 +297,11 @@ static void ndesc_display_ring(void *head, unsigned int size, bool rx)
 	pr_info("\n");
 }
 
+static void ndesc_get_addr(struct dma_desc *p, unsigned int *addr)
+{
+	*addr = le32_to_cpu(p->des2);
+}
+
 static void ndesc_set_addr(struct dma_desc *p, dma_addr_t addr)
 {
 	p->des2 = cpu_to_le32(addr);
@@ -326,6 +331,7 @@ const struct stmmac_desc_ops ndesc_ops = {
 	.get_timestamp = ndesc_get_timestamp,
 	.get_rx_timestamp_status = ndesc_get_rx_timestamp_status,
 	.display_ring = ndesc_display_ring,
+	.get_addr = ndesc_get_addr,
 	.set_addr = ndesc_set_addr,
 	.clear = ndesc_clear,
 };
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 35ccf3f8a5a7..f2687ec3c774 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -3350,11 +3350,7 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue)
 			int frame_len;
 			unsigned int des;
 
-			if (unlikely(priv->synopsys_id >= DWMAC_CORE_4_00))
-				des = le32_to_cpu(p->des0);
-			else
-				des = le32_to_cpu(p->des2);
-
+			stmmac_get_desc_addr(priv, p, &des);
 			frame_len = stmmac_get_rx_frame_len(priv, p, coe);
 
 			/*  If frame length is greater than skb buffer size
-- 
cgit v1.2.3-59-g8ed1b


From 61fac60a6ab72efb3b1ef7d052724fbeed52fe97 Mon Sep 17 00:00:00 2001
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Fri, 18 May 2018 14:56:09 +0100
Subject: net: stmmac: Remove if condition by taking advantage of hwif return
 code

We can remove the if condition and check if return code is different
than -EINVAL, meaning callback is present.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Vitor Soares <soares@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index f2687ec3c774..c32de53a00d3 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -3644,6 +3644,7 @@ static irqreturn_t stmmac_interrupt(int irq, void *dev_id)
 	/* To handle GMAC own interrupts */
 	if ((priv->plat->has_gmac) || (priv->plat->has_gmac4)) {
 		int status = stmmac_host_irq_status(priv, priv->hw, &priv->xstats);
+		int mtl_status;
 
 		if (unlikely(status)) {
 			/* For LPI we need to save the tx status */
@@ -3653,20 +3654,18 @@ static irqreturn_t stmmac_interrupt(int irq, void *dev_id)
 				priv->tx_path_in_lpi_mode = false;
 		}
 
-		if (priv->synopsys_id >= DWMAC_CORE_4_00) {
-			for (queue = 0; queue < queues_count; queue++) {
-				struct stmmac_rx_queue *rx_q =
-				&priv->rx_queue[queue];
+		for (queue = 0; queue < queues_count; queue++) {
+			struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue];
 
-				status |= stmmac_host_mtl_irq_status(priv,
-						priv->hw, queue);
+			mtl_status = stmmac_host_mtl_irq_status(priv, priv->hw,
+								queue);
+			if (mtl_status != -EINVAL)
+				status |= mtl_status;
 
-				if (status & CORE_IRQ_MTL_RX_OVERFLOW)
-					stmmac_set_rx_tail_ptr(priv,
-							priv->ioaddr,
-							rx_q->rx_tail_addr,
-							queue);
-			}
+			if (status & CORE_IRQ_MTL_RX_OVERFLOW)
+				stmmac_set_rx_tail_ptr(priv, priv->ioaddr,
+						       rx_q->rx_tail_addr,
+						       queue);
 		}
 
 		/* PCS link status */
-- 
cgit v1.2.3-59-g8ed1b


From 64a2658b58ab519dc17aa3ec5754eedcbdb68bde Mon Sep 17 00:00:00 2001
From: Alexandre Belloni <alexandre.belloni@bootlin.com>
Date: Thu, 17 May 2018 21:23:05 +0200
Subject: net: mscc: Add SPDX identifier

ocelot_qsys.h is missing the SPDX identfier, fix that.

Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Reviewed-by: Allan W. Nielsen <allan.nielsen@microsemi.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mscc/ocelot_qsys.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mscc/ocelot_qsys.h b/drivers/net/ethernet/mscc/ocelot_qsys.h
index aa7267d5ca77..d8c63aa761be 100644
--- a/drivers/net/ethernet/mscc/ocelot_qsys.h
+++ b/drivers/net/ethernet/mscc/ocelot_qsys.h
@@ -1,7 +1,7 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */
 /*
  * Microsemi Ocelot Switch driver
  *
- * License: Dual MIT/GPL
  * Copyright (c) 2017 Microsemi Corporation
  */
 
-- 
cgit v1.2.3-59-g8ed1b


From cf0dd203728c20e76976b69cdf42d065c1e5cab3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 17 May 2018 14:47:24 -0700
Subject: tcp: use __sock_put() instead of sock_put() in
 tcp_clear_xmit_timers()

Socket can not disappear under us.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6deb540297cc..511bd0fde1dc 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -559,7 +559,7 @@ void tcp_init_xmit_timers(struct sock *);
 static inline void tcp_clear_xmit_timers(struct sock *sk)
 {
 	if (hrtimer_try_to_cancel(&tcp_sk(sk)->pacing_timer) == 1)
-		sock_put(sk);
+		__sock_put(sk);
 
 	inet_csk_clear_xmit_timers(sk);
 }
-- 
cgit v1.2.3-59-g8ed1b


From a3893637e1eb0ef5eb1bbc52b3a8d2dfa317a35d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 17 May 2018 14:47:25 -0700
Subject: tcp: do not force quickack when receiving out-of-order packets

As explained in commit 9f9843a751d0 ("tcp: properly handle stretch
acks in slow start"), TCP stacks have to consider how many packets
are acknowledged in one single ACK, because of GRO, but also
because of ACK compression or losses.

We plan to add SACK compression in the following patch, we
must therefore not call tcp_enter_quickack_mode()

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0bf032839548..f5622b250665 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4715,8 +4715,6 @@ drop:
 	if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
 		goto out_of_window;
 
-	tcp_enter_quickack_mode(sk);
-
 	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
 		/* Partial packet, seq < rcv_next < end_seq */
 		SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
-- 
cgit v1.2.3-59-g8ed1b


From 5d9f4262b7ea41ca9981cc790e37cca6e37c789e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 17 May 2018 14:47:26 -0700
Subject: tcp: add SACK compression
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When TCP receives an out-of-order packet, it immediately sends
a SACK packet, generating network load but also forcing the
receiver to send 1-MSS pathological packets, increasing its
RTX queue length/depth, and thus processing time.

Wifi networks suffer from this aggressive behavior, but generally
speaking, all these SACK packets add fuel to the fire when networks
are under congestion.

This patch adds a high resolution timer and tp->compressed_ack counter.

Instead of sending a SACK, we program this timer with a small delay,
based on RTT and capped to 1 ms :

	delay = min ( 5 % of RTT, 1 ms)

If subsequent SACKs need to be sent while the timer has not yet
expired, we simply increment tp->compressed_ack.

When timer expires, a SACK is sent with the latest information.
Whenever an ACK is sent (if data is sent, or if in-order
data is received) timer is canceled.

Note that tcp_sack_new_ofo_skb() is able to force a SACK to be sent
if the sack blocks need to be shuffled, even if the timer has not
expired.

A new SNMP counter is added in the following patch.

Two other patches add sysctls to allow changing the 1,000,000 and 44
values that this commit hard-coded.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Toke Høiland-Jørgensen <toke@toke.dk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h   |  2 ++
 include/net/tcp.h     |  3 +++
 net/ipv4/tcp.c        |  1 +
 net/ipv4/tcp_input.c  | 35 +++++++++++++++++++++++++++++------
 net/ipv4/tcp_output.c |  7 +++++++
 net/ipv4/tcp_timer.c  | 25 +++++++++++++++++++++++++
 6 files changed, 67 insertions(+), 6 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 807776928cb8..72705eaf4b84 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -218,6 +218,7 @@ struct tcp_sock {
 		   reord:1;	 /* reordering detected */
 	} rack;
 	u16	advmss;		/* Advertised MSS			*/
+	u8	compressed_ack;
 	u32	chrono_start;	/* Start time in jiffies of a TCP chrono */
 	u32	chrono_stat[3];	/* Time in jiffies for chrono_stat stats */
 	u8	chrono_type:2,	/* current chronograph type */
@@ -297,6 +298,7 @@ struct tcp_sock {
 	u32	sacked_out;	/* SACK'd packets			*/
 
 	struct hrtimer	pacing_timer;
+	struct hrtimer	compressed_ack_timer;
 
 	/* from STCP, retrans queue hinting */
 	struct sk_buff* lost_skb_hint;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 511bd0fde1dc..952d842a604a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -561,6 +561,9 @@ static inline void tcp_clear_xmit_timers(struct sock *sk)
 	if (hrtimer_try_to_cancel(&tcp_sk(sk)->pacing_timer) == 1)
 		__sock_put(sk);
 
+	if (hrtimer_try_to_cancel(&tcp_sk(sk)->compressed_ack_timer) == 1)
+		__sock_put(sk);
+
 	inet_csk_clear_xmit_timers(sk);
 }
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 62b776f90037..0a2ea0bbf867 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2595,6 +2595,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	dst_release(sk->sk_rx_dst);
 	sk->sk_rx_dst = NULL;
 	tcp_saved_syn_free(tp);
+	tp->compressed_ack = 0;
 
 	/* Clean up fastopen related fields */
 	tcp_free_fastopen_req(tp);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f5622b250665..cc2ac5346b92 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4249,6 +4249,8 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
 	 * If the sack array is full, forget about the last one.
 	 */
 	if (this_sack >= TCP_NUM_SACKS) {
+		if (tp->compressed_ack)
+			tcp_send_ack(sk);
 		this_sack--;
 		tp->rx_opt.num_sacks--;
 		sp--;
@@ -5081,6 +5083,7 @@ static inline void tcp_data_snd_check(struct sock *sk)
 static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned long rtt, delay;
 
 	    /* More than one full frame received... */
 	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
@@ -5092,15 +5095,35 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 	    (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
 	     __tcp_select_window(sk) >= tp->rcv_wnd)) ||
 	    /* We ACK each frame or... */
-	    tcp_in_quickack_mode(sk) ||
-	    /* We have out of order data. */
-	    (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) {
-		/* Then ack it now */
+	    tcp_in_quickack_mode(sk)) {
+send_now:
 		tcp_send_ack(sk);
-	} else {
-		/* Else, send delayed ack. */
+		return;
+	}
+
+	if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
 		tcp_send_delayed_ack(sk);
+		return;
 	}
+
+	if (!tcp_is_sack(tp) || tp->compressed_ack >= 44)
+		goto send_now;
+	tp->compressed_ack++;
+
+	if (hrtimer_is_queued(&tp->compressed_ack_timer))
+		return;
+
+	/* compress ack timer : 5 % of rtt, but no more than 1 ms */
+
+	rtt = tp->rcv_rtt_est.rtt_us;
+	if (tp->srtt_us && tp->srtt_us < rtt)
+		rtt = tp->srtt_us;
+
+	delay = min_t(unsigned long, NSEC_PER_MSEC,
+		      rtt * (NSEC_PER_USEC >> 3)/20);
+	sock_hold(sk);
+	hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay),
+		      HRTIMER_MODE_REL_PINNED_SOFT);
 }
 
 static inline void tcp_ack_snd_check(struct sock *sk)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0d8f950a9006..7ee98aad82b7 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -162,6 +162,13 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
 /* Account for an ACK we sent. */
 static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (unlikely(tp->compressed_ack)) {
+		tp->compressed_ack = 0;
+		if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
+			__sock_put(sk);
+	}
 	tcp_dec_quickack_mode(sk, pkts);
 	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
 }
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 92bdf64fffae..3b3611729928 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -708,6 +708,27 @@ out:
 	sock_put(sk);
 }
 
+static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer)
+{
+	struct tcp_sock *tp = container_of(timer, struct tcp_sock, compressed_ack_timer);
+	struct sock *sk = (struct sock *)tp;
+
+	bh_lock_sock(sk);
+	if (!sock_owned_by_user(sk)) {
+		if (tp->compressed_ack)
+			tcp_send_ack(sk);
+	} else {
+		if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
+				      &sk->sk_tsq_flags))
+			sock_hold(sk);
+	}
+	bh_unlock_sock(sk);
+
+	sock_put(sk);
+
+	return HRTIMER_NORESTART;
+}
+
 void tcp_init_xmit_timers(struct sock *sk)
 {
 	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
@@ -715,4 +736,8 @@ void tcp_init_xmit_timers(struct sock *sk)
 	hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC,
 		     HRTIMER_MODE_ABS_PINNED_SOFT);
 	tcp_sk(sk)->pacing_timer.function = tcp_pace_kick;
+
+	hrtimer_init(&tcp_sk(sk)->compressed_ack_timer, CLOCK_MONOTONIC,
+		     HRTIMER_MODE_REL_PINNED_SOFT);
+	tcp_sk(sk)->compressed_ack_timer.function = tcp_compressed_ack_kick;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 200d95f4575934e49f872109cce18c5e72383eb8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 17 May 2018 14:47:27 -0700
Subject: tcp: add TCPAckCompressed SNMP counter

This counter tracks number of ACK packets that the host has not sent,
thanks to ACK compression.

Sample output :

$ nstat -n;sleep 1;nstat|egrep "IpInReceives|IpOutRequests|TcpInSegs|TcpOutSegs|TcpExtTCPAckCompressed"
IpInReceives                    123250             0.0
IpOutRequests                   3684               0.0
TcpInSegs                       123251             0.0
TcpOutSegs                      3684               0.0
TcpExtTCPAckCompressed          119252             0.0

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/snmp.h | 1 +
 net/ipv4/proc.c           | 1 +
 net/ipv4/tcp_output.c     | 2 ++
 3 files changed, 4 insertions(+)

diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index d02e859301ff..750d89120335 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -278,6 +278,7 @@ enum
 	LINUX_MIB_TCPMTUPSUCCESS,		/* TCPMTUPSuccess */
 	LINUX_MIB_TCPDELIVERED,			/* TCPDelivered */
 	LINUX_MIB_TCPDELIVEREDCE,		/* TCPDeliveredCE */
+	LINUX_MIB_TCPACKCOMPRESSED,		/* TCPAckCompressed */
 	__LINUX_MIB_MAX
 };
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 261b71d0ccc5..6c1ff89a60fa 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -298,6 +298,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS),
 	SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED),
 	SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE),
+	SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7ee98aad82b7..437bb7ceba7f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -165,6 +165,8 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (unlikely(tp->compressed_ack)) {
+		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
+			      tp->compressed_ack);
 		tp->compressed_ack = 0;
 		if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
 			__sock_put(sk);
-- 
cgit v1.2.3-59-g8ed1b


From 6d82aa242092d73c6d2e210cfaf0ebfbe6de1ccf Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 17 May 2018 14:47:28 -0700
Subject: tcp: add tcp_comp_sack_delay_ns sysctl

This per netns sysctl allows for TCP SACK compression fine-tuning.

Its default value is 1,000,000, or 1 ms to meet TSO autosizing period.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 7 +++++++
 include/net/netns/ipv4.h               | 1 +
 net/ipv4/sysctl_net_ipv4.c             | 7 +++++++
 net/ipv4/tcp_input.c                   | 4 ++--
 net/ipv4/tcp_ipv4.c                    | 1 +
 5 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index ea304a23c8d7..7ba952959bca 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -525,6 +525,13 @@ tcp_rmem - vector of 3 INTEGERs: min, default, max
 tcp_sack - BOOLEAN
 	Enable select acknowledgments (SACKS).
 
+tcp_comp_sack_delay_ns - LONG INTEGER
+	TCP tries to reduce number of SACK sent, using a timer
+	based on 5% of SRTT, capped by this sysctl, in nano seconds.
+	The default is 1ms, based on TSO autosizing period.
+
+	Default : 1,000,000 ns (1 ms)
+
 tcp_slow_start_after_idle - BOOLEAN
 	If set, provide RFC2861 behavior and time out the congestion
 	window after an idle period.  An idle period is defined at
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 8491bc9c86b1..927318243cfa 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -160,6 +160,7 @@ struct netns_ipv4 {
 	int sysctl_tcp_pacing_ca_ratio;
 	int sysctl_tcp_wmem[3];
 	int sysctl_tcp_rmem[3];
+	unsigned long sysctl_tcp_comp_sack_delay_ns;
 	struct inet_timewait_death_row tcp_death_row;
 	int sysctl_max_syn_backlog;
 	int sysctl_tcp_fastopen;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4b195bac8ac0..11fbfdc1566e 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1151,6 +1151,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &one,
 	},
+	{
+		.procname	= "tcp_comp_sack_delay_ns",
+		.data		= &init_net.ipv4.sysctl_tcp_comp_sack_delay_ns,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
 	{
 		.procname	= "udp_rmem_min",
 		.data		= &init_net.ipv4.sysctl_udp_rmem_min,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index cc2ac5346b92..6a1dae38c955 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5113,13 +5113,13 @@ send_now:
 	if (hrtimer_is_queued(&tp->compressed_ack_timer))
 		return;
 
-	/* compress ack timer : 5 % of rtt, but no more than 1 ms */
+	/* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */
 
 	rtt = tp->rcv_rtt_est.rtt_us;
 	if (tp->srtt_us && tp->srtt_us < rtt)
 		rtt = tp->srtt_us;
 
-	delay = min_t(unsigned long, NSEC_PER_MSEC,
+	delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns,
 		      rtt * (NSEC_PER_USEC >> 3)/20);
 	sock_hold(sk);
 	hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay),
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index caf23de88f8a..a3f4647341db 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2572,6 +2572,7 @@ static int __net_init tcp_sk_init(struct net *net)
 		       init_net.ipv4.sysctl_tcp_wmem,
 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
 	}
+	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
-- 
cgit v1.2.3-59-g8ed1b


From 9c21d2fc41c0c8930600c9dd83eadac2e336fcfa Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 17 May 2018 14:47:29 -0700
Subject: tcp: add tcp_comp_sack_nr sysctl

This per netns sysctl allows for TCP SACK compression fine-tuning.

This limits number of SACK that can be compressed.
Using 0 disables SACK compression.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  6 ++++++
 include/net/netns/ipv4.h               |  1 +
 net/ipv4/sysctl_net_ipv4.c             | 10 ++++++++++
 net/ipv4/tcp_input.c                   |  3 ++-
 net/ipv4/tcp_ipv4.c                    |  1 +
 5 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 7ba952959bca..924bd51327b7 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -532,6 +532,12 @@ tcp_comp_sack_delay_ns - LONG INTEGER
 
 	Default : 1,000,000 ns (1 ms)
 
+tcp_comp_sack_nr - INTEGER
+	Max numer of SACK that can be compressed.
+	Using 0 disables SACK compression.
+
+	Detault : 44
+
 tcp_slow_start_after_idle - BOOLEAN
 	If set, provide RFC2861 behavior and time out the congestion
 	window after an idle period.  An idle period is defined at
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 927318243cfa..661348f23ea5 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -160,6 +160,7 @@ struct netns_ipv4 {
 	int sysctl_tcp_pacing_ca_ratio;
 	int sysctl_tcp_wmem[3];
 	int sysctl_tcp_rmem[3];
+	int sysctl_tcp_comp_sack_nr;
 	unsigned long sysctl_tcp_comp_sack_delay_ns;
 	struct inet_timewait_death_row tcp_death_row;
 	int sysctl_max_syn_backlog;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 11fbfdc1566e..d2eed3ddcb0a 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -46,6 +46,7 @@ static int tcp_syn_retries_min = 1;
 static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
 static int ip_ping_group_range_min[] = { 0, 0 };
 static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
+static int comp_sack_nr_max = 255;
 
 /* obsolete */
 static int sysctl_tcp_low_latency __read_mostly;
@@ -1158,6 +1159,15 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_doulongvec_minmax,
 	},
+	{
+		.procname	= "tcp_comp_sack_nr",
+		.data		= &init_net.ipv4.sysctl_tcp_comp_sack_nr,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &comp_sack_nr_max,
+	},
 	{
 		.procname	= "udp_rmem_min",
 		.data		= &init_net.ipv4.sysctl_udp_rmem_min,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 6a1dae38c955..aebb29ab2fdf 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5106,7 +5106,8 @@ send_now:
 		return;
 	}
 
-	if (!tcp_is_sack(tp) || tp->compressed_ack >= 44)
+	if (!tcp_is_sack(tp) ||
+	    tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)
 		goto send_now;
 	tp->compressed_ack++;
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a3f4647341db..adbdb503db0c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2573,6 +2573,7 @@ static int __net_init tcp_sk_init(struct net *net)
 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
 	}
 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
+	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
-- 
cgit v1.2.3-59-g8ed1b


From 69cb7dc0218b2c54416722eddf181e720f24c305 Mon Sep 17 00:00:00 2001
From: Hans Wippel <hwippel@linux.ibm.com>
Date: Fri, 18 May 2018 09:34:10 +0200
Subject: net/smc: add common buffer size in send and receive buffer
 descriptors

In addition to the buffer references, SMC currently stores the sizes of
the receive and send buffers in each connection as separate variables.
This patch introduces a buffer length variable in the common buffer
descriptor and uses this length instead.

Signed-off-by: Hans Wippel <hwippel@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c   |  2 +-
 net/smc/smc.h      |  2 --
 net/smc/smc_cdc.c  |  8 ++++----
 net/smc/smc_core.c |  8 ++------
 net/smc/smc_core.h |  1 +
 net/smc/smc_diag.c |  5 +++--
 net/smc/smc_rx.c   | 10 +++++-----
 net/smc/smc_tx.c   | 28 ++++++++++++++--------------
 net/smc/smc_tx.h   |  2 +-
 9 files changed, 31 insertions(+), 35 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 6ad4f6c771c3..409b1367970c 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1421,7 +1421,7 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd,
 		/* output queue size (not send + not acked) */
 		if (smc->sk.sk_state == SMC_LISTEN)
 			return -EINVAL;
-		answ = smc->conn.sndbuf_size -
+		answ = smc->conn.sndbuf_desc->len -
 					atomic_read(&smc->conn.sndbuf_space);
 		break;
 	case SIOCOUTQNSD:
diff --git a/net/smc/smc.h b/net/smc/smc.h
index ec209cd48d42..ad6694f516dd 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -126,9 +126,7 @@ struct smc_connection {
 	int			rtoken_idx;	/* idx to peer RMB rkey/addr */
 
 	struct smc_buf_desc	*sndbuf_desc;	/* send buffer descriptor */
-	int			sndbuf_size;	/* sndbuf size <== sock wmem */
 	struct smc_buf_desc	*rmb_desc;	/* RMBE descriptor */
-	int			rmbe_size;	/* RMBE size <== sock rmem */
 	int			rmbe_size_short;/* compressed notation */
 	int			rmbe_update_limit;
 						/* lower limit for consumer
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index 42ad57365eca..2809f9902446 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -44,13 +44,13 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
 	smc = container_of(cdcpend->conn, struct smc_sock, conn);
 	bh_lock_sock(&smc->sk);
 	if (!wc_status) {
-		diff = smc_curs_diff(cdcpend->conn->sndbuf_size,
+		diff = smc_curs_diff(cdcpend->conn->sndbuf_desc->len,
 				     &cdcpend->conn->tx_curs_fin,
 				     &cdcpend->cursor);
 		/* sndbuf_space is decreased in smc_sendmsg */
 		smp_mb__before_atomic();
 		atomic_add(diff, &cdcpend->conn->sndbuf_space);
-		/* guarantee 0 <= sndbuf_space <= sndbuf_size */
+		/* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
 		smp_mb__after_atomic();
 		smc_curs_write(&cdcpend->conn->tx_curs_fin,
 			       smc_curs_read(&cdcpend->cursor, cdcpend->conn),
@@ -198,13 +198,13 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 		smp_mb__after_atomic();
 	}
 
-	diff_prod = smc_curs_diff(conn->rmbe_size, &prod_old,
+	diff_prod = smc_curs_diff(conn->rmb_desc->len, &prod_old,
 				  &conn->local_rx_ctrl.prod);
 	if (diff_prod) {
 		/* bytes_to_rcv is decreased in smc_recvmsg */
 		smp_mb__before_atomic();
 		atomic_add(diff_prod, &conn->bytes_to_rcv);
-		/* guarantee 0 <= bytes_to_rcv <= rmbe_size */
+		/* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */
 		smp_mb__after_atomic();
 		smc->sk.sk_data_ready(&smc->sk);
 	} else if ((conn->local_rx_ctrl.prod_flags.write_blocked) ||
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 08c05cd0bbae..6396426080f9 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -236,15 +236,12 @@ out:
 
 static void smc_buf_unuse(struct smc_connection *conn)
 {
-	if (conn->sndbuf_desc) {
+	if (conn->sndbuf_desc)
 		conn->sndbuf_desc->used = 0;
-		conn->sndbuf_size = 0;
-	}
 	if (conn->rmb_desc) {
 		if (!conn->rmb_desc->regerr) {
 			conn->rmb_desc->reused = 1;
 			conn->rmb_desc->used = 0;
-			conn->rmbe_size = 0;
 		} else {
 			/* buf registration failed, reuse not possible */
 			struct smc_link_group *lgr = conn->lgr;
@@ -616,6 +613,7 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr,
 		}
 	}
 
+	buf_desc->len = bufsize;
 	return buf_desc;
 }
 
@@ -675,14 +673,12 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb)
 
 	if (is_rmb) {
 		conn->rmb_desc = buf_desc;
-		conn->rmbe_size = bufsize;
 		conn->rmbe_size_short = bufsize_short;
 		smc->sk.sk_rcvbuf = bufsize * 2;
 		atomic_set(&conn->bytes_to_rcv, 0);
 		conn->rmbe_update_limit = smc_rmb_wnd_update_limit(bufsize);
 	} else {
 		conn->sndbuf_desc = buf_desc;
-		conn->sndbuf_size = bufsize;
 		smc->sk.sk_sndbuf = bufsize * 2;
 		atomic_set(&conn->sndbuf_space, bufsize);
 	}
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 845dc073de13..4885a7efa84f 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -124,6 +124,7 @@ struct smc_buf_desc {
 	struct list_head	list;
 	void			*cpu_addr;	/* virtual address of buffer */
 	struct page		*pages;
+	int			len;		/* length of buffer */
 	struct sg_table		sgt[SMC_LINKS_PER_LGR_MAX];/* virtual buffer */
 	struct ib_mr		*mr_rx[SMC_LINKS_PER_LGR_MAX];
 						/* for rmb only: memory region
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
index 05dd7e6d314d..839354402215 100644
--- a/net/smc/smc_diag.c
+++ b/net/smc/smc_diag.c
@@ -101,8 +101,9 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
 		struct smc_connection *conn = &smc->conn;
 		struct smc_diag_conninfo cinfo = {
 			.token = conn->alert_token_local,
-			.sndbuf_size = conn->sndbuf_size,
-			.rmbe_size = conn->rmbe_size,
+			.sndbuf_size = conn->sndbuf_desc ?
+				conn->sndbuf_desc->len : 0,
+			.rmbe_size = conn->rmb_desc ? conn->rmb_desc->len : 0,
 			.peer_rmbe_size = conn->peer_rmbe_size,
 
 			.rx_prod.wrap = conn->local_rx_ctrl.prod.wrap,
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
index ed45569289f5..290a434471d1 100644
--- a/net/smc/smc_rx.c
+++ b/net/smc/smc_rx.c
@@ -51,7 +51,7 @@ static void smc_rx_wake_up(struct sock *sk)
 static void smc_rx_update_consumer(struct smc_connection *conn,
 				   union smc_host_cursor cons, size_t len)
 {
-	smc_curs_add(conn->rmbe_size, &cons, len);
+	smc_curs_add(conn->rmb_desc->len, &cons, len);
 	smc_curs_write(&conn->local_tx_ctrl.cons, smc_curs_read(&cons, conn),
 		       conn);
 	/* send consumer cursor update if required */
@@ -288,11 +288,11 @@ copy:
 			       conn);
 		/* subsequent splice() calls pick up where previous left */
 		if (splbytes)
-			smc_curs_add(conn->rmbe_size, &cons, splbytes);
+			smc_curs_add(conn->rmb_desc->len, &cons, splbytes);
 		/* determine chunks where to read from rcvbuf */
 		/* either unwrapped case, or 1st chunk of wrapped case */
-		chunk_len = min_t(size_t,
-				  copylen, conn->rmbe_size - cons.count);
+		chunk_len = min_t(size_t, copylen, conn->rmb_desc->len -
+				  cons.count);
 		chunk_len_sum = chunk_len;
 		chunk_off = cons.count;
 		smc_rmb_sync_sg_for_cpu(conn);
@@ -331,7 +331,7 @@ copy:
 			/* increased in recv tasklet smc_cdc_msg_rcv() */
 			smp_mb__before_atomic();
 			atomic_sub(copylen, &conn->bytes_to_rcv);
-			/* guarantee 0 <= bytes_to_rcv <= rmbe_size */
+			/* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */
 			smp_mb__after_atomic();
 			if (msg)
 				smc_rx_update_consumer(conn, cons, copylen);
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 08a7de98bb03..58bc7ca3101a 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -180,8 +180,8 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
 		tx_cnt_prep = prep.count;
 		/* determine chunks where to write into sndbuf */
 		/* either unwrapped case, or 1st chunk of wrapped case */
-		chunk_len = min_t(size_t,
-				  copylen, conn->sndbuf_size - tx_cnt_prep);
+		chunk_len = min_t(size_t, copylen, conn->sndbuf_desc->len -
+				  tx_cnt_prep);
 		chunk_len_sum = chunk_len;
 		chunk_off = tx_cnt_prep;
 		smc_sndbuf_sync_sg_for_cpu(conn);
@@ -206,21 +206,21 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
 		}
 		smc_sndbuf_sync_sg_for_device(conn);
 		/* update cursors */
-		smc_curs_add(conn->sndbuf_size, &prep, copylen);
+		smc_curs_add(conn->sndbuf_desc->len, &prep, copylen);
 		smc_curs_write(&conn->tx_curs_prep,
 			       smc_curs_read(&prep, conn),
 			       conn);
 		/* increased in send tasklet smc_cdc_tx_handler() */
 		smp_mb__before_atomic();
 		atomic_sub(copylen, &conn->sndbuf_space);
-		/* guarantee 0 <= sndbuf_space <= sndbuf_size */
+		/* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
 		smp_mb__after_atomic();
 		/* since we just produced more new data into sndbuf,
 		 * trigger sndbuf consumer: RDMA write into peer RMBE and CDC
 		 */
 		if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) &&
 		    (atomic_read(&conn->sndbuf_space) >
-						(conn->sndbuf_size >> 1)))
+						(conn->sndbuf_desc->len >> 1)))
 			/* for a corked socket defer the RDMA writes if there
 			 * is still sufficient sndbuf_space available
 			 */
@@ -286,7 +286,7 @@ static inline void smc_tx_advance_cursors(struct smc_connection *conn,
 	atomic_sub(len, &conn->peer_rmbe_space);
 	/* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
 	smp_mb__after_atomic();
-	smc_curs_add(conn->sndbuf_size, sent, len);
+	smc_curs_add(conn->sndbuf_desc->len, sent, len);
 }
 
 /* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit;
@@ -309,7 +309,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
 	smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn);
 	smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn);
 	/* cf. wmem_alloc - (snd_max - snd_una) */
-	to_send = smc_curs_diff(conn->sndbuf_size, &sent, &prep);
+	to_send = smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep);
 	if (to_send <= 0)
 		return 0;
 
@@ -351,12 +351,12 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
 	dst_len_sum = dst_len;
 	src_off = sent.count;
 	/* dst_len determines the maximum src_len */
-	if (sent.count + dst_len <= conn->sndbuf_size) {
+	if (sent.count + dst_len <= conn->sndbuf_desc->len) {
 		/* unwrapped src case: single chunk of entire dst_len */
 		src_len = dst_len;
 	} else {
 		/* wrapped src case: 2 chunks of sum dst_len; start with 1st: */
-		src_len = conn->sndbuf_size - sent.count;
+		src_len = conn->sndbuf_desc->len - sent.count;
 	}
 	src_len_sum = src_len;
 	dma_addr = sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl);
@@ -368,8 +368,8 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
 			sges[srcchunk].lkey = link->roce_pd->local_dma_lkey;
 			num_sges++;
 			src_off += src_len;
-			if (src_off >= conn->sndbuf_size)
-				src_off -= conn->sndbuf_size;
+			if (src_off >= conn->sndbuf_desc->len)
+				src_off -= conn->sndbuf_desc->len;
 						/* modulo in send ring */
 			if (src_len_sum == dst_len)
 				break; /* either on 1st or 2nd iteration */
@@ -387,7 +387,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
 		dst_len = len - dst_len; /* remainder */
 		dst_len_sum += dst_len;
 		src_len = min_t(int,
-				dst_len, conn->sndbuf_size - sent.count);
+				dst_len, conn->sndbuf_desc->len - sent.count);
 		src_len_sum = src_len;
 	}
 
@@ -484,11 +484,11 @@ void smc_tx_consumer_update(struct smc_connection *conn)
 	smc_curs_write(&cfed,
 		       smc_curs_read(&conn->rx_curs_confirmed, conn),
 		       conn);
-	to_confirm = smc_curs_diff(conn->rmbe_size, &cfed, &cons);
+	to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons);
 
 	if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
 	    ((to_confirm > conn->rmbe_update_limit) &&
-	     ((to_confirm > (conn->rmbe_size / 2)) ||
+	     ((to_confirm > (conn->rmb_desc->len / 2)) ||
 	      conn->local_rx_ctrl.prod_flags.write_blocked))) {
 		if ((smc_cdc_get_slot_and_msg_send(conn) < 0) &&
 		    conn->alert_token_local) { /* connection healthy */
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
index 8f64b12bf03c..44d077942976 100644
--- a/net/smc/smc_tx.h
+++ b/net/smc/smc_tx.h
@@ -24,7 +24,7 @@ static inline int smc_tx_prepared_sends(struct smc_connection *conn)
 
 	smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn);
 	smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn);
-	return smc_curs_diff(conn->sndbuf_size, &sent, &prep);
+	return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep);
 }
 
 void smc_tx_work(struct work_struct *work);
-- 
cgit v1.2.3-59-g8ed1b


From 9fda3510ab6002a9c77cc21de0d6d8c48cf1abba Mon Sep 17 00:00:00 2001
From: Hans Wippel <hwippel@linux.ibm.com>
Date: Fri, 18 May 2018 09:34:11 +0200
Subject: net/smc: move link group list to smc_core

This patch moves the global link group list to smc_core where the link
group functions are. To make this work, it moves code in af_smc and
smc_ib that operates on the link group list to smc_core as well.

While at it, the link group counter is integrated into the list
structure and initialized to zero.

Signed-off-by: Hans Wippel <hwippel@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c   | 19 +------------------
 net/smc/smc_core.c | 40 +++++++++++++++++++++++++++++++++++++---
 net/smc/smc_core.h |  5 +++--
 net/smc/smc_ib.c   | 13 +------------
 4 files changed, 42 insertions(+), 35 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 409b1367970c..4bb320ae11a9 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -46,11 +46,6 @@ static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
 						 * creation
 						 */
 
-struct smc_lgr_list smc_lgr_list = {		/* established link groups */
-	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
-	.list = LIST_HEAD_INIT(smc_lgr_list.list),
-};
-
 static void smc_tcp_listen_work(struct work_struct *);
 
 static void smc_set_keepalive(struct sock *sk, int val)
@@ -1637,19 +1632,7 @@ out_pnet:
 
 static void __exit smc_exit(void)
 {
-	struct smc_link_group *lgr, *lg;
-	LIST_HEAD(lgr_freeing_list);
-
-	spin_lock_bh(&smc_lgr_list.lock);
-	if (!list_empty(&smc_lgr_list.list))
-		list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
-	spin_unlock_bh(&smc_lgr_list.lock);
-	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
-		list_del_init(&lgr->list);
-		smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
-		cancel_delayed_work_sync(&lgr->free_work);
-		smc_lgr_free(lgr); /* free link group */
-	}
+	smc_core_exit();
 	static_branch_disable(&tcp_have_smc);
 	smc_ib_unregister_client();
 	sock_unregister(PF_SMC);
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 6396426080f9..70ab2f716f83 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -30,7 +30,11 @@
 #define SMC_LGR_FREE_DELAY_SERV		(600 * HZ)
 #define SMC_LGR_FREE_DELAY_CLNT		(SMC_LGR_FREE_DELAY_SERV + 10)
 
-static u32 smc_lgr_num;			/* unique link group number */
+static struct smc_lgr_list smc_lgr_list = {	/* established link groups */
+	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
+	.list = LIST_HEAD_INIT(smc_lgr_list.list),
+	.num = 0,
+};
 
 static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk,
 			 bool is_rmb);
@@ -181,8 +185,8 @@ static int smc_lgr_create(struct smc_sock *smc,
 		INIT_LIST_HEAD(&lgr->sndbufs[i]);
 		INIT_LIST_HEAD(&lgr->rmbs[i]);
 	}
-	smc_lgr_num += SMC_LGR_NUM_INCR;
-	memcpy(&lgr->id, (u8 *)&smc_lgr_num, SMC_LGR_ID_SIZE);
+	smc_lgr_list.num += SMC_LGR_NUM_INCR;
+	memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE);
 	INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
 	lgr->conns_all = RB_ROOT;
 
@@ -374,6 +378,18 @@ void smc_lgr_terminate(struct smc_link_group *lgr)
 	smc_lgr_schedule_free_work(lgr);
 }
 
+/* Called when IB port is terminated */
+void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
+{
+	struct smc_link_group *lgr, *l;
+
+	list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
+		if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
+		    lgr->lnk[SMC_SINGLE_LINK].ibport == ibport)
+			smc_lgr_terminate(lgr);
+	}
+}
+
 /* Determine vlan of internal TCP socket.
  * @vlan_id: address to store the determined vlan id into
  */
@@ -802,3 +818,21 @@ int smc_rmb_rtoken_handling(struct smc_connection *conn,
 		return conn->rtoken_idx;
 	return 0;
 }
+
+/* Called (from smc_exit) when module is removed */
+void smc_core_exit(void)
+{
+	struct smc_link_group *lgr, *lg;
+	LIST_HEAD(lgr_freeing_list);
+
+	spin_lock_bh(&smc_lgr_list.lock);
+	if (!list_empty(&smc_lgr_list.list))
+		list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
+	spin_unlock_bh(&smc_lgr_list.lock);
+	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
+		list_del_init(&lgr->list);
+		smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
+		cancel_delayed_work_sync(&lgr->free_work);
+		smc_lgr_free(lgr); /* free link group */
+	}
+}
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 4885a7efa84f..3d4bd7011ab3 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -23,10 +23,9 @@
 struct smc_lgr_list {			/* list of link group definition */
 	struct list_head	list;
 	spinlock_t		lock;	/* protects list of link groups */
+	u32			num;	/* unique link group number */
 };
 
-extern struct smc_lgr_list	smc_lgr_list; /* list of link groups */
-
 enum smc_lgr_role {		/* possible roles of a link group */
 	SMC_CLNT,	/* client */
 	SMC_SERV	/* server */
@@ -210,6 +209,7 @@ struct smc_clc_msg_accept_confirm;
 void smc_lgr_free(struct smc_link_group *lgr);
 void smc_lgr_forget(struct smc_link_group *lgr);
 void smc_lgr_terminate(struct smc_link_group *lgr);
+void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport);
 int smc_buf_create(struct smc_sock *smc);
 int smc_rmb_rtoken_handling(struct smc_connection *conn,
 			    struct smc_clc_msg_accept_confirm *clc);
@@ -219,4 +219,5 @@ void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn);
 void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn);
 void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn);
 void smc_rmb_sync_sg_for_device(struct smc_connection *conn);
+void smc_core_exit(void);
 #endif
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 26df554f7588..0eed7ab9f28b 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -143,17 +143,6 @@ out:
 	return rc;
 }
 
-static void smc_ib_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
-{
-	struct smc_link_group *lgr, *l;
-
-	list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
-		if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
-		    lgr->lnk[SMC_SINGLE_LINK].ibport == ibport)
-			smc_lgr_terminate(lgr);
-	}
-}
-
 /* process context wrapper for might_sleep smc_ib_remember_port_attr */
 static void smc_ib_port_event_work(struct work_struct *work)
 {
@@ -165,7 +154,7 @@ static void smc_ib_port_event_work(struct work_struct *work)
 		smc_ib_remember_port_attr(smcibdev, port_idx + 1);
 		clear_bit(port_idx, &smcibdev->port_event_mask);
 		if (!smc_ib_port_active(smcibdev, port_idx + 1))
-			smc_ib_port_terminate(smcibdev, port_idx + 1);
+			smc_port_terminate(smcibdev, port_idx + 1);
 	}
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 92a138e333ead89918db5f72e573264cb3b91cb5 Mon Sep 17 00:00:00 2001
From: Hans Wippel <hwippel@linux.ibm.com>
Date: Fri, 18 May 2018 09:34:12 +0200
Subject: net/smc: rename connection index to RMBE index

The connection index is actually a RMBE index. So, this patch changes
the name accordingly.

Signed-off-by: Hans Wippel <hwippel@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c  | 2 +-
 net/smc/smc.h     | 2 +-
 net/smc/smc_clc.c | 4 ++--
 net/smc/smc_clc.h | 2 +-
 net/smc/smc_tx.c  | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 4bb320ae11a9..bd67430f3730 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -377,7 +377,7 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
 static void smc_conn_save_peer_info(struct smc_sock *smc,
 				    struct smc_clc_msg_accept_confirm *clc)
 {
-	smc->conn.peer_conn_idx = clc->conn_idx;
+	smc->conn.peer_rmbe_idx = clc->rmbe_idx;
 	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
 	smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
 	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
diff --git a/net/smc/smc.h b/net/smc/smc.h
index ad6694f516dd..fb8dec8bc17f 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -118,7 +118,7 @@ struct smc_connection {
 	struct rb_node		alert_node;
 	struct smc_link_group	*lgr;		/* link group of connection */
 	u32			alert_token_local; /* unique conn. id */
-	u8			peer_conn_idx;	/* from tcp handshake */
+	u8			peer_rmbe_idx;	/* from tcp handshake */
 	int			peer_rmbe_size;	/* size of peer rx buffer */
 	atomic_t		peer_rmbe_space;/* remaining free bytes in peer
 						 * rmbe
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 236cb3f12c71..717449b1da0b 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -442,7 +442,7 @@ int smc_clc_send_confirm(struct smc_sock *smc)
 	hton24(cclc.qpn, link->roce_qp->qp_num);
 	cclc.rmb_rkey =
 		htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
-	cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */
+	cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */
 	cclc.rmbe_alert_token = htonl(conn->alert_token_local);
 	cclc.qp_mtu = min(link->path_mtu, link->peer_mtu);
 	cclc.rmbe_size = conn->rmbe_size_short;
@@ -494,7 +494,7 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
 	hton24(aclc.qpn, link->roce_qp->qp_num);
 	aclc.rmb_rkey =
 		htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
-	aclc.conn_idx = 1;			/* as long as 1 RMB = 1 RMBE */
+	aclc.rmbe_idx = 1;			/* as long as 1 RMB = 1 RMBE */
 	aclc.rmbe_alert_token = htonl(conn->alert_token_local);
 	aclc.qp_mtu = link->path_mtu;
 	aclc.rmbe_size = conn->rmbe_size_short,
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index 63bf1dc2c1f9..41ff9ea96139 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -97,7 +97,7 @@ struct smc_clc_msg_accept_confirm {	/* clc accept / confirm message */
 	struct smc_clc_msg_local lcl;
 	u8 qpn[3];		/* QP number */
 	__be32 rmb_rkey;	/* RMB rkey */
-	u8 conn_idx;		/* Connection index, which RMBE in RMB */
+	u8 rmbe_idx;		/* Index of RMBE in RMB */
 	__be32 rmbe_alert_token;/* unique connection id */
 #if defined(__BIG_ENDIAN_BITFIELD)
 	u8 rmbe_size : 4,	/* RMBE buf size (compressed notation) */
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 58bc7ca3101a..58a56c992b26 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -261,7 +261,7 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
 	rdma_wr.remote_addr =
 		lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr +
 		/* RMBE within RMB */
-		((conn->peer_conn_idx - 1) * conn->peer_rmbe_size) +
+		((conn->peer_rmbe_idx - 1) * conn->peer_rmbe_size) +
 		/* offset within RMBE */
 		peer_rmbe_offset;
 	rdma_wr.rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey;
-- 
cgit v1.2.3-59-g8ed1b


From 95d8d26306ee19f9ba32b6381571a72ee924a0b6 Mon Sep 17 00:00:00 2001
From: Hans Wippel <hwippel@linux.ibm.com>
Date: Fri, 18 May 2018 09:34:13 +0200
Subject: net/smc: calculate write offset in RMB only once per connection

Currently, the write offset within the RMB is calculated on each write
operation although it is fixed for each connection. With this patch, the
offset is calculated once and stored in a connection specific variable.

Signed-off-by: Hans Wippel <hwippel@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c | 5 ++++-
 net/smc/smc.h    | 1 +
 net/smc/smc_tx.c | 2 +-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index bd67430f3730..9871604ebb53 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -377,10 +377,13 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
 static void smc_conn_save_peer_info(struct smc_sock *smc,
 				    struct smc_clc_msg_accept_confirm *clc)
 {
+	int bufsize = smc_uncompress_bufsize(clc->rmbe_size);
+
 	smc->conn.peer_rmbe_idx = clc->rmbe_idx;
 	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
-	smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
+	smc->conn.peer_rmbe_size = bufsize;
 	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
+	smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
 }
 
 static void smc_link_save_peer_info(struct smc_link *link,
diff --git a/net/smc/smc.h b/net/smc/smc.h
index fb8dec8bc17f..9bc37645e7d5 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -151,6 +151,7 @@ struct smc_connection {
 	u16			tx_cdc_seq;	/* sequence # for CDC send */
 	spinlock_t		send_lock;	/* protect wr_sends */
 	struct delayed_work	tx_work;	/* retry of smc_cdc_msg_send */
+	u32			tx_off;		/* base offset in peer rmb */
 
 	struct smc_host_cdc_msg	local_rx_ctrl;	/* filled during event_handl.
 						 * .prod cf. TCP rcv_nxt
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 58a56c992b26..1f4a38b857f0 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -261,7 +261,7 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
 	rdma_wr.remote_addr =
 		lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr +
 		/* RMBE within RMB */
-		((conn->peer_rmbe_idx - 1) * conn->peer_rmbe_size) +
+		conn->tx_off +
 		/* offset within RMBE */
 		peer_rmbe_offset;
 	rdma_wr.rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey;
-- 
cgit v1.2.3-59-g8ed1b


From 2f6becaf79cd16f35ee7cb108200b898235b5aa2 Mon Sep 17 00:00:00 2001
From: Hans Wippel <hwippel@linux.ibm.com>
Date: Fri, 18 May 2018 09:34:14 +0200
Subject: net/smc: move smc_core specific code from smc.h to smc_core

SMC connection and buffer handling belong to smc_core. So, this patch
moves this code from smc.h to smc_core.

Signed-off-by: Hans Wippel <hwippel@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc.h      | 41 -----------------------------------------
 net/smc/smc_core.c | 27 +++++++++++++++++++++++++++
 net/smc/smc_core.h | 12 ++++++++++++
 3 files changed, 39 insertions(+), 41 deletions(-)

diff --git a/net/smc/smc.h b/net/smc/smc.h
index 9bc37645e7d5..a1467e411645 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -220,41 +220,6 @@ static inline u32 ntoh24(u8 *net)
 	return be32_to_cpu(t);
 }
 
-#define SMC_BUF_MIN_SIZE 16384		/* minimum size of an RMB */
-
-#define SMC_RMBE_SIZES	16	/* number of distinct sizes for an RMBE */
-/* theoretically, the RFC states that largest size would be 512K,
- * i.e. compressed 5 and thus 6 sizes (0..5), despite
- * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15)
- */
-
-/* convert the RMB size into the compressed notation - minimum 16K.
- * In contrast to plain ilog2, this rounds towards the next power of 2,
- * so the socket application gets at least its desired sndbuf / rcvbuf size.
- */
-static inline u8 smc_compress_bufsize(int size)
-{
-	u8 compressed;
-
-	if (size <= SMC_BUF_MIN_SIZE)
-		return 0;
-
-	size = (size - 1) >> 14;
-	compressed = ilog2(size) + 1;
-	if (compressed >= SMC_RMBE_SIZES)
-		compressed = SMC_RMBE_SIZES - 1;
-	return compressed;
-}
-
-/* convert the RMB size from compressed notation into integer */
-static inline int smc_uncompress_bufsize(u8 compressed)
-{
-	u32 size;
-
-	size = 0x00000001 << (((int)compressed) + 14);
-	return (int)size;
-}
-
 #ifdef CONFIG_XFRM
 static inline bool using_ipsec(struct smc_sock *smc)
 {
@@ -268,12 +233,6 @@ static inline bool using_ipsec(struct smc_sock *smc)
 }
 #endif
 
-struct smc_clc_msg_local;
-
-void smc_conn_free(struct smc_connection *conn);
-int smc_conn_create(struct smc_sock *smc,
-		    struct smc_ib_device *smcibdev, u8 ibport,
-		    struct smc_clc_msg_local *lcl, int srv_first_contact);
 struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock);
 void smc_close_non_accepted(struct sock *sk);
 
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 70ab2f716f83..66afa086ae72 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -543,6 +543,33 @@ out:
 	return rc ? rc : local_contact;
 }
 
+/* convert the RMB size into the compressed notation - minimum 16K.
+ * In contrast to plain ilog2, this rounds towards the next power of 2,
+ * so the socket application gets at least its desired sndbuf / rcvbuf size.
+ */
+static u8 smc_compress_bufsize(int size)
+{
+	u8 compressed;
+
+	if (size <= SMC_BUF_MIN_SIZE)
+		return 0;
+
+	size = (size - 1) >> 14;
+	compressed = ilog2(size) + 1;
+	if (compressed >= SMC_RMBE_SIZES)
+		compressed = SMC_RMBE_SIZES - 1;
+	return compressed;
+}
+
+/* convert the RMB size from compressed notation into integer */
+int smc_uncompress_bufsize(u8 compressed)
+{
+	u32 size;
+
+	size = 0x00000001 << (((int)compressed) + 14);
+	return (int)size;
+}
+
 /* try to reuse a sndbuf or rmb description slot for a certain
  * buffer size; if not available, return NULL
  */
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 3d4bd7011ab3..93cb3523bf50 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -141,6 +141,12 @@ struct smc_rtoken {				/* address/key of remote RMB */
 };
 
 #define SMC_LGR_ID_SIZE		4
+#define SMC_BUF_MIN_SIZE	16384	/* minimum size of an RMB */
+#define SMC_RMBE_SIZES		16	/* number of distinct RMBE sizes */
+/* theoretically, the RFC states that largest size would be 512K,
+ * i.e. compressed 5 and thus 6 sizes (0..5), despite
+ * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15)
+ */
 
 struct smc_link_group {
 	struct list_head	list;
@@ -205,12 +211,14 @@ static inline struct smc_connection *smc_lgr_find_conn(
 
 struct smc_sock;
 struct smc_clc_msg_accept_confirm;
+struct smc_clc_msg_local;
 
 void smc_lgr_free(struct smc_link_group *lgr);
 void smc_lgr_forget(struct smc_link_group *lgr);
 void smc_lgr_terminate(struct smc_link_group *lgr);
 void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport);
 int smc_buf_create(struct smc_sock *smc);
+int smc_uncompress_bufsize(u8 compressed);
 int smc_rmb_rtoken_handling(struct smc_connection *conn,
 			    struct smc_clc_msg_accept_confirm *clc);
 int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey);
@@ -219,5 +227,9 @@ void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn);
 void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn);
 void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn);
 void smc_rmb_sync_sg_for_device(struct smc_connection *conn);
+void smc_conn_free(struct smc_connection *conn);
+int smc_conn_create(struct smc_sock *smc,
+		    struct smc_ib_device *smcibdev, u8 ibport,
+		    struct smc_clc_msg_local *lcl, int srv_first_contact);
 void smc_core_exit(void);
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From d7b0e37c1ac152905b18a5b9506179091a35b0b6 Mon Sep 17 00:00:00 2001
From: Hans Wippel <hwippel@linux.ibm.com>
Date: Fri, 18 May 2018 09:34:15 +0200
Subject: net/smc: restructure CDC message reception

This patch moves a CDC sanity check from smc_cdc_msg_recv_action() to
the other sanity checks in smc_cdc_rx_handler(). While doing this, it
simplifies smc_cdc_msg_recv() and removes unneeded function parameters.

Signed-off-by: Hans Wippel <hwippel@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_cdc.c | 47 ++++++++++++++++++++++-------------------------
 1 file changed, 22 insertions(+), 25 deletions(-)

diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index 2809f9902446..8d2c079c87b0 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -165,19 +165,12 @@ static inline bool smc_cdc_before(u16 seq1, u16 seq2)
 }
 
 static void smc_cdc_msg_recv_action(struct smc_sock *smc,
-				    struct smc_link *link,
 				    struct smc_cdc_msg *cdc)
 {
 	union smc_host_cursor cons_old, prod_old;
 	struct smc_connection *conn = &smc->conn;
 	int diff_cons, diff_prod;
 
-	if (!cdc->prod_flags.failover_validation) {
-		if (smc_cdc_before(ntohs(cdc->seqno),
-				   conn->local_rx_ctrl.seqno))
-			/* received seqno is old */
-			return;
-	}
 	smc_curs_write(&prod_old,
 		       smc_curs_read(&conn->local_rx_ctrl.prod, conn),
 		       conn);
@@ -236,26 +229,11 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 }
 
 /* called under tasklet context */
-static inline void smc_cdc_msg_recv(struct smc_cdc_msg *cdc,
-				    struct smc_link *link, u64 wr_id)
+static void smc_cdc_msg_recv(struct smc_sock *smc, struct smc_cdc_msg *cdc)
 {
-	struct smc_link_group *lgr = container_of(link, struct smc_link_group,
-						  lnk[SMC_SINGLE_LINK]);
-	struct smc_connection *connection;
-	struct smc_sock *smc;
-
-	/* lookup connection */
-	read_lock_bh(&lgr->conns_lock);
-	connection = smc_lgr_find_conn(ntohl(cdc->token), lgr);
-	if (!connection) {
-		read_unlock_bh(&lgr->conns_lock);
-		return;
-	}
-	smc = container_of(connection, struct smc_sock, conn);
 	sock_hold(&smc->sk);
-	read_unlock_bh(&lgr->conns_lock);
 	bh_lock_sock(&smc->sk);
-	smc_cdc_msg_recv_action(smc, link, cdc);
+	smc_cdc_msg_recv_action(smc, cdc);
 	bh_unlock_sock(&smc->sk);
 	sock_put(&smc->sk); /* no free sk in softirq-context */
 }
@@ -266,12 +244,31 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf)
 {
 	struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
 	struct smc_cdc_msg *cdc = buf;
+	struct smc_connection *conn;
+	struct smc_link_group *lgr;
+	struct smc_sock *smc;
 
 	if (wc->byte_len < offsetof(struct smc_cdc_msg, reserved))
 		return; /* short message */
 	if (cdc->len != SMC_WR_TX_SIZE)
 		return; /* invalid message */
-	smc_cdc_msg_recv(cdc, link, wc->wr_id);
+
+	/* lookup connection */
+	lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
+	read_lock_bh(&lgr->conns_lock);
+	conn = smc_lgr_find_conn(ntohl(cdc->token), lgr);
+	read_unlock_bh(&lgr->conns_lock);
+	if (!conn)
+		return;
+	smc = container_of(conn, struct smc_sock, conn);
+
+	if (!cdc->prod_flags.failover_validation) {
+		if (smc_cdc_before(ntohs(cdc->seqno),
+				   conn->local_rx_ctrl.seqno))
+			/* received seqno is old */
+			return;
+	}
+	smc_cdc_msg_recv(smc, cdc);
 }
 
 static struct smc_wr_rx_handler smc_cdc_rx_handlers[] = {
-- 
cgit v1.2.3-59-g8ed1b


From 8437bda0d47bd8ffb2675e7601978f3ffde767ad Mon Sep 17 00:00:00 2001
From: Hans Wippel <hwippel@linux.ibm.com>
Date: Fri, 18 May 2018 09:34:16 +0200
Subject: net/smc: do a few smc_core.c cleanups

This patch consists of Christmas tree fixes and removal of an unneeded
function parameter.

Signed-off-by: Hans Wippel <hwippel@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_core.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 66afa086ae72..c63cf7b07ccc 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -474,10 +474,10 @@ int smc_conn_create(struct smc_sock *smc,
 		    struct smc_clc_msg_local *lcl, int srv_first_contact)
 {
 	struct smc_connection *conn = &smc->conn;
+	int local_contact = SMC_FIRST_CONTACT;
 	struct smc_link_group *lgr;
 	unsigned short vlan_id;
 	enum smc_lgr_role role;
-	int local_contact = SMC_FIRST_CONTACT;
 	int rc = 0;
 
 	role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
@@ -573,11 +573,9 @@ int smc_uncompress_bufsize(u8 compressed)
 /* try to reuse a sndbuf or rmb description slot for a certain
  * buffer size; if not available, return NULL
  */
-static inline
-struct smc_buf_desc *smc_buf_get_slot(struct smc_link_group *lgr,
-				      int compressed_bufsize,
-				      rwlock_t *lock,
-				      struct list_head *buf_list)
+static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
+					     rwlock_t *lock,
+					     struct list_head *buf_list)
 {
 	struct smc_buf_desc *buf_slot;
 
@@ -662,9 +660,9 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr,
 
 static int __smc_buf_create(struct smc_sock *smc, bool is_rmb)
 {
+	struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
 	struct smc_connection *conn = &smc->conn;
 	struct smc_link_group *lgr = conn->lgr;
-	struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM);
 	struct list_head *buf_list;
 	int bufsize, bufsize_short;
 	int sk_buf_size;
@@ -692,7 +690,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb)
 			continue;
 
 		/* check for reusable slot in the link group */
-		buf_desc = smc_buf_get_slot(lgr, bufsize_short, lock, buf_list);
+		buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list);
 		if (buf_desc) {
 			memset(buf_desc->cpu_addr, 0, bufsize);
 			break; /* found reusable slot */
-- 
cgit v1.2.3-59-g8ed1b


From 6511aad3f0392f37630b4f7f96fbc192dc54b29f Mon Sep 17 00:00:00 2001
From: Hans Wippel <hwippel@linux.ibm.com>
Date: Fri, 18 May 2018 09:34:17 +0200
Subject: net/smc: change smc_buf_free function parameters

This patch changes the function smc_buf_free to use the SMC link group
instead of the link as function parameter. Also, it changes the order of
the other two parameters.

Signed-off-by: Hans Wippel <hwippel@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_core.c | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index c63cf7b07ccc..1e5c0e90a706 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -36,8 +36,8 @@ static struct smc_lgr_list smc_lgr_list = {	/* established link groups */
 	.num = 0,
 };
 
-static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk,
-			 bool is_rmb);
+static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
+			 struct smc_buf_desc *buf_desc);
 
 static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
 {
@@ -249,14 +249,12 @@ static void smc_buf_unuse(struct smc_connection *conn)
 		} else {
 			/* buf registration failed, reuse not possible */
 			struct smc_link_group *lgr = conn->lgr;
-			struct smc_link *lnk;
 
 			write_lock_bh(&lgr->rmbs_lock);
 			list_del(&conn->rmb_desc->list);
 			write_unlock_bh(&lgr->rmbs_lock);
 
-			lnk = &lgr->lnk[SMC_SINGLE_LINK];
-			smc_buf_free(conn->rmb_desc, lnk, true);
+			smc_buf_free(lgr, true, conn->rmb_desc);
 		}
 	}
 }
@@ -282,9 +280,11 @@ static void smc_link_clear(struct smc_link *lnk)
 	smc_wr_free_link_mem(lnk);
 }
 
-static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk,
-			 bool is_rmb)
+static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
+			 struct smc_buf_desc *buf_desc)
 {
+	struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
+
 	if (is_rmb) {
 		if (buf_desc->mr_rx[SMC_SINGLE_LINK])
 			smc_ib_put_memory_region(
@@ -303,7 +303,6 @@ static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk,
 
 static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
 {
-	struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
 	struct smc_buf_desc *buf_desc, *bf_desc;
 	struct list_head *buf_list;
 	int i;
@@ -316,7 +315,7 @@ static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
 		list_for_each_entry_safe(buf_desc, bf_desc, buf_list,
 					 list) {
 			list_del(&buf_desc->list);
-			smc_buf_free(buf_desc, lnk, is_rmb);
+			smc_buf_free(lgr, is_rmb, buf_desc);
 		}
 	}
 }
@@ -627,7 +626,7 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr,
 	rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1,
 			    GFP_KERNEL);
 	if (rc) {
-		smc_buf_free(buf_desc, lnk, is_rmb);
+		smc_buf_free(lgr, is_rmb, buf_desc);
 		return ERR_PTR(rc);
 	}
 	sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl,
@@ -638,7 +637,7 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr,
 			       is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
 	/* SMC protocol depends on mapping to one DMA address only */
 	if (rc != 1)  {
-		smc_buf_free(buf_desc, lnk, is_rmb);
+		smc_buf_free(lgr, is_rmb, buf_desc);
 		return ERR_PTR(-EAGAIN);
 	}
 
@@ -649,7 +648,7 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr,
 					      IB_ACCESS_LOCAL_WRITE,
 					      buf_desc);
 		if (rc) {
-			smc_buf_free(buf_desc, lnk, is_rmb);
+			smc_buf_free(lgr, is_rmb, buf_desc);
 			return ERR_PTR(rc);
 		}
 	}
@@ -775,8 +774,7 @@ int smc_buf_create(struct smc_sock *smc)
 	/* create rmb */
 	rc = __smc_buf_create(smc, true);
 	if (rc)
-		smc_buf_free(smc->conn.sndbuf_desc,
-			     &smc->conn.lgr->lnk[SMC_SINGLE_LINK], false);
+		smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc);
 	return rc;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 3b2dec2603d5b06ad3af71c1164ca0b92df3d2a8 Mon Sep 17 00:00:00 2001
From: Hans Wippel <hwippel@linux.ibm.com>
Date: Fri, 18 May 2018 09:34:18 +0200
Subject: net/smc: restructure client and server code in af_smc

This patch splits up the functions smc_connect_rdma and smc_listen_work
into smaller functions.

Signed-off-by: Hans Wippel <hwippel@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c | 559 +++++++++++++++++++++++++++++++------------------------
 1 file changed, 318 insertions(+), 241 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 9871604ebb53..48530dab5c94 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -396,165 +396,186 @@ static void smc_link_save_peer_info(struct smc_link *link,
 	link->peer_mtu = clc->qp_mtu;
 }
 
-/* setup for RDMA connection of client */
-static int smc_connect_rdma(struct smc_sock *smc)
+/* fall back during connect */
+static int smc_connect_fallback(struct smc_sock *smc)
 {
-	struct smc_clc_msg_accept_confirm aclc;
-	int local_contact = SMC_FIRST_CONTACT;
-	struct smc_ib_device *smcibdev;
-	struct smc_link *link;
-	u8 srv_first_contact;
-	int reason_code = 0;
-	int rc = 0;
-	u8 ibport;
-
-	sock_hold(&smc->sk); /* sock put in passive closing */
+	smc->use_fallback = true;
+	smc_copy_sock_settings_to_clc(smc);
+	if (smc->sk.sk_state == SMC_INIT)
+		smc->sk.sk_state = SMC_ACTIVE;
+	return 0;
+}
 
-	if (smc->use_fallback)
-		goto out_connected;
+/* decline and fall back during connect */
+static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
+{
+	int rc;
 
-	if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
-		/* peer has not signalled SMC-capability */
-		smc->use_fallback = true;
-		goto out_connected;
+	if (reason_code < 0) /* error, fallback is not possible */
+		return reason_code;
+	if (reason_code != SMC_CLC_DECL_REPLY) {
+		rc = smc_clc_send_decline(smc, reason_code);
+		if (rc < 0)
+			return rc;
 	}
+	return smc_connect_fallback(smc);
+}
 
-	/* IPSec connections opt out of SMC-R optimizations */
-	if (using_ipsec(smc)) {
-		reason_code = SMC_CLC_DECL_IPSEC;
-		goto decline_rdma;
-	}
+/* abort connecting */
+static int smc_connect_abort(struct smc_sock *smc, int reason_code,
+			     int local_contact)
+{
+	if (local_contact == SMC_FIRST_CONTACT)
+		smc_lgr_forget(smc->conn.lgr);
+	mutex_unlock(&smc_create_lgr_pending);
+	smc_conn_free(&smc->conn);
+	if (reason_code < 0 && smc->sk.sk_state == SMC_INIT)
+		sock_put(&smc->sk); /* passive closing */
+	return reason_code;
+}
+
+/* check if there is a rdma device available for this connection. */
+/* called for connect and listen */
+static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev,
+			  u8 *ibport)
+{
+	int reason_code = 0;
 
 	/* PNET table look up: search active ib_device and port
 	 * within same PNETID that also contains the ethernet device
 	 * used for the internal TCP socket
 	 */
-	smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
-	if (!smcibdev) {
+	smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport);
+	if (!(*ibdev))
 		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
-		goto decline_rdma;
-	}
+
+	return reason_code;
+}
+
+/* CLC handshake during connect */
+static int smc_connect_clc(struct smc_sock *smc,
+			   struct smc_clc_msg_accept_confirm *aclc,
+			   struct smc_ib_device *ibdev, u8 ibport)
+{
+	int rc = 0;
 
 	/* do inband token exchange */
-	reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
-	if (reason_code < 0) {
-		rc = reason_code;
-		goto out_err;
-	}
-	if (reason_code > 0) /* configuration error */
-		goto decline_rdma;
+	rc = smc_clc_send_proposal(smc, ibdev, ibport);
+	if (rc)
+		return rc;
 	/* receive SMC Accept CLC message */
-	reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
-				       SMC_CLC_ACCEPT);
-	if (reason_code < 0) {
-		rc = reason_code;
-		goto out_err;
-	}
-	if (reason_code > 0)
-		goto decline_rdma;
+	return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT);
+}
+
+/* setup for RDMA connection of client */
+static int smc_connect_rdma(struct smc_sock *smc,
+			    struct smc_clc_msg_accept_confirm *aclc,
+			    struct smc_ib_device *ibdev, u8 ibport)
+{
+	int local_contact = SMC_FIRST_CONTACT;
+	struct smc_link *link;
+	int reason_code = 0;
 
-	srv_first_contact = aclc.hdr.flag;
 	mutex_lock(&smc_create_lgr_pending);
-	local_contact = smc_conn_create(smc, smcibdev, ibport, &aclc.lcl,
-					srv_first_contact);
+	local_contact = smc_conn_create(smc, ibdev, ibport, &aclc->lcl,
+					aclc->hdr.flag);
 	if (local_contact < 0) {
-		rc = local_contact;
-		if (rc == -ENOMEM)
+		if (local_contact == -ENOMEM)
 			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
-		else if (rc == -ENOLINK)
+		else if (local_contact == -ENOLINK)
 			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
 		else
 			reason_code = SMC_CLC_DECL_INTERR; /* other error */
-		goto decline_rdma_unlock;
+		return smc_connect_abort(smc, reason_code, 0);
 	}
 	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
 
-	smc_conn_save_peer_info(smc, &aclc);
+	smc_conn_save_peer_info(smc, aclc);
 
 	/* create send buffer and rmb */
-	rc = smc_buf_create(smc);
-	if (rc) {
-		reason_code = SMC_CLC_DECL_MEM;
-		goto decline_rdma_unlock;
-	}
+	if (smc_buf_create(smc))
+		return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact);
 
 	if (local_contact == SMC_FIRST_CONTACT)
-		smc_link_save_peer_info(link, &aclc);
+		smc_link_save_peer_info(link, aclc);
 
-	rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
-	if (rc) {
-		reason_code = SMC_CLC_DECL_INTERR;
-		goto decline_rdma_unlock;
-	}
+	if (smc_rmb_rtoken_handling(&smc->conn, aclc))
+		return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
+					 local_contact);
 
 	smc_close_init(smc);
 	smc_rx_init(smc);
 
 	if (local_contact == SMC_FIRST_CONTACT) {
-		rc = smc_ib_ready_link(link);
-		if (rc) {
-			reason_code = SMC_CLC_DECL_INTERR;
-			goto decline_rdma_unlock;
-		}
+		if (smc_ib_ready_link(link))
+			return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
+						 local_contact);
 	} else {
-		if (!smc->conn.rmb_desc->reused) {
-			if (smc_reg_rmb(link, smc->conn.rmb_desc, true)) {
-				reason_code = SMC_CLC_DECL_INTERR;
-				goto decline_rdma_unlock;
-			}
-		}
+		if (!smc->conn.rmb_desc->reused &&
+		    smc_reg_rmb(link, smc->conn.rmb_desc, true))
+			return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
+						 local_contact);
 	}
 	smc_rmb_sync_sg_for_device(&smc->conn);
 
-	rc = smc_clc_send_confirm(smc);
-	if (rc)
-		goto out_err_unlock;
+	reason_code = smc_clc_send_confirm(smc);
+	if (reason_code)
+		return smc_connect_abort(smc, reason_code, local_contact);
+
+	smc_tx_init(smc);
 
 	if (local_contact == SMC_FIRST_CONTACT) {
 		/* QP confirmation over RoCE fabric */
 		reason_code = smc_clnt_conf_first_link(smc);
-		if (reason_code < 0) {
-			rc = reason_code;
-			goto out_err_unlock;
-		}
-		if (reason_code > 0)
-			goto decline_rdma_unlock;
+		if (reason_code)
+			return smc_connect_abort(smc, reason_code,
+						 local_contact);
 	}
-
 	mutex_unlock(&smc_create_lgr_pending);
-	smc_tx_init(smc);
 
-out_connected:
 	smc_copy_sock_settings_to_clc(smc);
 	if (smc->sk.sk_state == SMC_INIT)
 		smc->sk.sk_state = SMC_ACTIVE;
 
-	return rc ? rc : local_contact;
+	return 0;
+}
 
-decline_rdma_unlock:
-	if (local_contact == SMC_FIRST_CONTACT)
-		smc_lgr_forget(smc->conn.lgr);
-	mutex_unlock(&smc_create_lgr_pending);
-	smc_conn_free(&smc->conn);
-decline_rdma:
-	/* RDMA setup failed, switch back to TCP */
-	smc->use_fallback = true;
-	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
-		rc = smc_clc_send_decline(smc, reason_code);
-		if (rc < 0)
-			goto out_err;
-	}
-	goto out_connected;
+/* perform steps before actually connecting */
+static int __smc_connect(struct smc_sock *smc)
+{
+	struct smc_clc_msg_accept_confirm aclc;
+	struct smc_ib_device *ibdev;
+	int rc = 0;
+	u8 ibport;
 
-out_err_unlock:
-	if (local_contact == SMC_FIRST_CONTACT)
-		smc_lgr_forget(smc->conn.lgr);
-	mutex_unlock(&smc_create_lgr_pending);
-	smc_conn_free(&smc->conn);
-out_err:
-	if (smc->sk.sk_state == SMC_INIT)
-		sock_put(&smc->sk); /* passive closing */
-	return rc;
+	sock_hold(&smc->sk); /* sock put in passive closing */
+
+	if (smc->use_fallback)
+		return smc_connect_fallback(smc);
+
+	/* if peer has not signalled SMC-capability, fall back */
+	if (!tcp_sk(smc->clcsock->sk)->syn_smc)
+		return smc_connect_fallback(smc);
+
+	/* IPSec connections opt out of SMC-R optimizations */
+	if (using_ipsec(smc))
+		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);
+
+	/* check if a RDMA device is available; if not, fall back */
+	if (smc_check_rdma(smc, &ibdev, &ibport))
+		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR);
+
+	/* perform CLC handshake */
+	rc = smc_connect_clc(smc, &aclc, ibdev, ibport);
+	if (rc)
+		return smc_connect_decline_fallback(smc, rc);
+
+	/* connect using rdma */
+	rc = smc_connect_rdma(smc, &aclc, ibdev, ibport);
+	if (rc)
+		return smc_connect_decline_fallback(smc, rc);
+
+	return 0;
 }
 
 static int smc_connect(struct socket *sock, struct sockaddr *addr,
@@ -590,8 +611,7 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
 	if (rc)
 		goto out;
 
-	/* setup RDMA connection */
-	rc = smc_connect_rdma(smc);
+	rc = __smc_connect(smc);
 	if (rc < 0)
 		goto out;
 	else
@@ -789,182 +809,239 @@ static int smc_serv_conf_first_link(struct smc_sock *smc)
 	return 0;
 }
 
-/* setup for RDMA connection of server */
-static void smc_listen_work(struct work_struct *work)
+/* listen worker: finish */
+static void smc_listen_out(struct smc_sock *new_smc)
 {
-	struct smc_sock *new_smc = container_of(work, struct smc_sock,
-						smc_listen_work);
-	struct smc_clc_msg_proposal_prefix *pclc_prfx;
-	struct socket *newclcsock = new_smc->clcsock;
 	struct smc_sock *lsmc = new_smc->listen_smc;
-	struct smc_clc_msg_accept_confirm cclc;
-	int local_contact = SMC_REUSE_CONTACT;
 	struct sock *newsmcsk = &new_smc->sk;
-	struct smc_clc_msg_proposal *pclc;
-	struct smc_ib_device *smcibdev;
-	u8 buf[SMC_CLC_MAX_LEN];
-	struct smc_link *link;
-	int reason_code = 0;
-	int rc = 0;
-	u8 ibport;
 
-	if (new_smc->use_fallback)
-		goto out_connected;
-
-	/* check if peer is smc capable */
-	if (!tcp_sk(newclcsock->sk)->syn_smc) {
-		new_smc->use_fallback = true;
-		goto out_connected;
+	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
+	if (lsmc->sk.sk_state == SMC_LISTEN) {
+		smc_accept_enqueue(&lsmc->sk, newsmcsk);
+	} else { /* no longer listening */
+		smc_close_non_accepted(newsmcsk);
 	}
+	release_sock(&lsmc->sk);
 
-	/* do inband token exchange -
-	 *wait for and receive SMC Proposal CLC message
-	 */
-	reason_code = smc_clc_wait_msg(new_smc, &buf, sizeof(buf),
-				       SMC_CLC_PROPOSAL);
-	if (reason_code < 0)
-		goto out_err;
-	if (reason_code > 0)
-		goto decline_rdma;
+	/* Wake up accept */
+	lsmc->sk.sk_data_ready(&lsmc->sk);
+	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
+}
 
-	/* IPSec connections opt out of SMC-R optimizations */
-	if (using_ipsec(new_smc)) {
-		reason_code = SMC_CLC_DECL_IPSEC;
-		goto decline_rdma;
-	}
+/* listen worker: finish in state connected */
+static void smc_listen_out_connected(struct smc_sock *new_smc)
+{
+	struct sock *newsmcsk = &new_smc->sk;
 
-	/* PNET table look up: search active ib_device and port
-	 * within same PNETID that also contains the ethernet device
-	 * used for the internal TCP socket
-	 */
-	smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
-	if (!smcibdev) {
-		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
-		goto decline_rdma;
+	sk_refcnt_debug_inc(newsmcsk);
+	if (newsmcsk->sk_state == SMC_INIT)
+		newsmcsk->sk_state = SMC_ACTIVE;
+
+	smc_listen_out(new_smc);
+}
+
+/* listen worker: finish in error state */
+static void smc_listen_out_err(struct smc_sock *new_smc)
+{
+	struct sock *newsmcsk = &new_smc->sk;
+
+	if (newsmcsk->sk_state == SMC_INIT)
+		sock_put(&new_smc->sk); /* passive closing */
+	newsmcsk->sk_state = SMC_CLOSED;
+	smc_conn_free(&new_smc->conn);
+
+	smc_listen_out(new_smc);
+}
+
+/* listen worker: decline and fall back if possible */
+static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
+			       int local_contact)
+{
+	/* RDMA setup failed, switch back to TCP */
+	if (local_contact == SMC_FIRST_CONTACT)
+		smc_lgr_forget(new_smc->conn.lgr);
+	if (reason_code < 0) { /* error, no fallback possible */
+		smc_listen_out_err(new_smc);
+		return;
+	}
+	smc_conn_free(&new_smc->conn);
+	new_smc->use_fallback = true;
+	if (reason_code && reason_code != SMC_CLC_DECL_REPLY) {
+		if (smc_clc_send_decline(new_smc, reason_code) < 0) {
+			smc_listen_out_err(new_smc);
+			return;
+		}
 	}
+	smc_listen_out_connected(new_smc);
+}
+
+/* listen worker: check prefixes */
+static int smc_listen_rdma_check(struct smc_sock *new_smc,
+				 struct smc_clc_msg_proposal *pclc)
+{
+	struct smc_clc_msg_proposal_prefix *pclc_prfx;
+	struct socket *newclcsock = new_smc->clcsock;
 
-	pclc = (struct smc_clc_msg_proposal *)&buf;
 	pclc_prfx = smc_clc_proposal_get_prefix(pclc);
+	if (smc_clc_prfx_match(newclcsock, pclc_prfx))
+		return SMC_CLC_DECL_CNFERR;
 
-	rc = smc_clc_prfx_match(newclcsock, pclc_prfx);
-	if (rc) {
-		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
-		goto decline_rdma;
-	}
+	return 0;
+}
 
+/* listen worker: initialize connection and buffers */
+static int smc_listen_rdma_init(struct smc_sock *new_smc,
+				struct smc_clc_msg_proposal *pclc,
+				struct smc_ib_device *ibdev, u8 ibport,
+				int *local_contact)
+{
 	/* allocate connection / link group */
-	mutex_lock(&smc_create_lgr_pending);
-	local_contact = smc_conn_create(new_smc, smcibdev, ibport, &pclc->lcl,
-					0);
-	if (local_contact < 0) {
-		rc = local_contact;
-		if (rc == -ENOMEM)
-			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
-		goto decline_rdma_unlock;
+	*local_contact = smc_conn_create(new_smc, ibdev, ibport, &pclc->lcl, 0);
+	if (*local_contact < 0) {
+		if (*local_contact == -ENOMEM)
+			return SMC_CLC_DECL_MEM;/* insufficient memory*/
+		return SMC_CLC_DECL_INTERR; /* other error */
 	}
-	link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
 
 	/* create send buffer and rmb */
-	rc = smc_buf_create(new_smc);
-	if (rc) {
-		reason_code = SMC_CLC_DECL_MEM;
-		goto decline_rdma_unlock;
-	}
+	if (smc_buf_create(new_smc))
+		return SMC_CLC_DECL_MEM;
 
-	smc_close_init(new_smc);
-	smc_rx_init(new_smc);
+	return 0;
+}
+
+/* listen worker: register buffers */
+static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
+{
+	struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
 
 	if (local_contact != SMC_FIRST_CONTACT) {
 		if (!new_smc->conn.rmb_desc->reused) {
-			if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true)) {
-				reason_code = SMC_CLC_DECL_INTERR;
-				goto decline_rdma_unlock;
-			}
+			if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
+				return SMC_CLC_DECL_INTERR;
 		}
 	}
 	smc_rmb_sync_sg_for_device(&new_smc->conn);
 
-	rc = smc_clc_send_accept(new_smc, local_contact);
-	if (rc)
-		goto out_err_unlock;
+	return 0;
+}
+
+/* listen worker: finish RDMA setup */
+static void smc_listen_rdma_finish(struct smc_sock *new_smc,
+				   struct smc_clc_msg_accept_confirm *cclc,
+				   int local_contact)
+{
+	struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
+	int reason_code = 0;
 
-	/* receive SMC Confirm CLC message */
-	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
-				       SMC_CLC_CONFIRM);
-	if (reason_code < 0)
-		goto out_err_unlock;
-	if (reason_code > 0)
-		goto decline_rdma_unlock;
-	smc_conn_save_peer_info(new_smc, &cclc);
 	if (local_contact == SMC_FIRST_CONTACT)
-		smc_link_save_peer_info(link, &cclc);
+		smc_link_save_peer_info(link, cclc);
 
-	rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
-	if (rc) {
+	if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
 		reason_code = SMC_CLC_DECL_INTERR;
-		goto decline_rdma_unlock;
+		goto decline;
 	}
 
 	if (local_contact == SMC_FIRST_CONTACT) {
-		rc = smc_ib_ready_link(link);
-		if (rc) {
+		if (smc_ib_ready_link(link)) {
 			reason_code = SMC_CLC_DECL_INTERR;
-			goto decline_rdma_unlock;
+			goto decline;
 		}
 		/* QP confirmation over RoCE fabric */
 		reason_code = smc_serv_conf_first_link(new_smc);
-		if (reason_code < 0)
-			/* peer is not aware of a problem */
-			goto out_err_unlock;
-		if (reason_code > 0)
-			goto decline_rdma_unlock;
+		if (reason_code)
+			goto decline;
 	}
+	return;
 
-	smc_tx_init(new_smc);
+decline:
 	mutex_unlock(&smc_create_lgr_pending);
+	smc_listen_decline(new_smc, reason_code, local_contact);
+}
 
-out_connected:
-	sk_refcnt_debug_inc(newsmcsk);
-	if (newsmcsk->sk_state == SMC_INIT)
-		newsmcsk->sk_state = SMC_ACTIVE;
-enqueue:
-	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
-	if (lsmc->sk.sk_state == SMC_LISTEN) {
-		smc_accept_enqueue(&lsmc->sk, newsmcsk);
-	} else { /* no longer listening */
-		smc_close_non_accepted(newsmcsk);
+/* setup for RDMA connection of server */
+static void smc_listen_work(struct work_struct *work)
+{
+	struct smc_sock *new_smc = container_of(work, struct smc_sock,
+						smc_listen_work);
+	struct socket *newclcsock = new_smc->clcsock;
+	struct smc_clc_msg_accept_confirm cclc;
+	struct smc_clc_msg_proposal *pclc;
+	struct smc_ib_device *ibdev;
+	u8 buf[SMC_CLC_MAX_LEN];
+	int local_contact = 0;
+	int reason_code = 0;
+	int rc = 0;
+	u8 ibport;
+
+	if (new_smc->use_fallback) {
+		smc_listen_out_connected(new_smc);
+		return;
 	}
-	release_sock(&lsmc->sk);
 
-	/* Wake up accept */
-	lsmc->sk.sk_data_ready(&lsmc->sk);
-	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
-	return;
+	/* check if peer is smc capable */
+	if (!tcp_sk(newclcsock->sk)->syn_smc) {
+		new_smc->use_fallback = true;
+		smc_listen_out_connected(new_smc);
+		return;
+	}
 
-decline_rdma_unlock:
-	if (local_contact == SMC_FIRST_CONTACT)
-		smc_lgr_forget(new_smc->conn.lgr);
-	mutex_unlock(&smc_create_lgr_pending);
-decline_rdma:
-	/* RDMA setup failed, switch back to TCP */
-	smc_conn_free(&new_smc->conn);
-	new_smc->use_fallback = true;
-	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
-		if (smc_clc_send_decline(new_smc, reason_code) < 0)
-			goto out_err;
+	/* do inband token exchange -
+	 * wait for and receive SMC Proposal CLC message
+	 */
+	pclc = (struct smc_clc_msg_proposal *)&buf;
+	reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
+				       SMC_CLC_PROPOSAL);
+	if (reason_code) {
+		smc_listen_decline(new_smc, reason_code, 0);
+		return;
 	}
-	goto out_connected;
 
-out_err_unlock:
-	if (local_contact == SMC_FIRST_CONTACT)
-		smc_lgr_forget(new_smc->conn.lgr);
+	/* IPSec connections opt out of SMC-R optimizations */
+	if (using_ipsec(new_smc)) {
+		smc_listen_decline(new_smc, SMC_CLC_DECL_IPSEC, 0);
+		return;
+	}
+
+	mutex_lock(&smc_create_lgr_pending);
+	smc_close_init(new_smc);
+	smc_rx_init(new_smc);
+	smc_tx_init(new_smc);
+
+	/* check if RDMA is available */
+	if (smc_check_rdma(new_smc, &ibdev, &ibport) ||
+	    smc_listen_rdma_check(new_smc, pclc) ||
+	    smc_listen_rdma_init(new_smc, pclc, ibdev, ibport,
+				 &local_contact) ||
+	    smc_listen_rdma_reg(new_smc, local_contact)) {
+		/* SMC not supported, decline */
+		mutex_unlock(&smc_create_lgr_pending);
+		smc_listen_decline(new_smc, SMC_CLC_DECL_CNFERR, local_contact);
+		return;
+	}
+
+	/* send SMC Accept CLC message */
+	rc = smc_clc_send_accept(new_smc, local_contact);
+	if (rc) {
+		mutex_unlock(&smc_create_lgr_pending);
+		smc_listen_decline(new_smc, rc, local_contact);
+		return;
+	}
+
+	/* receive SMC Confirm CLC message */
+	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
+				       SMC_CLC_CONFIRM);
+	if (reason_code) {
+		mutex_unlock(&smc_create_lgr_pending);
+		smc_listen_decline(new_smc, reason_code, local_contact);
+		return;
+	}
+
+	/* finish worker */
+	smc_listen_rdma_finish(new_smc, &cclc, local_contact);
+	smc_conn_save_peer_info(new_smc, &cclc);
 	mutex_unlock(&smc_create_lgr_pending);
-out_err:
-	if (newsmcsk->sk_state == SMC_INIT)
-		sock_put(&new_smc->sk); /* passive closing */
-	newsmcsk->sk_state = SMC_CLOSED;
-	smc_conn_free(&new_smc->conn);
-	goto enqueue; /* queue new sock with sk_err set */
+	smc_listen_out_connected(new_smc);
 }
 
 static void smc_tcp_listen_work(struct work_struct *work)
@@ -1225,7 +1302,7 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
 			if (sk->sk_state == SMC_INIT &&
 			    mask & EPOLLOUT &&
 			    smc->clcsock->sk->sk_state != TCP_CLOSE) {
-				rc = smc_connect_rdma(smc);
+				rc = __smc_connect(smc);
 				if (rc < 0)
 					mask |= EPOLLERR;
 				/* success cases including fallback */
-- 
cgit v1.2.3-59-g8ed1b


From 1f7455c3912d8b751514f03de659aab61a50c773 Mon Sep 17 00:00:00 2001
From: kbuild test robot <fengguang.wu@intel.com>
Date: Fri, 18 May 2018 13:14:23 +0800
Subject: tcp: tcp_rack_reo_wnd() can be static

Fixes: 20b654dfe1be ("tcp: support DUPACK threshold in RACK")
Signed-off-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_recovery.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 30cbfb69b1de..71593e4400ab 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -21,7 +21,7 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
 	return t1 > t2 || (t1 == t2 && after(seq1, seq2));
 }
 
-u32 tcp_rack_reo_wnd(const struct sock *sk)
+static u32 tcp_rack_reo_wnd(const struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-- 
cgit v1.2.3-59-g8ed1b


From 8f678036f9780c7bf93241f09cf4f269e090ea94 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Fri, 18 May 2018 11:09:22 +0100
Subject: hippi: fix spelling mistake: "Framming" -> "Framing"

Trivial fix to spelling mistake in printk message text

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hippi/rrunner.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/hippi/rrunner.c b/drivers/net/hippi/rrunner.c
index 1ab97d99b9ba..f41116488079 100644
--- a/drivers/net/hippi/rrunner.c
+++ b/drivers/net/hippi/rrunner.c
@@ -867,7 +867,7 @@ static u32 rr_handle_event(struct net_device *dev, u32 prodidx, u32 eidx)
 			       dev->name);
 			goto drop;
 		case E_FRM_ERR:
-			printk(KERN_WARNING "%s: Framming Error\n",
+			printk(KERN_WARNING "%s: Framing Error\n",
 			       dev->name);
 			goto drop;
 		case E_FLG_SYN_ERR:
-- 
cgit v1.2.3-59-g8ed1b


From a3302baa2c74f09823bee9a1b8a09838059d9182 Mon Sep 17 00:00:00 2001
From: Antoine Tenart <antoine.tenart@bootlin.com>
Date: Fri, 18 May 2018 14:34:51 +0200
Subject: net: mvpp2: typo and cosmetic fixes

This patch on the Marvell PPv2 driver is only cosmetic. Two typos are
removed as well as other cosmetic fixes, such as extra new lines or tabs
vs spaces.

Suggested-by: Stefan Chulski <stefanc@marvell.com>
Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mvpp2.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
index 7ba7e5938c2e..8a071ac79aa4 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -1756,7 +1756,6 @@ static void mvpp2_prs_tcam_ai_update(struct mvpp2_prs_entry *pe,
 	int i, ai_idx = MVPP2_PRS_TCAM_AI_BYTE;
 
 	for (i = 0; i < MVPP2_PRS_AI_BITS; i++) {
-
 		if (!(enable & BIT(i)))
 			continue;
 
@@ -1840,7 +1839,6 @@ static void mvpp2_prs_sram_ai_update(struct mvpp2_prs_entry *pe,
 	int ai_off = MVPP2_PRS_SRAM_AI_OFFS;
 
 	for (i = 0; i < MVPP2_PRS_SRAM_AI_CTRL_BITS; i++) {
-
 		if (!(mask & BIT(i)))
 			continue;
 
@@ -4936,7 +4934,7 @@ static void mvpp22_gop_mask_irq(struct mvpp2_port *port)
 	if (port->gop_id == 0) {
 		val = readl(port->base + MVPP22_XLG_EXT_INT_MASK);
 		val &= ~(MVPP22_XLG_EXT_INT_MASK_XLG |
-		         MVPP22_XLG_EXT_INT_MASK_GIG);
+			 MVPP22_XLG_EXT_INT_MASK_GIG);
 		writel(val, port->base + MVPP22_XLG_EXT_INT_MASK);
 	}
 
@@ -5470,7 +5468,6 @@ static void mvpp2_aggr_txq_pend_desc_add(struct mvpp2_port *port, int pending)
 			   MVPP2_AGGR_TXQ_UPDATE_REG, pending);
 }
 
-
 /* Check if there are enough free descriptors in aggregated txq.
  * If not, update the number of occupied descriptors and repeat the check.
  *
@@ -5550,7 +5547,7 @@ static int mvpp2_txq_reserved_desc_num_proc(struct mvpp2 *priv,
 
 	txq_pcpu->reserved_num += mvpp2_txq_alloc_reserved_desc(priv, txq, req);
 
-	/* OK, the descriptor cound has been updated: check again. */
+	/* OK, the descriptor could have been updated: check again. */
 	if (txq_pcpu->reserved_num < num)
 		return -ENOMEM;
 	return 0;
@@ -6032,7 +6029,7 @@ static int mvpp2_txq_init(struct mvpp2_port *port,
 	/* Calculate base address in prefetch buffer. We reserve 16 descriptors
 	 * for each existing TXQ.
 	 * TCONTS for PON port must be continuous from 0 to MVPP2_MAX_TCONT
-	 * GBE ports assumed to be continious from 0 to MVPP2_MAX_PORTS
+	 * GBE ports assumed to be continuous from 0 to MVPP2_MAX_PORTS
 	 */
 	desc_per_txq = 16;
 	desc = (port->id * MVPP2_MAX_TXQ * desc_per_txq) +
@@ -6602,8 +6599,7 @@ static int mvpp2_tx_frag_process(struct mvpp2_port *port, struct sk_buff *skb,
 		mvpp2_txdesc_size_set(port, tx_desc, frag->size);
 
 		buf_dma_addr = dma_map_single(port->dev->dev.parent, addr,
-					       frag->size,
-					       DMA_TO_DEVICE);
+					      frag->size, DMA_TO_DEVICE);
 		if (dma_mapping_error(port->dev->dev.parent, buf_dma_addr)) {
 			mvpp2_txq_desc_put(txq);
 			goto cleanup;
-- 
cgit v1.2.3-59-g8ed1b


From 80a95a80d358ded600a517ea0a93fe86b8f8da84 Mon Sep 17 00:00:00 2001
From: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Date: Fri, 18 May 2018 19:12:53 +0530
Subject: cxgb4: collect SGE PF/VF queue map

For T6, collect info on queue mapping to corresponding PF/VF in SGE.

Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h | 17 ++++++++
 drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c    | 47 ++++++++++++++++++++++-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c  |  3 +-
 3 files changed, 65 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h b/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h
index 740a18ba4229..c333e25620a7 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h
@@ -62,6 +62,18 @@ struct cudbg_hw_sched {
 	u32 map;
 };
 
+#define SGE_QBASE_DATA_REG_NUM 4
+
+struct sge_qbase_reg_field {
+	u32 reg_addr;
+	u32 reg_data[SGE_QBASE_DATA_REG_NUM];
+	/* Max supported PFs */
+	u32 pf_data_value[PCIE_FW_MASTER_M + 1][SGE_QBASE_DATA_REG_NUM];
+	/* Max supported VFs */
+	u32 vf_data_value[T6_VF_M + 1][SGE_QBASE_DATA_REG_NUM];
+	u32 vfcount; /* Actual number of max vfs in current configuration */
+};
+
 struct ireg_field {
 	u32 ireg_addr;
 	u32 ireg_data;
@@ -357,6 +369,11 @@ static const u32 t5_sge_dbg_index_array[2][IREG_NUM_ELEM] = {
 	{0x10cc, 0x10d4, 0x0, 16},
 };
 
+static const u32 t6_sge_qbase_index_array[] = {
+	/* 1 addr reg SGE_QBASE_INDEX and 4 data reg SGE_QBASE_MAP[0-3] */
+	0x1250, 0x1240, 0x1244, 0x1248, 0x124c,
+};
+
 static const u32 t5_pcie_pdbg_array[][IREG_NUM_ELEM] = {
 	{0x5a04, 0x5a0c, 0x00, 0x20}, /* t5_pcie_pdbg_regs_00_to_20 */
 	{0x5a04, 0x5a0c, 0x21, 0x20}, /* t5_pcie_pdbg_regs_21_to_40 */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c
index 4feb7eca0acf..0afcfe99bff3 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c
@@ -1339,16 +1339,39 @@ int cudbg_collect_tp_indirect(struct cudbg_init *pdbg_init,
 	return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff);
 }
 
+static void cudbg_read_sge_qbase_indirect_reg(struct adapter *padap,
+					      struct sge_qbase_reg_field *qbase,
+					      u32 func, bool is_pf)
+{
+	u32 *buff, i;
+
+	if (is_pf) {
+		buff = qbase->pf_data_value[func];
+	} else {
+		buff = qbase->vf_data_value[func];
+		/* In SGE_QBASE_INDEX,
+		 * Entries 0->7 are PF0->7, Entries 8->263 are VFID0->256.
+		 */
+		func += 8;
+	}
+
+	t4_write_reg(padap, qbase->reg_addr, func);
+	for (i = 0; i < SGE_QBASE_DATA_REG_NUM; i++, buff++)
+		*buff = t4_read_reg(padap, qbase->reg_data[i]);
+}
+
 int cudbg_collect_sge_indirect(struct cudbg_init *pdbg_init,
 			       struct cudbg_buffer *dbg_buff,
 			       struct cudbg_error *cudbg_err)
 {
 	struct adapter *padap = pdbg_init->adap;
 	struct cudbg_buffer temp_buff = { 0 };
+	struct sge_qbase_reg_field *sge_qbase;
 	struct ireg_buf *ch_sge_dbg;
 	int i, rc;
 
-	rc = cudbg_get_buff(pdbg_init, dbg_buff, sizeof(*ch_sge_dbg) * 2,
+	rc = cudbg_get_buff(pdbg_init, dbg_buff,
+			    sizeof(*ch_sge_dbg) * 2 + sizeof(*sge_qbase),
 			    &temp_buff);
 	if (rc)
 		return rc;
@@ -1370,6 +1393,28 @@ int cudbg_collect_sge_indirect(struct cudbg_init *pdbg_init,
 				 sge_pio->ireg_local_offset);
 		ch_sge_dbg++;
 	}
+
+	if (CHELSIO_CHIP_VERSION(padap->params.chip) > CHELSIO_T5) {
+		sge_qbase = (struct sge_qbase_reg_field *)ch_sge_dbg;
+		/* 1 addr reg SGE_QBASE_INDEX and 4 data reg
+		 * SGE_QBASE_MAP[0-3]
+		 */
+		sge_qbase->reg_addr = t6_sge_qbase_index_array[0];
+		for (i = 0; i < SGE_QBASE_DATA_REG_NUM; i++)
+			sge_qbase->reg_data[i] =
+				t6_sge_qbase_index_array[i + 1];
+
+		for (i = 0; i <= PCIE_FW_MASTER_M; i++)
+			cudbg_read_sge_qbase_indirect_reg(padap, sge_qbase,
+							  i, true);
+
+		for (i = 0; i < padap->params.arch.vfcount; i++)
+			cudbg_read_sge_qbase_indirect_reg(padap, sge_qbase,
+							  i, false);
+
+		sge_qbase->vfcount = padap->params.arch.vfcount;
+	}
+
 	return cudbg_write_and_release_buff(pdbg_init, &temp_buff, dbg_buff);
 }
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c
index 085691eb2b95..8d751efcb90e 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c
@@ -214,7 +214,8 @@ static u32 cxgb4_get_entity_length(struct adapter *adap, u32 entity)
 		len = sizeof(struct ireg_buf) * n;
 		break;
 	case CUDBG_SGE_INDIRECT:
-		len = sizeof(struct ireg_buf) * 2;
+		len = sizeof(struct ireg_buf) * 2 +
+		      sizeof(struct sge_qbase_reg_field);
 		break;
 	case CUDBG_ULPRX_LA:
 		len = sizeof(struct cudbg_ulprx_la);
-- 
cgit v1.2.3-59-g8ed1b


From eb38401c779d350e9e31396471ea072fa29aec9b Mon Sep 17 00:00:00 2001
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Fri, 18 May 2018 16:54:38 +0100
Subject: net: stmmac: Populate missing callbacks in HWIF initialization

Some HW specific setups, like sun8i, do not populate all the necessary
callbacks, which is what HWIF helpers were expecting.

Fix this by always trying to get the generic helpers and populate them
if they were not previously populated by HW specific setup.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Fixes: 5f0456b43140 ("net: stmmac: Implement logic to automatically
select HW Interface")
Reported-by: Corentin Labbe <clabbe.montjoie@gmail.com>
Tested-by: Corentin Labbe <clabbe.montjoie@gmail.com>
Cc: Corentin Labbe <clabbe.montjoie@gmail.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/hwif.c | 38 +++++++++++++++++-------------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.c b/drivers/net/ethernet/stmicro/stmmac/hwif.c
index 23a12649e247..14770fc8865e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.c
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.c
@@ -189,13 +189,16 @@ int stmmac_hwif_init(struct stmmac_priv *priv)
 	bool needs_gmac = priv->plat->has_gmac;
 	const struct stmmac_hwif_entry *entry;
 	struct mac_device_info *mac;
+	bool needs_setup = true;
 	int i, ret;
 	u32 id;
 
 	if (needs_gmac) {
 		id = stmmac_get_id(priv, GMAC_VERSION);
-	} else {
+	} else if (needs_gmac4) {
 		id = stmmac_get_id(priv, GMAC4_VERSION);
+	} else {
+		id = 0;
 	}
 
 	/* Save ID for later use */
@@ -209,13 +212,12 @@ int stmmac_hwif_init(struct stmmac_priv *priv)
 
 	/* Check for HW specific setup first */
 	if (priv->plat->setup) {
-		priv->hw = priv->plat->setup(priv);
-		if (!priv->hw)
-			return -ENOMEM;
-		return 0;
+		mac = priv->plat->setup(priv);
+		needs_setup = false;
+	} else {
+		mac = devm_kzalloc(priv->device, sizeof(*mac), GFP_KERNEL);
 	}
 
-	mac = devm_kzalloc(priv->device, sizeof(*mac), GFP_KERNEL);
 	if (!mac)
 		return -ENOMEM;
 
@@ -227,24 +229,28 @@ int stmmac_hwif_init(struct stmmac_priv *priv)
 			continue;
 		if (needs_gmac4 ^ entry->gmac4)
 			continue;
-		if (id < entry->min_id)
+		/* Use synopsys_id var because some setups can override this */
+		if (priv->synopsys_id < entry->min_id)
 			continue;
 
-		mac->desc = entry->desc;
-		mac->dma = entry->dma;
-		mac->mac = entry->mac;
-		mac->ptp = entry->hwtimestamp;
-		mac->mode = entry->mode;
-		mac->tc = entry->tc;
+		/* Only use generic HW helpers if needed */
+		mac->desc = mac->desc ? : entry->desc;
+		mac->dma = mac->dma ? : entry->dma;
+		mac->mac = mac->mac ? : entry->mac;
+		mac->ptp = mac->ptp ? : entry->hwtimestamp;
+		mac->mode = mac->mode ? : entry->mode;
+		mac->tc = mac->tc ? : entry->tc;
 
 		priv->hw = mac;
 		priv->ptpaddr = priv->ioaddr + entry->regs.ptp_off;
 		priv->mmcaddr = priv->ioaddr + entry->regs.mmc_off;
 
 		/* Entry found */
-		ret = entry->setup(priv);
-		if (ret)
-			return ret;
+		if (needs_setup) {
+			ret = entry->setup(priv);
+			if (ret)
+				return ret;
+		}
 
 		/* Run quirks, if needed */
 		if (entry->quirks) {
-- 
cgit v1.2.3-59-g8ed1b


From 991f5b3651f6bb1cb5034f422e43f489e65f2701 Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Fri, 18 May 2018 12:12:09 -0700
Subject: nfp: bpf: support logic indirect shifts (BPF_[L|R]SH | BPF_X)

For indirect shifts, shift amount is not specified as constant, NFP needs
to get the shift amount through the low 5 bits of source A operand in
PREV_ALU, therefore extra instructions are needed compared with shifts by
constants.

Because NFP is 32-bit, so we are using register pair for 64-bit shifts and
therefore would need different instruction sequences depending on whether
shift amount is less than 32 or not.

NFP branch-on-bit-test instruction emitter is added by this patch and is
used for efficient runtime check on shift amount. We'd think the shift
amount is less than 32 if bit 5 is clear and greater or equal than 32
otherwise. Shift amount is greater than or equal to 64 will result in
undefined behavior.

This patch also use range info to avoid generating unnecessary runtime code
if we are certain shift amount is less than 32 or not.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c      | 299 +++++++++++++++++++---
 drivers/net/ethernet/netronome/nfp/bpf/main.h     |  28 ++
 drivers/net/ethernet/netronome/nfp/bpf/offload.c  |   2 +
 drivers/net/ethernet/netronome/nfp/bpf/verifier.c |   8 +
 drivers/net/ethernet/netronome/nfp/nfp_asm.h      |  17 +-
 5 files changed, 322 insertions(+), 32 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index a4d3da215863..4cff08771951 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -211,6 +211,60 @@ emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, u16 addr, u8 defer)
 	emit_br_relo(nfp_prog, mask, addr, defer, RELO_BR_REL);
 }
 
+static void
+__emit_br_bit(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 addr, u8 defer,
+	      bool set, bool src_lmextn)
+{
+	u16 addr_lo, addr_hi;
+	u64 insn;
+
+	addr_lo = addr & (OP_BR_BIT_ADDR_LO >> __bf_shf(OP_BR_BIT_ADDR_LO));
+	addr_hi = addr != addr_lo;
+
+	insn = OP_BR_BIT_BASE |
+		FIELD_PREP(OP_BR_BIT_A_SRC, areg) |
+		FIELD_PREP(OP_BR_BIT_B_SRC, breg) |
+		FIELD_PREP(OP_BR_BIT_BV, set) |
+		FIELD_PREP(OP_BR_BIT_DEFBR, defer) |
+		FIELD_PREP(OP_BR_BIT_ADDR_LO, addr_lo) |
+		FIELD_PREP(OP_BR_BIT_ADDR_HI, addr_hi) |
+		FIELD_PREP(OP_BR_BIT_SRC_LMEXTN, src_lmextn);
+
+	nfp_prog_push(nfp_prog, insn);
+}
+
+static void
+emit_br_bit_relo(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr,
+		 u8 defer, bool set, enum nfp_relo_type relo)
+{
+	struct nfp_insn_re_regs reg;
+	int err;
+
+	/* NOTE: The bit to test is specified as an rotation amount, such that
+	 *	 the bit to test will be placed on the MSB of the result when
+	 *	 doing a rotate right. For bit X, we need right rotate X + 1.
+	 */
+	bit += 1;
+
+	err = swreg_to_restricted(reg_none(), src, reg_imm(bit), &reg, false);
+	if (err) {
+		nfp_prog->error = err;
+		return;
+	}
+
+	__emit_br_bit(nfp_prog, reg.areg, reg.breg, addr, defer, set,
+		      reg.src_lmextn);
+
+	nfp_prog->prog[nfp_prog->prog_len - 1] |=
+		FIELD_PREP(OP_RELO_TYPE, relo);
+}
+
+static void
+emit_br_bset(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr, u8 defer)
+{
+	emit_br_bit_relo(nfp_prog, src, bit, addr, defer, true, RELO_BR_REL);
+}
+
 static void
 __emit_immed(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
 	     enum immed_width width, bool invert,
@@ -309,6 +363,19 @@ emit_shf(struct nfp_prog *nfp_prog, swreg dst,
 		   reg.dst_lmextn, reg.src_lmextn);
 }
 
+static void
+emit_shf_indir(struct nfp_prog *nfp_prog, swreg dst,
+	       swreg lreg, enum shf_op op, swreg rreg, enum shf_sc sc)
+{
+	if (sc == SHF_SC_R_ROT) {
+		pr_err("indirect shift is not allowed on rotation\n");
+		nfp_prog->error = -EFAULT;
+		return;
+	}
+
+	emit_shf(nfp_prog, dst, lreg, op, rreg, sc, 0);
+}
+
 static void
 __emit_alu(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
 	   u16 areg, enum alu_op op, u16 breg, bool swap, bool wr_both,
@@ -1629,56 +1696,226 @@ static int neg_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	return 0;
 }
 
-static int shl_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
-{
-	const struct bpf_insn *insn = &meta->insn;
-	u8 dst = insn->dst_reg * 2;
-
-	if (insn->imm < 32) {
-		emit_shf(nfp_prog, reg_both(dst + 1),
-			 reg_a(dst + 1), SHF_OP_NONE, reg_b(dst),
-			 SHF_SC_R_DSHF, 32 - insn->imm);
-		emit_shf(nfp_prog, reg_both(dst),
-			 reg_none(), SHF_OP_NONE, reg_b(dst),
-			 SHF_SC_L_SHF, insn->imm);
-	} else if (insn->imm == 32) {
+/* Pseudo code:
+ *   if shift_amt >= 32
+ *     dst_high = dst_low << shift_amt[4:0]
+ *     dst_low = 0;
+ *   else
+ *     dst_high = (dst_high, dst_low) >> (32 - shift_amt)
+ *     dst_low = dst_low << shift_amt
+ *
+ * The indirect shift will use the same logic at runtime.
+ */
+static int __shl_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
+{
+	if (shift_amt < 32) {
+		emit_shf(nfp_prog, reg_both(dst + 1), reg_a(dst + 1),
+			 SHF_OP_NONE, reg_b(dst), SHF_SC_R_DSHF,
+			 32 - shift_amt);
+		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
+			 reg_b(dst), SHF_SC_L_SHF, shift_amt);
+	} else if (shift_amt == 32) {
 		wrp_reg_mov(nfp_prog, dst + 1, dst);
 		wrp_immed(nfp_prog, reg_both(dst), 0);
-	} else if (insn->imm > 32) {
-		emit_shf(nfp_prog, reg_both(dst + 1),
-			 reg_none(), SHF_OP_NONE, reg_b(dst),
-			 SHF_SC_L_SHF, insn->imm - 32);
+	} else if (shift_amt > 32) {
+		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
+			 reg_b(dst), SHF_SC_L_SHF, shift_amt - 32);
 		wrp_immed(nfp_prog, reg_both(dst), 0);
 	}
 
 	return 0;
 }
 
-static int shr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+static int shl_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	const struct bpf_insn *insn = &meta->insn;
 	u8 dst = insn->dst_reg * 2;
 
-	if (insn->imm < 32) {
-		emit_shf(nfp_prog, reg_both(dst),
-			 reg_a(dst + 1), SHF_OP_NONE, reg_b(dst),
-			 SHF_SC_R_DSHF, insn->imm);
-		emit_shf(nfp_prog, reg_both(dst + 1),
-			 reg_none(), SHF_OP_NONE, reg_b(dst + 1),
-			 SHF_SC_R_SHF, insn->imm);
-	} else if (insn->imm == 32) {
+	return __shl_imm64(nfp_prog, dst, insn->imm);
+}
+
+static void shl_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
+{
+	emit_alu(nfp_prog, imm_both(nfp_prog), reg_imm(32), ALU_OP_SUB,
+		 reg_b(src));
+	emit_alu(nfp_prog, reg_none(), imm_a(nfp_prog), ALU_OP_OR, reg_imm(0));
+	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_a(dst + 1), SHF_OP_NONE,
+		       reg_b(dst), SHF_SC_R_DSHF);
+}
+
+/* NOTE: for indirect left shift, HIGH part should be calculated first. */
+static void shl_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
+{
+	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
+	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
+		       reg_b(dst), SHF_SC_L_SHF);
+}
+
+static void shl_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
+{
+	shl_reg64_lt32_high(nfp_prog, dst, src);
+	shl_reg64_lt32_low(nfp_prog, dst, src);
+}
+
+static void shl_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
+{
+	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
+	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
+		       reg_b(dst), SHF_SC_L_SHF);
+	wrp_immed(nfp_prog, reg_both(dst), 0);
+}
+
+static int shl_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	const struct bpf_insn *insn = &meta->insn;
+	u64 umin, umax;
+	u8 dst, src;
+
+	dst = insn->dst_reg * 2;
+	umin = meta->umin;
+	umax = meta->umax;
+	if (umin == umax)
+		return __shl_imm64(nfp_prog, dst, umin);
+
+	src = insn->src_reg * 2;
+	if (umax < 32) {
+		shl_reg64_lt32(nfp_prog, dst, src);
+	} else if (umin >= 32) {
+		shl_reg64_ge32(nfp_prog, dst, src);
+	} else {
+		/* Generate different instruction sequences depending on runtime
+		 * value of shift amount.
+		 */
+		u16 label_ge32, label_end;
+
+		label_ge32 = nfp_prog_current_offset(nfp_prog) + 7;
+		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
+
+		shl_reg64_lt32_high(nfp_prog, dst, src);
+		label_end = nfp_prog_current_offset(nfp_prog) + 6;
+		emit_br(nfp_prog, BR_UNC, label_end, 2);
+		/* shl_reg64_lt32_low packed in delay slot. */
+		shl_reg64_lt32_low(nfp_prog, dst, src);
+
+		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
+			return -EINVAL;
+		shl_reg64_ge32(nfp_prog, dst, src);
+
+		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Pseudo code:
+ *   if shift_amt >= 32
+ *     dst_high = 0;
+ *     dst_low = dst_high >> shift_amt[4:0]
+ *   else
+ *     dst_high = dst_high >> shift_amt
+ *     dst_low = (dst_high, dst_low) >> shift_amt
+ *
+ * The indirect shift will use the same logic at runtime.
+ */
+static int __shr_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
+{
+	if (shift_amt < 32) {
+		emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
+			 reg_b(dst), SHF_SC_R_DSHF, shift_amt);
+		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
+			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt);
+	} else if (shift_amt == 32) {
 		wrp_reg_mov(nfp_prog, dst, dst + 1);
 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
-	} else if (insn->imm > 32) {
-		emit_shf(nfp_prog, reg_both(dst),
-			 reg_none(), SHF_OP_NONE, reg_b(dst + 1),
-			 SHF_SC_R_SHF, insn->imm - 32);
+	} else if (shift_amt > 32) {
+		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
+			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt - 32);
 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
 	}
 
 	return 0;
 }
 
+static int shr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	const struct bpf_insn *insn = &meta->insn;
+	u8 dst = insn->dst_reg * 2;
+
+	return __shr_imm64(nfp_prog, dst, insn->imm);
+}
+
+/* NOTE: for indirect right shift, LOW part should be calculated first. */
+static void shr_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
+{
+	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
+	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
+		       reg_b(dst + 1), SHF_SC_R_SHF);
+}
+
+static void shr_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
+{
+	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
+	emit_shf_indir(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
+		       reg_b(dst), SHF_SC_R_DSHF);
+}
+
+static void shr_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
+{
+	shr_reg64_lt32_low(nfp_prog, dst, src);
+	shr_reg64_lt32_high(nfp_prog, dst, src);
+}
+
+static void shr_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
+{
+	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
+	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
+		       reg_b(dst + 1), SHF_SC_R_SHF);
+	wrp_immed(nfp_prog, reg_both(dst + 1), 0);
+}
+
+static int shr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	const struct bpf_insn *insn = &meta->insn;
+	u64 umin, umax;
+	u8 dst, src;
+
+	dst = insn->dst_reg * 2;
+	umin = meta->umin;
+	umax = meta->umax;
+	if (umin == umax)
+		return __shr_imm64(nfp_prog, dst, umin);
+
+	src = insn->src_reg * 2;
+	if (umax < 32) {
+		shr_reg64_lt32(nfp_prog, dst, src);
+	} else if (umin >= 32) {
+		shr_reg64_ge32(nfp_prog, dst, src);
+	} else {
+		/* Generate different instruction sequences depending on runtime
+		 * value of shift amount.
+		 */
+		u16 label_ge32, label_end;
+
+		label_ge32 = nfp_prog_current_offset(nfp_prog) + 6;
+		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
+		shr_reg64_lt32_low(nfp_prog, dst, src);
+		label_end = nfp_prog_current_offset(nfp_prog) + 6;
+		emit_br(nfp_prog, BR_UNC, label_end, 2);
+		/* shr_reg64_lt32_high packed in delay slot. */
+		shr_reg64_lt32_high(nfp_prog, dst, src);
+
+		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
+			return -EINVAL;
+		shr_reg64_ge32(nfp_prog, dst, src);
+
+		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int mov_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	const struct bpf_insn *insn = &meta->insn;
@@ -2501,7 +2738,9 @@ static const instr_cb_t instr_cb[256] = {
 	[BPF_ALU64 | BPF_SUB | BPF_X] =	sub_reg64,
 	[BPF_ALU64 | BPF_SUB | BPF_K] =	sub_imm64,
 	[BPF_ALU64 | BPF_NEG] =		neg_reg64,
+	[BPF_ALU64 | BPF_LSH | BPF_X] =	shl_reg64,
 	[BPF_ALU64 | BPF_LSH | BPF_K] =	shl_imm64,
+	[BPF_ALU64 | BPF_RSH | BPF_X] =	shr_reg64,
 	[BPF_ALU64 | BPF_RSH | BPF_K] =	shr_imm64,
 	[BPF_ALU | BPF_MOV | BPF_X] =	mov_reg,
 	[BPF_ALU | BPF_MOV | BPF_K] =	mov_imm,
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 8b143546ae85..654fe7823e5e 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -263,6 +263,8 @@ struct nfp_bpf_reg_state {
  * @func_id: function id for call instructions
  * @arg1: arg1 for call instructions
  * @arg2: arg2 for call instructions
+ * @umin: copy of core verifier umin_value.
+ * @umax: copy of core verifier umax_value.
  * @off: index of first generated machine instruction (in nfp_prog.prog)
  * @n: eBPF instruction number
  * @flags: eBPF instruction extra optimization flags
@@ -298,6 +300,13 @@ struct nfp_insn_meta {
 			struct bpf_reg_state arg1;
 			struct nfp_bpf_reg_state arg2;
 		};
+		/* We are interested in range info for some operands,
+		 * for example, the shift amount.
+		 */
+		struct {
+			u64 umin;
+			u64 umax;
+		};
 	};
 	unsigned int off;
 	unsigned short n;
@@ -375,6 +384,25 @@ static inline bool is_mbpf_xadd(const struct nfp_insn_meta *meta)
 	return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_STX | BPF_XADD);
 }
 
+static inline bool is_mbpf_indir_shift(const struct nfp_insn_meta *meta)
+{
+	u8 code = meta->insn.code;
+	bool is_alu, is_shift;
+	u8 opclass, opcode;
+
+	opclass = BPF_CLASS(code);
+	is_alu = opclass == BPF_ALU64 || opclass == BPF_ALU;
+	if (!is_alu)
+		return false;
+
+	opcode = BPF_OP(code);
+	is_shift = opcode == BPF_LSH || opcode == BPF_RSH || opcode == BPF_ARSH;
+	if (!is_shift)
+		return false;
+
+	return BPF_SRC(code) == BPF_X;
+}
+
 /**
  * struct nfp_prog - nfp BPF program
  * @bpf: backpointer to the bpf app priv structure
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index 4db0ac1e42a8..7eae4c0266f8 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -190,6 +190,8 @@ nfp_prog_prepare(struct nfp_prog *nfp_prog, const struct bpf_insn *prog,
 
 		meta->insn = prog[i];
 		meta->n = i;
+		if (is_mbpf_indir_shift(meta))
+			meta->umin = U64_MAX;
 
 		list_add_tail(&meta->l, &nfp_prog->insns);
 	}
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index 844a9be6e55a..4bfeba7b21b2 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -551,6 +551,14 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx)
 	if (is_mbpf_xadd(meta))
 		return nfp_bpf_check_xadd(nfp_prog, meta, env);
 
+	if (is_mbpf_indir_shift(meta)) {
+		const struct bpf_reg_state *sreg =
+			cur_regs(env) + meta->insn.src_reg;
+
+		meta->umin = min(meta->umin, sreg->umin_value);
+		meta->umax = max(meta->umax, sreg->umax_value);
+	}
+
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.h b/drivers/net/ethernet/netronome/nfp/nfp_asm.h
index faa4e131c136..fa826bd9c668 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_asm.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.h
@@ -72,8 +72,21 @@
 #define OP_BR_ADDR_LO		0x007ffc00000ULL
 #define OP_BR_ADDR_HI		0x10000000000ULL
 
-#define nfp_is_br(_insn)				\
-	(((_insn) & OP_BR_BASE_MASK) == OP_BR_BASE)
+#define OP_BR_BIT_BASE		0x0d000000000ULL
+#define OP_BR_BIT_BASE_MASK	0x0f800080300ULL
+#define OP_BR_BIT_A_SRC		0x000000000ffULL
+#define OP_BR_BIT_B_SRC		0x0000003fc00ULL
+#define OP_BR_BIT_BV		0x00000040000ULL
+#define OP_BR_BIT_SRC_LMEXTN	0x40000000000ULL
+#define OP_BR_BIT_DEFBR		OP_BR_DEFBR
+#define OP_BR_BIT_ADDR_LO	OP_BR_ADDR_LO
+#define OP_BR_BIT_ADDR_HI	OP_BR_ADDR_HI
+
+static inline bool nfp_is_br(u64 insn)
+{
+	return (insn & OP_BR_BASE_MASK) == OP_BR_BASE ||
+	       (insn & OP_BR_BIT_BASE_MASK) == OP_BR_BIT_BASE;
+}
 
 enum br_mask {
 	BR_BEQ = 0x00,
-- 
cgit v1.2.3-59-g8ed1b


From f43d0f17fe9a4bb770ab38b36e2b5150d8c3d6cf Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Fri, 18 May 2018 12:12:10 -0700
Subject: nfp: bpf: support arithmetic right shift by constant (BPF_ARSH |
 BPF_K)

Code logic is similar with logic right shift except we also need to set
PREV_ALU result properly, the MSB of which is the bit that will be
replicated to fill in all the vacant positions.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 34 ++++++++++++++++++++++++++++
 drivers/net/ethernet/netronome/nfp/nfp_asm.h |  1 +
 2 files changed, 35 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 4cff08771951..f73242c4da2f 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -1916,6 +1916,39 @@ static int shr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	return 0;
 }
 
+/* Code logic is the same as __shr_imm64 except ashr requires signedness bit
+ * told through PREV_ALU result.
+ */
+static int ashr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	const struct bpf_insn *insn = &meta->insn;
+	u8 dst = insn->dst_reg * 2;
+
+	if (insn->imm < 32) {
+		emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
+			 reg_b(dst), SHF_SC_R_DSHF, insn->imm);
+		/* Set signedness bit. */
+		emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
+			 reg_imm(0));
+		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
+			 reg_b(dst + 1), SHF_SC_R_SHF, insn->imm);
+	} else if (insn->imm == 32) {
+		/* NOTE: this also helps setting signedness bit. */
+		wrp_reg_mov(nfp_prog, dst, dst + 1);
+		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
+			 reg_b(dst + 1), SHF_SC_R_SHF, 31);
+	} else if (insn->imm > 32) {
+		emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
+			 reg_imm(0));
+		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
+			 reg_b(dst + 1), SHF_SC_R_SHF, insn->imm - 32);
+		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
+			 reg_b(dst + 1), SHF_SC_R_SHF, 31);
+	}
+
+	return 0;
+}
+
 static int mov_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	const struct bpf_insn *insn = &meta->insn;
@@ -2742,6 +2775,7 @@ static const instr_cb_t instr_cb[256] = {
 	[BPF_ALU64 | BPF_LSH | BPF_K] =	shl_imm64,
 	[BPF_ALU64 | BPF_RSH | BPF_X] =	shr_reg64,
 	[BPF_ALU64 | BPF_RSH | BPF_K] =	shr_imm64,
+	[BPF_ALU64 | BPF_ARSH | BPF_K] = ashr_imm64,
 	[BPF_ALU | BPF_MOV | BPF_X] =	mov_reg,
 	[BPF_ALU | BPF_MOV | BPF_K] =	mov_imm,
 	[BPF_ALU | BPF_XOR | BPF_X] =	xor_reg,
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.h b/drivers/net/ethernet/netronome/nfp/nfp_asm.h
index fa826bd9c668..f6677bc9875a 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_asm.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.h
@@ -174,6 +174,7 @@ enum shf_op {
 	SHF_OP_NONE = 0,
 	SHF_OP_AND = 2,
 	SHF_OP_OR = 5,
+	SHF_OP_ASHR = 6,
 };
 
 enum shf_sc {
-- 
cgit v1.2.3-59-g8ed1b


From c217abccaaa5e4eeba4aee26f29cdb57c026afc1 Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@netronome.com>
Date: Fri, 18 May 2018 12:12:11 -0700
Subject: nfp: bpf: support arithmetic indirect right shift (BPF_ARSH | BPF_X)

Code logic is similar with arithmetic right shift by constant, and NFP
get indirect shift amount through source A operand of PREV_ALU.

It is possible to fall back to logic right shift if the MSB is known to be
zero from range info, however there is no benefit to do this given logic
indirect right shift use the same number and cycle of instruction sequence.

Suppose the MSB of regX is the bit we want to replicate to fill in all the
vacant positions, and regY contains the shift amount, then we could use
single instruction to set up both.

  [alu, --, regY, OR, regX]

  --
  NOTE: the PREV_ALU result doesn't need to write to any destination
        register.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 99 +++++++++++++++++++++++++---
 1 file changed, 89 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index f73242c4da2f..8a92088df0d7 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -1919,29 +1919,26 @@ static int shr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 /* Code logic is the same as __shr_imm64 except ashr requires signedness bit
  * told through PREV_ALU result.
  */
-static int ashr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+static int __ashr_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
 {
-	const struct bpf_insn *insn = &meta->insn;
-	u8 dst = insn->dst_reg * 2;
-
-	if (insn->imm < 32) {
+	if (shift_amt < 32) {
 		emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
-			 reg_b(dst), SHF_SC_R_DSHF, insn->imm);
+			 reg_b(dst), SHF_SC_R_DSHF, shift_amt);
 		/* Set signedness bit. */
 		emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
 			 reg_imm(0));
 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
-			 reg_b(dst + 1), SHF_SC_R_SHF, insn->imm);
-	} else if (insn->imm == 32) {
+			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt);
+	} else if (shift_amt == 32) {
 		/* NOTE: this also helps setting signedness bit. */
 		wrp_reg_mov(nfp_prog, dst, dst + 1);
 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
 			 reg_b(dst + 1), SHF_SC_R_SHF, 31);
-	} else if (insn->imm > 32) {
+	} else if (shift_amt > 32) {
 		emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
 			 reg_imm(0));
 		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
-			 reg_b(dst + 1), SHF_SC_R_SHF, insn->imm - 32);
+			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt - 32);
 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
 			 reg_b(dst + 1), SHF_SC_R_SHF, 31);
 	}
@@ -1949,6 +1946,87 @@ static int ashr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	return 0;
 }
 
+static int ashr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	const struct bpf_insn *insn = &meta->insn;
+	u8 dst = insn->dst_reg * 2;
+
+	return __ashr_imm64(nfp_prog, dst, insn->imm);
+}
+
+static void ashr_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
+{
+	/* NOTE: the first insn will set both indirect shift amount (source A)
+	 * and signedness bit (MSB of result).
+	 */
+	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1));
+	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
+		       reg_b(dst + 1), SHF_SC_R_SHF);
+}
+
+static void ashr_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
+{
+	/* NOTE: it is the same as logic shift because we don't need to shift in
+	 * signedness bit when the shift amount is less than 32.
+	 */
+	return shr_reg64_lt32_low(nfp_prog, dst, src);
+}
+
+static void ashr_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
+{
+	ashr_reg64_lt32_low(nfp_prog, dst, src);
+	ashr_reg64_lt32_high(nfp_prog, dst, src);
+}
+
+static void ashr_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
+{
+	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1));
+	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
+		       reg_b(dst + 1), SHF_SC_R_SHF);
+	emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
+		 reg_b(dst + 1), SHF_SC_R_SHF, 31);
+}
+
+/* Like ashr_imm64, but need to use indirect shift. */
+static int ashr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	const struct bpf_insn *insn = &meta->insn;
+	u64 umin, umax;
+	u8 dst, src;
+
+	dst = insn->dst_reg * 2;
+	umin = meta->umin;
+	umax = meta->umax;
+	if (umin == umax)
+		return __ashr_imm64(nfp_prog, dst, umin);
+
+	src = insn->src_reg * 2;
+	if (umax < 32) {
+		ashr_reg64_lt32(nfp_prog, dst, src);
+	} else if (umin >= 32) {
+		ashr_reg64_ge32(nfp_prog, dst, src);
+	} else {
+		u16 label_ge32, label_end;
+
+		label_ge32 = nfp_prog_current_offset(nfp_prog) + 6;
+		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
+		ashr_reg64_lt32_low(nfp_prog, dst, src);
+		label_end = nfp_prog_current_offset(nfp_prog) + 6;
+		emit_br(nfp_prog, BR_UNC, label_end, 2);
+		/* ashr_reg64_lt32_high packed in delay slot. */
+		ashr_reg64_lt32_high(nfp_prog, dst, src);
+
+		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
+			return -EINVAL;
+		ashr_reg64_ge32(nfp_prog, dst, src);
+
+		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int mov_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	const struct bpf_insn *insn = &meta->insn;
@@ -2775,6 +2853,7 @@ static const instr_cb_t instr_cb[256] = {
 	[BPF_ALU64 | BPF_LSH | BPF_K] =	shl_imm64,
 	[BPF_ALU64 | BPF_RSH | BPF_X] =	shr_reg64,
 	[BPF_ALU64 | BPF_RSH | BPF_K] =	shr_imm64,
+	[BPF_ALU64 | BPF_ARSH | BPF_X] = ashr_reg64,
 	[BPF_ALU64 | BPF_ARSH | BPF_K] = ashr_imm64,
 	[BPF_ALU | BPF_MOV | BPF_X] =	mov_reg,
 	[BPF_ALU | BPF_MOV | BPF_K] =	mov_imm,
-- 
cgit v1.2.3-59-g8ed1b


From 303def35f64e37bcd5401d202889f5fbc0241179 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 17 May 2018 14:16:58 -0700
Subject: bpf: allow sk_msg programs to read sock fields

Currently sk_msg programs only have access to the raw data. However,
it is often useful when building policies to have the policies specific
to the socket endpoint. This allows using the socket tuple as input
into filters, etc.

This patch adds ctx access to the sock fields.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/filter.h   |   1 +
 include/uapi/linux/bpf.h |   8 ++++
 kernel/bpf/sockmap.c     |   1 +
 net/core/filter.c        | 114 +++++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 121 insertions(+), 3 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 9dbcb9d55921..d358d1815c16 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -517,6 +517,7 @@ struct sk_msg_buff {
 	bool sg_copy[MAX_SKB_FRAGS];
 	__u32 flags;
 	struct sock *sk_redir;
+	struct sock *sk;
 	struct sk_buff *skb;
 	struct list_head list;
 };
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d94d333a8225..97446bbe2ca5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2176,6 +2176,14 @@ enum sk_action {
 struct sk_msg_md {
 	void *data;
 	void *data_end;
+
+	__u32 family;
+	__u32 remote_ip4;	/* Stored in network byte order */
+	__u32 local_ip4;	/* Stored in network byte order */
+	__u32 remote_ip6[4];	/* Stored in network byte order */
+	__u32 local_ip6[4];	/* Stored in network byte order */
+	__u32 remote_port;	/* Stored in network byte order */
+	__u32 local_port;	/* stored in host byte order */
 };
 
 #define BPF_TAG_SIZE	8
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index c682669570dc..3b28955a6383 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -523,6 +523,7 @@ static unsigned int smap_do_tx_msg(struct sock *sk,
 	}
 
 	bpf_compute_data_pointers_sg(md);
+	md->sk = sk;
 	rc = (*prog->bpf_func)(md, prog->insnsi);
 	psock->apply_bytes = md->apply_bytes;
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 6d0d1560bd70..aec5ebafb262 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5148,18 +5148,23 @@ static bool sk_msg_is_valid_access(int off, int size,
 	switch (off) {
 	case offsetof(struct sk_msg_md, data):
 		info->reg_type = PTR_TO_PACKET;
+		if (size != sizeof(__u64))
+			return false;
 		break;
 	case offsetof(struct sk_msg_md, data_end):
 		info->reg_type = PTR_TO_PACKET_END;
+		if (size != sizeof(__u64))
+			return false;
 		break;
+	default:
+		if (size != sizeof(__u32))
+			return false;
 	}
 
 	if (off < 0 || off >= sizeof(struct sk_msg_md))
 		return false;
 	if (off % size != 0)
 		return false;
-	if (size != sizeof(__u64))
-		return false;
 
 	return true;
 }
@@ -5835,7 +5840,8 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 		break;
 
 	case offsetof(struct bpf_sock_ops, local_ip4):
-		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_rcv_saddr) != 4);
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+					  skc_rcv_saddr) != 4);
 
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
 					      struct bpf_sock_ops_kern, sk),
@@ -6152,6 +6158,7 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
 				     struct bpf_prog *prog, u32 *target_size)
 {
 	struct bpf_insn *insn = insn_buf;
+	int off;
 
 	switch (si->off) {
 	case offsetof(struct sk_msg_md, data):
@@ -6164,6 +6171,107 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
 				      si->dst_reg, si->src_reg,
 				      offsetof(struct sk_msg_buff, data_end));
 		break;
+	case offsetof(struct sk_msg_md, family):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+					      struct sk_msg_buff, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_msg_buff, sk));
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common, skc_family));
+		break;
+
+	case offsetof(struct sk_msg_md, remote_ip4):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct sk_msg_buff, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_msg_buff, sk));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common, skc_daddr));
+		break;
+
+	case offsetof(struct sk_msg_md, local_ip4):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+					  skc_rcv_saddr) != 4);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+					      struct sk_msg_buff, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_msg_buff, sk));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common,
+					       skc_rcv_saddr));
+		break;
+
+	case offsetof(struct sk_msg_md, remote_ip6[0]) ...
+	     offsetof(struct sk_msg_md, remote_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+					  skc_v6_daddr.s6_addr32[0]) != 4);
+
+		off = si->off;
+		off -= offsetof(struct sk_msg_md, remote_ip6[0]);
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct sk_msg_buff, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_msg_buff, sk));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common,
+					       skc_v6_daddr.s6_addr32[0]) +
+				      off);
+#else
+		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+		break;
+
+	case offsetof(struct sk_msg_md, local_ip6[0]) ...
+	     offsetof(struct sk_msg_md, local_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+					  skc_v6_rcv_saddr.s6_addr32[0]) != 4);
+
+		off = si->off;
+		off -= offsetof(struct sk_msg_md, local_ip6[0]);
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct sk_msg_buff, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_msg_buff, sk));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common,
+					       skc_v6_rcv_saddr.s6_addr32[0]) +
+				      off);
+#else
+		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+		break;
+
+	case offsetof(struct sk_msg_md, remote_port):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct sk_msg_buff, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_msg_buff, sk));
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common, skc_dport));
+#ifndef __BIG_ENDIAN_BITFIELD
+		*insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
+#endif
+		break;
+
+	case offsetof(struct sk_msg_md, local_port):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct sk_msg_buff, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_msg_buff, sk));
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common, skc_num));
+		break;
 	}
 
 	return insn - insn_buf;
-- 
cgit v1.2.3-59-g8ed1b


From 4da0dcabe4bc633ffc13bb4a669c34dd876e59d1 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 17 May 2018 14:17:03 -0700
Subject: bpf: add sk_msg prog sk access tests to test_verifier

Add tests for BPF_PROG_TYPE_SK_MSG to test_verifier for read access
to new sk fields.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/include/uapi/linux/bpf.h              |   8 ++
 tools/testing/selftests/bpf/test_verifier.c | 115 ++++++++++++++++++++++++++++
 2 files changed, 123 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d94d333a8225..97446bbe2ca5 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2176,6 +2176,14 @@ enum sk_action {
 struct sk_msg_md {
 	void *data;
 	void *data_end;
+
+	__u32 family;
+	__u32 remote_ip4;	/* Stored in network byte order */
+	__u32 local_ip4;	/* Stored in network byte order */
+	__u32 remote_ip6[4];	/* Stored in network byte order */
+	__u32 local_ip6[4];	/* Stored in network byte order */
+	__u32 remote_port;	/* Stored in network byte order */
+	__u32 local_port;	/* stored in host byte order */
 };
 
 #define BPF_TAG_SIZE	8
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index a877af00605d..6ec4d9def877 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -1685,6 +1685,121 @@ static struct bpf_test tests[] = {
 		.result = ACCEPT,
 		.prog_type = BPF_PROG_TYPE_SK_SKB,
 	},
+	{
+		"valid access family in SK_MSG",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct sk_msg_md, family)),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_SK_MSG,
+	},
+	{
+		"valid access remote_ip4 in SK_MSG",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct sk_msg_md, remote_ip4)),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_SK_MSG,
+	},
+	{
+		"valid access local_ip4 in SK_MSG",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct sk_msg_md, local_ip4)),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_SK_MSG,
+	},
+	{
+		"valid access remote_port in SK_MSG",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct sk_msg_md, remote_port)),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_SK_MSG,
+	},
+	{
+		"valid access local_port in SK_MSG",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct sk_msg_md, local_port)),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_SK_MSG,
+	},
+	{
+		"valid access remote_ip6 in SK_MSG",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct sk_msg_md, remote_ip6[0])),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct sk_msg_md, remote_ip6[1])),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct sk_msg_md, remote_ip6[2])),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct sk_msg_md, remote_ip6[3])),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_SK_SKB,
+	},
+	{
+		"valid access local_ip6 in SK_MSG",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct sk_msg_md, local_ip6[0])),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct sk_msg_md, local_ip6[1])),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct sk_msg_md, local_ip6[2])),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct sk_msg_md, local_ip6[3])),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_SK_SKB,
+	},
+	{
+		"invalid 64B read of family in SK_MSG",
+		.insns = {
+			BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct sk_msg_md, family)),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid bpf_context access",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_SK_MSG,
+	},
+	{
+		"invalid read past end of SK_MSG",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct sk_msg_md, local_port) + 4),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 !read_ok",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_SK_MSG,
+	},
+	{
+		"invalid read offset in SK_MSG",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct sk_msg_md, family) + 1),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid bpf_context access",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_SK_MSG,
+	},
 	{
 		"direct packet read for SK_MSG",
 		.insns = {
-- 
cgit v1.2.3-59-g8ed1b


From b9ffcbaf56d3040efee64d3818688083c29b2a44 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Fri, 18 May 2018 09:29:00 +0200
Subject: devlink: introduce devlink_port_attrs_set

Change existing setter for split port information into more generic
attrs setter. Alongside with that, allow to set port number and subport
number for split ports.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/core.c       |  7 ++--
 drivers/net/ethernet/mellanox/mlxsw/core.h       |  3 +-
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c   |  4 +--
 drivers/net/ethernet/mellanox/mlxsw/switchx2.c   |  2 +-
 drivers/net/ethernet/netronome/nfp/nfp_devlink.c |  4 +--
 include/net/devlink.h                            | 20 +++++++----
 include/uapi/linux/devlink.h                     |  3 ++
 net/core/devlink.c                               | 46 ++++++++++++++++++------
 8 files changed, 64 insertions(+), 25 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index e13ac3b8dff7..958769689ca2 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -1714,15 +1714,16 @@ EXPORT_SYMBOL(mlxsw_core_port_fini);
 
 void mlxsw_core_port_eth_set(struct mlxsw_core *mlxsw_core, u8 local_port,
 			     void *port_driver_priv, struct net_device *dev,
-			     bool split, u32 split_group)
+			     u32 port_number, bool split,
+			     u32 split_port_subnumber)
 {
 	struct mlxsw_core_port *mlxsw_core_port =
 					&mlxsw_core->ports[local_port];
 	struct devlink_port *devlink_port = &mlxsw_core_port->devlink_port;
 
 	mlxsw_core_port->port_driver_priv = port_driver_priv;
-	if (split)
-		devlink_port_split_set(devlink_port, split_group);
+	devlink_port_attrs_set(devlink_port, port_number,
+			       split, split_port_subnumber);
 	devlink_port_type_eth_set(devlink_port, dev);
 }
 EXPORT_SYMBOL(mlxsw_core_port_eth_set);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
index 092d39399f3c..e65287c15afd 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -201,7 +201,8 @@ int mlxsw_core_port_init(struct mlxsw_core *mlxsw_core, u8 local_port);
 void mlxsw_core_port_fini(struct mlxsw_core *mlxsw_core, u8 local_port);
 void mlxsw_core_port_eth_set(struct mlxsw_core *mlxsw_core, u8 local_port,
 			     void *port_driver_priv, struct net_device *dev,
-			     bool split, u32 split_group);
+			     u32 port_number, bool split,
+			     u32 split_port_subnumber);
 void mlxsw_core_port_ib_set(struct mlxsw_core *mlxsw_core, u8 local_port,
 			    void *port_driver_priv);
 void mlxsw_core_port_clear(struct mlxsw_core *mlxsw_core, u8 local_port,
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 94132f6cec61..96f6d8af1fd8 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -2927,8 +2927,8 @@ static int mlxsw_sp_port_create(struct mlxsw_sp *mlxsw_sp, u8 local_port,
 	}
 
 	mlxsw_core_port_eth_set(mlxsw_sp->core, mlxsw_sp_port->local_port,
-				mlxsw_sp_port, dev, mlxsw_sp_port->split,
-				module);
+				mlxsw_sp_port, dev, module + 1,
+				mlxsw_sp_port->split, lane / width);
 	mlxsw_core_schedule_dw(&mlxsw_sp_port->periodic_hw_stats.update_dw, 0);
 	return 0;
 
diff --git a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
index a655c5850aa6..551e05067b1e 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
@@ -1149,7 +1149,7 @@ static int __mlxsw_sx_port_eth_create(struct mlxsw_sx *mlxsw_sx, u8 local_port,
 	}
 
 	mlxsw_core_port_eth_set(mlxsw_sx->core, mlxsw_sx_port->local_port,
-				mlxsw_sx_port, dev, false, 0);
+				mlxsw_sx_port, dev, module + 1, false, 0);
 	mlxsw_sx->ports[local_port] = mlxsw_sx_port;
 	return 0;
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
index eb0fc614673d..d7a768ff3112 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
@@ -175,8 +175,8 @@ int nfp_devlink_port_register(struct nfp_app *app, struct nfp_port *port)
 		return ret;
 
 	devlink_port_type_eth_set(&port->dl_port, port->netdev);
-	if (eth_port.is_split)
-		devlink_port_split_set(&port->dl_port, eth_port.label_port);
+	devlink_port_attrs_set(&port->dl_port, eth_port.label_port,
+			       eth_port.is_split, eth_port.label_subport);
 
 	devlink = priv_to_devlink(app->pf);
 
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 2e4f71e16e95..d6ca92266709 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -35,6 +35,13 @@ struct devlink {
 	char priv[0] __aligned(NETDEV_ALIGN);
 };
 
+struct devlink_port_attrs {
+	bool set;
+	u32 port_number; /* same value as "split group" */
+	bool split;
+	u32 split_subport_number;
+};
+
 struct devlink_port {
 	struct list_head list;
 	struct devlink *devlink;
@@ -43,8 +50,7 @@ struct devlink_port {
 	enum devlink_port_type type;
 	enum devlink_port_type desired_type;
 	void *type_dev;
-	bool split;
-	u32 split_group;
+	struct devlink_port_attrs attrs;
 };
 
 struct devlink_sb_pool_info {
@@ -367,8 +373,9 @@ void devlink_port_type_eth_set(struct devlink_port *devlink_port,
 void devlink_port_type_ib_set(struct devlink_port *devlink_port,
 			      struct ib_device *ibdev);
 void devlink_port_type_clear(struct devlink_port *devlink_port);
-void devlink_port_split_set(struct devlink_port *devlink_port,
-			    u32 split_group);
+void devlink_port_attrs_set(struct devlink_port *devlink_port,
+			    u32 port_number, bool split,
+			    u32 split_subport_number);
 int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 			u32 size, u16 ingress_pools_count,
 			u16 egress_pools_count, u16 ingress_tc_count,
@@ -466,8 +473,9 @@ static inline void devlink_port_type_clear(struct devlink_port *devlink_port)
 {
 }
 
-static inline void devlink_port_split_set(struct devlink_port *devlink_port,
-					  u32 split_group)
+static inline void devlink_port_attrs_set(struct devlink_port *devlink_port,
+					  u32 port_number, bool split,
+					  u32 split_subport_number)
 {
 }
 
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 1df65a4c2044..15b031a5ee7a 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -224,6 +224,9 @@ enum devlink_attr {
 	DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID,	/* u64 */
 	DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS,/* u64 */
 
+	DEVLINK_ATTR_PORT_NUMBER,		/* u32 */
+	DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER,	/* u32 */
+
 	/* add new attributes above here, update the policy in devlink.c */
 
 	__DEVLINK_ATTR_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index ad1317376798..8fde7d2df9b0 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -453,6 +453,25 @@ static void devlink_notify(struct devlink *devlink, enum devlink_command cmd)
 				msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
 }
 
+static int devlink_nl_port_attrs_put(struct sk_buff *msg,
+				     struct devlink_port *devlink_port)
+{
+	struct devlink_port_attrs *attrs = &devlink_port->attrs;
+
+	if (!attrs->set)
+		return 0;
+	if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->port_number))
+		return -EMSGSIZE;
+	if (!attrs->split)
+		return 0;
+	if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, attrs->port_number))
+		return -EMSGSIZE;
+	if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER,
+			attrs->split_subport_number))
+		return -EMSGSIZE;
+	return 0;
+}
+
 static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
 				struct devlink_port *devlink_port,
 				enum devlink_command cmd, u32 portid,
@@ -492,9 +511,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
 				   ibdev->name))
 			goto nla_put_failure;
 	}
-	if (devlink_port->split &&
-	    nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP,
-			devlink_port->split_group))
+	if (devlink_nl_port_attrs_put(msg, devlink_port))
 		goto nla_put_failure;
 
 	genlmsg_end(msg, hdr);
@@ -2971,19 +2988,28 @@ void devlink_port_type_clear(struct devlink_port *devlink_port)
 EXPORT_SYMBOL_GPL(devlink_port_type_clear);
 
 /**
- *	devlink_port_split_set - Set port is split
+ *	devlink_port_attrs_set - Set port attributes
  *
  *	@devlink_port: devlink port
- *	@split_group: split group - identifies group split port is part of
+ *	@port_number: number of the port that is facing user, for example
+ *	              the front panel port number
+ *	@split: indicates if this is split port
+ *	@split_subport_number: if the port is split, this is the number
+ *	                       of subport.
  */
-void devlink_port_split_set(struct devlink_port *devlink_port,
-			    u32 split_group)
+void devlink_port_attrs_set(struct devlink_port *devlink_port,
+			    u32 port_number, bool split,
+			    u32 split_subport_number)
 {
-	devlink_port->split = true;
-	devlink_port->split_group = split_group;
+	struct devlink_port_attrs *attrs = &devlink_port->attrs;
+
+	attrs->set = true;
+	attrs->port_number = port_number;
+	attrs->split = split;
+	attrs->split_subport_number = split_subport_number;
 	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
 }
-EXPORT_SYMBOL_GPL(devlink_port_split_set);
+EXPORT_SYMBOL_GPL(devlink_port_attrs_set);
 
 int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 			u32 size, u16 ingress_pools_count,
-- 
cgit v1.2.3-59-g8ed1b


From 5ec1380a21bb6cd2ba89e31c44dfcc150f9ef792 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Fri, 18 May 2018 09:29:01 +0200
Subject: devlink: extend attrs_set for setting port flavours

Devlink ports can have specific flavour according to the purpose of use.
This patch extend attrs_set so the driver can say which flavour port
has. Initial flavours are:
physical, cpu, dsa
User can query this to see right away what is the purpose of each port.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/core.c       |  4 ++--
 drivers/net/ethernet/netronome/nfp/nfp_devlink.c |  5 +++--
 include/net/devlink.h                            |  3 +++
 include/uapi/linux/devlink.h                     | 11 +++++++++++
 net/core/devlink.c                               |  5 +++++
 5 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index 958769689ca2..a720aa11bcc0 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -1722,8 +1722,8 @@ void mlxsw_core_port_eth_set(struct mlxsw_core *mlxsw_core, u8 local_port,
 	struct devlink_port *devlink_port = &mlxsw_core_port->devlink_port;
 
 	mlxsw_core_port->port_driver_priv = port_driver_priv;
-	devlink_port_attrs_set(devlink_port, port_number,
-			       split, split_port_subnumber);
+	devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PHYSICAL,
+			       port_number, split, split_port_subnumber);
 	devlink_port_type_eth_set(devlink_port, dev);
 }
 EXPORT_SYMBOL(mlxsw_core_port_eth_set);
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
index d7a768ff3112..b1e67cf4257a 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
@@ -175,8 +175,9 @@ int nfp_devlink_port_register(struct nfp_app *app, struct nfp_port *port)
 		return ret;
 
 	devlink_port_type_eth_set(&port->dl_port, port->netdev);
-	devlink_port_attrs_set(&port->dl_port, eth_port.label_port,
-			       eth_port.is_split, eth_port.label_subport);
+	devlink_port_attrs_set(&port->dl_port, DEVLINK_PORT_FLAVOUR_PHYSICAL,
+			       eth_port.label_port, eth_port.is_split,
+			       eth_port.label_subport);
 
 	devlink = priv_to_devlink(app->pf);
 
diff --git a/include/net/devlink.h b/include/net/devlink.h
index d6ca92266709..6eae15cf0f57 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -37,6 +37,7 @@ struct devlink {
 
 struct devlink_port_attrs {
 	bool set;
+	enum devlink_port_flavour flavour;
 	u32 port_number; /* same value as "split group" */
 	bool split;
 	u32 split_subport_number;
@@ -374,6 +375,7 @@ void devlink_port_type_ib_set(struct devlink_port *devlink_port,
 			      struct ib_device *ibdev);
 void devlink_port_type_clear(struct devlink_port *devlink_port);
 void devlink_port_attrs_set(struct devlink_port *devlink_port,
+			    enum devlink_port_flavour flavour,
 			    u32 port_number, bool split,
 			    u32 split_subport_number);
 int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
@@ -474,6 +476,7 @@ static inline void devlink_port_type_clear(struct devlink_port *devlink_port)
 }
 
 static inline void devlink_port_attrs_set(struct devlink_port *devlink_port,
+					  enum devlink_port_flavour flavour,
 					  u32 port_number, bool split,
 					  u32 split_subport_number)
 {
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 15b031a5ee7a..75cb5450c851 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -132,6 +132,16 @@ enum devlink_eswitch_encap_mode {
 	DEVLINK_ESWITCH_ENCAP_MODE_BASIC,
 };
 
+enum devlink_port_flavour {
+	DEVLINK_PORT_FLAVOUR_PHYSICAL, /* Any kind of a port physically
+					* facing the user.
+					*/
+	DEVLINK_PORT_FLAVOUR_CPU, /* CPU port */
+	DEVLINK_PORT_FLAVOUR_DSA, /* Distributed switch architecture
+				   * interconnect port.
+				   */
+};
+
 enum devlink_attr {
 	/* don't change the order or add anything between, this is ABI! */
 	DEVLINK_ATTR_UNSPEC,
@@ -224,6 +234,7 @@ enum devlink_attr {
 	DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID,	/* u64 */
 	DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS,/* u64 */
 
+	DEVLINK_ATTR_PORT_FLAVOUR,		/* u16 */
 	DEVLINK_ATTR_PORT_NUMBER,		/* u32 */
 	DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER,	/* u32 */
 
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 8fde7d2df9b0..af90d237cbc2 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -460,6 +460,8 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg,
 
 	if (!attrs->set)
 		return 0;
+	if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour))
+		return -EMSGSIZE;
 	if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->port_number))
 		return -EMSGSIZE;
 	if (!attrs->split)
@@ -2991,6 +2993,7 @@ EXPORT_SYMBOL_GPL(devlink_port_type_clear);
  *	devlink_port_attrs_set - Set port attributes
  *
  *	@devlink_port: devlink port
+ *	@flavour: flavour of the port
  *	@port_number: number of the port that is facing user, for example
  *	              the front panel port number
  *	@split: indicates if this is split port
@@ -2998,12 +3001,14 @@ EXPORT_SYMBOL_GPL(devlink_port_type_clear);
  *	                       of subport.
  */
 void devlink_port_attrs_set(struct devlink_port *devlink_port,
+			    enum devlink_port_flavour flavour,
 			    u32 port_number, bool split,
 			    u32 split_subport_number)
 {
 	struct devlink_port_attrs *attrs = &devlink_port->attrs;
 
 	attrs->set = true;
+	attrs->flavour = flavour;
 	attrs->port_number = port_number;
 	attrs->split = split;
 	attrs->split_subport_number = split_subport_number;
-- 
cgit v1.2.3-59-g8ed1b


From 08474c1a9df0cefcc3d197bd1d770695a34b9d60 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Fri, 18 May 2018 09:29:02 +0200
Subject: devlink: introduce a helper to generate physical port names

Each driver implements physical port name generation by itself. However
as devlink has all needed info, it can easily do the job for all its
users. So implement this helper in devlink.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h |  9 +++++++++
 net/core/devlink.c    | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+)

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 6eae15cf0f57..9686a1aa4ec9 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -378,6 +378,8 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port,
 			    enum devlink_port_flavour flavour,
 			    u32 port_number, bool split,
 			    u32 split_subport_number);
+int devlink_port_get_phys_port_name(struct devlink_port *devlink_port,
+				    char *name, size_t len);
 int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 			u32 size, u16 ingress_pools_count,
 			u16 egress_pools_count, u16 ingress_tc_count,
@@ -482,6 +484,13 @@ static inline void devlink_port_attrs_set(struct devlink_port *devlink_port,
 {
 }
 
+static inline int
+devlink_port_get_phys_port_name(struct devlink_port *devlink_port,
+				char *name, size_t len)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int devlink_sb_register(struct devlink *devlink,
 				      unsigned int sb_index, u32 size,
 				      u16 ingress_pools_count,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index af90d237cbc2..5c8a40e1a01e 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -3016,6 +3016,39 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port,
 }
 EXPORT_SYMBOL_GPL(devlink_port_attrs_set);
 
+int devlink_port_get_phys_port_name(struct devlink_port *devlink_port,
+				    char *name, size_t len)
+{
+	struct devlink_port_attrs *attrs = &devlink_port->attrs;
+	int n = 0;
+
+	if (!attrs->set)
+		return -EOPNOTSUPP;
+
+	switch (attrs->flavour) {
+	case DEVLINK_PORT_FLAVOUR_PHYSICAL:
+		if (!attrs->split)
+			n = snprintf(name, len, "p%u", attrs->port_number);
+		else
+			n = snprintf(name, len, "p%us%u", attrs->port_number,
+				     attrs->split_subport_number);
+		break;
+	case DEVLINK_PORT_FLAVOUR_CPU:
+	case DEVLINK_PORT_FLAVOUR_DSA:
+		/* As CPU and DSA ports do not have a netdevice associated
+		 * case should not ever happen.
+		 */
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
+	if (n >= len)
+		return -EINVAL;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_port_get_phys_port_name);
+
 int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 			u32 size, u16 ingress_pools_count,
 			u16 egress_pools_count, u16 ingress_tc_count,
-- 
cgit v1.2.3-59-g8ed1b


From da077392486b2f87c384bc12b27a16b29620c5e4 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Fri, 18 May 2018 09:29:03 +0200
Subject: dsa: set devlink port attrs for dsa ports

Set the attrs and allow to expose port flavour to user via devlink.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/dsa2.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index adf50fbc4c13..00126cda4319 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -270,7 +270,28 @@ static int dsa_port_setup(struct dsa_port *dp)
 	case DSA_PORT_TYPE_UNUSED:
 		break;
 	case DSA_PORT_TYPE_CPU:
+		/* dp->index is used now as port_number. However
+		 * CPU ports should have separate numbering
+		 * independent from front panel port numbers.
+		 */
+		devlink_port_attrs_set(&dp->devlink_port,
+				       DEVLINK_PORT_FLAVOUR_CPU,
+				       dp->index, false, 0);
+		err = dsa_port_link_register_of(dp);
+		if (err) {
+			dev_err(ds->dev, "failed to setup link for port %d.%d\n",
+				ds->index, dp->index);
+			return err;
+		}
+		break;
 	case DSA_PORT_TYPE_DSA:
+		/* dp->index is used now as port_number. However
+		 * DSA ports should have separate numbering
+		 * independent from front panel port numbers.
+		 */
+		devlink_port_attrs_set(&dp->devlink_port,
+				       DEVLINK_PORT_FLAVOUR_DSA,
+				       dp->index, false, 0);
 		err = dsa_port_link_register_of(dp);
 		if (err) {
 			dev_err(ds->dev, "failed to setup link for port %d.%d\n",
@@ -279,6 +300,9 @@ static int dsa_port_setup(struct dsa_port *dp)
 		}
 		break;
 	case DSA_PORT_TYPE_USER:
+		devlink_port_attrs_set(&dp->devlink_port,
+				       DEVLINK_PORT_FLAVOUR_PHYSICAL,
+				       dp->index, false, 0);
 		err = dsa_slave_create(dp);
 		if (err)
 			dev_err(ds->dev, "failed to create slave for port %d.%d\n",
-- 
cgit v1.2.3-59-g8ed1b


From ec932fbda7ba6ac10dba59fd20c736be9a6976e4 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Fri, 18 May 2018 09:29:04 +0200
Subject: mlxsw: use devlink helper to generate physical port name

Since devlink knows the info needed to generate the physical port name
in a generic way for all devlink users, use the helper to do the job.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/core.c     | 11 +++++++++++
 drivers/net/ethernet/mellanox/mlxsw/core.h     |  2 ++
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 17 +++--------------
 drivers/net/ethernet/mellanox/mlxsw/switchx2.c |  9 +++------
 4 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index a720aa11bcc0..a38faec45b30 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -1763,6 +1763,17 @@ enum devlink_port_type mlxsw_core_port_type_get(struct mlxsw_core *mlxsw_core,
 }
 EXPORT_SYMBOL(mlxsw_core_port_type_get);
 
+int mlxsw_core_port_get_phys_port_name(struct mlxsw_core *mlxsw_core,
+				       u8 local_port, char *name, size_t len)
+{
+	struct mlxsw_core_port *mlxsw_core_port =
+					&mlxsw_core->ports[local_port];
+	struct devlink_port *devlink_port = &mlxsw_core_port->devlink_port;
+
+	return devlink_port_get_phys_port_name(devlink_port, name, len);
+}
+EXPORT_SYMBOL(mlxsw_core_port_get_phys_port_name);
+
 static void mlxsw_core_buf_dump_dbg(struct mlxsw_core *mlxsw_core,
 				    const char *buf, size_t size)
 {
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
index e65287c15afd..4eac7fbd07d5 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -209,6 +209,8 @@ void mlxsw_core_port_clear(struct mlxsw_core *mlxsw_core, u8 local_port,
 			   void *port_driver_priv);
 enum devlink_port_type mlxsw_core_port_type_get(struct mlxsw_core *mlxsw_core,
 						u8 local_port);
+int mlxsw_core_port_get_phys_port_name(struct mlxsw_core *mlxsw_core,
+				       u8 local_port, char *name, size_t len);
 
 int mlxsw_core_schedule_dw(struct delayed_work *dwork, unsigned long delay);
 bool mlxsw_core_schedule_work(struct work_struct *work);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 96f6d8af1fd8..bb252b36994d 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -1238,21 +1238,10 @@ static int mlxsw_sp_port_get_phys_port_name(struct net_device *dev, char *name,
 					    size_t len)
 {
 	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
-	u8 module = mlxsw_sp_port->mapping.module;
-	u8 width = mlxsw_sp_port->mapping.width;
-	u8 lane = mlxsw_sp_port->mapping.lane;
-	int err;
-
-	if (!mlxsw_sp_port->split)
-		err = snprintf(name, len, "p%d", module + 1);
-	else
-		err = snprintf(name, len, "p%ds%d", module + 1,
-			       lane / width);
 
-	if (err >= len)
-		return -EINVAL;
-
-	return 0;
+	return mlxsw_core_port_get_phys_port_name(mlxsw_sp_port->mlxsw_sp->core,
+						  mlxsw_sp_port->local_port,
+						  name, len);
 }
 
 static struct mlxsw_sp_port_mall_tc_entry *
diff --git a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
index 551e05067b1e..3922c1cfe5f5 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
@@ -417,13 +417,10 @@ static int mlxsw_sx_port_get_phys_port_name(struct net_device *dev, char *name,
 					    size_t len)
 {
 	struct mlxsw_sx_port *mlxsw_sx_port = netdev_priv(dev);
-	int err;
-
-	err = snprintf(name, len, "p%d", mlxsw_sx_port->mapping.module + 1);
-	if (err >= len)
-		return -EINVAL;
 
-	return 0;
+	return mlxsw_core_port_get_phys_port_name(mlxsw_sx_port->mlxsw_sx->core,
+						  mlxsw_sx_port->local_port,
+						  name, len);
 }
 
 static const struct net_device_ops mlxsw_sx_port_netdev_ops = {
-- 
cgit v1.2.3-59-g8ed1b


From 62c8a069b510d905039abd4097434f190a316941 Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Fri, 18 May 2018 09:33:39 +0200
Subject: net: mvpp2: Add missing VLAN tag detection

Marvell PPv2 Header Parser sets some bits in the 'result_info' field in
each lookup iteration, to identify different packet attributes such as
DSA / VLAN tag, protocol infos, etc. This is used in further
classification stages in the controller.

It's the DSA tag detection entry that is in charge of detecting when there
is a single VLAN tag.

This commits adds the missing update of the result_info in this case.

Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mvpp2.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
index 8a071ac79aa4..6847cd431aa0 100644
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ b/drivers/net/ethernet/marvell/mvpp2.c
@@ -2128,6 +2128,9 @@ static void mvpp2_prs_dsa_tag_set(struct mvpp2 *priv, int port, bool add,
 				mvpp2_prs_sram_ai_update(&pe, 0,
 							MVPP2_PRS_SRAM_AI_MASK);
 
+			/* Set result info bits to 'single vlan' */
+			mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_VLAN_SINGLE,
+						 MVPP2_PRS_RI_VLAN_MASK);
 			/* If packet is tagged continue check vid filtering */
 			mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_VID);
 		} else {
-- 
cgit v1.2.3-59-g8ed1b


From 230c184679d5517e9770275853cd3456d00d6599 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Date: Fri, 18 May 2018 21:30:18 +0300
Subject: sh_eth: add RGMII support

The R-Car V3H (AKA R8A77980) GEther controller  adds support for the RGMII
PHY interface mode as a new  value  for the RMII_MII register.

Based on the original (and large) patch by Vladimir Barinov.

Signed-off-by: Vladimir Barinov <vladimir.barinov@cogentembedded.com>
Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/renesas/sh_eth.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index 8dd41e08a6c6..d214ec09d401 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -466,6 +466,9 @@ static void sh_eth_select_mii(struct net_device *ndev)
 	u32 value;
 
 	switch (mdp->phy_interface) {
+	case PHY_INTERFACE_MODE_RGMII ... PHY_INTERFACE_MODE_RGMII_TXID:
+		value = 0x3;
+		break;
 	case PHY_INTERFACE_MODE_GMII:
 		value = 0x2;
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From 93f0fa75190fefee55408a1c0812fa54f576c61b Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Date: Fri, 18 May 2018 21:31:28 +0300
Subject: sh_eth: add EDMR.NBST support

The R-Car V3H (AKA R8A77980) GEther controller adds the DMA burst mode bit
(NBST) in EDMR and the manual tells to always set it before doing any DMA.

Based on the original (and large) patch by Vladimir Barinov.

Signed-off-by: Vladimir Barinov <vladimir.barinov@cogentembedded.com>
Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Reviewed-by: Simon Horman <horms+renesas@verge.net.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/renesas/sh_eth.c | 4 ++++
 drivers/net/ethernet/renesas/sh_eth.h | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index d214ec09d401..800c196510eb 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -1434,6 +1434,10 @@ static int sh_eth_dev_init(struct net_device *ndev)
 
 	sh_eth_write(ndev, mdp->cd->trscer_err_mask, TRSCER);
 
+	/* DMA transfer burst mode */
+	if (mdp->cd->nbst)
+		sh_eth_modify(ndev, EDMR, EDMR_NBST, EDMR_NBST);
+
 	if (mdp->cd->bculr)
 		sh_eth_write(ndev, 0x800, BCULR);	/* Burst sycle set */
 
diff --git a/drivers/net/ethernet/renesas/sh_eth.h b/drivers/net/ethernet/renesas/sh_eth.h
index a5b792ce2ae7..7d5aaba1384a 100644
--- a/drivers/net/ethernet/renesas/sh_eth.h
+++ b/drivers/net/ethernet/renesas/sh_eth.h
@@ -184,6 +184,7 @@ enum GECMR_BIT {
 
 /* EDMR */
 enum DMAC_M_BIT {
+	EDMR_NBST = 0x80,
 	EDMR_EL = 0x40, /* Litte endian */
 	EDMR_DL1 = 0x20, EDMR_DL0 = 0x10,
 	EDMR_SRST_GETHER = 0x03,
@@ -505,6 +506,7 @@ struct sh_eth_cpu_data {
 	unsigned bculr:1;	/* EtherC have BCULR */
 	unsigned tsu:1;		/* EtherC have TSU */
 	unsigned hw_swap:1;	/* E-DMAC have DE bit in EDMR */
+	unsigned nbst:1;	/* E-DMAC has NBST bit in EDMR */
 	unsigned rpadir:1;	/* E-DMAC have RPADIR */
 	unsigned no_trimd:1;	/* E-DMAC DO NOT have TRIMD */
 	unsigned no_ade:1;	/* E-DMAC DO NOT have ADE bit in EESR */
-- 
cgit v1.2.3-59-g8ed1b


From 3eb9c2ad1db04913041b78e0b5e543527128b90b Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Date: Fri, 18 May 2018 21:32:46 +0300
Subject: sh_eth: add R8A77980 support

Finally, add support for the DT probing of the R-Car V3H (AKA R8A77980) --
it's the only R-Car gen3 SoC having the GEther controller -- others have
only EtherAVB...

Based on the original (and large) patch by Vladimir Barinov.

Signed-off-by: Vladimir Barinov <vladimir.barinov@cogentembedded.com>
Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Reviewed-by: Simon Horman <horms+renesas@verge.net.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/devicetree/bindings/net/sh_eth.txt |  1 +
 drivers/net/ethernet/renesas/sh_eth.c            | 44 ++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

diff --git a/Documentation/devicetree/bindings/net/sh_eth.txt b/Documentation/devicetree/bindings/net/sh_eth.txt
index 5172799a7f1a..82a4cf2c145d 100644
--- a/Documentation/devicetree/bindings/net/sh_eth.txt
+++ b/Documentation/devicetree/bindings/net/sh_eth.txt
@@ -14,6 +14,7 @@ Required properties:
 	      "renesas,ether-r8a7791"  if the device is a part of R8A7791 SoC.
 	      "renesas,ether-r8a7793"  if the device is a part of R8A7793 SoC.
 	      "renesas,ether-r8a7794"  if the device is a part of R8A7794 SoC.
+	      "renesas,gether-r8a77980" if the device is a part of R8A77980 SoC.
 	      "renesas,ether-r7s72100" if the device is a part of R7S72100 SoC.
 	      "renesas,rcar-gen1-ether" for a generic R-Car Gen1 device.
 	      "renesas,rcar-gen2-ether" for a generic R-Car Gen2 or RZ/G1
diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index 800c196510eb..83148ca61317 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -753,6 +753,49 @@ static struct sh_eth_cpu_data rcar_gen2_data = {
 	.rmiimode	= 1,
 	.magic		= 1,
 };
+
+/* R8A77980 */
+static struct sh_eth_cpu_data r8a77980_data = {
+	.soft_reset	= sh_eth_soft_reset_gether,
+
+	.set_duplex	= sh_eth_set_duplex,
+	.set_rate	= sh_eth_set_rate_gether,
+
+	.register_type  = SH_ETH_REG_GIGABIT,
+
+	.edtrr_trns	= EDTRR_TRNS_GETHER,
+	.ecsr_value	= ECSR_PSRTO | ECSR_LCHNG | ECSR_ICD | ECSR_MPD,
+	.ecsipr_value	= ECSIPR_PSRTOIP | ECSIPR_LCHNGIP | ECSIPR_ICDIP |
+			  ECSIPR_MPDIP,
+	.eesipr_value	= EESIPR_RFCOFIP | EESIPR_ECIIP |
+			  EESIPR_FTCIP | EESIPR_TDEIP | EESIPR_TFUFIP |
+			  EESIPR_FRIP | EESIPR_RDEIP | EESIPR_RFOFIP |
+			  EESIPR_RMAFIP | EESIPR_RRFIP |
+			  EESIPR_RTLFIP | EESIPR_RTSFIP |
+			  EESIPR_PREIP | EESIPR_CERFIP,
+
+	.tx_check       = EESR_FTC | EESR_CD | EESR_RTO,
+	.eesr_err_check = EESR_TWB1 | EESR_TWB | EESR_TABT | EESR_RABT |
+			  EESR_RFE | EESR_RDE | EESR_RFRMER |
+			  EESR_TFE | EESR_TDE | EESR_ECI,
+	.fdr_value	= 0x0000070f,
+
+	.apr		= 1,
+	.mpr		= 1,
+	.tpauser	= 1,
+	.bculr		= 1,
+	.hw_swap	= 1,
+	.nbst		= 1,
+	.rpadir		= 1,
+	.rpadir_value   = 2 << 16,
+	.no_trimd	= 1,
+	.no_ade		= 1,
+	.xdfar_rw	= 1,
+	.hw_checksum	= 1,
+	.select_mii	= 1,
+	.magic		= 1,
+	.cexcr		= 1,
+};
 #endif /* CONFIG_OF */
 
 static void sh_eth_set_rate_sh7724(struct net_device *ndev)
@@ -3134,6 +3177,7 @@ static const struct of_device_id sh_eth_match_table[] = {
 	{ .compatible = "renesas,ether-r8a7791", .data = &rcar_gen2_data },
 	{ .compatible = "renesas,ether-r8a7793", .data = &rcar_gen2_data },
 	{ .compatible = "renesas,ether-r8a7794", .data = &rcar_gen2_data },
+	{ .compatible = "renesas,gether-r8a77980", .data = &r8a77980_data },
 	{ .compatible = "renesas,ether-r7s72100", .data = &r7s72100_data },
 	{ .compatible = "renesas,rcar-gen1-ether", .data = &rcar_gen1_data },
 	{ .compatible = "renesas,rcar-gen2-ether", .data = &rcar_gen2_data },
-- 
cgit v1.2.3-59-g8ed1b


From 1bc49fd18b02a4942d4bb6f46fd23a5fd61c9fd6 Mon Sep 17 00:00:00 2001
From: Hemanth Puranik <hpuranik@codeaurora.org>
Date: Fri, 18 May 2018 08:59:29 +0530
Subject: net: qcom/emac: Allocate buffers from local node

Currently we use non-NUMA aware allocation for TPD and RRD buffers,
this patch modifies to use NUMA friendly allocation.

Signed-off-by: Hemanth Puranik <hpuranik@codeaurora.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qualcomm/emac/emac-mac.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/emac/emac-mac.c b/drivers/net/ethernet/qualcomm/emac/emac-mac.c
index 092718a03786..031f6e6ee9c1 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac-mac.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac-mac.c
@@ -683,10 +683,11 @@ static int emac_tx_q_desc_alloc(struct emac_adapter *adpt,
 				struct emac_tx_queue *tx_q)
 {
 	struct emac_ring_header *ring_header = &adpt->ring_header;
+	int node = dev_to_node(adpt->netdev->dev.parent);
 	size_t size;
 
 	size = sizeof(struct emac_buffer) * tx_q->tpd.count;
-	tx_q->tpd.tpbuff = kzalloc(size, GFP_KERNEL);
+	tx_q->tpd.tpbuff = kzalloc_node(size, GFP_KERNEL, node);
 	if (!tx_q->tpd.tpbuff)
 		return -ENOMEM;
 
@@ -723,11 +724,12 @@ static void emac_rx_q_bufs_free(struct emac_adapter *adpt)
 static int emac_rx_descs_alloc(struct emac_adapter *adpt)
 {
 	struct emac_ring_header *ring_header = &adpt->ring_header;
+	int node = dev_to_node(adpt->netdev->dev.parent);
 	struct emac_rx_queue *rx_q = &adpt->rx_q;
 	size_t size;
 
 	size = sizeof(struct emac_buffer) * rx_q->rfd.count;
-	rx_q->rfd.rfbuff = kzalloc(size, GFP_KERNEL);
+	rx_q->rfd.rfbuff = kzalloc_node(size, GFP_KERNEL, node);
 	if (!rx_q->rfd.rfbuff)
 		return -ENOMEM;
 
-- 
cgit v1.2.3-59-g8ed1b


From f0b99e3ab323c72c480140532b0526f087a08c23 Mon Sep 17 00:00:00 2001
From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Fri, 18 May 2018 11:58:30 -0700
Subject: Revert "ixgbe: release lock for the duration of
 ixgbe_suspend_close()"

This reverts commit 6710f970d9979d8f03f6e292bb729b2ee1526d0e.

Gotta love when developers have offline discussions, thinking everyone
is reading their responses/dialog.

The change had the potential for a number of race conditions on
shutdown, which is why we are reverting the change.

Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 5ddfb93ed491..a52d92e182ee 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -6698,15 +6698,8 @@ static int __ixgbe_shutdown(struct pci_dev *pdev, bool *enable_wake)
 	rtnl_lock();
 	netif_device_detach(netdev);
 
-	if (netif_running(netdev)) {
-		/* Suspend takes a long time, device_shutdown may be
-		 * parallelized this function, so drop lock for the
-		 * duration of this call.
-		 */
-		rtnl_unlock();
+	if (netif_running(netdev))
 		ixgbe_close_suspend(adapter);
-		rtnl_lock();
-	}
 
 	ixgbe_clear_interrupt_scheme(adapter);
 	rtnl_unlock();
-- 
cgit v1.2.3-59-g8ed1b


From d48f1958ab7d9fb896d86f0241f0dc83dc3a63de Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Fri, 18 May 2018 19:41:01 -0700
Subject: erspan: set bso bit based on mirrored packet's len

Before the patch, the erspan BSO bit (Bad/Short/Oversized) is not
handled.  BSO has 4 possible values:
  00 --> Good frame with no error, or unknown integrity
  11 --> Payload is a Bad Frame with CRC or Alignment Error
  01 --> Payload is a Short Frame
  10 --> Payload is an Oversized Frame

Based the short/oversized definitions in RFC1757, the patch sets
the bso bit based on the mirrored packet's size.

Reported-by: Xiaoyan Jin <xiaoyanj@vmware.com>
Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/erspan.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/include/net/erspan.h b/include/net/erspan.h
index d044aa60cc76..b39643ef4c95 100644
--- a/include/net/erspan.h
+++ b/include/net/erspan.h
@@ -219,6 +219,33 @@ static inline __be32 erspan_get_timestamp(void)
 	return htonl((u32)h_usecs);
 }
 
+/* ERSPAN BSO (Bad/Short/Oversized), see RFC1757
+ *   00b --> Good frame with no error, or unknown integrity
+ *   01b --> Payload is a Short Frame
+ *   10b --> Payload is an Oversized Frame
+ *   11b --> Payload is a Bad Frame with CRC or Alignment Error
+ */
+enum erspan_bso {
+	BSO_NOERROR = 0x0,
+	BSO_SHORT = 0x1,
+	BSO_OVERSIZED = 0x2,
+	BSO_BAD = 0x3,
+};
+
+static inline u8 erspan_detect_bso(struct sk_buff *skb)
+{
+	/* BSO_BAD is not handled because the frame CRC
+	 * or alignment error information is in FCS.
+	 */
+	if (skb->len < ETH_ZLEN)
+		return BSO_SHORT;
+
+	if (skb->len > ETH_FRAME_LEN)
+		return BSO_OVERSIZED;
+
+	return BSO_NOERROR;
+}
+
 static inline void erspan_build_header_v2(struct sk_buff *skb,
 					  u32 id, u8 direction, u16 hwid,
 					  bool truncate, bool is_ipv4)
@@ -248,6 +275,7 @@ static inline void erspan_build_header_v2(struct sk_buff *skb,
 		vlan_tci = ntohs(qp->tci);
 	}
 
+	bso = erspan_detect_bso(skb);
 	skb_push(skb, sizeof(*ershdr) + ERSPAN_V2_MDSIZE);
 	ershdr = (struct erspan_base_hdr *)skb->data;
 	memset(ershdr, 0, sizeof(*ershdr) + ERSPAN_V2_MDSIZE);
-- 
cgit v1.2.3-59-g8ed1b


From 52f8560ee4684f8689085817225786dba3399f6f Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 19 May 2018 10:29:33 +0200
Subject: r8169: fix network error on resume from suspend

This commit removed calls to rtl_set_rx_mode(). This is ok for the
standard path if the link is brought up, however it breaks system
resume from suspend. Link comes up but no network traffic.

Meanwhile common code from rtl_hw_start_8169/8101/8168() was moved
to rtl_hw_start(), therefore re-add the call to rtl_set_rx_mode()
there.

Due to adding this call we have to move definition of rtl_hw_start()
after definition of rtl_set_rx_mode().

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Fixes: 82d3ff6dd199 ("r8169: remove calls to rtl_set_rx_mode")
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/r8169.c | 39 ++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 2c2f0c5b3f94..75dfac0248f4 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -5064,25 +5064,6 @@ static void rtl_set_rx_tx_desc_registers(struct rtl8169_private *tp)
 	RTL_W32(tp, RxDescAddrLow, ((u64) tp->RxPhyAddr) & DMA_BIT_MASK(32));
 }
 
-static void rtl_hw_start(struct  rtl8169_private *tp)
-{
-	RTL_W8(tp, Cfg9346, Cfg9346_Unlock);
-
-	tp->hw_start(tp);
-
-	rtl_set_rx_max_size(tp);
-	rtl_set_rx_tx_desc_registers(tp);
-	rtl_set_rx_tx_config_registers(tp);
-	RTL_W8(tp, Cfg9346, Cfg9346_Lock);
-
-	/* Initially a 10 us delay. Turned it into a PCI commit. - FR */
-	RTL_R8(tp, IntrMask);
-	RTL_W8(tp, ChipCmd, CmdTxEnb | CmdRxEnb);
-	/* no early-rx interrupts */
-	RTL_W16(tp, MultiIntr, RTL_R16(tp, MultiIntr) & 0xf000);
-	rtl_irq_enable_all(tp);
-}
-
 static void rtl8169_set_magic_reg(struct rtl8169_private *tp, unsigned mac_version)
 {
 	static const struct rtl_cfg2_info {
@@ -5160,6 +5141,26 @@ static void rtl_set_rx_mode(struct net_device *dev)
 	RTL_W32(tp, RxConfig, tmp);
 }
 
+static void rtl_hw_start(struct  rtl8169_private *tp)
+{
+	RTL_W8(tp, Cfg9346, Cfg9346_Unlock);
+
+	tp->hw_start(tp);
+
+	rtl_set_rx_max_size(tp);
+	rtl_set_rx_tx_desc_registers(tp);
+	rtl_set_rx_tx_config_registers(tp);
+	RTL_W8(tp, Cfg9346, Cfg9346_Lock);
+
+	/* Initially a 10 us delay. Turned it into a PCI commit. - FR */
+	RTL_R8(tp, IntrMask);
+	RTL_W8(tp, ChipCmd, CmdTxEnb | CmdRxEnb);
+	rtl_set_rx_mode(tp->dev);
+	/* no early-rx interrupts */
+	RTL_W16(tp, MultiIntr, RTL_R16(tp, MultiIntr) & 0xf000);
+	rtl_irq_enable_all(tp);
+}
+
 static void rtl_hw_start_8169(struct rtl8169_private *tp)
 {
 	if (tp->mac_version == RTL_GIGA_MAC_VER_05)
-- 
cgit v1.2.3-59-g8ed1b


From 743e1a8438e5a37f9ab1846e75a47de8e9cd522f Mon Sep 17 00:00:00 2001
From: Salil Mehta <salil.mehta@huawei.com>
Date: Sat, 19 May 2018 16:53:15 +0100
Subject: net: hns3: Fixes error reported by Kbuild and internal review

This patch fixes the error reported by Intel's kbuild and fixes a
return value in one of the legs, caught during review of the original
patch sent by kbuild.

Fixes: fdb793670a00 ("net: hns3: Add support of .sriov_configure in HNS3 driver")
Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index e85ff3805c96..e75c65270222 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -1579,7 +1579,7 @@ static void hns3_remove(struct pci_dev *pdev)
  * Enable or change the number of VFs. Called when the user updates the number
  * of VFs in sysfs.
  **/
-int hns3_pci_sriov_configure(struct pci_dev *pdev, int num_vfs)
+static int hns3_pci_sriov_configure(struct pci_dev *pdev, int num_vfs)
 {
 	int ret;
 
@@ -1592,6 +1592,8 @@ int hns3_pci_sriov_configure(struct pci_dev *pdev, int num_vfs)
 		ret = pci_enable_sriov(pdev, num_vfs);
 		if (ret)
 			dev_err(&pdev->dev, "SRIOV enable failed %d\n", ret);
+		else
+			return num_vfs;
 	} else if (!pci_vfs_assigned(pdev)) {
 		pci_disable_sriov(pdev);
 	} else {
-- 
cgit v1.2.3-59-g8ed1b


From e63cd65f435c77eb2bc0c4f6691be350cd749dce Mon Sep 17 00:00:00 2001
From: Peng Li <lipeng321@huawei.com>
Date: Sat, 19 May 2018 16:53:16 +0100
Subject: net: hns3: Fixes API to fetch ethernet header length with kernel
 default

During the RX leg driver needs to fetch the ethernet header
length from the RX'ed Buffer Descriptor. Currently, proprietary
version hns3_nic_get_headlen is being used to fetch the header
length which uses l234info present in the Buffer Descriptor
which might not be valid for the first Buffer Descriptor if the
packet is spanning across multiple descriptors.
Kernel default eth_get_headlen API does the job correctly.

Fixes: 76ad4f0ee747 ("net: hns3: Add support of HNS3 Ethernet Driver for hip08 SoC")
Signed-off-by: Peng Li <lipeng321@huawei.com>
Reviewed-by: Yisen Zhuang <yisen.zhuang@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 104 +-----------------------
 1 file changed, 2 insertions(+), 102 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index e75c65270222..66cbb66b2326 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -1954,106 +1954,6 @@ hns3_nic_alloc_rx_buffers(struct hns3_enet_ring *ring, int cleand_count)
 	writel_relaxed(i, ring->tqp->io_base + HNS3_RING_RX_RING_HEAD_REG);
 }
 
-/* hns3_nic_get_headlen - determine size of header for LRO/GRO
- * @data: pointer to the start of the headers
- * @max: total length of section to find headers in
- *
- * This function is meant to determine the length of headers that will
- * be recognized by hardware for LRO, GRO, and RSC offloads.  The main
- * motivation of doing this is to only perform one pull for IPv4 TCP
- * packets so that we can do basic things like calculating the gso_size
- * based on the average data per packet.
- */
-static unsigned int hns3_nic_get_headlen(unsigned char *data, u32 flag,
-					 unsigned int max_size)
-{
-	unsigned char *network;
-	u8 hlen;
-
-	/* This should never happen, but better safe than sorry */
-	if (max_size < ETH_HLEN)
-		return max_size;
-
-	/* Initialize network frame pointer */
-	network = data;
-
-	/* Set first protocol and move network header forward */
-	network += ETH_HLEN;
-
-	/* Handle any vlan tag if present */
-	if (hnae_get_field(flag, HNS3_RXD_VLAN_M, HNS3_RXD_VLAN_S)
-		== HNS3_RX_FLAG_VLAN_PRESENT) {
-		if ((typeof(max_size))(network - data) > (max_size - VLAN_HLEN))
-			return max_size;
-
-		network += VLAN_HLEN;
-	}
-
-	/* Handle L3 protocols */
-	if (hnae_get_field(flag, HNS3_RXD_L3ID_M, HNS3_RXD_L3ID_S)
-		== HNS3_RX_FLAG_L3ID_IPV4) {
-		if ((typeof(max_size))(network - data) >
-		    (max_size - sizeof(struct iphdr)))
-			return max_size;
-
-		/* Access ihl as a u8 to avoid unaligned access on ia64 */
-		hlen = (network[0] & 0x0F) << 2;
-
-		/* Verify hlen meets minimum size requirements */
-		if (hlen < sizeof(struct iphdr))
-			return network - data;
-
-		/* Record next protocol if header is present */
-	} else if (hnae_get_field(flag, HNS3_RXD_L3ID_M, HNS3_RXD_L3ID_S)
-		== HNS3_RX_FLAG_L3ID_IPV6) {
-		if ((typeof(max_size))(network - data) >
-		    (max_size - sizeof(struct ipv6hdr)))
-			return max_size;
-
-		/* Record next protocol */
-		hlen = sizeof(struct ipv6hdr);
-	} else {
-		return network - data;
-	}
-
-	/* Relocate pointer to start of L4 header */
-	network += hlen;
-
-	/* Finally sort out TCP/UDP */
-	if (hnae_get_field(flag, HNS3_RXD_L4ID_M, HNS3_RXD_L4ID_S)
-		== HNS3_RX_FLAG_L4ID_TCP) {
-		if ((typeof(max_size))(network - data) >
-		    (max_size - sizeof(struct tcphdr)))
-			return max_size;
-
-		/* Access doff as a u8 to avoid unaligned access on ia64 */
-		hlen = (network[12] & 0xF0) >> 2;
-
-		/* Verify hlen meets minimum size requirements */
-		if (hlen < sizeof(struct tcphdr))
-			return network - data;
-
-		network += hlen;
-	} else if (hnae_get_field(flag, HNS3_RXD_L4ID_M, HNS3_RXD_L4ID_S)
-		== HNS3_RX_FLAG_L4ID_UDP) {
-		if ((typeof(max_size))(network - data) >
-		    (max_size - sizeof(struct udphdr)))
-			return max_size;
-
-		network += sizeof(struct udphdr);
-	}
-
-	/* If everything has gone correctly network should be the
-	 * data section of the packet and will be the end of the header.
-	 * If not then it probably represents the end of the last recognized
-	 * header.
-	 */
-	if ((typeof(max_size))(network - data) < max_size)
-		return network - data;
-	else
-		return max_size;
-}
-
 static void hns3_nic_reuse_page(struct sk_buff *skb, int i,
 				struct hns3_enet_ring *ring, int pull_len,
 				struct hns3_desc_cb *desc_cb)
@@ -2253,8 +2153,8 @@ static int hns3_handle_rx_bd(struct hns3_enet_ring *ring,
 		ring->stats.seg_pkt_cnt++;
 		u64_stats_update_end(&ring->syncp);
 
-		pull_len = hns3_nic_get_headlen(va, l234info,
-						HNS3_RX_HEAD_SIZE);
+		pull_len = eth_get_headlen(va, HNS3_RX_HEAD_SIZE);
+
 		memcpy(__skb_put(skb, pull_len), va,
 		       ALIGN(pull_len, sizeof(long)));
 
-- 
cgit v1.2.3-59-g8ed1b


From 99a6993a6933acbf791d1c7bb789e0ebe5bd7127 Mon Sep 17 00:00:00 2001
From: Jian Shen <shenjian15@huawei.com>
Date: Sat, 19 May 2018 16:53:17 +0100
Subject: net: hns3: cleanup of return values in hclge_init_client_instance()

Removes the goto and directly returns in case of errors as part of the
cleanup.

Signed-off-by: Jian Shen <shenjian15@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 46435c85b160..46e030c41fe9 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -5297,7 +5297,7 @@ static int hclge_init_client_instance(struct hnae3_client *client,
 			vport->nic.client = client;
 			ret = client->ops->init_instance(&vport->nic);
 			if (ret)
-				goto err;
+				return ret;
 
 			if (hdev->roce_client &&
 			    hnae3_dev_roce_supported(hdev)) {
@@ -5305,11 +5305,11 @@ static int hclge_init_client_instance(struct hnae3_client *client,
 
 				ret = hclge_init_roce_base_info(vport);
 				if (ret)
-					goto err;
+					return ret;
 
 				ret = rc->ops->init_instance(&vport->roce);
 				if (ret)
-					goto err;
+					return ret;
 			}
 
 			break;
@@ -5319,7 +5319,7 @@ static int hclge_init_client_instance(struct hnae3_client *client,
 
 			ret = client->ops->init_instance(&vport->nic);
 			if (ret)
-				goto err;
+				return ret;
 
 			break;
 		case HNAE3_CLIENT_ROCE:
@@ -5331,18 +5331,16 @@ static int hclge_init_client_instance(struct hnae3_client *client,
 			if (hdev->roce_client && hdev->nic_client) {
 				ret = hclge_init_roce_base_info(vport);
 				if (ret)
-					goto err;
+					return ret;
 
 				ret = client->ops->init_instance(&vport->roce);
 				if (ret)
-					goto err;
+					return ret;
 			}
 		}
 	}
 
 	return 0;
-err:
-	return ret;
 }
 
 static void hclge_uninit_client_instance(struct hnae3_client *client,
-- 
cgit v1.2.3-59-g8ed1b


From 13562d1f5e2fbe2cf33b23a00abca3f71264c4ac Mon Sep 17 00:00:00 2001
From: Xi Wang <wangxi11@huawei.com>
Date: Sat, 19 May 2018 16:53:18 +0100
Subject: net: hns3: Fix the missing client list node initialization

This patch fixes the missing initialization of the client list node
in the hnae3_register_client() function.

Fixes: 76ad4f0ee747 ("net: hns3: Add support of HNS3 Ethernet Driver for hip08 SoC")
Signed-off-by: Xi Wang <wangxi11@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 66cbb66b2326..701ae5e79219 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -3509,6 +3509,8 @@ static int __init hns3_init_module(void)
 
 	client.ops = &client_ops;
 
+	INIT_LIST_HEAD(&client.node);
+
 	ret = hnae3_register_client(&client);
 	if (ret)
 		return ret;
-- 
cgit v1.2.3-59-g8ed1b


From 3c7624d8fc0b893b644b945ab904c629ebc9611e Mon Sep 17 00:00:00 2001
From: Xi Wang <wangxi11@huawei.com>
Date: Sat, 19 May 2018 16:53:19 +0100
Subject: net: hns3: Fix for hns3 module is loaded multiple times problem

If the hns3 driver has been built into kernel and then loaded with
the same driver which built as KLM, it may trigger an error like
below:

[   20.009555] hns3: Hisilicon Ethernet Network Driver for Hip08 Family - version
[   20.016789] hns3: Copyright (c) 2017 Huawei Corporation.
[   20.022100] Error: Driver 'hns3' is already registered, aborting...
[   23.517397] Unable to handle kernel NULL pointer dereference at virtual address 00000000
...
[   23.691583] Process insmod (pid: 1982, stack limit = 0x00000000cd5f21cb)
[   23.698270] Call trace:
[   23.700705]  __list_del_entry_valid+0x2c/0xd8
[   23.705049]  hnae3_unregister_client+0x68/0xa8
[   23.709487]  hns3_init_module+0x98/0x1000 [hns3]
[   23.714093]  do_one_initcall+0x5c/0x170
[   23.717918]  do_init_module+0x64/0x1f4
[   23.721654]  load_module+0x1d14/0x24b0
[   23.725390]  SyS_init_module+0x158/0x208
[   23.729300]  el0_svc_naked+0x30/0x34

This patch fixes it by adding module version info.

Fixes: 38caee9d3ee8 ("net: hns3: Add support of the HNAE3 framework")
Signed-off-by: Xi Wang <wangxi11@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hnae3.c               | 1 +
 drivers/net/ethernet/hisilicon/hns3/hnae3.h               | 2 ++
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c           | 1 +
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.h           | 2 ++
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h   | 2 +-
 drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h | 2 +-
 6 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.c b/drivers/net/ethernet/hisilicon/hns3/hnae3.c
index bd3c232234d7..63d7dbfb90bf 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.c
@@ -285,3 +285,4 @@ EXPORT_SYMBOL(hnae3_unregister_ae_dev);
 MODULE_AUTHOR("Huawei Tech. Co., Ltd.");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("HNAE3(Hisilicon Network Acceleration Engine) Framework");
+MODULE_VERSION(HNAE3_MOD_VERSION);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
index 2f266ef96582..45c571eea2ae 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -36,6 +36,8 @@
 #include <linux/pci.h>
 #include <linux/types.h>
 
+#define HNAE3_MOD_VERSION "1.0"
+
 /* Device IDs */
 #define HNAE3_DEV_ID_GE				0xA220
 #define HNAE3_DEV_ID_25GE			0xA221
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 701ae5e79219..cac51954f2cf 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -3538,3 +3538,4 @@ MODULE_DESCRIPTION("HNS3: Hisilicon Ethernet Driver");
 MODULE_AUTHOR("Huawei Tech. Co., Ltd.");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("pci:hns-nic");
+MODULE_VERSION(HNS3_MOD_VERSION);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
index 98cdbd3a1163..5b40f5a53761 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
@@ -14,6 +14,8 @@
 
 #include "hnae3.h"
 
+#define HNS3_MOD_VERSION "1.0"
+
 extern const char hns3_driver_version[];
 
 enum hns3_nic_state {
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
index af736a44607c..93177d91eea4 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
@@ -17,7 +17,7 @@
 #include "hclge_cmd.h"
 #include "hnae3.h"
 
-#define HCLGE_MOD_VERSION "v1.0"
+#define HCLGE_MOD_VERSION "1.0"
 #define HCLGE_DRIVER_NAME "hclge"
 
 #define HCLGE_INVALID_VPORT 0xffff
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h
index a477a7c36bbd..9763e742e6fb 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h
@@ -9,7 +9,7 @@
 #include "hclgevf_cmd.h"
 #include "hnae3.h"
 
-#define HCLGEVF_MOD_VERSION "v1.0"
+#define HCLGEVF_MOD_VERSION "1.0"
 #define HCLGEVF_DRIVER_NAME "hclgevf"
 
 #define HCLGEVF_ROCEE_VECTOR_NUM	0
-- 
cgit v1.2.3-59-g8ed1b


From 42687543ca18c106ffd562b40199bd31ab3a347a Mon Sep 17 00:00:00 2001
From: Huazhong Tan <tanhuazhong@huawei.com>
Date: Sat, 19 May 2018 16:53:20 +0100
Subject: net: hns3: Use enums instead of magic number in
 hclge_is_special_opcode

This patch does bit of a clean-up by using already defined enums for
certain values in function hclge_is_special_opcode(). Below enums from
have been used as replacements for magic values:

enum hclge_opcode_type{
	<snip>
	HCLGE_OPC_STATS_64_BIT		= 0x0030,
	HCLGE_OPC_STATS_32_BIT		= 0x0031,
	HCLGE_OPC_STATS_MAC		= 0x0032,
	<snip>
};

Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
index fab70683bbf7..59fb0eb755e7 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
@@ -190,7 +190,11 @@ static int hclge_cmd_csq_done(struct hclge_hw *hw)
 
 static bool hclge_is_special_opcode(u16 opcode)
 {
-	u16 spec_opcode[3] = {0x0030, 0x0031, 0x0032};
+	/* these commands have several descriptors,
+	 * and use the first one to save opcode and return value
+	 */
+	u16 spec_opcode[3] = {HCLGE_OPC_STATS_64_BIT,
+		HCLGE_OPC_STATS_32_BIT, HCLGE_OPC_STATS_MAC};
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(spec_opcode); i++) {
-- 
cgit v1.2.3-59-g8ed1b


From f30dfddcca1b7773ef8ed8ddbf2d477bc5eb9f3e Mon Sep 17 00:00:00 2001
From: Fuyun Liang <liangfuyun1@huawei.com>
Date: Sat, 19 May 2018 16:53:21 +0100
Subject: net: hns3: Fix for netdev not running problem after calling net_stop
 and net_open

The link status update function is called by timer every second. But
net_stop and net_open may be called with very short intervals. The link
status update function can not detect the link state has changed. It
causes the netdev not running problem.

This patch fixes it by updating the link state in ae_stop function.

Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support")
Signed-off-by: Fuyun Liang <liangfuyun1@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 46e030c41fe9..2f0bbb6708b9 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -3793,6 +3793,9 @@ static void hclge_ae_stop(struct hnae3_handle *handle)
 
 	/* reset tqp stats */
 	hclge_reset_tqp_stats(handle);
+	del_timer_sync(&hdev->service_timer);
+	cancel_work_sync(&hdev->service_task);
+	hclge_update_link_status(hdev);
 }
 
 static int hclge_get_mac_vlan_cmd_status(struct hclge_vport *vport,
-- 
cgit v1.2.3-59-g8ed1b


From 7bb572fddd757512130afcc0a8a683095eee1cd5 Mon Sep 17 00:00:00 2001
From: Xi Wang <wangxi11@huawei.com>
Date: Sat, 19 May 2018 16:53:22 +0100
Subject: net: hns3: Fixes kernel panic issue during rmmod hns3 driver

If CONFIG_ARM_SMMU_V3 is enabled, arm64's dma_ops will replace
arm64_swiotlb_dma_ops with iommu_dma_ops. When releasing contiguous
dma memory, the new ops will call the vunmap function which cannot
be run in interrupt context.

Currently, spin_lock_bh is called before vunmap is executed. This
disables BH and causes the interrupt context to be detected to
generate a kernel panic like below:

[ 2831.573400] kernel BUG at mm/vmalloc.c:1621!
[ 2831.577659] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
...
[ 2831.699907] Process rmmod (pid: 1893, stack limit = 0x0000000055103ee2)
[ 2831.706507] Call trace:
[ 2831.708941]  vunmap+0x48/0x50
[ 2831.711897]  dma_common_free_remap+0x78/0x88
[ 2831.716155]  __iommu_free_attrs+0xa8/0x1c0
[ 2831.720255]  hclge_free_cmd_desc+0xc8/0x118 [hclge]
[ 2831.725128]  hclge_destroy_cmd_queue+0x34/0x68 [hclge]
[ 2831.730261]  hclge_uninit_ae_dev+0x90/0x100 [hclge]
[ 2831.735127]  hnae3_unregister_ae_dev+0xb0/0x868 [hnae3]
[ 2831.740345]  hns3_remove+0x3c/0x90 [hns3]
[ 2831.744344]  pci_device_remove+0x48/0x108
[ 2831.748342]  device_release_driver_internal+0x164/0x200
[ 2831.753553]  driver_detach+0x4c/0x88
[ 2831.757116]  bus_remove_driver+0x60/0xc0
[ 2831.761026]  driver_unregister+0x34/0x60
[ 2831.764935]  pci_unregister_driver+0x30/0xb0
[ 2831.769197]  hns3_exit_module+0x10/0x978 [hns3]
[ 2831.773715]  SyS_delete_module+0x1f8/0x248
[ 2831.777799]  el0_svc_naked+0x30/0x34

This patch fixes it by using spin_lock instead of spin_lock_bh.

Fixes: 68c0a5c70614 ("net: hns3: Add HNS3 IMP(Integrated Mgmt Proc) Cmd Interface Support")
Signed-off-by: Xi Wang <wangxi11@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
index 59fb0eb755e7..c36d64710fa6 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
@@ -385,9 +385,9 @@ int hclge_cmd_init(struct hclge_dev *hdev)
 
 static void hclge_destroy_queue(struct hclge_cmq_ring *ring)
 {
-	spin_lock_bh(&ring->lock);
+	spin_lock(&ring->lock);
 	hclge_free_cmd_desc(ring);
-	spin_unlock_bh(&ring->lock);
+	spin_unlock(&ring->lock);
 }
 
 void hclge_destroy_cmd_queue(struct hclge_hw *hw)
-- 
cgit v1.2.3-59-g8ed1b


From eddf04626d1d6d0bcd01ac6a287e49f5ddb90a26 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Sat, 19 May 2018 16:53:23 +0100
Subject: net: hns3: Fix for CMDQ and Misc. interrupt init order problem

When vf module is loading, the cmd queue initialization should
happen before misc interrupt initialization, otherwise the misc
interrupt handle will cause using uninitialized cmd queue problem.
There is also the same issue when vf module is unloading.

This patch fixes it by adjusting the location of some function.

Fixes: e2cb1dec9779 ("net: hns3: Add HNS3 VF HCL(Hardware Compatibility Layer) Support")
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c    | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
index f1f4a17340a5..2b0e3295989f 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
@@ -1634,6 +1634,10 @@ static int hclgevf_init_hdev(struct hclgevf_dev *hdev)
 
 	hclgevf_state_init(hdev);
 
+	ret = hclgevf_cmd_init(hdev);
+	if (ret)
+		goto err_cmd_init;
+
 	ret = hclgevf_misc_irq_init(hdev);
 	if (ret) {
 		dev_err(&pdev->dev, "failed(%d) to init Misc IRQ(vector0)\n",
@@ -1641,10 +1645,6 @@ static int hclgevf_init_hdev(struct hclgevf_dev *hdev)
 		goto err_misc_irq_init;
 	}
 
-	ret = hclgevf_cmd_init(hdev);
-	if (ret)
-		goto err_cmd_init;
-
 	ret = hclgevf_configure(hdev);
 	if (ret) {
 		dev_err(&pdev->dev, "failed(%d) to fetch configuration\n", ret);
@@ -1692,10 +1692,10 @@ static int hclgevf_init_hdev(struct hclgevf_dev *hdev)
 	return 0;
 
 err_config:
-	hclgevf_cmd_uninit(hdev);
-err_cmd_init:
 	hclgevf_misc_irq_uninit(hdev);
 err_misc_irq_init:
+	hclgevf_cmd_uninit(hdev);
+err_cmd_init:
 	hclgevf_state_uninit(hdev);
 	hclgevf_uninit_msi(hdev);
 err_irq_init:
@@ -1705,9 +1705,9 @@ err_irq_init:
 
 static void hclgevf_uninit_hdev(struct hclgevf_dev *hdev)
 {
-	hclgevf_cmd_uninit(hdev);
-	hclgevf_misc_irq_uninit(hdev);
 	hclgevf_state_uninit(hdev);
+	hclgevf_misc_irq_uninit(hdev);
+	hclgevf_cmd_uninit(hdev);
 	hclgevf_uninit_msi(hdev);
 	hclgevf_pci_uninit(hdev);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 27164491cb82b611b5269e632e4af0664788794d Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Date: Sun, 20 May 2018 00:02:36 +0300
Subject: sh_eth: fix typo in EESR.TRO bit name

The  correct name of the EESR bit 8 is TRO (transmit retry over), not RTO.
Note that EESIPR bit 8, TROIP remained correct...

Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/renesas/sh_eth.c | 10 +++++-----
 drivers/net/ethernet/renesas/sh_eth.h |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index 83148ca61317..d71657dce281 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -706,7 +706,7 @@ static struct sh_eth_cpu_data rcar_gen1_data = {
 			  EESIPR_RTLFIP | EESIPR_RTSFIP |
 			  EESIPR_PREIP | EESIPR_CERFIP,
 
-	.tx_check	= EESR_FTC | EESR_CND | EESR_DLC | EESR_CD | EESR_RTO,
+	.tx_check	= EESR_FTC | EESR_CND | EESR_DLC | EESR_CD | EESR_TRO,
 	.eesr_err_check	= EESR_TWB | EESR_TABT | EESR_RABT | EESR_RFE |
 			  EESR_RDE | EESR_RFRMER | EESR_TFE | EESR_TDE,
 	.fdr_value	= 0x00000f0f,
@@ -738,7 +738,7 @@ static struct sh_eth_cpu_data rcar_gen2_data = {
 			  EESIPR_RTLFIP | EESIPR_RTSFIP |
 			  EESIPR_PREIP | EESIPR_CERFIP,
 
-	.tx_check	= EESR_FTC | EESR_CND | EESR_DLC | EESR_CD | EESR_RTO,
+	.tx_check	= EESR_FTC | EESR_CND | EESR_DLC | EESR_CD | EESR_TRO,
 	.eesr_err_check	= EESR_TWB | EESR_TABT | EESR_RABT | EESR_RFE |
 			  EESR_RDE | EESR_RFRMER | EESR_TFE | EESR_TDE,
 	.fdr_value	= 0x00000f0f,
@@ -774,7 +774,7 @@ static struct sh_eth_cpu_data r8a77980_data = {
 			  EESIPR_RTLFIP | EESIPR_RTSFIP |
 			  EESIPR_PREIP | EESIPR_CERFIP,
 
-	.tx_check       = EESR_FTC | EESR_CD | EESR_RTO,
+	.tx_check       = EESR_FTC | EESR_CD | EESR_TRO,
 	.eesr_err_check = EESR_TWB1 | EESR_TWB | EESR_TABT | EESR_RABT |
 			  EESR_RFE | EESR_RDE | EESR_RFRMER |
 			  EESR_TFE | EESR_TDE | EESR_ECI,
@@ -831,7 +831,7 @@ static struct sh_eth_cpu_data sh7724_data = {
 			  EESIPR_RTLFIP | EESIPR_RTSFIP |
 			  EESIPR_PREIP | EESIPR_CERFIP,
 
-	.tx_check	= EESR_FTC | EESR_CND | EESR_DLC | EESR_CD | EESR_RTO,
+	.tx_check	= EESR_FTC | EESR_CND | EESR_DLC | EESR_CD | EESR_TRO,
 	.eesr_err_check	= EESR_TWB | EESR_TABT | EESR_RABT | EESR_RFE |
 			  EESR_RDE | EESR_RFRMER | EESR_TFE | EESR_TDE,
 
@@ -876,7 +876,7 @@ static struct sh_eth_cpu_data sh7757_data = {
 			  EESIPR_RRFIP | EESIPR_RTLFIP | EESIPR_RTSFIP |
 			  EESIPR_PREIP | EESIPR_CERFIP,
 
-	.tx_check	= EESR_FTC | EESR_CND | EESR_DLC | EESR_CD | EESR_RTO,
+	.tx_check	= EESR_FTC | EESR_CND | EESR_DLC | EESR_CD | EESR_TRO,
 	.eesr_err_check	= EESR_TWB | EESR_TABT | EESR_RABT | EESR_RFE |
 			  EESR_RDE | EESR_RFRMER | EESR_TFE | EESR_TDE,
 
diff --git a/drivers/net/ethernet/renesas/sh_eth.h b/drivers/net/ethernet/renesas/sh_eth.h
index 7d5aaba1384a..9d9e80c33343 100644
--- a/drivers/net/ethernet/renesas/sh_eth.h
+++ b/drivers/net/ethernet/renesas/sh_eth.h
@@ -243,7 +243,7 @@ enum EESR_BIT {
 	EESR_CND	= 0x00000800,
 	EESR_DLC	= 0x00000400,
 	EESR_CD		= 0x00000200,
-	EESR_RTO	= 0x00000100,
+	EESR_TRO	= 0x00000100,
 	EESR_RMAF	= 0x00000080,
 	EESR_CEEF	= 0x00000040,
 	EESR_CELF	= 0x00000020,
@@ -263,7 +263,7 @@ enum EESR_BIT {
 				 EESR_CERF)  /* Recv frame CRC error */
 
 #define DEFAULT_TX_CHECK	(EESR_FTC | EESR_CND | EESR_DLC | EESR_CD | \
-				 EESR_RTO)
+				 EESR_TRO)
 #define DEFAULT_EESR_ERR_CHECK	(EESR_TWB | EESR_TABT | EESR_RABT | EESR_RFE | \
 				 EESR_RDE | EESR_RFRMER | EESR_ADE | \
 				 EESR_TFE | EESR_TDE)
-- 
cgit v1.2.3-59-g8ed1b


From 9c59c9a8e94660d05790e47bd4a8c756254614b7 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Date: Sun, 20 May 2018 00:03:42 +0300
Subject: sh_eth: fix comment grammar in 'struct sh_eth_cpu_data'

All the verbs in the comments to the 'struct sh_eth_cpu_data' declaration
should  be in a  3rd person singular, to match the nouns.

Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/renesas/sh_eth.h | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/renesas/sh_eth.h b/drivers/net/ethernet/renesas/sh_eth.h
index 9d9e80c33343..ceb420e556a6 100644
--- a/drivers/net/ethernet/renesas/sh_eth.h
+++ b/drivers/net/ethernet/renesas/sh_eth.h
@@ -499,21 +499,21 @@ struct sh_eth_cpu_data {
 
 	/* hardware features */
 	unsigned long irq_flags; /* IRQ configuration flags */
-	unsigned no_psr:1;	/* EtherC DO NOT have PSR */
-	unsigned apr:1;		/* EtherC have APR */
-	unsigned mpr:1;		/* EtherC have MPR */
-	unsigned tpauser:1;	/* EtherC have TPAUSER */
-	unsigned bculr:1;	/* EtherC have BCULR */
-	unsigned tsu:1;		/* EtherC have TSU */
-	unsigned hw_swap:1;	/* E-DMAC have DE bit in EDMR */
+	unsigned no_psr:1;	/* EtherC DOES NOT have PSR */
+	unsigned apr:1;		/* EtherC has APR */
+	unsigned mpr:1;		/* EtherC has MPR */
+	unsigned tpauser:1;	/* EtherC has TPAUSER */
+	unsigned bculr:1;	/* EtherC has BCULR */
+	unsigned tsu:1;		/* EtherC has TSU */
+	unsigned hw_swap:1;	/* E-DMAC has DE bit in EDMR */
 	unsigned nbst:1;	/* E-DMAC has NBST bit in EDMR */
-	unsigned rpadir:1;	/* E-DMAC have RPADIR */
-	unsigned no_trimd:1;	/* E-DMAC DO NOT have TRIMD */
-	unsigned no_ade:1;	/* E-DMAC DO NOT have ADE bit in EESR */
+	unsigned rpadir:1;	/* E-DMAC has RPADIR */
+	unsigned no_trimd:1;	/* E-DMAC DOES NOT have TRIMD */
+	unsigned no_ade:1;	/* E-DMAC DOES NOT have ADE bit in EESR */
 	unsigned no_xdfar:1;	/* E-DMAC DOES NOT have RDFAR/TDFAR */
 	unsigned xdfar_rw:1;	/* E-DMAC has writeable RDFAR/TDFAR */
 	unsigned hw_checksum:1;	/* E-DMAC has CSMR */
-	unsigned select_mii:1;	/* EtherC have RMII_MII (MII select register) */
+	unsigned select_mii:1;	/* EtherC has RMII_MII (MII select register) */
 	unsigned rmiimode:1;	/* EtherC has RMIIMODE register */
 	unsigned rtrate:1;	/* EtherC has RTRATE register */
 	unsigned magic:1;	/* EtherC has ECMR.MPDE and ECSR.MPD */
-- 
cgit v1.2.3-59-g8ed1b


From 6b14787ab11d8fc2225132f0fe8275531fdefc72 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Date: Sun, 20 May 2018 00:05:02 +0300
Subject: sh_eth: fix typo in comment to BCULR write

Simon has noticed a typo in the comment accompaining the BCULR write --
fix it and move the comment before the write (following the style of
the other comments), while at it...

Reported-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/renesas/sh_eth.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index d71657dce281..d9cadfb1bc4a 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -1481,8 +1481,9 @@ static int sh_eth_dev_init(struct net_device *ndev)
 	if (mdp->cd->nbst)
 		sh_eth_modify(ndev, EDMR, EDMR_NBST, EDMR_NBST);
 
+	/* Burst cycle count upper-limit */
 	if (mdp->cd->bculr)
-		sh_eth_write(ndev, 0x800, BCULR);	/* Burst sycle set */
+		sh_eth_write(ndev, 0x800, BCULR);
 
 	sh_eth_write(ndev, mdp->cd->fcftr_value, FCFTR);
 
-- 
cgit v1.2.3-59-g8ed1b


From 577941eb56fe5fd95e1a60c6dba3dfaf606e3fde Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 19 May 2018 22:31:33 +0200
Subject: net: dsa: mv88e6xxx: Remove OF check for IRQ domain

An IRQ domain will work without an OF node. It is not possible to
reference interrupts via a phandle, but C code can still use
irq_find_mapping() to get an interrupt from the domain.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/global2.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/global2.c b/drivers/net/dsa/mv88e6xxx/global2.c
index f9bde011a3e6..91a3cb2452ac 100644
--- a/drivers/net/dsa/mv88e6xxx/global2.c
+++ b/drivers/net/dsa/mv88e6xxx/global2.c
@@ -1047,9 +1047,6 @@ int mv88e6xxx_g2_irq_setup(struct mv88e6xxx_chip *chip)
 {
 	int err, irq, virq;
 
-	if (!chip->dev->of_node)
-		return -EINVAL;
-
 	chip->g2_irq.domain = irq_domain_add_simple(
 		chip->dev->of_node, 16, 0, &mv88e6xxx_g2_irq_domain_ops, chip);
 	if (!chip->g2_irq.domain)
-- 
cgit v1.2.3-59-g8ed1b


From 877b7cb0b6f283593a663134ee52703f12c895cc Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 19 May 2018 22:31:34 +0200
Subject: net: dsa: mv88e6xxx: Add minimal platform_data support

Not all the world uses device tree. Some parts of the world still use
platform devices and platform data. Add basic support for probing a
Marvell switch via platform data.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS                             |  1 +
 drivers/net/dsa/mv88e6xxx/chip.c        | 56 ++++++++++++++++++++++++++++-----
 include/linux/platform_data/mv88e6xxx.h | 17 ++++++++++
 3 files changed, 67 insertions(+), 7 deletions(-)
 create mode 100644 include/linux/platform_data/mv88e6xxx.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 658880464b9d..9f2045a5adac 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8466,6 +8466,7 @@ M:	Vivien Didelot <vivien.didelot@savoirfairelinux.com>
 L:	netdev@vger.kernel.org
 S:	Maintained
 F:	drivers/net/dsa/mv88e6xxx/
+F:	linux/platform_data/mv88e6xxx.h
 F:	Documentation/devicetree/bindings/net/dsa/marvell.txt
 
 MARVELL ARMADA DRM SUPPORT
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 2bb3f03ee1cb..5b40382036ea 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -28,6 +28,7 @@
 #include <linux/of_device.h>
 #include <linux/of_irq.h>
 #include <linux/of_mdio.h>
+#include <linux/platform_data/mv88e6xxx.h>
 #include <linux/netdevice.h>
 #include <linux/gpio/consumer.h>
 #include <linux/phy.h>
@@ -4350,6 +4351,7 @@ static int mv88e6xxx_register_switch(struct mv88e6xxx_chip *chip)
 		return -ENOMEM;
 
 	ds->priv = chip;
+	ds->dev = dev;
 	ds->ops = &mv88e6xxx_switch_ops;
 	ds->ageing_time_min = chip->info->age_time_coeff;
 	ds->ageing_time_max = chip->info->age_time_coeff * U8_MAX;
@@ -4364,36 +4366,73 @@ static void mv88e6xxx_unregister_switch(struct mv88e6xxx_chip *chip)
 	dsa_unregister_switch(chip->ds);
 }
 
+static const void *pdata_device_get_match_data(struct device *dev)
+{
+	const struct of_device_id *matches = dev->driver->of_match_table;
+	const struct dsa_mv88e6xxx_pdata *pdata = dev->platform_data;
+
+	for (; matches->name[0] || matches->type[0] || matches->compatible[0];
+	     matches++) {
+		if (!strcmp(pdata->compatible, matches->compatible))
+			return matches->data;
+	}
+	return NULL;
+}
+
 static int mv88e6xxx_probe(struct mdio_device *mdiodev)
 {
+	struct dsa_mv88e6xxx_pdata *pdata = mdiodev->dev.platform_data;
 	struct device *dev = &mdiodev->dev;
 	struct device_node *np = dev->of_node;
 	const struct mv88e6xxx_info *compat_info;
 	struct mv88e6xxx_chip *chip;
 	u32 eeprom_len;
+	int port;
 	int err;
 
-	compat_info = of_device_get_match_data(dev);
+	if (np)
+		compat_info = of_device_get_match_data(dev);
+
+	if (pdata) {
+		compat_info = pdata_device_get_match_data(dev);
+
+		if (!pdata->netdev)
+			return -EINVAL;
+
+		for (port = 0; port < DSA_MAX_PORTS; port++) {
+			if (!(pdata->enabled_ports & (1 << port)))
+				continue;
+			if (strcmp(pdata->cd.port_names[port], "cpu"))
+				continue;
+			pdata->cd.netdev[port] = &pdata->netdev->dev;
+			break;
+		}
+	}
+
 	if (!compat_info)
 		return -EINVAL;
 
 	chip = mv88e6xxx_alloc_chip(dev);
-	if (!chip)
-		return -ENOMEM;
+	if (!chip) {
+		err = -ENOMEM;
+		goto out;
+	}
 
 	chip->info = compat_info;
 
 	err = mv88e6xxx_smi_init(chip, mdiodev->bus, mdiodev->addr);
 	if (err)
-		return err;
+		goto out;
 
 	chip->reset = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_LOW);
-	if (IS_ERR(chip->reset))
-		return PTR_ERR(chip->reset);
+	if (IS_ERR(chip->reset)) {
+		err = PTR_ERR(chip->reset);
+		goto out;
+	}
 
 	err = mv88e6xxx_detect(chip);
 	if (err)
-		return err;
+		goto out;
 
 	mv88e6xxx_phy_init(chip);
 
@@ -4468,6 +4507,9 @@ out_g1_irq:
 		mv88e6xxx_irq_poll_free(chip);
 	mutex_unlock(&chip->reg_lock);
 out:
+	if (pdata)
+		dev_put(pdata->netdev);
+
 	return err;
 }
 
diff --git a/include/linux/platform_data/mv88e6xxx.h b/include/linux/platform_data/mv88e6xxx.h
new file mode 100644
index 000000000000..88e91e05f48f
--- /dev/null
+++ b/include/linux/platform_data/mv88e6xxx.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __DSA_MV88E6XXX_H
+#define __DSA_MV88E6XXX_H
+
+#include <net/dsa.h>
+
+struct dsa_mv88e6xxx_pdata {
+	/* Must be first, such that dsa_register_switch() can access this
+	 * without gory pointer manipulations
+	 */
+	struct dsa_chip_data cd;
+	const char *compatible;
+	unsigned int enabled_ports;
+	struct net_device *netdev;
+};
+
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From 00baabe5286c41d21ed4a78f479b021eba1f0d51 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sat, 19 May 2018 22:31:35 +0200
Subject: net: dsa: mv88e6xxx: Add support for EEPROM via platform data

Add the size of the EEPROM to the platform data, so it can also be
instantiated by a platform device.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/chip.c        | 11 +++++++----
 drivers/net/dsa/mv88e6xxx/chip.h        |  2 +-
 include/linux/platform_data/mv88e6xxx.h |  1 +
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 5b40382036ea..1fa1f820a437 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -4386,7 +4386,6 @@ static int mv88e6xxx_probe(struct mdio_device *mdiodev)
 	struct device_node *np = dev->of_node;
 	const struct mv88e6xxx_info *compat_info;
 	struct mv88e6xxx_chip *chip;
-	u32 eeprom_len;
 	int port;
 	int err;
 
@@ -4436,9 +4435,13 @@ static int mv88e6xxx_probe(struct mdio_device *mdiodev)
 
 	mv88e6xxx_phy_init(chip);
 
-	if (chip->info->ops->get_eeprom &&
-	    !of_property_read_u32(np, "eeprom-length", &eeprom_len))
-		chip->eeprom_len = eeprom_len;
+	if (chip->info->ops->get_eeprom) {
+		if (np)
+			of_property_read_u32(np, "eeprom-length",
+					     &chip->eeprom_len);
+		else
+			chip->eeprom_len = pdata->eeprom_len;
+	}
 
 	mutex_lock(&chip->reg_lock);
 	err = mv88e6xxx_switch_reset(chip);
diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h
index 012268046442..8ac3fbb15352 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.h
+++ b/drivers/net/dsa/mv88e6xxx/chip.h
@@ -238,7 +238,7 @@ struct mv88e6xxx_chip {
 	struct gpio_desc *reset;
 
 	/* set to size of eeprom if supported by the switch */
-	int		eeprom_len;
+	u32 eeprom_len;
 
 	/* List of mdio busses */
 	struct list_head mdios;
diff --git a/include/linux/platform_data/mv88e6xxx.h b/include/linux/platform_data/mv88e6xxx.h
index 88e91e05f48f..f63af2955ea0 100644
--- a/include/linux/platform_data/mv88e6xxx.h
+++ b/include/linux/platform_data/mv88e6xxx.h
@@ -12,6 +12,7 @@ struct dsa_mv88e6xxx_pdata {
 	const char *compatible;
 	unsigned int enabled_ports;
 	struct net_device *netdev;
+	u32 eeprom_len;
 };
 
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From bf3c592b9d78ab2768150c680de22ef99e6e3b32 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Sun, 20 May 2018 08:56:30 -0700
Subject: net: dsa: b53: Extend platform data to include DSA ports

The b53 driver already defines and internally uses platform data to let the
glue drivers specify parameters such as the chip id.  What we were missing was
a way to tell the core DSA layer about the ports and their type.

Place a dsa_chip_data structure at the beginning of b53_platform_data for
dsa_register_switch() to access it. This does not require modifications to
b53_common.c which will pass platform_data trough.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/platform_data/b53.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/linux/platform_data/b53.h b/include/linux/platform_data/b53.h
index 69d279c0da96..8eaef2f2b691 100644
--- a/include/linux/platform_data/b53.h
+++ b/include/linux/platform_data/b53.h
@@ -20,8 +20,12 @@
 #define __B53_H
 
 #include <linux/kernel.h>
+#include <net/dsa.h>
 
 struct b53_platform_data {
+	/* Must be first such that dsa_register_switch() can access it */
+	struct dsa_chip_data cd;
+
 	u32 chip_id;
 	u16 enabled_ports;
 
-- 
cgit v1.2.3-59-g8ed1b


From 7ddae24f97f9fc8f151aae75f0371ced06a8cd99 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 20 May 2018 19:04:24 -0400
Subject: mv88e6xxx: Fix uninitialized variable warning.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In mv88e6xxx_probe(), ("np" or "pdata") might be an invariant
but GCC can't see that, therefore:

drivers/net/dsa/mv88e6xxx/chip.c: In function ‘mv88e6xxx_probe’:
drivers/net/dsa/mv88e6xxx/chip.c:4420:13: warning: ‘compat_info’ may be used uninitialized in this function [-Wmaybe-uninitialized]
  chip->info = compat_info;

Actually, it should have warned on the "if (!compat_info)" test, but
whatever.

Explicitly initialize to NULL in the variable declaration to
deal with this.

Fixes: 877b7cb0b6f2 ("net: dsa: mv88e6xxx: Add minimal platform_data support")
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/chip.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 1fa1f820a437..12df00f593b7 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -4382,9 +4382,9 @@ static const void *pdata_device_get_match_data(struct device *dev)
 static int mv88e6xxx_probe(struct mdio_device *mdiodev)
 {
 	struct dsa_mv88e6xxx_pdata *pdata = mdiodev->dev.platform_data;
+	const struct mv88e6xxx_info *compat_info = NULL;
 	struct device *dev = &mdiodev->dev;
 	struct device_node *np = dev->of_node;
-	const struct mv88e6xxx_info *compat_info;
 	struct mv88e6xxx_chip *chip;
 	int port;
 	int err;
-- 
cgit v1.2.3-59-g8ed1b


From 3bcd47726c3b744fd08781795cca905cc59a1382 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Sun, 20 May 2018 20:49:47 -0700
Subject: net: phy: phylink: Don't release NULL GPIO

If CONFIG_GPIOLIB is disabled, gpiod_put() becomes a stub that produces a
warning, this helped identify that we could be attempting to release a NULL
pl->link_gpio GPIO descriptor, so guard against that.

Fixes: daab3349ad1a ("net: phy: phylink: Release link GPIO")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 581ce93ecaf9..af4dc4425be2 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -624,7 +624,7 @@ void phylink_destroy(struct phylink *pl)
 {
 	if (pl->sfp_bus)
 		sfp_unregister_upstream(pl->sfp_bus);
-	if (!IS_ERR(pl->link_gpio))
+	if (!IS_ERR_OR_NULL(pl->link_gpio))
 		gpiod_put(pl->link_gpio);
 
 	cancel_work_sync(&pl->resolve);
-- 
cgit v1.2.3-59-g8ed1b


From 6c541b4595a28b204888db75aba1966ede4a6184 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Sun, 20 May 2018 20:58:28 -0700
Subject: net: ethernet: Sort Kconfig sourcing alphabetically

A number of entries were not alphabetically sorted, remedy that.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/Kconfig | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/Kconfig b/drivers/net/ethernet/Kconfig
index 54d71e1c48d5..af766fd61151 100644
--- a/drivers/net/ethernet/Kconfig
+++ b/drivers/net/ethernet/Kconfig
@@ -33,9 +33,9 @@ source "drivers/net/ethernet/aquantia/Kconfig"
 source "drivers/net/ethernet/arc/Kconfig"
 source "drivers/net/ethernet/atheros/Kconfig"
 source "drivers/net/ethernet/aurora/Kconfig"
-source "drivers/net/ethernet/cadence/Kconfig"
 source "drivers/net/ethernet/broadcom/Kconfig"
 source "drivers/net/ethernet/brocade/Kconfig"
+source "drivers/net/ethernet/cadence/Kconfig"
 source "drivers/net/ethernet/calxeda/Kconfig"
 source "drivers/net/ethernet/cavium/Kconfig"
 source "drivers/net/ethernet/chelsio/Kconfig"
@@ -72,16 +72,16 @@ source "drivers/net/ethernet/dec/Kconfig"
 source "drivers/net/ethernet/dlink/Kconfig"
 source "drivers/net/ethernet/emulex/Kconfig"
 source "drivers/net/ethernet/ezchip/Kconfig"
-source "drivers/net/ethernet/neterion/Kconfig"
 source "drivers/net/ethernet/faraday/Kconfig"
 source "drivers/net/ethernet/freescale/Kconfig"
 source "drivers/net/ethernet/fujitsu/Kconfig"
 source "drivers/net/ethernet/hisilicon/Kconfig"
 source "drivers/net/ethernet/hp/Kconfig"
 source "drivers/net/ethernet/huawei/Kconfig"
+source "drivers/net/ethernet/i825xx/Kconfig"
 source "drivers/net/ethernet/ibm/Kconfig"
 source "drivers/net/ethernet/intel/Kconfig"
-source "drivers/net/ethernet/i825xx/Kconfig"
+source "drivers/net/ethernet/neterion/Kconfig"
 source "drivers/net/ethernet/xscale/Kconfig"
 
 config JME
@@ -114,8 +114,8 @@ source "drivers/net/ethernet/mediatek/Kconfig"
 source "drivers/net/ethernet/mellanox/Kconfig"
 source "drivers/net/ethernet/micrel/Kconfig"
 source "drivers/net/ethernet/microchip/Kconfig"
-source "drivers/net/ethernet/mscc/Kconfig"
 source "drivers/net/ethernet/moxa/Kconfig"
+source "drivers/net/ethernet/mscc/Kconfig"
 source "drivers/net/ethernet/myricom/Kconfig"
 
 config FEALNX
@@ -161,20 +161,21 @@ source "drivers/net/ethernet/packetengines/Kconfig"
 source "drivers/net/ethernet/pasemi/Kconfig"
 source "drivers/net/ethernet/qlogic/Kconfig"
 source "drivers/net/ethernet/qualcomm/Kconfig"
+source "drivers/net/ethernet/rdc/Kconfig"
 source "drivers/net/ethernet/realtek/Kconfig"
 source "drivers/net/ethernet/renesas/Kconfig"
-source "drivers/net/ethernet/rdc/Kconfig"
 source "drivers/net/ethernet/rocker/Kconfig"
 source "drivers/net/ethernet/samsung/Kconfig"
 source "drivers/net/ethernet/seeq/Kconfig"
-source "drivers/net/ethernet/silan/Kconfig"
-source "drivers/net/ethernet/sis/Kconfig"
 source "drivers/net/ethernet/sfc/Kconfig"
 source "drivers/net/ethernet/sgi/Kconfig"
+source "drivers/net/ethernet/silan/Kconfig"
+source "drivers/net/ethernet/sis/Kconfig"
 source "drivers/net/ethernet/smsc/Kconfig"
 source "drivers/net/ethernet/socionext/Kconfig"
 source "drivers/net/ethernet/stmicro/Kconfig"
 source "drivers/net/ethernet/sun/Kconfig"
+source "drivers/net/ethernet/synopsys/Kconfig"
 source "drivers/net/ethernet/tehuti/Kconfig"
 source "drivers/net/ethernet/ti/Kconfig"
 source "drivers/net/ethernet/toshiba/Kconfig"
@@ -183,6 +184,5 @@ source "drivers/net/ethernet/via/Kconfig"
 source "drivers/net/ethernet/wiznet/Kconfig"
 source "drivers/net/ethernet/xilinx/Kconfig"
 source "drivers/net/ethernet/xircom/Kconfig"
-source "drivers/net/ethernet/synopsys/Kconfig"
 
 endif # ETHERNET
-- 
cgit v1.2.3-59-g8ed1b


From a6076fcd187a1cb4900cf970a04401957b4b4ab8 Mon Sep 17 00:00:00 2001
From: Ganesh Goudar <ganeshgr@chelsio.com>
Date: Mon, 21 May 2018 12:26:36 +0530
Subject: cxgb4: copy the length of cpl_tx_pkt_core to fw_wr

immdlen field of FW_ETH_TX_PKT_WR is filled in a wrong way,
we must copy the length of all the cpls encapsulated in fw
work request. In the xmit path we missed adding the length
of CPL_TX_PKT_CORE but we added the length of WR_HDR and it
worked because WR_HDR and CPL_TX_PKT_CORE are of same length.
Add the length of cpl_tx_pkt_core not WR_HDR's. This also
fixes the lso cpl errors for udp tunnels

Fixes: d0a1299c6bf7 ("cxgb4: add support for vxlan segmentation offload")
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/sge.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c
index 0f87e973a158..276f22357f81 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c
@@ -1411,8 +1411,9 @@ out_free:	dev_kfree_skb_any(skb);
 	end = (u64 *)wr + flits;
 
 	len = immediate ? skb->len : 0;
+	len += sizeof(*cpl);
 	if (ssi->gso_size) {
-		struct cpl_tx_pkt_lso *lso = (void *)wr;
+		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
 		bool v6 = (ssi->gso_type & SKB_GSO_TCPV6) != 0;
 		int l3hdr_len = skb_network_header_len(skb);
 		int eth_xtra_len = skb_network_offset(skb) - ETH_HLEN;
@@ -1442,20 +1443,19 @@ out_free:	dev_kfree_skb_any(skb);
 			if (skb->ip_summed == CHECKSUM_PARTIAL)
 				cntrl = hwcsum(adap->params.chip, skb);
 		} else {
-			lso->c.lso_ctrl = htonl(LSO_OPCODE_V(CPL_TX_PKT_LSO) |
-					  LSO_FIRST_SLICE_F | LSO_LAST_SLICE_F |
-					  LSO_IPV6_V(v6) |
-					  LSO_ETHHDR_LEN_V(eth_xtra_len / 4) |
-					  LSO_IPHDR_LEN_V(l3hdr_len / 4) |
-					  LSO_TCPHDR_LEN_V(tcp_hdr(skb)->doff));
-			lso->c.ipid_ofst = htons(0);
-			lso->c.mss = htons(ssi->gso_size);
-			lso->c.seqno_offset = htonl(0);
+			lso->lso_ctrl = htonl(LSO_OPCODE_V(CPL_TX_PKT_LSO) |
+					LSO_FIRST_SLICE_F | LSO_LAST_SLICE_F |
+					LSO_IPV6_V(v6) |
+					LSO_ETHHDR_LEN_V(eth_xtra_len / 4) |
+					LSO_IPHDR_LEN_V(l3hdr_len / 4) |
+					LSO_TCPHDR_LEN_V(tcp_hdr(skb)->doff));
+			lso->ipid_ofst = htons(0);
+			lso->mss = htons(ssi->gso_size);
+			lso->seqno_offset = htonl(0);
 			if (is_t4(adap->params.chip))
-				lso->c.len = htonl(skb->len);
+				lso->len = htonl(skb->len);
 			else
-				lso->c.len =
-					htonl(LSO_T5_XFER_SIZE_V(skb->len));
+				lso->len = htonl(LSO_T5_XFER_SIZE_V(skb->len));
 			cpl = (void *)(lso + 1);
 
 			if (CHELSIO_CHIP_VERSION(adap->params.chip)
@@ -1484,7 +1484,6 @@ out_free:	dev_kfree_skb_any(skb);
 		q->tso++;
 		q->tx_cso += ssi->gso_segs;
 	} else {
-		len += sizeof(*cpl);
 		if (ptp_enabled)
 			op = FW_PTP_TX_PKT_WR;
 		else
@@ -1538,7 +1537,7 @@ out_free:	dev_kfree_skb_any(skb);
 		if (last_desc >= q->q.size)
 			last_desc -= q->q.size;
 		q->q.sdesc[last_desc].skb = skb;
-		q->q.sdesc[last_desc].sgl = (struct ulptx_sgl *)(cpl + 1);
+		q->q.sdesc[last_desc].sgl = (struct ulptx_sgl *)sgl;
 	}
 
 	txq_advance(&q->q, ndesc);
-- 
cgit v1.2.3-59-g8ed1b


From 44c752fe584d8b9f6e0756ecffa8691677471862 Mon Sep 17 00:00:00 2001
From: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Date: Mon, 21 May 2018 19:07:50 +0530
Subject: vmcore: move get_vmcore_size out of __init

Fix below build warning:

WARNING: vmlinux.o(.text+0x422bb8): Section mismatch in reference from
the function vmcore_add_device_dump() to the function
.init.text:get_vmcore_size.constprop.5()

The function vmcore_add_device_dump() references
the function __init get_vmcore_size.constprop.5().
This is often because vmcore_add_device_dump lacks a __init
annotation or the annotation of get_vmcore_size.constprop.5 is wrong.

Fixes: 7efe48df8a3d ("vmcore: append device dumps to vmcore as elf notes")
Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/proc/vmcore.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 247c3499e5bd..cfb6674331fd 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -649,8 +649,8 @@ static struct vmcore* __init get_new_element(void)
 	return kzalloc(sizeof(struct vmcore), GFP_KERNEL);
 }
 
-static u64 __init get_vmcore_size(size_t elfsz, size_t elfnotesegsz,
-				  struct list_head *vc_list)
+static u64 get_vmcore_size(size_t elfsz, size_t elfnotesegsz,
+			   struct list_head *vc_list)
 {
 	u64 size;
 	struct vmcore *m;
-- 
cgit v1.2.3-59-g8ed1b


From b4eb7393683d70976a0736b37f1562e8421e3120 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Mon, 21 May 2018 11:45:51 -0700
Subject: ti: ethernet: cpdma: Use correct format for genpool_*

Now that we can compile davinci_cpdma.c on 64-bit hosts, we can see that
the format used for printing a size_t type is incorrect, use %zd
accordingly.

Fixes: aeec3021043b ("net: ethernet: ti: cpdma: remove used_desc counter")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/davinci_cpdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c b/drivers/net/ethernet/ti/davinci_cpdma.c
index 31ae04117f0a..9ec49213de5e 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.c
+++ b/drivers/net/ethernet/ti/davinci_cpdma.c
@@ -191,7 +191,7 @@ static void cpdma_desc_pool_destroy(struct cpdma_ctlr *ctlr)
 		return;
 
 	WARN(gen_pool_size(pool->gen_pool) != gen_pool_avail(pool->gen_pool),
-	     "cpdma_desc_pool size %d != avail %d",
+	     "cpdma_desc_pool size %zd != avail %zd",
 	     gen_pool_size(pool->gen_pool),
 	     gen_pool_avail(pool->gen_pool));
 	if (pool->cpumap)
-- 
cgit v1.2.3-59-g8ed1b


From ea5ec9fc9eb8bb1eed5ff12dcec328dec918664f Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Mon, 21 May 2018 11:45:52 -0700
Subject: net: ethernet: ti: cpts: Fix timestamp print

On 64-bit hosts we will get the following warning:

drivers/net/ethernet/ti/cpts.c: In function 'cpts_overflow_check':
drivers/net/ethernet/ti/cpts.c:297:11: warning: format '%lld' expects
argument of type 'long long int', but argument 3 has type
'__kernel_time_t {aka long int}' [-Wformat=]
  pr_debug("cpts overflow check at %lld.%09lu\n", ts.tv_sec,
ts.tv_nsec);

Fix this by using an appropriate casting that works on all bit sizes.

Fixes: a5c79c26e168 ("ptp: cpts: convert to the 64 bit get/set time methods.")
Fixes: 87c0e764d43a ("cpts: introduce time stamping code and a PTP hardware clock.")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/cpts.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c
index 7842f094f2ef..6f63c8729afc 100644
--- a/drivers/net/ethernet/ti/cpts.c
+++ b/drivers/net/ethernet/ti/cpts.c
@@ -294,7 +294,8 @@ static long cpts_overflow_check(struct ptp_clock_info *ptp)
 		delay = CPTS_SKB_TX_WORK_TIMEOUT;
 	spin_unlock_irqrestore(&cpts->lock, flags);
 
-	pr_debug("cpts overflow check at %lld.%09lu\n", ts.tv_sec, ts.tv_nsec);
+	pr_debug("cpts overflow check at %lld.%09ld\n",
+		 (long long)ts.tv_sec, ts.tv_nsec);
 	return (long)delay;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From bf2ce3fdf314cd181908b55a4d6c58817674a062 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Mon, 21 May 2018 11:45:53 -0700
Subject: net: ethernet: ti: cpsw: Fix cpsw_add_ch_strings() printk format

When building on a 64-bit host we will get the following warning:

drivers/net/ethernet/ti/cpsw.c: In function 'cpsw_add_ch_strings':
drivers/net/ethernet/ti/cpsw.c:1284:19: warning: format '%d' expects
argument of type 'int', but argument 5 has type 'long unsigned int'
[-Wformat=]
     "%s DMA chan %d: %s", rx_dir ? "Rx" : "Tx",
                  ~^
                  %ld

Fix this by using an %ld format and casting to long.

Fixes: e05107e6b747 ("net: ethernet: ti: cpsw: add multi queue support")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/cpsw.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index a7285dddfd29..643cd2d9dfb6 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -1281,8 +1281,8 @@ static void cpsw_add_ch_strings(u8 **p, int ch_num, int rx_dir)
 	for (i = 0; i < ch_stats_len; i++) {
 		line = i % CPSW_STATS_CH_LEN;
 		snprintf(*p, ETH_GSTRING_LEN,
-			 "%s DMA chan %d: %s", rx_dir ? "Rx" : "Tx",
-			 i / CPSW_STATS_CH_LEN,
+			 "%s DMA chan %ld: %s", rx_dir ? "Rx" : "Tx",
+			 (long)(i / CPSW_STATS_CH_LEN),
 			 cpsw_gstrings_ch_stats[line].stat_string);
 		*p += ETH_GSTRING_LEN;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 5a04e8f81a4f55ce1c2b7b525744a187c99ba302 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Mon, 21 May 2018 11:45:54 -0700
Subject: net: ethernet: davinci_emac: Fix printing of base address

Use %pa which is the correct formatter to print a physical address,
instead of %p which is just a pointer.

Fixes: a6286ee630f6 ("net: Add TI DaVinci EMAC driver")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/davinci_emac.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c
index abceea802ea1..be0fec17d95d 100644
--- a/drivers/net/ethernet/ti/davinci_emac.c
+++ b/drivers/net/ethernet/ti/davinci_emac.c
@@ -1930,8 +1930,8 @@ static int davinci_emac_probe(struct platform_device *pdev)
 
 	if (netif_msg_probe(priv)) {
 		dev_notice(&pdev->dev, "DaVinci EMAC Probe found device "
-			   "(regs: %p, irq: %d)\n",
-			   (void *)priv->emac_base_phys, ndev->irq);
+			   "(regs: %pa, irq: %d)\n",
+			   &priv->emac_base_phys, ndev->irq);
 	}
 	pm_runtime_put(&pdev->dev);
 
-- 
cgit v1.2.3-59-g8ed1b


From c79c3850440c1535b845e96e7e5ff0139ba479e6 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Mon, 21 May 2018 11:45:55 -0700
Subject: ti: ethernet: davinci: Fix cast to int warnings

Now that we can compile test this driver on 64-bit hosts, we get some
warnings about how a pointer/address is written/read to/from a register
(sw_token). Fix this by doing the appropriate conversions, we cannot
possibly have the driver work on 64-bit hosts the way the tokens are
managed though, since the registers being written to a 32-bit only.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/davinci_cpdma.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c b/drivers/net/ethernet/ti/davinci_cpdma.c
index 9ec49213de5e..cdbddf16dd29 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.c
+++ b/drivers/net/ethernet/ti/davinci_cpdma.c
@@ -1080,7 +1080,7 @@ int cpdma_chan_submit(struct cpdma_chan *chan, void *token, void *data,
 	writel_relaxed(buffer, &desc->hw_buffer);
 	writel_relaxed(len, &desc->hw_len);
 	writel_relaxed(mode | len, &desc->hw_mode);
-	writel_relaxed(token, &desc->sw_token);
+	writel_relaxed((uintptr_t)token, &desc->sw_token);
 	writel_relaxed(buffer, &desc->sw_buffer);
 	writel_relaxed(len, &desc->sw_len);
 	desc_read(desc, sw_len);
@@ -1121,15 +1121,15 @@ static void __cpdma_chan_free(struct cpdma_chan *chan,
 	struct cpdma_desc_pool		*pool = ctlr->pool;
 	dma_addr_t			buff_dma;
 	int				origlen;
-	void				*token;
+	uintptr_t			token;
 
-	token      = (void *)desc_read(desc, sw_token);
+	token      = desc_read(desc, sw_token);
 	buff_dma   = desc_read(desc, sw_buffer);
 	origlen    = desc_read(desc, sw_len);
 
 	dma_unmap_single(ctlr->dev, buff_dma, origlen, chan->dir);
 	cpdma_desc_free(pool, desc, 1);
-	(*chan->handler)(token, outlen, status);
+	(*chan->handler)((void *)token, outlen, status);
 }
 
 static int __cpdma_chan_process(struct cpdma_chan *chan)
-- 
cgit v1.2.3-59-g8ed1b


From ba8f566a5be07dae46c2e562f4243c687ba6dac9 Mon Sep 17 00:00:00 2001
From: Denis Kenzior <denkenz@gmail.com>
Date: Mon, 21 May 2018 19:21:42 -0500
Subject: nl80211: Fix compilation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 7ea3e110f2f8ba23f330c2f702f556acd539bcb8 seems to have
introduced:

net/wireless/nl80211.c: In function ‘nl80211_get_station’:
net/wireless/nl80211.c:4802:34: error: incompatible type for argument 1 of ‘cfg80211_sinfo_release_content’
   cfg80211_sinfo_release_content(sinfo);
                                  ^~~~~
In file included from net/wireless/nl80211.c:24:0:
./include/net/cfg80211.h:5721:20: note: expected ‘struct station_info *’ but argument is of type ‘struct station_info’
 static inline void cfg80211_sinfo_release_content(struct station_info *sinfo)
                    ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Fixes: 7ea3e110f2f8 ("cfg80211: release station info tidstats where needed")
Signed-off-by: Denis Kenzior <denkenz@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index e4a52a2b5e65..462e028ad452 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4799,7 +4799,7 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info)
 
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
 	if (!msg) {
-		cfg80211_sinfo_release_content(sinfo);
+		cfg80211_sinfo_release_content(&sinfo);
 		return -ENOMEM;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 959b71db53e310bc63be22bfa3401aef30c6035d Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Tue, 22 May 2018 09:34:56 +0200
Subject: xsk: remove rebind support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Supporting rebind, i.e. after a successful bind the process can call
bind again without closing the socket, makes the AF_XDP setup state
machine more complex. Constrain the state space, by not supporting
rebind.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/xdp/xsk.c | 30 +++++++++---------------------
 1 file changed, 9 insertions(+), 21 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 817340f7725d..cb1acd7009f4 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -227,14 +227,6 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
 	return 0;
 }
 
-static void __xsk_release(struct xdp_sock *xs)
-{
-	/* Wait for driver to stop using the xdp socket. */
-	synchronize_net();
-
-	dev_put(xs->dev);
-}
-
 static int xsk_release(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
@@ -251,7 +243,9 @@ static int xsk_release(struct socket *sock)
 	local_bh_enable();
 
 	if (xs->dev) {
-		__xsk_release(xs);
+		/* Wait for driver to stop using the xdp socket. */
+		synchronize_net();
+		dev_put(xs->dev);
 		xs->dev = NULL;
 	}
 
@@ -285,9 +279,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 {
 	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
 	struct sock *sk = sock->sk;
-	struct net_device *dev, *dev_curr;
 	struct xdp_sock *xs = xdp_sk(sk);
-	struct xdp_umem *old_umem = NULL;
+	struct net_device *dev;
 	int err = 0;
 
 	if (addr_len < sizeof(struct sockaddr_xdp))
@@ -296,7 +289,11 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 		return -EINVAL;
 
 	mutex_lock(&xs->mutex);
-	dev_curr = xs->dev;
+	if (xs->dev) {
+		err = -EBUSY;
+		goto out_release;
+	}
+
 	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
 	if (!dev) {
 		err = -ENODEV;
@@ -343,7 +340,6 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 		}
 
 		xdp_get_umem(umem_xs->umem);
-		old_umem = xs->umem;
 		xs->umem = umem_xs->umem;
 		sockfd_put(sock);
 	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
@@ -355,14 +351,6 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 		xskq_set_umem(xs->umem->cq, &xs->umem->props);
 	}
 
-	/* Rebind? */
-	if (dev_curr && (dev_curr != dev ||
-			 xs->queue_id != sxdp->sxdp_queue_id)) {
-		__xsk_release(xs);
-		if (old_umem)
-			xdp_put_umem(old_umem);
-	}
-
 	xs->dev = dev;
 	xs->queue_id = sxdp->sxdp_queue_id;
 
-- 
cgit v1.2.3-59-g8ed1b


From ad75646c68a6e344a3609f40825855a0e742d04d Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Tue, 22 May 2018 09:34:57 +0200
Subject: xsk: fill hole in struct sockaddr_xdp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move the sxdp_flags up, avoiding a hole in the uapi structure.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/if_xdp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 56db977221d2..c46609a00168 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -17,10 +17,10 @@
 
 struct sockaddr_xdp {
 	__u16 sxdp_family;
+	__u16 sxdp_flags;
 	__u32 sxdp_ifindex;
 	__u32 sxdp_queue_id;
 	__u32 sxdp_shared_umem_fd;
-	__u16 sxdp_flags;
 };
 
 /* XDP socket options */
-- 
cgit v1.2.3-59-g8ed1b


From 2e59dd5e4f6e884d1f7a70acf9845d223429026e Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Tue, 22 May 2018 09:34:58 +0200
Subject: xsk: proper queue id check at bind

Validate the queue id against both Rx and Tx on the netdev. Also, make
sure that the queue exists at xmit time.

Reported-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/xdp/xsk.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index cb1acd7009f4..29707354cf78 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -142,6 +142,11 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
 			goto out;
 		}
 
+		if (xs->queue_id >= xs->dev->real_num_tx_queues) {
+			err = -ENXIO;
+			goto out;
+		}
+
 		skb = sock_alloc_send_skb(sk, len, !need_wait, &err);
 		if (unlikely(!skb)) {
 			err = -EAGAIN;
@@ -305,7 +310,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 		goto out_unlock;
 	}
 
-	if (sxdp->sxdp_queue_id >= dev->num_rx_queues) {
+	if ((xs->rx && sxdp->sxdp_queue_id >= dev->real_num_rx_queues) ||
+	    (xs->tx && sxdp->sxdp_queue_id >= dev->real_num_tx_queues)) {
 		err = -EINVAL;
 		goto out_unlock;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From b3a9e0be436960072ddf327d9d82f50ee2f620e0 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Tue, 22 May 2018 09:34:59 +0200
Subject: xsk: remove explicit ring structure from uapi
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In this commit we remove the explicit ring structure from the the
uapi. It is tricky for an uapi to depend on a certain L1 cache line
size, since it can differ for variants of the same architecture. Now,
we let the user application determine the offsets of the producer,
consumer and descriptors by asking the socket via getsockopt.

A typical flow would be (Rx ring):

  struct xdp_mmap_offsets off;
  struct xdp_desc *ring;
  u32 *prod, *cons;
  void *map;
  ...

  getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);

  map = mmap(NULL, off.rx.desc +
		   NUM_DESCS * sizeof(struct xdp_desc),
		   PROT_READ | PROT_WRITE,
		   MAP_SHARED | MAP_POPULATE, sfd,
		   XDP_PGOFF_RX_RING);
  prod = map + off.rx.producer;
  cons = map + off.rx.consumer;
  ring = map + off.rx.desc;

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/if_xdp.h | 44 ++++++++++++++++++++++----------------------
 net/xdp/xsk.c               | 29 +++++++++++++++++++++++++++++
 net/xdp/xsk_queue.h         | 17 +++++++++++++++++
 3 files changed, 68 insertions(+), 22 deletions(-)

diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index c46609a00168..4737cfe222f5 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -23,13 +23,27 @@ struct sockaddr_xdp {
 	__u32 sxdp_shared_umem_fd;
 };
 
+struct xdp_ring_offset {
+	__u64 producer;
+	__u64 consumer;
+	__u64 desc;
+};
+
+struct xdp_mmap_offsets {
+	struct xdp_ring_offset rx;
+	struct xdp_ring_offset tx;
+	struct xdp_ring_offset fr; /* Fill */
+	struct xdp_ring_offset cr; /* Completion */
+};
+
 /* XDP socket options */
-#define XDP_RX_RING			1
-#define XDP_TX_RING			2
-#define XDP_UMEM_REG			3
-#define XDP_UMEM_FILL_RING		4
-#define XDP_UMEM_COMPLETION_RING	5
-#define XDP_STATISTICS			6
+#define XDP_MMAP_OFFSETS		1
+#define XDP_RX_RING			2
+#define XDP_TX_RING			3
+#define XDP_UMEM_REG			4
+#define XDP_UMEM_FILL_RING		5
+#define XDP_UMEM_COMPLETION_RING	6
+#define XDP_STATISTICS			7
 
 struct xdp_umem_reg {
 	__u64 addr; /* Start of packet data area */
@@ -50,6 +64,7 @@ struct xdp_statistics {
 #define XDP_UMEM_PGOFF_FILL_RING	0x100000000
 #define XDP_UMEM_PGOFF_COMPLETION_RING	0x180000000
 
+/* Rx/Tx descriptor */
 struct xdp_desc {
 	__u32 idx;
 	__u32 len;
@@ -58,21 +73,6 @@ struct xdp_desc {
 	__u8 padding[5];
 };
 
-struct xdp_ring {
-	__u32 producer __attribute__((aligned(64)));
-	__u32 consumer __attribute__((aligned(64)));
-};
-
-/* Used for the RX and TX queues for packets */
-struct xdp_rxtx_ring {
-	struct xdp_ring ptrs;
-	struct xdp_desc desc[0] __attribute__((aligned(64)));
-};
-
-/* Used for the fill and completion queues for buffers */
-struct xdp_umem_ring {
-	struct xdp_ring ptrs;
-	__u32 desc[0] __attribute__((aligned(64)));
-};
+/* UMEM descriptor is __u32 */
 
 #endif /* _LINUX_IF_XDP_H */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 29707354cf78..378dd9287da5 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -489,6 +489,35 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname,
 
 		return 0;
 	}
+	case XDP_MMAP_OFFSETS:
+	{
+		struct xdp_mmap_offsets off;
+
+		if (len < sizeof(off))
+			return -EINVAL;
+
+		off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
+		off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
+		off.rx.desc	= offsetof(struct xdp_rxtx_ring, desc);
+		off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
+		off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
+		off.tx.desc	= offsetof(struct xdp_rxtx_ring, desc);
+
+		off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
+		off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
+		off.fr.desc	= offsetof(struct xdp_umem_ring, desc);
+		off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
+		off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
+		off.cr.desc	= offsetof(struct xdp_umem_ring, desc);
+
+		len = sizeof(off);
+		if (copy_to_user(optval, &off, len))
+			return -EFAULT;
+		if (put_user(len, optlen))
+			return -EFAULT;
+
+		return 0;
+	}
 	default:
 		break;
 	}
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 62e43be407d8..cb8e5be35110 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -13,6 +13,23 @@
 
 #define RX_BATCH_SIZE 16
 
+struct xdp_ring {
+	u32 producer ____cacheline_aligned_in_smp;
+	u32 consumer ____cacheline_aligned_in_smp;
+};
+
+/* Used for the RX and TX queues for packets */
+struct xdp_rxtx_ring {
+	struct xdp_ring ptrs;
+	struct xdp_desc desc[0] ____cacheline_aligned_in_smp;
+};
+
+/* Used for the fill and completion queues for buffers */
+struct xdp_umem_ring {
+	struct xdp_ring ptrs;
+	u32 desc[0] ____cacheline_aligned_in_smp;
+};
+
 struct xsk_queue {
 	struct xdp_umem_props umem_props;
 	u32 ring_mask;
-- 
cgit v1.2.3-59-g8ed1b


From 1c4917da36ed76981cc3c2671b3a44765c02bbc3 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Tue, 22 May 2018 09:35:00 +0200
Subject: samples/bpf: adapt xdpsock to the new uapi
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adapt xdpsock to use the new getsockopt introduced in the previous
commit.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/bpf/xdpsock_user.c | 123 ++++++++++++++++++++++++++++-----------------
 1 file changed, 76 insertions(+), 47 deletions(-)

diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
index 60a882a2296c..e379eac034ac 100644
--- a/samples/bpf/xdpsock_user.c
+++ b/samples/bpf/xdpsock_user.c
@@ -79,7 +79,10 @@ struct xdp_umem_uqueue {
 	u32 cached_cons;
 	u32 mask;
 	u32 size;
-	struct xdp_umem_ring *ring;
+	u32 *producer;
+	u32 *consumer;
+	u32 *ring;
+	void *map;
 };
 
 struct xdp_umem {
@@ -94,7 +97,10 @@ struct xdp_uqueue {
 	u32 cached_cons;
 	u32 mask;
 	u32 size;
-	struct xdp_rxtx_ring *ring;
+	u32 *producer;
+	u32 *consumer;
+	struct xdp_desc *ring;
+	void *map;
 };
 
 struct xdpsock {
@@ -155,7 +161,7 @@ static inline u32 umem_nb_free(struct xdp_umem_uqueue *q, u32 nb)
 		return free_entries;
 
 	/* Refresh the local tail pointer */
-	q->cached_cons = q->ring->ptrs.consumer;
+	q->cached_cons = *q->consumer;
 
 	return q->size - (q->cached_prod - q->cached_cons);
 }
@@ -168,7 +174,7 @@ static inline u32 xq_nb_free(struct xdp_uqueue *q, u32 ndescs)
 		return free_entries;
 
 	/* Refresh the local tail pointer */
-	q->cached_cons = q->ring->ptrs.consumer + q->size;
+	q->cached_cons = *q->consumer + q->size;
 	return q->cached_cons - q->cached_prod;
 }
 
@@ -177,7 +183,7 @@ static inline u32 umem_nb_avail(struct xdp_umem_uqueue *q, u32 nb)
 	u32 entries = q->cached_prod - q->cached_cons;
 
 	if (entries == 0) {
-		q->cached_prod = q->ring->ptrs.producer;
+		q->cached_prod = *q->producer;
 		entries = q->cached_prod - q->cached_cons;
 	}
 
@@ -189,7 +195,7 @@ static inline u32 xq_nb_avail(struct xdp_uqueue *q, u32 ndescs)
 	u32 entries = q->cached_prod - q->cached_cons;
 
 	if (entries == 0) {
-		q->cached_prod = q->ring->ptrs.producer;
+		q->cached_prod = *q->producer;
 		entries = q->cached_prod - q->cached_cons;
 	}
 
@@ -208,12 +214,12 @@ static inline int umem_fill_to_kernel_ex(struct xdp_umem_uqueue *fq,
 	for (i = 0; i < nb; i++) {
 		u32 idx = fq->cached_prod++ & fq->mask;
 
-		fq->ring->desc[idx] = d[i].idx;
+		fq->ring[idx] = d[i].idx;
 	}
 
 	u_smp_wmb();
 
-	fq->ring->ptrs.producer = fq->cached_prod;
+	*fq->producer = fq->cached_prod;
 
 	return 0;
 }
@@ -229,12 +235,12 @@ static inline int umem_fill_to_kernel(struct xdp_umem_uqueue *fq, u32 *d,
 	for (i = 0; i < nb; i++) {
 		u32 idx = fq->cached_prod++ & fq->mask;
 
-		fq->ring->desc[idx] = d[i];
+		fq->ring[idx] = d[i];
 	}
 
 	u_smp_wmb();
 
-	fq->ring->ptrs.producer = fq->cached_prod;
+	*fq->producer = fq->cached_prod;
 
 	return 0;
 }
@@ -248,13 +254,13 @@ static inline size_t umem_complete_from_kernel(struct xdp_umem_uqueue *cq,
 
 	for (i = 0; i < entries; i++) {
 		idx = cq->cached_cons++ & cq->mask;
-		d[i] = cq->ring->desc[idx];
+		d[i] = cq->ring[idx];
 	}
 
 	if (entries > 0) {
 		u_smp_wmb();
 
-		cq->ring->ptrs.consumer = cq->cached_cons;
+		*cq->consumer = cq->cached_cons;
 	}
 
 	return entries;
@@ -270,7 +276,7 @@ static inline int xq_enq(struct xdp_uqueue *uq,
 			 const struct xdp_desc *descs,
 			 unsigned int ndescs)
 {
-	struct xdp_rxtx_ring *r = uq->ring;
+	struct xdp_desc *r = uq->ring;
 	unsigned int i;
 
 	if (xq_nb_free(uq, ndescs) < ndescs)
@@ -279,21 +285,21 @@ static inline int xq_enq(struct xdp_uqueue *uq,
 	for (i = 0; i < ndescs; i++) {
 		u32 idx = uq->cached_prod++ & uq->mask;
 
-		r->desc[idx].idx = descs[i].idx;
-		r->desc[idx].len = descs[i].len;
-		r->desc[idx].offset = descs[i].offset;
+		r[idx].idx = descs[i].idx;
+		r[idx].len = descs[i].len;
+		r[idx].offset = descs[i].offset;
 	}
 
 	u_smp_wmb();
 
-	r->ptrs.producer = uq->cached_prod;
+	*uq->producer = uq->cached_prod;
 	return 0;
 }
 
 static inline int xq_enq_tx_only(struct xdp_uqueue *uq,
 				 __u32 idx, unsigned int ndescs)
 {
-	struct xdp_rxtx_ring *q = uq->ring;
+	struct xdp_desc *r = uq->ring;
 	unsigned int i;
 
 	if (xq_nb_free(uq, ndescs) < ndescs)
@@ -302,14 +308,14 @@ static inline int xq_enq_tx_only(struct xdp_uqueue *uq,
 	for (i = 0; i < ndescs; i++) {
 		u32 idx = uq->cached_prod++ & uq->mask;
 
-		q->desc[idx].idx	= idx + i;
-		q->desc[idx].len	= sizeof(pkt_data) - 1;
-		q->desc[idx].offset	= 0;
+		r[idx].idx	= idx + i;
+		r[idx].len	= sizeof(pkt_data) - 1;
+		r[idx].offset	= 0;
 	}
 
 	u_smp_wmb();
 
-	q->ptrs.producer = uq->cached_prod;
+	*uq->producer = uq->cached_prod;
 	return 0;
 }
 
@@ -317,7 +323,7 @@ static inline int xq_deq(struct xdp_uqueue *uq,
 			 struct xdp_desc *descs,
 			 int ndescs)
 {
-	struct xdp_rxtx_ring *r = uq->ring;
+	struct xdp_desc *r = uq->ring;
 	unsigned int idx;
 	int i, entries;
 
@@ -327,13 +333,13 @@ static inline int xq_deq(struct xdp_uqueue *uq,
 
 	for (i = 0; i < entries; i++) {
 		idx = uq->cached_cons++ & uq->mask;
-		descs[i] = r->desc[idx];
+		descs[i] = r[idx];
 	}
 
 	if (entries > 0) {
 		u_smp_wmb();
 
-		r->ptrs.consumer = uq->cached_cons;
+		*uq->consumer = uq->cached_cons;
 	}
 
 	return entries;
@@ -392,8 +398,10 @@ static size_t gen_eth_frame(char *frame)
 static struct xdp_umem *xdp_umem_configure(int sfd)
 {
 	int fq_size = FQ_NUM_DESCS, cq_size = CQ_NUM_DESCS;
+	struct xdp_mmap_offsets off;
 	struct xdp_umem_reg mr;
 	struct xdp_umem *umem;
+	socklen_t optlen;
 	void *bufs;
 
 	umem = calloc(1, sizeof(*umem));
@@ -413,25 +421,35 @@ static struct xdp_umem *xdp_umem_configure(int sfd)
 	lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &cq_size,
 			   sizeof(int)) == 0);
 
-	umem->fq.ring = mmap(0, sizeof(struct xdp_umem_ring) +
-			     FQ_NUM_DESCS * sizeof(u32),
-			     PROT_READ | PROT_WRITE,
-			     MAP_SHARED | MAP_POPULATE, sfd,
-			     XDP_UMEM_PGOFF_FILL_RING);
-	lassert(umem->fq.ring != MAP_FAILED);
+	optlen = sizeof(off);
+	lassert(getsockopt(sfd, SOL_XDP, XDP_MMAP_OFFSETS, &off,
+			   &optlen) == 0);
+
+	umem->fq.map = mmap(0, off.fr.desc +
+			    FQ_NUM_DESCS * sizeof(u32),
+			    PROT_READ | PROT_WRITE,
+			    MAP_SHARED | MAP_POPULATE, sfd,
+			    XDP_UMEM_PGOFF_FILL_RING);
+	lassert(umem->fq.map != MAP_FAILED);
 
 	umem->fq.mask = FQ_NUM_DESCS - 1;
 	umem->fq.size = FQ_NUM_DESCS;
+	umem->fq.producer = umem->fq.map + off.fr.producer;
+	umem->fq.consumer = umem->fq.map + off.fr.consumer;
+	umem->fq.ring = umem->fq.map + off.fr.desc;
 
-	umem->cq.ring = mmap(0, sizeof(struct xdp_umem_ring) +
+	umem->cq.map = mmap(0, off.cr.desc +
 			     CQ_NUM_DESCS * sizeof(u32),
 			     PROT_READ | PROT_WRITE,
 			     MAP_SHARED | MAP_POPULATE, sfd,
 			     XDP_UMEM_PGOFF_COMPLETION_RING);
-	lassert(umem->cq.ring != MAP_FAILED);
+	lassert(umem->cq.map != MAP_FAILED);
 
 	umem->cq.mask = CQ_NUM_DESCS - 1;
 	umem->cq.size = CQ_NUM_DESCS;
+	umem->cq.producer = umem->cq.map + off.cr.producer;
+	umem->cq.consumer = umem->cq.map + off.cr.consumer;
+	umem->cq.ring = umem->cq.map + off.cr.desc;
 
 	umem->frames = (char (*)[FRAME_SIZE])bufs;
 	umem->fd = sfd;
@@ -449,9 +467,11 @@ static struct xdp_umem *xdp_umem_configure(int sfd)
 static struct xdpsock *xsk_configure(struct xdp_umem *umem)
 {
 	struct sockaddr_xdp sxdp = {};
+	struct xdp_mmap_offsets off;
 	int sfd, ndescs = NUM_DESCS;
 	struct xdpsock *xsk;
 	bool shared = true;
+	socklen_t optlen;
 	u32 i;
 
 	sfd = socket(PF_XDP, SOCK_RAW, 0);
@@ -474,15 +494,18 @@ static struct xdpsock *xsk_configure(struct xdp_umem *umem)
 			   &ndescs, sizeof(int)) == 0);
 	lassert(setsockopt(sfd, SOL_XDP, XDP_TX_RING,
 			   &ndescs, sizeof(int)) == 0);
+	optlen = sizeof(off);
+	lassert(getsockopt(sfd, SOL_XDP, XDP_MMAP_OFFSETS, &off,
+			   &optlen) == 0);
 
 	/* Rx */
-	xsk->rx.ring = mmap(NULL,
-			    sizeof(struct xdp_ring) +
-			    NUM_DESCS * sizeof(struct xdp_desc),
-			    PROT_READ | PROT_WRITE,
-			    MAP_SHARED | MAP_POPULATE, sfd,
-			    XDP_PGOFF_RX_RING);
-	lassert(xsk->rx.ring != MAP_FAILED);
+	xsk->rx.map = mmap(NULL,
+			   off.rx.desc +
+			   NUM_DESCS * sizeof(struct xdp_desc),
+			   PROT_READ | PROT_WRITE,
+			   MAP_SHARED | MAP_POPULATE, sfd,
+			   XDP_PGOFF_RX_RING);
+	lassert(xsk->rx.map != MAP_FAILED);
 
 	if (!shared) {
 		for (i = 0; i < NUM_DESCS / 2; i++)
@@ -491,19 +514,25 @@ static struct xdpsock *xsk_configure(struct xdp_umem *umem)
 	}
 
 	/* Tx */
-	xsk->tx.ring = mmap(NULL,
-			 sizeof(struct xdp_ring) +
-			 NUM_DESCS * sizeof(struct xdp_desc),
-			 PROT_READ | PROT_WRITE,
-			 MAP_SHARED | MAP_POPULATE, sfd,
-			 XDP_PGOFF_TX_RING);
-	lassert(xsk->tx.ring != MAP_FAILED);
+	xsk->tx.map = mmap(NULL,
+			   off.tx.desc +
+			   NUM_DESCS * sizeof(struct xdp_desc),
+			   PROT_READ | PROT_WRITE,
+			   MAP_SHARED | MAP_POPULATE, sfd,
+			   XDP_PGOFF_TX_RING);
+	lassert(xsk->tx.map != MAP_FAILED);
 
 	xsk->rx.mask = NUM_DESCS - 1;
 	xsk->rx.size = NUM_DESCS;
+	xsk->rx.producer = xsk->rx.map + off.rx.producer;
+	xsk->rx.consumer = xsk->rx.map + off.rx.consumer;
+	xsk->rx.ring = xsk->rx.map + off.rx.desc;
 
 	xsk->tx.mask = NUM_DESCS - 1;
 	xsk->tx.size = NUM_DESCS;
+	xsk->tx.producer = xsk->tx.map + off.tx.producer;
+	xsk->tx.consumer = xsk->tx.map + off.tx.consumer;
+	xsk->tx.ring = xsk->tx.map + off.tx.desc;
 
 	sxdp.sxdp_family = PF_XDP;
 	sxdp.sxdp_ifindex = opt_ifindex;
-- 
cgit v1.2.3-59-g8ed1b


From 37b076933a8e38e72ffd3c40d3eeb5949f38baf3 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Tue, 22 May 2018 09:35:01 +0200
Subject: xsk: add missing write- and data-dependency barrier
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Here, we add a missing write-barrier, and use READ_ONCE for the
data-dependency barrier.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/xdp/xsk.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 378dd9287da5..01f010ec0c05 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -228,6 +228,8 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
 	if (!q)
 		return -ENOMEM;
 
+	/* Make sure queue is ready before it can be seen by others */
+	smp_wmb();
 	*queue = q;
 	return 0;
 }
@@ -532,21 +534,23 @@ static int xsk_mmap(struct file *file, struct socket *sock,
 	unsigned long size = vma->vm_end - vma->vm_start;
 	struct xdp_sock *xs = xdp_sk(sock->sk);
 	struct xsk_queue *q = NULL;
+	struct xdp_umem *umem;
 	unsigned long pfn;
 	struct page *qpg;
 
 	if (offset == XDP_PGOFF_RX_RING) {
-		q = xs->rx;
+		q = READ_ONCE(xs->rx);
 	} else if (offset == XDP_PGOFF_TX_RING) {
-		q = xs->tx;
+		q = READ_ONCE(xs->tx);
 	} else {
-		if (!xs->umem)
+		umem = READ_ONCE(xs->umem);
+		if (!umem)
 			return -EINVAL;
 
 		if (offset == XDP_UMEM_PGOFF_FILL_RING)
-			q = xs->umem->fq;
+			q = READ_ONCE(umem->fq);
 		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
-			q = xs->umem->cq;
+			q = READ_ONCE(umem->cq);
 	}
 
 	if (!q)
-- 
cgit v1.2.3-59-g8ed1b


From a49049ea257656f27ffe424224f4a362b8b1234a Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Tue, 22 May 2018 09:35:02 +0200
Subject: xsk: simplified umem setup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As suggested by Daniel Borkmann, the umem setup code was a too
defensive and complex. Here, we reduce the number of checks. Also, the
memory pinning is now folded into the umem creation, and we do correct
locking.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/xdp/xdp_umem.c | 79 ++++++++++++++++++++++++++----------------------------
 net/xdp/xdp_umem.h |  3 +--
 net/xdp/xsk.c      | 24 ++++++++---------
 3 files changed, 51 insertions(+), 55 deletions(-)

diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index c47909c74899..faa6ffbaf6ab 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -16,39 +16,25 @@
 
 #define XDP_UMEM_MIN_FRAME_SIZE 2048
 
-int xdp_umem_create(struct xdp_umem **umem)
-{
-	*umem = kzalloc(sizeof(**umem), GFP_KERNEL);
-
-	if (!*umem)
-		return -ENOMEM;
-
-	return 0;
-}
-
 static void xdp_umem_unpin_pages(struct xdp_umem *umem)
 {
 	unsigned int i;
 
-	if (umem->pgs) {
-		for (i = 0; i < umem->npgs; i++) {
-			struct page *page = umem->pgs[i];
-
-			set_page_dirty_lock(page);
-			put_page(page);
-		}
+	for (i = 0; i < umem->npgs; i++) {
+		struct page *page = umem->pgs[i];
 
-		kfree(umem->pgs);
-		umem->pgs = NULL;
+		set_page_dirty_lock(page);
+		put_page(page);
 	}
+
+	kfree(umem->pgs);
+	umem->pgs = NULL;
 }
 
 static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
 {
-	if (umem->user) {
-		atomic_long_sub(umem->npgs, &umem->user->locked_vm);
-		free_uid(umem->user);
-	}
+	atomic_long_sub(umem->npgs, &umem->user->locked_vm);
+	free_uid(umem->user);
 }
 
 static void xdp_umem_release(struct xdp_umem *umem)
@@ -66,22 +52,18 @@ static void xdp_umem_release(struct xdp_umem *umem)
 		umem->cq = NULL;
 	}
 
-	if (umem->pgs) {
-		xdp_umem_unpin_pages(umem);
-
-		task = get_pid_task(umem->pid, PIDTYPE_PID);
-		put_pid(umem->pid);
-		if (!task)
-			goto out;
-		mm = get_task_mm(task);
-		put_task_struct(task);
-		if (!mm)
-			goto out;
+	xdp_umem_unpin_pages(umem);
 
-		mmput(mm);
-		umem->pgs = NULL;
-	}
+	task = get_pid_task(umem->pid, PIDTYPE_PID);
+	put_pid(umem->pid);
+	if (!task)
+		goto out;
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
 
+	mmput(mm);
 	xdp_umem_unaccount_pages(umem);
 out:
 	kfree(umem);
@@ -167,16 +149,13 @@ static int xdp_umem_account_pages(struct xdp_umem *umem)
 	return 0;
 }
 
-int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
+static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 {
 	u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom;
 	u64 addr = mr->addr, size = mr->len;
 	unsigned int nframes, nfpp;
 	int size_chk, err;
 
-	if (!umem)
-		return -EINVAL;
-
 	if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) {
 		/* Strictly speaking we could support this, if:
 		 * - huge pages, or*
@@ -245,6 +224,24 @@ out:
 	return err;
 }
 
+struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr)
+{
+	struct xdp_umem *umem;
+	int err;
+
+	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+	if (!umem)
+		return ERR_PTR(-ENOMEM);
+
+	err = xdp_umem_reg(umem, mr);
+	if (err) {
+		kfree(umem);
+		return ERR_PTR(err);
+	}
+
+	return umem;
+}
+
 bool xdp_umem_validate_queues(struct xdp_umem *umem)
 {
 	return umem->fq && umem->cq;
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index 70fe225baa51..9802287ff19d 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -50,9 +50,8 @@ static inline char *xdp_umem_get_data_with_headroom(struct xdp_umem *umem,
 }
 
 bool xdp_umem_validate_queues(struct xdp_umem *umem);
-int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr);
 void xdp_get_umem(struct xdp_umem *umem);
 void xdp_put_umem(struct xdp_umem *umem);
-int xdp_umem_create(struct xdp_umem **umem);
+struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr);
 
 #endif /* XDP_UMEM_H_ */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 01f010ec0c05..cce0e4f8a536 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -406,25 +406,23 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 		struct xdp_umem_reg mr;
 		struct xdp_umem *umem;
 
-		if (xs->umem)
-			return -EBUSY;
-
 		if (copy_from_user(&mr, optval, sizeof(mr)))
 			return -EFAULT;
 
 		mutex_lock(&xs->mutex);
-		err = xdp_umem_create(&umem);
+		if (xs->umem) {
+			mutex_unlock(&xs->mutex);
+			return -EBUSY;
+		}
 
-		err = xdp_umem_reg(umem, &mr);
-		if (err) {
-			kfree(umem);
+		umem = xdp_umem_create(&mr);
+		if (IS_ERR(umem)) {
 			mutex_unlock(&xs->mutex);
-			return err;
+			return PTR_ERR(umem);
 		}
 
 		/* Make sure umem is ready before it can be seen by others */
 		smp_wmb();
-
 		xs->umem = umem;
 		mutex_unlock(&xs->mutex);
 		return 0;
@@ -435,13 +433,15 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 		struct xsk_queue **q;
 		int entries;
 
-		if (!xs->umem)
-			return -EINVAL;
-
 		if (copy_from_user(&entries, optval, sizeof(entries)))
 			return -EFAULT;
 
 		mutex_lock(&xs->mutex);
+		if (!xs->umem) {
+			mutex_unlock(&xs->mutex);
+			return -EINVAL;
+		}
+
 		q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
 			&xs->umem->cq;
 		err = xsk_init_queue(entries, q, true);
-- 
cgit v1.2.3-59-g8ed1b


From d3b42f1422d9c050bf5a2c660c045af2ab5d3e72 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Tue, 22 May 2018 09:35:03 +0200
Subject: xsk: convert atomic_t to refcount_t
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce refcount_t, in favor of atomic_t.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/xdp/xdp_umem.c | 6 +++---
 net/xdp/xdp_umem.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index faa6ffbaf6ab..87998818116f 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -78,7 +78,7 @@ static void xdp_umem_release_deferred(struct work_struct *work)
 
 void xdp_get_umem(struct xdp_umem *umem)
 {
-	atomic_inc(&umem->users);
+	refcount_inc(&umem->users);
 }
 
 void xdp_put_umem(struct xdp_umem *umem)
@@ -86,7 +86,7 @@ void xdp_put_umem(struct xdp_umem *umem)
 	if (!umem)
 		return;
 
-	if (atomic_dec_and_test(&umem->users)) {
+	if (refcount_dec_and_test(&umem->users)) {
 		INIT_WORK(&umem->work, xdp_umem_release_deferred);
 		schedule_work(&umem->work);
 	}
@@ -206,7 +206,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 	umem->frame_size_log2 = ilog2(frame_size);
 	umem->nfpp_mask = nfpp - 1;
 	umem->nfpplog2 = ilog2(nfpp);
-	atomic_set(&umem->users, 1);
+	refcount_set(&umem->users, 1);
 
 	err = xdp_umem_account_pages(umem);
 	if (err)
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index 9802287ff19d..0881cf456230 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -27,7 +27,7 @@ struct xdp_umem {
 	struct pid *pid;
 	unsigned long address;
 	size_t size;
-	atomic_t users;
+	refcount_t users;
 	struct work_struct work;
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 50d889b1789458d1f7d7f40ff4f628b670047773 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 21 May 2018 09:08:13 -0700
Subject: net/ipv4: Add helper to return path MTU based on fib result

Determine path MTU from a FIB lookup result. Logic is a distillation of
ip_dst_mtu_maybe_forward.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/ip_fib.h |  2 ++
 net/ipv4/route.c     | 31 +++++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 81d0f2107ff1..69c91d1934c1 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -449,4 +449,6 @@ static inline void fib_proc_exit(struct net *net)
 }
 #endif
 
+u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr);
+
 #endif  /* _NET_FIB_H */
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 29268efad247..ac3b22bc51b2 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1352,6 +1352,37 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
 	return NULL;
 }
 
+/* MTU selection:
+ * 1. mtu on route is locked - use it
+ * 2. mtu from nexthop exception
+ * 3. mtu from egress device
+ */
+
+u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
+{
+	struct fib_info *fi = res->fi;
+	struct fib_nh *nh = &fi->fib_nh[res->nh_sel];
+	struct net_device *dev = nh->nh_dev;
+	u32 mtu = 0;
+
+	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
+	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
+		mtu = fi->fib_mtu;
+
+	if (likely(!mtu)) {
+		struct fib_nh_exception *fnhe;
+
+		fnhe = find_exception(nh, daddr);
+		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
+			mtu = fnhe->fnhe_pmtu;
+	}
+
+	if (likely(!mtu))
+		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
+
+	return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu);
+}
+
 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
 			      __be32 daddr, const bool do_cache)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 901731b882d77dc53897aec45015ced42d56fe4c Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 21 May 2018 09:08:14 -0700
Subject: net/ipv6: Add helper to return path MTU based on fib result

Determine path MTU from a FIB lookup result. Logic is based on
ip6_dst_mtu_forward plus lookup of nexthop exception.

Add ip6_dst_mtu_forward to ipv6_stubs to handle access by core
bpf code.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/addrconf.h   |  2 ++
 include/net/ip6_fib.h    |  6 ++++++
 include/net/ip6_route.h  |  3 +++
 net/ipv6/addrconf_core.c |  8 ++++++++
 net/ipv6/af_inet6.c      |  1 +
 net/ipv6/route.c         | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 68 insertions(+)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index ff766ab207e0..c07d4dd09361 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -236,6 +236,8 @@ struct ipv6_stub {
 						   struct flowi6 *fl6, int oif,
 						   const struct sk_buff *skb,
 						   int strict);
+	u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr,
+				 struct in6_addr *saddr);
 
 	void (*udpv6_encap_enable)(void);
 	void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr,
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index cc70f6da8462..7897efe80727 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -412,6 +412,12 @@ static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i)
 	return f6i->fib6_nh.nh_dev;
 }
 
+static inline
+struct lwtunnel_state *fib6_info_nh_lwt(const struct fib6_info *f6i)
+{
+	return f6i->fib6_nh.nh_lwtstate;
+}
+
 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
 		     unsigned int flags);
 
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 4cf1ef935ed9..7b9c82de11cc 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -300,6 +300,9 @@ static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
 	return mtu;
 }
 
+u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
+		      struct in6_addr *saddr);
+
 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
 				   struct net_device *dev, struct sk_buff *skb,
 				   const void *daddr);
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index 2fe754fd4f5e..5cd0029d930e 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -161,12 +161,20 @@ eafnosupport_fib6_multipath_select(const struct net *net, struct fib6_info *f6i,
 	return f6i;
 }
 
+static u32
+eafnosupport_ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
+			       struct in6_addr *saddr)
+{
+	return 0;
+}
+
 const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
 	.ipv6_dst_lookup   = eafnosupport_ipv6_dst_lookup,
 	.fib6_get_table    = eafnosupport_fib6_get_table,
 	.fib6_table_lookup = eafnosupport_fib6_table_lookup,
 	.fib6_lookup       = eafnosupport_fib6_lookup,
 	.fib6_multipath_select = eafnosupport_fib6_multipath_select,
+	.ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6,
 };
 EXPORT_SYMBOL_GPL(ipv6_stub);
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 50de8b0d4f70..9ed0eae91758 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -894,6 +894,7 @@ static const struct ipv6_stub ipv6_stub_impl = {
 	.fib6_table_lookup = fib6_table_lookup,
 	.fib6_lookup       = fib6_lookup,
 	.fib6_multipath_select = fib6_multipath_select,
+	.ip6_mtu_from_fib6 = ip6_mtu_from_fib6,
 	.udpv6_encap_enable = udpv6_encap_enable,
 	.ndisc_send_na = ndisc_send_na,
 	.nd_tbl	= &nd_tbl,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index cc24ed3bc334..dc5d5c84dbef 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2603,6 +2603,54 @@ out:
 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
 }
 
+/* MTU selection:
+ * 1. mtu on route is locked - use it
+ * 2. mtu from nexthop exception
+ * 3. mtu from egress device
+ *
+ * based on ip6_dst_mtu_forward and exception logic of
+ * rt6_find_cached_rt; called with rcu_read_lock
+ */
+u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
+		      struct in6_addr *saddr)
+{
+	struct rt6_exception_bucket *bucket;
+	struct rt6_exception *rt6_ex;
+	struct in6_addr *src_key;
+	struct inet6_dev *idev;
+	u32 mtu = 0;
+
+	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
+		mtu = f6i->fib6_pmtu;
+		if (mtu)
+			goto out;
+	}
+
+	src_key = NULL;
+#ifdef CONFIG_IPV6_SUBTREES
+	if (f6i->fib6_src.plen)
+		src_key = saddr;
+#endif
+
+	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
+	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
+	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
+		mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
+
+	if (likely(!mtu)) {
+		struct net_device *dev = fib6_info_nh_dev(f6i);
+
+		mtu = IPV6_MIN_MTU;
+		idev = __in6_dev_get(dev);
+		if (idev && idev->cnf.mtu6 > mtu)
+			mtu = idev->cnf.mtu6;
+	}
+
+	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
+out:
+	return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
+}
+
 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
 				  struct flowi6 *fl6)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 4f74fede40df8dbdb5261d50682491126675aac3 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 21 May 2018 09:08:15 -0700
Subject: bpf: Add mtu checking to FIB forwarding helper

Add check that egress MTU can handle packet to be forwarded. If
the MTU is less than the packet length, return 0 meaning the
packet is expected to continue up the stack for help - eg.,
fragmenting the packet or sending an ICMP.

The XDP path needs to leverage the FIB entry for an MTU on the
route spec or an exception entry for a given destination. The
skb path lets is_skb_forwardable decide if the packet can be
sent.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 42 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 35 insertions(+), 7 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index aec5ebafb262..ba3ff5aa575a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4089,7 +4089,7 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
 
 #if IS_ENABLED(CONFIG_INET)
 static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
-			       u32 flags)
+			       u32 flags, bool check_mtu)
 {
 	struct in_device *in_dev;
 	struct neighbour *neigh;
@@ -4098,6 +4098,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	struct fib_nh *nh;
 	struct flowi4 fl4;
 	int err;
+	u32 mtu;
 
 	dev = dev_get_by_index_rcu(net, params->ifindex);
 	if (unlikely(!dev))
@@ -4149,6 +4150,12 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	if (res.fi->fib_nhs > 1)
 		fib_select_path(net, &res, &fl4, NULL);
 
+	if (check_mtu) {
+		mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
+		if (params->tot_len > mtu)
+			return 0;
+	}
+
 	nh = &res.fi->fib_nh[res.nh_sel];
 
 	/* do not handle lwt encaps right now */
@@ -4177,7 +4184,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 
 #if IS_ENABLED(CONFIG_IPV6)
 static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
-			       u32 flags)
+			       u32 flags, bool check_mtu)
 {
 	struct in6_addr *src = (struct in6_addr *) params->ipv6_src;
 	struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst;
@@ -4188,6 +4195,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	struct flowi6 fl6;
 	int strict = 0;
 	int oif;
+	u32 mtu;
 
 	/* link local addresses are never forwarded */
 	if (rt6_need_strict(dst) || rt6_need_strict(src))
@@ -4250,6 +4258,12 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 						       fl6.flowi6_oif, NULL,
 						       strict);
 
+	if (check_mtu) {
+		mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src);
+		if (params->tot_len > mtu)
+			return 0;
+	}
+
 	if (f6i->fib6_nh.nh_lwtstate)
 		return 0;
 
@@ -4282,12 +4296,12 @@ BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
 #if IS_ENABLED(CONFIG_INET)
 	case AF_INET:
 		return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params,
-					   flags);
+					   flags, true);
 #endif
 #if IS_ENABLED(CONFIG_IPV6)
 	case AF_INET6:
 		return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params,
-					   flags);
+					   flags, true);
 #endif
 	}
 	return 0;
@@ -4306,20 +4320,34 @@ static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
 BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
 	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
 {
+	struct net *net = dev_net(skb->dev);
+	int index = 0;
+
 	if (plen < sizeof(*params))
 		return -EINVAL;
 
 	switch (params->family) {
 #if IS_ENABLED(CONFIG_INET)
 	case AF_INET:
-		return bpf_ipv4_fib_lookup(dev_net(skb->dev), params, flags);
+		index = bpf_ipv4_fib_lookup(net, params, flags, false);
+		break;
 #endif
 #if IS_ENABLED(CONFIG_IPV6)
 	case AF_INET6:
-		return bpf_ipv6_fib_lookup(dev_net(skb->dev), params, flags);
+		index = bpf_ipv6_fib_lookup(net, params, flags, false);
+		break;
 #endif
 	}
-	return -ENOTSUPP;
+
+	if (index > 0) {
+		struct net_device *dev;
+
+		dev = dev_get_by_index_rcu(net, index);
+		if (!is_skb_forwardable(dev, skb))
+			index = 0;
+	}
+
+	return index;
 }
 
 static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
-- 
cgit v1.2.3-59-g8ed1b


From f3a7ca64587f58686d4e2e894e9abbfbc9dffb25 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 22 May 2018 11:31:59 +0200
Subject: cfg80211: add missing kernel-doc

Add the kernel-doc missed earlier.

Fixes: 52539ca89f36 ("cfg80211: Expose TXQ stats and parameters to userspace")
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 11a218445448..df33c2766d22 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -3856,6 +3856,10 @@ struct wiphy_iftype_ext_capab {
  *	bitmap of &enum nl80211_band values.  For instance, for
  *	NL80211_BAND_2GHZ, bit 0 would be set
  *	(i.e. BIT(NL80211_BAND_2GHZ)).
+ *
+ * @txq_limit: configuration of internal TX queue frame limit
+ * @txq_memory_limit: configuration internal TX queue memory limit
+ * @txq_quantum: configuration of internal TX queue scheduler quantum
  */
 struct wiphy {
 	/* assign these fields before you register the wiphy */
-- 
cgit v1.2.3-59-g8ed1b


From bdf2752305af1f8c2c2422ae2b292194cf33f5e9 Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Thu, 17 May 2018 01:08:32 -0700
Subject: i40e: free skb after clearing lock in ptp_stop

Use the same logic to free the skb after clearing the Tx timestamp bit
lock in i40e_ptp_stop as we use in the other locations. It is not as
important here since we are not racing against a future Tx timestamp
request (as we are disabling PTP at this point). However it is good to
be consistent in how we approach the bit lock so that future callers
don't copy the old anti-pattern.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_ptp.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ptp.c b/drivers/net/ethernet/intel/i40e/i40e_ptp.c
index d50d84927e6b..35f2866b38c6 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ptp.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ptp.c
@@ -799,9 +799,11 @@ void i40e_ptp_stop(struct i40e_pf *pf)
 	pf->ptp_rx = false;
 
 	if (pf->ptp_tx_skb) {
-		dev_kfree_skb_any(pf->ptp_tx_skb);
+		struct sk_buff *skb = pf->ptp_tx_skb;
+
 		pf->ptp_tx_skb = NULL;
 		clear_bit_unlock(__I40E_PTP_TX_IN_PROGRESS, pf->state);
+		dev_kfree_skb_any(skb);
 	}
 
 	if (pf->ptp_clock) {
-- 
cgit v1.2.3-59-g8ed1b


From 9955d4940e33442b9b2f1f255a63aac21f8ec3c3 Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Thu, 17 May 2018 01:08:33 -0700
Subject: i40e: always return VEB stat strings

The ethtool API for obtaining device statistics is not intended to allow
runtime changes in the number of statistics reported. It may *appear*
this way, as there is an ability to request the number of stats using
ethtool_get_set_count(). However, it is expected that this must always
return the same value for invocations of the same device.

If we don't satisfy this contract, and allow the number of stats to
change during run time, we could cause invalid memory accesses or report
the stat strings incorrectly. This is because the API for obtaining
stats is to (1) get the size, (2) get the strings and finally (3) get
the stats. Since these are each separate ethtool op commands, it is not
possible to maintain consistency by holding the RTNL lock over the whole
operation. This results in the potential for a race condition to occur
where the size changed between any of the 3 calls.

Avoid this issue by requiring that we always return the same value for
a given device. We can check any values which remain constant for the
life of the device, but must not report different sizes depending on
runtime attributes.

This patch specifically fixes the VEB statistics strings to always be
reported. Other issues will be fixed in future patches.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 52 ++++++++++++--------------
 1 file changed, 23 insertions(+), 29 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 329e59eae4a1..de5dad7ff340 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -1661,15 +1661,10 @@ static int i40e_get_stats_count(struct net_device *netdev)
 	struct i40e_vsi *vsi = np->vsi;
 	struct i40e_pf *pf = vsi->back;
 
-	if (vsi == pf->vsi[pf->lan_vsi] && pf->hw.partition_id == 1) {
-		if (pf->lan_veb != I40E_NO_VEB &&
-		    pf->flags & I40E_FLAG_VEB_STATS_ENABLED)
-			return I40E_PF_STATS_LEN(netdev) + I40E_VEB_STATS_TOTAL;
-		else
-			return I40E_PF_STATS_LEN(netdev);
-	} else {
+	if (vsi == pf->vsi[pf->lan_vsi] && pf->hw.partition_id == 1)
+		return I40E_PF_STATS_LEN(netdev) + I40E_VEB_STATS_TOTAL;
+	else
 		return I40E_VSI_STATS_LEN(netdev);
-	}
 }
 
 static int i40e_get_sset_count(struct net_device *netdev, int sset)
@@ -1760,6 +1755,8 @@ static void i40e_get_ethtool_stats(struct net_device *netdev,
 			data[i++] = veb->tc_stats.tc_rx_packets[j];
 			data[i++] = veb->tc_stats.tc_rx_bytes[j];
 		}
+	} else {
+		i += I40E_VEB_STATS_TOTAL;
 	}
 	for (j = 0; j < I40E_GLOBAL_STATS_LEN; j++) {
 		p = (char *)pf + i40e_gstrings_stats[j].stat_offset;
@@ -1816,27 +1813,24 @@ static void i40e_get_strings(struct net_device *netdev, u32 stringset,
 		if (vsi != pf->vsi[pf->lan_vsi] || pf->hw.partition_id != 1)
 			return;
 
-		if ((pf->lan_veb != I40E_NO_VEB) &&
-		    (pf->flags & I40E_FLAG_VEB_STATS_ENABLED)) {
-			for (i = 0; i < I40E_VEB_STATS_LEN; i++) {
-				snprintf(p, ETH_GSTRING_LEN, "veb.%s",
-					i40e_gstrings_veb_stats[i].stat_string);
-				p += ETH_GSTRING_LEN;
-			}
-			for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) {
-				snprintf(p, ETH_GSTRING_LEN,
-					 "veb.tc_%d_tx_packets", i);
-				p += ETH_GSTRING_LEN;
-				snprintf(p, ETH_GSTRING_LEN,
-					 "veb.tc_%d_tx_bytes", i);
-				p += ETH_GSTRING_LEN;
-				snprintf(p, ETH_GSTRING_LEN,
-					 "veb.tc_%d_rx_packets", i);
-				p += ETH_GSTRING_LEN;
-				snprintf(p, ETH_GSTRING_LEN,
-					 "veb.tc_%d_rx_bytes", i);
-				p += ETH_GSTRING_LEN;
-			}
+		for (i = 0; i < I40E_VEB_STATS_LEN; i++) {
+			snprintf(p, ETH_GSTRING_LEN, "veb.%s",
+				 i40e_gstrings_veb_stats[i].stat_string);
+			p += ETH_GSTRING_LEN;
+		}
+		for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) {
+			snprintf(p, ETH_GSTRING_LEN,
+				 "veb.tc_%u_tx_packets", i);
+			p += ETH_GSTRING_LEN;
+			snprintf(p, ETH_GSTRING_LEN,
+				 "veb.tc_%u_tx_bytes", i);
+			p += ETH_GSTRING_LEN;
+			snprintf(p, ETH_GSTRING_LEN,
+				 "veb.tc_%u_rx_packets", i);
+			p += ETH_GSTRING_LEN;
+			snprintf(p, ETH_GSTRING_LEN,
+				 "veb.tc_%u_rx_bytes", i);
+			p += ETH_GSTRING_LEN;
 		}
 		for (i = 0; i < I40E_GLOBAL_STATS_LEN; i++) {
 			snprintf(p, ETH_GSTRING_LEN, "port.%s",
-- 
cgit v1.2.3-59-g8ed1b


From b8312365097ca854a03231bd58a284491aaa8ef7 Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Thu, 17 May 2018 01:08:34 -0700
Subject: i40e: always return all queue stat strings

The ethtool API for obtaining device statistics is not intended to allow
runtime changes in the number of statistics reported. It may *appear*
this way, as there is an ability to request the number of stats using
ethtool_get_set_count(). However, it is expected that this must always
return the same value for invocations of the same device.

If we don't satisfy this contract, and allow the number of stats to
change during run time, we could cause invalid memory accesses or report
the stat strings incorrectly. This is because the API for obtaining
stats is to (1) get the size, (2) get the strings and finally (3) get
the stats. Since these are each separate ethtool op commands, it is not
possible to maintain consistency by holding the RTNL lock over the whole
operation. This results in the potential for a race condition to occur
where the size changed between any of the 3 calls.

Avoid this issue by requiring that we always return the same value for
a given device. We can check any values which remain constant for the
life of the device, but must not report different sizes depending on
runtime attributes.

This patch specifically fixes the queue statistics to always return
every queue even if it's not currently in use.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index de5dad7ff340..bacb01b63727 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -140,8 +140,12 @@ static const struct i40e_stats i40e_gstrings_stats[] = {
 	I40E_PF_STAT("rx_lpi_count", stats.rx_lpi_count),
 };
 
-#define I40E_QUEUE_STATS_LEN(n) \
-	(((struct i40e_netdev_priv *)netdev_priv((n)))->vsi->num_queue_pairs \
+/* We use num_tx_queues here as a proxy for the maximum number of queues
+ * available because we always allocate queues symmetrically.
+ */
+#define I40E_MAX_NUM_QUEUES(n) ((n)->num_tx_queues)
+#define I40E_QUEUE_STATS_LEN(n)                                              \
+	   (I40E_MAX_NUM_QUEUES(n)                                           \
 	    * 2 /* Tx and Rx together */                                     \
 	    * (sizeof(struct i40e_queue_stats) / sizeof(u64)))
 #define I40E_GLOBAL_STATS_LEN	ARRAY_SIZE(i40e_gstrings_stats)
@@ -1712,11 +1716,19 @@ static void i40e_get_ethtool_stats(struct net_device *netdev,
 			    sizeof(u64)) ? *(u64 *)p : *(u32 *)p;
 	}
 	rcu_read_lock();
-	for (j = 0; j < vsi->num_queue_pairs; j++) {
+	for (j = 0; j < I40E_MAX_NUM_QUEUES(netdev) ; j++) {
 		tx_ring = READ_ONCE(vsi->tx_rings[j]);
 
-		if (!tx_ring)
+		if (!tx_ring) {
+			/* Bump the stat counter to skip these stats, and make
+			 * sure the memory is zero'd
+			 */
+			data[i++] = 0;
+			data[i++] = 0;
+			data[i++] = 0;
+			data[i++] = 0;
 			continue;
+		}
 
 		/* process Tx ring statistics */
 		do {
@@ -1800,7 +1812,7 @@ static void i40e_get_strings(struct net_device *netdev, u32 stringset,
 				 i40e_gstrings_misc_stats[i].stat_string);
 			p += ETH_GSTRING_LEN;
 		}
-		for (i = 0; i < vsi->num_queue_pairs; i++) {
+		for (i = 0; i < I40E_MAX_NUM_QUEUES(netdev); i++) {
 			snprintf(p, ETH_GSTRING_LEN, "tx-%d.tx_packets", i);
 			p += ETH_GSTRING_LEN;
 			snprintf(p, ETH_GSTRING_LEN, "tx-%d.tx_bytes", i);
-- 
cgit v1.2.3-59-g8ed1b


From 019b9cd44d5aa457d9e45c85847f86ced40c852d Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Thu, 17 May 2018 01:08:35 -0700
Subject: i40e: split i40e_get_strings() into smaller functions

Split the statistic strings and private flags strings into their own
separate functions to aid code readability.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 183 ++++++++++++++-----------
 1 file changed, 100 insertions(+), 83 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index bacb01b63727..c50ed2d391e1 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -1787,8 +1787,7 @@ static void i40e_get_ethtool_stats(struct net_device *netdev,
 		data[i++] = pf->stats.priority_xon_2_xoff[j];
 }
 
-static void i40e_get_strings(struct net_device *netdev, u32 stringset,
-			     u8 *data)
+static void i40e_get_stat_strings(struct net_device *netdev, u8 *data)
 {
 	struct i40e_netdev_priv *np = netdev_priv(netdev);
 	struct i40e_vsi *vsi = np->vsi;
@@ -1796,95 +1795,113 @@ static void i40e_get_strings(struct net_device *netdev, u32 stringset,
 	char *p = (char *)data;
 	unsigned int i;
 
+	for (i = 0; i < I40E_NETDEV_STATS_LEN; i++) {
+		snprintf(p, ETH_GSTRING_LEN, "%s",
+			 i40e_gstrings_net_stats[i].stat_string);
+		p += ETH_GSTRING_LEN;
+	}
+	for (i = 0; i < I40E_MISC_STATS_LEN; i++) {
+		snprintf(p, ETH_GSTRING_LEN, "%s",
+			 i40e_gstrings_misc_stats[i].stat_string);
+		p += ETH_GSTRING_LEN;
+	}
+	for (i = 0; i < I40E_MAX_NUM_QUEUES(netdev); i++) {
+		snprintf(p, ETH_GSTRING_LEN, "tx-%u.tx_packets", i);
+		p += ETH_GSTRING_LEN;
+		snprintf(p, ETH_GSTRING_LEN, "tx-%u.tx_bytes", i);
+		p += ETH_GSTRING_LEN;
+		snprintf(p, ETH_GSTRING_LEN, "rx-%u.rx_packets", i);
+		p += ETH_GSTRING_LEN;
+		snprintf(p, ETH_GSTRING_LEN, "rx-%u.rx_bytes", i);
+		p += ETH_GSTRING_LEN;
+	}
+	if (vsi != pf->vsi[pf->lan_vsi] || pf->hw.partition_id != 1)
+		return;
+
+	for (i = 0; i < I40E_VEB_STATS_LEN; i++) {
+		snprintf(p, ETH_GSTRING_LEN, "veb.%s",
+			 i40e_gstrings_veb_stats[i].stat_string);
+		p += ETH_GSTRING_LEN;
+	}
+	for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) {
+		snprintf(p, ETH_GSTRING_LEN,
+			 "veb.tc_%u_tx_packets", i);
+		p += ETH_GSTRING_LEN;
+		snprintf(p, ETH_GSTRING_LEN,
+			 "veb.tc_%u_tx_bytes", i);
+		p += ETH_GSTRING_LEN;
+		snprintf(p, ETH_GSTRING_LEN,
+			 "veb.tc_%u_rx_packets", i);
+		p += ETH_GSTRING_LEN;
+		snprintf(p, ETH_GSTRING_LEN,
+			 "veb.tc_%u_rx_bytes", i);
+		p += ETH_GSTRING_LEN;
+	}
+
+	for (i = 0; i < I40E_GLOBAL_STATS_LEN; i++) {
+		snprintf(p, ETH_GSTRING_LEN, "port.%s",
+			 i40e_gstrings_stats[i].stat_string);
+		p += ETH_GSTRING_LEN;
+	}
+	for (i = 0; i < I40E_MAX_USER_PRIORITY; i++) {
+		snprintf(p, ETH_GSTRING_LEN,
+			 "port.tx_priority_%u_xon", i);
+		p += ETH_GSTRING_LEN;
+		snprintf(p, ETH_GSTRING_LEN,
+			 "port.tx_priority_%u_xoff", i);
+		p += ETH_GSTRING_LEN;
+	}
+	for (i = 0; i < I40E_MAX_USER_PRIORITY; i++) {
+		snprintf(p, ETH_GSTRING_LEN,
+			 "port.rx_priority_%u_xon", i);
+		p += ETH_GSTRING_LEN;
+		snprintf(p, ETH_GSTRING_LEN,
+			 "port.rx_priority_%u_xoff", i);
+		p += ETH_GSTRING_LEN;
+	}
+	for (i = 0; i < I40E_MAX_USER_PRIORITY; i++) {
+		snprintf(p, ETH_GSTRING_LEN,
+			 "port.rx_priority_%u_xon_2_xoff", i);
+		p += ETH_GSTRING_LEN;
+	}
+	/* BUG_ON(p - data != I40E_STATS_LEN * ETH_GSTRING_LEN); */
+}
+
+static void i40e_get_priv_flag_strings(struct net_device *netdev, u8 *data)
+{
+	struct i40e_netdev_priv *np = netdev_priv(netdev);
+	struct i40e_vsi *vsi = np->vsi;
+	struct i40e_pf *pf = vsi->back;
+	char *p = (char *)data;
+	unsigned int i;
+
+	for (i = 0; i < I40E_PRIV_FLAGS_STR_LEN; i++) {
+		snprintf(p, ETH_GSTRING_LEN, "%s",
+			 i40e_gstrings_priv_flags[i].flag_string);
+		p += ETH_GSTRING_LEN;
+	}
+	if (pf->hw.pf_id != 0)
+		return;
+	for (i = 0; i < I40E_GL_PRIV_FLAGS_STR_LEN; i++) {
+		snprintf(p, ETH_GSTRING_LEN, "%s",
+			 i40e_gl_gstrings_priv_flags[i].flag_string);
+		p += ETH_GSTRING_LEN;
+	}
+}
+
+static void i40e_get_strings(struct net_device *netdev, u32 stringset,
+			     u8 *data)
+{
 	switch (stringset) {
 	case ETH_SS_TEST:
 		memcpy(data, i40e_gstrings_test,
 		       I40E_TEST_LEN * ETH_GSTRING_LEN);
 		break;
 	case ETH_SS_STATS:
-		for (i = 0; i < I40E_NETDEV_STATS_LEN; i++) {
-			snprintf(p, ETH_GSTRING_LEN, "%s",
-				 i40e_gstrings_net_stats[i].stat_string);
-			p += ETH_GSTRING_LEN;
-		}
-		for (i = 0; i < I40E_MISC_STATS_LEN; i++) {
-			snprintf(p, ETH_GSTRING_LEN, "%s",
-				 i40e_gstrings_misc_stats[i].stat_string);
-			p += ETH_GSTRING_LEN;
-		}
-		for (i = 0; i < I40E_MAX_NUM_QUEUES(netdev); i++) {
-			snprintf(p, ETH_GSTRING_LEN, "tx-%d.tx_packets", i);
-			p += ETH_GSTRING_LEN;
-			snprintf(p, ETH_GSTRING_LEN, "tx-%d.tx_bytes", i);
-			p += ETH_GSTRING_LEN;
-			snprintf(p, ETH_GSTRING_LEN, "rx-%d.rx_packets", i);
-			p += ETH_GSTRING_LEN;
-			snprintf(p, ETH_GSTRING_LEN, "rx-%d.rx_bytes", i);
-			p += ETH_GSTRING_LEN;
-		}
-		if (vsi != pf->vsi[pf->lan_vsi] || pf->hw.partition_id != 1)
-			return;
-
-		for (i = 0; i < I40E_VEB_STATS_LEN; i++) {
-			snprintf(p, ETH_GSTRING_LEN, "veb.%s",
-				 i40e_gstrings_veb_stats[i].stat_string);
-			p += ETH_GSTRING_LEN;
-		}
-		for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) {
-			snprintf(p, ETH_GSTRING_LEN,
-				 "veb.tc_%u_tx_packets", i);
-			p += ETH_GSTRING_LEN;
-			snprintf(p, ETH_GSTRING_LEN,
-				 "veb.tc_%u_tx_bytes", i);
-			p += ETH_GSTRING_LEN;
-			snprintf(p, ETH_GSTRING_LEN,
-				 "veb.tc_%u_rx_packets", i);
-			p += ETH_GSTRING_LEN;
-			snprintf(p, ETH_GSTRING_LEN,
-				 "veb.tc_%u_rx_bytes", i);
-			p += ETH_GSTRING_LEN;
-		}
-		for (i = 0; i < I40E_GLOBAL_STATS_LEN; i++) {
-			snprintf(p, ETH_GSTRING_LEN, "port.%s",
-				 i40e_gstrings_stats[i].stat_string);
-			p += ETH_GSTRING_LEN;
-		}
-		for (i = 0; i < I40E_MAX_USER_PRIORITY; i++) {
-			snprintf(p, ETH_GSTRING_LEN,
-				 "port.tx_priority_%d_xon", i);
-			p += ETH_GSTRING_LEN;
-			snprintf(p, ETH_GSTRING_LEN,
-				 "port.tx_priority_%d_xoff", i);
-			p += ETH_GSTRING_LEN;
-		}
-		for (i = 0; i < I40E_MAX_USER_PRIORITY; i++) {
-			snprintf(p, ETH_GSTRING_LEN,
-				 "port.rx_priority_%d_xon", i);
-			p += ETH_GSTRING_LEN;
-			snprintf(p, ETH_GSTRING_LEN,
-				 "port.rx_priority_%d_xoff", i);
-			p += ETH_GSTRING_LEN;
-		}
-		for (i = 0; i < I40E_MAX_USER_PRIORITY; i++) {
-			snprintf(p, ETH_GSTRING_LEN,
-				 "port.rx_priority_%d_xon_2_xoff", i);
-			p += ETH_GSTRING_LEN;
-		}
-		/* BUG_ON(p - data != I40E_STATS_LEN * ETH_GSTRING_LEN); */
+		i40e_get_stat_strings(netdev, data);
 		break;
 	case ETH_SS_PRIV_FLAGS:
-		for (i = 0; i < I40E_PRIV_FLAGS_STR_LEN; i++) {
-			snprintf(p, ETH_GSTRING_LEN, "%s",
-				 i40e_gstrings_priv_flags[i].flag_string);
-			p += ETH_GSTRING_LEN;
-		}
-		if (pf->hw.pf_id != 0)
-			break;
-		for (i = 0; i < I40E_GL_PRIV_FLAGS_STR_LEN; i++) {
-			snprintf(p, ETH_GSTRING_LEN, "%s",
-				 i40e_gl_gstrings_priv_flags[i].flag_string);
-			p += ETH_GSTRING_LEN;
-		}
+		i40e_get_priv_flag_strings(netdev, data);
 		break;
 	default:
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From 9b10df596bd4d38f2a58cf87e0780510acc53d8d Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Thu, 17 May 2018 01:08:36 -0700
Subject: i40e: use WARN_ONCE to replace the commented BUG_ON size check

We don't really want to use BUG_ON here since that would completely
crash the kernel, thus the reason we commented it out. We *can't* use
BUILD_BUG_ON because at least now (a) the sizes aren't constant (we are
fixing this) and (b) not all compilers are smart enough to understand
that "p - data" is a constant.

Instead, just use a WARN_ONCE so that the first time we end up with an
incorrect size we will dump a stack trace and a message, hopefully
highlighting the issues early in testing.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index c50ed2d391e1..32bcb6a2a590 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -1792,8 +1792,8 @@ static void i40e_get_stat_strings(struct net_device *netdev, u8 *data)
 	struct i40e_netdev_priv *np = netdev_priv(netdev);
 	struct i40e_vsi *vsi = np->vsi;
 	struct i40e_pf *pf = vsi->back;
-	char *p = (char *)data;
 	unsigned int i;
+	u8 *p = data;
 
 	for (i = 0; i < I40E_NETDEV_STATS_LEN; i++) {
 		snprintf(p, ETH_GSTRING_LEN, "%s",
@@ -1864,7 +1864,9 @@ static void i40e_get_stat_strings(struct net_device *netdev, u8 *data)
 			 "port.rx_priority_%u_xon_2_xoff", i);
 		p += ETH_GSTRING_LEN;
 	}
-	/* BUG_ON(p - data != I40E_STATS_LEN * ETH_GSTRING_LEN); */
+
+	WARN_ONCE(p - data != i40e_get_stats_count(netdev) * ETH_GSTRING_LEN,
+		  "stat strings count mismatch!");
 }
 
 static void i40e_get_priv_flag_strings(struct net_device *netdev, u8 *data)
-- 
cgit v1.2.3-59-g8ed1b


From bf1c39e64016c2c29991f20f29a91a76272b043a Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Thu, 17 May 2018 01:08:37 -0700
Subject: i40e: fold prefix strings directly into stat names

We always prefix these stats with a fixed string, so just fold this
prefix into the stat string definition. This preparatory work will make
it easier to implement a helper function to copy stats and strings into
the supplied buffers in a future patch.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 137 +++++++++++++------------
 1 file changed, 69 insertions(+), 68 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 32bcb6a2a590..6b34845d251c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -42,18 +42,18 @@ static const struct i40e_stats i40e_gstrings_net_stats[] = {
 };
 
 static const struct i40e_stats i40e_gstrings_veb_stats[] = {
-	I40E_VEB_STAT("rx_bytes", stats.rx_bytes),
-	I40E_VEB_STAT("tx_bytes", stats.tx_bytes),
-	I40E_VEB_STAT("rx_unicast", stats.rx_unicast),
-	I40E_VEB_STAT("tx_unicast", stats.tx_unicast),
-	I40E_VEB_STAT("rx_multicast", stats.rx_multicast),
-	I40E_VEB_STAT("tx_multicast", stats.tx_multicast),
-	I40E_VEB_STAT("rx_broadcast", stats.rx_broadcast),
-	I40E_VEB_STAT("tx_broadcast", stats.tx_broadcast),
-	I40E_VEB_STAT("rx_discards", stats.rx_discards),
-	I40E_VEB_STAT("tx_discards", stats.tx_discards),
-	I40E_VEB_STAT("tx_errors", stats.tx_errors),
-	I40E_VEB_STAT("rx_unknown_protocol", stats.rx_unknown_protocol),
+	I40E_VEB_STAT("veb.rx_bytes", stats.rx_bytes),
+	I40E_VEB_STAT("veb.tx_bytes", stats.tx_bytes),
+	I40E_VEB_STAT("veb.rx_unicast", stats.rx_unicast),
+	I40E_VEB_STAT("veb.tx_unicast", stats.tx_unicast),
+	I40E_VEB_STAT("veb.rx_multicast", stats.rx_multicast),
+	I40E_VEB_STAT("veb.tx_multicast", stats.tx_multicast),
+	I40E_VEB_STAT("veb.rx_broadcast", stats.rx_broadcast),
+	I40E_VEB_STAT("veb.tx_broadcast", stats.tx_broadcast),
+	I40E_VEB_STAT("veb.rx_discards", stats.rx_discards),
+	I40E_VEB_STAT("veb.tx_discards", stats.tx_discards),
+	I40E_VEB_STAT("veb.tx_errors", stats.tx_errors),
+	I40E_VEB_STAT("veb.rx_unknown_protocol", stats.rx_unknown_protocol),
 };
 
 static const struct i40e_stats i40e_gstrings_misc_stats[] = {
@@ -82,62 +82,63 @@ static const struct i40e_stats i40e_gstrings_misc_stats[] = {
  * is queried on the base PF netdev, not on the VMDq or FCoE netdev.
  */
 static const struct i40e_stats i40e_gstrings_stats[] = {
-	I40E_PF_STAT("rx_bytes", stats.eth.rx_bytes),
-	I40E_PF_STAT("tx_bytes", stats.eth.tx_bytes),
-	I40E_PF_STAT("rx_unicast", stats.eth.rx_unicast),
-	I40E_PF_STAT("tx_unicast", stats.eth.tx_unicast),
-	I40E_PF_STAT("rx_multicast", stats.eth.rx_multicast),
-	I40E_PF_STAT("tx_multicast", stats.eth.tx_multicast),
-	I40E_PF_STAT("rx_broadcast", stats.eth.rx_broadcast),
-	I40E_PF_STAT("tx_broadcast", stats.eth.tx_broadcast),
-	I40E_PF_STAT("tx_errors", stats.eth.tx_errors),
-	I40E_PF_STAT("rx_dropped", stats.eth.rx_discards),
-	I40E_PF_STAT("tx_dropped_link_down", stats.tx_dropped_link_down),
-	I40E_PF_STAT("rx_crc_errors", stats.crc_errors),
-	I40E_PF_STAT("illegal_bytes", stats.illegal_bytes),
-	I40E_PF_STAT("mac_local_faults", stats.mac_local_faults),
-	I40E_PF_STAT("mac_remote_faults", stats.mac_remote_faults),
-	I40E_PF_STAT("tx_timeout", tx_timeout_count),
-	I40E_PF_STAT("rx_csum_bad", hw_csum_rx_error),
-	I40E_PF_STAT("rx_length_errors", stats.rx_length_errors),
-	I40E_PF_STAT("link_xon_rx", stats.link_xon_rx),
-	I40E_PF_STAT("link_xoff_rx", stats.link_xoff_rx),
-	I40E_PF_STAT("link_xon_tx", stats.link_xon_tx),
-	I40E_PF_STAT("link_xoff_tx", stats.link_xoff_tx),
-	I40E_PF_STAT("rx_size_64", stats.rx_size_64),
-	I40E_PF_STAT("rx_size_127", stats.rx_size_127),
-	I40E_PF_STAT("rx_size_255", stats.rx_size_255),
-	I40E_PF_STAT("rx_size_511", stats.rx_size_511),
-	I40E_PF_STAT("rx_size_1023", stats.rx_size_1023),
-	I40E_PF_STAT("rx_size_1522", stats.rx_size_1522),
-	I40E_PF_STAT("rx_size_big", stats.rx_size_big),
-	I40E_PF_STAT("tx_size_64", stats.tx_size_64),
-	I40E_PF_STAT("tx_size_127", stats.tx_size_127),
-	I40E_PF_STAT("tx_size_255", stats.tx_size_255),
-	I40E_PF_STAT("tx_size_511", stats.tx_size_511),
-	I40E_PF_STAT("tx_size_1023", stats.tx_size_1023),
-	I40E_PF_STAT("tx_size_1522", stats.tx_size_1522),
-	I40E_PF_STAT("tx_size_big", stats.tx_size_big),
-	I40E_PF_STAT("rx_undersize", stats.rx_undersize),
-	I40E_PF_STAT("rx_fragments", stats.rx_fragments),
-	I40E_PF_STAT("rx_oversize", stats.rx_oversize),
-	I40E_PF_STAT("rx_jabber", stats.rx_jabber),
-	I40E_PF_STAT("VF_admin_queue_requests", vf_aq_requests),
-	I40E_PF_STAT("arq_overflows", arq_overflows),
-	I40E_PF_STAT("rx_hwtstamp_cleared", rx_hwtstamp_cleared),
-	I40E_PF_STAT("tx_hwtstamp_skipped", tx_hwtstamp_skipped),
-	I40E_PF_STAT("fdir_flush_cnt", fd_flush_cnt),
-	I40E_PF_STAT("fdir_atr_match", stats.fd_atr_match),
-	I40E_PF_STAT("fdir_atr_tunnel_match", stats.fd_atr_tunnel_match),
-	I40E_PF_STAT("fdir_atr_status", stats.fd_atr_status),
-	I40E_PF_STAT("fdir_sb_match", stats.fd_sb_match),
-	I40E_PF_STAT("fdir_sb_status", stats.fd_sb_status),
+	I40E_PF_STAT("port.rx_bytes", stats.eth.rx_bytes),
+	I40E_PF_STAT("port.tx_bytes", stats.eth.tx_bytes),
+	I40E_PF_STAT("port.rx_unicast", stats.eth.rx_unicast),
+	I40E_PF_STAT("port.tx_unicast", stats.eth.tx_unicast),
+	I40E_PF_STAT("port.rx_multicast", stats.eth.rx_multicast),
+	I40E_PF_STAT("port.tx_multicast", stats.eth.tx_multicast),
+	I40E_PF_STAT("port.rx_broadcast", stats.eth.rx_broadcast),
+	I40E_PF_STAT("port.tx_broadcast", stats.eth.tx_broadcast),
+	I40E_PF_STAT("port.tx_errors", stats.eth.tx_errors),
+	I40E_PF_STAT("port.rx_dropped", stats.eth.rx_discards),
+	I40E_PF_STAT("port.tx_dropped_link_down", stats.tx_dropped_link_down),
+	I40E_PF_STAT("port.rx_crc_errors", stats.crc_errors),
+	I40E_PF_STAT("port.illegal_bytes", stats.illegal_bytes),
+	I40E_PF_STAT("port.mac_local_faults", stats.mac_local_faults),
+	I40E_PF_STAT("port.mac_remote_faults", stats.mac_remote_faults),
+	I40E_PF_STAT("port.tx_timeout", tx_timeout_count),
+	I40E_PF_STAT("port.rx_csum_bad", hw_csum_rx_error),
+	I40E_PF_STAT("port.rx_length_errors", stats.rx_length_errors),
+	I40E_PF_STAT("port.link_xon_rx", stats.link_xon_rx),
+	I40E_PF_STAT("port.link_xoff_rx", stats.link_xoff_rx),
+	I40E_PF_STAT("port.link_xon_tx", stats.link_xon_tx),
+	I40E_PF_STAT("port.link_xoff_tx", stats.link_xoff_tx),
+	I40E_PF_STAT("port.rx_size_64", stats.rx_size_64),
+	I40E_PF_STAT("port.rx_size_127", stats.rx_size_127),
+	I40E_PF_STAT("port.rx_size_255", stats.rx_size_255),
+	I40E_PF_STAT("port.rx_size_511", stats.rx_size_511),
+	I40E_PF_STAT("port.rx_size_1023", stats.rx_size_1023),
+	I40E_PF_STAT("port.rx_size_1522", stats.rx_size_1522),
+	I40E_PF_STAT("port.rx_size_big", stats.rx_size_big),
+	I40E_PF_STAT("port.tx_size_64", stats.tx_size_64),
+	I40E_PF_STAT("port.tx_size_127", stats.tx_size_127),
+	I40E_PF_STAT("port.tx_size_255", stats.tx_size_255),
+	I40E_PF_STAT("port.tx_size_511", stats.tx_size_511),
+	I40E_PF_STAT("port.tx_size_1023", stats.tx_size_1023),
+	I40E_PF_STAT("port.tx_size_1522", stats.tx_size_1522),
+	I40E_PF_STAT("port.tx_size_big", stats.tx_size_big),
+	I40E_PF_STAT("port.rx_undersize", stats.rx_undersize),
+	I40E_PF_STAT("port.rx_fragments", stats.rx_fragments),
+	I40E_PF_STAT("port.rx_oversize", stats.rx_oversize),
+	I40E_PF_STAT("port.rx_jabber", stats.rx_jabber),
+	I40E_PF_STAT("port.VF_admin_queue_requests", vf_aq_requests),
+	I40E_PF_STAT("port.arq_overflows", arq_overflows),
+	I40E_PF_STAT("port.tx_hwtstamp_timeouts", tx_hwtstamp_timeouts),
+	I40E_PF_STAT("port.rx_hwtstamp_cleared", rx_hwtstamp_cleared),
+	I40E_PF_STAT("port.tx_hwtstamp_skipped", tx_hwtstamp_skipped),
+	I40E_PF_STAT("port.fdir_flush_cnt", fd_flush_cnt),
+	I40E_PF_STAT("port.fdir_atr_match", stats.fd_atr_match),
+	I40E_PF_STAT("port.fdir_atr_tunnel_match", stats.fd_atr_tunnel_match),
+	I40E_PF_STAT("port.fdir_atr_status", stats.fd_atr_status),
+	I40E_PF_STAT("port.fdir_sb_match", stats.fd_sb_match),
+	I40E_PF_STAT("port.fdir_sb_status", stats.fd_sb_status),
 
 	/* LPI stats */
-	I40E_PF_STAT("tx_lpi_status", stats.tx_lpi_status),
-	I40E_PF_STAT("rx_lpi_status", stats.rx_lpi_status),
-	I40E_PF_STAT("tx_lpi_count", stats.tx_lpi_count),
-	I40E_PF_STAT("rx_lpi_count", stats.rx_lpi_count),
+	I40E_PF_STAT("port.tx_lpi_status", stats.tx_lpi_status),
+	I40E_PF_STAT("port.rx_lpi_status", stats.rx_lpi_status),
+	I40E_PF_STAT("port.tx_lpi_count", stats.tx_lpi_count),
+	I40E_PF_STAT("port.rx_lpi_count", stats.rx_lpi_count),
 };
 
 /* We use num_tx_queues here as a proxy for the maximum number of queues
@@ -1819,7 +1820,7 @@ static void i40e_get_stat_strings(struct net_device *netdev, u8 *data)
 		return;
 
 	for (i = 0; i < I40E_VEB_STATS_LEN; i++) {
-		snprintf(p, ETH_GSTRING_LEN, "veb.%s",
+		snprintf(p, ETH_GSTRING_LEN, "%s",
 			 i40e_gstrings_veb_stats[i].stat_string);
 		p += ETH_GSTRING_LEN;
 	}
@@ -1839,7 +1840,7 @@ static void i40e_get_stat_strings(struct net_device *netdev, u8 *data)
 	}
 
 	for (i = 0; i < I40E_GLOBAL_STATS_LEN; i++) {
-		snprintf(p, ETH_GSTRING_LEN, "port.%s",
+		snprintf(p, ETH_GSTRING_LEN, "%s",
 			 i40e_gstrings_stats[i].stat_string);
 		p += ETH_GSTRING_LEN;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From e08696dcd988dbe5fe8e51753e23d16ee7e75f3e Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Thu, 17 May 2018 01:08:38 -0700
Subject: i40e: update data pointer directly when copying to the buffer

A future patch is going to add a helper function i40e_add_ethtool_stats
that will help lower the amount of boiler plate code in the
i40e_get_ethtool_stats function.

This conversion will take place over many patches, and the helper
function will work by directly updating a reference to the data pointer.

Since this would not work combined with the current method of accessing
data like an array, update all the code that copies stats into the data
buffer to use direct updates to the pointer instead of array accesses.

This will prevent incorrect stat updates for patches in between the
conversion.

Similarly, when copying strings, we used a separate char *p pointer.
Instead, use the data pointer directly as it's already a (u8 *) type
which is the same size.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 117 ++++++++++++-------------
 1 file changed, 58 insertions(+), 59 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 6b34845d251c..44a2803cb1ec 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -1699,7 +1699,6 @@ static void i40e_get_ethtool_stats(struct net_device *netdev,
 	struct i40e_vsi *vsi = np->vsi;
 	struct i40e_pf *pf = vsi->back;
 	unsigned int j;
-	int i = 0;
 	char *p;
 	struct rtnl_link_stats64 *net_stats = i40e_get_vsi_stats_struct(vsi);
 	unsigned int start;
@@ -1708,12 +1707,12 @@ static void i40e_get_ethtool_stats(struct net_device *netdev,
 
 	for (j = 0; j < I40E_NETDEV_STATS_LEN; j++) {
 		p = (char *)net_stats + i40e_gstrings_net_stats[j].stat_offset;
-		data[i++] = (i40e_gstrings_net_stats[j].sizeof_stat ==
+		*(data++) = (i40e_gstrings_net_stats[j].sizeof_stat ==
 			sizeof(u64)) ? *(u64 *)p : *(u32 *)p;
 	}
 	for (j = 0; j < I40E_MISC_STATS_LEN; j++) {
 		p = (char *)vsi + i40e_gstrings_misc_stats[j].stat_offset;
-		data[i++] = (i40e_gstrings_misc_stats[j].sizeof_stat ==
+		*(data++) = (i40e_gstrings_misc_stats[j].sizeof_stat ==
 			    sizeof(u64)) ? *(u64 *)p : *(u32 *)p;
 	}
 	rcu_read_lock();
@@ -1724,29 +1723,29 @@ static void i40e_get_ethtool_stats(struct net_device *netdev,
 			/* Bump the stat counter to skip these stats, and make
 			 * sure the memory is zero'd
 			 */
-			data[i++] = 0;
-			data[i++] = 0;
-			data[i++] = 0;
-			data[i++] = 0;
+			*(data++) = 0;
+			*(data++) = 0;
+			*(data++) = 0;
+			*(data++) = 0;
 			continue;
 		}
 
 		/* process Tx ring statistics */
 		do {
 			start = u64_stats_fetch_begin_irq(&tx_ring->syncp);
-			data[i] = tx_ring->stats.packets;
-			data[i + 1] = tx_ring->stats.bytes;
+			data[0] = tx_ring->stats.packets;
+			data[1] = tx_ring->stats.bytes;
 		} while (u64_stats_fetch_retry_irq(&tx_ring->syncp, start));
-		i += 2;
+		data += 2;
 
 		/* Rx ring is the 2nd half of the queue pair */
 		rx_ring = &tx_ring[1];
 		do {
 			start = u64_stats_fetch_begin_irq(&rx_ring->syncp);
-			data[i] = rx_ring->stats.packets;
-			data[i + 1] = rx_ring->stats.bytes;
+			data[0] = rx_ring->stats.packets;
+			data[1] = rx_ring->stats.bytes;
 		} while (u64_stats_fetch_retry_irq(&rx_ring->syncp, start));
-		i += 2;
+		data += 2;
 	}
 	rcu_read_unlock();
 	if (vsi != pf->vsi[pf->lan_vsi] || pf->hw.partition_id != 1)
@@ -1759,33 +1758,33 @@ static void i40e_get_ethtool_stats(struct net_device *netdev,
 		for (j = 0; j < I40E_VEB_STATS_LEN; j++) {
 			p = (char *)veb;
 			p += i40e_gstrings_veb_stats[j].stat_offset;
-			data[i++] = (i40e_gstrings_veb_stats[j].sizeof_stat ==
+			*(data++) = (i40e_gstrings_veb_stats[j].sizeof_stat ==
 				     sizeof(u64)) ? *(u64 *)p : *(u32 *)p;
 		}
 		for (j = 0; j < I40E_MAX_TRAFFIC_CLASS; j++) {
-			data[i++] = veb->tc_stats.tc_tx_packets[j];
-			data[i++] = veb->tc_stats.tc_tx_bytes[j];
-			data[i++] = veb->tc_stats.tc_rx_packets[j];
-			data[i++] = veb->tc_stats.tc_rx_bytes[j];
+			*(data++) = veb->tc_stats.tc_tx_packets[j];
+			*(data++) = veb->tc_stats.tc_tx_bytes[j];
+			*(data++) = veb->tc_stats.tc_rx_packets[j];
+			*(data++) = veb->tc_stats.tc_rx_bytes[j];
 		}
 	} else {
-		i += I40E_VEB_STATS_TOTAL;
+		data += I40E_VEB_STATS_TOTAL;
 	}
 	for (j = 0; j < I40E_GLOBAL_STATS_LEN; j++) {
 		p = (char *)pf + i40e_gstrings_stats[j].stat_offset;
-		data[i++] = (i40e_gstrings_stats[j].sizeof_stat ==
+		*(data++) = (i40e_gstrings_stats[j].sizeof_stat ==
 			     sizeof(u64)) ? *(u64 *)p : *(u32 *)p;
 	}
 	for (j = 0; j < I40E_MAX_USER_PRIORITY; j++) {
-		data[i++] = pf->stats.priority_xon_tx[j];
-		data[i++] = pf->stats.priority_xoff_tx[j];
+		*(data++) = pf->stats.priority_xon_tx[j];
+		*(data++) = pf->stats.priority_xoff_tx[j];
 	}
 	for (j = 0; j < I40E_MAX_USER_PRIORITY; j++) {
-		data[i++] = pf->stats.priority_xon_rx[j];
-		data[i++] = pf->stats.priority_xoff_rx[j];
+		*(data++) = pf->stats.priority_xon_rx[j];
+		*(data++) = pf->stats.priority_xoff_rx[j];
 	}
 	for (j = 0; j < I40E_MAX_USER_PRIORITY; j++)
-		data[i++] = pf->stats.priority_xon_2_xoff[j];
+		*(data++) = pf->stats.priority_xon_2_xoff[j];
 }
 
 static void i40e_get_stat_strings(struct net_device *netdev, u8 *data)
@@ -1797,73 +1796,73 @@ static void i40e_get_stat_strings(struct net_device *netdev, u8 *data)
 	u8 *p = data;
 
 	for (i = 0; i < I40E_NETDEV_STATS_LEN; i++) {
-		snprintf(p, ETH_GSTRING_LEN, "%s",
+		snprintf(data, ETH_GSTRING_LEN, "%s",
 			 i40e_gstrings_net_stats[i].stat_string);
-		p += ETH_GSTRING_LEN;
+		data += ETH_GSTRING_LEN;
 	}
 	for (i = 0; i < I40E_MISC_STATS_LEN; i++) {
-		snprintf(p, ETH_GSTRING_LEN, "%s",
+		snprintf(data, ETH_GSTRING_LEN, "%s",
 			 i40e_gstrings_misc_stats[i].stat_string);
-		p += ETH_GSTRING_LEN;
+		data += ETH_GSTRING_LEN;
 	}
 	for (i = 0; i < I40E_MAX_NUM_QUEUES(netdev); i++) {
-		snprintf(p, ETH_GSTRING_LEN, "tx-%u.tx_packets", i);
-		p += ETH_GSTRING_LEN;
-		snprintf(p, ETH_GSTRING_LEN, "tx-%u.tx_bytes", i);
-		p += ETH_GSTRING_LEN;
-		snprintf(p, ETH_GSTRING_LEN, "rx-%u.rx_packets", i);
-		p += ETH_GSTRING_LEN;
-		snprintf(p, ETH_GSTRING_LEN, "rx-%u.rx_bytes", i);
-		p += ETH_GSTRING_LEN;
+		snprintf(data, ETH_GSTRING_LEN, "tx-%u.tx_packets", i);
+		data += ETH_GSTRING_LEN;
+		snprintf(data, ETH_GSTRING_LEN, "tx-%u.tx_bytes", i);
+		data += ETH_GSTRING_LEN;
+		snprintf(data, ETH_GSTRING_LEN, "rx-%u.rx_packets", i);
+		data += ETH_GSTRING_LEN;
+		snprintf(data, ETH_GSTRING_LEN, "rx-%u.rx_bytes", i);
+		data += ETH_GSTRING_LEN;
 	}
 	if (vsi != pf->vsi[pf->lan_vsi] || pf->hw.partition_id != 1)
 		return;
 
 	for (i = 0; i < I40E_VEB_STATS_LEN; i++) {
-		snprintf(p, ETH_GSTRING_LEN, "%s",
+		snprintf(data, ETH_GSTRING_LEN, "%s",
 			 i40e_gstrings_veb_stats[i].stat_string);
-		p += ETH_GSTRING_LEN;
+		data += ETH_GSTRING_LEN;
 	}
 	for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) {
-		snprintf(p, ETH_GSTRING_LEN,
+		snprintf(data, ETH_GSTRING_LEN,
 			 "veb.tc_%u_tx_packets", i);
-		p += ETH_GSTRING_LEN;
-		snprintf(p, ETH_GSTRING_LEN,
+		data += ETH_GSTRING_LEN;
+		snprintf(data, ETH_GSTRING_LEN,
 			 "veb.tc_%u_tx_bytes", i);
-		p += ETH_GSTRING_LEN;
-		snprintf(p, ETH_GSTRING_LEN,
+		data += ETH_GSTRING_LEN;
+		snprintf(data, ETH_GSTRING_LEN,
 			 "veb.tc_%u_rx_packets", i);
-		p += ETH_GSTRING_LEN;
-		snprintf(p, ETH_GSTRING_LEN,
+		data += ETH_GSTRING_LEN;
+		snprintf(data, ETH_GSTRING_LEN,
 			 "veb.tc_%u_rx_bytes", i);
-		p += ETH_GSTRING_LEN;
+		data += ETH_GSTRING_LEN;
 	}
 
 	for (i = 0; i < I40E_GLOBAL_STATS_LEN; i++) {
-		snprintf(p, ETH_GSTRING_LEN, "%s",
+		snprintf(data, ETH_GSTRING_LEN, "%s",
 			 i40e_gstrings_stats[i].stat_string);
-		p += ETH_GSTRING_LEN;
+		data += ETH_GSTRING_LEN;
 	}
 	for (i = 0; i < I40E_MAX_USER_PRIORITY; i++) {
-		snprintf(p, ETH_GSTRING_LEN,
+		snprintf(data, ETH_GSTRING_LEN,
 			 "port.tx_priority_%u_xon", i);
-		p += ETH_GSTRING_LEN;
-		snprintf(p, ETH_GSTRING_LEN,
+		data += ETH_GSTRING_LEN;
+		snprintf(data, ETH_GSTRING_LEN,
 			 "port.tx_priority_%u_xoff", i);
-		p += ETH_GSTRING_LEN;
+		data += ETH_GSTRING_LEN;
 	}
 	for (i = 0; i < I40E_MAX_USER_PRIORITY; i++) {
-		snprintf(p, ETH_GSTRING_LEN,
+		snprintf(data, ETH_GSTRING_LEN,
 			 "port.rx_priority_%u_xon", i);
-		p += ETH_GSTRING_LEN;
-		snprintf(p, ETH_GSTRING_LEN,
+		data += ETH_GSTRING_LEN;
+		snprintf(data, ETH_GSTRING_LEN,
 			 "port.rx_priority_%u_xoff", i);
-		p += ETH_GSTRING_LEN;
+		data += ETH_GSTRING_LEN;
 	}
 	for (i = 0; i < I40E_MAX_USER_PRIORITY; i++) {
-		snprintf(p, ETH_GSTRING_LEN,
+		snprintf(data, ETH_GSTRING_LEN,
 			 "port.rx_priority_%u_xon_2_xoff", i);
-		p += ETH_GSTRING_LEN;
+		data += ETH_GSTRING_LEN;
 	}
 
 	WARN_ONCE(p - data != i40e_get_stats_count(netdev) * ETH_GSTRING_LEN,
-- 
cgit v1.2.3-59-g8ed1b


From ec29bbf8abaca7b295abd4a4204461ca9f82c3f5 Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Thu, 17 May 2018 01:08:39 -0700
Subject: i40e: add function doc headers for ethtool stats functions

Add documentation for the i40e_get_stats_count, i40e_get_stat_strings
and i40e_get_ethtool_stats explaining that the number and ordering of
statistics must remain constant for a given netdevice.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 38 ++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 44a2803cb1ec..4b5ecba0148c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -1660,6 +1660,20 @@ done:
 	return err;
 }
 
+/**
+ * i40e_get_stats_count - return the stats count for a device
+ * @netdev: the netdev to return the count for
+ *
+ * Returns the total number of statistics for this netdev. Note that even
+ * though this is a function, it is required that the count for a specific
+ * netdev must never change. Basing the count on static values such as the
+ * maximum number of queues or the device type is ok. However, the API for
+ * obtaining stats is *not* safe against changes based on non-static
+ * values such as the *current* number of queues, or runtime flags.
+ *
+ * If a statistic is not always enabled, return it as part of the count
+ * anyways, always return its string, and report its value as zero.
+ **/
 static int i40e_get_stats_count(struct net_device *netdev)
 {
 	struct i40e_netdev_priv *np = netdev_priv(netdev);
@@ -1691,6 +1705,20 @@ static int i40e_get_sset_count(struct net_device *netdev, int sset)
 	}
 }
 
+/**
+ * i40e_get_ethtool_stats - copy stat values into supplied buffer
+ * @netdev: the netdev to collect stats for
+ * @stats: ethtool stats command structure
+ * @data: ethtool supplied buffer
+ *
+ * Copy the stats values for this netdev into the buffer. Expects data to be
+ * pre-allocated to the size returned by i40e_get_stats_count.. Note that all
+ * statistics must be copied in a static order, and the count must not change
+ * for a given netdev. See i40e_get_stats_count for more details.
+ *
+ * If a statistic is not currently valid (such as a disabled queue), this
+ * function reports its value as zero.
+ **/
 static void i40e_get_ethtool_stats(struct net_device *netdev,
 				   struct ethtool_stats *stats, u64 *data)
 {
@@ -1787,6 +1815,16 @@ static void i40e_get_ethtool_stats(struct net_device *netdev,
 		*(data++) = pf->stats.priority_xon_2_xoff[j];
 }
 
+/**
+ * i40e_get_stat_strings - copy stat strings into supplied buffer
+ * @netdev: the netdev to collect strings for
+ * @data: supplied buffer to copy strings into
+ *
+ * Copy the strings related to stats for this netdev. Expects data to be
+ * pre-allocated with the size reported by i40e_get_stats_count. Note that the
+ * strings must be copied in a static order and the total count must not
+ * change for a given netdev. See i40e_get_stats_count for more details.
+ **/
 static void i40e_get_stat_strings(struct net_device *netdev, u8 *data)
 {
 	struct i40e_netdev_priv *np = netdev_priv(netdev);
-- 
cgit v1.2.3-59-g8ed1b


From 3f76bdb437c6a9a4621ee64f1c877b7532efa7d3 Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Thu, 17 May 2018 01:08:40 -0700
Subject: i40e: use the more traditional 'i' loop variable

Since we no longer use i as an array index for the data variable,
replace the use of 'j' with 'i' so that we match the general loop
variable name.

Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 56 +++++++++++++-------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 4b5ecba0148c..6947a2a571cb 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -1726,26 +1726,26 @@ static void i40e_get_ethtool_stats(struct net_device *netdev,
 	struct i40e_ring *tx_ring, *rx_ring;
 	struct i40e_vsi *vsi = np->vsi;
 	struct i40e_pf *pf = vsi->back;
-	unsigned int j;
+	unsigned int i;
 	char *p;
 	struct rtnl_link_stats64 *net_stats = i40e_get_vsi_stats_struct(vsi);
 	unsigned int start;
 
 	i40e_update_stats(vsi);
 
-	for (j = 0; j < I40E_NETDEV_STATS_LEN; j++) {
-		p = (char *)net_stats + i40e_gstrings_net_stats[j].stat_offset;
-		*(data++) = (i40e_gstrings_net_stats[j].sizeof_stat ==
+	for (i = 0; i < I40E_NETDEV_STATS_LEN; i++) {
+		p = (char *)net_stats + i40e_gstrings_net_stats[i].stat_offset;
+		*(data++) = (i40e_gstrings_net_stats[i].sizeof_stat ==
 			sizeof(u64)) ? *(u64 *)p : *(u32 *)p;
 	}
-	for (j = 0; j < I40E_MISC_STATS_LEN; j++) {
-		p = (char *)vsi + i40e_gstrings_misc_stats[j].stat_offset;
-		*(data++) = (i40e_gstrings_misc_stats[j].sizeof_stat ==
+	for (i = 0; i < I40E_MISC_STATS_LEN; i++) {
+		p = (char *)vsi + i40e_gstrings_misc_stats[i].stat_offset;
+		*(data++) = (i40e_gstrings_misc_stats[i].sizeof_stat ==
 			    sizeof(u64)) ? *(u64 *)p : *(u32 *)p;
 	}
 	rcu_read_lock();
-	for (j = 0; j < I40E_MAX_NUM_QUEUES(netdev) ; j++) {
-		tx_ring = READ_ONCE(vsi->tx_rings[j]);
+	for (i = 0; i < I40E_MAX_NUM_QUEUES(netdev) ; i++) {
+		tx_ring = READ_ONCE(vsi->tx_rings[i]);
 
 		if (!tx_ring) {
 			/* Bump the stat counter to skip these stats, and make
@@ -1783,36 +1783,36 @@ static void i40e_get_ethtool_stats(struct net_device *netdev,
 	    (pf->flags & I40E_FLAG_VEB_STATS_ENABLED)) {
 		struct i40e_veb *veb = pf->veb[pf->lan_veb];
 
-		for (j = 0; j < I40E_VEB_STATS_LEN; j++) {
+		for (i = 0; i < I40E_VEB_STATS_LEN; i++) {
 			p = (char *)veb;
-			p += i40e_gstrings_veb_stats[j].stat_offset;
-			*(data++) = (i40e_gstrings_veb_stats[j].sizeof_stat ==
+			p += i40e_gstrings_veb_stats[i].stat_offset;
+			*(data++) = (i40e_gstrings_veb_stats[i].sizeof_stat ==
 				     sizeof(u64)) ? *(u64 *)p : *(u32 *)p;
 		}
-		for (j = 0; j < I40E_MAX_TRAFFIC_CLASS; j++) {
-			*(data++) = veb->tc_stats.tc_tx_packets[j];
-			*(data++) = veb->tc_stats.tc_tx_bytes[j];
-			*(data++) = veb->tc_stats.tc_rx_packets[j];
-			*(data++) = veb->tc_stats.tc_rx_bytes[j];
+		for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) {
+			*(data++) = veb->tc_stats.tc_tx_packets[i];
+			*(data++) = veb->tc_stats.tc_tx_bytes[i];
+			*(data++) = veb->tc_stats.tc_rx_packets[i];
+			*(data++) = veb->tc_stats.tc_rx_bytes[i];
 		}
 	} else {
 		data += I40E_VEB_STATS_TOTAL;
 	}
-	for (j = 0; j < I40E_GLOBAL_STATS_LEN; j++) {
-		p = (char *)pf + i40e_gstrings_stats[j].stat_offset;
-		*(data++) = (i40e_gstrings_stats[j].sizeof_stat ==
+	for (i = 0; i < I40E_GLOBAL_STATS_LEN; i++) {
+		p = (char *)pf + i40e_gstrings_stats[i].stat_offset;
+		*(data++) = (i40e_gstrings_stats[i].sizeof_stat ==
 			     sizeof(u64)) ? *(u64 *)p : *(u32 *)p;
 	}
-	for (j = 0; j < I40E_MAX_USER_PRIORITY; j++) {
-		*(data++) = pf->stats.priority_xon_tx[j];
-		*(data++) = pf->stats.priority_xoff_tx[j];
+	for (i = 0; i < I40E_MAX_USER_PRIORITY; i++) {
+		*(data++) = pf->stats.priority_xon_tx[i];
+		*(data++) = pf->stats.priority_xoff_tx[i];
 	}
-	for (j = 0; j < I40E_MAX_USER_PRIORITY; j++) {
-		*(data++) = pf->stats.priority_xon_rx[j];
-		*(data++) = pf->stats.priority_xoff_rx[j];
+	for (i = 0; i < I40E_MAX_USER_PRIORITY; i++) {
+		*(data++) = pf->stats.priority_xon_rx[i];
+		*(data++) = pf->stats.priority_xoff_rx[i];
 	}
-	for (j = 0; j < I40E_MAX_USER_PRIORITY; j++)
-		*(data++) = pf->stats.priority_xon_2_xoff[j];
+	for (i = 0; i < I40E_MAX_USER_PRIORITY; i++)
+		*(data++) = pf->stats.priority_xon_2_xoff[i];
 }
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 5a15a1b07c51dffcb4c4b6be0f8af1bb964162e0 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 21 May 2018 10:26:52 -0700
Subject: mlxsw: spectrum_router: Add support for route append

Handle append for gateway based routes. Dev-only multipath routes will
be handled by a follow on patch.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 8028d221aece..77b2adb29341 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -5725,6 +5725,7 @@ static void mlxsw_sp_router_fib6_event_work(struct work_struct *work)
 
 	switch (fib_work->event) {
 	case FIB_EVENT_ENTRY_REPLACE: /* fall through */
+	case FIB_EVENT_ENTRY_APPEND: /* fall through */
 	case FIB_EVENT_ENTRY_ADD:
 		replace = fib_work->event == FIB_EVENT_ENTRY_REPLACE;
 		err = mlxsw_sp_router_fib6_add(mlxsw_sp,
@@ -5831,6 +5832,7 @@ static void mlxsw_sp_router_fib6_event(struct mlxsw_sp_fib_event_work *fib_work,
 
 	switch (fib_work->event) {
 	case FIB_EVENT_ENTRY_REPLACE: /* fall through */
+	case FIB_EVENT_ENTRY_APPEND: /* fall through */
 	case FIB_EVENT_ENTRY_ADD: /* fall through */
 	case FIB_EVENT_ENTRY_DEL:
 		fen6_info = container_of(info, struct fib6_entry_notifier_info,
-- 
cgit v1.2.3-59-g8ed1b


From f34436a430920bdfdba575b509d994c619f5101d Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 21 May 2018 10:26:53 -0700
Subject: net/ipv6: Simplify route replace and appending into multipath route

Bring consistency to ipv6 route replace and append semantics.

Remove rt6_qualify_for_ecmp which is just guess work. It fails in 2 cases:
1. can not replace a route with a reject route. Existing code appends
   a new route instead of replacing the existing one.

2. can not have a multipath route where a leg uses a dev only nexthop

Existing use cases affected by this change:
1. adding a route with existing prefix and metric using NLM_F_CREATE
   without NLM_F_APPEND or NLM_F_EXCL (ie., what iproute2 calls
   'prepend'). Existing code auto-determines that the new nexthop can
   be appended to an existing route to create a multipath route. This
   change breaks that by requiring the APPEND flag for the new route
   to be added to an existing one. Instead the prepend just adds another
   route entry.

2. route replace. Existing code replaces first matching multipath route
   if new route is multipath capable and fallback to first matching
   non-ECMP route (reject or dev only route) in case one isn't available.
   New behavior replaces first matching route. (Thanks to Ido for spotting
   this one)

Note: Newer iproute2 is needed to display multipath routes with a dev-only
      nexthop. This is due to a bug in iproute2 and parsing nexthops.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h |   6 --
 net/ipv6/ip6_fib.c      | 157 ++++++++++++++++++++++--------------------------
 net/ipv6/route.c        |   3 +-
 3 files changed, 73 insertions(+), 93 deletions(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 4cf1ef935ed9..9e4d0f0aeb6d 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -66,12 +66,6 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
 }
 
-static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
-{
-	return (f6i->fib6_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
-	       RTF_GATEWAY;
-}
-
 void ip6_route_input(struct sk_buff *skb);
 struct dst_entry *ip6_route_input_lookup(struct net *net,
 					 struct net_device *dev,
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index d1dc6017f5a6..f9132a6de917 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -934,19 +934,19 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
 {
 	struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
 				    lockdep_is_held(&rt->fib6_table->tb6_lock));
-	struct fib6_info *iter = NULL;
+	struct fib6_info *iter = NULL, *match = NULL;
 	struct fib6_info __rcu **ins;
-	struct fib6_info __rcu **fallback_ins = NULL;
 	int replace = (info->nlh &&
 		       (info->nlh->nlmsg_flags & NLM_F_REPLACE));
+	int append = (info->nlh &&
+		       (info->nlh->nlmsg_flags & NLM_F_APPEND));
 	int add = (!info->nlh ||
 		   (info->nlh->nlmsg_flags & NLM_F_CREATE));
 	int found = 0;
-	bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
 	u16 nlflags = NLM_F_EXCL;
 	int err;
 
-	if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
+	if (append)
 		nlflags |= NLM_F_APPEND;
 
 	ins = &fn->leaf;
@@ -968,13 +968,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
 
 			nlflags &= ~NLM_F_EXCL;
 			if (replace) {
-				if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
-					found++;
-					break;
-				}
-				if (rt_can_ecmp)
-					fallback_ins = fallback_ins ?: ins;
-				goto next_iter;
+				found++;
+				break;
 			}
 
 			if (rt6_duplicate_nexthop(iter, rt)) {
@@ -989,86 +984,67 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
 				fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu);
 				return -EEXIST;
 			}
-			/* If we have the same destination and the same metric,
-			 * but not the same gateway, then the route we try to
-			 * add is sibling to this route, increment our counter
-			 * of siblings, and later we will add our route to the
-			 * list.
-			 * Only static routes (which don't have flag
-			 * RTF_EXPIRES) are used for ECMPv6.
-			 *
-			 * To avoid long list, we only had siblings if the
-			 * route have a gateway.
-			 */
-			if (rt_can_ecmp &&
-			    rt6_qualify_for_ecmp(iter))
-				rt->fib6_nsiblings++;
+
+			/* first route that matches */
+			if (!match)
+				match = iter;
 		}
 
 		if (iter->fib6_metric > rt->fib6_metric)
 			break;
 
-next_iter:
 		ins = &iter->fib6_next;
 	}
 
-	if (fallback_ins && !found) {
-		/* No ECMP-able route found, replace first non-ECMP one */
-		ins = fallback_ins;
-		iter = rcu_dereference_protected(*ins,
-				    lockdep_is_held(&rt->fib6_table->tb6_lock));
-		found++;
-	}
-
 	/* Reset round-robin state, if necessary */
 	if (ins == &fn->leaf)
 		fn->rr_ptr = NULL;
 
 	/* Link this route to others same route. */
-	if (rt->fib6_nsiblings) {
-		unsigned int fib6_nsiblings;
+	if (append && match) {
 		struct fib6_info *sibling, *temp_sibling;
 
-		/* Find the first route that have the same metric */
-		sibling = leaf;
-		while (sibling) {
-			if (sibling->fib6_metric == rt->fib6_metric &&
-			    rt6_qualify_for_ecmp(sibling)) {
-				list_add_tail(&rt->fib6_siblings,
-					      &sibling->fib6_siblings);
-				break;
-			}
-			sibling = rcu_dereference_protected(sibling->fib6_next,
-				    lockdep_is_held(&rt->fib6_table->tb6_lock));
+		if (rt->fib6_flags & RTF_REJECT) {
+			NL_SET_ERR_MSG(extack,
+				       "Can not append a REJECT route");
+			return -EINVAL;
+		} else if (match->fib6_flags & RTF_REJECT) {
+			NL_SET_ERR_MSG(extack,
+				       "Can not append to a REJECT route");
+			return -EINVAL;
 		}
+		rt->fib6_nsiblings = match->fib6_nsiblings;
+		list_add_tail(&rt->fib6_siblings, &match->fib6_siblings);
+		match->fib6_nsiblings++;
+
 		/* For each sibling in the list, increment the counter of
 		 * siblings. BUG() if counters does not match, list of siblings
 		 * is broken!
 		 */
-		fib6_nsiblings = 0;
 		list_for_each_entry_safe(sibling, temp_sibling,
-					 &rt->fib6_siblings, fib6_siblings) {
+					 &match->fib6_siblings, fib6_siblings) {
 			sibling->fib6_nsiblings++;
-			BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings);
-			fib6_nsiblings++;
+			BUG_ON(sibling->fib6_nsiblings != match->fib6_nsiblings);
 		}
-		BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
-		rt6_multipath_rebalance(temp_sibling);
+
+		rt6_multipath_rebalance(match);
 	}
 
 	/*
 	 *	insert node
 	 */
 	if (!replace) {
+		enum fib_event_type event;
+
 		if (!add)
 			pr_warn("NLM_F_CREATE should be set when creating new route\n");
 
 add:
 		nlflags |= NLM_F_CREATE;
 
-		err = call_fib6_entry_notifiers(info->nl_net,
-						FIB_EVENT_ENTRY_ADD,
-						rt, extack);
+		event = append ? FIB_EVENT_ENTRY_APPEND : FIB_EVENT_ENTRY_ADD;
+		err = call_fib6_entry_notifiers(info->nl_net, event, rt,
+						extack);
 		if (err)
 			return err;
 
@@ -1086,7 +1062,7 @@ add:
 		}
 
 	} else {
-		int nsiblings;
+		struct fib6_info *tmp;
 
 		if (!found) {
 			if (add)
@@ -1101,48 +1077,57 @@ add:
 		if (err)
 			return err;
 
+		/* if route being replaced has siblings, set tmp to
+		 * last one, otherwise tmp is current route. this is
+		 * used to set fib6_next for new route
+		 */
+		if (iter->fib6_nsiblings)
+			tmp = list_last_entry(&iter->fib6_siblings,
+					      struct fib6_info,
+					      fib6_siblings);
+		else
+			tmp = iter;
+
+		/* insert new route */
 		atomic_inc(&rt->fib6_ref);
 		rcu_assign_pointer(rt->fib6_node, fn);
-		rt->fib6_next = iter->fib6_next;
+		rt->fib6_next = tmp->fib6_next;
 		rcu_assign_pointer(*ins, rt);
+
 		if (!info->skip_notify)
 			inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
 		if (!(fn->fn_flags & RTN_RTINFO)) {
 			info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
 			fn->fn_flags |= RTN_RTINFO;
 		}
-		nsiblings = iter->fib6_nsiblings;
-		iter->fib6_node = NULL;
-		fib6_purge_rt(iter, fn, info->nl_net);
-		if (rcu_access_pointer(fn->rr_ptr) == iter)
-			fn->rr_ptr = NULL;
-		fib6_info_release(iter);
 
-		if (nsiblings) {
+		/* delete old route */
+		rt = iter;
+
+		if (rt->fib6_nsiblings) {
+			struct fib6_info *tmp;
+
 			/* Replacing an ECMP route, remove all siblings */
-			ins = &rt->fib6_next;
-			iter = rcu_dereference_protected(*ins,
-				    lockdep_is_held(&rt->fib6_table->tb6_lock));
-			while (iter) {
-				if (iter->fib6_metric > rt->fib6_metric)
-					break;
-				if (rt6_qualify_for_ecmp(iter)) {
-					*ins = iter->fib6_next;
-					iter->fib6_node = NULL;
-					fib6_purge_rt(iter, fn, info->nl_net);
-					if (rcu_access_pointer(fn->rr_ptr) == iter)
-						fn->rr_ptr = NULL;
-					fib6_info_release(iter);
-					nsiblings--;
-					info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
-				} else {
-					ins = &iter->fib6_next;
-				}
-				iter = rcu_dereference_protected(*ins,
-					lockdep_is_held(&rt->fib6_table->tb6_lock));
+			list_for_each_entry_safe(iter, tmp, &rt->fib6_siblings,
+						 fib6_siblings) {
+				iter->fib6_node = NULL;
+				fib6_purge_rt(iter, fn, info->nl_net);
+				if (rcu_access_pointer(fn->rr_ptr) == iter)
+					fn->rr_ptr = NULL;
+				fib6_info_release(iter);
+
+				rt->fib6_nsiblings--;
+				info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
 			}
-			WARN_ON(nsiblings != 0);
 		}
+
+		WARN_ON(rt->fib6_nsiblings != 0);
+
+		rt->fib6_node = NULL;
+		fib6_purge_rt(rt, fn, info->nl_net);
+		if (rcu_access_pointer(fn->rr_ptr) == rt)
+			fn->rr_ptr = NULL;
+		fib6_info_release(rt);
 	}
 
 	return 0;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index cc24ed3bc334..bcb8785c0451 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3791,7 +3791,7 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
 			lockdep_is_held(&rt->fib6_table->tb6_lock));
 	while (iter) {
 		if (iter->fib6_metric == rt->fib6_metric &&
-		    rt6_qualify_for_ecmp(iter))
+		    iter->fib6_nsiblings)
 			return iter;
 		iter = rcu_dereference_protected(iter->fib6_next,
 				lockdep_is_held(&rt->fib6_table->tb6_lock));
@@ -4381,6 +4381,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
 		 */
 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
 						     NLM_F_REPLACE);
+		cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_APPEND;
 		nhn++;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 37ce42c14edad0d979444d8a589ae2299edb464d Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 21 May 2018 10:26:54 -0700
Subject: selftests: fib_tests: Add success-fail counts

As more tests are added, it is convenient to have a tally at the end.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/fib_tests.sh | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh
index 9164e60d4b66..7e2291161e15 100755
--- a/tools/testing/selftests/net/fib_tests.sh
+++ b/tools/testing/selftests/net/fib_tests.sh
@@ -18,8 +18,10 @@ log_test()
 
 	if [ ${rc} -eq ${expected} ]; then
 		printf "    TEST: %-60s  [ OK ]\n" "${msg}"
+		nsuccess=$((nsuccess+1))
 	else
 		ret=1
+		nfail=$((nfail+1))
 		printf "    TEST: %-60s  [FAIL]\n" "${msg}"
 		if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
 		echo
@@ -598,4 +600,9 @@ cleanup &> /dev/null
 
 fib_test
 
+if [ "$TESTS" != "none" ]; then
+	printf "\nTests passed: %3d\n" ${nsuccess}
+	printf "Tests failed: %3d\n"   ${nfail}
+fi
+
 exit $ret
-- 
cgit v1.2.3-59-g8ed1b


From 1c7447b4e8966a9c3a5eafc19c5ca20c385025c1 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 21 May 2018 10:26:55 -0700
Subject: selftests: fib_tests: Add command line options

Add command line options for controlling pause on fail, controlling
specific tests to run and verbose mode rather than relying on environment
variables.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/fib_tests.sh | 53 ++++++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 13 deletions(-)

diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh
index 7e2291161e15..8c99f0689efc 100755
--- a/tools/testing/selftests/net/fib_tests.sh
+++ b/tools/testing/selftests/net/fib_tests.sh
@@ -6,8 +6,9 @@
 
 ret=0
 
-VERBOSE=${VERBOSE:=0}
-PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no}
+TESTS="unregister down carrier nexthop"
+VERBOSE=0
+PAUSE_ON_FAIL=no
 IP="ip -netns testns"
 
 log_test()
@@ -565,20 +566,36 @@ fib_nexthop_test()
 }
 
 ################################################################################
-#
+# usage
 
-fib_test()
+usage()
 {
-	if [ -n "$TEST" ]; then
-		eval $TEST
-	else
-		fib_unreg_test
-		fib_down_test
-		fib_carrier_test
-		fib_nexthop_test
-	fi
+	cat <<EOF
+usage: ${0##*/} OPTS
+
+        -t <test>   Test(s) to run (default: all)
+                    (options: $TESTS)
+        -p          Pause on fail
+        -v          verbose mode (show commands and output)
+EOF
 }
 
+################################################################################
+# main
+
+while getopts :t:pPhv o
+do
+	case $o in
+		t) TESTS=$OPTARG;;
+		p) PAUSE_ON_FAIL=yes;;
+		v) VERBOSE=$(($VERBOSE + 1));;
+		h) usage; exit 0;;
+		*) usage; exit 1;;
+	esac
+done
+
+PEER_CMD="ip netns exec ${PEER_NS}"
+
 if [ "$(id -u)" -ne 0 ];then
 	echo "SKIP: Need root privileges"
 	exit 0
@@ -598,7 +615,17 @@ fi
 # start clean
 cleanup &> /dev/null
 
-fib_test
+for t in $TESTS
+do
+	case $t in
+	fib_unreg_test|unregister)	fib_unreg_test;;
+	fib_down_test|down)		fib_down_test;;
+	fib_carrier_test|carrier)	fib_carrier_test;;
+	fib_nexthop_test|nexthop)	fib_nexthop_test;;
+
+	help) echo "Test names: $TESTS"; exit 0;;
+	esac
+done
 
 if [ "$TESTS" != "none" ]; then
 	printf "\nTests passed: %3d\n" ${nsuccess}
-- 
cgit v1.2.3-59-g8ed1b


From 7df15e6c3e83850eeec13a9a89a494a898344e93 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 21 May 2018 10:26:56 -0700
Subject: selftests: fib_tests: Add option to pause after each test

Add option to pause after each test before cleanup is done. Allows
user to do manual inspection or more ad-hoc testing after each test
with the setup in tact.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/fib_tests.sh | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh
index 8c99f0689efc..12b648826151 100755
--- a/tools/testing/selftests/net/fib_tests.sh
+++ b/tools/testing/selftests/net/fib_tests.sh
@@ -9,6 +9,7 @@ ret=0
 TESTS="unregister down carrier nexthop"
 VERBOSE=0
 PAUSE_ON_FAIL=no
+PAUSE=no
 IP="ip -netns testns"
 
 log_test()
@@ -31,6 +32,13 @@ log_test()
 			[ "$a" = "q" ] && exit 1
 		fi
 	fi
+
+	if [ "${PAUSE}" = "yes" ]; then
+		echo
+		echo "hit enter to continue, 'q' to quit"
+		read a
+		[ "$a" = "q" ] && exit 1
+	fi
 }
 
 setup()
@@ -576,6 +584,7 @@ usage: ${0##*/} OPTS
         -t <test>   Test(s) to run (default: all)
                     (options: $TESTS)
         -p          Pause on fail
+        -P          Pause after each test before cleanup
         -v          verbose mode (show commands and output)
 EOF
 }
@@ -588,6 +597,7 @@ do
 	case $o in
 		t) TESTS=$OPTARG;;
 		p) PAUSE_ON_FAIL=yes;;
+		P) PAUSE=yes;;
 		v) VERBOSE=$(($VERBOSE + 1));;
 		h) usage; exit 0;;
 		*) usage; exit 1;;
@@ -596,6 +606,9 @@ done
 
 PEER_CMD="ip netns exec ${PEER_NS}"
 
+# make sure we don't pause twice
+[ "${PAUSE}" = "yes" ] && PAUSE_ON_FAIL=no
+
 if [ "$(id -u)" -ne 0 ];then
 	echo "SKIP: Need root privileges"
 	exit 0
-- 
cgit v1.2.3-59-g8ed1b


From f9a5a9d89ffac4da32c156879b57d573bc89117c Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 21 May 2018 10:26:57 -0700
Subject: selftests: fib_tests: Add ipv6 route add append replace tests

Add IPv6 route tests covering add, append and replace permutations.
Assumes the ability to add a basic single path route works; this is
required for example when adding an address to an interface.

$ fib_tests.sh -t ipv6_rt

IPv6 route add / append tests
    TEST: Attempt to add duplicate route - gw                           [ OK ]
    TEST: Attempt to add duplicate route - dev only                     [ OK ]
    TEST: Attempt to add duplicate route - reject route                 [ OK ]
    TEST: Add new route for existing prefix (w/o NLM_F_EXCL)            [ OK ]
    TEST: Append nexthop to existing route - gw                         [ OK ]
    TEST: Append nexthop to existing route - dev only                   [ OK ]
    TEST: Append nexthop to existing route - reject route               [ OK ]
    TEST: Append nexthop to existing reject route - gw                  [ OK ]
    TEST: Append nexthop to existing reject route - dev only            [ OK ]
    TEST: Add multipath route                                           [ OK ]
    TEST: Attempt to add duplicate multipath route                      [ OK ]
    TEST: Route add with different metrics                              [ OK ]
    TEST: Route delete with metric                                      [ OK ]

IPv6 route replace tests
    TEST: Single path with single path                                  [ OK ]
    TEST: Single path with multipath                                    [ OK ]
    TEST: Single path with reject route                                 [ OK ]
    TEST: Single path with single path via multipath attribute          [ OK ]
    TEST: Invalid nexthop                                               [ OK ]
    TEST: Single path - replace of non-existent route                   [ OK ]
    TEST: Multipath with multipath                                      [ OK ]
    TEST: Multipath with single path                                    [ OK ]
    TEST: Multipath with single path via multipath attribute            [ OK ]
    TEST: Multipath with reject route                                   [ OK ]
    TEST: Multipath - invalid first nexthop                             [ OK ]
    TEST: Multipath - invalid second nexthop                            [ OK ]
    TEST: Multipath - replace of non-existent route                     [ OK ]

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/fib_tests.sh | 335 ++++++++++++++++++++++++++++++-
 1 file changed, 334 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh
index 12b648826151..384828f28372 100755
--- a/tools/testing/selftests/net/fib_tests.sh
+++ b/tools/testing/selftests/net/fib_tests.sh
@@ -6,7 +6,7 @@
 
 ret=0
 
-TESTS="unregister down carrier nexthop"
+TESTS="unregister down carrier nexthop ipv6_rt"
 VERBOSE=0
 PAUSE_ON_FAIL=no
 PAUSE=no
@@ -573,6 +573,338 @@ fib_nexthop_test()
 	cleanup
 }
 
+################################################################################
+# Tests on route add and replace
+
+run_cmd()
+{
+	local cmd="$1"
+	local out
+	local stderr="2>/dev/null"
+
+	if [ "$VERBOSE" = "1" ]; then
+		printf "    COMMAND: $cmd\n"
+		stderr=
+	fi
+
+	out=$(eval $cmd $stderr)
+	rc=$?
+	if [ "$VERBOSE" = "1" -a -n "$out" ]; then
+		echo "    $out"
+	fi
+
+	[ "$VERBOSE" = "1" ] && echo
+
+	return $rc
+}
+
+# add route for a prefix, flushing any existing routes first
+# expected to be the first step of a test
+add_route6()
+{
+	local pfx="$1"
+	local nh="$2"
+	local out
+
+	if [ "$VERBOSE" = "1" ]; then
+		echo
+		echo "    ##################################################"
+		echo
+	fi
+
+	run_cmd "$IP -6 ro flush ${pfx}"
+	[ $? -ne 0 ] && exit 1
+
+	out=$($IP -6 ro ls match ${pfx})
+	if [ -n "$out" ]; then
+		echo "Failed to flush routes for prefix used for tests."
+		exit 1
+	fi
+
+	run_cmd "$IP -6 ro add ${pfx} ${nh}"
+	if [ $? -ne 0 ]; then
+		echo "Failed to add initial route for test."
+		exit 1
+	fi
+}
+
+# add initial route - used in replace route tests
+add_initial_route6()
+{
+	add_route6 "2001:db8:104::/64" "$1"
+}
+
+check_route6()
+{
+	local pfx="2001:db8:104::/64"
+	local expected="$1"
+	local out
+	local rc=0
+
+	out=$($IP -6 ro ls match ${pfx} | sed -e 's/ pref medium//')
+	if [ -z "${out}" ]; then
+		if [ "$VERBOSE" = "1" ]; then
+			printf "\nNo route entry found\n"
+			printf "Expected:\n"
+			printf "    ${expected}\n"
+		fi
+		return 1
+	fi
+
+	# tricky way to convert output to 1-line without ip's
+	# messy '\'; this drops all extra white space
+	out=$(echo ${out})
+	if [ "${out}" != "${expected}" ]; then
+		rc=1
+		if [ "${VERBOSE}" = "1" ]; then
+			printf "    Unexpected route entry. Have:\n"
+			printf "        ${out}\n"
+			printf "    Expected:\n"
+			printf "        ${expected}\n\n"
+		fi
+	fi
+
+	return $rc
+}
+
+route_cleanup()
+{
+	$IP li del red 2>/dev/null
+	$IP li del dummy1 2>/dev/null
+	$IP li del veth1 2>/dev/null
+	$IP li del veth3 2>/dev/null
+
+	cleanup &> /dev/null
+}
+
+route_setup()
+{
+	route_cleanup
+	setup
+
+	[ "${VERBOSE}" = "1" ] && set -x
+	set -e
+
+	$IP li add red up type vrf table 101
+	$IP li add veth1 type veth peer name veth2
+	$IP li add veth3 type veth peer name veth4
+
+	$IP li set veth1 up
+	$IP li set veth3 up
+	$IP li set veth2 vrf red up
+	$IP li set veth4 vrf red up
+	$IP li add dummy1 type dummy
+	$IP li set dummy1 vrf red up
+
+	$IP -6 addr add 2001:db8:101::1/64 dev veth1
+	$IP -6 addr add 2001:db8:101::2/64 dev veth2
+	$IP -6 addr add 2001:db8:103::1/64 dev veth3
+	$IP -6 addr add 2001:db8:103::2/64 dev veth4
+	$IP -6 addr add 2001:db8:104::1/64 dev dummy1
+
+	set +ex
+}
+
+# assumption is that basic add of a single path route works
+# otherwise just adding an address on an interface is broken
+ipv6_rt_add()
+{
+	local rc
+
+	echo
+	echo "IPv6 route add / append tests"
+
+	# route add same prefix - fails with EEXISTS b/c ip adds NLM_F_EXCL
+	add_route6 "2001:db8:104::/64" "via 2001:db8:101::2"
+	run_cmd "$IP -6 ro add 2001:db8:104::/64 via 2001:db8:103::2"
+	log_test $? 2 "Attempt to add duplicate route - gw"
+
+	# route add same prefix - fails with EEXISTS b/c ip adds NLM_F_EXCL
+	add_route6 "2001:db8:104::/64" "via 2001:db8:101::2"
+	run_cmd "$IP -6 ro add 2001:db8:104::/64 dev veth3"
+	log_test $? 2 "Attempt to add duplicate route - dev only"
+
+	# route add same prefix - fails with EEXISTS b/c ip adds NLM_F_EXCL
+	add_route6 "2001:db8:104::/64" "via 2001:db8:101::2"
+	run_cmd "$IP -6 ro add unreachable 2001:db8:104::/64"
+	log_test $? 2 "Attempt to add duplicate route - reject route"
+
+	# iproute2 prepend only sets NLM_F_CREATE
+	# - adds a new route; does NOT convert existing route to ECMP
+	add_route6 "2001:db8:104::/64" "via 2001:db8:101::2"
+	run_cmd "$IP -6 ro prepend 2001:db8:104::/64 via 2001:db8:103::2"
+	check_route6 "2001:db8:104::/64 via 2001:db8:101::2 dev veth1 metric 1024 2001:db8:104::/64 via 2001:db8:103::2 dev veth3 metric 1024"
+	log_test $? 0 "Add new route for existing prefix (w/o NLM_F_EXCL)"
+
+	# route append with same prefix adds a new route
+	# - iproute2 sets NLM_F_CREATE | NLM_F_APPEND
+	add_route6 "2001:db8:104::/64" "via 2001:db8:101::2"
+	run_cmd "$IP -6 ro append 2001:db8:104::/64 via 2001:db8:103::2"
+	check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1"
+	log_test $? 0 "Append nexthop to existing route - gw"
+
+	add_route6 "2001:db8:104::/64" "via 2001:db8:101::2"
+	run_cmd "$IP -6 ro append 2001:db8:104::/64 dev veth3"
+	check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop dev veth3 weight 1"
+	log_test $? 0 "Append nexthop to existing route - dev only"
+
+	# multipath route can not have a nexthop that is a reject route
+	add_route6 "2001:db8:104::/64" "via 2001:db8:101::2"
+	run_cmd "$IP -6 ro append unreachable 2001:db8:104::/64"
+	log_test $? 2 "Append nexthop to existing route - reject route"
+
+	# reject route can not be converted to multipath route
+	run_cmd "$IP -6 ro flush 2001:db8:104::/64"
+	run_cmd "$IP -6 ro add unreachable 2001:db8:104::/64"
+	run_cmd "$IP -6 ro append 2001:db8:104::/64 via 2001:db8:103::2"
+	log_test $? 2 "Append nexthop to existing reject route - gw"
+
+	run_cmd "$IP -6 ro flush 2001:db8:104::/64"
+	run_cmd "$IP -6 ro add unreachable 2001:db8:104::/64"
+	run_cmd "$IP -6 ro append 2001:db8:104::/64 dev veth3"
+	log_test $? 2 "Append nexthop to existing reject route - dev only"
+
+	# insert mpath directly
+	add_route6 "2001:db8:104::/64" "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
+	check_route6  "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1"
+	log_test $? 0 "Add multipath route"
+
+	add_route6 "2001:db8:104::/64" "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
+	run_cmd "$IP -6 ro add 2001:db8:104::/64 nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
+	log_test $? 2 "Attempt to add duplicate multipath route"
+
+	# insert of a second route without append but different metric
+	add_route6 "2001:db8:104::/64" "via 2001:db8:101::2"
+	run_cmd "$IP -6 ro add 2001:db8:104::/64 via 2001:db8:103::2 metric 512"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		run_cmd "$IP -6 ro add 2001:db8:104::/64 via 2001:db8:103::3 metric 256"
+		rc=$?
+	fi
+	log_test $rc 0 "Route add with different metrics"
+
+	run_cmd "$IP -6 ro del 2001:db8:104::/64 metric 512"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		check_route6 "2001:db8:104::/64 via 2001:db8:103::3 dev veth3 metric 256 2001:db8:104::/64 via 2001:db8:101::2 dev veth1 metric 1024"
+		rc=$?
+	fi
+	log_test $rc 0 "Route delete with metric"
+}
+
+ipv6_rt_replace_single()
+{
+	# single path with single path
+	#
+	add_initial_route6 "via 2001:db8:101::2"
+	run_cmd "$IP -6 ro replace 2001:db8:104::/64 via 2001:db8:103::2"
+	check_route6 "2001:db8:104::/64 via 2001:db8:103::2 dev veth3 metric 1024"
+	log_test $? 0 "Single path with single path"
+
+	# single path with multipath
+	#
+	add_initial_route6 "nexthop via 2001:db8:101::2"
+	run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:101::3 nexthop via 2001:db8:103::2"
+	check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::3 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1"
+	log_test $? 0 "Single path with multipath"
+
+	# single path with reject
+	#
+	add_initial_route6 "nexthop via 2001:db8:101::2"
+	run_cmd "$IP -6 ro replace unreachable 2001:db8:104::/64"
+	check_route6 "unreachable 2001:db8:104::/64 dev lo metric 1024"
+	log_test $? 0 "Single path with reject route"
+
+	# single path with single path using MULTIPATH attribute
+	#
+	add_initial_route6 "via 2001:db8:101::2"
+	run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:103::2"
+	check_route6 "2001:db8:104::/64 via 2001:db8:103::2 dev veth3 metric 1024"
+	log_test $? 0 "Single path with single path via multipath attribute"
+
+	# route replace fails - invalid nexthop
+	add_initial_route6 "via 2001:db8:101::2"
+	run_cmd "$IP -6 ro replace 2001:db8:104::/64 via 2001:db8:104::2"
+	if [ $? -eq 0 ]; then
+		# previous command is expected to fail so if it returns 0
+		# that means the test failed.
+		log_test 0 1 "Invalid nexthop"
+	else
+		check_route6 "2001:db8:104::/64 via 2001:db8:101::2 dev veth1 metric 1024"
+		log_test $? 0 "Invalid nexthop"
+	fi
+
+	# replace non-existent route
+	# - note use of change versus replace since ip adds NLM_F_CREATE
+	#   for replace
+	add_initial_route6 "via 2001:db8:101::2"
+	run_cmd "$IP -6 ro change 2001:db8:105::/64 via 2001:db8:101::2"
+	log_test $? 2 "Single path - replace of non-existent route"
+}
+
+ipv6_rt_replace_mpath()
+{
+	# multipath with multipath
+	add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
+	run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:101::3 nexthop via 2001:db8:103::3"
+	check_route6  "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::3 dev veth1 weight 1 nexthop via 2001:db8:103::3 dev veth3 weight 1"
+	log_test $? 0 "Multipath with multipath"
+
+	# multipath with single
+	add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
+	run_cmd "$IP -6 ro replace 2001:db8:104::/64 via 2001:db8:101::3"
+	check_route6  "2001:db8:104::/64 via 2001:db8:101::3 dev veth1 metric 1024"
+	log_test $? 0 "Multipath with single path"
+
+	# multipath with single
+	add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
+	run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:101::3"
+	check_route6 "2001:db8:104::/64 via 2001:db8:101::3 dev veth1 metric 1024"
+	log_test $? 0 "Multipath with single path via multipath attribute"
+
+	# multipath with reject
+	add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
+	run_cmd "$IP -6 ro replace unreachable 2001:db8:104::/64"
+	check_route6 "unreachable 2001:db8:104::/64 dev lo metric 1024"
+	log_test $? 0 "Multipath with reject route"
+
+	# route replace fails - invalid nexthop 1
+	add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
+	run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:111::3 nexthop via 2001:db8:103::3"
+	check_route6  "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1"
+	log_test $? 0 "Multipath - invalid first nexthop"
+
+	# route replace fails - invalid nexthop 2
+	add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
+	run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:101::3 nexthop via 2001:db8:113::3"
+	check_route6  "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1"
+	log_test $? 0 "Multipath - invalid second nexthop"
+
+	# multipath non-existent route
+	add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
+	run_cmd "$IP -6 ro change 2001:db8:105::/64 nexthop via 2001:db8:101::3 nexthop via 2001:db8:103::3"
+	log_test $? 2 "Multipath - replace of non-existent route"
+}
+
+ipv6_rt_replace()
+{
+	echo
+	echo "IPv6 route replace tests"
+
+	ipv6_rt_replace_single
+	ipv6_rt_replace_mpath
+}
+
+ipv6_route_test()
+{
+	route_setup
+
+	ipv6_rt_add
+	ipv6_rt_replace
+
+	route_cleanup
+}
+
 ################################################################################
 # usage
 
@@ -635,6 +967,7 @@ do
 	fib_down_test|down)		fib_down_test;;
 	fib_carrier_test|carrier)	fib_carrier_test;;
 	fib_nexthop_test|nexthop)	fib_nexthop_test;;
+	ipv6_route_test|ipv6_rt)	ipv6_route_test;;
 
 	help) echo "Test names: $TESTS"; exit 0;;
 	esac
-- 
cgit v1.2.3-59-g8ed1b


From abb1860aac20868c74da2f4974d8ed05827e2854 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 21 May 2018 10:26:58 -0700
Subject: selftests: fib_tests: Add ipv4 route add append replace tests

Add IPv4 route tests covering add, append and replace permutations.
Assumes the ability to add a basic single path route works; this is
required for example when adding an address to an interface.

$ fib_tests.sh -t ipv4_rt

IPv4 route add / append tests
    TEST: Attempt to add duplicate route - gw                           [ OK ]
    TEST: Attempt to add duplicate route - dev only                     [ OK ]
    TEST: Attempt to add duplicate route - reject route                 [ OK ]
    TEST: Add new nexthop for existing prefix                           [ OK ]
    TEST: Append nexthop to existing route - gw                         [ OK ]
    TEST: Append nexthop to existing route - dev only                   [ OK ]
    TEST: Append nexthop to existing route - reject route               [ OK ]
    TEST: Append nexthop to existing reject route - gw                  [ OK ]
    TEST: Append nexthop to existing reject route - dev only            [ OK ]
    TEST: add multipath route                                           [ OK ]
    TEST: Attempt to add duplicate multipath route                      [ OK ]
    TEST: Route add with different metrics                              [ OK ]
    TEST: Route delete with metric                                      [ OK ]

IPv4 route replace tests
    TEST: Single path with single path                                  [ OK ]
    TEST: Single path with multipath                                    [ OK ]
    TEST: Single path with reject route                                 [ OK ]
    TEST: Single path with single path via multipath attribute          [ OK ]
    TEST: Invalid nexthop                                               [ OK ]
    TEST: Single path - replace of non-existent route                   [ OK ]
    TEST: Multipath with multipath                                      [ OK ]
    TEST: Multipath with single path                                    [ OK ]
    TEST: Multipath with single path via multipath attribute            [ OK ]
    TEST: Multipath with reject route                                   [ OK ]
    TEST: Multipath - invalid first nexthop                             [ OK ]
    TEST: Multipath - invalid second nexthop                            [ OK ]
    TEST: Multipath - replace of non-existent route                     [ OK ]

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/fib_tests.sh | 279 ++++++++++++++++++++++++++++++-
 1 file changed, 278 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh
index 384828f28372..e7d76fbc36e9 100755
--- a/tools/testing/selftests/net/fib_tests.sh
+++ b/tools/testing/selftests/net/fib_tests.sh
@@ -6,7 +6,7 @@
 
 ret=0
 
-TESTS="unregister down carrier nexthop ipv6_rt"
+TESTS="unregister down carrier nexthop ipv6_rt ipv4_rt"
 VERBOSE=0
 PAUSE_ON_FAIL=no
 PAUSE=no
@@ -702,6 +702,12 @@ route_setup()
 	$IP -6 addr add 2001:db8:103::2/64 dev veth4
 	$IP -6 addr add 2001:db8:104::1/64 dev dummy1
 
+	$IP addr add 172.16.101.1/24 dev veth1
+	$IP addr add 172.16.101.2/24 dev veth2
+	$IP addr add 172.16.103.1/24 dev veth3
+	$IP addr add 172.16.103.2/24 dev veth4
+	$IP addr add 172.16.104.1/24 dev dummy1
+
 	set +ex
 }
 
@@ -905,6 +911,276 @@ ipv6_route_test()
 	route_cleanup
 }
 
+# add route for a prefix, flushing any existing routes first
+# expected to be the first step of a test
+add_route()
+{
+	local pfx="$1"
+	local nh="$2"
+	local out
+
+	if [ "$VERBOSE" = "1" ]; then
+		echo
+		echo "    ##################################################"
+		echo
+	fi
+
+	run_cmd "$IP ro flush ${pfx}"
+	[ $? -ne 0 ] && exit 1
+
+	out=$($IP ro ls match ${pfx})
+	if [ -n "$out" ]; then
+		echo "Failed to flush routes for prefix used for tests."
+		exit 1
+	fi
+
+	run_cmd "$IP ro add ${pfx} ${nh}"
+	if [ $? -ne 0 ]; then
+		echo "Failed to add initial route for test."
+		exit 1
+	fi
+}
+
+# add initial route - used in replace route tests
+add_initial_route()
+{
+	add_route "172.16.104.0/24" "$1"
+}
+
+check_route()
+{
+	local pfx="172.16.104.0/24"
+	local expected="$1"
+	local out
+	local rc=0
+
+	out=$($IP ro ls match ${pfx})
+	if [ -z "${out}" ]; then
+		if [ "$VERBOSE" = "1" ]; then
+			printf "\nNo route entry found\n"
+			printf "Expected:\n"
+			printf "    ${expected}\n"
+		fi
+		return 1
+	fi
+
+	# tricky way to convert output to 1-line without ip's
+	# messy '\'; this drops all extra white space
+	out=$(echo ${out})
+	if [ "${out}" != "${expected}" ]; then
+		rc=1
+		if [ "${VERBOSE}" = "1" ]; then
+			printf "    Unexpected route entry. Have:\n"
+			printf "        ${out}\n"
+			printf "    Expected:\n"
+			printf "        ${expected}\n\n"
+		fi
+	fi
+
+	return $rc
+}
+
+# assumption is that basic add of a single path route works
+# otherwise just adding an address on an interface is broken
+ipv4_rt_add()
+{
+	local rc
+
+	echo
+	echo "IPv4 route add / append tests"
+
+	# route add same prefix - fails with EEXISTS b/c ip adds NLM_F_EXCL
+	add_route "172.16.104.0/24" "via 172.16.101.2"
+	run_cmd "$IP ro add 172.16.104.0/24 via 172.16.103.2"
+	log_test $? 2 "Attempt to add duplicate route - gw"
+
+	# route add same prefix - fails with EEXISTS b/c ip adds NLM_F_EXCL
+	add_route "172.16.104.0/24" "via 172.16.101.2"
+	run_cmd "$IP ro add 172.16.104.0/24 dev veth3"
+	log_test $? 2 "Attempt to add duplicate route - dev only"
+
+	# route add same prefix - fails with EEXISTS b/c ip adds NLM_F_EXCL
+	add_route "172.16.104.0/24" "via 172.16.101.2"
+	run_cmd "$IP ro add unreachable 172.16.104.0/24"
+	log_test $? 2 "Attempt to add duplicate route - reject route"
+
+	# iproute2 prepend only sets NLM_F_CREATE
+	# - adds a new route; does NOT convert existing route to ECMP
+	add_route "172.16.104.0/24" "via 172.16.101.2"
+	run_cmd "$IP ro prepend 172.16.104.0/24 via 172.16.103.2"
+	check_route "172.16.104.0/24 via 172.16.103.2 dev veth3 172.16.104.0/24 via 172.16.101.2 dev veth1"
+	log_test $? 0 "Add new nexthop for existing prefix"
+
+	# route append with same prefix adds a new route
+	# - iproute2 sets NLM_F_CREATE | NLM_F_APPEND
+	add_route "172.16.104.0/24" "via 172.16.101.2"
+	run_cmd "$IP ro append 172.16.104.0/24 via 172.16.103.2"
+	check_route "172.16.104.0/24 via 172.16.101.2 dev veth1 172.16.104.0/24 via 172.16.103.2 dev veth3"
+	log_test $? 0 "Append nexthop to existing route - gw"
+
+	add_route "172.16.104.0/24" "via 172.16.101.2"
+	run_cmd "$IP ro append 172.16.104.0/24 dev veth3"
+	check_route "172.16.104.0/24 via 172.16.101.2 dev veth1 172.16.104.0/24 dev veth3 scope link"
+	log_test $? 0 "Append nexthop to existing route - dev only"
+
+	add_route "172.16.104.0/24" "via 172.16.101.2"
+	run_cmd "$IP ro append unreachable 172.16.104.0/24"
+	check_route "172.16.104.0/24 via 172.16.101.2 dev veth1 unreachable 172.16.104.0/24"
+	log_test $? 0 "Append nexthop to existing route - reject route"
+
+	run_cmd "$IP ro flush 172.16.104.0/24"
+	run_cmd "$IP ro add unreachable 172.16.104.0/24"
+	run_cmd "$IP ro append 172.16.104.0/24 via 172.16.103.2"
+	check_route "unreachable 172.16.104.0/24 172.16.104.0/24 via 172.16.103.2 dev veth3"
+	log_test $? 0 "Append nexthop to existing reject route - gw"
+
+	run_cmd "$IP ro flush 172.16.104.0/24"
+	run_cmd "$IP ro add unreachable 172.16.104.0/24"
+	run_cmd "$IP ro append 172.16.104.0/24 dev veth3"
+	check_route "unreachable 172.16.104.0/24 172.16.104.0/24 dev veth3 scope link"
+	log_test $? 0 "Append nexthop to existing reject route - dev only"
+
+	# insert mpath directly
+	add_route "172.16.104.0/24" "nexthop via 172.16.101.2 nexthop via 172.16.103.2"
+	check_route  "172.16.104.0/24 nexthop via 172.16.101.2 dev veth1 weight 1 nexthop via 172.16.103.2 dev veth3 weight 1"
+	log_test $? 0 "add multipath route"
+
+	add_route "172.16.104.0/24" "nexthop via 172.16.101.2 nexthop via 172.16.103.2"
+	run_cmd "$IP ro add 172.16.104.0/24 nexthop via 172.16.101.2 nexthop via 172.16.103.2"
+	log_test $? 2 "Attempt to add duplicate multipath route"
+
+	# insert of a second route without append but different metric
+	add_route "172.16.104.0/24" "via 172.16.101.2"
+	run_cmd "$IP ro add 172.16.104.0/24 via 172.16.103.2 metric 512"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		run_cmd "$IP ro add 172.16.104.0/24 via 172.16.103.3 metric 256"
+		rc=$?
+	fi
+	log_test $rc 0 "Route add with different metrics"
+
+	run_cmd "$IP ro del 172.16.104.0/24 metric 512"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		check_route "172.16.104.0/24 via 172.16.101.2 dev veth1 172.16.104.0/24 via 172.16.103.3 dev veth3 metric 256"
+		rc=$?
+	fi
+	log_test $rc 0 "Route delete with metric"
+}
+
+ipv4_rt_replace_single()
+{
+	# single path with single path
+	#
+	add_initial_route "via 172.16.101.2"
+	run_cmd "$IP ro replace 172.16.104.0/24 via 172.16.103.2"
+	check_route "172.16.104.0/24 via 172.16.103.2 dev veth3"
+	log_test $? 0 "Single path with single path"
+
+	# single path with multipath
+	#
+	add_initial_route "nexthop via 172.16.101.2"
+	run_cmd "$IP ro replace 172.16.104.0/24 nexthop via 172.16.101.3 nexthop via 172.16.103.2"
+	check_route "172.16.104.0/24 nexthop via 172.16.101.3 dev veth1 weight 1 nexthop via 172.16.103.2 dev veth3 weight 1"
+	log_test $? 0 "Single path with multipath"
+
+	# single path with reject
+	#
+	add_initial_route "nexthop via 172.16.101.2"
+	run_cmd "$IP ro replace unreachable 172.16.104.0/24"
+	check_route "unreachable 172.16.104.0/24"
+	log_test $? 0 "Single path with reject route"
+
+	# single path with single path using MULTIPATH attribute
+	#
+	add_initial_route "via 172.16.101.2"
+	run_cmd "$IP ro replace 172.16.104.0/24 nexthop via 172.16.103.2"
+	check_route "172.16.104.0/24 via 172.16.103.2 dev veth3"
+	log_test $? 0 "Single path with single path via multipath attribute"
+
+	# route replace fails - invalid nexthop
+	add_initial_route "via 172.16.101.2"
+	run_cmd "$IP ro replace 172.16.104.0/24 via 2001:db8:104::2"
+	if [ $? -eq 0 ]; then
+		# previous command is expected to fail so if it returns 0
+		# that means the test failed.
+		log_test 0 1 "Invalid nexthop"
+	else
+		check_route "172.16.104.0/24 via 172.16.101.2 dev veth1"
+		log_test $? 0 "Invalid nexthop"
+	fi
+
+	# replace non-existent route
+	# - note use of change versus replace since ip adds NLM_F_CREATE
+	#   for replace
+	add_initial_route "via 172.16.101.2"
+	run_cmd "$IP ro change 172.16.105.0/24 via 172.16.101.2"
+	log_test $? 2 "Single path - replace of non-existent route"
+}
+
+ipv4_rt_replace_mpath()
+{
+	# multipath with multipath
+	add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2"
+	run_cmd "$IP ro replace 172.16.104.0/24 nexthop via 172.16.101.3 nexthop via 172.16.103.3"
+	check_route  "172.16.104.0/24 nexthop via 172.16.101.3 dev veth1 weight 1 nexthop via 172.16.103.3 dev veth3 weight 1"
+	log_test $? 0 "Multipath with multipath"
+
+	# multipath with single
+	add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2"
+	run_cmd "$IP ro replace 172.16.104.0/24 via 172.16.101.3"
+	check_route  "172.16.104.0/24 via 172.16.101.3 dev veth1"
+	log_test $? 0 "Multipath with single path"
+
+	# multipath with single
+	add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2"
+	run_cmd "$IP ro replace 172.16.104.0/24 nexthop via 172.16.101.3"
+	check_route "172.16.104.0/24 via 172.16.101.3 dev veth1"
+	log_test $? 0 "Multipath with single path via multipath attribute"
+
+	# multipath with reject
+	add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2"
+	run_cmd "$IP ro replace unreachable 172.16.104.0/24"
+	check_route "unreachable 172.16.104.0/24"
+	log_test $? 0 "Multipath with reject route"
+
+	# route replace fails - invalid nexthop 1
+	add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2"
+	run_cmd "$IP ro replace 172.16.104.0/24 nexthop via 172.16.111.3 nexthop via 172.16.103.3"
+	check_route  "172.16.104.0/24 nexthop via 172.16.101.2 dev veth1 weight 1 nexthop via 172.16.103.2 dev veth3 weight 1"
+	log_test $? 0 "Multipath - invalid first nexthop"
+
+	# route replace fails - invalid nexthop 2
+	add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2"
+	run_cmd "$IP ro replace 172.16.104.0/24 nexthop via 172.16.101.3 nexthop via 172.16.113.3"
+	check_route  "172.16.104.0/24 nexthop via 172.16.101.2 dev veth1 weight 1 nexthop via 172.16.103.2 dev veth3 weight 1"
+	log_test $? 0 "Multipath - invalid second nexthop"
+
+	# multipath non-existent route
+	add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2"
+	run_cmd "$IP ro change 172.16.105.0/24 nexthop via 172.16.101.3 nexthop via 172.16.103.3"
+	log_test $? 2 "Multipath - replace of non-existent route"
+}
+
+ipv4_rt_replace()
+{
+	echo
+	echo "IPv4 route replace tests"
+
+	ipv4_rt_replace_single
+	ipv4_rt_replace_mpath
+}
+
+ipv4_route_test()
+{
+	route_setup
+
+	ipv4_rt_add
+	ipv4_rt_replace
+
+	route_cleanup
+}
+
 ################################################################################
 # usage
 
@@ -968,6 +1244,7 @@ do
 	fib_carrier_test|carrier)	fib_carrier_test;;
 	fib_nexthop_test|nexthop)	fib_nexthop_test;;
 	ipv6_route_test|ipv6_rt)	ipv6_route_test;;
+	ipv4_route_test|ipv4_rt)	ipv4_route_test;;
 
 	help) echo "Test names: $TESTS"; exit 0;;
 	esac
-- 
cgit v1.2.3-59-g8ed1b


From 290aa0ad74c995c60d94fb4f1d66d411efa13dd5 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@mellanox.com>
Date: Mon, 21 May 2018 23:03:04 +0300
Subject: net: sched: don't disable bh when accessing action idr

Initial net_device implementation used ingress_lock spinlock to synchronize
ingress path of device. This lock was used in both process and bh context.
In some code paths action map lock was obtained while holding ingress_lock.
Commit e1e992e52faa ("[NET_SCHED] protect action config/dump from irqs")
modified actions to always disable bh, while using action map lock, in
order to prevent deadlock on ingress_lock in softirq. This lock was removed
from net_device, so disabling bh, while accessing action map, is no longer
necessary.

Replace all action idr spinlock usage with regular calls that do not
disable bh.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_api.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 72251241665a..3f4cf930f809 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -77,9 +77,9 @@ static void free_tcf(struct tc_action *p)
 
 static void tcf_idr_remove(struct tcf_idrinfo *idrinfo, struct tc_action *p)
 {
-	spin_lock_bh(&idrinfo->lock);
+	spin_lock(&idrinfo->lock);
 	idr_remove(&idrinfo->action_idr, p->tcfa_index);
-	spin_unlock_bh(&idrinfo->lock);
+	spin_unlock(&idrinfo->lock);
 	gen_kill_estimator(&p->tcfa_rate_est);
 	free_tcf(p);
 }
@@ -156,7 +156,7 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
 	struct tc_action *p;
 	unsigned long id = 1;
 
-	spin_lock_bh(&idrinfo->lock);
+	spin_lock(&idrinfo->lock);
 
 	s_i = cb->args[0];
 
@@ -191,7 +191,7 @@ done:
 	if (index >= 0)
 		cb->args[0] = index + 1;
 
-	spin_unlock_bh(&idrinfo->lock);
+	spin_unlock(&idrinfo->lock);
 	if (n_i) {
 		if (act_flags & TCA_FLAG_LARGE_DUMP_ON)
 			cb->args[1] = n_i;
@@ -261,9 +261,9 @@ static struct tc_action *tcf_idr_lookup(u32 index, struct tcf_idrinfo *idrinfo)
 {
 	struct tc_action *p = NULL;
 
-	spin_lock_bh(&idrinfo->lock);
+	spin_lock(&idrinfo->lock);
 	p = idr_find(&idrinfo->action_idr, index);
-	spin_unlock_bh(&idrinfo->lock);
+	spin_unlock(&idrinfo->lock);
 
 	return p;
 }
@@ -323,7 +323,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
 	}
 	spin_lock_init(&p->tcfa_lock);
 	idr_preload(GFP_KERNEL);
-	spin_lock_bh(&idrinfo->lock);
+	spin_lock(&idrinfo->lock);
 	/* user doesn't specify an index */
 	if (!index) {
 		index = 1;
@@ -331,7 +331,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
 	} else {
 		err = idr_alloc_u32(idr, NULL, &index, index, GFP_ATOMIC);
 	}
-	spin_unlock_bh(&idrinfo->lock);
+	spin_unlock(&idrinfo->lock);
 	idr_preload_end();
 	if (err)
 		goto err3;
@@ -369,9 +369,9 @@ void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a)
 {
 	struct tcf_idrinfo *idrinfo = tn->idrinfo;
 
-	spin_lock_bh(&idrinfo->lock);
+	spin_lock(&idrinfo->lock);
 	idr_replace(&idrinfo->action_idr, a, a->tcfa_index);
-	spin_unlock_bh(&idrinfo->lock);
+	spin_unlock(&idrinfo->lock);
 }
 EXPORT_SYMBOL(tcf_idr_insert);
 
-- 
cgit v1.2.3-59-g8ed1b


From 9a9c9b51e54618861420093ae6e9b50a961914c5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 21 May 2018 15:08:56 -0700
Subject: tcp: add max_quickacks param to tcp_incr_quickack and
 tcp_enter_quickack_mode

We want to add finer control of the number of ACK packets sent after
ECN events.

This patch is not changing current behavior, it only enables following
change.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index aebb29ab2fdf..2e970e9f4e09 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -203,21 +203,23 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
 	}
 }
 
-static void tcp_incr_quickack(struct sock *sk)
+static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
 
 	if (quickacks == 0)
 		quickacks = 2;
+	quickacks = min(quickacks, max_quickacks);
 	if (quickacks > icsk->icsk_ack.quick)
-		icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
+		icsk->icsk_ack.quick = quickacks;
 }
 
-static void tcp_enter_quickack_mode(struct sock *sk)
+static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
-	tcp_incr_quickack(sk);
+
+	tcp_incr_quickack(sk, max_quickacks);
 	icsk->icsk_ack.pingpong = 0;
 	icsk->icsk_ack.ato = TCP_ATO_MIN;
 }
@@ -261,7 +263,7 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
 		 * it is probably a retransmit.
 		 */
 		if (tp->ecn_flags & TCP_ECN_SEEN)
-			tcp_enter_quickack_mode((struct sock *)tp);
+			tcp_enter_quickack_mode((struct sock *)tp, TCP_MAX_QUICKACKS);
 		break;
 	case INET_ECN_CE:
 		if (tcp_ca_needs_ecn((struct sock *)tp))
@@ -269,7 +271,7 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
 
 		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
 			/* Better not delay acks, sender can have a very low cwnd */
-			tcp_enter_quickack_mode((struct sock *)tp);
+			tcp_enter_quickack_mode((struct sock *)tp, TCP_MAX_QUICKACKS);
 			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
 		}
 		tp->ecn_flags |= TCP_ECN_SEEN;
@@ -686,7 +688,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
 		/* The _first_ data packet received, initialize
 		 * delayed ACK engine.
 		 */
-		tcp_incr_quickack(sk);
+		tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
 		icsk->icsk_ack.ato = TCP_ATO_MIN;
 	} else {
 		int m = now - icsk->icsk_ack.lrcvtime;
@@ -702,7 +704,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
 			/* Too long gap. Apparently sender failed to
 			 * restart window, so that we send ACKs quickly.
 			 */
-			tcp_incr_quickack(sk);
+			tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
 			sk_mem_reclaim(sk);
 		}
 	}
@@ -4179,7 +4181,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
 	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
 	    before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
-		tcp_enter_quickack_mode(sk);
+		tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
 
 		if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
 			u32 end_seq = TCP_SKB_CB(skb)->end_seq;
@@ -4706,7 +4708,7 @@ queue_and_out:
 		tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
 
 out_of_window:
-		tcp_enter_quickack_mode(sk);
+		tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
 		inet_csk_schedule_ack(sk);
 drop:
 		tcp_drop(sk, skb);
@@ -5790,7 +5792,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 			 * to stand against the temptation 8)     --ANK
 			 */
 			inet_csk_schedule_ack(sk);
-			tcp_enter_quickack_mode(sk);
+			tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
 						  TCP_DELACK_MAX, TCP_RTO_MAX);
 
-- 
cgit v1.2.3-59-g8ed1b


From 522040ea5fdd1c33bbf75e1d7c7c0422b96a94ef Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 21 May 2018 15:08:57 -0700
Subject: tcp: do not aggressively quick ack after ECN events

ECN signals currently forces TCP to enter quickack mode for
up to 16 (TCP_MAX_QUICKACKS) following incoming packets.

We believe this is not needed, and only sending one immediate ack
for the current packet should be enough.

This should reduce the extra load noticed in DCTCP environments,
after congestion events.

This is part 2 of our effort to reduce pure ACK packets.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2e970e9f4e09..1191cac72109 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -263,7 +263,7 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
 		 * it is probably a retransmit.
 		 */
 		if (tp->ecn_flags & TCP_ECN_SEEN)
-			tcp_enter_quickack_mode((struct sock *)tp, TCP_MAX_QUICKACKS);
+			tcp_enter_quickack_mode((struct sock *)tp, 1);
 		break;
 	case INET_ECN_CE:
 		if (tcp_ca_needs_ecn((struct sock *)tp))
@@ -271,7 +271,7 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
 
 		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
 			/* Better not delay acks, sender can have a very low cwnd */
-			tcp_enter_quickack_mode((struct sock *)tp, TCP_MAX_QUICKACKS);
+			tcp_enter_quickack_mode((struct sock *)tp, 1);
 			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
 		}
 		tp->ecn_flags |= TCP_ECN_SEEN;
-- 
cgit v1.2.3-59-g8ed1b


From dd006921d67f4a96f3d1fa763aad4d5dcd86959b Mon Sep 17 00:00:00 2001
From: Sudarsana Reddy Kalluru <sudarsana.kalluru@cavium.com>
Date: Tue, 22 May 2018 00:28:37 -0700
Subject: qed: Add MFW interfaces for TLV request support.

The patch adds required management firmware (MFW) interfaces such as
mailbox commands, TLV types etc.

Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_hsi.h | 231 ++++++++++++++++++++++++++++++
 1 file changed, 231 insertions(+)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index b5f70eff0182..8e1e6e1eb40e 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -11863,6 +11863,8 @@ struct public_global {
 	u32 running_bundle_id;
 	s32 external_temperature;
 	u32 mdump_reason;
+	u32 data_ptr;
+	u32 data_size;
 };
 
 struct fw_flr_mb {
@@ -12322,6 +12324,7 @@ struct public_drv_mb {
 #define DRV_MSG_CODE_BIST_TEST			0x001e0000
 #define DRV_MSG_CODE_SET_LED_MODE		0x00200000
 #define DRV_MSG_CODE_RESOURCE_CMD	0x00230000
+#define DRV_MSG_CODE_GET_TLV_DONE		0x002f0000
 
 #define RESOURCE_CMD_REQ_RESC_MASK		0x0000001F
 #define RESOURCE_CMD_REQ_RESC_SHIFT		0
@@ -12523,6 +12526,7 @@ enum MFW_DRV_MSG_TYPE {
 	MFW_DRV_MSG_TRANSCEIVER_STATE_CHANGE,
 	MFW_DRV_MSG_BW_UPDATE11,
 	MFW_DRV_MSG_OEM_CFG_UPDATE,
+	MFW_DRV_MSG_GET_TLV_REQ,
 	MFW_DRV_MSG_MAX
 };
 
@@ -12558,6 +12562,233 @@ struct mcp_public_data {
 	struct public_func func[MCP_GLOB_FUNC_MAX];
 };
 
+/* OCBB definitions */
+enum tlvs {
+	/* Category 1: Device Properties */
+	DRV_TLV_CLP_STR,
+	DRV_TLV_CLP_STR_CTD,
+	/* Category 6: Device Configuration */
+	DRV_TLV_SCSI_TO,
+	DRV_TLV_R_T_TOV,
+	DRV_TLV_R_A_TOV,
+	DRV_TLV_E_D_TOV,
+	DRV_TLV_CR_TOV,
+	DRV_TLV_BOOT_TYPE,
+	/* Category 8: Port Configuration */
+	DRV_TLV_NPIV_ENABLED,
+	/* Category 10: Function Configuration */
+	DRV_TLV_FEATURE_FLAGS,
+	DRV_TLV_LOCAL_ADMIN_ADDR,
+	DRV_TLV_ADDITIONAL_MAC_ADDR_1,
+	DRV_TLV_ADDITIONAL_MAC_ADDR_2,
+	DRV_TLV_LSO_MAX_OFFLOAD_SIZE,
+	DRV_TLV_LSO_MIN_SEGMENT_COUNT,
+	DRV_TLV_PROMISCUOUS_MODE,
+	DRV_TLV_TX_DESCRIPTORS_QUEUE_SIZE,
+	DRV_TLV_RX_DESCRIPTORS_QUEUE_SIZE,
+	DRV_TLV_NUM_OF_NET_QUEUE_VMQ_CFG,
+	DRV_TLV_FLEX_NIC_OUTER_VLAN_ID,
+	DRV_TLV_OS_DRIVER_STATES,
+	DRV_TLV_PXE_BOOT_PROGRESS,
+	/* Category 12: FC/FCoE Configuration */
+	DRV_TLV_NPIV_STATE,
+	DRV_TLV_NUM_OF_NPIV_IDS,
+	DRV_TLV_SWITCH_NAME,
+	DRV_TLV_SWITCH_PORT_NUM,
+	DRV_TLV_SWITCH_PORT_ID,
+	DRV_TLV_VENDOR_NAME,
+	DRV_TLV_SWITCH_MODEL,
+	DRV_TLV_SWITCH_FW_VER,
+	DRV_TLV_QOS_PRIORITY_PER_802_1P,
+	DRV_TLV_PORT_ALIAS,
+	DRV_TLV_PORT_STATE,
+	DRV_TLV_FIP_TX_DESCRIPTORS_QUEUE_SIZE,
+	DRV_TLV_FCOE_RX_DESCRIPTORS_QUEUE_SIZE,
+	DRV_TLV_LINK_FAILURE_COUNT,
+	DRV_TLV_FCOE_BOOT_PROGRESS,
+	/* Category 13: iSCSI Configuration */
+	DRV_TLV_TARGET_LLMNR_ENABLED,
+	DRV_TLV_HEADER_DIGEST_FLAG_ENABLED,
+	DRV_TLV_DATA_DIGEST_FLAG_ENABLED,
+	DRV_TLV_AUTHENTICATION_METHOD,
+	DRV_TLV_ISCSI_BOOT_TARGET_PORTAL,
+	DRV_TLV_MAX_FRAME_SIZE,
+	DRV_TLV_PDU_TX_DESCRIPTORS_QUEUE_SIZE,
+	DRV_TLV_PDU_RX_DESCRIPTORS_QUEUE_SIZE,
+	DRV_TLV_ISCSI_BOOT_PROGRESS,
+	/* Category 20: Device Data */
+	DRV_TLV_PCIE_BUS_RX_UTILIZATION,
+	DRV_TLV_PCIE_BUS_TX_UTILIZATION,
+	DRV_TLV_DEVICE_CPU_CORES_UTILIZATION,
+	DRV_TLV_LAST_VALID_DCC_TLV_RECEIVED,
+	DRV_TLV_NCSI_RX_BYTES_RECEIVED,
+	DRV_TLV_NCSI_TX_BYTES_SENT,
+	/* Category 22: Base Port Data */
+	DRV_TLV_RX_DISCARDS,
+	DRV_TLV_RX_ERRORS,
+	DRV_TLV_TX_ERRORS,
+	DRV_TLV_TX_DISCARDS,
+	DRV_TLV_RX_FRAMES_RECEIVED,
+	DRV_TLV_TX_FRAMES_SENT,
+	/* Category 23: FC/FCoE Port Data */
+	DRV_TLV_RX_BROADCAST_PACKETS,
+	DRV_TLV_TX_BROADCAST_PACKETS,
+	/* Category 28: Base Function Data */
+	DRV_TLV_NUM_OFFLOADED_CONNECTIONS_TCP_IPV4,
+	DRV_TLV_NUM_OFFLOADED_CONNECTIONS_TCP_IPV6,
+	DRV_TLV_TX_DESCRIPTOR_QUEUE_AVG_DEPTH,
+	DRV_TLV_RX_DESCRIPTORS_QUEUE_AVG_DEPTH,
+	DRV_TLV_PF_RX_FRAMES_RECEIVED,
+	DRV_TLV_RX_BYTES_RECEIVED,
+	DRV_TLV_PF_TX_FRAMES_SENT,
+	DRV_TLV_TX_BYTES_SENT,
+	DRV_TLV_IOV_OFFLOAD,
+	DRV_TLV_PCI_ERRORS_CAP_ID,
+	DRV_TLV_UNCORRECTABLE_ERROR_STATUS,
+	DRV_TLV_UNCORRECTABLE_ERROR_MASK,
+	DRV_TLV_CORRECTABLE_ERROR_STATUS,
+	DRV_TLV_CORRECTABLE_ERROR_MASK,
+	DRV_TLV_PCI_ERRORS_AECC_REGISTER,
+	DRV_TLV_TX_QUEUES_EMPTY,
+	DRV_TLV_RX_QUEUES_EMPTY,
+	DRV_TLV_TX_QUEUES_FULL,
+	DRV_TLV_RX_QUEUES_FULL,
+	/* Category 29: FC/FCoE Function Data */
+	DRV_TLV_FCOE_TX_DESCRIPTOR_QUEUE_AVG_DEPTH,
+	DRV_TLV_FCOE_RX_DESCRIPTORS_QUEUE_AVG_DEPTH,
+	DRV_TLV_FCOE_RX_FRAMES_RECEIVED,
+	DRV_TLV_FCOE_RX_BYTES_RECEIVED,
+	DRV_TLV_FCOE_TX_FRAMES_SENT,
+	DRV_TLV_FCOE_TX_BYTES_SENT,
+	DRV_TLV_CRC_ERROR_COUNT,
+	DRV_TLV_CRC_ERROR_1_RECEIVED_SOURCE_FC_ID,
+	DRV_TLV_CRC_ERROR_1_TIMESTAMP,
+	DRV_TLV_CRC_ERROR_2_RECEIVED_SOURCE_FC_ID,
+	DRV_TLV_CRC_ERROR_2_TIMESTAMP,
+	DRV_TLV_CRC_ERROR_3_RECEIVED_SOURCE_FC_ID,
+	DRV_TLV_CRC_ERROR_3_TIMESTAMP,
+	DRV_TLV_CRC_ERROR_4_RECEIVED_SOURCE_FC_ID,
+	DRV_TLV_CRC_ERROR_4_TIMESTAMP,
+	DRV_TLV_CRC_ERROR_5_RECEIVED_SOURCE_FC_ID,
+	DRV_TLV_CRC_ERROR_5_TIMESTAMP,
+	DRV_TLV_LOSS_OF_SYNC_ERROR_COUNT,
+	DRV_TLV_LOSS_OF_SIGNAL_ERRORS,
+	DRV_TLV_PRIMITIVE_SEQUENCE_PROTOCOL_ERROR_COUNT,
+	DRV_TLV_DISPARITY_ERROR_COUNT,
+	DRV_TLV_CODE_VIOLATION_ERROR_COUNT,
+	DRV_TLV_LAST_FLOGI_ISSUED_COMMON_PARAMETERS_WORD_1,
+	DRV_TLV_LAST_FLOGI_ISSUED_COMMON_PARAMETERS_WORD_2,
+	DRV_TLV_LAST_FLOGI_ISSUED_COMMON_PARAMETERS_WORD_3,
+	DRV_TLV_LAST_FLOGI_ISSUED_COMMON_PARAMETERS_WORD_4,
+	DRV_TLV_LAST_FLOGI_TIMESTAMP,
+	DRV_TLV_LAST_FLOGI_ACC_COMMON_PARAMETERS_WORD_1,
+	DRV_TLV_LAST_FLOGI_ACC_COMMON_PARAMETERS_WORD_2,
+	DRV_TLV_LAST_FLOGI_ACC_COMMON_PARAMETERS_WORD_3,
+	DRV_TLV_LAST_FLOGI_ACC_COMMON_PARAMETERS_WORD_4,
+	DRV_TLV_LAST_FLOGI_ACC_TIMESTAMP,
+	DRV_TLV_LAST_FLOGI_RJT,
+	DRV_TLV_LAST_FLOGI_RJT_TIMESTAMP,
+	DRV_TLV_FDISCS_SENT_COUNT,
+	DRV_TLV_FDISC_ACCS_RECEIVED,
+	DRV_TLV_FDISC_RJTS_RECEIVED,
+	DRV_TLV_PLOGI_SENT_COUNT,
+	DRV_TLV_PLOGI_ACCS_RECEIVED,
+	DRV_TLV_PLOGI_RJTS_RECEIVED,
+	DRV_TLV_PLOGI_1_SENT_DESTINATION_FC_ID,
+	DRV_TLV_PLOGI_1_TIMESTAMP,
+	DRV_TLV_PLOGI_2_SENT_DESTINATION_FC_ID,
+	DRV_TLV_PLOGI_2_TIMESTAMP,
+	DRV_TLV_PLOGI_3_SENT_DESTINATION_FC_ID,
+	DRV_TLV_PLOGI_3_TIMESTAMP,
+	DRV_TLV_PLOGI_4_SENT_DESTINATION_FC_ID,
+	DRV_TLV_PLOGI_4_TIMESTAMP,
+	DRV_TLV_PLOGI_5_SENT_DESTINATION_FC_ID,
+	DRV_TLV_PLOGI_5_TIMESTAMP,
+	DRV_TLV_PLOGI_1_ACC_RECEIVED_SOURCE_FC_ID,
+	DRV_TLV_PLOGI_1_ACC_TIMESTAMP,
+	DRV_TLV_PLOGI_2_ACC_RECEIVED_SOURCE_FC_ID,
+	DRV_TLV_PLOGI_2_ACC_TIMESTAMP,
+	DRV_TLV_PLOGI_3_ACC_RECEIVED_SOURCE_FC_ID,
+	DRV_TLV_PLOGI_3_ACC_TIMESTAMP,
+	DRV_TLV_PLOGI_4_ACC_RECEIVED_SOURCE_FC_ID,
+	DRV_TLV_PLOGI_4_ACC_TIMESTAMP,
+	DRV_TLV_PLOGI_5_ACC_RECEIVED_SOURCE_FC_ID,
+	DRV_TLV_PLOGI_5_ACC_TIMESTAMP,
+	DRV_TLV_LOGOS_ISSUED,
+	DRV_TLV_LOGO_ACCS_RECEIVED,
+	DRV_TLV_LOGO_RJTS_RECEIVED,
+	DRV_TLV_LOGO_1_RECEIVED_SOURCE_FC_ID,
+	DRV_TLV_LOGO_1_TIMESTAMP,
+	DRV_TLV_LOGO_2_RECEIVED_SOURCE_FC_ID,
+	DRV_TLV_LOGO_2_TIMESTAMP,
+	DRV_TLV_LOGO_3_RECEIVED_SOURCE_FC_ID,
+	DRV_TLV_LOGO_3_TIMESTAMP,
+	DRV_TLV_LOGO_4_RECEIVED_SOURCE_FC_ID,
+	DRV_TLV_LOGO_4_TIMESTAMP,
+	DRV_TLV_LOGO_5_RECEIVED_SOURCE_FC_ID,
+	DRV_TLV_LOGO_5_TIMESTAMP,
+	DRV_TLV_LOGOS_RECEIVED,
+	DRV_TLV_ACCS_ISSUED,
+	DRV_TLV_PRLIS_ISSUED,
+	DRV_TLV_ACCS_RECEIVED,
+	DRV_TLV_ABTS_SENT_COUNT,
+	DRV_TLV_ABTS_ACCS_RECEIVED,
+	DRV_TLV_ABTS_RJTS_RECEIVED,
+	DRV_TLV_ABTS_1_SENT_DESTINATION_FC_ID,
+	DRV_TLV_ABTS_1_TIMESTAMP,
+	DRV_TLV_ABTS_2_SENT_DESTINATION_FC_ID,
+	DRV_TLV_ABTS_2_TIMESTAMP,
+	DRV_TLV_ABTS_3_SENT_DESTINATION_FC_ID,
+	DRV_TLV_ABTS_3_TIMESTAMP,
+	DRV_TLV_ABTS_4_SENT_DESTINATION_FC_ID,
+	DRV_TLV_ABTS_4_TIMESTAMP,
+	DRV_TLV_ABTS_5_SENT_DESTINATION_FC_ID,
+	DRV_TLV_ABTS_5_TIMESTAMP,
+	DRV_TLV_RSCNS_RECEIVED,
+	DRV_TLV_LAST_RSCN_RECEIVED_N_PORT_1,
+	DRV_TLV_LAST_RSCN_RECEIVED_N_PORT_2,
+	DRV_TLV_LAST_RSCN_RECEIVED_N_PORT_3,
+	DRV_TLV_LAST_RSCN_RECEIVED_N_PORT_4,
+	DRV_TLV_LUN_RESETS_ISSUED,
+	DRV_TLV_ABORT_TASK_SETS_ISSUED,
+	DRV_TLV_TPRLOS_SENT,
+	DRV_TLV_NOS_SENT_COUNT,
+	DRV_TLV_NOS_RECEIVED_COUNT,
+	DRV_TLV_OLS_COUNT,
+	DRV_TLV_LR_COUNT,
+	DRV_TLV_LRR_COUNT,
+	DRV_TLV_LIP_SENT_COUNT,
+	DRV_TLV_LIP_RECEIVED_COUNT,
+	DRV_TLV_EOFA_COUNT,
+	DRV_TLV_EOFNI_COUNT,
+	DRV_TLV_SCSI_STATUS_CHECK_CONDITION_COUNT,
+	DRV_TLV_SCSI_STATUS_CONDITION_MET_COUNT,
+	DRV_TLV_SCSI_STATUS_BUSY_COUNT,
+	DRV_TLV_SCSI_STATUS_INTERMEDIATE_COUNT,
+	DRV_TLV_SCSI_STATUS_INTERMEDIATE_CONDITION_MET_COUNT,
+	DRV_TLV_SCSI_STATUS_RESERVATION_CONFLICT_COUNT,
+	DRV_TLV_SCSI_STATUS_TASK_SET_FULL_COUNT,
+	DRV_TLV_SCSI_STATUS_ACA_ACTIVE_COUNT,
+	DRV_TLV_SCSI_STATUS_TASK_ABORTED_COUNT,
+	DRV_TLV_SCSI_CHECK_CONDITION_1_RECEIVED_SK_ASC_ASCQ,
+	DRV_TLV_SCSI_CHECK_1_TIMESTAMP,
+	DRV_TLV_SCSI_CHECK_CONDITION_2_RECEIVED_SK_ASC_ASCQ,
+	DRV_TLV_SCSI_CHECK_2_TIMESTAMP,
+	DRV_TLV_SCSI_CHECK_CONDITION_3_RECEIVED_SK_ASC_ASCQ,
+	DRV_TLV_SCSI_CHECK_3_TIMESTAMP,
+	DRV_TLV_SCSI_CHECK_CONDITION_4_RECEIVED_SK_ASC_ASCQ,
+	DRV_TLV_SCSI_CHECK_4_TIMESTAMP,
+	DRV_TLV_SCSI_CHECK_CONDITION_5_RECEIVED_SK_ASC_ASCQ,
+	DRV_TLV_SCSI_CHECK_5_TIMESTAMP,
+	/* Category 30: iSCSI Function Data */
+	DRV_TLV_PDU_TX_DESCRIPTOR_QUEUE_AVG_DEPTH,
+	DRV_TLV_PDU_RX_DESCRIPTORS_QUEUE_AVG_DEPTH,
+	DRV_TLV_ISCSI_PDU_RX_FRAMES_RECEIVED,
+	DRV_TLV_ISCSI_PDU_RX_BYTES_RECEIVED,
+	DRV_TLV_ISCSI_PDU_TX_FRAMES_SENT,
+	DRV_TLV_ISCSI_PDU_TX_BYTES_SENT
+};
+
 struct nvm_cfg_mac_address {
 	u32 mac_addr_hi;
 #define NVM_CFG_MAC_ADDRESS_HI_MASK	0x0000FFFF
-- 
cgit v1.2.3-59-g8ed1b


From 2528c389936efbbece25088426fe7c3c91ff355f Mon Sep 17 00:00:00 2001
From: Sudarsana Reddy Kalluru <sudarsana.kalluru@cavium.com>
Date: Tue, 22 May 2018 00:28:38 -0700
Subject: qed: Add support for tlv request processing.

The patch adds driver support for processing TLV requests/repsonses
from the mfw and upper driver layers respectively. The implementation
reads the requested TLVs from the shared memory, requests the values
from upper layer drivers, populates this info (TLVs) shared memory and
notifies MFW about the TLV values.

Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/Makefile      |   2 +-
 drivers/net/ethernet/qlogic/qed/qed.h         |   5 +
 drivers/net/ethernet/qlogic/qed/qed_main.c    |   6 +
 drivers/net/ethernet/qlogic/qed/qed_mcp.h     |  53 ++++
 drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c | 378 ++++++++++++++++++++++++++
 include/linux/qed/qed_if.h                    |  37 +++
 6 files changed, 480 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c

diff --git a/drivers/net/ethernet/qlogic/qed/Makefile b/drivers/net/ethernet/qlogic/qed/Makefile
index c70cf2ad81c0..a0acb94d65f0 100644
--- a/drivers/net/ethernet/qlogic/qed/Makefile
+++ b/drivers/net/ethernet/qlogic/qed/Makefile
@@ -3,7 +3,7 @@ obj-$(CONFIG_QED) := qed.o
 
 qed-y := qed_cxt.o qed_dev.o qed_hw.o qed_init_fw_funcs.o qed_init_ops.o \
 	 qed_int.o qed_main.o qed_mcp.o qed_sp_commands.o qed_spq.o qed_l2.o \
-	 qed_selftest.o qed_dcbx.o qed_debug.o qed_ptp.o
+	 qed_selftest.o qed_dcbx.o qed_debug.o qed_ptp.o qed_mng_tlv.o
 qed-$(CONFIG_QED_SRIOV) += qed_sriov.o qed_vf.o
 qed-$(CONFIG_QED_LL2) += qed_ll2.o
 qed-$(CONFIG_QED_RDMA) += qed_roce.o qed_rdma.o qed_iwarp.o
diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index adcff495466e..dfdbe529e2bf 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -92,6 +92,8 @@ struct qed_eth_cb_ops;
 struct qed_dev_info;
 union qed_mcp_protocol_stats;
 enum qed_mcp_protocol_type;
+enum qed_mfw_tlv_type;
+union qed_mfw_tlv_data;
 
 /* helpers */
 #define QED_MFW_GET_FIELD(name, field) \
@@ -907,4 +909,7 @@ void qed_get_protocol_stats(struct qed_dev *cdev,
 int qed_slowpath_irq_req(struct qed_hwfn *hwfn);
 void qed_slowpath_irq_sync(struct qed_hwfn *p_hwfn);
 
+int qed_mfw_fill_tlv_data(struct qed_hwfn *hwfn,
+			  enum qed_mfw_tlv_type type,
+			  union qed_mfw_tlv_data *tlv_data);
 #endif /* _QED_H */
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 9feed3b79cd4..cbf0ea97fc8e 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -2088,3 +2088,9 @@ void qed_get_protocol_stats(struct qed_dev *cdev,
 		return;
 	}
 }
+
+int qed_mfw_fill_tlv_data(struct qed_hwfn *hwfn, enum qed_mfw_tlv_type type,
+			  union qed_mfw_tlv_data *tlv_buf)
+{
+	return -EINVAL;
+}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
index 250579ba632b..591877fca67a 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
@@ -213,6 +213,40 @@ enum qed_ov_wol {
 	QED_OV_WOL_ENABLED
 };
 
+enum qed_mfw_tlv_type {
+	QED_MFW_TLV_GENERIC = 0x1,	/* Core driver TLVs */
+	QED_MFW_TLV_ETH = 0x2,		/* L2 driver TLVs */
+	QED_MFW_TLV_MAX = 0x4,
+};
+
+struct qed_mfw_tlv_generic {
+#define QED_MFW_TLV_FLAGS_SIZE	2
+	struct {
+		u8 ipv4_csum_offload;
+		u8 lso_supported;
+		bool b_set;
+	} flags;
+
+#define QED_MFW_TLV_MAC_COUNT 3
+	/* First entry for primary MAC, 2 secondary MACs possible */
+	u8 mac[QED_MFW_TLV_MAC_COUNT][6];
+	bool mac_set[QED_MFW_TLV_MAC_COUNT];
+
+	u64 rx_frames;
+	bool rx_frames_set;
+	u64 rx_bytes;
+	bool rx_bytes_set;
+	u64 tx_frames;
+	bool tx_frames_set;
+	u64 tx_bytes;
+	bool tx_bytes_set;
+};
+
+union qed_mfw_tlv_data {
+	struct qed_mfw_tlv_generic generic;
+	struct qed_mfw_tlv_eth eth;
+};
+
 /**
  * @brief - returns the link params of the hw function
  *
@@ -561,6 +595,17 @@ int qed_mcp_bist_nvm_get_image_att(struct qed_hwfn *p_hwfn,
 				   struct bist_nvm_image_att *p_image_att,
 				   u32 image_index);
 
+/**
+ * @brief - Processes the TLV request from MFW i.e., get the required TLV info
+ *          from the qed client and send it to the MFW.
+ *
+ * @param p_hwfn
+ * @param p_ptt
+ *
+ * @param return 0 upon success.
+ */
+int qed_mfw_process_tlv_req(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt);
+
 /* Using hwfn number (and not pf_num) is required since in CMT mode,
  * same pf_num may be used by two different hwfn
  * TODO - this shouldn't really be in .h file, but until all fields
@@ -621,6 +666,14 @@ struct qed_mcp_mb_params {
 	u32			mcp_param;
 };
 
+struct qed_drv_tlv_hdr {
+	u8 tlv_type;
+	u8 tlv_length;	/* In dwords - not including this header */
+	u8 tlv_reserved;
+#define QED_DRV_TLV_FLAGS_CHANGED 0x01
+	u8 tlv_flags;
+};
+
 /**
  * @brief Initialize the interface with the MCP
  *
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c b/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c
new file mode 100644
index 000000000000..d58a7146cce5
--- /dev/null
+++ b/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c
@@ -0,0 +1,378 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/types.h>
+#include <asm/byteorder.h>
+#include <linux/bug.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
+#include "qed.h"
+#include "qed_hw.h"
+#include "qed_mcp.h"
+#include "qed_reg_addr.h"
+
+#define TLV_TYPE(p)     (p[0])
+#define TLV_LENGTH(p)   (p[1])
+#define TLV_FLAGS(p)    (p[3])
+
+#define QED_TLV_DATA_MAX (14)
+struct qed_tlv_parsed_buf {
+	/* To be filled with the address to set in Value field */
+	void *p_val;
+
+	/* To be used internally in case the value has to be modified */
+	u8 data[QED_TLV_DATA_MAX];
+};
+
+static int qed_mfw_get_tlv_group(u8 tlv_type, u8 *tlv_group)
+{
+	switch (tlv_type) {
+	case DRV_TLV_FEATURE_FLAGS:
+	case DRV_TLV_LOCAL_ADMIN_ADDR:
+	case DRV_TLV_ADDITIONAL_MAC_ADDR_1:
+	case DRV_TLV_ADDITIONAL_MAC_ADDR_2:
+	case DRV_TLV_OS_DRIVER_STATES:
+	case DRV_TLV_PXE_BOOT_PROGRESS:
+	case DRV_TLV_RX_FRAMES_RECEIVED:
+	case DRV_TLV_RX_BYTES_RECEIVED:
+	case DRV_TLV_TX_FRAMES_SENT:
+	case DRV_TLV_TX_BYTES_SENT:
+	case DRV_TLV_NPIV_ENABLED:
+	case DRV_TLV_PCIE_BUS_RX_UTILIZATION:
+	case DRV_TLV_PCIE_BUS_TX_UTILIZATION:
+	case DRV_TLV_DEVICE_CPU_CORES_UTILIZATION:
+	case DRV_TLV_LAST_VALID_DCC_TLV_RECEIVED:
+	case DRV_TLV_NCSI_RX_BYTES_RECEIVED:
+	case DRV_TLV_NCSI_TX_BYTES_SENT:
+		*tlv_group |= QED_MFW_TLV_GENERIC;
+		break;
+	case DRV_TLV_LSO_MAX_OFFLOAD_SIZE:
+	case DRV_TLV_LSO_MIN_SEGMENT_COUNT:
+	case DRV_TLV_PROMISCUOUS_MODE:
+	case DRV_TLV_TX_DESCRIPTORS_QUEUE_SIZE:
+	case DRV_TLV_RX_DESCRIPTORS_QUEUE_SIZE:
+	case DRV_TLV_NUM_OF_NET_QUEUE_VMQ_CFG:
+	case DRV_TLV_NUM_OFFLOADED_CONNECTIONS_TCP_IPV4:
+	case DRV_TLV_NUM_OFFLOADED_CONNECTIONS_TCP_IPV6:
+	case DRV_TLV_TX_DESCRIPTOR_QUEUE_AVG_DEPTH:
+	case DRV_TLV_RX_DESCRIPTORS_QUEUE_AVG_DEPTH:
+	case DRV_TLV_IOV_OFFLOAD:
+	case DRV_TLV_TX_QUEUES_EMPTY:
+	case DRV_TLV_RX_QUEUES_EMPTY:
+	case DRV_TLV_TX_QUEUES_FULL:
+	case DRV_TLV_RX_QUEUES_FULL:
+		*tlv_group |= QED_MFW_TLV_ETH;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Returns size of the data buffer or, -1 in case TLV data is not available. */
+static int
+qed_mfw_get_gen_tlv_value(struct qed_drv_tlv_hdr *p_tlv,
+			  struct qed_mfw_tlv_generic *p_drv_buf,
+			  struct qed_tlv_parsed_buf *p_buf)
+{
+	switch (p_tlv->tlv_type) {
+	case DRV_TLV_FEATURE_FLAGS:
+		if (p_drv_buf->flags.b_set) {
+			memset(p_buf->data, 0, sizeof(u8) * QED_TLV_DATA_MAX);
+			p_buf->data[0] = p_drv_buf->flags.ipv4_csum_offload ?
+			    1 : 0;
+			p_buf->data[0] |= (p_drv_buf->flags.lso_supported ?
+					   1 : 0) << 1;
+			p_buf->p_val = p_buf->data;
+			return QED_MFW_TLV_FLAGS_SIZE;
+		}
+		break;
+
+	case DRV_TLV_LOCAL_ADMIN_ADDR:
+	case DRV_TLV_ADDITIONAL_MAC_ADDR_1:
+	case DRV_TLV_ADDITIONAL_MAC_ADDR_2:
+		{
+			int idx = p_tlv->tlv_type - DRV_TLV_LOCAL_ADMIN_ADDR;
+
+			if (p_drv_buf->mac_set[idx]) {
+				p_buf->p_val = p_drv_buf->mac[idx];
+				return ETH_ALEN;
+			}
+			break;
+		}
+
+	case DRV_TLV_RX_FRAMES_RECEIVED:
+		if (p_drv_buf->rx_frames_set) {
+			p_buf->p_val = &p_drv_buf->rx_frames;
+			return sizeof(p_drv_buf->rx_frames);
+		}
+		break;
+	case DRV_TLV_RX_BYTES_RECEIVED:
+		if (p_drv_buf->rx_bytes_set) {
+			p_buf->p_val = &p_drv_buf->rx_bytes;
+			return sizeof(p_drv_buf->rx_bytes);
+		}
+		break;
+	case DRV_TLV_TX_FRAMES_SENT:
+		if (p_drv_buf->tx_frames_set) {
+			p_buf->p_val = &p_drv_buf->tx_frames;
+			return sizeof(p_drv_buf->tx_frames);
+		}
+		break;
+	case DRV_TLV_TX_BYTES_SENT:
+		if (p_drv_buf->tx_bytes_set) {
+			p_buf->p_val = &p_drv_buf->tx_bytes;
+			return sizeof(p_drv_buf->tx_bytes);
+		}
+		break;
+	default:
+		break;
+	}
+
+	return -1;
+}
+
+static int
+qed_mfw_get_eth_tlv_value(struct qed_drv_tlv_hdr *p_tlv,
+			  struct qed_mfw_tlv_eth *p_drv_buf,
+			  struct qed_tlv_parsed_buf *p_buf)
+{
+	switch (p_tlv->tlv_type) {
+	case DRV_TLV_LSO_MAX_OFFLOAD_SIZE:
+		if (p_drv_buf->lso_maxoff_size_set) {
+			p_buf->p_val = &p_drv_buf->lso_maxoff_size;
+			return sizeof(p_drv_buf->lso_maxoff_size);
+		}
+		break;
+	case DRV_TLV_LSO_MIN_SEGMENT_COUNT:
+		if (p_drv_buf->lso_minseg_size_set) {
+			p_buf->p_val = &p_drv_buf->lso_minseg_size;
+			return sizeof(p_drv_buf->lso_minseg_size);
+		}
+		break;
+	case DRV_TLV_PROMISCUOUS_MODE:
+		if (p_drv_buf->prom_mode_set) {
+			p_buf->p_val = &p_drv_buf->prom_mode;
+			return sizeof(p_drv_buf->prom_mode);
+		}
+		break;
+	case DRV_TLV_TX_DESCRIPTORS_QUEUE_SIZE:
+		if (p_drv_buf->tx_descr_size_set) {
+			p_buf->p_val = &p_drv_buf->tx_descr_size;
+			return sizeof(p_drv_buf->tx_descr_size);
+		}
+		break;
+	case DRV_TLV_RX_DESCRIPTORS_QUEUE_SIZE:
+		if (p_drv_buf->rx_descr_size_set) {
+			p_buf->p_val = &p_drv_buf->rx_descr_size;
+			return sizeof(p_drv_buf->rx_descr_size);
+		}
+		break;
+	case DRV_TLV_NUM_OF_NET_QUEUE_VMQ_CFG:
+		if (p_drv_buf->netq_count_set) {
+			p_buf->p_val = &p_drv_buf->netq_count;
+			return sizeof(p_drv_buf->netq_count);
+		}
+		break;
+	case DRV_TLV_NUM_OFFLOADED_CONNECTIONS_TCP_IPV4:
+		if (p_drv_buf->tcp4_offloads_set) {
+			p_buf->p_val = &p_drv_buf->tcp4_offloads;
+			return sizeof(p_drv_buf->tcp4_offloads);
+		}
+		break;
+	case DRV_TLV_NUM_OFFLOADED_CONNECTIONS_TCP_IPV6:
+		if (p_drv_buf->tcp6_offloads_set) {
+			p_buf->p_val = &p_drv_buf->tcp6_offloads;
+			return sizeof(p_drv_buf->tcp6_offloads);
+		}
+		break;
+	case DRV_TLV_TX_DESCRIPTOR_QUEUE_AVG_DEPTH:
+		if (p_drv_buf->tx_descr_qdepth_set) {
+			p_buf->p_val = &p_drv_buf->tx_descr_qdepth;
+			return sizeof(p_drv_buf->tx_descr_qdepth);
+		}
+		break;
+	case DRV_TLV_RX_DESCRIPTORS_QUEUE_AVG_DEPTH:
+		if (p_drv_buf->rx_descr_qdepth_set) {
+			p_buf->p_val = &p_drv_buf->rx_descr_qdepth;
+			return sizeof(p_drv_buf->rx_descr_qdepth);
+		}
+		break;
+	case DRV_TLV_IOV_OFFLOAD:
+		if (p_drv_buf->iov_offload_set) {
+			p_buf->p_val = &p_drv_buf->iov_offload;
+			return sizeof(p_drv_buf->iov_offload);
+		}
+		break;
+	case DRV_TLV_TX_QUEUES_EMPTY:
+		if (p_drv_buf->txqs_empty_set) {
+			p_buf->p_val = &p_drv_buf->txqs_empty;
+			return sizeof(p_drv_buf->txqs_empty);
+		}
+		break;
+	case DRV_TLV_RX_QUEUES_EMPTY:
+		if (p_drv_buf->rxqs_empty_set) {
+			p_buf->p_val = &p_drv_buf->rxqs_empty;
+			return sizeof(p_drv_buf->rxqs_empty);
+		}
+		break;
+	case DRV_TLV_TX_QUEUES_FULL:
+		if (p_drv_buf->num_txqs_full_set) {
+			p_buf->p_val = &p_drv_buf->num_txqs_full;
+			return sizeof(p_drv_buf->num_txqs_full);
+		}
+		break;
+	case DRV_TLV_RX_QUEUES_FULL:
+		if (p_drv_buf->num_rxqs_full_set) {
+			p_buf->p_val = &p_drv_buf->num_rxqs_full;
+			return sizeof(p_drv_buf->num_rxqs_full);
+		}
+		break;
+	default:
+		break;
+	}
+
+	return -1;
+}
+
+static int qed_mfw_update_tlvs(struct qed_hwfn *p_hwfn,
+			       u8 tlv_group, u8 *p_mfw_buf, u32 size)
+{
+	union qed_mfw_tlv_data *p_tlv_data;
+	struct qed_tlv_parsed_buf buffer;
+	struct qed_drv_tlv_hdr tlv;
+	int len = 0;
+	u32 offset;
+	u8 *p_tlv;
+
+	p_tlv_data = vzalloc(sizeof(*p_tlv_data));
+	if (!p_tlv_data)
+		return -ENOMEM;
+
+	if (qed_mfw_fill_tlv_data(p_hwfn, tlv_group, p_tlv_data)) {
+		vfree(p_tlv_data);
+		return -EINVAL;
+	}
+
+	memset(&tlv, 0, sizeof(tlv));
+	for (offset = 0; offset < size;
+	     offset += sizeof(tlv) + sizeof(u32) * tlv.tlv_length) {
+		p_tlv = &p_mfw_buf[offset];
+		tlv.tlv_type = TLV_TYPE(p_tlv);
+		tlv.tlv_length = TLV_LENGTH(p_tlv);
+		tlv.tlv_flags = TLV_FLAGS(p_tlv);
+
+		DP_VERBOSE(p_hwfn, QED_MSG_SP,
+			   "Type %d length = %d flags = 0x%x\n", tlv.tlv_type,
+			   tlv.tlv_length, tlv.tlv_flags);
+
+		if (tlv_group == QED_MFW_TLV_GENERIC)
+			len = qed_mfw_get_gen_tlv_value(&tlv,
+							&p_tlv_data->generic,
+							&buffer);
+		else if (tlv_group == QED_MFW_TLV_ETH)
+			len = qed_mfw_get_eth_tlv_value(&tlv,
+							&p_tlv_data->eth,
+							&buffer);
+
+		if (len > 0) {
+			WARN(len > 4 * tlv.tlv_length,
+			     "Incorrect MFW TLV length %d, it shouldn't be greater than %d\n",
+			     len, 4 * tlv.tlv_length);
+			len = min_t(int, len, 4 * tlv.tlv_length);
+			tlv.tlv_flags |= QED_DRV_TLV_FLAGS_CHANGED;
+			TLV_FLAGS(p_tlv) = tlv.tlv_flags;
+			memcpy(p_mfw_buf + offset + sizeof(tlv),
+			       buffer.p_val, len);
+		}
+	}
+
+	vfree(p_tlv_data);
+
+	return 0;
+}
+
+int qed_mfw_process_tlv_req(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
+{
+	u32 addr, size, offset, resp, param, val, global_offsize, global_addr;
+	u8 tlv_group = 0, id, *p_mfw_buf = NULL, *p_temp;
+	struct qed_drv_tlv_hdr tlv;
+	int rc;
+
+	addr = SECTION_OFFSIZE_ADDR(p_hwfn->mcp_info->public_base,
+				    PUBLIC_GLOBAL);
+	global_offsize = qed_rd(p_hwfn, p_ptt, addr);
+	global_addr = SECTION_ADDR(global_offsize, 0);
+	addr = global_addr + offsetof(struct public_global, data_ptr);
+	addr = qed_rd(p_hwfn, p_ptt, addr);
+	size = qed_rd(p_hwfn, p_ptt, global_addr +
+		      offsetof(struct public_global, data_size));
+
+	if (!size) {
+		DP_NOTICE(p_hwfn, "Invalid TLV req size = %d\n", size);
+		goto drv_done;
+	}
+
+	p_mfw_buf = vzalloc(size);
+	if (!p_mfw_buf) {
+		DP_NOTICE(p_hwfn, "Failed allocate memory for p_mfw_buf\n");
+		goto drv_done;
+	}
+
+	/* Read the TLV request to local buffer. MFW represents the TLV in
+	 * little endian format and mcp returns it bigendian format. Hence
+	 * driver need to convert data to little endian first and then do the
+	 * memcpy (casting) to preserve the MFW TLV format in the driver buffer.
+	 *
+	 */
+	for (offset = 0; offset < size; offset += sizeof(u32)) {
+		val = qed_rd(p_hwfn, p_ptt, addr + offset);
+		val = be32_to_cpu(val);
+		memcpy(&p_mfw_buf[offset], &val, sizeof(u32));
+	}
+
+	/* Parse the headers to enumerate the requested TLV groups */
+	for (offset = 0; offset < size;
+	     offset += sizeof(tlv) + sizeof(u32) * tlv.tlv_length) {
+		p_temp = &p_mfw_buf[offset];
+		tlv.tlv_type = TLV_TYPE(p_temp);
+		tlv.tlv_length = TLV_LENGTH(p_temp);
+		if (qed_mfw_get_tlv_group(tlv.tlv_type, &tlv_group))
+			DP_VERBOSE(p_hwfn, NETIF_MSG_DRV,
+				   "Un recognized TLV %d\n", tlv.tlv_type);
+	}
+
+	/* Sanitize the TLV groups according to personality */
+	if ((tlv_group & QED_MFW_TLV_ETH) && !QED_IS_L2_PERSONALITY(p_hwfn)) {
+		DP_VERBOSE(p_hwfn, QED_MSG_SP,
+			   "Skipping L2 TLVs for non-L2 function\n");
+		tlv_group &= ~QED_MFW_TLV_ETH;
+	}
+
+	/* Update the TLV values in the local buffer */
+	for (id = QED_MFW_TLV_GENERIC; id < QED_MFW_TLV_MAX; id <<= 1) {
+		if (tlv_group & id)
+			if (qed_mfw_update_tlvs(p_hwfn, id, p_mfw_buf, size))
+				goto drv_done;
+	}
+
+	/* Write the TLV data to shared memory. The stream of 4 bytes first need
+	 * to be mem-copied to u32 element to make it as LSB format. And then
+	 * converted to big endian as required by mcp-write.
+	 */
+	for (offset = 0; offset < size; offset += sizeof(u32)) {
+		memcpy(&val, &p_mfw_buf[offset], sizeof(u32));
+		val = cpu_to_be32(val);
+		qed_wr(p_hwfn, p_ptt, addr + offset, val);
+	}
+
+drv_done:
+	rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_GET_TLV_DONE, 0, &resp,
+			 &param);
+
+	vfree(p_mfw_buf);
+
+	return rc;
+}
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 907976fd56f7..8e4fad427ac2 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -182,6 +182,43 @@ enum qed_led_mode {
 	QED_LED_MODE_RESTORE
 };
 
+struct qed_mfw_tlv_eth {
+	u16 lso_maxoff_size;
+	bool lso_maxoff_size_set;
+	u16 lso_minseg_size;
+	bool lso_minseg_size_set;
+	u8 prom_mode;
+	bool prom_mode_set;
+	u16 tx_descr_size;
+	bool tx_descr_size_set;
+	u16 rx_descr_size;
+	bool rx_descr_size_set;
+	u16 netq_count;
+	bool netq_count_set;
+	u32 tcp4_offloads;
+	bool tcp4_offloads_set;
+	u32 tcp6_offloads;
+	bool tcp6_offloads_set;
+	u16 tx_descr_qdepth;
+	bool tx_descr_qdepth_set;
+	u16 rx_descr_qdepth;
+	bool rx_descr_qdepth_set;
+	u8 iov_offload;
+#define QED_MFW_TLV_IOV_OFFLOAD_NONE            (0)
+#define QED_MFW_TLV_IOV_OFFLOAD_MULTIQUEUE      (1)
+#define QED_MFW_TLV_IOV_OFFLOAD_VEB             (2)
+#define QED_MFW_TLV_IOV_OFFLOAD_VEPA            (3)
+	bool iov_offload_set;
+	u8 txqs_empty;
+	bool txqs_empty_set;
+	u8 rxqs_empty;
+	bool rxqs_empty_set;
+	u8 num_txqs_full;
+	bool num_txqs_full_set;
+	u8 num_rxqs_full;
+	bool num_rxqs_full_set;
+};
+
 #define DIRECT_REG_WR(reg_addr, val) writel((u32)val, \
 					    (void __iomem *)(reg_addr))
 
-- 
cgit v1.2.3-59-g8ed1b


From f240b6882211aae7155a9839dff1426e2853fe30 Mon Sep 17 00:00:00 2001
From: Sudarsana Reddy Kalluru <sudarsana.kalluru@cavium.com>
Date: Tue, 22 May 2018 00:28:39 -0700
Subject: qed: Add support for processing fcoe tlv request.

Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_mcp.h     |   4 +-
 drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c | 828 ++++++++++++++++++++++++++
 include/linux/qed/qed_if.h                    | 193 ++++++
 3 files changed, 1024 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
index 591877fca67a..b31f5d8bd27c 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
@@ -216,7 +216,8 @@ enum qed_ov_wol {
 enum qed_mfw_tlv_type {
 	QED_MFW_TLV_GENERIC = 0x1,	/* Core driver TLVs */
 	QED_MFW_TLV_ETH = 0x2,		/* L2 driver TLVs */
-	QED_MFW_TLV_MAX = 0x4,
+	QED_MFW_TLV_FCOE = 0x4,		/* FCoE protocol TLVs */
+	QED_MFW_TLV_MAX = 0x8,
 };
 
 struct qed_mfw_tlv_generic {
@@ -245,6 +246,7 @@ struct qed_mfw_tlv_generic {
 union qed_mfw_tlv_data {
 	struct qed_mfw_tlv_generic generic;
 	struct qed_mfw_tlv_eth eth;
+	struct qed_mfw_tlv_fcoe fcoe;
 };
 
 /**
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c b/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c
index d58a7146cce5..1873cfc08715 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c
@@ -64,6 +64,157 @@ static int qed_mfw_get_tlv_group(u8 tlv_type, u8 *tlv_group)
 	case DRV_TLV_RX_QUEUES_FULL:
 		*tlv_group |= QED_MFW_TLV_ETH;
 		break;
+	case DRV_TLV_SCSI_TO:
+	case DRV_TLV_R_T_TOV:
+	case DRV_TLV_R_A_TOV:
+	case DRV_TLV_E_D_TOV:
+	case DRV_TLV_CR_TOV:
+	case DRV_TLV_BOOT_TYPE:
+	case DRV_TLV_NPIV_STATE:
+	case DRV_TLV_NUM_OF_NPIV_IDS:
+	case DRV_TLV_SWITCH_NAME:
+	case DRV_TLV_SWITCH_PORT_NUM:
+	case DRV_TLV_SWITCH_PORT_ID:
+	case DRV_TLV_VENDOR_NAME:
+	case DRV_TLV_SWITCH_MODEL:
+	case DRV_TLV_SWITCH_FW_VER:
+	case DRV_TLV_QOS_PRIORITY_PER_802_1P:
+	case DRV_TLV_PORT_ALIAS:
+	case DRV_TLV_PORT_STATE:
+	case DRV_TLV_FIP_TX_DESCRIPTORS_QUEUE_SIZE:
+	case DRV_TLV_FCOE_RX_DESCRIPTORS_QUEUE_SIZE:
+	case DRV_TLV_LINK_FAILURE_COUNT:
+	case DRV_TLV_FCOE_BOOT_PROGRESS:
+	case DRV_TLV_RX_BROADCAST_PACKETS:
+	case DRV_TLV_TX_BROADCAST_PACKETS:
+	case DRV_TLV_FCOE_TX_DESCRIPTOR_QUEUE_AVG_DEPTH:
+	case DRV_TLV_FCOE_RX_DESCRIPTORS_QUEUE_AVG_DEPTH:
+	case DRV_TLV_FCOE_RX_FRAMES_RECEIVED:
+	case DRV_TLV_FCOE_RX_BYTES_RECEIVED:
+	case DRV_TLV_FCOE_TX_FRAMES_SENT:
+	case DRV_TLV_FCOE_TX_BYTES_SENT:
+	case DRV_TLV_CRC_ERROR_COUNT:
+	case DRV_TLV_CRC_ERROR_1_RECEIVED_SOURCE_FC_ID:
+	case DRV_TLV_CRC_ERROR_1_TIMESTAMP:
+	case DRV_TLV_CRC_ERROR_2_RECEIVED_SOURCE_FC_ID:
+	case DRV_TLV_CRC_ERROR_2_TIMESTAMP:
+	case DRV_TLV_CRC_ERROR_3_RECEIVED_SOURCE_FC_ID:
+	case DRV_TLV_CRC_ERROR_3_TIMESTAMP:
+	case DRV_TLV_CRC_ERROR_4_RECEIVED_SOURCE_FC_ID:
+	case DRV_TLV_CRC_ERROR_4_TIMESTAMP:
+	case DRV_TLV_CRC_ERROR_5_RECEIVED_SOURCE_FC_ID:
+	case DRV_TLV_CRC_ERROR_5_TIMESTAMP:
+	case DRV_TLV_LOSS_OF_SYNC_ERROR_COUNT:
+	case DRV_TLV_LOSS_OF_SIGNAL_ERRORS:
+	case DRV_TLV_PRIMITIVE_SEQUENCE_PROTOCOL_ERROR_COUNT:
+	case DRV_TLV_DISPARITY_ERROR_COUNT:
+	case DRV_TLV_CODE_VIOLATION_ERROR_COUNT:
+	case DRV_TLV_LAST_FLOGI_ISSUED_COMMON_PARAMETERS_WORD_1:
+	case DRV_TLV_LAST_FLOGI_ISSUED_COMMON_PARAMETERS_WORD_2:
+	case DRV_TLV_LAST_FLOGI_ISSUED_COMMON_PARAMETERS_WORD_3:
+	case DRV_TLV_LAST_FLOGI_ISSUED_COMMON_PARAMETERS_WORD_4:
+	case DRV_TLV_LAST_FLOGI_TIMESTAMP:
+	case DRV_TLV_LAST_FLOGI_ACC_COMMON_PARAMETERS_WORD_1:
+	case DRV_TLV_LAST_FLOGI_ACC_COMMON_PARAMETERS_WORD_2:
+	case DRV_TLV_LAST_FLOGI_ACC_COMMON_PARAMETERS_WORD_3:
+	case DRV_TLV_LAST_FLOGI_ACC_COMMON_PARAMETERS_WORD_4:
+	case DRV_TLV_LAST_FLOGI_ACC_TIMESTAMP:
+	case DRV_TLV_LAST_FLOGI_RJT:
+	case DRV_TLV_LAST_FLOGI_RJT_TIMESTAMP:
+	case DRV_TLV_FDISCS_SENT_COUNT:
+	case DRV_TLV_FDISC_ACCS_RECEIVED:
+	case DRV_TLV_FDISC_RJTS_RECEIVED:
+	case DRV_TLV_PLOGI_SENT_COUNT:
+	case DRV_TLV_PLOGI_ACCS_RECEIVED:
+	case DRV_TLV_PLOGI_RJTS_RECEIVED:
+	case DRV_TLV_PLOGI_1_SENT_DESTINATION_FC_ID:
+	case DRV_TLV_PLOGI_1_TIMESTAMP:
+	case DRV_TLV_PLOGI_2_SENT_DESTINATION_FC_ID:
+	case DRV_TLV_PLOGI_2_TIMESTAMP:
+	case DRV_TLV_PLOGI_3_SENT_DESTINATION_FC_ID:
+	case DRV_TLV_PLOGI_3_TIMESTAMP:
+	case DRV_TLV_PLOGI_4_SENT_DESTINATION_FC_ID:
+	case DRV_TLV_PLOGI_4_TIMESTAMP:
+	case DRV_TLV_PLOGI_5_SENT_DESTINATION_FC_ID:
+	case DRV_TLV_PLOGI_5_TIMESTAMP:
+	case DRV_TLV_PLOGI_1_ACC_RECEIVED_SOURCE_FC_ID:
+	case DRV_TLV_PLOGI_1_ACC_TIMESTAMP:
+	case DRV_TLV_PLOGI_2_ACC_RECEIVED_SOURCE_FC_ID:
+	case DRV_TLV_PLOGI_2_ACC_TIMESTAMP:
+	case DRV_TLV_PLOGI_3_ACC_RECEIVED_SOURCE_FC_ID:
+	case DRV_TLV_PLOGI_3_ACC_TIMESTAMP:
+	case DRV_TLV_PLOGI_4_ACC_RECEIVED_SOURCE_FC_ID:
+	case DRV_TLV_PLOGI_4_ACC_TIMESTAMP:
+	case DRV_TLV_PLOGI_5_ACC_RECEIVED_SOURCE_FC_ID:
+	case DRV_TLV_PLOGI_5_ACC_TIMESTAMP:
+	case DRV_TLV_LOGOS_ISSUED:
+	case DRV_TLV_LOGO_ACCS_RECEIVED:
+	case DRV_TLV_LOGO_RJTS_RECEIVED:
+	case DRV_TLV_LOGO_1_RECEIVED_SOURCE_FC_ID:
+	case DRV_TLV_LOGO_1_TIMESTAMP:
+	case DRV_TLV_LOGO_2_RECEIVED_SOURCE_FC_ID:
+	case DRV_TLV_LOGO_2_TIMESTAMP:
+	case DRV_TLV_LOGO_3_RECEIVED_SOURCE_FC_ID:
+	case DRV_TLV_LOGO_3_TIMESTAMP:
+	case DRV_TLV_LOGO_4_RECEIVED_SOURCE_FC_ID:
+	case DRV_TLV_LOGO_4_TIMESTAMP:
+	case DRV_TLV_LOGO_5_RECEIVED_SOURCE_FC_ID:
+	case DRV_TLV_LOGO_5_TIMESTAMP:
+	case DRV_TLV_LOGOS_RECEIVED:
+	case DRV_TLV_ACCS_ISSUED:
+	case DRV_TLV_PRLIS_ISSUED:
+	case DRV_TLV_ACCS_RECEIVED:
+	case DRV_TLV_ABTS_SENT_COUNT:
+	case DRV_TLV_ABTS_ACCS_RECEIVED:
+	case DRV_TLV_ABTS_RJTS_RECEIVED:
+	case DRV_TLV_ABTS_1_SENT_DESTINATION_FC_ID:
+	case DRV_TLV_ABTS_1_TIMESTAMP:
+	case DRV_TLV_ABTS_2_SENT_DESTINATION_FC_ID:
+	case DRV_TLV_ABTS_2_TIMESTAMP:
+	case DRV_TLV_ABTS_3_SENT_DESTINATION_FC_ID:
+	case DRV_TLV_ABTS_3_TIMESTAMP:
+	case DRV_TLV_ABTS_4_SENT_DESTINATION_FC_ID:
+	case DRV_TLV_ABTS_4_TIMESTAMP:
+	case DRV_TLV_ABTS_5_SENT_DESTINATION_FC_ID:
+	case DRV_TLV_ABTS_5_TIMESTAMP:
+	case DRV_TLV_RSCNS_RECEIVED:
+	case DRV_TLV_LAST_RSCN_RECEIVED_N_PORT_1:
+	case DRV_TLV_LAST_RSCN_RECEIVED_N_PORT_2:
+	case DRV_TLV_LAST_RSCN_RECEIVED_N_PORT_3:
+	case DRV_TLV_LAST_RSCN_RECEIVED_N_PORT_4:
+	case DRV_TLV_LUN_RESETS_ISSUED:
+	case DRV_TLV_ABORT_TASK_SETS_ISSUED:
+	case DRV_TLV_TPRLOS_SENT:
+	case DRV_TLV_NOS_SENT_COUNT:
+	case DRV_TLV_NOS_RECEIVED_COUNT:
+	case DRV_TLV_OLS_COUNT:
+	case DRV_TLV_LR_COUNT:
+	case DRV_TLV_LRR_COUNT:
+	case DRV_TLV_LIP_SENT_COUNT:
+	case DRV_TLV_LIP_RECEIVED_COUNT:
+	case DRV_TLV_EOFA_COUNT:
+	case DRV_TLV_EOFNI_COUNT:
+	case DRV_TLV_SCSI_STATUS_CHECK_CONDITION_COUNT:
+	case DRV_TLV_SCSI_STATUS_CONDITION_MET_COUNT:
+	case DRV_TLV_SCSI_STATUS_BUSY_COUNT:
+	case DRV_TLV_SCSI_STATUS_INTERMEDIATE_COUNT:
+	case DRV_TLV_SCSI_STATUS_INTERMEDIATE_CONDITION_MET_COUNT:
+	case DRV_TLV_SCSI_STATUS_RESERVATION_CONFLICT_COUNT:
+	case DRV_TLV_SCSI_STATUS_TASK_SET_FULL_COUNT:
+	case DRV_TLV_SCSI_STATUS_ACA_ACTIVE_COUNT:
+	case DRV_TLV_SCSI_STATUS_TASK_ABORTED_COUNT:
+	case DRV_TLV_SCSI_CHECK_CONDITION_1_RECEIVED_SK_ASC_ASCQ:
+	case DRV_TLV_SCSI_CHECK_1_TIMESTAMP:
+	case DRV_TLV_SCSI_CHECK_CONDITION_2_RECEIVED_SK_ASC_ASCQ:
+	case DRV_TLV_SCSI_CHECK_2_TIMESTAMP:
+	case DRV_TLV_SCSI_CHECK_CONDITION_3_RECEIVED_SK_ASC_ASCQ:
+	case DRV_TLV_SCSI_CHECK_3_TIMESTAMP:
+	case DRV_TLV_SCSI_CHECK_CONDITION_4_RECEIVED_SK_ASC_ASCQ:
+	case DRV_TLV_SCSI_CHECK_4_TIMESTAMP:
+	case DRV_TLV_SCSI_CHECK_CONDITION_5_RECEIVED_SK_ASC_ASCQ:
+	case DRV_TLV_SCSI_CHECK_5_TIMESTAMP:
+		*tlv_group = QED_MFW_TLV_FCOE;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -237,6 +388,672 @@ qed_mfw_get_eth_tlv_value(struct qed_drv_tlv_hdr *p_tlv,
 	return -1;
 }
 
+static int
+qed_mfw_get_tlv_time_value(struct qed_mfw_tlv_time *p_time,
+			   struct qed_tlv_parsed_buf *p_buf)
+{
+	if (!p_time->b_set)
+		return -1;
+
+	/* Validate numbers */
+	if (p_time->month > 12)
+		p_time->month = 0;
+	if (p_time->day > 31)
+		p_time->day = 0;
+	if (p_time->hour > 23)
+		p_time->hour = 0;
+	if (p_time->min > 59)
+		p_time->hour = 0;
+	if (p_time->msec > 999)
+		p_time->msec = 0;
+	if (p_time->usec > 999)
+		p_time->usec = 0;
+
+	memset(p_buf->data, 0, sizeof(u8) * QED_TLV_DATA_MAX);
+	snprintf(p_buf->data, 14, "%d%d%d%d%d%d",
+		 p_time->month, p_time->day,
+		 p_time->hour, p_time->min, p_time->msec, p_time->usec);
+
+	p_buf->p_val = p_buf->data;
+
+	return QED_MFW_TLV_TIME_SIZE;
+}
+
+static int
+qed_mfw_get_fcoe_tlv_value(struct qed_drv_tlv_hdr *p_tlv,
+			   struct qed_mfw_tlv_fcoe *p_drv_buf,
+			   struct qed_tlv_parsed_buf *p_buf)
+{
+	struct qed_mfw_tlv_time *p_time;
+	u8 idx;
+
+	switch (p_tlv->tlv_type) {
+	case DRV_TLV_SCSI_TO:
+		if (p_drv_buf->scsi_timeout_set) {
+			p_buf->p_val = &p_drv_buf->scsi_timeout;
+			return sizeof(p_drv_buf->scsi_timeout);
+		}
+		break;
+	case DRV_TLV_R_T_TOV:
+		if (p_drv_buf->rt_tov_set) {
+			p_buf->p_val = &p_drv_buf->rt_tov;
+			return sizeof(p_drv_buf->rt_tov);
+		}
+		break;
+	case DRV_TLV_R_A_TOV:
+		if (p_drv_buf->ra_tov_set) {
+			p_buf->p_val = &p_drv_buf->ra_tov;
+			return sizeof(p_drv_buf->ra_tov);
+		}
+		break;
+	case DRV_TLV_E_D_TOV:
+		if (p_drv_buf->ed_tov_set) {
+			p_buf->p_val = &p_drv_buf->ed_tov;
+			return sizeof(p_drv_buf->ed_tov);
+		}
+		break;
+	case DRV_TLV_CR_TOV:
+		if (p_drv_buf->cr_tov_set) {
+			p_buf->p_val = &p_drv_buf->cr_tov;
+			return sizeof(p_drv_buf->cr_tov);
+		}
+		break;
+	case DRV_TLV_BOOT_TYPE:
+		if (p_drv_buf->boot_type_set) {
+			p_buf->p_val = &p_drv_buf->boot_type;
+			return sizeof(p_drv_buf->boot_type);
+		}
+		break;
+	case DRV_TLV_NPIV_STATE:
+		if (p_drv_buf->npiv_state_set) {
+			p_buf->p_val = &p_drv_buf->npiv_state;
+			return sizeof(p_drv_buf->npiv_state);
+		}
+		break;
+	case DRV_TLV_NUM_OF_NPIV_IDS:
+		if (p_drv_buf->num_npiv_ids_set) {
+			p_buf->p_val = &p_drv_buf->num_npiv_ids;
+			return sizeof(p_drv_buf->num_npiv_ids);
+		}
+		break;
+	case DRV_TLV_SWITCH_NAME:
+		if (p_drv_buf->switch_name_set) {
+			p_buf->p_val = &p_drv_buf->switch_name;
+			return sizeof(p_drv_buf->switch_name);
+		}
+		break;
+	case DRV_TLV_SWITCH_PORT_NUM:
+		if (p_drv_buf->switch_portnum_set) {
+			p_buf->p_val = &p_drv_buf->switch_portnum;
+			return sizeof(p_drv_buf->switch_portnum);
+		}
+		break;
+	case DRV_TLV_SWITCH_PORT_ID:
+		if (p_drv_buf->switch_portid_set) {
+			p_buf->p_val = &p_drv_buf->switch_portid;
+			return sizeof(p_drv_buf->switch_portid);
+		}
+		break;
+	case DRV_TLV_VENDOR_NAME:
+		if (p_drv_buf->vendor_name_set) {
+			p_buf->p_val = &p_drv_buf->vendor_name;
+			return sizeof(p_drv_buf->vendor_name);
+		}
+		break;
+	case DRV_TLV_SWITCH_MODEL:
+		if (p_drv_buf->switch_model_set) {
+			p_buf->p_val = &p_drv_buf->switch_model;
+			return sizeof(p_drv_buf->switch_model);
+		}
+		break;
+	case DRV_TLV_SWITCH_FW_VER:
+		if (p_drv_buf->switch_fw_version_set) {
+			p_buf->p_val = &p_drv_buf->switch_fw_version;
+			return sizeof(p_drv_buf->switch_fw_version);
+		}
+		break;
+	case DRV_TLV_QOS_PRIORITY_PER_802_1P:
+		if (p_drv_buf->qos_pri_set) {
+			p_buf->p_val = &p_drv_buf->qos_pri;
+			return sizeof(p_drv_buf->qos_pri);
+		}
+		break;
+	case DRV_TLV_PORT_ALIAS:
+		if (p_drv_buf->port_alias_set) {
+			p_buf->p_val = &p_drv_buf->port_alias;
+			return sizeof(p_drv_buf->port_alias);
+		}
+		break;
+	case DRV_TLV_PORT_STATE:
+		if (p_drv_buf->port_state_set) {
+			p_buf->p_val = &p_drv_buf->port_state;
+			return sizeof(p_drv_buf->port_state);
+		}
+		break;
+	case DRV_TLV_FIP_TX_DESCRIPTORS_QUEUE_SIZE:
+		if (p_drv_buf->fip_tx_descr_size_set) {
+			p_buf->p_val = &p_drv_buf->fip_tx_descr_size;
+			return sizeof(p_drv_buf->fip_tx_descr_size);
+		}
+		break;
+	case DRV_TLV_FCOE_RX_DESCRIPTORS_QUEUE_SIZE:
+		if (p_drv_buf->fip_rx_descr_size_set) {
+			p_buf->p_val = &p_drv_buf->fip_rx_descr_size;
+			return sizeof(p_drv_buf->fip_rx_descr_size);
+		}
+		break;
+	case DRV_TLV_LINK_FAILURE_COUNT:
+		if (p_drv_buf->link_failures_set) {
+			p_buf->p_val = &p_drv_buf->link_failures;
+			return sizeof(p_drv_buf->link_failures);
+		}
+		break;
+	case DRV_TLV_FCOE_BOOT_PROGRESS:
+		if (p_drv_buf->fcoe_boot_progress_set) {
+			p_buf->p_val = &p_drv_buf->fcoe_boot_progress;
+			return sizeof(p_drv_buf->fcoe_boot_progress);
+		}
+		break;
+	case DRV_TLV_RX_BROADCAST_PACKETS:
+		if (p_drv_buf->rx_bcast_set) {
+			p_buf->p_val = &p_drv_buf->rx_bcast;
+			return sizeof(p_drv_buf->rx_bcast);
+		}
+		break;
+	case DRV_TLV_TX_BROADCAST_PACKETS:
+		if (p_drv_buf->tx_bcast_set) {
+			p_buf->p_val = &p_drv_buf->tx_bcast;
+			return sizeof(p_drv_buf->tx_bcast);
+		}
+		break;
+	case DRV_TLV_FCOE_TX_DESCRIPTOR_QUEUE_AVG_DEPTH:
+		if (p_drv_buf->fcoe_txq_depth_set) {
+			p_buf->p_val = &p_drv_buf->fcoe_txq_depth;
+			return sizeof(p_drv_buf->fcoe_txq_depth);
+		}
+		break;
+	case DRV_TLV_FCOE_RX_DESCRIPTORS_QUEUE_AVG_DEPTH:
+		if (p_drv_buf->fcoe_rxq_depth_set) {
+			p_buf->p_val = &p_drv_buf->fcoe_rxq_depth;
+			return sizeof(p_drv_buf->fcoe_rxq_depth);
+		}
+		break;
+	case DRV_TLV_FCOE_RX_FRAMES_RECEIVED:
+		if (p_drv_buf->fcoe_rx_frames_set) {
+			p_buf->p_val = &p_drv_buf->fcoe_rx_frames;
+			return sizeof(p_drv_buf->fcoe_rx_frames);
+		}
+		break;
+	case DRV_TLV_FCOE_RX_BYTES_RECEIVED:
+		if (p_drv_buf->fcoe_rx_bytes_set) {
+			p_buf->p_val = &p_drv_buf->fcoe_rx_bytes;
+			return sizeof(p_drv_buf->fcoe_rx_bytes);
+		}
+		break;
+	case DRV_TLV_FCOE_TX_FRAMES_SENT:
+		if (p_drv_buf->fcoe_tx_frames_set) {
+			p_buf->p_val = &p_drv_buf->fcoe_tx_frames;
+			return sizeof(p_drv_buf->fcoe_tx_frames);
+		}
+		break;
+	case DRV_TLV_FCOE_TX_BYTES_SENT:
+		if (p_drv_buf->fcoe_tx_bytes_set) {
+			p_buf->p_val = &p_drv_buf->fcoe_tx_bytes;
+			return sizeof(p_drv_buf->fcoe_tx_bytes);
+		}
+		break;
+	case DRV_TLV_CRC_ERROR_COUNT:
+		if (p_drv_buf->crc_count_set) {
+			p_buf->p_val = &p_drv_buf->crc_count;
+			return sizeof(p_drv_buf->crc_count);
+		}
+		break;
+	case DRV_TLV_CRC_ERROR_1_RECEIVED_SOURCE_FC_ID:
+	case DRV_TLV_CRC_ERROR_2_RECEIVED_SOURCE_FC_ID:
+	case DRV_TLV_CRC_ERROR_3_RECEIVED_SOURCE_FC_ID:
+	case DRV_TLV_CRC_ERROR_4_RECEIVED_SOURCE_FC_ID:
+	case DRV_TLV_CRC_ERROR_5_RECEIVED_SOURCE_FC_ID:
+		idx = (p_tlv->tlv_type -
+		       DRV_TLV_CRC_ERROR_1_RECEIVED_SOURCE_FC_ID) / 2;
+
+		if (p_drv_buf->crc_err_src_fcid_set[idx]) {
+			p_buf->p_val = &p_drv_buf->crc_err_src_fcid[idx];
+			return sizeof(p_drv_buf->crc_err_src_fcid[idx]);
+		}
+		break;
+	case DRV_TLV_CRC_ERROR_1_TIMESTAMP:
+	case DRV_TLV_CRC_ERROR_2_TIMESTAMP:
+	case DRV_TLV_CRC_ERROR_3_TIMESTAMP:
+	case DRV_TLV_CRC_ERROR_4_TIMESTAMP:
+	case DRV_TLV_CRC_ERROR_5_TIMESTAMP:
+		idx = (p_tlv->tlv_type - DRV_TLV_CRC_ERROR_1_TIMESTAMP) / 2;
+
+		return qed_mfw_get_tlv_time_value(&p_drv_buf->crc_err[idx],
+						  p_buf);
+	case DRV_TLV_LOSS_OF_SYNC_ERROR_COUNT:
+		if (p_drv_buf->losync_err_set) {
+			p_buf->p_val = &p_drv_buf->losync_err;
+			return sizeof(p_drv_buf->losync_err);
+		}
+		break;
+	case DRV_TLV_LOSS_OF_SIGNAL_ERRORS:
+		if (p_drv_buf->losig_err_set) {
+			p_buf->p_val = &p_drv_buf->losig_err;
+			return sizeof(p_drv_buf->losig_err);
+		}
+		break;
+	case DRV_TLV_PRIMITIVE_SEQUENCE_PROTOCOL_ERROR_COUNT:
+		if (p_drv_buf->primtive_err_set) {
+			p_buf->p_val = &p_drv_buf->primtive_err;
+			return sizeof(p_drv_buf->primtive_err);
+		}
+		break;
+	case DRV_TLV_DISPARITY_ERROR_COUNT:
+		if (p_drv_buf->disparity_err_set) {
+			p_buf->p_val = &p_drv_buf->disparity_err;
+			return sizeof(p_drv_buf->disparity_err);
+		}
+		break;
+	case DRV_TLV_CODE_VIOLATION_ERROR_COUNT:
+		if (p_drv_buf->code_violation_err_set) {
+			p_buf->p_val = &p_drv_buf->code_violation_err;
+			return sizeof(p_drv_buf->code_violation_err);
+		}
+		break;
+	case DRV_TLV_LAST_FLOGI_ISSUED_COMMON_PARAMETERS_WORD_1:
+	case DRV_TLV_LAST_FLOGI_ISSUED_COMMON_PARAMETERS_WORD_2:
+	case DRV_TLV_LAST_FLOGI_ISSUED_COMMON_PARAMETERS_WORD_3:
+	case DRV_TLV_LAST_FLOGI_ISSUED_COMMON_PARAMETERS_WORD_4:
+		idx = p_tlv->tlv_type -
+			DRV_TLV_LAST_FLOGI_ISSUED_COMMON_PARAMETERS_WORD_1;
+		if (p_drv_buf->flogi_param_set[idx]) {
+			p_buf->p_val = &p_drv_buf->flogi_param[idx];
+			return sizeof(p_drv_buf->flogi_param[idx]);
+		}
+		break;
+	case DRV_TLV_LAST_FLOGI_TIMESTAMP:
+		return qed_mfw_get_tlv_time_value(&p_drv_buf->flogi_tstamp,
+						  p_buf);
+	case DRV_TLV_LAST_FLOGI_ACC_COMMON_PARAMETERS_WORD_1:
+	case DRV_TLV_LAST_FLOGI_ACC_COMMON_PARAMETERS_WORD_2:
+	case DRV_TLV_LAST_FLOGI_ACC_COMMON_PARAMETERS_WORD_3:
+	case DRV_TLV_LAST_FLOGI_ACC_COMMON_PARAMETERS_WORD_4:
+		idx = p_tlv->tlv_type -
+			DRV_TLV_LAST_FLOGI_ACC_COMMON_PARAMETERS_WORD_1;
+
+		if (p_drv_buf->flogi_acc_param_set[idx]) {
+			p_buf->p_val = &p_drv_buf->flogi_acc_param[idx];
+			return sizeof(p_drv_buf->flogi_acc_param[idx]);
+		}
+		break;
+	case DRV_TLV_LAST_FLOGI_ACC_TIMESTAMP:
+		return qed_mfw_get_tlv_time_value(&p_drv_buf->flogi_acc_tstamp,
+						  p_buf);
+	case DRV_TLV_LAST_FLOGI_RJT:
+		if (p_drv_buf->flogi_rjt_set) {
+			p_buf->p_val = &p_drv_buf->flogi_rjt;
+			return sizeof(p_drv_buf->flogi_rjt);
+		}
+		break;
+	case DRV_TLV_LAST_FLOGI_RJT_TIMESTAMP:
+		return qed_mfw_get_tlv_time_value(&p_drv_buf->flogi_rjt_tstamp,
+						  p_buf);
+	case DRV_TLV_FDISCS_SENT_COUNT:
+		if (p_drv_buf->fdiscs_set) {
+			p_buf->p_val = &p_drv_buf->fdiscs;
+			return sizeof(p_drv_buf->fdiscs);
+		}
+		break;
+	case DRV_TLV_FDISC_ACCS_RECEIVED:
+		if (p_drv_buf->fdisc_acc_set) {
+			p_buf->p_val = &p_drv_buf->fdisc_acc;
+			return sizeof(p_drv_buf->fdisc_acc);
+		}
+		break;
+	case DRV_TLV_FDISC_RJTS_RECEIVED:
+		if (p_drv_buf->fdisc_rjt_set) {
+			p_buf->p_val = &p_drv_buf->fdisc_rjt;
+			return sizeof(p_drv_buf->fdisc_rjt);
+		}
+		break;
+	case DRV_TLV_PLOGI_SENT_COUNT:
+		if (p_drv_buf->plogi_set) {
+			p_buf->p_val = &p_drv_buf->plogi;
+			return sizeof(p_drv_buf->plogi);
+		}
+		break;
+	case DRV_TLV_PLOGI_ACCS_RECEIVED:
+		if (p_drv_buf->plogi_acc_set) {
+			p_buf->p_val = &p_drv_buf->plogi_acc;
+			return sizeof(p_drv_buf->plogi_acc);
+		}
+		break;
+	case DRV_TLV_PLOGI_RJTS_RECEIVED:
+		if (p_drv_buf->plogi_rjt_set) {
+			p_buf->p_val = &p_drv_buf->plogi_rjt;
+			return sizeof(p_drv_buf->plogi_rjt);
+		}
+		break;
+	case DRV_TLV_PLOGI_1_SENT_DESTINATION_FC_ID:
+	case DRV_TLV_PLOGI_2_SENT_DESTINATION_FC_ID:
+	case DRV_TLV_PLOGI_3_SENT_DESTINATION_FC_ID:
+	case DRV_TLV_PLOGI_4_SENT_DESTINATION_FC_ID:
+	case DRV_TLV_PLOGI_5_SENT_DESTINATION_FC_ID:
+		idx = (p_tlv->tlv_type -
+		       DRV_TLV_PLOGI_1_SENT_DESTINATION_FC_ID) / 2;
+
+		if (p_drv_buf->plogi_dst_fcid_set[idx]) {
+			p_buf->p_val = &p_drv_buf->plogi_dst_fcid[idx];
+			return sizeof(p_drv_buf->plogi_dst_fcid[idx]);
+		}
+		break;
+	case DRV_TLV_PLOGI_1_TIMESTAMP:
+	case DRV_TLV_PLOGI_2_TIMESTAMP:
+	case DRV_TLV_PLOGI_3_TIMESTAMP:
+	case DRV_TLV_PLOGI_4_TIMESTAMP:
+	case DRV_TLV_PLOGI_5_TIMESTAMP:
+		idx = (p_tlv->tlv_type - DRV_TLV_PLOGI_1_TIMESTAMP) / 2;
+
+		return qed_mfw_get_tlv_time_value(&p_drv_buf->plogi_tstamp[idx],
+						  p_buf);
+	case DRV_TLV_PLOGI_1_ACC_RECEIVED_SOURCE_FC_ID:
+	case DRV_TLV_PLOGI_2_ACC_RECEIVED_SOURCE_FC_ID:
+	case DRV_TLV_PLOGI_3_ACC_RECEIVED_SOURCE_FC_ID:
+	case DRV_TLV_PLOGI_4_ACC_RECEIVED_SOURCE_FC_ID:
+	case DRV_TLV_PLOGI_5_ACC_RECEIVED_SOURCE_FC_ID:
+		idx = (p_tlv->tlv_type -
+		       DRV_TLV_PLOGI_1_ACC_RECEIVED_SOURCE_FC_ID) / 2;
+
+		if (p_drv_buf->plogi_acc_src_fcid_set[idx]) {
+			p_buf->p_val = &p_drv_buf->plogi_acc_src_fcid[idx];
+			return sizeof(p_drv_buf->plogi_acc_src_fcid[idx]);
+		}
+		break;
+	case DRV_TLV_PLOGI_1_ACC_TIMESTAMP:
+	case DRV_TLV_PLOGI_2_ACC_TIMESTAMP:
+	case DRV_TLV_PLOGI_3_ACC_TIMESTAMP:
+	case DRV_TLV_PLOGI_4_ACC_TIMESTAMP:
+	case DRV_TLV_PLOGI_5_ACC_TIMESTAMP:
+		idx = (p_tlv->tlv_type - DRV_TLV_PLOGI_1_ACC_TIMESTAMP) / 2;
+		p_time = &p_drv_buf->plogi_acc_tstamp[idx];
+
+		return qed_mfw_get_tlv_time_value(p_time, p_buf);
+	case DRV_TLV_LOGOS_ISSUED:
+		if (p_drv_buf->tx_plogos_set) {
+			p_buf->p_val = &p_drv_buf->tx_plogos;
+			return sizeof(p_drv_buf->tx_plogos);
+		}
+		break;
+	case DRV_TLV_LOGO_ACCS_RECEIVED:
+		if (p_drv_buf->plogo_acc_set) {
+			p_buf->p_val = &p_drv_buf->plogo_acc;
+			return sizeof(p_drv_buf->plogo_acc);
+		}
+		break;
+	case DRV_TLV_LOGO_RJTS_RECEIVED:
+		if (p_drv_buf->plogo_rjt_set) {
+			p_buf->p_val = &p_drv_buf->plogo_rjt;
+			return sizeof(p_drv_buf->plogo_rjt);
+		}
+		break;
+	case DRV_TLV_LOGO_1_RECEIVED_SOURCE_FC_ID:
+	case DRV_TLV_LOGO_2_RECEIVED_SOURCE_FC_ID:
+	case DRV_TLV_LOGO_3_RECEIVED_SOURCE_FC_ID:
+	case DRV_TLV_LOGO_4_RECEIVED_SOURCE_FC_ID:
+	case DRV_TLV_LOGO_5_RECEIVED_SOURCE_FC_ID:
+		idx = (p_tlv->tlv_type - DRV_TLV_LOGO_1_RECEIVED_SOURCE_FC_ID) /
+			2;
+
+		if (p_drv_buf->plogo_src_fcid_set[idx]) {
+			p_buf->p_val = &p_drv_buf->plogo_src_fcid[idx];
+			return sizeof(p_drv_buf->plogo_src_fcid[idx]);
+		}
+		break;
+	case DRV_TLV_LOGO_1_TIMESTAMP:
+	case DRV_TLV_LOGO_2_TIMESTAMP:
+	case DRV_TLV_LOGO_3_TIMESTAMP:
+	case DRV_TLV_LOGO_4_TIMESTAMP:
+	case DRV_TLV_LOGO_5_TIMESTAMP:
+		idx = (p_tlv->tlv_type - DRV_TLV_LOGO_1_TIMESTAMP) / 2;
+
+		return qed_mfw_get_tlv_time_value(&p_drv_buf->plogo_tstamp[idx],
+						  p_buf);
+	case DRV_TLV_LOGOS_RECEIVED:
+		if (p_drv_buf->rx_logos_set) {
+			p_buf->p_val = &p_drv_buf->rx_logos;
+			return sizeof(p_drv_buf->rx_logos);
+		}
+		break;
+	case DRV_TLV_ACCS_ISSUED:
+		if (p_drv_buf->tx_accs_set) {
+			p_buf->p_val = &p_drv_buf->tx_accs;
+			return sizeof(p_drv_buf->tx_accs);
+		}
+		break;
+	case DRV_TLV_PRLIS_ISSUED:
+		if (p_drv_buf->tx_prlis_set) {
+			p_buf->p_val = &p_drv_buf->tx_prlis;
+			return sizeof(p_drv_buf->tx_prlis);
+		}
+		break;
+	case DRV_TLV_ACCS_RECEIVED:
+		if (p_drv_buf->rx_accs_set) {
+			p_buf->p_val = &p_drv_buf->rx_accs;
+			return sizeof(p_drv_buf->rx_accs);
+		}
+		break;
+	case DRV_TLV_ABTS_SENT_COUNT:
+		if (p_drv_buf->tx_abts_set) {
+			p_buf->p_val = &p_drv_buf->tx_abts;
+			return sizeof(p_drv_buf->tx_abts);
+		}
+		break;
+	case DRV_TLV_ABTS_ACCS_RECEIVED:
+		if (p_drv_buf->rx_abts_acc_set) {
+			p_buf->p_val = &p_drv_buf->rx_abts_acc;
+			return sizeof(p_drv_buf->rx_abts_acc);
+		}
+		break;
+	case DRV_TLV_ABTS_RJTS_RECEIVED:
+		if (p_drv_buf->rx_abts_rjt_set) {
+			p_buf->p_val = &p_drv_buf->rx_abts_rjt;
+			return sizeof(p_drv_buf->rx_abts_rjt);
+		}
+		break;
+	case DRV_TLV_ABTS_1_SENT_DESTINATION_FC_ID:
+	case DRV_TLV_ABTS_2_SENT_DESTINATION_FC_ID:
+	case DRV_TLV_ABTS_3_SENT_DESTINATION_FC_ID:
+	case DRV_TLV_ABTS_4_SENT_DESTINATION_FC_ID:
+	case DRV_TLV_ABTS_5_SENT_DESTINATION_FC_ID:
+		idx = (p_tlv->tlv_type -
+		       DRV_TLV_ABTS_1_SENT_DESTINATION_FC_ID) / 2;
+
+		if (p_drv_buf->abts_dst_fcid_set[idx]) {
+			p_buf->p_val = &p_drv_buf->abts_dst_fcid[idx];
+			return sizeof(p_drv_buf->abts_dst_fcid[idx]);
+		}
+		break;
+	case DRV_TLV_ABTS_1_TIMESTAMP:
+	case DRV_TLV_ABTS_2_TIMESTAMP:
+	case DRV_TLV_ABTS_3_TIMESTAMP:
+	case DRV_TLV_ABTS_4_TIMESTAMP:
+	case DRV_TLV_ABTS_5_TIMESTAMP:
+		idx = (p_tlv->tlv_type - DRV_TLV_ABTS_1_TIMESTAMP) / 2;
+
+		return qed_mfw_get_tlv_time_value(&p_drv_buf->abts_tstamp[idx],
+						  p_buf);
+	case DRV_TLV_RSCNS_RECEIVED:
+		if (p_drv_buf->rx_rscn_set) {
+			p_buf->p_val = &p_drv_buf->rx_rscn;
+			return sizeof(p_drv_buf->rx_rscn);
+		}
+		break;
+	case DRV_TLV_LAST_RSCN_RECEIVED_N_PORT_1:
+	case DRV_TLV_LAST_RSCN_RECEIVED_N_PORT_2:
+	case DRV_TLV_LAST_RSCN_RECEIVED_N_PORT_3:
+	case DRV_TLV_LAST_RSCN_RECEIVED_N_PORT_4:
+		idx = p_tlv->tlv_type - DRV_TLV_LAST_RSCN_RECEIVED_N_PORT_1;
+
+		if (p_drv_buf->rx_rscn_nport_set[idx]) {
+			p_buf->p_val = &p_drv_buf->rx_rscn_nport[idx];
+			return sizeof(p_drv_buf->rx_rscn_nport[idx]);
+		}
+		break;
+	case DRV_TLV_LUN_RESETS_ISSUED:
+		if (p_drv_buf->tx_lun_rst_set) {
+			p_buf->p_val = &p_drv_buf->tx_lun_rst;
+			return sizeof(p_drv_buf->tx_lun_rst);
+		}
+		break;
+	case DRV_TLV_ABORT_TASK_SETS_ISSUED:
+		if (p_drv_buf->abort_task_sets_set) {
+			p_buf->p_val = &p_drv_buf->abort_task_sets;
+			return sizeof(p_drv_buf->abort_task_sets);
+		}
+		break;
+	case DRV_TLV_TPRLOS_SENT:
+		if (p_drv_buf->tx_tprlos_set) {
+			p_buf->p_val = &p_drv_buf->tx_tprlos;
+			return sizeof(p_drv_buf->tx_tprlos);
+		}
+		break;
+	case DRV_TLV_NOS_SENT_COUNT:
+		if (p_drv_buf->tx_nos_set) {
+			p_buf->p_val = &p_drv_buf->tx_nos;
+			return sizeof(p_drv_buf->tx_nos);
+		}
+		break;
+	case DRV_TLV_NOS_RECEIVED_COUNT:
+		if (p_drv_buf->rx_nos_set) {
+			p_buf->p_val = &p_drv_buf->rx_nos;
+			return sizeof(p_drv_buf->rx_nos);
+		}
+		break;
+	case DRV_TLV_OLS_COUNT:
+		if (p_drv_buf->ols_set) {
+			p_buf->p_val = &p_drv_buf->ols;
+			return sizeof(p_drv_buf->ols);
+		}
+		break;
+	case DRV_TLV_LR_COUNT:
+		if (p_drv_buf->lr_set) {
+			p_buf->p_val = &p_drv_buf->lr;
+			return sizeof(p_drv_buf->lr);
+		}
+		break;
+	case DRV_TLV_LRR_COUNT:
+		if (p_drv_buf->lrr_set) {
+			p_buf->p_val = &p_drv_buf->lrr;
+			return sizeof(p_drv_buf->lrr);
+		}
+		break;
+	case DRV_TLV_LIP_SENT_COUNT:
+		if (p_drv_buf->tx_lip_set) {
+			p_buf->p_val = &p_drv_buf->tx_lip;
+			return sizeof(p_drv_buf->tx_lip);
+		}
+		break;
+	case DRV_TLV_LIP_RECEIVED_COUNT:
+		if (p_drv_buf->rx_lip_set) {
+			p_buf->p_val = &p_drv_buf->rx_lip;
+			return sizeof(p_drv_buf->rx_lip);
+		}
+		break;
+	case DRV_TLV_EOFA_COUNT:
+		if (p_drv_buf->eofa_set) {
+			p_buf->p_val = &p_drv_buf->eofa;
+			return sizeof(p_drv_buf->eofa);
+		}
+		break;
+	case DRV_TLV_EOFNI_COUNT:
+		if (p_drv_buf->eofni_set) {
+			p_buf->p_val = &p_drv_buf->eofni;
+			return sizeof(p_drv_buf->eofni);
+		}
+		break;
+	case DRV_TLV_SCSI_STATUS_CHECK_CONDITION_COUNT:
+		if (p_drv_buf->scsi_chks_set) {
+			p_buf->p_val = &p_drv_buf->scsi_chks;
+			return sizeof(p_drv_buf->scsi_chks);
+		}
+		break;
+	case DRV_TLV_SCSI_STATUS_CONDITION_MET_COUNT:
+		if (p_drv_buf->scsi_cond_met_set) {
+			p_buf->p_val = &p_drv_buf->scsi_cond_met;
+			return sizeof(p_drv_buf->scsi_cond_met);
+		}
+		break;
+	case DRV_TLV_SCSI_STATUS_BUSY_COUNT:
+		if (p_drv_buf->scsi_busy_set) {
+			p_buf->p_val = &p_drv_buf->scsi_busy;
+			return sizeof(p_drv_buf->scsi_busy);
+		}
+		break;
+	case DRV_TLV_SCSI_STATUS_INTERMEDIATE_COUNT:
+		if (p_drv_buf->scsi_inter_set) {
+			p_buf->p_val = &p_drv_buf->scsi_inter;
+			return sizeof(p_drv_buf->scsi_inter);
+		}
+		break;
+	case DRV_TLV_SCSI_STATUS_INTERMEDIATE_CONDITION_MET_COUNT:
+		if (p_drv_buf->scsi_inter_cond_met_set) {
+			p_buf->p_val = &p_drv_buf->scsi_inter_cond_met;
+			return sizeof(p_drv_buf->scsi_inter_cond_met);
+		}
+		break;
+	case DRV_TLV_SCSI_STATUS_RESERVATION_CONFLICT_COUNT:
+		if (p_drv_buf->scsi_rsv_conflicts_set) {
+			p_buf->p_val = &p_drv_buf->scsi_rsv_conflicts;
+			return sizeof(p_drv_buf->scsi_rsv_conflicts);
+		}
+		break;
+	case DRV_TLV_SCSI_STATUS_TASK_SET_FULL_COUNT:
+		if (p_drv_buf->scsi_tsk_full_set) {
+			p_buf->p_val = &p_drv_buf->scsi_tsk_full;
+			return sizeof(p_drv_buf->scsi_tsk_full);
+		}
+		break;
+	case DRV_TLV_SCSI_STATUS_ACA_ACTIVE_COUNT:
+		if (p_drv_buf->scsi_aca_active_set) {
+			p_buf->p_val = &p_drv_buf->scsi_aca_active;
+			return sizeof(p_drv_buf->scsi_aca_active);
+		}
+		break;
+	case DRV_TLV_SCSI_STATUS_TASK_ABORTED_COUNT:
+		if (p_drv_buf->scsi_tsk_abort_set) {
+			p_buf->p_val = &p_drv_buf->scsi_tsk_abort;
+			return sizeof(p_drv_buf->scsi_tsk_abort);
+		}
+		break;
+	case DRV_TLV_SCSI_CHECK_CONDITION_1_RECEIVED_SK_ASC_ASCQ:
+	case DRV_TLV_SCSI_CHECK_CONDITION_2_RECEIVED_SK_ASC_ASCQ:
+	case DRV_TLV_SCSI_CHECK_CONDITION_3_RECEIVED_SK_ASC_ASCQ:
+	case DRV_TLV_SCSI_CHECK_CONDITION_4_RECEIVED_SK_ASC_ASCQ:
+	case DRV_TLV_SCSI_CHECK_CONDITION_5_RECEIVED_SK_ASC_ASCQ:
+		idx = (p_tlv->tlv_type -
+		       DRV_TLV_SCSI_CHECK_CONDITION_1_RECEIVED_SK_ASC_ASCQ) / 2;
+
+		if (p_drv_buf->scsi_rx_chk_set[idx]) {
+			p_buf->p_val = &p_drv_buf->scsi_rx_chk[idx];
+			return sizeof(p_drv_buf->scsi_rx_chk[idx]);
+		}
+		break;
+	case DRV_TLV_SCSI_CHECK_1_TIMESTAMP:
+	case DRV_TLV_SCSI_CHECK_2_TIMESTAMP:
+	case DRV_TLV_SCSI_CHECK_3_TIMESTAMP:
+	case DRV_TLV_SCSI_CHECK_4_TIMESTAMP:
+	case DRV_TLV_SCSI_CHECK_5_TIMESTAMP:
+		idx = (p_tlv->tlv_type - DRV_TLV_SCSI_CHECK_1_TIMESTAMP) / 2;
+		p_time = &p_drv_buf->scsi_chk_tstamp[idx];
+
+		return qed_mfw_get_tlv_time_value(p_time, p_buf);
+	default:
+		break;
+	}
+
+	return -1;
+}
+
 static int qed_mfw_update_tlvs(struct qed_hwfn *p_hwfn,
 			       u8 tlv_group, u8 *p_mfw_buf, u32 size)
 {
@@ -276,6 +1093,10 @@ static int qed_mfw_update_tlvs(struct qed_hwfn *p_hwfn,
 			len = qed_mfw_get_eth_tlv_value(&tlv,
 							&p_tlv_data->eth,
 							&buffer);
+		else if (tlv_group == QED_MFW_TLV_FCOE)
+			len = qed_mfw_get_fcoe_tlv_value(&tlv,
+							 &p_tlv_data->fcoe,
+							 &buffer);
 
 		if (len > 0) {
 			WARN(len > 4 * tlv.tlv_length,
@@ -351,6 +1172,13 @@ int qed_mfw_process_tlv_req(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 		tlv_group &= ~QED_MFW_TLV_ETH;
 	}
 
+	if ((tlv_group & QED_MFW_TLV_FCOE) &&
+	    p_hwfn->hw_info.personality != QED_PCI_FCOE) {
+		DP_VERBOSE(p_hwfn, QED_MSG_SP,
+			   "Skipping FCoE TLVs for non-FCoE function\n");
+		tlv_group &= ~QED_MFW_TLV_FCOE;
+	}
+
 	/* Update the TLV values in the local buffer */
 	for (id = QED_MFW_TLV_GENERIC; id < QED_MFW_TLV_MAX; id <<= 1) {
 		if (tlv_group & id)
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 8e4fad427ac2..74c2b9ad8afa 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -219,6 +219,199 @@ struct qed_mfw_tlv_eth {
 	bool num_rxqs_full_set;
 };
 
+#define QED_MFW_TLV_TIME_SIZE	14
+struct qed_mfw_tlv_time {
+	bool b_set;
+	u8 month;
+	u8 day;
+	u8 hour;
+	u8 min;
+	u16 msec;
+	u16 usec;
+};
+
+struct qed_mfw_tlv_fcoe {
+	u8 scsi_timeout;
+	bool scsi_timeout_set;
+	u32 rt_tov;
+	bool rt_tov_set;
+	u32 ra_tov;
+	bool ra_tov_set;
+	u32 ed_tov;
+	bool ed_tov_set;
+	u32 cr_tov;
+	bool cr_tov_set;
+	u8 boot_type;
+	bool boot_type_set;
+	u8 npiv_state;
+	bool npiv_state_set;
+	u32 num_npiv_ids;
+	bool num_npiv_ids_set;
+	u8 switch_name[8];
+	bool switch_name_set;
+	u16 switch_portnum;
+	bool switch_portnum_set;
+	u8 switch_portid[3];
+	bool switch_portid_set;
+	u8 vendor_name[8];
+	bool vendor_name_set;
+	u8 switch_model[8];
+	bool switch_model_set;
+	u8 switch_fw_version[8];
+	bool switch_fw_version_set;
+	u8 qos_pri;
+	bool qos_pri_set;
+	u8 port_alias[3];
+	bool port_alias_set;
+	u8 port_state;
+#define QED_MFW_TLV_PORT_STATE_OFFLINE  (0)
+#define QED_MFW_TLV_PORT_STATE_LOOP             (1)
+#define QED_MFW_TLV_PORT_STATE_P2P              (2)
+#define QED_MFW_TLV_PORT_STATE_FABRIC           (3)
+	bool port_state_set;
+	u16 fip_tx_descr_size;
+	bool fip_tx_descr_size_set;
+	u16 fip_rx_descr_size;
+	bool fip_rx_descr_size_set;
+	u16 link_failures;
+	bool link_failures_set;
+	u8 fcoe_boot_progress;
+	bool fcoe_boot_progress_set;
+	u64 rx_bcast;
+	bool rx_bcast_set;
+	u64 tx_bcast;
+	bool tx_bcast_set;
+	u16 fcoe_txq_depth;
+	bool fcoe_txq_depth_set;
+	u16 fcoe_rxq_depth;
+	bool fcoe_rxq_depth_set;
+	u64 fcoe_rx_frames;
+	bool fcoe_rx_frames_set;
+	u64 fcoe_rx_bytes;
+	bool fcoe_rx_bytes_set;
+	u64 fcoe_tx_frames;
+	bool fcoe_tx_frames_set;
+	u64 fcoe_tx_bytes;
+	bool fcoe_tx_bytes_set;
+	u16 crc_count;
+	bool crc_count_set;
+	u32 crc_err_src_fcid[5];
+	bool crc_err_src_fcid_set[5];
+	struct qed_mfw_tlv_time crc_err[5];
+	u16 losync_err;
+	bool losync_err_set;
+	u16 losig_err;
+	bool losig_err_set;
+	u16 primtive_err;
+	bool primtive_err_set;
+	u16 disparity_err;
+	bool disparity_err_set;
+	u16 code_violation_err;
+	bool code_violation_err_set;
+	u32 flogi_param[4];
+	bool flogi_param_set[4];
+	struct qed_mfw_tlv_time flogi_tstamp;
+	u32 flogi_acc_param[4];
+	bool flogi_acc_param_set[4];
+	struct qed_mfw_tlv_time flogi_acc_tstamp;
+	u32 flogi_rjt;
+	bool flogi_rjt_set;
+	struct qed_mfw_tlv_time flogi_rjt_tstamp;
+	u32 fdiscs;
+	bool fdiscs_set;
+	u8 fdisc_acc;
+	bool fdisc_acc_set;
+	u8 fdisc_rjt;
+	bool fdisc_rjt_set;
+	u8 plogi;
+	bool plogi_set;
+	u8 plogi_acc;
+	bool plogi_acc_set;
+	u8 plogi_rjt;
+	bool plogi_rjt_set;
+	u32 plogi_dst_fcid[5];
+	bool plogi_dst_fcid_set[5];
+	struct qed_mfw_tlv_time plogi_tstamp[5];
+	u32 plogi_acc_src_fcid[5];
+	bool plogi_acc_src_fcid_set[5];
+	struct qed_mfw_tlv_time plogi_acc_tstamp[5];
+	u8 tx_plogos;
+	bool tx_plogos_set;
+	u8 plogo_acc;
+	bool plogo_acc_set;
+	u8 plogo_rjt;
+	bool plogo_rjt_set;
+	u32 plogo_src_fcid[5];
+	bool plogo_src_fcid_set[5];
+	struct qed_mfw_tlv_time plogo_tstamp[5];
+	u8 rx_logos;
+	bool rx_logos_set;
+	u8 tx_accs;
+	bool tx_accs_set;
+	u8 tx_prlis;
+	bool tx_prlis_set;
+	u8 rx_accs;
+	bool rx_accs_set;
+	u8 tx_abts;
+	bool tx_abts_set;
+	u8 rx_abts_acc;
+	bool rx_abts_acc_set;
+	u8 rx_abts_rjt;
+	bool rx_abts_rjt_set;
+	u32 abts_dst_fcid[5];
+	bool abts_dst_fcid_set[5];
+	struct qed_mfw_tlv_time abts_tstamp[5];
+	u8 rx_rscn;
+	bool rx_rscn_set;
+	u32 rx_rscn_nport[4];
+	bool rx_rscn_nport_set[4];
+	u8 tx_lun_rst;
+	bool tx_lun_rst_set;
+	u8 abort_task_sets;
+	bool abort_task_sets_set;
+	u8 tx_tprlos;
+	bool tx_tprlos_set;
+	u8 tx_nos;
+	bool tx_nos_set;
+	u8 rx_nos;
+	bool rx_nos_set;
+	u8 ols;
+	bool ols_set;
+	u8 lr;
+	bool lr_set;
+	u8 lrr;
+	bool lrr_set;
+	u8 tx_lip;
+	bool tx_lip_set;
+	u8 rx_lip;
+	bool rx_lip_set;
+	u8 eofa;
+	bool eofa_set;
+	u8 eofni;
+	bool eofni_set;
+	u8 scsi_chks;
+	bool scsi_chks_set;
+	u8 scsi_cond_met;
+	bool scsi_cond_met_set;
+	u8 scsi_busy;
+	bool scsi_busy_set;
+	u8 scsi_inter;
+	bool scsi_inter_set;
+	u8 scsi_inter_cond_met;
+	bool scsi_inter_cond_met_set;
+	u8 scsi_rsv_conflicts;
+	bool scsi_rsv_conflicts_set;
+	u8 scsi_tsk_full;
+	bool scsi_tsk_full_set;
+	u8 scsi_aca_active;
+	bool scsi_aca_active_set;
+	u8 scsi_tsk_abort;
+	bool scsi_tsk_abort_set;
+	u32 scsi_rx_chk[5];
+	bool scsi_rx_chk_set[5];
+	struct qed_mfw_tlv_time scsi_chk_tstamp[5];
+};
+
 #define DIRECT_REG_WR(reg_addr, val) writel((u32)val, \
 					    (void __iomem *)(reg_addr))
 
-- 
cgit v1.2.3-59-g8ed1b


From 77a509e4f6d14c2e09acbab5a89a769740bda62c Mon Sep 17 00:00:00 2001
From: Sudarsana Reddy Kalluru <sudarsana.kalluru@cavium.com>
Date: Tue, 22 May 2018 00:28:40 -0700
Subject: qed: Add support for processing iscsi tlv request.

Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_mcp.h     |   4 +-
 drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c | 131 ++++++++++++++++++++++++++
 include/linux/qed/qed_if.h                    |  36 +++++++
 3 files changed, 170 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
index b31f5d8bd27c..632a838f1fe3 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
@@ -217,7 +217,8 @@ enum qed_mfw_tlv_type {
 	QED_MFW_TLV_GENERIC = 0x1,	/* Core driver TLVs */
 	QED_MFW_TLV_ETH = 0x2,		/* L2 driver TLVs */
 	QED_MFW_TLV_FCOE = 0x4,		/* FCoE protocol TLVs */
-	QED_MFW_TLV_MAX = 0x8,
+	QED_MFW_TLV_ISCSI = 0x8,	/* SCSI protocol TLVs */
+	QED_MFW_TLV_MAX = 0x16,
 };
 
 struct qed_mfw_tlv_generic {
@@ -247,6 +248,7 @@ union qed_mfw_tlv_data {
 	struct qed_mfw_tlv_generic generic;
 	struct qed_mfw_tlv_eth eth;
 	struct qed_mfw_tlv_fcoe fcoe;
+	struct qed_mfw_tlv_iscsi iscsi;
 };
 
 /**
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c b/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c
index 1873cfc08715..6c16158d8090 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c
@@ -215,6 +215,23 @@ static int qed_mfw_get_tlv_group(u8 tlv_type, u8 *tlv_group)
 	case DRV_TLV_SCSI_CHECK_5_TIMESTAMP:
 		*tlv_group = QED_MFW_TLV_FCOE;
 		break;
+	case DRV_TLV_TARGET_LLMNR_ENABLED:
+	case DRV_TLV_HEADER_DIGEST_FLAG_ENABLED:
+	case DRV_TLV_DATA_DIGEST_FLAG_ENABLED:
+	case DRV_TLV_AUTHENTICATION_METHOD:
+	case DRV_TLV_ISCSI_BOOT_TARGET_PORTAL:
+	case DRV_TLV_MAX_FRAME_SIZE:
+	case DRV_TLV_PDU_TX_DESCRIPTORS_QUEUE_SIZE:
+	case DRV_TLV_PDU_RX_DESCRIPTORS_QUEUE_SIZE:
+	case DRV_TLV_ISCSI_BOOT_PROGRESS:
+	case DRV_TLV_PDU_TX_DESCRIPTOR_QUEUE_AVG_DEPTH:
+	case DRV_TLV_PDU_RX_DESCRIPTORS_QUEUE_AVG_DEPTH:
+	case DRV_TLV_ISCSI_PDU_RX_FRAMES_RECEIVED:
+	case DRV_TLV_ISCSI_PDU_RX_BYTES_RECEIVED:
+	case DRV_TLV_ISCSI_PDU_TX_FRAMES_SENT:
+	case DRV_TLV_ISCSI_PDU_TX_BYTES_SENT:
+		*tlv_group |= QED_MFW_TLV_ISCSI;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -1054,6 +1071,109 @@ qed_mfw_get_fcoe_tlv_value(struct qed_drv_tlv_hdr *p_tlv,
 	return -1;
 }
 
+static int
+qed_mfw_get_iscsi_tlv_value(struct qed_drv_tlv_hdr *p_tlv,
+			    struct qed_mfw_tlv_iscsi *p_drv_buf,
+			    struct qed_tlv_parsed_buf *p_buf)
+{
+	switch (p_tlv->tlv_type) {
+	case DRV_TLV_TARGET_LLMNR_ENABLED:
+		if (p_drv_buf->target_llmnr_set) {
+			p_buf->p_val = &p_drv_buf->target_llmnr;
+			return sizeof(p_drv_buf->target_llmnr);
+		}
+		break;
+	case DRV_TLV_HEADER_DIGEST_FLAG_ENABLED:
+		if (p_drv_buf->header_digest_set) {
+			p_buf->p_val = &p_drv_buf->header_digest;
+			return sizeof(p_drv_buf->header_digest);
+		}
+		break;
+	case DRV_TLV_DATA_DIGEST_FLAG_ENABLED:
+		if (p_drv_buf->data_digest_set) {
+			p_buf->p_val = &p_drv_buf->data_digest;
+			return sizeof(p_drv_buf->data_digest);
+		}
+		break;
+	case DRV_TLV_AUTHENTICATION_METHOD:
+		if (p_drv_buf->auth_method_set) {
+			p_buf->p_val = &p_drv_buf->auth_method;
+			return sizeof(p_drv_buf->auth_method);
+		}
+		break;
+	case DRV_TLV_ISCSI_BOOT_TARGET_PORTAL:
+		if (p_drv_buf->boot_taget_portal_set) {
+			p_buf->p_val = &p_drv_buf->boot_taget_portal;
+			return sizeof(p_drv_buf->boot_taget_portal);
+		}
+		break;
+	case DRV_TLV_MAX_FRAME_SIZE:
+		if (p_drv_buf->frame_size_set) {
+			p_buf->p_val = &p_drv_buf->frame_size;
+			return sizeof(p_drv_buf->frame_size);
+		}
+		break;
+	case DRV_TLV_PDU_TX_DESCRIPTORS_QUEUE_SIZE:
+		if (p_drv_buf->tx_desc_size_set) {
+			p_buf->p_val = &p_drv_buf->tx_desc_size;
+			return sizeof(p_drv_buf->tx_desc_size);
+		}
+		break;
+	case DRV_TLV_PDU_RX_DESCRIPTORS_QUEUE_SIZE:
+		if (p_drv_buf->rx_desc_size_set) {
+			p_buf->p_val = &p_drv_buf->rx_desc_size;
+			return sizeof(p_drv_buf->rx_desc_size);
+		}
+		break;
+	case DRV_TLV_ISCSI_BOOT_PROGRESS:
+		if (p_drv_buf->boot_progress_set) {
+			p_buf->p_val = &p_drv_buf->boot_progress;
+			return sizeof(p_drv_buf->boot_progress);
+		}
+		break;
+	case DRV_TLV_PDU_TX_DESCRIPTOR_QUEUE_AVG_DEPTH:
+		if (p_drv_buf->tx_desc_qdepth_set) {
+			p_buf->p_val = &p_drv_buf->tx_desc_qdepth;
+			return sizeof(p_drv_buf->tx_desc_qdepth);
+		}
+		break;
+	case DRV_TLV_PDU_RX_DESCRIPTORS_QUEUE_AVG_DEPTH:
+		if (p_drv_buf->rx_desc_qdepth_set) {
+			p_buf->p_val = &p_drv_buf->rx_desc_qdepth;
+			return sizeof(p_drv_buf->rx_desc_qdepth);
+		}
+		break;
+	case DRV_TLV_ISCSI_PDU_RX_FRAMES_RECEIVED:
+		if (p_drv_buf->rx_frames_set) {
+			p_buf->p_val = &p_drv_buf->rx_frames;
+			return sizeof(p_drv_buf->rx_frames);
+		}
+		break;
+	case DRV_TLV_ISCSI_PDU_RX_BYTES_RECEIVED:
+		if (p_drv_buf->rx_bytes_set) {
+			p_buf->p_val = &p_drv_buf->rx_bytes;
+			return sizeof(p_drv_buf->rx_bytes);
+		}
+		break;
+	case DRV_TLV_ISCSI_PDU_TX_FRAMES_SENT:
+		if (p_drv_buf->tx_frames_set) {
+			p_buf->p_val = &p_drv_buf->tx_frames;
+			return sizeof(p_drv_buf->tx_frames);
+		}
+		break;
+	case DRV_TLV_ISCSI_PDU_TX_BYTES_SENT:
+		if (p_drv_buf->tx_bytes_set) {
+			p_buf->p_val = &p_drv_buf->tx_bytes;
+			return sizeof(p_drv_buf->tx_bytes);
+		}
+		break;
+	default:
+		break;
+	}
+
+	return -1;
+}
+
 static int qed_mfw_update_tlvs(struct qed_hwfn *p_hwfn,
 			       u8 tlv_group, u8 *p_mfw_buf, u32 size)
 {
@@ -1097,6 +1217,10 @@ static int qed_mfw_update_tlvs(struct qed_hwfn *p_hwfn,
 			len = qed_mfw_get_fcoe_tlv_value(&tlv,
 							 &p_tlv_data->fcoe,
 							 &buffer);
+		else
+			len = qed_mfw_get_iscsi_tlv_value(&tlv,
+							  &p_tlv_data->iscsi,
+							  &buffer);
 
 		if (len > 0) {
 			WARN(len > 4 * tlv.tlv_length,
@@ -1179,6 +1303,13 @@ int qed_mfw_process_tlv_req(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 		tlv_group &= ~QED_MFW_TLV_FCOE;
 	}
 
+	if ((tlv_group & QED_MFW_TLV_ISCSI) &&
+	    p_hwfn->hw_info.personality != QED_PCI_ISCSI) {
+		DP_VERBOSE(p_hwfn, QED_MSG_SP,
+			   "Skipping iSCSI TLVs for non-iSCSI function\n");
+		tlv_group &= ~QED_MFW_TLV_ISCSI;
+	}
+
 	/* Update the TLV values in the local buffer */
 	for (id = QED_MFW_TLV_GENERIC; id < QED_MFW_TLV_MAX; id <<= 1) {
 		if (tlv_group & id)
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 74c2b9ad8afa..92b5352287df 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -412,6 +412,42 @@ struct qed_mfw_tlv_fcoe {
 	struct qed_mfw_tlv_time scsi_chk_tstamp[5];
 };
 
+struct qed_mfw_tlv_iscsi {
+	u8 target_llmnr;
+	bool target_llmnr_set;
+	u8 header_digest;
+	bool header_digest_set;
+	u8 data_digest;
+	bool data_digest_set;
+	u8 auth_method;
+#define QED_MFW_TLV_AUTH_METHOD_NONE            (1)
+#define QED_MFW_TLV_AUTH_METHOD_CHAP            (2)
+#define QED_MFW_TLV_AUTH_METHOD_MUTUAL_CHAP     (3)
+	bool auth_method_set;
+	u16 boot_taget_portal;
+	bool boot_taget_portal_set;
+	u16 frame_size;
+	bool frame_size_set;
+	u16 tx_desc_size;
+	bool tx_desc_size_set;
+	u16 rx_desc_size;
+	bool rx_desc_size_set;
+	u8 boot_progress;
+	bool boot_progress_set;
+	u16 tx_desc_qdepth;
+	bool tx_desc_qdepth_set;
+	u16 rx_desc_qdepth;
+	bool rx_desc_qdepth_set;
+	u64 rx_frames;
+	bool rx_frames_set;
+	u64 rx_bytes;
+	bool rx_bytes_set;
+	u64 tx_frames;
+	bool tx_frames_set;
+	u64 tx_bytes;
+	bool tx_bytes_set;
+};
+
 #define DIRECT_REG_WR(reg_addr, val) writel((u32)val, \
 					    (void __iomem *)(reg_addr))
 
-- 
cgit v1.2.3-59-g8ed1b


From 59ccf86fe69a6a77afebe706913d6b551d84d5bc Mon Sep 17 00:00:00 2001
From: Sudarsana Reddy Kalluru <sudarsana.kalluru@cavium.com>
Date: Tue, 22 May 2018 00:28:41 -0700
Subject: qed: Add driver infrastucture for handling mfw requests.

MFW requests the TLVs in interrupt context. Extracting of the required
data from upper layers and populating of the TLVs require process context.
The patch adds work-queues for processing the tlv requests. It also adds
the implementation for requesting the tlv values from appropriate protocol
driver.

Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed.h      |   8 ++
 drivers/net/ethernet/qlogic/qed/qed_main.c | 151 ++++++++++++++++++++++++++++-
 drivers/net/ethernet/qlogic/qed/qed_mcp.c  |   2 +
 include/linux/qed/qed_if.h                 |  10 ++
 4 files changed, 170 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index dfdbe529e2bf..00db3401b898 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -515,6 +515,10 @@ struct qed_simd_fp_handler {
 	void	(*func)(void *);
 };
 
+enum qed_slowpath_wq_flag {
+	QED_SLOWPATH_MFW_TLV_REQ,
+};
+
 struct qed_hwfn {
 	struct qed_dev			*cdev;
 	u8				my_id;          /* ID inside the PF */
@@ -644,6 +648,9 @@ struct qed_hwfn {
 #endif
 
 	struct z_stream_s		*stream;
+	struct workqueue_struct *slowpath_wq;
+	struct delayed_work slowpath_task;
+	unsigned long slowpath_task_flags;
 };
 
 struct pci_params {
@@ -908,6 +915,7 @@ void qed_get_protocol_stats(struct qed_dev *cdev,
 			    union qed_mcp_protocol_stats *stats);
 int qed_slowpath_irq_req(struct qed_hwfn *hwfn);
 void qed_slowpath_irq_sync(struct qed_hwfn *p_hwfn);
+int qed_mfw_tlv_req(struct qed_hwfn *hwfn);
 
 int qed_mfw_fill_tlv_data(struct qed_hwfn *hwfn,
 			  enum qed_mfw_tlv_type type,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index cbf0ea97fc8e..68c4399ffd50 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -946,6 +946,68 @@ static void qed_update_pf_params(struct qed_dev *cdev,
 	}
 }
 
+static void qed_slowpath_wq_stop(struct qed_dev *cdev)
+{
+	int i;
+
+	if (IS_VF(cdev))
+		return;
+
+	for_each_hwfn(cdev, i) {
+		if (!cdev->hwfns[i].slowpath_wq)
+			continue;
+
+		flush_workqueue(cdev->hwfns[i].slowpath_wq);
+		destroy_workqueue(cdev->hwfns[i].slowpath_wq);
+	}
+}
+
+static void qed_slowpath_task(struct work_struct *work)
+{
+	struct qed_hwfn *hwfn = container_of(work, struct qed_hwfn,
+					     slowpath_task.work);
+	struct qed_ptt *ptt = qed_ptt_acquire(hwfn);
+
+	if (!ptt) {
+		queue_delayed_work(hwfn->slowpath_wq, &hwfn->slowpath_task, 0);
+		return;
+	}
+
+	if (test_and_clear_bit(QED_SLOWPATH_MFW_TLV_REQ,
+			       &hwfn->slowpath_task_flags))
+		qed_mfw_process_tlv_req(hwfn, ptt);
+
+	qed_ptt_release(hwfn, ptt);
+}
+
+static int qed_slowpath_wq_start(struct qed_dev *cdev)
+{
+	struct qed_hwfn *hwfn;
+	char name[NAME_SIZE];
+	int i;
+
+	if (IS_VF(cdev))
+		return 0;
+
+	for_each_hwfn(cdev, i) {
+		hwfn = &cdev->hwfns[i];
+
+		snprintf(name, NAME_SIZE, "slowpath-%02x:%02x.%02x",
+			 cdev->pdev->bus->number,
+			 PCI_SLOT(cdev->pdev->devfn), hwfn->abs_pf_id);
+
+		hwfn->slowpath_wq = alloc_workqueue(name, 0, 0);
+		if (!hwfn->slowpath_wq) {
+			DP_NOTICE(hwfn, "Cannot create slowpath workqueue\n");
+			return -ENOMEM;
+		}
+
+		INIT_DELAYED_WORK(&hwfn->slowpath_task, qed_slowpath_task);
+	}
+
+	return 0;
+}
+
 static int qed_slowpath_start(struct qed_dev *cdev,
 			      struct qed_slowpath_params *params)
 {
@@ -961,6 +1023,9 @@ static int qed_slowpath_start(struct qed_dev *cdev,
 	if (qed_iov_wq_start(cdev))
 		goto err;
 
+	if (qed_slowpath_wq_start(cdev))
+		goto err;
+
 	if (IS_PF(cdev)) {
 		rc = request_firmware(&cdev->firmware, QED_FW_FILE_NAME,
 				      &cdev->pdev->dev);
@@ -1095,6 +1160,8 @@ err:
 
 	qed_iov_wq_stop(cdev, false);
 
+	qed_slowpath_wq_stop(cdev);
+
 	return rc;
 }
 
@@ -1103,6 +1170,8 @@ static int qed_slowpath_stop(struct qed_dev *cdev)
 	if (!cdev)
 		return -ENODEV;
 
+	qed_slowpath_wq_stop(cdev);
+
 	qed_ll2_dealloc_if(cdev);
 
 	if (IS_PF(cdev)) {
@@ -2089,8 +2158,88 @@ void qed_get_protocol_stats(struct qed_dev *cdev,
 	}
 }
 
+int qed_mfw_tlv_req(struct qed_hwfn *hwfn)
+{
+	DP_VERBOSE(hwfn->cdev, NETIF_MSG_DRV,
+		   "Scheduling slowpath task [Flag: %d]\n",
+		   QED_SLOWPATH_MFW_TLV_REQ);
+	smp_mb__before_atomic();
+	set_bit(QED_SLOWPATH_MFW_TLV_REQ, &hwfn->slowpath_task_flags);
+	smp_mb__after_atomic();
+	queue_delayed_work(hwfn->slowpath_wq, &hwfn->slowpath_task, 0);
+
+	return 0;
+}
+
+static void
+qed_fill_generic_tlv_data(struct qed_dev *cdev, struct qed_mfw_tlv_generic *tlv)
+{
+	struct qed_common_cb_ops *op = cdev->protocol_ops.common;
+	struct qed_eth_stats_common *p_common;
+	struct qed_generic_tlvs gen_tlvs;
+	struct qed_eth_stats stats;
+	int i;
+
+	memset(&gen_tlvs, 0, sizeof(gen_tlvs));
+	op->get_generic_tlv_data(cdev->ops_cookie, &gen_tlvs);
+
+	if (gen_tlvs.feat_flags & QED_TLV_IP_CSUM)
+		tlv->flags.ipv4_csum_offload = true;
+	if (gen_tlvs.feat_flags & QED_TLV_LSO)
+		tlv->flags.lso_supported = true;
+	tlv->flags.b_set = true;
+
+	for (i = 0; i < QED_TLV_MAC_COUNT; i++) {
+		if (is_valid_ether_addr(gen_tlvs.mac[i])) {
+			ether_addr_copy(tlv->mac[i], gen_tlvs.mac[i]);
+			tlv->mac_set[i] = true;
+		}
+	}
+
+	qed_get_vport_stats(cdev, &stats);
+	p_common = &stats.common;
+	tlv->rx_frames = p_common->rx_ucast_pkts + p_common->rx_mcast_pkts +
+			 p_common->rx_bcast_pkts;
+	tlv->rx_frames_set = true;
+	tlv->rx_bytes = p_common->rx_ucast_bytes + p_common->rx_mcast_bytes +
+			p_common->rx_bcast_bytes;
+	tlv->rx_bytes_set = true;
+	tlv->tx_frames = p_common->tx_ucast_pkts + p_common->tx_mcast_pkts +
+			 p_common->tx_bcast_pkts;
+	tlv->tx_frames_set = true;
+	tlv->tx_bytes = p_common->tx_ucast_bytes + p_common->tx_mcast_bytes +
+			p_common->tx_bcast_bytes;
+	tlv->rx_bytes_set = true;
+}
+
 int qed_mfw_fill_tlv_data(struct qed_hwfn *hwfn, enum qed_mfw_tlv_type type,
 			  union qed_mfw_tlv_data *tlv_buf)
 {
-	return -EINVAL;
+	struct qed_dev *cdev = hwfn->cdev;
+	struct qed_common_cb_ops *ops;
+
+	ops = cdev->protocol_ops.common;
+	if (!ops || !ops->get_protocol_tlv_data || !ops->get_generic_tlv_data) {
+		DP_NOTICE(hwfn, "Can't collect TLV management info\n");
+		return -EINVAL;
+	}
+
+	switch (type) {
+	case QED_MFW_TLV_GENERIC:
+		qed_fill_generic_tlv_data(hwfn->cdev, &tlv_buf->generic);
+		break;
+	case QED_MFW_TLV_ETH:
+		ops->get_protocol_tlv_data(cdev->ops_cookie, &tlv_buf->eth);
+		break;
+	case QED_MFW_TLV_FCOE:
+		ops->get_protocol_tlv_data(cdev->ops_cookie, &tlv_buf->fcoe);
+		break;
+	case QED_MFW_TLV_ISCSI:
+		ops->get_protocol_tlv_data(cdev->ops_cookie, &tlv_buf->iscsi);
+		break;
+	default:
+		break;
+	}
+
+	return 0;
 }
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index e80f5e7c7992..2612e3e458d9 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -1622,6 +1622,8 @@ int qed_mcp_handle_events(struct qed_hwfn *p_hwfn,
 		case MFW_DRV_MSG_S_TAG_UPDATE:
 			qed_mcp_update_stag(p_hwfn, p_ptt);
 			break;
+		case MFW_DRV_MSG_GET_TLV_REQ:
+			qed_mfw_tlv_req(p_hwfn);
 			break;
 		default:
 			DP_INFO(p_hwfn, "Unimplemented MFW message %d\n", i);
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 92b5352287df..44af652e7407 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -751,6 +751,14 @@ struct qed_int_info {
 	u8			used_cnt;
 };
 
+struct qed_generic_tlvs {
+#define QED_TLV_IP_CSUM         BIT(0)
+#define QED_TLV_LSO             BIT(1)
+	u16 feat_flags;
+#define QED_TLV_MAC_COUNT	3
+	u8 mac[QED_TLV_MAC_COUNT][ETH_ALEN];
+};
+
 #define QED_NVM_SIGNATURE 0x12435687
 
 enum qed_nvm_flash_cmd {
@@ -765,6 +773,8 @@ struct qed_common_cb_ops {
 	void	(*link_update)(void			*dev,
 			       struct qed_link_output	*link);
 	void	(*dcbx_aen)(void *dev, struct qed_dcbx_get *get, u32 mib_type);
+	void (*get_generic_tlv_data)(void *dev, struct qed_generic_tlvs *data);
+	void (*get_protocol_tlv_data)(void *dev, void *data);
 };
 
 struct qed_selftest_ops {
-- 
cgit v1.2.3-59-g8ed1b


From d25b859ccd614d2397569b833491372d129d1982 Mon Sep 17 00:00:00 2001
From: Sudarsana Reddy Kalluru <sudarsana.kalluru@cavium.com>
Date: Tue, 22 May 2018 00:28:42 -0700
Subject: qede: Add support for populating ethernet TLVs.

This patch adds callbacks for providing the ethernet protocol driver TLVs.

Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qede/qede_main.c | 101 +++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)

diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 7abaf2740530..9e70f713c3c5 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -133,6 +133,9 @@ static int qede_probe(struct pci_dev *pdev, const struct pci_device_id *id);
 static void qede_remove(struct pci_dev *pdev);
 static void qede_shutdown(struct pci_dev *pdev);
 static void qede_link_update(void *dev, struct qed_link_output *link);
+static void qede_get_eth_tlv_data(void *edev, void *data);
+static void qede_get_generic_tlv_data(void *edev,
+				      struct qed_generic_tlvs *data);
 
 /* The qede lock is used to protect driver state change and driver flows that
  * are not reentrant.
@@ -228,6 +231,8 @@ static struct qed_eth_cb_ops qede_ll_ops = {
 		.arfs_filter_op = qede_arfs_filter_op,
 #endif
 		.link_update = qede_link_update,
+		.get_generic_tlv_data = qede_get_generic_tlv_data,
+		.get_protocol_tlv_data = qede_get_eth_tlv_data,
 	},
 	.force_mac = qede_force_mac,
 	.ports_update = qede_udp_ports_update,
@@ -2131,3 +2136,99 @@ static void qede_link_update(void *dev, struct qed_link_output *link)
 		}
 	}
 }
+
+static bool qede_is_txq_full(struct qede_dev *edev, struct qede_tx_queue *txq)
+{
+	struct netdev_queue *netdev_txq;
+
+	netdev_txq = netdev_get_tx_queue(edev->ndev, txq->index);
+	if (netif_xmit_stopped(netdev_txq))
+		return true;
+
+	return false;
+}
+
+static void qede_get_generic_tlv_data(void *dev, struct qed_generic_tlvs *data)
+{
+	struct qede_dev *edev = dev;
+	struct netdev_hw_addr *ha;
+	int i;
+
+	if (edev->ndev->features & NETIF_F_IP_CSUM)
+		data->feat_flags |= QED_TLV_IP_CSUM;
+	if (edev->ndev->features & NETIF_F_TSO)
+		data->feat_flags |= QED_TLV_LSO;
+
+	ether_addr_copy(data->mac[0], edev->ndev->dev_addr);
+	memset(data->mac[1], 0, ETH_ALEN);
+	memset(data->mac[2], 0, ETH_ALEN);
+	/* Copy the first two UC macs */
+	netif_addr_lock_bh(edev->ndev);
+	i = 1;
+	netdev_for_each_uc_addr(ha, edev->ndev) {
+		ether_addr_copy(data->mac[i++], ha->addr);
+		if (i == QED_TLV_MAC_COUNT)
+			break;
+	}
+
+	netif_addr_unlock_bh(edev->ndev);
+}
+
+static void qede_get_eth_tlv_data(void *dev, void *data)
+{
+	struct qed_mfw_tlv_eth *etlv = data;
+	struct qede_dev *edev = dev;
+	struct qede_fastpath *fp;
+	int i;
+
+	etlv->lso_maxoff_size = 0XFFFF;
+	etlv->lso_maxoff_size_set = true;
+	etlv->lso_minseg_size = (u16)ETH_TX_LSO_WINDOW_MIN_LEN;
+	etlv->lso_minseg_size_set = true;
+	etlv->prom_mode = !!(edev->ndev->flags & IFF_PROMISC);
+	etlv->prom_mode_set = true;
+	etlv->tx_descr_size = QEDE_TSS_COUNT(edev);
+	etlv->tx_descr_size_set = true;
+	etlv->rx_descr_size = QEDE_RSS_COUNT(edev);
+	etlv->rx_descr_size_set = true;
+	etlv->iov_offload = QED_MFW_TLV_IOV_OFFLOAD_VEB;
+	etlv->iov_offload_set = true;
+
+	/* Fill information regarding queues; Should be done under the qede
+	 * lock to guarantee those don't change beneath our feet.
+	 */
+	etlv->txqs_empty = true;
+	etlv->rxqs_empty = true;
+	etlv->num_txqs_full = 0;
+	etlv->num_rxqs_full = 0;
+
+	__qede_lock(edev);
+	for_each_queue(i) {
+		fp = &edev->fp_array[i];
+		if (fp->type & QEDE_FASTPATH_TX) {
+			if (fp->txq->sw_tx_cons != fp->txq->sw_tx_prod)
+				etlv->txqs_empty = false;
+			if (qede_is_txq_full(edev, fp->txq))
+				etlv->num_txqs_full++;
+		}
+		if (fp->type & QEDE_FASTPATH_RX) {
+			if (qede_has_rx_work(fp->rxq))
+				etlv->rxqs_empty = false;
+
+			/* This one is a bit tricky; Firmware might stop
+			 * placing packets if ring is not yet full.
+			 * Give an approximation.
+			 */
+			if (le16_to_cpu(*fp->rxq->hw_cons_ptr) -
+			    qed_chain_get_cons_idx(&fp->rxq->rx_comp_ring) >
+			    RX_RING_SIZE - 100)
+				etlv->num_rxqs_full++;
+		}
+	}
+	__qede_unlock(edev);
+
+	etlv->txqs_empty_set = true;
+	etlv->rxqs_empty_set = true;
+	etlv->num_txqs_full_set = true;
+	etlv->num_rxqs_full_set = true;
+}
-- 
cgit v1.2.3-59-g8ed1b


From 642a0b37e669465765cad9b64b9798be65df0f09 Mon Sep 17 00:00:00 2001
From: Chad Dupuis <chad.dupuis@cavium.com>
Date: Tue, 22 May 2018 00:28:43 -0700
Subject: qedf: Add support for populating ethernet TLVs.

This patch adds callbacks for providing the ethernet protocol driver TLVs.

Signed-off-by: Chad Dupuis <chad.dupuis@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/scsi/qedf/qedf.h         |  6 +++
 drivers/scsi/qedf/qedf_debugfs.c |  1 -
 drivers/scsi/qedf/qedf_io.c      | 11 +++++
 drivers/scsi/qedf/qedf_main.c    | 87 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 104 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/qedf/qedf.h b/drivers/scsi/qedf/qedf.h
index c105a2e48ac1..0cb0490bac01 100644
--- a/drivers/scsi/qedf/qedf.h
+++ b/drivers/scsi/qedf/qedf.h
@@ -383,11 +383,16 @@ struct qedf_ctx {
 	u32 flogi_failed;
 
 	/* Used for fc statistics */
+	struct mutex stats_mutex;
 	u64 input_requests;
 	u64 output_requests;
 	u64 control_requests;
 	u64 packet_aborts;
 	u64 alloc_failures;
+	u8 lun_resets;
+	u8 target_resets;
+	u8 task_set_fulls;
+	u8 busy;
 };
 
 struct io_bdt {
@@ -496,6 +501,7 @@ extern int qedf_post_io_req(struct qedf_rport *fcport,
 extern void qedf_process_seq_cleanup_compl(struct qedf_ctx *qedf,
 	struct fcoe_cqe *cqe, struct qedf_ioreq *io_req);
 extern int qedf_send_flogi(struct qedf_ctx *qedf);
+extern void qedf_get_protocol_tlv_data(void *dev, void *data);
 extern void qedf_fp_io_handler(struct work_struct *work);
 
 #define FCOE_WORD_TO_BYTE  4
diff --git a/drivers/scsi/qedf/qedf_debugfs.c b/drivers/scsi/qedf/qedf_debugfs.c
index c539a7ae3a7e..5789ce185923 100644
--- a/drivers/scsi/qedf/qedf_debugfs.c
+++ b/drivers/scsi/qedf/qedf_debugfs.c
@@ -439,7 +439,6 @@ qedf_dbg_offload_stats_open(struct inode *inode, struct file *file)
 	return single_open(file, qedf_offload_stats_show, qedf);
 }
 
-
 const struct file_operations qedf_dbg_fops[] = {
 	qedf_dbg_fileops(qedf, fp_int),
 	qedf_dbg_fileops_seq(qedf, io_trace),
diff --git a/drivers/scsi/qedf/qedf_io.c b/drivers/scsi/qedf/qedf_io.c
index 50a50c4249d0..3fe579d0f1a8 100644
--- a/drivers/scsi/qedf/qedf_io.c
+++ b/drivers/scsi/qedf/qedf_io.c
@@ -1200,6 +1200,12 @@ void qedf_scsi_completion(struct qedf_ctx *qedf, struct fcoe_cqe *cqe,
 					fcport->retry_delay_timestamp =
 					    jiffies + (qualifier * HZ / 10);
 				}
+				/* Record stats */
+				if (io_req->cdb_status ==
+				    SAM_STAT_TASK_SET_FULL)
+					qedf->task_set_fulls++;
+				else
+					qedf->busy++;
 			}
 		}
 		if (io_req->fcp_resid)
@@ -1866,6 +1872,11 @@ static int qedf_execute_tmf(struct qedf_rport *fcport, struct scsi_cmnd *sc_cmd,
 		goto reset_tmf_err;
 	}
 
+	if (tm_flags == FCP_TMF_LUN_RESET)
+		qedf->lun_resets++;
+	else if (tm_flags == FCP_TMF_TGT_RESET)
+		qedf->target_resets++;
+
 	/* Initialize rest of io_req fields */
 	io_req->sc_cmd = sc_cmd;
 	io_req->fcport = fcport;
diff --git a/drivers/scsi/qedf/qedf_main.c b/drivers/scsi/qedf/qedf_main.c
index 6c19015975a8..730e480df256 100644
--- a/drivers/scsi/qedf/qedf_main.c
+++ b/drivers/scsi/qedf/qedf_main.c
@@ -566,6 +566,7 @@ static struct qed_fcoe_cb_ops qedf_cb_ops = {
 	{
 		.link_update = qedf_link_update,
 		.dcbx_aen = qedf_dcbx_handler,
+		.get_protocol_tlv_data = qedf_get_protocol_tlv_data,
 	}
 };
 
@@ -1746,6 +1747,8 @@ static struct fc_host_statistics *qedf_fc_get_host_stats(struct Scsi_Host
 		goto out;
 	}
 
+	mutex_lock(&qedf->stats_mutex);
+
 	/* Query firmware for offload stats */
 	qed_ops->get_stats(qedf->cdev, fw_fcoe_stats);
 
@@ -1779,6 +1782,7 @@ static struct fc_host_statistics *qedf_fc_get_host_stats(struct Scsi_Host
 	qedf_stats->fcp_packet_aborts += qedf->packet_aborts;
 	qedf_stats->fcp_frame_alloc_failures += qedf->alloc_failures;
 
+	mutex_unlock(&qedf->stats_mutex);
 	kfree(fw_fcoe_stats);
 out:
 	return qedf_stats;
@@ -2948,6 +2952,7 @@ static int __qedf_probe(struct pci_dev *pdev, int mode)
 		qedf->stop_io_on_error = false;
 		pci_set_drvdata(pdev, qedf);
 		init_completion(&qedf->fipvlan_compl);
+		mutex_init(&qedf->stats_mutex);
 
 		QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_INFO,
 		   "QLogic FastLinQ FCoE Module qedf %s, "
@@ -3392,6 +3397,88 @@ static void qedf_remove(struct pci_dev *pdev)
 	__qedf_remove(pdev, QEDF_MODE_NORMAL);
 }
 
+/*
+ * Protocol TLV handler
+ */
+void qedf_get_protocol_tlv_data(void *dev, void *data)
+{
+	struct qedf_ctx *qedf = dev;
+	struct qed_mfw_tlv_fcoe *fcoe = data;
+	struct fc_lport *lport = qedf->lport;
+	struct Scsi_Host *host = lport->host;
+	struct fc_host_attrs *fc_host = shost_to_fc_host(host);
+	struct fc_host_statistics *hst;
+
+	/* Force a refresh of the fc_host stats including offload stats */
+	hst = qedf_fc_get_host_stats(host);
+
+	fcoe->qos_pri_set = true;
+	fcoe->qos_pri = 3; /* Hard coded to 3 in driver */
+
+	fcoe->ra_tov_set = true;
+	fcoe->ra_tov = lport->r_a_tov;
+
+	fcoe->ed_tov_set = true;
+	fcoe->ed_tov = lport->e_d_tov;
+
+	fcoe->npiv_state_set = true;
+	fcoe->npiv_state = 1; /* NPIV always enabled */
+
+	fcoe->num_npiv_ids_set = true;
+	fcoe->num_npiv_ids = fc_host->npiv_vports_inuse;
+
+	/* Certain attributes we only want to set if we've selected an FCF */
+	if (qedf->ctlr.sel_fcf) {
+		fcoe->switch_name_set = true;
+		u64_to_wwn(qedf->ctlr.sel_fcf->switch_name, fcoe->switch_name);
+	}
+
+	fcoe->port_state_set = true;
+	/* For qedf we're either link down or fabric attach */
+	if (lport->link_up)
+		fcoe->port_state = QED_MFW_TLV_PORT_STATE_FABRIC;
+	else
+		fcoe->port_state = QED_MFW_TLV_PORT_STATE_OFFLINE;
+
+	fcoe->link_failures_set = true;
+	fcoe->link_failures = (u16)hst->link_failure_count;
+
+	fcoe->fcoe_txq_depth_set = true;
+	fcoe->fcoe_rxq_depth_set = true;
+	fcoe->fcoe_rxq_depth = FCOE_PARAMS_NUM_TASKS;
+	fcoe->fcoe_txq_depth = FCOE_PARAMS_NUM_TASKS;
+
+	fcoe->fcoe_rx_frames_set = true;
+	fcoe->fcoe_rx_frames = hst->rx_frames;
+
+	fcoe->fcoe_tx_frames_set = true;
+	fcoe->fcoe_tx_frames = hst->tx_frames;
+
+	fcoe->fcoe_rx_bytes_set = true;
+	fcoe->fcoe_rx_bytes = hst->fcp_input_megabytes * 1000000;
+
+	fcoe->fcoe_tx_bytes_set = true;
+	fcoe->fcoe_tx_bytes = hst->fcp_output_megabytes * 1000000;
+
+	fcoe->crc_count_set = true;
+	fcoe->crc_count = hst->invalid_crc_count;
+
+	fcoe->tx_abts_set = true;
+	fcoe->tx_abts = hst->fcp_packet_aborts;
+
+	fcoe->tx_lun_rst_set = true;
+	fcoe->tx_lun_rst = qedf->lun_resets;
+
+	fcoe->abort_task_sets_set = true;
+	fcoe->abort_task_sets = qedf->packet_aborts;
+
+	fcoe->scsi_busy_set = true;
+	fcoe->scsi_busy = qedf->busy;
+
+	fcoe->scsi_tsk_full_set = true;
+	fcoe->scsi_tsk_full = qedf->task_set_fulls;
+}
+
 /*
  * Module Init/Remove
  */
-- 
cgit v1.2.3-59-g8ed1b


From 8673daf4f55bf3b918dec78aee2132d191225106 Mon Sep 17 00:00:00 2001
From: Chad Dupuis <chad.dupuis@cavium.com>
Date: Tue, 22 May 2018 00:28:44 -0700
Subject: qedf: Add get_generic_tlv_data handler.

Signed-off-by: Chad Dupuis <chad.dupuis@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/scsi/qedf/qedf.h      |  1 +
 drivers/scsi/qedf/qedf_main.c | 17 +++++++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/drivers/scsi/qedf/qedf.h b/drivers/scsi/qedf/qedf.h
index 0cb0490bac01..cabb6af60fb8 100644
--- a/drivers/scsi/qedf/qedf.h
+++ b/drivers/scsi/qedf/qedf.h
@@ -503,6 +503,7 @@ extern void qedf_process_seq_cleanup_compl(struct qedf_ctx *qedf,
 extern int qedf_send_flogi(struct qedf_ctx *qedf);
 extern void qedf_get_protocol_tlv_data(void *dev, void *data);
 extern void qedf_fp_io_handler(struct work_struct *work);
+extern void qedf_get_generic_tlv_data(void *dev, struct qed_generic_tlvs *data);
 
 #define FCOE_WORD_TO_BYTE  4
 #define QEDF_MAX_TASK_NUM	0xFFFF
diff --git a/drivers/scsi/qedf/qedf_main.c b/drivers/scsi/qedf/qedf_main.c
index 730e480df256..d3f73d8d7738 100644
--- a/drivers/scsi/qedf/qedf_main.c
+++ b/drivers/scsi/qedf/qedf_main.c
@@ -566,6 +566,7 @@ static struct qed_fcoe_cb_ops qedf_cb_ops = {
 	{
 		.link_update = qedf_link_update,
 		.dcbx_aen = qedf_dcbx_handler,
+		.get_generic_tlv_data = qedf_get_generic_tlv_data,
 		.get_protocol_tlv_data = qedf_get_protocol_tlv_data,
 	}
 };
@@ -3479,6 +3480,22 @@ void qedf_get_protocol_tlv_data(void *dev, void *data)
 	fcoe->scsi_tsk_full = qedf->task_set_fulls;
 }
 
+/* Generic TLV data callback */
+void qedf_get_generic_tlv_data(void *dev, struct qed_generic_tlvs *data)
+{
+	struct qedf_ctx *qedf;
+
+	if (!dev) {
+		QEDF_INFO(NULL, QEDF_LOG_EVT,
+			  "dev is NULL so ignoring get_generic_tlv_data request.\n");
+		return;
+	}
+	qedf = (struct qedf_ctx *)dev;
+
+	memset(data, 0, sizeof(struct qed_generic_tlvs));
+	ether_addr_copy(data->mac[0], qedf->mac);
+}
+
 /*
  * Module Init/Remove
  */
-- 
cgit v1.2.3-59-g8ed1b


From 534bbdf8832ae48491cb8cfa42eaba0c21421ea9 Mon Sep 17 00:00:00 2001
From: Manish Rangankar <manish.rangankar@cavium.com>
Date: Tue, 22 May 2018 00:28:45 -0700
Subject: qedi: Add support for populating ethernet TLVs.

This patch adds callbacks for providing the ethernet protocol driver TLVs.

Signed-off-by: Manish Rangankar <manish.rangankar@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/scsi/qedi/qedi.h       |   3 +
 drivers/scsi/qedi/qedi_iscsi.h |   6 ++
 drivers/scsi/qedi/qedi_main.c  | 182 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 191 insertions(+)

diff --git a/drivers/scsi/qedi/qedi.h b/drivers/scsi/qedi/qedi.h
index b8b22ce60ecc..fc3babc15fa3 100644
--- a/drivers/scsi/qedi/qedi.h
+++ b/drivers/scsi/qedi/qedi.h
@@ -353,6 +353,9 @@ struct qedi_ctx {
 #define IPV6_LEN	41
 #define IPV4_LEN	17
 	struct iscsi_boot_kset *boot_kset;
+
+	/* Used for iscsi statistics */
+	struct mutex stats_lock;
 };
 
 struct qedi_work {
diff --git a/drivers/scsi/qedi/qedi_iscsi.h b/drivers/scsi/qedi/qedi_iscsi.h
index ea1315189922..11260776212f 100644
--- a/drivers/scsi/qedi/qedi_iscsi.h
+++ b/drivers/scsi/qedi/qedi_iscsi.h
@@ -223,6 +223,12 @@ struct qedi_work_map {
 	struct work_struct *ptr_tmf_work;
 };
 
+struct qedi_boot_target {
+	char ip_addr[64];
+	char iscsi_name[255];
+	u32 ipv6_en;
+};
+
 #define qedi_set_itt(task_id, itt) ((u32)(((task_id) & 0xffff) | ((itt) << 16)))
 #define qedi_get_itt(cqe) (cqe.iscsi_hdr.cmd.itt >> 16)
 
diff --git a/drivers/scsi/qedi/qedi_main.c b/drivers/scsi/qedi/qedi_main.c
index 4da3592aec0f..d2045a5e39b8 100644
--- a/drivers/scsi/qedi/qedi_main.c
+++ b/drivers/scsi/qedi/qedi_main.c
@@ -55,6 +55,7 @@ static void qedi_free_global_queues(struct qedi_ctx *qedi);
 static struct qedi_cmd *qedi_get_cmd_from_tid(struct qedi_ctx *qedi, u32 tid);
 static void qedi_reset_uio_rings(struct qedi_uio_dev *udev);
 static void qedi_ll2_free_skbs(struct qedi_ctx *qedi);
+static struct nvm_iscsi_block *qedi_get_nvram_block(struct qedi_ctx *qedi);
 
 static int qedi_iscsi_event_cb(void *context, u8 fw_event_code, void *fw_handle)
 {
@@ -879,6 +880,186 @@ static void qedi_free_iscsi_pf_param(struct qedi_ctx *qedi)
 	kfree(qedi->global_queues);
 }
 
+static void qedi_get_boot_tgt_info(struct nvm_iscsi_block *block,
+				   struct qedi_boot_target *tgt, u8 index)
+{
+	u32 ipv6_en;
+
+	ipv6_en = !!(block->generic.ctrl_flags &
+		     NVM_ISCSI_CFG_GEN_IPV6_ENABLED);
+
+	snprintf(tgt->iscsi_name, NVM_ISCSI_CFG_ISCSI_NAME_MAX_LEN, "%s\n",
+		 block->target[index].target_name.byte);
+
+	tgt->ipv6_en = ipv6_en;
+
+	if (ipv6_en)
+		snprintf(tgt->ip_addr, IPV6_LEN, "%pI6\n",
+			 block->target[index].ipv6_addr.byte);
+	else
+		snprintf(tgt->ip_addr, IPV4_LEN, "%pI4\n",
+			 block->target[index].ipv4_addr.byte);
+}
+
+static int qedi_find_boot_info(struct qedi_ctx *qedi,
+			       struct qed_mfw_tlv_iscsi *iscsi,
+			       struct nvm_iscsi_block *block)
+{
+	struct qedi_boot_target *pri_tgt = NULL, *sec_tgt = NULL;
+	u32 pri_ctrl_flags = 0, sec_ctrl_flags = 0, found = 0;
+	struct iscsi_cls_session *cls_sess;
+	struct iscsi_cls_conn *cls_conn;
+	struct qedi_conn *qedi_conn;
+	struct iscsi_session *sess;
+	struct iscsi_conn *conn;
+	char ep_ip_addr[64];
+	int i, ret = 0;
+
+	pri_ctrl_flags = !!(block->target[0].ctrl_flags &
+					NVM_ISCSI_CFG_TARGET_ENABLED);
+	if (pri_ctrl_flags) {
+		pri_tgt = kzalloc(sizeof(*pri_tgt), GFP_KERNEL);
+		if (!pri_tgt)
+			return -1;
+		qedi_get_boot_tgt_info(block, pri_tgt, 0);
+	}
+
+	sec_ctrl_flags = !!(block->target[1].ctrl_flags &
+					NVM_ISCSI_CFG_TARGET_ENABLED);
+	if (sec_ctrl_flags) {
+		sec_tgt = kzalloc(sizeof(*sec_tgt), GFP_KERNEL);
+		if (!sec_tgt) {
+			ret = -1;
+			goto free_tgt;
+		}
+		qedi_get_boot_tgt_info(block, sec_tgt, 1);
+	}
+
+	for (i = 0; i < qedi->max_active_conns; i++) {
+		qedi_conn = qedi_get_conn_from_id(qedi, i);
+		if (!qedi_conn)
+			continue;
+
+		if (qedi_conn->ep->ip_type == TCP_IPV4)
+			snprintf(ep_ip_addr, IPV4_LEN, "%pI4\n",
+				 qedi_conn->ep->dst_addr);
+		else
+			snprintf(ep_ip_addr, IPV6_LEN, "%pI6\n",
+				 qedi_conn->ep->dst_addr);
+
+		cls_conn = qedi_conn->cls_conn;
+		conn = cls_conn->dd_data;
+		cls_sess = iscsi_conn_to_session(cls_conn);
+		sess = cls_sess->dd_data;
+
+		if (pri_ctrl_flags) {
+			if (!strcmp(pri_tgt->iscsi_name, sess->targetname) &&
+			    !strcmp(pri_tgt->ip_addr, ep_ip_addr)) {
+				found = 1;
+				break;
+			}
+		}
+
+		if (sec_ctrl_flags) {
+			if (!strcmp(sec_tgt->iscsi_name, sess->targetname) &&
+			    !strcmp(sec_tgt->ip_addr, ep_ip_addr)) {
+				found = 1;
+				break;
+			}
+		}
+	}
+
+	if (found) {
+		if (conn->hdrdgst_en) {
+			iscsi->header_digest_set = true;
+			iscsi->header_digest = 1;
+		}
+
+		if (conn->datadgst_en) {
+			iscsi->data_digest_set = true;
+			iscsi->data_digest = 1;
+		}
+		iscsi->boot_taget_portal_set = true;
+		iscsi->boot_taget_portal = sess->tpgt;
+
+	} else {
+		ret = -1;
+	}
+
+	if (sec_ctrl_flags)
+		kfree(sec_tgt);
+free_tgt:
+	if (pri_ctrl_flags)
+		kfree(pri_tgt);
+
+	return ret;
+}
+
+/*
+ * Protocol TLV handler
+ */
+static void qedi_get_protocol_tlv_data(void *dev, void *data)
+{
+	struct qed_mfw_tlv_iscsi *iscsi = data;
+	struct qed_iscsi_stats *fw_iscsi_stats;
+	struct nvm_iscsi_block *block = NULL;
+	u32 chap_en = 0, mchap_en = 0;
+	struct qedi_ctx *qedi = dev;
+	int rval = 0;
+
+	fw_iscsi_stats = kmalloc(sizeof(*fw_iscsi_stats), GFP_KERNEL);
+	if (!fw_iscsi_stats) {
+		QEDI_ERR(&qedi->dbg_ctx,
+			 "Could not allocate memory for fw_iscsi_stats.\n");
+		goto exit_get_data;
+	}
+
+	mutex_lock(&qedi->stats_lock);
+	/* Query firmware for offload stats */
+	qedi_ops->get_stats(qedi->cdev, fw_iscsi_stats);
+	mutex_unlock(&qedi->stats_lock);
+
+	iscsi->rx_frames_set = true;
+	iscsi->rx_frames = fw_iscsi_stats->iscsi_rx_packet_cnt;
+	iscsi->rx_bytes_set = true;
+	iscsi->rx_bytes = fw_iscsi_stats->iscsi_rx_bytes_cnt;
+	iscsi->tx_frames_set = true;
+	iscsi->tx_frames = fw_iscsi_stats->iscsi_tx_packet_cnt;
+	iscsi->tx_bytes_set = true;
+	iscsi->tx_bytes = fw_iscsi_stats->iscsi_tx_bytes_cnt;
+	iscsi->frame_size_set = true;
+	iscsi->frame_size = qedi->ll2_mtu;
+	block = qedi_get_nvram_block(qedi);
+	if (block) {
+		chap_en = !!(block->generic.ctrl_flags &
+			     NVM_ISCSI_CFG_GEN_CHAP_ENABLED);
+		mchap_en = !!(block->generic.ctrl_flags &
+			      NVM_ISCSI_CFG_GEN_CHAP_MUTUAL_ENABLED);
+
+		iscsi->auth_method_set = (chap_en || mchap_en) ? true : false;
+		iscsi->auth_method = 1;
+		if (chap_en)
+			iscsi->auth_method = 2;
+		if (mchap_en)
+			iscsi->auth_method = 3;
+
+		iscsi->tx_desc_size_set = true;
+		iscsi->tx_desc_size = QEDI_SQ_SIZE;
+		iscsi->rx_desc_size_set = true;
+		iscsi->rx_desc_size = QEDI_CQ_SIZE;
+
+		/* tpgt, hdr digest, data digest */
+		rval = qedi_find_boot_info(qedi, iscsi, block);
+		if (rval)
+			QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_INFO,
+				  "Boot target not set");
+	}
+
+	kfree(fw_iscsi_stats);
+exit_get_data:
+	return;
+}
+
 static void qedi_link_update(void *dev, struct qed_link_output *link)
 {
 	struct qedi_ctx *qedi = (struct qedi_ctx *)dev;
@@ -896,6 +1077,7 @@ static void qedi_link_update(void *dev, struct qed_link_output *link)
 static struct qed_iscsi_cb_ops qedi_cb_ops = {
 	{
 		.link_update =		qedi_link_update,
+		.get_protocol_tlv_data = qedi_get_protocol_tlv_data,
 	}
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 269afb36030b04f7d8b3a0e9e4141112f86e08fc Mon Sep 17 00:00:00 2001
From: Manish Rangankar <manish.rangankar@cavium.com>
Date: Tue, 22 May 2018 00:28:46 -0700
Subject: qedi: Add get_generic_tlv_data handler.

Signed-off-by: Manish Rangankar <manish.rangankar@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/scsi/qedi/qedi_main.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/drivers/scsi/qedi/qedi_main.c b/drivers/scsi/qedi/qedi_main.c
index d2045a5e39b8..32ee7f62fef9 100644
--- a/drivers/scsi/qedi/qedi_main.c
+++ b/drivers/scsi/qedi/qedi_main.c
@@ -995,6 +995,21 @@ free_tgt:
 	return ret;
 }
 
+static void qedi_get_generic_tlv_data(void *dev, struct qed_generic_tlvs *data)
+{
+	struct qedi_ctx *qedi;
+
+	if (!dev) {
+		QEDI_INFO(NULL, QEDI_LOG_EVT,
+			  "dev is NULL so ignoring get_generic_tlv_data request.\n");
+		return;
+	}
+	qedi = (struct qedi_ctx *)dev;
+
+	memset(data, 0, sizeof(struct qed_generic_tlvs));
+	ether_addr_copy(data->mac[0], qedi->mac);
+}
+
 /*
  * Protocol TLV handler
  */
@@ -1078,6 +1093,7 @@ static struct qed_iscsi_cb_ops qedi_cb_ops = {
 	{
 		.link_update =		qedi_link_update,
 		.get_protocol_tlv_data = qedi_get_protocol_tlv_data,
+		.get_generic_tlv_data = qedi_get_generic_tlv_data,
 	}
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 1f55236bd8dde69d1860a30c50793fb28d8405ae Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 14 May 2018 23:46:53 +0200
Subject: netfilter: nf_nat: move common nat code to nat core

Copy-pasted, both l3 helpers almost use same code here.
Split out the common part into an 'inet' helper.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_nat_core.h      |  7 ++++
 net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 55 +------------------------
 net/ipv6/netfilter/nf_nat_l3proto_ipv6.c | 48 +---------------------
 net/netfilter/nf_nat_core.c              | 70 ++++++++++++++++++++++++++++++++
 4 files changed, 81 insertions(+), 99 deletions(-)

diff --git a/include/net/netfilter/nf_nat_core.h b/include/net/netfilter/nf_nat_core.h
index 235bd0e9a5aa..0d84dd29108d 100644
--- a/include/net/netfilter/nf_nat_core.h
+++ b/include/net/netfilter/nf_nat_core.h
@@ -11,6 +11,13 @@
 unsigned int nf_nat_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
 			   unsigned int hooknum, struct sk_buff *skb);
 
+unsigned int
+nf_nat_inet_fn(void *priv, struct sk_buff *skb,
+	       const struct nf_hook_state *state,
+	       unsigned int (*do_chain)(void *priv,
+					struct sk_buff *skb,
+					const struct nf_hook_state *state));
+
 int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family);
 
 static inline int nf_nat_initialized(struct nf_conn *ct,
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index 325e02956bf5..29b5aceac66d 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -250,24 +250,12 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
 {
 	struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
-	struct nf_conn_nat *nat;
-	/* maniptype == SRC for postrouting. */
-	enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);
 
 	ct = nf_ct_get(skb, &ctinfo);
-	/* Can't track?  It's not due to stress, or conntrack would
-	 * have dropped it.  Hence it's the user's responsibilty to
-	 * packet filter it out, or implement conntrack/NAT for that
-	 * protocol. 8) --RR
-	 */
 	if (!ct)
 		return NF_ACCEPT;
 
-	nat = nfct_nat(ct);
-
-	switch (ctinfo) {
-	case IP_CT_RELATED:
-	case IP_CT_RELATED_REPLY:
+	if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) {
 		if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
 			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
 							   state->hook))
@@ -275,48 +263,9 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
 			else
 				return NF_ACCEPT;
 		}
-		/* Only ICMPs can be IP_CT_IS_REPLY: */
-		/* fall through */
-	case IP_CT_NEW:
-		/* Seen it before?  This can happen for loopback, retrans,
-		 * or local packets.
-		 */
-		if (!nf_nat_initialized(ct, maniptype)) {
-			unsigned int ret;
-
-			ret = do_chain(priv, skb, state);
-			if (ret != NF_ACCEPT)
-				return ret;
-
-			if (nf_nat_initialized(ct, HOOK2MANIP(state->hook)))
-				break;
-
-			ret = nf_nat_alloc_null_binding(ct, state->hook);
-			if (ret != NF_ACCEPT)
-				return ret;
-		} else {
-			pr_debug("Already setup manip %s for ct %p\n",
-				 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
-				 ct);
-			if (nf_nat_oif_changed(state->hook, ctinfo, nat,
-					       state->out))
-				goto oif_changed;
-		}
-		break;
-
-	default:
-		/* ESTABLISHED */
-		WARN_ON(ctinfo != IP_CT_ESTABLISHED &&
-			ctinfo != IP_CT_ESTABLISHED_REPLY);
-		if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
-			goto oif_changed;
 	}
 
-	return nf_nat_packet(ct, ctinfo, state->hook, skb);
-
-oif_changed:
-	nf_ct_kill_acct(ct, ctinfo, skb);
-	return NF_DROP;
+	return nf_nat_inet_fn(priv, skb, state, do_chain);
 }
 EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn);
 
diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
index f1582b6f9588..3ec228984f82 100644
--- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
@@ -261,8 +261,6 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb,
 {
 	struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
-	struct nf_conn_nat *nat;
-	enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);
 	__be16 frag_off;
 	int hdrlen;
 	u8 nexthdr;
@@ -276,11 +274,7 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb,
 	if (!ct)
 		return NF_ACCEPT;
 
-	nat = nfct_nat(ct);
-
-	switch (ctinfo) {
-	case IP_CT_RELATED:
-	case IP_CT_RELATED_REPLY:
+	if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) {
 		nexthdr = ipv6_hdr(skb)->nexthdr;
 		hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr),
 					  &nexthdr, &frag_off);
@@ -293,47 +287,9 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb,
 			else
 				return NF_ACCEPT;
 		}
-		/* Only ICMPs can be IP_CT_IS_REPLY: */
-		/* fall through */
-	case IP_CT_NEW:
-		/* Seen it before?  This can happen for loopback, retrans,
-		 * or local packets.
-		 */
-		if (!nf_nat_initialized(ct, maniptype)) {
-			unsigned int ret;
-
-			ret = do_chain(priv, skb, state);
-			if (ret != NF_ACCEPT)
-				return ret;
-
-			if (nf_nat_initialized(ct, HOOK2MANIP(state->hook)))
-				break;
-
-			ret = nf_nat_alloc_null_binding(ct, state->hook);
-			if (ret != NF_ACCEPT)
-				return ret;
-		} else {
-			pr_debug("Already setup manip %s for ct %p\n",
-				 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
-				 ct);
-			if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
-				goto oif_changed;
-		}
-		break;
-
-	default:
-		/* ESTABLISHED */
-		WARN_ON(ctinfo != IP_CT_ESTABLISHED &&
-			ctinfo != IP_CT_ESTABLISHED_REPLY);
-		if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
-			goto oif_changed;
 	}
 
-	return nf_nat_packet(ct, ctinfo, state->hook, skb);
-
-oif_changed:
-	nf_ct_kill_acct(ct, ctinfo, skb);
-	return NF_DROP;
+	return nf_nat_inet_fn(priv, skb, state, do_chain);
 }
 EXPORT_SYMBOL_GPL(nf_nat_ipv6_fn);
 
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 37b3c9913b08..0cd503aacbf0 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -513,6 +513,76 @@ unsigned int nf_nat_packet(struct nf_conn *ct,
 }
 EXPORT_SYMBOL_GPL(nf_nat_packet);
 
+unsigned int
+nf_nat_inet_fn(void *priv, struct sk_buff *skb,
+	       const struct nf_hook_state *state,
+	       unsigned int (*do_chain)(void *priv,
+					struct sk_buff *skb,
+					const struct nf_hook_state *state))
+{
+	struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn_nat *nat;
+	/* maniptype == SRC for postrouting. */
+	enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);
+
+	ct = nf_ct_get(skb, &ctinfo);
+	/* Can't track?  It's not due to stress, or conntrack would
+	 * have dropped it.  Hence it's the user's responsibilty to
+	 * packet filter it out, or implement conntrack/NAT for that
+	 * protocol. 8) --RR
+	 */
+	if (!ct)
+		return NF_ACCEPT;
+
+	nat = nfct_nat(ct);
+
+	switch (ctinfo) {
+	case IP_CT_RELATED:
+	case IP_CT_RELATED_REPLY:
+		/* Only ICMPs can be IP_CT_IS_REPLY.  Fallthrough */
+	case IP_CT_NEW:
+		/* Seen it before?  This can happen for loopback, retrans,
+		 * or local packets.
+		 */
+		if (!nf_nat_initialized(ct, maniptype)) {
+			unsigned int ret;
+
+			ret = do_chain(priv, skb, state);
+			if (ret != NF_ACCEPT)
+				return ret;
+
+			if (nf_nat_initialized(ct, HOOK2MANIP(state->hook)))
+				break;
+
+			ret = nf_nat_alloc_null_binding(ct, state->hook);
+			if (ret != NF_ACCEPT)
+				return ret;
+		} else {
+			pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n",
+				 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
+				 ct, ct->status);
+			if (nf_nat_oif_changed(state->hook, ctinfo, nat,
+					       state->out))
+				goto oif_changed;
+		}
+		break;
+	default:
+		/* ESTABLISHED */
+		WARN_ON(ctinfo != IP_CT_ESTABLISHED &&
+			ctinfo != IP_CT_ESTABLISHED_REPLY);
+		if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
+			goto oif_changed;
+	}
+
+	return nf_nat_packet(ct, ctinfo, state->hook, skb);
+
+oif_changed:
+	nf_ct_kill_acct(ct, ctinfo, skb);
+	return NF_DROP;
+}
+EXPORT_SYMBOL_GPL(nf_nat_inet_fn);
+
 struct nf_nat_proto_clean {
 	u8	l3proto;
 	u8	l4proto;
-- 
cgit v1.2.3-59-g8ed1b


From ba7d284a984d97d5dbc44ddbfd2216b58107a5ba Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 14 May 2018 23:46:54 +0200
Subject: netfilter: xtables: allow table definitions not backed by hook_ops

The ip(6)tables nat table is currently receiving skbs from the netfilter
core, after a followup patch skbs will be coming from the netfilter nat
core instead, so the table is no longer backed by normal hook_ops.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ip_tables.c  | 5 ++++-
 net/ipv6/netfilter/ip6_tables.c | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 444f125f3974..ddcc56c37d4d 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1782,6 +1782,8 @@ int ipt_register_table(struct net *net, const struct xt_table *table,
 
 	/* set res now, will see skbs right after nf_register_net_hooks */
 	WRITE_ONCE(*res, new_table);
+	if (!ops)
+		return 0;
 
 	ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
 	if (ret != 0) {
@@ -1799,7 +1801,8 @@ out_free:
 void ipt_unregister_table(struct net *net, struct xt_table *table,
 			  const struct nf_hook_ops *ops)
 {
-	nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+	if (ops)
+		nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
 	__ipt_unregister_table(net, table);
 }
 
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 7097bbf95843..e18b14b2e019 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1792,6 +1792,8 @@ int ip6t_register_table(struct net *net, const struct xt_table *table,
 
 	/* set res now, will see skbs right after nf_register_net_hooks */
 	WRITE_ONCE(*res, new_table);
+	if (!ops)
+		return 0;
 
 	ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
 	if (ret != 0) {
@@ -1809,7 +1811,8 @@ out_free:
 void ip6t_unregister_table(struct net *net, struct xt_table *table,
 			   const struct nf_hook_ops *ops)
 {
-	nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+	if (ops)
+		nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
 	__ip6t_unregister_table(net, table);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 4e25ceb80b585891c5e2a6edfa481bc4709e9544 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 14 May 2018 23:46:55 +0200
Subject: netfilter: nf_tables: allow chain type to override hook register

Will be used in followup patch when nat types no longer
use nf_register_net_hook() but will instead register with the nat core.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h       |  8 ++++----
 net/ipv4/netfilter/nft_chain_nat_ipv4.c | 19 +++++++++++++------
 net/ipv6/netfilter/nft_chain_nat_ipv6.c | 20 ++++++++++++++------
 net/netfilter/nf_tables_api.c           | 23 ++++++++++++++++-------
 4 files changed, 47 insertions(+), 23 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 435c9e3b9181..a94fd0c730d6 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -880,8 +880,8 @@ enum nft_chain_types {
  * 	@owner: module owner
  * 	@hook_mask: mask of valid hooks
  * 	@hooks: array of hook functions
- *	@init: chain initialization function
- *	@free: chain release function
+ *	@ops_register: base chain register function
+ *	@ops_unregister: base chain unregister function
  */
 struct nft_chain_type {
 	const char			*name;
@@ -890,8 +890,8 @@ struct nft_chain_type {
 	struct module			*owner;
 	unsigned int			hook_mask;
 	nf_hookfn			*hooks[NF_MAX_HOOKS];
-	int				(*init)(struct nft_ctx *ctx);
-	void				(*free)(struct nft_ctx *ctx);
+	int				(*ops_register)(struct net *net, const struct nf_hook_ops *ops);
+	void				(*ops_unregister)(struct net *net, const struct nf_hook_ops *ops);
 };
 
 int nft_chain_validate_dependency(const struct nft_chain *chain,
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
index 285baccfbdea..bbcb624b6b81 100644
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -66,14 +66,21 @@ static unsigned int nft_nat_ipv4_local_fn(void *priv,
 	return nf_nat_ipv4_local_fn(priv, skb, state, nft_nat_do_chain);
 }
 
-static int nft_nat_ipv4_init(struct nft_ctx *ctx)
+static int nft_nat_ipv4_reg(struct net *net, const struct nf_hook_ops *ops)
 {
-	return nf_ct_netns_get(ctx->net, ctx->family);
+	int ret = nf_register_net_hook(net, ops);
+	if (ret == 0) {
+		ret = nf_ct_netns_get(net, NFPROTO_IPV4);
+		if (ret)
+			 nf_unregister_net_hook(net, ops);
+	}
+	return ret;
 }
 
-static void nft_nat_ipv4_free(struct nft_ctx *ctx)
+static void nft_nat_ipv4_unreg(struct net *net, const struct nf_hook_ops *ops)
 {
-	nf_ct_netns_put(ctx->net, ctx->family);
+	nf_unregister_net_hook(net, ops);
+	nf_ct_netns_put(net, NFPROTO_IPV4);
 }
 
 static const struct nft_chain_type nft_chain_nat_ipv4 = {
@@ -91,8 +98,8 @@ static const struct nft_chain_type nft_chain_nat_ipv4 = {
 		[NF_INET_LOCAL_OUT]	= nft_nat_ipv4_local_fn,
 		[NF_INET_LOCAL_IN]	= nft_nat_ipv4_fn,
 	},
-	.init		= nft_nat_ipv4_init,
-	.free		= nft_nat_ipv4_free,
+	.ops_register = nft_nat_ipv4_reg,
+	.ops_unregister = nft_nat_ipv4_unreg,
 };
 
 static int __init nft_chain_nat_init(void)
diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
index 100a6bd1046a..05bcb2c23125 100644
--- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c
+++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
@@ -64,14 +64,22 @@ static unsigned int nft_nat_ipv6_local_fn(void *priv,
 	return nf_nat_ipv6_local_fn(priv, skb, state, nft_nat_do_chain);
 }
 
-static int nft_nat_ipv6_init(struct nft_ctx *ctx)
+static int nft_nat_ipv6_reg(struct net *net, const struct nf_hook_ops *ops)
 {
-	return nf_ct_netns_get(ctx->net, ctx->family);
+	int ret = nf_register_net_hook(net, ops);
+	if (ret == 0) {
+		ret = nf_ct_netns_get(net, NFPROTO_IPV6);
+		if (ret)
+			 nf_unregister_net_hook(net, ops);
+	}
+
+	return ret;
 }
 
-static void nft_nat_ipv6_free(struct nft_ctx *ctx)
+static void nft_nat_ipv6_unreg(struct net *net, const struct nf_hook_ops *ops)
 {
-	nf_ct_netns_put(ctx->net, ctx->family);
+	nf_unregister_net_hook(net, ops);
+	nf_ct_netns_put(net, NFPROTO_IPV6);
 }
 
 static const struct nft_chain_type nft_chain_nat_ipv6 = {
@@ -89,8 +97,8 @@ static const struct nft_chain_type nft_chain_nat_ipv6 = {
 		[NF_INET_LOCAL_OUT]	= nft_nat_ipv6_local_fn,
 		[NF_INET_LOCAL_IN]	= nft_nat_ipv6_fn,
 	},
-	.init		= nft_nat_ipv6_init,
-	.free		= nft_nat_ipv6_free,
+	.ops_register		= nft_nat_ipv6_reg,
+	.ops_unregister		= nft_nat_ipv6_unreg,
 };
 
 static int __init nft_chain_nat_ipv6_init(void)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 18bd584fadda..ded54b2abfbc 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -129,6 +129,7 @@ static int nf_tables_register_hook(struct net *net,
 				   const struct nft_table *table,
 				   struct nft_chain *chain)
 {
+	const struct nft_base_chain *basechain;
 	struct nf_hook_ops *ops;
 	int ret;
 
@@ -136,7 +137,12 @@ static int nf_tables_register_hook(struct net *net,
 	    !nft_is_base_chain(chain))
 		return 0;
 
-	ops = &nft_base_chain(chain)->ops;
+	basechain = nft_base_chain(chain);
+	ops = &basechain->ops;
+
+	if (basechain->type->ops_register)
+		return basechain->type->ops_register(net, ops);
+
 	ret = nf_register_net_hook(net, ops);
 	if (ret == -EBUSY && nf_tables_allow_nat_conflict(net, ops)) {
 		ops->nat_hook = false;
@@ -151,11 +157,19 @@ static void nf_tables_unregister_hook(struct net *net,
 				      const struct nft_table *table,
 				      struct nft_chain *chain)
 {
+	const struct nft_base_chain *basechain;
+	const struct nf_hook_ops *ops;
+
 	if (table->flags & NFT_TABLE_F_DORMANT ||
 	    !nft_is_base_chain(chain))
 		return;
+	basechain = nft_base_chain(chain);
+	ops = &basechain->ops;
+
+	if (basechain->type->ops_unregister)
+		return basechain->type->ops_unregister(net, ops);
 
-	nf_unregister_net_hook(net, &nft_base_chain(chain)->ops);
+	nf_unregister_net_hook(net, ops);
 }
 
 static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type)
@@ -1262,8 +1276,6 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx)
 	if (nft_is_base_chain(chain)) {
 		struct nft_base_chain *basechain = nft_base_chain(chain);
 
-		if (basechain->type->free)
-			basechain->type->free(ctx);
 		module_put(basechain->type->owner);
 		free_percpu(basechain->stats);
 		if (basechain->stats)
@@ -1396,9 +1408,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 		}
 
 		basechain->type = hook.type;
-		if (basechain->type->init)
-			basechain->type->init(ctx);
-
 		chain = &basechain->chain;
 
 		ops		= &basechain->ops;
-- 
cgit v1.2.3-59-g8ed1b


From 06cad3acf07893820391bda912bebc3a8c058994 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 14 May 2018 23:46:56 +0200
Subject: netfilter: core: export raw versions of add/delete hook functions

This will allow the nat core to reuse the nf_hook infrastructure
to maintain nat lookup functions.

The raw versions don't assume a particular hook location, the
functions get added/deleted from the hook blob that is passed to the
functions.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/core.c         | 75 +++++++++++++++++++++++++++++++-------------
 net/netfilter/nf_internals.h |  5 +++
 2 files changed, 59 insertions(+), 21 deletions(-)

diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 0f6b8172fb9a..5f0ebf9a8d5b 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -186,9 +186,31 @@ static void hooks_validate(const struct nf_hook_entries *hooks)
 #endif
 }
 
+int nf_hook_entries_insert_raw(struct nf_hook_entries __rcu **pp,
+				const struct nf_hook_ops *reg)
+{
+	struct nf_hook_entries *new_hooks;
+	struct nf_hook_entries *p;
+
+	p = rcu_dereference_raw(*pp);
+	new_hooks = nf_hook_entries_grow(p, reg);
+	if (IS_ERR(new_hooks))
+		return PTR_ERR(new_hooks);
+
+	hooks_validate(new_hooks);
+
+	rcu_assign_pointer(*pp, new_hooks);
+
+	BUG_ON(p == new_hooks);
+	nf_hook_entries_free(p);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_hook_entries_insert_raw);
+
 /*
  * __nf_hook_entries_try_shrink - try to shrink hook array
  *
+ * @old -- current hook blob at @pp
  * @pp -- location of hook blob
  *
  * Hook unregistration must always succeed, so to-be-removed hooks
@@ -201,14 +223,14 @@ static void hooks_validate(const struct nf_hook_entries *hooks)
  *
  * Returns address to free, or NULL.
  */
-static void *__nf_hook_entries_try_shrink(struct nf_hook_entries __rcu **pp)
+static void *__nf_hook_entries_try_shrink(struct nf_hook_entries *old,
+					  struct nf_hook_entries __rcu **pp)
 {
-	struct nf_hook_entries *old, *new = NULL;
 	unsigned int i, j, skip = 0, hook_entries;
+	struct nf_hook_entries *new = NULL;
 	struct nf_hook_ops **orig_ops;
 	struct nf_hook_ops **new_ops;
 
-	old = nf_entry_dereference(*pp);
 	if (WARN_ON_ONCE(!old))
 		return NULL;
 
@@ -347,11 +369,10 @@ static int __nf_register_net_hook(struct net *net, int pf,
  * This cannot fail, hook unregistration must always succeed.
  * Therefore replace the to-be-removed hook with a dummy hook.
  */
-static void nf_remove_net_hook(struct nf_hook_entries *old,
-			       const struct nf_hook_ops *unreg, int pf)
+static bool nf_remove_net_hook(struct nf_hook_entries *old,
+			       const struct nf_hook_ops *unreg)
 {
 	struct nf_hook_ops **orig_ops;
-	bool found = false;
 	unsigned int i;
 
 	orig_ops = nf_hook_entries_get_hook_ops(old);
@@ -360,21 +381,10 @@ static void nf_remove_net_hook(struct nf_hook_entries *old,
 			continue;
 		WRITE_ONCE(old->hooks[i].hook, accept_all);
 		WRITE_ONCE(orig_ops[i], &dummy_ops);
-		found = true;
-		break;
+		return true;
 	}
 
-	if (found) {
-#ifdef CONFIG_NETFILTER_INGRESS
-		if (pf == NFPROTO_NETDEV && unreg->hooknum == NF_NETDEV_INGRESS)
-			net_dec_ingress_queue();
-#endif
-#ifdef HAVE_JUMP_LABEL
-		static_key_slow_dec(&nf_hooks_needed[pf][unreg->hooknum]);
-#endif
-	} else {
-		WARN_ONCE(1, "hook not found, pf %d num %d", pf, unreg->hooknum);
-	}
+	return false;
 }
 
 static void __nf_unregister_net_hook(struct net *net, int pf,
@@ -395,9 +405,19 @@ static void __nf_unregister_net_hook(struct net *net, int pf,
 		return;
 	}
 
-	nf_remove_net_hook(p, reg, pf);
+	if (nf_remove_net_hook(p, reg)) {
+#ifdef CONFIG_NETFILTER_INGRESS
+		if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
+			net_dec_ingress_queue();
+#endif
+#ifdef HAVE_JUMP_LABEL
+		static_key_slow_dec(&nf_hooks_needed[pf][reg->hooknum]);
+#endif
+	} else {
+		WARN_ONCE(1, "hook not found, pf %d num %d", pf, reg->hooknum);
+	}
 
-	p = __nf_hook_entries_try_shrink(pp);
+	p = __nf_hook_entries_try_shrink(p, pp);
 	mutex_unlock(&nf_hook_mutex);
 	if (!p)
 		return;
@@ -417,6 +437,19 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
 }
 EXPORT_SYMBOL(nf_unregister_net_hook);
 
+void nf_hook_entries_delete_raw(struct nf_hook_entries __rcu **pp,
+				const struct nf_hook_ops *reg)
+{
+	struct nf_hook_entries *p;
+
+	p = rcu_dereference_raw(*pp);
+	if (nf_remove_net_hook(p, reg)) {
+		p = __nf_hook_entries_try_shrink(p, pp);
+		nf_hook_entries_free(p);
+	}
+}
+EXPORT_SYMBOL_GPL(nf_hook_entries_delete_raw);
+
 int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
 {
 	int err;
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
index 18f6d7ae995b..e15779fd58e3 100644
--- a/net/netfilter/nf_internals.h
+++ b/net/netfilter/nf_internals.h
@@ -15,4 +15,9 @@ void nf_queue_nf_hook_drop(struct net *net);
 /* nf_log.c */
 int __init netfilter_log_init(void);
 
+/* core.c */
+void nf_hook_entries_delete_raw(struct nf_hook_entries __rcu **pp,
+				const struct nf_hook_ops *reg);
+int nf_hook_entries_insert_raw(struct nf_hook_entries __rcu **pp,
+				const struct nf_hook_ops *reg);
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 1cd472bf036ca038e783ef5f058f54e45b7e8180 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 14 May 2018 23:46:57 +0200
Subject: netfilter: nf_nat: add nat hook register functions to nf_nat

This adds the infrastructure to register nat hooks with the nat core
instead of the netfilter core.

nat hooks are used to configure nat bindings.  Such hooks are registered
from ip(6)table_nat or by the nftables core when a nat chain is added.

After next patch, nat hooks will be registered with nf_nat instead of
netfilter core.  This allows to use many nat lookup functions at the
same time while doing the real packet rewrite (nat transformation) in
one place.

This change doesn't convert the intended users yet to ease review.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_nat.h |   4 ++
 net/netfilter/nf_nat_core.c    | 157 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 161 insertions(+)

diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h
index da3d601cadee..a17eb2f8d40e 100644
--- a/include/net/netfilter/nf_nat.h
+++ b/include/net/netfilter/nf_nat.h
@@ -75,4 +75,8 @@ static inline bool nf_nat_oif_changed(unsigned int hooknum,
 #endif
 }
 
+int nf_nat_register_fn(struct net *net, const struct nf_hook_ops *ops,
+		       const struct nf_hook_ops *nat_ops, unsigned int ops_count);
+void nf_nat_unregister_fn(struct net *net, const struct nf_hook_ops *ops,
+			  unsigned int ops_count);
 #endif
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 0cd503aacbf0..f531d77dd684 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -32,6 +32,8 @@
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <linux/netfilter/nf_nat.h>
 
+#include "nf_internals.h"
+
 static spinlock_t nf_nat_locks[CONNTRACK_LOCKS];
 
 static DEFINE_MUTEX(nf_nat_proto_mutex);
@@ -39,11 +41,27 @@ static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
 						__read_mostly;
 static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
 						__read_mostly;
+static unsigned int nat_net_id __read_mostly;
 
 static struct hlist_head *nf_nat_bysource __read_mostly;
 static unsigned int nf_nat_htable_size __read_mostly;
 static unsigned int nf_nat_hash_rnd __read_mostly;
 
+struct nf_nat_lookup_hook_priv {
+	struct nf_hook_entries __rcu *entries;
+
+	struct rcu_head rcu_head;
+};
+
+struct nf_nat_hooks_net {
+	struct nf_hook_ops *nat_hook_ops;
+	unsigned int users;
+};
+
+struct nat_net {
+	struct nf_nat_hooks_net nat_proto_net[NFPROTO_NUMPROTO];
+};
+
 inline const struct nf_nat_l3proto *
 __nf_nat_l3proto_find(u8 family)
 {
@@ -871,6 +889,138 @@ static struct nf_ct_helper_expectfn follow_master_nat = {
 	.expectfn	= nf_nat_follow_master,
 };
 
+int nf_nat_register_fn(struct net *net, const struct nf_hook_ops *ops,
+		       const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count)
+{
+	struct nat_net *nat_net = net_generic(net, nat_net_id);
+	struct nf_nat_hooks_net *nat_proto_net;
+	struct nf_nat_lookup_hook_priv *priv;
+	unsigned int hooknum = ops->hooknum;
+	struct nf_hook_ops *nat_ops;
+	int i, ret;
+
+	if (WARN_ON_ONCE(ops->pf >= ARRAY_SIZE(nat_net->nat_proto_net)))
+		return -EINVAL;
+
+	nat_proto_net = &nat_net->nat_proto_net[ops->pf];
+
+	for (i = 0; i < ops_count; i++) {
+		if (WARN_ON(orig_nat_ops[i].pf != ops->pf))
+			return -EINVAL;
+		if (orig_nat_ops[i].hooknum == hooknum) {
+			hooknum = i;
+			break;
+		}
+	}
+
+	if (WARN_ON_ONCE(i == ops_count))
+		return -EINVAL;
+
+	mutex_lock(&nf_nat_proto_mutex);
+	if (!nat_proto_net->nat_hook_ops) {
+		WARN_ON(nat_proto_net->users != 0);
+
+		nat_ops = kmemdup(orig_nat_ops, sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL);
+		if (!nat_ops) {
+			mutex_unlock(&nf_nat_proto_mutex);
+			return -ENOMEM;
+		}
+
+		for (i = 0; i < ops_count; i++) {
+			priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+			if (priv) {
+				nat_ops[i].priv = priv;
+				continue;
+			}
+			mutex_unlock(&nf_nat_proto_mutex);
+			while (i)
+				kfree(nat_ops[--i].priv);
+			kfree(nat_ops);
+			return -ENOMEM;
+		}
+
+		ret = nf_register_net_hooks(net, nat_ops, ops_count);
+		if (ret < 0) {
+			mutex_unlock(&nf_nat_proto_mutex);
+			for (i = 0; i < ops_count; i++)
+				kfree(nat_ops[i].priv);
+			kfree(nat_ops);
+			return ret;
+		}
+
+		nat_proto_net->nat_hook_ops = nat_ops;
+	}
+
+	nat_ops = nat_proto_net->nat_hook_ops;
+	priv = nat_ops[hooknum].priv;
+	if (WARN_ON_ONCE(!priv)) {
+		mutex_unlock(&nf_nat_proto_mutex);
+		return -EOPNOTSUPP;
+	}
+
+	ret = nf_hook_entries_insert_raw(&priv->entries, ops);
+	if (ret == 0)
+		nat_proto_net->users++;
+
+	mutex_unlock(&nf_nat_proto_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nf_nat_register_fn);
+
+void nf_nat_unregister_fn(struct net *net, const struct nf_hook_ops *ops,
+		          unsigned int ops_count)
+{
+	struct nat_net *nat_net = net_generic(net, nat_net_id);
+	struct nf_nat_hooks_net *nat_proto_net;
+	struct nf_nat_lookup_hook_priv *priv;
+	struct nf_hook_ops *nat_ops;
+	int hooknum = ops->hooknum;
+	int i;
+
+	if (ops->pf >= ARRAY_SIZE(nat_net->nat_proto_net))
+		return;
+
+	nat_proto_net = &nat_net->nat_proto_net[ops->pf];
+
+	mutex_lock(&nf_nat_proto_mutex);
+	if (WARN_ON(nat_proto_net->users == 0))
+		goto unlock;
+
+	nat_proto_net->users--;
+
+	nat_ops = nat_proto_net->nat_hook_ops;
+	for (i = 0; i < ops_count; i++) {
+		if (nat_ops[i].hooknum == hooknum) {
+			hooknum = i;
+			break;
+		}
+	}
+	if (WARN_ON_ONCE(i == ops_count))
+		goto unlock;
+	priv = nat_ops[hooknum].priv;
+	nf_hook_entries_delete_raw(&priv->entries, ops);
+
+	if (nat_proto_net->users == 0) {
+		nf_unregister_net_hooks(net, nat_ops, ops_count);
+
+		for (i = 0; i < ops_count; i++) {
+			priv = nat_ops[i].priv;
+			kfree_rcu(priv, rcu_head);
+		}
+
+		nat_proto_net->nat_hook_ops = NULL;
+		kfree(nat_ops);
+	}
+unlock:
+	mutex_unlock(&nf_nat_proto_mutex);
+}
+EXPORT_SYMBOL_GPL(nf_nat_unregister_fn);
+
+static struct pernet_operations nat_net_ops = {
+	.id = &nat_net_id,
+	.size = sizeof(struct nat_net),
+};
+
 static int __init nf_nat_init(void)
 {
 	int ret, i;
@@ -894,6 +1044,12 @@ static int __init nf_nat_init(void)
 	for (i = 0; i < CONNTRACK_LOCKS; i++)
 		spin_lock_init(&nf_nat_locks[i]);
 
+	ret = register_pernet_subsys(&nat_net_ops);
+	if (ret < 0) {
+		nf_ct_extend_unregister(&nat_extend);
+		return ret;
+	}
+
 	nf_ct_helper_expectfn_register(&follow_master_nat);
 
 	BUG_ON(nfnetlink_parse_nat_setup_hook != NULL);
@@ -925,6 +1081,7 @@ static void __exit nf_nat_cleanup(void)
 		kfree(nf_nat_l4protos[i]);
 	synchronize_net();
 	nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
+	unregister_pernet_subsys(&nat_net_ops);
 }
 
 MODULE_LICENSE("GPL");
-- 
cgit v1.2.3-59-g8ed1b


From 9971a514ed2697e542f3984a6162eac54bb1da98 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 14 May 2018 23:46:58 +0200
Subject: netfilter: nf_nat: add nat type hooks to nat core

Currently the packet rewrite and instantiation of nat NULL bindings
happens from the protocol specific nat backend.

Invocation occurs either via ip(6)table_nat or the nf_tables nat chain type.

Invocation looks like this (simplified):
NF_HOOK()
   |
   `---iptable_nat
	 |
	 `---> nf_nat_l3proto_ipv4 -> nf_nat_packet
	               |
          new packet? pass skb though iptables nat chain
                       |
		       `---> iptable_nat: ipt_do_table

In nft case, this looks the same (nft_chain_nat_ipv4 instead of
iptable_nat).

This is a problem for two reasons:
1. Can't use iptables nat and nf_tables nat at the same time,
   as the first user adds a nat binding (nf_nat_l3proto_ipv4 adds a
   NULL binding if do_table() did not find a matching nat rule so we
   can detect post-nat tuple collisions).
2. If you use e.g. nft_masq, snat, redir, etc. uses must also register
   an empty base chain so that the nat core gets called fro NF_HOOK()
   to do the reverse translation, which is neither obvious nor user
   friendly.

After this change, the base hook gets registered not from iptable_nat or
nftables nat hooks, but from the l3 nat core.

iptables/nft nat base hooks get registered with the nat core instead:

NF_HOOK()
   |
   `---> nf_nat_l3proto_ipv4 -> nf_nat_packet
		|
         new packet? pass skb through iptables/nftables nat chains
                |
		+-> iptables_nat: ipt_do_table
	        +-> nft nat chain x
	        `-> nft nat chain y

The nat core deals with null bindings and reverse translation.
When no mapping exists, it calls the registered nat lookup hooks until
one creates a new mapping.
If both iptables and nftables nat hooks exist, the first matching
one is used (i.e., higher priority wins).

Also, nft users do not need to create empty nat hooks anymore,
nat core always registers the base hooks that take care of reverse/reply
translation.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_nat_core.h      |  5 +-
 include/net/netfilter/nf_nat_l3proto.h   | 52 ++-----------------
 net/ipv4/netfilter/iptable_nat.c         | 85 ++++++++++++++++----------------
 net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 82 ++++++++++++++++++++----------
 net/ipv4/netfilter/nft_chain_nat_ipv4.c  | 51 +++----------------
 net/ipv6/netfilter/ip6table_nat.c        | 84 +++++++++++++++----------------
 net/ipv6/netfilter/nf_nat_l3proto_ipv6.c | 83 ++++++++++++++++++++-----------
 net/ipv6/netfilter/nft_chain_nat_ipv6.c  | 48 +++---------------
 net/netfilter/nf_nat_core.c              | 31 +++++++-----
 9 files changed, 232 insertions(+), 289 deletions(-)

diff --git a/include/net/netfilter/nf_nat_core.h b/include/net/netfilter/nf_nat_core.h
index 0d84dd29108d..c78e9be14b3d 100644
--- a/include/net/netfilter/nf_nat_core.h
+++ b/include/net/netfilter/nf_nat_core.h
@@ -13,10 +13,7 @@ unsigned int nf_nat_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
 
 unsigned int
 nf_nat_inet_fn(void *priv, struct sk_buff *skb,
-	       const struct nf_hook_state *state,
-	       unsigned int (*do_chain)(void *priv,
-					struct sk_buff *skb,
-					const struct nf_hook_state *state));
+	       const struct nf_hook_state *state);
 
 int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family);
 
diff --git a/include/net/netfilter/nf_nat_l3proto.h b/include/net/netfilter/nf_nat_l3proto.h
index 8bad2560576f..d300b8f03972 100644
--- a/include/net/netfilter/nf_nat_l3proto.h
+++ b/include/net/netfilter/nf_nat_l3proto.h
@@ -44,58 +44,14 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb, struct nf_conn *ct,
 				  enum ip_conntrack_info ctinfo,
 				  unsigned int hooknum);
 
-unsigned int nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
-			    const struct nf_hook_state *state,
-			    unsigned int (*do_chain)(void *priv,
-						     struct sk_buff *skb,
-						     const struct nf_hook_state *state));
-
-unsigned int nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
-			     const struct nf_hook_state *state,
-			     unsigned int (*do_chain)(void *priv,
-						      struct sk_buff *skb,
-						      const struct nf_hook_state *state));
-
-unsigned int nf_nat_ipv4_local_fn(void *priv,
-				  struct sk_buff *skb,
-				  const struct nf_hook_state *state,
-				  unsigned int (*do_chain)(void *priv,
-							   struct sk_buff *skb,
-							   const struct nf_hook_state *state));
-
-unsigned int nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
-			    const struct nf_hook_state *state,
-			    unsigned int (*do_chain)(void *priv,
-						     struct sk_buff *skb,
-						     const struct nf_hook_state *state));
-
 int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, struct nf_conn *ct,
 				    enum ip_conntrack_info ctinfo,
 				    unsigned int hooknum, unsigned int hdrlen);
 
-unsigned int nf_nat_ipv6_in(void *priv, struct sk_buff *skb,
-			    const struct nf_hook_state *state,
-			    unsigned int (*do_chain)(void *priv,
-						     struct sk_buff *skb,
-						     const struct nf_hook_state *state));
-
-unsigned int nf_nat_ipv6_out(void *priv, struct sk_buff *skb,
-			     const struct nf_hook_state *state,
-			     unsigned int (*do_chain)(void *priv,
-						      struct sk_buff *skb,
-						      const struct nf_hook_state *state));
-
-unsigned int nf_nat_ipv6_local_fn(void *priv,
-				  struct sk_buff *skb,
-				  const struct nf_hook_state *state,
-				  unsigned int (*do_chain)(void *priv,
-							   struct sk_buff *skb,
-							   const struct nf_hook_state *state));
+int nf_nat_l3proto_ipv4_register_fn(struct net *net, const struct nf_hook_ops *ops);
+void nf_nat_l3proto_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops);
 
-unsigned int nf_nat_ipv6_fn(void *priv, struct sk_buff *skb,
-			    const struct nf_hook_state *state,
-			    unsigned int (*do_chain)(void *priv,
-						     struct sk_buff *skb,
-						     const struct nf_hook_state *state));
+int nf_nat_l3proto_ipv6_register_fn(struct net *net, const struct nf_hook_ops *ops);
+void nf_nat_l3proto_ipv6_unregister_fn(struct net *net, const struct nf_hook_ops *ops);
 
 #endif /* _NF_NAT_L3PROTO_H */
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index 529d89ec31e8..a317445448bf 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -38,69 +38,58 @@ static unsigned int iptable_nat_do_chain(void *priv,
 	return ipt_do_table(skb, state, state->net->ipv4.nat_table);
 }
 
-static unsigned int iptable_nat_ipv4_fn(void *priv,
-					struct sk_buff *skb,
-					const struct nf_hook_state *state)
-{
-	return nf_nat_ipv4_fn(priv, skb, state, iptable_nat_do_chain);
-}
-
-static unsigned int iptable_nat_ipv4_in(void *priv,
-					struct sk_buff *skb,
-					const struct nf_hook_state *state)
-{
-	return nf_nat_ipv4_in(priv, skb, state, iptable_nat_do_chain);
-}
-
-static unsigned int iptable_nat_ipv4_out(void *priv,
-					 struct sk_buff *skb,
-					 const struct nf_hook_state *state)
-{
-	return nf_nat_ipv4_out(priv, skb, state, iptable_nat_do_chain);
-}
-
-static unsigned int iptable_nat_ipv4_local_fn(void *priv,
-					      struct sk_buff *skb,
-					      const struct nf_hook_state *state)
-{
-	return nf_nat_ipv4_local_fn(priv, skb, state, iptable_nat_do_chain);
-}
-
 static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
-	/* Before packet filtering, change destination */
 	{
-		.hook		= iptable_nat_ipv4_in,
+		.hook		= iptable_nat_do_chain,
 		.pf		= NFPROTO_IPV4,
-		.nat_hook	= true,
 		.hooknum	= NF_INET_PRE_ROUTING,
 		.priority	= NF_IP_PRI_NAT_DST,
 	},
-	/* After packet filtering, change source */
 	{
-		.hook		= iptable_nat_ipv4_out,
+		.hook		= iptable_nat_do_chain,
 		.pf		= NFPROTO_IPV4,
-		.nat_hook	= true,
 		.hooknum	= NF_INET_POST_ROUTING,
 		.priority	= NF_IP_PRI_NAT_SRC,
 	},
-	/* Before packet filtering, change destination */
 	{
-		.hook		= iptable_nat_ipv4_local_fn,
+		.hook		= iptable_nat_do_chain,
 		.pf		= NFPROTO_IPV4,
-		.nat_hook	= true,
 		.hooknum	= NF_INET_LOCAL_OUT,
 		.priority	= NF_IP_PRI_NAT_DST,
 	},
-	/* After packet filtering, change source */
 	{
-		.hook		= iptable_nat_ipv4_fn,
+		.hook		= iptable_nat_do_chain,
 		.pf		= NFPROTO_IPV4,
-		.nat_hook	= true,
 		.hooknum	= NF_INET_LOCAL_IN,
 		.priority	= NF_IP_PRI_NAT_SRC,
 	},
 };
 
+static int ipt_nat_register_lookups(struct net *net)
+{
+	int i, ret;
+
+	for (i = 0; i < ARRAY_SIZE(nf_nat_ipv4_ops); i++) {
+		ret = nf_nat_l3proto_ipv4_register_fn(net, &nf_nat_ipv4_ops[i]);
+		if (ret) {
+			while (i)
+				nf_nat_l3proto_ipv4_unregister_fn(net, &nf_nat_ipv4_ops[--i]);
+
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static void ipt_nat_unregister_lookups(struct net *net)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(nf_nat_ipv4_ops); i++)
+		nf_nat_l3proto_ipv4_unregister_fn(net, &nf_nat_ipv4_ops[i]);
+}
+
 static int __net_init iptable_nat_table_init(struct net *net)
 {
 	struct ipt_replace *repl;
@@ -113,7 +102,18 @@ static int __net_init iptable_nat_table_init(struct net *net)
 	if (repl == NULL)
 		return -ENOMEM;
 	ret = ipt_register_table(net, &nf_nat_ipv4_table, repl,
-				 nf_nat_ipv4_ops, &net->ipv4.nat_table);
+				 NULL, &net->ipv4.nat_table);
+	if (ret < 0) {
+		kfree(repl);
+		return ret;
+	}
+
+	ret = ipt_nat_register_lookups(net);
+	if (ret < 0) {
+		ipt_unregister_table(net, net->ipv4.nat_table, NULL);
+		net->ipv4.nat_table = NULL;
+	}
+
 	kfree(repl);
 	return ret;
 }
@@ -122,7 +122,8 @@ static void __net_exit iptable_nat_net_exit(struct net *net)
 {
 	if (!net->ipv4.nat_table)
 		return;
-	ipt_unregister_table(net, net->ipv4.nat_table, nf_nat_ipv4_ops);
+	ipt_nat_unregister_lookups(net);
+	ipt_unregister_table(net, net->ipv4.nat_table, NULL);
 	net->ipv4.nat_table = NULL;
 }
 
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index 29b5aceac66d..6115bf1ff6f0 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -241,12 +241,9 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb,
 }
 EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
 
-unsigned int
+static unsigned int
 nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
-	       const struct nf_hook_state *state,
-	       unsigned int (*do_chain)(void *priv,
-					struct sk_buff *skb,
-					const struct nf_hook_state *state))
+	       const struct nf_hook_state *state)
 {
 	struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
@@ -265,35 +262,28 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
 		}
 	}
 
-	return nf_nat_inet_fn(priv, skb, state, do_chain);
+	return nf_nat_inet_fn(priv, skb, state);
 }
 EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn);
 
-unsigned int
+static unsigned int
 nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
-	       const struct nf_hook_state *state,
-	       unsigned int (*do_chain)(void *priv,
-					 struct sk_buff *skb,
-					 const struct nf_hook_state *state))
+	       const struct nf_hook_state *state)
 {
 	unsigned int ret;
 	__be32 daddr = ip_hdr(skb)->daddr;
 
-	ret = nf_nat_ipv4_fn(priv, skb, state, do_chain);
+	ret = nf_nat_ipv4_fn(priv, skb, state);
 	if (ret != NF_DROP && ret != NF_STOLEN &&
 	    daddr != ip_hdr(skb)->daddr)
 		skb_dst_drop(skb);
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(nf_nat_ipv4_in);
 
-unsigned int
+static unsigned int
 nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
-		const struct nf_hook_state *state,
-		unsigned int (*do_chain)(void *priv,
-					  struct sk_buff *skb,
-					  const struct nf_hook_state *state))
+		const struct nf_hook_state *state)
 {
 #ifdef CONFIG_XFRM
 	const struct nf_conn *ct;
@@ -302,7 +292,7 @@ nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
 #endif
 	unsigned int ret;
 
-	ret = nf_nat_ipv4_fn(priv, skb, state, do_chain);
+	ret = nf_nat_ipv4_fn(priv, skb, state);
 #ifdef CONFIG_XFRM
 	if (ret != NF_DROP && ret != NF_STOLEN &&
 	    !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
@@ -322,21 +312,17 @@ nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
 #endif
 	return ret;
 }
-EXPORT_SYMBOL_GPL(nf_nat_ipv4_out);
 
-unsigned int
+static unsigned int
 nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
-		     const struct nf_hook_state *state,
-		     unsigned int (*do_chain)(void *priv,
-					       struct sk_buff *skb,
-					       const struct nf_hook_state *state))
+		     const struct nf_hook_state *state)
 {
 	const struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
 	unsigned int ret;
 	int err;
 
-	ret = nf_nat_ipv4_fn(priv, skb, state, do_chain);
+	ret = nf_nat_ipv4_fn(priv, skb, state);
 	if (ret != NF_DROP && ret != NF_STOLEN &&
 	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
 		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
@@ -360,7 +346,49 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
 	}
 	return ret;
 }
-EXPORT_SYMBOL_GPL(nf_nat_ipv4_local_fn);
+
+static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
+	/* Before packet filtering, change destination */
+	{
+		.hook		= nf_nat_ipv4_in,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_PRE_ROUTING,
+		.priority	= NF_IP_PRI_NAT_DST,
+	},
+	/* After packet filtering, change source */
+	{
+		.hook		= nf_nat_ipv4_out,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_POST_ROUTING,
+		.priority	= NF_IP_PRI_NAT_SRC,
+	},
+	/* Before packet filtering, change destination */
+	{
+		.hook		= nf_nat_ipv4_local_fn,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP_PRI_NAT_DST,
+	},
+	/* After packet filtering, change source */
+	{
+		.hook		= nf_nat_ipv4_fn,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP_PRI_NAT_SRC,
+	},
+};
+
+int nf_nat_l3proto_ipv4_register_fn(struct net *net, const struct nf_hook_ops *ops)
+{
+	return nf_nat_register_fn(net, ops, nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
+}
+EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv4_register_fn);
+
+void nf_nat_l3proto_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops)
+{
+	nf_nat_unregister_fn(net, ops, ARRAY_SIZE(nf_nat_ipv4_ops));
+}
+EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv4_unregister_fn);
 
 static int __init nf_nat_l3proto_ipv4_init(void)
 {
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
index bbcb624b6b81..a3c4ea303e3e 100644
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -27,8 +27,8 @@
 #include <net/ip.h>
 
 static unsigned int nft_nat_do_chain(void *priv,
-				      struct sk_buff *skb,
-				      const struct nf_hook_state *state)
+				     struct sk_buff *skb,
+				     const struct nf_hook_state *state)
 {
 	struct nft_pktinfo pkt;
 
@@ -38,49 +38,14 @@ static unsigned int nft_nat_do_chain(void *priv,
 	return nft_do_chain(&pkt, priv);
 }
 
-static unsigned int nft_nat_ipv4_fn(void *priv,
-				    struct sk_buff *skb,
-				    const struct nf_hook_state *state)
-{
-	return nf_nat_ipv4_fn(priv, skb, state, nft_nat_do_chain);
-}
-
-static unsigned int nft_nat_ipv4_in(void *priv,
-				    struct sk_buff *skb,
-				    const struct nf_hook_state *state)
-{
-	return nf_nat_ipv4_in(priv, skb, state, nft_nat_do_chain);
-}
-
-static unsigned int nft_nat_ipv4_out(void *priv,
-				     struct sk_buff *skb,
-				     const struct nf_hook_state *state)
-{
-	return nf_nat_ipv4_out(priv, skb, state, nft_nat_do_chain);
-}
-
-static unsigned int nft_nat_ipv4_local_fn(void *priv,
-					  struct sk_buff *skb,
-					  const struct nf_hook_state *state)
-{
-	return nf_nat_ipv4_local_fn(priv, skb, state, nft_nat_do_chain);
-}
-
 static int nft_nat_ipv4_reg(struct net *net, const struct nf_hook_ops *ops)
 {
-	int ret = nf_register_net_hook(net, ops);
-	if (ret == 0) {
-		ret = nf_ct_netns_get(net, NFPROTO_IPV4);
-		if (ret)
-			 nf_unregister_net_hook(net, ops);
-	}
-	return ret;
+	return nf_nat_l3proto_ipv4_register_fn(net, ops);
 }
 
 static void nft_nat_ipv4_unreg(struct net *net, const struct nf_hook_ops *ops)
 {
-	nf_unregister_net_hook(net, ops);
-	nf_ct_netns_put(net, NFPROTO_IPV4);
+	nf_nat_l3proto_ipv4_unregister_fn(net, ops);
 }
 
 static const struct nft_chain_type nft_chain_nat_ipv4 = {
@@ -93,10 +58,10 @@ static const struct nft_chain_type nft_chain_nat_ipv4 = {
 			  (1 << NF_INET_LOCAL_OUT) |
 			  (1 << NF_INET_LOCAL_IN),
 	.hooks		= {
-		[NF_INET_PRE_ROUTING]	= nft_nat_ipv4_in,
-		[NF_INET_POST_ROUTING]	= nft_nat_ipv4_out,
-		[NF_INET_LOCAL_OUT]	= nft_nat_ipv4_local_fn,
-		[NF_INET_LOCAL_IN]	= nft_nat_ipv4_fn,
+		[NF_INET_PRE_ROUTING]	= nft_nat_do_chain,
+		[NF_INET_POST_ROUTING]	= nft_nat_do_chain,
+		[NF_INET_LOCAL_OUT]	= nft_nat_do_chain,
+		[NF_INET_LOCAL_IN]	= nft_nat_do_chain,
 	},
 	.ops_register = nft_nat_ipv4_reg,
 	.ops_unregister = nft_nat_ipv4_unreg,
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index 2bf554e18af8..67ba70ab9f5c 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -40,69 +40,58 @@ static unsigned int ip6table_nat_do_chain(void *priv,
 	return ip6t_do_table(skb, state, state->net->ipv6.ip6table_nat);
 }
 
-static unsigned int ip6table_nat_fn(void *priv,
-				    struct sk_buff *skb,
-				    const struct nf_hook_state *state)
-{
-	return nf_nat_ipv6_fn(priv, skb, state, ip6table_nat_do_chain);
-}
-
-static unsigned int ip6table_nat_in(void *priv,
-				    struct sk_buff *skb,
-				    const struct nf_hook_state *state)
-{
-	return nf_nat_ipv6_in(priv, skb, state, ip6table_nat_do_chain);
-}
-
-static unsigned int ip6table_nat_out(void *priv,
-				     struct sk_buff *skb,
-				     const struct nf_hook_state *state)
-{
-	return nf_nat_ipv6_out(priv, skb, state, ip6table_nat_do_chain);
-}
-
-static unsigned int ip6table_nat_local_fn(void *priv,
-					  struct sk_buff *skb,
-					  const struct nf_hook_state *state)
-{
-	return nf_nat_ipv6_local_fn(priv, skb, state, ip6table_nat_do_chain);
-}
-
 static const struct nf_hook_ops nf_nat_ipv6_ops[] = {
-	/* Before packet filtering, change destination */
 	{
-		.hook		= ip6table_nat_in,
+		.hook		= ip6table_nat_do_chain,
 		.pf		= NFPROTO_IPV6,
-		.nat_hook	= true,
 		.hooknum	= NF_INET_PRE_ROUTING,
 		.priority	= NF_IP6_PRI_NAT_DST,
 	},
-	/* After packet filtering, change source */
 	{
-		.hook		= ip6table_nat_out,
+		.hook		= ip6table_nat_do_chain,
 		.pf		= NFPROTO_IPV6,
-		.nat_hook	= true,
 		.hooknum	= NF_INET_POST_ROUTING,
 		.priority	= NF_IP6_PRI_NAT_SRC,
 	},
-	/* Before packet filtering, change destination */
 	{
-		.hook		= ip6table_nat_local_fn,
+		.hook		= ip6table_nat_do_chain,
 		.pf		= NFPROTO_IPV6,
-		.nat_hook	= true,
 		.hooknum	= NF_INET_LOCAL_OUT,
 		.priority	= NF_IP6_PRI_NAT_DST,
 	},
-	/* After packet filtering, change source */
 	{
-		.hook		= ip6table_nat_fn,
-		.nat_hook	= true,
+		.hook		= ip6table_nat_do_chain,
 		.pf		= NFPROTO_IPV6,
 		.hooknum	= NF_INET_LOCAL_IN,
 		.priority	= NF_IP6_PRI_NAT_SRC,
 	},
 };
 
+static int ip6t_nat_register_lookups(struct net *net)
+{
+	int i, ret;
+
+	for (i = 0; i < ARRAY_SIZE(nf_nat_ipv6_ops); i++) {
+		ret = nf_nat_l3proto_ipv6_register_fn(net, &nf_nat_ipv6_ops[i]);
+		if (ret) {
+			while (i)
+				nf_nat_l3proto_ipv6_unregister_fn(net, &nf_nat_ipv6_ops[--i]);
+
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static void ip6t_nat_unregister_lookups(struct net *net)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(nf_nat_ipv6_ops); i++)
+		nf_nat_l3proto_ipv6_unregister_fn(net, &nf_nat_ipv6_ops[i]);
+}
+
 static int __net_init ip6table_nat_table_init(struct net *net)
 {
 	struct ip6t_replace *repl;
@@ -115,7 +104,17 @@ static int __net_init ip6table_nat_table_init(struct net *net)
 	if (repl == NULL)
 		return -ENOMEM;
 	ret = ip6t_register_table(net, &nf_nat_ipv6_table, repl,
-				  nf_nat_ipv6_ops, &net->ipv6.ip6table_nat);
+				  NULL, &net->ipv6.ip6table_nat);
+	if (ret < 0) {
+		kfree(repl);
+		return ret;
+	}
+
+	ret = ip6t_nat_register_lookups(net);
+	if (ret < 0) {
+		ip6t_unregister_table(net, net->ipv6.ip6table_nat, NULL);
+		net->ipv6.ip6table_nat = NULL;
+	}
 	kfree(repl);
 	return ret;
 }
@@ -124,7 +123,8 @@ static void __net_exit ip6table_nat_net_exit(struct net *net)
 {
 	if (!net->ipv6.ip6table_nat)
 		return;
-	ip6t_unregister_table(net, net->ipv6.ip6table_nat, nf_nat_ipv6_ops);
+	ip6t_nat_unregister_lookups(net);
+	ip6t_unregister_table(net, net->ipv6.ip6table_nat, NULL);
 	net->ipv6.ip6table_nat = NULL;
 }
 
diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
index 3ec228984f82..ca6d38698b1a 100644
--- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
@@ -252,12 +252,9 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb,
 }
 EXPORT_SYMBOL_GPL(nf_nat_icmpv6_reply_translation);
 
-unsigned int
+static unsigned int
 nf_nat_ipv6_fn(void *priv, struct sk_buff *skb,
-	       const struct nf_hook_state *state,
-	       unsigned int (*do_chain)(void *priv,
-					struct sk_buff *skb,
-					const struct nf_hook_state *state))
+	       const struct nf_hook_state *state)
 {
 	struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
@@ -289,35 +286,27 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb,
 		}
 	}
 
-	return nf_nat_inet_fn(priv, skb, state, do_chain);
+	return nf_nat_inet_fn(priv, skb, state);
 }
-EXPORT_SYMBOL_GPL(nf_nat_ipv6_fn);
 
-unsigned int
+static unsigned int
 nf_nat_ipv6_in(void *priv, struct sk_buff *skb,
-	       const struct nf_hook_state *state,
-	       unsigned int (*do_chain)(void *priv,
-					struct sk_buff *skb,
-					const struct nf_hook_state *state))
+	       const struct nf_hook_state *state)
 {
 	unsigned int ret;
 	struct in6_addr daddr = ipv6_hdr(skb)->daddr;
 
-	ret = nf_nat_ipv6_fn(priv, skb, state, do_chain);
+	ret = nf_nat_ipv6_fn(priv, skb, state);
 	if (ret != NF_DROP && ret != NF_STOLEN &&
 	    ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr))
 		skb_dst_drop(skb);
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(nf_nat_ipv6_in);
 
-unsigned int
+static unsigned int
 nf_nat_ipv6_out(void *priv, struct sk_buff *skb,
-		const struct nf_hook_state *state,
-		unsigned int (*do_chain)(void *priv,
-					 struct sk_buff *skb,
-					 const struct nf_hook_state *state))
+		const struct nf_hook_state *state)
 {
 #ifdef CONFIG_XFRM
 	const struct nf_conn *ct;
@@ -326,7 +315,7 @@ nf_nat_ipv6_out(void *priv, struct sk_buff *skb,
 #endif
 	unsigned int ret;
 
-	ret = nf_nat_ipv6_fn(priv, skb, state, do_chain);
+	ret = nf_nat_ipv6_fn(priv, skb, state);
 #ifdef CONFIG_XFRM
 	if (ret != NF_DROP && ret != NF_STOLEN &&
 	    !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
@@ -346,21 +335,17 @@ nf_nat_ipv6_out(void *priv, struct sk_buff *skb,
 #endif
 	return ret;
 }
-EXPORT_SYMBOL_GPL(nf_nat_ipv6_out);
 
-unsigned int
+static unsigned int
 nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb,
-		     const struct nf_hook_state *state,
-		     unsigned int (*do_chain)(void *priv,
-					      struct sk_buff *skb,
-					      const struct nf_hook_state *state))
+		     const struct nf_hook_state *state)
 {
 	const struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
 	unsigned int ret;
 	int err;
 
-	ret = nf_nat_ipv6_fn(priv, skb, state, do_chain);
+	ret = nf_nat_ipv6_fn(priv, skb, state);
 	if (ret != NF_DROP && ret != NF_STOLEN &&
 	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
 		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
@@ -384,7 +369,49 @@ nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb,
 	}
 	return ret;
 }
-EXPORT_SYMBOL_GPL(nf_nat_ipv6_local_fn);
+
+static const struct nf_hook_ops nf_nat_ipv6_ops[] = {
+	/* Before packet filtering, change destination */
+	{
+		.hook		= nf_nat_ipv6_in,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_PRE_ROUTING,
+		.priority	= NF_IP6_PRI_NAT_DST,
+	},
+	/* After packet filtering, change source */
+	{
+		.hook		= nf_nat_ipv6_out,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_POST_ROUTING,
+		.priority	= NF_IP6_PRI_NAT_SRC,
+	},
+	/* Before packet filtering, change destination */
+	{
+		.hook		= nf_nat_ipv6_local_fn,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP6_PRI_NAT_DST,
+	},
+	/* After packet filtering, change source */
+	{
+		.hook		= nf_nat_ipv6_fn,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP6_PRI_NAT_SRC,
+	},
+};
+
+int nf_nat_l3proto_ipv6_register_fn(struct net *net, const struct nf_hook_ops *ops)
+{
+	return nf_nat_register_fn(net, ops, nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops));
+}
+EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv6_register_fn);
+
+void nf_nat_l3proto_ipv6_unregister_fn(struct net *net, const struct nf_hook_ops *ops)
+{
+	nf_nat_unregister_fn(net, ops, ARRAY_SIZE(nf_nat_ipv6_ops));
+}
+EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv6_unregister_fn);
 
 static int __init nf_nat_l3proto_ipv6_init(void)
 {
diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
index 05bcb2c23125..8a081ad7d5db 100644
--- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c
+++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
@@ -36,50 +36,14 @@ static unsigned int nft_nat_do_chain(void *priv,
 	return nft_do_chain(&pkt, priv);
 }
 
-static unsigned int nft_nat_ipv6_fn(void *priv,
-				    struct sk_buff *skb,
-				    const struct nf_hook_state *state)
-{
-	return nf_nat_ipv6_fn(priv, skb, state, nft_nat_do_chain);
-}
-
-static unsigned int nft_nat_ipv6_in(void *priv,
-				    struct sk_buff *skb,
-				    const struct nf_hook_state *state)
-{
-	return nf_nat_ipv6_in(priv, skb, state, nft_nat_do_chain);
-}
-
-static unsigned int nft_nat_ipv6_out(void *priv,
-				     struct sk_buff *skb,
-				     const struct nf_hook_state *state)
-{
-	return nf_nat_ipv6_out(priv, skb, state, nft_nat_do_chain);
-}
-
-static unsigned int nft_nat_ipv6_local_fn(void *priv,
-					  struct sk_buff *skb,
-					  const struct nf_hook_state *state)
-{
-	return nf_nat_ipv6_local_fn(priv, skb, state, nft_nat_do_chain);
-}
-
 static int nft_nat_ipv6_reg(struct net *net, const struct nf_hook_ops *ops)
 {
-	int ret = nf_register_net_hook(net, ops);
-	if (ret == 0) {
-		ret = nf_ct_netns_get(net, NFPROTO_IPV6);
-		if (ret)
-			 nf_unregister_net_hook(net, ops);
-	}
-
-	return ret;
+	return nf_nat_l3proto_ipv6_register_fn(net, ops);
 }
 
 static void nft_nat_ipv6_unreg(struct net *net, const struct nf_hook_ops *ops)
 {
-	nf_unregister_net_hook(net, ops);
-	nf_ct_netns_put(net, NFPROTO_IPV6);
+	nf_nat_l3proto_ipv6_unregister_fn(net, ops);
 }
 
 static const struct nft_chain_type nft_chain_nat_ipv6 = {
@@ -92,10 +56,10 @@ static const struct nft_chain_type nft_chain_nat_ipv6 = {
 			  (1 << NF_INET_LOCAL_OUT) |
 			  (1 << NF_INET_LOCAL_IN),
 	.hooks		= {
-		[NF_INET_PRE_ROUTING]	= nft_nat_ipv6_in,
-		[NF_INET_POST_ROUTING]	= nft_nat_ipv6_out,
-		[NF_INET_LOCAL_OUT]	= nft_nat_ipv6_local_fn,
-		[NF_INET_LOCAL_IN]	= nft_nat_ipv6_fn,
+		[NF_INET_PRE_ROUTING]	= nft_nat_do_chain,
+		[NF_INET_POST_ROUTING]	= nft_nat_do_chain,
+		[NF_INET_LOCAL_OUT]	= nft_nat_do_chain,
+		[NF_INET_LOCAL_IN]	= nft_nat_do_chain,
 	},
 	.ops_register		= nft_nat_ipv6_reg,
 	.ops_unregister		= nft_nat_ipv6_unreg,
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index f531d77dd684..489599b549cf 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -533,10 +533,7 @@ EXPORT_SYMBOL_GPL(nf_nat_packet);
 
 unsigned int
 nf_nat_inet_fn(void *priv, struct sk_buff *skb,
-	       const struct nf_hook_state *state,
-	       unsigned int (*do_chain)(void *priv,
-					struct sk_buff *skb,
-					const struct nf_hook_state *state))
+	       const struct nf_hook_state *state)
 {
 	struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
@@ -564,15 +561,23 @@ nf_nat_inet_fn(void *priv, struct sk_buff *skb,
 		 * or local packets.
 		 */
 		if (!nf_nat_initialized(ct, maniptype)) {
+			struct nf_nat_lookup_hook_priv *lpriv = priv;
+			struct nf_hook_entries *e = rcu_dereference(lpriv->entries);
 			unsigned int ret;
-
-			ret = do_chain(priv, skb, state);
-			if (ret != NF_ACCEPT)
-				return ret;
-
-			if (nf_nat_initialized(ct, HOOK2MANIP(state->hook)))
-				break;
-
+			int i;
+
+			if (!e)
+				goto null_bind;
+
+			for (i = 0; i < e->num_hook_entries; i++) {
+				ret = e->hooks[i].hook(e->hooks[i].priv, skb,
+						       state);
+				if (ret != NF_ACCEPT)
+					return ret;
+				if (nf_nat_initialized(ct, maniptype))
+					goto do_nat;
+			}
+null_bind:
 			ret = nf_nat_alloc_null_binding(ct, state->hook);
 			if (ret != NF_ACCEPT)
 				return ret;
@@ -592,7 +597,7 @@ nf_nat_inet_fn(void *priv, struct sk_buff *skb,
 		if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
 			goto oif_changed;
 	}
-
+do_nat:
 	return nf_nat_packet(ct, ctinfo, state->hook, skb);
 
 oif_changed:
-- 
cgit v1.2.3-59-g8ed1b


From a37061a678cab6d164f2989dd6f3b65f730289c7 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 14 May 2018 23:46:59 +0200
Subject: netfilter: lift one-nat-hook-only restriction

This reverts commit f92b40a8b2645
("netfilter: core: only allow one nat hook per hook point"), this
limitation is no longer needed.  The nat core now invokes these
functions and makes sure that hook evaluation stops after a mapping is
created and a null binding is created otherwise.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h     |  1 -
 net/netfilter/core.c          |  5 ----
 net/netfilter/nf_tables_api.c | 66 ++-----------------------------------------
 3 files changed, 2 insertions(+), 70 deletions(-)

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 85a1a0b32c66..72f5871b9a0a 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -67,7 +67,6 @@ struct nf_hook_ops {
 	struct net_device	*dev;
 	void			*priv;
 	u_int8_t		pf;
-	bool			nat_hook;
 	unsigned int		hooknum;
 	/* Hooks are ordered in ascending priority. */
 	int			priority;
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 5f0ebf9a8d5b..907d6ef8f3c1 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -138,11 +138,6 @@ nf_hook_entries_grow(const struct nf_hook_entries *old,
 			continue;
 		}
 
-		if (reg->nat_hook && orig_ops[i]->nat_hook) {
-			kvfree(new);
-			return ERR_PTR(-EBUSY);
-		}
-
 		if (inserted || reg->priority > orig_ops[i]->priority) {
 			new_ops[nhooks] = (void *)orig_ops[i];
 			new->hooks[nhooks] = old->hooks[i];
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index ded54b2abfbc..a2bb31472aa1 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -74,64 +74,12 @@ static void nft_trans_destroy(struct nft_trans *trans)
 	kfree(trans);
 }
 
-/* removal requests are queued in the commit_list, but not acted upon
- * until after all new rules are in place.
- *
- * Therefore, nf_register_net_hook(net, &nat_hook) runs before pending
- * nf_unregister_net_hook().
- *
- * nf_register_net_hook thus fails if a nat hook is already in place
- * even if the conflicting hook is about to be removed.
- *
- * If collision is detected, search commit_log for DELCHAIN matching
- * the new nat hooknum; if we find one collision is temporary:
- *
- * Either transaction is aborted (new/colliding hook is removed), or
- * transaction is committed (old hook is removed).
- */
-static bool nf_tables_allow_nat_conflict(const struct net *net,
-					 const struct nf_hook_ops *ops)
-{
-	const struct nft_trans *trans;
-	bool ret = false;
-
-	if (!ops->nat_hook)
-		return false;
-
-	list_for_each_entry(trans, &net->nft.commit_list, list) {
-		const struct nf_hook_ops *pending_ops;
-		const struct nft_chain *pending;
-
-		if (trans->msg_type != NFT_MSG_NEWCHAIN &&
-		    trans->msg_type != NFT_MSG_DELCHAIN)
-			continue;
-
-		pending = trans->ctx.chain;
-		if (!nft_is_base_chain(pending))
-			continue;
-
-		pending_ops = &nft_base_chain(pending)->ops;
-		if (pending_ops->nat_hook &&
-		    pending_ops->pf == ops->pf &&
-		    pending_ops->hooknum == ops->hooknum) {
-			/* other hook registration already pending? */
-			if (trans->msg_type == NFT_MSG_NEWCHAIN)
-				return false;
-
-			ret = true;
-		}
-	}
-
-	return ret;
-}
-
 static int nf_tables_register_hook(struct net *net,
 				   const struct nft_table *table,
 				   struct nft_chain *chain)
 {
 	const struct nft_base_chain *basechain;
-	struct nf_hook_ops *ops;
-	int ret;
+	const struct nf_hook_ops *ops;
 
 	if (table->flags & NFT_TABLE_F_DORMANT ||
 	    !nft_is_base_chain(chain))
@@ -143,14 +91,7 @@ static int nf_tables_register_hook(struct net *net,
 	if (basechain->type->ops_register)
 		return basechain->type->ops_register(net, ops);
 
-	ret = nf_register_net_hook(net, ops);
-	if (ret == -EBUSY && nf_tables_allow_nat_conflict(net, ops)) {
-		ops->nat_hook = false;
-		ret = nf_register_net_hook(net, ops);
-		ops->nat_hook = true;
-	}
-
-	return ret;
+	return nf_register_net_hook(net, ops);
 }
 
 static void nf_tables_unregister_hook(struct net *net,
@@ -1418,9 +1359,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 		ops->hook	= hook.type->hooks[ops->hooknum];
 		ops->dev	= hook.dev;
 
-		if (basechain->type->type == NFT_CHAIN_T_NAT)
-			ops->nat_hook = true;
-
 		chain->flags |= NFT_BASE_CHAIN;
 		basechain->policy = policy;
 	} else {
-- 
cgit v1.2.3-59-g8ed1b


From 57a4c9c5c7dd582ad6172bdbd81543c3f4a1fcd8 Mon Sep 17 00:00:00 2001
From: Fernando Fernandez Mancera <ffmancera@riseup.net>
Date: Wed, 16 May 2018 01:07:58 +0200
Subject: netfilter: make NF_OSF non-visible symbol

Signed-off-by: Fernando Fernandez Mancera <ffmancera@riseup.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index e57c9d479503..a5b60e6a983e 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -445,7 +445,7 @@ config NETFILTER_SYNPROXY
 endif # NF_CONNTRACK
 
 config NF_OSF
-	tristate 'Passive OS fingerprint infrastructure'
+	tristate
 
 config NF_TABLES
 	select NETFILTER_NETLINK
-- 
cgit v1.2.3-59-g8ed1b


From 8d8540c4f5e03d847c004e71d6a577bf4f8c78cd Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 16 May 2018 22:58:34 +0200
Subject: netfilter: nft_set_rbtree: add timeout support

Add garbage collection logic to expire elements stored in the rb-tree
representation.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_set_rbtree.c | 75 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 72 insertions(+), 3 deletions(-)

diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 22c57d7612c4..d260ce2d6671 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -22,6 +22,7 @@ struct nft_rbtree {
 	struct rb_root		root;
 	rwlock_t		lock;
 	seqcount_t		count;
+	struct delayed_work	gc_work;
 };
 
 struct nft_rbtree_elem {
@@ -265,6 +266,7 @@ static void nft_rbtree_activate(const struct net *net,
 	struct nft_rbtree_elem *rbe = elem->priv;
 
 	nft_set_elem_change_active(net, set, &rbe->ext);
+	nft_set_elem_clear_busy(&rbe->ext);
 }
 
 static bool nft_rbtree_flush(const struct net *net,
@@ -272,8 +274,12 @@ static bool nft_rbtree_flush(const struct net *net,
 {
 	struct nft_rbtree_elem *rbe = priv;
 
-	nft_set_elem_change_active(net, set, &rbe->ext);
-	return true;
+	if (!nft_set_elem_mark_busy(&rbe->ext) ||
+	    !nft_is_active(net, &rbe->ext)) {
+		nft_set_elem_change_active(net, set, &rbe->ext);
+		return true;
+	}
+	return false;
 }
 
 static void *nft_rbtree_deactivate(const struct net *net,
@@ -347,6 +353,62 @@ cont:
 	read_unlock_bh(&priv->lock);
 }
 
+static void nft_rbtree_gc(struct work_struct *work)
+{
+	struct nft_set_gc_batch *gcb = NULL;
+	struct rb_node *node, *prev = NULL;
+	struct nft_rbtree_elem *rbe;
+	struct nft_rbtree *priv;
+	struct nft_set *set;
+	int i;
+
+	priv = container_of(work, struct nft_rbtree, gc_work.work);
+	set  = nft_set_container_of(priv);
+
+	write_lock_bh(&priv->lock);
+	write_seqcount_begin(&priv->count);
+	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
+		rbe = rb_entry(node, struct nft_rbtree_elem, node);
+
+		if (nft_rbtree_interval_end(rbe)) {
+			prev = node;
+			continue;
+		}
+		if (!nft_set_elem_expired(&rbe->ext))
+			continue;
+		if (nft_set_elem_mark_busy(&rbe->ext))
+			continue;
+
+		gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
+		if (!gcb)
+			goto out;
+
+		atomic_dec(&set->nelems);
+		nft_set_gc_batch_add(gcb, rbe);
+
+		if (prev) {
+			rbe = rb_entry(prev, struct nft_rbtree_elem, node);
+			atomic_dec(&set->nelems);
+			nft_set_gc_batch_add(gcb, rbe);
+		}
+		node = rb_next(node);
+	}
+out:
+	if (gcb) {
+		for (i = 0; i < gcb->head.cnt; i++) {
+			rbe = gcb->elems[i];
+			rb_erase(&rbe->node, &priv->root);
+		}
+	}
+	write_seqcount_end(&priv->count);
+	write_unlock_bh(&priv->lock);
+
+	nft_set_gc_batch_complete(gcb);
+
+	queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
+			   nft_set_gc_interval(set));
+}
+
 static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[],
 					const struct nft_set_desc *desc)
 {
@@ -362,6 +424,12 @@ static int nft_rbtree_init(const struct nft_set *set,
 	rwlock_init(&priv->lock);
 	seqcount_init(&priv->count);
 	priv->root = RB_ROOT;
+
+	INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc);
+	if (set->flags & NFT_SET_TIMEOUT)
+		queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
+				   nft_set_gc_interval(set));
+
 	return 0;
 }
 
@@ -371,6 +439,7 @@ static void nft_rbtree_destroy(const struct nft_set *set)
 	struct nft_rbtree_elem *rbe;
 	struct rb_node *node;
 
+	cancel_delayed_work_sync(&priv->gc_work);
 	while ((node = priv->root.rb_node) != NULL) {
 		rb_erase(node, &priv->root);
 		rbe = rb_entry(node, struct nft_rbtree_elem, node);
@@ -395,7 +464,7 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
 
 static struct nft_set_type nft_rbtree_type __read_mostly = {
 	.owner		= THIS_MODULE,
-	.features	= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT,
+	.features	= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
 	.ops		= {
 		.privsize	= nft_rbtree_privsize,
 		.elemsize	= offsetof(struct nft_rbtree_elem, ext),
-- 
cgit v1.2.3-59-g8ed1b


From cede24d1b21d68d84ac5a36c44f7d37daadcc258 Mon Sep 17 00:00:00 2001
From: Vincent Bernat <vincent@bernat.im>
Date: Sun, 20 May 2018 13:03:38 +0200
Subject: netfilter: ip6t_rpfilter: provide input interface for route lookup

In commit 47b7e7f82802, this bit was removed at the same time the
RT6_LOOKUP_F_IFACE flag was removed. However, it is needed when
link-local addresses are used, which is a very common case: when
packets are routed, neighbor solicitations are done using link-local
addresses. For example, the following neighbor solicitation is not
matched by "-m rpfilter":

    IP6 fe80::5254:33ff:fe00:1 > ff02::1:ff00:3: ICMP6, neighbor
    solicitation, who has 2001:db8::5254:33ff:fe00:3, length 32

Commit 47b7e7f82802 doesn't quite explain why we shouldn't use
RT6_LOOKUP_F_IFACE in the rpfilter case. I suppose the interface check
later in the function would make it redundant. However, the remaining
of the routing code is using RT6_LOOKUP_F_IFACE when there is no
source address (which matches rpfilter's case with a non-unicast
destination, like with neighbor solicitation).

Signed-off-by: Vincent Bernat <vincent@bernat.im>
Fixes: 47b7e7f82802 ("netfilter: don't set F_IFACE on ipv6 fib lookups")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/ip6t_rpfilter.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c
index d12f511929f5..0fe61ede77c6 100644
--- a/net/ipv6/netfilter/ip6t_rpfilter.c
+++ b/net/ipv6/netfilter/ip6t_rpfilter.c
@@ -48,6 +48,8 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb,
 	}
 
 	fl6.flowi6_mark = flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
+	if ((flags & XT_RPFILTER_LOOSE) == 0)
+		fl6.flowi6_oif = dev->ifindex;
 
 	rt = (void *)ip6_route_lookup(net, &fl6, skb, lookup_flags);
 	if (rt->dst.error)
-- 
cgit v1.2.3-59-g8ed1b


From 1f4b24397d3e164dde7026b91056e67304724fb6 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 23 May 2018 09:17:12 +0200
Subject: netfilter: add struct nf_ct_hook and use it

Move the nf_ct_destroy indirection to the struct nf_ct_hook.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h         |  7 ++++++-
 net/netfilter/core.c              | 14 +++++++-------
 net/netfilter/nf_conntrack_core.c |  9 ++++++---
 3 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 72f5871b9a0a..75ded6f6eebe 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -373,13 +373,18 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family)
 
 extern void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) __rcu;
 void nf_ct_attach(struct sk_buff *, const struct sk_buff *);
-extern void (*nf_ct_destroy)(struct nf_conntrack *) __rcu;
 #else
 static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {}
 #endif
 
 struct nf_conn;
 enum ip_conntrack_info;
+
+struct nf_ct_hook {
+	void (*destroy)(struct nf_conntrack *);
+};
+extern struct nf_ct_hook __rcu *nf_ct_hook;
+
 struct nlattr;
 
 struct nfnl_ct_hook {
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 907d6ef8f3c1..1bd844ea1d7c 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -563,6 +563,9 @@ EXPORT_SYMBOL(skb_make_writable);
 struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly;
 EXPORT_SYMBOL_GPL(nfnl_ct_hook);
 
+struct nf_ct_hook __rcu *nf_ct_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_hook);
+
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 /* This does not belong here, but locally generated errors need it if connection
    tracking in use: without this, connection may not be in hash table, and hence
@@ -585,17 +588,14 @@ void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb)
 }
 EXPORT_SYMBOL(nf_ct_attach);
 
-void (*nf_ct_destroy)(struct nf_conntrack *) __rcu __read_mostly;
-EXPORT_SYMBOL(nf_ct_destroy);
-
 void nf_conntrack_destroy(struct nf_conntrack *nfct)
 {
-	void (*destroy)(struct nf_conntrack *);
+	struct nf_ct_hook *ct_hook;
 
 	rcu_read_lock();
-	destroy = rcu_dereference(nf_ct_destroy);
-	BUG_ON(destroy == NULL);
-	destroy(nfct);
+	ct_hook = rcu_dereference(nf_ct_hook);
+	BUG_ON(ct_hook == NULL);
+	ct_hook->destroy(nfct);
 	rcu_read_unlock();
 }
 EXPORT_SYMBOL(nf_conntrack_destroy);
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 605441727008..8b2a8644d955 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1813,8 +1813,7 @@ void nf_conntrack_cleanup_start(void)
 
 void nf_conntrack_cleanup_end(void)
 {
-	RCU_INIT_POINTER(nf_ct_destroy, NULL);
-
+	RCU_INIT_POINTER(nf_ct_hook, NULL);
 	cancel_delayed_work_sync(&conntrack_gc_work.dwork);
 	nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
 
@@ -2131,11 +2130,15 @@ err_cachep:
 	return ret;
 }
 
+static struct nf_ct_hook nf_conntrack_hook = {
+	.destroy	= destroy_conntrack,
+};
+
 void nf_conntrack_init_end(void)
 {
 	/* For use by REJECT target */
 	RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
-	RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack);
+	RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook);
 }
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 2c205dd3981f79cef097207ba9c61c2260812f39 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 23 May 2018 09:17:19 +0200
Subject: netfilter: add struct nf_nat_hook and use it

Move decode_session() and parse_nat_setup_hook() indirections to struct
nf_nat_hook structure.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h            | 21 ++++++++++++++++-----
 include/net/netfilter/nf_nat_core.h  |  7 -------
 net/netfilter/core.c                 |  8 +++-----
 net/netfilter/nf_conntrack_core.c    |  5 -----
 net/netfilter/nf_conntrack_netlink.c | 10 +++++-----
 net/netfilter/nf_nat_core.c          | 23 ++++++++++++-----------
 6 files changed, 36 insertions(+), 38 deletions(-)

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 75ded6f6eebe..e8d09dc028f6 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -320,18 +320,29 @@ int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
 int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry);
 
 #include <net/flow.h>
-extern void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *);
+
+struct nf_conn;
+enum nf_nat_manip_type;
+struct nlattr;
+
+struct nf_nat_hook {
+	int (*parse_nat_setup)(struct nf_conn *ct, enum nf_nat_manip_type manip,
+			       const struct nlattr *attr);
+	void (*decode_session)(struct sk_buff *skb, struct flowi *fl);
+};
+
+extern struct nf_nat_hook __rcu *nf_nat_hook;
 
 static inline void
 nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family)
 {
 #ifdef CONFIG_NF_NAT_NEEDED
-	void (*decodefn)(struct sk_buff *, struct flowi *);
+	struct nf_nat_hook *nat_hook;
 
 	rcu_read_lock();
-	decodefn = rcu_dereference(nf_nat_decode_session_hook);
-	if (decodefn)
-		decodefn(skb, fl);
+	nat_hook = rcu_dereference(nf_nat_hook);
+	if (nat_hook->decode_session)
+		nat_hook->decode_session(skb, fl);
 	rcu_read_unlock();
 #endif
 }
diff --git a/include/net/netfilter/nf_nat_core.h b/include/net/netfilter/nf_nat_core.h
index c78e9be14b3d..dc7cd0440229 100644
--- a/include/net/netfilter/nf_nat_core.h
+++ b/include/net/netfilter/nf_nat_core.h
@@ -26,11 +26,4 @@ static inline int nf_nat_initialized(struct nf_conn *ct,
 		return ct->status & IPS_DST_NAT_DONE;
 }
 
-struct nlattr;
-
-extern int
-(*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
-				  enum nf_nat_manip_type manip,
-				  const struct nlattr *attr);
-
 #endif /* _NF_NAT_CORE_H */
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 1bd844ea1d7c..e0ae4aae96f5 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -574,6 +574,9 @@ void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *)
 		__rcu __read_mostly;
 EXPORT_SYMBOL(ip_ct_attach);
 
+struct nf_nat_hook __rcu *nf_nat_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nf_nat_hook);
+
 void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb)
 {
 	void (*attach)(struct sk_buff *, const struct sk_buff *);
@@ -608,11 +611,6 @@ const struct nf_conntrack_zone nf_ct_zone_dflt = {
 EXPORT_SYMBOL_GPL(nf_ct_zone_dflt);
 #endif /* CONFIG_NF_CONNTRACK */
 
-#ifdef CONFIG_NF_NAT_NEEDED
-void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *);
-EXPORT_SYMBOL(nf_nat_decode_session_hook);
-#endif
-
 static void __net_init __netfilter_net_init(struct nf_hook_entries **e, int max)
 {
 	int h;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 8b2a8644d955..8d109d750073 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -58,11 +58,6 @@
 
 #include "nf_internals.h"
 
-int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
-				      enum nf_nat_manip_type manip,
-				      const struct nlattr *attr) __read_mostly;
-EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);
-
 __cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
 EXPORT_SYMBOL_GPL(nf_conntrack_locks);
 
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index d807b8770be3..39327a42879f 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1431,11 +1431,11 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct,
 			  enum nf_nat_manip_type manip,
 			  const struct nlattr *attr)
 {
-	typeof(nfnetlink_parse_nat_setup_hook) parse_nat_setup;
+	struct nf_nat_hook *nat_hook;
 	int err;
 
-	parse_nat_setup = rcu_dereference(nfnetlink_parse_nat_setup_hook);
-	if (!parse_nat_setup) {
+	nat_hook = rcu_dereference(nf_nat_hook);
+	if (!nat_hook) {
 #ifdef CONFIG_MODULES
 		rcu_read_unlock();
 		nfnl_unlock(NFNL_SUBSYS_CTNETLINK);
@@ -1446,13 +1446,13 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct,
 		}
 		nfnl_lock(NFNL_SUBSYS_CTNETLINK);
 		rcu_read_lock();
-		if (nfnetlink_parse_nat_setup_hook)
+		if (nat_hook->parse_nat_setup)
 			return -EAGAIN;
 #endif
 		return -EOPNOTSUPP;
 	}
 
-	err = parse_nat_setup(ct, manip, attr);
+	err = nat_hook->parse_nat_setup(ct, manip, attr);
 	if (err == -EAGAIN) {
 #ifdef CONFIG_MODULES
 		rcu_read_unlock();
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 489599b549cf..f4d264676cfe 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -1026,6 +1026,13 @@ static struct pernet_operations nat_net_ops = {
 	.size = sizeof(struct nat_net),
 };
 
+struct nf_nat_hook nat_hook = {
+	.parse_nat_setup	= nfnetlink_parse_nat_setup,
+#ifdef CONFIG_XFRM
+	.decode_session		= __nf_nat_decode_session,
+#endif
+};
+
 static int __init nf_nat_init(void)
 {
 	int ret, i;
@@ -1057,13 +1064,9 @@ static int __init nf_nat_init(void)
 
 	nf_ct_helper_expectfn_register(&follow_master_nat);
 
-	BUG_ON(nfnetlink_parse_nat_setup_hook != NULL);
-	RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook,
-			   nfnetlink_parse_nat_setup);
-#ifdef CONFIG_XFRM
-	BUG_ON(nf_nat_decode_session_hook != NULL);
-	RCU_INIT_POINTER(nf_nat_decode_session_hook, __nf_nat_decode_session);
-#endif
+	WARN_ON(nf_nat_hook != NULL);
+	RCU_INIT_POINTER(nf_nat_hook, &nat_hook);
+
 	return 0;
 }
 
@@ -1076,10 +1079,8 @@ static void __exit nf_nat_cleanup(void)
 
 	nf_ct_extend_unregister(&nat_extend);
 	nf_ct_helper_expectfn_unregister(&follow_master_nat);
-	RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL);
-#ifdef CONFIG_XFRM
-	RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL);
-#endif
+	RCU_INIT_POINTER(nf_nat_hook, NULL);
+
 	synchronize_rcu();
 
 	for (i = 0; i < NFPROTO_NUMPROTO; i++)
-- 
cgit v1.2.3-59-g8ed1b


From 368982cd7d1bd41cd39049c794990aca3770db44 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 23 May 2018 09:17:24 +0200
Subject: netfilter: nfnetlink_queue: resolve clash for unconfirmed conntracks

In nfqueue, two consecutive skbuffs may race to create the conntrack
entry. Hence, the one that loses the race gets dropped due to clash in
the insertion into the hashes from the nf_conntrack_confirm() path.

This patch adds a new nf_conntrack_update() function which searches for
possible clashes and resolve them. NAT mangling for the packet losing
race is corrected by using the conntrack information that won race.

In order to avoid direct module dependencies with conntrack and NAT, the
nf_ct_hook and nf_nat_hook structures are used for this purpose.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h         |  5 +++
 net/netfilter/nf_conntrack_core.c | 77 +++++++++++++++++++++++++++++++++++++++
 net/netfilter/nf_nat_core.c       | 41 +++++++++++++--------
 net/netfilter/nfnetlink_queue.c   | 28 ++++++++++++--
 4 files changed, 132 insertions(+), 19 deletions(-)

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index e8d09dc028f6..04551af2ff23 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -324,11 +324,15 @@ int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry);
 struct nf_conn;
 enum nf_nat_manip_type;
 struct nlattr;
+enum ip_conntrack_dir;
 
 struct nf_nat_hook {
 	int (*parse_nat_setup)(struct nf_conn *ct, enum nf_nat_manip_type manip,
 			       const struct nlattr *attr);
 	void (*decode_session)(struct sk_buff *skb, struct flowi *fl);
+	unsigned int (*manip_pkt)(struct sk_buff *skb, struct nf_conn *ct,
+				  enum nf_nat_manip_type mtype,
+				  enum ip_conntrack_dir dir);
 };
 
 extern struct nf_nat_hook __rcu *nf_nat_hook;
@@ -392,6 +396,7 @@ struct nf_conn;
 enum ip_conntrack_info;
 
 struct nf_ct_hook {
+	int (*update)(struct net *net, struct sk_buff *skb);
 	void (*destroy)(struct nf_conntrack *);
 };
 extern struct nf_ct_hook __rcu *nf_ct_hook;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 8d109d750073..3465da2a98bd 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1607,6 +1607,82 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
 	nf_conntrack_get(skb_nfct(nskb));
 }
 
+static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
+{
+	const struct nf_conntrack_l3proto *l3proto;
+	const struct nf_conntrack_l4proto *l4proto;
+	struct nf_conntrack_tuple_hash *h;
+	struct nf_conntrack_tuple tuple;
+	enum ip_conntrack_info ctinfo;
+	struct nf_nat_hook *nat_hook;
+	unsigned int dataoff, status;
+	struct nf_conn *ct;
+	u16 l3num;
+	u8 l4num;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct || nf_ct_is_confirmed(ct))
+		return 0;
+
+	l3num = nf_ct_l3num(ct);
+	l3proto = nf_ct_l3proto_find_get(l3num);
+
+	if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff,
+				 &l4num) <= 0)
+		return -1;
+
+	l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+
+	if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
+			     l4num, net, &tuple, l3proto, l4proto))
+		return -1;
+
+	if (ct->status & IPS_SRC_NAT) {
+		memcpy(tuple.src.u3.all,
+		       ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all,
+		       sizeof(tuple.src.u3.all));
+		tuple.src.u.all =
+			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all;
+	}
+
+	if (ct->status & IPS_DST_NAT) {
+		memcpy(tuple.dst.u3.all,
+		       ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all,
+		       sizeof(tuple.dst.u3.all));
+		tuple.dst.u.all =
+			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all;
+	}
+
+	h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple);
+	if (!h)
+		return 0;
+
+	/* Store status bits of the conntrack that is clashing to re-do NAT
+	 * mangling according to what it has been done already to this packet.
+	 */
+	status = ct->status;
+
+	nf_ct_put(ct);
+	ct = nf_ct_tuplehash_to_ctrack(h);
+	nf_ct_set(skb, ct, ctinfo);
+
+	nat_hook = rcu_dereference(nf_nat_hook);
+	if (!nat_hook)
+		return 0;
+
+	if (status & IPS_SRC_NAT &&
+	    nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC,
+				IP_CT_DIR_ORIGINAL) == NF_DROP)
+		return -1;
+
+	if (status & IPS_DST_NAT &&
+	    nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST,
+				IP_CT_DIR_ORIGINAL) == NF_DROP)
+		return -1;
+
+	return 0;
+}
+
 /* Bring out ya dead! */
 static struct nf_conn *
 get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
@@ -2126,6 +2202,7 @@ err_cachep:
 }
 
 static struct nf_ct_hook nf_conntrack_hook = {
+	.update		= nf_conntrack_update,
 	.destroy	= destroy_conntrack,
 };
 
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index f4d264676cfe..821f8d835f7a 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -493,17 +493,36 @@ nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
 }
 EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding);
 
+static unsigned int nf_nat_manip_pkt(struct sk_buff *skb, struct nf_conn *ct,
+				     enum nf_nat_manip_type mtype,
+				     enum ip_conntrack_dir dir)
+{
+	const struct nf_nat_l3proto *l3proto;
+	const struct nf_nat_l4proto *l4proto;
+	struct nf_conntrack_tuple target;
+
+	/* We are aiming to look like inverse of other direction. */
+	nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+
+	l3proto = __nf_nat_l3proto_find(target.src.l3num);
+	l4proto = __nf_nat_l4proto_find(target.src.l3num,
+					target.dst.protonum);
+	if (!l3proto->manip_pkt(skb, 0, l4proto, &target, mtype))
+		return NF_DROP;
+
+	return NF_ACCEPT;
+}
+
 /* Do packet manipulations according to nf_nat_setup_info. */
 unsigned int nf_nat_packet(struct nf_conn *ct,
 			   enum ip_conntrack_info ctinfo,
 			   unsigned int hooknum,
 			   struct sk_buff *skb)
 {
-	const struct nf_nat_l3proto *l3proto;
-	const struct nf_nat_l4proto *l4proto;
+	enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
 	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	unsigned int verdict = NF_ACCEPT;
 	unsigned long statusbit;
-	enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
 
 	if (mtype == NF_NAT_MANIP_SRC)
 		statusbit = IPS_SRC_NAT;
@@ -515,19 +534,10 @@ unsigned int nf_nat_packet(struct nf_conn *ct,
 		statusbit ^= IPS_NAT_MASK;
 
 	/* Non-atomic: these bits don't change. */
-	if (ct->status & statusbit) {
-		struct nf_conntrack_tuple target;
-
-		/* We are aiming to look like inverse of other direction. */
-		nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+	if (ct->status & statusbit)
+		verdict = nf_nat_manip_pkt(skb, ct, mtype, dir);
 
-		l3proto = __nf_nat_l3proto_find(target.src.l3num);
-		l4proto = __nf_nat_l4proto_find(target.src.l3num,
-						target.dst.protonum);
-		if (!l3proto->manip_pkt(skb, 0, l4proto, &target, mtype))
-			return NF_DROP;
-	}
-	return NF_ACCEPT;
+	return verdict;
 }
 EXPORT_SYMBOL_GPL(nf_nat_packet);
 
@@ -1031,6 +1041,7 @@ struct nf_nat_hook nat_hook = {
 #ifdef CONFIG_XFRM
 	.decode_session		= __nf_nat_decode_session,
 #endif
+	.manip_pkt		= nf_nat_manip_pkt,
 };
 
 static int __init nf_nat_init(void)
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 74a04638ef03..2c173042ac0e 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -227,6 +227,25 @@ find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
 	return entry;
 }
 
+static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
+{
+	struct nf_ct_hook *ct_hook;
+	int err;
+
+	if (verdict == NF_ACCEPT ||
+	    verdict == NF_STOP) {
+		rcu_read_lock();
+		ct_hook = rcu_dereference(nf_ct_hook);
+		if (ct_hook) {
+			err = ct_hook->update(entry->state.net, entry->skb);
+			if (err < 0)
+				verdict = NF_DROP;
+		}
+		rcu_read_unlock();
+	}
+	nf_reinject(entry, verdict);
+}
+
 static void
 nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
 {
@@ -237,7 +256,7 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
 		if (!cmpfn || cmpfn(entry, data)) {
 			list_del(&entry->list);
 			queue->queue_total--;
-			nf_reinject(entry, NF_DROP);
+			nfqnl_reinject(entry, NF_DROP);
 		}
 	}
 	spin_unlock_bh(&queue->lock);
@@ -686,7 +705,7 @@ err_out_free_nskb:
 err_out_unlock:
 	spin_unlock_bh(&queue->lock);
 	if (failopen)
-		nf_reinject(entry, NF_ACCEPT);
+		nfqnl_reinject(entry, NF_ACCEPT);
 err_out:
 	return err;
 }
@@ -1085,7 +1104,8 @@ static int nfqnl_recv_verdict_batch(struct net *net, struct sock *ctnl,
 	list_for_each_entry_safe(entry, tmp, &batch_list, list) {
 		if (nfqa[NFQA_MARK])
 			entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
-		nf_reinject(entry, verdict);
+
+		nfqnl_reinject(entry, verdict);
 	}
 	return 0;
 }
@@ -1208,7 +1228,7 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
 	if (nfqa[NFQA_MARK])
 		entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
 
-	nf_reinject(entry, verdict);
+	nfqnl_reinject(entry, verdict);
 	return 0;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 0d45d3fe42efc76b6c4f5a62f8d110c7a2e6f83f Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Tue, 15 May 2018 12:08:14 +0200
Subject: mt76x2: apply coverage class on slot time too

According to 802.11-2007 17.3.8.6 (slot time), the slot time should
be increased by 3 us * coverage class. Taking into account coverage
class in slot time configuration allows to increase by an order of
magnitude the throughput on a 4Km link in a noisy environment

Tested-by: Luca Bisti <luca.bisti@gmail.com>
Tested-by: Gaetano Catalli <gaetano.catalli@gmail.com>
Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Acked-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/mt76x2_init.c | 3 +++
 drivers/net/wireless/mediatek/mt76/mt76x2_main.c | 3 +--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_init.c b/drivers/net/wireless/mediatek/mt76/mt76x2_init.c
index dd4c1127797e..276c6b6bec95 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_init.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_init.c
@@ -482,7 +482,10 @@ void mt76x2_set_tx_ackto(struct mt76x2_dev *dev)
 {
 	u8 ackto, sifs, slottime = dev->slottime;
 
+	/* As defined by IEEE 802.11-2007 17.3.8.6 */
 	slottime += 3 * dev->coverage_class;
+	mt76_rmw_field(dev, MT_BKOFF_SLOT_CFG,
+		       MT_BKOFF_SLOT_CFG_SLOTTIME, slottime);
 
 	sifs = mt76_get_field(dev, MT_XIFS_TIME_CFG,
 			      MT_XIFS_TIME_CFG_OFDM_SIFS);
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_main.c b/drivers/net/wireless/mediatek/mt76/mt76x2_main.c
index 81c58f865c64..539dda8a59e2 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_main.c
@@ -247,8 +247,7 @@ mt76x2_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 		int slottime = info->use_short_slot ? 9 : 20;
 
 		dev->slottime = slottime;
-		mt76_rmw_field(dev, MT_BKOFF_SLOT_CFG,
-			       MT_BKOFF_SLOT_CFG_SLOTTIME, slottime);
+		mt76x2_set_tx_ackto(dev);
 	}
 
 	mutex_unlock(&dev->mutex);
-- 
cgit v1.2.3-59-g8ed1b


From d98fb328ad103d12c1ebaea92526e4f8670e0fec Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 15 May 2018 14:33:22 +0200
Subject: mt76: fix sending encrypted broadcast packets for secondary
 interfaces

For encryption to work properly, the BSS index needs to be initialized
for the WCID entry used for the interface.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/mt76x2.h      | 3 +++
 drivers/net/wireless/mediatek/mt76/mt76x2_init.c | 3 +++
 drivers/net/wireless/mediatek/mt76/mt76x2_main.c | 2 +-
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2.h b/drivers/net/wireless/mediatek/mt76/mt76x2.h
index a5d1255e4b9c..dc12bbdbb2ee 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2.h
@@ -39,6 +39,9 @@
 
 #define MT_CALIBRATE_INTERVAL	HZ
 
+#define MT_MAX_VIFS		8
+#define MT_VIF_WCID(_n)		(254 - ((_n) & 7))
+
 #include "mt76.h"
 #include "mt76x2_regs.h"
 #include "mt76x2_mac.h"
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_init.c b/drivers/net/wireless/mediatek/mt76/mt76x2_init.c
index 276c6b6bec95..b6f27b949566 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_init.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_init.c
@@ -296,6 +296,9 @@ static int mt76x2_mac_reset(struct mt76x2_dev *dev, bool hard)
 	for (i = 0; i < 256; i++)
 		mt76x2_mac_wcid_setup(dev, i, 0, NULL);
 
+	for (i = 0; i < MT_MAX_VIFS; i++)
+		mt76x2_mac_wcid_setup(dev, MT_VIF_WCID(i), i, NULL);
+
 	for (i = 0; i < 16; i++)
 		for (k = 0; k < 4; k++)
 			mt76x2_mac_shared_key_setup(dev, i, k, NULL);
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_main.c b/drivers/net/wireless/mediatek/mt76/mt76x2_main.c
index 539dda8a59e2..826c2431e31a 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_main.c
@@ -104,7 +104,7 @@ mt76x2_add_interface(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
 		idx += 8;
 
 	mvif->idx = idx;
-	mvif->group_wcid.idx = 254 - idx;
+	mvif->group_wcid.idx = MT_VIF_WCID(idx);
 	mvif->group_wcid.hw_key_idx = -1;
 	mt76x2_txq_init(dev, vif->txq);
 
-- 
cgit v1.2.3-59-g8ed1b


From 66a77cbe63eb326513632cc88e1260e877be8123 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 15 May 2018 14:33:23 +0200
Subject: mt76: discard early received packets if not running yet

If the radio was previously in running state, it can receive some
packets before it is able to process them.
This can lead to a crash if the channel is not initialized yet.
Discard all rx packets until start() is called

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/mt76x2_mac.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_mac.c b/drivers/net/wireless/mediatek/mt76/mt76x2_mac.c
index dab713756004..b49aea4da2d6 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_mac.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_mac.c
@@ -301,6 +301,9 @@ int mt76x2_mac_process_rx(struct mt76x2_dev *dev, struct sk_buff *skb,
 	u8 wcid;
 	int len;
 
+	if (!test_bit(MT76_STATE_RUNNING, &dev->mt76.state))
+		return -EINVAL;
+
 	if (rxinfo & MT_RXINFO_L2PAD)
 		pad_len += 2;
 
-- 
cgit v1.2.3-59-g8ed1b


From 89bc67e3a93ae6199ae02c9e4fe41833da9ba368 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Sun, 20 May 2018 07:43:45 +0200
Subject: mt76: only stop tx queues on offchannel, not during the entire scan

During scans, mac80211 frequently switches back to the home channel to
minimize interruption of ongoing traffic. Keep regular tx queues active
during that time.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/mac80211.c | 5 +++++
 drivers/net/wireless/mediatek/mt76/mt76.h     | 1 +
 drivers/net/wireless/mediatek/mt76/tx.c       | 4 ++--
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c
index 915e61733131..d862e5efd094 100644
--- a/drivers/net/wireless/mediatek/mt76/mac80211.c
+++ b/drivers/net/wireless/mediatek/mt76/mac80211.c
@@ -366,6 +366,11 @@ void mt76_set_channel(struct mt76_dev *dev)
 	struct mt76_channel_state *state;
 	bool offchannel = hw->conf.flags & IEEE80211_CONF_OFFCHANNEL;
 
+	if (offchannel)
+		set_bit(MT76_OFFCHANNEL, &dev->state);
+	else
+		clear_bit(MT76_OFFCHANNEL, &dev->state);
+
 	if (dev->drv->update_survey)
 		dev->drv->update_survey(dev);
 
diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h
index a74e6eef51e9..2d098fac6147 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76.h
@@ -189,6 +189,7 @@ enum {
 	MT76_STATE_RUNNING,
 	MT76_SCANNING,
 	MT76_RESET,
+	MT76_OFFCHANNEL,
 };
 
 struct mt76_hw_cap {
diff --git a/drivers/net/wireless/mediatek/mt76/tx.c b/drivers/net/wireless/mediatek/mt76/tx.c
index 7ecd2d7c5db4..e96956710fb2 100644
--- a/drivers/net/wireless/mediatek/mt76/tx.c
+++ b/drivers/net/wireless/mediatek/mt76/tx.c
@@ -332,7 +332,7 @@ mt76_txq_send_burst(struct mt76_dev *dev, struct mt76_queue *hwq,
 		if (probe)
 			break;
 
-		if (test_bit(MT76_SCANNING, &dev->state) ||
+		if (test_bit(MT76_OFFCHANNEL, &dev->state) ||
 		    test_bit(MT76_RESET, &dev->state))
 			return -EBUSY;
 
@@ -385,7 +385,7 @@ restart:
 		bool empty = false;
 		int cur;
 
-		if (test_bit(MT76_SCANNING, &dev->state) ||
+		if (test_bit(MT76_OFFCHANNEL, &dev->state) ||
 		    test_bit(MT76_RESET, &dev->state))
 			return -EBUSY;
 
-- 
cgit v1.2.3-59-g8ed1b


From a164a94212ceb606d40e92db7a58e9dbcb3ff74e Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Sun, 20 May 2018 07:43:46 +0200
Subject: mt76: prevent tx scheduling during channel change

Re-schedule tx afterwards

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/mt76x2_main.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_main.c b/drivers/net/wireless/mediatek/mt76/mt76x2_main.c
index 826c2431e31a..ce90ff999b49 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_main.c
@@ -124,11 +124,14 @@ mt76x2_set_channel(struct mt76x2_dev *dev, struct cfg80211_chan_def *chandef)
 {
 	int ret;
 
+	cancel_delayed_work_sync(&dev->cal_work);
+
+	set_bit(MT76_RESET, &dev->mt76.state);
+
 	mt76_set_channel(&dev->mt76);
 
 	tasklet_disable(&dev->pre_tbtt_tasklet);
 	tasklet_disable(&dev->dfs_pd.dfs_tasklet);
-	cancel_delayed_work_sync(&dev->cal_work);
 
 	mt76x2_mac_stop(dev, true);
 	ret = mt76x2_phy_set_channel(dev, chandef);
@@ -143,6 +146,10 @@ mt76x2_set_channel(struct mt76x2_dev *dev, struct cfg80211_chan_def *chandef)
 	tasklet_enable(&dev->dfs_pd.dfs_tasklet);
 	tasklet_enable(&dev->pre_tbtt_tasklet);
 
+	clear_bit(MT76_RESET, &dev->mt76.state);
+
+	mt76_txq_schedule_all(&dev->mt76);
+
 	return ret;
 }
 
@@ -452,7 +459,6 @@ mt76x2_sw_scan_complete(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
 
 	clear_bit(MT76_SCANNING, &dev->mt76.state);
 	tasklet_enable(&dev->pre_tbtt_tasklet);
-	mt76_txq_schedule_all(&dev->mt76);
 }
 
 static void
-- 
cgit v1.2.3-59-g8ed1b


From a85b590cf55f0789efcdd347b69c9e8ad6c3dcc7 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Sun, 20 May 2018 07:43:47 +0200
Subject: mt76: move ieee80211_hw allocation to common core

Allows it to be shared between different drivers and locks to be
initialized earlier

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/mac80211.c    | 22 ++++++++++++++++++++--
 drivers/net/wireless/mediatek/mt76/mt76.h        |  2 ++
 drivers/net/wireless/mediatek/mt76/mt76x2_init.c | 14 ++++++--------
 3 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c
index d862e5efd094..d1044e5b25db 100644
--- a/drivers/net/wireless/mediatek/mt76/mac80211.c
+++ b/drivers/net/wireless/mediatek/mt76/mac80211.c
@@ -268,6 +268,26 @@ mt76_check_sband(struct mt76_dev *dev, int band)
 	dev->hw->wiphy->bands[band] = NULL;
 }
 
+struct mt76_dev *
+mt76_alloc_device(unsigned int size, const struct ieee80211_ops *ops)
+{
+	struct ieee80211_hw *hw;
+	struct mt76_dev *dev;
+
+	hw = ieee80211_alloc_hw(size, ops);
+	if (!hw)
+		return NULL;
+
+	dev = hw->priv;
+	dev->hw = hw;
+	spin_lock_init(&dev->rx_lock);
+	spin_lock_init(&dev->lock);
+	spin_lock_init(&dev->cc_lock);
+
+	return dev;
+}
+EXPORT_SYMBOL_GPL(mt76_alloc_device);
+
 int mt76_register_device(struct mt76_dev *dev, bool vht,
 			 struct ieee80211_rate *rates, int n_rates)
 {
@@ -277,8 +297,6 @@ int mt76_register_device(struct mt76_dev *dev, bool vht,
 
 	dev_set_drvdata(dev->dev, dev);
 
-	spin_lock_init(&dev->lock);
-	spin_lock_init(&dev->cc_lock);
 	INIT_LIST_HEAD(&dev->txwi_cache);
 
 	SET_IEEE80211_DEV(hw, dev->dev);
diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h
index 2d098fac6147..bb158f867d7c 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76.h
@@ -377,6 +377,8 @@ mt76_channel_state(struct mt76_dev *dev, struct ieee80211_channel *c)
 	return &msband->chan[idx];
 }
 
+struct mt76_dev *mt76_alloc_device(unsigned int size,
+				   const struct ieee80211_ops *ops);
 int mt76_register_device(struct mt76_dev *dev, bool vht,
 			 struct ieee80211_rate *rates, int n_rates);
 void mt76_unregister_device(struct mt76_dev *dev);
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_init.c b/drivers/net/wireless/mediatek/mt76/mt76x2_init.c
index b6f27b949566..ec1715639a04 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_init.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_init.c
@@ -638,20 +638,18 @@ struct mt76x2_dev *mt76x2_alloc_device(struct device *pdev)
 		.rx_poll_complete = mt76x2_rx_poll_complete,
 		.sta_ps = mt76x2_sta_ps,
 	};
-	struct ieee80211_hw *hw;
 	struct mt76x2_dev *dev;
+	struct mt76_dev *mdev;
 
-	hw = ieee80211_alloc_hw(sizeof(*dev), &mt76x2_ops);
-	if (!hw)
+	mdev = mt76_alloc_device(sizeof(*dev), &mt76x2_ops);
+	if (!mdev)
 		return NULL;
 
-	dev = hw->priv;
-	dev->mt76.dev = pdev;
-	dev->mt76.hw = hw;
-	dev->mt76.drv = &drv_ops;
+	dev = container_of(mdev, struct mt76x2_dev, mt76);
+	mdev->dev = pdev;
+	mdev->drv = &drv_ops;
 	mutex_init(&dev->mutex);
 	spin_lock_init(&dev->irq_lock);
-	spin_lock_init(&dev->mt76.rx_lock);
 
 	return dev;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 26e40d4c0b52c60f13b074d9d6144eb3ac7fb61b Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Sun, 20 May 2018 07:43:48 +0200
Subject: mt76: wait for pending tx to complete before switching channel

Reduces interruption caused by scanning

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/dma.c      |  4 ++++
 drivers/net/wireless/mediatek/mt76/mac80211.c | 16 ++++++++++++++++
 drivers/net/wireless/mediatek/mt76/mt76.h     |  2 ++
 3 files changed, 22 insertions(+)

diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c
index 3518703524e7..3dbedcedc2c4 100644
--- a/drivers/net/wireless/mediatek/mt76/dma.c
+++ b/drivers/net/wireless/mediatek/mt76/dma.c
@@ -178,6 +178,10 @@ mt76_dma_tx_cleanup(struct mt76_dev *dev, enum mt76_txq_id qid, bool flush)
 		mt76_dma_sync_idx(dev, q);
 
 	wake = wake && qid < IEEE80211_NUM_ACS && q->queued < q->ndesc - 8;
+
+	if (!q->queued)
+		wake_up(&dev->tx_wait);
+
 	spin_unlock_bh(&q->lock);
 
 	if (wake)
diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c
index d1044e5b25db..fcd079a96782 100644
--- a/drivers/net/wireless/mediatek/mt76/mac80211.c
+++ b/drivers/net/wireless/mediatek/mt76/mac80211.c
@@ -283,6 +283,7 @@ mt76_alloc_device(unsigned int size, const struct ieee80211_ops *ops)
 	spin_lock_init(&dev->rx_lock);
 	spin_lock_init(&dev->lock);
 	spin_lock_init(&dev->cc_lock);
+	init_waitqueue_head(&dev->tx_wait);
 
 	return dev;
 }
@@ -377,18 +378,33 @@ void mt76_rx(struct mt76_dev *dev, enum mt76_rxq_id q, struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(mt76_rx);
 
+static bool mt76_has_tx_pending(struct mt76_dev *dev)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(dev->q_tx); i++) {
+		if (dev->q_tx[i].queued)
+			return true;
+	}
+
+	return false;
+}
+
 void mt76_set_channel(struct mt76_dev *dev)
 {
 	struct ieee80211_hw *hw = dev->hw;
 	struct cfg80211_chan_def *chandef = &hw->conf.chandef;
 	struct mt76_channel_state *state;
 	bool offchannel = hw->conf.flags & IEEE80211_CONF_OFFCHANNEL;
+	int timeout = HZ / 5;
 
 	if (offchannel)
 		set_bit(MT76_OFFCHANNEL, &dev->state);
 	else
 		clear_bit(MT76_OFFCHANNEL, &dev->state);
 
+	wait_event_timeout(dev->tx_wait, !mt76_has_tx_pending(dev), timeout);
+
 	if (dev->drv->update_survey)
 		dev->drv->update_survey(dev);
 
diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h
index bb158f867d7c..d2166fbf50ff 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76.h
@@ -251,6 +251,8 @@ struct mt76_dev {
 	struct mt76_queue q_rx[__MT_RXQ_MAX];
 	const struct mt76_queue_ops *queue_ops;
 
+	wait_queue_head_t tx_wait;
+
 	u8 macaddr[ETH_ALEN];
 	u32 rev;
 	unsigned long state;
-- 
cgit v1.2.3-59-g8ed1b


From cbec83d40cc79137b008d0dd44846c5f2146a83d Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Sun, 20 May 2018 07:43:49 +0200
Subject: mt76: use udelay instead of usleep_range in mt76x2_mac_stop

usleep_range can cause excessive latency on channel change if waiting
for the MAC to stop fails. It will be forced to stop by the code
following that loop anyway.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/mt76x2_init.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_init.c b/drivers/net/wireless/mediatek/mt76/mt76x2_init.c
index ec1715639a04..79ab93613e06 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_init.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_init.c
@@ -376,7 +376,7 @@ void mt76x2_mac_stop(struct mt76x2_dev *dev, bool force)
 		if ((mt76_rr(dev, MT_MAC_STATUS) &
 		     (MT_MAC_STATUS_RX | MT_MAC_STATUS_TX)) ||
 		    mt76_rr(dev, MT_BBP(IBI, 12))) {
-			usleep_range(10, 20);
+			udelay(1);
 			continue;
 		}
 
-- 
cgit v1.2.3-59-g8ed1b


From 30bfce0b63fa68c14ae1613eb9d259fa18644074 Mon Sep 17 00:00:00 2001
From: Xinming Hu <huxm@marvell.com>
Date: Fri, 18 May 2018 15:38:54 +0800
Subject: mwifiex: correct histogram data with appropriate index

Correct snr/nr/rssi data index to avoid possible buffer underflow.

Signed-off-by: Xinming Hu <huxm@marvell.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/marvell/mwifiex/util.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wireless/marvell/mwifiex/util.c b/drivers/net/wireless/marvell/mwifiex/util.c
index 0cd68ffc2c74..51ccf10f4413 100644
--- a/drivers/net/wireless/marvell/mwifiex/util.c
+++ b/drivers/net/wireless/marvell/mwifiex/util.c
@@ -708,12 +708,14 @@ void mwifiex_hist_data_set(struct mwifiex_private *priv, u8 rx_rate, s8 snr,
 			   s8 nflr)
 {
 	struct mwifiex_histogram_data *phist_data = priv->hist_data;
+	s8 nf   = -nflr;
+	s8 rssi = snr - nflr;
 
 	atomic_inc(&phist_data->num_samples);
 	atomic_inc(&phist_data->rx_rate[rx_rate]);
-	atomic_inc(&phist_data->snr[snr]);
-	atomic_inc(&phist_data->noise_flr[128 + nflr]);
-	atomic_inc(&phist_data->sig_str[nflr - snr]);
+	atomic_inc(&phist_data->snr[snr + 128]);
+	atomic_inc(&phist_data->noise_flr[nf + 128]);
+	atomic_inc(&phist_data->sig_str[rssi + 128]);
 }
 
 /* function to reset histogram data during init/reset */
-- 
cgit v1.2.3-59-g8ed1b


From 88001968245c42c26416476bf0ef960442371605 Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <rafal@milecki.pl>
Date: Mon, 14 May 2018 08:48:20 +0200
Subject: brcmfmac: add debugfs entry for reading firmware capabilities
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This allows reading all capabilities as reported by a firmware. They are
printed using native (raw) names, just like developers like it the most.
It's how firmware reports support for various features, e.g. supported
modes, supported standards, power saving details, max BSS-es.

Access to all that info is useful for trying new firmwares, comparing
them and debugging features AKA bugs.

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Reviewed-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 .../wireless/broadcom/brcm80211/brcmfmac/feature.c | 36 ++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/feature.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/feature.c
index 876731c57bf5..800a423c7bc2 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/feature.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/feature.c
@@ -165,6 +165,41 @@ static void brcmf_feat_firmware_capabilities(struct brcmf_if *ifp)
 	}
 }
 
+/**
+ * brcmf_feat_fwcap_debugfs_read() - expose firmware capabilities to debugfs.
+ *
+ * @seq: sequence for debugfs entry.
+ * @data: raw data pointer.
+ */
+static int brcmf_feat_fwcap_debugfs_read(struct seq_file *seq, void *data)
+{
+	struct brcmf_bus *bus_if = dev_get_drvdata(seq->private);
+	struct brcmf_if *ifp = brcmf_get_ifp(bus_if->drvr, 0);
+	char caps[MAX_CAPS_BUFFER_SIZE + 1] = { };
+	char *tmp;
+	int err;
+
+	err = brcmf_fil_iovar_data_get(ifp, "cap", caps, sizeof(caps));
+	if (err) {
+		brcmf_err("could not get firmware cap (%d)\n", err);
+		return err;
+	}
+
+	/* Put every capability in a new line */
+	for (tmp = caps; *tmp; tmp++) {
+		if (*tmp == ' ')
+			*tmp = '\n';
+	}
+
+	/* Usually there is a space at the end of capabilities string */
+	seq_printf(seq, "%s", caps);
+	/* So make sure we don't print two line breaks */
+	if (tmp > caps && *(tmp - 1) != '\n')
+		seq_printf(seq, "\n");
+
+	return 0;
+}
+
 void brcmf_feat_attach(struct brcmf_pub *drvr)
 {
 	struct brcmf_if *ifp = brcmf_get_ifp(drvr, 0);
@@ -233,6 +268,7 @@ void brcmf_feat_attach(struct brcmf_pub *drvr)
 void brcmf_feat_debugfs_create(struct brcmf_pub *drvr)
 {
 	brcmf_debugfs_add_entry(drvr, "features", brcmf_feat_debugfs_read);
+	brcmf_debugfs_add_entry(drvr, "fwcap", brcmf_feat_fwcap_debugfs_read);
 }
 
 bool brcmf_feat_is_enabled(struct brcmf_if *ifp, enum brcmf_feat_id id)
-- 
cgit v1.2.3-59-g8ed1b


From abd39c6ded9db53aa44c2540092bdd5fb6590fa8 Mon Sep 17 00:00:00 2001
From: Sanjay Konduri <sanjay.konduri@redpinesignals.com>
Date: Tue, 15 May 2018 14:34:30 +0530
Subject: rsi: add fix for crash during assertions

Observed crash in some scenarios when assertion has occurred,
this is because hw structure is freed and is tried to get
accessed in some functions where null check is already
present. So, avoided the crash by making the hw to NULL after
freeing.

Signed-off-by: Sanjay Konduri <sanjay.konduri@redpinesignals.com>
Signed-off-by: Sushant Kumar Mishra <sushant.mishra@redpinesignals.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/rsi/rsi_91x_mac80211.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wireless/rsi/rsi_91x_mac80211.c b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
index 3faa0449a5ef..bfa7569c85bb 100644
--- a/drivers/net/wireless/rsi/rsi_91x_mac80211.c
+++ b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
@@ -245,6 +245,7 @@ void rsi_mac80211_detach(struct rsi_hw *adapter)
 		ieee80211_stop_queues(hw);
 		ieee80211_unregister_hw(hw);
 		ieee80211_free_hw(hw);
+		adapter->hw = NULL;
 	}
 
 	for (band = 0; band < NUM_NL80211_BANDS; band++) {
-- 
cgit v1.2.3-59-g8ed1b


From eeed833aaa38712efc7c41d9d6bd5afc8c13c55b Mon Sep 17 00:00:00 2001
From: Sanjay Konduri <sanjay.konduri@redpinesignals.com>
Date: Tue, 15 May 2018 14:34:31 +0530
Subject: rsi: add fix for corruption of auto rate table

Auto rate table sent to firmware is getting corrupted
as memset to zeros is not done. Added memset to skb
data before filling auto rate table.

Signed-off-by: Sanjay Konduri <sanjay.konduri@redpinesignals.com>
Signed-off-by: Sushant Kumar Mishra <sushant.mishra@redpinesignals.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/rsi/rsi_91x_mgmt.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wireless/rsi/rsi_91x_mgmt.c b/drivers/net/wireless/rsi/rsi_91x_mgmt.c
index 0757adcb3f4a..d0e5937cad6d 100644
--- a/drivers/net/wireless/rsi/rsi_91x_mgmt.c
+++ b/drivers/net/wireless/rsi/rsi_91x_mgmt.c
@@ -1190,6 +1190,7 @@ static int rsi_send_auto_rate_request(struct rsi_common *common,
 		return -ENOMEM;
 	}
 
+	memset(skb->data, 0, frame_len);
 	selected_rates = kzalloc(2 * RSI_TBL_SZ, GFP_KERNEL);
 	if (!selected_rates) {
 		rsi_dbg(ERR_ZONE, "%s: Failed in allocation of mem\n",
-- 
cgit v1.2.3-59-g8ed1b


From d4e36e5554eb92f3ec7fedad3efb602570584df4 Mon Sep 17 00:00:00 2001
From: Ilan Peer <ilan.peer@intel.com>
Date: Fri, 20 Apr 2018 13:49:25 +0300
Subject: mac80211: Support adding duration for prepare_tx() callback

There are specific cases, such as SAE authentication exchange, that
might require long duration to complete. For such cases, add support
for indicating to the driver the required duration of the prepare_tx()
operation, so the driver would still be able to complete the frame
exchange.

Currently, indicate the duration only for SAE authentication exchange,
as SAE authentication can take up to 2000 msec (as defined in IEEE
P802.11-REVmd D1.0 p. 3504).

As the patch modified the prepare_tx() callback API, also modify
the relevant code in iwlwifi.

Signed-off-by: Ilan Peer <ilan.peer@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath9k/main.c             |  3 ++-
 drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c |  6 +++++-
 include/net/mac80211.h                            |  5 ++++-
 net/mac80211/driver-ops.h                         |  8 +++++---
 net/mac80211/mlme.c                               | 17 +++++++++------
 net/mac80211/trace.h                              | 25 ++++++++++++++++++++---
 6 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c
index a3be8add56e1..b6663c80e7dd 100644
--- a/drivers/net/wireless/ath/ath9k/main.c
+++ b/drivers/net/wireless/ath/ath9k/main.c
@@ -2544,7 +2544,8 @@ static void ath9k_unassign_vif_chanctx(struct ieee80211_hw *hw,
 }
 
 static void ath9k_mgd_prepare_tx(struct ieee80211_hw *hw,
-				 struct ieee80211_vif *vif)
+				 struct ieee80211_vif *vif,
+				 u16 duration)
 {
 	struct ath_softc *sc = hw->priv;
 	struct ath_common *common = ath9k_hw_common(sc->sc_ah);
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
index 5f0701c992a4..ec91dd90acfd 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
@@ -2844,7 +2844,8 @@ static int iwl_mvm_mac_conf_tx(struct ieee80211_hw *hw,
 }
 
 static void iwl_mvm_mac_mgd_prepare_tx(struct ieee80211_hw *hw,
-				      struct ieee80211_vif *vif)
+				       struct ieee80211_vif *vif,
+				       u16 req_duration)
 {
 	struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw);
 	u32 duration = IWL_MVM_TE_SESSION_PROTECTION_MAX_TIME_MS;
@@ -2857,6 +2858,9 @@ static void iwl_mvm_mac_mgd_prepare_tx(struct ieee80211_hw *hw,
 	if (iwl_mvm_ref_sync(mvm, IWL_MVM_REF_PREPARE_TX))
 		return;
 
+	if (req_duration > duration)
+		duration = req_duration;
+
 	mutex_lock(&mvm->mutex);
 	/* Try really hard to protect the session and hear a beacon */
 	iwl_mvm_protect_session(mvm, vif, duration, min_duration, 500, false);
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 604d738a2128..851a5e19ae32 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -3378,6 +3378,8 @@ enum ieee80211_reconfig_type {
  *	frame in case that no beacon was heard from the AP/P2P GO.
  *	The callback will be called before each transmission and upon return
  *	mac80211 will transmit the frame right away.
+ *      If duration is greater than zero, mac80211 hints to the driver the
+ *      duration for which the operation is requested.
  *	The callback is optional and can (should!) sleep.
  *
  * @mgd_protect_tdls_discover: Protect a TDLS discovery session. After sending
@@ -3697,7 +3699,8 @@ struct ieee80211_ops {
 				  u32 sset, u8 *data);
 
 	void	(*mgd_prepare_tx)(struct ieee80211_hw *hw,
-				  struct ieee80211_vif *vif);
+				  struct ieee80211_vif *vif,
+				  u16 duration);
 
 	void	(*mgd_protect_tdls_discover)(struct ieee80211_hw *hw,
 					     struct ieee80211_vif *vif);
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 4d82fe7d627c..8f6998091d26 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -2,6 +2,7 @@
 /*
 * Portions of this file
 * Copyright(c) 2016 Intel Deutschland GmbH
+* Copyright (C) 2018 Intel Corporation
 */
 
 #ifndef __MAC80211_DRIVER_OPS
@@ -813,7 +814,8 @@ drv_allow_buffered_frames(struct ieee80211_local *local,
 }
 
 static inline void drv_mgd_prepare_tx(struct ieee80211_local *local,
-				      struct ieee80211_sub_if_data *sdata)
+				      struct ieee80211_sub_if_data *sdata,
+				      u16 duration)
 {
 	might_sleep();
 
@@ -821,9 +823,9 @@ static inline void drv_mgd_prepare_tx(struct ieee80211_local *local,
 		return;
 	WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION);
 
-	trace_drv_mgd_prepare_tx(local, sdata);
+	trace_drv_mgd_prepare_tx(local, sdata, duration);
 	if (local->ops->mgd_prepare_tx)
-		local->ops->mgd_prepare_tx(&local->hw, &sdata->vif);
+		local->ops->mgd_prepare_tx(&local->hw, &sdata->vif, duration);
 	trace_drv_return_void(local);
 }
 
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 233068756502..a59187c016e0 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -864,7 +864,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
 		return;
 	}
 
-	drv_mgd_prepare_tx(local, sdata);
+	drv_mgd_prepare_tx(local, sdata, 0);
 
 	IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
 	if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS))
@@ -2022,7 +2022,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 		 */
 		if (ieee80211_hw_check(&local->hw, DEAUTH_NEED_MGD_TX_PREP) &&
 		    !ifmgd->have_beacon)
-			drv_mgd_prepare_tx(sdata->local, sdata);
+			drv_mgd_prepare_tx(sdata->local, sdata, 0);
 
 		ieee80211_send_deauth_disassoc(sdata, ifmgd->bssid, stype,
 					       reason, tx, frame_buf);
@@ -2560,7 +2560,7 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata,
 	if (!elems.challenge)
 		return;
 	auth_data->expected_transaction = 4;
-	drv_mgd_prepare_tx(sdata->local, sdata);
+	drv_mgd_prepare_tx(sdata->local, sdata, 0);
 	if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS))
 		tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS |
 			   IEEE80211_TX_INTFL_MLME_CONN_TX;
@@ -3769,6 +3769,7 @@ static int ieee80211_auth(struct ieee80211_sub_if_data *sdata)
 	u32 tx_flags = 0;
 	u16 trans = 1;
 	u16 status = 0;
+	u16 prepare_tx_duration = 0;
 
 	sdata_assert_lock(sdata);
 
@@ -3790,7 +3791,11 @@ static int ieee80211_auth(struct ieee80211_sub_if_data *sdata)
 		return -ETIMEDOUT;
 	}
 
-	drv_mgd_prepare_tx(local, sdata);
+	if (auth_data->algorithm == WLAN_AUTH_SAE)
+		prepare_tx_duration =
+			jiffies_to_msecs(IEEE80211_AUTH_TIMEOUT_SAE);
+
+	drv_mgd_prepare_tx(local, sdata, prepare_tx_duration);
 
 	sdata_info(sdata, "send auth to %pM (try %d/%d)\n",
 		   auth_data->bss->bssid, auth_data->tries,
@@ -4994,7 +4999,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
 			   req->bssid, req->reason_code,
 			   ieee80211_get_reason_code_string(req->reason_code));
 
-		drv_mgd_prepare_tx(sdata->local, sdata);
+		drv_mgd_prepare_tx(sdata->local, sdata, 0);
 		ieee80211_send_deauth_disassoc(sdata, req->bssid,
 					       IEEE80211_STYPE_DEAUTH,
 					       req->reason_code, tx,
@@ -5014,7 +5019,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
 			   req->bssid, req->reason_code,
 			   ieee80211_get_reason_code_string(req->reason_code));
 
-		drv_mgd_prepare_tx(sdata->local, sdata);
+		drv_mgd_prepare_tx(sdata->local, sdata, 0);
 		ieee80211_send_deauth_disassoc(sdata, req->bssid,
 					       IEEE80211_STYPE_DEAUTH,
 					       req->reason_code, tx,
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 591ad02e1fa4..80a7edf8d314 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -2,6 +2,7 @@
 /*
 * Portions of this file
 * Copyright(c) 2016 Intel Deutschland GmbH
+* Copyright (C) 2018 Intel Corporation
 */
 
 #if !defined(__MAC80211_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ)
@@ -1413,11 +1414,29 @@ DEFINE_EVENT(release_evt, drv_allow_buffered_frames,
 	TP_ARGS(local, sta, tids, num_frames, reason, more_data)
 );
 
-DEFINE_EVENT(local_sdata_evt, drv_mgd_prepare_tx,
+TRACE_EVENT(drv_mgd_prepare_tx,
 	TP_PROTO(struct ieee80211_local *local,
-		 struct ieee80211_sub_if_data *sdata),
+		 struct ieee80211_sub_if_data *sdata,
+		 u16 duration),
 
-	TP_ARGS(local, sdata)
+	TP_ARGS(local, sdata, duration),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		VIF_ENTRY
+		__field(u32, duration)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		VIF_ASSIGN;
+		__entry->duration = duration;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT VIF_PR_FMT " duration: %u",
+		LOCAL_PR_ARG, VIF_PR_ARG, __entry->duration
+	)
 );
 
 DEFINE_EVENT(local_sdata_evt, drv_mgd_protect_tdls_discover,
-- 
cgit v1.2.3-59-g8ed1b


From d1e23c9456b2881b71732594f4cdb52d78aedecb Mon Sep 17 00:00:00 2001
From: Denis Kenzior <denkenz@gmail.com>
Date: Mon, 21 May 2018 10:31:13 -0500
Subject: nl80211: Optimize cfg80211_bss_expire invocations

Only invoke cfg80211_bss_expire on the first nl80211_dump_scan
invocation to avoid (likely) redundant processing.

Signed-off-by: Denis Kenzior <denkenz@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index a3dcea2fbd7a..451f12ecb894 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -8100,7 +8100,15 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb)
 
 	wdev_lock(wdev);
 	spin_lock_bh(&rdev->bss_lock);
-	cfg80211_bss_expire(rdev);
+
+	/*
+	 * dump_scan will be called multiple times to break up the scan results
+	 * into multiple messages.  It is unlikely that any more bss-es will be
+	 * expired after the first call, so only call only call this on the
+	 * first dump_scan invocation.
+	 */
+	if (start == 0)
+		cfg80211_bss_expire(rdev);
 
 	cb->seq = rdev->bss_generation;
 
-- 
cgit v1.2.3-59-g8ed1b


From 76804d28c32ec14e1fdb3981623e5b7a4bc1c739 Mon Sep 17 00:00:00 2001
From: Arend Van Spriel <arend.vanspriel@broadcom.com>
Date: Tue, 22 May 2018 10:19:06 +0200
Subject: cfg80211: use separate struct for FILS parameters

Put FILS related parameters into their own struct definition so
it can be reused for roam events in subsequent change.

Reviewed-by: Jithu Jance <jithu.jance@broadcom.com>
Reviewed-by: Eylon Pedinovsky <eylon.pedinovsky@broadcom.com>
Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 44 ++++++++++++++++++++++++++------------------
 net/wireless/nl80211.c | 22 +++++++++++-----------
 net/wireless/sme.c     | 40 +++++++++++++++++++++-------------------
 3 files changed, 58 insertions(+), 48 deletions(-)

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index df33c2766d22..0e4a2a04d55d 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5420,6 +5420,30 @@ static inline void cfg80211_testmode_event(struct sk_buff *skb, gfp_t gfp)
 #define CFG80211_TESTMODE_DUMP(cmd)
 #endif
 
+/**
+ * struct cfg80211_fils_resp_params - FILS connection response params
+ * @kek: KEK derived from a successful FILS connection (may be %NULL)
+ * @kek_len: Length of @fils_kek in octets
+ * @update_erp_next_seq_num: Boolean value to specify whether the value in
+ *	@erp_next_seq_num is valid.
+ * @erp_next_seq_num: The next sequence number to use in ERP message in
+ *	FILS Authentication. This value should be specified irrespective of the
+ *	status for a FILS connection.
+ * @pmk: A new PMK if derived from a successful FILS connection (may be %NULL).
+ * @pmk_len: Length of @pmk in octets
+ * @pmkid: A new PMKID if derived from a successful FILS connection or the PMKID
+ *	used for this FILS connection (may be %NULL).
+ */
+struct cfg80211_fils_resp_params {
+	const u8 *kek;
+	size_t kek_len;
+	bool update_erp_next_seq_num;
+	u16 erp_next_seq_num;
+	const u8 *pmk;
+	size_t pmk_len;
+	const u8 *pmkid;
+};
+
 /**
  * struct cfg80211_connect_resp_params - Connection response params
  * @status: Status code, %WLAN_STATUS_SUCCESS for successful connection, use
@@ -5438,17 +5462,7 @@ static inline void cfg80211_testmode_event(struct sk_buff *skb, gfp_t gfp)
  * @req_ie_len: Association request IEs length
  * @resp_ie: Association response IEs (may be %NULL)
  * @resp_ie_len: Association response IEs length
- * @fils_kek: KEK derived from a successful FILS connection (may be %NULL)
- * @fils_kek_len: Length of @fils_kek in octets
- * @update_erp_next_seq_num: Boolean value to specify whether the value in
- *	@fils_erp_next_seq_num is valid.
- * @fils_erp_next_seq_num: The next sequence number to use in ERP message in
- *	FILS Authentication. This value should be specified irrespective of the
- *	status for a FILS connection.
- * @pmk: A new PMK if derived from a successful FILS connection (may be %NULL).
- * @pmk_len: Length of @pmk in octets
- * @pmkid: A new PMKID if derived from a successful FILS connection or the PMKID
- *	used for this FILS connection (may be %NULL).
+ * @fils: FILS connection response parameters.
  * @timeout_reason: Reason for connection timeout. This is used when the
  *	connection fails due to a timeout instead of an explicit rejection from
  *	the AP. %NL80211_TIMEOUT_UNSPECIFIED is used when the timeout reason is
@@ -5464,13 +5478,7 @@ struct cfg80211_connect_resp_params {
 	size_t req_ie_len;
 	const u8 *resp_ie;
 	size_t resp_ie_len;
-	const u8 *fils_kek;
-	size_t fils_kek_len;
-	bool update_erp_next_seq_num;
-	u16 fils_erp_next_seq_num;
-	const u8 *pmk;
-	size_t pmk_len;
-	const u8 *pmkid;
+	struct cfg80211_fils_resp_params fils;
 	enum nl80211_timeout_reason timeout_reason;
 };
 
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 451f12ecb894..3ab443b13bb0 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -14206,8 +14206,8 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
 	void *hdr;
 
 	msg = nlmsg_new(100 + cr->req_ie_len + cr->resp_ie_len +
-			cr->fils_kek_len + cr->pmk_len +
-			(cr->pmkid ? WLAN_PMKID_LEN : 0), gfp);
+			cr->fils.kek_len + cr->fils.pmk_len +
+			(cr->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp);
 	if (!msg)
 		return;
 
@@ -14233,17 +14233,17 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
 	    (cr->resp_ie &&
 	     nla_put(msg, NL80211_ATTR_RESP_IE, cr->resp_ie_len,
 		     cr->resp_ie)) ||
-	    (cr->update_erp_next_seq_num &&
+	    (cr->fils.update_erp_next_seq_num &&
 	     nla_put_u16(msg, NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM,
-			 cr->fils_erp_next_seq_num)) ||
+			 cr->fils.erp_next_seq_num)) ||
 	    (cr->status == WLAN_STATUS_SUCCESS &&
-	     ((cr->fils_kek &&
-	       nla_put(msg, NL80211_ATTR_FILS_KEK, cr->fils_kek_len,
-		       cr->fils_kek)) ||
-	      (cr->pmk &&
-	       nla_put(msg, NL80211_ATTR_PMK, cr->pmk_len, cr->pmk)) ||
-	      (cr->pmkid &&
-	       nla_put(msg, NL80211_ATTR_PMKID, WLAN_PMKID_LEN, cr->pmkid)))))
+	     ((cr->fils.kek &&
+	       nla_put(msg, NL80211_ATTR_FILS_KEK, cr->fils.kek_len,
+		       cr->fils.kek)) ||
+	      (cr->fils.pmk &&
+	       nla_put(msg, NL80211_ATTR_PMK, cr->fils.pmk_len, cr->fils.pmk)) ||
+	      (cr->fils.pmkid &&
+	       nla_put(msg, NL80211_ATTR_PMKID, WLAN_PMKID_LEN, cr->fils.pmkid)))))
 		goto nla_put_failure;
 
 	genlmsg_end(msg, hdr);
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 5df6b33db786..73881fb7f86d 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -803,8 +803,8 @@ void cfg80211_connect_done(struct net_device *dev,
 
 	ev = kzalloc(sizeof(*ev) + (params->bssid ? ETH_ALEN : 0) +
 		     params->req_ie_len + params->resp_ie_len +
-		     params->fils_kek_len + params->pmk_len +
-		     (params->pmkid ? WLAN_PMKID_LEN : 0), gfp);
+		     params->fils.kek_len + params->fils.pmk_len +
+		     (params->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp);
 	if (!ev) {
 		cfg80211_put_bss(wdev->wiphy, params->bss);
 		return;
@@ -831,27 +831,29 @@ void cfg80211_connect_done(struct net_device *dev,
 		       params->resp_ie_len);
 		next += params->resp_ie_len;
 	}
-	if (params->fils_kek_len) {
-		ev->cr.fils_kek = next;
-		ev->cr.fils_kek_len = params->fils_kek_len;
-		memcpy((void *)ev->cr.fils_kek, params->fils_kek,
-		       params->fils_kek_len);
-		next += params->fils_kek_len;
+	if (params->fils.kek_len) {
+		ev->cr.fils.kek = next;
+		ev->cr.fils.kek_len = params->fils.kek_len;
+		memcpy((void *)ev->cr.fils.kek, params->fils.kek,
+		       params->fils.kek_len);
+		next += params->fils.kek_len;
 	}
-	if (params->pmk_len) {
-		ev->cr.pmk = next;
-		ev->cr.pmk_len = params->pmk_len;
-		memcpy((void *)ev->cr.pmk, params->pmk, params->pmk_len);
-		next += params->pmk_len;
+	if (params->fils.pmk_len) {
+		ev->cr.fils.pmk = next;
+		ev->cr.fils.pmk_len = params->fils.pmk_len;
+		memcpy((void *)ev->cr.fils.pmk, params->fils.pmk,
+		       params->fils.pmk_len);
+		next += params->fils.pmk_len;
 	}
-	if (params->pmkid) {
-		ev->cr.pmkid = next;
-		memcpy((void *)ev->cr.pmkid, params->pmkid, WLAN_PMKID_LEN);
+	if (params->fils.pmkid) {
+		ev->cr.fils.pmkid = next;
+		memcpy((void *)ev->cr.fils.pmkid, params->fils.pmkid,
+		       WLAN_PMKID_LEN);
 		next += WLAN_PMKID_LEN;
 	}
-	ev->cr.update_erp_next_seq_num = params->update_erp_next_seq_num;
-	if (params->update_erp_next_seq_num)
-		ev->cr.fils_erp_next_seq_num = params->fils_erp_next_seq_num;
+	ev->cr.fils.update_erp_next_seq_num = params->fils.update_erp_next_seq_num;
+	if (params->fils.update_erp_next_seq_num)
+		ev->cr.fils.erp_next_seq_num = params->fils.erp_next_seq_num;
 	if (params->bss)
 		cfg80211_hold_bss(bss_from_pub(params->bss));
 	ev->cr.bss = params->bss;
-- 
cgit v1.2.3-59-g8ed1b


From e841b7b11ebd0b359e07bb3d7caf15dca1a80b72 Mon Sep 17 00:00:00 2001
From: Arend Van Spriel <arend.vanspriel@broadcom.com>
Date: Tue, 22 May 2018 10:19:07 +0200
Subject: nl80211: add FILS related parameters to ROAM event

In case of FILS shared key offload the parameters can change
upon roaming of which user-space needs to be notified.

Reviewed-by: Jithu Jance <jithu.jance@broadcom.com>
Reviewed-by: Eylon Pedinovsky <eylon.pedinovsky@broadcom.com>
Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  2 ++
 include/uapi/linux/nl80211.h |  3 ++-
 net/wireless/nl80211.c       | 16 +++++++++++++--
 net/wireless/sme.c           | 48 +++++++++++++++++++++++++++++++++++++-------
 4 files changed, 59 insertions(+), 10 deletions(-)

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 0e4a2a04d55d..1e7c0df4fe33 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5624,6 +5624,7 @@ cfg80211_connect_timeout(struct net_device *dev, const u8 *bssid,
  * @req_ie_len: association request IEs length
  * @resp_ie: association response IEs (may be %NULL)
  * @resp_ie_len: assoc response IEs length
+ * @fils: FILS related roaming information.
  */
 struct cfg80211_roam_info {
 	struct ieee80211_channel *channel;
@@ -5633,6 +5634,7 @@ struct cfg80211_roam_info {
 	size_t req_ie_len;
 	const u8 *resp_ie;
 	size_t resp_ie_len;
+	struct cfg80211_fils_resp_params fils;
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 83ed1dd757e6..0a412335d56b 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -215,7 +215,8 @@
  * as specified in IETF RFC 6696.
  *
  * When FILS shared key authentication is completed, driver needs to provide the
- * below additional parameters to userspace.
+ * below additional parameters to userspace, which can be either after setting
+ * up a connection or after roaming.
  *	%NL80211_ATTR_FILS_KEK - used for key renewal
  *	%NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM - used in further EAP-RP exchanges
  *	%NL80211_ATTR_PMKID - used to identify the PMKSA used/generated
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 3ab443b13bb0..ae57f9712d7d 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -14264,7 +14264,9 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
 	void *hdr;
 	const u8 *bssid = info->bss ? info->bss->bssid : info->bssid;
 
-	msg = nlmsg_new(100 + info->req_ie_len + info->resp_ie_len, gfp);
+	msg = nlmsg_new(100 + info->req_ie_len + info->resp_ie_len +
+			info->fils.kek_len + info->fils.pmk_len +
+			(info->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp);
 	if (!msg)
 		return;
 
@@ -14282,7 +14284,17 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
 		     info->req_ie)) ||
 	    (info->resp_ie &&
 	     nla_put(msg, NL80211_ATTR_RESP_IE, info->resp_ie_len,
-		     info->resp_ie)))
+		     info->resp_ie)) ||
+	    (info->fils.update_erp_next_seq_num &&
+	     nla_put_u16(msg, NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM,
+			 info->fils.erp_next_seq_num)) ||
+	    (info->fils.kek &&
+	     nla_put(msg, NL80211_ATTR_FILS_KEK, info->fils.kek_len,
+		     info->fils.kek)) ||
+	    (info->fils.pmk &&
+	     nla_put(msg, NL80211_ATTR_PMK, info->fils.pmk_len, info->fils.pmk)) ||
+	    (info->fils.pmkid &&
+	     nla_put(msg, NL80211_ATTR_PMKID, WLAN_PMKID_LEN, info->fils.pmkid)))
 		goto nla_put_failure;
 
 	genlmsg_end(msg, hdr);
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 73881fb7f86d..d536b07582f8 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -932,6 +932,7 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info,
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
 	struct cfg80211_event *ev;
 	unsigned long flags;
+	u8 *next;
 
 	if (!info->bss) {
 		info->bss = cfg80211_get_bss(wdev->wiphy, info->channel,
@@ -944,19 +945,52 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info,
 	if (WARN_ON(!info->bss))
 		return;
 
-	ev = kzalloc(sizeof(*ev) + info->req_ie_len + info->resp_ie_len, gfp);
+	ev = kzalloc(sizeof(*ev) + info->req_ie_len + info->resp_ie_len +
+		     info->fils.kek_len + info->fils.pmk_len +
+		     (info->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp);
 	if (!ev) {
 		cfg80211_put_bss(wdev->wiphy, info->bss);
 		return;
 	}
 
 	ev->type = EVENT_ROAMED;
-	ev->rm.req_ie = ((u8 *)ev) + sizeof(*ev);
-	ev->rm.req_ie_len = info->req_ie_len;
-	memcpy((void *)ev->rm.req_ie, info->req_ie, info->req_ie_len);
-	ev->rm.resp_ie = ((u8 *)ev) + sizeof(*ev) + info->req_ie_len;
-	ev->rm.resp_ie_len = info->resp_ie_len;
-	memcpy((void *)ev->rm.resp_ie, info->resp_ie, info->resp_ie_len);
+	next = ((u8 *)ev) + sizeof(*ev);
+	if (info->req_ie_len) {
+		ev->rm.req_ie = next;
+		ev->rm.req_ie_len = info->req_ie_len;
+		memcpy((void *)ev->rm.req_ie, info->req_ie, info->req_ie_len);
+		next += info->req_ie_len;
+	}
+	if (info->resp_ie_len) {
+		ev->rm.resp_ie = next;
+		ev->rm.resp_ie_len = info->resp_ie_len;
+		memcpy((void *)ev->rm.resp_ie, info->resp_ie,
+		       info->resp_ie_len);
+		next += info->resp_ie_len;
+	}
+	if (info->fils.kek_len) {
+		ev->rm.fils.kek = next;
+		ev->rm.fils.kek_len = info->fils.kek_len;
+		memcpy((void *)ev->rm.fils.kek, info->fils.kek,
+		       info->fils.kek_len);
+		next += info->fils.kek_len;
+	}
+	if (info->fils.pmk_len) {
+		ev->rm.fils.pmk = next;
+		ev->rm.fils.pmk_len = info->fils.pmk_len;
+		memcpy((void *)ev->rm.fils.pmk, info->fils.pmk,
+		       info->fils.pmk_len);
+		next += info->fils.pmk_len;
+	}
+	if (info->fils.pmkid) {
+		ev->rm.fils.pmkid = next;
+		memcpy((void *)ev->rm.fils.pmkid, info->fils.pmkid,
+		       WLAN_PMKID_LEN);
+		next += WLAN_PMKID_LEN;
+	}
+	ev->rm.fils.update_erp_next_seq_num = info->fils.update_erp_next_seq_num;
+	if (info->fils.update_erp_next_seq_num)
+		ev->rm.fils.erp_next_seq_num = info->fils.erp_next_seq_num;
 	ev->rm.bss = info->bss;
 
 	spin_lock_irqsave(&wdev->event_lock, flags);
-- 
cgit v1.2.3-59-g8ed1b


From 7f9a3e150ec7d3596386449c15aefb59904a1266 Mon Sep 17 00:00:00 2001
From: Vidyullatha Kanchanapally <vidyullatha@codeaurora.org>
Date: Tue, 22 May 2018 10:19:08 +0200
Subject: nl80211: Update ERP info using NL80211_CMD_UPDATE_CONNECT_PARAMS

Use NL80211_CMD_UPDATE_CONNECT_PARAMS to update new ERP information,
Association IEs and the Authentication type to driver / firmware which
will be used in subsequent roamings.

Signed-off-by: Vidyullatha Kanchanapally <vidyullatha@codeaurora.org>
[arend: extended fils-sk kernel doc and added check in wiphy_register()]
Reviewed-by: Jithu Jance <jithu.jance@broadcom.com>
Reviewed-by: Eylon Pedinovsky <eylon.pedinovsky@broadcom.com>
Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  5 +++++
 include/uapi/linux/nl80211.h |  3 ++-
 net/wireless/core.c          |  4 ++++
 net/wireless/nl80211.c       | 52 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 1e7c0df4fe33..5fbfe61f41c6 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2225,9 +2225,14 @@ struct cfg80211_connect_params {
  * have to be updated as part of update_connect_params() call.
  *
  * @UPDATE_ASSOC_IES: Indicates whether association request IEs are updated
+ * @UPDATE_FILS_ERP_INFO: Indicates that FILS connection parameters (realm,
+ *	username, erp sequence number and rrk) are updated
+ * @UPDATE_AUTH_TYPE: Indicates that authentication type is updated
  */
 enum cfg80211_connect_params_changed {
 	UPDATE_ASSOC_IES		= BIT(0),
+	UPDATE_FILS_ERP_INFO		= BIT(1),
+	UPDATE_AUTH_TYPE		= BIT(2),
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 0a412335d56b..06f9af23156b 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -204,7 +204,8 @@
  * FILS shared key authentication offload should be able to construct the
  * authentication and association frames for FILS shared key authentication and
  * eventually do a key derivation as per IEEE 802.11ai. The below additional
- * parameters should be given to driver in %NL80211_CMD_CONNECT.
+ * parameters should be given to driver in %NL80211_CMD_CONNECT and/or in
+ * %NL80211_CMD_UPDATE_CONNECT_PARAMS.
  *	%NL80211_ATTR_FILS_ERP_USERNAME - used to construct keyname_nai
  *	%NL80211_ATTR_FILS_ERP_REALM - used to construct keyname_nai
  *	%NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM - used to construct erp message
diff --git a/net/wireless/core.c b/net/wireless/core.c
index c0fd8a85e7f7..5fe35aafdd9c 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -725,6 +725,10 @@ int wiphy_register(struct wiphy *wiphy)
 		    (!rdev->ops->set_pmk || !rdev->ops->del_pmk)))
 		return -EINVAL;
 
+	if (WARN_ON(!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_FW_ROAM) &&
+		    rdev->ops->update_connect_params))
+		return -EINVAL;
+
 	if (wiphy->addresses)
 		memcpy(wiphy->perm_addr, wiphy->addresses[0].addr, ETH_ALEN);
 
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index ae57f9712d7d..bdf73b24cc09 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -9429,6 +9429,8 @@ static int nl80211_update_connect_params(struct sk_buff *skb,
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct net_device *dev = info->user_ptr[1];
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	bool fils_sk_offload;
+	u32 auth_type;
 	u32 changed = 0;
 	int ret;
 
@@ -9443,6 +9445,56 @@ static int nl80211_update_connect_params(struct sk_buff *skb,
 		changed |= UPDATE_ASSOC_IES;
 	}
 
+	fils_sk_offload = wiphy_ext_feature_isset(&rdev->wiphy,
+						  NL80211_EXT_FEATURE_FILS_SK_OFFLOAD);
+
+	/*
+	 * when driver supports fils-sk offload all attributes must be
+	 * provided. So the else covers "fils-sk-not-all" and
+	 * "no-fils-sk-any".
+	 */
+	if (fils_sk_offload &&
+	    info->attrs[NL80211_ATTR_FILS_ERP_USERNAME] &&
+	    info->attrs[NL80211_ATTR_FILS_ERP_REALM] &&
+	    info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] &&
+	    info->attrs[NL80211_ATTR_FILS_ERP_RRK]) {
+		connect.fils_erp_username =
+			nla_data(info->attrs[NL80211_ATTR_FILS_ERP_USERNAME]);
+		connect.fils_erp_username_len =
+			nla_len(info->attrs[NL80211_ATTR_FILS_ERP_USERNAME]);
+		connect.fils_erp_realm =
+			nla_data(info->attrs[NL80211_ATTR_FILS_ERP_REALM]);
+		connect.fils_erp_realm_len =
+			nla_len(info->attrs[NL80211_ATTR_FILS_ERP_REALM]);
+		connect.fils_erp_next_seq_num =
+			nla_get_u16(
+			   info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM]);
+		connect.fils_erp_rrk =
+			nla_data(info->attrs[NL80211_ATTR_FILS_ERP_RRK]);
+		connect.fils_erp_rrk_len =
+			nla_len(info->attrs[NL80211_ATTR_FILS_ERP_RRK]);
+		changed |= UPDATE_FILS_ERP_INFO;
+	} else if (info->attrs[NL80211_ATTR_FILS_ERP_USERNAME] ||
+		   info->attrs[NL80211_ATTR_FILS_ERP_REALM] ||
+		   info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] ||
+		   info->attrs[NL80211_ATTR_FILS_ERP_RRK]) {
+		return -EINVAL;
+	}
+
+	if (info->attrs[NL80211_ATTR_AUTH_TYPE]) {
+		auth_type = nla_get_u32(info->attrs[NL80211_ATTR_AUTH_TYPE]);
+		if (!nl80211_valid_auth_type(rdev, auth_type,
+					     NL80211_CMD_CONNECT))
+			return -EINVAL;
+
+		if (auth_type == NL80211_AUTHTYPE_FILS_SK &&
+		    fils_sk_offload && !(changed & UPDATE_FILS_ERP_INFO))
+			return -EINVAL;
+
+		connect.auth_type = auth_type;
+		changed |= UPDATE_AUTH_TYPE;
+	}
+
 	wdev_lock(dev->ieee80211_ptr);
 	if (!wdev->current_bss)
 		ret = -ENOLINK;
-- 
cgit v1.2.3-59-g8ed1b


From d874cd74306694433b5b2c5ce32ddba48fbc360e Mon Sep 17 00:00:00 2001
From: João Paulo Rechi Vita <jprvita@gmail.com>
Date: Tue, 22 May 2018 14:29:31 -0700
Subject: rfkill: Rename rfkill_any_led_trigger* functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rename these functions to rfkill_global_led_trigger*, as they are going
to be extended to handle another global rfkill led trigger.

This commit does not change any functionality.

Signed-off-by: João Paulo Rechi Vita <jprvita@endlessm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/rfkill/core.c | 47 ++++++++++++++++++++++++-----------------------
 1 file changed, 24 insertions(+), 23 deletions(-)

diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 59d0eb960275..6d64d14f4b0a 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -178,9 +178,9 @@ static void rfkill_led_trigger_unregister(struct rfkill *rfkill)
 }
 
 static struct led_trigger rfkill_any_led_trigger;
-static struct work_struct rfkill_any_work;
+static struct work_struct rfkill_global_led_trigger_work;
 
-static void rfkill_any_led_trigger_worker(struct work_struct *work)
+static void rfkill_global_led_trigger_worker(struct work_struct *work)
 {
 	enum led_brightness brightness = LED_OFF;
 	struct rfkill *rfkill;
@@ -197,28 +197,29 @@ static void rfkill_any_led_trigger_worker(struct work_struct *work)
 	led_trigger_event(&rfkill_any_led_trigger, brightness);
 }
 
-static void rfkill_any_led_trigger_event(void)
+static void rfkill_global_led_trigger_event(void)
 {
-	schedule_work(&rfkill_any_work);
+	schedule_work(&rfkill_global_led_trigger_work);
 }
 
-static void rfkill_any_led_trigger_activate(struct led_classdev *led_cdev)
+static void rfkill_global_led_trigger_activate(struct led_classdev *led_cdev)
 {
-	rfkill_any_led_trigger_event();
+	rfkill_global_led_trigger_event();
 }
 
-static int rfkill_any_led_trigger_register(void)
+static int rfkill_global_led_trigger_register(void)
 {
-	INIT_WORK(&rfkill_any_work, rfkill_any_led_trigger_worker);
+	INIT_WORK(&rfkill_global_led_trigger_work,
+			rfkill_global_led_trigger_worker);
 	rfkill_any_led_trigger.name = "rfkill-any";
-	rfkill_any_led_trigger.activate = rfkill_any_led_trigger_activate;
+	rfkill_any_led_trigger.activate = rfkill_global_led_trigger_activate;
 	return led_trigger_register(&rfkill_any_led_trigger);
 }
 
-static void rfkill_any_led_trigger_unregister(void)
+static void rfkill_global_led_trigger_unregister(void)
 {
 	led_trigger_unregister(&rfkill_any_led_trigger);
-	cancel_work_sync(&rfkill_any_work);
+	cancel_work_sync(&rfkill_global_led_trigger_work);
 }
 #else
 static void rfkill_led_trigger_event(struct rfkill *rfkill)
@@ -234,16 +235,16 @@ static inline void rfkill_led_trigger_unregister(struct rfkill *rfkill)
 {
 }
 
-static void rfkill_any_led_trigger_event(void)
+static void rfkill_global_led_trigger_event(void)
 {
 }
 
-static int rfkill_any_led_trigger_register(void)
+static int rfkill_global_led_trigger_register(void)
 {
 	return 0;
 }
 
-static void rfkill_any_led_trigger_unregister(void)
+static void rfkill_global_led_trigger_unregister(void)
 {
 }
 #endif /* CONFIG_RFKILL_LEDS */
@@ -354,7 +355,7 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked)
 	spin_unlock_irqrestore(&rfkill->lock, flags);
 
 	rfkill_led_trigger_event(rfkill);
-	rfkill_any_led_trigger_event();
+	rfkill_global_led_trigger_event();
 
 	if (prev != curr)
 		rfkill_event(rfkill);
@@ -535,7 +536,7 @@ bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked)
 	spin_unlock_irqrestore(&rfkill->lock, flags);
 
 	rfkill_led_trigger_event(rfkill);
-	rfkill_any_led_trigger_event();
+	rfkill_global_led_trigger_event();
 
 	if (rfkill->registered && prev != blocked)
 		schedule_work(&rfkill->uevent_work);
@@ -579,7 +580,7 @@ bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked)
 		schedule_work(&rfkill->uevent_work);
 
 	rfkill_led_trigger_event(rfkill);
-	rfkill_any_led_trigger_event();
+	rfkill_global_led_trigger_event();
 
 	return blocked;
 }
@@ -629,7 +630,7 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw)
 			schedule_work(&rfkill->uevent_work);
 
 		rfkill_led_trigger_event(rfkill);
-		rfkill_any_led_trigger_event();
+		rfkill_global_led_trigger_event();
 	}
 }
 EXPORT_SYMBOL(rfkill_set_states);
@@ -1046,7 +1047,7 @@ int __must_check rfkill_register(struct rfkill *rfkill)
 #endif
 	}
 
-	rfkill_any_led_trigger_event();
+	rfkill_global_led_trigger_event();
 	rfkill_send_events(rfkill, RFKILL_OP_ADD);
 
 	mutex_unlock(&rfkill_global_mutex);
@@ -1079,7 +1080,7 @@ void rfkill_unregister(struct rfkill *rfkill)
 	mutex_lock(&rfkill_global_mutex);
 	rfkill_send_events(rfkill, RFKILL_OP_DEL);
 	list_del_init(&rfkill->node);
-	rfkill_any_led_trigger_event();
+	rfkill_global_led_trigger_event();
 	mutex_unlock(&rfkill_global_mutex);
 
 	rfkill_led_trigger_unregister(rfkill);
@@ -1332,7 +1333,7 @@ static int __init rfkill_init(void)
 	if (error)
 		goto error_misc;
 
-	error = rfkill_any_led_trigger_register();
+	error = rfkill_global_led_trigger_register();
 	if (error)
 		goto error_led_trigger;
 
@@ -1346,7 +1347,7 @@ static int __init rfkill_init(void)
 
 #ifdef CONFIG_RFKILL_INPUT
 error_input:
-	rfkill_any_led_trigger_unregister();
+	rfkill_global_led_trigger_unregister();
 #endif
 error_led_trigger:
 	misc_deregister(&rfkill_miscdev);
@@ -1362,7 +1363,7 @@ static void __exit rfkill_exit(void)
 #ifdef CONFIG_RFKILL_INPUT
 	rfkill_handler_exit();
 #endif
-	rfkill_any_led_trigger_unregister();
+	rfkill_global_led_trigger_unregister();
 	misc_deregister(&rfkill_miscdev);
 	class_unregister(&rfkill_class);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 232aa23ec354029a58280e79d815ee4746c0bbb2 Mon Sep 17 00:00:00 2001
From: João Paulo Rechi Vita <jprvita@gmail.com>
Date: Tue, 22 May 2018 14:29:32 -0700
Subject: rfkill: Create rfkill-none LED trigger
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Creates a new trigger rfkill-none, as a complement to rfkill-any, which
drives LEDs when any radio is enabled. The new trigger is meant to turn
a LED ON whenever all radios are OFF, and turn it OFF otherwise.

Signed-off-by: João Paulo Rechi Vita <jprvita@endlessm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/rfkill/core.c | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 6d64d14f4b0a..a7a4e6ff9be2 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -178,6 +178,7 @@ static void rfkill_led_trigger_unregister(struct rfkill *rfkill)
 }
 
 static struct led_trigger rfkill_any_led_trigger;
+static struct led_trigger rfkill_none_led_trigger;
 static struct work_struct rfkill_global_led_trigger_work;
 
 static void rfkill_global_led_trigger_worker(struct work_struct *work)
@@ -195,6 +196,8 @@ static void rfkill_global_led_trigger_worker(struct work_struct *work)
 	mutex_unlock(&rfkill_global_mutex);
 
 	led_trigger_event(&rfkill_any_led_trigger, brightness);
+	led_trigger_event(&rfkill_none_led_trigger,
+			  brightness == LED_OFF ? LED_FULL : LED_OFF);
 }
 
 static void rfkill_global_led_trigger_event(void)
@@ -202,22 +205,32 @@ static void rfkill_global_led_trigger_event(void)
 	schedule_work(&rfkill_global_led_trigger_work);
 }
 
-static void rfkill_global_led_trigger_activate(struct led_classdev *led_cdev)
-{
-	rfkill_global_led_trigger_event();
-}
-
 static int rfkill_global_led_trigger_register(void)
 {
+	int ret;
+
 	INIT_WORK(&rfkill_global_led_trigger_work,
 			rfkill_global_led_trigger_worker);
+
 	rfkill_any_led_trigger.name = "rfkill-any";
-	rfkill_any_led_trigger.activate = rfkill_global_led_trigger_activate;
-	return led_trigger_register(&rfkill_any_led_trigger);
+	ret = led_trigger_register(&rfkill_any_led_trigger);
+	if (ret)
+		return ret;
+
+	rfkill_none_led_trigger.name = "rfkill-none";
+	ret = led_trigger_register(&rfkill_none_led_trigger);
+	if (ret)
+		led_trigger_unregister(&rfkill_any_led_trigger);
+	else
+		/* Delay activation until all global triggers are registered */
+		rfkill_global_led_trigger_event();
+
+	return ret;
 }
 
 static void rfkill_global_led_trigger_unregister(void)
 {
+	led_trigger_unregister(&rfkill_none_led_trigger);
 	led_trigger_unregister(&rfkill_any_led_trigger);
 	cancel_work_sync(&rfkill_global_led_trigger_work);
 }
-- 
cgit v1.2.3-59-g8ed1b


From bad2929733635f80f99930b252757c70372356fe Mon Sep 17 00:00:00 2001
From: Andrew Zaborowski <andrew.zaborowski@intel.com>
Date: Tue, 22 May 2018 02:46:02 +0200
Subject: nl80211: Reject disconnect commands except from conn_owner

Reject NL80211_CMD_DISCONNECT, NL80211_CMD_DISASSOCIATE,
NL80211_CMD_DEAUTHENTICATE and NL80211_CMD_ASSOCIATE commands
from clients other than the connection owner set in the connect,
authenticate or associate commands, if it was set.

The main point of this check is to prevent chaos when two processes
try to use nl80211 at the same time, it's not a security measure.
The same thing should possibly be done for JOIN_IBSS/LEAVE_IBSS and
START_AP/STOP_AP.

Signed-off-by: Andrew Zaborowski <andrew.zaborowski@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index bdf73b24cc09..bc40a783cb27 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -8514,6 +8514,10 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 	const u8 *bssid, *ssid;
 	int err, ssid_len = 0;
 
+	if (dev->ieee80211_ptr->conn_owner_nlportid &&
+	    dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid)
+		return -EPERM;
+
 	if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
 		return -EINVAL;
 
@@ -8636,6 +8640,10 @@ static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info)
 	u16 reason_code;
 	bool local_state_change;
 
+	if (dev->ieee80211_ptr->conn_owner_nlportid &&
+	    dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid)
+		return -EPERM;
+
 	if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
 		return -EINVAL;
 
@@ -8683,6 +8691,10 @@ static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info)
 	u16 reason_code;
 	bool local_state_change;
 
+	if (dev->ieee80211_ptr->conn_owner_nlportid &&
+	    dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid)
+		return -EPERM;
+
 	if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE]))
 		return -EINVAL;
 
@@ -9512,6 +9524,10 @@ static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info)
 	u16 reason;
 	int ret;
 
+	if (dev->ieee80211_ptr->conn_owner_nlportid &&
+	    dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid)
+		return -EPERM;
+
 	if (!info->attrs[NL80211_ATTR_REASON_CODE])
 		reason = WLAN_REASON_DEAUTH_LEAVING;
 	else
-- 
cgit v1.2.3-59-g8ed1b


From dcab51f19b291d5ee23724c51b0a3a597c16451a Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Tue, 22 May 2018 15:03:31 -0700
Subject: bpf: Expose check_uarg_tail_zero()

This patch exposes check_uarg_tail_zero() which will
be reused by a later BTF patch.  Its name is changed to
bpf_check_uarg_tail_zero().

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h  |  2 ++
 kernel/bpf/syscall.c | 14 +++++++-------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ed0122b45b63..f6fe3c719ca8 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -463,6 +463,8 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
 int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value);
 
 int bpf_get_file_flag(int flags);
+int bpf_check_uarg_tail_zero(void __user *uaddr, size_t expected_size,
+			     size_t actual_size);
 
 /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
  * forced to use 'long' read/writes to try to atomically copy long counters.
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index bfcde949c7f8..2b29ef84ded3 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -65,9 +65,9 @@ static const struct bpf_map_ops * const bpf_map_types[] = {
  * copy_from_user() call. However, this is not a concern since this function is
  * meant to be a future-proofing of bits.
  */
-static int check_uarg_tail_zero(void __user *uaddr,
-				size_t expected_size,
-				size_t actual_size)
+int bpf_check_uarg_tail_zero(void __user *uaddr,
+			     size_t expected_size,
+			     size_t actual_size)
 {
 	unsigned char __user *addr;
 	unsigned char __user *end;
@@ -1899,7 +1899,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	u32 ulen;
 	int err;
 
-	err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+	err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
 	if (err)
 		return err;
 	info_len = min_t(u32, sizeof(info), info_len);
@@ -1998,7 +1998,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
 	u32 info_len = attr->info.info_len;
 	int err;
 
-	err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+	err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
 	if (err)
 		return err;
 	info_len = min_t(u32, sizeof(info), info_len);
@@ -2038,7 +2038,7 @@ static int bpf_btf_get_info_by_fd(struct btf *btf,
 	u32 info_len = attr->info.info_len;
 	int err;
 
-	err = check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len);
+	err = bpf_check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len);
 	if (err)
 		return err;
 
@@ -2110,7 +2110,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	err = check_uarg_tail_zero(uattr, sizeof(attr), size);
+	err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
 	if (err)
 		return err;
 	size = min_t(u32, size, sizeof(attr));
-- 
cgit v1.2.3-59-g8ed1b


From f80442a4cd1864154beaa060bb483a2c9f7d811f Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Tue, 22 May 2018 14:57:18 -0700
Subject: bpf: btf: Change how section is supported in btf_header

There are currently unused section descriptions in the btf_header.  Those
sections are here to support future BTF use cases.  For example, the
func section (func_off) is to support function signature (e.g. the BPF
prog function signature).

Instead of spelling out all potential sections up-front in the btf_header.
This patch makes changes to btf_header such that extending it (e.g. adding
a section) is possible later.  The unused ones can be removed for now and
they can be added back later.

This patch:
1. adds a hdr_len to the btf_header.  It will allow adding
sections (and other info like parent_label and parent_name)
later.  The check is similar to the existing bpf_attr.
If a user passes in a longer hdr_len, the kernel
ensures the extra tailing bytes are 0.

2. allows the section order in the BTF object to be
different from its sec_off order in btf_header.

3. each sec_off is followed by a sec_len.  It must not have gap or
overlapping among sections.

The string section is ensured to be at the end due to the 4 bytes
alignment requirement of the type section.

The above changes will allow enough flexibility to
add new sections (and other info) to the btf_header later.

This patch also removes an unnecessary !err check
at the end of btf_parse().

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/btf.h |   8 +-
 kernel/bpf/btf.c         | 209 +++++++++++++++++++++++++++++++++++------------
 2 files changed, 160 insertions(+), 57 deletions(-)

diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h
index bcb56ee47014..4fa479741a02 100644
--- a/include/uapi/linux/btf.h
+++ b/include/uapi/linux/btf.h
@@ -12,15 +12,11 @@ struct btf_header {
 	__u16	magic;
 	__u8	version;
 	__u8	flags;
-
-	__u32	parent_label;
-	__u32	parent_name;
+	__u32	hdr_len;
 
 	/* All offsets are in bytes relative to the end of this header */
-	__u32	label_off;	/* offset of label section	*/
-	__u32	object_off;	/* offset of data object section*/
-	__u32	func_off;	/* offset of function section	*/
 	__u32	type_off;	/* offset of type section	*/
+	__u32	type_len;	/* length of type section	*/
 	__u32	str_off;	/* offset of string section	*/
 	__u32	str_len;	/* length of string section	*/
 };
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index ded10ab47b8a..75da6cbae47d 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -12,6 +12,7 @@
 #include <linux/uaccess.h>
 #include <linux/kernel.h>
 #include <linux/idr.h>
+#include <linux/sort.h>
 #include <linux/bpf_verifier.h>
 #include <linux/btf.h>
 
@@ -184,15 +185,13 @@ static DEFINE_IDR(btf_idr);
 static DEFINE_SPINLOCK(btf_idr_lock);
 
 struct btf {
-	union {
-		struct btf_header *hdr;
-		void *data;
-	};
+	void *data;
 	struct btf_type **types;
 	u32 *resolved_ids;
 	u32 *resolved_sizes;
 	const char *strings;
 	void *nohdr_data;
+	struct btf_header hdr;
 	u32 nr_types;
 	u32 types_size;
 	u32 data_size;
@@ -228,6 +227,11 @@ enum resolve_mode {
 
 #define MAX_RESOLVE_DEPTH 32
 
+struct btf_sec_info {
+	u32 off;
+	u32 len;
+};
+
 struct btf_verifier_env {
 	struct btf *btf;
 	u8 *visit_states;
@@ -418,14 +422,14 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
 static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
 {
 	return !BTF_STR_TBL_ELF_ID(offset) &&
-		BTF_STR_OFFSET(offset) < btf->hdr->str_len;
+		BTF_STR_OFFSET(offset) < btf->hdr.str_len;
 }
 
 static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
 {
 	if (!BTF_STR_OFFSET(offset))
 		return "(anon)";
-	else if (BTF_STR_OFFSET(offset) < btf->hdr->str_len)
+	else if (BTF_STR_OFFSET(offset) < btf->hdr.str_len)
 		return &btf->strings[BTF_STR_OFFSET(offset)];
 	else
 		return "(invalid-name-offset)";
@@ -536,7 +540,8 @@ static void btf_verifier_log_member(struct btf_verifier_env *env,
 	__btf_verifier_log(log, "\n");
 }
 
-static void btf_verifier_log_hdr(struct btf_verifier_env *env)
+static void btf_verifier_log_hdr(struct btf_verifier_env *env,
+				 u32 btf_data_size)
 {
 	struct bpf_verifier_log *log = &env->log;
 	const struct btf *btf = env->btf;
@@ -545,19 +550,16 @@ static void btf_verifier_log_hdr(struct btf_verifier_env *env)
 	if (!bpf_verifier_log_needed(log))
 		return;
 
-	hdr = btf->hdr;
+	hdr = &btf->hdr;
 	__btf_verifier_log(log, "magic: 0x%x\n", hdr->magic);
 	__btf_verifier_log(log, "version: %u\n", hdr->version);
 	__btf_verifier_log(log, "flags: 0x%x\n", hdr->flags);
-	__btf_verifier_log(log, "parent_label: %u\n", hdr->parent_label);
-	__btf_verifier_log(log, "parent_name: %u\n", hdr->parent_name);
-	__btf_verifier_log(log, "label_off: %u\n", hdr->label_off);
-	__btf_verifier_log(log, "object_off: %u\n", hdr->object_off);
-	__btf_verifier_log(log, "func_off: %u\n", hdr->func_off);
+	__btf_verifier_log(log, "hdr_len: %u\n", hdr->hdr_len);
 	__btf_verifier_log(log, "type_off: %u\n", hdr->type_off);
+	__btf_verifier_log(log, "type_len: %u\n", hdr->type_len);
 	__btf_verifier_log(log, "str_off: %u\n", hdr->str_off);
 	__btf_verifier_log(log, "str_len: %u\n", hdr->str_len);
-	__btf_verifier_log(log, "btf_total_size: %u\n", btf->data_size);
+	__btf_verifier_log(log, "btf_total_size: %u\n", btf_data_size);
 }
 
 static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
@@ -1754,9 +1756,9 @@ static int btf_check_all_metas(struct btf_verifier_env *env)
 	struct btf_header *hdr;
 	void *cur, *end;
 
-	hdr = btf->hdr;
+	hdr = &btf->hdr;
 	cur = btf->nohdr_data + hdr->type_off;
-	end = btf->nohdr_data + hdr->str_off;
+	end = btf->nohdr_data + hdr->type_len;
 
 	env->log_type_id = 1;
 	while (cur < end) {
@@ -1866,8 +1868,20 @@ static int btf_check_all_types(struct btf_verifier_env *env)
 
 static int btf_parse_type_sec(struct btf_verifier_env *env)
 {
+	const struct btf_header *hdr = &env->btf->hdr;
 	int err;
 
+	/* Type section must align to 4 bytes */
+	if (hdr->type_off & (sizeof(u32) - 1)) {
+		btf_verifier_log(env, "Unaligned type_off");
+		return -EINVAL;
+	}
+
+	if (!hdr->type_len) {
+		btf_verifier_log(env, "No type found");
+		return -EINVAL;
+	}
+
 	err = btf_check_all_metas(env);
 	if (err)
 		return err;
@@ -1881,10 +1895,15 @@ static int btf_parse_str_sec(struct btf_verifier_env *env)
 	struct btf *btf = env->btf;
 	const char *start, *end;
 
-	hdr = btf->hdr;
+	hdr = &btf->hdr;
 	start = btf->nohdr_data + hdr->str_off;
 	end = start + hdr->str_len;
 
+	if (end != btf->data + btf->data_size) {
+		btf_verifier_log(env, "String section is not at the end");
+		return -EINVAL;
+	}
+
 	if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET ||
 	    start[0] || end[-1]) {
 		btf_verifier_log(env, "Invalid string section");
@@ -1896,20 +1915,122 @@ static int btf_parse_str_sec(struct btf_verifier_env *env)
 	return 0;
 }
 
-static int btf_parse_hdr(struct btf_verifier_env *env)
+static const size_t btf_sec_info_offset[] = {
+	offsetof(struct btf_header, type_off),
+	offsetof(struct btf_header, str_off),
+};
+
+static int btf_sec_info_cmp(const void *a, const void *b)
+{
+	const struct btf_sec_info *x = a;
+	const struct btf_sec_info *y = b;
+
+	return (int)(x->off - y->off) ? : (int)(x->len - y->len);
+}
+
+static int btf_check_sec_info(struct btf_verifier_env *env,
+			      u32 btf_data_size)
 {
+	const unsigned int nr_secs = ARRAY_SIZE(btf_sec_info_offset);
+	struct btf_sec_info secs[nr_secs];
+	u32 total, expected_total, i;
 	const struct btf_header *hdr;
-	struct btf *btf = env->btf;
-	u32 meta_left;
+	const struct btf *btf;
+
+	btf = env->btf;
+	hdr = &btf->hdr;
 
-	if (btf->data_size < sizeof(*hdr)) {
+	/* Populate the secs from hdr */
+	for (i = 0; i < nr_secs; i++)
+		secs[i] = *(struct btf_sec_info *)((void *)hdr +
+						   btf_sec_info_offset[i]);
+
+	sort(secs, nr_secs, sizeof(struct btf_sec_info),
+	     btf_sec_info_cmp, NULL);
+
+	/* Check for gaps and overlap among sections */
+	total = 0;
+	expected_total = btf_data_size - hdr->hdr_len;
+	for (i = 0; i < nr_secs; i++) {
+		if (expected_total < secs[i].off) {
+			btf_verifier_log(env, "Invalid section offset");
+			return -EINVAL;
+		}
+		if (total < secs[i].off) {
+			/* gap */
+			btf_verifier_log(env, "Unsupported section found");
+			return -EINVAL;
+		}
+		if (total > secs[i].off) {
+			btf_verifier_log(env, "Section overlap found");
+			return -EINVAL;
+		}
+		if (expected_total - total < secs[i].len) {
+			btf_verifier_log(env,
+					 "Total section length too long");
+			return -EINVAL;
+		}
+		total += secs[i].len;
+	}
+
+	/* There is data other than hdr and known sections */
+	if (expected_total != total) {
+		btf_verifier_log(env, "Unsupported section found");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int btf_parse_hdr(struct btf_verifier_env *env, void __user *btf_data,
+			 u32 btf_data_size)
+{
+	const struct btf_header *hdr;
+	u32 hdr_len, hdr_copy;
+	/*
+	 * Minimal part of the "struct btf_header" that
+	 * contains the hdr_len.
+	 */
+	struct btf_min_header {
+		u16	magic;
+		u8	version;
+		u8	flags;
+		u32	hdr_len;
+	} __user *min_hdr;
+	struct btf *btf;
+	int err;
+
+	btf = env->btf;
+	min_hdr = btf_data;
+
+	if (btf_data_size < sizeof(*min_hdr)) {
+		btf_verifier_log(env, "hdr_len not found");
+		return -EINVAL;
+	}
+
+	if (get_user(hdr_len, &min_hdr->hdr_len))
+		return -EFAULT;
+
+	if (btf_data_size < hdr_len) {
 		btf_verifier_log(env, "btf_header not found");
 		return -EINVAL;
 	}
 
-	btf_verifier_log_hdr(env);
+	err = bpf_check_uarg_tail_zero(btf_data, sizeof(btf->hdr), hdr_len);
+	if (err) {
+		if (err == -E2BIG)
+			btf_verifier_log(env, "Unsupported btf_header");
+		return err;
+	}
+
+	hdr_copy = min_t(u32, hdr_len, sizeof(btf->hdr));
+	if (copy_from_user(&btf->hdr, btf_data, hdr_copy))
+		return -EFAULT;
+
+	hdr = &btf->hdr;
+
+	btf_verifier_log_hdr(env, btf_data_size);
 
-	hdr = btf->hdr;
 	if (hdr->magic != BTF_MAGIC) {
 		btf_verifier_log(env, "Invalid magic");
 		return -EINVAL;
@@ -1925,26 +2046,14 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
 		return -ENOTSUPP;
 	}
 
-	meta_left = btf->data_size - sizeof(*hdr);
-	if (!meta_left) {
+	if (btf_data_size == hdr->hdr_len) {
 		btf_verifier_log(env, "No data");
 		return -EINVAL;
 	}
 
-	if (meta_left < hdr->type_off || hdr->str_off <= hdr->type_off ||
-	    /* Type section must align to 4 bytes */
-	    hdr->type_off & (sizeof(u32) - 1)) {
-		btf_verifier_log(env, "Invalid type_off");
-		return -EINVAL;
-	}
-
-	if (meta_left < hdr->str_off ||
-	    meta_left - hdr->str_off < hdr->str_len) {
-		btf_verifier_log(env, "Invalid str_off or str_len");
-		return -EINVAL;
-	}
-
-	btf->nohdr_data = btf->hdr + 1;
+	err = btf_check_sec_info(env, btf_data_size);
+	if (err)
+		return err;
 
 	return 0;
 }
@@ -1987,6 +2096,11 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
 		err = -ENOMEM;
 		goto errout;
 	}
+	env->btf = btf;
+
+	err = btf_parse_hdr(env, btf_data, btf_data_size);
+	if (err)
+		goto errout;
 
 	data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN);
 	if (!data) {
@@ -1996,18 +2110,13 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
 
 	btf->data = data;
 	btf->data_size = btf_data_size;
+	btf->nohdr_data = btf->data + btf->hdr.hdr_len;
 
 	if (copy_from_user(data, btf_data, btf_data_size)) {
 		err = -EFAULT;
 		goto errout;
 	}
 
-	env->btf = btf;
-
-	err = btf_parse_hdr(env);
-	if (err)
-		goto errout;
-
 	err = btf_parse_str_sec(env);
 	if (err)
 		goto errout;
@@ -2016,16 +2125,14 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
 	if (err)
 		goto errout;
 
-	if (!err && log->level && bpf_verifier_log_full(log)) {
+	if (log->level && bpf_verifier_log_full(log)) {
 		err = -ENOSPC;
 		goto errout;
 	}
 
-	if (!err) {
-		btf_verifier_env_free(env);
-		refcount_set(&btf->refcnt, 1);
-		return btf;
-	}
+	btf_verifier_env_free(env);
+	refcount_set(&btf->refcnt, 1);
+	return btf;
 
 errout:
 	btf_verifier_env_free(env);
-- 
cgit v1.2.3-59-g8ed1b


From 4ef5f5741e340d625e668f78d44eb8c9a6a4a26e Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Tue, 22 May 2018 14:57:19 -0700
Subject: bpf: btf: Check array->index_type

Instead of ingoring the array->index_type field.  Enforce that
it must be a BTF_KIND_INT in size 1/2/4/8 bytes.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/btf.c | 80 +++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 56 insertions(+), 24 deletions(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 75da6cbae47d..e388a6598de2 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -443,6 +443,28 @@ static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
 	return btf->types[type_id];
 }
 
+/*
+ * Regular int is not a bit field and it must be either
+ * u8/u16/u32/u64.
+ */
+static bool btf_type_int_is_regular(const struct btf_type *t)
+{
+	u16 nr_bits, nr_bytes;
+	u32 int_data;
+
+	int_data = btf_type_int(t);
+	nr_bits = BTF_INT_BITS(int_data);
+	nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
+	if (BITS_PER_BYTE_MASKED(nr_bits) ||
+	    BTF_INT_OFFSET(int_data) ||
+	    (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) &&
+	     nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) {
+		return false;
+	}
+
+	return true;
+}
+
 __printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log,
 					      const char *fmt, ...)
 {
@@ -1308,14 +1330,16 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
-	/* We are a little forgiving on array->index_type since
-	 * the kernel is not using it.
-	 */
-	/* Array elem cannot be in type void,
-	 * so !array->type is not allowed.
+	/* Array elem type and index type cannot be in type void,
+	 * so !array->type and !array->index_type are not allowed.
 	 */
 	if (!array->type || BTF_TYPE_PARENT(array->type)) {
-		btf_verifier_log_type(env, t, "Invalid type_id");
+		btf_verifier_log_type(env, t, "Invalid elem");
+		return -EINVAL;
+	}
+
+	if (!array->index_type || BTF_TYPE_PARENT(array->index_type)) {
+		btf_verifier_log_type(env, t, "Invalid index");
 		return -EINVAL;
 	}
 
@@ -1328,11 +1352,32 @@ static int btf_array_resolve(struct btf_verifier_env *env,
 			     const struct resolve_vertex *v)
 {
 	const struct btf_array *array = btf_type_array(v->t);
-	const struct btf_type *elem_type;
-	u32 elem_type_id = array->type;
+	const struct btf_type *elem_type, *index_type;
+	u32 elem_type_id, index_type_id;
 	struct btf *btf = env->btf;
 	u32 elem_size;
 
+	/* Check array->index_type */
+	index_type_id = array->index_type;
+	index_type = btf_type_by_id(btf, index_type_id);
+	if (btf_type_is_void_or_null(index_type)) {
+		btf_verifier_log_type(env, v->t, "Invalid index");
+		return -EINVAL;
+	}
+
+	if (!env_type_is_resolve_sink(env, index_type) &&
+	    !env_type_is_resolved(env, index_type_id))
+		return env_stack_push(env, index_type, index_type_id);
+
+	index_type = btf_type_id_size(btf, &index_type_id, NULL);
+	if (!index_type || !btf_type_is_int(index_type) ||
+	    !btf_type_int_is_regular(index_type)) {
+		btf_verifier_log_type(env, v->t, "Invalid index");
+		return -EINVAL;
+	}
+
+	/* Check array->type */
+	elem_type_id = array->type;
 	elem_type = btf_type_by_id(btf, elem_type_id);
 	if (btf_type_is_void_or_null(elem_type)) {
 		btf_verifier_log_type(env, v->t,
@@ -1350,22 +1395,9 @@ static int btf_array_resolve(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
-	if (btf_type_is_int(elem_type)) {
-		int int_type_data = btf_type_int(elem_type);
-		u16 nr_bits = BTF_INT_BITS(int_type_data);
-		u16 nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
-
-		/* Put more restriction on array of int.  The int cannot
-		 * be a bit field and it must be either u8/u16/u32/u64.
-		 */
-		if (BITS_PER_BYTE_MASKED(nr_bits) ||
-		    BTF_INT_OFFSET(int_type_data) ||
-		    (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) &&
-		     nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) {
-			btf_verifier_log_type(env, v->t,
-					      "Invalid array of int");
-			return -EINVAL;
-		}
+	if (btf_type_is_int(elem_type) && !btf_type_int_is_regular(elem_type)) {
+		btf_verifier_log_type(env, v->t, "Invalid array of int");
+		return -EINVAL;
 	}
 
 	if (array->nelems && elem_size > U32_MAX / array->nelems) {
-- 
cgit v1.2.3-59-g8ed1b


From aea2f7b8911617d7a8c23fb73d69e78764f91b58 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Tue, 22 May 2018 14:57:20 -0700
Subject: bpf: btf: Remove unused bits from uapi/linux/btf.h

This patch does the followings:
1. Limit BTF_MAX_TYPES and BTF_MAX_NAME_OFFSET to 64k.  We can
   raise it later.

2. Remove the BTF_TYPE_PARENT and BTF_STR_TBL_ELF_ID.  They are
   currently encoded at the highest bit of a u32.
   It is because the current use case does not require supporting
   parent type (i.e type_id referring to a type in another BTF file).
   It also does not support referring to a string in ELF.

   The BTF_TYPE_PARENT and BTF_STR_TBL_ELF_ID checks are replaced
   by BTF_TYPE_ID_CHECK and BTF_STR_OFFSET_CHECK which are
   defined in btf.c instead of uapi/linux/btf.h.

3. Limit the BTF_INFO_KIND from 5 bits to 4 bits which is enough.
   There is unused bits headroom if we ever needed it later.

4. The root bit in BTF_INFO is also removed because it is not
   used in the current use case.

5. Remove BTF_INT_VARARGS since func type is not supported now.
   The BTF_INT_ENCODING is limited to 4 bits instead of 8 bits.

The above can be added back later because the verifier
ensures the unused bits are zeros.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/btf.h | 29 +++++++++------------------
 kernel/bpf/btf.c         | 52 ++++++++++++++++++++++++++++++++----------------
 2 files changed, 44 insertions(+), 37 deletions(-)

diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h
index 4fa479741a02..0b5ddbe135a4 100644
--- a/include/uapi/linux/btf.h
+++ b/include/uapi/linux/btf.h
@@ -22,28 +22,19 @@ struct btf_header {
 };
 
 /* Max # of type identifier */
-#define BTF_MAX_TYPE	0x7fffffff
+#define BTF_MAX_TYPE	0x0000ffff
 /* Max offset into the string section */
-#define BTF_MAX_NAME_OFFSET	0x7fffffff
+#define BTF_MAX_NAME_OFFSET	0x0000ffff
 /* Max # of struct/union/enum members or func args */
 #define BTF_MAX_VLEN	0xffff
 
-/* The type id is referring to a parent BTF */
-#define BTF_TYPE_PARENT(id)	(((id) >> 31) & 0x1)
-#define BTF_TYPE_ID(id)		((id) & BTF_MAX_TYPE)
-
-/* String is in the ELF string section */
-#define BTF_STR_TBL_ELF_ID(ref)	(((ref) >> 31) & 0x1)
-#define BTF_STR_OFFSET(ref)	((ref) & BTF_MAX_NAME_OFFSET)
-
 struct btf_type {
 	__u32 name_off;
 	/* "info" bits arrangement
 	 * bits  0-15: vlen (e.g. # of struct's members)
 	 * bits 16-23: unused
-	 * bits 24-28: kind (e.g. int, ptr, array...etc)
-	 * bits 29-30: unused
-	 * bits    31: root
+	 * bits 24-27: kind (e.g. int, ptr, array...etc)
+	 * bits 28-31: unused
 	 */
 	__u32 info;
 	/* "size" is used by INT, ENUM, STRUCT and UNION.
@@ -58,8 +49,7 @@ struct btf_type {
 	};
 };
 
-#define BTF_INFO_KIND(info)	(((info) >> 24) & 0x1f)
-#define BTF_INFO_ISROOT(info)	(!!(((info) >> 24) & 0x80))
+#define BTF_INFO_KIND(info)	(((info) >> 24) & 0x0f)
 #define BTF_INFO_VLEN(info)	((info) & 0xffff)
 
 #define BTF_KIND_UNKN		0	/* Unknown	*/
@@ -84,15 +74,14 @@ struct btf_type {
 /* BTF_KIND_INT is followed by a u32 and the following
  * is the 32 bits arrangement:
  */
-#define BTF_INT_ENCODING(VAL)	(((VAL) & 0xff000000) >> 24)
+#define BTF_INT_ENCODING(VAL)	(((VAL) & 0x0f000000) >> 24)
 #define BTF_INT_OFFSET(VAL)	(((VAL  & 0x00ff0000)) >> 16)
 #define BTF_INT_BITS(VAL)	((VAL)  & 0x0000ffff)
 
 /* Attributes stored in the BTF_INT_ENCODING */
-#define BTF_INT_SIGNED	0x1
-#define BTF_INT_CHAR	0x2
-#define BTF_INT_BOOL	0x4
-#define BTF_INT_VARARGS	0x8
+#define BTF_INT_SIGNED	(1 << 0)
+#define BTF_INT_CHAR	(1 << 1)
+#define BTF_INT_BOOL	(1 << 2)
 
 /* BTF_KIND_ENUM is followed by multiple "struct btf_enum".
  * The exact number of btf_enum is stored in the vlen (of the
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index e388a6598de2..9cbeabb5aca3 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -163,13 +163,16 @@
 #define BITS_ROUNDUP_BYTES(bits) \
 	(BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits))
 
+#define BTF_INFO_MASK 0x0f00ffff
+#define BTF_INT_MASK 0x0fffffff
+#define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE)
+#define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET)
+
 /* 16MB for 64k structs and each has 16 members and
  * a few MB spaces for the string section.
  * The hard limit is S32_MAX.
  */
 #define BTF_MAX_SIZE (16 * 1024 * 1024)
-/* 64k. We can raise it later. The hard limit is S32_MAX. */
-#define BTF_MAX_NR_TYPES 65535
 
 #define for_each_member(i, struct_type, member)			\
 	for (i = 0, member = btf_type_member(struct_type);	\
@@ -383,8 +386,6 @@ static const char *btf_int_encoding_str(u8 encoding)
 		return "CHAR";
 	else if (encoding == BTF_INT_BOOL)
 		return "BOOL";
-	else if (encoding == BTF_INT_VARARGS)
-		return "VARARGS";
 	else
 		return "UNKN";
 }
@@ -421,16 +422,16 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
 
 static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
 {
-	return !BTF_STR_TBL_ELF_ID(offset) &&
-		BTF_STR_OFFSET(offset) < btf->hdr.str_len;
+	return BTF_STR_OFFSET_VALID(offset) &&
+		offset < btf->hdr.str_len;
 }
 
 static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
 {
-	if (!BTF_STR_OFFSET(offset))
+	if (!offset)
 		return "(anon)";
-	else if (BTF_STR_OFFSET(offset) < btf->hdr.str_len)
-		return &btf->strings[BTF_STR_OFFSET(offset)];
+	else if (offset < btf->hdr.str_len)
+		return &btf->strings[offset];
 	else
 		return "(invalid-name-offset)";
 }
@@ -598,13 +599,13 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
 		struct btf_type **new_types;
 		u32 expand_by, new_size;
 
-		if (btf->types_size == BTF_MAX_NR_TYPES) {
+		if (btf->types_size == BTF_MAX_TYPE) {
 			btf_verifier_log(env, "Exceeded max num of types");
 			return -E2BIG;
 		}
 
 		expand_by = max_t(u32, btf->types_size >> 2, 16);
-		new_size = min_t(u32, BTF_MAX_NR_TYPES,
+		new_size = min_t(u32, BTF_MAX_TYPE,
 				 btf->types_size + expand_by);
 
 		new_types = kvzalloc(new_size * sizeof(*new_types),
@@ -934,6 +935,12 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env,
 	}
 
 	int_data = btf_type_int(t);
+	if (int_data & ~BTF_INT_MASK) {
+		btf_verifier_log_basic(env, t, "Invalid int_data:%x",
+				       int_data);
+		return -EINVAL;
+	}
+
 	nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data);
 
 	if (nr_bits > BITS_PER_U64) {
@@ -947,12 +954,17 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
+	/*
+	 * Only one of the encoding bits is allowed and it
+	 * should be sufficient for the pretty print purpose (i.e. decoding).
+	 * Multiple bits can be allowed later if it is found
+	 * to be insufficient.
+	 */
 	encoding = BTF_INT_ENCODING(int_data);
 	if (encoding &&
 	    encoding != BTF_INT_SIGNED &&
 	    encoding != BTF_INT_CHAR &&
-	    encoding != BTF_INT_BOOL &&
-	    encoding != BTF_INT_VARARGS) {
+	    encoding != BTF_INT_BOOL) {
 		btf_verifier_log_type(env, t, "Unsupported encoding");
 		return -ENOTSUPP;
 	}
@@ -1126,7 +1138,7 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
-	if (BTF_TYPE_PARENT(t->type)) {
+	if (!BTF_TYPE_ID_VALID(t->type)) {
 		btf_verifier_log_type(env, t, "Invalid type_id");
 		return -EINVAL;
 	}
@@ -1333,12 +1345,12 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env,
 	/* Array elem type and index type cannot be in type void,
 	 * so !array->type and !array->index_type are not allowed.
 	 */
-	if (!array->type || BTF_TYPE_PARENT(array->type)) {
+	if (!array->type || !BTF_TYPE_ID_VALID(array->type)) {
 		btf_verifier_log_type(env, t, "Invalid elem");
 		return -EINVAL;
 	}
 
-	if (!array->index_type || BTF_TYPE_PARENT(array->index_type)) {
+	if (!array->index_type || !BTF_TYPE_ID_VALID(array->index_type)) {
 		btf_verifier_log_type(env, t, "Invalid index");
 		return -EINVAL;
 	}
@@ -1507,7 +1519,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
 		}
 
 		/* A member cannot be in type void */
-		if (!member->type || BTF_TYPE_PARENT(member->type)) {
+		if (!member->type || !BTF_TYPE_ID_VALID(member->type)) {
 			btf_verifier_log_member(env, t, member,
 						"Invalid type_id");
 			return -EINVAL;
@@ -1760,6 +1772,12 @@ static s32 btf_check_meta(struct btf_verifier_env *env,
 	}
 	meta_left -= sizeof(*t);
 
+	if (t->info & ~BTF_INFO_MASK) {
+		btf_verifier_log(env, "[%u] Invalid btf_info:%x",
+				 env->log_type_id, t->info);
+		return -EINVAL;
+	}
+
 	if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX ||
 	    BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) {
 		btf_verifier_log(env, "[%u] Invalid kind:%u",
-- 
cgit v1.2.3-59-g8ed1b


From 9b2cf328b2eccf761537a06bef914d2a0700fba7 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Tue, 22 May 2018 14:57:21 -0700
Subject: bpf: btf: Rename btf_key_id and btf_value_id in bpf_map_info

In "struct bpf_map_info", the name "btf_id", "btf_key_id" and "btf_value_id"
could cause confusion because the "id" of "btf_id" means the BPF obj id
given to the BTF object while
"btf_key_id" and "btf_value_id" means the BTF type id within
that BTF object.

To make it clear, btf_key_id and btf_value_id are
renamed to btf_key_type_id and btf_value_type_id.

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h      |  4 ++--
 include/uapi/linux/bpf.h |  8 ++++----
 kernel/bpf/arraymap.c    |  2 +-
 kernel/bpf/syscall.c     | 18 +++++++++---------
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f6fe3c719ca8..1795eeee846c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -69,8 +69,8 @@ struct bpf_map {
 	u32 pages;
 	u32 id;
 	int numa_node;
-	u32 btf_key_id;
-	u32 btf_value_id;
+	u32 btf_key_type_id;
+	u32 btf_value_type_id;
 	struct btf *btf;
 	bool unpriv_array;
 	/* 55 bytes hole */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 97446bbe2ca5..c3e502d06bc3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -284,8 +284,8 @@ union bpf_attr {
 		char	map_name[BPF_OBJ_NAME_LEN];
 		__u32	map_ifindex;	/* ifindex of netdev to create on */
 		__u32	btf_fd;		/* fd pointing to a BTF type data */
-		__u32	btf_key_id;	/* BTF type_id of the key */
-		__u32	btf_value_id;	/* BTF type_id of the value */
+		__u32	btf_key_type_id;	/* BTF type_id of the key */
+		__u32	btf_value_type_id;	/* BTF type_id of the value */
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -2219,8 +2219,8 @@ struct bpf_map_info {
 	__u64 netns_dev;
 	__u64 netns_ino;
 	__u32 btf_id;
-	__u32 btf_key_id;
-	__u32 btf_value_id;
+	__u32 btf_key_type_id;
+	__u32 btf_value_type_id;
 } __attribute__((aligned(8)));
 
 struct bpf_btf_info {
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 0fd8d8f1a398..544e58f5f642 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -352,7 +352,7 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key,
 	}
 
 	seq_printf(m, "%u: ", *(u32 *)key);
-	btf_type_seq_show(map->btf, map->btf_value_id, value, m);
+	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
 	seq_puts(m, "\n");
 
 	rcu_read_unlock();
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2b29ef84ded3..0b4c94551001 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -422,7 +422,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
 	return 0;
 }
 
-#define BPF_MAP_CREATE_LAST_FIELD btf_value_id
+#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
@@ -457,10 +457,10 @@ static int map_create(union bpf_attr *attr)
 	atomic_set(&map->usercnt, 1);
 
 	if (bpf_map_support_seq_show(map) &&
-	    (attr->btf_key_id || attr->btf_value_id)) {
+	    (attr->btf_key_type_id || attr->btf_value_type_id)) {
 		struct btf *btf;
 
-		if (!attr->btf_key_id || !attr->btf_value_id) {
+		if (!attr->btf_key_type_id || !attr->btf_value_type_id) {
 			err = -EINVAL;
 			goto free_map_nouncharge;
 		}
@@ -471,16 +471,16 @@ static int map_create(union bpf_attr *attr)
 			goto free_map_nouncharge;
 		}
 
-		err = map->ops->map_check_btf(map, btf, attr->btf_key_id,
-					      attr->btf_value_id);
+		err = map->ops->map_check_btf(map, btf, attr->btf_key_type_id,
+					      attr->btf_value_type_id);
 		if (err) {
 			btf_put(btf);
 			goto free_map_nouncharge;
 		}
 
 		map->btf = btf;
-		map->btf_key_id = attr->btf_key_id;
-		map->btf_value_id = attr->btf_value_id;
+		map->btf_key_type_id = attr->btf_key_type_id;
+		map->btf_value_type_id = attr->btf_value_type_id;
 	}
 
 	err = security_bpf_map_alloc(map);
@@ -2013,8 +2013,8 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
 
 	if (map->btf) {
 		info.btf_id = btf_id(map->btf);
-		info.btf_key_id = map->btf_key_id;
-		info.btf_value_id = map->btf_value_id;
+		info.btf_key_type_id = map->btf_key_type_id;
+		info.btf_value_type_id = map->btf_value_type_id;
 	}
 
 	if (bpf_map_is_dev_bound(map)) {
-- 
cgit v1.2.3-59-g8ed1b


From f03b15d34bd805e57bf69523b4dca7af10e9eeb1 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Tue, 22 May 2018 14:57:22 -0700
Subject: bpf: btf: Sync bpf.h and btf.h to tools

This patch sync the uapi bpf.h and btf.h to tools.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/include/uapi/linux/bpf.h |  8 ++++----
 tools/include/uapi/linux/btf.h | 37 +++++++++++--------------------------
 2 files changed, 15 insertions(+), 30 deletions(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 97446bbe2ca5..c3e502d06bc3 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -284,8 +284,8 @@ union bpf_attr {
 		char	map_name[BPF_OBJ_NAME_LEN];
 		__u32	map_ifindex;	/* ifindex of netdev to create on */
 		__u32	btf_fd;		/* fd pointing to a BTF type data */
-		__u32	btf_key_id;	/* BTF type_id of the key */
-		__u32	btf_value_id;	/* BTF type_id of the value */
+		__u32	btf_key_type_id;	/* BTF type_id of the key */
+		__u32	btf_value_type_id;	/* BTF type_id of the value */
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -2219,8 +2219,8 @@ struct bpf_map_info {
 	__u64 netns_dev;
 	__u64 netns_ino;
 	__u32 btf_id;
-	__u32 btf_key_id;
-	__u32 btf_value_id;
+	__u32 btf_key_type_id;
+	__u32 btf_value_type_id;
 } __attribute__((aligned(8)));
 
 struct bpf_btf_info {
diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h
index bcb56ee47014..0b5ddbe135a4 100644
--- a/tools/include/uapi/linux/btf.h
+++ b/tools/include/uapi/linux/btf.h
@@ -12,42 +12,29 @@ struct btf_header {
 	__u16	magic;
 	__u8	version;
 	__u8	flags;
-
-	__u32	parent_label;
-	__u32	parent_name;
+	__u32	hdr_len;
 
 	/* All offsets are in bytes relative to the end of this header */
-	__u32	label_off;	/* offset of label section	*/
-	__u32	object_off;	/* offset of data object section*/
-	__u32	func_off;	/* offset of function section	*/
 	__u32	type_off;	/* offset of type section	*/
+	__u32	type_len;	/* length of type section	*/
 	__u32	str_off;	/* offset of string section	*/
 	__u32	str_len;	/* length of string section	*/
 };
 
 /* Max # of type identifier */
-#define BTF_MAX_TYPE	0x7fffffff
+#define BTF_MAX_TYPE	0x0000ffff
 /* Max offset into the string section */
-#define BTF_MAX_NAME_OFFSET	0x7fffffff
+#define BTF_MAX_NAME_OFFSET	0x0000ffff
 /* Max # of struct/union/enum members or func args */
 #define BTF_MAX_VLEN	0xffff
 
-/* The type id is referring to a parent BTF */
-#define BTF_TYPE_PARENT(id)	(((id) >> 31) & 0x1)
-#define BTF_TYPE_ID(id)		((id) & BTF_MAX_TYPE)
-
-/* String is in the ELF string section */
-#define BTF_STR_TBL_ELF_ID(ref)	(((ref) >> 31) & 0x1)
-#define BTF_STR_OFFSET(ref)	((ref) & BTF_MAX_NAME_OFFSET)
-
 struct btf_type {
 	__u32 name_off;
 	/* "info" bits arrangement
 	 * bits  0-15: vlen (e.g. # of struct's members)
 	 * bits 16-23: unused
-	 * bits 24-28: kind (e.g. int, ptr, array...etc)
-	 * bits 29-30: unused
-	 * bits    31: root
+	 * bits 24-27: kind (e.g. int, ptr, array...etc)
+	 * bits 28-31: unused
 	 */
 	__u32 info;
 	/* "size" is used by INT, ENUM, STRUCT and UNION.
@@ -62,8 +49,7 @@ struct btf_type {
 	};
 };
 
-#define BTF_INFO_KIND(info)	(((info) >> 24) & 0x1f)
-#define BTF_INFO_ISROOT(info)	(!!(((info) >> 24) & 0x80))
+#define BTF_INFO_KIND(info)	(((info) >> 24) & 0x0f)
 #define BTF_INFO_VLEN(info)	((info) & 0xffff)
 
 #define BTF_KIND_UNKN		0	/* Unknown	*/
@@ -88,15 +74,14 @@ struct btf_type {
 /* BTF_KIND_INT is followed by a u32 and the following
  * is the 32 bits arrangement:
  */
-#define BTF_INT_ENCODING(VAL)	(((VAL) & 0xff000000) >> 24)
+#define BTF_INT_ENCODING(VAL)	(((VAL) & 0x0f000000) >> 24)
 #define BTF_INT_OFFSET(VAL)	(((VAL  & 0x00ff0000)) >> 16)
 #define BTF_INT_BITS(VAL)	((VAL)  & 0x0000ffff)
 
 /* Attributes stored in the BTF_INT_ENCODING */
-#define BTF_INT_SIGNED	0x1
-#define BTF_INT_CHAR	0x2
-#define BTF_INT_BOOL	0x4
-#define BTF_INT_VARARGS	0x8
+#define BTF_INT_SIGNED	(1 << 0)
+#define BTF_INT_CHAR	(1 << 1)
+#define BTF_INT_BOOL	(1 << 2)
 
 /* BTF_KIND_ENUM is followed by multiple "struct btf_enum".
  * The exact number of btf_enum is stored in the vlen (of the
-- 
cgit v1.2.3-59-g8ed1b


From 61746dbe1aa27c9e23293621665b8442dfed7698 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Tue, 22 May 2018 15:04:24 -0700
Subject: bpf: btf: Add tests for the btf uapi changes

This patch does the followings:
1. Modify libbpf and test_btf to reflect the uapi changes in btf
2. Add test for the btf_header changes
3. Add tests for array->index_type
4. Add err_str check to the tests
5. Fix a 4 bytes hole in "struct test #1" by swapping "m" and "n"

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/bpf.c                    |   4 +-
 tools/lib/bpf/bpf.h                    |   4 +-
 tools/lib/bpf/btf.c                    |   5 +-
 tools/lib/bpf/libbpf.c                 |  34 +--
 tools/lib/bpf/libbpf.h                 |   4 +-
 tools/testing/selftests/bpf/test_btf.c | 521 +++++++++++++++++++++++++++------
 6 files changed, 456 insertions(+), 116 deletions(-)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 6a8a00097fd8..442b4cdfeb71 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -89,8 +89,8 @@ int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr)
 	       min(name_len, BPF_OBJ_NAME_LEN - 1));
 	attr.numa_node = create_attr->numa_node;
 	attr.btf_fd = create_attr->btf_fd;
-	attr.btf_key_id = create_attr->btf_key_id;
-	attr.btf_value_id = create_attr->btf_value_id;
+	attr.btf_key_type_id = create_attr->btf_key_type_id;
+	attr.btf_value_type_id = create_attr->btf_value_type_id;
 	attr.map_ifindex = create_attr->map_ifindex;
 
 	return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 15bff7728cf1..d12344f66d4e 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -36,8 +36,8 @@ struct bpf_create_map_attr {
 	__u32 max_entries;
 	__u32 numa_node;
 	__u32 btf_fd;
-	__u32 btf_key_id;
-	__u32 btf_value_id;
+	__u32 btf_key_type_id;
+	__u32 btf_value_type_id;
 	__u32 map_ifindex;
 };
 
diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 2bac710e3194..8c54a4b6f187 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -35,9 +35,8 @@ struct btf {
 
 static const char *btf_name_by_offset(const struct btf *btf, uint32_t offset)
 {
-	if (!BTF_STR_TBL_ELF_ID(offset) &&
-	    BTF_STR_OFFSET(offset) < btf->hdr->str_len)
-		return &btf->strings[BTF_STR_OFFSET(offset)];
+	if (offset < btf->hdr->str_len)
+		return &btf->strings[offset];
 	else
 		return NULL;
 }
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 3dbe217bf23e..8f1707dbfcfa 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -216,8 +216,8 @@ struct bpf_map {
 	size_t offset;
 	int map_ifindex;
 	struct bpf_map_def def;
-	uint32_t btf_key_id;
-	uint32_t btf_value_id;
+	uint32_t btf_key_type_id;
+	uint32_t btf_value_type_id;
 	void *priv;
 	bpf_map_clear_priv_t clear_priv;
 };
@@ -1074,8 +1074,8 @@ static int bpf_map_find_btf_info(struct bpf_map *map, const struct btf *btf)
 		return -EINVAL;
 	}
 
-	map->btf_key_id = key_id;
-	map->btf_value_id = value_id;
+	map->btf_key_type_id = key_id;
+	map->btf_value_type_id = value_id;
 
 	return 0;
 }
@@ -1100,24 +1100,24 @@ bpf_object__create_maps(struct bpf_object *obj)
 		create_attr.value_size = def->value_size;
 		create_attr.max_entries = def->max_entries;
 		create_attr.btf_fd = 0;
-		create_attr.btf_key_id = 0;
-		create_attr.btf_value_id = 0;
+		create_attr.btf_key_type_id = 0;
+		create_attr.btf_value_type_id = 0;
 
 		if (obj->btf && !bpf_map_find_btf_info(map, obj->btf)) {
 			create_attr.btf_fd = btf__fd(obj->btf);
-			create_attr.btf_key_id = map->btf_key_id;
-			create_attr.btf_value_id = map->btf_value_id;
+			create_attr.btf_key_type_id = map->btf_key_type_id;
+			create_attr.btf_value_type_id = map->btf_value_type_id;
 		}
 
 		*pfd = bpf_create_map_xattr(&create_attr);
-		if (*pfd < 0 && create_attr.btf_key_id) {
+		if (*pfd < 0 && create_attr.btf_key_type_id) {
 			pr_warning("Error in bpf_create_map_xattr(%s):%s(%d). Retrying without BTF.\n",
 				   map->name, strerror(errno), errno);
 			create_attr.btf_fd = 0;
-			create_attr.btf_key_id = 0;
-			create_attr.btf_value_id = 0;
-			map->btf_key_id = 0;
-			map->btf_value_id = 0;
+			create_attr.btf_key_type_id = 0;
+			create_attr.btf_value_type_id = 0;
+			map->btf_key_type_id = 0;
+			map->btf_value_type_id = 0;
 			*pfd = bpf_create_map_xattr(&create_attr);
 		}
 
@@ -2085,14 +2085,14 @@ const char *bpf_map__name(struct bpf_map *map)
 	return map ? map->name : NULL;
 }
 
-uint32_t bpf_map__btf_key_id(const struct bpf_map *map)
+uint32_t bpf_map__btf_key_type_id(const struct bpf_map *map)
 {
-	return map ? map->btf_key_id : 0;
+	return map ? map->btf_key_type_id : 0;
 }
 
-uint32_t bpf_map__btf_value_id(const struct bpf_map *map)
+uint32_t bpf_map__btf_value_type_id(const struct bpf_map *map)
 {
-	return map ? map->btf_value_id : 0;
+	return map ? map->btf_value_type_id : 0;
 }
 
 int bpf_map__set_priv(struct bpf_map *map, void *priv,
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index cd3fd8d782c7..09976531aa74 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -244,8 +244,8 @@ bpf_map__next(struct bpf_map *map, struct bpf_object *obj);
 int bpf_map__fd(struct bpf_map *map);
 const struct bpf_map_def *bpf_map__def(struct bpf_map *map);
 const char *bpf_map__name(struct bpf_map *map);
-uint32_t bpf_map__btf_key_id(const struct bpf_map *map);
-uint32_t bpf_map__btf_value_id(const struct bpf_map *map);
+uint32_t bpf_map__btf_key_type_id(const struct bpf_map *map);
+uint32_t bpf_map__btf_value_type_id(const struct bpf_map *map);
 
 typedef void (*bpf_map_clear_priv_t)(struct bpf_map *, void *);
 int bpf_map__set_priv(struct bpf_map *map, void *priv,
diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c
index c8bceae7ec02..35064df688c1 100644
--- a/tools/testing/selftests/bpf/test_btf.c
+++ b/tools/testing/selftests/bpf/test_btf.c
@@ -113,22 +113,25 @@ static char btf_log_buf[BTF_LOG_BUF_SIZE];
 static struct btf_header hdr_tmpl = {
 	.magic = BTF_MAGIC,
 	.version = BTF_VERSION,
+	.hdr_len = sizeof(struct btf_header),
 };
 
 struct btf_raw_test {
 	const char *descr;
 	const char *str_sec;
 	const char *map_name;
+	const char *err_str;
 	__u32 raw_types[MAX_NR_RAW_TYPES];
 	__u32 str_sec_size;
 	enum bpf_map_type map_type;
 	__u32 key_size;
 	__u32 value_size;
-	__u32 key_id;
-	__u32 value_id;
+	__u32 key_type_id;
+	__u32 value_type_id;
 	__u32 max_entries;
 	bool btf_load_err;
 	bool map_create_err;
+	int hdr_len_delta;
 	int type_off_delta;
 	int str_off_delta;
 	int str_len_delta;
@@ -141,8 +144,8 @@ static struct btf_raw_test raw_tests[] = {
  * };
  *
  * struct A {
- *	int m;
- *	unsigned long long n;
+ *	unsigned long long m;
+ *	int n;
  *	char o;
  *	[3 bytes hole]
  *	int p[8];
@@ -163,8 +166,8 @@ static struct btf_raw_test raw_tests[] = {
 		BTF_TYPE_ARRAY_ENC(1, 1, 8),			/* [4] */
 		/* struct A { */				/* [5] */
 		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 6), 180),
-		BTF_MEMBER_ENC(NAME_TBD, 1, 0),	/* int m;		*/
-		BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* unsigned long long n;*/
+		BTF_MEMBER_ENC(NAME_TBD, 2, 0),	/* unsigned long long m;*/
+		BTF_MEMBER_ENC(NAME_TBD, 1, 64),/* int n;		*/
 		BTF_MEMBER_ENC(NAME_TBD, 3, 96),/* char o;		*/
 		BTF_MEMBER_ENC(NAME_TBD, 4, 128),/* int p[8]		*/
 		BTF_MEMBER_ENC(NAME_TBD, 6, 384),/* int q[4][8]		*/
@@ -172,6 +175,7 @@ static struct btf_raw_test raw_tests[] = {
 		/* } */
 		/* int[4][8] */
 		BTF_TYPE_ARRAY_ENC(4, 1, 4),			/* [6] */
+		/* enum E */					/* [7] */
 		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_ENUM, 0, 2), sizeof(int)),
 		BTF_ENUM_ENC(NAME_TBD, 0),
 		BTF_ENUM_ENC(NAME_TBD, 1),
@@ -183,8 +187,8 @@ static struct btf_raw_test raw_tests[] = {
 	.map_name = "struct_test1_map",
 	.key_size = sizeof(int),
 	.value_size = 180,
-	.key_id = 1,
-	.value_id = 5,
+	.key_type_id = 1,
+	.value_type_id = 5,
 	.max_entries = 4,
 },
 
@@ -238,8 +242,8 @@ static struct btf_raw_test raw_tests[] = {
 	.map_name = "struct_test2_map",
 	.key_size = sizeof(int),
 	.value_size = 68,
-	.key_id = 1,
-	.value_id = 3,
+	.key_type_id = 1,
+	.value_type_id = 3,
 	.max_entries = 4,
 },
 
@@ -258,7 +262,7 @@ static struct btf_raw_test raw_tests[] = {
 		/* struct A { */				/* [2] */
 		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), sizeof(int) * 2 -  1),
 		BTF_MEMBER_ENC(NAME_TBD, 1, 0),	/* int m; */
-		BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* int n; */
+		BTF_MEMBER_ENC(NAME_TBD, 1, 32),/* int n; */
 		/* } */
 		BTF_END_RAW,
 	},
@@ -268,10 +272,11 @@ static struct btf_raw_test raw_tests[] = {
 	.map_name = "size_check1_map",
 	.key_size = sizeof(int),
 	.value_size = 1,
-	.key_id = 1,
-	.value_id = 2,
+	.key_type_id = 1,
+	.value_type_id = 2,
 	.max_entries = 4,
 	.btf_load_err = true,
+	.err_str = "Member exceeds struct_size",
 },
 
 /* Test member exeeds the size of struct
@@ -301,11 +306,11 @@ static struct btf_raw_test raw_tests[] = {
 	.map_name = "size_check2_map",
 	.key_size = sizeof(int),
 	.value_size = 1,
-	.key_id = 1,
-	.value_id = 3,
+	.key_type_id = 1,
+	.value_type_id = 3,
 	.max_entries = 4,
 	.btf_load_err = true,
-
+	.err_str = "Member exceeds struct_size",
 },
 
 /* Test member exeeds the size of struct
@@ -335,10 +340,11 @@ static struct btf_raw_test raw_tests[] = {
 	.map_name = "size_check3_map",
 	.key_size = sizeof(int),
 	.value_size = 1,
-	.key_id = 1,
-	.value_id = 3,
+	.key_type_id = 1,
+	.value_type_id = 3,
 	.max_entries = 4,
 	.btf_load_err = true,
+	.err_str = "Member exceeds struct_size",
 },
 
 /* Test member exceeds the size of struct
@@ -376,10 +382,11 @@ static struct btf_raw_test raw_tests[] = {
 	.map_name = "size_check4_map",
 	.key_size = sizeof(int),
 	.value_size = 1,
-	.key_id = 1,
-	.value_id = 3,
+	.key_type_id = 1,
+	.value_type_id = 3,
 	.max_entries = 4,
 	.btf_load_err = true,
+	.err_str = "Member exceeds struct_size",
 },
 
 /* typedef const void * const_void_ptr;
@@ -411,8 +418,8 @@ static struct btf_raw_test raw_tests[] = {
 	.map_name = "void_test1_map",
 	.key_size = sizeof(int),
 	.value_size = sizeof(void *),
-	.key_id = 1,
-	.value_id = 4,
+	.key_type_id = 1,
+	.value_type_id = 4,
 	.max_entries = 4,
 },
 
@@ -440,10 +447,11 @@ static struct btf_raw_test raw_tests[] = {
 	.map_name = "void_test2_map",
 	.key_size = sizeof(int),
 	.value_size = sizeof(void *),
-	.key_id = 1,
-	.value_id = 3,
+	.key_type_id = 1,
+	.value_type_id = 3,
 	.max_entries = 4,
 	.btf_load_err = true,
+	.err_str = "Invalid member",
 },
 
 /* typedef const void * const_void_ptr;
@@ -458,9 +466,9 @@ static struct btf_raw_test raw_tests[] = {
 		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 0),
 		/* const void* */	/* [3] */
 		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2),
-		/* typedef const void * const_void_ptr */
+		/* typedef const void * const_void_ptr */	/* [4] */
 		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 3),
-		/* const_void_ptr[4] */	/* [4] */
+		/* const_void_ptr[4] */	/* [5] */
 		BTF_TYPE_ARRAY_ENC(3, 1, 4),
 		BTF_END_RAW,
 	},
@@ -470,8 +478,8 @@ static struct btf_raw_test raw_tests[] = {
 	.map_name = "void_test3_map",
 	.key_size = sizeof(int),
 	.value_size = sizeof(void *) * 4,
-	.key_id = 1,
-	.value_id = 4,
+	.key_type_id = 1,
+	.value_type_id = 4,
 	.max_entries = 4,
 },
 
@@ -493,10 +501,11 @@ static struct btf_raw_test raw_tests[] = {
 	.map_name = "void_test4_map",
 	.key_size = sizeof(int),
 	.value_size = sizeof(void *) * 4,
-	.key_id = 1,
-	.value_id = 3,
+	.key_type_id = 1,
+	.value_type_id = 3,
 	.max_entries = 4,
 	.btf_load_err = true,
+	.err_str = "Invalid elem",
 },
 
 /* Array_A  <------------------+
@@ -523,10 +532,11 @@ static struct btf_raw_test raw_tests[] = {
 	.map_name = "loop_test1_map",
 	.key_size = sizeof(int),
 	.value_size = sizeof(sizeof(int) * 8),
-	.key_id = 1,
-	.value_id = 2,
+	.key_type_id = 1,
+	.value_type_id = 2,
 	.max_entries = 4,
 	.btf_load_err = true,
+	.err_str = "Loop detected",
 },
 
 /* typedef is _before_ the BTF type of Array_A and Array_B
@@ -551,7 +561,6 @@ static struct btf_raw_test raw_tests[] = {
 		BTF_TYPE_ARRAY_ENC(2, 1, 8),			/* [3] */
 		/* Array_B */
 		BTF_TYPE_ARRAY_ENC(3, 1, 8),			/* [4] */
-
 		BTF_END_RAW,
 	},
 	.str_sec = "\0int_array\0",
@@ -560,10 +569,11 @@ static struct btf_raw_test raw_tests[] = {
 	.map_name = "loop_test2_map",
 	.key_size = sizeof(int),
 	.value_size = sizeof(sizeof(int) * 8),
-	.key_id = 1,
-	.value_id = 2,
+	.key_type_id = 1,
+	.value_type_id = 2,
 	.max_entries = 4,
 	.btf_load_err = true,
+	.err_str = "Loop detected",
 },
 
 /* Array_A  <------------------+
@@ -582,7 +592,6 @@ static struct btf_raw_test raw_tests[] = {
 		BTF_TYPE_ARRAY_ENC(3, 1, 8),
 		/* Array_B */				/* [3] */
 		BTF_TYPE_ARRAY_ENC(2, 1, 8),
-
 		BTF_END_RAW,
 	},
 	.str_sec = "",
@@ -591,10 +600,11 @@ static struct btf_raw_test raw_tests[] = {
 	.map_name = "loop_test3_map",
 	.key_size = sizeof(int),
 	.value_size = sizeof(sizeof(int) * 8),
-	.key_id = 1,
-	.value_id = 2,
+	.key_type_id = 1,
+	.value_type_id = 2,
 	.max_entries = 4,
 	.btf_load_err = true,
+	.err_str = "Loop detected",
 },
 
 /* typedef is _between_ the BTF type of Array_A and Array_B
@@ -627,10 +637,11 @@ static struct btf_raw_test raw_tests[] = {
 	.map_name = "loop_test4_map",
 	.key_size = sizeof(int),
 	.value_size = sizeof(sizeof(int) * 8),
-	.key_id = 1,
-	.value_id = 2,
+	.key_type_id = 1,
+	.value_type_id = 2,
 	.max_entries = 4,
 	.btf_load_err = true,
+	.err_str = "Loop detected",
 },
 
 /* typedef struct B Struct_B
@@ -668,10 +679,11 @@ static struct btf_raw_test raw_tests[] = {
 	.map_name = "loop_test5_map",
 	.key_size = sizeof(int),
 	.value_size = 8,
-	.key_id = 1,
-	.value_id = 2,
+	.key_type_id = 1,
+	.value_type_id = 2,
 	.max_entries = 4,
 	.btf_load_err = true,
+	.err_str = "Loop detected",
 },
 
 /* struct A {
@@ -697,10 +709,11 @@ static struct btf_raw_test raw_tests[] = {
 	.map_name = "loop_test6_map",
 	.key_size = sizeof(int),
 	.value_size = 8,
-	.key_id = 1,
-	.value_id = 2,
+	.key_type_id = 1,
+	.value_type_id = 2,
 	.max_entries = 4,
 	.btf_load_err = true,
+	.err_str = "Loop detected",
 },
 
 {
@@ -724,10 +737,11 @@ static struct btf_raw_test raw_tests[] = {
 	.map_name = "loop_test7_map",
 	.key_size = sizeof(int),
 	.value_size = sizeof(void *),
-	.key_id = 1,
-	.value_id = 2,
+	.key_type_id = 1,
+	.value_type_id = 2,
 	.max_entries = 4,
 	.btf_load_err = true,
+	.err_str = "Loop detected",
 },
 
 {
@@ -759,34 +773,73 @@ static struct btf_raw_test raw_tests[] = {
 	.map_name = "loop_test8_map",
 	.key_size = sizeof(int),
 	.value_size = sizeof(void *),
-	.key_id = 1,
-	.value_id = 2,
+	.key_type_id = 1,
+	.value_type_id = 2,
 	.max_entries = 4,
 	.btf_load_err = true,
+	.err_str = "Loop detected",
 },
 
 {
-	.descr = "type_off == str_off",
+	.descr = "string section does not end with null",
 	.raw_types = {
 		/* int */				/* [1] */
 		BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
 		BTF_END_RAW,
 	},
 	.str_sec = "\0int",
+	.str_sec_size = sizeof("\0int") - 1,
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "hdr_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_type_id = 1,
+	.value_type_id = 1,
+	.max_entries = 4,
+	.btf_load_err = true,
+	.err_str = "Invalid string section",
+},
+
+{
+	.descr = "empty string section",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "",
+	.str_sec_size = 0,
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "hdr_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_type_id = 1,
+	.value_type_id = 1,
+	.max_entries = 4,
+	.btf_load_err = true,
+	.err_str = "Invalid string section",
+},
+
+{
+	.descr = "empty type section",
+	.raw_types = {
+		BTF_END_RAW,
+	},
+	.str_sec = "\0int",
 	.str_sec_size = sizeof("\0int"),
 	.map_type = BPF_MAP_TYPE_ARRAY,
 	.map_name = "hdr_test_map",
 	.key_size = sizeof(int),
 	.value_size = sizeof(int),
-	.key_id = 1,
-	.value_id = 1,
+	.key_type_id = 1,
+	.value_type_id = 1,
 	.max_entries = 4,
 	.btf_load_err = true,
-	.type_off_delta = sizeof(struct btf_type) + sizeof(int) + sizeof("\0int"),
+	.err_str = "No type found",
 },
 
 {
-	.descr = "Unaligned type_off",
+	.descr = "btf_header test. Longer hdr_len",
 	.raw_types = {
 		/* int */				/* [1] */
 		BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
@@ -798,15 +851,16 @@ static struct btf_raw_test raw_tests[] = {
 	.map_name = "hdr_test_map",
 	.key_size = sizeof(int),
 	.value_size = sizeof(int),
-	.key_id = 1,
-	.value_id = 1,
+	.key_type_id = 1,
+	.value_type_id = 1,
 	.max_entries = 4,
 	.btf_load_err = true,
-	.type_off_delta = 1,
+	.hdr_len_delta = 4,
+	.err_str = "Unsupported btf_header",
 },
 
 {
-	.descr = "str_off beyonds btf size",
+	.descr = "btf_header test. Gap between hdr and type",
 	.raw_types = {
 		/* int */				/* [1] */
 		BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
@@ -818,15 +872,16 @@ static struct btf_raw_test raw_tests[] = {
 	.map_name = "hdr_test_map",
 	.key_size = sizeof(int),
 	.value_size = sizeof(int),
-	.key_id = 1,
-	.value_id = 1,
+	.key_type_id = 1,
+	.value_type_id = 1,
 	.max_entries = 4,
 	.btf_load_err = true,
-	.str_off_delta = sizeof("\0int") + 1,
+	.type_off_delta = 4,
+	.err_str = "Unsupported section found",
 },
 
 {
-	.descr = "str_len beyonds btf size",
+	.descr = "btf_header test. Gap between type and str",
 	.raw_types = {
 		/* int */				/* [1] */
 		BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
@@ -838,15 +893,16 @@ static struct btf_raw_test raw_tests[] = {
 	.map_name = "hdr_test_map",
 	.key_size = sizeof(int),
 	.value_size = sizeof(int),
-	.key_id = 1,
-	.value_id = 1,
+	.key_type_id = 1,
+	.value_type_id = 1,
 	.max_entries = 4,
 	.btf_load_err = true,
-	.str_len_delta = 1,
+	.str_off_delta = 4,
+	.err_str = "Unsupported section found",
 },
 
 {
-	.descr = "String section does not end with null",
+	.descr = "btf_header test. Overlap between type and str",
 	.raw_types = {
 		/* int */				/* [1] */
 		BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
@@ -858,15 +914,16 @@ static struct btf_raw_test raw_tests[] = {
 	.map_name = "hdr_test_map",
 	.key_size = sizeof(int),
 	.value_size = sizeof(int),
-	.key_id = 1,
-	.value_id = 1,
+	.key_type_id = 1,
+	.value_type_id = 1,
 	.max_entries = 4,
 	.btf_load_err = true,
-	.str_len_delta = -1,
+	.str_off_delta = -4,
+	.err_str = "Section overlap found",
 },
 
 {
-	.descr = "Empty string section",
+	.descr = "btf_header test. Larger BTF size",
 	.raw_types = {
 		/* int */				/* [1] */
 		BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
@@ -878,11 +935,288 @@ static struct btf_raw_test raw_tests[] = {
 	.map_name = "hdr_test_map",
 	.key_size = sizeof(int),
 	.value_size = sizeof(int),
-	.key_id = 1,
-	.value_id = 1,
+	.key_type_id = 1,
+	.value_type_id = 1,
+	.max_entries = 4,
+	.btf_load_err = true,
+	.str_len_delta = -4,
+	.err_str = "Unsupported section found",
+},
+
+{
+	.descr = "btf_header test. Smaller BTF size",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "\0int",
+	.str_sec_size = sizeof("\0int"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "hdr_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_type_id = 1,
+	.value_type_id = 1,
+	.max_entries = 4,
+	.btf_load_err = true,
+	.str_len_delta = 4,
+	.err_str = "Total section length too long",
+},
+
+{
+	.descr = "array test. index_type/elem_type \"int\"",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* int[16] */				/* [2] */
+		BTF_TYPE_ARRAY_ENC(1, 1, 16),
+		BTF_END_RAW,
+	},
+	.str_sec = "",
+	.str_sec_size = sizeof(""),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "array_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_type_id = 1,
+	.value_type_id = 1,
+	.max_entries = 4,
+},
+
+{
+	.descr = "array test. index_type/elem_type \"const int\"",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* int[16] */				/* [2] */
+		BTF_TYPE_ARRAY_ENC(3, 3, 16),
+		/* CONST type_id=1 */			/* [3] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 1),
+		BTF_END_RAW,
+	},
+	.str_sec = "",
+	.str_sec_size = sizeof(""),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "array_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_type_id = 1,
+	.value_type_id = 1,
+	.max_entries = 4,
+},
+
+{
+	.descr = "array test. index_type \"const int:31\"",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* int:31 */				/* [2] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 31, 4),
+		/* int[16] */				/* [3] */
+		BTF_TYPE_ARRAY_ENC(1, 4, 16),
+		/* CONST type_id=2 */			/* [4] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 2),
+		BTF_END_RAW,
+	},
+	.str_sec = "",
+	.str_sec_size = sizeof(""),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "array_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_type_id = 1,
+	.value_type_id = 1,
+	.max_entries = 4,
+	.btf_load_err = true,
+	.err_str = "Invalid index",
+},
+
+{
+	.descr = "array test. elem_type \"const int:31\"",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* int:31 */				/* [2] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 31, 4),
+		/* int[16] */				/* [3] */
+		BTF_TYPE_ARRAY_ENC(4, 1, 16),
+		/* CONST type_id=2 */			/* [4] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 2),
+		BTF_END_RAW,
+	},
+	.str_sec = "",
+	.str_sec_size = sizeof(""),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "array_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_type_id = 1,
+	.value_type_id = 1,
+	.max_entries = 4,
+	.btf_load_err = true,
+	.err_str = "Invalid array of int",
+},
+
+{
+	.descr = "array test. index_type \"void\"",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* int[16] */				/* [2] */
+		BTF_TYPE_ARRAY_ENC(1, 0, 16),
+		BTF_END_RAW,
+	},
+	.str_sec = "",
+	.str_sec_size = sizeof(""),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "array_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_type_id = 1,
+	.value_type_id = 1,
+	.max_entries = 4,
+	.btf_load_err = true,
+	.err_str = "Invalid index",
+},
+
+{
+	.descr = "array test. index_type \"const void\"",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* int[16] */				/* [2] */
+		BTF_TYPE_ARRAY_ENC(1, 3, 16),
+		/* CONST type_id=0 (void) */		/* [3] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 0),
+		BTF_END_RAW,
+	},
+	.str_sec = "",
+	.str_sec_size = sizeof(""),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "array_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_type_id = 1,
+	.value_type_id = 1,
+	.max_entries = 4,
+	.btf_load_err = true,
+	.err_str = "Invalid index",
+},
+
+{
+	.descr = "array test. elem_type \"const void\"",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* int[16] */				/* [2] */
+		BTF_TYPE_ARRAY_ENC(3, 1, 16),
+		/* CONST type_id=0 (void) */		/* [3] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 0),
+		BTF_END_RAW,
+	},
+	.str_sec = "",
+	.str_sec_size = sizeof(""),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "array_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_type_id = 1,
+	.value_type_id = 1,
+	.max_entries = 4,
+	.btf_load_err = true,
+	.err_str = "Invalid elem",
+},
+
+{
+	.descr = "array test. elem_type \"const void *\"",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* const void *[16] */			/* [2] */
+		BTF_TYPE_ARRAY_ENC(3, 1, 16),
+		/* CONST type_id=4 */			/* [3] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 4),
+		/* void* */				/* [4] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 0),
+		BTF_END_RAW,
+	},
+	.str_sec = "",
+	.str_sec_size = sizeof(""),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "array_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_type_id = 1,
+	.value_type_id = 1,
+	.max_entries = 4,
+},
+
+{
+	.descr = "array test. index_type \"const void *\"",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* const void *[16] */			/* [2] */
+		BTF_TYPE_ARRAY_ENC(3, 3, 16),
+		/* CONST type_id=4 */			/* [3] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 4),
+		/* void* */				/* [4] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 0),
+		BTF_END_RAW,
+	},
+	.str_sec = "",
+	.str_sec_size = sizeof(""),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "array_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_type_id = 1,
+	.value_type_id = 1,
 	.max_entries = 4,
 	.btf_load_err = true,
-	.str_len_delta = 0 - (int)sizeof("\0int"),
+	.err_str = "Invalid index",
+},
+
+{
+	.descr = "int test. invalid int_data",
+	.raw_types = {
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_INT, 0, 0), 4),
+		0x10000000,
+		BTF_END_RAW,
+	},
+	.str_sec = "",
+	.str_sec_size = sizeof(""),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "array_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_type_id = 1,
+	.value_type_id = 1,
+	.max_entries = 4,
+	.btf_load_err = true,
+	.err_str = "Invalid int_data",
+},
+
+{
+	.descr = "invalid BTF_INFO",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		BTF_TYPE_ENC(0, 0x10000000, 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "",
+	.str_sec_size = sizeof(""),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "array_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_type_id = 1,
+	.value_type_id = 1,
+	.max_entries = 4,
+	.btf_load_err = true,
+	.err_str = "Invalid btf_info",
 },
 
 }; /* struct btf_raw_test raw_tests[] */
@@ -951,6 +1285,7 @@ static void *btf_raw_create(const struct btf_header *hdr,
 	memcpy(raw_btf + offset, str, str_sec_size);
 
 	ret_hdr = (struct btf_header *)raw_btf;
+	ret_hdr->type_len = type_sec_size;
 	ret_hdr->str_off = type_sec_size;
 	ret_hdr->str_len = str_sec_size;
 
@@ -981,6 +1316,7 @@ static int do_test_raw(unsigned int test_num)
 
 	hdr = raw_btf;
 
+	hdr->hdr_len = (int)hdr->hdr_len + test->hdr_len_delta;
 	hdr->type_off = (int)hdr->type_off + test->type_off_delta;
 	hdr->str_off = (int)hdr->str_off + test->str_off_delta;
 	hdr->str_len = (int)hdr->str_len + test->str_len_delta;
@@ -992,8 +1328,13 @@ static int do_test_raw(unsigned int test_num)
 	free(raw_btf);
 
 	err = ((btf_fd == -1) != test->btf_load_err);
-	CHECK(err, "btf_fd:%d test->btf_load_err:%u",
-	      btf_fd, test->btf_load_err);
+	if (CHECK(err, "btf_fd:%d test->btf_load_err:%u",
+		  btf_fd, test->btf_load_err) ||
+	    CHECK(test->err_str && !strstr(btf_log_buf, test->err_str),
+		  "expected err_str:%s", test->err_str)) {
+		err = -1;
+		goto done;
+	}
 
 	if (err || btf_fd == -1)
 		goto done;
@@ -1004,8 +1345,8 @@ static int do_test_raw(unsigned int test_num)
 	create_attr.value_size = test->value_size;
 	create_attr.max_entries = test->max_entries;
 	create_attr.btf_fd = btf_fd;
-	create_attr.btf_key_id = test->key_id;
-	create_attr.btf_value_id = test->value_id;
+	create_attr.btf_key_type_id = test->key_type_id;
+	create_attr.btf_value_type_id = test->value_type_id;
 
 	map_fd = bpf_create_map_xattr(&create_attr);
 
@@ -1267,8 +1608,8 @@ static int test_btf_id(unsigned int test_num)
 	create_attr.value_size = sizeof(unsigned int);
 	create_attr.max_entries = 4;
 	create_attr.btf_fd = btf_fd[0];
-	create_attr.btf_key_id = 1;
-	create_attr.btf_value_id = 2;
+	create_attr.btf_key_type_id = 1;
+	create_attr.btf_value_type_id = 2;
 
 	map_fd = bpf_create_map_xattr(&create_attr);
 	if (CHECK(map_fd == -1, "errno:%d", errno)) {
@@ -1279,10 +1620,10 @@ static int test_btf_id(unsigned int test_num)
 	info_len = sizeof(map_info);
 	err = bpf_obj_get_info_by_fd(map_fd, &map_info, &info_len);
 	if (CHECK(err || map_info.btf_id != info[0].id ||
-		  map_info.btf_key_id != 1 || map_info.btf_value_id != 2,
-		  "err:%d errno:%d info.id:%u btf_id:%u btf_key_id:%u btf_value_id:%u",
-		  err, errno, info[0].id, map_info.btf_id, map_info.btf_key_id,
-		  map_info.btf_value_id)) {
+		  map_info.btf_key_type_id != 1 || map_info.btf_value_type_id != 2,
+		  "err:%d errno:%d info.id:%u btf_id:%u btf_key_type_id:%u btf_value_type_id:%u",
+		  err, errno, info[0].id, map_info.btf_id, map_info.btf_key_type_id,
+		  map_info.btf_value_type_id)) {
 		err = -1;
 		goto done;
 	}
@@ -1542,10 +1883,10 @@ static int do_test_file(unsigned int test_num)
 		goto done;
 	}
 
-	err = (bpf_map__btf_key_id(map) == 0 || bpf_map__btf_value_id(map) == 0)
+	err = (bpf_map__btf_key_type_id(map) == 0 || bpf_map__btf_value_type_id(map) == 0)
 		!= test->btf_kv_notfound;
-	if (CHECK(err, "btf_key_id:%u btf_value_id:%u test->btf_kv_notfound:%u",
-		  bpf_map__btf_key_id(map), bpf_map__btf_value_id(map),
+	if (CHECK(err, "btf_key_type_id:%u btf_value_type_id:%u test->btf_kv_notfound:%u",
+		  bpf_map__btf_key_type_id(map), bpf_map__btf_value_type_id(map),
 		  test->btf_kv_notfound))
 		goto done;
 
@@ -1615,7 +1956,7 @@ static struct btf_raw_test pprint_test = {
 		/* 28 bits */				/* [7] */
 		BTF_TYPE_INT_ENC(0, 0, 0, 28, 4),
 		/* uint8_t[8] */			/* [8] */
-		BTF_TYPE_ARRAY_ENC(9, 3, 8),
+		BTF_TYPE_ARRAY_ENC(9, 1, 8),
 		/* typedef unsigned char uint8_t */	/* [9] */
 		BTF_TYPEDEF_ENC(NAME_TBD, 1),
 		/* typedef unsigned short uint16_t */	/* [10] */
@@ -1654,8 +1995,8 @@ static struct btf_raw_test pprint_test = {
 	.map_name = "pprint_test",
 	.key_size = sizeof(unsigned int),
 	.value_size = sizeof(struct pprint_mapv),
-	.key_id = 3,	/* unsigned int */
-	.value_id = 16,	/* struct pprint_mapv */
+	.key_type_id = 3,	/* unsigned int */
+	.value_type_id = 16,	/* struct pprint_mapv */
 	.max_entries = 128 * 1024,
 };
 
@@ -1712,8 +2053,8 @@ static int test_pprint(void)
 	create_attr.value_size = test->value_size;
 	create_attr.max_entries = test->max_entries;
 	create_attr.btf_fd = btf_fd;
-	create_attr.btf_key_id = test->key_id;
-	create_attr.btf_value_id = test->value_id;
+	create_attr.btf_key_type_id = test->key_type_id;
+	create_attr.btf_value_type_id = test->value_type_id;
 
 	map_fd = bpf_create_map_xattr(&create_attr);
 	if (CHECK(map_fd == -1, "errno:%d", errno)) {
-- 
cgit v1.2.3-59-g8ed1b


From 0c6bca747111dee19aa48c8f73d77fc85fcb8dd0 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Tue, 15 May 2018 21:23:31 +0900
Subject: netfilter: nf_tables: remove nft_af_info.

The struct nft_af_info was removed.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netns/nftables.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h
index 48134353411d..29c3851b486a 100644
--- a/include/net/netns/nftables.h
+++ b/include/net/netns/nftables.h
@@ -4,8 +4,6 @@
 
 #include <linux/list.h>
 
-struct nft_af_info;
-
 struct netns_nftables {
 	struct list_head	tables;
 	struct list_head	commit_list;
-- 
cgit v1.2.3-59-g8ed1b


From 167381f3eac0d9db82ad2c565048bfea8fb01574 Mon Sep 17 00:00:00 2001
From: Sirio Balmelli <sirio@b-ad.ch>
Date: Mon, 21 May 2018 09:00:03 +0200
Subject: selftests/bpf: Makefile fix "missing" headers on build with
 -idirafter

Selftests fail to build on several distros/architectures because of
	missing headers files.

On a Ubuntu/x86_64 some missing headers are:
	asm/byteorder.h, asm/socket.h, asm/sockios.h

On a Debian/arm32 build already fails at sys/cdefs.h

In both cases, these already exist in /usr/include/<arch-specific-dir>,
but Clang does not include these when using '-target bpf' flag,
since it is no longer compiling against the host architecture.

The solution is to:

- run Clang without '-target bpf' and extract the include chain for the
current system

- add these to the bpf build with '-idirafter'

The choice of -idirafter is to catch this error without injecting
unexpected include behavior: if an arch-specific tree is built
for bpf in the future, this will be correctly found by Clang.

Signed-off-by: Sirio Balmelli <sirio@b-ad.ch>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/Makefile | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 1eb0fa2aba92..49618c9a5638 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -84,7 +84,17 @@ else
   CPU ?= generic
 endif
 
+# Get Clang's default includes on this system, as opposed to those seen by
+# '-target bpf'. This fixes "missing" files on some architectures/distros,
+# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc.
+#
+# Use '-idirafter': Don't interfere with include mechanics except where the
+# build would have failed anyways.
+CLANG_SYS_INCLUDES := $(shell $(CLANG) -v -E - </dev/null 2>&1 \
+	| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')
+
 CLANG_FLAGS = -I. -I./include/uapi -I../../../include/uapi \
+	      $(CLANG_SYS_INCLUDES) \
 	      -Wno-compare-distinct-pointer-types
 
 $(OUTPUT)/test_l4lb_noinline.o: CLANG_FLAGS += -fno-inline
-- 
cgit v1.2.3-59-g8ed1b


From f8793c26fe586659d6da3fa277e63961a69d314b Mon Sep 17 00:00:00 2001
From: Arend Van Spriel <arend.vanspriel@broadcom.com>
Date: Wed, 16 May 2018 14:11:58 +0200
Subject: brcmfmac: move ALLFFMAC variable in flowring module

The only user of ALLFFMAC is the flowring module so no need to
expose it in a header file.

Reviewed-by: Hante Meuleman <hante.meuleman@broadcom.com>
Reviewed-by: Pieter-Paul Giesberts <pieter-paul.giesberts@broadcom.com>
Reviewed-by: Franky Lin <franky.lin@broadcom.com>
Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/common.c   | 2 --
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/common.h   | 2 --
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/flowring.c | 2 ++
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/common.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/common.c
index 105b8774fca9..cd3651069d0c 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/common.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/common.c
@@ -36,8 +36,6 @@ MODULE_AUTHOR("Broadcom Corporation");
 MODULE_DESCRIPTION("Broadcom 802.11 wireless LAN fullmac driver.");
 MODULE_LICENSE("Dual BSD/GPL");
 
-const u8 ALLFFMAC[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
-
 #define BRCMF_DEFAULT_SCAN_CHANNEL_TIME	40
 #define BRCMF_DEFAULT_SCAN_UNASSOC_TIME	40
 
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/common.h b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/common.h
index ef914619e8e1..a34642cb4d2f 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/common.h
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/common.h
@@ -19,8 +19,6 @@
 #include <linux/platform_data/brcmfmac.h>
 #include "fwil_types.h"
 
-extern const u8 ALLFFMAC[ETH_ALEN];
-
 #define BRCMF_FW_ALTPATH_LEN			256
 
 /* Definitions for the module global and device specific settings are defined
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/flowring.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/flowring.c
index d0b738da2458..d0d8b32af7d0 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/flowring.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/flowring.c
@@ -46,6 +46,8 @@ static const u8 brcmf_flowring_prio2fifo[] = {
 	3
 };
 
+static const u8 ALLFFMAC[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+
 
 static bool
 brcmf_flowring_is_tdls_mac(struct brcmf_flowring *flow, u8 mac[ETH_ALEN])
-- 
cgit v1.2.3-59-g8ed1b


From 8e072168f75ebce85b96cbcefea2b10ddbd5913f Mon Sep 17 00:00:00 2001
From: Arend Van Spriel <arend.vanspriel@broadcom.com>
Date: Wed, 16 May 2018 14:11:59 +0200
Subject: brcmfmac: add support for sysfs initiated coredump

The driver already supports device coredump initiated by firmware
event. Since commit 3c47d19ff4dc ("drivers: base: add coredump driver
ops") it is also possible to initiate it from user-space through
sysfs. This patch adds support for SDIO and PCIe devices.

Reviewed-by: Hante Meuleman <hante.meuleman@broadcom.com>
Reviewed-by: Pieter-Paul Giesberts <pieter-paul.giesberts@broadcom.com>
Reviewed-by: Franky Lin <franky.lin@broadcom.com>
Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c | 1 +
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/bus.h    | 2 ++
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c   | 8 ++++++++
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c   | 1 +
 4 files changed, 12 insertions(+)

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
index a1915411c280..d2f788d88668 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
@@ -1165,6 +1165,7 @@ static struct sdio_driver brcmf_sdmmc_driver = {
 #ifdef CONFIG_PM_SLEEP
 		.pm = &brcmf_sdio_pm_ops,
 #endif	/* CONFIG_PM_SLEEP */
+		.coredump = brcmf_dev_coredump,
 	},
 };
 
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bus.h b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bus.h
index 27e693e93f21..c4965184cdf3 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bus.h
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bus.h
@@ -250,6 +250,8 @@ int brcmf_attach(struct device *dev, struct brcmf_mp_device *settings);
 void brcmf_detach(struct device *dev);
 /* Indication from bus module that dongle should be reset */
 void brcmf_dev_reset(struct device *dev);
+/* Request from bus module to initiate a coredump */
+void brcmf_dev_coredump(struct device *dev);
 
 /* Configure the "global" bus state used by upper layers */
 void brcmf_bus_change_state(struct brcmf_bus *bus, enum brcmf_bus_state state);
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c
index 8d4511eaa9b9..72954fd6df3b 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c
@@ -1180,6 +1180,14 @@ void brcmf_dev_reset(struct device *dev)
 		brcmf_fil_cmd_int_set(drvr->iflist[0], BRCMF_C_TERMINATED, 1);
 }
 
+void brcmf_dev_coredump(struct device *dev)
+{
+	struct brcmf_bus *bus_if = dev_get_drvdata(dev);
+
+	if (brcmf_debug_create_memdump(bus_if, NULL, 0) < 0)
+		brcmf_dbg(TRACE, "failed to create coredump\n");
+}
+
 void brcmf_detach(struct device *dev)
 {
 	s32 i;
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
index f0797aeada67..5baa8372371d 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
@@ -2044,6 +2044,7 @@ static struct pci_driver brcmf_pciedrvr = {
 #ifdef CONFIG_PM
 	.driver.pm = &brcmf_pciedrvr_pm,
 #endif
+	.driver.coredump = brcmf_dev_coredump,
 };
 
 
-- 
cgit v1.2.3-59-g8ed1b


From 21c5c83ce83359f0ab930824c67984575c051550 Mon Sep 17 00:00:00 2001
From: Arend Van Spriel <arend.vanspriel@broadcom.com>
Date: Wed, 16 May 2018 14:12:00 +0200
Subject: mwifiex: support sysfs initiated device coredump

Since commit 3c47d19ff4dc ("drivers: base: add coredump driver ops")
it is possible to initiate a device coredump from user-space. This
patch adds support for it adding the .coredump() driver callback.
As there is no longer a need to initiate it through debugfs remove
that code.

Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/marvell/mwifiex/cmdevt.c  |  1 +
 drivers/net/wireless/marvell/mwifiex/debugfs.c | 31 +-------------------------
 drivers/net/wireless/marvell/mwifiex/pcie.c    | 18 +++++++++++++--
 drivers/net/wireless/marvell/mwifiex/sdio.c    | 12 ++++++++++
 drivers/net/wireless/marvell/mwifiex/usb.c     | 13 +++++++++++
 5 files changed, 43 insertions(+), 32 deletions(-)

diff --git a/drivers/net/wireless/marvell/mwifiex/cmdevt.c b/drivers/net/wireless/marvell/mwifiex/cmdevt.c
index 9cfcdf6bec52..8be1e69657a7 100644
--- a/drivers/net/wireless/marvell/mwifiex/cmdevt.c
+++ b/drivers/net/wireless/marvell/mwifiex/cmdevt.c
@@ -674,6 +674,7 @@ int mwifiex_send_cmd(struct mwifiex_private *priv, u16 cmd_no,
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(mwifiex_send_cmd);
 
 /*
  * This function queues a command to the command pending queue.
diff --git a/drivers/net/wireless/marvell/mwifiex/debugfs.c b/drivers/net/wireless/marvell/mwifiex/debugfs.c
index db2872daae97..07453932f703 100644
--- a/drivers/net/wireless/marvell/mwifiex/debugfs.c
+++ b/drivers/net/wireless/marvell/mwifiex/debugfs.c
@@ -153,34 +153,6 @@ free_and_exit:
 	return ret;
 }
 
-/*
- * Proc device dump read handler.
- *
- * This function is called when the 'device_dump' file is opened for
- * reading.
- * This function dumps driver information and firmware memory segments
- * (ex. DTCM, ITCM, SQRAM etc.) for
- * debugging.
- */
-static ssize_t
-mwifiex_device_dump_read(struct file *file, char __user *ubuf,
-			 size_t count, loff_t *ppos)
-{
-	struct mwifiex_private *priv = file->private_data;
-
-	/* For command timeouts, USB firmware will automatically emit
-	 * firmware dump events, so we don't implement device_dump().
-	 * For user-initiated dumps, we trigger it ourselves.
-	 */
-	if (priv->adapter->iface_type == MWIFIEX_USB)
-		mwifiex_send_cmd(priv, HostCmd_CMD_FW_DUMP_EVENT,
-				 HostCmd_ACT_GEN_SET, 0, NULL, true);
-	else
-		priv->adapter->if_ops.device_dump(priv->adapter);
-
-	return 0;
-}
-
 /*
  * Proc getlog file read handler.
  *
@@ -980,7 +952,6 @@ static const struct file_operations mwifiex_dfs_##name##_fops = {       \
 MWIFIEX_DFS_FILE_READ_OPS(info);
 MWIFIEX_DFS_FILE_READ_OPS(debug);
 MWIFIEX_DFS_FILE_READ_OPS(getlog);
-MWIFIEX_DFS_FILE_READ_OPS(device_dump);
 MWIFIEX_DFS_FILE_OPS(regrdwr);
 MWIFIEX_DFS_FILE_OPS(rdeeprom);
 MWIFIEX_DFS_FILE_OPS(memrw);
@@ -1011,7 +982,7 @@ mwifiex_dev_debugfs_init(struct mwifiex_private *priv)
 	MWIFIEX_DFS_ADD_FILE(getlog);
 	MWIFIEX_DFS_ADD_FILE(regrdwr);
 	MWIFIEX_DFS_ADD_FILE(rdeeprom);
-	MWIFIEX_DFS_ADD_FILE(device_dump);
+
 	MWIFIEX_DFS_ADD_FILE(memrw);
 	MWIFIEX_DFS_ADD_FILE(hscfg);
 	MWIFIEX_DFS_ADD_FILE(histogram);
diff --git a/drivers/net/wireless/marvell/mwifiex/pcie.c b/drivers/net/wireless/marvell/mwifiex/pcie.c
index 7538543d46fa..0c42b7296ddd 100644
--- a/drivers/net/wireless/marvell/mwifiex/pcie.c
+++ b/drivers/net/wireless/marvell/mwifiex/pcie.c
@@ -320,6 +320,19 @@ static void mwifiex_pcie_shutdown(struct pci_dev *pdev)
 	return;
 }
 
+static void mwifiex_pcie_coredump(struct device *dev)
+{
+	struct pci_dev *pdev;
+	struct pcie_service_card *card;
+
+	pdev = container_of(dev, struct pci_dev, dev);
+	card = pci_get_drvdata(pdev);
+
+	if (!test_and_set_bit(MWIFIEX_IFACE_WORK_DEVICE_DUMP,
+			      &card->work_flags))
+		schedule_work(&card->work);
+}
+
 static const struct pci_device_id mwifiex_ids[] = {
 	{
 		PCIE_VENDOR_ID_MARVELL, PCIE_DEVICE_ID_MARVELL_88W8766P,
@@ -415,11 +428,12 @@ static struct pci_driver __refdata mwifiex_pcie = {
 	.id_table = mwifiex_ids,
 	.probe    = mwifiex_pcie_probe,
 	.remove   = mwifiex_pcie_remove,
-#ifdef CONFIG_PM_SLEEP
 	.driver   = {
+		.coredump = mwifiex_pcie_coredump,
+#ifdef CONFIG_PM_SLEEP
 		.pm = &mwifiex_pcie_pm_ops,
-	},
 #endif
+	},
 	.shutdown = mwifiex_pcie_shutdown,
 	.err_handler = &mwifiex_pcie_err_handler,
 };
diff --git a/drivers/net/wireless/marvell/mwifiex/sdio.c b/drivers/net/wireless/marvell/mwifiex/sdio.c
index a82880132af4..47d2dcc3f28f 100644
--- a/drivers/net/wireless/marvell/mwifiex/sdio.c
+++ b/drivers/net/wireless/marvell/mwifiex/sdio.c
@@ -466,6 +466,17 @@ static int mwifiex_sdio_suspend(struct device *dev)
 	return ret;
 }
 
+static void mwifiex_sdio_coredump(struct device *dev)
+{
+	struct sdio_func *func = dev_to_sdio_func(dev);
+	struct sdio_mmc_card *card;
+
+	card = sdio_get_drvdata(func);
+	if (!test_and_set_bit(MWIFIEX_IFACE_WORK_DEVICE_DUMP,
+			      &card->work_flags))
+		schedule_work(&card->work);
+}
+
 /* Device ID for SD8786 */
 #define SDIO_DEVICE_ID_MARVELL_8786   (0x9116)
 /* Device ID for SD8787 */
@@ -515,6 +526,7 @@ static struct sdio_driver mwifiex_sdio = {
 	.remove = mwifiex_sdio_remove,
 	.drv = {
 		.owner = THIS_MODULE,
+		.coredump = mwifiex_sdio_coredump,
 		.pm = &mwifiex_sdio_pm_ops,
 	}
 };
diff --git a/drivers/net/wireless/marvell/mwifiex/usb.c b/drivers/net/wireless/marvell/mwifiex/usb.c
index 4bc244801636..7aa39878ce49 100644
--- a/drivers/net/wireless/marvell/mwifiex/usb.c
+++ b/drivers/net/wireless/marvell/mwifiex/usb.c
@@ -653,6 +653,16 @@ static void mwifiex_usb_disconnect(struct usb_interface *intf)
 	usb_put_dev(interface_to_usbdev(intf));
 }
 
+static void mwifiex_usb_coredump(struct device *dev)
+{
+	struct usb_interface *intf = to_usb_interface(dev);
+	struct usb_card_rec *card = usb_get_intfdata(intf);
+
+	mwifiex_send_cmd(mwifiex_get_priv(card->adapter, MWIFIEX_BSS_ROLE_ANY),
+			 HostCmd_CMD_FW_DUMP_EVENT, HostCmd_ACT_GEN_SET, 0,
+			 NULL, true);
+}
+
 static struct usb_driver mwifiex_usb_driver = {
 	.name = "mwifiex_usb",
 	.probe = mwifiex_usb_probe,
@@ -661,6 +671,9 @@ static struct usb_driver mwifiex_usb_driver = {
 	.suspend = mwifiex_usb_suspend,
 	.resume = mwifiex_usb_resume,
 	.soft_unbind = 1,
+	.drvwrap.driver = {
+		.coredump = mwifiex_usb_coredump,
+	},
 };
 
 static int mwifiex_write_data_sync(struct mwifiex_adapter *adapter, u8 *pbuf,
-- 
cgit v1.2.3-59-g8ed1b


From d2af9b566554e01f9ad67b330ce569dbc130e5d3 Mon Sep 17 00:00:00 2001
From: Franky Lin <franky.lin@broadcom.com>
Date: Wed, 16 May 2018 14:12:01 +0200
Subject: brcmfmac: validate user provided data for memdump before copying

In patch "brcmfmac: add support for sysfs initiated coredump", a new
scenario of brcmf_debug_create_memdump was added in which the user of
the function might not necessarily provide prefix data. Hence the
function should not assume the data is always valid and should perform a
check before copying.

Reviewed-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Franky Lin <franky.lin@broadcom.com>
Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/debug.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/debug.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/debug.c
index 504832084eca..489b5dfdf5b9 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/debug.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/debug.c
@@ -40,7 +40,8 @@ int brcmf_debug_create_memdump(struct brcmf_bus *bus, const void *data,
 	if (!dump)
 		return -ENOMEM;
 
-	memcpy(dump, data, len);
+	if (data && len > 0)
+		memcpy(dump, data, len);
 	err = brcmf_bus_get_memdump(bus, dump + len, ramsize);
 	if (err) {
 		vfree(dump);
-- 
cgit v1.2.3-59-g8ed1b


From 8a3ab2f38f1669e3be6433a1f6b82a077b38c4c7 Mon Sep 17 00:00:00 2001
From: Franky Lin <franky.lin@broadcom.com>
Date: Wed, 16 May 2018 14:12:02 +0200
Subject: brcmfmac: trigger memory dump upon firmware halt signal

PCIe dongle firmware signals a halt/trap through mailbox interrupt.
Trigger a memory dump upon receiving such signal could help to provide
useful information for issue debug.

Reviewed-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Franky Lin <franky.lin@broadcom.com>
Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
index 5baa8372371d..45928b5b8d97 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
@@ -182,6 +182,7 @@ static const struct brcmf_firmware_mapping brcmf_pcie_fwnames[] = {
 #define BRCMF_D2H_DEV_D3_ACK			0x00000001
 #define BRCMF_D2H_DEV_DS_ENTER_REQ		0x00000002
 #define BRCMF_D2H_DEV_DS_EXIT_NOTE		0x00000004
+#define BRCMF_D2H_DEV_FWHALT			0x10000000
 
 #define BRCMF_H2D_HOST_D3_INFORM		0x00000001
 #define BRCMF_H2D_HOST_DS_ACK			0x00000002
@@ -717,6 +718,10 @@ static void brcmf_pcie_handle_mb_data(struct brcmf_pciedev_info *devinfo)
 		devinfo->mbdata_completed = true;
 		wake_up(&devinfo->mbdata_resp_wait);
 	}
+	if (dtoh_mb_data & BRCMF_D2H_DEV_FWHALT) {
+		brcmf_dbg(PCIE, "D2H_MB_DATA: FW HALT\n");
+		brcmf_dev_coredump(&devinfo->pdev->dev);
+	}
 }
 
 
-- 
cgit v1.2.3-59-g8ed1b


From b8248236e92790ac635caeb4156e46ea2417e037 Mon Sep 17 00:00:00 2001
From: Franky Lin <franky.lin@broadcom.com>
Date: Wed, 16 May 2018 14:12:03 +0200
Subject: brcmfmac: trigger memory dump on SDIO firmware halt message

Attempt to dump dongle memory for debug upon receiving firmware halt
message through dongle to host mail box interrupt.

Reviewed-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Franky Lin <franky.lin@broadcom.com>
Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
index 412a05b9a2b2..c99a191e8d69 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
@@ -1072,8 +1072,10 @@ static u32 brcmf_sdio_hostmail(struct brcmf_sdio *bus)
 	bus->sdcnt.f1regdata += 2;
 
 	/* dongle indicates the firmware has halted/crashed */
-	if (hmb_data & HMB_DATA_FWHALT)
+	if (hmb_data & HMB_DATA_FWHALT) {
 		brcmf_err("mailbox indicates firmware halted\n");
+		brcmf_dev_coredump(&sdiod->func1->dev);
+	}
 
 	/* Dongle recomposed rx frames, accept them again */
 	if (hmb_data & HMB_DATA_NAKHANDLED) {
-- 
cgit v1.2.3-59-g8ed1b


From 449325b52b7a6208f65ed67d3484fd7b7184477b Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 21 May 2018 19:22:29 -0700
Subject: umh: introduce fork_usermode_blob() helper

Introduce helper:
int fork_usermode_blob(void *data, size_t len, struct umh_info *info);
struct umh_info {
       struct file *pipe_to_umh;
       struct file *pipe_from_umh;
       pid_t pid;
};

that GPLed kernel modules (signed or unsigned) can use it to execute part
of its own data as swappable user mode process.

The kernel will do:
- allocate a unique file in tmpfs
- populate that file with [data, data + len] bytes
- user-mode-helper code will do_execve that file and, before the process
  starts, the kernel will create two unix pipes for bidirectional
  communication between kernel module and umh
- close tmpfs file, effectively deleting it
- the fork_usermode_blob will return zero on success and populate
  'struct umh_info' with two unix pipes and the pid of the user process

As the first step in the development of the bpfilter project
the fork_usermode_blob() helper is introduced to allow user mode code
to be invoked from a kernel module. The idea is that user mode code plus
normal kernel module code are built as part of the kernel build
and installed as traditional kernel module into distro specified location,
such that from a distribution point of view, there is
no difference between regular kernel modules and kernel modules + umh code.
Such modules can be signed, modprobed, rmmod, etc. The use of this new helper
by a kernel module doesn't make it any special from kernel and user space
tooling point of view.

Such approach enables kernel to delegate functionality traditionally done
by the kernel modules into the user space processes (either root or !root) and
reduces security attack surface of the new code. The buggy umh code would crash
the user process, but not the kernel. Another advantage is that umh code
of the kernel module can be debugged and tested out of user space
(e.g. opening the possibility to run clang sanitizers, fuzzers or
user space test suites on the umh code).
In case of the bpfilter project such architecture allows complex control plane
to be done in the user space while bpf based data plane stays in the kernel.

Since umh can crash, can be oom-ed by the kernel, killed by the admin,
the kernel module that uses them (like bpfilter) needs to manage life
time of umh on its own via two unix pipes and the pid of umh.

The exit code of such kernel module should kill the umh it started,
so that rmmod of the kernel module will cleanup the corresponding umh.
Just like if the kernel module does kmalloc() it should kfree() it
in the exit code.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/exec.c               |  38 +++++++++++----
 include/linux/binfmts.h |   1 +
 include/linux/umh.h     |  12 +++++
 kernel/umh.c            | 125 ++++++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 164 insertions(+), 12 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 183059c427b9..30a36c2a39bf 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1706,14 +1706,13 @@ static int exec_binprm(struct linux_binprm *bprm)
 /*
  * sys_execve() executes a new program.
  */
-static int do_execveat_common(int fd, struct filename *filename,
-			      struct user_arg_ptr argv,
-			      struct user_arg_ptr envp,
-			      int flags)
+static int __do_execve_file(int fd, struct filename *filename,
+			    struct user_arg_ptr argv,
+			    struct user_arg_ptr envp,
+			    int flags, struct file *file)
 {
 	char *pathbuf = NULL;
 	struct linux_binprm *bprm;
-	struct file *file;
 	struct files_struct *displaced;
 	int retval;
 
@@ -1752,7 +1751,8 @@ static int do_execveat_common(int fd, struct filename *filename,
 	check_unsafe_exec(bprm);
 	current->in_execve = 1;
 
-	file = do_open_execat(fd, filename, flags);
+	if (!file)
+		file = do_open_execat(fd, filename, flags);
 	retval = PTR_ERR(file);
 	if (IS_ERR(file))
 		goto out_unmark;
@@ -1760,7 +1760,9 @@ static int do_execveat_common(int fd, struct filename *filename,
 	sched_exec();
 
 	bprm->file = file;
-	if (fd == AT_FDCWD || filename->name[0] == '/') {
+	if (!filename) {
+		bprm->filename = "none";
+	} else if (fd == AT_FDCWD || filename->name[0] == '/') {
 		bprm->filename = filename->name;
 	} else {
 		if (filename->name[0] == '\0')
@@ -1826,7 +1828,8 @@ static int do_execveat_common(int fd, struct filename *filename,
 	task_numa_free(current);
 	free_bprm(bprm);
 	kfree(pathbuf);
-	putname(filename);
+	if (filename)
+		putname(filename);
 	if (displaced)
 		put_files_struct(displaced);
 	return retval;
@@ -1849,10 +1852,27 @@ out_files:
 	if (displaced)
 		reset_files_struct(displaced);
 out_ret:
-	putname(filename);
+	if (filename)
+		putname(filename);
 	return retval;
 }
 
+static int do_execveat_common(int fd, struct filename *filename,
+			      struct user_arg_ptr argv,
+			      struct user_arg_ptr envp,
+			      int flags)
+{
+	return __do_execve_file(fd, filename, argv, envp, flags, NULL);
+}
+
+int do_execve_file(struct file *file, void *__argv, void *__envp)
+{
+	struct user_arg_ptr argv = { .ptr.native = __argv };
+	struct user_arg_ptr envp = { .ptr.native = __envp };
+
+	return __do_execve_file(AT_FDCWD, NULL, argv, envp, 0, file);
+}
+
 int do_execve(struct filename *filename,
 	const char __user *const __user *__argv,
 	const char __user *const __user *__envp)
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 4955e0863b83..c05f24fac4f6 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -150,5 +150,6 @@ extern int do_execveat(int, struct filename *,
 		       const char __user * const __user *,
 		       const char __user * const __user *,
 		       int);
+int do_execve_file(struct file *file, void *__argv, void *__envp);
 
 #endif /* _LINUX_BINFMTS_H */
diff --git a/include/linux/umh.h b/include/linux/umh.h
index 244aff638220..5c812acbb80a 100644
--- a/include/linux/umh.h
+++ b/include/linux/umh.h
@@ -22,8 +22,10 @@ struct subprocess_info {
 	const char *path;
 	char **argv;
 	char **envp;
+	struct file *file;
 	int wait;
 	int retval;
+	pid_t pid;
 	int (*init)(struct subprocess_info *info, struct cred *new);
 	void (*cleanup)(struct subprocess_info *info);
 	void *data;
@@ -38,6 +40,16 @@ call_usermodehelper_setup(const char *path, char **argv, char **envp,
 			  int (*init)(struct subprocess_info *info, struct cred *new),
 			  void (*cleanup)(struct subprocess_info *), void *data);
 
+struct subprocess_info *call_usermodehelper_setup_file(struct file *file,
+			  int (*init)(struct subprocess_info *info, struct cred *new),
+			  void (*cleanup)(struct subprocess_info *), void *data);
+struct umh_info {
+	struct file *pipe_to_umh;
+	struct file *pipe_from_umh;
+	pid_t pid;
+};
+int fork_usermode_blob(void *data, size_t len, struct umh_info *info);
+
 extern int
 call_usermodehelper_exec(struct subprocess_info *info, int wait);
 
diff --git a/kernel/umh.c b/kernel/umh.c
index f76b3ff876cf..30db93fd7e39 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -25,6 +25,8 @@
 #include <linux/ptrace.h>
 #include <linux/async.h>
 #include <linux/uaccess.h>
+#include <linux/shmem_fs.h>
+#include <linux/pipe_fs_i.h>
 
 #include <trace/events/module.h>
 
@@ -97,9 +99,13 @@ static int call_usermodehelper_exec_async(void *data)
 
 	commit_creds(new);
 
-	retval = do_execve(getname_kernel(sub_info->path),
-			   (const char __user *const __user *)sub_info->argv,
-			   (const char __user *const __user *)sub_info->envp);
+	if (sub_info->file)
+		retval = do_execve_file(sub_info->file,
+					sub_info->argv, sub_info->envp);
+	else
+		retval = do_execve(getname_kernel(sub_info->path),
+				   (const char __user *const __user *)sub_info->argv,
+				   (const char __user *const __user *)sub_info->envp);
 out:
 	sub_info->retval = retval;
 	/*
@@ -185,6 +191,8 @@ static void call_usermodehelper_exec_work(struct work_struct *work)
 		if (pid < 0) {
 			sub_info->retval = pid;
 			umh_complete(sub_info);
+		} else {
+			sub_info->pid = pid;
 		}
 	}
 }
@@ -393,6 +401,117 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
 }
 EXPORT_SYMBOL(call_usermodehelper_setup);
 
+struct subprocess_info *call_usermodehelper_setup_file(struct file *file,
+		int (*init)(struct subprocess_info *info, struct cred *new),
+		void (*cleanup)(struct subprocess_info *info), void *data)
+{
+	struct subprocess_info *sub_info;
+
+	sub_info = kzalloc(sizeof(struct subprocess_info), GFP_KERNEL);
+	if (!sub_info)
+		return NULL;
+
+	INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
+	sub_info->path = "none";
+	sub_info->file = file;
+	sub_info->init = init;
+	sub_info->cleanup = cleanup;
+	sub_info->data = data;
+	return sub_info;
+}
+
+static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
+{
+	struct umh_info *umh_info = info->data;
+	struct file *from_umh[2];
+	struct file *to_umh[2];
+	int err;
+
+	/* create pipe to send data to umh */
+	err = create_pipe_files(to_umh, 0);
+	if (err)
+		return err;
+	err = replace_fd(0, to_umh[0], 0);
+	fput(to_umh[0]);
+	if (err < 0) {
+		fput(to_umh[1]);
+		return err;
+	}
+
+	/* create pipe to receive data from umh */
+	err = create_pipe_files(from_umh, 0);
+	if (err) {
+		fput(to_umh[1]);
+		replace_fd(0, NULL, 0);
+		return err;
+	}
+	err = replace_fd(1, from_umh[1], 0);
+	fput(from_umh[1]);
+	if (err < 0) {
+		fput(to_umh[1]);
+		replace_fd(0, NULL, 0);
+		fput(from_umh[0]);
+		return err;
+	}
+
+	umh_info->pipe_to_umh = to_umh[1];
+	umh_info->pipe_from_umh = from_umh[0];
+	return 0;
+}
+
+static void umh_save_pid(struct subprocess_info *info)
+{
+	struct umh_info *umh_info = info->data;
+
+	umh_info->pid = info->pid;
+}
+
+/**
+ * fork_usermode_blob - fork a blob of bytes as a usermode process
+ * @data: a blob of bytes that can be do_execv-ed as a file
+ * @len: length of the blob
+ * @info: information about usermode process (shouldn't be NULL)
+ *
+ * Returns either negative error or zero which indicates success
+ * in executing a blob of bytes as a usermode process. In such
+ * case 'struct umh_info *info' is populated with two pipes
+ * and a pid of the process. The caller is responsible for health
+ * check of the user process, killing it via pid, and closing the
+ * pipes when user process is no longer needed.
+ */
+int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
+{
+	struct subprocess_info *sub_info;
+	struct file *file;
+	ssize_t written;
+	loff_t pos = 0;
+	int err;
+
+	file = shmem_kernel_file_setup("", len, 0);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	written = kernel_write(file, data, len, &pos);
+	if (written != len) {
+		err = written;
+		if (err >= 0)
+			err = -ENOMEM;
+		goto out;
+	}
+
+	err = -ENOMEM;
+	sub_info = call_usermodehelper_setup_file(file, umh_pipe_setup,
+						  umh_save_pid, info);
+	if (!sub_info)
+		goto out;
+
+	err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
+out:
+	fput(file);
+	return err;
+}
+EXPORT_SYMBOL_GPL(fork_usermode_blob);
+
 /**
  * call_usermodehelper_exec - start a usermode application
  * @sub_info: information about the subprocessa
-- 
cgit v1.2.3-59-g8ed1b


From d2ba09c17a0647f899d6c20a11bab9e6d3382f07 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 21 May 2018 19:22:30 -0700
Subject: net: add skeleton of bpfilter kernel module

bpfilter.ko consists of bpfilter_kern.c (normal kernel module code)
and user mode helper code that is embedded into bpfilter.ko

The steps to build bpfilter.ko are the following:
- main.c is compiled by HOSTCC into the bpfilter_umh elf executable file
- with quite a bit of objcopy and Makefile magic the bpfilter_umh elf file
  is converted into bpfilter_umh.o object file
  with _binary_net_bpfilter_bpfilter_umh_start and _end symbols
  Example:
  $ nm ./bld_x64/net/bpfilter/bpfilter_umh.o
  0000000000004cf8 T _binary_net_bpfilter_bpfilter_umh_end
  0000000000004cf8 A _binary_net_bpfilter_bpfilter_umh_size
  0000000000000000 T _binary_net_bpfilter_bpfilter_umh_start
- bpfilter_umh.o and bpfilter_kern.o are linked together into bpfilter.ko

bpfilter_kern.c is a normal kernel module code that calls
the fork_usermode_blob() helper to execute part of its own data
as a user mode process.

Notice that _binary_net_bpfilter_bpfilter_umh_start - end
is placed into .init.rodata section, so it's freed as soon as __init
function of bpfilter.ko is finished.
As part of __init the bpfilter.ko does first request/reply action
via two unix pipe provided by fork_usermode_blob() helper to
make sure that umh is healthy. If not it will kill it via pid.

Later bpfilter_process_sockopt() will be called from bpfilter hooks
in get/setsockopt() to pass iptable commands into umh via bpfilter.ko

If admin does 'rmmod bpfilter' the __exit code bpfilter.ko will
kill umh as well.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpfilter.h      |  15 ++++++
 include/uapi/linux/bpfilter.h |  21 ++++++++
 net/Kconfig                   |   2 +
 net/Makefile                  |   1 +
 net/bpfilter/Kconfig          |  16 ++++++
 net/bpfilter/Makefile         |  30 ++++++++++++
 net/bpfilter/bpfilter_kern.c  | 111 ++++++++++++++++++++++++++++++++++++++++++
 net/bpfilter/main.c           |  63 ++++++++++++++++++++++++
 net/bpfilter/msgfmt.h         |  17 +++++++
 net/ipv4/Makefile             |   2 +
 net/ipv4/bpfilter/Makefile    |   2 +
 net/ipv4/bpfilter/sockopt.c   |  42 ++++++++++++++++
 net/ipv4/ip_sockglue.c        |  17 +++++++
 13 files changed, 339 insertions(+)
 create mode 100644 include/linux/bpfilter.h
 create mode 100644 include/uapi/linux/bpfilter.h
 create mode 100644 net/bpfilter/Kconfig
 create mode 100644 net/bpfilter/Makefile
 create mode 100644 net/bpfilter/bpfilter_kern.c
 create mode 100644 net/bpfilter/main.c
 create mode 100644 net/bpfilter/msgfmt.h
 create mode 100644 net/ipv4/bpfilter/Makefile
 create mode 100644 net/ipv4/bpfilter/sockopt.c

diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h
new file mode 100644
index 000000000000..687b1760bb9f
--- /dev/null
+++ b/include/linux/bpfilter.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_BPFILTER_H
+#define _LINUX_BPFILTER_H
+
+#include <uapi/linux/bpfilter.h>
+
+struct sock;
+int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char *optval,
+			    unsigned int optlen);
+int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char *optval,
+			    int *optlen);
+extern int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
+				       char __user *optval,
+				       unsigned int optlen, bool is_set);
+#endif
diff --git a/include/uapi/linux/bpfilter.h b/include/uapi/linux/bpfilter.h
new file mode 100644
index 000000000000..2ec3cc99ea4c
--- /dev/null
+++ b/include/uapi/linux/bpfilter.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _UAPI_LINUX_BPFILTER_H
+#define _UAPI_LINUX_BPFILTER_H
+
+#include <linux/if.h>
+
+enum {
+	BPFILTER_IPT_SO_SET_REPLACE = 64,
+	BPFILTER_IPT_SO_SET_ADD_COUNTERS = 65,
+	BPFILTER_IPT_SET_MAX,
+};
+
+enum {
+	BPFILTER_IPT_SO_GET_INFO = 64,
+	BPFILTER_IPT_SO_GET_ENTRIES = 65,
+	BPFILTER_IPT_SO_GET_REVISION_MATCH = 66,
+	BPFILTER_IPT_SO_GET_REVISION_TARGET = 67,
+	BPFILTER_IPT_GET_MAX,
+};
+
+#endif /* _UAPI_LINUX_BPFILTER_H */
diff --git a/net/Kconfig b/net/Kconfig
index df8d45ef47d8..ba554cedb615 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -202,6 +202,8 @@ source "net/bridge/netfilter/Kconfig"
 
 endif
 
+source "net/bpfilter/Kconfig"
+
 source "net/dccp/Kconfig"
 source "net/sctp/Kconfig"
 source "net/rds/Kconfig"
diff --git a/net/Makefile b/net/Makefile
index 77aaddedbd29..bdaf53925acd 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_TLS)		+= tls/
 obj-$(CONFIG_XFRM)		+= xfrm/
 obj-$(CONFIG_UNIX)		+= unix/
 obj-$(CONFIG_NET)		+= ipv6/
+obj-$(CONFIG_BPFILTER)		+= bpfilter/
 obj-$(CONFIG_PACKET)		+= packet/
 obj-$(CONFIG_NET_KEY)		+= key/
 obj-$(CONFIG_BRIDGE)		+= bridge/
diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig
new file mode 100644
index 000000000000..60725c5f79db
--- /dev/null
+++ b/net/bpfilter/Kconfig
@@ -0,0 +1,16 @@
+menuconfig BPFILTER
+	bool "BPF based packet filtering framework (BPFILTER)"
+	default n
+	depends on NET && BPF
+	help
+	  This builds experimental bpfilter framework that is aiming to
+	  provide netfilter compatible functionality via BPF
+
+if BPFILTER
+config BPFILTER_UMH
+	tristate "bpfilter kernel module with user mode helper"
+	default m
+	help
+	  This builds bpfilter kernel module with embedded user mode helper
+endif
+
diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile
new file mode 100644
index 000000000000..2af752c8ef5e
--- /dev/null
+++ b/net/bpfilter/Makefile
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the Linux BPFILTER layer.
+#
+
+hostprogs-y := bpfilter_umh
+bpfilter_umh-objs := main.o
+HOSTCFLAGS += -I. -Itools/include/
+ifeq ($(CONFIG_BPFILTER_UMH), y)
+# builtin bpfilter_umh should be compiled with -static
+# since rootfs isn't mounted at the time of __init
+# function is called and do_execv won't find elf interpreter
+HOSTLDFLAGS += -static
+endif
+
+# a bit of elf magic to convert bpfilter_umh binary into a binary blob
+# inside bpfilter_umh.o elf file referenced by
+# _binary_net_bpfilter_bpfilter_umh_start symbol
+# which bpfilter_kern.c passes further into umh blob loader at run-time
+quiet_cmd_copy_umh = GEN $@
+      cmd_copy_umh = echo ':' > $(obj)/.bpfilter_umh.o.cmd; \
+      $(OBJCOPY) -I binary -O $(CONFIG_OUTPUT_FORMAT) \
+      -B `$(OBJDUMP) -f $<|grep architecture|cut -d, -f1|cut -d' ' -f2` \
+      --rename-section .data=.init.rodata $< $@
+
+$(obj)/bpfilter_umh.o: $(obj)/bpfilter_umh
+	$(call cmd,copy_umh)
+
+obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o
+bpfilter-objs += bpfilter_kern.o bpfilter_umh.o
diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
new file mode 100644
index 000000000000..7596314b61c7
--- /dev/null
+++ b/net/bpfilter/bpfilter_kern.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/umh.h>
+#include <linux/bpfilter.h>
+#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include "msgfmt.h"
+
+#define UMH_start _binary_net_bpfilter_bpfilter_umh_start
+#define UMH_end _binary_net_bpfilter_bpfilter_umh_end
+
+extern char UMH_start;
+extern char UMH_end;
+
+static struct umh_info info;
+/* since ip_getsockopt() can run in parallel, serialize access to umh */
+static DEFINE_MUTEX(bpfilter_lock);
+
+static void shutdown_umh(struct umh_info *info)
+{
+	struct task_struct *tsk;
+
+	tsk = pid_task(find_vpid(info->pid), PIDTYPE_PID);
+	if (tsk)
+		force_sig(SIGKILL, tsk);
+	fput(info->pipe_to_umh);
+	fput(info->pipe_from_umh);
+}
+
+static void __stop_umh(void)
+{
+	if (bpfilter_process_sockopt) {
+		bpfilter_process_sockopt = NULL;
+		shutdown_umh(&info);
+	}
+}
+
+static void stop_umh(void)
+{
+	mutex_lock(&bpfilter_lock);
+	__stop_umh();
+	mutex_unlock(&bpfilter_lock);
+}
+
+static int __bpfilter_process_sockopt(struct sock *sk, int optname,
+				      char __user *optval,
+				      unsigned int optlen, bool is_set)
+{
+	struct mbox_request req;
+	struct mbox_reply reply;
+	loff_t pos;
+	ssize_t n;
+	int ret;
+
+	req.is_set = is_set;
+	req.pid = current->pid;
+	req.cmd = optname;
+	req.addr = (long)optval;
+	req.len = optlen;
+	mutex_lock(&bpfilter_lock);
+	n = __kernel_write(info.pipe_to_umh, &req, sizeof(req), &pos);
+	if (n != sizeof(req)) {
+		pr_err("write fail %zd\n", n);
+		__stop_umh();
+		ret = -EFAULT;
+		goto out;
+	}
+	pos = 0;
+	n = kernel_read(info.pipe_from_umh, &reply, sizeof(reply), &pos);
+	if (n != sizeof(reply)) {
+		pr_err("read fail %zd\n", n);
+		__stop_umh();
+		ret = -EFAULT;
+		goto out;
+	}
+	ret = reply.status;
+out:
+	mutex_unlock(&bpfilter_lock);
+	return ret;
+}
+
+static int __init load_umh(void)
+{
+	int err;
+
+	/* fork usermode process */
+	err = fork_usermode_blob(&UMH_start, &UMH_end - &UMH_start, &info);
+	if (err)
+		return err;
+	pr_info("Loaded bpfilter_umh pid %d\n", info.pid);
+
+	/* health check that usermode process started correctly */
+	if (__bpfilter_process_sockopt(NULL, 0, 0, 0, 0) != 0) {
+		stop_umh();
+		return -EFAULT;
+	}
+	bpfilter_process_sockopt = &__bpfilter_process_sockopt;
+	return 0;
+}
+
+static void __exit fini_umh(void)
+{
+	stop_umh();
+}
+module_init(load_umh);
+module_exit(fini_umh);
+MODULE_LICENSE("GPL");
diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c
new file mode 100644
index 000000000000..81bbc1684896
--- /dev/null
+++ b/net/bpfilter/main.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <sys/uio.h>
+#include <errno.h>
+#include <stdio.h>
+#include <sys/socket.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "include/uapi/linux/bpf.h"
+#include <asm/unistd.h>
+#include "msgfmt.h"
+
+int debug_fd;
+
+static int handle_get_cmd(struct mbox_request *cmd)
+{
+	switch (cmd->cmd) {
+	case 0:
+		return 0;
+	default:
+		break;
+	}
+	return -ENOPROTOOPT;
+}
+
+static int handle_set_cmd(struct mbox_request *cmd)
+{
+	return -ENOPROTOOPT;
+}
+
+static void loop(void)
+{
+	while (1) {
+		struct mbox_request req;
+		struct mbox_reply reply;
+		int n;
+
+		n = read(0, &req, sizeof(req));
+		if (n != sizeof(req)) {
+			dprintf(debug_fd, "invalid request %d\n", n);
+			return;
+		}
+
+		reply.status = req.is_set ?
+			handle_set_cmd(&req) :
+			handle_get_cmd(&req);
+
+		n = write(1, &reply, sizeof(reply));
+		if (n != sizeof(reply)) {
+			dprintf(debug_fd, "reply failed %d\n", n);
+			return;
+		}
+	}
+}
+
+int main(void)
+{
+	debug_fd = open("/dev/console", 00000002 | 00000100);
+	dprintf(debug_fd, "Started bpfilter\n");
+	loop();
+	close(debug_fd);
+	return 0;
+}
diff --git a/net/bpfilter/msgfmt.h b/net/bpfilter/msgfmt.h
new file mode 100644
index 000000000000..98d121c62945
--- /dev/null
+++ b/net/bpfilter/msgfmt.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _NET_BPFILTER_MSGFMT_H
+#define _NET_BPFILTER_MSGFMT_H
+
+struct mbox_request {
+	__u64 addr;
+	__u32 len;
+	__u32 is_set;
+	__u32 cmd;
+	__u32 pid;
+};
+
+struct mbox_reply {
+	__u32 status;
+};
+
+#endif
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index b379520f9133..7018f91c5a39 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -16,6 +16,8 @@ obj-y     := route.o inetpeer.o protocol.o \
 	     inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
 	     metrics.o
 
+obj-$(CONFIG_BPFILTER) += bpfilter/
+
 obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
 obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
 obj-$(CONFIG_PROC_FS) += proc.o
diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile
new file mode 100644
index 000000000000..ce262d76cc48
--- /dev/null
+++ b/net/ipv4/bpfilter/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_BPFILTER) += sockopt.o
+
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
new file mode 100644
index 000000000000..42a96d2d8d05
--- /dev/null
+++ b/net/ipv4/bpfilter/sockopt.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/uaccess.h>
+#include <linux/bpfilter.h>
+#include <uapi/linux/bpf.h>
+#include <linux/wait.h>
+#include <linux/kmod.h>
+
+int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
+				char __user *optval,
+				unsigned int optlen, bool is_set);
+EXPORT_SYMBOL_GPL(bpfilter_process_sockopt);
+
+int bpfilter_mbox_request(struct sock *sk, int optname, char __user *optval,
+			  unsigned int optlen, bool is_set)
+{
+	if (!bpfilter_process_sockopt) {
+		int err = request_module("bpfilter");
+
+		if (err)
+			return err;
+		if (!bpfilter_process_sockopt)
+			return -ECHILD;
+	}
+	return bpfilter_process_sockopt(sk, optname, optval, optlen, is_set);
+}
+
+int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
+			    unsigned int optlen)
+{
+	return bpfilter_mbox_request(sk, optname, optval, optlen, true);
+}
+
+int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
+			    int __user *optlen)
+{
+	int len;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	return bpfilter_mbox_request(sk, optname, optval, len, false);
+}
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 5ad2d8ed3a3f..e0791faacb24 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -47,6 +47,8 @@
 #include <linux/errqueue.h>
 #include <linux/uaccess.h>
 
+#include <linux/bpfilter.h>
+
 /*
  *	SOL_IP control messages.
  */
@@ -1244,6 +1246,11 @@ int ip_setsockopt(struct sock *sk, int level,
 		return -ENOPROTOOPT;
 
 	err = do_ip_setsockopt(sk, level, optname, optval, optlen);
+#ifdef CONFIG_BPFILTER
+	if (optname >= BPFILTER_IPT_SO_SET_REPLACE &&
+	    optname < BPFILTER_IPT_SET_MAX)
+		err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen);
+#endif
 #ifdef CONFIG_NETFILTER
 	/* we need to exclude all possible ENOPROTOOPTs except default case */
 	if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
@@ -1552,6 +1559,11 @@ int ip_getsockopt(struct sock *sk, int level,
 	int err;
 
 	err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0);
+#ifdef CONFIG_BPFILTER
+	if (optname >= BPFILTER_IPT_SO_GET_INFO &&
+	    optname < BPFILTER_IPT_GET_MAX)
+		err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
+#endif
 #ifdef CONFIG_NETFILTER
 	/* we need to exclude all possible ENOPROTOOPTs except default case */
 	if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
@@ -1584,6 +1596,11 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname,
 	err = do_ip_getsockopt(sk, level, optname, optval, optlen,
 		MSG_CMSG_COMPAT);
 
+#ifdef CONFIG_BPFILTER
+	if (optname >= BPFILTER_IPT_SO_GET_INFO &&
+	    optname < BPFILTER_IPT_GET_MAX)
+		err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
+#endif
 #ifdef CONFIG_NETFILTER
 	/* we need to exclude all possible ENOPROTOOPTs except default case */
 	if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
-- 
cgit v1.2.3-59-g8ed1b


From a1c818109cc8126cf20523692ca29f1eae3172eb Mon Sep 17 00:00:00 2001
From: Sirio Balmelli <sirio@b-ad.ch>
Date: Wed, 23 May 2018 18:17:07 +0200
Subject: tools/lib/libbpf.c: fix string format to allow build on arm32
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On arm32, 'cd tools/testing/selftests/bpf && make' fails with:

libbpf.c:80:10: error: format ‘%ld’ expects argument of type ‘long int’, but argument 4 has type ‘int64_t {aka long long int}’ [-Werror=format=]
   (func)("libbpf: " fmt, ##__VA_ARGS__); \
          ^
libbpf.c:83:30: note: in expansion of macro ‘__pr’
 #define pr_warning(fmt, ...) __pr(__pr_warning, fmt, ##__VA_ARGS__)
                              ^~~~
libbpf.c:1072:3: note: in expansion of macro ‘pr_warning’
   pr_warning("map:%s value_type:%s has BTF type_size:%ld != value_size:%u\n",

To fix, typecast 'key_size' and amend format string.

Signed-off-by: Sirio Balmelli <sirio@b-ad.ch>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/libbpf.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 8f1707dbfcfa..e5cd4a958846 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1042,8 +1042,8 @@ static int bpf_map_find_btf_info(struct bpf_map *map, const struct btf *btf)
 	}
 
 	if (def->key_size != key_size) {
-		pr_warning("map:%s key_type:%s has BTF type_size:%ld != key_size:%u\n",
-			   map->name, name, key_size, def->key_size);
+		pr_warning("map:%s key_type:%s has BTF type_size:%u != key_size:%u\n",
+			   map->name, name, (unsigned int)key_size, def->key_size);
 		return -EINVAL;
 	}
 
@@ -1069,8 +1069,8 @@ static int bpf_map_find_btf_info(struct bpf_map *map, const struct btf *btf)
 	}
 
 	if (def->value_size != value_size) {
-		pr_warning("map:%s value_type:%s has BTF type_size:%ld != value_size:%u\n",
-			   map->name, name, value_size, def->value_size);
+		pr_warning("map:%s value_type:%s has BTF type_size:%u != value_size:%u\n",
+			   map->name, name, (unsigned int)value_size, def->value_size);
 		return -EINVAL;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 8f6196f63c46982c095e485a9c73c683d9900a2e Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 21 May 2018 22:12:43 -0700
Subject: nfp: move rtsym helpers to pf code

nfp_net_pf_rtsym_read_optional() and nfp_net_pf_map_rtsym() are not
really related to networking code.  Move them to the PF code and
remove the net from their names.  They will soon be needed by code
outside of nfp_net_main.c anyway.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_main.c     | 32 ++++++++++++
 drivers/net/ethernet/netronome/nfp/nfp_main.h     |  6 +++
 drivers/net/ethernet/netronome/nfp/nfp_net_main.c | 61 +++++------------------
 3 files changed, 51 insertions(+), 48 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.c b/drivers/net/ethernet/netronome/nfp/nfp_main.c
index 0ade122805ad..1ef3623c6e1c 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_main.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_main.c
@@ -75,6 +75,38 @@ static const struct pci_device_id nfp_pci_device_ids[] = {
 };
 MODULE_DEVICE_TABLE(pci, nfp_pci_device_ids);
 
+int nfp_pf_rtsym_read_optional(struct nfp_pf *pf, const char *format,
+			       unsigned int default_val)
+{
+	char name[256];
+	int err = 0;
+	u64 val;
+
+	snprintf(name, sizeof(name), format, nfp_cppcore_pcie_unit(pf->cpp));
+
+	val = nfp_rtsym_read_le(pf->rtbl, name, &err);
+	if (err) {
+		if (err == -ENOENT)
+			return default_val;
+		nfp_err(pf->cpp, "Unable to read symbol %s\n", name);
+		return err;
+	}
+
+	return val;
+}
+
+u8 __iomem *
+nfp_pf_map_rtsym(struct nfp_pf *pf, const char *name, const char *sym_fmt,
+		 unsigned int min_size, struct nfp_cpp_area **area)
+{
+	char pf_symbol[256];
+
+	snprintf(pf_symbol, sizeof(pf_symbol), sym_fmt,
+		 nfp_cppcore_pcie_unit(pf->cpp));
+
+	return nfp_rtsym_map(pf->rtbl, pf_symbol, name, min_size, area);
+}
+
 static bool nfp_board_ready(struct nfp_pf *pf)
 {
 	const char *cp;
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.h b/drivers/net/ethernet/netronome/nfp/nfp_main.h
index 42211083b51f..5b028486c894 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_main.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_main.h
@@ -177,6 +177,12 @@ nfp_net_get_mac_addr(struct nfp_pf *pf, struct net_device *netdev,
 
 bool nfp_ctrl_tx(struct nfp_net *nn, struct sk_buff *skb);
 
+int nfp_pf_rtsym_read_optional(struct nfp_pf *pf, const char *format,
+			       unsigned int default_val);
+u8 __iomem *
+nfp_pf_map_rtsym(struct nfp_pf *pf, const char *name, const char *sym_fmt,
+		 unsigned int min_size, struct nfp_cpp_area **area);
+
 enum nfp_dump_diag {
 	NFP_DUMP_NSP_DIAG = 0,
 };
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
index 45cd2092e498..f8abb4cd9cef 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
@@ -101,48 +101,15 @@ nfp_net_find_port(struct nfp_eth_table *eth_tbl, unsigned int index)
 	return NULL;
 }
 
-static int
-nfp_net_pf_rtsym_read_optional(struct nfp_pf *pf, const char *format,
-			       unsigned int default_val)
-{
-	char name[256];
-	int err = 0;
-	u64 val;
-
-	snprintf(name, sizeof(name), format, nfp_cppcore_pcie_unit(pf->cpp));
-
-	val = nfp_rtsym_read_le(pf->rtbl, name, &err);
-	if (err) {
-		if (err == -ENOENT)
-			return default_val;
-		nfp_err(pf->cpp, "Unable to read symbol %s\n", name);
-		return err;
-	}
-
-	return val;
-}
-
 static int nfp_net_pf_get_num_ports(struct nfp_pf *pf)
 {
-	return nfp_net_pf_rtsym_read_optional(pf, "nfd_cfg_pf%u_num_ports", 1);
+	return nfp_pf_rtsym_read_optional(pf, "nfd_cfg_pf%u_num_ports", 1);
 }
 
 static int nfp_net_pf_get_app_id(struct nfp_pf *pf)
 {
-	return nfp_net_pf_rtsym_read_optional(pf, "_pf%u_net_app_id",
-					      NFP_APP_CORE_NIC);
-}
-
-static u8 __iomem *
-nfp_net_pf_map_rtsym(struct nfp_pf *pf, const char *name, const char *sym_fmt,
-		     unsigned int min_size, struct nfp_cpp_area **area)
-{
-	char pf_symbol[256];
-
-	snprintf(pf_symbol, sizeof(pf_symbol), sym_fmt,
-		 nfp_cppcore_pcie_unit(pf->cpp));
-
-	return nfp_rtsym_map(pf->rtbl, pf_symbol, name, min_size, area);
+	return nfp_pf_rtsym_read_optional(pf, "_pf%u_net_app_id",
+					  NFP_APP_CORE_NIC);
 }
 
 static void nfp_net_pf_free_vnic(struct nfp_pf *pf, struct nfp_net *nn)
@@ -379,9 +346,8 @@ nfp_net_pf_app_init(struct nfp_pf *pf, u8 __iomem *qc_bar, unsigned int stride)
 	if (!nfp_app_needs_ctrl_vnic(pf->app))
 		return 0;
 
-	ctrl_bar = nfp_net_pf_map_rtsym(pf, "net.ctrl", "_pf%u_net_ctrl_bar",
-					NFP_PF_CSR_SLICE_SIZE,
-					&pf->ctrl_vnic_bar);
+	ctrl_bar = nfp_pf_map_rtsym(pf, "net.ctrl", "_pf%u_net_ctrl_bar",
+				    NFP_PF_CSR_SLICE_SIZE, &pf->ctrl_vnic_bar);
 	if (IS_ERR(ctrl_bar)) {
 		nfp_err(pf->cpp, "Failed to find ctrl vNIC memory symbol\n");
 		err = PTR_ERR(ctrl_bar);
@@ -507,8 +473,8 @@ static int nfp_net_pci_map_mem(struct nfp_pf *pf)
 	int err;
 
 	min_size = pf->max_data_vnics * NFP_PF_CSR_SLICE_SIZE;
-	mem = nfp_net_pf_map_rtsym(pf, "net.bar0", "_pf%d_net_bar0",
-				   min_size, &pf->data_vnic_bar);
+	mem = nfp_pf_map_rtsym(pf, "net.bar0", "_pf%d_net_bar0",
+			       min_size, &pf->data_vnic_bar);
 	if (IS_ERR(mem)) {
 		nfp_err(pf->cpp, "Failed to find data vNIC memory symbol\n");
 		return PTR_ERR(mem);
@@ -528,10 +494,9 @@ static int nfp_net_pci_map_mem(struct nfp_pf *pf)
 		}
 	}
 
-	pf->vf_cfg_mem = nfp_net_pf_map_rtsym(pf, "net.vfcfg",
-					      "_pf%d_net_vf_bar",
-					      NFP_NET_CFG_BAR_SZ *
-					      pf->limit_vfs, &pf->vf_cfg_bar);
+	pf->vf_cfg_mem = nfp_pf_map_rtsym(pf, "net.vfcfg", "_pf%d_net_vf_bar",
+					  NFP_NET_CFG_BAR_SZ * pf->limit_vfs,
+					  &pf->vf_cfg_bar);
 	if (IS_ERR(pf->vf_cfg_mem)) {
 		if (PTR_ERR(pf->vf_cfg_mem) != -ENOENT) {
 			err = PTR_ERR(pf->vf_cfg_mem);
@@ -541,9 +506,9 @@ static int nfp_net_pci_map_mem(struct nfp_pf *pf)
 	}
 
 	min_size = NFP_NET_VF_CFG_SZ * pf->limit_vfs + NFP_NET_VF_CFG_MB_SZ;
-	pf->vfcfg_tbl2 = nfp_net_pf_map_rtsym(pf, "net.vfcfg_tbl2",
-					      "_pf%d_net_vf_cfg2",
-					      min_size, &pf->vfcfg_tbl2_area);
+	pf->vfcfg_tbl2 = nfp_pf_map_rtsym(pf, "net.vfcfg_tbl2",
+					  "_pf%d_net_vf_cfg2",
+					  min_size, &pf->vfcfg_tbl2_area);
 	if (IS_ERR(pf->vfcfg_tbl2)) {
 		if (PTR_ERR(pf->vfcfg_tbl2) != -ENOENT) {
 			err = PTR_ERR(pf->vfcfg_tbl2);
-- 
cgit v1.2.3-59-g8ed1b


From 0c693323a1e4ba129c7ea4b9b2679eb10227a446 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 21 May 2018 22:12:44 -0700
Subject: nfp: add support for per-PCI PF mailbox

When working with devlink-related functionality for locking reasons
it's easier to create a new mailbox per-PCI PF device than try to
use one of the netdev/vNIC mailboxes.

Define new mailbox structure and resolve its symbol during probe.
For forward compatibility allow silent truncation of mailbox command
data.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_abi.h  |  59 ++++++++++++++
 drivers/net/ethernet/netronome/nfp/nfp_main.c | 108 ++++++++++++++++++++++++++
 drivers/net/ethernet/netronome/nfp/nfp_main.h |   6 ++
 3 files changed, 173 insertions(+)
 create mode 100644 drivers/net/ethernet/netronome/nfp/nfp_abi.h

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_abi.h b/drivers/net/ethernet/netronome/nfp/nfp_abi.h
new file mode 100644
index 000000000000..ce3935d6729b
--- /dev/null
+++ b/drivers/net/ethernet/netronome/nfp/nfp_abi.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) */
+/*
+ * Copyright (C) 2018 Netronome Systems, Inc.
+ *
+ * This software is dual licensed under the GNU General License Version 2,
+ * June 1991 as shown in the file COPYING in the top-level directory of this
+ * source tree or the BSD 2-Clause License provided below.  You have the
+ * option to license this software under the complete terms of either license.
+ *
+ * The BSD 2-Clause License:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      1. Redistributions of source code must retain the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer.
+ *
+ *      2. Redistributions in binary form must reproduce the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer in the documentation and/or other materials
+ *         provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __NFP_ABI__
+#define __NFP_ABI__ 1
+
+#include <linux/types.h>
+
+#define NFP_MBOX_SYM_NAME		"_abi_nfd_pf%u_mbox"
+#define NFP_MBOX_SYM_MIN_SIZE		16 /* When no data needed */
+
+#define NFP_MBOX_CMD		0x00
+#define NFP_MBOX_RET		0x04
+#define NFP_MBOX_DATA_LEN	0x08
+#define NFP_MBOX_RESERVED	0x0c
+#define NFP_MBOX_DATA		0x10
+
+/**
+ * enum nfp_mbox_cmd - PF mailbox commands
+ *
+ * @NFP_MBOX_NO_CMD:	null command
+ * Used to indicate previous command has finished.
+ */
+enum nfp_mbox_cmd {
+	NFP_MBOX_NO_CMD			= 0x00,
+};
+
+#endif
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.c b/drivers/net/ethernet/netronome/nfp/nfp_main.c
index 1ef3623c6e1c..46b76d5a726c 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_main.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_main.c
@@ -55,6 +55,7 @@
 
 #include "nfpcore/nfp6000_pcie.h"
 
+#include "nfp_abi.h"
 #include "nfp_app.h"
 #include "nfp_main.h"
 #include "nfp_net.h"
@@ -107,6 +108,90 @@ nfp_pf_map_rtsym(struct nfp_pf *pf, const char *name, const char *sym_fmt,
 	return nfp_rtsym_map(pf->rtbl, pf_symbol, name, min_size, area);
 }
 
+/* Callers should hold the devlink instance lock */
+int nfp_mbox_cmd(struct nfp_pf *pf, u32 cmd, void *in_data, u64 in_length,
+		 void *out_data, u64 out_length)
+{
+	unsigned long long addr;
+	unsigned long err_at;
+	u64 max_data_sz;
+	u32 val = 0;
+	u32 cpp_id;
+	int n, err;
+
+	if (!pf->mbox)
+		return -EOPNOTSUPP;
+
+	cpp_id = NFP_CPP_ISLAND_ID(pf->mbox->target, NFP_CPP_ACTION_RW, 0,
+				   pf->mbox->domain);
+	addr = pf->mbox->addr;
+	max_data_sz = pf->mbox->size - NFP_MBOX_SYM_MIN_SIZE;
+
+	/* Check if cmd field is clear */
+	err = nfp_cpp_readl(pf->cpp, cpp_id, addr + NFP_MBOX_CMD, &val);
+	if (err || val) {
+		nfp_warn(pf->cpp, "failed to issue command (%u): %u, err: %d\n",
+			 cmd, val, err);
+		return err ?: -EBUSY;
+	}
+
+	in_length = min(in_length, max_data_sz);
+	n = nfp_cpp_write(pf->cpp, cpp_id, addr + NFP_MBOX_DATA,
+			  in_data, in_length);
+	if (n != in_length)
+		return -EIO;
+	/* Write data_len and wipe reserved */
+	err = nfp_cpp_writeq(pf->cpp, cpp_id, addr + NFP_MBOX_DATA_LEN,
+			     in_length);
+	if (err)
+		return err;
+
+	/* Read back for ordering */
+	err = nfp_cpp_readl(pf->cpp, cpp_id, addr + NFP_MBOX_DATA_LEN, &val);
+	if (err)
+		return err;
+
+	/* Write cmd and wipe return value */
+	err = nfp_cpp_writeq(pf->cpp, cpp_id, addr + NFP_MBOX_CMD, cmd);
+	if (err)
+		return err;
+
+	err_at = jiffies + 5 * HZ;
+	while (true) {
+		/* Wait for command to go to 0 (NFP_MBOX_NO_CMD) */
+		err = nfp_cpp_readl(pf->cpp, cpp_id, addr + NFP_MBOX_CMD, &val);
+		if (err)
+			return err;
+		if (!val)
+			break;
+
+		if (time_is_before_eq_jiffies(err_at))
+			return -ETIMEDOUT;
+
+		msleep(5);
+	}
+
+	/* Copy output if any (could be error info, do it before reading ret) */
+	err = nfp_cpp_readl(pf->cpp, cpp_id, addr + NFP_MBOX_DATA_LEN, &val);
+	if (err)
+		return err;
+
+	out_length = min_t(u32, val, min(out_length, max_data_sz));
+	n = nfp_cpp_read(pf->cpp, cpp_id, addr + NFP_MBOX_DATA,
+			 out_data, out_length);
+	if (n != out_length)
+		return -EIO;
+
+	/* Check if there is an error */
+	err = nfp_cpp_readl(pf->cpp, cpp_id, addr + NFP_MBOX_RET, &val);
+	if (err)
+		return err;
+	if (val)
+		return -val;
+
+	return out_length;
+}
+
 static bool nfp_board_ready(struct nfp_pf *pf)
 {
 	const char *cp;
@@ -468,6 +553,25 @@ static void nfp_fw_unload(struct nfp_pf *pf)
 	nfp_nsp_close(nsp);
 }
 
+static int nfp_pf_find_rtsyms(struct nfp_pf *pf)
+{
+	char pf_symbol[256];
+	unsigned int pf_id;
+
+	pf_id = nfp_cppcore_pcie_unit(pf->cpp);
+
+	/* Optional per-PCI PF mailbox */
+	snprintf(pf_symbol, sizeof(pf_symbol), NFP_MBOX_SYM_NAME, pf_id);
+	pf->mbox = nfp_rtsym_lookup(pf->rtbl, pf_symbol);
+	if (pf->mbox && pf->mbox->size < NFP_MBOX_SYM_MIN_SIZE) {
+		nfp_err(pf->cpp, "PF mailbox symbol too small: %llu < %d\n",
+			pf->mbox->size, NFP_MBOX_SYM_MIN_SIZE);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int nfp_pci_probe(struct pci_dev *pdev,
 			 const struct pci_device_id *pci_id)
 {
@@ -542,6 +646,10 @@ static int nfp_pci_probe(struct pci_dev *pdev,
 	pf->mip = nfp_mip_open(pf->cpp);
 	pf->rtbl = __nfp_rtsym_table_read(pf->cpp, pf->mip);
 
+	err = nfp_pf_find_rtsyms(pf);
+	if (err)
+		goto err_fw_unload;
+
 	pf->dump_flag = NFP_DUMP_NSP_DIAG;
 	pf->dumpspec = nfp_net_dump_load_dumpspec(pf->cpp, pf->rtbl);
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.h b/drivers/net/ethernet/netronome/nfp/nfp_main.h
index 5b028486c894..e3e1fa84ccd7 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_main.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_main.h
@@ -60,6 +60,7 @@ struct nfp_mip;
 struct nfp_net;
 struct nfp_nsp_identify;
 struct nfp_port;
+struct nfp_rtsym;
 struct nfp_rtsym_table;
 
 /**
@@ -87,6 +88,7 @@ struct nfp_dumpspec {
  * @vf_cfg_mem:		Pointer to mapped VF configuration area
  * @vfcfg_tbl2_area:	Pointer to the CPP area for the VF config table
  * @vfcfg_tbl2:		Pointer to mapped VF config table
+ * @mbox:		RTSym of per-PCI PF mailbox (under devlink lock)
  * @irq_entries:	Array of MSI-X entries for all vNICs
  * @limit_vfs:		Number of VFs supported by firmware (~0 for PCI limit)
  * @num_vfs:		Number of SR-IOV VFs enabled
@@ -127,6 +129,8 @@ struct nfp_pf {
 	struct nfp_cpp_area *vfcfg_tbl2_area;
 	u8 __iomem *vfcfg_tbl2;
 
+	const struct nfp_rtsym *mbox;
+
 	struct msix_entry *irq_entries;
 
 	unsigned int limit_vfs;
@@ -182,6 +186,8 @@ int nfp_pf_rtsym_read_optional(struct nfp_pf *pf, const char *format,
 u8 __iomem *
 nfp_pf_map_rtsym(struct nfp_pf *pf, const char *name, const char *sym_fmt,
 		 unsigned int min_size, struct nfp_cpp_area **area);
+int nfp_mbox_cmd(struct nfp_pf *pf, u32 cmd, void *in_data, u64 in_length,
+		 void *out_data, u64 out_length);
 
 enum nfp_dump_diag {
 	NFP_DUMP_NSP_DIAG = 0,
-- 
cgit v1.2.3-59-g8ed1b


From a0d163f4327febeae2c98c4b1aaff3552e5b1667 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 21 May 2018 22:12:45 -0700
Subject: nfp: add shared buffer configuration

Allow app FW to advertise its shared buffer pool information.
Use the per-PF mailbox to configure them from devlink.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/Makefile        |   1 +
 drivers/net/ethernet/netronome/nfp/nfp_abi.h       |  70 ++++++++
 drivers/net/ethernet/netronome/nfp/nfp_devlink.c   |  22 +++
 drivers/net/ethernet/netronome/nfp/nfp_main.h      |  15 +-
 drivers/net/ethernet/netronome/nfp/nfp_net_main.c  |   7 +
 .../net/ethernet/netronome/nfp/nfp_shared_buf.c    | 180 +++++++++++++++++++++
 6 files changed, 294 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/netronome/nfp/nfp_shared_buf.c

diff --git a/drivers/net/ethernet/netronome/nfp/Makefile b/drivers/net/ethernet/netronome/nfp/Makefile
index d5866d708dfa..f032d645911c 100644
--- a/drivers/net/ethernet/netronome/nfp/Makefile
+++ b/drivers/net/ethernet/netronome/nfp/Makefile
@@ -30,6 +30,7 @@ nfp-objs := \
 	    nfp_net_sriov.o \
 	    nfp_netvf_main.o \
 	    nfp_port.o \
+	    nfp_shared_buf.o \
 	    nic/main.o
 
 ifeq ($(CONFIG_NFP_APP_FLOWER),y)
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_abi.h b/drivers/net/ethernet/netronome/nfp/nfp_abi.h
index ce3935d6729b..7ffa6e6a9d1c 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_abi.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_abi.h
@@ -51,9 +51,79 @@
  *
  * @NFP_MBOX_NO_CMD:	null command
  * Used to indicate previous command has finished.
+ *
+ * @NFP_MBOX_POOL_GET:	get shared buffer pool info/config
+ * Input  - struct nfp_shared_buf_pool_id
+ * Output - struct nfp_shared_buf_pool_info_get
+ *
+ * @NFP_MBOX_POOL_SET:	set shared buffer pool info/config
+ * Input  - struct nfp_shared_buf_pool_info_set
+ * Output - None
  */
 enum nfp_mbox_cmd {
 	NFP_MBOX_NO_CMD			= 0x00,
+
+	NFP_MBOX_POOL_GET		= 0x01,
+	NFP_MBOX_POOL_SET		= 0x02,
+};
+
+#define NFP_SHARED_BUF_COUNT_SYM_NAME	"_abi_nfd_pf%u_sb_cnt"
+#define NFP_SHARED_BUF_TABLE_SYM_NAME	"_abi_nfd_pf%u_sb_tbl"
+
+/**
+ * struct nfp_shared_buf - NFP shared buffer description
+ * @id:				numerical user-visible id of the shared buffer
+ * @size:			size in bytes of the buffer
+ * @ingress_pools_count:	number of ingress pools
+ * @egress_pools_count:		number of egress pools
+ * @ingress_tc_count:		number of ingress trafic classes
+ * @egress_tc_count:		number of egress trafic classes
+ * @pool_size_unit:		pool size may be in credits, each credit is
+ *				@pool_size_unit bytes
+ */
+struct nfp_shared_buf {
+	__le32 id;
+	__le32 size;
+	__le16 ingress_pools_count;
+	__le16 egress_pools_count;
+	__le16 ingress_tc_count;
+	__le16 egress_tc_count;
+
+	__le32 pool_size_unit;
+};
+
+/**
+ * struct nfp_shared_buf_pool_id - shared buffer pool identification
+ * @shared_buf:		shared buffer id
+ * @pool:		pool index
+ */
+struct nfp_shared_buf_pool_id {
+	__le32 shared_buf;
+	__le32 pool;
+};
+
+/**
+ * struct nfp_shared_buf_pool_info_get - struct devlink_sb_pool_info mirror
+ * @pool_type:		one of enum devlink_sb_pool_type
+ * @size:		pool size in units of SB's @pool_size_unit
+ * @threshold_type:	one of enum devlink_sb_threshold_type
+ */
+struct nfp_shared_buf_pool_info_get {
+	__le32 pool_type;
+	__le32 size;
+	__le32 threshold_type;
+};
+
+/**
+ * struct nfp_shared_buf_pool_info_set - packed args of sb_pool_set
+ * @id:			pool identification info
+ * @size:		pool size in units of SB's @pool_size_unit
+ * @threshold_type:	one of enum devlink_sb_threshold_type
+ */
+struct nfp_shared_buf_pool_info_set {
+	struct nfp_shared_buf_pool_id id;
+	__le32 size;
+	__le32 threshold_type;
 };
 
 #endif
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
index b1e67cf4257a..73c7fcc820ac 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
@@ -149,6 +149,26 @@ out:
 	return ret;
 }
 
+static int
+nfp_devlink_sb_pool_get(struct devlink *devlink, unsigned int sb_index,
+			u16 pool_index, struct devlink_sb_pool_info *pool_info)
+{
+	struct nfp_pf *pf = devlink_priv(devlink);
+
+	return nfp_shared_buf_pool_get(pf, sb_index, pool_index, pool_info);
+}
+
+static int
+nfp_devlink_sb_pool_set(struct devlink *devlink, unsigned int sb_index,
+			u16 pool_index,
+			u32 size, enum devlink_sb_threshold_type threshold_type)
+{
+	struct nfp_pf *pf = devlink_priv(devlink);
+
+	return nfp_shared_buf_pool_set(pf, sb_index, pool_index,
+				       size, threshold_type);
+}
+
 static int nfp_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode)
 {
 	struct nfp_pf *pf = devlink_priv(devlink);
@@ -159,6 +179,8 @@ static int nfp_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode)
 const struct devlink_ops nfp_devlink_ops = {
 	.port_split		= nfp_devlink_port_split,
 	.port_unsplit		= nfp_devlink_port_unsplit,
+	.sb_pool_get		= nfp_devlink_sb_pool_get,
+	.sb_pool_set		= nfp_devlink_sb_pool_set,
 	.eswitch_mode_get	= nfp_devlink_eswitch_mode_get,
 };
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.h b/drivers/net/ethernet/netronome/nfp/nfp_main.h
index e3e1fa84ccd7..595b3dc280e3 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_main.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_main.h
@@ -46,10 +46,10 @@
 #include <linux/mutex.h>
 #include <linux/pci.h>
 #include <linux/workqueue.h>
+#include <net/devlink.h>
 
 struct dentry;
 struct device;
-struct devlink_ops;
 struct pci_dev;
 
 struct nfp_cpp;
@@ -62,6 +62,7 @@ struct nfp_nsp_identify;
 struct nfp_port;
 struct nfp_rtsym;
 struct nfp_rtsym_table;
+struct nfp_shared_buf;
 
 /**
  * struct nfp_dumpspec - NFP FW dump specification structure
@@ -110,6 +111,8 @@ struct nfp_dumpspec {
  * @ports:		Linked list of port structures (struct nfp_port)
  * @wq:			Workqueue for running works which need to grab @lock
  * @port_refresh_work:	Work entry for taking netdevs out
+ * @shared_bufs:	Array of shared buffer structures if FW has any SBs
+ * @num_shared_bufs:	Number of elements in @shared_bufs
  * @lock:		Protects all fields which may change after probe
  */
 struct nfp_pf {
@@ -162,6 +165,9 @@ struct nfp_pf {
 	struct workqueue_struct *wq;
 	struct work_struct port_refresh_work;
 
+	struct nfp_shared_buf *shared_bufs;
+	unsigned int num_shared_bufs;
+
 	struct mutex lock;
 };
 
@@ -200,4 +206,11 @@ s64 nfp_net_dump_calculate_size(struct nfp_pf *pf, struct nfp_dumpspec *spec,
 int nfp_net_dump_populate_buffer(struct nfp_pf *pf, struct nfp_dumpspec *spec,
 				 struct ethtool_dump *dump_param, void *dest);
 
+int nfp_shared_buf_register(struct nfp_pf *pf);
+void nfp_shared_buf_unregister(struct nfp_pf *pf);
+int nfp_shared_buf_pool_get(struct nfp_pf *pf, unsigned int sb, u16 pool_index,
+			    struct devlink_sb_pool_info *pool_info);
+int nfp_shared_buf_pool_set(struct nfp_pf *pf, unsigned int sb,
+			    u16 pool_index, u32 size,
+			    enum devlink_sb_threshold_type threshold_type);
 #endif /* NFP_MAIN_H */
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
index f8abb4cd9cef..b98422112385 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
@@ -728,6 +728,10 @@ int nfp_net_pci_probe(struct nfp_pf *pf)
 	if (err)
 		goto err_app_clean;
 
+	err = nfp_shared_buf_register(pf);
+	if (err)
+		goto err_devlink_unreg;
+
 	mutex_lock(&pf->lock);
 	pf->ddir = nfp_net_debugfs_device_add(pf->pdev);
 
@@ -761,6 +765,8 @@ err_free_vnics:
 err_clean_ddir:
 	nfp_net_debugfs_dir_clean(&pf->ddir);
 	mutex_unlock(&pf->lock);
+	nfp_shared_buf_unregister(pf);
+err_devlink_unreg:
 	cancel_work_sync(&pf->port_refresh_work);
 	devlink_unregister(devlink);
 err_app_clean:
@@ -788,6 +794,7 @@ void nfp_net_pci_remove(struct nfp_pf *pf)
 
 	mutex_unlock(&pf->lock);
 
+	nfp_shared_buf_unregister(pf);
 	devlink_unregister(priv_to_devlink(pf));
 
 	nfp_net_pf_free_irqs(pf);
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_shared_buf.c b/drivers/net/ethernet/netronome/nfp/nfp_shared_buf.c
new file mode 100644
index 000000000000..0ecd83705368
--- /dev/null
+++ b/drivers/net/ethernet/netronome/nfp/nfp_shared_buf.c
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+/*
+ * Copyright (C) 2018 Netronome Systems, Inc.
+ *
+ * This software is dual licensed under the GNU General License Version 2,
+ * June 1991 as shown in the file COPYING in the top-level directory of this
+ * source tree or the BSD 2-Clause License provided below.  You have the
+ * option to license this software under the complete terms of either license.
+ *
+ * The BSD 2-Clause License:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      1. Redistributions of source code must retain the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer.
+ *
+ *      2. Redistributions in binary form must reproduce the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer in the documentation and/or other materials
+ *         provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <net/devlink.h>
+
+#include "nfpcore/nfp_cpp.h"
+#include "nfpcore/nfp_nffw.h"
+#include "nfp_abi.h"
+#include "nfp_app.h"
+#include "nfp_main.h"
+
+static u32 nfp_shared_buf_pool_unit(struct nfp_pf *pf, unsigned int sb)
+{
+	__le32 sb_id = cpu_to_le32(sb);
+	unsigned int i;
+
+	for (i = 0; i < pf->num_shared_bufs; i++)
+		if (pf->shared_bufs[i].id == sb_id)
+			return le32_to_cpu(pf->shared_bufs[i].pool_size_unit);
+
+	WARN_ON_ONCE(1);
+	return 0;
+}
+
+int nfp_shared_buf_pool_get(struct nfp_pf *pf, unsigned int sb, u16 pool_index,
+			    struct devlink_sb_pool_info *pool_info)
+{
+	struct nfp_shared_buf_pool_info_get get_data;
+	struct nfp_shared_buf_pool_id id = {
+		.shared_buf	= cpu_to_le32(sb),
+		.pool		= cpu_to_le32(pool_index),
+	};
+	unsigned int unit_size;
+	int n;
+
+	unit_size = nfp_shared_buf_pool_unit(pf, sb);
+	if (!unit_size)
+		return -EINVAL;
+
+	n = nfp_mbox_cmd(pf, NFP_MBOX_POOL_GET, &id, sizeof(id),
+			 &get_data, sizeof(get_data));
+	if (n < 0)
+		return n;
+	if (n < sizeof(get_data))
+		return -EIO;
+
+	pool_info->pool_type = le32_to_cpu(get_data.pool_type);
+	pool_info->threshold_type = le32_to_cpu(get_data.threshold_type);
+	pool_info->size = le32_to_cpu(get_data.size) * unit_size;
+
+	return 0;
+}
+
+int nfp_shared_buf_pool_set(struct nfp_pf *pf, unsigned int sb,
+			    u16 pool_index, u32 size,
+			    enum devlink_sb_threshold_type threshold_type)
+{
+	struct nfp_shared_buf_pool_info_set set_data = {
+		.id = {
+			.shared_buf	= cpu_to_le32(sb),
+			.pool		= cpu_to_le32(pool_index),
+		},
+		.threshold_type	= cpu_to_le32(threshold_type),
+	};
+	unsigned int unit_size;
+
+	unit_size = nfp_shared_buf_pool_unit(pf, sb);
+	if (!unit_size || size % unit_size)
+		return -EINVAL;
+	set_data.size = cpu_to_le32(size / unit_size);
+
+	return nfp_mbox_cmd(pf, NFP_MBOX_POOL_SET, &set_data, sizeof(set_data),
+			    NULL, 0);
+}
+
+int nfp_shared_buf_register(struct nfp_pf *pf)
+{
+	struct devlink *devlink = priv_to_devlink(pf);
+	unsigned int i, num_entries, entry_sz;
+	struct nfp_cpp_area *sb_desc_area;
+	u8 __iomem *sb_desc;
+	int n, err;
+
+	if (!pf->mbox)
+		return 0;
+
+	n = nfp_pf_rtsym_read_optional(pf, NFP_SHARED_BUF_COUNT_SYM_NAME, 0);
+	if (n <= 0)
+		return n;
+	num_entries = n;
+
+	sb_desc = nfp_pf_map_rtsym(pf, "sb_tbl", NFP_SHARED_BUF_TABLE_SYM_NAME,
+				   num_entries * sizeof(pf->shared_bufs[0]),
+				   &sb_desc_area);
+	if (IS_ERR(sb_desc))
+		return PTR_ERR(sb_desc);
+
+	entry_sz = nfp_cpp_area_size(sb_desc_area) / num_entries;
+
+	pf->shared_bufs = kmalloc_array(num_entries, sizeof(pf->shared_bufs[0]),
+					GFP_KERNEL);
+	if (!pf->shared_bufs) {
+		err = -ENOMEM;
+		goto err_release_area;
+	}
+
+	for (i = 0; i < num_entries; i++) {
+		struct nfp_shared_buf *sb = &pf->shared_bufs[i];
+
+		/* Entries may be larger in future FW */
+		memcpy_fromio(sb, sb_desc + i * entry_sz, sizeof(*sb));
+
+		err = devlink_sb_register(devlink,
+					  le32_to_cpu(sb->id),
+					  le32_to_cpu(sb->size),
+					  le16_to_cpu(sb->ingress_pools_count),
+					  le16_to_cpu(sb->egress_pools_count),
+					  le16_to_cpu(sb->ingress_tc_count),
+					  le16_to_cpu(sb->egress_tc_count));
+		if (err)
+			goto err_unreg_prev;
+	}
+	pf->num_shared_bufs = num_entries;
+
+	nfp_cpp_area_release_free(sb_desc_area);
+
+	return 0;
+
+err_unreg_prev:
+	while (i--)
+		devlink_sb_unregister(devlink,
+				      le32_to_cpu(pf->shared_bufs[i].id));
+	kfree(pf->shared_bufs);
+err_release_area:
+	nfp_cpp_area_release_free(sb_desc_area);
+	return err;
+}
+
+void nfp_shared_buf_unregister(struct nfp_pf *pf)
+{
+	struct devlink *devlink = priv_to_devlink(pf);
+	unsigned int i;
+
+	for (i = 0; i < pf->num_shared_bufs; i++)
+		devlink_sb_unregister(devlink,
+				      le32_to_cpu(pf->shared_bufs[i].id));
+	kfree(pf->shared_bufs);
+}
-- 
cgit v1.2.3-59-g8ed1b


From b586c77b3c657a87e5a754c69b9a40c2301416dc Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 21 May 2018 22:12:46 -0700
Subject: nfp: core: allow 4-byte aligned accesses to Memory Units

Current code doesn't enforce length requirements on 32bit accesses
with action NFP_CPP_ACTION_RW to memory units, but if the access
is only aligned to 4 bytes as well we will fall into the explicit
access case and error out.  Such accesses are correct, allow them
by lowering the width earlier.

While at it use a switch statement to improve readability.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c  | 94 ++++++++++------------
 1 file changed, 44 insertions(+), 50 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c
index a0e336bd1d85..749655c329b2 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c
@@ -933,7 +933,6 @@ static int nfp6000_area_read(struct nfp_cpp_area *area, void *kernel_vaddr,
 	u32 *wrptr32 = kernel_vaddr;
 	const u32 __iomem *rdptr32;
 	int n, width;
-	bool is_64;
 
 	priv = nfp_cpp_area_priv(area);
 	rdptr64 = priv->iomem + offset;
@@ -943,10 +942,15 @@ static int nfp6000_area_read(struct nfp_cpp_area *area, void *kernel_vaddr,
 		return -EFAULT;
 
 	width = priv->width.read;
-
 	if (width <= 0)
 		return -EINVAL;
 
+	/* MU reads via a PCIe2CPP BAR support 32bit (and other) lengths */
+	if (priv->target == (NFP_CPP_TARGET_MU & NFP_CPP_TARGET_ID_MASK) &&
+	    priv->action == NFP_CPP_ACTION_RW &&
+	    (offset % sizeof(u64) == 4 || length % sizeof(u64) == 4))
+		width = TARGET_WIDTH_32;
+
 	/* Unaligned? Translate to an explicit access */
 	if ((priv->offset + offset) & (width - 1))
 		return nfp_cpp_explicit_read(nfp_cpp_area_cpp(area),
@@ -956,36 +960,29 @@ static int nfp6000_area_read(struct nfp_cpp_area *area, void *kernel_vaddr,
 					     priv->offset + offset,
 					     kernel_vaddr, length, width);
 
-	is_64 = width == TARGET_WIDTH_64;
-
-	/* MU reads via a PCIe2CPP BAR supports 32bit (and other) lengths */
-	if (priv->target == (NFP_CPP_TARGET_ID_MASK & NFP_CPP_TARGET_MU) &&
-	    priv->action == NFP_CPP_ACTION_RW)
-		is_64 = false;
+	if (WARN_ON(!priv->bar))
+		return -EFAULT;
 
-	if (is_64) {
-		if (offset % sizeof(u64) != 0 || length % sizeof(u64) != 0)
-			return -EINVAL;
-	} else {
+	switch (width) {
+	case TARGET_WIDTH_32:
 		if (offset % sizeof(u32) != 0 || length % sizeof(u32) != 0)
 			return -EINVAL;
-	}
 
-	if (WARN_ON(!priv->bar))
-		return -EFAULT;
+		for (n = 0; n < length; n += sizeof(u32))
+			*wrptr32++ = __raw_readl(rdptr32++);
+		return n;
+#ifdef __raw_readq
+	case TARGET_WIDTH_64:
+		if (offset % sizeof(u64) != 0 || length % sizeof(u64) != 0)
+			return -EINVAL;
 
-	if (is_64)
-#ifndef __raw_readq
-		return -EINVAL;
-#else
 		for (n = 0; n < length; n += sizeof(u64))
 			*wrptr64++ = __raw_readq(rdptr64++);
+		return n;
 #endif
-	else
-		for (n = 0; n < length; n += sizeof(u32))
-			*wrptr32++ = __raw_readl(rdptr32++);
-
-	return n;
+	default:
+		return -EINVAL;
+	}
 }
 
 static int
@@ -999,7 +996,6 @@ nfp6000_area_write(struct nfp_cpp_area *area,
 	struct nfp6000_area_priv *priv;
 	u32 __iomem *wrptr32;
 	int n, width;
-	bool is_64;
 
 	priv = nfp_cpp_area_priv(area);
 	wrptr64 = priv->iomem + offset;
@@ -1009,10 +1005,15 @@ nfp6000_area_write(struct nfp_cpp_area *area,
 		return -EFAULT;
 
 	width = priv->width.write;
-
 	if (width <= 0)
 		return -EINVAL;
 
+	/* MU writes via a PCIe2CPP BAR support 32bit (and other) lengths */
+	if (priv->target == (NFP_CPP_TARGET_ID_MASK & NFP_CPP_TARGET_MU) &&
+	    priv->action == NFP_CPP_ACTION_RW &&
+	    (offset % sizeof(u64) == 4 || length % sizeof(u64) == 4))
+		width = TARGET_WIDTH_32;
+
 	/* Unaligned? Translate to an explicit access */
 	if ((priv->offset + offset) & (width - 1))
 		return nfp_cpp_explicit_write(nfp_cpp_area_cpp(area),
@@ -1022,40 +1023,33 @@ nfp6000_area_write(struct nfp_cpp_area *area,
 					      priv->offset + offset,
 					      kernel_vaddr, length, width);
 
-	is_64 = width == TARGET_WIDTH_64;
-
-	/* MU writes via a PCIe2CPP BAR supports 32bit (and other) lengths */
-	if (priv->target == (NFP_CPP_TARGET_ID_MASK & NFP_CPP_TARGET_MU) &&
-	    priv->action == NFP_CPP_ACTION_RW)
-		is_64 = false;
+	if (WARN_ON(!priv->bar))
+		return -EFAULT;
 
-	if (is_64) {
-		if (offset % sizeof(u64) != 0 || length % sizeof(u64) != 0)
-			return -EINVAL;
-	} else {
+	switch (width) {
+	case TARGET_WIDTH_32:
 		if (offset % sizeof(u32) != 0 || length % sizeof(u32) != 0)
 			return -EINVAL;
-	}
 
-	if (WARN_ON(!priv->bar))
-		return -EFAULT;
+		for (n = 0; n < length; n += sizeof(u32)) {
+			__raw_writel(*rdptr32++, wrptr32++);
+			wmb();
+		}
+		return n;
+#ifdef __raw_writeq
+	case TARGET_WIDTH_64:
+		if (offset % sizeof(u64) != 0 || length % sizeof(u64) != 0)
+			return -EINVAL;
 
-	if (is_64)
-#ifndef __raw_writeq
-		return -EINVAL;
-#else
 		for (n = 0; n < length; n += sizeof(u64)) {
 			__raw_writeq(*rdptr64++, wrptr64++);
 			wmb();
 		}
+		return n;
 #endif
-	else
-		for (n = 0; n < length; n += sizeof(u32)) {
-			__raw_writel(*rdptr32++, wrptr32++);
-			wmb();
-		}
-
-	return n;
+	default:
+		return -EINVAL;
+	}
 }
 
 struct nfp6000_explicit_priv {
-- 
cgit v1.2.3-59-g8ed1b


From c4c8f39a57bf5057fc51a848d42b7e348ecfa31d Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 21 May 2018 22:12:47 -0700
Subject: nfp: abm: add initial active buffer management NIC skeleton

Add a very rudimentary active buffer management NIC support.
For now it's like a core NIC without SR-IOV support.  Next
commits will extend its functionality.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/Kconfig        | 13 ++++
 drivers/net/ethernet/netronome/nfp/Makefile   |  5 ++
 drivers/net/ethernet/netronome/nfp/abm/main.c | 85 +++++++++++++++++++++++++++
 drivers/net/ethernet/netronome/nfp/abm/main.h | 47 +++++++++++++++
 drivers/net/ethernet/netronome/nfp/nfp_app.c  |  3 +
 drivers/net/ethernet/netronome/nfp/nfp_app.h  |  2 +
 6 files changed, 155 insertions(+)
 create mode 100644 drivers/net/ethernet/netronome/nfp/abm/main.c
 create mode 100644 drivers/net/ethernet/netronome/nfp/abm/main.h

diff --git a/drivers/net/ethernet/netronome/Kconfig b/drivers/net/ethernet/netronome/Kconfig
index ae0c46ba7546..66f15b05b65e 100644
--- a/drivers/net/ethernet/netronome/Kconfig
+++ b/drivers/net/ethernet/netronome/Kconfig
@@ -36,6 +36,19 @@ config NFP_APP_FLOWER
 	  either directly, with Open vSwitch, or any other way.  Note that
 	  TC Flower offload requires specific FW to work.
 
+config NFP_APP_ABM_NIC
+	bool "NFP4000/NFP6000 Advanced buffer management NIC support"
+	depends on NFP
+	depends on NET_SWITCHDEV
+	default y
+	help
+	  Enable driver support for Advanced buffer management NIC on NFP.
+	  ABM NIC allows advanced configuration of queuing and scheduling
+	  of packets, including ECN marking. Say Y, if you are planning to
+	  use one of the NFP4000 and NFP6000 platforms which support this
+	  functionality.
+	  Code will be built into the nfp.ko driver.
+
 config NFP_DEBUG
 	bool "Debug support for Netronome(R) NFP4000/NFP6000 NIC drivers"
 	depends on NFP
diff --git a/drivers/net/ethernet/netronome/nfp/Makefile b/drivers/net/ethernet/netronome/nfp/Makefile
index f032d645911c..3d79b709d77a 100644
--- a/drivers/net/ethernet/netronome/nfp/Makefile
+++ b/drivers/net/ethernet/netronome/nfp/Makefile
@@ -53,4 +53,9 @@ nfp-objs += \
 	    bpf/jit.o
 endif
 
+ifeq ($(CONFIG_NFP_APP_ABM_NIC),y)
+nfp-objs += \
+	    abm/main.o
+endif
+
 nfp-$(CONFIG_NFP_DEBUG) += nfp_net_debugfs.o
diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.c b/drivers/net/ethernet/netronome/nfp/abm/main.c
new file mode 100644
index 000000000000..7a0a807bea4a
--- /dev/null
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+/*
+ * Copyright (C) 2018 Netronome Systems, Inc.
+ *
+ * This software is dual licensed under the GNU General License Version 2,
+ * June 1991 as shown in the file COPYING in the top-level directory of this
+ * source tree or the BSD 2-Clause License provided below.  You have the
+ * option to license this software under the complete terms of either license.
+ *
+ * The BSD 2-Clause License:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      1. Redistributions of source code must retain the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer.
+ *
+ *      2. Redistributions in binary form must reproduce the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer in the documentation and/or other materials
+ *         provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "../nfpcore/nfp_cpp.h"
+#include "../nfpcore/nfp_nsp.h"
+#include "../nfp_app.h"
+#include "../nfp_main.h"
+#include "main.h"
+
+static int nfp_abm_init(struct nfp_app *app)
+{
+	struct nfp_pf *pf = app->pf;
+	struct nfp_abm *abm;
+
+	if (!pf->eth_tbl) {
+		nfp_err(pf->cpp, "ABM NIC requires ETH table\n");
+		return -EINVAL;
+	}
+	if (pf->max_data_vnics != pf->eth_tbl->count) {
+		nfp_err(pf->cpp, "ETH entries don't match vNICs (%d vs %d)\n",
+			pf->max_data_vnics, pf->eth_tbl->count);
+		return -EINVAL;
+	}
+	if (!pf->mac_stats_bar) {
+		nfp_warn(app->cpp, "ABM NIC requires mac_stats symbol\n");
+		return -EINVAL;
+	}
+
+	abm = kzalloc(sizeof(*abm), GFP_KERNEL);
+	if (!abm)
+		return -ENOMEM;
+	app->priv = abm;
+	abm->app = app;
+
+	return 0;
+}
+
+static void nfp_abm_clean(struct nfp_app *app)
+{
+	struct nfp_abm *abm = app->priv;
+
+	kfree(abm);
+	app->priv = NULL;
+}
+
+const struct nfp_app_type app_abm = {
+	.id		= NFP_APP_ACTIVE_BUFFER_MGMT_NIC,
+	.name		= "abm",
+
+	.init		= nfp_abm_init,
+	.clean		= nfp_abm_clean,
+
+	.vnic_alloc	= nfp_app_nic_vnic_alloc,
+};
diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.h b/drivers/net/ethernet/netronome/nfp/abm/main.h
new file mode 100644
index 000000000000..e54b6f64ee10
--- /dev/null
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) */
+/*
+ * Copyright (C) 2018 Netronome Systems, Inc.
+ *
+ * This software is dual licensed under the GNU General License Version 2,
+ * June 1991 as shown in the file COPYING in the top-level directory of this
+ * source tree or the BSD 2-Clause License provided below.  You have the
+ * option to license this software under the complete terms of either license.
+ *
+ * The BSD 2-Clause License:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      1. Redistributions of source code must retain the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer.
+ *
+ *      2. Redistributions in binary form must reproduce the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer in the documentation and/or other materials
+ *         provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __NFP_ABM_H__
+#define __NFP_ABM_H__ 1
+
+struct nfp_app;
+
+/**
+ * struct nfp_abm - ABM NIC app structure
+ * @app:	back pointer to nfp_app
+ */
+struct nfp_abm {
+	struct nfp_app *app;
+};
+#endif
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.c b/drivers/net/ethernet/netronome/nfp/nfp_app.c
index 0e0253c7e17b..c9d8a7ab311e 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_app.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_app.c
@@ -54,6 +54,9 @@ static const struct nfp_app_type *apps[] = {
 #ifdef CONFIG_NFP_APP_FLOWER
 	[NFP_APP_FLOWER_NIC]	= &app_flower,
 #endif
+#ifdef CONFIG_NFP_APP_ABM_NIC
+	[NFP_APP_ACTIVE_BUFFER_MGMT_NIC] = &app_abm,
+#endif
 };
 
 struct nfp_app *nfp_app_from_netdev(struct net_device *netdev)
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.h b/drivers/net/ethernet/netronome/nfp/nfp_app.h
index 2d9cb2528fc7..654cabd95ee4 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_app.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_app.h
@@ -57,11 +57,13 @@ enum nfp_app_id {
 	NFP_APP_CORE_NIC	= 0x1,
 	NFP_APP_BPF_NIC		= 0x2,
 	NFP_APP_FLOWER_NIC	= 0x3,
+	NFP_APP_ACTIVE_BUFFER_MGMT_NIC = 0x4,
 };
 
 extern const struct nfp_app_type app_nic;
 extern const struct nfp_app_type app_bpf;
 extern const struct nfp_app_type app_flower;
+extern const struct nfp_app_type app_abm;
 
 /**
  * struct nfp_app_type - application definition
-- 
cgit v1.2.3-59-g8ed1b


From cc54dc2804f2b038af299aee89a9ff47454248de Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 21 May 2018 22:12:48 -0700
Subject: nfp: abm: create project-specific vNIC structure

ABM NIC requires more complex vNIC handling, allocate
per-vNIC structure.  Find out RX queue base and PCI PF id.
There will be multiple PFs sharing the same MAC port, therefore
the MAC address assigned to the vNIC must be looked up in the
HWInfo database.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/Makefile      |   1 +
 drivers/net/ethernet/netronome/nfp/abm/ctrl.c    |  58 +++++++++++++
 drivers/net/ethernet/netronome/nfp/abm/main.c    | 104 ++++++++++++++++++++++-
 drivers/net/ethernet/netronome/nfp/abm/main.h    |  20 +++++
 drivers/net/ethernet/netronome/nfp/nfp_app.h     |   2 +
 drivers/net/ethernet/netronome/nfp/nfp_app_nic.c |   5 +-
 6 files changed, 186 insertions(+), 4 deletions(-)
 create mode 100644 drivers/net/ethernet/netronome/nfp/abm/ctrl.c

diff --git a/drivers/net/ethernet/netronome/nfp/Makefile b/drivers/net/ethernet/netronome/nfp/Makefile
index 3d79b709d77a..6373f56205fd 100644
--- a/drivers/net/ethernet/netronome/nfp/Makefile
+++ b/drivers/net/ethernet/netronome/nfp/Makefile
@@ -55,6 +55,7 @@ endif
 
 ifeq ($(CONFIG_NFP_APP_ABM_NIC),y)
 nfp-objs += \
+	    abm/ctrl.o \
 	    abm/main.o
 endif
 
diff --git a/drivers/net/ethernet/netronome/nfp/abm/ctrl.c b/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
new file mode 100644
index 000000000000..e40f6f06417b
--- /dev/null
+++ b/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+/*
+ * Copyright (C) 2018 Netronome Systems, Inc.
+ *
+ * This software is dual licensed under the GNU General License Version 2,
+ * June 1991 as shown in the file COPYING in the top-level directory of this
+ * source tree or the BSD 2-Clause License provided below.  You have the
+ * option to license this software under the complete terms of either license.
+ *
+ * The BSD 2-Clause License:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      1. Redistributions of source code must retain the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer.
+ *
+ *      2. Redistributions in binary form must reproduce the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer in the documentation and/or other materials
+ *         provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+
+#include "../nfpcore/nfp_cpp.h"
+#include "../nfp_app.h"
+#include "../nfp_main.h"
+#include "../nfp_net.h"
+#include "main.h"
+
+void nfp_abm_ctrl_read_params(struct nfp_abm_link *alink)
+{
+	alink->queue_base = nn_readl(alink->vnic, NFP_NET_CFG_START_RXQ);
+	alink->queue_base /= alink->vnic->stride_rx;
+}
+
+int nfp_abm_ctrl_find_addrs(struct nfp_abm *abm)
+{
+	struct nfp_pf *pf = abm->app->pf;
+	unsigned int pf_id;
+
+	pf_id =	nfp_cppcore_pcie_unit(pf->cpp);
+	abm->pf_id = pf_id;
+
+	return 0;
+}
diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.c b/drivers/net/ethernet/netronome/nfp/abm/main.c
index 7a0a807bea4a..1985e58ab0db 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.c
@@ -32,16 +32,108 @@
  * SOFTWARE.
  */
 
+#include <linux/etherdevice.h>
+
+#include "../nfpcore/nfp.h"
 #include "../nfpcore/nfp_cpp.h"
 #include "../nfpcore/nfp_nsp.h"
 #include "../nfp_app.h"
 #include "../nfp_main.h"
+#include "../nfp_net.h"
+#include "../nfp_port.h"
 #include "main.h"
 
+static void
+nfp_abm_vnic_set_mac(struct nfp_pf *pf, struct nfp_abm *abm, struct nfp_net *nn,
+		     unsigned int id)
+{
+	struct nfp_eth_table_port *eth_port = &pf->eth_tbl->ports[id];
+	u8 mac_addr[ETH_ALEN];
+	const char *mac_str;
+	char name[32];
+
+	if (id > pf->eth_tbl->count) {
+		nfp_warn(pf->cpp, "No entry for persistent MAC address\n");
+		eth_hw_addr_random(nn->dp.netdev);
+		return;
+	}
+
+	snprintf(name, sizeof(name), "eth%u.mac.pf%u",
+		 eth_port->eth_index, abm->pf_id);
+
+	mac_str = nfp_hwinfo_lookup(pf->hwinfo, name);
+	if (!mac_str) {
+		nfp_warn(pf->cpp, "Can't lookup persistent MAC address (%s)\n",
+			 name);
+		eth_hw_addr_random(nn->dp.netdev);
+		return;
+	}
+
+	if (sscanf(mac_str, "%02hhx:%02hhx:%02hhx:%02hhx:%02hhx:%02hhx",
+		   &mac_addr[0], &mac_addr[1], &mac_addr[2],
+		   &mac_addr[3], &mac_addr[4], &mac_addr[5]) != 6) {
+		nfp_warn(pf->cpp, "Can't parse persistent MAC address (%s)\n",
+			 mac_str);
+		eth_hw_addr_random(nn->dp.netdev);
+		return;
+	}
+
+	ether_addr_copy(nn->dp.netdev->dev_addr, mac_addr);
+	ether_addr_copy(nn->dp.netdev->perm_addr, mac_addr);
+}
+
+static int
+nfp_abm_vnic_alloc(struct nfp_app *app, struct nfp_net *nn, unsigned int id)
+{
+	struct nfp_abm *abm = app->priv;
+	struct nfp_abm_link *alink;
+	int err;
+
+	alink = kzalloc(sizeof(*alink), GFP_KERNEL);
+	if (!alink)
+		return -ENOMEM;
+	nn->app_priv = alink;
+	alink->abm = abm;
+	alink->vnic = nn;
+	alink->id = id;
+
+	nn->port = nfp_port_alloc(app, NFP_PORT_PHYS_PORT, nn->dp.netdev);
+	if (IS_ERR(nn->port)) {
+		err = PTR_ERR(nn->port);
+		goto err_free_alink;
+	}
+
+	err = nfp_app_nic_vnic_init_phy_port(app->pf, app, nn, id);
+	if (err < 0)
+		goto err_free_port;
+	if (nn->port->type == NFP_PORT_INVALID)
+		/* core will kill this vNIC */
+		return 0;
+
+	nfp_abm_vnic_set_mac(app->pf, abm, nn, id);
+	nfp_abm_ctrl_read_params(alink);
+
+	return 0;
+
+err_free_port:
+	nfp_port_free(nn->port);
+err_free_alink:
+	kfree(alink);
+	return err;
+}
+
+static void nfp_abm_vnic_free(struct nfp_app *app, struct nfp_net *nn)
+{
+	struct nfp_abm_link *alink = nn->app_priv;
+
+	kfree(alink);
+}
+
 static int nfp_abm_init(struct nfp_app *app)
 {
 	struct nfp_pf *pf = app->pf;
 	struct nfp_abm *abm;
+	int err;
 
 	if (!pf->eth_tbl) {
 		nfp_err(pf->cpp, "ABM NIC requires ETH table\n");
@@ -63,7 +155,16 @@ static int nfp_abm_init(struct nfp_app *app)
 	app->priv = abm;
 	abm->app = app;
 
+	err = nfp_abm_ctrl_find_addrs(abm);
+	if (err)
+		goto err_free_abm;
+
 	return 0;
+
+err_free_abm:
+	kfree(abm);
+	app->priv = NULL;
+	return err;
 }
 
 static void nfp_abm_clean(struct nfp_app *app)
@@ -81,5 +182,6 @@ const struct nfp_app_type app_abm = {
 	.init		= nfp_abm_init,
 	.clean		= nfp_abm_clean,
 
-	.vnic_alloc	= nfp_app_nic_vnic_alloc,
+	.vnic_alloc	= nfp_abm_vnic_alloc,
+	.vnic_free	= nfp_abm_vnic_free,
 };
diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.h b/drivers/net/ethernet/netronome/nfp/abm/main.h
index e54b6f64ee10..3c5a01c96ecd 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.h
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.h
@@ -36,12 +36,32 @@
 #define __NFP_ABM_H__ 1
 
 struct nfp_app;
+struct nfp_net;
 
 /**
  * struct nfp_abm - ABM NIC app structure
  * @app:	back pointer to nfp_app
+ * @pf_id:	ID of our PF link
  */
 struct nfp_abm {
 	struct nfp_app *app;
+	unsigned int pf_id;
 };
+
+/**
+ * struct nfp_abm_link - port tuple of a ABM NIC
+ * @abm:	back pointer to nfp_abm
+ * @vnic:	data vNIC
+ * @id:		id of the data vNIC
+ * @queue_base:	id of base to host queue within PCIe (not QC idx)
+ */
+struct nfp_abm_link {
+	struct nfp_abm *abm;
+	struct nfp_net *vnic;
+	unsigned int id;
+	unsigned int queue_base;
+};
+
+void nfp_abm_ctrl_read_params(struct nfp_abm_link *alink);
+int nfp_abm_ctrl_find_addrs(struct nfp_abm *abm);
 #endif
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.h b/drivers/net/ethernet/netronome/nfp/nfp_app.h
index 654cabd95ee4..fdf2593ae151 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_app.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_app.h
@@ -412,5 +412,7 @@ void nfp_app_free(struct nfp_app *app);
 
 int nfp_app_nic_vnic_alloc(struct nfp_app *app, struct nfp_net *nn,
 			   unsigned int id);
+int nfp_app_nic_vnic_init_phy_port(struct nfp_pf *pf, struct nfp_app *app,
+				   struct nfp_net *nn, unsigned int id);
 
 #endif
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app_nic.c b/drivers/net/ethernet/netronome/nfp/nfp_app_nic.c
index b9618c37403f..e2dfe4f168bb 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_app_nic.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_app_nic.c
@@ -38,9 +38,8 @@
 #include "nfp_net.h"
 #include "nfp_port.h"
 
-static int
-nfp_app_nic_vnic_init_phy_port(struct nfp_pf *pf, struct nfp_app *app,
-			       struct nfp_net *nn, unsigned int id)
+int nfp_app_nic_vnic_init_phy_port(struct nfp_pf *pf, struct nfp_app *app,
+				   struct nfp_net *nn, unsigned int id)
 {
 	int err;
 
-- 
cgit v1.2.3-59-g8ed1b


From 634c6b7a85d46a891b2edee075bb422043dcb8da Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 21 May 2018 22:12:49 -0700
Subject: nfp: add app pointer to port representors

nfp_apps can currently associate their structures with vNICs but
not representors.  Add app priv pointer to representors as well.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_repr.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h
index cd756a15445f..8dca283c05c3 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h
@@ -76,6 +76,7 @@ struct nfp_repr_pcpu_stats {
  * @port:	Port of representor
  * @app:	APP handle
  * @stats:	Statistic of packets hitting CPU
+ * @app_priv:	Pointer for APP data
  */
 struct nfp_repr {
 	struct net_device *netdev;
@@ -83,6 +84,7 @@ struct nfp_repr {
 	struct nfp_port *port;
 	struct nfp_app *app;
 	struct nfp_repr_pcpu_stats __percpu *stats;
+	void *app_priv;
 };
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 7ac1cc9aef00942cbae01ff39bfc0a93199741e2 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 21 May 2018 22:12:50 -0700
Subject: devlink: don't take instance lock around eswitch mode set

Changing switch mode may want to register and unregister devlink
ports.  Therefore similarly to DEVLINK_CMD_PORT_SPLIT/UNSPLIT it
should not take the instance lock.  Drivers don't depend on existing
locking since it's a very recent addition.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/devlink.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/core/devlink.c b/net/core/devlink.c
index 5c8a40e1a01e..475246b355f0 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2756,7 +2756,8 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.doit = devlink_nl_cmd_eswitch_set_doit,
 		.policy = devlink_nl_policy,
 		.flags = GENL_ADMIN_PERM,
-		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+				  DEVLINK_NL_FLAG_NO_LOCK,
 	},
 	{
 		.cmd = DEVLINK_CMD_DPIPE_TABLE_GET,
-- 
cgit v1.2.3-59-g8ed1b


From 4afa3af4181edda539cce7889a6efcde0e6a0a5c Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 21 May 2018 22:12:51 -0700
Subject: nfp: add devlink_eswitch_mode_set callback

Our previous apps all assumed to use only one eswitch mode (legacy
or switchdev) without the ability to change it.  ABM NIC will
want to support the switch so plumb devlink_eswitch_mode_set through.
The devlink_eswitch_mode_set is expected to spawn representors and
potentially devlink ports so it's called under big devlink lock and
pf->lock.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_app.h     |  9 +++++++++
 drivers/net/ethernet/netronome/nfp/nfp_devlink.c | 13 +++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.h b/drivers/net/ethernet/netronome/nfp/nfp_app.h
index fdf2593ae151..23b99a4e05c2 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_app.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_app.h
@@ -97,6 +97,7 @@ extern const struct nfp_app_type app_abm;
  * @bpf:	BPF ndo offload-related calls
  * @xdp_offload:    offload an XDP program
  * @eswitch_mode_get:    get SR-IOV eswitch mode
+ * @eswitch_mode_set:    set SR-IOV eswitch mode (under pf->lock)
  * @sriov_enable: app-specific sriov initialisation
  * @sriov_disable: app-specific sriov clean-up
  * @repr_get:	get representor netdev
@@ -148,6 +149,7 @@ struct nfp_app_type {
 	void (*sriov_disable)(struct nfp_app *app);
 
 	enum devlink_eswitch_mode (*eswitch_mode_get)(struct nfp_app *app);
+	int (*eswitch_mode_set)(struct nfp_app *app, u16 mode);
 	struct net_device *(*repr_get)(struct nfp_app *app, u32 id);
 };
 
@@ -372,6 +374,13 @@ static inline int nfp_app_eswitch_mode_get(struct nfp_app *app, u16 *mode)
 	return 0;
 }
 
+static inline int nfp_app_eswitch_mode_set(struct nfp_app *app, u16 mode)
+{
+	if (!app->type->eswitch_mode_set)
+		return -EOPNOTSUPP;
+	return app->type->eswitch_mode_set(app, mode);
+}
+
 static inline int nfp_app_sriov_enable(struct nfp_app *app, int num_vfs)
 {
 	if (!app || !app->type->sriov_enable)
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
index 73c7fcc820ac..71c2edd83031 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
@@ -176,12 +176,25 @@ static int nfp_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode)
 	return nfp_app_eswitch_mode_get(pf->app, mode);
 }
 
+static int nfp_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode)
+{
+	struct nfp_pf *pf = devlink_priv(devlink);
+	int ret;
+
+	mutex_lock(&pf->lock);
+	ret = nfp_app_eswitch_mode_set(pf->app, mode);
+	mutex_unlock(&pf->lock);
+
+	return ret;
+}
+
 const struct devlink_ops nfp_devlink_ops = {
 	.port_split		= nfp_devlink_port_split,
 	.port_unsplit		= nfp_devlink_port_unsplit,
 	.sb_pool_get		= nfp_devlink_sb_pool_get,
 	.sb_pool_set		= nfp_devlink_sb_pool_set,
 	.eswitch_mode_get	= nfp_devlink_eswitch_mode_get,
+	.eswitch_mode_set	= nfp_devlink_eswitch_mode_set,
 };
 
 int nfp_devlink_port_register(struct nfp_app *app, struct nfp_port *port)
-- 
cgit v1.2.3-59-g8ed1b


From d05d902eda23274d8123bc509d896226845eaf94 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 21 May 2018 22:12:52 -0700
Subject: nfp: abm: spawn port netdevs

To configure buffering points we need full set of netdevs:

                              ASIC

 user netdev  -- | -- PCIe port   MAC port -- | --

Configuring egrees qdiscs on user netdev configures standard
Linux TC software qdiscs, configuring PCIe port qdiscs will
provide a way of setting ASIC queuing parameters for PCIe block.
MAC port netdev egress qdiscs correspond to ASIC MAC Traffic
Manager block.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/abm/main.c     | 234 ++++++++++++++++++++--
 drivers/net/ethernet/netronome/nfp/abm/main.h     |   8 +
 drivers/net/ethernet/netronome/nfp/nfp_net_repr.c |   2 +-
 drivers/net/ethernet/netronome/nfp/nfp_net_repr.h |   1 +
 4 files changed, 225 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.c b/drivers/net/ethernet/netronome/nfp/abm/main.c
index 1985e58ab0db..d8f1bdac02b8 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.c
@@ -32,7 +32,12 @@
  * SOFTWARE.
  */
 
+#include <linux/bitfield.h>
 #include <linux/etherdevice.h>
+#include <linux/lockdep.h>
+#include <linux/netdevice.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
 
 #include "../nfpcore/nfp.h"
 #include "../nfpcore/nfp_cpp.h"
@@ -40,9 +45,195 @@
 #include "../nfp_app.h"
 #include "../nfp_main.h"
 #include "../nfp_net.h"
+#include "../nfp_net_repr.h"
 #include "../nfp_port.h"
 #include "main.h"
 
+static u32 nfp_abm_portid(enum nfp_repr_type rtype, unsigned int id)
+{
+	return FIELD_PREP(NFP_ABM_PORTID_TYPE, rtype) |
+	       FIELD_PREP(NFP_ABM_PORTID_ID, id);
+}
+
+static struct net_device *nfp_abm_repr_get(struct nfp_app *app, u32 port_id)
+{
+	enum nfp_repr_type rtype;
+	struct nfp_reprs *reprs;
+	u8 port;
+
+	rtype = FIELD_GET(NFP_ABM_PORTID_TYPE, port_id);
+	port = FIELD_GET(NFP_ABM_PORTID_ID, port_id);
+
+	reprs = rcu_dereference(app->reprs[rtype]);
+	if (!reprs)
+		return NULL;
+
+	if (port >= reprs->num_reprs)
+		return NULL;
+
+	return rcu_dereference(reprs->reprs[port]);
+}
+
+static int
+nfp_abm_spawn_repr(struct nfp_app *app, struct nfp_abm_link *alink,
+		   enum nfp_port_type ptype)
+{
+	struct net_device *netdev;
+	enum nfp_repr_type rtype;
+	struct nfp_reprs *reprs;
+	struct nfp_repr *repr;
+	struct nfp_port *port;
+	int err;
+
+	if (ptype == NFP_PORT_PHYS_PORT)
+		rtype = NFP_REPR_TYPE_PHYS_PORT;
+	else
+		rtype = NFP_REPR_TYPE_PF;
+
+	netdev = nfp_repr_alloc(app);
+	if (!netdev)
+		return -ENOMEM;
+	repr = netdev_priv(netdev);
+	repr->app_priv = alink;
+
+	port = nfp_port_alloc(app, ptype, netdev);
+	if (IS_ERR(port)) {
+		err = PTR_ERR(port);
+		goto err_free_repr;
+	}
+
+	if (ptype == NFP_PORT_PHYS_PORT) {
+		err = nfp_port_init_phy_port(app->pf, app, port, alink->id);
+		if (err)
+			goto err_free_port;
+	} else {
+		port->pf_id = alink->abm->pf_id;
+		port->vnic = alink->vnic->dp.ctrl_bar;
+	}
+
+	SET_NETDEV_DEV(netdev, &alink->vnic->pdev->dev);
+	eth_hw_addr_random(netdev);
+
+	err = nfp_repr_init(app, netdev, nfp_abm_portid(rtype, alink->id),
+			    port, alink->vnic->dp.netdev);
+	if (err)
+		goto err_free_port;
+
+	reprs = nfp_reprs_get_locked(app, rtype);
+	WARN(nfp_repr_get_locked(app, reprs, alink->id), "duplicate repr");
+	rcu_assign_pointer(reprs->reprs[alink->id], netdev);
+
+	nfp_info(app->cpp, "%s Port %d Representor(%s) created\n",
+		 ptype == NFP_PORT_PF_PORT ? "PCIe" : "Phys",
+		 alink->id, netdev->name);
+
+	return 0;
+
+err_free_port:
+	nfp_port_free(port);
+err_free_repr:
+	nfp_repr_free(netdev);
+	return err;
+}
+
+static void
+nfp_abm_kill_repr(struct nfp_app *app, struct nfp_abm_link *alink,
+		  enum nfp_repr_type rtype)
+{
+	struct net_device *netdev;
+	struct nfp_reprs *reprs;
+
+	reprs = nfp_reprs_get_locked(app, rtype);
+	netdev = nfp_repr_get_locked(app, reprs, alink->id);
+	if (!netdev)
+		return;
+	rcu_assign_pointer(reprs->reprs[alink->id], NULL);
+	synchronize_rcu();
+	/* Cast to make sure nfp_repr_clean_and_free() takes a nfp_repr */
+	nfp_repr_clean_and_free((struct nfp_repr *)netdev_priv(netdev));
+}
+
+static void
+nfp_abm_kill_reprs(struct nfp_abm *abm, struct nfp_abm_link *alink)
+{
+	nfp_abm_kill_repr(abm->app, alink, NFP_REPR_TYPE_PF);
+	nfp_abm_kill_repr(abm->app, alink, NFP_REPR_TYPE_PHYS_PORT);
+}
+
+static void nfp_abm_kill_reprs_all(struct nfp_abm *abm)
+{
+	struct nfp_pf *pf = abm->app->pf;
+	struct nfp_net *nn;
+
+	list_for_each_entry(nn, &pf->vnics, vnic_list)
+		nfp_abm_kill_reprs(abm, (struct nfp_abm_link *)nn->app_priv);
+}
+
+static enum devlink_eswitch_mode nfp_abm_eswitch_mode_get(struct nfp_app *app)
+{
+	struct nfp_abm *abm = app->priv;
+
+	return abm->eswitch_mode;
+}
+
+static int nfp_abm_eswitch_set_legacy(struct nfp_abm *abm)
+{
+	nfp_abm_kill_reprs_all(abm);
+
+	abm->eswitch_mode = DEVLINK_ESWITCH_MODE_LEGACY;
+	return 0;
+}
+
+static void nfp_abm_eswitch_clean_up(struct nfp_abm *abm)
+{
+	if (abm->eswitch_mode != DEVLINK_ESWITCH_MODE_LEGACY)
+		WARN_ON(nfp_abm_eswitch_set_legacy(abm));
+}
+
+static int nfp_abm_eswitch_set_switchdev(struct nfp_abm *abm)
+{
+	struct nfp_app *app = abm->app;
+	struct nfp_pf *pf = app->pf;
+	struct nfp_net *nn;
+	int err;
+
+	list_for_each_entry(nn, &pf->vnics, vnic_list) {
+		struct nfp_abm_link *alink = nn->app_priv;
+
+		err = nfp_abm_spawn_repr(app, alink, NFP_PORT_PHYS_PORT);
+		if (err)
+			goto err_kill_all_reprs;
+
+		err = nfp_abm_spawn_repr(app, alink, NFP_PORT_PF_PORT);
+		if (err)
+			goto err_kill_all_reprs;
+	}
+
+	abm->eswitch_mode = DEVLINK_ESWITCH_MODE_SWITCHDEV;
+	return 0;
+
+err_kill_all_reprs:
+	nfp_abm_kill_reprs_all(abm);
+	return err;
+}
+
+static int nfp_abm_eswitch_mode_set(struct nfp_app *app, u16 mode)
+{
+	struct nfp_abm *abm = app->priv;
+
+	if (abm->eswitch_mode == mode)
+		return 0;
+
+	switch (mode) {
+	case DEVLINK_ESWITCH_MODE_LEGACY:
+		return nfp_abm_eswitch_set_legacy(abm);
+	case DEVLINK_ESWITCH_MODE_SWITCHDEV:
+		return nfp_abm_eswitch_set_switchdev(abm);
+	default:
+		return -EINVAL;
+	}
+}
+
 static void
 nfp_abm_vnic_set_mac(struct nfp_pf *pf, struct nfp_abm *abm, struct nfp_net *nn,
 		     unsigned int id)
@@ -87,7 +278,6 @@ nfp_abm_vnic_alloc(struct nfp_app *app, struct nfp_net *nn, unsigned int id)
 {
 	struct nfp_abm *abm = app->priv;
 	struct nfp_abm_link *alink;
-	int err;
 
 	alink = kzalloc(sizeof(*alink), GFP_KERNEL);
 	if (!alink)
@@ -97,41 +287,26 @@ nfp_abm_vnic_alloc(struct nfp_app *app, struct nfp_net *nn, unsigned int id)
 	alink->vnic = nn;
 	alink->id = id;
 
-	nn->port = nfp_port_alloc(app, NFP_PORT_PHYS_PORT, nn->dp.netdev);
-	if (IS_ERR(nn->port)) {
-		err = PTR_ERR(nn->port);
-		goto err_free_alink;
-	}
-
-	err = nfp_app_nic_vnic_init_phy_port(app->pf, app, nn, id);
-	if (err < 0)
-		goto err_free_port;
-	if (nn->port->type == NFP_PORT_INVALID)
-		/* core will kill this vNIC */
-		return 0;
+	netif_keep_dst(nn->dp.netdev);
 
 	nfp_abm_vnic_set_mac(app->pf, abm, nn, id);
 	nfp_abm_ctrl_read_params(alink);
 
 	return 0;
-
-err_free_port:
-	nfp_port_free(nn->port);
-err_free_alink:
-	kfree(alink);
-	return err;
 }
 
 static void nfp_abm_vnic_free(struct nfp_app *app, struct nfp_net *nn)
 {
 	struct nfp_abm_link *alink = nn->app_priv;
 
+	nfp_abm_kill_reprs(alink->abm, alink);
 	kfree(alink);
 }
 
 static int nfp_abm_init(struct nfp_app *app)
 {
 	struct nfp_pf *pf = app->pf;
+	struct nfp_reprs *reprs;
 	struct nfp_abm *abm;
 	int err;
 
@@ -159,8 +334,21 @@ static int nfp_abm_init(struct nfp_app *app)
 	if (err)
 		goto err_free_abm;
 
+	err = -ENOMEM;
+	reprs = nfp_reprs_alloc(pf->max_data_vnics);
+	if (!reprs)
+		goto err_free_abm;
+	RCU_INIT_POINTER(app->reprs[NFP_REPR_TYPE_PHYS_PORT], reprs);
+
+	reprs = nfp_reprs_alloc(pf->max_data_vnics);
+	if (!reprs)
+		goto err_free_phys;
+	RCU_INIT_POINTER(app->reprs[NFP_REPR_TYPE_PF], reprs);
+
 	return 0;
 
+err_free_phys:
+	nfp_reprs_clean_and_free_by_type(app, NFP_REPR_TYPE_PHYS_PORT);
 err_free_abm:
 	kfree(abm);
 	app->priv = NULL;
@@ -171,6 +359,9 @@ static void nfp_abm_clean(struct nfp_app *app)
 {
 	struct nfp_abm *abm = app->priv;
 
+	nfp_abm_eswitch_clean_up(abm);
+	nfp_reprs_clean_and_free_by_type(app, NFP_REPR_TYPE_PF);
+	nfp_reprs_clean_and_free_by_type(app, NFP_REPR_TYPE_PHYS_PORT);
 	kfree(abm);
 	app->priv = NULL;
 }
@@ -184,4 +375,9 @@ const struct nfp_app_type app_abm = {
 
 	.vnic_alloc	= nfp_abm_vnic_alloc,
 	.vnic_free	= nfp_abm_vnic_free,
+
+	.eswitch_mode_get	= nfp_abm_eswitch_mode_get,
+	.eswitch_mode_set	= nfp_abm_eswitch_mode_set,
+
+	.repr_get	= nfp_abm_repr_get,
 };
diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.h b/drivers/net/ethernet/netronome/nfp/abm/main.h
index 3c5a01c96ecd..5938b69b8a84 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.h
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.h
@@ -35,17 +35,25 @@
 #ifndef __NFP_ABM_H__
 #define __NFP_ABM_H__ 1
 
+#include <net/devlink.h>
+
 struct nfp_app;
 struct nfp_net;
 
+#define NFP_ABM_PORTID_TYPE	GENMASK(23, 16)
+#define NFP_ABM_PORTID_ID	GENMASK(7, 0)
+
 /**
  * struct nfp_abm - ABM NIC app structure
  * @app:	back pointer to nfp_app
  * @pf_id:	ID of our PF link
+ * @eswitch_mode:	devlink eswitch mode, advanced functions only visible
+ *			in switchdev mode
  */
 struct nfp_abm {
 	struct nfp_app *app;
 	unsigned int pf_id;
+	enum devlink_eswitch_mode eswitch_mode;
 };
 
 /**
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
index 6e79da91e475..09e87d5f4f72 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
@@ -385,7 +385,7 @@ err_free_netdev:
 	return NULL;
 }
 
-static void nfp_repr_clean_and_free(struct nfp_repr *repr)
+void nfp_repr_clean_and_free(struct nfp_repr *repr)
 {
 	nfp_info(repr->app->cpp, "Destroying Representor(%s)\n",
 		 repr->netdev->name);
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h
index 8dca283c05c3..8366e4f3c623 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h
@@ -127,6 +127,7 @@ int nfp_repr_init(struct nfp_app *app, struct net_device *netdev,
 		  struct net_device *pf_netdev);
 void nfp_repr_free(struct net_device *netdev);
 struct net_device *nfp_repr_alloc(struct nfp_app *app);
+void nfp_repr_clean_and_free(struct nfp_repr *repr);
 void nfp_reprs_clean_and_free(struct nfp_app *app, struct nfp_reprs *reprs);
 void nfp_reprs_clean_and_free_by_type(struct nfp_app *app,
 				      enum nfp_repr_type type);
-- 
cgit v1.2.3-59-g8ed1b


From 1f70036723e10b165627bb11a7809eed80d43a16 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 21 May 2018 22:12:53 -0700
Subject: nfp: abm: force Ethternet port up

Current control firmware does not cater too well to multi-host
applications.  There is no way to check which hosts are up or
otherwise negotiate what the state of the external port (the
Ethernet port) should be.  Make sure the link is up when driver
loads, and don't take it down when Ethernet port netdev is
closed.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/abm/main.c | 14 ++++++++++++++
 drivers/net/ethernet/netronome/nfp/nfp_port.c |  2 ++
 drivers/net/ethernet/netronome/nfp/nfp_port.h |  2 ++
 3 files changed, 18 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.c b/drivers/net/ethernet/netronome/nfp/abm/main.c
index d8f1bdac02b8..7afd24ce79a5 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.c
@@ -103,6 +103,7 @@ nfp_abm_spawn_repr(struct nfp_app *app, struct nfp_abm_link *alink,
 	}
 
 	if (ptype == NFP_PORT_PHYS_PORT) {
+		port->eth_forced = true;
 		err = nfp_port_init_phy_port(app->pf, app, port, alink->id);
 		if (err)
 			goto err_free_port;
@@ -276,8 +277,10 @@ nfp_abm_vnic_set_mac(struct nfp_pf *pf, struct nfp_abm *abm, struct nfp_net *nn,
 static int
 nfp_abm_vnic_alloc(struct nfp_app *app, struct nfp_net *nn, unsigned int id)
 {
+	struct nfp_eth_table_port *eth_port = &app->pf->eth_tbl->ports[id];
 	struct nfp_abm *abm = app->priv;
 	struct nfp_abm_link *alink;
+	int err;
 
 	alink = kzalloc(sizeof(*alink), GFP_KERNEL);
 	if (!alink)
@@ -287,12 +290,23 @@ nfp_abm_vnic_alloc(struct nfp_app *app, struct nfp_net *nn, unsigned int id)
 	alink->vnic = nn;
 	alink->id = id;
 
+	/* This is a multi-host app, make sure MAC/PHY is up, but don't
+	 * make the MAC/PHY state follow the state of any of the ports.
+	 */
+	err = nfp_eth_set_configured(app->cpp, eth_port->index, true);
+	if (err < 0)
+		goto err_free_alink;
+
 	netif_keep_dst(nn->dp.netdev);
 
 	nfp_abm_vnic_set_mac(app->pf, abm, nn, id);
 	nfp_abm_ctrl_read_params(alink);
 
 	return 0;
+
+err_free_alink:
+	kfree(alink);
+	return err;
 }
 
 static void nfp_abm_vnic_free(struct nfp_app *app, struct nfp_net *nn)
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_port.c b/drivers/net/ethernet/netronome/nfp/nfp_port.c
index 7bd8be5c833b..a17f1ace6988 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_port.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_port.c
@@ -218,6 +218,8 @@ int nfp_port_configure(struct net_device *netdev, bool configed)
 	eth_port = __nfp_port_get_eth_port(port);
 	if (!eth_port)
 		return 0;
+	if (port->eth_forced)
+		return 0;
 
 	err = nfp_eth_set_configured(port->app->cpp, eth_port->index, configed);
 	return err < 0 && err != -EOPNOTSUPP ? err : 0;
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_port.h b/drivers/net/ethernet/netronome/nfp/nfp_port.h
index fa7e669a969c..1d9b2b47e27d 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_port.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_port.h
@@ -77,6 +77,7 @@ enum nfp_port_flags {
  * @app:	backpointer to the app structure
  * @dl_port:	devlink port structure
  * @eth_id:	for %NFP_PORT_PHYS_PORT port ID in NFP enumeration scheme
+ * @eth_forced:	for %NFP_PORT_PHYS_PORT port is forced UP or DOWN, don't change
  * @eth_port:	for %NFP_PORT_PHYS_PORT translated ETH Table port entry
  * @eth_stats:	for %NFP_PORT_PHYS_PORT MAC stats if available
  * @pf_id:	for %NFP_PORT_PF_PORT, %NFP_PORT_VF_PORT ID of the PCI PF (0-3)
@@ -99,6 +100,7 @@ struct nfp_port {
 		/* NFP_PORT_PHYS_PORT */
 		struct {
 			unsigned int eth_id;
+			bool eth_forced;
 			struct nfp_eth_table_port *eth_port;
 			u8 __iomem *eth_stats;
 		};
-- 
cgit v1.2.3-59-g8ed1b


From 290f54db317e99edf9fda8ac1d4d27fcd5c7743f Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 21 May 2018 22:12:54 -0700
Subject: nfp: use split in naming of PCIe PF ports

PCI PFs can host more than one logical endpoint.  In NFP terms
this means having more than one vNIC for PCIe PF.  The vNICs
are usually corresponding 1:1 to Ethernet ports.  In core NIC
we use the legacy idea of vNIC *being* the Ethernet port,
hence netdevs put pX(sY) in their phys_port_name, like Ethernet
ports would.  When ASIC ports are fully represented we need to
be able to name different PCIe PF ports, too.  Use a scheme
similar to Ethernet ports - pfXsY, for PCIe PF number X,
sub-port Y.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/abm/main.c | 2 ++
 drivers/net/ethernet/netronome/nfp/nfp_port.c | 6 +++++-
 drivers/net/ethernet/netronome/nfp/nfp_port.h | 4 ++++
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.c b/drivers/net/ethernet/netronome/nfp/abm/main.c
index 7afd24ce79a5..5a12bb20bced 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.c
@@ -109,6 +109,8 @@ nfp_abm_spawn_repr(struct nfp_app *app, struct nfp_abm_link *alink,
 			goto err_free_port;
 	} else {
 		port->pf_id = alink->abm->pf_id;
+		port->pf_split = app->pf->max_data_vnics > 1;
+		port->pf_split_id = alink->id;
 		port->vnic = alink->vnic->dp.ctrl_bar;
 	}
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_port.c b/drivers/net/ethernet/netronome/nfp/nfp_port.c
index a17f1ace6988..9c1298114c70 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_port.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_port.c
@@ -181,7 +181,11 @@ nfp_port_get_phys_port_name(struct net_device *netdev, char *name, size_t len)
 				     eth_port->label_subport);
 		break;
 	case NFP_PORT_PF_PORT:
-		n = snprintf(name, len, "pf%d", port->pf_id);
+		if (!port->pf_split)
+			n = snprintf(name, len, "pf%d", port->pf_id);
+		else
+			n = snprintf(name, len, "pf%ds%d", port->pf_id,
+				     port->pf_split_id);
 		break;
 	case NFP_PORT_VF_PORT:
 		n = snprintf(name, len, "pf%dvf%d", port->pf_id, port->vf_id);
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_port.h b/drivers/net/ethernet/netronome/nfp/nfp_port.h
index 1d9b2b47e27d..18666750456e 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_port.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_port.h
@@ -82,6 +82,8 @@ enum nfp_port_flags {
  * @eth_stats:	for %NFP_PORT_PHYS_PORT MAC stats if available
  * @pf_id:	for %NFP_PORT_PF_PORT, %NFP_PORT_VF_PORT ID of the PCI PF (0-3)
  * @vf_id:	for %NFP_PORT_VF_PORT ID of the PCI VF within @pf_id
+ * @pf_split:	for %NFP_PORT_PF_PORT %true if PCI PF has more than one vNIC
+ * @pf_split_id:for %NFP_PORT_PF_PORT ID of PCI PF vNIC (valid if @pf_split)
  * @vnic:	for %NFP_PORT_PF_PORT, %NFP_PORT_VF_PORT vNIC ctrl memory
  * @port_list:	entry on pf's list of ports
  */
@@ -108,6 +110,8 @@ struct nfp_port {
 		struct {
 			unsigned int pf_id;
 			unsigned int vf_id;
+			bool pf_split;
+			unsigned int pf_split_id;
 			u8 __iomem *vnic;
 		};
 	};
-- 
cgit v1.2.3-59-g8ed1b


From 51c1df83e35ce0e24ca10037c73245cb4c8ac11a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 21 May 2018 22:12:55 -0700
Subject: nfp: assign vNIC id as phys_port_name of vNICs which are not ports

When NFP is modelled as a switch we assign phys_port_name to respective
port(representor )s:

 vNIC0 - | - PF port (pf%d)     MAC/PHY (p%d[s%d]) - |E==

In most cases there is only one vNIC for communication with the switch.
If there is more than one we need to be able to identify them.  Use %d
as phys_port_name of the vNICs.

We don't have to pass ID to nfp_net_debugfs_vnic_add() separately any
more.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_net.h         |  7 +++++--
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c  | 20 +++++++++++++++++++-
 drivers/net/ethernet/netronome/nfp/nfp_net_debugfs.c |  4 ++--
 drivers/net/ethernet/netronome/nfp/nfp_net_main.c    |  4 +++-
 drivers/net/ethernet/netronome/nfp/nfp_netvf_main.c  |  2 +-
 5 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h b/drivers/net/ethernet/netronome/nfp/nfp_net.h
index bd7d8ae31e17..57cb035dcc6d 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h
@@ -545,6 +545,7 @@ struct nfp_net_dp {
 /**
  * struct nfp_net - NFP network device structure
  * @dp:			Datapath structure
+ * @id:			vNIC id within the PF (0 for VFs)
  * @fw_ver:		Firmware version
  * @cap:                Capabilities advertised by the Firmware
  * @max_mtu:            Maximum support MTU advertised by the Firmware
@@ -597,6 +598,8 @@ struct nfp_net {
 
 	struct nfp_net_fw_version fw_ver;
 
+	u32 id;
+
 	u32 cap;
 	u32 max_mtu;
 
@@ -909,7 +912,7 @@ int nfp_net_ring_reconfig(struct nfp_net *nn, struct nfp_net_dp *new,
 void nfp_net_debugfs_create(void);
 void nfp_net_debugfs_destroy(void);
 struct dentry *nfp_net_debugfs_device_add(struct pci_dev *pdev);
-void nfp_net_debugfs_vnic_add(struct nfp_net *nn, struct dentry *ddir, int id);
+void nfp_net_debugfs_vnic_add(struct nfp_net *nn, struct dentry *ddir);
 void nfp_net_debugfs_dir_clean(struct dentry **dir);
 #else
 static inline void nfp_net_debugfs_create(void)
@@ -926,7 +929,7 @@ static inline struct dentry *nfp_net_debugfs_device_add(struct pci_dev *pdev)
 }
 
 static inline void
-nfp_net_debugfs_vnic_add(struct nfp_net *nn, struct dentry *ddir, int id)
+nfp_net_debugfs_vnic_add(struct nfp_net *nn, struct dentry *ddir)
 {
 }
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index d9111c077699..eea11e881bf5 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3277,6 +3277,24 @@ nfp_net_features_check(struct sk_buff *skb, struct net_device *dev,
 	return features;
 }
 
+static int
+nfp_net_get_phys_port_name(struct net_device *netdev, char *name, size_t len)
+{
+	struct nfp_net *nn = netdev_priv(netdev);
+	int n;
+
+	if (nn->port)
+		return nfp_port_get_phys_port_name(netdev, name, len);
+
+	if (!nn->dp.is_vf) {
+		n = snprintf(name, len, "%d", nn->id);
+		if (n >= len)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
 /**
  * nfp_net_set_vxlan_port() - set vxlan port in SW and reconfigure HW
  * @nn:   NFP Net device to reconfigure
@@ -3475,7 +3493,7 @@ const struct net_device_ops nfp_net_netdev_ops = {
 	.ndo_set_mac_address	= nfp_net_set_mac_address,
 	.ndo_set_features	= nfp_net_set_features,
 	.ndo_features_check	= nfp_net_features_check,
-	.ndo_get_phys_port_name	= nfp_port_get_phys_port_name,
+	.ndo_get_phys_port_name	= nfp_net_get_phys_port_name,
 	.ndo_udp_tunnel_add	= nfp_net_add_vxlan_port,
 	.ndo_udp_tunnel_del	= nfp_net_del_vxlan_port,
 	.ndo_bpf		= nfp_net_xdp,
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_debugfs.c b/drivers/net/ethernet/netronome/nfp/nfp_net_debugfs.c
index 67cdd8330c59..099b63d67451 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_debugfs.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_debugfs.c
@@ -201,7 +201,7 @@ static const struct file_operations nfp_xdp_q_fops = {
 	.llseek = seq_lseek
 };
 
-void nfp_net_debugfs_vnic_add(struct nfp_net *nn, struct dentry *ddir, int id)
+void nfp_net_debugfs_vnic_add(struct nfp_net *nn, struct dentry *ddir)
 {
 	struct dentry *queues, *tx, *rx, *xdp;
 	char name[20];
@@ -211,7 +211,7 @@ void nfp_net_debugfs_vnic_add(struct nfp_net *nn, struct dentry *ddir, int id)
 		return;
 
 	if (nfp_net_is_data_vnic(nn))
-		sprintf(name, "vnic%d", id);
+		sprintf(name, "vnic%d", nn->id);
 	else
 		strcpy(name, "ctrl-vnic");
 	nn->debugfs_dir = debugfs_create_dir(name, ddir);
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
index b98422112385..28516eecccc8 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
@@ -178,11 +178,13 @@ nfp_net_pf_init_vnic(struct nfp_pf *pf, struct nfp_net *nn, unsigned int id)
 {
 	int err;
 
+	nn->id = id;
+
 	err = nfp_net_init(nn);
 	if (err)
 		return err;
 
-	nfp_net_debugfs_vnic_add(nn, pf->ddir, id);
+	nfp_net_debugfs_vnic_add(nn, pf->ddir);
 
 	if (nn->port) {
 		err = nfp_devlink_port_register(pf->app, nn->port);
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_netvf_main.c b/drivers/net/ethernet/netronome/nfp/nfp_netvf_main.c
index b802a1d55449..68928c86b698 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_netvf_main.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_netvf_main.c
@@ -283,7 +283,7 @@ static int nfp_netvf_pci_probe(struct pci_dev *pdev,
 
 	nfp_net_info(nn);
 	vf->ddir = nfp_net_debugfs_device_add(pdev);
-	nfp_net_debugfs_vnic_add(nn, vf->ddir, 0);
+	nfp_net_debugfs_vnic_add(nn, vf->ddir);
 
 	return 0;
 
-- 
cgit v1.2.3-59-g8ed1b


From a1f5d1f0dfccea54e1d9c2da16fd8c9b54f81a75 Mon Sep 17 00:00:00 2001
From: Antoine Tenart <antoine.tenart@bootlin.com>
Date: Tue, 22 May 2018 12:17:59 +0200
Subject: net: phy: sfp: warn the user when no tx_disable pin is available

In case no Tx disable pin is available the SFP modules will always be
emitting. This could be an issue when using modules using laser as their
light source as we would have no way to disable it when the fiber is
removed. This patch adds a warning when registering an SFP cage which do
not have its tx_disable pin wired or available.

Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
Acked-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/sfp.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c
index 4ab6e9a50bbe..a91d12209a81 100644
--- a/drivers/net/phy/sfp.c
+++ b/drivers/net/phy/sfp.c
@@ -1065,6 +1065,15 @@ static int sfp_probe(struct platform_device *pdev)
 	if (poll)
 		mod_delayed_work(system_wq, &sfp->poll, poll_jiffies);
 
+	/* We could have an issue in cases no Tx disable pin is available or
+	 * wired as modules using a laser as their light source will continue to
+	 * be active when the fiber is removed. This could be a safety issue and
+	 * we should at least warn the user about that.
+	 */
+	if (!sfp->gpio[GPIO_TX_DISABLE])
+		dev_warn(sfp->dev,
+			 "No tx_disable pin: SFP modules will always be emitting.\n");
+
 	return 0;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 66ede1f9c9466cdf866b4be40a814a140d5df60a Mon Sep 17 00:00:00 2001
From: Antoine Tenart <antoine.tenart@bootlin.com>
Date: Tue, 22 May 2018 12:18:00 +0200
Subject: net: phy: sfp: make the i2c-bus dt property mandatory

This patch makes the i2c-bus property mandatory when using a device
tree. If the sfp i2c bus isn't described it's impossible to guess the
protocol to use for a given module, and the sfp module would then not
work in most cases.

Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/sfp.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c
index a91d12209a81..c4c92db86dfa 100644
--- a/drivers/net/phy/sfp.c
+++ b/drivers/net/phy/sfp.c
@@ -976,6 +976,7 @@ static int sfp_probe(struct platform_device *pdev)
 	if (pdev->dev.of_node) {
 		struct device_node *node = pdev->dev.of_node;
 		const struct of_device_id *id;
+		struct i2c_adapter *i2c;
 		struct device_node *np;
 
 		id = of_match_node(sfp_of_match, node);
@@ -985,19 +986,20 @@ static int sfp_probe(struct platform_device *pdev)
 		sff = sfp->type = id->data;
 
 		np = of_parse_phandle(node, "i2c-bus", 0);
-		if (np) {
-			struct i2c_adapter *i2c;
-
-			i2c = of_find_i2c_adapter_by_node(np);
-			of_node_put(np);
-			if (!i2c)
-				return -EPROBE_DEFER;
-
-			err = sfp_i2c_configure(sfp, i2c);
-			if (err < 0) {
-				i2c_put_adapter(i2c);
-				return err;
-			}
+		if (!np) {
+			dev_err(sfp->dev, "missing 'i2c-bus' property\n");
+			return -ENODEV;
+		}
+
+		i2c = of_find_i2c_adapter_by_node(np);
+		of_node_put(np);
+		if (!i2c)
+			return -EPROBE_DEFER;
+
+		err = sfp_i2c_configure(sfp, i2c);
+		if (err < 0) {
+			i2c_put_adapter(i2c);
+			return err;
 		}
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 3e48439347d56cd1b3d3d07a93bed2b0c6b129fe Mon Sep 17 00:00:00 2001
From: Antoine Tenart <antoine.tenart@bootlin.com>
Date: Tue, 22 May 2018 12:18:01 +0200
Subject: Documentation/bindings: net: the sfp i2c-bus property is now
 mandatory

The i2c-bus property for sfp modules was made mandatory. Update the
documentation to keep it in sync with the driver's behaviour.

Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/devicetree/bindings/net/sff,sfp.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/sff,sfp.txt b/Documentation/devicetree/bindings/net/sff,sfp.txt
index 929591d52ed6..832139919f20 100644
--- a/Documentation/devicetree/bindings/net/sff,sfp.txt
+++ b/Documentation/devicetree/bindings/net/sff,sfp.txt
@@ -7,11 +7,11 @@ Required properties:
   "sff,sfp" for SFP modules
   "sff,sff" for soldered down SFF modules
 
-Optional Properties:
-
 - i2c-bus : phandle of an I2C bus controller for the SFP two wire serial
   interface
 
+Optional Properties:
+
 - mod-def0-gpios : GPIO phandle and a specifier of the MOD-DEF0 (AKA Mod_ABS)
   module presence input gpio signal, active (module absent) high. Must
   not be present for SFF modules
-- 
cgit v1.2.3-59-g8ed1b


From ff06342cbca0ec2c0d5da8685d2691e2d0e9764a Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Tue, 22 May 2018 11:34:39 -0400
Subject: udp: exclude gso from xfrm paths

UDP GSO delays final datagram construction to the GSO layer. This
conflicts with protocol transformations.

Fixes: bec1f6f69736 ("udp: generate gso with UDP_SEGMENT")
CC: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 3 ++-
 net/ipv6/udp.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ff4d4ba67735..d71f1f3e1155 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -788,7 +788,8 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
 			return -EINVAL;
 		if (sk->sk_no_check_tx)
 			return -EINVAL;
-		if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite)
+		if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite ||
+		    dst_xfrm(skb_dst(skb)))
 			return -EIO;
 
 		skb_shinfo(skb)->gso_size = cork->gso_size;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 2839c1bd1e58..426c9d2b418d 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1053,7 +1053,8 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
 			return -EINVAL;
 		if (udp_sk(sk)->no_check6_tx)
 			return -EINVAL;
-		if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite)
+		if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite ||
+		    dst_xfrm(skb_dst(skb)))
 			return -EIO;
 
 		skb_shinfo(skb)->gso_size = cork->gso_size;
-- 
cgit v1.2.3-59-g8ed1b


From 8eea1ca82be90a7e7a4624ab9cb323574a5f71df Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Tue, 22 May 2018 11:34:40 -0400
Subject: gso: limit udp gso to egress-only virtual devices

Until the udp receive stack supports large packets (UDP GRO), GSO
packets must not loop from the egress to the ingress path.

Revert the change that added NETIF_F_GSO_UDP_L4 to various virtual
devices through NETIF_F_GSO_ENCAP_ALL as this included devices that
may loop packets, such as veth and macvlan.

Instead add it to specific devices that forward to another device's
egress path, bonding and team.

Fixes: 83aa025f535f ("udp: add gso support to virtual devices")
CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 5 +++--
 drivers/net/team/team.c         | 5 +++--
 include/linux/netdev_features.h | 1 -
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 06efdf6a762b..fea17b92b1ae 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1107,7 +1107,8 @@ static void bond_compute_features(struct bonding *bond)
 
 done:
 	bond_dev->vlan_features = vlan_features;
-	bond_dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL;
+	bond_dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL |
+				    NETIF_F_GSO_UDP_L4;
 	bond_dev->gso_max_segs = gso_max_segs;
 	netif_set_gso_max_size(bond_dev, gso_max_size);
 
@@ -4268,7 +4269,7 @@ void bond_setup(struct net_device *bond_dev)
 				NETIF_F_HW_VLAN_CTAG_RX |
 				NETIF_F_HW_VLAN_CTAG_FILTER;
 
-	bond_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
+	bond_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL | NETIF_F_GSO_UDP_L4;
 	bond_dev->features |= bond_dev->hw_features;
 }
 
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 9dbd390ace34..d6ff881165d0 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1026,7 +1026,8 @@ static void __team_compute_features(struct team *team)
 	}
 
 	team->dev->vlan_features = vlan_features;
-	team->dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL;
+	team->dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL |
+				     NETIF_F_GSO_UDP_L4;
 	team->dev->hard_header_len = max_hard_header_len;
 
 	team->dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
@@ -2117,7 +2118,7 @@ static void team_setup(struct net_device *dev)
 			   NETIF_F_HW_VLAN_CTAG_RX |
 			   NETIF_F_HW_VLAN_CTAG_FILTER;
 
-	dev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
+	dev->hw_features |= NETIF_F_GSO_ENCAP_ALL | NETIF_F_GSO_UDP_L4;
 	dev->features |= dev->hw_features;
 }
 
diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index c87c3a3453c1..623bb8ced060 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -220,7 +220,6 @@ enum {
 				 NETIF_F_GSO_GRE_CSUM |			\
 				 NETIF_F_GSO_IPXIP4 |			\
 				 NETIF_F_GSO_IPXIP6 |			\
-				 NETIF_F_GSO_UDP_L4 |			\
 				 NETIF_F_GSO_UDP_TUNNEL |		\
 				 NETIF_F_GSO_UDP_TUNNEL_CSUM)
 
-- 
cgit v1.2.3-59-g8ed1b


From 7c6f97475da04ba3e80c62ca8cd271e34290a76e Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Tue, 22 May 2018 17:18:09 +0100
Subject: net: vxge: fix spelling mistake in macro
 VXGE_HW_ERR_PRIVILAGED_OPEARATION

Rename VXGE_HW_ERR_PRIVILAGED_OPEARATION to VXGE_HW_ERR_PRIVILEGED_OPERATION
to fix spelling mistake.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/neterion/vxge/vxge-config.c  | 12 ++++++------
 drivers/net/ethernet/neterion/vxge/vxge-config.h  |  2 +-
 drivers/net/ethernet/neterion/vxge/vxge-ethtool.c |  2 +-
 drivers/net/ethernet/neterion/vxge/vxge-main.c    |  4 ++--
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/neterion/vxge/vxge-config.c b/drivers/net/ethernet/neterion/vxge/vxge-config.c
index 6223930a8155..c60da9e8bf14 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-config.c
+++ b/drivers/net/ethernet/neterion/vxge/vxge-config.c
@@ -693,7 +693,7 @@ __vxge_hw_device_is_privilaged(u32 host_type, u32 func_id)
 		VXGE_HW_DEVICE_ACCESS_RIGHT_MRPCIM)
 		return VXGE_HW_OK;
 	else
-		return VXGE_HW_ERR_PRIVILAGED_OPEARATION;
+		return VXGE_HW_ERR_PRIVILEGED_OPERATION;
 }
 
 /*
@@ -1920,7 +1920,7 @@ enum vxge_hw_status vxge_hw_device_getpause_data(struct __vxge_hw_device *hldev,
 	}
 
 	if (!(hldev->access_rights & VXGE_HW_DEVICE_ACCESS_RIGHT_MRPCIM)) {
-		status = VXGE_HW_ERR_PRIVILAGED_OPEARATION;
+		status = VXGE_HW_ERR_PRIVILEGED_OPERATION;
 		goto exit;
 	}
 
@@ -3153,7 +3153,7 @@ vxge_hw_mgmt_reg_read(struct __vxge_hw_device *hldev,
 	case vxge_hw_mgmt_reg_type_mrpcim:
 		if (!(hldev->access_rights &
 			VXGE_HW_DEVICE_ACCESS_RIGHT_MRPCIM)) {
-			status = VXGE_HW_ERR_PRIVILAGED_OPEARATION;
+			status = VXGE_HW_ERR_PRIVILEGED_OPERATION;
 			break;
 		}
 		if (offset > sizeof(struct vxge_hw_mrpcim_reg) - 8) {
@@ -3165,7 +3165,7 @@ vxge_hw_mgmt_reg_read(struct __vxge_hw_device *hldev,
 	case vxge_hw_mgmt_reg_type_srpcim:
 		if (!(hldev->access_rights &
 			VXGE_HW_DEVICE_ACCESS_RIGHT_SRPCIM)) {
-			status = VXGE_HW_ERR_PRIVILAGED_OPEARATION;
+			status = VXGE_HW_ERR_PRIVILEGED_OPERATION;
 			break;
 		}
 		if (index > VXGE_HW_TITAN_SRPCIM_REG_SPACES - 1) {
@@ -3279,7 +3279,7 @@ vxge_hw_mgmt_reg_write(struct __vxge_hw_device *hldev,
 	case vxge_hw_mgmt_reg_type_mrpcim:
 		if (!(hldev->access_rights &
 			VXGE_HW_DEVICE_ACCESS_RIGHT_MRPCIM)) {
-			status = VXGE_HW_ERR_PRIVILAGED_OPEARATION;
+			status = VXGE_HW_ERR_PRIVILEGED_OPERATION;
 			break;
 		}
 		if (offset > sizeof(struct vxge_hw_mrpcim_reg) - 8) {
@@ -3291,7 +3291,7 @@ vxge_hw_mgmt_reg_write(struct __vxge_hw_device *hldev,
 	case vxge_hw_mgmt_reg_type_srpcim:
 		if (!(hldev->access_rights &
 			VXGE_HW_DEVICE_ACCESS_RIGHT_SRPCIM)) {
-			status = VXGE_HW_ERR_PRIVILAGED_OPEARATION;
+			status = VXGE_HW_ERR_PRIVILEGED_OPERATION;
 			break;
 		}
 		if (index > VXGE_HW_TITAN_SRPCIM_REG_SPACES - 1) {
diff --git a/drivers/net/ethernet/neterion/vxge/vxge-config.h b/drivers/net/ethernet/neterion/vxge/vxge-config.h
index cfa970417f81..d743a37a3cee 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-config.h
+++ b/drivers/net/ethernet/neterion/vxge/vxge-config.h
@@ -127,7 +127,7 @@ enum vxge_hw_status {
 	VXGE_HW_ERR_INVALID_TCODE 		  = VXGE_HW_BASE_ERR + 14,
 	VXGE_HW_ERR_INVALID_BLOCK_SIZE		  = VXGE_HW_BASE_ERR + 15,
 	VXGE_HW_ERR_INVALID_STATE		  = VXGE_HW_BASE_ERR + 16,
-	VXGE_HW_ERR_PRIVILAGED_OPEARATION	  = VXGE_HW_BASE_ERR + 17,
+	VXGE_HW_ERR_PRIVILEGED_OPERATION	  = VXGE_HW_BASE_ERR + 17,
 	VXGE_HW_ERR_INVALID_PORT 		  = VXGE_HW_BASE_ERR + 18,
 	VXGE_HW_ERR_FIFO		 	  = VXGE_HW_BASE_ERR + 19,
 	VXGE_HW_ERR_VPATH			  = VXGE_HW_BASE_ERR + 20,
diff --git a/drivers/net/ethernet/neterion/vxge/vxge-ethtool.c b/drivers/net/ethernet/neterion/vxge/vxge-ethtool.c
index 0452848d1316..03c3d1230c17 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-ethtool.c
+++ b/drivers/net/ethernet/neterion/vxge/vxge-ethtool.c
@@ -276,7 +276,7 @@ static void vxge_get_ethtool_stats(struct net_device *dev,
 	*ptr++ = 0;
 	status = vxge_hw_device_xmac_stats_get(hldev, xmac_stats);
 	if (status != VXGE_HW_OK) {
-		if (status != VXGE_HW_ERR_PRIVILAGED_OPEARATION) {
+		if (status != VXGE_HW_ERR_PRIVILEGED_OPERATION) {
 			vxge_debug_init(VXGE_ERR,
 				"%s : %d Failure in getting xmac stats",
 				__func__, __LINE__);
diff --git a/drivers/net/ethernet/neterion/vxge/vxge-main.c b/drivers/net/ethernet/neterion/vxge/vxge-main.c
index b2299f2b2155..a8918bb7c802 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-main.c
+++ b/drivers/net/ethernet/neterion/vxge/vxge-main.c
@@ -3484,11 +3484,11 @@ static int vxge_device_register(struct __vxge_hw_device *hldev,
 				0,
 				&stat);
 
-	if (status == VXGE_HW_ERR_PRIVILAGED_OPEARATION)
+	if (status == VXGE_HW_ERR_PRIVILEGED_OPERATION)
 		vxge_debug_init(
 			vxge_hw_device_trace_level_get(hldev),
 			"%s: device stats clear returns"
-			"VXGE_HW_ERR_PRIVILAGED_OPEARATION", ndev->name);
+			"VXGE_HW_ERR_PRIVILEGED_OPERATION", ndev->name);
 
 	vxge_debug_entryexit(vxge_hw_device_trace_level_get(hldev),
 		"%s: %s:%d  Exiting...",
-- 
cgit v1.2.3-59-g8ed1b


From 273de02ae7d828d72d66eb5fce181e1f3f96d0cd Mon Sep 17 00:00:00 2001
From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Tue, 22 May 2018 11:29:34 -0700
Subject: hv_netvsc: Add handlers for ethtool get/set msg level

The handlers for ethtool get/set msg level are missing from netvsc.
This patch adds them.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/netvsc_drv.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index da07ccdf84bf..60a5769ef5a1 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -1618,8 +1618,24 @@ static int netvsc_set_ringparam(struct net_device *ndev,
 	return ret;
 }
 
+static u32 netvsc_get_msglevel(struct net_device *ndev)
+{
+	struct net_device_context *ndev_ctx = netdev_priv(ndev);
+
+	return ndev_ctx->msg_enable;
+}
+
+static void netvsc_set_msglevel(struct net_device *ndev, u32 val)
+{
+	struct net_device_context *ndev_ctx = netdev_priv(ndev);
+
+	ndev_ctx->msg_enable = val;
+}
+
 static const struct ethtool_ops ethtool_ops = {
 	.get_drvinfo	= netvsc_get_drvinfo,
+	.get_msglevel	= netvsc_get_msglevel,
+	.set_msglevel	= netvsc_set_msglevel,
 	.get_link	= ethtool_op_get_link,
 	.get_ethtool_stats = netvsc_get_ethtool_stats,
 	.get_sset_count = netvsc_get_sset_count,
-- 
cgit v1.2.3-59-g8ed1b


From 404eb77ea766260c45cb05c4a8043b13bd7142d5 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Tue, 22 May 2018 14:03:27 -0700
Subject: ipv4: support sport, dport and ip_proto in RTM_GETROUTE

This is a followup to fib rules sport, dport and ipproto
match support. Only supports tcp, udp and icmp for ipproto.
Used by fib rule self tests.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h               |   3 +
 include/uapi/linux/rtnetlink.h |   3 +
 net/ipv4/Makefile              |   2 +-
 net/ipv4/fib_frontend.c        |   3 +
 net/ipv4/netlink.c             |  23 +++++++
 net/ipv4/route.c               | 146 ++++++++++++++++++++++++++++++-----------
 6 files changed, 140 insertions(+), 40 deletions(-)
 create mode 100644 net/ipv4/netlink.c

diff --git a/include/net/ip.h b/include/net/ip.h
index bada1f1f871e..0d2281b4b27a 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -664,4 +664,7 @@ extern int sysctl_icmp_msgs_burst;
 int ip_misc_proc_init(void);
 #endif
 
+int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto,
+				struct netlink_ext_ack *extack);
+
 #endif	/* _IP_H */
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 9b15005955fa..cabb210c93af 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -327,6 +327,9 @@ enum rtattr_type_t {
 	RTA_PAD,
 	RTA_UID,
 	RTA_TTL_PROPAGATE,
+	RTA_IP_PROTO,
+	RTA_SPORT,
+	RTA_DPORT,
 	__RTA_MAX
 };
 
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 7018f91c5a39..eec9569ffa5c 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -14,7 +14,7 @@ obj-y     := route.o inetpeer.o protocol.o \
 	     udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
 	     fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
 	     inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
-	     metrics.o
+	     metrics.o netlink.o
 
 obj-$(CONFIG_BPFILTER) += bpfilter/
 
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 4d622112bf95..897ae92dff0f 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -649,6 +649,9 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
 	[RTA_ENCAP]		= { .type = NLA_NESTED },
 	[RTA_UID]		= { .type = NLA_U32 },
 	[RTA_MARK]		= { .type = NLA_U32 },
+	[RTA_IP_PROTO]		= { .type = NLA_U8 },
+	[RTA_SPORT]		= { .type = NLA_U16 },
+	[RTA_DPORT]		= { .type = NLA_U16 },
 };
 
 static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
diff --git a/net/ipv4/netlink.c b/net/ipv4/netlink.c
new file mode 100644
index 000000000000..f86bb4f06609
--- /dev/null
+++ b/net/ipv4/netlink.c
@@ -0,0 +1,23 @@
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/types.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/ip.h>
+
+int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto,
+				struct netlink_ext_ack *extack)
+{
+	*ip_proto = nla_get_u8(attr);
+
+	switch (*ip_proto) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_ICMP:
+		return 0;
+	default:
+		NL_SET_ERR_MSG(extack, "Unsupported ip proto");
+		return -EOPNOTSUPP;
+	}
+}
+EXPORT_SYMBOL_GPL(rtm_getroute_parse_ip_proto);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2cfa1b518f8d..0e401dc4e1bd 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2574,11 +2574,10 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
 EXPORT_SYMBOL_GPL(ip_route_output_flow);
 
 /* called with rcu_read_lock held */
-static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
-			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
-			u32 seq)
+static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
+			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
+			struct sk_buff *skb, u32 portid, u32 seq)
 {
-	struct rtable *rt = skb_rtable(skb);
 	struct rtmsg *r;
 	struct nlmsghdr *nlh;
 	unsigned long expires = 0;
@@ -2674,7 +2673,7 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
 			}
 		} else
 #endif
-			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
+			if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
 				goto nla_put_failure;
 	}
 
@@ -2689,43 +2688,93 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
+						   u8 ip_proto, __be16 sport,
+						   __be16 dport)
+{
+	struct sk_buff *skb;
+	struct iphdr *iph;
+
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb)
+		return NULL;
+
+	/* Reserve room for dummy headers, this skb can pass
+	 * through good chunk of routing engine.
+	 */
+	skb_reset_mac_header(skb);
+	skb_reset_network_header(skb);
+	skb->protocol = htons(ETH_P_IP);
+	iph = skb_put(skb, sizeof(struct iphdr));
+	iph->protocol = ip_proto;
+	iph->saddr = src;
+	iph->daddr = dst;
+	iph->version = 0x4;
+	iph->frag_off = 0;
+	iph->ihl = 0x5;
+	skb_set_transport_header(skb, skb->len);
+
+	switch (iph->protocol) {
+	case IPPROTO_UDP: {
+		struct udphdr *udph;
+
+		udph = skb_put_zero(skb, sizeof(struct udphdr));
+		udph->source = sport;
+		udph->dest = dport;
+		udph->len = sizeof(struct udphdr);
+		udph->check = 0;
+		break;
+	}
+	case IPPROTO_TCP: {
+		struct tcphdr *tcph;
+
+		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
+		tcph->source	= sport;
+		tcph->dest	= dport;
+		tcph->doff	= sizeof(struct tcphdr) / 4;
+		tcph->rst = 1;
+		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
+					    src, dst, 0);
+		break;
+	}
+	case IPPROTO_ICMP: {
+		struct icmphdr *icmph;
+
+		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
+		icmph->type = ICMP_ECHO;
+		icmph->code = 0;
+	}
+	}
+
+	return skb;
+}
+
 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 			     struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(in_skb->sk);
-	struct rtmsg *rtm;
 	struct nlattr *tb[RTA_MAX+1];
+	u32 table_id = RT_TABLE_MAIN;
+	__be16 sport = 0, dport = 0;
 	struct fib_result res = {};
+	u8 ip_proto = IPPROTO_UDP;
 	struct rtable *rt = NULL;
+	struct sk_buff *skb;
+	struct rtmsg *rtm;
 	struct flowi4 fl4;
 	__be32 dst = 0;
 	__be32 src = 0;
+	kuid_t uid;
 	u32 iif;
 	int err;
 	int mark;
-	struct sk_buff *skb;
-	u32 table_id = RT_TABLE_MAIN;
-	kuid_t uid;
 
 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
 			  extack);
 	if (err < 0)
-		goto errout;
+		return err;
 
 	rtm = nlmsg_data(nlh);
-
-	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
-	if (!skb) {
-		err = -ENOBUFS;
-		goto errout;
-	}
-
-	/* Reserve room for dummy headers, this skb can pass
-	   through good chunk of routing engine.
-	 */
-	skb_reset_mac_header(skb);
-	skb_reset_network_header(skb);
-
 	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
 	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
@@ -2735,14 +2784,22 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	else
 		uid = (iif ? INVALID_UID : current_uid());
 
-	/* Bugfix: need to give ip_route_input enough of an IP header to
-	 * not gag.
-	 */
-	ip_hdr(skb)->protocol = IPPROTO_UDP;
-	ip_hdr(skb)->saddr = src;
-	ip_hdr(skb)->daddr = dst;
+	if (tb[RTA_IP_PROTO]) {
+		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
+						  &ip_proto, extack);
+		if (err)
+			return err;
+	}
+
+	if (tb[RTA_SPORT])
+		sport = nla_get_be16(tb[RTA_SPORT]);
 
-	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
+	if (tb[RTA_DPORT])
+		dport = nla_get_be16(tb[RTA_DPORT]);
+
+	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
+	if (!skb)
+		return -ENOBUFS;
 
 	memset(&fl4, 0, sizeof(fl4));
 	fl4.daddr = dst;
@@ -2751,6 +2808,11 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
 	fl4.flowi4_mark = mark;
 	fl4.flowi4_uid = uid;
+	if (sport)
+		fl4.fl4_sport = sport;
+	if (dport)
+		fl4.fl4_dport = dport;
+	fl4.flowi4_proto = ip_proto;
 
 	rcu_read_lock();
 
@@ -2760,10 +2822,10 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 		dev = dev_get_by_index_rcu(net, iif);
 		if (!dev) {
 			err = -ENODEV;
-			goto errout_free;
+			goto errout_rcu;
 		}
 
-		skb->protocol	= htons(ETH_P_IP);
+		fl4.flowi4_iif = iif; /* for rt_fill_info */
 		skb->dev	= dev;
 		skb->mark	= mark;
 		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
@@ -2783,7 +2845,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	}
 
 	if (err)
-		goto errout_free;
+		goto errout_rcu;
 
 	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		rt->rt_flags |= RTCF_NOTIFY;
@@ -2791,34 +2853,40 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
 		table_id = res.table ? res.table->tb_id : 0;
 
+	/* reset skb for netlink reply msg */
+	skb_trim(skb, 0);
+	skb_reset_network_header(skb);
+	skb_reset_transport_header(skb);
+	skb_reset_mac_header(skb);
+
 	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
 		if (!res.fi) {
 			err = fib_props[res.type].error;
 			if (!err)
 				err = -EHOSTUNREACH;
-			goto errout_free;
+			goto errout_rcu;
 		}
 		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
 				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
 				    rt->rt_type, res.prefix, res.prefixlen,
 				    fl4.flowi4_tos, res.fi, 0);
 	} else {
-		err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
+		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
 				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
 	}
 	if (err < 0)
-		goto errout_free;
+		goto errout_rcu;
 
 	rcu_read_unlock();
 
 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
-errout:
-	return err;
 
 errout_free:
+	return err;
+errout_rcu:
 	rcu_read_unlock();
 	kfree_skb(skb);
-	goto errout;
+	goto errout_free;
 }
 
 void ip_rt_multicast_event(struct in_device *in_dev)
-- 
cgit v1.2.3-59-g8ed1b


From eacb9384a3fec5ea1fd227f37d2b3bcd153d73a2 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Tue, 22 May 2018 14:03:28 -0700
Subject: ipv6: support sport, dport and ip_proto in RTM_GETROUTE

This is a followup to fib6 rules sport, dport and ipproto
match support. Only supports tcp, udp and icmp for ipproto.
Used by fib rule self tests.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index bcb8785c0451..038d661d5ffc 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -63,6 +63,7 @@
 #include <net/lwtunnel.h>
 #include <net/ip_tunnels.h>
 #include <net/l3mdev.h>
+#include <net/ip.h>
 #include <trace/events/fib6.h>
 
 #include <linux/uaccess.h>
@@ -4083,6 +4084,9 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
 	[RTA_UID]		= { .type = NLA_U32 },
 	[RTA_MARK]		= { .type = NLA_U32 },
 	[RTA_TABLE]		= { .type = NLA_U32 },
+	[RTA_IP_PROTO]		= { .type = NLA_U8 },
+	[RTA_SPORT]		= { .type = NLA_U16 },
+	[RTA_DPORT]		= { .type = NLA_U16 },
 };
 
 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -4795,6 +4799,19 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	else
 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
 
+	if (tb[RTA_SPORT])
+		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
+
+	if (tb[RTA_DPORT])
+		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
+
+	if (tb[RTA_IP_PROTO]) {
+		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
+						  &fl6.flowi6_proto, extack);
+		if (err)
+			goto errout;
+	}
+
 	if (iif) {
 		struct net_device *dev;
 		int flags = 0;
-- 
cgit v1.2.3-59-g8ed1b


From 65b2b4939a643861ed1660eccd6dc0b5d8c78afa Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Tue, 22 May 2018 14:03:29 -0700
Subject: selftests: net: initial fib rule tests

This adds a first set of tests for fib rule match/action for
ipv4 and ipv6. Initial tests only cover action lookup table.
can be extended to cover other actions in the future.
Uses ip route get to validate the rule lookup.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/Makefile          |   2 +-
 tools/testing/selftests/net/fib_rule_tests.sh | 248 ++++++++++++++++++++++++++
 2 files changed, 249 insertions(+), 1 deletion(-)
 create mode 100755 tools/testing/selftests/net/fib_rule_tests.sh

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index e60dddbf963c..7cb0f49efdb7 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -6,7 +6,7 @@ CFLAGS += -I../../../../usr/include/
 
 TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh
 TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh
-TEST_PROGS += udpgso_bench.sh
+TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh
 TEST_PROGS_EXTENDED := in_netns.sh
 TEST_GEN_FILES =  socket
 TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy
diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh
new file mode 100755
index 000000000000..d4cfb6a7a086
--- /dev/null
+++ b/tools/testing/selftests/net/fib_rule_tests.sh
@@ -0,0 +1,248 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test is for checking IPv4 and IPv6 FIB rules API
+
+ret=0
+
+PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no}
+IP="ip -netns testns"
+
+RTABLE=100
+GW_IP4=192.51.100.2
+SRC_IP=192.51.100.3
+GW_IP6=2001:db8:1::2
+SRC_IP6=2001:db8:1::3
+
+DEV_ADDR=192.51.100.1
+DEV=dummy0
+
+log_test()
+{
+	local rc=$1
+	local expected=$2
+	local msg="$3"
+
+	if [ ${rc} -eq ${expected} ]; then
+		nsuccess=$((nsuccess+1))
+		printf "\n    TEST: %-50s  [ OK ]\n" "${msg}"
+	else
+		nfail=$((nfail+1))
+		printf "\n    TEST: %-50s  [FAIL]\n" "${msg}"
+		if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
+			echo
+			echo "hit enter to continue, 'q' to quit"
+			read a
+			[ "$a" = "q" ] && exit 1
+		fi
+	fi
+}
+
+log_section()
+{
+	echo
+	echo "######################################################################"
+	echo "TEST SECTION: $*"
+	echo "######################################################################"
+}
+
+setup()
+{
+	set -e
+	ip netns add testns
+	$IP link set dev lo up
+
+	$IP link add dummy0 type dummy
+	$IP link set dev dummy0 up
+	$IP address add 198.51.100.1/24 dev dummy0
+	$IP -6 address add 2001:db8:1::1/64 dev dummy0
+
+	set +e
+}
+
+cleanup()
+{
+	$IP link del dev dummy0 &> /dev/null
+	ip netns del testns
+}
+
+fib_check_iproute_support()
+{
+	ip rule help 2>&1 | grep -q $1
+	if [ $? -ne 0 ]; then
+		echo "SKIP: iproute2 iprule too old, missing $1 match"
+		return 1
+	fi
+
+	ip route get help 2>&1 | grep -q $2
+	if [ $? -ne 0 ]; then
+		echo "SKIP: iproute2 get route too old, missing $2 match"
+		return 1
+	fi
+
+	return 0
+}
+
+fib_rule6_del()
+{
+	$IP -6 rule del $1
+	log_test $? 0 "rule6 del $1"
+}
+
+fib_rule6_del_by_pref()
+{
+	pref=$($IP -6 rule show | grep "$1 lookup $TABLE" | cut -d ":" -f 1)
+	$IP -6 rule del pref $pref
+}
+
+fib_rule6_test_match_n_redirect()
+{
+	local match="$1"
+	local getmatch="$2"
+
+	$IP -6 rule add $match table $RTABLE
+	$IP -6 route get $GW_IP6 $getmatch | grep -q "table $RTABLE"
+	log_test $? 0 "rule6 check: $1"
+
+	fib_rule6_del_by_pref "$match"
+	log_test $? 0 "rule6 del by pref: $match"
+}
+
+fib_rule6_test()
+{
+	# setup the fib rule redirect route
+	$IP -6 route add table $RTABLE default via $GW_IP6 dev $DEV onlink
+
+	match="oif $DEV"
+	fib_rule6_test_match_n_redirect "$match" "$match" "oif redirect to table"
+
+	match="from $SRC_IP6 iif $DEV"
+	fib_rule6_test_match_n_redirect "$match" "$match" "iif redirect to table"
+
+	match="tos 0x10"
+	fib_rule6_test_match_n_redirect "$match" "$match" "tos redirect to table"
+
+	match="fwmark 0x64"
+	getmatch="mark 0x64"
+	fib_rule6_test_match_n_redirect "$match" "$getmatch" "fwmark redirect to table"
+
+	fib_check_iproute_support "uidrange" "uid"
+	if [ $? -eq 0 ]; then
+		match="uidrange 100-100"
+		getmatch="uid 100"
+		fib_rule6_test_match_n_redirect "$match" "$getmatch" "uid redirect to table"
+	fi
+
+	fib_check_iproute_support "sport" "sport"
+	if [ $? -eq 0 ]; then
+		match="sport 666 dport 777"
+		fib_rule6_test_match_n_redirect "$match" "$match" "sport and dport redirect to table"
+	fi
+
+	fib_check_iproute_support "ipproto" "ipproto"
+	if [ $? -eq 0 ]; then
+		match="ipproto tcp"
+		fib_rule6_test_match_n_redirect "$match" "$match" "ipproto match"
+	fi
+
+	fib_check_iproute_support "ipproto" "ipproto"
+	if [ $? -eq 0 ]; then
+		match="ipproto icmp"
+		fib_rule6_test_match_n_redirect "$match" "$match" "ipproto icmp match"
+	fi
+}
+
+fib_rule4_del()
+{
+	$IP rule del $1
+	log_test $? 0 "del $1"
+}
+
+fib_rule4_del_by_pref()
+{
+	pref=$($IP rule show | grep "$1 lookup $TABLE" | cut -d ":" -f 1)
+	$IP rule del pref $pref
+}
+
+fib_rule4_test_match_n_redirect()
+{
+	local match="$1"
+	local getmatch="$2"
+
+	$IP rule add $match table $RTABLE
+	$IP route get $GW_IP4 $getmatch | grep -q "table $RTABLE"
+	log_test $? 0 "rule4 check: $1"
+
+	fib_rule4_del_by_pref "$match"
+	log_test $? 0 "rule4 del by pref: $match"
+}
+
+fib_rule4_test()
+{
+	# setup the fib rule redirect route
+	$IP route add table $RTABLE default via $GW_IP4 dev $DEV onlink
+
+	match="oif $DEV"
+	fib_rule4_test_match_n_redirect "$match" "$match" "oif redirect to table"
+
+	match="from $SRC_IP iif $DEV"
+	fib_rule4_test_match_n_redirect "$match" "$match" "iif redirect to table"
+
+	match="tos 0x10"
+	fib_rule4_test_match_n_redirect "$match" "$match" "tos redirect to table"
+
+	match="fwmark 0x64"
+	getmatch="mark 0x64"
+	fib_rule4_test_match_n_redirect "$match" "$getmatch" "fwmark redirect to table"
+
+	fib_check_iproute_support "uidrange" "uid"
+	if [ $? -eq 0 ]; then
+		match="uidrange 100-100"
+		getmatch="uid 100"
+		fib_rule4_test_match_n_redirect "$match" "$getmatch" "uid redirect to table"
+	fi
+
+	fib_check_iproute_support "sport" "sport"
+	if [ $? -eq 0 ]; then
+		match="sport 666 dport 777"
+		fib_rule4_test_match_n_redirect "$match" "$match" "sport and dport redirect to table"
+	fi
+
+	fib_check_iproute_support "ipproto" "ipproto"
+	if [ $? -eq 0 ]; then
+		match="ipproto tcp"
+		fib_rule4_test_match_n_redirect "$match" "$match" "ipproto tcp match"
+	fi
+
+	fib_check_iproute_support "ipproto" "ipproto"
+	if [ $? -eq 0 ]; then
+		match="ipproto icmp"
+		fib_rule4_test_match_n_redirect "$match" "$match" "ipproto icmp match"
+	fi
+}
+
+run_fibrule_tests()
+{
+	log_section "IPv4 fib rule"
+	fib_rule4_test
+	log_section "IPv6 fib rule"
+	fib_rule6_test
+}
+
+if [ "$(id -u)" -ne 0 ];then
+	echo "SKIP: Need root privileges"
+	exit 0
+fi
+
+if [ ! -x "$(command -v ip)" ]; then
+	echo "SKIP: Could not run test without ip tool"
+	exit 0
+fi
+
+# start clean
+cleanup &> /dev/null
+setup
+run_fibrule_tests
+cleanup
+
+exit $ret
-- 
cgit v1.2.3-59-g8ed1b


From 9d3df886d17b5ef73d4018841ef0a349fcd109ea Mon Sep 17 00:00:00 2001
From: Christian Brauner <christianvanbrauner@gmail.com>
Date: Tue, 22 May 2018 21:34:21 +0200
Subject: selftests: uevent filtering

Recent discussions around uevent filtering (cf. net-next commit [1], [2],
and [3] and discussions in [4], [5], and [6]) have shown that the semantics
around uevent filtering where not well understood.
Now that we have settled - at least for the moment - how uevent filtering
should look like let's add some selftests to ensure we don't regress
anything in the future.
Note, the semantics of uevent filtering are described in detail in my
commit message to [2] so I won't repeat them here.

[1]: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=90d52d4fd82007005125d9a8d2d560a1ca059b9d
[2]: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=a3498436b3a0f8ec289e6847e1de40b4123e1639
[3]: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=26045a7b14bc7a5455e411d820110f66557d6589
[4]: https://lkml.org/lkml/2018/4/4/739
[5]: https://lkml.org/lkml/2018/4/26/767
[6]: https://lkml.org/lkml/2018/4/26/738

Signed-off-by: Christian Brauner <christian@brauner.io>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/uevent/Makefile           |  17 +
 tools/testing/selftests/uevent/config             |   2 +
 tools/testing/selftests/uevent/uevent_filtering.c | 486 ++++++++++++++++++++++
 3 files changed, 505 insertions(+)
 create mode 100644 tools/testing/selftests/uevent/Makefile
 create mode 100644 tools/testing/selftests/uevent/config
 create mode 100644 tools/testing/selftests/uevent/uevent_filtering.c

diff --git a/tools/testing/selftests/uevent/Makefile b/tools/testing/selftests/uevent/Makefile
new file mode 100644
index 000000000000..f7baa9aa2932
--- /dev/null
+++ b/tools/testing/selftests/uevent/Makefile
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0
+all:
+
+include ../lib.mk
+
+.PHONY: all clean
+
+BINARIES := uevent_filtering
+CFLAGS += -Wl,-no-as-needed -Wall
+
+uevent_filtering: uevent_filtering.c ../kselftest.h ../kselftest_harness.h
+	$(CC) $(CFLAGS) $< -o $@
+
+TEST_PROGS += $(BINARIES)
+EXTRA_CLEAN := $(BINARIES)
+
+all: $(BINARIES)
diff --git a/tools/testing/selftests/uevent/config b/tools/testing/selftests/uevent/config
new file mode 100644
index 000000000000..1038f4515be8
--- /dev/null
+++ b/tools/testing/selftests/uevent/config
@@ -0,0 +1,2 @@
+CONFIG_USER_NS=y
+CONFIG_NET=y
diff --git a/tools/testing/selftests/uevent/uevent_filtering.c b/tools/testing/selftests/uevent/uevent_filtering.c
new file mode 100644
index 000000000000..f83391aa42cf
--- /dev/null
+++ b/tools/testing/selftests/uevent/uevent_filtering.c
@@ -0,0 +1,486 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/netlink.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/prctl.h>
+#include <sys/socket.h>
+#include <sched.h>
+#include <sys/eventfd.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+#include "../kselftest_harness.h"
+
+#define __DEV_FULL "/sys/devices/virtual/mem/full/uevent"
+#define __UEVENT_BUFFER_SIZE (2048 * 2)
+#define __UEVENT_HEADER "add@/devices/virtual/mem/full"
+#define __UEVENT_HEADER_LEN sizeof("add@/devices/virtual/mem/full")
+#define __UEVENT_LISTEN_ALL -1
+
+ssize_t read_nointr(int fd, void *buf, size_t count)
+{
+	ssize_t ret;
+
+again:
+	ret = read(fd, buf, count);
+	if (ret < 0 && errno == EINTR)
+		goto again;
+
+	return ret;
+}
+
+ssize_t write_nointr(int fd, const void *buf, size_t count)
+{
+	ssize_t ret;
+
+again:
+	ret = write(fd, buf, count);
+	if (ret < 0 && errno == EINTR)
+		goto again;
+
+	return ret;
+}
+
+int wait_for_pid(pid_t pid)
+{
+	int status, ret;
+
+again:
+	ret = waitpid(pid, &status, 0);
+	if (ret == -1) {
+		if (errno == EINTR)
+			goto again;
+
+		return -1;
+	}
+
+	if (ret != pid)
+		goto again;
+
+	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
+		return -1;
+
+	return 0;
+}
+
+static int uevent_listener(unsigned long post_flags, bool expect_uevent,
+			   int sync_fd)
+{
+	int sk_fd, ret;
+	socklen_t sk_addr_len;
+	int fret = -1, rcv_buf_sz = __UEVENT_BUFFER_SIZE;
+	uint64_t sync_add = 1;
+	struct sockaddr_nl sk_addr = { 0 }, rcv_addr = { 0 };
+	char buf[__UEVENT_BUFFER_SIZE] = { 0 };
+	struct iovec iov = { buf, __UEVENT_BUFFER_SIZE };
+	char control[CMSG_SPACE(sizeof(struct ucred))];
+	struct msghdr hdr = {
+		&rcv_addr, sizeof(rcv_addr), &iov, 1,
+		control,   sizeof(control),  0,
+	};
+
+	sk_fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC,
+		       NETLINK_KOBJECT_UEVENT);
+	if (sk_fd < 0) {
+		fprintf(stderr, "%s - Failed to open uevent socket\n", strerror(errno));
+		return -1;
+	}
+
+	ret = setsockopt(sk_fd, SOL_SOCKET, SO_RCVBUF, &rcv_buf_sz,
+			 sizeof(rcv_buf_sz));
+	if (ret < 0) {
+		fprintf(stderr, "%s - Failed to set socket options\n", strerror(errno));
+		goto on_error;
+	}
+
+	sk_addr.nl_family = AF_NETLINK;
+	sk_addr.nl_groups = __UEVENT_LISTEN_ALL;
+
+	sk_addr_len = sizeof(sk_addr);
+	ret = bind(sk_fd, (struct sockaddr *)&sk_addr, sk_addr_len);
+	if (ret < 0) {
+		fprintf(stderr, "%s - Failed to bind socket\n", strerror(errno));
+		goto on_error;
+	}
+
+	ret = getsockname(sk_fd, (struct sockaddr *)&sk_addr, &sk_addr_len);
+	if (ret < 0) {
+		fprintf(stderr, "%s - Failed to retrieve socket name\n", strerror(errno));
+		goto on_error;
+	}
+
+	if ((size_t)sk_addr_len != sizeof(sk_addr)) {
+		fprintf(stderr, "Invalid socket address size\n");
+		goto on_error;
+	}
+
+	if (post_flags & CLONE_NEWUSER) {
+		ret = unshare(CLONE_NEWUSER);
+		if (ret < 0) {
+			fprintf(stderr,
+				"%s - Failed to unshare user namespace\n",
+				strerror(errno));
+			goto on_error;
+		}
+	}
+
+	if (post_flags & CLONE_NEWNET) {
+		ret = unshare(CLONE_NEWNET);
+		if (ret < 0) {
+			fprintf(stderr,
+				"%s - Failed to unshare network namespace\n",
+				strerror(errno));
+			goto on_error;
+		}
+	}
+
+	ret = write_nointr(sync_fd, &sync_add, sizeof(sync_add));
+	close(sync_fd);
+	if (ret != sizeof(sync_add)) {
+		fprintf(stderr, "Failed to synchronize with parent process\n");
+		goto on_error;
+	}
+
+	fret = 0;
+	for (;;) {
+		ssize_t r;
+
+		r = recvmsg(sk_fd, &hdr, 0);
+		if (r <= 0) {
+			fprintf(stderr, "%s - Failed to receive uevent\n", strerror(errno));
+			ret = -1;
+			break;
+		}
+
+		/* ignore libudev messages */
+		if (memcmp(buf, "libudev", 8) == 0)
+			continue;
+
+		/* ignore uevents we didn't trigger */
+		if (memcmp(buf, __UEVENT_HEADER, __UEVENT_HEADER_LEN) != 0)
+			continue;
+
+		if (!expect_uevent) {
+			fprintf(stderr, "Received unexpected uevent:\n");
+			ret = -1;
+		}
+
+		if (TH_LOG_ENABLED) {
+			/* If logging is enabled dump the received uevent. */
+			(void)write_nointr(STDERR_FILENO, buf, r);
+			(void)write_nointr(STDERR_FILENO, "\n", 1);
+		}
+
+		break;
+	}
+
+on_error:
+	close(sk_fd);
+
+	return fret;
+}
+
+int trigger_uevent(unsigned int times)
+{
+	int fd, ret;
+	unsigned int i;
+
+	fd = open(__DEV_FULL, O_RDWR | O_CLOEXEC);
+	if (fd < 0) {
+		if (errno != ENOENT)
+			return -EINVAL;
+
+		return -1;
+	}
+
+	for (i = 0; i < times; i++) {
+		ret = write_nointr(fd, "add\n", sizeof("add\n") - 1);
+		if (ret < 0) {
+			fprintf(stderr, "Failed to trigger uevent\n");
+			break;
+		}
+	}
+	close(fd);
+
+	return ret;
+}
+
+int set_death_signal(void)
+{
+	int ret;
+	pid_t ppid;
+
+	ret = prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
+
+	/* Check whether we have been orphaned. */
+	ppid = getppid();
+	if (ppid == 1) {
+		pid_t self;
+
+		self = getpid();
+		ret = kill(self, SIGKILL);
+	}
+
+	if (ret < 0)
+		return -1;
+
+	return 0;
+}
+
+static int do_test(unsigned long pre_flags, unsigned long post_flags,
+		   bool expect_uevent, int sync_fd)
+{
+	int ret;
+	uint64_t wait_val;
+	pid_t pid;
+	sigset_t mask;
+	sigset_t orig_mask;
+	struct timespec timeout;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGCHLD);
+
+	ret = sigprocmask(SIG_BLOCK, &mask, &orig_mask);
+	if (ret < 0) {
+		fprintf(stderr, "%s- Failed to block SIGCHLD\n", strerror(errno));
+		return -1;
+	}
+
+	pid = fork();
+	if (pid < 0) {
+		fprintf(stderr, "%s - Failed to fork() new process\n", strerror(errno));
+		return -1;
+	}
+
+	if (pid == 0) {
+		/* Make sure that we go away when our parent dies. */
+		ret = set_death_signal();
+		if (ret < 0) {
+			fprintf(stderr, "Failed to set PR_SET_PDEATHSIG to SIGKILL\n");
+			_exit(EXIT_FAILURE);
+		}
+
+		if (pre_flags & CLONE_NEWUSER) {
+			ret = unshare(CLONE_NEWUSER);
+			if (ret < 0) {
+				fprintf(stderr,
+					"%s - Failed to unshare user namespace\n",
+					strerror(errno));
+				_exit(EXIT_FAILURE);
+			}
+		}
+
+		if (pre_flags & CLONE_NEWNET) {
+			ret = unshare(CLONE_NEWNET);
+			if (ret < 0) {
+				fprintf(stderr,
+					"%s - Failed to unshare network namespace\n",
+					strerror(errno));
+				_exit(EXIT_FAILURE);
+			}
+		}
+
+		if (uevent_listener(post_flags, expect_uevent, sync_fd) < 0)
+			_exit(EXIT_FAILURE);
+
+		_exit(EXIT_SUCCESS);
+	}
+
+	ret = read_nointr(sync_fd, &wait_val, sizeof(wait_val));
+	if (ret != sizeof(wait_val)) {
+		fprintf(stderr, "Failed to synchronize with child process\n");
+		_exit(EXIT_FAILURE);
+	}
+
+	/* Trigger 10 uevents to account for the case where the kernel might
+	 * drop some.
+	 */
+	ret = trigger_uevent(10);
+	if (ret < 0)
+		fprintf(stderr, "Failed triggering uevents\n");
+
+	/* Wait for 2 seconds before considering this failed. This should be
+	 * plenty of time for the kernel to deliver the uevent even under heavy
+	 * load.
+	 */
+	timeout.tv_sec = 2;
+	timeout.tv_nsec = 0;
+
+again:
+	ret = sigtimedwait(&mask, NULL, &timeout);
+	if (ret < 0) {
+		if (errno == EINTR)
+			goto again;
+
+		if (!expect_uevent)
+			ret = kill(pid, SIGTERM); /* success */
+		else
+			ret = kill(pid, SIGUSR1); /* error */
+		if (ret < 0)
+			return -1;
+	}
+
+	ret = wait_for_pid(pid);
+	if (ret < 0)
+		return -1;
+
+	return ret;
+}
+
+static void signal_handler(int sig)
+{
+	if (sig == SIGTERM)
+		_exit(EXIT_SUCCESS);
+
+	_exit(EXIT_FAILURE);
+}
+
+TEST(uevent_filtering)
+{
+	int ret, sync_fd;
+	struct sigaction act;
+
+	if (geteuid()) {
+		TH_LOG("Uevent filtering tests require root privileges. Skipping test");
+		_exit(KSFT_SKIP);
+	}
+
+	ret = access(__DEV_FULL, F_OK);
+	EXPECT_EQ(0, ret) {
+		if (errno == ENOENT) {
+			TH_LOG(__DEV_FULL " does not exist. Skipping test");
+			_exit(KSFT_SKIP);
+		}
+
+		_exit(KSFT_FAIL);
+	}
+
+	act.sa_handler = signal_handler;
+	act.sa_flags = 0;
+	sigemptyset(&act.sa_mask);
+
+	ret = sigaction(SIGTERM, &act, NULL);
+	ASSERT_EQ(0, ret);
+
+	sync_fd = eventfd(0, EFD_CLOEXEC);
+	ASSERT_GE(sync_fd, 0);
+
+	/*
+	 * Setup:
+	 * - Open uevent listening socket in initial network namespace owned by
+	 *   initial user namespace.
+	 * - Trigger uevent in initial network namespace owned by initial user
+	 *   namespace.
+	 * Expected Result:
+	 * - uevent listening socket receives uevent
+	 */
+	ret = do_test(0, 0, true, sync_fd);
+	ASSERT_EQ(0, ret) {
+		goto do_cleanup;
+	}
+
+	/*
+	 * Setup:
+	 * - Open uevent listening socket in non-initial network namespace
+	 *   owned by initial user namespace.
+	 * - Trigger uevent in initial network namespace owned by initial user
+	 *   namespace.
+	 * Expected Result:
+	 * - uevent listening socket receives uevent
+	 */
+	ret = do_test(CLONE_NEWNET, 0, true, sync_fd);
+	ASSERT_EQ(0, ret) {
+		goto do_cleanup;
+	}
+
+	/*
+	 * Setup:
+	 * - unshare user namespace
+	 * - Open uevent listening socket in initial network namespace
+	 *   owned by initial user namespace.
+	 * - Trigger uevent in initial network namespace owned by initial user
+	 *   namespace.
+	 * Expected Result:
+	 * - uevent listening socket receives uevent
+	 */
+	ret = do_test(CLONE_NEWUSER, 0, true, sync_fd);
+	ASSERT_EQ(0, ret) {
+		goto do_cleanup;
+	}
+
+	/*
+	 * Setup:
+	 * - Open uevent listening socket in non-initial network namespace
+	 *   owned by non-initial user namespace.
+	 * - Trigger uevent in initial network namespace owned by initial user
+	 *   namespace.
+	 * Expected Result:
+	 * - uevent listening socket receives no uevent
+	 */
+	ret = do_test(CLONE_NEWUSER | CLONE_NEWNET, 0, false, sync_fd);
+	ASSERT_EQ(0, ret) {
+		goto do_cleanup;
+	}
+
+	/*
+	 * Setup:
+	 * - Open uevent listening socket in initial network namespace
+	 *   owned by initial user namespace.
+	 * - unshare network namespace
+	 * - Trigger uevent in initial network namespace owned by initial user
+	 *   namespace.
+	 * Expected Result:
+	 * - uevent listening socket receives uevent
+	 */
+	ret = do_test(0, CLONE_NEWNET, true, sync_fd);
+	ASSERT_EQ(0, ret) {
+		goto do_cleanup;
+	}
+
+	/*
+	 * Setup:
+	 * - Open uevent listening socket in initial network namespace
+	 *   owned by initial user namespace.
+	 * - unshare user namespace
+	 * - Trigger uevent in initial network namespace owned by initial user
+	 *   namespace.
+	 * Expected Result:
+	 * - uevent listening socket receives uevent
+	 */
+	ret = do_test(0, CLONE_NEWUSER, true, sync_fd);
+	ASSERT_EQ(0, ret) {
+		goto do_cleanup;
+	}
+
+	/*
+	 * Setup:
+	 * - Open uevent listening socket in initial network namespace
+	 *   owned by initial user namespace.
+	 * - unshare user namespace
+	 * - unshare network namespace
+	 * - Trigger uevent in initial network namespace owned by initial user
+	 *   namespace.
+	 * Expected Result:
+	 * - uevent listening socket receives uevent
+	 */
+	ret = do_test(0, CLONE_NEWUSER | CLONE_NEWNET, true, sync_fd);
+	ASSERT_EQ(0, ret) {
+		goto do_cleanup;
+	}
+
+do_cleanup:
+	close(sync_fd);
+}
+
+TEST_HARNESS_MAIN
-- 
cgit v1.2.3-59-g8ed1b


From 038ed45bcec1c048ec8d9bc462cf0912f8a0eb52 Mon Sep 17 00:00:00 2001
From: Ganesh Goudar <ganeshgr@chelsio.com>
Date: Wed, 23 May 2018 11:36:59 +0530
Subject: cxgb4: Add new T6 device ids

Add 0x6088 and 0x6089 device ids for new T6 cards.

Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h
index adacc6399131..c7f8d0441278 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h
@@ -212,6 +212,8 @@ CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN
 	CH_PCI_ID_TABLE_FENTRY(0x6085), /* Custom T6240-SO */
 	CH_PCI_ID_TABLE_FENTRY(0x6086), /* Custom T6225-SO-CR */
 	CH_PCI_ID_TABLE_FENTRY(0x6087), /* Custom T6225-CR */
+	CH_PCI_ID_TABLE_FENTRY(0x6088), /* Custom T62100-CR */
+	CH_PCI_ID_TABLE_FENTRY(0x6089), /* Custom T62100-KR */
 CH_PCI_DEVICE_ID_TABLE_DEFINE_END;
 
 #endif /* __T4_PCI_ID_TBL_H__ */
-- 
cgit v1.2.3-59-g8ed1b


From 218bbea11a777c156eb7bcbdc72867b32ae10985 Mon Sep 17 00:00:00 2001
From: Michal Vokáč <vokac.m@gmail.com>
Date: Wed, 23 May 2018 08:20:18 +0200
Subject: net: dsa: qca8k: Add QCA8334 binding documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for the four-port variant of the Qualcomm QCA833x switch.

The CPU port default link settings can be reconfigured using
a fixed-link sub-node.

Signed-off-by: Michal Vokáč <michal.vokac@ysoft.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../devicetree/bindings/net/dsa/qca8k.txt          | 23 +++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/net/dsa/qca8k.txt b/Documentation/devicetree/bindings/net/dsa/qca8k.txt
index 9c67ee4890d7..bbcb255c3150 100644
--- a/Documentation/devicetree/bindings/net/dsa/qca8k.txt
+++ b/Documentation/devicetree/bindings/net/dsa/qca8k.txt
@@ -2,7 +2,10 @@
 
 Required properties:
 
-- compatible: should be "qca,qca8337"
+- compatible: should be one of:
+    "qca,qca8334"
+    "qca,qca8337"
+
 - #size-cells: must be 0
 - #address-cells: must be 1
 
@@ -14,6 +17,20 @@ port and PHY id, each subnode describing a port needs to have a valid phandle
 referencing the internal PHY connected to it. The CPU port of this switch is
 always port 0.
 
+A CPU port node has the following optional node:
+
+- fixed-link            : Fixed-link subnode describing a link to a non-MDIO
+                          managed entity. See
+                          Documentation/devicetree/bindings/net/fixed-link.txt
+                          for details.
+
+For QCA8K the 'fixed-link' sub-node supports only the following properties:
+
+- 'speed' (integer, mandatory), to indicate the link speed. Accepted
+  values are 10, 100 and 1000
+- 'full-duplex' (boolean, optional), to indicate that full duplex is
+  used. When absent, half duplex is assumed.
+
 Example:
 
 
@@ -53,6 +70,10 @@ Example:
 					label = "cpu";
 					ethernet = <&gmac1>;
 					phy-mode = "rgmii";
+					fixed-link {
+						speed = 1000;
+						full-duplex;
+					};
 				};
 
 				port@1 {
-- 
cgit v1.2.3-59-g8ed1b


From 64cf81675a1f64c1b311e4611dd3b6a961607612 Mon Sep 17 00:00:00 2001
From: Michal Vokáč <vokac.m@gmail.com>
Date: Wed, 23 May 2018 08:20:19 +0200
Subject: net: dsa: qca8k: Add support for QCA8334 switch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for the four-port variant of the Qualcomm QCA833x switch.

Signed-off-by: Michal Vokáč <michal.vokac@ysoft.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/qca8k.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c
index 757b6d90ea36..eb7aceb20f8a 100644
--- a/drivers/net/dsa/qca8k.c
+++ b/drivers/net/dsa/qca8k.c
@@ -939,6 +939,7 @@ static SIMPLE_DEV_PM_OPS(qca8k_pm_ops,
 			 qca8k_suspend, qca8k_resume);
 
 static const struct of_device_id qca8k_of_match[] = {
+	{ .compatible = "qca,qca8334" },
 	{ .compatible = "qca,qca8337" },
 	{ /* sentinel */ },
 };
-- 
cgit v1.2.3-59-g8ed1b


From eee1fe64765c562d8bcaf95e5631a8ea2f760f34 Mon Sep 17 00:00:00 2001
From: Michal Vokáč <vokac.m@gmail.com>
Date: Wed, 23 May 2018 08:20:20 +0200
Subject: net: dsa: qca8k: Enable RXMAC when bringing up a port
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a port is brought up/down do not enable/disable only the TXMAC
but the RXMAC as well. This is essential for the CPU port to work.

Fixes: 6b93fb46480a ("net-next: dsa: add new driver for qca8xxx family")
Signed-off-by: Michal Vokáč <michal.vokac@ysoft.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/qca8k.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c
index eb7aceb20f8a..a32744e40baf 100644
--- a/drivers/net/dsa/qca8k.c
+++ b/drivers/net/dsa/qca8k.c
@@ -473,7 +473,7 @@ qca8k_set_pad_ctrl(struct qca8k_priv *priv, int port, int mode)
 static void
 qca8k_port_set_status(struct qca8k_priv *priv, int port, int enable)
 {
-	u32 mask = QCA8K_PORT_STATUS_TXMAC;
+	u32 mask = QCA8K_PORT_STATUS_TXMAC | QCA8K_PORT_STATUS_RXMAC;
 
 	/* Port 0 and 6 have no internal PHY */
 	if ((port > 0) && (port < 6))
-- 
cgit v1.2.3-59-g8ed1b


From 79a4ed4f0f93fc65e48a0fc5247ffa5645f7b0cc Mon Sep 17 00:00:00 2001
From: Michal Vokáč <vokac.m@gmail.com>
Date: Wed, 23 May 2018 08:20:21 +0200
Subject: net: dsa: qca8k: Force CPU port to its highest bandwidth
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

By default autonegotiation is enabled to configure MAC on all ports.
For the CPU port autonegotiation can not be used so we need to set
some sensible defaults manually.

This patch forces the default setting of the CPU port to 1000Mbps/full
duplex which is the chip maximum capability.

Also correct size of the bit field used to configure link speed.

Fixes: 6b93fb46480a ("net-next: dsa: add new driver for qca8xxx family")
Signed-off-by: Michal Vokáč <michal.vokac@ysoft.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/qca8k.c | 6 +++++-
 drivers/net/dsa/qca8k.h | 6 ++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c
index a32744e40baf..0d7e71528be5 100644
--- a/drivers/net/dsa/qca8k.c
+++ b/drivers/net/dsa/qca8k.c
@@ -490,6 +490,7 @@ qca8k_setup(struct dsa_switch *ds)
 {
 	struct qca8k_priv *priv = (struct qca8k_priv *)ds->priv;
 	int ret, i, phy_mode = -1;
+	u32 mask;
 
 	/* Make sure that port 0 is the cpu port */
 	if (!dsa_is_cpu_port(ds, 0)) {
@@ -515,7 +516,10 @@ qca8k_setup(struct dsa_switch *ds)
 	if (ret < 0)
 		return ret;
 
-	/* Enable CPU Port */
+	/* Enable CPU Port, force it to maximum bandwidth and full-duplex */
+	mask = QCA8K_PORT_STATUS_SPEED_1000 | QCA8K_PORT_STATUS_TXFLOW |
+	       QCA8K_PORT_STATUS_RXFLOW | QCA8K_PORT_STATUS_DUPLEX;
+	qca8k_write(priv, QCA8K_REG_PORT_STATUS(QCA8K_CPU_PORT), mask);
 	qca8k_reg_set(priv, QCA8K_REG_GLOBAL_FW_CTRL0,
 		      QCA8K_GLOBAL_FW_CTRL0_CPU_PORT_EN);
 	qca8k_port_set_status(priv, QCA8K_CPU_PORT, 1);
diff --git a/drivers/net/dsa/qca8k.h b/drivers/net/dsa/qca8k.h
index 1cf8a920d4ff..5bda16555478 100644
--- a/drivers/net/dsa/qca8k.h
+++ b/drivers/net/dsa/qca8k.h
@@ -51,8 +51,10 @@
 #define QCA8K_GOL_MAC_ADDR0				0x60
 #define QCA8K_GOL_MAC_ADDR1				0x64
 #define QCA8K_REG_PORT_STATUS(_i)			(0x07c + (_i) * 4)
-#define   QCA8K_PORT_STATUS_SPEED			GENMASK(2, 0)
-#define   QCA8K_PORT_STATUS_SPEED_S			0
+#define   QCA8K_PORT_STATUS_SPEED			GENMASK(1, 0)
+#define   QCA8K_PORT_STATUS_SPEED_10			0
+#define   QCA8K_PORT_STATUS_SPEED_100			0x1
+#define   QCA8K_PORT_STATUS_SPEED_1000			0x2
 #define   QCA8K_PORT_STATUS_TXMAC			BIT(2)
 #define   QCA8K_PORT_STATUS_RXMAC			BIT(3)
 #define   QCA8K_PORT_STATUS_TXFLOW			BIT(4)
-- 
cgit v1.2.3-59-g8ed1b


From 9bb2289f90e671bdb78e306974187424ac19ff8e Mon Sep 17 00:00:00 2001
From: Michal Vokáč <vokac.m@gmail.com>
Date: Wed, 23 May 2018 08:20:22 +0200
Subject: net: dsa: qca8k: Allow overwriting CPU port setting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement adjust_link function that allows to overwrite default CPU port
setting using fixed-link device tree subnode.

Signed-off-by: Michal Vokáč <michal.vokac@ysoft.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/qca8k.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 drivers/net/dsa/qca8k.h |  1 +
 2 files changed, 44 insertions(+)

diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c
index 0d7e71528be5..8113399018d8 100644
--- a/drivers/net/dsa/qca8k.c
+++ b/drivers/net/dsa/qca8k.c
@@ -587,6 +587,47 @@ qca8k_setup(struct dsa_switch *ds)
 	return 0;
 }
 
+static void
+qca8k_adjust_link(struct dsa_switch *ds, int port, struct phy_device *phy)
+{
+	struct qca8k_priv *priv = ds->priv;
+	u32 reg;
+
+	/* Force fixed-link setting for CPU port, skip others. */
+	if (!phy_is_pseudo_fixed_link(phy))
+		return;
+
+	/* Set port speed */
+	switch (phy->speed) {
+	case 10:
+		reg = QCA8K_PORT_STATUS_SPEED_10;
+		break;
+	case 100:
+		reg = QCA8K_PORT_STATUS_SPEED_100;
+		break;
+	case 1000:
+		reg = QCA8K_PORT_STATUS_SPEED_1000;
+		break;
+	default:
+		dev_dbg(priv->dev, "port%d link speed %dMbps not supported.\n",
+			port, phy->speed);
+		return;
+	}
+
+	/* Set duplex mode */
+	if (phy->duplex == DUPLEX_FULL)
+		reg |= QCA8K_PORT_STATUS_DUPLEX;
+
+	/* Force flow control */
+	if (dsa_is_cpu_port(ds, port))
+		reg |= QCA8K_PORT_STATUS_RXFLOW | QCA8K_PORT_STATUS_TXFLOW;
+
+	/* Force link down before changing MAC options */
+	qca8k_port_set_status(priv, port, 0);
+	qca8k_write(priv, QCA8K_REG_PORT_STATUS(port), reg);
+	qca8k_port_set_status(priv, port, 1);
+}
+
 static int
 qca8k_phy_read(struct dsa_switch *ds, int phy, int regnum)
 {
@@ -841,6 +882,7 @@ qca8k_get_tag_protocol(struct dsa_switch *ds, int port)
 static const struct dsa_switch_ops qca8k_switch_ops = {
 	.get_tag_protocol	= qca8k_get_tag_protocol,
 	.setup			= qca8k_setup,
+	.adjust_link            = qca8k_adjust_link,
 	.get_strings		= qca8k_get_strings,
 	.phy_read		= qca8k_phy_read,
 	.phy_write		= qca8k_phy_write,
@@ -872,6 +914,7 @@ qca8k_sw_probe(struct mdio_device *mdiodev)
 		return -ENOMEM;
 
 	priv->bus = mdiodev->bus;
+	priv->dev = &mdiodev->dev;
 
 	/* read the switches ID register */
 	id = qca8k_read(priv, QCA8K_REG_MASK_CTRL);
diff --git a/drivers/net/dsa/qca8k.h b/drivers/net/dsa/qca8k.h
index 5bda16555478..613fe5c50236 100644
--- a/drivers/net/dsa/qca8k.h
+++ b/drivers/net/dsa/qca8k.h
@@ -167,6 +167,7 @@ struct qca8k_priv {
 	struct ar8xxx_port_status port_sts[QCA8K_NUM_PORTS];
 	struct dsa_switch *ds;
 	struct mutex reg_mutex;
+	struct device *dev;
 };
 
 struct qca8k_mib_desc {
-- 
cgit v1.2.3-59-g8ed1b


From 63a786a31033a32cb89498c9bb4e6069ebe9f8b0 Mon Sep 17 00:00:00 2001
From: Michal Vokáč <vokac.m@gmail.com>
Date: Wed, 23 May 2018 08:20:23 +0200
Subject: net: dsa: qca8k: Replace GPL boilerplate by SPDX
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the GPLv2 license boilerplate with the SPDX license identifier.

Signed-off-by: Michal Vokáč <michal.vokac@ysoft.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/qca8k.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c
index 8113399018d8..46c178fb3fb9 100644
--- a/drivers/net/dsa/qca8k.c
+++ b/drivers/net/dsa/qca8k.c
@@ -1,17 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Copyright (C) 2009 Felix Fietkau <nbd@nbd.name>
  * Copyright (C) 2011-2012 Gabor Juhos <juhosg@openwrt.org>
  * Copyright (c) 2015, The Linux Foundation. All rights reserved.
  * Copyright (c) 2016 John Crispin <john@phrozen.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 and
- * only version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
  */
 
 #include <linux/module.h>
-- 
cgit v1.2.3-59-g8ed1b


From 38222b1aacf39c92e72d42a809609c725f2e9f83 Mon Sep 17 00:00:00 2001
From: Michal Vokáč <vokac.m@gmail.com>
Date: Wed, 23 May 2018 08:20:24 +0200
Subject: net: dsa: qca8k: Remove redundant parentheses
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix warning reported by checkpatch.

Signed-off-by: Michal Vokáč <michal.vokac@ysoft.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/qca8k.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c
index 46c178fb3fb9..cdcde7f8e0b2 100644
--- a/drivers/net/dsa/qca8k.c
+++ b/drivers/net/dsa/qca8k.c
@@ -468,7 +468,7 @@ qca8k_port_set_status(struct qca8k_priv *priv, int port, int enable)
 	u32 mask = QCA8K_PORT_STATUS_TXMAC | QCA8K_PORT_STATUS_RXMAC;
 
 	/* Port 0 and 6 have no internal PHY */
-	if ((port > 0) && (port < 6))
+	if (port > 0 && port < 6)
 		mask |= QCA8K_PORT_STATUS_LINK_AUTO;
 
 	if (enable)
-- 
cgit v1.2.3-59-g8ed1b


From 1d19023fa6f65de636901b9b7340f2f7eebb710a Mon Sep 17 00:00:00 2001
From: Ganesh Goudar <ganeshgr@chelsio.com>
Date: Wed, 23 May 2018 20:02:58 +0530
Subject: cxgb4: change the port capability bits definition

MDI Port Capabilities bit definitions were inconsistent with
regard to the MDI enum values. 2 bits used to define MDI in
the port capabilities are not really separable, it's a 2-bit
field with 4 different values. Change the port capability bit
definitions to be "AUTO" and "STRAIGHT" in order to get them
to line up with the enum's.

Signed-off-by: Casey Leedom <leedom@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c     | 4 ++--
 drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h  | 8 ++++----
 drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c | 2 +-
 drivers/scsi/csiostor/csio_hw.c                | 2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index df5e7c79223b..537ed072db4a 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -3941,8 +3941,8 @@ static fw_port_cap32_t fwcaps16_to_caps32(fw_port_cap16_t caps16)
 	CAP16_TO_CAP32(FC_RX);
 	CAP16_TO_CAP32(FC_TX);
 	CAP16_TO_CAP32(ANEG);
-	CAP16_TO_CAP32(MDIX);
 	CAP16_TO_CAP32(MDIAUTO);
+	CAP16_TO_CAP32(MDISTRAIGHT);
 	CAP16_TO_CAP32(FEC_RS);
 	CAP16_TO_CAP32(FEC_BASER_RS);
 	CAP16_TO_CAP32(802_3_PAUSE);
@@ -3982,8 +3982,8 @@ static fw_port_cap16_t fwcaps32_to_caps16(fw_port_cap32_t caps32)
 	CAP32_TO_CAP16(802_3_PAUSE);
 	CAP32_TO_CAP16(802_3_ASM_DIR);
 	CAP32_TO_CAP16(ANEG);
-	CAP32_TO_CAP16(MDIX);
 	CAP32_TO_CAP16(MDIAUTO);
+	CAP32_TO_CAP16(MDISTRAIGHT);
 	CAP32_TO_CAP16(FEC_RS);
 	CAP32_TO_CAP16(FEC_BASER_RS);
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
index e6b2e9549d56..2d91480a5a0e 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
@@ -2471,8 +2471,8 @@ enum fw_port_cap {
 	FW_PORT_CAP_FC_RX		= 0x0040,
 	FW_PORT_CAP_FC_TX		= 0x0080,
 	FW_PORT_CAP_ANEG		= 0x0100,
-	FW_PORT_CAP_MDIX		= 0x0200,
-	FW_PORT_CAP_MDIAUTO		= 0x0400,
+	FW_PORT_CAP_MDIAUTO		= 0x0200,
+	FW_PORT_CAP_MDISTRAIGHT		= 0x0400,
 	FW_PORT_CAP_FEC_RS		= 0x0800,
 	FW_PORT_CAP_FEC_BASER_RS	= 0x1000,
 	FW_PORT_CAP_FEC_RESERVED	= 0x2000,
@@ -2515,8 +2515,8 @@ enum fw_port_mdi {
 #define	FW_PORT_CAP32_802_3_PAUSE	0x00040000UL
 #define	FW_PORT_CAP32_802_3_ASM_DIR	0x00080000UL
 #define	FW_PORT_CAP32_ANEG		0x00100000UL
-#define	FW_PORT_CAP32_MDIX		0x00200000UL
-#define	FW_PORT_CAP32_MDIAUTO		0x00400000UL
+#define	FW_PORT_CAP32_MDIAUTO		0x00200000UL
+#define	FW_PORT_CAP32_MDISTRAIGHT	0x00400000UL
 #define	FW_PORT_CAP32_FEC_RS		0x00800000UL
 #define	FW_PORT_CAP32_FEC_BASER_RS	0x01000000UL
 #define	FW_PORT_CAP32_FEC_RESERVED1	0x02000000UL
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c
index 798695bf8678..3017f7873ff9 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c
@@ -341,8 +341,8 @@ static fw_port_cap32_t fwcaps16_to_caps32(fw_port_cap16_t caps16)
 	CAP16_TO_CAP32(FC_RX);
 	CAP16_TO_CAP32(FC_TX);
 	CAP16_TO_CAP32(ANEG);
-	CAP16_TO_CAP32(MDIX);
 	CAP16_TO_CAP32(MDIAUTO);
+	CAP16_TO_CAP32(MDISTRAIGHT);
 	CAP16_TO_CAP32(FEC_RS);
 	CAP16_TO_CAP32(FEC_BASER_RS);
 	CAP16_TO_CAP32(802_3_PAUSE);
diff --git a/drivers/scsi/csiostor/csio_hw.c b/drivers/scsi/csiostor/csio_hw.c
index 96bbb82c826d..a10cf25ee7f9 100644
--- a/drivers/scsi/csiostor/csio_hw.c
+++ b/drivers/scsi/csiostor/csio_hw.c
@@ -1500,8 +1500,8 @@ fw_port_cap32_t fwcaps16_to_caps32(fw_port_cap16_t caps16)
 	CAP16_TO_CAP32(FC_RX);
 	CAP16_TO_CAP32(FC_TX);
 	CAP16_TO_CAP32(ANEG);
-	CAP16_TO_CAP32(MDIX);
 	CAP16_TO_CAP32(MDIAUTO);
+	CAP16_TO_CAP32(MDISTRAIGHT);
 	CAP16_TO_CAP32(FEC_RS);
 	CAP16_TO_CAP32(FEC_BASER_RS);
 	CAP16_TO_CAP32(802_3_PAUSE);
-- 
cgit v1.2.3-59-g8ed1b


From 8156b0ba7413238e473ef567ebecd755f92275c5 Mon Sep 17 00:00:00 2001
From: Ganesh Goudar <ganeshgr@chelsio.com>
Date: Wed, 23 May 2018 20:03:33 +0530
Subject: cxgb4: do L1 config when module is inserted

trigger an L1 configure operation when a transceiver module
is inserted in order to cause current "sticky" options like
Requested Forward Error Correction to be reapplied.

Signed-off-by: Casey Leedom <leedom@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h      | 26 ++++++++++++++--
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 11 +++++--
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c      | 40 +++++++++++++++++++++----
 3 files changed, 65 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 211086bdbe20..0f305d97f4ee 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -491,6 +491,9 @@ struct link_config {
 
 	unsigned char  link_ok;          /* link up? */
 	unsigned char  link_down_rc;     /* link down reason */
+
+	bool new_module;		 /* ->OS Transceiver Module inserted */
+	bool redo_l1cfg;		 /* ->CC redo current "sticky" L1 CFG */
 };
 
 #define FW_LEN16(fw_struct) FW_CMD_LEN16_V(sizeof(fw_struct) / 16)
@@ -1324,7 +1327,7 @@ static inline unsigned int qtimer_val(const struct adapter *adap,
 extern char cxgb4_driver_name[];
 extern const char cxgb4_driver_version[];
 
-void t4_os_portmod_changed(const struct adapter *adap, int port_id);
+void t4_os_portmod_changed(struct adapter *adap, int port_id);
 void t4_os_link_changed(struct adapter *adap, int port_id, int link_stat);
 
 void t4_free_sge_resources(struct adapter *adap);
@@ -1505,8 +1508,25 @@ void t4_intr_disable(struct adapter *adapter);
 int t4_slow_intr_handler(struct adapter *adapter);
 
 int t4_wait_dev_ready(void __iomem *regs);
-int t4_link_l1cfg(struct adapter *adap, unsigned int mbox, unsigned int port,
-		  struct link_config *lc);
+
+int t4_link_l1cfg_core(struct adapter *adap, unsigned int mbox,
+		       unsigned int port, struct link_config *lc,
+		       bool sleep_ok, int timeout);
+
+static inline int t4_link_l1cfg(struct adapter *adapter, unsigned int mbox,
+				unsigned int port, struct link_config *lc)
+{
+	return t4_link_l1cfg_core(adapter, mbox, port, lc,
+				  true, FW_CMD_MAX_TIMEOUT);
+}
+
+static inline int t4_link_l1cfg_ns(struct adapter *adapter, unsigned int mbox,
+				   unsigned int port, struct link_config *lc)
+{
+	return t4_link_l1cfg_core(adapter, mbox, port, lc,
+				  false, FW_CMD_MAX_TIMEOUT);
+}
+
 int t4_restart_aneg(struct adapter *adap, unsigned int mbox, unsigned int port);
 
 u32 t4_read_pcie_cfg4(struct adapter *adap, int reg);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 130d1eed7993..513e1d356384 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -301,14 +301,14 @@ void t4_os_link_changed(struct adapter *adapter, int port_id, int link_stat)
 	}
 }
 
-void t4_os_portmod_changed(const struct adapter *adap, int port_id)
+void t4_os_portmod_changed(struct adapter *adap, int port_id)
 {
 	static const char *mod_str[] = {
 		NULL, "LR", "SR", "ER", "passive DA", "active DA", "LRM"
 	};
 
-	const struct net_device *dev = adap->port[port_id];
-	const struct port_info *pi = netdev_priv(dev);
+	struct net_device *dev = adap->port[port_id];
+	struct port_info *pi = netdev_priv(dev);
 
 	if (pi->mod_type == FW_PORT_MOD_TYPE_NONE)
 		netdev_info(dev, "port module unplugged\n");
@@ -325,6 +325,11 @@ void t4_os_portmod_changed(const struct adapter *adap, int port_id)
 	else
 		netdev_info(dev, "%s: unknown module type %d inserted\n",
 			    dev->name, pi->mod_type);
+
+	/* If the interface is running, then we'll need any "sticky" Link
+	 * Parameters redone with a new Transceiver Module.
+	 */
+	pi->link_cfg.redo_l1cfg = netif_running(dev);
 }
 
 int dbfifo_int_thresh = 10; /* 10 == 640 entry threshold */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index 537ed072db4a..704f6960a6ea 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -4058,14 +4058,16 @@ static inline fw_port_cap32_t cc_to_fwcap_fec(enum cc_fec cc_fec)
  *	- If auto-negotiation is off set the MAC to the proper speed/duplex/FC,
  *	  otherwise do it later based on the outcome of auto-negotiation.
  */
-int t4_link_l1cfg(struct adapter *adapter, unsigned int mbox,
-		  unsigned int port, struct link_config *lc)
+int t4_link_l1cfg_core(struct adapter *adapter, unsigned int mbox,
+		       unsigned int port, struct link_config *lc,
+		       bool sleep_ok, int timeout)
 {
 	unsigned int fw_caps = adapter->params.fw_caps_support;
-	struct fw_port_cmd cmd;
-	unsigned int fw_mdi = FW_PORT_CAP32_MDI_V(FW_PORT_CAP32_MDI_AUTO);
 	fw_port_cap32_t fw_fc, cc_fec, fw_fec, rcap;
+	struct fw_port_cmd cmd;
+	unsigned int fw_mdi;
 
+	fw_mdi = (FW_PORT_CAP32_MDI_V(FW_PORT_CAP32_MDI_AUTO) & lc->pcaps);
 	/* Convert driver coding of Pause Frame Flow Control settings into the
 	 * Firmware's API.
 	 */
@@ -4087,7 +4089,7 @@ int t4_link_l1cfg(struct adapter *adapter, unsigned int mbox,
 	/* Figure out what our Requested Port Capabilities are going to be.
 	 */
 	if (!(lc->pcaps & FW_PORT_CAP32_ANEG)) {
-		rcap = (lc->pcaps & ADVERT_MASK) | fw_fc | fw_fec;
+		rcap = lc->acaps | fw_fc | fw_fec;
 		lc->fc = lc->requested_fc & ~PAUSE_AUTONEG;
 		lc->fec = cc_fec;
 	} else if (lc->autoneg == AUTONEG_DISABLE) {
@@ -4113,7 +4115,8 @@ int t4_link_l1cfg(struct adapter *adapter, unsigned int mbox,
 		cmd.u.l1cfg.rcap = cpu_to_be32(fwcaps32_to_caps16(rcap));
 	else
 		cmd.u.l1cfg32.rcap32 = cpu_to_be32(rcap);
-	return t4_wr_mbox(adapter, mbox, &cmd, sizeof(cmd), NULL);
+	return t4_wr_mbox_meat_timeout(adapter, mbox, &cmd, sizeof(cmd), NULL,
+				       sleep_ok, timeout);
 }
 
 /**
@@ -8335,6 +8338,9 @@ void t4_handle_get_port_info(struct port_info *pi, const __be64 *rpl)
 	fc = fwcap_to_cc_pause(linkattr);
 	speed = fwcap_to_speed(linkattr);
 
+	lc->new_module = false;
+	lc->redo_l1cfg = false;
+
 	if (mod_type != pi->mod_type) {
 		/* With the newer SFP28 and QSFP28 Transceiver Module Types,
 		 * various fundamental Port Capabilities which used to be
@@ -8369,6 +8375,8 @@ void t4_handle_get_port_info(struct port_info *pi, const __be64 *rpl)
 		pi->port_type = port_type;
 
 		pi->mod_type = mod_type;
+
+		lc->new_module = t4_is_inserted_mod_type(mod_type);
 		t4_os_portmod_changed(adapter, pi->port_id);
 	}
 
@@ -8401,6 +8409,26 @@ void t4_handle_get_port_info(struct port_info *pi, const __be64 *rpl)
 
 		t4_os_link_changed(adapter, pi->port_id, link_ok);
 	}
+
+	if (lc->new_module && lc->redo_l1cfg) {
+		struct link_config old_lc;
+		int ret;
+
+		/* Save the current L1 Configuration and restore it if an
+		 * error occurs.  We probably should fix the l1_cfg*()
+		 * routines not to change the link_config when an error
+		 * occurs ...
+		 */
+		old_lc = *lc;
+		ret = t4_link_l1cfg_ns(adapter, adapter->mbox, pi->lport, lc);
+		if (ret) {
+			*lc = old_lc;
+			dev_warn(adapter->pdev_dev,
+				 "Attempt to update new Transceiver Module settings failed\n");
+		}
+	}
+	lc->new_module = false;
+	lc->redo_l1cfg = false;
 }
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 2351abe6f8736167a187cec867933bc66c3284c0 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Wed, 23 May 2018 16:38:09 +0200
Subject: net/smc: return 0 for ioctl calls in states INIT and CLOSED

A connected SMC-socket contains addresses of descriptors for the
send buffer and the rmb (receive buffer). Fields of these descriptors
are used to determine the answer for certain ioctl requests.
Add extra handling for unconnected SMC socket states without valid
buffer descriptor addresses.

Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Reported-by: syzbot+e6714328fda813fc670f@syzkaller.appspotmail.com
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 48530dab5c94..f2d925921d81 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1490,20 +1490,32 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd,
 	case SIOCINQ: /* same as FIONREAD */
 		if (smc->sk.sk_state == SMC_LISTEN)
 			return -EINVAL;
-		answ = atomic_read(&smc->conn.bytes_to_rcv);
+		if (smc->sk.sk_state == SMC_INIT ||
+		    smc->sk.sk_state == SMC_CLOSED)
+			answ = 0;
+		else
+			answ = atomic_read(&smc->conn.bytes_to_rcv);
 		break;
 	case SIOCOUTQ:
 		/* output queue size (not send + not acked) */
 		if (smc->sk.sk_state == SMC_LISTEN)
 			return -EINVAL;
-		answ = smc->conn.sndbuf_desc->len -
+		if (smc->sk.sk_state == SMC_INIT ||
+		    smc->sk.sk_state == SMC_CLOSED)
+			answ = 0;
+		else
+			answ = smc->conn.sndbuf_desc->len -
 					atomic_read(&smc->conn.sndbuf_space);
 		break;
 	case SIOCOUTQNSD:
 		/* output queue size (not send only) */
 		if (smc->sk.sk_state == SMC_LISTEN)
 			return -EINVAL;
-		answ = smc_tx_prepared_sends(&smc->conn);
+		if (smc->sk.sk_state == SMC_INIT ||
+		    smc->sk.sk_state == SMC_CLOSED)
+			answ = 0;
+		else
+			answ = smc_tx_prepared_sends(&smc->conn);
 		break;
 	default:
 		return -ENOIOCTLCMD;
-- 
cgit v1.2.3-59-g8ed1b


From b9f227c37071d1115cec791b328b684e2c4ba414 Mon Sep 17 00:00:00 2001
From: Hans Wippel <hwippel@linux.ibm.com>
Date: Wed, 23 May 2018 16:38:10 +0200
Subject: net/smc: lock smc_lgr_list in port_terminate()

Currently, smc_port_terminate() is not holding the lock of the lgr list
while it is traversing the list. This patch adds locking to this
function and changes smc_lgr_terminate() accordingly.

Signed-off-by: Hans Wippel <hwippel@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_core.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 1e5c0e90a706..21c244f53b0a 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -346,7 +346,7 @@ void smc_lgr_forget(struct smc_link_group *lgr)
 }
 
 /* terminate linkgroup abnormally */
-void smc_lgr_terminate(struct smc_link_group *lgr)
+static void __smc_lgr_terminate(struct smc_link_group *lgr)
 {
 	struct smc_connection *conn;
 	struct smc_sock *smc;
@@ -355,7 +355,8 @@ void smc_lgr_terminate(struct smc_link_group *lgr)
 	if (lgr->terminating)
 		return;	/* lgr already terminating */
 	lgr->terminating = 1;
-	smc_lgr_forget(lgr);
+	if (!list_empty(&lgr->list)) /* forget lgr */
+		list_del_init(&lgr->list);
 	smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
 
 	write_lock_bh(&lgr->conns_lock);
@@ -377,16 +378,25 @@ void smc_lgr_terminate(struct smc_link_group *lgr)
 	smc_lgr_schedule_free_work(lgr);
 }
 
+void smc_lgr_terminate(struct smc_link_group *lgr)
+{
+	spin_lock_bh(&smc_lgr_list.lock);
+	__smc_lgr_terminate(lgr);
+	spin_unlock_bh(&smc_lgr_list.lock);
+}
+
 /* Called when IB port is terminated */
 void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport)
 {
 	struct smc_link_group *lgr, *l;
 
+	spin_lock_bh(&smc_lgr_list.lock);
 	list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
 		if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev &&
 		    lgr->lnk[SMC_SINGLE_LINK].ibport == ibport)
-			smc_lgr_terminate(lgr);
+			__smc_lgr_terminate(lgr);
 	}
+	spin_unlock_bh(&smc_lgr_list.lock);
 }
 
 /* Determine vlan of internal TCP socket.
-- 
cgit v1.2.3-59-g8ed1b


From de8474eb9d50fd47b8c73816f34739dec5e96754 Mon Sep 17 00:00:00 2001
From: Stefan Raspl <raspl@linux.ibm.com>
Date: Wed, 23 May 2018 16:38:11 +0200
Subject: net/smc: urgent data support

Add support for out of band data send and receive.

Signed-off-by: Stefan Raspl <raspl@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c   |  24 ++++++++++-
 net/smc/smc.h      |  15 +++++++
 net/smc/smc_cdc.c  |  44 ++++++++++++++++++--
 net/smc/smc_cdc.h  |  13 ++++++
 net/smc/smc_core.c |   1 +
 net/smc/smc_rx.c   | 120 +++++++++++++++++++++++++++++++++++++++++++++++------
 net/smc/smc_tx.c   |  55 ++++++++++++++++--------
 net/smc/smc_tx.h   |   2 +-
 8 files changed, 238 insertions(+), 36 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index f2d925921d81..2c369d4bb1c1 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -8,8 +8,6 @@
  *
  *  Initial restrictions:
  *    - support for alternate links postponed
- *    - partial support for non-blocking sockets only
- *    - support for urgent data postponed
  *
  *  Copyright IBM Corp. 2016, 2018
  *
@@ -1338,6 +1336,8 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
 			if (sk->sk_state == SMC_APPCLOSEWAIT1)
 				mask |= EPOLLIN;
 		}
+		if (smc->conn.urg_state == SMC_URG_VALID)
+			mask |= EPOLLPRI;
 
 	}
 	release_sock(sk);
@@ -1477,10 +1477,13 @@ static int smc_getsockopt(struct socket *sock, int level, int optname,
 static int smc_ioctl(struct socket *sock, unsigned int cmd,
 		     unsigned long arg)
 {
+	union smc_host_cursor cons, urg;
+	struct smc_connection *conn;
 	struct smc_sock *smc;
 	int answ;
 
 	smc = smc_sk(sock->sk);
+	conn = &smc->conn;
 	if (smc->use_fallback) {
 		if (!smc->clcsock)
 			return -EBADF;
@@ -1517,6 +1520,23 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd,
 		else
 			answ = smc_tx_prepared_sends(&smc->conn);
 		break;
+	case SIOCATMARK:
+		if (smc->sk.sk_state == SMC_LISTEN)
+			return -EINVAL;
+		if (smc->sk.sk_state == SMC_INIT ||
+		    smc->sk.sk_state == SMC_CLOSED) {
+			answ = 0;
+		} else {
+			smc_curs_write(&cons,
+			       smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+				       conn);
+			smc_curs_write(&urg,
+				       smc_curs_read(&conn->urg_curs, conn),
+				       conn);
+			answ = smc_curs_diff(conn->rmb_desc->len,
+					     &cons, &urg) == 1;
+		}
+		break;
 	default:
 		return -ENOIOCTLCMD;
 	}
diff --git a/net/smc/smc.h b/net/smc/smc.h
index a1467e411645..51ae1f10d81a 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -114,6 +114,12 @@ struct smc_host_cdc_msg {		/* Connection Data Control message */
 	u8				reserved[18];
 } __aligned(8);
 
+enum smc_urg_state {
+	SMC_URG_VALID,			/* data present */
+	SMC_URG_NOTYET,			/* data pending */
+	SMC_URG_READ			/* data was already read */
+};
+
 struct smc_connection {
 	struct rb_node		alert_node;
 	struct smc_link_group	*lgr;		/* link group of connection */
@@ -160,6 +166,15 @@ struct smc_connection {
 	union smc_host_cursor	rx_curs_confirmed; /* confirmed to peer
 						    * source of snd_una ?
 						    */
+	union smc_host_cursor	urg_curs;	/* points at urgent byte */
+	enum smc_urg_state	urg_state;
+	bool			urg_tx_pend;	/* urgent data staged */
+	bool			urg_rx_skip_pend;
+						/* indicate urgent oob data
+						 * read, but previous regular
+						 * data still pending
+						 */
+	char			urg_rx_byte;	/* urgent byte */
 	atomic_t		bytes_to_rcv;	/* arrived data,
 						 * not yet received
 						 */
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index 8d2c079c87b0..a7e8d63fc8ae 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -164,6 +164,28 @@ static inline bool smc_cdc_before(u16 seq1, u16 seq2)
 	return (s16)(seq1 - seq2) < 0;
 }
 
+static void smc_cdc_handle_urg_data_arrival(struct smc_sock *smc,
+					    int *diff_prod)
+{
+	struct smc_connection *conn = &smc->conn;
+	char *base;
+
+	/* new data included urgent business */
+	smc_curs_write(&conn->urg_curs,
+		       smc_curs_read(&conn->local_rx_ctrl.prod, conn),
+		       conn);
+	conn->urg_state = SMC_URG_VALID;
+	if (!sock_flag(&smc->sk, SOCK_URGINLINE))
+		/* we'll skip the urgent byte, so don't account for it */
+		(*diff_prod)--;
+	base = (char *)conn->rmb_desc->cpu_addr;
+	if (conn->urg_curs.count)
+		conn->urg_rx_byte = *(base + conn->urg_curs.count - 1);
+	else
+		conn->urg_rx_byte = *(base + conn->rmb_desc->len - 1);
+	sk_send_sigurg(&smc->sk);
+}
+
 static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 				    struct smc_cdc_msg *cdc)
 {
@@ -194,15 +216,25 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 	diff_prod = smc_curs_diff(conn->rmb_desc->len, &prod_old,
 				  &conn->local_rx_ctrl.prod);
 	if (diff_prod) {
+		if (conn->local_rx_ctrl.prod_flags.urg_data_present)
+			smc_cdc_handle_urg_data_arrival(smc, &diff_prod);
 		/* bytes_to_rcv is decreased in smc_recvmsg */
 		smp_mb__before_atomic();
 		atomic_add(diff_prod, &conn->bytes_to_rcv);
 		/* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */
 		smp_mb__after_atomic();
 		smc->sk.sk_data_ready(&smc->sk);
-	} else if ((conn->local_rx_ctrl.prod_flags.write_blocked) ||
-		   (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req)) {
-		smc->sk.sk_data_ready(&smc->sk);
+	} else {
+		if (conn->local_rx_ctrl.prod_flags.write_blocked ||
+		    conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
+		    conn->local_rx_ctrl.prod_flags.urg_data_pending) {
+			if (conn->local_rx_ctrl.prod_flags.urg_data_pending)
+				conn->urg_state = SMC_URG_NOTYET;
+			/* force immediate tx of current consumer cursor, but
+			 * under send_lock to guarantee arrival in seqno-order
+			 */
+			smc_tx_sndbuf_nonempty(conn);
+		}
 	}
 
 	/* piggy backed tx info */
@@ -212,6 +244,12 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
 		/* trigger socket release if connection closed */
 		smc_close_wake_tx_prepared(smc);
 	}
+	if (diff_cons && conn->urg_tx_pend &&
+	    atomic_read(&conn->peer_rmbe_space) == conn->peer_rmbe_size) {
+		/* urg data confirmed by peer, indicate we're ready for more */
+		conn->urg_tx_pend = false;
+		smc->sk.sk_write_space(&smc->sk);
+	}
 
 	if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
 		smc->sk.sk_err = ECONNRESET;
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
index d2012fd22100..f60082fee5b8 100644
--- a/net/smc/smc_cdc.h
+++ b/net/smc/smc_cdc.h
@@ -146,6 +146,19 @@ static inline int smc_curs_diff(unsigned int size,
 	return max_t(int, 0, (new->count - old->count));
 }
 
+/* calculate cursor difference between old and new - returns negative
+ * value in case old > new
+ */
+static inline int smc_curs_comp(unsigned int size,
+				union smc_host_cursor *old,
+				union smc_host_cursor *new)
+{
+	if (old->wrap > new->wrap ||
+	    (old->wrap == new->wrap && old->count > new->count))
+		return -smc_curs_diff(size, new, old);
+	return smc_curs_diff(size, old, new);
+}
+
 static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer,
 					  union smc_host_cursor *local,
 					  struct smc_connection *conn)
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 21c244f53b0a..2bf138e7d3ec 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -544,6 +544,7 @@ create:
 	}
 	conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
 	conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
+	conn->urg_state = SMC_URG_READ;
 #ifndef KERNEL_HAS_ATOMIC64
 	spin_lock_init(&conn->acurs_lock);
 #endif
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
index 290a434471d1..3d77b383cccd 100644
--- a/net/smc/smc_rx.c
+++ b/net/smc/smc_rx.c
@@ -47,16 +47,59 @@ static void smc_rx_wake_up(struct sock *sk)
  *   @conn   connection to update
  *   @cons   consumer cursor
  *   @len    number of Bytes consumed
+ *   Returns:
+ *   1 if we should end our receive, 0 otherwise
  */
-static void smc_rx_update_consumer(struct smc_connection *conn,
-				   union smc_host_cursor cons, size_t len)
+static int smc_rx_update_consumer(struct smc_sock *smc,
+				  union smc_host_cursor cons, size_t len)
 {
+	struct smc_connection *conn = &smc->conn;
+	struct sock *sk = &smc->sk;
+	bool force = false;
+	int diff, rc = 0;
+
 	smc_curs_add(conn->rmb_desc->len, &cons, len);
+
+	/* did we process urgent data? */
+	if (conn->urg_state == SMC_URG_VALID || conn->urg_rx_skip_pend) {
+		diff = smc_curs_comp(conn->rmb_desc->len, &cons,
+				     &conn->urg_curs);
+		if (sock_flag(sk, SOCK_URGINLINE)) {
+			if (diff == 0) {
+				force = true;
+				rc = 1;
+				conn->urg_state = SMC_URG_READ;
+			}
+		} else {
+			if (diff == 1) {
+				/* skip urgent byte */
+				force = true;
+				smc_curs_add(conn->rmb_desc->len, &cons, 1);
+				conn->urg_rx_skip_pend = false;
+			} else if (diff < -1)
+				/* we read past urgent byte */
+				conn->urg_state = SMC_URG_READ;
+		}
+	}
+
 	smc_curs_write(&conn->local_tx_ctrl.cons, smc_curs_read(&cons, conn),
 		       conn);
+
 	/* send consumer cursor update if required */
 	/* similar to advertising new TCP rcv_wnd if required */
-	smc_tx_consumer_update(conn);
+	smc_tx_consumer_update(conn, force);
+
+	return rc;
+}
+
+static void smc_rx_update_cons(struct smc_sock *smc, size_t len)
+{
+	struct smc_connection *conn = &smc->conn;
+	union smc_host_cursor cons;
+
+	smc_curs_write(&cons, smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+		       conn);
+	smc_rx_update_consumer(smc, cons, len);
 }
 
 struct smc_spd_priv {
@@ -70,7 +113,6 @@ static void smc_rx_pipe_buf_release(struct pipe_inode_info *pipe,
 	struct smc_spd_priv *priv = (struct smc_spd_priv *)buf->private;
 	struct smc_sock *smc = priv->smc;
 	struct smc_connection *conn;
-	union smc_host_cursor cons;
 	struct sock *sk = &smc->sk;
 
 	if (sk->sk_state == SMC_CLOSED ||
@@ -79,9 +121,7 @@ static void smc_rx_pipe_buf_release(struct pipe_inode_info *pipe,
 		goto out;
 	conn = &smc->conn;
 	lock_sock(sk);
-	smc_curs_write(&cons, smc_curs_read(&conn->local_tx_ctrl.cons, conn),
-		       conn);
-	smc_rx_update_consumer(conn, cons, priv->len);
+	smc_rx_update_cons(smc, priv->len);
 	release_sock(sk);
 	if (atomic_sub_and_test(priv->len, &conn->splice_pending))
 		smc_rx_wake_up(sk);
@@ -184,6 +224,52 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo,
 	return rc;
 }
 
+static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len,
+			   int flags)
+{
+	struct smc_connection *conn = &smc->conn;
+	union smc_host_cursor cons;
+	struct sock *sk = &smc->sk;
+	int rc = 0;
+
+	if (sock_flag(sk, SOCK_URGINLINE) ||
+	    !(conn->urg_state == SMC_URG_VALID) ||
+	    conn->urg_state == SMC_URG_READ)
+		return -EINVAL;
+
+	if (conn->urg_state == SMC_URG_VALID) {
+		if (!(flags & MSG_PEEK))
+			smc->conn.urg_state = SMC_URG_READ;
+		msg->msg_flags |= MSG_OOB;
+		if (len > 0) {
+			if (!(flags & MSG_TRUNC))
+				rc = memcpy_to_msg(msg, &conn->urg_rx_byte, 1);
+			len = 1;
+			smc_curs_write(&cons,
+				       smc_curs_read(&conn->local_tx_ctrl.cons,
+						     conn),
+				       conn);
+			if (smc_curs_diff(conn->rmb_desc->len, &cons,
+					  &conn->urg_curs) > 1)
+				conn->urg_rx_skip_pend = true;
+			/* Urgent Byte was already accounted for, but trigger
+			 * skipping the urgent byte in non-inline case
+			 */
+			if (!(flags & MSG_PEEK))
+				smc_rx_update_consumer(smc, cons, 0);
+		} else {
+			msg->msg_flags |= MSG_TRUNC;
+		}
+
+		return rc ? -EFAULT : len;
+	}
+
+	if (sk->sk_state == SMC_CLOSED || sk->sk_shutdown & RCV_SHUTDOWN)
+		return 0;
+
+	return -EAGAIN;
+}
+
 /* smc_rx_recvmsg - receive data from RMBE
  * @msg:	copy data to receive buffer
  * @pipe:	copy data to pipe if set - indicates splice() call
@@ -209,12 +295,12 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
 
 	if (unlikely(flags & MSG_ERRQUEUE))
 		return -EINVAL; /* future work for sk.sk_family == AF_SMC */
-	if (flags & MSG_OOB)
-		return -EINVAL; /* future work */
 
 	sk = &smc->sk;
 	if (sk->sk_state == SMC_LISTEN)
 		return -ENOTCONN;
+	if (flags & MSG_OOB)
+		return smc_rx_recv_urg(smc, msg, len, flags);
 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
 
@@ -227,6 +313,9 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg,
 
 		if (atomic_read(&conn->bytes_to_rcv))
 			goto copy;
+		else if (conn->urg_state == SMC_URG_VALID)
+			/* we received a single urgent Byte - skip */
+			smc_rx_update_cons(smc, 0);
 
 		if (sk->sk_shutdown & RCV_SHUTDOWN ||
 		    smc_cdc_rxed_any_close_or_senddone(conn) ||
@@ -281,14 +370,18 @@ copy:
 			continue;
 		}
 
-		/* not more than what user space asked for */
-		copylen = min_t(size_t, read_remaining, readable);
 		smc_curs_write(&cons,
 			       smc_curs_read(&conn->local_tx_ctrl.cons, conn),
 			       conn);
 		/* subsequent splice() calls pick up where previous left */
 		if (splbytes)
 			smc_curs_add(conn->rmb_desc->len, &cons, splbytes);
+		if (conn->urg_state == SMC_URG_VALID &&
+		    sock_flag(&smc->sk, SOCK_URGINLINE) &&
+		    readable > 1)
+			readable--;	/* always stop at urgent Byte */
+		/* not more than what user space asked for */
+		copylen = min_t(size_t, read_remaining, readable);
 		/* determine chunks where to read from rcvbuf */
 		/* either unwrapped case, or 1st chunk of wrapped case */
 		chunk_len = min_t(size_t, copylen, conn->rmb_desc->len -
@@ -333,8 +426,8 @@ copy:
 			atomic_sub(copylen, &conn->bytes_to_rcv);
 			/* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */
 			smp_mb__after_atomic();
-			if (msg)
-				smc_rx_update_consumer(conn, cons, copylen);
+			if (msg && smc_rx_update_consumer(smc, cons, copylen))
+				goto out;
 		}
 	} while (read_remaining);
 out:
@@ -346,4 +439,5 @@ void smc_rx_init(struct smc_sock *smc)
 {
 	smc->sk.sk_data_ready = smc_rx_wake_up;
 	atomic_set(&smc->conn.splice_pending, 0);
+	smc->conn.urg_state = SMC_URG_READ;
 }
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 1f4a38b857f0..cee666400752 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -32,7 +32,7 @@
 /***************************** sndbuf producer *******************************/
 
 /* callback implementation for sk.sk_write_space()
- * to wakeup sndbuf producers that blocked with smc_tx_wait_memory().
+ * to wakeup sndbuf producers that blocked with smc_tx_wait().
  * called under sk_socket lock.
  */
 static void smc_tx_write_space(struct sock *sk)
@@ -56,7 +56,7 @@ static void smc_tx_write_space(struct sock *sk)
 	}
 }
 
-/* Wakeup sndbuf producers that blocked with smc_tx_wait_memory().
+/* Wakeup sndbuf producers that blocked with smc_tx_wait().
  * Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space().
  */
 void smc_tx_sndbuf_nonfull(struct smc_sock *smc)
@@ -66,8 +66,10 @@ void smc_tx_sndbuf_nonfull(struct smc_sock *smc)
 		smc->sk.sk_write_space(&smc->sk);
 }
 
-/* blocks sndbuf producer until at least one byte of free space available */
-static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
+/* blocks sndbuf producer until at least one byte of free space available
+ * or urgent Byte was consumed
+ */
+static int smc_tx_wait(struct smc_sock *smc, int flags)
 {
 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 	struct smc_connection *conn = &smc->conn;
@@ -103,14 +105,15 @@ static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
 			break;
 		}
 		sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
-		if (atomic_read(&conn->sndbuf_space))
-			break; /* at least 1 byte of free space available */
+		if (atomic_read(&conn->sndbuf_space) && !conn->urg_tx_pend)
+			break; /* at least 1 byte of free & no urgent data */
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 		sk_wait_event(sk, &timeo,
 			      sk->sk_err ||
 			      (sk->sk_shutdown & SEND_SHUTDOWN) ||
 			      smc_cdc_rxed_any_close(conn) ||
-			      atomic_read(&conn->sndbuf_space),
+			      (atomic_read(&conn->sndbuf_space) &&
+			       !conn->urg_tx_pend),
 			      &wait);
 	}
 	remove_wait_queue(sk_sleep(sk), &wait);
@@ -157,8 +160,11 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
 		if (smc_cdc_rxed_any_close(conn))
 			return send_done ?: -ECONNRESET;
 
-		if (!atomic_read(&conn->sndbuf_space)) {
-			rc = smc_tx_wait_memory(smc, msg->msg_flags);
+		if (msg->msg_flags & MSG_OOB)
+			conn->local_tx_ctrl.prod_flags.urg_data_pending = 1;
+
+		if (!atomic_read(&conn->sndbuf_space) || conn->urg_tx_pend) {
+			rc = smc_tx_wait(smc, msg->msg_flags);
 			if (rc) {
 				if (send_done)
 					return send_done;
@@ -168,7 +174,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
 		}
 
 		/* initialize variables for 1st iteration of subsequent loop */
-		/* could be just 1 byte, even after smc_tx_wait_memory above */
+		/* could be just 1 byte, even after smc_tx_wait above */
 		writespace = atomic_read(&conn->sndbuf_space);
 		/* not more than what user space asked for */
 		copylen = min_t(size_t, send_remaining, writespace);
@@ -218,6 +224,8 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
 		/* since we just produced more new data into sndbuf,
 		 * trigger sndbuf consumer: RDMA write into peer RMBE and CDC
 		 */
+		if ((msg->msg_flags & MSG_OOB) && !send_remaining)
+			conn->urg_tx_pend = true;
 		if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) &&
 		    (atomic_read(&conn->sndbuf_space) >
 						(conn->sndbuf_desc->len >> 1)))
@@ -299,6 +307,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
 	union smc_host_cursor sent, prep, prod, cons;
 	struct ib_sge sges[SMC_IB_MAX_SEND_SGE];
 	struct smc_link_group *lgr = conn->lgr;
+	struct smc_cdc_producer_flags *pflags;
 	int to_send, rmbespace;
 	struct smc_link *link;
 	dma_addr_t dma_addr;
@@ -326,7 +335,8 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
 		       conn);
 
 	/* if usable snd_wnd closes ask peer to advertise once it opens again */
-	conn->local_tx_ctrl.prod_flags.write_blocked = (to_send >= rmbespace);
+	pflags = &conn->local_tx_ctrl.prod_flags;
+	pflags->write_blocked = (to_send >= rmbespace);
 	/* cf. usable snd_wnd */
 	len = min(to_send, rmbespace);
 
@@ -391,6 +401,8 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
 		src_len_sum = src_len;
 	}
 
+	if (conn->urg_tx_pend && len == to_send)
+		pflags->urg_data_present = 1;
 	smc_tx_advance_cursors(conn, &prod, &sent, len);
 	/* update connection's cursors with advanced local cursors */
 	smc_curs_write(&conn->local_tx_ctrl.prod,
@@ -410,6 +422,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
  */
 int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
 {
+	struct smc_cdc_producer_flags *pflags;
 	struct smc_cdc_tx_pend *pend;
 	struct smc_wr_buf *wr_buf;
 	int rc;
@@ -433,14 +446,21 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
 		goto out_unlock;
 	}
 
-	rc = smc_tx_rdma_writes(conn);
-	if (rc) {
-		smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
-				   (struct smc_wr_tx_pend_priv *)pend);
-		goto out_unlock;
+	if (!conn->local_tx_ctrl.prod_flags.urg_data_present) {
+		rc = smc_tx_rdma_writes(conn);
+		if (rc) {
+			smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
+					   (struct smc_wr_tx_pend_priv *)pend);
+			goto out_unlock;
+		}
 	}
 
 	rc = smc_cdc_msg_send(conn, wr_buf, pend);
+	pflags = &conn->local_tx_ctrl.prod_flags;
+	if (!rc && pflags->urg_data_present) {
+		pflags->urg_data_pending = 0;
+		pflags->urg_data_present = 0;
+	}
 
 out_unlock:
 	spin_unlock_bh(&conn->send_lock);
@@ -473,7 +493,7 @@ out:
 	release_sock(&smc->sk);
 }
 
-void smc_tx_consumer_update(struct smc_connection *conn)
+void smc_tx_consumer_update(struct smc_connection *conn, bool force)
 {
 	union smc_host_cursor cfed, cons;
 	int to_confirm;
@@ -487,6 +507,7 @@ void smc_tx_consumer_update(struct smc_connection *conn)
 	to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons);
 
 	if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
+	    force ||
 	    ((to_confirm > conn->rmbe_update_limit) &&
 	     ((to_confirm > (conn->rmb_desc->len / 2)) ||
 	      conn->local_rx_ctrl.prod_flags.write_blocked))) {
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
index 44d077942976..9d2238909fa0 100644
--- a/net/smc/smc_tx.h
+++ b/net/smc/smc_tx.h
@@ -32,6 +32,6 @@ void smc_tx_init(struct smc_sock *smc);
 int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
 int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
 void smc_tx_sndbuf_nonfull(struct smc_sock *smc);
-void smc_tx_consumer_update(struct smc_connection *conn);
+void smc_tx_consumer_update(struct smc_connection *conn, bool force);
 
 #endif /* SMC_TX_H */
-- 
cgit v1.2.3-59-g8ed1b


From 7f58a1ad5a9c203f25aa1f8947dc34751778e820 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Wed, 23 May 2018 16:38:12 +0200
Subject: net/smc: longer delay when freeing client link groups

Client link group creation always follows the server linkgroup creation.
If peer creates a new server link group, client has to create a new
client link group. If peer reuses a server link group for a new
connection, client has to reuse its client link group as well. To
avoid out-of-sync conditions for link groups a longer delay for
for client link group removal is defined to make sure this link group
still exists, once the peer decides to reuse a server link group.

Currently the client link group delay time is just 10 jiffies larger
than the server link group delay time. This patch increases the delay
difference to 10 seconds to have a better protection against
out-of-sync link groups.

Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 2bf138e7d3ec..add82b0266f3 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -28,7 +28,7 @@
 
 #define SMC_LGR_NUM_INCR		256
 #define SMC_LGR_FREE_DELAY_SERV		(600 * HZ)
-#define SMC_LGR_FREE_DELAY_CLNT		(SMC_LGR_FREE_DELAY_SERV + 10)
+#define SMC_LGR_FREE_DELAY_CLNT		(SMC_LGR_FREE_DELAY_SERV + 10 * HZ)
 
 static struct smc_lgr_list smc_lgr_list = {	/* established link groups */
 	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
-- 
cgit v1.2.3-59-g8ed1b


From 6c2799c11e3681729f448706e6484cffad8a5633 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Wed, 23 May 2018 11:38:11 -0500
Subject: amd-xgbe: Fix debug output of max channel counts

A debug output print statement uses the wrong variable to output the
maximum Rx channel count (cut and paste error, basically).  Fix the
statement to use the proper variable.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/xgbe/xgbe-pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
index 82d1f416ee2a..7b63521a5f4d 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
@@ -344,7 +344,7 @@ static int xgbe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (netif_msg_probe(pdata)) {
 		dev_dbg(dev, "max tx/rx channel count = %u/%u\n",
 			pdata->tx_max_channel_count,
-			pdata->tx_max_channel_count);
+			pdata->rx_max_channel_count);
 		dev_dbg(dev, "max tx/rx hw queue count = %u/%u\n",
 			pdata->tx_max_q_count, pdata->rx_max_q_count);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From b93c3ab6006b379f2df238693b6e131cba9b37b3 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Wed, 23 May 2018 11:38:20 -0500
Subject: amd-xgbe: Read and save the port property registers during probe

Read and save the port property registers once during the device probe
and then use the saved values as they are needed.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/xgbe/xgbe-pci.c    | 34 +++++++++++----
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c | 68 ++++++++++++-----------------
 drivers/net/ethernet/amd/xgbe/xgbe.h        |  7 +++
 3 files changed, 62 insertions(+), 47 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
index 7b63521a5f4d..7b86240ecd5f 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
@@ -335,12 +335,29 @@ static int xgbe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	pdata->awcr = XGBE_DMA_PCI_AWCR;
 	pdata->awarcr = XGBE_DMA_PCI_AWARCR;
 
+	/* Read the port property registers */
+	pdata->pp0 = XP_IOREAD(pdata, XP_PROP_0);
+	pdata->pp1 = XP_IOREAD(pdata, XP_PROP_1);
+	pdata->pp2 = XP_IOREAD(pdata, XP_PROP_2);
+	pdata->pp3 = XP_IOREAD(pdata, XP_PROP_3);
+	pdata->pp4 = XP_IOREAD(pdata, XP_PROP_4);
+	if (netif_msg_probe(pdata)) {
+		dev_dbg(dev, "port property 0 = %#010x\n", pdata->pp0);
+		dev_dbg(dev, "port property 1 = %#010x\n", pdata->pp1);
+		dev_dbg(dev, "port property 2 = %#010x\n", pdata->pp2);
+		dev_dbg(dev, "port property 3 = %#010x\n", pdata->pp3);
+		dev_dbg(dev, "port property 4 = %#010x\n", pdata->pp4);
+	}
+
 	/* Set the maximum channels and queues */
-	reg = XP_IOREAD(pdata, XP_PROP_1);
-	pdata->tx_max_channel_count = XP_GET_BITS(reg, XP_PROP_1, MAX_TX_DMA);
-	pdata->rx_max_channel_count = XP_GET_BITS(reg, XP_PROP_1, MAX_RX_DMA);
-	pdata->tx_max_q_count = XP_GET_BITS(reg, XP_PROP_1, MAX_TX_QUEUES);
-	pdata->rx_max_q_count = XP_GET_BITS(reg, XP_PROP_1, MAX_RX_QUEUES);
+	pdata->tx_max_channel_count = XP_GET_BITS(pdata->pp1, XP_PROP_1,
+						  MAX_TX_DMA);
+	pdata->rx_max_channel_count = XP_GET_BITS(pdata->pp1, XP_PROP_1,
+						  MAX_RX_DMA);
+	pdata->tx_max_q_count = XP_GET_BITS(pdata->pp1, XP_PROP_1,
+					    MAX_TX_QUEUES);
+	pdata->rx_max_q_count = XP_GET_BITS(pdata->pp1, XP_PROP_1,
+					    MAX_RX_QUEUES);
 	if (netif_msg_probe(pdata)) {
 		dev_dbg(dev, "max tx/rx channel count = %u/%u\n",
 			pdata->tx_max_channel_count,
@@ -353,12 +370,13 @@ static int xgbe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	xgbe_set_counts(pdata);
 
 	/* Set the maximum fifo amounts */
-	reg = XP_IOREAD(pdata, XP_PROP_2);
-	pdata->tx_max_fifo_size = XP_GET_BITS(reg, XP_PROP_2, TX_FIFO_SIZE);
+	pdata->tx_max_fifo_size = XP_GET_BITS(pdata->pp2, XP_PROP_2,
+					      TX_FIFO_SIZE);
 	pdata->tx_max_fifo_size *= 16384;
 	pdata->tx_max_fifo_size = min(pdata->tx_max_fifo_size,
 				      pdata->vdata->tx_max_fifo_size);
-	pdata->rx_max_fifo_size = XP_GET_BITS(reg, XP_PROP_2, RX_FIFO_SIZE);
+	pdata->rx_max_fifo_size = XP_GET_BITS(pdata->pp2, XP_PROP_2,
+					      RX_FIFO_SIZE);
 	pdata->rx_max_fifo_size *= 16384;
 	pdata->rx_max_fifo_size = min(pdata->rx_max_fifo_size,
 				      pdata->vdata->rx_max_fifo_size);
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index aac884314000..123ceb0540d7 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -2421,22 +2421,21 @@ static int xgbe_phy_link_status(struct xgbe_prv_data *pdata, int *an_restart)
 static void xgbe_phy_sfp_gpio_setup(struct xgbe_prv_data *pdata)
 {
 	struct xgbe_phy_data *phy_data = pdata->phy_data;
-	unsigned int reg;
-
-	reg = XP_IOREAD(pdata, XP_PROP_3);
 
 	phy_data->sfp_gpio_address = XGBE_GPIO_ADDRESS_PCA9555 +
-				     XP_GET_BITS(reg, XP_PROP_3, GPIO_ADDR);
+				     XP_GET_BITS(pdata->pp3, XP_PROP_3,
+						 GPIO_ADDR);
 
-	phy_data->sfp_gpio_mask = XP_GET_BITS(reg, XP_PROP_3, GPIO_MASK);
+	phy_data->sfp_gpio_mask = XP_GET_BITS(pdata->pp3, XP_PROP_3,
+					      GPIO_MASK);
 
-	phy_data->sfp_gpio_rx_los = XP_GET_BITS(reg, XP_PROP_3,
+	phy_data->sfp_gpio_rx_los = XP_GET_BITS(pdata->pp3, XP_PROP_3,
 						GPIO_RX_LOS);
-	phy_data->sfp_gpio_tx_fault = XP_GET_BITS(reg, XP_PROP_3,
+	phy_data->sfp_gpio_tx_fault = XP_GET_BITS(pdata->pp3, XP_PROP_3,
 						  GPIO_TX_FAULT);
-	phy_data->sfp_gpio_mod_absent = XP_GET_BITS(reg, XP_PROP_3,
+	phy_data->sfp_gpio_mod_absent = XP_GET_BITS(pdata->pp3, XP_PROP_3,
 						    GPIO_MOD_ABS);
-	phy_data->sfp_gpio_rate_select = XP_GET_BITS(reg, XP_PROP_3,
+	phy_data->sfp_gpio_rate_select = XP_GET_BITS(pdata->pp3, XP_PROP_3,
 						     GPIO_RATE_SELECT);
 
 	if (netif_msg_probe(pdata)) {
@@ -2458,18 +2457,17 @@ static void xgbe_phy_sfp_gpio_setup(struct xgbe_prv_data *pdata)
 static void xgbe_phy_sfp_comm_setup(struct xgbe_prv_data *pdata)
 {
 	struct xgbe_phy_data *phy_data = pdata->phy_data;
-	unsigned int reg, mux_addr_hi, mux_addr_lo;
+	unsigned int mux_addr_hi, mux_addr_lo;
 
-	reg = XP_IOREAD(pdata, XP_PROP_4);
-
-	mux_addr_hi = XP_GET_BITS(reg, XP_PROP_4, MUX_ADDR_HI);
-	mux_addr_lo = XP_GET_BITS(reg, XP_PROP_4, MUX_ADDR_LO);
+	mux_addr_hi = XP_GET_BITS(pdata->pp4, XP_PROP_4, MUX_ADDR_HI);
+	mux_addr_lo = XP_GET_BITS(pdata->pp4, XP_PROP_4, MUX_ADDR_LO);
 	if (mux_addr_lo == XGBE_SFP_DIRECT)
 		return;
 
 	phy_data->sfp_comm = XGBE_SFP_COMM_PCA9545;
 	phy_data->sfp_mux_address = (mux_addr_hi << 2) + mux_addr_lo;
-	phy_data->sfp_mux_channel = XP_GET_BITS(reg, XP_PROP_4, MUX_CHAN);
+	phy_data->sfp_mux_channel = XP_GET_BITS(pdata->pp4, XP_PROP_4,
+						MUX_CHAN);
 
 	if (netif_msg_probe(pdata)) {
 		dev_dbg(pdata->dev, "SFP: mux_address=%#x\n",
@@ -2592,13 +2590,11 @@ static bool xgbe_phy_redrv_error(struct xgbe_phy_data *phy_data)
 static int xgbe_phy_mdio_reset_setup(struct xgbe_prv_data *pdata)
 {
 	struct xgbe_phy_data *phy_data = pdata->phy_data;
-	unsigned int reg;
 
 	if (phy_data->conn_type != XGBE_CONN_TYPE_MDIO)
 		return 0;
 
-	reg = XP_IOREAD(pdata, XP_PROP_3);
-	phy_data->mdio_reset = XP_GET_BITS(reg, XP_PROP_3, MDIO_RESET);
+	phy_data->mdio_reset = XP_GET_BITS(pdata->pp3, XP_PROP_3, MDIO_RESET);
 	switch (phy_data->mdio_reset) {
 	case XGBE_MDIO_RESET_NONE:
 	case XGBE_MDIO_RESET_I2C_GPIO:
@@ -2612,12 +2608,12 @@ static int xgbe_phy_mdio_reset_setup(struct xgbe_prv_data *pdata)
 
 	if (phy_data->mdio_reset == XGBE_MDIO_RESET_I2C_GPIO) {
 		phy_data->mdio_reset_addr = XGBE_GPIO_ADDRESS_PCA9555 +
-					    XP_GET_BITS(reg, XP_PROP_3,
+					    XP_GET_BITS(pdata->pp3, XP_PROP_3,
 							MDIO_RESET_I2C_ADDR);
-		phy_data->mdio_reset_gpio = XP_GET_BITS(reg, XP_PROP_3,
+		phy_data->mdio_reset_gpio = XP_GET_BITS(pdata->pp3, XP_PROP_3,
 							MDIO_RESET_I2C_GPIO);
 	} else if (phy_data->mdio_reset == XGBE_MDIO_RESET_INT_GPIO) {
-		phy_data->mdio_reset_gpio = XP_GET_BITS(reg, XP_PROP_3,
+		phy_data->mdio_reset_gpio = XP_GET_BITS(pdata->pp3, XP_PROP_3,
 							MDIO_RESET_INT_GPIO);
 	}
 
@@ -2707,12 +2703,9 @@ static bool xgbe_phy_conn_type_mismatch(struct xgbe_prv_data *pdata)
 
 static bool xgbe_phy_port_enabled(struct xgbe_prv_data *pdata)
 {
-	unsigned int reg;
-
-	reg = XP_IOREAD(pdata, XP_PROP_0);
-	if (!XP_GET_BITS(reg, XP_PROP_0, PORT_SPEEDS))
+	if (!XP_GET_BITS(pdata->pp0, XP_PROP_0, PORT_SPEEDS))
 		return false;
-	if (!XP_GET_BITS(reg, XP_PROP_0, CONN_TYPE))
+	if (!XP_GET_BITS(pdata->pp0, XP_PROP_0, CONN_TYPE))
 		return false;
 
 	return true;
@@ -2921,7 +2914,6 @@ static int xgbe_phy_init(struct xgbe_prv_data *pdata)
 	struct ethtool_link_ksettings *lks = &pdata->phy.lks;
 	struct xgbe_phy_data *phy_data;
 	struct mii_bus *mii;
-	unsigned int reg;
 	int ret;
 
 	/* Check if enabled */
@@ -2940,12 +2932,11 @@ static int xgbe_phy_init(struct xgbe_prv_data *pdata)
 		return -ENOMEM;
 	pdata->phy_data = phy_data;
 
-	reg = XP_IOREAD(pdata, XP_PROP_0);
-	phy_data->port_mode = XP_GET_BITS(reg, XP_PROP_0, PORT_MODE);
-	phy_data->port_id = XP_GET_BITS(reg, XP_PROP_0, PORT_ID);
-	phy_data->port_speeds = XP_GET_BITS(reg, XP_PROP_0, PORT_SPEEDS);
-	phy_data->conn_type = XP_GET_BITS(reg, XP_PROP_0, CONN_TYPE);
-	phy_data->mdio_addr = XP_GET_BITS(reg, XP_PROP_0, MDIO_ADDR);
+	phy_data->port_mode = XP_GET_BITS(pdata->pp0, XP_PROP_0, PORT_MODE);
+	phy_data->port_id = XP_GET_BITS(pdata->pp0, XP_PROP_0, PORT_ID);
+	phy_data->port_speeds = XP_GET_BITS(pdata->pp0, XP_PROP_0, PORT_SPEEDS);
+	phy_data->conn_type = XP_GET_BITS(pdata->pp0, XP_PROP_0, CONN_TYPE);
+	phy_data->mdio_addr = XP_GET_BITS(pdata->pp0, XP_PROP_0, MDIO_ADDR);
 	if (netif_msg_probe(pdata)) {
 		dev_dbg(pdata->dev, "port mode=%u\n", phy_data->port_mode);
 		dev_dbg(pdata->dev, "port id=%u\n", phy_data->port_id);
@@ -2954,12 +2945,11 @@ static int xgbe_phy_init(struct xgbe_prv_data *pdata)
 		dev_dbg(pdata->dev, "mdio addr=%u\n", phy_data->mdio_addr);
 	}
 
-	reg = XP_IOREAD(pdata, XP_PROP_4);
-	phy_data->redrv = XP_GET_BITS(reg, XP_PROP_4, REDRV_PRESENT);
-	phy_data->redrv_if = XP_GET_BITS(reg, XP_PROP_4, REDRV_IF);
-	phy_data->redrv_addr = XP_GET_BITS(reg, XP_PROP_4, REDRV_ADDR);
-	phy_data->redrv_lane = XP_GET_BITS(reg, XP_PROP_4, REDRV_LANE);
-	phy_data->redrv_model = XP_GET_BITS(reg, XP_PROP_4, REDRV_MODEL);
+	phy_data->redrv = XP_GET_BITS(pdata->pp4, XP_PROP_4, REDRV_PRESENT);
+	phy_data->redrv_if = XP_GET_BITS(pdata->pp4, XP_PROP_4, REDRV_IF);
+	phy_data->redrv_addr = XP_GET_BITS(pdata->pp4, XP_PROP_4, REDRV_ADDR);
+	phy_data->redrv_lane = XP_GET_BITS(pdata->pp4, XP_PROP_4, REDRV_LANE);
+	phy_data->redrv_model = XP_GET_BITS(pdata->pp4, XP_PROP_4, REDRV_MODEL);
 	if (phy_data->redrv && netif_msg_probe(pdata)) {
 		dev_dbg(pdata->dev, "redrv present\n");
 		dev_dbg(pdata->dev, "redrv i/f=%u\n", phy_data->redrv_if);
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h
index 95d4b56448c6..54e43ad32d26 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe.h
@@ -1027,6 +1027,13 @@ struct xgbe_prv_data {
 	void __iomem *xprop_regs;	/* XGBE property registers */
 	void __iomem *xi2c_regs;	/* XGBE I2C CSRs */
 
+	/* Port property registers */
+	unsigned int pp0;
+	unsigned int pp1;
+	unsigned int pp2;
+	unsigned int pp3;
+	unsigned int pp4;
+
 	/* Overall device lock */
 	spinlock_t lock;
 
-- 
cgit v1.2.3-59-g8ed1b


From 0d2b5255ea7b69f2270e41130ee25346b1f68b7b Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Wed, 23 May 2018 11:38:29 -0500
Subject: amd-xgbe: Remove use of comm_owned field

The comm_owned field can hide logic where double locking is attempted
and prevent multiple threads for the same device from accessing the
mutex properly.  Remove the comm_owned field and use the mutex API
exclusively for gaining ownership.  The current driver has been audited
and is obtaining communications ownership properly.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index 123ceb0540d7..05003be89ff5 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -327,8 +327,6 @@ struct xgbe_phy_data {
 
 	unsigned int mdio_addr;
 
-	unsigned int comm_owned;
-
 	/* SFP Support */
 	enum xgbe_sfp_comm sfp_comm;
 	unsigned int sfp_mux_address;
@@ -382,12 +380,6 @@ static enum xgbe_an_mode xgbe_phy_an_mode(struct xgbe_prv_data *pdata);
 static int xgbe_phy_i2c_xfer(struct xgbe_prv_data *pdata,
 			     struct xgbe_i2c_op *i2c_op)
 {
-	struct xgbe_phy_data *phy_data = pdata->phy_data;
-
-	/* Be sure we own the bus */
-	if (WARN_ON(!phy_data->comm_owned))
-		return -EIO;
-
 	return pdata->i2c_if.i2c_xfer(pdata, i2c_op);
 }
 
@@ -549,10 +541,6 @@ static int xgbe_phy_sfp_get_mux(struct xgbe_prv_data *pdata)
 
 static void xgbe_phy_put_comm_ownership(struct xgbe_prv_data *pdata)
 {
-	struct xgbe_phy_data *phy_data = pdata->phy_data;
-
-	phy_data->comm_owned = 0;
-
 	mutex_unlock(&xgbe_phy_comm_lock);
 }
 
@@ -562,9 +550,6 @@ static int xgbe_phy_get_comm_ownership(struct xgbe_prv_data *pdata)
 	unsigned long timeout;
 	unsigned int mutex_id;
 
-	if (phy_data->comm_owned)
-		return 0;
-
 	/* The I2C and MDIO/GPIO bus is multiplexed between multiple devices,
 	 * the driver needs to take the software mutex and then the hardware
 	 * mutexes before being able to use the busses.
@@ -593,7 +578,6 @@ static int xgbe_phy_get_comm_ownership(struct xgbe_prv_data *pdata)
 		XP_IOWRITE(pdata, XP_I2C_MUTEX, mutex_id);
 		XP_IOWRITE(pdata, XP_MDIO_MUTEX, mutex_id);
 
-		phy_data->comm_owned = 1;
 		return 0;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 67cea0c92264a657230ba14e1956c34132ca6161 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Wed, 23 May 2018 11:38:38 -0500
Subject: amd-xgbe: Remove field that indicates SFP diagnostic support

The driver currently sets an indication of whether the SFP supports, and
that the driver can obtain, diagnostics data.  This isn't currently used
by the driver and the logic to set this indicator is flawed because the
field is cleared each time the SFP is checked and only set when a new SFP
is detected.  Remove this field and the logic supporting it.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index 05003be89ff5..cb15cafd5209 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -343,7 +343,6 @@ struct xgbe_phy_data {
 	unsigned int sfp_rx_los;
 	unsigned int sfp_tx_fault;
 	unsigned int sfp_mod_absent;
-	unsigned int sfp_diags;
 	unsigned int sfp_changed;
 	unsigned int sfp_phy_avail;
 	unsigned int sfp_cable_len;
@@ -1211,13 +1210,6 @@ static int xgbe_phy_sfp_read_eeprom(struct xgbe_prv_data *pdata)
 
 		memcpy(&phy_data->sfp_eeprom, &sfp_eeprom, sizeof(sfp_eeprom));
 
-		if (sfp_eeprom.extd[XGBE_SFP_EXTD_SFF_8472]) {
-			u8 diag_type = sfp_eeprom.extd[XGBE_SFP_EXTD_DIAG];
-
-			if (!(diag_type & XGBE_SFP_EXTD_DIAG_ADDR_CHANGE))
-				phy_data->sfp_diags = 1;
-		}
-
 		xgbe_phy_free_phy_device(pdata);
 	} else {
 		phy_data->sfp_changed = 0;
@@ -1267,7 +1259,6 @@ static void xgbe_phy_sfp_reset(struct xgbe_phy_data *phy_data)
 	phy_data->sfp_rx_los = 0;
 	phy_data->sfp_tx_fault = 0;
 	phy_data->sfp_mod_absent = 1;
-	phy_data->sfp_diags = 0;
 	phy_data->sfp_base = XGBE_SFP_BASE_UNKNOWN;
 	phy_data->sfp_cable = XGBE_SFP_CABLE_UNKNOWN;
 	phy_data->sfp_speed = XGBE_SFP_SPEED_UNKNOWN;
-- 
cgit v1.2.3-59-g8ed1b


From 53a1024abf3e4cb3310a54b08c48323649158c13 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Wed, 23 May 2018 11:38:46 -0500
Subject: amd-xgbe: Add ethtool support to retrieve SFP module info

Add support to get SFP module information using ethtool.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c |  18 ++++
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c    |  21 ++++
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c  | 137 +++++++++++++++++++++++++++
 drivers/net/ethernet/amd/xgbe/xgbe.h         |  13 +++
 4 files changed, 189 insertions(+)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
index ff397bb25042..57394b77b2d3 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
@@ -626,6 +626,22 @@ static int xgbe_get_ts_info(struct net_device *netdev,
 	return 0;
 }
 
+static int xgbe_get_module_info(struct net_device *netdev,
+				struct ethtool_modinfo *modinfo)
+{
+	struct xgbe_prv_data *pdata = netdev_priv(netdev);
+
+	return pdata->phy_if.module_info(pdata, modinfo);
+}
+
+static int xgbe_get_module_eeprom(struct net_device *netdev,
+				  struct ethtool_eeprom *eeprom, u8 *data)
+{
+	struct xgbe_prv_data *pdata = netdev_priv(netdev);
+
+	return pdata->phy_if.module_eeprom(pdata, eeprom, data);
+}
+
 static const struct ethtool_ops xgbe_ethtool_ops = {
 	.get_drvinfo = xgbe_get_drvinfo,
 	.get_msglevel = xgbe_get_msglevel,
@@ -646,6 +662,8 @@ static const struct ethtool_ops xgbe_ethtool_ops = {
 	.get_ts_info = xgbe_get_ts_info,
 	.get_link_ksettings = xgbe_get_link_ksettings,
 	.set_link_ksettings = xgbe_set_link_ksettings,
+	.get_module_info = xgbe_get_module_info,
+	.get_module_eeprom = xgbe_get_module_eeprom,
 };
 
 const struct ethtool_ops *xgbe_get_ethtool_ops(void)
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
index 1b45cd73a258..9c39c72d25ab 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
@@ -126,6 +126,24 @@
 #include "xgbe.h"
 #include "xgbe-common.h"
 
+static int xgbe_phy_module_eeprom(struct xgbe_prv_data *pdata,
+				  struct ethtool_eeprom *eeprom, u8 *data)
+{
+	if (!pdata->phy_if.phy_impl.module_eeprom)
+		return -ENXIO;
+
+	return pdata->phy_if.phy_impl.module_eeprom(pdata, eeprom, data);
+}
+
+static int xgbe_phy_module_info(struct xgbe_prv_data *pdata,
+				struct ethtool_modinfo *modinfo)
+{
+	if (!pdata->phy_if.phy_impl.module_info)
+		return -ENXIO;
+
+	return pdata->phy_if.phy_impl.module_info(pdata, modinfo);
+}
+
 static void xgbe_an37_clear_interrupts(struct xgbe_prv_data *pdata)
 {
 	int reg;
@@ -1639,4 +1657,7 @@ void xgbe_init_function_ptrs_phy(struct xgbe_phy_if *phy_if)
 	phy_if->phy_valid_speed = xgbe_phy_valid_speed;
 
 	phy_if->an_isr          = xgbe_an_combined_isr;
+
+	phy_if->module_info     = xgbe_phy_module_info;
+	phy_if->module_eeprom   = xgbe_phy_module_eeprom;
 }
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index cb15cafd5209..141bb13dda26 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -119,6 +119,7 @@
 #include <linux/kmod.h>
 #include <linux/mdio.h>
 #include <linux/phy.h>
+#include <linux/ethtool.h>
 
 #include "xgbe.h"
 #include "xgbe-common.h"
@@ -270,6 +271,15 @@ struct xgbe_sfp_eeprom {
 	u8 vendor[32];
 };
 
+#define XGBE_SFP_DIAGS_SUPPORTED(_x)			\
+	((_x)->extd[XGBE_SFP_EXTD_SFF_8472] &&		\
+	 !((_x)->extd[XGBE_SFP_EXTD_DIAG] & XGBE_SFP_EXTD_DIAG_ADDR_CHANGE))
+
+#define XGBE_SFP_EEPROM_BASE_LEN	256
+#define XGBE_SFP_EEPROM_DIAG_LEN	256
+#define XGBE_SFP_EEPROM_MAX		(XGBE_SFP_EEPROM_BASE_LEN +	\
+					 XGBE_SFP_EEPROM_DIAG_LEN)
+
 #define XGBE_BEL_FUSE_VENDOR	"BEL-FUSE        "
 #define XGBE_BEL_FUSE_PARTNO	"1GBT-SFP06      "
 
@@ -1301,6 +1311,130 @@ put:
 	xgbe_phy_put_comm_ownership(pdata);
 }
 
+static int xgbe_phy_module_eeprom(struct xgbe_prv_data *pdata,
+				  struct ethtool_eeprom *eeprom, u8 *data)
+{
+	struct xgbe_phy_data *phy_data = pdata->phy_data;
+	u8 eeprom_addr, eeprom_data[XGBE_SFP_EEPROM_MAX];
+	struct xgbe_sfp_eeprom *sfp_eeprom;
+	unsigned int i, j, rem;
+	int ret;
+
+	rem = eeprom->len;
+
+	if (!eeprom->len) {
+		ret = -EINVAL;
+		goto done;
+	}
+
+	if ((eeprom->offset + eeprom->len) > XGBE_SFP_EEPROM_MAX) {
+		ret = -EINVAL;
+		goto done;
+	}
+
+	if (phy_data->port_mode != XGBE_PORT_MODE_SFP) {
+		ret = -ENXIO;
+		goto done;
+	}
+
+	if (!netif_running(pdata->netdev)) {
+		ret = -EIO;
+		goto done;
+	}
+
+	if (phy_data->sfp_mod_absent) {
+		ret = -EIO;
+		goto done;
+	}
+
+	ret = xgbe_phy_get_comm_ownership(pdata);
+	if (ret) {
+		ret = -EIO;
+		goto done;
+	}
+
+	ret = xgbe_phy_sfp_get_mux(pdata);
+	if (ret) {
+		netdev_err(pdata->netdev, "I2C error setting SFP MUX\n");
+		ret = -EIO;
+		goto put_own;
+	}
+
+	/* Read the SFP serial ID eeprom */
+	eeprom_addr = 0;
+	ret = xgbe_phy_i2c_read(pdata, XGBE_SFP_SERIAL_ID_ADDRESS,
+				&eeprom_addr, sizeof(eeprom_addr),
+				eeprom_data, XGBE_SFP_EEPROM_BASE_LEN);
+	if (ret) {
+		netdev_err(pdata->netdev,
+			   "I2C error reading SFP EEPROM\n");
+		ret = -EIO;
+		goto put_mux;
+	}
+
+	sfp_eeprom = (struct xgbe_sfp_eeprom *)eeprom_data;
+
+	if (XGBE_SFP_DIAGS_SUPPORTED(sfp_eeprom)) {
+		/* Read the SFP diagnostic eeprom */
+		eeprom_addr = 0;
+		ret = xgbe_phy_i2c_read(pdata, XGBE_SFP_DIAG_INFO_ADDRESS,
+					&eeprom_addr, sizeof(eeprom_addr),
+					eeprom_data + XGBE_SFP_EEPROM_BASE_LEN,
+					XGBE_SFP_EEPROM_DIAG_LEN);
+		if (ret) {
+			netdev_err(pdata->netdev,
+				   "I2C error reading SFP DIAGS\n");
+			ret = -EIO;
+			goto put_mux;
+		}
+	}
+
+	for (i = 0, j = eeprom->offset; i < eeprom->len; i++, j++) {
+		if ((j >= XGBE_SFP_EEPROM_BASE_LEN) &&
+		    !XGBE_SFP_DIAGS_SUPPORTED(sfp_eeprom))
+			break;
+
+		data[i] = eeprom_data[j];
+		rem--;
+	}
+
+put_mux:
+	xgbe_phy_sfp_put_mux(pdata);
+
+put_own:
+	xgbe_phy_put_comm_ownership(pdata);
+
+done:
+	eeprom->len -= rem;
+
+	return ret;
+}
+
+static int xgbe_phy_module_info(struct xgbe_prv_data *pdata,
+				struct ethtool_modinfo *modinfo)
+{
+	struct xgbe_phy_data *phy_data = pdata->phy_data;
+
+	if (phy_data->port_mode != XGBE_PORT_MODE_SFP)
+		return -ENXIO;
+
+	if (!netif_running(pdata->netdev))
+		return -EIO;
+
+	if (phy_data->sfp_mod_absent)
+		return -EIO;
+
+	if (XGBE_SFP_DIAGS_SUPPORTED(&phy_data->sfp_eeprom)) {
+		modinfo->type = ETH_MODULE_SFF_8472;
+		modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN;
+	} else {
+		modinfo->type = ETH_MODULE_SFF_8079;
+		modinfo->eeprom_len = ETH_MODULE_SFF_8079_LEN;
+	}
+
+	return 0;
+}
+
 static void xgbe_phy_phydev_flowctrl(struct xgbe_prv_data *pdata)
 {
 	struct ethtool_link_ksettings *lks = &pdata->phy.lks;
@@ -3196,4 +3330,7 @@ void xgbe_init_function_ptrs_phy_v2(struct xgbe_phy_if *phy_if)
 
 	phy_impl->kr_training_pre	= xgbe_phy_kr_training_pre;
 	phy_impl->kr_training_post	= xgbe_phy_kr_training_post;
+
+	phy_impl->module_info		= xgbe_phy_module_info;
+	phy_impl->module_eeprom		= xgbe_phy_module_eeprom;
 }
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h
index 54e43ad32d26..f0f455b79ff9 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe.h
@@ -835,6 +835,7 @@ struct xgbe_hw_if {
  *   Optional routines:
  *     an_pre, an_post
  *     kr_training_pre, kr_training_post
+ *     module_info, module_eeprom
  */
 struct xgbe_phy_impl_if {
 	/* Perform Setup/teardown actions */
@@ -883,6 +884,12 @@ struct xgbe_phy_impl_if {
 	/* Pre/Post KR training enablement support */
 	void (*kr_training_pre)(struct xgbe_prv_data *);
 	void (*kr_training_post)(struct xgbe_prv_data *);
+
+	/* SFP module related info */
+	int (*module_info)(struct xgbe_prv_data *pdata,
+			   struct ethtool_modinfo *modinfo);
+	int (*module_eeprom)(struct xgbe_prv_data *pdata,
+			     struct ethtool_eeprom *eeprom, u8 *data);
 };
 
 struct xgbe_phy_if {
@@ -905,6 +912,12 @@ struct xgbe_phy_if {
 	/* For single interrupt support */
 	irqreturn_t (*an_isr)(struct xgbe_prv_data *);
 
+	/* For ethtool PHY support */
+	int (*module_info)(struct xgbe_prv_data *pdata,
+			   struct ethtool_modinfo *modinfo);
+	int (*module_eeprom)(struct xgbe_prv_data *pdata,
+			     struct ethtool_eeprom *eeprom, u8 *data);
+
 	/* PHY implementation specific services */
 	struct xgbe_phy_impl_if phy_impl;
 };
-- 
cgit v1.2.3-59-g8ed1b


From bab748de986d786cbbef31d550bea3bc616304cb Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Wed, 23 May 2018 11:38:56 -0500
Subject: amd-xgbe: Add ethtool show/set ring parameter support

Add ethtool support to show and set the number of the Rx and Tx ring
descriptors.  Changing the ring configuration will result in a device
restart.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c     |  6 +--
 drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c | 65 ++++++++++++++++++++++++++++
 drivers/net/ethernet/amd/xgbe/xgbe.h         |  6 +++
 3 files changed, 72 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 7c204f05b418..2646c088452c 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -1426,10 +1426,8 @@ static void xgbe_stopdev(struct work_struct *work)
 	netdev_alert(pdata->netdev, "device stopped\n");
 }
 
-static void xgbe_restart_dev(struct xgbe_prv_data *pdata)
+void xgbe_restart_dev(struct xgbe_prv_data *pdata)
 {
-	DBGPR("-->xgbe_restart_dev\n");
-
 	/* If not running, "restart" will happen on open */
 	if (!netif_running(pdata->netdev))
 		return;
@@ -1440,8 +1438,6 @@ static void xgbe_restart_dev(struct xgbe_prv_data *pdata)
 	xgbe_free_rx_data(pdata);
 
 	xgbe_start(pdata);
-
-	DBGPR("<--xgbe_restart_dev\n");
 }
 
 static void xgbe_restart(struct work_struct *work)
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
index 57394b77b2d3..d12f982d73b4 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
@@ -642,6 +642,69 @@ static int xgbe_get_module_eeprom(struct net_device *netdev,
 	return pdata->phy_if.module_eeprom(pdata, eeprom, data);
 }
 
+static void xgbe_get_ringparam(struct net_device *netdev,
+			       struct ethtool_ringparam *ringparam)
+{
+	struct xgbe_prv_data *pdata = netdev_priv(netdev);
+
+	ringparam->rx_max_pending = XGBE_RX_DESC_CNT_MAX;
+	ringparam->tx_max_pending = XGBE_TX_DESC_CNT_MAX;
+	ringparam->rx_pending = pdata->rx_desc_count;
+	ringparam->tx_pending = pdata->tx_desc_count;
+}
+
+static int xgbe_set_ringparam(struct net_device *netdev,
+			      struct ethtool_ringparam *ringparam)
+{
+	struct xgbe_prv_data *pdata = netdev_priv(netdev);
+	unsigned int rx, tx;
+
+	if (ringparam->rx_mini_pending || ringparam->rx_jumbo_pending) {
+		netdev_err(netdev, "unsupported ring parameter\n");
+		return -EINVAL;
+	}
+
+	if ((ringparam->rx_pending < XGBE_RX_DESC_CNT_MIN) ||
+	    (ringparam->rx_pending > XGBE_RX_DESC_CNT_MAX)) {
+		netdev_err(netdev,
+			   "rx ring parameter must be between %u and %u\n",
+			   XGBE_RX_DESC_CNT_MIN, XGBE_RX_DESC_CNT_MAX);
+		return -EINVAL;
+	}
+
+	if ((ringparam->tx_pending < XGBE_TX_DESC_CNT_MIN) ||
+	    (ringparam->tx_pending > XGBE_TX_DESC_CNT_MAX)) {
+		netdev_err(netdev,
+			   "tx ring parameter must be between %u and %u\n",
+			   XGBE_TX_DESC_CNT_MIN, XGBE_TX_DESC_CNT_MAX);
+		return -EINVAL;
+	}
+
+	rx = __rounddown_pow_of_two(ringparam->rx_pending);
+	if (rx != ringparam->rx_pending)
+		netdev_notice(netdev,
+			      "rx ring parameter rounded to power of two: %u\n",
+			      rx);
+
+	tx = __rounddown_pow_of_two(ringparam->tx_pending);
+	if (tx != ringparam->tx_pending)
+		netdev_notice(netdev,
+			      "tx ring parameter rounded to power of two: %u\n",
+			      tx);
+
+	if ((rx == pdata->rx_desc_count) &&
+	    (tx == pdata->tx_desc_count))
+		goto out;
+
+	pdata->rx_desc_count = rx;
+	pdata->tx_desc_count = tx;
+
+	xgbe_restart_dev(pdata);
+
+out:
+	return 0;
+}
+
 static const struct ethtool_ops xgbe_ethtool_ops = {
 	.get_drvinfo = xgbe_get_drvinfo,
 	.get_msglevel = xgbe_get_msglevel,
@@ -664,6 +727,8 @@ static const struct ethtool_ops xgbe_ethtool_ops = {
 	.set_link_ksettings = xgbe_set_link_ksettings,
 	.get_module_info = xgbe_get_module_info,
 	.get_module_eeprom = xgbe_get_module_eeprom,
+	.get_ringparam = xgbe_get_ringparam,
+	.set_ringparam = xgbe_set_ringparam,
 };
 
 const struct ethtool_ops *xgbe_get_ethtool_ops(void)
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h
index f0f455b79ff9..7dc0fac560a3 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe.h
@@ -144,6 +144,11 @@
 #define XGBE_TX_DESC_MAX_PROC	(XGBE_TX_DESC_CNT >> 1)
 #define XGBE_RX_DESC_CNT	512
 
+#define XGBE_TX_DESC_CNT_MIN	64
+#define XGBE_TX_DESC_CNT_MAX	4096
+#define XGBE_RX_DESC_CNT_MIN	64
+#define XGBE_RX_DESC_CNT_MAX	4096
+
 #define XGBE_TX_MAX_BUF_SIZE	(0x3fff & ~(64 - 1))
 
 /* Descriptors required for maximum contiguous TSO/GSO packet */
@@ -1330,6 +1335,7 @@ int xgbe_powerup(struct net_device *, unsigned int);
 int xgbe_powerdown(struct net_device *, unsigned int);
 void xgbe_init_rx_coalesce(struct xgbe_prv_data *);
 void xgbe_init_tx_coalesce(struct xgbe_prv_data *);
+void xgbe_restart_dev(struct xgbe_prv_data *pdata);
 
 #ifdef CONFIG_DEBUG_FS
 void xgbe_debugfs_init(struct xgbe_prv_data *);
-- 
cgit v1.2.3-59-g8ed1b


From 2244753409f5bc3e2eae4e2ec6f4ced239993f33 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Wed, 23 May 2018 11:39:04 -0500
Subject: amd-xgbe: Prepare for ethtool set-channel support

In order to support being able to dynamically set/change the number of
Rx and Tx channels, update the code to:
 - Move alloc and free of device memory into callable functions
 - Move setting of the real number of Rx and Tx channels to device startup
 - Move mapping of the RSS channels to device startup

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c  | 108 ++++++++++++++++++------------
 drivers/net/ethernet/amd/xgbe/xgbe-main.c |  20 +-----
 2 files changed, 68 insertions(+), 60 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 2646c088452c..397e3a0e1d67 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -1312,14 +1312,72 @@ int xgbe_powerup(struct net_device *netdev, unsigned int caller)
 	return 0;
 }
 
+static void xgbe_free_memory(struct xgbe_prv_data *pdata)
+{
+	struct xgbe_desc_if *desc_if = &pdata->desc_if;
+
+	/* Free the ring descriptors and buffers */
+	desc_if->free_ring_resources(pdata);
+
+	/* Free the channel and ring structures */
+	xgbe_free_channels(pdata);
+}
+
+static int xgbe_alloc_memory(struct xgbe_prv_data *pdata)
+{
+	struct xgbe_desc_if *desc_if = &pdata->desc_if;
+	struct net_device *netdev = pdata->netdev;
+	int ret;
+
+	/* Calculate the Rx buffer size before allocating rings */
+	pdata->rx_buf_size = xgbe_calc_rx_buf_size(netdev, netdev->mtu);
+
+	/* Allocate the channel and ring structures */
+	ret = xgbe_alloc_channels(pdata);
+	if (ret)
+		return ret;
+
+	/* Allocate the ring descriptors and buffers */
+	ret = desc_if->alloc_ring_resources(pdata);
+	if (ret)
+		goto err_channels;
+
+	/* Initialize the service and Tx timers */
+	xgbe_init_timers(pdata);
+
+	return 0;
+
+err_channels:
+	xgbe_free_memory(pdata);
+
+	return ret;
+}
+
 static int xgbe_start(struct xgbe_prv_data *pdata)
 {
 	struct xgbe_hw_if *hw_if = &pdata->hw_if;
 	struct xgbe_phy_if *phy_if = &pdata->phy_if;
 	struct net_device *netdev = pdata->netdev;
+	unsigned int i;
 	int ret;
 
-	DBGPR("-->xgbe_start\n");
+	/* Set the number of queues */
+	ret = netif_set_real_num_tx_queues(netdev, pdata->tx_ring_count);
+	if (ret) {
+		netdev_err(netdev, "error setting real tx queue count\n");
+		return ret;
+	}
+
+	ret = netif_set_real_num_rx_queues(netdev, pdata->rx_ring_count);
+	if (ret) {
+		netdev_err(netdev, "error setting real rx queue count\n");
+		return ret;
+	}
+
+	/* Set RSS lookup table data for programming */
+	for (i = 0; i < XGBE_RSS_MAX_TABLE_SIZE; i++)
+		XGMAC_SET_BITS(pdata->rss_table[i], MAC_RSSDR, DMCH,
+			       i % pdata->rx_ring_count);
 
 	ret = hw_if->init(pdata);
 	if (ret)
@@ -1347,8 +1405,6 @@ static int xgbe_start(struct xgbe_prv_data *pdata)
 
 	clear_bit(XGBE_STOPPED, &pdata->dev_state);
 
-	DBGPR("<--xgbe_start\n");
-
 	return 0;
 
 err_irqs:
@@ -1823,11 +1879,8 @@ static void xgbe_packet_info(struct xgbe_prv_data *pdata,
 static int xgbe_open(struct net_device *netdev)
 {
 	struct xgbe_prv_data *pdata = netdev_priv(netdev);
-	struct xgbe_desc_if *desc_if = &pdata->desc_if;
 	int ret;
 
-	DBGPR("-->xgbe_open\n");
-
 	/* Create the various names based on netdev name */
 	snprintf(pdata->an_name, sizeof(pdata->an_name) - 1, "%s-pcs",
 		 netdev_name(netdev));
@@ -1872,43 +1925,25 @@ static int xgbe_open(struct net_device *netdev)
 		goto err_sysclk;
 	}
 
-	/* Calculate the Rx buffer size before allocating rings */
-	ret = xgbe_calc_rx_buf_size(netdev, netdev->mtu);
-	if (ret < 0)
-		goto err_ptpclk;
-	pdata->rx_buf_size = ret;
-
-	/* Allocate the channel and ring structures */
-	ret = xgbe_alloc_channels(pdata);
-	if (ret)
-		goto err_ptpclk;
-
-	/* Allocate the ring descriptors and buffers */
-	ret = desc_if->alloc_ring_resources(pdata);
-	if (ret)
-		goto err_channels;
-
 	INIT_WORK(&pdata->service_work, xgbe_service);
 	INIT_WORK(&pdata->restart_work, xgbe_restart);
 	INIT_WORK(&pdata->stopdev_work, xgbe_stopdev);
 	INIT_WORK(&pdata->tx_tstamp_work, xgbe_tx_tstamp);
-	xgbe_init_timers(pdata);
+
+	ret = xgbe_alloc_memory(pdata);
+	if (ret)
+		goto err_ptpclk;
 
 	ret = xgbe_start(pdata);
 	if (ret)
-		goto err_rings;
+		goto err_mem;
 
 	clear_bit(XGBE_DOWN, &pdata->dev_state);
 
-	DBGPR("<--xgbe_open\n");
-
 	return 0;
 
-err_rings:
-	desc_if->free_ring_resources(pdata);
-
-err_channels:
-	xgbe_free_channels(pdata);
+err_mem:
+	xgbe_free_memory(pdata);
 
 err_ptpclk:
 	clk_disable_unprepare(pdata->ptpclk);
@@ -1928,18 +1963,11 @@ err_dev_wq:
 static int xgbe_close(struct net_device *netdev)
 {
 	struct xgbe_prv_data *pdata = netdev_priv(netdev);
-	struct xgbe_desc_if *desc_if = &pdata->desc_if;
-
-	DBGPR("-->xgbe_close\n");
 
 	/* Stop the device */
 	xgbe_stop(pdata);
 
-	/* Free the ring descriptors and buffers */
-	desc_if->free_ring_resources(pdata);
-
-	/* Free the channel and ring structures */
-	xgbe_free_channels(pdata);
+	xgbe_free_memory(pdata);
 
 	/* Disable the clocks */
 	clk_disable_unprepare(pdata->ptpclk);
@@ -1953,8 +1981,6 @@ static int xgbe_close(struct net_device *netdev)
 
 	set_bit(XGBE_DOWN, &pdata->dev_state);
 
-	DBGPR("<--xgbe_close\n");
-
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-main.c b/drivers/net/ethernet/amd/xgbe/xgbe-main.c
index 441d0973957b..b41f23679a08 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-main.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-main.c
@@ -265,7 +265,6 @@ int xgbe_config_netdev(struct xgbe_prv_data *pdata)
 {
 	struct net_device *netdev = pdata->netdev;
 	struct device *dev = pdata->dev;
-	unsigned int i;
 	int ret;
 
 	netdev->irq = pdata->dev_irq;
@@ -324,26 +323,9 @@ int xgbe_config_netdev(struct xgbe_prv_data *pdata)
 				pdata->tx_ring_count, pdata->rx_ring_count);
 	}
 
-	/* Set the number of queues */
-	ret = netif_set_real_num_tx_queues(netdev, pdata->tx_ring_count);
-	if (ret) {
-		dev_err(dev, "error setting real tx queue count\n");
-		return ret;
-	}
-
-	ret = netif_set_real_num_rx_queues(netdev, pdata->rx_ring_count);
-	if (ret) {
-		dev_err(dev, "error setting real rx queue count\n");
-		return ret;
-	}
-
-	/* Initialize RSS hash key and lookup table */
+	/* Initialize RSS hash key */
 	netdev_rss_key_fill(pdata->rss_key, sizeof(pdata->rss_key));
 
-	for (i = 0; i < XGBE_RSS_MAX_TABLE_SIZE; i++)
-		XGMAC_SET_BITS(pdata->rss_table[i], MAC_RSSDR, DMCH,
-			       i % pdata->rx_ring_count);
-
 	XGMAC_SET_BITS(pdata->rss_options, MAC_RSSCR, IP2TE, 1);
 	XGMAC_SET_BITS(pdata->rss_options, MAC_RSSCR, TCP4TE, 1);
 	XGMAC_SET_BITS(pdata->rss_options, MAC_RSSCR, UDP4TE, 1);
-- 
cgit v1.2.3-59-g8ed1b


From 01b5277fc9984d9fb9156afa0b1be70b3b475825 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Wed, 23 May 2018 11:39:13 -0500
Subject: amd-xgbe: Add ethtool show/set channels support

Add ethtool support to show and set the device channel configuration.
Changing the channel configuration will result in a device restart.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c     |  25 +++++
 drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c | 134 +++++++++++++++++++++++++++
 drivers/net/ethernet/amd/xgbe/xgbe.h         |   4 +
 3 files changed, 163 insertions(+)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 397e3a0e1d67..24f1053b8785 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -1329,6 +1329,17 @@ static int xgbe_alloc_memory(struct xgbe_prv_data *pdata)
 	struct net_device *netdev = pdata->netdev;
 	int ret;
 
+	if (pdata->new_tx_ring_count) {
+		pdata->tx_ring_count = pdata->new_tx_ring_count;
+		pdata->tx_q_count = pdata->tx_ring_count;
+		pdata->new_tx_ring_count = 0;
+	}
+
+	if (pdata->new_rx_ring_count) {
+		pdata->rx_ring_count = pdata->new_rx_ring_count;
+		pdata->new_rx_ring_count = 0;
+	}
+
 	/* Calculate the Rx buffer size before allocating rings */
 	pdata->rx_buf_size = xgbe_calc_rx_buf_size(netdev, netdev->mtu);
 
@@ -1482,6 +1493,20 @@ static void xgbe_stopdev(struct work_struct *work)
 	netdev_alert(pdata->netdev, "device stopped\n");
 }
 
+void xgbe_full_restart_dev(struct xgbe_prv_data *pdata)
+{
+	/* If not running, "restart" will happen on open */
+	if (!netif_running(pdata->netdev))
+		return;
+
+	xgbe_stop(pdata);
+
+	xgbe_free_memory(pdata);
+	xgbe_alloc_memory(pdata);
+
+	xgbe_start(pdata);
+}
+
 void xgbe_restart_dev(struct xgbe_prv_data *pdata)
 {
 	/* If not running, "restart" will happen on open */
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
index d12f982d73b4..a880f10e3e70 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
@@ -705,6 +705,138 @@ out:
 	return 0;
 }
 
+static void xgbe_get_channels(struct net_device *netdev,
+			      struct ethtool_channels *channels)
+{
+	struct xgbe_prv_data *pdata = netdev_priv(netdev);
+	unsigned int rx, tx, combined;
+
+	/* Calculate maximums allowed:
+	 *   - Take into account the number of available IRQs
+	 *   - Do not take into account the number of online CPUs so that
+	 *     the user can over-subscribe if desired
+	 *   - Tx is additionally limited by the number of hardware queues
+	 */
+	rx = min(pdata->hw_feat.rx_ch_cnt, pdata->rx_max_channel_count);
+	rx = min(rx, pdata->channel_irq_count);
+	tx = min(pdata->hw_feat.tx_ch_cnt, pdata->tx_max_channel_count);
+	tx = min(tx, pdata->channel_irq_count);
+	tx = min(tx, pdata->tx_max_q_count);
+
+	combined = min(rx, tx);
+
+	channels->max_combined = combined;
+	channels->max_rx = rx ? rx - 1 : 0;
+	channels->max_tx = tx ? tx - 1 : 0;
+
+	/* Get current settings based on device state */
+	rx = pdata->new_rx_ring_count ? : pdata->rx_ring_count;
+	tx = pdata->new_tx_ring_count ? : pdata->tx_ring_count;
+
+	combined = min(rx, tx);
+	rx -= combined;
+	tx -= combined;
+
+	channels->combined_count = combined;
+	channels->rx_count = rx;
+	channels->tx_count = tx;
+}
+
+static void xgbe_print_set_channels_input(struct net_device *netdev,
+					  struct ethtool_channels *channels)
+{
+	netdev_err(netdev, "channel inputs: combined=%u, rx-only=%u, tx-only=%u\n",
+		   channels->combined_count, channels->rx_count,
+		   channels->tx_count);
+}
+
+static int xgbe_set_channels(struct net_device *netdev,
+			     struct ethtool_channels *channels)
+{
+	struct xgbe_prv_data *pdata = netdev_priv(netdev);
+	unsigned int rx, rx_curr, tx, tx_curr, combined;
+
+	/* Calculate maximums allowed:
+	 *   - Take into account the number of available IRQs
+	 *   - Do not take into account the number of online CPUs so that
+	 *     the user can over-subscribe if desired
+	 *   - Tx is additionally limited by the number of hardware queues
+	 */
+	rx = min(pdata->hw_feat.rx_ch_cnt, pdata->rx_max_channel_count);
+	rx = min(rx, pdata->channel_irq_count);
+	tx = min(pdata->hw_feat.tx_ch_cnt, pdata->tx_max_channel_count);
+	tx = min(tx, pdata->tx_max_q_count);
+	tx = min(tx, pdata->channel_irq_count);
+
+	combined = min(rx, tx);
+
+	/* Should not be setting other count */
+	if (channels->other_count) {
+		netdev_err(netdev,
+			   "other channel count must be zero\n");
+		return -EINVAL;
+	}
+
+	/* Require at least one Combined (Rx and Tx) channel */
+	if (!channels->combined_count) {
+		netdev_err(netdev,
+			   "at least one combined Rx/Tx channel is required\n");
+		xgbe_print_set_channels_input(netdev, channels);
+		return -EINVAL;
+	}
+
+	/* Check combined channels */
+	if (channels->combined_count > combined) {
+		netdev_err(netdev,
+			   "combined channel count cannot exceed %u\n",
+			   combined);
+		xgbe_print_set_channels_input(netdev, channels);
+		return -EINVAL;
+	}
+
+	/* Can have some Rx-only or Tx-only channels, but not both */
+	if (channels->rx_count && channels->tx_count) {
+		netdev_err(netdev,
+			   "cannot specify both Rx-only and Tx-only channels\n");
+		xgbe_print_set_channels_input(netdev, channels);
+		return -EINVAL;
+	}
+
+	/* Check that we don't exceed the maximum number of channels */
+	if ((channels->combined_count + channels->rx_count) > rx) {
+		netdev_err(netdev,
+			   "total Rx channels (%u) requested exceeds maximum available (%u)\n",
+			   channels->combined_count + channels->rx_count, rx);
+		xgbe_print_set_channels_input(netdev, channels);
+		return -EINVAL;
+	}
+
+	if ((channels->combined_count + channels->tx_count) > tx) {
+		netdev_err(netdev,
+			   "total Tx channels (%u) requested exceeds maximum available (%u)\n",
+			   channels->combined_count + channels->tx_count, tx);
+		xgbe_print_set_channels_input(netdev, channels);
+		return -EINVAL;
+	}
+
+	rx = channels->combined_count + channels->rx_count;
+	tx = channels->combined_count + channels->tx_count;
+
+	rx_curr = pdata->new_rx_ring_count ? : pdata->rx_ring_count;
+	tx_curr = pdata->new_tx_ring_count ? : pdata->tx_ring_count;
+
+	if ((rx == rx_curr) && (tx == tx_curr))
+		goto out;
+
+	pdata->new_rx_ring_count = rx;
+	pdata->new_tx_ring_count = tx;
+
+	xgbe_full_restart_dev(pdata);
+
+out:
+	return 0;
+}
+
 static const struct ethtool_ops xgbe_ethtool_ops = {
 	.get_drvinfo = xgbe_get_drvinfo,
 	.get_msglevel = xgbe_get_msglevel,
@@ -729,6 +861,8 @@ static const struct ethtool_ops xgbe_ethtool_ops = {
 	.get_module_eeprom = xgbe_get_module_eeprom,
 	.get_ringparam = xgbe_get_ringparam,
 	.set_ringparam = xgbe_set_ringparam,
+	.get_channels = xgbe_get_channels,
+	.set_channels = xgbe_set_channels,
 };
 
 const struct ethtool_ops *xgbe_get_ethtool_ops(void)
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h
index 7dc0fac560a3..7a412cfa8e40 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe.h
@@ -1122,6 +1122,9 @@ struct xgbe_prv_data {
 	unsigned int rx_ring_count;
 	unsigned int rx_desc_count;
 
+	unsigned int new_tx_ring_count;
+	unsigned int new_rx_ring_count;
+
 	unsigned int tx_max_q_count;
 	unsigned int rx_max_q_count;
 	unsigned int tx_q_count;
@@ -1336,6 +1339,7 @@ int xgbe_powerdown(struct net_device *, unsigned int);
 void xgbe_init_rx_coalesce(struct xgbe_prv_data *);
 void xgbe_init_tx_coalesce(struct xgbe_prv_data *);
 void xgbe_restart_dev(struct xgbe_prv_data *pdata);
+void xgbe_full_restart_dev(struct xgbe_prv_data *pdata);
 
 #ifdef CONFIG_DEBUG_FS
 void xgbe_debugfs_init(struct xgbe_prv_data *);
-- 
cgit v1.2.3-59-g8ed1b


From eca282b8418b0b5a55b70e42f684e882d3fb9654 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Wed, 23 May 2018 11:39:21 -0500
Subject: amd-xgbe: Always attempt link training in KR mode

Link training is always attempted when in KR mode, but the code is
structured to check if link training has been enabled before attempting
to perform it.  Since that check will always be true, simplify the code
to always enable and start link training during KR auto-negotiation.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c | 69 +++++++------------------------
 1 file changed, 16 insertions(+), 53 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
index 9c39c72d25ab..450b89cd9eeb 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
@@ -216,31 +216,8 @@ static void xgbe_an_clear_interrupts_all(struct xgbe_prv_data *pdata)
 	xgbe_an37_clear_interrupts(pdata);
 }
 
-static void xgbe_an73_enable_kr_training(struct xgbe_prv_data *pdata)
-{
-	unsigned int reg;
-
-	reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
-
-	reg |= XGBE_KR_TRAINING_ENABLE;
-	XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg);
-}
-
-static void xgbe_an73_disable_kr_training(struct xgbe_prv_data *pdata)
-{
-	unsigned int reg;
-
-	reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
-
-	reg &= ~XGBE_KR_TRAINING_ENABLE;
-	XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg);
-}
-
 static void xgbe_kr_mode(struct xgbe_prv_data *pdata)
 {
-	/* Enable KR training */
-	xgbe_an73_enable_kr_training(pdata);
-
 	/* Set MAC to 10G speed */
 	pdata->hw_if.set_speed(pdata, SPEED_10000);
 
@@ -250,9 +227,6 @@ static void xgbe_kr_mode(struct xgbe_prv_data *pdata)
 
 static void xgbe_kx_2500_mode(struct xgbe_prv_data *pdata)
 {
-	/* Disable KR training */
-	xgbe_an73_disable_kr_training(pdata);
-
 	/* Set MAC to 2.5G speed */
 	pdata->hw_if.set_speed(pdata, SPEED_2500);
 
@@ -262,9 +236,6 @@ static void xgbe_kx_2500_mode(struct xgbe_prv_data *pdata)
 
 static void xgbe_kx_1000_mode(struct xgbe_prv_data *pdata)
 {
-	/* Disable KR training */
-	xgbe_an73_disable_kr_training(pdata);
-
 	/* Set MAC to 1G speed */
 	pdata->hw_if.set_speed(pdata, SPEED_1000);
 
@@ -278,9 +249,6 @@ static void xgbe_sfi_mode(struct xgbe_prv_data *pdata)
 	if (pdata->kr_redrv)
 		return xgbe_kr_mode(pdata);
 
-	/* Disable KR training */
-	xgbe_an73_disable_kr_training(pdata);
-
 	/* Set MAC to 10G speed */
 	pdata->hw_if.set_speed(pdata, SPEED_10000);
 
@@ -290,9 +258,6 @@ static void xgbe_sfi_mode(struct xgbe_prv_data *pdata)
 
 static void xgbe_x_mode(struct xgbe_prv_data *pdata)
 {
-	/* Disable KR training */
-	xgbe_an73_disable_kr_training(pdata);
-
 	/* Set MAC to 1G speed */
 	pdata->hw_if.set_speed(pdata, SPEED_1000);
 
@@ -302,9 +267,6 @@ static void xgbe_x_mode(struct xgbe_prv_data *pdata)
 
 static void xgbe_sgmii_1000_mode(struct xgbe_prv_data *pdata)
 {
-	/* Disable KR training */
-	xgbe_an73_disable_kr_training(pdata);
-
 	/* Set MAC to 1G speed */
 	pdata->hw_if.set_speed(pdata, SPEED_1000);
 
@@ -314,9 +276,6 @@ static void xgbe_sgmii_1000_mode(struct xgbe_prv_data *pdata)
 
 static void xgbe_sgmii_100_mode(struct xgbe_prv_data *pdata)
 {
-	/* Disable KR training */
-	xgbe_an73_disable_kr_training(pdata);
-
 	/* Set MAC to 1G speed */
 	pdata->hw_if.set_speed(pdata, SPEED_1000);
 
@@ -425,6 +384,12 @@ static void xgbe_an73_set(struct xgbe_prv_data *pdata, bool enable,
 {
 	unsigned int reg;
 
+	/* Disable KR training for now */
+	reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
+	reg &= ~XGBE_KR_TRAINING_ENABLE;
+	XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg);
+
+	/* Update AN settings */
 	reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_CTRL1);
 	reg &= ~MDIO_AN_CTRL1_ENABLE;
 
@@ -522,21 +487,19 @@ static enum xgbe_an xgbe_an73_tx_training(struct xgbe_prv_data *pdata,
 	XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_FECCTRL, reg);
 
 	/* Start KR training */
-	reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
-	if (reg & XGBE_KR_TRAINING_ENABLE) {
-		if (pdata->phy_if.phy_impl.kr_training_pre)
-			pdata->phy_if.phy_impl.kr_training_pre(pdata);
+	if (pdata->phy_if.phy_impl.kr_training_pre)
+		pdata->phy_if.phy_impl.kr_training_pre(pdata);
 
-		reg |= XGBE_KR_TRAINING_START;
-		XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL,
-			    reg);
+	reg = XMDIO_READ(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL);
+	reg |= XGBE_KR_TRAINING_ENABLE;
+	reg |= XGBE_KR_TRAINING_START;
+	XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL, reg);
 
-		netif_dbg(pdata, link, pdata->netdev,
-			  "KR training initiated\n");
+	netif_dbg(pdata, link, pdata->netdev,
+		  "KR training initiated\n");
 
-		if (pdata->phy_if.phy_impl.kr_training_post)
-			pdata->phy_if.phy_impl.kr_training_post(pdata);
-	}
+	if (pdata->phy_if.phy_impl.kr_training_post)
+		pdata->phy_if.phy_impl.kr_training_post(pdata);
 
 	return XGBE_AN_PAGE_RECEIVED;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 418746298e586c88edf3a7c340c8d19a0c13df77 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Wed, 23 May 2018 11:39:31 -0500
Subject: amd-xgbe: Advertise FEC support with the KR re-driver

When a KR re-driver is present, indicate the FEC support is available
during auto-negotiation.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index 141bb13dda26..dd747f6c519b 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -1720,6 +1720,10 @@ static void xgbe_phy_an_advertising(struct xgbe_prv_data *pdata,
 	XGBE_CLR_ADV(dlks, 1000baseKX_Full);
 	XGBE_CLR_ADV(dlks, 10000baseKR_Full);
 
+	/* Advertise FEC support is present */
+	if (pdata->fec_ability & MDIO_PMA_10GBR_FECABLE_ABLE)
+		XGBE_SET_ADV(dlks, 10000baseR_FEC);
+
 	switch (phy_data->port_mode) {
 	case XGBE_PORT_MODE_BACKPLANE:
 		XGBE_SET_ADV(dlks, 10000baseKR_Full);
-- 
cgit v1.2.3-59-g8ed1b


From e722ec82374b7a0c0cfa954e4f780221139c5f93 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Wed, 23 May 2018 11:39:39 -0500
Subject: amd-xgbe: Update the BelFuse quirk to support SGMII

Instead of using a quirk to make the BelFuse 1GBT-SFP06 part look like
a 1000baseX part, program the SFP PHY to support SGMII and 10/100/1000
baseT.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c | 109 +++++++++++++++++++---------
 1 file changed, 75 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index dd747f6c519b..194a5696a82c 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -860,6 +860,9 @@ static bool xgbe_phy_finisar_phy_quirks(struct xgbe_prv_data *pdata)
 	struct xgbe_phy_data *phy_data = pdata->phy_data;
 	unsigned int phy_id = phy_data->phydev->phy_id;
 
+	if (phy_data->port_mode != XGBE_PORT_MODE_SFP)
+		return false;
+
 	if ((phy_id & 0xfffffff0) != 0x01ff0cc0)
 		return false;
 
@@ -885,8 +888,80 @@ static bool xgbe_phy_finisar_phy_quirks(struct xgbe_prv_data *pdata)
 	return true;
 }
 
+static bool xgbe_phy_belfuse_phy_quirks(struct xgbe_prv_data *pdata)
+{
+	struct xgbe_phy_data *phy_data = pdata->phy_data;
+	struct xgbe_sfp_eeprom *sfp_eeprom = &phy_data->sfp_eeprom;
+	unsigned int phy_id = phy_data->phydev->phy_id;
+	int reg;
+
+	if (phy_data->port_mode != XGBE_PORT_MODE_SFP)
+		return false;
+
+	if (memcmp(&sfp_eeprom->base[XGBE_SFP_BASE_VENDOR_NAME],
+		   XGBE_BEL_FUSE_VENDOR, XGBE_SFP_BASE_VENDOR_NAME_LEN))
+		return false;
+
+	if (memcmp(&sfp_eeprom->base[XGBE_SFP_BASE_VENDOR_PN],
+		   XGBE_BEL_FUSE_PARTNO, XGBE_SFP_BASE_VENDOR_PN_LEN))
+		return false;
+
+	if ((phy_id & 0xfffffff0) != 0x03625d10)
+		return false;
+
+	/* Disable RGMII mode */
+	phy_write(phy_data->phydev, 0x18, 0x7007);
+	reg = phy_read(phy_data->phydev, 0x18);
+	phy_write(phy_data->phydev, 0x18, reg & ~0x0080);
+
+	/* Enable fiber register bank */
+	phy_write(phy_data->phydev, 0x1c, 0x7c00);
+	reg = phy_read(phy_data->phydev, 0x1c);
+	reg &= 0x03ff;
+	reg &= ~0x0001;
+	phy_write(phy_data->phydev, 0x1c, 0x8000 | 0x7c00 | reg | 0x0001);
+
+	/* Power down SerDes */
+	reg = phy_read(phy_data->phydev, 0x00);
+	phy_write(phy_data->phydev, 0x00, reg | 0x00800);
+
+	/* Configure SGMII-to-Copper mode */
+	phy_write(phy_data->phydev, 0x1c, 0x7c00);
+	reg = phy_read(phy_data->phydev, 0x1c);
+	reg &= 0x03ff;
+	reg &= ~0x0006;
+	phy_write(phy_data->phydev, 0x1c, 0x8000 | 0x7c00 | reg | 0x0004);
+
+	/* Power up SerDes */
+	reg = phy_read(phy_data->phydev, 0x00);
+	phy_write(phy_data->phydev, 0x00, reg & ~0x00800);
+
+	/* Enable copper register bank */
+	phy_write(phy_data->phydev, 0x1c, 0x7c00);
+	reg = phy_read(phy_data->phydev, 0x1c);
+	reg &= 0x03ff;
+	reg &= ~0x0001;
+	phy_write(phy_data->phydev, 0x1c, 0x8000 | 0x7c00 | reg);
+
+	/* Power up SerDes */
+	reg = phy_read(phy_data->phydev, 0x00);
+	phy_write(phy_data->phydev, 0x00, reg & ~0x00800);
+
+	phy_data->phydev->supported = PHY_GBIT_FEATURES;
+	phy_data->phydev->supported |= SUPPORTED_Pause | SUPPORTED_Asym_Pause;
+	phy_data->phydev->advertising = phy_data->phydev->supported;
+
+	netif_dbg(pdata, drv, pdata->netdev,
+		  "BelFuse PHY quirk in place\n");
+
+	return true;
+}
+
 static void xgbe_phy_external_phy_quirks(struct xgbe_prv_data *pdata)
 {
+	if (xgbe_phy_belfuse_phy_quirks(pdata))
+		return;
+
 	if (xgbe_phy_finisar_phy_quirks(pdata))
 		return;
 }
@@ -1027,37 +1102,6 @@ static bool xgbe_phy_check_sfp_mod_absent(struct xgbe_phy_data *phy_data)
 	return false;
 }
 
-static bool xgbe_phy_belfuse_parse_quirks(struct xgbe_prv_data *pdata)
-{
-	struct xgbe_phy_data *phy_data = pdata->phy_data;
-	struct xgbe_sfp_eeprom *sfp_eeprom = &phy_data->sfp_eeprom;
-
-	if (memcmp(&sfp_eeprom->base[XGBE_SFP_BASE_VENDOR_NAME],
-		   XGBE_BEL_FUSE_VENDOR, XGBE_SFP_BASE_VENDOR_NAME_LEN))
-		return false;
-
-	if (!memcmp(&sfp_eeprom->base[XGBE_SFP_BASE_VENDOR_PN],
-		    XGBE_BEL_FUSE_PARTNO, XGBE_SFP_BASE_VENDOR_PN_LEN)) {
-		phy_data->sfp_base = XGBE_SFP_BASE_1000_SX;
-		phy_data->sfp_cable = XGBE_SFP_CABLE_ACTIVE;
-		phy_data->sfp_speed = XGBE_SFP_SPEED_1000;
-		if (phy_data->sfp_changed)
-			netif_dbg(pdata, drv, pdata->netdev,
-				  "Bel-Fuse SFP quirk in place\n");
-		return true;
-	}
-
-	return false;
-}
-
-static bool xgbe_phy_sfp_parse_quirks(struct xgbe_prv_data *pdata)
-{
-	if (xgbe_phy_belfuse_parse_quirks(pdata))
-		return true;
-
-	return false;
-}
-
 static void xgbe_phy_sfp_parse_eeprom(struct xgbe_prv_data *pdata)
 {
 	struct xgbe_phy_data *phy_data = pdata->phy_data;
@@ -1076,9 +1120,6 @@ static void xgbe_phy_sfp_parse_eeprom(struct xgbe_prv_data *pdata)
 	phy_data->sfp_tx_fault = xgbe_phy_check_sfp_tx_fault(phy_data);
 	phy_data->sfp_rx_los = xgbe_phy_check_sfp_rx_los(phy_data);
 
-	if (xgbe_phy_sfp_parse_quirks(pdata))
-		return;
-
 	/* Assume ACTIVE cable unless told it is PASSIVE */
 	if (sfp_base[XGBE_SFP_BASE_CABLE] & XGBE_SFP_BASE_CABLE_PASSIVE) {
 		phy_data->sfp_cable = XGBE_SFP_CABLE_PASSIVE;
-- 
cgit v1.2.3-59-g8ed1b


From 76cce0af85a0d5d8abef8b60eece5798ea7eea5a Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Wed, 23 May 2018 11:39:47 -0500
Subject: amd-xgbe: Improve SFP 100Mbps auto-negotiation

After changing speed to 100Mbps as a result of auto-negotiation (AN),
some 10/100/1000Mbps SFPs indicate a successful link (no faults or loss
of signal), but cannot successfully transmit or receive data.  These
SFPs required an extra auto-negotiation (AN) after the speed change in
order to operate properly.  Add a quirk for these SFPs so that if the
outcome of the AN actually results in changing to a new speed, re-initiate
AN at that new speed.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c   | 77 ++++++++++++++++-------------
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |  6 +++
 drivers/net/ethernet/amd/xgbe/xgbe.h        |  1 +
 3 files changed, 50 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
index 450b89cd9eeb..4b5d625de8f0 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
@@ -331,13 +331,15 @@ static void xgbe_switch_mode(struct xgbe_prv_data *pdata)
 	xgbe_change_mode(pdata, pdata->phy_if.phy_impl.switch_mode(pdata));
 }
 
-static void xgbe_set_mode(struct xgbe_prv_data *pdata,
+static bool xgbe_set_mode(struct xgbe_prv_data *pdata,
 			  enum xgbe_mode mode)
 {
 	if (mode == xgbe_cur_mode(pdata))
-		return;
+		return false;
 
 	xgbe_change_mode(pdata, mode);
+
+	return true;
 }
 
 static bool xgbe_use_mode(struct xgbe_prv_data *pdata,
@@ -1178,21 +1180,23 @@ static int xgbe_phy_config_fixed(struct xgbe_prv_data *pdata)
 	return 0;
 }
 
-static int __xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
+static int __xgbe_phy_config_aneg(struct xgbe_prv_data *pdata, bool set_mode)
 {
 	int ret;
 
+	mutex_lock(&pdata->an_mutex);
+
 	set_bit(XGBE_LINK_INIT, &pdata->dev_state);
 	pdata->link_check = jiffies;
 
 	ret = pdata->phy_if.phy_impl.an_config(pdata);
 	if (ret)
-		return ret;
+		goto out;
 
 	if (pdata->phy.autoneg != AUTONEG_ENABLE) {
 		ret = xgbe_phy_config_fixed(pdata);
 		if (ret || !pdata->kr_redrv)
-			return ret;
+			goto out;
 
 		netif_dbg(pdata, link, pdata->netdev, "AN redriver support\n");
 	} else {
@@ -1202,24 +1206,27 @@ static int __xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
 	/* Disable auto-negotiation interrupt */
 	disable_irq(pdata->an_irq);
 
-	/* Start auto-negotiation in a supported mode */
-	if (xgbe_use_mode(pdata, XGBE_MODE_KR)) {
-		xgbe_set_mode(pdata, XGBE_MODE_KR);
-	} else if (xgbe_use_mode(pdata, XGBE_MODE_KX_2500)) {
-		xgbe_set_mode(pdata, XGBE_MODE_KX_2500);
-	} else if (xgbe_use_mode(pdata, XGBE_MODE_KX_1000)) {
-		xgbe_set_mode(pdata, XGBE_MODE_KX_1000);
-	} else if (xgbe_use_mode(pdata, XGBE_MODE_SFI)) {
-		xgbe_set_mode(pdata, XGBE_MODE_SFI);
-	} else if (xgbe_use_mode(pdata, XGBE_MODE_X)) {
-		xgbe_set_mode(pdata, XGBE_MODE_X);
-	} else if (xgbe_use_mode(pdata, XGBE_MODE_SGMII_1000)) {
-		xgbe_set_mode(pdata, XGBE_MODE_SGMII_1000);
-	} else if (xgbe_use_mode(pdata, XGBE_MODE_SGMII_100)) {
-		xgbe_set_mode(pdata, XGBE_MODE_SGMII_100);
-	} else {
-		enable_irq(pdata->an_irq);
-		return -EINVAL;
+	if (set_mode) {
+		/* Start auto-negotiation in a supported mode */
+		if (xgbe_use_mode(pdata, XGBE_MODE_KR)) {
+			xgbe_set_mode(pdata, XGBE_MODE_KR);
+		} else if (xgbe_use_mode(pdata, XGBE_MODE_KX_2500)) {
+			xgbe_set_mode(pdata, XGBE_MODE_KX_2500);
+		} else if (xgbe_use_mode(pdata, XGBE_MODE_KX_1000)) {
+			xgbe_set_mode(pdata, XGBE_MODE_KX_1000);
+		} else if (xgbe_use_mode(pdata, XGBE_MODE_SFI)) {
+			xgbe_set_mode(pdata, XGBE_MODE_SFI);
+		} else if (xgbe_use_mode(pdata, XGBE_MODE_X)) {
+			xgbe_set_mode(pdata, XGBE_MODE_X);
+		} else if (xgbe_use_mode(pdata, XGBE_MODE_SGMII_1000)) {
+			xgbe_set_mode(pdata, XGBE_MODE_SGMII_1000);
+		} else if (xgbe_use_mode(pdata, XGBE_MODE_SGMII_100)) {
+			xgbe_set_mode(pdata, XGBE_MODE_SGMII_100);
+		} else {
+			enable_irq(pdata->an_irq);
+			ret = -EINVAL;
+			goto out;
+		}
 	}
 
 	/* Disable and stop any in progress auto-negotiation */
@@ -1239,16 +1246,7 @@ static int __xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
 	xgbe_an_init(pdata);
 	xgbe_an_restart(pdata);
 
-	return 0;
-}
-
-static int xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
-{
-	int ret;
-
-	mutex_lock(&pdata->an_mutex);
-
-	ret = __xgbe_phy_config_aneg(pdata);
+out:
 	if (ret)
 		set_bit(XGBE_LINK_ERR, &pdata->dev_state);
 	else
@@ -1259,6 +1257,16 @@ static int xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
 	return ret;
 }
 
+static int xgbe_phy_config_aneg(struct xgbe_prv_data *pdata)
+{
+	return __xgbe_phy_config_aneg(pdata, true);
+}
+
+static int xgbe_phy_reconfig_aneg(struct xgbe_prv_data *pdata)
+{
+	return __xgbe_phy_config_aneg(pdata, false);
+}
+
 static bool xgbe_phy_aneg_done(struct xgbe_prv_data *pdata)
 {
 	return (pdata->an_result == XGBE_AN_COMPLETE);
@@ -1315,7 +1323,8 @@ static void xgbe_phy_status_result(struct xgbe_prv_data *pdata)
 
 	pdata->phy.duplex = DUPLEX_FULL;
 
-	xgbe_set_mode(pdata, mode);
+	if (xgbe_set_mode(pdata, mode) && pdata->an_again)
+		xgbe_phy_reconfig_aneg(pdata);
 }
 
 static void xgbe_phy_status(struct xgbe_prv_data *pdata)
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index 194a5696a82c..3ceb4f95ca7c 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -902,6 +902,9 @@ static bool xgbe_phy_belfuse_phy_quirks(struct xgbe_prv_data *pdata)
 		   XGBE_BEL_FUSE_VENDOR, XGBE_SFP_BASE_VENDOR_NAME_LEN))
 		return false;
 
+	/* For Bel-Fuse, use the extra AN flag */
+	pdata->an_again = 1;
+
 	if (memcmp(&sfp_eeprom->base[XGBE_SFP_BASE_VENDOR_PN],
 		   XGBE_BEL_FUSE_PARTNO, XGBE_SFP_BASE_VENDOR_PN_LEN))
 		return false;
@@ -978,6 +981,9 @@ static int xgbe_phy_find_phy_device(struct xgbe_prv_data *pdata)
 	if (phy_data->phydev)
 		return 0;
 
+	/* Clear the extra AN flag */
+	pdata->an_again = 0;
+
 	/* Check for the use of an external PHY */
 	if (phy_data->phydev_mode == XGBE_MDIO_MODE_NONE)
 		return 0;
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h
index 7a412cfa8e40..47bcbcf58048 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe.h
@@ -1261,6 +1261,7 @@ struct xgbe_prv_data {
 	enum xgbe_rx kr_state;
 	enum xgbe_rx kx_state;
 	struct work_struct an_work;
+	unsigned int an_again;
 	unsigned int an_supported;
 	unsigned int parallel_detect;
 	unsigned int fec_ability;
-- 
cgit v1.2.3-59-g8ed1b


From a2889a4c2d3aefdf6f2a636fa1531243653a7633 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 23 May 2018 11:32:36 -0700
Subject: bpf: btf: Avoid variable length array

Sparse warning:
kernel/bpf/btf.c:1985:34: warning: Variable length array is used.

This patch directly uses ARRAY_SIZE().

Fixes: f80442a4cd18 ("bpf: btf: Change how section is supported in btf_header")
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/btf.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 9cbeabb5aca3..7e90fd13b5b5 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1981,8 +1981,7 @@ static int btf_sec_info_cmp(const void *a, const void *b)
 static int btf_check_sec_info(struct btf_verifier_env *env,
 			      u32 btf_data_size)
 {
-	const unsigned int nr_secs = ARRAY_SIZE(btf_sec_info_offset);
-	struct btf_sec_info secs[nr_secs];
+	struct btf_sec_info secs[ARRAY_SIZE(btf_sec_info_offset)];
 	u32 total, expected_total, i;
 	const struct btf_header *hdr;
 	const struct btf *btf;
@@ -1991,17 +1990,17 @@ static int btf_check_sec_info(struct btf_verifier_env *env,
 	hdr = &btf->hdr;
 
 	/* Populate the secs from hdr */
-	for (i = 0; i < nr_secs; i++)
+	for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++)
 		secs[i] = *(struct btf_sec_info *)((void *)hdr +
 						   btf_sec_info_offset[i]);
 
-	sort(secs, nr_secs, sizeof(struct btf_sec_info),
-	     btf_sec_info_cmp, NULL);
+	sort(secs, ARRAY_SIZE(btf_sec_info_offset),
+	     sizeof(struct btf_sec_info), btf_sec_info_cmp, NULL);
 
 	/* Check for gaps and overlap among sections */
 	total = 0;
 	expected_total = btf_data_size - hdr->hdr_len;
-	for (i = 0; i < nr_secs; i++) {
+	for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) {
 		if (expected_total < secs[i].off) {
 			btf_verifier_log(env, "Invalid section offset");
 			return -EINVAL;
-- 
cgit v1.2.3-59-g8ed1b


From 2162fed49fa86cb3b5c296565837e674c53cc7ae Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan@linux.vnet.ibm.com>
Date: Thu, 24 May 2018 12:26:45 +0530
Subject: bpf: support 64-bit offsets for bpf function calls

The imm field of a bpf instruction is a signed 32-bit integer.
For JITed bpf-to-bpf function calls, it holds the offset of the
start address of the callee's JITed image from __bpf_call_base.

For some architectures, such as powerpc64, this offset may be
as large as 64 bits and cannot be accomodated in the imm field
without truncation.

We resolve this by:

[1] Additionally using the auxiliary data of each function to
    keep a list of start addresses of the JITed images for all
    functions determined by the verifier.

[2] Retaining the subprog id inside the off field of the call
    instructions and using it to index into the list mentioned
    above and lookup the callee's address.

To make sure that the existing JIT compilers continue to work
without requiring changes, we keep the imm field as it is.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/verifier.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a9e4b1372da6..559cb74ba29e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5383,11 +5383,24 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 			    insn->src_reg != BPF_PSEUDO_CALL)
 				continue;
 			subprog = insn->off;
-			insn->off = 0;
 			insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
 				func[subprog]->bpf_func -
 				__bpf_call_base;
 		}
+
+		/* we use the aux data to keep a list of the start addresses
+		 * of the JITed images for each function in the program
+		 *
+		 * for some architectures, such as powerpc64, the imm field
+		 * might not be large enough to hold the offset of the start
+		 * address of the callee's JITed image from __bpf_call_base
+		 *
+		 * in such cases, we can lookup the start address of a callee
+		 * by using its subprog id, available from the off field of
+		 * the call instruction, as an index for this list
+		 */
+		func[i]->aux->func = func;
+		func[i]->aux->func_cnt = env->subprog_cnt;
 	}
 	for (i = 0; i < env->subprog_cnt; i++) {
 		old_bpf_func = func[i]->bpf_func;
-- 
cgit v1.2.3-59-g8ed1b


From 4ea69b2fd623dee2bbc77d3b6b7d8c0924e2026a Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan@linux.vnet.ibm.com>
Date: Thu, 24 May 2018 12:26:46 +0530
Subject: bpf: powerpc64: pad function address loads with NOPs

For multi-function programs, loading the address of a callee
function to a register requires emitting instructions whose
count varies from one to five depending on the nature of the
address.

Since we come to know of the callee's address only before the
extra pass, the number of instructions required to load this
address may vary from what was previously generated. This can
make the JITed image grow or shrink.

To avoid this, we should generate a constant five-instruction
when loading function addresses by padding the optimized load
sequence with NOPs.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 arch/powerpc/net/bpf_jit_comp64.c | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 1bdb1aff0619..e4582744a31d 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -167,25 +167,37 @@ static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx)
 
 static void bpf_jit_emit_func_call(u32 *image, struct codegen_context *ctx, u64 func)
 {
+	unsigned int i, ctx_idx = ctx->idx;
+
+	/* Load function address into r12 */
+	PPC_LI64(12, func);
+
+	/* For bpf-to-bpf function calls, the callee's address is unknown
+	 * until the last extra pass. As seen above, we use PPC_LI64() to
+	 * load the callee's address, but this may optimize the number of
+	 * instructions required based on the nature of the address.
+	 *
+	 * Since we don't want the number of instructions emitted to change,
+	 * we pad the optimized PPC_LI64() call with NOPs to guarantee that
+	 * we always have a five-instruction sequence, which is the maximum
+	 * that PPC_LI64() can emit.
+	 */
+	for (i = ctx->idx - ctx_idx; i < 5; i++)
+		PPC_NOP();
+
 #ifdef PPC64_ELF_ABI_v1
-	/* func points to the function descriptor */
-	PPC_LI64(b2p[TMP_REG_2], func);
-	/* Load actual entry point from function descriptor */
-	PPC_BPF_LL(b2p[TMP_REG_1], b2p[TMP_REG_2], 0);
-	/* ... and move it to LR */
-	PPC_MTLR(b2p[TMP_REG_1]);
 	/*
 	 * Load TOC from function descriptor at offset 8.
 	 * We can clobber r2 since we get called through a
 	 * function pointer (so caller will save/restore r2)
 	 * and since we don't use a TOC ourself.
 	 */
-	PPC_BPF_LL(2, b2p[TMP_REG_2], 8);
-#else
-	/* We can clobber r12 */
-	PPC_FUNC_ADDR(12, func);
-	PPC_MTLR(12);
+	PPC_BPF_LL(2, 12, 8);
+	/* Load actual entry point from function descriptor */
+	PPC_BPF_LL(12, 12, 0);
 #endif
+
+	PPC_MTLR(12);
 	PPC_BLRL();
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 8484ce8306f941f921cfceaf357a28d3a014f178 Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan@linux.vnet.ibm.com>
Date: Thu, 24 May 2018 12:26:47 +0530
Subject: bpf: powerpc64: add JIT support for multi-function programs

This adds support for bpf-to-bpf function calls in the powerpc64
JIT compiler. The JIT compiler converts the bpf call instructions
to native branch instructions. After a round of the usual passes,
the start addresses of the JITed images for the callee functions
are known. Finally, to fixup the branch target addresses, we need
to perform an extra pass.

Because of the address range in which JITed images are allocated
on powerpc64, the offsets of the start addresses of these images
from __bpf_call_base are as large as 64 bits. So, for a function
call, we cannot use the imm field of the instruction to determine
the callee's address. Instead, we use the alternative method of
getting it from the list of function addresses in the auxiliary
data of the caller by using the off field as an index.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 arch/powerpc/net/bpf_jit_comp64.c | 76 +++++++++++++++++++++++++++++++++------
 1 file changed, 66 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index e4582744a31d..f1c95779843b 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -268,7 +268,7 @@ static void bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32
 /* Assemble the body code between the prologue & epilogue */
 static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image,
 			      struct codegen_context *ctx,
-			      u32 *addrs)
+			      u32 *addrs, bool extra_pass)
 {
 	const struct bpf_insn *insn = fp->insnsi;
 	int flen = fp->len;
@@ -724,11 +724,25 @@ emit_clear:
 			break;
 
 		/*
-		 * Call kernel helper
+		 * Call kernel helper or bpf function
 		 */
 		case BPF_JMP | BPF_CALL:
 			ctx->seen |= SEEN_FUNC;
-			func = (u8 *) __bpf_call_base + imm;
+
+			/* bpf function call */
+			if (insn[i].src_reg == BPF_PSEUDO_CALL)
+				if (!extra_pass)
+					func = NULL;
+				else if (fp->aux->func && off < fp->aux->func_cnt)
+					/* use the subprog id from the off
+					 * field to lookup the callee address
+					 */
+					func = (u8 *) fp->aux->func[off]->bpf_func;
+				else
+					return -EINVAL;
+			/* kernel helper call */
+			else
+				func = (u8 *) __bpf_call_base + imm;
 
 			bpf_jit_emit_func_call(image, ctx, (u64)func);
 
@@ -876,6 +890,14 @@ cond_branch:
 	return 0;
 }
 
+struct powerpc64_jit_data {
+	struct bpf_binary_header *header;
+	u32 *addrs;
+	u8 *image;
+	u32 proglen;
+	struct codegen_context ctx;
+};
+
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 {
 	u32 proglen;
@@ -883,6 +905,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 	u8 *image = NULL;
 	u32 *code_base;
 	u32 *addrs;
+	struct powerpc64_jit_data *jit_data;
 	struct codegen_context cgctx;
 	int pass;
 	int flen;
@@ -890,6 +913,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 	struct bpf_prog *org_fp = fp;
 	struct bpf_prog *tmp_fp;
 	bool bpf_blinded = false;
+	bool extra_pass = false;
 
 	if (!fp->jit_requested)
 		return org_fp;
@@ -903,11 +927,32 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 		fp = tmp_fp;
 	}
 
+	jit_data = fp->aux->jit_data;
+	if (!jit_data) {
+		jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL);
+		if (!jit_data) {
+			fp = org_fp;
+			goto out;
+		}
+		fp->aux->jit_data = jit_data;
+	}
+
 	flen = fp->len;
+	addrs = jit_data->addrs;
+	if (addrs) {
+		cgctx = jit_data->ctx;
+		image = jit_data->image;
+		bpf_hdr = jit_data->header;
+		proglen = jit_data->proglen;
+		alloclen = proglen + FUNCTION_DESCR_SIZE;
+		extra_pass = true;
+		goto skip_init_ctx;
+	}
+
 	addrs = kzalloc((flen+1) * sizeof(*addrs), GFP_KERNEL);
 	if (addrs == NULL) {
 		fp = org_fp;
-		goto out;
+		goto out_addrs;
 	}
 
 	memset(&cgctx, 0, sizeof(struct codegen_context));
@@ -916,10 +961,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 	cgctx.stack_size = round_up(fp->aux->stack_depth, 16);
 
 	/* Scouting faux-generate pass 0 */
-	if (bpf_jit_build_body(fp, 0, &cgctx, addrs)) {
+	if (bpf_jit_build_body(fp, 0, &cgctx, addrs, false)) {
 		/* We hit something illegal or unsupported. */
 		fp = org_fp;
-		goto out;
+		goto out_addrs;
 	}
 
 	/*
@@ -937,9 +982,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 			bpf_jit_fill_ill_insns);
 	if (!bpf_hdr) {
 		fp = org_fp;
-		goto out;
+		goto out_addrs;
 	}
 
+skip_init_ctx:
 	code_base = (u32 *)(image + FUNCTION_DESCR_SIZE);
 
 	/* Code generation passes 1-2 */
@@ -947,7 +993,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 		/* Now build the prologue, body code & epilogue for real. */
 		cgctx.idx = 0;
 		bpf_jit_build_prologue(code_base, &cgctx);
-		bpf_jit_build_body(fp, code_base, &cgctx, addrs);
+		bpf_jit_build_body(fp, code_base, &cgctx, addrs, extra_pass);
 		bpf_jit_build_epilogue(code_base, &cgctx);
 
 		if (bpf_jit_enable > 1)
@@ -973,10 +1019,20 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 	fp->jited_len = alloclen;
 
 	bpf_flush_icache(bpf_hdr, (u8 *)bpf_hdr + (bpf_hdr->pages * PAGE_SIZE));
+	if (!fp->is_func || extra_pass) {
+out_addrs:
+		kfree(addrs);
+		kfree(jit_data);
+		fp->aux->jit_data = NULL;
+	} else {
+		jit_data->addrs = addrs;
+		jit_data->ctx = cgctx;
+		jit_data->proglen = proglen;
+		jit_data->image = image;
+		jit_data->header = bpf_hdr;
+	}
 
 out:
-	kfree(addrs);
-
 	if (bpf_blinded)
 		bpf_jit_prog_release_other(fp, fp == org_fp ? tmp_fp : org_fp);
 
-- 
cgit v1.2.3-59-g8ed1b


From dbecd7388476aedeb66389febea84d5450d28773 Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan@linux.vnet.ibm.com>
Date: Thu, 24 May 2018 12:26:48 +0530
Subject: bpf: get kernel symbol addresses via syscall

This adds new two new fields to struct bpf_prog_info. For
multi-function programs, these fields can be used to pass
a list of kernel symbol addresses for all functions in a
given program to userspace using the bpf system call with
the BPF_OBJ_GET_INFO_BY_FD command.

When bpf_jit_kallsyms is enabled, we can get the address
of the corresponding kernel symbol for a callee function
and resolve the symbol's name. The address is determined
by adding the value of the call instruction's imm field
to __bpf_call_base. This offset gets assigned to the imm
field by the verifier.

For some architectures, such as powerpc64, the imm field
is not large enough to hold this offset.

We resolve this by:

[1] Assigning the subprog id to the imm field of a call
    instruction in the verifier instead of the offset of
    the callee's symbol's address from __bpf_call_base.

[2] Determining the address of a callee's corresponding
    symbol by using the imm field as an index for the
    list of kernel symbol addresses now available from
    the program info.

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h |  2 ++
 kernel/bpf/syscall.c     | 25 +++++++++++++++++++++++++
 kernel/bpf/verifier.c    |  7 +------
 3 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c3e502d06bc3..0be90965867d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2205,6 +2205,8 @@ struct bpf_prog_info {
 	__u32 gpl_compatible:1;
 	__u64 netns_dev;
 	__u64 netns_ino;
+	__u32 nr_jited_ksyms;
+	__aligned_u64 jited_ksyms;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0b4c94551001..068a4fc79ddb 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1933,6 +1933,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	if (!capable(CAP_SYS_ADMIN)) {
 		info.jited_prog_len = 0;
 		info.xlated_prog_len = 0;
+		info.nr_jited_ksyms = 0;
 		goto done;
 	}
 
@@ -1981,6 +1982,30 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		}
 	}
 
+	ulen = info.nr_jited_ksyms;
+	info.nr_jited_ksyms = prog->aux->func_cnt;
+	if (info.nr_jited_ksyms && ulen) {
+		if (bpf_dump_raw_ok()) {
+			u64 __user *user_ksyms;
+			ulong ksym_addr;
+			u32 i;
+
+			/* copy the address of the kernel symbol
+			 * corresponding to each function
+			 */
+			ulen = min_t(u32, info.nr_jited_ksyms, ulen);
+			user_ksyms = u64_to_user_ptr(info.jited_ksyms);
+			for (i = 0; i < ulen; i++) {
+				ksym_addr = (ulong) prog->aux->func[i]->bpf_func;
+				ksym_addr &= PAGE_MASK;
+				if (put_user((u64) ksym_addr, &user_ksyms[i]))
+					return -EFAULT;
+			}
+		} else {
+			info.jited_ksyms = 0;
+		}
+	}
+
 done:
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 559cb74ba29e..8c4d9d0fd3ab 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5426,17 +5426,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	 * later look the same as if they were interpreted only.
 	 */
 	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
-		unsigned long addr;
-
 		if (insn->code != (BPF_JMP | BPF_CALL) ||
 		    insn->src_reg != BPF_PSEUDO_CALL)
 			continue;
 		insn->off = env->insn_aux_data[i].call_imm;
 		subprog = find_subprog(env, i + insn->off + 1);
-		addr  = (unsigned long)func[subprog]->bpf_func;
-		addr &= PAGE_MASK;
-		insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
-			    addr - __bpf_call_base;
+		insn->imm = subprog;
 	}
 
 	prog->jited = 1;
-- 
cgit v1.2.3-59-g8ed1b


From dd0c5f072e650458feb307fe3a8602cc2ec369d4 Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan@linux.vnet.ibm.com>
Date: Thu, 24 May 2018 12:26:49 +0530
Subject: tools: bpf: sync bpf uapi header

Syncing the bpf.h uapi header with tools so that struct
bpf_prog_info has the two new fields for passing on the
addresses of the kernel symbols corresponding to each
function in a program.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/include/uapi/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index c3e502d06bc3..0be90965867d 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2205,6 +2205,8 @@ struct bpf_prog_info {
 	__u32 gpl_compatible:1;
 	__u64 netns_dev;
 	__u64 netns_ino;
+	__u32 nr_jited_ksyms;
+	__aligned_u64 jited_ksyms;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
-- 
cgit v1.2.3-59-g8ed1b


From f84192ee00b7d8b3c38545d3a61d4191f80cc81a Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan@linux.vnet.ibm.com>
Date: Thu, 24 May 2018 12:26:50 +0530
Subject: tools: bpftool: resolve calls without using imm field

Currently, we resolve the callee's address for a JITed function
call by using the imm field of the call instruction as an offset
from __bpf_call_base. If bpf_jit_kallsyms is enabled, we further
use this address to get the callee's kernel symbol's name.

For some architectures, such as powerpc64, the imm field is not
large enough to hold this offset. So, instead of assigning this
offset to the imm field, the verifier now assigns the subprog
id. Also, a list of kernel symbol addresses for all the JITed
functions is provided in the program info. We now use the imm
field as an index for this list to lookup a callee's symbol's
address and resolve its name.

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/prog.c          | 24 ++++++++++++++++++++++++
 tools/bpf/bpftool/xlated_dumper.c | 10 +++++++++-
 tools/bpf/bpftool/xlated_dumper.h |  2 ++
 3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 9bdfdf2d3fbe..e05ab58d39e2 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -420,7 +420,9 @@ static int do_show(int argc, char **argv)
 
 static int do_dump(int argc, char **argv)
 {
+	unsigned long *func_ksyms = NULL;
 	struct bpf_prog_info info = {};
+	unsigned int nr_func_ksyms;
 	struct dump_data dd = {};
 	__u32 len = sizeof(info);
 	unsigned int buf_size;
@@ -496,10 +498,22 @@ static int do_dump(int argc, char **argv)
 		return -1;
 	}
 
+	nr_func_ksyms = info.nr_jited_ksyms;
+	if (nr_func_ksyms) {
+		func_ksyms = malloc(nr_func_ksyms * sizeof(__u64));
+		if (!func_ksyms) {
+			p_err("mem alloc failed");
+			close(fd);
+			goto err_free;
+		}
+	}
+
 	memset(&info, 0, sizeof(info));
 
 	*member_ptr = ptr_to_u64(buf);
 	*member_len = buf_size;
+	info.jited_ksyms = ptr_to_u64(func_ksyms);
+	info.nr_jited_ksyms = nr_func_ksyms;
 
 	err = bpf_obj_get_info_by_fd(fd, &info, &len);
 	close(fd);
@@ -513,6 +527,11 @@ static int do_dump(int argc, char **argv)
 		goto err_free;
 	}
 
+	if (info.nr_jited_ksyms > nr_func_ksyms) {
+		p_err("too many addresses returned");
+		goto err_free;
+	}
+
 	if ((member_len == &info.jited_prog_len &&
 	     info.jited_prog_insns == 0) ||
 	    (member_len == &info.xlated_prog_len &&
@@ -558,6 +577,9 @@ static int do_dump(int argc, char **argv)
 			dump_xlated_cfg(buf, *member_len);
 	} else {
 		kernel_syms_load(&dd);
+		dd.nr_jited_ksyms = info.nr_jited_ksyms;
+		dd.jited_ksyms = (__u64 *) info.jited_ksyms;
+
 		if (json_output)
 			dump_xlated_json(&dd, buf, *member_len, opcodes);
 		else
@@ -566,10 +588,12 @@ static int do_dump(int argc, char **argv)
 	}
 
 	free(buf);
+	free(func_ksyms);
 	return 0;
 
 err_free:
 	free(buf);
+	free(func_ksyms);
 	return -1;
 }
 
diff --git a/tools/bpf/bpftool/xlated_dumper.c b/tools/bpf/bpftool/xlated_dumper.c
index 7a3173b76c16..efdc8fecf2bb 100644
--- a/tools/bpf/bpftool/xlated_dumper.c
+++ b/tools/bpf/bpftool/xlated_dumper.c
@@ -174,7 +174,11 @@ static const char *print_call_pcrel(struct dump_data *dd,
 				    unsigned long address,
 				    const struct bpf_insn *insn)
 {
-	if (sym)
+	if (!dd->nr_jited_ksyms)
+		/* Do not show address for interpreted programs */
+		snprintf(dd->scratch_buff, sizeof(dd->scratch_buff),
+			"%+d", insn->off);
+	else if (sym)
 		snprintf(dd->scratch_buff, sizeof(dd->scratch_buff),
 			 "%+d#%s", insn->off, sym->name);
 	else
@@ -203,6 +207,10 @@ static const char *print_call(void *private_data,
 	unsigned long address = dd->address_call_base + insn->imm;
 	struct kernel_sym *sym;
 
+	if (insn->src_reg == BPF_PSEUDO_CALL &&
+	    (__u32) insn->imm < dd->nr_jited_ksyms)
+		address = dd->jited_ksyms[insn->imm];
+
 	sym = kernel_syms_search(dd, address);
 	if (insn->src_reg == BPF_PSEUDO_CALL)
 		return print_call_pcrel(dd, sym, address, insn);
diff --git a/tools/bpf/bpftool/xlated_dumper.h b/tools/bpf/bpftool/xlated_dumper.h
index b34affa7ef2d..eafbb49c8d0b 100644
--- a/tools/bpf/bpftool/xlated_dumper.h
+++ b/tools/bpf/bpftool/xlated_dumper.h
@@ -49,6 +49,8 @@ struct dump_data {
 	unsigned long address_call_base;
 	struct kernel_sym *sym_mapping;
 	__u32 sym_count;
+	__u64 *jited_ksyms;
+	__u32 nr_jited_ksyms;
 	char scratch_buff[SYM_MAX_NAME + 8];
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 4d56a76ead2fcd856e677cdc9445ad331a409b8c Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan@linux.vnet.ibm.com>
Date: Thu, 24 May 2018 12:26:51 +0530
Subject: bpf: fix multi-function JITed dump obtained via syscall

Currently, for multi-function programs, we cannot get the JITed
instructions using the bpf system call's BPF_OBJ_GET_INFO_BY_FD
command. Because of this, userspace tools such as bpftool fail
to identify a multi-function program as being JITed or not.

With the JIT enabled and the test program running, this can be
verified as follows:

  # cat /proc/sys/net/core/bpf_jit_enable
  1

Before applying this patch:

  # bpftool prog list
  1: kprobe  name foo  tag b811aab41a39ad3d  gpl
          loaded_at 2018-05-16T11:43:38+0530  uid 0
          xlated 216B  not jited  memlock 65536B
  ...

  # bpftool prog dump jited id 1
  no instructions returned

After applying this patch:

  # bpftool prog list
  1: kprobe  name foo  tag b811aab41a39ad3d  gpl
          loaded_at 2018-05-16T12:13:01+0530  uid 0
          xlated 216B  jited 308B  memlock 65536B
  ...

  # bpftool prog dump jited id 1
     0:   nop
     4:   nop
     8:   mflr    r0
     c:   std     r0,16(r1)
    10:   stdu    r1,-112(r1)
    14:   std     r31,104(r1)
    18:   addi    r31,r1,48
    1c:   li      r3,10
  ...

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/syscall.c | 37 ++++++++++++++++++++++++++++++++++---
 1 file changed, 34 insertions(+), 3 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 068a4fc79ddb..c8e987a612b5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1970,13 +1970,44 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	 * for offload.
 	 */
 	ulen = info.jited_prog_len;
-	info.jited_prog_len = prog->jited_len;
+	if (prog->aux->func_cnt) {
+		u32 i;
+
+		info.jited_prog_len = 0;
+		for (i = 0; i < prog->aux->func_cnt; i++)
+			info.jited_prog_len += prog->aux->func[i]->jited_len;
+	} else {
+		info.jited_prog_len = prog->jited_len;
+	}
+
 	if (info.jited_prog_len && ulen) {
 		if (bpf_dump_raw_ok()) {
 			uinsns = u64_to_user_ptr(info.jited_prog_insns);
 			ulen = min_t(u32, info.jited_prog_len, ulen);
-			if (copy_to_user(uinsns, prog->bpf_func, ulen))
-				return -EFAULT;
+
+			/* for multi-function programs, copy the JITed
+			 * instructions for all the functions
+			 */
+			if (prog->aux->func_cnt) {
+				u32 len, free, i;
+				u8 *img;
+
+				free = ulen;
+				for (i = 0; i < prog->aux->func_cnt; i++) {
+					len = prog->aux->func[i]->jited_len;
+					len = min_t(u32, len, free);
+					img = (u8 *) prog->aux->func[i]->bpf_func;
+					if (copy_to_user(uinsns, img, len))
+						return -EFAULT;
+					uinsns += len;
+					free -= len;
+					if (!free)
+						break;
+				}
+			} else {
+				if (copy_to_user(uinsns, prog->bpf_func, ulen))
+					return -EFAULT;
+			}
 		} else {
 			info.jited_prog_insns = 0;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From 815581c11cc29f74af252b6306ea1ec94160231a Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan@linux.vnet.ibm.com>
Date: Thu, 24 May 2018 12:26:52 +0530
Subject: bpf: get JITed image lengths of functions via syscall

This adds new two new fields to struct bpf_prog_info. For
multi-function programs, these fields can be used to pass
a list of the JITed image lengths of each function for a
given program to userspace using the bpf system call with
the BPF_OBJ_GET_INFO_BY_FD command.

This can be used by userspace applications like bpftool
to split up the contiguous JITed dump, also obtained via
the system call, into more relatable chunks corresponding
to each function.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h |  2 ++
 kernel/bpf/syscall.c     | 20 ++++++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0be90965867d..344d2ddcef49 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2206,7 +2206,9 @@ struct bpf_prog_info {
 	__u64 netns_dev;
 	__u64 netns_ino;
 	__u32 nr_jited_ksyms;
+	__u32 nr_jited_func_lens;
 	__aligned_u64 jited_ksyms;
+	__aligned_u64 jited_func_lens;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c8e987a612b5..788456c18617 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2037,6 +2037,26 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		}
 	}
 
+	ulen = info.nr_jited_func_lens;
+	info.nr_jited_func_lens = prog->aux->func_cnt;
+	if (info.nr_jited_func_lens && ulen) {
+		if (bpf_dump_raw_ok()) {
+			u32 __user *user_lens;
+			u32 func_len, i;
+
+			/* copy the JITed image lengths for each function */
+			ulen = min_t(u32, info.nr_jited_func_lens, ulen);
+			user_lens = u64_to_user_ptr(info.jited_func_lens);
+			for (i = 0; i < ulen; i++) {
+				func_len = prog->aux->func[i]->jited_len;
+				if (put_user(func_len, &user_lens[i]))
+					return -EFAULT;
+			}
+		} else {
+			info.jited_func_lens = 0;
+		}
+	}
+
 done:
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
-- 
cgit v1.2.3-59-g8ed1b


From bd980d43b977c0b6582be906da23b08e0ae1b3dc Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan@linux.vnet.ibm.com>
Date: Thu, 24 May 2018 12:26:53 +0530
Subject: tools: bpf: sync bpf uapi header

Syncing the bpf.h uapi header with tools so that struct
bpf_prog_info has the two new fields for passing on the
JITed image lengths of each function in a multi-function
program.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/include/uapi/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 0be90965867d..344d2ddcef49 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2206,7 +2206,9 @@ struct bpf_prog_info {
 	__u64 netns_dev;
 	__u64 netns_ino;
 	__u32 nr_jited_ksyms;
+	__u32 nr_jited_func_lens;
 	__aligned_u64 jited_ksyms;
+	__aligned_u64 jited_func_lens;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
-- 
cgit v1.2.3-59-g8ed1b


From f7f62c7134288a704f90b137097e1ee689ff25ab Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan@linux.vnet.ibm.com>
Date: Thu, 24 May 2018 12:26:54 +0530
Subject: tools: bpftool: add delimiters to multi-function JITed dumps

This splits up the contiguous JITed dump obtained via the bpf
system call into more relatable chunks for each function in
the program. If the kernel symbols corresponding to these are
known, they are printed in the header for each JIT image dump
otherwise the masked start address is printed.

Before applying this patch:

  # bpftool prog dump jited id 1

     0:	push   %rbp
     1:	mov    %rsp,%rbp
  ...
    70:	leaveq
    71:	retq
    72:	push   %rbp
    73:	mov    %rsp,%rbp
  ...
    dd:	leaveq
    de:	retq

  # bpftool -p prog dump jited id 1

  [{
          "pc": "0x0",
          "operation": "push",
          "operands": ["%rbp"
          ]
      },{
  ...
      },{
          "pc": "0x71",
          "operation": "retq",
          "operands": [null
          ]
      },{
          "pc": "0x72",
          "operation": "push",
          "operands": ["%rbp"
          ]
      },{
  ...
      },{
          "pc": "0xde",
          "operation": "retq",
          "operands": [null
          ]
      }
  ]

After applying this patch:

  # echo 0 > /proc/sys/net/core/bpf_jit_kallsyms
  # bpftool prog dump jited id 1

  0xffffffffc02c7000:
     0:	push   %rbp
     1:	mov    %rsp,%rbp
  ...
    70:	leaveq
    71:	retq

  0xffffffffc02cf000:
     0:	push   %rbp
     1:	mov    %rsp,%rbp
  ...
    6b:	leaveq
    6c:	retq

  # bpftool -p prog dump jited id 1

  [{
          "name": "0xffffffffc02c7000",
          "insns": [{
                  "pc": "0x0",
                  "operation": "push",
                  "operands": ["%rbp"
                  ]
              },{
  ...
              },{
                  "pc": "0x71",
                  "operation": "retq",
                  "operands": [null
                  ]
              }
          ]
      },{
          "name": "0xffffffffc02cf000",
          "insns": [{
                  "pc": "0x0",
                  "operation": "push",
                  "operands": ["%rbp"
                  ]
              },{
  ...
              },{
                  "pc": "0x6c",
                  "operation": "retq",
                  "operands": [null
                  ]
              }
          ]
      }
  ]

  # echo 1 > /proc/sys/net/core/bpf_jit_kallsyms
  # bpftool prog dump jited id 1

  bpf_prog_b811aab41a39ad3d_foo:
     0:	push   %rbp
     1:	mov    %rsp,%rbp
  ...
    70:	leaveq
    71:	retq

  bpf_prog_cf418ac8b67bebd9_F:
     0:	push   %rbp
     1:	mov    %rsp,%rbp
  ...
    6b:	leaveq
    6c:	retq

  # bpftool -p prog dump jited id 1

  [{
          "name": "bpf_prog_b811aab41a39ad3d_foo",
          "insns": [{
                  "pc": "0x0",
                  "operation": "push",
                  "operands": ["%rbp"
                  ]
              },{
  ...
              },{
                  "pc": "0x71",
                  "operation": "retq",
                  "operands": [null
                  ]
              }
          ]
      },{
          "name": "bpf_prog_cf418ac8b67bebd9_F",
          "insns": [{
                  "pc": "0x0",
                  "operation": "push",
                  "operands": ["%rbp"
                  ]
              },{
  ...
              },{
                  "pc": "0x6c",
                  "operation": "retq",
                  "operands": [null
                  ]
              }
          ]
      }
  ]

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/prog.c          | 73 ++++++++++++++++++++++++++++++++++++++-
 tools/bpf/bpftool/xlated_dumper.c |  4 +--
 tools/bpf/bpftool/xlated_dumper.h |  1 +
 3 files changed, 75 insertions(+), 3 deletions(-)

diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index e05ab58d39e2..39b88e760367 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -422,7 +422,9 @@ static int do_dump(int argc, char **argv)
 {
 	unsigned long *func_ksyms = NULL;
 	struct bpf_prog_info info = {};
+	unsigned int *func_lens = NULL;
 	unsigned int nr_func_ksyms;
+	unsigned int nr_func_lens;
 	struct dump_data dd = {};
 	__u32 len = sizeof(info);
 	unsigned int buf_size;
@@ -508,12 +510,24 @@ static int do_dump(int argc, char **argv)
 		}
 	}
 
+	nr_func_lens = info.nr_jited_func_lens;
+	if (nr_func_lens) {
+		func_lens = malloc(nr_func_lens * sizeof(__u32));
+		if (!func_lens) {
+			p_err("mem alloc failed");
+			close(fd);
+			goto err_free;
+		}
+	}
+
 	memset(&info, 0, sizeof(info));
 
 	*member_ptr = ptr_to_u64(buf);
 	*member_len = buf_size;
 	info.jited_ksyms = ptr_to_u64(func_ksyms);
 	info.nr_jited_ksyms = nr_func_ksyms;
+	info.jited_func_lens = ptr_to_u64(func_lens);
+	info.nr_jited_func_lens = nr_func_lens;
 
 	err = bpf_obj_get_info_by_fd(fd, &info, &len);
 	close(fd);
@@ -532,6 +546,11 @@ static int do_dump(int argc, char **argv)
 		goto err_free;
 	}
 
+	if (info.nr_jited_func_lens > nr_func_lens) {
+		p_err("too many values returned");
+		goto err_free;
+	}
+
 	if ((member_len == &info.jited_prog_len &&
 	     info.jited_prog_insns == 0) ||
 	    (member_len == &info.xlated_prog_len &&
@@ -569,7 +588,57 @@ static int do_dump(int argc, char **argv)
 				goto err_free;
 		}
 
-		disasm_print_insn(buf, *member_len, opcodes, name);
+		if (info.nr_jited_func_lens && info.jited_func_lens) {
+			struct kernel_sym *sym = NULL;
+			char sym_name[SYM_MAX_NAME];
+			unsigned char *img = buf;
+			__u64 *ksyms = NULL;
+			__u32 *lens;
+			__u32 i;
+
+			if (info.nr_jited_ksyms) {
+				kernel_syms_load(&dd);
+				ksyms = (__u64 *) info.jited_ksyms;
+			}
+
+			if (json_output)
+				jsonw_start_array(json_wtr);
+
+			lens = (__u32 *) info.jited_func_lens;
+			for (i = 0; i < info.nr_jited_func_lens; i++) {
+				if (ksyms) {
+					sym = kernel_syms_search(&dd, ksyms[i]);
+					if (sym)
+						sprintf(sym_name, "%s", sym->name);
+					else
+						sprintf(sym_name, "0x%016llx", ksyms[i]);
+				} else {
+					strcpy(sym_name, "unknown");
+				}
+
+				if (json_output) {
+					jsonw_start_object(json_wtr);
+					jsonw_name(json_wtr, "name");
+					jsonw_string(json_wtr, sym_name);
+					jsonw_name(json_wtr, "insns");
+				} else {
+					printf("%s:\n", sym_name);
+				}
+
+				disasm_print_insn(img, lens[i], opcodes, name);
+				img += lens[i];
+
+				if (json_output)
+					jsonw_end_object(json_wtr);
+				else
+					printf("\n");
+			}
+
+			if (json_output)
+				jsonw_end_array(json_wtr);
+		} else {
+			disasm_print_insn(buf, *member_len, opcodes, name);
+		}
 	} else if (visual) {
 		if (json_output)
 			jsonw_null(json_wtr);
@@ -589,11 +658,13 @@ static int do_dump(int argc, char **argv)
 
 	free(buf);
 	free(func_ksyms);
+	free(func_lens);
 	return 0;
 
 err_free:
 	free(buf);
 	free(func_ksyms);
+	free(func_lens);
 	return -1;
 }
 
diff --git a/tools/bpf/bpftool/xlated_dumper.c b/tools/bpf/bpftool/xlated_dumper.c
index efdc8fecf2bb..b97f1da60dd1 100644
--- a/tools/bpf/bpftool/xlated_dumper.c
+++ b/tools/bpf/bpftool/xlated_dumper.c
@@ -102,8 +102,8 @@ void kernel_syms_destroy(struct dump_data *dd)
 	free(dd->sym_mapping);
 }
 
-static struct kernel_sym *kernel_syms_search(struct dump_data *dd,
-					     unsigned long key)
+struct kernel_sym *kernel_syms_search(struct dump_data *dd,
+				      unsigned long key)
 {
 	struct kernel_sym sym = {
 		.address = key,
diff --git a/tools/bpf/bpftool/xlated_dumper.h b/tools/bpf/bpftool/xlated_dumper.h
index eafbb49c8d0b..33d86e2b369b 100644
--- a/tools/bpf/bpftool/xlated_dumper.h
+++ b/tools/bpf/bpftool/xlated_dumper.h
@@ -56,6 +56,7 @@ struct dump_data {
 
 void kernel_syms_load(struct dump_data *dd);
 void kernel_syms_destroy(struct dump_data *dd);
+struct kernel_sym *kernel_syms_search(struct dump_data *dd, unsigned long key);
 void dump_xlated_json(struct dump_data *dd, void *buf, unsigned int len,
 		      bool opcodes);
 void dump_xlated_plain(struct dump_data *dd, void *buf, unsigned int len,
-- 
cgit v1.2.3-59-g8ed1b


From 63526e1c805b5a8992d25386d315009fcabac8e2 Mon Sep 17 00:00:00 2001
From: Mathieu Xhonneux <m.xhonneux@gmail.com>
Date: Sun, 20 May 2018 14:58:12 +0100
Subject: ipv6: sr: make seg6.h includable without IPv6

include/net/seg6.h cannot be included in a source file if CONFIG_IPV6 is
not enabled:
   include/net/seg6.h: In function 'seg6_pernet':
>> include/net/seg6.h:52:14: error: 'struct net' has no member named
                                        'ipv6'; did you mean 'ipv4'?
     return net->ipv6.seg6_data;
                 ^~~~
                 ipv4

This commit makes seg6_pernet return NULL if IPv6 is not compiled, hence
allowing seg6.h to be included regardless of the configuration.

Signed-off-by: Mathieu Xhonneux <m.xhonneux@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/seg6.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/net/seg6.h b/include/net/seg6.h
index 099bad59dc90..70b4cfac52d7 100644
--- a/include/net/seg6.h
+++ b/include/net/seg6.h
@@ -49,7 +49,11 @@ struct seg6_pernet_data {
 
 static inline struct seg6_pernet_data *seg6_pernet(struct net *net)
 {
+#if IS_ENABLED(CONFIG_IPV6)
 	return net->ipv6.seg6_data;
+#else
+	return NULL;
+#endif
 }
 
 extern int seg6_init(void);
-- 
cgit v1.2.3-59-g8ed1b


From 1c1e761ef1a08a878a040e67979711015cfc163e Mon Sep 17 00:00:00 2001
From: Mathieu Xhonneux <m.xhonneux@gmail.com>
Date: Sun, 20 May 2018 14:58:13 +0100
Subject: ipv6: sr: export function lookup_nexthop

The function lookup_nexthop is essential to implement most of the seg6local
actions. As we want to provide a BPF helper allowing to apply some of these
actions on the packet being processed, the helper should be able to call
this function, hence the need to make it public.

Moreover, if one argument is incorrect or if the next hop can not be found,
an error should be returned by the BPF helper so the BPF program can adapt
its processing of the packet (return an error, properly force the drop,
...). This patch hence makes this function return dst->error to indicate a
possible error.

Signed-off-by: Mathieu Xhonneux <m.xhonneux@gmail.com>
Acked-by: David Lebrun <dlebrun@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/seg6.h       |  3 ++-
 include/net/seg6_local.h | 24 ++++++++++++++++++++++++
 net/ipv6/seg6_local.c    | 20 +++++++++++---------
 3 files changed, 37 insertions(+), 10 deletions(-)
 create mode 100644 include/net/seg6_local.h

diff --git a/include/net/seg6.h b/include/net/seg6.h
index 70b4cfac52d7..e029e301faa5 100644
--- a/include/net/seg6.h
+++ b/include/net/seg6.h
@@ -67,5 +67,6 @@ extern bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len);
 extern int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh,
 			     int proto);
 extern int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh);
-
+extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
+			       u32 tbl_id);
 #endif
diff --git a/include/net/seg6_local.h b/include/net/seg6_local.h
new file mode 100644
index 000000000000..57498b23085d
--- /dev/null
+++ b/include/net/seg6_local.h
@@ -0,0 +1,24 @@
+/*
+ *  SR-IPv6 implementation
+ *
+ *  Authors:
+ *  David Lebrun <david.lebrun@uclouvain.be>
+ *  eBPF support: Mathieu Xhonneux <m.xhonneux@gmail.com>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _NET_SEG6_LOCAL_H
+#define _NET_SEG6_LOCAL_H
+
+#include <linux/net.h>
+#include <linux/ipv6.h>
+
+extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
+			       u32 tbl_id);
+
+#endif
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index 45722327375a..e9b23fb924ad 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -30,6 +30,7 @@
 #ifdef CONFIG_IPV6_SEG6_HMAC
 #include <net/seg6_hmac.h>
 #endif
+#include <net/seg6_local.h>
 #include <linux/etherdevice.h>
 
 struct seg6_local_lwt;
@@ -140,8 +141,8 @@ static void advance_nextseg(struct ipv6_sr_hdr *srh, struct in6_addr *daddr)
 	*daddr = *addr;
 }
 
-static void lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
-			   u32 tbl_id)
+int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
+			u32 tbl_id)
 {
 	struct net *net = dev_net(skb->dev);
 	struct ipv6hdr *hdr = ipv6_hdr(skb);
@@ -187,6 +188,7 @@ out:
 
 	skb_dst_drop(skb);
 	skb_dst_set(skb, dst);
+	return dst->error;
 }
 
 /* regular endpoint function */
@@ -200,7 +202,7 @@ static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt)
 
 	advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
 
-	lookup_nexthop(skb, NULL, 0);
+	seg6_lookup_nexthop(skb, NULL, 0);
 
 	return dst_input(skb);
 
@@ -220,7 +222,7 @@ static int input_action_end_x(struct sk_buff *skb, struct seg6_local_lwt *slwt)
 
 	advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
 
-	lookup_nexthop(skb, &slwt->nh6, 0);
+	seg6_lookup_nexthop(skb, &slwt->nh6, 0);
 
 	return dst_input(skb);
 
@@ -239,7 +241,7 @@ static int input_action_end_t(struct sk_buff *skb, struct seg6_local_lwt *slwt)
 
 	advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
 
-	lookup_nexthop(skb, NULL, slwt->table);
+	seg6_lookup_nexthop(skb, NULL, slwt->table);
 
 	return dst_input(skb);
 
@@ -331,7 +333,7 @@ static int input_action_end_dx6(struct sk_buff *skb,
 	if (!ipv6_addr_any(&slwt->nh6))
 		nhaddr = &slwt->nh6;
 
-	lookup_nexthop(skb, nhaddr, 0);
+	seg6_lookup_nexthop(skb, nhaddr, 0);
 
 	return dst_input(skb);
 drop:
@@ -380,7 +382,7 @@ static int input_action_end_dt6(struct sk_buff *skb,
 	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
 		goto drop;
 
-	lookup_nexthop(skb, NULL, slwt->table);
+	seg6_lookup_nexthop(skb, NULL, slwt->table);
 
 	return dst_input(skb);
 
@@ -406,7 +408,7 @@ static int input_action_end_b6(struct sk_buff *skb, struct seg6_local_lwt *slwt)
 	ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
 	skb_set_transport_header(skb, sizeof(struct ipv6hdr));
 
-	lookup_nexthop(skb, NULL, 0);
+	seg6_lookup_nexthop(skb, NULL, 0);
 
 	return dst_input(skb);
 
@@ -438,7 +440,7 @@ static int input_action_end_b6_encap(struct sk_buff *skb,
 	ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
 	skb_set_transport_header(skb, sizeof(struct ipv6hdr));
 
-	lookup_nexthop(skb, NULL, 0);
+	seg6_lookup_nexthop(skb, NULL, 0);
 
 	return dst_input(skb);
 
-- 
cgit v1.2.3-59-g8ed1b


From fe94cc290f535709d3c5ebd1e472dfd0aec7ee79 Mon Sep 17 00:00:00 2001
From: Mathieu Xhonneux <m.xhonneux@gmail.com>
Date: Sun, 20 May 2018 14:58:14 +0100
Subject: bpf: Add IPv6 Segment Routing helpers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The BPF seg6local hook should be powerful enough to enable users to
implement most of the use-cases one could think of. After some thinking,
we figured out that the following actions should be possible on a SRv6
packet, requiring 3 specific helpers :
    - bpf_lwt_seg6_store_bytes: Modify non-sensitive fields of the SRH
    - bpf_lwt_seg6_adjust_srh: Allow to grow or shrink a SRH
                               (to add/delete TLVs)
    - bpf_lwt_seg6_action: Apply some SRv6 network programming actions
                           (specifically End.X, End.T, End.B6 and
                            End.B6.Encap)

The specifications of these helpers are provided in the patch (see
include/uapi/linux/bpf.h).

The non-sensitive fields of the SRH are the following : flags, tag and
TLVs. The other fields can not be modified, to maintain the SRH
integrity. Flags, tag and TLVs can easily be modified as their validity
can be checked afterwards via seg6_validate_srh. It is not allowed to
modify the segments directly. If one wants to add segments on the path,
he should stack a new SRH using the End.B6 action via
bpf_lwt_seg6_action.

Growing, shrinking or editing TLVs via the helpers will flag the SRH as
invalid, and it will have to be re-validated before re-entering the IPv6
layer. This flag is stored in a per-CPU buffer, along with the current
header length in bytes.

Storing the SRH len in bytes in the control block is mandatory when using
bpf_lwt_seg6_adjust_srh. The Header Ext. Length field contains the SRH
len rounded to 8 bytes (a padding TLV can be inserted to ensure the 8-bytes
boundary). When adding/deleting TLVs within the BPF program, the SRH may
temporary be in an invalid state where its length cannot be rounded to 8
bytes without remainder, hence the need to store the length in bytes
separately. The caller of the BPF program can then ensure that the SRH's
final length is valid using this value. Again, a final SRH modified by a
BPF program which doesn’t respect the 8-bytes boundary will be discarded
as it will be considered as invalid.

Finally, a fourth helper is provided, bpf_lwt_push_encap, which is
available from the LWT BPF IN hook, but not from the seg6local BPF one.
This helper allows to encapsulate a Segment Routing Header (either with
a new outer IPv6 header, or by inlining it directly in the existing IPv6
header) into a non-SRv6 packet. This helper is required if we want to
offer the possibility to dynamically encapsulate a SRH for non-SRv6 packet,
as the BPF seg6local hook only works on traffic already containing a SRH.
This is the BPF equivalent of the seg6 LWT infrastructure, which achieves
the same purpose but with a static SRH per route.

These helpers require CONFIG_IPV6=y (and not =m).

Signed-off-by: Mathieu Xhonneux <m.xhonneux@gmail.com>
Acked-by: David Lebrun <dlebrun@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/seg6_local.h |   8 ++
 include/uapi/linux/bpf.h |  96 +++++++++++++++-
 net/core/filter.c        | 285 +++++++++++++++++++++++++++++++++++++++++++----
 net/ipv6/Kconfig         |   5 +
 net/ipv6/seg6_local.c    |   2 +
 5 files changed, 372 insertions(+), 24 deletions(-)

diff --git a/include/net/seg6_local.h b/include/net/seg6_local.h
index 57498b23085d..661fd5b4d3e0 100644
--- a/include/net/seg6_local.h
+++ b/include/net/seg6_local.h
@@ -15,10 +15,18 @@
 #ifndef _NET_SEG6_LOCAL_H
 #define _NET_SEG6_LOCAL_H
 
+#include <linux/percpu.h>
 #include <linux/net.h>
 #include <linux/ipv6.h>
 
 extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
 			       u32 tbl_id);
 
+struct seg6_bpf_srh_state {
+	bool valid;
+	u16 hdrlen;
+};
+
+DECLARE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states);
+
 #endif
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 344d2ddcef49..fdaf6a0bfa5b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1902,6 +1902,90 @@ union bpf_attr {
  *		egress otherwise). This is the only flag supported for now.
  *	Return
  *		**SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
+ *	Description
+ *		Encapsulate the packet associated to *skb* within a Layer 3
+ *		protocol header. This header is provided in the buffer at
+ *		address *hdr*, with *len* its size in bytes. *type* indicates
+ *		the protocol of the header and can be one of:
+ *
+ *		**BPF_LWT_ENCAP_SEG6**
+ *			IPv6 encapsulation with Segment Routing Header
+ *			(**struct ipv6_sr_hdr**). *hdr* only contains the SRH,
+ *			the IPv6 header is computed by the kernel.
+ *		**BPF_LWT_ENCAP_SEG6_INLINE**
+ *			Only works if *skb* contains an IPv6 packet. Insert a
+ *			Segment Routing Header (**struct ipv6_sr_hdr**) inside
+ *			the IPv6 header.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len)
+ *	Description
+ *		Store *len* bytes from address *from* into the packet
+ *		associated to *skb*, at *offset*. Only the flags, tag and TLVs
+ *		inside the outermost IPv6 Segment Routing Header can be
+ *		modified through this helper.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta)
+ *	Description
+ *		Adjust the size allocated to TLVs in the outermost IPv6
+ *		Segment Routing Header contained in the packet associated to
+ *		*skb*, at position *offset* by *delta* bytes. Only offsets
+ *		after the segments are accepted. *delta* can be as well
+ *		positive (growing) as negative (shrinking).
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len)
+ *	Description
+ *		Apply an IPv6 Segment Routing action of type *action* to the
+ *		packet associated to *skb*. Each action takes a parameter
+ *		contained at address *param*, and of length *param_len* bytes.
+ *		*action* can be one of:
+ *
+ *		**SEG6_LOCAL_ACTION_END_X**
+ *			End.X action: Endpoint with Layer-3 cross-connect.
+ *			Type of *param*: **struct in6_addr**.
+ *		**SEG6_LOCAL_ACTION_END_T**
+ *			End.T action: Endpoint with specific IPv6 table lookup.
+ *			Type of *param*: **int**.
+ *		**SEG6_LOCAL_ACTION_END_B6**
+ *			End.B6 action: Endpoint bound to an SRv6 policy.
+ *			Type of param: **struct ipv6_sr_hdr**.
+ *		**SEG6_LOCAL_ACTION_END_B6_ENCAP**
+ *			End.B6.Encap action: Endpoint bound to an SRv6
+ *			encapsulation policy.
+ *			Type of param: **struct ipv6_sr_hdr**.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1976,7 +2060,11 @@ union bpf_attr {
 	FN(fib_lookup),			\
 	FN(sock_hash_update),		\
 	FN(msg_redirect_hash),		\
-	FN(sk_redirect_hash),
+	FN(sk_redirect_hash),		\
+	FN(lwt_push_encap),		\
+	FN(lwt_seg6_store_bytes),	\
+	FN(lwt_seg6_adjust_srh),	\
+	FN(lwt_seg6_action),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2043,6 +2131,12 @@ enum bpf_hdr_start_off {
 	BPF_HDR_START_NET,
 };
 
+/* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */
+enum bpf_lwt_encap_mode {
+	BPF_LWT_ENCAP_SEG6,
+	BPF_LWT_ENCAP_SEG6_INLINE
+};
+
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
  */
diff --git a/net/core/filter.c b/net/core/filter.c
index ba3ff5aa575a..2e05dcfda6d7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -64,6 +64,10 @@
 #include <net/ip_fib.h>
 #include <net/flow.h>
 #include <net/arp.h>
+#include <net/ipv6.h>
+#include <linux/seg6_local.h>
+#include <net/seg6.h>
+#include <net/seg6_local.h>
 
 /**
  *	sk_filter_trim_cap - run a packet through a socket filter
@@ -3363,28 +3367,6 @@ static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
 	.arg3_type      = ARG_ANYTHING,
 };
 
-bool bpf_helper_changes_pkt_data(void *func)
-{
-	if (func == bpf_skb_vlan_push ||
-	    func == bpf_skb_vlan_pop ||
-	    func == bpf_skb_store_bytes ||
-	    func == bpf_skb_change_proto ||
-	    func == bpf_skb_change_head ||
-	    func == bpf_skb_change_tail ||
-	    func == bpf_skb_adjust_room ||
-	    func == bpf_skb_pull_data ||
-	    func == bpf_clone_redirect ||
-	    func == bpf_l3_csum_replace ||
-	    func == bpf_l4_csum_replace ||
-	    func == bpf_xdp_adjust_head ||
-	    func == bpf_xdp_adjust_meta ||
-	    func == bpf_msg_pull_data ||
-	    func == bpf_xdp_adjust_tail)
-		return true;
-
-	return false;
-}
-
 static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
 				  unsigned long off, unsigned long len)
 {
@@ -4360,6 +4342,264 @@ static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };
 
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
+{
+	int err;
+	struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr;
+
+	if (!seg6_validate_srh(srh, len))
+		return -EINVAL;
+
+	switch (type) {
+	case BPF_LWT_ENCAP_SEG6_INLINE:
+		if (skb->protocol != htons(ETH_P_IPV6))
+			return -EBADMSG;
+
+		err = seg6_do_srh_inline(skb, srh);
+		break;
+	case BPF_LWT_ENCAP_SEG6:
+		skb_reset_inner_headers(skb);
+		skb->encapsulation = 1;
+		err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	bpf_compute_data_pointers(skb);
+	if (err)
+		return err;
+
+	ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+	skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+	return seg6_lookup_nexthop(skb, NULL, 0);
+}
+#endif /* CONFIG_IPV6_SEG6_BPF */
+
+BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
+	   u32, len)
+{
+	switch (type) {
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+	case BPF_LWT_ENCAP_SEG6:
+	case BPF_LWT_ENCAP_SEG6_INLINE:
+		return bpf_push_seg6_encap(skb, type, hdr, len);
+#endif
+	default:
+		return -EINVAL;
+	}
+}
+
+static const struct bpf_func_proto bpf_lwt_push_encap_proto = {
+	.func		= bpf_lwt_push_encap,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_MEM,
+	.arg4_type	= ARG_CONST_SIZE
+};
+
+BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
+	   const void *, from, u32, len)
+{
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+	struct seg6_bpf_srh_state *srh_state =
+		this_cpu_ptr(&seg6_bpf_srh_states);
+	void *srh_tlvs, *srh_end, *ptr;
+	struct ipv6_sr_hdr *srh;
+	int srhoff = 0;
+
+	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+		return -EINVAL;
+
+	srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+	srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4));
+	srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen);
+
+	ptr = skb->data + offset;
+	if (ptr >= srh_tlvs && ptr + len <= srh_end)
+		srh_state->valid = 0;
+	else if (ptr < (void *)&srh->flags ||
+		 ptr + len > (void *)&srh->segments)
+		return -EFAULT;
+
+	if (unlikely(bpf_try_make_writable(skb, offset + len)))
+		return -EFAULT;
+
+	memcpy(skb->data + offset, from, len);
+	return 0;
+#else /* CONFIG_IPV6_SEG6_BPF */
+	return -EOPNOTSUPP;
+#endif
+}
+
+static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
+	.func		= bpf_lwt_seg6_store_bytes,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_MEM,
+	.arg4_type	= ARG_CONST_SIZE
+};
+
+BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
+	   u32, action, void *, param, u32, param_len)
+{
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+	struct seg6_bpf_srh_state *srh_state =
+		this_cpu_ptr(&seg6_bpf_srh_states);
+	struct ipv6_sr_hdr *srh;
+	int srhoff = 0;
+	int err;
+
+	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+		return -EINVAL;
+	srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+
+	if (!srh_state->valid) {
+		if (unlikely((srh_state->hdrlen & 7) != 0))
+			return -EBADMSG;
+
+		srh->hdrlen = (u8)(srh_state->hdrlen >> 3);
+		if (unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3)))
+			return -EBADMSG;
+
+		srh_state->valid = 1;
+	}
+
+	switch (action) {
+	case SEG6_LOCAL_ACTION_END_X:
+		if (param_len != sizeof(struct in6_addr))
+			return -EINVAL;
+		return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0);
+	case SEG6_LOCAL_ACTION_END_T:
+		if (param_len != sizeof(int))
+			return -EINVAL;
+		return seg6_lookup_nexthop(skb, NULL, *(int *)param);
+	case SEG6_LOCAL_ACTION_END_B6:
+		err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE,
+					  param, param_len);
+		if (!err)
+			srh_state->hdrlen =
+				((struct ipv6_sr_hdr *)param)->hdrlen << 3;
+		return err;
+	case SEG6_LOCAL_ACTION_END_B6_ENCAP:
+		err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6,
+					  param, param_len);
+		if (!err)
+			srh_state->hdrlen =
+				((struct ipv6_sr_hdr *)param)->hdrlen << 3;
+		return err;
+	default:
+		return -EINVAL;
+	}
+#else /* CONFIG_IPV6_SEG6_BPF */
+	return -EOPNOTSUPP;
+#endif
+}
+
+static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
+	.func		= bpf_lwt_seg6_action,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_MEM,
+	.arg4_type	= ARG_CONST_SIZE
+};
+
+BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
+	   s32, len)
+{
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
+	struct seg6_bpf_srh_state *srh_state =
+		this_cpu_ptr(&seg6_bpf_srh_states);
+	void *srh_end, *srh_tlvs, *ptr;
+	struct ipv6_sr_hdr *srh;
+	struct ipv6hdr *hdr;
+	int srhoff = 0;
+	int ret;
+
+	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+		return -EINVAL;
+	srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+
+	srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) +
+			((srh->first_segment + 1) << 4));
+	srh_end = (void *)((unsigned char *)srh + sizeof(*srh) +
+			srh_state->hdrlen);
+	ptr = skb->data + offset;
+
+	if (unlikely(ptr < srh_tlvs || ptr > srh_end))
+		return -EFAULT;
+	if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end))
+		return -EFAULT;
+
+	if (len > 0) {
+		ret = skb_cow_head(skb, len);
+		if (unlikely(ret < 0))
+			return ret;
+
+		ret = bpf_skb_net_hdr_push(skb, offset, len);
+	} else {
+		ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len);
+	}
+
+	bpf_compute_data_pointers(skb);
+	if (unlikely(ret < 0))
+		return ret;
+
+	hdr = (struct ipv6hdr *)skb->data;
+	hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+
+	srh_state->hdrlen += len;
+	srh_state->valid = 0;
+	return 0;
+#else /* CONFIG_IPV6_SEG6_BPF */
+	return -EOPNOTSUPP;
+#endif
+}
+
+static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
+	.func		= bpf_lwt_seg6_adjust_srh,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+};
+
+bool bpf_helper_changes_pkt_data(void *func)
+{
+	if (func == bpf_skb_vlan_push ||
+	    func == bpf_skb_vlan_pop ||
+	    func == bpf_skb_store_bytes ||
+	    func == bpf_skb_change_proto ||
+	    func == bpf_skb_change_head ||
+	    func == bpf_skb_change_tail ||
+	    func == bpf_skb_adjust_room ||
+	    func == bpf_skb_pull_data ||
+	    func == bpf_clone_redirect ||
+	    func == bpf_l3_csum_replace ||
+	    func == bpf_l4_csum_replace ||
+	    func == bpf_xdp_adjust_head ||
+	    func == bpf_xdp_adjust_meta ||
+	    func == bpf_msg_pull_data ||
+	    func == bpf_xdp_adjust_tail ||
+	    func == bpf_lwt_push_encap ||
+	    func == bpf_lwt_seg6_store_bytes ||
+	    func == bpf_lwt_seg6_adjust_srh ||
+	    func == bpf_lwt_seg6_action
+	    )
+		return true;
+
+	return false;
+}
+
 static const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
@@ -4774,7 +5014,6 @@ static bool lwt_is_valid_access(int off, int size,
 	return bpf_skb_is_valid_access(off, size, type, prog, info);
 }
 
-
 /* Attach type specific accesses */
 static bool __sock_filter_check_attach_type(int off,
 					    enum bpf_access_type access_type,
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 11e4e80cf7e9..0eff75525da1 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -329,4 +329,9 @@ config IPV6_SEG6_HMAC
 
 	  If unsure, say N.
 
+config IPV6_SEG6_BPF
+	def_bool y
+	depends on IPV6_SEG6_LWTUNNEL
+	depends on IPV6 = y
+
 endif # IPV6
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index e9b23fb924ad..ae68c1ef8fb0 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -449,6 +449,8 @@ drop:
 	return err;
 }
 
+DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states);
+
 static struct seg6_action_desc seg6_action_table[] = {
 	{
 		.action		= SEG6_LOCAL_ACTION_END,
-- 
cgit v1.2.3-59-g8ed1b


From cd3092c7f8db8c320ea7f1aa7c1adeac2450f43a Mon Sep 17 00:00:00 2001
From: Mathieu Xhonneux <m.xhonneux@gmail.com>
Date: Sun, 20 May 2018 14:58:15 +0100
Subject: bpf: Split lwt inout verifier structures

The new bpf_lwt_push_encap helper should only be accessible within the
LWT BPF IN hook, and not the OUT one, as this may lead to a skb under
panic.

At the moment, both LWT BPF IN and OUT share the same list of helpers,
whose calls are authorized by the verifier. This patch separates the
verifier ops for the IN and OUT hooks, and allows the IN hook to call the
bpf_lwt_push_encap helper.

This patch is also the occasion to put all lwt_*_func_proto functions
together for clarity. At the moment, socks_op_func_proto is in the middle
of lwt_inout_func_proto and lwt_xmit_func_proto.

Signed-off-by: Mathieu Xhonneux <m.xhonneux@gmail.com>
Acked-by: David Lebrun <dlebrun@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_types.h |  4 +--
 net/core/filter.c         | 83 +++++++++++++++++++++++++++++------------------
 2 files changed, 54 insertions(+), 33 deletions(-)

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index b67f8793de0d..aa5c8b878474 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -9,8 +9,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp)
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb)
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock)
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr)
-BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout)
-BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout)
+BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_in)
+BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_out)
 BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit)
 BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops)
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb)
diff --git a/net/core/filter.c b/net/core/filter.c
index 2e05dcfda6d7..5dc44309d124 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4783,33 +4783,6 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	}
 }
 
-static const struct bpf_func_proto *
-lwt_inout_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
-{
-	switch (func_id) {
-	case BPF_FUNC_skb_load_bytes:
-		return &bpf_skb_load_bytes_proto;
-	case BPF_FUNC_skb_pull_data:
-		return &bpf_skb_pull_data_proto;
-	case BPF_FUNC_csum_diff:
-		return &bpf_csum_diff_proto;
-	case BPF_FUNC_get_cgroup_classid:
-		return &bpf_get_cgroup_classid_proto;
-	case BPF_FUNC_get_route_realm:
-		return &bpf_get_route_realm_proto;
-	case BPF_FUNC_get_hash_recalc:
-		return &bpf_get_hash_recalc_proto;
-	case BPF_FUNC_perf_event_output:
-		return &bpf_skb_event_output_proto;
-	case BPF_FUNC_get_smp_processor_id:
-		return &bpf_get_smp_processor_id_proto;
-	case BPF_FUNC_skb_under_cgroup:
-		return &bpf_skb_under_cgroup_proto;
-	default:
-		return bpf_base_func_proto(func_id);
-	}
-}
-
 static const struct bpf_func_proto *
 sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -4875,6 +4848,44 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	}
 }
 
+static const struct bpf_func_proto *
+lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_skb_load_bytes:
+		return &bpf_skb_load_bytes_proto;
+	case BPF_FUNC_skb_pull_data:
+		return &bpf_skb_pull_data_proto;
+	case BPF_FUNC_csum_diff:
+		return &bpf_csum_diff_proto;
+	case BPF_FUNC_get_cgroup_classid:
+		return &bpf_get_cgroup_classid_proto;
+	case BPF_FUNC_get_route_realm:
+		return &bpf_get_route_realm_proto;
+	case BPF_FUNC_get_hash_recalc:
+		return &bpf_get_hash_recalc_proto;
+	case BPF_FUNC_perf_event_output:
+		return &bpf_skb_event_output_proto;
+	case BPF_FUNC_get_smp_processor_id:
+		return &bpf_get_smp_processor_id_proto;
+	case BPF_FUNC_skb_under_cgroup:
+		return &bpf_skb_under_cgroup_proto;
+	default:
+		return bpf_base_func_proto(func_id);
+	}
+}
+
+static const struct bpf_func_proto *
+lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_lwt_push_encap:
+		return &bpf_lwt_push_encap_proto;
+	default:
+		return lwt_out_func_proto(func_id, prog);
+	}
+}
+
 static const struct bpf_func_proto *
 lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -4906,7 +4917,7 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_set_hash_invalid:
 		return &bpf_set_hash_invalid_proto;
 	default:
-		return lwt_inout_func_proto(func_id, prog);
+		return lwt_out_func_proto(func_id, prog);
 	}
 }
 
@@ -6587,13 +6598,23 @@ const struct bpf_prog_ops cg_skb_prog_ops = {
 	.test_run		= bpf_prog_test_run_skb,
 };
 
-const struct bpf_verifier_ops lwt_inout_verifier_ops = {
-	.get_func_proto		= lwt_inout_func_proto,
+const struct bpf_verifier_ops lwt_in_verifier_ops = {
+	.get_func_proto		= lwt_in_func_proto,
+	.is_valid_access	= lwt_is_valid_access,
+	.convert_ctx_access	= bpf_convert_ctx_access,
+};
+
+const struct bpf_prog_ops lwt_in_prog_ops = {
+	.test_run		= bpf_prog_test_run_skb,
+};
+
+const struct bpf_verifier_ops lwt_out_verifier_ops = {
+	.get_func_proto		= lwt_out_func_proto,
 	.is_valid_access	= lwt_is_valid_access,
 	.convert_ctx_access	= bpf_convert_ctx_access,
 };
 
-const struct bpf_prog_ops lwt_inout_prog_ops = {
+const struct bpf_prog_ops lwt_out_prog_ops = {
 	.test_run		= bpf_prog_test_run_skb,
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 004d4b274e2a1a895a0e5dc66158b90a7d463d44 Mon Sep 17 00:00:00 2001
From: Mathieu Xhonneux <m.xhonneux@gmail.com>
Date: Sun, 20 May 2018 14:58:16 +0100
Subject: ipv6: sr: Add seg6local action End.BPF

This patch adds the End.BPF action to the LWT seg6local infrastructure.
This action works like any other seg6local End action, meaning that an IPv6
header with SRH is needed, whose DA has to be equal to the SID of the
action. It will also advance the SRH to the next segment, the BPF program
does not have to take care of this.

Since the BPF program may not be a source of instability in the kernel, it
is important to ensure that the integrity of the packet is maintained
before yielding it back to the IPv6 layer. The hook hence keeps track if
the SRH has been altered through the helpers, and re-validates its
content if needed with seg6_validate_srh. The state kept for validation is
stored in a per-CPU buffer. The BPF program is not allowed to directly
write into the packet, and only some fields of the SRH can be altered
through the helper bpf_lwt_seg6_store_bytes.

Performances profiling has shown that the SRH re-validation does not induce
a significant overhead. If the altered SRH is deemed as invalid, the packet
is dropped.

This validation is also done before executing any action through
bpf_lwt_seg6_action, and will not be performed again if the SRH is not
modified after calling the action.

The BPF program may return 3 types of return codes:
    - BPF_OK: the End.BPF action will look up the next destination through
             seg6_lookup_nexthop.
    - BPF_REDIRECT: if an action has been executed through the
          bpf_lwt_seg6_action helper, the BPF program should return this
          value, as the skb's destination is already set and the default
          lookup should not be performed.
    - BPF_DROP : the packet will be dropped.

Signed-off-by: Mathieu Xhonneux <m.xhonneux@gmail.com>
Acked-by: David Lebrun <dlebrun@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_types.h       |   1 +
 include/uapi/linux/bpf.h        |   1 +
 include/uapi/linux/seg6_local.h |  12 +++
 kernel/bpf/verifier.c           |   1 +
 net/core/filter.c               |  25 ++++++
 net/ipv6/seg6_local.c           | 168 +++++++++++++++++++++++++++++++++++++++-
 tools/lib/bpf/libbpf.c          |   1 +
 7 files changed, 207 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index aa5c8b878474..b161e506dcfc 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -12,6 +12,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr)
 BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_in)
 BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_out)
 BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit)
+BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_SEG6LOCAL, lwt_seg6local)
 BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops)
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb)
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index fdaf6a0bfa5b..e95fec90c2c1 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -141,6 +141,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_MSG,
 	BPF_PROG_TYPE_RAW_TRACEPOINT,
 	BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+	BPF_PROG_TYPE_LWT_SEG6LOCAL,
 };
 
 enum bpf_attach_type {
diff --git a/include/uapi/linux/seg6_local.h b/include/uapi/linux/seg6_local.h
index ef2d8c3e76c1..edc138bdc56d 100644
--- a/include/uapi/linux/seg6_local.h
+++ b/include/uapi/linux/seg6_local.h
@@ -25,6 +25,7 @@ enum {
 	SEG6_LOCAL_NH6,
 	SEG6_LOCAL_IIF,
 	SEG6_LOCAL_OIF,
+	SEG6_LOCAL_BPF,
 	__SEG6_LOCAL_MAX,
 };
 #define SEG6_LOCAL_MAX (__SEG6_LOCAL_MAX - 1)
@@ -59,10 +60,21 @@ enum {
 	SEG6_LOCAL_ACTION_END_AS	= 13,
 	/* forward to SR-unaware VNF with masquerading */
 	SEG6_LOCAL_ACTION_END_AM	= 14,
+	/* custom BPF action */
+	SEG6_LOCAL_ACTION_END_BPF	= 15,
 
 	__SEG6_LOCAL_ACTION_MAX,
 };
 
 #define SEG6_LOCAL_ACTION_MAX (__SEG6_LOCAL_ACTION_MAX - 1)
 
+enum {
+	SEG6_LOCAL_BPF_PROG_UNSPEC,
+	SEG6_LOCAL_BPF_PROG,
+	SEG6_LOCAL_BPF_PROG_NAME,
+	__SEG6_LOCAL_BPF_PROG_MAX,
+};
+
+#define SEG6_LOCAL_BPF_PROG_MAX (__SEG6_LOCAL_BPF_PROG_MAX - 1)
+
 #endif
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8c4d9d0fd3ab..967cacf286ea 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1262,6 +1262,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
 	switch (env->prog->type) {
 	case BPF_PROG_TYPE_LWT_IN:
 	case BPF_PROG_TYPE_LWT_OUT:
+	case BPF_PROG_TYPE_LWT_SEG6LOCAL:
 		/* dst_input() and dst_output() can't write for now */
 		if (t == BPF_WRITE)
 			return false;
diff --git a/net/core/filter.c b/net/core/filter.c
index 5dc44309d124..aa114c4acb25 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4921,6 +4921,21 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	}
 }
 
+static const struct bpf_func_proto *
+lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_lwt_seg6_store_bytes:
+		return &bpf_lwt_seg6_store_bytes_proto;
+	case BPF_FUNC_lwt_seg6_action:
+		return &bpf_lwt_seg6_action_proto;
+	case BPF_FUNC_lwt_seg6_adjust_srh:
+		return &bpf_lwt_seg6_adjust_srh_proto;
+	default:
+		return lwt_out_func_proto(func_id, prog);
+	}
+}
+
 static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type,
 				    const struct bpf_prog *prog,
 				    struct bpf_insn_access_aux *info)
@@ -6629,6 +6644,16 @@ const struct bpf_prog_ops lwt_xmit_prog_ops = {
 	.test_run		= bpf_prog_test_run_skb,
 };
 
+const struct bpf_verifier_ops lwt_seg6local_verifier_ops = {
+	.get_func_proto		= lwt_seg6local_func_proto,
+	.is_valid_access	= lwt_is_valid_access,
+	.convert_ctx_access	= bpf_convert_ctx_access,
+};
+
+const struct bpf_prog_ops lwt_seg6local_prog_ops = {
+	.test_run		= bpf_prog_test_run_skb,
+};
+
 const struct bpf_verifier_ops cg_sock_verifier_ops = {
 	.get_func_proto		= sock_filter_func_proto,
 	.is_valid_access	= sock_filter_is_valid_access,
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index ae68c1ef8fb0..cd6e4cab63f6 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -1,8 +1,9 @@
 /*
  *  SR-IPv6 implementation
  *
- *  Author:
+ *  Authors:
  *  David Lebrun <david.lebrun@uclouvain.be>
+ *  eBPF support: Mathieu Xhonneux <m.xhonneux@gmail.com>
  *
  *
  *  This program is free software; you can redistribute it and/or
@@ -32,6 +33,7 @@
 #endif
 #include <net/seg6_local.h>
 #include <linux/etherdevice.h>
+#include <linux/bpf.h>
 
 struct seg6_local_lwt;
 
@@ -42,6 +44,11 @@ struct seg6_action_desc {
 	int static_headroom;
 };
 
+struct bpf_lwt_prog {
+	struct bpf_prog *prog;
+	char *name;
+};
+
 struct seg6_local_lwt {
 	int action;
 	struct ipv6_sr_hdr *srh;
@@ -50,6 +57,7 @@ struct seg6_local_lwt {
 	struct in6_addr nh6;
 	int iif;
 	int oif;
+	struct bpf_lwt_prog bpf;
 
 	int headroom;
 	struct seg6_action_desc *desc;
@@ -451,6 +459,69 @@ drop:
 
 DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states);
 
+static int input_action_end_bpf(struct sk_buff *skb,
+				struct seg6_local_lwt *slwt)
+{
+	struct seg6_bpf_srh_state *srh_state =
+		this_cpu_ptr(&seg6_bpf_srh_states);
+	struct seg6_bpf_srh_state local_srh_state;
+	struct ipv6_sr_hdr *srh;
+	int srhoff = 0;
+	int ret;
+
+	srh = get_and_validate_srh(skb);
+	if (!srh)
+		goto drop;
+	advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
+
+	/* preempt_disable is needed to protect the per-CPU buffer srh_state,
+	 * which is also accessed by the bpf_lwt_seg6_* helpers
+	 */
+	preempt_disable();
+	srh_state->hdrlen = srh->hdrlen << 3;
+	srh_state->valid = 1;
+
+	rcu_read_lock();
+	bpf_compute_data_pointers(skb);
+	ret = bpf_prog_run_save_cb(slwt->bpf.prog, skb);
+	rcu_read_unlock();
+
+	local_srh_state = *srh_state;
+	preempt_enable();
+
+	switch (ret) {
+	case BPF_OK:
+	case BPF_REDIRECT:
+		break;
+	case BPF_DROP:
+		goto drop;
+	default:
+		pr_warn_once("bpf-seg6local: Illegal return value %u\n", ret);
+		goto drop;
+	}
+
+	if (unlikely((local_srh_state.hdrlen & 7) != 0))
+		goto drop;
+
+	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+		goto drop;
+	srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+	srh->hdrlen = (u8)(local_srh_state.hdrlen >> 3);
+
+	if (!local_srh_state.valid &&
+	    unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3)))
+		goto drop;
+
+	if (ret != BPF_REDIRECT)
+		seg6_lookup_nexthop(skb, NULL, 0);
+
+	return dst_input(skb);
+
+drop:
+	kfree_skb(skb);
+	return -EINVAL;
+}
+
 static struct seg6_action_desc seg6_action_table[] = {
 	{
 		.action		= SEG6_LOCAL_ACTION_END,
@@ -497,7 +568,13 @@ static struct seg6_action_desc seg6_action_table[] = {
 		.attrs		= (1 << SEG6_LOCAL_SRH),
 		.input		= input_action_end_b6_encap,
 		.static_headroom	= sizeof(struct ipv6hdr),
-	}
+	},
+	{
+		.action		= SEG6_LOCAL_ACTION_END_BPF,
+		.attrs		= (1 << SEG6_LOCAL_BPF),
+		.input		= input_action_end_bpf,
+	},
+
 };
 
 static struct seg6_action_desc *__get_action_desc(int action)
@@ -542,6 +619,7 @@ static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = {
 				    .len = sizeof(struct in6_addr) },
 	[SEG6_LOCAL_IIF]	= { .type = NLA_U32 },
 	[SEG6_LOCAL_OIF]	= { .type = NLA_U32 },
+	[SEG6_LOCAL_BPF]	= { .type = NLA_NESTED },
 };
 
 static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt)
@@ -719,6 +797,75 @@ static int cmp_nla_oif(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
 	return 0;
 }
 
+#define MAX_PROG_NAME 256
+static const struct nla_policy bpf_prog_policy[SEG6_LOCAL_BPF_PROG_MAX + 1] = {
+	[SEG6_LOCAL_BPF_PROG]	   = { .type = NLA_U32, },
+	[SEG6_LOCAL_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
+				       .len = MAX_PROG_NAME },
+};
+
+static int parse_nla_bpf(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+{
+	struct nlattr *tb[SEG6_LOCAL_BPF_PROG_MAX + 1];
+	struct bpf_prog *p;
+	int ret;
+	u32 fd;
+
+	ret = nla_parse_nested(tb, SEG6_LOCAL_BPF_PROG_MAX,
+			       attrs[SEG6_LOCAL_BPF], bpf_prog_policy, NULL);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[SEG6_LOCAL_BPF_PROG] || !tb[SEG6_LOCAL_BPF_PROG_NAME])
+		return -EINVAL;
+
+	slwt->bpf.name = nla_memdup(tb[SEG6_LOCAL_BPF_PROG_NAME], GFP_KERNEL);
+	if (!slwt->bpf.name)
+		return -ENOMEM;
+
+	fd = nla_get_u32(tb[SEG6_LOCAL_BPF_PROG]);
+	p = bpf_prog_get_type(fd, BPF_PROG_TYPE_LWT_SEG6LOCAL);
+	if (IS_ERR(p)) {
+		kfree(slwt->bpf.name);
+		return PTR_ERR(p);
+	}
+
+	slwt->bpf.prog = p;
+	return 0;
+}
+
+static int put_nla_bpf(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+	struct nlattr *nest;
+
+	if (!slwt->bpf.prog)
+		return 0;
+
+	nest = nla_nest_start(skb, SEG6_LOCAL_BPF);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(skb, SEG6_LOCAL_BPF_PROG, slwt->bpf.prog->aux->id))
+		return -EMSGSIZE;
+
+	if (slwt->bpf.name &&
+	    nla_put_string(skb, SEG6_LOCAL_BPF_PROG_NAME, slwt->bpf.name))
+		return -EMSGSIZE;
+
+	return nla_nest_end(skb, nest);
+}
+
+static int cmp_nla_bpf(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+	if (!a->bpf.name && !b->bpf.name)
+		return 0;
+
+	if (!a->bpf.name || !b->bpf.name)
+		return 1;
+
+	return strcmp(a->bpf.name, b->bpf.name);
+}
+
 struct seg6_action_param {
 	int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt);
 	int (*put)(struct sk_buff *skb, struct seg6_local_lwt *slwt);
@@ -749,6 +896,11 @@ static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = {
 	[SEG6_LOCAL_OIF]	= { .parse = parse_nla_oif,
 				    .put = put_nla_oif,
 				    .cmp = cmp_nla_oif },
+
+	[SEG6_LOCAL_BPF]	= { .parse = parse_nla_bpf,
+				    .put = put_nla_bpf,
+				    .cmp = cmp_nla_bpf },
+
 };
 
 static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt)
@@ -834,6 +986,13 @@ static void seg6_local_destroy_state(struct lwtunnel_state *lwt)
 	struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt);
 
 	kfree(slwt->srh);
+
+	if (slwt->desc->attrs & (1 << SEG6_LOCAL_BPF)) {
+		kfree(slwt->bpf.name);
+		bpf_prog_put(slwt->bpf.prog);
+	}
+
+	return;
 }
 
 static int seg6_local_fill_encap(struct sk_buff *skb,
@@ -886,6 +1045,11 @@ static int seg6_local_get_encap_size(struct lwtunnel_state *lwt)
 	if (attrs & (1 << SEG6_LOCAL_OIF))
 		nlsize += nla_total_size(4);
 
+	if (attrs & (1 << SEG6_LOCAL_BPF))
+		nlsize += nla_total_size(sizeof(struct nlattr)) +
+		       nla_total_size(MAX_PROG_NAME) +
+		       nla_total_size(4);
+
 	return nlsize;
 }
 
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index e5cd4a958846..d07e44474848 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1456,6 +1456,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type)
 	case BPF_PROG_TYPE_LWT_IN:
 	case BPF_PROG_TYPE_LWT_OUT:
 	case BPF_PROG_TYPE_LWT_XMIT:
+	case BPF_PROG_TYPE_LWT_SEG6LOCAL:
 	case BPF_PROG_TYPE_SOCK_OPS:
 	case BPF_PROG_TYPE_SK_SKB:
 	case BPF_PROG_TYPE_CGROUP_DEVICE:
-- 
cgit v1.2.3-59-g8ed1b


From c99a84eac026f404457810f5253220bab17ac213 Mon Sep 17 00:00:00 2001
From: Mathieu Xhonneux <m.xhonneux@gmail.com>
Date: Sun, 20 May 2018 14:58:17 +0100
Subject: selftests/bpf: test for seg6local End.BPF action

Add a new test for the seg6local End.BPF action. The following helpers
are also tested:

- bpf_lwt_push_encap within the LWT BPF IN hook
- bpf_lwt_seg6_action
- bpf_lwt_seg6_adjust_srh
- bpf_lwt_seg6_store_bytes

A chain of End.BPF actions is built. The SRH is injected through a LWT
BPF IN hook before entering this chain. Each End.BPF action validates
the previous one, otherwise the packet is dropped. The test succeeds
if the last node in the chain receives the packet and the UDP datagram
contained can be retrieved from userspace.

Signed-off-by: Mathieu Xhonneux <m.xhonneux@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/include/uapi/linux/bpf.h                    |  97 ++++-
 tools/testing/selftests/bpf/Makefile              |   6 +-
 tools/testing/selftests/bpf/bpf_helpers.h         |  12 +
 tools/testing/selftests/bpf/test_lwt_seg6local.c  | 437 ++++++++++++++++++++++
 tools/testing/selftests/bpf/test_lwt_seg6local.sh | 140 +++++++
 5 files changed, 689 insertions(+), 3 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_lwt_seg6local.c
 create mode 100755 tools/testing/selftests/bpf/test_lwt_seg6local.sh

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 344d2ddcef49..e95fec90c2c1 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -141,6 +141,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_MSG,
 	BPF_PROG_TYPE_RAW_TRACEPOINT,
 	BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+	BPF_PROG_TYPE_LWT_SEG6LOCAL,
 };
 
 enum bpf_attach_type {
@@ -1902,6 +1903,90 @@ union bpf_attr {
  *		egress otherwise). This is the only flag supported for now.
  *	Return
  *		**SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
+ *	Description
+ *		Encapsulate the packet associated to *skb* within a Layer 3
+ *		protocol header. This header is provided in the buffer at
+ *		address *hdr*, with *len* its size in bytes. *type* indicates
+ *		the protocol of the header and can be one of:
+ *
+ *		**BPF_LWT_ENCAP_SEG6**
+ *			IPv6 encapsulation with Segment Routing Header
+ *			(**struct ipv6_sr_hdr**). *hdr* only contains the SRH,
+ *			the IPv6 header is computed by the kernel.
+ *		**BPF_LWT_ENCAP_SEG6_INLINE**
+ *			Only works if *skb* contains an IPv6 packet. Insert a
+ *			Segment Routing Header (**struct ipv6_sr_hdr**) inside
+ *			the IPv6 header.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len)
+ *	Description
+ *		Store *len* bytes from address *from* into the packet
+ *		associated to *skb*, at *offset*. Only the flags, tag and TLVs
+ *		inside the outermost IPv6 Segment Routing Header can be
+ *		modified through this helper.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta)
+ *	Description
+ *		Adjust the size allocated to TLVs in the outermost IPv6
+ *		Segment Routing Header contained in the packet associated to
+ *		*skb*, at position *offset* by *delta* bytes. Only offsets
+ *		after the segments are accepted. *delta* can be as well
+ *		positive (growing) as negative (shrinking).
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len)
+ *	Description
+ *		Apply an IPv6 Segment Routing action of type *action* to the
+ *		packet associated to *skb*. Each action takes a parameter
+ *		contained at address *param*, and of length *param_len* bytes.
+ *		*action* can be one of:
+ *
+ *		**SEG6_LOCAL_ACTION_END_X**
+ *			End.X action: Endpoint with Layer-3 cross-connect.
+ *			Type of *param*: **struct in6_addr**.
+ *		**SEG6_LOCAL_ACTION_END_T**
+ *			End.T action: Endpoint with specific IPv6 table lookup.
+ *			Type of *param*: **int**.
+ *		**SEG6_LOCAL_ACTION_END_B6**
+ *			End.B6 action: Endpoint bound to an SRv6 policy.
+ *			Type of param: **struct ipv6_sr_hdr**.
+ *		**SEG6_LOCAL_ACTION_END_B6_ENCAP**
+ *			End.B6.Encap action: Endpoint bound to an SRv6
+ *			encapsulation policy.
+ *			Type of param: **struct ipv6_sr_hdr**.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1976,7 +2061,11 @@ union bpf_attr {
 	FN(fib_lookup),			\
 	FN(sock_hash_update),		\
 	FN(msg_redirect_hash),		\
-	FN(sk_redirect_hash),
+	FN(sk_redirect_hash),		\
+	FN(lwt_push_encap),		\
+	FN(lwt_seg6_store_bytes),	\
+	FN(lwt_seg6_adjust_srh),	\
+	FN(lwt_seg6_action),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2043,6 +2132,12 @@ enum bpf_hdr_start_off {
 	BPF_HDR_START_NET,
 };
 
+/* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */
+enum bpf_lwt_encap_mode {
+	BPF_LWT_ENCAP_SEG6,
+	BPF_LWT_ENCAP_SEG6_INLINE
+};
+
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
  */
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 49618c9a5638..85044448bbc7 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -33,7 +33,8 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \
 	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \
 	test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \
-	test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o
+	test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \
+	test_lwt_seg6local.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
@@ -42,7 +43,8 @@ TEST_PROGS := test_kmod.sh \
 	test_xdp_meta.sh \
 	test_offload.py \
 	test_sock_addr.sh \
-	test_tunnel.sh
+	test_tunnel.sh \
+	test_lwt_seg6local.sh
 
 # Compile but not part of 'make run_tests'
 TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 8f143dfb3700..334d3e8c5e89 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -114,6 +114,18 @@ static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) =
 static int (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params,
 			     int plen, __u32 flags) =
 	(void *) BPF_FUNC_fib_lookup;
+static int (*bpf_lwt_push_encap)(void *ctx, unsigned int type, void *hdr,
+				 unsigned int len) =
+	(void *) BPF_FUNC_lwt_push_encap;
+static int (*bpf_lwt_seg6_store_bytes)(void *ctx, unsigned int offset,
+				       void *from, unsigned int len) =
+	(void *) BPF_FUNC_lwt_seg6_store_bytes;
+static int (*bpf_lwt_seg6_action)(void *ctx, unsigned int action, void *param,
+				  unsigned int param_len) =
+	(void *) BPF_FUNC_lwt_seg6_action;
+static int (*bpf_lwt_seg6_adjust_srh)(void *ctx, unsigned int offset,
+				      unsigned int len) =
+	(void *) BPF_FUNC_lwt_seg6_adjust_srh;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/tools/testing/selftests/bpf/test_lwt_seg6local.c b/tools/testing/selftests/bpf/test_lwt_seg6local.c
new file mode 100644
index 000000000000..0575751bc1bc
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lwt_seg6local.c
@@ -0,0 +1,437 @@
+#include <stddef.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <linux/seg6_local.h>
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define bpf_printk(fmt, ...)				\
+({							\
+	char ____fmt[] = fmt;				\
+	bpf_trace_printk(____fmt, sizeof(____fmt),	\
+			##__VA_ARGS__);			\
+})
+
+/* Packet parsing state machine helpers. */
+#define cursor_advance(_cursor, _len) \
+	({ void *_tmp = _cursor; _cursor += _len; _tmp; })
+
+#define SR6_FLAG_ALERT (1 << 4)
+
+#define htonll(x) ((bpf_htonl(1)) == 1 ? (x) : ((uint64_t)bpf_htonl((x) & \
+				0xFFFFFFFF) << 32) | bpf_htonl((x) >> 32))
+#define ntohll(x) ((bpf_ntohl(1)) == 1 ? (x) : ((uint64_t)bpf_ntohl((x) & \
+				0xFFFFFFFF) << 32) | bpf_ntohl((x) >> 32))
+#define BPF_PACKET_HEADER __attribute__((packed))
+
+struct ip6_t {
+	unsigned int ver:4;
+	unsigned int priority:8;
+	unsigned int flow_label:20;
+	unsigned short payload_len;
+	unsigned char next_header;
+	unsigned char hop_limit;
+	unsigned long long src_hi;
+	unsigned long long src_lo;
+	unsigned long long dst_hi;
+	unsigned long long dst_lo;
+} BPF_PACKET_HEADER;
+
+struct ip6_addr_t {
+	unsigned long long hi;
+	unsigned long long lo;
+} BPF_PACKET_HEADER;
+
+struct ip6_srh_t {
+	unsigned char nexthdr;
+	unsigned char hdrlen;
+	unsigned char type;
+	unsigned char segments_left;
+	unsigned char first_segment;
+	unsigned char flags;
+	unsigned short tag;
+
+	struct ip6_addr_t segments[0];
+} BPF_PACKET_HEADER;
+
+struct sr6_tlv_t {
+	unsigned char type;
+	unsigned char len;
+	unsigned char value[0];
+} BPF_PACKET_HEADER;
+
+__attribute__((always_inline)) struct ip6_srh_t *get_srh(struct __sk_buff *skb)
+{
+	void *cursor, *data_end;
+	struct ip6_srh_t *srh;
+	struct ip6_t *ip;
+	uint8_t *ipver;
+
+	data_end = (void *)(long)skb->data_end;
+	cursor = (void *)(long)skb->data;
+	ipver = (uint8_t *)cursor;
+
+	if ((void *)ipver + sizeof(*ipver) > data_end)
+		return NULL;
+
+	if ((*ipver >> 4) != 6)
+		return NULL;
+
+	ip = cursor_advance(cursor, sizeof(*ip));
+	if ((void *)ip + sizeof(*ip) > data_end)
+		return NULL;
+
+	if (ip->next_header != 43)
+		return NULL;
+
+	srh = cursor_advance(cursor, sizeof(*srh));
+	if ((void *)srh + sizeof(*srh) > data_end)
+		return NULL;
+
+	if (srh->type != 4)
+		return NULL;
+
+	return srh;
+}
+
+__attribute__((always_inline))
+int update_tlv_pad(struct __sk_buff *skb, uint32_t new_pad,
+		   uint32_t old_pad, uint32_t pad_off)
+{
+	int err;
+
+	if (new_pad != old_pad) {
+		err = bpf_lwt_seg6_adjust_srh(skb, pad_off,
+					  (int) new_pad - (int) old_pad);
+		if (err)
+			return err;
+	}
+
+	if (new_pad > 0) {
+		char pad_tlv_buf[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+					0, 0, 0};
+		struct sr6_tlv_t *pad_tlv = (struct sr6_tlv_t *) pad_tlv_buf;
+
+		pad_tlv->type = SR6_TLV_PADDING;
+		pad_tlv->len = new_pad - 2;
+
+		err = bpf_lwt_seg6_store_bytes(skb, pad_off,
+					       (void *)pad_tlv_buf, new_pad);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+__attribute__((always_inline))
+int is_valid_tlv_boundary(struct __sk_buff *skb, struct ip6_srh_t *srh,
+			  uint32_t *tlv_off, uint32_t *pad_size,
+			  uint32_t *pad_off)
+{
+	uint32_t srh_off, cur_off;
+	int offset_valid = 0;
+	int err;
+
+	srh_off = (char *)srh - (char *)(long)skb->data;
+	// cur_off = end of segments, start of possible TLVs
+	cur_off = srh_off + sizeof(*srh) +
+		sizeof(struct ip6_addr_t) * (srh->first_segment + 1);
+
+	*pad_off = 0;
+
+	// we can only go as far as ~10 TLVs due to the BPF max stack size
+	#pragma clang loop unroll(full)
+	for (int i = 0; i < 10; i++) {
+		struct sr6_tlv_t tlv;
+
+		if (cur_off == *tlv_off)
+			offset_valid = 1;
+
+		if (cur_off >= srh_off + ((srh->hdrlen + 1) << 3))
+			break;
+
+		err = bpf_skb_load_bytes(skb, cur_off, &tlv, sizeof(tlv));
+		if (err)
+			return err;
+
+		if (tlv.type == SR6_TLV_PADDING) {
+			*pad_size = tlv.len + sizeof(tlv);
+			*pad_off = cur_off;
+
+			if (*tlv_off == srh_off) {
+				*tlv_off = cur_off;
+				offset_valid = 1;
+			}
+			break;
+
+		} else if (tlv.type == SR6_TLV_HMAC) {
+			break;
+		}
+
+		cur_off += sizeof(tlv) + tlv.len;
+	} // we reached the padding or HMAC TLVs, or the end of the SRH
+
+	if (*pad_off == 0)
+		*pad_off = cur_off;
+
+	if (*tlv_off == -1)
+		*tlv_off = cur_off;
+	else if (!offset_valid)
+		return -EINVAL;
+
+	return 0;
+}
+
+__attribute__((always_inline))
+int add_tlv(struct __sk_buff *skb, struct ip6_srh_t *srh, uint32_t tlv_off,
+	    struct sr6_tlv_t *itlv, uint8_t tlv_size)
+{
+	uint32_t srh_off = (char *)srh - (char *)(long)skb->data;
+	uint8_t len_remaining, new_pad;
+	uint32_t pad_off = 0;
+	uint32_t pad_size = 0;
+	uint32_t partial_srh_len;
+	int err;
+
+	if (tlv_off != -1)
+		tlv_off += srh_off;
+
+	if (itlv->type == SR6_TLV_PADDING || itlv->type == SR6_TLV_HMAC)
+		return -EINVAL;
+
+	err = is_valid_tlv_boundary(skb, srh, &tlv_off, &pad_size, &pad_off);
+	if (err)
+		return err;
+
+	err = bpf_lwt_seg6_adjust_srh(skb, tlv_off, sizeof(*itlv) + itlv->len);
+	if (err)
+		return err;
+
+	err = bpf_lwt_seg6_store_bytes(skb, tlv_off, (void *)itlv, tlv_size);
+	if (err)
+		return err;
+
+	// the following can't be moved inside update_tlv_pad because the
+	// bpf verifier has some issues with it
+	pad_off += sizeof(*itlv) + itlv->len;
+	partial_srh_len = pad_off - srh_off;
+	len_remaining = partial_srh_len % 8;
+	new_pad = 8 - len_remaining;
+
+	if (new_pad == 1) // cannot pad for 1 byte only
+		new_pad = 9;
+	else if (new_pad == 8)
+		new_pad = 0;
+
+	return update_tlv_pad(skb, new_pad, pad_size, pad_off);
+}
+
+__attribute__((always_inline))
+int delete_tlv(struct __sk_buff *skb, struct ip6_srh_t *srh,
+	       uint32_t tlv_off)
+{
+	uint32_t srh_off = (char *)srh - (char *)(long)skb->data;
+	uint8_t len_remaining, new_pad;
+	uint32_t partial_srh_len;
+	uint32_t pad_off = 0;
+	uint32_t pad_size = 0;
+	struct sr6_tlv_t tlv;
+	int err;
+
+	tlv_off += srh_off;
+
+	err = is_valid_tlv_boundary(skb, srh, &tlv_off, &pad_size, &pad_off);
+	if (err)
+		return err;
+
+	err = bpf_skb_load_bytes(skb, tlv_off, &tlv, sizeof(tlv));
+	if (err)
+		return err;
+
+	err = bpf_lwt_seg6_adjust_srh(skb, tlv_off, -(sizeof(tlv) + tlv.len));
+	if (err)
+		return err;
+
+	pad_off -= sizeof(tlv) + tlv.len;
+	partial_srh_len = pad_off - srh_off;
+	len_remaining = partial_srh_len % 8;
+	new_pad = 8 - len_remaining;
+	if (new_pad == 1) // cannot pad for 1 byte only
+		new_pad = 9;
+	else if (new_pad == 8)
+		new_pad = 0;
+
+	return update_tlv_pad(skb, new_pad, pad_size, pad_off);
+}
+
+__attribute__((always_inline))
+int has_egr_tlv(struct __sk_buff *skb, struct ip6_srh_t *srh)
+{
+	int tlv_offset = sizeof(struct ip6_t) + sizeof(struct ip6_srh_t) +
+		((srh->first_segment + 1) << 4);
+	struct sr6_tlv_t tlv;
+
+	if (bpf_skb_load_bytes(skb, tlv_offset, &tlv, sizeof(struct sr6_tlv_t)))
+		return 0;
+
+	if (tlv.type == SR6_TLV_EGRESS && tlv.len == 18) {
+		struct ip6_addr_t egr_addr;
+
+		if (bpf_skb_load_bytes(skb, tlv_offset + 4, &egr_addr, 16))
+			return 0;
+
+		// check if egress TLV value is correct
+		if (ntohll(egr_addr.hi) == 0xfd00000000000000 &&
+				ntohll(egr_addr.lo) == 0x4)
+			return 1;
+	}
+
+	return 0;
+}
+
+// This function will push a SRH with segments fd00::1, fd00::2, fd00::3,
+// fd00::4
+SEC("encap_srh")
+int __encap_srh(struct __sk_buff *skb)
+{
+	unsigned long long hi = 0xfd00000000000000;
+	struct ip6_addr_t *seg;
+	struct ip6_srh_t *srh;
+	char srh_buf[72]; // room for 4 segments
+	int err;
+
+	srh = (struct ip6_srh_t *)srh_buf;
+	srh->nexthdr = 0;
+	srh->hdrlen = 8;
+	srh->type = 4;
+	srh->segments_left = 3;
+	srh->first_segment = 3;
+	srh->flags = 0;
+	srh->tag = 0;
+
+	seg = (struct ip6_addr_t *)((char *)srh + sizeof(*srh));
+
+	#pragma clang loop unroll(full)
+	for (unsigned long long lo = 0; lo < 4; lo++) {
+		seg->lo = htonll(4 - lo);
+		seg->hi = htonll(hi);
+		seg = (struct ip6_addr_t *)((char *)seg + sizeof(*seg));
+	}
+
+	err = bpf_lwt_push_encap(skb, 0, (void *)srh, sizeof(srh_buf));
+	if (err)
+		return BPF_DROP;
+
+	return BPF_REDIRECT;
+}
+
+// Add an Egress TLV fc00::4, add the flag A,
+// and apply End.X action to fc42::1
+SEC("add_egr_x")
+int __add_egr_x(struct __sk_buff *skb)
+{
+	unsigned long long hi = 0xfc42000000000000;
+	unsigned long long lo = 0x1;
+	struct ip6_srh_t *srh = get_srh(skb);
+	uint8_t new_flags = SR6_FLAG_ALERT;
+	struct ip6_addr_t addr;
+	int err, offset;
+
+	if (srh == NULL)
+		return BPF_DROP;
+
+	uint8_t tlv[20] = {2, 18, 0, 0, 0xfd, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+			   0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4};
+
+	err = add_tlv(skb, srh, (srh->hdrlen+1) << 3,
+		      (struct sr6_tlv_t *)&tlv, 20);
+	if (err)
+		return BPF_DROP;
+
+	offset = sizeof(struct ip6_t) + offsetof(struct ip6_srh_t, flags);
+	err = bpf_lwt_seg6_store_bytes(skb, offset,
+				       (void *)&new_flags, sizeof(new_flags));
+	if (err)
+		return BPF_DROP;
+
+	addr.lo = htonll(lo);
+	addr.hi = htonll(hi);
+	err = bpf_lwt_seg6_action(skb, SEG6_LOCAL_ACTION_END_X,
+				  (void *)&addr, sizeof(addr));
+	if (err)
+		return BPF_DROP;
+	return BPF_REDIRECT;
+}
+
+// Pop the Egress TLV, reset the flags, change the tag 2442 and finally do a
+// simple End action
+SEC("pop_egr")
+int __pop_egr(struct __sk_buff *skb)
+{
+	struct ip6_srh_t *srh = get_srh(skb);
+	uint16_t new_tag = bpf_htons(2442);
+	uint8_t new_flags = 0;
+	int err, offset;
+
+	if (srh == NULL)
+		return BPF_DROP;
+
+	if (srh->flags != SR6_FLAG_ALERT)
+		return BPF_DROP;
+
+	if (srh->hdrlen != 11) // 4 segments + Egress TLV + Padding TLV
+		return BPF_DROP;
+
+	if (!has_egr_tlv(skb, srh))
+		return BPF_DROP;
+
+	err = delete_tlv(skb, srh, 8 + (srh->first_segment + 1) * 16);
+	if (err)
+		return BPF_DROP;
+
+	offset = sizeof(struct ip6_t) + offsetof(struct ip6_srh_t, flags);
+	if (bpf_lwt_seg6_store_bytes(skb, offset, (void *)&new_flags,
+				     sizeof(new_flags)))
+		return BPF_DROP;
+
+	offset = sizeof(struct ip6_t) + offsetof(struct ip6_srh_t, tag);
+	if (bpf_lwt_seg6_store_bytes(skb, offset, (void *)&new_tag,
+				     sizeof(new_tag)))
+		return BPF_DROP;
+
+	return BPF_OK;
+}
+
+// Inspect if the Egress TLV and flag have been removed, if the tag is correct,
+// then apply a End.T action to reach the last segment
+SEC("inspect_t")
+int __inspect_t(struct __sk_buff *skb)
+{
+	struct ip6_srh_t *srh = get_srh(skb);
+	int table = 117;
+	int err;
+
+	if (srh == NULL)
+		return BPF_DROP;
+
+	if (srh->flags != 0)
+		return BPF_DROP;
+
+	if (srh->tag != bpf_htons(2442))
+		return BPF_DROP;
+
+	if (srh->hdrlen != 8) // 4 segments
+		return BPF_DROP;
+
+	err = bpf_lwt_seg6_action(skb, SEG6_LOCAL_ACTION_END_T,
+				  (void *)&table, sizeof(table));
+
+	if (err)
+		return BPF_DROP;
+
+	return BPF_REDIRECT;
+}
+
+char __license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_lwt_seg6local.sh b/tools/testing/selftests/bpf/test_lwt_seg6local.sh
new file mode 100755
index 000000000000..1c77994b5e71
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lwt_seg6local.sh
@@ -0,0 +1,140 @@
+#!/bin/bash
+# Connects 6 network namespaces through veths.
+# Each NS may have different IPv6 global scope addresses :
+#   NS1 ---- NS2 ---- NS3 ---- NS4 ---- NS5 ---- NS6
+# fb00::1           fd00::1  fd00::2  fd00::3  fb00::6
+#                   fc42::1           fd00::4
+#
+# All IPv6 packets going to fb00::/16 through NS2 will be encapsulated in a
+# IPv6 header with a Segment Routing Header, with segments :
+# 	fd00::1 -> fd00::2 -> fd00::3 -> fd00::4
+#
+# 3 fd00::/16 IPv6 addresses are binded to seg6local End.BPF actions :
+# - fd00::1 : add a TLV, change the flags and apply a End.X action to fc42::1
+# - fd00::2 : remove the TLV, change the flags, add a tag
+# - fd00::3 : apply an End.T action to fd00::4, through routing table 117
+#
+# fd00::4 is a simple Segment Routing node decapsulating the inner IPv6 packet.
+# Each End.BPF action will validate the operations applied on the SRH by the
+# previous BPF program in the chain, otherwise the packet is dropped.
+#
+# An UDP datagram is sent from fb00::1 to fb00::6. The test succeeds if this
+# datagram can be read on NS6 when binding to fb00::6.
+
+TMP_FILE="/tmp/selftest_lwt_seg6local.txt"
+
+cleanup()
+{
+	if [ "$?" = "0" ]; then
+		echo "selftests: test_lwt_seg6local [PASS]";
+	else
+		echo "selftests: test_lwt_seg6local [FAILED]";
+	fi
+
+	set +e
+	ip netns del ns1 2> /dev/null
+	ip netns del ns2 2> /dev/null
+	ip netns del ns3 2> /dev/null
+	ip netns del ns4 2> /dev/null
+	ip netns del ns5 2> /dev/null
+	ip netns del ns6 2> /dev/null
+	rm -f $TMP_FILE
+}
+
+set -e
+
+ip netns add ns1
+ip netns add ns2
+ip netns add ns3
+ip netns add ns4
+ip netns add ns5
+ip netns add ns6
+
+trap cleanup 0 2 3 6 9
+
+ip link add veth1 type veth peer name veth2
+ip link add veth3 type veth peer name veth4
+ip link add veth5 type veth peer name veth6
+ip link add veth7 type veth peer name veth8
+ip link add veth9 type veth peer name veth10
+
+ip link set veth1 netns ns1
+ip link set veth2 netns ns2
+ip link set veth3 netns ns2
+ip link set veth4 netns ns3
+ip link set veth5 netns ns3
+ip link set veth6 netns ns4
+ip link set veth7 netns ns4
+ip link set veth8 netns ns5
+ip link set veth9 netns ns5
+ip link set veth10 netns ns6
+
+ip netns exec ns1 ip link set dev veth1 up
+ip netns exec ns2 ip link set dev veth2 up
+ip netns exec ns2 ip link set dev veth3 up
+ip netns exec ns3 ip link set dev veth4 up
+ip netns exec ns3 ip link set dev veth5 up
+ip netns exec ns4 ip link set dev veth6 up
+ip netns exec ns4 ip link set dev veth7 up
+ip netns exec ns5 ip link set dev veth8 up
+ip netns exec ns5 ip link set dev veth9 up
+ip netns exec ns6 ip link set dev veth10 up
+ip netns exec ns6 ip link set dev lo up
+
+# All link scope addresses and routes required between veths
+ip netns exec ns1 ip -6 addr add fb00::12/16 dev veth1 scope link
+ip netns exec ns1 ip -6 route add fb00::21 dev veth1 scope link
+ip netns exec ns2 ip -6 addr add fb00::21/16 dev veth2 scope link
+ip netns exec ns2 ip -6 addr add fb00::34/16 dev veth3 scope link
+ip netns exec ns2 ip -6 route add fb00::43 dev veth3 scope link
+ip netns exec ns3 ip -6 route add fb00::65 dev veth5 scope link
+ip netns exec ns3 ip -6 addr add fb00::43/16 dev veth4 scope link
+ip netns exec ns3 ip -6 addr add fb00::56/16 dev veth5 scope link
+ip netns exec ns4 ip -6 addr add fb00::65/16 dev veth6 scope link
+ip netns exec ns4 ip -6 addr add fb00::78/16 dev veth7 scope link
+ip netns exec ns4 ip -6 route add fb00::87 dev veth7 scope link
+ip netns exec ns5 ip -6 addr add fb00::87/16 dev veth8 scope link
+ip netns exec ns5 ip -6 addr add fb00::910/16 dev veth9 scope link
+ip netns exec ns5 ip -6 route add fb00::109 dev veth9 scope link
+ip netns exec ns5 ip -6 route add fb00::109 table 117 dev veth9 scope link
+ip netns exec ns6 ip -6 addr add fb00::109/16 dev veth10 scope link
+
+ip netns exec ns1 ip -6 addr add fb00::1/16 dev lo
+ip netns exec ns1 ip -6 route add fb00::6 dev veth1 via fb00::21
+
+ip netns exec ns2 ip -6 route add fb00::6 encap bpf in obj test_lwt_seg6local.o sec encap_srh dev veth2
+ip netns exec ns2 ip -6 route add fd00::1 dev veth3 via fb00::43 scope link
+
+ip netns exec ns3 ip -6 route add fc42::1 dev veth5 via fb00::65
+ip netns exec ns3 ip -6 route add fd00::1 encap seg6local action End.BPF obj test_lwt_seg6local.o sec add_egr_x dev veth4
+
+ip netns exec ns4 ip -6 route add fd00::2 encap seg6local action End.BPF obj test_lwt_seg6local.o sec pop_egr dev veth6
+ip netns exec ns4 ip -6 addr add fc42::1 dev lo
+ip netns exec ns4 ip -6 route add fd00::3 dev veth7 via fb00::87
+
+ip netns exec ns5 ip -6 route add fd00::4 table 117 dev veth9 via fb00::109
+ip netns exec ns5 ip -6 route add fd00::3 encap seg6local action End.BPF obj test_lwt_seg6local.o sec inspect_t dev veth8
+
+ip netns exec ns6 ip -6 addr add fb00::6/16 dev lo
+ip netns exec ns6 ip -6 addr add fd00::4/16 dev lo
+
+ip netns exec ns1 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec ns2 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec ns3 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec ns4 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec ns5 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+
+ip netns exec ns6 sysctl net.ipv6.conf.all.seg6_enabled=1 > /dev/null
+ip netns exec ns6 sysctl net.ipv6.conf.lo.seg6_enabled=1 > /dev/null
+ip netns exec ns6 sysctl net.ipv6.conf.veth10.seg6_enabled=1 > /dev/null
+
+ip netns exec ns6 nc -l -6 -u -d 7330 > $TMP_FILE &
+ip netns exec ns1 bash -c "echo 'foobar' | nc -w0 -6 -u -p 2121 -s fb00::1 fb00::6 7330"
+sleep 5 # wait enough time to ensure the UDP datagram arrived to the last segment
+kill -INT $!
+
+if [[ $(< $TMP_FILE) != "foobar" ]]; then
+	exit 1
+fi
+
+exit 0
-- 
cgit v1.2.3-59-g8ed1b


From 61a552eb487f89dd99d480b711b9a073e22ab4c0 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 23 May 2018 21:29:05 -0700
Subject: bpfilter: fix build dependency

BPFILTER could have been enabled without INET causing this build error:
ERROR: "bpfilter_process_sockopt" [net/bpfilter/bpfilter.ko] undefined!

Fixes: d2ba09c17a06 ("net: add skeleton of bpfilter kernel module")
Reported-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bpfilter/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig
index 60725c5f79db..a948b072c28f 100644
--- a/net/bpfilter/Kconfig
+++ b/net/bpfilter/Kconfig
@@ -1,7 +1,7 @@
 menuconfig BPFILTER
 	bool "BPF based packet filtering framework (BPFILTER)"
 	default n
-	depends on NET && BPF
+	depends on NET && BPF && INET
 	help
 	  This builds experimental bpfilter framework that is aiming to
 	  provide netfilter compatible functionality via BPF
-- 
cgit v1.2.3-59-g8ed1b


From 13405468f49d4b6e014fdf376f46393f9590769d Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 24 May 2018 00:41:19 -0700
Subject: bpfilter: don't pass O_CREAT when opening console for debug

Passing O_CREAT (00000100) to open means we should also pass file
mode as the third parameter.  Creating /dev/console as a regular
file may not be helpful anyway, so simply drop the flag when
opening debug_fd.

Fixes: d2ba09c17a06 ("net: add skeleton of bpfilter kernel module")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bpfilter/main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c
index 81bbc1684896..1317f108df8a 100644
--- a/net/bpfilter/main.c
+++ b/net/bpfilter/main.c
@@ -55,7 +55,7 @@ static void loop(void)
 
 int main(void)
 {
-	debug_fd = open("/dev/console", 00000002 | 00000100);
+	debug_fd = open("/dev/console", 00000002);
 	dprintf(debug_fd, "Started bpfilter\n");
 	loop();
 	close(debug_fd);
-- 
cgit v1.2.3-59-g8ed1b


From 87e5808d52b65fc5b0bfda209ba8864cc2f933e5 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 23 May 2018 08:05:20 +0200
Subject: net: phy: replace bool members in struct phy_device with bit-fields

In struct phy_device we have a number of flags being defined as type
bool. Similar to e.g. struct pci_dev we can save some space by using
bit-fields.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 073235e70442..6cd09098427c 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -406,13 +406,17 @@ struct phy_device {
 	u32 phy_id;
 
 	struct phy_c45_device_ids c45_ids;
-	bool is_c45;
-	bool is_internal;
-	bool is_pseudo_fixed_link;
-	bool has_fixups;
-	bool suspended;
-	bool sysfs_links;
-	bool loopback_enabled;
+	unsigned is_c45:1;
+	unsigned is_internal:1;
+	unsigned is_pseudo_fixed_link:1;
+	unsigned has_fixups:1;
+	unsigned suspended:1;
+	unsigned sysfs_links:1;
+	unsigned loopback_enabled:1;
+
+	unsigned autoneg:1;
+	/* The most recently read link state */
+	unsigned link:1;
 
 	enum phy_state state;
 
@@ -429,9 +433,6 @@ struct phy_device {
 	int pause;
 	int asym_pause;
 
-	/* The most recently read link state */
-	int link;
-
 	/* Enabled Interrupts */
 	u32 interrupts;
 
@@ -444,8 +445,6 @@ struct phy_device {
 	/* Energy efficient ethernet modes which should be prohibited */
 	u32 eee_broken_modes;
 
-	int autoneg;
-
 	int link_timeout;
 
 #ifdef CONFIG_LED_TRIGGER_PHY
-- 
cgit v1.2.3-59-g8ed1b


From e549f6f9c098067a99e9de8ac84f5cc2c07ae5c6 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@mellanox.com>
Date: Thu, 22 Feb 2018 11:57:10 -0600
Subject: net/dcb: Add dcbnl buffer attribute

In this patch, we add dcbnl buffer attribute to allow user
change the NIC's buffer configuration such as priority
to buffer mapping and buffer size of individual buffer.

This attribute combined with pfc attribute allows advanced user to
fine tune the qos setting for specific priority queue. For example,
user can give dedicated buffer for one or more priorities or user
can give large buffer to certain priorities.

The dcb buffer configuration will be controlled by lldptool.
lldptool -T -i eth2 -V BUFFER prio 0,2,5,7,1,2,3,6
  maps priorities 0,1,2,3,4,5,6,7 to receive buffer 0,2,5,7,1,2,3,6
lldptool -T -i eth2 -V BUFFER size 87296,87296,0,87296,0,0,0,0
  sets receive buffer size for buffer 0,1,2,3,4,5,6,7 respectively

After discussion on mailing list with Jakub, Jiri, Ido and John, we agreed to
choose dcbnl over devlink interface since this feature is intended to set
port attributes which are governed by the netdev instance of that port, where
devlink API is more suitable for global ASIC configurations.

We present an use case scenario where dcbnl buffer attribute configured
by advance user helps reduce the latency of messages of different sizes.

Scenarios description:
On ConnectX-5, we run latency sensitive traffic with
small/medium message sizes ranging from 64B to 256KB and bandwidth sensitive
traffic with large messages sizes 512KB and 1MB. We group small, medium,
and large message sizes to their own pfc enables priorities as follow.
  Priorities 1 & 2 (64B, 256B and 1KB)
  Priorities 3 & 4 (4KB, 8KB, 16KB, 64KB, 128KB and 256KB)
  Priorities 5 & 6 (512KB and 1MB)

By default, ConnectX-5 maps all pfc enabled priorities to a single
lossless fixed buffer size of 50% of total available buffer space. The
other 50% is assigned to lossy buffer. Using dcbnl buffer attribute,
we create three equal size lossless buffers. Each buffer has 25% of total
available buffer space. Thus, the lossy buffer size reduces to 25%. Priority
to lossless  buffer mappings are set as follow.
  Priorities 1 & 2 on lossless buffer #1
  Priorities 3 & 4 on lossless buffer #2
  Priorities 5 & 6 on lossless buffer #3

We observe improvements in latency for small and medium message sizes
as follows. Please note that the large message sizes bandwidth performance is
reduced but the total bandwidth remains the same.
  256B message size (42 % latency reduction)
  4K message size (21% latency reduction)
  64K message size (16% latency reduction)

CC: Ido Schimmel <idosch@idosch.org>
CC: Jakub Kicinski <jakub.kicinski@netronome.com>
CC: Jiri Pirko <jiri@resnulli.us>
CC: Or Gerlitz <gerlitz.or@gmail.com>
CC: Parav Pandit <parav@mellanox.com>
CC: Aron Silverton <aron.silverton@oracle.com>
Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/net/dcbnl.h        |  4 ++++
 include/uapi/linux/dcbnl.h | 11 +++++++++++
 net/dcb/dcbnl.c            | 20 ++++++++++++++++++++
 3 files changed, 35 insertions(+)

diff --git a/include/net/dcbnl.h b/include/net/dcbnl.h
index 207d9ba1f92c..0e5e91be2d30 100644
--- a/include/net/dcbnl.h
+++ b/include/net/dcbnl.h
@@ -101,6 +101,10 @@ struct dcbnl_rtnl_ops {
 	/* CEE peer */
 	int (*cee_peer_getpg) (struct net_device *, struct cee_pg *);
 	int (*cee_peer_getpfc) (struct net_device *, struct cee_pfc *);
+
+	/* buffer settings */
+	int (*dcbnl_getbuffer)(struct net_device *, struct dcbnl_buffer *);
+	int (*dcbnl_setbuffer)(struct net_device *, struct dcbnl_buffer *);
 };
 
 #endif /* __NET_DCBNL_H__ */
diff --git a/include/uapi/linux/dcbnl.h b/include/uapi/linux/dcbnl.h
index 2c0c6453c3f4..60aa2e446698 100644
--- a/include/uapi/linux/dcbnl.h
+++ b/include/uapi/linux/dcbnl.h
@@ -163,6 +163,16 @@ struct ieee_pfc {
 	__u64	indications[IEEE_8021QAZ_MAX_TCS];
 };
 
+#define IEEE_8021Q_MAX_PRIORITIES 8
+#define DCBX_MAX_BUFFERS  8
+struct dcbnl_buffer {
+	/* priority to buffer mapping */
+	__u8    prio2buffer[IEEE_8021Q_MAX_PRIORITIES];
+	/* buffer size in Bytes */
+	__u32   buffer_size[DCBX_MAX_BUFFERS];
+	__u32   total_size;
+};
+
 /* CEE DCBX std supported values */
 #define CEE_DCBX_MAX_PGS	8
 #define CEE_DCBX_MAX_PRIO	8
@@ -406,6 +416,7 @@ enum ieee_attrs {
 	DCB_ATTR_IEEE_MAXRATE,
 	DCB_ATTR_IEEE_QCN,
 	DCB_ATTR_IEEE_QCN_STATS,
+	DCB_ATTR_DCB_BUFFER,
 	__DCB_ATTR_IEEE_MAX
 };
 #define DCB_ATTR_IEEE_MAX (__DCB_ATTR_IEEE_MAX - 1)
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index bae7d78aa068..d2f4e0c1faaf 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -176,6 +176,7 @@ static const struct nla_policy dcbnl_ieee_policy[DCB_ATTR_IEEE_MAX + 1] = {
 	[DCB_ATTR_IEEE_MAXRATE]   = {.len = sizeof(struct ieee_maxrate)},
 	[DCB_ATTR_IEEE_QCN]         = {.len = sizeof(struct ieee_qcn)},
 	[DCB_ATTR_IEEE_QCN_STATS]   = {.len = sizeof(struct ieee_qcn_stats)},
+	[DCB_ATTR_DCB_BUFFER]       = {.len = sizeof(struct dcbnl_buffer)},
 };
 
 /* DCB number of traffic classes nested attributes. */
@@ -1094,6 +1095,16 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev)
 			return -EMSGSIZE;
 	}
 
+	if (ops->dcbnl_getbuffer) {
+		struct dcbnl_buffer buffer;
+
+		memset(&buffer, 0, sizeof(buffer));
+		err = ops->dcbnl_getbuffer(netdev, &buffer);
+		if (!err &&
+		    nla_put(skb, DCB_ATTR_DCB_BUFFER, sizeof(buffer), &buffer))
+			return -EMSGSIZE;
+	}
+
 	app = nla_nest_start(skb, DCB_ATTR_IEEE_APP_TABLE);
 	if (!app)
 		return -EMSGSIZE;
@@ -1453,6 +1464,15 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh,
 			goto err;
 	}
 
+	if (ieee[DCB_ATTR_DCB_BUFFER] && ops->dcbnl_setbuffer) {
+		struct dcbnl_buffer *buffer =
+			nla_data(ieee[DCB_ATTR_DCB_BUFFER]);
+
+		err = ops->dcbnl_setbuffer(netdev, buffer);
+		if (err)
+			goto err;
+	}
+
 	if (ieee[DCB_ATTR_IEEE_APP_TABLE]) {
 		struct nlattr *attr;
 		int rem;
-- 
cgit v1.2.3-59-g8ed1b


From 2c81bfd5ae5659df44b38ec71c404b4b261a9515 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@mellanox.com>
Date: Thu, 22 Feb 2018 13:22:56 -0600
Subject: net/mlx5e: Move port speed code from en_ethtool.c to en/port.c

Move four below functions from en_ethtool.c to en/port.c. These
functions are used by both en_ethtool.c and en_main.c. Future code
can use these functions without ethtool link mode dependency.
  u32 mlx5e_port_ptys2speed(u32 eth_proto_oper);
  int mlx5e_port_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
  int mlx5e_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
  u32 mlx5e_port_speed2linkmodes(u32 speed);

Delete the speed field from table mlx5e_build_ptys2ethtool_map. This
table only keeps the mapping between the mlx5e link mode and
ethtool link mode. Add new table mlx5e_link_speed for translation
from mlx5e link mode to actual speed.

Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |   2 -
 .../net/ethernet/mellanox/mlx5/core/en/Makefile    |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/en/port.c  | 129 +++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/en/port.h  |  43 +++++++
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 102 ++++++----------
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   3 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    |   3 +-
 8 files changed, 213 insertions(+), 72 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/Makefile
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/port.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/port.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index a7135f5d5cf6..651cf3640420 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -15,7 +15,7 @@ mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o fpga/conn.o fpga/sdk.o \
 
 mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \
 		en_tx.o en_rx.o en_dim.o en_txrx.o en_stats.o vxlan.o \
-		en_arfs.o en_fs_ethtool.o en_selftest.o
+		en_arfs.o en_fs_ethtool.o en_selftest.o en/port.o
 
 mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index bc91a7335c93..d13a86a1d702 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -932,8 +932,6 @@ void mlx5e_deactivate_priv_channels(struct mlx5e_priv *priv);
 
 void mlx5e_build_default_indir_rqt(u32 *indirection_rqt, int len,
 				   int num_channels);
-int mlx5e_get_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
-
 void mlx5e_set_tx_cq_mode_params(struct mlx5e_params *params,
 				 u8 cq_period_mode);
 void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/en/Makefile
new file mode 100644
index 000000000000..d8e17110f25d
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/Makefile
@@ -0,0 +1 @@
+subdir-ccflags-y += -I$(src)/..
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port.c
new file mode 100644
index 000000000000..9f04542f3661
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2018, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "port.h"
+
+/* speed in units of 1Mb */
+static const u32 mlx5e_link_speed[MLX5E_LINK_MODES_NUMBER] = {
+	[MLX5E_1000BASE_CX_SGMII] = 1000,
+	[MLX5E_1000BASE_KX]       = 1000,
+	[MLX5E_10GBASE_CX4]       = 10000,
+	[MLX5E_10GBASE_KX4]       = 10000,
+	[MLX5E_10GBASE_KR]        = 10000,
+	[MLX5E_20GBASE_KR2]       = 20000,
+	[MLX5E_40GBASE_CR4]       = 40000,
+	[MLX5E_40GBASE_KR4]       = 40000,
+	[MLX5E_56GBASE_R4]        = 56000,
+	[MLX5E_10GBASE_CR]        = 10000,
+	[MLX5E_10GBASE_SR]        = 10000,
+	[MLX5E_10GBASE_ER]        = 10000,
+	[MLX5E_40GBASE_SR4]       = 40000,
+	[MLX5E_40GBASE_LR4]       = 40000,
+	[MLX5E_50GBASE_SR2]       = 50000,
+	[MLX5E_100GBASE_CR4]      = 100000,
+	[MLX5E_100GBASE_SR4]      = 100000,
+	[MLX5E_100GBASE_KR4]      = 100000,
+	[MLX5E_100GBASE_LR4]      = 100000,
+	[MLX5E_100BASE_TX]        = 100,
+	[MLX5E_1000BASE_T]        = 1000,
+	[MLX5E_10GBASE_T]         = 10000,
+	[MLX5E_25GBASE_CR]        = 25000,
+	[MLX5E_25GBASE_KR]        = 25000,
+	[MLX5E_25GBASE_SR]        = 25000,
+	[MLX5E_50GBASE_CR2]       = 50000,
+	[MLX5E_50GBASE_KR2]       = 50000,
+};
+
+u32 mlx5e_port_ptys2speed(u32 eth_proto_oper)
+{
+	unsigned long temp = eth_proto_oper;
+	u32 speed = 0;
+	int i;
+
+	i = find_first_bit(&temp, MLX5E_LINK_MODES_NUMBER);
+	if (i < MLX5E_LINK_MODES_NUMBER)
+		speed = mlx5e_link_speed[i];
+
+	return speed;
+}
+
+int mlx5e_port_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
+{
+	u32 out[MLX5_ST_SZ_DW(ptys_reg)] = {};
+	u32 eth_proto_oper;
+	int err;
+
+	err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 1);
+	if (err)
+		return err;
+
+	eth_proto_oper = MLX5_GET(ptys_reg, out, eth_proto_oper);
+	*speed = mlx5e_port_ptys2speed(eth_proto_oper);
+	if (!(*speed)) {
+		mlx5_core_warn(mdev, "cannot get port speed\n");
+		err = -EINVAL;
+	}
+
+	return err;
+}
+
+int mlx5e_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
+{
+	u32 max_speed = 0;
+	u32 proto_cap;
+	int err;
+	int i;
+
+	err = mlx5_query_port_proto_cap(mdev, &proto_cap, MLX5_PTYS_EN);
+	if (err)
+		return err;
+
+	for (i = 0; i < MLX5E_LINK_MODES_NUMBER; ++i)
+		if (proto_cap & MLX5E_PROT_MASK(i))
+			max_speed = max(max_speed, mlx5e_link_speed[i]);
+
+	*speed = max_speed;
+	return 0;
+}
+
+u32 mlx5e_port_speed2linkmodes(u32 speed)
+{
+	u32 link_modes = 0;
+	int i;
+
+	for (i = 0; i < MLX5E_LINK_MODES_NUMBER; ++i) {
+		if (mlx5e_link_speed[i] == speed)
+			link_modes |= MLX5E_PROT_MASK(i);
+	}
+
+	return link_modes;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port.h b/drivers/net/ethernet/mellanox/mlx5/core/en/port.h
new file mode 100644
index 000000000000..7aae38e98a65
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __MLX5E_EN_PORT_H
+#define __MLX5E_EN_PORT_H
+
+#include <linux/mlx5/driver.h>
+#include "en.h"
+
+u32 mlx5e_port_ptys2speed(u32 eth_proto_oper);
+int mlx5e_port_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
+int mlx5e_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
+u32 mlx5e_port_speed2linkmodes(u32 speed);
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 2b786c4d3dab..42bd256e680d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -31,6 +31,7 @@
  */
 
 #include "en.h"
+#include "en/port.h"
 
 void mlx5e_ethtool_get_drvinfo(struct mlx5e_priv *priv,
 			       struct ethtool_drvinfo *drvinfo)
@@ -59,18 +60,16 @@ static void mlx5e_get_drvinfo(struct net_device *dev,
 struct ptys2ethtool_config {
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(supported);
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(advertised);
-	u32 speed;
 };
 
 static struct ptys2ethtool_config ptys2ethtool_table[MLX5E_LINK_MODES_NUMBER];
 
-#define MLX5_BUILD_PTYS2ETHTOOL_CONFIG(reg_, speed_, ...)               \
+#define MLX5_BUILD_PTYS2ETHTOOL_CONFIG(reg_, ...)                       \
 	({                                                              \
 		struct ptys2ethtool_config *cfg;                        \
 		const unsigned int modes[] = { __VA_ARGS__ };           \
 		unsigned int i;                                         \
 		cfg = &ptys2ethtool_table[reg_];                        \
-		cfg->speed = speed_;                                    \
 		bitmap_zero(cfg->supported,                             \
 			    __ETHTOOL_LINK_MODE_MASK_NBITS);            \
 		bitmap_zero(cfg->advertised,                            \
@@ -83,55 +82,55 @@ static struct ptys2ethtool_config ptys2ethtool_table[MLX5E_LINK_MODES_NUMBER];
 
 void mlx5e_build_ptys2ethtool_map(void)
 {
-	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_1000BASE_CX_SGMII, SPEED_1000,
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_1000BASE_CX_SGMII,
 				       ETHTOOL_LINK_MODE_1000baseKX_Full_BIT);
-	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_1000BASE_KX, SPEED_1000,
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_1000BASE_KX,
 				       ETHTOOL_LINK_MODE_1000baseKX_Full_BIT);
-	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_CX4, SPEED_10000,
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_CX4,
 				       ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT);
-	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_KX4, SPEED_10000,
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_KX4,
 				       ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT);
-	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_KR, SPEED_10000,
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_KR,
 				       ETHTOOL_LINK_MODE_10000baseKR_Full_BIT);
-	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_20GBASE_KR2, SPEED_20000,
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_20GBASE_KR2,
 				       ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT);
-	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_40GBASE_CR4, SPEED_40000,
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_40GBASE_CR4,
 				       ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT);
-	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_40GBASE_KR4, SPEED_40000,
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_40GBASE_KR4,
 				       ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT);
-	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_56GBASE_R4, SPEED_56000,
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_56GBASE_R4,
 				       ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT);
-	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_CR, SPEED_10000,
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_CR,
 				       ETHTOOL_LINK_MODE_10000baseKR_Full_BIT);
-	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_SR, SPEED_10000,
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_SR,
 				       ETHTOOL_LINK_MODE_10000baseKR_Full_BIT);
-	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_ER, SPEED_10000,
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_ER,
 				       ETHTOOL_LINK_MODE_10000baseKR_Full_BIT);
-	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_40GBASE_SR4, SPEED_40000,
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_40GBASE_SR4,
 				       ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT);
-	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_40GBASE_LR4, SPEED_40000,
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_40GBASE_LR4,
 				       ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT);
-	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_50GBASE_SR2, SPEED_50000,
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_50GBASE_SR2,
 				       ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT);
-	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_100GBASE_CR4, SPEED_100000,
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_100GBASE_CR4,
 				       ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT);
-	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_100GBASE_SR4, SPEED_100000,
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_100GBASE_SR4,
 				       ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT);
-	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_100GBASE_KR4, SPEED_100000,
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_100GBASE_KR4,
 				       ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT);
-	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_100GBASE_LR4, SPEED_100000,
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_100GBASE_LR4,
 				       ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT);
-	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_T, SPEED_10000,
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_T,
 				       ETHTOOL_LINK_MODE_10000baseT_Full_BIT);
-	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_25GBASE_CR, SPEED_25000,
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_25GBASE_CR,
 				       ETHTOOL_LINK_MODE_25000baseCR_Full_BIT);
-	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_25GBASE_KR, SPEED_25000,
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_25GBASE_KR,
 				       ETHTOOL_LINK_MODE_25000baseKR_Full_BIT);
-	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_25GBASE_SR, SPEED_25000,
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_25GBASE_SR,
 				       ETHTOOL_LINK_MODE_25000baseSR_Full_BIT);
-	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_50GBASE_CR2, SPEED_50000,
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_50GBASE_CR2,
 				       ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT);
-	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_50GBASE_KR2, SPEED_50000,
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_50GBASE_KR2,
 				       ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT);
 }
 
@@ -617,43 +616,24 @@ static void ptys2ethtool_supported_advertised_port(struct ethtool_link_ksettings
 	}
 }
 
-int mlx5e_get_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
-{
-	u32 max_speed = 0;
-	u32 proto_cap;
-	int err;
-	int i;
-
-	err = mlx5_query_port_proto_cap(mdev, &proto_cap, MLX5_PTYS_EN);
-	if (err)
-		return err;
-
-	for (i = 0; i < MLX5E_LINK_MODES_NUMBER; ++i)
-		if (proto_cap & MLX5E_PROT_MASK(i))
-			max_speed = max(max_speed, ptys2ethtool_table[i].speed);
-
-	*speed = max_speed;
-	return 0;
-}
-
 static void get_speed_duplex(struct net_device *netdev,
 			     u32 eth_proto_oper,
 			     struct ethtool_link_ksettings *link_ksettings)
 {
-	int i;
 	u32 speed = SPEED_UNKNOWN;
 	u8 duplex = DUPLEX_UNKNOWN;
 
 	if (!netif_carrier_ok(netdev))
 		goto out;
 
-	for (i = 0; i < MLX5E_LINK_MODES_NUMBER; ++i) {
-		if (eth_proto_oper & MLX5E_PROT_MASK(i)) {
-			speed = ptys2ethtool_table[i].speed;
-			duplex = DUPLEX_FULL;
-			break;
-		}
+	speed = mlx5e_port_ptys2speed(eth_proto_oper);
+	if (!speed) {
+		speed = SPEED_UNKNOWN;
+		goto out;
 	}
+
+	duplex = DUPLEX_FULL;
+
 out:
 	link_ksettings->base.speed = speed;
 	link_ksettings->base.duplex = duplex;
@@ -811,18 +791,6 @@ static u32 mlx5e_ethtool2ptys_adver_link(const unsigned long *link_modes)
 	return ptys_modes;
 }
 
-static u32 mlx5e_ethtool2ptys_speed_link(u32 speed)
-{
-	u32 i, speed_links = 0;
-
-	for (i = 0; i < MLX5E_LINK_MODES_NUMBER; ++i) {
-		if (ptys2ethtool_table[i].speed == speed)
-			speed_links |= MLX5E_PROT_MASK(i);
-	}
-
-	return speed_links;
-}
-
 static int mlx5e_set_link_ksettings(struct net_device *netdev,
 				    const struct ethtool_link_ksettings *link_ksettings)
 {
@@ -842,7 +810,7 @@ static int mlx5e_set_link_ksettings(struct net_device *netdev,
 
 	link_modes = link_ksettings->base.autoneg == AUTONEG_ENABLE ?
 		mlx5e_ethtool2ptys_adver_link(link_ksettings->link_modes.advertising) :
-		mlx5e_ethtool2ptys_speed_link(speed);
+		mlx5e_port_speed2linkmodes(speed);
 
 	err = mlx5_query_port_proto_cap(mdev, &eth_proto_cap, MLX5_PTYS_EN);
 	if (err) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index b5a7580b12fe..cee44c21766c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -46,6 +46,7 @@
 #include "accel/ipsec.h"
 #include "accel/tls.h"
 #include "vxlan.h"
+#include "en/port.h"
 
 struct mlx5e_rq_param {
 	u32			rqc[MLX5_ST_SZ_DW(rqc)];
@@ -4082,7 +4083,7 @@ static bool slow_pci_heuristic(struct mlx5_core_dev *mdev)
 	u32 link_speed = 0;
 	u32 pci_bw = 0;
 
-	mlx5e_get_max_linkspeed(mdev, &link_speed);
+	mlx5e_port_max_linkspeed(mdev, &link_speed);
 	pci_bw = pcie_bandwidth_available(mdev->pdev, NULL, NULL, NULL);
 	mlx5_core_dbg_once(mdev, "Max link speed = %d, PCI BW = %d\n",
 			   link_speed, pci_bw);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 674f1d7d2737..a9c96fe8e4fe 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -52,6 +52,7 @@
 #include "eswitch.h"
 #include "vxlan.h"
 #include "fs_core.h"
+#include "en/port.h"
 
 struct mlx5_nic_flow_attr {
 	u32 action;
@@ -613,7 +614,7 @@ static int mlx5e_hairpin_flow_add(struct mlx5e_priv *priv,
 
 	params.q_counter = priv->q_counter;
 	/* set hairpin pair per each 50Gbs share of the link */
-	mlx5e_get_max_linkspeed(priv->mdev, &link_speed);
+	mlx5e_port_max_linkspeed(priv->mdev, &link_speed);
 	link_speed = max_t(u32, link_speed, 50000);
 	link_speed64 = link_speed;
 	do_div(link_speed64, 50000);
-- 
cgit v1.2.3-59-g8ed1b


From df5f1361cc080877013f7838b61d31ad31307c2b Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@mellanox.com>
Date: Wed, 28 Feb 2018 14:16:47 -0600
Subject: net/mlx5: Add pbmc and pptb in the port_access_reg_cap_mask

Add pbmc and pptb in the port_access_reg_cap_mask. These two
bits determine if device supports receive buffer configuration.

Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/device.h   |  3 +++
 include/linux/mlx5/mlx5_ifc.h | 12 ++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 2bc27f8c5b87..db0332a6d23c 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1152,6 +1152,9 @@ enum mlx5_qcam_feature_groups {
 #define MLX5_CAP_PCAM_FEATURE(mdev, fld) \
 	MLX5_GET(pcam_reg, (mdev)->caps.pcam, feature_cap_mask.enhanced_features.fld)
 
+#define MLX5_CAP_PCAM_REG(mdev, reg) \
+	MLX5_GET(pcam_reg, (mdev)->caps.pcam, port_access_reg_cap_mask.regs_5000_to_507f.reg)
+
 #define MLX5_CAP_MCAM_REG(mdev, reg) \
 	MLX5_GET(mcam_reg, (mdev)->caps.mcam, mng_access_reg_cap_mask.access_regs.reg)
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index b4ea8a9914c4..f687989d336b 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -8003,6 +8003,17 @@ struct mlx5_ifc_pcam_enhanced_features_bits {
 	u8         ppcnt_statistical_group[0x1];
 };
 
+struct mlx5_ifc_pcam_regs_5000_to_507f_bits {
+	u8         port_access_reg_cap_mask_127_to_96[0x20];
+	u8         port_access_reg_cap_mask_95_to_64[0x20];
+	u8         port_access_reg_cap_mask_63_to_32[0x20];
+
+	u8         port_access_reg_cap_mask_31_to_13[0x13];
+	u8         pbmc[0x1];
+	u8         pptb[0x1];
+	u8         port_access_reg_cap_mask_10_to_0[0xb];
+};
+
 struct mlx5_ifc_pcam_reg_bits {
 	u8         reserved_at_0[0x8];
 	u8         feature_group[0x8];
@@ -8012,6 +8023,7 @@ struct mlx5_ifc_pcam_reg_bits {
 	u8         reserved_at_20[0x20];
 
 	union {
+		struct mlx5_ifc_pcam_regs_5000_to_507f_bits regs_5000_to_507f;
 		u8         reserved_at_0[0x80];
 	} port_access_reg_cap_mask;
 
-- 
cgit v1.2.3-59-g8ed1b


From 50b4a3c23646254c7345f3663ff1e0a6cbcd9abb Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@mellanox.com>
Date: Fri, 2 Mar 2018 15:47:01 -0600
Subject: net/mlx5: PPTB and PBMC register firmware command support

Add firmware command interface to read and write PPTB and PBMC
registers.

PPTB register enables mappings priority to a specific receive buffer.

PBMC registers enables changing the receive buffer's configuration such
as buffer size, xon/xoff thresholds, buffer's lossy property and
buffer's shared property.

Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/port.c | 108 ++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/en/port.h |   5 +
 include/linux/mlx5/driver.h                       |   2 +
 include/linux/mlx5/mlx5_ifc.h                     |  35 +++++++
 4 files changed, 150 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port.c
index 9f04542f3661..24e3b564964f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port.c
@@ -127,3 +127,111 @@ u32 mlx5e_port_speed2linkmodes(u32 speed)
 
 	return link_modes;
 }
+
+int mlx5e_port_query_pbmc(struct mlx5_core_dev *mdev, void *out)
+{
+	int sz = MLX5_ST_SZ_BYTES(pbmc_reg);
+	void *in;
+	int err;
+
+	in = kzalloc(sz, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(pbmc_reg, in, local_port, 1);
+	err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PBMC, 0, 0);
+
+	kfree(in);
+	return err;
+}
+
+int mlx5e_port_set_pbmc(struct mlx5_core_dev *mdev, void *in)
+{
+	int sz = MLX5_ST_SZ_BYTES(pbmc_reg);
+	void *out;
+	int err;
+
+	out = kzalloc(sz, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	MLX5_SET(pbmc_reg, in, local_port, 1);
+	err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PBMC, 0, 1);
+
+	kfree(out);
+	return err;
+}
+
+/* buffer[i]: buffer that priority i mapped to */
+int mlx5e_port_query_priority2buffer(struct mlx5_core_dev *mdev, u8 *buffer)
+{
+	int sz = MLX5_ST_SZ_BYTES(pptb_reg);
+	u32 prio_x_buff;
+	void *out;
+	void *in;
+	int prio;
+	int err;
+
+	in = kzalloc(sz, GFP_KERNEL);
+	out = kzalloc(sz, GFP_KERNEL);
+	if (!in || !out) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	MLX5_SET(pptb_reg, in, local_port, 1);
+	err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPTB, 0, 0);
+	if (err)
+		goto out;
+
+	prio_x_buff = MLX5_GET(pptb_reg, out, prio_x_buff);
+	for (prio = 0; prio < 8; prio++) {
+		buffer[prio] = (u8)(prio_x_buff >> (4 * prio)) & 0xF;
+		mlx5_core_dbg(mdev, "prio %d, buffer %d\n", prio, buffer[prio]);
+	}
+out:
+	kfree(in);
+	kfree(out);
+	return err;
+}
+
+int mlx5e_port_set_priority2buffer(struct mlx5_core_dev *mdev, u8 *buffer)
+{
+	int sz = MLX5_ST_SZ_BYTES(pptb_reg);
+	u32 prio_x_buff;
+	void *out;
+	void *in;
+	int prio;
+	int err;
+
+	in = kzalloc(sz, GFP_KERNEL);
+	out = kzalloc(sz, GFP_KERNEL);
+	if (!in || !out) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	/* First query the pptb register */
+	MLX5_SET(pptb_reg, in, local_port, 1);
+	err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPTB, 0, 0);
+	if (err)
+		goto out;
+
+	memcpy(in, out, sz);
+	MLX5_SET(pptb_reg, in, local_port, 1);
+
+	/* Update the pm and prio_x_buff */
+	MLX5_SET(pptb_reg, in, pm, 0xFF);
+
+	prio_x_buff = 0;
+	for (prio = 0; prio < 8; prio++)
+		prio_x_buff |= (buffer[prio] << (4 * prio));
+	MLX5_SET(pptb_reg, in, prio_x_buff, prio_x_buff);
+
+	err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPTB, 0, 1);
+
+out:
+	kfree(in);
+	kfree(out);
+	return err;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port.h b/drivers/net/ethernet/mellanox/mlx5/core/en/port.h
index 7aae38e98a65..f8cbd8194179 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/port.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port.h
@@ -40,4 +40,9 @@ u32 mlx5e_port_ptys2speed(u32 eth_proto_oper);
 int mlx5e_port_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
 int mlx5e_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
 u32 mlx5e_port_speed2linkmodes(u32 speed);
+
+int mlx5e_port_query_pbmc(struct mlx5_core_dev *mdev, void *out);
+int mlx5e_port_set_pbmc(struct mlx5_core_dev *mdev, void *in);
+int mlx5e_port_query_priority2buffer(struct mlx5_core_dev *mdev, u8 *buffer);
+int mlx5e_port_set_priority2buffer(struct mlx5_core_dev *mdev, u8 *buffer);
 #endif
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index d703774982ca..92d292454351 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -124,6 +124,8 @@ enum {
 	MLX5_REG_PAOS		 = 0x5006,
 	MLX5_REG_PFCC            = 0x5007,
 	MLX5_REG_PPCNT		 = 0x5008,
+	MLX5_REG_PPTB            = 0x500b,
+	MLX5_REG_PBMC            = 0x500c,
 	MLX5_REG_PMAOS		 = 0x5012,
 	MLX5_REG_PUDE		 = 0x5009,
 	MLX5_REG_PMPE		 = 0x5010,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index f687989d336b..edbddeaacc88 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -8788,6 +8788,41 @@ struct mlx5_ifc_qpts_reg_bits {
 	u8         trust_state[0x3];
 };
 
+struct mlx5_ifc_pptb_reg_bits {
+	u8         reserved_at_0[0x2];
+	u8         mm[0x2];
+	u8         reserved_at_4[0x4];
+	u8         local_port[0x8];
+	u8         reserved_at_10[0x6];
+	u8         cm[0x1];
+	u8         um[0x1];
+	u8         pm[0x8];
+
+	u8         prio_x_buff[0x20];
+
+	u8         pm_msb[0x8];
+	u8         reserved_at_48[0x10];
+	u8         ctrl_buff[0x4];
+	u8         untagged_buff[0x4];
+};
+
+struct mlx5_ifc_pbmc_reg_bits {
+	u8         reserved_at_0[0x8];
+	u8         local_port[0x8];
+	u8         reserved_at_10[0x10];
+
+	u8         xoff_timer_value[0x10];
+	u8         xoff_refresh[0x10];
+
+	u8         reserved_at_40[0x9];
+	u8         fullness_threshold[0x7];
+	u8         port_buffer_size[0x10];
+
+	struct mlx5_ifc_bufferx_reg_bits buffer[10];
+
+	u8         reserved_at_2e0[0x40];
+};
+
 struct mlx5_ifc_qtct_reg_bits {
 	u8         reserved_at_0[0x8];
 	u8         port_number[0x8];
-- 
cgit v1.2.3-59-g8ed1b


From 0696d60853d58f1b9431bc689e4e4526845b790a Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@mellanox.com>
Date: Wed, 21 Mar 2018 21:10:22 -0500
Subject: net/mlx5e: Receive buffer configuration

Add APIs for buffer configuration based on the changes in
pfc configuration, cable len, buffer size configuration,
and priority to buffer mapping.

Note that the xoff fomula is as below
  xoff = ((301+2.16 * len [m]) * speed [Gbps] + 2.72 MTU [B]
  xoff_threshold = buffer_size - xoff
  xon_threshold = xoff_threshold - MTU

Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |   5 +
 .../ethernet/mellanox/mlx5/core/en/port_buffer.c   | 327 +++++++++++++++++++++
 .../ethernet/mellanox/mlx5/core/en/port_buffer.h   |  75 +++++
 4 files changed, 408 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 651cf3640420..9efbf193ad5a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -21,7 +21,7 @@ mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o
 
 mlx5_core-$(CONFIG_MLX5_ESWITCH) += eswitch.o eswitch_offloads.o en_rep.o en_tc.o
 
-mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) +=  en_dcbnl.o
+mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) +=  en_dcbnl.o en/port_buffer.o
 
 mlx5_core-$(CONFIG_MLX5_CORE_IPOIB) += ipoib/ipoib.o ipoib/ethtool.o ipoib/ipoib_vlan.o
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index d13a86a1d702..9ab7158a7ce7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -65,6 +65,7 @@ struct page_pool;
 #define MLX5E_HW2SW_MTU(params, hwmtu) ((hwmtu) - ((params)->hard_mtu))
 #define MLX5E_SW2HW_MTU(params, swmtu) ((swmtu) + ((params)->hard_mtu))
 
+#define MLX5E_MAX_PRIORITY      8
 #define MLX5E_MAX_DSCP          64
 #define MLX5E_MAX_NUM_TC	8
 
@@ -275,6 +276,10 @@ struct mlx5e_dcbx {
 	/* The only setting that cannot be read from FW */
 	u8                         tc_tsa[IEEE_8021QAZ_MAX_TCS];
 	u8                         cap;
+
+	/* Buffer configuration */
+	u32                        cable_len;
+	u32                        xoff;
 };
 
 struct mlx5e_dcbx_dp {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c
new file mode 100644
index 000000000000..c047da8752da
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2018, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "port_buffer.h"
+
+int mlx5e_port_query_buffer(struct mlx5e_priv *priv,
+			    struct mlx5e_port_buffer *port_buffer)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int sz = MLX5_ST_SZ_BYTES(pbmc_reg);
+	u32 total_used = 0;
+	void *buffer;
+	void *out;
+	int err;
+	int i;
+
+	out = kzalloc(sz, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	err = mlx5e_port_query_pbmc(mdev, out);
+	if (err)
+		goto out;
+
+	for (i = 0; i < MLX5E_MAX_BUFFER; i++) {
+		buffer = MLX5_ADDR_OF(pbmc_reg, out, buffer[i]);
+		port_buffer->buffer[i].lossy =
+			MLX5_GET(bufferx_reg, buffer, lossy);
+		port_buffer->buffer[i].epsb =
+			MLX5_GET(bufferx_reg, buffer, epsb);
+		port_buffer->buffer[i].size =
+			MLX5_GET(bufferx_reg, buffer, size) << MLX5E_BUFFER_CELL_SHIFT;
+		port_buffer->buffer[i].xon =
+			MLX5_GET(bufferx_reg, buffer, xon_threshold) << MLX5E_BUFFER_CELL_SHIFT;
+		port_buffer->buffer[i].xoff =
+			MLX5_GET(bufferx_reg, buffer, xoff_threshold) << MLX5E_BUFFER_CELL_SHIFT;
+		total_used += port_buffer->buffer[i].size;
+
+		mlx5e_dbg(HW, priv, "buffer %d: size=%d, xon=%d, xoff=%d, epsb=%d, lossy=%d\n", i,
+			  port_buffer->buffer[i].size,
+			  port_buffer->buffer[i].xon,
+			  port_buffer->buffer[i].xoff,
+			  port_buffer->buffer[i].epsb,
+			  port_buffer->buffer[i].lossy);
+	}
+
+	port_buffer->port_buffer_size =
+		MLX5_GET(pbmc_reg, out, port_buffer_size) << MLX5E_BUFFER_CELL_SHIFT;
+	port_buffer->spare_buffer_size =
+		port_buffer->port_buffer_size - total_used;
+
+	mlx5e_dbg(HW, priv, "total buffer size=%d, spare buffer size=%d\n",
+		  port_buffer->port_buffer_size,
+		  port_buffer->spare_buffer_size);
+out:
+	kfree(out);
+	return err;
+}
+
+static int port_set_buffer(struct mlx5e_priv *priv,
+			   struct mlx5e_port_buffer *port_buffer)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int sz = MLX5_ST_SZ_BYTES(pbmc_reg);
+	void *buffer;
+	void *in;
+	int err;
+	int i;
+
+	in = kzalloc(sz, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	err = mlx5e_port_query_pbmc(mdev, in);
+	if (err)
+		goto out;
+
+	for (i = 0; i < MLX5E_MAX_BUFFER; i++) {
+		buffer = MLX5_ADDR_OF(pbmc_reg, in, buffer[i]);
+
+		MLX5_SET(bufferx_reg, buffer, size,
+			 port_buffer->buffer[i].size >> MLX5E_BUFFER_CELL_SHIFT);
+		MLX5_SET(bufferx_reg, buffer, lossy,
+			 port_buffer->buffer[i].lossy);
+		MLX5_SET(bufferx_reg, buffer, xoff_threshold,
+			 port_buffer->buffer[i].xoff >> MLX5E_BUFFER_CELL_SHIFT);
+		MLX5_SET(bufferx_reg, buffer, xon_threshold,
+			 port_buffer->buffer[i].xon >> MLX5E_BUFFER_CELL_SHIFT);
+	}
+
+	err = mlx5e_port_set_pbmc(mdev, in);
+out:
+	kfree(in);
+	return err;
+}
+
+/* xoff = ((301+2.16 * len [m]) * speed [Gbps] + 2.72 MTU [B]) */
+static u32 calculate_xoff(struct mlx5e_priv *priv, unsigned int mtu)
+{
+	u32 speed;
+	u32 xoff;
+	int err;
+
+	err = mlx5e_port_linkspeed(priv->mdev, &speed);
+	if (err)
+		return 0;
+
+	xoff = (301 + 216 * priv->dcbx.cable_len / 100) * speed / 1000 + 272 * mtu / 100;
+
+	mlx5e_dbg(HW, priv, "%s: xoff=%d\n", __func__, xoff);
+	return xoff;
+}
+
+static int update_xoff_threshold(struct mlx5e_port_buffer *port_buffer,
+				 u32 xoff, unsigned int mtu)
+{
+	int i;
+
+	for (i = 0; i < MLX5E_MAX_BUFFER; i++) {
+		if (port_buffer->buffer[i].lossy) {
+			port_buffer->buffer[i].xoff = 0;
+			port_buffer->buffer[i].xon  = 0;
+			continue;
+		}
+
+		if (port_buffer->buffer[i].size <
+		    (xoff + mtu + (1 << MLX5E_BUFFER_CELL_SHIFT)))
+			return -ENOMEM;
+
+		port_buffer->buffer[i].xoff = port_buffer->buffer[i].size - xoff;
+		port_buffer->buffer[i].xon  = port_buffer->buffer[i].xoff - mtu;
+	}
+
+	return 0;
+}
+
+/**
+ * update_buffer_lossy()
+ *   mtu: device's MTU
+ *   pfc_en: <input> current pfc configuration
+ *   buffer: <input> current prio to buffer mapping
+ *   xoff:   <input> xoff value
+ *   port_buffer: <output> port receive buffer configuration
+ *   change: <output>
+ *
+ *   Update buffer configuration based on pfc configuraiton and priority
+ *   to buffer mapping.
+ *   Buffer's lossy bit is changed to:
+ *     lossless if there is at least one PFC enabled priority mapped to this buffer
+ *     lossy if all priorities mapped to this buffer are PFC disabled
+ *
+ *   Return:
+ *     Return 0 if no error.
+ *     Set change to true if buffer configuration is modified.
+ */
+static int update_buffer_lossy(unsigned int mtu,
+			       u8 pfc_en, u8 *buffer, u32 xoff,
+			       struct mlx5e_port_buffer *port_buffer,
+			       bool *change)
+{
+	bool changed = false;
+	u8 lossy_count;
+	u8 prio_count;
+	u8 lossy;
+	int prio;
+	int err;
+	int i;
+
+	for (i = 0; i < MLX5E_MAX_BUFFER; i++) {
+		prio_count = 0;
+		lossy_count = 0;
+
+		for (prio = 0; prio < MLX5E_MAX_PRIORITY; prio++) {
+			if (buffer[prio] != i)
+				continue;
+
+			prio_count++;
+			lossy_count += !(pfc_en & (1 << prio));
+		}
+
+		if (lossy_count == prio_count)
+			lossy = 1;
+		else /* lossy_count < prio_count */
+			lossy = 0;
+
+		if (lossy != port_buffer->buffer[i].lossy) {
+			port_buffer->buffer[i].lossy = lossy;
+			changed = true;
+		}
+	}
+
+	if (changed) {
+		err = update_xoff_threshold(port_buffer, xoff, mtu);
+		if (err)
+			return err;
+
+		*change = true;
+	}
+
+	return 0;
+}
+
+int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv,
+				    u32 change, unsigned int mtu,
+				    struct ieee_pfc *pfc,
+				    u32 *buffer_size,
+				    u8 *prio2buffer)
+{
+	struct mlx5e_port_buffer port_buffer;
+	u32 xoff = calculate_xoff(priv, mtu);
+	bool update_prio2buffer = false;
+	u8 buffer[MLX5E_MAX_PRIORITY];
+	bool update_buffer = false;
+	u32 total_used = 0;
+	u8 curr_pfc_en;
+	int err;
+	int i;
+
+	mlx5e_dbg(HW, priv, "%s: change=%x\n", __func__, change);
+
+	err = mlx5e_port_query_buffer(priv, &port_buffer);
+	if (err)
+		return err;
+
+	if (change & MLX5E_PORT_BUFFER_CABLE_LEN) {
+		update_buffer = true;
+		err = update_xoff_threshold(&port_buffer, xoff, mtu);
+		if (err)
+			return err;
+	}
+
+	if (change & MLX5E_PORT_BUFFER_PFC) {
+		err = mlx5e_port_query_priority2buffer(priv->mdev, buffer);
+		if (err)
+			return err;
+
+		err = update_buffer_lossy(mtu, pfc->pfc_en, buffer, xoff,
+					  &port_buffer, &update_buffer);
+		if (err)
+			return err;
+	}
+
+	if (change & MLX5E_PORT_BUFFER_PRIO2BUFFER) {
+		update_prio2buffer = true;
+		err = mlx5_query_port_pfc(priv->mdev, &curr_pfc_en, NULL);
+		if (err)
+			return err;
+
+		err = update_buffer_lossy(mtu, curr_pfc_en, prio2buffer, xoff,
+					  &port_buffer, &update_buffer);
+		if (err)
+			return err;
+	}
+
+	if (change & MLX5E_PORT_BUFFER_SIZE) {
+		for (i = 0; i < MLX5E_MAX_BUFFER; i++) {
+			mlx5e_dbg(HW, priv, "%s: buffer[%d]=%d\n", __func__, i, buffer_size[i]);
+			if (!port_buffer.buffer[i].lossy && !buffer_size[i]) {
+				mlx5e_dbg(HW, priv, "%s: lossless buffer[%d] size cannot be zero\n",
+					  __func__, i);
+				return -EINVAL;
+			}
+
+			port_buffer.buffer[i].size = buffer_size[i];
+			total_used += buffer_size[i];
+		}
+
+		mlx5e_dbg(HW, priv, "%s: total buffer requested=%d\n", __func__, total_used);
+
+		if (total_used > port_buffer.port_buffer_size)
+			return -EINVAL;
+
+		update_buffer = true;
+		err = update_xoff_threshold(&port_buffer, xoff, mtu);
+		if (err)
+			return err;
+	}
+
+	/* Need to update buffer configuration if xoff value is changed */
+	if (!update_buffer && xoff != priv->dcbx.xoff) {
+		update_buffer = true;
+		err = update_xoff_threshold(&port_buffer, xoff, mtu);
+		if (err)
+			return err;
+	}
+	priv->dcbx.xoff = xoff;
+
+	/* Apply the settings */
+	if (update_buffer) {
+		err = port_set_buffer(priv, &port_buffer);
+		if (err)
+			return err;
+	}
+
+	if (update_prio2buffer)
+		err = mlx5e_port_set_priority2buffer(priv->mdev, prio2buffer);
+
+	return err;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h
new file mode 100644
index 000000000000..34f55b81a0de
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2018, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __MLX5_EN_PORT_BUFFER_H__
+#define __MLX5_EN_PORT_BUFFER_H__
+
+#include "en.h"
+#include "port.h"
+
+#define MLX5E_MAX_BUFFER 8
+#define MLX5E_BUFFER_CELL_SHIFT 7
+#define MLX5E_DEFAULT_CABLE_LEN 7 /* 7 meters */
+
+#define MLX5_BUFFER_SUPPORTED(mdev) (MLX5_CAP_GEN(mdev, pcam_reg) && \
+				     MLX5_CAP_PCAM_REG(mdev, pbmc) && \
+				     MLX5_CAP_PCAM_REG(mdev, pptb))
+
+enum {
+	MLX5E_PORT_BUFFER_CABLE_LEN   = BIT(0),
+	MLX5E_PORT_BUFFER_PFC         = BIT(1),
+	MLX5E_PORT_BUFFER_PRIO2BUFFER = BIT(2),
+	MLX5E_PORT_BUFFER_SIZE        = BIT(3),
+};
+
+struct mlx5e_bufferx_reg {
+	u8   lossy;
+	u8   epsb;
+	u32  size;
+	u32  xoff;
+	u32  xon;
+};
+
+struct mlx5e_port_buffer {
+	u32                       port_buffer_size;
+	u32                       spare_buffer_size;
+	struct mlx5e_bufferx_reg  buffer[MLX5E_MAX_BUFFER];
+};
+
+int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv,
+				    u32 change, unsigned int mtu,
+				    struct ieee_pfc *pfc,
+				    u32 *buffer_size,
+				    u8 *prio2buffer);
+
+int mlx5e_port_query_buffer(struct mlx5e_priv *priv,
+			    struct mlx5e_port_buffer *port_buffer);
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From ecdf2dadee8e8c5015771b802a9851ff332d3fc4 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@mellanox.com>
Date: Wed, 21 Mar 2018 21:39:17 -0500
Subject: net/mlx5e: Receive buffer support for DCBX

Add dcbnl's set/get buffer configuration callback that allows user to
set/get buffer size configuration and priority to buffer mapping.

By default, firmware controls receive buffer configuration and priority
of buffer mapping based on the changes in pfc settings. When set buffer
call back is triggered, the buffer configuration changes to manual mode.

The manual mode means mlx5 driver will adjust the buffer configuration
accordingly based on the changes in pfc settings.

ConnectX buffer stride is 128 Bytes. If the buffer size is not multiple
of 128, the buffer size will be rounded down to the nearest multiple of
128.

Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c | 132 +++++++++++++++++++--
 2 files changed, 126 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 9ab7158a7ce7..c5c7a6d687ff 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -278,6 +278,7 @@ struct mlx5e_dcbx {
 	u8                         cap;
 
 	/* Buffer configuration */
+	bool                       manual_buffer;
 	u32                        cable_len;
 	u32                        xoff;
 };
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
index c641d5656b2d..0a52f31fef37 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
@@ -32,8 +32,8 @@
 #include <linux/device.h>
 #include <linux/netdevice.h>
 #include "en.h"
-
-#define MLX5E_MAX_PRIORITY 8
+#include "en/port.h"
+#include "en/port_buffer.h"
 
 #define MLX5E_100MB (100000)
 #define MLX5E_1GB   (1000000)
@@ -41,6 +41,9 @@
 #define MLX5E_CEE_STATE_UP    1
 #define MLX5E_CEE_STATE_DOWN  0
 
+/* Max supported cable length is 1000 meters */
+#define MLX5E_MAX_CABLE_LENGTH 1000
+
 enum {
 	MLX5E_VENDOR_TC_GROUP_NUM = 7,
 	MLX5E_LOWEST_PRIO_GROUP   = 0,
@@ -338,6 +341,9 @@ static int mlx5e_dcbnl_ieee_getpfc(struct net_device *dev,
 		pfc->indications[i] = PPORT_PER_PRIO_GET(pstats, i, rx_pause);
 	}
 
+	if (MLX5_BUFFER_SUPPORTED(mdev))
+		pfc->delay = priv->dcbx.cable_len;
+
 	return mlx5_query_port_pfc(mdev, &pfc->pfc_en, NULL);
 }
 
@@ -346,16 +352,39 @@ static int mlx5e_dcbnl_ieee_setpfc(struct net_device *dev,
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
 	struct mlx5_core_dev *mdev = priv->mdev;
+	u32 old_cable_len = priv->dcbx.cable_len;
+	struct ieee_pfc pfc_new;
+	u32 changed = 0;
 	u8 curr_pfc_en;
-	int ret;
+	int ret = 0;
 
+	/* pfc_en */
 	mlx5_query_port_pfc(mdev, &curr_pfc_en, NULL);
+	if (pfc->pfc_en != curr_pfc_en) {
+		ret = mlx5_set_port_pfc(mdev, pfc->pfc_en, pfc->pfc_en);
+		if (ret)
+			return ret;
+		mlx5_toggle_port_link(mdev);
+		changed |= MLX5E_PORT_BUFFER_PFC;
+	}
 
-	if (pfc->pfc_en == curr_pfc_en)
-		return 0;
+	if (pfc->delay &&
+	    pfc->delay < MLX5E_MAX_CABLE_LENGTH &&
+	    pfc->delay != priv->dcbx.cable_len) {
+		priv->dcbx.cable_len = pfc->delay;
+		changed |= MLX5E_PORT_BUFFER_CABLE_LEN;
+	}
 
-	ret = mlx5_set_port_pfc(mdev, pfc->pfc_en, pfc->pfc_en);
-	mlx5_toggle_port_link(mdev);
+	if (MLX5_BUFFER_SUPPORTED(mdev)) {
+		pfc_new.pfc_en = (changed & MLX5E_PORT_BUFFER_PFC) ? pfc->pfc_en : curr_pfc_en;
+		if (priv->dcbx.manual_buffer)
+			ret = mlx5e_port_manual_buffer_config(priv, changed,
+							      dev->mtu, &pfc_new,
+							      NULL, NULL);
+
+		if (ret && (changed & MLX5E_PORT_BUFFER_CABLE_LEN))
+			priv->dcbx.cable_len = old_cable_len;
+	}
 
 	if (!ret) {
 		mlx5e_dbg(HW, priv,
@@ -873,6 +902,90 @@ static void mlx5e_dcbnl_setpfcstate(struct net_device *netdev, u8 state)
 	cee_cfg->pfc_enable = state;
 }
 
+static int mlx5e_dcbnl_getbuffer(struct net_device *dev,
+				 struct dcbnl_buffer *dcb_buffer)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5e_port_buffer port_buffer;
+	u8 buffer[MLX5E_MAX_PRIORITY];
+	int i, err;
+
+	if (!MLX5_BUFFER_SUPPORTED(mdev))
+		return -EOPNOTSUPP;
+
+	err = mlx5e_port_query_priority2buffer(mdev, buffer);
+	if (err)
+		return err;
+
+	for (i = 0; i < MLX5E_MAX_PRIORITY; i++)
+		dcb_buffer->prio2buffer[i] = buffer[i];
+
+	err = mlx5e_port_query_buffer(priv, &port_buffer);
+	if (err)
+		return err;
+
+	for (i = 0; i < MLX5E_MAX_BUFFER; i++)
+		dcb_buffer->buffer_size[i] = port_buffer.buffer[i].size;
+	dcb_buffer->total_size = port_buffer.port_buffer_size;
+
+	return 0;
+}
+
+static int mlx5e_dcbnl_setbuffer(struct net_device *dev,
+				 struct dcbnl_buffer *dcb_buffer)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5e_port_buffer port_buffer;
+	u8 old_prio2buffer[MLX5E_MAX_PRIORITY];
+	u32 *buffer_size = NULL;
+	u8 *prio2buffer = NULL;
+	u32 changed = 0;
+	int i, err;
+
+	if (!MLX5_BUFFER_SUPPORTED(mdev))
+		return -EOPNOTSUPP;
+
+	for (i = 0; i < DCBX_MAX_BUFFERS; i++)
+		mlx5_core_dbg(mdev, "buffer[%d]=%d\n", i, dcb_buffer->buffer_size[i]);
+
+	for (i = 0; i < MLX5E_MAX_PRIORITY; i++)
+		mlx5_core_dbg(mdev, "priority %d buffer%d\n", i, dcb_buffer->prio2buffer[i]);
+
+	err = mlx5e_port_query_priority2buffer(mdev, old_prio2buffer);
+	if (err)
+		return err;
+
+	for (i = 0; i < MLX5E_MAX_PRIORITY; i++) {
+		if (dcb_buffer->prio2buffer[i] != old_prio2buffer[i]) {
+			changed |= MLX5E_PORT_BUFFER_PRIO2BUFFER;
+			prio2buffer = dcb_buffer->prio2buffer;
+			break;
+		}
+	}
+
+	err = mlx5e_port_query_buffer(priv, &port_buffer);
+	if (err)
+		return err;
+
+	for (i = 0; i < MLX5E_MAX_BUFFER; i++) {
+		if (port_buffer.buffer[i].size != dcb_buffer->buffer_size[i]) {
+			changed |= MLX5E_PORT_BUFFER_SIZE;
+			buffer_size = dcb_buffer->buffer_size;
+			break;
+		}
+	}
+
+	if (!changed)
+		return 0;
+
+	priv->dcbx.manual_buffer = true;
+	err = mlx5e_port_manual_buffer_config(priv, changed, dev->mtu, NULL,
+					      buffer_size, prio2buffer);
+	return err;
+}
+
 const struct dcbnl_rtnl_ops mlx5e_dcbnl_ops = {
 	.ieee_getets	= mlx5e_dcbnl_ieee_getets,
 	.ieee_setets	= mlx5e_dcbnl_ieee_setets,
@@ -884,6 +997,8 @@ const struct dcbnl_rtnl_ops mlx5e_dcbnl_ops = {
 	.ieee_delapp    = mlx5e_dcbnl_ieee_delapp,
 	.getdcbx	= mlx5e_dcbnl_getdcbx,
 	.setdcbx	= mlx5e_dcbnl_setdcbx,
+	.dcbnl_getbuffer = mlx5e_dcbnl_getbuffer,
+	.dcbnl_setbuffer = mlx5e_dcbnl_setbuffer,
 
 /* CEE interfaces */
 	.setall         = mlx5e_dcbnl_setall,
@@ -1091,5 +1206,8 @@ void mlx5e_dcbnl_initialize(struct mlx5e_priv *priv)
 	if (priv->dcbx.mode == MLX5E_DCBX_PARAM_VER_OPER_HOST)
 		priv->dcbx.cap |= DCB_CAP_DCBX_HOST;
 
+	priv->dcbx.manual_buffer = false;
+	priv->dcbx.cable_len = MLX5E_DEFAULT_CABLE_LEN;
+
 	mlx5e_ets_init(priv);
 }
-- 
cgit v1.2.3-59-g8ed1b


From f8d959a5b188dc81e57a6bac34a1b2986f61e2fd Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 24 May 2018 11:21:08 -0700
Subject: perf/core: add perf_get_event() to return perf_event given a struct
 file

A new extern function, perf_get_event(), is added to return a perf event
given a struct file. This function will be used in later patches.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/perf_event.h | 5 +++++
 kernel/events/core.c       | 8 ++++++++
 2 files changed, 13 insertions(+)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e71e99eb9a4e..eec302b61cfe 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -868,6 +868,7 @@ extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
 extern void perf_event_delayed_put(struct task_struct *task);
 extern struct file *perf_event_get(unsigned int fd);
+extern const struct perf_event *perf_get_event(struct file *file);
 extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
 extern void perf_event_print_debug(void);
 extern void perf_pmu_disable(struct pmu *pmu);
@@ -1289,6 +1290,10 @@ static inline void perf_event_exit_task(struct task_struct *child)	{ }
 static inline void perf_event_free_task(struct task_struct *task)	{ }
 static inline void perf_event_delayed_put(struct task_struct *task)	{ }
 static inline struct file *perf_event_get(unsigned int fd)	{ return ERR_PTR(-EINVAL); }
+static inline const struct perf_event *perf_get_event(struct file *file)
+{
+	return ERR_PTR(-EINVAL);
+}
 static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
 {
 	return ERR_PTR(-EINVAL);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 67612ce359ad..6eeab86d24ba 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11212,6 +11212,14 @@ struct file *perf_event_get(unsigned int fd)
 	return file;
 }
 
+const struct perf_event *perf_get_event(struct file *file)
+{
+	if (file->f_op != &perf_fops)
+		return ERR_PTR(-EINVAL);
+
+	return file->private_data;
+}
+
 const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
 {
 	if (!event)
-- 
cgit v1.2.3-59-g8ed1b


From 41bdc4b40ed6fb26c6acc655ed9a243a348709c9 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 24 May 2018 11:21:09 -0700
Subject: bpf: introduce bpf subcommand BPF_TASK_FD_QUERY

Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.

There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
for this approaches. First, bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.

This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
Given a pid and fd, if the <pid, fd> is associated with a
tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return
   . prog_id
   . tracepoint name, or
   . k[ret]probe funcname + offset or kernel addr, or
   . u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.

Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/trace_events.h |  17 ++++++
 include/uapi/linux/bpf.h     |  26 +++++++++
 kernel/bpf/syscall.c         | 131 +++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/bpf_trace.c     |  48 ++++++++++++++++
 kernel/trace/trace_kprobe.c  |  29 ++++++++++
 kernel/trace/trace_uprobe.c  |  22 ++++++++
 6 files changed, 273 insertions(+)

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 2bde3eff564c..d34144a3c5b5 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -473,6 +473,9 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info);
 int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
 int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
 struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name);
+int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
+			    u32 *fd_type, const char **buf,
+			    u64 *probe_offset, u64 *probe_addr);
 #else
 static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 {
@@ -504,6 +507,13 @@ static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name
 {
 	return NULL;
 }
+static inline int bpf_get_perf_event_info(const struct perf_event *event,
+					  u32 *prog_id, u32 *fd_type,
+					  const char **buf, u64 *probe_offset,
+					  u64 *probe_addr)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 enum {
@@ -560,10 +570,17 @@ extern void perf_trace_del(struct perf_event *event, int flags);
 #ifdef CONFIG_KPROBE_EVENTS
 extern int  perf_kprobe_init(struct perf_event *event, bool is_retprobe);
 extern void perf_kprobe_destroy(struct perf_event *event);
+extern int bpf_get_kprobe_info(const struct perf_event *event,
+			       u32 *fd_type, const char **symbol,
+			       u64 *probe_offset, u64 *probe_addr,
+			       bool perf_type_tracepoint);
 #endif
 #ifdef CONFIG_UPROBE_EVENTS
 extern int  perf_uprobe_init(struct perf_event *event, bool is_retprobe);
 extern void perf_uprobe_destroy(struct perf_event *event);
+extern int bpf_get_uprobe_info(const struct perf_event *event,
+			       u32 *fd_type, const char **filename,
+			       u64 *probe_offset, bool perf_type_tracepoint);
 #endif
 extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
 				     char *filter_str);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e95fec90c2c1..9b8c6e310e9a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -97,6 +97,7 @@ enum bpf_cmd {
 	BPF_RAW_TRACEPOINT_OPEN,
 	BPF_BTF_LOAD,
 	BPF_BTF_GET_FD_BY_ID,
+	BPF_TASK_FD_QUERY,
 };
 
 enum bpf_map_type {
@@ -380,6 +381,22 @@ union bpf_attr {
 		__u32		btf_log_size;
 		__u32		btf_log_level;
 	};
+
+	struct {
+		__u32		pid;		/* input: pid */
+		__u32		fd;		/* input: fd */
+		__u32		flags;		/* input: flags */
+		__u32		buf_len;	/* input/output: buf len */
+		__aligned_u64	buf;		/* input/output:
+						 *   tp_name for tracepoint
+						 *   symbol for kprobe
+						 *   filename for uprobe
+						 */
+		__u32		prog_id;	/* output: prod_id */
+		__u32		fd_type;	/* output: BPF_FD_TYPE_* */
+		__u64		probe_offset;	/* output: probe_offset */
+		__u64		probe_addr;	/* output: probe_addr */
+	} task_fd_query;
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
@@ -2557,4 +2574,13 @@ struct bpf_fib_lookup {
 	__u8	dmac[6];     /* ETH_ALEN */
 };
 
+enum bpf_task_fd_type {
+	BPF_FD_TYPE_RAW_TRACEPOINT,	/* tp name */
+	BPF_FD_TYPE_TRACEPOINT,		/* tp name */
+	BPF_FD_TYPE_KPROBE,		/* (symbol + offset) or addr */
+	BPF_FD_TYPE_KRETPROBE,		/* (symbol + offset) or addr */
+	BPF_FD_TYPE_UPROBE,		/* filename + offset */
+	BPF_FD_TYPE_URETPROBE,		/* filename + offset */
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 788456c18617..388d4feda348 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -18,7 +18,9 @@
 #include <linux/vmalloc.h>
 #include <linux/mmzone.h>
 #include <linux/anon_inodes.h>
+#include <linux/fdtable.h>
 #include <linux/file.h>
+#include <linux/fs.h>
 #include <linux/license.h>
 #include <linux/filter.h>
 #include <linux/version.h>
@@ -2178,6 +2180,132 @@ static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
 	return btf_get_fd_by_id(attr->btf_id);
 }
 
+static int bpf_task_fd_query_copy(const union bpf_attr *attr,
+				    union bpf_attr __user *uattr,
+				    u32 prog_id, u32 fd_type,
+				    const char *buf, u64 probe_offset,
+				    u64 probe_addr)
+{
+	char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
+	u32 len = buf ? strlen(buf) : 0, input_len;
+	int err = 0;
+
+	if (put_user(len, &uattr->task_fd_query.buf_len))
+		return -EFAULT;
+	input_len = attr->task_fd_query.buf_len;
+	if (input_len && ubuf) {
+		if (!len) {
+			/* nothing to copy, just make ubuf NULL terminated */
+			char zero = '\0';
+
+			if (put_user(zero, ubuf))
+				return -EFAULT;
+		} else if (input_len >= len + 1) {
+			/* ubuf can hold the string with NULL terminator */
+			if (copy_to_user(ubuf, buf, len + 1))
+				return -EFAULT;
+		} else {
+			/* ubuf cannot hold the string with NULL terminator,
+			 * do a partial copy with NULL terminator.
+			 */
+			char zero = '\0';
+
+			err = -ENOSPC;
+			if (copy_to_user(ubuf, buf, input_len - 1))
+				return -EFAULT;
+			if (put_user(zero, ubuf + input_len - 1))
+				return -EFAULT;
+		}
+	}
+
+	if (put_user(prog_id, &uattr->task_fd_query.prog_id) ||
+	    put_user(fd_type, &uattr->task_fd_query.fd_type) ||
+	    put_user(probe_offset, &uattr->task_fd_query.probe_offset) ||
+	    put_user(probe_addr, &uattr->task_fd_query.probe_addr))
+		return -EFAULT;
+
+	return err;
+}
+
+#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
+
+static int bpf_task_fd_query(const union bpf_attr *attr,
+			     union bpf_attr __user *uattr)
+{
+	pid_t pid = attr->task_fd_query.pid;
+	u32 fd = attr->task_fd_query.fd;
+	const struct perf_event *event;
+	struct files_struct *files;
+	struct task_struct *task;
+	struct file *file;
+	int err;
+
+	if (CHECK_ATTR(BPF_TASK_FD_QUERY))
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (attr->task_fd_query.flags != 0)
+		return -EINVAL;
+
+	task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
+	if (!task)
+		return -ENOENT;
+
+	files = get_files_struct(task);
+	put_task_struct(task);
+	if (!files)
+		return -ENOENT;
+
+	err = 0;
+	spin_lock(&files->file_lock);
+	file = fcheck_files(files, fd);
+	if (!file)
+		err = -EBADF;
+	else
+		get_file(file);
+	spin_unlock(&files->file_lock);
+	put_files_struct(files);
+
+	if (err)
+		goto out;
+
+	if (file->f_op == &bpf_raw_tp_fops) {
+		struct bpf_raw_tracepoint *raw_tp = file->private_data;
+		struct bpf_raw_event_map *btp = raw_tp->btp;
+
+		err = bpf_task_fd_query_copy(attr, uattr,
+					     raw_tp->prog->aux->id,
+					     BPF_FD_TYPE_RAW_TRACEPOINT,
+					     btp->tp->name, 0, 0);
+		goto put_file;
+	}
+
+	event = perf_get_event(file);
+	if (!IS_ERR(event)) {
+		u64 probe_offset, probe_addr;
+		u32 prog_id, fd_type;
+		const char *buf;
+
+		err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
+					      &buf, &probe_offset,
+					      &probe_addr);
+		if (!err)
+			err = bpf_task_fd_query_copy(attr, uattr, prog_id,
+						     fd_type, buf,
+						     probe_offset,
+						     probe_addr);
+		goto put_file;
+	}
+
+	err = -ENOTSUPP;
+put_file:
+	fput(file);
+out:
+	return err;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -2264,6 +2392,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_BTF_GET_FD_BY_ID:
 		err = bpf_btf_get_fd_by_id(&attr);
 		break;
+	case BPF_TASK_FD_QUERY:
+		err = bpf_task_fd_query(&attr, uattr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ce2cbbff27e4..81fdf2fc94ac 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -14,6 +14,7 @@
 #include <linux/uaccess.h>
 #include <linux/ctype.h>
 #include <linux/kprobes.h>
+#include <linux/syscalls.h>
 #include <linux/error-injection.h>
 
 #include "trace_probe.h"
@@ -1163,3 +1164,50 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
 	mutex_unlock(&bpf_event_mutex);
 	return err;
 }
+
+int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
+			    u32 *fd_type, const char **buf,
+			    u64 *probe_offset, u64 *probe_addr)
+{
+	bool is_tracepoint, is_syscall_tp;
+	struct bpf_prog *prog;
+	int flags, err = 0;
+
+	prog = event->prog;
+	if (!prog)
+		return -ENOENT;
+
+	/* not supporting BPF_PROG_TYPE_PERF_EVENT yet */
+	if (prog->type == BPF_PROG_TYPE_PERF_EVENT)
+		return -EOPNOTSUPP;
+
+	*prog_id = prog->aux->id;
+	flags = event->tp_event->flags;
+	is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT;
+	is_syscall_tp = is_syscall_trace_event(event->tp_event);
+
+	if (is_tracepoint || is_syscall_tp) {
+		*buf = is_tracepoint ? event->tp_event->tp->name
+				     : event->tp_event->name;
+		*fd_type = BPF_FD_TYPE_TRACEPOINT;
+		*probe_offset = 0x0;
+		*probe_addr = 0x0;
+	} else {
+		/* kprobe/uprobe */
+		err = -EOPNOTSUPP;
+#ifdef CONFIG_KPROBE_EVENTS
+		if (flags & TRACE_EVENT_FL_KPROBE)
+			err = bpf_get_kprobe_info(event, fd_type, buf,
+						  probe_offset, probe_addr,
+						  event->attr.type == PERF_TYPE_TRACEPOINT);
+#endif
+#ifdef CONFIG_UPROBE_EVENTS
+		if (flags & TRACE_EVENT_FL_UPROBE)
+			err = bpf_get_uprobe_info(event, fd_type, buf,
+						  probe_offset,
+						  event->attr.type == PERF_TYPE_TRACEPOINT);
+#endif
+	}
+
+	return err;
+}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 02aed76e0978..daa81571b22a 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1287,6 +1287,35 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 			      head, NULL);
 }
 NOKPROBE_SYMBOL(kretprobe_perf_func);
+
+int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
+			const char **symbol, u64 *probe_offset,
+			u64 *probe_addr, bool perf_type_tracepoint)
+{
+	const char *pevent = trace_event_name(event->tp_event);
+	const char *group = event->tp_event->class->system;
+	struct trace_kprobe *tk;
+
+	if (perf_type_tracepoint)
+		tk = find_trace_kprobe(pevent, group);
+	else
+		tk = event->tp_event->data;
+	if (!tk)
+		return -EINVAL;
+
+	*fd_type = trace_kprobe_is_return(tk) ? BPF_FD_TYPE_KRETPROBE
+					      : BPF_FD_TYPE_KPROBE;
+	if (tk->symbol) {
+		*symbol = tk->symbol;
+		*probe_offset = tk->rp.kp.offset;
+		*probe_addr = 0;
+	} else {
+		*symbol = NULL;
+		*probe_offset = 0;
+		*probe_addr = (unsigned long)tk->rp.kp.addr;
+	}
+	return 0;
+}
 #endif	/* CONFIG_PERF_EVENTS */
 
 /*
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index ac892878dbe6..bf89a51e740d 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1161,6 +1161,28 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
 {
 	__uprobe_perf_func(tu, func, regs, ucb, dsize);
 }
+
+int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
+			const char **filename, u64 *probe_offset,
+			bool perf_type_tracepoint)
+{
+	const char *pevent = trace_event_name(event->tp_event);
+	const char *group = event->tp_event->class->system;
+	struct trace_uprobe *tu;
+
+	if (perf_type_tracepoint)
+		tu = find_probe_event(pevent, group);
+	else
+		tu = event->tp_event->data;
+	if (!tu)
+		return -EINVAL;
+
+	*fd_type = is_ret_probe(tu) ? BPF_FD_TYPE_URETPROBE
+				    : BPF_FD_TYPE_UPROBE;
+	*filename = tu->filename;
+	*probe_offset = tu->offset;
+	return 0;
+}
 #endif	/* CONFIG_PERF_EVENTS */
 
 static int
-- 
cgit v1.2.3-59-g8ed1b


From 30687ad94e57b53dd66950ab570203dfd1f9db34 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 24 May 2018 11:21:10 -0700
Subject: tools/bpf: sync kernel header bpf.h and add bpf_task_fd_query in
 libbpf

Sync kernel header bpf.h to tools/include/uapi/linux/bpf.h and
implement bpf_task_fd_query() in libbpf. The test programs
in samples/bpf and tools/testing/selftests/bpf, and later bpftool
will use this libbpf function to query kernel.

Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/include/uapi/linux/bpf.h | 26 ++++++++++++++++++++++++++
 tools/lib/bpf/bpf.c            | 23 +++++++++++++++++++++++
 tools/lib/bpf/bpf.h            |  3 +++
 3 files changed, 52 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e95fec90c2c1..9b8c6e310e9a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -97,6 +97,7 @@ enum bpf_cmd {
 	BPF_RAW_TRACEPOINT_OPEN,
 	BPF_BTF_LOAD,
 	BPF_BTF_GET_FD_BY_ID,
+	BPF_TASK_FD_QUERY,
 };
 
 enum bpf_map_type {
@@ -380,6 +381,22 @@ union bpf_attr {
 		__u32		btf_log_size;
 		__u32		btf_log_level;
 	};
+
+	struct {
+		__u32		pid;		/* input: pid */
+		__u32		fd;		/* input: fd */
+		__u32		flags;		/* input: flags */
+		__u32		buf_len;	/* input/output: buf len */
+		__aligned_u64	buf;		/* input/output:
+						 *   tp_name for tracepoint
+						 *   symbol for kprobe
+						 *   filename for uprobe
+						 */
+		__u32		prog_id;	/* output: prod_id */
+		__u32		fd_type;	/* output: BPF_FD_TYPE_* */
+		__u64		probe_offset;	/* output: probe_offset */
+		__u64		probe_addr;	/* output: probe_addr */
+	} task_fd_query;
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
@@ -2557,4 +2574,13 @@ struct bpf_fib_lookup {
 	__u8	dmac[6];     /* ETH_ALEN */
 };
 
+enum bpf_task_fd_type {
+	BPF_FD_TYPE_RAW_TRACEPOINT,	/* tp name */
+	BPF_FD_TYPE_TRACEPOINT,		/* tp name */
+	BPF_FD_TYPE_KPROBE,		/* (symbol + offset) or addr */
+	BPF_FD_TYPE_KRETPROBE,		/* (symbol + offset) or addr */
+	BPF_FD_TYPE_UPROBE,		/* filename + offset */
+	BPF_FD_TYPE_URETPROBE,		/* filename + offset */
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 442b4cdfeb71..9ddc89dae962 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -643,3 +643,26 @@ retry:
 
 	return fd;
 }
+
+int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len,
+		      __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset,
+		      __u64 *probe_addr)
+{
+	union bpf_attr attr = {};
+	int err;
+
+	attr.task_fd_query.pid = pid;
+	attr.task_fd_query.fd = fd;
+	attr.task_fd_query.flags = flags;
+	attr.task_fd_query.buf = ptr_to_u64(buf);
+	attr.task_fd_query.buf_len = *buf_len;
+
+	err = sys_bpf(BPF_TASK_FD_QUERY, &attr, sizeof(attr));
+	*buf_len = attr.task_fd_query.buf_len;
+	*prog_id = attr.task_fd_query.prog_id;
+	*fd_type = attr.task_fd_query.fd_type;
+	*probe_offset = attr.task_fd_query.probe_offset;
+	*probe_addr = attr.task_fd_query.probe_addr;
+
+	return err;
+}
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index d12344f66d4e..0639a30a457d 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -107,4 +107,7 @@ int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags,
 int bpf_raw_tracepoint_open(const char *name, int prog_fd);
 int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
 		 bool do_log);
+int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len,
+		      __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset,
+		      __u64 *probe_addr);
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 73bc4d9fc045fb709483a733683f816e64a99431 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 24 May 2018 11:21:11 -0700
Subject: tools/bpf: add ksym_get_addr() in trace_helpers

Given a kernel function name, ksym_get_addr() will return the kernel
address for this function, or 0 if it cannot find this function name
in /proc/kallsyms. This function will be used later when a kernel
address is used to initiate a kprobe perf event.

Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/trace_helpers.c | 12 ++++++++++++
 tools/testing/selftests/bpf/trace_helpers.h |  1 +
 2 files changed, 13 insertions(+)

diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
index 8fb4fe8686e4..3868dcb63420 100644
--- a/tools/testing/selftests/bpf/trace_helpers.c
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -72,6 +72,18 @@ struct ksym *ksym_search(long key)
 	return &syms[0];
 }
 
+long ksym_get_addr(const char *name)
+{
+	int i;
+
+	for (i = 0; i < sym_cnt; i++) {
+		if (strcmp(syms[i].name, name) == 0)
+			return syms[i].addr;
+	}
+
+	return 0;
+}
+
 static int page_size;
 static int page_cnt = 8;
 static struct perf_event_mmap_page *header;
diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h
index 36d90e3b1ea9..3b4bcf7f5084 100644
--- a/tools/testing/selftests/bpf/trace_helpers.h
+++ b/tools/testing/selftests/bpf/trace_helpers.h
@@ -11,6 +11,7 @@ struct ksym {
 
 int load_kallsyms(void);
 struct ksym *ksym_search(long key);
+long ksym_get_addr(const char *name);
 
 typedef enum bpf_perf_event_ret (*perf_event_print_fn)(void *data, int size);
 
-- 
cgit v1.2.3-59-g8ed1b


From ecb96f7fe153c7ff2fd31db64c52a53b7e6401ab Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 24 May 2018 11:21:56 -0700
Subject: samples/bpf: add a samples/bpf test for BPF_TASK_FD_QUERY

This is mostly to test kprobe/uprobe which needs kernel headers.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/Makefile             |   4 +
 samples/bpf/task_fd_query_kern.c |  19 ++
 samples/bpf/task_fd_query_user.c | 382 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 405 insertions(+)
 create mode 100644 samples/bpf/task_fd_query_kern.c
 create mode 100644 samples/bpf/task_fd_query_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 62d1aa1a4cf3..7dc85ed0ce4b 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -51,6 +51,7 @@ hostprogs-y += cpustat
 hostprogs-y += xdp_adjust_tail
 hostprogs-y += xdpsock
 hostprogs-y += xdp_fwd
+hostprogs-y += task_fd_query
 
 # Libbpf dependencies
 LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
@@ -105,6 +106,7 @@ cpustat-objs := bpf_load.o cpustat_user.o
 xdp_adjust_tail-objs := xdp_adjust_tail_user.o
 xdpsock-objs := bpf_load.o xdpsock_user.o
 xdp_fwd-objs := bpf_load.o xdp_fwd_user.o
+task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -160,6 +162,7 @@ always += cpustat_kern.o
 always += xdp_adjust_tail_kern.o
 always += xdpsock_kern.o
 always += xdp_fwd_kern.o
+always += task_fd_query_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -175,6 +178,7 @@ HOSTCFLAGS_offwaketime_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_spintest_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_trace_event_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_sampleip_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_task_fd_query_user.o += -I$(srctree)/tools/lib/bpf/
 
 HOST_LOADLIBES		+= $(LIBBPF) -lelf
 HOSTLOADLIBES_tracex4		+= -lrt
diff --git a/samples/bpf/task_fd_query_kern.c b/samples/bpf/task_fd_query_kern.c
new file mode 100644
index 000000000000..f4b0a9ea674d
--- /dev/null
+++ b/samples/bpf/task_fd_query_kern.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/version.h>
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+SEC("kprobe/blk_start_request")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	return 0;
+}
+
+SEC("kretprobe/blk_account_io_completion")
+int bpf_prog2(struct pt_regs *ctx)
+{
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c
new file mode 100644
index 000000000000..8381d792f138
--- /dev/null
+++ b/samples/bpf/task_fd_query_user.c
@@ -0,0 +1,382 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <linux/bpf.h>
+#include <sys/ioctl.h>
+#include <sys/resource.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "libbpf.h"
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include "perf-sys.h"
+#include "trace_helpers.h"
+
+#define CHECK_PERROR_RET(condition) ({			\
+	int __ret = !!(condition);			\
+	if (__ret) {					\
+		printf("FAIL: %s:\n", __func__);	\
+		perror("    ");			\
+		return -1;				\
+	}						\
+})
+
+#define CHECK_AND_RET(condition) ({			\
+	int __ret = !!(condition);			\
+	if (__ret)					\
+		return -1;				\
+})
+
+static __u64 ptr_to_u64(void *ptr)
+{
+	return (__u64) (unsigned long) ptr;
+}
+
+#define PMU_TYPE_FILE "/sys/bus/event_source/devices/%s/type"
+static int bpf_find_probe_type(const char *event_type)
+{
+	char buf[256];
+	int fd, ret;
+
+	ret = snprintf(buf, sizeof(buf), PMU_TYPE_FILE, event_type);
+	CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+	fd = open(buf, O_RDONLY);
+	CHECK_PERROR_RET(fd < 0);
+
+	ret = read(fd, buf, sizeof(buf));
+	close(fd);
+	CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+	errno = 0;
+	ret = (int)strtol(buf, NULL, 10);
+	CHECK_PERROR_RET(errno);
+	return ret;
+}
+
+#define PMU_RETPROBE_FILE "/sys/bus/event_source/devices/%s/format/retprobe"
+static int bpf_get_retprobe_bit(const char *event_type)
+{
+	char buf[256];
+	int fd, ret;
+
+	ret = snprintf(buf, sizeof(buf), PMU_RETPROBE_FILE, event_type);
+	CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+	fd = open(buf, O_RDONLY);
+	CHECK_PERROR_RET(fd < 0);
+
+	ret = read(fd, buf, sizeof(buf));
+	close(fd);
+	CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+	CHECK_PERROR_RET(strlen(buf) < strlen("config:"));
+
+	errno = 0;
+	ret = (int)strtol(buf + strlen("config:"), NULL, 10);
+	CHECK_PERROR_RET(errno);
+	return ret;
+}
+
+static int test_debug_fs_kprobe(int prog_fd_idx, const char *fn_name,
+				__u32 expected_fd_type)
+{
+	__u64 probe_offset, probe_addr;
+	__u32 len, prog_id, fd_type;
+	char buf[256];
+	int err;
+
+	len = sizeof(buf);
+	err = bpf_task_fd_query(getpid(), event_fd[prog_fd_idx], 0, buf, &len,
+				&prog_id, &fd_type, &probe_offset,
+				&probe_addr);
+	if (err < 0) {
+		printf("FAIL: %s, for event_fd idx %d, fn_name %s\n",
+		       __func__, prog_fd_idx, fn_name);
+		perror("    :");
+		return -1;
+	}
+	if (strcmp(buf, fn_name) != 0 ||
+	    fd_type != expected_fd_type ||
+	    probe_offset != 0x0 || probe_addr != 0x0) {
+		printf("FAIL: bpf_trace_event_query(event_fd[%d]):\n",
+		       prog_fd_idx);
+		printf("buf: %s, fd_type: %u, probe_offset: 0x%llx,"
+		       " probe_addr: 0x%llx\n",
+		       buf, fd_type, probe_offset, probe_addr);
+		return -1;
+	}
+	return 0;
+}
+
+static int test_nondebug_fs_kuprobe_common(const char *event_type,
+	const char *name, __u64 offset, __u64 addr, bool is_return,
+	char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type,
+	__u64 *probe_offset, __u64 *probe_addr)
+{
+	int is_return_bit = bpf_get_retprobe_bit(event_type);
+	int type = bpf_find_probe_type(event_type);
+	struct perf_event_attr attr = {};
+	int fd;
+
+	if (type < 0 || is_return_bit < 0) {
+		printf("FAIL: %s incorrect type (%d) or is_return_bit (%d)\n",
+			__func__, type, is_return_bit);
+		return -1;
+	}
+
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+	if (is_return)
+		attr.config |= 1 << is_return_bit;
+
+	if (name) {
+		attr.config1 = ptr_to_u64((void *)name);
+		attr.config2 = offset;
+	} else {
+		attr.config1 = 0;
+		attr.config2 = addr;
+	}
+	attr.size = sizeof(attr);
+	attr.type = type;
+
+	fd = sys_perf_event_open(&attr, -1, 0, -1, 0);
+	CHECK_PERROR_RET(fd < 0);
+
+	CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) < 0);
+	CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0);
+	CHECK_PERROR_RET(bpf_task_fd_query(getpid(), fd, 0, buf, buf_len,
+			 prog_id, fd_type, probe_offset, probe_addr) < 0);
+
+	return 0;
+}
+
+static int test_nondebug_fs_probe(const char *event_type, const char *name,
+				  __u64 offset, __u64 addr, bool is_return,
+				  __u32 expected_fd_type,
+				  __u32 expected_ret_fd_type,
+				  char *buf, __u32 buf_len)
+{
+	__u64 probe_offset, probe_addr;
+	__u32 prog_id, fd_type;
+	int err;
+
+	err = test_nondebug_fs_kuprobe_common(event_type, name,
+					      offset, addr, is_return,
+					      buf, &buf_len, &prog_id,
+					      &fd_type, &probe_offset,
+					      &probe_addr);
+	if (err < 0) {
+		printf("FAIL: %s, "
+		       "for name %s, offset 0x%llx, addr 0x%llx, is_return %d\n",
+		       __func__, name ? name : "", offset, addr, is_return);
+		perror("    :");
+		return -1;
+	}
+	if ((is_return && fd_type != expected_ret_fd_type) ||
+	    (!is_return && fd_type != expected_fd_type)) {
+		printf("FAIL: %s, incorrect fd_type %u\n",
+		       __func__, fd_type);
+		return -1;
+	}
+	if (name) {
+		if (strcmp(name, buf) != 0) {
+			printf("FAIL: %s, incorrect buf %s\n", __func__, buf);
+			return -1;
+		}
+		if (probe_offset != offset) {
+			printf("FAIL: %s, incorrect probe_offset 0x%llx\n",
+			       __func__, probe_offset);
+			return -1;
+		}
+	} else {
+		if (buf_len != 0) {
+			printf("FAIL: %s, incorrect buf %p\n",
+			       __func__, buf);
+			return -1;
+		}
+
+		if (probe_addr != addr) {
+			printf("FAIL: %s, incorrect probe_addr 0x%llx\n",
+			       __func__, probe_addr);
+			return -1;
+		}
+	}
+	return 0;
+}
+
+static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return)
+{
+	const char *event_type = "uprobe";
+	struct perf_event_attr attr = {};
+	char buf[256], event_alias[256];
+	__u64 probe_offset, probe_addr;
+	__u32 len, prog_id, fd_type;
+	int err, res, kfd, efd;
+	ssize_t bytes;
+
+	snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events",
+		 event_type);
+	kfd = open(buf, O_WRONLY | O_APPEND, 0);
+	CHECK_PERROR_RET(kfd < 0);
+
+	res = snprintf(event_alias, sizeof(event_alias), "test_%d", getpid());
+	CHECK_PERROR_RET(res < 0 || res >= sizeof(event_alias));
+
+	res = snprintf(buf, sizeof(buf), "%c:%ss/%s %s:0x%lx",
+		       is_return ? 'r' : 'p', event_type, event_alias,
+		       binary_path, offset);
+	CHECK_PERROR_RET(res < 0 || res >= sizeof(buf));
+	CHECK_PERROR_RET(write(kfd, buf, strlen(buf)) < 0);
+
+	close(kfd);
+	kfd = -1;
+
+	snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s/id",
+		 event_type, event_alias);
+	efd = open(buf, O_RDONLY, 0);
+	CHECK_PERROR_RET(efd < 0);
+
+	bytes = read(efd, buf, sizeof(buf));
+	CHECK_PERROR_RET(bytes <= 0 || bytes >= sizeof(buf));
+	close(efd);
+	buf[bytes] = '\0';
+
+	attr.config = strtol(buf, NULL, 0);
+	attr.type = PERF_TYPE_TRACEPOINT;
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+	kfd = sys_perf_event_open(&attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC);
+	CHECK_PERROR_RET(kfd < 0);
+	CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0);
+	CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_ENABLE, 0) < 0);
+
+	len = sizeof(buf);
+	err = bpf_task_fd_query(getpid(), kfd, 0, buf, &len,
+				&prog_id, &fd_type, &probe_offset,
+				&probe_addr);
+	if (err < 0) {
+		printf("FAIL: %s, binary_path %s\n", __func__, binary_path);
+		perror("    :");
+		return -1;
+	}
+	if ((is_return && fd_type != BPF_FD_TYPE_URETPROBE) ||
+	    (!is_return && fd_type != BPF_FD_TYPE_UPROBE)) {
+		printf("FAIL: %s, incorrect fd_type %u\n", __func__,
+		       fd_type);
+		return -1;
+	}
+	if (strcmp(binary_path, buf) != 0) {
+		printf("FAIL: %s, incorrect buf %s\n", __func__, buf);
+		return -1;
+	}
+	if (probe_offset != offset) {
+		printf("FAIL: %s, incorrect probe_offset 0x%llx\n", __func__,
+		       probe_offset);
+		return -1;
+	}
+
+	close(kfd);
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	struct rlimit r = {1024*1024, RLIM_INFINITY};
+	extern char __executable_start;
+	char filename[256], buf[256];
+	__u64 uprobe_file_offset;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		perror("setrlimit(RLIMIT_MEMLOCK)");
+		return 1;
+	}
+
+	if (load_kallsyms()) {
+		printf("failed to process /proc/kallsyms\n");
+		return 1;
+	}
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	/* test two functions in the corresponding *_kern.c file */
+	CHECK_AND_RET(test_debug_fs_kprobe(0, "blk_start_request",
+					   BPF_FD_TYPE_KPROBE));
+	CHECK_AND_RET(test_debug_fs_kprobe(1, "blk_account_io_completion",
+					   BPF_FD_TYPE_KRETPROBE));
+
+	/* test nondebug fs kprobe */
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0,
+					     false, BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     buf, sizeof(buf)));
+#ifdef __x86_64__
+	/* set a kprobe on "bpf_check + 0x5", which is x64 specific */
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x5, 0x0,
+					     false, BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     buf, sizeof(buf)));
+#endif
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0,
+					     true, BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     buf, sizeof(buf)));
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+					     ksym_get_addr("bpf_check"), false,
+					     BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     buf, sizeof(buf)));
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+					     ksym_get_addr("bpf_check"), false,
+					     BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     NULL, 0));
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+					     ksym_get_addr("bpf_check"), true,
+					     BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     buf, sizeof(buf)));
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+					     ksym_get_addr("bpf_check"), true,
+					     BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     0, 0));
+
+	/* test nondebug fs uprobe */
+	/* the calculation of uprobe file offset is based on gcc 7.3.1 on x64
+	 * and the default linker script, which defines __executable_start as
+	 * the start of the .text section. The calculation could be different
+	 * on different systems with different compilers. The right way is
+	 * to parse the ELF file. We took a shortcut here.
+	 */
+	uprobe_file_offset = (__u64)main - (__u64)&__executable_start;
+	CHECK_AND_RET(test_nondebug_fs_probe("uprobe", (char *)argv[0],
+					     uprobe_file_offset, 0x0, false,
+					     BPF_FD_TYPE_UPROBE,
+					     BPF_FD_TYPE_URETPROBE,
+					     buf, sizeof(buf)));
+	CHECK_AND_RET(test_nondebug_fs_probe("uprobe", (char *)argv[0],
+					     uprobe_file_offset, 0x0, true,
+					     BPF_FD_TYPE_UPROBE,
+					     BPF_FD_TYPE_URETPROBE,
+					     buf, sizeof(buf)));
+
+	/* test debug fs uprobe */
+	CHECK_AND_RET(test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset,
+					   false));
+	CHECK_AND_RET(test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset,
+					   true));
+
+	return 0;
+}
-- 
cgit v1.2.3-59-g8ed1b


From f699cf7aa41f7b5a48316c0d21b76861b45e46e8 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 24 May 2018 11:21:57 -0700
Subject: tools/bpf: add two BPF_TASK_FD_QUERY tests in test_progs

The new tests are added to query perf_event information
for raw_tracepoint and tracepoint attachment. For tracepoint,
both syscalls and non-syscalls tracepoints are queries as
they are treated slightly differently inside the kernel.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/test_progs.c | 158 +++++++++++++++++++++++++++++++
 1 file changed, 158 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 3ecf733330c1..0ef68204c84b 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -1542,6 +1542,162 @@ close_prog_noerr:
 	bpf_object__close(obj);
 }
 
+static void test_task_fd_query_rawtp(void)
+{
+	const char *file = "./test_get_stack_rawtp.o";
+	__u64 probe_offset, probe_addr;
+	__u32 len, prog_id, fd_type;
+	struct bpf_object *obj;
+	int efd, err, prog_fd;
+	__u32 duration = 0;
+	char buf[256];
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd);
+	if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno))
+		return;
+
+	efd = bpf_raw_tracepoint_open("sys_enter", prog_fd);
+	if (CHECK(efd < 0, "raw_tp_open", "err %d errno %d\n", efd, errno))
+		goto close_prog;
+
+	/* query (getpid(), efd) */
+	len = sizeof(buf);
+	err = bpf_task_fd_query(getpid(), efd, 0, buf, &len, &prog_id,
+				&fd_type, &probe_offset, &probe_addr);
+	if (CHECK(err < 0, "bpf_task_fd_query", "err %d errno %d\n", err,
+		  errno))
+		goto close_prog;
+
+	err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+	      strcmp(buf, "sys_enter") == 0;
+	if (CHECK(!err, "check_results", "fd_type %d tp_name %s\n",
+		  fd_type, buf))
+		goto close_prog;
+
+	/* test zero len */
+	len = 0;
+	err = bpf_task_fd_query(getpid(), efd, 0, buf, &len, &prog_id,
+				&fd_type, &probe_offset, &probe_addr);
+	if (CHECK(err < 0, "bpf_task_fd_query (len = 0)", "err %d errno %d\n",
+		  err, errno))
+		goto close_prog;
+	err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+	      len == strlen("sys_enter");
+	if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len))
+		goto close_prog;
+
+	/* test empty buffer */
+	len = sizeof(buf);
+	err = bpf_task_fd_query(getpid(), efd, 0, 0, &len, &prog_id,
+				&fd_type, &probe_offset, &probe_addr);
+	if (CHECK(err < 0, "bpf_task_fd_query (buf = 0)", "err %d errno %d\n",
+		  err, errno))
+		goto close_prog;
+	err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+	      len == strlen("sys_enter");
+	if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len))
+		goto close_prog;
+
+	/* test smaller buffer */
+	len = 3;
+	err = bpf_task_fd_query(getpid(), efd, 0, buf, &len, &prog_id,
+				&fd_type, &probe_offset, &probe_addr);
+	if (CHECK(err >= 0 || errno != ENOSPC, "bpf_task_fd_query (len = 3)",
+		  "err %d errno %d\n", err, errno))
+		goto close_prog;
+	err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+	      len == strlen("sys_enter") &&
+	      strcmp(buf, "sy") == 0;
+	if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len))
+		goto close_prog;
+
+	goto close_prog_noerr;
+close_prog:
+	error_cnt++;
+close_prog_noerr:
+	bpf_object__close(obj);
+}
+
+static void test_task_fd_query_tp_core(const char *probe_name,
+				       const char *tp_name)
+{
+	const char *file = "./test_tracepoint.o";
+	int err, bytes, efd, prog_fd, pmu_fd;
+	struct perf_event_attr attr = {};
+	__u64 probe_offset, probe_addr;
+	__u32 len, prog_id, fd_type;
+	struct bpf_object *obj;
+	__u32 duration = 0;
+	char buf[256];
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
+	if (CHECK(err, "bpf_prog_load", "err %d errno %d\n", err, errno))
+		goto close_prog;
+
+	snprintf(buf, sizeof(buf),
+		 "/sys/kernel/debug/tracing/events/%s/id", probe_name);
+	efd = open(buf, O_RDONLY, 0);
+	if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))
+		goto close_prog;
+	bytes = read(efd, buf, sizeof(buf));
+	close(efd);
+	if (CHECK(bytes <= 0 || bytes >= sizeof(buf), "read",
+		  "bytes %d errno %d\n", bytes, errno))
+		goto close_prog;
+
+	attr.config = strtol(buf, NULL, 0);
+	attr.type = PERF_TYPE_TRACEPOINT;
+	attr.sample_type = PERF_SAMPLE_RAW;
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+	pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
+			 0 /* cpu 0 */, -1 /* group id */,
+			 0 /* flags */);
+	if (CHECK(err, "perf_event_open", "err %d errno %d\n", err, errno))
+		goto close_pmu;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+	if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n", err,
+		  errno))
+		goto close_pmu;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
+	if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n", err,
+		  errno))
+		goto close_pmu;
+
+	/* query (getpid(), pmu_fd) */
+	len = sizeof(buf);
+	err = bpf_task_fd_query(getpid(), pmu_fd, 0, buf, &len, &prog_id,
+				&fd_type, &probe_offset, &probe_addr);
+	if (CHECK(err < 0, "bpf_task_fd_query", "err %d errno %d\n", err,
+		  errno))
+		goto close_pmu;
+
+	err = (fd_type == BPF_FD_TYPE_TRACEPOINT) && !strcmp(buf, tp_name);
+	if (CHECK(!err, "check_results", "fd_type %d tp_name %s\n",
+		  fd_type, buf))
+		goto close_pmu;
+
+	close(pmu_fd);
+	goto close_prog_noerr;
+
+close_pmu:
+	close(pmu_fd);
+close_prog:
+	error_cnt++;
+close_prog_noerr:
+	bpf_object__close(obj);
+}
+
+static void test_task_fd_query_tp(void)
+{
+	test_task_fd_query_tp_core("sched/sched_switch",
+				   "sched_switch");
+	test_task_fd_query_tp_core("syscalls/sys_enter_read",
+				   "sys_enter_read");
+}
+
 int main(void)
 {
 	jit_enabled = is_jit_enabled();
@@ -1561,6 +1717,8 @@ int main(void)
 	test_stacktrace_build_id_nmi();
 	test_stacktrace_map_raw_tp();
 	test_get_stack_raw_tp();
+	test_task_fd_query_rawtp();
+	test_task_fd_query_tp();
 
 	printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt);
 	return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
-- 
cgit v1.2.3-59-g8ed1b


From b04df400c30235fa347313c9e2a0695549bd2c8e Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 24 May 2018 11:21:58 -0700
Subject: tools/bpftool: add perf subcommand

The new command "bpftool perf [show | list]" will traverse
all processes under /proc, and if any fd is associated
with a perf event, it will print out related perf event
information. Documentation is also added.

Below is an example to show the results using bcc commands.
Running the following 4 bcc commands:
  kprobe:     trace.py '__x64_sys_nanosleep'
  kretprobe:  trace.py 'r::__x64_sys_nanosleep'
  tracepoint: trace.py 't:syscalls:sys_enter_nanosleep'
  uprobe:     trace.py 'p:/home/yhs/a.out:main'

The bpftool command line and result:

  $ bpftool perf
  pid 21711  fd 5: prog_id 5  kprobe  func __x64_sys_write  offset 0
  pid 21765  fd 5: prog_id 7  kretprobe  func __x64_sys_nanosleep  offset 0
  pid 21767  fd 5: prog_id 8  tracepoint  sys_enter_nanosleep
  pid 21800  fd 5: prog_id 9  uprobe  filename /home/yhs/a.out  offset 1159

  $ bpftool -j perf
  [{"pid":21711,"fd":5,"prog_id":5,"fd_type":"kprobe","func":"__x64_sys_write","offset":0}, \
   {"pid":21765,"fd":5,"prog_id":7,"fd_type":"kretprobe","func":"__x64_sys_nanosleep","offset":0}, \
   {"pid":21767,"fd":5,"prog_id":8,"fd_type":"tracepoint","tracepoint":"sys_enter_nanosleep"}, \
   {"pid":21800,"fd":5,"prog_id":9,"fd_type":"uprobe","filename":"/home/yhs/a.out","offset":1159}]

  $ bpftool prog
  5: kprobe  name probe___x64_sys  tag e495a0c82f2c7a8d  gpl
	  loaded_at 2018-05-15T04:46:37-0700  uid 0
	  xlated 200B  not jited  memlock 4096B  map_ids 4
  7: kprobe  name probe___x64_sys  tag f2fdee479a503abf  gpl
	  loaded_at 2018-05-15T04:48:32-0700  uid 0
	  xlated 200B  not jited  memlock 4096B  map_ids 7
  8: tracepoint  name tracepoint__sys  tag 5390badef2395fcf  gpl
	  loaded_at 2018-05-15T04:48:48-0700  uid 0
	  xlated 200B  not jited  memlock 4096B  map_ids 8
  9: kprobe  name probe_main_1  tag 0a87bdc2e2953b6d  gpl
	  loaded_at 2018-05-15T04:49:52-0700  uid 0
	  xlated 200B  not jited  memlock 4096B  map_ids 9

  $ ps ax | grep "python ./trace.py"
  21711 pts/0    T      0:03 python ./trace.py __x64_sys_write
  21765 pts/0    S+     0:00 python ./trace.py r::__x64_sys_nanosleep
  21767 pts/2    S+     0:00 python ./trace.py t:syscalls:sys_enter_nanosleep
  21800 pts/3    S+     0:00 python ./trace.py p:/home/yhs/a.out:main
  22374 pts/1    S+     0:00 grep --color=auto python ./trace.py

Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/bpf/bpftool/Documentation/bpftool-perf.rst |  81 ++++++++
 tools/bpf/bpftool/Documentation/bpftool.rst      |   5 +-
 tools/bpf/bpftool/bash-completion/bpftool        |   9 +
 tools/bpf/bpftool/main.c                         |   3 +-
 tools/bpf/bpftool/main.h                         |   1 +
 tools/bpf/bpftool/perf.c                         | 246 +++++++++++++++++++++++
 6 files changed, 343 insertions(+), 2 deletions(-)
 create mode 100644 tools/bpf/bpftool/Documentation/bpftool-perf.rst
 create mode 100644 tools/bpf/bpftool/perf.c

diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
new file mode 100644
index 000000000000..e3eb0eab7641
--- /dev/null
+++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
@@ -0,0 +1,81 @@
+================
+bpftool-perf
+================
+-------------------------------------------------------------------------------
+tool for inspection of perf related bpf prog attachments
+-------------------------------------------------------------------------------
+
+:Manual section: 8
+
+SYNOPSIS
+========
+
+	**bpftool** [*OPTIONS*] **perf** *COMMAND*
+
+	*OPTIONS* := { [{ **-j** | **--json** }] [{ **-p** | **--pretty** }] }
+
+	*COMMANDS* :=
+	{ **show** | **list** | **help** }
+
+PERF COMMANDS
+=============
+
+|	**bpftool** **perf { show | list }**
+|	**bpftool** **perf help**
+
+DESCRIPTION
+===========
+	**bpftool perf { show | list }**
+		  List all raw_tracepoint, tracepoint, kprobe attachment in the system.
+
+		  Output will start with process id and file descriptor in that process,
+		  followed by bpf program id, attachment information, and attachment point.
+		  The attachment point for raw_tracepoint/tracepoint is the trace probe name.
+		  The attachment point for k[ret]probe is either symbol name and offset,
+		  or a kernel virtual address.
+		  The attachment point for u[ret]probe is the file name and the file offset.
+
+	**bpftool perf help**
+		  Print short help message.
+
+OPTIONS
+=======
+	-h, --help
+		  Print short generic help message (similar to **bpftool help**).
+
+	-v, --version
+		  Print version number (similar to **bpftool version**).
+
+	-j, --json
+		  Generate JSON output. For commands that cannot produce JSON, this
+		  option has no effect.
+
+	-p, --pretty
+		  Generate human-readable JSON output. Implies **-j**.
+
+EXAMPLES
+========
+
+| **# bpftool perf**
+
+::
+
+      pid 21711  fd 5: prog_id 5  kprobe  func __x64_sys_write  offset 0
+      pid 21765  fd 5: prog_id 7  kretprobe  func __x64_sys_nanosleep  offset 0
+      pid 21767  fd 5: prog_id 8  tracepoint  sys_enter_nanosleep
+      pid 21800  fd 5: prog_id 9  uprobe  filename /home/yhs/a.out  offset 1159
+
+|
+| **# bpftool -j perf**
+
+::
+
+    [{"pid":21711,"fd":5,"prog_id":5,"fd_type":"kprobe","func":"__x64_sys_write","offset":0}, \
+     {"pid":21765,"fd":5,"prog_id":7,"fd_type":"kretprobe","func":"__x64_sys_nanosleep","offset":0}, \
+     {"pid":21767,"fd":5,"prog_id":8,"fd_type":"tracepoint","tracepoint":"sys_enter_nanosleep"}, \
+     {"pid":21800,"fd":5,"prog_id":9,"fd_type":"uprobe","filename":"/home/yhs/a.out","offset":1159}]
+
+
+SEE ALSO
+========
+	**bpftool**\ (8), **bpftool-prog**\ (8), **bpftool-map**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst
index 564cb0d9692b..b6f5d560460d 100644
--- a/tools/bpf/bpftool/Documentation/bpftool.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool.rst
@@ -16,7 +16,7 @@ SYNOPSIS
 
 	**bpftool** **version**
 
-	*OBJECT* := { **map** | **program** | **cgroup** }
+	*OBJECT* := { **map** | **program** | **cgroup** | **perf** }
 
 	*OPTIONS* := { { **-V** | **--version** } | { **-h** | **--help** }
 	| { **-j** | **--json** } [{ **-p** | **--pretty** }] }
@@ -30,6 +30,8 @@ SYNOPSIS
 
 	*CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** }
 
+	*PERF-COMMANDS* := { **show** | **list** | **help** }
+
 DESCRIPTION
 ===========
 	*bpftool* allows for inspection and simple modification of BPF objects
@@ -56,3 +58,4 @@ OPTIONS
 SEE ALSO
 ========
 	**bpftool-map**\ (8), **bpftool-prog**\ (8), **bpftool-cgroup**\ (8)
+        **bpftool-perf**\ (8)
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index b301c9b315f1..7bc198d60de2 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -448,6 +448,15 @@ _bpftool()
                     ;;
             esac
             ;;
+        perf)
+            case $command in
+                *)
+                    [[ $prev == $object ]] && \
+                        COMPREPLY=( $( compgen -W 'help \
+                            show list' -- "$cur" ) )
+                    ;;
+            esac
+            ;;
     esac
 } &&
 complete -F _bpftool bpftool
diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
index 1ec852d21d44..eea7f14355f3 100644
--- a/tools/bpf/bpftool/main.c
+++ b/tools/bpf/bpftool/main.c
@@ -87,7 +87,7 @@ static int do_help(int argc, char **argv)
 		"       %s batch file FILE\n"
 		"       %s version\n"
 		"\n"
-		"       OBJECT := { prog | map | cgroup }\n"
+		"       OBJECT := { prog | map | cgroup | perf }\n"
 		"       " HELP_SPEC_OPTIONS "\n"
 		"",
 		bin_name, bin_name, bin_name);
@@ -216,6 +216,7 @@ static const struct cmd cmds[] = {
 	{ "prog",	do_prog },
 	{ "map",	do_map },
 	{ "cgroup",	do_cgroup },
+	{ "perf",	do_perf },
 	{ "version",	do_version },
 	{ 0 }
 };
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 6173cd997e7a..63fdb310b9a4 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -119,6 +119,7 @@ int do_prog(int argc, char **arg);
 int do_map(int argc, char **arg);
 int do_event_pipe(int argc, char **argv);
 int do_cgroup(int argc, char **arg);
+int do_perf(int argc, char **arg);
 
 int prog_parse_fd(int *argc, char ***argv);
 int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len);
diff --git a/tools/bpf/bpftool/perf.c b/tools/bpf/bpftool/perf.c
new file mode 100644
index 000000000000..ac6b1a12c9b7
--- /dev/null
+++ b/tools/bpf/bpftool/perf.c
@@ -0,0 +1,246 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright (C) 2018 Facebook
+// Author: Yonghong Song <yhs@fb.com>
+
+#define _GNU_SOURCE
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <ftw.h>
+
+#include <bpf.h>
+
+#include "main.h"
+
+/* 0: undecided, 1: supported, 2: not supported */
+static int perf_query_supported;
+static bool has_perf_query_support(void)
+{
+	__u64 probe_offset, probe_addr;
+	__u32 len, prog_id, fd_type;
+	char buf[256];
+	int fd;
+
+	if (perf_query_supported)
+		goto out;
+
+	fd = open(bin_name, O_RDONLY);
+	if (fd < 0) {
+		p_err("perf_query_support: %s", strerror(errno));
+		goto out;
+	}
+
+	/* the following query will fail as no bpf attachment,
+	 * the expected errno is ENOTSUPP
+	 */
+	errno = 0;
+	len = sizeof(buf);
+	bpf_task_fd_query(getpid(), fd, 0, buf, &len, &prog_id,
+			  &fd_type, &probe_offset, &probe_addr);
+
+	if (errno == 524 /* ENOTSUPP */) {
+		perf_query_supported = 1;
+		goto close_fd;
+	}
+
+	perf_query_supported = 2;
+	p_err("perf_query_support: %s", strerror(errno));
+	fprintf(stderr,
+		"HINT: non root or kernel doesn't support TASK_FD_QUERY\n");
+
+close_fd:
+	close(fd);
+out:
+	return perf_query_supported == 1;
+}
+
+static void print_perf_json(int pid, int fd, __u32 prog_id, __u32 fd_type,
+			    char *buf, __u64 probe_offset, __u64 probe_addr)
+{
+	jsonw_start_object(json_wtr);
+	jsonw_int_field(json_wtr, "pid", pid);
+	jsonw_int_field(json_wtr, "fd", fd);
+	jsonw_uint_field(json_wtr, "prog_id", prog_id);
+	switch (fd_type) {
+	case BPF_FD_TYPE_RAW_TRACEPOINT:
+		jsonw_string_field(json_wtr, "fd_type", "raw_tracepoint");
+		jsonw_string_field(json_wtr, "tracepoint", buf);
+		break;
+	case BPF_FD_TYPE_TRACEPOINT:
+		jsonw_string_field(json_wtr, "fd_type", "tracepoint");
+		jsonw_string_field(json_wtr, "tracepoint", buf);
+		break;
+	case BPF_FD_TYPE_KPROBE:
+		jsonw_string_field(json_wtr, "fd_type", "kprobe");
+		if (buf[0] != '\0') {
+			jsonw_string_field(json_wtr, "func", buf);
+			jsonw_lluint_field(json_wtr, "offset", probe_offset);
+		} else {
+			jsonw_lluint_field(json_wtr, "addr", probe_addr);
+		}
+		break;
+	case BPF_FD_TYPE_KRETPROBE:
+		jsonw_string_field(json_wtr, "fd_type", "kretprobe");
+		if (buf[0] != '\0') {
+			jsonw_string_field(json_wtr, "func", buf);
+			jsonw_lluint_field(json_wtr, "offset", probe_offset);
+		} else {
+			jsonw_lluint_field(json_wtr, "addr", probe_addr);
+		}
+		break;
+	case BPF_FD_TYPE_UPROBE:
+		jsonw_string_field(json_wtr, "fd_type", "uprobe");
+		jsonw_string_field(json_wtr, "filename", buf);
+		jsonw_lluint_field(json_wtr, "offset", probe_offset);
+		break;
+	case BPF_FD_TYPE_URETPROBE:
+		jsonw_string_field(json_wtr, "fd_type", "uretprobe");
+		jsonw_string_field(json_wtr, "filename", buf);
+		jsonw_lluint_field(json_wtr, "offset", probe_offset);
+		break;
+	}
+	jsonw_end_object(json_wtr);
+}
+
+static void print_perf_plain(int pid, int fd, __u32 prog_id, __u32 fd_type,
+			     char *buf, __u64 probe_offset, __u64 probe_addr)
+{
+	printf("pid %d  fd %d: prog_id %u  ", pid, fd, prog_id);
+	switch (fd_type) {
+	case BPF_FD_TYPE_RAW_TRACEPOINT:
+		printf("raw_tracepoint  %s\n", buf);
+		break;
+	case BPF_FD_TYPE_TRACEPOINT:
+		printf("tracepoint  %s\n", buf);
+		break;
+	case BPF_FD_TYPE_KPROBE:
+		if (buf[0] != '\0')
+			printf("kprobe  func %s  offset %llu\n", buf,
+			       probe_offset);
+		else
+			printf("kprobe  addr %llu\n", probe_addr);
+		break;
+	case BPF_FD_TYPE_KRETPROBE:
+		if (buf[0] != '\0')
+			printf("kretprobe  func %s  offset %llu\n", buf,
+			       probe_offset);
+		else
+			printf("kretprobe  addr %llu\n", probe_addr);
+		break;
+	case BPF_FD_TYPE_UPROBE:
+		printf("uprobe  filename %s  offset %llu\n", buf, probe_offset);
+		break;
+	case BPF_FD_TYPE_URETPROBE:
+		printf("uretprobe  filename %s  offset %llu\n", buf,
+		       probe_offset);
+		break;
+	}
+}
+
+static int show_proc(const char *fpath, const struct stat *sb,
+		     int tflag, struct FTW *ftwbuf)
+{
+	__u64 probe_offset, probe_addr;
+	__u32 len, prog_id, fd_type;
+	int err, pid = 0, fd = 0;
+	const char *pch;
+	char buf[4096];
+
+	/* prefix always /proc */
+	pch = fpath + 5;
+	if (*pch == '\0')
+		return 0;
+
+	/* pid should be all numbers */
+	pch++;
+	while (isdigit(*pch)) {
+		pid = pid * 10 + *pch - '0';
+		pch++;
+	}
+	if (*pch == '\0')
+		return 0;
+	if (*pch != '/')
+		return FTW_SKIP_SUBTREE;
+
+	/* check /proc/<pid>/fd directory */
+	pch++;
+	if (strncmp(pch, "fd", 2))
+		return FTW_SKIP_SUBTREE;
+	pch += 2;
+	if (*pch == '\0')
+		return 0;
+	if (*pch != '/')
+		return FTW_SKIP_SUBTREE;
+
+	/* check /proc/<pid>/fd/<fd_num> */
+	pch++;
+	while (isdigit(*pch)) {
+		fd = fd * 10 + *pch - '0';
+		pch++;
+	}
+	if (*pch != '\0')
+		return FTW_SKIP_SUBTREE;
+
+	/* query (pid, fd) for potential perf events */
+	len = sizeof(buf);
+	err = bpf_task_fd_query(pid, fd, 0, buf, &len, &prog_id, &fd_type,
+				&probe_offset, &probe_addr);
+	if (err < 0)
+		return 0;
+
+	if (json_output)
+		print_perf_json(pid, fd, prog_id, fd_type, buf, probe_offset,
+				probe_addr);
+	else
+		print_perf_plain(pid, fd, prog_id, fd_type, buf, probe_offset,
+				 probe_addr);
+
+	return 0;
+}
+
+static int do_show(int argc, char **argv)
+{
+	int flags = FTW_ACTIONRETVAL | FTW_PHYS;
+	int err = 0, nopenfd = 16;
+
+	if (!has_perf_query_support())
+		return -1;
+
+	if (json_output)
+		jsonw_start_array(json_wtr);
+	if (nftw("/proc", show_proc, nopenfd, flags) == -1) {
+		p_err("%s", strerror(errno));
+		err = -1;
+	}
+	if (json_output)
+		jsonw_end_array(json_wtr);
+
+	return err;
+}
+
+static int do_help(int argc, char **argv)
+{
+	fprintf(stderr,
+		"Usage: %s %s { show | list | help }\n"
+		"",
+		bin_name, argv[-2]);
+
+	return 0;
+}
+
+static const struct cmd cmds[] = {
+	{ "show",	do_show },
+	{ "list",	do_show },
+	{ "help",	do_help },
+	{ 0 }
+};
+
+int do_perf(int argc, char **argv)
+{
+	return cmd_select(cmds, argc, argv, do_help);
+}
-- 
cgit v1.2.3-59-g8ed1b


From 67f29e07e131ffa13ea158c259a513f474c7df27 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 24 May 2018 16:45:46 +0200
Subject: bpf: devmap introduce dev_map_enqueue

Functionality is the same, but the ndo_xdp_xmit call is now
simply invoked from inside the devmap.c code.

V2: Fix compile issue reported by kbuild test robot <lkp@intel.com>

V5: Cleanups requested by Daniel
 - Newlines before func definition
 - Use BUILD_BUG_ON checks
 - Remove unnecessary use return value store in dev_map_enqueue

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h        | 16 +++++++++++++---
 include/trace/events/xdp.h |  9 ++++++++-
 kernel/bpf/devmap.c        | 34 ++++++++++++++++++++++++++++------
 net/core/filter.c          | 15 ++-------------
 4 files changed, 51 insertions(+), 23 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1795eeee846c..23a809da452d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -487,14 +487,16 @@ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr);
 void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth);
 
 /* Map specifics */
-struct net_device  *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
+struct xdp_buff;
+
+struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
 void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
 void __dev_map_flush(struct bpf_map *map);
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp);
 
 struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
 void __cpu_map_insert_ctx(struct bpf_map *map, u32 index);
 void __cpu_map_flush(struct bpf_map *map);
-struct xdp_buff;
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
 		    struct net_device *dev_rx);
 
@@ -573,6 +575,15 @@ static inline void __dev_map_flush(struct bpf_map *map)
 {
 }
 
+struct xdp_buff;
+struct bpf_dtab_netdev;
+
+static inline
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
+{
+	return 0;
+}
+
 static inline
 struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
 {
@@ -587,7 +598,6 @@ static inline void __cpu_map_flush(struct bpf_map *map)
 {
 }
 
-struct xdp_buff;
 static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
 				  struct xdp_buff *xdp,
 				  struct net_device *dev_rx)
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 8989a92c571a..96104610d40e 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -138,11 +138,18 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err,
 		  __entry->map_id, __entry->map_index)
 );
 
+#ifndef __DEVMAP_OBJ_TYPE
+#define __DEVMAP_OBJ_TYPE
+struct _bpf_dtab_netdev {
+	struct net_device *dev;
+};
+#endif /* __DEVMAP_OBJ_TYPE */
+
 #define devmap_ifindex(fwd, map)				\
 	(!fwd ? 0 :						\
 	 (!map ? 0 :						\
 	  ((map->map_type == BPF_MAP_TYPE_DEVMAP) ?		\
-	   ((struct net_device *)fwd)->ifindex : 0)))
+	   ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)))
 
 #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx)		\
 	 trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map),	\
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 565f9ece9115..06c400e7e4ff 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -48,13 +48,15 @@
  * calls will fail at this point.
  */
 #include <linux/bpf.h>
+#include <net/xdp.h>
 #include <linux/filter.h>
+#include <trace/events/xdp.h>
 
 #define DEV_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
 
 struct bpf_dtab_netdev {
-	struct net_device *dev;
+	struct net_device *dev; /* must be first member, due to tracepoint */
 	struct bpf_dtab *dtab;
 	unsigned int bit;
 	struct rcu_head rcu;
@@ -240,21 +242,38 @@ void __dev_map_flush(struct bpf_map *map)
  * update happens in parallel here a dev_put wont happen until after reading the
  * ifindex.
  */
-struct net_device  *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
+struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-	struct bpf_dtab_netdev *dev;
+	struct bpf_dtab_netdev *obj;
 
 	if (key >= map->max_entries)
 		return NULL;
 
-	dev = READ_ONCE(dtab->netdev_map[key]);
-	return dev ? dev->dev : NULL;
+	obj = READ_ONCE(dtab->netdev_map[key]);
+	return obj;
+}
+
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
+{
+	struct net_device *dev = dst->dev;
+	struct xdp_frame *xdpf;
+
+	if (!dev->netdev_ops->ndo_xdp_xmit)
+		return -EOPNOTSUPP;
+
+	xdpf = convert_to_xdp_frame(xdp);
+	if (unlikely(!xdpf))
+		return -EOVERFLOW;
+
+	/* TODO: implement a bulking/enqueue step later */
+	return dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
 }
 
 static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
 {
-	struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key);
+	struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
+	struct net_device *dev = dev = obj ? obj->dev : NULL;
 
 	return dev ? &dev->ifindex : NULL;
 }
@@ -405,6 +424,9 @@ static struct notifier_block dev_map_notifier = {
 
 static int __init dev_map_init(void)
 {
+	/* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
+	BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
+		     offsetof(struct _bpf_dtab_netdev, dev));
 	register_netdevice_notifier(&dev_map_notifier);
 	return 0;
 }
diff --git a/net/core/filter.c b/net/core/filter.c
index aa114c4acb25..c867106d3707 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3065,20 +3065,9 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
 
 	switch (map->map_type) {
 	case BPF_MAP_TYPE_DEVMAP: {
-		struct net_device *dev = fwd;
-		struct xdp_frame *xdpf;
+		struct bpf_dtab_netdev *dst = fwd;
 
-		if (!dev->netdev_ops->ndo_xdp_xmit)
-			return -EOPNOTSUPP;
-
-		xdpf = convert_to_xdp_frame(xdp);
-		if (unlikely(!xdpf))
-			return -EOVERFLOW;
-
-		/* TODO: move to inside map code instead, for bulk support
-		 * err = dev_map_enqueue(dev, xdp);
-		 */
-		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
+		err = dev_map_enqueue(dst, xdp);
 		if (err)
 			return err;
 		__dev_map_insert_ctx(map, index);
-- 
cgit v1.2.3-59-g8ed1b


From 5d053f9da4311a86bc58be8588bb5660fb3f0724 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 24 May 2018 16:45:51 +0200
Subject: bpf: devmap prepare xdp frames for bulking

Like cpumap create queue for xdp frames that will be bulked.  For now,
this patch simply invoke ndo_xdp_xmit foreach frame.  This happens,
either when the map flush operation is envoked, or when the limit
DEV_MAP_BULK_SIZE is reached.

V5: Avoid memleak on error path in dev_map_update_elem()

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/devmap.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 70 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 06c400e7e4ff..15293b9dfb77 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -55,10 +55,17 @@
 #define DEV_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
 
+#define DEV_MAP_BULK_SIZE 16
+struct xdp_bulk_queue {
+	struct xdp_frame *q[DEV_MAP_BULK_SIZE];
+	unsigned int count;
+};
+
 struct bpf_dtab_netdev {
 	struct net_device *dev; /* must be first member, due to tracepoint */
 	struct bpf_dtab *dtab;
 	unsigned int bit;
+	struct xdp_bulk_queue __percpu *bulkq;
 	struct rcu_head rcu;
 };
 
@@ -208,6 +215,34 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
 	__set_bit(bit, bitmap);
 }
 
+static int bq_xmit_all(struct bpf_dtab_netdev *obj,
+			 struct xdp_bulk_queue *bq)
+{
+	struct net_device *dev = obj->dev;
+	int i;
+
+	if (unlikely(!bq->count))
+		return 0;
+
+	for (i = 0; i < bq->count; i++) {
+		struct xdp_frame *xdpf = bq->q[i];
+
+		prefetch(xdpf);
+	}
+
+	for (i = 0; i < bq->count; i++) {
+		struct xdp_frame *xdpf = bq->q[i];
+		int err;
+
+		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
+		if (err)
+			xdp_return_frame(xdpf);
+	}
+	bq->count = 0;
+
+	return 0;
+}
+
 /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
  * from the driver before returning from its napi->poll() routine. The poll()
  * routine is called either from busy_poll context or net_rx_action signaled
@@ -223,6 +258,7 @@ void __dev_map_flush(struct bpf_map *map)
 
 	for_each_set_bit(bit, bitmap, map->max_entries) {
 		struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
+		struct xdp_bulk_queue *bq;
 		struct net_device *netdev;
 
 		/* This is possible if the dev entry is removed by user space
@@ -232,6 +268,9 @@ void __dev_map_flush(struct bpf_map *map)
 			continue;
 
 		__clear_bit(bit, bitmap);
+
+		bq = this_cpu_ptr(dev->bulkq);
+		bq_xmit_all(dev, bq);
 		netdev = dev->dev;
 		if (likely(netdev->netdev_ops->ndo_xdp_flush))
 			netdev->netdev_ops->ndo_xdp_flush(netdev);
@@ -254,6 +293,20 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 	return obj;
 }
 
+/* Runs under RCU-read-side, plus in softirq under NAPI protection.
+ * Thus, safe percpu variable access.
+ */
+static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
+{
+	struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
+
+	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
+		bq_xmit_all(obj, bq);
+
+	bq->q[bq->count++] = xdpf;
+	return 0;
+}
+
 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
 {
 	struct net_device *dev = dst->dev;
@@ -266,8 +319,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
-	/* TODO: implement a bulking/enqueue step later */
-	return dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
+	return bq_enqueue(dst, xdpf);
 }
 
 static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
@@ -282,13 +334,18 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
 {
 	if (dev->dev->netdev_ops->ndo_xdp_flush) {
 		struct net_device *fl = dev->dev;
+		struct xdp_bulk_queue *bq;
 		unsigned long *bitmap;
+
 		int cpu;
 
 		for_each_online_cpu(cpu) {
 			bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
 			__clear_bit(dev->bit, bitmap);
 
+			bq = per_cpu_ptr(dev->bulkq, cpu);
+			bq_xmit_all(dev, bq);
+
 			fl->netdev_ops->ndo_xdp_flush(dev->dev);
 		}
 	}
@@ -300,6 +357,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu)
 
 	dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
 	dev_map_flush_old(dev);
+	free_percpu(dev->bulkq);
 	dev_put(dev->dev);
 	kfree(dev);
 }
@@ -332,6 +390,7 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct net *net = current->nsproxy->net_ns;
+	gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
 	struct bpf_dtab_netdev *dev, *old_dev;
 	u32 i = *(u32 *)key;
 	u32 ifindex = *(u32 *)value;
@@ -346,13 +405,20 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (!ifindex) {
 		dev = NULL;
 	} else {
-		dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
-				   map->numa_node);
+		dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node);
 		if (!dev)
 			return -ENOMEM;
 
+		dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
+						sizeof(void *), gfp);
+		if (!dev->bulkq) {
+			kfree(dev);
+			return -ENOMEM;
+		}
+
 		dev->dev = dev_get_by_index(net, ifindex);
 		if (!dev->dev) {
+			free_percpu(dev->bulkq);
 			kfree(dev);
 			return -EINVAL;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From 38edddb81172e8b8decb057c0cd23271583a5fa0 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 24 May 2018 16:45:57 +0200
Subject: xdp: add tracepoint for devmap like cpumap have

Notice how this allow us get XDP statistic without affecting the XDP
performance, as tracepoint is no-longer activated on a per packet basis.

V5: Spotted by John Fastabend.
 Fix 'sent' also counted 'drops' in this patch, a later patch corrected
 this, but it was a mistake in this intermediate step.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h        |  6 ++++--
 include/trace/events/xdp.h | 39 +++++++++++++++++++++++++++++++++++++++
 kernel/bpf/devmap.c        | 27 +++++++++++++++++++++++----
 net/core/filter.c          |  2 +-
 4 files changed, 67 insertions(+), 7 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 23a809da452d..bbe297436e5d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -492,7 +492,8 @@ struct xdp_buff;
 struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
 void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
 void __dev_map_flush(struct bpf_map *map);
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp);
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+		    struct net_device *dev_rx);
 
 struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
 void __cpu_map_insert_ctx(struct bpf_map *map, u32 index);
@@ -579,7 +580,8 @@ struct xdp_buff;
 struct bpf_dtab_netdev;
 
 static inline
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+		    struct net_device *dev_rx)
 {
 	return 0;
 }
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 96104610d40e..2e9ef0650144 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -229,6 +229,45 @@ TRACE_EVENT(xdp_cpumap_enqueue,
 		  __entry->to_cpu)
 );
 
+TRACE_EVENT(xdp_devmap_xmit,
+
+	TP_PROTO(const struct bpf_map *map, u32 map_index,
+		 int sent, int drops,
+		 const struct net_device *from_dev,
+		 const struct net_device *to_dev),
+
+	TP_ARGS(map, map_index, sent, drops, from_dev, to_dev),
+
+	TP_STRUCT__entry(
+		__field(int, map_id)
+		__field(u32, act)
+		__field(u32, map_index)
+		__field(int, drops)
+		__field(int, sent)
+		__field(int, from_ifindex)
+		__field(int, to_ifindex)
+	),
+
+	TP_fast_assign(
+		__entry->map_id		= map->id;
+		__entry->act		= XDP_REDIRECT;
+		__entry->map_index	= map_index;
+		__entry->drops		= drops;
+		__entry->sent		= sent;
+		__entry->from_ifindex	= from_dev->ifindex;
+		__entry->to_ifindex	= to_dev->ifindex;
+	),
+
+	TP_printk("ndo_xdp_xmit"
+		  " map_id=%d map_index=%d action=%s"
+		  " sent=%d drops=%d"
+		  " from_ifindex=%d to_ifindex=%d",
+		  __entry->map_id, __entry->map_index,
+		  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
+		  __entry->sent, __entry->drops,
+		  __entry->from_ifindex, __entry->to_ifindex)
+);
+
 #endif /* _TRACE_XDP_H */
 
 #include <trace/define_trace.h>
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 15293b9dfb77..ff2f3bf59f2f 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -58,6 +58,7 @@
 #define DEV_MAP_BULK_SIZE 16
 struct xdp_bulk_queue {
 	struct xdp_frame *q[DEV_MAP_BULK_SIZE];
+	struct net_device *dev_rx;
 	unsigned int count;
 };
 
@@ -219,6 +220,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 			 struct xdp_bulk_queue *bq)
 {
 	struct net_device *dev = obj->dev;
+	int sent = 0, drops = 0;
 	int i;
 
 	if (unlikely(!bq->count))
@@ -235,11 +237,18 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 		int err;
 
 		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
-		if (err)
+		if (err) {
+			drops++;
 			xdp_return_frame(xdpf);
+		} else {
+			sent++;
+		}
 	}
 	bq->count = 0;
 
+	trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
+			      sent, drops, bq->dev_rx, dev);
+	bq->dev_rx = NULL;
 	return 0;
 }
 
@@ -296,18 +305,28 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 /* Runs under RCU-read-side, plus in softirq under NAPI protection.
  * Thus, safe percpu variable access.
  */
-static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
+static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
+		      struct net_device *dev_rx)
+
 {
 	struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
 
 	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
 		bq_xmit_all(obj, bq);
 
+	/* Ingress dev_rx will be the same for all xdp_frame's in
+	 * bulk_queue, because bq stored per-CPU and must be flushed
+	 * from net_device drivers NAPI func end.
+	 */
+	if (!bq->dev_rx)
+		bq->dev_rx = dev_rx;
+
 	bq->q[bq->count++] = xdpf;
 	return 0;
 }
 
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+		    struct net_device *dev_rx)
 {
 	struct net_device *dev = dst->dev;
 	struct xdp_frame *xdpf;
@@ -319,7 +338,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
-	return bq_enqueue(dst, xdpf);
+	return bq_enqueue(dst, xdpf, dev_rx);
 }
 
 static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
diff --git a/net/core/filter.c b/net/core/filter.c
index c867106d3707..36cf2f87d742 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3067,7 +3067,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
 	case BPF_MAP_TYPE_DEVMAP: {
 		struct bpf_dtab_netdev *dst = fwd;
 
-		err = dev_map_enqueue(dst, xdp);
+		err = dev_map_enqueue(dst, xdp, dev_rx);
 		if (err)
 			return err;
 		__dev_map_insert_ctx(map, index);
-- 
cgit v1.2.3-59-g8ed1b


From 9940fbf633e8714c7c885f8d3848f508b8612069 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 24 May 2018 16:46:02 +0200
Subject: samples/bpf: xdp_monitor use tracepoint xdp:xdp_devmap_xmit

The xdp_monitor sample/tool is updated to use the new tracepoint
xdp:xdp_devmap_xmit the previous patch just introduced.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/xdp_monitor_kern.c | 39 +++++++++++++++++++++++++++++++++++++
 samples/bpf/xdp_monitor_user.c | 44 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 82 insertions(+), 1 deletion(-)

diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c
index 211db8ded0de..2854aa0665ea 100644
--- a/samples/bpf/xdp_monitor_kern.c
+++ b/samples/bpf/xdp_monitor_kern.c
@@ -208,3 +208,42 @@ int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
 
 	return 0;
 }
+
+struct bpf_map_def SEC("maps") devmap_xmit_cnt = {
+	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size	= sizeof(u32),
+	.value_size	= sizeof(struct datarec),
+	.max_entries	= 1,
+};
+
+/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_devmap_xmit/format
+ * Code in:         kernel/include/trace/events/xdp.h
+ */
+struct devmap_xmit_ctx {
+	u64 __pad;		// First 8 bytes are not accessible by bpf code
+	int map_id;		//	offset:8;  size:4; signed:1;
+	u32 act;		//	offset:12; size:4; signed:0;
+	u32 map_index;		//	offset:16; size:4; signed:0;
+	int drops;		//	offset:20; size:4; signed:1;
+	int sent;		//	offset:24; size:4; signed:1;
+	int from_ifindex;	//	offset:28; size:4; signed:1;
+	int to_ifindex;		//	offset:32; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_devmap_xmit")
+int trace_xdp_devmap_xmit(struct devmap_xmit_ctx *ctx)
+{
+	struct datarec *rec;
+	u32 key = 0;
+
+	rec = bpf_map_lookup_elem(&devmap_xmit_cnt, &key);
+	if (!rec)
+		return 0;
+	rec->processed += ctx->sent;
+	rec->dropped   += ctx->drops;
+
+	/* Record bulk events, then userspace can calc average bulk size */
+	rec->info += 1;
+
+	return 1;
+}
diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c
index bf09b5188acd..7e18a454924c 100644
--- a/samples/bpf/xdp_monitor_user.c
+++ b/samples/bpf/xdp_monitor_user.c
@@ -141,6 +141,7 @@ struct stats_record {
 	struct record_u64 xdp_exception[XDP_ACTION_MAX];
 	struct record xdp_cpumap_kthread;
 	struct record xdp_cpumap_enqueue[MAX_CPUS];
+	struct record xdp_devmap_xmit;
 };
 
 static bool map_collect_record(int fd, __u32 key, struct record *rec)
@@ -397,7 +398,7 @@ static void stats_print(struct stats_record *stats_rec,
 			info = calc_info(r, p, t);
 			if (info > 0)
 				i_str = "sched";
-			if (pps > 0)
+			if (pps > 0 || drop > 0)
 				printf(fmt1, "cpumap-kthread",
 				       i, pps, drop, info, i_str);
 		}
@@ -409,6 +410,42 @@ static void stats_print(struct stats_record *stats_rec,
 		printf(fmt2, "cpumap-kthread", "total", pps, drop, info, i_str);
 	}
 
+	/* devmap ndo_xdp_xmit stats */
+	{
+		char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.2f %s\n";
+		char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.2f %s\n";
+		struct record *rec, *prev;
+		double drop, info;
+		char *i_str = "";
+
+		rec  =  &stats_rec->xdp_devmap_xmit;
+		prev = &stats_prev->xdp_devmap_xmit;
+		t = calc_period(rec, prev);
+		for (i = 0; i < nr_cpus; i++) {
+			struct datarec *r = &rec->cpu[i];
+			struct datarec *p = &prev->cpu[i];
+
+			pps  = calc_pps(r, p, t);
+			drop = calc_drop(r, p, t);
+			info = calc_info(r, p, t);
+			if (info > 0) {
+				i_str = "bulk-average";
+				info = (pps+drop) / info; /* calc avg bulk */
+			}
+			if (pps > 0 || drop > 0)
+				printf(fmt1, "devmap-xmit",
+				       i, pps, drop, info, i_str);
+		}
+		pps = calc_pps(&rec->total, &prev->total, t);
+		drop = calc_drop(&rec->total, &prev->total, t);
+		info = calc_info(&rec->total, &prev->total, t);
+		if (info > 0) {
+			i_str = "bulk-average";
+			info = (pps+drop) / info; /* calc avg bulk */
+		}
+		printf(fmt2, "devmap-xmit", "total", pps, drop, info, i_str);
+	}
+
 	printf("\n");
 }
 
@@ -437,6 +474,9 @@ static bool stats_collect(struct stats_record *rec)
 	fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */
 	map_collect_record(fd, 0, &rec->xdp_cpumap_kthread);
 
+	fd = map_data[4].fd; /* map4: devmap_xmit_cnt */
+	map_collect_record(fd, 0, &rec->xdp_devmap_xmit);
+
 	return true;
 }
 
@@ -480,6 +520,7 @@ static struct stats_record *alloc_stats_record(void)
 
 	rec_sz = sizeof(struct datarec);
 	rec->xdp_cpumap_kthread.cpu = alloc_rec_per_cpu(rec_sz);
+	rec->xdp_devmap_xmit.cpu    = alloc_rec_per_cpu(rec_sz);
 
 	for (i = 0; i < MAX_CPUS; i++)
 		rec->xdp_cpumap_enqueue[i].cpu = alloc_rec_per_cpu(rec_sz);
@@ -498,6 +539,7 @@ static void free_stats_record(struct stats_record *r)
 		free(r->xdp_exception[i].cpu);
 
 	free(r->xdp_cpumap_kthread.cpu);
+	free(r->xdp_devmap_xmit.cpu);
 
 	for (i = 0; i < MAX_CPUS; i++)
 		free(r->xdp_cpumap_enqueue[i].cpu);
-- 
cgit v1.2.3-59-g8ed1b


From 389ab7f01af988c2a1ec5617eb0c7e220df1ef1c Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 24 May 2018 16:46:07 +0200
Subject: xdp: introduce xdp_return_frame_rx_napi

When sending an xdp_frame through xdp_do_redirect call, then error
cases can happen where the xdp_frame needs to be dropped, and
returning an -errno code isn't sufficient/possible any-longer
(e.g. for cpumap case). This is already fully supported, by simply
calling xdp_return_frame.

This patch is an optimization, which provides xdp_return_frame_rx_napi,
which is a faster variant for these error cases.  It take advantage of
the protection provided by XDP RX running under NAPI protection.

This change is mostly relevant for drivers using the page_pool
allocator as it can take advantage of this. (Tested with mlx5).

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/page_pool.h |  5 +++--
 include/net/xdp.h       |  1 +
 kernel/bpf/cpumap.c     |  2 +-
 kernel/bpf/devmap.c     |  2 +-
 net/core/xdp.c          | 20 ++++++++++++++++----
 5 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index c79087153148..694d055e01ef 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -115,13 +115,14 @@ void page_pool_destroy(struct page_pool *pool);
 void __page_pool_put_page(struct page_pool *pool,
 			  struct page *page, bool allow_direct);
 
-static inline void page_pool_put_page(struct page_pool *pool, struct page *page)
+static inline void page_pool_put_page(struct page_pool *pool,
+				      struct page *page, bool allow_direct)
 {
 	/* When page_pool isn't compiled-in, net/core/xdp.c doesn't
 	 * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
 	 */
 #ifdef CONFIG_PAGE_POOL
-	__page_pool_put_page(pool, page, false);
+	__page_pool_put_page(pool, page, allow_direct);
 #endif
 }
 /* Very limited use-cases allow recycle direct */
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 0b689cf561c7..7ad779237ae8 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -104,6 +104,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
 }
 
 void xdp_return_frame(struct xdp_frame *xdpf);
+void xdp_return_frame_rx_napi(struct xdp_frame *xdpf);
 void xdp_return_buff(struct xdp_buff *xdp);
 
 int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index c95b04ec103e..e0918d180f08 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -578,7 +578,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 		err = __ptr_ring_produce(q, xdpf);
 		if (err) {
 			drops++;
-			xdp_return_frame(xdpf);
+			xdp_return_frame_rx_napi(xdpf);
 		}
 		processed++;
 	}
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index ff2f3bf59f2f..a9cd5c93dd2b 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -239,7 +239,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
 		if (err) {
 			drops++;
-			xdp_return_frame(xdpf);
+			xdp_return_frame_rx_napi(xdpf);
 		} else {
 			sent++;
 		}
diff --git a/net/core/xdp.c b/net/core/xdp.c
index bf6758f74339..cb8c4e061a5a 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -308,7 +308,13 @@ err:
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
 
-static void xdp_return(void *data, struct xdp_mem_info *mem)
+/* XDP RX runs under NAPI protection, and in different delivery error
+ * scenarios (e.g. queue full), it is possible to return the xdp_frame
+ * while still leveraging this protection.  The @napi_direct boolian
+ * is used for those calls sites.  Thus, allowing for faster recycling
+ * of xdp_frames/pages in those cases.
+ */
+static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct)
 {
 	struct xdp_mem_allocator *xa;
 	struct page *page;
@@ -320,7 +326,7 @@ static void xdp_return(void *data, struct xdp_mem_info *mem)
 		xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
 		page = virt_to_head_page(data);
 		if (xa)
-			page_pool_put_page(xa->page_pool, page);
+			page_pool_put_page(xa->page_pool, page, napi_direct);
 		else
 			put_page(page);
 		rcu_read_unlock();
@@ -340,12 +346,18 @@ static void xdp_return(void *data, struct xdp_mem_info *mem)
 
 void xdp_return_frame(struct xdp_frame *xdpf)
 {
-	xdp_return(xdpf->data, &xdpf->mem);
+	__xdp_return(xdpf->data, &xdpf->mem, false);
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame);
 
+void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
+{
+	__xdp_return(xdpf->data, &xdpf->mem, true);
+}
+EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
+
 void xdp_return_buff(struct xdp_buff *xdp)
 {
-	xdp_return(xdp->data, &xdp->rxq->mem);
+	__xdp_return(xdp->data, &xdp->rxq->mem, true);
 }
 EXPORT_SYMBOL_GPL(xdp_return_buff);
-- 
cgit v1.2.3-59-g8ed1b


From 735fc4054b3a25034445c6713d259da0f96f8131 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 24 May 2018 16:46:12 +0200
Subject: xdp: change ndo_xdp_xmit API to support bulking

This patch change the API for ndo_xdp_xmit to support bulking
xdp_frames.

When kernel is compiled with CONFIG_RETPOLINE, XDP sees a huge slowdown.
Most of the slowdown is caused by DMA API indirect function calls, but
also the net_device->ndo_xdp_xmit() call.

Benchmarked patch with CONFIG_RETPOLINE, using xdp_redirect_map with
single flow/core test (CPU E5-1650 v4 @ 3.60GHz), showed
performance improved:
 for driver ixgbe: 6,042,682 pps -> 6,853,768 pps = +811,086 pps
 for driver i40e : 6,187,169 pps -> 6,724,519 pps = +537,350 pps

With frames avail as a bulk inside the driver ndo_xdp_xmit call,
further optimizations are possible, like bulk DMA-mapping for TX.

Testing without CONFIG_RETPOLINE show the same performance for
physical NIC drivers.

The virtual NIC driver tun sees a huge performance boost, as it can
avoid doing per frame producer locking, but instead amortize the
locking cost over the bulk.

V2: Fix compile errors reported by kbuild test robot <lkp@intel.com>
V4: Isolated ndo, driver changes and callers.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   | 26 ++++++++---
 drivers/net/ethernet/intel/i40e/i40e_txrx.h   |  2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 21 ++++++---
 drivers/net/tun.c                             | 37 +++++++++------
 drivers/net/virtio_net.c                      | 66 ++++++++++++++++++++-------
 include/linux/netdevice.h                     | 14 ++++--
 kernel/bpf/devmap.c                           | 29 +++++++-----
 net/core/filter.c                             |  8 ++--
 8 files changed, 139 insertions(+), 64 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 5efa68de935b..9b698c5acd05 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -3664,14 +3664,19 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
  * @dev: netdev
  * @xdp: XDP buffer
  *
- * Returns Zero if sent, else an error code
+ * Returns number of frames successfully sent. Frames that fail are
+ * free'ed via XDP return API.
+ *
+ * For error cases, a negative errno code is returned and no-frames
+ * are transmitted (caller must handle freeing frames).
  **/
-int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
+int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
 {
 	struct i40e_netdev_priv *np = netdev_priv(dev);
 	unsigned int queue_index = smp_processor_id();
 	struct i40e_vsi *vsi = np->vsi;
-	int err;
+	int drops = 0;
+	int i;
 
 	if (test_bit(__I40E_VSI_DOWN, vsi->state))
 		return -ENETDOWN;
@@ -3679,11 +3684,18 @@ int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
 	if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
 		return -ENXIO;
 
-	err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
-	if (err != I40E_XDP_TX)
-		return -ENOSPC;
+	for (i = 0; i < n; i++) {
+		struct xdp_frame *xdpf = frames[i];
+		int err;
 
-	return 0;
+		err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
+		if (err != I40E_XDP_TX) {
+			xdp_return_frame_rx_napi(xdpf);
+			drops++;
+		}
+	}
+
+	return n - drops;
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index fdd2c55f03a6..eb8804b3d7b6 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -487,7 +487,7 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw);
 void i40e_detect_recover_hung(struct i40e_vsi *vsi);
 int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
 bool __i40e_chk_linearize(struct sk_buff *skb);
-int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf);
+int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames);
 void i40e_xdp_flush(struct net_device *dev);
 
 /**
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 6652b201df5b..9645619f7729 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -10017,11 +10017,13 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 	}
 }
 
-static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
+static int ixgbe_xdp_xmit(struct net_device *dev, int n,
+			  struct xdp_frame **frames)
 {
 	struct ixgbe_adapter *adapter = netdev_priv(dev);
 	struct ixgbe_ring *ring;
-	int err;
+	int drops = 0;
+	int i;
 
 	if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
 		return -ENETDOWN;
@@ -10033,11 +10035,18 @@ static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
 	if (unlikely(!ring))
 		return -ENXIO;
 
-	err = ixgbe_xmit_xdp_ring(adapter, xdpf);
-	if (err != IXGBE_XDP_TX)
-		return -ENOSPC;
+	for (i = 0; i < n; i++) {
+		struct xdp_frame *xdpf = frames[i];
+		int err;
 
-	return 0;
+		err = ixgbe_xmit_xdp_ring(adapter, xdpf);
+		if (err != IXGBE_XDP_TX) {
+			xdp_return_frame_rx_napi(xdpf);
+			drops++;
+		}
+	}
+
+	return n - drops;
 }
 
 static void ixgbe_xdp_flush(struct net_device *dev)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 44d4f3d25350..d3dcfcb1c4b3 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -70,6 +70,7 @@
 #include <net/netns/generic.h>
 #include <net/rtnetlink.h>
 #include <net/sock.h>
+#include <net/xdp.h>
 #include <linux/seq_file.h>
 #include <linux/uio.h>
 #include <linux/skb_array.h>
@@ -1290,34 +1291,44 @@ static const struct net_device_ops tun_netdev_ops = {
 	.ndo_get_stats64	= tun_net_get_stats64,
 };
 
-static int tun_xdp_xmit(struct net_device *dev, struct xdp_frame *frame)
+static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
 {
 	struct tun_struct *tun = netdev_priv(dev);
 	struct tun_file *tfile;
 	u32 numqueues;
-	int ret = 0;
+	int drops = 0;
+	int cnt = n;
+	int i;
 
 	rcu_read_lock();
 
 	numqueues = READ_ONCE(tun->numqueues);
 	if (!numqueues) {
-		ret = -ENOSPC;
-		goto out;
+		rcu_read_unlock();
+		return -ENXIO; /* Caller will free/return all frames */
 	}
 
 	tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
 					    numqueues]);
-	/* Encode the XDP flag into lowest bit for consumer to differ
-	 * XDP buffer from sk_buff.
-	 */
-	if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(frame))) {
-		this_cpu_inc(tun->pcpu_stats->tx_dropped);
-		ret = -ENOSPC;
+
+	spin_lock(&tfile->tx_ring.producer_lock);
+	for (i = 0; i < n; i++) {
+		struct xdp_frame *xdp = frames[i];
+		/* Encode the XDP flag into lowest bit for consumer to differ
+		 * XDP buffer from sk_buff.
+		 */
+		void *frame = tun_xdp_to_ptr(xdp);
+
+		if (__ptr_ring_produce(&tfile->tx_ring, frame)) {
+			this_cpu_inc(tun->pcpu_stats->tx_dropped);
+			xdp_return_frame_rx_napi(xdp);
+			drops++;
+		}
 	}
+	spin_unlock(&tfile->tx_ring.producer_lock);
 
-out:
 	rcu_read_unlock();
-	return ret;
+	return cnt - drops;
 }
 
 static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
@@ -1327,7 +1338,7 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
 	if (unlikely(!frame))
 		return -EOVERFLOW;
 
-	return tun_xdp_xmit(dev, frame);
+	return tun_xdp_xmit(dev, 1, &frame);
 }
 
 static void tun_xdp_flush(struct net_device *dev)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index f34794a76c4d..39a0783d1cde 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -419,23 +419,13 @@ static void virtnet_xdp_flush(struct net_device *dev)
 	virtqueue_kick(sq->vq);
 }
 
-static int __virtnet_xdp_xmit(struct virtnet_info *vi,
-			       struct xdp_frame *xdpf)
+static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
+				   struct send_queue *sq,
+				   struct xdp_frame *xdpf)
 {
 	struct virtio_net_hdr_mrg_rxbuf *hdr;
-	struct xdp_frame *xdpf_sent;
-	struct send_queue *sq;
-	unsigned int len;
-	unsigned int qp;
 	int err;
 
-	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
-	sq = &vi->sq[qp];
-
-	/* Free up any pending old buffers before queueing new ones. */
-	while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
-		xdp_return_frame(xdpf_sent);
-
 	/* virtqueue want to use data area in-front of packet */
 	if (unlikely(xdpf->metasize > 0))
 		return -EOPNOTSUPP;
@@ -459,11 +449,40 @@ static int __virtnet_xdp_xmit(struct virtnet_info *vi,
 	return 0;
 }
 
-static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
+static int __virtnet_xdp_tx_xmit(struct virtnet_info *vi,
+				   struct xdp_frame *xdpf)
+{
+	struct xdp_frame *xdpf_sent;
+	struct send_queue *sq;
+	unsigned int len;
+	unsigned int qp;
+
+	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
+	sq = &vi->sq[qp];
+
+	/* Free up any pending old buffers before queueing new ones. */
+	while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
+		xdp_return_frame(xdpf_sent);
+
+	return __virtnet_xdp_xmit_one(vi, sq, xdpf);
+}
+
+static int virtnet_xdp_xmit(struct net_device *dev,
+			    int n, struct xdp_frame **frames)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 	struct receive_queue *rq = vi->rq;
+	struct xdp_frame *xdpf_sent;
 	struct bpf_prog *xdp_prog;
+	struct send_queue *sq;
+	unsigned int len;
+	unsigned int qp;
+	int drops = 0;
+	int err;
+	int i;
+
+	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
+	sq = &vi->sq[qp];
 
 	/* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
 	 * indicate XDP resources have been successfully allocated.
@@ -472,7 +491,20 @@ static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
 	if (!xdp_prog)
 		return -ENXIO;
 
-	return __virtnet_xdp_xmit(vi, xdpf);
+	/* Free up any pending old buffers before queueing new ones. */
+	while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
+		xdp_return_frame(xdpf_sent);
+
+	for (i = 0; i < n; i++) {
+		struct xdp_frame *xdpf = frames[i];
+
+		err = __virtnet_xdp_xmit_one(vi, sq, xdpf);
+		if (err) {
+			xdp_return_frame_rx_napi(xdpf);
+			drops++;
+		}
+	}
+	return n - drops;
 }
 
 static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
@@ -616,7 +648,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
 			xdpf = convert_to_xdp_frame(&xdp);
 			if (unlikely(!xdpf))
 				goto err_xdp;
-			err = __virtnet_xdp_xmit(vi, xdpf);
+			err = __virtnet_xdp_tx_xmit(vi, xdpf);
 			if (unlikely(err)) {
 				trace_xdp_exception(vi->dev, xdp_prog, act);
 				goto err_xdp;
@@ -779,7 +811,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			xdpf = convert_to_xdp_frame(&xdp);
 			if (unlikely(!xdpf))
 				goto err_xdp;
-			err = __virtnet_xdp_xmit(vi, xdpf);
+			err = __virtnet_xdp_tx_xmit(vi, xdpf);
 			if (unlikely(err)) {
 				trace_xdp_exception(vi->dev, xdp_prog, act);
 				if (unlikely(xdp_page != page))
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 03ed492c4e14..debdb6286170 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1185,9 +1185,13 @@ struct dev_ifalias {
  *	This function is used to set or query state related to XDP on the
  *	netdevice and manage BPF offload. See definition of
  *	enum bpf_netdev_command for details.
- * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_frame *xdp);
- *	This function is used to submit a XDP packet for transmit on a
- *	netdevice.
+ * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp);
+ *	This function is used to submit @n XDP packets for transmit on a
+ *	netdevice. Returns number of frames successfully transmitted, frames
+ *	that got dropped are freed/returned via xdp_return_frame().
+ *	Returns negative number, means general error invoking ndo, meaning
+ *	no frames were xmit'ed and core-caller will free all frames.
+ *	TODO: Consider add flag to allow sending flush operation.
  * void (*ndo_xdp_flush)(struct net_device *dev);
  *	This function is used to inform the driver to flush a particular
  *	xdp tx queue. Must be called on same CPU as xdp_xmit.
@@ -1375,8 +1379,8 @@ struct net_device_ops {
 						       int needed_headroom);
 	int			(*ndo_bpf)(struct net_device *dev,
 					   struct netdev_bpf *bpf);
-	int			(*ndo_xdp_xmit)(struct net_device *dev,
-						struct xdp_frame *xdp);
+	int			(*ndo_xdp_xmit)(struct net_device *dev, int n,
+						struct xdp_frame **xdp);
 	void			(*ndo_xdp_flush)(struct net_device *dev);
 };
 
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index a9cd5c93dd2b..77908311ec98 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -232,24 +232,31 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 		prefetch(xdpf);
 	}
 
-	for (i = 0; i < bq->count; i++) {
-		struct xdp_frame *xdpf = bq->q[i];
-		int err;
-
-		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
-		if (err) {
-			drops++;
-			xdp_return_frame_rx_napi(xdpf);
-		} else {
-			sent++;
-		}
+	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q);
+	if (sent < 0) {
+		sent = 0;
+		goto error;
 	}
+	drops = bq->count - sent;
+out:
 	bq->count = 0;
 
 	trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
 			      sent, drops, bq->dev_rx, dev);
 	bq->dev_rx = NULL;
 	return 0;
+error:
+	/* If ndo_xdp_xmit fails with an errno, no frames have been
+	 * xmit'ed and it's our responsibility to them free all.
+	 */
+	for (i = 0; i < bq->count; i++) {
+		struct xdp_frame *xdpf = bq->q[i];
+
+		/* RX path under NAPI protection, can return frames faster */
+		xdp_return_frame_rx_napi(xdpf);
+		drops++;
+	}
+	goto out;
 }
 
 /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
diff --git a/net/core/filter.c b/net/core/filter.c
index 36cf2f87d742..1d75f9322275 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3039,7 +3039,7 @@ static int __bpf_tx_xdp(struct net_device *dev,
 			u32 index)
 {
 	struct xdp_frame *xdpf;
-	int err;
+	int sent;
 
 	if (!dev->netdev_ops->ndo_xdp_xmit) {
 		return -EOPNOTSUPP;
@@ -3049,9 +3049,9 @@ static int __bpf_tx_xdp(struct net_device *dev,
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
-	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
-	if (err)
-		return err;
+	sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf);
+	if (sent <= 0)
+		return sent;
 	dev->netdev_ops->ndo_xdp_flush(dev);
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From e74de52e55c092e7113f839e74400ce9dbe12ceb Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 24 May 2018 16:46:17 +0200
Subject: xdp/trace: extend tracepoint in devmap with an err

Extending tracepoint xdp:xdp_devmap_xmit in devmap with an err code
allow people to easier identify the reason behind the ndo_xdp_xmit
call to a given driver is failing.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/trace/events/xdp.h | 10 ++++++----
 kernel/bpf/devmap.c        |  5 +++--
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 2e9ef0650144..1ecf4c67fcf7 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -234,9 +234,9 @@ TRACE_EVENT(xdp_devmap_xmit,
 	TP_PROTO(const struct bpf_map *map, u32 map_index,
 		 int sent, int drops,
 		 const struct net_device *from_dev,
-		 const struct net_device *to_dev),
+		 const struct net_device *to_dev, int err),
 
-	TP_ARGS(map, map_index, sent, drops, from_dev, to_dev),
+	TP_ARGS(map, map_index, sent, drops, from_dev, to_dev, err),
 
 	TP_STRUCT__entry(
 		__field(int, map_id)
@@ -246,6 +246,7 @@ TRACE_EVENT(xdp_devmap_xmit,
 		__field(int, sent)
 		__field(int, from_ifindex)
 		__field(int, to_ifindex)
+		__field(int, err)
 	),
 
 	TP_fast_assign(
@@ -256,16 +257,17 @@ TRACE_EVENT(xdp_devmap_xmit,
 		__entry->sent		= sent;
 		__entry->from_ifindex	= from_dev->ifindex;
 		__entry->to_ifindex	= to_dev->ifindex;
+		__entry->err		= err;
 	),
 
 	TP_printk("ndo_xdp_xmit"
 		  " map_id=%d map_index=%d action=%s"
 		  " sent=%d drops=%d"
-		  " from_ifindex=%d to_ifindex=%d",
+		  " from_ifindex=%d to_ifindex=%d err=%d",
 		  __entry->map_id, __entry->map_index,
 		  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
 		  __entry->sent, __entry->drops,
-		  __entry->from_ifindex, __entry->to_ifindex)
+		  __entry->from_ifindex, __entry->to_ifindex, __entry->err)
 );
 
 #endif /* _TRACE_XDP_H */
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 77908311ec98..ae16d0c373ef 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -220,7 +220,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 			 struct xdp_bulk_queue *bq)
 {
 	struct net_device *dev = obj->dev;
-	int sent = 0, drops = 0;
+	int sent = 0, drops = 0, err = 0;
 	int i;
 
 	if (unlikely(!bq->count))
@@ -234,6 +234,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 
 	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q);
 	if (sent < 0) {
+		err = sent;
 		sent = 0;
 		goto error;
 	}
@@ -242,7 +243,7 @@ out:
 	bq->count = 0;
 
 	trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
-			      sent, drops, bq->dev_rx, dev);
+			      sent, drops, bq->dev_rx, dev, err);
 	bq->dev_rx = NULL;
 	return 0;
 error:
-- 
cgit v1.2.3-59-g8ed1b


From a570e48fee1bc26f47aba2e1493f96a03bed3c8f Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 24 May 2018 16:46:22 +0200
Subject: samples/bpf: xdp_monitor use err code from tracepoint
 xdp:xdp_devmap_xmit

Update xdp_monitor to use the recently added err code introduced
in tracepoint xdp:xdp_devmap_xmit, to show if the drop count is
caused by some driver general delivery problem.  Other kind of drops
will likely just be more normal TX space issues.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/xdp_monitor_kern.c | 10 ++++++++++
 samples/bpf/xdp_monitor_user.c | 35 ++++++++++++++++++++++++++++++-----
 2 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c
index 2854aa0665ea..ad10fe700d7d 100644
--- a/samples/bpf/xdp_monitor_kern.c
+++ b/samples/bpf/xdp_monitor_kern.c
@@ -125,6 +125,7 @@ struct datarec {
 	u64 processed;
 	u64 dropped;
 	u64 info;
+	u64 err;
 };
 #define MAX_CPUS 64
 
@@ -228,6 +229,7 @@ struct devmap_xmit_ctx {
 	int sent;		//	offset:24; size:4; signed:1;
 	int from_ifindex;	//	offset:28; size:4; signed:1;
 	int to_ifindex;		//	offset:32; size:4; signed:1;
+	int err;		//	offset:36; size:4; signed:1;
 };
 
 SEC("tracepoint/xdp/xdp_devmap_xmit")
@@ -245,5 +247,13 @@ int trace_xdp_devmap_xmit(struct devmap_xmit_ctx *ctx)
 	/* Record bulk events, then userspace can calc average bulk size */
 	rec->info += 1;
 
+	/* Record error cases, where no frame were sent */
+	if (ctx->err)
+		rec->err++;
+
+	/* Catch API error of drv ndo_xdp_xmit sent more than count */
+	if (ctx->drops < 0)
+		rec->err++;
+
 	return 1;
 }
diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c
index 7e18a454924c..dd558cbb2309 100644
--- a/samples/bpf/xdp_monitor_user.c
+++ b/samples/bpf/xdp_monitor_user.c
@@ -117,6 +117,7 @@ struct datarec {
 	__u64 processed;
 	__u64 dropped;
 	__u64 info;
+	__u64 err;
 };
 #define MAX_CPUS 64
 
@@ -152,6 +153,7 @@ static bool map_collect_record(int fd, __u32 key, struct record *rec)
 	__u64 sum_processed = 0;
 	__u64 sum_dropped = 0;
 	__u64 sum_info = 0;
+	__u64 sum_err = 0;
 	int i;
 
 	if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
@@ -170,10 +172,13 @@ static bool map_collect_record(int fd, __u32 key, struct record *rec)
 		sum_dropped        += values[i].dropped;
 		rec->cpu[i].info = values[i].info;
 		sum_info        += values[i].info;
+		rec->cpu[i].err = values[i].err;
+		sum_err        += values[i].err;
 	}
 	rec->total.processed = sum_processed;
 	rec->total.dropped   = sum_dropped;
 	rec->total.info      = sum_info;
+	rec->total.err       = sum_err;
 	return true;
 }
 
@@ -274,6 +279,18 @@ static double calc_info(struct datarec *r, struct datarec *p, double period)
 	return pps;
 }
 
+static double calc_err(struct datarec *r, struct datarec *p, double period)
+{
+	__u64 packets = 0;
+	double pps = 0;
+
+	if (period > 0) {
+		packets = r->err - p->err;
+		pps = packets / period;
+	}
+	return pps;
+}
+
 static void stats_print(struct stats_record *stats_rec,
 			struct stats_record *stats_prev,
 			bool err_only)
@@ -412,11 +429,12 @@ static void stats_print(struct stats_record *stats_rec,
 
 	/* devmap ndo_xdp_xmit stats */
 	{
-		char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.2f %s\n";
-		char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.2f %s\n";
+		char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.2f %s %s\n";
+		char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.2f %s %s\n";
 		struct record *rec, *prev;
-		double drop, info;
+		double drop, info, err;
 		char *i_str = "";
+		char *err_str = "";
 
 		rec  =  &stats_rec->xdp_devmap_xmit;
 		prev = &stats_prev->xdp_devmap_xmit;
@@ -428,22 +446,29 @@ static void stats_print(struct stats_record *stats_rec,
 			pps  = calc_pps(r, p, t);
 			drop = calc_drop(r, p, t);
 			info = calc_info(r, p, t);
+			err  = calc_err(r, p, t);
 			if (info > 0) {
 				i_str = "bulk-average";
 				info = (pps+drop) / info; /* calc avg bulk */
 			}
+			if (err > 0)
+				err_str = "drv-err";
 			if (pps > 0 || drop > 0)
 				printf(fmt1, "devmap-xmit",
-				       i, pps, drop, info, i_str);
+				       i, pps, drop, info, i_str, err_str);
 		}
 		pps = calc_pps(&rec->total, &prev->total, t);
 		drop = calc_drop(&rec->total, &prev->total, t);
 		info = calc_info(&rec->total, &prev->total, t);
+		err  = calc_err(&rec->total, &prev->total, t);
 		if (info > 0) {
 			i_str = "bulk-average";
 			info = (pps+drop) / info; /* calc avg bulk */
 		}
-		printf(fmt2, "devmap-xmit", "total", pps, drop, info, i_str);
+		if (err > 0)
+			err_str = "drv-err";
+		printf(fmt2, "devmap-xmit", "total", pps, drop,
+		       info, i_str, err_str);
 	}
 
 	printf("\n");
-- 
cgit v1.2.3-59-g8ed1b


From d624613e422d9bda9e4f066281b1f178ed51f0b1 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Tue, 22 May 2018 15:07:18 +0800
Subject: cxgb4: Check for kvzalloc allocation failure

t4_prep_fw doesn't check for card_fw pointer before store the read data,
which could lead to a NULL pointer dereference if kvzalloc failed.

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 513e1d356384..8405187f8e3f 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -4140,6 +4140,10 @@ static int adap_init0(struct adapter *adap)
 		 * card
 		 */
 		card_fw = kvzalloc(sizeof(*card_fw), GFP_KERNEL);
+		if (!card_fw) {
+			ret = -ENOMEM;
+			goto bye;
+		}
 
 		/* Get FW from from /lib/firmware/ */
 		ret = request_firmware(&fw, fw_info->fw_mod_name,
-- 
cgit v1.2.3-59-g8ed1b


From 6b45432d7876edab0873baa4e729d6f057469628 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Wed, 23 May 2018 18:34:56 +0200
Subject: selftests: forwarding: Test mirroring to deleted device

Tests that the mirroring code catches up with deletion of a mirrored-to
device.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../selftests/net/forwarding/mirror_gre_changes.sh | 38 ++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh b/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh
index 50ab3462af0c..a35fd55ff802 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh
@@ -12,6 +12,7 @@ ALL_TESTS="
 	test_tun_up
 	test_egress_up
 	test_remote_ip
+	test_tun_del
 "
 
 NUM_NETIFS=6
@@ -159,6 +160,35 @@ test_span_gre_remote_ip()
 	log_test "$what: remote address change ($tcflags)"
 }
 
+test_span_gre_tun_del()
+{
+	local tundev=$1; shift
+	local type=$1; shift
+	local flags=$1; shift
+	local local_ip=$1; shift
+	local remote_ip=$1; shift
+	local what=$1; shift
+
+	RET=0
+
+	mirror_install $swp1 ingress $tundev "matchall $tcflags"
+	quick_test_span_gre_dir $tundev ingress
+	ip link del dev $tundev
+	fail_test_span_gre_dir $tundev ingress
+
+	tunnel_create $tundev $type $local_ip $remote_ip \
+		      ttl 100 tos inherit $flags
+
+	# Recreating the tunnel doesn't reestablish mirroring, so reinstall it
+	# and verify it works for the follow-up tests.
+	mirror_uninstall $swp1 ingress
+	mirror_install $swp1 ingress $tundev "matchall $tcflags"
+	quick_test_span_gre_dir $tundev ingress
+	mirror_uninstall $swp1 ingress
+
+	log_test "$what: tunnel deleted ($tcflags)"
+}
+
 test_ttl()
 {
 	test_span_gre_ttl gt4 gretap ip "mirror to gretap"
@@ -183,6 +213,14 @@ test_remote_ip()
 	test_span_gre_remote_ip gt6 ip6gretap 2001:db8:2::2 2001:db8:2::4 "mirror to ip6gretap"
 }
 
+test_tun_del()
+{
+	test_span_gre_tun_del gt4 gretap "" \
+			      192.0.2.129 192.0.2.130 "mirror to gretap"
+	test_span_gre_tun_del gt6 ip6gretap allow-localremote \
+			      2001:db8:2::1 2001:db8:2::2 "mirror to ip6gretap"
+}
+
 test_all()
 {
 	slow_path_trap_install $swp1 ingress
-- 
cgit v1.2.3-59-g8ed1b


From 77a8df38100fc9b982063d4debcbbd9bfd460ed7 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Wed, 23 May 2018 18:35:01 +0200
Subject: selftests: forwarding: Test removal of underlay route

When underlay route is removed, the mirrored traffic should not be
forwarded.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../selftests/net/forwarding/mirror_gre_changes.sh | 30 ++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh b/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh
index a35fd55ff802..e22a9e44db24 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh
@@ -13,6 +13,7 @@ ALL_TESTS="
 	test_egress_up
 	test_remote_ip
 	test_tun_del
+	test_route_del
 "
 
 NUM_NETIFS=6
@@ -189,6 +190,29 @@ test_span_gre_tun_del()
 	log_test "$what: tunnel deleted ($tcflags)"
 }
 
+test_span_gre_route_del()
+{
+	local tundev=$1; shift
+	local edev=$1; shift
+	local route=$1; shift
+	local what=$1; shift
+
+	RET=0
+
+	mirror_install $swp1 ingress $tundev "matchall $tcflags"
+	quick_test_span_gre_dir $tundev ingress
+
+	ip route del $route dev $edev
+	fail_test_span_gre_dir $tundev ingress
+
+	ip route add $route dev $edev
+	quick_test_span_gre_dir $tundev ingress
+
+	mirror_uninstall $swp1 ingress
+
+	log_test "$what: underlay route removal ($tcflags)"
+}
+
 test_ttl()
 {
 	test_span_gre_ttl gt4 gretap ip "mirror to gretap"
@@ -221,6 +245,12 @@ test_tun_del()
 			      2001:db8:2::1 2001:db8:2::2 "mirror to ip6gretap"
 }
 
+test_route_del()
+{
+	test_span_gre_route_del gt4 $swp3 192.0.2.128/28 "mirror to gretap"
+	test_span_gre_route_del gt6 $swp3 2001:db8:2::/64 "mirror to ip6gretap"
+}
+
 test_all()
 {
 	slow_path_trap_install $swp1 ingress
-- 
cgit v1.2.3-59-g8ed1b


From a96d81a20b1a3b94d2bf82ee2e8da2d3dc86f972 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Wed, 23 May 2018 18:35:07 +0200
Subject: selftests: forwarding: Test removal of mirroring

Test that when flower-based mirror action is removed, mirroring stops.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/forwarding/mirror_gre_flower.sh | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_flower.sh b/tools/testing/selftests/net/forwarding/mirror_gre_flower.sh
index 2e54407d8954..12914f40612d 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_flower.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_flower.sh
@@ -67,6 +67,11 @@ test_span_gre_dir_acl()
 	test_span_gre_dir_ips "$@" 192.0.2.3 192.0.2.4
 }
 
+fail_test_span_gre_dir_acl()
+{
+	fail_test_span_gre_dir_ips "$@" 192.0.2.3 192.0.2.4
+}
+
 full_test_span_gre_dir_acl()
 {
 	local tundev=$1; shift
@@ -83,6 +88,9 @@ full_test_span_gre_dir_acl()
 			  "$forward_type" "$backward_type"
 	mirror_uninstall $swp1 $direction
 
+	# Test lack of mirroring after ACL mirror is uninstalled.
+	fail_test_span_gre_dir_acl "$tundev" "$direction"
+
 	log_test "$direction $what ($tcflags)"
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From c3f22415474d36194cb7785d9a236523bce0d829 Mon Sep 17 00:00:00 2001
From: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Date: Wed, 23 May 2018 13:37:55 -0500
Subject: ibmvnic: Mark NAPI flag as disabled when released

Set adapter NAPI state as disabled if they are removed. This will allow
them to be enabled again if reallocated in case of a hard reset.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 4bb4646a5f92..eabc1e445137 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -789,6 +789,7 @@ static void release_napi(struct ibmvnic_adapter *adapter)
 	kfree(adapter->napi);
 	adapter->napi = NULL;
 	adapter->num_active_rx_napi = 0;
+	adapter->napi_enabled = false;
 }
 
 static int ibmvnic_login(struct net_device *netdev)
-- 
cgit v1.2.3-59-g8ed1b


From 5153698e551b4b824e3d35da59914e1b4f815baa Mon Sep 17 00:00:00 2001
From: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Date: Wed, 23 May 2018 13:37:56 -0500
Subject: ibmvnic: Introduce active CRQ state

Introduce an "active" state for a IBM vNIC Command-Response Queue. A CRQ
is considered active once it has initialized or linked with its partner by
sending an initialization request and getting a successful response back
from the management partition.  Until this has happened, do not allow CRQ
commands to be sent other than the initialization request.

This change will avoid a protocol error in case of a device transport
event occurring during a initialization. When the driver receives a
transport event notification indicating that the backing hardware
has changed and needs reinitialization, any further commands other
than the initialization handshake with the VIOS management partition
will result in an invalid state error. Instead of sending a command
that will be returned with an error, print a warning and return an
error that will be handled by the caller.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 10 ++++++++++
 drivers/net/ethernet/ibm/ibmvnic.h |  1 +
 2 files changed, 11 insertions(+)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index eabc1e445137..e6a081c13661 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -3147,6 +3147,12 @@ static int ibmvnic_send_crq(struct ibmvnic_adapter *adapter,
 		   (unsigned long int)cpu_to_be64(u64_crq[0]),
 		   (unsigned long int)cpu_to_be64(u64_crq[1]));
 
+	if (!adapter->crq.active &&
+	    crq->generic.first != IBMVNIC_CRQ_INIT_CMD) {
+		dev_warn(dev, "Invalid request detected while CRQ is inactive, possible device state change during reset\n");
+		return -EINVAL;
+	}
+
 	/* Make sure the hypervisor sees the complete request */
 	mb();
 
@@ -4225,6 +4231,7 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq,
 			break;
 		case IBMVNIC_CRQ_INIT_COMPLETE:
 			dev_info(dev, "Partner initialization complete\n");
+			adapter->crq.active = true;
 			send_version_xchg(adapter);
 			break;
 		default:
@@ -4233,6 +4240,7 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq,
 		return;
 	case IBMVNIC_CRQ_XPORT_EVENT:
 		netif_carrier_off(netdev);
+		adapter->crq.active = false;
 		if (gen_crq->cmd == IBMVNIC_PARTITION_MIGRATED) {
 			dev_info(dev, "Migrated, re-enabling adapter\n");
 			ibmvnic_reset(adapter, VNIC_RESET_MOBILITY);
@@ -4420,6 +4428,7 @@ static int ibmvnic_reset_crq(struct ibmvnic_adapter *adapter)
 	/* Clean out the queue */
 	memset(crq->msgs, 0, PAGE_SIZE);
 	crq->cur = 0;
+	crq->active = false;
 
 	/* And re-open it again */
 	rc = plpar_hcall_norets(H_REG_CRQ, vdev->unit_address,
@@ -4454,6 +4463,7 @@ static void release_crq_queue(struct ibmvnic_adapter *adapter)
 			 DMA_BIDIRECTIONAL);
 	free_page((unsigned long)crq->msgs);
 	crq->msgs = NULL;
+	crq->active = false;
 }
 
 static int init_crq_queue(struct ibmvnic_adapter *adapter)
diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h
index 22391e8805f6..edfc312d3523 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.h
+++ b/drivers/net/ethernet/ibm/ibmvnic.h
@@ -865,6 +865,7 @@ struct ibmvnic_crq_queue {
 	int size, cur;
 	dma_addr_t msg_token;
 	spinlock_t lock;
+	bool active;
 };
 
 union sub_crq {
-- 
cgit v1.2.3-59-g8ed1b


From 9c4eaabd1bb392e30c2d40d60e60d68315cf0c1f Mon Sep 17 00:00:00 2001
From: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Date: Wed, 23 May 2018 13:37:57 -0500
Subject: ibmvnic: Check CRQ command return codes

Check whether CRQ command is successful before awaiting a response
from the management partition. If the command was not successful, the
driver may hang waiting for a response that will never come.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 51 +++++++++++++++++++++++++++-----------
 1 file changed, 37 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index e6a081c13661..70835198a2f9 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -109,8 +109,8 @@ static union sub_crq *ibmvnic_next_scrq(struct ibmvnic_adapter *,
 					struct ibmvnic_sub_crq_queue *);
 static int ibmvnic_poll(struct napi_struct *napi, int data);
 static void send_map_query(struct ibmvnic_adapter *adapter);
-static void send_request_map(struct ibmvnic_adapter *, dma_addr_t, __be32, u8);
-static void send_request_unmap(struct ibmvnic_adapter *, u8);
+static int send_request_map(struct ibmvnic_adapter *, dma_addr_t, __be32, u8);
+static int send_request_unmap(struct ibmvnic_adapter *, u8);
 static int send_login(struct ibmvnic_adapter *adapter);
 static void send_cap_queries(struct ibmvnic_adapter *adapter);
 static int init_sub_crqs(struct ibmvnic_adapter *);
@@ -172,6 +172,7 @@ static int alloc_long_term_buff(struct ibmvnic_adapter *adapter,
 				struct ibmvnic_long_term_buff *ltb, int size)
 {
 	struct device *dev = &adapter->vdev->dev;
+	int rc;
 
 	ltb->size = size;
 	ltb->buff = dma_alloc_coherent(dev, ltb->size, &ltb->addr,
@@ -185,8 +186,12 @@ static int alloc_long_term_buff(struct ibmvnic_adapter *adapter,
 	adapter->map_id++;
 
 	init_completion(&adapter->fw_done);
-	send_request_map(adapter, ltb->addr,
-			 ltb->size, ltb->map_id);
+	rc = send_request_map(adapter, ltb->addr,
+			      ltb->size, ltb->map_id);
+	if (rc) {
+		dma_free_coherent(dev, ltb->size, ltb->buff, ltb->addr);
+		return rc;
+	}
 	wait_for_completion(&adapter->fw_done);
 
 	if (adapter->fw_done_rc) {
@@ -215,10 +220,14 @@ static void free_long_term_buff(struct ibmvnic_adapter *adapter,
 static int reset_long_term_buff(struct ibmvnic_adapter *adapter,
 				struct ibmvnic_long_term_buff *ltb)
 {
+	int rc;
+
 	memset(ltb->buff, 0, ltb->size);
 
 	init_completion(&adapter->fw_done);
-	send_request_map(adapter, ltb->addr, ltb->size, ltb->map_id);
+	rc = send_request_map(adapter, ltb->addr, ltb->size, ltb->map_id);
+	if (rc)
+		return rc;
 	wait_for_completion(&adapter->fw_done);
 
 	if (adapter->fw_done_rc) {
@@ -952,6 +961,7 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter *adapter)
 	struct device *dev = &adapter->vdev->dev;
 	union ibmvnic_crq crq;
 	int len = 0;
+	int rc;
 
 	if (adapter->vpd->buff)
 		len = adapter->vpd->len;
@@ -959,7 +969,9 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter *adapter)
 	init_completion(&adapter->fw_done);
 	crq.get_vpd_size.first = IBMVNIC_CRQ_CMD;
 	crq.get_vpd_size.cmd = GET_VPD_SIZE;
-	ibmvnic_send_crq(adapter, &crq);
+	rc = ibmvnic_send_crq(adapter, &crq);
+	if (rc)
+		return rc;
 	wait_for_completion(&adapter->fw_done);
 
 	if (!adapter->vpd->len)
@@ -992,7 +1004,12 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter *adapter)
 	crq.get_vpd.cmd = GET_VPD;
 	crq.get_vpd.ioba = cpu_to_be32(adapter->vpd->dma_addr);
 	crq.get_vpd.len = cpu_to_be32((u32)adapter->vpd->len);
-	ibmvnic_send_crq(adapter, &crq);
+	rc = ibmvnic_send_crq(adapter, &crq);
+	if (rc) {
+		kfree(adapter->vpd->buff);
+		adapter->vpd->buff = NULL;
+		return rc;
+	}
 	wait_for_completion(&adapter->fw_done);
 
 	return 0;
@@ -1691,6 +1708,7 @@ static int __ibmvnic_set_mac(struct net_device *netdev, struct sockaddr *p)
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
 	struct sockaddr *addr = p;
 	union ibmvnic_crq crq;
+	int rc;
 
 	if (!is_valid_ether_addr(addr->sa_data))
 		return -EADDRNOTAVAIL;
@@ -1701,7 +1719,9 @@ static int __ibmvnic_set_mac(struct net_device *netdev, struct sockaddr *p)
 	ether_addr_copy(&crq.change_mac_addr.mac_addr[0], addr->sa_data);
 
 	init_completion(&adapter->fw_done);
-	ibmvnic_send_crq(adapter, &crq);
+	rc = ibmvnic_send_crq(adapter, &crq);
+	if (rc)
+		return rc;
 	wait_for_completion(&adapter->fw_done);
 	/* netdev->dev_addr is changed in handle_change_mac_rsp function */
 	return adapter->fw_done_rc ? -EIO : 0;
@@ -2365,6 +2385,7 @@ static void ibmvnic_get_ethtool_stats(struct net_device *dev,
 	struct ibmvnic_adapter *adapter = netdev_priv(dev);
 	union ibmvnic_crq crq;
 	int i, j;
+	int rc;
 
 	memset(&crq, 0, sizeof(crq));
 	crq.request_statistics.first = IBMVNIC_CRQ_CMD;
@@ -2375,7 +2396,9 @@ static void ibmvnic_get_ethtool_stats(struct net_device *dev,
 
 	/* Wait for data to be written */
 	init_completion(&adapter->stats_done);
-	ibmvnic_send_crq(adapter, &crq);
+	rc = ibmvnic_send_crq(adapter, &crq);
+	if (rc)
+		return;
 	wait_for_completion(&adapter->stats_done);
 
 	for (i = 0; i < ARRAY_SIZE(ibmvnic_stats); i++)
@@ -3377,8 +3400,8 @@ buf_alloc_failed:
 	return -1;
 }
 
-static void send_request_map(struct ibmvnic_adapter *adapter, dma_addr_t addr,
-			     u32 len, u8 map_id)
+static int send_request_map(struct ibmvnic_adapter *adapter, dma_addr_t addr,
+			    u32 len, u8 map_id)
 {
 	union ibmvnic_crq crq;
 
@@ -3388,10 +3411,10 @@ static void send_request_map(struct ibmvnic_adapter *adapter, dma_addr_t addr,
 	crq.request_map.map_id = map_id;
 	crq.request_map.ioba = cpu_to_be32(addr);
 	crq.request_map.len = cpu_to_be32(len);
-	ibmvnic_send_crq(adapter, &crq);
+	return ibmvnic_send_crq(adapter, &crq);
 }
 
-static void send_request_unmap(struct ibmvnic_adapter *adapter, u8 map_id)
+static int send_request_unmap(struct ibmvnic_adapter *adapter, u8 map_id)
 {
 	union ibmvnic_crq crq;
 
@@ -3399,7 +3422,7 @@ static void send_request_unmap(struct ibmvnic_adapter *adapter, u8 map_id)
 	crq.request_unmap.first = IBMVNIC_CRQ_CMD;
 	crq.request_unmap.cmd = REQUEST_UNMAP;
 	crq.request_unmap.map_id = map_id;
-	ibmvnic_send_crq(adapter, &crq);
+	return ibmvnic_send_crq(adapter, &crq);
 }
 
 static void send_map_query(struct ibmvnic_adapter *adapter)
-- 
cgit v1.2.3-59-g8ed1b


From 17c8705838a5acafbc77079c72378fc7e0f0a876 Mon Sep 17 00:00:00 2001
From: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Date: Wed, 23 May 2018 13:37:58 -0500
Subject: ibmvnic: Return error code if init interrupted by transport event

If device init is interrupted by a failover, set the init return
code so that it can be checked and handled appropriately by the
init routine.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 70835198a2f9..f1f744ebb14e 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -4249,7 +4249,10 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq,
 			dev_info(dev, "Partner initialized\n");
 			adapter->from_passive_init = true;
 			adapter->failover_pending = false;
-			complete(&adapter->init_done);
+			if (!completion_done(&adapter->init_done)) {
+				complete(&adapter->init_done);
+				adapter->init_done_rc = -EIO;
+			}
 			ibmvnic_reset(adapter, VNIC_RESET_FAILOVER);
 			break;
 		case IBMVNIC_CRQ_INIT_COMPLETE:
-- 
cgit v1.2.3-59-g8ed1b


From ab5ec33b9ac285d01a0e0fa94b0fdcb64262b928 Mon Sep 17 00:00:00 2001
From: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Date: Wed, 23 May 2018 13:37:59 -0500
Subject: ibmvnic: Handle error case when setting link state

If setting the link state is not successful, print a warning
with the resulting return code and return it to be handled
by the caller.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index f1f744ebb14e..b1bbd5bcc129 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -929,6 +929,10 @@ static int set_link_state(struct ibmvnic_adapter *adapter, u8 link_state)
 			/* Partuial success, delay and re-send */
 			mdelay(1000);
 			resend = true;
+		} else if (adapter->init_done_rc) {
+			netdev_warn(netdev, "Unable to set link state, rc=%d\n",
+				    adapter->init_done_rc);
+			return adapter->init_done_rc;
 		}
 	} while (resend);
 
-- 
cgit v1.2.3-59-g8ed1b


From 8a348450a036d690904ca54a476cbb4939ca8f95 Mon Sep 17 00:00:00 2001
From: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Date: Wed, 23 May 2018 13:38:00 -0500
Subject: ibmvnic: Create separate initialization routine for resets

Instead of having one initialization routine for all cases, create
a separate, simpler function for standard initialization, such as during
device probe. Use the original initialization function to handle
device reset scenarios. The goal of this patch is to avoid having
a single, cluttered init function to handle all possible
scenarios.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 48 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index b1bbd5bcc129..f26e1f893441 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -116,6 +116,7 @@ static void send_cap_queries(struct ibmvnic_adapter *adapter);
 static int init_sub_crqs(struct ibmvnic_adapter *);
 static int init_sub_crq_irqs(struct ibmvnic_adapter *adapter);
 static int ibmvnic_init(struct ibmvnic_adapter *);
+static int ibmvnic_reset_init(struct ibmvnic_adapter *);
 static void release_crq_queue(struct ibmvnic_adapter *);
 static int __ibmvnic_set_mac(struct net_device *netdev, struct sockaddr *p);
 static int init_crq_queue(struct ibmvnic_adapter *adapter);
@@ -1807,7 +1808,7 @@ static int do_reset(struct ibmvnic_adapter *adapter,
 			return rc;
 		}
 
-		rc = ibmvnic_init(adapter);
+		rc = ibmvnic_reset_init(adapter);
 		if (rc)
 			return IBMVNIC_INIT_FAILED;
 
@@ -4571,7 +4572,7 @@ map_failed:
 	return retrc;
 }
 
-static int ibmvnic_init(struct ibmvnic_adapter *adapter)
+static int ibmvnic_reset_init(struct ibmvnic_adapter *adapter)
 {
 	struct device *dev = &adapter->vdev->dev;
 	unsigned long timeout = msecs_to_jiffies(30000);
@@ -4630,6 +4631,49 @@ static int ibmvnic_init(struct ibmvnic_adapter *adapter)
 	return rc;
 }
 
+static int ibmvnic_init(struct ibmvnic_adapter *adapter)
+{
+	struct device *dev = &adapter->vdev->dev;
+	unsigned long timeout = msecs_to_jiffies(30000);
+	int rc;
+
+	adapter->from_passive_init = false;
+
+	init_completion(&adapter->init_done);
+	adapter->init_done_rc = 0;
+	ibmvnic_send_crq_init(adapter);
+	if (!wait_for_completion_timeout(&adapter->init_done, timeout)) {
+		dev_err(dev, "Initialization sequence timed out\n");
+		return -1;
+	}
+
+	if (adapter->init_done_rc) {
+		release_crq_queue(adapter);
+		return adapter->init_done_rc;
+	}
+
+	if (adapter->from_passive_init) {
+		adapter->state = VNIC_OPEN;
+		adapter->from_passive_init = false;
+		return -1;
+	}
+
+	rc = init_sub_crqs(adapter);
+	if (rc) {
+		dev_err(dev, "Initialization of sub crqs failed\n");
+		release_crq_queue(adapter);
+		return rc;
+	}
+
+	rc = init_sub_crq_irqs(adapter);
+	if (rc) {
+		dev_err(dev, "Failed to initialize sub crq irqs\n");
+		release_crq_queue(adapter);
+	}
+
+	return rc;
+}
+
 static struct device_attribute dev_attr_failover;
 
 static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
-- 
cgit v1.2.3-59-g8ed1b


From 06e43d7f9fe536621091ca5b87dc6b8498898226 Mon Sep 17 00:00:00 2001
From: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Date: Wed, 23 May 2018 13:38:01 -0500
Subject: ibmvnic: Set resetting state at earliest possible point

Set device resetting state at the earliest possible point: as soon as a
reset is successfully scheduled. The reset state is toggled off when
all resets have been processed to completion.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index f26e1f893441..ee51deba7a12 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1919,7 +1919,6 @@ static void __ibmvnic_reset(struct work_struct *work)
 	netdev = adapter->netdev;
 
 	mutex_lock(&adapter->reset_lock);
-	adapter->resetting = true;
 	reset_state = adapter->state;
 
 	rwi = get_next_rwi(adapter);
@@ -1994,7 +1993,7 @@ static int ibmvnic_reset(struct ibmvnic_adapter *adapter,
 	rwi->reset_reason = reason;
 	list_add_tail(&rwi->list, &adapter->rwi_list);
 	mutex_unlock(&adapter->rwi_lock);
-
+	adapter->resetting = true;
 	netdev_dbg(adapter->netdev, "Scheduling reset (reason %d)\n", reason);
 	schedule_work(&adapter->ibmvnic_reset);
 
-- 
cgit v1.2.3-59-g8ed1b


From 2770a7984db588913e11a6dfcfe3461dbba9b7b2 Mon Sep 17 00:00:00 2001
From: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Date: Wed, 23 May 2018 13:38:02 -0500
Subject: ibmvnic: Introduce hard reset recovery

Introduce a recovery hard reset to handle reset failure as a result of
change of device context following a transport event, such as a
backing device failover or partition migration. These operations reset
the device context to its initial state. If this occurs during a reset,
any initialization commands are likely to fail with an invalid state
error as backing device firmware requests reinitialization.

When this happens, make one more attempt by performing a hard reset,
which frees any resources currently allocated and performs device
initialization. If a transport event occurs during a device reset, a
flag is set which will trigger a new hard reset following the
completionof the current reset event.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 101 +++++++++++++++++++++++++++++++++++--
 drivers/net/ethernet/ibm/ibmvnic.h |   1 +
 2 files changed, 98 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index ee51deba7a12..09f8e6baf049 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1878,6 +1878,85 @@ static int do_reset(struct ibmvnic_adapter *adapter,
 	return 0;
 }
 
+static int do_hard_reset(struct ibmvnic_adapter *adapter,
+			 struct ibmvnic_rwi *rwi, u32 reset_state)
+{
+	struct net_device *netdev = adapter->netdev;
+	int rc;
+
+	netdev_dbg(adapter->netdev, "Hard resetting driver (%d)\n",
+		   rwi->reset_reason);
+
+	netif_carrier_off(netdev);
+	adapter->reset_reason = rwi->reset_reason;
+
+	ibmvnic_cleanup(netdev);
+	release_resources(adapter);
+	release_sub_crqs(adapter, 0);
+	release_crq_queue(adapter);
+
+	/* remove the closed state so when we call open it appears
+	 * we are coming from the probed state.
+	 */
+	adapter->state = VNIC_PROBED;
+
+	rc = init_crq_queue(adapter);
+	if (rc) {
+		netdev_err(adapter->netdev,
+			   "Couldn't initialize crq. rc=%d\n", rc);
+		return rc;
+	}
+
+	rc = ibmvnic_init(adapter);
+	if (rc)
+		return rc;
+
+	/* If the adapter was in PROBE state prior to the reset,
+	 * exit here.
+	 */
+	if (reset_state == VNIC_PROBED)
+		return 0;
+
+	rc = ibmvnic_login(netdev);
+	if (rc) {
+		adapter->state = VNIC_PROBED;
+		return 0;
+	}
+	/* netif_set_real_num_xx_queues needs to take rtnl lock here
+	 * unless wait_for_reset is set, in which case the rtnl lock
+	 * has already been taken before initializing the reset
+	 */
+	if (!adapter->wait_for_reset) {
+		rtnl_lock();
+		rc = init_resources(adapter);
+		rtnl_unlock();
+	} else {
+		rc = init_resources(adapter);
+	}
+	if (rc)
+		return rc;
+
+	ibmvnic_disable_irqs(adapter);
+	adapter->state = VNIC_CLOSED;
+
+	if (reset_state == VNIC_CLOSED)
+		return 0;
+
+	rc = __ibmvnic_open(netdev);
+	if (rc) {
+		if (list_empty(&adapter->rwi_list))
+			adapter->state = VNIC_CLOSED;
+		else
+			adapter->state = reset_state;
+
+		return 0;
+	}
+
+	netif_carrier_on(netdev);
+
+	return 0;
+}
+
 static struct ibmvnic_rwi *get_next_rwi(struct ibmvnic_adapter *adapter)
 {
 	struct ibmvnic_rwi *rwi;
@@ -1923,9 +2002,15 @@ static void __ibmvnic_reset(struct work_struct *work)
 
 	rwi = get_next_rwi(adapter);
 	while (rwi) {
-		rc = do_reset(adapter, rwi, reset_state);
+		if (adapter->force_reset_recovery) {
+			adapter->force_reset_recovery = false;
+			rc = do_hard_reset(adapter, rwi, reset_state);
+		} else {
+			rc = do_reset(adapter, rwi, reset_state);
+		}
 		kfree(rwi);
-		if (rc && rc != IBMVNIC_INIT_FAILED)
+		if (rc && rc != IBMVNIC_INIT_FAILED &&
+		    !adapter->force_reset_recovery)
 			break;
 
 		rwi = get_next_rwi(adapter);
@@ -1951,9 +2036,9 @@ static void __ibmvnic_reset(struct work_struct *work)
 static int ibmvnic_reset(struct ibmvnic_adapter *adapter,
 			 enum ibmvnic_reset_reason reason)
 {
+	struct list_head *entry, *tmp_entry;
 	struct ibmvnic_rwi *rwi, *tmp;
 	struct net_device *netdev = adapter->netdev;
-	struct list_head *entry;
 	int ret;
 
 	if (adapter->state == VNIC_REMOVING ||
@@ -1989,7 +2074,13 @@ static int ibmvnic_reset(struct ibmvnic_adapter *adapter,
 		ret = ENOMEM;
 		goto err;
 	}
-
+	/* if we just received a transport event,
+	 * flush reset queue and process this reset
+	 */
+	if (adapter->force_reset_recovery && !list_empty(&adapter->rwi_list)) {
+		list_for_each_safe(entry, tmp_entry, &adapter->rwi_list)
+			list_del(entry);
+	}
 	rwi->reset_reason = reason;
 	list_add_tail(&rwi->list, &adapter->rwi_list);
 	mutex_unlock(&adapter->rwi_lock);
@@ -4271,6 +4362,8 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq,
 	case IBMVNIC_CRQ_XPORT_EVENT:
 		netif_carrier_off(netdev);
 		adapter->crq.active = false;
+		if (adapter->resetting)
+			adapter->force_reset_recovery = true;
 		if (gen_crq->cmd == IBMVNIC_PARTITION_MIGRATED) {
 			dev_info(dev, "Migrated, re-enabling adapter\n");
 			ibmvnic_reset(adapter, VNIC_RESET_MOBILITY);
diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h
index edfc312d3523..f9fb780102ac 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.h
+++ b/drivers/net/ethernet/ibm/ibmvnic.h
@@ -1109,6 +1109,7 @@ struct ibmvnic_adapter {
 
 	bool mac_change_pending;
 	bool failover_pending;
+	bool force_reset_recovery;
 
 	struct ibmvnic_tunables desired;
 	struct ibmvnic_tunables fallback;
-- 
cgit v1.2.3-59-g8ed1b


From 74ed089d48a483374656ce7e4f006d758a3a6228 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 24 May 2018 16:27:10 +0200
Subject: selftests: forwarding: Split mirror_gre_topo_lib.sh

Move generic parts of mirror_gre_topo_lib.sh into a new file
mirror_topo_lib.sh. Reuse the functions in GRE topo, adding the tunnel
devices as necessary.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/forwarding/mirror_gre_topo_lib.sh          | 53 ++----------
 .../selftests/net/forwarding/mirror_topo_lib.sh    | 99 ++++++++++++++++++++++
 2 files changed, 108 insertions(+), 44 deletions(-)
 create mode 100644 tools/testing/selftests/net/forwarding/mirror_topo_lib.sh

diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_topo_lib.sh b/tools/testing/selftests/net/forwarding/mirror_gre_topo_lib.sh
index b3ceda2b4197..253419564708 100644
--- a/tools/testing/selftests/net/forwarding/mirror_gre_topo_lib.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_topo_lib.sh
@@ -33,29 +33,11 @@
 #   |                                                                         |
 #   +-------------------------------------------------------------------------+
 
-mirror_gre_topo_h1_create()
-{
-	simple_if_init $h1 192.0.2.1/28
-}
-
-mirror_gre_topo_h1_destroy()
-{
-	simple_if_fini $h1 192.0.2.1/28
-}
-
-mirror_gre_topo_h2_create()
-{
-	simple_if_init $h2 192.0.2.2/28
-}
-
-mirror_gre_topo_h2_destroy()
-{
-	simple_if_fini $h2 192.0.2.2/28
-}
+source mirror_topo_lib.sh
 
 mirror_gre_topo_h3_create()
 {
-	simple_if_init $h3
+	mirror_topo_h3_create
 
 	tunnel_create h3-gt4 gretap 192.0.2.130 192.0.2.129
 	ip link set h3-gt4 vrf v$h3
@@ -71,49 +53,32 @@ mirror_gre_topo_h3_destroy()
 	tunnel_destroy h3-gt6
 	tunnel_destroy h3-gt4
 
-	simple_if_fini $h3
+	mirror_topo_h3_destroy
 }
 
 mirror_gre_topo_switch_create()
 {
-	ip link set dev $swp3 up
-
-	ip link add name br1 type bridge vlan_filtering 1
-	ip link set dev br1 up
-
-	ip link set dev $swp1 master br1
-	ip link set dev $swp1 up
-
-	ip link set dev $swp2 master br1
-	ip link set dev $swp2 up
+	mirror_topo_switch_create
 
 	tunnel_create gt4 gretap 192.0.2.129 192.0.2.130 \
 		      ttl 100 tos inherit
 
 	tunnel_create gt6 ip6gretap 2001:db8:2::1 2001:db8:2::2 \
 		      ttl 100 tos inherit allow-localremote
-
-	tc qdisc add dev $swp1 clsact
 }
 
 mirror_gre_topo_switch_destroy()
 {
-	tc qdisc del dev $swp1 clsact
-
 	tunnel_destroy gt6
 	tunnel_destroy gt4
 
-	ip link set dev $swp1 down
-	ip link set dev $swp2 down
-	ip link del dev br1
-
-	ip link set dev $swp3 down
+	mirror_topo_switch_destroy
 }
 
 mirror_gre_topo_create()
 {
-	mirror_gre_topo_h1_create
-	mirror_gre_topo_h2_create
+	mirror_topo_h1_create
+	mirror_topo_h2_create
 	mirror_gre_topo_h3_create
 
 	mirror_gre_topo_switch_create
@@ -124,6 +89,6 @@ mirror_gre_topo_destroy()
 	mirror_gre_topo_switch_destroy
 
 	mirror_gre_topo_h3_destroy
-	mirror_gre_topo_h2_destroy
-	mirror_gre_topo_h1_destroy
+	mirror_topo_h2_destroy
+	mirror_topo_h1_destroy
 }
diff --git a/tools/testing/selftests/net/forwarding/mirror_topo_lib.sh b/tools/testing/selftests/net/forwarding/mirror_topo_lib.sh
new file mode 100644
index 000000000000..5b7879726bfb
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_topo_lib.sh
@@ -0,0 +1,99 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# This is the standard topology for testing mirroring. The tests that use it
+# tweak it in one way or another--typically add more devices to the topology.
+#
+#   +---------------------+                             +---------------------+
+#   | H1                  |                             |                  H2 |
+#   |     + $h1           |                             |           $h2 +     |
+#   |     | 192.0.2.1/28  |                             |  192.0.2.2/28 |     |
+#   +-----|---------------+                             +---------------|-----+
+#         |                                                             |
+#   +-----|-------------------------------------------------------------|-----+
+#   | SW  o--> mirror                                                   |     |
+#   | +---|-------------------------------------------------------------|---+ |
+#   | |   + $swp1                    BR                           $swp2 +   | |
+#   | +---------------------------------------------------------------------+ |
+#   |                                                                         |
+#   |     + $swp3                                                             |
+#   +-----|-------------------------------------------------------------------+
+#         |
+#   +-----|-------------------------------------------------------------------+
+#   | H3  + $h3                                                               |
+#   |                                                                         |
+#   +-------------------------------------------------------------------------+
+
+mirror_topo_h1_create()
+{
+	simple_if_init $h1 192.0.2.1/28
+}
+
+mirror_topo_h1_destroy()
+{
+	simple_if_fini $h1 192.0.2.1/28
+}
+
+mirror_topo_h2_create()
+{
+	simple_if_init $h2 192.0.2.2/28
+}
+
+mirror_topo_h2_destroy()
+{
+	simple_if_fini $h2 192.0.2.2/28
+}
+
+mirror_topo_h3_create()
+{
+	simple_if_init $h3
+}
+
+mirror_topo_h3_destroy()
+{
+	simple_if_fini $h3
+}
+
+mirror_topo_switch_create()
+{
+	ip link set dev $swp3 up
+
+	ip link add name br1 type bridge vlan_filtering 1
+	ip link set dev br1 up
+
+	ip link set dev $swp1 master br1
+	ip link set dev $swp1 up
+
+	ip link set dev $swp2 master br1
+	ip link set dev $swp2 up
+
+	tc qdisc add dev $swp1 clsact
+}
+
+mirror_topo_switch_destroy()
+{
+	tc qdisc del dev $swp1 clsact
+
+	ip link set dev $swp1 down
+	ip link set dev $swp2 down
+	ip link del dev br1
+
+	ip link set dev $swp3 down
+}
+
+mirror_topo_create()
+{
+	mirror_topo_h1_create
+	mirror_topo_h2_create
+	mirror_topo_h3_create
+
+	mirror_topo_switch_create
+}
+
+mirror_topo_destroy()
+{
+	mirror_topo_switch_destroy
+
+	mirror_topo_h3_destroy
+	mirror_topo_h2_destroy
+	mirror_topo_h1_destroy
+}
-- 
cgit v1.2.3-59-g8ed1b


From d5ea2bfc806a92bdeeed9a16e6dddfe44ebc37ec Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 24 May 2018 16:27:16 +0200
Subject: selftests: forwarding: mirror_gre_lib: Extract generic functions

For non-GRE mirroring tests, a functions along the lines of
do_test_span_gre_dir_ips() and test_span_gre_dir_ips() are necessary,
but such that they don't assume tunnels are involved. Extract the code
from mirror_gre_lib.sh to mirror_lib.sh and convert to just use a given
device without assuming it's named "h3-$tundev". Convert the two
above-mentioned functions to wrappers that pass along the correct device
name.

Add test_span_dir() and fail_test_span_dir() to round up the API for use
by following patches.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../selftests/net/forwarding/mirror_gre_lib.sh     | 41 ++++------------
 .../testing/selftests/net/forwarding/mirror_lib.sh | 54 ++++++++++++++++++++++
 2 files changed, 64 insertions(+), 31 deletions(-)

diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh b/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh
index 207ffd167dba..c7b2cdc77a6d 100644
--- a/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh
@@ -1,47 +1,26 @@
 # SPDX-License-Identifier: GPL-2.0
 
-do_test_span_gre_dir_ips()
-{
-	local expect=$1; shift
-	local tundev=$1; shift
-	local direction=$1; shift
-	local ip1=$1; shift
-	local ip2=$1; shift
-
-	icmp_capture_install h3-$tundev
-	mirror_test v$h1 $ip1 $ip2 h3-$tundev 100 $expect
-	mirror_test v$h2 $ip2 $ip1 h3-$tundev 100 $expect
-	icmp_capture_uninstall h3-$tundev
-}
+source mirror_lib.sh
 
 quick_test_span_gre_dir_ips()
 {
-	do_test_span_gre_dir_ips 10 "$@"
+	local tundev=$1; shift
+
+	do_test_span_dir_ips 10 h3-$tundev "$@"
 }
 
 fail_test_span_gre_dir_ips()
 {
-	do_test_span_gre_dir_ips 0 "$@"
+	local tundev=$1; shift
+
+	do_test_span_dir_ips 0 h3-$tundev "$@"
 }
 
 test_span_gre_dir_ips()
 {
 	local tundev=$1; shift
-	local direction=$1; shift
-	local forward_type=$1; shift
-	local backward_type=$1; shift
-	local ip1=$1; shift
-	local ip2=$1; shift
-
-	quick_test_span_gre_dir_ips "$tundev" "$direction" "$ip1" "$ip2"
-
-	icmp_capture_install h3-$tundev "type $forward_type"
-	mirror_test v$h1 $ip1 $ip2 h3-$tundev 100 10
-	icmp_capture_uninstall h3-$tundev
 
-	icmp_capture_install h3-$tundev "type $backward_type"
-	mirror_test v$h2 $ip2 $ip1 h3-$tundev 100 10
-	icmp_capture_uninstall h3-$tundev
+	test_span_dir_ips h3-$tundev "$@"
 }
 
 full_test_span_gre_dir_ips()
@@ -57,8 +36,8 @@ full_test_span_gre_dir_ips()
 	RET=0
 
 	mirror_install $swp1 $direction $tundev "matchall $tcflags"
-	test_span_gre_dir_ips "$tundev" "$direction" "$forward_type" \
-			      "$backward_type" "$ip1" "$ip2"
+	test_span_dir_ips "h3-$tundev" "$direction" "$forward_type" \
+			  "$backward_type" "$ip1" "$ip2"
 	mirror_uninstall $swp1 $direction
 
 	log_test "$direction $what ($tcflags)"
diff --git a/tools/testing/selftests/net/forwarding/mirror_lib.sh b/tools/testing/selftests/net/forwarding/mirror_lib.sh
index e5028a5725e3..04cbc38e71a2 100644
--- a/tools/testing/selftests/net/forwarding/mirror_lib.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_lib.sh
@@ -38,3 +38,57 @@ mirror_test()
 	((expect <= delta && delta <= expect + 2))
 	check_err $? "Expected to capture $expect packets, got $delta."
 }
+
+do_test_span_dir_ips()
+{
+	local expect=$1; shift
+	local dev=$1; shift
+	local direction=$1; shift
+	local ip1=$1; shift
+	local ip2=$1; shift
+
+	icmp_capture_install $dev
+	mirror_test v$h1 $ip1 $ip2 $dev 100 $expect
+	mirror_test v$h2 $ip2 $ip1 $dev 100 $expect
+	icmp_capture_uninstall $dev
+}
+
+quick_test_span_dir_ips()
+{
+	do_test_span_dir_ips 10 "$@"
+}
+
+fail_test_span_dir_ips()
+{
+	do_test_span_dir_ips 0 "$@"
+}
+
+test_span_dir_ips()
+{
+	local dev=$1; shift
+	local direction=$1; shift
+	local forward_type=$1; shift
+	local backward_type=$1; shift
+	local ip1=$1; shift
+	local ip2=$1; shift
+
+	quick_test_span_dir_ips "$dev" "$direction" "$ip1" "$ip2"
+
+	icmp_capture_install $dev "type $forward_type"
+	mirror_test v$h1 $ip1 $ip2 $dev 100 10
+	icmp_capture_uninstall $dev
+
+	icmp_capture_install $dev "type $backward_type"
+	mirror_test v$h2 $ip2 $ip1 $dev 100 10
+	icmp_capture_uninstall $dev
+}
+
+fail_test_span_dir()
+{
+	fail_test_span_dir_ips "$@" 192.0.2.1 192.0.2.2
+}
+
+test_span_dir()
+{
+	test_span_dir_ips "$@" 192.0.2.1 192.0.2.2
+}
-- 
cgit v1.2.3-59-g8ed1b


From 91bac7f9977a5d481f1a750c613727089d7cabc0 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 24 May 2018 16:27:21 +0200
Subject: selftests: forwarding: Add $h3's clsact to mirror_topo_lib.sh

Having a clsact qdisc on $h3 is useful in several tests, and will be
useful in more tests to come. Move the registration from all the tests
that need it into the topology file itself.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/forwarding/mirror_gre.sh         | 2 --
 tools/testing/selftests/net/forwarding/mirror_gre_changes.sh | 2 --
 tools/testing/selftests/net/forwarding/mirror_topo_lib.sh    | 2 ++
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/net/forwarding/mirror_gre.sh b/tools/testing/selftests/net/forwarding/mirror_gre.sh
index c6786d1b2b96..e6fd7a18c655 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre.sh
@@ -72,7 +72,6 @@ test_span_gre_mac()
 	RET=0
 
 	mirror_install $swp1 $direction $tundev "matchall $tcflags"
-	tc qdisc add dev $h3 clsact
 	tc filter add dev $h3 ingress pref 77 prot $prot \
 		flower ip_proto 0x2f src_mac $swp3mac dst_mac $h3mac \
 		action pass
@@ -80,7 +79,6 @@ test_span_gre_mac()
 	mirror_test v$h1 192.0.2.1 192.0.2.2 $h3 77 10
 
 	tc filter del dev $h3 ingress pref 77
-	tc qdisc del dev $h3 clsact
 	mirror_uninstall $swp1 $direction
 
 	log_test "$direction $what: envelope MAC ($tcflags)"
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh b/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh
index e22a9e44db24..aa29d46186a8 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh
@@ -73,7 +73,6 @@ test_span_gre_ttl()
 	RET=0
 
 	mirror_install $swp1 ingress $tundev "matchall $tcflags"
-	tc qdisc add dev $h3 clsact
 	tc filter add dev $h3 ingress pref 77 prot $prot \
 		flower ip_ttl 50 action pass
 
@@ -84,7 +83,6 @@ test_span_gre_ttl()
 
 	ip link set dev $tundev type $type ttl 100
 	tc filter del dev $h3 ingress pref 77
-	tc qdisc del dev $h3 clsact
 	mirror_uninstall $swp1 ingress
 
 	log_test "$what: TTL change ($tcflags)"
diff --git a/tools/testing/selftests/net/forwarding/mirror_topo_lib.sh b/tools/testing/selftests/net/forwarding/mirror_topo_lib.sh
index 5b7879726bfb..04979e5962e7 100644
--- a/tools/testing/selftests/net/forwarding/mirror_topo_lib.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_topo_lib.sh
@@ -46,10 +46,12 @@ mirror_topo_h2_destroy()
 mirror_topo_h3_create()
 {
 	simple_if_init $h3
+	tc qdisc add dev $h3 clsact
 }
 
 mirror_topo_h3_destroy()
 {
+	tc qdisc del dev $h3 clsact
 	simple_if_fini $h3
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 0e7a504c0942e6ad4e3dacae261af246f6f94d0a Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 24 May 2018 16:27:26 +0200
Subject: selftests: forwarding: lib: Support VLAN devices

Add vlan_create() and vlan_destroy() to manage VLAN netdevices.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/forwarding/lib.sh | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index 91041c49655b..63641ad55074 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -340,6 +340,31 @@ tunnel_destroy()
 	ip link del dev $name
 }
 
+vlan_create()
+{
+	local if_name=$1; shift
+	local vid=$1; shift
+	local vrf=$1; shift
+	local ips=("${@}")
+	local name=$if_name.$vid
+
+	ip link add name $name link $if_name type vlan id $vid
+	if [ "$vrf" != "" ]; then
+		ip link set dev $name master $vrf
+	fi
+	ip link set dev $name up
+	__addr_add_del $name add "${ips[@]}"
+}
+
+vlan_destroy()
+{
+	local if_name=$1; shift
+	local vid=$1; shift
+	local name=$if_name.$vid
+
+	ip link del dev $name
+}
+
 master_name_get()
 {
 	local if_name=$1
-- 
cgit v1.2.3-59-g8ed1b


From 1893150fd531a1dfacef36e4f43be6cc795c6f50 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 24 May 2018 16:27:31 +0200
Subject: selftests: forwarding: mirror_gre_lib: Support VLAN

Add full_test_span_gre_dir_vlan_ips() and full_test_span_gre_dir_vlan()
to support mirror-to-gre tests that involve VLAN.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../selftests/net/forwarding/mirror_gre_lib.sh     | 34 ++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh b/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh
index c7b2cdc77a6d..92ef6dde98bd 100644
--- a/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh
@@ -43,6 +43,35 @@ full_test_span_gre_dir_ips()
 	log_test "$direction $what ($tcflags)"
 }
 
+full_test_span_gre_dir_vlan_ips()
+{
+	local tundev=$1; shift
+	local direction=$1; shift
+	local vlan_match=$1; shift
+	local forward_type=$1; shift
+	local backward_type=$1; shift
+	local what=$1; shift
+	local ip1=$1; shift
+	local ip2=$1; shift
+
+	RET=0
+
+	mirror_install $swp1 $direction $tundev "matchall $tcflags"
+
+	test_span_dir_ips "h3-$tundev" "$direction" "$forward_type" \
+			  "$backward_type" "$ip1" "$ip2"
+
+	tc filter add dev $h3 ingress pref 77 prot 802.1q \
+		flower $vlan_match ip_proto 0x2f \
+		action pass
+	mirror_test v$h1 $ip1 $ip2 $h3 77 10
+	tc filter del dev $h3 ingress pref 77
+
+	mirror_uninstall $swp1 $direction
+
+	log_test "$direction $what ($tcflags)"
+}
+
 quick_test_span_gre_dir()
 {
 	quick_test_span_gre_dir_ips "$@" 192.0.2.1 192.0.2.2
@@ -62,3 +91,8 @@ full_test_span_gre_dir()
 {
 	full_test_span_gre_dir_ips "$@" 192.0.2.1 192.0.2.2
 }
+
+full_test_span_gre_dir_vlan()
+{
+	full_test_span_gre_dir_vlan_ips "$@" 192.0.2.1 192.0.2.2
+}
-- 
cgit v1.2.3-59-g8ed1b


From 87c0c046e80293af001af95a66c46491ee43130a Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 24 May 2018 16:27:35 +0200
Subject: selftests: forwarding: lib: Extract trap_{, un}install()

A mirror-to-vlan test that's coming next needs to install the trap
unconditionally. Therefore extract from slow_path_trap_{,un}install()
a more generic functions trap_install() and trap_uninstall(), and covert
the former two to conditional wrappers around these.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/forwarding/lib.sh | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index 63641ad55074..89ba4cdd4392 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -448,26 +448,35 @@ tc_offload_check()
 	return 0
 }
 
-slow_path_trap_install()
+trap_install()
 {
 	local dev=$1; shift
 	local direction=$1; shift
 
-	if [ "${tcflags/skip_hw}" != "$tcflags" ]; then
-		# For slow-path testing, we need to install a trap to get to
-		# slow path the packets that would otherwise be switched in HW.
-		tc filter add dev $dev $direction pref 1 \
-		   flower skip_sw action trap
-	fi
+	# For slow-path testing, we need to install a trap to get to
+	# slow path the packets that would otherwise be switched in HW.
+	tc filter add dev $dev $direction pref 1 flower skip_sw action trap
 }
 
-slow_path_trap_uninstall()
+trap_uninstall()
 {
 	local dev=$1; shift
 	local direction=$1; shift
 
+	tc filter del dev $dev $direction pref 1 flower skip_sw
+}
+
+slow_path_trap_install()
+{
+	if [ "${tcflags/skip_hw}" != "$tcflags" ]; then
+		trap_install "$@"
+	fi
+}
+
+slow_path_trap_uninstall()
+{
 	if [ "${tcflags/skip_hw}" != "$tcflags" ]; then
-		tc filter del dev $dev $direction pref 1 flower skip_sw
+		trap_uninstall "$@"
 	fi
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 35388a6a0c275390f64d26093175718946dd788e Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 24 May 2018 16:27:48 +0200
Subject: selftests: forwarding: Test mirror-to-vlan

Test for "tc action mirred egress mirror" that mirrors to a vlan device.
- test_vlan() tests that the packets get mirrored
- test_tagged_vlan() tests that the mirrored packets have correct inner
  VLAN tag.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../selftests/net/forwarding/mirror_vlan.sh        | 169 +++++++++++++++++++++
 1 file changed, 169 insertions(+)
 create mode 100755 tools/testing/selftests/net/forwarding/mirror_vlan.sh

diff --git a/tools/testing/selftests/net/forwarding/mirror_vlan.sh b/tools/testing/selftests/net/forwarding/mirror_vlan.sh
new file mode 100755
index 000000000000..1e10520dccf4
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_vlan.sh
@@ -0,0 +1,169 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test uses standard topology for testing mirroring. See mirror_topo_lib.sh
+# for more details.
+#
+# Test for "tc action mirred egress mirror" that mirrors to a vlan device.
+
+ALL_TESTS="
+	test_vlan
+	test_tagged_vlan
+"
+
+NUM_NETIFS=6
+source lib.sh
+source mirror_lib.sh
+source mirror_topo_lib.sh
+
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	swp2=${NETIFS[p3]}
+	h2=${NETIFS[p4]}
+
+	swp3=${NETIFS[p5]}
+	h3=${NETIFS[p6]}
+
+	vrf_prepare
+	mirror_topo_create
+
+	vlan_create $swp3 555
+
+	vlan_create $h3 555 v$h3
+	matchall_sink_create $h3.555
+
+	vlan_create $h1 111 v$h1 192.0.2.17/28
+	bridge vlan add dev $swp1 vid 111
+
+	vlan_create $h2 111 v$h2 192.0.2.18/28
+	bridge vlan add dev $swp2 vid 111
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	vlan_destroy $h2 111
+	vlan_destroy $h1 111
+	vlan_destroy $h3 555
+	vlan_destroy $swp3 555
+
+	mirror_topo_destroy
+	vrf_cleanup
+}
+
+test_vlan_dir()
+{
+	local direction=$1; shift
+	local forward_type=$1; shift
+	local backward_type=$1; shift
+
+	RET=0
+
+	mirror_install $swp1 $direction $swp3.555 "matchall $tcflags"
+	test_span_dir "$h3.555" "$direction" "$forward_type" "$backward_type"
+	mirror_uninstall $swp1 $direction
+
+	log_test "$direction mirror to vlan ($tcflags)"
+}
+
+test_vlan()
+{
+	test_vlan_dir ingress 8 0
+	test_vlan_dir egress 0 8
+}
+
+vlan_capture_add_del()
+{
+	local add_del=$1; shift
+	local pref=$1; shift
+	local dev=$1; shift
+	local filter=$1; shift
+
+	tc filter $add_del dev "$dev" ingress \
+	   proto 802.1q pref $pref \
+	   flower $filter \
+	   action pass
+}
+
+vlan_capture_install()
+{
+	vlan_capture_add_del add 100 "$@"
+}
+
+vlan_capture_uninstall()
+{
+	vlan_capture_add_del del 100 "$@"
+}
+
+do_test_span_vlan_dir_ips()
+{
+	local expect=$1; shift
+	local dev=$1; shift
+	local vid=$1; shift
+	local direction=$1; shift
+	local ip1=$1; shift
+	local ip2=$1; shift
+
+	vlan_capture_install $dev "vlan_id $vid"
+	mirror_test v$h1 $ip1 $ip2 $dev 100 $expect
+	mirror_test v$h2 $ip2 $ip1 $dev 100 $expect
+	vlan_capture_uninstall $dev
+}
+
+test_tagged_vlan_dir()
+{
+	local direction=$1; shift
+	local forward_type=$1; shift
+	local backward_type=$1; shift
+
+	RET=0
+
+	mirror_install $swp1 $direction $swp3.555 "matchall $tcflags"
+	do_test_span_vlan_dir_ips 10 "$h3.555" 111 "$direction" \
+				  192.0.2.17 192.0.2.18
+	do_test_span_vlan_dir_ips  0 "$h3.555" 555 "$direction" \
+				  192.0.2.17 192.0.2.18
+	mirror_uninstall $swp1 $direction
+
+	log_test "$direction mirror to vlan ($tcflags)"
+}
+
+test_tagged_vlan()
+{
+	test_tagged_vlan_dir ingress 8 0
+	test_tagged_vlan_dir egress 0 8
+}
+
+test_all()
+{
+	slow_path_trap_install $swp1 ingress
+	slow_path_trap_install $swp1 egress
+	trap_install $h3 ingress
+
+	tests_run
+
+	trap_install $h3 ingress
+	slow_path_trap_uninstall $swp1 egress
+	slow_path_trap_uninstall $swp1 ingress
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tcflags="skip_hw"
+test_all
+
+if ! tc_offload_check; then
+	echo "WARN: Could not test offloaded functionality"
+else
+	tcflags="skip_sw"
+	test_all
+fi
+
+exit $EXIT_STATUS
-- 
cgit v1.2.3-59-g8ed1b


From 0056042f80dbcb3e99b150e34c883bffcfae9623 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 24 May 2018 16:27:55 +0200
Subject: selftests: forwarding: Test mirror-to-gre w/ UL VLAN+802.1q

Test for "tc action mirred egress mirror" that mirrors to GRE when the
underlay route points at a vlan device on top of a bridge device with
vlan filtering (802.1q).

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/forwarding/mirror_gre_vlan_bridge_1q.sh    | 140 +++++++++++++++++++++
 1 file changed, 140 insertions(+)
 create mode 100755 tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh

diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh b/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh
new file mode 100755
index 000000000000..01ec28ac2e4a
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh
@@ -0,0 +1,140 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test uses standard topology for testing gretap. See
+# mirror_gre_topo_lib.sh for more details.
+#
+# Test for "tc action mirred egress mirror" when the underlay route points at a
+# vlan device on top of a bridge device with vlan filtering (802.1q).
+
+ALL_TESTS="
+	test_gretap
+	test_ip6gretap
+	test_gretap_forbidden
+	test_ip6gretap_forbidden
+"
+
+NUM_NETIFS=6
+source lib.sh
+source mirror_lib.sh
+source mirror_gre_lib.sh
+source mirror_gre_topo_lib.sh
+
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	swp2=${NETIFS[p3]}
+	h2=${NETIFS[p4]}
+
+	swp3=${NETIFS[p5]}
+	h3=${NETIFS[p6]}
+
+	vrf_prepare
+	mirror_gre_topo_create
+
+	vlan_create br1 555 "" 192.0.2.129/32 2001:db8:2::1/128
+	bridge vlan add dev br1 vid 555 self
+	ip route rep 192.0.2.130/32 dev br1.555
+	ip -6 route rep 2001:db8:2::2/128 dev br1.555
+
+	vlan_create $h3 555 v$h3 192.0.2.130/28 2001:db8:2::2/64
+
+	ip link set dev $swp3 master br1
+	bridge vlan add dev $swp3 vid 555
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	ip link set dev $swp3 nomaster
+	vlan_destroy $h3 555
+	vlan_destroy br1 555
+
+	mirror_gre_topo_destroy
+	vrf_cleanup
+}
+
+test_vlan_match()
+{
+	local tundev=$1; shift
+	local vlan_match=$1; shift
+	local what=$1; shift
+
+	full_test_span_gre_dir_vlan $tundev ingress "$vlan_match" 8 0 "$what"
+	full_test_span_gre_dir_vlan $tundev egress "$vlan_match" 0 8 "$what"
+}
+
+test_gretap()
+{
+	test_vlan_match gt4 'vlan_id 555 vlan_ethtype ip' "mirror to gretap"
+}
+
+test_ip6gretap()
+{
+	test_vlan_match gt6 'vlan_id 555 vlan_ethtype ipv6' "mirror to ip6gretap"
+}
+
+test_span_gre_forbidden()
+{
+	local tundev=$1; shift
+	local what=$1; shift
+
+	RET=0
+
+	# Run the pass-test first, to prime neighbor table.
+	mirror_install $swp1 ingress $tundev "matchall $tcflags"
+	quick_test_span_gre_dir $tundev ingress
+
+	# Now forbid the VLAN at the bridge and see it fail.
+	bridge vlan del dev br1 vid 555 self
+	sleep 1
+
+	fail_test_span_gre_dir $tundev ingress
+	mirror_uninstall $swp1 ingress
+
+	bridge vlan add dev br1 vid 555 self
+	sleep 1
+
+	log_test "$what: vlan forbidden at a bridge ($tcflags)"
+}
+
+test_gretap_forbidden()
+{
+	test_span_gre_forbidden gt4 "mirror to gretap"
+}
+
+test_ip6gretap_forbidden()
+{
+	test_span_gre_forbidden gt4 "mirror to ip6gretap"
+}
+
+test_all()
+{
+	slow_path_trap_install $swp1 ingress
+	slow_path_trap_install $swp1 egress
+
+	tests_run
+
+	slow_path_trap_uninstall $swp1 egress
+	slow_path_trap_uninstall $swp1 ingress
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tcflags="skip_hw"
+test_all
+
+if ! tc_offload_check; then
+	echo "WARN: Could not test offloaded functionality"
+else
+	tcflags="skip_sw"
+	test_all
+fi
+
+exit $EXIT_STATUS
-- 
cgit v1.2.3-59-g8ed1b


From a08fb9f1ad8d8ee4a3dfa85b2182553543c0c301 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 24 May 2018 16:28:01 +0200
Subject: selftests: forwarding: Test mirror-to-gre w/ UL VLAN

Test for "tc action mirred egress mirror" that mirrors to a gretap
netdevice whose underlay route points at a vlan device.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../selftests/net/forwarding/mirror_gre_vlan.sh    | 92 ++++++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100755 tools/testing/selftests/net/forwarding/mirror_gre_vlan.sh

diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_vlan.sh b/tools/testing/selftests/net/forwarding/mirror_gre_vlan.sh
new file mode 100755
index 000000000000..88cecdb9a861
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_vlan.sh
@@ -0,0 +1,92 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test uses standard topology for testing gretap. See
+# mirror_gre_topo_lib.sh for more details.
+#
+# Test for "tc action mirred egress mirror" that mirrors to a gretap netdevice
+# whose underlay route points at a vlan device.
+
+ALL_TESTS="
+	test_gretap
+"
+
+NUM_NETIFS=6
+source lib.sh
+source mirror_lib.sh
+source mirror_gre_lib.sh
+source mirror_gre_topo_lib.sh
+
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	swp2=${NETIFS[p3]}
+	h2=${NETIFS[p4]}
+
+	swp3=${NETIFS[p5]}
+	h3=${NETIFS[p6]}
+
+	vrf_prepare
+	mirror_gre_topo_create
+
+	ip link add name $swp3.555 link $swp3 type vlan id 555
+	ip address add dev $swp3.555 192.0.2.129/32
+	ip address add dev $swp3.555 2001:db8:2::1/128
+	ip link set dev $swp3.555 up
+
+	ip route add 192.0.2.130/32 dev $swp3.555
+	ip -6 route add 2001:db8:2::2/128 dev $swp3.555
+
+	ip link add name $h3.555 link $h3 type vlan id 555
+	ip link set dev $h3.555 master v$h3
+	ip address add dev $h3.555 192.0.2.130/28
+	ip address add dev $h3.555 2001:db8:2::2/64
+	ip link set dev $h3.555 up
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	ip link del dev $h3.555
+	ip link del dev $swp3.555
+
+	mirror_gre_topo_destroy
+	vrf_cleanup
+}
+
+test_gretap()
+{
+	full_test_span_gre_dir gt4 ingress 8 0 "mirror to gretap"
+	full_test_span_gre_dir gt4 egress 0 8 "mirror to gretap"
+}
+
+test_all()
+{
+	slow_path_trap_install $swp1 ingress
+	slow_path_trap_install $swp1 egress
+
+	tests_run
+
+	slow_path_trap_uninstall $swp1 egress
+	slow_path_trap_uninstall $swp1 ingress
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tcflags="skip_hw"
+test_all
+
+if ! tc_offload_check; then
+	echo "WARN: Could not test offloaded functionality"
+else
+	tcflags="skip_sw"
+	test_all
+fi
+
+exit $EXIT_STATUS
-- 
cgit v1.2.3-59-g8ed1b


From 181d95f8e145d0dc755bcfb39d9e28c93a0c3966 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 24 May 2018 16:28:06 +0200
Subject: selftests: forwarding: Test mirror-to-gre w/ UL 802.1d+VLAN

Test for "tc action mirred egress mirror" that mirrors to GRE when the
underlay route points at an 802.1d bridge and packet egresses through a
VLAN device.

Besides testing basic connectivity, this also tests that the traffic is
properly tagged.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/forwarding/mirror_gre_bridge_1d_vlan.sh    | 109 +++++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100755 tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh

diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh
new file mode 100755
index 000000000000..3d47afcf7fb9
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh
@@ -0,0 +1,109 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test uses standard topology for testing gretap. See
+# mirror_gre_topo_lib.sh for more details.
+#
+# Test for "tc action mirred egress mirror" when the underlay route points at a
+# bridge device without vlan filtering (802.1d). The device attached to that
+# bridge is a VLAN.
+
+ALL_TESTS="
+	test_gretap
+	test_ip6gretap
+"
+
+NUM_NETIFS=6
+source lib.sh
+source mirror_lib.sh
+source mirror_gre_lib.sh
+source mirror_gre_topo_lib.sh
+
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	swp2=${NETIFS[p3]}
+	h2=${NETIFS[p4]}
+
+	swp3=${NETIFS[p5]}
+	h3=${NETIFS[p6]}
+
+	vrf_prepare
+	mirror_gre_topo_create
+
+	ip link add name br2 type bridge vlan_filtering 0
+	ip link set dev br2 up
+
+	vlan_create $swp3 555
+
+	ip link set dev $swp3.555 master br2
+	ip route add 192.0.2.130/32 dev br2
+	ip -6 route add 2001:db8:2::2/128 dev br2
+
+	ip address add dev br2 192.0.2.129/32
+	ip address add dev br2 2001:db8:2::1/128
+
+	vlan_create $h3 555 v$h3 192.0.2.130/28 2001:db8:2::2/64
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	vlan_destroy $h3 555
+	ip link del dev br2
+	vlan_destroy $swp3 555
+
+	mirror_gre_topo_destroy
+	vrf_cleanup
+}
+
+test_vlan_match()
+{
+	local tundev=$1; shift
+	local vlan_match=$1; shift
+	local what=$1; shift
+
+	full_test_span_gre_dir_vlan $tundev ingress "$vlan_match" 8 0 "$what"
+	full_test_span_gre_dir_vlan $tundev egress "$vlan_match" 0 8 "$what"
+}
+
+test_gretap()
+{
+	test_vlan_match gt4 'vlan_id 555 vlan_ethtype ip' "mirror to gretap"
+}
+
+test_ip6gretap()
+{
+	test_vlan_match gt6 'vlan_id 555 vlan_ethtype ipv6' "mirror to ip6gretap"
+}
+
+test_all()
+{
+	slow_path_trap_install $swp1 ingress
+	slow_path_trap_install $swp1 egress
+
+	tests_run
+
+	slow_path_trap_uninstall $swp1 egress
+	slow_path_trap_uninstall $swp1 ingress
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tcflags="skip_hw"
+test_all
+
+if ! tc_offload_check; then
+	echo "WARN: Could not test offloaded functionality"
+else
+	tcflags="skip_sw"
+	test_all
+fi
+
+exit $EXIT_STATUS
-- 
cgit v1.2.3-59-g8ed1b


From aaa908ffbee18a65529b716efb346a626e81559a Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Wed, 23 May 2018 15:26:53 -0700
Subject: net_sched: switch to rcu_work

Commit 05f0fe6b74db ("RCU, workqueue: Implement rcu_work") introduces
new API's for dispatching work in a RCU callback. Now we can just
switch to the new API's for tc filters. This could get rid of a lot
of code.

Cc: Tejun Heo <tj@kernel.org>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h    |  2 +-
 net/sched/cls_api.c      |  5 +++--
 net/sched/cls_basic.c    | 24 +++++++-----------------
 net/sched/cls_bpf.c      | 22 ++++++----------------
 net/sched/cls_cgroup.c   | 23 +++++------------------
 net/sched/cls_flow.c     | 24 +++++++-----------------
 net/sched/cls_flower.c   | 40 ++++++++++------------------------------
 net/sched/cls_fw.c       | 24 +++++++-----------------
 net/sched/cls_matchall.c | 21 +++++----------------
 net/sched/cls_route.c    | 23 +++++++++--------------
 net/sched/cls_rsvp.h     | 20 +++++---------------
 net/sched/cls_tcindex.c  | 41 ++++++++++-------------------------------
 net/sched/cls_u32.c      | 37 ++++++++++---------------------------
 13 files changed, 85 insertions(+), 221 deletions(-)

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 0005f0b40fe9..f3ec43725724 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -33,7 +33,7 @@ struct tcf_block_ext_info {
 };
 
 struct tcf_block_cb;
-bool tcf_queue_work(struct work_struct *work);
+bool tcf_queue_work(struct rcu_work *rwork, work_func_t func);
 
 #ifdef CONFIG_NET_CLS
 struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index,
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 963e4bf0aab8..a4a5ace834c3 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -103,9 +103,10 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
 }
 EXPORT_SYMBOL(unregister_tcf_proto_ops);
 
-bool tcf_queue_work(struct work_struct *work)
+bool tcf_queue_work(struct rcu_work *rwork, work_func_t func)
 {
-	return queue_work(tc_filter_wq, work);
+	INIT_RCU_WORK(rwork, func);
+	return queue_rcu_work(tc_filter_wq, rwork);
 }
 EXPORT_SYMBOL(tcf_queue_work);
 
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index 6b7ab3512f5b..95367f37098d 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -35,10 +35,7 @@ struct basic_filter {
 	struct tcf_result	res;
 	struct tcf_proto	*tp;
 	struct list_head	link;
-	union {
-		struct work_struct	work;
-		struct rcu_head		rcu;
-	};
+	struct rcu_work		rwork;
 };
 
 static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp,
@@ -97,21 +94,14 @@ static void __basic_delete_filter(struct basic_filter *f)
 
 static void basic_delete_filter_work(struct work_struct *work)
 {
-	struct basic_filter *f = container_of(work, struct basic_filter, work);
-
+	struct basic_filter *f = container_of(to_rcu_work(work),
+					      struct basic_filter,
+					      rwork);
 	rtnl_lock();
 	__basic_delete_filter(f);
 	rtnl_unlock();
 }
 
-static void basic_delete_filter(struct rcu_head *head)
-{
-	struct basic_filter *f = container_of(head, struct basic_filter, rcu);
-
-	INIT_WORK(&f->work, basic_delete_filter_work);
-	tcf_queue_work(&f->work);
-}
-
 static void basic_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 {
 	struct basic_head *head = rtnl_dereference(tp->root);
@@ -122,7 +112,7 @@ static void basic_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 		tcf_unbind_filter(tp, &f->res);
 		idr_remove(&head->handle_idr, f->handle);
 		if (tcf_exts_get_net(&f->exts))
-			call_rcu(&f->rcu, basic_delete_filter);
+			tcf_queue_work(&f->rwork, basic_delete_filter_work);
 		else
 			__basic_delete_filter(f);
 	}
@@ -140,7 +130,7 @@ static int basic_delete(struct tcf_proto *tp, void *arg, bool *last,
 	tcf_unbind_filter(tp, &f->res);
 	idr_remove(&head->handle_idr, f->handle);
 	tcf_exts_get_net(&f->exts);
-	call_rcu(&f->rcu, basic_delete_filter);
+	tcf_queue_work(&f->rwork, basic_delete_filter_work);
 	*last = list_empty(&head->flist);
 	return 0;
 }
@@ -234,7 +224,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb,
 		list_replace_rcu(&fold->link, &fnew->link);
 		tcf_unbind_filter(tp, &fold->res);
 		tcf_exts_get_net(&fold->exts);
-		call_rcu(&fold->rcu, basic_delete_filter);
+		tcf_queue_work(&fold->rwork, basic_delete_filter_work);
 	} else {
 		list_add_rcu(&fnew->link, &head->flist);
 	}
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index b07c1fa8bc0d..1aa7f6511065 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -49,10 +49,7 @@ struct cls_bpf_prog {
 	struct sock_filter *bpf_ops;
 	const char *bpf_name;
 	struct tcf_proto *tp;
-	union {
-		struct work_struct work;
-		struct rcu_head rcu;
-	};
+	struct rcu_work rwork;
 };
 
 static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
@@ -275,21 +272,14 @@ static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog)
 
 static void cls_bpf_delete_prog_work(struct work_struct *work)
 {
-	struct cls_bpf_prog *prog = container_of(work, struct cls_bpf_prog, work);
-
+	struct cls_bpf_prog *prog = container_of(to_rcu_work(work),
+						 struct cls_bpf_prog,
+						 rwork);
 	rtnl_lock();
 	__cls_bpf_delete_prog(prog);
 	rtnl_unlock();
 }
 
-static void cls_bpf_delete_prog_rcu(struct rcu_head *rcu)
-{
-	struct cls_bpf_prog *prog = container_of(rcu, struct cls_bpf_prog, rcu);
-
-	INIT_WORK(&prog->work, cls_bpf_delete_prog_work);
-	tcf_queue_work(&prog->work);
-}
-
 static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog,
 			     struct netlink_ext_ack *extack)
 {
@@ -300,7 +290,7 @@ static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog,
 	list_del_rcu(&prog->link);
 	tcf_unbind_filter(tp, &prog->res);
 	if (tcf_exts_get_net(&prog->exts))
-		call_rcu(&prog->rcu, cls_bpf_delete_prog_rcu);
+		tcf_queue_work(&prog->rwork, cls_bpf_delete_prog_work);
 	else
 		__cls_bpf_delete_prog(prog);
 }
@@ -526,7 +516,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
 		list_replace_rcu(&oldprog->link, &prog->link);
 		tcf_unbind_filter(tp, &oldprog->res);
 		tcf_exts_get_net(&oldprog->exts);
-		call_rcu(&oldprog->rcu, cls_bpf_delete_prog_rcu);
+		tcf_queue_work(&oldprog->rwork, cls_bpf_delete_prog_work);
 	} else {
 		list_add_rcu(&prog->link, &head->plist);
 	}
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 762da5c0cf5e..3bc01bdde165 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -23,10 +23,7 @@ struct cls_cgroup_head {
 	struct tcf_exts		exts;
 	struct tcf_ematch_tree	ematches;
 	struct tcf_proto	*tp;
-	union {
-		struct work_struct	work;
-		struct rcu_head		rcu;
-	};
+	struct rcu_work		rwork;
 };
 
 static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp,
@@ -70,24 +67,14 @@ static void __cls_cgroup_destroy(struct cls_cgroup_head *head)
 
 static void cls_cgroup_destroy_work(struct work_struct *work)
 {
-	struct cls_cgroup_head *head = container_of(work,
+	struct cls_cgroup_head *head = container_of(to_rcu_work(work),
 						    struct cls_cgroup_head,
-						    work);
+						    rwork);
 	rtnl_lock();
 	__cls_cgroup_destroy(head);
 	rtnl_unlock();
 }
 
-static void cls_cgroup_destroy_rcu(struct rcu_head *root)
-{
-	struct cls_cgroup_head *head = container_of(root,
-						    struct cls_cgroup_head,
-						    rcu);
-
-	INIT_WORK(&head->work, cls_cgroup_destroy_work);
-	tcf_queue_work(&head->work);
-}
-
 static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
 			     struct tcf_proto *tp, unsigned long base,
 			     u32 handle, struct nlattr **tca,
@@ -134,7 +121,7 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
 	rcu_assign_pointer(tp->root, new);
 	if (head) {
 		tcf_exts_get_net(&head->exts);
-		call_rcu(&head->rcu, cls_cgroup_destroy_rcu);
+		tcf_queue_work(&head->rwork, cls_cgroup_destroy_work);
 	}
 	return 0;
 errout:
@@ -151,7 +138,7 @@ static void cls_cgroup_destroy(struct tcf_proto *tp,
 	/* Head can still be NULL due to cls_cgroup_init(). */
 	if (head) {
 		if (tcf_exts_get_net(&head->exts))
-			call_rcu(&head->rcu, cls_cgroup_destroy_rcu);
+			tcf_queue_work(&head->rwork, cls_cgroup_destroy_work);
 		else
 			__cls_cgroup_destroy(head);
 	}
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index cd5fe383afdd..2bb043cd436b 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -57,10 +57,7 @@ struct flow_filter {
 	u32			divisor;
 	u32			baseclass;
 	u32			hashrnd;
-	union {
-		struct work_struct	work;
-		struct rcu_head		rcu;
-	};
+	struct rcu_work		rwork;
 };
 
 static inline u32 addr_fold(void *addr)
@@ -383,21 +380,14 @@ static void __flow_destroy_filter(struct flow_filter *f)
 
 static void flow_destroy_filter_work(struct work_struct *work)
 {
-	struct flow_filter *f = container_of(work, struct flow_filter, work);
-
+	struct flow_filter *f = container_of(to_rcu_work(work),
+					     struct flow_filter,
+					     rwork);
 	rtnl_lock();
 	__flow_destroy_filter(f);
 	rtnl_unlock();
 }
 
-static void flow_destroy_filter(struct rcu_head *head)
-{
-	struct flow_filter *f = container_of(head, struct flow_filter, rcu);
-
-	INIT_WORK(&f->work, flow_destroy_filter_work);
-	tcf_queue_work(&f->work);
-}
-
 static int flow_change(struct net *net, struct sk_buff *in_skb,
 		       struct tcf_proto *tp, unsigned long base,
 		       u32 handle, struct nlattr **tca,
@@ -563,7 +553,7 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
 
 	if (fold) {
 		tcf_exts_get_net(&fold->exts);
-		call_rcu(&fold->rcu, flow_destroy_filter);
+		tcf_queue_work(&fold->rwork, flow_destroy_filter_work);
 	}
 	return 0;
 
@@ -583,7 +573,7 @@ static int flow_delete(struct tcf_proto *tp, void *arg, bool *last,
 
 	list_del_rcu(&f->list);
 	tcf_exts_get_net(&f->exts);
-	call_rcu(&f->rcu, flow_destroy_filter);
+	tcf_queue_work(&f->rwork, flow_destroy_filter_work);
 	*last = list_empty(&head->filters);
 	return 0;
 }
@@ -608,7 +598,7 @@ static void flow_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 	list_for_each_entry_safe(f, next, &head->filters, list) {
 		list_del_rcu(&f->list);
 		if (tcf_exts_get_net(&f->exts))
-			call_rcu(&f->rcu, flow_destroy_filter);
+			tcf_queue_work(&f->rwork, flow_destroy_filter_work);
 		else
 			__flow_destroy_filter(f);
 	}
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index eacaaf803914..4e74508515f4 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -73,10 +73,7 @@ struct fl_flow_mask {
 struct cls_fl_head {
 	struct rhashtable ht;
 	struct list_head masks;
-	union {
-		struct work_struct work;
-		struct rcu_head	rcu;
-	};
+	struct rcu_work rwork;
 	struct idr handle_idr;
 };
 
@@ -90,10 +87,7 @@ struct cls_fl_filter {
 	struct list_head list;
 	u32 handle;
 	u32 flags;
-	union {
-		struct work_struct work;
-		struct rcu_head	rcu;
-	};
+	struct rcu_work rwork;
 	struct net_device *hw_dev;
 };
 
@@ -235,21 +229,14 @@ static void __fl_destroy_filter(struct cls_fl_filter *f)
 
 static void fl_destroy_filter_work(struct work_struct *work)
 {
-	struct cls_fl_filter *f = container_of(work, struct cls_fl_filter, work);
+	struct cls_fl_filter *f = container_of(to_rcu_work(work),
+					struct cls_fl_filter, rwork);
 
 	rtnl_lock();
 	__fl_destroy_filter(f);
 	rtnl_unlock();
 }
 
-static void fl_destroy_filter(struct rcu_head *head)
-{
-	struct cls_fl_filter *f = container_of(head, struct cls_fl_filter, rcu);
-
-	INIT_WORK(&f->work, fl_destroy_filter_work);
-	tcf_queue_work(&f->work);
-}
-
 static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
 				 struct netlink_ext_ack *extack)
 {
@@ -327,7 +314,7 @@ static bool __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
 		fl_hw_destroy_filter(tp, f, extack);
 	tcf_unbind_filter(tp, &f->res);
 	if (async)
-		call_rcu(&f->rcu, fl_destroy_filter);
+		tcf_queue_work(&f->rwork, fl_destroy_filter_work);
 	else
 		__fl_destroy_filter(f);
 
@@ -336,20 +323,13 @@ static bool __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
 
 static void fl_destroy_sleepable(struct work_struct *work)
 {
-	struct cls_fl_head *head = container_of(work, struct cls_fl_head,
-						work);
+	struct cls_fl_head *head = container_of(to_rcu_work(work),
+						struct cls_fl_head,
+						rwork);
 	kfree(head);
 	module_put(THIS_MODULE);
 }
 
-static void fl_destroy_rcu(struct rcu_head *rcu)
-{
-	struct cls_fl_head *head = container_of(rcu, struct cls_fl_head, rcu);
-
-	INIT_WORK(&head->work, fl_destroy_sleepable);
-	schedule_work(&head->work);
-}
-
 static void fl_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 {
 	struct cls_fl_head *head = rtnl_dereference(tp->root);
@@ -365,7 +345,7 @@ static void fl_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 	idr_destroy(&head->handle_idr);
 
 	__module_get(THIS_MODULE);
-	call_rcu(&head->rcu, fl_destroy_rcu);
+	tcf_queue_work(&head->rwork, fl_destroy_sleepable);
 }
 
 static void *fl_get(struct tcf_proto *tp, u32 handle)
@@ -1036,7 +1016,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 		list_replace_rcu(&fold->list, &fnew->list);
 		tcf_unbind_filter(tp, &fold->res);
 		tcf_exts_get_net(&fold->exts);
-		call_rcu(&fold->rcu, fl_destroy_filter);
+		tcf_queue_work(&fold->rwork, fl_destroy_filter_work);
 	} else {
 		list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
 	}
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index 8b207723fbc2..29eeeaf3ea44 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -47,10 +47,7 @@ struct fw_filter {
 #endif /* CONFIG_NET_CLS_IND */
 	struct tcf_exts		exts;
 	struct tcf_proto	*tp;
-	union {
-		struct work_struct	work;
-		struct rcu_head		rcu;
-	};
+	struct rcu_work		rwork;
 };
 
 static u32 fw_hash(u32 handle)
@@ -134,21 +131,14 @@ static void __fw_delete_filter(struct fw_filter *f)
 
 static void fw_delete_filter_work(struct work_struct *work)
 {
-	struct fw_filter *f = container_of(work, struct fw_filter, work);
-
+	struct fw_filter *f = container_of(to_rcu_work(work),
+					   struct fw_filter,
+					   rwork);
 	rtnl_lock();
 	__fw_delete_filter(f);
 	rtnl_unlock();
 }
 
-static void fw_delete_filter(struct rcu_head *head)
-{
-	struct fw_filter *f = container_of(head, struct fw_filter, rcu);
-
-	INIT_WORK(&f->work, fw_delete_filter_work);
-	tcf_queue_work(&f->work);
-}
-
 static void fw_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 {
 	struct fw_head *head = rtnl_dereference(tp->root);
@@ -164,7 +154,7 @@ static void fw_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 					 rtnl_dereference(f->next));
 			tcf_unbind_filter(tp, &f->res);
 			if (tcf_exts_get_net(&f->exts))
-				call_rcu(&f->rcu, fw_delete_filter);
+				tcf_queue_work(&f->rwork, fw_delete_filter_work);
 			else
 				__fw_delete_filter(f);
 		}
@@ -193,7 +183,7 @@ static int fw_delete(struct tcf_proto *tp, void *arg, bool *last,
 			RCU_INIT_POINTER(*fp, rtnl_dereference(f->next));
 			tcf_unbind_filter(tp, &f->res);
 			tcf_exts_get_net(&f->exts);
-			call_rcu(&f->rcu, fw_delete_filter);
+			tcf_queue_work(&f->rwork, fw_delete_filter_work);
 			ret = 0;
 			break;
 		}
@@ -316,7 +306,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,
 		rcu_assign_pointer(*fp, fnew);
 		tcf_unbind_filter(tp, &f->res);
 		tcf_exts_get_net(&f->exts);
-		call_rcu(&f->rcu, fw_delete_filter);
+		tcf_queue_work(&f->rwork, fw_delete_filter_work);
 
 		*arg = fnew;
 		return err;
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 2ba721a590a7..47b207ef7762 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -21,10 +21,7 @@ struct cls_mall_head {
 	struct tcf_result res;
 	u32 handle;
 	u32 flags;
-	union {
-		struct work_struct work;
-		struct rcu_head	rcu;
-	};
+	struct rcu_work rwork;
 };
 
 static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp,
@@ -53,22 +50,14 @@ static void __mall_destroy(struct cls_mall_head *head)
 
 static void mall_destroy_work(struct work_struct *work)
 {
-	struct cls_mall_head *head = container_of(work, struct cls_mall_head,
-						  work);
+	struct cls_mall_head *head = container_of(to_rcu_work(work),
+						  struct cls_mall_head,
+						  rwork);
 	rtnl_lock();
 	__mall_destroy(head);
 	rtnl_unlock();
 }
 
-static void mall_destroy_rcu(struct rcu_head *rcu)
-{
-	struct cls_mall_head *head = container_of(rcu, struct cls_mall_head,
-						  rcu);
-
-	INIT_WORK(&head->work, mall_destroy_work);
-	tcf_queue_work(&head->work);
-}
-
 static void mall_destroy_hw_filter(struct tcf_proto *tp,
 				   struct cls_mall_head *head,
 				   unsigned long cookie,
@@ -126,7 +115,7 @@ static void mall_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 		mall_destroy_hw_filter(tp, head, (unsigned long) head, extack);
 
 	if (tcf_exts_get_net(&head->exts))
-		call_rcu(&head->rcu, mall_destroy_rcu);
+		tcf_queue_work(&head->rwork, mall_destroy_work);
 	else
 		__mall_destroy(head);
 }
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 21a03a8ee029..0404aa5fa7cb 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -57,10 +57,7 @@ struct route4_filter {
 	u32			handle;
 	struct route4_bucket	*bkt;
 	struct tcf_proto	*tp;
-	union {
-		struct work_struct	work;
-		struct rcu_head		rcu;
-	};
+	struct rcu_work		rwork;
 };
 
 #define ROUTE4_FAILURE ((struct route4_filter *)(-1L))
@@ -266,19 +263,17 @@ static void __route4_delete_filter(struct route4_filter *f)
 
 static void route4_delete_filter_work(struct work_struct *work)
 {
-	struct route4_filter *f = container_of(work, struct route4_filter, work);
-
+	struct route4_filter *f = container_of(to_rcu_work(work),
+					       struct route4_filter,
+					       rwork);
 	rtnl_lock();
 	__route4_delete_filter(f);
 	rtnl_unlock();
 }
 
-static void route4_delete_filter(struct rcu_head *head)
+static void route4_queue_work(struct route4_filter *f)
 {
-	struct route4_filter *f = container_of(head, struct route4_filter, rcu);
-
-	INIT_WORK(&f->work, route4_delete_filter_work);
-	tcf_queue_work(&f->work);
+	tcf_queue_work(&f->rwork, route4_delete_filter_work);
 }
 
 static void route4_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
@@ -304,7 +299,7 @@ static void route4_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 					RCU_INIT_POINTER(b->ht[h2], next);
 					tcf_unbind_filter(tp, &f->res);
 					if (tcf_exts_get_net(&f->exts))
-						call_rcu(&f->rcu, route4_delete_filter);
+						route4_queue_work(f);
 					else
 						__route4_delete_filter(f);
 				}
@@ -349,7 +344,7 @@ static int route4_delete(struct tcf_proto *tp, void *arg, bool *last,
 			/* Delete it */
 			tcf_unbind_filter(tp, &f->res);
 			tcf_exts_get_net(&f->exts);
-			call_rcu(&f->rcu, route4_delete_filter);
+			tcf_queue_work(&f->rwork, route4_delete_filter_work);
 
 			/* Strip RTNL protected tree */
 			for (i = 0; i <= 32; i++) {
@@ -554,7 +549,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,
 	if (fold) {
 		tcf_unbind_filter(tp, &fold->res);
 		tcf_exts_get_net(&fold->exts);
-		call_rcu(&fold->rcu, route4_delete_filter);
+		tcf_queue_work(&fold->rwork, route4_delete_filter_work);
 	}
 	return 0;
 
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index 4f1297657c27..e9ccf7daea7d 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -97,10 +97,7 @@ struct rsvp_filter {
 
 	u32				handle;
 	struct rsvp_session		*sess;
-	union {
-		struct work_struct		work;
-		struct rcu_head			rcu;
-	};
+	struct rcu_work			rwork;
 };
 
 static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid)
@@ -294,21 +291,14 @@ static void __rsvp_delete_filter(struct rsvp_filter *f)
 
 static void rsvp_delete_filter_work(struct work_struct *work)
 {
-	struct rsvp_filter *f = container_of(work, struct rsvp_filter, work);
-
+	struct rsvp_filter *f = container_of(to_rcu_work(work),
+					     struct rsvp_filter,
+					     rwork);
 	rtnl_lock();
 	__rsvp_delete_filter(f);
 	rtnl_unlock();
 }
 
-static void rsvp_delete_filter_rcu(struct rcu_head *head)
-{
-	struct rsvp_filter *f = container_of(head, struct rsvp_filter, rcu);
-
-	INIT_WORK(&f->work, rsvp_delete_filter_work);
-	tcf_queue_work(&f->work);
-}
-
 static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
 {
 	tcf_unbind_filter(tp, &f->res);
@@ -317,7 +307,7 @@ static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
 	 * in cleanup() callback
 	 */
 	if (tcf_exts_get_net(&f->exts))
-		call_rcu(&f->rcu, rsvp_delete_filter_rcu);
+		tcf_queue_work(&f->rwork, rsvp_delete_filter_work);
 	else
 		__rsvp_delete_filter(f);
 }
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index b49cc990a000..32f4bbd82f35 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -28,20 +28,14 @@
 struct tcindex_filter_result {
 	struct tcf_exts		exts;
 	struct tcf_result	res;
-	union {
-		struct work_struct	work;
-		struct rcu_head		rcu;
-	};
+	struct rcu_work		rwork;
 };
 
 struct tcindex_filter {
 	u16 key;
 	struct tcindex_filter_result result;
 	struct tcindex_filter __rcu *next;
-	union {
-		struct work_struct work;
-		struct rcu_head rcu;
-	};
+	struct rcu_work rwork;
 };
 
 
@@ -152,21 +146,14 @@ static void tcindex_destroy_rexts_work(struct work_struct *work)
 {
 	struct tcindex_filter_result *r;
 
-	r = container_of(work, struct tcindex_filter_result, work);
+	r = container_of(to_rcu_work(work),
+			 struct tcindex_filter_result,
+			 rwork);
 	rtnl_lock();
 	__tcindex_destroy_rexts(r);
 	rtnl_unlock();
 }
 
-static void tcindex_destroy_rexts(struct rcu_head *head)
-{
-	struct tcindex_filter_result *r;
-
-	r = container_of(head, struct tcindex_filter_result, rcu);
-	INIT_WORK(&r->work, tcindex_destroy_rexts_work);
-	tcf_queue_work(&r->work);
-}
-
 static void __tcindex_destroy_fexts(struct tcindex_filter *f)
 {
 	tcf_exts_destroy(&f->result.exts);
@@ -176,23 +163,15 @@ static void __tcindex_destroy_fexts(struct tcindex_filter *f)
 
 static void tcindex_destroy_fexts_work(struct work_struct *work)
 {
-	struct tcindex_filter *f = container_of(work, struct tcindex_filter,
-						work);
+	struct tcindex_filter *f = container_of(to_rcu_work(work),
+						struct tcindex_filter,
+						rwork);
 
 	rtnl_lock();
 	__tcindex_destroy_fexts(f);
 	rtnl_unlock();
 }
 
-static void tcindex_destroy_fexts(struct rcu_head *head)
-{
-	struct tcindex_filter *f = container_of(head, struct tcindex_filter,
-						rcu);
-
-	INIT_WORK(&f->work, tcindex_destroy_fexts_work);
-	tcf_queue_work(&f->work);
-}
-
 static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last,
 			  struct netlink_ext_ack *extack)
 {
@@ -228,12 +207,12 @@ found:
 	 */
 	if (f) {
 		if (tcf_exts_get_net(&f->result.exts))
-			call_rcu(&f->rcu, tcindex_destroy_fexts);
+			tcf_queue_work(&f->rwork, tcindex_destroy_fexts_work);
 		else
 			__tcindex_destroy_fexts(f);
 	} else {
 		if (tcf_exts_get_net(&r->exts))
-			call_rcu(&r->rcu, tcindex_destroy_rexts);
+			tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work);
 		else
 			__tcindex_destroy_rexts(r);
 	}
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index bac47b5d18fd..fb861f90fde6 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -68,10 +68,7 @@ struct tc_u_knode {
 	u32 __percpu		*pcpu_success;
 #endif
 	struct tcf_proto	*tp;
-	union {
-		struct work_struct	work;
-		struct rcu_head		rcu;
-	};
+	struct rcu_work		rwork;
 	/* The 'sel' field MUST be the last field in structure to allow for
 	 * tc_u32_keys allocated at end of structure.
 	 */
@@ -436,21 +433,14 @@ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n,
  */
 static void u32_delete_key_work(struct work_struct *work)
 {
-	struct tc_u_knode *key = container_of(work, struct tc_u_knode, work);
-
+	struct tc_u_knode *key = container_of(to_rcu_work(work),
+					      struct tc_u_knode,
+					      rwork);
 	rtnl_lock();
 	u32_destroy_key(key->tp, key, false);
 	rtnl_unlock();
 }
 
-static void u32_delete_key_rcu(struct rcu_head *rcu)
-{
-	struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu);
-
-	INIT_WORK(&key->work, u32_delete_key_work);
-	tcf_queue_work(&key->work);
-}
-
 /* u32_delete_key_freepf_rcu is the rcu callback variant
  * that free's the entire structure including the statistics
  * percpu variables. Only use this if the key is not a copy
@@ -460,21 +450,14 @@ static void u32_delete_key_rcu(struct rcu_head *rcu)
  */
 static void u32_delete_key_freepf_work(struct work_struct *work)
 {
-	struct tc_u_knode *key = container_of(work, struct tc_u_knode, work);
-
+	struct tc_u_knode *key = container_of(to_rcu_work(work),
+					      struct tc_u_knode,
+					      rwork);
 	rtnl_lock();
 	u32_destroy_key(key->tp, key, true);
 	rtnl_unlock();
 }
 
-static void u32_delete_key_freepf_rcu(struct rcu_head *rcu)
-{
-	struct tc_u_knode *key = container_of(rcu, struct tc_u_knode, rcu);
-
-	INIT_WORK(&key->work, u32_delete_key_freepf_work);
-	tcf_queue_work(&key->work);
-}
-
 static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
 {
 	struct tc_u_knode __rcu **kp;
@@ -491,7 +474,7 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
 				tcf_unbind_filter(tp, &key->res);
 				idr_remove(&ht->handle_idr, key->handle);
 				tcf_exts_get_net(&key->exts);
-				call_rcu(&key->rcu, u32_delete_key_freepf_rcu);
+				tcf_queue_work(&key->rwork, u32_delete_key_freepf_work);
 				return 0;
 			}
 		}
@@ -611,7 +594,7 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
 			u32_remove_hw_knode(tp, n, extack);
 			idr_remove(&ht->handle_idr, n->handle);
 			if (tcf_exts_get_net(&n->exts))
-				call_rcu(&n->rcu, u32_delete_key_freepf_rcu);
+				tcf_queue_work(&n->rwork, u32_delete_key_freepf_work);
 			else
 				u32_destroy_key(n->tp, n, true);
 		}
@@ -995,7 +978,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		u32_replace_knode(tp, tp_c, new);
 		tcf_unbind_filter(tp, &n->res);
 		tcf_exts_get_net(&n->exts);
-		call_rcu(&n->rcu, u32_delete_key_rcu);
+		tcf_queue_work(&n->rwork, u32_delete_key_work);
 		return 0;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 9f323973c915d402378cb3e1336dd6ed4c45144b Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 23 May 2018 17:08:47 -0700
Subject: net/ipv4: Udate fib_table_lookup tracepoint

Commit 4a2d73a4fb36 ("ipv4: fib_rules: support match on sport, dport
and ip proto") added support for protocol and ports to FIB rules.
Update the FIB lookup tracepoint to dump the parameters.

In addition, make the IPv4 tracepoint similar to the IPv6 one where
the lookup parameters and result are dumped in 1 event. It is much
easier to use and understand the outcome of the lookup.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/fib.h | 72 +++++++++++++++++++++++++++-------------------
 net/ipv4/fib_trie.c        | 14 +++++----
 2 files changed, 52 insertions(+), 34 deletions(-)

diff --git a/include/trace/events/fib.h b/include/trace/events/fib.h
index 81b7e985bb45..f5a1d4c518d8 100644
--- a/include/trace/events/fib.h
+++ b/include/trace/events/fib.h
@@ -12,12 +12,14 @@
 
 TRACE_EVENT(fib_table_lookup,
 
-	TP_PROTO(u32 tb_id, const struct flowi4 *flp),
+	TP_PROTO(u32 tb_id, const struct flowi4 *flp,
+		 const struct fib_nh *nh, int err),
 
-	TP_ARGS(tb_id, flp),
+	TP_ARGS(tb_id, flp, nh, err),
 
 	TP_STRUCT__entry(
 		__field(	u32,	tb_id		)
+		__field(	int,	err		)
 		__field(	int,	oif		)
 		__field(	int,	iif		)
 		__field(	__u8,	tos		)
@@ -25,12 +27,19 @@ TRACE_EVENT(fib_table_lookup,
 		__field(	__u8,	flags		)
 		__array(	__u8,	src,	4	)
 		__array(	__u8,	dst,	4	)
+		__array(	__u8,	gw,	4	)
+		__array(	__u8,	saddr,	4	)
+		__field(	u16,	sport		)
+		__field(	u16,	dport		)
+		__field(	u8,	proto		)
+		__dynamic_array(char,  name,   IFNAMSIZ )
 	),
 
 	TP_fast_assign(
 		__be32 *p32;
 
 		__entry->tb_id = tb_id;
+		__entry->err = err;
 		__entry->oif = flp->flowi4_oif;
 		__entry->iif = flp->flowi4_iif;
 		__entry->tos = flp->flowi4_tos;
@@ -42,36 +51,41 @@ TRACE_EVENT(fib_table_lookup,
 
 		p32 = (__be32 *) __entry->dst;
 		*p32 = flp->daddr;
-	),
-
-	TP_printk("table %u oif %d iif %d src %pI4 dst %pI4 tos %d scope %d flags %x",
-		  __entry->tb_id, __entry->oif, __entry->iif,
-		  __entry->src, __entry->dst, __entry->tos, __entry->scope,
-		  __entry->flags)
-);
-
-TRACE_EVENT(fib_table_lookup_nh,
-
-	TP_PROTO(const struct fib_nh *nh),
-
-	TP_ARGS(nh),
-
-	TP_STRUCT__entry(
-		__string(	name,	nh->nh_dev->name)
-		__field(	int,	oif		)
-		__array(	__u8,	src,	4	)
-	),
-
-	TP_fast_assign(
-		__be32 *p32 = (__be32 *) __entry->src;
 
-		__assign_str(name, nh->nh_dev ? nh->nh_dev->name : "not set");
-		__entry->oif = nh->nh_oif;
-		*p32 = nh->nh_saddr;
+		__entry->proto = flp->flowi4_proto;
+		if (__entry->proto == IPPROTO_TCP ||
+		    __entry->proto == IPPROTO_UDP) {
+			__entry->sport = ntohs(flp->fl4_sport);
+			__entry->dport = ntohs(flp->fl4_dport);
+		} else {
+			__entry->sport = 0;
+			__entry->dport = 0;
+		}
+
+		if (nh) {
+			p32 = (__be32 *) __entry->saddr;
+			*p32 = nh->nh_saddr;
+
+			p32 = (__be32 *) __entry->gw;
+			*p32 = nh->nh_gw;
+
+			__assign_str(name, nh->nh_dev ? nh->nh_dev->name : "-");
+		} else {
+			p32 = (__be32 *) __entry->saddr;
+			*p32 = 0;
+
+			p32 = (__be32 *) __entry->gw;
+			*p32 = 0;
+
+			__assign_str(name, "-");
+		}
 	),
 
-	TP_printk("nexthop dev %s oif %d src %pI4",
-		  __get_str(name), __entry->oif, __entry->src)
+	TP_printk("table %u oif %d iif %d proto %u %pI4/%u -> %pI4/%u tos %d scope %d flags %x ==> dev %s gw %pI4 src %pI4 err %d",
+		  __entry->tb_id, __entry->oif, __entry->iif, __entry->proto,
+		  __entry->src, __entry->sport, __entry->dst, __entry->dport,
+		  __entry->tos, __entry->scope, __entry->flags,
+		  __get_str(name), __entry->gw, __entry->saddr, __entry->err)
 );
 
 TRACE_EVENT(fib_validate_source,
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 3dcffd3ce98c..65c340f230ae 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1326,14 +1326,14 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 	unsigned long index;
 	t_key cindex;
 
-	trace_fib_table_lookup(tb->tb_id, flp);
-
 	pn = t->kv;
 	cindex = 0;
 
 	n = get_child_rcu(pn, cindex);
-	if (!n)
+	if (!n) {
+		trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN);
 		return -EAGAIN;
+	}
 
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 	this_cpu_inc(stats->gets);
@@ -1416,8 +1416,11 @@ backtrace:
 				 * nothing for us to do as we do not have any
 				 * further nodes to parse.
 				 */
-				if (IS_TRIE(pn))
+				if (IS_TRIE(pn)) {
+					trace_fib_table_lookup(tb->tb_id, flp,
+							       NULL, -EAGAIN);
 					return -EAGAIN;
+				}
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 				this_cpu_inc(stats->backtrack);
 #endif
@@ -1459,6 +1462,7 @@ found:
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 			this_cpu_inc(stats->semantic_match_passed);
 #endif
+			trace_fib_table_lookup(tb->tb_id, flp, NULL, err);
 			return err;
 		}
 		if (fi->fib_flags & RTNH_F_DEAD)
@@ -1494,7 +1498,7 @@ found:
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 			this_cpu_inc(stats->semantic_match_passed);
 #endif
-			trace_fib_table_lookup_nh(nh);
+			trace_fib_table_lookup(tb->tb_id, flp, nh, err);
 
 			return err;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From 30d444d30049490398178ca4337ab49156571886 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 23 May 2018 17:08:48 -0700
Subject: net/ipv6: Udate fib6_table_lookup tracepoint

Commit bb0ad1987e96 ("ipv6: fib6_rules: support for match on sport, dport
and ip proto") added support for protocol and ports to FIB rules.
Update the FIB lookup tracepoint to dump the parameters.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/fib6.h | 29 ++++++++++++++++++++++-------
 net/core/net-traces.c       |  4 ----
 net/ipv6/route.c            |  9 +++++++--
 3 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h
index 1b8d951e3c12..b088b54d699c 100644
--- a/include/trace/events/fib6.h
+++ b/include/trace/events/fib6.h
@@ -19,7 +19,7 @@ TRACE_EVENT(fib6_table_lookup,
 
 	TP_STRUCT__entry(
 		__field(	u32,	tb_id		)
-
+		__field(	int,	err		)
 		__field(	int,	oif		)
 		__field(	int,	iif		)
 		__field(	__u8,	tos		)
@@ -27,7 +27,10 @@ TRACE_EVENT(fib6_table_lookup,
 		__field(	__u8,	flags		)
 		__array(	__u8,	src,	16	)
 		__array(	__u8,	dst,	16	)
-
+		__field(        u16,	sport		)
+		__field(        u16,	dport		)
+		__field(        u8,	proto		)
+		__field(        u8,	rt_type		)
 		__dynamic_array(	char,	name,	IFNAMSIZ )
 		__array(		__u8,	gw,	16	 )
 	),
@@ -36,6 +39,7 @@ TRACE_EVENT(fib6_table_lookup,
 		struct in6_addr *in6;
 
 		__entry->tb_id = table->tb6_id;
+		__entry->err = ip6_rt_type_to_error(f6i->fib6_type);
 		__entry->oif = flp->flowi6_oif;
 		__entry->iif = flp->flowi6_iif;
 		__entry->tos = ip6_tclass(flp->flowlabel);
@@ -48,10 +52,20 @@ TRACE_EVENT(fib6_table_lookup,
 		in6 = (struct in6_addr *)__entry->dst;
 		*in6 = flp->daddr;
 
+		__entry->proto = flp->flowi6_proto;
+		if (__entry->proto == IPPROTO_TCP ||
+		    __entry->proto == IPPROTO_UDP) {
+			__entry->sport = ntohs(flp->fl6_sport);
+			__entry->dport = ntohs(flp->fl6_dport);
+		} else {
+			__entry->sport = 0;
+			__entry->dport = 0;
+		}
+
 		if (f6i->fib6_nh.nh_dev) {
 			__assign_str(name, f6i->fib6_nh.nh_dev);
 		} else {
-			__assign_str(name, "");
+			__assign_str(name, "-");
 		}
 		if (f6i == net->ipv6.fib6_null_entry) {
 			struct in6_addr in6_zero = {};
@@ -65,10 +79,11 @@ TRACE_EVENT(fib6_table_lookup,
 		}
 	),
 
-	TP_printk("table %3u oif %d iif %d src %pI6c dst %pI6c tos %d scope %d flags %x ==> dev %s gw %pI6c",
-		  __entry->tb_id, __entry->oif, __entry->iif,
-		  __entry->src, __entry->dst, __entry->tos, __entry->scope,
-		  __entry->flags, __get_str(name), __entry->gw)
+	TP_printk("table %3u oif %d iif %d proto %u %pI6c/%u -> %pI6c/%u tos %d scope %d flags %x ==> dev %s gw %pI6c err %d",
+		  __entry->tb_id, __entry->oif, __entry->iif, __entry->proto,
+		  __entry->src, __entry->sport, __entry->dst, __entry->dport,
+		  __entry->tos, __entry->scope, __entry->flags,
+		  __get_str(name), __entry->gw, __entry->err)
 );
 
 #endif /* _TRACE_FIB6_H */
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index 380934580fa1..419af6dfe29f 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -35,10 +35,6 @@
 #include <trace/events/tcp.h>
 #include <trace/events/fib.h>
 #include <trace/events/qdisc.h>
-#if IS_ENABLED(CONFIG_IPV6)
-#include <trace/events/fib6.h>
-EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
-#endif
 #if IS_ENABLED(CONFIG_BRIDGE)
 #include <trace/events/bridge.h>
 EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 0a35ded448a6..22c4de2317d0 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -64,14 +64,19 @@
 #include <net/ip_tunnels.h>
 #include <net/l3mdev.h>
 #include <net/ip.h>
-#include <trace/events/fib6.h>
-
 #include <linux/uaccess.h>
 
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
 #endif
 
+static int ip6_rt_type_to_error(u8 fib6_type);
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/fib6.h>
+EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
+#undef CREATE_TRACE_POINTS
+
 enum rt6_nud_state {
 	RT6_NUD_FAIL_HARD = -3,
 	RT6_NUD_FAIL_PROBE = -2,
-- 
cgit v1.2.3-59-g8ed1b


From c949cbbbe5d6ff3dcacf82e8fd26f627285e8a12 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 23 May 2018 17:08:49 -0700
Subject: net/ipv4: Remove tracepoint in fib_validate_source

Tracepoint does not add value and the call to fib_lookup follows
it which shows the same information and the fib lookup result.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/fib.h | 35 -----------------------------------
 net/ipv4/fib_frontend.c    |  2 --
 2 files changed, 37 deletions(-)

diff --git a/include/trace/events/fib.h b/include/trace/events/fib.h
index f5a1d4c518d8..9763cddd0594 100644
--- a/include/trace/events/fib.h
+++ b/include/trace/events/fib.h
@@ -87,41 +87,6 @@ TRACE_EVENT(fib_table_lookup,
 		  __entry->tos, __entry->scope, __entry->flags,
 		  __get_str(name), __entry->gw, __entry->saddr, __entry->err)
 );
-
-TRACE_EVENT(fib_validate_source,
-
-	TP_PROTO(const struct net_device *dev, const struct flowi4 *flp),
-
-	TP_ARGS(dev, flp),
-
-	TP_STRUCT__entry(
-		__string(	name,	dev->name	)
-		__field(	int,	oif		)
-		__field(	int,	iif		)
-		__field(	__u8,	tos		)
-		__array(	__u8,	src,	4	)
-		__array(	__u8,	dst,	4	)
-	),
-
-	TP_fast_assign(
-		__be32 *p32;
-
-		__assign_str(name, dev ? dev->name : "not set");
-		__entry->oif = flp->flowi4_oif;
-		__entry->iif = flp->flowi4_iif;
-		__entry->tos = flp->flowi4_tos;
-
-		p32 = (__be32 *) __entry->src;
-		*p32 = flp->saddr;
-
-		p32 = (__be32 *) __entry->dst;
-		*p32 = flp->daddr;
-	),
-
-	TP_printk("dev %s oif %d iif %d tos %d src %pI4 dst %pI4",
-		  __get_str(name), __entry->oif, __entry->iif, __entry->tos,
-		  __entry->src, __entry->dst)
-);
 #endif /* _TRACE_FIB_H */
 
 /* This part must be outside protection */
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 897ae92dff0f..045c43a27c12 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -354,8 +354,6 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 		fl4.fl4_dport = 0;
 	}
 
-	trace_fib_validate_source(dev, &fl4);
-
 	if (fib_lookup(net, &fl4, &res, 0))
 		goto last_resort;
 	if (res.type != RTN_UNICAST &&
-- 
cgit v1.2.3-59-g8ed1b


From d97cde6ab547a8115fb4fae73f030a96519ac3c6 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Wed, 23 May 2018 18:02:00 -0700
Subject: hv_netvsc: fix bogus ifalias on network device

If the guest network adapter is not configured with DeviceNaming
enabled on the host, then the query for friendly name will return
success but with a zero length name. Which then leads to a garbage value
(stack contents) for ifalias.

Fix is simple, just don't set name if  host doesn't return it.

Fixes: 0fe554a46a0f ("hv_netvsc: propogate Hyper-V friendly name into interface alias")
Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/rndis_filter.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index 7f3dab4b4cbc..5428bb261102 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -1237,7 +1237,10 @@ static void rndis_get_friendly_name(struct net_device *net,
 	if (rndis_filter_query_device(rndis_device, net_device,
 				      RNDIS_OID_GEN_FRIENDLY_NAME,
 				      wname, &size) != 0)
-		return;
+		return;	/* ignore if host does not support */
+
+	if (size == 0)
+		return;	/* name not set */
 
 	/* Convert Windows Unicode string to UTF-8 */
 	len = ucs2_as_utf8(ifalias, wname, sizeof(ifalias));
-- 
cgit v1.2.3-59-g8ed1b


From 24f132e29c041cc0654e5aa8ab1127d45d4b46d3 Mon Sep 17 00:00:00 2001
From: John Hurley <john.hurley@netronome.com>
Date: Wed, 23 May 2018 19:22:48 -0700
Subject: nfp: add ndo_set_mac_address for representors

Adding a netdev to a bond requires that its mac address can be modified.
The default eth_mac_addr is sufficient to satisfy this requirement.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_repr.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
index 09e87d5f4f72..117eca6819de 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
@@ -277,6 +277,7 @@ const struct net_device_ops nfp_repr_netdev_ops = {
 	.ndo_get_vf_config	= nfp_app_get_vf_config,
 	.ndo_set_vf_link_state	= nfp_app_set_vf_link_state,
 	.ndo_set_features	= nfp_port_set_features,
+	.ndo_set_mac_address    = eth_mac_addr,
 };
 
 static void nfp_repr_clean(struct nfp_repr *repr)
-- 
cgit v1.2.3-59-g8ed1b


From 1945ca7a81d64e09ff0bd24deb64a328614f7b7b Mon Sep 17 00:00:00 2001
From: John Hurley <john.hurley@netronome.com>
Date: Wed, 23 May 2018 19:22:49 -0700
Subject: nfp: nfpcore: add rtsym writing function

Add an rtsym API function that combines the lookup of a symbol and the
writing of a value to it. Values can be written as unsigned 32 or 64 bits.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/netronome/nfp/nfpcore/nfp_nffw.h  |  2 +
 .../net/ethernet/netronome/nfp/nfpcore/nfp_rtsym.c | 43 ++++++++++++++++++++++
 2 files changed, 45 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nffw.h b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nffw.h
index c9724fb7ea4b..df599d5b6bb3 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nffw.h
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nffw.h
@@ -100,6 +100,8 @@ nfp_rtsym_lookup(struct nfp_rtsym_table *rtbl, const char *name);
 
 u64 nfp_rtsym_read_le(struct nfp_rtsym_table *rtbl, const char *name,
 		      int *error);
+int nfp_rtsym_write_le(struct nfp_rtsym_table *rtbl, const char *name,
+		       u64 value);
 u8 __iomem *
 nfp_rtsym_map(struct nfp_rtsym_table *rtbl, const char *name, const char *id,
 	      unsigned int min_size, struct nfp_cpp_area **area);
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_rtsym.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_rtsym.c
index 46107aefad1c..9e34216578da 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_rtsym.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_rtsym.c
@@ -286,6 +286,49 @@ exit:
 	return val;
 }
 
+/**
+ * nfp_rtsym_write_le() - Write an unsigned scalar value to a symbol
+ * @rtbl:	NFP RTsym table
+ * @name:	Symbol name
+ * @value:	Value to write
+ *
+ * Lookup a symbol and write a value to it. Symbol can be 4 or 8 bytes in size.
+ * If 4 bytes then the lower 32-bits of 'value' are used. Value will be
+ * written as simple little-endian unsigned value.
+ *
+ * Return: 0 on success or error code.
+ */
+int nfp_rtsym_write_le(struct nfp_rtsym_table *rtbl, const char *name,
+		       u64 value)
+{
+	const struct nfp_rtsym *sym;
+	int err;
+	u32 id;
+
+	sym = nfp_rtsym_lookup(rtbl, name);
+	if (!sym)
+		return -ENOENT;
+
+	id = NFP_CPP_ISLAND_ID(sym->target, NFP_CPP_ACTION_RW, 0, sym->domain);
+
+	switch (sym->size) {
+	case 4:
+		err = nfp_cpp_writel(rtbl->cpp, id, sym->addr, value);
+		break;
+	case 8:
+		err = nfp_cpp_writeq(rtbl->cpp, id, sym->addr, value);
+		break;
+	default:
+		nfp_err(rtbl->cpp,
+			"rtsym '%s' unsupported or non-scalar size: %lld\n",
+			name, sym->size);
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
 u8 __iomem *
 nfp_rtsym_map(struct nfp_rtsym_table *rtbl, const char *name, const char *id,
 	      unsigned int min_size, struct nfp_cpp_area **area)
-- 
cgit v1.2.3-59-g8ed1b


From 898bc7d634b4ffbdc3511212625c735d400a8cc6 Mon Sep 17 00:00:00 2001
From: John Hurley <john.hurley@netronome.com>
Date: Wed, 23 May 2018 19:22:50 -0700
Subject: nfp: flower: check for/turn on LAG support in firmware

Check if the fw contains the _abi_flower_balance_sync_enable symbol. If it
does then write a 1 to this indicating that the driver is willing to
receive NIC to kernel LAG related control messages.

If the write is successful, update the list of extra features supported by
the fw and add a stub to accept LAG cmsgs.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/flower/cmsg.c |  5 +++++
 drivers/net/ethernet/netronome/nfp/flower/cmsg.h |  1 +
 drivers/net/ethernet/netronome/nfp/flower/main.c | 12 ++++++++++++
 drivers/net/ethernet/netronome/nfp/flower/main.h |  1 +
 4 files changed, 19 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
index 577659f332e4..03aae2ed9983 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
@@ -239,6 +239,7 @@ nfp_flower_cmsg_portreify_rx(struct nfp_app *app, struct sk_buff *skb)
 static void
 nfp_flower_cmsg_process_one_rx(struct nfp_app *app, struct sk_buff *skb)
 {
+	struct nfp_flower_priv *app_priv = app->priv;
 	struct nfp_flower_cmsg_hdr *cmsg_hdr;
 	enum nfp_flower_cmsg_type_port type;
 
@@ -258,6 +259,10 @@ nfp_flower_cmsg_process_one_rx(struct nfp_app *app, struct sk_buff *skb)
 	case NFP_FLOWER_CMSG_TYPE_ACTIVE_TUNS:
 		nfp_tunnel_keep_alive(app, skb);
 		break;
+	case NFP_FLOWER_CMSG_TYPE_LAG_CONFIG:
+		if (app_priv->flower_ext_feats & NFP_FL_FEATS_LAG)
+			break;
+		/* fall through */
 	default:
 		nfp_flower_cmsg_warn(app, "Cannot handle invalid repr control type %u\n",
 				     type);
diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
index bee4367a2c38..3a42a1fc55cb 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
@@ -366,6 +366,7 @@ struct nfp_flower_cmsg_hdr {
 enum nfp_flower_cmsg_type_port {
 	NFP_FLOWER_CMSG_TYPE_FLOW_ADD =		0,
 	NFP_FLOWER_CMSG_TYPE_FLOW_DEL =		2,
+	NFP_FLOWER_CMSG_TYPE_LAG_CONFIG =	4,
 	NFP_FLOWER_CMSG_TYPE_PORT_REIFY =	6,
 	NFP_FLOWER_CMSG_TYPE_MAC_REPR =		7,
 	NFP_FLOWER_CMSG_TYPE_PORT_MOD =		8,
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c
index 4e67c0cbf9f0..1910c3e2b3e5 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.c
@@ -546,8 +546,20 @@ static int nfp_flower_init(struct nfp_app *app)
 	else
 		app_priv->flower_ext_feats = features;
 
+	/* Tell the firmware that the driver supports lag. */
+	err = nfp_rtsym_write_le(app->pf->rtbl,
+				 "_abi_flower_balance_sync_enable", 1);
+	if (!err)
+		app_priv->flower_ext_feats |= NFP_FL_FEATS_LAG;
+	else if (err == -ENOENT)
+		nfp_warn(app->cpp, "LAG not supported by FW.\n");
+	else
+		goto err_cleanup_metadata;
+
 	return 0;
 
+err_cleanup_metadata:
+	nfp_flower_metadata_cleanup(app);
 err_free_app_priv:
 	vfree(app->priv);
 	return err;
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h
index 733ff53cc601..6e82aa4ed84b 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.h
@@ -67,6 +67,7 @@ struct nfp_app;
 /* Extra features bitmap. */
 #define NFP_FL_FEATS_GENEVE		BIT(0)
 #define NFP_FL_NBI_MTU_SETTING		BIT(1)
+#define NFP_FL_FEATS_LAG		BIT(31)
 
 struct nfp_fl_mask_id {
 	struct circ_buf mask_id_free_list;
-- 
cgit v1.2.3-59-g8ed1b


From b945245297416a3c68ed12f2ada1c7162f5f73fd Mon Sep 17 00:00:00 2001
From: John Hurley <john.hurley@netronome.com>
Date: Wed, 23 May 2018 19:22:51 -0700
Subject: nfp: flower: add per repr private data for LAG offload

Add a bitmap to each flower repr to track its state if it is enslaved by a
bond. This LAG state may be different to the port state - for example, the
port may be up but LAG state may be down due to the selection in an
active/backup bond.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/flower/main.c | 26 ++++++++++++++++++++++++
 drivers/net/ethernet/netronome/nfp/flower/main.h |  8 ++++++++
 2 files changed, 34 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c
index 1910c3e2b3e5..202284b42fd9 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.c
@@ -185,6 +185,10 @@ nfp_flower_repr_netdev_init(struct nfp_app *app, struct net_device *netdev)
 static void
 nfp_flower_repr_netdev_clean(struct nfp_app *app, struct net_device *netdev)
 {
+	struct nfp_repr *repr = netdev_priv(netdev);
+
+	kfree(repr->app_priv);
+
 	tc_setup_cb_egdev_unregister(netdev, nfp_flower_setup_tc_egress_cb,
 				     netdev_priv(netdev));
 }
@@ -225,7 +229,9 @@ nfp_flower_spawn_vnic_reprs(struct nfp_app *app,
 	u8 nfp_pcie = nfp_cppcore_pcie_unit(app->pf->cpp);
 	struct nfp_flower_priv *priv = app->priv;
 	atomic_t *replies = &priv->reify_replies;
+	struct nfp_flower_repr_priv *repr_priv;
 	enum nfp_port_type port_type;
+	struct nfp_repr *nfp_repr;
 	struct nfp_reprs *reprs;
 	int i, err, reify_cnt;
 	const u8 queue = 0;
@@ -248,6 +254,15 @@ nfp_flower_spawn_vnic_reprs(struct nfp_app *app,
 			goto err_reprs_clean;
 		}
 
+		repr_priv = kzalloc(sizeof(*repr_priv), GFP_KERNEL);
+		if (!repr_priv) {
+			err = -ENOMEM;
+			goto err_reprs_clean;
+		}
+
+		nfp_repr = netdev_priv(repr);
+		nfp_repr->app_priv = repr_priv;
+
 		/* For now we only support 1 PF */
 		WARN_ON(repr_type == NFP_REPR_TYPE_PF && i);
 
@@ -324,6 +339,8 @@ nfp_flower_spawn_phy_reprs(struct nfp_app *app, struct nfp_flower_priv *priv)
 {
 	struct nfp_eth_table *eth_tbl = app->pf->eth_tbl;
 	atomic_t *replies = &priv->reify_replies;
+	struct nfp_flower_repr_priv *repr_priv;
+	struct nfp_repr *nfp_repr;
 	struct sk_buff *ctrl_skb;
 	struct nfp_reprs *reprs;
 	int err, reify_cnt;
@@ -351,6 +368,15 @@ nfp_flower_spawn_phy_reprs(struct nfp_app *app, struct nfp_flower_priv *priv)
 			goto err_reprs_clean;
 		}
 
+		repr_priv = kzalloc(sizeof(*repr_priv), GFP_KERNEL);
+		if (!repr_priv) {
+			err = -ENOMEM;
+			goto err_reprs_clean;
+		}
+
+		nfp_repr = netdev_priv(repr);
+		nfp_repr->app_priv = repr_priv;
+
 		port = nfp_port_alloc(app, NFP_PORT_PHYS_PORT, repr);
 		if (IS_ERR(port)) {
 			err = PTR_ERR(port);
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h
index 6e82aa4ed84b..7ce255705446 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.h
@@ -160,6 +160,14 @@ struct nfp_flower_priv {
 	struct nfp_mtu_conf mtu_conf;
 };
 
+/**
+ * struct nfp_flower_repr_priv - Flower APP per-repr priv data
+ * @lag_port_flags:	Extended port flags to record lag state of repr
+ */
+struct nfp_flower_repr_priv {
+	unsigned long lag_port_flags;
+};
+
 struct nfp_fl_key_ls {
 	u32 key_layer_two;
 	u8 key_layer;
-- 
cgit v1.2.3-59-g8ed1b


From f44aa9ef7950a56daa3a5b41f069761f945f1a1f Mon Sep 17 00:00:00 2001
From: John Hurley <john.hurley@netronome.com>
Date: Wed, 23 May 2018 19:22:52 -0700
Subject: net: include hash policy in LAG changeupper info

LAG upper event notifiers contain the tx type used by the LAG device.
Extend this to also include the hash policy used for tx types that
utilize hashing.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 27 ++++++++++++++++++++++++++-
 drivers/net/team/team.c         |  1 +
 include/linux/netdevice.h       | 11 +++++++++++
 3 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index fea17b92b1ae..bd53a71f6b00 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1218,12 +1218,37 @@ static enum netdev_lag_tx_type bond_lag_tx_type(struct bonding *bond)
 	}
 }
 
+static enum netdev_lag_hash bond_lag_hash_type(struct bonding *bond,
+					       enum netdev_lag_tx_type type)
+{
+	if (type != NETDEV_LAG_TX_TYPE_HASH)
+		return NETDEV_LAG_HASH_NONE;
+
+	switch (bond->params.xmit_policy) {
+	case BOND_XMIT_POLICY_LAYER2:
+		return NETDEV_LAG_HASH_L2;
+	case BOND_XMIT_POLICY_LAYER34:
+		return NETDEV_LAG_HASH_L34;
+	case BOND_XMIT_POLICY_LAYER23:
+		return NETDEV_LAG_HASH_L23;
+	case BOND_XMIT_POLICY_ENCAP23:
+		return NETDEV_LAG_HASH_E23;
+	case BOND_XMIT_POLICY_ENCAP34:
+		return NETDEV_LAG_HASH_E34;
+	default:
+		return NETDEV_LAG_HASH_UNKNOWN;
+	}
+}
+
 static int bond_master_upper_dev_link(struct bonding *bond, struct slave *slave,
 				      struct netlink_ext_ack *extack)
 {
 	struct netdev_lag_upper_info lag_upper_info;
+	enum netdev_lag_tx_type type;
 
-	lag_upper_info.tx_type = bond_lag_tx_type(bond);
+	type = bond_lag_tx_type(bond);
+	lag_upper_info.tx_type = type;
+	lag_upper_info.hash_type = bond_lag_hash_type(bond, type);
 
 	return netdev_master_upper_dev_link(slave->dev, bond->dev, slave,
 					    &lag_upper_info, extack);
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index d6ff881165d0..e6730a01d130 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1129,6 +1129,7 @@ static int team_upper_dev_link(struct team *team, struct team_port *port,
 	int err;
 
 	lag_upper_info.tx_type = team->mode->lag_tx_type;
+	lag_upper_info.hash_type = NETDEV_LAG_HASH_UNKNOWN;
 	err = netdev_master_upper_dev_link(port->dev, team->dev, NULL,
 					   &lag_upper_info, extack);
 	if (err)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index debdb6286170..8452f72087ef 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2332,8 +2332,19 @@ enum netdev_lag_tx_type {
 	NETDEV_LAG_TX_TYPE_HASH,
 };
 
+enum netdev_lag_hash {
+	NETDEV_LAG_HASH_NONE,
+	NETDEV_LAG_HASH_L2,
+	NETDEV_LAG_HASH_L34,
+	NETDEV_LAG_HASH_L23,
+	NETDEV_LAG_HASH_E23,
+	NETDEV_LAG_HASH_E34,
+	NETDEV_LAG_HASH_UNKNOWN,
+};
+
 struct netdev_lag_upper_info {
 	enum netdev_lag_tx_type tx_type;
+	enum netdev_lag_hash hash_type;
 };
 
 struct netdev_lag_lower_state_info {
-- 
cgit v1.2.3-59-g8ed1b


From bb9a8d031140f186d13d82f57b0f5646d596652f Mon Sep 17 00:00:00 2001
From: John Hurley <john.hurley@netronome.com>
Date: Wed, 23 May 2018 19:22:53 -0700
Subject: nfp: flower: monitor and offload LAG groups

Monitor LAG events via the NETDEV_CHANGEUPPER/NETDEV_CHANGELOWERSTATE
notifiers to maintain a list of offloadable groups. Sync these groups with
HW via a delayed workqueue to prevent excessive re-configuration. When the
workqueue is triggered it may generate multiple control messages for
different groups. These messages are linked via a batch ID and flags to
indicate a new batch and the end of a batch.

Update private data in each repr to track their LAG lower state flags. The
state of a repr is used to determine the active netdevs that can be
offloaded. For example, in active-backup mode, we only offload the netdev
currently active.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/Makefile        |   1 +
 .../net/ethernet/netronome/nfp/flower/lag_conf.c   | 589 +++++++++++++++++++++
 drivers/net/ethernet/netronome/nfp/flower/main.c   |  29 +-
 drivers/net/ethernet/netronome/nfp/flower/main.h   |  30 ++
 4 files changed, 646 insertions(+), 3 deletions(-)
 create mode 100644 drivers/net/ethernet/netronome/nfp/flower/lag_conf.c

diff --git a/drivers/net/ethernet/netronome/nfp/Makefile b/drivers/net/ethernet/netronome/nfp/Makefile
index 6373f56205fd..4afb10375397 100644
--- a/drivers/net/ethernet/netronome/nfp/Makefile
+++ b/drivers/net/ethernet/netronome/nfp/Makefile
@@ -37,6 +37,7 @@ ifeq ($(CONFIG_NFP_APP_FLOWER),y)
 nfp-objs += \
 	    flower/action.o \
 	    flower/cmsg.o \
+	    flower/lag_conf.o \
 	    flower/main.o \
 	    flower/match.o \
 	    flower/metadata.o \
diff --git a/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c b/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c
new file mode 100644
index 000000000000..35a700b879d7
--- /dev/null
+++ b/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c
@@ -0,0 +1,589 @@
+/*
+ * Copyright (C) 2018 Netronome Systems, Inc.
+ *
+ * This software is dual licensed under the GNU General License Version 2,
+ * June 1991 as shown in the file COPYING in the top-level directory of this
+ * source tree or the BSD 2-Clause License provided below.  You have the
+ * option to license this software under the complete terms of either license.
+ *
+ * The BSD 2-Clause License:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      1. Redistributions of source code must retain the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer.
+ *
+ *      2. Redistributions in binary form must reproduce the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer in the documentation and/or other materials
+ *         provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "main.h"
+
+/* LAG group config flags. */
+#define NFP_FL_LAG_LAST			BIT(1)
+#define NFP_FL_LAG_FIRST		BIT(2)
+#define NFP_FL_LAG_SWITCH		BIT(6)
+#define NFP_FL_LAG_RESET		BIT(7)
+
+/* LAG port state flags. */
+#define NFP_PORT_LAG_LINK_UP		BIT(0)
+#define NFP_PORT_LAG_TX_ENABLED		BIT(1)
+#define NFP_PORT_LAG_CHANGED		BIT(2)
+
+enum nfp_fl_lag_batch {
+	NFP_FL_LAG_BATCH_FIRST,
+	NFP_FL_LAG_BATCH_MEMBER,
+	NFP_FL_LAG_BATCH_FINISHED
+};
+
+/**
+ * struct nfp_flower_cmsg_lag_config - control message payload for LAG config
+ * @ctrl_flags:	Configuration flags
+ * @reserved:	Reserved for future use
+ * @ttl:	Time to live of packet - host always sets to 0xff
+ * @pkt_number:	Config message packet number - increment for each message
+ * @batch_ver:	Batch version of messages - increment for each batch of messages
+ * @group_id:	Group ID applicable
+ * @group_inst:	Group instance number - increment when group is reused
+ * @members:	Array of 32-bit words listing all active group members
+ */
+struct nfp_flower_cmsg_lag_config {
+	u8 ctrl_flags;
+	u8 reserved[2];
+	u8 ttl;
+	__be32 pkt_number;
+	__be32 batch_ver;
+	__be32 group_id;
+	__be32 group_inst;
+	__be32 members[];
+};
+
+/**
+ * struct nfp_fl_lag_group - list entry for each LAG group
+ * @group_id:		Assigned group ID for host/kernel sync
+ * @group_inst:		Group instance in case of ID reuse
+ * @list:		List entry
+ * @master_ndev:	Group master Netdev
+ * @dirty:		Marked if the group needs synced to HW
+ * @offloaded:		Marked if the group is currently offloaded to NIC
+ * @to_remove:		Marked if the group should be removed from NIC
+ * @to_destroy:		Marked if the group should be removed from driver
+ * @slave_cnt:		Number of slaves in group
+ */
+struct nfp_fl_lag_group {
+	unsigned int group_id;
+	u8 group_inst;
+	struct list_head list;
+	struct net_device *master_ndev;
+	bool dirty;
+	bool offloaded;
+	bool to_remove;
+	bool to_destroy;
+	unsigned int slave_cnt;
+};
+
+#define NFP_FL_LAG_PKT_NUMBER_MASK	GENMASK(30, 0)
+#define NFP_FL_LAG_VERSION_MASK		GENMASK(22, 0)
+#define NFP_FL_LAG_HOST_TTL		0xff
+
+/* Use this ID with zero members to ack a batch config */
+#define NFP_FL_LAG_SYNC_ID		0
+#define NFP_FL_LAG_GROUP_MIN		1 /* ID 0 reserved */
+#define NFP_FL_LAG_GROUP_MAX		32 /* IDs 1 to 31 are valid */
+
+/* wait for more config */
+#define NFP_FL_LAG_DELAY		(msecs_to_jiffies(2))
+
+static unsigned int nfp_fl_get_next_pkt_number(struct nfp_fl_lag *lag)
+{
+	lag->pkt_num++;
+	lag->pkt_num &= NFP_FL_LAG_PKT_NUMBER_MASK;
+
+	return lag->pkt_num;
+}
+
+static void nfp_fl_increment_version(struct nfp_fl_lag *lag)
+{
+	/* LSB is not considered by firmware so add 2 for each increment. */
+	lag->batch_ver += 2;
+	lag->batch_ver &= NFP_FL_LAG_VERSION_MASK;
+
+	/* Zero is reserved by firmware. */
+	if (!lag->batch_ver)
+		lag->batch_ver += 2;
+}
+
+static struct nfp_fl_lag_group *
+nfp_fl_lag_group_create(struct nfp_fl_lag *lag, struct net_device *master)
+{
+	struct nfp_fl_lag_group *group;
+	struct nfp_flower_priv *priv;
+	int id;
+
+	priv = container_of(lag, struct nfp_flower_priv, nfp_lag);
+
+	id = ida_simple_get(&lag->ida_handle, NFP_FL_LAG_GROUP_MIN,
+			    NFP_FL_LAG_GROUP_MAX, GFP_KERNEL);
+	if (id < 0) {
+		nfp_flower_cmsg_warn(priv->app,
+				     "No more bonding groups available\n");
+		return ERR_PTR(id);
+	}
+
+	group = kmalloc(sizeof(*group), GFP_KERNEL);
+	if (!group) {
+		ida_simple_remove(&lag->ida_handle, id);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	group->group_id = id;
+	group->master_ndev = master;
+	group->dirty = true;
+	group->offloaded = false;
+	group->to_remove = false;
+	group->to_destroy = false;
+	group->slave_cnt = 0;
+	group->group_inst = ++lag->global_inst;
+	list_add_tail(&group->list, &lag->group_list);
+
+	return group;
+}
+
+static struct nfp_fl_lag_group *
+nfp_fl_lag_find_group_for_master_with_lag(struct nfp_fl_lag *lag,
+					  struct net_device *master)
+{
+	struct nfp_fl_lag_group *entry;
+
+	if (!master)
+		return NULL;
+
+	list_for_each_entry(entry, &lag->group_list, list)
+		if (entry->master_ndev == master)
+			return entry;
+
+	return NULL;
+}
+
+static int
+nfp_fl_lag_config_group(struct nfp_fl_lag *lag, struct nfp_fl_lag_group *group,
+			struct net_device **active_members,
+			unsigned int member_cnt, enum nfp_fl_lag_batch *batch)
+{
+	struct nfp_flower_cmsg_lag_config *cmsg_payload;
+	struct nfp_flower_priv *priv;
+	unsigned long int flags;
+	unsigned int size, i;
+	struct sk_buff *skb;
+
+	priv = container_of(lag, struct nfp_flower_priv, nfp_lag);
+	size = sizeof(*cmsg_payload) + sizeof(__be32) * member_cnt;
+	skb = nfp_flower_cmsg_alloc(priv->app, size,
+				    NFP_FLOWER_CMSG_TYPE_LAG_CONFIG,
+				    GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	cmsg_payload = nfp_flower_cmsg_get_data(skb);
+	flags = 0;
+
+	/* Increment batch version for each new batch of config messages. */
+	if (*batch == NFP_FL_LAG_BATCH_FIRST) {
+		flags |= NFP_FL_LAG_FIRST;
+		nfp_fl_increment_version(lag);
+		*batch = NFP_FL_LAG_BATCH_MEMBER;
+	}
+
+	/* If it is a reset msg then it is also the end of the batch. */
+	if (lag->rst_cfg) {
+		flags |= NFP_FL_LAG_RESET;
+		*batch = NFP_FL_LAG_BATCH_FINISHED;
+	}
+
+	/* To signal the end of a batch, both the switch and last flags are set
+	 * and the the reserved SYNC group ID is used.
+	 */
+	if (*batch == NFP_FL_LAG_BATCH_FINISHED) {
+		flags |= NFP_FL_LAG_SWITCH | NFP_FL_LAG_LAST;
+		lag->rst_cfg = false;
+		cmsg_payload->group_id = cpu_to_be32(NFP_FL_LAG_SYNC_ID);
+		cmsg_payload->group_inst = 0;
+	} else {
+		cmsg_payload->group_id = cpu_to_be32(group->group_id);
+		cmsg_payload->group_inst = cpu_to_be32(group->group_inst);
+	}
+
+	cmsg_payload->reserved[0] = 0;
+	cmsg_payload->reserved[1] = 0;
+	cmsg_payload->ttl = NFP_FL_LAG_HOST_TTL;
+	cmsg_payload->ctrl_flags = flags;
+	cmsg_payload->batch_ver = cpu_to_be32(lag->batch_ver);
+	cmsg_payload->pkt_number = cpu_to_be32(nfp_fl_get_next_pkt_number(lag));
+
+	for (i = 0; i < member_cnt; i++)
+		cmsg_payload->members[i] =
+			cpu_to_be32(nfp_repr_get_port_id(active_members[i]));
+
+	nfp_ctrl_tx(priv->app->ctrl, skb);
+	return 0;
+}
+
+static void nfp_fl_lag_do_work(struct work_struct *work)
+{
+	enum nfp_fl_lag_batch batch = NFP_FL_LAG_BATCH_FIRST;
+	struct nfp_fl_lag_group *entry, *storage;
+	struct delayed_work *delayed_work;
+	struct nfp_flower_priv *priv;
+	struct nfp_fl_lag *lag;
+	int err;
+
+	delayed_work = to_delayed_work(work);
+	lag = container_of(delayed_work, struct nfp_fl_lag, work);
+	priv = container_of(lag, struct nfp_flower_priv, nfp_lag);
+
+	mutex_lock(&lag->lock);
+	list_for_each_entry_safe(entry, storage, &lag->group_list, list) {
+		struct net_device *iter_netdev, **acti_netdevs;
+		struct nfp_flower_repr_priv *repr_priv;
+		int active_count = 0, slaves = 0;
+		struct nfp_repr *repr;
+		unsigned long *flags;
+
+		if (entry->to_remove) {
+			/* Active count of 0 deletes group on hw. */
+			err = nfp_fl_lag_config_group(lag, entry, NULL, 0,
+						      &batch);
+			if (!err) {
+				entry->to_remove = false;
+				entry->offloaded = false;
+			} else {
+				nfp_flower_cmsg_warn(priv->app,
+						     "group delete failed\n");
+				schedule_delayed_work(&lag->work,
+						      NFP_FL_LAG_DELAY);
+				continue;
+			}
+
+			if (entry->to_destroy) {
+				ida_simple_remove(&lag->ida_handle,
+						  entry->group_id);
+				list_del(&entry->list);
+				kfree(entry);
+			}
+			continue;
+		}
+
+		acti_netdevs = kmalloc_array(entry->slave_cnt,
+					     sizeof(*acti_netdevs), GFP_KERNEL);
+
+		/* Include sanity check in the loop. It may be that a bond has
+		 * changed between processing the last notification and the
+		 * work queue triggering. If the number of slaves has changed
+		 * or it now contains netdevs that cannot be offloaded, ignore
+		 * the group until pending notifications are processed.
+		 */
+		rcu_read_lock();
+		for_each_netdev_in_bond_rcu(entry->master_ndev, iter_netdev) {
+			if (!nfp_netdev_is_nfp_repr(iter_netdev)) {
+				slaves = 0;
+				break;
+			}
+
+			repr = netdev_priv(iter_netdev);
+
+			if (repr->app != priv->app) {
+				slaves = 0;
+				break;
+			}
+
+			slaves++;
+			if (slaves > entry->slave_cnt)
+				break;
+
+			/* Check the ports for state changes. */
+			repr_priv = repr->app_priv;
+			flags = &repr_priv->lag_port_flags;
+
+			if (*flags & NFP_PORT_LAG_CHANGED) {
+				*flags &= ~NFP_PORT_LAG_CHANGED;
+				entry->dirty = true;
+			}
+
+			if ((*flags & NFP_PORT_LAG_TX_ENABLED) &&
+			    (*flags & NFP_PORT_LAG_LINK_UP))
+				acti_netdevs[active_count++] = iter_netdev;
+		}
+		rcu_read_unlock();
+
+		if (slaves != entry->slave_cnt || !entry->dirty) {
+			kfree(acti_netdevs);
+			continue;
+		}
+
+		err = nfp_fl_lag_config_group(lag, entry, acti_netdevs,
+					      active_count, &batch);
+		if (!err) {
+			entry->offloaded = true;
+			entry->dirty = false;
+		} else {
+			nfp_flower_cmsg_warn(priv->app,
+					     "group offload failed\n");
+			schedule_delayed_work(&lag->work, NFP_FL_LAG_DELAY);
+		}
+
+		kfree(acti_netdevs);
+	}
+
+	/* End the config batch if at least one packet has been batched. */
+	if (batch == NFP_FL_LAG_BATCH_MEMBER) {
+		batch = NFP_FL_LAG_BATCH_FINISHED;
+		err = nfp_fl_lag_config_group(lag, NULL, NULL, 0, &batch);
+		if (err)
+			nfp_flower_cmsg_warn(priv->app,
+					     "group batch end cmsg failed\n");
+	}
+
+	mutex_unlock(&lag->lock);
+}
+
+static void
+nfp_fl_lag_schedule_group_remove(struct nfp_fl_lag *lag,
+				 struct nfp_fl_lag_group *group)
+{
+	group->to_remove = true;
+
+	schedule_delayed_work(&lag->work, NFP_FL_LAG_DELAY);
+}
+
+static int
+nfp_fl_lag_schedule_group_delete(struct nfp_fl_lag *lag,
+				 struct net_device *master)
+{
+	struct nfp_fl_lag_group *group;
+
+	mutex_lock(&lag->lock);
+	group = nfp_fl_lag_find_group_for_master_with_lag(lag, master);
+	if (!group) {
+		mutex_unlock(&lag->lock);
+		return -ENOENT;
+	}
+
+	group->to_remove = true;
+	group->to_destroy = true;
+	mutex_unlock(&lag->lock);
+
+	schedule_delayed_work(&lag->work, NFP_FL_LAG_DELAY);
+	return 0;
+}
+
+static int
+nfp_fl_lag_changeupper_event(struct nfp_fl_lag *lag,
+			     struct netdev_notifier_changeupper_info *info)
+{
+	struct net_device *upper = info->upper_dev, *iter_netdev;
+	struct netdev_lag_upper_info *lag_upper_info;
+	struct nfp_fl_lag_group *group;
+	struct nfp_flower_priv *priv;
+	unsigned int slave_count = 0;
+	bool can_offload = true;
+	struct nfp_repr *repr;
+
+	if (!netif_is_lag_master(upper))
+		return 0;
+
+	priv = container_of(lag, struct nfp_flower_priv, nfp_lag);
+
+	rcu_read_lock();
+	for_each_netdev_in_bond_rcu(upper, iter_netdev) {
+		if (!nfp_netdev_is_nfp_repr(iter_netdev)) {
+			can_offload = false;
+			break;
+		}
+		repr = netdev_priv(iter_netdev);
+
+		/* Ensure all ports are created by the same app/on same card. */
+		if (repr->app != priv->app) {
+			can_offload = false;
+			break;
+		}
+
+		slave_count++;
+	}
+	rcu_read_unlock();
+
+	lag_upper_info = info->upper_info;
+
+	/* Firmware supports active/backup and L3/L4 hash bonds. */
+	if (lag_upper_info &&
+	    lag_upper_info->tx_type != NETDEV_LAG_TX_TYPE_ACTIVEBACKUP &&
+	    (lag_upper_info->tx_type != NETDEV_LAG_TX_TYPE_HASH ||
+	    (lag_upper_info->hash_type != NETDEV_LAG_HASH_L34 &&
+	    lag_upper_info->hash_type != NETDEV_LAG_HASH_E34))) {
+		can_offload = false;
+		nfp_flower_cmsg_warn(priv->app,
+				     "Unable to offload tx_type %u hash %u\n",
+				     lag_upper_info->tx_type,
+				     lag_upper_info->hash_type);
+	}
+
+	mutex_lock(&lag->lock);
+	group = nfp_fl_lag_find_group_for_master_with_lag(lag, upper);
+
+	if (slave_count == 0 || !can_offload) {
+		/* Cannot offload the group - remove if previously offloaded. */
+		if (group && group->offloaded)
+			nfp_fl_lag_schedule_group_remove(lag, group);
+
+		mutex_unlock(&lag->lock);
+		return 0;
+	}
+
+	if (!group) {
+		group = nfp_fl_lag_group_create(lag, upper);
+		if (IS_ERR(group)) {
+			mutex_unlock(&lag->lock);
+			return PTR_ERR(group);
+		}
+	}
+
+	group->dirty = true;
+	group->slave_cnt = slave_count;
+
+	/* Group may have been on queue for removal but is now offfloable. */
+	group->to_remove = false;
+	mutex_unlock(&lag->lock);
+
+	schedule_delayed_work(&lag->work, NFP_FL_LAG_DELAY);
+	return 0;
+}
+
+static int
+nfp_fl_lag_changels_event(struct nfp_fl_lag *lag, struct net_device *netdev,
+			  struct netdev_notifier_changelowerstate_info *info)
+{
+	struct netdev_lag_lower_state_info *lag_lower_info;
+	struct nfp_flower_repr_priv *repr_priv;
+	struct nfp_flower_priv *priv;
+	struct nfp_repr *repr;
+	unsigned long *flags;
+
+	if (!netif_is_lag_port(netdev) || !nfp_netdev_is_nfp_repr(netdev))
+		return 0;
+
+	lag_lower_info = info->lower_state_info;
+	if (!lag_lower_info)
+		return 0;
+
+	priv = container_of(lag, struct nfp_flower_priv, nfp_lag);
+	repr = netdev_priv(netdev);
+
+	/* Verify that the repr is associated with this app. */
+	if (repr->app != priv->app)
+		return 0;
+
+	repr_priv = repr->app_priv;
+	flags = &repr_priv->lag_port_flags;
+
+	mutex_lock(&lag->lock);
+	if (lag_lower_info->link_up)
+		*flags |= NFP_PORT_LAG_LINK_UP;
+	else
+		*flags &= ~NFP_PORT_LAG_LINK_UP;
+
+	if (lag_lower_info->tx_enabled)
+		*flags |= NFP_PORT_LAG_TX_ENABLED;
+	else
+		*flags &= ~NFP_PORT_LAG_TX_ENABLED;
+
+	*flags |= NFP_PORT_LAG_CHANGED;
+	mutex_unlock(&lag->lock);
+
+	schedule_delayed_work(&lag->work, NFP_FL_LAG_DELAY);
+	return 0;
+}
+
+static int
+nfp_fl_lag_netdev_event(struct notifier_block *nb, unsigned long event,
+			void *ptr)
+{
+	struct net_device *netdev;
+	struct nfp_fl_lag *lag;
+	int err;
+
+	netdev = netdev_notifier_info_to_dev(ptr);
+	lag = container_of(nb, struct nfp_fl_lag, lag_nb);
+
+	switch (event) {
+	case NETDEV_CHANGEUPPER:
+		err = nfp_fl_lag_changeupper_event(lag, ptr);
+		if (err)
+			return NOTIFY_BAD;
+		return NOTIFY_OK;
+	case NETDEV_CHANGELOWERSTATE:
+		err = nfp_fl_lag_changels_event(lag, netdev, ptr);
+		if (err)
+			return NOTIFY_BAD;
+		return NOTIFY_OK;
+	case NETDEV_UNREGISTER:
+		if (netif_is_bond_master(netdev)) {
+			err = nfp_fl_lag_schedule_group_delete(lag, netdev);
+			if (err)
+				return NOTIFY_BAD;
+			return NOTIFY_OK;
+		}
+	}
+
+	return NOTIFY_DONE;
+}
+
+int nfp_flower_lag_reset(struct nfp_fl_lag *lag)
+{
+	enum nfp_fl_lag_batch batch = NFP_FL_LAG_BATCH_FIRST;
+
+	lag->rst_cfg = true;
+	return nfp_fl_lag_config_group(lag, NULL, NULL, 0, &batch);
+}
+
+void nfp_flower_lag_init(struct nfp_fl_lag *lag)
+{
+	INIT_DELAYED_WORK(&lag->work, nfp_fl_lag_do_work);
+	INIT_LIST_HEAD(&lag->group_list);
+	mutex_init(&lag->lock);
+	ida_init(&lag->ida_handle);
+
+	/* 0 is a reserved batch version so increment to first valid value. */
+	nfp_fl_increment_version(lag);
+
+	lag->lag_nb.notifier_call = nfp_fl_lag_netdev_event;
+}
+
+void nfp_flower_lag_cleanup(struct nfp_fl_lag *lag)
+{
+	struct nfp_fl_lag_group *entry, *storage;
+
+	cancel_delayed_work_sync(&lag->work);
+
+	/* Remove all groups. */
+	mutex_lock(&lag->lock);
+	list_for_each_entry_safe(entry, storage, &lag->group_list, list) {
+		list_del(&entry->list);
+		kfree(entry);
+	}
+	mutex_unlock(&lag->lock);
+	mutex_destroy(&lag->lock);
+	ida_destroy(&lag->ida_handle);
+}
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c
index 202284b42fd9..19cfa162ac65 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.c
@@ -575,12 +575,14 @@ static int nfp_flower_init(struct nfp_app *app)
 	/* Tell the firmware that the driver supports lag. */
 	err = nfp_rtsym_write_le(app->pf->rtbl,
 				 "_abi_flower_balance_sync_enable", 1);
-	if (!err)
+	if (!err) {
 		app_priv->flower_ext_feats |= NFP_FL_FEATS_LAG;
-	else if (err == -ENOENT)
+		nfp_flower_lag_init(&app_priv->nfp_lag);
+	} else if (err == -ENOENT) {
 		nfp_warn(app->cpp, "LAG not supported by FW.\n");
-	else
+	} else {
 		goto err_cleanup_metadata;
+	}
 
 	return 0;
 
@@ -599,6 +601,9 @@ static void nfp_flower_clean(struct nfp_app *app)
 	skb_queue_purge(&app_priv->cmsg_skbs_low);
 	flush_work(&app_priv->cmsg_work);
 
+	if (app_priv->flower_ext_feats & NFP_FL_FEATS_LAG)
+		nfp_flower_lag_cleanup(&app_priv->nfp_lag);
+
 	nfp_flower_metadata_cleanup(app);
 	vfree(app->priv);
 	app->priv = NULL;
@@ -665,11 +670,29 @@ nfp_flower_repr_change_mtu(struct nfp_app *app, struct net_device *netdev,
 
 static int nfp_flower_start(struct nfp_app *app)
 {
+	struct nfp_flower_priv *app_priv = app->priv;
+	int err;
+
+	if (app_priv->flower_ext_feats & NFP_FL_FEATS_LAG) {
+		err = nfp_flower_lag_reset(&app_priv->nfp_lag);
+		if (err)
+			return err;
+
+		err = register_netdevice_notifier(&app_priv->nfp_lag.lag_nb);
+		if (err)
+			return err;
+	}
+
 	return nfp_tunnel_config_start(app);
 }
 
 static void nfp_flower_stop(struct nfp_app *app)
 {
+	struct nfp_flower_priv *app_priv = app->priv;
+
+	if (app_priv->flower_ext_feats & NFP_FL_FEATS_LAG)
+		unregister_netdevice_notifier(&app_priv->nfp_lag.lag_nb);
+
 	nfp_tunnel_config_stop(app);
 }
 
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h
index 7ce255705446..e03efb034948 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.h
@@ -43,6 +43,7 @@
 #include <net/pkt_cls.h>
 #include <net/tcp.h>
 #include <linux/workqueue.h>
+#include <linux/idr.h>
 
 struct net_device;
 struct nfp_app;
@@ -97,6 +98,30 @@ struct nfp_mtu_conf {
 	spinlock_t lock;
 };
 
+/**
+ * struct nfp_fl_lag - Flower APP priv data for link aggregation
+ * @lag_nb:		Notifier to track master/slave events
+ * @work:		Work queue for writing configs to the HW
+ * @lock:		Lock to protect lag_group_list
+ * @group_list:		List of all master/slave groups offloaded
+ * @ida_handle:		IDA to handle group ids
+ * @pkt_num:		Incremented for each config packet sent
+ * @batch_ver:		Incremented for each batch of config packets
+ * @global_inst:	Instance allocator for groups
+ * @rst_cfg:		Marker to reset HW LAG config
+ */
+struct nfp_fl_lag {
+	struct notifier_block lag_nb;
+	struct delayed_work work;
+	struct mutex lock;
+	struct list_head group_list;
+	struct ida ida_handle;
+	unsigned int pkt_num;
+	unsigned int batch_ver;
+	u8 global_inst;
+	bool rst_cfg;
+};
+
 /**
  * struct nfp_flower_priv - Flower APP per-vNIC priv data
  * @app:		Back pointer to app
@@ -129,6 +154,7 @@ struct nfp_mtu_conf {
  *			from firmware for repr reify
  * @reify_wait_queue:	wait queue for repr reify response counting
  * @mtu_conf:		Configuration of repr MTU value
+ * @nfp_lag:		Link aggregation data block
  */
 struct nfp_flower_priv {
 	struct nfp_app *app;
@@ -158,6 +184,7 @@ struct nfp_flower_priv {
 	atomic_t reify_replies;
 	wait_queue_head_t reify_wait_queue;
 	struct nfp_mtu_conf mtu_conf;
+	struct nfp_fl_lag nfp_lag;
 };
 
 /**
@@ -250,5 +277,8 @@ void nfp_tunnel_request_route(struct nfp_app *app, struct sk_buff *skb);
 void nfp_tunnel_keep_alive(struct nfp_app *app, struct sk_buff *skb);
 int nfp_flower_setup_tc_egress_cb(enum tc_setup_type type, void *type_data,
 				  void *cb_priv);
+void nfp_flower_lag_init(struct nfp_fl_lag *lag);
+void nfp_flower_lag_cleanup(struct nfp_fl_lag *lag);
+int nfp_flower_lag_reset(struct nfp_fl_lag *lag);
 
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 2e1cc5226b44100696dcab2d59d6fbc789db6153 Mon Sep 17 00:00:00 2001
From: John Hurley <john.hurley@netronome.com>
Date: Wed, 23 May 2018 19:22:54 -0700
Subject: nfp: flower: implement host cmsg handler for LAG

Adds the control message handler to synchronize offloaded group config
with that of the kernel. Such messages are sent from fw to driver and
feature the following 3 flags:

- Data: an attached cmsg could not be processed - store for retransmission
- Xon: FW can accept new messages - retransmit any stored cmsgs
- Sync: full sync requested so retransmit all kernel LAG group info

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/flower/cmsg.c   |  8 +-
 .../net/ethernet/netronome/nfp/flower/lag_conf.c   | 95 ++++++++++++++++++++++
 drivers/net/ethernet/netronome/nfp/flower/main.h   |  4 +
 3 files changed, 105 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
index 03aae2ed9983..cb8565222621 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
@@ -242,6 +242,7 @@ nfp_flower_cmsg_process_one_rx(struct nfp_app *app, struct sk_buff *skb)
 	struct nfp_flower_priv *app_priv = app->priv;
 	struct nfp_flower_cmsg_hdr *cmsg_hdr;
 	enum nfp_flower_cmsg_type_port type;
+	bool skb_stored = false;
 
 	cmsg_hdr = nfp_flower_cmsg_get_hdr(skb);
 
@@ -260,8 +261,10 @@ nfp_flower_cmsg_process_one_rx(struct nfp_app *app, struct sk_buff *skb)
 		nfp_tunnel_keep_alive(app, skb);
 		break;
 	case NFP_FLOWER_CMSG_TYPE_LAG_CONFIG:
-		if (app_priv->flower_ext_feats & NFP_FL_FEATS_LAG)
+		if (app_priv->flower_ext_feats & NFP_FL_FEATS_LAG) {
+			skb_stored = nfp_flower_lag_unprocessed_msg(app, skb);
 			break;
+		}
 		/* fall through */
 	default:
 		nfp_flower_cmsg_warn(app, "Cannot handle invalid repr control type %u\n",
@@ -269,7 +272,8 @@ nfp_flower_cmsg_process_one_rx(struct nfp_app *app, struct sk_buff *skb)
 		goto out;
 	}
 
-	dev_consume_skb_any(skb);
+	if (!skb_stored)
+		dev_consume_skb_any(skb);
 	return;
 out:
 	dev_kfree_skb_any(skb);
diff --git a/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c b/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c
index 35a700b879d7..a09fe2778250 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c
@@ -36,6 +36,9 @@
 /* LAG group config flags. */
 #define NFP_FL_LAG_LAST			BIT(1)
 #define NFP_FL_LAG_FIRST		BIT(2)
+#define NFP_FL_LAG_DATA			BIT(3)
+#define NFP_FL_LAG_XON			BIT(4)
+#define NFP_FL_LAG_SYNC			BIT(5)
 #define NFP_FL_LAG_SWITCH		BIT(6)
 #define NFP_FL_LAG_RESET		BIT(7)
 
@@ -108,6 +111,8 @@ struct nfp_fl_lag_group {
 /* wait for more config */
 #define NFP_FL_LAG_DELAY		(msecs_to_jiffies(2))
 
+#define NFP_FL_LAG_RETRANS_LIMIT	100 /* max retrans cmsgs to store */
+
 static unsigned int nfp_fl_get_next_pkt_number(struct nfp_fl_lag *lag)
 {
 	lag->pkt_num++;
@@ -360,6 +365,92 @@ static void nfp_fl_lag_do_work(struct work_struct *work)
 	mutex_unlock(&lag->lock);
 }
 
+static int
+nfp_fl_lag_put_unprocessed(struct nfp_fl_lag *lag, struct sk_buff *skb)
+{
+	struct nfp_flower_cmsg_lag_config *cmsg_payload;
+
+	cmsg_payload = nfp_flower_cmsg_get_data(skb);
+	if (be32_to_cpu(cmsg_payload->group_id) >= NFP_FL_LAG_GROUP_MAX)
+		return -EINVAL;
+
+	/* Drop cmsg retrans if storage limit is exceeded to prevent
+	 * overloading. If the fw notices that expected messages have not been
+	 * received in a given time block, it will request a full resync.
+	 */
+	if (skb_queue_len(&lag->retrans_skbs) >= NFP_FL_LAG_RETRANS_LIMIT)
+		return -ENOSPC;
+
+	__skb_queue_tail(&lag->retrans_skbs, skb);
+
+	return 0;
+}
+
+static void nfp_fl_send_unprocessed(struct nfp_fl_lag *lag)
+{
+	struct nfp_flower_priv *priv;
+	struct sk_buff *skb;
+
+	priv = container_of(lag, struct nfp_flower_priv, nfp_lag);
+
+	while ((skb = __skb_dequeue(&lag->retrans_skbs)))
+		nfp_ctrl_tx(priv->app->ctrl, skb);
+}
+
+bool nfp_flower_lag_unprocessed_msg(struct nfp_app *app, struct sk_buff *skb)
+{
+	struct nfp_flower_cmsg_lag_config *cmsg_payload;
+	struct nfp_flower_priv *priv = app->priv;
+	struct nfp_fl_lag_group *group_entry;
+	unsigned long int flags;
+	bool store_skb = false;
+	int err;
+
+	cmsg_payload = nfp_flower_cmsg_get_data(skb);
+	flags = cmsg_payload->ctrl_flags;
+
+	/* Note the intentional fall through below. If DATA and XON are both
+	 * set, the message will stored and sent again with the rest of the
+	 * unprocessed messages list.
+	 */
+
+	/* Store */
+	if (flags & NFP_FL_LAG_DATA)
+		if (!nfp_fl_lag_put_unprocessed(&priv->nfp_lag, skb))
+			store_skb = true;
+
+	/* Send stored */
+	if (flags & NFP_FL_LAG_XON)
+		nfp_fl_send_unprocessed(&priv->nfp_lag);
+
+	/* Resend all */
+	if (flags & NFP_FL_LAG_SYNC) {
+		/* To resend all config:
+		 * 1) Clear all unprocessed messages
+		 * 2) Mark all groups dirty
+		 * 3) Reset NFP group config
+		 * 4) Schedule a LAG config update
+		 */
+
+		__skb_queue_purge(&priv->nfp_lag.retrans_skbs);
+
+		mutex_lock(&priv->nfp_lag.lock);
+		list_for_each_entry(group_entry, &priv->nfp_lag.group_list,
+				    list)
+			group_entry->dirty = true;
+
+		err = nfp_flower_lag_reset(&priv->nfp_lag);
+		if (err)
+			nfp_flower_cmsg_warn(priv->app,
+					     "mem err in group reset msg\n");
+		mutex_unlock(&priv->nfp_lag.lock);
+
+		schedule_delayed_work(&priv->nfp_lag.work, 0);
+	}
+
+	return store_skb;
+}
+
 static void
 nfp_fl_lag_schedule_group_remove(struct nfp_fl_lag *lag,
 				 struct nfp_fl_lag_group *group)
@@ -565,6 +656,8 @@ void nfp_flower_lag_init(struct nfp_fl_lag *lag)
 	mutex_init(&lag->lock);
 	ida_init(&lag->ida_handle);
 
+	__skb_queue_head_init(&lag->retrans_skbs);
+
 	/* 0 is a reserved batch version so increment to first valid value. */
 	nfp_fl_increment_version(lag);
 
@@ -577,6 +670,8 @@ void nfp_flower_lag_cleanup(struct nfp_fl_lag *lag)
 
 	cancel_delayed_work_sync(&lag->work);
 
+	__skb_queue_purge(&lag->retrans_skbs);
+
 	/* Remove all groups. */
 	mutex_lock(&lag->lock);
 	list_for_each_entry_safe(entry, storage, &lag->group_list, list) {
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h
index e03efb034948..2fd75c155ccb 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.h
@@ -109,6 +109,8 @@ struct nfp_mtu_conf {
  * @batch_ver:		Incremented for each batch of config packets
  * @global_inst:	Instance allocator for groups
  * @rst_cfg:		Marker to reset HW LAG config
+ * @retrans_skbs:	Cmsgs that could not be processed by HW and require
+ *			retransmission
  */
 struct nfp_fl_lag {
 	struct notifier_block lag_nb;
@@ -120,6 +122,7 @@ struct nfp_fl_lag {
 	unsigned int batch_ver;
 	u8 global_inst;
 	bool rst_cfg;
+	struct sk_buff_head retrans_skbs;
 };
 
 /**
@@ -280,5 +283,6 @@ int nfp_flower_setup_tc_egress_cb(enum tc_setup_type type, void *type_data,
 void nfp_flower_lag_init(struct nfp_fl_lag *lag);
 void nfp_flower_lag_cleanup(struct nfp_fl_lag *lag);
 int nfp_flower_lag_reset(struct nfp_fl_lag *lag);
+bool nfp_flower_lag_unprocessed_msg(struct nfp_app *app, struct sk_buff *skb);
 
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 7e24a59311ea4a92c38f76756496b06293c50afb Mon Sep 17 00:00:00 2001
From: John Hurley <john.hurley@netronome.com>
Date: Wed, 23 May 2018 19:22:55 -0700
Subject: nfp: flower: compute link aggregation action

If the egress device of an offloaded rule is a LAG port, then encode the
output port to the NFP with a LAG identifier and the offloaded group ID.

A prelag action is also offloaded which must be the first action of the
series (although may appear after other pre-actions - e.g. tunnels). This
causes the FW to check that it has the necessary information to output to
the requested LAG port. If it does not, the packet is sent to the kernel
before any other actions are applied to it.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/flower/action.c | 131 +++++++++++++++++----
 drivers/net/ethernet/netronome/nfp/flower/cmsg.h   |  13 ++
 .../net/ethernet/netronome/nfp/flower/lag_conf.c   |  42 +++++++
 drivers/net/ethernet/netronome/nfp/flower/main.h   |   9 +-
 .../net/ethernet/netronome/nfp/flower/offload.c    |   2 +-
 5 files changed, 169 insertions(+), 28 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c
index 80df9a5d4217..4a6d2db75071 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/action.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/action.c
@@ -72,6 +72,42 @@ nfp_fl_push_vlan(struct nfp_fl_push_vlan *push_vlan,
 	push_vlan->vlan_tci = cpu_to_be16(tmp_push_vlan_tci);
 }
 
+static int
+nfp_fl_pre_lag(struct nfp_app *app, const struct tc_action *action,
+	       struct nfp_fl_payload *nfp_flow, int act_len)
+{
+	size_t act_size = sizeof(struct nfp_fl_pre_lag);
+	struct nfp_fl_pre_lag *pre_lag;
+	struct net_device *out_dev;
+	int err;
+
+	out_dev = tcf_mirred_dev(action);
+	if (!out_dev || !netif_is_lag_master(out_dev))
+		return 0;
+
+	if (act_len + act_size > NFP_FL_MAX_A_SIZ)
+		return -EOPNOTSUPP;
+
+	/* Pre_lag action must be first on action list.
+	 * If other actions already exist they need pushed forward.
+	 */
+	if (act_len)
+		memmove(nfp_flow->action_data + act_size,
+			nfp_flow->action_data, act_len);
+
+	pre_lag = (struct nfp_fl_pre_lag *)nfp_flow->action_data;
+	err = nfp_flower_lag_populate_pre_action(app, out_dev, pre_lag);
+	if (err)
+		return err;
+
+	pre_lag->head.jump_id = NFP_FL_ACTION_OPCODE_PRE_LAG;
+	pre_lag->head.len_lw = act_size >> NFP_FL_LW_SIZ;
+
+	nfp_flow->meta.shortcut = cpu_to_be32(NFP_FL_SC_ACT_NULL);
+
+	return act_size;
+}
+
 static bool nfp_fl_netdev_is_tunnel_type(struct net_device *out_dev,
 					 enum nfp_flower_tun_type tun_type)
 {
@@ -88,12 +124,13 @@ static bool nfp_fl_netdev_is_tunnel_type(struct net_device *out_dev,
 }
 
 static int
-nfp_fl_output(struct nfp_fl_output *output, const struct tc_action *action,
-	      struct nfp_fl_payload *nfp_flow, bool last,
-	      struct net_device *in_dev, enum nfp_flower_tun_type tun_type,
-	      int *tun_out_cnt)
+nfp_fl_output(struct nfp_app *app, struct nfp_fl_output *output,
+	      const struct tc_action *action, struct nfp_fl_payload *nfp_flow,
+	      bool last, struct net_device *in_dev,
+	      enum nfp_flower_tun_type tun_type, int *tun_out_cnt)
 {
 	size_t act_size = sizeof(struct nfp_fl_output);
+	struct nfp_flower_priv *priv = app->priv;
 	struct net_device *out_dev;
 	u16 tmp_flags;
 
@@ -118,6 +155,15 @@ nfp_fl_output(struct nfp_fl_output *output, const struct tc_action *action,
 		output->flags = cpu_to_be16(tmp_flags |
 					    NFP_FL_OUT_FLAGS_USE_TUN);
 		output->port = cpu_to_be32(NFP_FL_PORT_TYPE_TUN | tun_type);
+	} else if (netif_is_lag_master(out_dev) &&
+		   priv->flower_ext_feats & NFP_FL_FEATS_LAG) {
+		int gid;
+
+		output->flags = cpu_to_be16(tmp_flags);
+		gid = nfp_flower_lag_get_output_id(app, out_dev);
+		if (gid < 0)
+			return gid;
+		output->port = cpu_to_be32(NFP_FL_LAG_OUT | gid);
 	} else {
 		/* Set action output parameters. */
 		output->flags = cpu_to_be16(tmp_flags);
@@ -164,7 +210,7 @@ static struct nfp_fl_pre_tunnel *nfp_fl_pre_tunnel(char *act_data, int act_len)
 	struct nfp_fl_pre_tunnel *pre_tun_act;
 
 	/* Pre_tunnel action must be first on action list.
-	 * If other actions already exist they need pushed forward.
+	 * If other actions already exist they need to be pushed forward.
 	 */
 	if (act_len)
 		memmove(act_data + act_size, act_data, act_len);
@@ -443,42 +489,73 @@ nfp_fl_pedit(const struct tc_action *action, char *nfp_action, int *a_len)
 }
 
 static int
-nfp_flower_loop_action(const struct tc_action *a,
+nfp_flower_output_action(struct nfp_app *app, const struct tc_action *a,
+			 struct nfp_fl_payload *nfp_fl, int *a_len,
+			 struct net_device *netdev, bool last,
+			 enum nfp_flower_tun_type *tun_type, int *tun_out_cnt,
+			 int *out_cnt)
+{
+	struct nfp_flower_priv *priv = app->priv;
+	struct nfp_fl_output *output;
+	int err, prelag_size;
+
+	if (*a_len + sizeof(struct nfp_fl_output) > NFP_FL_MAX_A_SIZ)
+		return -EOPNOTSUPP;
+
+	output = (struct nfp_fl_output *)&nfp_fl->action_data[*a_len];
+	err = nfp_fl_output(app, output, a, nfp_fl, last, netdev, *tun_type,
+			    tun_out_cnt);
+	if (err)
+		return err;
+
+	*a_len += sizeof(struct nfp_fl_output);
+
+	if (priv->flower_ext_feats & NFP_FL_FEATS_LAG) {
+		/* nfp_fl_pre_lag returns -err or size of prelag action added.
+		 * This will be 0 if it is not egressing to a lag dev.
+		 */
+		prelag_size = nfp_fl_pre_lag(app, a, nfp_fl, *a_len);
+		if (prelag_size < 0)
+			return prelag_size;
+		else if (prelag_size > 0 && (!last || *out_cnt))
+			return -EOPNOTSUPP;
+
+		*a_len += prelag_size;
+	}
+	(*out_cnt)++;
+
+	return 0;
+}
+
+static int
+nfp_flower_loop_action(struct nfp_app *app, const struct tc_action *a,
 		       struct nfp_fl_payload *nfp_fl, int *a_len,
 		       struct net_device *netdev,
-		       enum nfp_flower_tun_type *tun_type, int *tun_out_cnt)
+		       enum nfp_flower_tun_type *tun_type, int *tun_out_cnt,
+		       int *out_cnt)
 {
 	struct nfp_fl_set_ipv4_udp_tun *set_tun;
 	struct nfp_fl_pre_tunnel *pre_tun;
 	struct nfp_fl_push_vlan *psh_v;
 	struct nfp_fl_pop_vlan *pop_v;
-	struct nfp_fl_output *output;
 	int err;
 
 	if (is_tcf_gact_shot(a)) {
 		nfp_fl->meta.shortcut = cpu_to_be32(NFP_FL_SC_ACT_DROP);
 	} else if (is_tcf_mirred_egress_redirect(a)) {
-		if (*a_len + sizeof(struct nfp_fl_output) > NFP_FL_MAX_A_SIZ)
-			return -EOPNOTSUPP;
-
-		output = (struct nfp_fl_output *)&nfp_fl->action_data[*a_len];
-		err = nfp_fl_output(output, a, nfp_fl, true, netdev, *tun_type,
-				    tun_out_cnt);
+		err = nfp_flower_output_action(app, a, nfp_fl, a_len, netdev,
+					       true, tun_type, tun_out_cnt,
+					       out_cnt);
 		if (err)
 			return err;
 
-		*a_len += sizeof(struct nfp_fl_output);
 	} else if (is_tcf_mirred_egress_mirror(a)) {
-		if (*a_len + sizeof(struct nfp_fl_output) > NFP_FL_MAX_A_SIZ)
-			return -EOPNOTSUPP;
-
-		output = (struct nfp_fl_output *)&nfp_fl->action_data[*a_len];
-		err = nfp_fl_output(output, a, nfp_fl, false, netdev, *tun_type,
-				    tun_out_cnt);
+		err = nfp_flower_output_action(app, a, nfp_fl, a_len, netdev,
+					       false, tun_type, tun_out_cnt,
+					       out_cnt);
 		if (err)
 			return err;
 
-		*a_len += sizeof(struct nfp_fl_output);
 	} else if (is_tcf_vlan(a) && tcf_vlan_action(a) == TCA_VLAN_ACT_POP) {
 		if (*a_len + sizeof(struct nfp_fl_pop_vlan) > NFP_FL_MAX_A_SIZ)
 			return -EOPNOTSUPP;
@@ -535,11 +612,12 @@ nfp_flower_loop_action(const struct tc_action *a,
 	return 0;
 }
 
-int nfp_flower_compile_action(struct tc_cls_flower_offload *flow,
+int nfp_flower_compile_action(struct nfp_app *app,
+			      struct tc_cls_flower_offload *flow,
 			      struct net_device *netdev,
 			      struct nfp_fl_payload *nfp_flow)
 {
-	int act_len, act_cnt, err, tun_out_cnt;
+	int act_len, act_cnt, err, tun_out_cnt, out_cnt;
 	enum nfp_flower_tun_type tun_type;
 	const struct tc_action *a;
 	LIST_HEAD(actions);
@@ -550,11 +628,12 @@ int nfp_flower_compile_action(struct tc_cls_flower_offload *flow,
 	act_len = 0;
 	act_cnt = 0;
 	tun_out_cnt = 0;
+	out_cnt = 0;
 
 	tcf_exts_to_list(flow->exts, &actions);
 	list_for_each_entry(a, &actions, list) {
-		err = nfp_flower_loop_action(a, nfp_flow, &act_len, netdev,
-					     &tun_type, &tun_out_cnt);
+		err = nfp_flower_loop_action(app, a, nfp_flow, &act_len, netdev,
+					     &tun_type, &tun_out_cnt, &out_cnt);
 		if (err)
 			return err;
 		act_cnt++;
diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
index 3a42a1fc55cb..4a7f3510a296 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
@@ -92,6 +92,7 @@
 #define NFP_FL_ACTION_OPCODE_SET_IPV6_DST	12
 #define NFP_FL_ACTION_OPCODE_SET_UDP		14
 #define NFP_FL_ACTION_OPCODE_SET_TCP		15
+#define NFP_FL_ACTION_OPCODE_PRE_LAG		16
 #define NFP_FL_ACTION_OPCODE_PRE_TUNNEL		17
 #define NFP_FL_ACTION_OPCODE_NUM		32
 
@@ -103,6 +104,9 @@
 #define NFP_FL_PUSH_VLAN_CFI		BIT(12)
 #define NFP_FL_PUSH_VLAN_VID		GENMASK(11, 0)
 
+/* LAG ports */
+#define NFP_FL_LAG_OUT			0xC0DE0000
+
 /* Tunnel ports */
 #define NFP_FL_PORT_TYPE_TUN		0x50000000
 #define NFP_FL_IPV4_TUNNEL_TYPE		GENMASK(7, 4)
@@ -177,6 +181,15 @@ struct nfp_fl_pop_vlan {
 	__be16 reserved;
 };
 
+struct nfp_fl_pre_lag {
+	struct nfp_fl_act_head head;
+	__be16 group_id;
+	u8 lag_version[3];
+	u8 instance;
+};
+
+#define NFP_FL_PRE_LAG_VER_OFF	8
+
 struct nfp_fl_pre_tunnel {
 	struct nfp_fl_act_head head;
 	__be16 reserved;
diff --git a/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c b/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c
index a09fe2778250..0c4c957717ea 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c
@@ -184,6 +184,48 @@ nfp_fl_lag_find_group_for_master_with_lag(struct nfp_fl_lag *lag,
 	return NULL;
 }
 
+int nfp_flower_lag_populate_pre_action(struct nfp_app *app,
+				       struct net_device *master,
+				       struct nfp_fl_pre_lag *pre_act)
+{
+	struct nfp_flower_priv *priv = app->priv;
+	struct nfp_fl_lag_group *group = NULL;
+	__be32 temp_vers;
+
+	mutex_lock(&priv->nfp_lag.lock);
+	group = nfp_fl_lag_find_group_for_master_with_lag(&priv->nfp_lag,
+							  master);
+	if (!group) {
+		mutex_unlock(&priv->nfp_lag.lock);
+		return -ENOENT;
+	}
+
+	pre_act->group_id = cpu_to_be16(group->group_id);
+	temp_vers = cpu_to_be32(priv->nfp_lag.batch_ver <<
+				NFP_FL_PRE_LAG_VER_OFF);
+	memcpy(pre_act->lag_version, &temp_vers, 3);
+	pre_act->instance = group->group_inst;
+	mutex_unlock(&priv->nfp_lag.lock);
+
+	return 0;
+}
+
+int nfp_flower_lag_get_output_id(struct nfp_app *app, struct net_device *master)
+{
+	struct nfp_flower_priv *priv = app->priv;
+	struct nfp_fl_lag_group *group = NULL;
+	int group_id = -ENOENT;
+
+	mutex_lock(&priv->nfp_lag.lock);
+	group = nfp_fl_lag_find_group_for_master_with_lag(&priv->nfp_lag,
+							  master);
+	if (group)
+		group_id = group->group_id;
+	mutex_unlock(&priv->nfp_lag.lock);
+
+	return group_id;
+}
+
 static int
 nfp_fl_lag_config_group(struct nfp_fl_lag *lag, struct nfp_fl_lag_group *group,
 			struct net_device **active_members,
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h
index 2fd75c155ccb..bbe5764d26cb 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.h
@@ -45,6 +45,7 @@
 #include <linux/workqueue.h>
 #include <linux/idr.h>
 
+struct nfp_fl_pre_lag;
 struct net_device;
 struct nfp_app;
 
@@ -253,7 +254,8 @@ int nfp_flower_compile_flow_match(struct tc_cls_flower_offload *flow,
 				  struct net_device *netdev,
 				  struct nfp_fl_payload *nfp_flow,
 				  enum nfp_flower_tun_type tun_type);
-int nfp_flower_compile_action(struct tc_cls_flower_offload *flow,
+int nfp_flower_compile_action(struct nfp_app *app,
+			      struct tc_cls_flower_offload *flow,
 			      struct net_device *netdev,
 			      struct nfp_fl_payload *nfp_flow);
 int nfp_compile_flow_metadata(struct nfp_app *app,
@@ -284,5 +286,10 @@ void nfp_flower_lag_init(struct nfp_fl_lag *lag);
 void nfp_flower_lag_cleanup(struct nfp_fl_lag *lag);
 int nfp_flower_lag_reset(struct nfp_fl_lag *lag);
 bool nfp_flower_lag_unprocessed_msg(struct nfp_app *app, struct sk_buff *skb);
+int nfp_flower_lag_populate_pre_action(struct nfp_app *app,
+				       struct net_device *master,
+				       struct nfp_fl_pre_lag *pre_act);
+int nfp_flower_lag_get_output_id(struct nfp_app *app,
+				 struct net_device *master);
 
 #endif
diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c
index 70ec9d821b91..c42e64f32333 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c
@@ -440,7 +440,7 @@ nfp_flower_add_offload(struct nfp_app *app, struct net_device *netdev,
 	if (err)
 		goto err_destroy_flow;
 
-	err = nfp_flower_compile_action(flow, netdev, flow_pay);
+	err = nfp_flower_compile_action(app, flow, netdev, flow_pay);
 	if (err)
 		goto err_destroy_flow;
 
-- 
cgit v1.2.3-59-g8ed1b


From 46dbf98c69f476a0928393310617f15a9c6469b5 Mon Sep 17 00:00:00 2001
From: Anilkumar Kolli <akolli@codeaurora.org>
Date: Wed, 23 May 2018 11:09:02 +0300
Subject: ath10k: add memory dump support for QCA9888 and QCA99X0

This patch adds firmware crash memory dump support for QCA9888 and QCA99X0.

Tested on:

QCA9888 firmware 10.4-3.5.3-00053
QCA99X0 firmware 10.4.1.00030-1

Signed-off-by: Anilkumar Kolli <akolli@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/coredump.c | 98 ++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)

diff --git a/drivers/net/wireless/ath/ath10k/coredump.c b/drivers/net/wireless/ath/ath10k/coredump.c
index f90cec0ebb1c..4d28063052fe 100644
--- a/drivers/net/wireless/ath/ath10k/coredump.c
+++ b/drivers/net/wireless/ath/ath10k/coredump.c
@@ -701,6 +701,89 @@ static const struct ath10k_mem_region qca988x_hw20_mem_regions[] = {
 	},
 };
 
+static const struct ath10k_mem_region qca99x0_hw20_mem_regions[] = {
+	{
+		.type = ATH10K_MEM_REGION_TYPE_DRAM,
+		.start = 0x400000,
+		.len = 0x60000,
+		.name = "DRAM",
+		.section_table = {
+			.sections = NULL,
+			.size = 0,
+		},
+	},
+	{
+		.type = ATH10K_MEM_REGION_TYPE_REG,
+		.start = 0x98000,
+		.len = 0x50000,
+		.name = "IRAM",
+		.section_table = {
+			.sections = NULL,
+			.size = 0,
+		},
+	},
+	{
+		.type = ATH10K_MEM_REGION_TYPE_IOSRAM,
+		.start = 0xC0000,
+		.len = 0x40000,
+		.name = "SRAM",
+		.section_table = {
+			.sections = NULL,
+			.size = 0,
+		},
+	},
+	{
+		.type = ATH10K_MEM_REGION_TYPE_IOREG,
+		.start = 0x30000,
+		.len = 0x7000,
+		.name = "APB REG 1",
+		.section_table = {
+			.sections = NULL,
+			.size = 0,
+		},
+	},
+	{
+		.type = ATH10K_MEM_REGION_TYPE_IOREG,
+		.start = 0x3f000,
+		.len = 0x3000,
+		.name = "APB REG 2",
+		.section_table = {
+			.sections = NULL,
+			.size = 0,
+		},
+	},
+	{
+		.type = ATH10K_MEM_REGION_TYPE_IOREG,
+		.start = 0x43000,
+		.len = 0x3000,
+		.name = "WIFI REG",
+		.section_table = {
+			.sections = NULL,
+			.size = 0,
+		},
+	},
+	{
+		.type = ATH10K_MEM_REGION_TYPE_IOREG,
+		.start = 0x4A000,
+		.len = 0x5000,
+		.name = "CE REG",
+		.section_table = {
+			.sections = NULL,
+			.size = 0,
+		},
+	},
+	{
+		.type = ATH10K_MEM_REGION_TYPE_IOREG,
+		.start = 0x80000,
+		.len = 0x6000,
+		.name = "SOC REG",
+		.section_table = {
+			.sections = NULL,
+			.size = 0,
+		},
+	},
+};
+
 static const struct ath10k_mem_region qca9984_hw10_mem_regions[] = {
 	{
 		.type = ATH10K_MEM_REGION_TYPE_DRAM,
@@ -848,6 +931,21 @@ static const struct ath10k_hw_mem_layout hw_mem_layouts[] = {
 			.size = ARRAY_SIZE(qca9984_hw10_mem_regions),
 		},
 	},
+	{
+		.hw_id = QCA9888_HW_2_0_DEV_VERSION,
+		.region_table = {
+			.regions = qca9984_hw10_mem_regions,
+			.size = ARRAY_SIZE(qca9984_hw10_mem_regions),
+		},
+	},
+	{
+		.hw_id = QCA99X0_HW_2_0_DEV_VERSION,
+		.region_table = {
+			.regions = qca99x0_hw20_mem_regions,
+			.size = ARRAY_SIZE(qca99x0_hw20_mem_regions),
+		},
+	},
+
 };
 
 static u32 ath10k_coredump_get_ramdump_size(struct ath10k *ar)
-- 
cgit v1.2.3-59-g8ed1b


From be8cce96f14dc925ecfb702be0392a52cf78adb5 Mon Sep 17 00:00:00 2001
From: Pradeep Kumar Chitrapu <pradeepc@codeaurora.org>
Date: Wed, 23 May 2018 11:09:09 +0300
Subject: ath10k: add support to configure channel dwell time

Configure channel dwell time from duration of the scan request
received from mac80211 when the duration is non-zero. When the
scan request does not have duration value, use the default ones,
the current implementation.

Corresponding flag NL80211_EXT_FEATURE_SET_SCAN_DWELL is
advertized.

Supported Chipsets:
 -QCA988X/QCA9887 PCI
 -QCA99X0/QCA9984/QCA9888/QCA4019 PCI
 -QCA6174/QCA9377 PCI/USB/SDIO
 -WCN3990 SNOC

Tested on QCA9984 with firmware ver 10.4-3.6-0010

Signed-off-by: Pradeep Kumar Chitrapu <pradeepc@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/core.h |  1 +
 drivers/net/wireless/ath/ath10k/mac.c  | 23 ++++++++++++++++++++---
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/core.h b/drivers/net/wireless/ath/ath10k/core.h
index e4ac8f2831fd..eb7ff1116442 100644
--- a/drivers/net/wireless/ath/ath10k/core.h
+++ b/drivers/net/wireless/ath/ath10k/core.h
@@ -44,6 +44,7 @@
 #define WO(_f)      ((_f##_OFFSET) >> 2)
 
 #define ATH10K_SCAN_ID 0
+#define ATH10K_SCAN_CHANNEL_SWITCH_WMI_EVT_OVERHEAD 10 /* msec */
 #define WMI_READY_TIMEOUT (5 * HZ)
 #define ATH10K_FLUSH_TIMEOUT_HZ (5 * HZ)
 #define ATH10K_CONNECTION_LOSS_HZ (3 * HZ)
diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index 487a7a7380fd..e3b67ffdab72 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -5675,6 +5675,7 @@ static int ath10k_hw_scan(struct ieee80211_hw *hw,
 	struct wmi_start_scan_arg arg;
 	int ret = 0;
 	int i;
+	u32 scan_timeout;
 
 	mutex_lock(&ar->conf_mutex);
 
@@ -5736,6 +5737,22 @@ static int ath10k_hw_scan(struct ieee80211_hw *hw,
 			arg.channels[i] = req->channels[i]->center_freq;
 	}
 
+	/* if duration is set, default dwell times will be overwritten */
+	if (req->duration) {
+		arg.dwell_time_active = req->duration;
+		arg.dwell_time_passive = req->duration;
+		arg.burst_duration_ms = req->duration;
+
+		scan_timeout = min_t(u32, arg.max_rest_time *
+				(arg.n_channels - 1) + (req->duration +
+				ATH10K_SCAN_CHANNEL_SWITCH_WMI_EVT_OVERHEAD) *
+				arg.n_channels, arg.max_scan_time + 200);
+
+	} else {
+		/* Add a 200ms margin to account for event/command processing */
+		scan_timeout = arg.max_scan_time + 200;
+	}
+
 	ret = ath10k_start_scan(ar, &arg);
 	if (ret) {
 		ath10k_warn(ar, "failed to start hw scan: %d\n", ret);
@@ -5744,10 +5761,8 @@ static int ath10k_hw_scan(struct ieee80211_hw *hw,
 		spin_unlock_bh(&ar->data_lock);
 	}
 
-	/* Add a 200ms margin to account for event/command processing */
 	ieee80211_queue_delayed_work(ar->hw, &ar->scan.timeout,
-				     msecs_to_jiffies(arg.max_scan_time +
-						      200));
+				     msecs_to_jiffies(scan_timeout));
 
 exit:
 	mutex_unlock(&ar->conf_mutex);
@@ -8364,6 +8379,8 @@ int ath10k_mac_register(struct ath10k *ar)
 	}
 
 	wiphy_ext_feature_set(ar->hw->wiphy, NL80211_EXT_FEATURE_VHT_IBSS);
+	wiphy_ext_feature_set(ar->hw->wiphy,
+			      NL80211_EXT_FEATURE_SET_SCAN_DWELL);
 
 	/*
 	 * on LL hardware queues are managed entirely by the FW
-- 
cgit v1.2.3-59-g8ed1b


From 699e2302c286a14afe7b7394151ce6c4e1790cc1 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:09:53 +0300
Subject: ath: Add regulatory mapping for Bahamas

The country code is used by the ath to detect the ISO 3166-1 alpha-2 name
and to select the correct conformance test limits (CTL) for a country. If
the country isn't available and it is still programmed in the EEPROM then
it will cause an error and stop the initialization with:

  Invalid EEPROM contents

The current CTL mappings for this country are:

* 2.4GHz: ETSI
* 5GHz: FCC

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd.h        | 1 +
 drivers/net/wireless/ath/regd_common.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/net/wireless/ath/regd.h b/drivers/net/wireless/ath/regd.h
index 5d80be213fac..f296ff838d2a 100644
--- a/drivers/net/wireless/ath/regd.h
+++ b/drivers/net/wireless/ath/regd.h
@@ -68,6 +68,7 @@ enum CountryCode {
 	CTRY_AUSTRALIA = 36,
 	CTRY_AUSTRIA = 40,
 	CTRY_AZERBAIJAN = 31,
+	CTRY_BAHAMAS = 44,
 	CTRY_BAHRAIN = 48,
 	CTRY_BANGLADESH = 50,
 	CTRY_BARBADOS = 52,
diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index bdd2b4d61f2f..cde0268cbed6 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -298,6 +298,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_AUSTRALIA2, FCC6_WORLD, "AU"},
 	{CTRY_AUSTRIA, ETSI1_WORLD, "AT"},
 	{CTRY_AZERBAIJAN, ETSI4_WORLD, "AZ"},
+	{CTRY_BAHAMAS, FCC3_WORLD, "BS"},
 	{CTRY_BAHRAIN, APL6_WORLD, "BH"},
 	{CTRY_BANGLADESH, NULL1_WORLD, "BD"},
 	{CTRY_BARBADOS, FCC2_WORLD, "BB"},
-- 
cgit v1.2.3-59-g8ed1b


From 9c790f2d234f65697e3b0948adbfdf36dbe63dd7 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:09:59 +0300
Subject: ath: Add regulatory mapping for Bermuda

The country code is used by the ath to detect the ISO 3166-1 alpha-2 name
and to select the correct conformance test limits (CTL) for a country. If
the country isn't available and it is still programmed in the EEPROM then
it will cause an error and stop the initialization with:

  Invalid EEPROM contents

The current CTL mappings for this country are:

* 2.4GHz: FCC
* 5GHz: FCC

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd.h        | 1 +
 drivers/net/wireless/ath/regd_common.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/net/wireless/ath/regd.h b/drivers/net/wireless/ath/regd.h
index f296ff838d2a..e7b43901d195 100644
--- a/drivers/net/wireless/ath/regd.h
+++ b/drivers/net/wireless/ath/regd.h
@@ -75,6 +75,7 @@ enum CountryCode {
 	CTRY_BELARUS = 112,
 	CTRY_BELGIUM = 56,
 	CTRY_BELIZE = 84,
+	CTRY_BERMUDA = 60,
 	CTRY_BOLIVIA = 68,
 	CTRY_BOSNIA_HERZ = 70,
 	CTRY_BRAZIL = 76,
diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index cde0268cbed6..a8c7f306fd7b 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -306,6 +306,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_BELGIUM, ETSI1_WORLD, "BE"},
 	{CTRY_BELGIUM2, ETSI4_WORLD, "BL"},
 	{CTRY_BELIZE, APL1_ETSIC, "BZ"},
+	{CTRY_BERMUDA, FCC3_FCCA, "BM"},
 	{CTRY_BOLIVIA, APL1_ETSIC, "BO"},
 	{CTRY_BOSNIA_HERZ, ETSI1_WORLD, "BA"},
 	{CTRY_BRAZIL, FCC3_WORLD, "BR"},
-- 
cgit v1.2.3-59-g8ed1b


From b840fa9123086c485788f9c8e96dd4c18e1154cb Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:10:12 +0300
Subject: ath: Add regulatory mapping for Kenya

The country code is used by the ath to detect the ISO 3166-1 alpha-2 name
and to select the correct conformance test limits (CTL) for a country. If
the country isn't available and it is still programmed in the EEPROM then
it will cause an error and stop the initialization with:

  Invalid EEPROM contents

The current CTL mappings for this country are:

* 2.4GHz: ETSI
* 5GHz: FCC

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index a8c7f306fd7b..981f604758b0 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -411,6 +411,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 
 	{CTRY_JORDAN, ETSI2_WORLD, "JO"},
 	{CTRY_KAZAKHSTAN, NULL1_WORLD, "KZ"},
+	{CTRY_KENYA, APL1_WORLD, "KE"},
 	{CTRY_KOREA_NORTH, APL9_WORLD, "KP"},
 	{CTRY_KOREA_ROC, APL9_WORLD, "KR"},
 	{CTRY_KOREA_ROC2, APL2_WORLD, "K2"},
-- 
cgit v1.2.3-59-g8ed1b


From a71c984bc4e5e4f1e94a4c362cce764834352d9a Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:10:19 +0300
Subject: ath: Add regulatory mapping for Mauritius

The country code is used by the ath to detect the ISO 3166-1 alpha-2 name
and to select the correct conformance test limits (CTL) for a country. If
the country isn't available and it is still programmed in the EEPROM then
it will cause an error and stop the initialization with:

  Invalid EEPROM contents

The current CTL mappings for this country are:

* 2.4GHz: ETSI
* 5GHz: ETSI

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd.h        | 1 +
 drivers/net/wireless/ath/regd_common.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/net/wireless/ath/regd.h b/drivers/net/wireless/ath/regd.h
index e7b43901d195..8fbf0a684bc9 100644
--- a/drivers/net/wireless/ath/regd.h
+++ b/drivers/net/wireless/ath/regd.h
@@ -138,6 +138,7 @@ enum CountryCode {
 	CTRY_MACEDONIA = 807,
 	CTRY_MALAYSIA = 458,
 	CTRY_MALTA = 470,
+	CTRY_MAURITIUS = 480,
 	CTRY_MEXICO = 484,
 	CTRY_MONACO = 492,
 	CTRY_MOROCCO = 504,
diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index 981f604758b0..3718bbcdd945 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -426,6 +426,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_MACEDONIA, NULL1_WORLD, "MK"},
 	{CTRY_MALAYSIA, APL8_WORLD, "MY"},
 	{CTRY_MALTA, ETSI1_WORLD, "MT"},
+	{CTRY_MAURITIUS, ETSI1_WORLD, "MU"},
 	{CTRY_MEXICO, FCC1_FCCA, "MX"},
 	{CTRY_MONACO, ETSI4_WORLD, "MC"},
 	{CTRY_MOROCCO, APL4_WORLD, "MA"},
-- 
cgit v1.2.3-59-g8ed1b


From a0a6f2a916dccc394404117534e9587866a7eefa Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:10:22 +0300
Subject: ath: Add regulatory mapping for Montenegro

The country code is used by the ath to detect the ISO 3166-1 alpha-2 name
and to select the correct conformance test limits (CTL) for a country. If
the country isn't available and it is still programmed in the EEPROM then
it will cause an error and stop the initialization with:

  Invalid EEPROM contents

The current CTL mappings for this country are:

* 2.4GHz: ETSI
* 5GHz: ETSI

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd.h        | 1 +
 drivers/net/wireless/ath/regd_common.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/net/wireless/ath/regd.h b/drivers/net/wireless/ath/regd.h
index 8fbf0a684bc9..8edf6d894519 100644
--- a/drivers/net/wireless/ath/regd.h
+++ b/drivers/net/wireless/ath/regd.h
@@ -141,6 +141,7 @@ enum CountryCode {
 	CTRY_MAURITIUS = 480,
 	CTRY_MEXICO = 484,
 	CTRY_MONACO = 492,
+	CTRY_MONTENEGRO = 499,
 	CTRY_MOROCCO = 504,
 	CTRY_NEPAL = 524,
 	CTRY_NETHERLANDS = 528,
diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index 3718bbcdd945..ef636831812a 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -429,6 +429,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_MAURITIUS, ETSI1_WORLD, "MU"},
 	{CTRY_MEXICO, FCC1_FCCA, "MX"},
 	{CTRY_MONACO, ETSI4_WORLD, "MC"},
+	{CTRY_MONTENEGRO, ETSI1_WORLD, "ME"},
 	{CTRY_MOROCCO, APL4_WORLD, "MA"},
 	{CTRY_NEPAL, APL1_WORLD, "NP"},
 	{CTRY_NETHERLANDS, ETSI1_WORLD, "NL"},
-- 
cgit v1.2.3-59-g8ed1b


From a20f1338c5de7a62a9b9b4353618e2c336251b3a Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:10:26 +0300
Subject: ath: Add regulatory mapping for Nicaragua

The country code is used by the ath to detect the ISO 3166-1 alpha-2 name
and to select the correct conformance test limits (CTL) for a country. If
the country isn't available and it is still programmed in the EEPROM then
it will cause an error and stop the initialization with:

  Invalid EEPROM contents

The current CTL mappings for this country are:

* 2.4GHz: FCC
* 5GHz: FCC

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index ef636831812a..6a14049f8b8f 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -435,6 +435,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_NETHERLANDS, ETSI1_WORLD, "NL"},
 	{CTRY_NETHERLANDS_ANTILLES, ETSI1_WORLD, "AN"},
 	{CTRY_NEW_ZEALAND, FCC2_ETSIC, "NZ"},
+	{CTRY_NICARAGUA, FCC3_FCCA, "NI"},
 	{CTRY_NORWAY, ETSI1_WORLD, "NO"},
 	{CTRY_OMAN, FCC3_WORLD, "OM"},
 	{CTRY_PAKISTAN, NULL1_WORLD, "PK"},
-- 
cgit v1.2.3-59-g8ed1b


From 67a956682472c9fef28f1e171a10428c6e7a800f Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:10:29 +0300
Subject: ath: Add regulatory mapping for Paraguya

The country code is used by the ath to detect the ISO 3166-1 alpha-2 name
and to select the correct conformance test limits (CTL) for a country. If
the country isn't available and it is still programmed in the EEPROM then
it will cause an error and stop the initialization with:

  Invalid EEPROM contents

The current CTL mappings for this country are:

* 2.4GHz: ETSI
* 5GHz: FCC

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index 6a14049f8b8f..ad085c795662 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -441,6 +441,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_PAKISTAN, NULL1_WORLD, "PK"},
 	{CTRY_PANAMA, FCC1_FCCA, "PA"},
 	{CTRY_PAPUA_NEW_GUINEA, FCC1_WORLD, "PG"},
+	{CTRY_PARAGUAY, FCC3_WORLD, "PY"},
 	{CTRY_PERU, APL1_WORLD, "PE"},
 	{CTRY_PHILIPPINES, APL1_WORLD, "PH"},
 	{CTRY_POLAND, ETSI1_WORLD, "PL"},
-- 
cgit v1.2.3-59-g8ed1b


From 2a3169a54bb53717928392a04fb84deb765b51f1 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:10:43 +0300
Subject: ath: Add regulatory mapping for Serbia

The country code is used by the ath to detect the ISO 3166-1 alpha-2 name
and to select the correct conformance test limits (CTL) for a country. If
the country isn't available and it is still programmed in the EEPROM then
it will cause an error and stop the initialization with:

  Invalid EEPROM contents

The current CTL mappings for this country are:

* 2.4GHz: ETSI
* 5GHz: ETSI

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd.h        | 1 +
 drivers/net/wireless/ath/regd_common.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/net/wireless/ath/regd.h b/drivers/net/wireless/ath/regd.h
index 8edf6d894519..58afd500bdb3 100644
--- a/drivers/net/wireless/ath/regd.h
+++ b/drivers/net/wireless/ath/regd.h
@@ -163,6 +163,7 @@ enum CountryCode {
 	CTRY_ROMANIA = 642,
 	CTRY_RUSSIA = 643,
 	CTRY_SAUDI_ARABIA = 682,
+	CTRY_SERBIA = 688,
 	CTRY_SERBIA_MONTENEGRO = 891,
 	CTRY_SINGAPORE = 702,
 	CTRY_SLOVAKIA = 703,
diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index ad085c795662..f3764ac9e4e3 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -451,6 +451,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_ROMANIA, NULL1_WORLD, "RO"},
 	{CTRY_RUSSIA, NULL1_WORLD, "RU"},
 	{CTRY_SAUDI_ARABIA, NULL1_WORLD, "SA"},
+	{CTRY_SERBIA, ETSI1_WORLD, "RS"},
 	{CTRY_SERBIA_MONTENEGRO, ETSI1_WORLD, "CS"},
 	{CTRY_SINGAPORE, APL6_WORLD, "SG"},
 	{CTRY_SLOVAKIA, ETSI1_WORLD, "SK"},
-- 
cgit v1.2.3-59-g8ed1b


From 667ddac5745fb9fddfe8f7fd2523070f50bd4442 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:10:48 +0300
Subject: ath: Add regulatory mapping for Tanzania

The country code is used by the ath to detect the ISO 3166-1 alpha-2 name
and to select the correct conformance test limits (CTL) for a country. If
the country isn't available and it is still programmed in the EEPROM then
it will cause an error and stop the initialization with:

  Invalid EEPROM contents

The current CTL mappings for this country are:

* 2.4GHz: ETSI
* 5GHz: FCC

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd.h        | 1 +
 drivers/net/wireless/ath/regd_common.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/net/wireless/ath/regd.h b/drivers/net/wireless/ath/regd.h
index 58afd500bdb3..894be37344aa 100644
--- a/drivers/net/wireless/ath/regd.h
+++ b/drivers/net/wireless/ath/regd.h
@@ -175,6 +175,7 @@ enum CountryCode {
 	CTRY_SWITZERLAND = 756,
 	CTRY_SYRIA = 760,
 	CTRY_TAIWAN = 158,
+	CTRY_TANZANIA = 834,
 	CTRY_THAILAND = 764,
 	CTRY_TRINIDAD_Y_TOBAGO = 780,
 	CTRY_TUNISIA = 788,
diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index f3764ac9e4e3..64f004ec0ef2 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -463,6 +463,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_SWITZERLAND, ETSI1_WORLD, "CH"},
 	{CTRY_SYRIA, NULL1_WORLD, "SY"},
 	{CTRY_TAIWAN, APL3_FCCA, "TW"},
+	{CTRY_TANZANIA, APL1_WORLD, "TZ"},
 	{CTRY_THAILAND, FCC3_WORLD, "TH"},
 	{CTRY_TRINIDAD_Y_TOBAGO, FCC3_WORLD, "TT"},
 	{CTRY_TUNISIA, ETSI3_WORLD, "TN"},
-- 
cgit v1.2.3-59-g8ed1b


From 1ea3986ad2bc72081c69f3fbc1e5e0eeb3c44f17 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:10:54 +0300
Subject: ath: Add regulatory mapping for Uganda

The country code is used by the ath to detect the ISO 3166-1 alpha-2 name
and to select the correct conformance test limits (CTL) for a country. If
the country isn't available and it is still programmed in the EEPROM then
it will cause an error and stop the initialization with:

  Invalid EEPROM contents

The current CTL mappings for this country are:

* 2.4GHz: ETSI
* 5GHz: FCC

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd.h        | 1 +
 drivers/net/wireless/ath/regd_common.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/net/wireless/ath/regd.h b/drivers/net/wireless/ath/regd.h
index 894be37344aa..d73e45e26547 100644
--- a/drivers/net/wireless/ath/regd.h
+++ b/drivers/net/wireless/ath/regd.h
@@ -181,6 +181,7 @@ enum CountryCode {
 	CTRY_TUNISIA = 788,
 	CTRY_TURKEY = 792,
 	CTRY_UAE = 784,
+	CTRY_UGANDA = 800,
 	CTRY_UKRAINE = 804,
 	CTRY_UNITED_KINGDOM = 826,
 	CTRY_UNITED_STATES = 840,
diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index 64f004ec0ef2..f7f0efd437b8 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -468,6 +468,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_TRINIDAD_Y_TOBAGO, FCC3_WORLD, "TT"},
 	{CTRY_TUNISIA, ETSI3_WORLD, "TN"},
 	{CTRY_TURKEY, ETSI3_WORLD, "TR"},
+	{CTRY_UGANDA, FCC3_WORLD, "UG"},
 	{CTRY_UKRAINE, NULL1_WORLD, "UA"},
 	{CTRY_UAE, NULL1_WORLD, "AE"},
 	{CTRY_UNITED_KINGDOM, ETSI1_WORLD, "GB"},
-- 
cgit v1.2.3-59-g8ed1b


From 4f183687e3fad3ce0e06e38976cad81bc4541990 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:11:05 +0300
Subject: ath: Add regulatory mapping for APL2_FCCA

The regdomain code is used to select the correct the correct conformance
test limits (CTL) for a country. If the regdomain code isn't available and
it is still programmed in the EEPROM then it will cause an error and stop
the initialization with:

  Invalid EEPROM contents

The current CTL mappings for this regdomain code are:

* 2.4GHz: FCC
* 5GHz: FCC

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index f7f0efd437b8..969dc12506ee 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -59,6 +59,7 @@ enum EnumRd {
 	MKK1_MKKA1 = 0x4A,
 	MKK1_MKKA2 = 0x4B,
 	MKK1_MKKC = 0x4C,
+	APL2_FCCA = 0x4D,
 
 	APL3_FCCA = 0x50,
 	APL1_WORLD = 0x52,
@@ -188,6 +189,7 @@ static struct reg_dmn_pair_mapping regDomainPairs[] = {
 	{FCC1_FCCA, CTL_FCC, CTL_FCC},
 	{APL1_WORLD, CTL_FCC, CTL_ETSI},
 	{APL2_WORLD, CTL_FCC, CTL_ETSI},
+	{APL2_FCCA, CTL_FCC, CTL_FCC},
 	{APL3_WORLD, CTL_FCC, CTL_ETSI},
 	{APL4_WORLD, CTL_FCC, CTL_ETSI},
 	{APL5_WORLD, CTL_FCC, CTL_ETSI},
-- 
cgit v1.2.3-59-g8ed1b


From 9ba8df0c52b3e6baa436374b429d3d73bd09a320 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:11:14 +0300
Subject: ath: Add regulatory mapping for APL13_WORLD

The regdomain code is used to select the correct the correct conformance
test limits (CTL) for a country. If the regdomain code isn't available and
it is still programmed in the EEPROM then it will cause an error and stop
the initialization with:

  Invalid EEPROM contents

The current CTL mappings for this regdomain code are:

* 2.4GHz: ETSI
* 5GHz: ETSI

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index 969dc12506ee..f3969d05c816 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -68,6 +68,7 @@ enum EnumRd {
 	APL1_ETSIC = 0x55,
 	APL2_ETSIC = 0x56,
 	APL5_WORLD = 0x58,
+	APL13_WORLD = 0x5A,
 	APL6_WORLD = 0x5B,
 	APL7_FCCA = 0x5C,
 	APL8_WORLD = 0x5D,
@@ -193,6 +194,7 @@ static struct reg_dmn_pair_mapping regDomainPairs[] = {
 	{APL3_WORLD, CTL_FCC, CTL_ETSI},
 	{APL4_WORLD, CTL_FCC, CTL_ETSI},
 	{APL5_WORLD, CTL_FCC, CTL_ETSI},
+	{APL13_WORLD, CTL_ETSI, CTL_ETSI},
 	{APL6_WORLD, CTL_ETSI, CTL_ETSI},
 	{APL8_WORLD, CTL_ETSI, CTL_ETSI},
 	{APL9_WORLD, CTL_ETSI, CTL_ETSI},
-- 
cgit v1.2.3-59-g8ed1b


From 45faf6e096da8bb80e1ddf8c08a26a9601d9469e Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:11:18 +0300
Subject: ath: Add regulatory mapping for ETSI8_WORLD

The regdomain code is used to select the correct the correct conformance
test limits (CTL) for a country. If the regdomain code isn't available and
it is still programmed in the EEPROM then it will cause an error and stop
the initialization with:

  Invalid EEPROM contents

The current CTL mappings for this regdomain code are:

* 2.4GHz: ETSI
* 5GHz: ETSI

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index f3969d05c816..95a378e6a88b 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -44,6 +44,7 @@ enum EnumRd {
 	ETSI4_ETSIC = 0x38,
 	ETSI5_WORLD = 0x39,
 	ETSI6_WORLD = 0x34,
+	ETSI8_WORLD = 0x3D,
 	ETSI_RESERVED = 0x33,
 
 	MKK1_MKKA = 0x40,
@@ -181,6 +182,7 @@ static struct reg_dmn_pair_mapping regDomainPairs[] = {
 	{ETSI4_WORLD, CTL_ETSI, CTL_ETSI},
 	{ETSI5_WORLD, CTL_ETSI, CTL_ETSI},
 	{ETSI6_WORLD, CTL_ETSI, CTL_ETSI},
+	{ETSI8_WORLD, CTL_ETSI, CTL_ETSI},
 
 	/* XXX: For ETSI3_ETSIA, Was NO_CTL meant for the 2 GHz band ? */
 	{ETSI3_ETSIA, CTL_ETSI, CTL_ETSI},
-- 
cgit v1.2.3-59-g8ed1b


From 897fab6c5d901398beaeb4d764b1816fbed9c488 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:11:24 +0300
Subject: ath: Add regulatory mapping for ETSI9_WORLD

The regdomain code is used to select the correct the correct conformance
test limits (CTL) for a country. If the regdomain code isn't available and
it is still programmed in the EEPROM then it will cause an error and stop
the initialization with:

  Invalid EEPROM contents

The current CTL mappings for this regdomain code are:

* 2.4GHz: ETSI
* 5GHz: ETSI

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index 95a378e6a88b..66932c55d352 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -45,6 +45,7 @@ enum EnumRd {
 	ETSI5_WORLD = 0x39,
 	ETSI6_WORLD = 0x34,
 	ETSI8_WORLD = 0x3D,
+	ETSI9_WORLD = 0x3E,
 	ETSI_RESERVED = 0x33,
 
 	MKK1_MKKA = 0x40,
@@ -183,6 +184,7 @@ static struct reg_dmn_pair_mapping regDomainPairs[] = {
 	{ETSI5_WORLD, CTL_ETSI, CTL_ETSI},
 	{ETSI6_WORLD, CTL_ETSI, CTL_ETSI},
 	{ETSI8_WORLD, CTL_ETSI, CTL_ETSI},
+	{ETSI9_WORLD, CTL_ETSI, CTL_ETSI},
 
 	/* XXX: For ETSI3_ETSIA, Was NO_CTL meant for the 2 GHz band ? */
 	{ETSI3_ETSIA, CTL_ETSI, CTL_ETSI},
-- 
cgit v1.2.3-59-g8ed1b


From 01fb2994a98dc72c8818c274f7b5983d5dd885c7 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:11:30 +0300
Subject: ath: Add regulatory mapping for FCC3_ETSIC

The regdomain code is used to select the correct the correct conformance
test limits (CTL) for a country. If the regdomain code isn't available and
it is still programmed in the EEPROM then it will cause an error and stop
the initialization with:

  Invalid EEPROM contents

The current CTL mappings for this regdomain code are:

* 2.4GHz: ETSI
* 5GHz: FCC

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index 66932c55d352..437f5dca015a 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -35,6 +35,7 @@ enum EnumRd {
 	FRANCE_RES = 0x31,
 	FCC3_FCCA = 0x3A,
 	FCC3_WORLD = 0x3B,
+	FCC3_ETSIC = 0x3F,
 
 	ETSI1_WORLD = 0x37,
 	ETSI3_ETSIA = 0x32,
@@ -172,6 +173,7 @@ static struct reg_dmn_pair_mapping regDomainPairs[] = {
 	{FCC2_ETSIC, CTL_FCC, CTL_ETSI},
 	{FCC3_FCCA, CTL_FCC, CTL_FCC},
 	{FCC3_WORLD, CTL_FCC, CTL_ETSI},
+	{FCC3_ETSIC, CTL_FCC, CTL_ETSI},
 	{FCC4_FCCA, CTL_FCC, CTL_FCC},
 	{FCC5_FCCA, CTL_FCC, CTL_FCC},
 	{FCC6_FCCA, CTL_FCC, CTL_FCC},
-- 
cgit v1.2.3-59-g8ed1b


From fed8f5e8303757dcd6b5a149eb6d6b23b3440e2d Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:11:36 +0300
Subject: ath: Map Albania to ETSI1_WORLD

The regdomain code is used to select the correct the correct conformance
test limits (CTL) for a country. If the regdomain code isn't correctly
mapped to the actual CTL entries in EEPROM then it could happen that the
device violates the regulations. But it can also happen that the device is
then not able to be used with its full txpower on all rates.

The CTL mappings for this regdomain code were now changed to:

* 2.4GHz: ETSI
* 5GHz: NO_CTL -> ETSI

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index 437f5dca015a..ea0d53e21bae 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -299,7 +299,7 @@ static struct reg_dmn_pair_mapping regDomainPairs[] = {
 static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_DEBUG, NO_ENUMRD, "DB"},
 	{CTRY_DEFAULT, FCC1_FCCA, "CO"},
-	{CTRY_ALBANIA, NULL1_WORLD, "AL"},
+	{CTRY_ALBANIA, ETSI1_WORLD, "AL"},
 	{CTRY_ALGERIA, NULL1_WORLD, "DZ"},
 	{CTRY_ARGENTINA, FCC3_WORLD, "AR"},
 	{CTRY_ARMENIA, ETSI4_WORLD, "AM"},
-- 
cgit v1.2.3-59-g8ed1b


From 485daee92bde2b7ec85cb36ab6fdd0f8d2925af7 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:11:45 +0300
Subject: ath: Map Algeria to APL13_WORLD

The regdomain code is used to select the correct the correct conformance
test limits (CTL) for a country. If the regdomain code isn't correctly
mapped to the actual CTL entries in EEPROM then it could happen that the
device violates the regulations. But it can also happen that the device is
then not able to be used with its full txpower on all rates.

The CTL mappings for this regdomain code were now changed to:

* 2.4GHz: ETSI
* 5GHz: NO_CTL -> ETSI

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index ea0d53e21bae..37dfa164daf0 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -300,7 +300,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_DEBUG, NO_ENUMRD, "DB"},
 	{CTRY_DEFAULT, FCC1_FCCA, "CO"},
 	{CTRY_ALBANIA, ETSI1_WORLD, "AL"},
-	{CTRY_ALGERIA, NULL1_WORLD, "DZ"},
+	{CTRY_ALGERIA, APL13_WORLD, "DZ"},
 	{CTRY_ARGENTINA, FCC3_WORLD, "AR"},
 	{CTRY_ARMENIA, ETSI4_WORLD, "AM"},
 	{CTRY_ARUBA, ETSI1_WORLD, "AW"},
-- 
cgit v1.2.3-59-g8ed1b


From 8120cc0b215c5100c10a1c9aef19532a28d83d62 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:11:49 +0300
Subject: ath: Map Australia to FCC3_WORLD

The regdomain code is used to select the correct the correct conformance
test limits (CTL) for a country. If the regdomain code isn't correctly
mapped to the actual CTL entries in EEPROM then it could happen that the
device violates the regulations. But it can also happen that the device is
then not able to be used with its full txpower on all rates.

This change itself doesn't change the selected CTL of this country and is
only required to stay in sync with the QCA mappings.

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index 37dfa164daf0..3a6110ba637d 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -304,7 +304,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_ARGENTINA, FCC3_WORLD, "AR"},
 	{CTRY_ARMENIA, ETSI4_WORLD, "AM"},
 	{CTRY_ARUBA, ETSI1_WORLD, "AW"},
-	{CTRY_AUSTRALIA, FCC2_WORLD, "AU"},
+	{CTRY_AUSTRALIA, FCC3_WORLD, "AU"},
 	{CTRY_AUSTRALIA2, FCC6_WORLD, "AU"},
 	{CTRY_AUSTRIA, ETSI1_WORLD, "AT"},
 	{CTRY_AZERBAIJAN, ETSI4_WORLD, "AZ"},
-- 
cgit v1.2.3-59-g8ed1b


From 1507b32891c0f1b5041a61d3c41e93d54602f76d Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:11:55 +0300
Subject: ath: Map Bangladesh to APL1_WORLD

The regdomain code is used to select the correct the correct conformance
test limits (CTL) for a country. If the regdomain code isn't correctly
mapped to the actual CTL entries in EEPROM then it could happen that the
device violates the regulations. But it can also happen that the device is
then not able to be used with its full txpower on all rates.

The CTL mappings for this regdomain code were now changed to:

* 2.4GHz: ETSI
* 5GHz: NO_CTL -> FCC

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index 3a6110ba637d..d531ae0c9dd6 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -310,7 +310,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_AZERBAIJAN, ETSI4_WORLD, "AZ"},
 	{CTRY_BAHAMAS, FCC3_WORLD, "BS"},
 	{CTRY_BAHRAIN, APL6_WORLD, "BH"},
-	{CTRY_BANGLADESH, NULL1_WORLD, "BD"},
+	{CTRY_BANGLADESH, APL1_WORLD, "BD"},
 	{CTRY_BARBADOS, FCC2_WORLD, "BB"},
 	{CTRY_BELARUS, ETSI1_WORLD, "BY"},
 	{CTRY_BELGIUM, ETSI1_WORLD, "BE"},
-- 
cgit v1.2.3-59-g8ed1b


From 37090d9447050fa6234fa29ecea19165d0b41f64 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:12:01 +0300
Subject: ath: Map Brunei Darussalam to APL6_WORLD

The regdomain code is used to select the correct the correct conformance
test limits (CTL) for a country. If the regdomain code isn't correctly
mapped to the actual CTL entries in EEPROM then it could happen that the
device violates the regulations. But it can also happen that the device is
then not able to be used with its full txpower on all rates.

The CTL mappings for this regdomain code were now changed to:

* 2.4GHz: ETSI
* 5GHz: FCC -> ETSI

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index d531ae0c9dd6..b8f321664078 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -320,7 +320,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_BOLIVIA, APL1_ETSIC, "BO"},
 	{CTRY_BOSNIA_HERZ, ETSI1_WORLD, "BA"},
 	{CTRY_BRAZIL, FCC3_WORLD, "BR"},
-	{CTRY_BRUNEI_DARUSSALAM, APL1_WORLD, "BN"},
+	{CTRY_BRUNEI_DARUSSALAM, APL6_WORLD, "BN"},
 	{CTRY_BULGARIA, ETSI6_WORLD, "BG"},
 	{CTRY_CAMBODIA, ETSI1_WORLD, "KH"},
 	{CTRY_CANADA, FCC3_FCCA, "CA"},
-- 
cgit v1.2.3-59-g8ed1b


From fed60f81a2d0839b2611929ab60b439e7d3d9714 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:12:05 +0300
Subject: ath: Map Bulgaria to ETSI1_WORLD

The regdomain code is used to select the correct the correct conformance
test limits (CTL) for a country. If the regdomain code isn't correctly
mapped to the actual CTL entries in EEPROM then it could happen that the
device violates the regulations. But it can also happen that the device is
then not able to be used with its full txpower on all rates.

This change itself doesn't change the selected CTL of this country and is
only required to stay in sync with the QCA mappings.

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index b8f321664078..3039b9b50a8c 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -321,7 +321,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_BOSNIA_HERZ, ETSI1_WORLD, "BA"},
 	{CTRY_BRAZIL, FCC3_WORLD, "BR"},
 	{CTRY_BRUNEI_DARUSSALAM, APL6_WORLD, "BN"},
-	{CTRY_BULGARIA, ETSI6_WORLD, "BG"},
+	{CTRY_BULGARIA, ETSI1_WORLD, "BG"},
 	{CTRY_CAMBODIA, ETSI1_WORLD, "KH"},
 	{CTRY_CANADA, FCC3_FCCA, "CA"},
 	{CTRY_CANADA2, FCC6_FCCA, "CA"},
-- 
cgit v1.2.3-59-g8ed1b


From 515e968624b2d9fd413dc7c90c24fb3c768a12a7 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:12:11 +0300
Subject: ath: Map Colombia to FCC1_FCCA

The regdomain code is used to select the correct the correct conformance
test limits (CTL) for a country. If the regdomain code isn't correctly
mapped to the actual CTL entries in EEPROM then it could happen that the
device violates the regulations. But it can also happen that the device is
then not able to be used with its full txpower on all rates.

This change itself doesn't change the selected CTL of this country and is
only required to stay in sync with the QCA mappings.

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index 3039b9b50a8c..17f7e7768395 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -327,7 +327,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_CANADA2, FCC6_FCCA, "CA"},
 	{CTRY_CHILE, APL6_WORLD, "CL"},
 	{CTRY_CHINA, APL1_WORLD, "CN"},
-	{CTRY_COLOMBIA, FCC1_FCCA, "CO"},
+	{CTRY_COLOMBIA, FCC3_WORLD, "CO"},
 	{CTRY_COSTA_RICA, FCC1_WORLD, "CR"},
 	{CTRY_CROATIA, ETSI1_WORLD, "HR"},
 	{CTRY_CYPRUS, ETSI1_WORLD, "CY"},
-- 
cgit v1.2.3-59-g8ed1b


From 0fa31a86998b558ce7ef21ef84bde175d9aba11d Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:12:15 +0300
Subject: ath: Map Czech to ETSI1_WORLD

The regdomain code is used to select the correct the correct conformance
test limits (CTL) for a country. If the regdomain code isn't correctly
mapped to the actual CTL entries in EEPROM then it could happen that the
device violates the regulations. But it can also happen that the device is
then not able to be used with its full txpower on all rates.

This change itself doesn't change the selected CTL of this country and is
only required to stay in sync with the QCA mappings.

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index 17f7e7768395..da0f958e927e 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -331,7 +331,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_COSTA_RICA, FCC1_WORLD, "CR"},
 	{CTRY_CROATIA, ETSI1_WORLD, "HR"},
 	{CTRY_CYPRUS, ETSI1_WORLD, "CY"},
-	{CTRY_CZECH, ETSI3_WORLD, "CZ"},
+	{CTRY_CZECH, ETSI1_WORLD, "CZ"},
 	{CTRY_DENMARK, ETSI1_WORLD, "DK"},
 	{CTRY_DOMINICAN_REPUBLIC, FCC1_FCCA, "DO"},
 	{CTRY_ECUADOR, FCC1_WORLD, "EC"},
-- 
cgit v1.2.3-59-g8ed1b


From 143a9b9bd7ee9d63f216a164c04fe6e4ee129ed5 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:12:17 +0300
Subject: ath: Map Honduras to FCC3_WORLD

The regdomain code is used to select the correct the correct conformance
test limits (CTL) for a country. If the regdomain code isn't correctly
mapped to the actual CTL entries in EEPROM then it could happen that the
device violates the regulations. But it can also happen that the device is
then not able to be used with its full txpower on all rates.

The CTL mappings for this regdomain code were now changed to:

* 2.4GHz: ETSI
* 5GHz: NO_CTL -> FCC

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index da0f958e927e..342f2625ad8a 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -348,7 +348,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_GUAM, FCC1_FCCA, "GU"},
 	{CTRY_GUATEMALA, FCC1_FCCA, "GT"},
 	{CTRY_HAITI, ETSI1_WORLD, "HT"},
-	{CTRY_HONDURAS, NULL1_WORLD, "HN"},
+	{CTRY_HONDURAS, FCC3_WORLD, "HN"},
 	{CTRY_HONG_KONG, FCC3_WORLD, "HK"},
 	{CTRY_HUNGARY, ETSI1_WORLD, "HU"},
 	{CTRY_ICELAND, ETSI1_WORLD, "IS"},
-- 
cgit v1.2.3-59-g8ed1b


From 62ba2b22d12026a98557450753441aa9e2a9959d Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:12:32 +0300
Subject: ath: Map Isreal to ETSI3_WORLD

The regdomain code is used to select the correct the correct conformance
test limits (CTL) for a country. If the regdomain code isn't correctly
mapped to the actual CTL entries in EEPROM then it could happen that the
device violates the regulations. But it can also happen that the device is
then not able to be used with its full txpower on all rates.

The CTL mappings for this regdomain code were now changed to:

* 2.4GHz: ETSI
* 5GHz: NO_CTL -> ETSI

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index 342f2625ad8a..44c7d5de900a 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -356,7 +356,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_INDONESIA, NULL1_WORLD, "ID"},
 	{CTRY_IRAN, APL1_WORLD, "IR"},
 	{CTRY_IRELAND, ETSI1_WORLD, "IE"},
-	{CTRY_ISRAEL, NULL1_WORLD, "IL"},
+	{CTRY_ISRAEL, ETSI3_WORLD, "IL"},
 	{CTRY_ITALY, ETSI1_WORLD, "IT"},
 	{CTRY_JAMAICA, FCC3_WORLD, "JM"},
 
-- 
cgit v1.2.3-59-g8ed1b


From 64874ed2a73a4d743796ef103392606399a043cd Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:12:50 +0300
Subject: ath: Map Macedonia to ETSI1_WORLD

The regdomain code is used to select the correct the correct conformance
test limits (CTL) for a country. If the regdomain code isn't correctly
mapped to the actual CTL entries in EEPROM then it could happen that the
device violates the regulations. But it can also happen that the device is
then not able to be used with its full txpower on all rates.

The CTL mappings for this regdomain code were now changed to:

* 2.4GHz: ETSI
* 5GHz: NO_CTL -> ETSI

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index 44c7d5de900a..dfe0754522a6 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -433,7 +433,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_LITHUANIA, ETSI1_WORLD, "LT"},
 	{CTRY_LUXEMBOURG, ETSI1_WORLD, "LU"},
 	{CTRY_MACAU, FCC2_WORLD, "MO"},
-	{CTRY_MACEDONIA, NULL1_WORLD, "MK"},
+	{CTRY_MACEDONIA, ETSI1_WORLD, "MK"},
 	{CTRY_MALAYSIA, APL8_WORLD, "MY"},
 	{CTRY_MALTA, ETSI1_WORLD, "MT"},
 	{CTRY_MAURITIUS, ETSI1_WORLD, "MU"},
-- 
cgit v1.2.3-59-g8ed1b


From 823a64065adf27d94d27369bad5d2587821b0d98 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:12:56 +0300
Subject: ath: Map Malasia to FCC1_WORLD

The regdomain code is used to select the correct the correct conformance
test limits (CTL) for a country. If the regdomain code isn't correctly
mapped to the actual CTL entries in EEPROM then it could happen that the
device violates the regulations. But it can also happen that the device is
then not able to be used with its full txpower on all rates.

The CTL mappings for this regdomain code were now changed to:

* 2.4GHz: ETSI
* 5GHz: ETSI -> FCC

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index dfe0754522a6..6d24c85dbdd7 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -434,7 +434,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_LUXEMBOURG, ETSI1_WORLD, "LU"},
 	{CTRY_MACAU, FCC2_WORLD, "MO"},
 	{CTRY_MACEDONIA, ETSI1_WORLD, "MK"},
-	{CTRY_MALAYSIA, APL8_WORLD, "MY"},
+	{CTRY_MALAYSIA, FCC1_WORLD, "MY"},
 	{CTRY_MALTA, ETSI1_WORLD, "MT"},
 	{CTRY_MAURITIUS, ETSI1_WORLD, "MU"},
 	{CTRY_MEXICO, FCC1_FCCA, "MX"},
-- 
cgit v1.2.3-59-g8ed1b


From 9bfc2bb32e01c3a759dbef007cee938390157270 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:13:05 +0300
Subject: ath: Map New Zealand to FCC3_ETSIC

The regdomain code is used to select the correct the correct conformance
test limits (CTL) for a country. If the regdomain code isn't correctly
mapped to the actual CTL entries in EEPROM then it could happen that the
device violates the regulations. But it can also happen that the device is
then not able to be used with its full txpower on all rates.

This change itself doesn't change the selected CTL of this country and is
only required to stay in sync with the QCA mappings.

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index 6d24c85dbdd7..035b023422e2 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -444,7 +444,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_NEPAL, APL1_WORLD, "NP"},
 	{CTRY_NETHERLANDS, ETSI1_WORLD, "NL"},
 	{CTRY_NETHERLANDS_ANTILLES, ETSI1_WORLD, "AN"},
-	{CTRY_NEW_ZEALAND, FCC2_ETSIC, "NZ"},
+	{CTRY_NEW_ZEALAND, FCC3_ETSIC, "NZ"},
 	{CTRY_NICARAGUA, FCC3_FCCA, "NI"},
 	{CTRY_NORWAY, ETSI1_WORLD, "NO"},
 	{CTRY_OMAN, FCC3_WORLD, "OM"},
-- 
cgit v1.2.3-59-g8ed1b


From b5c11e4744062285acf6835c8bd123edb244a2f1 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:13:08 +0300
Subject: ath: Map Peru to APL1_WORLD

The regdomain code is used to select the correct the correct conformance
test limits (CTL) for a country. If the regdomain code isn't correctly
mapped to the actual CTL entries in EEPROM then it could happen that the
device violates the regulations. But it can also happen that the device is
then not able to be used with its full txpower on all rates.

This change itself doesn't change the selected CTL of this country and is
only required to stay in sync with the QCA mappings.

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index 035b023422e2..9ab23631d043 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -452,7 +452,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_PANAMA, FCC1_FCCA, "PA"},
 	{CTRY_PAPUA_NEW_GUINEA, FCC1_WORLD, "PG"},
 	{CTRY_PARAGUAY, FCC3_WORLD, "PY"},
-	{CTRY_PERU, APL1_WORLD, "PE"},
+	{CTRY_PERU, FCC3_WORLD, "PE"},
 	{CTRY_PHILIPPINES, APL1_WORLD, "PH"},
 	{CTRY_POLAND, ETSI1_WORLD, "PL"},
 	{CTRY_PORTUGAL, ETSI1_WORLD, "PT"},
-- 
cgit v1.2.3-59-g8ed1b


From c454d4c258444522b1c07f59e989795d3c72cbac Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:13:15 +0300
Subject: ath: Map Philippines to FCC3_WORLD

The regdomain code is used to select the correct the correct conformance
test limits (CTL) for a country. If the regdomain code isn't correctly
mapped to the actual CTL entries in EEPROM then it could happen that the
device violates the regulations. But it can also happen that the device is
then not able to be used with its full txpower on all rates.

This change itself doesn't change the selected CTL of this country and is
only required to stay in sync with the QCA mappings.

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index 9ab23631d043..a10a1b8cb55b 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -453,7 +453,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_PAPUA_NEW_GUINEA, FCC1_WORLD, "PG"},
 	{CTRY_PARAGUAY, FCC3_WORLD, "PY"},
 	{CTRY_PERU, FCC3_WORLD, "PE"},
-	{CTRY_PHILIPPINES, APL1_WORLD, "PH"},
+	{CTRY_PHILIPPINES, FCC3_WORLD, "PH"},
 	{CTRY_POLAND, ETSI1_WORLD, "PL"},
 	{CTRY_PORTUGAL, ETSI1_WORLD, "PT"},
 	{CTRY_PUERTO_RICO, FCC1_FCCA, "PR"},
-- 
cgit v1.2.3-59-g8ed1b


From 12f415566fb4487bd5ce6273ed15d1ab6340339c Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:13:22 +0300
Subject: ath: Map Romania to ETSI1_WORLD

The regdomain code is used to select the correct the correct conformance
test limits (CTL) for a country. If the regdomain code isn't correctly
mapped to the actual CTL entries in EEPROM then it could happen that the
device violates the regulations. But it can also happen that the device is
then not able to be used with its full txpower on all rates.

The CTL mappings for this regdomain code were now changed to:

* 2.4GHz: ETSI
* 5GHz: NO_CTL -> ETSI

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index a10a1b8cb55b..3afce206e049 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -458,7 +458,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_PORTUGAL, ETSI1_WORLD, "PT"},
 	{CTRY_PUERTO_RICO, FCC1_FCCA, "PR"},
 	{CTRY_QATAR, APL1_WORLD, "QA"},
-	{CTRY_ROMANIA, NULL1_WORLD, "RO"},
+	{CTRY_ROMANIA, ETSI1_WORLD, "RO"},
 	{CTRY_RUSSIA, NULL1_WORLD, "RU"},
 	{CTRY_SAUDI_ARABIA, NULL1_WORLD, "SA"},
 	{CTRY_SERBIA, ETSI1_WORLD, "RS"},
-- 
cgit v1.2.3-59-g8ed1b


From b7e015132d85f511cfa242a6aa04644fea55fdbb Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:13:30 +0300
Subject: ath: Map Russia to ETSI8_WORLD

The regdomain code is used to select the correct the correct conformance
test limits (CTL) for a country. If the regdomain code isn't correctly
mapped to the actual CTL entries in EEPROM then it could happen that the
device violates the regulations. But it can also happen that the device is
then not able to be used with its full txpower on all rates.

The CTL mappings for this regdomain code were now changed to:

* 2.4GHz: ETSI
* 5GHz: NO_CTL -> ETSI

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index 3afce206e049..4a70c52f2940 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -459,7 +459,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_PUERTO_RICO, FCC1_FCCA, "PR"},
 	{CTRY_QATAR, APL1_WORLD, "QA"},
 	{CTRY_ROMANIA, ETSI1_WORLD, "RO"},
-	{CTRY_RUSSIA, NULL1_WORLD, "RU"},
+	{CTRY_RUSSIA, ETSI8_WORLD, "RU"},
 	{CTRY_SAUDI_ARABIA, NULL1_WORLD, "SA"},
 	{CTRY_SERBIA, ETSI1_WORLD, "RS"},
 	{CTRY_SERBIA_MONTENEGRO, ETSI1_WORLD, "CS"},
-- 
cgit v1.2.3-59-g8ed1b


From 054e077880027a469e44444dc9b18d4f2561dbf7 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:13:43 +0300
Subject: ath: Map Singapore to FCC3_WORLD

The regdomain code is used to select the correct the correct conformance
test limits (CTL) for a country. If the regdomain code isn't correctly
mapped to the actual CTL entries in EEPROM then it could happen that the
device violates the regulations. But it can also happen that the device is
then not able to be used with its full txpower on all rates.

The CTL mappings for this regdomain code were now changed to:

* 2.4GHz: ETSI
* 5GHz: ETSI -> FCC

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index 4a70c52f2940..470fe45be3cd 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -463,7 +463,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_SAUDI_ARABIA, NULL1_WORLD, "SA"},
 	{CTRY_SERBIA, ETSI1_WORLD, "RS"},
 	{CTRY_SERBIA_MONTENEGRO, ETSI1_WORLD, "CS"},
-	{CTRY_SINGAPORE, APL6_WORLD, "SG"},
+	{CTRY_SINGAPORE, FCC3_WORLD, "SG"},
 	{CTRY_SLOVAKIA, ETSI1_WORLD, "SK"},
 	{CTRY_SLOVENIA, ETSI1_WORLD, "SI"},
 	{CTRY_SOUTH_AFRICA, FCC3_WORLD, "ZA"},
-- 
cgit v1.2.3-59-g8ed1b


From 4e78075c48212aa5e17cf0d35ccc2cd077e6f09a Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:13:52 +0300
Subject: ath: Map Ukraine to ETSI9_WORLD

The regdomain code is used to select the correct the correct conformance
test limits (CTL) for a country. If the regdomain code isn't correctly
mapped to the actual CTL entries in EEPROM then it could happen that the
device violates the regulations. But it can also happen that the device is
then not able to be used with its full txpower on all rates.

The CTL mappings for this regdomain code were now changed to:

* 2.4GHz: ETSI
* 5GHz: NO_CTL -> ETSI

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index 470fe45be3cd..44826f862cf7 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -479,7 +479,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_TUNISIA, ETSI3_WORLD, "TN"},
 	{CTRY_TURKEY, ETSI3_WORLD, "TR"},
 	{CTRY_UGANDA, FCC3_WORLD, "UG"},
-	{CTRY_UKRAINE, NULL1_WORLD, "UA"},
+	{CTRY_UKRAINE, ETSI9_WORLD, "UA"},
 	{CTRY_UAE, NULL1_WORLD, "AE"},
 	{CTRY_UNITED_KINGDOM, ETSI1_WORLD, "GB"},
 	{CTRY_UNITED_STATES, FCC3_FCCA, "US"},
-- 
cgit v1.2.3-59-g8ed1b


From f1abff21acc5c21bfba472c74d76f4430d06bdd3 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven.eckelmann@openmesh.com>
Date: Wed, 23 May 2018 11:14:11 +0300
Subject: ath: Map Zimbabwe to ETSI1_WORLD

The regdomain code is used to select the correct the correct conformance
test limits (CTL) for a country. If the regdomain code isn't correctly
mapped to the actual CTL entries in EEPROM then it could happen that the
device violates the regulations. But it can also happen that the device is
then not able to be used with its full txpower on all rates.

The CTL mappings for this regdomain code were now changed to:

* 2.4GHz: ETSI
* 5GHz: NO_CTL -> ETSI

Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/regd_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/regd_common.h b/drivers/net/wireless/ath/regd_common.h
index 44826f862cf7..4021e37a225a 100644
--- a/drivers/net/wireless/ath/regd_common.h
+++ b/drivers/net/wireless/ath/regd_common.h
@@ -492,7 +492,7 @@ static struct country_code_to_enum_rd allCountries[] = {
 	{CTRY_VENEZUELA, APL2_ETSIC, "VE"},
 	{CTRY_VIET_NAM, NULL1_WORLD, "VN"},
 	{CTRY_YEMEN, NULL1_WORLD, "YE"},
-	{CTRY_ZIMBABWE, NULL1_WORLD, "ZW"},
+	{CTRY_ZIMBABWE, ETSI1_WORLD, "ZW"},
 };
 
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From cf3c0ae6a32b5c3bb19b07d48dff646fc3896e51 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Wed, 23 May 2018 11:14:17 +0300
Subject: ath10k: remove useless test before clk_disable_unprepare

clk_disable_unprepare() already checks that the clock pointer is valid.
No need to test it before calling it.

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/ahb.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/ahb.c b/drivers/net/wireless/ath/ath10k/ahb.c
index 35d10490f6c3..fa39ffffd34d 100644
--- a/drivers/net/wireless/ath/ath10k/ahb.c
+++ b/drivers/net/wireless/ath/ath10k/ahb.c
@@ -180,14 +180,11 @@ static void ath10k_ahb_clock_disable(struct ath10k *ar)
 {
 	struct ath10k_ahb *ar_ahb = ath10k_ahb_priv(ar);
 
-	if (!IS_ERR_OR_NULL(ar_ahb->cmd_clk))
-		clk_disable_unprepare(ar_ahb->cmd_clk);
+	clk_disable_unprepare(ar_ahb->cmd_clk);
 
-	if (!IS_ERR_OR_NULL(ar_ahb->ref_clk))
-		clk_disable_unprepare(ar_ahb->ref_clk);
+	clk_disable_unprepare(ar_ahb->ref_clk);
 
-	if (!IS_ERR_OR_NULL(ar_ahb->rtc_clk))
-		clk_disable_unprepare(ar_ahb->rtc_clk);
+	clk_disable_unprepare(ar_ahb->rtc_clk);
 }
 
 static int ath10k_ahb_rst_ctrl_init(struct ath10k *ar)
-- 
cgit v1.2.3-59-g8ed1b


From 9a81cc23dfef24e254d3ff15c52bf46f0736f4c2 Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Wed, 23 May 2018 11:14:22 +0300
Subject: wcn36xx: fix buffer commit logic on TX path

When wcn36xx_dxe_tx_frame() is entered while the device is still processing
the queue asyncronously, we are racing against the firmware code with
updates to the buffer descriptors. Presumably, the firmware scans the ring
buffer that holds the descriptors and scans for a valid control descriptor,
and then assumes that the next descriptor contains the payload. If, however,
the control descriptor is marked valid, but the payload descriptor isn't,
the packet is not sent out.

Another issue with the current code is that is lacks memory barriers before
descriptors are marked valid. This is important because the CPU may reorder
writes to memory, even if it is allocated as coherent DMA area, and hence
the device may see incompletely written data.

To fix this, the code in wcn36xx_dxe_tx_frame() was restructured a bit so
that the payload descriptor is made valid before the control descriptor.
Memory barriers are added to ensure coherency of shared memory areas.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wcn36xx/dxe.c | 75 +++++++++++++++++-----------------
 1 file changed, 38 insertions(+), 37 deletions(-)

diff --git a/drivers/net/wireless/ath/wcn36xx/dxe.c b/drivers/net/wireless/ath/wcn36xx/dxe.c
index bd2b946a65c9..3d1cf7dd1f8e 100644
--- a/drivers/net/wireless/ath/wcn36xx/dxe.c
+++ b/drivers/net/wireless/ath/wcn36xx/dxe.c
@@ -642,8 +642,8 @@ int wcn36xx_dxe_tx_frame(struct wcn36xx *wcn,
 			 struct sk_buff *skb,
 			 bool is_low)
 {
-	struct wcn36xx_dxe_ctl *ctl = NULL;
-	struct wcn36xx_dxe_desc *desc = NULL;
+	struct wcn36xx_dxe_desc *desc_bd, *desc_skb;
+	struct wcn36xx_dxe_ctl *ctl_bd, *ctl_skb;
 	struct wcn36xx_dxe_ch *ch = NULL;
 	unsigned long flags;
 	int ret;
@@ -651,74 +651,75 @@ int wcn36xx_dxe_tx_frame(struct wcn36xx *wcn,
 	ch = is_low ? &wcn->dxe_tx_l_ch : &wcn->dxe_tx_h_ch;
 
 	spin_lock_irqsave(&ch->lock, flags);
-	ctl = ch->head_blk_ctl;
+	ctl_bd = ch->head_blk_ctl;
+	ctl_skb = ctl_bd->next;
 
 	/*
 	 * If skb is not null that means that we reached the tail of the ring
 	 * hence ring is full. Stop queues to let mac80211 back off until ring
 	 * has an empty slot again.
 	 */
-	if (NULL != ctl->next->skb) {
+	if (NULL != ctl_skb->skb) {
 		ieee80211_stop_queues(wcn->hw);
 		wcn->queues_stopped = true;
 		spin_unlock_irqrestore(&ch->lock, flags);
 		return -EBUSY;
 	}
 
-	ctl->skb = NULL;
-	desc = ctl->desc;
+	if (unlikely(ctl_skb->bd_cpu_addr)) {
+		wcn36xx_err("bd_cpu_addr cannot be NULL for skb DXE\n");
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	desc_bd = ctl_bd->desc;
+	desc_skb = ctl_skb->desc;
+
+	ctl_bd->skb = NULL;
 
 	/* write buffer descriptor */
-	memcpy(ctl->bd_cpu_addr, bd, sizeof(*bd));
+	memcpy(ctl_bd->bd_cpu_addr, bd, sizeof(*bd));
 
 	/* Set source address of the BD we send */
-	desc->src_addr_l = ctl->bd_phy_addr;
-
-	desc->dst_addr_l = ch->dxe_wq;
-	desc->fr_len = sizeof(struct wcn36xx_tx_bd);
-	desc->ctrl = ch->ctrl_bd;
+	desc_bd->src_addr_l = ctl_bd->bd_phy_addr;
+	desc_bd->dst_addr_l = ch->dxe_wq;
+	desc_bd->fr_len = sizeof(struct wcn36xx_tx_bd);
 
 	wcn36xx_dbg(WCN36XX_DBG_DXE, "DXE TX\n");
 
 	wcn36xx_dbg_dump(WCN36XX_DBG_DXE_DUMP, "DESC1 >>> ",
-			 (char *)desc, sizeof(*desc));
+			 (char *)desc_bd, sizeof(*desc_bd));
 	wcn36xx_dbg_dump(WCN36XX_DBG_DXE_DUMP,
-			 "BD   >>> ", (char *)ctl->bd_cpu_addr,
+			 "BD   >>> ", (char *)ctl_bd->bd_cpu_addr,
 			 sizeof(struct wcn36xx_tx_bd));
 
-	/* Set source address of the SKB we send */
-	ctl = ctl->next;
-	desc = ctl->desc;
-	if (ctl->bd_cpu_addr) {
-		wcn36xx_err("bd_cpu_addr cannot be NULL for skb DXE\n");
-		ret = -EINVAL;
-		goto unlock;
-	}
-
-	desc->src_addr_l = dma_map_single(wcn->dev,
-					  skb->data,
-					  skb->len,
-					  DMA_TO_DEVICE);
-	if (dma_mapping_error(wcn->dev, desc->src_addr_l)) {
+	desc_skb->src_addr_l = dma_map_single(wcn->dev,
+					      skb->data,
+					      skb->len,
+					      DMA_TO_DEVICE);
+	if (dma_mapping_error(wcn->dev, desc_skb->src_addr_l)) {
 		dev_err(wcn->dev, "unable to DMA map src_addr_l\n");
 		ret = -ENOMEM;
 		goto unlock;
 	}
 
-	ctl->skb = skb;
-	desc->dst_addr_l = ch->dxe_wq;
-	desc->fr_len = ctl->skb->len;
-
-	/* set dxe descriptor to VALID */
-	desc->ctrl = ch->ctrl_skb;
+	ctl_skb->skb = skb;
+	desc_skb->dst_addr_l = ch->dxe_wq;
+	desc_skb->fr_len = ctl_skb->skb->len;
 
 	wcn36xx_dbg_dump(WCN36XX_DBG_DXE_DUMP, "DESC2 >>> ",
-			 (char *)desc, sizeof(*desc));
+			 (char *)desc_skb, sizeof(*desc_skb));
 	wcn36xx_dbg_dump(WCN36XX_DBG_DXE_DUMP, "SKB   >>> ",
-			 (char *)ctl->skb->data, ctl->skb->len);
+			 (char *)ctl_skb->skb->data, ctl_skb->skb->len);
 
 	/* Move the head of the ring to the next empty descriptor */
-	 ch->head_blk_ctl = ctl->next;
+	 ch->head_blk_ctl = ctl_skb->next;
+
+	/* Commit all previous writes and set descriptors to VALID */
+	wmb();
+	desc_skb->ctrl = ch->ctrl_skb;
+	wmb();
+	desc_bd->ctrl = ch->ctrl_bd;
 
 	/*
 	 * When connected and trying to send data frame chip can be in sleep
-- 
cgit v1.2.3-59-g8ed1b


From 57e06e0e2a862bd40df47c7ad752fcad1d04c259 Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Wed, 23 May 2018 11:14:26 +0300
Subject: wcn36xx: set DMA mask explicitly

The device takes 32-bit addresses only, so inform the DMA API about it.
This is the default on msm8016, so that doesn't change anything, but
it's best practice to be explicit.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wcn36xx/main.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/wireless/ath/wcn36xx/main.c b/drivers/net/wireless/ath/wcn36xx/main.c
index e3b91b3b38ef..e6330ad372b3 100644
--- a/drivers/net/wireless/ath/wcn36xx/main.c
+++ b/drivers/net/wireless/ath/wcn36xx/main.c
@@ -1309,6 +1309,12 @@ static int wcn36xx_probe(struct platform_device *pdev)
 	mutex_init(&wcn->hal_mutex);
 	mutex_init(&wcn->scan_lock);
 
+	ret = dma_set_mask_and_coherent(wcn->dev, DMA_BIT_MASK(32));
+	if (ret < 0) {
+		wcn36xx_err("failed to set DMA mask: %d\n", ret);
+		goto out_wq;
+	}
+
 	INIT_WORK(&wcn->scan_work, wcn36xx_hw_scan_worker);
 
 	wcn->smd_channel = qcom_wcnss_open_channel(wcnss, "WLAN_CTRL", wcn36xx_smd_rsp_process, hw);
-- 
cgit v1.2.3-59-g8ed1b


From ba437e72378c91a2fd6766014cc02fa98b61f2dc Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Wed, 23 May 2018 11:14:30 +0300
Subject: wcn36xx: don't disable RX IRQ from handler

There's no need to disable the IRQ from inside its handler.
Instead just grab the spinlock of the channel that is being processed.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wcn36xx/dxe.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wireless/ath/wcn36xx/dxe.c b/drivers/net/wireless/ath/wcn36xx/dxe.c
index 3d1cf7dd1f8e..d6dd47b211ba 100644
--- a/drivers/net/wireless/ath/wcn36xx/dxe.c
+++ b/drivers/net/wireless/ath/wcn36xx/dxe.c
@@ -476,9 +476,8 @@ static irqreturn_t wcn36xx_irq_rx_ready(int irq, void *dev)
 {
 	struct wcn36xx *wcn = (struct wcn36xx *)dev;
 
-	disable_irq_nosync(wcn->rx_irq);
 	wcn36xx_dxe_rx_frame(wcn);
-	enable_irq(wcn->rx_irq);
+
 	return IRQ_HANDLED;
 }
 
@@ -514,8 +513,8 @@ out_err:
 static int wcn36xx_rx_handle_packets(struct wcn36xx *wcn,
 				     struct wcn36xx_dxe_ch *ch)
 {
-	struct wcn36xx_dxe_ctl *ctl = ch->head_blk_ctl;
-	struct wcn36xx_dxe_desc *dxe = ctl->desc;
+	struct wcn36xx_dxe_desc *dxe;
+	struct wcn36xx_dxe_ctl *ctl;
 	dma_addr_t  dma_addr;
 	struct sk_buff *skb;
 	int ret = 0, int_mask;
@@ -529,6 +528,11 @@ static int wcn36xx_rx_handle_packets(struct wcn36xx *wcn,
 		int_mask = WCN36XX_DXE_INT_CH3_MASK;
 	}
 
+	spin_lock(&ch->lock);
+
+	ctl = ch->head_blk_ctl;
+	dxe = ctl->desc;
+
 	while (!(READ_ONCE(dxe->ctrl) & WCN36xx_DXE_CTRL_VLD)) {
 		skb = ctl->skb;
 		dma_addr = dxe->dst_addr_l;
@@ -549,6 +553,9 @@ static int wcn36xx_rx_handle_packets(struct wcn36xx *wcn,
 	wcn36xx_dxe_write_register(wcn, WCN36XX_DXE_ENCH_ADDR, int_mask);
 
 	ch->head_blk_ctl = ctl;
+
+	spin_unlock(&ch->lock);
+
 	return 0;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From edd23ab403cf092a20ea185770f197f502ac32f0 Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Wed, 23 May 2018 11:14:35 +0300
Subject: wcn36xx: clear all masks in RX interrupt

Like on the TX side, check for the interrupt reason when the RX interrupt
is latched and clear the ERR, DONE and ED masks.

This seems to help with connection timeouts and network stream
starvatations. And FWIW, the downstream driver does the same thing.

Note that in analogy to the TX side, WCN36XX_DXE_0_INT_CLR should be set to
WCN36XX_INT_MASK_CHAN_RX_{L,H} rather than WCN36XX_DXE_INT_CH{1,3}_MASK. It
did the right thing however, as the defines happen to have identical values.

Also, instead of determining register addresses and values inside
wcn36xx_rx_handle_packets(), pass them as arguments.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wcn36xx/dxe.c | 62 ++++++++++++++++++++++------------
 1 file changed, 40 insertions(+), 22 deletions(-)

diff --git a/drivers/net/wireless/ath/wcn36xx/dxe.c b/drivers/net/wireless/ath/wcn36xx/dxe.c
index d6dd47b211ba..d11c9c536627 100644
--- a/drivers/net/wireless/ath/wcn36xx/dxe.c
+++ b/drivers/net/wireless/ath/wcn36xx/dxe.c
@@ -511,23 +511,40 @@ out_err:
 }
 
 static int wcn36xx_rx_handle_packets(struct wcn36xx *wcn,
-				     struct wcn36xx_dxe_ch *ch)
+				     struct wcn36xx_dxe_ch *ch,
+				     u32 ctrl,
+				     u32 en_mask,
+				     u32 int_mask,
+				     u32 status_reg)
 {
 	struct wcn36xx_dxe_desc *dxe;
 	struct wcn36xx_dxe_ctl *ctl;
 	dma_addr_t  dma_addr;
 	struct sk_buff *skb;
-	int ret = 0, int_mask;
-	u32 value;
+	u32 int_reason;
+	int ret;
 
-	if (ch->ch_type == WCN36XX_DXE_CH_RX_L) {
-		value = WCN36XX_DXE_CTRL_RX_L;
-		int_mask = WCN36XX_DXE_INT_CH1_MASK;
-	} else {
-		value = WCN36XX_DXE_CTRL_RX_H;
-		int_mask = WCN36XX_DXE_INT_CH3_MASK;
+	wcn36xx_dxe_read_register(wcn, status_reg, &int_reason);
+	wcn36xx_dxe_write_register(wcn, WCN36XX_DXE_0_INT_CLR, int_mask);
+
+	if (int_reason & WCN36XX_CH_STAT_INT_ERR_MASK) {
+		wcn36xx_dxe_write_register(wcn,
+					   WCN36XX_DXE_0_INT_ERR_CLR,
+					   int_mask);
+
+		wcn36xx_err("DXE IRQ reported error on RX channel\n");
 	}
 
+	if (int_reason & WCN36XX_CH_STAT_INT_DONE_MASK)
+		wcn36xx_dxe_write_register(wcn,
+					   WCN36XX_DXE_0_INT_DONE_CLR,
+					   int_mask);
+
+	if (int_reason & WCN36XX_CH_STAT_INT_ED_MASK)
+		wcn36xx_dxe_write_register(wcn,
+					   WCN36XX_DXE_0_INT_ED_CLR,
+					   int_mask);
+
 	spin_lock(&ch->lock);
 
 	ctl = ch->head_blk_ctl;
@@ -546,11 +563,11 @@ static int wcn36xx_rx_handle_packets(struct wcn36xx *wcn,
 			wcn36xx_rx_skb(wcn, skb);
 		} /* else keep old skb not submitted and use it for rx DMA */
 
-		dxe->ctrl = value;
+		dxe->ctrl = ctrl;
 		ctl = ctl->next;
 		dxe = ctl->desc;
 	}
-	wcn36xx_dxe_write_register(wcn, WCN36XX_DXE_ENCH_ADDR, int_mask);
+	wcn36xx_dxe_write_register(wcn, WCN36XX_DXE_ENCH_ADDR, en_mask);
 
 	ch->head_blk_ctl = ctl;
 
@@ -566,19 +583,20 @@ void wcn36xx_dxe_rx_frame(struct wcn36xx *wcn)
 	wcn36xx_dxe_read_register(wcn, WCN36XX_DXE_INT_SRC_RAW_REG, &int_src);
 
 	/* RX_LOW_PRI */
-	if (int_src & WCN36XX_DXE_INT_CH1_MASK) {
-		wcn36xx_dxe_write_register(wcn, WCN36XX_DXE_0_INT_CLR,
-					   WCN36XX_DXE_INT_CH1_MASK);
-		wcn36xx_rx_handle_packets(wcn, &(wcn->dxe_rx_l_ch));
-	}
+	if (int_src & WCN36XX_DXE_INT_CH1_MASK)
+		wcn36xx_rx_handle_packets(wcn, &wcn->dxe_rx_l_ch,
+					  WCN36XX_DXE_CTRL_RX_L,
+					  WCN36XX_DXE_INT_CH1_MASK,
+					  WCN36XX_INT_MASK_CHAN_RX_L,
+					  WCN36XX_DXE_CH_STATUS_REG_ADDR_RX_L);
 
 	/* RX_HIGH_PRI */
-	if (int_src & WCN36XX_DXE_INT_CH3_MASK) {
-		/* Clean up all the INT within this channel */
-		wcn36xx_dxe_write_register(wcn, WCN36XX_DXE_0_INT_CLR,
-					   WCN36XX_DXE_INT_CH3_MASK);
-		wcn36xx_rx_handle_packets(wcn, &(wcn->dxe_rx_h_ch));
-	}
+	if (int_src & WCN36XX_DXE_INT_CH3_MASK)
+		wcn36xx_rx_handle_packets(wcn, &wcn->dxe_rx_h_ch,
+					  WCN36XX_DXE_CTRL_RX_H,
+					  WCN36XX_DXE_INT_CH3_MASK,
+					  WCN36XX_INT_MASK_CHAN_RX_H,
+					  WCN36XX_DXE_CH_STATUS_REG_ADDR_RX_H);
 
 	if (!int_src)
 		wcn36xx_warn("No DXE interrupt pending\n");
-- 
cgit v1.2.3-59-g8ed1b


From ce1d4be82b1009374f7bea0229fb6758cb1afb84 Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Wed, 23 May 2018 11:14:40 +0300
Subject: wcn36xx: only handle packets when ED or DONE bit is set

On RX and TX interrupts, check for the WCN36XX_CH_STAT_INT_ED_MASK or
WCN36XX_CH_STAT_INT_DONE_MASK in the interrupt reason register, and
only handle packets when it is set. This way, reap_tx_dxes() is only
invoked when needed.

This brings the dequeing logic in line with what the prima downstream
driver is doing.

While at it, also log the interrupt reason.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wcn36xx/dxe.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wireless/ath/wcn36xx/dxe.c b/drivers/net/wireless/ath/wcn36xx/dxe.c
index d11c9c536627..8c64e05ca3b7 100644
--- a/drivers/net/wireless/ath/wcn36xx/dxe.c
+++ b/drivers/net/wireless/ath/wcn36xx/dxe.c
@@ -430,8 +430,12 @@ static irqreturn_t wcn36xx_irq_tx_complete(int irq, void *dev)
 						   WCN36XX_INT_MASK_CHAN_TX_H);
 		}
 
-		wcn36xx_dbg(WCN36XX_DBG_DXE, "dxe tx ready high\n");
-		reap_tx_dxes(wcn, &wcn->dxe_tx_h_ch);
+		wcn36xx_dbg(WCN36XX_DBG_DXE, "dxe tx ready high, reason %08x\n",
+			    int_reason);
+
+		if (int_reason & (WCN36XX_CH_STAT_INT_DONE_MASK |
+				  WCN36XX_CH_STAT_INT_ED_MASK))
+			reap_tx_dxes(wcn, &wcn->dxe_tx_h_ch);
 	}
 
 	if (int_src & WCN36XX_INT_MASK_CHAN_TX_L) {
@@ -465,8 +469,12 @@ static irqreturn_t wcn36xx_irq_tx_complete(int irq, void *dev)
 						   WCN36XX_INT_MASK_CHAN_TX_L);
 		}
 
-		wcn36xx_dbg(WCN36XX_DBG_DXE, "dxe tx ready low\n");
-		reap_tx_dxes(wcn, &wcn->dxe_tx_l_ch);
+		wcn36xx_dbg(WCN36XX_DBG_DXE, "dxe tx ready low, reason %08x\n",
+			    int_reason);
+
+		if (int_reason & (WCN36XX_CH_STAT_INT_DONE_MASK |
+				  WCN36XX_CH_STAT_INT_ED_MASK))
+			reap_tx_dxes(wcn, &wcn->dxe_tx_l_ch);
 	}
 
 	return IRQ_HANDLED;
@@ -545,6 +553,10 @@ static int wcn36xx_rx_handle_packets(struct wcn36xx *wcn,
 					   WCN36XX_DXE_0_INT_ED_CLR,
 					   int_mask);
 
+	if (!(int_reason & (WCN36XX_CH_STAT_INT_DONE_MASK |
+			    WCN36XX_CH_STAT_INT_ED_MASK)))
+		return 0;
+
 	spin_lock(&ch->lock);
 
 	ctl = ch->head_blk_ctl;
-- 
cgit v1.2.3-59-g8ed1b


From 18c7ed138824119f2ef9c9019e222e3e750b42b8 Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Wed, 23 May 2018 11:14:44 +0300
Subject: wcn36xx: consider CTRL_EOP bit when looking for valid descriptors

In reap_tx_dxes(), when we iterate over the linked descriptors, only
consider such valid that have WCN36xx_DXE_CTRL_EOP set.

This is what the prima downstream driver is doing as well.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wcn36xx/dxe.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/wcn36xx/dxe.c b/drivers/net/wireless/ath/wcn36xx/dxe.c
index 8c64e05ca3b7..06cfe8d311f3 100644
--- a/drivers/net/wireless/ath/wcn36xx/dxe.c
+++ b/drivers/net/wireless/ath/wcn36xx/dxe.c
@@ -370,7 +370,9 @@ static void reap_tx_dxes(struct wcn36xx *wcn, struct wcn36xx_dxe_ch *ch)
 	do {
 		if (READ_ONCE(ctl->desc->ctrl) & WCN36xx_DXE_CTRL_VLD)
 			break;
-		if (ctl->skb) {
+
+		if (ctl->skb &&
+		    READ_ONCE(ctl->desc->ctrl) & WCN36xx_DXE_CTRL_EOP) {
 			dma_unmap_single(wcn->dev, ctl->desc->src_addr_l,
 					 ctl->skb->len, DMA_TO_DEVICE);
 			info = IEEE80211_SKB_CB(ctl->skb);
-- 
cgit v1.2.3-59-g8ed1b


From 2a46c829a9266bf2a2be1a3d25dbac5fcd4eb9c1 Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Wed, 23 May 2018 11:14:50 +0300
Subject: wcn36xx: set PREASSOC and IDLE stated when BSS info changes

When a BSSID is joined, set the link status to 'preassoc', and set it to
'idle' when the BSS is deleted.

This is what the downstream driver is doing, and it seems to improve the
reliability during connect/disconnect stress tests.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wcn36xx/main.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/wireless/ath/wcn36xx/main.c b/drivers/net/wireless/ath/wcn36xx/main.c
index e6330ad372b3..662e50540b07 100644
--- a/drivers/net/wireless/ath/wcn36xx/main.c
+++ b/drivers/net/wireless/ath/wcn36xx/main.c
@@ -798,6 +798,8 @@ static void wcn36xx_bss_info_changed(struct ieee80211_hw *hw,
 		if (!is_zero_ether_addr(bss_conf->bssid)) {
 			vif_priv->is_joining = true;
 			vif_priv->bss_index = WCN36XX_HAL_BSS_INVALID_IDX;
+			wcn36xx_smd_set_link_st(wcn, bss_conf->bssid, vif->addr,
+						WCN36XX_HAL_LINK_PREASSOC_STATE);
 			wcn36xx_smd_join(wcn, bss_conf->bssid,
 					 vif->addr, WCN36XX_HW_CHANNEL(wcn));
 			wcn36xx_smd_config_bss(wcn, vif, NULL,
@@ -805,6 +807,8 @@ static void wcn36xx_bss_info_changed(struct ieee80211_hw *hw,
 		} else {
 			vif_priv->is_joining = false;
 			wcn36xx_smd_delete_bss(wcn, vif);
+			wcn36xx_smd_set_link_st(wcn, bss_conf->bssid, vif->addr,
+						WCN36XX_HAL_LINK_IDLE_STATE);
 			vif_priv->encrypt_type = WCN36XX_HAL_ED_NONE;
 		}
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 773f9a28bcdafd6cb77fb1afc545c62746d53990 Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Wed, 23 May 2018 11:14:55 +0300
Subject: wcn36xx: drain pending indicator messages on shutdown

When the interface is shut down, wcn36xx_smd_close() potentially races
against the queue worker. Make sure to cancel the work, and then free all
the remnants in hal_ind_queue manually.

This is again just a theoretical issue, not something that was triggered in
the wild.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wcn36xx/smd.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/wireless/ath/wcn36xx/smd.c b/drivers/net/wireless/ath/wcn36xx/smd.c
index ea74f2b92df5..0a505b5e038b 100644
--- a/drivers/net/wireless/ath/wcn36xx/smd.c
+++ b/drivers/net/wireless/ath/wcn36xx/smd.c
@@ -2513,5 +2513,11 @@ out:
 
 void wcn36xx_smd_close(struct wcn36xx *wcn)
 {
+	struct wcn36xx_hal_ind_msg *msg, *tmp;
+
+	cancel_work_sync(&wcn->hal_ind_work);
 	destroy_workqueue(wcn->hal_ind_wq);
+
+	list_for_each_entry_safe(msg, tmp, &wcn->hal_ind_queue, list)
+		kfree(msg);
 }
-- 
cgit v1.2.3-59-g8ed1b


From a50c6c8412f7114cbb0514dbea7a5a9a3d317479 Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Wed, 23 May 2018 11:14:59 +0300
Subject: wcn36xx: simplify wcn36xx_smd_open()

Drop the extra warning about failed allocations, both the core and the
only caller of this function will warn loud enough in such cases.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wcn36xx/smd.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/drivers/net/wireless/ath/wcn36xx/smd.c b/drivers/net/wireless/ath/wcn36xx/smd.c
index 0a505b5e038b..43c8aa79fad4 100644
--- a/drivers/net/wireless/ath/wcn36xx/smd.c
+++ b/drivers/net/wireless/ath/wcn36xx/smd.c
@@ -2494,21 +2494,15 @@ static void wcn36xx_ind_smd_work(struct work_struct *work)
 }
 int wcn36xx_smd_open(struct wcn36xx *wcn)
 {
-	int ret = 0;
 	wcn->hal_ind_wq = create_freezable_workqueue("wcn36xx_smd_ind");
-	if (!wcn->hal_ind_wq) {
-		wcn36xx_err("failed to allocate wq\n");
-		ret = -ENOMEM;
-		goto out;
-	}
+	if (!wcn->hal_ind_wq)
+		return -ENOMEM;
+
 	INIT_WORK(&wcn->hal_ind_work, wcn36xx_ind_smd_work);
 	INIT_LIST_HEAD(&wcn->hal_ind_queue);
 	spin_lock_init(&wcn->hal_ind_lock);
 
 	return 0;
-
-out:
-	return ret;
 }
 
 void wcn36xx_smd_close(struct wcn36xx *wcn)
-- 
cgit v1.2.3-59-g8ed1b


From ffbc9197b4721634dc6c0fefa9b31e565fa89cee Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Wed, 23 May 2018 11:15:03 +0300
Subject: wcn36xx: improve debug and error messages for SMD

Add a missing newline in wcn36xx_smd_send_and_wait() and also log the
command request and response type that was processed.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wcn36xx/smd.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wireless/ath/wcn36xx/smd.c b/drivers/net/wireless/ath/wcn36xx/smd.c
index 43c8aa79fad4..b855a58d5aac 100644
--- a/drivers/net/wireless/ath/wcn36xx/smd.c
+++ b/drivers/net/wireless/ath/wcn36xx/smd.c
@@ -252,23 +252,29 @@ static int wcn36xx_smd_send_and_wait(struct wcn36xx *wcn, size_t len)
 {
 	int ret = 0;
 	unsigned long start;
+	struct wcn36xx_hal_msg_header *hdr =
+		(struct wcn36xx_hal_msg_header *)wcn->hal_buf;
+	u16 req_type = hdr->msg_type;
+
 	wcn36xx_dbg_dump(WCN36XX_DBG_SMD_DUMP, "HAL >>> ", wcn->hal_buf, len);
 
 	init_completion(&wcn->hal_rsp_compl);
 	start = jiffies;
 	ret = rpmsg_send(wcn->smd_channel, wcn->hal_buf, len);
 	if (ret) {
-		wcn36xx_err("HAL TX failed\n");
+		wcn36xx_err("HAL TX failed for req %d\n", req_type);
 		goto out;
 	}
 	if (wait_for_completion_timeout(&wcn->hal_rsp_compl,
 		msecs_to_jiffies(HAL_MSG_TIMEOUT)) <= 0) {
-		wcn36xx_err("Timeout! No SMD response in %dms\n",
-			    HAL_MSG_TIMEOUT);
+		wcn36xx_err("Timeout! No SMD response to req %d in %dms\n",
+			    req_type, HAL_MSG_TIMEOUT);
 		ret = -ETIME;
 		goto out;
 	}
-	wcn36xx_dbg(WCN36XX_DBG_SMD, "SMD command completed in %dms",
+	wcn36xx_dbg(WCN36XX_DBG_SMD,
+		    "SMD command (req %d, rsp %d) completed in %dms\n",
+		    req_type, hdr->msg_type,
 		    jiffies_to_msecs(jiffies - start));
 out:
 	return ret;
-- 
cgit v1.2.3-59-g8ed1b


From f40105e6747892e8edab94020567c158c9bec0df Mon Sep 17 00:00:00 2001
From: Sriram R <srirrama@codeaurora.org>
Date: Tue, 15 May 2018 14:39:48 +0530
Subject: ath: add support to get the detected radar specifications

This enables ath10k/ath9k drivers to collect the specifications of the
radar type once it is detected by the dfs pattern detector unit.
Usage of the collected info is specific to driver implementation.
For example, collected radar info could be used by the host driver
to send to co-processors for additional processing/validation.

Note: 'radar_detector_specs' data containing the specifications of
different radar types which was private within dfs_pattern_detector/
dfs_pri_detector is now shared with drivers as well for making use
of this information.

Signed-off-by: Sriram R <srirrama@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/wmi.c           | 2 +-
 drivers/net/wireless/ath/ath9k/dfs.c            | 2 +-
 drivers/net/wireless/ath/dfs_pattern_detector.c | 5 ++++-
 drivers/net/wireless/ath/dfs_pattern_detector.h | 3 ++-
 drivers/net/wireless/ath/dfs_pri_detector.h     | 3 ++-
 5 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c
index 46fb96ee5852..1bccac002e97 100644
--- a/drivers/net/wireless/ath/ath10k/wmi.c
+++ b/drivers/net/wireless/ath/ath10k/wmi.c
@@ -3757,7 +3757,7 @@ static void ath10k_dfs_radar_report(struct ath10k *ar,
 
 	ATH10K_DFS_STAT_INC(ar, pulses_detected);
 
-	if (!ar->dfs_detector->add_pulse(ar->dfs_detector, &pe)) {
+	if (!ar->dfs_detector->add_pulse(ar->dfs_detector, &pe, NULL)) {
 		ath10k_dbg(ar, ATH10K_DBG_REGULATORY,
 			   "dfs no pulse pattern detected, yet\n");
 		return;
diff --git a/drivers/net/wireless/ath/ath9k/dfs.c b/drivers/net/wireless/ath/ath9k/dfs.c
index c8844f55574c..acb9602aa464 100644
--- a/drivers/net/wireless/ath/ath9k/dfs.c
+++ b/drivers/net/wireless/ath/ath9k/dfs.c
@@ -277,7 +277,7 @@ ath9k_dfs_process_radar_pulse(struct ath_softc *sc, struct pulse_event *pe)
 	DFS_STAT_INC(sc, pulses_processed);
 	if (pd == NULL)
 		return;
-	if (!pd->add_pulse(pd, pe))
+	if (!pd->add_pulse(pd, pe, NULL))
 		return;
 	DFS_STAT_INC(sc, radar_detected);
 	ieee80211_radar_detected(sc->hw);
diff --git a/drivers/net/wireless/ath/dfs_pattern_detector.c b/drivers/net/wireless/ath/dfs_pattern_detector.c
index 448b83eea810..d52b31b45df7 100644
--- a/drivers/net/wireless/ath/dfs_pattern_detector.c
+++ b/drivers/net/wireless/ath/dfs_pattern_detector.c
@@ -268,7 +268,8 @@ static void dpd_exit(struct dfs_pattern_detector *dpd)
 }
 
 static bool
-dpd_add_pulse(struct dfs_pattern_detector *dpd, struct pulse_event *event)
+dpd_add_pulse(struct dfs_pattern_detector *dpd, struct pulse_event *event,
+	      struct radar_detector_specs *rs)
 {
 	u32 i;
 	struct channel_detector *cd;
@@ -294,6 +295,8 @@ dpd_add_pulse(struct dfs_pattern_detector *dpd, struct pulse_event *event)
 		struct pri_detector *pd = cd->detectors[i];
 		struct pri_sequence *ps = pd->add_pulse(pd, event);
 		if (ps != NULL) {
+			if (rs != NULL)
+				memcpy(rs, pd->rs, sizeof(*rs));
 			ath_dbg(dpd->common, DFS,
 				"DFS: radar found on freq=%d: id=%d, pri=%d, "
 				"count=%d, count_false=%d\n",
diff --git a/drivers/net/wireless/ath/dfs_pattern_detector.h b/drivers/net/wireless/ath/dfs_pattern_detector.h
index 92be3530e9b5..18db6f4f3568 100644
--- a/drivers/net/wireless/ath/dfs_pattern_detector.h
+++ b/drivers/net/wireless/ath/dfs_pattern_detector.h
@@ -97,7 +97,8 @@ struct dfs_pattern_detector {
 	bool (*set_dfs_domain)(struct dfs_pattern_detector *dpd,
 			   enum nl80211_dfs_regions region);
 	bool (*add_pulse)(struct dfs_pattern_detector *dpd,
-			  struct pulse_event *pe);
+			  struct pulse_event *pe,
+			  struct radar_detector_specs *rs);
 
 	struct ath_dfs_pool_stats (*get_stats)(struct dfs_pattern_detector *dpd);
 	enum nl80211_dfs_regions region;
diff --git a/drivers/net/wireless/ath/dfs_pri_detector.h b/drivers/net/wireless/ath/dfs_pri_detector.h
index 79f0fff4d1e6..86339f2b4d3a 100644
--- a/drivers/net/wireless/ath/dfs_pri_detector.h
+++ b/drivers/net/wireless/ath/dfs_pri_detector.h
@@ -62,8 +62,9 @@ struct pri_detector {
 	     (*add_pulse)(struct pri_detector *de, struct pulse_event *e);
 	void (*reset)    (struct pri_detector *de, u64 ts);
 
-/* private: internal use only */
 	const struct radar_detector_specs *rs;
+
+/* private: internal use only */
 	u64 last_ts;
 	struct list_head sequences;
 	struct list_head pulses;
-- 
cgit v1.2.3-59-g8ed1b


From 6f6eb1bcbeff48c875617b800f00f1c5d1b12290 Mon Sep 17 00:00:00 2001
From: Sriram R <srirrama@codeaurora.org>
Date: Tue, 15 May 2018 14:39:49 +0530
Subject: ath10k: DFS Host Confirmation

In the 10.4-3.6 firmware branch there's a new DFS Host confirmation
feature which is advertised using WMI_SERVICE_HOST_DFS_CHECK_SUPPORT flag.

This new features enables the ath10k host to send information to the
firmware on the specifications of detected radar type. This allows the
firmware to validate if the host's radar pattern detector unit is
operational and check if the radar information shared by host matches
the radar pulses sent as phy error events from firmware. If the check
fails the firmware won't allow use of DFS channels on AP mode when using
FCC regulatory region.

Hence this patch is mandatory when using a firmware from 10.4-3.6 branch.
Else, DFS channels on FCC regions cannot be used.

Supported Chipsets : QCA9984/QCA9888/QCA4019
Firmware Version : 10.4-3.6-00104

Signed-off-by: Sriram R <srirrama@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/core.h    |  21 ++++
 drivers/net/wireless/ath/ath10k/mac.c     |  12 ++
 drivers/net/wireless/ath/ath10k/wmi-ops.h |  32 +++++
 drivers/net/wireless/ath/ath10k/wmi.c     | 186 ++++++++++++++++++++++++++++--
 drivers/net/wireless/ath/ath10k/wmi.h     |  32 +++++
 5 files changed, 273 insertions(+), 10 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/core.h b/drivers/net/wireless/ath/ath10k/core.h
index eb7ff1116442..951dbdd1c9eb 100644
--- a/drivers/net/wireless/ath/ath10k/core.h
+++ b/drivers/net/wireless/ath/ath10k/core.h
@@ -176,6 +176,7 @@ struct ath10k_wmi {
 	struct completion service_ready;
 	struct completion unified_ready;
 	struct completion barrier;
+	struct completion radar_confirm;
 	wait_queue_head_t tx_credits_wq;
 	DECLARE_BITMAP(svc_map, WMI_SERVICE_MAX);
 	struct wmi_cmd_map *cmd;
@@ -378,6 +379,21 @@ struct ath10k_dfs_stats {
 	u32 radar_detected;
 };
 
+enum ath10k_radar_confirmation_state {
+	ATH10K_RADAR_CONFIRMATION_IDLE = 0,
+	ATH10K_RADAR_CONFIRMATION_INPROGRESS,
+	ATH10K_RADAR_CONFIRMATION_STOPPED,
+};
+
+struct ath10k_radar_found_info {
+	u32 pri_min;
+	u32 pri_max;
+	u32 width_min;
+	u32 width_max;
+	u32 sidx_min;
+	u32 sidx_max;
+};
+
 #define ATH10K_MAX_NUM_PEER_IDS (1 << 11) /* htt rx_desc limit */
 
 struct ath10k_peer {
@@ -1110,6 +1126,11 @@ struct ath10k {
 
 	u32 sta_tid_stats_mask;
 
+	/* protected by data_lock */
+	enum ath10k_radar_confirmation_state radar_conf_state;
+	struct ath10k_radar_found_info last_radar_info;
+	struct work_struct radar_confirmation_work;
+
 	/* must be last */
 	u8 drv_priv[0] __aligned(sizeof(void *));
 };
diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index e3b67ffdab72..e9c2fb318c03 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -3217,6 +3217,15 @@ static void ath10k_reg_notifier(struct wiphy *wiphy,
 					       ar->hw->wiphy->bands[NL80211_BAND_5GHZ]);
 }
 
+static void ath10k_stop_radar_confirmation(struct ath10k *ar)
+{
+	spin_lock_bh(&ar->data_lock);
+	ar->radar_conf_state = ATH10K_RADAR_CONFIRMATION_STOPPED;
+	spin_unlock_bh(&ar->data_lock);
+
+	cancel_work_sync(&ar->radar_confirmation_work);
+}
+
 /***************/
 /* TX handlers */
 /***************/
@@ -4333,6 +4342,7 @@ void ath10k_halt(struct ath10k *ar)
 
 	ath10k_scan_finish(ar);
 	ath10k_peer_cleanup_all(ar);
+	ath10k_stop_radar_confirmation(ar);
 	ath10k_core_stop(ar);
 	ath10k_hif_power_down(ar);
 
@@ -4758,6 +4768,8 @@ static int ath10k_start(struct ieee80211_hw *hw)
 	ath10k_spectral_start(ar);
 	ath10k_thermal_set_throttling(ar);
 
+	ar->radar_conf_state = ATH10K_RADAR_CONFIRMATION_IDLE;
+
 	mutex_unlock(&ar->conf_mutex);
 	return 0;
 
diff --git a/drivers/net/wireless/ath/ath10k/wmi-ops.h b/drivers/net/wireless/ath/ath10k/wmi-ops.h
index e37d16b31afe..5ecce04005d2 100644
--- a/drivers/net/wireless/ath/ath10k/wmi-ops.h
+++ b/drivers/net/wireless/ath/ath10k/wmi-ops.h
@@ -55,6 +55,8 @@ struct wmi_ops {
 			      struct wmi_wow_ev_arg *arg);
 	int (*pull_echo_ev)(struct ath10k *ar, struct sk_buff *skb,
 			    struct wmi_echo_ev_arg *arg);
+	int (*pull_dfs_status_ev)(struct ath10k *ar, struct sk_buff *skb,
+				  struct wmi_dfs_status_ev_arg *arg);
 	int (*pull_svc_avail)(struct ath10k *ar, struct sk_buff *skb,
 			      struct wmi_svc_avail_ev_arg *arg);
 
@@ -188,6 +190,9 @@ struct wmi_ops {
 						const struct wmi_tdls_peer_update_cmd_arg *arg,
 						const struct wmi_tdls_peer_capab_arg *cap,
 						const struct wmi_channel_arg *chan);
+	struct sk_buff *(*gen_radar_found)
+			(struct ath10k *ar,
+			 const struct ath10k_radar_found_info *arg);
 	struct sk_buff *(*gen_adaptive_qcs)(struct ath10k *ar, bool enable);
 	struct sk_buff *(*gen_pdev_get_tpc_config)(struct ath10k *ar,
 						   u32 param);
@@ -395,6 +400,16 @@ ath10k_wmi_pull_echo_ev(struct ath10k *ar, struct sk_buff *skb,
 	return ar->wmi.ops->pull_echo_ev(ar, skb, arg);
 }
 
+static inline int
+ath10k_wmi_pull_dfs_status(struct ath10k *ar, struct sk_buff *skb,
+			   struct wmi_dfs_status_ev_arg *arg)
+{
+	if (!ar->wmi.ops->pull_dfs_status_ev)
+		return -EOPNOTSUPP;
+
+	return ar->wmi.ops->pull_dfs_status_ev(ar, skb, arg);
+}
+
 static inline enum wmi_txbf_conf
 ath10k_wmi_get_txbf_conf_scheme(struct ath10k *ar)
 {
@@ -1511,4 +1526,21 @@ ath10k_wmi_pdev_get_tpc_table_cmdid(struct ath10k *ar, u32 param)
 				   ar->wmi.cmd->pdev_get_tpc_table_cmdid);
 }
 
+static inline int
+ath10k_wmi_report_radar_found(struct ath10k *ar,
+			      const struct ath10k_radar_found_info *arg)
+{
+	struct sk_buff *skb;
+
+	if (!ar->wmi.ops->gen_radar_found)
+		return -EOPNOTSUPP;
+
+	skb = ar->wmi.ops->gen_radar_found(ar, arg);
+	if (IS_ERR(skb))
+		return PTR_ERR(skb);
+
+	return ath10k_wmi_cmd_send(ar, skb,
+				   ar->wmi.cmd->radar_found_cmdid);
+}
+
 #endif
diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c
index 1bccac002e97..f97ab795cf2e 100644
--- a/drivers/net/wireless/ath/ath10k/wmi.c
+++ b/drivers/net/wireless/ath/ath10k/wmi.c
@@ -34,6 +34,7 @@
 
 #define ATH10K_WMI_BARRIER_ECHO_ID 0xBA991E9
 #define ATH10K_WMI_BARRIER_TIMEOUT_HZ (3 * HZ)
+#define ATH10K_WMI_DFS_CONF_TIMEOUT_HZ (HZ / 6)
 
 /* MAIN WMI cmd track */
 static struct wmi_cmd_map wmi_cmd_map = {
@@ -199,6 +200,7 @@ static struct wmi_cmd_map wmi_cmd_map = {
 	.set_cca_params_cmdid = WMI_CMD_UNSUPPORTED,
 	.pdev_bss_chan_info_request_cmdid = WMI_CMD_UNSUPPORTED,
 	.pdev_get_tpc_table_cmdid = WMI_CMD_UNSUPPORTED,
+	.radar_found_cmdid = WMI_CMD_UNSUPPORTED,
 };
 
 /* 10.X WMI cmd track */
@@ -367,6 +369,7 @@ static struct wmi_cmd_map wmi_10x_cmd_map = {
 	.set_cca_params_cmdid = WMI_CMD_UNSUPPORTED,
 	.pdev_bss_chan_info_request_cmdid = WMI_CMD_UNSUPPORTED,
 	.pdev_get_tpc_table_cmdid = WMI_CMD_UNSUPPORTED,
+	.radar_found_cmdid = WMI_CMD_UNSUPPORTED,
 };
 
 /* 10.2.4 WMI cmd track */
@@ -535,6 +538,7 @@ static struct wmi_cmd_map wmi_10_2_4_cmd_map = {
 	.pdev_bss_chan_info_request_cmdid =
 		WMI_10_2_PDEV_BSS_CHAN_INFO_REQUEST_CMDID,
 	.pdev_get_tpc_table_cmdid = WMI_CMD_UNSUPPORTED,
+	.radar_found_cmdid = WMI_CMD_UNSUPPORTED,
 };
 
 /* 10.4 WMI cmd track */
@@ -745,6 +749,7 @@ static struct wmi_cmd_map wmi_10_4_cmd_map = {
 	.tdls_set_state_cmdid = WMI_10_4_TDLS_SET_STATE_CMDID,
 	.tdls_peer_update_cmdid = WMI_10_4_TDLS_PEER_UPDATE_CMDID,
 	.tdls_set_offchan_mode_cmdid = WMI_10_4_TDLS_SET_OFFCHAN_MODE_CMDID,
+	.radar_found_cmdid = WMI_10_4_RADAR_FOUND_CMDID,
 };
 
 /* MAIN WMI VDEV param map */
@@ -1490,6 +1495,7 @@ static struct wmi_cmd_map wmi_10_2_cmd_map = {
 	.pdev_get_ani_ofdm_config_cmdid = WMI_CMD_UNSUPPORTED,
 	.pdev_reserve_ast_entry_cmdid = WMI_CMD_UNSUPPORTED,
 	.pdev_get_tpc_table_cmdid = WMI_CMD_UNSUPPORTED,
+	.radar_found_cmdid = WMI_CMD_UNSUPPORTED,
 };
 
 static struct wmi_pdev_param_map wmi_10_4_pdev_param_map = {
@@ -3683,6 +3689,68 @@ void ath10k_wmi_event_tbttoffset_update(struct ath10k *ar, struct sk_buff *skb)
 	ath10k_dbg(ar, ATH10K_DBG_WMI, "WMI_TBTTOFFSET_UPDATE_EVENTID\n");
 }
 
+static void ath10k_radar_detected(struct ath10k *ar)
+{
+	ath10k_dbg(ar, ATH10K_DBG_REGULATORY, "dfs radar detected\n");
+	ATH10K_DFS_STAT_INC(ar, radar_detected);
+
+	/* Control radar events reporting in debugfs file
+	 * dfs_block_radar_events
+	 */
+	if (ar->dfs_block_radar_events)
+		ath10k_info(ar, "DFS Radar detected, but ignored as requested\n");
+	else
+		ieee80211_radar_detected(ar->hw);
+}
+
+static void ath10k_radar_confirmation_work(struct work_struct *work)
+{
+	struct ath10k *ar = container_of(work, struct ath10k,
+					 radar_confirmation_work);
+	struct ath10k_radar_found_info radar_info;
+	int ret, time_left;
+
+	reinit_completion(&ar->wmi.radar_confirm);
+
+	spin_lock_bh(&ar->data_lock);
+	memcpy(&radar_info, &ar->last_radar_info, sizeof(radar_info));
+	spin_unlock_bh(&ar->data_lock);
+
+	ret = ath10k_wmi_report_radar_found(ar, &radar_info);
+	if (ret) {
+		ath10k_warn(ar, "failed to send radar found %d\n", ret);
+		goto wait_complete;
+	}
+
+	time_left = wait_for_completion_timeout(&ar->wmi.radar_confirm,
+						ATH10K_WMI_DFS_CONF_TIMEOUT_HZ);
+	if (time_left) {
+		/* DFS Confirmation status event received and
+		 * necessary action completed.
+		 */
+		goto wait_complete;
+	} else {
+		/* DFS Confirmation event not received from FW.Considering this
+		 * as real radar.
+		 */
+		ath10k_dbg(ar, ATH10K_DBG_REGULATORY,
+			   "dfs confirmation not received from fw, considering as radar\n");
+		goto radar_detected;
+	}
+
+radar_detected:
+	ath10k_radar_detected(ar);
+
+	/* Reset state to allow sending confirmation on consecutive radar
+	 * detections, unless radar confirmation is disabled/stopped.
+	 */
+wait_complete:
+	spin_lock_bh(&ar->data_lock);
+	if (ar->radar_conf_state != ATH10K_RADAR_CONFIRMATION_STOPPED)
+		ar->radar_conf_state = ATH10K_RADAR_CONFIRMATION_IDLE;
+	spin_unlock_bh(&ar->data_lock);
+}
+
 static void ath10k_dfs_radar_report(struct ath10k *ar,
 				    struct wmi_phyerr_ev_arg *phyerr,
 				    const struct phyerr_radar_report *rr,
@@ -3691,8 +3759,10 @@ static void ath10k_dfs_radar_report(struct ath10k *ar,
 	u32 reg0, reg1, tsf32l;
 	struct ieee80211_channel *ch;
 	struct pulse_event pe;
+	struct radar_detector_specs rs;
 	u64 tsf64;
 	u8 rssi, width;
+	struct ath10k_radar_found_info *radar_info;
 
 	reg0 = __le32_to_cpu(rr->reg0);
 	reg1 = __le32_to_cpu(rr->reg1);
@@ -3757,25 +3827,46 @@ static void ath10k_dfs_radar_report(struct ath10k *ar,
 
 	ATH10K_DFS_STAT_INC(ar, pulses_detected);
 
-	if (!ar->dfs_detector->add_pulse(ar->dfs_detector, &pe, NULL)) {
+	if (!ar->dfs_detector->add_pulse(ar->dfs_detector, &pe, &rs)) {
 		ath10k_dbg(ar, ATH10K_DBG_REGULATORY,
 			   "dfs no pulse pattern detected, yet\n");
 		return;
 	}
 
-radar_detected:
-	ath10k_dbg(ar, ATH10K_DBG_REGULATORY, "dfs radar detected\n");
-	ATH10K_DFS_STAT_INC(ar, radar_detected);
+	if ((test_bit(WMI_SERVICE_HOST_DFS_CHECK_SUPPORT, ar->wmi.svc_map)) &&
+	    ar->dfs_detector->region == NL80211_DFS_FCC) {
+		/* Consecutive radar indications need not be
+		 * sent to the firmware until we get confirmation
+		 * for the previous detected radar.
+		 */
+		spin_lock_bh(&ar->data_lock);
+		if (ar->radar_conf_state != ATH10K_RADAR_CONFIRMATION_IDLE) {
+			spin_unlock_bh(&ar->data_lock);
+			return;
+		}
+		ar->radar_conf_state = ATH10K_RADAR_CONFIRMATION_INPROGRESS;
+		radar_info = &ar->last_radar_info;
 
-	/* Control radar events reporting in debugfs file
-	 * dfs_block_radar_events
-	 */
-	if (ar->dfs_block_radar_events) {
-		ath10k_info(ar, "DFS Radar detected, but ignored as requested\n");
+		radar_info->pri_min = rs.pri_min;
+		radar_info->pri_max = rs.pri_max;
+		radar_info->width_min = rs.width_min;
+		radar_info->width_max = rs.width_max;
+		/*TODO Find sidx_min and sidx_max */
+		radar_info->sidx_min = MS(reg0, RADAR_REPORT_REG0_PULSE_SIDX);
+		radar_info->sidx_max = MS(reg0, RADAR_REPORT_REG0_PULSE_SIDX);
+
+		ath10k_dbg(ar, ATH10K_DBG_REGULATORY,
+			   "sending wmi radar found cmd pri_min %d pri_max %d width_min %d width_max %d sidx_min %d sidx_max %d\n",
+			   radar_info->pri_min, radar_info->pri_max,
+			   radar_info->width_min, radar_info->width_max,
+			   radar_info->sidx_min, radar_info->sidx_max);
+		ieee80211_queue_work(ar->hw, &ar->radar_confirmation_work);
+		spin_unlock_bh(&ar->data_lock);
 		return;
 	}
 
-	ieee80211_radar_detected(ar->hw);
+radar_detected:
+	ath10k_radar_detected(ar);
 }
 
 static int ath10k_dfs_fft_report(struct ath10k *ar,
@@ -4125,6 +4216,47 @@ void ath10k_wmi_event_phyerr(struct ath10k *ar, struct sk_buff *skb)
 	}
 }
 
+static int
+ath10k_wmi_10_4_op_pull_dfs_status_ev(struct ath10k *ar, struct sk_buff *skb,
+				      struct wmi_dfs_status_ev_arg *arg)
+{
+	struct wmi_dfs_status_ev_arg *ev = (void *)skb->data;
+
+	if (skb->len < sizeof(*ev))
+		return -EPROTO;
+
+	arg->status = ev->status;
+
+	return 0;
+}
+
+static void
+ath10k_wmi_event_dfs_status_check(struct ath10k *ar, struct sk_buff *skb)
+{
+	struct wmi_dfs_status_ev_arg status_arg = {};
+	int ret;
+
+	ret = ath10k_wmi_pull_dfs_status(ar, skb, &status_arg);
+
+	if (ret) {
+		ath10k_warn(ar, "failed to parse dfs status event: %d\n", ret);
+		return;
+	}
+
+	ath10k_dbg(ar, ATH10K_DBG_REGULATORY,
+		   "dfs status event received from fw: %d\n",
+		   status_arg.status);
+
+	/* Even in case of radar detection failure we follow the same
+	 * behaviour as if radar is detected i.e to switch to a different
+	 * channel.
+	 */
+	if (status_arg.status == WMI_HW_RADAR_DETECTED ||
+	    status_arg.status == WMI_RADAR_DETECTION_FAIL)
+		ath10k_radar_detected(ar);
+	complete(&ar->wmi.radar_confirm);
+}
+
 void ath10k_wmi_event_roam(struct ath10k *ar, struct sk_buff *skb)
 {
 	struct wmi_roam_ev_arg arg = {};
@@ -5876,6 +6008,9 @@ static void ath10k_wmi_10_4_op_rx(struct ath10k *ar, struct sk_buff *skb)
 	case WMI_10_4_PDEV_TPC_TABLE_EVENTID:
 		ath10k_wmi_event_tpc_final_table(ar, skb);
 		break;
+	case WMI_10_4_DFS_STATUS_CHECK_EVENTID:
+		ath10k_wmi_event_dfs_status_check(ar, skb);
+		break;
 	default:
 		ath10k_warn(ar, "Unknown eventid: %d\n", id);
 		break;
@@ -8461,6 +8596,32 @@ ath10k_wmi_10_4_gen_tdls_peer_update(struct ath10k *ar,
 	return skb;
 }
 
+static struct sk_buff *
+ath10k_wmi_10_4_gen_radar_found(struct ath10k *ar,
+				const struct ath10k_radar_found_info *arg)
+{
+	struct wmi_radar_found_info *cmd;
+	struct sk_buff *skb;
+
+	skb = ath10k_wmi_alloc_skb(ar, sizeof(*cmd));
+	if (!skb)
+		return ERR_PTR(-ENOMEM);
+
+	cmd = (struct wmi_radar_found_info *)skb->data;
+	cmd->pri_min   = __cpu_to_le32(arg->pri_min);
+	cmd->pri_max   = __cpu_to_le32(arg->pri_max);
+	cmd->width_min = __cpu_to_le32(arg->width_min);
+	cmd->width_max = __cpu_to_le32(arg->width_max);
+	cmd->sidx_min  = __cpu_to_le32(arg->sidx_min);
+	cmd->sidx_max  = __cpu_to_le32(arg->sidx_max);
+
+	ath10k_dbg(ar, ATH10K_DBG_WMI,
+		   "wmi radar found pri_min %d pri_max %d width_min %d width_max %d sidx_min %d sidx_max %d\n",
+		   arg->pri_min, arg->pri_max, arg->width_min,
+		   arg->width_max, arg->sidx_min, arg->sidx_max);
+	return skb;
+}
+
 static struct sk_buff *
 ath10k_wmi_op_gen_echo(struct ath10k *ar, u32 value)
 {
@@ -8798,6 +8959,7 @@ static const struct wmi_ops wmi_10_4_ops = {
 	.pull_svc_rdy = ath10k_wmi_main_op_pull_svc_rdy_ev,
 	.pull_rdy = ath10k_wmi_op_pull_rdy_ev,
 	.pull_roam_ev = ath10k_wmi_op_pull_roam_ev,
+	.pull_dfs_status_ev = ath10k_wmi_10_4_op_pull_dfs_status_ev,
 	.get_txbf_conf_scheme = ath10k_wmi_10_4_txbf_conf_scheme,
 
 	.gen_pdev_suspend = ath10k_wmi_op_gen_pdev_suspend,
@@ -8844,6 +9006,7 @@ static const struct wmi_ops wmi_10_4_ops = {
 	.gen_tdls_peer_update = ath10k_wmi_10_4_gen_tdls_peer_update,
 	.gen_pdev_get_tpc_table_cmdid =
 			ath10k_wmi_10_4_op_gen_pdev_get_tpc_table_cmdid,
+	.gen_radar_found = ath10k_wmi_10_4_gen_radar_found,
 
 	/* shared with 10.2 */
 	.pull_echo_ev = ath10k_wmi_op_pull_echo_ev,
@@ -8906,8 +9069,11 @@ int ath10k_wmi_attach(struct ath10k *ar)
 	init_completion(&ar->wmi.service_ready);
 	init_completion(&ar->wmi.unified_ready);
 	init_completion(&ar->wmi.barrier);
+	init_completion(&ar->wmi.radar_confirm);
 
 	INIT_WORK(&ar->svc_rdy_work, ath10k_wmi_event_service_ready_work);
+	INIT_WORK(&ar->radar_confirmation_work,
+		  ath10k_radar_confirmation_work);
 
 	return 0;
 }
diff --git a/drivers/net/wireless/ath/ath10k/wmi.h b/drivers/net/wireless/ath/ath10k/wmi.h
index 700b12f46baf..b48db54e9865 100644
--- a/drivers/net/wireless/ath/ath10k/wmi.h
+++ b/drivers/net/wireless/ath/ath10k/wmi.h
@@ -969,6 +969,7 @@ struct wmi_cmd_map {
 	u32 vdev_sifs_trigger_time_cmdid;
 	u32 pdev_wds_entry_list_cmdid;
 	u32 tdls_set_offchan_mode_cmdid;
+	u32 radar_found_cmdid;
 };
 
 /*
@@ -1803,6 +1804,11 @@ enum wmi_10_4_cmd_id {
 	WMI_10_4_TDLS_SET_STATE_CMDID,
 	WMI_10_4_TDLS_PEER_UPDATE_CMDID,
 	WMI_10_4_TDLS_SET_OFFCHAN_MODE_CMDID,
+	WMI_10_4_PDEV_SEND_FD_CMDID,
+	WMI_10_4_ENABLE_FILS_CMDID,
+	WMI_10_4_PDEV_SET_BRIDGE_MACADDR_CMDID,
+	WMI_10_4_ATF_GROUP_WMM_AC_CONFIG_REQUEST_CMDID,
+	WMI_10_4_RADAR_FOUND_CMDID,
 	WMI_10_4_PDEV_UTF_CMDID = WMI_10_4_END_CMDID - 1,
 };
 
@@ -1878,6 +1884,9 @@ enum wmi_10_4_event_id {
 	WMI_10_4_PDEV_TPC_TABLE_EVENTID,
 	WMI_10_4_PDEV_WDS_ENTRY_LIST_EVENTID,
 	WMI_10_4_TDLS_PEER_EVENTID,
+	WMI_10_4_HOST_SWFDA_EVENTID,
+	WMI_10_4_ESP_ESTIMATE_EVENTID,
+	WMI_10_4_DFS_STATUS_CHECK_EVENTID,
 	WMI_10_4_PDEV_UTF_EVENTID = WMI_10_4_END_EVENTID - 1,
 };
 
@@ -3398,6 +3407,25 @@ struct wmi_10_4_phyerr_event {
 	u8 buf[0];
 } __packed;
 
+struct wmi_radar_found_info {
+	__le32 pri_min;
+	__le32 pri_max;
+	__le32 width_min;
+	__le32 width_max;
+	__le32 sidx_min;
+	__le32 sidx_max;
+} __packed;
+
+enum wmi_radar_confirmation_status {
+	/* Detected radar was due to SW pulses */
+	WMI_SW_RADAR_DETECTED    = 0,
+
+	WMI_RADAR_DETECTION_FAIL = 1,
+
+	/* Real radar detected */
+	WMI_HW_RADAR_DETECTED    = 2,
+};
+
 #define PHYERR_TLV_SIG				0xBB
 #define PHYERR_TLV_TAG_SEARCH_FFT_REPORT	0xFB
 #define PHYERR_TLV_TAG_RADAR_PULSE_SUMMARY	0xF8
@@ -6631,6 +6659,10 @@ struct wmi_phyerr_hdr_arg {
 	const void *phyerrs;
 };
 
+struct wmi_dfs_status_ev_arg {
+	u32 status;
+};
+
 struct wmi_svc_rdy_ev_arg {
 	__le32 min_tx_power;
 	__le32 max_tx_power;
-- 
cgit v1.2.3-59-g8ed1b


From 87f825e6e246cee0cdeaafd5bba4ef8c6df2748d Mon Sep 17 00:00:00 2001
From: Eyal Ilsar <eilsar@codeaurora.org>
Date: Tue, 22 May 2018 22:02:56 +0300
Subject: wcn36xx: Add support for Factory Test Mode (FTM)

Introduce infrastructure for supporting Factory Test Mode (FTM) of the
wireless LAN subsystem. In order for the user space to access the
firmware in test mode the relevant netlink channel needs to be exposed
from the kernel driver.

The above is achieved as follows:
1) Register wcn36xx driver to testmode callback from netlink
2) Add testmode callback implementation to handle incoming FTM commands
3) Add FTM command packet structure
4) Add handling for GET_BUILD_RELEASE_NUMBER (msgid=0x32A2)
5) Add generic handling for all PTT_MSG packets

Signed-off-by: Eyal Ilsar <eilsar@codeaurora.org>
Signed-off-by: Ramon Fried <ramon.fried@linaro.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/wcn36xx/Makefile     |   2 +
 drivers/net/wireless/ath/wcn36xx/hal.h        |  16 +++
 drivers/net/wireless/ath/wcn36xx/main.c       |   3 +
 drivers/net/wireless/ath/wcn36xx/smd.c        |  81 ++++++++++++++
 drivers/net/wireless/ath/wcn36xx/smd.h        |   4 +
 drivers/net/wireless/ath/wcn36xx/testmode.c   | 149 ++++++++++++++++++++++++++
 drivers/net/wireless/ath/wcn36xx/testmode.h   |  46 ++++++++
 drivers/net/wireless/ath/wcn36xx/testmode_i.h |  29 +++++
 drivers/net/wireless/ath/wcn36xx/wcn36xx.h    |   2 +
 9 files changed, 332 insertions(+)
 create mode 100644 drivers/net/wireless/ath/wcn36xx/testmode.c
 create mode 100644 drivers/net/wireless/ath/wcn36xx/testmode.h
 create mode 100644 drivers/net/wireless/ath/wcn36xx/testmode_i.h

diff --git a/drivers/net/wireless/ath/wcn36xx/Makefile b/drivers/net/wireless/ath/wcn36xx/Makefile
index 3b09435104eb..582049f65735 100644
--- a/drivers/net/wireless/ath/wcn36xx/Makefile
+++ b/drivers/net/wireless/ath/wcn36xx/Makefile
@@ -6,3 +6,5 @@ wcn36xx-y +=   main.o \
                smd.o \
                pmc.o \
                debug.o
+
+wcn36xx-$(CONFIG_NL80211_TESTMODE) += testmode.o
diff --git a/drivers/net/wireless/ath/wcn36xx/hal.h b/drivers/net/wireless/ath/wcn36xx/hal.h
index 2aed6c233508..8abda2760e04 100644
--- a/drivers/net/wireless/ath/wcn36xx/hal.h
+++ b/drivers/net/wireless/ath/wcn36xx/hal.h
@@ -2236,6 +2236,22 @@ struct wcn36xx_hal_switch_channel_rsp_msg {
 
 } __packed;
 
+struct wcn36xx_hal_process_ptt_msg_req_msg {
+	struct wcn36xx_hal_msg_header header;
+
+	/* Actual FTM Command body */
+	u8 ptt_msg[0];
+} __packed;
+
+struct wcn36xx_hal_process_ptt_msg_rsp_msg {
+	struct wcn36xx_hal_msg_header header;
+
+	/* FTM Command response status */
+	u32 ptt_msg_resp_status;
+	/* Actual FTM Command body */
+	u8 ptt_msg[0];
+} __packed;
+
 struct update_edca_params_req_msg {
 	struct wcn36xx_hal_msg_header header;
 
diff --git a/drivers/net/wireless/ath/wcn36xx/main.c b/drivers/net/wireless/ath/wcn36xx/main.c
index 662e50540b07..aeb5e6e806be 100644
--- a/drivers/net/wireless/ath/wcn36xx/main.c
+++ b/drivers/net/wireless/ath/wcn36xx/main.c
@@ -26,6 +26,7 @@
 #include <linux/soc/qcom/smem_state.h>
 #include <linux/soc/qcom/wcnss_ctrl.h>
 #include "wcn36xx.h"
+#include "testmode.h"
 
 unsigned int wcn36xx_dbg_mask;
 module_param_named(debug_mask, wcn36xx_dbg_mask, uint, 0644);
@@ -1146,6 +1147,8 @@ static const struct ieee80211_ops wcn36xx_ops = {
 	.sta_add		= wcn36xx_sta_add,
 	.sta_remove		= wcn36xx_sta_remove,
 	.ampdu_action		= wcn36xx_ampdu_action,
+
+	CFG80211_TESTMODE_CMD(wcn36xx_tm_cmd)
 };
 
 static int wcn36xx_init_ieee80211(struct wcn36xx *wcn)
diff --git a/drivers/net/wireless/ath/wcn36xx/smd.c b/drivers/net/wireless/ath/wcn36xx/smd.c
index b855a58d5aac..b4dadf75d565 100644
--- a/drivers/net/wireless/ath/wcn36xx/smd.c
+++ b/drivers/net/wireless/ath/wcn36xx/smd.c
@@ -298,12 +298,26 @@ static void init_hal_msg(struct wcn36xx_hal_msg_header *hdr,
 		msg_body.header.len = sizeof(msg_body);			\
 	} while (0)							\
 
+#define INIT_HAL_PTT_MSG(p_msg_body, ppt_msg_len) \
+	do { \
+		memset(p_msg_body, 0, sizeof(*p_msg_body) + ppt_msg_len); \
+		p_msg_body->header.msg_type = WCN36XX_HAL_PROCESS_PTT_REQ; \
+		p_msg_body->header.msg_version = WCN36XX_HAL_MSG_VERSION0; \
+		p_msg_body->header.len = sizeof(*p_msg_body) + ppt_msg_len; \
+	} while (0)
+
 #define PREPARE_HAL_BUF(send_buf, msg_body) \
 	do {							\
 		memset(send_buf, 0, msg_body.header.len);	\
 		memcpy(send_buf, &msg_body, sizeof(msg_body));	\
 	} while (0)						\
 
+#define PREPARE_HAL_PTT_MSG_BUF(send_buf, p_msg_body) \
+	do {							\
+		memset(send_buf, 0, p_msg_body->header.len); \
+		memcpy(send_buf, p_msg_body, p_msg_body->header.len); \
+	} while (0)
+
 static int wcn36xx_smd_rsp_status_check(void *buf, size_t len)
 {
 	struct wcn36xx_fw_msg_status_rsp *rsp;
@@ -760,6 +774,71 @@ out:
 	return ret;
 }
 
+static int wcn36xx_smd_process_ptt_msg_rsp(void *buf, size_t len,
+					   void **p_ptt_rsp_msg)
+{
+	struct wcn36xx_hal_process_ptt_msg_rsp_msg *rsp;
+	int ret;
+
+	ret = wcn36xx_smd_rsp_status_check(buf, len);
+	if (ret)
+		return ret;
+
+	rsp = (struct wcn36xx_hal_process_ptt_msg_rsp_msg *)buf;
+
+	wcn36xx_dbg(WCN36XX_DBG_HAL, "process ptt msg responded with length %d\n",
+		    rsp->header.len);
+	wcn36xx_dbg_dump(WCN36XX_DBG_HAL_DUMP, "HAL_PTT_MSG_RSP:", rsp->ptt_msg,
+			 rsp->header.len - sizeof(rsp->ptt_msg_resp_status));
+
+	if (rsp->header.len > 0) {
+		*p_ptt_rsp_msg = kmalloc(rsp->header.len, GFP_ATOMIC);
+		if (!*p_ptt_rsp_msg)
+			return -ENOMEM;
+		memcpy(*p_ptt_rsp_msg, rsp->ptt_msg, rsp->header.len);
+	}
+	return ret;
+}
+
+int wcn36xx_smd_process_ptt_msg(struct wcn36xx *wcn,
+				struct ieee80211_vif *vif, void *ptt_msg, size_t len,
+		void **ptt_rsp_msg)
+{
+	struct wcn36xx_hal_process_ptt_msg_req_msg *p_msg_body;
+	int ret;
+
+	mutex_lock(&wcn->hal_mutex);
+	p_msg_body = kmalloc(
+		sizeof(struct wcn36xx_hal_process_ptt_msg_req_msg) + len,
+		GFP_ATOMIC);
+	if (!p_msg_body) {
+		ret = -ENOMEM;
+		goto out_nomem;
+	}
+	INIT_HAL_PTT_MSG(p_msg_body, len);
+
+	memcpy(&p_msg_body->ptt_msg, ptt_msg, len);
+
+	PREPARE_HAL_PTT_MSG_BUF(wcn->hal_buf, p_msg_body);
+
+	ret = wcn36xx_smd_send_and_wait(wcn, p_msg_body->header.len);
+	if (ret) {
+		wcn36xx_err("Sending hal_process_ptt_msg failed\n");
+		goto out;
+	}
+	ret = wcn36xx_smd_process_ptt_msg_rsp(wcn->hal_buf, wcn->hal_rsp_len,
+					      ptt_rsp_msg);
+	if (ret) {
+		wcn36xx_err("process_ptt_msg response failed err=%d\n", ret);
+		goto out;
+	}
+out:
+	kfree(p_msg_body);
+out_nomem:
+	mutex_unlock(&wcn->hal_mutex);
+	return ret;
+}
+
 static int wcn36xx_smd_update_scan_params_rsp(void *buf, size_t len)
 {
 	struct wcn36xx_hal_update_scan_params_resp *rsp;
@@ -2396,6 +2475,7 @@ int wcn36xx_smd_rsp_process(struct rpmsg_device *rpdev,
 	case WCN36XX_HAL_JOIN_RSP:
 	case WCN36XX_HAL_UPDATE_SCAN_PARAM_RSP:
 	case WCN36XX_HAL_CH_SWITCH_RSP:
+	case WCN36XX_HAL_PROCESS_PTT_RSP:
 	case WCN36XX_HAL_FEATURE_CAPS_EXCHANGE_RSP:
 	case WCN36XX_HAL_8023_MULTICAST_LIST_RSP:
 	case WCN36XX_HAL_START_SCAN_OFFLOAD_RSP:
@@ -2436,6 +2516,7 @@ int wcn36xx_smd_rsp_process(struct rpmsg_device *rpdev,
 
 	return 0;
 }
+
 static void wcn36xx_ind_smd_work(struct work_struct *work)
 {
 	struct wcn36xx *wcn =
diff --git a/drivers/net/wireless/ath/wcn36xx/smd.h b/drivers/net/wireless/ath/wcn36xx/smd.h
index 61bb8d43138c..ff15df8ab56f 100644
--- a/drivers/net/wireless/ath/wcn36xx/smd.h
+++ b/drivers/net/wireless/ath/wcn36xx/smd.h
@@ -86,6 +86,10 @@ int wcn36xx_smd_send_beacon(struct wcn36xx *wcn, struct ieee80211_vif *vif,
 			    u16 p2p_off);
 int wcn36xx_smd_switch_channel(struct wcn36xx *wcn,
 			       struct ieee80211_vif *vif, int ch);
+int wcn36xx_smd_process_ptt_msg(struct wcn36xx *wcn,
+				struct ieee80211_vif *vif,
+				void *ptt_msg, size_t len,
+				void **ptt_rsp_msg);
 int wcn36xx_smd_update_proberesp_tmpl(struct wcn36xx *wcn,
 				      struct ieee80211_vif *vif,
 				      struct sk_buff *skb);
diff --git a/drivers/net/wireless/ath/wcn36xx/testmode.c b/drivers/net/wireless/ath/wcn36xx/testmode.c
new file mode 100644
index 000000000000..1279064a3b71
--- /dev/null
+++ b/drivers/net/wireless/ath/wcn36xx/testmode.c
@@ -0,0 +1,149 @@
+﻿/*
+ * Copyright (c) 2018, The Linux Foundation. All rights reserved.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <net/netlink.h>
+#include <linux/firmware.h>
+#include <net/cfg80211.h>
+#include "wcn36xx.h"
+
+#include "testmode.h"
+#include "testmode_i.h"
+#include "hal.h"
+#include "smd.h"
+
+static const struct nla_policy wcn36xx_tm_policy[WCN36XX_TM_ATTR_MAX + 1] = {
+	[WCN36XX_TM_ATTR_CMD] = { .type = NLA_U16 },
+	[WCN36XX_TM_ATTR_DATA] = { .type = NLA_BINARY,
+	.len = WCN36XX_TM_DATA_MAX_LEN },
+};
+
+struct build_release_number {
+	u16 drv_major;
+	u16 drv_minor;
+	u16 drv_patch;
+	u16 drv_build;
+	u16 ptt_max;
+	u16 ptt_min;
+	u16 fw_ver;
+} __packed;
+
+static int wcn36xx_tm_cmd_ptt(struct wcn36xx *wcn, struct ieee80211_vif *vif,
+			      struct nlattr *tb[])
+{
+	int ret = 0, buf_len;
+	void *buf;
+	struct ftm_rsp_msg *msg, *rsp = NULL;
+	struct sk_buff *skb;
+
+	if (!tb[WCN36XX_TM_ATTR_DATA])
+		return -EINVAL;
+
+	buf = nla_data(tb[WCN36XX_TM_ATTR_DATA]);
+	buf_len = nla_len(tb[WCN36XX_TM_ATTR_DATA]);
+	msg = (struct ftm_rsp_msg *)buf;
+
+	wcn36xx_dbg(WCN36XX_DBG_TESTMODE,
+		    "testmode cmd wmi msg_id 0x%04X msg_len %d buf %pK buf_len %d\n",
+		   msg->msg_id, msg->msg_body_length,
+		   buf, buf_len);
+
+	wcn36xx_dbg_dump(WCN36XX_DBG_TESTMODE_DUMP, "REQ ", buf, buf_len);
+
+	if (msg->msg_id == MSG_GET_BUILD_RELEASE_NUMBER) {
+		struct build_release_number *body =
+				(struct build_release_number *)
+				msg->msg_response;
+
+		body->drv_major = wcn->fw_major;
+		body->drv_minor = wcn->fw_minor;
+		body->drv_patch = wcn->fw_version;
+		body->drv_build = wcn->fw_revision;
+		body->ptt_max = 10;
+		body->ptt_min = 0;
+
+		rsp = msg;
+		rsp->resp_status = 0;
+	} else {
+		wcn36xx_dbg(WCN36XX_DBG_TESTMODE,
+			    "PPT Request >> HAL size %d\n",
+				msg->msg_body_length);
+
+		msg->resp_status = wcn36xx_smd_process_ptt_msg(wcn, vif, msg,
+							       msg->msg_body_length, (void *)(&rsp));
+
+		wcn36xx_dbg(WCN36XX_DBG_TESTMODE,
+			    "Response status = %d\n",
+				msg->resp_status);
+		if (rsp)
+			wcn36xx_dbg(WCN36XX_DBG_TESTMODE,
+				    "PPT Response << HAL size %d\n",
+					rsp->msg_body_length);
+	}
+
+	if (!rsp) {
+		rsp = msg;
+		wcn36xx_warn("No response! Echoing request with response status %d\n",
+			     rsp->resp_status);
+	}
+	wcn36xx_dbg_dump(WCN36XX_DBG_TESTMODE_DUMP, "RSP ",
+			 rsp, rsp->msg_body_length);
+
+	skb = cfg80211_testmode_alloc_reply_skb(wcn->hw->wiphy,
+						nla_total_size(msg->msg_body_length));
+	if (!skb) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = nla_put(skb, WCN36XX_TM_ATTR_DATA, rsp->msg_body_length, rsp);
+	if (ret) {
+		kfree_skb(skb);
+		goto out;
+	}
+
+	ret = cfg80211_testmode_reply(skb);
+
+out:
+	if (rsp != msg)
+		kfree(rsp);
+
+	return ret;
+}
+
+int wcn36xx_tm_cmd(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+		   void *data, int len)
+{
+	struct wcn36xx *wcn = hw->priv;
+	struct nlattr *tb[WCN36XX_TM_ATTR_MAX + 1];
+	int ret = 0;
+	unsigned short attr;
+
+	wcn36xx_dbg_dump(WCN36XX_DBG_TESTMODE_DUMP, "Data:", data, len);
+	ret = nla_parse(tb, WCN36XX_TM_ATTR_MAX, data, len,
+			wcn36xx_tm_policy, NULL);
+	if (ret)
+		return ret;
+
+	if (!tb[WCN36XX_TM_ATTR_CMD])
+		return -EINVAL;
+
+	attr = nla_get_u16(tb[WCN36XX_TM_ATTR_CMD]);
+
+	if (attr != WCN36XX_TM_CMD_PTT)
+		return -EOPNOTSUPP;
+
+	return wcn36xx_tm_cmd_ptt(wcn, vif, tb);
+}
diff --git a/drivers/net/wireless/ath/wcn36xx/testmode.h b/drivers/net/wireless/ath/wcn36xx/testmode.h
new file mode 100644
index 000000000000..4c6cfdb46580
--- /dev/null
+++ b/drivers/net/wireless/ath/wcn36xx/testmode.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2018, The Linux Foundation. All rights reserved.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "wcn36xx.h"
+
+struct ftm_rsp_msg {
+	u16 msg_id;
+	u16 msg_body_length;
+	u32 resp_status;
+	u8 msg_response[0];
+} __packed;
+
+/* The request buffer of FTM which contains a byte of command and the request */
+struct ftm_payload {
+	u16 ftm_cmd_type;
+	struct ftm_rsp_msg ftm_cmd_msg;
+} __packed;
+
+#define MSG_GET_BUILD_RELEASE_NUMBER 0x32A2
+
+#ifdef CONFIG_NL80211_TESTMODE
+int wcn36xx_tm_cmd(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+		   void *data, int len);
+
+#else
+static inline int wcn36xx_tm_cmd(struct ieee80211_hw *hw,
+				 struct ieee80211_vif *vif,
+				void *data, int len)
+{
+	return 0;
+}
+
+#endif
diff --git a/drivers/net/wireless/ath/wcn36xx/testmode_i.h b/drivers/net/wireless/ath/wcn36xx/testmode_i.h
new file mode 100644
index 000000000000..8a1477ffd5a0
--- /dev/null
+++ b/drivers/net/wireless/ath/wcn36xx/testmode_i.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018, The Linux Foundation. All rights reserved.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#define WCN36XX_TM_DATA_MAX_LEN		5000
+
+enum wcn36xx_tm_attr {
+	__WCN36XX_TM_ATTR_INVALID	= 0,
+	WCN36XX_TM_ATTR_CMD		= 1,
+	WCN36XX_TM_ATTR_DATA		= 2,
+
+	/* keep last */
+	__WCN36XX_TM_ATTR_AFTER_LAST,
+	WCN36XX_TM_ATTR_MAX		= __WCN36XX_TM_ATTR_AFTER_LAST - 1,
+};
+
+#define WCN36XX_TM_CMD_PTT 3
diff --git a/drivers/net/wireless/ath/wcn36xx/wcn36xx.h b/drivers/net/wireless/ath/wcn36xx/wcn36xx.h
index 9343989d1169..11e74015c79a 100644
--- a/drivers/net/wireless/ath/wcn36xx/wcn36xx.h
+++ b/drivers/net/wireless/ath/wcn36xx/wcn36xx.h
@@ -50,6 +50,8 @@ enum wcn36xx_debug_mask {
 	WCN36XX_DBG_BEACON_DUMP	= 0x00001000,
 	WCN36XX_DBG_PMC		= 0x00002000,
 	WCN36XX_DBG_PMC_DUMP	= 0x00004000,
+	WCN36XX_DBG_TESTMODE		= 0x00008000,
+	WCN36XX_DBG_TESTMODE_DUMP	= 0x00010000,
 	WCN36XX_DBG_ANY		= 0xffffffff,
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 7d850abd5f4edb1b1ca4b4141a4453305736f564 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Thu, 24 May 2018 11:56:48 +0300
Subject: net: bridge: add support for port isolation

This patch adds support for a new port flag - BR_ISOLATED. If it is set
then isolated ports cannot communicate between each other, but they can
still communicate with non-isolated ports. The same can be achieved via
ACLs but they can't scale with large number of ports and also the
complexity of the rules grows. This feature can be used to achieve
isolated vlan functionality (similar to pvlan) as well, though currently
it will be port-wide (for all vlans on the port). The new test in
should_deliver uses data that is already cache hot and the new boolean
is used to avoid an additional source port test in should_deliver.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Reviewed-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_bridge.h    | 1 +
 include/uapi/linux/if_link.h | 1 +
 net/bridge/br_forward.c      | 3 ++-
 net/bridge/br_input.c        | 1 +
 net/bridge/br_netlink.c      | 9 ++++++++-
 net/bridge/br_private.h      | 9 +++++++++
 net/bridge/br_sysfs_if.c     | 2 ++
 7 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 585d27182425..7843b98e1c6e 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -50,6 +50,7 @@ struct br_ip_list {
 #define BR_VLAN_TUNNEL		BIT(13)
 #define BR_BCAST_FLOOD		BIT(14)
 #define BR_NEIGH_SUPPRESS	BIT(15)
+#define BR_ISOLATED		BIT(16)
 
 #define BR_DEFAULT_AGEING_TIME	(300 * HZ)
 
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index b85266420bfb..cf01b6824244 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -333,6 +333,7 @@ enum {
 	IFLA_BRPORT_BCAST_FLOOD,
 	IFLA_BRPORT_GROUP_FWD_MASK,
 	IFLA_BRPORT_NEIGH_SUPPRESS,
+	IFLA_BRPORT_ISOLATED,
 	__IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 7a7fd672ccf2..9019f326fe81 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -30,7 +30,8 @@ static inline int should_deliver(const struct net_bridge_port *p,
 	vg = nbp_vlan_group_rcu(p);
 	return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) &&
 		br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING &&
-		nbp_switchdev_allowed_egress(p, skb);
+		nbp_switchdev_allowed_egress(p, skb) &&
+		!br_skb_isolated(p, skb);
 }
 
 int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 7f98a7d25866..72074276c088 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -114,6 +114,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
 		goto drop;
 
 	BR_INPUT_SKB_CB(skb)->brdev = br->dev;
+	BR_INPUT_SKB_CB(skb)->src_port_isolated = !!(p->flags & BR_ISOLATED);
 
 	if (IS_ENABLED(CONFIG_INET) &&
 	    (skb->protocol == htons(ETH_P_ARP) ||
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 015f465c514b..9f5eb05b0373 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -139,6 +139,7 @@ static inline size_t br_port_info_size(void)
 		+ nla_total_size(1)	/* IFLA_BRPORT_PROXYARP_WIFI */
 		+ nla_total_size(1)	/* IFLA_BRPORT_VLAN_TUNNEL */
 		+ nla_total_size(1)	/* IFLA_BRPORT_NEIGH_SUPPRESS */
+		+ nla_total_size(1)	/* IFLA_BRPORT_ISOLATED */
 		+ nla_total_size(sizeof(struct ifla_bridge_id))	/* IFLA_BRPORT_ROOT_ID */
 		+ nla_total_size(sizeof(struct ifla_bridge_id))	/* IFLA_BRPORT_BRIDGE_ID */
 		+ nla_total_size(sizeof(u16))	/* IFLA_BRPORT_DESIGNATED_PORT */
@@ -213,7 +214,8 @@ static int br_port_fill_attrs(struct sk_buff *skb,
 							BR_VLAN_TUNNEL)) ||
 	    nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) ||
 	    nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS,
-		       !!(p->flags & BR_NEIGH_SUPPRESS)))
+		       !!(p->flags & BR_NEIGH_SUPPRESS)) ||
+	    nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED)))
 		return -EMSGSIZE;
 
 	timerval = br_timer_value(&p->message_age_timer);
@@ -660,6 +662,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
 	[IFLA_BRPORT_VLAN_TUNNEL] = { .type = NLA_U8 },
 	[IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 },
 	[IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 },
+	[IFLA_BRPORT_ISOLATED]	= { .type = NLA_U8 },
 };
 
 /* Change the state of the port and notify spanning tree */
@@ -810,6 +813,10 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
 	if (err)
 		return err;
 
+	err = br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED);
+	if (err)
+		return err;
+
 	br_port_flags_change(p, old_flags ^ p->flags);
 	return 0;
 }
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 742f40aefdaf..11520ed528b0 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -423,6 +423,7 @@ struct br_input_skb_cb {
 #endif
 
 	bool proxyarp_replied;
+	bool src_port_isolated;
 
 #ifdef CONFIG_BRIDGE_VLAN_FILTERING
 	bool vlan_filtered;
@@ -574,6 +575,14 @@ int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
 void br_flood(struct net_bridge *br, struct sk_buff *skb,
 	      enum br_pkt_type pkt_type, bool local_rcv, bool local_orig);
 
+/* return true if both source port and dest port are isolated */
+static inline bool br_skb_isolated(const struct net_bridge_port *to,
+				   const struct sk_buff *skb)
+{
+	return BR_INPUT_SKB_CB(skb)->src_port_isolated &&
+	       (to->flags & BR_ISOLATED);
+}
+
 /* br_if.c */
 void br_port_carrier_check(struct net_bridge_port *p, bool *notified);
 int br_add_bridge(struct net *net, const char *name);
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index fd31ad83ec7b..f99c5bf5c906 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -192,6 +192,7 @@ BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI);
 BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD);
 BRPORT_ATTR_FLAG(broadcast_flood, BR_BCAST_FLOOD);
 BRPORT_ATTR_FLAG(neigh_suppress, BR_NEIGH_SUPPRESS);
+BRPORT_ATTR_FLAG(isolated, BR_ISOLATED);
 
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf)
@@ -243,6 +244,7 @@ static const struct brport_attribute *brport_attrs[] = {
 	&brport_attr_broadcast_flood,
 	&brport_attr_group_fwd_mask,
 	&brport_attr_neigh_suppress,
+	&brport_attr_isolated,
 	NULL
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 0c235113b3c42197dba66baf76697359b03a5046 Mon Sep 17 00:00:00 2001
From: Martin Habets <mhabets@solarflare.com>
Date: Thu, 24 May 2018 10:14:00 +0100
Subject: sfc: stop the TX queue before pushing new buffers

efx_enqueue_skb() can push new buffers for the xmit_more functionality.
We must stops the TX queue before this or else the TX queue does not get
restarted and we get a netdev watchdog.

In the error handling we may now need to unwind more than 1 packet, and
we may need to push the new buffers onto the partner queue.

v2: In the error leg also push this queue if xmit_more is set

Fixes: e9117e5099ea ("sfc: Firmware-Assisted TSO version 2")
Reported-by: Jarod Wilson <jarod@redhat.com>
Tested-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: Martin Habets <mhabets@solarflare.com>
Acked-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/sfc/tx.c | 33 +++++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
index cece961f2e82..c3ad564ac4c0 100644
--- a/drivers/net/ethernet/sfc/tx.c
+++ b/drivers/net/ethernet/sfc/tx.c
@@ -435,17 +435,18 @@ static int efx_tx_map_data(struct efx_tx_queue *tx_queue, struct sk_buff *skb,
 	} while (1);
 }
 
-/* Remove buffers put into a tx_queue.  None of the buffers must have
- * an skb attached.
+/* Remove buffers put into a tx_queue for the current packet.
+ * None of the buffers must have an skb attached.
  */
-static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue)
+static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue,
+			       unsigned int insert_count)
 {
 	struct efx_tx_buffer *buffer;
 	unsigned int bytes_compl = 0;
 	unsigned int pkts_compl = 0;
 
 	/* Work backwards until we hit the original insert pointer value */
-	while (tx_queue->insert_count != tx_queue->write_count) {
+	while (tx_queue->insert_count != insert_count) {
 		--tx_queue->insert_count;
 		buffer = __efx_tx_queue_get_insert_buffer(tx_queue);
 		efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl);
@@ -504,6 +505,8 @@ static int efx_tx_tso_fallback(struct efx_tx_queue *tx_queue,
  */
 netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
 {
+	unsigned int old_insert_count = tx_queue->insert_count;
+	bool xmit_more = skb->xmit_more;
 	bool data_mapped = false;
 	unsigned int segments;
 	unsigned int skb_len;
@@ -553,8 +556,10 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
 	/* Update BQL */
 	netdev_tx_sent_queue(tx_queue->core_txq, skb_len);
 
+	efx_tx_maybe_stop_queue(tx_queue);
+
 	/* Pass off to hardware */
-	if (!skb->xmit_more || netif_xmit_stopped(tx_queue->core_txq)) {
+	if (!xmit_more || netif_xmit_stopped(tx_queue->core_txq)) {
 		struct efx_tx_queue *txq2 = efx_tx_queue_partner(tx_queue);
 
 		/* There could be packets left on the partner queue if those
@@ -577,14 +582,26 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
 		tx_queue->tx_packets++;
 	}
 
-	efx_tx_maybe_stop_queue(tx_queue);
-
 	return NETDEV_TX_OK;
 
 
 err:
-	efx_enqueue_unwind(tx_queue);
+	efx_enqueue_unwind(tx_queue, old_insert_count);
 	dev_kfree_skb_any(skb);
+
+	/* If we're not expecting another transmit and we had something to push
+	 * on this queue or a partner queue then we need to push here to get the
+	 * previous packets out.
+	 */
+	if (!xmit_more) {
+		struct efx_tx_queue *txq2 = efx_tx_queue_partner(tx_queue);
+
+		if (txq2->xmit_more_available)
+			efx_nic_push_buffers(txq2);
+
+		efx_nic_push_buffers(tx_queue);
+	}
+
 	return NETDEV_TX_OK;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From b526e56b31832385c97b224fcb9f45536a568527 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Thu, 24 May 2018 19:27:07 +0800
Subject: net: fec: remove stale comment

This comment is outdated as fec_ptp_ioctl has been replaced by fec_ptp_set/fec_ptp_get
since commit 1d5244d0e43b ("fec: Implement the SIOCGHWTSTAMP ioctl")

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Acked-by: Fugang Duan <fugang.duan@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/freescale/fec_ptp.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fec_ptp.c b/drivers/net/ethernet/freescale/fec_ptp.c
index f81439796ac7..d438ef8a371d 100644
--- a/drivers/net/ethernet/freescale/fec_ptp.c
+++ b/drivers/net/ethernet/freescale/fec_ptp.c
@@ -466,12 +466,6 @@ static int fec_ptp_enable(struct ptp_clock_info *ptp,
 	return -EOPNOTSUPP;
 }
 
-/**
- * fec_ptp_hwtstamp_ioctl - control hardware time stamping
- * @ndev: pointer to net_device
- * @ifreq: ioctl data
- * @cmd: particular ioctl requested
- */
 int fec_ptp_set(struct net_device *ndev, struct ifreq *ifr)
 {
 	struct fec_enet_private *fep = netdev_priv(ndev);
-- 
cgit v1.2.3-59-g8ed1b


From 57ccaedb74158be6d7d6edb255a6c153ec19e618 Mon Sep 17 00:00:00 2001
From: Ganesh Goudar <ganeshgr@chelsio.com>
Date: Thu, 24 May 2018 17:49:30 +0530
Subject: cxgb4/cxgb4vf: link management changes for new SFP

newer SFPs like SFP28 and QSFP28 Transceiver Modules present
several new possibilities which we haven't faced before. Fix the
assumptions in the code reflecting the more limited capabilities
of previous Transceiver Module systems

Original work by Casey Leedom <leedom@chelsio.com>

Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c | 22 +++++-----
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c         | 34 ++++++++++++++--
 drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c     | 47 +++++++++++++++++++++-
 3 files changed, 85 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c
index 59d04d73c672..f7eef93ffc87 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c
@@ -800,24 +800,20 @@ static int set_link_ksettings(struct net_device *dev,
 	if (base->duplex != DUPLEX_FULL)
 		return -EINVAL;
 
-	if (!(lc->pcaps & FW_PORT_CAP32_ANEG)) {
-		/* PHY offers a single speed.  See if that's what's
-		 * being requested.
-		 */
-		if (base->autoneg == AUTONEG_DISABLE &&
-		    (lc->pcaps & speed_to_fw_caps(base->speed)))
-			return 0;
-		return -EINVAL;
-	}
-
 	old_lc = *lc;
-	if (base->autoneg == AUTONEG_DISABLE) {
+	if (!(lc->pcaps & FW_PORT_CAP32_ANEG) ||
+	    base->autoneg == AUTONEG_DISABLE) {
 		fw_caps = speed_to_fw_caps(base->speed);
 
-		if (!(lc->pcaps & fw_caps))
+		/* Must only specify a single speed which must be supported
+		 * as part of the Physical Port Capabilities.
+		 */
+		if ((fw_caps & (fw_caps - 1)) != 0 ||
+		    !(lc->pcaps & fw_caps))
 			return -EINVAL;
+
 		lc->speed_caps = fw_caps;
-		lc->acaps = 0;
+		lc->acaps = fw_caps;
 	} else {
 		fw_caps =
 			 lmm_to_fw_caps(link_ksettings->link_modes.advertising);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index 704f6960a6ea..962384cdd64b 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -4066,6 +4066,7 @@ int t4_link_l1cfg_core(struct adapter *adapter, unsigned int mbox,
 	fw_port_cap32_t fw_fc, cc_fec, fw_fec, rcap;
 	struct fw_port_cmd cmd;
 	unsigned int fw_mdi;
+	int ret;
 
 	fw_mdi = (FW_PORT_CAP32_MDI_V(FW_PORT_CAP32_MDI_AUTO) & lc->pcaps);
 	/* Convert driver coding of Pause Frame Flow Control settings into the
@@ -4100,6 +4101,13 @@ int t4_link_l1cfg_core(struct adapter *adapter, unsigned int mbox,
 		rcap = lc->acaps | fw_fc | fw_fec | fw_mdi;
 	}
 
+	if (rcap & ~lc->pcaps) {
+		dev_err(adapter->pdev_dev,
+			"Requested Port Capabilities %#x exceed Physical Port Capabilities %#x\n",
+			rcap, lc->pcaps);
+		return -EINVAL;
+	}
+
 	/* And send that on to the Firmware ...
 	 */
 	memset(&cmd, 0, sizeof(cmd));
@@ -4110,13 +4118,21 @@ int t4_link_l1cfg_core(struct adapter *adapter, unsigned int mbox,
 		cpu_to_be32(FW_PORT_CMD_ACTION_V(fw_caps == FW_CAPS16
 						 ? FW_PORT_ACTION_L1_CFG
 						 : FW_PORT_ACTION_L1_CFG32) |
-			    FW_LEN16(cmd));
+						 FW_LEN16(cmd));
 	if (fw_caps == FW_CAPS16)
 		cmd.u.l1cfg.rcap = cpu_to_be32(fwcaps32_to_caps16(rcap));
 	else
 		cmd.u.l1cfg32.rcap32 = cpu_to_be32(rcap);
-	return t4_wr_mbox_meat_timeout(adapter, mbox, &cmd, sizeof(cmd), NULL,
-				       sleep_ok, timeout);
+
+	ret = t4_wr_mbox_meat_timeout(adapter, mbox, &cmd, sizeof(cmd), NULL,
+				      sleep_ok, timeout);
+	if (ret) {
+		dev_err(adapter->pdev_dev,
+			"Requested Port Capabilities %#x rejected, error %d\n",
+			rcap, -ret);
+		return ret;
+	}
+	return ret;
 }
 
 /**
@@ -8395,7 +8411,9 @@ void t4_handle_get_port_info(struct port_info *pi, const __be64 *rpl)
 		lc->lpacaps = lpacaps;
 		lc->acaps = acaps & ADVERT_MASK;
 
-		if (lc->acaps & FW_PORT_CAP32_ANEG) {
+		if (!(lc->acaps & FW_PORT_CAP32_ANEG)) {
+			lc->autoneg = AUTONEG_DISABLE;
+		} else if (lc->acaps & FW_PORT_CAP32_ANEG) {
 			lc->autoneg = AUTONEG_ENABLE;
 		} else {
 			/* When Autoneg is disabled, user needs to set
@@ -8600,6 +8618,13 @@ static void init_link_config(struct link_config *lc, fw_port_cap32_t pcaps,
 	lc->requested_fec = FEC_AUTO;
 	lc->fec = fwcap_to_cc_fec(lc->def_acaps);
 
+	/* If the Port is capable of Auto-Negtotiation, initialize it as
+	 * "enabled" and copy over all of the Physical Port Capabilities
+	 * to the Advertised Port Capabilities.  Otherwise mark it as
+	 * Auto-Negotiate disabled and select the highest supported speed
+	 * for the link.  Note parallel structure in t4_link_l1cfg_core()
+	 * and t4_handle_get_port_info().
+	 */
 	if (lc->pcaps & FW_PORT_CAP32_ANEG) {
 		lc->acaps = lc->pcaps & ADVERT_MASK;
 		lc->autoneg = AUTONEG_ENABLE;
@@ -8607,6 +8632,7 @@ static void init_link_config(struct link_config *lc, fw_port_cap32_t pcaps,
 	} else {
 		lc->acaps = 0;
 		lc->autoneg = AUTONEG_DISABLE;
+		lc->speed_caps = fwcap_to_fwspeed(acaps);
 	}
 }
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c
index 3017f7873ff9..8786431a8825 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c
@@ -405,6 +405,36 @@ static unsigned int fwcap_to_speed(fw_port_cap32_t caps)
 	return 0;
 }
 
+/**
+ *      fwcap_to_fwspeed - return highest speed in Port Capabilities
+ *      @acaps: advertised Port Capabilities
+ *
+ *      Get the highest speed for the port from the advertised Port
+ *      Capabilities.  It will be either the highest speed from the list of
+ *      speeds or whatever user has set using ethtool.
+ */
+static fw_port_cap32_t fwcap_to_fwspeed(fw_port_cap32_t acaps)
+{
+	#define TEST_SPEED_RETURN(__caps_speed) \
+		do { \
+			if (acaps & FW_PORT_CAP32_SPEED_##__caps_speed) \
+				return FW_PORT_CAP32_SPEED_##__caps_speed; \
+		} while (0)
+
+	TEST_SPEED_RETURN(400G);
+	TEST_SPEED_RETURN(200G);
+	TEST_SPEED_RETURN(100G);
+	TEST_SPEED_RETURN(50G);
+	TEST_SPEED_RETURN(40G);
+	TEST_SPEED_RETURN(25G);
+	TEST_SPEED_RETURN(10G);
+	TEST_SPEED_RETURN(1G);
+	TEST_SPEED_RETURN(100M);
+
+	#undef TEST_SPEED_RETURN
+	return 0;
+}
+
 /*
  *	init_link_config - initialize a link's SW state
  *	@lc: structure holding the link state
@@ -431,6 +461,13 @@ static void init_link_config(struct link_config *lc,
 	lc->requested_fec = FEC_AUTO;
 	lc->fec = lc->auto_fec;
 
+	/* If the Port is capable of Auto-Negtotiation, initialize it as
+	 * "enabled" and copy over all of the Physical Port Capabilities
+	 * to the Advertised Port Capabilities.  Otherwise mark it as
+	 * Auto-Negotiate disabled and select the highest supported speed
+	 * for the link.  Note parallel structure in t4_link_l1cfg_core()
+	 * and t4_handle_get_port_info().
+	 */
 	if (lc->pcaps & FW_PORT_CAP32_ANEG) {
 		lc->acaps = acaps & ADVERT_MASK;
 		lc->autoneg = AUTONEG_ENABLE;
@@ -438,6 +475,7 @@ static void init_link_config(struct link_config *lc,
 	} else {
 		lc->acaps = 0;
 		lc->autoneg = AUTONEG_DISABLE;
+		lc->speed_caps = fwcap_to_fwspeed(acaps);
 	}
 }
 
@@ -1955,7 +1993,14 @@ static void t4vf_handle_get_port_info(struct port_info *pi,
 		lc->lpacaps = lpacaps;
 		lc->acaps = acaps & ADVERT_MASK;
 
-		if (lc->acaps & FW_PORT_CAP32_ANEG) {
+		/* If we're not physically capable of Auto-Negotiation, note
+		 * this as Auto-Negotiation disabled.  Otherwise, we track
+		 * what Auto-Negotiation settings we have.  Note parallel
+		 * structure in init_link_config().
+		 */
+		if (!(lc->pcaps & FW_PORT_CAP32_ANEG)) {
+			lc->autoneg = AUTONEG_DISABLE;
+		} else if (lc->acaps & FW_PORT_CAP32_ANEG) {
 			lc->autoneg = AUTONEG_ENABLE;
 		} else {
 			/* When Autoneg is disabled, user needs to set
-- 
cgit v1.2.3-59-g8ed1b


From e8d452923ae6cdcf2fd7bf4fac4a3751eda56931 Mon Sep 17 00:00:00 2001
From: Ganesh Goudar <ganeshgr@chelsio.com>
Date: Thu, 24 May 2018 18:32:15 +0530
Subject: cxgb4: clean up init_one

clean up init_one and use chip_ver consistently throughout
init_one() for chip version.

Signed-off-by: Casey Leedom <leedom@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c   | 47 +++++++++++++----------
 drivers/net/ethernet/chelsio/cxgb4/t4_chip_type.h |  2 +
 2 files changed, 28 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 8405187f8e3f..903c0af37456 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -5240,14 +5240,11 @@ static void free_some_resources(struct adapter *adapter)
 		   NETIF_F_IPV6_CSUM | NETIF_F_HIGHDMA)
 #define SEGMENT_SIZE 128
 
-static int get_chip_type(struct pci_dev *pdev, u32 pl_rev)
+static int t4_get_chip_type(struct adapter *adap, int ver)
 {
-	u16 device_id;
-
-	/* Retrieve adapter's device ID */
-	pci_read_config_word(pdev, PCI_DEVICE_ID, &device_id);
+	u32 pl_rev = REV_G(t4_read_reg(adap, PL_REV_A));
 
-	switch (device_id >> 12) {
+	switch (ver) {
 	case CHELSIO_T4:
 		return CHELSIO_CHIP_CODE(CHELSIO_T4, pl_rev);
 	case CHELSIO_T5:
@@ -5255,8 +5252,7 @@ static int get_chip_type(struct pci_dev *pdev, u32 pl_rev)
 	case CHELSIO_T6:
 		return CHELSIO_CHIP_CODE(CHELSIO_T6, pl_rev);
 	default:
-		dev_err(&pdev->dev, "Device %d is not supported\n",
-			device_id);
+		break;
 	}
 	return -EINVAL;
 }
@@ -5426,15 +5422,18 @@ static int cxgb4_iov_configure(struct pci_dev *pdev, int num_vfs)
 
 static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
-	int func, i, err, s_qpp, qpp, num_seg;
+	struct net_device *netdev;
+	struct adapter *adapter;
+	static int adap_idx = 1;
+	int s_qpp, qpp, num_seg;
 	struct port_info *pi;
 	bool highdma = false;
-	struct adapter *adapter = NULL;
-	struct net_device *netdev;
-	void __iomem *regs;
-	u32 whoami, pl_rev;
 	enum chip_type chip;
-	static int adap_idx = 1;
+	void __iomem *regs;
+	int func, chip_ver;
+	u16 device_id;
+	int i, err;
+	u32 whoami;
 
 	printk_once(KERN_INFO "%s - version %s\n", DRV_DESC, DRV_VERSION);
 
@@ -5470,11 +5469,17 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 		goto out_free_adapter;
 
 	/* We control everything through one PF */
-	whoami = readl(regs + PL_WHOAMI_A);
-	pl_rev = REV_G(readl(regs + PL_REV_A));
-	chip = get_chip_type(pdev, pl_rev);
-	func = CHELSIO_CHIP_VERSION(chip) <= CHELSIO_T5 ?
-		SOURCEPF_G(whoami) : T6_SOURCEPF_G(whoami);
+	whoami = t4_read_reg(adapter, PL_WHOAMI_A);
+	pci_read_config_word(pdev, PCI_DEVICE_ID, &device_id);
+	chip = t4_get_chip_type(adapter, CHELSIO_PCI_ID_VER(device_id));
+	if (chip < 0) {
+		dev_err(&pdev->dev, "Device %d is not supported\n", device_id);
+		err = chip;
+		goto out_free_adapter;
+	}
+	chip_ver = CHELSIO_CHIP_VERSION(chip);
+	func = chip_ver <= CHELSIO_T5 ?
+	       SOURCEPF_G(whoami) : T6_SOURCEPF_G(whoami);
 
 	adapter->pdev = pdev;
 	adapter->pdev_dev = &pdev->dev;
@@ -5640,7 +5645,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 			NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX |
 			NETIF_F_HW_TC;
 
-		if (CHELSIO_CHIP_VERSION(chip) > CHELSIO_T5) {
+		if (chip_ver > CHELSIO_T5) {
 			netdev->hw_enc_features |= NETIF_F_IP_CSUM |
 						   NETIF_F_IPV6_CSUM |
 						   NETIF_F_RXCSUM |
@@ -5720,7 +5725,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 		dev_warn(&pdev->dev, "could not allocate MPS Encap entries, continuing\n");
 
 #if IS_ENABLED(CONFIG_IPV6)
-	if ((CHELSIO_CHIP_VERSION(adapter->params.chip) <= CHELSIO_T5) &&
+	if (chip_ver <= CHELSIO_T5 &&
 	    (!(t4_read_reg(adapter, LE_DB_CONFIG_A) & ASLIPCOMPEN_F))) {
 		/* CLIP functionality is not present in hardware,
 		 * hence disable all offload features
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_chip_type.h b/drivers/net/ethernet/chelsio/cxgb4/t4_chip_type.h
index 54b718111e3f..721c77577ec5 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_chip_type.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_chip_type.h
@@ -34,6 +34,8 @@
 #ifndef __T4_CHIP_TYPE_H__
 #define __T4_CHIP_TYPE_H__
 
+#define CHELSIO_PCI_ID_VER(__DeviceID)  ((__DeviceID) >> 12)
+
 #define CHELSIO_T4		0x4
 #define CHELSIO_T5		0x5
 #define CHELSIO_T6		0x6
-- 
cgit v1.2.3-59-g8ed1b


From e2f4f4e927f1356d561f612f1c55d64654f60353 Mon Sep 17 00:00:00 2001
From: Arjun Vynipadath <arjun@chelsio.com>
Date: Thu, 24 May 2018 19:33:37 +0530
Subject: cxgb4/cxgb4vf: Notify link changes to OS-dependent code

We have a confusion of two different abstractions in the Common
Code:  Physical Link (Port) and Logical Network Interface (Virtual
Interface), and we haven't been properly managing the state of the
intersection of those two abstractions.
On the one hand we have the Physical state of the Link -- up or down --
and on the other we have the logical state of the VI, enabled or not.
{ethN} refers to both the Physical and Logical State. In this case,
ifconfig only affects/interrogates the Logical State of a VI,
and ethtool only deals with the Physical State. And these are different.

So, just because we disable the VI, we don't really want to change the
Physical Link Up/Down state.  Thus, the previous hack to set
"lc->link_ok = 0" when we disable a VI is completely incorrect.

Where we get into trouble is where the Physical Link State and the
Logical VI State cross swords.  And that happens in
t4_handle_get_port_info() where we need to manage/safe the Physical
Link State, but we also need to know when the Logical VI State has
changed and pass that back up to the OS-dependent Driver routine
t4_os_link_changed() which is concerned about the Logical Interface.

So we enable a VI and that causes Firmware to send us a new Port
Information message, but if none of the Physical Link State
particulars have changed, we don't call t4_os_link_changed().

This fix uses the existing OS Contract APIs for the Common Code to
inform the OS-dependent portion of the Host Driver when the "Link" (really
Logical Network Interface) is "up" or "down". A new API
t4_enable_pi_params() is added which calls t4_enable_vi_params() and,
if that is successful, then calls back to the OS Contract API
t4_os_link_changed() notifying the OS-dependent layer of the
potential Link State change.

Original Work by : Casey Leedom <leedom@chelsio.com>

Signed-off-by: Santosh Rastapur <santosh@chelsio.com>
Signed-off-by: Arjun Vynipadath <arjun@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h         |  3 +++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c    |  5 ++--
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c         | 28 ++++++++++++++++++++++
 .../net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c    |  5 ++--
 drivers/net/ethernet/chelsio/cxgb4vf/t4vf_common.h |  5 +++-
 drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c     | 24 +++++++++++++++++++
 6 files changed, 64 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 0f305d97f4ee..0dbe2d9e22d6 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -1738,6 +1738,9 @@ int t4_set_addr_hash(struct adapter *adap, unsigned int mbox, unsigned int viid,
 		     bool ucast, u64 vec, bool sleep_ok);
 int t4_enable_vi_params(struct adapter *adap, unsigned int mbox,
 			unsigned int viid, bool rx_en, bool tx_en, bool dcb_en);
+int t4_enable_pi_params(struct adapter *adap, unsigned int mbox,
+			struct port_info *pi,
+			bool rx_en, bool tx_en, bool dcb_en);
 int t4_enable_vi(struct adapter *adap, unsigned int mbox, unsigned int viid,
 		 bool rx_en, bool tx_en);
 int t4_identify_port(struct adapter *adap, unsigned int mbox, unsigned int viid,
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 903c0af37456..0efae2030e71 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -465,7 +465,7 @@ static int link_start(struct net_device *dev)
 				    &pi->link_cfg);
 	if (ret == 0) {
 		local_bh_disable();
-		ret = t4_enable_vi_params(pi->adapter, mb, pi->viid, true,
+		ret = t4_enable_pi_params(pi->adapter, mb, pi, true,
 					  true, CXGB4_DCB_ENABLED);
 		local_bh_enable();
 	}
@@ -2344,7 +2344,8 @@ static int cxgb_close(struct net_device *dev)
 
 	netif_tx_stop_all_queues(dev);
 	netif_carrier_off(dev);
-	ret = t4_enable_vi(adapter, adapter->pf, pi->viid, false, false);
+	ret = t4_enable_pi_params(adapter, adapter->pf, pi,
+				  false, false, false);
 #ifdef CONFIG_CHELSIO_T4_DCB
 	cxgb4_dcb_reset(dev);
 	dcb_tx_queue_prio_enable(dev, false);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index 962384cdd64b..39da7e3c804b 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -8013,6 +8013,34 @@ int t4_enable_vi(struct adapter *adap, unsigned int mbox, unsigned int viid,
 	return t4_enable_vi_params(adap, mbox, viid, rx_en, tx_en, 0);
 }
 
+/**
+ *	t4_enable_pi_params - enable/disable a Port's Virtual Interface
+ *      @adap: the adapter
+ *      @mbox: mailbox to use for the FW command
+ *      @pi: the Port Information structure
+ *      @rx_en: 1=enable Rx, 0=disable Rx
+ *      @tx_en: 1=enable Tx, 0=disable Tx
+ *      @dcb_en: 1=enable delivery of Data Center Bridging messages.
+ *
+ *      Enables/disables a Port's Virtual Interface.  Note that setting DCB
+ *	Enable only makes sense when enabling a Virtual Interface ...
+ *	If the Virtual Interface enable/disable operation is successful,
+ *	we notify the OS-specific code of a potential Link Status change
+ *	via the OS Contract API t4_os_link_changed().
+ */
+int t4_enable_pi_params(struct adapter *adap, unsigned int mbox,
+			struct port_info *pi,
+			bool rx_en, bool tx_en, bool dcb_en)
+{
+	int ret = t4_enable_vi_params(adap, mbox, pi->viid,
+				      rx_en, tx_en, dcb_en);
+	if (ret)
+		return ret;
+	t4_os_link_changed(adap, pi->port_id,
+			   rx_en && tx_en && pi->link_cfg.link_ok);
+	return 0;
+}
+
 /**
  *	t4_identify_port - identify a VI's port by blinking its LED
  *	@adap: the adapter
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
index 71f13bd2b5e4..ff84791a0ff8 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
@@ -274,7 +274,7 @@ static int link_start(struct net_device *dev)
 	 * is enabled on a port.
 	 */
 	if (ret == 0)
-		ret = t4vf_enable_vi(pi->adapter, pi->viid, true, true);
+		ret = t4vf_enable_pi(pi->adapter, pi, true, true);
 
 	/* The Virtual Interfaces are connected to an internal switch on the
 	 * chip which allows VIs attached to the same port to talk to each
@@ -822,8 +822,7 @@ static int cxgb4vf_stop(struct net_device *dev)
 
 	netif_tx_stop_all_queues(dev);
 	netif_carrier_off(dev);
-	t4vf_enable_vi(adapter, pi->viid, false, false);
-	pi->link_cfg.link_ok = 0;
+	t4vf_enable_pi(adapter, pi, false, false);
 
 	clear_bit(pi->port_id, &adapter->open_device_map);
 	if (adapter->open_device_map == 0)
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_common.h b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_common.h
index 712e8f0c71b4..ccca67cf4487 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_common.h
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_common.h
@@ -391,7 +391,10 @@ int t4vf_config_rss_range(struct adapter *, unsigned int, int, int,
 
 int t4vf_alloc_vi(struct adapter *, int);
 int t4vf_free_vi(struct adapter *, int);
-int t4vf_enable_vi(struct adapter *, unsigned int, bool, bool);
+int t4vf_enable_vi(struct adapter *adapter, unsigned int viid, bool rx_en,
+		   bool tx_en);
+int t4vf_enable_pi(struct adapter *adapter, struct port_info *pi, bool rx_en,
+		   bool tx_en);
 int t4vf_identify_port(struct adapter *, unsigned int, unsigned int);
 
 int t4vf_set_rxmode(struct adapter *, unsigned int, int, int, int, int, int,
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c
index 8786431a8825..5b8c08cf523f 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c
@@ -1399,6 +1399,30 @@ int t4vf_enable_vi(struct adapter *adapter, unsigned int viid,
 	return t4vf_wr_mbox(adapter, &cmd, sizeof(cmd), NULL);
 }
 
+/**
+ *	t4vf_enable_pi - enable/disable a Port's virtual interface
+ *	@adapter: the adapter
+ *	@pi: the Port Information structure
+ *	@rx_en: 1=enable Rx, 0=disable Rx
+ *	@tx_en: 1=enable Tx, 0=disable Tx
+ *
+ *	Enables/disables a Port's virtual interface.  If the Virtual
+ *	Interface enable/disable operation is successful, we notify the
+ *	OS-specific code of a potential Link Status change via the OS Contract
+ *	API t4vf_os_link_changed().
+ */
+int t4vf_enable_pi(struct adapter *adapter, struct port_info *pi,
+		   bool rx_en, bool tx_en)
+{
+	int ret = t4vf_enable_vi(adapter, pi->viid, rx_en, tx_en);
+
+	if (ret)
+		return ret;
+	t4vf_os_link_changed(adapter, pi->pidx,
+			     rx_en && tx_en && pi->link_cfg.link_ok);
+	return 0;
+}
+
 /**
  *	t4vf_identify_port - identify a VI's port by blinking its LED
  *	@adapter: the adapter
-- 
cgit v1.2.3-59-g8ed1b


From 87885310c199be78a144dff4fec8a94f081920b8 Mon Sep 17 00:00:00 2001
From: Manish Chopra <manish.chopra@cavium.com>
Date: Thu, 24 May 2018 09:54:49 -0700
Subject: qede: Refactor ethtool rx classification flow.

This patch simplifies the ethtool rx flow configuration
[via ethtool -U/-N] flow code base by dividing it logically
into various APIs based on given protocols. It also separates
various validations and calculations done along the flow
in their own APIs.

Signed-off-by: Manish Chopra <manish.chopra@cavium.com>
Signed-off-by: Shahed Shaikh <shahed.shaikh@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qede/qede_filter.c | 512 ++++++++++++++++---------
 1 file changed, 330 insertions(+), 182 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c
index 43569b1839be..bd5b4e4c1d47 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_filter.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c
@@ -38,6 +38,7 @@
 #include <linux/qed/qed_if.h>
 #include "qede.h"
 
+#define QEDE_FILTER_PRINT_MAX_LEN	(64)
 struct qede_arfs_tuple {
 	union {
 		__be32 src_ipv4;
@@ -51,6 +52,18 @@ struct qede_arfs_tuple {
 	__be16  dst_port;
 	__be16  eth_proto;
 	u8      ip_proto;
+
+	/* Describe filtering mode needed for this kind of filter */
+	enum qed_filter_config_mode mode;
+
+	/* Used to compare new/old filters. Return true if IPs match */
+	bool (*ip_comp)(struct qede_arfs_tuple *a, struct qede_arfs_tuple *b);
+
+	/* Given an address into ethhdr build a header from tuple info */
+	void (*build_hdr)(struct qede_arfs_tuple *t, void *header);
+
+	/* Stringify the tuple for a print into the provided buffer */
+	void (*stringify)(struct qede_arfs_tuple *t, void *buffer);
 };
 
 struct qede_arfs_fltr_node {
@@ -90,7 +103,9 @@ struct qede_arfs {
 	spinlock_t		arfs_list_lock;
 	unsigned long		*arfs_fltr_bmap;
 	int			filter_count;
-	bool			enable;
+
+	/* Currently configured filtering mode */
+	enum qed_filter_config_mode mode;
 };
 
 static void qede_configure_arfs_fltr(struct qede_dev *edev,
@@ -110,11 +125,15 @@ static void qede_configure_arfs_fltr(struct qede_dev *edev,
 	params.qid = rxq_id;
 	params.b_is_add = add_fltr;
 
-	DP_VERBOSE(edev, NETIF_MSG_RX_STATUS,
-		   "%s arfs filter flow_id=%d, sw_id=%d, src_port=%d, dst_port=%d, rxq=%d\n",
-		   add_fltr ? "Adding" : "Deleting",
-		   n->flow_id, n->sw_id, ntohs(n->tuple.src_port),
-		   ntohs(n->tuple.dst_port), rxq_id);
+	if (n->tuple.stringify) {
+		char tuple_buffer[QEDE_FILTER_PRINT_MAX_LEN];
+
+		n->tuple.stringify(&n->tuple, tuple_buffer);
+		DP_VERBOSE(edev, NETIF_MSG_RX_STATUS,
+			   "%s sw_id[0x%x]: %s [queue %d]\n",
+			   add_fltr ? "Adding" : "Deleting",
+			   n->sw_id, tuple_buffer, rxq_id);
+	}
 
 	n->used = true;
 	n->filter_op = add_fltr;
@@ -145,14 +164,13 @@ qede_enqueue_fltr_and_config_searcher(struct qede_dev *edev,
 	INIT_HLIST_NODE(&fltr->node);
 	hlist_add_head(&fltr->node,
 		       QEDE_ARFS_BUCKET_HEAD(edev, bucket_idx));
-	edev->arfs->filter_count++;
-
-	if (edev->arfs->filter_count == 1 && !edev->arfs->enable) {
-		enum qed_filter_config_mode mode;
 
-		mode = QED_FILTER_CONFIG_MODE_5_TUPLE;
-		edev->ops->configure_arfs_searcher(edev->cdev, mode);
-		edev->arfs->enable = true;
+	edev->arfs->filter_count++;
+	if (edev->arfs->filter_count == 1 &&
+	    edev->arfs->mode == QED_FILTER_CONFIG_MODE_DISABLE) {
+		edev->ops->configure_arfs_searcher(edev->cdev,
+						   fltr->tuple.mode);
+		edev->arfs->mode = fltr->tuple.mode;
 	}
 
 	return 0;
@@ -167,14 +185,15 @@ qede_dequeue_fltr_and_config_searcher(struct qede_dev *edev,
 			 fltr->buf_len, DMA_TO_DEVICE);
 
 	qede_free_arfs_filter(edev, fltr);
-	edev->arfs->filter_count--;
 
-	if (!edev->arfs->filter_count && edev->arfs->enable) {
+	edev->arfs->filter_count--;
+	if (!edev->arfs->filter_count &&
+	    edev->arfs->mode != QED_FILTER_CONFIG_MODE_DISABLE) {
 		enum qed_filter_config_mode mode;
 
 		mode = QED_FILTER_CONFIG_MODE_DISABLE;
-		edev->arfs->enable = false;
 		edev->ops->configure_arfs_searcher(edev->cdev, mode);
+		edev->arfs->mode = QED_FILTER_CONFIG_MODE_DISABLE;
 	}
 }
 
@@ -264,25 +283,17 @@ void qede_process_arfs_filters(struct qede_dev *edev, bool free_fltr)
 		}
 	}
 
+#ifdef CONFIG_RFS_ACCEL
 	spin_lock_bh(&edev->arfs->arfs_list_lock);
 
-	if (!edev->arfs->filter_count) {
-		if (edev->arfs->enable) {
-			enum qed_filter_config_mode mode;
-
-			mode = QED_FILTER_CONFIG_MODE_DISABLE;
-			edev->arfs->enable = false;
-			edev->ops->configure_arfs_searcher(edev->cdev, mode);
-		}
-#ifdef CONFIG_RFS_ACCEL
-	} else {
+	if (edev->arfs->filter_count) {
 		set_bit(QEDE_SP_ARFS_CONFIG, &edev->sp_flags);
 		schedule_delayed_work(&edev->sp_task,
 				      QEDE_SP_TASK_POLL_DELAY);
-#endif
 	}
 
 	spin_unlock_bh(&edev->arfs->arfs_list_lock);
+#endif
 }
 
 /* This function waits until all aRFS filters get deleted and freed.
@@ -512,6 +523,7 @@ int qede_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
 	eth->h_proto = skb->protocol;
 	n->tuple.eth_proto = skb->protocol;
 	n->tuple.ip_proto = ip_proto;
+	n->tuple.mode = QED_FILTER_CONFIG_MODE_5_TUPLE;
 	memcpy(n->data + ETH_HLEN, skb->data, skb_headlen(skb));
 
 	rc = qede_enqueue_fltr_and_config_searcher(edev, n, tbl_idx);
@@ -1339,38 +1351,6 @@ qede_get_arfs_fltr_by_loc(struct hlist_head *head, u32 location)
 	return NULL;
 }
 
-static bool
-qede_compare_user_flow_ips(struct qede_arfs_fltr_node *tpos,
-			   struct ethtool_rx_flow_spec *fsp,
-			   __be16 proto)
-{
-	if (proto == htons(ETH_P_IP)) {
-		struct ethtool_tcpip4_spec *ip;
-
-		ip = &fsp->h_u.tcp_ip4_spec;
-
-		if (tpos->tuple.src_ipv4 == ip->ip4src &&
-		    tpos->tuple.dst_ipv4 == ip->ip4dst)
-			return true;
-		else
-			return false;
-	} else {
-		struct ethtool_tcpip6_spec *ip6;
-		struct in6_addr *src;
-
-		ip6 = &fsp->h_u.tcp_ip6_spec;
-		src = &tpos->tuple.src_ipv6;
-
-		if (!memcmp(src, &ip6->ip6src, sizeof(struct in6_addr)) &&
-		    !memcmp(&tpos->tuple.dst_ipv6, &ip6->ip6dst,
-			    sizeof(struct in6_addr)))
-			return true;
-		else
-			return false;
-	}
-	return false;
-}
-
 int qede_get_cls_rule_all(struct qede_dev *edev, struct ethtool_rxnfc *info,
 			  u32 *rule_locs)
 {
@@ -1461,96 +1441,306 @@ unlock:
 }
 
 static int
-qede_validate_and_check_flow_exist(struct qede_dev *edev,
-				   struct ethtool_rx_flow_spec *fsp,
-				   int *min_hlen)
+qede_poll_arfs_filter_config(struct qede_dev *edev,
+			     struct qede_arfs_fltr_node *fltr)
 {
-	__be16 src_port = 0x0, dst_port = 0x0;
-	struct qede_arfs_fltr_node *fltr;
-	struct hlist_node *temp;
-	struct hlist_head *head;
-	__be16 eth_proto;
-	u8 ip_proto;
+	int count = QEDE_ARFS_POLL_COUNT;
 
-	if (fsp->location >= QEDE_RFS_MAX_FLTR ||
-	    fsp->ring_cookie >= QEDE_RSS_COUNT(edev))
-		return -EINVAL;
+	while (fltr->used && count) {
+		msleep(20);
+		count--;
+	}
+
+	if (count == 0 || fltr->fw_rc) {
+		DP_NOTICE(edev, "Timeout in polling filter config\n");
+		qede_dequeue_fltr_and_config_searcher(edev, fltr);
+		return -EIO;
+	}
+
+	return fltr->fw_rc;
+}
+
+static int qede_flow_get_min_header_size(struct qede_arfs_tuple *t)
+{
+	int size = ETH_HLEN;
+
+	if (t->eth_proto == htons(ETH_P_IP))
+		size += sizeof(struct iphdr);
+	else
+		size += sizeof(struct ipv6hdr);
+
+	if (t->ip_proto == IPPROTO_TCP)
+		size += sizeof(struct tcphdr);
+	else
+		size += sizeof(struct udphdr);
+
+	return size;
+}
+
+static bool qede_flow_spec_ipv4_cmp(struct qede_arfs_tuple *a,
+				    struct qede_arfs_tuple *b)
+{
+	if (a->eth_proto != htons(ETH_P_IP) ||
+	    b->eth_proto != htons(ETH_P_IP))
+		return false;
+
+	return (a->src_ipv4 == b->src_ipv4) &&
+	       (a->dst_ipv4 == b->dst_ipv4);
+}
+
+static void qede_flow_build_ipv4_hdr(struct qede_arfs_tuple *t,
+				     void *header)
+{
+	__be16 *ports = (__be16 *)(header + ETH_HLEN + sizeof(struct iphdr));
+	struct iphdr *ip = (struct iphdr *)(header + ETH_HLEN);
+	struct ethhdr *eth = (struct ethhdr *)header;
+
+	eth->h_proto = t->eth_proto;
+	ip->saddr = t->src_ipv4;
+	ip->daddr = t->dst_ipv4;
+	ip->version = 0x4;
+	ip->ihl = 0x5;
+	ip->protocol = t->ip_proto;
+	ip->tot_len = cpu_to_be16(qede_flow_get_min_header_size(t) - ETH_HLEN);
+
+	/* ports is weakly typed to suit both TCP and UDP ports */
+	ports[0] = t->src_port;
+	ports[1] = t->dst_port;
+}
+
+static void qede_flow_stringify_ipv4_hdr(struct qede_arfs_tuple *t,
+					 void *buffer)
+{
+	const char *prefix = t->ip_proto == IPPROTO_TCP ? "TCP" : "UDP";
+
+	snprintf(buffer, QEDE_FILTER_PRINT_MAX_LEN,
+		 "%s %pI4 (%04x) -> %pI4 (%04x)",
+		 prefix, &t->src_ipv4, t->src_port,
+		 &t->dst_ipv4, t->dst_port);
+}
+
+static bool qede_flow_spec_ipv6_cmp(struct qede_arfs_tuple *a,
+				    struct qede_arfs_tuple *b)
+{
+	if (a->eth_proto != htons(ETH_P_IPV6) ||
+	    b->eth_proto != htons(ETH_P_IPV6))
+		return false;
+
+	if (memcmp(&a->src_ipv6, &b->src_ipv6, sizeof(struct in6_addr)))
+		return false;
+
+	if (memcmp(&a->dst_ipv6, &b->dst_ipv6, sizeof(struct in6_addr)))
+		return false;
+
+	return true;
+}
 
-	if (fsp->flow_type == TCP_V4_FLOW) {
-		*min_hlen += sizeof(struct iphdr) +
-				sizeof(struct tcphdr);
-		eth_proto = htons(ETH_P_IP);
-		ip_proto = IPPROTO_TCP;
-	} else if (fsp->flow_type == UDP_V4_FLOW) {
-		*min_hlen += sizeof(struct iphdr) +
-				sizeof(struct udphdr);
-		eth_proto = htons(ETH_P_IP);
-		ip_proto = IPPROTO_UDP;
-	} else if (fsp->flow_type == TCP_V6_FLOW) {
-		*min_hlen += sizeof(struct ipv6hdr) +
-				sizeof(struct tcphdr);
-		eth_proto = htons(ETH_P_IPV6);
-		ip_proto = IPPROTO_TCP;
-	} else if (fsp->flow_type == UDP_V6_FLOW) {
-		*min_hlen += sizeof(struct ipv6hdr) +
-				sizeof(struct udphdr);
-		eth_proto = htons(ETH_P_IPV6);
-		ip_proto = IPPROTO_UDP;
+static void qede_flow_build_ipv6_hdr(struct qede_arfs_tuple *t,
+				     void *header)
+{
+	__be16 *ports = (__be16 *)(header + ETH_HLEN + sizeof(struct ipv6hdr));
+	struct ipv6hdr *ip6 = (struct ipv6hdr *)(header + ETH_HLEN);
+	struct ethhdr *eth = (struct ethhdr *)header;
+
+	eth->h_proto = t->eth_proto;
+	memcpy(&ip6->saddr, &t->src_ipv6, sizeof(struct in6_addr));
+	memcpy(&ip6->daddr, &t->dst_ipv6, sizeof(struct in6_addr));
+	ip6->version = 0x6;
+
+	if (t->ip_proto == IPPROTO_TCP) {
+		ip6->nexthdr = NEXTHDR_TCP;
+		ip6->payload_len = cpu_to_be16(sizeof(struct tcphdr));
 	} else {
-		DP_NOTICE(edev, "Unsupported flow type = 0x%x\n",
-			  fsp->flow_type);
-		return -EPROTONOSUPPORT;
+		ip6->nexthdr = NEXTHDR_UDP;
+		ip6->payload_len = cpu_to_be16(sizeof(struct udphdr));
 	}
 
-	if (eth_proto == htons(ETH_P_IP)) {
-		src_port = fsp->h_u.tcp_ip4_spec.psrc;
-		dst_port = fsp->h_u.tcp_ip4_spec.pdst;
+	/* ports is weakly typed to suit both TCP and UDP ports */
+	ports[0] = t->src_port;
+	ports[1] = t->dst_port;
+}
+
+static int qede_flow_spec_to_tuple_ipv4_common(struct qede_dev *edev,
+					       struct qede_arfs_tuple *t,
+					       struct ethtool_rx_flow_spec *fs)
+{
+	t->eth_proto = htons(ETH_P_IP);
+	t->src_ipv4 = fs->h_u.tcp_ip4_spec.ip4src;
+	t->dst_ipv4 = fs->h_u.tcp_ip4_spec.ip4dst;
+	t->src_port = fs->h_u.tcp_ip4_spec.psrc;
+	t->dst_port = fs->h_u.tcp_ip4_spec.pdst;
+
+	/* We must have a valid 4-tuple */
+	if (t->src_port && t->dst_port && t->src_ipv4 && t->dst_ipv4) {
+		t->mode = QED_FILTER_CONFIG_MODE_5_TUPLE;
 	} else {
-		src_port = fsp->h_u.tcp_ip6_spec.psrc;
-		dst_port = fsp->h_u.tcp_ip6_spec.pdst;
+		DP_INFO(edev, "Invalid N-tuple\n");
+		return -EOPNOTSUPP;
 	}
 
-	head = QEDE_ARFS_BUCKET_HEAD(edev, 0);
-	hlist_for_each_entry_safe(fltr, temp, head, node) {
-		if ((fltr->tuple.ip_proto == ip_proto &&
-		     fltr->tuple.eth_proto == eth_proto &&
-		     qede_compare_user_flow_ips(fltr, fsp, eth_proto) &&
-		     fltr->tuple.src_port == src_port &&
-		     fltr->tuple.dst_port == dst_port) ||
-		    fltr->sw_id == fsp->location)
-			return -EEXIST;
+	t->ip_comp = qede_flow_spec_ipv4_cmp;
+	t->build_hdr = qede_flow_build_ipv4_hdr;
+	t->stringify = qede_flow_stringify_ipv4_hdr;
+
+	return 0;
+}
+
+static int qede_flow_spec_to_tuple_tcpv4(struct qede_dev *edev,
+					 struct qede_arfs_tuple *t,
+					 struct ethtool_rx_flow_spec *fs)
+{
+	t->ip_proto = IPPROTO_TCP;
+
+	if (qede_flow_spec_to_tuple_ipv4_common(edev, t, fs))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int qede_flow_spec_to_tuple_udpv4(struct qede_dev *edev,
+					 struct qede_arfs_tuple *t,
+					 struct ethtool_rx_flow_spec *fs)
+{
+	t->ip_proto = IPPROTO_UDP;
+
+	if (qede_flow_spec_to_tuple_ipv4_common(edev, t, fs))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int qede_flow_spec_to_tuple_ipv6_common(struct qede_dev *edev,
+					       struct qede_arfs_tuple *t,
+					       struct ethtool_rx_flow_spec *fs)
+{
+	struct in6_addr zero_addr;
+	void *p;
+
+	p = &zero_addr;
+	memset(p, 0, sizeof(zero_addr));
+
+	t->eth_proto = htons(ETH_P_IPV6);
+	memcpy(&t->src_ipv6, &fs->h_u.tcp_ip6_spec.ip6src,
+	       sizeof(struct in6_addr));
+	memcpy(&t->dst_ipv6, &fs->h_u.tcp_ip6_spec.ip6dst,
+	       sizeof(struct in6_addr));
+	t->src_port = fs->h_u.tcp_ip6_spec.psrc;
+	t->dst_port = fs->h_u.tcp_ip6_spec.pdst;
+
+	/* We must make sure we have a valid 4-tuple */
+	if (t->src_port && t->dst_port &&
+	    memcmp(&t->src_ipv6, p, sizeof(struct in6_addr)) &&
+	    memcmp(&t->dst_ipv6, p, sizeof(struct in6_addr))) {
+		t->mode = QED_FILTER_CONFIG_MODE_5_TUPLE;
+	} else {
+		DP_INFO(edev, "Invalid N-tuple\n");
+		return -EOPNOTSUPP;
 	}
 
+	t->ip_comp = qede_flow_spec_ipv6_cmp;
+	t->build_hdr = qede_flow_build_ipv6_hdr;
+
 	return 0;
 }
 
-static int
-qede_poll_arfs_filter_config(struct qede_dev *edev,
-			     struct qede_arfs_fltr_node *fltr)
+static int qede_flow_spec_to_tuple_tcpv6(struct qede_dev *edev,
+					 struct qede_arfs_tuple *t,
+					 struct ethtool_rx_flow_spec *fs)
 {
-	int count = QEDE_ARFS_POLL_COUNT;
+	t->ip_proto = IPPROTO_TCP;
 
-	while (fltr->used && count) {
-		msleep(20);
-		count--;
+	if (qede_flow_spec_to_tuple_ipv6_common(edev, t, fs))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int qede_flow_spec_to_tuple_udpv6(struct qede_dev *edev,
+					 struct qede_arfs_tuple *t,
+					 struct ethtool_rx_flow_spec *fs)
+{
+	t->ip_proto = IPPROTO_UDP;
+
+	if (qede_flow_spec_to_tuple_ipv6_common(edev, t, fs))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int qede_flow_spec_to_tuple(struct qede_dev *edev,
+				   struct qede_arfs_tuple *t,
+				   struct ethtool_rx_flow_spec *fs)
+{
+	memset(t, 0, sizeof(*t));
+
+	switch ((fs->flow_type & ~FLOW_EXT)) {
+	case TCP_V4_FLOW:
+		return qede_flow_spec_to_tuple_tcpv4(edev, t, fs);
+	case UDP_V4_FLOW:
+		return qede_flow_spec_to_tuple_udpv4(edev, t, fs);
+	case TCP_V6_FLOW:
+		return qede_flow_spec_to_tuple_tcpv6(edev, t, fs);
+	case UDP_V6_FLOW:
+		return qede_flow_spec_to_tuple_udpv6(edev, t, fs);
+	default:
+		DP_VERBOSE(edev, NETIF_MSG_IFUP,
+			   "Can't support flow of type %08x\n", fs->flow_type);
+		return -EOPNOTSUPP;
 	}
 
-	if (count == 0 || fltr->fw_rc) {
-		qede_dequeue_fltr_and_config_searcher(edev, fltr);
-		return -EIO;
+	return 0;
+}
+
+static int qede_flow_spec_validate(struct qede_dev *edev,
+				   struct ethtool_rx_flow_spec *fs,
+				   struct qede_arfs_tuple *t)
+{
+	if (fs->location >= QEDE_RFS_MAX_FLTR) {
+		DP_INFO(edev, "Location out-of-bounds\n");
+		return -EINVAL;
 	}
 
-	return fltr->fw_rc;
+	/* Check location isn't already in use */
+	if (test_bit(fs->location, edev->arfs->arfs_fltr_bmap)) {
+		DP_INFO(edev, "Location already in use\n");
+		return -EINVAL;
+	}
+
+	if (fs->ring_cookie >= QEDE_RSS_COUNT(edev)) {
+		DP_INFO(edev, "Queue out-of-bounds\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Must be called while qede lock is held */
+static struct qede_arfs_fltr_node *
+qede_flow_find_fltr(struct qede_dev *edev, struct qede_arfs_tuple *t)
+{
+	struct qede_arfs_fltr_node *fltr;
+	struct hlist_node *temp;
+	struct hlist_head *head;
+
+	head = QEDE_ARFS_BUCKET_HEAD(edev, 0);
+
+	hlist_for_each_entry_safe(fltr, temp, head, node) {
+		if (fltr->tuple.ip_proto == t->ip_proto &&
+		    fltr->tuple.src_port == t->src_port &&
+		    fltr->tuple.dst_port == t->dst_port &&
+		    t->ip_comp(&fltr->tuple, t))
+			return fltr;
+	}
+
+	return NULL;
 }
 
 int qede_add_cls_rule(struct qede_dev *edev, struct ethtool_rxnfc *info)
 {
 	struct ethtool_rx_flow_spec *fsp = &info->fs;
 	struct qede_arfs_fltr_node *n;
-	int min_hlen = ETH_HLEN, rc;
-	struct ethhdr *eth;
-	struct iphdr *ip;
-	__be16 *ports;
+	struct qede_arfs_tuple t;
+	int min_hlen, rc;
 
 	__qede_lock(edev);
 
@@ -1559,16 +1749,28 @@ int qede_add_cls_rule(struct qede_dev *edev, struct ethtool_rxnfc *info)
 		goto unlock;
 	}
 
-	rc = qede_validate_and_check_flow_exist(edev, fsp, &min_hlen);
+	/* Translate the flow specification into something fittign our DB */
+	rc = qede_flow_spec_to_tuple(edev, &t, fsp);
+	if (rc)
+		goto unlock;
+
+	/* Make sure location is valid and filter isn't already set */
+	rc = qede_flow_spec_validate(edev, fsp, &t);
 	if (rc)
 		goto unlock;
 
+	if (qede_flow_find_fltr(edev, &t)) {
+		rc = -EINVAL;
+		goto unlock;
+	}
+
 	n = kzalloc(sizeof(*n), GFP_KERNEL);
 	if (!n) {
 		rc = -ENOMEM;
 		goto unlock;
 	}
 
+	min_hlen = qede_flow_get_min_header_size(&t);
 	n->data = kzalloc(min_hlen, GFP_KERNEL);
 	if (!n->data) {
 		kfree(n);
@@ -1581,66 +1783,11 @@ int qede_add_cls_rule(struct qede_dev *edev, struct ethtool_rxnfc *info)
 	n->buf_len = min_hlen;
 	n->rxq_id = fsp->ring_cookie;
 	n->next_rxq_id = n->rxq_id;
-	eth = (struct ethhdr *)n->data;
 
-	if (info->fs.flow_type == TCP_V4_FLOW ||
-	    info->fs.flow_type == UDP_V4_FLOW) {
-		ports = (__be16 *)(n->data + ETH_HLEN +
-					sizeof(struct iphdr));
-		eth->h_proto = htons(ETH_P_IP);
-		n->tuple.eth_proto = htons(ETH_P_IP);
-		n->tuple.src_ipv4 = info->fs.h_u.tcp_ip4_spec.ip4src;
-		n->tuple.dst_ipv4 = info->fs.h_u.tcp_ip4_spec.ip4dst;
-		n->tuple.src_port = info->fs.h_u.tcp_ip4_spec.psrc;
-		n->tuple.dst_port = info->fs.h_u.tcp_ip4_spec.pdst;
-		ports[0] = n->tuple.src_port;
-		ports[1] = n->tuple.dst_port;
-		ip = (struct iphdr *)(n->data + ETH_HLEN);
-		ip->saddr = info->fs.h_u.tcp_ip4_spec.ip4src;
-		ip->daddr = info->fs.h_u.tcp_ip4_spec.ip4dst;
-		ip->version = 0x4;
-		ip->ihl = 0x5;
-
-		if (info->fs.flow_type == TCP_V4_FLOW) {
-			n->tuple.ip_proto = IPPROTO_TCP;
-			ip->protocol = IPPROTO_TCP;
-		} else {
-			n->tuple.ip_proto = IPPROTO_UDP;
-			ip->protocol = IPPROTO_UDP;
-		}
-		ip->tot_len = cpu_to_be16(min_hlen - ETH_HLEN);
-	} else {
-		struct ipv6hdr *ip6;
-
-		ip6 = (struct ipv6hdr *)(n->data + ETH_HLEN);
-		ports = (__be16 *)(n->data + ETH_HLEN +
-					sizeof(struct ipv6hdr));
-		eth->h_proto = htons(ETH_P_IPV6);
-		n->tuple.eth_proto = htons(ETH_P_IPV6);
-		memcpy(&n->tuple.src_ipv6, &info->fs.h_u.tcp_ip6_spec.ip6src,
-		       sizeof(struct in6_addr));
-		memcpy(&n->tuple.dst_ipv6, &info->fs.h_u.tcp_ip6_spec.ip6dst,
-		       sizeof(struct in6_addr));
-		n->tuple.src_port = info->fs.h_u.tcp_ip6_spec.psrc;
-		n->tuple.dst_port = info->fs.h_u.tcp_ip6_spec.pdst;
-		ports[0] = n->tuple.src_port;
-		ports[1] = n->tuple.dst_port;
-		memcpy(&ip6->saddr, &n->tuple.src_ipv6,
-		       sizeof(struct in6_addr));
-		memcpy(&ip6->daddr, &n->tuple.dst_ipv6,
-		       sizeof(struct in6_addr));
-		ip6->version = 0x6;
+	memcpy(&n->tuple, &t, sizeof(n->tuple));
 
-		if (info->fs.flow_type == TCP_V6_FLOW) {
-			n->tuple.ip_proto = IPPROTO_TCP;
-			ip6->nexthdr = NEXTHDR_TCP;
-			ip6->payload_len = cpu_to_be16(sizeof(struct tcphdr));
-		} else {
-			n->tuple.ip_proto = IPPROTO_UDP;
-			ip6->nexthdr = NEXTHDR_UDP;
-			ip6->payload_len = cpu_to_be16(sizeof(struct udphdr));
-		}
-	}
+	/* Build a minimal header according to the flow */
+	n->tuple.build_hdr(&n->tuple, n->data);
 
 	rc = qede_enqueue_fltr_and_config_searcher(edev, n, 0);
 	if (rc)
@@ -1650,6 +1797,7 @@ int qede_add_cls_rule(struct qede_dev *edev, struct ethtool_rxnfc *info)
 	rc = qede_poll_arfs_filter_config(edev, n);
 unlock:
 	__qede_unlock(edev);
+
 	return rc;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 89ffd14ee95dca812874fcd25ad3538ff3592a49 Mon Sep 17 00:00:00 2001
From: Manish Chopra <manish.chopra@cavium.com>
Date: Thu, 24 May 2018 09:54:50 -0700
Subject: qede: Validate unsupported configurations

Validate and prevent some of the configurations for
unsupported [by firmware] inputs [for example - mac ext,
vlans, masks/prefix, tos/tclass] via ethtool -N/-U.

Signed-off-by: Manish Chopra <manish.chopra@cavium.com>
Signed-off-by: Shahed Shaikh <shahed.shaikh@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qede/qede_filter.c | 73 ++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c
index bd5b4e4c1d47..43ed420900c1 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_filter.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c
@@ -1560,10 +1560,63 @@ static void qede_flow_build_ipv6_hdr(struct qede_arfs_tuple *t,
 	ports[1] = t->dst_port;
 }
 
+/* Validate fields which are set and not accepted by the driver */
+static int qede_flow_spec_validate_unused(struct qede_dev *edev,
+					  struct ethtool_rx_flow_spec *fs)
+{
+	if (fs->flow_type & FLOW_MAC_EXT) {
+		DP_INFO(edev, "Don't support MAC extensions\n");
+		return -EOPNOTSUPP;
+	}
+
+	if ((fs->flow_type & FLOW_EXT) &&
+	    (fs->h_ext.vlan_etype || fs->h_ext.vlan_tci)) {
+		DP_INFO(edev, "Don't support vlan-based classification\n");
+		return -EOPNOTSUPP;
+	}
+
+	if ((fs->flow_type & FLOW_EXT) &&
+	    (fs->h_ext.data[0] || fs->h_ext.data[1])) {
+		DP_INFO(edev, "Don't support user defined data\n");
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
 static int qede_flow_spec_to_tuple_ipv4_common(struct qede_dev *edev,
 					       struct qede_arfs_tuple *t,
 					       struct ethtool_rx_flow_spec *fs)
 {
+	if ((fs->h_u.tcp_ip4_spec.ip4src &
+	     fs->m_u.tcp_ip4_spec.ip4src) != fs->h_u.tcp_ip4_spec.ip4src) {
+		DP_INFO(edev, "Don't support IP-masks\n");
+		return -EOPNOTSUPP;
+	}
+
+	if ((fs->h_u.tcp_ip4_spec.ip4dst &
+	     fs->m_u.tcp_ip4_spec.ip4dst) != fs->h_u.tcp_ip4_spec.ip4dst) {
+		DP_INFO(edev, "Don't support IP-masks\n");
+		return -EOPNOTSUPP;
+	}
+
+	if ((fs->h_u.tcp_ip4_spec.psrc &
+	     fs->m_u.tcp_ip4_spec.psrc) != fs->h_u.tcp_ip4_spec.psrc) {
+		DP_INFO(edev, "Don't support port-masks\n");
+		return -EOPNOTSUPP;
+	}
+
+	if ((fs->h_u.tcp_ip4_spec.pdst &
+	     fs->m_u.tcp_ip4_spec.pdst) != fs->h_u.tcp_ip4_spec.pdst) {
+		DP_INFO(edev, "Don't support port-masks\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (fs->h_u.tcp_ip4_spec.tos) {
+		DP_INFO(edev, "Don't support tos\n");
+		return -EOPNOTSUPP;
+	}
+
 	t->eth_proto = htons(ETH_P_IP);
 	t->src_ipv4 = fs->h_u.tcp_ip4_spec.ip4src;
 	t->dst_ipv4 = fs->h_u.tcp_ip4_spec.ip4dst;
@@ -1619,6 +1672,23 @@ static int qede_flow_spec_to_tuple_ipv6_common(struct qede_dev *edev,
 	p = &zero_addr;
 	memset(p, 0, sizeof(zero_addr));
 
+	if ((fs->h_u.tcp_ip6_spec.psrc &
+	     fs->m_u.tcp_ip6_spec.psrc) != fs->h_u.tcp_ip6_spec.psrc) {
+		DP_INFO(edev, "Don't support port-masks\n");
+		return -EOPNOTSUPP;
+	}
+
+	if ((fs->h_u.tcp_ip6_spec.pdst &
+	     fs->m_u.tcp_ip6_spec.pdst) != fs->h_u.tcp_ip6_spec.pdst) {
+		DP_INFO(edev, "Don't support port-masks\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (fs->h_u.tcp_ip6_spec.tclass) {
+		DP_INFO(edev, "Don't support tclass\n");
+		return -EOPNOTSUPP;
+	}
+
 	t->eth_proto = htons(ETH_P_IPV6);
 	memcpy(&t->src_ipv6, &fs->h_u.tcp_ip6_spec.ip6src,
 	       sizeof(struct in6_addr));
@@ -1673,6 +1743,9 @@ static int qede_flow_spec_to_tuple(struct qede_dev *edev,
 {
 	memset(t, 0, sizeof(*t));
 
+	if (qede_flow_spec_validate_unused(edev, fs))
+		return -EOPNOTSUPP;
+
 	switch ((fs->flow_type & ~FLOW_EXT)) {
 	case TCP_V4_FLOW:
 		return qede_flow_spec_to_tuple_tcpv4(edev, t, fs);
-- 
cgit v1.2.3-59-g8ed1b


From 3893fc62b1769db3ef160f7f1e36d3db754497ee Mon Sep 17 00:00:00 2001
From: Manish Chopra <manish.chopra@cavium.com>
Date: Thu, 24 May 2018 09:54:51 -0700
Subject: qed*: Support other classification modes.

Currently, driver supports flow classification to PF
receive queues based on TCP/UDP 4 tuples [src_ip, dst_ip,
src_port, dst_port] only.

This patch enables to configure different flow profiles
[For example - only UDP dest port or src_ip based] on the
adapter so that classification can be done according to
just those fields as well. Although, at a time just one
type of flow configuration is supported due to limited
number of flow profiles available on the device.

For example -

ethtool -N enp7s0f0 flow-type udp4 dst-port 45762 action 2
ethtool -N enp7s0f0 flow-type tcp4 src-ip 192.16.4.10 action 1
ethtool -N enp7s0f0 flow-type udp6 dst-port 45762 action 3

Signed-off-by: Manish Chopra <manish.chopra@cavium.com>
Signed-off-by: Shahed Shaikh <shahed.shaikh@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_l2.c       |  2 ++
 drivers/net/ethernet/qlogic/qede/qede_filter.c | 31 ++++++++++++++++++++++++--
 include/linux/qed/qed_eth_if.h                 |  1 +
 3 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 5e655c3601cf..3cb8a8088467 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -1973,6 +1973,8 @@ qed_arfs_mode_to_hsi(enum qed_filter_config_mode mode)
 		return GFT_PROFILE_TYPE_4_TUPLE;
 	if (mode == QED_FILTER_CONFIG_MODE_IP_DEST)
 		return GFT_PROFILE_TYPE_IP_DST_ADDR;
+	if (mode == QED_FILTER_CONFIG_MODE_IP_SRC)
+		return GFT_PROFILE_TYPE_IP_SRC_ADDR;
 	return GFT_PROFILE_TYPE_L4_DST_PORT;
 }
 
diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c
index 43ed420900c1..9b84f0cb12e7 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_filter.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c
@@ -1623,9 +1623,17 @@ static int qede_flow_spec_to_tuple_ipv4_common(struct qede_dev *edev,
 	t->src_port = fs->h_u.tcp_ip4_spec.psrc;
 	t->dst_port = fs->h_u.tcp_ip4_spec.pdst;
 
-	/* We must have a valid 4-tuple */
+	/* We must either have a valid 4-tuple or only dst port
+	 * or only src ip as an input
+	 */
 	if (t->src_port && t->dst_port && t->src_ipv4 && t->dst_ipv4) {
 		t->mode = QED_FILTER_CONFIG_MODE_5_TUPLE;
+	} else if (!t->src_port && t->dst_port &&
+		   !t->src_ipv4 && !t->dst_ipv4) {
+		t->mode = QED_FILTER_CONFIG_MODE_L4_PORT;
+	}  else if (!t->src_port && !t->dst_port &&
+		    !t->dst_ipv4 && t->src_ipv4) {
+		t->mode = QED_FILTER_CONFIG_MODE_IP_SRC;
 	} else {
 		DP_INFO(edev, "Invalid N-tuple\n");
 		return -EOPNOTSUPP;
@@ -1697,11 +1705,21 @@ static int qede_flow_spec_to_tuple_ipv6_common(struct qede_dev *edev,
 	t->src_port = fs->h_u.tcp_ip6_spec.psrc;
 	t->dst_port = fs->h_u.tcp_ip6_spec.pdst;
 
-	/* We must make sure we have a valid 4-tuple */
+	/* We must make sure we have a valid 4-tuple or only dest port
+	 * or only src ip as an input
+	 */
 	if (t->src_port && t->dst_port &&
 	    memcmp(&t->src_ipv6, p, sizeof(struct in6_addr)) &&
 	    memcmp(&t->dst_ipv6, p, sizeof(struct in6_addr))) {
 		t->mode = QED_FILTER_CONFIG_MODE_5_TUPLE;
+	} else if (!t->src_port && t->dst_port &&
+		   !memcmp(&t->src_ipv6, p, sizeof(struct in6_addr)) &&
+		   !memcmp(&t->dst_ipv6, p, sizeof(struct in6_addr))) {
+		t->mode = QED_FILTER_CONFIG_MODE_L4_PORT;
+	} else if (!t->src_port && !t->dst_port &&
+		   !memcmp(&t->dst_ipv6, p, sizeof(struct in6_addr)) &&
+		   memcmp(&t->src_ipv6, p, sizeof(struct in6_addr))) {
+		t->mode = QED_FILTER_CONFIG_MODE_IP_SRC;
 	} else {
 		DP_INFO(edev, "Invalid N-tuple\n");
 		return -EOPNOTSUPP;
@@ -1779,6 +1797,15 @@ static int qede_flow_spec_validate(struct qede_dev *edev,
 		return -EINVAL;
 	}
 
+	/* Check if the filtering-mode could support the filter */
+	if (edev->arfs->filter_count &&
+	    edev->arfs->mode != t->mode) {
+		DP_INFO(edev,
+			"flow_spec would require filtering mode %08x, but %08x is configured\n",
+			t->mode, edev->arfs->filter_count);
+		return -EINVAL;
+	}
+
 	if (fs->ring_cookie >= QEDE_RSS_COUNT(edev)) {
 		DP_INFO(edev, "Queue out-of-bounds\n");
 		return -EINVAL;
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index 7f9756fe9180..557e86e1c7d9 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -66,6 +66,7 @@ enum qed_filter_config_mode {
 	QED_FILTER_CONFIG_MODE_5_TUPLE,
 	QED_FILTER_CONFIG_MODE_L4_PORT,
 	QED_FILTER_CONFIG_MODE_IP_DEST,
+	QED_FILTER_CONFIG_MODE_IP_SRC,
 };
 
 struct qed_ntuple_filter_params {
-- 
cgit v1.2.3-59-g8ed1b


From 39385ab02c3e6ffe8f70a445433c7419fd2df753 Mon Sep 17 00:00:00 2001
From: Manish Chopra <manish.chopra@cavium.com>
Date: Thu, 24 May 2018 09:54:52 -0700
Subject: qede: Support flow classification to the VFs.

With the supported classification modes [4 tuples based,
udp port based, src-ip based], flows can be classified
to the VFs as well. With this patch, flows can be re-directed
to the requested VF provided in "action" field of command.

Please note that driver doesn't really care about the queue bits
in "action" field for the VFs. Since queue will be still chosen
by FW using RSS hash. [I.e., the classification would be done
according to vport-only]

For examples -

ethtool -N p5p1 flow-type udp4 dst-port 8000 action 0x100000000
ethtool -N p5p1 flow-type tcp4 src-ip 192.16.6.10 action 0x200000000
ethtool -U p5p1 flow-type tcp4 src-ip 192.168.40.100 dst-ip \
	192.168.40.200 src-port 6660 dst-port 5550 \
	action 0x100000000

Signed-off-by: Manish Chopra <manish.chopra@cavium.com>
Signed-off-by: Shahed Shaikh <shahed.shaikh@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qede/qede_filter.c | 34 +++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c
index 9b84f0cb12e7..6c02c21d996d 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_filter.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c
@@ -86,6 +86,7 @@ struct qede_arfs_fltr_node {
 	u16 sw_id;
 	u16 rxq_id;
 	u16 next_rxq_id;
+	u8 vfid;
 	bool filter_op;
 	bool used;
 	u8 fw_rc;
@@ -125,14 +126,19 @@ static void qede_configure_arfs_fltr(struct qede_dev *edev,
 	params.qid = rxq_id;
 	params.b_is_add = add_fltr;
 
+	if (n->vfid) {
+		params.b_is_vf = true;
+		params.vf_id = n->vfid - 1;
+	}
+
 	if (n->tuple.stringify) {
 		char tuple_buffer[QEDE_FILTER_PRINT_MAX_LEN];
 
 		n->tuple.stringify(&n->tuple, tuple_buffer);
 		DP_VERBOSE(edev, NETIF_MSG_RX_STATUS,
-			   "%s sw_id[0x%x]: %s [queue %d]\n",
+			   "%s sw_id[0x%x]: %s [vf %u queue %d]\n",
 			   add_fltr ? "Adding" : "Deleting",
-			   n->sw_id, tuple_buffer, rxq_id);
+			   n->sw_id, tuple_buffer, n->vfid, rxq_id);
 	}
 
 	n->used = true;
@@ -1435,6 +1441,10 @@ int qede_get_cls_rule_entry(struct qede_dev *edev, struct ethtool_rxnfc *cmd)
 
 	fsp->ring_cookie = fltr->rxq_id;
 
+	if (fltr->vfid) {
+		fsp->ring_cookie |= ((u64)fltr->vfid) <<
+					ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF;
+	}
 unlock:
 	__qede_unlock(edev);
 	return rc;
@@ -1806,6 +1816,9 @@ static int qede_flow_spec_validate(struct qede_dev *edev,
 		return -EINVAL;
 	}
 
+	if (ethtool_get_flow_spec_ring_vf(fs->ring_cookie))
+		return 0;
+
 	if (fs->ring_cookie >= QEDE_RSS_COUNT(edev)) {
 		DP_INFO(edev, "Queue out-of-bounds\n");
 		return -EINVAL;
@@ -1835,6 +1848,19 @@ qede_flow_find_fltr(struct qede_dev *edev, struct qede_arfs_tuple *t)
 	return NULL;
 }
 
+static void qede_flow_set_destination(struct qede_dev *edev,
+				      struct qede_arfs_fltr_node *n,
+				      struct ethtool_rx_flow_spec *fs)
+{
+	n->vfid = ethtool_get_flow_spec_ring_vf(fs->ring_cookie);
+	n->rxq_id = ethtool_get_flow_spec_ring(fs->ring_cookie);
+	n->next_rxq_id = n->rxq_id;
+
+	if (n->vfid)
+		DP_VERBOSE(edev, QED_MSG_SP,
+			   "Configuring N-tuple for VF 0x%02x\n", n->vfid - 1);
+}
+
 int qede_add_cls_rule(struct qede_dev *edev, struct ethtool_rxnfc *info)
 {
 	struct ethtool_rx_flow_spec *fsp = &info->fs;
@@ -1881,11 +1907,11 @@ int qede_add_cls_rule(struct qede_dev *edev, struct ethtool_rxnfc *info)
 	n->sw_id = fsp->location;
 	set_bit(n->sw_id, edev->arfs->arfs_fltr_bmap);
 	n->buf_len = min_hlen;
-	n->rxq_id = fsp->ring_cookie;
-	n->next_rxq_id = n->rxq_id;
 
 	memcpy(&n->tuple, &t, sizeof(n->tuple));
 
+	qede_flow_set_destination(edev, n, fsp);
+
 	/* Build a minimal header according to the flow */
 	n->tuple.build_hdr(&n->tuple, n->data);
 
-- 
cgit v1.2.3-59-g8ed1b


From 608e00d0a2eb53079c55dc9c14d8711bbb3a4390 Mon Sep 17 00:00:00 2001
From: Manish Chopra <manish.chopra@cavium.com>
Date: Thu, 24 May 2018 09:54:53 -0700
Subject: qed*: Support drop action classification

With this patch, User can configure for the supported
flows to be dropped. Added a stat "gft_filter_drop"
as well to be populated in ethtool for the dropped flows.

For example -

ethtool -N p5p1 flow-type udp4 dst-port 8000 action -1
ethtool -N p5p1 flow-type tcp4 scr-ip 192.168.8.1 action -1

Signed-off-by: Manish Chopra <manish.chopra@cavium.com>
Signed-off-by: Shahed Shaikh <shahed.shaikh@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_l2.c        | 35 ++++++++++++++-----------
 drivers/net/ethernet/qlogic/qede/qede.h         |  1 +
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c |  1 +
 drivers/net/ethernet/qlogic/qede/qede_filter.c  | 14 ++++++++++
 drivers/net/ethernet/qlogic/qede/qede_main.c    |  1 +
 include/linux/qed/qed_eth_if.h                  |  3 +++
 include/linux/qed/qed_if.h                      |  1 +
 7 files changed, 41 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 3cb8a8088467..1c0d0c217936 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -1677,6 +1677,8 @@ static void __qed_get_vport_tstats(struct qed_hwfn *p_hwfn,
 	    HILO_64_REGPAIR(tstats.mftag_filter_discard);
 	p_stats->common.mac_filter_discards +=
 	    HILO_64_REGPAIR(tstats.eth_mac_filter_discard);
+	p_stats->common.gft_filter_drop +=
+		HILO_64_REGPAIR(tstats.eth_gft_drop_pkt);
 }
 
 static void __qed_get_vport_ustats_addrlen(struct qed_hwfn *p_hwfn,
@@ -2015,16 +2017,6 @@ qed_configure_rfs_ntuple_filter(struct qed_hwfn *p_hwfn,
 	u8 abs_vport_id = 0;
 	int rc = -EINVAL;
 
-	rc = qed_fw_vport(p_hwfn, p_params->vport_id, &abs_vport_id);
-	if (rc)
-		return rc;
-
-	if (p_params->qid != QED_RFS_NTUPLE_QID_RSS) {
-		rc = qed_fw_l2_queue(p_hwfn, p_params->qid, &abs_rx_q_id);
-		if (rc)
-			return rc;
-	}
-
 	/* Get SPQ entry */
 	memset(&init_data, 0, sizeof(init_data));
 	init_data.cid = qed_spq_get_cid(p_hwfn);
@@ -2049,15 +2041,28 @@ qed_configure_rfs_ntuple_filter(struct qed_hwfn *p_hwfn,
 	DMA_REGPAIR_LE(p_ramrod->pkt_hdr_addr, p_params->addr);
 	p_ramrod->pkt_hdr_length = cpu_to_le16(p_params->length);
 
-	if (p_params->qid != QED_RFS_NTUPLE_QID_RSS) {
-		p_ramrod->rx_qid_valid = 1;
-		p_ramrod->rx_qid = cpu_to_le16(abs_rx_q_id);
+	if (p_params->b_is_drop) {
+		p_ramrod->vport_id = cpu_to_le16(ETH_GFT_TRASHCAN_VPORT);
+	} else {
+		rc = qed_fw_vport(p_hwfn, p_params->vport_id, &abs_vport_id);
+		if (rc)
+			return rc;
+
+		if (p_params->qid != QED_RFS_NTUPLE_QID_RSS) {
+			rc = qed_fw_l2_queue(p_hwfn, p_params->qid,
+					     &abs_rx_q_id);
+			if (rc)
+				return rc;
+
+			p_ramrod->rx_qid_valid = 1;
+			p_ramrod->rx_qid = cpu_to_le16(abs_rx_q_id);
+		}
+
+		p_ramrod->vport_id = cpu_to_le16((u16)abs_vport_id);
 	}
 
 	p_ramrod->flow_id_valid = 0;
 	p_ramrod->flow_id = 0;
-
-	p_ramrod->vport_id = cpu_to_le16((u16)abs_vport_id);
 	p_ramrod->filter_action = p_params->b_is_add ? GFT_ADD_FILTER
 	    : GFT_DELETE_FILTER;
 
diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index 2d3f09ed413b..81c5c8dfa2ef 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -75,6 +75,7 @@ struct qede_stats_common {
 	u64 rx_bcast_pkts;
 	u64 mftag_filter_discards;
 	u64 mac_filter_discards;
+	u64 gft_filter_drop;
 	u64 tx_ucast_bytes;
 	u64 tx_mcast_bytes;
 	u64 tx_bcast_bytes;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index 8c6fdad91986..6906e04b609e 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -161,6 +161,7 @@ static const struct {
 	QEDE_STAT(no_buff_discards),
 	QEDE_PF_STAT(mftag_filter_discards),
 	QEDE_PF_STAT(mac_filter_discards),
+	QEDE_PF_STAT(gft_filter_drop),
 	QEDE_STAT(tx_err_drop_pkts),
 	QEDE_STAT(ttl0_discard),
 	QEDE_STAT(packet_too_big_discard),
diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c
index 6c02c21d996d..e9e088d9c815 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_filter.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c
@@ -90,6 +90,7 @@ struct qede_arfs_fltr_node {
 	bool filter_op;
 	bool used;
 	u8 fw_rc;
+	bool b_is_drop;
 	struct hlist_node node;
 };
 
@@ -125,6 +126,7 @@ static void qede_configure_arfs_fltr(struct qede_dev *edev,
 	params.length = n->buf_len;
 	params.qid = rxq_id;
 	params.b_is_add = add_fltr;
+	params.b_is_drop = n->b_is_drop;
 
 	if (n->vfid) {
 		params.b_is_vf = true;
@@ -1445,6 +1447,9 @@ int qede_get_cls_rule_entry(struct qede_dev *edev, struct ethtool_rxnfc *cmd)
 		fsp->ring_cookie |= ((u64)fltr->vfid) <<
 					ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF;
 	}
+
+	if (fltr->b_is_drop)
+		fsp->ring_cookie = RX_CLS_FLOW_DISC;
 unlock:
 	__qede_unlock(edev);
 	return rc;
@@ -1816,6 +1821,10 @@ static int qede_flow_spec_validate(struct qede_dev *edev,
 		return -EINVAL;
 	}
 
+	/* If drop requested then no need to validate other data */
+	if (fs->ring_cookie == RX_CLS_FLOW_DISC)
+		return 0;
+
 	if (ethtool_get_flow_spec_ring_vf(fs->ring_cookie))
 		return 0;
 
@@ -1852,6 +1861,11 @@ static void qede_flow_set_destination(struct qede_dev *edev,
 				      struct qede_arfs_fltr_node *n,
 				      struct ethtool_rx_flow_spec *fs)
 {
+	if (fs->ring_cookie == RX_CLS_FLOW_DISC) {
+		n->b_is_drop = true;
+		return;
+	}
+
 	n->vfid = ethtool_get_flow_spec_ring_vf(fs->ring_cookie);
 	n->rxq_id = ethtool_get_flow_spec_ring(fs->ring_cookie);
 	n->next_rxq_id = n->rxq_id;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 9e70f713c3c5..d118771e1a7b 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -347,6 +347,7 @@ void qede_fill_by_demand_stats(struct qede_dev *edev)
 	p_common->rx_bcast_pkts = stats.common.rx_bcast_pkts;
 	p_common->mftag_filter_discards = stats.common.mftag_filter_discards;
 	p_common->mac_filter_discards = stats.common.mac_filter_discards;
+	p_common->gft_filter_drop = stats.common.gft_filter_drop;
 
 	p_common->tx_ucast_bytes = stats.common.tx_ucast_bytes;
 	p_common->tx_mcast_bytes = stats.common.tx_mcast_bytes;
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index 557e86e1c7d9..2978fa4add42 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -89,6 +89,9 @@ struct qed_ntuple_filter_params {
 
 	/* true iff this filter is to be added. Else to be removed */
 	bool b_is_add;
+
+	/* If flow needs to be dropped */
+	bool b_is_drop;
 };
 
 struct qed_dev_eth_info {
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 44af652e7407..ac991a3b8f03 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -1129,6 +1129,7 @@ struct qed_eth_stats_common {
 	u64	rx_bcast_pkts;
 	u64	mftag_filter_discards;
 	u64	mac_filter_discards;
+	u64	gft_filter_drop;
 	u64	tx_ucast_bytes;
 	u64	tx_mcast_bytes;
 	u64	tx_bcast_bytes;
-- 
cgit v1.2.3-59-g8ed1b


From a45675796a0763f5e0ffb75496c3865e961560d3 Mon Sep 17 00:00:00 2001
From: Bo Chen <chenbo@pdx.edu>
Date: Thu, 24 May 2018 12:48:35 -0700
Subject: 8139too: Remove unnecessary netif_napi_del()

The call to free_netdev() in __rtl8139_cleanup_dev() clears the network device
napi list, and explicit calls to netif_napi_del() are unnecessary.

Signed-off-by: Bo Chen <chenbo@pdx.edu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/realtek/8139too.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c
index d118da5a10a2..ffd68a7bc9e1 100644
--- a/drivers/net/ethernet/realtek/8139too.c
+++ b/drivers/net/ethernet/realtek/8139too.c
@@ -1104,7 +1104,6 @@ static int rtl8139_init_one(struct pci_dev *pdev,
 	return 0;
 
 err_out:
-	netif_napi_del(&tp->napi);
 	__rtl8139_cleanup_dev (dev);
 	pci_disable_device (pdev);
 	return i;
@@ -1119,7 +1118,6 @@ static void rtl8139_remove_one(struct pci_dev *pdev)
 	assert (dev != NULL);
 
 	cancel_delayed_work_sync(&tp->thread);
-	netif_napi_del(&tp->napi);
 
 	unregister_netdev (dev);
 
-- 
cgit v1.2.3-59-g8ed1b


From 5972be6b2495c6bffbf444497517fd1c070eef78 Mon Sep 17 00:00:00 2001
From: Yi-Hung Wei <yihung.wei@gmail.com>
Date: Thu, 24 May 2018 17:56:42 -0700
Subject: openvswitch: Add conntrack limit netlink definition

Define netlink messages and attributes to support user kernel
communication that uses the conntrack limit feature.

Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 713e56ce681f..863aabaa5cc9 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -937,4 +937,32 @@ enum ovs_meter_band_type {
 
 #define OVS_METER_BAND_TYPE_MAX (__OVS_METER_BAND_TYPE_MAX - 1)
 
+/* Conntrack limit */
+#define OVS_CT_LIMIT_FAMILY  "ovs_ct_limit"
+#define OVS_CT_LIMIT_MCGROUP "ovs_ct_limit"
+#define OVS_CT_LIMIT_VERSION 0x1
+
+enum ovs_ct_limit_cmd {
+	OVS_CT_LIMIT_CMD_UNSPEC,
+	OVS_CT_LIMIT_CMD_SET,		/* Add or modify ct limit. */
+	OVS_CT_LIMIT_CMD_DEL,		/* Delete ct limit. */
+	OVS_CT_LIMIT_CMD_GET		/* Get ct limit. */
+};
+
+enum ovs_ct_limit_attr {
+	OVS_CT_LIMIT_ATTR_UNSPEC,
+	OVS_CT_LIMIT_ATTR_ZONE_LIMIT,	/* Nested struct ovs_zone_limit. */
+	__OVS_CT_LIMIT_ATTR_MAX
+};
+
+#define OVS_CT_LIMIT_ATTR_MAX (__OVS_CT_LIMIT_ATTR_MAX - 1)
+
+#define OVS_ZONE_LIMIT_DEFAULT_ZONE -1
+
+struct ovs_zone_limit {
+	int zone_id;
+	__u32 limit;
+	__u32 count;
+};
+
 #endif /* _LINUX_OPENVSWITCH_H */
-- 
cgit v1.2.3-59-g8ed1b


From 11efd5cb04a184eea4f57b68ea63dddd463158d1 Mon Sep 17 00:00:00 2001
From: Yi-Hung Wei <yihung.wei@gmail.com>
Date: Thu, 24 May 2018 17:56:43 -0700
Subject: openvswitch: Support conntrack zone limit

Currently, nf_conntrack_max is used to limit the maximum number of
conntrack entries in the conntrack table for every network namespace.
For the VMs and containers that reside in the same namespace,
they share the same conntrack table, and the total # of conntrack entries
for all the VMs and containers are limited by nf_conntrack_max.  In this
case, if one of the VM/container abuses the usage the conntrack entries,
it blocks the others from committing valid conntrack entries into the
conntrack table.  Even if we can possibly put the VM in different network
namespace, the current nf_conntrack_max configuration is kind of rigid
that we cannot limit different VM/container to have different # conntrack
entries.

To address the aforementioned issue, this patch proposes to have a
fine-grained mechanism that could further limit the # of conntrack entries
per-zone.  For example, we can designate different zone to different VM,
and set conntrack limit to each zone.  By providing this isolation, a
mis-behaved VM only consumes the conntrack entries in its own zone, and
it will not influence other well-behaved VMs.  Moreover, the users can
set various conntrack limit to different zone based on their preference.

The proposed implementation utilizes Netfilter's nf_conncount backend
to count the number of connections in a particular zone.  If the number of
connection is above a configured limitation, ovs will return ENOMEM to the
userspace.  If userspace does not configure the zone limit, the limit
defaults to zero that is no limitation, which is backward compatible to
the behavior without this patch.

The following high leve APIs are provided to the userspace:
  - OVS_CT_LIMIT_CMD_SET:
    * set default connection limit for all zones
    * set the connection limit for a particular zone
  - OVS_CT_LIMIT_CMD_DEL:
    * remove the connection limit for a particular zone
  - OVS_CT_LIMIT_CMD_GET:
    * get the default connection limit for all zones
    * get the connection limit for a particular zone

Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/Kconfig     |   3 +-
 net/openvswitch/conntrack.c | 551 +++++++++++++++++++++++++++++++++++++++++++-
 net/openvswitch/conntrack.h |   9 +-
 net/openvswitch/datapath.c  |   7 +-
 net/openvswitch/datapath.h  |   3 +
 5 files changed, 567 insertions(+), 6 deletions(-)

diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index 2650205cdaf9..89da9512ec1e 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -9,7 +9,8 @@ config OPENVSWITCH
 		   (NF_CONNTRACK && ((!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6) && \
 				     (!NF_NAT || NF_NAT) && \
 				     (!NF_NAT_IPV4 || NF_NAT_IPV4) && \
-				     (!NF_NAT_IPV6 || NF_NAT_IPV6)))
+				     (!NF_NAT_IPV6 || NF_NAT_IPV6) && \
+				     (!NETFILTER_CONNCOUNT || NETFILTER_CONNCOUNT)))
 	select LIBCRC32C
 	select MPLS
 	select NET_MPLS_GSO
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 02fc343feb66..284aca2a252d 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -16,8 +16,11 @@
 #include <linux/tcp.h>
 #include <linux/udp.h>
 #include <linux/sctp.h>
+#include <linux/static_key.h>
 #include <net/ip.h>
+#include <net/genetlink.h>
 #include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_count.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_labels.h>
 #include <net/netfilter/nf_conntrack_seqadj.h>
@@ -76,6 +79,31 @@ struct ovs_conntrack_info {
 #endif
 };
 
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+#define OVS_CT_LIMIT_UNLIMITED	0
+#define OVS_CT_LIMIT_DEFAULT OVS_CT_LIMIT_UNLIMITED
+#define CT_LIMIT_HASH_BUCKETS 512
+static DEFINE_STATIC_KEY_FALSE(ovs_ct_limit_enabled);
+
+struct ovs_ct_limit {
+	/* Elements in ovs_ct_limit_info->limits hash table */
+	struct hlist_node hlist_node;
+	struct rcu_head rcu;
+	u16 zone;
+	u32 limit;
+};
+
+struct ovs_ct_limit_info {
+	u32 default_limit;
+	struct hlist_head *limits;
+	struct nf_conncount_data *data;
+};
+
+static const struct nla_policy ct_limit_policy[OVS_CT_LIMIT_ATTR_MAX + 1] = {
+	[OVS_CT_LIMIT_ATTR_ZONE_LIMIT] = { .type = NLA_NESTED, },
+};
+#endif
+
 static bool labels_nonzero(const struct ovs_key_ct_labels *labels);
 
 static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);
@@ -1036,6 +1064,89 @@ static bool labels_nonzero(const struct ovs_key_ct_labels *labels)
 	return false;
 }
 
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+static struct hlist_head *ct_limit_hash_bucket(
+	const struct ovs_ct_limit_info *info, u16 zone)
+{
+	return &info->limits[zone & (CT_LIMIT_HASH_BUCKETS - 1)];
+}
+
+/* Call with ovs_mutex */
+static void ct_limit_set(const struct ovs_ct_limit_info *info,
+			 struct ovs_ct_limit *new_ct_limit)
+{
+	struct ovs_ct_limit *ct_limit;
+	struct hlist_head *head;
+
+	head = ct_limit_hash_bucket(info, new_ct_limit->zone);
+	hlist_for_each_entry_rcu(ct_limit, head, hlist_node) {
+		if (ct_limit->zone == new_ct_limit->zone) {
+			hlist_replace_rcu(&ct_limit->hlist_node,
+					  &new_ct_limit->hlist_node);
+			kfree_rcu(ct_limit, rcu);
+			return;
+		}
+	}
+
+	hlist_add_head_rcu(&new_ct_limit->hlist_node, head);
+}
+
+/* Call with ovs_mutex */
+static void ct_limit_del(const struct ovs_ct_limit_info *info, u16 zone)
+{
+	struct ovs_ct_limit *ct_limit;
+	struct hlist_head *head;
+	struct hlist_node *n;
+
+	head = ct_limit_hash_bucket(info, zone);
+	hlist_for_each_entry_safe(ct_limit, n, head, hlist_node) {
+		if (ct_limit->zone == zone) {
+			hlist_del_rcu(&ct_limit->hlist_node);
+			kfree_rcu(ct_limit, rcu);
+			return;
+		}
+	}
+}
+
+/* Call with RCU read lock */
+static u32 ct_limit_get(const struct ovs_ct_limit_info *info, u16 zone)
+{
+	struct ovs_ct_limit *ct_limit;
+	struct hlist_head *head;
+
+	head = ct_limit_hash_bucket(info, zone);
+	hlist_for_each_entry_rcu(ct_limit, head, hlist_node) {
+		if (ct_limit->zone == zone)
+			return ct_limit->limit;
+	}
+
+	return info->default_limit;
+}
+
+static int ovs_ct_check_limit(struct net *net,
+			      const struct ovs_conntrack_info *info,
+			      const struct nf_conntrack_tuple *tuple)
+{
+	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+	const struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
+	u32 per_zone_limit, connections;
+	u32 conncount_key;
+
+	conncount_key = info->zone.id;
+
+	per_zone_limit = ct_limit_get(ct_limit_info, info->zone.id);
+	if (per_zone_limit == OVS_CT_LIMIT_UNLIMITED)
+		return 0;
+
+	connections = nf_conncount_count(net, ct_limit_info->data,
+					 &conncount_key, tuple, &info->zone);
+	if (connections > per_zone_limit)
+		return -ENOMEM;
+
+	return 0;
+}
+#endif
+
 /* Lookup connection and confirm if unconfirmed. */
 static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
 			 const struct ovs_conntrack_info *info,
@@ -1054,6 +1165,21 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
 	if (!ct)
 		return 0;
 
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+	if (static_branch_unlikely(&ovs_ct_limit_enabled)) {
+		if (!nf_ct_is_confirmed(ct)) {
+			err = ovs_ct_check_limit(net, info,
+				&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+			if (err) {
+				net_warn_ratelimited("openvswitch: zone: %u "
+					"execeeds conntrack limit\n",
+					info->zone.id);
+				return err;
+			}
+		}
+	}
+#endif
+
 	/* Set the conntrack event mask if given.  NEW and DELETE events have
 	 * their own groups, but the NFNLGRP_CONNTRACK_UPDATE group listener
 	 * typically would receive many kinds of updates.  Setting the event
@@ -1655,7 +1781,420 @@ static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info)
 		nf_ct_tmpl_free(ct_info->ct);
 }
 
-void ovs_ct_init(struct net *net)
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+static int ovs_ct_limit_init(struct net *net, struct ovs_net *ovs_net)
+{
+	int i, err;
+
+	ovs_net->ct_limit_info = kmalloc(sizeof(*ovs_net->ct_limit_info),
+					 GFP_KERNEL);
+	if (!ovs_net->ct_limit_info)
+		return -ENOMEM;
+
+	ovs_net->ct_limit_info->default_limit = OVS_CT_LIMIT_DEFAULT;
+	ovs_net->ct_limit_info->limits =
+		kmalloc_array(CT_LIMIT_HASH_BUCKETS, sizeof(struct hlist_head),
+			      GFP_KERNEL);
+	if (!ovs_net->ct_limit_info->limits) {
+		kfree(ovs_net->ct_limit_info);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < CT_LIMIT_HASH_BUCKETS; i++)
+		INIT_HLIST_HEAD(&ovs_net->ct_limit_info->limits[i]);
+
+	ovs_net->ct_limit_info->data =
+		nf_conncount_init(net, NFPROTO_INET, sizeof(u32));
+
+	if (IS_ERR(ovs_net->ct_limit_info->data)) {
+		err = PTR_ERR(ovs_net->ct_limit_info->data);
+		kfree(ovs_net->ct_limit_info->limits);
+		kfree(ovs_net->ct_limit_info);
+		pr_err("openvswitch: failed to init nf_conncount %d\n", err);
+		return err;
+	}
+	return 0;
+}
+
+static void ovs_ct_limit_exit(struct net *net, struct ovs_net *ovs_net)
+{
+	const struct ovs_ct_limit_info *info = ovs_net->ct_limit_info;
+	int i;
+
+	nf_conncount_destroy(net, NFPROTO_INET, info->data);
+	for (i = 0; i < CT_LIMIT_HASH_BUCKETS; ++i) {
+		struct hlist_head *head = &info->limits[i];
+		struct ovs_ct_limit *ct_limit;
+
+		hlist_for_each_entry_rcu(ct_limit, head, hlist_node)
+			kfree_rcu(ct_limit, rcu);
+	}
+	kfree(ovs_net->ct_limit_info->limits);
+	kfree(ovs_net->ct_limit_info);
+}
+
+static struct sk_buff *
+ovs_ct_limit_cmd_reply_start(struct genl_info *info, u8 cmd,
+			     struct ovs_header **ovs_reply_header)
+{
+	struct ovs_header *ovs_header = info->userhdr;
+	struct sk_buff *skb;
+
+	skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return ERR_PTR(-ENOMEM);
+
+	*ovs_reply_header = genlmsg_put(skb, info->snd_portid,
+					info->snd_seq,
+					&dp_ct_limit_genl_family, 0, cmd);
+
+	if (!*ovs_reply_header) {
+		nlmsg_free(skb);
+		return ERR_PTR(-EMSGSIZE);
+	}
+	(*ovs_reply_header)->dp_ifindex = ovs_header->dp_ifindex;
+
+	return skb;
+}
+
+static bool check_zone_id(int zone_id, u16 *pzone)
+{
+	if (zone_id >= 0 && zone_id <= 65535) {
+		*pzone = (u16)zone_id;
+		return true;
+	}
+	return false;
+}
+
+static int ovs_ct_limit_set_zone_limit(struct nlattr *nla_zone_limit,
+				       struct ovs_ct_limit_info *info)
+{
+	struct ovs_zone_limit *zone_limit;
+	int rem;
+	u16 zone;
+
+	rem = NLA_ALIGN(nla_len(nla_zone_limit));
+	zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit);
+
+	while (rem >= sizeof(*zone_limit)) {
+		if (unlikely(zone_limit->zone_id ==
+				OVS_ZONE_LIMIT_DEFAULT_ZONE)) {
+			ovs_lock();
+			info->default_limit = zone_limit->limit;
+			ovs_unlock();
+		} else if (unlikely(!check_zone_id(
+				zone_limit->zone_id, &zone))) {
+			OVS_NLERR(true, "zone id is out of range");
+		} else {
+			struct ovs_ct_limit *ct_limit;
+
+			ct_limit = kmalloc(sizeof(*ct_limit), GFP_KERNEL);
+			if (!ct_limit)
+				return -ENOMEM;
+
+			ct_limit->zone = zone;
+			ct_limit->limit = zone_limit->limit;
+
+			ovs_lock();
+			ct_limit_set(info, ct_limit);
+			ovs_unlock();
+		}
+		rem -= NLA_ALIGN(sizeof(*zone_limit));
+		zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit +
+				NLA_ALIGN(sizeof(*zone_limit)));
+	}
+
+	if (rem)
+		OVS_NLERR(true, "set zone limit has %d unknown bytes", rem);
+
+	return 0;
+}
+
+static int ovs_ct_limit_del_zone_limit(struct nlattr *nla_zone_limit,
+				       struct ovs_ct_limit_info *info)
+{
+	struct ovs_zone_limit *zone_limit;
+	int rem;
+	u16 zone;
+
+	rem = NLA_ALIGN(nla_len(nla_zone_limit));
+	zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit);
+
+	while (rem >= sizeof(*zone_limit)) {
+		if (unlikely(zone_limit->zone_id ==
+				OVS_ZONE_LIMIT_DEFAULT_ZONE)) {
+			ovs_lock();
+			info->default_limit = OVS_CT_LIMIT_DEFAULT;
+			ovs_unlock();
+		} else if (unlikely(!check_zone_id(
+				zone_limit->zone_id, &zone))) {
+			OVS_NLERR(true, "zone id is out of range");
+		} else {
+			ovs_lock();
+			ct_limit_del(info, zone);
+			ovs_unlock();
+		}
+		rem -= NLA_ALIGN(sizeof(*zone_limit));
+		zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit +
+				NLA_ALIGN(sizeof(*zone_limit)));
+	}
+
+	if (rem)
+		OVS_NLERR(true, "del zone limit has %d unknown bytes", rem);
+
+	return 0;
+}
+
+static int ovs_ct_limit_get_default_limit(struct ovs_ct_limit_info *info,
+					  struct sk_buff *reply)
+{
+	struct ovs_zone_limit zone_limit;
+	int err;
+
+	zone_limit.zone_id = OVS_ZONE_LIMIT_DEFAULT_ZONE;
+	zone_limit.limit = info->default_limit;
+	err = nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static int __ovs_ct_limit_get_zone_limit(struct net *net,
+					 struct nf_conncount_data *data,
+					 u16 zone_id, u32 limit,
+					 struct sk_buff *reply)
+{
+	struct nf_conntrack_zone ct_zone;
+	struct ovs_zone_limit zone_limit;
+	u32 conncount_key = zone_id;
+
+	zone_limit.zone_id = zone_id;
+	zone_limit.limit = limit;
+	nf_ct_zone_init(&ct_zone, zone_id, NF_CT_DEFAULT_ZONE_DIR, 0);
+
+	zone_limit.count = nf_conncount_count(net, data, &conncount_key, NULL,
+					      &ct_zone);
+	return nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit);
+}
+
+static int ovs_ct_limit_get_zone_limit(struct net *net,
+				       struct nlattr *nla_zone_limit,
+				       struct ovs_ct_limit_info *info,
+				       struct sk_buff *reply)
+{
+	struct ovs_zone_limit *zone_limit;
+	int rem, err;
+	u32 limit;
+	u16 zone;
+
+	rem = NLA_ALIGN(nla_len(nla_zone_limit));
+	zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit);
+
+	while (rem >= sizeof(*zone_limit)) {
+		if (unlikely(zone_limit->zone_id ==
+				OVS_ZONE_LIMIT_DEFAULT_ZONE)) {
+			err = ovs_ct_limit_get_default_limit(info, reply);
+			if (err)
+				return err;
+		} else if (unlikely(!check_zone_id(zone_limit->zone_id,
+							&zone))) {
+			OVS_NLERR(true, "zone id is out of range");
+		} else {
+			rcu_read_lock();
+			limit = ct_limit_get(info, zone);
+			rcu_read_unlock();
+
+			err = __ovs_ct_limit_get_zone_limit(
+				net, info->data, zone, limit, reply);
+			if (err)
+				return err;
+		}
+		rem -= NLA_ALIGN(sizeof(*zone_limit));
+		zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit +
+				NLA_ALIGN(sizeof(*zone_limit)));
+	}
+
+	if (rem)
+		OVS_NLERR(true, "get zone limit has %d unknown bytes", rem);
+
+	return 0;
+}
+
+static int ovs_ct_limit_get_all_zone_limit(struct net *net,
+					   struct ovs_ct_limit_info *info,
+					   struct sk_buff *reply)
+{
+	struct ovs_ct_limit *ct_limit;
+	struct hlist_head *head;
+	int i, err = 0;
+
+	err = ovs_ct_limit_get_default_limit(info, reply);
+	if (err)
+		return err;
+
+	rcu_read_lock();
+	for (i = 0; i < CT_LIMIT_HASH_BUCKETS; ++i) {
+		head = &info->limits[i];
+		hlist_for_each_entry_rcu(ct_limit, head, hlist_node) {
+			err = __ovs_ct_limit_get_zone_limit(net, info->data,
+				ct_limit->zone, ct_limit->limit, reply);
+			if (err)
+				goto exit_err;
+		}
+	}
+
+exit_err:
+	rcu_read_unlock();
+	return err;
+}
+
+static int ovs_ct_limit_cmd_set(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr **a = info->attrs;
+	struct sk_buff *reply;
+	struct ovs_header *ovs_reply_header;
+	struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id);
+	struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
+	int err;
+
+	reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_SET,
+					     &ovs_reply_header);
+	if (IS_ERR(reply))
+		return PTR_ERR(reply);
+
+	if (!a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) {
+		err = -EINVAL;
+		goto exit_err;
+	}
+
+	err = ovs_ct_limit_set_zone_limit(a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT],
+					  ct_limit_info);
+	if (err)
+		goto exit_err;
+
+	static_branch_enable(&ovs_ct_limit_enabled);
+
+	genlmsg_end(reply, ovs_reply_header);
+	return genlmsg_reply(reply, info);
+
+exit_err:
+	nlmsg_free(reply);
+	return err;
+}
+
+static int ovs_ct_limit_cmd_del(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr **a = info->attrs;
+	struct sk_buff *reply;
+	struct ovs_header *ovs_reply_header;
+	struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id);
+	struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
+	int err;
+
+	reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_DEL,
+					     &ovs_reply_header);
+	if (IS_ERR(reply))
+		return PTR_ERR(reply);
+
+	if (!a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) {
+		err = -EINVAL;
+		goto exit_err;
+	}
+
+	err = ovs_ct_limit_del_zone_limit(a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT],
+					  ct_limit_info);
+	if (err)
+		goto exit_err;
+
+	genlmsg_end(reply, ovs_reply_header);
+	return genlmsg_reply(reply, info);
+
+exit_err:
+	nlmsg_free(reply);
+	return err;
+}
+
+static int ovs_ct_limit_cmd_get(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr **a = info->attrs;
+	struct nlattr *nla_reply;
+	struct sk_buff *reply;
+	struct ovs_header *ovs_reply_header;
+	struct net *net = sock_net(skb->sk);
+	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+	struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
+	int err;
+
+	reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_GET,
+					     &ovs_reply_header);
+	if (IS_ERR(reply))
+		return PTR_ERR(reply);
+
+	nla_reply = nla_nest_start(reply, OVS_CT_LIMIT_ATTR_ZONE_LIMIT);
+
+	if (a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) {
+		err = ovs_ct_limit_get_zone_limit(
+			net, a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT], ct_limit_info,
+			reply);
+		if (err)
+			goto exit_err;
+	} else {
+		err = ovs_ct_limit_get_all_zone_limit(net, ct_limit_info,
+						      reply);
+		if (err)
+			goto exit_err;
+	}
+
+	nla_nest_end(reply, nla_reply);
+	genlmsg_end(reply, ovs_reply_header);
+	return genlmsg_reply(reply, info);
+
+exit_err:
+	nlmsg_free(reply);
+	return err;
+}
+
+static struct genl_ops ct_limit_genl_ops[] = {
+	{ .cmd = OVS_CT_LIMIT_CMD_SET,
+		.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN
+					   * privilege. */
+		.policy = ct_limit_policy,
+		.doit = ovs_ct_limit_cmd_set,
+	},
+	{ .cmd = OVS_CT_LIMIT_CMD_DEL,
+		.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN
+					   * privilege. */
+		.policy = ct_limit_policy,
+		.doit = ovs_ct_limit_cmd_del,
+	},
+	{ .cmd = OVS_CT_LIMIT_CMD_GET,
+		.flags = 0,		  /* OK for unprivileged users. */
+		.policy = ct_limit_policy,
+		.doit = ovs_ct_limit_cmd_get,
+	},
+};
+
+static const struct genl_multicast_group ovs_ct_limit_multicast_group = {
+	.name = OVS_CT_LIMIT_MCGROUP,
+};
+
+struct genl_family dp_ct_limit_genl_family __ro_after_init = {
+	.hdrsize = sizeof(struct ovs_header),
+	.name = OVS_CT_LIMIT_FAMILY,
+	.version = OVS_CT_LIMIT_VERSION,
+	.maxattr = OVS_CT_LIMIT_ATTR_MAX,
+	.netnsok = true,
+	.parallel_ops = true,
+	.ops = ct_limit_genl_ops,
+	.n_ops = ARRAY_SIZE(ct_limit_genl_ops),
+	.mcgrps = &ovs_ct_limit_multicast_group,
+	.n_mcgrps = 1,
+	.module = THIS_MODULE,
+};
+#endif
+
+int ovs_ct_init(struct net *net)
 {
 	unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE;
 	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
@@ -1666,12 +2205,22 @@ void ovs_ct_init(struct net *net)
 	} else {
 		ovs_net->xt_label = true;
 	}
+
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+	return ovs_ct_limit_init(net, ovs_net);
+#else
+	return 0;
+#endif
 }
 
 void ovs_ct_exit(struct net *net)
 {
 	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
 
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+	ovs_ct_limit_exit(net, ovs_net);
+#endif
+
 	if (ovs_net->xt_label)
 		nf_connlabels_put(net);
 }
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index 399dfdd2c4f9..900dadd70974 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -17,10 +17,11 @@
 #include "flow.h"
 
 struct ovs_conntrack_info;
+struct ovs_ct_limit_info;
 enum ovs_key_attr;
 
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
-void ovs_ct_init(struct net *);
+int ovs_ct_init(struct net *);
 void ovs_ct_exit(struct net *);
 bool ovs_ct_verify(struct net *, enum ovs_key_attr attr);
 int ovs_ct_copy_action(struct net *, const struct nlattr *,
@@ -44,7 +45,7 @@ void ovs_ct_free_action(const struct nlattr *a);
 #else
 #include <linux/errno.h>
 
-static inline void ovs_ct_init(struct net *net) { }
+static inline int ovs_ct_init(struct net *net) { return 0; }
 
 static inline void ovs_ct_exit(struct net *net) { }
 
@@ -104,4 +105,8 @@ static inline void ovs_ct_free_action(const struct nlattr *a) { }
 
 #define CT_SUPPORTED_MASK 0
 #endif /* CONFIG_NF_CONNTRACK */
+
+#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+extern struct genl_family dp_ct_limit_genl_family;
+#endif
 #endif /* ovs_conntrack.h */
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 015e24e08909..a61818e94396 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -2288,6 +2288,9 @@ static struct genl_family * const dp_genl_families[] = {
 	&dp_flow_genl_family,
 	&dp_packet_genl_family,
 	&dp_meter_genl_family,
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+	&dp_ct_limit_genl_family,
+#endif
 };
 
 static void dp_unregister_genl(int n_families)
@@ -2323,8 +2326,7 @@ static int __net_init ovs_init_net(struct net *net)
 
 	INIT_LIST_HEAD(&ovs_net->dps);
 	INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq);
-	ovs_ct_init(net);
-	return 0;
+	return ovs_ct_init(net);
 }
 
 static void __net_exit list_vports_from_net(struct net *net, struct net *dnet,
@@ -2469,3 +2471,4 @@ MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY);
 MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY);
 MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY);
 MODULE_ALIAS_GENL_FAMILY(OVS_METER_FAMILY);
+MODULE_ALIAS_GENL_FAMILY(OVS_CT_LIMIT_FAMILY);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 523d65526766..c9eb267c6f7e 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -144,6 +144,9 @@ struct dp_upcall_info {
 struct ovs_net {
 	struct list_head dps;
 	struct work_struct dp_notify_work;
+#if	IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+	struct ovs_ct_limit_info *ct_limit_info;
+#endif
 
 	/* Module reference for configuring conntrack. */
 	bool xt_label;
-- 
cgit v1.2.3-59-g8ed1b


From e52cde71709348c0d67bf0f213b438fa4d6cf9a9 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 24 May 2018 20:52:14 -0700
Subject: net: dsa: dsa_loop: Make dynamic debugging helpful

Remove redundant debug prints from phy_read/write since we can trace those
calls through trace events. Enhance dynamic debug prints to print arguments
which helps figuring how what is going on at the driver level with higher level
configuration interfaces.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/dsa_loop.c | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c
index 58f14af04639..816f34d64736 100644
--- a/drivers/net/dsa/dsa_loop.c
+++ b/drivers/net/dsa/dsa_loop.c
@@ -67,7 +67,7 @@ static struct phy_device *phydevs[PHY_MAX_ADDR];
 static enum dsa_tag_protocol dsa_loop_get_protocol(struct dsa_switch *ds,
 						   int port)
 {
-	dev_dbg(ds->dev, "%s\n", __func__);
+	dev_dbg(ds->dev, "%s: port: %d\n", __func__, port);
 
 	return DSA_TAG_PROTO_NONE;
 }
@@ -124,8 +124,6 @@ static int dsa_loop_phy_read(struct dsa_switch *ds, int port, int regnum)
 	struct mii_bus *bus = ps->bus;
 	int ret;
 
-	dev_dbg(ds->dev, "%s\n", __func__);
-
 	ret = mdiobus_read_nested(bus, ps->port_base + port, regnum);
 	if (ret < 0)
 		ps->ports[port].mib[DSA_LOOP_PHY_READ_ERR].val++;
@@ -142,8 +140,6 @@ static int dsa_loop_phy_write(struct dsa_switch *ds, int port,
 	struct mii_bus *bus = ps->bus;
 	int ret;
 
-	dev_dbg(ds->dev, "%s\n", __func__);
-
 	ret = mdiobus_write_nested(bus, ps->port_base + port, regnum, value);
 	if (ret < 0)
 		ps->ports[port].mib[DSA_LOOP_PHY_WRITE_ERR].val++;
@@ -156,7 +152,8 @@ static int dsa_loop_phy_write(struct dsa_switch *ds, int port,
 static int dsa_loop_port_bridge_join(struct dsa_switch *ds, int port,
 				     struct net_device *bridge)
 {
-	dev_dbg(ds->dev, "%s\n", __func__);
+	dev_dbg(ds->dev, "%s: port: %d, bridge: %s\n",
+		__func__, port, bridge->name);
 
 	return 0;
 }
@@ -164,19 +161,22 @@ static int dsa_loop_port_bridge_join(struct dsa_switch *ds, int port,
 static void dsa_loop_port_bridge_leave(struct dsa_switch *ds, int port,
 				       struct net_device *bridge)
 {
-	dev_dbg(ds->dev, "%s\n", __func__);
+	dev_dbg(ds->dev, "%s: port: %d, bridge: %s\n",
+		__func__, port, bridge->name);
 }
 
 static void dsa_loop_port_stp_state_set(struct dsa_switch *ds, int port,
 					u8 state)
 {
-	dev_dbg(ds->dev, "%s\n", __func__);
+	dev_dbg(ds->dev, "%s: port: %d, state: %d\n",
+		__func__, port, state);
 }
 
 static int dsa_loop_port_vlan_filtering(struct dsa_switch *ds, int port,
 					bool vlan_filtering)
 {
-	dev_dbg(ds->dev, "%s\n", __func__);
+	dev_dbg(ds->dev, "%s: port: %d, vlan_filtering: %d\n",
+		__func__, port, vlan_filtering);
 
 	return 0;
 }
@@ -188,7 +188,8 @@ dsa_loop_port_vlan_prepare(struct dsa_switch *ds, int port,
 	struct dsa_loop_priv *ps = ds->priv;
 	struct mii_bus *bus = ps->bus;
 
-	dev_dbg(ds->dev, "%s\n", __func__);
+	dev_dbg(ds->dev, "%s: port: %d, vlan: %d-%d",
+		__func__, port, vlan->vid_begin, vlan->vid_end);
 
 	/* Just do a sleeping operation to make lockdep checks effective */
 	mdiobus_read(bus, ps->port_base + port, MII_BMSR);
@@ -209,8 +210,6 @@ static void dsa_loop_port_vlan_add(struct dsa_switch *ds, int port,
 	struct dsa_loop_vlan *vl;
 	u16 vid;
 
-	dev_dbg(ds->dev, "%s\n", __func__);
-
 	/* Just do a sleeping operation to make lockdep checks effective */
 	mdiobus_read(bus, ps->port_base + port, MII_BMSR);
 
@@ -222,6 +221,9 @@ static void dsa_loop_port_vlan_add(struct dsa_switch *ds, int port,
 			vl->untagged |= BIT(port);
 		else
 			vl->untagged &= ~BIT(port);
+
+		dev_dbg(ds->dev, "%s: port: %d vlan: %d, %stagged, pvid: %d\n",
+			__func__, port, vid, untagged ? "un" : "", pvid);
 	}
 
 	if (pvid)
@@ -237,8 +239,6 @@ static int dsa_loop_port_vlan_del(struct dsa_switch *ds, int port,
 	struct dsa_loop_vlan *vl;
 	u16 vid, pvid = ps->pvid;
 
-	dev_dbg(ds->dev, "%s\n", __func__);
-
 	/* Just do a sleeping operation to make lockdep checks effective */
 	mdiobus_read(bus, ps->port_base + port, MII_BMSR);
 
@@ -251,6 +251,9 @@ static int dsa_loop_port_vlan_del(struct dsa_switch *ds, int port,
 
 		if (pvid == vid)
 			pvid = 1;
+
+		dev_dbg(ds->dev, "%s: port: %d vlan: %d, %stagged, pvid: %d\n",
+			__func__, port, vid, untagged ? "un" : "", pvid);
 	}
 	ps->pvid = pvid;
 
-- 
cgit v1.2.3-59-g8ed1b


From 52fff3274b08c2eaea33d3df546fcd91040dee3f Mon Sep 17 00:00:00 2001
From: Chris Mi <chrism@mellanox.com>
Date: Wed, 16 May 2018 17:20:17 +0900
Subject: net/mlx5: E-Switch, Reorganize and rename fdb flow tables

We have several fdb flow tables for each of the legacy and switchdev
modes. In the switchdev mode, there are fast path and slow path flow
tables. Towards adding more flow tables in upcoming patches, reorganize
and rename the various existing ones to reflect their functionality.

Signed-off-by: Chris Mi <chrism@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  | 22 +++++++++++-----------
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |  5 +++--
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 22 +++++++++++-----------
 3 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 09f0e11c6ffc..6cab1dd66d1b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -200,7 +200,7 @@ __esw_fdb_set_vport_rule(struct mlx5_eswitch *esw, u32 vport, bool rx_rule,
 	spec->match_criteria_enable = match_header;
 	flow_act.action =  MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 	flow_rule =
-		mlx5_add_flow_rules(esw->fdb_table.fdb, spec,
+		mlx5_add_flow_rules(esw->fdb_table.legacy.fdb, spec,
 				    &flow_act, &dest, 1);
 	if (IS_ERR(flow_rule)) {
 		esw_warn(esw->dev,
@@ -282,7 +282,7 @@ static int esw_create_legacy_fdb_table(struct mlx5_eswitch *esw, int nvports)
 		esw_warn(dev, "Failed to create FDB Table err %d\n", err);
 		goto out;
 	}
-	esw->fdb_table.fdb = fdb;
+	esw->fdb_table.legacy.fdb = fdb;
 
 	/* Addresses group : Full match unicast/multicast addresses */
 	MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable,
@@ -343,9 +343,9 @@ out:
 			mlx5_destroy_flow_group(esw->fdb_table.legacy.addr_grp);
 			esw->fdb_table.legacy.addr_grp = NULL;
 		}
-		if (!IS_ERR_OR_NULL(esw->fdb_table.fdb)) {
-			mlx5_destroy_flow_table(esw->fdb_table.fdb);
-			esw->fdb_table.fdb = NULL;
+		if (!IS_ERR_OR_NULL(esw->fdb_table.legacy.fdb)) {
+			mlx5_destroy_flow_table(esw->fdb_table.legacy.fdb);
+			esw->fdb_table.legacy.fdb = NULL;
 		}
 	}
 
@@ -355,15 +355,15 @@ out:
 
 static void esw_destroy_legacy_fdb_table(struct mlx5_eswitch *esw)
 {
-	if (!esw->fdb_table.fdb)
+	if (!esw->fdb_table.legacy.fdb)
 		return;
 
 	esw_debug(esw->dev, "Destroy FDB Table\n");
 	mlx5_destroy_flow_group(esw->fdb_table.legacy.promisc_grp);
 	mlx5_destroy_flow_group(esw->fdb_table.legacy.allmulti_grp);
 	mlx5_destroy_flow_group(esw->fdb_table.legacy.addr_grp);
-	mlx5_destroy_flow_table(esw->fdb_table.fdb);
-	esw->fdb_table.fdb = NULL;
+	mlx5_destroy_flow_table(esw->fdb_table.legacy.fdb);
+	esw->fdb_table.legacy.fdb = NULL;
 	esw->fdb_table.legacy.addr_grp = NULL;
 	esw->fdb_table.legacy.allmulti_grp = NULL;
 	esw->fdb_table.legacy.promisc_grp = NULL;
@@ -396,7 +396,7 @@ static int esw_add_uc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr)
 
 fdb_add:
 	/* SRIOV is enabled: Forward UC MAC to vport */
-	if (esw->fdb_table.fdb && esw->mode == SRIOV_LEGACY)
+	if (esw->fdb_table.legacy.fdb && esw->mode == SRIOV_LEGACY)
 		vaddr->flow_rule = esw_fdb_set_vport_rule(esw, mac, vport);
 
 	esw_debug(esw->dev, "\tADDED UC MAC: vport[%d] %pM fr(%p)\n",
@@ -486,7 +486,7 @@ static int esw_add_mc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr)
 	u8 *mac = vaddr->node.addr;
 	u32 vport = vaddr->vport;
 
-	if (!esw->fdb_table.fdb)
+	if (!esw->fdb_table.legacy.fdb)
 		return 0;
 
 	esw_mc = l2addr_hash_find(hash, mac, struct esw_mc_addr);
@@ -526,7 +526,7 @@ static int esw_del_mc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr)
 	u8 *mac = vaddr->node.addr;
 	u32 vport = vaddr->vport;
 
-	if (!esw->fdb_table.fdb)
+	if (!esw->fdb_table.legacy.fdb)
 		return 0;
 
 	esw_mc = l2addr_hash_find(hash, mac, struct esw_mc_addr);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index f47a14e31b7d..d1a3f7fcca1c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -117,16 +117,17 @@ struct mlx5_vport {
 };
 
 struct mlx5_eswitch_fdb {
-	void *fdb;
 	union {
 		struct legacy_fdb {
+			struct mlx5_flow_table *fdb;
 			struct mlx5_flow_group *addr_grp;
 			struct mlx5_flow_group *allmulti_grp;
 			struct mlx5_flow_group *promisc_grp;
 		} legacy;
 
 		struct offloads_fdb {
-			struct mlx5_flow_table *fdb;
+			struct mlx5_flow_table *fast_fdb;
+			struct mlx5_flow_table *slow_fdb;
 			struct mlx5_flow_group *send_to_vport_grp;
 			struct mlx5_flow_group *miss_grp;
 			struct mlx5_flow_handle *miss_rule_uni;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index b9ea464bcfa9..bb8eac5523a7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -119,7 +119,7 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_ENCAP)
 		flow_act.encap_id = attr->encap_id;
 
-	rule = mlx5_add_flow_rules((struct mlx5_flow_table *)esw->fdb_table.fdb,
+	rule = mlx5_add_flow_rules((struct mlx5_flow_table *)esw->fdb_table.offloads.fast_fdb,
 				   spec, &flow_act, dest, i);
 	if (IS_ERR(rule))
 		goto err_add_rule;
@@ -363,7 +363,7 @@ mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, int vport, u32 sqn
 	dest.vport.num = vport;
 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 
-	flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.fdb, spec,
+	flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, spec,
 					&flow_act, &dest, 1);
 	if (IS_ERR(flow_rule))
 		esw_warn(esw->dev, "FDB: Failed to add send to vport rule err %ld\n", PTR_ERR(flow_rule));
@@ -407,7 +407,7 @@ static int esw_add_fdb_miss_rule(struct mlx5_eswitch *esw)
 	dest.vport.num = 0;
 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 
-	flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.fdb, spec,
+	flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, spec,
 					&flow_act, &dest, 1);
 	if (IS_ERR(flow_rule)) {
 		err = PTR_ERR(flow_rule);
@@ -422,7 +422,7 @@ static int esw_add_fdb_miss_rule(struct mlx5_eswitch *esw)
 	dmac_v = MLX5_ADDR_OF(fte_match_param, headers_v,
 			      outer_headers.dmac_47_16);
 	dmac_v[0] = 0x01;
-	flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.fdb, spec,
+	flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, spec,
 					&flow_act, &dest, 1);
 	if (IS_ERR(flow_rule)) {
 		err = PTR_ERR(flow_rule);
@@ -476,7 +476,7 @@ static int esw_create_offloads_fast_fdb_table(struct mlx5_eswitch *esw)
 		esw_warn(dev, "Failed to create Fast path FDB Table err %d\n", err);
 		goto out;
 	}
-	esw->fdb_table.fdb = fdb;
+	esw->fdb_table.offloads.fast_fdb = fdb;
 
 out:
 	return err;
@@ -484,7 +484,7 @@ out:
 
 static void esw_destroy_offloads_fast_fdb_table(struct mlx5_eswitch *esw)
 {
-	mlx5_destroy_flow_table(esw->fdb_table.fdb);
+	mlx5_destroy_flow_table(esw->fdb_table.offloads.fast_fdb);
 }
 
 #define MAX_PF_SQ 256
@@ -530,7 +530,7 @@ static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw, int nvports)
 		esw_warn(dev, "Failed to create slow path FDB Table err %d\n", err);
 		goto slow_fdb_err;
 	}
-	esw->fdb_table.offloads.fdb = fdb;
+	esw->fdb_table.offloads.slow_fdb = fdb;
 
 	/* create send-to-vport group */
 	memset(flow_group_in, 0, inlen);
@@ -586,9 +586,9 @@ miss_rule_err:
 miss_err:
 	mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_grp);
 send_vport_err:
-	mlx5_destroy_flow_table(esw->fdb_table.offloads.fdb);
+	mlx5_destroy_flow_table(esw->fdb_table.offloads.slow_fdb);
 slow_fdb_err:
-	mlx5_destroy_flow_table(esw->fdb_table.fdb);
+	mlx5_destroy_flow_table(esw->fdb_table.offloads.fast_fdb);
 fast_fdb_err:
 ns_err:
 	kvfree(flow_group_in);
@@ -597,7 +597,7 @@ ns_err:
 
 static void esw_destroy_offloads_fdb_tables(struct mlx5_eswitch *esw)
 {
-	if (!esw->fdb_table.fdb)
+	if (!esw->fdb_table.offloads.fast_fdb)
 		return;
 
 	esw_debug(esw->dev, "Destroy offloads FDB Tables\n");
@@ -606,7 +606,7 @@ static void esw_destroy_offloads_fdb_tables(struct mlx5_eswitch *esw)
 	mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_grp);
 	mlx5_destroy_flow_group(esw->fdb_table.offloads.miss_grp);
 
-	mlx5_destroy_flow_table(esw->fdb_table.offloads.fdb);
+	mlx5_destroy_flow_table(esw->fdb_table.offloads.slow_fdb);
 	esw_destroy_offloads_fast_fdb_table(esw);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From b45630021b52c203c2547a95a5c811b57aed4ead Mon Sep 17 00:00:00 2001
From: Chris Mi <chrism@mellanox.com>
Date: Tue, 24 Apr 2018 11:21:46 +0900
Subject: net/mlx5: Add cap bits for flow table destination in FDB table

If set, the FDB table supports the forward action with a
destination list that includes a flow table.

Signed-off-by: Chris Mi <chrism@mellanox.com>
Reviewed-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index edbddeaacc88..05b480fae27d 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -524,7 +524,9 @@ struct mlx5_ifc_flow_table_nic_cap_bits {
 };
 
 struct mlx5_ifc_flow_table_eswitch_cap_bits {
-	u8     reserved_at_0[0x200];
+	u8      reserved_at_0[0x1c];
+	u8      fdb_multi_path_to_table[0x1];
+	u8      reserved_at_1d[0x1e3];
 
 	struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_esw_fdb;
 
-- 
cgit v1.2.3-59-g8ed1b


From a842dd04cf85fbc6e21e65a344b957f4a1dc0413 Mon Sep 17 00:00:00 2001
From: Chris Mi <chrism@mellanox.com>
Date: Wed, 16 May 2018 17:42:52 +0900
Subject: net/mlx5: E-switch, Create a second level FDB flow table

If firmware supports the forward action with a destination list
that includes a flow table, create a second level FDB flow table.

This is going to be used for flow based mirroring under the switchdev
offloads mode.

Signed-off-by: Chris Mi <chrism@mellanox.com>
Reviewed-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |  4 +++
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 31 +++++++++++++++++++---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c  |  2 +-
 3 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index d1a3f7fcca1c..d06c11629121 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -55,6 +55,9 @@
 #define MLX5_RATE_TO_BW_SHARE(rate, divider, limit) \
 	min_t(u32, max_t(u32, (rate) / (divider), MLX5_MIN_BW_SHARE), limit)
 
+#define mlx5_esw_has_fwd_fdb(dev) \
+	MLX5_CAP_ESW_FLOWTABLE(dev, fdb_multi_path_to_table)
+
 struct vport_ingress {
 	struct mlx5_flow_table *acl;
 	struct mlx5_flow_group *allow_untagged_spoofchk_grp;
@@ -127,6 +130,7 @@ struct mlx5_eswitch_fdb {
 
 		struct offloads_fdb {
 			struct mlx5_flow_table *fast_fdb;
+			struct mlx5_flow_table *fwd_fdb;
 			struct mlx5_flow_table *slow_fdb;
 			struct mlx5_flow_group *send_to_vport_grp;
 			struct mlx5_flow_group *miss_grp;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index bb8eac5523a7..8ea11f24380c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -454,7 +454,7 @@ static int esw_create_offloads_fast_fdb_table(struct mlx5_eswitch *esw)
 	if (!root_ns) {
 		esw_warn(dev, "Failed to get FDB flow namespace\n");
 		err = -EOPNOTSUPP;
-		goto out;
+		goto out_namespace;
 	}
 
 	esw_debug(dev, "Create offloads FDB table, min (max esw size(2^%d), max counters(%d)*groups(%d))\n",
@@ -464,6 +464,9 @@ static int esw_create_offloads_fast_fdb_table(struct mlx5_eswitch *esw)
 	esw_size = min_t(int, max_flow_counter * ESW_OFFLOADS_NUM_GROUPS,
 			 1 << MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size));
 
+	if (mlx5_esw_has_fwd_fdb(dev))
+		esw_size >>= 1;
+
 	if (esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE)
 		flags |= MLX5_FLOW_TABLE_TUNNEL_EN;
 
@@ -474,16 +477,36 @@ static int esw_create_offloads_fast_fdb_table(struct mlx5_eswitch *esw)
 	if (IS_ERR(fdb)) {
 		err = PTR_ERR(fdb);
 		esw_warn(dev, "Failed to create Fast path FDB Table err %d\n", err);
-		goto out;
+		goto out_namespace;
 	}
 	esw->fdb_table.offloads.fast_fdb = fdb;
 
-out:
+	if (!mlx5_esw_has_fwd_fdb(dev))
+		goto out_namespace;
+
+	fdb = mlx5_create_auto_grouped_flow_table(root_ns, FDB_FAST_PATH,
+						  esw_size,
+						  ESW_OFFLOADS_NUM_GROUPS, 1,
+						  flags);
+	if (IS_ERR(fdb)) {
+		err = PTR_ERR(fdb);
+		esw_warn(dev, "Failed to create fwd table err %d\n", err);
+		goto out_ft;
+	}
+	esw->fdb_table.offloads.fwd_fdb = fdb;
+
+	return err;
+
+out_ft:
+	mlx5_destroy_flow_table(esw->fdb_table.offloads.fast_fdb);
+out_namespace:
 	return err;
 }
 
 static void esw_destroy_offloads_fast_fdb_table(struct mlx5_eswitch *esw)
 {
+	if (mlx5_esw_has_fwd_fdb(esw->dev))
+		mlx5_destroy_flow_table(esw->fdb_table.offloads.fwd_fdb);
 	mlx5_destroy_flow_table(esw->fdb_table.offloads.fast_fdb);
 }
 
@@ -588,7 +611,7 @@ miss_err:
 send_vport_err:
 	mlx5_destroy_flow_table(esw->fdb_table.offloads.slow_fdb);
 slow_fdb_err:
-	mlx5_destroy_flow_table(esw->fdb_table.offloads.fast_fdb);
+	esw_destroy_offloads_fast_fdb_table(esw);
 fast_fdb_err:
 ns_err:
 	kvfree(flow_group_in);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 806e95523f9e..f9c2c03083eb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -2495,7 +2495,7 @@ static int init_fdb_root_ns(struct mlx5_flow_steering *steering)
 	if (!steering->fdb_root_ns)
 		return -ENOMEM;
 
-	prio = fs_create_prio(&steering->fdb_root_ns->ns, 0, 1);
+	prio = fs_create_prio(&steering->fdb_root_ns->ns, 0, 2);
 	if (IS_ERR(prio))
 		goto out_err;
 
-- 
cgit v1.2.3-59-g8ed1b


From 592d3651596924b0b1e9030634c48d262451eeaf Mon Sep 17 00:00:00 2001
From: Chris Mi <chrism@mellanox.com>
Date: Fri, 4 May 2018 14:09:00 +0900
Subject: net/mlx5e: Parse mirroring action for offloaded TC eswitch flows

Currently, we only support the mirred redirect TC sub-action. In order
to support flow based vport mirroring, add support to parse the mirred
mirror sub-action.

For mirroring, user-space will typically set the action order such that
the mirror port (mirror VF) sees packets as the original port (VF under
mirroring) sent them or as it will receive them.

In the general case, it means that packets are potentially sent to the
mirror port before or after some actions were applied on them. To
properly do that, we should follow on the exact action order as set for
the flow and make sure this will also be the case when we program the HW
offload.

We introduce a counter for the output ports (attr->out_count), which we
increase when parsing each mirred redirect/mirror sub-action and when
dealing with encap.

We introduce a counter (attr->mirror_count) telling us if split is
needed. If no split is needed and mirroring is just multicasting to
vport, the mirror count is zero, all the actions of the TC flow should
apply on that single HW flow.

If split is needed, the mirror count tells where to do the split, all
non-mirred tc actions should apply only after the split.

The mirror count is set while parsing the following actions encap/decap,
header re-write, vlan push/pop.

Signed-off-by: Chris Mi <chrism@mellanox.com>
Reviewed-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    | 26 +++++++++++++++-----
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  | 10 ++++++--
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 28 ++++++++++++----------
 3 files changed, 43 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index a9c96fe8e4fe..302c5500f9ad 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -844,8 +844,8 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
 		}
 		out_priv = netdev_priv(encap_dev);
 		rpriv = out_priv->ppriv;
-		attr->out_rep = rpriv->rep;
-		attr->out_mdev = out_priv->mdev;
+		attr->out_rep[attr->out_count] = rpriv->rep;
+		attr->out_mdev[attr->out_count++] = out_priv->mdev;
 	}
 
 	err = mlx5_eswitch_add_vlan_action(esw, attr);
@@ -2537,6 +2537,7 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 				return err;
 
 			action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
+			attr->mirror_count = attr->out_count;
 			continue;
 		}
 
@@ -2548,12 +2549,18 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 			return -EOPNOTSUPP;
 		}
 
-		if (is_tcf_mirred_egress_redirect(a)) {
-			struct net_device *out_dev;
+		if (is_tcf_mirred_egress_redirect(a) || is_tcf_mirred_egress_mirror(a)) {
 			struct mlx5e_priv *out_priv;
+			struct net_device *out_dev;
 
 			out_dev = tcf_mirred_dev(a);
 
+			if (attr->out_count >= MLX5_MAX_FLOW_FWD_VPORTS) {
+				pr_err("can't support more than %d output ports, can't offload forwarding\n",
+				       attr->out_count);
+				return -EOPNOTSUPP;
+			}
+
 			if (switchdev_port_same_parent_id(priv->netdev,
 							  out_dev) ||
 			    is_merged_eswitch_dev(priv, out_dev)) {
@@ -2561,8 +2568,8 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 					  MLX5_FLOW_CONTEXT_ACTION_COUNT;
 				out_priv = netdev_priv(out_dev);
 				rpriv = out_priv->ppriv;
-				attr->out_rep = rpriv->rep;
-				attr->out_mdev = out_priv->mdev;
+				attr->out_rep[attr->out_count] = rpriv->rep;
+				attr->out_mdev[attr->out_count++] = out_priv->mdev;
 			} else if (encap) {
 				parse_attr->mirred_ifindex = out_dev->ifindex;
 				parse_attr->tun_info = *info;
@@ -2585,6 +2592,7 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 				encap = true;
 			else
 				return -EOPNOTSUPP;
+			attr->mirror_count = attr->out_count;
 			continue;
 		}
 
@@ -2606,6 +2614,7 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 			} else { /* action is TCA_VLAN_ACT_MODIFY */
 				return -EOPNOTSUPP;
 			}
+			attr->mirror_count = attr->out_count;
 			continue;
 		}
 
@@ -2621,6 +2630,11 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 	if (!actions_match_supported(priv, exts, parse_attr, flow))
 		return -EOPNOTSUPP;
 
+	if (attr->out_count > 1 && !mlx5_esw_has_fwd_fdb(priv->mdev)) {
+		netdev_warn_once(priv->netdev, "current firmware doesn't support split rule for port mirroring\n");
+		return -EOPNOTSUPP;
+	}
+
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index d06c11629121..386b0e354c41 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -239,12 +239,18 @@ enum mlx5_flow_match_level {
 	MLX5_MATCH_L4	= MLX5_INLINE_MODE_TCP_UDP,
 };
 
+/* current maximum for flow based vport multicasting */
+#define MLX5_MAX_FLOW_FWD_VPORTS 2
+
 struct mlx5_esw_flow_attr {
 	struct mlx5_eswitch_rep *in_rep;
-	struct mlx5_eswitch_rep *out_rep;
-	struct mlx5_core_dev	*out_mdev;
+	struct mlx5_eswitch_rep *out_rep[MLX5_MAX_FLOW_FWD_VPORTS];
+	struct mlx5_core_dev	*out_mdev[MLX5_MAX_FLOW_FWD_VPORTS];
 	struct mlx5_core_dev	*in_mdev;
 
+	int mirror_count;
+	int out_count;
+
 	int	action;
 	__be16	vlan_proto;
 	u16	vlan_vid;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 8ea11f24380c..7db8b9a41925 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -48,12 +48,12 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 				struct mlx5_flow_spec *spec,
 				struct mlx5_esw_flow_attr *attr)
 {
-	struct mlx5_flow_destination dest[2] = {};
+	struct mlx5_flow_destination dest[MLX5_MAX_FLOW_FWD_VPORTS + 1] = {};
 	struct mlx5_flow_act flow_act = {0};
 	struct mlx5_fc *counter = NULL;
 	struct mlx5_flow_handle *rule;
+	int j, i = 0;
 	void *misc;
-	int i = 0;
 
 	if (esw->mode != SRIOV_OFFLOADS)
 		return ERR_PTR(-EOPNOTSUPP);
@@ -70,14 +70,16 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 	}
 
 	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) {
-		dest[i].type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
-		dest[i].vport.num = attr->out_rep->vport;
-		if (MLX5_CAP_ESW(esw->dev, merged_eswitch)) {
-			dest[i].vport.vhca_id =
-				MLX5_CAP_GEN(attr->out_mdev, vhca_id);
-			dest[i].vport.vhca_id_valid = 1;
+		for (j = attr->mirror_count; j < attr->out_count; j++) {
+			dest[i].type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
+			dest[i].vport.num = attr->out_rep[j]->vport;
+			if (MLX5_CAP_ESW(esw->dev, merged_eswitch)) {
+				dest[i].vport.vhca_id =
+					MLX5_CAP_GEN(attr->out_mdev[j], vhca_id);
+				dest[i].vport.vhca_id_valid = 1;
+			}
+			i++;
 		}
-		i++;
 	}
 	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
 		counter = mlx5_fc_create(esw->dev, true);
@@ -173,7 +175,7 @@ esw_vlan_action_get_vport(struct mlx5_esw_flow_attr *attr, bool push, bool pop)
 	struct mlx5_eswitch_rep *in_rep, *out_rep, *vport = NULL;
 
 	in_rep  = attr->in_rep;
-	out_rep = attr->out_rep;
+	out_rep = attr->out_rep[0];
 
 	if (push)
 		vport = in_rep;
@@ -194,7 +196,7 @@ static int esw_add_vlan_action_check(struct mlx5_esw_flow_attr *attr,
 		goto out_notsupp;
 
 	in_rep  = attr->in_rep;
-	out_rep = attr->out_rep;
+	out_rep = attr->out_rep[0];
 
 	if (push && in_rep->vport == FDB_UPLINK_VPORT)
 		goto out_notsupp;
@@ -245,7 +247,7 @@ int mlx5_eswitch_add_vlan_action(struct mlx5_eswitch *esw,
 
 	if (!push && !pop && fwd) {
 		/* tracks VF --> wire rules without vlan push action */
-		if (attr->out_rep->vport == FDB_UPLINK_VPORT) {
+		if (attr->out_rep[0]->vport == FDB_UPLINK_VPORT) {
 			vport->vlan_refcount++;
 			attr->vlan_handled = true;
 		}
@@ -305,7 +307,7 @@ int mlx5_eswitch_del_vlan_action(struct mlx5_eswitch *esw,
 
 	if (!push && !pop && fwd) {
 		/* tracks VF --> wire rules without vlan push action */
-		if (attr->out_rep->vport == FDB_UPLINK_VPORT)
+		if (attr->out_rep[0]->vport == FDB_UPLINK_VPORT)
 			vport->vlan_refcount--;
 
 		return 0;
-- 
cgit v1.2.3-59-g8ed1b


From e4ad91f23f10a9e7e7775f854584f4d54fe7f299 Mon Sep 17 00:00:00 2001
From: Chris Mi <chrism@mellanox.com>
Date: Wed, 16 May 2018 17:54:38 +0900
Subject: net/mlx5e: Split offloaded eswitch TC rules for port mirroring

If a TC rule needs to be split for mirroring, create two HW rules,
in the first level and the second level flow tables accordingly.

In the first level flow table, forward the packet to the mirror
port and forward the packet to the second level flow table for
further processing, eg. encap, vlan push or header re-write.

Currently the matching is repeated in both stages.

While here, simplify the setup of the vhca id valid indicator also
in the existing code.

Signed-off-by: Chris Mi <chrism@mellanox.com>
Reviewed-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    | 57 +++++++++++++-----
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |  4 ++
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 68 +++++++++++++++++++---
 3 files changed, 108 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 302c5500f9ad..9372d914abe5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -75,12 +75,14 @@ enum {
 	MLX5E_TC_FLOW_HAIRPIN_RSS = BIT(MLX5E_TC_FLOW_BASE + 4),
 };
 
+#define MLX5E_TC_MAX_SPLITS 1
+
 struct mlx5e_tc_flow {
 	struct rhash_head	node;
 	struct mlx5e_priv	*priv;
 	u64			cookie;
 	u8			flags;
-	struct mlx5_flow_handle *rule;
+	struct mlx5_flow_handle *rule[MLX5E_TC_MAX_SPLITS + 1];
 	struct list_head	encap;   /* flows sharing the same encap ID */
 	struct list_head	mod_hdr; /* flows sharing the same mod hdr ID */
 	struct list_head	hairpin; /* flows sharing the same hairpin */
@@ -794,8 +796,8 @@ static void mlx5e_tc_del_nic_flow(struct mlx5e_priv *priv,
 	struct mlx5_nic_flow_attr *attr = flow->nic_attr;
 	struct mlx5_fc *counter = NULL;
 
-	counter = mlx5_flow_rule_counter(flow->rule);
-	mlx5_del_flow_rules(flow->rule);
+	counter = mlx5_flow_rule_counter(flow->rule[0]);
+	mlx5_del_flow_rules(flow->rule[0]);
 	mlx5_fc_destroy(priv->mdev, counter);
 
 	if (!mlx5e_tc_num_filters(priv) && priv->fs.tc.t) {
@@ -870,9 +872,18 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
 		rule = mlx5_eswitch_add_offloaded_rule(esw, &parse_attr->spec, attr);
 		if (IS_ERR(rule))
 			goto err_add_rule;
+
+		if (attr->mirror_count) {
+			flow->rule[1] = mlx5_eswitch_add_fwd_rule(esw, &parse_attr->spec, attr);
+			if (IS_ERR(flow->rule[1]))
+				goto err_fwd_rule;
+		}
 	}
 	return rule;
 
+err_fwd_rule:
+	mlx5_eswitch_del_offloaded_rule(esw, rule, attr);
+	rule = flow->rule[1];
 err_add_rule:
 	if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
 		mlx5e_detach_mod_hdr(priv, flow);
@@ -893,7 +904,9 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv,
 
 	if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) {
 		flow->flags &= ~MLX5E_TC_FLOW_OFFLOADED;
-		mlx5_eswitch_del_offloaded_rule(esw, flow->rule, attr);
+		if (attr->mirror_count)
+			mlx5_eswitch_del_offloaded_rule(esw, flow->rule[1], attr);
+		mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], attr);
 	}
 
 	mlx5_eswitch_del_vlan_action(esw, attr);
@@ -929,13 +942,25 @@ void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
 	list_for_each_entry(flow, &e->flows, encap) {
 		esw_attr = flow->esw_attr;
 		esw_attr->encap_id = e->encap_id;
-		flow->rule = mlx5_eswitch_add_offloaded_rule(esw, &esw_attr->parse_attr->spec, esw_attr);
-		if (IS_ERR(flow->rule)) {
-			err = PTR_ERR(flow->rule);
+		flow->rule[0] = mlx5_eswitch_add_offloaded_rule(esw, &esw_attr->parse_attr->spec, esw_attr);
+		if (IS_ERR(flow->rule[0])) {
+			err = PTR_ERR(flow->rule[0]);
 			mlx5_core_warn(priv->mdev, "Failed to update cached encapsulation flow, %d\n",
 				       err);
 			continue;
 		}
+
+		if (esw_attr->mirror_count) {
+			flow->rule[1] = mlx5_eswitch_add_fwd_rule(esw, &esw_attr->parse_attr->spec, esw_attr);
+			if (IS_ERR(flow->rule[1])) {
+				mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], esw_attr);
+				err = PTR_ERR(flow->rule[1]);
+				mlx5_core_warn(priv->mdev, "Failed to update cached mirror flow, %d\n",
+					       err);
+				continue;
+			}
+		}
+
 		flow->flags |= MLX5E_TC_FLOW_OFFLOADED;
 	}
 }
@@ -948,8 +973,12 @@ void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv,
 
 	list_for_each_entry(flow, &e->flows, encap) {
 		if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) {
+			struct mlx5_esw_flow_attr *attr = flow->esw_attr;
+
 			flow->flags &= ~MLX5E_TC_FLOW_OFFLOADED;
-			mlx5_eswitch_del_offloaded_rule(esw, flow->rule, flow->esw_attr);
+			if (attr->mirror_count)
+				mlx5_eswitch_del_offloaded_rule(esw, flow->rule[1], attr);
+			mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], attr);
 		}
 	}
 
@@ -984,7 +1013,7 @@ void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe)
 			continue;
 		list_for_each_entry(flow, &e->flows, encap) {
 			if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) {
-				counter = mlx5_flow_rule_counter(flow->rule);
+				counter = mlx5_flow_rule_counter(flow->rule[0]);
 				mlx5_fc_query_cached(counter, &bytes, &packets, &lastuse);
 				if (time_after((unsigned long)lastuse, nhe->reported_lastuse)) {
 					neigh_used = true;
@@ -2714,16 +2743,16 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv,
 		err = parse_tc_fdb_actions(priv, f->exts, parse_attr, flow);
 		if (err < 0)
 			goto err_free;
-		flow->rule = mlx5e_tc_add_fdb_flow(priv, parse_attr, flow);
+		flow->rule[0] = mlx5e_tc_add_fdb_flow(priv, parse_attr, flow);
 	} else {
 		err = parse_tc_nic_actions(priv, f->exts, parse_attr, flow);
 		if (err < 0)
 			goto err_free;
-		flow->rule = mlx5e_tc_add_nic_flow(priv, parse_attr, flow);
+		flow->rule[0] = mlx5e_tc_add_nic_flow(priv, parse_attr, flow);
 	}
 
-	if (IS_ERR(flow->rule)) {
-		err = PTR_ERR(flow->rule);
+	if (IS_ERR(flow->rule[0])) {
+		err = PTR_ERR(flow->rule[0]);
 		if (err != -EAGAIN)
 			goto err_free;
 	}
@@ -2796,7 +2825,7 @@ int mlx5e_stats_flower(struct mlx5e_priv *priv,
 	if (!(flow->flags & MLX5E_TC_FLOW_OFFLOADED))
 		return 0;
 
-	counter = mlx5_flow_rule_counter(flow->rule);
+	counter = mlx5_flow_rule_counter(flow->rule[0]);
 	if (!counter)
 		return 0;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 386b0e354c41..b174da2884c5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -219,6 +219,10 @@ struct mlx5_flow_handle *
 mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 				struct mlx5_flow_spec *spec,
 				struct mlx5_esw_flow_attr *attr);
+struct mlx5_flow_handle *
+mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw,
+			  struct mlx5_flow_spec *spec,
+			  struct mlx5_esw_flow_attr *attr);
 void
 mlx5_eswitch_del_offloaded_rule(struct mlx5_eswitch *esw,
 				struct mlx5_flow_handle *rule,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 7db8b9a41925..cecd201f0b73 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -50,6 +50,7 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 {
 	struct mlx5_flow_destination dest[MLX5_MAX_FLOW_FWD_VPORTS + 1] = {};
 	struct mlx5_flow_act flow_act = {0};
+	struct mlx5_flow_table *ft = NULL;
 	struct mlx5_fc *counter = NULL;
 	struct mlx5_flow_handle *rule;
 	int j, i = 0;
@@ -58,6 +59,11 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 	if (esw->mode != SRIOV_OFFLOADS)
 		return ERR_PTR(-EOPNOTSUPP);
 
+	if (attr->mirror_count)
+		ft = esw->fdb_table.offloads.fwd_fdb;
+	else
+		ft = esw->fdb_table.offloads.fast_fdb;
+
 	flow_act.action = attr->action;
 	/* if per flow vlan pop/push is emulated, don't set that into the firmware */
 	if (!mlx5_eswitch_vlan_actions_supported(esw->dev))
@@ -73,11 +79,9 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 		for (j = attr->mirror_count; j < attr->out_count; j++) {
 			dest[i].type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
 			dest[i].vport.num = attr->out_rep[j]->vport;
-			if (MLX5_CAP_ESW(esw->dev, merged_eswitch)) {
-				dest[i].vport.vhca_id =
-					MLX5_CAP_GEN(attr->out_mdev[j], vhca_id);
-				dest[i].vport.vhca_id_valid = 1;
-			}
+			dest[i].vport.vhca_id =
+				MLX5_CAP_GEN(attr->out_mdev[j], vhca_id);
+			dest[i].vport.vhca_id_valid = !!MLX5_CAP_ESW(esw->dev, merged_eswitch);
 			i++;
 		}
 	}
@@ -121,8 +125,7 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_ENCAP)
 		flow_act.encap_id = attr->encap_id;
 
-	rule = mlx5_add_flow_rules((struct mlx5_flow_table *)esw->fdb_table.offloads.fast_fdb,
-				   spec, &flow_act, dest, i);
+	rule = mlx5_add_flow_rules(ft, spec, &flow_act, dest, i);
 	if (IS_ERR(rule))
 		goto err_add_rule;
 	else
@@ -136,6 +139,57 @@ err_counter_alloc:
 	return rule;
 }
 
+struct mlx5_flow_handle *
+mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw,
+			  struct mlx5_flow_spec *spec,
+			  struct mlx5_esw_flow_attr *attr)
+{
+	struct mlx5_flow_destination dest[MLX5_MAX_FLOW_FWD_VPORTS + 1] = {};
+	struct mlx5_flow_act flow_act = {0};
+	struct mlx5_flow_handle *rule;
+	void *misc;
+	int i;
+
+	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+	for (i = 0; i < attr->mirror_count; i++) {
+		dest[i].type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
+		dest[i].vport.num = attr->out_rep[i]->vport;
+		dest[i].vport.vhca_id =
+			MLX5_CAP_GEN(attr->out_mdev[i], vhca_id);
+		dest[i].vport.vhca_id_valid = !!MLX5_CAP_ESW(esw->dev, merged_eswitch);
+	}
+	dest[i].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
+	dest[i].ft = esw->fdb_table.offloads.fwd_fdb,
+	i++;
+
+	misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters);
+	MLX5_SET(fte_match_set_misc, misc, source_port, attr->in_rep->vport);
+
+	if (MLX5_CAP_ESW(esw->dev, merged_eswitch))
+		MLX5_SET(fte_match_set_misc, misc,
+			 source_eswitch_owner_vhca_id,
+			 MLX5_CAP_GEN(attr->in_mdev, vhca_id));
+
+	misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters);
+	MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port);
+	if (MLX5_CAP_ESW(esw->dev, merged_eswitch))
+		MLX5_SET_TO_ONES(fte_match_set_misc, misc,
+				 source_eswitch_owner_vhca_id);
+
+	if (attr->match_level == MLX5_MATCH_NONE)
+		spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS;
+	else
+		spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS |
+					      MLX5_MATCH_MISC_PARAMETERS;
+
+	rule = mlx5_add_flow_rules(esw->fdb_table.offloads.fast_fdb, spec, &flow_act, dest, i);
+
+	if (!IS_ERR(rule))
+		esw->offloads.num_flows++;
+
+	return rule;
+}
+
 void
 mlx5_eswitch_del_offloaded_rule(struct mlx5_eswitch *esw,
 				struct mlx5_flow_handle *rule,
-- 
cgit v1.2.3-59-g8ed1b


From ddf385e31f574c1c47215b6b1cf53343d7d204d6 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Wed, 2 May 2018 18:30:56 +0300
Subject: net/mlx5e: Use WQ API functions instead of direct fields access

Use the WQ API to get the WQ size, and to map a counter
into a WQ entry index.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |  9 +++---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 37 ++++++++++++++---------
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   | 25 +++++++--------
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c   |  8 ++---
 drivers/net/ethernet/mellanox/mlx5/core/wq.h      | 19 ++++++++++--
 5 files changed, 60 insertions(+), 38 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index c5c7a6d687ff..061c4e90692e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -450,7 +450,7 @@ struct mlx5e_icosq {
 static inline bool
 mlx5e_wqc_has_room_for(struct mlx5_wq_cyc *wq, u16 cc, u16 pc, u16 n)
 {
-	return (((wq->sz_m1 & (cc - pc)) >= n) || (cc == pc));
+	return (mlx5_wq_cyc_ctr2ix(wq, cc - pc) >= n) || (cc == pc);
 }
 
 struct mlx5e_dma_info {
@@ -956,10 +956,9 @@ static inline void mlx5e_sq_fetch_wqe(struct mlx5e_txqsq *sq,
 				      struct mlx5e_tx_wqe **wqe,
 				      u16 *pi)
 {
-	struct mlx5_wq_cyc *wq;
+	struct mlx5_wq_cyc *wq = &sq->wq;
 
-	wq = &sq->wq;
-	*pi = sq->pc & wq->sz_m1;
+	*pi  = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
 	*wqe = mlx5_wq_cyc_get_wqe(wq, *pi);
 	memset(*wqe, 0, sizeof(**wqe));
 }
@@ -967,7 +966,7 @@ static inline void mlx5e_sq_fetch_wqe(struct mlx5e_txqsq *sq,
 static inline
 struct mlx5e_tx_wqe *mlx5e_post_nop(struct mlx5_wq_cyc *wq, u32 sqn, u16 *pc)
 {
-	u16                         pi   = *pc & wq->sz_m1;
+	u16                         pi   = mlx5_wq_cyc_ctr2ix(wq, *pc);
 	struct mlx5e_tx_wqe        *wqe  = mlx5_wq_cyc_get_wqe(wq, pi);
 	struct mlx5_wqe_ctrl_seg   *cseg = &wqe->ctrl;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index cee44c21766c..41f57afc5140 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -836,13 +836,15 @@ err_free_rq:
 static void mlx5e_activate_rq(struct mlx5e_rq *rq)
 {
 	struct mlx5e_icosq *sq = &rq->channel->icosq;
-	u16 pi = sq->pc & sq->wq.sz_m1;
+	struct mlx5_wq_cyc *wq = &sq->wq;
 	struct mlx5e_tx_wqe *nopwqe;
 
+	u16 pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
+
 	set_bit(MLX5E_RQ_STATE_ENABLED, &rq->state);
 	sq->db.ico_wqe[pi].opcode     = MLX5_OPCODE_NOP;
-	nopwqe = mlx5e_post_nop(&sq->wq, sq->sqn, &sq->pc);
-	mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, &nopwqe->ctrl);
+	nopwqe = mlx5e_post_nop(wq, sq->sqn, &sq->pc);
+	mlx5e_notify_hw(wq, sq->pc, sq->uar_map, &nopwqe->ctrl);
 }
 
 static void mlx5e_deactivate_rq(struct mlx5e_rq *rq)
@@ -885,6 +887,7 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c,
 {
 	void *sqc_wq               = MLX5_ADDR_OF(sqc, param->sqc, wq);
 	struct mlx5_core_dev *mdev = c->mdev;
+	struct mlx5_wq_cyc *wq = &sq->wq;
 	int err;
 
 	sq->pdev      = c->pdev;
@@ -894,10 +897,10 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c,
 	sq->min_inline_mode = params->tx_min_inline_mode;
 
 	param->wq.db_numa_node = cpu_to_node(c->cpu);
-	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
+	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, wq, &sq->wq_ctrl);
 	if (err)
 		return err;
-	sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
+	wq->db = &wq->db[MLX5_SND_DBR];
 
 	err = mlx5e_alloc_xdpsq_db(sq, cpu_to_node(c->cpu));
 	if (err)
@@ -940,22 +943,23 @@ static int mlx5e_alloc_icosq(struct mlx5e_channel *c,
 {
 	void *sqc_wq               = MLX5_ADDR_OF(sqc, param->sqc, wq);
 	struct mlx5_core_dev *mdev = c->mdev;
+	struct mlx5_wq_cyc *wq = &sq->wq;
 	int err;
 
 	sq->channel   = c;
 	sq->uar_map   = mdev->mlx5e_res.bfreg.map;
 
 	param->wq.db_numa_node = cpu_to_node(c->cpu);
-	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
+	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, wq, &sq->wq_ctrl);
 	if (err)
 		return err;
-	sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
+	wq->db = &wq->db[MLX5_SND_DBR];
 
 	err = mlx5e_alloc_icosq_db(sq, cpu_to_node(c->cpu));
 	if (err)
 		goto err_sq_wq_destroy;
 
-	sq->edge = (sq->wq.sz_m1 + 1) - MLX5E_ICOSQ_MAX_WQEBBS;
+	sq->edge = mlx5_wq_cyc_get_size(wq) - MLX5E_ICOSQ_MAX_WQEBBS;
 
 	return 0;
 
@@ -1005,6 +1009,7 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
 {
 	void *sqc_wq               = MLX5_ADDR_OF(sqc, param->sqc, wq);
 	struct mlx5_core_dev *mdev = c->mdev;
+	struct mlx5_wq_cyc *wq = &sq->wq;
 	int err;
 
 	sq->pdev      = c->pdev;
@@ -1022,10 +1027,10 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
 		set_bit(MLX5E_SQ_STATE_TLS, &sq->state);
 
 	param->wq.db_numa_node = cpu_to_node(c->cpu);
-	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
+	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, wq, &sq->wq_ctrl);
 	if (err)
 		return err;
-	sq->wq.db    = &sq->wq.db[MLX5_SND_DBR];
+	wq->db    = &wq->db[MLX5_SND_DBR];
 
 	err = mlx5e_alloc_txqsq_db(sq, cpu_to_node(c->cpu));
 	if (err)
@@ -1034,7 +1039,7 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
 	INIT_WORK(&sq->dim.work, mlx5e_tx_dim_work);
 	sq->dim.mode = params->tx_cq_moderation.cq_period_mode;
 
-	sq->edge = (sq->wq.sz_m1 + 1) - MLX5_SEND_WQE_MAX_WQEBBS;
+	sq->edge = mlx5_wq_cyc_get_size(wq) - MLX5_SEND_WQE_MAX_WQEBBS;
 
 	return 0;
 
@@ -1238,6 +1243,7 @@ static inline void netif_tx_disable_queue(struct netdev_queue *txq)
 static void mlx5e_deactivate_txqsq(struct mlx5e_txqsq *sq)
 {
 	struct mlx5e_channel *c = sq->channel;
+	struct mlx5_wq_cyc *wq = &sq->wq;
 
 	clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
 	/* prevent netif_tx_wake_queue */
@@ -1246,12 +1252,13 @@ static void mlx5e_deactivate_txqsq(struct mlx5e_txqsq *sq)
 	netif_tx_disable_queue(sq->txq);
 
 	/* last doorbell out, godspeed .. */
-	if (mlx5e_wqc_has_room_for(&sq->wq, sq->cc, sq->pc, 1)) {
+	if (mlx5e_wqc_has_room_for(wq, sq->cc, sq->pc, 1)) {
+		u16 pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
 		struct mlx5e_tx_wqe *nop;
 
-		sq->db.wqe_info[(sq->pc & sq->wq.sz_m1)].skb = NULL;
-		nop = mlx5e_post_nop(&sq->wq, sq->sqn, &sq->pc);
-		mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, &nop->ctrl);
+		sq->db.wqe_info[pi].skb = NULL;
+		nop = mlx5e_post_nop(wq, sq->sqn, &sq->pc);
+		mlx5e_notify_hw(wq, sq->pc, sq->uar_map, &nop->ctrl);
 	}
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 53f72923b164..7fd3ec877ba4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -54,7 +54,7 @@ static inline bool mlx5e_rx_hw_stamp(struct hwtstamp_config *config)
 static inline void mlx5e_read_cqe_slot(struct mlx5e_cq *cq, u32 cqcc,
 				       void *data)
 {
-	u32 ci = cqcc & cq->wq.fbc.sz_m1;
+	u32 ci = mlx5_cqwq_ctr2ix(&cq->wq, cqcc);
 
 	memcpy(data, mlx5_cqwq_get_wqe(&cq->wq, ci), sizeof(struct mlx5_cqe64));
 }
@@ -76,10 +76,11 @@ static inline void mlx5e_read_mini_arr_slot(struct mlx5e_cq *cq, u32 cqcc)
 
 static inline void mlx5e_cqes_update_owner(struct mlx5e_cq *cq, u32 cqcc, int n)
 {
-	struct mlx5_frag_buf_ctrl *fbc = &cq->wq.fbc;
-	u8 op_own = (cqcc >> fbc->log_sz) & 1;
-	u32 wq_sz = 1 << fbc->log_sz;
-	u32 ci = cqcc & fbc->sz_m1;
+	struct mlx5_cqwq *wq = &cq->wq;
+
+	u8  op_own = mlx5_cqwq_get_ctr_wrap_cnt(wq, cqcc) & 1;
+	u32 ci     = mlx5_cqwq_ctr2ix(wq, cqcc);
+	u32 wq_sz  = mlx5_cqwq_get_size(wq);
 	u32 ci_top = min_t(u32, wq_sz, ci + n);
 
 	for (; ci < ci_top; ci++, n--) {
@@ -112,7 +113,7 @@ static inline void mlx5e_decompress_cqe(struct mlx5e_rq *rq,
 			mpwrq_get_cqe_consumed_strides(&cq->title);
 	else
 		cq->decmprs_wqe_counter =
-			(cq->decmprs_wqe_counter + 1) & rq->wq.sz_m1;
+			mlx5_wq_ll_ctr2ix(&rq->wq, cq->decmprs_wqe_counter + 1);
 }
 
 static inline void mlx5e_decompress_cqe_no_hash(struct mlx5e_rq *rq,
@@ -395,7 +396,7 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 	int i;
 
 	/* fill sq edge with nops to avoid wqe wrap around */
-	while ((pi = (sq->pc & wq->sz_m1)) > sq->edge) {
+	while ((pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc)) > sq->edge) {
 		sq->db.ico_wqe[pi].opcode = MLX5_OPCODE_NOP;
 		mlx5e_post_nop(wq, sq->sqn, &sq->pc);
 	}
@@ -482,7 +483,7 @@ static inline void mlx5e_poll_ico_single_cqe(struct mlx5e_cq *cq,
 					     struct mlx5_cqe64 *cqe)
 {
 	struct mlx5_wq_cyc *wq = &sq->wq;
-	u16 ci = be16_to_cpu(cqe->wqe_counter) & wq->sz_m1;
+	u16 ci = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter));
 	struct mlx5e_sq_wqe_info *icowi = &sq->db.ico_wqe[ci];
 
 	mlx5_cqwq_pop(&cq->wq);
@@ -731,7 +732,7 @@ static inline void mlx5e_xmit_xdp_doorbell(struct mlx5e_xdpsq *sq)
 {
 	struct mlx5_wq_cyc *wq = &sq->wq;
 	struct mlx5e_tx_wqe *wqe;
-	u16 pi = (sq->pc - 1) & wq->sz_m1; /* last pi */
+	u16 pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc - 1); /* last pi */
 
 	wqe  = mlx5_wq_cyc_get_wqe(wq, pi);
 
@@ -744,7 +745,7 @@ static inline bool mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq,
 {
 	struct mlx5e_xdpsq       *sq   = &rq->xdpsq;
 	struct mlx5_wq_cyc       *wq   = &sq->wq;
-	u16                       pi   = sq->pc & wq->sz_m1;
+	u16                       pi   = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
 	struct mlx5e_tx_wqe      *wqe  = mlx5_wq_cyc_get_wqe(wq, pi);
 
 	struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl;
@@ -1214,7 +1215,7 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq)
 
 			last_wqe = (sqcc == wqe_counter);
 
-			ci = sqcc & sq->wq.sz_m1;
+			ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sqcc);
 			di = &sq->db.di[ci];
 
 			sqcc++;
@@ -1239,7 +1240,7 @@ void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq)
 	u16 ci;
 
 	while (sq->cc != sq->pc) {
-		ci = sq->cc & sq->wq.sz_m1;
+		ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->cc);
 		di = &sq->db.di[ci];
 		sq->cc++;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 2d3f17da5f5c..c9864fcf00b9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -331,7 +331,7 @@ mlx5e_txwqe_complete(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 		mlx5e_notify_hw(wq, sq->pc, sq->uar_map, cseg);
 
 	/* fill sq edge with nops to avoid wqe wrap around */
-	while ((pi = (sq->pc & wq->sz_m1)) > sq->edge) {
+	while ((pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc)) > sq->edge) {
 		sq->db.wqe_info[pi].skb = NULL;
 		mlx5e_post_nop(wq, sq->sqn, &sq->pc);
 		sq->stats.nop++;
@@ -496,7 +496,7 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget)
 
 			last_wqe = (sqcc == wqe_counter);
 
-			ci = sqcc & sq->wq.sz_m1;
+			ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sqcc);
 			wi = &sq->db.wqe_info[ci];
 			skb = wi->skb;
 
@@ -559,7 +559,7 @@ void mlx5e_free_txqsq_descs(struct mlx5e_txqsq *sq)
 	int i;
 
 	while (sq->cc != sq->pc) {
-		ci = sq->cc & sq->wq.sz_m1;
+		ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->cc);
 		wi = &sq->db.wqe_info[ci];
 		skb = wi->skb;
 
@@ -606,7 +606,7 @@ netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 			  struct mlx5_av *av, u32 dqpn, u32 dqkey)
 {
 	struct mlx5_wq_cyc       *wq   = &sq->wq;
-	u16                       pi   = sq->pc & wq->sz_m1;
+	u16                       pi   = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
 	struct mlx5i_tx_wqe      *wqe  = mlx5_wq_cyc_get_wqe(wq, pi);
 	struct mlx5e_tx_wqe_info *wi   = &sq->db.wqe_info[pi];
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.h b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
index f3dfa0ca3c5d..a3572e148f09 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/wq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
@@ -123,9 +123,14 @@ static inline int mlx5_wq_cyc_cc_bigger(u16 cc1, u16 cc2)
 	return !equal && !smaller;
 }
 
+static inline u32 mlx5_cqwq_ctr2ix(struct mlx5_cqwq *wq, u32 ctr)
+{
+	return ctr & wq->fbc.sz_m1;
+}
+
 static inline u32 mlx5_cqwq_get_ci(struct mlx5_cqwq *wq)
 {
-	return wq->cc & wq->fbc.sz_m1;
+	return mlx5_cqwq_ctr2ix(wq, wq->cc);
 }
 
 static inline void *mlx5_cqwq_get_wqe(struct mlx5_cqwq *wq, u32 ix)
@@ -133,9 +138,14 @@ static inline void *mlx5_cqwq_get_wqe(struct mlx5_cqwq *wq, u32 ix)
 	return mlx5_frag_buf_get_wqe(&wq->fbc, ix);
 }
 
+static inline u32 mlx5_cqwq_get_ctr_wrap_cnt(struct mlx5_cqwq *wq, u32 ctr)
+{
+	return ctr >> wq->fbc.log_sz;
+}
+
 static inline u32 mlx5_cqwq_get_wrap_cnt(struct mlx5_cqwq *wq)
 {
-	return wq->cc >> wq->fbc.log_sz;
+	return mlx5_cqwq_get_ctr_wrap_cnt(wq, wq->cc);
 }
 
 static inline void mlx5_cqwq_pop(struct mlx5_cqwq *wq)
@@ -174,6 +184,11 @@ static inline int mlx5_wq_ll_is_empty(struct mlx5_wq_ll *wq)
 	return !wq->cur_sz;
 }
 
+static inline u16 mlx5_wq_ll_ctr2ix(struct mlx5_wq_ll *wq, u16 ctr)
+{
+	return ctr & wq->sz_m1;
+}
+
 static inline void *mlx5_wq_ll_get_wqe(struct mlx5_wq_ll *wq, u16 ix)
 {
 	return wq->buf + (ix << wq->log_stride);
-- 
cgit v1.2.3-59-g8ed1b


From 043dc78ecf07f3fc5b87270518d7f322aea2f748 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Wed, 21 Mar 2018 16:31:08 +0200
Subject: net/mlx5e: TX, Use actual WQE size for SQ edge fill

We fill SQ edge with NOPs to avoid WQEs wrap.
Here, instead of doing that in advance for the maximum possible
WQE size, we do it on-demand using the actual WQE size.
We re-order some parts in mlx5e_sq_xmit to finish the calculation
of WQE size (ds_cnt) before doing any writes to the WQE buffer.

When SQ work queue is fragmented (introduced in an downstream patch),
dealing with WQE wraps becomes more frequent. This change would drastically
reduce the overhead in this case.

Performance tests:
ConnectX-5 100Gbps, CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz
Packet rate of 64B packets, single transmit ring, size 8K.

Before: 14.9 Mpps
After:  15.8 Mpps

Improvement of 6%.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |   3 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   4 -
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c    |  27 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c    | 213 +++++++++++++--------
 .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h  |  23 +++
 5 files changed, 178 insertions(+), 92 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 061c4e90692e..3c0f0a0343fd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -183,6 +183,7 @@ static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev)
 struct mlx5e_tx_wqe {
 	struct mlx5_wqe_ctrl_seg ctrl;
 	struct mlx5_wqe_eth_seg  eth;
+	struct mlx5_wqe_data_seg data[0];
 };
 
 struct mlx5e_rx_wqe {
@@ -374,7 +375,6 @@ struct mlx5e_txqsq {
 	struct netdev_queue       *txq;
 	u32                        sqn;
 	u8                         min_inline_mode;
-	u16                        edge;
 	struct device             *pdev;
 	__be32                     mkey_be;
 	unsigned long              state;
@@ -439,7 +439,6 @@ struct mlx5e_icosq {
 	struct mlx5_wq_cyc         wq;
 	void __iomem              *uar_map;
 	u32                        sqn;
-	u16                        edge;
 	unsigned long              state;
 
 	/* control path */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 41f57afc5140..a8b1e43384ca 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -959,8 +959,6 @@ static int mlx5e_alloc_icosq(struct mlx5e_channel *c,
 	if (err)
 		goto err_sq_wq_destroy;
 
-	sq->edge = mlx5_wq_cyc_get_size(wq) - MLX5E_ICOSQ_MAX_WQEBBS;
-
 	return 0;
 
 err_sq_wq_destroy:
@@ -1039,8 +1037,6 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
 	INIT_WORK(&sq->dim.work, mlx5e_tx_dim_work);
 	sq->dim.mode = params->tx_cq_moderation.cq_period_mode;
 
-	sq->edge = mlx5_wq_cyc_get_size(wq) - MLX5_SEND_WQE_MAX_WQEBBS;
-
 	return 0;
 
 err_sq_wq_destroy:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 7fd3ec877ba4..f4d2c8886492 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -383,6 +383,22 @@ static inline u16 mlx5e_icosq_wrap_cnt(struct mlx5e_icosq *sq)
 	return sq->pc >> MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE;
 }
 
+static inline void mlx5e_fill_icosq_edge(struct mlx5e_icosq *sq,
+					 struct mlx5_wq_cyc *wq,
+					 u16 pi)
+{
+	struct mlx5e_sq_wqe_info *edge_wi, *wi = &sq->db.ico_wqe[pi];
+	u8 nnops = mlx5_wq_cyc_get_size(wq) - pi;
+
+	edge_wi = wi + nnops;
+
+	/* fill sq edge with nops to avoid wqe wrapping two pages */
+	for (; wi < edge_wi; wi++) {
+		wi->opcode = MLX5_OPCODE_NOP;
+		mlx5e_post_nop(wq, sq->sqn, &sq->pc);
+	}
+}
+
 static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 {
 	struct mlx5e_mpw_info *wi = &rq->mpwqe.info[ix];
@@ -391,14 +407,15 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 	struct mlx5_wq_cyc *wq = &sq->wq;
 	struct mlx5e_umr_wqe *umr_wqe;
 	u16 xlt_offset = ix << (MLX5E_LOG_ALIGNED_MPWQE_PPW - 1);
-	int err;
 	u16 pi;
+	int err;
 	int i;
 
-	/* fill sq edge with nops to avoid wqe wrap around */
-	while ((pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc)) > sq->edge) {
-		sq->db.ico_wqe[pi].opcode = MLX5_OPCODE_NOP;
-		mlx5e_post_nop(wq, sq->sqn, &sq->pc);
+	pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
+
+	if (unlikely(pi + MLX5E_UMR_WQEBBS > mlx5_wq_cyc_get_size(wq))) {
+		mlx5e_fill_icosq_edge(sq, wq, pi);
+		pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
 	}
 
 	umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index c9864fcf00b9..fc68e72b0b2b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -230,13 +230,10 @@ mlx5e_txwqe_build_eseg_csum(struct mlx5e_txqsq *sq, struct sk_buff *skb, struct
 }
 
 static inline u16
-mlx5e_txwqe_build_eseg_gso(struct mlx5e_txqsq *sq, struct sk_buff *skb,
-			   struct mlx5_wqe_eth_seg *eseg, unsigned int *num_bytes)
+mlx5e_tx_get_gso_ihs(struct mlx5e_txqsq *sq, struct sk_buff *skb)
 {
 	u16 ihs;
 
-	eseg->mss    = cpu_to_be16(skb_shinfo(skb)->gso_size);
-
 	if (skb->encapsulation) {
 		ihs = skb_inner_transport_offset(skb) + inner_tcp_hdrlen(skb);
 		sq->stats.tso_inner_packets++;
@@ -247,7 +244,6 @@ mlx5e_txwqe_build_eseg_gso(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 		sq->stats.tso_bytes += skb->len - ihs;
 	}
 
-	*num_bytes = skb->len + (skb_shinfo(skb)->gso_segs - 1) * ihs;
 	return ihs;
 }
 
@@ -300,17 +296,34 @@ dma_unmap_wqe_err:
 	return -ENOMEM;
 }
 
+static inline void mlx5e_fill_sq_edge(struct mlx5e_txqsq *sq,
+				      struct mlx5_wq_cyc *wq,
+				      u16 pi)
+{
+	struct mlx5e_tx_wqe_info *edge_wi, *wi = &sq->db.wqe_info[pi];
+	u8 nnops = mlx5_wq_cyc_get_size(wq) - pi;
+
+	edge_wi = wi + nnops;
+
+	/* fill sq edge with nops to avoid wqe wrap around */
+	for (; wi < edge_wi; wi++) {
+		wi->skb        = NULL;
+		wi->num_wqebbs = 1;
+		mlx5e_post_nop(wq, sq->sqn, &sq->pc);
+	}
+	sq->stats.nop += nnops;
+}
+
 static inline void
 mlx5e_txwqe_complete(struct mlx5e_txqsq *sq, struct sk_buff *skb,
-		     u8 opcode, u16 ds_cnt, u32 num_bytes, u8 num_dma,
+		     u8 opcode, u16 ds_cnt, u8 num_wqebbs, u32 num_bytes, u8 num_dma,
 		     struct mlx5e_tx_wqe_info *wi, struct mlx5_wqe_ctrl_seg *cseg)
 {
 	struct mlx5_wq_cyc *wq = &sq->wq;
-	u16 pi;
 
 	wi->num_bytes = num_bytes;
 	wi->num_dma = num_dma;
-	wi->num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
+	wi->num_wqebbs = num_wqebbs;
 	wi->skb = skb;
 
 	cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | opcode);
@@ -329,58 +342,85 @@ mlx5e_txwqe_complete(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 
 	if (!skb->xmit_more || netif_xmit_stopped(sq->txq))
 		mlx5e_notify_hw(wq, sq->pc, sq->uar_map, cseg);
-
-	/* fill sq edge with nops to avoid wqe wrap around */
-	while ((pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc)) > sq->edge) {
-		sq->db.wqe_info[pi].skb = NULL;
-		mlx5e_post_nop(wq, sq->sqn, &sq->pc);
-		sq->stats.nop++;
-	}
 }
 
+#define INL_HDR_START_SZ (sizeof(((struct mlx5_wqe_eth_seg *)NULL)->inline_hdr.start))
+
 netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 			  struct mlx5e_tx_wqe *wqe, u16 pi)
 {
-	struct mlx5e_tx_wqe_info *wi   = &sq->db.wqe_info[pi];
-
-	struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl;
-	struct mlx5_wqe_eth_seg  *eseg = &wqe->eth;
+	struct mlx5_wq_cyc *wq = &sq->wq;
+	struct mlx5_wqe_ctrl_seg *cseg;
+	struct mlx5_wqe_eth_seg  *eseg;
+	struct mlx5_wqe_data_seg *dseg;
+	struct mlx5e_tx_wqe_info *wi;
 
 	unsigned char *skb_data = skb->data;
 	unsigned int skb_len = skb->len;
-	u8  opcode = MLX5_OPCODE_SEND;
-	unsigned int num_bytes;
+	u16 ds_cnt, ds_cnt_inl = 0;
+	u8 num_wqebbs, opcode;
+	u16 headlen, ihs;
+	u32 num_bytes;
 	int num_dma;
-	u16 headlen;
-	u16 ds_cnt;
-	u16 ihs;
-
-	mlx5e_txwqe_build_eseg_csum(sq, skb, eseg);
+	__be16 mss;
 
+	/* Calc ihs and ds cnt, no writes to wqe yet */
+	ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS;
 	if (skb_is_gso(skb)) {
-		opcode = MLX5_OPCODE_LSO;
-		ihs = mlx5e_txwqe_build_eseg_gso(sq, skb, eseg, &num_bytes);
+		opcode    = MLX5_OPCODE_LSO;
+		mss       = cpu_to_be16(skb_shinfo(skb)->gso_size);
+		ihs       = mlx5e_tx_get_gso_ihs(sq, skb);
+		num_bytes = skb->len + (skb_shinfo(skb)->gso_segs - 1) * ihs;
 		sq->stats.packets += skb_shinfo(skb)->gso_segs;
 	} else {
-		ihs = mlx5e_calc_min_inline(sq->min_inline_mode, skb);
+		opcode    = MLX5_OPCODE_SEND;
+		mss       = 0;
+		ihs       = mlx5e_calc_min_inline(sq->min_inline_mode, skb);
 		num_bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
 		sq->stats.packets++;
 	}
-	sq->stats.bytes += num_bytes;
+
+	sq->stats.bytes     += num_bytes;
 	sq->stats.xmit_more += skb->xmit_more;
 
-	ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS;
+	headlen = skb_len - ihs - skb->data_len;
+	ds_cnt += !!headlen;
+	ds_cnt += skb_shinfo(skb)->nr_frags;
+
+	if (ihs) {
+		ihs += !!skb_vlan_tag_present(skb) * VLAN_HLEN;
+
+		ds_cnt_inl = DIV_ROUND_UP(ihs - INL_HDR_START_SZ, MLX5_SEND_WQE_DS);
+		ds_cnt += ds_cnt_inl;
+	}
+
+	num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
+	if (unlikely(pi + num_wqebbs > mlx5_wq_cyc_get_size(wq))) {
+		mlx5e_fill_sq_edge(sq, wq, pi);
+		mlx5e_sq_fetch_wqe(sq, &wqe, &pi);
+	}
+
+	/* fill wqe */
+	wi   = &sq->db.wqe_info[pi];
+	cseg = &wqe->ctrl;
+	eseg = &wqe->eth;
+	dseg =  wqe->data;
+
+	mlx5e_txwqe_build_eseg_csum(sq, skb, eseg);
+
+	eseg->mss = mss;
+
 	if (ihs) {
 		if (skb_vlan_tag_present(skb)) {
-			mlx5e_insert_vlan(eseg->inline_hdr.start, skb, ihs, &skb_data, &skb_len);
-			ihs += VLAN_HLEN;
+			mlx5e_insert_vlan(eseg->inline_hdr.start, skb,
+					  ihs - VLAN_HLEN, &skb_data, &skb_len);
 			sq->stats.added_vlan_packets++;
 		} else {
 			memcpy(eseg->inline_hdr.start, skb_data, ihs);
 			mlx5e_tx_skb_pull_inline(&skb_data, &skb_len, ihs);
 		}
 		eseg->inline_hdr.sz = cpu_to_be16(ihs);
-		ds_cnt += DIV_ROUND_UP(ihs - sizeof(eseg->inline_hdr.start), MLX5_SEND_WQE_DS);
+		dseg += ds_cnt_inl;
 	} else if (skb_vlan_tag_present(skb)) {
 		eseg->insert.type = cpu_to_be16(MLX5_ETH_WQE_INSERT_VLAN);
 		if (skb->vlan_proto == cpu_to_be16(ETH_P_8021AD))
@@ -389,14 +429,12 @@ netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 		sq->stats.added_vlan_packets++;
 	}
 
-	headlen = skb_len - skb->data_len;
-	num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb_data, headlen,
-					  (struct mlx5_wqe_data_seg *)cseg + ds_cnt);
+	num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb_data, headlen, dseg);
 	if (unlikely(num_dma < 0))
 		goto err_drop;
 
-	mlx5e_txwqe_complete(sq, skb, opcode, ds_cnt + num_dma,
-			     num_bytes, num_dma, wi, cseg);
+	mlx5e_txwqe_complete(sq, skb, opcode, ds_cnt, num_wqebbs, num_bytes,
+			     num_dma, wi, cseg);
 
 	return NETDEV_TX_OK;
 
@@ -581,18 +619,6 @@ void mlx5e_free_txqsq_descs(struct mlx5e_txqsq *sq)
 }
 
 #ifdef CONFIG_MLX5_CORE_IPOIB
-
-struct mlx5_wqe_eth_pad {
-	u8 rsvd0[16];
-};
-
-struct mlx5i_tx_wqe {
-	struct mlx5_wqe_ctrl_seg     ctrl;
-	struct mlx5_wqe_datagram_seg datagram;
-	struct mlx5_wqe_eth_pad      pad;
-	struct mlx5_wqe_eth_seg      eth;
-};
-
 static inline void
 mlx5i_txwqe_build_datagram(struct mlx5_av *av, u32 dqpn, u32 dqkey,
 			   struct mlx5_wqe_datagram_seg *dseg)
@@ -605,59 +631,85 @@ mlx5i_txwqe_build_datagram(struct mlx5_av *av, u32 dqpn, u32 dqkey,
 netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 			  struct mlx5_av *av, u32 dqpn, u32 dqkey)
 {
-	struct mlx5_wq_cyc       *wq   = &sq->wq;
-	u16                       pi   = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
-	struct mlx5i_tx_wqe      *wqe  = mlx5_wq_cyc_get_wqe(wq, pi);
-	struct mlx5e_tx_wqe_info *wi   = &sq->db.wqe_info[pi];
+	struct mlx5_wq_cyc *wq = &sq->wq;
+	struct mlx5i_tx_wqe *wqe;
 
-	struct mlx5_wqe_ctrl_seg     *cseg = &wqe->ctrl;
-	struct mlx5_wqe_datagram_seg *datagram = &wqe->datagram;
-	struct mlx5_wqe_eth_seg      *eseg = &wqe->eth;
+	struct mlx5_wqe_datagram_seg *datagram;
+	struct mlx5_wqe_ctrl_seg *cseg;
+	struct mlx5_wqe_eth_seg  *eseg;
+	struct mlx5_wqe_data_seg *dseg;
+	struct mlx5e_tx_wqe_info *wi;
 
 	unsigned char *skb_data = skb->data;
 	unsigned int skb_len = skb->len;
-	u8  opcode = MLX5_OPCODE_SEND;
-	unsigned int num_bytes;
+	u16 ds_cnt, ds_cnt_inl = 0;
+	u8 num_wqebbs, opcode;
+	u16 headlen, ihs, pi;
+	u32 num_bytes;
 	int num_dma;
-	u16 headlen;
-	u16 ds_cnt;
-	u16 ihs;
-
-	memset(wqe, 0, sizeof(*wqe));
+	__be16 mss;
 
-	mlx5i_txwqe_build_datagram(av, dqpn, dqkey, datagram);
-
-	mlx5e_txwqe_build_eseg_csum(sq, skb, eseg);
+	mlx5i_sq_fetch_wqe(sq, &wqe, &pi);
 
+	/* Calc ihs and ds cnt, no writes to wqe yet */
+	ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS;
 	if (skb_is_gso(skb)) {
-		opcode = MLX5_OPCODE_LSO;
-		ihs = mlx5e_txwqe_build_eseg_gso(sq, skb, eseg, &num_bytes);
+		opcode    = MLX5_OPCODE_LSO;
+		mss       = cpu_to_be16(skb_shinfo(skb)->gso_size);
+		ihs       = mlx5e_tx_get_gso_ihs(sq, skb);
+		num_bytes = skb->len + (skb_shinfo(skb)->gso_segs - 1) * ihs;
 		sq->stats.packets += skb_shinfo(skb)->gso_segs;
 	} else {
-		ihs = mlx5e_calc_min_inline(sq->min_inline_mode, skb);
+		opcode    = MLX5_OPCODE_SEND;
+		mss       = 0;
+		ihs       = mlx5e_calc_min_inline(sq->min_inline_mode, skb);
 		num_bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
 		sq->stats.packets++;
 	}
 
-	sq->stats.bytes += num_bytes;
+	sq->stats.bytes     += num_bytes;
 	sq->stats.xmit_more += skb->xmit_more;
 
-	ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS;
+	headlen = skb_len - ihs - skb->data_len;
+	ds_cnt += !!headlen;
+	ds_cnt += skb_shinfo(skb)->nr_frags;
+
+	if (ihs) {
+		ds_cnt_inl = DIV_ROUND_UP(ihs - INL_HDR_START_SZ, MLX5_SEND_WQE_DS);
+		ds_cnt += ds_cnt_inl;
+	}
+
+	num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
+	if (unlikely(pi + num_wqebbs > mlx5_wq_cyc_get_size(wq))) {
+		mlx5e_fill_sq_edge(sq, wq, pi);
+		mlx5i_sq_fetch_wqe(sq, &wqe, &pi);
+	}
+
+	/* fill wqe */
+	wi       = &sq->db.wqe_info[pi];
+	cseg     = &wqe->ctrl;
+	datagram = &wqe->datagram;
+	eseg     = &wqe->eth;
+	dseg     =  wqe->data;
+
+	mlx5i_txwqe_build_datagram(av, dqpn, dqkey, datagram);
+
+	mlx5e_txwqe_build_eseg_csum(sq, skb, eseg);
+
+	eseg->mss = mss;
+
 	if (ihs) {
 		memcpy(eseg->inline_hdr.start, skb_data, ihs);
-		mlx5e_tx_skb_pull_inline(&skb_data, &skb_len, ihs);
 		eseg->inline_hdr.sz = cpu_to_be16(ihs);
-		ds_cnt += DIV_ROUND_UP(ihs - sizeof(eseg->inline_hdr.start), MLX5_SEND_WQE_DS);
+		dseg += ds_cnt_inl;
 	}
 
-	headlen = skb_len - skb->data_len;
-	num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb_data, headlen,
-					  (struct mlx5_wqe_data_seg *)cseg + ds_cnt);
+	num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb_data, headlen, dseg);
 	if (unlikely(num_dma < 0))
 		goto err_drop;
 
-	mlx5e_txwqe_complete(sq, skb, opcode, ds_cnt + num_dma,
-			     num_bytes, num_dma, wi, cseg);
+	mlx5e_txwqe_complete(sq, skb, opcode, ds_cnt, num_wqebbs, num_bytes,
+			     num_dma, wi, cseg);
 
 	return NETDEV_TX_OK;
 
@@ -667,5 +719,4 @@ err_drop:
 
 	return NETDEV_TX_OK;
 }
-
 #endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h
index 6d9053bcbe95..45a11864e544 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h
@@ -93,6 +93,29 @@ const struct mlx5e_profile *mlx5i_pkey_get_profile(void);
 /* Extract mlx5e_priv from IPoIB netdev */
 #define mlx5i_epriv(netdev) ((void *)(((struct mlx5i_priv *)netdev_priv(netdev))->mlx5e_priv))
 
+struct mlx5_wqe_eth_pad {
+	u8 rsvd0[16];
+};
+
+struct mlx5i_tx_wqe {
+	struct mlx5_wqe_ctrl_seg     ctrl;
+	struct mlx5_wqe_datagram_seg datagram;
+	struct mlx5_wqe_eth_pad      pad;
+	struct mlx5_wqe_eth_seg      eth;
+	struct mlx5_wqe_data_seg     data[0];
+};
+
+static inline void mlx5i_sq_fetch_wqe(struct mlx5e_txqsq *sq,
+				      struct mlx5i_tx_wqe **wqe,
+				      u16 *pi)
+{
+	struct mlx5_wq_cyc *wq = &sq->wq;
+
+	*pi  = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
+	*wqe = mlx5_wq_cyc_get_wqe(wq, *pi);
+	memset(*wqe, 0, sizeof(**wqe));
+}
+
 netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 			  struct mlx5_av *av, u32 dqpn, u32 dqkey);
 void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
-- 
cgit v1.2.3-59-g8ed1b


From 549322f2f988b2ced90656727c0b4850e4b49fc6 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Wed, 9 May 2018 13:46:51 +0300
Subject: net/mlx5i: Use compilation flag in IPOIB header

If CONFIG_MLX5_CORE_IPOIB is not set, compile-out the
IPOIB related headers.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h
index 45a11864e544..08eac92fc26c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h
@@ -33,6 +33,8 @@
 #ifndef __MLX5E_IPOB_H__
 #define __MLX5E_IPOB_H__
 
+#ifdef CONFIG_MLX5_CORE_IPOIB
+
 #include <linux/mlx5/fs.h>
 #include "en.h"
 
@@ -120,4 +122,5 @@ netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 			  struct mlx5_av *av, u32 dqpn, u32 dqkey);
 void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
 
+#endif /* CONFIG_MLX5_CORE_IPOIB */
 #endif /* __MLX5E_IPOB_H__ */
-- 
cgit v1.2.3-59-g8ed1b


From 3a2f70331226c140e5aa27ee6bbe2a5c618acb4c Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Wed, 4 Apr 2018 12:54:23 +0300
Subject: net/mlx5: Use order-0 allocations for all WQ types

Complete the transition of all WQ types to use fragmented
order-0 coherent memory instead of high-order allocations.

CQ-WQ already uses order-0.
Here we do the same for cyclic and linked-list WQs.

This allows the driver to load cleanly on systems with a highly
fragmented coherent memory.

Performance tests:
ConnectX-5 100Gbps, CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz
Packet rate of 64B packets, single transmit ring, size 8K.

No degradation is sensed.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 15 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c    | 17 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c    | 24 +++---
 .../net/ethernet/mellanox/mlx5/core/fpga/conn.c    | 14 ++--
 .../net/ethernet/mellanox/mlx5/core/fpga/conn.h    |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/wq.c       | 94 +++++++++++++---------
 drivers/net/ethernet/mellanox/mlx5/core/wq.h       | 33 ++++----
 include/linux/mlx5/driver.h                        | 16 +++-
 9 files changed, 123 insertions(+), 94 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 3c0f0a0343fd..9396db54973f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -314,7 +314,7 @@ struct mlx5e_cq {
 
 	/* control */
 	struct mlx5_core_dev      *mdev;
-	struct mlx5_frag_wq_ctrl   wq_ctrl;
+	struct mlx5_wq_ctrl        wq_ctrl;
 } ____cacheline_aligned_in_smp;
 
 struct mlx5e_tx_wqe_info {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index a8b1e43384ca..0c167e5fc346 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -646,8 +646,8 @@ static int mlx5e_create_rq(struct mlx5e_rq *rq,
 						MLX5_ADAPTER_PAGE_SHIFT);
 	MLX5_SET64(wq, wq,  dbr_addr,		rq->wq_ctrl.db.dma);
 
-	mlx5_fill_page_array(&rq->wq_ctrl.buf,
-			     (__be64 *)MLX5_ADDR_OF(wq, wq, pas));
+	mlx5_fill_page_frag_array(&rq->wq_ctrl.buf,
+				  (__be64 *)MLX5_ADDR_OF(wq, wq, pas));
 
 	err = mlx5_core_create_rq(mdev, in, inlen, &rq->rqn);
 
@@ -1096,7 +1096,8 @@ static int mlx5e_create_sq(struct mlx5_core_dev *mdev,
 					  MLX5_ADAPTER_PAGE_SHIFT);
 	MLX5_SET64(wq, wq, dbr_addr,      csp->wq_ctrl->db.dma);
 
-	mlx5_fill_page_array(&csp->wq_ctrl->buf, (__be64 *)MLX5_ADDR_OF(wq, wq, pas));
+	mlx5_fill_page_frag_array(&csp->wq_ctrl->buf,
+				  (__be64 *)MLX5_ADDR_OF(wq, wq, pas));
 
 	err = mlx5_core_create_sq(mdev, in, inlen, sqn);
 
@@ -1538,7 +1539,7 @@ static int mlx5e_alloc_cq(struct mlx5e_channel *c,
 
 static void mlx5e_free_cq(struct mlx5e_cq *cq)
 {
-	mlx5_cqwq_destroy(&cq->wq_ctrl);
+	mlx5_wq_destroy(&cq->wq_ctrl);
 }
 
 static int mlx5e_create_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param)
@@ -1554,7 +1555,7 @@ static int mlx5e_create_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param)
 	int err;
 
 	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
-		sizeof(u64) * cq->wq_ctrl.frag_buf.npages;
+		sizeof(u64) * cq->wq_ctrl.buf.npages;
 	in = kvzalloc(inlen, GFP_KERNEL);
 	if (!in)
 		return -ENOMEM;
@@ -1563,7 +1564,7 @@ static int mlx5e_create_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param)
 
 	memcpy(cqc, param->cqc, sizeof(param->cqc));
 
-	mlx5_fill_page_frag_array(&cq->wq_ctrl.frag_buf,
+	mlx5_fill_page_frag_array(&cq->wq_ctrl.buf,
 				  (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas));
 
 	mlx5_vector2eqn(mdev, param->eq_ix, &eqn, &irqn_not_used);
@@ -1571,7 +1572,7 @@ static int mlx5e_create_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param)
 	MLX5_SET(cqc,   cqc, cq_period_mode, param->cq_period_mode);
 	MLX5_SET(cqc,   cqc, c_eqn,         eqn);
 	MLX5_SET(cqc,   cqc, uar_page,      mdev->priv.uar->index);
-	MLX5_SET(cqc,   cqc, log_page_size, cq->wq_ctrl.frag_buf.page_shift -
+	MLX5_SET(cqc,   cqc, log_page_size, cq->wq_ctrl.buf.page_shift -
 					    MLX5_ADAPTER_PAGE_SHIFT);
 	MLX5_SET64(cqc, cqc, dbr_addr,      cq->wq_ctrl.db.dma);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index f4d2c8886492..ac54380d41e4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -383,16 +383,16 @@ static inline u16 mlx5e_icosq_wrap_cnt(struct mlx5e_icosq *sq)
 	return sq->pc >> MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE;
 }
 
-static inline void mlx5e_fill_icosq_edge(struct mlx5e_icosq *sq,
-					 struct mlx5_wq_cyc *wq,
-					 u16 pi)
+static inline void mlx5e_fill_icosq_frag_edge(struct mlx5e_icosq *sq,
+					      struct mlx5_wq_cyc *wq,
+					      u16 pi, u16 frag_pi)
 {
 	struct mlx5e_sq_wqe_info *edge_wi, *wi = &sq->db.ico_wqe[pi];
-	u8 nnops = mlx5_wq_cyc_get_size(wq) - pi;
+	u8 nnops = mlx5_wq_cyc_get_frag_size(wq) - frag_pi;
 
 	edge_wi = wi + nnops;
 
-	/* fill sq edge with nops to avoid wqe wrapping two pages */
+	/* fill sq frag edge with nops to avoid wqe wrapping two pages */
 	for (; wi < edge_wi; wi++) {
 		wi->opcode = MLX5_OPCODE_NOP;
 		mlx5e_post_nop(wq, sq->sqn, &sq->pc);
@@ -407,14 +407,15 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 	struct mlx5_wq_cyc *wq = &sq->wq;
 	struct mlx5e_umr_wqe *umr_wqe;
 	u16 xlt_offset = ix << (MLX5E_LOG_ALIGNED_MPWQE_PPW - 1);
-	u16 pi;
+	u16 pi, frag_pi;
 	int err;
 	int i;
 
 	pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
+	frag_pi = mlx5_wq_cyc_ctr2fragix(wq, sq->pc);
 
-	if (unlikely(pi + MLX5E_UMR_WQEBBS > mlx5_wq_cyc_get_size(wq))) {
-		mlx5e_fill_icosq_edge(sq, wq, pi);
+	if (unlikely(frag_pi + MLX5E_UMR_WQEBBS > mlx5_wq_cyc_get_frag_size(wq))) {
+		mlx5e_fill_icosq_frag_edge(sq, wq, pi, frag_pi);
 		pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
 	}
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index fc68e72b0b2b..d37566be06e1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -296,16 +296,16 @@ dma_unmap_wqe_err:
 	return -ENOMEM;
 }
 
-static inline void mlx5e_fill_sq_edge(struct mlx5e_txqsq *sq,
-				      struct mlx5_wq_cyc *wq,
-				      u16 pi)
+static inline void mlx5e_fill_sq_frag_edge(struct mlx5e_txqsq *sq,
+					   struct mlx5_wq_cyc *wq,
+					   u16 pi, u16 frag_pi)
 {
 	struct mlx5e_tx_wqe_info *edge_wi, *wi = &sq->db.wqe_info[pi];
-	u8 nnops = mlx5_wq_cyc_get_size(wq) - pi;
+	u8 nnops = mlx5_wq_cyc_get_frag_size(wq) - frag_pi;
 
 	edge_wi = wi + nnops;
 
-	/* fill sq edge with nops to avoid wqe wrap around */
+	/* fill sq frag edge with nops to avoid wqe wrapping two pages */
 	for (; wi < edge_wi; wi++) {
 		wi->skb        = NULL;
 		wi->num_wqebbs = 1;
@@ -358,8 +358,8 @@ netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 	unsigned char *skb_data = skb->data;
 	unsigned int skb_len = skb->len;
 	u16 ds_cnt, ds_cnt_inl = 0;
+	u16 headlen, ihs, frag_pi;
 	u8 num_wqebbs, opcode;
-	u16 headlen, ihs;
 	u32 num_bytes;
 	int num_dma;
 	__be16 mss;
@@ -395,8 +395,9 @@ netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 	}
 
 	num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
-	if (unlikely(pi + num_wqebbs > mlx5_wq_cyc_get_size(wq))) {
-		mlx5e_fill_sq_edge(sq, wq, pi);
+	frag_pi = mlx5_wq_cyc_ctr2fragix(wq, sq->pc);
+	if (unlikely(frag_pi + num_wqebbs > mlx5_wq_cyc_get_frag_size(wq))) {
+		mlx5e_fill_sq_frag_edge(sq, wq, pi, frag_pi);
 		mlx5e_sq_fetch_wqe(sq, &wqe, &pi);
 	}
 
@@ -642,9 +643,9 @@ netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 
 	unsigned char *skb_data = skb->data;
 	unsigned int skb_len = skb->len;
+	u16 headlen, ihs, pi, frag_pi;
 	u16 ds_cnt, ds_cnt_inl = 0;
 	u8 num_wqebbs, opcode;
-	u16 headlen, ihs, pi;
 	u32 num_bytes;
 	int num_dma;
 	__be16 mss;
@@ -680,8 +681,9 @@ netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 	}
 
 	num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
-	if (unlikely(pi + num_wqebbs > mlx5_wq_cyc_get_size(wq))) {
-		mlx5e_fill_sq_edge(sq, wq, pi);
+	frag_pi = mlx5_wq_cyc_ctr2fragix(wq, sq->pc);
+	if (unlikely(frag_pi + num_wqebbs > mlx5_wq_cyc_get_frag_size(wq))) {
+		mlx5e_fill_sq_frag_edge(sq, wq, pi, frag_pi);
 		mlx5i_sq_fetch_wqe(sq, &wqe, &pi);
 	}
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
index de7fe087d6fe..4e5a5cf25f17 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
@@ -454,7 +454,7 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size)
 	}
 
 	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
-		sizeof(u64) * conn->cq.wq_ctrl.frag_buf.npages;
+		sizeof(u64) * conn->cq.wq_ctrl.buf.npages;
 	in = kvzalloc(inlen, GFP_KERNEL);
 	if (!in) {
 		err = -ENOMEM;
@@ -469,12 +469,12 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size)
 	MLX5_SET(cqc, cqc, log_cq_size, ilog2(cq_size));
 	MLX5_SET(cqc, cqc, c_eqn, eqn);
 	MLX5_SET(cqc, cqc, uar_page, fdev->conn_res.uar->index);
-	MLX5_SET(cqc, cqc, log_page_size, conn->cq.wq_ctrl.frag_buf.page_shift -
+	MLX5_SET(cqc, cqc, log_page_size, conn->cq.wq_ctrl.buf.page_shift -
 			   MLX5_ADAPTER_PAGE_SHIFT);
 	MLX5_SET64(cqc, cqc, dbr_addr, conn->cq.wq_ctrl.db.dma);
 
 	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
-	mlx5_fill_page_frag_array(&conn->cq.wq_ctrl.frag_buf, pas);
+	mlx5_fill_page_frag_array(&conn->cq.wq_ctrl.buf, pas);
 
 	err = mlx5_core_create_cq(mdev, &conn->cq.mcq, in, inlen);
 	kvfree(in);
@@ -500,7 +500,7 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size)
 	goto out;
 
 err_cqwq:
-	mlx5_cqwq_destroy(&conn->cq.wq_ctrl);
+	mlx5_wq_destroy(&conn->cq.wq_ctrl);
 out:
 	return err;
 }
@@ -510,7 +510,7 @@ static void mlx5_fpga_conn_destroy_cq(struct mlx5_fpga_conn *conn)
 	tasklet_disable(&conn->cq.tasklet);
 	tasklet_kill(&conn->cq.tasklet);
 	mlx5_core_destroy_cq(conn->fdev->mdev, &conn->cq.mcq);
-	mlx5_cqwq_destroy(&conn->cq.wq_ctrl);
+	mlx5_wq_destroy(&conn->cq.wq_ctrl);
 }
 
 static int mlx5_fpga_conn_create_wq(struct mlx5_fpga_conn *conn, void *qpc)
@@ -591,8 +591,8 @@ static int mlx5_fpga_conn_create_qp(struct mlx5_fpga_conn *conn,
 	if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
 		MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
 
-	mlx5_fill_page_array(&conn->qp.wq_ctrl.buf,
-			     (__be64 *)MLX5_ADDR_OF(create_qp_in, in, pas));
+	mlx5_fill_page_frag_array(&conn->qp.wq_ctrl.buf,
+				  (__be64 *)MLX5_ADDR_OF(create_qp_in, in, pas));
 
 	err = mlx5_core_create_qp(mdev, &conn->qp.mqp, in, inlen);
 	if (err)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.h
index 44bd9eccc711..634ae10e287b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.h
@@ -54,7 +54,7 @@ struct mlx5_fpga_conn {
 	/* CQ */
 	struct {
 		struct mlx5_cqwq wq;
-		struct mlx5_frag_wq_ctrl wq_ctrl;
+		struct mlx5_wq_ctrl wq_ctrl;
 		struct mlx5_core_cq mcq;
 		struct tasklet_struct tasklet;
 	} cq;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.c b/drivers/net/ethernet/mellanox/mlx5/core/wq.c
index ea66448ba365..5b8b35392025 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/wq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.c
@@ -36,7 +36,12 @@
 
 u32 mlx5_wq_cyc_get_size(struct mlx5_wq_cyc *wq)
 {
-	return (u32)wq->sz_m1 + 1;
+	return (u32)wq->fbc.sz_m1 + 1;
+}
+
+u32 mlx5_wq_cyc_get_frag_size(struct mlx5_wq_cyc *wq)
+{
+	return (u32)wq->fbc.frag_sz_m1 + 1;
 }
 
 u32 mlx5_cqwq_get_size(struct mlx5_cqwq *wq)
@@ -46,12 +51,12 @@ u32 mlx5_cqwq_get_size(struct mlx5_cqwq *wq)
 
 u32 mlx5_wq_ll_get_size(struct mlx5_wq_ll *wq)
 {
-	return (u32)wq->sz_m1 + 1;
+	return (u32)wq->fbc.sz_m1 + 1;
 }
 
 static u32 mlx5_wq_cyc_get_byte_size(struct mlx5_wq_cyc *wq)
 {
-	return mlx5_wq_cyc_get_size(wq) << wq->log_stride;
+	return mlx5_wq_cyc_get_size(wq) << wq->fbc.log_stride;
 }
 
 static u32 mlx5_wq_qp_get_byte_size(struct mlx5_wq_qp *wq)
@@ -67,17 +72,19 @@ static u32 mlx5_cqwq_get_byte_size(struct mlx5_cqwq *wq)
 
 static u32 mlx5_wq_ll_get_byte_size(struct mlx5_wq_ll *wq)
 {
-	return mlx5_wq_ll_get_size(wq) << wq->log_stride;
+	return mlx5_wq_ll_get_size(wq) << wq->fbc.log_stride;
 }
 
 int mlx5_wq_cyc_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 		       void *wqc, struct mlx5_wq_cyc *wq,
 		       struct mlx5_wq_ctrl *wq_ctrl)
 {
+	struct mlx5_frag_buf_ctrl *fbc = &wq->fbc;
 	int err;
 
-	wq->log_stride = MLX5_GET(wq, wqc, log_wq_stride);
-	wq->sz_m1 = (1 << MLX5_GET(wq, wqc, log_wq_sz)) - 1;
+	mlx5_fill_fbc(MLX5_GET(wq, wqc, log_wq_stride),
+		      MLX5_GET(wq, wqc, log_wq_sz),
+		      fbc);
 
 	err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node);
 	if (err) {
@@ -85,14 +92,14 @@ int mlx5_wq_cyc_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 		return err;
 	}
 
-	err = mlx5_buf_alloc_node(mdev, mlx5_wq_cyc_get_byte_size(wq),
-				  &wq_ctrl->buf, param->buf_numa_node);
+	err = mlx5_frag_buf_alloc_node(mdev, mlx5_wq_cyc_get_byte_size(wq),
+				       &wq_ctrl->buf, param->buf_numa_node);
 	if (err) {
-		mlx5_core_warn(mdev, "mlx5_buf_alloc_node() failed, %d\n", err);
+		mlx5_core_warn(mdev, "mlx5_frag_buf_alloc_node() failed, %d\n", err);
 		goto err_db_free;
 	}
 
-	wq->buf = wq_ctrl->buf.frags->buf;
+	fbc->frag_buf = wq_ctrl->buf;
 	wq->db  = wq_ctrl->db.db;
 
 	wq_ctrl->mdev = mdev;
@@ -105,17 +112,35 @@ err_db_free:
 	return err;
 }
 
+static void mlx5e_qp_set_frag_buf(struct mlx5_frag_buf *buf,
+				  struct mlx5_wq_qp *qp)
+{
+	struct mlx5_frag_buf *rqb, *sqb;
+
+	rqb = &qp->rq.fbc.frag_buf;
+	*rqb = *buf;
+	rqb->size   = mlx5_wq_cyc_get_byte_size(&qp->rq);
+	rqb->npages = 1 << get_order(rqb->size);
+
+	sqb = &qp->sq.fbc.frag_buf;
+	*sqb = *buf;
+	sqb->size   = mlx5_wq_cyc_get_byte_size(&qp->rq);
+	sqb->npages = 1 << get_order(sqb->size);
+	sqb->frags += rqb->npages; /* first part is for the rq */
+}
+
 int mlx5_wq_qp_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 		      void *qpc, struct mlx5_wq_qp *wq,
 		      struct mlx5_wq_ctrl *wq_ctrl)
 {
 	int err;
 
-	wq->rq.log_stride = MLX5_GET(qpc, qpc, log_rq_stride) + 4;
-	wq->rq.sz_m1 = (1 << MLX5_GET(qpc, qpc, log_rq_size)) - 1;
-
-	wq->sq.log_stride = ilog2(MLX5_SEND_WQE_BB);
-	wq->sq.sz_m1 = (1 << MLX5_GET(qpc, qpc, log_sq_size)) - 1;
+	mlx5_fill_fbc(MLX5_GET(qpc, qpc, log_rq_stride) + 4,
+		      MLX5_GET(qpc, qpc, log_rq_size),
+		      &wq->rq.fbc);
+	mlx5_fill_fbc(ilog2(MLX5_SEND_WQE_BB),
+		      MLX5_GET(qpc, qpc, log_sq_size),
+		      &wq->sq.fbc);
 
 	err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node);
 	if (err) {
@@ -123,15 +148,15 @@ int mlx5_wq_qp_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 		return err;
 	}
 
-	err = mlx5_buf_alloc_node(mdev, mlx5_wq_qp_get_byte_size(wq),
-				  &wq_ctrl->buf, param->buf_numa_node);
+	err = mlx5_frag_buf_alloc_node(mdev, mlx5_wq_qp_get_byte_size(wq),
+				       &wq_ctrl->buf, param->buf_numa_node);
 	if (err) {
-		mlx5_core_warn(mdev, "mlx5_buf_alloc_node() failed, %d\n", err);
+		mlx5_core_warn(mdev, "mlx5_frag_buf_alloc_node() failed, %d\n", err);
 		goto err_db_free;
 	}
 
-	wq->rq.buf = wq_ctrl->buf.frags->buf;
-	wq->sq.buf = wq->rq.buf + mlx5_wq_cyc_get_byte_size(&wq->rq);
+	mlx5e_qp_set_frag_buf(&wq_ctrl->buf, wq);
+
 	wq->rq.db  = &wq_ctrl->db.db[MLX5_RCV_DBR];
 	wq->sq.db  = &wq_ctrl->db.db[MLX5_SND_DBR];
 
@@ -147,7 +172,7 @@ err_db_free:
 
 int mlx5_cqwq_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 		     void *cqc, struct mlx5_cqwq *wq,
-		     struct mlx5_frag_wq_ctrl *wq_ctrl)
+		     struct mlx5_wq_ctrl *wq_ctrl)
 {
 	int err;
 
@@ -160,7 +185,7 @@ int mlx5_cqwq_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 	}
 
 	err = mlx5_frag_buf_alloc_node(mdev, mlx5_cqwq_get_byte_size(wq),
-				       &wq_ctrl->frag_buf,
+				       &wq_ctrl->buf,
 				       param->buf_numa_node);
 	if (err) {
 		mlx5_core_warn(mdev, "mlx5_frag_buf_alloc_node() failed, %d\n",
@@ -168,7 +193,7 @@ int mlx5_cqwq_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 		goto err_db_free;
 	}
 
-	wq->fbc.frag_buf = wq_ctrl->frag_buf;
+	wq->fbc.frag_buf = wq_ctrl->buf;
 	wq->db  = wq_ctrl->db.db;
 
 	wq_ctrl->mdev = mdev;
@@ -185,12 +210,14 @@ int mlx5_wq_ll_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 		      void *wqc, struct mlx5_wq_ll *wq,
 		      struct mlx5_wq_ctrl *wq_ctrl)
 {
+	struct mlx5_frag_buf_ctrl *fbc = &wq->fbc;
 	struct mlx5_wqe_srq_next_seg *next_seg;
 	int err;
 	int i;
 
-	wq->log_stride = MLX5_GET(wq, wqc, log_wq_stride);
-	wq->sz_m1 = (1 << MLX5_GET(wq, wqc, log_wq_sz)) - 1;
+	mlx5_fill_fbc(MLX5_GET(wq, wqc, log_wq_stride),
+		      MLX5_GET(wq, wqc, log_wq_sz),
+		      fbc);
 
 	err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node);
 	if (err) {
@@ -198,17 +225,17 @@ int mlx5_wq_ll_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 		return err;
 	}
 
-	err = mlx5_buf_alloc_node(mdev, mlx5_wq_ll_get_byte_size(wq),
-				  &wq_ctrl->buf, param->buf_numa_node);
+	err = mlx5_frag_buf_alloc_node(mdev, mlx5_wq_ll_get_byte_size(wq),
+				       &wq_ctrl->buf, param->buf_numa_node);
 	if (err) {
-		mlx5_core_warn(mdev, "mlx5_buf_alloc_node() failed, %d\n", err);
+		mlx5_core_warn(mdev, "mlx5_frag_buf_alloc_node() failed, %d\n", err);
 		goto err_db_free;
 	}
 
-	wq->buf = wq_ctrl->buf.frags->buf;
+	wq->fbc.frag_buf = wq_ctrl->buf;
 	wq->db  = wq_ctrl->db.db;
 
-	for (i = 0; i < wq->sz_m1; i++) {
+	for (i = 0; i < fbc->sz_m1; i++) {
 		next_seg = mlx5_wq_ll_get_wqe(wq, i);
 		next_seg->next_wqe_index = cpu_to_be16(i + 1);
 	}
@@ -227,12 +254,7 @@ err_db_free:
 
 void mlx5_wq_destroy(struct mlx5_wq_ctrl *wq_ctrl)
 {
-	mlx5_buf_free(wq_ctrl->mdev, &wq_ctrl->buf);
+	mlx5_frag_buf_free(wq_ctrl->mdev, &wq_ctrl->buf);
 	mlx5_db_free(wq_ctrl->mdev, &wq_ctrl->db);
 }
 
-void mlx5_cqwq_destroy(struct mlx5_frag_wq_ctrl *wq_ctrl)
-{
-	mlx5_frag_buf_free(wq_ctrl->mdev, &wq_ctrl->frag_buf);
-	mlx5_db_free(wq_ctrl->mdev, &wq_ctrl->db);
-}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.h b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
index a3572e148f09..b9d7c01fc7cb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/wq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
@@ -48,17 +48,9 @@ struct mlx5_wq_ctrl {
 	struct mlx5_db		db;
 };
 
-struct mlx5_frag_wq_ctrl {
-	struct mlx5_core_dev	*mdev;
-	struct mlx5_frag_buf	frag_buf;
-	struct mlx5_db		db;
-};
-
 struct mlx5_wq_cyc {
-	void			*buf;
+	struct mlx5_frag_buf_ctrl fbc;
 	__be32			*db;
-	u16			sz_m1;
-	u8			log_stride;
 };
 
 struct mlx5_wq_qp {
@@ -73,20 +65,19 @@ struct mlx5_cqwq {
 };
 
 struct mlx5_wq_ll {
-	void			*buf;
+	struct mlx5_frag_buf_ctrl fbc;
 	__be32			*db;
 	__be16			*tail_next;
-	u16			sz_m1;
 	u16			head;
 	u16			wqe_ctr;
 	u16			cur_sz;
-	u8			log_stride;
 };
 
 int mlx5_wq_cyc_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 		       void *wqc, struct mlx5_wq_cyc *wq,
 		       struct mlx5_wq_ctrl *wq_ctrl);
 u32 mlx5_wq_cyc_get_size(struct mlx5_wq_cyc *wq);
+u32 mlx5_wq_cyc_get_frag_size(struct mlx5_wq_cyc *wq);
 
 int mlx5_wq_qp_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 		      void *qpc, struct mlx5_wq_qp *wq,
@@ -94,7 +85,7 @@ int mlx5_wq_qp_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 
 int mlx5_cqwq_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 		     void *cqc, struct mlx5_cqwq *wq,
-		     struct mlx5_frag_wq_ctrl *wq_ctrl);
+		     struct mlx5_wq_ctrl *wq_ctrl);
 u32 mlx5_cqwq_get_size(struct mlx5_cqwq *wq);
 
 int mlx5_wq_ll_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
@@ -103,16 +94,20 @@ int mlx5_wq_ll_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 u32 mlx5_wq_ll_get_size(struct mlx5_wq_ll *wq);
 
 void mlx5_wq_destroy(struct mlx5_wq_ctrl *wq_ctrl);
-void mlx5_cqwq_destroy(struct mlx5_frag_wq_ctrl *wq_ctrl);
 
 static inline u16 mlx5_wq_cyc_ctr2ix(struct mlx5_wq_cyc *wq, u16 ctr)
 {
-	return ctr & wq->sz_m1;
+	return ctr & wq->fbc.sz_m1;
+}
+
+static inline u16 mlx5_wq_cyc_ctr2fragix(struct mlx5_wq_cyc *wq, u16 ctr)
+{
+	return ctr & wq->fbc.frag_sz_m1;
 }
 
 static inline void *mlx5_wq_cyc_get_wqe(struct mlx5_wq_cyc *wq, u16 ix)
 {
-	return wq->buf + (ix << wq->log_stride);
+	return mlx5_frag_buf_get_wqe(&wq->fbc, ix);
 }
 
 static inline int mlx5_wq_cyc_cc_bigger(u16 cc1, u16 cc2)
@@ -176,7 +171,7 @@ static inline struct mlx5_cqe64 *mlx5_cqwq_get_cqe(struct mlx5_cqwq *wq)
 
 static inline int mlx5_wq_ll_is_full(struct mlx5_wq_ll *wq)
 {
-	return wq->cur_sz == wq->sz_m1;
+	return wq->cur_sz == wq->fbc.sz_m1;
 }
 
 static inline int mlx5_wq_ll_is_empty(struct mlx5_wq_ll *wq)
@@ -186,12 +181,12 @@ static inline int mlx5_wq_ll_is_empty(struct mlx5_wq_ll *wq)
 
 static inline u16 mlx5_wq_ll_ctr2ix(struct mlx5_wq_ll *wq, u16 ctr)
 {
-	return ctr & wq->sz_m1;
+	return ctr & wq->fbc.sz_m1;
 }
 
 static inline void *mlx5_wq_ll_get_wqe(struct mlx5_wq_ll *wq, u16 ix)
 {
-	return wq->buf + (ix << wq->log_stride);
+	return mlx5_frag_buf_get_wqe(&wq->fbc, ix);
 }
 
 static inline void mlx5_wq_ll_push(struct mlx5_wq_ll *wq, u16 head_next)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 92d292454351..80cbb7fdce4a 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -983,16 +983,24 @@ static inline u32 mlx5_base_mkey(const u32 key)
 	return key & 0xffffff00u;
 }
 
-static inline void mlx5_core_init_cq_frag_buf(struct mlx5_frag_buf_ctrl *fbc,
-					      void *cqc)
+static inline void mlx5_fill_fbc(u8 log_stride, u8 log_sz,
+				 struct mlx5_frag_buf_ctrl *fbc)
 {
-	fbc->log_stride	= 6 + MLX5_GET(cqc, cqc, cqe_sz);
-	fbc->log_sz	= MLX5_GET(cqc, cqc, log_cq_size);
+	fbc->log_stride = log_stride;
+	fbc->log_sz     = log_sz;
 	fbc->sz_m1	= (1 << fbc->log_sz) - 1;
 	fbc->log_frag_strides = PAGE_SHIFT - fbc->log_stride;
 	fbc->frag_sz_m1	= (1 << fbc->log_frag_strides) - 1;
 }
 
+static inline void mlx5_core_init_cq_frag_buf(struct mlx5_frag_buf_ctrl *fbc,
+					      void *cqc)
+{
+	mlx5_fill_fbc(6 + MLX5_GET(cqc, cqc, cqe_sz),
+		      MLX5_GET(cqc, cqc, log_cq_size),
+		      fbc);
+}
+
 static inline void *mlx5_frag_buf_get_wqe(struct mlx5_frag_buf_ctrl *fbc,
 					  u32 ix)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 6ab75516cf35c4789ca8baa206b21fdbc03c7870 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Fri, 2 Mar 2018 22:21:32 -0800
Subject: net/mlx5e: Move phy link down events counter out of SW stats

PHY link down events counter belongs to phy_counters group.
although it has special handling, it doesn't mean it can't be there.

Move it to phy_counters_grp handler.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 37 +++++++++++++---------
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h |  3 --
 2 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
index e17919c0af08..973939ed8bb5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
@@ -81,7 +81,6 @@ static const struct counter_desc sw_stats_desc[] = {
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_busy) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_waive) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, ch_eq_rearm) },
-	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, link_down_events_phy) },
 };
 
 #define NUM_SW_COUNTERS			ARRAY_SIZE(sw_stats_desc)
@@ -175,9 +174,6 @@ static void mlx5e_grp_sw_update_stats(struct mlx5e_priv *priv)
 		}
 	}
 
-	s->link_down_events_phy = MLX5_GET(ppcnt_reg,
-				priv->stats.pport.phy_counters,
-				counter_set.phys_layer_cntrs.link_down_events);
 	memcpy(&priv->stats.sw, s, sizeof(*s));
 }
 
@@ -580,12 +576,13 @@ static const struct counter_desc pport_phy_statistical_stats_desc[] = {
 	{ "rx_corrected_bits_phy", PPORT_PHY_STATISTICAL_OFF(phy_corrected_bits) },
 };
 
-#define NUM_PPORT_PHY_COUNTERS		ARRAY_SIZE(pport_phy_statistical_stats_desc)
+#define NUM_PPORT_PHY_STATISTICAL_COUNTERS ARRAY_SIZE(pport_phy_statistical_stats_desc)
 
 static int mlx5e_grp_phy_get_num_stats(struct mlx5e_priv *priv)
 {
+	/* "1" for link_down_events special counter */
 	return MLX5_CAP_PCAM_FEATURE((priv)->mdev, ppcnt_statistical_group) ?
-		NUM_PPORT_PHY_COUNTERS : 0;
+		NUM_PPORT_PHY_STATISTICAL_COUNTERS + 1 : 1;
 }
 
 static int mlx5e_grp_phy_fill_strings(struct mlx5e_priv *priv, u8 *data,
@@ -593,10 +590,14 @@ static int mlx5e_grp_phy_fill_strings(struct mlx5e_priv *priv, u8 *data,
 {
 	int i;
 
-	if (MLX5_CAP_PCAM_FEATURE((priv)->mdev, ppcnt_statistical_group))
-		for (i = 0; i < NUM_PPORT_PHY_COUNTERS; i++)
-			strcpy(data + (idx++) * ETH_GSTRING_LEN,
-			       pport_phy_statistical_stats_desc[i].format);
+	strcpy(data + (idx++) * ETH_GSTRING_LEN, "link_down_events_phy");
+
+	if (!MLX5_CAP_PCAM_FEATURE((priv)->mdev, ppcnt_statistical_group))
+		return idx;
+
+	for (i = 0; i < NUM_PPORT_PHY_STATISTICAL_COUNTERS; i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN,
+		       pport_phy_statistical_stats_desc[i].format);
 	return idx;
 }
 
@@ -604,11 +605,17 @@ static int mlx5e_grp_phy_fill_stats(struct mlx5e_priv *priv, u64 *data, int idx)
 {
 	int i;
 
-	if (MLX5_CAP_PCAM_FEATURE((priv)->mdev, ppcnt_statistical_group))
-		for (i = 0; i < NUM_PPORT_PHY_COUNTERS; i++)
-			data[idx++] =
-				MLX5E_READ_CTR64_BE(&priv->stats.pport.phy_statistical_counters,
-						    pport_phy_statistical_stats_desc, i);
+	/* link_down_events_phy has special handling since it is not stored in __be64 format */
+	data[idx++] = MLX5_GET(ppcnt_reg, priv->stats.pport.phy_counters,
+			       counter_set.phys_layer_cntrs.link_down_events);
+
+	if (!MLX5_CAP_PCAM_FEATURE((priv)->mdev, ppcnt_statistical_group))
+		return idx;
+
+	for (i = 0; i < NUM_PPORT_PHY_STATISTICAL_COUNTERS; i++)
+		data[idx++] =
+			MLX5E_READ_CTR64_BE(&priv->stats.pport.phy_statistical_counters,
+					    pport_phy_statistical_stats_desc, i);
 	return idx;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
index a36e6a87066b..39ced559929a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
@@ -97,9 +97,6 @@ struct mlx5e_sw_stats {
 	u64 tx_tls_ooo;
 	u64 tx_tls_resync_bytes;
 #endif
-
-	/* Special handling counters */
-	u64 link_down_events_phy;
 };
 
 struct mlx5e_qcounter_stats {
-- 
cgit v1.2.3-59-g8ed1b


From 868a01a27d80a59e719f0c369d1b26b923fc7674 Mon Sep 17 00:00:00 2001
From: Shalom Lagziel <shaloml@mellanox.com>
Date: Mon, 12 Feb 2018 17:43:07 +0200
Subject: net/mlx5e: Introducing new statistics rwlock

Introduce a new read/write lock that will protect statistics gathering from
netdev channels configuration changes.
e.g. when channels are being replaced (increase/decrease number of rings)
prevent statistic gathering (ndo_get_stats64) to read the statistics of
in-active channels (channels that are being closed).

Plus update channels software statistics on the fly when calling
ndo_get_stats64, and remove it from stats periodic work.

Fixes: 9218b44dcc05 ("net/mlx5e: Statistics handling refactoring")
Signed-off-by: Shalom Lagziel <shaloml@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  2 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  8 ++++++++
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   | 16 +++++++++-------
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.c |  8 ++++++--
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h |  2 ++
 5 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 9396db54973f..c3c79f2835d2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -771,6 +771,8 @@ struct mlx5e_priv {
 	struct mutex               state_lock; /* Protects Interface state */
 	struct mlx5e_rq            drop_rq;
 
+	rwlock_t                   stats_lock; /* Protects channels SW stats updates */
+	bool                       channels_active;
 	struct mlx5e_channels      channels;
 	u32                        tisn[MLX5E_MAX_NUM_TC];
 	struct mlx5e_rqt           indir_rqt;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 0c167e5fc346..0e9c64580abb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2658,6 +2658,9 @@ void mlx5e_activate_priv_channels(struct mlx5e_priv *priv)
 
 	mlx5e_build_channels_tx_maps(priv);
 	mlx5e_activate_channels(&priv->channels);
+	write_lock(&priv->stats_lock);
+	priv->channels_active = true;
+	write_unlock(&priv->stats_lock);
 	netif_tx_start_all_queues(priv->netdev);
 
 	if (MLX5_VPORT_MANAGER(priv->mdev))
@@ -2679,6 +2682,9 @@ void mlx5e_deactivate_priv_channels(struct mlx5e_priv *priv)
 	 */
 	netif_tx_stop_all_queues(priv->netdev);
 	netif_tx_disable(priv->netdev);
+	write_lock(&priv->stats_lock);
+	priv->channels_active = false;
+	write_unlock(&priv->stats_lock);
 	mlx5e_deactivate_channels(&priv->channels);
 }
 
@@ -3223,6 +3229,7 @@ mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
 		stats->tx_packets = PPORT_802_3_GET(pstats, a_frames_transmitted_ok);
 		stats->tx_bytes   = PPORT_802_3_GET(pstats, a_octets_transmitted_ok);
 	} else {
+		mlx5e_grp_sw_update_stats(priv);
 		stats->rx_packets = sstats->rx_packets;
 		stats->rx_bytes   = sstats->rx_bytes;
 		stats->tx_packets = sstats->tx_packets;
@@ -4248,6 +4255,7 @@ static void mlx5e_build_nic_netdev_priv(struct mlx5_core_dev *mdev,
 			       profile->max_nch(mdev), netdev->mtu);
 
 	mutex_init(&priv->state_lock);
+	rwlock_init(&priv->stats_lock);
 
 	INIT_WORK(&priv->update_carrier_work, mlx5e_update_carrier_work);
 	INIT_WORK(&priv->set_rx_mode_work, mlx5e_set_rx_mode_work);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index c3034f58aa33..1a3f9e091385 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -130,6 +130,10 @@ static void mlx5e_rep_update_sw_counters(struct mlx5e_priv *priv)
 	struct mlx5e_sq_stats *sq_stats;
 	int i, j;
 
+	read_lock(&priv->stats_lock);
+	if (!priv->channels_active)
+	        goto out;
+
 	memset(s, 0, sizeof(*s));
 	for (i = 0; i < priv->channels.num; i++) {
 		struct mlx5e_channel *c = priv->channels.c[i];
@@ -146,12 +150,8 @@ static void mlx5e_rep_update_sw_counters(struct mlx5e_priv *priv)
 			s->tx_bytes		+= sq_stats->bytes;
 		}
 	}
-}
-
-static void mlx5e_rep_update_stats(struct mlx5e_priv *priv)
-{
-	mlx5e_rep_update_sw_counters(priv);
-	mlx5e_rep_update_hw_counters(priv);
+out:
+	read_unlock(&priv->stats_lock);
 }
 
 static void mlx5e_rep_get_ethtool_stats(struct net_device *dev,
@@ -871,6 +871,8 @@ mlx5e_get_sw_stats64(const struct net_device *dev,
 	struct mlx5e_priv *priv = netdev_priv(dev);
 	struct mlx5e_sw_stats *sstats = &priv->stats.sw;
 
+	mlx5e_rep_update_sw_counters(priv);
+
 	stats->rx_packets = sstats->rx_packets;
 	stats->rx_bytes   = sstats->rx_bytes;
 	stats->tx_packets = sstats->tx_packets;
@@ -1046,7 +1048,7 @@ static const struct mlx5e_profile mlx5e_rep_profile = {
 	.cleanup_rx		= mlx5e_cleanup_rep_rx,
 	.init_tx		= mlx5e_init_rep_tx,
 	.cleanup_tx		= mlx5e_cleanup_nic_tx,
-	.update_stats           = mlx5e_rep_update_stats,
+	.update_stats           = mlx5e_rep_update_hw_counters,
 	.max_nch		= mlx5e_get_rep_max_num_channels,
 	.update_carrier		= NULL,
 	.rx_handlers.handle_rx_cqe       = mlx5e_handle_rx_cqe_rep,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
index 973939ed8bb5..323f2af4200b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
@@ -108,7 +108,7 @@ static int mlx5e_grp_sw_fill_stats(struct mlx5e_priv *priv, u64 *data, int idx)
 	return idx;
 }
 
-static void mlx5e_grp_sw_update_stats(struct mlx5e_priv *priv)
+void mlx5e_grp_sw_update_stats(struct mlx5e_priv *priv)
 {
 	struct mlx5e_sw_stats temp, *s = &temp;
 	struct mlx5e_rq_stats *rq_stats;
@@ -117,6 +117,9 @@ static void mlx5e_grp_sw_update_stats(struct mlx5e_priv *priv)
 	int i, j;
 
 	memset(s, 0, sizeof(*s));
+	read_lock(&priv->stats_lock);
+	if (!priv->channels_active)
+		goto out;
 	for (i = 0; i < priv->channels.num; i++) {
 		struct mlx5e_channel *c = priv->channels.c[i];
 
@@ -175,6 +178,8 @@ static void mlx5e_grp_sw_update_stats(struct mlx5e_priv *priv)
 	}
 
 	memcpy(&priv->stats.sw, s, sizeof(*s));
+out:
+	read_unlock(&priv->stats_lock);
 }
 
 static const struct counter_desc q_stats_desc[] = {
@@ -1224,7 +1229,6 @@ const struct mlx5e_stats_grp mlx5e_stats_grps[] = {
 		.get_num_stats = mlx5e_grp_sw_get_num_stats,
 		.fill_strings = mlx5e_grp_sw_fill_strings,
 		.fill_stats = mlx5e_grp_sw_fill_stats,
-		.update_stats_mask = MLX5E_NDO_UPDATE_STATS,
 		.update_stats = mlx5e_grp_sw_update_stats,
 	},
 	{
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
index 39ced559929a..390c7afa5188 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
@@ -239,4 +239,6 @@ struct mlx5e_stats_grp {
 extern const struct mlx5e_stats_grp mlx5e_stats_grps[];
 extern const int mlx5e_num_stats_grps;
 
+void mlx5e_grp_sw_update_stats(struct mlx5e_priv *priv);
+
 #endif /* __MLX5_EN_STATS_H__ */
-- 
cgit v1.2.3-59-g8ed1b


From 05909babce5328f468f7ac3a1033431c895f97a5 Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Thu, 12 Apr 2018 16:03:37 +0300
Subject: net/mlx5e: Avoid reset netdev stats on configuration changes

Move all RQ, SQ and channel counters from the channel objects into the
priv structure.  With this change, counters will not be reset upon
channel configuration changes.

Channel's statistics for SQs which are associated with TCs higher than
zero will be presented in ethtool -S, only for SQs which were opened at
least once since the module was loaded (regardless of their open/close
current status).  This is done in order to decrease the total amount of
statistics presented and calculated for the common out of box use (no
QoS).

mlx5e_channel_stats is a compound of CH,RQ,SQs stats in order to
create locality for the NAPI when handling TX and RX of the same
channel.

Align the new statistics struct per ring to avoid several channels
update to the same cache line at the same time.
Packet rate was tested, no degradation sensed.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
CC: Qing Huang <qing.huang@oracle.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       | 14 +++-
 .../mellanox/mlx5/core/en_accel/tls_rxtx.c         |  4 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 28 +++++---
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   |  4 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c    | 75 ++++++++++++----------
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 56 ++++++++--------
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c    | 49 +++++++-------
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c  |  6 +-
 8 files changed, 136 insertions(+), 100 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index c3c79f2835d2..1c04df043e07 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -358,7 +358,6 @@ struct mlx5e_txqsq {
 	/* dirtied @xmit */
 	u16                        pc ____cacheline_aligned_in_smp;
 	u32                        dma_fifo_pc;
-	struct mlx5e_sq_stats      stats;
 
 	struct mlx5e_cq            cq;
 
@@ -371,6 +370,7 @@ struct mlx5e_txqsq {
 	/* read only */
 	struct mlx5_wq_cyc         wq;
 	u32                        dma_fifo_mask;
+	struct mlx5e_sq_stats     *stats;
 	void __iomem              *uar_map;
 	struct netdev_queue       *txq;
 	u32                        sqn;
@@ -526,7 +526,7 @@ struct mlx5e_rq {
 	struct mlx5e_channel  *channel;
 	struct device         *pdev;
 	struct net_device     *netdev;
-	struct mlx5e_rq_stats  stats;
+	struct mlx5e_rq_stats *stats;
 	struct mlx5e_cq        cq;
 	struct mlx5e_page_cache page_cache;
 	struct hwtstamp_config *tstamp;
@@ -574,7 +574,7 @@ struct mlx5e_channel {
 
 	/* data path - accessed per napi poll */
 	struct irq_desc *irq_desc;
-	struct mlx5e_ch_stats      stats;
+	struct mlx5e_ch_stats     *stats;
 
 	/* control */
 	struct mlx5e_priv         *priv;
@@ -590,6 +590,12 @@ struct mlx5e_channels {
 	struct mlx5e_params    params;
 };
 
+struct mlx5e_channel_stats {
+	struct mlx5e_ch_stats ch;
+	struct mlx5e_sq_stats sq[MLX5E_MAX_NUM_TC];
+	struct mlx5e_rq_stats rq;
+} ____cacheline_aligned_in_smp;
+
 enum mlx5e_traffic_types {
 	MLX5E_TT_IPV4_TCP,
 	MLX5E_TT_IPV6_TCP,
@@ -793,6 +799,8 @@ struct mlx5e_priv {
 	struct mlx5_core_dev      *mdev;
 	struct net_device         *netdev;
 	struct mlx5e_stats         stats;
+	struct mlx5e_channel_stats channel_stats[MLX5E_MAX_NUM_CHANNELS];
+	u8                         max_opened_tc;
 	struct hwtstamp_config     tstamp;
 	u16                        q_counter;
 	u16                        drop_rq_q_counter;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c
index ad2790fb5966..15aef71d1957 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c
@@ -174,7 +174,7 @@ mlx5e_tls_handle_ooo(struct mlx5e_tls_offload_context *context,
 	int headln;
 	int i;
 
-	sq->stats.tls_ooo++;
+	sq->stats->tls_ooo++;
 
 	if (mlx5e_tls_get_sync_data(context, tcp_seq, &info)) {
 		/* We might get here if a retransmission reaches the driver
@@ -220,7 +220,7 @@ mlx5e_tls_handle_ooo(struct mlx5e_tls_offload_context *context,
 	skb_shinfo(nskb)->nr_frags = info.nr_frags;
 	nskb->data_len = info.sync_len;
 	nskb->len += info.sync_len;
-	sq->stats.tls_resync_bytes += nskb->len;
+	sq->stats->tls_resync_bytes += nskb->len;
 	mlx5e_tls_complete_sync_skb(skb, nskb, tcp_seq, headln,
 				    cpu_to_be64(info.rcd_sn));
 	mlx5e_sq_xmit(sq, nskb, *wqe, *pi);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 0e9c64580abb..9b19863b059d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -423,6 +423,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 	rq->ix      = c->ix;
 	rq->mdev    = mdev;
 	rq->hw_mtu  = MLX5E_SW2HW_MTU(params, params->sw_mtu);
+	rq->stats   = &c->priv->channel_stats[c->ix].rq;
 
 	rq->xdp_prog = params->xdp_prog ? bpf_prog_inc(params->xdp_prog) : NULL;
 	if (IS_ERR(rq->xdp_prog)) {
@@ -1003,7 +1004,8 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
 			     int txq_ix,
 			     struct mlx5e_params *params,
 			     struct mlx5e_sq_param *param,
-			     struct mlx5e_txqsq *sq)
+			     struct mlx5e_txqsq *sq,
+			     int tc)
 {
 	void *sqc_wq               = MLX5_ADDR_OF(sqc, param->sqc, wq);
 	struct mlx5_core_dev *mdev = c->mdev;
@@ -1018,6 +1020,7 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
 	sq->txq_ix    = txq_ix;
 	sq->uar_map   = mdev->mlx5e_res.bfreg.map;
 	sq->min_inline_mode = params->tx_min_inline_mode;
+	sq->stats     = &c->priv->channel_stats[c->ix].sq[tc];
 	INIT_WORK(&sq->recover.recover_work, mlx5e_sq_recover);
 	if (MLX5_IPSEC_DEV(c->priv->mdev))
 		set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state);
@@ -1176,13 +1179,14 @@ static int mlx5e_open_txqsq(struct mlx5e_channel *c,
 			    int txq_ix,
 			    struct mlx5e_params *params,
 			    struct mlx5e_sq_param *param,
-			    struct mlx5e_txqsq *sq)
+			    struct mlx5e_txqsq *sq,
+			    int tc)
 {
 	struct mlx5e_create_sq_param csp = {};
 	u32 tx_rate;
 	int err;
 
-	err = mlx5e_alloc_txqsq(c, txq_ix, params, param, sq);
+	err = mlx5e_alloc_txqsq(c, txq_ix, params, param, sq, tc);
 	if (err)
 		return err;
 
@@ -1370,7 +1374,7 @@ static void mlx5e_sq_recover(struct work_struct *work)
 		return;
 
 	mlx5e_reset_txqsq_cc_pc(sq);
-	sq->stats.recover++;
+	sq->stats->recover++;
 	recover->last_recover = jiffies;
 	mlx5e_activate_txqsq(sq);
 }
@@ -1665,14 +1669,14 @@ static int mlx5e_open_sqs(struct mlx5e_channel *c,
 			  struct mlx5e_params *params,
 			  struct mlx5e_channel_param *cparam)
 {
-	int err;
-	int tc;
+	struct mlx5e_priv *priv = c->priv;
+	int err, tc, max_nch = priv->profile->max_nch(priv->mdev);
 
 	for (tc = 0; tc < params->num_tc; tc++) {
-		int txq_ix = c->ix + tc * params->num_channels;
+		int txq_ix = c->ix + tc * max_nch;
 
 		err = mlx5e_open_txqsq(c, c->priv->tisn[tc], txq_ix,
-				       params, &cparam->sq, &c->sq[tc]);
+				       params, &cparam->sq, &c->sq[tc], tc);
 		if (err)
 			goto err_close_sqs;
 	}
@@ -1802,6 +1806,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
 	c->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key);
 	c->num_tc   = params->num_tc;
 	c->xdp      = !!params->xdp_prog;
+	c->stats    = &priv->channel_stats[ix].ch;
 
 	mlx5_vector2eqn(priv->mdev, ix, &eqn, &irq);
 	c->irq_desc = irq_to_desc(irq);
@@ -2634,7 +2639,7 @@ static void mlx5e_build_channels_tx_maps(struct mlx5e_priv *priv)
 	struct mlx5e_txqsq *sq;
 	int i, tc;
 
-	for (i = 0; i < priv->channels.num; i++)
+	for (i = 0; i < priv->profile->max_nch(priv->mdev); i++)
 		for (tc = 0; tc < priv->profile->max_tc; tc++)
 			priv->channel_tc2txq[i][tc] = i + tc * priv->channels.num;
 
@@ -3139,6 +3144,8 @@ static int mlx5e_setup_tc_mqprio(struct net_device *netdev,
 	if (err)
 		goto out;
 
+	priv->max_opened_tc = max_t(u8, priv->max_opened_tc,
+				    new_channels.params.num_tc);
 	mlx5e_switch_priv_channels(priv, &new_channels, NULL);
 out:
 	mutex_unlock(&priv->state_lock);
@@ -3826,7 +3833,7 @@ static bool mlx5e_tx_timeout_eq_recover(struct net_device *dev,
 		return false;
 
 	netdev_err(dev, "Recover %d eqes on EQ 0x%x\n", eqe_count, eq->eqn);
-	sq->channel->stats.eq_rearm++;
+	sq->channel->stats->eq_rearm++;
 	return true;
 }
 
@@ -4250,6 +4257,7 @@ static void mlx5e_build_nic_netdev_priv(struct mlx5_core_dev *mdev,
 	priv->profile     = profile;
 	priv->ppriv       = ppriv;
 	priv->msglevel    = MLX5E_MSG_LEVEL;
+	priv->max_opened_tc = 1;
 
 	mlx5e_build_nic_params(mdev, &priv->channels.params,
 			       profile->max_nch(mdev), netdev->mtu);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 1a3f9e091385..de6364125f0f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -138,13 +138,13 @@ static void mlx5e_rep_update_sw_counters(struct mlx5e_priv *priv)
 	for (i = 0; i < priv->channels.num; i++) {
 		struct mlx5e_channel *c = priv->channels.c[i];
 
-		rq_stats = &c->rq.stats;
+		rq_stats = c->rq.stats;
 
 		s->rx_packets	+= rq_stats->packets;
 		s->rx_bytes	+= rq_stats->bytes;
 
 		for (j = 0; j < priv->channels.params.num_tc; j++) {
-			sq_stats = &c->sq[j].stats;
+			sq_stats = c->sq[j].stats;
 
 			s->tx_packets		+= sq_stats->packets;
 			s->tx_bytes		+= sq_stats->bytes;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index ac54380d41e4..bfef73b37fbc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -65,7 +65,7 @@ static inline void mlx5e_read_title_slot(struct mlx5e_rq *rq,
 	mlx5e_read_cqe_slot(cq, cqcc, &cq->title);
 	cq->decmprs_left        = be32_to_cpu(cq->title.byte_cnt);
 	cq->decmprs_wqe_counter = be16_to_cpu(cq->title.wqe_counter);
-	rq->stats.cqe_compress_blks++;
+	rq->stats->cqe_compress_blks++;
 }
 
 static inline void mlx5e_read_mini_arr_slot(struct mlx5e_cq *cq, u32 cqcc)
@@ -146,7 +146,7 @@ static inline u32 mlx5e_decompress_cqes_cont(struct mlx5e_rq *rq,
 	mlx5e_cqes_update_owner(cq, cq->wq.cc, cqcc - cq->wq.cc);
 	cq->wq.cc = cqcc;
 	cq->decmprs_left -= cqe_count;
-	rq->stats.cqe_compress_pkts += cqe_count;
+	rq->stats->cqe_compress_pkts += cqe_count;
 
 	return cqe_count;
 }
@@ -176,14 +176,15 @@ static inline bool mlx5e_rx_cache_put(struct mlx5e_rq *rq,
 {
 	struct mlx5e_page_cache *cache = &rq->page_cache;
 	u32 tail_next = (cache->tail + 1) & (MLX5E_CACHE_SIZE - 1);
+	struct mlx5e_rq_stats *stats = rq->stats;
 
 	if (tail_next == cache->head) {
-		rq->stats.cache_full++;
+		stats->cache_full++;
 		return false;
 	}
 
 	if (unlikely(mlx5e_page_is_reserved(dma_info->page))) {
-		rq->stats.cache_waive++;
+		stats->cache_waive++;
 		return false;
 	}
 
@@ -196,20 +197,21 @@ static inline bool mlx5e_rx_cache_get(struct mlx5e_rq *rq,
 				      struct mlx5e_dma_info *dma_info)
 {
 	struct mlx5e_page_cache *cache = &rq->page_cache;
+	struct mlx5e_rq_stats *stats = rq->stats;
 
 	if (unlikely(cache->head == cache->tail)) {
-		rq->stats.cache_empty++;
+		stats->cache_empty++;
 		return false;
 	}
 
 	if (page_ref_count(cache->page_cache[cache->head].page) != 1) {
-		rq->stats.cache_busy++;
+		stats->cache_busy++;
 		return false;
 	}
 
 	*dma_info = cache->page_cache[cache->head];
 	cache->head = (cache->head + 1) & (MLX5E_CACHE_SIZE - 1);
-	rq->stats.cache_reuse++;
+	stats->cache_reuse++;
 
 	dma_sync_single_for_device(rq->pdev, dma_info->addr,
 				   RQ_PAGE_SIZE(rq),
@@ -294,7 +296,7 @@ static inline void mlx5e_free_rx_wqe_reuse(struct mlx5e_rq *rq,
 					   struct mlx5e_wqe_frag_info *wi)
 {
 	if (mlx5e_page_reuse(rq, wi)) {
-		rq->stats.page_reuse++;
+		rq->stats->page_reuse++;
 		return;
 	}
 
@@ -452,7 +454,7 @@ err_unmap:
 		dma_info--;
 		mlx5e_page_release(rq, dma_info, true);
 	}
-	rq->stats.buff_alloc_err++;
+	rq->stats->buff_alloc_err++;
 
 	return err;
 }
@@ -480,7 +482,7 @@ bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 
 		err = mlx5e_alloc_rx_wqe(rq, wqe, wq->head);
 		if (unlikely(err)) {
-			rq->stats.buff_alloc_err++;
+			rq->stats->buff_alloc_err++;
 			break;
 		}
 
@@ -652,6 +654,7 @@ static inline void mlx5e_handle_csum(struct net_device *netdev,
 				     struct sk_buff *skb,
 				     bool   lro)
 {
+	struct mlx5e_rq_stats *stats = rq->stats;
 	int network_depth = 0;
 
 	if (unlikely(!(netdev->features & NETIF_F_RXCSUM)))
@@ -659,7 +662,7 @@ static inline void mlx5e_handle_csum(struct net_device *netdev,
 
 	if (lro) {
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
-		rq->stats.csum_unnecessary++;
+		stats->csum_unnecessary++;
 		return;
 	}
 
@@ -674,7 +677,7 @@ static inline void mlx5e_handle_csum(struct net_device *netdev,
 			skb->csum = csum_partial(skb->data + ETH_HLEN,
 						 network_depth - ETH_HLEN,
 						 skb->csum);
-		rq->stats.csum_complete++;
+		stats->csum_complete++;
 		return;
 	}
 
@@ -684,15 +687,15 @@ static inline void mlx5e_handle_csum(struct net_device *netdev,
 		if (cqe_is_tunneled(cqe)) {
 			skb->csum_level = 1;
 			skb->encapsulation = 1;
-			rq->stats.csum_unnecessary_inner++;
+			stats->csum_unnecessary_inner++;
 			return;
 		}
-		rq->stats.csum_unnecessary++;
+		stats->csum_unnecessary++;
 		return;
 	}
 csum_none:
 	skb->ip_summed = CHECKSUM_NONE;
-	rq->stats.csum_none++;
+	stats->csum_none++;
 }
 
 static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe,
@@ -701,6 +704,7 @@ static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe,
 				      struct sk_buff *skb)
 {
 	u8 lro_num_seg = be32_to_cpu(cqe->srqn) >> 24;
+	struct mlx5e_rq_stats *stats = rq->stats;
 	struct net_device *netdev = rq->netdev;
 
 	skb->mac_len = ETH_HLEN;
@@ -710,9 +714,9 @@ static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe,
 		/* Subtract one since we already counted this as one
 		 * "regular" packet in mlx5e_complete_rx_cqe()
 		 */
-		rq->stats.packets += lro_num_seg - 1;
-		rq->stats.lro_packets++;
-		rq->stats.lro_bytes += cqe_bcnt;
+		stats->packets += lro_num_seg - 1;
+		stats->lro_packets++;
+		stats->lro_bytes += cqe_bcnt;
 	}
 
 	if (unlikely(mlx5e_rx_hw_stamp(rq->tstamp)))
@@ -727,7 +731,7 @@ static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe,
 	if (cqe_has_vlan(cqe)) {
 		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
 				       be16_to_cpu(cqe->vlan_info));
-		rq->stats.removed_vlan_packets++;
+		stats->removed_vlan_packets++;
 	}
 
 	skb->mark = be32_to_cpu(cqe->sop_drop_qpn) & MLX5E_TC_FLOW_ID_MASK;
@@ -741,8 +745,10 @@ static inline void mlx5e_complete_rx_cqe(struct mlx5e_rq *rq,
 					 u32 cqe_bcnt,
 					 struct sk_buff *skb)
 {
-	rq->stats.packets++;
-	rq->stats.bytes += cqe_bcnt;
+	struct mlx5e_rq_stats *stats = rq->stats;
+
+	stats->packets++;
+	stats->bytes += cqe_bcnt;
 	mlx5e_build_rx_skb(cqe, cqe_bcnt, rq, skb);
 }
 
@@ -774,10 +780,12 @@ static inline bool mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq,
 	dma_addr_t dma_addr  = di->addr + data_offset;
 	unsigned int dma_len = xdp->data_end - xdp->data;
 
+	struct mlx5e_rq_stats *stats = rq->stats;
+
 	prefetchw(wqe);
 
 	if (unlikely(dma_len < MLX5E_XDP_MIN_INLINE || rq->hw_mtu < dma_len)) {
-		rq->stats.xdp_drop++;
+		stats->xdp_drop++;
 		return false;
 	}
 
@@ -787,7 +795,7 @@ static inline bool mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq,
 			mlx5e_xmit_xdp_doorbell(sq);
 			sq->db.doorbell = false;
 		}
-		rq->stats.xdp_tx_full++;
+		stats->xdp_tx_full++;
 		return false;
 	}
 
@@ -821,7 +829,7 @@ static inline bool mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq,
 
 	sq->db.doorbell = true;
 
-	rq->stats.xdp_tx++;
+	stats->xdp_tx++;
 	return true;
 }
 
@@ -868,7 +876,7 @@ static inline bool mlx5e_xdp_handle(struct mlx5e_rq *rq,
 	case XDP_ABORTED:
 		trace_xdp_exception(rq->netdev, prog, act);
 	case XDP_DROP:
-		rq->stats.xdp_drop++;
+		rq->stats->xdp_drop++;
 		return true;
 	}
 }
@@ -881,7 +889,7 @@ struct sk_buff *mlx5e_build_linear_skb(struct mlx5e_rq *rq, void *va,
 	struct sk_buff *skb = build_skb(va, frag_size);
 
 	if (unlikely(!skb)) {
-		rq->stats.buff_alloc_err++;
+		rq->stats->buff_alloc_err++;
 		return NULL;
 	}
 
@@ -913,7 +921,7 @@ struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
 	wi->offset += frag_size;
 
 	if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) {
-		rq->stats.wqe_err++;
+		rq->stats->wqe_err++;
 		return NULL;
 	}
 
@@ -1030,7 +1038,7 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
 	skb = napi_alloc_skb(rq->cq.napi,
 			     ALIGN(MLX5_MPWRQ_SMALL_PACKET_THRESHOLD, sizeof(long)));
 	if (unlikely(!skb)) {
-		rq->stats.buff_alloc_err++;
+		rq->stats->buff_alloc_err++;
 		return NULL;
 	}
 
@@ -1116,12 +1124,12 @@ void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	wi->consumed_strides += cstrides;
 
 	if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) {
-		rq->stats.wqe_err++;
+		rq->stats->wqe_err++;
 		goto mpwrq_cqe_out;
 	}
 
 	if (unlikely(mpwrq_is_filler_cqe(cqe))) {
-		rq->stats.mpwqe_filler++;
+		rq->stats->mpwqe_filler++;
 		goto mpwrq_cqe_out;
 	}
 
@@ -1276,6 +1284,7 @@ static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq,
 					 u32 cqe_bcnt,
 					 struct sk_buff *skb)
 {
+	struct mlx5e_rq_stats *stats = rq->stats;
 	struct hwtstamp_config *tstamp;
 	struct net_device *netdev;
 	struct mlx5e_priv *priv;
@@ -1337,9 +1346,9 @@ static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq,
 
 	skb->dev = netdev;
 
-	rq->stats.csum_complete++;
-	rq->stats.packets++;
-	rq->stats.bytes += cqe_bcnt;
+	stats->csum_complete++;
+	stats->packets++;
+	stats->bytes += cqe_bcnt;
 }
 
 void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
index 323f2af4200b..776b4d68e156 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
@@ -111,20 +111,19 @@ static int mlx5e_grp_sw_fill_stats(struct mlx5e_priv *priv, u64 *data, int idx)
 void mlx5e_grp_sw_update_stats(struct mlx5e_priv *priv)
 {
 	struct mlx5e_sw_stats temp, *s = &temp;
-	struct mlx5e_rq_stats *rq_stats;
-	struct mlx5e_sq_stats *sq_stats;
-	struct mlx5e_ch_stats *ch_stats;
-	int i, j;
+	int i;
 
 	memset(s, 0, sizeof(*s));
 	read_lock(&priv->stats_lock);
 	if (!priv->channels_active)
 		goto out;
-	for (i = 0; i < priv->channels.num; i++) {
-		struct mlx5e_channel *c = priv->channels.c[i];
 
-		rq_stats = &c->rq.stats;
-		ch_stats = &c->stats;
+	for (i = 0; i < priv->profile->max_nch(priv->mdev); i++) {
+		struct mlx5e_channel_stats *channel_stats =
+			&priv->channel_stats[i];
+		struct mlx5e_rq_stats *rq_stats = &channel_stats->rq;
+		struct mlx5e_ch_stats *ch_stats = &channel_stats->ch;
+		int j;
 
 		s->rx_packets	+= rq_stats->packets;
 		s->rx_bytes	+= rq_stats->bytes;
@@ -151,8 +150,8 @@ void mlx5e_grp_sw_update_stats(struct mlx5e_priv *priv)
 		s->rx_cache_waive += rq_stats->cache_waive;
 		s->ch_eq_rearm += ch_stats->eq_rearm;
 
-		for (j = 0; j < priv->channels.params.num_tc; j++) {
-			sq_stats = &c->sq[j].stats;
+		for (j = 0; j < priv->max_opened_tc; j++) {
+			struct mlx5e_sq_stats *sq_stats = &channel_stats->sq[j];
 
 			s->tx_packets		+= sq_stats->packets;
 			s->tx_bytes		+= sq_stats->bytes;
@@ -1160,30 +1159,37 @@ static const struct counter_desc ch_stats_desc[] = {
 
 static int mlx5e_grp_channels_get_num_stats(struct mlx5e_priv *priv)
 {
-	return (NUM_RQ_STATS * priv->channels.num) +
-		(NUM_CH_STATS * priv->channels.num) +
-		(NUM_SQ_STATS * priv->channels.num * priv->channels.params.num_tc);
+	int max_nch = priv->profile->max_nch(priv->mdev);
+
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
+		return 0;
+
+	return (NUM_RQ_STATS * max_nch) +
+	       (NUM_CH_STATS * max_nch) +
+	       (NUM_SQ_STATS * max_nch * priv->max_opened_tc);
 }
 
 static int mlx5e_grp_channels_fill_strings(struct mlx5e_priv *priv, u8 *data,
 					   int idx)
 {
+	int max_nch = priv->profile->max_nch(priv->mdev);
 	int i, j, tc;
 
 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
 		return idx;
 
-	for (i = 0; i < priv->channels.num; i++)
+	for (i = 0; i < max_nch; i++)
 		for (j = 0; j < NUM_CH_STATS; j++)
 			sprintf(data + (idx++) * ETH_GSTRING_LEN,
 				ch_stats_desc[j].format, i);
 
-	for (i = 0; i < priv->channels.num; i++)
+	for (i = 0; i < max_nch; i++)
 		for (j = 0; j < NUM_RQ_STATS; j++)
 			sprintf(data + (idx++) * ETH_GSTRING_LEN, rq_stats_desc[j].format, i);
 
-	for (tc = 0; tc < priv->channels.params.num_tc; tc++)
-		for (i = 0; i < priv->channels.num; i++)
+	/* priv->channel_tc2txq[i][tc] is valid only when device is open */
+	for (tc = 0; tc < priv->max_opened_tc; tc++)
+		for (i = 0; i < max_nch; i++)
 			for (j = 0; j < NUM_SQ_STATS; j++)
 				sprintf(data + (idx++) * ETH_GSTRING_LEN,
 					sq_stats_desc[j].format,
@@ -1195,29 +1201,29 @@ static int mlx5e_grp_channels_fill_strings(struct mlx5e_priv *priv, u8 *data,
 static int mlx5e_grp_channels_fill_stats(struct mlx5e_priv *priv, u64 *data,
 					 int idx)
 {
-	struct mlx5e_channels *channels = &priv->channels;
+	int max_nch = priv->profile->max_nch(priv->mdev);
 	int i, j, tc;
 
 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
 		return idx;
 
-	for (i = 0; i < channels->num; i++)
+	for (i = 0; i < max_nch; i++)
 		for (j = 0; j < NUM_CH_STATS; j++)
 			data[idx++] =
-				MLX5E_READ_CTR64_CPU(&channels->c[i]->stats,
+				MLX5E_READ_CTR64_CPU(&priv->channel_stats[i].ch,
 						     ch_stats_desc, j);
 
-	for (i = 0; i < channels->num; i++)
+	for (i = 0; i < max_nch; i++)
 		for (j = 0; j < NUM_RQ_STATS; j++)
 			data[idx++] =
-				MLX5E_READ_CTR64_CPU(&channels->c[i]->rq.stats,
+				MLX5E_READ_CTR64_CPU(&priv->channel_stats[i].rq,
 						     rq_stats_desc, j);
 
-	for (tc = 0; tc < priv->channels.params.num_tc; tc++)
-		for (i = 0; i < channels->num; i++)
+	for (tc = 0; tc < priv->max_opened_tc; tc++)
+		for (i = 0; i < max_nch; i++)
 			for (j = 0; j < NUM_SQ_STATS; j++)
 				data[idx++] =
-					MLX5E_READ_CTR64_CPU(&channels->c[i]->sq[tc].stats,
+					MLX5E_READ_CTR64_CPU(&priv->channel_stats[i].sq[tc],
 							     sq_stats_desc, j);
 
 	return idx;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index d37566be06e1..aafd75257fd0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -220,28 +220,29 @@ mlx5e_txwqe_build_eseg_csum(struct mlx5e_txqsq *sq, struct sk_buff *skb, struct
 		if (skb->encapsulation) {
 			eseg->cs_flags |= MLX5_ETH_WQE_L3_INNER_CSUM |
 					  MLX5_ETH_WQE_L4_INNER_CSUM;
-			sq->stats.csum_partial_inner++;
+			sq->stats->csum_partial_inner++;
 		} else {
 			eseg->cs_flags |= MLX5_ETH_WQE_L4_CSUM;
-			sq->stats.csum_partial++;
+			sq->stats->csum_partial++;
 		}
 	} else
-		sq->stats.csum_none++;
+		sq->stats->csum_none++;
 }
 
 static inline u16
 mlx5e_tx_get_gso_ihs(struct mlx5e_txqsq *sq, struct sk_buff *skb)
 {
+	struct mlx5e_sq_stats *stats = sq->stats;
 	u16 ihs;
 
 	if (skb->encapsulation) {
 		ihs = skb_inner_transport_offset(skb) + inner_tcp_hdrlen(skb);
-		sq->stats.tso_inner_packets++;
-		sq->stats.tso_inner_bytes += skb->len - ihs;
+		stats->tso_inner_packets++;
+		stats->tso_inner_bytes += skb->len - ihs;
 	} else {
 		ihs = skb_transport_offset(skb) + tcp_hdrlen(skb);
-		sq->stats.tso_packets++;
-		sq->stats.tso_bytes += skb->len - ihs;
+		stats->tso_packets++;
+		stats->tso_bytes += skb->len - ihs;
 	}
 
 	return ihs;
@@ -311,7 +312,7 @@ static inline void mlx5e_fill_sq_frag_edge(struct mlx5e_txqsq *sq,
 		wi->num_wqebbs = 1;
 		mlx5e_post_nop(wq, sq->sqn, &sq->pc);
 	}
-	sq->stats.nop += nnops;
+	sq->stats->nop += nnops;
 }
 
 static inline void
@@ -337,7 +338,7 @@ mlx5e_txwqe_complete(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 	sq->pc += wi->num_wqebbs;
 	if (unlikely(!mlx5e_wqc_has_room_for(wq, sq->cc, sq->pc, MLX5E_SQ_STOP_ROOM))) {
 		netif_tx_stop_queue(sq->txq);
-		sq->stats.stopped++;
+		sq->stats->stopped++;
 	}
 
 	if (!skb->xmit_more || netif_xmit_stopped(sq->txq))
@@ -355,6 +356,7 @@ netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 	struct mlx5_wqe_data_seg *dseg;
 	struct mlx5e_tx_wqe_info *wi;
 
+	struct mlx5e_sq_stats *stats = sq->stats;
 	unsigned char *skb_data = skb->data;
 	unsigned int skb_len = skb->len;
 	u16 ds_cnt, ds_cnt_inl = 0;
@@ -371,17 +373,17 @@ netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 		mss       = cpu_to_be16(skb_shinfo(skb)->gso_size);
 		ihs       = mlx5e_tx_get_gso_ihs(sq, skb);
 		num_bytes = skb->len + (skb_shinfo(skb)->gso_segs - 1) * ihs;
-		sq->stats.packets += skb_shinfo(skb)->gso_segs;
+		stats->packets += skb_shinfo(skb)->gso_segs;
 	} else {
 		opcode    = MLX5_OPCODE_SEND;
 		mss       = 0;
 		ihs       = mlx5e_calc_min_inline(sq->min_inline_mode, skb);
 		num_bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
-		sq->stats.packets++;
+		stats->packets++;
 	}
 
-	sq->stats.bytes     += num_bytes;
-	sq->stats.xmit_more += skb->xmit_more;
+	stats->bytes     += num_bytes;
+	stats->xmit_more += skb->xmit_more;
 
 	headlen = skb_len - ihs - skb->data_len;
 	ds_cnt += !!headlen;
@@ -415,7 +417,7 @@ netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 		if (skb_vlan_tag_present(skb)) {
 			mlx5e_insert_vlan(eseg->inline_hdr.start, skb,
 					  ihs - VLAN_HLEN, &skb_data, &skb_len);
-			sq->stats.added_vlan_packets++;
+			stats->added_vlan_packets++;
 		} else {
 			memcpy(eseg->inline_hdr.start, skb_data, ihs);
 			mlx5e_tx_skb_pull_inline(&skb_data, &skb_len, ihs);
@@ -427,7 +429,7 @@ netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 		if (skb->vlan_proto == cpu_to_be16(ETH_P_8021AD))
 			eseg->insert.type |= cpu_to_be16(MLX5_ETH_WQE_SVLAN);
 		eseg->insert.vlan_tci = cpu_to_be16(skb_vlan_tag_get(skb));
-		sq->stats.added_vlan_packets++;
+		stats->added_vlan_packets++;
 	}
 
 	num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb_data, headlen, dseg);
@@ -440,7 +442,7 @@ netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 	return NETDEV_TX_OK;
 
 err_drop:
-	sq->stats.dropped++;
+	stats->dropped++;
 	dev_kfree_skb_any(skb);
 
 	return NETDEV_TX_OK;
@@ -524,7 +526,7 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget)
 				queue_work(cq->channel->priv->wq,
 					   &sq->recover.recover_work);
 			}
-			sq->stats.cqe_err++;
+			sq->stats->cqe_err++;
 		}
 
 		do {
@@ -584,7 +586,7 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget)
 				   MLX5E_SQ_STOP_ROOM) &&
 	    !test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) {
 		netif_tx_wake_queue(sq->txq);
-		sq->stats.wake++;
+		sq->stats->wake++;
 	}
 
 	return (i == MLX5E_TX_CQ_POLL_BUDGET);
@@ -641,6 +643,7 @@ netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 	struct mlx5_wqe_data_seg *dseg;
 	struct mlx5e_tx_wqe_info *wi;
 
+	struct mlx5e_sq_stats *stats = sq->stats;
 	unsigned char *skb_data = skb->data;
 	unsigned int skb_len = skb->len;
 	u16 headlen, ihs, pi, frag_pi;
@@ -659,17 +662,17 @@ netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 		mss       = cpu_to_be16(skb_shinfo(skb)->gso_size);
 		ihs       = mlx5e_tx_get_gso_ihs(sq, skb);
 		num_bytes = skb->len + (skb_shinfo(skb)->gso_segs - 1) * ihs;
-		sq->stats.packets += skb_shinfo(skb)->gso_segs;
+		stats->packets += skb_shinfo(skb)->gso_segs;
 	} else {
 		opcode    = MLX5_OPCODE_SEND;
 		mss       = 0;
 		ihs       = mlx5e_calc_min_inline(sq->min_inline_mode, skb);
 		num_bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
-		sq->stats.packets++;
+		stats->packets++;
 	}
 
-	sq->stats.bytes     += num_bytes;
-	sq->stats.xmit_more += skb->xmit_more;
+	stats->bytes     += num_bytes;
+	stats->xmit_more += skb->xmit_more;
 
 	headlen = skb_len - ihs - skb->data_len;
 	ds_cnt += !!headlen;
@@ -716,7 +719,7 @@ netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 	return NETDEV_TX_OK;
 
 err_drop:
-	sq->stats.dropped++;
+	stats->dropped++;
 	dev_kfree_skb_any(skb);
 
 	return NETDEV_TX_OK;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index 5d6f9ce2bf80..1b17f682693b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -46,24 +46,26 @@ static inline bool mlx5e_channel_no_affinity_change(struct mlx5e_channel *c)
 
 static void mlx5e_handle_tx_dim(struct mlx5e_txqsq *sq)
 {
+	struct mlx5e_sq_stats *stats = sq->stats;
 	struct net_dim_sample dim_sample;
 
 	if (unlikely(!test_bit(MLX5E_SQ_STATE_AM, &sq->state)))
 		return;
 
-	net_dim_sample(sq->cq.event_ctr, sq->stats.packets, sq->stats.bytes,
+	net_dim_sample(sq->cq.event_ctr, stats->packets, stats->bytes,
 		       &dim_sample);
 	net_dim(&sq->dim, dim_sample);
 }
 
 static void mlx5e_handle_rx_dim(struct mlx5e_rq *rq)
 {
+	struct mlx5e_rq_stats *stats = rq->stats;
 	struct net_dim_sample dim_sample;
 
 	if (unlikely(!test_bit(MLX5E_RQ_STATE_AM, &rq->state)))
 		return;
 
-	net_dim_sample(rq->cq.event_ctr, rq->stats.packets, rq->stats.bytes,
+	net_dim_sample(rq->cq.event_ctr, stats->packets, stats->bytes,
 		       &dim_sample);
 	net_dim(&rq->dim, dim_sample);
 }
-- 
cgit v1.2.3-59-g8ed1b


From a493f5f9d8c2e87f2d1322c01e477502b3cd11c5 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Fri, 25 May 2018 10:23:13 -0700
Subject: libbpf: Install btf.h with libbpf

install_headers target should contain all headers that are part of
libbpf. Add missing btf.h

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
index f3fab4af4260..5390e7725e43 100644
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@@ -189,6 +189,7 @@ install_headers:
 	$(call QUIET_INSTALL, headers) \
 		$(call do_install,bpf.h,$(prefix)/include/bpf,644); \
 		$(call do_install,libbpf.h,$(prefix)/include/bpf,644);
+		$(call do_install,btf.h,$(prefix)/include/bpf,644);
 
 install: install_lib
 
-- 
cgit v1.2.3-59-g8ed1b


From 53c8036cb715f3577a7fe1db6e6ad06e8697b36f Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 25 May 2018 23:33:19 +0200
Subject: bpf: btf: avoid -Wreturn-type warning

gcc warns about a noreturn function possibly returning in
some configurations:

kernel/bpf/btf.c: In function 'env_type_is_resolve_sink':
kernel/bpf/btf.c:729:1: error: control reaches end of non-void function [-Werror=return-type]

Using BUG() instead of BUG_ON() avoids that warning and otherwise
does the exact same thing.

Fixes: eb3f595dab40 ("bpf: btf: Validate type reference")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/btf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 7e90fd13b5b5..3d20aa1f4b54 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -749,7 +749,7 @@ static bool env_type_is_resolve_sink(const struct btf_verifier_env *env,
 			!btf_type_is_array(next_type) &&
 			!btf_type_is_struct(next_type);
 	default:
-		BUG_ON(1);
+		BUG();
 	}
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From dc3b8ae9d271897e09b27fa4e4e0000de98590d1 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 25 May 2018 23:33:20 +0200
Subject: bpf: avoid -Wmaybe-uninitialized warning

The stack_map_get_build_id_offset() function is too long for gcc to track
whether 'work' may or may not be initialized at the end of it, leading
to a false-positive warning:

kernel/bpf/stackmap.c: In function 'stack_map_get_build_id_offset':
kernel/bpf/stackmap.c:334:13: error: 'work' may be used uninitialized in this function [-Werror=maybe-uninitialized]

This removes the 'in_nmi_ctx' flag and uses the state of that variable
itself to see if it got initialized.

Fixes: bae77c5eb5b2 ("bpf: enable stackmap with build_id in nmi context")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/stackmap.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index b59ace0f0f09..b675a3f3d141 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -285,11 +285,10 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 {
 	int i;
 	struct vm_area_struct *vma;
-	bool in_nmi_ctx = in_nmi();
 	bool irq_work_busy = false;
-	struct stack_map_irq_work *work;
+	struct stack_map_irq_work *work = NULL;
 
-	if (in_nmi_ctx) {
+	if (in_nmi()) {
 		work = this_cpu_ptr(&up_read_work);
 		if (work->irq_work.flags & IRQ_WORK_BUSY)
 			/* cannot queue more up_read, fallback */
@@ -328,7 +327,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
 	}
 
-	if (!in_nmi_ctx) {
+	if (!work) {
 		up_read(&current->mm->mmap_sem);
 	} else {
 		work->sem = &current->mm->mmap_sem;
-- 
cgit v1.2.3-59-g8ed1b


From 3b296633ae6b91bfff815eb005d220b6ad4a47b1 Mon Sep 17 00:00:00 2001
From: Mathieu Xhonneux <m.xhonneux@gmail.com>
Date: Sat, 26 May 2018 15:44:08 +0100
Subject: selftests/bpf: missing headers test_lwt_seg6local

Previous patch "selftests/bpf: test for seg6local End.BPF action" lacks
some UAPI headers in tools/.

clang -I. -I./include/uapi -I../../../include/uapi -idirafter
/usr/local/include -idirafter
/data/users/yhs/work/llvm/build/install/lib/clang/7.0.0/include
-idirafter /usr/include -Wno-compare-distinct-pointer-types \
         -O2 -target bpf -emit-llvm -c test_lwt_seg6local.c -o - |      \
llc -march=bpf -mcpu=generic  -filetype=obj -o
[...]/net-next/tools/testing/selftests/bpf/test_lwt_seg6local.o
test_lwt_seg6local.c:4:10: fatal error: 'linux/seg6_local.h' file not found
         ^~~~~~~~~~~~~~~~~~~~
1 error generated.
make: Leaving directory
`/data/users/yhs/work/net-next/tools/testing/selftests/bpf'

v2: moving the headers to tools/include/uapi/.

Reported-by: Y Song <ys114321@gmail.com>
Signed-off-by: Mathieu Xhonneux <m.xhonneux@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/include/uapi/linux/seg6.h       | 55 ++++++++++++++++++++++++
 tools/include/uapi/linux/seg6_local.h | 80 +++++++++++++++++++++++++++++++++++
 2 files changed, 135 insertions(+)
 create mode 100644 tools/include/uapi/linux/seg6.h
 create mode 100644 tools/include/uapi/linux/seg6_local.h

diff --git a/tools/include/uapi/linux/seg6.h b/tools/include/uapi/linux/seg6.h
new file mode 100644
index 000000000000..286e8d6a8e98
--- /dev/null
+++ b/tools/include/uapi/linux/seg6.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+/*
+ *  SR-IPv6 implementation
+ *
+ *  Author:
+ *  David Lebrun <david.lebrun@uclouvain.be>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _UAPI_LINUX_SEG6_H
+#define _UAPI_LINUX_SEG6_H
+
+#include <linux/types.h>
+#include <linux/in6.h>		/* For struct in6_addr. */
+
+/*
+ * SRH
+ */
+struct ipv6_sr_hdr {
+	__u8	nexthdr;
+	__u8	hdrlen;
+	__u8	type;
+	__u8	segments_left;
+	__u8	first_segment; /* Represents the last_entry field of SRH */
+	__u8	flags;
+	__u16	tag;
+
+	struct in6_addr segments[0];
+};
+
+#define SR6_FLAG1_PROTECTED	(1 << 6)
+#define SR6_FLAG1_OAM		(1 << 5)
+#define SR6_FLAG1_ALERT		(1 << 4)
+#define SR6_FLAG1_HMAC		(1 << 3)
+
+#define SR6_TLV_INGRESS		1
+#define SR6_TLV_EGRESS		2
+#define SR6_TLV_OPAQUE		3
+#define SR6_TLV_PADDING		4
+#define SR6_TLV_HMAC		5
+
+#define sr_has_hmac(srh) ((srh)->flags & SR6_FLAG1_HMAC)
+
+struct sr6_tlv {
+	__u8 type;
+	__u8 len;
+	__u8 data[0];
+};
+
+#endif
diff --git a/tools/include/uapi/linux/seg6_local.h b/tools/include/uapi/linux/seg6_local.h
new file mode 100644
index 000000000000..edc138bdc56d
--- /dev/null
+++ b/tools/include/uapi/linux/seg6_local.h
@@ -0,0 +1,80 @@
+/*
+ *  SR-IPv6 implementation
+ *
+ *  Author:
+ *  David Lebrun <david.lebrun@uclouvain.be>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _UAPI_LINUX_SEG6_LOCAL_H
+#define _UAPI_LINUX_SEG6_LOCAL_H
+
+#include <linux/seg6.h>
+
+enum {
+	SEG6_LOCAL_UNSPEC,
+	SEG6_LOCAL_ACTION,
+	SEG6_LOCAL_SRH,
+	SEG6_LOCAL_TABLE,
+	SEG6_LOCAL_NH4,
+	SEG6_LOCAL_NH6,
+	SEG6_LOCAL_IIF,
+	SEG6_LOCAL_OIF,
+	SEG6_LOCAL_BPF,
+	__SEG6_LOCAL_MAX,
+};
+#define SEG6_LOCAL_MAX (__SEG6_LOCAL_MAX - 1)
+
+enum {
+	SEG6_LOCAL_ACTION_UNSPEC	= 0,
+	/* node segment */
+	SEG6_LOCAL_ACTION_END		= 1,
+	/* adjacency segment (IPv6 cross-connect) */
+	SEG6_LOCAL_ACTION_END_X		= 2,
+	/* lookup of next seg NH in table */
+	SEG6_LOCAL_ACTION_END_T		= 3,
+	/* decap and L2 cross-connect */
+	SEG6_LOCAL_ACTION_END_DX2	= 4,
+	/* decap and IPv6 cross-connect */
+	SEG6_LOCAL_ACTION_END_DX6	= 5,
+	/* decap and IPv4 cross-connect */
+	SEG6_LOCAL_ACTION_END_DX4	= 6,
+	/* decap and lookup of DA in v6 table */
+	SEG6_LOCAL_ACTION_END_DT6	= 7,
+	/* decap and lookup of DA in v4 table */
+	SEG6_LOCAL_ACTION_END_DT4	= 8,
+	/* binding segment with insertion */
+	SEG6_LOCAL_ACTION_END_B6	= 9,
+	/* binding segment with encapsulation */
+	SEG6_LOCAL_ACTION_END_B6_ENCAP	= 10,
+	/* binding segment with MPLS encap */
+	SEG6_LOCAL_ACTION_END_BM	= 11,
+	/* lookup last seg in table */
+	SEG6_LOCAL_ACTION_END_S		= 12,
+	/* forward to SR-unaware VNF with static proxy */
+	SEG6_LOCAL_ACTION_END_AS	= 13,
+	/* forward to SR-unaware VNF with masquerading */
+	SEG6_LOCAL_ACTION_END_AM	= 14,
+	/* custom BPF action */
+	SEG6_LOCAL_ACTION_END_BPF	= 15,
+
+	__SEG6_LOCAL_ACTION_MAX,
+};
+
+#define SEG6_LOCAL_ACTION_MAX (__SEG6_LOCAL_ACTION_MAX - 1)
+
+enum {
+	SEG6_LOCAL_BPF_PROG_UNSPEC,
+	SEG6_LOCAL_BPF_PROG,
+	SEG6_LOCAL_BPF_PROG_NAME,
+	__SEG6_LOCAL_BPF_PROG_MAX,
+};
+
+#define SEG6_LOCAL_BPF_PROG_MAX (__SEG6_LOCAL_BPF_PROG_MAX - 1)
+
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From 13193b0f392f5a65d0d54185cb95ed5e99c0a5bf Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Fri, 25 May 2018 08:55:22 -0700
Subject: bpf: Define cgroup_bpf_enabled for CONFIG_CGROUP_BPF=n

Static key is used to enable/disable cgroup-bpf related code paths at
run time.

Though it's not defined when cgroup-bpf is disabled at compile time,
i.e. CONFIG_CGROUP_BPF=n, and if some code wants to use it, it has to do
this:

	#ifdef CONFIG_CGROUP_BPF
		if (cgroup_bpf_enabled) {
			/* ... some work ... */
		}
	#endif

This code can be simplified by setting cgroup_bpf_enabled to 0 for
CONFIG_CGROUP_BPF=n case:

	if (cgroup_bpf_enabled) {
		/* ... some work ... */
	}

And it aligns well with existing BPF_CGROUP_RUN_PROG_* macros that
defined for both states of CONFIG_CGROUP_BPF.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf-cgroup.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 30d15e64b993..de8e89a3758b 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -185,6 +185,7 @@ struct cgroup_bpf {};
 static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
 static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
 
+#define cgroup_bpf_enabled (0)
 #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
-- 
cgit v1.2.3-59-g8ed1b


From 1cedee13d25ab118d325f95588c1a084e9317229 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Fri, 25 May 2018 08:55:23 -0700
Subject: bpf: Hooks for sys_sendmsg

In addition to already existing BPF hooks for sys_bind and sys_connect,
the patch provides new hooks for sys_sendmsg.

It leverages existing BPF program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR`
that provides access to socket itlself (properties like family, type,
protocol) and user-passed `struct sockaddr *` so that BPF program can
override destination IP and port for system calls such as sendto(2) or
sendmsg(2) and/or assign source IP to the socket.

The hooks are implemented as two new attach types:
`BPF_CGROUP_UDP4_SENDMSG` and `BPF_CGROUP_UDP6_SENDMSG` for UDPv4 and
UDPv6 correspondingly.

UDPv4 and UDPv6 separate attach types for same reason as sys_bind and
sys_connect hooks, i.e. to prevent reading from / writing to e.g.
user_ip6 fields when user passes sockaddr_in since it'd be out-of-bound.

The difference with already existing hooks is sys_sendmsg are
implemented only for unconnected UDP.

For TCP it doesn't make sense to change user-provided `struct sockaddr *`
at sendto(2)/sendmsg(2) time since socket either was already connected
and has source/destination set or wasn't connected and call to
sendto(2)/sendmsg(2) would lead to ENOTCONN anyway.

Connected UDP is already handled by sys_connect hooks that can override
source/destination at connect time and use fast-path later, i.e. these
hooks don't affect UDP fast-path.

Rewriting source IP is implemented differently than that in sys_connect
hooks. When sys_sendmsg is used with unconnected UDP it doesn't work to
just bind socket to desired local IP address since source IP can be set
on per-packet basis by using ancillary data (cmsg(3)). So no matter if
socket is bound or not, source IP has to be rewritten on every call to
sys_sendmsg.

To do so two new fields are added to UAPI `struct bpf_sock_addr`;
* `msg_src_ip4` to set source IPv4 for UDPv4;
* `msg_src_ip6` to set source IPv6 for UDPv6.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf-cgroup.h | 23 +++++++++++++++++------
 include/linux/filter.h     |  1 +
 include/uapi/linux/bpf.h   |  8 ++++++++
 kernel/bpf/cgroup.c        | 11 ++++++++++-
 kernel/bpf/syscall.c       |  8 ++++++++
 net/core/filter.c          | 39 +++++++++++++++++++++++++++++++++++++++
 net/ipv4/udp.c             | 20 ++++++++++++++++++--
 net/ipv6/udp.c             | 24 ++++++++++++++++++++++++
 8 files changed, 125 insertions(+), 9 deletions(-)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index de8e89a3758b..975fb4cf1bb7 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -66,7 +66,8 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
 
 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 				      struct sockaddr *uaddr,
-				      enum bpf_attach_type type);
+				      enum bpf_attach_type type,
+				      void *t_ctx);
 
 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 				     struct bpf_sock_ops_kern *sock_ops,
@@ -120,16 +121,18 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 ({									       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled)						       \
-		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type);    \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type,     \
+							  NULL);	       \
 	__ret;								       \
 })
 
-#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type)			       \
+#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx)		       \
 ({									       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled)	{					       \
 		lock_sock(sk);						       \
-		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type);    \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type,     \
+							  t_ctx);	       \
 		release_sock(sk);					       \
 	}								       \
 	__ret;								       \
@@ -151,10 +154,16 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT)
 
 #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr)		       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT)
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT, NULL)
 
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr)		       \
-	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT)
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT, NULL)
+
+#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx)		       \
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_SENDMSG, t_ctx)
+
+#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx)		       \
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_SENDMSG, t_ctx)
 
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops)				       \
 ({									       \
@@ -198,6 +207,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
 #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index d358d1815c16..d90abdaea94b 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1010,6 +1010,7 @@ struct bpf_sock_addr_kern {
 	 * only two (src and dst) are available at convert_ctx_access time
 	 */
 	u64 tmp_reg;
+	void *t_ctx;	/* Attach type specific context. */
 };
 
 struct bpf_sock_ops_kern {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 9b8c6e310e9a..cc68787f2d97 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -160,6 +160,8 @@ enum bpf_attach_type {
 	BPF_CGROUP_INET6_CONNECT,
 	BPF_CGROUP_INET4_POST_BIND,
 	BPF_CGROUP_INET6_POST_BIND,
+	BPF_CGROUP_UDP4_SENDMSG,
+	BPF_CGROUP_UDP6_SENDMSG,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -2363,6 +2365,12 @@ struct bpf_sock_addr {
 	__u32 family;		/* Allows 4-byte read, but no write */
 	__u32 type;		/* Allows 4-byte read, but no write */
 	__u32 protocol;		/* Allows 4-byte read, but no write */
+	__u32 msg_src_ip4;	/* Allows 1,2,4-byte read an 4-byte write.
+				 * Stored in network byte order.
+				 */
+	__u32 msg_src_ip6[4];	/* Allows 1,2,4-byte read an 4-byte write.
+				 * Stored in network byte order.
+				 */
 };
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 43171a0bb02b..f7c00bd6f8e4 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -500,6 +500,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
  * @sk: sock struct that will use sockaddr
  * @uaddr: sockaddr struct provided by user
  * @type: The type of program to be exectuted
+ * @t_ctx: Pointer to attach type specific context
  *
  * socket is expected to be of type INET or INET6.
  *
@@ -508,12 +509,15 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
  */
 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 				      struct sockaddr *uaddr,
-				      enum bpf_attach_type type)
+				      enum bpf_attach_type type,
+				      void *t_ctx)
 {
 	struct bpf_sock_addr_kern ctx = {
 		.sk = sk,
 		.uaddr = uaddr,
+		.t_ctx = t_ctx,
 	};
+	struct sockaddr_storage unspec;
 	struct cgroup *cgrp;
 	int ret;
 
@@ -523,6 +527,11 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
 		return 0;
 
+	if (!ctx.uaddr) {
+		memset(&unspec, 0, sizeof(unspec));
+		ctx.uaddr = (struct sockaddr *)&unspec;
+	}
+
 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 388d4feda348..e254526d6744 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1249,6 +1249,8 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
 		case BPF_CGROUP_INET6_BIND:
 		case BPF_CGROUP_INET4_CONNECT:
 		case BPF_CGROUP_INET6_CONNECT:
+		case BPF_CGROUP_UDP4_SENDMSG:
+		case BPF_CGROUP_UDP6_SENDMSG:
 			return 0;
 		default:
 			return -EINVAL;
@@ -1565,6 +1567,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_CGROUP_INET6_BIND:
 	case BPF_CGROUP_INET4_CONNECT:
 	case BPF_CGROUP_INET6_CONNECT:
+	case BPF_CGROUP_UDP4_SENDMSG:
+	case BPF_CGROUP_UDP6_SENDMSG:
 		ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
 		break;
 	case BPF_CGROUP_SOCK_OPS:
@@ -1635,6 +1639,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_CGROUP_INET6_BIND:
 	case BPF_CGROUP_INET4_CONNECT:
 	case BPF_CGROUP_INET6_CONNECT:
+	case BPF_CGROUP_UDP4_SENDMSG:
+	case BPF_CGROUP_UDP6_SENDMSG:
 		ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
 		break;
 	case BPF_CGROUP_SOCK_OPS:
@@ -1692,6 +1698,8 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_INET6_POST_BIND:
 	case BPF_CGROUP_INET4_CONNECT:
 	case BPF_CGROUP_INET6_CONNECT:
+	case BPF_CGROUP_UDP4_SENDMSG:
+	case BPF_CGROUP_UDP6_SENDMSG:
 	case BPF_CGROUP_SOCK_OPS:
 	case BPF_CGROUP_DEVICE:
 		break;
diff --git a/net/core/filter.c b/net/core/filter.c
index acf1f4fb99d1..24e6ce8be567 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5299,6 +5299,7 @@ static bool sock_addr_is_valid_access(int off, int size,
 		switch (prog->expected_attach_type) {
 		case BPF_CGROUP_INET4_BIND:
 		case BPF_CGROUP_INET4_CONNECT:
+		case BPF_CGROUP_UDP4_SENDMSG:
 			break;
 		default:
 			return false;
@@ -5308,6 +5309,24 @@ static bool sock_addr_is_valid_access(int off, int size,
 		switch (prog->expected_attach_type) {
 		case BPF_CGROUP_INET6_BIND:
 		case BPF_CGROUP_INET6_CONNECT:
+		case BPF_CGROUP_UDP6_SENDMSG:
+			break;
+		default:
+			return false;
+		}
+		break;
+	case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
+		switch (prog->expected_attach_type) {
+		case BPF_CGROUP_UDP4_SENDMSG:
+			break;
+		default:
+			return false;
+		}
+		break;
+	case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
+				msg_src_ip6[3]):
+		switch (prog->expected_attach_type) {
+		case BPF_CGROUP_UDP6_SENDMSG:
 			break;
 		default:
 			return false;
@@ -5318,6 +5337,9 @@ static bool sock_addr_is_valid_access(int off, int size,
 	switch (off) {
 	case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
 	case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
+	case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
+	case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
+				msg_src_ip6[3]):
 		/* Only narrow read access allowed for now. */
 		if (type == BPF_READ) {
 			bpf_ctx_record_field_size(info, size_default);
@@ -6072,6 +6094,23 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
 		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg,
 					SK_FL_PROTO_SHIFT);
 		break;
+
+	case offsetof(struct bpf_sock_addr, msg_src_ip4):
+		/* Treat t_ctx as struct in_addr for msg_src_ip4. */
+		SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+			struct bpf_sock_addr_kern, struct in_addr, t_ctx,
+			s_addr, BPF_SIZE(si->code), 0, tmp_reg);
+		break;
+
+	case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
+				msg_src_ip6[3]):
+		off = si->off;
+		off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]);
+		/* Treat t_ctx as struct in6_addr for msg_src_ip6. */
+		SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+			struct bpf_sock_addr_kern, struct in6_addr, t_ctx,
+			s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg);
+		break;
 	}
 
 	return insn - insn_buf;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index d71f1f3e1155..3c27d00b5730 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -901,6 +901,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct udp_sock *up = udp_sk(sk);
+	DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
 	struct flowi4 fl4_stack;
 	struct flowi4 *fl4;
 	int ulen = len;
@@ -955,8 +956,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	/*
 	 *	Get and verify the address.
 	 */
-	if (msg->msg_name) {
-		DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
+	if (usin) {
 		if (msg->msg_namelen < sizeof(*usin))
 			return -EINVAL;
 		if (usin->sin_family != AF_INET) {
@@ -1010,6 +1010,22 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		rcu_read_unlock();
 	}
 
+	if (cgroup_bpf_enabled && !connected) {
+		err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
+					    (struct sockaddr *)usin, &ipc.addr);
+		if (err)
+			goto out_free;
+		if (usin) {
+			if (usin->sin_port == 0) {
+				/* BPF program set invalid port. Reject it. */
+				err = -EINVAL;
+				goto out_free;
+			}
+			daddr = usin->sin_addr.s_addr;
+			dport = usin->sin_port;
+		}
+	}
+
 	saddr = ipc.addr;
 	ipc.addr = faddr = daddr;
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 426c9d2b418d..9f729a7b8cf0 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1316,6 +1316,29 @@ do_udp_sendmsg:
 		fl6.saddr = np->saddr;
 	fl6.fl6_sport = inet->inet_sport;
 
+	if (cgroup_bpf_enabled && !connected) {
+		err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
+					   (struct sockaddr *)sin6, &fl6.saddr);
+		if (err)
+			goto out_no_dst;
+		if (sin6) {
+			if (ipv6_addr_v4mapped(&sin6->sin6_addr)) {
+				/* BPF program rewrote IPv6-only by IPv4-mapped
+				 * IPv6. It's currently unsupported.
+				 */
+				err = -ENOTSUPP;
+				goto out_no_dst;
+			}
+			if (sin6->sin6_port == 0) {
+				/* BPF program set invalid port. Reject it. */
+				err = -EINVAL;
+				goto out_no_dst;
+			}
+			fl6.fl6_dport = sin6->sin6_port;
+			fl6.daddr = sin6->sin6_addr;
+		}
+	}
+
 	final_p = fl6_update_dst(&fl6, opt, &final);
 	if (final_p)
 		connected = false;
@@ -1395,6 +1418,7 @@ do_append_data:
 
 out:
 	dst_release(dst);
+out_no_dst:
 	fl6_sock_release(flowlabel);
 	txopt_put(opt_to_free);
 	if (!err)
-- 
cgit v1.2.3-59-g8ed1b


From 3024cf8248c49d9c57f56f6b650e8693f6e1bb57 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Fri, 25 May 2018 08:55:24 -0700
Subject: bpf: Sync bpf.h to tools/

Sync new `BPF_CGROUP_UDP4_SENDMSG` and `BPF_CGROUP_UDP6_SENDMSG`
attach types to tools/.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/include/uapi/linux/bpf.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 9b8c6e310e9a..cc68787f2d97 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -160,6 +160,8 @@ enum bpf_attach_type {
 	BPF_CGROUP_INET6_CONNECT,
 	BPF_CGROUP_INET4_POST_BIND,
 	BPF_CGROUP_INET6_POST_BIND,
+	BPF_CGROUP_UDP4_SENDMSG,
+	BPF_CGROUP_UDP6_SENDMSG,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -2363,6 +2365,12 @@ struct bpf_sock_addr {
 	__u32 family;		/* Allows 4-byte read, but no write */
 	__u32 type;		/* Allows 4-byte read, but no write */
 	__u32 protocol;		/* Allows 4-byte read, but no write */
+	__u32 msg_src_ip4;	/* Allows 1,2,4-byte read an 4-byte write.
+				 * Stored in network byte order.
+				 */
+	__u32 msg_src_ip6[4];	/* Allows 1,2,4-byte read an 4-byte write.
+				 * Stored in network byte order.
+				 */
 };
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
-- 
cgit v1.2.3-59-g8ed1b


From 72481f398c4fc6ca32013310c3dec0ef3e6e5bf2 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Fri, 25 May 2018 08:55:25 -0700
Subject: libbpf: Support guessing sendmsg{4,6} progs

libbpf can guess prog type and expected attach type based on section
name. Add hints for "cgroup/sendmsg4" and "cgroup/sendmsg6" section
names.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/libbpf.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index d20411ebfa2f..b1a60ac8424e 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -2043,6 +2043,8 @@ static const struct {
 	BPF_SA_PROG_SEC("cgroup/bind6",	BPF_CGROUP_INET6_BIND),
 	BPF_SA_PROG_SEC("cgroup/connect4", BPF_CGROUP_INET4_CONNECT),
 	BPF_SA_PROG_SEC("cgroup/connect6", BPF_CGROUP_INET6_CONNECT),
+	BPF_SA_PROG_SEC("cgroup/sendmsg4", BPF_CGROUP_UDP4_SENDMSG),
+	BPF_SA_PROG_SEC("cgroup/sendmsg6", BPF_CGROUP_UDP6_SENDMSG),
 	BPF_S_PROG_SEC("cgroup/post_bind4", BPF_CGROUP_INET4_POST_BIND),
 	BPF_S_PROG_SEC("cgroup/post_bind6", BPF_CGROUP_INET6_POST_BIND),
 };
-- 
cgit v1.2.3-59-g8ed1b


From 9be71aa6e56632195ef5b4ee20b87854262e6e85 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Fri, 25 May 2018 08:55:26 -0700
Subject: selftests/bpf: Prepare test_sock_addr for extension

test_sock_addr was not easy to extend since it was focused on sys_bind
and sys_connect quite a bit.

Reorganized it so that it'll be easier to cover new test-cases for
`BPF_PROG_TYPE_CGROUP_SOCK_ADDR`:

- decouple test-cases so that only one BPF prog is tested at a time;

- check programmatically that local IP:port for sys_bind, source IP and
  destination IP:port for sys_connect are rewritten property by tested
  BPF programs.

The output of new version:
  # test_sock_addr.sh 2>/dev/null
  Wait for testing IPv4/IPv6 to become available ... OK
  Test case: bind4: load prog with wrong expected attach type .. [PASS]
  Test case: bind4: attach prog with wrong attach type .. [PASS]
  Test case: bind4: rewrite IP & TCP port in .. [PASS]
  Test case: bind4: rewrite IP & UDP port in .. [PASS]
  Test case: bind6: load prog with wrong expected attach type .. [PASS]
  Test case: bind6: attach prog with wrong attach type .. [PASS]
  Test case: bind6: rewrite IP & TCP port in .. [PASS]
  Test case: bind6: rewrite IP & UDP port in .. [PASS]
  Test case: connect4: load prog with wrong expected attach type .. [PASS]
  Test case: connect4: attach prog with wrong attach type .. [PASS]
  Test case: connect4: rewrite IP & TCP port .. [PASS]
  Test case: connect4: rewrite IP & UDP port .. [PASS]
  Test case: connect6: load prog with wrong expected attach type .. [PASS]
  Test case: connect6: attach prog with wrong attach type .. [PASS]
  Test case: connect6: rewrite IP & TCP port .. [PASS]
  Test case: connect6: rewrite IP & UDP port .. [PASS]
  Summary: 16 PASSED, 0 FAILED

(stderr contains errors from libbpf when testing load/attach with
invalid arguments)

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_sock_addr.c | 655 +++++++++++++++++++--------
 1 file changed, 460 insertions(+), 195 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c
index 2950f80ba7fb..ed3e397a56f6 100644
--- a/tools/testing/selftests/bpf/test_sock_addr.c
+++ b/tools/testing/selftests/bpf/test_sock_addr.c
@@ -17,34 +17,292 @@
 #include "cgroup_helpers.h"
 #include "bpf_rlimit.h"
 
+#ifndef ARRAY_SIZE
+# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
 #define CG_PATH	"/foo"
 #define CONNECT4_PROG_PATH	"./connect4_prog.o"
 #define CONNECT6_PROG_PATH	"./connect6_prog.o"
 
 #define SERV4_IP		"192.168.1.254"
 #define SERV4_REWRITE_IP	"127.0.0.1"
+#define SRC4_REWRITE_IP		"127.0.0.4"
 #define SERV4_PORT		4040
 #define SERV4_REWRITE_PORT	4444
 
 #define SERV6_IP		"face:b00c:1234:5678::abcd"
 #define SERV6_REWRITE_IP	"::1"
+#define SRC6_REWRITE_IP		"::6"
 #define SERV6_PORT		6060
 #define SERV6_REWRITE_PORT	6666
 
 #define INET_NTOP_BUF	40
 
-typedef int (*load_fn)(enum bpf_attach_type, const char *comment);
+struct sock_addr_test;
+
+typedef int (*load_fn)(const struct sock_addr_test *test);
 typedef int (*info_fn)(int, struct sockaddr *, socklen_t *);
 
-struct program {
-	enum bpf_attach_type type;
-	load_fn	loadfn;
-	int fd;
-	const char *name;
-	enum bpf_attach_type invalid_type;
+char bpf_log_buf[BPF_LOG_BUF_SIZE];
+
+struct sock_addr_test {
+	const char *descr;
+	/* BPF prog properties */
+	load_fn loadfn;
+	enum bpf_attach_type expected_attach_type;
+	enum bpf_attach_type attach_type;
+	/* Socket properties */
+	int domain;
+	int type;
+	/* IP:port pairs for BPF prog to override */
+	const char *requested_ip;
+	unsigned short requested_port;
+	const char *expected_ip;
+	unsigned short expected_port;
+	const char *expected_src_ip;
+	/* Expected test result */
+	enum {
+		LOAD_REJECT,
+		ATTACH_REJECT,
+		SUCCESS,
+	} expected_result;
 };
 
-char bpf_log_buf[BPF_LOG_BUF_SIZE];
+static int bind4_prog_load(const struct sock_addr_test *test);
+static int bind6_prog_load(const struct sock_addr_test *test);
+static int connect4_prog_load(const struct sock_addr_test *test);
+static int connect6_prog_load(const struct sock_addr_test *test);
+
+static struct sock_addr_test tests[] = {
+	/* bind */
+	{
+		"bind4: load prog with wrong expected attach type",
+		bind4_prog_load,
+		BPF_CGROUP_INET6_BIND,
+		BPF_CGROUP_INET4_BIND,
+		AF_INET,
+		SOCK_STREAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		LOAD_REJECT,
+	},
+	{
+		"bind4: attach prog with wrong attach type",
+		bind4_prog_load,
+		BPF_CGROUP_INET4_BIND,
+		BPF_CGROUP_INET6_BIND,
+		AF_INET,
+		SOCK_STREAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		ATTACH_REJECT,
+	},
+	{
+		"bind4: rewrite IP & TCP port in",
+		bind4_prog_load,
+		BPF_CGROUP_INET4_BIND,
+		BPF_CGROUP_INET4_BIND,
+		AF_INET,
+		SOCK_STREAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		"bind4: rewrite IP & UDP port in",
+		bind4_prog_load,
+		BPF_CGROUP_INET4_BIND,
+		BPF_CGROUP_INET4_BIND,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		"bind6: load prog with wrong expected attach type",
+		bind6_prog_load,
+		BPF_CGROUP_INET4_BIND,
+		BPF_CGROUP_INET6_BIND,
+		AF_INET6,
+		SOCK_STREAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		LOAD_REJECT,
+	},
+	{
+		"bind6: attach prog with wrong attach type",
+		bind6_prog_load,
+		BPF_CGROUP_INET6_BIND,
+		BPF_CGROUP_INET4_BIND,
+		AF_INET,
+		SOCK_STREAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		ATTACH_REJECT,
+	},
+	{
+		"bind6: rewrite IP & TCP port in",
+		bind6_prog_load,
+		BPF_CGROUP_INET6_BIND,
+		BPF_CGROUP_INET6_BIND,
+		AF_INET6,
+		SOCK_STREAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		NULL,
+		SUCCESS,
+	},
+	{
+		"bind6: rewrite IP & UDP port in",
+		bind6_prog_load,
+		BPF_CGROUP_INET6_BIND,
+		BPF_CGROUP_INET6_BIND,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		NULL,
+		SUCCESS,
+	},
+
+	/* connect */
+	{
+		"connect4: load prog with wrong expected attach type",
+		connect4_prog_load,
+		BPF_CGROUP_INET6_CONNECT,
+		BPF_CGROUP_INET4_CONNECT,
+		AF_INET,
+		SOCK_STREAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		LOAD_REJECT,
+	},
+	{
+		"connect4: attach prog with wrong attach type",
+		connect4_prog_load,
+		BPF_CGROUP_INET4_CONNECT,
+		BPF_CGROUP_INET6_CONNECT,
+		AF_INET,
+		SOCK_STREAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		ATTACH_REJECT,
+	},
+	{
+		"connect4: rewrite IP & TCP port",
+		connect4_prog_load,
+		BPF_CGROUP_INET4_CONNECT,
+		BPF_CGROUP_INET4_CONNECT,
+		AF_INET,
+		SOCK_STREAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SRC4_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		"connect4: rewrite IP & UDP port",
+		connect4_prog_load,
+		BPF_CGROUP_INET4_CONNECT,
+		BPF_CGROUP_INET4_CONNECT,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SRC4_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		"connect6: load prog with wrong expected attach type",
+		connect6_prog_load,
+		BPF_CGROUP_INET4_CONNECT,
+		BPF_CGROUP_INET6_CONNECT,
+		AF_INET6,
+		SOCK_STREAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		LOAD_REJECT,
+	},
+	{
+		"connect6: attach prog with wrong attach type",
+		connect6_prog_load,
+		BPF_CGROUP_INET6_CONNECT,
+		BPF_CGROUP_INET4_CONNECT,
+		AF_INET,
+		SOCK_STREAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		ATTACH_REJECT,
+	},
+	{
+		"connect6: rewrite IP & TCP port",
+		connect6_prog_load,
+		BPF_CGROUP_INET6_CONNECT,
+		BPF_CGROUP_INET6_CONNECT,
+		AF_INET6,
+		SOCK_STREAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		"connect6: rewrite IP & UDP port",
+		connect6_prog_load,
+		BPF_CGROUP_INET6_CONNECT,
+		BPF_CGROUP_INET6_CONNECT,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SUCCESS,
+	},
+};
 
 static int mk_sockaddr(int domain, const char *ip, unsigned short port,
 		       struct sockaddr *addr, socklen_t addr_len)
@@ -84,25 +342,23 @@ static int mk_sockaddr(int domain, const char *ip, unsigned short port,
 	return 0;
 }
 
-static int load_insns(enum bpf_attach_type attach_type,
-		      const struct bpf_insn *insns, size_t insns_cnt,
-		      const char *comment)
+static int load_insns(const struct sock_addr_test *test,
+		      const struct bpf_insn *insns, size_t insns_cnt)
 {
 	struct bpf_load_program_attr load_attr;
 	int ret;
 
 	memset(&load_attr, 0, sizeof(struct bpf_load_program_attr));
 	load_attr.prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
-	load_attr.expected_attach_type = attach_type;
+	load_attr.expected_attach_type = test->expected_attach_type;
 	load_attr.insns = insns;
 	load_attr.insns_cnt = insns_cnt;
 	load_attr.license = "GPL";
 
 	ret = bpf_load_program_xattr(&load_attr, bpf_log_buf, BPF_LOG_BUF_SIZE);
-	if (ret < 0 && comment) {
-		log_err(">>> Loading %s program error.\n"
-			">>> Output from verifier:\n%s\n-------\n",
-			comment, bpf_log_buf);
+	if (ret < 0 && test->expected_result != LOAD_REJECT) {
+		log_err(">>> Loading program error.\n"
+			">>> Verifier output:\n%s\n-------\n", bpf_log_buf);
 	}
 
 	return ret;
@@ -119,8 +375,7 @@ static int load_insns(enum bpf_attach_type attach_type,
  * to count jumps properly.
  */
 
-static int bind4_prog_load(enum bpf_attach_type attach_type,
-			   const char *comment)
+static int bind4_prog_load(const struct sock_addr_test *test)
 {
 	union {
 		uint8_t u4_addr8[4];
@@ -186,12 +441,10 @@ static int bind4_prog_load(enum bpf_attach_type attach_type,
 		BPF_EXIT_INSN(),
 	};
 
-	return load_insns(attach_type, insns,
-			  sizeof(insns) / sizeof(struct bpf_insn), comment);
+	return load_insns(test, insns, sizeof(insns) / sizeof(struct bpf_insn));
 }
 
-static int bind6_prog_load(enum bpf_attach_type attach_type,
-			   const char *comment)
+static int bind6_prog_load(const struct sock_addr_test *test)
 {
 	struct sockaddr_in6 addr6_rw;
 	struct in6_addr ip6;
@@ -254,13 +507,10 @@ static int bind6_prog_load(enum bpf_attach_type attach_type,
 		BPF_EXIT_INSN(),
 	};
 
-	return load_insns(attach_type, insns,
-			  sizeof(insns) / sizeof(struct bpf_insn), comment);
+	return load_insns(test, insns, sizeof(insns) / sizeof(struct bpf_insn));
 }
 
-static int connect_prog_load_path(const char *path,
-				  enum bpf_attach_type attach_type,
-				  const char *comment)
+static int load_path(const struct sock_addr_test *test, const char *path)
 {
 	struct bpf_prog_load_attr attr;
 	struct bpf_object *obj;
@@ -269,75 +519,83 @@ static int connect_prog_load_path(const char *path,
 	memset(&attr, 0, sizeof(struct bpf_prog_load_attr));
 	attr.file = path;
 	attr.prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
-	attr.expected_attach_type = attach_type;
+	attr.expected_attach_type = test->expected_attach_type;
 
 	if (bpf_prog_load_xattr(&attr, &obj, &prog_fd)) {
-		if (comment)
-			log_err(">>> Loading %s program at %s error.\n",
-				comment, path);
+		if (test->expected_result != LOAD_REJECT)
+			log_err(">>> Loading program (%s) error.\n", path);
 		return -1;
 	}
 
 	return prog_fd;
 }
 
-static int connect4_prog_load(enum bpf_attach_type attach_type,
-			      const char *comment)
+static int connect4_prog_load(const struct sock_addr_test *test)
 {
-	return connect_prog_load_path(CONNECT4_PROG_PATH, attach_type, comment);
+	return load_path(test, CONNECT4_PROG_PATH);
 }
 
-static int connect6_prog_load(enum bpf_attach_type attach_type,
-			      const char *comment)
+static int connect6_prog_load(const struct sock_addr_test *test)
 {
-	return connect_prog_load_path(CONNECT6_PROG_PATH, attach_type, comment);
+	return load_path(test, CONNECT6_PROG_PATH);
 }
 
-static void print_ip_port(int sockfd, info_fn fn, const char *fmt)
+static int cmp_addr(const struct sockaddr_storage *addr1,
+		    const struct sockaddr_storage *addr2, int cmp_port)
 {
-	char addr_buf[INET_NTOP_BUF];
-	struct sockaddr_storage addr;
-	struct sockaddr_in6 *addr6;
-	struct sockaddr_in *addr4;
-	socklen_t addr_len;
-	unsigned short port;
-	void *nip;
-
-	addr_len = sizeof(struct sockaddr_storage);
-	memset(&addr, 0, addr_len);
-
-	if (fn(sockfd, (struct sockaddr *)&addr, (socklen_t *)&addr_len) == 0) {
-		if (addr.ss_family == AF_INET) {
-			addr4 = (struct sockaddr_in *)&addr;
-			nip = (void *)&addr4->sin_addr;
-			port = ntohs(addr4->sin_port);
-		} else if (addr.ss_family == AF_INET6) {
-			addr6 = (struct sockaddr_in6 *)&addr;
-			nip = (void *)&addr6->sin6_addr;
-			port = ntohs(addr6->sin6_port);
-		} else {
-			return;
-		}
-		const char *addr_str =
-			inet_ntop(addr.ss_family, nip, addr_buf, INET_NTOP_BUF);
-		printf(fmt, addr_str ? addr_str : "??", port);
+	const struct sockaddr_in *four1, *four2;
+	const struct sockaddr_in6 *six1, *six2;
+
+	if (addr1->ss_family != addr2->ss_family)
+		return -1;
+
+	if (addr1->ss_family == AF_INET) {
+		four1 = (const struct sockaddr_in *)addr1;
+		four2 = (const struct sockaddr_in *)addr2;
+		return !((four1->sin_port == four2->sin_port || !cmp_port) &&
+			 four1->sin_addr.s_addr == four2->sin_addr.s_addr);
+	} else if (addr1->ss_family == AF_INET6) {
+		six1 = (const struct sockaddr_in6 *)addr1;
+		six2 = (const struct sockaddr_in6 *)addr2;
+		return !((six1->sin6_port == six2->sin6_port || !cmp_port) &&
+			 !memcmp(&six1->sin6_addr, &six2->sin6_addr,
+				 sizeof(struct in6_addr)));
 	}
+
+	return -1;
+}
+
+static int cmp_sock_addr(info_fn fn, int sock1,
+			 const struct sockaddr_storage *addr2, int cmp_port)
+{
+	struct sockaddr_storage addr1;
+	socklen_t len1 = sizeof(addr1);
+
+	memset(&addr1, 0, len1);
+	if (fn(sock1, (struct sockaddr *)&addr1, (socklen_t *)&len1) != 0)
+		return -1;
+
+	return cmp_addr(&addr1, addr2, cmp_port);
+}
+
+static int cmp_local_ip(int sock1, const struct sockaddr_storage *addr2)
+{
+	return cmp_sock_addr(getsockname, sock1, addr2, /*cmp_port*/ 0);
 }
 
-static void print_local_ip_port(int sockfd, const char *fmt)
+static int cmp_local_addr(int sock1, const struct sockaddr_storage *addr2)
 {
-	print_ip_port(sockfd, getsockname, fmt);
+	return cmp_sock_addr(getsockname, sock1, addr2, /*cmp_port*/ 1);
 }
 
-static void print_remote_ip_port(int sockfd, const char *fmt)
+static int cmp_peer_addr(int sock1, const struct sockaddr_storage *addr2)
 {
-	print_ip_port(sockfd, getpeername, fmt);
+	return cmp_sock_addr(getpeername, sock1, addr2, /*cmp_port*/ 1);
 }
 
 static int start_server(int type, const struct sockaddr_storage *addr,
 			socklen_t addr_len)
 {
-
 	int fd;
 
 	fd = socket(addr->ss_family, type, 0);
@@ -358,8 +616,6 @@ static int start_server(int type, const struct sockaddr_storage *addr,
 		}
 	}
 
-	print_local_ip_port(fd, "\t   Actual: bind(%s, %d)\n");
-
 	goto out;
 close_out:
 	close(fd);
@@ -372,19 +628,19 @@ static int connect_to_server(int type, const struct sockaddr_storage *addr,
 			     socklen_t addr_len)
 {
 	int domain;
-	int fd;
+	int fd = -1;
 
 	domain = addr->ss_family;
 
 	if (domain != AF_INET && domain != AF_INET6) {
 		log_err("Unsupported address family");
-		return -1;
+		goto err;
 	}
 
 	fd = socket(domain, type, 0);
 	if (fd == -1) {
-		log_err("Failed to creating client socket");
-		return -1;
+		log_err("Failed to create client socket");
+		goto err;
 	}
 
 	if (connect(fd, (const struct sockaddr *)addr, addr_len) == -1) {
@@ -392,162 +648,188 @@ static int connect_to_server(int type, const struct sockaddr_storage *addr,
 		goto err;
 	}
 
-	print_remote_ip_port(fd, "\t   Actual: connect(%s, %d)");
-	print_local_ip_port(fd, " from (%s, %d)\n");
-
-	return 0;
+	goto out;
 err:
 	close(fd);
-	return -1;
+	fd = -1;
+out:
+	return fd;
 }
 
-static void print_test_case_num(int domain, int type)
+static int init_addrs(const struct sock_addr_test *test,
+		      struct sockaddr_storage *requested_addr,
+		      struct sockaddr_storage *expected_addr,
+		      struct sockaddr_storage *expected_src_addr)
 {
-	static int test_num;
-
-	printf("Test case #%d (%s/%s):\n", ++test_num,
-	       (domain == AF_INET ? "IPv4" :
-		domain == AF_INET6 ? "IPv6" :
-		"unknown_domain"),
-	       (type == SOCK_STREAM ? "TCP" :
-		type == SOCK_DGRAM ? "UDP" :
-		"unknown_type"));
+	socklen_t addr_len = sizeof(struct sockaddr_storage);
+
+	if (mk_sockaddr(test->domain, test->expected_ip, test->expected_port,
+			(struct sockaddr *)expected_addr, addr_len) == -1)
+		goto err;
+
+	if (mk_sockaddr(test->domain, test->requested_ip, test->requested_port,
+			(struct sockaddr *)requested_addr, addr_len) == -1)
+		goto err;
+
+	if (test->expected_src_ip &&
+	    mk_sockaddr(test->domain, test->expected_src_ip, 0,
+			(struct sockaddr *)expected_src_addr, addr_len) == -1)
+		goto err;
+
+	return 0;
+err:
+	return -1;
 }
 
-static int run_test_case(int domain, int type, const char *ip,
-			 unsigned short port)
+static int run_bind_test_case(const struct sock_addr_test *test)
 {
-	struct sockaddr_storage addr;
-	socklen_t addr_len = sizeof(addr);
+	socklen_t addr_len = sizeof(struct sockaddr_storage);
+	struct sockaddr_storage requested_addr;
+	struct sockaddr_storage expected_addr;
+	int clientfd = -1;
 	int servfd = -1;
 	int err = 0;
 
-	print_test_case_num(domain, type);
-
-	if (mk_sockaddr(domain, ip, port, (struct sockaddr *)&addr,
-			addr_len) == -1)
-		return -1;
+	if (init_addrs(test, &requested_addr, &expected_addr, NULL))
+		goto err;
 
-	printf("\tRequested: bind(%s, %d) ..\n", ip, port);
-	servfd = start_server(type, &addr, addr_len);
+	servfd = start_server(test->type, &requested_addr, addr_len);
 	if (servfd == -1)
 		goto err;
 
-	printf("\tRequested: connect(%s, %d) from (*, *) ..\n", ip, port);
-	if (connect_to_server(type, &addr, addr_len))
+	if (cmp_local_addr(servfd, &expected_addr))
+		goto err;
+
+	/* Try to connect to server just in case */
+	clientfd = connect_to_server(test->type, &expected_addr, addr_len);
+	if (clientfd == -1)
 		goto err;
 
 	goto out;
 err:
 	err = -1;
 out:
+	close(clientfd);
 	close(servfd);
 	return err;
 }
 
-static void close_progs_fds(struct program *progs, size_t prog_cnt)
+static int run_connect_test_case(const struct sock_addr_test *test)
 {
-	size_t i;
+	socklen_t addr_len = sizeof(struct sockaddr_storage);
+	struct sockaddr_storage expected_src_addr;
+	struct sockaddr_storage requested_addr;
+	struct sockaddr_storage expected_addr;
+	int clientfd = -1;
+	int servfd = -1;
+	int err = 0;
 
-	for (i = 0; i < prog_cnt; ++i) {
-		close(progs[i].fd);
-		progs[i].fd = -1;
-	}
-}
+	if (init_addrs(test, &requested_addr, &expected_addr,
+		       &expected_src_addr))
+		goto err;
 
-static int load_and_attach_progs(int cgfd, struct program *progs,
-				 size_t prog_cnt)
-{
-	size_t i;
-
-	for (i = 0; i < prog_cnt; ++i) {
-		printf("Load %s with invalid type (can pollute stderr) ",
-		       progs[i].name);
-		fflush(stdout);
-		progs[i].fd = progs[i].loadfn(progs[i].invalid_type, NULL);
-		if (progs[i].fd != -1) {
-			log_err("Load with invalid type accepted for %s",
-				progs[i].name);
-			goto err;
-		}
-		printf("... REJECTED\n");
+	/* Prepare server to connect to */
+	servfd = start_server(test->type, &expected_addr, addr_len);
+	if (servfd == -1)
+		goto err;
 
-		printf("Load %s with valid type", progs[i].name);
-		progs[i].fd = progs[i].loadfn(progs[i].type, progs[i].name);
-		if (progs[i].fd == -1) {
-			log_err("Failed to load program %s", progs[i].name);
-			goto err;
-		}
-		printf(" ... OK\n");
-
-		printf("Attach %s with invalid type", progs[i].name);
-		if (bpf_prog_attach(progs[i].fd, cgfd, progs[i].invalid_type,
-				    BPF_F_ALLOW_OVERRIDE) != -1) {
-			log_err("Attach with invalid type accepted for %s",
-				progs[i].name);
-			goto err;
-		}
-		printf(" ... REJECTED\n");
+	clientfd = connect_to_server(test->type, &requested_addr, addr_len);
+	if (clientfd == -1)
+		goto err;
 
-		printf("Attach %s with valid type", progs[i].name);
-		if (bpf_prog_attach(progs[i].fd, cgfd, progs[i].type,
-				    BPF_F_ALLOW_OVERRIDE) == -1) {
-			log_err("Failed to attach program %s", progs[i].name);
-			goto err;
-		}
-		printf(" ... OK\n");
-	}
+	/* Make sure src and dst addrs were overridden properly */
+	if (cmp_peer_addr(clientfd, &expected_addr))
+		goto err;
 
-	return 0;
+	if (cmp_local_ip(clientfd, &expected_src_addr))
+		goto err;
+
+	goto out;
 err:
-	close_progs_fds(progs, prog_cnt);
-	return -1;
+	err = -1;
+out:
+	close(clientfd);
+	close(servfd);
+	return err;
 }
 
-static int run_domain_test(int domain, int cgfd, struct program *progs,
-			   size_t prog_cnt, const char *ip, unsigned short port)
+static int run_test_case(int cgfd, const struct sock_addr_test *test)
 {
+	int progfd = -1;
 	int err = 0;
 
-	if (load_and_attach_progs(cgfd, progs, prog_cnt) == -1)
+	printf("Test case: %s .. ", test->descr);
+
+	progfd = test->loadfn(test);
+	if (test->expected_result == LOAD_REJECT && progfd < 0)
+		goto out;
+	else if (test->expected_result == LOAD_REJECT || progfd < 0)
+		goto err;
+
+	err = bpf_prog_attach(progfd, cgfd, test->attach_type,
+			      BPF_F_ALLOW_OVERRIDE);
+	if (test->expected_result == ATTACH_REJECT && err) {
+		err = 0; /* error was expected, reset it */
+		goto out;
+	} else if (test->expected_result == ATTACH_REJECT || err) {
 		goto err;
+	}
 
-	if (run_test_case(domain, SOCK_STREAM, ip, port) == -1)
+	switch (test->attach_type) {
+	case BPF_CGROUP_INET4_BIND:
+	case BPF_CGROUP_INET6_BIND:
+		err = run_bind_test_case(test);
+		break;
+	case BPF_CGROUP_INET4_CONNECT:
+	case BPF_CGROUP_INET6_CONNECT:
+		err = run_connect_test_case(test);
+		break;
+	default:
 		goto err;
+	}
 
-	if (run_test_case(domain, SOCK_DGRAM, ip, port) == -1)
+	if (err || test->expected_result != SUCCESS)
 		goto err;
 
 	goto out;
 err:
 	err = -1;
 out:
-	close_progs_fds(progs, prog_cnt);
+	/* Detaching w/o checking return code: best effort attempt. */
+	if (progfd != -1)
+		bpf_prog_detach(cgfd, test->attach_type);
+	close(progfd);
+	printf("[%s]\n", err ? "FAIL" : "PASS");
 	return err;
 }
 
-static int run_test(void)
+static int run_tests(int cgfd)
+{
+	int passes = 0;
+	int fails = 0;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(tests); ++i) {
+		if (run_test_case(cgfd, &tests[i]))
+			++fails;
+		else
+			++passes;
+	}
+	printf("Summary: %d PASSED, %d FAILED\n", passes, fails);
+	return fails ? -1 : 0;
+}
+
+int main(int argc, char **argv)
 {
-	size_t inet6_prog_cnt;
-	size_t inet_prog_cnt;
 	int cgfd = -1;
 	int err = 0;
 
-	struct program inet6_progs[] = {
-		{BPF_CGROUP_INET6_BIND, bind6_prog_load, -1, "bind6",
-		 BPF_CGROUP_INET4_BIND},
-		{BPF_CGROUP_INET6_CONNECT, connect6_prog_load, -1, "connect6",
-		 BPF_CGROUP_INET4_CONNECT},
-	};
-	inet6_prog_cnt = sizeof(inet6_progs) / sizeof(struct program);
-
-	struct program inet_progs[] = {
-		{BPF_CGROUP_INET4_BIND, bind4_prog_load, -1, "bind4",
-		 BPF_CGROUP_INET6_BIND},
-		{BPF_CGROUP_INET4_CONNECT, connect4_prog_load, -1, "connect4",
-		 BPF_CGROUP_INET6_CONNECT},
-	};
-	inet_prog_cnt = sizeof(inet_progs) / sizeof(struct program);
+	if (argc < 2) {
+		fprintf(stderr,
+			"%s has to be run via %s.sh. Skip direct run.\n",
+			argv[0], argv[0]);
+		exit(err);
+	}
 
 	if (setup_cgroup_environment())
 		goto err;
@@ -559,12 +841,7 @@ static int run_test(void)
 	if (join_cgroup(CG_PATH))
 		goto err;
 
-	if (run_domain_test(AF_INET, cgfd, inet_progs, inet_prog_cnt, SERV4_IP,
-			    SERV4_PORT) == -1)
-		goto err;
-
-	if (run_domain_test(AF_INET6, cgfd, inet6_progs, inet6_prog_cnt,
-			    SERV6_IP, SERV6_PORT) == -1)
+	if (run_tests(cgfd))
 		goto err;
 
 	goto out;
@@ -573,17 +850,5 @@ err:
 out:
 	close(cgfd);
 	cleanup_cgroup_environment();
-	printf(err ? "### FAIL\n" : "### SUCCESS\n");
 	return err;
 }
-
-int main(int argc, char **argv)
-{
-	if (argc < 2) {
-		fprintf(stderr,
-			"%s has to be run via %s.sh. Skip direct run.\n",
-			argv[0], argv[0]);
-		exit(0);
-	}
-	return run_test();
-}
-- 
cgit v1.2.3-59-g8ed1b


From 04b6ab731209eac1e130fa00281a29278eca2f57 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Fri, 25 May 2018 08:55:27 -0700
Subject: selftests/bpf: Selftest for sys_sendmsg hooks

Add selftest for BPF_CGROUP_UDP4_SENDMSG and BPF_CGROUP_UDP6_SENDMSG
attach types.

Try to sendmsg(2) to specific IP:port and test that:
* source IP is overridden as expected.
* remote IP:port pair is overridden as expected;

Both UDPv4 and UDPv6 are tested.

Output:
  # test_sock_addr.sh 2>/dev/null
  Wait for testing IPv4/IPv6 to become available ... OK
  ... pre-existing test-cases skipped ...
  Test case: sendmsg4: load prog with wrong expected attach type .. [PASS]
  Test case: sendmsg4: attach prog with wrong attach type .. [PASS]
  Test case: sendmsg4: rewrite IP & port (asm) .. [PASS]
  Test case: sendmsg4: rewrite IP & port (C) .. [PASS]
  Test case: sendmsg4: deny call .. [PASS]
  Test case: sendmsg6: load prog with wrong expected attach type .. [PASS]
  Test case: sendmsg6: attach prog with wrong attach type .. [PASS]
  Test case: sendmsg6: rewrite IP & port (asm) .. [PASS]
  Test case: sendmsg6: rewrite IP & port (C) .. [PASS]
  Test case: sendmsg6: IPv4-mapped IPv6 .. [PASS]
  Test case: sendmsg6: deny call .. [PASS]
  Summary: 27 PASSED, 0 FAILED

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/Makefile         |   2 +-
 tools/testing/selftests/bpf/sendmsg4_prog.c  |  49 +++
 tools/testing/selftests/bpf/sendmsg6_prog.c  |  60 ++++
 tools/testing/selftests/bpf/test_sock_addr.c | 518 +++++++++++++++++++++++++++
 4 files changed, 628 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/sendmsg4_prog.c
 create mode 100644 tools/testing/selftests/bpf/sendmsg6_prog.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 85044448bbc7..a1b66da965d0 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -34,7 +34,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \
 	test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \
 	test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \
-	test_lwt_seg6local.o
+	test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
diff --git a/tools/testing/selftests/bpf/sendmsg4_prog.c b/tools/testing/selftests/bpf/sendmsg4_prog.c
new file mode 100644
index 000000000000..a91536b1c47e
--- /dev/null
+++ b/tools/testing/selftests/bpf/sendmsg4_prog.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <linux/stddef.h>
+#include <linux/bpf.h>
+#include <sys/socket.h>
+
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define SRC1_IP4		0xAC100001U /* 172.16.0.1 */
+#define SRC2_IP4		0x00000000U
+#define SRC_REWRITE_IP4		0x7f000004U
+#define DST_IP4			0xC0A801FEU /* 192.168.1.254 */
+#define DST_REWRITE_IP4		0x7f000001U
+#define DST_PORT		4040
+#define DST_REWRITE_PORT4	4444
+
+int _version SEC("version") = 1;
+
+SEC("cgroup/sendmsg4")
+int sendmsg_v4_prog(struct bpf_sock_addr *ctx)
+{
+	if (ctx->type != SOCK_DGRAM)
+		return 0;
+
+	/* Rewrite source. */
+	if (ctx->msg_src_ip4 == bpf_htonl(SRC1_IP4) ||
+	    ctx->msg_src_ip4 == bpf_htonl(SRC2_IP4)) {
+		ctx->msg_src_ip4 = bpf_htonl(SRC_REWRITE_IP4);
+	} else {
+		/* Unexpected source. Reject sendmsg. */
+		return 0;
+	}
+
+	/* Rewrite destination. */
+	if ((ctx->user_ip4 >> 24) == (bpf_htonl(DST_IP4) >> 24) &&
+	     ctx->user_port == bpf_htons(DST_PORT)) {
+		ctx->user_ip4 = bpf_htonl(DST_REWRITE_IP4);
+		ctx->user_port = bpf_htons(DST_REWRITE_PORT4);
+	} else {
+		/* Unexpected source. Reject sendmsg. */
+		return 0;
+	}
+
+	return 1;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/sendmsg6_prog.c b/tools/testing/selftests/bpf/sendmsg6_prog.c
new file mode 100644
index 000000000000..5aeaa284fc47
--- /dev/null
+++ b/tools/testing/selftests/bpf/sendmsg6_prog.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <linux/stddef.h>
+#include <linux/bpf.h>
+#include <sys/socket.h>
+
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define SRC_REWRITE_IP6_0	0
+#define SRC_REWRITE_IP6_1	0
+#define SRC_REWRITE_IP6_2	0
+#define SRC_REWRITE_IP6_3	6
+
+#define DST_REWRITE_IP6_0	0
+#define DST_REWRITE_IP6_1	0
+#define DST_REWRITE_IP6_2	0
+#define DST_REWRITE_IP6_3	1
+
+#define DST_REWRITE_PORT6	6666
+
+int _version SEC("version") = 1;
+
+SEC("cgroup/sendmsg6")
+int sendmsg_v6_prog(struct bpf_sock_addr *ctx)
+{
+	if (ctx->type != SOCK_DGRAM)
+		return 0;
+
+	/* Rewrite source. */
+	if (ctx->msg_src_ip6[3] == bpf_htonl(1) ||
+	    ctx->msg_src_ip6[3] == bpf_htonl(0)) {
+		ctx->msg_src_ip6[0] = bpf_htonl(SRC_REWRITE_IP6_0);
+		ctx->msg_src_ip6[1] = bpf_htonl(SRC_REWRITE_IP6_1);
+		ctx->msg_src_ip6[2] = bpf_htonl(SRC_REWRITE_IP6_2);
+		ctx->msg_src_ip6[3] = bpf_htonl(SRC_REWRITE_IP6_3);
+	} else {
+		/* Unexpected source. Reject sendmsg. */
+		return 0;
+	}
+
+	/* Rewrite destination. */
+	if ((ctx->user_ip6[0] & 0xFFFF) == bpf_htons(0xFACE) &&
+	     ctx->user_ip6[0] >> 16 == bpf_htons(0xB00C)) {
+		ctx->user_ip6[0] = bpf_htonl(DST_REWRITE_IP6_0);
+		ctx->user_ip6[1] = bpf_htonl(DST_REWRITE_IP6_1);
+		ctx->user_ip6[2] = bpf_htonl(DST_REWRITE_IP6_2);
+		ctx->user_ip6[3] = bpf_htonl(DST_REWRITE_IP6_3);
+
+		ctx->user_port = bpf_htons(DST_REWRITE_PORT6);
+	} else {
+		/* Unexpected destination. Reject sendmsg. */
+		return 0;
+	}
+
+	return 1;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c
index ed3e397a56f6..a5e76b9219b9 100644
--- a/tools/testing/selftests/bpf/test_sock_addr.c
+++ b/tools/testing/selftests/bpf/test_sock_addr.c
@@ -1,12 +1,16 @@
 // SPDX-License-Identifier: GPL-2.0
 // Copyright (c) 2018 Facebook
 
+#define _GNU_SOURCE
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
 
 #include <arpa/inet.h>
+#include <netinet/in.h>
 #include <sys/types.h>
+#include <sys/select.h>
 #include <sys/socket.h>
 
 #include <linux/filter.h>
@@ -17,6 +21,10 @@
 #include "cgroup_helpers.h"
 #include "bpf_rlimit.h"
 
+#ifndef ENOTSUPP
+# define ENOTSUPP 524
+#endif
+
 #ifndef ARRAY_SIZE
 # define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 #endif
@@ -24,15 +32,20 @@
 #define CG_PATH	"/foo"
 #define CONNECT4_PROG_PATH	"./connect4_prog.o"
 #define CONNECT6_PROG_PATH	"./connect6_prog.o"
+#define SENDMSG4_PROG_PATH	"./sendmsg4_prog.o"
+#define SENDMSG6_PROG_PATH	"./sendmsg6_prog.o"
 
 #define SERV4_IP		"192.168.1.254"
 #define SERV4_REWRITE_IP	"127.0.0.1"
+#define SRC4_IP			"172.16.0.1"
 #define SRC4_REWRITE_IP		"127.0.0.4"
 #define SERV4_PORT		4040
 #define SERV4_REWRITE_PORT	4444
 
 #define SERV6_IP		"face:b00c:1234:5678::abcd"
 #define SERV6_REWRITE_IP	"::1"
+#define SERV6_V4MAPPED_IP	"::ffff:192.168.0.4"
+#define SRC6_IP			"::1"
 #define SRC6_REWRITE_IP		"::6"
 #define SERV6_PORT		6060
 #define SERV6_REWRITE_PORT	6666
@@ -65,6 +78,8 @@ struct sock_addr_test {
 	enum {
 		LOAD_REJECT,
 		ATTACH_REJECT,
+		SYSCALL_EPERM,
+		SYSCALL_ENOTSUPP,
 		SUCCESS,
 	} expected_result;
 };
@@ -73,6 +88,12 @@ static int bind4_prog_load(const struct sock_addr_test *test);
 static int bind6_prog_load(const struct sock_addr_test *test);
 static int connect4_prog_load(const struct sock_addr_test *test);
 static int connect6_prog_load(const struct sock_addr_test *test);
+static int sendmsg_deny_prog_load(const struct sock_addr_test *test);
+static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test);
+static int sendmsg4_rw_c_prog_load(const struct sock_addr_test *test);
+static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test);
+static int sendmsg6_rw_c_prog_load(const struct sock_addr_test *test);
+static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test);
 
 static struct sock_addr_test tests[] = {
 	/* bind */
@@ -302,6 +323,162 @@ static struct sock_addr_test tests[] = {
 		SRC6_REWRITE_IP,
 		SUCCESS,
 	},
+
+	/* sendmsg */
+	{
+		"sendmsg4: load prog with wrong expected attach type",
+		sendmsg4_rw_asm_prog_load,
+		BPF_CGROUP_UDP6_SENDMSG,
+		BPF_CGROUP_UDP4_SENDMSG,
+		AF_INET,
+		SOCK_DGRAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		LOAD_REJECT,
+	},
+	{
+		"sendmsg4: attach prog with wrong attach type",
+		sendmsg4_rw_asm_prog_load,
+		BPF_CGROUP_UDP4_SENDMSG,
+		BPF_CGROUP_UDP6_SENDMSG,
+		AF_INET,
+		SOCK_DGRAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		ATTACH_REJECT,
+	},
+	{
+		"sendmsg4: rewrite IP & port (asm)",
+		sendmsg4_rw_asm_prog_load,
+		BPF_CGROUP_UDP4_SENDMSG,
+		BPF_CGROUP_UDP4_SENDMSG,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SRC4_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		"sendmsg4: rewrite IP & port (C)",
+		sendmsg4_rw_c_prog_load,
+		BPF_CGROUP_UDP4_SENDMSG,
+		BPF_CGROUP_UDP4_SENDMSG,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SRC4_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		"sendmsg4: deny call",
+		sendmsg_deny_prog_load,
+		BPF_CGROUP_UDP4_SENDMSG,
+		BPF_CGROUP_UDP4_SENDMSG,
+		AF_INET,
+		SOCK_DGRAM,
+		SERV4_IP,
+		SERV4_PORT,
+		SERV4_REWRITE_IP,
+		SERV4_REWRITE_PORT,
+		SRC4_REWRITE_IP,
+		SYSCALL_EPERM,
+	},
+	{
+		"sendmsg6: load prog with wrong expected attach type",
+		sendmsg6_rw_asm_prog_load,
+		BPF_CGROUP_UDP4_SENDMSG,
+		BPF_CGROUP_UDP6_SENDMSG,
+		AF_INET6,
+		SOCK_DGRAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		LOAD_REJECT,
+	},
+	{
+		"sendmsg6: attach prog with wrong attach type",
+		sendmsg6_rw_asm_prog_load,
+		BPF_CGROUP_UDP6_SENDMSG,
+		BPF_CGROUP_UDP4_SENDMSG,
+		AF_INET6,
+		SOCK_DGRAM,
+		NULL,
+		0,
+		NULL,
+		0,
+		NULL,
+		ATTACH_REJECT,
+	},
+	{
+		"sendmsg6: rewrite IP & port (asm)",
+		sendmsg6_rw_asm_prog_load,
+		BPF_CGROUP_UDP6_SENDMSG,
+		BPF_CGROUP_UDP6_SENDMSG,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		"sendmsg6: rewrite IP & port (C)",
+		sendmsg6_rw_c_prog_load,
+		BPF_CGROUP_UDP6_SENDMSG,
+		BPF_CGROUP_UDP6_SENDMSG,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SUCCESS,
+	},
+	{
+		"sendmsg6: IPv4-mapped IPv6",
+		sendmsg6_rw_v4mapped_prog_load,
+		BPF_CGROUP_UDP6_SENDMSG,
+		BPF_CGROUP_UDP6_SENDMSG,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SYSCALL_ENOTSUPP,
+	},
+	{
+		"sendmsg6: deny call",
+		sendmsg_deny_prog_load,
+		BPF_CGROUP_UDP6_SENDMSG,
+		BPF_CGROUP_UDP6_SENDMSG,
+		AF_INET6,
+		SOCK_DGRAM,
+		SERV6_IP,
+		SERV6_PORT,
+		SERV6_REWRITE_IP,
+		SERV6_REWRITE_PORT,
+		SRC6_REWRITE_IP,
+		SYSCALL_EPERM,
+	},
 };
 
 static int mk_sockaddr(int domain, const char *ip, unsigned short port,
@@ -540,6 +717,141 @@ static int connect6_prog_load(const struct sock_addr_test *test)
 	return load_path(test, CONNECT6_PROG_PATH);
 }
 
+static int sendmsg_deny_prog_load(const struct sock_addr_test *test)
+{
+	struct bpf_insn insns[] = {
+		/* return 0 */
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	};
+	return load_insns(test, insns, sizeof(insns) / sizeof(struct bpf_insn));
+}
+
+static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test)
+{
+	struct sockaddr_in dst4_rw_addr;
+	struct in_addr src4_rw_ip;
+
+	if (inet_pton(AF_INET, SRC4_REWRITE_IP, (void *)&src4_rw_ip) != 1) {
+		log_err("Invalid IPv4: %s", SRC4_REWRITE_IP);
+		return -1;
+	}
+
+	if (mk_sockaddr(AF_INET, SERV4_REWRITE_IP, SERV4_REWRITE_PORT,
+			(struct sockaddr *)&dst4_rw_addr,
+			sizeof(dst4_rw_addr)) == -1)
+		return -1;
+
+	struct bpf_insn insns[] = {
+		BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+		/* if (sk.family == AF_INET && */
+		BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+			    offsetof(struct bpf_sock_addr, family)),
+		BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET, 8),
+
+		/*     sk.type == SOCK_DGRAM)  { */
+		BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+			    offsetof(struct bpf_sock_addr, type)),
+		BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_DGRAM, 6),
+
+		/*      msg_src_ip4 = src4_rw_ip */
+		BPF_MOV32_IMM(BPF_REG_7, src4_rw_ip.s_addr),
+		BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7,
+			    offsetof(struct bpf_sock_addr, msg_src_ip4)),
+
+		/*      user_ip4 = dst4_rw_addr.sin_addr */
+		BPF_MOV32_IMM(BPF_REG_7, dst4_rw_addr.sin_addr.s_addr),
+		BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7,
+			    offsetof(struct bpf_sock_addr, user_ip4)),
+
+		/*      user_port = dst4_rw_addr.sin_port */
+		BPF_MOV32_IMM(BPF_REG_7, dst4_rw_addr.sin_port),
+		BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7,
+			    offsetof(struct bpf_sock_addr, user_port)),
+		/* } */
+
+		/* return 1 */
+		BPF_MOV64_IMM(BPF_REG_0, 1),
+		BPF_EXIT_INSN(),
+	};
+
+	return load_insns(test, insns, sizeof(insns) / sizeof(struct bpf_insn));
+}
+
+static int sendmsg4_rw_c_prog_load(const struct sock_addr_test *test)
+{
+	return load_path(test, SENDMSG4_PROG_PATH);
+}
+
+static int sendmsg6_rw_dst_asm_prog_load(const struct sock_addr_test *test,
+					 const char *rw_dst_ip)
+{
+	struct sockaddr_in6 dst6_rw_addr;
+	struct in6_addr src6_rw_ip;
+
+	if (inet_pton(AF_INET6, SRC6_REWRITE_IP, (void *)&src6_rw_ip) != 1) {
+		log_err("Invalid IPv6: %s", SRC6_REWRITE_IP);
+		return -1;
+	}
+
+	if (mk_sockaddr(AF_INET6, rw_dst_ip, SERV6_REWRITE_PORT,
+			(struct sockaddr *)&dst6_rw_addr,
+			sizeof(dst6_rw_addr)) == -1)
+		return -1;
+
+	struct bpf_insn insns[] = {
+		BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+		/* if (sk.family == AF_INET6) { */
+		BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+			    offsetof(struct bpf_sock_addr, family)),
+		BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET6, 18),
+
+#define STORE_IPV6_WORD_N(DST, SRC, N)					       \
+		BPF_MOV32_IMM(BPF_REG_7, SRC[N]),			       \
+		BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7,		       \
+			    offsetof(struct bpf_sock_addr, DST[N]))
+
+#define STORE_IPV6(DST, SRC)						       \
+		STORE_IPV6_WORD_N(DST, SRC, 0),				       \
+		STORE_IPV6_WORD_N(DST, SRC, 1),				       \
+		STORE_IPV6_WORD_N(DST, SRC, 2),				       \
+		STORE_IPV6_WORD_N(DST, SRC, 3)
+
+		STORE_IPV6(msg_src_ip6, src6_rw_ip.s6_addr32),
+		STORE_IPV6(user_ip6, dst6_rw_addr.sin6_addr.s6_addr32),
+
+		/*      user_port = dst6_rw_addr.sin6_port */
+		BPF_MOV32_IMM(BPF_REG_7, dst6_rw_addr.sin6_port),
+		BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7,
+			    offsetof(struct bpf_sock_addr, user_port)),
+
+		/* } */
+
+		/* return 1 */
+		BPF_MOV64_IMM(BPF_REG_0, 1),
+		BPF_EXIT_INSN(),
+	};
+
+	return load_insns(test, insns, sizeof(insns) / sizeof(struct bpf_insn));
+}
+
+static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test)
+{
+	return sendmsg6_rw_dst_asm_prog_load(test, SERV6_REWRITE_IP);
+}
+
+static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test)
+{
+	return sendmsg6_rw_dst_asm_prog_load(test, SERV6_V4MAPPED_IP);
+}
+
+static int sendmsg6_rw_c_prog_load(const struct sock_addr_test *test)
+{
+	return load_path(test, SENDMSG6_PROG_PATH);
+}
+
 static int cmp_addr(const struct sockaddr_storage *addr1,
 		    const struct sockaddr_storage *addr2, int cmp_port)
 {
@@ -656,6 +968,135 @@ out:
 	return fd;
 }
 
+int init_pktinfo(int domain, struct cmsghdr *cmsg)
+{
+	struct in6_pktinfo *pktinfo6;
+	struct in_pktinfo *pktinfo4;
+
+	if (domain == AF_INET) {
+		cmsg->cmsg_level = SOL_IP;
+		cmsg->cmsg_type = IP_PKTINFO;
+		cmsg->cmsg_len = CMSG_LEN(sizeof(struct in_pktinfo));
+		pktinfo4 = (struct in_pktinfo *)CMSG_DATA(cmsg);
+		memset(pktinfo4, 0, sizeof(struct in_pktinfo));
+		if (inet_pton(domain, SRC4_IP,
+			      (void *)&pktinfo4->ipi_spec_dst) != 1)
+			return -1;
+	} else if (domain == AF_INET6) {
+		cmsg->cmsg_level = SOL_IPV6;
+		cmsg->cmsg_type = IPV6_PKTINFO;
+		cmsg->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo));
+		pktinfo6 = (struct in6_pktinfo *)CMSG_DATA(cmsg);
+		memset(pktinfo6, 0, sizeof(struct in6_pktinfo));
+		if (inet_pton(domain, SRC6_IP,
+			      (void *)&pktinfo6->ipi6_addr) != 1)
+			return -1;
+	} else {
+		return -1;
+	}
+
+	return 0;
+}
+
+static int sendmsg_to_server(const struct sockaddr_storage *addr,
+			     socklen_t addr_len, int set_cmsg, int *syscall_err)
+{
+	union {
+		char buf[CMSG_SPACE(sizeof(struct in6_pktinfo))];
+		struct cmsghdr align;
+	} control6;
+	union {
+		char buf[CMSG_SPACE(sizeof(struct in_pktinfo))];
+		struct cmsghdr align;
+	} control4;
+	struct msghdr hdr;
+	struct iovec iov;
+	char data = 'a';
+	int domain;
+	int fd = -1;
+
+	domain = addr->ss_family;
+
+	if (domain != AF_INET && domain != AF_INET6) {
+		log_err("Unsupported address family");
+		goto err;
+	}
+
+	fd = socket(domain, SOCK_DGRAM, 0);
+	if (fd == -1) {
+		log_err("Failed to create client socket");
+		goto err;
+	}
+
+	memset(&iov, 0, sizeof(iov));
+	iov.iov_base = &data;
+	iov.iov_len = sizeof(data);
+
+	memset(&hdr, 0, sizeof(hdr));
+	hdr.msg_name = (void *)addr;
+	hdr.msg_namelen = addr_len;
+	hdr.msg_iov = &iov;
+	hdr.msg_iovlen = 1;
+
+	if (set_cmsg) {
+		if (domain == AF_INET) {
+			hdr.msg_control = &control4;
+			hdr.msg_controllen = sizeof(control4.buf);
+		} else if (domain == AF_INET6) {
+			hdr.msg_control = &control6;
+			hdr.msg_controllen = sizeof(control6.buf);
+		}
+		if (init_pktinfo(domain, CMSG_FIRSTHDR(&hdr))) {
+			log_err("Fail to init pktinfo");
+			goto err;
+		}
+	}
+
+	if (sendmsg(fd, &hdr, 0) != sizeof(data)) {
+		log_err("Fail to send message to server");
+		*syscall_err = errno;
+		goto err;
+	}
+
+	goto out;
+err:
+	close(fd);
+	fd = -1;
+out:
+	return fd;
+}
+
+static int recvmsg_from_client(int sockfd, struct sockaddr_storage *src_addr)
+{
+	struct timeval tv;
+	struct msghdr hdr;
+	struct iovec iov;
+	char data[64];
+	fd_set rfds;
+
+	FD_ZERO(&rfds);
+	FD_SET(sockfd, &rfds);
+
+	tv.tv_sec = 2;
+	tv.tv_usec = 0;
+
+	if (select(sockfd + 1, &rfds, NULL, NULL, &tv) <= 0 ||
+	    !FD_ISSET(sockfd, &rfds))
+		return -1;
+
+	memset(&iov, 0, sizeof(iov));
+	iov.iov_base = data;
+	iov.iov_len = sizeof(data);
+
+	memset(&hdr, 0, sizeof(hdr));
+	hdr.msg_name = src_addr;
+	hdr.msg_namelen = sizeof(struct sockaddr_storage);
+	hdr.msg_iov = &iov;
+	hdr.msg_iovlen = 1;
+
+	return recvmsg(sockfd, &hdr, 0);
+}
+
 static int init_addrs(const struct sock_addr_test *test,
 		      struct sockaddr_storage *requested_addr,
 		      struct sockaddr_storage *expected_addr,
@@ -753,6 +1194,69 @@ out:
 	return err;
 }
 
+static int run_sendmsg_test_case(const struct sock_addr_test *test)
+{
+	socklen_t addr_len = sizeof(struct sockaddr_storage);
+	struct sockaddr_storage expected_src_addr;
+	struct sockaddr_storage requested_addr;
+	struct sockaddr_storage expected_addr;
+	struct sockaddr_storage real_src_addr;
+	int clientfd = -1;
+	int servfd = -1;
+	int set_cmsg;
+	int err = 0;
+
+	if (test->type != SOCK_DGRAM)
+		goto err;
+
+	if (init_addrs(test, &requested_addr, &expected_addr,
+		       &expected_src_addr))
+		goto err;
+
+	/* Prepare server to sendmsg to */
+	servfd = start_server(test->type, &expected_addr, addr_len);
+	if (servfd == -1)
+		goto err;
+
+	for (set_cmsg = 0; set_cmsg <= 1; ++set_cmsg) {
+		if (clientfd >= 0)
+			close(clientfd);
+
+		clientfd = sendmsg_to_server(&requested_addr, addr_len,
+					     set_cmsg, &err);
+		if (err)
+			goto out;
+		else if (clientfd == -1)
+			goto err;
+
+		/* Try to receive message on server instead of using
+		 * getpeername(2) on client socket, to check that client's
+		 * destination address was rewritten properly, since
+		 * getpeername(2) doesn't work with unconnected datagram
+		 * sockets.
+		 *
+		 * Get source address from recvmsg(2) as well to make sure
+		 * source was rewritten properly: getsockname(2) can't be used
+		 * since socket is unconnected and source defined for one
+		 * specific packet may differ from the one used by default and
+		 * returned by getsockname(2).
+		 */
+		if (recvmsg_from_client(servfd, &real_src_addr) == -1)
+			goto err;
+
+		if (cmp_addr(&real_src_addr, &expected_src_addr, /*cmp_port*/0))
+			goto err;
+	}
+
+	goto out;
+err:
+	err = -1;
+out:
+	close(clientfd);
+	close(servfd);
+	return err;
+}
+
 static int run_test_case(int cgfd, const struct sock_addr_test *test)
 {
 	int progfd = -1;
@@ -784,10 +1288,24 @@ static int run_test_case(int cgfd, const struct sock_addr_test *test)
 	case BPF_CGROUP_INET6_CONNECT:
 		err = run_connect_test_case(test);
 		break;
+	case BPF_CGROUP_UDP4_SENDMSG:
+	case BPF_CGROUP_UDP6_SENDMSG:
+		err = run_sendmsg_test_case(test);
+		break;
 	default:
 		goto err;
 	}
 
+	if (test->expected_result == SYSCALL_EPERM && err == EPERM) {
+		err = 0; /* error was expected, reset it */
+		goto out;
+	}
+
+	if (test->expected_result == SYSCALL_ENOTSUPP && err == ENOTSUPP) {
+		err = 0; /* error was expected, reset it */
+		goto out;
+	}
+
 	if (err || test->expected_result != SUCCESS)
 		goto err;
 
-- 
cgit v1.2.3-59-g8ed1b


From e5a10bb2acf246c13dc164fd37d4c77d9acaf88f Mon Sep 17 00:00:00 2001
From: Máté Eckl <ecklm94@gmail.com>
Date: Wed, 23 May 2018 16:26:35 +0200
Subject: netfilter: add includes to nf_socket.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These have to be included always when nf_socket.h is included.

Signed-off-by: Máté Eckl <ecklm94@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_socket.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/include/net/netfilter/nf_socket.h b/include/net/netfilter/nf_socket.h
index 8230fefff9f5..29b6313f0557 100644
--- a/include/net/netfilter/nf_socket.h
+++ b/include/net/netfilter/nf_socket.h
@@ -2,10 +2,8 @@
 #ifndef _NF_SOCK_H_
 #define _NF_SOCK_H_
 
-struct net_device;
-struct sk_buff;
-struct sock;
-struct net;
+#include <net/sock.h>
+#include <net/inet_timewait_sock.h>
 
 static inline bool nf_sk_is_transparent(struct sock *sk)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 0168e8b36145a7db353055bdd2673096165c8a3a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 24 May 2018 13:17:28 +0200
Subject: netfilter: nat: merge ipv4/ipv6 masquerade code into main nat module

Instead of using extra modules for these, turn the config options into
an implicit dependency that adds masq feature to the protocol specific nf_nat module.

before:
   text    data     bss     dec     hex filename
   2001     860       4    2865     b31 net/ipv4/netfilter/nf_nat_masquerade_ipv4.ko
   5579     780       2    6361    18d9 net/ipv4/netfilter/nf_nat_ipv4.ko
   2860     836       8    3704     e78 net/ipv6/netfilter/nf_nat_masquerade_ipv6.ko
   6648     780       2    7430    1d06 net/ipv6/netfilter/nf_nat_ipv6.ko

after:
   text    data     bss     dec     hex filename
   7245     872       8    8125    1fbd net/ipv4/netfilter/nf_nat_ipv4.ko
   9165     848      12   10025    2729 net/ipv6/netfilter/nf_nat_ipv6.ko

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/Kconfig                  | 5 +----
 net/ipv4/netfilter/Makefile                 | 4 +---
 net/ipv4/netfilter/nf_nat_masquerade_ipv4.c | 4 ----
 net/ipv6/netfilter/Kconfig                  | 5 +----
 net/ipv6/netfilter/Makefile                 | 2 +-
 net/ipv6/netfilter/nf_nat_masquerade_ipv6.c | 4 ----
 6 files changed, 4 insertions(+), 20 deletions(-)

diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 280048e1e395..d03bc5a01a70 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -129,10 +129,7 @@ config NFT_CHAIN_NAT_IPV4
 	  source and destination ports.
 
 config NF_NAT_MASQUERADE_IPV4
-	tristate "IPv4 masquerade support"
-	help
-	  This is the kernel functionality to provide NAT in the masquerade
-	  flavour (automatic source address selection).
+	bool
 
 config NFT_MASQ_IPV4
 	tristate "IPv4 masquerading support for nf_tables"
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 0e5edd0c7926..c4b05b174091 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -10,6 +10,7 @@ nf_conntrack_ipv4-y	:=  nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
 obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
 
 nf_nat_ipv4-y		:= nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o
+nf_nat_ipv4-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
 obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
 
 # defrag
@@ -32,9 +33,6 @@ nf_nat_snmp_basic-y := nf_nat_snmp_basic.asn1.o nf_nat_snmp_basic_main.o
 $(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic.asn1.h
 obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
 
-obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
-
-
 # NAT protocols (nf_nat)
 obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
 
diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
index f538c5001547..ad3aeff152ed 100644
--- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
@@ -7,7 +7,6 @@
  */
 
 #include <linux/types.h>
-#include <linux/module.h>
 #include <linux/atomic.h>
 #include <linux/inetdevice.h>
 #include <linux/ip.h>
@@ -157,6 +156,3 @@ void nf_nat_masquerade_ipv4_unregister_notifier(void)
 	unregister_inetaddr_notifier(&masq_inet_notifier);
 }
 EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index ce77bcc2490c..9f5b00a39adf 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -136,10 +136,7 @@ config NF_NAT_IPV6
 if NF_NAT_IPV6
 
 config NF_NAT_MASQUERADE_IPV6
-	tristate "IPv6 masquerade support"
-	help
-	  This is the kernel functionality to provide NAT in the masquerade
-	  flavour (automatic source address selection) for IPv6.
+	bool
 
 endif # NF_NAT_IPV6
 
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 44273d6f03a5..71518f22ae39 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -18,8 +18,8 @@ nf_conntrack_ipv6-y  :=  nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o
 obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o
 
 nf_nat_ipv6-y		:= nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o
+nf_nat_ipv6-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o
 obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o
-obj-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o
 
 # defrag
 nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o
diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
index 9dfc2b90c362..e6eb7cf9b54f 100644
--- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
@@ -10,7 +10,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/module.h>
 #include <linux/atomic.h>
 #include <linux/netdevice.h>
 #include <linux/ipv6.h>
@@ -186,6 +185,3 @@ void nf_nat_masquerade_ipv6_unregister_notifier(void)
 	unregister_netdevice_notifier(&masq_dev_notifier);
 }
 EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_unregister_notifier);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-- 
cgit v1.2.3-59-g8ed1b


From 1ac89d20150e377b74d2ef23f56db0f08088426c Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 24 May 2018 13:17:29 +0200
Subject: netfilter: nat: merge nf_nat_redirect into nf_nat

Similar to previous patch, this time, merge redirect+nat.
The redirect module is just 2k in size, get rid of it and make
redirect part available from the nat core.

before:
   text    data     bss     dec     hex filename
  19461    1484    4138   25083    61fb net/netfilter/nf_nat.ko
   1236     792       0    2028     7ec net/netfilter/nf_nat_redirect.ko
after:
  20340    1508    4138   25986    6582 net/netfilter/nf_nat.ko

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/Kconfig           | 6 +-----
 net/netfilter/Makefile          | 2 +-
 net/netfilter/nf_nat_redirect.c | 4 ----
 3 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index a5b60e6a983e..3ec8886850b2 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -433,11 +433,7 @@ config NF_NAT_TFTP
 	default NF_NAT && NF_CONNTRACK_TFTP
 
 config NF_NAT_REDIRECT
-        tristate "IPv4/IPv6 redirect support"
-	depends on NF_NAT
-        help
-          This is the kernel functionality to redirect packets to local
-          machine through NAT.
+	bool
 
 config NETFILTER_SYNPROXY
 	tristate
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 1aa710b5d384..9b3434360d49 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -55,7 +55,7 @@ obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o
 obj-$(CONFIG_NF_LOG_NETDEV) += nf_log_netdev.o
 
 obj-$(CONFIG_NF_NAT) += nf_nat.o
-obj-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o
+nf_nat-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o
 
 # NAT helpers
 obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c
index 7c4bb0a773ca..adee04af8d43 100644
--- a/net/netfilter/nf_nat_redirect.c
+++ b/net/netfilter/nf_nat_redirect.c
@@ -15,7 +15,6 @@
 #include <linux/inetdevice.h>
 #include <linux/ip.h>
 #include <linux/kernel.h>
-#include <linux/module.h>
 #include <linux/netdevice.h>
 #include <linux/netfilter.h>
 #include <linux/types.h>
@@ -124,6 +123,3 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
 	return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
 }
 EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv6);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-- 
cgit v1.2.3-59-g8ed1b


From 003087911af28941a95fa053db0ac36b2ee27207 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 25 May 2018 00:25:47 +0200
Subject: netfilter: nfnetlink: allow commit to fail

->commit() cannot fail at the moment.

Followup-patch adds kmalloc calls in the commit phase, so we'll need
to be able to handle errors.

Make it so that -EGAIN causes a full replay, and make other errors
cause the transaction to fail.

Failing is ok from a consistency point of view as long as we
perform all actions that could return an error before
we increment the generation counter and the base seq.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 03ead8a9e90c..88c9e222b670 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -441,7 +441,14 @@ done:
 		kfree_skb(skb);
 		goto replay;
 	} else if (status == NFNL_BATCH_DONE) {
-		ss->commit(net, oskb);
+		err = ss->commit(net, oskb);
+		if (err == -EAGAIN) {
+			status |= NFNL_BATCH_REPLAY;
+			goto done;
+		} else if (err) {
+			ss->abort(net, oskb);
+			netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL);
+		}
 	} else {
 		ss->abort(net, oskb);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 1b40428c04b1f89aa71db0310638d26e6eaaac07 Mon Sep 17 00:00:00 2001
From: Sudarsana Reddy Kalluru <sudarsana.kalluru@cavium.com>
Date: Thu, 24 May 2018 04:57:51 -0700
Subject: bnx2x: Collect the device debug information during Tx timeout.

Tx-timeout mostly happens due to some issue in the device. In such cases,
debug dump would be helpful for identifying the cause of the issue.
This patch adds support to spill debug data during the Tx timeout. Here
bnx2x_panic_dump() API is used instead of bnx2x_panic(), since we still
want to allow the Tx-timeout recovery a chance to succeed.

Changes from previous version:
-------------------------------
v2: Fixed a coding error.

Please consider applying this to "net-next".

Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 95871576ab92..8cd73ff5debc 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -4962,8 +4962,13 @@ void bnx2x_tx_timeout(struct net_device *dev)
 {
 	struct bnx2x *bp = netdev_priv(dev);
 
-#ifdef BNX2X_STOP_ON_ERROR
+	/* We want the information of the dump logged,
+	 * but calling bnx2x_panic() would kill all chances of recovery.
+	 */
 	if (!bp->panic)
+#ifndef BNX2X_STOP_ON_ERROR
+		bnx2x_panic_dump(bp, false);
+#else
 		bnx2x_panic();
 #endif
 
-- 
cgit v1.2.3-59-g8ed1b


From 29555fa3de865630570b5f53c847b953413daf1a Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Thu, 24 May 2018 16:09:07 +0200
Subject: net: stmmac: Use mutex instead of spinlock

Some drivers, such as DWC EQOS on Tegra, need to perform operations that
can sleep under this lock (clk_set_rate() in tegra_eqos_fix_speed()) for
proper operation. Since there is no need for this lock to be a spinlock,
convert it to a mutex instead.

Fixes: e6ea2d16fc61 ("net: stmmac: dwc-qos: Add Tegra186 support")
Reported-by: Jon Hunter <jonathanh@nvidia.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
Tested-by: Bhadram Varka <vbhadram@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac.h       |  2 +-
 .../net/ethernet/stmicro/stmmac/stmmac_ethtool.c   | 12 ++++-----
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 31 ++++++++++------------
 3 files changed, 21 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index 4d425b1a0c59..fbfe5dcefa87 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -122,7 +122,7 @@ struct stmmac_priv {
 	struct net_device *dev;
 	struct device *device;
 	struct mac_device_info *hw;
-	spinlock_t lock;
+	struct mutex lock;
 
 	/* RX Queue */
 	struct stmmac_rx_queue rx_queue[MTL_MAX_RX_QUEUES];
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
index 6d82b3ef5c3b..5710864fa809 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
@@ -390,9 +390,9 @@ stmmac_ethtool_set_link_ksettings(struct net_device *dev,
 			ADVERTISED_10baseT_Half |
 			ADVERTISED_10baseT_Full);
 
-		spin_lock(&priv->lock);
+		mutex_lock(&priv->lock);
 		stmmac_pcs_ctrl_ane(priv, priv->ioaddr, 1, priv->hw->ps, 0);
-		spin_unlock(&priv->lock);
+		mutex_unlock(&priv->lock);
 
 		return 0;
 	}
@@ -632,12 +632,12 @@ static void stmmac_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
 {
 	struct stmmac_priv *priv = netdev_priv(dev);
 
-	spin_lock_irq(&priv->lock);
+	mutex_lock(&priv->lock);
 	if (device_can_wakeup(priv->device)) {
 		wol->supported = WAKE_MAGIC | WAKE_UCAST;
 		wol->wolopts = priv->wolopts;
 	}
-	spin_unlock_irq(&priv->lock);
+	mutex_unlock(&priv->lock);
 }
 
 static int stmmac_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
@@ -666,9 +666,9 @@ static int stmmac_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
 		disable_irq_wake(priv->wol_irq);
 	}
 
-	spin_lock_irq(&priv->lock);
+	mutex_lock(&priv->lock);
 	priv->wolopts = wol->wolopts;
-	spin_unlock_irq(&priv->lock);
+	mutex_unlock(&priv->lock);
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index c32de53a00d3..77af85c981db 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -381,7 +381,6 @@ bool stmmac_eee_init(struct stmmac_priv *priv)
 {
 	struct net_device *ndev = priv->dev;
 	int interface = priv->plat->interface;
-	unsigned long flags;
 	bool ret = false;
 
 	if ((interface != PHY_INTERFACE_MODE_MII) &&
@@ -408,7 +407,7 @@ bool stmmac_eee_init(struct stmmac_priv *priv)
 			 * changed).
 			 * In that case the driver disable own timers.
 			 */
-			spin_lock_irqsave(&priv->lock, flags);
+			mutex_lock(&priv->lock);
 			if (priv->eee_active) {
 				netdev_dbg(priv->dev, "disable EEE\n");
 				del_timer_sync(&priv->eee_ctrl_timer);
@@ -416,11 +415,11 @@ bool stmmac_eee_init(struct stmmac_priv *priv)
 						tx_lpi_timer);
 			}
 			priv->eee_active = 0;
-			spin_unlock_irqrestore(&priv->lock, flags);
+			mutex_unlock(&priv->lock);
 			goto out;
 		}
 		/* Activate the EEE and start timers */
-		spin_lock_irqsave(&priv->lock, flags);
+		mutex_lock(&priv->lock);
 		if (!priv->eee_active) {
 			priv->eee_active = 1;
 			timer_setup(&priv->eee_ctrl_timer,
@@ -435,7 +434,7 @@ bool stmmac_eee_init(struct stmmac_priv *priv)
 		stmmac_set_eee_pls(priv, priv->hw, ndev->phydev->link);
 
 		ret = true;
-		spin_unlock_irqrestore(&priv->lock, flags);
+		mutex_unlock(&priv->lock);
 
 		netdev_dbg(priv->dev, "Energy-Efficient Ethernet initialized\n");
 	}
@@ -811,13 +810,12 @@ static void stmmac_adjust_link(struct net_device *dev)
 {
 	struct stmmac_priv *priv = netdev_priv(dev);
 	struct phy_device *phydev = dev->phydev;
-	unsigned long flags;
 	bool new_state = false;
 
 	if (!phydev)
 		return;
 
-	spin_lock_irqsave(&priv->lock, flags);
+	mutex_lock(&priv->lock);
 
 	if (phydev->link) {
 		u32 ctrl = readl(priv->ioaddr + MAC_CTRL_REG);
@@ -876,7 +874,7 @@ static void stmmac_adjust_link(struct net_device *dev)
 	if (new_state && netif_msg_link(priv))
 		phy_print_status(phydev);
 
-	spin_unlock_irqrestore(&priv->lock, flags);
+	mutex_unlock(&priv->lock);
 
 	if (phydev->is_pseudo_fixed_link)
 		/* Stop PHY layer to call the hook to adjust the link in case
@@ -4275,7 +4273,7 @@ int stmmac_dvr_probe(struct device *device,
 			       (8 * priv->plat->rx_queues_to_use));
 	}
 
-	spin_lock_init(&priv->lock);
+	mutex_init(&priv->lock);
 
 	/* If a specific clk_csr value is passed from the platform
 	 * this means that the CSR Clock Range selection cannot be
@@ -4359,6 +4357,7 @@ int stmmac_dvr_remove(struct device *dev)
 	    priv->hw->pcs != STMMAC_PCS_RTBI)
 		stmmac_mdio_unregister(ndev);
 	destroy_workqueue(priv->wq);
+	mutex_destroy(&priv->lock);
 	free_netdev(ndev);
 
 	return 0;
@@ -4376,7 +4375,6 @@ int stmmac_suspend(struct device *dev)
 {
 	struct net_device *ndev = dev_get_drvdata(dev);
 	struct stmmac_priv *priv = netdev_priv(ndev);
-	unsigned long flags;
 
 	if (!ndev || !netif_running(ndev))
 		return 0;
@@ -4384,7 +4382,7 @@ int stmmac_suspend(struct device *dev)
 	if (ndev->phydev)
 		phy_stop(ndev->phydev);
 
-	spin_lock_irqsave(&priv->lock, flags);
+	mutex_lock(&priv->lock);
 
 	netif_device_detach(ndev);
 	stmmac_stop_all_queues(priv);
@@ -4405,7 +4403,7 @@ int stmmac_suspend(struct device *dev)
 		clk_disable(priv->plat->pclk);
 		clk_disable(priv->plat->stmmac_clk);
 	}
-	spin_unlock_irqrestore(&priv->lock, flags);
+	mutex_unlock(&priv->lock);
 
 	priv->oldlink = false;
 	priv->speed = SPEED_UNKNOWN;
@@ -4450,7 +4448,6 @@ int stmmac_resume(struct device *dev)
 {
 	struct net_device *ndev = dev_get_drvdata(dev);
 	struct stmmac_priv *priv = netdev_priv(ndev);
-	unsigned long flags;
 
 	if (!netif_running(ndev))
 		return 0;
@@ -4462,9 +4459,9 @@ int stmmac_resume(struct device *dev)
 	 * from another devices (e.g. serial console).
 	 */
 	if (device_may_wakeup(priv->device)) {
-		spin_lock_irqsave(&priv->lock, flags);
+		mutex_lock(&priv->lock);
 		stmmac_pmt(priv, priv->hw, 0);
-		spin_unlock_irqrestore(&priv->lock, flags);
+		mutex_unlock(&priv->lock);
 		priv->irq_wake = 0;
 	} else {
 		pinctrl_pm_select_default_state(priv->device);
@@ -4478,7 +4475,7 @@ int stmmac_resume(struct device *dev)
 
 	netif_device_attach(ndev);
 
-	spin_lock_irqsave(&priv->lock, flags);
+	mutex_lock(&priv->lock);
 
 	stmmac_reset_queues_param(priv);
 
@@ -4492,7 +4489,7 @@ int stmmac_resume(struct device *dev)
 
 	stmmac_start_all_queues(priv);
 
-	spin_unlock_irqrestore(&priv->lock, flags);
+	mutex_unlock(&priv->lock);
 
 	if (ndev->phydev)
 		phy_start(ndev->phydev);
-- 
cgit v1.2.3-59-g8ed1b


From cb1603948a0b80f27d2525bb7ac0d4bbb3849926 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Thu, 24 May 2018 17:49:35 +0200
Subject: vrf: add CRC32c offload to device features
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SCTP sockets originated in a VRF can improve their performance if CRC32c
computation is delegated to underlying devices: update device features,
setting NETIF_F_SCTP_CRC. Iterating the following command in the topology
proposed with [1],

 # ip vrf exec vrf-h2 netperf -H 192.0.2.1 -t SCTP_STREAM -- -m 10K

the measured throughput in Mbit/s improved from 2395 ± 1% to 2720 ± 1%.

[1] https://www.spinics.net/lists/netdev/msg486007.html

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vrf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 90b5f3900c22..f93547f257fb 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -1254,7 +1254,7 @@ static void vrf_setup(struct net_device *dev)
 
 	/* enable offload features */
 	dev->features   |= NETIF_F_GSO_SOFTWARE;
-	dev->features   |= NETIF_F_RXCSUM | NETIF_F_HW_CSUM;
+	dev->features   |= NETIF_F_RXCSUM | NETIF_F_HW_CSUM | NETIF_F_SCTP_CRC;
 	dev->features   |= NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA;
 
 	dev->hw_features = dev->features;
-- 
cgit v1.2.3-59-g8ed1b


From 30c8bd5aa8b2c78546c3e52337101b9c85879320 Mon Sep 17 00:00:00 2001
From: Sridhar Samudrala <sridhar.samudrala@intel.com>
Date: Thu, 24 May 2018 09:55:13 -0700
Subject: net: Introduce generic failover module

The failover module provides a generic interface for paravirtual drivers
to register a netdev and a set of ops with a failover instance. The ops
are used as event handlers that get called to handle netdev register/
unregister/link change/name change events on slave pci ethernet devices
with the same mac address as the failover netdev.

This enables paravirtual drivers to use a VF as an accelerated low latency
datapath. It also allows migration of VMs with direct attached VFs by
failing over to the paravirtual datapath when the VF is unplugged.

Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/failover.rst |  18 ++
 MAINTAINERS                           |   8 +
 include/linux/netdevice.h             |  16 ++
 include/net/failover.h                |  36 ++++
 net/Kconfig                           |  13 ++
 net/core/Makefile                     |   1 +
 net/core/failover.c                   | 315 ++++++++++++++++++++++++++++++++++
 7 files changed, 407 insertions(+)
 create mode 100644 Documentation/networking/failover.rst
 create mode 100644 include/net/failover.h
 create mode 100644 net/core/failover.c

diff --git a/Documentation/networking/failover.rst b/Documentation/networking/failover.rst
new file mode 100644
index 000000000000..f0c8483cdbf5
--- /dev/null
+++ b/Documentation/networking/failover.rst
@@ -0,0 +1,18 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+========
+FAILOVER
+========
+
+Overview
+========
+
+The failover module provides a generic interface for paravirtual drivers
+to register a netdev and a set of ops with a failover instance. The ops
+are used as event handlers that get called to handle netdev register/
+unregister/link change/name change events on slave pci ethernet devices
+with the same mac address as the failover netdev.
+
+This enables paravirtual drivers to use a VF as an accelerated low latency
+datapath. It also allows live migration of VMs with direct attached VFs by
+failing over to the paravirtual datapath when the VF is unplugged.
diff --git a/MAINTAINERS b/MAINTAINERS
index f492431b239b..6c59bdf49a8a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5411,6 +5411,14 @@ S:	Maintained
 F:	Documentation/hwmon/f71805f
 F:	drivers/hwmon/f71805f.c
 
+FAILOVER MODULE
+M:	Sridhar Samudrala <sridhar.samudrala@intel.com>
+L:	netdev@vger.kernel.org
+S:	Supported
+F:	net/core/failover.c
+F:	include/net/failover.h
+F:	Documentation/networking/failover.rst
+
 FANOTIFY
 M:	Jan Kara <jack@suse.cz>
 R:	Amir Goldstein <amir73il@gmail.com>
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8452f72087ef..f45b1a4e37ab 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1425,6 +1425,8 @@ struct net_device_ops {
  *	entity (i.e. the master device for bridged veth)
  * @IFF_MACSEC: device is a MACsec device
  * @IFF_NO_RX_HANDLER: device doesn't support the rx_handler hook
+ * @IFF_FAILOVER: device is a failover master device
+ * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device
  */
 enum netdev_priv_flags {
 	IFF_802_1Q_VLAN			= 1<<0,
@@ -1454,6 +1456,8 @@ enum netdev_priv_flags {
 	IFF_PHONY_HEADROOM		= 1<<24,
 	IFF_MACSEC			= 1<<25,
 	IFF_NO_RX_HANDLER		= 1<<26,
+	IFF_FAILOVER			= 1<<27,
+	IFF_FAILOVER_SLAVE		= 1<<28,
 };
 
 #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
@@ -1482,6 +1486,8 @@ enum netdev_priv_flags {
 #define IFF_RXFH_CONFIGURED		IFF_RXFH_CONFIGURED
 #define IFF_MACSEC			IFF_MACSEC
 #define IFF_NO_RX_HANDLER		IFF_NO_RX_HANDLER
+#define IFF_FAILOVER			IFF_FAILOVER
+#define IFF_FAILOVER_SLAVE		IFF_FAILOVER_SLAVE
 
 /**
  *	struct net_device - The DEVICE structure.
@@ -4336,6 +4342,16 @@ static inline bool netif_is_rxfh_configured(const struct net_device *dev)
 	return dev->priv_flags & IFF_RXFH_CONFIGURED;
 }
 
+static inline bool netif_is_failover(const struct net_device *dev)
+{
+	return dev->priv_flags & IFF_FAILOVER;
+}
+
+static inline bool netif_is_failover_slave(const struct net_device *dev)
+{
+	return dev->priv_flags & IFF_FAILOVER_SLAVE;
+}
+
 /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */
 static inline void netif_keep_dst(struct net_device *dev)
 {
diff --git a/include/net/failover.h b/include/net/failover.h
new file mode 100644
index 000000000000..bb15438f39c7
--- /dev/null
+++ b/include/net/failover.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018, Intel Corporation. */
+
+#ifndef _FAILOVER_H
+#define _FAILOVER_H
+
+#include <linux/netdevice.h>
+
+struct failover_ops {
+	int (*slave_pre_register)(struct net_device *slave_dev,
+				  struct net_device *failover_dev);
+	int (*slave_register)(struct net_device *slave_dev,
+			      struct net_device *failover_dev);
+	int (*slave_pre_unregister)(struct net_device *slave_dev,
+				    struct net_device *failover_dev);
+	int (*slave_unregister)(struct net_device *slave_dev,
+				struct net_device *failover_dev);
+	int (*slave_link_change)(struct net_device *slave_dev,
+				 struct net_device *failover_dev);
+	int (*slave_name_change)(struct net_device *slave_dev,
+				 struct net_device *failover_dev);
+	rx_handler_result_t (*slave_handle_frame)(struct sk_buff **pskb);
+};
+
+struct failover {
+	struct list_head list;
+	struct net_device __rcu *failover_dev;
+	struct failover_ops __rcu *ops;
+};
+
+struct failover *failover_register(struct net_device *dev,
+				   struct failover_ops *ops);
+void failover_unregister(struct failover *failover);
+int failover_slave_unregister(struct net_device *slave_dev);
+
+#endif /* _FAILOVER_H */
diff --git a/net/Kconfig b/net/Kconfig
index ba554cedb615..f738a6f27665 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -432,6 +432,19 @@ config MAY_USE_DEVLINK
 config PAGE_POOL
        bool
 
+config FAILOVER
+	tristate "Generic failover module"
+	help
+	  The failover module provides a generic interface for paravirtual
+	  drivers to register a netdev and a set of ops with a failover
+	  instance. The ops are used as event handlers that get called to
+	  handle netdev register/unregister/link change/name change events
+	  on slave pci ethernet devices with the same mac address as the
+	  failover netdev. This enables paravirtual drivers to use a
+	  VF as an accelerated low latency datapath. It also allows live
+	  migration of VMs with direct attached VFs by failing over to the
+	  paravirtual datapath when the VF is unplugged.
+
 endif   # if NET
 
 # Used by archs to tell that they support BPF JIT compiler plus which flavour.
diff --git a/net/core/Makefile b/net/core/Makefile
index 7080417f8bc8..80175e6a2eb8 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -31,3 +31,4 @@ obj-$(CONFIG_DST_CACHE) += dst_cache.o
 obj-$(CONFIG_HWBM) += hwbm.o
 obj-$(CONFIG_NET_DEVLINK) += devlink.o
 obj-$(CONFIG_GRO_CELLS) += gro_cells.o
+obj-$(CONFIG_FAILOVER) += failover.o
diff --git a/net/core/failover.c b/net/core/failover.c
new file mode 100644
index 000000000000..4a92a98ccce9
--- /dev/null
+++ b/net/core/failover.c
@@ -0,0 +1,315 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Intel Corporation. */
+
+/* A common module to handle registrations and notifications for paravirtual
+ * drivers to enable accelerated datapath and support VF live migration.
+ *
+ * The notifier and event handling code is based on netvsc driver.
+ */
+
+#include <linux/module.h>
+#include <linux/etherdevice.h>
+#include <uapi/linux/if_arp.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_vlan.h>
+#include <net/failover.h>
+
+static LIST_HEAD(failover_list);
+static DEFINE_SPINLOCK(failover_lock);
+
+static struct net_device *failover_get_bymac(u8 *mac, struct failover_ops **ops)
+{
+	struct net_device *failover_dev;
+	struct failover *failover;
+
+	spin_lock(&failover_lock);
+	list_for_each_entry(failover, &failover_list, list) {
+		failover_dev = rtnl_dereference(failover->failover_dev);
+		if (ether_addr_equal(failover_dev->perm_addr, mac)) {
+			*ops = rtnl_dereference(failover->ops);
+			spin_unlock(&failover_lock);
+			return failover_dev;
+		}
+	}
+	spin_unlock(&failover_lock);
+	return NULL;
+}
+
+/**
+ * failover_slave_register - Register a slave netdev
+ *
+ * @slave_dev: slave netdev that is being registered
+ *
+ * Registers a slave device to a failover instance. Only ethernet devices
+ * are supported.
+ */
+static int failover_slave_register(struct net_device *slave_dev)
+{
+	struct netdev_lag_upper_info lag_upper_info;
+	struct net_device *failover_dev;
+	struct failover_ops *fops;
+	int err;
+
+	if (slave_dev->type != ARPHRD_ETHER)
+		goto done;
+
+	ASSERT_RTNL();
+
+	failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+	if (!failover_dev)
+		goto done;
+
+	if (fops && fops->slave_pre_register &&
+	    fops->slave_pre_register(slave_dev, failover_dev))
+		goto done;
+
+	err = netdev_rx_handler_register(slave_dev, fops->slave_handle_frame,
+					 failover_dev);
+	if (err) {
+		netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n",
+			   err);
+		goto done;
+	}
+
+	lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP;
+	err = netdev_master_upper_dev_link(slave_dev, failover_dev, NULL,
+					   &lag_upper_info, NULL);
+	if (err) {
+		netdev_err(slave_dev, "can not set failover device %s (err = %d)\n",
+			   failover_dev->name, err);
+		goto err_upper_link;
+	}
+
+	slave_dev->priv_flags |= IFF_FAILOVER_SLAVE;
+
+	if (fops && fops->slave_register &&
+	    !fops->slave_register(slave_dev, failover_dev))
+		return NOTIFY_OK;
+
+	netdev_upper_dev_unlink(slave_dev, failover_dev);
+	slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
+err_upper_link:
+	netdev_rx_handler_unregister(slave_dev);
+done:
+	return NOTIFY_DONE;
+}
+
+/**
+ * failover_slave_unregister - Unregister a slave netdev
+ *
+ * @slave_dev: slave netdev that is being unregistered
+ *
+ * Unregisters a slave device from a failover instance.
+ */
+int failover_slave_unregister(struct net_device *slave_dev)
+{
+	struct net_device *failover_dev;
+	struct failover_ops *fops;
+
+	if (!netif_is_failover_slave(slave_dev))
+		goto done;
+
+	ASSERT_RTNL();
+
+	failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+	if (!failover_dev)
+		goto done;
+
+	if (fops && fops->slave_pre_unregister &&
+	    fops->slave_pre_unregister(slave_dev, failover_dev))
+		goto done;
+
+	netdev_rx_handler_unregister(slave_dev);
+	netdev_upper_dev_unlink(slave_dev, failover_dev);
+	slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
+
+	if (fops && fops->slave_unregister &&
+	    !fops->slave_unregister(slave_dev, failover_dev))
+		return NOTIFY_OK;
+
+done:
+	return NOTIFY_DONE;
+}
+EXPORT_SYMBOL_GPL(failover_slave_unregister);
+
+static int failover_slave_link_change(struct net_device *slave_dev)
+{
+	struct net_device *failover_dev;
+	struct failover_ops *fops;
+
+	if (!netif_is_failover_slave(slave_dev))
+		goto done;
+
+	ASSERT_RTNL();
+
+	failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+	if (!failover_dev)
+		goto done;
+
+	if (!netif_running(failover_dev))
+		goto done;
+
+	if (fops && fops->slave_link_change &&
+	    !fops->slave_link_change(slave_dev, failover_dev))
+		return NOTIFY_OK;
+
+done:
+	return NOTIFY_DONE;
+}
+
+static int failover_slave_name_change(struct net_device *slave_dev)
+{
+	struct net_device *failover_dev;
+	struct failover_ops *fops;
+
+	if (!netif_is_failover_slave(slave_dev))
+		goto done;
+
+	ASSERT_RTNL();
+
+	failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
+	if (!failover_dev)
+		goto done;
+
+	if (!netif_running(failover_dev))
+		goto done;
+
+	if (fops && fops->slave_name_change &&
+	    !fops->slave_name_change(slave_dev, failover_dev))
+		return NOTIFY_OK;
+
+done:
+	return NOTIFY_DONE;
+}
+
+static int
+failover_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+
+	/* Skip parent events */
+	if (netif_is_failover(event_dev))
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_REGISTER:
+		return failover_slave_register(event_dev);
+	case NETDEV_UNREGISTER:
+		return failover_slave_unregister(event_dev);
+	case NETDEV_UP:
+	case NETDEV_DOWN:
+	case NETDEV_CHANGE:
+		return failover_slave_link_change(event_dev);
+	case NETDEV_CHANGENAME:
+		return failover_slave_name_change(event_dev);
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static struct notifier_block failover_notifier = {
+	.notifier_call = failover_event,
+};
+
+static void
+failover_existing_slave_register(struct net_device *failover_dev)
+{
+	struct net *net = dev_net(failover_dev);
+	struct net_device *dev;
+
+	rtnl_lock();
+	for_each_netdev(net, dev) {
+		if (netif_is_failover(dev))
+			continue;
+		if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr))
+			failover_slave_register(dev);
+	}
+	rtnl_unlock();
+}
+
+/**
+ * failover_register - Register a failover instance
+ *
+ * @dev: failover netdev
+ * @ops: failover ops
+ *
+ * Allocate and register a failover instance for a failover netdev. ops
+ * provides handlers for slave device register/unregister/link change/
+ * name change events.
+ *
+ * Return: pointer to failover instance
+ */
+struct failover *failover_register(struct net_device *dev,
+				   struct failover_ops *ops)
+{
+	struct failover *failover;
+
+	if (dev->type != ARPHRD_ETHER)
+		return ERR_PTR(-EINVAL);
+
+	failover = kzalloc(sizeof(*failover), GFP_KERNEL);
+	if (!failover)
+		return ERR_PTR(-ENOMEM);
+
+	rcu_assign_pointer(failover->ops, ops);
+	dev_hold(dev);
+	dev->priv_flags |= IFF_FAILOVER;
+	rcu_assign_pointer(failover->failover_dev, dev);
+
+	spin_lock(&failover_lock);
+	list_add_tail(&failover->list, &failover_list);
+	spin_unlock(&failover_lock);
+
+	netdev_info(dev, "failover master:%s registered\n", dev->name);
+
+	failover_existing_slave_register(dev);
+
+	return failover;
+}
+EXPORT_SYMBOL_GPL(failover_register);
+
+/**
+ * failover_unregister - Unregister a failover instance
+ *
+ * @failover: pointer to failover instance
+ *
+ * Unregisters and frees a failover instance.
+ */
+void failover_unregister(struct failover *failover)
+{
+	struct net_device *failover_dev;
+
+	failover_dev = rcu_dereference(failover->failover_dev);
+
+	netdev_info(failover_dev, "failover master:%s unregistered\n",
+		    failover_dev->name);
+
+	failover_dev->priv_flags &= ~IFF_FAILOVER;
+	dev_put(failover_dev);
+
+	spin_lock(&failover_lock);
+	list_del(&failover->list);
+	spin_unlock(&failover_lock);
+
+	kfree(failover);
+}
+EXPORT_SYMBOL_GPL(failover_unregister);
+
+static __init int
+failover_init(void)
+{
+	register_netdevice_notifier(&failover_notifier);
+
+	return 0;
+}
+module_init(failover_init);
+
+static __exit
+void failover_exit(void)
+{
+	unregister_netdevice_notifier(&failover_notifier);
+}
+module_exit(failover_exit);
+
+MODULE_DESCRIPTION("Generic failover infrastructure/interface");
+MODULE_LICENSE("GPL v2");
-- 
cgit v1.2.3-59-g8ed1b


From 1ff78076d8dd182c883e6caa285479ae7db32ef8 Mon Sep 17 00:00:00 2001
From: Sridhar Samudrala <sridhar.samudrala@intel.com>
Date: Thu, 24 May 2018 09:55:14 -0700
Subject: netvsc: refactor notifier/event handling code to use the failover
 framework

Use the registration/notification framework supported by the generic
failover infrastructure.

Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/Kconfig      |   1 +
 drivers/net/hyperv/hyperv_net.h |   2 +
 drivers/net/hyperv/netvsc_drv.c | 222 +++++++++++-----------------------------
 3 files changed, 60 insertions(+), 165 deletions(-)

diff --git a/drivers/net/hyperv/Kconfig b/drivers/net/hyperv/Kconfig
index 0765d5f61714..23a2d145813a 100644
--- a/drivers/net/hyperv/Kconfig
+++ b/drivers/net/hyperv/Kconfig
@@ -2,5 +2,6 @@ config HYPERV_NET
 	tristate "Microsoft Hyper-V virtual network driver"
 	depends on HYPERV
 	select UCS2_STRING
+	select FAILOVER
 	help
 	  Select this option to enable the Hyper-V virtual network driver.
diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 1be34d2e3563..99d8e7398a5b 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -932,6 +932,8 @@ struct net_device_context {
 	u32 vf_alloc;
 	/* Serial number of the VF to team with */
 	u32 vf_serial;
+
+	struct failover *failover;
 };
 
 /* Per channel data */
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 60a5769ef5a1..ebe964203eff 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -43,6 +43,7 @@
 #include <net/pkt_sched.h>
 #include <net/checksum.h>
 #include <net/ip6_checksum.h>
+#include <net/failover.h>
 
 #include "hyperv_net.h"
 
@@ -1779,46 +1780,6 @@ out_unlock:
 	rtnl_unlock();
 }
 
-static struct net_device *get_netvsc_bymac(const u8 *mac)
-{
-	struct net_device *dev;
-
-	ASSERT_RTNL();
-
-	for_each_netdev(&init_net, dev) {
-		if (dev->netdev_ops != &device_ops)
-			continue;	/* not a netvsc device */
-
-		if (ether_addr_equal(mac, dev->perm_addr))
-			return dev;
-	}
-
-	return NULL;
-}
-
-static struct net_device *get_netvsc_byref(struct net_device *vf_netdev)
-{
-	struct net_device *dev;
-
-	ASSERT_RTNL();
-
-	for_each_netdev(&init_net, dev) {
-		struct net_device_context *net_device_ctx;
-
-		if (dev->netdev_ops != &device_ops)
-			continue;	/* not a netvsc device */
-
-		net_device_ctx = netdev_priv(dev);
-		if (!rtnl_dereference(net_device_ctx->nvdev))
-			continue;	/* device is removed */
-
-		if (rtnl_dereference(net_device_ctx->vf_netdev) == vf_netdev)
-			return dev;	/* a match */
-	}
-
-	return NULL;
-}
-
 /* Called when VF is injecting data into network stack.
  * Change the associated network device from VF to netvsc.
  * note: already called with rcu_read_lock
@@ -1841,46 +1802,6 @@ static rx_handler_result_t netvsc_vf_handle_frame(struct sk_buff **pskb)
 	return RX_HANDLER_ANOTHER;
 }
 
-static int netvsc_vf_join(struct net_device *vf_netdev,
-			  struct net_device *ndev)
-{
-	struct net_device_context *ndev_ctx = netdev_priv(ndev);
-	int ret;
-
-	ret = netdev_rx_handler_register(vf_netdev,
-					 netvsc_vf_handle_frame, ndev);
-	if (ret != 0) {
-		netdev_err(vf_netdev,
-			   "can not register netvsc VF receive handler (err = %d)\n",
-			   ret);
-		goto rx_handler_failed;
-	}
-
-	ret = netdev_master_upper_dev_link(vf_netdev, ndev,
-					   NULL, NULL, NULL);
-	if (ret != 0) {
-		netdev_err(vf_netdev,
-			   "can not set master device %s (err = %d)\n",
-			   ndev->name, ret);
-		goto upper_link_failed;
-	}
-
-	/* set slave flag before open to prevent IPv6 addrconf */
-	vf_netdev->flags |= IFF_SLAVE;
-
-	schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);
-
-	call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
-
-	netdev_info(vf_netdev, "joined to %s\n", ndev->name);
-	return 0;
-
-upper_link_failed:
-	netdev_rx_handler_unregister(vf_netdev);
-rx_handler_failed:
-	return ret;
-}
-
 static void __netvsc_vf_setup(struct net_device *ndev,
 			      struct net_device *vf_netdev)
 {
@@ -1931,85 +1852,95 @@ static void netvsc_vf_setup(struct work_struct *w)
 	rtnl_unlock();
 }
 
-static int netvsc_register_vf(struct net_device *vf_netdev)
+static int netvsc_pre_register_vf(struct net_device *vf_netdev,
+				  struct net_device *ndev)
 {
-	struct net_device *ndev;
 	struct net_device_context *net_device_ctx;
 	struct netvsc_device *netvsc_dev;
 
-	if (vf_netdev->addr_len != ETH_ALEN)
-		return NOTIFY_DONE;
-
-	/*
-	 * We will use the MAC address to locate the synthetic interface to
-	 * associate with the VF interface. If we don't find a matching
-	 * synthetic interface, move on.
-	 */
-	ndev = get_netvsc_bymac(vf_netdev->perm_addr);
-	if (!ndev)
-		return NOTIFY_DONE;
-
 	net_device_ctx = netdev_priv(ndev);
 	netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
 	if (!netvsc_dev || rtnl_dereference(net_device_ctx->vf_netdev))
-		return NOTIFY_DONE;
+		return -ENODEV;
+
+	return 0;
+}
+
+static int netvsc_register_vf(struct net_device *vf_netdev,
+			      struct net_device *ndev)
+{
+	struct net_device_context *ndev_ctx = netdev_priv(ndev);
 
-	if (netvsc_vf_join(vf_netdev, ndev) != 0)
-		return NOTIFY_DONE;
+	/* set slave flag before open to prevent IPv6 addrconf */
+	vf_netdev->flags |= IFF_SLAVE;
 
-	netdev_info(ndev, "VF registering: %s\n", vf_netdev->name);
+	schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT);
+
+	call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
+
+	netdev_info(vf_netdev, "joined to %s\n", ndev->name);
 
 	dev_hold(vf_netdev);
-	rcu_assign_pointer(net_device_ctx->vf_netdev, vf_netdev);
-	return NOTIFY_OK;
+	rcu_assign_pointer(ndev_ctx->vf_netdev, vf_netdev);
+
+	return 0;
 }
 
 /* VF up/down change detected, schedule to change data path */
-static int netvsc_vf_changed(struct net_device *vf_netdev)
+static int netvsc_vf_changed(struct net_device *vf_netdev,
+			     struct net_device *ndev)
 {
 	struct net_device_context *net_device_ctx;
 	struct netvsc_device *netvsc_dev;
-	struct net_device *ndev;
 	bool vf_is_up = netif_running(vf_netdev);
 
-	ndev = get_netvsc_byref(vf_netdev);
-	if (!ndev)
-		return NOTIFY_DONE;
-
 	net_device_ctx = netdev_priv(ndev);
 	netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
 	if (!netvsc_dev)
-		return NOTIFY_DONE;
+		return -ENODEV;
 
 	netvsc_switch_datapath(ndev, vf_is_up);
 	netdev_info(ndev, "Data path switched %s VF: %s\n",
 		    vf_is_up ? "to" : "from", vf_netdev->name);
 
-	return NOTIFY_OK;
+	return 0;
 }
 
-static int netvsc_unregister_vf(struct net_device *vf_netdev)
+static int netvsc_pre_unregister_vf(struct net_device *vf_netdev,
+				    struct net_device *ndev)
 {
-	struct net_device *ndev;
 	struct net_device_context *net_device_ctx;
 
-	ndev = get_netvsc_byref(vf_netdev);
-	if (!ndev)
-		return NOTIFY_DONE;
-
 	net_device_ctx = netdev_priv(ndev);
 	cancel_delayed_work_sync(&net_device_ctx->vf_takeover);
 
+	return 0;
+}
+
+static int netvsc_unregister_vf(struct net_device *vf_netdev,
+				struct net_device *ndev)
+{
+	struct net_device_context *net_device_ctx;
+
+	net_device_ctx = netdev_priv(ndev);
+
 	netdev_info(ndev, "VF unregistering: %s\n", vf_netdev->name);
 
-	netdev_rx_handler_unregister(vf_netdev);
-	netdev_upper_dev_unlink(vf_netdev, ndev);
 	RCU_INIT_POINTER(net_device_ctx->vf_netdev, NULL);
 	dev_put(vf_netdev);
 
-	return NOTIFY_OK;
+	return 0;
 }
 
+static struct failover_ops netvsc_failover_ops = {
+	.slave_pre_register	= netvsc_pre_register_vf,
+	.slave_register		= netvsc_register_vf,
+	.slave_pre_unregister	= netvsc_pre_unregister_vf,
+	.slave_unregister	= netvsc_unregister_vf,
+	.slave_link_change	= netvsc_vf_changed,
+	.slave_handle_frame	= netvsc_vf_handle_frame,
+};
+
 static int netvsc_probe(struct hv_device *dev,
 			const struct hv_vmbus_device_id *dev_id)
 {
@@ -2099,8 +2030,14 @@ static int netvsc_probe(struct hv_device *dev,
 		goto register_failed;
 	}
 
+	net_device_ctx->failover = failover_register(net, &netvsc_failover_ops);
+	if (IS_ERR(net_device_ctx->failover))
+		goto err_failover;
+
 	return ret;
 
+err_failover:
+	unregister_netdev(net);
 register_failed:
 	rndis_filter_device_remove(dev, nvdev);
 rndis_failed:
@@ -2141,13 +2078,15 @@ static int netvsc_remove(struct hv_device *dev)
 	rtnl_lock();
 	vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
 	if (vf_netdev)
-		netvsc_unregister_vf(vf_netdev);
+		failover_slave_unregister(vf_netdev);
 
 	if (nvdev)
 		rndis_filter_device_remove(dev, nvdev);
 
 	unregister_netdevice(net);
 
+	failover_unregister(ndev_ctx->failover);
+
 	rtnl_unlock();
 	rcu_read_unlock();
 
@@ -2174,54 +2113,8 @@ static struct  hv_driver netvsc_drv = {
 	.remove = netvsc_remove,
 };
 
-/*
- * On Hyper-V, every VF interface is matched with a corresponding
- * synthetic interface. The synthetic interface is presented first
- * to the guest. When the corresponding VF instance is registered,
- * we will take care of switching the data path.
- */
-static int netvsc_netdev_event(struct notifier_block *this,
-			       unsigned long event, void *ptr)
-{
-	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
-
-	/* Skip our own events */
-	if (event_dev->netdev_ops == &device_ops)
-		return NOTIFY_DONE;
-
-	/* Avoid non-Ethernet type devices */
-	if (event_dev->type != ARPHRD_ETHER)
-		return NOTIFY_DONE;
-
-	/* Avoid Vlan dev with same MAC registering as VF */
-	if (is_vlan_dev(event_dev))
-		return NOTIFY_DONE;
-
-	/* Avoid Bonding master dev with same MAC registering as VF */
-	if ((event_dev->priv_flags & IFF_BONDING) &&
-	    (event_dev->flags & IFF_MASTER))
-		return NOTIFY_DONE;
-
-	switch (event) {
-	case NETDEV_REGISTER:
-		return netvsc_register_vf(event_dev);
-	case NETDEV_UNREGISTER:
-		return netvsc_unregister_vf(event_dev);
-	case NETDEV_UP:
-	case NETDEV_DOWN:
-		return netvsc_vf_changed(event_dev);
-	default:
-		return NOTIFY_DONE;
-	}
-}
-
-static struct notifier_block netvsc_netdev_notifier = {
-	.notifier_call = netvsc_netdev_event,
-};
-
 static void __exit netvsc_drv_exit(void)
 {
-	unregister_netdevice_notifier(&netvsc_netdev_notifier);
 	vmbus_driver_unregister(&netvsc_drv);
 }
 
@@ -2241,7 +2134,6 @@ static int __init netvsc_drv_init(void)
 	if (ret)
 		return ret;
 
-	register_netdevice_notifier(&netvsc_netdev_notifier);
 	return 0;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From cfc80d9a11635404a40199a1c9471c96890f3f74 Mon Sep 17 00:00:00 2001
From: Sridhar Samudrala <sridhar.samudrala@intel.com>
Date: Thu, 24 May 2018 09:55:15 -0700
Subject: net: Introduce net_failover driver

The net_failover driver provides an automated failover mechanism via APIs
to create and destroy a failover master netdev and manages a primary and
standby slave netdevs that get registered via the generic failover
infrastructure.

The failover netdev acts a master device and controls 2 slave devices. The
original paravirtual interface gets registered as 'standby' slave netdev and
a passthru/vf device with the same MAC gets registered as 'primary' slave
netdev. Both 'standby' and 'failover' netdevs are associated with the same
'pci' device. The user accesses the network interface via 'failover' netdev.
The 'failover' netdev chooses 'primary' netdev as default for transmits when
it is available with link up and running.

This can be used by paravirtual drivers to enable an alternate low latency
datapath. It also enables hypervisor controlled live migration of a VM with
direct attached VF by failing over to the paravirtual datapath when the VF
is unplugged.

Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/net_failover.rst |  26 +
 MAINTAINERS                               |   8 +
 drivers/net/Kconfig                       |  12 +
 drivers/net/Makefile                      |   1 +
 drivers/net/net_failover.c                | 836 ++++++++++++++++++++++++++++++
 include/net/net_failover.h                |  40 ++
 6 files changed, 923 insertions(+)
 create mode 100644 Documentation/networking/net_failover.rst
 create mode 100644 drivers/net/net_failover.c
 create mode 100644 include/net/net_failover.h

diff --git a/Documentation/networking/net_failover.rst b/Documentation/networking/net_failover.rst
new file mode 100644
index 000000000000..d4513ad31809
--- /dev/null
+++ b/Documentation/networking/net_failover.rst
@@ -0,0 +1,26 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============
+NET_FAILOVER
+============
+
+Overview
+========
+
+The net_failover driver provides an automated failover mechanism via APIs
+to create and destroy a failover master netdev and mananges a primary and
+standby slave netdevs that get registered via the generic failover
+infrastructrure.
+
+The failover netdev acts a master device and controls 2 slave devices. The
+original paravirtual interface is registered as 'standby' slave netdev and
+a passthru/vf device with the same MAC gets registered as 'primary' slave
+netdev. Both 'standby' and 'failover' netdevs are associated with the same
+'pci' device. The user accesses the network interface via 'failover' netdev.
+The 'failover' netdev chooses 'primary' netdev as default for transmits when
+it is available with link up and running.
+
+This can be used by paravirtual drivers to enable an alternate low latency
+datapath. It also enables hypervisor controlled live migration of a VM with
+direct attached VF by failing over to the paravirtual datapath when the VF
+is unplugged.
diff --git a/MAINTAINERS b/MAINTAINERS
index 6c59bdf49a8a..1831ff5863a1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9654,6 +9654,14 @@ S:	Maintained
 F:	Documentation/hwmon/nct6775
 F:	drivers/hwmon/nct6775.c
 
+NET_FAILOVER MODULE
+M:	Sridhar Samudrala <sridhar.samudrala@intel.com>
+L:	netdev@vger.kernel.org
+S:	Supported
+F:	driver/net/net_failover.c
+F:	include/net/net_failover.h
+F:	Documentation/networking/net_failover.rst
+
 NETEFFECT IWARP RNIC DRIVER (IW_NES)
 M:	Faisal Latif <faisal.latif@intel.com>
 L:	linux-rdma@vger.kernel.org
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index a029b27fd002..2cdaff90a9ec 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -510,4 +510,16 @@ config NETDEVSIM
 	  To compile this driver as a module, choose M here: the module
 	  will be called netdevsim.
 
+config NET_FAILOVER
+	tristate "Failover driver"
+	select FAILOVER
+	help
+	  This provides an automated failover mechanism via APIs to create
+	  and destroy a failover master netdev and manages a primary and
+	  standby slave netdevs that get registered via the generic failover
+	  infrastructure. This can be used by paravirtual drivers to enable
+	  an alternate low latency datapath. It alsoenables live migration of
+	  a VM with direct attached VF by failing over to the paravirtual
+	  datapath when the VF is unplugged.
+
 endif # NETDEVICES
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 91e67e375dd4..21cde7e78621 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -78,3 +78,4 @@ obj-$(CONFIG_FUJITSU_ES) += fjes/
 thunderbolt-net-y += thunderbolt.o
 obj-$(CONFIG_THUNDERBOLT_NET) += thunderbolt-net.o
 obj-$(CONFIG_NETDEVSIM) += netdevsim/
+obj-$(CONFIG_NET_FAILOVER) += net_failover.o
diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c
new file mode 100644
index 000000000000..8b508e2cf29b
--- /dev/null
+++ b/drivers/net/net_failover.c
@@ -0,0 +1,836 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Intel Corporation. */
+
+/* This provides a net_failover interface for paravirtual drivers to
+ * provide an alternate datapath by exporting APIs to create and
+ * destroy a upper 'net_failover' netdev. The upper dev manages the
+ * original paravirtual interface as a 'standby' netdev and uses the
+ * generic failover infrastructure to register and manage a direct
+ * attached VF as a 'primary' netdev. This enables live migration of
+ * a VM with direct attached VF by failing over to the paravirtual
+ * datapath when the VF is unplugged.
+ *
+ * Some of the netdev management routines are based on bond/team driver as
+ * this driver provides active-backup functionality similar to those drivers.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/netpoll.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_vlan.h>
+#include <linux/pci.h>
+#include <net/sch_generic.h>
+#include <uapi/linux/if_arp.h>
+#include <net/net_failover.h>
+
+static bool net_failover_xmit_ready(struct net_device *dev)
+{
+	return netif_running(dev) && netif_carrier_ok(dev);
+}
+
+static int net_failover_open(struct net_device *dev)
+{
+	struct net_failover_info *nfo_info = netdev_priv(dev);
+	struct net_device *primary_dev, *standby_dev;
+	int err;
+
+	primary_dev = rtnl_dereference(nfo_info->primary_dev);
+	if (primary_dev) {
+		err = dev_open(primary_dev);
+		if (err)
+			goto err_primary_open;
+	}
+
+	standby_dev = rtnl_dereference(nfo_info->standby_dev);
+	if (standby_dev) {
+		err = dev_open(standby_dev);
+		if (err)
+			goto err_standby_open;
+	}
+
+	if ((primary_dev && net_failover_xmit_ready(primary_dev)) ||
+	    (standby_dev && net_failover_xmit_ready(standby_dev))) {
+		netif_carrier_on(dev);
+		netif_tx_wake_all_queues(dev);
+	}
+
+	return 0;
+
+err_standby_open:
+	dev_close(primary_dev);
+err_primary_open:
+	netif_tx_disable(dev);
+	return err;
+}
+
+static int net_failover_close(struct net_device *dev)
+{
+	struct net_failover_info *nfo_info = netdev_priv(dev);
+	struct net_device *slave_dev;
+
+	netif_tx_disable(dev);
+
+	slave_dev = rtnl_dereference(nfo_info->primary_dev);
+	if (slave_dev)
+		dev_close(slave_dev);
+
+	slave_dev = rtnl_dereference(nfo_info->standby_dev);
+	if (slave_dev)
+		dev_close(slave_dev);
+
+	return 0;
+}
+
+static netdev_tx_t net_failover_drop_xmit(struct sk_buff *skb,
+					  struct net_device *dev)
+{
+	atomic_long_inc(&dev->tx_dropped);
+	dev_kfree_skb_any(skb);
+	return NETDEV_TX_OK;
+}
+
+static netdev_tx_t net_failover_start_xmit(struct sk_buff *skb,
+					   struct net_device *dev)
+{
+	struct net_failover_info *nfo_info = netdev_priv(dev);
+	struct net_device *xmit_dev;
+
+	/* Try xmit via primary netdev followed by standby netdev */
+	xmit_dev = rcu_dereference_bh(nfo_info->primary_dev);
+	if (!xmit_dev || !net_failover_xmit_ready(xmit_dev)) {
+		xmit_dev = rcu_dereference_bh(nfo_info->standby_dev);
+		if (!xmit_dev || !net_failover_xmit_ready(xmit_dev))
+			return net_failover_drop_xmit(skb, dev);
+	}
+
+	skb->dev = xmit_dev;
+	skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
+
+	return dev_queue_xmit(skb);
+}
+
+static u16 net_failover_select_queue(struct net_device *dev,
+				     struct sk_buff *skb, void *accel_priv,
+				     select_queue_fallback_t fallback)
+{
+	struct net_failover_info *nfo_info = netdev_priv(dev);
+	struct net_device *primary_dev;
+	u16 txq;
+
+	primary_dev = rcu_dereference(nfo_info->primary_dev);
+	if (primary_dev) {
+		const struct net_device_ops *ops = primary_dev->netdev_ops;
+
+		if (ops->ndo_select_queue)
+			txq = ops->ndo_select_queue(primary_dev, skb,
+						    accel_priv, fallback);
+		else
+			txq = fallback(primary_dev, skb);
+
+		qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping;
+
+		return txq;
+	}
+
+	txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0;
+
+	/* Save the original txq to restore before passing to the driver */
+	qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping;
+
+	if (unlikely(txq >= dev->real_num_tx_queues)) {
+		do {
+			txq -= dev->real_num_tx_queues;
+		} while (txq >= dev->real_num_tx_queues);
+	}
+
+	return txq;
+}
+
+/* fold stats, assuming all rtnl_link_stats64 fields are u64, but
+ * that some drivers can provide 32bit values only.
+ */
+static void net_failover_fold_stats(struct rtnl_link_stats64 *_res,
+				    const struct rtnl_link_stats64 *_new,
+				    const struct rtnl_link_stats64 *_old)
+{
+	const u64 *new = (const u64 *)_new;
+	const u64 *old = (const u64 *)_old;
+	u64 *res = (u64 *)_res;
+	int i;
+
+	for (i = 0; i < sizeof(*_res) / sizeof(u64); i++) {
+		u64 nv = new[i];
+		u64 ov = old[i];
+		s64 delta = nv - ov;
+
+		/* detects if this particular field is 32bit only */
+		if (((nv | ov) >> 32) == 0)
+			delta = (s64)(s32)((u32)nv - (u32)ov);
+
+		/* filter anomalies, some drivers reset their stats
+		 * at down/up events.
+		 */
+		if (delta > 0)
+			res[i] += delta;
+	}
+}
+
+static void net_failover_get_stats(struct net_device *dev,
+				   struct rtnl_link_stats64 *stats)
+{
+	struct net_failover_info *nfo_info = netdev_priv(dev);
+	const struct rtnl_link_stats64 *new;
+	struct rtnl_link_stats64 temp;
+	struct net_device *slave_dev;
+
+	spin_lock(&nfo_info->stats_lock);
+	memcpy(stats, &nfo_info->failover_stats, sizeof(*stats));
+
+	rcu_read_lock();
+
+	slave_dev = rcu_dereference(nfo_info->primary_dev);
+	if (slave_dev) {
+		new = dev_get_stats(slave_dev, &temp);
+		net_failover_fold_stats(stats, new, &nfo_info->primary_stats);
+		memcpy(&nfo_info->primary_stats, new, sizeof(*new));
+	}
+
+	slave_dev = rcu_dereference(nfo_info->standby_dev);
+	if (slave_dev) {
+		new = dev_get_stats(slave_dev, &temp);
+		net_failover_fold_stats(stats, new, &nfo_info->standby_stats);
+		memcpy(&nfo_info->standby_stats, new, sizeof(*new));
+	}
+
+	rcu_read_unlock();
+
+	memcpy(&nfo_info->failover_stats, stats, sizeof(*stats));
+	spin_unlock(&nfo_info->stats_lock);
+}
+
+static int net_failover_change_mtu(struct net_device *dev, int new_mtu)
+{
+	struct net_failover_info *nfo_info = netdev_priv(dev);
+	struct net_device *primary_dev, *standby_dev;
+	int ret = 0;
+
+	primary_dev = rcu_dereference(nfo_info->primary_dev);
+	if (primary_dev) {
+		ret = dev_set_mtu(primary_dev, new_mtu);
+		if (ret)
+			return ret;
+	}
+
+	standby_dev = rcu_dereference(nfo_info->standby_dev);
+	if (standby_dev) {
+		ret = dev_set_mtu(standby_dev, new_mtu);
+		if (ret) {
+			if (primary_dev)
+				dev_set_mtu(primary_dev, dev->mtu);
+			return ret;
+		}
+	}
+
+	dev->mtu = new_mtu;
+
+	return 0;
+}
+
+static void net_failover_set_rx_mode(struct net_device *dev)
+{
+	struct net_failover_info *nfo_info = netdev_priv(dev);
+	struct net_device *slave_dev;
+
+	rcu_read_lock();
+
+	slave_dev = rcu_dereference(nfo_info->primary_dev);
+	if (slave_dev) {
+		dev_uc_sync_multiple(slave_dev, dev);
+		dev_mc_sync_multiple(slave_dev, dev);
+	}
+
+	slave_dev = rcu_dereference(nfo_info->standby_dev);
+	if (slave_dev) {
+		dev_uc_sync_multiple(slave_dev, dev);
+		dev_mc_sync_multiple(slave_dev, dev);
+	}
+
+	rcu_read_unlock();
+}
+
+static int net_failover_vlan_rx_add_vid(struct net_device *dev, __be16 proto,
+					u16 vid)
+{
+	struct net_failover_info *nfo_info = netdev_priv(dev);
+	struct net_device *primary_dev, *standby_dev;
+	int ret = 0;
+
+	primary_dev = rcu_dereference(nfo_info->primary_dev);
+	if (primary_dev) {
+		ret = vlan_vid_add(primary_dev, proto, vid);
+		if (ret)
+			return ret;
+	}
+
+	standby_dev = rcu_dereference(nfo_info->standby_dev);
+	if (standby_dev) {
+		ret = vlan_vid_add(standby_dev, proto, vid);
+		if (ret)
+			if (primary_dev)
+				vlan_vid_del(primary_dev, proto, vid);
+	}
+
+	return ret;
+}
+
+static int net_failover_vlan_rx_kill_vid(struct net_device *dev, __be16 proto,
+					 u16 vid)
+{
+	struct net_failover_info *nfo_info = netdev_priv(dev);
+	struct net_device *slave_dev;
+
+	slave_dev = rcu_dereference(nfo_info->primary_dev);
+	if (slave_dev)
+		vlan_vid_del(slave_dev, proto, vid);
+
+	slave_dev = rcu_dereference(nfo_info->standby_dev);
+	if (slave_dev)
+		vlan_vid_del(slave_dev, proto, vid);
+
+	return 0;
+}
+
+static const struct net_device_ops failover_dev_ops = {
+	.ndo_open		= net_failover_open,
+	.ndo_stop		= net_failover_close,
+	.ndo_start_xmit		= net_failover_start_xmit,
+	.ndo_select_queue	= net_failover_select_queue,
+	.ndo_get_stats64	= net_failover_get_stats,
+	.ndo_change_mtu		= net_failover_change_mtu,
+	.ndo_set_rx_mode	= net_failover_set_rx_mode,
+	.ndo_vlan_rx_add_vid	= net_failover_vlan_rx_add_vid,
+	.ndo_vlan_rx_kill_vid	= net_failover_vlan_rx_kill_vid,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_features_check	= passthru_features_check,
+};
+
+#define FAILOVER_NAME "net_failover"
+#define FAILOVER_VERSION "0.1"
+
+static void nfo_ethtool_get_drvinfo(struct net_device *dev,
+				    struct ethtool_drvinfo *drvinfo)
+{
+	strlcpy(drvinfo->driver, FAILOVER_NAME, sizeof(drvinfo->driver));
+	strlcpy(drvinfo->version, FAILOVER_VERSION, sizeof(drvinfo->version));
+}
+
+static int nfo_ethtool_get_link_ksettings(struct net_device *dev,
+					  struct ethtool_link_ksettings *cmd)
+{
+	struct net_failover_info *nfo_info = netdev_priv(dev);
+	struct net_device *slave_dev;
+
+	slave_dev = rtnl_dereference(nfo_info->primary_dev);
+	if (!slave_dev || !net_failover_xmit_ready(slave_dev)) {
+		slave_dev = rtnl_dereference(nfo_info->standby_dev);
+		if (!slave_dev || !net_failover_xmit_ready(slave_dev)) {
+			cmd->base.duplex = DUPLEX_UNKNOWN;
+			cmd->base.port = PORT_OTHER;
+			cmd->base.speed = SPEED_UNKNOWN;
+
+			return 0;
+		}
+	}
+
+	return __ethtool_get_link_ksettings(slave_dev, cmd);
+}
+
+static const struct ethtool_ops failover_ethtool_ops = {
+	.get_drvinfo            = nfo_ethtool_get_drvinfo,
+	.get_link               = ethtool_op_get_link,
+	.get_link_ksettings     = nfo_ethtool_get_link_ksettings,
+};
+
+/* Called when slave dev is injecting data into network stack.
+ * Change the associated network device from lower dev to failover dev.
+ * note: already called with rcu_read_lock
+ */
+static rx_handler_result_t net_failover_handle_frame(struct sk_buff **pskb)
+{
+	struct sk_buff *skb = *pskb;
+	struct net_device *dev = rcu_dereference(skb->dev->rx_handler_data);
+	struct net_failover_info *nfo_info = netdev_priv(dev);
+	struct net_device *primary_dev, *standby_dev;
+
+	primary_dev = rcu_dereference(nfo_info->primary_dev);
+	standby_dev = rcu_dereference(nfo_info->standby_dev);
+
+	if (primary_dev && skb->dev == standby_dev)
+		return RX_HANDLER_EXACT;
+
+	skb->dev = dev;
+
+	return RX_HANDLER_ANOTHER;
+}
+
+static void net_failover_compute_features(struct net_device *dev)
+{
+	u32 vlan_features = FAILOVER_VLAN_FEATURES & NETIF_F_ALL_FOR_ALL;
+	netdev_features_t enc_features  = FAILOVER_ENC_FEATURES;
+	unsigned short max_hard_header_len = ETH_HLEN;
+	unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE |
+					IFF_XMIT_DST_RELEASE_PERM;
+	struct net_failover_info *nfo_info = netdev_priv(dev);
+	struct net_device *primary_dev, *standby_dev;
+
+	primary_dev = rcu_dereference(nfo_info->primary_dev);
+	if (primary_dev) {
+		vlan_features =
+			netdev_increment_features(vlan_features,
+						  primary_dev->vlan_features,
+						  FAILOVER_VLAN_FEATURES);
+		enc_features =
+			netdev_increment_features(enc_features,
+						  primary_dev->hw_enc_features,
+						  FAILOVER_ENC_FEATURES);
+
+		dst_release_flag &= primary_dev->priv_flags;
+		if (primary_dev->hard_header_len > max_hard_header_len)
+			max_hard_header_len = primary_dev->hard_header_len;
+	}
+
+	standby_dev = rcu_dereference(nfo_info->standby_dev);
+	if (standby_dev) {
+		vlan_features =
+			netdev_increment_features(vlan_features,
+						  standby_dev->vlan_features,
+						  FAILOVER_VLAN_FEATURES);
+		enc_features =
+			netdev_increment_features(enc_features,
+						  standby_dev->hw_enc_features,
+						  FAILOVER_ENC_FEATURES);
+
+		dst_release_flag &= standby_dev->priv_flags;
+		if (standby_dev->hard_header_len > max_hard_header_len)
+			max_hard_header_len = standby_dev->hard_header_len;
+	}
+
+	dev->vlan_features = vlan_features;
+	dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL;
+	dev->hard_header_len = max_hard_header_len;
+
+	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+	if (dst_release_flag == (IFF_XMIT_DST_RELEASE |
+				 IFF_XMIT_DST_RELEASE_PERM))
+		dev->priv_flags |= IFF_XMIT_DST_RELEASE;
+
+	netdev_change_features(dev);
+}
+
+static void net_failover_lower_state_changed(struct net_device *slave_dev,
+					     struct net_device *primary_dev,
+					     struct net_device *standby_dev)
+{
+	struct netdev_lag_lower_state_info info;
+
+	if (netif_carrier_ok(slave_dev))
+		info.link_up = true;
+	else
+		info.link_up = false;
+
+	if (slave_dev == primary_dev) {
+		if (netif_running(primary_dev))
+			info.tx_enabled = true;
+		else
+			info.tx_enabled = false;
+	} else {
+		if ((primary_dev && netif_running(primary_dev)) ||
+		    (!netif_running(standby_dev)))
+			info.tx_enabled = false;
+		else
+			info.tx_enabled = true;
+	}
+
+	netdev_lower_state_changed(slave_dev, &info);
+}
+
+static int net_failover_slave_pre_register(struct net_device *slave_dev,
+					   struct net_device *failover_dev)
+{
+	struct net_device *standby_dev, *primary_dev;
+	struct net_failover_info *nfo_info;
+	bool slave_is_standby;
+
+	nfo_info = netdev_priv(failover_dev);
+	standby_dev = rtnl_dereference(nfo_info->standby_dev);
+	primary_dev = rtnl_dereference(nfo_info->primary_dev);
+	slave_is_standby = slave_dev->dev.parent == failover_dev->dev.parent;
+	if (slave_is_standby ? standby_dev : primary_dev) {
+		netdev_err(failover_dev, "%s attempting to register as slave dev when %s already present\n",
+			   slave_dev->name,
+			   slave_is_standby ? "standby" : "primary");
+		return -EINVAL;
+	}
+
+	/* We want to allow only a direct attached VF device as a primary
+	 * netdev. As there is no easy way to check for a VF device, restrict
+	 * this to a pci device.
+	 */
+	if (!slave_is_standby && (!slave_dev->dev.parent ||
+				  !dev_is_pci(slave_dev->dev.parent)))
+		return -EINVAL;
+
+	if (failover_dev->features & NETIF_F_VLAN_CHALLENGED &&
+	    vlan_uses_dev(failover_dev)) {
+		netdev_err(failover_dev, "Device %s is VLAN challenged and failover device has VLAN set up\n",
+			   failover_dev->name);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int net_failover_slave_register(struct net_device *slave_dev,
+				       struct net_device *failover_dev)
+{
+	struct net_device *standby_dev, *primary_dev;
+	struct net_failover_info *nfo_info;
+	bool slave_is_standby;
+	u32 orig_mtu;
+	int err;
+
+	/* Align MTU of slave with failover dev */
+	orig_mtu = slave_dev->mtu;
+	err = dev_set_mtu(slave_dev, failover_dev->mtu);
+	if (err) {
+		netdev_err(failover_dev, "unable to change mtu of %s to %u register failed\n",
+			   slave_dev->name, failover_dev->mtu);
+		goto done;
+	}
+
+	dev_hold(slave_dev);
+
+	if (netif_running(failover_dev)) {
+		err = dev_open(slave_dev);
+		if (err && (err != -EBUSY)) {
+			netdev_err(failover_dev, "Opening slave %s failed err:%d\n",
+				   slave_dev->name, err);
+			goto err_dev_open;
+		}
+	}
+
+	netif_addr_lock_bh(failover_dev);
+	dev_uc_sync_multiple(slave_dev, failover_dev);
+	dev_uc_sync_multiple(slave_dev, failover_dev);
+	netif_addr_unlock_bh(failover_dev);
+
+	err = vlan_vids_add_by_dev(slave_dev, failover_dev);
+	if (err) {
+		netdev_err(failover_dev, "Failed to add vlan ids to device %s err:%d\n",
+			   slave_dev->name, err);
+		goto err_vlan_add;
+	}
+
+	nfo_info = netdev_priv(failover_dev);
+	standby_dev = rtnl_dereference(nfo_info->standby_dev);
+	primary_dev = rtnl_dereference(nfo_info->primary_dev);
+	slave_is_standby = slave_dev->dev.parent == failover_dev->dev.parent;
+
+	if (slave_is_standby) {
+		rcu_assign_pointer(nfo_info->standby_dev, slave_dev);
+		standby_dev = slave_dev;
+		dev_get_stats(standby_dev, &nfo_info->standby_stats);
+	} else {
+		rcu_assign_pointer(nfo_info->primary_dev, slave_dev);
+		primary_dev = slave_dev;
+		dev_get_stats(primary_dev, &nfo_info->primary_stats);
+		failover_dev->min_mtu = slave_dev->min_mtu;
+		failover_dev->max_mtu = slave_dev->max_mtu;
+	}
+
+	net_failover_lower_state_changed(slave_dev, primary_dev, standby_dev);
+	net_failover_compute_features(failover_dev);
+
+	call_netdevice_notifiers(NETDEV_JOIN, slave_dev);
+
+	netdev_info(failover_dev, "failover %s slave:%s registered\n",
+		    slave_is_standby ? "standby" : "primary", slave_dev->name);
+
+	return 0;
+
+err_vlan_add:
+	dev_uc_unsync(slave_dev, failover_dev);
+	dev_mc_unsync(slave_dev, failover_dev);
+	dev_close(slave_dev);
+err_dev_open:
+	dev_put(slave_dev);
+	dev_set_mtu(slave_dev, orig_mtu);
+done:
+	return err;
+}
+
+static int net_failover_slave_pre_unregister(struct net_device *slave_dev,
+					     struct net_device *failover_dev)
+{
+	struct net_device *standby_dev, *primary_dev;
+	struct net_failover_info *nfo_info;
+
+	nfo_info = netdev_priv(failover_dev);
+	primary_dev = rtnl_dereference(nfo_info->primary_dev);
+	standby_dev = rtnl_dereference(nfo_info->standby_dev);
+
+	if (slave_dev != primary_dev && slave_dev != standby_dev)
+		return -ENODEV;
+
+	return 0;
+}
+
+static int net_failover_slave_unregister(struct net_device *slave_dev,
+					 struct net_device *failover_dev)
+{
+	struct net_device *standby_dev, *primary_dev;
+	struct net_failover_info *nfo_info;
+	bool slave_is_standby;
+
+	nfo_info = netdev_priv(failover_dev);
+	primary_dev = rtnl_dereference(nfo_info->primary_dev);
+	standby_dev = rtnl_dereference(nfo_info->standby_dev);
+
+	vlan_vids_del_by_dev(slave_dev, failover_dev);
+	dev_uc_unsync(slave_dev, failover_dev);
+	dev_mc_unsync(slave_dev, failover_dev);
+	dev_close(slave_dev);
+
+	nfo_info = netdev_priv(failover_dev);
+	dev_get_stats(failover_dev, &nfo_info->failover_stats);
+
+	slave_is_standby = slave_dev->dev.parent == failover_dev->dev.parent;
+	if (slave_is_standby) {
+		RCU_INIT_POINTER(nfo_info->standby_dev, NULL);
+	} else {
+		RCU_INIT_POINTER(nfo_info->primary_dev, NULL);
+		if (standby_dev) {
+			failover_dev->min_mtu = standby_dev->min_mtu;
+			failover_dev->max_mtu = standby_dev->max_mtu;
+		}
+	}
+
+	dev_put(slave_dev);
+
+	net_failover_compute_features(failover_dev);
+
+	netdev_info(failover_dev, "failover %s slave:%s unregistered\n",
+		    slave_is_standby ? "standby" : "primary", slave_dev->name);
+
+	return 0;
+}
+
+static int net_failover_slave_link_change(struct net_device *slave_dev,
+					  struct net_device *failover_dev)
+{
+	struct net_device *primary_dev, *standby_dev;
+	struct net_failover_info *nfo_info;
+
+	nfo_info = netdev_priv(failover_dev);
+
+	primary_dev = rtnl_dereference(nfo_info->primary_dev);
+	standby_dev = rtnl_dereference(nfo_info->standby_dev);
+
+	if (slave_dev != primary_dev && slave_dev != standby_dev)
+		return -ENODEV;
+
+	if ((primary_dev && net_failover_xmit_ready(primary_dev)) ||
+	    (standby_dev && net_failover_xmit_ready(standby_dev))) {
+		netif_carrier_on(failover_dev);
+		netif_tx_wake_all_queues(failover_dev);
+	} else {
+		dev_get_stats(failover_dev, &nfo_info->failover_stats);
+		netif_carrier_off(failover_dev);
+		netif_tx_stop_all_queues(failover_dev);
+	}
+
+	net_failover_lower_state_changed(slave_dev, primary_dev, standby_dev);
+
+	return 0;
+}
+
+static int net_failover_slave_name_change(struct net_device *slave_dev,
+					  struct net_device *failover_dev)
+{
+	struct net_device *primary_dev, *standby_dev;
+	struct net_failover_info *nfo_info;
+
+	nfo_info = netdev_priv(failover_dev);
+
+	primary_dev = rtnl_dereference(nfo_info->primary_dev);
+	standby_dev = rtnl_dereference(nfo_info->standby_dev);
+
+	if (slave_dev != primary_dev && slave_dev != standby_dev)
+		return -ENODEV;
+
+	/* We need to bring up the slave after the rename by udev in case
+	 * open failed with EBUSY when it was registered.
+	 */
+	dev_open(slave_dev);
+
+	return 0;
+}
+
+static struct failover_ops net_failover_ops = {
+	.slave_pre_register	= net_failover_slave_pre_register,
+	.slave_register		= net_failover_slave_register,
+	.slave_pre_unregister	= net_failover_slave_pre_unregister,
+	.slave_unregister	= net_failover_slave_unregister,
+	.slave_link_change	= net_failover_slave_link_change,
+	.slave_name_change	= net_failover_slave_name_change,
+	.slave_handle_frame	= net_failover_handle_frame,
+};
+
+/**
+ * net_failover_create - Create and register a failover instance
+ *
+ * @dev: standby netdev
+ *
+ * Creates a failover netdev and registers a failover instance for a standby
+ * netdev. Used by paravirtual drivers that use 3-netdev model.
+ * The failover netdev acts as a master device and controls 2 slave devices -
+ * the original standby netdev and a VF netdev with the same MAC gets
+ * registered as primary netdev.
+ *
+ * Return: pointer to failover instance
+ */
+struct failover *net_failover_create(struct net_device *standby_dev)
+{
+	struct device *dev = standby_dev->dev.parent;
+	struct net_device *failover_dev;
+	struct failover *failover;
+	int err;
+
+	/* Alloc at least 2 queues, for now we are going with 16 assuming
+	 * that VF devices being enslaved won't have too many queues.
+	 */
+	failover_dev = alloc_etherdev_mq(sizeof(struct net_failover_info), 16);
+	if (!failover_dev) {
+		dev_err(dev, "Unable to allocate failover_netdev!\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	dev_net_set(failover_dev, dev_net(standby_dev));
+	SET_NETDEV_DEV(failover_dev, dev);
+
+	failover_dev->netdev_ops = &failover_dev_ops;
+	failover_dev->ethtool_ops = &failover_ethtool_ops;
+
+	/* Initialize the device options */
+	failover_dev->priv_flags |= IFF_UNICAST_FLT | IFF_NO_QUEUE;
+	failover_dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE |
+				       IFF_TX_SKB_SHARING);
+
+	/* don't acquire failover netdev's netif_tx_lock when transmitting */
+	failover_dev->features |= NETIF_F_LLTX;
+
+	/* Don't allow failover devices to change network namespaces. */
+	failover_dev->features |= NETIF_F_NETNS_LOCAL;
+
+	failover_dev->hw_features = FAILOVER_VLAN_FEATURES |
+				    NETIF_F_HW_VLAN_CTAG_TX |
+				    NETIF_F_HW_VLAN_CTAG_RX |
+				    NETIF_F_HW_VLAN_CTAG_FILTER;
+
+	failover_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
+	failover_dev->features |= failover_dev->hw_features;
+
+	memcpy(failover_dev->dev_addr, standby_dev->dev_addr,
+	       failover_dev->addr_len);
+
+	failover_dev->min_mtu = standby_dev->min_mtu;
+	failover_dev->max_mtu = standby_dev->max_mtu;
+
+	err = register_netdev(failover_dev);
+	if (err) {
+		dev_err(dev, "Unable to register failover_dev!\n");
+		goto err_register_netdev;
+	}
+
+	netif_carrier_off(failover_dev);
+
+	failover = failover_register(failover_dev, &net_failover_ops);
+	if (IS_ERR(failover))
+		goto err_failover_register;
+
+	return failover;
+
+err_failover_register:
+	unregister_netdev(failover_dev);
+err_register_netdev:
+	free_netdev(failover_dev);
+
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(net_failover_create);
+
+/**
+ * net_failover_destroy - Destroy a failover instance
+ *
+ * @failover: pointer to failover instance
+ *
+ * Unregisters any slave netdevs associated with the failover instance by
+ * calling failover_slave_unregister().
+ * unregisters the failover instance itself and finally frees the failover
+ * netdev. Used by paravirtual drivers that use 3-netdev model.
+ *
+ */
+void net_failover_destroy(struct failover *failover)
+{
+	struct net_failover_info *nfo_info;
+	struct net_device *failover_dev;
+	struct net_device *slave_dev;
+
+	if (!failover)
+		return;
+
+	failover_dev = rcu_dereference(failover->failover_dev);
+	nfo_info = netdev_priv(failover_dev);
+
+	netif_device_detach(failover_dev);
+
+	rtnl_lock();
+
+	slave_dev = rtnl_dereference(nfo_info->primary_dev);
+	if (slave_dev)
+		failover_slave_unregister(slave_dev);
+
+	slave_dev = rtnl_dereference(nfo_info->standby_dev);
+	if (slave_dev)
+		failover_slave_unregister(slave_dev);
+
+	failover_unregister(failover);
+
+	unregister_netdevice(failover_dev);
+
+	rtnl_unlock();
+
+	free_netdev(failover_dev);
+}
+EXPORT_SYMBOL_GPL(net_failover_destroy);
+
+static __init int
+net_failover_init(void)
+{
+	return 0;
+}
+module_init(net_failover_init);
+
+static __exit
+void net_failover_exit(void)
+{
+}
+module_exit(net_failover_exit);
+
+MODULE_DESCRIPTION("Failover driver for Paravirtual drivers");
+MODULE_LICENSE("GPL v2");
diff --git a/include/net/net_failover.h b/include/net/net_failover.h
new file mode 100644
index 000000000000..b12a1c469d1c
--- /dev/null
+++ b/include/net/net_failover.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018, Intel Corporation. */
+
+#ifndef _NET_FAILOVER_H
+#define _NET_FAILOVER_H
+
+#include <net/failover.h>
+
+/* failover state */
+struct net_failover_info {
+	/* primary netdev with same MAC */
+	struct net_device __rcu *primary_dev;
+
+	/* standby netdev */
+	struct net_device __rcu *standby_dev;
+
+	/* primary netdev stats */
+	struct rtnl_link_stats64 primary_stats;
+
+	/* standby netdev stats */
+	struct rtnl_link_stats64 standby_stats;
+
+	/* aggregated stats */
+	struct rtnl_link_stats64 failover_stats;
+
+	/* spinlock while updating stats */
+	spinlock_t stats_lock;
+};
+
+struct failover *net_failover_create(struct net_device *standby_dev);
+void net_failover_destroy(struct failover *failover);
+
+#define FAILOVER_VLAN_FEATURES	(NETIF_F_HW_CSUM | NETIF_F_SG | \
+				 NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \
+				 NETIF_F_HIGHDMA | NETIF_F_LRO)
+
+#define FAILOVER_ENC_FEATURES	(NETIF_F_HW_CSUM | NETIF_F_SG | \
+				 NETIF_F_RXCSUM | NETIF_F_ALL_TSO)
+
+#endif /* _NET_FAILOVER_H */
-- 
cgit v1.2.3-59-g8ed1b


From 9805069d14c1b0b66b1600ea60cfc08f94841bd8 Mon Sep 17 00:00:00 2001
From: Sridhar Samudrala <sridhar.samudrala@intel.com>
Date: Thu, 24 May 2018 09:55:16 -0700
Subject: virtio_net: Introduce VIRTIO_NET_F_STANDBY feature bit

This feature bit can be used by hypervisor to indicate virtio_net device to
act as a standby for another device with the same MAC address.

VIRTIO_NET_F_STANDBY is defined as bit 62 as it is a device feature bit.

Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c        | 2 +-
 include/uapi/linux/virtio_net.h | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index b2647dd5d302..fe4534ee50a1 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -3030,7 +3030,7 @@ static struct virtio_device_id id_table[] = {
 	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
 	VIRTIO_NET_F_CTRL_MAC_ADDR, \
 	VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \
-	VIRTIO_NET_F_SPEED_DUPLEX
+	VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY
 
 static unsigned int features[] = {
 	VIRTNET_FEATURES,
diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index 5de6ed37695b..a3715a3224c1 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -57,6 +57,9 @@
 					 * Steering */
 #define VIRTIO_NET_F_CTRL_MAC_ADDR 23	/* Set MAC address */
 
+#define VIRTIO_NET_F_STANDBY	  62	/* Act as standby for another device
+					 * with the same MAC.
+					 */
 #define VIRTIO_NET_F_SPEED_DUPLEX 63	/* Device set linkspeed and duplex */
 
 #ifndef VIRTIO_NET_NO_LEGACY
-- 
cgit v1.2.3-59-g8ed1b


From ba5e4426e80e0435358c7117c339e6a4c22c34ad Mon Sep 17 00:00:00 2001
From: Sridhar Samudrala <sridhar.samudrala@intel.com>
Date: Thu, 24 May 2018 09:55:17 -0700
Subject: virtio_net: Extend virtio to use VF datapath when available

This patch enables virtio_net to switch over to a VF datapath when STANDBY
feature is enabled and a VF netdev is present with the same MAC address.
It allows live migration of a VM with a direct attached VF without the need
to setup a bond/team between a VF and virtio net device in the guest.

It uses the API that is exported by the net_failover driver to create and
and destroy a master failover netdev. When STANDBY feature is enabled, an
additional netdev(failover netdev) is created that acts as a master device
and tracks the state of the 2 lower netdevs. The original virtio_net netdev
is marked as 'standby' netdev and a passthru device with the same MAC is
registered as 'primary' netdev.

The hypervisor needs to unplug the VF device from the guest on the source
host and reset the MAC filter of the VF to initiate failover of datapath
to virtio before starting the migration. After the migration is completed,
the destination hypervisor sets the MAC filter on the VF and plugs it back
to the guest to switch over to VF datapath.

This patch is based on the discussion initiated by Jesse on this thread.
https://marc.info/?l=linux-virtualization&m=151189725224231&w=2

Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/net_failover.rst | 90 +++++++++++++++++++++++++++++++
 drivers/net/Kconfig                       |  1 +
 drivers/net/virtio_net.c                  | 38 ++++++++++++-
 3 files changed, 128 insertions(+), 1 deletion(-)

diff --git a/Documentation/networking/net_failover.rst b/Documentation/networking/net_failover.rst
index d4513ad31809..70ca2f5800c4 100644
--- a/Documentation/networking/net_failover.rst
+++ b/Documentation/networking/net_failover.rst
@@ -24,3 +24,93 @@ This can be used by paravirtual drivers to enable an alternate low latency
 datapath. It also enables hypervisor controlled live migration of a VM with
 direct attached VF by failing over to the paravirtual datapath when the VF
 is unplugged.
+
+virtio-net accelerated datapath: STANDBY mode
+=============================================
+
+net_failover enables hypervisor controlled accelerated datapath to virtio-net
+enabled VMs in a transparent manner with no/minimal guest userspace chanages.
+
+To support this, the hypervisor needs to enable VIRTIO_NET_F_STANDBY
+feature on the virtio-net interface and assign the same MAC address to both
+virtio-net and VF interfaces.
+
+Here is an example XML snippet that shows such configuration.
+
+ <interface type='network'>
+   <mac address='52:54:00:00:12:53'/>
+   <source network='enp66s0f0_br'/>
+   <target dev='tap01'/>
+   <model type='virtio'/>
+   <driver name='vhost' queues='4'/>
+   <link state='down'/>
+   <address type='pci' domain='0x0000' bus='0x00' slot='0x0a' function='0x0'/>
+ </interface>
+ <interface type='hostdev' managed='yes'>
+   <mac address='52:54:00:00:12:53'/>
+   <source>
+     <address type='pci' domain='0x0000' bus='0x42' slot='0x02' function='0x5'/>
+   </source>
+   <address type='pci' domain='0x0000' bus='0x00' slot='0x0b' function='0x0'/>
+ </interface>
+
+Booting a VM with the above configuration will result in the following 3
+netdevs created in the VM.
+
+4: ens10: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
+    link/ether 52:54:00:00:12:53 brd ff:ff:ff:ff:ff:ff
+    inet 192.168.12.53/24 brd 192.168.12.255 scope global dynamic ens10
+       valid_lft 42482sec preferred_lft 42482sec
+    inet6 fe80::97d8:db2:8c10:b6d6/64 scope link
+       valid_lft forever preferred_lft forever
+5: ens10nsby: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel master ens10 state UP group default qlen 1000
+    link/ether 52:54:00:00:12:53 brd ff:ff:ff:ff:ff:ff
+7: ens11: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq master ens10 state UP group default qlen 1000
+    link/ether 52:54:00:00:12:53 brd ff:ff:ff:ff:ff:ff
+
+ens10 is the 'failover' master netdev, ens10nsby and ens11 are the slave
+'standby' and 'primary' netdevs respectively.
+
+Live Migration of a VM with SR-IOV VF & virtio-net in STANDBY mode
+==================================================================
+
+net_failover also enables hypervisor controlled live migration to be supported
+with VMs that have direct attached SR-IOV VF devices by automatic failover to
+the paravirtual datapath when the VF is unplugged.
+
+Here is a sample script that shows the steps to initiate live migration on
+the source hypervisor.
+
+# cat vf_xml
+<interface type='hostdev' managed='yes'>
+  <mac address='52:54:00:00:12:53'/>
+  <source>
+    <address type='pci' domain='0x0000' bus='0x42' slot='0x02' function='0x5'/>
+  </source>
+  <address type='pci' domain='0x0000' bus='0x00' slot='0x0b' function='0x0'/>
+</interface>
+
+# Source Hypervisor
+#!/bin/bash
+
+DOMAIN=fedora27-tap01
+PF=enp66s0f0
+VF_NUM=5
+TAP_IF=tap01
+VF_XML=
+
+MAC=52:54:00:00:12:53
+ZERO_MAC=00:00:00:00:00:00
+
+virsh domif-setlink $DOMAIN $TAP_IF up
+bridge fdb del $MAC dev $PF master
+virsh detach-device $DOMAIN $VF_XML
+ip link set $PF vf $VF_NUM mac $ZERO_MAC
+
+virsh migrate --live $DOMAIN qemu+ssh://$REMOTE_HOST/system
+
+# Destination Hypervisor
+#!/bin/bash
+
+virsh attach-device $DOMAIN $VF_XML
+virsh domif-setlink $DOMAIN $TAP_IF down
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 2cdaff90a9ec..d03775100f7d 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -332,6 +332,7 @@ config VETH
 config VIRTIO_NET
 	tristate "Virtio network driver"
 	depends on VIRTIO
+	select NET_FAILOVER
 	---help---
 	  This is the virtual network driver for virtio.  It can be used with
 	  QEMU based VMMs (like KVM or Xen).  Say Y or M.
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index fe4534ee50a1..8f08a3e1bbaa 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -30,8 +30,11 @@
 #include <linux/cpu.h>
 #include <linux/average.h>
 #include <linux/filter.h>
+#include <linux/netdevice.h>
+#include <linux/pci.h>
 #include <net/route.h>
 #include <net/xdp.h>
+#include <net/net_failover.h>
 
 static int napi_weight = NAPI_POLL_WEIGHT;
 module_param(napi_weight, int, 0444);
@@ -210,6 +213,9 @@ struct virtnet_info {
 	u32 speed;
 
 	unsigned long guest_offloads;
+
+	/* failover when STANDBY feature enabled */
+	struct failover *failover;
 };
 
 struct padded_vnet_hdr {
@@ -1554,6 +1560,9 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p)
 	struct sockaddr *addr;
 	struct scatterlist sg;
 
+	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
+		return -EOPNOTSUPP;
+
 	addr = kmemdup(p, sizeof(*addr), GFP_KERNEL);
 	if (!addr)
 		return -ENOMEM;
@@ -2337,6 +2346,22 @@ static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 	}
 }
 
+static int virtnet_get_phys_port_name(struct net_device *dev, char *buf,
+				      size_t len)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	int ret;
+
+	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
+		return -EOPNOTSUPP;
+
+	ret = snprintf(buf, len, "sby");
+	if (ret >= len)
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
 static const struct net_device_ops virtnet_netdev = {
 	.ndo_open            = virtnet_open,
 	.ndo_stop   	     = virtnet_close,
@@ -2354,6 +2379,7 @@ static const struct net_device_ops virtnet_netdev = {
 	.ndo_xdp_xmit		= virtnet_xdp_xmit,
 	.ndo_xdp_flush		= virtnet_xdp_flush,
 	.ndo_features_check	= passthru_features_check,
+	.ndo_get_phys_port_name	= virtnet_get_phys_port_name,
 };
 
 static void virtnet_config_changed_work(struct work_struct *work)
@@ -2907,10 +2933,16 @@ static int virtnet_probe(struct virtio_device *vdev)
 
 	virtnet_init_settings(dev);
 
+	if (virtio_has_feature(vdev, VIRTIO_NET_F_STANDBY)) {
+		vi->failover = net_failover_create(vi->dev);
+		if (IS_ERR(vi->failover))
+			goto free_vqs;
+	}
+
 	err = register_netdev(dev);
 	if (err) {
 		pr_debug("virtio_net: registering device failed\n");
-		goto free_vqs;
+		goto free_failover;
 	}
 
 	virtio_device_ready(vdev);
@@ -2947,6 +2979,8 @@ free_unregister_netdev:
 	vi->vdev->config->reset(vdev);
 
 	unregister_netdev(dev);
+free_failover:
+	net_failover_destroy(vi->failover);
 free_vqs:
 	cancel_delayed_work_sync(&vi->refill);
 	free_receive_page_frags(vi);
@@ -2981,6 +3015,8 @@ static void virtnet_remove(struct virtio_device *vdev)
 
 	unregister_netdev(vi->dev);
 
+	net_failover_destroy(vi->failover);
+
 	remove_vq_common(vi);
 
 	free_netdev(vi->dev);
-- 
cgit v1.2.3-59-g8ed1b


From 049ff57a2aa6ead1daf16b4d11da44ed17bdb0b5 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Thu, 24 May 2018 22:40:12 +0200
Subject: net: phy: realtek: add suspend/resume callbacks for RTL8211B

Add RTL8211B suspend / resume callbacks.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/realtek.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c
index 9f48ecf9c627..082fb40c656d 100644
--- a/drivers/net/phy/realtek.c
+++ b/drivers/net/phy/realtek.c
@@ -145,6 +145,20 @@ static int rtl8211f_config_init(struct phy_device *phydev)
 	return phy_modify_paged(phydev, 0xd08, 0x11, RTL8211F_TX_DELAY, val);
 }
 
+static int rtl8211b_suspend(struct phy_device *phydev)
+{
+	phy_write(phydev, MII_MMD_DATA, BIT(9));
+
+	return genphy_suspend(phydev);
+}
+
+static int rtl8211b_resume(struct phy_device *phydev)
+{
+	phy_write(phydev, MII_MMD_DATA, 0);
+
+	return genphy_resume(phydev);
+}
+
 static struct phy_driver realtek_drvs[] = {
 	{
 		.phy_id         = 0x00008201,
@@ -174,6 +188,8 @@ static struct phy_driver realtek_drvs[] = {
 		.config_intr	= &rtl8211b_config_intr,
 		.read_mmd	= &genphy_read_mmd_unsupported,
 		.write_mmd	= &genphy_write_mmd_unsupported,
+		.suspend	= rtl8211b_suspend,
+		.resume		= rtl8211b_resume,
 	}, {
 		.phy_id		= 0x001cc914,
 		.name		= "RTL8211DN Gigabit Ethernet",
-- 
cgit v1.2.3-59-g8ed1b


From b1d2e4e03f92734ff524f96c4b2287133de7a4a3 Mon Sep 17 00:00:00 2001
From: Jon Maxwell <jmaxwell37@gmail.com>
Date: Fri, 25 May 2018 07:38:29 +1000
Subject: ifb: fix packets checksum

Fixup the checksum for CHECKSUM_COMPLETE when pulling skbs on RX path.
Otherwise we get splats when tc mirred is used to redirect packets to ifb.

Before fix:

nic: hw csum failure

Signed-off-by: Jon Maxwell <jmaxwell37@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ifb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 5f2897ec0edc..d345c61d476c 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -102,7 +102,7 @@ static void ifb_ri_tasklet(unsigned long _txp)
 		if (!skb->tc_from_ingress) {
 			dev_queue_xmit(skb);
 		} else {
-			skb_pull(skb, skb->mac_len);
+			skb_pull_rcsum(skb, skb->mac_len);
 			netif_receive_skb(skb);
 		}
 	}
-- 
cgit v1.2.3-59-g8ed1b


From ceefc71d4c055dab2bba2d2bfa6e7c2855154a24 Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Fri, 25 May 2018 12:40:34 +0800
Subject: ptp: rework gianfar_ptp as QorIQ common PTP driver

gianfar_ptp was the PTP clock driver for 1588 timer
module of Freescale QorIQ eTSEC (Enhanced Three-Speed
Ethernet Controllers) platforms. Actually QorIQ DPAA
(Data Path Acceleration Architecture) platforms is
also using the same 1588 timer module in hardware.

This patch is to rework gianfar_ptp as QorIQ common
PTP driver to support both DPAA and eTSEC. Moved
gianfar_ptp.c to drivers/ptp/, renamed it as
ptp_qoriq.c, and renamed many variables. There were
not any function changes.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/freescale/Makefile      |   1 -
 drivers/net/ethernet/freescale/gianfar_ptp.c | 572 --------------------------
 drivers/ptp/Kconfig                          |  14 +-
 drivers/ptp/Makefile                         |   1 +
 drivers/ptp/ptp_qoriq.c                      | 584 +++++++++++++++++++++++++++
 5 files changed, 592 insertions(+), 580 deletions(-)
 delete mode 100644 drivers/net/ethernet/freescale/gianfar_ptp.c
 create mode 100644 drivers/ptp/ptp_qoriq.c

diff --git a/drivers/net/ethernet/freescale/Makefile b/drivers/net/ethernet/freescale/Makefile
index ed8ad0fefbda..0914a3ea4405 100644
--- a/drivers/net/ethernet/freescale/Makefile
+++ b/drivers/net/ethernet/freescale/Makefile
@@ -14,7 +14,6 @@ obj-$(CONFIG_FS_ENET) += fs_enet/
 obj-$(CONFIG_FSL_PQ_MDIO) += fsl_pq_mdio.o
 obj-$(CONFIG_FSL_XGMAC_MDIO) += xgmac_mdio.o
 obj-$(CONFIG_GIANFAR) += gianfar_driver.o
-obj-$(CONFIG_PTP_1588_CLOCK_GIANFAR) += gianfar_ptp.o
 gianfar_driver-objs := gianfar.o \
 		gianfar_ethtool.o
 obj-$(CONFIG_UCC_GETH) += ucc_geth_driver.o
diff --git a/drivers/net/ethernet/freescale/gianfar_ptp.c b/drivers/net/ethernet/freescale/gianfar_ptp.c
deleted file mode 100644
index 9f8d4f8e57e3..000000000000
--- a/drivers/net/ethernet/freescale/gianfar_ptp.c
+++ /dev/null
@@ -1,572 +0,0 @@
-/*
- * PTP 1588 clock using the eTSEC
- *
- * Copyright (C) 2010 OMICRON electronics GmbH
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/device.h>
-#include <linux/hrtimer.h>
-#include <linux/interrupt.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/of.h>
-#include <linux/of_platform.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-
-#include <linux/ptp_clock_kernel.h>
-
-#include "gianfar.h"
-
-/*
- * gianfar ptp registers
- * Generated by regen.tcl on Thu May 13 01:38:57 PM CEST 2010
- */
-struct gianfar_ptp_registers {
-	u32 tmr_ctrl;     /* Timer control register */
-	u32 tmr_tevent;   /* Timestamp event register */
-	u32 tmr_temask;   /* Timer event mask register */
-	u32 tmr_pevent;   /* Timestamp event register */
-	u32 tmr_pemask;   /* Timer event mask register */
-	u32 tmr_stat;     /* Timestamp status register */
-	u32 tmr_cnt_h;    /* Timer counter high register */
-	u32 tmr_cnt_l;    /* Timer counter low register */
-	u32 tmr_add;      /* Timer drift compensation addend register */
-	u32 tmr_acc;      /* Timer accumulator register */
-	u32 tmr_prsc;     /* Timer prescale */
-	u8  res1[4];
-	u32 tmroff_h;     /* Timer offset high */
-	u32 tmroff_l;     /* Timer offset low */
-	u8  res2[8];
-	u32 tmr_alarm1_h; /* Timer alarm 1 high register */
-	u32 tmr_alarm1_l; /* Timer alarm 1 high register */
-	u32 tmr_alarm2_h; /* Timer alarm 2 high register */
-	u32 tmr_alarm2_l; /* Timer alarm 2 high register */
-	u8  res3[48];
-	u32 tmr_fiper1;   /* Timer fixed period interval */
-	u32 tmr_fiper2;   /* Timer fixed period interval */
-	u32 tmr_fiper3;   /* Timer fixed period interval */
-	u8  res4[20];
-	u32 tmr_etts1_h;  /* Timestamp of general purpose external trigger */
-	u32 tmr_etts1_l;  /* Timestamp of general purpose external trigger */
-	u32 tmr_etts2_h;  /* Timestamp of general purpose external trigger */
-	u32 tmr_etts2_l;  /* Timestamp of general purpose external trigger */
-};
-
-/* Bit definitions for the TMR_CTRL register */
-#define ALM1P                 (1<<31) /* Alarm1 output polarity */
-#define ALM2P                 (1<<30) /* Alarm2 output polarity */
-#define FIPERST               (1<<28) /* FIPER start indication */
-#define PP1L                  (1<<27) /* Fiper1 pulse loopback mode enabled. */
-#define PP2L                  (1<<26) /* Fiper2 pulse loopback mode enabled. */
-#define TCLK_PERIOD_SHIFT     (16) /* 1588 timer reference clock period. */
-#define TCLK_PERIOD_MASK      (0x3ff)
-#define RTPE                  (1<<15) /* Record Tx Timestamp to PAL Enable. */
-#define FRD                   (1<<14) /* FIPER Realignment Disable */
-#define ESFDP                 (1<<11) /* External Tx/Rx SFD Polarity. */
-#define ESFDE                 (1<<10) /* External Tx/Rx SFD Enable. */
-#define ETEP2                 (1<<9) /* External trigger 2 edge polarity */
-#define ETEP1                 (1<<8) /* External trigger 1 edge polarity */
-#define COPH                  (1<<7) /* Generated clock output phase. */
-#define CIPH                  (1<<6) /* External oscillator input clock phase */
-#define TMSR                  (1<<5) /* Timer soft reset. */
-#define BYP                   (1<<3) /* Bypass drift compensated clock */
-#define TE                    (1<<2) /* 1588 timer enable. */
-#define CKSEL_SHIFT           (0)    /* 1588 Timer reference clock source */
-#define CKSEL_MASK            (0x3)
-
-/* Bit definitions for the TMR_TEVENT register */
-#define ETS2                  (1<<25) /* External trigger 2 timestamp sampled */
-#define ETS1                  (1<<24) /* External trigger 1 timestamp sampled */
-#define ALM2                  (1<<17) /* Current time = alarm time register 2 */
-#define ALM1                  (1<<16) /* Current time = alarm time register 1 */
-#define PP1                   (1<<7)  /* periodic pulse generated on FIPER1 */
-#define PP2                   (1<<6)  /* periodic pulse generated on FIPER2 */
-#define PP3                   (1<<5)  /* periodic pulse generated on FIPER3 */
-
-/* Bit definitions for the TMR_TEMASK register */
-#define ETS2EN                (1<<25) /* External trigger 2 timestamp enable */
-#define ETS1EN                (1<<24) /* External trigger 1 timestamp enable */
-#define ALM2EN                (1<<17) /* Timer ALM2 event enable */
-#define ALM1EN                (1<<16) /* Timer ALM1 event enable */
-#define PP1EN                 (1<<7) /* Periodic pulse event 1 enable */
-#define PP2EN                 (1<<6) /* Periodic pulse event 2 enable */
-
-/* Bit definitions for the TMR_PEVENT register */
-#define TXP2                  (1<<9) /* PTP transmitted timestamp im TXTS2 */
-#define TXP1                  (1<<8) /* PTP transmitted timestamp in TXTS1 */
-#define RXP                   (1<<0) /* PTP frame has been received */
-
-/* Bit definitions for the TMR_PEMASK register */
-#define TXP2EN                (1<<9) /* Transmit PTP packet event 2 enable */
-#define TXP1EN                (1<<8) /* Transmit PTP packet event 1 enable */
-#define RXPEN                 (1<<0) /* Receive PTP packet event enable */
-
-/* Bit definitions for the TMR_STAT register */
-#define STAT_VEC_SHIFT        (0) /* Timer general purpose status vector */
-#define STAT_VEC_MASK         (0x3f)
-
-/* Bit definitions for the TMR_PRSC register */
-#define PRSC_OCK_SHIFT        (0) /* Output clock division/prescale factor. */
-#define PRSC_OCK_MASK         (0xffff)
-
-
-#define DRIVER		"gianfar_ptp"
-#define DEFAULT_CKSEL	1
-#define N_EXT_TS	2
-#define REG_SIZE	sizeof(struct gianfar_ptp_registers)
-
-struct etsects {
-	struct gianfar_ptp_registers __iomem *regs;
-	spinlock_t lock; /* protects regs */
-	struct ptp_clock *clock;
-	struct ptp_clock_info caps;
-	struct resource *rsrc;
-	int irq;
-	u64 alarm_interval; /* for periodic alarm */
-	u64 alarm_value;
-	u32 tclk_period;  /* nanoseconds */
-	u32 tmr_prsc;
-	u32 tmr_add;
-	u32 cksel;
-	u32 tmr_fiper1;
-	u32 tmr_fiper2;
-};
-
-/*
- * Register access functions
- */
-
-/* Caller must hold etsects->lock. */
-static u64 tmr_cnt_read(struct etsects *etsects)
-{
-	u64 ns;
-	u32 lo, hi;
-
-	lo = gfar_read(&etsects->regs->tmr_cnt_l);
-	hi = gfar_read(&etsects->regs->tmr_cnt_h);
-	ns = ((u64) hi) << 32;
-	ns |= lo;
-	return ns;
-}
-
-/* Caller must hold etsects->lock. */
-static void tmr_cnt_write(struct etsects *etsects, u64 ns)
-{
-	u32 hi = ns >> 32;
-	u32 lo = ns & 0xffffffff;
-
-	gfar_write(&etsects->regs->tmr_cnt_l, lo);
-	gfar_write(&etsects->regs->tmr_cnt_h, hi);
-}
-
-/* Caller must hold etsects->lock. */
-static void set_alarm(struct etsects *etsects)
-{
-	u64 ns;
-	u32 lo, hi;
-
-	ns = tmr_cnt_read(etsects) + 1500000000ULL;
-	ns = div_u64(ns, 1000000000UL) * 1000000000ULL;
-	ns -= etsects->tclk_period;
-	hi = ns >> 32;
-	lo = ns & 0xffffffff;
-	gfar_write(&etsects->regs->tmr_alarm1_l, lo);
-	gfar_write(&etsects->regs->tmr_alarm1_h, hi);
-}
-
-/* Caller must hold etsects->lock. */
-static void set_fipers(struct etsects *etsects)
-{
-	set_alarm(etsects);
-	gfar_write(&etsects->regs->tmr_fiper1, etsects->tmr_fiper1);
-	gfar_write(&etsects->regs->tmr_fiper2, etsects->tmr_fiper2);
-}
-
-/*
- * Interrupt service routine
- */
-
-static irqreturn_t isr(int irq, void *priv)
-{
-	struct etsects *etsects = priv;
-	struct ptp_clock_event event;
-	u64 ns;
-	u32 ack = 0, lo, hi, mask, val;
-
-	val = gfar_read(&etsects->regs->tmr_tevent);
-
-	if (val & ETS1) {
-		ack |= ETS1;
-		hi = gfar_read(&etsects->regs->tmr_etts1_h);
-		lo = gfar_read(&etsects->regs->tmr_etts1_l);
-		event.type = PTP_CLOCK_EXTTS;
-		event.index = 0;
-		event.timestamp = ((u64) hi) << 32;
-		event.timestamp |= lo;
-		ptp_clock_event(etsects->clock, &event);
-	}
-
-	if (val & ETS2) {
-		ack |= ETS2;
-		hi = gfar_read(&etsects->regs->tmr_etts2_h);
-		lo = gfar_read(&etsects->regs->tmr_etts2_l);
-		event.type = PTP_CLOCK_EXTTS;
-		event.index = 1;
-		event.timestamp = ((u64) hi) << 32;
-		event.timestamp |= lo;
-		ptp_clock_event(etsects->clock, &event);
-	}
-
-	if (val & ALM2) {
-		ack |= ALM2;
-		if (etsects->alarm_value) {
-			event.type = PTP_CLOCK_ALARM;
-			event.index = 0;
-			event.timestamp = etsects->alarm_value;
-			ptp_clock_event(etsects->clock, &event);
-		}
-		if (etsects->alarm_interval) {
-			ns = etsects->alarm_value + etsects->alarm_interval;
-			hi = ns >> 32;
-			lo = ns & 0xffffffff;
-			spin_lock(&etsects->lock);
-			gfar_write(&etsects->regs->tmr_alarm2_l, lo);
-			gfar_write(&etsects->regs->tmr_alarm2_h, hi);
-			spin_unlock(&etsects->lock);
-			etsects->alarm_value = ns;
-		} else {
-			gfar_write(&etsects->regs->tmr_tevent, ALM2);
-			spin_lock(&etsects->lock);
-			mask = gfar_read(&etsects->regs->tmr_temask);
-			mask &= ~ALM2EN;
-			gfar_write(&etsects->regs->tmr_temask, mask);
-			spin_unlock(&etsects->lock);
-			etsects->alarm_value = 0;
-			etsects->alarm_interval = 0;
-		}
-	}
-
-	if (val & PP1) {
-		ack |= PP1;
-		event.type = PTP_CLOCK_PPS;
-		ptp_clock_event(etsects->clock, &event);
-	}
-
-	if (ack) {
-		gfar_write(&etsects->regs->tmr_tevent, ack);
-		return IRQ_HANDLED;
-	} else
-		return IRQ_NONE;
-}
-
-/*
- * PTP clock operations
- */
-
-static int ptp_gianfar_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
-{
-	u64 adj, diff;
-	u32 tmr_add;
-	int neg_adj = 0;
-	struct etsects *etsects = container_of(ptp, struct etsects, caps);
-
-	if (scaled_ppm < 0) {
-		neg_adj = 1;
-		scaled_ppm = -scaled_ppm;
-	}
-	tmr_add = etsects->tmr_add;
-	adj = tmr_add;
-
-	/* calculate diff as adj*(scaled_ppm/65536)/1000000
-	 * and round() to the nearest integer
-	 */
-	adj *= scaled_ppm;
-	diff = div_u64(adj, 8000000);
-	diff = (diff >> 13) + ((diff >> 12) & 1);
-
-	tmr_add = neg_adj ? tmr_add - diff : tmr_add + diff;
-
-	gfar_write(&etsects->regs->tmr_add, tmr_add);
-
-	return 0;
-}
-
-static int ptp_gianfar_adjtime(struct ptp_clock_info *ptp, s64 delta)
-{
-	s64 now;
-	unsigned long flags;
-	struct etsects *etsects = container_of(ptp, struct etsects, caps);
-
-	spin_lock_irqsave(&etsects->lock, flags);
-
-	now = tmr_cnt_read(etsects);
-	now += delta;
-	tmr_cnt_write(etsects, now);
-	set_fipers(etsects);
-
-	spin_unlock_irqrestore(&etsects->lock, flags);
-
-	return 0;
-}
-
-static int ptp_gianfar_gettime(struct ptp_clock_info *ptp,
-			       struct timespec64 *ts)
-{
-	u64 ns;
-	unsigned long flags;
-	struct etsects *etsects = container_of(ptp, struct etsects, caps);
-
-	spin_lock_irqsave(&etsects->lock, flags);
-
-	ns = tmr_cnt_read(etsects);
-
-	spin_unlock_irqrestore(&etsects->lock, flags);
-
-	*ts = ns_to_timespec64(ns);
-
-	return 0;
-}
-
-static int ptp_gianfar_settime(struct ptp_clock_info *ptp,
-			       const struct timespec64 *ts)
-{
-	u64 ns;
-	unsigned long flags;
-	struct etsects *etsects = container_of(ptp, struct etsects, caps);
-
-	ns = timespec64_to_ns(ts);
-
-	spin_lock_irqsave(&etsects->lock, flags);
-
-	tmr_cnt_write(etsects, ns);
-	set_fipers(etsects);
-
-	spin_unlock_irqrestore(&etsects->lock, flags);
-
-	return 0;
-}
-
-static int ptp_gianfar_enable(struct ptp_clock_info *ptp,
-			      struct ptp_clock_request *rq, int on)
-{
-	struct etsects *etsects = container_of(ptp, struct etsects, caps);
-	unsigned long flags;
-	u32 bit, mask;
-
-	switch (rq->type) {
-	case PTP_CLK_REQ_EXTTS:
-		switch (rq->extts.index) {
-		case 0:
-			bit = ETS1EN;
-			break;
-		case 1:
-			bit = ETS2EN;
-			break;
-		default:
-			return -EINVAL;
-		}
-		spin_lock_irqsave(&etsects->lock, flags);
-		mask = gfar_read(&etsects->regs->tmr_temask);
-		if (on)
-			mask |= bit;
-		else
-			mask &= ~bit;
-		gfar_write(&etsects->regs->tmr_temask, mask);
-		spin_unlock_irqrestore(&etsects->lock, flags);
-		return 0;
-
-	case PTP_CLK_REQ_PPS:
-		spin_lock_irqsave(&etsects->lock, flags);
-		mask = gfar_read(&etsects->regs->tmr_temask);
-		if (on)
-			mask |= PP1EN;
-		else
-			mask &= ~PP1EN;
-		gfar_write(&etsects->regs->tmr_temask, mask);
-		spin_unlock_irqrestore(&etsects->lock, flags);
-		return 0;
-
-	default:
-		break;
-	}
-
-	return -EOPNOTSUPP;
-}
-
-static const struct ptp_clock_info ptp_gianfar_caps = {
-	.owner		= THIS_MODULE,
-	.name		= "gianfar clock",
-	.max_adj	= 512000,
-	.n_alarm	= 0,
-	.n_ext_ts	= N_EXT_TS,
-	.n_per_out	= 0,
-	.n_pins		= 0,
-	.pps		= 1,
-	.adjfine	= ptp_gianfar_adjfine,
-	.adjtime	= ptp_gianfar_adjtime,
-	.gettime64	= ptp_gianfar_gettime,
-	.settime64	= ptp_gianfar_settime,
-	.enable		= ptp_gianfar_enable,
-};
-
-static int gianfar_ptp_probe(struct platform_device *dev)
-{
-	struct device_node *node = dev->dev.of_node;
-	struct etsects *etsects;
-	struct timespec64 now;
-	int err = -ENOMEM;
-	u32 tmr_ctrl;
-	unsigned long flags;
-
-	etsects = kzalloc(sizeof(*etsects), GFP_KERNEL);
-	if (!etsects)
-		goto no_memory;
-
-	err = -ENODEV;
-
-	etsects->caps = ptp_gianfar_caps;
-
-	if (of_property_read_u32(node, "fsl,cksel", &etsects->cksel))
-		etsects->cksel = DEFAULT_CKSEL;
-
-	if (of_property_read_u32(node,
-				 "fsl,tclk-period", &etsects->tclk_period) ||
-	    of_property_read_u32(node,
-				 "fsl,tmr-prsc", &etsects->tmr_prsc) ||
-	    of_property_read_u32(node,
-				 "fsl,tmr-add", &etsects->tmr_add) ||
-	    of_property_read_u32(node,
-				 "fsl,tmr-fiper1", &etsects->tmr_fiper1) ||
-	    of_property_read_u32(node,
-				 "fsl,tmr-fiper2", &etsects->tmr_fiper2) ||
-	    of_property_read_u32(node,
-				 "fsl,max-adj", &etsects->caps.max_adj)) {
-		pr_err("device tree node missing required elements\n");
-		goto no_node;
-	}
-
-	etsects->irq = platform_get_irq(dev, 0);
-
-	if (etsects->irq < 0) {
-		pr_err("irq not in device tree\n");
-		goto no_node;
-	}
-	if (request_irq(etsects->irq, isr, 0, DRIVER, etsects)) {
-		pr_err("request_irq failed\n");
-		goto no_node;
-	}
-
-	etsects->rsrc = platform_get_resource(dev, IORESOURCE_MEM, 0);
-	if (!etsects->rsrc) {
-		pr_err("no resource\n");
-		goto no_resource;
-	}
-	if (request_resource(&iomem_resource, etsects->rsrc)) {
-		pr_err("resource busy\n");
-		goto no_resource;
-	}
-
-	spin_lock_init(&etsects->lock);
-
-	etsects->regs = ioremap(etsects->rsrc->start,
-				resource_size(etsects->rsrc));
-	if (!etsects->regs) {
-		pr_err("ioremap ptp registers failed\n");
-		goto no_ioremap;
-	}
-	getnstimeofday64(&now);
-	ptp_gianfar_settime(&etsects->caps, &now);
-
-	tmr_ctrl =
-	  (etsects->tclk_period & TCLK_PERIOD_MASK) << TCLK_PERIOD_SHIFT |
-	  (etsects->cksel & CKSEL_MASK) << CKSEL_SHIFT;
-
-	spin_lock_irqsave(&etsects->lock, flags);
-
-	gfar_write(&etsects->regs->tmr_ctrl,   tmr_ctrl);
-	gfar_write(&etsects->regs->tmr_add,    etsects->tmr_add);
-	gfar_write(&etsects->regs->tmr_prsc,   etsects->tmr_prsc);
-	gfar_write(&etsects->regs->tmr_fiper1, etsects->tmr_fiper1);
-	gfar_write(&etsects->regs->tmr_fiper2, etsects->tmr_fiper2);
-	set_alarm(etsects);
-	gfar_write(&etsects->regs->tmr_ctrl,   tmr_ctrl|FIPERST|RTPE|TE|FRD);
-
-	spin_unlock_irqrestore(&etsects->lock, flags);
-
-	etsects->clock = ptp_clock_register(&etsects->caps, &dev->dev);
-	if (IS_ERR(etsects->clock)) {
-		err = PTR_ERR(etsects->clock);
-		goto no_clock;
-	}
-	gfar_phc_index = ptp_clock_index(etsects->clock);
-
-	platform_set_drvdata(dev, etsects);
-
-	return 0;
-
-no_clock:
-	iounmap(etsects->regs);
-no_ioremap:
-	release_resource(etsects->rsrc);
-no_resource:
-	free_irq(etsects->irq, etsects);
-no_node:
-	kfree(etsects);
-no_memory:
-	return err;
-}
-
-static int gianfar_ptp_remove(struct platform_device *dev)
-{
-	struct etsects *etsects = platform_get_drvdata(dev);
-
-	gfar_write(&etsects->regs->tmr_temask, 0);
-	gfar_write(&etsects->regs->tmr_ctrl,   0);
-
-	gfar_phc_index = -1;
-	ptp_clock_unregister(etsects->clock);
-	iounmap(etsects->regs);
-	release_resource(etsects->rsrc);
-	free_irq(etsects->irq, etsects);
-	kfree(etsects);
-
-	return 0;
-}
-
-static const struct of_device_id match_table[] = {
-	{ .compatible = "fsl,etsec-ptp" },
-	{},
-};
-MODULE_DEVICE_TABLE(of, match_table);
-
-static struct platform_driver gianfar_ptp_driver = {
-	.driver = {
-		.name		= "gianfar_ptp",
-		.of_match_table	= match_table,
-	},
-	.probe       = gianfar_ptp_probe,
-	.remove      = gianfar_ptp_remove,
-};
-
-module_platform_driver(gianfar_ptp_driver);
-
-MODULE_AUTHOR("Richard Cochran <richardcochran@gmail.com>");
-MODULE_DESCRIPTION("PTP clock using the eTSEC");
-MODULE_LICENSE("GPL");
diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig
index a21ad10d613c..474c988d2e95 100644
--- a/drivers/ptp/Kconfig
+++ b/drivers/ptp/Kconfig
@@ -41,19 +41,19 @@ config PTP_1588_CLOCK_DTE
 	  To compile this driver as a module, choose M here: the module
 	  will be called ptp_dte.
 
-config PTP_1588_CLOCK_GIANFAR
-	tristate "Freescale eTSEC as PTP clock"
+config PTP_1588_CLOCK_QORIQ
+	tristate "Freescale QorIQ 1588 timer as PTP clock"
 	depends on GIANFAR
 	depends on PTP_1588_CLOCK
 	default y
 	help
-	  This driver adds support for using the eTSEC as a PTP
-	  clock. This clock is only useful if your PTP programs are
-	  getting hardware time stamps on the PTP Ethernet packets
-	  using the SO_TIMESTAMPING API.
+	  This driver adds support for using the Freescale QorIQ 1588
+	  timer as a PTP clock. This clock is only useful if your PTP
+	  programs are getting hardware time stamps on the PTP Ethernet
+	  packets using the SO_TIMESTAMPING API.
 
 	  To compile this driver as a module, choose M here: the module
-	  will be called gianfar_ptp.
+	  will be called ptp_qoriq.
 
 config PTP_1588_CLOCK_IXP46X
 	tristate "Intel IXP46x as PTP clock"
diff --git a/drivers/ptp/Makefile b/drivers/ptp/Makefile
index fd28207f5379..19efa9cfa950 100644
--- a/drivers/ptp/Makefile
+++ b/drivers/ptp/Makefile
@@ -9,3 +9,4 @@ obj-$(CONFIG_PTP_1588_CLOCK_DTE)	+= ptp_dte.o
 obj-$(CONFIG_PTP_1588_CLOCK_IXP46X)	+= ptp_ixp46x.o
 obj-$(CONFIG_PTP_1588_CLOCK_PCH)	+= ptp_pch.o
 obj-$(CONFIG_PTP_1588_CLOCK_KVM)	+= ptp_kvm.o
+obj-$(CONFIG_PTP_1588_CLOCK_QORIQ)	+= ptp_qoriq.o
diff --git a/drivers/ptp/ptp_qoriq.c b/drivers/ptp/ptp_qoriq.c
new file mode 100644
index 000000000000..5110cce78fb5
--- /dev/null
+++ b/drivers/ptp/ptp_qoriq.c
@@ -0,0 +1,584 @@
+/*
+ * PTP 1588 clock for Freescale QorIQ 1588 timer
+ *
+ * Copyright (C) 2010 OMICRON electronics GmbH
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/device.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/timex.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+
+#include <linux/ptp_clock_kernel.h>
+
+/*
+ * qoriq ptp registers
+ * Generated by regen.tcl on Thu May 13 01:38:57 PM CEST 2010
+ */
+struct qoriq_ptp_registers {
+	u32 tmr_ctrl;     /* Timer control register */
+	u32 tmr_tevent;   /* Timestamp event register */
+	u32 tmr_temask;   /* Timer event mask register */
+	u32 tmr_pevent;   /* Timestamp event register */
+	u32 tmr_pemask;   /* Timer event mask register */
+	u32 tmr_stat;     /* Timestamp status register */
+	u32 tmr_cnt_h;    /* Timer counter high register */
+	u32 tmr_cnt_l;    /* Timer counter low register */
+	u32 tmr_add;      /* Timer drift compensation addend register */
+	u32 tmr_acc;      /* Timer accumulator register */
+	u32 tmr_prsc;     /* Timer prescale */
+	u8  res1[4];
+	u32 tmroff_h;     /* Timer offset high */
+	u32 tmroff_l;     /* Timer offset low */
+	u8  res2[8];
+	u32 tmr_alarm1_h; /* Timer alarm 1 high register */
+	u32 tmr_alarm1_l; /* Timer alarm 1 high register */
+	u32 tmr_alarm2_h; /* Timer alarm 2 high register */
+	u32 tmr_alarm2_l; /* Timer alarm 2 high register */
+	u8  res3[48];
+	u32 tmr_fiper1;   /* Timer fixed period interval */
+	u32 tmr_fiper2;   /* Timer fixed period interval */
+	u32 tmr_fiper3;   /* Timer fixed period interval */
+	u8  res4[20];
+	u32 tmr_etts1_h;  /* Timestamp of general purpose external trigger */
+	u32 tmr_etts1_l;  /* Timestamp of general purpose external trigger */
+	u32 tmr_etts2_h;  /* Timestamp of general purpose external trigger */
+	u32 tmr_etts2_l;  /* Timestamp of general purpose external trigger */
+};
+
+/* Bit definitions for the TMR_CTRL register */
+#define ALM1P                 (1<<31) /* Alarm1 output polarity */
+#define ALM2P                 (1<<30) /* Alarm2 output polarity */
+#define FIPERST               (1<<28) /* FIPER start indication */
+#define PP1L                  (1<<27) /* Fiper1 pulse loopback mode enabled. */
+#define PP2L                  (1<<26) /* Fiper2 pulse loopback mode enabled. */
+#define TCLK_PERIOD_SHIFT     (16) /* 1588 timer reference clock period. */
+#define TCLK_PERIOD_MASK      (0x3ff)
+#define RTPE                  (1<<15) /* Record Tx Timestamp to PAL Enable. */
+#define FRD                   (1<<14) /* FIPER Realignment Disable */
+#define ESFDP                 (1<<11) /* External Tx/Rx SFD Polarity. */
+#define ESFDE                 (1<<10) /* External Tx/Rx SFD Enable. */
+#define ETEP2                 (1<<9) /* External trigger 2 edge polarity */
+#define ETEP1                 (1<<8) /* External trigger 1 edge polarity */
+#define COPH                  (1<<7) /* Generated clock output phase. */
+#define CIPH                  (1<<6) /* External oscillator input clock phase */
+#define TMSR                  (1<<5) /* Timer soft reset. */
+#define BYP                   (1<<3) /* Bypass drift compensated clock */
+#define TE                    (1<<2) /* 1588 timer enable. */
+#define CKSEL_SHIFT           (0)    /* 1588 Timer reference clock source */
+#define CKSEL_MASK            (0x3)
+
+/* Bit definitions for the TMR_TEVENT register */
+#define ETS2                  (1<<25) /* External trigger 2 timestamp sampled */
+#define ETS1                  (1<<24) /* External trigger 1 timestamp sampled */
+#define ALM2                  (1<<17) /* Current time = alarm time register 2 */
+#define ALM1                  (1<<16) /* Current time = alarm time register 1 */
+#define PP1                   (1<<7)  /* periodic pulse generated on FIPER1 */
+#define PP2                   (1<<6)  /* periodic pulse generated on FIPER2 */
+#define PP3                   (1<<5)  /* periodic pulse generated on FIPER3 */
+
+/* Bit definitions for the TMR_TEMASK register */
+#define ETS2EN                (1<<25) /* External trigger 2 timestamp enable */
+#define ETS1EN                (1<<24) /* External trigger 1 timestamp enable */
+#define ALM2EN                (1<<17) /* Timer ALM2 event enable */
+#define ALM1EN                (1<<16) /* Timer ALM1 event enable */
+#define PP1EN                 (1<<7) /* Periodic pulse event 1 enable */
+#define PP2EN                 (1<<6) /* Periodic pulse event 2 enable */
+
+/* Bit definitions for the TMR_PEVENT register */
+#define TXP2                  (1<<9) /* PTP transmitted timestamp im TXTS2 */
+#define TXP1                  (1<<8) /* PTP transmitted timestamp in TXTS1 */
+#define RXP                   (1<<0) /* PTP frame has been received */
+
+/* Bit definitions for the TMR_PEMASK register */
+#define TXP2EN                (1<<9) /* Transmit PTP packet event 2 enable */
+#define TXP1EN                (1<<8) /* Transmit PTP packet event 1 enable */
+#define RXPEN                 (1<<0) /* Receive PTP packet event enable */
+
+/* Bit definitions for the TMR_STAT register */
+#define STAT_VEC_SHIFT        (0) /* Timer general purpose status vector */
+#define STAT_VEC_MASK         (0x3f)
+
+/* Bit definitions for the TMR_PRSC register */
+#define PRSC_OCK_SHIFT        (0) /* Output clock division/prescale factor. */
+#define PRSC_OCK_MASK         (0xffff)
+
+
+#define DRIVER		"ptp_qoriq"
+#define DEFAULT_CKSEL	1
+#define N_EXT_TS	2
+#define REG_SIZE	sizeof(struct qoriq_ptp_registers)
+
+struct qoriq_ptp {
+	struct qoriq_ptp_registers __iomem *regs;
+	spinlock_t lock; /* protects regs */
+	struct ptp_clock *clock;
+	struct ptp_clock_info caps;
+	struct resource *rsrc;
+	int irq;
+	int phc_index;
+	u64 alarm_interval; /* for periodic alarm */
+	u64 alarm_value;
+	u32 tclk_period;  /* nanoseconds */
+	u32 tmr_prsc;
+	u32 tmr_add;
+	u32 cksel;
+	u32 tmr_fiper1;
+	u32 tmr_fiper2;
+};
+
+static inline u32 qoriq_read(unsigned __iomem *addr)
+{
+	u32 val;
+
+	val = ioread32be(addr);
+	return val;
+}
+
+static inline void qoriq_write(unsigned __iomem *addr, u32 val)
+{
+	iowrite32be(val, addr);
+}
+
+/*
+ * Register access functions
+ */
+
+/* Caller must hold qoriq_ptp->lock. */
+static u64 tmr_cnt_read(struct qoriq_ptp *qoriq_ptp)
+{
+	u64 ns;
+	u32 lo, hi;
+
+	lo = qoriq_read(&qoriq_ptp->regs->tmr_cnt_l);
+	hi = qoriq_read(&qoriq_ptp->regs->tmr_cnt_h);
+	ns = ((u64) hi) << 32;
+	ns |= lo;
+	return ns;
+}
+
+/* Caller must hold qoriq_ptp->lock. */
+static void tmr_cnt_write(struct qoriq_ptp *qoriq_ptp, u64 ns)
+{
+	u32 hi = ns >> 32;
+	u32 lo = ns & 0xffffffff;
+
+	qoriq_write(&qoriq_ptp->regs->tmr_cnt_l, lo);
+	qoriq_write(&qoriq_ptp->regs->tmr_cnt_h, hi);
+}
+
+/* Caller must hold qoriq_ptp->lock. */
+static void set_alarm(struct qoriq_ptp *qoriq_ptp)
+{
+	u64 ns;
+	u32 lo, hi;
+
+	ns = tmr_cnt_read(qoriq_ptp) + 1500000000ULL;
+	ns = div_u64(ns, 1000000000UL) * 1000000000ULL;
+	ns -= qoriq_ptp->tclk_period;
+	hi = ns >> 32;
+	lo = ns & 0xffffffff;
+	qoriq_write(&qoriq_ptp->regs->tmr_alarm1_l, lo);
+	qoriq_write(&qoriq_ptp->regs->tmr_alarm1_h, hi);
+}
+
+/* Caller must hold qoriq_ptp->lock. */
+static void set_fipers(struct qoriq_ptp *qoriq_ptp)
+{
+	set_alarm(qoriq_ptp);
+	qoriq_write(&qoriq_ptp->regs->tmr_fiper1, qoriq_ptp->tmr_fiper1);
+	qoriq_write(&qoriq_ptp->regs->tmr_fiper2, qoriq_ptp->tmr_fiper2);
+}
+
+/*
+ * Interrupt service routine
+ */
+
+static irqreturn_t isr(int irq, void *priv)
+{
+	struct qoriq_ptp *qoriq_ptp = priv;
+	struct ptp_clock_event event;
+	u64 ns;
+	u32 ack = 0, lo, hi, mask, val;
+
+	val = qoriq_read(&qoriq_ptp->regs->tmr_tevent);
+
+	if (val & ETS1) {
+		ack |= ETS1;
+		hi = qoriq_read(&qoriq_ptp->regs->tmr_etts1_h);
+		lo = qoriq_read(&qoriq_ptp->regs->tmr_etts1_l);
+		event.type = PTP_CLOCK_EXTTS;
+		event.index = 0;
+		event.timestamp = ((u64) hi) << 32;
+		event.timestamp |= lo;
+		ptp_clock_event(qoriq_ptp->clock, &event);
+	}
+
+	if (val & ETS2) {
+		ack |= ETS2;
+		hi = qoriq_read(&qoriq_ptp->regs->tmr_etts2_h);
+		lo = qoriq_read(&qoriq_ptp->regs->tmr_etts2_l);
+		event.type = PTP_CLOCK_EXTTS;
+		event.index = 1;
+		event.timestamp = ((u64) hi) << 32;
+		event.timestamp |= lo;
+		ptp_clock_event(qoriq_ptp->clock, &event);
+	}
+
+	if (val & ALM2) {
+		ack |= ALM2;
+		if (qoriq_ptp->alarm_value) {
+			event.type = PTP_CLOCK_ALARM;
+			event.index = 0;
+			event.timestamp = qoriq_ptp->alarm_value;
+			ptp_clock_event(qoriq_ptp->clock, &event);
+		}
+		if (qoriq_ptp->alarm_interval) {
+			ns = qoriq_ptp->alarm_value + qoriq_ptp->alarm_interval;
+			hi = ns >> 32;
+			lo = ns & 0xffffffff;
+			spin_lock(&qoriq_ptp->lock);
+			qoriq_write(&qoriq_ptp->regs->tmr_alarm2_l, lo);
+			qoriq_write(&qoriq_ptp->regs->tmr_alarm2_h, hi);
+			spin_unlock(&qoriq_ptp->lock);
+			qoriq_ptp->alarm_value = ns;
+		} else {
+			qoriq_write(&qoriq_ptp->regs->tmr_tevent, ALM2);
+			spin_lock(&qoriq_ptp->lock);
+			mask = qoriq_read(&qoriq_ptp->regs->tmr_temask);
+			mask &= ~ALM2EN;
+			qoriq_write(&qoriq_ptp->regs->tmr_temask, mask);
+			spin_unlock(&qoriq_ptp->lock);
+			qoriq_ptp->alarm_value = 0;
+			qoriq_ptp->alarm_interval = 0;
+		}
+	}
+
+	if (val & PP1) {
+		ack |= PP1;
+		event.type = PTP_CLOCK_PPS;
+		ptp_clock_event(qoriq_ptp->clock, &event);
+	}
+
+	if (ack) {
+		qoriq_write(&qoriq_ptp->regs->tmr_tevent, ack);
+		return IRQ_HANDLED;
+	} else
+		return IRQ_NONE;
+}
+
+/*
+ * PTP clock operations
+ */
+
+static int ptp_qoriq_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
+{
+	u64 adj, diff;
+	u32 tmr_add;
+	int neg_adj = 0;
+	struct qoriq_ptp *qoriq_ptp = container_of(ptp, struct qoriq_ptp, caps);
+
+	if (scaled_ppm < 0) {
+		neg_adj = 1;
+		scaled_ppm = -scaled_ppm;
+	}
+	tmr_add = qoriq_ptp->tmr_add;
+	adj = tmr_add;
+
+	/* calculate diff as adj*(scaled_ppm/65536)/1000000
+	 * and round() to the nearest integer
+	 */
+	adj *= scaled_ppm;
+	diff = div_u64(adj, 8000000);
+	diff = (diff >> 13) + ((diff >> 12) & 1);
+
+	tmr_add = neg_adj ? tmr_add - diff : tmr_add + diff;
+
+	qoriq_write(&qoriq_ptp->regs->tmr_add, tmr_add);
+
+	return 0;
+}
+
+static int ptp_qoriq_adjtime(struct ptp_clock_info *ptp, s64 delta)
+{
+	s64 now;
+	unsigned long flags;
+	struct qoriq_ptp *qoriq_ptp = container_of(ptp, struct qoriq_ptp, caps);
+
+	spin_lock_irqsave(&qoriq_ptp->lock, flags);
+
+	now = tmr_cnt_read(qoriq_ptp);
+	now += delta;
+	tmr_cnt_write(qoriq_ptp, now);
+	set_fipers(qoriq_ptp);
+
+	spin_unlock_irqrestore(&qoriq_ptp->lock, flags);
+
+	return 0;
+}
+
+static int ptp_qoriq_gettime(struct ptp_clock_info *ptp,
+			       struct timespec64 *ts)
+{
+	u64 ns;
+	unsigned long flags;
+	struct qoriq_ptp *qoriq_ptp = container_of(ptp, struct qoriq_ptp, caps);
+
+	spin_lock_irqsave(&qoriq_ptp->lock, flags);
+
+	ns = tmr_cnt_read(qoriq_ptp);
+
+	spin_unlock_irqrestore(&qoriq_ptp->lock, flags);
+
+	*ts = ns_to_timespec64(ns);
+
+	return 0;
+}
+
+static int ptp_qoriq_settime(struct ptp_clock_info *ptp,
+			       const struct timespec64 *ts)
+{
+	u64 ns;
+	unsigned long flags;
+	struct qoriq_ptp *qoriq_ptp = container_of(ptp, struct qoriq_ptp, caps);
+
+	ns = timespec64_to_ns(ts);
+
+	spin_lock_irqsave(&qoriq_ptp->lock, flags);
+
+	tmr_cnt_write(qoriq_ptp, ns);
+	set_fipers(qoriq_ptp);
+
+	spin_unlock_irqrestore(&qoriq_ptp->lock, flags);
+
+	return 0;
+}
+
+static int ptp_qoriq_enable(struct ptp_clock_info *ptp,
+			      struct ptp_clock_request *rq, int on)
+{
+	struct qoriq_ptp *qoriq_ptp = container_of(ptp, struct qoriq_ptp, caps);
+	unsigned long flags;
+	u32 bit, mask;
+
+	switch (rq->type) {
+	case PTP_CLK_REQ_EXTTS:
+		switch (rq->extts.index) {
+		case 0:
+			bit = ETS1EN;
+			break;
+		case 1:
+			bit = ETS2EN;
+			break;
+		default:
+			return -EINVAL;
+		}
+		spin_lock_irqsave(&qoriq_ptp->lock, flags);
+		mask = qoriq_read(&qoriq_ptp->regs->tmr_temask);
+		if (on)
+			mask |= bit;
+		else
+			mask &= ~bit;
+		qoriq_write(&qoriq_ptp->regs->tmr_temask, mask);
+		spin_unlock_irqrestore(&qoriq_ptp->lock, flags);
+		return 0;
+
+	case PTP_CLK_REQ_PPS:
+		spin_lock_irqsave(&qoriq_ptp->lock, flags);
+		mask = qoriq_read(&qoriq_ptp->regs->tmr_temask);
+		if (on)
+			mask |= PP1EN;
+		else
+			mask &= ~PP1EN;
+		qoriq_write(&qoriq_ptp->regs->tmr_temask, mask);
+		spin_unlock_irqrestore(&qoriq_ptp->lock, flags);
+		return 0;
+
+	default:
+		break;
+	}
+
+	return -EOPNOTSUPP;
+}
+
+static const struct ptp_clock_info ptp_qoriq_caps = {
+	.owner		= THIS_MODULE,
+	.name		= "qoriq ptp clock",
+	.max_adj	= 512000,
+	.n_alarm	= 0,
+	.n_ext_ts	= N_EXT_TS,
+	.n_per_out	= 0,
+	.n_pins		= 0,
+	.pps		= 1,
+	.adjfine	= ptp_qoriq_adjfine,
+	.adjtime	= ptp_qoriq_adjtime,
+	.gettime64	= ptp_qoriq_gettime,
+	.settime64	= ptp_qoriq_settime,
+	.enable		= ptp_qoriq_enable,
+};
+
+static int qoriq_ptp_probe(struct platform_device *dev)
+{
+	struct device_node *node = dev->dev.of_node;
+	struct qoriq_ptp *qoriq_ptp;
+	struct timespec64 now;
+	int err = -ENOMEM;
+	u32 tmr_ctrl;
+	unsigned long flags;
+
+	qoriq_ptp = kzalloc(sizeof(*qoriq_ptp), GFP_KERNEL);
+	if (!qoriq_ptp)
+		goto no_memory;
+
+	err = -ENODEV;
+
+	qoriq_ptp->caps = ptp_qoriq_caps;
+
+	if (of_property_read_u32(node, "fsl,cksel", &qoriq_ptp->cksel))
+		qoriq_ptp->cksel = DEFAULT_CKSEL;
+
+	if (of_property_read_u32(node,
+				 "fsl,tclk-period", &qoriq_ptp->tclk_period) ||
+	    of_property_read_u32(node,
+				 "fsl,tmr-prsc", &qoriq_ptp->tmr_prsc) ||
+	    of_property_read_u32(node,
+				 "fsl,tmr-add", &qoriq_ptp->tmr_add) ||
+	    of_property_read_u32(node,
+				 "fsl,tmr-fiper1", &qoriq_ptp->tmr_fiper1) ||
+	    of_property_read_u32(node,
+				 "fsl,tmr-fiper2", &qoriq_ptp->tmr_fiper2) ||
+	    of_property_read_u32(node,
+				 "fsl,max-adj", &qoriq_ptp->caps.max_adj)) {
+		pr_err("device tree node missing required elements\n");
+		goto no_node;
+	}
+
+	qoriq_ptp->irq = platform_get_irq(dev, 0);
+
+	if (qoriq_ptp->irq < 0) {
+		pr_err("irq not in device tree\n");
+		goto no_node;
+	}
+	if (request_irq(qoriq_ptp->irq, isr, 0, DRIVER, qoriq_ptp)) {
+		pr_err("request_irq failed\n");
+		goto no_node;
+	}
+
+	qoriq_ptp->rsrc = platform_get_resource(dev, IORESOURCE_MEM, 0);
+	if (!qoriq_ptp->rsrc) {
+		pr_err("no resource\n");
+		goto no_resource;
+	}
+	if (request_resource(&iomem_resource, qoriq_ptp->rsrc)) {
+		pr_err("resource busy\n");
+		goto no_resource;
+	}
+
+	spin_lock_init(&qoriq_ptp->lock);
+
+	qoriq_ptp->regs = ioremap(qoriq_ptp->rsrc->start,
+				resource_size(qoriq_ptp->rsrc));
+	if (!qoriq_ptp->regs) {
+		pr_err("ioremap ptp registers failed\n");
+		goto no_ioremap;
+	}
+	getnstimeofday64(&now);
+	ptp_qoriq_settime(&qoriq_ptp->caps, &now);
+
+	tmr_ctrl =
+	  (qoriq_ptp->tclk_period & TCLK_PERIOD_MASK) << TCLK_PERIOD_SHIFT |
+	  (qoriq_ptp->cksel & CKSEL_MASK) << CKSEL_SHIFT;
+
+	spin_lock_irqsave(&qoriq_ptp->lock, flags);
+
+	qoriq_write(&qoriq_ptp->regs->tmr_ctrl,   tmr_ctrl);
+	qoriq_write(&qoriq_ptp->regs->tmr_add,    qoriq_ptp->tmr_add);
+	qoriq_write(&qoriq_ptp->regs->tmr_prsc,   qoriq_ptp->tmr_prsc);
+	qoriq_write(&qoriq_ptp->regs->tmr_fiper1, qoriq_ptp->tmr_fiper1);
+	qoriq_write(&qoriq_ptp->regs->tmr_fiper2, qoriq_ptp->tmr_fiper2);
+	set_alarm(qoriq_ptp);
+	qoriq_write(&qoriq_ptp->regs->tmr_ctrl,   tmr_ctrl|FIPERST|RTPE|TE|FRD);
+
+	spin_unlock_irqrestore(&qoriq_ptp->lock, flags);
+
+	qoriq_ptp->clock = ptp_clock_register(&qoriq_ptp->caps, &dev->dev);
+	if (IS_ERR(qoriq_ptp->clock)) {
+		err = PTR_ERR(qoriq_ptp->clock);
+		goto no_clock;
+	}
+	qoriq_ptp->phc_index = ptp_clock_index(qoriq_ptp->clock);
+
+	platform_set_drvdata(dev, qoriq_ptp);
+
+	return 0;
+
+no_clock:
+	iounmap(qoriq_ptp->regs);
+no_ioremap:
+	release_resource(qoriq_ptp->rsrc);
+no_resource:
+	free_irq(qoriq_ptp->irq, qoriq_ptp);
+no_node:
+	kfree(qoriq_ptp);
+no_memory:
+	return err;
+}
+
+static int qoriq_ptp_remove(struct platform_device *dev)
+{
+	struct qoriq_ptp *qoriq_ptp = platform_get_drvdata(dev);
+
+	qoriq_write(&qoriq_ptp->regs->tmr_temask, 0);
+	qoriq_write(&qoriq_ptp->regs->tmr_ctrl,   0);
+
+	ptp_clock_unregister(qoriq_ptp->clock);
+	iounmap(qoriq_ptp->regs);
+	release_resource(qoriq_ptp->rsrc);
+	free_irq(qoriq_ptp->irq, qoriq_ptp);
+	kfree(qoriq_ptp);
+
+	return 0;
+}
+
+static const struct of_device_id match_table[] = {
+	{ .compatible = "fsl,etsec-ptp" },
+	{},
+};
+MODULE_DEVICE_TABLE(of, match_table);
+
+static struct platform_driver qoriq_ptp_driver = {
+	.driver = {
+		.name		= "ptp_qoriq",
+		.of_match_table	= match_table,
+	},
+	.probe       = qoriq_ptp_probe,
+	.remove      = qoriq_ptp_remove,
+};
+
+module_platform_driver(qoriq_ptp_driver);
+
+MODULE_AUTHOR("Richard Cochran <richardcochran@gmail.com>");
+MODULE_DESCRIPTION("PTP clock for Freescale QorIQ 1588 timer");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3-59-g8ed1b


From 6c50c1ed72aeb5cd1ae8cd267d5a79904c82a04f Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Fri, 25 May 2018 12:40:35 +0800
Subject: ptp_qoriq: move some definitions to header file

This patch is to move some definitions in ptp_qoriq.c
to the header file.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/ptp_qoriq.c       | 132 +--------------------------------------
 include/linux/fsl/ptp_qoriq.h | 141 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 142 insertions(+), 131 deletions(-)
 create mode 100644 include/linux/fsl/ptp_qoriq.h

diff --git a/drivers/ptp/ptp_qoriq.c b/drivers/ptp/ptp_qoriq.c
index 5110cce78fb5..1468a1642b49 100644
--- a/drivers/ptp/ptp_qoriq.c
+++ b/drivers/ptp/ptp_qoriq.c
@@ -28,139 +28,9 @@
 #include <linux/of.h>
 #include <linux/of_platform.h>
 #include <linux/timex.h>
-#include <linux/io.h>
 #include <linux/slab.h>
 
-#include <linux/ptp_clock_kernel.h>
-
-/*
- * qoriq ptp registers
- * Generated by regen.tcl on Thu May 13 01:38:57 PM CEST 2010
- */
-struct qoriq_ptp_registers {
-	u32 tmr_ctrl;     /* Timer control register */
-	u32 tmr_tevent;   /* Timestamp event register */
-	u32 tmr_temask;   /* Timer event mask register */
-	u32 tmr_pevent;   /* Timestamp event register */
-	u32 tmr_pemask;   /* Timer event mask register */
-	u32 tmr_stat;     /* Timestamp status register */
-	u32 tmr_cnt_h;    /* Timer counter high register */
-	u32 tmr_cnt_l;    /* Timer counter low register */
-	u32 tmr_add;      /* Timer drift compensation addend register */
-	u32 tmr_acc;      /* Timer accumulator register */
-	u32 tmr_prsc;     /* Timer prescale */
-	u8  res1[4];
-	u32 tmroff_h;     /* Timer offset high */
-	u32 tmroff_l;     /* Timer offset low */
-	u8  res2[8];
-	u32 tmr_alarm1_h; /* Timer alarm 1 high register */
-	u32 tmr_alarm1_l; /* Timer alarm 1 high register */
-	u32 tmr_alarm2_h; /* Timer alarm 2 high register */
-	u32 tmr_alarm2_l; /* Timer alarm 2 high register */
-	u8  res3[48];
-	u32 tmr_fiper1;   /* Timer fixed period interval */
-	u32 tmr_fiper2;   /* Timer fixed period interval */
-	u32 tmr_fiper3;   /* Timer fixed period interval */
-	u8  res4[20];
-	u32 tmr_etts1_h;  /* Timestamp of general purpose external trigger */
-	u32 tmr_etts1_l;  /* Timestamp of general purpose external trigger */
-	u32 tmr_etts2_h;  /* Timestamp of general purpose external trigger */
-	u32 tmr_etts2_l;  /* Timestamp of general purpose external trigger */
-};
-
-/* Bit definitions for the TMR_CTRL register */
-#define ALM1P                 (1<<31) /* Alarm1 output polarity */
-#define ALM2P                 (1<<30) /* Alarm2 output polarity */
-#define FIPERST               (1<<28) /* FIPER start indication */
-#define PP1L                  (1<<27) /* Fiper1 pulse loopback mode enabled. */
-#define PP2L                  (1<<26) /* Fiper2 pulse loopback mode enabled. */
-#define TCLK_PERIOD_SHIFT     (16) /* 1588 timer reference clock period. */
-#define TCLK_PERIOD_MASK      (0x3ff)
-#define RTPE                  (1<<15) /* Record Tx Timestamp to PAL Enable. */
-#define FRD                   (1<<14) /* FIPER Realignment Disable */
-#define ESFDP                 (1<<11) /* External Tx/Rx SFD Polarity. */
-#define ESFDE                 (1<<10) /* External Tx/Rx SFD Enable. */
-#define ETEP2                 (1<<9) /* External trigger 2 edge polarity */
-#define ETEP1                 (1<<8) /* External trigger 1 edge polarity */
-#define COPH                  (1<<7) /* Generated clock output phase. */
-#define CIPH                  (1<<6) /* External oscillator input clock phase */
-#define TMSR                  (1<<5) /* Timer soft reset. */
-#define BYP                   (1<<3) /* Bypass drift compensated clock */
-#define TE                    (1<<2) /* 1588 timer enable. */
-#define CKSEL_SHIFT           (0)    /* 1588 Timer reference clock source */
-#define CKSEL_MASK            (0x3)
-
-/* Bit definitions for the TMR_TEVENT register */
-#define ETS2                  (1<<25) /* External trigger 2 timestamp sampled */
-#define ETS1                  (1<<24) /* External trigger 1 timestamp sampled */
-#define ALM2                  (1<<17) /* Current time = alarm time register 2 */
-#define ALM1                  (1<<16) /* Current time = alarm time register 1 */
-#define PP1                   (1<<7)  /* periodic pulse generated on FIPER1 */
-#define PP2                   (1<<6)  /* periodic pulse generated on FIPER2 */
-#define PP3                   (1<<5)  /* periodic pulse generated on FIPER3 */
-
-/* Bit definitions for the TMR_TEMASK register */
-#define ETS2EN                (1<<25) /* External trigger 2 timestamp enable */
-#define ETS1EN                (1<<24) /* External trigger 1 timestamp enable */
-#define ALM2EN                (1<<17) /* Timer ALM2 event enable */
-#define ALM1EN                (1<<16) /* Timer ALM1 event enable */
-#define PP1EN                 (1<<7) /* Periodic pulse event 1 enable */
-#define PP2EN                 (1<<6) /* Periodic pulse event 2 enable */
-
-/* Bit definitions for the TMR_PEVENT register */
-#define TXP2                  (1<<9) /* PTP transmitted timestamp im TXTS2 */
-#define TXP1                  (1<<8) /* PTP transmitted timestamp in TXTS1 */
-#define RXP                   (1<<0) /* PTP frame has been received */
-
-/* Bit definitions for the TMR_PEMASK register */
-#define TXP2EN                (1<<9) /* Transmit PTP packet event 2 enable */
-#define TXP1EN                (1<<8) /* Transmit PTP packet event 1 enable */
-#define RXPEN                 (1<<0) /* Receive PTP packet event enable */
-
-/* Bit definitions for the TMR_STAT register */
-#define STAT_VEC_SHIFT        (0) /* Timer general purpose status vector */
-#define STAT_VEC_MASK         (0x3f)
-
-/* Bit definitions for the TMR_PRSC register */
-#define PRSC_OCK_SHIFT        (0) /* Output clock division/prescale factor. */
-#define PRSC_OCK_MASK         (0xffff)
-
-
-#define DRIVER		"ptp_qoriq"
-#define DEFAULT_CKSEL	1
-#define N_EXT_TS	2
-#define REG_SIZE	sizeof(struct qoriq_ptp_registers)
-
-struct qoriq_ptp {
-	struct qoriq_ptp_registers __iomem *regs;
-	spinlock_t lock; /* protects regs */
-	struct ptp_clock *clock;
-	struct ptp_clock_info caps;
-	struct resource *rsrc;
-	int irq;
-	int phc_index;
-	u64 alarm_interval; /* for periodic alarm */
-	u64 alarm_value;
-	u32 tclk_period;  /* nanoseconds */
-	u32 tmr_prsc;
-	u32 tmr_add;
-	u32 cksel;
-	u32 tmr_fiper1;
-	u32 tmr_fiper2;
-};
-
-static inline u32 qoriq_read(unsigned __iomem *addr)
-{
-	u32 val;
-
-	val = ioread32be(addr);
-	return val;
-}
-
-static inline void qoriq_write(unsigned __iomem *addr, u32 val)
-{
-	iowrite32be(val, addr);
-}
+#include <linux/fsl/ptp_qoriq.h>
 
 /*
  * Register access functions
diff --git a/include/linux/fsl/ptp_qoriq.h b/include/linux/fsl/ptp_qoriq.h
new file mode 100644
index 000000000000..b462d9ea8007
--- /dev/null
+++ b/include/linux/fsl/ptp_qoriq.h
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2010 OMICRON electronics GmbH
+ * Copyright 2018 NXP
+ */
+#ifndef __PTP_QORIQ_H__
+#define __PTP_QORIQ_H__
+
+#include <linux/io.h>
+#include <linux/ptp_clock_kernel.h>
+
+/*
+ * qoriq ptp registers
+ * Generated by regen.tcl on Thu May 13 01:38:57 PM CEST 2010
+ */
+struct qoriq_ptp_registers {
+	u32 tmr_ctrl;     /* Timer control register */
+	u32 tmr_tevent;   /* Timestamp event register */
+	u32 tmr_temask;   /* Timer event mask register */
+	u32 tmr_pevent;   /* Timestamp event register */
+	u32 tmr_pemask;   /* Timer event mask register */
+	u32 tmr_stat;     /* Timestamp status register */
+	u32 tmr_cnt_h;    /* Timer counter high register */
+	u32 tmr_cnt_l;    /* Timer counter low register */
+	u32 tmr_add;      /* Timer drift compensation addend register */
+	u32 tmr_acc;      /* Timer accumulator register */
+	u32 tmr_prsc;     /* Timer prescale */
+	u8  res1[4];
+	u32 tmroff_h;     /* Timer offset high */
+	u32 tmroff_l;     /* Timer offset low */
+	u8  res2[8];
+	u32 tmr_alarm1_h; /* Timer alarm 1 high register */
+	u32 tmr_alarm1_l; /* Timer alarm 1 high register */
+	u32 tmr_alarm2_h; /* Timer alarm 2 high register */
+	u32 tmr_alarm2_l; /* Timer alarm 2 high register */
+	u8  res3[48];
+	u32 tmr_fiper1;   /* Timer fixed period interval */
+	u32 tmr_fiper2;   /* Timer fixed period interval */
+	u32 tmr_fiper3;   /* Timer fixed period interval */
+	u8  res4[20];
+	u32 tmr_etts1_h;  /* Timestamp of general purpose external trigger */
+	u32 tmr_etts1_l;  /* Timestamp of general purpose external trigger */
+	u32 tmr_etts2_h;  /* Timestamp of general purpose external trigger */
+	u32 tmr_etts2_l;  /* Timestamp of general purpose external trigger */
+};
+
+/* Bit definitions for the TMR_CTRL register */
+#define ALM1P                 (1<<31) /* Alarm1 output polarity */
+#define ALM2P                 (1<<30) /* Alarm2 output polarity */
+#define FIPERST               (1<<28) /* FIPER start indication */
+#define PP1L                  (1<<27) /* Fiper1 pulse loopback mode enabled. */
+#define PP2L                  (1<<26) /* Fiper2 pulse loopback mode enabled. */
+#define TCLK_PERIOD_SHIFT     (16) /* 1588 timer reference clock period. */
+#define TCLK_PERIOD_MASK      (0x3ff)
+#define RTPE                  (1<<15) /* Record Tx Timestamp to PAL Enable. */
+#define FRD                   (1<<14) /* FIPER Realignment Disable */
+#define ESFDP                 (1<<11) /* External Tx/Rx SFD Polarity. */
+#define ESFDE                 (1<<10) /* External Tx/Rx SFD Enable. */
+#define ETEP2                 (1<<9) /* External trigger 2 edge polarity */
+#define ETEP1                 (1<<8) /* External trigger 1 edge polarity */
+#define COPH                  (1<<7) /* Generated clock output phase. */
+#define CIPH                  (1<<6) /* External oscillator input clock phase */
+#define TMSR                  (1<<5) /* Timer soft reset. */
+#define BYP                   (1<<3) /* Bypass drift compensated clock */
+#define TE                    (1<<2) /* 1588 timer enable. */
+#define CKSEL_SHIFT           (0)    /* 1588 Timer reference clock source */
+#define CKSEL_MASK            (0x3)
+
+/* Bit definitions for the TMR_TEVENT register */
+#define ETS2                  (1<<25) /* External trigger 2 timestamp sampled */
+#define ETS1                  (1<<24) /* External trigger 1 timestamp sampled */
+#define ALM2                  (1<<17) /* Current time = alarm time register 2 */
+#define ALM1                  (1<<16) /* Current time = alarm time register 1 */
+#define PP1                   (1<<7)  /* periodic pulse generated on FIPER1 */
+#define PP2                   (1<<6)  /* periodic pulse generated on FIPER2 */
+#define PP3                   (1<<5)  /* periodic pulse generated on FIPER3 */
+
+/* Bit definitions for the TMR_TEMASK register */
+#define ETS2EN                (1<<25) /* External trigger 2 timestamp enable */
+#define ETS1EN                (1<<24) /* External trigger 1 timestamp enable */
+#define ALM2EN                (1<<17) /* Timer ALM2 event enable */
+#define ALM1EN                (1<<16) /* Timer ALM1 event enable */
+#define PP1EN                 (1<<7) /* Periodic pulse event 1 enable */
+#define PP2EN                 (1<<6) /* Periodic pulse event 2 enable */
+
+/* Bit definitions for the TMR_PEVENT register */
+#define TXP2                  (1<<9) /* PTP transmitted timestamp im TXTS2 */
+#define TXP1                  (1<<8) /* PTP transmitted timestamp in TXTS1 */
+#define RXP                   (1<<0) /* PTP frame has been received */
+
+/* Bit definitions for the TMR_PEMASK register */
+#define TXP2EN                (1<<9) /* Transmit PTP packet event 2 enable */
+#define TXP1EN                (1<<8) /* Transmit PTP packet event 1 enable */
+#define RXPEN                 (1<<0) /* Receive PTP packet event enable */
+
+/* Bit definitions for the TMR_STAT register */
+#define STAT_VEC_SHIFT        (0) /* Timer general purpose status vector */
+#define STAT_VEC_MASK         (0x3f)
+
+/* Bit definitions for the TMR_PRSC register */
+#define PRSC_OCK_SHIFT        (0) /* Output clock division/prescale factor. */
+#define PRSC_OCK_MASK         (0xffff)
+
+
+#define DRIVER		"ptp_qoriq"
+#define DEFAULT_CKSEL	1
+#define N_EXT_TS	2
+#define REG_SIZE	sizeof(struct qoriq_ptp_registers)
+
+struct qoriq_ptp {
+	struct qoriq_ptp_registers __iomem *regs;
+	spinlock_t lock; /* protects regs */
+	struct ptp_clock *clock;
+	struct ptp_clock_info caps;
+	struct resource *rsrc;
+	int irq;
+	int phc_index;
+	u64 alarm_interval; /* for periodic alarm */
+	u64 alarm_value;
+	u32 tclk_period;  /* nanoseconds */
+	u32 tmr_prsc;
+	u32 tmr_add;
+	u32 cksel;
+	u32 tmr_fiper1;
+	u32 tmr_fiper2;
+};
+
+static inline u32 qoriq_read(unsigned __iomem *addr)
+{
+	u32 val;
+
+	val = ioread32be(addr);
+	return val;
+}
+
+static inline void qoriq_write(unsigned __iomem *addr, u32 val)
+{
+	iowrite32be(val, addr);
+}
+
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From 7349a74ea75ca27606ead81df3ed67f1b32a94ba Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Fri, 25 May 2018 12:40:36 +0800
Subject: net: ethernet: gianfar_ethtool: get phc index through drvdata

Global variable gfar_phc_index was used to get and store
phc index through gianfar_ptp driver. However gianfar_ptp
had been renamed as ptp_qoriq for QorIQ common PTP driver.
This gfar_phc_index doesn't work any more, and the phc index
is stored in drvdata now. This patch is to support getting
phc index through ptp_qoriq drvdata.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/freescale/gianfar.h         |  3 ---
 drivers/net/ethernet/freescale/gianfar_ethtool.c | 23 ++++++++++++++++++-----
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/freescale/gianfar.h b/drivers/net/ethernet/freescale/gianfar.h
index 5aa814799d70..8e42c0246611 100644
--- a/drivers/net/ethernet/freescale/gianfar.h
+++ b/drivers/net/ethernet/freescale/gianfar.h
@@ -1372,7 +1372,4 @@ struct filer_table {
 	struct gfar_filer_entry fe[MAX_FILER_CACHE_IDX + 20];
 };
 
-/* The gianfar_ptp module will set this variable */
-extern int gfar_phc_index;
-
 #endif /* __GIANFAR_H */
diff --git a/drivers/net/ethernet/freescale/gianfar_ethtool.c b/drivers/net/ethernet/freescale/gianfar_ethtool.c
index a93e0199c369..8cb98cae0a6f 100644
--- a/drivers/net/ethernet/freescale/gianfar_ethtool.c
+++ b/drivers/net/ethernet/freescale/gianfar_ethtool.c
@@ -41,6 +41,8 @@
 #include <linux/phy.h>
 #include <linux/sort.h>
 #include <linux/if_vlan.h>
+#include <linux/of_platform.h>
+#include <linux/fsl/ptp_qoriq.h>
 
 #include "gianfar.h"
 
@@ -1509,24 +1511,35 @@ static int gfar_get_nfc(struct net_device *dev, struct ethtool_rxnfc *cmd,
 	return ret;
 }
 
-int gfar_phc_index = -1;
-EXPORT_SYMBOL(gfar_phc_index);
-
 static int gfar_get_ts_info(struct net_device *dev,
 			    struct ethtool_ts_info *info)
 {
 	struct gfar_private *priv = netdev_priv(dev);
+	struct platform_device *ptp_dev;
+	struct device_node *ptp_node;
+	struct qoriq_ptp *ptp = NULL;
+
+	info->phc_index = -1;
 
 	if (!(priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER)) {
 		info->so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE |
 					SOF_TIMESTAMPING_SOFTWARE;
-		info->phc_index = -1;
 		return 0;
 	}
+
+	ptp_node = of_find_compatible_node(NULL, NULL, "fsl,etsec-ptp");
+	if (ptp_node) {
+		ptp_dev = of_find_device_by_node(ptp_node);
+		if (ptp_dev)
+			ptp = platform_get_drvdata(ptp_dev);
+	}
+
+	if (ptp)
+		info->phc_index = ptp->phc_index;
+
 	info->so_timestamping = SOF_TIMESTAMPING_TX_HARDWARE |
 				SOF_TIMESTAMPING_RX_HARDWARE |
 				SOF_TIMESTAMPING_RAW_HARDWARE;
-	info->phc_index = gfar_phc_index;
 	info->tx_types = (1 << HWTSTAMP_TX_OFF) |
 			 (1 << HWTSTAMP_TX_ON);
 	info->rx_filters = (1 << HWTSTAMP_FILTER_NONE) |
-- 
cgit v1.2.3-59-g8ed1b


From a98ac8bd24791afbdd092eb53e89b536d8996464 Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Fri, 25 May 2018 12:40:37 +0800
Subject: dt-bindings: ptp: add ptp-qoriq.txt

This patch is to add a documentation for ptp_qoriq dt-bindings.
The description for ptp_qoriq dt-bindings was actually moved
from Documentation/devicetree/bindings/net/fsl-tsec-phy.txt,
since gianfar_ptp driver was moved to ptp_qoriq driver.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../devicetree/bindings/net/fsl-tsec-phy.txt       | 68 +--------------------
 .../devicetree/bindings/ptp/ptp-qoriq.txt          | 69 ++++++++++++++++++++++
 2 files changed, 70 insertions(+), 67 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/ptp/ptp-qoriq.txt

diff --git a/Documentation/devicetree/bindings/net/fsl-tsec-phy.txt b/Documentation/devicetree/bindings/net/fsl-tsec-phy.txt
index 79bf352e659c..047bdf7bdd2f 100644
--- a/Documentation/devicetree/bindings/net/fsl-tsec-phy.txt
+++ b/Documentation/devicetree/bindings/net/fsl-tsec-phy.txt
@@ -86,70 +86,4 @@ Example:
 
 * Gianfar PTP clock nodes
 
-General Properties:
-
-  - compatible   Should be "fsl,etsec-ptp"
-  - reg          Offset and length of the register set for the device
-  - interrupts   There should be at least two interrupts. Some devices
-                 have as many as four PTP related interrupts.
-
-Clock Properties:
-
-  - fsl,cksel        Timer reference clock source.
-  - fsl,tclk-period  Timer reference clock period in nanoseconds.
-  - fsl,tmr-prsc     Prescaler, divides the output clock.
-  - fsl,tmr-add      Frequency compensation value.
-  - fsl,tmr-fiper1   Fixed interval period pulse generator.
-  - fsl,tmr-fiper2   Fixed interval period pulse generator.
-  - fsl,max-adj      Maximum frequency adjustment in parts per billion.
-
-  These properties set the operational parameters for the PTP
-  clock. You must choose these carefully for the clock to work right.
-  Here is how to figure good values:
-
-  TimerOsc     = selected reference clock   MHz
-  tclk_period  = desired clock period       nanoseconds
-  NominalFreq  = 1000 / tclk_period         MHz
-  FreqDivRatio = TimerOsc / NominalFreq     (must be greater that 1.0)
-  tmr_add      = ceil(2^32 / FreqDivRatio)
-  OutputClock  = NominalFreq / tmr_prsc     MHz
-  PulseWidth   = 1 / OutputClock            microseconds
-  FiperFreq1   = desired frequency in Hz
-  FiperDiv1    = 1000000 * OutputClock / FiperFreq1
-  tmr_fiper1   = tmr_prsc * tclk_period * FiperDiv1 - tclk_period
-  max_adj      = 1000000000 * (FreqDivRatio - 1.0) - 1
-
-  The calculation for tmr_fiper2 is the same as for tmr_fiper1. The
-  driver expects that tmr_fiper1 will be correctly set to produce a 1
-  Pulse Per Second (PPS) signal, since this will be offered to the PPS
-  subsystem to synchronize the Linux clock.
-
-  Reference clock source is determined by the value, which is holded
-  in CKSEL bits in TMR_CTRL register. "fsl,cksel" property keeps the
-  value, which will be directly written in those bits, that is why,
-  according to reference manual, the next clock sources can be used:
-
-  <0> - external high precision timer reference clock (TSEC_TMR_CLK
-        input is used for this purpose);
-  <1> - eTSEC system clock;
-  <2> - eTSEC1 transmit clock;
-  <3> - RTC clock input.
-
-  When this attribute is not used, eTSEC system clock will serve as
-  IEEE 1588 timer reference clock.
-
-Example:
-
-	ptp_clock@24e00 {
-		compatible = "fsl,etsec-ptp";
-		reg = <0x24E00 0xB0>;
-		interrupts = <12 0x8 13 0x8>;
-		interrupt-parent = < &ipic >;
-		fsl,cksel       = <1>;
-		fsl,tclk-period = <10>;
-		fsl,tmr-prsc    = <100>;
-		fsl,tmr-add     = <0x999999A4>;
-		fsl,tmr-fiper1  = <0x3B9AC9F6>;
-		fsl,tmr-fiper2  = <0x00018696>;
-		fsl,max-adj     = <659999998>;
-	};
+Refer to Documentation/devicetree/bindings/ptp/ptp-qoriq.txt
diff --git a/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt b/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt
new file mode 100644
index 000000000000..0f569d8e73a3
--- /dev/null
+++ b/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt
@@ -0,0 +1,69 @@
+* Freescale QorIQ 1588 timer based PTP clock
+
+General Properties:
+
+  - compatible   Should be "fsl,etsec-ptp"
+  - reg          Offset and length of the register set for the device
+  - interrupts   There should be at least two interrupts. Some devices
+                 have as many as four PTP related interrupts.
+
+Clock Properties:
+
+  - fsl,cksel        Timer reference clock source.
+  - fsl,tclk-period  Timer reference clock period in nanoseconds.
+  - fsl,tmr-prsc     Prescaler, divides the output clock.
+  - fsl,tmr-add      Frequency compensation value.
+  - fsl,tmr-fiper1   Fixed interval period pulse generator.
+  - fsl,tmr-fiper2   Fixed interval period pulse generator.
+  - fsl,max-adj      Maximum frequency adjustment in parts per billion.
+
+  These properties set the operational parameters for the PTP
+  clock. You must choose these carefully for the clock to work right.
+  Here is how to figure good values:
+
+  TimerOsc     = selected reference clock   MHz
+  tclk_period  = desired clock period       nanoseconds
+  NominalFreq  = 1000 / tclk_period         MHz
+  FreqDivRatio = TimerOsc / NominalFreq     (must be greater that 1.0)
+  tmr_add      = ceil(2^32 / FreqDivRatio)
+  OutputClock  = NominalFreq / tmr_prsc     MHz
+  PulseWidth   = 1 / OutputClock            microseconds
+  FiperFreq1   = desired frequency in Hz
+  FiperDiv1    = 1000000 * OutputClock / FiperFreq1
+  tmr_fiper1   = tmr_prsc * tclk_period * FiperDiv1 - tclk_period
+  max_adj      = 1000000000 * (FreqDivRatio - 1.0) - 1
+
+  The calculation for tmr_fiper2 is the same as for tmr_fiper1. The
+  driver expects that tmr_fiper1 will be correctly set to produce a 1
+  Pulse Per Second (PPS) signal, since this will be offered to the PPS
+  subsystem to synchronize the Linux clock.
+
+  Reference clock source is determined by the value, which is holded
+  in CKSEL bits in TMR_CTRL register. "fsl,cksel" property keeps the
+  value, which will be directly written in those bits, that is why,
+  according to reference manual, the next clock sources can be used:
+
+  <0> - external high precision timer reference clock (TSEC_TMR_CLK
+        input is used for this purpose);
+  <1> - eTSEC system clock;
+  <2> - eTSEC1 transmit clock;
+  <3> - RTC clock input.
+
+  When this attribute is not used, eTSEC system clock will serve as
+  IEEE 1588 timer reference clock.
+
+Example:
+
+	ptp_clock@24e00 {
+		compatible = "fsl,etsec-ptp";
+		reg = <0x24E00 0xB0>;
+		interrupts = <12 0x8 13 0x8>;
+		interrupt-parent = < &ipic >;
+		fsl,cksel       = <1>;
+		fsl,tclk-period = <10>;
+		fsl,tmr-prsc    = <100>;
+		fsl,tmr-add     = <0x999999A4>;
+		fsl,tmr-fiper1  = <0x3B9AC9F6>;
+		fsl,tmr-fiper2  = <0x00018696>;
+		fsl,max-adj     = <659999998>;
+	};
-- 
cgit v1.2.3-59-g8ed1b


From 6528e02cc9ff7a195e2f81fd422458cefa83ad10 Mon Sep 17 00:00:00 2001
From: Christophe Roullier <christophe.roullier@st.com>
Date: Fri, 25 May 2018 09:46:38 +0200
Subject: net: ethernet: stmmac: add adaptation for stm32mp157c.

Glue codes to support stm32mp157c device and stay
compatible with stm32 mcu familly

Signed-off-by: Christophe Roullier <christophe.roullier@st.com>
Acked-by: Alexandre TORGUE <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c | 267 ++++++++++++++++++++--
 1 file changed, 252 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c
index 9e6db16af663..7e2e79dedebf 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c
@@ -16,49 +16,180 @@
 #include <linux/of_net.h>
 #include <linux/phy.h>
 #include <linux/platform_device.h>
+#include <linux/pm_wakeirq.h>
 #include <linux/regmap.h>
 #include <linux/slab.h>
 #include <linux/stmmac.h>
 
 #include "stmmac_platform.h"
 
-#define MII_PHY_SEL_MASK	BIT(23)
+#define SYSCFG_MCU_ETH_MASK		BIT(23)
+#define SYSCFG_MP1_ETH_MASK		GENMASK(23, 16)
+
+#define SYSCFG_PMCR_ETH_CLK_SEL		BIT(16)
+#define SYSCFG_PMCR_ETH_REF_CLK_SEL	BIT(17)
+#define SYSCFG_PMCR_ETH_SEL_MII		BIT(20)
+#define SYSCFG_PMCR_ETH_SEL_RGMII	BIT(21)
+#define SYSCFG_PMCR_ETH_SEL_RMII	BIT(23)
+#define SYSCFG_PMCR_ETH_SEL_GMII	0
+#define SYSCFG_MCU_ETH_SEL_MII		0
+#define SYSCFG_MCU_ETH_SEL_RMII		1
 
 struct stm32_dwmac {
 	struct clk *clk_tx;
 	struct clk *clk_rx;
+	struct clk *clk_eth_ck;
+	struct clk *clk_ethstp;
+	struct clk *syscfg_clk;
+	bool int_phyclk;	/* Clock from RCC to drive PHY */
 	u32 mode_reg;		/* MAC glue-logic mode register */
 	struct regmap *regmap;
 	u32 speed;
+	const struct stm32_ops *ops;
+	struct device *dev;
+};
+
+struct stm32_ops {
+	int (*set_mode)(struct plat_stmmacenet_data *plat_dat);
+	int (*clk_prepare)(struct stm32_dwmac *dwmac, bool prepare);
+	int (*suspend)(struct stm32_dwmac *dwmac);
+	void (*resume)(struct stm32_dwmac *dwmac);
+	int (*parse_data)(struct stm32_dwmac *dwmac,
+			  struct device *dev);
+	u32 syscfg_eth_mask;
 };
 
 static int stm32_dwmac_init(struct plat_stmmacenet_data *plat_dat)
 {
 	struct stm32_dwmac *dwmac = plat_dat->bsp_priv;
-	u32 reg = dwmac->mode_reg;
-	u32 val;
 	int ret;
 
-	val = (plat_dat->interface == PHY_INTERFACE_MODE_MII) ? 0 : 1;
-	ret = regmap_update_bits(dwmac->regmap, reg, MII_PHY_SEL_MASK, val);
-	if (ret)
-		return ret;
+	if (dwmac->ops->set_mode) {
+		ret = dwmac->ops->set_mode(plat_dat);
+		if (ret)
+			return ret;
+	}
 
 	ret = clk_prepare_enable(dwmac->clk_tx);
 	if (ret)
 		return ret;
 
-	ret = clk_prepare_enable(dwmac->clk_rx);
-	if (ret)
-		clk_disable_unprepare(dwmac->clk_tx);
+	if (!dwmac->dev->power.is_suspended) {
+		ret = clk_prepare_enable(dwmac->clk_rx);
+		if (ret) {
+			clk_disable_unprepare(dwmac->clk_tx);
+			return ret;
+		}
+	}
+
+	if (dwmac->ops->clk_prepare) {
+		ret = dwmac->ops->clk_prepare(dwmac, true);
+		if (ret) {
+			clk_disable_unprepare(dwmac->clk_rx);
+			clk_disable_unprepare(dwmac->clk_tx);
+		}
+	}
 
 	return ret;
 }
 
+static int stm32mp1_clk_prepare(struct stm32_dwmac *dwmac, bool prepare)
+{
+	int ret = 0;
+
+	if (prepare) {
+		ret = clk_prepare_enable(dwmac->syscfg_clk);
+		if (ret)
+			return ret;
+
+		if (dwmac->int_phyclk) {
+			ret = clk_prepare_enable(dwmac->clk_eth_ck);
+			if (ret) {
+				clk_disable_unprepare(dwmac->syscfg_clk);
+				return ret;
+			}
+		}
+	} else {
+		clk_disable_unprepare(dwmac->syscfg_clk);
+		if (dwmac->int_phyclk)
+			clk_disable_unprepare(dwmac->clk_eth_ck);
+	}
+	return ret;
+}
+
+static int stm32mp1_set_mode(struct plat_stmmacenet_data *plat_dat)
+{
+	struct stm32_dwmac *dwmac = plat_dat->bsp_priv;
+	u32 reg = dwmac->mode_reg;
+	int val;
+
+	switch (plat_dat->interface) {
+	case PHY_INTERFACE_MODE_MII:
+		val = SYSCFG_PMCR_ETH_SEL_MII;
+		pr_debug("SYSCFG init : PHY_INTERFACE_MODE_MII\n");
+		break;
+	case PHY_INTERFACE_MODE_GMII:
+		val = SYSCFG_PMCR_ETH_SEL_GMII;
+		if (dwmac->int_phyclk)
+			val |= SYSCFG_PMCR_ETH_CLK_SEL;
+		pr_debug("SYSCFG init : PHY_INTERFACE_MODE_GMII\n");
+		break;
+	case PHY_INTERFACE_MODE_RMII:
+		val = SYSCFG_PMCR_ETH_SEL_RMII;
+		if (dwmac->int_phyclk)
+			val |= SYSCFG_PMCR_ETH_REF_CLK_SEL;
+		pr_debug("SYSCFG init : PHY_INTERFACE_MODE_RMII\n");
+		break;
+	case PHY_INTERFACE_MODE_RGMII:
+		val = SYSCFG_PMCR_ETH_SEL_RGMII;
+		if (dwmac->int_phyclk)
+			val |= SYSCFG_PMCR_ETH_CLK_SEL;
+		pr_debug("SYSCFG init : PHY_INTERFACE_MODE_RGMII\n");
+		break;
+	default:
+		pr_debug("SYSCFG init :  Do not manage %d interface\n",
+			 plat_dat->interface);
+		/* Do not manage others interfaces */
+		return -EINVAL;
+	}
+
+	return regmap_update_bits(dwmac->regmap, reg,
+				 dwmac->ops->syscfg_eth_mask, val);
+}
+
+static int stm32mcu_set_mode(struct plat_stmmacenet_data *plat_dat)
+{
+	struct stm32_dwmac *dwmac = plat_dat->bsp_priv;
+	u32 reg = dwmac->mode_reg;
+	int val;
+
+	switch (plat_dat->interface) {
+	case PHY_INTERFACE_MODE_MII:
+		val = SYSCFG_MCU_ETH_SEL_MII;
+		pr_debug("SYSCFG init : PHY_INTERFACE_MODE_MII\n");
+		break;
+	case PHY_INTERFACE_MODE_RMII:
+		val = SYSCFG_MCU_ETH_SEL_RMII;
+		pr_debug("SYSCFG init : PHY_INTERFACE_MODE_RMII\n");
+		break;
+	default:
+		pr_debug("SYSCFG init :  Do not manage %d interface\n",
+			 plat_dat->interface);
+		/* Do not manage others interfaces */
+		return -EINVAL;
+	}
+
+	return regmap_update_bits(dwmac->regmap, reg,
+				 dwmac->ops->syscfg_eth_mask, val);
+}
+
 static void stm32_dwmac_clk_disable(struct stm32_dwmac *dwmac)
 {
 	clk_disable_unprepare(dwmac->clk_tx);
 	clk_disable_unprepare(dwmac->clk_rx);
+
+	if (dwmac->ops->clk_prepare)
+		dwmac->ops->clk_prepare(dwmac, false);
 }
 
 static int stm32_dwmac_parse_data(struct stm32_dwmac *dwmac,
@@ -70,15 +201,22 @@ static int stm32_dwmac_parse_data(struct stm32_dwmac *dwmac,
 	/*  Get TX/RX clocks */
 	dwmac->clk_tx = devm_clk_get(dev, "mac-clk-tx");
 	if (IS_ERR(dwmac->clk_tx)) {
-		dev_err(dev, "No tx clock provided...\n");
+		dev_err(dev, "No ETH Tx clock provided...\n");
 		return PTR_ERR(dwmac->clk_tx);
 	}
+
 	dwmac->clk_rx = devm_clk_get(dev, "mac-clk-rx");
 	if (IS_ERR(dwmac->clk_rx)) {
-		dev_err(dev, "No rx clock provided...\n");
+		dev_err(dev, "No ETH Rx clock provided...\n");
 		return PTR_ERR(dwmac->clk_rx);
 	}
 
+	if (dwmac->ops->parse_data) {
+		err = dwmac->ops->parse_data(dwmac, dev);
+		if (err)
+			return err;
+	}
+
 	/* Get mode register */
 	dwmac->regmap = syscon_regmap_lookup_by_phandle(np, "st,syscon");
 	if (IS_ERR(dwmac->regmap))
@@ -91,11 +229,46 @@ static int stm32_dwmac_parse_data(struct stm32_dwmac *dwmac,
 	return err;
 }
 
+static int stm32mp1_parse_data(struct stm32_dwmac *dwmac,
+			       struct device *dev)
+{
+	struct device_node *np = dev->of_node;
+
+	dwmac->int_phyclk = of_property_read_bool(np, "st,int-phyclk");
+
+	/* Check if internal clk from RCC selected */
+	if (dwmac->int_phyclk) {
+		/*  Get ETH_CLK clocks */
+		dwmac->clk_eth_ck = devm_clk_get(dev, "eth-ck");
+		if (IS_ERR(dwmac->clk_eth_ck)) {
+			dev_err(dev, "No ETH CK clock provided...\n");
+			return PTR_ERR(dwmac->clk_eth_ck);
+		}
+	}
+
+	/*  Clock used for low power mode */
+	dwmac->clk_ethstp = devm_clk_get(dev, "ethstp");
+	if (IS_ERR(dwmac->clk_ethstp)) {
+		dev_err(dev, "No ETH peripheral clock provided for CStop mode ...\n");
+		return PTR_ERR(dwmac->clk_ethstp);
+	}
+
+	/*  Clock for sysconfig */
+	dwmac->syscfg_clk = devm_clk_get(dev, "syscfg-clk");
+	if (IS_ERR(dwmac->syscfg_clk)) {
+		dev_err(dev, "No syscfg clock provided...\n");
+		return PTR_ERR(dwmac->syscfg_clk);
+	}
+
+	return 0;
+}
+
 static int stm32_dwmac_probe(struct platform_device *pdev)
 {
 	struct plat_stmmacenet_data *plat_dat;
 	struct stmmac_resources stmmac_res;
 	struct stm32_dwmac *dwmac;
+	const struct stm32_ops *data;
 	int ret;
 
 	ret = stmmac_get_platform_resources(pdev, &stmmac_res);
@@ -112,6 +285,16 @@ static int stm32_dwmac_probe(struct platform_device *pdev)
 		goto err_remove_config_dt;
 	}
 
+	data = of_device_get_match_data(&pdev->dev);
+	if (!data) {
+		dev_err(&pdev->dev, "no of match data provided\n");
+		ret = -EINVAL;
+		goto err_remove_config_dt;
+	}
+
+	dwmac->ops = data;
+	dwmac->dev = &pdev->dev;
+
 	ret = stm32_dwmac_parse_data(dwmac, &pdev->dev);
 	if (ret) {
 		dev_err(&pdev->dev, "Unable to parse OF data\n");
@@ -149,15 +332,48 @@ static int stm32_dwmac_remove(struct platform_device *pdev)
 	return ret;
 }
 
+static int stm32mp1_suspend(struct stm32_dwmac *dwmac)
+{
+	int ret = 0;
+
+	ret = clk_prepare_enable(dwmac->clk_ethstp);
+	if (ret)
+		return ret;
+
+	clk_disable_unprepare(dwmac->clk_tx);
+	clk_disable_unprepare(dwmac->syscfg_clk);
+	if (dwmac->int_phyclk)
+		clk_disable_unprepare(dwmac->clk_eth_ck);
+
+	return ret;
+}
+
+static void stm32mp1_resume(struct stm32_dwmac *dwmac)
+{
+	clk_disable_unprepare(dwmac->clk_ethstp);
+}
+
+static int stm32mcu_suspend(struct stm32_dwmac *dwmac)
+{
+	clk_disable_unprepare(dwmac->clk_tx);
+	clk_disable_unprepare(dwmac->clk_rx);
+
+	return 0;
+}
+
 #ifdef CONFIG_PM_SLEEP
 static int stm32_dwmac_suspend(struct device *dev)
 {
 	struct net_device *ndev = dev_get_drvdata(dev);
 	struct stmmac_priv *priv = netdev_priv(ndev);
+	struct stm32_dwmac *dwmac = priv->plat->bsp_priv;
+
 	int ret;
 
 	ret = stmmac_suspend(dev);
-	stm32_dwmac_clk_disable(priv->plat->bsp_priv);
+
+	if (dwmac->ops->suspend)
+		ret = dwmac->ops->suspend(dwmac);
 
 	return ret;
 }
@@ -166,8 +382,12 @@ static int stm32_dwmac_resume(struct device *dev)
 {
 	struct net_device *ndev = dev_get_drvdata(dev);
 	struct stmmac_priv *priv = netdev_priv(ndev);
+	struct stm32_dwmac *dwmac = priv->plat->bsp_priv;
 	int ret;
 
+	if (dwmac->ops->resume)
+		dwmac->ops->resume(dwmac);
+
 	ret = stm32_dwmac_init(priv->plat);
 	if (ret)
 		return ret;
@@ -181,8 +401,24 @@ static int stm32_dwmac_resume(struct device *dev)
 static SIMPLE_DEV_PM_OPS(stm32_dwmac_pm_ops,
 	stm32_dwmac_suspend, stm32_dwmac_resume);
 
+static struct stm32_ops stm32mcu_dwmac_data = {
+	.set_mode = stm32mcu_set_mode,
+	.suspend = stm32mcu_suspend,
+	.syscfg_eth_mask = SYSCFG_MCU_ETH_MASK
+};
+
+static struct stm32_ops stm32mp1_dwmac_data = {
+	.set_mode = stm32mp1_set_mode,
+	.clk_prepare = stm32mp1_clk_prepare,
+	.suspend = stm32mp1_suspend,
+	.resume = stm32mp1_resume,
+	.parse_data = stm32mp1_parse_data,
+	.syscfg_eth_mask = SYSCFG_MP1_ETH_MASK
+};
+
 static const struct of_device_id stm32_dwmac_match[] = {
-	{ .compatible = "st,stm32-dwmac"},
+	{ .compatible = "st,stm32-dwmac", .data = &stm32mcu_dwmac_data},
+	{ .compatible = "st,stm32mp1-dwmac", .data = &stm32mp1_dwmac_data},
 	{ }
 };
 MODULE_DEVICE_TABLE(of, stm32_dwmac_match);
@@ -199,5 +435,6 @@ static struct platform_driver stm32_dwmac_driver = {
 module_platform_driver(stm32_dwmac_driver);
 
 MODULE_AUTHOR("Alexandre Torgue <alexandre.torgue@gmail.com>");
-MODULE_DESCRIPTION("STMicroelectronics MCU DWMAC Specific Glue layer");
+MODULE_AUTHOR("Christophe Roullier <christophe.roullier@st.com>");
+MODULE_DESCRIPTION("STMicroelectronics STM32 DWMAC Specific Glue layer");
 MODULE_LICENSE("GPL v2");
-- 
cgit v1.2.3-59-g8ed1b


From 1f3feacbfd48a3fa740e3561c9b7cbc8cf694d1f Mon Sep 17 00:00:00 2001
From: Christophe Roullier <christophe.roullier@st.com>
Date: Fri, 25 May 2018 09:46:39 +0200
Subject: dt-bindings: stm32-dwmac: add support of MPU families

Add description for Ethernet MPU families fields

Signed-off-by: Christophe Roullier <christophe.roullier@st.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/devicetree/bindings/net/stm32-dwmac.txt | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/stm32-dwmac.txt b/Documentation/devicetree/bindings/net/stm32-dwmac.txt
index 489dbcb66c5a..1341012722aa 100644
--- a/Documentation/devicetree/bindings/net/stm32-dwmac.txt
+++ b/Documentation/devicetree/bindings/net/stm32-dwmac.txt
@@ -6,14 +6,28 @@ Please see stmmac.txt for the other unchanged properties.
 The device node has following properties.
 
 Required properties:
-- compatible:  Should be "st,stm32-dwmac" to select glue, and
+- compatible:  For MCU family should be "st,stm32-dwmac" to select glue, and
 	       "snps,dwmac-3.50a" to select IP version.
+	       For MPU family should be "st,stm32mp1-dwmac" to select
+	       glue, and "snps,dwmac-4.20a" to select IP version.
 - clocks: Must contain a phandle for each entry in clock-names.
 - clock-names: Should be "stmmaceth" for the host clock.
 	       Should be "mac-clk-tx" for the MAC TX clock.
 	       Should be "mac-clk-rx" for the MAC RX clock.
+	       For MPU family need to add also "ethstp" for power mode clock and,
+	                                       "syscfg-clk" for SYSCFG clock.
+- interrupt-names: Should contain a list of interrupt names corresponding to
+           the interrupts in the interrupts property, if available.
+		   Should be "macirq" for the main MAC IRQ
+		   Should be "eth_wake_irq" for the IT which wake up system
 - st,syscon : Should be phandle/offset pair. The phandle to the syscon node which
-	      encompases the glue register, and the offset of the control register.
+	       encompases the glue register, and the offset of the control register.
+
+Optional properties:
+- clock-names:     For MPU family "mac-clk-ck" for PHY without quartz
+- st,int-phyclk (boolean) :  valid only where PHY do not have quartz and need to be clock
+	           by RCC
+
 Example:
 
 	ethernet@40028000 {
-- 
cgit v1.2.3-59-g8ed1b


From 026e57585ff1598499bba119fe57306b83011f95 Mon Sep 17 00:00:00 2001
From: Christophe Roullier <christophe.roullier@st.com>
Date: Fri, 25 May 2018 09:46:40 +0200
Subject: net: stmmac: add dwmac-4.20a compatible

Manage dwmac-4.20a version from synopsys

Signed-off-by: Christophe Roullier <christophe.roullier@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index ebd3e5ffa73c..6d141f3931eb 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -472,7 +472,8 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac)
 	}
 
 	if (of_device_is_compatible(np, "snps,dwmac-4.00") ||
-	    of_device_is_compatible(np, "snps,dwmac-4.10a")) {
+	    of_device_is_compatible(np, "snps,dwmac-4.10a") ||
+	    of_device_is_compatible(np, "snps,dwmac-4.20a")) {
 		plat->has_gmac4 = 1;
 		plat->has_gmac = 0;
 		plat->pmt = 1;
-- 
cgit v1.2.3-59-g8ed1b


From 1f809b47e570fe420adc93ba55712a8f8f6d5be7 Mon Sep 17 00:00:00 2001
From: Christophe Roullier <christophe.roullier@st.com>
Date: Fri, 25 May 2018 09:46:41 +0200
Subject: dt-bindings: stm32: add compatible for syscon

This patch describes syscon DT bindings.

Signed-off-by: Christophe Roullier <christophe.roullier@st.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/devicetree/bindings/arm/stm32.txt            | 10 ----------
 .../devicetree/bindings/arm/stm32/stm32-syscon.txt         | 14 ++++++++++++++
 Documentation/devicetree/bindings/arm/stm32/stm32.txt      | 10 ++++++++++
 3 files changed, 24 insertions(+), 10 deletions(-)
 delete mode 100644 Documentation/devicetree/bindings/arm/stm32.txt
 create mode 100644 Documentation/devicetree/bindings/arm/stm32/stm32-syscon.txt
 create mode 100644 Documentation/devicetree/bindings/arm/stm32/stm32.txt

diff --git a/Documentation/devicetree/bindings/arm/stm32.txt b/Documentation/devicetree/bindings/arm/stm32.txt
deleted file mode 100644
index 6808ed9ddfd5..000000000000
--- a/Documentation/devicetree/bindings/arm/stm32.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-STMicroelectronics STM32 Platforms Device Tree Bindings
-
-Each device tree must specify which STM32 SoC it uses,
-using one of the following compatible strings:
-
-  st,stm32f429
-  st,stm32f469
-  st,stm32f746
-  st,stm32h743
-  st,stm32mp157
diff --git a/Documentation/devicetree/bindings/arm/stm32/stm32-syscon.txt b/Documentation/devicetree/bindings/arm/stm32/stm32-syscon.txt
new file mode 100644
index 000000000000..99980aee26e5
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/stm32/stm32-syscon.txt
@@ -0,0 +1,14 @@
+STMicroelectronics STM32 Platforms System Controller
+
+Properties:
+   - compatible : should contain two values. First value must be :
+                 - " st,stm32mp157-syscfg " - for stm32mp157 based SoCs,
+                 second value must be always "syscon".
+   - reg : offset and length of the register set.
+
+ Example:
+         syscfg: syscon@50020000 {
+                 compatible = "st,stm32mp157-syscfg", "syscon";
+                 reg = <0x50020000 0x400>;
+         };
+
diff --git a/Documentation/devicetree/bindings/arm/stm32/stm32.txt b/Documentation/devicetree/bindings/arm/stm32/stm32.txt
new file mode 100644
index 000000000000..6808ed9ddfd5
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/stm32/stm32.txt
@@ -0,0 +1,10 @@
+STMicroelectronics STM32 Platforms Device Tree Bindings
+
+Each device tree must specify which STM32 SoC it uses,
+using one of the following compatible strings:
+
+  st,stm32f429
+  st,stm32f469
+  st,stm32f746
+  st,stm32h743
+  st,stm32mp157
-- 
cgit v1.2.3-59-g8ed1b


From 102cd909635612c0be784a519651954a7924c786 Mon Sep 17 00:00:00 2001
From: Bjørn Mork <bjorn@mork.no>
Date: Fri, 25 May 2018 15:00:20 +0200
Subject: qmi_wwan: apply SET_DTR quirk to the SIMCOM shared device ID
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SIMCOM are reusing a single device ID for many (all of their?)
different modems, based on different chipsets and firmwares. Newer
Qualcomm chipset generations require setting DTR to wake the QMI
function.  The SIM7600E modem is using such a chipset, making it
fail to work with this driver despite the device ID match.

Fix by unconditionally enabling the SET_DTR quirk for all SIMCOM
modems using this specific device ID.  This is similar to what
we already have done for another case of device IDs recycled over
multiple chipset generations: 14cf4a771b30 ("drivers: net: usb:
qmi_wwan: add QMI_QUIRK_SET_DTR for Telit PID 0x1201")

Initial testing on an older SIM7100 modem shows no immediate side
effects.

Reported-by: Sebastian Sjoholm <sebastian.sjoholm@gmail.com>
Cc: Reinhard Speyerer <rspmn@arcor.de>
Signed-off-by: Bjørn Mork <bjorn@mork.no>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/qmi_wwan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index 42565dd33aa6..148e78f8b48c 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -1248,7 +1248,7 @@ static const struct usb_device_id products[] = {
 	{QMI_FIXED_INTF(0x03f0, 0x4e1d, 8)},	/* HP lt4111 LTE/EV-DO/HSPA+ Gobi 4G Module */
 	{QMI_FIXED_INTF(0x03f0, 0x9d1d, 1)},	/* HP lt4120 Snapdragon X5 LTE */
 	{QMI_FIXED_INTF(0x22de, 0x9061, 3)},	/* WeTelecom WPD-600N */
-	{QMI_FIXED_INTF(0x1e0e, 0x9001, 5)},	/* SIMCom 7230E */
+	{QMI_QUIRK_SET_DTR(0x1e0e, 0x9001, 5)},	/* SIMCom 7100E, 7230E, 7600E ++ */
 	{QMI_QUIRK_SET_DTR(0x2c7c, 0x0125, 4)},	/* Quectel EC25, EC20 R2.0  Mini PCIe */
 	{QMI_QUIRK_SET_DTR(0x2c7c, 0x0121, 4)},	/* Quectel EC21 Mini PCIe */
 	{QMI_FIXED_INTF(0x2c7c, 0x0296, 4)},	/* Quectel BG96 */
-- 
cgit v1.2.3-59-g8ed1b


From e9be0e993d95adbe5efe0e0f03b2a3e71f5bb2b6 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 25 May 2018 16:28:44 +0200
Subject: net: sched: shrink struct Qdisc

The struct Qdisc has a lot of holes, especially after commit
a53851e2c321 ("net: sched: explicit locking in gso_cpu fallback"),
which as a side effect, moved the fields just after 'busylock'
on a new cacheline.

Since both 'padded' and 'refcnt' are not updated frequently, and
there is a hole before 'gso_skb', we can move such fields there,
saving a cacheline without any performance side effect.

Before this commit:

pahole -C Qdisc net/sche/sch_generic.o
	# ...
        /* size: 384, cachelines: 6, members: 25 */
        /* sum members: 236, holes: 3, sum holes: 92 */
        /* padding: 56 */

After this commit:
pahole -C Qdisc net/sche/sch_generic.o
	# ...
	/* size: 320, cachelines: 5, members: 25 */
	/* sum members: 236, holes: 2, sum holes: 28 */
	/* padding: 56 */

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 98c10a28cd01..827a3711dc68 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -85,6 +85,8 @@ struct Qdisc {
 	struct net_rate_estimator __rcu *rate_est;
 	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
 	struct gnet_stats_queue	__percpu *cpu_qstats;
+	int			padded;
+	refcount_t		refcnt;
 
 	/*
 	 * For performance sake on SMP, we put highly modified fields at the end
@@ -97,8 +99,6 @@ struct Qdisc {
 	unsigned long		state;
 	struct Qdisc            *next_sched;
 	struct sk_buff_head	skb_bad_txq;
-	int			padded;
-	refcount_t		refcnt;
 
 	spinlock_t		busylock ____cacheline_aligned_in_smp;
 	spinlock_t		seqlock;
-- 
cgit v1.2.3-59-g8ed1b


From 846fcc83638f8908a465583fa93f4d6f14161420 Mon Sep 17 00:00:00 2001
From: Peng Li <lipeng321@huawei.com>
Date: Fri, 25 May 2018 19:42:56 +0100
Subject: net: hns3: Updates RX packet info fetch in case of multi BD

In the latest revision of the hardware, if a packet is spanning
across multiple BDs then only VLD bit and current data size info
is valid in each BD, and rest of the information is only valid
in the last BD of the packet. In such case we should make sure
we are fetching RX packet size from the first descriptor and
information like VLAN should be fetched from last BD.

Signed-off-by: Peng Li <lipeng321@huawei.com>
Reviewed-by: Yisen Zhuang <yisen.zhuang@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 36 ++++++++++++-------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index cac51954f2cf..ae8d7493c813 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -2085,9 +2085,8 @@ static int hns3_handle_rx_bd(struct hns3_enet_ring *ring,
 
 	prefetch(desc);
 
-	length = le16_to_cpu(desc->rx.pkt_len);
+	length = le16_to_cpu(desc->rx.size);
 	bd_base_info = le32_to_cpu(desc->rx.bd_base_info);
-	l234info = le32_to_cpu(desc->rx.l234_info);
 
 	/* Check valid BD */
 	if (!hnae_get_bit(bd_base_info, HNS3_RXD_VLD_B))
@@ -2121,22 +2120,6 @@ static int hns3_handle_rx_bd(struct hns3_enet_ring *ring,
 
 	prefetchw(skb->data);
 
-	/* Based on hw strategy, the tag offloaded will be stored at
-	 * ot_vlan_tag in two layer tag case, and stored at vlan_tag
-	 * in one layer tag case.
-	 */
-	if (netdev->features & NETIF_F_HW_VLAN_CTAG_RX) {
-		u16 vlan_tag;
-
-		vlan_tag = le16_to_cpu(desc->rx.ot_vlan_tag);
-		if (!(vlan_tag & VLAN_VID_MASK))
-			vlan_tag = le16_to_cpu(desc->rx.vlan_tag);
-		if (vlan_tag & VLAN_VID_MASK)
-			__vlan_hwaccel_put_tag(skb,
-					       htons(ETH_P_8021Q),
-					       vlan_tag);
-	}
-
 	bnum = 1;
 	if (length <= HNS3_RX_HEAD_SIZE) {
 		memcpy(__skb_put(skb, length), va, ALIGN(length, sizeof(long)));
@@ -2172,6 +2155,23 @@ static int hns3_handle_rx_bd(struct hns3_enet_ring *ring,
 	}
 
 	*out_bnum = bnum;
+	/* Based on hw strategy, the tag offloaded will be stored at
+	 * ot_vlan_tag in two layer tag case, and stored at vlan_tag
+	 * in one layer tag case.
+	 */
+	if (netdev->features & NETIF_F_HW_VLAN_CTAG_RX) {
+		u16 vlan_tag;
+
+		vlan_tag = le16_to_cpu(desc->rx.ot_vlan_tag);
+		if (!(vlan_tag & VLAN_VID_MASK))
+			vlan_tag = le16_to_cpu(desc->rx.vlan_tag);
+		if (vlan_tag & VLAN_VID_MASK)
+			__vlan_hwaccel_put_tag(skb,
+					       htons(ETH_P_8021Q),
+					       vlan_tag);
+	}
+
+	l234info = le32_to_cpu(desc->rx.l234_info);
 
 	if (unlikely(!hnae_get_bit(bd_base_info, HNS3_RXD_VLD_B))) {
 		netdev_err(netdev, "no valid bd,%016llx,%016llx\n",
-- 
cgit v1.2.3-59-g8ed1b


From dcb35ccef85e51cf3ad36acd08c07ec8d9ff9e2a Mon Sep 17 00:00:00 2001
From: Peng Li <lipeng321@huawei.com>
Date: Fri, 25 May 2018 19:42:57 +0100
Subject: net: hns3: Add support for tx_accept_tag2 and tx_accept_untag2 config

HNS3 Hardware can support up to two VLAN tags in transmit leg, the PPP
module can handle the packets based on the tag1 and tag2 config. This
patch adds support for tag2 config for vlan handling

Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h |  7 ++++--
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    | 26 +++++++++++++++++-----
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h    |  6 +++--
 3 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
index ee3cbac6dfaa..3fa08f780b06 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
@@ -704,11 +704,14 @@ struct hclge_vlan_filter_vf_cfg_cmd {
 	u8  vf_bitmap[16];
 };
 
-#define HCLGE_ACCEPT_TAG_B		0
-#define HCLGE_ACCEPT_UNTAG_B		1
+#define HCLGE_ACCEPT_TAG1_B		0
+#define HCLGE_ACCEPT_UNTAG1_B		1
 #define HCLGE_PORT_INS_TAG1_EN_B	2
 #define HCLGE_PORT_INS_TAG2_EN_B	3
 #define HCLGE_CFG_NIC_ROCE_SEL_B	4
+#define HCLGE_ACCEPT_TAG2_B		5
+#define HCLGE_ACCEPT_UNTAG2_B		6
+
 struct hclge_vport_vtag_tx_cfg_cmd {
 	u8 vport_vlan_cfg;
 	u8 vf_offset;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 2f0bbb6708b9..c0b8d5a32bde 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -4687,10 +4687,14 @@ static int hclge_set_vlan_tx_offload_cfg(struct hclge_vport *vport)
 	req = (struct hclge_vport_vtag_tx_cfg_cmd *)desc.data;
 	req->def_vlan_tag1 = cpu_to_le16(vcfg->default_tag1);
 	req->def_vlan_tag2 = cpu_to_le16(vcfg->default_tag2);
-	hnae_set_bit(req->vport_vlan_cfg, HCLGE_ACCEPT_TAG_B,
-		     vcfg->accept_tag ? 1 : 0);
-	hnae_set_bit(req->vport_vlan_cfg, HCLGE_ACCEPT_UNTAG_B,
-		     vcfg->accept_untag ? 1 : 0);
+	hnae_set_bit(req->vport_vlan_cfg, HCLGE_ACCEPT_TAG1_B,
+			vcfg->accept_tag1 ? 1 : 0);
+	hnae_set_bit(req->vport_vlan_cfg, HCLGE_ACCEPT_UNTAG1_B,
+			vcfg->accept_untag1 ? 1 : 0);
+	hnae_set_bit(req->vport_vlan_cfg, HCLGE_ACCEPT_TAG2_B,
+			vcfg->accept_tag2 ? 1 : 0);
+	hnae_set_bit(req->vport_vlan_cfg, HCLGE_ACCEPT_UNTAG2_B,
+			vcfg->accept_untag2 ? 1 : 0);
 	hnae_set_bit(req->vport_vlan_cfg, HCLGE_PORT_INS_TAG1_EN_B,
 		     vcfg->insert_tag1_en ? 1 : 0);
 	hnae_set_bit(req->vport_vlan_cfg, HCLGE_PORT_INS_TAG2_EN_B,
@@ -4814,8 +4818,18 @@ static int hclge_init_vlan_config(struct hclge_dev *hdev)
 
 	for (i = 0; i < hdev->num_alloc_vport; i++) {
 		vport = &hdev->vport[i];
-		vport->txvlan_cfg.accept_tag = true;
-		vport->txvlan_cfg.accept_untag = true;
+		vport->txvlan_cfg.accept_tag1 = true;
+		vport->txvlan_cfg.accept_untag1 = true;
+
+		/* accept_tag2 and accept_untag2 are not supported on
+		 * pdev revision(0x20), new revision support them. The
+		 * value of this two fields will not return error when driver
+		 * send command to fireware in revision(0x20).
+		 * This two fields can not configured by user.
+		 */
+		vport->txvlan_cfg.accept_tag2 = true;
+		vport->txvlan_cfg.accept_untag2 = true;
+
 		vport->txvlan_cfg.insert_tag1_en = false;
 		vport->txvlan_cfg.insert_tag2_en = false;
 		vport->txvlan_cfg.default_tag1 = 0;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
index 93177d91eea4..677f1e4521c5 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
@@ -570,8 +570,10 @@ struct hclge_dev {
 
 /* VPort level vlan tag configuration for TX direction */
 struct hclge_tx_vtag_cfg {
-	bool accept_tag;	/* Whether accept tagged packet from host */
-	bool accept_untag;	/* Whether accept untagged packet from host */
+	bool accept_tag1;	/* Whether accept tag1 packet from host */
+	bool accept_untag1;	/* Whether accept untag1 packet from host */
+	bool accept_tag2;
+	bool accept_untag2;
 	bool insert_tag1_en;	/* Whether insert inner vlan tag */
 	bool insert_tag2_en;	/* Whether insert outer vlan tag */
 	u16  default_tag1;	/* The default inner vlan tag to insert */
-- 
cgit v1.2.3-59-g8ed1b


From 5b5455a9ed5a9d7c26313a353bc2c3c1f6e1f5cb Mon Sep 17 00:00:00 2001
From: Peng Li <lipeng321@huawei.com>
Date: Fri, 25 May 2018 19:42:58 +0100
Subject: net: hns3: Add STRP_TAGP field support for hardware revision 0x21

Hardware Revision(0x21) Buffer Descriptor adds a field STRP_TAGP
for vlan stripped processed indication. STRP_TAGP field has 2 bits,
bit 0 is stripped indication of the vlan tag in outer vlan tag
field, bit 1 is stripped indication of the vlan tag in inner vlan
tag field. For each bit, 0 indicates the tag is not stripped and
1 indicates the tag is stripped.

This patch adds STRP_TAGP support for revision(0x21), and does not
change the revision(0x20) action.

Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 42 ++++++++++++++++++++++---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.h |  3 ++
 2 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index ae8d7493c813..1bcb676f459c 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -2066,6 +2066,39 @@ static void hns3_rx_skb(struct hns3_enet_ring *ring, struct sk_buff *skb)
 	napi_gro_receive(&ring->tqp_vector->napi, skb);
 }
 
+static u16 hns3_parse_vlan_tag(struct hns3_enet_ring *ring,
+			       struct hns3_desc *desc, u32 l234info)
+{
+	struct pci_dev *pdev = ring->tqp->handle->pdev;
+	u16 vlan_tag;
+
+	if (pdev->revision == 0x20) {
+		vlan_tag = le16_to_cpu(desc->rx.ot_vlan_tag);
+		if (!(vlan_tag & VLAN_VID_MASK))
+			vlan_tag = le16_to_cpu(desc->rx.vlan_tag);
+
+		return vlan_tag;
+	}
+
+#define HNS3_STRP_OUTER_VLAN	0x1
+#define HNS3_STRP_INNER_VLAN	0x2
+
+	switch (hnae_get_field(l234info, HNS3_RXD_STRP_TAGP_M,
+			       HNS3_RXD_STRP_TAGP_S)) {
+	case HNS3_STRP_OUTER_VLAN:
+		vlan_tag = le16_to_cpu(desc->rx.ot_vlan_tag);
+		break;
+	case HNS3_STRP_INNER_VLAN:
+		vlan_tag = le16_to_cpu(desc->rx.vlan_tag);
+		break;
+	default:
+		vlan_tag = 0;
+		break;
+	}
+
+	return vlan_tag;
+}
+
 static int hns3_handle_rx_bd(struct hns3_enet_ring *ring,
 			     struct sk_buff **out_skb, int *out_bnum)
 {
@@ -2155,6 +2188,9 @@ static int hns3_handle_rx_bd(struct hns3_enet_ring *ring,
 	}
 
 	*out_bnum = bnum;
+
+	l234info = le32_to_cpu(desc->rx.l234_info);
+
 	/* Based on hw strategy, the tag offloaded will be stored at
 	 * ot_vlan_tag in two layer tag case, and stored at vlan_tag
 	 * in one layer tag case.
@@ -2162,17 +2198,13 @@ static int hns3_handle_rx_bd(struct hns3_enet_ring *ring,
 	if (netdev->features & NETIF_F_HW_VLAN_CTAG_RX) {
 		u16 vlan_tag;
 
-		vlan_tag = le16_to_cpu(desc->rx.ot_vlan_tag);
-		if (!(vlan_tag & VLAN_VID_MASK))
-			vlan_tag = le16_to_cpu(desc->rx.vlan_tag);
+		vlan_tag = hns3_parse_vlan_tag(ring, desc, l234info);
 		if (vlan_tag & VLAN_VID_MASK)
 			__vlan_hwaccel_put_tag(skb,
 					       htons(ETH_P_8021Q),
 					       vlan_tag);
 	}
 
-	l234info = le32_to_cpu(desc->rx.l234_info);
-
 	if (unlikely(!hnae_get_bit(bd_base_info, HNS3_RXD_VLD_B))) {
 		netdev_err(netdev, "no valid bd,%016llx,%016llx\n",
 			   ((u64 *)desc)[0], ((u64 *)desc)[1]);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
index 5b40f5a53761..38e91ca71e07 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
@@ -104,6 +104,9 @@ enum hns3_nic_state {
 #define HNS3_RXD_L4ID_S				8
 #define HNS3_RXD_L4ID_M				(0xf << HNS3_RXD_L4ID_S)
 #define HNS3_RXD_FRAG_B				12
+#define HNS3_RXD_STRP_TAGP_S			13
+#define HNS3_RXD_STRP_TAGP_M			(0x3 << HNS3_RXD_STRP_TAGP_S)
+
 #define HNS3_RXD_L2E_B				16
 #define HNS3_RXD_L3E_B				17
 #define HNS3_RXD_L4E_B				18
-- 
cgit v1.2.3-59-g8ed1b


From 96c0e8614eba889d8beb00e9d74168c1c607908b Mon Sep 17 00:00:00 2001
From: Peng Li <lipeng321@huawei.com>
Date: Fri, 25 May 2018 19:42:59 +0100
Subject: net: hns3: Add support to enable TX/RX promisc mode for H/W rev(0x21)

HCLGE_PROMISC_TX_EN_B and HCLGE_PROMISC_RX_EN_B are not supported
on pdev revision(0x20), new revision(0x21) supports them.

Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h  | 2 ++
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 9 ++++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
index 3fa08f780b06..e9be6aa88b65 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
@@ -484,6 +484,8 @@ struct hclge_promisc_param {
 	u8 enable;
 };
 
+#define HCLGE_PROMISC_TX_EN_B	BIT(4)
+#define HCLGE_PROMISC_RX_EN_B	BIT(5)
 #define HCLGE_PROMISC_EN_B	1
 #define HCLGE_PROMISC_EN_ALL	0x7
 #define HCLGE_PROMISC_EN_UC	0x1
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index c0b8d5a32bde..8ad8f62dc501 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -3586,7 +3586,14 @@ int hclge_cmd_set_promisc_mode(struct hclge_dev *hdev,
 
 	req = (struct hclge_promisc_cfg_cmd *)desc.data;
 	req->vf_id = param->vf_id;
-	req->flag = (param->enable << HCLGE_PROMISC_EN_B);
+
+	/* HCLGE_PROMISC_TX_EN_B and HCLGE_PROMISC_RX_EN_B are not supported on
+	 * pdev revision(0x20), new revision support them. The
+	 * value of this two fields will not return error when driver
+	 * send command to fireware in revision(0x20).
+	 */
+	req->flag = (param->enable << HCLGE_PROMISC_EN_B) |
+		HCLGE_PROMISC_TX_EN_B | HCLGE_PROMISC_RX_EN_B;
 
 	ret = hclge_cmd_send(&hdev->hw, &desc, 1);
 	if (ret) {
-- 
cgit v1.2.3-59-g8ed1b


From 7c4bfcb0556bc8e1cd86b25db0a5058b49f5a085 Mon Sep 17 00:00:00 2001
From: Xi Wang <wangxi11@huawei.com>
Date: Fri, 25 May 2018 19:43:00 +0100
Subject: net: hns3: Fix for PF mailbox receving unknown message

Before the firmware updates the crq's tail pointer, if the PF driver
reads the data in the crq, the data may be incomplete at this time,
which will lead to the driver read an unknown message.

This patch fixes it by checking if crq is not empty before reading the
message.

Fixes: c1a81619d73a ("net: hns3: Add mailbox interrupt handling to PF driver")
Signed-off-by: Xi Wang <wangxi11@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c | 23 +++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
index b6ae26ba0a46..31f3d9a43d8d 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
@@ -382,6 +382,13 @@ static void hclge_reset_vf(struct hclge_vport *vport,
 	hclge_func_reset_cmd(hdev, mbx_req->mbx_src_vfid);
 }
 
+static bool hclge_cmd_crq_empty(struct hclge_hw *hw)
+{
+	u32 tail = hclge_read_dev(hw, HCLGE_NIC_CRQ_TAIL_REG);
+
+	return tail == hw->cmq.crq.next_to_use;
+}
+
 void hclge_mbx_handler(struct hclge_dev *hdev)
 {
 	struct hclge_cmq_ring *crq = &hdev->hw.cmq.crq;
@@ -390,12 +397,23 @@ void hclge_mbx_handler(struct hclge_dev *hdev)
 	struct hclge_desc *desc;
 	int ret, flag;
 
-	flag = le16_to_cpu(crq->desc[crq->next_to_use].flag);
 	/* handle all the mailbox requests in the queue */
-	while (hnae_get_bit(flag, HCLGE_CMDQ_RX_OUTVLD_B)) {
+	while (!hclge_cmd_crq_empty(&hdev->hw)) {
 		desc = &crq->desc[crq->next_to_use];
 		req = (struct hclge_mbx_vf_to_pf_cmd *)desc->data;
 
+		flag = le16_to_cpu(crq->desc[crq->next_to_use].flag);
+		if (unlikely(!hnae_get_bit(flag, HCLGE_CMDQ_RX_OUTVLD_B))) {
+			dev_warn(&hdev->pdev->dev,
+				 "dropped invalid mailbox message, code = %d\n",
+				 req->msg[0]);
+
+			/* dropping/not processing this invalid message */
+			crq->desc[crq->next_to_use].flag = 0;
+			hclge_mbx_ring_ptr_move_crq(crq);
+			continue;
+		}
+
 		vport = &hdev->vport[req->mbx_src_vfid];
 
 		switch (req->msg[0]) {
@@ -470,7 +488,6 @@ void hclge_mbx_handler(struct hclge_dev *hdev)
 		}
 		crq->desc[crq->next_to_use].flag = 0;
 		hclge_mbx_ring_ptr_move_crq(crq);
-		flag = le16_to_cpu(crq->desc[crq->next_to_use].flag);
 	}
 
 	/* Write back CMDQ_RQ header pointer, M7 need this pointer */
-- 
cgit v1.2.3-59-g8ed1b


From 90b99b094b4bd270542e3ade8cf74e21c228c069 Mon Sep 17 00:00:00 2001
From: Peng Li <lipeng321@huawei.com>
Date: Fri, 25 May 2018 19:43:01 +0100
Subject: net: hns3: Fixes the state to indicate client-type initialization

HNAE3 module supports kernel nic driver, user nic driver and roce driver,
and there are 3 client types. Driver uses one bit(HNAE3_CLIENT_INITED_B)
to indicate the client initialization state, it will cause confusion
for 3 client types. This patch fixes it by use 3 bits to indicate the
initialization state.

Fixes: 38caee9d3ee8 ("net: hns3: Add support of the HNAE3 framework")
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hnae3.c | 49 +++++++++++++++++++++++++++--
 drivers/net/ethernet/hisilicon/hns3/hnae3.h |  4 ++-
 2 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.c b/drivers/net/ethernet/hisilicon/hns3/hnae3.c
index 63d7dbfb90bf..9d79dad2c6aa 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.c
@@ -36,6 +36,49 @@ static bool hnae3_client_match(enum hnae3_client_type client_type,
 	return false;
 }
 
+static void hnae3_set_client_init_flag(struct hnae3_client *client,
+				       struct hnae3_ae_dev *ae_dev, int inited)
+{
+	switch (client->type) {
+	case HNAE3_CLIENT_KNIC:
+		hnae_set_bit(ae_dev->flag, HNAE3_KNIC_CLIENT_INITED_B, inited);
+		break;
+	case HNAE3_CLIENT_UNIC:
+		hnae_set_bit(ae_dev->flag, HNAE3_UNIC_CLIENT_INITED_B, inited);
+		break;
+	case HNAE3_CLIENT_ROCE:
+		hnae_set_bit(ae_dev->flag, HNAE3_ROCE_CLIENT_INITED_B, inited);
+		break;
+	default:
+		break;
+	}
+}
+
+static int hnae3_get_client_init_flag(struct hnae3_client *client,
+				       struct hnae3_ae_dev *ae_dev)
+{
+	int inited = 0;
+
+	switch (client->type) {
+	case HNAE3_CLIENT_KNIC:
+		inited = hnae_get_bit(ae_dev->flag,
+				       HNAE3_KNIC_CLIENT_INITED_B);
+		break;
+	case HNAE3_CLIENT_UNIC:
+		inited = hnae_get_bit(ae_dev->flag,
+				       HNAE3_UNIC_CLIENT_INITED_B);
+		break;
+	case HNAE3_CLIENT_ROCE:
+		inited = hnae_get_bit(ae_dev->flag,
+				      HNAE3_ROCE_CLIENT_INITED_B);
+		break;
+	default:
+		break;
+	}
+
+	return inited;
+}
+
 static int hnae3_match_n_instantiate(struct hnae3_client *client,
 				     struct hnae3_ae_dev *ae_dev, bool is_reg)
 {
@@ -56,14 +99,14 @@ static int hnae3_match_n_instantiate(struct hnae3_client *client,
 			return ret;
 		}
 
-		hnae_set_bit(ae_dev->flag, HNAE3_CLIENT_INITED_B, 1);
+		hnae3_set_client_init_flag(client, ae_dev, 1);
 		return 0;
 	}
 
-	if (hnae_get_bit(ae_dev->flag, HNAE3_CLIENT_INITED_B)) {
+	if (hnae3_get_client_init_flag(client, ae_dev)) {
 		ae_dev->ops->uninit_client_instance(client, ae_dev);
 
-		hnae_set_bit(ae_dev->flag, HNAE3_CLIENT_INITED_B, 0);
+		hnae3_set_client_init_flag(client, ae_dev, 0);
 	}
 
 	return 0;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
index 45c571eea2ae..f250c592a218 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -54,7 +54,9 @@
 #define HNAE3_DEV_INITED_B			0x0
 #define HNAE3_DEV_SUPPORT_ROCE_B		0x1
 #define HNAE3_DEV_SUPPORT_DCB_B			0x2
-#define HNAE3_CLIENT_INITED_B			0x3
+#define HNAE3_KNIC_CLIENT_INITED_B		0x3
+#define HNAE3_UNIC_CLIENT_INITED_B		0x4
+#define HNAE3_ROCE_CLIENT_INITED_B		0x5
 
 #define HNAE3_DEV_SUPPORT_ROCE_DCB_BITS (BIT(HNAE3_DEV_SUPPORT_DCB_B) |\
 		BIT(HNAE3_DEV_SUPPORT_ROCE_B))
-- 
cgit v1.2.3-59-g8ed1b


From 7d0b130cbbfa4651cc1ab9268a2956c1b9d82ff9 Mon Sep 17 00:00:00 2001
From: Fuyun Liang <liangfuyun1@huawei.com>
Date: Fri, 25 May 2018 19:43:02 +0100
Subject: net: hns3: Fixes the init of the VALID BD info in the descriptor

RX Buffer Descriptor contains a VALID bit which indicates if the BD
is valid and has some data. This field is set by HNS3 hardware to
intimate the driver of some valid data present in the BD. nd should
be reset by the driver when BD is being used again. In the existing
code this bit was not being (re-)initialized properly and hence was
causing problems.

Fixes: 76ad4f0ee747 ("net: hns3: Add support of HNS3 Ethernet Driver for hip08 SoC")
Signed-off-by: Fuyun Liang <liangfuyun1@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 1bcb676f459c..d1ef104d6478 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -1819,6 +1819,7 @@ static void hns3_replace_buffer(struct hns3_enet_ring *ring, int i,
 	hns3_unmap_buffer(ring, &ring->desc_cb[i]);
 	ring->desc_cb[i] = *res_cb;
 	ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma);
+	ring->desc[i].rx.bd_base_info = 0;
 }
 
 static void hns3_reuse_buffer(struct hns3_enet_ring *ring, int i)
@@ -1826,6 +1827,7 @@ static void hns3_reuse_buffer(struct hns3_enet_ring *ring, int i)
 	ring->desc_cb[i].reuse_flag = 0;
 	ring->desc[i].addr = cpu_to_le64(ring->desc_cb[i].dma
 		+ ring->desc_cb[i].page_offset);
+	ring->desc[i].rx.bd_base_info = 0;
 }
 
 static void hns3_nic_reclaim_one_desc(struct hns3_enet_ring *ring, int *bytes,
-- 
cgit v1.2.3-59-g8ed1b


From 6986df9686c55464b0fa0f36a02f4da81d99a3b1 Mon Sep 17 00:00:00 2001
From: Fuyun Liang <liangfuyun1@huawei.com>
Date: Fri, 25 May 2018 19:43:03 +0100
Subject: net: hns3: Removes unnecessary check when clearing TX/RX rings

Our code will ensure that hns3_clear_tx_ring is not used to cleared
RX rings and hns3_clear_rx_ring is not used to cleared TX rings. So
the ring type check is unnecessary.

Fixes: 76ad4f0ee747 ("net: hns3: Add support of HNS3 Ethernet Driver for hip08 SoC")
Signed-off-by: Fuyun Liang <liangfuyun1@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index d1ef104d6478..4c13c527d3e9 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -3217,9 +3217,6 @@ static void hns3_recover_hw_addr(struct net_device *ndev)
 
 static void hns3_clear_tx_ring(struct hns3_enet_ring *ring)
 {
-	if (!HNAE3_IS_TX_RING(ring))
-		return;
-
 	while (ring->next_to_clean != ring->next_to_use) {
 		hns3_free_buffer_detach(ring, ring->next_to_clean);
 		ring_ptr_move_fw(ring, next_to_clean);
@@ -3228,9 +3225,6 @@ static void hns3_clear_tx_ring(struct hns3_enet_ring *ring)
 
 static void hns3_clear_rx_ring(struct hns3_enet_ring *ring)
 {
-	if (HNAE3_IS_TX_RING(ring))
-		return;
-
 	while (ring->next_to_use != ring->next_to_clean) {
 		/* When a buffer is not reused, it's memory has been
 		 * freed in hns3_handle_rx_bd or will be freed by
-- 
cgit v1.2.3-59-g8ed1b


From 7b763f3f7e52b1e0284c9f42acc4229935b7ec5f Mon Sep 17 00:00:00 2001
From: Fuyun Liang <liangfuyun1@huawei.com>
Date: Fri, 25 May 2018 19:43:04 +0100
Subject: net: hns3: Clear TX/RX rings when stopping port & un-initializing
 client

When we down the port, some packets are left in TX/RX buffer. When we
up the port again, these old packets are forwarded to protocol stack
or are sent to internet. It will make some problem. TX/RX buffer should
be cleared when stopping port. This patch adds some function to ensure
the buffer is clean when port is started. We should clear the rings
when clients are being un-initialized as well.

Fixes: 76ad4f0ee747 ("net: hns3: Add support of HNS3 Ethernet Driver for hip08 SoC")
Signed-off-by: Fuyun Liang <liangfuyun1@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c    | 115 +++++++++++++++++++--
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.h    |   1 +
 drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c |   4 +
 3 files changed, 110 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 4c13c527d3e9..05290129793f 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -25,6 +25,9 @@
 #include "hnae3.h"
 #include "hns3_enet.h"
 
+static void hns3_clear_all_ring(struct hnae3_handle *h);
+static void hns3_force_clear_all_rx_ring(struct hnae3_handle *h);
+
 static const char hns3_driver_name[] = "hns3";
 const char hns3_driver_version[] = VERMAGIC_STRING;
 static const char hns3_driver_string[] =
@@ -273,6 +276,10 @@ static int hns3_nic_net_up(struct net_device *netdev)
 	int i, j;
 	int ret;
 
+	ret = hns3_nic_reset_all_ring(h);
+	if (ret)
+		return ret;
+
 	/* get irq resource for all vectors */
 	ret = hns3_nic_init_irq(priv);
 	if (ret) {
@@ -333,17 +340,19 @@ static void hns3_nic_net_down(struct net_device *netdev)
 	if (test_and_set_bit(HNS3_NIC_STATE_DOWN, &priv->state))
 		return;
 
+	/* disable vectors */
+	for (i = 0; i < priv->vector_num; i++)
+		hns3_vector_disable(&priv->tqp_vector[i]);
+
 	/* stop ae_dev */
 	ops = priv->ae_handle->ae_algo->ops;
 	if (ops->stop)
 		ops->stop(priv->ae_handle);
 
-	/* disable vectors */
-	for (i = 0; i < priv->vector_num; i++)
-		hns3_vector_disable(&priv->tqp_vector[i]);
-
 	/* free irq resources */
 	hns3_nic_uninit_irq(priv);
+
+	hns3_clear_all_ring(priv->ae_handle);
 }
 
 static int hns3_nic_net_stop(struct net_device *netdev)
@@ -2939,8 +2948,6 @@ int hns3_init_all_ring(struct hns3_nic_priv *priv)
 			goto out_when_alloc_ring_memory;
 		}
 
-		hns3_init_ring_hw(priv->ring_data[i].ring);
-
 		u64_stats_init(&priv->ring_data[i].ring->syncp);
 	}
 
@@ -3102,6 +3109,8 @@ static void hns3_client_uninit(struct hnae3_handle *handle, bool reset)
 	if (netdev->reg_state != NETREG_UNINITIALIZED)
 		unregister_netdev(netdev);
 
+	hns3_force_clear_all_rx_ring(handle);
+
 	ret = hns3_nic_uninit_vector_data(priv);
 	if (ret)
 		netdev_err(netdev, "uninit vector error\n");
@@ -3218,12 +3227,46 @@ static void hns3_recover_hw_addr(struct net_device *ndev)
 static void hns3_clear_tx_ring(struct hns3_enet_ring *ring)
 {
 	while (ring->next_to_clean != ring->next_to_use) {
+		ring->desc[ring->next_to_clean].tx.bdtp_fe_sc_vld_ra_ri = 0;
 		hns3_free_buffer_detach(ring, ring->next_to_clean);
 		ring_ptr_move_fw(ring, next_to_clean);
 	}
 }
 
-static void hns3_clear_rx_ring(struct hns3_enet_ring *ring)
+static int hns3_clear_rx_ring(struct hns3_enet_ring *ring)
+{
+	struct hns3_desc_cb res_cbs;
+	int ret;
+
+	while (ring->next_to_use != ring->next_to_clean) {
+		/* When a buffer is not reused, it's memory has been
+		 * freed in hns3_handle_rx_bd or will be freed by
+		 * stack, so we need to replace the buffer here.
+		 */
+		if (!ring->desc_cb[ring->next_to_use].reuse_flag) {
+			ret = hns3_reserve_buffer_map(ring, &res_cbs);
+			if (ret) {
+				u64_stats_update_begin(&ring->syncp);
+				ring->stats.sw_err_cnt++;
+				u64_stats_update_end(&ring->syncp);
+				/* if alloc new buffer fail, exit directly
+				 * and reclear in up flow.
+				 */
+				netdev_warn(ring->tqp->handle->kinfo.netdev,
+					    "reserve buffer map failed, ret = %d\n",
+					    ret);
+				return ret;
+			}
+			hns3_replace_buffer(ring, ring->next_to_use,
+					    &res_cbs);
+		}
+		ring_ptr_move_fw(ring, next_to_use);
+	}
+
+	return 0;
+}
+
+static void hns3_force_clear_rx_ring(struct hns3_enet_ring *ring)
 {
 	while (ring->next_to_use != ring->next_to_clean) {
 		/* When a buffer is not reused, it's memory has been
@@ -3240,6 +3283,19 @@ static void hns3_clear_rx_ring(struct hns3_enet_ring *ring)
 	}
 }
 
+static void hns3_force_clear_all_rx_ring(struct hnae3_handle *h)
+{
+	struct net_device *ndev = h->kinfo.netdev;
+	struct hns3_nic_priv *priv = netdev_priv(ndev);
+	struct hns3_enet_ring *ring;
+	u32 i;
+
+	for (i = 0; i < h->kinfo.num_tqps; i++) {
+		ring = priv->ring_data[i + h->kinfo.num_tqps].ring;
+		hns3_force_clear_rx_ring(ring);
+	}
+}
+
 static void hns3_clear_all_ring(struct hnae3_handle *h)
 {
 	struct net_device *ndev = h->kinfo.netdev;
@@ -3257,10 +3313,51 @@ static void hns3_clear_all_ring(struct hnae3_handle *h)
 		netdev_tx_reset_queue(dev_queue);
 
 		ring = priv->ring_data[i + h->kinfo.num_tqps].ring;
+		/* Continue to clear other rings even if clearing some
+		 * rings failed.
+		 */
 		hns3_clear_rx_ring(ring);
 	}
 }
 
+int hns3_nic_reset_all_ring(struct hnae3_handle *h)
+{
+	struct net_device *ndev = h->kinfo.netdev;
+	struct hns3_nic_priv *priv = netdev_priv(ndev);
+	struct hns3_enet_ring *rx_ring;
+	int i, j;
+	int ret;
+
+	for (i = 0; i < h->kinfo.num_tqps; i++) {
+		h->ae_algo->ops->reset_queue(h, i);
+		hns3_init_ring_hw(priv->ring_data[i].ring);
+
+		/* We need to clear tx ring here because self test will
+		 * use the ring and will not run down before up
+		 */
+		hns3_clear_tx_ring(priv->ring_data[i].ring);
+		priv->ring_data[i].ring->next_to_clean = 0;
+		priv->ring_data[i].ring->next_to_use = 0;
+
+		rx_ring = priv->ring_data[i + h->kinfo.num_tqps].ring;
+		hns3_init_ring_hw(rx_ring);
+		ret = hns3_clear_rx_ring(rx_ring);
+		if (ret)
+			return ret;
+
+		/* We can not know the hardware head and tail when this
+		 * function is called in reset flow, so we reuse all desc.
+		 */
+		for (j = 0; j < rx_ring->desc_num; j++)
+			hns3_reuse_buffer(rx_ring, j);
+
+		rx_ring->next_to_clean = 0;
+		rx_ring->next_to_use = 0;
+	}
+
+	return 0;
+}
+
 static int hns3_reset_notify_down_enet(struct hnae3_handle *handle)
 {
 	struct hnae3_knic_private_info *kinfo = &handle->kinfo;
@@ -3330,7 +3427,7 @@ static int hns3_reset_notify_uninit_enet(struct hnae3_handle *handle)
 	struct hns3_nic_priv *priv = netdev_priv(netdev);
 	int ret;
 
-	hns3_clear_all_ring(handle);
+	hns3_force_clear_all_rx_ring(handle);
 
 	ret = hns3_nic_uninit_vector_data(priv);
 	if (ret) {
@@ -3466,8 +3563,6 @@ int hns3_set_channels(struct net_device *netdev,
 	if (if_running)
 		hns3_nic_net_stop(netdev);
 
-	hns3_clear_all_ring(h);
-
 	ret = hns3_nic_uninit_vector_data(priv);
 	if (ret) {
 		dev_err(&netdev->dev,
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
index 38e91ca71e07..3b083d5ae9ce 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
@@ -625,6 +625,7 @@ int hns3_set_channels(struct net_device *netdev,
 bool hns3_clean_tx_ring(struct hns3_enet_ring *ring, int budget);
 int hns3_init_all_ring(struct hns3_nic_priv *priv);
 int hns3_uninit_all_ring(struct hns3_nic_priv *priv);
+int hns3_nic_reset_all_ring(struct hnae3_handle *h);
 netdev_tx_t hns3_nic_net_xmit(struct sk_buff *skb, struct net_device *netdev);
 int hns3_clean_rx_ring(
 		struct hns3_enet_ring *ring, int budget,
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
index c16bb6cb0564..8f8cc2413656 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
@@ -108,6 +108,10 @@ static int hns3_lp_up(struct net_device *ndev, enum hnae3_loop loop_mode)
 	if (!h->ae_algo->ops->start)
 		return -EOPNOTSUPP;
 
+	ret = hns3_nic_reset_all_ring(h);
+	if (ret)
+		return ret;
+
 	ret = h->ae_algo->ops->start(h);
 	if (ret) {
 		netdev_err(ndev,
-- 
cgit v1.2.3-59-g8ed1b


From 16b496fd9063a8e00e79438bab50ff45f071be6a Mon Sep 17 00:00:00 2001
From: Jian Shen <shenjian15@huawei.com>
Date: Fri, 25 May 2018 19:43:05 +0100
Subject: net: hns3: Remove unused led control code

In the previous implementation of led control for fibre port , parses the
port speed configuration, checks the link status and traffic status per
second, and updates the blink status of link led, traffic led and speed
led.

Now, the firmware takes responsibility to handle the led, the dirver just
needs to deal with locate command.

So the codes for link led, traffic led and speed led are useless now. This
patch removes these redundant codes.

Signed-off-by: Jian Shen <shenjian15@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h |   1 -
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    | 109 ---------------------
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h    |   2 -
 3 files changed, 112 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
index e9be6aa88b65..de2f6f118859 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
@@ -115,7 +115,6 @@ enum hclge_opcode_type {
 	HCLGE_OPC_QUERY_LINK_STATUS	= 0x0307,
 	HCLGE_OPC_CONFIG_MAX_FRM_SIZE	= 0x0308,
 	HCLGE_OPC_CONFIG_SPEED_DUP	= 0x0309,
-	HCLGE_OPC_STATS_MAC_TRAFFIC	= 0x0314,
 	/* MACSEC command */
 
 	/* PFC/Pause CMD*/
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 8ad8f62dc501..20988aadae21 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -39,7 +39,6 @@ static int hclge_set_mta_filter_mode(struct hclge_dev *hdev,
 static int hclge_set_mtu(struct hnae3_handle *handle, int new_mtu);
 static int hclge_init_vlan_config(struct hclge_dev *hdev);
 static int hclge_reset_ae_dev(struct hnae3_ae_dev *ae_dev);
-static int hclge_update_led_status(struct hclge_dev *hdev);
 
 static struct hnae3_ae_algo ae_algo;
 
@@ -504,38 +503,6 @@ static int hclge_32_bit_update_stats(struct hclge_dev *hdev)
 	return 0;
 }
 
-static int hclge_mac_get_traffic_stats(struct hclge_dev *hdev)
-{
-	struct hclge_mac_stats *mac_stats = &hdev->hw_stats.mac_stats;
-	struct hclge_desc desc;
-	__le64 *desc_data;
-	int ret;
-
-	/* for fiber port, need to query the total rx/tx packets statstics,
-	 * used for data transferring checking.
-	 */
-	if (hdev->hw.mac.media_type != HNAE3_MEDIA_TYPE_FIBER)
-		return 0;
-
-	if (test_bit(HCLGE_STATE_STATISTICS_UPDATING, &hdev->state))
-		return 0;
-
-	hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_STATS_MAC_TRAFFIC, true);
-	ret = hclge_cmd_send(&hdev->hw, &desc, 1);
-	if (ret) {
-		dev_err(&hdev->pdev->dev,
-			"Get MAC total pkt stats fail, ret = %d\n", ret);
-
-		return ret;
-	}
-
-	desc_data = (__le64 *)(&desc.data[0]);
-	mac_stats->mac_tx_total_pkt_num += le64_to_cpu(*desc_data++);
-	mac_stats->mac_rx_total_pkt_num += le64_to_cpu(*desc_data);
-
-	return 0;
-}
-
 static int hclge_mac_update_stats(struct hclge_dev *hdev)
 {
 #define HCLGE_MAC_CMD_NUM 21
@@ -2916,20 +2883,13 @@ static void hclge_service_task(struct work_struct *work)
 	struct hclge_dev *hdev =
 		container_of(work, struct hclge_dev, service_task);
 
-	/* The total rx/tx packets statstics are wanted to be updated
-	 * per second. Both hclge_update_stats_for_all() and
-	 * hclge_mac_get_traffic_stats() can do it.
-	 */
 	if (hdev->hw_stats.stats_timer >= HCLGE_STATS_TIMER_INTERVAL) {
 		hclge_update_stats_for_all(hdev);
 		hdev->hw_stats.stats_timer = 0;
-	} else {
-		hclge_mac_get_traffic_stats(hdev);
 	}
 
 	hclge_update_speed_duplex(hdev);
 	hclge_update_link_status(hdev);
-	hclge_update_led_status(hdev);
 	hclge_service_complete(hdev);
 }
 
@@ -6100,75 +6060,6 @@ static int hclge_set_led_id(struct hnae3_handle *handle,
 	return ret;
 }
 
-enum hclge_led_port_speed {
-	HCLGE_SPEED_LED_FOR_1G,
-	HCLGE_SPEED_LED_FOR_10G,
-	HCLGE_SPEED_LED_FOR_25G,
-	HCLGE_SPEED_LED_FOR_40G,
-	HCLGE_SPEED_LED_FOR_50G,
-	HCLGE_SPEED_LED_FOR_100G,
-};
-
-static u8 hclge_led_get_speed_status(u32 speed)
-{
-	u8 speed_led;
-
-	switch (speed) {
-	case HCLGE_MAC_SPEED_1G:
-		speed_led = HCLGE_SPEED_LED_FOR_1G;
-		break;
-	case HCLGE_MAC_SPEED_10G:
-		speed_led = HCLGE_SPEED_LED_FOR_10G;
-		break;
-	case HCLGE_MAC_SPEED_25G:
-		speed_led = HCLGE_SPEED_LED_FOR_25G;
-		break;
-	case HCLGE_MAC_SPEED_40G:
-		speed_led = HCLGE_SPEED_LED_FOR_40G;
-		break;
-	case HCLGE_MAC_SPEED_50G:
-		speed_led = HCLGE_SPEED_LED_FOR_50G;
-		break;
-	case HCLGE_MAC_SPEED_100G:
-		speed_led = HCLGE_SPEED_LED_FOR_100G;
-		break;
-	default:
-		speed_led = HCLGE_LED_NO_CHANGE;
-	}
-
-	return speed_led;
-}
-
-static int hclge_update_led_status(struct hclge_dev *hdev)
-{
-	u8 port_speed_status, link_status, activity_status;
-	u64 rx_pkts, tx_pkts;
-
-	if (hdev->hw.mac.media_type != HNAE3_MEDIA_TYPE_FIBER)
-		return 0;
-
-	port_speed_status = hclge_led_get_speed_status(hdev->hw.mac.speed);
-
-	rx_pkts = hdev->hw_stats.mac_stats.mac_rx_total_pkt_num;
-	tx_pkts = hdev->hw_stats.mac_stats.mac_tx_total_pkt_num;
-	if (rx_pkts != hdev->rx_pkts_for_led ||
-	    tx_pkts != hdev->tx_pkts_for_led)
-		activity_status = HCLGE_LED_ON;
-	else
-		activity_status = HCLGE_LED_OFF;
-	hdev->rx_pkts_for_led = rx_pkts;
-	hdev->tx_pkts_for_led = tx_pkts;
-
-	if (hdev->hw.mac.link)
-		link_status = HCLGE_LED_ON;
-	else
-		link_status = HCLGE_LED_OFF;
-
-	return hclge_set_led_status_sfp(hdev, port_speed_status,
-					activity_status, link_status,
-					HCLGE_LED_NO_CHANGE);
-}
-
 static void hclge_get_link_mode(struct hnae3_handle *handle,
 				unsigned long *supported,
 				unsigned long *advertising)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
index 677f1e4521c5..7fcabdeb4ce9 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
@@ -563,8 +563,6 @@ struct hclge_dev {
 
 	struct hclge_vlan_type_cfg vlan_type_cfg;
 
-	u64 rx_pkts_for_led;
-	u64 tx_pkts_for_led;
 	unsigned long vlan_table[VLAN_N_VID][BITS_TO_LONGS(HCLGE_VPORT_NUM)];
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From f6f75abc46db2002f28705a2fd4867e895169807 Mon Sep 17 00:00:00 2001
From: Jian Shen <shenjian15@huawei.com>
Date: Fri, 25 May 2018 19:43:06 +0100
Subject: net: hns3: Adds support for led locate command for copper port

Firmware now supports control of all leds. Existing HNS3 driver code
only supported led locate command over SFP Fibre ports. But now it
is also supported over copper port.
This patch removes existing not needed code for the led locate
command and updates the led control command between driver and
firmware.

Signed-off-by: Jian Shen <shenjian15@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h | 12 ++------
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    | 35 +++-------------------
 2 files changed, 6 insertions(+), 41 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
index de2f6f118859..d9aaa76c76eb 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
@@ -817,21 +817,13 @@ struct hclge_reset_cmd {
 #define HCLGE_NIC_CMQ_DESC_NUM		1024
 #define HCLGE_NIC_CMQ_DESC_NUM_S	3
 
-#define HCLGE_LED_PORT_SPEED_STATE_S	0
-#define HCLGE_LED_PORT_SPEED_STATE_M	GENMASK(5, 0)
-#define HCLGE_LED_ACTIVITY_STATE_S	0
-#define HCLGE_LED_ACTIVITY_STATE_M	GENMASK(1, 0)
-#define HCLGE_LED_LINK_STATE_S		0
-#define HCLGE_LED_LINK_STATE_M		GENMASK(1, 0)
 #define HCLGE_LED_LOCATE_STATE_S	0
 #define HCLGE_LED_LOCATE_STATE_M	GENMASK(1, 0)
 
 struct hclge_set_led_state_cmd {
-	u8 port_speed_led_config;
-	u8 link_led_config;
-	u8 activity_led_config;
+	u8 rsv1[3];
 	u8 locate_led_config;
-	u8 rsv[20];
+	u8 rsv2[20];
 };
 
 int hclge_cmd_init(struct hclge_dev *hdev);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 20988aadae21..69166858a6dc 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -5991,9 +5991,7 @@ static void hclge_get_regs(struct hnae3_handle *handle, u32 *version,
 			"Get 64 bit register failed, ret = %d.\n", ret);
 }
 
-static int hclge_set_led_status_sfp(struct hclge_dev *hdev, u8 speed_led_status,
-				    u8 act_led_status, u8 link_led_status,
-				    u8 locate_led_status)
+static int hclge_set_led_status(struct hclge_dev *hdev, u8 locate_led_status)
 {
 	struct hclge_set_led_state_cmd *req;
 	struct hclge_desc desc;
@@ -6002,12 +6000,6 @@ static int hclge_set_led_status_sfp(struct hclge_dev *hdev, u8 speed_led_status,
 	hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_LED_STATUS_CFG, false);
 
 	req = (struct hclge_set_led_state_cmd *)desc.data;
-	hnae_set_field(req->port_speed_led_config, HCLGE_LED_PORT_SPEED_STATE_M,
-		       HCLGE_LED_PORT_SPEED_STATE_S, speed_led_status);
-	hnae_set_field(req->link_led_config, HCLGE_LED_ACTIVITY_STATE_M,
-		       HCLGE_LED_ACTIVITY_STATE_S, act_led_status);
-	hnae_set_field(req->activity_led_config, HCLGE_LED_LINK_STATE_M,
-		       HCLGE_LED_LINK_STATE_S, link_led_status);
 	hnae_set_field(req->locate_led_config, HCLGE_LED_LOCATE_STATE_M,
 		       HCLGE_LED_LOCATE_STATE_S, locate_led_status);
 
@@ -6028,36 +6020,17 @@ enum hclge_led_status {
 static int hclge_set_led_id(struct hnae3_handle *handle,
 			    enum ethtool_phys_id_state status)
 {
-#define BLINK_FREQUENCY		2
 	struct hclge_vport *vport = hclge_get_vport(handle);
 	struct hclge_dev *hdev = vport->back;
-	struct phy_device *phydev = hdev->hw.mac.phydev;
-	int ret = 0;
-
-	if (phydev || hdev->hw.mac.media_type != HNAE3_MEDIA_TYPE_FIBER)
-		return -EOPNOTSUPP;
 
 	switch (status) {
 	case ETHTOOL_ID_ACTIVE:
-		ret = hclge_set_led_status_sfp(hdev,
-					       HCLGE_LED_NO_CHANGE,
-					       HCLGE_LED_NO_CHANGE,
-					       HCLGE_LED_NO_CHANGE,
-					       HCLGE_LED_ON);
-		break;
+		return hclge_set_led_status(hdev, HCLGE_LED_ON);
 	case ETHTOOL_ID_INACTIVE:
-		ret = hclge_set_led_status_sfp(hdev,
-					       HCLGE_LED_NO_CHANGE,
-					       HCLGE_LED_NO_CHANGE,
-					       HCLGE_LED_NO_CHANGE,
-					       HCLGE_LED_OFF);
-		break;
+		return hclge_set_led_status(hdev, HCLGE_LED_OFF);
 	default:
-		ret = -EINVAL;
-		break;
+		return -EINVAL;
 	}
-
-	return ret;
 }
 
 static void hclge_get_link_mode(struct hnae3_handle *handle,
-- 
cgit v1.2.3-59-g8ed1b


From 544a7bcd5cc74734003cf7f12502b26cebfed19f Mon Sep 17 00:00:00 2001
From: Lijun Ou <oulijun@huawei.com>
Date: Fri, 25 May 2018 19:43:07 +0100
Subject: net: hns3: Fixes initalization of RoCE handle and makes it
 conditional

When register a RoCE client with hnae3vf device, it needs to judge
the device whether support RoCE vf function. Otherwise, it will
lead to calltrace when RoCE is not support vf function and remove
roce device.

The calltrace as follows:
[   93.156614] Unable to handle kernel NULL pointer dereference at virtual address 00000015
 <SNIP>
[   93.278784] Call trace:
[   93.278788]  hnae3_match_n_instantiate+0x24/0xd8 [hnae3]
[   93.278790]  hnae3_register_client+0xcc/0x150 [hnae3]
[   93.278801]  hns_roce_hw_v2_init+0x18/0x1000 [hns_roce_hw_v2]
[   93.278805]  do_one_initcall+0x58/0x160
[   93.278807]  do_init_module+0x64/0x1d8
[   93.278809]  load_module+0x135c/0x15c8
[   93.278811]  SyS_finit_module+0x100/0x118
[   93.278816]  __sys_trace_return+0x0/0x4
[   93.278827] Code: aa0003f5 12001c56 aa1e03e0 d503201f (b9402660)

Fixes: e2cb1dec9779 ("net: hns3: Add HNS3 VF HCL(Hardware Compatibility Layer) Support")
Reported-by: Xinwei Kong <kong.kongxinwei@hisilicon.com>
Reported-by: Zhou Wang <wangzhou1@hisilicon.com>
Signed-off-by: Lijun Ou <oulijun@huawei.com>
Signed-off-by: Zhou Wang <wangzhou1@hisilicon.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
index 2b0e3295989f..266cdcba506e 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
@@ -1500,10 +1500,12 @@ static int hclgevf_init_instance(struct hclgevf_dev *hdev,
 			return ret;
 		break;
 	case HNAE3_CLIENT_ROCE:
-		hdev->roce_client = client;
-		hdev->roce.client = client;
+		if (hnae3_dev_roce_supported(hdev)) {
+			hdev->roce_client = client;
+			hdev->roce.client = client;
+		}
 
-		if (hdev->roce_client && hnae3_dev_roce_supported(hdev)) {
+		if (hdev->roce_client && hdev->nic_client) {
 			ret = hclgevf_init_roce_base_info(hdev);
 			if (ret)
 				return ret;
-- 
cgit v1.2.3-59-g8ed1b


From 728a9dc61f132eb567f58c234e11ef80a3519cc0 Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Thu, 17 May 2018 11:29:50 -0700
Subject: wlcore: sdio: Fix flakey SDIO runtime PM handling

We can have pm_runtime_get_sync() return 1, and we can have
pm_runtime_put_sync() return -EBUSY. See rpm_suspend() and
rpm_resume() for more information.

Fix the issue by returning 0 from wl12xx_sdio_power_on() on success.
And use pm_runtime_put() instead of pm_runtime_put_sync() for
wl12xx_sdio_power_off(), then the MMC subsystem will idle the bus
when suitable.

Otherwise wlcore can sometimes get confused and may report bogus
errors and WLAN connection can fail.

Note that while wlcore checks the return value for wl1271_power_on(),
the return value is ignored for wl1271_power_off(). Let's fix them
both though to avoid further confusion in the future.

Fixes: 60f36637bbbd ("wlcore: sdio: allow pm to handle sdio power")
Signed-off-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ti/wlcore/sdio.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wireless/ti/wlcore/sdio.c b/drivers/net/wireless/ti/wlcore/sdio.c
index 6dbe61d47dc3..15d5ac126061 100644
--- a/drivers/net/wireless/ti/wlcore/sdio.c
+++ b/drivers/net/wireless/ti/wlcore/sdio.c
@@ -159,28 +159,36 @@ static int wl12xx_sdio_power_on(struct wl12xx_sdio_glue *glue)
 		pm_runtime_put_noidle(&card->dev);
 		dev_err(glue->dev, "%s: failed to get_sync(%d)\n",
 			__func__, ret);
-		goto out;
+
+		return ret;
 	}
 
 	sdio_claim_host(func);
 	sdio_enable_func(func);
 	sdio_release_host(func);
 
-out:
-	return ret;
+	return 0;
 }
 
 static int wl12xx_sdio_power_off(struct wl12xx_sdio_glue *glue)
 {
 	struct sdio_func *func = dev_to_sdio_func(glue->dev);
 	struct mmc_card *card = func->card;
+	int error;
 
 	sdio_claim_host(func);
 	sdio_disable_func(func);
 	sdio_release_host(func);
 
 	/* Let runtime PM know the card is powered off */
-	return pm_runtime_put_sync(&card->dev);
+	error = pm_runtime_put(&card->dev);
+	if (error < 0 && error != -EBUSY) {
+		dev_err(&card->dev, "%s failed: %i\n", __func__, error);
+
+		return error;
+	}
+
+	return 0;
 }
 
 static int wl12xx_sdio_set_power(struct device *child, bool enable)
-- 
cgit v1.2.3-59-g8ed1b


From 6acfbb81ab0a800fd0ce0795103852255d671635 Mon Sep 17 00:00:00 2001
From: Tzu-En Huang <tehuang@realtek.com>
Date: Fri, 18 May 2018 17:29:54 +0800
Subject: rtlwifi: support accurate nullfunc frame tx ack report

In order to realize the keep-alive mechanism in mac80211 stack, reporting
accurate tx ack status for nullfunc frame is added in this commit.

If current frame is nullfunc frame, we ask firmware to report by filling
TX report bit in TX descriptor. After this frame DMA done, TX interrupt is
triggered but TX status is unknown at this moment, so enqueue this skb
into tx_report->queue. Finally, C2H report will be received if the frame
is transmitted successfully or retried over, and then we report to mac80211
with IEEE80211_TX_STAT_ACK flag only if it's successful. Otherwise, if
failure or timeout (one second), we report to mac80211 without this flag.

Signed-off-by: Tzu-En Huang <tehuang@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/realtek/rtlwifi/base.c        | 84 +++++++++++++++++++---
 drivers/net/wireless/realtek/rtlwifi/base.h        |  5 +-
 drivers/net/wireless/realtek/rtlwifi/pci.c         | 13 ++--
 .../net/wireless/realtek/rtlwifi/rtl8192ee/trx.c   |  5 +-
 .../net/wireless/realtek/rtlwifi/rtl8723be/trx.c   |  5 +-
 .../net/wireless/realtek/rtlwifi/rtl8821ae/trx.c   |  5 +-
 drivers/net/wireless/realtek/rtlwifi/wifi.h        | 16 +++++
 7 files changed, 112 insertions(+), 21 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/base.c b/drivers/net/wireless/realtek/rtlwifi/base.c
index 762a29cdf7ad..6620c6842b91 100644
--- a/drivers/net/wireless/realtek/rtlwifi/base.c
+++ b/drivers/net/wireless/realtek/rtlwifi/base.c
@@ -574,6 +574,7 @@ int rtl_init_core(struct ieee80211_hw *hw)
 	INIT_LIST_HEAD(&rtlpriv->entry_list);
 	INIT_LIST_HEAD(&rtlpriv->c2hcmd_list);
 	INIT_LIST_HEAD(&rtlpriv->scan_list.list);
+	skb_queue_head_init(&rtlpriv->tx_report.queue);
 
 	rtlmac->link_state = MAC80211_NOLINK;
 
@@ -585,11 +586,14 @@ int rtl_init_core(struct ieee80211_hw *hw)
 EXPORT_SYMBOL_GPL(rtl_init_core);
 
 static void rtl_free_entries_from_scan_list(struct ieee80211_hw *hw);
+static void rtl_free_entries_from_ack_queue(struct ieee80211_hw *hw,
+					    bool timeout);
 
 void rtl_deinit_core(struct ieee80211_hw *hw)
 {
 	rtl_c2hcmd_launcher(hw, 0);
 	rtl_free_entries_from_scan_list(hw);
+	rtl_free_entries_from_ack_queue(hw, false);
 }
 EXPORT_SYMBOL_GPL(rtl_deinit_core);
 
@@ -1575,22 +1579,52 @@ end:
 }
 EXPORT_SYMBOL_GPL(rtl_is_special_data);
 
+void rtl_tx_ackqueue(struct ieee80211_hw *hw, struct sk_buff *skb)
+{
+	struct rtl_priv *rtlpriv = rtl_priv(hw);
+	struct rtl_tx_report *tx_report = &rtlpriv->tx_report;
+
+	__skb_queue_tail(&tx_report->queue, skb);
+}
+EXPORT_SYMBOL_GPL(rtl_tx_ackqueue);
+
+static void rtl_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb,
+			  bool ack)
+{
+	struct rtl_priv *rtlpriv = rtl_priv(hw);
+	struct ieee80211_tx_info *info;
+
+	info = IEEE80211_SKB_CB(skb);
+	ieee80211_tx_info_clear_status(info);
+	if (ack) {
+		RT_TRACE(rtlpriv, COMP_TX_REPORT, DBG_LOUD,
+			 "tx report: ack\n");
+		info->flags |= IEEE80211_TX_STAT_ACK;
+	} else {
+		RT_TRACE(rtlpriv, COMP_TX_REPORT, DBG_LOUD,
+			 "tx report: not ack\n");
+		info->flags &= ~IEEE80211_TX_STAT_ACK;
+	}
+	ieee80211_tx_status_irqsafe(hw, skb);
+}
+
 bool rtl_is_tx_report_skb(struct ieee80211_hw *hw, struct sk_buff *skb)
 {
 	u16 ether_type;
 	const u8 *ether_type_ptr;
+	__le16 fc = rtl_get_fc(skb);
 
 	ether_type_ptr = rtl_skb_ether_type_ptr(hw, skb, true);
 	ether_type = be16_to_cpup((__be16 *)ether_type_ptr);
 
-	/* EAPOL */
-	if (ether_type == ETH_P_PAE)
+	if (ether_type == ETH_P_PAE || ieee80211_is_nullfunc(fc))
 		return true;
 
 	return false;
 }
 
-static u16 rtl_get_tx_report_sn(struct ieee80211_hw *hw)
+static u16 rtl_get_tx_report_sn(struct ieee80211_hw *hw,
+				struct rtlwifi_tx_info *tx_info)
 {
 	struct rtl_priv *rtlpriv = rtl_priv(hw);
 	struct rtl_tx_report *tx_report = &rtlpriv->tx_report;
@@ -1604,29 +1638,33 @@ static u16 rtl_get_tx_report_sn(struct ieee80211_hw *hw)
 
 	tx_report->last_sent_sn = sn;
 	tx_report->last_sent_time = jiffies;
-
+	tx_info->sn = sn;
+	tx_info->send_time = tx_report->last_sent_time;
 	RT_TRACE(rtlpriv, COMP_TX_REPORT, DBG_DMESG,
 		 "Send TX-Report sn=0x%X\n", sn);
 
 	return sn;
 }
 
-void rtl_get_tx_report(struct rtl_tcb_desc *ptcb_desc, u8 *pdesc,
-		       struct ieee80211_hw *hw)
+void rtl_set_tx_report(struct rtl_tcb_desc *ptcb_desc, u8 *pdesc,
+		       struct ieee80211_hw *hw, struct rtlwifi_tx_info *tx_info)
 {
 	if (ptcb_desc->use_spe_rpt) {
-		u16 sn = rtl_get_tx_report_sn(hw);
+		u16 sn = rtl_get_tx_report_sn(hw, tx_info);
 
 		SET_TX_DESC_SPE_RPT(pdesc, 1);
 		SET_TX_DESC_SW_DEFINE(pdesc, sn);
 	}
 }
-EXPORT_SYMBOL_GPL(rtl_get_tx_report);
+EXPORT_SYMBOL_GPL(rtl_set_tx_report);
 
 void rtl_tx_report_handler(struct ieee80211_hw *hw, u8 *tmp_buf, u8 c2h_cmd_len)
 {
 	struct rtl_priv *rtlpriv = rtl_priv(hw);
 	struct rtl_tx_report *tx_report = &rtlpriv->tx_report;
+	struct rtlwifi_tx_info *tx_info;
+	struct sk_buff_head *queue = &tx_report->queue;
+	struct sk_buff *skb;
 	u16 sn;
 	u8 st, retry;
 
@@ -1642,6 +1680,14 @@ void rtl_tx_report_handler(struct ieee80211_hw *hw, u8 *tmp_buf, u8 c2h_cmd_len)
 
 	tx_report->last_recv_sn = sn;
 
+	skb_queue_walk(queue, skb) {
+		tx_info = rtl_tx_skb_cb_info(skb);
+		if (tx_info->sn == sn) {
+			skb_unlink(skb, queue);
+			rtl_tx_status(hw, skb, st == 0);
+			break;
+		}
+	}
 	RT_TRACE(rtlpriv, COMP_TX_REPORT, DBG_DMESG,
 		 "Recv TX-Report st=0x%02X sn=0x%X retry=0x%X\n",
 		 st, sn, retry);
@@ -1909,6 +1955,25 @@ static void rtl_free_entries_from_scan_list(struct ieee80211_hw *hw)
 	}
 }
 
+static void rtl_free_entries_from_ack_queue(struct ieee80211_hw *hw,
+					    bool chk_timeout)
+{
+	struct rtl_priv *rtlpriv = rtl_priv(hw);
+	struct rtl_tx_report *tx_report = &rtlpriv->tx_report;
+	struct sk_buff_head *queue = &tx_report->queue;
+	struct sk_buff *skb, *tmp;
+	struct rtlwifi_tx_info *tx_info;
+
+	skb_queue_walk_safe(queue, skb, tmp) {
+		tx_info = rtl_tx_skb_cb_info(skb);
+		if (chk_timeout &&
+		    time_after(tx_info->send_time + HZ, jiffies))
+			continue;
+		skb_unlink(skb, queue);
+		rtl_tx_status(hw, skb, false);
+	}
+}
+
 void rtl_scan_list_expire(struct ieee80211_hw *hw)
 {
 	struct rtl_priv *rtlpriv = rtl_priv(hw);
@@ -2173,6 +2238,9 @@ label_lps_done:
 
 	/* <6> scan list */
 	rtl_scan_list_expire(hw);
+
+	/* <7> check ack queue */
+	rtl_free_entries_from_ack_queue(hw, true);
 }
 
 void rtl_watch_dog_timer_callback(struct timer_list *t)
diff --git a/drivers/net/wireless/realtek/rtlwifi/base.h b/drivers/net/wireless/realtek/rtlwifi/base.h
index acc924635818..19e7477839e4 100644
--- a/drivers/net/wireless/realtek/rtlwifi/base.h
+++ b/drivers/net/wireless/realtek/rtlwifi/base.h
@@ -130,9 +130,10 @@ bool rtl_tx_mgmt_proc(struct ieee80211_hw *hw, struct sk_buff *skb);
 u8 rtl_is_special_data(struct ieee80211_hw *hw, struct sk_buff *skb, u8 is_tx,
 		       bool is_enc);
 
+void rtl_tx_ackqueue(struct ieee80211_hw *hw, struct sk_buff *skb);
 bool rtl_is_tx_report_skb(struct ieee80211_hw *hw, struct sk_buff *skb);
-void rtl_get_tx_report(struct rtl_tcb_desc *ptcb_desc, u8 *pdesc,
-		       struct ieee80211_hw *hw);
+void rtl_set_tx_report(struct rtl_tcb_desc *ptcb_desc, u8 *pdesc,
+		       struct ieee80211_hw *hw, struct rtlwifi_tx_info *info);
 void rtl_tx_report_handler(struct ieee80211_hw *hw, u8 *tmp_buf,
 			   u8 c2h_cmd_len);
 bool rtl_check_tx_report_acked(struct ieee80211_hw *hw);
diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c b/drivers/net/wireless/realtek/rtlwifi/pci.c
index 57bb8f049e59..d0c509ef790e 100644
--- a/drivers/net/wireless/realtek/rtlwifi/pci.c
+++ b/drivers/net/wireless/realtek/rtlwifi/pci.c
@@ -619,12 +619,15 @@ static void _rtl_pci_tx_isr(struct ieee80211_hw *hw, int prio)
 			rtlpriv->link_info.tidtx_inperiod[tid]++;
 
 		info = IEEE80211_SKB_CB(skb);
-		ieee80211_tx_info_clear_status(info);
 
-		info->flags |= IEEE80211_TX_STAT_ACK;
-		/*info->status.rates[0].count = 1; */
-
-		ieee80211_tx_status_irqsafe(hw, skb);
+		if (likely(!ieee80211_is_nullfunc(fc))) {
+			ieee80211_tx_info_clear_status(info);
+			info->flags |= IEEE80211_TX_STAT_ACK;
+			/*info->status.rates[0].count = 1; */
+			ieee80211_tx_status_irqsafe(hw, skb);
+		} else {
+			rtl_tx_ackqueue(hw, skb);
+		}
 
 		if ((ring->entries - skb_queue_len(&ring->queue)) <= 4) {
 			RT_TRACE(rtlpriv, COMP_ERR, DBG_DMESG,
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/trx.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/trx.c
index 4f7444331b07..852a2701ef55 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/trx.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/trx.c
@@ -662,6 +662,7 @@ void rtl92ee_tx_fill_desc(struct ieee80211_hw *hw,
 	struct rtl_mac *mac = rtl_mac(rtl_priv(hw));
 	struct rtl_pci *rtlpci = rtl_pcidev(rtl_pcipriv(hw));
 	struct rtl_hal *rtlhal = rtl_hal(rtlpriv);
+	struct rtlwifi_tx_info *tx_info = rtl_tx_skb_cb_info(skb);
 	u8 *pdesc = (u8 *)pdesc_tx;
 	u16 seq_number;
 	__le16 fc = hdr->frame_control;
@@ -723,8 +724,6 @@ void rtl92ee_tx_fill_desc(struct ieee80211_hw *hw,
 			SET_TX_DESC_OFFSET(pdesc, USB_HWDESC_HEADER_LEN);
 		}
 
-		/* tx report */
-		rtl_get_tx_report(ptcb_desc, pdesc, hw);
 
 		SET_TX_DESC_TX_RATE(pdesc, ptcb_desc->hw_rate);
 
@@ -827,6 +826,8 @@ void rtl92ee_tx_fill_desc(struct ieee80211_hw *hw,
 				SET_TX_DESC_HTC(pdesc, 1);
 			}
 		}
+		/* tx report */
+		rtl_set_tx_report(ptcb_desc, pdesc, hw, tx_info);
 	}
 
 	SET_TX_DESC_FIRST_SEG(pdesc, (firstseg ? 1 : 0));
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/trx.c b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/trx.c
index fd9b38aa08a1..deb8f9501b51 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/trx.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/trx.c
@@ -431,6 +431,7 @@ void rtl8723be_tx_fill_desc(struct ieee80211_hw *hw,
 	struct rtl_mac *mac = rtl_mac(rtl_priv(hw));
 	struct rtl_pci *rtlpci = rtl_pcidev(rtl_pcipriv(hw));
 	struct rtl_hal *rtlhal = rtl_hal(rtlpriv);
+	struct rtlwifi_tx_info *tx_info = rtl_tx_skb_cb_info(skb);
 	u8 *pdesc = (u8 *)pdesc_tx;
 	u16 seq_number;
 	__le16 fc = hdr->frame_control;
@@ -488,8 +489,6 @@ void rtl8723be_tx_fill_desc(struct ieee80211_hw *hw,
 			SET_TX_DESC_OFFSET(pdesc, USB_HWDESC_HEADER_LEN);
 		}
 
-		/* tx report */
-		rtl_get_tx_report(ptcb_desc, pdesc, hw);
 
 		/* ptcb_desc->use_driver_rate = true; */
 		SET_TX_DESC_TX_RATE(pdesc, ptcb_desc->hw_rate);
@@ -578,6 +577,8 @@ void rtl8723be_tx_fill_desc(struct ieee80211_hw *hw,
 				SET_TX_DESC_HTC(pdesc, 1);
 			}
 		}
+		/* tx report */
+		rtl_set_tx_report(ptcb_desc, pdesc, hw, tx_info);
 	}
 
 	SET_TX_DESC_FIRST_SEG(pdesc, (firstseg ? 1 : 0));
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/trx.c b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/trx.c
index 1e1bacf562f3..63fb80039f82 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/trx.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/trx.c
@@ -691,6 +691,7 @@ void rtl8821ae_tx_fill_desc(struct ieee80211_hw *hw,
 	struct rtl_mac *mac = rtl_mac(rtl_priv(hw));
 	struct rtl_pci *rtlpci = rtl_pcidev(rtl_pcipriv(hw));
 	struct rtl_hal *rtlhal = rtl_hal(rtlpriv);
+	struct rtlwifi_tx_info *tx_info = rtl_tx_skb_cb_info(skb);
 	u8 *pdesc = (u8 *)pdesc_tx;
 	u16 seq_number;
 	__le16 fc = hdr->frame_control;
@@ -740,8 +741,6 @@ void rtl8821ae_tx_fill_desc(struct ieee80211_hw *hw,
 			SET_TX_DESC_OFFSET(pdesc, USB_HWDESC_HEADER_LEN);
 		}
 
-		/* tx report */
-		rtl_get_tx_report(ptcb_desc, pdesc, hw);
 
 		/* ptcb_desc->use_driver_rate = true; */
 		SET_TX_DESC_TX_RATE(pdesc, ptcb_desc->hw_rate);
@@ -818,6 +817,8 @@ void rtl8821ae_tx_fill_desc(struct ieee80211_hw *hw,
 				SET_TX_DESC_HTC(pdesc, 1);
 			}
 		}
+		/* tx report */
+		rtl_set_tx_report(ptcb_desc, pdesc, hw, tx_info);
 	}
 
 	SET_TX_DESC_FIRST_SEG(pdesc, (firstseg ? 1 : 0));
diff --git a/drivers/net/wireless/realtek/rtlwifi/wifi.h b/drivers/net/wireless/realtek/rtlwifi/wifi.h
index 208010fcde21..1259d2b66d17 100644
--- a/drivers/net/wireless/realtek/rtlwifi/wifi.h
+++ b/drivers/net/wireless/realtek/rtlwifi/wifi.h
@@ -1010,6 +1010,21 @@ enum dm_info_query {
 	DM_INFO_SIZE,
 };
 
+struct rtlwifi_tx_info {
+	int sn;
+	unsigned long send_time;
+};
+
+static inline struct rtlwifi_tx_info *rtl_tx_skb_cb_info(struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+
+	BUILD_BUG_ON(sizeof(struct rtlwifi_tx_info) >
+		     sizeof(info->status.status_driver_data));
+
+	return (struct rtlwifi_tx_info *)(info->status.status_driver_data);
+}
+
 struct octet_string {
 	u8 *octet;
 	u16 length;
@@ -1967,6 +1982,7 @@ struct rtl_tx_report {
 	u16 last_sent_sn;
 	unsigned long last_sent_time;
 	u16 last_recv_sn;
+	struct sk_buff_head queue;
 };
 
 struct rtl_ps_ctl {
-- 
cgit v1.2.3-59-g8ed1b


From faa2800653978d67ef6ee2121334d66271ba6789 Mon Sep 17 00:00:00 2001
From: Ping-Ke Shih <pkshih@realtek.com>
Date: Fri, 18 May 2018 17:29:55 +0800
Subject: rtlwifi: remove CONNECTION_MONITOR flag

To use keep-alive mechanism in mac80211 stack, since driver supports
reporting accurate nullfunc frame tx ack now.

Signed-off-by: Tzu-En Huang <tehuang@realtek.com>
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/realtek/rtlwifi/base.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/base.c b/drivers/net/wireless/realtek/rtlwifi/base.c
index 6620c6842b91..759a802ccbee 100644
--- a/drivers/net/wireless/realtek/rtlwifi/base.c
+++ b/drivers/net/wireless/realtek/rtlwifi/base.c
@@ -396,7 +396,6 @@ static void _rtl_init_mac80211(struct ieee80211_hw *hw)
 	ieee80211_hw_set(hw, SIGNAL_DBM);
 	ieee80211_hw_set(hw, RX_INCLUDES_FCS);
 	ieee80211_hw_set(hw, AMPDU_AGGREGATION);
-	ieee80211_hw_set(hw, CONNECTION_MONITOR);
 	ieee80211_hw_set(hw, MFP_CAPABLE);
 	ieee80211_hw_set(hw, REPORTS_TX_ACK_STATUS);
 	ieee80211_hw_set(hw, SUPPORTS_AMSDU_IN_AMPDU);
-- 
cgit v1.2.3-59-g8ed1b


From c4528686d0ea222d0b29b4017ae227546bdbb7d7 Mon Sep 17 00:00:00 2001
From: Ping-Ke Shih <pkshih@realtek.com>
Date: Fri, 18 May 2018 17:29:56 +0800
Subject: rtlwifi: remove duplicate rx_packet_type definition

Move duplicate definitions from def.h of ic folder to wifi.h

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/realtek/rtlwifi/rtl8188ee/def.h | 7 -------
 drivers/net/wireless/realtek/rtlwifi/rtl8192ee/def.h | 8 --------
 drivers/net/wireless/realtek/rtlwifi/rtl8723be/def.h | 8 --------
 drivers/net/wireless/realtek/rtlwifi/rtl8821ae/def.h | 8 --------
 drivers/net/wireless/realtek/rtlwifi/wifi.h          | 8 ++++++++
 5 files changed, 8 insertions(+), 31 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/def.h b/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/def.h
index 0532b9852444..32492d64d685 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/def.h
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/def.h
@@ -137,13 +137,6 @@ enum version_8188e {
 	VERSION_UNKNOWN = 0xFF,
 };
 
-enum rx_packet_type {
-	NORMAL_RX,
-	TX_REPORT1,
-	TX_REPORT2,
-	HIS_REPORT,
-};
-
 enum rtl819x_loopback_e {
 	RTL819X_NO_LOOPBACK = 0,
 	RTL819X_MAC_LOOPBACK = 1,
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/def.h b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/def.h
index 60f5728b4e2d..9f7e7bb8610b 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/def.h
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/def.h
@@ -47,14 +47,6 @@ enum version_8192e {
 	VERSION_UNKNOWN = 0xFF,
 };
 
-enum rx_packet_type {
-	NORMAL_RX,
-	TX_REPORT1,
-	TX_REPORT2,
-	HIS_REPORT,
-	C2H_PACKET,
-};
-
 enum rtl_desc_qsel {
 	QSLT_BK = 0x2,
 	QSLT_BE = 0x0,
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/def.h b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/def.h
index 025ea5c0f3f6..5e5403d69220 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/def.h
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/def.h
@@ -38,14 +38,6 @@
 /* Currently only for RTL8723B */
 #define EXT_VENDOR_ID				(BIT(18) | BIT(19))
 
-enum rx_packet_type {
-	NORMAL_RX,
-	TX_REPORT1,
-	TX_REPORT2,
-	HIS_REPORT,
-	C2H_PACKET,
-};
-
 enum rtl_desc_qsel {
 	QSLT_BK = 0x2,
 	QSLT_BE = 0x0,
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/def.h b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/def.h
index dfbdf539de1a..498f716bfc73 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/def.h
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/def.h
@@ -332,14 +332,6 @@ enum rtl_desc_qsel {
 	QSLT_CMD = 0x13,
 };
 
-enum rx_packet_type {
-	NORMAL_RX,
-	TX_REPORT1,
-	TX_REPORT2,
-	HIS_REPORT,
-	C2H_PACKET,
-};
-
 struct phy_sts_cck_8821ae_t {
 	u8 adc_pwdb_X[4];
 	u8 sq_rpt;
diff --git a/drivers/net/wireless/realtek/rtlwifi/wifi.h b/drivers/net/wireless/realtek/rtlwifi/wifi.h
index 1259d2b66d17..81ac036760fc 100644
--- a/drivers/net/wireless/realtek/rtlwifi/wifi.h
+++ b/drivers/net/wireless/realtek/rtlwifi/wifi.h
@@ -1010,6 +1010,14 @@ enum dm_info_query {
 	DM_INFO_SIZE,
 };
 
+enum rx_packet_type {
+	NORMAL_RX,
+	TX_REPORT1,
+	TX_REPORT2,
+	HIS_REPORT,
+	C2H_PACKET,
+};
+
 struct rtlwifi_tx_info {
 	int sn;
 	unsigned long send_time;
-- 
cgit v1.2.3-59-g8ed1b


From 249155973f524a284a86582ff9a915be045ab760 Mon Sep 17 00:00:00 2001
From: Ping-Ke Shih <pkshih@realtek.com>
Date: Fri, 18 May 2018 17:29:57 +0800
Subject: rtlwifi: rename register-based C2H command IDs to V0

Current chips use packet-based C2H commands whose IDs differ from old
ones, so this commit simply gives C2H_V0_ as prefix of command IDs.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 .../net/wireless/realtek/rtlwifi/rtl8723ae/hal_btc.c |  6 +++---
 .../net/wireless/realtek/rtlwifi/rtl8723ae/hal_btc.h | 20 ++++++++++----------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hal_btc.c b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hal_btc.c
index ec9bcf32f0ab..788de88ab1ee 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hal_btc.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hal_btc.c
@@ -1749,13 +1749,13 @@ void rtl_8723e_c2h_command_handle(struct ieee80211_hw *hw)
 
 
 	switch (c2h_event.cmd_id) {
-	case C2H_BT_RSSI:
+	case C2H_V0_BT_RSSI:
 			break;
 
-	case C2H_BT_OP_MODE:
+	case C2H_V0_BT_OP_MODE:
 			break;
 
-	case BT_INFO:
+	case C2H_V0_BT_INFO:
 		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
 			"BT info Byte[0] (ID) is 0x%x\n",
 			c2h_event.cmd_id);
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hal_btc.h b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hal_btc.h
index 3723d7476717..756868897d8b 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hal_btc.h
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hal_btc.h
@@ -130,17 +130,17 @@ enum bt_state {
 	BT_INFO_STATE_MAX = 7
 };
 
-enum rtl8723e_c2h_evt {
-	C2H_DBG = 0,
-	C2H_TSF = 1,
-	C2H_AP_RPT_RSP = 2,
+enum rtl8723e_c2h_evt_v0 {
+	C2H_V0_DBG = 0,
+	C2H_V0_TSF = 1,
+	C2H_V0_AP_RPT_RSP = 2,
 	/* The FW notify the report of the specific tx packet. */
-	C2H_CCX_TX_RPT = 3,
-	C2H_BT_RSSI = 4,
-	C2H_BT_OP_MODE = 5,
-	C2H_HW_INFO_EXCH = 10,
-	C2H_C2H_H2C_TEST = 11,
-	BT_INFO = 12,
+	C2H_V0_CCX_TX_RPT = 3,
+	C2H_V0_BT_RSSI = 4,
+	C2H_V0_BT_OP_MODE = 5,
+	C2H_V0_HW_INFO_EXCH = 10,
+	C2H_V0_C2H_H2C_TEST = 11,
+	C2H_V0_BT_INFO = 12,
 	MAX_C2HEVENT
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 7aeb100b7d70973e1478f472b0a2fc569637ec8d Mon Sep 17 00:00:00 2001
From: Ping-Ke Shih <pkshih@realtek.com>
Date: Fri, 18 May 2018 17:29:58 +0800
Subject: rtlwifi: remove duplicate C2H definition

Move C2H definition to wifi.h, because the definitions of 8192ee, 8723be
and 8821ae are the same.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.c | 16 ++++++++--------
 drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.h | 11 -----------
 drivers/net/wireless/realtek/rtlwifi/rtl8723be/fw.c | 12 ++++++------
 drivers/net/wireless/realtek/rtlwifi/rtl8723be/fw.h | 10 ----------
 drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.c | 12 ++++++------
 drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.h | 14 --------------
 drivers/net/wireless/realtek/rtlwifi/wifi.h         | 19 +++++++++++++++++++
 7 files changed, 39 insertions(+), 55 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.c
index f9563ae301ad..25d6c32f66c3 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.c
@@ -892,34 +892,34 @@ void rtl92ee_c2h_content_parsing(struct ieee80211_hw *hw, u8 c2h_cmd_id,
 	struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops;
 
 	switch (c2h_cmd_id) {
-	case C2H_8192E_DBG:
+	case C2H_DBG:
 		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
 			 "[C2H], C2H_8723BE_DBG!!\n");
 		break;
-	case C2H_8192E_TXBF:
+	case C2H_TXBF:
 		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
 			 "[C2H], C2H_8192E_TXBF!!\n");
 		break;
-	case C2H_8192E_TX_REPORT:
+	case C2H_TX_REPORT:
 		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE ,
 			 "[C2H], C2H_8723BE_TX_REPORT!\n");
 		rtl_tx_report_handler(hw, tmp_buf, c2h_cmd_len);
 		break;
-	case C2H_8192E_BT_INFO:
+	case C2H_BT_INFO:
 		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
 			 "[C2H], C2H_8723BE_BT_INFO!!\n");
 		if (rtlpriv->cfg->ops->get_btc_status())
 			btc_ops->btc_btinfo_notify(rtlpriv, tmp_buf,
 						   c2h_cmd_len);
 		break;
-	case C2H_8192E_BT_MP:
+	case C2H_BT_MP:
 		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
 			 "[C2H], C2H_8723BE_BT_MP!!\n");
 		if (rtlpriv->cfg->ops->get_btc_status())
 			btc_ops->btc_btmpinfo_notify(rtlpriv, tmp_buf,
 						     c2h_cmd_len);
 		break;
-	case C2H_8192E_RA_RPT:
+	case C2H_RA_RPT:
 		_rtl92ee_c2h_ra_report_handler(hw, tmp_buf, c2h_cmd_len);
 		break;
 	default:
@@ -948,8 +948,8 @@ void rtl92ee_c2h_packet_handler(struct ieee80211_hw *hw, u8 *buffer, u8 len)
 		      "[C2H packet], Content Hex:\n", tmp_buf, c2h_cmd_len);
 
 	switch (c2h_cmd_id) {
-	case C2H_8192E_BT_INFO:
-	case C2H_8192E_BT_MP:
+	case C2H_BT_INFO:
+	case C2H_BT_MP:
 		rtl_c2hcmd_enqueue(hw, c2h_cmd_id, c2h_cmd_len, tmp_buf);
 		break;
 	default:
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.h b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.h
index b770f722daa6..8325adaa9663 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.h
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.h
@@ -128,17 +128,6 @@ enum rtl8192e_h2c_cmd {
 	MAX_92E_H2CCMD
 };
 
-enum rtl8192e_c2h_evt {
-	C2H_8192E_DBG = 0,
-	C2H_8192E_LB = 1,
-	C2H_8192E_TXBF = 2,
-	C2H_8192E_TX_REPORT = 3,
-	C2H_8192E_BT_INFO = 9,
-	C2H_8192E_BT_MP = 11,
-	C2H_8192E_RA_RPT = 12,
-	MAX_8192E_C2HEVENT
-};
-
 #define pagenum_128(_len)	\
 	(u32)(((_len) >> 7) + ((_len) & 0x7F ? 1 : 0))
 
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/fw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/fw.c
index 4b963fd27d64..34703b9cf5e8 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/fw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/fw.c
@@ -712,23 +712,23 @@ void rtl8723be_c2h_content_parsing(struct ieee80211_hw *hw,
 	struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops;
 
 	switch (c2h_cmd_id) {
-	case C2H_8723B_DBG:
+	case C2H_DBG:
 		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
 			 "[C2H], C2H_8723BE_DBG!!\n");
 		break;
-	case C2H_8723B_TX_REPORT:
+	case C2H_TX_REPORT:
 		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
 			 "[C2H], C2H_8723BE_TX_REPORT!\n");
 		rtl_tx_report_handler(hw, tmp_buf, c2h_cmd_len);
 		break;
-	case C2H_8723B_BT_INFO:
+	case C2H_BT_INFO:
 		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
 			 "[C2H], C2H_8723BE_BT_INFO!!\n");
 		if (rtlpriv->cfg->ops->get_btc_status())
 			btc_ops->btc_btinfo_notify(rtlpriv, tmp_buf,
 						   c2h_cmd_len);
 		break;
-	case C2H_8723B_BT_MP:
+	case C2H_BT_MP:
 		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
 			 "[C2H], C2H_8723BE_BT_MP!!\n");
 		if (rtlpriv->cfg->ops->get_btc_status())
@@ -761,8 +761,8 @@ void rtl8723be_c2h_packet_handler(struct ieee80211_hw *hw, u8 *buffer, u8 len)
 		      "[C2H packet], Content Hex:\n", tmp_buf, c2h_cmd_len);
 
 	switch (c2h_cmd_id) {
-	case C2H_8723B_BT_INFO:
-	case C2H_8723B_BT_MP:
+	case C2H_BT_INFO:
+	case C2H_BT_MP:
 		rtl_c2hcmd_enqueue(hw, c2h_cmd_id, c2h_cmd_len, tmp_buf);
 		break;
 
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/fw.h b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/fw.h
index 2de4edb62bca..8f49941028f4 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/fw.h
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/fw.h
@@ -100,16 +100,6 @@ enum rtl8723b_h2c_cmd {
 	MAX_8723B_H2CCMD
 };
 
-enum rtl8723b_c2h_evt {
-	C2H_8723B_DBG = 0,
-	C2H_8723B_LB = 1,
-	C2H_8723B_TXBF = 2,
-	C2H_8723B_TX_REPORT = 3,
-	C2H_8723B_BT_INFO = 9,
-	C2H_8723B_BT_MP = 11,
-	MAX_8723B_C2HEVENT
-};
-
 #define pagenum_128(_len) (u32)(((_len)>>7) + ((_len)&0x7F ? 1 : 0))
 
 
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.c
index f2b2c549e5b2..5c7e58fbc07e 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.c
@@ -1926,23 +1926,23 @@ void rtl8821ae_c2h_content_parsing(struct ieee80211_hw *hw,
 	struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops;
 
 	switch (c2h_cmd_id) {
-	case C2H_8812_DBG:
+	case C2H_DBG:
 		RT_TRACE(rtlpriv, COMP_FW, DBG_LOUD, "[C2H], C2H_8812_DBG!!\n");
 		break;
-	case C2H_8812_TX_REPORT:
+	case C2H_TX_REPORT:
 		rtl_tx_report_handler(hw, tmp_buf, c2h_cmd_len);
 		break;
-	case C2H_8812_RA_RPT:
+	case C2H_RA_RPT:
 		rtl8821ae_c2h_ra_report_handler(hw, tmp_buf, c2h_cmd_len);
 		break;
-	case C2H_8812_BT_INFO:
+	case C2H_BT_INFO:
 		RT_TRACE(rtlpriv, COMP_FW, DBG_LOUD,
 			 "[C2H], C2H_8812_BT_INFO!!\n");
 		if (rtlpriv->cfg->ops->get_btc_status())
 			btc_ops->btc_btinfo_notify(rtlpriv, tmp_buf,
 						   c2h_cmd_len);
 		break;
-	case C2H_8812_BT_MP:
+	case C2H_BT_MP:
 		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
 			 "[C2H], C2H_8812_BT_MP!!\n");
 		if (rtlpriv->cfg->ops->get_btc_status())
@@ -1974,7 +1974,7 @@ void rtl8821ae_c2h_packet_handler(struct ieee80211_hw *hw, u8 *buffer,
 		      "[C2H packet], Content Hex:\n", tmp_buf, c2h_cmd_len);
 
 	switch (c2h_cmd_id) {
-	case C2H_8812_BT_INFO:
+	case C2H_BT_INFO:
 		rtl_c2hcmd_enqueue(hw, c2h_cmd_id, c2h_cmd_len, tmp_buf);
 		break;
 
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.h b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.h
index 32d46d7128f5..48c72a26c7e8 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.h
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.h
@@ -137,20 +137,6 @@
 #define	FW_PWR_STATE_ACTIVE	((FW_PS_RF_ON) | (FW_PS_REGISTER_ACTIVE))
 #define	FW_PWR_STATE_RF_OFF	0
 
-enum rtl8812_c2h_evt {
-	C2H_8812_DBG = 0,
-	C2H_8812_LB = 1,
-	C2H_8812_TXBF = 2,
-	C2H_8812_TX_REPORT = 3,
-	C2H_8812_BT_INFO = 9,
-	C2H_8812_BT_MP = 11,
-	C2H_8812_RA_RPT = 12,
-
-	C2H_8812_FW_SWCHNL = 0x10,
-	C2H_8812_IQK_FINISH = 0x11,
-	MAX_8812_C2HEVENT
-};
-
 enum rtl8821a_h2c_cmd {
 	H2C_8821AE_RSVDPAGE = 0,
 	H2C_8821AE_MSRRPT = 1,
diff --git a/drivers/net/wireless/realtek/rtlwifi/wifi.h b/drivers/net/wireless/realtek/rtlwifi/wifi.h
index 81ac036760fc..3da8e2a6084a 100644
--- a/drivers/net/wireless/realtek/rtlwifi/wifi.h
+++ b/drivers/net/wireless/realtek/rtlwifi/wifi.h
@@ -158,6 +158,25 @@ enum {
 	H2C_BT_PORT_ID = 0x71,
 };
 
+enum rtl_c2h_evt_v1 {
+	C2H_DBG = 0,
+	C2H_LB = 1,
+	C2H_TXBF = 2,
+	C2H_TX_REPORT = 3,
+	C2H_BT_INFO = 9,
+	C2H_BT_MP = 11,
+	C2H_RA_RPT = 12,
+
+	C2H_FW_SWCHNL = 0x10,
+	C2H_IQK_FINISH = 0x11,
+
+	C2H_EXT_V2 = 0xFF,
+};
+
+enum rtl_c2h_evt_v2 {
+	C2H_V2_CCX_RPT = 0x0F,
+};
+
 #define GET_TX_REPORT_SN_V1(c2h)	(c2h[6])
 #define GET_TX_REPORT_ST_V1(c2h)	(c2h[0] & 0xC0)
 #define GET_TX_REPORT_RETRY_V1(c2h)	(c2h[2] & 0x3F)
-- 
cgit v1.2.3-59-g8ed1b


From 16e78cb4a571129bbef5457c68856ff3d3895753 Mon Sep 17 00:00:00 2001
From: Ping-Ke Shih <pkshih@realtek.com>
Date: Fri, 18 May 2018 17:29:59 +0800
Subject: rtlwifi: remove unused fw C2H command ID

The IDs are defined by driver and map to the fw C2H IDs, but they aren't
used now result in removal.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 .../net/wireless/realtek/rtlwifi/rtl8188ee/def.h   | 25 ----------------------
 .../net/wireless/realtek/rtlwifi/rtl8192ce/def.h   | 25 ----------------------
 .../net/wireless/realtek/rtlwifi/rtl8723ae/def.h   | 25 ----------------------
 .../net/wireless/realtek/rtlwifi/rtl8821ae/def.h   | 25 ----------------------
 4 files changed, 100 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/def.h b/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/def.h
index 32492d64d685..45c866d3ca88 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/def.h
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/def.h
@@ -176,31 +176,6 @@ enum interface_select_pci {
 	INTF_SEL3_RSV = 3,
 };
 
-enum hal_fw_c2h_cmd_id {
-	HAL_FW_C2H_CMD_READ_MACREG = 0,
-	HAL_FW_C2H_CMD_READ_BBREG = 1,
-	HAL_FW_C2H_CMD_READ_RFREG = 2,
-	HAL_FW_C2H_CMD_READ_EEPROM = 3,
-	HAL_FW_C2H_CMD_READ_EFUSE = 4,
-	HAL_FW_C2H_CMD_READ_CAM = 5,
-	HAL_FW_C2H_CMD_GET_BASICRATE = 6,
-	HAL_FW_C2H_CMD_GET_DATARATE = 7,
-	HAL_FW_C2H_CMD_SURVEY = 8,
-	HAL_FW_C2H_CMD_SURVEYDONE = 9,
-	HAL_FW_C2H_CMD_JOINBSS = 10,
-	HAL_FW_C2H_CMD_ADDSTA = 11,
-	HAL_FW_C2H_CMD_DELSTA = 12,
-	HAL_FW_C2H_CMD_ATIMDONE = 13,
-	HAL_FW_C2H_CMD_TX_REPORT = 14,
-	HAL_FW_C2H_CMD_CCX_REPORT = 15,
-	HAL_FW_C2H_CMD_DTM_REPORT = 16,
-	HAL_FW_C2H_CMD_TX_RATE_STATISTICS = 17,
-	HAL_FW_C2H_CMD_C2HLBK = 18,
-	HAL_FW_C2H_CMD_C2HDBG = 19,
-	HAL_FW_C2H_CMD_C2HFEEDBACK = 20,
-	HAL_FW_C2H_CMD_MAX
-};
-
 enum rtl_desc_qsel {
 	QSLT_BK = 0x2,
 	QSLT_BE = 0x0,
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192ce/def.h b/drivers/net/wireless/realtek/rtlwifi/rtl8192ce/def.h
index b90aaf128072..d2005d7e9ad2 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8192ce/def.h
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192ce/def.h
@@ -142,31 +142,6 @@ enum interface_select_pci {
 	INTF_SEL3_RSV = 3,
 };
 
-enum hal_fw_c2h_cmd_id {
-	HAL_FW_C2H_CMD_Read_MACREG = 0,
-	HAL_FW_C2H_CMD_Read_BBREG = 1,
-	HAL_FW_C2H_CMD_Read_RFREG = 2,
-	HAL_FW_C2H_CMD_Read_EEPROM = 3,
-	HAL_FW_C2H_CMD_Read_EFUSE = 4,
-	HAL_FW_C2H_CMD_Read_CAM = 5,
-	HAL_FW_C2H_CMD_Get_BasicRate = 6,
-	HAL_FW_C2H_CMD_Get_DataRate = 7,
-	HAL_FW_C2H_CMD_Survey = 8,
-	HAL_FW_C2H_CMD_SurveyDone = 9,
-	HAL_FW_C2H_CMD_JoinBss = 10,
-	HAL_FW_C2H_CMD_AddSTA = 11,
-	HAL_FW_C2H_CMD_DelSTA = 12,
-	HAL_FW_C2H_CMD_AtimDone = 13,
-	HAL_FW_C2H_CMD_TX_Report = 14,
-	HAL_FW_C2H_CMD_CCX_Report = 15,
-	HAL_FW_C2H_CMD_DTM_Report = 16,
-	HAL_FW_C2H_CMD_TX_Rate_Statistics = 17,
-	HAL_FW_C2H_CMD_C2HLBK = 18,
-	HAL_FW_C2H_CMD_C2HDBG = 19,
-	HAL_FW_C2H_CMD_C2HFEEDBACK = 20,
-	HAL_FW_C2H_CMD_MAX
-};
-
 enum rtl_desc_qsel {
 	QSLT_BK = 0x2,
 	QSLT_BE = 0x0,
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/def.h b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/def.h
index bcdf2273688e..847544817549 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/def.h
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/def.h
@@ -152,31 +152,6 @@ enum interface_select_pci {
 	INTF_SEL3_RSV = 3,
 };
 
-enum hal_fw_c2h_cmd_id {
-	HAL_FW_C2H_CMD_Read_MACREG = 0,
-	HAL_FW_C2H_CMD_Read_BBREG = 1,
-	HAL_FW_C2H_CMD_Read_RFREG = 2,
-	HAL_FW_C2H_CMD_Read_EEPROM = 3,
-	HAL_FW_C2H_CMD_Read_EFUSE = 4,
-	HAL_FW_C2H_CMD_Read_CAM = 5,
-	HAL_FW_C2H_CMD_Get_BasicRate = 6,
-	HAL_FW_C2H_CMD_Get_DataRate = 7,
-	HAL_FW_C2H_CMD_Survey = 8,
-	HAL_FW_C2H_CMD_SurveyDone = 9,
-	HAL_FW_C2H_CMD_JoinBss = 10,
-	HAL_FW_C2H_CMD_AddSTA = 11,
-	HAL_FW_C2H_CMD_DelSTA = 12,
-	HAL_FW_C2H_CMD_AtimDone = 13,
-	HAL_FW_C2H_CMD_TX_Report = 14,
-	HAL_FW_C2H_CMD_CCX_Report = 15,
-	HAL_FW_C2H_CMD_DTM_Report = 16,
-	HAL_FW_C2H_CMD_TX_Rate_Statistics = 17,
-	HAL_FW_C2H_CMD_C2HLBK = 18,
-	HAL_FW_C2H_CMD_C2HDBG = 19,
-	HAL_FW_C2H_CMD_C2HFEEDBACK = 20,
-	HAL_FW_C2H_CMD_MAX
-};
-
 enum rtl_desc_qsel {
 	QSLT_BK = 0x2,
 	QSLT_BE = 0x0,
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/def.h b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/def.h
index 498f716bfc73..3fe3aaa5fe3c 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/def.h
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/def.h
@@ -296,31 +296,6 @@ enum interface_select_pci {
 	INTF_SEL3_RSV = 3,
 };
 
-enum hal_fw_c2h_cmd_id {
-	HAL_FW_C2H_CMD_READ_MACREG = 0,
-	HAL_FW_C2H_CMD_READ_BBREG = 1,
-	HAL_FW_C2H_CMD_READ_RFREG = 2,
-	HAL_FW_C2H_CMD_READ_EEPROM = 3,
-	HAL_FW_C2H_CMD_READ_EFUSE = 4,
-	HAL_FW_C2H_CMD_READ_CAM = 5,
-	HAL_FW_C2H_CMD_GET_BASICRATE = 6,
-	HAL_FW_C2H_CMD_GET_DATARATE = 7,
-	HAL_FW_C2H_CMD_SURVEY = 8,
-	HAL_FW_C2H_CMD_SURVEYDONE = 9,
-	HAL_FW_C2H_CMD_JOINBSS = 10,
-	HAL_FW_C2H_CMD_ADDSTA = 11,
-	HAL_FW_C2H_CMD_DELSTA = 12,
-	HAL_FW_C2H_CMD_ATIMDONE = 13,
-	HAL_FW_C2H_CMD_TX_REPORT = 14,
-	HAL_FW_C2H_CMD_CCX_REPORT = 15,
-	HAL_FW_C2H_CMD_DTM_REPORT = 16,
-	HAL_FW_C2H_CMD_TX_RATE_STATISTICS = 17,
-	HAL_FW_C2H_CMD_C2HLBK = 18,
-	HAL_FW_C2H_CMD_C2HDBG = 19,
-	HAL_FW_C2H_CMD_C2HFEEDBACK = 20,
-	HAL_FW_C2H_CMD_MAX
-};
-
 enum rtl_desc_qsel {
 	QSLT_BK = 0x2,
 	QSLT_BE = 0x0,
-- 
cgit v1.2.3-59-g8ed1b


From ff77960d62d220a0f16433e9c2664a8759f95c3a Mon Sep 17 00:00:00 2001
From: Ping-Ke Shih <pkshih@realtek.com>
Date: Fri, 18 May 2018 17:30:00 +0800
Subject: rtlwifi: remove dummy hal_op rx_command_packet from rtl8188ee and
 rtl8723ae

The caller of hal_op rx_command_packet will assert function pointer
before calling, so we can remove dummy functions safely.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/realtek/rtlwifi/rtl8188ee/sw.c  | 2 --
 drivers/net/wireless/realtek/rtlwifi/rtl8188ee/trx.c | 7 -------
 drivers/net/wireless/realtek/rtlwifi/rtl8188ee/trx.h | 4 ----
 drivers/net/wireless/realtek/rtlwifi/rtl8723ae/sw.c  | 1 -
 drivers/net/wireless/realtek/rtlwifi/rtl8723ae/trx.c | 7 -------
 drivers/net/wireless/realtek/rtlwifi/rtl8723ae/trx.h | 3 ---
 6 files changed, 24 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/sw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/sw.c
index 82681b96ef93..8c15ffd3568b 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/sw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/sw.c
@@ -263,8 +263,6 @@ static struct rtl_hal_ops rtl8188ee_hal_ops = {
 	.get_rfreg = rtl88e_phy_query_rf_reg,
 	.set_rfreg = rtl88e_phy_set_rf_reg,
 	.get_btc_status = rtl88e_get_btc_status,
-	.rx_command_packet = rtl88ee_rx_command_packet,
-
 };
 
 static struct rtl_mod_params rtl88ee_mod_params = {
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/trx.c b/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/trx.c
index 9670732b2bc6..4c1f8b08fc10 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/trx.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/trx.c
@@ -850,10 +850,3 @@ void rtl88ee_tx_polling(struct ieee80211_hw *hw, u8 hw_queue)
 			       BIT(0) << (hw_queue));
 	}
 }
-
-u32 rtl88ee_rx_command_packet(struct ieee80211_hw *hw,
-			      const struct rtl_stats *status,
-			      struct sk_buff *skb)
-{
-	return 0;
-}
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/trx.h b/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/trx.h
index f902d6769aa8..127ba977206f 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/trx.h
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/trx.h
@@ -790,8 +790,4 @@ void rtl88ee_tx_polling(struct ieee80211_hw *hw, u8 hw_queue);
 void rtl88ee_tx_fill_cmddesc(struct ieee80211_hw *hw, u8 *pdesc,
 			     bool firstseg, bool lastseg,
 			     struct sk_buff *skb);
-u32 rtl88ee_rx_command_packet(struct ieee80211_hw *hw,
-			      const struct rtl_stats *status,
-			      struct sk_buff *skb);
-
 #endif
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/sw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/sw.c
index a545ea317323..07b82700d1de 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/sw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/sw.c
@@ -260,7 +260,6 @@ static struct rtl_hal_ops rtl8723e_hal_ops = {
 	.bt_coex_off_before_lps =
 		rtl8723e_dm_bt_turn_off_bt_coexist_before_enter_lps,
 	.get_btc_status = rtl8723e_get_btc_status,
-	.rx_command_packet = rtl8723e_rx_command_packet,
 	.is_fw_header = is_fw_header,
 };
 
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/trx.c b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/trx.c
index 23485602a9a1..d461d0c9631f 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/trx.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/trx.c
@@ -709,10 +709,3 @@ void rtl8723e_tx_polling(struct ieee80211_hw *hw, u8 hw_queue)
 			       BIT(0) << (hw_queue));
 	}
 }
-
-u32 rtl8723e_rx_command_packet(struct ieee80211_hw *hw,
-			       const struct rtl_stats *status,
-			       struct sk_buff *skb)
-{
-	return 0;
-}
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/trx.h b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/trx.h
index 985ce0b77ea5..d592b08d4ac8 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/trx.h
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723ae/trx.h
@@ -716,7 +716,4 @@ void rtl8723e_tx_polling(struct ieee80211_hw *hw, u8 hw_queue);
 void rtl8723e_tx_fill_cmddesc(struct ieee80211_hw *hw, u8 *pdesc,
 			      bool firstseg, bool lastseg,
 			      struct sk_buff *skb);
-u32 rtl8723e_rx_command_packet(struct ieee80211_hw *hw,
-			       const struct rtl_stats *status,
-			       struct sk_buff *skb);
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 16cefa449c9ced7006860d5427b9595782248ca4 Mon Sep 17 00:00:00 2001
From: Ping-Ke Shih <pkshih@realtek.com>
Date: Fri, 18 May 2018 17:30:01 +0800
Subject: rtlwifi: Add hal_op c2h_ra_report_handler for special process

We're going to merge C2H handler into one, but one special case is to
handle RA_REPORT that implements in individual IC folder. So this commit
adds a hal_op for caller in common code.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.c | 7 ++++---
 drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.h | 2 ++
 drivers/net/wireless/realtek/rtlwifi/rtl8192ee/sw.c | 1 +
 drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.c | 5 +++--
 drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.h | 2 ++
 drivers/net/wireless/realtek/rtlwifi/rtl8821ae/sw.c | 1 +
 drivers/net/wireless/realtek/rtlwifi/wifi.h         | 2 ++
 7 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.c
index 25d6c32f66c3..a2d9e217bc65 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.c
@@ -876,8 +876,8 @@ void rtl92ee_set_p2p_ps_offload_cmd(struct ieee80211_hw *hw, u8 p2p_ps_state)
 			     (u8 *)p2p_ps_offload);
 }
 
-static void _rtl92ee_c2h_ra_report_handler(struct ieee80211_hw *hw,
-					   u8 *cmd_buf, u8 cmd_len)
+void rtl92ee_c2h_ra_report_handler(struct ieee80211_hw *hw,
+				   u8 *cmd_buf, u8 cmd_len)
 {
 	u8 rate = cmd_buf[0] & 0x3F;
 	bool collision_state = cmd_buf[3] & BIT(0);
@@ -889,6 +889,7 @@ void rtl92ee_c2h_content_parsing(struct ieee80211_hw *hw, u8 c2h_cmd_id,
 				 u8 c2h_cmd_len, u8 *tmp_buf)
 {
 	struct rtl_priv *rtlpriv = rtl_priv(hw);
+	struct rtl_hal_ops *hal_ops = rtlpriv->cfg->ops;
 	struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops;
 
 	switch (c2h_cmd_id) {
@@ -920,7 +921,7 @@ void rtl92ee_c2h_content_parsing(struct ieee80211_hw *hw, u8 c2h_cmd_id,
 						     c2h_cmd_len);
 		break;
 	case C2H_RA_RPT:
-		_rtl92ee_c2h_ra_report_handler(hw, tmp_buf, c2h_cmd_len);
+		hal_ops->c2h_ra_report_handler(hw, tmp_buf, c2h_cmd_len);
 		break;
 	default:
 		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.h b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.h
index 8325adaa9663..e5fae0a86a27 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.h
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.h
@@ -182,4 +182,6 @@ void rtl92ee_set_p2p_ps_offload_cmd(struct ieee80211_hw *hw, u8 p2p_ps_state);
 void rtl92ee_c2h_packet_handler(struct ieee80211_hw *hw, u8 *buffer, u8 len);
 void rtl92ee_c2h_content_parsing(struct ieee80211_hw *hw, u8 c2h_cmd_id,
 				 u8 c2h_cmd_len, u8 *tmp_buf);
+void rtl92ee_c2h_ra_report_handler(struct ieee80211_hw *hw,
+				   u8 *cmd_buf, u8 cmd_len);
 #endif
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/sw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/sw.c
index ef92a789871d..c5c26b537cd2 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/sw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/sw.c
@@ -252,6 +252,7 @@ static struct rtl_hal_ops rtl8192ee_hal_ops = {
 	.get_btc_status = rtl92ee_get_btc_status,
 	.rx_command_packet = rtl92ee_rx_command_packet,
 	.c2h_content_parsing = rtl92ee_c2h_content_parsing,
+	.c2h_ra_report_handler = rtl92ee_c2h_ra_report_handler,
 };
 
 static struct rtl_mod_params rtl92ee_mod_params = {
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.c
index 5c7e58fbc07e..bf37c428c682 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.c
@@ -1907,7 +1907,7 @@ void rtl8821ae_set_p2p_ps_offload_cmd(struct ieee80211_hw *hw, u8 p2p_ps_state)
 			H2C_8821AE_P2P_PS_OFFLOAD, 1, (u8 *)p2p_ps_offload);
 }
 
-static void rtl8821ae_c2h_ra_report_handler(struct ieee80211_hw *hw,
+void rtl8821ae_c2h_ra_report_handler(struct ieee80211_hw *hw,
 				     u8 *cmd_buf, u8 cmd_len)
 {
 	struct rtl_hal *rtlhal = rtl_hal(rtl_priv(hw));
@@ -1923,6 +1923,7 @@ void rtl8821ae_c2h_content_parsing(struct ieee80211_hw *hw,
 				   u8 *tmp_buf)
 {
 	struct rtl_priv *rtlpriv = rtl_priv(hw);
+	struct rtl_hal_ops *hal_ops = rtlpriv->cfg->ops;
 	struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops;
 
 	switch (c2h_cmd_id) {
@@ -1933,7 +1934,7 @@ void rtl8821ae_c2h_content_parsing(struct ieee80211_hw *hw,
 		rtl_tx_report_handler(hw, tmp_buf, c2h_cmd_len);
 		break;
 	case C2H_RA_RPT:
-		rtl8821ae_c2h_ra_report_handler(hw, tmp_buf, c2h_cmd_len);
+		hal_ops->c2h_ra_report_handler(hw, tmp_buf, c2h_cmd_len);
 		break;
 	case C2H_BT_INFO:
 		RT_TRACE(rtlpriv, COMP_FW, DBG_LOUD,
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.h b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.h
index 48c72a26c7e8..a69448be7c26 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.h
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.h
@@ -322,4 +322,6 @@ void rtl8821ae_c2h_packet_handler(struct ieee80211_hw *hw,
 void rtl8821ae_c2h_content_parsing(struct ieee80211_hw *hw,
 				   u8 c2h_cmd_id, u8 c2h_cmd_len,
 				   u8 *tmp_buf);
+void rtl8821ae_c2h_ra_report_handler(struct ieee80211_hw *hw,
+				     u8 *cmd_buf, u8 cmd_len);
 #endif
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/sw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/sw.c
index 9bb3d9dfce79..4b2cb6bda4f1 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/sw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/sw.c
@@ -305,6 +305,7 @@ static struct rtl_hal_ops rtl8821ae_hal_ops = {
 	.get_btc_status = rtl8821ae_get_btc_status,
 	.rx_command_packet = rtl8821ae_rx_command_packet,
 	.c2h_content_parsing = rtl8821ae_c2h_content_parsing,
+	.c2h_ra_report_handler = rtl8821ae_c2h_ra_report_handler,
 	.add_wowlan_pattern = rtl8821ae_add_wowlan_pattern,
 };
 
diff --git a/drivers/net/wireless/realtek/rtlwifi/wifi.h b/drivers/net/wireless/realtek/rtlwifi/wifi.h
index 3da8e2a6084a..392b3e84db7e 100644
--- a/drivers/net/wireless/realtek/rtlwifi/wifi.h
+++ b/drivers/net/wireless/realtek/rtlwifi/wifi.h
@@ -2348,6 +2348,8 @@ struct rtl_hal_ops {
 	u16 (*get_available_desc)(struct ieee80211_hw *hw, u8 q_idx);
 	void (*c2h_content_parsing)(struct ieee80211_hw *hw, u8 tag, u8 len,
 				    u8 *val);
+	void (*c2h_ra_report_handler)(struct ieee80211_hw *hw,
+				      u8 *cmd_buf, u8 cmd_len);
 };
 
 struct rtl_intf_ops {
-- 
cgit v1.2.3-59-g8ed1b


From daf026ae5fbe094e63bb729d437b042a0a5fd798 Mon Sep 17 00:00:00 2001
From: Ping-Ke Shih <pkshih@realtek.com>
Date: Fri, 18 May 2018 17:30:02 +0800
Subject: rtlwifi: remove duplicate C2H handler

Merge duplicate C2H handler and implement the handler in base.c.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/realtek/rtlwifi/base.c        | 72 +++++++++++++++++++++
 drivers/net/wireless/realtek/rtlwifi/base.h        |  3 +
 .../net/wireless/realtek/rtlwifi/rtl8192ee/fw.c    | 75 ----------------------
 .../net/wireless/realtek/rtlwifi/rtl8192ee/fw.h    |  3 -
 .../net/wireless/realtek/rtlwifi/rtl8192ee/sw.c    |  2 +-
 .../net/wireless/realtek/rtlwifi/rtl8192ee/trx.c   |  2 +-
 .../net/wireless/realtek/rtlwifi/rtl8723be/fw.c    | 69 --------------------
 .../net/wireless/realtek/rtlwifi/rtl8723be/fw.h    |  3 -
 .../net/wireless/realtek/rtlwifi/rtl8723be/sw.c    |  2 +-
 .../net/wireless/realtek/rtlwifi/rtl8723be/trx.c   |  3 +-
 .../net/wireless/realtek/rtlwifi/rtl8821ae/fw.c    | 68 --------------------
 .../net/wireless/realtek/rtlwifi/rtl8821ae/fw.h    |  5 --
 .../net/wireless/realtek/rtlwifi/rtl8821ae/sw.c    |  2 +-
 .../net/wireless/realtek/rtlwifi/rtl8821ae/trx.c   |  2 +-
 14 files changed, 81 insertions(+), 230 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/base.c b/drivers/net/wireless/realtek/rtlwifi/base.c
index 759a802ccbee..0d03e98f9cb4 100644
--- a/drivers/net/wireless/realtek/rtlwifi/base.c
+++ b/drivers/net/wireless/realtek/rtlwifi/base.c
@@ -2306,6 +2306,78 @@ label_err:
 }
 EXPORT_SYMBOL(rtl_c2hcmd_enqueue);
 
+void rtl_c2h_content_parsing(struct ieee80211_hw *hw, u8 cmd_id,
+			     u8 cmd_len, u8 *cmd_buf)
+{
+	struct rtl_priv *rtlpriv = rtl_priv(hw);
+	struct rtl_hal_ops *hal_ops = rtlpriv->cfg->ops;
+	const struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops;
+
+	switch (cmd_id) {
+	case C2H_DBG:
+		RT_TRACE(rtlpriv, COMP_FW, DBG_LOUD, "[C2H], C2H_DBG!!\n");
+		break;
+	case C2H_TXBF:
+		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
+			 "[C2H], C2H_TXBF!!\n");
+		break;
+	case C2H_TX_REPORT:
+		rtl_tx_report_handler(hw, cmd_buf, cmd_len);
+		break;
+	case C2H_RA_RPT:
+		if (hal_ops->c2h_ra_report_handler)
+			hal_ops->c2h_ra_report_handler(hw, cmd_buf, cmd_len);
+		break;
+	case C2H_BT_INFO:
+		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
+			 "[C2H], C2H_BT_INFO!!\n");
+		if (rtlpriv->cfg->ops->get_btc_status())
+			btc_ops->btc_btinfo_notify(rtlpriv, cmd_buf, cmd_len);
+		break;
+	case C2H_BT_MP:
+		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
+			 "[C2H], C2H_BT_MP!!\n");
+		if (rtlpriv->cfg->ops->get_btc_status())
+			btc_ops->btc_btmpinfo_notify(rtlpriv, cmd_buf, cmd_len);
+		break;
+	default:
+		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
+			 "[C2H], Unknown packet!! cmd_id(%#X)!\n", cmd_id);
+		break;
+	}
+}
+EXPORT_SYMBOL_GPL(rtl_c2h_content_parsing);
+
+void rtl_c2h_packet_handler(struct ieee80211_hw *hw, u8 *buffer, u8 len)
+{
+	struct rtl_priv *rtlpriv = rtl_priv(hw);
+	u8 c2h_cmd_id = 0, c2h_cmd_seq = 0, c2h_cmd_len = 0;
+	u8 *tmp_buf = NULL;
+
+	c2h_cmd_id = buffer[0];
+	c2h_cmd_seq = buffer[1];
+	c2h_cmd_len = len - 2;
+	tmp_buf = buffer + 2;
+
+	RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
+		 "[C2H packet], c2hCmdId=0x%x, c2hCmdSeq=0x%x, c2hCmdLen=%d\n",
+		 c2h_cmd_id, c2h_cmd_seq, c2h_cmd_len);
+
+	RT_PRINT_DATA(rtlpriv, COMP_FW, DBG_TRACE,
+		      "[C2H packet], Content Hex:\n", tmp_buf, c2h_cmd_len);
+
+	switch (c2h_cmd_id) {
+	case C2H_BT_INFO:
+	case C2H_BT_MP:
+		rtl_c2hcmd_enqueue(hw, c2h_cmd_id, c2h_cmd_len, tmp_buf);
+		break;
+	default:
+		rtl_c2h_content_parsing(hw, c2h_cmd_id, c2h_cmd_len, tmp_buf);
+		break;
+	}
+}
+EXPORT_SYMBOL_GPL(rtl_c2h_packet_handler);
+
 void rtl_c2hcmd_launcher(struct ieee80211_hw *hw, int exec)
 {
 	struct rtl_priv *rtlpriv = rtl_priv(hw);
diff --git a/drivers/net/wireless/realtek/rtlwifi/base.h b/drivers/net/wireless/realtek/rtlwifi/base.h
index 19e7477839e4..3bf174e5b07e 100644
--- a/drivers/net/wireless/realtek/rtlwifi/base.h
+++ b/drivers/net/wireless/realtek/rtlwifi/base.h
@@ -162,6 +162,9 @@ void rtl_fwevt_wq_callback(void *data);
 void rtl_c2hcmd_wq_callback(void *data);
 void rtl_c2hcmd_launcher(struct ieee80211_hw *hw, int exec);
 void rtl_c2hcmd_enqueue(struct ieee80211_hw *hw, u8 tag, u8 len, u8 *val);
+void rtl_c2h_content_parsing(struct ieee80211_hw *hw, u8 c2h_cmd_id,
+			     u8 c2h_cmd_len, u8 *tmp_buf);
+void rtl_c2h_packet_handler(struct ieee80211_hw *hw, u8 *buffer, u8 len);
 
 u8 rtl_mrate_idx_to_arfr_id(struct ieee80211_hw *hw, u8 rate_index,
 			    enum wireless_mode wirelessmode);
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.c
index a2d9e217bc65..84a0d0eb72e1 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.c
@@ -884,78 +884,3 @@ void rtl92ee_c2h_ra_report_handler(struct ieee80211_hw *hw,
 
 	rtl92ee_dm_dynamic_arfb_select(hw, rate, collision_state);
 }
-
-void rtl92ee_c2h_content_parsing(struct ieee80211_hw *hw, u8 c2h_cmd_id,
-				 u8 c2h_cmd_len, u8 *tmp_buf)
-{
-	struct rtl_priv *rtlpriv = rtl_priv(hw);
-	struct rtl_hal_ops *hal_ops = rtlpriv->cfg->ops;
-	struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops;
-
-	switch (c2h_cmd_id) {
-	case C2H_DBG:
-		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
-			 "[C2H], C2H_8723BE_DBG!!\n");
-		break;
-	case C2H_TXBF:
-		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
-			 "[C2H], C2H_8192E_TXBF!!\n");
-		break;
-	case C2H_TX_REPORT:
-		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE ,
-			 "[C2H], C2H_8723BE_TX_REPORT!\n");
-		rtl_tx_report_handler(hw, tmp_buf, c2h_cmd_len);
-		break;
-	case C2H_BT_INFO:
-		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
-			 "[C2H], C2H_8723BE_BT_INFO!!\n");
-		if (rtlpriv->cfg->ops->get_btc_status())
-			btc_ops->btc_btinfo_notify(rtlpriv, tmp_buf,
-						   c2h_cmd_len);
-		break;
-	case C2H_BT_MP:
-		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
-			 "[C2H], C2H_8723BE_BT_MP!!\n");
-		if (rtlpriv->cfg->ops->get_btc_status())
-			btc_ops->btc_btmpinfo_notify(rtlpriv, tmp_buf,
-						     c2h_cmd_len);
-		break;
-	case C2H_RA_RPT:
-		hal_ops->c2h_ra_report_handler(hw, tmp_buf, c2h_cmd_len);
-		break;
-	default:
-		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
-			 "[C2H], Unknown packet!! CmdId(%#X)!\n", c2h_cmd_id);
-		break;
-	}
-}
-
-void rtl92ee_c2h_packet_handler(struct ieee80211_hw *hw, u8 *buffer, u8 len)
-{
-	struct rtl_priv *rtlpriv = rtl_priv(hw);
-	u8 c2h_cmd_id = 0, c2h_cmd_seq = 0, c2h_cmd_len = 0;
-	u8 *tmp_buf = NULL;
-
-	c2h_cmd_id = buffer[0];
-	c2h_cmd_seq = buffer[1];
-	c2h_cmd_len = len - 2;
-	tmp_buf = buffer + 2;
-
-	RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
-		 "[C2H packet], c2hCmdId=0x%x, c2hCmdSeq=0x%x, c2hCmdLen=%d\n",
-		 c2h_cmd_id, c2h_cmd_seq, c2h_cmd_len);
-
-	RT_PRINT_DATA(rtlpriv, COMP_FW, DBG_TRACE,
-		      "[C2H packet], Content Hex:\n", tmp_buf, c2h_cmd_len);
-
-	switch (c2h_cmd_id) {
-	case C2H_BT_INFO:
-	case C2H_BT_MP:
-		rtl_c2hcmd_enqueue(hw, c2h_cmd_id, c2h_cmd_len, tmp_buf);
-		break;
-	default:
-		rtl92ee_c2h_content_parsing(hw, c2h_cmd_id, c2h_cmd_len,
-					    tmp_buf);
-		break;
-	}
-}
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.h b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.h
index e5fae0a86a27..6a2fbf20d579 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.h
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.h
@@ -179,9 +179,6 @@ void rtl92ee_set_fw_pwrmode_cmd(struct ieee80211_hw *hw, u8 mode);
 void rtl92ee_set_fw_media_status_rpt_cmd(struct ieee80211_hw *hw, u8 mstatus);
 void rtl92ee_set_fw_rsvdpagepkt(struct ieee80211_hw *hw, bool b_dl_finished);
 void rtl92ee_set_p2p_ps_offload_cmd(struct ieee80211_hw *hw, u8 p2p_ps_state);
-void rtl92ee_c2h_packet_handler(struct ieee80211_hw *hw, u8 *buffer, u8 len);
-void rtl92ee_c2h_content_parsing(struct ieee80211_hw *hw, u8 c2h_cmd_id,
-				 u8 c2h_cmd_len, u8 *tmp_buf);
 void rtl92ee_c2h_ra_report_handler(struct ieee80211_hw *hw,
 				   u8 *cmd_buf, u8 cmd_len);
 #endif
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/sw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/sw.c
index c5c26b537cd2..fd028274c593 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/sw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/sw.c
@@ -251,7 +251,7 @@ static struct rtl_hal_ops rtl8192ee_hal_ops = {
 	.fill_h2c_cmd = rtl92ee_fill_h2c_cmd,
 	.get_btc_status = rtl92ee_get_btc_status,
 	.rx_command_packet = rtl92ee_rx_command_packet,
-	.c2h_content_parsing = rtl92ee_c2h_content_parsing,
+	.c2h_content_parsing = rtl_c2h_content_parsing,
 	.c2h_ra_report_handler = rtl92ee_c2h_ra_report_handler,
 };
 
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/trx.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/trx.c
index 852a2701ef55..e525c2bb4457 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/trx.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/trx.c
@@ -1085,7 +1085,7 @@ u32 rtl92ee_rx_command_packet(struct ieee80211_hw *hw,
 		result = 0;
 		break;
 	case C2H_PACKET:
-		rtl92ee_c2h_packet_handler(hw, skb->data, (u8)skb->len);
+		rtl_c2h_packet_handler(hw, skb->data, (u8)skb->len);
 		result = 1;
 		break;
 	default:
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/fw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/fw.c
index 34703b9cf5e8..f2441fbb92f1 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/fw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/fw.c
@@ -703,72 +703,3 @@ void rtl8723be_set_p2p_ps_offload_cmd(struct ieee80211_hw *hw,
 	rtl8723be_fill_h2c_cmd(hw, H2C_8723B_P2P_PS_OFFLOAD, 1,
 			       (u8 *)p2p_ps_offload);
 }
-
-void rtl8723be_c2h_content_parsing(struct ieee80211_hw *hw,
-				   u8 c2h_cmd_id,
-				   u8 c2h_cmd_len, u8 *tmp_buf)
-{
-	struct rtl_priv *rtlpriv = rtl_priv(hw);
-	struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops;
-
-	switch (c2h_cmd_id) {
-	case C2H_DBG:
-		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
-			 "[C2H], C2H_8723BE_DBG!!\n");
-		break;
-	case C2H_TX_REPORT:
-		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
-			 "[C2H], C2H_8723BE_TX_REPORT!\n");
-		rtl_tx_report_handler(hw, tmp_buf, c2h_cmd_len);
-		break;
-	case C2H_BT_INFO:
-		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
-			 "[C2H], C2H_8723BE_BT_INFO!!\n");
-		if (rtlpriv->cfg->ops->get_btc_status())
-			btc_ops->btc_btinfo_notify(rtlpriv, tmp_buf,
-						   c2h_cmd_len);
-		break;
-	case C2H_BT_MP:
-		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
-			 "[C2H], C2H_8723BE_BT_MP!!\n");
-		if (rtlpriv->cfg->ops->get_btc_status())
-			btc_ops->btc_btmpinfo_notify(rtlpriv, tmp_buf,
-						     c2h_cmd_len);
-		break;
-	default:
-		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
-			 "[C2H], Unknown packet!! CmdId(%#X)!\n", c2h_cmd_id);
-		break;
-	}
-}
-
-void rtl8723be_c2h_packet_handler(struct ieee80211_hw *hw, u8 *buffer, u8 len)
-{
-	struct rtl_priv *rtlpriv = rtl_priv(hw);
-	u8 c2h_cmd_id = 0, c2h_cmd_seq = 0, c2h_cmd_len = 0;
-	u8 *tmp_buf = NULL;
-
-	c2h_cmd_id = buffer[0];
-	c2h_cmd_seq = buffer[1];
-	c2h_cmd_len = len - 2;
-	tmp_buf = buffer + 2;
-
-	RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
-		 "[C2H packet], c2hCmdId=0x%x, c2hCmdSeq=0x%x, c2hCmdLen=%d\n",
-		 c2h_cmd_id, c2h_cmd_seq, c2h_cmd_len);
-
-	RT_PRINT_DATA(rtlpriv, COMP_FW, DBG_TRACE,
-		      "[C2H packet], Content Hex:\n", tmp_buf, c2h_cmd_len);
-
-	switch (c2h_cmd_id) {
-	case C2H_BT_INFO:
-	case C2H_BT_MP:
-		rtl_c2hcmd_enqueue(hw, c2h_cmd_id, c2h_cmd_len, tmp_buf);
-		break;
-
-	default:
-		rtl8723be_c2h_content_parsing(hw, c2h_cmd_id, c2h_cmd_len,
-					      tmp_buf);
-		break;
-	}
-}
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/fw.h b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/fw.h
index 8f49941028f4..948d28646364 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/fw.h
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/fw.h
@@ -143,7 +143,4 @@ void rtl8723be_set_fw_pwrmode_cmd(struct ieee80211_hw *hw, u8 mode);
 void rtl8723be_set_fw_media_status_rpt_cmd(struct ieee80211_hw *hw, u8 mstatus);
 void rtl8723be_set_fw_rsvdpagepkt(struct ieee80211_hw *hw, bool b_dl_finished);
 void rtl8723be_set_p2p_ps_offload_cmd(struct ieee80211_hw *hw, u8 p2p_ps_state);
-void rtl8723be_c2h_packet_handler(struct ieee80211_hw *hw, u8 *buffer, u8 len);
-void rtl8723be_c2h_content_parsing(struct ieee80211_hw *hw, u8 c2h_cmd_id,
-				   u8 c2h_cmd_len, u8 *tmp_buf);
 #endif
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/sw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/sw.c
index 6a42988aad65..9df994965c4a 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/sw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/sw.c
@@ -263,7 +263,7 @@ static struct rtl_hal_ops rtl8723be_hal_ops = {
 	.get_btc_status = rtl8723be_get_btc_status,
 	.rx_command_packet = rtl8723be_rx_command_packet,
 	.is_fw_header = is_fw_header,
-	.c2h_content_parsing = rtl8723be_c2h_content_parsing,
+	.c2h_content_parsing = rtl_c2h_content_parsing,
 };
 
 static struct rtl_mod_params rtl8723be_mod_params = {
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/trx.c b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/trx.c
index deb8f9501b51..0c0cece9d5f9 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/trx.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/trx.c
@@ -774,8 +774,7 @@ u32 rtl8723be_rx_command_packet(struct ieee80211_hw *hw,
 			result = 0;
 			break;
 	case C2H_PACKET:
-			rtl8723be_c2h_packet_handler(hw, skb->data,
-						     (u8)skb->len);
+			rtl_c2h_packet_handler(hw, skb->data, (u8)skb->len);
 			result = 1;
 			break;
 	default:
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.c
index bf37c428c682..d868a034659f 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.c
@@ -1917,71 +1917,3 @@ void rtl8821ae_c2h_ra_report_handler(struct ieee80211_hw *hw,
 
 	rtl8821ae_dm_update_init_rate(hw, rate);
 }
-
-void rtl8821ae_c2h_content_parsing(struct ieee80211_hw *hw,
-				   u8 c2h_cmd_id, u8 c2h_cmd_len,
-				   u8 *tmp_buf)
-{
-	struct rtl_priv *rtlpriv = rtl_priv(hw);
-	struct rtl_hal_ops *hal_ops = rtlpriv->cfg->ops;
-	struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops;
-
-	switch (c2h_cmd_id) {
-	case C2H_DBG:
-		RT_TRACE(rtlpriv, COMP_FW, DBG_LOUD, "[C2H], C2H_8812_DBG!!\n");
-		break;
-	case C2H_TX_REPORT:
-		rtl_tx_report_handler(hw, tmp_buf, c2h_cmd_len);
-		break;
-	case C2H_RA_RPT:
-		hal_ops->c2h_ra_report_handler(hw, tmp_buf, c2h_cmd_len);
-		break;
-	case C2H_BT_INFO:
-		RT_TRACE(rtlpriv, COMP_FW, DBG_LOUD,
-			 "[C2H], C2H_8812_BT_INFO!!\n");
-		if (rtlpriv->cfg->ops->get_btc_status())
-			btc_ops->btc_btinfo_notify(rtlpriv, tmp_buf,
-						   c2h_cmd_len);
-		break;
-	case C2H_BT_MP:
-		RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
-			 "[C2H], C2H_8812_BT_MP!!\n");
-		if (rtlpriv->cfg->ops->get_btc_status())
-			btc_ops->btc_btmpinfo_notify(rtlpriv, tmp_buf,
-						     c2h_cmd_len);
-		break;
-	default:
-		break;
-	}
-}
-
-void rtl8821ae_c2h_packet_handler(struct ieee80211_hw *hw, u8 *buffer,
-				  u8 length)
-{
-	struct rtl_priv *rtlpriv = rtl_priv(hw);
-	u8 c2h_cmd_id = 0, c2h_cmd_seq = 0, c2h_cmd_len = 0;
-	u8 *tmp_buf = NULL;
-
-	c2h_cmd_id = buffer[0];
-	c2h_cmd_seq = buffer[1];
-	c2h_cmd_len = length - 2;
-	tmp_buf = buffer + 2;
-
-	RT_TRACE(rtlpriv, COMP_FW, DBG_LOUD,
-		 "[C2H packet], c2hCmdId=0x%x, c2hCmdSeq=0x%x, c2hCmdLen=%d\n",
-		 c2h_cmd_id, c2h_cmd_seq, c2h_cmd_len);
-
-	RT_PRINT_DATA(rtlpriv, COMP_FW, DBG_LOUD,
-		      "[C2H packet], Content Hex:\n", tmp_buf, c2h_cmd_len);
-
-	switch (c2h_cmd_id) {
-	case C2H_BT_INFO:
-		rtl_c2hcmd_enqueue(hw, c2h_cmd_id, c2h_cmd_len, tmp_buf);
-		break;
-
-	default:
-		rtl8821ae_c2h_content_parsing(hw, c2h_cmd_id, c2h_cmd_len,
-					      tmp_buf);
-		break;
-	}
-}
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.h b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.h
index a69448be7c26..99c902ff0b84 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.h
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.h
@@ -317,11 +317,6 @@ void rtl8821ae_set_fw_keep_alive_cmd(struct ieee80211_hw *hw, bool func_en);
 void rtl8821ae_set_fw_disconnect_decision_ctrl_cmd(struct ieee80211_hw *hw,
 						   bool enabled);
 void rtl8821ae_set_fw_global_info_cmd(struct ieee80211_hw *hw);
-void rtl8821ae_c2h_packet_handler(struct ieee80211_hw *hw,
-				  u8 *buffer, u8 length);
-void rtl8821ae_c2h_content_parsing(struct ieee80211_hw *hw,
-				   u8 c2h_cmd_id, u8 c2h_cmd_len,
-				   u8 *tmp_buf);
 void rtl8821ae_c2h_ra_report_handler(struct ieee80211_hw *hw,
 				     u8 *cmd_buf, u8 cmd_len);
 #endif
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/sw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/sw.c
index 4b2cb6bda4f1..bcabf4dc6353 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/sw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/sw.c
@@ -304,7 +304,7 @@ static struct rtl_hal_ops rtl8821ae_hal_ops = {
 	.fill_h2c_cmd = rtl8821ae_fill_h2c_cmd,
 	.get_btc_status = rtl8821ae_get_btc_status,
 	.rx_command_packet = rtl8821ae_rx_command_packet,
-	.c2h_content_parsing = rtl8821ae_c2h_content_parsing,
+	.c2h_content_parsing = rtl_c2h_content_parsing,
 	.c2h_ra_report_handler = rtl8821ae_c2h_ra_report_handler,
 	.add_wowlan_pattern = rtl8821ae_add_wowlan_pattern,
 };
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/trx.c b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/trx.c
index 63fb80039f82..a93d8569c186 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/trx.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/trx.c
@@ -1018,7 +1018,7 @@ u32 rtl8821ae_rx_command_packet(struct ieee80211_hw *hw,
 		result = 0;
 		break;
 	case C2H_PACKET:
-		rtl8821ae_c2h_packet_handler(hw, skb->data, (u8)skb->len);
+		rtl_c2h_packet_handler(hw, skb->data, (u8)skb->len);
 		result = 1;
 		RT_TRACE(rtlpriv, COMP_RECV, DBG_LOUD,
 			 "skb->len=%d\n\n", skb->len);
-- 
cgit v1.2.3-59-g8ed1b


From 23ffab177b1da1564c14214f290925a77f0b3592 Mon Sep 17 00:00:00 2001
From: Ping-Ke Shih <pkshih@realtek.com>
Date: Fri, 18 May 2018 17:30:03 +0800
Subject: rtlwifi: remove hal_op rx_command_packet

Because the hal_op rx_command_packet does C2H handler if rx packet type
is C2H, and the handler have been moved to base.c so we can call the
handler directly.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/realtek/rtlwifi/pci.c         |  4 ++--
 .../net/wireless/realtek/rtlwifi/rtl8192ee/sw.c    |  1 -
 .../net/wireless/realtek/rtlwifi/rtl8192ee/trx.c   | 24 --------------------
 .../net/wireless/realtek/rtlwifi/rtl8192ee/trx.h   |  3 ---
 .../net/wireless/realtek/rtlwifi/rtl8723be/sw.c    |  1 -
 .../net/wireless/realtek/rtlwifi/rtl8723be/trx.c   | 24 --------------------
 .../net/wireless/realtek/rtlwifi/rtl8723be/trx.h   |  3 ---
 .../net/wireless/realtek/rtlwifi/rtl8821ae/sw.c    |  1 -
 .../net/wireless/realtek/rtlwifi/rtl8821ae/trx.c   | 26 ----------------------
 .../net/wireless/realtek/rtlwifi/rtl8821ae/trx.h   |  3 ---
 drivers/net/wireless/realtek/rtlwifi/wifi.h        |  2 --
 11 files changed, 2 insertions(+), 90 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c b/drivers/net/wireless/realtek/rtlwifi/pci.c
index d0c509ef790e..dd51c67c09fa 100644
--- a/drivers/net/wireless/realtek/rtlwifi/pci.c
+++ b/drivers/net/wireless/realtek/rtlwifi/pci.c
@@ -830,8 +830,8 @@ static void _rtl_pci_rx_interrupt(struct ieee80211_hw *hw)
 			goto new_trx_end;
 		}
 		/* handle command packet here */
-		if (rtlpriv->cfg->ops->rx_command_packet &&
-		    rtlpriv->cfg->ops->rx_command_packet(hw, &stats, skb)) {
+		if (stats.packet_report_type == C2H_PACKET) {
+			rtl_c2h_packet_handler(hw, skb->data, (u8)skb->len);
 			dev_kfree_skb_any(skb);
 			goto new_trx_end;
 		}
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/sw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/sw.c
index fd028274c593..5b67ad748d67 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/sw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/sw.c
@@ -250,7 +250,6 @@ static struct rtl_hal_ops rtl8192ee_hal_ops = {
 	.set_rfreg = rtl92ee_phy_set_rf_reg,
 	.fill_h2c_cmd = rtl92ee_fill_h2c_cmd,
 	.get_btc_status = rtl92ee_get_btc_status,
-	.rx_command_packet = rtl92ee_rx_command_packet,
 	.c2h_content_parsing = rtl_c2h_content_parsing,
 	.c2h_ra_report_handler = rtl92ee_c2h_ra_report_handler,
 };
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/trx.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/trx.c
index e525c2bb4457..14d6e3fc5767 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/trx.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/trx.c
@@ -1072,27 +1072,3 @@ bool rtl92ee_is_tx_desc_closed(struct ieee80211_hw *hw, u8 hw_queue, u16 index)
 void rtl92ee_tx_polling(struct ieee80211_hw *hw, u8 hw_queue)
 {
 }
-
-u32 rtl92ee_rx_command_packet(struct ieee80211_hw *hw,
-			      const struct rtl_stats *status,
-			      struct sk_buff *skb)
-{
-	u32 result = 0;
-	struct rtl_priv *rtlpriv = rtl_priv(hw);
-
-	switch (status->packet_report_type) {
-	case NORMAL_RX:
-		result = 0;
-		break;
-	case C2H_PACKET:
-		rtl_c2h_packet_handler(hw, skb->data, (u8)skb->len);
-		result = 1;
-		break;
-	default:
-		RT_TRACE(rtlpriv, COMP_RECV, DBG_TRACE,
-			 "Unknown packet type %d\n", status->packet_report_type);
-		break;
-	}
-
-	return result;
-}
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/trx.h b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/trx.h
index 48c16fff20c6..45df3e79f490 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/trx.h
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/trx.h
@@ -762,7 +762,4 @@ void rtl92ee_tx_polling(struct ieee80211_hw *hw, u8 hw_queue);
 void rtl92ee_tx_fill_cmddesc(struct ieee80211_hw *hw, u8 *pdesc,
 			     bool firstseg, bool lastseg,
 			     struct sk_buff *skb);
-u32 rtl92ee_rx_command_packet(struct ieee80211_hw *hw,
-			      const struct rtl_stats *status,
-			      struct sk_buff *skb);
 #endif
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/sw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/sw.c
index 9df994965c4a..a41e67b3f38b 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/sw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/sw.c
@@ -261,7 +261,6 @@ static struct rtl_hal_ops rtl8723be_hal_ops = {
 	.set_rfreg = rtl8723be_phy_set_rf_reg,
 	.fill_h2c_cmd = rtl8723be_fill_h2c_cmd,
 	.get_btc_status = rtl8723be_get_btc_status,
-	.rx_command_packet = rtl8723be_rx_command_packet,
 	.is_fw_header = is_fw_header,
 	.c2h_content_parsing = rtl_c2h_content_parsing,
 };
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/trx.c b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/trx.c
index 0c0cece9d5f9..9f8dfb5af774 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/trx.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/trx.c
@@ -761,27 +761,3 @@ void rtl8723be_tx_polling(struct ieee80211_hw *hw, u8 hw_queue)
 			       BIT(0) << (hw_queue));
 	}
 }
-
-u32 rtl8723be_rx_command_packet(struct ieee80211_hw *hw,
-				const struct rtl_stats *status,
-				struct sk_buff *skb)
-{
-	u32 result = 0;
-	struct rtl_priv *rtlpriv = rtl_priv(hw);
-
-	switch (status->packet_report_type) {
-	case NORMAL_RX:
-			result = 0;
-			break;
-	case C2H_PACKET:
-			rtl_c2h_packet_handler(hw, skb->data, (u8)skb->len);
-			result = 1;
-			break;
-	default:
-			RT_TRACE(rtlpriv, COMP_RECV, DBG_TRACE,
-				 "No this packet type!!\n");
-			break;
-	}
-
-	return result;
-}
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/trx.h b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/trx.h
index 988bf0586674..609f7ad7f787 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/trx.h
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/trx.h
@@ -632,7 +632,4 @@ void rtl8723be_tx_polling(struct ieee80211_hw *hw, u8 hw_queue);
 void rtl8723be_tx_fill_cmddesc(struct ieee80211_hw *hw, u8 *pdesc,
 			       bool firstseg, bool lastseg,
 			       struct sk_buff *skb);
-u32 rtl8723be_rx_command_packet(struct ieee80211_hw *hw,
-				const struct rtl_stats *status,
-				struct sk_buff *skb);
 #endif
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/sw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/sw.c
index bcabf4dc6353..8ff8a406db52 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/sw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/sw.c
@@ -303,7 +303,6 @@ static struct rtl_hal_ops rtl8821ae_hal_ops = {
 	.set_rfreg = rtl8821ae_phy_set_rf_reg,
 	.fill_h2c_cmd = rtl8821ae_fill_h2c_cmd,
 	.get_btc_status = rtl8821ae_get_btc_status,
-	.rx_command_packet = rtl8821ae_rx_command_packet,
 	.c2h_content_parsing = rtl_c2h_content_parsing,
 	.c2h_ra_report_handler = rtl8821ae_c2h_ra_report_handler,
 	.add_wowlan_pattern = rtl8821ae_add_wowlan_pattern,
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/trx.c b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/trx.c
index a93d8569c186..d7960dd5bf1a 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/trx.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/trx.c
@@ -1005,29 +1005,3 @@ void rtl8821ae_tx_polling(struct ieee80211_hw *hw, u8 hw_queue)
 			       BIT(0) << (hw_queue));
 	}
 }
-
-u32 rtl8821ae_rx_command_packet(struct ieee80211_hw *hw,
-				const struct rtl_stats *status,
-				struct sk_buff *skb)
-{
-	u32 result = 0;
-	struct rtl_priv *rtlpriv = rtl_priv(hw);
-
-	switch (status->packet_report_type) {
-	case NORMAL_RX:
-		result = 0;
-		break;
-	case C2H_PACKET:
-		rtl_c2h_packet_handler(hw, skb->data, (u8)skb->len);
-		result = 1;
-		RT_TRACE(rtlpriv, COMP_RECV, DBG_LOUD,
-			 "skb->len=%d\n\n", skb->len);
-		break;
-	default:
-		RT_TRACE(rtlpriv, COMP_RECV, DBG_LOUD,
-			 "No this packet type!!\n");
-		break;
-	}
-
-	return result;
-}
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/trx.h b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/trx.h
index 221dd2b29d3b..4ff0968dba81 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/trx.h
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/trx.h
@@ -628,7 +628,4 @@ void rtl8821ae_tx_polling(struct ieee80211_hw *hw, u8 hw_queue);
 void rtl8821ae_tx_fill_cmddesc(struct ieee80211_hw *hw, u8 *pdesc,
 			       bool firstseg, bool lastseg,
 			       struct sk_buff *skb);
-u32 rtl8821ae_rx_command_packet(struct ieee80211_hw *hw,
-				const struct rtl_stats *status,
-				struct sk_buff *skb);
 #endif
diff --git a/drivers/net/wireless/realtek/rtlwifi/wifi.h b/drivers/net/wireless/realtek/rtlwifi/wifi.h
index 392b3e84db7e..b40d8f5bbdce 100644
--- a/drivers/net/wireless/realtek/rtlwifi/wifi.h
+++ b/drivers/net/wireless/realtek/rtlwifi/wifi.h
@@ -2340,8 +2340,6 @@ struct rtl_hal_ops {
 	void (*set_default_port_id_cmd)(struct ieee80211_hw *hw);
 	bool (*get_btc_status) (void);
 	bool (*is_fw_header)(struct rtlwifi_firmware_header *hdr);
-	u32 (*rx_command_packet)(struct ieee80211_hw *hw,
-				 const struct rtl_stats *status, struct sk_buff *skb);
 	void (*add_wowlan_pattern)(struct ieee80211_hw *hw,
 				   struct rtl_wow_pattern *rtl_pattern,
 				   u8 index);
-- 
cgit v1.2.3-59-g8ed1b


From b4f6ee489cff1e9c8637996b7c1079e8bdd979df Mon Sep 17 00:00:00 2001
From: Ping-Ke Shih <pkshih@realtek.com>
Date: Fri, 18 May 2018 17:30:04 +0800
Subject: rtlwifi: remove hal_op c2h_content_parsing

Similar to rx_command_packet, we can call rtl_c2h_content_parsing so the
hal_op isn't necessary.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/realtek/rtlwifi/base.c         | 7 +++----
 drivers/net/wireless/realtek/rtlwifi/rtl8192ee/sw.c | 1 -
 drivers/net/wireless/realtek/rtlwifi/rtl8723be/sw.c | 1 -
 drivers/net/wireless/realtek/rtlwifi/rtl8821ae/sw.c | 1 -
 drivers/net/wireless/realtek/rtlwifi/wifi.h         | 2 --
 5 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/base.c b/drivers/net/wireless/realtek/rtlwifi/base.c
index 0d03e98f9cb4..927b7d231576 100644
--- a/drivers/net/wireless/realtek/rtlwifi/base.c
+++ b/drivers/net/wireless/realtek/rtlwifi/base.c
@@ -2346,7 +2346,6 @@ void rtl_c2h_content_parsing(struct ieee80211_hw *hw, u8 cmd_id,
 		break;
 	}
 }
-EXPORT_SYMBOL_GPL(rtl_c2h_content_parsing);
 
 void rtl_c2h_packet_handler(struct ieee80211_hw *hw, u8 *buffer, u8 len)
 {
@@ -2401,9 +2400,9 @@ void rtl_c2hcmd_launcher(struct ieee80211_hw *hw, int exec)
 		if (!c2hcmd)
 			break;
 
-		if (rtlpriv->cfg->ops->c2h_content_parsing && exec)
-			rtlpriv->cfg->ops->c2h_content_parsing(hw,
-					c2hcmd->tag, c2hcmd->len, c2hcmd->val);
+		if (exec)
+			rtl_c2h_content_parsing(hw, c2hcmd->tag,
+						c2hcmd->len, c2hcmd->val);
 
 		/* free */
 		kfree(c2hcmd->val);
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/sw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/sw.c
index 5b67ad748d67..9ea62599ecbb 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/sw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/sw.c
@@ -250,7 +250,6 @@ static struct rtl_hal_ops rtl8192ee_hal_ops = {
 	.set_rfreg = rtl92ee_phy_set_rf_reg,
 	.fill_h2c_cmd = rtl92ee_fill_h2c_cmd,
 	.get_btc_status = rtl92ee_get_btc_status,
-	.c2h_content_parsing = rtl_c2h_content_parsing,
 	.c2h_ra_report_handler = rtl92ee_c2h_ra_report_handler,
 };
 
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/sw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/sw.c
index a41e67b3f38b..c9f7b042d9c6 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8723be/sw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723be/sw.c
@@ -262,7 +262,6 @@ static struct rtl_hal_ops rtl8723be_hal_ops = {
 	.fill_h2c_cmd = rtl8723be_fill_h2c_cmd,
 	.get_btc_status = rtl8723be_get_btc_status,
 	.is_fw_header = is_fw_header,
-	.c2h_content_parsing = rtl_c2h_content_parsing,
 };
 
 static struct rtl_mod_params rtl8723be_mod_params = {
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/sw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/sw.c
index 8ff8a406db52..77f6401021c9 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/sw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/sw.c
@@ -303,7 +303,6 @@ static struct rtl_hal_ops rtl8821ae_hal_ops = {
 	.set_rfreg = rtl8821ae_phy_set_rf_reg,
 	.fill_h2c_cmd = rtl8821ae_fill_h2c_cmd,
 	.get_btc_status = rtl8821ae_get_btc_status,
-	.c2h_content_parsing = rtl_c2h_content_parsing,
 	.c2h_ra_report_handler = rtl8821ae_c2h_ra_report_handler,
 	.add_wowlan_pattern = rtl8821ae_add_wowlan_pattern,
 };
diff --git a/drivers/net/wireless/realtek/rtlwifi/wifi.h b/drivers/net/wireless/realtek/rtlwifi/wifi.h
index b40d8f5bbdce..930e1ec2280f 100644
--- a/drivers/net/wireless/realtek/rtlwifi/wifi.h
+++ b/drivers/net/wireless/realtek/rtlwifi/wifi.h
@@ -2344,8 +2344,6 @@ struct rtl_hal_ops {
 				   struct rtl_wow_pattern *rtl_pattern,
 				   u8 index);
 	u16 (*get_available_desc)(struct ieee80211_hw *hw, u8 q_idx);
-	void (*c2h_content_parsing)(struct ieee80211_hw *hw, u8 tag, u8 len,
-				    u8 *val);
 	void (*c2h_ra_report_handler)(struct ieee80211_hw *hw,
 				      u8 *cmd_buf, u8 cmd_len);
 };
-- 
cgit v1.2.3-59-g8ed1b


From 9ae6ed271a60f0e4476fe94f0efc5aba184a9c57 Mon Sep 17 00:00:00 2001
From: Ping-Ke Shih <pkshih@realtek.com>
Date: Fri, 18 May 2018 17:30:05 +0800
Subject: rtlwifi: use sk_buff to queue C2H commands

We use 'struct rtl_c2hcmd' to store C2H commands originally, and the code
is slightly complex to enqueue and dequeue and also wastes time to
allocate and memcpy data. Since C2H commands are asynchronous events,
they can be processed in work queue, so RX ISR enqueues C2H result in
removal of rtl_c2h_packet_handler().

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/realtek/rtlwifi/base.c | 96 +++++++----------------------
 drivers/net/wireless/realtek/rtlwifi/base.h |  5 +-
 drivers/net/wireless/realtek/rtlwifi/pci.c  |  3 +-
 drivers/net/wireless/realtek/rtlwifi/wifi.h |  2 +-
 4 files changed, 25 insertions(+), 81 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/base.c b/drivers/net/wireless/realtek/rtlwifi/base.c
index 927b7d231576..61f12f86fcd4 100644
--- a/drivers/net/wireless/realtek/rtlwifi/base.c
+++ b/drivers/net/wireless/realtek/rtlwifi/base.c
@@ -571,9 +571,9 @@ int rtl_init_core(struct ieee80211_hw *hw)
 	spin_lock_init(&rtlpriv->locks.iqk_lock);
 	/* <5> init list */
 	INIT_LIST_HEAD(&rtlpriv->entry_list);
-	INIT_LIST_HEAD(&rtlpriv->c2hcmd_list);
 	INIT_LIST_HEAD(&rtlpriv->scan_list.list);
 	skb_queue_head_init(&rtlpriv->tx_report.queue);
+	skb_queue_head_init(&rtlpriv->c2hcmd_queue);
 
 	rtlmac->link_state = MAC80211_NOLINK;
 
@@ -2262,56 +2262,36 @@ void rtl_fwevt_wq_callback(void *data)
 	rtlpriv->cfg->ops->c2h_command_handle(hw);
 }
 
-void rtl_c2hcmd_enqueue(struct ieee80211_hw *hw, u8 tag, u8 len, u8 *val)
+void rtl_c2hcmd_enqueue(struct ieee80211_hw *hw, struct sk_buff *skb)
 {
 	struct rtl_priv *rtlpriv = rtl_priv(hw);
 	unsigned long flags;
-	struct rtl_c2hcmd *c2hcmd;
-
-	c2hcmd = kmalloc(sizeof(*c2hcmd),
-			 in_interrupt() ? GFP_ATOMIC : GFP_KERNEL);
-
-	if (!c2hcmd)
-		goto label_err;
-
-	c2hcmd->val = kmalloc(len,
-			      in_interrupt() ? GFP_ATOMIC : GFP_KERNEL);
-
-	if (!c2hcmd->val)
-		goto label_err2;
-
-	/* fill data */
-	c2hcmd->tag = tag;
-	c2hcmd->len = len;
-	memcpy(c2hcmd->val, val, len);
 
 	/* enqueue */
 	spin_lock_irqsave(&rtlpriv->locks.c2hcmd_lock, flags);
 
-	list_add_tail(&c2hcmd->list, &rtlpriv->c2hcmd_list);
+	__skb_queue_tail(&rtlpriv->c2hcmd_queue, skb);
 
 	spin_unlock_irqrestore(&rtlpriv->locks.c2hcmd_lock, flags);
 
 	/* wake up wq */
 	queue_delayed_work(rtlpriv->works.rtl_wq, &rtlpriv->works.c2hcmd_wq, 0);
-
-	return;
-
-label_err2:
-	kfree(c2hcmd);
-
-label_err:
-	RT_TRACE(rtlpriv, COMP_CMD, DBG_WARNING,
-		 "C2H cmd enqueue fail.\n");
 }
 EXPORT_SYMBOL(rtl_c2hcmd_enqueue);
 
-void rtl_c2h_content_parsing(struct ieee80211_hw *hw, u8 cmd_id,
-			     u8 cmd_len, u8 *cmd_buf)
+static void rtl_c2h_content_parsing(struct ieee80211_hw *hw,
+				    struct sk_buff *skb)
 {
 	struct rtl_priv *rtlpriv = rtl_priv(hw);
 	struct rtl_hal_ops *hal_ops = rtlpriv->cfg->ops;
 	const struct rtl_btc_ops *btc_ops = rtlpriv->btcoexist.btc_ops;
+	u8 cmd_id, cmd_seq, cmd_len;
+	u8 *cmd_buf = NULL;
+
+	cmd_id = skb->data[0];
+	cmd_seq = skb->data[1];
+	cmd_len = skb->len - 2;
+	cmd_buf = skb->data + 2;
 
 	switch (cmd_id) {
 	case C2H_DBG:
@@ -2347,67 +2327,35 @@ void rtl_c2h_content_parsing(struct ieee80211_hw *hw, u8 cmd_id,
 	}
 }
 
-void rtl_c2h_packet_handler(struct ieee80211_hw *hw, u8 *buffer, u8 len)
-{
-	struct rtl_priv *rtlpriv = rtl_priv(hw);
-	u8 c2h_cmd_id = 0, c2h_cmd_seq = 0, c2h_cmd_len = 0;
-	u8 *tmp_buf = NULL;
-
-	c2h_cmd_id = buffer[0];
-	c2h_cmd_seq = buffer[1];
-	c2h_cmd_len = len - 2;
-	tmp_buf = buffer + 2;
-
-	RT_TRACE(rtlpriv, COMP_FW, DBG_TRACE,
-		 "[C2H packet], c2hCmdId=0x%x, c2hCmdSeq=0x%x, c2hCmdLen=%d\n",
-		 c2h_cmd_id, c2h_cmd_seq, c2h_cmd_len);
-
-	RT_PRINT_DATA(rtlpriv, COMP_FW, DBG_TRACE,
-		      "[C2H packet], Content Hex:\n", tmp_buf, c2h_cmd_len);
-
-	switch (c2h_cmd_id) {
-	case C2H_BT_INFO:
-	case C2H_BT_MP:
-		rtl_c2hcmd_enqueue(hw, c2h_cmd_id, c2h_cmd_len, tmp_buf);
-		break;
-	default:
-		rtl_c2h_content_parsing(hw, c2h_cmd_id, c2h_cmd_len, tmp_buf);
-		break;
-	}
-}
-EXPORT_SYMBOL_GPL(rtl_c2h_packet_handler);
-
 void rtl_c2hcmd_launcher(struct ieee80211_hw *hw, int exec)
 {
 	struct rtl_priv *rtlpriv = rtl_priv(hw);
+	struct sk_buff *skb;
 	unsigned long flags;
-	struct rtl_c2hcmd *c2hcmd;
 	int i;
 
 	for (i = 0; i < 200; i++) {
 		/* dequeue a task */
 		spin_lock_irqsave(&rtlpriv->locks.c2hcmd_lock, flags);
 
-		c2hcmd = list_first_entry_or_null(&rtlpriv->c2hcmd_list,
-						  struct rtl_c2hcmd, list);
-
-		if (c2hcmd)
-			list_del(&c2hcmd->list);
+		skb = __skb_dequeue(&rtlpriv->c2hcmd_queue);
 
 		spin_unlock_irqrestore(&rtlpriv->locks.c2hcmd_lock, flags);
 
 		/* do it */
-		if (!c2hcmd)
+		if (!skb)
 			break;
 
+		RT_TRACE(rtlpriv, COMP_FW, DBG_DMESG, "C2H rx_desc_shift=%d\n",
+			 *((u8 *)skb->cb));
+		RT_PRINT_DATA(rtlpriv, COMP_FW, DBG_DMESG,
+			      "C2H data: ", skb->data, skb->len);
+
 		if (exec)
-			rtl_c2h_content_parsing(hw, c2hcmd->tag,
-						c2hcmd->len, c2hcmd->val);
+			rtl_c2h_content_parsing(hw, skb);
 
 		/* free */
-		kfree(c2hcmd->val);
-
-		kfree(c2hcmd);
+		dev_kfree_skb_any(skb);
 	}
 }
 
diff --git a/drivers/net/wireless/realtek/rtlwifi/base.h b/drivers/net/wireless/realtek/rtlwifi/base.h
index 3bf174e5b07e..912f205779c3 100644
--- a/drivers/net/wireless/realtek/rtlwifi/base.h
+++ b/drivers/net/wireless/realtek/rtlwifi/base.h
@@ -161,10 +161,7 @@ void rtl_watchdog_wq_callback(void *data);
 void rtl_fwevt_wq_callback(void *data);
 void rtl_c2hcmd_wq_callback(void *data);
 void rtl_c2hcmd_launcher(struct ieee80211_hw *hw, int exec);
-void rtl_c2hcmd_enqueue(struct ieee80211_hw *hw, u8 tag, u8 len, u8 *val);
-void rtl_c2h_content_parsing(struct ieee80211_hw *hw, u8 c2h_cmd_id,
-			     u8 c2h_cmd_len, u8 *tmp_buf);
-void rtl_c2h_packet_handler(struct ieee80211_hw *hw, u8 *buffer, u8 len);
+void rtl_c2hcmd_enqueue(struct ieee80211_hw *hw, struct sk_buff *skb);
 
 u8 rtl_mrate_idx_to_arfr_id(struct ieee80211_hw *hw, u8 rate_index,
 			    enum wireless_mode wirelessmode);
diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c b/drivers/net/wireless/realtek/rtlwifi/pci.c
index dd51c67c09fa..ae13bcfb3bf0 100644
--- a/drivers/net/wireless/realtek/rtlwifi/pci.c
+++ b/drivers/net/wireless/realtek/rtlwifi/pci.c
@@ -831,8 +831,7 @@ static void _rtl_pci_rx_interrupt(struct ieee80211_hw *hw)
 		}
 		/* handle command packet here */
 		if (stats.packet_report_type == C2H_PACKET) {
-			rtl_c2h_packet_handler(hw, skb->data, (u8)skb->len);
-			dev_kfree_skb_any(skb);
+			rtl_c2hcmd_enqueue(hw, skb);
 			goto new_trx_end;
 		}
 
diff --git a/drivers/net/wireless/realtek/rtlwifi/wifi.h b/drivers/net/wireless/realtek/rtlwifi/wifi.h
index 930e1ec2280f..9e620b943f8c 100644
--- a/drivers/net/wireless/realtek/rtlwifi/wifi.h
+++ b/drivers/net/wireless/realtek/rtlwifi/wifi.h
@@ -2791,7 +2791,7 @@ struct rtl_priv {
 	struct list_head entry_list;
 
 	/* c2hcmd list for kthread level access */
-	struct list_head c2hcmd_list;
+	struct sk_buff_head c2hcmd_queue;
 
 	struct rtl_debug dbg;
 	int max_fw_size;
-- 
cgit v1.2.3-59-g8ed1b


From 9644032e307022ecb1a436537cacedb91d569d98 Mon Sep 17 00:00:00 2001
From: Ping-Ke Shih <pkshih@realtek.com>
Date: Fri, 18 May 2018 17:30:06 +0800
Subject: rtlwifi: access skb->data to get C2H data by macro

The format of C2H data is ID(1 byte) + Length(1 byte) + value, and it is
more readable to use macros to access C2H data.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/realtek/rtlwifi/base.c | 8 ++++----
 drivers/net/wireless/realtek/rtlwifi/wifi.h | 5 +++++
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/base.c b/drivers/net/wireless/realtek/rtlwifi/base.c
index 61f12f86fcd4..a5939ddfa9cb 100644
--- a/drivers/net/wireless/realtek/rtlwifi/base.c
+++ b/drivers/net/wireless/realtek/rtlwifi/base.c
@@ -2288,10 +2288,10 @@ static void rtl_c2h_content_parsing(struct ieee80211_hw *hw,
 	u8 cmd_id, cmd_seq, cmd_len;
 	u8 *cmd_buf = NULL;
 
-	cmd_id = skb->data[0];
-	cmd_seq = skb->data[1];
-	cmd_len = skb->len - 2;
-	cmd_buf = skb->data + 2;
+	cmd_id = GET_C2H_CMD_ID(skb->data);
+	cmd_seq = GET_C2H_SEQ(skb->data);
+	cmd_len = skb->len - C2H_DATA_OFFSET;
+	cmd_buf = GET_C2H_DATA_PTR(skb->data);
 
 	switch (cmd_id) {
 	case C2H_DBG:
diff --git a/drivers/net/wireless/realtek/rtlwifi/wifi.h b/drivers/net/wireless/realtek/rtlwifi/wifi.h
index 9e620b943f8c..0f3b98c5227f 100644
--- a/drivers/net/wireless/realtek/rtlwifi/wifi.h
+++ b/drivers/net/wireless/realtek/rtlwifi/wifi.h
@@ -177,6 +177,11 @@ enum rtl_c2h_evt_v2 {
 	C2H_V2_CCX_RPT = 0x0F,
 };
 
+#define GET_C2H_CMD_ID(c2h)	({u8 *__c2h = c2h; __c2h[0]; })
+#define GET_C2H_SEQ(c2h)	({u8 *__c2h = c2h; __c2h[1]; })
+#define C2H_DATA_OFFSET		2
+#define GET_C2H_DATA_PTR(c2h)	({u8 *__c2h = c2h; &__c2h[C2H_DATA_OFFSET]; })
+
 #define GET_TX_REPORT_SN_V1(c2h)	(c2h[6])
 #define GET_TX_REPORT_ST_V1(c2h)	(c2h[0] & 0xC0)
 #define GET_TX_REPORT_RETRY_V1(c2h)	(c2h[2] & 0x3F)
-- 
cgit v1.2.3-59-g8ed1b


From 0a9f8f0a1ba9688a9ff5f6b83c4cc1eecd2fc9f2 Mon Sep 17 00:00:00 2001
From: Ping-Ke Shih <pkshih@realtek.com>
Date: Fri, 18 May 2018 17:30:07 +0800
Subject: rtlwifi: fix btmpinfo timeout while processing C2H_BT_INFO

In former patch, I enqueu all C2H commands and processed by a workqueue.
In case C2H_BT_INFO will issue a H2C command to set BT reg, and wait for
a C2H ack. But it is totally impossible that C2H workqueue waits for a
C2H command, so kernel log warn
	rtlwifi: :<0> btmpinfo wait (req_num=0) timeout

Since the C2H ack command C2H_BT_MP can be safely processed in interrupt
context, add a fast command path to deal with the command.

Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/realtek/rtlwifi/base.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/drivers/net/wireless/realtek/rtlwifi/base.c b/drivers/net/wireless/realtek/rtlwifi/base.c
index a5939ddfa9cb..39c817eddd78 100644
--- a/drivers/net/wireless/realtek/rtlwifi/base.c
+++ b/drivers/net/wireless/realtek/rtlwifi/base.c
@@ -2262,11 +2262,33 @@ void rtl_fwevt_wq_callback(void *data)
 	rtlpriv->cfg->ops->c2h_command_handle(hw);
 }
 
+static void rtl_c2h_content_parsing(struct ieee80211_hw *hw,
+				    struct sk_buff *skb);
+
+static bool rtl_c2h_fast_cmd(struct ieee80211_hw *hw, struct sk_buff *skb)
+{
+	u8 cmd_id = GET_C2H_CMD_ID(skb->data);
+
+	switch (cmd_id) {
+	case C2H_BT_MP:
+		return true;
+	default:
+		break;
+	}
+
+	return false;
+}
+
 void rtl_c2hcmd_enqueue(struct ieee80211_hw *hw, struct sk_buff *skb)
 {
 	struct rtl_priv *rtlpriv = rtl_priv(hw);
 	unsigned long flags;
 
+	if (rtl_c2h_fast_cmd(hw, skb)) {
+		rtl_c2h_content_parsing(hw, skb);
+		return;
+	}
+
 	/* enqueue */
 	spin_lock_irqsave(&rtlpriv->locks.c2hcmd_lock, flags);
 
-- 
cgit v1.2.3-59-g8ed1b


From 88027c8ff0a3f87d5d06d53ee25a41b90d8bccec Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Wed, 23 May 2018 18:34:45 +0800
Subject: atmel: Add missing call to pci_disable_device()

add pci_disable_device in error handling while init_atmel_card failed.

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/atmel/atmel_pci.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/atmel/atmel_pci.c b/drivers/net/wireless/atmel/atmel_pci.c
index bcf1f274a251..30df58a41a83 100644
--- a/drivers/net/wireless/atmel/atmel_pci.c
+++ b/drivers/net/wireless/atmel/atmel_pci.c
@@ -61,8 +61,10 @@ static int atmel_pci_probe(struct pci_dev *pdev,
 	dev = init_atmel_card(pdev->irq, pdev->resource[1].start,
 			      ATMEL_FW_TYPE_506,
 			      &pdev->dev, NULL, NULL);
-	if (!dev)
+	if (!dev) {
+		pci_disable_device(pdev);
 		return -ENODEV;
+	}
 
 	pci_set_drvdata(pdev, dev);
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From 6e91d48371e79862ea2c05867aaebe4afe55a865 Mon Sep 17 00:00:00 2001
From: Eyal Reizer <eyalreizer@gmail.com>
Date: Mon, 28 May 2018 11:36:42 +0300
Subject: wlcore: sdio: check for valid platform device data before suspend

the wl pointer can be null In case only wlcore_sdio is probed while
no WiLink module is successfully probed, as in the case of mounting a
wl12xx module while using a device tree file configured with wl18xx
related settings.
In this case the system was crashing in wl1271_suspend() as platform
device data is not set.
Make sure wl the pointer is valid before using it.

Signed-off-by: Eyal Reizer <eyalr@ti.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ti/wlcore/sdio.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/wireless/ti/wlcore/sdio.c b/drivers/net/wireless/ti/wlcore/sdio.c
index 15d5ac126061..750bea3574ee 100644
--- a/drivers/net/wireless/ti/wlcore/sdio.c
+++ b/drivers/net/wireless/ti/wlcore/sdio.c
@@ -399,6 +399,11 @@ static int wl1271_suspend(struct device *dev)
 	mmc_pm_flag_t sdio_flags;
 	int ret = 0;
 
+	if (!wl) {
+		dev_err(dev, "no wilink module was probed\n");
+		goto out;
+	}
+
 	dev_dbg(dev, "wl1271 suspend. wow_enabled: %d\n",
 		wl->wow_enabled);
 
-- 
cgit v1.2.3-59-g8ed1b


From a53e8f3edaa0f28458f37dc5654e1912a8a00f26 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Sat, 26 May 2018 16:00:19 +0100
Subject: rsi: fix spelling mistake "Uknown" -> "Unknown"

Trivial fix to spelling mistake in rsi_dbg message text

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/rsi/rsi_91x_mac80211.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/rsi/rsi_91x_mac80211.c b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
index bfa7569c85bb..2ca7464b7fa3 100644
--- a/drivers/net/wireless/rsi/rsi_91x_mac80211.c
+++ b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
@@ -1103,7 +1103,7 @@ static int rsi_mac80211_ampdu_action(struct ieee80211_hw *hw,
 		break;
 
 	default:
-		rsi_dbg(ERR_ZONE, "%s: Uknown AMPDU action\n", __func__);
+		rsi_dbg(ERR_ZONE, "%s: Unknown AMPDU action\n", __func__);
 		break;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 618fd1ed17d4cf193d85747c04a8b836b8fc107e Mon Sep 17 00:00:00 2001
From: Ganapathi Bhat <gbhat@marvell.com>
Date: Thu, 24 May 2018 16:36:28 +0530
Subject: mwifiex: avoid exporting mwifiex_send_cmd

This is a follow-up patch for commit 21c5c83ce833
("mwifiex: support sysfs initiated device coredump").

Let us avoid exporting mwifiex_send_cmd and instead use a utility
function mwifiex_fw_dump_event to achive the work.

Signed-off-by: Ganapathi Bhat <gbhat@marvell.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/marvell/mwifiex/cmdevt.c | 1 -
 drivers/net/wireless/marvell/mwifiex/main.h   | 1 +
 drivers/net/wireless/marvell/mwifiex/usb.c    | 5 ++---
 drivers/net/wireless/marvell/mwifiex/util.c   | 7 +++++++
 4 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wireless/marvell/mwifiex/cmdevt.c b/drivers/net/wireless/marvell/mwifiex/cmdevt.c
index 8be1e69657a7..9cfcdf6bec52 100644
--- a/drivers/net/wireless/marvell/mwifiex/cmdevt.c
+++ b/drivers/net/wireless/marvell/mwifiex/cmdevt.c
@@ -674,7 +674,6 @@ int mwifiex_send_cmd(struct mwifiex_private *priv, u16 cmd_no,
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(mwifiex_send_cmd);
 
 /*
  * This function queues a command to the command pending queue.
diff --git a/drivers/net/wireless/marvell/mwifiex/main.h b/drivers/net/wireless/marvell/mwifiex/main.h
index 8ae74ed78805..69ac0a22c28c 100644
--- a/drivers/net/wireless/marvell/mwifiex/main.h
+++ b/drivers/net/wireless/marvell/mwifiex/main.h
@@ -1691,6 +1691,7 @@ void mwifiex_drv_info_dump(struct mwifiex_adapter *adapter);
 void mwifiex_prepare_fw_dump_info(struct mwifiex_adapter *adapter);
 void mwifiex_upload_device_dump(struct mwifiex_adapter *adapter);
 void *mwifiex_alloc_dma_align_buf(int rx_len, gfp_t flags);
+void mwifiex_fw_dump_event(struct mwifiex_private *priv);
 void mwifiex_queue_main_work(struct mwifiex_adapter *adapter);
 int mwifiex_get_wakeup_reason(struct mwifiex_private *priv, u16 action,
 			      int cmd_type,
diff --git a/drivers/net/wireless/marvell/mwifiex/usb.c b/drivers/net/wireless/marvell/mwifiex/usb.c
index 7aa39878ce49..bc475b83bb15 100644
--- a/drivers/net/wireless/marvell/mwifiex/usb.c
+++ b/drivers/net/wireless/marvell/mwifiex/usb.c
@@ -658,9 +658,8 @@ static void mwifiex_usb_coredump(struct device *dev)
 	struct usb_interface *intf = to_usb_interface(dev);
 	struct usb_card_rec *card = usb_get_intfdata(intf);
 
-	mwifiex_send_cmd(mwifiex_get_priv(card->adapter, MWIFIEX_BSS_ROLE_ANY),
-			 HostCmd_CMD_FW_DUMP_EVENT, HostCmd_ACT_GEN_SET, 0,
-			 NULL, true);
+	mwifiex_fw_dump_event(mwifiex_get_priv(card->adapter,
+					       MWIFIEX_BSS_ROLE_ANY));
 }
 
 static struct usb_driver mwifiex_usb_driver = {
diff --git a/drivers/net/wireless/marvell/mwifiex/util.c b/drivers/net/wireless/marvell/mwifiex/util.c
index 51ccf10f4413..6dd212898117 100644
--- a/drivers/net/wireless/marvell/mwifiex/util.c
+++ b/drivers/net/wireless/marvell/mwifiex/util.c
@@ -757,3 +757,10 @@ void *mwifiex_alloc_dma_align_buf(int rx_len, gfp_t flags)
 	return skb;
 }
 EXPORT_SYMBOL_GPL(mwifiex_alloc_dma_align_buf);
+
+void mwifiex_fw_dump_event(struct mwifiex_private *priv)
+{
+	mwifiex_send_cmd(priv, HostCmd_CMD_FW_DUMP_EVENT, HostCmd_ACT_GEN_SET,
+			 0, NULL, true);
+}
+EXPORT_SYMBOL_GPL(mwifiex_fw_dump_event);
-- 
cgit v1.2.3-59-g8ed1b


From b817047ae70c0bd67b677b65d0d69d72cd6e9728 Mon Sep 17 00:00:00 2001
From: Ganapathi Bhat <gbhat@marvell.com>
Date: Thu, 24 May 2018 19:18:27 +0530
Subject: mwifiex: handle race during mwifiex_usb_disconnect

Race condition is observed during rmmod of mwifiex_usb:

1. The rmmod thread will call mwifiex_usb_disconnect(), download
   SHUTDOWN command and do wait_event_interruptible_timeout(),
   waiting for response.

2. The main thread will handle the response and will do a
   wake_up_interruptible(), unblocking rmmod thread.

3. On getting unblocked, rmmod thread  will make rx_cmd.urb = NULL in
   mwifiex_usb_free().

4. The main thread will try to resubmit rx_cmd.urb in
   mwifiex_usb_submit_rx_urb(), which is NULL.

To fix, wait for main thread to complete before calling
mwifiex_usb_free().

Signed-off-by: Ganapathi Bhat <gbhat@marvell.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/marvell/mwifiex/usb.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/wireless/marvell/mwifiex/usb.c b/drivers/net/wireless/marvell/mwifiex/usb.c
index bc475b83bb15..6e3cf9817730 100644
--- a/drivers/net/wireless/marvell/mwifiex/usb.c
+++ b/drivers/net/wireless/marvell/mwifiex/usb.c
@@ -644,6 +644,9 @@ static void mwifiex_usb_disconnect(struct usb_interface *intf)
 					 MWIFIEX_FUNC_SHUTDOWN);
 	}
 
+	if (adapter->workqueue)
+		flush_workqueue(adapter->workqueue);
+
 	mwifiex_usb_free(card);
 
 	mwifiex_dbg(adapter, FATAL,
-- 
cgit v1.2.3-59-g8ed1b


From ae30bdaa4c2747e96c880d909b48484f41bfd51d Mon Sep 17 00:00:00 2001
From: Ganapathi Bhat <gbhat@marvell.com>
Date: Sat, 26 May 2018 02:09:22 +0530
Subject: mwifiex: skip sending GT_REKEY_OFFLOAD_CFG if firmware has no support

If firmware does not support embedded supplicant, then it in turn
will not support GT rekey offloading. If this is the case, then
driver must not advertise WOWLAN flags related to GTK rekey and
it must also skip sending the GT_REKEY_OFFLOAD_CFG command.

Signed-off-by: Shrenik Shikhare <shrenik@marvell.com>
Signed-off-by: Cathy Luo <cluo@marvell.com>
Signed-off-by: Ganapathi Bhat <gbhat@marvell.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/marvell/mwifiex/cfg80211.c  | 18 +++++++++++++++++-
 drivers/net/wireless/marvell/mwifiex/fw.h        |  1 +
 drivers/net/wireless/marvell/mwifiex/sta_event.c |  3 +++
 3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
index 54a2297010d2..d04a59903e82 100644
--- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c
+++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
@@ -3555,6 +3555,9 @@ static int mwifiex_set_rekey_data(struct wiphy *wiphy, struct net_device *dev,
 {
 	struct mwifiex_private *priv = mwifiex_netdev_get_priv(dev);
 
+	if (!ISSUPP_FIRMWARE_SUPPLICANT(priv->adapter->fw_cap_info))
+		return -EOPNOTSUPP;
+
 	return mwifiex_send_cmd(priv, HostCmd_CMD_GTK_REKEY_OFFLOAD_CFG,
 				HostCmd_ACT_GEN_SET, 0, data, true);
 }
@@ -4190,6 +4193,16 @@ static const struct wiphy_wowlan_support mwifiex_wowlan_support = {
 	.max_pkt_offset = MWIFIEX_MAX_OFFSET_LEN,
 	.max_nd_match_sets = MWIFIEX_MAX_ND_MATCH_SETS,
 };
+
+static const struct wiphy_wowlan_support mwifiex_wowlan_support_no_gtk = {
+	.flags = WIPHY_WOWLAN_MAGIC_PKT | WIPHY_WOWLAN_DISCONNECT |
+		 WIPHY_WOWLAN_NET_DETECT,
+	.n_patterns = MWIFIEX_MEF_MAX_FILTERS,
+	.pattern_min_len = 1,
+	.pattern_max_len = MWIFIEX_MAX_PATTERN_LEN,
+	.max_pkt_offset = MWIFIEX_MAX_OFFSET_LEN,
+	.max_nd_match_sets = MWIFIEX_MAX_ND_MATCH_SETS,
+};
 #endif
 
 static bool mwifiex_is_valid_alpha2(const char *alpha2)
@@ -4308,7 +4321,10 @@ int mwifiex_register_cfg80211(struct mwifiex_adapter *adapter)
 				WIPHY_FLAG_TDLS_EXTERNAL_SETUP;
 
 #ifdef CONFIG_PM
-	wiphy->wowlan = &mwifiex_wowlan_support;
+	if (ISSUPP_FIRMWARE_SUPPLICANT(priv->adapter->fw_cap_info))
+		wiphy->wowlan = &mwifiex_wowlan_support;
+	else
+		wiphy->wowlan = &mwifiex_wowlan_support_no_gtk;
 #endif
 
 	wiphy->coalesce = &mwifiex_coalesce_support;
diff --git a/drivers/net/wireless/marvell/mwifiex/fw.h b/drivers/net/wireless/marvell/mwifiex/fw.h
index c5dc518f768b..b73f99dc5a72 100644
--- a/drivers/net/wireless/marvell/mwifiex/fw.h
+++ b/drivers/net/wireless/marvell/mwifiex/fw.h
@@ -249,6 +249,7 @@ enum MWIFIEX_802_11_PRIVACY_FILTER {
 #define ISSUPP_SDIO_SPA_ENABLED(FwCapInfo) (FwCapInfo & BIT(16))
 #define ISSUPP_ADHOC_ENABLED(FwCapInfo) (FwCapInfo & BIT(25))
 #define ISSUPP_RANDOM_MAC(FwCapInfo) (FwCapInfo & BIT(27))
+#define ISSUPP_FIRMWARE_SUPPLICANT(FwCapInfo) (FwCapInfo & BIT(21))
 
 #define MWIFIEX_DEF_HT_CAP	(IEEE80211_HT_CAP_DSSSCCK40 | \
 				 (1 << IEEE80211_HT_CAP_RX_STBC_SHIFT) | \
diff --git a/drivers/net/wireless/marvell/mwifiex/sta_event.c b/drivers/net/wireless/marvell/mwifiex/sta_event.c
index 93dfb76cd8a6..03a6492662ca 100644
--- a/drivers/net/wireless/marvell/mwifiex/sta_event.c
+++ b/drivers/net/wireless/marvell/mwifiex/sta_event.c
@@ -241,6 +241,9 @@ void mwifiex_reset_connect_state(struct mwifiex_private *priv, u16 reason_code,
 	if (netif_carrier_ok(priv->netdev))
 		netif_carrier_off(priv->netdev);
 
+	if (!ISSUPP_FIRMWARE_SUPPLICANT(priv->adapter->fw_cap_info))
+		return;
+
 	mwifiex_send_cmd(priv, HostCmd_CMD_GTK_REKEY_OFFLOAD_CFG,
 			 HostCmd_ACT_GEN_REMOVE, 0, NULL, false);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 666cc438f36918fb4ed5cab37f0cfaf14586b85e Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Fri, 25 May 2018 16:38:54 -0500
Subject: mwifiex: mark expected switch fall-throughs

In preparation to enabling -Wimplicit-fallthrough, mark switch cases
where we are expecting to fall through.

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/marvell/mwifiex/cfg80211.c | 4 ++++
 drivers/net/wireless/marvell/mwifiex/scan.c     | 1 +
 2 files changed, 5 insertions(+)

diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
index d04a59903e82..4cc58d79d837 100644
--- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c
+++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
@@ -1158,6 +1158,7 @@ mwifiex_cfg80211_change_virtual_intf(struct wiphy *wiphy,
 		case NL80211_IFTYPE_UNSPECIFIED:
 			mwifiex_dbg(priv->adapter, INFO,
 				    "%s: kept type as IBSS\n", dev->name);
+			/* fall through */
 		case NL80211_IFTYPE_ADHOC:	/* This shouldn't happen */
 			return 0;
 		default:
@@ -1188,6 +1189,7 @@ mwifiex_cfg80211_change_virtual_intf(struct wiphy *wiphy,
 		case NL80211_IFTYPE_UNSPECIFIED:
 			mwifiex_dbg(priv->adapter, INFO,
 				    "%s: kept type as STA\n", dev->name);
+			/* fall through */
 		case NL80211_IFTYPE_STATION:	/* This shouldn't happen */
 			return 0;
 		default:
@@ -1210,6 +1212,7 @@ mwifiex_cfg80211_change_virtual_intf(struct wiphy *wiphy,
 		case NL80211_IFTYPE_UNSPECIFIED:
 			mwifiex_dbg(priv->adapter, INFO,
 				    "%s: kept type as AP\n", dev->name);
+			/* fall through */
 		case NL80211_IFTYPE_AP:		/* This shouldn't happen */
 			return 0;
 		default:
@@ -1249,6 +1252,7 @@ mwifiex_cfg80211_change_virtual_intf(struct wiphy *wiphy,
 		case NL80211_IFTYPE_UNSPECIFIED:
 			mwifiex_dbg(priv->adapter, INFO,
 				    "%s: kept type as P2P\n", dev->name);
+			/* fall through */
 		case NL80211_IFTYPE_P2P_CLIENT:
 		case NL80211_IFTYPE_P2P_GO:
 			return 0;
diff --git a/drivers/net/wireless/marvell/mwifiex/scan.c b/drivers/net/wireless/marvell/mwifiex/scan.c
index d7ce7f75ae38..19df92b6668d 100644
--- a/drivers/net/wireless/marvell/mwifiex/scan.c
+++ b/drivers/net/wireless/marvell/mwifiex/scan.c
@@ -1308,6 +1308,7 @@ int mwifiex_update_bss_desc_with_ie(struct mwifiex_adapter *adapter,
 
 		case WLAN_EID_CHANNEL_SWITCH:
 			bss_entry->chan_sw_ie_present = true;
+			/* fall through */
 		case WLAN_EID_PWR_CAPABILITY:
 		case WLAN_EID_TPC_REPORT:
 		case WLAN_EID_QUIET:
-- 
cgit v1.2.3-59-g8ed1b


From 788f4e4cf063173c25e3f3ad81e7cee923686ab1 Mon Sep 17 00:00:00 2001
From: Xinming Hu <huxm@marvell.com>
Date: Tue, 29 May 2018 09:42:08 +0800
Subject: mwifiex: increase log level for internal scan fail result

Signed-off-by: Xinming Hu <huxm@marvell.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/marvell/mwifiex/cfg80211.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
index 4cc58d79d837..a67e2d66ac9d 100644
--- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c
+++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
@@ -2262,7 +2262,7 @@ done:
 
 		if (!bss) {
 			if (is_scanning_required) {
-				mwifiex_dbg(priv->adapter, WARN,
+				mwifiex_dbg(priv->adapter, MSG,
 					    "assoc: requested bss not found in scan results\n");
 				break;
 			}
-- 
cgit v1.2.3-59-g8ed1b


From db69f4e05bd5c83ea66b156fddd7cfc562092bbc Mon Sep 17 00:00:00 2001
From: Xinming Hu <huxm@marvell.com>
Date: Tue, 29 May 2018 09:42:09 +0800
Subject: mwifiex: reserve passive scan time for radar channel

Active scan is not allowed on radar channel, instead
using passvie scan with more time.

Signed-off-by: Xinming Hu <huxm@marvell.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/marvell/mwifiex/scan.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wireless/marvell/mwifiex/scan.c b/drivers/net/wireless/marvell/mwifiex/scan.c
index 19df92b6668d..895b806cdb03 100644
--- a/drivers/net/wireless/marvell/mwifiex/scan.c
+++ b/drivers/net/wireless/marvell/mwifiex/scan.c
@@ -482,7 +482,8 @@ mwifiex_scan_create_channel_list(struct mwifiex_private *priv,
 				scan_chan_list[chan_idx].max_scan_time =
 					cpu_to_le16((u16) user_scan_in->
 					chan_list[0].scan_time);
-			else if (ch->flags & IEEE80211_CHAN_NO_IR)
+			else if ((ch->flags & IEEE80211_CHAN_NO_IR) ||
+				 (ch->flags & IEEE80211_CHAN_RADAR))
 				scan_chan_list[chan_idx].max_scan_time =
 					cpu_to_le16(adapter->passive_scan_time);
 			else
@@ -502,10 +503,12 @@ mwifiex_scan_create_channel_list(struct mwifiex_private *priv,
 			scan_chan_list[chan_idx].chan_scan_mode_bitmap
 					|= MWIFIEX_DISABLE_CHAN_FILT;
 
-			if (filtered_scan) {
+			if (filtered_scan &&
+			    !((ch->flags & IEEE80211_CHAN_NO_IR) ||
+			      (ch->flags & IEEE80211_CHAN_RADAR)))
 				scan_chan_list[chan_idx].max_scan_time =
 				cpu_to_le16(adapter->specific_scan_time);
-			}
+
 			chan_idx++;
 		}
 
-- 
cgit v1.2.3-59-g8ed1b


From 06895b1627fe36f0bea49d4c9fd0e27f0f7498b1 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Thu, 24 May 2018 13:54:50 -0500
Subject: rtlwifi: remove duplicate code

Remove and refactor some code in order to avoid having identical code
for different branches.

Notice that the logic has been there since 2014.

Addresses-Coverity-ID: 1426199 ("Identical code for different branches")
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 .../realtek/rtlwifi/btcoexist/halbtc8723b2ant.c    | 23 ++++------------------
 1 file changed, 4 insertions(+), 19 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8723b2ant.c b/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8723b2ant.c
index 279fe01bb55e..df3facc8e5a4 100644
--- a/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8723b2ant.c
+++ b/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtc8723b2ant.c
@@ -2876,25 +2876,10 @@ static void btc8723b2ant_action_hid(struct btc_coexist *btcoexist)
 		btc8723b2ant_ps_tdma(btcoexist, NORMAL_EXEC, true, 13);
 
 	/* sw mechanism */
-	if (BTC_WIFI_BW_HT40 == wifi_bw) {
-		if ((wifi_rssi_state == BTC_RSSI_STATE_HIGH) ||
-		    (wifi_rssi_state == BTC_RSSI_STATE_STAY_HIGH)) {
-			btc8723b2ant_sw_mechanism(btcoexist, true, true,
-						  false, false);
-		} else {
-			btc8723b2ant_sw_mechanism(btcoexist, true, true,
-						  false, false);
-		}
-	} else {
-		if ((wifi_rssi_state == BTC_RSSI_STATE_HIGH) ||
-		    (wifi_rssi_state == BTC_RSSI_STATE_STAY_HIGH)) {
-			btc8723b2ant_sw_mechanism(btcoexist, false, true,
-						  false, false);
-		} else {
-			btc8723b2ant_sw_mechanism(btcoexist, false, true,
-						  false, false);
-		}
-	}
+	if (wifi_bw == BTC_WIFI_BW_HT40)
+		btc8723b2ant_sw_mechanism(btcoexist, true, true, false, false);
+	else
+		btc8723b2ant_sw_mechanism(btcoexist, false, true, false, false);
 }
 
 /* A2DP only / PAN(EDR) only/ A2DP+PAN(HS) */
-- 
cgit v1.2.3-59-g8ed1b


From d71dbdaa2d94d425fff4a59e10eb5be9f61a2661 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 29 May 2018 11:55:06 +0200
Subject: bpfilter: fix building without CONFIG_INET

bpfilter_process_sockopt is a callback that gets called from
ip_setsockopt() and ip_getsockopt(). However, when CONFIG_INET is
disabled, it never gets called at all, and assigning a function to the
callback pointer results in a link failure:

net/bpfilter/bpfilter_kern.o: In function `__stop_umh':
bpfilter_kern.c:(.text.unlikely+0x3): undefined reference to `bpfilter_process_sockopt'
net/bpfilter/bpfilter_kern.o: In function `load_umh':
bpfilter_kern.c:(.init.text+0x73): undefined reference to `bpfilter_process_sockopt'

Since there is no caller in this configuration, I assume we can
simply make the assignment conditional.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bpfilter/bpfilter_kern.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
index 7596314b61c7..b13d058f8c34 100644
--- a/net/bpfilter/bpfilter_kern.c
+++ b/net/bpfilter/bpfilter_kern.c
@@ -33,7 +33,8 @@ static void shutdown_umh(struct umh_info *info)
 
 static void __stop_umh(void)
 {
-	if (bpfilter_process_sockopt) {
+	if (IS_ENABLED(CONFIG_INET) &&
+	    bpfilter_process_sockopt) {
 		bpfilter_process_sockopt = NULL;
 		shutdown_umh(&info);
 	}
@@ -98,7 +99,9 @@ static int __init load_umh(void)
 		stop_umh();
 		return -EFAULT;
 	}
-	bpfilter_process_sockopt = &__bpfilter_process_sockopt;
+	if (IS_ENABLED(CONFIG_INET))
+		bpfilter_process_sockopt = &__bpfilter_process_sockopt;
+
 	return 0;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 0513c083d1f460c30eae67867c0add557f7ca1b9 Mon Sep 17 00:00:00 2001
From: Golan Ben Ami <golan.ben.ami@intel.com>
Date: Mon, 28 May 2018 12:15:44 +0300
Subject: iwlwifi: add csr configuration for 6300 devices

Recently we have switched the csr addresses and values configuration
from a single configuration to all devices to a per-device configuration.
Doing that, the configuration for 6300 devices wasn't set.
This missing definition introduced a kernel panic once trying to access
the csr's.

Add the missing 6300 csr configuration.

While at it, add a checker that the csr values were indeed
configured, and bail out more gracefully if not.

Fixes: a8cbb46f831d ("iwlwifi: allow different csr flags for different device families")
Signed-off-by: Golan Ben Ami <golan.ben.ami@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/cfg/6000.c | 1 +
 drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/6000.c b/drivers/net/wireless/intel/iwlwifi/cfg/6000.c
index 51cec0bb75fc..dbcec7ce7863 100644
--- a/drivers/net/wireless/intel/iwlwifi/cfg/6000.c
+++ b/drivers/net/wireless/intel/iwlwifi/cfg/6000.c
@@ -373,6 +373,7 @@ const struct iwl_cfg iwl6000_3agn_cfg = {
 	.eeprom_params = &iwl6000_eeprom_params,
 	.ht_params = &iwl6000_ht_params,
 	.led_mode = IWL_LED_BLINK,
+	.csr = &iwl_csr_v1,
 };
 
 MODULE_FIRMWARE(IWL6000_MODULE_FIRMWARE(IWL6000_UCODE_API_MAX));
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c
index 959de2f8bb28..38234bda9017 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c
@@ -836,6 +836,9 @@ static int iwl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	struct iwl_trans *iwl_trans;
 	int ret;
 
+	if (WARN_ONCE(!cfg->csr, "CSR addresses aren't configured\n"))
+		return -EINVAL;
+
 	iwl_trans = iwl_trans_pcie_alloc(pdev, ent, cfg);
 	if (IS_ERR(iwl_trans))
 		return PTR_ERR(iwl_trans);
-- 
cgit v1.2.3-59-g8ed1b


From 0cbc06b3faba756113d4ac748b089529f813eda4 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 25 May 2018 00:25:48 +0200
Subject: netfilter: nf_tables: remove synchronize_rcu in commit phase

synchronize_rcu() is expensive.

The commit phase currently enforces an unconditional
synchronize_rcu() after incrementing the generation counter.

This is to make sure that a packet always sees a consistent chain, either
nft_do_chain is still using old generation (it will skip the newly added
rules), or the new one (it will skip old ones that might still be linked
into the list).

We could just remove the synchronize_rcu(), it would not cause a crash but
it could cause us to evaluate a rule that was removed and new rule for the
same packet, instead of either-or.

To resolve this, add rule pointer array holding two generations, the
current one and the future generation.

In commit phase, allocate the rule blob and populate it with the rules that
will be active in the new generation.

Then, make this rule blob public, replacing the old generation pointer.

Then the generation counter can be incremented.

nft_do_chain() will either continue to use the current generation
(in case loop was invoked right before increment), or the new one.

Suggested-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |   5 +
 net/netfilter/nf_tables_api.c     | 204 ++++++++++++++++++++++++++++++++++++--
 net/netfilter/nf_tables_core.c    |  24 +++--
 3 files changed, 215 insertions(+), 18 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 603b51401deb..6b9ddb99f3a8 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -858,6 +858,8 @@ enum nft_chain_flags {
  *	@name: name of the chain
  */
 struct nft_chain {
+	struct nft_rule			*__rcu *rules_gen_0;
+	struct nft_rule			*__rcu *rules_gen_1;
 	struct list_head		rules;
 	struct list_head		list;
 	struct nft_table		*table;
@@ -867,6 +869,9 @@ struct nft_chain {
 	u8				flags:6,
 					genmask:2;
 	char				*name;
+
+	/* Only used during control plane commit phase: */
+	struct nft_rule			**rules_next;
 };
 
 enum nft_chain_types {
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 87b2a77add65..583673743648 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1237,12 +1237,29 @@ static void nft_chain_stats_replace(struct nft_base_chain *chain,
 		rcu_assign_pointer(chain->stats, newstats);
 }
 
+static void nf_tables_chain_free_chain_rules(struct nft_chain *chain)
+{
+	struct nft_rule **g0 = rcu_dereference_raw(chain->rules_gen_0);
+	struct nft_rule **g1 = rcu_dereference_raw(chain->rules_gen_1);
+
+	if (g0 != g1)
+		kvfree(g1);
+	kvfree(g0);
+
+	/* should be NULL either via abort or via successful commit */
+	WARN_ON_ONCE(chain->rules_next);
+	kvfree(chain->rules_next);
+}
+
 static void nf_tables_chain_destroy(struct nft_ctx *ctx)
 {
 	struct nft_chain *chain = ctx->chain;
 
 	BUG_ON(chain->use > 0);
 
+	/* no concurrent access possible anymore */
+	nf_tables_chain_free_chain_rules(chain);
+
 	if (nft_is_base_chain(chain)) {
 		struct nft_base_chain *basechain = nft_base_chain(chain);
 
@@ -1335,6 +1352,27 @@ static void nft_chain_release_hook(struct nft_chain_hook *hook)
 	module_put(hook->type->owner);
 }
 
+struct nft_rules_old {
+	struct rcu_head h;
+	struct nft_rule **start;
+};
+
+static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *chain,
+						     unsigned int alloc)
+{
+	if (alloc > INT_MAX)
+		return NULL;
+
+	alloc += 1;	/* NULL, ends rules */
+	if (sizeof(struct nft_rule *) > INT_MAX / alloc)
+		return NULL;
+
+	alloc *= sizeof(struct nft_rule *);
+	alloc += sizeof(struct nft_rules_old);
+
+	return kvmalloc(alloc, GFP_KERNEL);
+}
+
 static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 			      u8 policy, bool create)
 {
@@ -1344,6 +1382,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 	struct nft_stats __percpu *stats;
 	struct net *net = ctx->net;
 	struct nft_chain *chain;
+	struct nft_rule **rules;
 	int err;
 
 	if (table->use == UINT_MAX)
@@ -1406,6 +1445,16 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 		goto err1;
 	}
 
+	rules = nf_tables_chain_alloc_rules(chain, 0);
+	if (!rules) {
+		err = -ENOMEM;
+		goto err1;
+	}
+
+	*rules = NULL;
+	rcu_assign_pointer(chain->rules_gen_0, rules);
+	rcu_assign_pointer(chain->rules_gen_1, rules);
+
 	err = nf_tables_register_hook(net, table, chain);
 	if (err < 0)
 		goto err1;
@@ -5850,21 +5899,162 @@ static void nf_tables_commit_release(struct net *net)
 	}
 }
 
+static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *chain)
+{
+	struct nft_rule *rule;
+	unsigned int alloc = 0;
+	int i;
+
+	/* already handled or inactive chain? */
+	if (chain->rules_next || !nft_is_active_next(net, chain))
+		return 0;
+
+	rule = list_entry(&chain->rules, struct nft_rule, list);
+	i = 0;
+
+	list_for_each_entry_continue(rule, &chain->rules, list) {
+		if (nft_is_active_next(net, rule))
+			alloc++;
+	}
+
+	chain->rules_next = nf_tables_chain_alloc_rules(chain, alloc);
+	if (!chain->rules_next)
+		return -ENOMEM;
+
+	list_for_each_entry_continue(rule, &chain->rules, list) {
+		if (nft_is_active_next(net, rule))
+			chain->rules_next[i++] = rule;
+	}
+
+	chain->rules_next[i] = NULL;
+	return 0;
+}
+
+static void nf_tables_commit_chain_prepare_cancel(struct net *net)
+{
+	struct nft_trans *trans, *next;
+
+	list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+		struct nft_chain *chain = trans->ctx.chain;
+
+		if (trans->msg_type == NFT_MSG_NEWRULE ||
+		    trans->msg_type == NFT_MSG_DELRULE) {
+			kvfree(chain->rules_next);
+			chain->rules_next = NULL;
+		}
+	}
+}
+
+static void __nf_tables_commit_chain_free_rules_old(struct rcu_head *h)
+{
+	struct nft_rules_old *o = container_of(h, struct nft_rules_old, h);
+
+	kvfree(o->start);
+}
+
+static void nf_tables_commit_chain_free_rules_old(struct nft_rule **rules)
+{
+	struct nft_rule **r = rules;
+	struct nft_rules_old *old;
+
+	while (*r)
+		r++;
+
+	r++;	/* rcu_head is after end marker */
+	old = (void *) r;
+	old->start = rules;
+
+	call_rcu(&old->h, __nf_tables_commit_chain_free_rules_old);
+}
+
+static void nf_tables_commit_chain_active(struct net *net, struct nft_chain *chain)
+{
+	struct nft_rule **g0, **g1;
+	bool next_genbit;
+
+	next_genbit = nft_gencursor_next(net);
+
+	g0 = rcu_dereference_protected(chain->rules_gen_0,
+				       lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+	g1 = rcu_dereference_protected(chain->rules_gen_1,
+				       lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+
+	/* No changes to this chain? */
+	if (chain->rules_next == NULL) {
+		/* chain had no change in last or next generation */
+		if (g0 == g1)
+			return;
+		/*
+		 * chain had no change in this generation; make sure next
+		 * one uses same rules as current generation.
+		 */
+		if (next_genbit) {
+			rcu_assign_pointer(chain->rules_gen_1, g0);
+			nf_tables_commit_chain_free_rules_old(g1);
+		} else {
+			rcu_assign_pointer(chain->rules_gen_0, g1);
+			nf_tables_commit_chain_free_rules_old(g0);
+		}
+
+		return;
+	}
+
+	if (next_genbit)
+		rcu_assign_pointer(chain->rules_gen_1, chain->rules_next);
+	else
+		rcu_assign_pointer(chain->rules_gen_0, chain->rules_next);
+
+	chain->rules_next = NULL;
+
+	if (g0 == g1)
+		return;
+
+	if (next_genbit)
+		nf_tables_commit_chain_free_rules_old(g1);
+	else
+		nf_tables_commit_chain_free_rules_old(g0);
+}
+
 static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 {
 	struct nft_trans *trans, *next;
 	struct nft_trans_elem *te;
+	struct nft_chain *chain;
+	struct nft_table *table;
 
-	/* Bump generation counter, invalidate any dump in progress */
-	while (++net->nft.base_seq == 0);
+	/* 1.  Allocate space for next generation rules_gen_X[] */
+	list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+		int ret;
 
-	/* A new generation has just started */
-	net->nft.gencursor = nft_gencursor_next(net);
+		if (trans->msg_type == NFT_MSG_NEWRULE ||
+		    trans->msg_type == NFT_MSG_DELRULE) {
+			chain = trans->ctx.chain;
+
+			ret = nf_tables_commit_chain_prepare(net, chain);
+			if (ret < 0) {
+				nf_tables_commit_chain_prepare_cancel(net);
+				return ret;
+			}
+		}
+	}
 
-	/* Make sure all packets have left the previous generation before
-	 * purging old rules.
+	/* step 2.  Make rules_gen_X visible to packet path */
+	list_for_each_entry(table, &net->nft.tables, list) {
+		list_for_each_entry(chain, &table->chains, list) {
+			if (!nft_is_active_next(net, chain))
+				continue;
+			nf_tables_commit_chain_active(net, chain);
+		}
+	}
+
+	/*
+	 * Bump generation counter, invalidate any dump in progress.
+	 * Cannot fail after this point.
 	 */
-	synchronize_rcu();
+	while (++net->nft.base_seq == 0);
+
+	/* step 3. Start new generation, rules_gen_X now in use. */
+	net->nft.gencursor = nft_gencursor_next(net);
 
 	list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
 		switch (trans->msg_type) {
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 4f46d2f4e167..0548cd50ec26 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -133,7 +133,7 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain,
 
 struct nft_jumpstack {
 	const struct nft_chain	*chain;
-	const struct nft_rule	*rule;
+	struct nft_rule	*const *rules;
 };
 
 unsigned int
@@ -141,27 +141,29 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv)
 {
 	const struct nft_chain *chain = priv, *basechain = chain;
 	const struct net *net = nft_net(pkt);
+	struct nft_rule *const *rules;
 	const struct nft_rule *rule;
 	const struct nft_expr *expr, *last;
 	struct nft_regs regs;
 	unsigned int stackptr = 0;
 	struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE];
-	unsigned int gencursor = nft_genmask_cur(net);
+	bool genbit = READ_ONCE(net->nft.gencursor);
 	struct nft_traceinfo info;
 
 	info.trace = false;
 	if (static_branch_unlikely(&nft_trace_enabled))
 		nft_trace_init(&info, pkt, &regs.verdict, basechain);
 do_chain:
-	rule = list_entry(&chain->rules, struct nft_rule, list);
+	if (genbit)
+		rules = rcu_dereference(chain->rules_gen_1);
+	else
+		rules = rcu_dereference(chain->rules_gen_0);
+
 next_rule:
+	rule = *rules;
 	regs.verdict.code = NFT_CONTINUE;
-	list_for_each_entry_continue_rcu(rule, &chain->rules, list) {
-
-		/* This rule is not active, skip. */
-		if (unlikely(rule->genmask & gencursor))
-			continue;
-
+	for (; *rules ; rules++) {
+		rule = *rules;
 		nft_rule_for_each_expr(expr, last, rule) {
 			if (expr->ops == &nft_cmp_fast_ops)
 				nft_cmp_fast_eval(expr, &regs);
@@ -199,7 +201,7 @@ next_rule:
 	case NFT_JUMP:
 		BUG_ON(stackptr >= NFT_JUMP_STACK_SIZE);
 		jumpstack[stackptr].chain = chain;
-		jumpstack[stackptr].rule  = rule;
+		jumpstack[stackptr].rules = rules + 1;
 		stackptr++;
 		/* fall through */
 	case NFT_GOTO:
@@ -221,7 +223,7 @@ next_rule:
 	if (stackptr > 0) {
 		stackptr--;
 		chain = jumpstack[stackptr].chain;
-		rule  = jumpstack[stackptr].rule;
+		rules = jumpstack[stackptr].rules;
 		goto next_rule;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 88491c11b0fd66881becc418d9db52462ae41870 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Sat, 26 May 2018 09:48:53 +0000
Subject: netfilter: nat: make symbol nat_hook static

Fixes the following sparse warning:

net/netfilter/nf_nat_core.c:1039:20: warning:
 symbol 'nat_hook' was not declared. Should it be static?

Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_nat_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 821f8d835f7a..b7df32a56e7e 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -1036,7 +1036,7 @@ static struct pernet_operations nat_net_ops = {
 	.size = sizeof(struct nat_net),
 };
 
-struct nf_nat_hook nat_hook = {
+static struct nf_nat_hook nat_hook = {
 	.parse_nat_setup	= nfnetlink_parse_nat_setup,
 #ifdef CONFIG_XFRM
 	.decode_session		= __nf_nat_decode_session,
-- 
cgit v1.2.3-59-g8ed1b


From eb1fb1479b710b0c1675986a41bea1022091c2a3 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sun, 27 May 2018 11:20:48 +0200
Subject: netfilter: nft_compat: use call_rcu for nfnl_compat_get

Just use .call_rcu instead.  We can drop the rcu read lock
after obtaining a reference and re-acquire on return.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_compat.c | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 1d99a1efdafc..8d1ff654e5af 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -611,10 +611,10 @@ nla_put_failure:
 	return -1;
 }
 
-static int nfnl_compat_get(struct net *net, struct sock *nfnl,
-			   struct sk_buff *skb, const struct nlmsghdr *nlh,
-			   const struct nlattr * const tb[],
-			   struct netlink_ext_ack *extack)
+static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl,
+			       struct sk_buff *skb, const struct nlmsghdr *nlh,
+			       const struct nlattr * const tb[],
+			       struct netlink_ext_ack *extack)
 {
 	int ret = 0, target;
 	struct nfgenmsg *nfmsg;
@@ -653,16 +653,21 @@ static int nfnl_compat_get(struct net *net, struct sock *nfnl,
 		return -EINVAL;
 	}
 
+	if (!try_module_get(THIS_MODULE))
+		return -EINVAL;
+
+	rcu_read_unlock();
 	try_then_request_module(xt_find_revision(nfmsg->nfgen_family, name,
 						 rev, target, &ret),
 						 fmt, name);
-
 	if (ret < 0)
-		return ret;
+		goto out_put;
 
 	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (skb2 == NULL)
-		return -ENOMEM;
+	if (skb2 == NULL) {
+		ret = -ENOMEM;
+		goto out_put;
+	}
 
 	/* include the best revision for this extension in the message */
 	if (nfnl_compat_fill_info(skb2, NETLINK_CB(skb).portid,
@@ -672,14 +677,16 @@ static int nfnl_compat_get(struct net *net, struct sock *nfnl,
 				  nfmsg->nfgen_family,
 				  name, ret, target) <= 0) {
 		kfree_skb(skb2);
-		return -ENOSPC;
+		goto out_put;
 	}
 
 	ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
 				MSG_DONTWAIT);
 	if (ret > 0)
 		ret = 0;
-
+out_put:
+	rcu_read_lock();
+	module_put(THIS_MODULE);
 	return ret == -EAGAIN ? -ENOBUFS : ret;
 }
 
@@ -691,7 +698,7 @@ static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = {
 };
 
 static const struct nfnl_callback nfnl_nft_compat_cb[NFNL_MSG_COMPAT_MAX] = {
-	[NFNL_MSG_COMPAT_GET]		= { .call = nfnl_compat_get,
+	[NFNL_MSG_COMPAT_GET]		= { .call_rcu = nfnl_compat_get_rcu,
 					    .attr_count = NFTA_COMPAT_MAX,
 					    .policy = nfnl_compat_policy_get },
 };
-- 
cgit v1.2.3-59-g8ed1b


From d6501de8726afb9d7ed502eafc8bfa629482049d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sun, 27 May 2018 11:21:31 +0200
Subject: netfilter: nf_tables: fix endian mismatch in return type

harmless, but it avoids sparse warnings:

nf_tables_api.c:2813:16: warning: incorrect type in return expression (different base types)
nf_tables_api.c:2863:47: warning: incorrect type in argument 3 (different base types)
nf_tables_api.c:3524:47: warning: incorrect type in argument 3 (different base types)
nf_tables_api.c:3538:55: warning: incorrect type in argument 3 (different base types)

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 583673743648..8f04bfc41bf9 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2830,7 +2830,7 @@ static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result)
 	return 0;
 }
 
-static u64 nf_jiffies64_to_msecs(u64 input)
+static __be64 nf_jiffies64_to_msecs(u64 input)
 {
 	u64 ms = jiffies64_to_nsecs(input);
 
-- 
cgit v1.2.3-59-g8ed1b


From 8a3d4c361224353435fb04b1790f498f500bc7fb Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sun, 27 May 2018 11:24:34 +0200
Subject: netfilter: nf_tables: fail batch if fatal signal is pending

abort batch processing and return so task can exit faster.
Otherwise even SIGKILL has no immediate effect.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 88c9e222b670..5a1bd23af1a3 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -25,6 +25,7 @@
 #include <linux/uaccess.h>
 #include <net/sock.h>
 #include <linux/init.h>
+#include <linux/sched/signal.h>
 
 #include <net/netlink.h>
 #include <linux/netfilter/nfnetlink.h>
@@ -330,6 +331,13 @@ replay:
 	while (skb->len >= nlmsg_total_size(0)) {
 		int msglen, type;
 
+		if (fatal_signal_pending(current)) {
+			nfnl_err_reset(&err_list);
+			err = -EINTR;
+			status = NFNL_BATCH_FAILURE;
+			goto done;
+		}
+
 		memset(&extack, 0, sizeof(extack));
 		nlh = nlmsg_hdr(skb);
 		err = 0;
-- 
cgit v1.2.3-59-g8ed1b


From d9adf22a2918832ac94335fddf1950b12543d0b5 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sun, 27 May 2018 11:31:46 +0200
Subject: netfilter: nf_tables: use call_rcu in netlink dumps

We can make all dumps and lookups lockless.

Dumps currently only hold the nfnl mutex on the dump request itself.
Dumps can span multiple syscalls, dump continuation doesn't acquire the
nfnl mutex anywhere, i.e. the dump callbacks in nf_tables already use
rcu and never rely on nfnl mutex being held.

So, just switch all dumpers to rcu.

This requires taking a module reference before dropping the rcu lock
so rmmod is blocked, we also need to hold module reference over
the entire dump operation sequence. netlink already supports this
via the .module member in the netlink_dump_control struct.

For the non-dump case (i.e. lookup of a specific tables, chains, etc),
we need to swtich to _rcu list iteration primitive and make sure we
use GFP_ATOMIC.

This patch also adds the new nft_netlink_dump_start_rcu() helper that
takes care of the get_ref, drop-rcu-lock,start dump,
get-rcu-lock,put-ref sequence.

The helper will be reused for all dumps.

Rationale in all dump requests is:

 - use the nft_netlink_dump_start_rcu helper added in first patch
 - use GFP_ATOMIC and rcu list iteration
 - switch to .call_rcu

... thus making all dumps in nf_tables not depend on the
nfnl mutex anymore.

In the nf_tables_getgen: This callback just fetches the current base
sequence, there is no need to serialize this with nfnl nft mutex.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 111 +++++++++++++++++++++++++++---------------
 1 file changed, 72 insertions(+), 39 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 8f04bfc41bf9..3b2ad96a9a05 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -373,7 +373,7 @@ static struct nft_table *nft_table_lookup(const struct net *net,
 	if (nla == NULL)
 		return ERR_PTR(-EINVAL);
 
-	list_for_each_entry(table, &net->nft.tables, list) {
+	list_for_each_entry_rcu(table, &net->nft.tables, list) {
 		if (!nla_strcmp(nla, table->name) &&
 		    table->family == family &&
 		    nft_active_genmask(table, genmask))
@@ -546,6 +546,24 @@ done:
 	return skb->len;
 }
 
+static int nft_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb,
+				      const struct nlmsghdr *nlh,
+				      struct netlink_dump_control *c)
+{
+	int err;
+
+	if (!try_module_get(THIS_MODULE))
+		return -EINVAL;
+
+	rcu_read_unlock();
+	err = netlink_dump_start(nlsk, skb, nlh, c);
+	rcu_read_lock();
+	module_put(THIS_MODULE);
+
+	return err;
+}
+
+/* called with rcu_read_lock held */
 static int nf_tables_gettable(struct net *net, struct sock *nlsk,
 			      struct sk_buff *skb, const struct nlmsghdr *nlh,
 			      const struct nlattr * const nla[],
@@ -561,8 +579,10 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk,
 	if (nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
 			.dump = nf_tables_dump_tables,
+			.module = THIS_MODULE,
 		};
-		return netlink_dump_start(nlsk, skb, nlh, &c);
+
+		return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
 	}
 
 	table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask);
@@ -571,7 +591,7 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk,
 		return PTR_ERR(table);
 	}
 
-	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
 	if (!skb2)
 		return -ENOMEM;
 
@@ -933,7 +953,7 @@ static struct nft_chain *nft_chain_lookup(const struct nft_table *table,
 	if (nla == NULL)
 		return ERR_PTR(-EINVAL);
 
-	list_for_each_entry(chain, &table->chains, list) {
+	list_for_each_entry_rcu(chain, &table->chains, list) {
 		if (!nla_strcmp(nla, chain->name) &&
 		    nft_active_genmask(chain, genmask))
 			return chain;
@@ -1135,6 +1155,7 @@ done:
 	return skb->len;
 }
 
+/* called with rcu_read_lock held */
 static int nf_tables_getchain(struct net *net, struct sock *nlsk,
 			      struct sk_buff *skb, const struct nlmsghdr *nlh,
 			      const struct nlattr * const nla[],
@@ -1151,8 +1172,10 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
 	if (nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
 			.dump = nf_tables_dump_chains,
+			.module = THIS_MODULE,
 		};
-		return netlink_dump_start(nlsk, skb, nlh, &c);
+
+		return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
 	}
 
 	table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
@@ -1167,7 +1190,7 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
 		return PTR_ERR(chain);
 	}
 
-	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
 	if (!skb2)
 		return -ENOMEM;
 
@@ -1969,7 +1992,7 @@ static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain,
 	struct nft_rule *rule;
 
 	// FIXME: this sucks
-	list_for_each_entry(rule, &chain->rules, list) {
+	list_for_each_entry_rcu(rule, &chain->rules, list) {
 		if (handle == rule->handle)
 			return rule;
 	}
@@ -2165,6 +2188,7 @@ static int nf_tables_dump_rules_done(struct netlink_callback *cb)
 	return 0;
 }
 
+/* called with rcu_read_lock held */
 static int nf_tables_getrule(struct net *net, struct sock *nlsk,
 			     struct sk_buff *skb, const struct nlmsghdr *nlh,
 			     const struct nlattr * const nla[],
@@ -2183,18 +2207,19 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
 		struct netlink_dump_control c = {
 			.dump = nf_tables_dump_rules,
 			.done = nf_tables_dump_rules_done,
+			.module = THIS_MODULE,
 		};
 
 		if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) {
 			struct nft_rule_dump_ctx *ctx;
 
-			ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+			ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC);
 			if (!ctx)
 				return -ENOMEM;
 
 			if (nla[NFTA_RULE_TABLE]) {
 				ctx->table = nla_strdup(nla[NFTA_RULE_TABLE],
-							GFP_KERNEL);
+							GFP_ATOMIC);
 				if (!ctx->table) {
 					kfree(ctx);
 					return -ENOMEM;
@@ -2202,7 +2227,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
 			}
 			if (nla[NFTA_RULE_CHAIN]) {
 				ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN],
-							GFP_KERNEL);
+							GFP_ATOMIC);
 				if (!ctx->chain) {
 					kfree(ctx->table);
 					kfree(ctx);
@@ -2212,7 +2237,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
 			c.data = ctx;
 		}
 
-		return netlink_dump_start(nlsk, skb, nlh, &c);
+		return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
 	}
 
 	table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
@@ -2233,7 +2258,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
 		return PTR_ERR(rule);
 	}
 
-	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
 	if (!skb2)
 		return -ENOMEM;
 
@@ -2704,7 +2729,7 @@ static struct nft_set *nft_set_lookup(const struct nft_table *table,
 	if (nla == NULL)
 		return ERR_PTR(-EINVAL);
 
-	list_for_each_entry(set, &table->sets, list) {
+	list_for_each_entry_rcu(set, &table->sets, list) {
 		if (!nla_strcmp(nla, set->name) &&
 		    nft_active_genmask(set, genmask))
 			return set;
@@ -3009,6 +3034,7 @@ static int nf_tables_dump_sets_done(struct netlink_callback *cb)
 	return 0;
 }
 
+/* called with rcu_read_lock held */
 static int nf_tables_getset(struct net *net, struct sock *nlsk,
 			    struct sk_buff *skb, const struct nlmsghdr *nlh,
 			    const struct nlattr * const nla[],
@@ -3031,17 +3057,18 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
 		struct netlink_dump_control c = {
 			.dump = nf_tables_dump_sets,
 			.done = nf_tables_dump_sets_done,
+			.module = THIS_MODULE,
 		};
 		struct nft_ctx *ctx_dump;
 
-		ctx_dump = kmalloc(sizeof(*ctx_dump), GFP_KERNEL);
+		ctx_dump = kmalloc(sizeof(*ctx_dump), GFP_ATOMIC);
 		if (ctx_dump == NULL)
 			return -ENOMEM;
 
 		*ctx_dump = ctx;
 		c.data = ctx_dump;
 
-		return netlink_dump_start(nlsk, skb, nlh, &c);
+		return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
 	}
 
 	/* Only accept unspec with dump */
@@ -3054,7 +3081,7 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
 	if (IS_ERR(set))
 		return PTR_ERR(set);
 
-	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
 	if (skb2 == NULL)
 		return -ENOMEM;
 
@@ -3795,7 +3822,7 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	ext = nft_set_elem_ext(set, &elem);
 
 	err = -ENOMEM;
-	skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	skb = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
 	if (skb == NULL)
 		goto err1;
 
@@ -3817,6 +3844,7 @@ err1:
 	return err == -EAGAIN ? -ENOBUFS : err;
 }
 
+/* called with rcu_read_lock held */
 static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
 				struct sk_buff *skb, const struct nlmsghdr *nlh,
 				const struct nlattr * const nla[],
@@ -3841,10 +3869,11 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
 		struct netlink_dump_control c = {
 			.dump = nf_tables_dump_set,
 			.done = nf_tables_dump_set_done,
+			.module = THIS_MODULE,
 		};
 		struct nft_set_dump_ctx *dump_ctx;
 
-		dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_KERNEL);
+		dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_ATOMIC);
 		if (!dump_ctx)
 			return -ENOMEM;
 
@@ -3852,7 +3881,7 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
 		dump_ctx->ctx = ctx;
 
 		c.data = dump_ctx;
-		return netlink_dump_start(nlsk, skb, nlh, &c);
+		return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
 	}
 
 	if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
@@ -4475,7 +4504,7 @@ struct nft_object *nft_obj_lookup(const struct nft_table *table,
 {
 	struct nft_object *obj;
 
-	list_for_each_entry(obj, &table->objects, list) {
+	list_for_each_entry_rcu(obj, &table->objects, list) {
 		if (!nla_strcmp(nla, obj->name) &&
 		    objtype == obj->ops->type->type &&
 		    nft_active_genmask(obj, genmask))
@@ -4805,12 +4834,12 @@ nft_obj_filter_alloc(const struct nlattr * const nla[])
 {
 	struct nft_obj_filter *filter;
 
-	filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+	filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
 	if (!filter)
 		return ERR_PTR(-ENOMEM);
 
 	if (nla[NFTA_OBJ_TABLE]) {
-		filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_KERNEL);
+		filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC);
 		if (!filter->table) {
 			kfree(filter);
 			return ERR_PTR(-ENOMEM);
@@ -4822,6 +4851,7 @@ nft_obj_filter_alloc(const struct nlattr * const nla[])
 	return filter;
 }
 
+/* called with rcu_read_lock held */
 static int nf_tables_getobj(struct net *net, struct sock *nlsk,
 			    struct sk_buff *skb, const struct nlmsghdr *nlh,
 			    const struct nlattr * const nla[],
@@ -4841,6 +4871,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
 		struct netlink_dump_control c = {
 			.dump = nf_tables_dump_obj,
 			.done = nf_tables_dump_obj_done,
+			.module = THIS_MODULE,
 		};
 
 		if (nla[NFTA_OBJ_TABLE] ||
@@ -4853,7 +4884,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
 
 			c.data = filter;
 		}
-		return netlink_dump_start(nlsk, skb, nlh, &c);
+		return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
 	}
 
 	if (!nla[NFTA_OBJ_NAME] ||
@@ -4873,7 +4904,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
 		return PTR_ERR(obj);
 	}
 
-	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
 	if (!skb2)
 		return -ENOMEM;
 
@@ -5018,7 +5049,7 @@ struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table,
 {
 	struct nft_flowtable *flowtable;
 
-	list_for_each_entry(flowtable, &table->flowtables, list) {
+	list_for_each_entry_rcu(flowtable, &table->flowtables, list) {
 		if (!nla_strcmp(nla, flowtable->name) &&
 		    nft_active_genmask(flowtable, genmask))
 			return flowtable;
@@ -5479,13 +5510,13 @@ nft_flowtable_filter_alloc(const struct nlattr * const nla[])
 {
 	struct nft_flowtable_filter *filter;
 
-	filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+	filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
 	if (!filter)
 		return ERR_PTR(-ENOMEM);
 
 	if (nla[NFTA_FLOWTABLE_TABLE]) {
 		filter->table = nla_strdup(nla[NFTA_FLOWTABLE_TABLE],
-					   GFP_KERNEL);
+					   GFP_ATOMIC);
 		if (!filter->table) {
 			kfree(filter);
 			return ERR_PTR(-ENOMEM);
@@ -5494,6 +5525,7 @@ nft_flowtable_filter_alloc(const struct nlattr * const nla[])
 	return filter;
 }
 
+/* called with rcu_read_lock held */
 static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
 				  struct sk_buff *skb,
 				  const struct nlmsghdr *nlh,
@@ -5512,6 +5544,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
 		struct netlink_dump_control c = {
 			.dump = nf_tables_dump_flowtable,
 			.done = nf_tables_dump_flowtable_done,
+			.module = THIS_MODULE,
 		};
 
 		if (nla[NFTA_FLOWTABLE_TABLE]) {
@@ -5523,7 +5556,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
 
 			c.data = filter;
 		}
-		return netlink_dump_start(nlsk, skb, nlh, &c);
+		return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
 	}
 
 	if (!nla[NFTA_FLOWTABLE_NAME])
@@ -5539,7 +5572,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
 	if (IS_ERR(flowtable))
 		return PTR_ERR(flowtable);
 
-	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
 	if (!skb2)
 		return -ENOMEM;
 
@@ -5703,7 +5736,7 @@ static int nf_tables_getgen(struct net *net, struct sock *nlsk,
 	struct sk_buff *skb2;
 	int err;
 
-	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
 	if (skb2 == NULL)
 		return -ENOMEM;
 
@@ -5725,7 +5758,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 		.policy		= nft_table_policy,
 	},
 	[NFT_MSG_GETTABLE] = {
-		.call		= nf_tables_gettable,
+		.call_rcu	= nf_tables_gettable,
 		.attr_count	= NFTA_TABLE_MAX,
 		.policy		= nft_table_policy,
 	},
@@ -5740,7 +5773,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 		.policy		= nft_chain_policy,
 	},
 	[NFT_MSG_GETCHAIN] = {
-		.call		= nf_tables_getchain,
+		.call_rcu	= nf_tables_getchain,
 		.attr_count	= NFTA_CHAIN_MAX,
 		.policy		= nft_chain_policy,
 	},
@@ -5755,7 +5788,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 		.policy		= nft_rule_policy,
 	},
 	[NFT_MSG_GETRULE] = {
-		.call		= nf_tables_getrule,
+		.call_rcu	= nf_tables_getrule,
 		.attr_count	= NFTA_RULE_MAX,
 		.policy		= nft_rule_policy,
 	},
@@ -5770,7 +5803,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 		.policy		= nft_set_policy,
 	},
 	[NFT_MSG_GETSET] = {
-		.call		= nf_tables_getset,
+		.call_rcu	= nf_tables_getset,
 		.attr_count	= NFTA_SET_MAX,
 		.policy		= nft_set_policy,
 	},
@@ -5785,7 +5818,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 		.policy		= nft_set_elem_list_policy,
 	},
 	[NFT_MSG_GETSETELEM] = {
-		.call		= nf_tables_getsetelem,
+		.call_rcu	= nf_tables_getsetelem,
 		.attr_count	= NFTA_SET_ELEM_LIST_MAX,
 		.policy		= nft_set_elem_list_policy,
 	},
@@ -5795,7 +5828,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 		.policy		= nft_set_elem_list_policy,
 	},
 	[NFT_MSG_GETGEN] = {
-		.call		= nf_tables_getgen,
+		.call_rcu	= nf_tables_getgen,
 	},
 	[NFT_MSG_NEWOBJ] = {
 		.call_batch	= nf_tables_newobj,
@@ -5803,7 +5836,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 		.policy		= nft_obj_policy,
 	},
 	[NFT_MSG_GETOBJ] = {
-		.call		= nf_tables_getobj,
+		.call_rcu	= nf_tables_getobj,
 		.attr_count	= NFTA_OBJ_MAX,
 		.policy		= nft_obj_policy,
 	},
@@ -5813,7 +5846,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 		.policy		= nft_obj_policy,
 	},
 	[NFT_MSG_GETOBJ_RESET] = {
-		.call		= nf_tables_getobj,
+		.call_rcu	= nf_tables_getobj,
 		.attr_count	= NFTA_OBJ_MAX,
 		.policy		= nft_obj_policy,
 	},
@@ -5823,7 +5856,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 		.policy		= nft_flowtable_policy,
 	},
 	[NFT_MSG_GETFLOWTABLE] = {
-		.call		= nf_tables_getflowtable,
+		.call_rcu	= nf_tables_getflowtable,
 		.attr_count	= NFTA_FLOWTABLE_MAX,
 		.policy		= nft_flowtable_policy,
 	},
-- 
cgit v1.2.3-59-g8ed1b


From e523452ac0e046d38fcfb7c16748ab8090bf98cd Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Tue, 29 May 2018 01:15:27 +0900
Subject: netfilter: nf_tables: remove unused variables

The comment and trace_loginfo are not used anymore.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_core.c | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 0548cd50ec26..47cf667b15ca 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -23,22 +23,6 @@
 #include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nf_log.h>
 
-static const char *const comments[__NFT_TRACETYPE_MAX] = {
-	[NFT_TRACETYPE_POLICY]	= "policy",
-	[NFT_TRACETYPE_RETURN]	= "return",
-	[NFT_TRACETYPE_RULE]	= "rule",
-};
-
-static const struct nf_loginfo trace_loginfo = {
-	.type = NF_LOG_TYPE_LOG,
-	.u = {
-		.log = {
-			.level = LOGLEVEL_WARNING,
-			.logflags = NF_LOG_DEFAULT_MASK,
-	        },
-	},
-};
-
 static noinline void __nft_trace_packet(struct nft_traceinfo *info,
 					const struct nft_chain *chain,
 					enum nft_trace_types type)
-- 
cgit v1.2.3-59-g8ed1b


From 6fd1cfc0f0a5fec92cce8a5c3fbfd7fd82b82160 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 25 May 2018 21:53:25 -0700
Subject: nfp: return -EOPNOTSUPP from .ndo_get_phys_port_name for VFs

After recent change we started returning 0 from
ndo_get_phys_port_name for VFs.  The name parameter for
ndo_get_phys_port_name is not initialized by the stack so
this can lead to a crash.  We should have kept returning
-EOPNOTSUPP in the first place.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index eea11e881bf5..1f572896d1ee 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3286,11 +3286,12 @@ nfp_net_get_phys_port_name(struct net_device *netdev, char *name, size_t len)
 	if (nn->port)
 		return nfp_port_get_phys_port_name(netdev, name, len);
 
-	if (!nn->dp.is_vf) {
-		n = snprintf(name, len, "%d", nn->id);
-		if (n >= len)
-			return -EINVAL;
-	}
+	if (nn->dp.is_vf)
+		return -EOPNOTSUPP;
+
+	n = snprintf(name, len, "%d", nn->id);
+	if (n >= len)
+		return -EINVAL;
 
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From ca14573272e8cedb947426654992ae4d7c20c985 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 25 May 2018 21:53:26 -0700
Subject: nfp: prefix vNIC phys_port_name with 'n'

Some drivers are using a bare number inside phys_port_name
as VF id and OpenStack's regexps will pick it up.  We can't
use a bare number for your vNICs, prefix the names with 'n'.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 1f572896d1ee..75110c8d6a90 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3289,7 +3289,7 @@ nfp_net_get_phys_port_name(struct net_device *netdev, char *name, size_t len)
 	if (nn->dp.is_vf)
 		return -EOPNOTSUPP;
 
-	n = snprintf(name, len, "%d", nn->id);
+	n = snprintf(name, len, "n%d", nn->id);
 	if (n >= len)
 		return -EINVAL;
 
-- 
cgit v1.2.3-59-g8ed1b


From 055ee0d69887af1d511246d745610bdf9d627e75 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 25 May 2018 21:53:27 -0700
Subject: nfp: abm: enable advanced queuing on demand

ABM NIC FW has a cut-through mode where the PCIe queuing
is bypassed, thus working like our standard NIC FWs.  Use this
mode by default and only enable queuing in switchdev mode where
users can configure it.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/abm/ctrl.c | 13 +++++++++++++
 drivers/net/ethernet/netronome/nfp/abm/main.c | 11 +++++++++++
 drivers/net/ethernet/netronome/nfp/abm/main.h |  2 ++
 drivers/net/ethernet/netronome/nfp/nfp_abi.h  | 14 ++++++++++++++
 4 files changed, 40 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/abm/ctrl.c b/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
index e40f6f06417b..676d3afc9bdd 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
@@ -36,10 +36,23 @@
 
 #include "../nfpcore/nfp_cpp.h"
 #include "../nfp_app.h"
+#include "../nfp_abi.h"
 #include "../nfp_main.h"
 #include "../nfp_net.h"
 #include "main.h"
 
+int nfp_abm_ctrl_qm_enable(struct nfp_abm *abm)
+{
+	return nfp_mbox_cmd(abm->app->pf, NFP_MBOX_PCIE_ABM_ENABLE,
+			    NULL, 0, NULL, 0);
+}
+
+int nfp_abm_ctrl_qm_disable(struct nfp_abm *abm)
+{
+	return nfp_mbox_cmd(abm->app->pf, NFP_MBOX_PCIE_ABM_DISABLE,
+			    NULL, 0, NULL, 0);
+}
+
 void nfp_abm_ctrl_read_params(struct nfp_abm_link *alink)
 {
 	alink->queue_base = nn_readl(alink->vnic, NFP_NET_CFG_START_RXQ);
diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.c b/drivers/net/ethernet/netronome/nfp/abm/main.c
index 5a12bb20bced..28a18ac62040 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.c
@@ -182,6 +182,7 @@ static enum devlink_eswitch_mode nfp_abm_eswitch_mode_get(struct nfp_app *app)
 static int nfp_abm_eswitch_set_legacy(struct nfp_abm *abm)
 {
 	nfp_abm_kill_reprs_all(abm);
+	nfp_abm_ctrl_qm_disable(abm);
 
 	abm->eswitch_mode = DEVLINK_ESWITCH_MODE_LEGACY;
 	return 0;
@@ -200,6 +201,10 @@ static int nfp_abm_eswitch_set_switchdev(struct nfp_abm *abm)
 	struct nfp_net *nn;
 	int err;
 
+	err = nfp_abm_ctrl_qm_enable(abm);
+	if (err)
+		return err;
+
 	list_for_each_entry(nn, &pf->vnics, vnic_list) {
 		struct nfp_abm_link *alink = nn->app_priv;
 
@@ -217,6 +222,7 @@ static int nfp_abm_eswitch_set_switchdev(struct nfp_abm *abm)
 
 err_kill_all_reprs:
 	nfp_abm_kill_reprs_all(abm);
+	nfp_abm_ctrl_qm_disable(abm);
 	return err;
 }
 
@@ -350,6 +356,11 @@ static int nfp_abm_init(struct nfp_app *app)
 	if (err)
 		goto err_free_abm;
 
+	/* We start in legacy mode, make sure advanced queuing is disabled */
+	err = nfp_abm_ctrl_qm_disable(abm);
+	if (err)
+		goto err_free_abm;
+
 	err = -ENOMEM;
 	reprs = nfp_reprs_alloc(pf->max_data_vnics);
 	if (!reprs)
diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.h b/drivers/net/ethernet/netronome/nfp/abm/main.h
index 5938b69b8a84..7d129b205535 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.h
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.h
@@ -72,4 +72,6 @@ struct nfp_abm_link {
 
 void nfp_abm_ctrl_read_params(struct nfp_abm_link *alink);
 int nfp_abm_ctrl_find_addrs(struct nfp_abm *abm);
+int nfp_abm_ctrl_qm_enable(struct nfp_abm *abm);
+int nfp_abm_ctrl_qm_disable(struct nfp_abm *abm);
 #endif
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_abi.h b/drivers/net/ethernet/netronome/nfp/nfp_abi.h
index 7ffa6e6a9d1c..8b56c27931bf 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_abi.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_abi.h
@@ -59,12 +59,26 @@
  * @NFP_MBOX_POOL_SET:	set shared buffer pool info/config
  * Input  - struct nfp_shared_buf_pool_info_set
  * Output - None
+ *
+ * @NFP_MBOX_PCIE_ABM_ENABLE:	enable PCIe-side advanced buffer management
+ * Enable advanced buffer management of the PCIe block.  If ABM is disabled
+ * PCIe block maintains a very short queue of buffers and does tail drop.
+ * ABM allows more advanced buffering and priority control.
+ * Input  - None
+ * Output - None
+ *
+ * @NFP_MBOX_PCIE_ABM_DISABLE:	disable PCIe-side advanced buffer management
+ * Input  - None
+ * Output - None
  */
 enum nfp_mbox_cmd {
 	NFP_MBOX_NO_CMD			= 0x00,
 
 	NFP_MBOX_POOL_GET		= 0x01,
 	NFP_MBOX_POOL_SET		= 0x02,
+
+	NFP_MBOX_PCIE_ABM_ENABLE	= 0x03,
+	NFP_MBOX_PCIE_ABM_DISABLE	= 0x04,
 };
 
 #define NFP_SHARED_BUF_COUNT_SYM_NAME	"_abi_nfd_pf%u_sb_cnt"
-- 
cgit v1.2.3-59-g8ed1b


From 25e0036fcd241fcb4541522e511168871d7c8bed Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 25 May 2018 21:53:28 -0700
Subject: nfp: abm: add helpers for configuring queue marking levels

Queue levels for simple ECN marking are stored in _abi_nfd_out_q_lvls_X
symbol, where X is the PCIe PF id.  Find out the location of that symbol
and add helpers for modifying it.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/abm/ctrl.c      | 80 ++++++++++++++++++++++
 drivers/net/ethernet/netronome/nfp/abm/main.h      |  3 +
 .../net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h   |  5 ++
 3 files changed, 88 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/abm/ctrl.c b/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
index 676d3afc9bdd..978884a0be19 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
@@ -35,12 +35,57 @@
 #include <linux/kernel.h>
 
 #include "../nfpcore/nfp_cpp.h"
+#include "../nfpcore/nfp_nffw.h"
 #include "../nfp_app.h"
 #include "../nfp_abi.h"
 #include "../nfp_main.h"
 #include "../nfp_net.h"
 #include "main.h"
 
+#define NFP_QLVL_SYM_NAME	"_abi_nfd_out_q_lvls_%u"
+#define NFP_QLVL_STRIDE		16
+#define NFP_QLVL_THRS		8
+
+static unsigned long long
+nfp_abm_q_lvl_thrs(struct nfp_abm_link *alink, unsigned int queue)
+{
+	return alink->abm->q_lvls->addr +
+		(alink->queue_base + queue) * NFP_QLVL_STRIDE + NFP_QLVL_THRS;
+}
+
+static int
+nfp_abm_ctrl_set_q_lvl(struct nfp_abm_link *alink, unsigned int i, u32 val)
+{
+	struct nfp_cpp *cpp = alink->abm->app->cpp;
+	u32 muw;
+	int err;
+
+	muw = NFP_CPP_ATOMIC_WR(alink->abm->q_lvls->target,
+				alink->abm->q_lvls->domain);
+
+	err = nfp_cpp_writel(cpp, muw, nfp_abm_q_lvl_thrs(alink, i), val);
+	if (err) {
+		nfp_err(cpp, "RED offload setting level failed on vNIC %d queue %d\n",
+			alink->id, i);
+		return err;
+	}
+
+	return 0;
+}
+
+int nfp_abm_ctrl_set_all_q_lvls(struct nfp_abm_link *alink, u32 val)
+{
+	int i, err;
+
+	for (i = 0; i < alink->vnic->max_rx_rings; i++) {
+		err = nfp_abm_ctrl_set_q_lvl(alink, i, val);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 int nfp_abm_ctrl_qm_enable(struct nfp_abm *abm)
 {
 	return nfp_mbox_cmd(abm->app->pf, NFP_MBOX_PCIE_ABM_ENABLE,
@@ -59,13 +104,48 @@ void nfp_abm_ctrl_read_params(struct nfp_abm_link *alink)
 	alink->queue_base /= alink->vnic->stride_rx;
 }
 
+static const struct nfp_rtsym *
+nfp_abm_ctrl_find_rtsym(struct nfp_pf *pf, const char *name, unsigned int size)
+{
+	const struct nfp_rtsym *sym;
+
+	sym = nfp_rtsym_lookup(pf->rtbl, name);
+	if (!sym) {
+		nfp_err(pf->cpp, "Symbol '%s' not found\n", name);
+		return ERR_PTR(-ENOENT);
+	}
+	if (sym->size != size) {
+		nfp_err(pf->cpp,
+			"Symbol '%s' wrong size: expected %u got %llu\n",
+			name, size, sym->size);
+		return ERR_PTR(-EINVAL);
+	}
+
+	return sym;
+}
+
+static const struct nfp_rtsym *
+nfp_abm_ctrl_find_q_rtsym(struct nfp_pf *pf, const char *name,
+			  unsigned int size)
+{
+	return nfp_abm_ctrl_find_rtsym(pf, name, size * NFP_NET_MAX_RX_RINGS);
+}
+
 int nfp_abm_ctrl_find_addrs(struct nfp_abm *abm)
 {
 	struct nfp_pf *pf = abm->app->pf;
+	const struct nfp_rtsym *sym;
 	unsigned int pf_id;
+	char pf_symbol[64];
 
 	pf_id =	nfp_cppcore_pcie_unit(pf->cpp);
 	abm->pf_id = pf_id;
 
+	snprintf(pf_symbol, sizeof(pf_symbol), NFP_QLVL_SYM_NAME, pf_id);
+	sym = nfp_abm_ctrl_find_q_rtsym(pf, pf_symbol, NFP_QLVL_STRIDE);
+	if (IS_ERR(sym))
+		return PTR_ERR(sym);
+	abm->q_lvls = sym;
+
 	return 0;
 }
diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.h b/drivers/net/ethernet/netronome/nfp/abm/main.h
index 7d129b205535..1ac651cdc140 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.h
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.h
@@ -49,11 +49,13 @@ struct nfp_net;
  * @pf_id:	ID of our PF link
  * @eswitch_mode:	devlink eswitch mode, advanced functions only visible
  *			in switchdev mode
+ * @q_lvls:	queue level control area
  */
 struct nfp_abm {
 	struct nfp_app *app;
 	unsigned int pf_id;
 	enum devlink_eswitch_mode eswitch_mode;
+	const struct nfp_rtsym *q_lvls;
 };
 
 /**
@@ -72,6 +74,7 @@ struct nfp_abm_link {
 
 void nfp_abm_ctrl_read_params(struct nfp_abm_link *alink);
 int nfp_abm_ctrl_find_addrs(struct nfp_abm *abm);
+int nfp_abm_ctrl_set_all_q_lvls(struct nfp_abm_link *alink, u32 val);
 int nfp_abm_ctrl_qm_enable(struct nfp_abm *abm);
 int nfp_abm_ctrl_qm_disable(struct nfp_abm *abm);
 #endif
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h
index 4e19add1c539..b0da3d436850 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h
@@ -87,6 +87,11 @@ struct resource;
 
 #define NFP_CPP_TARGET_ID_MASK          0x1f
 
+#define NFP_CPP_ATOMIC_RD(target, island) \
+	NFP_CPP_ISLAND_ID((target), 3, 0, (island))
+#define NFP_CPP_ATOMIC_WR(target, island) \
+	NFP_CPP_ISLAND_ID((target), 4, 0, (island))
+
 /**
  * NFP_CPP_ID() - pack target, token, and action into a CPP ID.
  * @target:     NFP CPP target id
-- 
cgit v1.2.3-59-g8ed1b


From 8c8e6406f593ce21694ce7da6d86af49b4610e69 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 25 May 2018 21:53:29 -0700
Subject: nfp: abm: add simple RED offload

Offload simple RED configurations.  For now support only DCTCP
like scenarios where min and max are the same.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/abm/main.c | 82 +++++++++++++++++++++++++++
 drivers/net/ethernet/netronome/nfp/abm/main.h | 10 ++++
 2 files changed, 92 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.c b/drivers/net/ethernet/netronome/nfp/abm/main.c
index 28a18ac62040..22251d88c958 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.c
@@ -38,6 +38,8 @@
 #include <linux/netdevice.h>
 #include <linux/rcupdate.h>
 #include <linux/slab.h>
+#include <net/pkt_cls.h>
+#include <net/pkt_sched.h>
 
 #include "../nfpcore/nfp.h"
 #include "../nfpcore/nfp_cpp.h"
@@ -55,6 +57,84 @@ static u32 nfp_abm_portid(enum nfp_repr_type rtype, unsigned int id)
 	       FIELD_PREP(NFP_ABM_PORTID_ID, id);
 }
 
+static void
+nfp_abm_red_destroy(struct net_device *netdev, struct nfp_abm_link *alink,
+		    u32 handle)
+{
+	struct nfp_port *port = nfp_port_from_netdev(netdev);
+
+	if (handle != alink->qdiscs[0].handle)
+		return;
+
+	alink->qdiscs[0].handle = TC_H_UNSPEC;
+	port->tc_offload_cnt = 0;
+	nfp_abm_ctrl_set_all_q_lvls(alink, ~0);
+}
+
+static int
+nfp_abm_red_replace(struct net_device *netdev, struct nfp_abm_link *alink,
+		    struct tc_red_qopt_offload *opt)
+{
+	struct nfp_port *port = nfp_port_from_netdev(netdev);
+	int err;
+
+	if (opt->set.min != opt->set.max || !opt->set.is_ecn) {
+		nfp_warn(alink->abm->app->cpp,
+			 "RED offload failed - unsupported parameters\n");
+		err = -EINVAL;
+		goto err_destroy;
+	}
+	err = nfp_abm_ctrl_set_all_q_lvls(alink, opt->set.min);
+	if (err)
+		goto err_destroy;
+
+	alink->qdiscs[0].handle = opt->handle;
+	port->tc_offload_cnt = 1;
+
+	return 0;
+err_destroy:
+	if (alink->qdiscs[0].handle != TC_H_UNSPEC)
+		nfp_abm_red_destroy(netdev, alink, alink->qdiscs[0].handle);
+	return err;
+}
+
+static int
+nfp_abm_setup_tc_red(struct net_device *netdev, struct nfp_abm_link *alink,
+		     struct tc_red_qopt_offload *opt)
+{
+	if (opt->parent != TC_H_ROOT)
+		return -EOPNOTSUPP;
+
+	switch (opt->command) {
+	case TC_RED_REPLACE:
+		return nfp_abm_red_replace(netdev, alink, opt);
+	case TC_RED_DESTROY:
+		nfp_abm_red_destroy(netdev, alink, opt->handle);
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int
+nfp_abm_setup_tc(struct nfp_app *app, struct net_device *netdev,
+		 enum tc_setup_type type, void *type_data)
+{
+	struct nfp_repr *repr = netdev_priv(netdev);
+	struct nfp_port *port;
+
+	port = nfp_port_from_netdev(netdev);
+	if (!port || port->type != NFP_PORT_PF_PORT)
+		return -EOPNOTSUPP;
+
+	switch (type) {
+	case TC_SETUP_QDISC_RED:
+		return nfp_abm_setup_tc_red(netdev, repr->app_priv, type_data);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
 static struct net_device *nfp_abm_repr_get(struct nfp_app *app, u32 port_id)
 {
 	enum nfp_repr_type rtype;
@@ -403,6 +483,8 @@ const struct nfp_app_type app_abm = {
 	.vnic_alloc	= nfp_abm_vnic_alloc,
 	.vnic_free	= nfp_abm_vnic_free,
 
+	.setup_tc	= nfp_abm_setup_tc,
+
 	.eswitch_mode_get	= nfp_abm_eswitch_mode_get,
 	.eswitch_mode_set	= nfp_abm_eswitch_mode_set,
 
diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.h b/drivers/net/ethernet/netronome/nfp/abm/main.h
index 1ac651cdc140..979f98fb808b 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.h
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.h
@@ -58,18 +58,28 @@ struct nfp_abm {
 	const struct nfp_rtsym *q_lvls;
 };
 
+/**
+ * struct nfp_red_qdisc - representation of single RED Qdisc
+ * @handle:	handle of currently offloaded RED Qdisc
+ */
+struct nfp_red_qdisc {
+	u32 handle;
+};
+
 /**
  * struct nfp_abm_link - port tuple of a ABM NIC
  * @abm:	back pointer to nfp_abm
  * @vnic:	data vNIC
  * @id:		id of the data vNIC
  * @queue_base:	id of base to host queue within PCIe (not QC idx)
+ * @qdiscs:	array of qdiscs
  */
 struct nfp_abm_link {
 	struct nfp_abm *abm;
 	struct nfp_net *vnic;
 	unsigned int id;
 	unsigned int queue_base;
+	struct nfp_red_qdisc qdiscs[1];
 };
 
 void nfp_abm_ctrl_read_params(struct nfp_abm_link *alink);
-- 
cgit v1.2.3-59-g8ed1b


From 6172abc1e2ea12743f368875576952ddf4ebe88c Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 25 May 2018 21:53:30 -0700
Subject: net: sched: add qstats.qlen to qlen

AFAICT struct gnet_stats_queue.qlen is not used in Qdiscs.
It may, however, be useful for offloads to report HW queue
length there.  Add that value to the result of qdisc_qlen_sum().

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 827a3711dc68..6488daa32f82 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -350,14 +350,14 @@ static inline int qdisc_qlen(const struct Qdisc *q)
 
 static inline int qdisc_qlen_sum(const struct Qdisc *q)
 {
-	__u32 qlen = 0;
+	__u32 qlen = q->qstats.qlen;
 	int i;
 
 	if (q->flags & TCQ_F_NOLOCK) {
 		for_each_possible_cpu(i)
 			qlen += per_cpu_ptr(q->cpu_qstats, i)->qlen;
 	} else {
-		qlen = q->q.qlen;
+		qlen += q->q.qlen;
 	}
 
 	return qlen;
-- 
cgit v1.2.3-59-g8ed1b


From cb89cac8e705b0405872a870842c6c2a0a0e5ec2 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 25 May 2018 21:53:31 -0700
Subject: nfp: abm: report statistics from RED offload

Report basic and extended RED statistics back to TC.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/abm/ctrl.c | 114 ++++++++++++++++++++++++++
 drivers/net/ethernet/netronome/nfp/abm/main.c |  92 +++++++++++++++++++++
 drivers/net/ethernet/netronome/nfp/abm/main.h |  38 +++++++++
 3 files changed, 244 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/abm/ctrl.c b/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
index 978884a0be19..d2d9ca7a727c 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
@@ -44,8 +44,15 @@
 
 #define NFP_QLVL_SYM_NAME	"_abi_nfd_out_q_lvls_%u"
 #define NFP_QLVL_STRIDE		16
+#define NFP_QLVL_BLOG_BYTES	0
+#define NFP_QLVL_BLOG_PKTS	4
 #define NFP_QLVL_THRS		8
 
+#define NFP_QMSTAT_SYM_NAME	"_abi_nfdqm%u_stats"
+#define NFP_QMSTAT_STRIDE	32
+#define NFP_QMSTAT_DROP		16
+#define NFP_QMSTAT_ECN		24
+
 static unsigned long long
 nfp_abm_q_lvl_thrs(struct nfp_abm_link *alink, unsigned int queue)
 {
@@ -53,6 +60,55 @@ nfp_abm_q_lvl_thrs(struct nfp_abm_link *alink, unsigned int queue)
 		(alink->queue_base + queue) * NFP_QLVL_STRIDE + NFP_QLVL_THRS;
 }
 
+static int
+nfp_abm_ctrl_stat(struct nfp_abm_link *alink, const struct nfp_rtsym *sym,
+		  unsigned int stride, unsigned int offset, unsigned int i,
+		  bool is_u64, u64 *res)
+{
+	struct nfp_cpp *cpp = alink->abm->app->cpp;
+	u32 val32, mur;
+	u64 val, addr;
+	int err;
+
+	mur = NFP_CPP_ATOMIC_RD(sym->target, sym->domain);
+
+	addr = sym->addr + (alink->queue_base + i) * stride + offset;
+	if (is_u64)
+		err = nfp_cpp_readq(cpp, mur, addr, &val);
+	else
+		err = nfp_cpp_readl(cpp, mur, addr, &val32);
+	if (err) {
+		nfp_err(cpp,
+			"RED offload reading stat failed on vNIC %d queue %d\n",
+			alink->id, i);
+		return err;
+	}
+
+	*res = is_u64 ? val : val32;
+	return 0;
+}
+
+static int
+nfp_abm_ctrl_stat_all(struct nfp_abm_link *alink, const struct nfp_rtsym *sym,
+		      unsigned int stride, unsigned int offset, bool is_u64,
+		      u64 *res)
+{
+	u64 val, sum = 0;
+	unsigned int i;
+	int err;
+
+	for (i = 0; i < alink->vnic->max_rx_rings; i++) {
+		err = nfp_abm_ctrl_stat(alink, sym, stride, offset, i,
+					is_u64, &val);
+		if (err)
+			return err;
+		sum += val;
+	}
+
+	*res = sum;
+	return 0;
+}
+
 static int
 nfp_abm_ctrl_set_q_lvl(struct nfp_abm_link *alink, unsigned int i, u32 val)
 {
@@ -86,6 +142,58 @@ int nfp_abm_ctrl_set_all_q_lvls(struct nfp_abm_link *alink, u32 val)
 	return 0;
 }
 
+int nfp_abm_ctrl_read_stats(struct nfp_abm_link *alink,
+			    struct nfp_alink_stats *stats)
+{
+	u64 pkts = 0, bytes = 0;
+	int i, err;
+
+	for (i = 0; i < alink->vnic->max_rx_rings; i++) {
+		pkts += nn_readq(alink->vnic, NFP_NET_CFG_RXR_STATS(i));
+		bytes += nn_readq(alink->vnic, NFP_NET_CFG_RXR_STATS(i) + 8);
+	}
+	stats->tx_pkts = pkts;
+	stats->tx_bytes = bytes;
+
+	err = nfp_abm_ctrl_stat_all(alink, alink->abm->q_lvls,
+				    NFP_QLVL_STRIDE, NFP_QLVL_BLOG_BYTES,
+				    false, &stats->backlog_bytes);
+	if (err)
+		return err;
+
+	err = nfp_abm_ctrl_stat_all(alink, alink->abm->q_lvls,
+				    NFP_QLVL_STRIDE, NFP_QLVL_BLOG_PKTS,
+				    false, &stats->backlog_pkts);
+	if (err)
+		return err;
+
+	err = nfp_abm_ctrl_stat_all(alink, alink->abm->qm_stats,
+				    NFP_QMSTAT_STRIDE, NFP_QMSTAT_DROP,
+				    true, &stats->drops);
+	if (err)
+		return err;
+
+	return nfp_abm_ctrl_stat_all(alink, alink->abm->qm_stats,
+				     NFP_QMSTAT_STRIDE, NFP_QMSTAT_ECN,
+				     true, &stats->overlimits);
+}
+
+int nfp_abm_ctrl_read_xstats(struct nfp_abm_link *alink,
+			     struct nfp_alink_xstats *xstats)
+{
+	int err;
+
+	err = nfp_abm_ctrl_stat_all(alink, alink->abm->qm_stats,
+				    NFP_QMSTAT_STRIDE, NFP_QMSTAT_DROP,
+				    true, &xstats->pdrop);
+	if (err)
+		return err;
+
+	return nfp_abm_ctrl_stat_all(alink, alink->abm->qm_stats,
+				     NFP_QMSTAT_STRIDE, NFP_QMSTAT_ECN,
+				     true, &xstats->ecn_marked);
+}
+
 int nfp_abm_ctrl_qm_enable(struct nfp_abm *abm)
 {
 	return nfp_mbox_cmd(abm->app->pf, NFP_MBOX_PCIE_ABM_ENABLE,
@@ -147,5 +255,11 @@ int nfp_abm_ctrl_find_addrs(struct nfp_abm *abm)
 		return PTR_ERR(sym);
 	abm->q_lvls = sym;
 
+	snprintf(pf_symbol, sizeof(pf_symbol), NFP_QMSTAT_SYM_NAME, pf_id);
+	sym = nfp_abm_ctrl_find_q_rtsym(pf, pf_symbol, NFP_QMSTAT_STRIDE);
+	if (IS_ERR(sym))
+		return PTR_ERR(sym);
+	abm->qm_stats = sym;
+
 	return 0;
 }
diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.c b/drivers/net/ethernet/netronome/nfp/abm/main.c
index 22251d88c958..d0c21899a8b7 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.c
@@ -40,6 +40,7 @@
 #include <linux/slab.h>
 #include <net/pkt_cls.h>
 #include <net/pkt_sched.h>
+#include <net/red.h>
 
 #include "../nfpcore/nfp.h"
 #include "../nfpcore/nfp_cpp.h"
@@ -57,6 +58,23 @@ static u32 nfp_abm_portid(enum nfp_repr_type rtype, unsigned int id)
 	       FIELD_PREP(NFP_ABM_PORTID_ID, id);
 }
 
+static int nfp_abm_reset_stats(struct nfp_abm_link *alink)
+{
+	int err;
+
+	err = nfp_abm_ctrl_read_stats(alink, &alink->qdiscs[0].stats);
+	if (err)
+		return err;
+	alink->qdiscs[0].stats.backlog_pkts = 0;
+	alink->qdiscs[0].stats.backlog_bytes = 0;
+
+	err = nfp_abm_ctrl_read_xstats(alink, &alink->qdiscs[0].xstats);
+	if (err)
+		return err;
+
+	return 0;
+}
+
 static void
 nfp_abm_red_destroy(struct net_device *netdev, struct nfp_abm_link *alink,
 		    u32 handle)
@@ -88,16 +106,86 @@ nfp_abm_red_replace(struct net_device *netdev, struct nfp_abm_link *alink,
 	if (err)
 		goto err_destroy;
 
+	/* Reset stats only on new qdisc */
+	if (alink->qdiscs[0].handle != opt->handle) {
+		err = nfp_abm_reset_stats(alink);
+		if (err)
+			goto err_destroy;
+	}
+
 	alink->qdiscs[0].handle = opt->handle;
 	port->tc_offload_cnt = 1;
 
 	return 0;
 err_destroy:
+	/* If the qdisc keeps on living, but we can't offload undo changes */
+	if (alink->qdiscs[0].handle == opt->handle) {
+		opt->set.qstats->qlen -= alink->qdiscs[0].stats.backlog_pkts;
+		opt->set.qstats->backlog -=
+			alink->qdiscs[0].stats.backlog_bytes;
+	}
 	if (alink->qdiscs[0].handle != TC_H_UNSPEC)
 		nfp_abm_red_destroy(netdev, alink, alink->qdiscs[0].handle);
 	return err;
 }
 
+static void
+nfp_abm_update_stats(struct nfp_alink_stats *new, struct nfp_alink_stats *old,
+		     struct tc_qopt_offload_stats *stats)
+{
+	_bstats_update(stats->bstats, new->tx_bytes - old->tx_bytes,
+		       new->tx_pkts - old->tx_pkts);
+	stats->qstats->qlen += new->backlog_pkts - old->backlog_pkts;
+	stats->qstats->backlog += new->backlog_bytes - old->backlog_bytes;
+	stats->qstats->overlimits += new->overlimits - old->overlimits;
+	stats->qstats->drops += new->drops - old->drops;
+}
+
+static int
+nfp_abm_red_stats(struct nfp_abm_link *alink, struct tc_red_qopt_offload *opt)
+{
+	struct nfp_alink_stats *prev_stats;
+	struct nfp_alink_stats stats;
+	int err;
+
+	if (alink->qdiscs[0].handle != opt->handle)
+		return -EOPNOTSUPP;
+	prev_stats = &alink->qdiscs[0].stats;
+
+	err = nfp_abm_ctrl_read_stats(alink, &stats);
+	if (err)
+		return err;
+
+	nfp_abm_update_stats(&stats, prev_stats, &opt->stats);
+
+	*prev_stats = stats;
+
+	return 0;
+}
+
+static int
+nfp_abm_red_xstats(struct nfp_abm_link *alink, struct tc_red_qopt_offload *opt)
+{
+	struct nfp_alink_xstats *prev_xstats;
+	struct nfp_alink_xstats xstats;
+	int err;
+
+	if (alink->qdiscs[0].handle != opt->handle)
+		return -EOPNOTSUPP;
+	prev_xstats = &alink->qdiscs[0].xstats;
+
+	err = nfp_abm_ctrl_read_xstats(alink, &xstats);
+	if (err)
+		return err;
+
+	opt->xstats->forced_mark += xstats.ecn_marked - prev_xstats->ecn_marked;
+	opt->xstats->pdrop += xstats.pdrop - prev_xstats->pdrop;
+
+	*prev_xstats = xstats;
+
+	return 0;
+}
+
 static int
 nfp_abm_setup_tc_red(struct net_device *netdev, struct nfp_abm_link *alink,
 		     struct tc_red_qopt_offload *opt)
@@ -111,6 +199,10 @@ nfp_abm_setup_tc_red(struct net_device *netdev, struct nfp_abm_link *alink,
 	case TC_RED_DESTROY:
 		nfp_abm_red_destroy(netdev, alink, opt->handle);
 		return 0;
+	case TC_RED_STATS:
+		return nfp_abm_red_stats(alink, opt);
+	case TC_RED_XSTATS:
+		return nfp_abm_red_xstats(alink, opt);
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.h b/drivers/net/ethernet/netronome/nfp/abm/main.h
index 979f98fb808b..93a3b79cf468 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.h
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.h
@@ -50,20 +50,54 @@ struct nfp_net;
  * @eswitch_mode:	devlink eswitch mode, advanced functions only visible
  *			in switchdev mode
  * @q_lvls:	queue level control area
+ * @qm_stats:	queue statistics symbol
  */
 struct nfp_abm {
 	struct nfp_app *app;
 	unsigned int pf_id;
 	enum devlink_eswitch_mode eswitch_mode;
 	const struct nfp_rtsym *q_lvls;
+	const struct nfp_rtsym *qm_stats;
+};
+
+/**
+ * struct nfp_alink_stats - ABM NIC statistics
+ * @tx_pkts:		number of TXed packets
+ * @tx_bytes:		number of TXed bytes
+ * @backlog_pkts:	momentary backlog length (packets)
+ * @backlog_bytes:	momentary backlog length (bytes)
+ * @overlimits:		number of ECN marked TXed packets (accumulative)
+ * @drops:		number of tail-dropped packets (accumulative)
+ */
+struct nfp_alink_stats {
+	u64 tx_pkts;
+	u64 tx_bytes;
+	u64 backlog_pkts;
+	u64 backlog_bytes;
+	u64 overlimits;
+	u64 drops;
+};
+
+/**
+ * struct nfp_alink_xstats - extended ABM NIC statistics
+ * @ecn_marked:		number of ECN marked TXed packets
+ * @pdrop:		number of hard drops due to queue limit
+ */
+struct nfp_alink_xstats {
+	u64 ecn_marked;
+	u64 pdrop;
 };
 
 /**
  * struct nfp_red_qdisc - representation of single RED Qdisc
  * @handle:	handle of currently offloaded RED Qdisc
+ * @stats:	statistics from last refresh
+ * @xstats:	base of extended statistics
  */
 struct nfp_red_qdisc {
 	u32 handle;
+	struct nfp_alink_stats stats;
+	struct nfp_alink_xstats xstats;
 };
 
 /**
@@ -85,6 +119,10 @@ struct nfp_abm_link {
 void nfp_abm_ctrl_read_params(struct nfp_abm_link *alink);
 int nfp_abm_ctrl_find_addrs(struct nfp_abm *abm);
 int nfp_abm_ctrl_set_all_q_lvls(struct nfp_abm_link *alink, u32 val);
+int nfp_abm_ctrl_read_stats(struct nfp_abm_link *alink,
+			    struct nfp_alink_stats *stats);
+int nfp_abm_ctrl_read_xstats(struct nfp_abm_link *alink,
+			     struct nfp_alink_xstats *xstats);
 int nfp_abm_ctrl_qm_enable(struct nfp_abm *abm);
 int nfp_abm_ctrl_qm_disable(struct nfp_abm *abm);
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 21f31bc02938dc4d5aa5803d03f56e8681a30977 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 25 May 2018 21:53:32 -0700
Subject: nfp: allow apps to add extra stats to ports

Allow nfp apps to add extra ethtool stats.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_app.c       | 22 ++++++++++++++++++++++
 drivers/net/ethernet/netronome/nfp/nfp_app.h       | 13 +++++++++++++
 .../net/ethernet/netronome/nfp/nfp_net_ethtool.c   | 10 ++++++++--
 drivers/net/ethernet/netronome/nfp/nfp_port.h      |  2 ++
 4 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.c b/drivers/net/ethernet/netronome/nfp/nfp_app.c
index c9d8a7ab311e..f28b244f4ee7 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_app.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_app.c
@@ -43,6 +43,7 @@
 #include "nfp_main.h"
 #include "nfp_net.h"
 #include "nfp_net_repr.h"
+#include "nfp_port.h"
 
 static const struct nfp_app_type *apps[] = {
 	[NFP_APP_CORE_NIC]	= &app_nic,
@@ -85,6 +86,27 @@ const char *nfp_app_mip_name(struct nfp_app *app)
 	return nfp_mip_name(app->pf->mip);
 }
 
+u64 *nfp_app_port_get_stats(struct nfp_port *port, u64 *data)
+{
+	if (!port || !port->app || !port->app->type->port_get_stats)
+		return data;
+	return port->app->type->port_get_stats(port->app, port, data);
+}
+
+int nfp_app_port_get_stats_count(struct nfp_port *port)
+{
+	if (!port || !port->app || !port->app->type->port_get_stats_count)
+		return 0;
+	return port->app->type->port_get_stats_count(port->app, port);
+}
+
+u8 *nfp_app_port_get_stats_strings(struct nfp_port *port, u8 *data)
+{
+	if (!port || !port->app || !port->app->type->port_get_stats_strings)
+		return data;
+	return port->app->type->port_get_stats_strings(port->app, port, data);
+}
+
 struct sk_buff *
 nfp_app_ctrl_msg_alloc(struct nfp_app *app, unsigned int size, gfp_t priority)
 {
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.h b/drivers/net/ethernet/netronome/nfp/nfp_app.h
index 23b99a4e05c2..ee74caacb015 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_app.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_app.h
@@ -90,6 +90,9 @@ extern const struct nfp_app_type app_abm;
  * @repr_stop:	representor netdev stop callback
  * @check_mtu:	MTU change request on a netdev (verify it is valid)
  * @repr_change_mtu:	MTU change request on repr (make and verify change)
+ * @port_get_stats:		get extra ethtool statistics for a port
+ * @port_get_stats_count:	get count of extra statistics for a port
+ * @port_get_stats_strings:	get strings for extra statistics
  * @start:	start application logic
  * @stop:	stop application logic
  * @ctrl_msg_rx:    control message handler
@@ -132,6 +135,12 @@ struct nfp_app_type {
 	int (*repr_change_mtu)(struct nfp_app *app, struct net_device *netdev,
 			       int new_mtu);
 
+	u64 *(*port_get_stats)(struct nfp_app *app,
+			       struct nfp_port *port, u64 *data);
+	int (*port_get_stats_count)(struct nfp_app *app, struct nfp_port *port);
+	u8 *(*port_get_stats_strings)(struct nfp_app *app,
+				      struct nfp_port *port, u8 *data);
+
 	int (*start)(struct nfp_app *app);
 	void (*stop)(struct nfp_app *app);
 
@@ -404,6 +413,10 @@ static inline struct net_device *nfp_app_repr_get(struct nfp_app *app, u32 id)
 
 struct nfp_app *nfp_app_from_netdev(struct net_device *netdev);
 
+u64 *nfp_app_port_get_stats(struct nfp_port *port, u64 *data);
+int nfp_app_port_get_stats_count(struct nfp_port *port);
+u8 *nfp_app_port_get_stats_strings(struct nfp_port *port, u8 *data);
+
 struct nfp_reprs *
 nfp_reprs_get_locked(struct nfp_app *app, enum nfp_repr_type type);
 struct nfp_reprs *
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
index c9016419bfa0..26d1cc4e2906 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
@@ -437,7 +437,7 @@ static int nfp_net_set_ringparam(struct net_device *netdev,
 	return nfp_net_set_ring_size(nn, rxd_cnt, txd_cnt);
 }
 
-static __printf(2, 3) u8 *nfp_pr_et(u8 *data, const char *fmt, ...)
+__printf(2, 3) u8 *nfp_pr_et(u8 *data, const char *fmt, ...)
 {
 	va_list args;
 
@@ -637,6 +637,7 @@ static void nfp_net_get_strings(struct net_device *netdev,
 						     nn->dp.num_tx_rings,
 						     false);
 		data = nfp_mac_get_stats_strings(netdev, data);
+		data = nfp_app_port_get_stats_strings(nn->port, data);
 		break;
 	}
 }
@@ -651,6 +652,7 @@ nfp_net_get_stats(struct net_device *netdev, struct ethtool_stats *stats,
 	data = nfp_vnic_get_hw_stats(data, nn->dp.ctrl_bar,
 				     nn->dp.num_rx_rings, nn->dp.num_tx_rings);
 	data = nfp_mac_get_stats(netdev, data);
+	data = nfp_app_port_get_stats(nn->port, data);
 }
 
 static int nfp_net_get_sset_count(struct net_device *netdev, int sset)
@@ -662,7 +664,8 @@ static int nfp_net_get_sset_count(struct net_device *netdev, int sset)
 		return nfp_vnic_get_sw_stats_count(netdev) +
 		       nfp_vnic_get_hw_stats_count(nn->dp.num_rx_rings,
 						   nn->dp.num_tx_rings) +
-		       nfp_mac_get_stats_count(netdev);
+		       nfp_mac_get_stats_count(netdev) +
+		       nfp_app_port_get_stats_count(nn->port);
 	default:
 		return -EOPNOTSUPP;
 	}
@@ -679,6 +682,7 @@ static void nfp_port_get_strings(struct net_device *netdev,
 			data = nfp_vnic_get_hw_stats_strings(data, 0, 0, true);
 		else
 			data = nfp_mac_get_stats_strings(netdev, data);
+		data = nfp_app_port_get_stats_strings(port, data);
 		break;
 	}
 }
@@ -693,6 +697,7 @@ nfp_port_get_stats(struct net_device *netdev, struct ethtool_stats *stats,
 		data = nfp_vnic_get_hw_stats(data, port->vnic, 0, 0);
 	else
 		data = nfp_mac_get_stats(netdev, data);
+	data = nfp_app_port_get_stats(port, data);
 }
 
 static int nfp_port_get_sset_count(struct net_device *netdev, int sset)
@@ -706,6 +711,7 @@ static int nfp_port_get_sset_count(struct net_device *netdev, int sset)
 			count = nfp_vnic_get_hw_stats_count(0, 0);
 		else
 			count = nfp_mac_get_stats_count(netdev);
+		count += nfp_app_port_get_stats_count(port);
 		return count;
 	default:
 		return -EOPNOTSUPP;
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_port.h b/drivers/net/ethernet/netronome/nfp/nfp_port.h
index 18666750456e..51f10ae2d53e 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_port.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_port.h
@@ -122,6 +122,8 @@ struct nfp_port {
 extern const struct ethtool_ops nfp_port_ethtool_ops;
 extern const struct switchdev_ops nfp_port_switchdev_ops;
 
+__printf(2, 3) u8 *nfp_pr_et(u8 *data, const char *fmt, ...);
+
 int nfp_port_setup_tc(struct net_device *netdev, enum tc_setup_type type,
 		      void *type_data);
 
-- 
cgit v1.2.3-59-g8ed1b


From 0a8b7019bbcb219ef941f877650f9c09fa331eef Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 25 May 2018 21:53:33 -0700
Subject: nfp: abm: expose the internal stats in ethtool

There is a handful of statistics exposing some internal details
of the implementation.  Expose those via ethtool.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/abm/ctrl.c | 22 ++++++++++++
 drivers/net/ethernet/netronome/nfp/abm/main.c | 51 +++++++++++++++++++++++++++
 drivers/net/ethernet/netronome/nfp/abm/main.h |  2 ++
 3 files changed, 75 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/abm/ctrl.c b/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
index d2d9ca7a727c..79fc9147c012 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
@@ -50,6 +50,8 @@
 
 #define NFP_QMSTAT_SYM_NAME	"_abi_nfdqm%u_stats"
 #define NFP_QMSTAT_STRIDE	32
+#define NFP_QMSTAT_NON_STO	0
+#define NFP_QMSTAT_STO		8
 #define NFP_QMSTAT_DROP		16
 #define NFP_QMSTAT_ECN		24
 
@@ -142,6 +144,26 @@ int nfp_abm_ctrl_set_all_q_lvls(struct nfp_abm_link *alink, u32 val)
 	return 0;
 }
 
+u64 nfp_abm_ctrl_stat_non_sto(struct nfp_abm_link *alink, unsigned int i)
+{
+	u64 val;
+
+	if (nfp_abm_ctrl_stat(alink, alink->abm->qm_stats, NFP_QMSTAT_STRIDE,
+			      NFP_QMSTAT_NON_STO, i, true, &val))
+		return 0;
+	return val;
+}
+
+u64 nfp_abm_ctrl_stat_sto(struct nfp_abm_link *alink, unsigned int i)
+{
+	u64 val;
+
+	if (nfp_abm_ctrl_stat(alink, alink->abm->qm_stats, NFP_QMSTAT_STRIDE,
+			      NFP_QMSTAT_STO, i, true, &val))
+		return 0;
+	return val;
+}
+
 int nfp_abm_ctrl_read_stats(struct nfp_abm_link *alink,
 			    struct nfp_alink_stats *stats)
 {
diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.c b/drivers/net/ethernet/netronome/nfp/abm/main.c
index d0c21899a8b7..4e89159f13d3 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.c
@@ -497,6 +497,53 @@ static void nfp_abm_vnic_free(struct nfp_app *app, struct nfp_net *nn)
 	kfree(alink);
 }
 
+static u64 *
+nfp_abm_port_get_stats(struct nfp_app *app, struct nfp_port *port, u64 *data)
+{
+	struct nfp_repr *repr = netdev_priv(port->netdev);
+	struct nfp_abm_link *alink;
+	unsigned int i;
+
+	if (port->type != NFP_PORT_PF_PORT)
+		return data;
+	alink = repr->app_priv;
+	for (i = 0; i < alink->vnic->dp.num_r_vecs; i++) {
+		*data++ = nfp_abm_ctrl_stat_non_sto(alink, i);
+		*data++ = nfp_abm_ctrl_stat_sto(alink, i);
+	}
+	return data;
+}
+
+static int
+nfp_abm_port_get_stats_count(struct nfp_app *app, struct nfp_port *port)
+{
+	struct nfp_repr *repr = netdev_priv(port->netdev);
+	struct nfp_abm_link *alink;
+
+	if (port->type != NFP_PORT_PF_PORT)
+		return 0;
+	alink = repr->app_priv;
+	return alink->vnic->dp.num_r_vecs * 2;
+}
+
+static u8 *
+nfp_abm_port_get_stats_strings(struct nfp_app *app, struct nfp_port *port,
+			       u8 *data)
+{
+	struct nfp_repr *repr = netdev_priv(port->netdev);
+	struct nfp_abm_link *alink;
+	unsigned int i;
+
+	if (port->type != NFP_PORT_PF_PORT)
+		return data;
+	alink = repr->app_priv;
+	for (i = 0; i < alink->vnic->dp.num_r_vecs; i++) {
+		data = nfp_pr_et(data, "q%u_no_wait", i);
+		data = nfp_pr_et(data, "q%u_delayed", i);
+	}
+	return data;
+}
+
 static int nfp_abm_init(struct nfp_app *app)
 {
 	struct nfp_pf *pf = app->pf;
@@ -575,6 +622,10 @@ const struct nfp_app_type app_abm = {
 	.vnic_alloc	= nfp_abm_vnic_alloc,
 	.vnic_free	= nfp_abm_vnic_free,
 
+	.port_get_stats		= nfp_abm_port_get_stats,
+	.port_get_stats_count	= nfp_abm_port_get_stats_count,
+	.port_get_stats_strings	= nfp_abm_port_get_stats_strings,
+
 	.setup_tc	= nfp_abm_setup_tc,
 
 	.eswitch_mode_get	= nfp_abm_eswitch_mode_get,
diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.h b/drivers/net/ethernet/netronome/nfp/abm/main.h
index 93a3b79cf468..09fd15847961 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.h
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.h
@@ -123,6 +123,8 @@ int nfp_abm_ctrl_read_stats(struct nfp_abm_link *alink,
 			    struct nfp_alink_stats *stats);
 int nfp_abm_ctrl_read_xstats(struct nfp_abm_link *alink,
 			     struct nfp_alink_xstats *xstats);
+u64 nfp_abm_ctrl_stat_non_sto(struct nfp_abm_link *alink, unsigned int i);
+u64 nfp_abm_ctrl_stat_sto(struct nfp_abm_link *alink, unsigned int i);
 int nfp_abm_ctrl_qm_enable(struct nfp_abm *abm);
 int nfp_abm_ctrl_qm_disable(struct nfp_abm *abm);
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 2ef3c253f15a7784ce541417d5663d2d1e751231 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 25 May 2018 21:53:34 -0700
Subject: nfp: abm: expose all PF queues

Allocate the PF representor as multi-queue to allow setting
the configuration per-queue.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/abm/main.c     | 10 +++++++---
 drivers/net/ethernet/netronome/nfp/nfp_net_repr.c |  5 +++--
 drivers/net/ethernet/netronome/nfp/nfp_net_repr.h |  7 ++++++-
 3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.c b/drivers/net/ethernet/netronome/nfp/abm/main.c
index 4e89159f13d3..ef77d7b0d99d 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.c
@@ -255,14 +255,18 @@ nfp_abm_spawn_repr(struct nfp_app *app, struct nfp_abm_link *alink,
 	struct nfp_reprs *reprs;
 	struct nfp_repr *repr;
 	struct nfp_port *port;
+	unsigned int txqs;
 	int err;
 
-	if (ptype == NFP_PORT_PHYS_PORT)
+	if (ptype == NFP_PORT_PHYS_PORT) {
 		rtype = NFP_REPR_TYPE_PHYS_PORT;
-	else
+		txqs = 1;
+	} else {
 		rtype = NFP_REPR_TYPE_PF;
+		txqs = alink->vnic->max_rx_rings;
+	}
 
-	netdev = nfp_repr_alloc(app);
+	netdev = nfp_repr_alloc_mqs(app, txqs, 1);
 	if (!netdev)
 		return -ENOMEM;
 	repr = netdev_priv(netdev);
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
index 117eca6819de..d7b712f6362f 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
@@ -360,12 +360,13 @@ void nfp_repr_free(struct net_device *netdev)
 	__nfp_repr_free(netdev_priv(netdev));
 }
 
-struct net_device *nfp_repr_alloc(struct nfp_app *app)
+struct net_device *
+nfp_repr_alloc_mqs(struct nfp_app *app, unsigned int txqs, unsigned int rxqs)
 {
 	struct net_device *netdev;
 	struct nfp_repr *repr;
 
-	netdev = alloc_etherdev(sizeof(*repr));
+	netdev = alloc_etherdev_mqs(sizeof(*repr), txqs, rxqs);
 	if (!netdev)
 		return NULL;
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h
index 8366e4f3c623..1bf2b18109ab 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.h
@@ -126,7 +126,8 @@ int nfp_repr_init(struct nfp_app *app, struct net_device *netdev,
 		  u32 cmsg_port_id, struct nfp_port *port,
 		  struct net_device *pf_netdev);
 void nfp_repr_free(struct net_device *netdev);
-struct net_device *nfp_repr_alloc(struct nfp_app *app);
+struct net_device *
+nfp_repr_alloc_mqs(struct nfp_app *app, unsigned int txqs, unsigned int rxqs);
 void nfp_repr_clean_and_free(struct nfp_repr *repr);
 void nfp_reprs_clean_and_free(struct nfp_app *app, struct nfp_reprs *reprs);
 void nfp_reprs_clean_and_free_by_type(struct nfp_app *app,
@@ -134,4 +135,8 @@ void nfp_reprs_clean_and_free_by_type(struct nfp_app *app,
 struct nfp_reprs *nfp_reprs_alloc(unsigned int num_reprs);
 int nfp_reprs_resync_phys_ports(struct nfp_app *app);
 
+static inline struct net_device *nfp_repr_alloc(struct nfp_app *app)
+{
+	return nfp_repr_alloc_mqs(app, 1, 1);
+}
 #endif /* NFP_NET_REPR_H */
-- 
cgit v1.2.3-59-g8ed1b


From f971b132300fb0df63a8de631947adc74a7b3db1 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 25 May 2018 21:53:35 -0700
Subject: net: sched: mq: add simple offload notification

mq offload is trivial, we just need to let the device know
that the root qdisc is mq.  Alternative approach would be
to export qdisc_lookup() and make drivers check the root
type themselves, but notification via ndo_setup_tc is more
in line with other qdiscs.

Note that mq doesn't hold any stats on it's own, it just
adds up stats of its children.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  1 +
 include/net/pkt_cls.h     | 10 ++++++++++
 net/sched/sch_mq.c        | 19 +++++++++++++++++++
 3 files changed, 30 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f45b1a4e37ab..6b863ed3174a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -791,6 +791,7 @@ enum tc_setup_type {
 	TC_SETUP_QDISC_CBS,
 	TC_SETUP_QDISC_RED,
 	TC_SETUP_QDISC_PRIO,
+	TC_SETUP_QDISC_MQ,
 };
 
 /* These structures hold the attributes of bpf state that are being passed
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index f3ec43725724..942f839dbca4 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -778,6 +778,16 @@ struct tc_qopt_offload_stats {
 	struct gnet_stats_queue *qstats;
 };
 
+enum tc_mq_command {
+	TC_MQ_CREATE,
+	TC_MQ_DESTROY,
+};
+
+struct tc_mq_qopt_offload {
+	enum tc_mq_command command;
+	u32 handle;
+};
+
 enum tc_red_command {
 	TC_RED_REPLACE,
 	TC_RED_DESTROY,
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index f062a18e9162..6ccf6daa2503 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -16,6 +16,7 @@
 #include <linux/errno.h>
 #include <linux/skbuff.h>
 #include <net/netlink.h>
+#include <net/pkt_cls.h>
 #include <net/pkt_sched.h>
 #include <net/sch_generic.h>
 
@@ -23,12 +24,28 @@ struct mq_sched {
 	struct Qdisc		**qdiscs;
 };
 
+static int mq_offload(struct Qdisc *sch, enum tc_mq_command cmd)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct tc_mq_qopt_offload opt = {
+		.command = cmd,
+		.handle = sch->handle,
+	};
+
+	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+		return -EOPNOTSUPP;
+
+	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, &opt);
+}
+
 static void mq_destroy(struct Qdisc *sch)
 {
 	struct net_device *dev = qdisc_dev(sch);
 	struct mq_sched *priv = qdisc_priv(sch);
 	unsigned int ntx;
 
+	mq_offload(sch, TC_MQ_DESTROY);
+
 	if (!priv->qdiscs)
 		return;
 	for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++)
@@ -70,6 +87,8 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt,
 	}
 
 	sch->flags |= TCQ_F_MQROOT;
+
+	mq_offload(sch, TC_MQ_CREATE);
 	return 0;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 674cb229b61039c4763838496e9241f4e6f145a8 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 25 May 2018 21:53:36 -0700
Subject: nfp: abm: multi-queue RED offload

Add support for MQ offload and setting RED parameters
on queue-by-queue basis.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/abm/ctrl.c |  50 ++++++-
 drivers/net/ethernet/netronome/nfp/abm/main.c | 192 ++++++++++++++++++++------
 drivers/net/ethernet/netronome/nfp/abm/main.h |  14 +-
 3 files changed, 208 insertions(+), 48 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/abm/ctrl.c b/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
index 79fc9147c012..b157ccd8c80f 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/ctrl.c
@@ -111,8 +111,7 @@ nfp_abm_ctrl_stat_all(struct nfp_abm_link *alink, const struct nfp_rtsym *sym,
 	return 0;
 }
 
-static int
-nfp_abm_ctrl_set_q_lvl(struct nfp_abm_link *alink, unsigned int i, u32 val)
+int nfp_abm_ctrl_set_q_lvl(struct nfp_abm_link *alink, unsigned int i, u32 val)
 {
 	struct nfp_cpp *cpp = alink->abm->app->cpp;
 	u32 muw;
@@ -164,6 +163,37 @@ u64 nfp_abm_ctrl_stat_sto(struct nfp_abm_link *alink, unsigned int i)
 	return val;
 }
 
+int nfp_abm_ctrl_read_q_stats(struct nfp_abm_link *alink, unsigned int i,
+			      struct nfp_alink_stats *stats)
+{
+	int err;
+
+	stats->tx_pkts = nn_readq(alink->vnic, NFP_NET_CFG_RXR_STATS(i));
+	stats->tx_bytes = nn_readq(alink->vnic, NFP_NET_CFG_RXR_STATS(i) + 8);
+
+	err = nfp_abm_ctrl_stat(alink, alink->abm->q_lvls,
+				NFP_QLVL_STRIDE, NFP_QLVL_BLOG_BYTES,
+				i, false, &stats->backlog_bytes);
+	if (err)
+		return err;
+
+	err = nfp_abm_ctrl_stat(alink, alink->abm->q_lvls,
+				NFP_QLVL_STRIDE, NFP_QLVL_BLOG_PKTS,
+				i, false, &stats->backlog_pkts);
+	if (err)
+		return err;
+
+	err = nfp_abm_ctrl_stat(alink, alink->abm->qm_stats,
+				NFP_QMSTAT_STRIDE, NFP_QMSTAT_DROP,
+				i, true, &stats->drops);
+	if (err)
+		return err;
+
+	return nfp_abm_ctrl_stat(alink, alink->abm->qm_stats,
+				 NFP_QMSTAT_STRIDE, NFP_QMSTAT_ECN,
+				 i, true, &stats->overlimits);
+}
+
 int nfp_abm_ctrl_read_stats(struct nfp_abm_link *alink,
 			    struct nfp_alink_stats *stats)
 {
@@ -200,6 +230,22 @@ int nfp_abm_ctrl_read_stats(struct nfp_abm_link *alink,
 				     true, &stats->overlimits);
 }
 
+int nfp_abm_ctrl_read_q_xstats(struct nfp_abm_link *alink, unsigned int i,
+			       struct nfp_alink_xstats *xstats)
+{
+	int err;
+
+	err = nfp_abm_ctrl_stat(alink, alink->abm->qm_stats,
+				NFP_QMSTAT_STRIDE, NFP_QMSTAT_DROP,
+				i, true, &xstats->pdrop);
+	if (err)
+		return err;
+
+	return nfp_abm_ctrl_stat(alink, alink->abm->qm_stats,
+				 NFP_QMSTAT_STRIDE, NFP_QMSTAT_ECN,
+				 i, true, &xstats->ecn_marked);
+}
+
 int nfp_abm_ctrl_read_xstats(struct nfp_abm_link *alink,
 			     struct nfp_alink_xstats *xstats)
 {
diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.c b/drivers/net/ethernet/netronome/nfp/abm/main.c
index ef77d7b0d99d..21d5af1fb061 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.c
@@ -58,43 +58,77 @@ static u32 nfp_abm_portid(enum nfp_repr_type rtype, unsigned int id)
 	       FIELD_PREP(NFP_ABM_PORTID_ID, id);
 }
 
-static int nfp_abm_reset_stats(struct nfp_abm_link *alink)
+static int
+__nfp_abm_reset_root(struct net_device *netdev, struct nfp_abm_link *alink,
+		     u32 handle, unsigned int qs, u32 init_val)
 {
-	int err;
+	struct nfp_port *port = nfp_port_from_netdev(netdev);
+	int ret;
 
-	err = nfp_abm_ctrl_read_stats(alink, &alink->qdiscs[0].stats);
-	if (err)
-		return err;
-	alink->qdiscs[0].stats.backlog_pkts = 0;
-	alink->qdiscs[0].stats.backlog_bytes = 0;
+	ret = nfp_abm_ctrl_set_all_q_lvls(alink, init_val);
+	memset(alink->qdiscs, 0, sizeof(*alink->qdiscs) * alink->num_qdiscs);
 
-	err = nfp_abm_ctrl_read_xstats(alink, &alink->qdiscs[0].xstats);
-	if (err)
-		return err;
+	alink->parent = handle;
+	alink->num_qdiscs = qs;
+	port->tc_offload_cnt = qs;
 
-	return 0;
+	return ret;
+}
+
+static void
+nfp_abm_reset_root(struct net_device *netdev, struct nfp_abm_link *alink,
+		   u32 handle, unsigned int qs)
+{
+	__nfp_abm_reset_root(netdev, alink, handle, qs, ~0);
+}
+
+static int
+nfp_abm_red_find(struct nfp_abm_link *alink, struct tc_red_qopt_offload *opt)
+{
+	unsigned int i = TC_H_MIN(opt->parent) - 1;
+
+	if (opt->parent == TC_H_ROOT)
+		i = 0;
+	else if (TC_H_MAJ(alink->parent) == TC_H_MAJ(opt->parent))
+		i = TC_H_MIN(opt->parent) - 1;
+	else
+		return -EOPNOTSUPP;
+
+	if (i >= alink->num_qdiscs || opt->handle != alink->qdiscs[i].handle)
+		return -EOPNOTSUPP;
+
+	return i;
 }
 
 static void
 nfp_abm_red_destroy(struct net_device *netdev, struct nfp_abm_link *alink,
 		    u32 handle)
 {
-	struct nfp_port *port = nfp_port_from_netdev(netdev);
+	unsigned int i;
 
-	if (handle != alink->qdiscs[0].handle)
+	for (i = 0; i < alink->num_qdiscs; i++)
+		if (handle == alink->qdiscs[i].handle)
+			break;
+	if (i == alink->num_qdiscs)
 		return;
 
-	alink->qdiscs[0].handle = TC_H_UNSPEC;
-	port->tc_offload_cnt = 0;
-	nfp_abm_ctrl_set_all_q_lvls(alink, ~0);
+	if (alink->parent == TC_H_ROOT) {
+		nfp_abm_reset_root(netdev, alink, TC_H_ROOT, 0);
+	} else {
+		nfp_abm_ctrl_set_q_lvl(alink, i, ~0);
+		memset(&alink->qdiscs[i], 0, sizeof(*alink->qdiscs));
+	}
 }
 
 static int
 nfp_abm_red_replace(struct net_device *netdev, struct nfp_abm_link *alink,
 		    struct tc_red_qopt_offload *opt)
 {
-	struct nfp_port *port = nfp_port_from_netdev(netdev);
-	int err;
+	bool existing;
+	int i, err;
+
+	i = nfp_abm_red_find(alink, opt);
+	existing = i >= 0;
 
 	if (opt->set.min != opt->set.max || !opt->set.is_ecn) {
 		nfp_warn(alink->abm->app->cpp,
@@ -102,30 +136,62 @@ nfp_abm_red_replace(struct net_device *netdev, struct nfp_abm_link *alink,
 		err = -EINVAL;
 		goto err_destroy;
 	}
-	err = nfp_abm_ctrl_set_all_q_lvls(alink, opt->set.min);
-	if (err)
-		goto err_destroy;
 
-	/* Reset stats only on new qdisc */
-	if (alink->qdiscs[0].handle != opt->handle) {
-		err = nfp_abm_reset_stats(alink);
+	if (existing) {
+		if (alink->parent == TC_H_ROOT)
+			err = nfp_abm_ctrl_set_all_q_lvls(alink, opt->set.min);
+		else
+			err = nfp_abm_ctrl_set_q_lvl(alink, i, opt->set.min);
 		if (err)
 			goto err_destroy;
+		return 0;
 	}
 
-	alink->qdiscs[0].handle = opt->handle;
-	port->tc_offload_cnt = 1;
+	if (opt->parent == TC_H_ROOT) {
+		i = 0;
+		err = __nfp_abm_reset_root(netdev, alink, TC_H_ROOT, 1,
+					   opt->set.min);
+	} else if (TC_H_MAJ(alink->parent) == TC_H_MAJ(opt->parent)) {
+		i = TC_H_MIN(opt->parent) - 1;
+		err = nfp_abm_ctrl_set_q_lvl(alink, i, opt->set.min);
+	} else {
+		return -EINVAL;
+	}
+	/* Set the handle to try full clean up, in case IO failed */
+	alink->qdiscs[i].handle = opt->handle;
+	if (err)
+		goto err_destroy;
+
+	if (opt->parent == TC_H_ROOT)
+		err = nfp_abm_ctrl_read_stats(alink, &alink->qdiscs[i].stats);
+	else
+		err = nfp_abm_ctrl_read_q_stats(alink, i,
+						&alink->qdiscs[i].stats);
+	if (err)
+		goto err_destroy;
+
+	if (opt->parent == TC_H_ROOT)
+		err = nfp_abm_ctrl_read_xstats(alink,
+					       &alink->qdiscs[i].xstats);
+	else
+		err = nfp_abm_ctrl_read_q_xstats(alink, i,
+						 &alink->qdiscs[i].xstats);
+	if (err)
+		goto err_destroy;
+
+	alink->qdiscs[i].stats.backlog_pkts = 0;
+	alink->qdiscs[i].stats.backlog_bytes = 0;
 
 	return 0;
 err_destroy:
 	/* If the qdisc keeps on living, but we can't offload undo changes */
-	if (alink->qdiscs[0].handle == opt->handle) {
-		opt->set.qstats->qlen -= alink->qdiscs[0].stats.backlog_pkts;
+	if (existing) {
+		opt->set.qstats->qlen -= alink->qdiscs[i].stats.backlog_pkts;
 		opt->set.qstats->backlog -=
-			alink->qdiscs[0].stats.backlog_bytes;
+			alink->qdiscs[i].stats.backlog_bytes;
 	}
-	if (alink->qdiscs[0].handle != TC_H_UNSPEC)
-		nfp_abm_red_destroy(netdev, alink, alink->qdiscs[0].handle);
+	nfp_abm_red_destroy(netdev, alink, opt->handle);
+
 	return err;
 }
 
@@ -146,13 +212,17 @@ nfp_abm_red_stats(struct nfp_abm_link *alink, struct tc_red_qopt_offload *opt)
 {
 	struct nfp_alink_stats *prev_stats;
 	struct nfp_alink_stats stats;
-	int err;
+	int i, err;
 
-	if (alink->qdiscs[0].handle != opt->handle)
-		return -EOPNOTSUPP;
-	prev_stats = &alink->qdiscs[0].stats;
+	i = nfp_abm_red_find(alink, opt);
+	if (i < 0)
+		return i;
+	prev_stats = &alink->qdiscs[i].stats;
 
-	err = nfp_abm_ctrl_read_stats(alink, &stats);
+	if (alink->parent == TC_H_ROOT)
+		err = nfp_abm_ctrl_read_stats(alink, &stats);
+	else
+		err = nfp_abm_ctrl_read_q_stats(alink, i, &stats);
 	if (err)
 		return err;
 
@@ -168,13 +238,17 @@ nfp_abm_red_xstats(struct nfp_abm_link *alink, struct tc_red_qopt_offload *opt)
 {
 	struct nfp_alink_xstats *prev_xstats;
 	struct nfp_alink_xstats xstats;
-	int err;
+	int i, err;
 
-	if (alink->qdiscs[0].handle != opt->handle)
-		return -EOPNOTSUPP;
-	prev_xstats = &alink->qdiscs[0].xstats;
+	i = nfp_abm_red_find(alink, opt);
+	if (i < 0)
+		return i;
+	prev_xstats = &alink->qdiscs[i].xstats;
 
-	err = nfp_abm_ctrl_read_xstats(alink, &xstats);
+	if (alink->parent == TC_H_ROOT)
+		err = nfp_abm_ctrl_read_xstats(alink, &xstats);
+	else
+		err = nfp_abm_ctrl_read_q_xstats(alink, i, &xstats);
 	if (err)
 		return err;
 
@@ -190,9 +264,6 @@ static int
 nfp_abm_setup_tc_red(struct net_device *netdev, struct nfp_abm_link *alink,
 		     struct tc_red_qopt_offload *opt)
 {
-	if (opt->parent != TC_H_ROOT)
-		return -EOPNOTSUPP;
-
 	switch (opt->command) {
 	case TC_RED_REPLACE:
 		return nfp_abm_red_replace(netdev, alink, opt);
@@ -208,6 +279,24 @@ nfp_abm_setup_tc_red(struct net_device *netdev, struct nfp_abm_link *alink,
 	}
 }
 
+static int
+nfp_abm_setup_tc_mq(struct net_device *netdev, struct nfp_abm_link *alink,
+		    struct tc_mq_qopt_offload *opt)
+{
+	switch (opt->command) {
+	case TC_MQ_CREATE:
+		nfp_abm_reset_root(netdev, alink, opt->handle,
+				   alink->total_queues);
+		return 0;
+	case TC_MQ_DESTROY:
+		if (opt->handle == alink->parent)
+			nfp_abm_reset_root(netdev, alink, TC_H_ROOT, 0);
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
 static int
 nfp_abm_setup_tc(struct nfp_app *app, struct net_device *netdev,
 		 enum tc_setup_type type, void *type_data)
@@ -220,6 +309,8 @@ nfp_abm_setup_tc(struct nfp_app *app, struct net_device *netdev,
 		return -EOPNOTSUPP;
 
 	switch (type) {
+	case TC_SETUP_QDISC_MQ:
+		return nfp_abm_setup_tc_mq(netdev, repr->app_priv, type_data);
 	case TC_SETUP_QDISC_RED:
 		return nfp_abm_setup_tc_red(netdev, repr->app_priv, type_data);
 	default:
@@ -473,13 +564,21 @@ nfp_abm_vnic_alloc(struct nfp_app *app, struct nfp_net *nn, unsigned int id)
 	alink->abm = abm;
 	alink->vnic = nn;
 	alink->id = id;
+	alink->parent = TC_H_ROOT;
+	alink->total_queues = alink->vnic->max_rx_rings;
+	alink->qdiscs = kvzalloc(sizeof(*alink->qdiscs) * alink->total_queues,
+				 GFP_KERNEL);
+	if (!alink->qdiscs) {
+		err = -ENOMEM;
+		goto err_free_alink;
+	}
 
 	/* This is a multi-host app, make sure MAC/PHY is up, but don't
 	 * make the MAC/PHY state follow the state of any of the ports.
 	 */
 	err = nfp_eth_set_configured(app->cpp, eth_port->index, true);
 	if (err < 0)
-		goto err_free_alink;
+		goto err_free_qdiscs;
 
 	netif_keep_dst(nn->dp.netdev);
 
@@ -488,6 +587,8 @@ nfp_abm_vnic_alloc(struct nfp_app *app, struct nfp_net *nn, unsigned int id)
 
 	return 0;
 
+err_free_qdiscs:
+	kvfree(alink->qdiscs);
 err_free_alink:
 	kfree(alink);
 	return err;
@@ -498,6 +599,7 @@ static void nfp_abm_vnic_free(struct nfp_app *app, struct nfp_net *nn)
 	struct nfp_abm_link *alink = nn->app_priv;
 
 	nfp_abm_kill_reprs(alink->abm, alink);
+	kvfree(alink->qdiscs);
 	kfree(alink);
 }
 
diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.h b/drivers/net/ethernet/netronome/nfp/abm/main.h
index 09fd15847961..934a70835473 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.h
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.h
@@ -106,6 +106,9 @@ struct nfp_red_qdisc {
  * @vnic:	data vNIC
  * @id:		id of the data vNIC
  * @queue_base:	id of base to host queue within PCIe (not QC idx)
+ * @total_queues:	number of PF queues
+ * @parent:	handle of expected parent, i.e. handle of MQ, or TC_H_ROOT
+ * @num_qdiscs:	number of currently used qdiscs
  * @qdiscs:	array of qdiscs
  */
 struct nfp_abm_link {
@@ -113,16 +116,25 @@ struct nfp_abm_link {
 	struct nfp_net *vnic;
 	unsigned int id;
 	unsigned int queue_base;
-	struct nfp_red_qdisc qdiscs[1];
+	unsigned int total_queues;
+	u32 parent;
+	unsigned int num_qdiscs;
+	struct nfp_red_qdisc *qdiscs;
 };
 
 void nfp_abm_ctrl_read_params(struct nfp_abm_link *alink);
 int nfp_abm_ctrl_find_addrs(struct nfp_abm *abm);
 int nfp_abm_ctrl_set_all_q_lvls(struct nfp_abm_link *alink, u32 val);
+int nfp_abm_ctrl_set_q_lvl(struct nfp_abm_link *alink, unsigned int i,
+			   u32 val);
 int nfp_abm_ctrl_read_stats(struct nfp_abm_link *alink,
 			    struct nfp_alink_stats *stats);
+int nfp_abm_ctrl_read_q_stats(struct nfp_abm_link *alink, unsigned int i,
+			      struct nfp_alink_stats *stats);
 int nfp_abm_ctrl_read_xstats(struct nfp_abm_link *alink,
 			     struct nfp_alink_xstats *xstats);
+int nfp_abm_ctrl_read_q_xstats(struct nfp_abm_link *alink, unsigned int i,
+			       struct nfp_alink_xstats *xstats);
 u64 nfp_abm_ctrl_stat_non_sto(struct nfp_abm_link *alink, unsigned int i);
 u64 nfp_abm_ctrl_stat_sto(struct nfp_abm_link *alink, unsigned int i);
 int nfp_abm_ctrl_qm_enable(struct nfp_abm *abm);
-- 
cgit v1.2.3-59-g8ed1b


From 47c669a406d8621c69b1c199ce099b54b17b9902 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 25 May 2018 21:53:37 -0700
Subject: net: sched: mq: request stats from offloads

MQ doesn't hold any statistics on its own, however, statistic
from offloads are requested starting from the root, hence MQ
will read the old values for its sums.  Call into the drivers,
because of the additive nature of the stats drivers are aware
of how much "pending updates" they have to children of the MQ.
Since MQ reset its stats on every dump we can simply offset
the stats, predicting how stats of offloaded children will
change.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h |  2 ++
 net/sched/sch_mq.c    | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 942f839dbca4..a3c1a2c47cd4 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -781,11 +781,13 @@ struct tc_qopt_offload_stats {
 enum tc_mq_command {
 	TC_MQ_CREATE,
 	TC_MQ_DESTROY,
+	TC_MQ_STATS,
 };
 
 struct tc_mq_qopt_offload {
 	enum tc_mq_command command;
 	u32 handle;
+	struct tc_qopt_offload_stats stats;
 };
 
 enum tc_red_command {
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 6ccf6daa2503..d6b8ae4ed7a3 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -38,6 +38,22 @@ static int mq_offload(struct Qdisc *sch, enum tc_mq_command cmd)
 	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, &opt);
 }
 
+static void mq_offload_stats(struct Qdisc *sch)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct tc_mq_qopt_offload opt = {
+		.command = TC_MQ_STATS,
+		.handle = sch->handle,
+		.stats = {
+			.bstats = &sch->bstats,
+			.qstats = &sch->qstats,
+		},
+	};
+
+	if (tc_can_offload(dev) && dev->netdev_ops->ndo_setup_tc)
+		dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, &opt);
+}
+
 static void mq_destroy(struct Qdisc *sch)
 {
 	struct net_device *dev = qdisc_dev(sch);
@@ -146,6 +162,7 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
 			sch->q.qlen		+= qdisc->q.qlen;
 			sch->bstats.bytes	+= qdisc->bstats.bytes;
 			sch->bstats.packets	+= qdisc->bstats.packets;
+			sch->qstats.qlen	+= qdisc->qstats.qlen;
 			sch->qstats.backlog	+= qdisc->qstats.backlog;
 			sch->qstats.drops	+= qdisc->qstats.drops;
 			sch->qstats.requeues	+= qdisc->qstats.requeues;
@@ -154,6 +171,7 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
 
 		spin_unlock_bh(qdisc_lock(qdisc));
 	}
+	mq_offload_stats(sch);
 
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 2440711e49b6144ef74a51e29e4e876ac5f0d066 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 25 May 2018 21:53:38 -0700
Subject: nfp: abm: report correct MQ stats

Report the stat diff to make sure MQ stats add up to child stats.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/abm/main.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.c b/drivers/net/ethernet/netronome/nfp/abm/main.c
index 21d5af1fb061..1561c2724c26 100644
--- a/drivers/net/ethernet/netronome/nfp/abm/main.c
+++ b/drivers/net/ethernet/netronome/nfp/abm/main.c
@@ -279,6 +279,28 @@ nfp_abm_setup_tc_red(struct net_device *netdev, struct nfp_abm_link *alink,
 	}
 }
 
+static int
+nfp_abm_mq_stats(struct nfp_abm_link *alink, struct tc_mq_qopt_offload *opt)
+{
+	struct nfp_alink_stats stats;
+	unsigned int i;
+	int err;
+
+	for (i = 0; i < alink->num_qdiscs; i++) {
+		if (alink->qdiscs[i].handle == TC_H_UNSPEC)
+			continue;
+
+		err = nfp_abm_ctrl_read_q_stats(alink, i, &stats);
+		if (err)
+			return err;
+
+		nfp_abm_update_stats(&stats, &alink->qdiscs[i].stats,
+				     &opt->stats);
+	}
+
+	return 0;
+}
+
 static int
 nfp_abm_setup_tc_mq(struct net_device *netdev, struct nfp_abm_link *alink,
 		    struct tc_mq_qopt_offload *opt)
@@ -292,6 +314,8 @@ nfp_abm_setup_tc_mq(struct net_device *netdev, struct nfp_abm_link *alink,
 		if (opt->handle == alink->parent)
 			nfp_abm_reset_root(netdev, alink, TC_H_ROOT, 0);
 		return 0;
+	case TC_MQ_STATS:
+		return nfp_abm_mq_stats(alink, opt);
 	default:
 		return -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 77ab8d5d2950cd0e18ba943336c1172b46e9f53e Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Sat, 26 May 2018 09:47:26 +0000
Subject: net: bpfilter: make function bpfilter_mbox_request() static

Fixes the following sparse warnings:

net/ipv4/bpfilter/sockopt.c:13:5: warning:
 symbol 'bpfilter_mbox_request' was not declared. Should it be static?

Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/bpfilter/sockopt.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
index 42a96d2d8d05..5e04ed25bc0e 100644
--- a/net/ipv4/bpfilter/sockopt.c
+++ b/net/ipv4/bpfilter/sockopt.c
@@ -10,8 +10,9 @@ int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
 				unsigned int optlen, bool is_set);
 EXPORT_SYMBOL_GPL(bpfilter_process_sockopt);
 
-int bpfilter_mbox_request(struct sock *sk, int optname, char __user *optval,
-			  unsigned int optlen, bool is_set)
+static int bpfilter_mbox_request(struct sock *sk, int optname,
+				 char __user *optval,
+				 unsigned int optlen, bool is_set)
 {
 	if (!bpfilter_process_sockopt) {
 		int err = request_module("bpfilter");
-- 
cgit v1.2.3-59-g8ed1b


From c1c9a3c9663b2e15176758626278792862f1ed32 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Sat, 26 May 2018 19:15:48 +0800
Subject: net: remove unnecessary genlmsg_cancel() calls

the message be freed immediately, no need to trim it
back to the previous size.

Inspired by commit 7a9b3ec1e19f ("nl80211: remove unnecessary genlmsg_cancel() calls")

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/team/team.c               |  2 --
 drivers/net/wireless/mac80211_hwsim.c |  1 -
 net/core/devlink.c                    |  4 ----
 net/ipv6/seg6.c                       |  1 -
 net/ncsi/ncsi-netlink.c               |  1 -
 net/nfc/netlink.c                     | 17 -----------------
 6 files changed, 26 deletions(-)

diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index e6730a01d130..267dcc929f6c 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -2426,7 +2426,6 @@ send_done:
 nla_put_failure:
 	err = -EMSGSIZE;
 errout:
-	genlmsg_cancel(skb, hdr);
 	nlmsg_free(skb);
 	return err;
 }
@@ -2720,7 +2719,6 @@ send_done:
 nla_put_failure:
 	err = -EMSGSIZE;
 errout:
-	genlmsg_cancel(skb, hdr);
 	nlmsg_free(skb);
 	return err;
 }
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 89fc22520d40..9825bfd42abc 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -2514,7 +2514,6 @@ static void hwsim_mcast_new_radio(int id, struct genl_info *info,
 	return;
 
 out_err:
-	genlmsg_cancel(mcast_skb, data);
 	nlmsg_free(mcast_skb);
 }
 
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 475246b355f0..f75ee022e6b2 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -1826,7 +1826,6 @@ send_done:
 nla_put_failure:
 	err = -EMSGSIZE;
 err_table_put:
-	genlmsg_cancel(skb, hdr);
 	nlmsg_free(skb);
 	return err;
 }
@@ -2032,7 +2031,6 @@ int devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx)
 	return 0;
 
 nla_put_failure:
-	genlmsg_cancel(dump_ctx->skb, dump_ctx->hdr);
 	nlmsg_free(dump_ctx->skb);
 	return -EMSGSIZE;
 }
@@ -2249,7 +2247,6 @@ send_done:
 nla_put_failure:
 	err = -EMSGSIZE;
 err_table_put:
-	genlmsg_cancel(skb, hdr);
 	nlmsg_free(skb);
 	return err;
 }
@@ -2551,7 +2548,6 @@ nla_put_failure:
 	err = -EMSGSIZE;
 err_resource_put:
 err_skb_send_alloc:
-	genlmsg_cancel(skb, hdr);
 	nlmsg_free(skb);
 	return err;
 }
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
index 7f5621d09571..0fdf2a55e746 100644
--- a/net/ipv6/seg6.c
+++ b/net/ipv6/seg6.c
@@ -226,7 +226,6 @@ static int seg6_genl_get_tunsrc(struct sk_buff *skb, struct genl_info *info)
 
 nla_put_failure:
 	rcu_read_unlock();
-	genlmsg_cancel(msg, hdr);
 free_msg:
 	nlmsg_free(msg);
 	return -ENOMEM;
diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c
index b09ef77bf4cd..99f4c22e2c8f 100644
--- a/net/ncsi/ncsi-netlink.c
+++ b/net/ncsi/ncsi-netlink.c
@@ -201,7 +201,6 @@ static int ncsi_pkg_info_nl(struct sk_buff *msg, struct genl_info *info)
 	return genlmsg_reply(skb, info);
 
 err:
-	genlmsg_cancel(skb, hdr);
 	kfree_skb(skb);
 	return rc;
 }
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index f018eafc2a0d..376181cc1def 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -206,7 +206,6 @@ int nfc_genl_targets_found(struct nfc_dev *dev)
 	return genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC);
 
 nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 free_msg:
 	nlmsg_free(msg);
 	return -EMSGSIZE;
@@ -237,7 +236,6 @@ int nfc_genl_target_lost(struct nfc_dev *dev, u32 target_idx)
 	return 0;
 
 nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 free_msg:
 	nlmsg_free(msg);
 	return -EMSGSIZE;
@@ -269,7 +267,6 @@ int nfc_genl_tm_activated(struct nfc_dev *dev, u32 protocol)
 	return 0;
 
 nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 free_msg:
 	nlmsg_free(msg);
 	return -EMSGSIZE;
@@ -299,7 +296,6 @@ int nfc_genl_tm_deactivated(struct nfc_dev *dev)
 	return 0;
 
 nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 free_msg:
 	nlmsg_free(msg);
 	return -EMSGSIZE;
@@ -340,7 +336,6 @@ int nfc_genl_device_added(struct nfc_dev *dev)
 	return 0;
 
 nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 free_msg:
 	nlmsg_free(msg);
 	return -EMSGSIZE;
@@ -370,7 +365,6 @@ int nfc_genl_device_removed(struct nfc_dev *dev)
 	return 0;
 
 nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 free_msg:
 	nlmsg_free(msg);
 	return -EMSGSIZE;
@@ -434,8 +428,6 @@ int nfc_genl_llc_send_sdres(struct nfc_dev *dev, struct hlist_head *sdres_list)
 	return genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC);
 
 nla_put_failure:
-	genlmsg_cancel(msg, hdr);
-
 free_msg:
 	nlmsg_free(msg);
 
@@ -470,7 +462,6 @@ int nfc_genl_se_added(struct nfc_dev *dev, u32 se_idx, u16 type)
 	return 0;
 
 nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 free_msg:
 	nlmsg_free(msg);
 	return -EMSGSIZE;
@@ -501,7 +492,6 @@ int nfc_genl_se_removed(struct nfc_dev *dev, u32 se_idx)
 	return 0;
 
 nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 free_msg:
 	nlmsg_free(msg);
 	return -EMSGSIZE;
@@ -546,7 +536,6 @@ int nfc_genl_se_transaction(struct nfc_dev *dev, u8 se_idx,
 	return 0;
 
 nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 free_msg:
 	/* evt_transaction is no more used */
 	devm_kfree(&dev->dev, evt_transaction);
@@ -585,7 +574,6 @@ int nfc_genl_se_connectivity(struct nfc_dev *dev, u8 se_idx)
 	return 0;
 
 nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 free_msg:
 	nlmsg_free(msg);
 	return -EMSGSIZE;
@@ -703,7 +691,6 @@ int nfc_genl_dep_link_up_event(struct nfc_dev *dev, u32 target_idx,
 	return 0;
 
 nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 free_msg:
 	nlmsg_free(msg);
 	return -EMSGSIZE;
@@ -735,7 +722,6 @@ int nfc_genl_dep_link_down_event(struct nfc_dev *dev)
 	return 0;
 
 nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 free_msg:
 	nlmsg_free(msg);
 	return -EMSGSIZE;
@@ -1030,7 +1016,6 @@ static int nfc_genl_send_params(struct sk_buff *msg,
 	return 0;
 
 nla_put_failure:
-
 	genlmsg_cancel(msg, hdr);
 	return -EMSGSIZE;
 }
@@ -1290,7 +1275,6 @@ int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name,
 	return 0;
 
 nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 free_msg:
 	nlmsg_free(msg);
 	return -EMSGSIZE;
@@ -1507,7 +1491,6 @@ static void se_io_cb(void *context, u8 *apdu, size_t apdu_len, int err)
 	return;
 
 nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 free_msg:
 	nlmsg_free(msg);
 	kfree(ctx);
-- 
cgit v1.2.3-59-g8ed1b


From 66d9975c5a7c40aa7e4bb0ec0b0c37ba1f190923 Mon Sep 17 00:00:00 2001
From: Jian-Hong Pan <jian-hong@endlessm.com>
Date: Mon, 21 May 2018 18:09:20 +0800
Subject: Bluetooth: btusb: Add a new Realtek 8723DE ID 2ff8:b011

Without this patch we cannot turn on the Bluethooth adapter on ASUS
E406MA.

T:  Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#=  2 Spd=12   MxCh= 0
D:  Ver= 1.10 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs=  1
P:  Vendor=2ff8 ProdID=b011 Rev= 2.00
S:  Manufacturer=Realtek
S:  Product=802.11n WLAN Adapter
S:  SerialNumber=00e04c000001
C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=500mA
I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=81(I) Atr=03(Int.) MxPS=  16 Ivl=1ms
E:  Ad=02(O) Atr=02(Bulk) MxPS=  64 Ivl=0ms
E:  Ad=82(I) Atr=02(Bulk) MxPS=  64 Ivl=0ms
I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=   0 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=   0 Ivl=1ms
I:  If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=   9 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=   9 Ivl=1ms
I:  If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  17 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  17 Ivl=1ms
I:  If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  25 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  25 Ivl=1ms
I:  If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  33 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  33 Ivl=1ms
I:  If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  49 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  49 Ivl=1ms

Signed-off-by: Jian-Hong Pan <jian-hong@endlessm.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/btusb.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index 91882f54c7bd..292b1ea066ca 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -373,6 +373,9 @@ static const struct usb_device_id blacklist_table[] = {
 	/* Additional Realtek 8723BU Bluetooth devices */
 	{ USB_DEVICE(0x7392, 0xa611), .driver_info = BTUSB_REALTEK },
 
+	/* Additional Realtek 8723DE Bluetooth devices */
+	{ USB_DEVICE(0x2ff8, 0xb011), .driver_info = BTUSB_REALTEK },
+
 	/* Additional Realtek 8821AE Bluetooth devices */
 	{ USB_DEVICE(0x0b05, 0x17dc), .driver_info = BTUSB_REALTEK },
 	{ USB_DEVICE(0x13d3, 0x3414), .driver_info = BTUSB_REALTEK },
-- 
cgit v1.2.3-59-g8ed1b


From 803cdb8ce584198cd45825822910cac7de6378cb Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Mon, 21 May 2018 22:34:52 +0200
Subject: Bluetooth: btusb: Apply QCA Rome patches for some ATH3012 models

In commit f44cb4b19ed4 ("Bluetooth: btusb: Fix quirk for Atheros
1525/QCA6174") we tried to address the non-working Atheros BT devices
by changing the quirk from BTUSB_ATH3012 to BTUSB_QCA_ROME.  This made
such devices working while it turned out to break other existing chips
with the very same USB ID, hence it was reverted afterwards.

This is another attempt to tackle the issue.  The essential point to
use BTUSB_QCA_ROME is to apply the btusb_setup_qca() and do RAM-
patching.  And the previous attempt failed because btusb_setup_qca()
returns -ENODEV if the ROM version doesn't match with the expected
ones.  For some devices that have already the "correct" ROM versions,
we may just skip the setup procedure and continue the rest.

So, the first fix we'll need is to add a check of the ROM version in
the function to skip the setup if the ROM version looks already sane,
so that it can be applied for all ath devices.

However, the world is a bit more complex than that simple solution.
Since BTUSB_ATH3012 quirk checks the bcdDevice and bails out when it's
0x0001 at the beginning of probing, so the device probe always aborts
here.

In this patch, we add another check of ROM version again, and if the
device needs patching, the probe continues.  For that, a slight
refactoring of btusb_qca_send_vendor_req() was required so that the
probe function can pass usb_device pointer directly before allocating
hci_dev stuff.

Fixes: commit f44cb4b19ed4 ("Bluetooth: btusb: Fix quirk for Atheros 1525/QCA6174")
Bugzilla: http://bugzilla.opensuse.org/show_bug.cgi?id=1082504
Tested-by: Ivan Levshin <ivan.levshin@microfocus.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/btusb.c | 32 +++++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index 292b1ea066ca..84de6244ae66 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -2502,11 +2502,9 @@ static const struct qca_device_info qca_devices_table[] = {
 	{ 0x00000302, 28, 4, 18 }, /* Rome 3.2 */
 };
 
-static int btusb_qca_send_vendor_req(struct hci_dev *hdev, u8 request,
+static int btusb_qca_send_vendor_req(struct usb_device *udev, u8 request,
 				     void *data, u16 size)
 {
-	struct btusb_data *btdata = hci_get_drvdata(hdev);
-	struct usb_device *udev = btdata->udev;
 	int pipe, err;
 	u8 *buf;
 
@@ -2521,7 +2519,7 @@ static int btusb_qca_send_vendor_req(struct hci_dev *hdev, u8 request,
 	err = usb_control_msg(udev, pipe, request, USB_TYPE_VENDOR | USB_DIR_IN,
 			      0, 0, buf, size, USB_CTRL_SET_TIMEOUT);
 	if (err < 0) {
-		bt_dev_err(hdev, "Failed to access otp area (%d)", err);
+		dev_err(&udev->dev, "Failed to access otp area (%d)", err);
 		goto done;
 	}
 
@@ -2671,20 +2669,38 @@ static int btusb_setup_qca_load_nvm(struct hci_dev *hdev,
 	return err;
 }
 
+/* identify the ROM version and check whether patches are needed */
+static bool btusb_qca_need_patch(struct usb_device *udev)
+{
+	struct qca_version ver;
+
+	if (btusb_qca_send_vendor_req(udev, QCA_GET_TARGET_VERSION, &ver,
+				      sizeof(ver)) < 0)
+		return false;
+	/* only low ROM versions need patches */
+	return !(le32_to_cpu(ver.rom_version) & ~0xffffU);
+}
+
 static int btusb_setup_qca(struct hci_dev *hdev)
 {
+	struct btusb_data *btdata = hci_get_drvdata(hdev);
+	struct usb_device *udev = btdata->udev;
 	const struct qca_device_info *info = NULL;
 	struct qca_version ver;
 	u32 ver_rom;
 	u8 status;
 	int i, err;
 
-	err = btusb_qca_send_vendor_req(hdev, QCA_GET_TARGET_VERSION, &ver,
+	err = btusb_qca_send_vendor_req(udev, QCA_GET_TARGET_VERSION, &ver,
 					sizeof(ver));
 	if (err < 0)
 		return err;
 
 	ver_rom = le32_to_cpu(ver.rom_version);
+	/* Don't care about high ROM versions */
+	if (ver_rom & ~0xffffU)
+		return 0;
+
 	for (i = 0; i < ARRAY_SIZE(qca_devices_table); i++) {
 		if (ver_rom == qca_devices_table[i].rom_version)
 			info = &qca_devices_table[i];
@@ -2694,7 +2710,7 @@ static int btusb_setup_qca(struct hci_dev *hdev)
 		return -ENODEV;
 	}
 
-	err = btusb_qca_send_vendor_req(hdev, QCA_CHECK_STATUS, &status,
+	err = btusb_qca_send_vendor_req(udev, QCA_CHECK_STATUS, &status,
 					sizeof(status));
 	if (err < 0)
 		return err;
@@ -2908,7 +2924,8 @@ static int btusb_probe(struct usb_interface *intf,
 		/* Old firmware would otherwise let ath3k driver load
 		 * patch and sysconfig files
 		 */
-		if (le16_to_cpu(udev->descriptor.bcdDevice) <= 0x0001)
+		if (le16_to_cpu(udev->descriptor.bcdDevice) <= 0x0001 &&
+		    !btusb_qca_need_patch(udev))
 			return -ENODEV;
 	}
 
@@ -3070,6 +3087,7 @@ static int btusb_probe(struct usb_interface *intf,
 	}
 
 	if (id->driver_info & BTUSB_ATH3012) {
+		data->setup_on_usb = btusb_setup_qca;
 		hdev->set_bdaddr = btusb_set_bdaddr_ath3012;
 		set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks);
 		set_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks);
-- 
cgit v1.2.3-59-g8ed1b


From 45650499ee1844c0a6cc1ff143ea02b359fdac9b Mon Sep 17 00:00:00 2001
From: Arend van Spriel <arend.vanspriel@broadcom.com>
Date: Fri, 18 May 2018 00:15:12 +0200
Subject: Bluetooth: btmrvl: support sysfs initiated firmware coredump

Since commit 3c47d19ff4dc ("drivers: base: add coredump driver ops")
it is possible to initiate a device coredump from user-space. This
patch adds support for it in btmrvl_sdio adding the .coredump()
driver callback. This makes dump through debugfs obsolete so removing
it.

Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/btmrvl_debugfs.c | 31 -------------------------------
 drivers/bluetooth/btmrvl_drv.h     |  2 --
 drivers/bluetooth/btmrvl_main.c    |  6 ------
 drivers/bluetooth/btmrvl_sdio.c    | 11 ++++++++---
 4 files changed, 8 insertions(+), 42 deletions(-)

diff --git a/drivers/bluetooth/btmrvl_debugfs.c b/drivers/bluetooth/btmrvl_debugfs.c
index 1828ed8cae7a..023d35e3c7a7 100644
--- a/drivers/bluetooth/btmrvl_debugfs.c
+++ b/drivers/bluetooth/btmrvl_debugfs.c
@@ -167,35 +167,6 @@ static const struct file_operations btmrvl_hscmd_fops = {
 	.llseek = default_llseek,
 };
 
-static ssize_t btmrvl_fwdump_write(struct file *file, const char __user *ubuf,
-				   size_t count, loff_t *ppos)
-{
-	struct btmrvl_private *priv = file->private_data;
-	char buf[16];
-	bool result;
-
-	memset(buf, 0, sizeof(buf));
-
-	if (copy_from_user(&buf, ubuf, min_t(size_t, sizeof(buf) - 1, count)))
-		return -EFAULT;
-
-	if (strtobool(buf, &result))
-		return -EINVAL;
-
-	if (!result)
-		return -EINVAL;
-
-	btmrvl_firmware_dump(priv);
-
-	return count;
-}
-
-static const struct file_operations btmrvl_fwdump_fops = {
-	.write	= btmrvl_fwdump_write,
-	.open	= simple_open,
-	.llseek = default_llseek,
-};
-
 void btmrvl_debugfs_init(struct hci_dev *hdev)
 {
 	struct btmrvl_private *priv = hci_get_drvdata(hdev);
@@ -226,8 +197,6 @@ void btmrvl_debugfs_init(struct hci_dev *hdev)
 			    priv, &btmrvl_hscmd_fops);
 	debugfs_create_file("hscfgcmd", 0644, dbg->config_dir,
 			    priv, &btmrvl_hscfgcmd_fops);
-	debugfs_create_file("fw_dump", 0200, dbg->config_dir,
-			    priv, &btmrvl_fwdump_fops);
 
 	dbg->status_dir = debugfs_create_dir("status", hdev->debugfs);
 	debugfs_create_u8("curpsmode", 0444, dbg->status_dir,
diff --git a/drivers/bluetooth/btmrvl_drv.h b/drivers/bluetooth/btmrvl_drv.h
index fc3caf4541ba..f0454541e5fd 100644
--- a/drivers/bluetooth/btmrvl_drv.h
+++ b/drivers/bluetooth/btmrvl_drv.h
@@ -110,7 +110,6 @@ struct btmrvl_private {
 				u8 *payload, u16 nb);
 	int (*hw_wakeup_firmware)(struct btmrvl_private *priv);
 	int (*hw_process_int_status)(struct btmrvl_private *priv);
-	void (*firmware_dump)(struct btmrvl_private *priv);
 	spinlock_t driver_lock;		/* spinlock used by driver */
 #ifdef CONFIG_DEBUG_FS
 	void *debugfs_data;
@@ -183,7 +182,6 @@ int btmrvl_send_hscfg_cmd(struct btmrvl_private *priv);
 int btmrvl_enable_ps(struct btmrvl_private *priv);
 int btmrvl_prepare_command(struct btmrvl_private *priv);
 int btmrvl_enable_hs(struct btmrvl_private *priv);
-void btmrvl_firmware_dump(struct btmrvl_private *priv);
 
 #ifdef CONFIG_DEBUG_FS
 void btmrvl_debugfs_init(struct hci_dev *hdev);
diff --git a/drivers/bluetooth/btmrvl_main.c b/drivers/bluetooth/btmrvl_main.c
index f6c694a1b9b0..708ad21683eb 100644
--- a/drivers/bluetooth/btmrvl_main.c
+++ b/drivers/bluetooth/btmrvl_main.c
@@ -358,12 +358,6 @@ int btmrvl_prepare_command(struct btmrvl_private *priv)
 	return ret;
 }
 
-void btmrvl_firmware_dump(struct btmrvl_private *priv)
-{
-	if (priv->firmware_dump)
-		priv->firmware_dump(priv);
-}
-
 static int btmrvl_tx_pkt(struct btmrvl_private *priv, struct sk_buff *skb)
 {
 	int ret = 0;
diff --git a/drivers/bluetooth/btmrvl_sdio.c b/drivers/bluetooth/btmrvl_sdio.c
index 6f99b9f3d57f..888bac49a87b 100644
--- a/drivers/bluetooth/btmrvl_sdio.c
+++ b/drivers/bluetooth/btmrvl_sdio.c
@@ -1311,9 +1311,11 @@ rdwr_status btmrvl_sdio_rdwr_firmware(struct btmrvl_private *priv,
 }
 
 /* This function dump sdio register and memory data */
-static void btmrvl_sdio_dump_firmware(struct btmrvl_private *priv)
+static void btmrvl_sdio_coredump(struct device *dev)
 {
-	struct btmrvl_sdio_card *card = priv->btmrvl_dev.card;
+	struct sdio_func *func = dev_to_sdio_func(dev);
+	struct btmrvl_sdio_card *card;
+	struct btmrvl_private *priv;
 	int ret = 0;
 	unsigned int reg, reg_start, reg_end;
 	enum rdwr_status stat;
@@ -1321,6 +1323,9 @@ static void btmrvl_sdio_dump_firmware(struct btmrvl_private *priv)
 	u8 dump_num = 0, idx, i, read_reg, doneflag = 0;
 	u32 memory_size, fw_dump_len = 0;
 
+	card = sdio_get_drvdata(func);
+	priv = card->priv;
+
 	/* dump sdio register first */
 	btmrvl_sdio_dump_regs(priv);
 
@@ -1547,7 +1552,6 @@ static int btmrvl_sdio_probe(struct sdio_func *func,
 	priv->hw_host_to_card = btmrvl_sdio_host_to_card;
 	priv->hw_wakeup_firmware = btmrvl_sdio_wakeup_fw;
 	priv->hw_process_int_status = btmrvl_sdio_process_int_status;
-	priv->firmware_dump = btmrvl_sdio_dump_firmware;
 
 	if (btmrvl_register_hdev(priv)) {
 		BT_ERR("Register hdev failed!");
@@ -1717,6 +1721,7 @@ static struct sdio_driver bt_mrvl_sdio = {
 	.remove		= btmrvl_sdio_remove,
 	.drv = {
 		.owner = THIS_MODULE,
+		.coredump = btmrvl_sdio_coredump,
 		.pm = &btmrvl_sdio_pm_ops,
 	}
 };
-- 
cgit v1.2.3-59-g8ed1b


From 0c0c09ff09bb094bcd496b28b8ac4c8ae5b7a4a8 Mon Sep 17 00:00:00 2001
From: Vaibhav Murkute <vaibhavmurkute88@gmail.com>
Date: Mon, 14 May 2018 14:38:47 -0700
Subject: Bluetooth: hci_serdev: Removed unnecessary curly braces

checkpatch.pl shows a warning for these unnecessary curly braces.
so just removed those curly braces.

Signed-off-by: Vaibhav Murkute <vaibhavmurkute88@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/hci_serdev.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/bluetooth/hci_serdev.c b/drivers/bluetooth/hci_serdev.c
index e0e6461b9200..a0e939856568 100644
--- a/drivers/bluetooth/hci_serdev.c
+++ b/drivers/bluetooth/hci_serdev.c
@@ -204,9 +204,8 @@ static int hci_uart_setup(struct hci_dev *hdev)
 		return 0;
 	}
 
-	if (skb->len != sizeof(*ver)) {
+	if (skb->len != sizeof(*ver))
 		bt_dev_err(hdev, "Event length mismatch for version info");
-	}
 
 	kfree_skb(skb);
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From 939bc6c59e38fc8c9193203c62271f81ff1662d9 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Tue, 22 May 2018 09:34:10 +0200
Subject: Bluetooth: btusb: Add Dell Inspiron 5565 to
 btusb_needs_reset_resume_table

The Dell Inspiron 5565 uses a QCA Rome chip which needs to be reset
(and have its firmware reloaded) for bluetooth to work after
suspend/resume.

BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=15750392
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/btusb.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index 84de6244ae66..3a477b6b3ce6 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -411,6 +411,13 @@ static const struct dmi_system_id btusb_needs_reset_resume_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "XPS 13 9360"),
 		},
 	},
+	{
+		/* Dell Inspiron 5565 (QCA ROME device 0cf3:e009) */
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 5565"),
+		},
+	},
 	{}
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From c3cd281e2bc8da18f225f1191e9751d771411933 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 25 May 2018 23:36:06 +0200
Subject: net/mlx5e: fix TLS dependency

With CONFIG_TLS=m and MLX5_CORE_EN=y, we get a link failure:

drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.o: In function `mlx5e_tls_handle_ooo':
tls_rxtx.c:(.text+0x24c): undefined reference to `tls_get_record'
drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.o: In function `mlx5e_tls_handle_tx_skb':
tls_rxtx.c:(.text+0x9a8): undefined reference to `tls_device_sk_destruct'

This narrows down the dependency to only allow the configurations
that will actually work. The existing dependency on TLS_DEVICE is
not sufficient here since MLX5_EN_TLS is a 'bool' symbol.

Fixes: c83294b9efa5 ("net/mlx5e: TLS, Add Innova TLS TX support")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Acked-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
index ee6684779d11..2545296a0c08 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -91,6 +91,7 @@ config MLX5_EN_TLS
 	bool "TLS cryptography-offload accelaration"
 	depends on MLX5_CORE_EN
 	depends on TLS_DEVICE
+	depends on TLS=y || MLX5_CORE=m
 	depends on MLX5_ACCEL
 	default n
 	---help---
-- 
cgit v1.2.3-59-g8ed1b


From d377df784178bf5b0a39e75dc8b1ee86e1abb3f6 Mon Sep 17 00:00:00 2001
From: Timur Tabi <timur@codeaurora.org>
Date: Sat, 26 May 2018 20:29:14 -0500
Subject: net: qcom/emac: fix device tree initialization

Commit "net: qcom/emac: Encapsulate sgmii ops under one structure"
introduced the sgmii_ops structure, but did not correctly initialize
it on device tree platforms.  This resulted in compiler warnings when
ACPI is not enabled.

Reported-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Timur Tabi <timur@codeaurora.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qualcomm/emac/emac-sgmii.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c b/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c
index 562420b834df..e78e5db39458 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c
@@ -273,6 +273,14 @@ static int emac_sgmii_common_link_change(struct emac_adapter *adpt, bool linkup)
 	return 0;
 }
 
+static struct sgmii_ops fsm9900_ops = {
+	.init = emac_sgmii_init_fsm9900,
+	.open = emac_sgmii_common_open,
+	.close = emac_sgmii_common_close,
+	.link_change = emac_sgmii_common_link_change,
+	.reset = emac_sgmii_common_reset,
+};
+
 static struct sgmii_ops qdf2432_ops = {
 	.init = emac_sgmii_init_qdf2432,
 	.open = emac_sgmii_common_open,
@@ -281,6 +289,7 @@ static struct sgmii_ops qdf2432_ops = {
 	.reset = emac_sgmii_common_reset,
 };
 
+#ifdef CONFIG_ACPI
 static struct sgmii_ops qdf2400_ops = {
 	.init = emac_sgmii_init_qdf2400,
 	.open = emac_sgmii_common_open,
@@ -288,6 +297,7 @@ static struct sgmii_ops qdf2400_ops = {
 	.link_change = emac_sgmii_common_link_change,
 	.reset = emac_sgmii_common_reset,
 };
+#endif
 
 static int emac_sgmii_acpi_match(struct device *dev, void *data)
 {
@@ -335,11 +345,11 @@ static int emac_sgmii_acpi_match(struct device *dev, void *data)
 static const struct of_device_id emac_sgmii_dt_match[] = {
 	{
 		.compatible = "qcom,fsm9900-emac-sgmii",
-		.data = emac_sgmii_init_fsm9900,
+		.data = &fsm9900_ops,
 	},
 	{
 		.compatible = "qcom,qdf2432-emac-sgmii",
-		.data = emac_sgmii_init_qdf2432,
+		.data = &qdf2432_ops,
 	},
 	{}
 };
@@ -386,7 +396,7 @@ int emac_sgmii_config(struct platform_device *pdev, struct emac_adapter *adpt)
 			goto error_put_device;
 		}
 
-		phy->sgmii_ops->init = match->data;
+		phy->sgmii_ops = (struct sgmii_ops *)match->data;
 	}
 
 	/* Base address is the first address */
-- 
cgit v1.2.3-59-g8ed1b


From 12b003b2e4e47f9f395ffd29719370023b4f0ba8 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Sun, 27 May 2018 09:56:13 +0300
Subject: mlxsw: reg: Add Management Reset and Shutdown Register

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/reg.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h
index 3f4d7e22cece..1877d9f8a11a 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/reg.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h
@@ -7034,6 +7034,30 @@ static inline void mlxsw_reg_mpar_pack(char *payload, u8 local_port,
 	mlxsw_reg_mpar_pa_id_set(payload, pa_id);
 }
 
+/* MRSR - Management Reset and Shutdown Register
+ * ---------------------------------------------
+ * MRSR register is used to reset or shutdown the switch or
+ * the entire system (when applicable).
+ */
+#define MLXSW_REG_MRSR_ID 0x9023
+#define MLXSW_REG_MRSR_LEN 0x08
+
+MLXSW_REG_DEFINE(mrsr, MLXSW_REG_MRSR_ID, MLXSW_REG_MRSR_LEN);
+
+/* reg_mrsr_command
+ * Reset/shutdown command
+ * 0 - do nothing
+ * 1 - software reset
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, mrsr, command, 0x00, 0, 4);
+
+static inline void mlxsw_reg_mrsr_pack(char *payload)
+{
+	MLXSW_REG_ZERO(mrsr, payload);
+	mlxsw_reg_mrsr_command_set(payload, 1);
+}
+
 /* MLCR - Management LED Control Register
  * --------------------------------------
  * Controls the system LEDs.
@@ -7898,6 +7922,7 @@ static const struct mlxsw_reg_info *mlxsw_reg_infos[] = {
 	MLXSW_REG(mcia),
 	MLXSW_REG(mpat),
 	MLXSW_REG(mpar),
+	MLXSW_REG(mrsr),
 	MLXSW_REG(mlcr),
 	MLXSW_REG(mpsc),
 	MLXSW_REG(mcqi),
-- 
cgit v1.2.3-59-g8ed1b


From 2a360bf0f6608f61ef149220cfb9c369f5d086b7 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Sun, 27 May 2018 09:56:14 +0300
Subject: mlxsw: cmd: Handle error after reset gracefully

There is an exception in command interface processing in case the MRSR
register is written to. The register triggers FW reset and during the
reset FW returns an error. So handle this by ignoring this error while
writing to MRSR register.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/cmd.h  | 16 +++++++++++-----
 drivers/net/ethernet/mellanox/mlxsw/core.c | 26 +++++++++++++++++++-------
 2 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/cmd.h b/drivers/net/ethernet/mellanox/mlxsw/cmd.h
index 8da91b023b13..2bc48054b685 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/cmd.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/cmd.h
@@ -58,7 +58,7 @@ static inline void mlxsw_cmd_mbox_zero(char *mbox)
 struct mlxsw_core;
 
 int mlxsw_cmd_exec(struct mlxsw_core *mlxsw_core, u16 opcode, u8 opcode_mod,
-		   u32 in_mod, bool out_mbox_direct,
+		   u32 in_mod, bool out_mbox_direct, bool reset_ok,
 		   char *in_mbox, size_t in_mbox_size,
 		   char *out_mbox, size_t out_mbox_size);
 
@@ -67,7 +67,7 @@ static inline int mlxsw_cmd_exec_in(struct mlxsw_core *mlxsw_core, u16 opcode,
 				    size_t in_mbox_size)
 {
 	return mlxsw_cmd_exec(mlxsw_core, opcode, opcode_mod, in_mod, false,
-			      in_mbox, in_mbox_size, NULL, 0);
+			      false, in_mbox, in_mbox_size, NULL, 0);
 }
 
 static inline int mlxsw_cmd_exec_out(struct mlxsw_core *mlxsw_core, u16 opcode,
@@ -76,7 +76,7 @@ static inline int mlxsw_cmd_exec_out(struct mlxsw_core *mlxsw_core, u16 opcode,
 				     char *out_mbox, size_t out_mbox_size)
 {
 	return mlxsw_cmd_exec(mlxsw_core, opcode, opcode_mod, in_mod,
-			      out_mbox_direct, NULL, 0,
+			      out_mbox_direct, false, NULL, 0,
 			      out_mbox, out_mbox_size);
 }
 
@@ -84,7 +84,7 @@ static inline int mlxsw_cmd_exec_none(struct mlxsw_core *mlxsw_core, u16 opcode,
 				      u8 opcode_mod, u32 in_mod)
 {
 	return mlxsw_cmd_exec(mlxsw_core, opcode, opcode_mod, in_mod, false,
-			      NULL, 0, NULL, 0);
+			      false, NULL, 0, NULL, 0);
 }
 
 enum mlxsw_cmd_opcode {
@@ -179,6 +179,8 @@ enum mlxsw_cmd_status {
 	MLXSW_CMD_STATUS_BAD_INDEX	= 0x0A,
 	/* NVMEM checksum/CRC failed. */
 	MLXSW_CMD_STATUS_BAD_NVMEM	= 0x0B,
+	/* Device is currently running reset */
+	MLXSW_CMD_STATUS_RUNNING_RESET	= 0x26,
 	/* Bad management packet (silently discarded). */
 	MLXSW_CMD_STATUS_BAD_PKT	= 0x30,
 };
@@ -208,6 +210,8 @@ static inline const char *mlxsw_cmd_status_str(u8 status)
 		return "BAD_INDEX";
 	case MLXSW_CMD_STATUS_BAD_NVMEM:
 		return "BAD_NVMEM";
+	case MLXSW_CMD_STATUS_RUNNING_RESET:
+		return "RUNNING_RESET";
 	case MLXSW_CMD_STATUS_BAD_PKT:
 		return "BAD_PKT";
 	default:
@@ -869,10 +873,12 @@ MLXSW_ITEM32(cmd_mbox, config_profile, cqe_version, 0xB0, 0, 8);
  */
 
 static inline int mlxsw_cmd_access_reg(struct mlxsw_core *mlxsw_core,
+				       bool reset_ok,
 				       char *in_mbox, char *out_mbox)
 {
 	return mlxsw_cmd_exec(mlxsw_core, MLXSW_CMD_OPCODE_ACCESS_REG,
-			      0, 0, false, in_mbox, MLXSW_CMD_MBOX_SIZE,
+			      0, 0, false, reset_ok,
+			      in_mbox, MLXSW_CMD_MBOX_SIZE,
 			      out_mbox, MLXSW_CMD_MBOX_SIZE);
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index a38faec45b30..1d9ecf89854e 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -1480,6 +1480,7 @@ static int mlxsw_core_reg_access_cmd(struct mlxsw_core *mlxsw_core,
 {
 	enum mlxsw_emad_op_tlv_status status;
 	int err, n_retry;
+	bool reset_ok;
 	char *in_mbox, *out_mbox, *tmp;
 
 	dev_dbg(mlxsw_core->bus_info->dev, "Reg cmd access (reg_id=%x(%s),type=%s)\n",
@@ -1501,9 +1502,16 @@ static int mlxsw_core_reg_access_cmd(struct mlxsw_core *mlxsw_core,
 	tmp = in_mbox + MLXSW_EMAD_OP_TLV_LEN * sizeof(u32);
 	mlxsw_emad_pack_reg_tlv(tmp, reg, payload);
 
+	/* There is a special treatment needed for MRSR (reset) register.
+	 * The command interface will return error after the command
+	 * is executed, so tell the lower layer to expect it
+	 * and cope accordingly.
+	 */
+	reset_ok = reg->id == MLXSW_REG_MRSR_ID;
+
 	n_retry = 0;
 retry:
-	err = mlxsw_cmd_access_reg(mlxsw_core, in_mbox, out_mbox);
+	err = mlxsw_cmd_access_reg(mlxsw_core, reset_ok, in_mbox, out_mbox);
 	if (!err) {
 		err = mlxsw_emad_process_status(out_mbox, &status);
 		if (err) {
@@ -1793,7 +1801,7 @@ static void mlxsw_core_buf_dump_dbg(struct mlxsw_core *mlxsw_core,
 }
 
 int mlxsw_cmd_exec(struct mlxsw_core *mlxsw_core, u16 opcode, u8 opcode_mod,
-		   u32 in_mod, bool out_mbox_direct,
+		   u32 in_mod, bool out_mbox_direct, bool reset_ok,
 		   char *in_mbox, size_t in_mbox_size,
 		   char *out_mbox, size_t out_mbox_size)
 {
@@ -1816,7 +1824,15 @@ int mlxsw_cmd_exec(struct mlxsw_core *mlxsw_core, u16 opcode, u8 opcode_mod,
 					in_mbox, in_mbox_size,
 					out_mbox, out_mbox_size, &status);
 
-	if (err == -EIO && status != MLXSW_CMD_STATUS_OK) {
+	if (!err && out_mbox) {
+		dev_dbg(mlxsw_core->bus_info->dev, "Output mailbox:\n");
+		mlxsw_core_buf_dump_dbg(mlxsw_core, out_mbox, out_mbox_size);
+	}
+
+	if (reset_ok && err == -EIO &&
+	    status == MLXSW_CMD_STATUS_RUNNING_RESET) {
+		err = 0;
+	} else if (err == -EIO && status != MLXSW_CMD_STATUS_OK) {
 		dev_err(mlxsw_core->bus_info->dev, "Cmd exec failed (opcode=%x(%s),opcode_mod=%x,in_mod=%x,status=%x(%s))\n",
 			opcode, mlxsw_cmd_opcode_str(opcode), opcode_mod,
 			in_mod, status, mlxsw_cmd_status_str(status));
@@ -1826,10 +1842,6 @@ int mlxsw_cmd_exec(struct mlxsw_core *mlxsw_core, u16 opcode, u8 opcode_mod,
 			in_mod);
 	}
 
-	if (!err && out_mbox) {
-		dev_dbg(mlxsw_core->bus_info->dev, "Output mailbox:\n");
-		mlxsw_core_buf_dump_dbg(mlxsw_core, out_mbox, out_mbox_size);
-	}
 	return err;
 }
 EXPORT_SYMBOL(mlxsw_cmd_exec);
-- 
cgit v1.2.3-59-g8ed1b


From f3a52c6162f835727326a0a497e289494caf1fa5 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Sun, 27 May 2018 09:56:15 +0300
Subject: mlxsw: pci: Utilize MRSR register to perform FW reset

So far, the PCI BAR0 register is used for triggering FW reset. However,
that is a legacy attitude and it is recommended to use MRSR to perform
reset instead. So do that. Move the reset into init() function as
the cmd interface needs to be used. With that, IRQ initialization needs
to be moved as well. As a side effect, the reset move simplifies
the devlink reload flow.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/core.c |   4 +-
 drivers/net/ethernet/mellanox/mlxsw/core.h |   2 +-
 drivers/net/ethernet/mellanox/mlxsw/pci.c  | 130 +++++++++++++----------------
 3 files changed, 62 insertions(+), 74 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index 1d9ecf89854e..8a766fe28fa0 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -966,14 +966,12 @@ mlxsw_devlink_sb_occ_tc_port_bind_get(struct devlink_port *devlink_port,
 static int mlxsw_devlink_core_bus_device_reload(struct devlink *devlink)
 {
 	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
-	const struct mlxsw_bus *mlxsw_bus = mlxsw_core->bus;
 	int err;
 
-	if (!mlxsw_bus->reset)
+	if (!(mlxsw_core->bus->features & MLXSW_BUS_F_RESET))
 		return -EOPNOTSUPP;
 
 	mlxsw_core_bus_device_unregister(mlxsw_core, true);
-	mlxsw_bus->reset(mlxsw_core->bus_priv);
 	err = mlxsw_core_bus_device_register(mlxsw_core->bus_info,
 					     mlxsw_core->bus,
 					     mlxsw_core->bus_priv, true,
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
index 4eac7fbd07d5..4a8d4c7f89d9 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -337,6 +337,7 @@ u64 mlxsw_core_res_get(struct mlxsw_core *mlxsw_core,
 	mlxsw_core_res_get(mlxsw_core, MLXSW_RES_ID_##short_res_id)
 
 #define MLXSW_BUS_F_TXRX	BIT(0)
+#define MLXSW_BUS_F_RESET	BIT(1)
 
 struct mlxsw_bus {
 	const char *kind;
@@ -344,7 +345,6 @@ struct mlxsw_bus {
 		    const struct mlxsw_config_profile *profile,
 		    struct mlxsw_res *res);
 	void (*fini)(void *bus_priv);
-	void (*reset)(void *bus_priv);
 	bool (*skb_transmit_busy)(void *bus_priv,
 				  const struct mlxsw_tx_info *tx_info);
 	int (*skb_transmit)(void *bus_priv, struct sk_buff *skb,
diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index db794a1a3a7e..fc4557245ff4 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -1371,6 +1371,51 @@ static void mlxsw_pci_mbox_free(struct mlxsw_pci *mlxsw_pci,
 			    mbox->mapaddr);
 }
 
+static int mlxsw_pci_sw_reset(struct mlxsw_pci *mlxsw_pci,
+			      const struct pci_device_id *id)
+{
+	unsigned long end;
+	char mrsr_pl[MLXSW_REG_MRSR_LEN];
+	int err;
+
+	mlxsw_reg_mrsr_pack(mrsr_pl);
+	err = mlxsw_reg_write(mlxsw_pci->core, MLXSW_REG(mrsr), mrsr_pl);
+	if (err)
+		return err;
+	if (id->device == PCI_DEVICE_ID_MELLANOX_SWITCHX2) {
+		msleep(MLXSW_PCI_SW_RESET_TIMEOUT_MSECS);
+		return 0;
+	}
+
+	/* We must wait for the HW to become responsive once again. */
+	msleep(MLXSW_PCI_SW_RESET_WAIT_MSECS);
+
+	end = jiffies + msecs_to_jiffies(MLXSW_PCI_SW_RESET_TIMEOUT_MSECS);
+	do {
+		u32 val = mlxsw_pci_read32(mlxsw_pci, FW_READY);
+
+		if ((val & MLXSW_PCI_FW_READY_MASK) == MLXSW_PCI_FW_READY_MAGIC)
+			break;
+		cond_resched();
+	} while (time_before(jiffies, end));
+	return 0;
+}
+
+static int mlxsw_pci_alloc_irq_vectors(struct mlxsw_pci *mlxsw_pci)
+{
+	int err;
+
+	err = pci_alloc_irq_vectors(mlxsw_pci->pdev, 1, 1, PCI_IRQ_MSIX);
+	if (err < 0)
+		dev_err(&mlxsw_pci->pdev->dev, "MSI-X init failed\n");
+	return err;
+}
+
+static void mlxsw_pci_free_irq_vectors(struct mlxsw_pci *mlxsw_pci)
+{
+	pci_free_irq_vectors(mlxsw_pci->pdev);
+}
+
 static int mlxsw_pci_init(void *bus_priv, struct mlxsw_core *mlxsw_core,
 			  const struct mlxsw_config_profile *profile,
 			  struct mlxsw_res *res)
@@ -1398,6 +1443,16 @@ static int mlxsw_pci_init(void *bus_priv, struct mlxsw_core *mlxsw_core,
 	if (err)
 		goto err_out_mbox_alloc;
 
+	err = mlxsw_pci_sw_reset(mlxsw_pci, mlxsw_pci->id);
+	if (err)
+		goto err_sw_reset;
+
+	err = mlxsw_pci_alloc_irq_vectors(mlxsw_pci);
+	if (err < 0) {
+		dev_err(&pdev->dev, "MSI-X init failed\n");
+		goto err_alloc_irq;
+	}
+
 	err = mlxsw_cmd_query_fw(mlxsw_core, mbox);
 	if (err)
 		goto err_query_fw;
@@ -1481,6 +1536,9 @@ err_fw_area_init:
 err_doorbell_page_bar:
 err_iface_rev:
 err_query_fw:
+	mlxsw_pci_free_irq_vectors(mlxsw_pci);
+err_alloc_irq:
+err_sw_reset:
 	mlxsw_pci_mbox_free(mlxsw_pci, &mlxsw_pci->cmd.out_mbox);
 err_out_mbox_alloc:
 	mlxsw_pci_mbox_free(mlxsw_pci, &mlxsw_pci->cmd.in_mbox);
@@ -1496,6 +1554,7 @@ static void mlxsw_pci_fini(void *bus_priv)
 	free_irq(pci_irq_vector(mlxsw_pci->pdev, 0), mlxsw_pci);
 	mlxsw_pci_aqs_fini(mlxsw_pci);
 	mlxsw_pci_fw_area_fini(mlxsw_pci);
+	mlxsw_pci_free_irq_vectors(mlxsw_pci);
 	mlxsw_pci_mbox_free(mlxsw_pci, &mlxsw_pci->cmd.out_mbox);
 	mlxsw_pci_mbox_free(mlxsw_pci, &mlxsw_pci->cmd.in_mbox);
 }
@@ -1677,58 +1736,6 @@ static int mlxsw_pci_cmd_exec(void *bus_priv, u16 opcode, u8 opcode_mod,
 	return err;
 }
 
-static int mlxsw_pci_sw_reset(struct mlxsw_pci *mlxsw_pci,
-			      const struct pci_device_id *id)
-{
-	unsigned long end;
-
-	mlxsw_pci_write32(mlxsw_pci, SW_RESET, MLXSW_PCI_SW_RESET_RST_BIT);
-	if (id->device == PCI_DEVICE_ID_MELLANOX_SWITCHX2) {
-		msleep(MLXSW_PCI_SW_RESET_TIMEOUT_MSECS);
-		return 0;
-	}
-
-	/* Reset needs to be written before we read control register, and
-	 * we must wait for the HW to become responsive once again
-	 */
-	wmb();
-	msleep(MLXSW_PCI_SW_RESET_WAIT_MSECS);
-
-	end = jiffies + msecs_to_jiffies(MLXSW_PCI_SW_RESET_TIMEOUT_MSECS);
-	do {
-		u32 val = mlxsw_pci_read32(mlxsw_pci, FW_READY);
-
-		if ((val & MLXSW_PCI_FW_READY_MASK) == MLXSW_PCI_FW_READY_MAGIC)
-			break;
-		cond_resched();
-	} while (time_before(jiffies, end));
-	return 0;
-}
-
-static void mlxsw_pci_free_irq_vectors(struct mlxsw_pci *mlxsw_pci)
-{
-	pci_free_irq_vectors(mlxsw_pci->pdev);
-}
-
-static int mlxsw_pci_alloc_irq_vectors(struct mlxsw_pci *mlxsw_pci)
-{
-	int err;
-
-	err = pci_alloc_irq_vectors(mlxsw_pci->pdev, 1, 1, PCI_IRQ_MSIX);
-	if (err < 0)
-		dev_err(&mlxsw_pci->pdev->dev, "MSI-X init failed\n");
-	return err;
-}
-
-static void mlxsw_pci_reset(void *bus_priv)
-{
-	struct mlxsw_pci *mlxsw_pci = bus_priv;
-
-	mlxsw_pci_free_irq_vectors(mlxsw_pci);
-	mlxsw_pci_sw_reset(mlxsw_pci, mlxsw_pci->id);
-	mlxsw_pci_alloc_irq_vectors(mlxsw_pci);
-}
-
 static const struct mlxsw_bus mlxsw_pci_bus = {
 	.kind			= "pci",
 	.init			= mlxsw_pci_init,
@@ -1736,8 +1743,7 @@ static const struct mlxsw_bus mlxsw_pci_bus = {
 	.skb_transmit_busy	= mlxsw_pci_skb_transmit_busy,
 	.skb_transmit		= mlxsw_pci_skb_transmit,
 	.cmd_exec		= mlxsw_pci_cmd_exec,
-	.features		= MLXSW_BUS_F_TXRX,
-	.reset			= mlxsw_pci_reset,
+	.features		= MLXSW_BUS_F_TXRX | MLXSW_BUS_F_RESET,
 };
 
 static int mlxsw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
@@ -1795,18 +1801,6 @@ static int mlxsw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	mlxsw_pci->pdev = pdev;
 	pci_set_drvdata(pdev, mlxsw_pci);
 
-	err = mlxsw_pci_sw_reset(mlxsw_pci, id);
-	if (err) {
-		dev_err(&pdev->dev, "Software reset failed\n");
-		goto err_sw_reset;
-	}
-
-	err = mlxsw_pci_alloc_irq_vectors(mlxsw_pci);
-	if (err < 0) {
-		dev_err(&pdev->dev, "MSI-X init failed\n");
-		goto err_msix_init;
-	}
-
 	mlxsw_pci->bus_info.device_kind = driver_name;
 	mlxsw_pci->bus_info.device_name = pci_name(mlxsw_pci->pdev);
 	mlxsw_pci->bus_info.dev = &pdev->dev;
@@ -1823,9 +1817,6 @@ static int mlxsw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	return 0;
 
 err_bus_device_register:
-	mlxsw_pci_free_irq_vectors(mlxsw_pci);
-err_msix_init:
-err_sw_reset:
 	iounmap(mlxsw_pci->hw_addr);
 err_ioremap:
 err_pci_resource_len_check:
@@ -1843,7 +1834,6 @@ static void mlxsw_pci_remove(struct pci_dev *pdev)
 	struct mlxsw_pci *mlxsw_pci = pci_get_drvdata(pdev);
 
 	mlxsw_core_bus_device_unregister(mlxsw_pci->core, false);
-	mlxsw_pci_free_irq_vectors(mlxsw_pci);
 	iounmap(mlxsw_pci->hw_addr);
 	pci_release_regions(mlxsw_pci->pdev);
 	pci_disable_device(mlxsw_pci->pdev);
-- 
cgit v1.2.3-59-g8ed1b


From e6464b8c6361962f5ff99dc95d010b64432c27b5 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sun, 27 May 2018 08:09:53 -0700
Subject: net/ipv6: Convert ipv6_add_addr to struct ifa6_config

Move config parameters for adding an ipv6 address to a struct. struct
names stem from inet6_rtm_newaddr which is the modern handler for
adding an address.

Start the conversion to ifa6_config with ipv6_add_addr. This is an argument
move only; no functional change intended. Mapping of variable changes:

    addr      -->  cfg->pfx
    peer_addr -->  cfg->peer_pfx
    pfxlen    -->  cfg->plen
    flags     -->  cfg->ifa_flags

scope, valid_lft, prefered_lft have the same names within cfg
(with corrected spelling).

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/addrconf.h |  12 +++++
 net/ipv6/addrconf.c    | 134 +++++++++++++++++++++++++++----------------------
 2 files changed, 87 insertions(+), 59 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index c07d4dd09361..f766af2cd1a4 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -59,6 +59,18 @@ struct in6_validator_info {
 	struct netlink_ext_ack	*extack;
 };
 
+struct ifa6_config {
+	const struct in6_addr	*pfx;
+	unsigned int		plen;
+
+	const struct in6_addr	*peer_pfx;
+
+	u32			ifa_flags;
+	u32			preferred_lft;
+	u32			valid_lft;
+	u16			scope;
+};
+
 int addrconf_init(void);
 void addrconf_cleanup(void);
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index fbfd71a2d9c8..4988f2265882 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -986,17 +986,15 @@ static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa)
 /* On success it returns ifp with increased reference count */
 
 static struct inet6_ifaddr *
-ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
-	      const struct in6_addr *peer_addr, int pfxlen,
-	      int scope, u32 flags, u32 valid_lft, u32 prefered_lft,
+ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg,
 	      bool can_block, struct netlink_ext_ack *extack)
 {
 	gfp_t gfp_flags = can_block ? GFP_KERNEL : GFP_ATOMIC;
+	int addr_type = ipv6_addr_type(cfg->pfx);
 	struct net *net = dev_net(idev->dev);
 	struct inet6_ifaddr *ifa = NULL;
 	struct fib6_info *f6i = NULL;
 	int err = 0;
-	int addr_type = ipv6_addr_type(addr);
 
 	if (addr_type == IPV6_ADDR_ANY ||
 	    addr_type & IPV6_ADDR_MULTICAST ||
@@ -1019,7 +1017,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
 	 */
 	if (can_block) {
 		struct in6_validator_info i6vi = {
-			.i6vi_addr = *addr,
+			.i6vi_addr = *cfg->pfx,
 			.i6vi_dev = idev,
 			.extack = extack,
 		};
@@ -1036,7 +1034,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
 		goto out;
 	}
 
-	f6i = addrconf_f6i_alloc(net, idev, addr, false, gfp_flags);
+	f6i = addrconf_f6i_alloc(net, idev, cfg->pfx, false, gfp_flags);
 	if (IS_ERR(f6i)) {
 		err = PTR_ERR(f6i);
 		f6i = NULL;
@@ -1049,21 +1047,21 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
 
 	neigh_parms_data_state_setall(idev->nd_parms);
 
-	ifa->addr = *addr;
-	if (peer_addr)
-		ifa->peer_addr = *peer_addr;
+	ifa->addr = *cfg->pfx;
+	if (cfg->peer_pfx)
+		ifa->peer_addr = *cfg->peer_pfx;
 
 	spin_lock_init(&ifa->lock);
 	INIT_DELAYED_WORK(&ifa->dad_work, addrconf_dad_work);
 	INIT_HLIST_NODE(&ifa->addr_lst);
-	ifa->scope = scope;
-	ifa->prefix_len = pfxlen;
-	ifa->flags = flags;
+	ifa->scope = cfg->scope;
+	ifa->prefix_len = cfg->plen;
+	ifa->flags = cfg->ifa_flags;
 	/* No need to add the TENTATIVE flag for addresses with NODAD */
-	if (!(flags & IFA_F_NODAD))
+	if (!(cfg->ifa_flags & IFA_F_NODAD))
 		ifa->flags |= IFA_F_TENTATIVE;
-	ifa->valid_lft = valid_lft;
-	ifa->prefered_lft = prefered_lft;
+	ifa->valid_lft = cfg->valid_lft;
+	ifa->prefered_lft = cfg->preferred_lft;
 	ifa->cstamp = ifa->tstamp = jiffies;
 	ifa->tokenized = false;
 
@@ -1260,11 +1258,10 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp,
 {
 	struct inet6_dev *idev = ifp->idev;
 	struct in6_addr addr, *tmpaddr;
-	unsigned long tmp_prefered_lft, tmp_valid_lft, tmp_tstamp, age;
+	unsigned long tmp_tstamp, age;
 	unsigned long regen_advance;
-	int tmp_plen;
+	struct ifa6_config cfg;
 	int ret = 0;
-	u32 addr_flags;
 	unsigned long now = jiffies;
 	long max_desync_factor;
 	s32 cnf_temp_preferred_lft;
@@ -1326,13 +1323,12 @@ retry:
 		}
 	}
 
-	tmp_valid_lft = min_t(__u32,
-			      ifp->valid_lft,
+	cfg.valid_lft = min_t(__u32, ifp->valid_lft,
 			      idev->cnf.temp_valid_lft + age);
-	tmp_prefered_lft = cnf_temp_preferred_lft + age -
-			    idev->desync_factor;
-	tmp_prefered_lft = min_t(__u32, ifp->prefered_lft, tmp_prefered_lft);
-	tmp_plen = ifp->prefix_len;
+	cfg.preferred_lft = cnf_temp_preferred_lft + age - idev->desync_factor;
+	cfg.preferred_lft = min_t(__u32, ifp->prefered_lft, cfg.preferred_lft);
+
+	cfg.plen = ifp->prefix_len;
 	tmp_tstamp = ifp->tstamp;
 	spin_unlock_bh(&ifp->lock);
 
@@ -1346,21 +1342,22 @@ retry:
 	 * temporary addresses being generated.
 	 */
 	age = (now - tmp_tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
-	if (tmp_prefered_lft <= regen_advance + age) {
+	if (cfg.preferred_lft <= regen_advance + age) {
 		in6_ifa_put(ifp);
 		in6_dev_put(idev);
 		ret = -1;
 		goto out;
 	}
 
-	addr_flags = IFA_F_TEMPORARY;
+	cfg.ifa_flags = IFA_F_TEMPORARY;
 	/* set in addrconf_prefix_rcv() */
 	if (ifp->flags & IFA_F_OPTIMISTIC)
-		addr_flags |= IFA_F_OPTIMISTIC;
+		cfg.ifa_flags |= IFA_F_OPTIMISTIC;
+
+	cfg.pfx = &addr;
+	cfg.scope = ipv6_addr_scope(cfg.pfx);
 
-	ift = ipv6_add_addr(idev, &addr, NULL, tmp_plen,
-			    ipv6_addr_scope(&addr), addr_flags,
-			    tmp_valid_lft, tmp_prefered_lft, block, NULL);
+	ift = ipv6_add_addr(idev, &cfg, block, NULL);
 	if (IS_ERR(ift)) {
 		in6_ifa_put(ifp);
 		in6_dev_put(idev);
@@ -2031,13 +2028,17 @@ void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp)
 	spin_lock_bh(&ifp->lock);
 
 	if (ifp->flags & IFA_F_STABLE_PRIVACY) {
-		int scope = ifp->scope;
-		u32 flags = ifp->flags;
 		struct in6_addr new_addr;
 		struct inet6_ifaddr *ifp2;
-		u32 valid_lft, preferred_lft;
-		int pfxlen = ifp->prefix_len;
 		int retries = ifp->stable_privacy_retry + 1;
+		struct ifa6_config cfg = {
+			.pfx = &new_addr,
+			.plen = ifp->prefix_len,
+			.ifa_flags = ifp->flags,
+			.valid_lft = ifp->valid_lft,
+			.preferred_lft = ifp->prefered_lft,
+			.scope = ifp->scope,
+		};
 
 		if (retries > net->ipv6.sysctl.idgen_retries) {
 			net_info_ratelimited("%s: privacy stable address generation failed because of DAD conflicts!\n",
@@ -2050,9 +2051,6 @@ void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp)
 						 idev))
 			goto errdad;
 
-		valid_lft = ifp->valid_lft;
-		preferred_lft = ifp->prefered_lft;
-
 		spin_unlock_bh(&ifp->lock);
 
 		if (idev->cnf.max_addresses &&
@@ -2063,9 +2061,7 @@ void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp)
 		net_info_ratelimited("%s: generating new stable privacy address because of DAD conflict\n",
 				     ifp->idev->dev->name);
 
-		ifp2 = ipv6_add_addr(idev, &new_addr, NULL, pfxlen,
-				     scope, flags, valid_lft,
-				     preferred_lft, false, NULL);
+		ifp2 = ipv6_add_addr(idev, &cfg, false, NULL);
 		if (IS_ERR(ifp2))
 			goto lock_errdad;
 
@@ -2507,12 +2503,20 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
 
 	if (!ifp && valid_lft) {
 		int max_addresses = in6_dev->cnf.max_addresses;
+		struct ifa6_config cfg = {
+			.pfx = addr,
+			.plen = pinfo->prefix_len,
+			.ifa_flags = addr_flags,
+			.valid_lft = valid_lft,
+			.preferred_lft = prefered_lft,
+			.scope = addr_type & IPV6_ADDR_SCOPE_MASK,
+		};
 
 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 		if ((net->ipv6.devconf_all->optimistic_dad ||
 		     in6_dev->cnf.optimistic_dad) &&
 		    !net->ipv6.devconf_all->forwarding && sllao)
-			addr_flags |= IFA_F_OPTIMISTIC;
+			cfg.ifa_flags |= IFA_F_OPTIMISTIC;
 #endif
 
 		/* Do not allow to create too much of autoconfigured
@@ -2520,11 +2524,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
 		 */
 		if (!max_addresses ||
 		    ipv6_count_addresses(in6_dev) < max_addresses)
-			ifp = ipv6_add_addr(in6_dev, addr, NULL,
-					    pinfo->prefix_len,
-					    addr_type&IPV6_ADDR_SCOPE_MASK,
-					    addr_flags, valid_lft,
-					    prefered_lft, false, NULL);
+			ifp = ipv6_add_addr(in6_dev, &cfg, false, NULL);
 
 		if (IS_ERR_OR_NULL(ifp))
 			return -1;
@@ -2836,12 +2836,19 @@ static int inet6_addr_add(struct net *net, int ifindex,
 			  __u32 prefered_lft, __u32 valid_lft,
 			  struct netlink_ext_ack *extack)
 {
+	struct ifa6_config cfg = {
+		.pfx = pfx,
+		.plen = plen,
+		.peer_pfx = peer_pfx,
+		.ifa_flags = ifa_flags,
+		.preferred_lft = prefered_lft,
+		.valid_lft = valid_lft,
+	};
 	struct inet6_ifaddr *ifp;
 	struct inet6_dev *idev;
 	struct net_device *dev;
 	unsigned long timeout;
 	clock_t expires;
-	int scope;
 	u32 flags;
 
 	ASSERT_RTNL();
@@ -2872,7 +2879,7 @@ static int inet6_addr_add(struct net *net, int ifindex,
 			return ret;
 	}
 
-	scope = ipv6_addr_scope(pfx);
+	cfg.scope = ipv6_addr_scope(pfx);
 
 	timeout = addrconf_timeout_fixup(valid_lft, HZ);
 	if (addrconf_finite_timeout(timeout)) {
@@ -2892,9 +2899,7 @@ static int inet6_addr_add(struct net *net, int ifindex,
 		prefered_lft = timeout;
 	}
 
-	ifp = ipv6_add_addr(idev, pfx, peer_pfx, plen, scope, ifa_flags,
-			    valid_lft, prefered_lft, true, extack);
-
+	ifp = ipv6_add_addr(idev, &cfg, true, extack);
 	if (!IS_ERR(ifp)) {
 		if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) {
 			addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev,
@@ -3010,11 +3015,16 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
 		     int plen, int scope)
 {
 	struct inet6_ifaddr *ifp;
+	struct ifa6_config cfg = {
+		.pfx = addr,
+		.plen = plen,
+		.ifa_flags = IFA_F_PERMANENT,
+		.valid_lft = INFINITY_LIFE_TIME,
+		.preferred_lft = INFINITY_LIFE_TIME,
+		.scope = scope
+	};
 
-	ifp = ipv6_add_addr(idev, addr, NULL, plen,
-			    scope, IFA_F_PERMANENT,
-			    INFINITY_LIFE_TIME, INFINITY_LIFE_TIME,
-			    true, NULL);
+	ifp = ipv6_add_addr(idev, &cfg, true, NULL);
 	if (!IS_ERR(ifp)) {
 		spin_lock_bh(&ifp->lock);
 		ifp->flags &= ~IFA_F_TENTATIVE;
@@ -3104,18 +3114,24 @@ static void init_loopback(struct net_device *dev)
 void addrconf_add_linklocal(struct inet6_dev *idev,
 			    const struct in6_addr *addr, u32 flags)
 {
+	struct ifa6_config cfg = {
+		.pfx = addr,
+		.plen = 64,
+		.ifa_flags = flags | IFA_F_PERMANENT,
+		.valid_lft = INFINITY_LIFE_TIME,
+		.preferred_lft = INFINITY_LIFE_TIME,
+		.scope = IFA_LINK
+	};
 	struct inet6_ifaddr *ifp;
-	u32 addr_flags = flags | IFA_F_PERMANENT;
 
 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 	if ((dev_net(idev->dev)->ipv6.devconf_all->optimistic_dad ||
 	     idev->cnf.optimistic_dad) &&
 	    !dev_net(idev->dev)->ipv6.devconf_all->forwarding)
-		addr_flags |= IFA_F_OPTIMISTIC;
+		cfg.ifa_flags |= IFA_F_OPTIMISTIC;
 #endif
 
-	ifp = ipv6_add_addr(idev, addr, NULL, 64, IFA_LINK, addr_flags,
-			    INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, true, NULL);
+	ifp = ipv6_add_addr(idev, &cfg, true, NULL);
 	if (!IS_ERR(ifp)) {
 		addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev,
 				      0, 0, GFP_ATOMIC);
-- 
cgit v1.2.3-59-g8ed1b


From 19b1518c294569c3cd5d3b430f82beac95d0bfc5 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sun, 27 May 2018 08:09:54 -0700
Subject: net/ipv6: Pass ifa6_config struct to inet6_addr_add

Move the creation of struct ifa6_config up to callers of inet6_addr_add.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 112 ++++++++++++++++++++++++++--------------------------
 1 file changed, 57 insertions(+), 55 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 4988f2265882..2db1acf70610 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2830,20 +2830,9 @@ static int ipv6_mc_config(struct sock *sk, bool join,
  *	Manual configuration of address on an interface
  */
 static int inet6_addr_add(struct net *net, int ifindex,
-			  const struct in6_addr *pfx,
-			  const struct in6_addr *peer_pfx,
-			  unsigned int plen, __u32 ifa_flags,
-			  __u32 prefered_lft, __u32 valid_lft,
+			  struct ifa6_config *cfg,
 			  struct netlink_ext_ack *extack)
 {
-	struct ifa6_config cfg = {
-		.pfx = pfx,
-		.plen = plen,
-		.peer_pfx = peer_pfx,
-		.ifa_flags = ifa_flags,
-		.preferred_lft = prefered_lft,
-		.valid_lft = valid_lft,
-	};
 	struct inet6_ifaddr *ifp;
 	struct inet6_dev *idev;
 	struct net_device *dev;
@@ -2853,14 +2842,14 @@ static int inet6_addr_add(struct net *net, int ifindex,
 
 	ASSERT_RTNL();
 
-	if (plen > 128)
+	if (cfg->plen > 128)
 		return -EINVAL;
 
 	/* check the lifetime */
-	if (!valid_lft || prefered_lft > valid_lft)
+	if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft)
 		return -EINVAL;
 
-	if (ifa_flags & IFA_F_MANAGETEMPADDR && plen != 64)
+	if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR && cfg->plen != 64)
 		return -EINVAL;
 
 	dev = __dev_get_by_index(net, ifindex);
@@ -2871,37 +2860,37 @@ static int inet6_addr_add(struct net *net, int ifindex,
 	if (IS_ERR(idev))
 		return PTR_ERR(idev);
 
-	if (ifa_flags & IFA_F_MCAUTOJOIN) {
+	if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) {
 		int ret = ipv6_mc_config(net->ipv6.mc_autojoin_sk,
-					 true, pfx, ifindex);
+					 true, cfg->pfx, ifindex);
 
 		if (ret < 0)
 			return ret;
 	}
 
-	cfg.scope = ipv6_addr_scope(pfx);
+	cfg->scope = ipv6_addr_scope(cfg->pfx);
 
-	timeout = addrconf_timeout_fixup(valid_lft, HZ);
+	timeout = addrconf_timeout_fixup(cfg->valid_lft, HZ);
 	if (addrconf_finite_timeout(timeout)) {
 		expires = jiffies_to_clock_t(timeout * HZ);
-		valid_lft = timeout;
+		cfg->valid_lft = timeout;
 		flags = RTF_EXPIRES;
 	} else {
 		expires = 0;
 		flags = 0;
-		ifa_flags |= IFA_F_PERMANENT;
+		cfg->ifa_flags |= IFA_F_PERMANENT;
 	}
 
-	timeout = addrconf_timeout_fixup(prefered_lft, HZ);
+	timeout = addrconf_timeout_fixup(cfg->preferred_lft, HZ);
 	if (addrconf_finite_timeout(timeout)) {
 		if (timeout == 0)
-			ifa_flags |= IFA_F_DEPRECATED;
-		prefered_lft = timeout;
+			cfg->ifa_flags |= IFA_F_DEPRECATED;
+		cfg->preferred_lft = timeout;
 	}
 
-	ifp = ipv6_add_addr(idev, &cfg, true, extack);
+	ifp = ipv6_add_addr(idev, cfg, true, extack);
 	if (!IS_ERR(ifp)) {
-		if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) {
+		if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) {
 			addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev,
 					      expires, flags, GFP_KERNEL);
 		}
@@ -2917,15 +2906,15 @@ static int inet6_addr_add(struct net *net, int ifindex,
 		 * manually configured addresses
 		 */
 		addrconf_dad_start(ifp);
-		if (ifa_flags & IFA_F_MANAGETEMPADDR)
-			manage_tempaddrs(idev, ifp, valid_lft, prefered_lft,
-					 true, jiffies);
+		if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR)
+			manage_tempaddrs(idev, ifp, cfg->valid_lft,
+					 cfg->preferred_lft, true, jiffies);
 		in6_ifa_put(ifp);
 		addrconf_verify_rtnl();
 		return 0;
-	} else if (ifa_flags & IFA_F_MCAUTOJOIN) {
-		ipv6_mc_config(net->ipv6.mc_autojoin_sk,
-			       false, pfx, ifindex);
+	} else if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) {
+		ipv6_mc_config(net->ipv6.mc_autojoin_sk, false,
+			       cfg->pfx, ifindex);
 	}
 
 	return PTR_ERR(ifp);
@@ -2976,6 +2965,11 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags,
 
 int addrconf_add_ifaddr(struct net *net, void __user *arg)
 {
+	struct ifa6_config cfg = {
+		.ifa_flags = IFA_F_PERMANENT,
+		.preferred_lft = INFINITY_LIFE_TIME,
+		.valid_lft = INFINITY_LIFE_TIME,
+	};
 	struct in6_ifreq ireq;
 	int err;
 
@@ -2985,10 +2979,11 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg)
 	if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
 		return -EFAULT;
 
+	cfg.pfx = &ireq.ifr6_addr;
+	cfg.plen = ireq.ifr6_prefixlen;
+
 	rtnl_lock();
-	err = inet6_addr_add(net, ireq.ifr6_ifindex, &ireq.ifr6_addr, NULL,
-			     ireq.ifr6_prefixlen, IFA_F_PERMANENT,
-			     INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, NULL);
+	err = inet6_addr_add(net, ireq.ifr6_ifindex, &cfg, NULL);
 	rtnl_unlock();
 	return err;
 }
@@ -4624,12 +4619,11 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
 	struct net *net = sock_net(skb->sk);
 	struct ifaddrmsg *ifm;
 	struct nlattr *tb[IFA_MAX+1];
-	struct in6_addr *pfx, *peer_pfx;
+	struct in6_addr *peer_pfx;
 	struct inet6_ifaddr *ifa;
 	struct net_device *dev;
 	struct inet6_dev *idev;
-	u32 valid_lft = INFINITY_LIFE_TIME, preferred_lft = INFINITY_LIFE_TIME;
-	u32 ifa_flags;
+	struct ifa6_config cfg;
 	int err;
 
 	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy,
@@ -4637,60 +4631,68 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		return err;
 
+	memset(&cfg, 0, sizeof(cfg));
+
 	ifm = nlmsg_data(nlh);
-	pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx);
-	if (!pfx)
+	cfg.pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx);
+	if (!cfg.pfx)
 		return -EINVAL;
 
+	cfg.peer_pfx = peer_pfx;
+	cfg.plen = ifm->ifa_prefixlen;
+	cfg.valid_lft = INFINITY_LIFE_TIME;
+	cfg.preferred_lft = INFINITY_LIFE_TIME;
+
 	if (tb[IFA_CACHEINFO]) {
 		struct ifa_cacheinfo *ci;
 
 		ci = nla_data(tb[IFA_CACHEINFO]);
-		valid_lft = ci->ifa_valid;
-		preferred_lft = ci->ifa_prefered;
-	} else {
-		preferred_lft = INFINITY_LIFE_TIME;
-		valid_lft = INFINITY_LIFE_TIME;
+		cfg.valid_lft = ci->ifa_valid;
+		cfg.preferred_lft = ci->ifa_prefered;
 	}
 
 	dev =  __dev_get_by_index(net, ifm->ifa_index);
 	if (!dev)
 		return -ENODEV;
 
-	ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : ifm->ifa_flags;
+	if (tb[IFA_FLAGS])
+		cfg.ifa_flags = nla_get_u32(tb[IFA_FLAGS]);
+	else
+		cfg.ifa_flags = ifm->ifa_flags;
 
 	/* We ignore other flags so far. */
-	ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR |
-		     IFA_F_NOPREFIXROUTE | IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC;
+	cfg.ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS |
+			 IFA_F_MANAGETEMPADDR | IFA_F_NOPREFIXROUTE |
+			 IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC;
 
 	idev = ipv6_find_idev(dev);
 	if (IS_ERR(idev))
 		return PTR_ERR(idev);
 
 	if (!ipv6_allow_optimistic_dad(net, idev))
-		ifa_flags &= ~IFA_F_OPTIMISTIC;
+		cfg.ifa_flags &= ~IFA_F_OPTIMISTIC;
 
-	if (ifa_flags & IFA_F_NODAD && ifa_flags & IFA_F_OPTIMISTIC) {
+	if (cfg.ifa_flags & IFA_F_NODAD &&
+	    cfg.ifa_flags & IFA_F_OPTIMISTIC) {
 		NL_SET_ERR_MSG(extack, "IFA_F_NODAD and IFA_F_OPTIMISTIC are mutually exclusive");
 		return -EINVAL;
 	}
 
-	ifa = ipv6_get_ifaddr(net, pfx, dev, 1);
+	ifa = ipv6_get_ifaddr(net, cfg.pfx, dev, 1);
 	if (!ifa) {
 		/*
 		 * It would be best to check for !NLM_F_CREATE here but
 		 * userspace already relies on not having to provide this.
 		 */
-		return inet6_addr_add(net, ifm->ifa_index, pfx, peer_pfx,
-				      ifm->ifa_prefixlen, ifa_flags,
-				      preferred_lft, valid_lft, extack);
+		return inet6_addr_add(net, ifm->ifa_index, &cfg, extack);
 	}
 
 	if (nlh->nlmsg_flags & NLM_F_EXCL ||
 	    !(nlh->nlmsg_flags & NLM_F_REPLACE))
 		err = -EEXIST;
 	else
-		err = inet6_addr_modify(ifa, ifa_flags, preferred_lft, valid_lft);
+		err = inet6_addr_modify(ifa, cfg.ifa_flags, cfg.preferred_lft,
+					cfg.valid_lft);
 
 	in6_ifa_put(ifa);
 
-- 
cgit v1.2.3-59-g8ed1b


From d169a1f8baa42b416bf00df2c62a8af82c124775 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sun, 27 May 2018 08:09:55 -0700
Subject: net/ipv6: Pass ifa6_config struct to inet6_addr_modify

Update inet6_addr_modify to take ifa6_config argument versus a parameter
list. This is an argument move only; no functional change intended.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 44 +++++++++++++++++++++++---------------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 2db1acf70610..1b1e63a4520b 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4527,8 +4527,7 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
 			      ifm->ifa_prefixlen);
 }
 
-static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags,
-			     u32 prefered_lft, u32 valid_lft)
+static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
 {
 	u32 flags;
 	clock_t expires;
@@ -4538,32 +4537,32 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags,
 
 	ASSERT_RTNL();
 
-	if (!valid_lft || (prefered_lft > valid_lft))
+	if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft)
 		return -EINVAL;
 
-	if (ifa_flags & IFA_F_MANAGETEMPADDR &&
+	if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR &&
 	    (ifp->flags & IFA_F_TEMPORARY || ifp->prefix_len != 64))
 		return -EINVAL;
 
 	if (!(ifp->flags & IFA_F_TENTATIVE) || ifp->flags & IFA_F_DADFAILED)
-		ifa_flags &= ~IFA_F_OPTIMISTIC;
+		cfg->ifa_flags &= ~IFA_F_OPTIMISTIC;
 
-	timeout = addrconf_timeout_fixup(valid_lft, HZ);
+	timeout = addrconf_timeout_fixup(cfg->valid_lft, HZ);
 	if (addrconf_finite_timeout(timeout)) {
 		expires = jiffies_to_clock_t(timeout * HZ);
-		valid_lft = timeout;
+		cfg->valid_lft = timeout;
 		flags = RTF_EXPIRES;
 	} else {
 		expires = 0;
 		flags = 0;
-		ifa_flags |= IFA_F_PERMANENT;
+		cfg->ifa_flags |= IFA_F_PERMANENT;
 	}
 
-	timeout = addrconf_timeout_fixup(prefered_lft, HZ);
+	timeout = addrconf_timeout_fixup(cfg->preferred_lft, HZ);
 	if (addrconf_finite_timeout(timeout)) {
 		if (timeout == 0)
-			ifa_flags |= IFA_F_DEPRECATED;
-		prefered_lft = timeout;
+			cfg->ifa_flags |= IFA_F_DEPRECATED;
+		cfg->preferred_lft = timeout;
 	}
 
 	spin_lock_bh(&ifp->lock);
@@ -4573,16 +4572,16 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags,
 	ifp->flags &= ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD |
 			IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR |
 			IFA_F_NOPREFIXROUTE);
-	ifp->flags |= ifa_flags;
+	ifp->flags |= cfg->ifa_flags;
 	ifp->tstamp = jiffies;
-	ifp->valid_lft = valid_lft;
-	ifp->prefered_lft = prefered_lft;
+	ifp->valid_lft = cfg->valid_lft;
+	ifp->prefered_lft = cfg->preferred_lft;
 
 	spin_unlock_bh(&ifp->lock);
 	if (!(ifp->flags&IFA_F_TENTATIVE))
 		ipv6_ifa_notify(0, ifp);
 
-	if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) {
+	if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) {
 		addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
 				      ifp->idev->dev, expires, flags,
 				      GFP_KERNEL);
@@ -4601,10 +4600,14 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags,
 	}
 
 	if (was_managetempaddr || ifp->flags & IFA_F_MANAGETEMPADDR) {
-		if (was_managetempaddr && !(ifp->flags & IFA_F_MANAGETEMPADDR))
-			valid_lft = prefered_lft = 0;
-		manage_tempaddrs(ifp->idev, ifp, valid_lft, prefered_lft,
-				 !was_managetempaddr, jiffies);
+		if (was_managetempaddr &&
+		    !(ifp->flags & IFA_F_MANAGETEMPADDR)) {
+			cfg->valid_lft = 0;
+			cfg->preferred_lft = 0;
+		}
+		manage_tempaddrs(ifp->idev, ifp, cfg->valid_lft,
+				 cfg->preferred_lft, !was_managetempaddr,
+				 jiffies);
 	}
 
 	addrconf_verify_rtnl();
@@ -4691,8 +4694,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
 	    !(nlh->nlmsg_flags & NLM_F_REPLACE))
 		err = -EEXIST;
 	else
-		err = inet6_addr_modify(ifa, cfg.ifa_flags, cfg.preferred_lft,
-					cfg.valid_lft);
+		err = inet6_addr_modify(ifa, &cfg);
 
 	in6_ifa_put(ifa);
 
-- 
cgit v1.2.3-59-g8ed1b


From 620dee9415fae09003d012cc7c23b011e5ebb43d Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sun, 27 May 2018 08:09:56 -0700
Subject: net: Add IFA_RT_PRIORITY address attribute

Currently, if two interfaces have addresses in the same connected route,
then the order of the prefix route entries is determined by the order in
which the addresses are assigned or the links brought up.

Add IFA_RT_PRIORITY to allow user to specify the metric of the prefix
route associated with an address giving interface managers better
control of the order of prefix routes.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_addr.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/if_addr.h b/include/uapi/linux/if_addr.h
index 2ef053d265de..ebaf5701c9db 100644
--- a/include/uapi/linux/if_addr.h
+++ b/include/uapi/linux/if_addr.h
@@ -33,6 +33,7 @@ enum {
 	IFA_CACHEINFO,
 	IFA_MULTICAST,
 	IFA_FLAGS,
+	IFA_RT_PRIORITY,  /* u32, priority/metric for prefix route */
 	__IFA_MAX,
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From af4d768ad28cbf6542ba70dba10b49127b31b762 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sun, 27 May 2018 08:09:57 -0700
Subject: net/ipv4: Add support for specifying metric of connected routes

Add support for IFA_RT_PRIORITY to ipv4 addresses.

If the metric is changed on an existing address then the new route
is inserted before removing the old one. Since the metric is one
of the route keys, the prefix route can not be replaced.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/inetdevice.h |  1 +
 include/net/route.h        |  1 +
 net/ipv4/devinet.c         | 15 +++++++++++++
 net/ipv4/fib_frontend.c    | 53 ++++++++++++++++++++++++++++++++++++----------
 4 files changed, 59 insertions(+), 11 deletions(-)

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index e16fe7d44a71..27650f1bff3d 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -139,6 +139,7 @@ struct in_ifaddr {
 	__be32			ifa_local;
 	__be32			ifa_address;
 	__be32			ifa_mask;
+	__u32			ifa_rt_priority;
 	__be32			ifa_broadcast;
 	unsigned char		ifa_scope;
 	unsigned char		ifa_prefixlen;
diff --git a/include/net/route.h b/include/net/route.h
index dbb032d5921b..bb53cdba38dc 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -225,6 +225,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
 struct in_ifaddr;
 void fib_add_ifaddr(struct in_ifaddr *);
 void fib_del_ifaddr(struct in_ifaddr *, struct in_ifaddr *);
+void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric);
 
 void rt_add_uncached_list(struct rtable *rt);
 void rt_del_uncached_list(struct rtable *rt);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 40f001782c1b..d7585ab1a77a 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -99,6 +99,7 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
 	[IFA_LABEL]     	= { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
 	[IFA_CACHEINFO]		= { .len = sizeof(struct ifa_cacheinfo) },
 	[IFA_FLAGS]		= { .type = NLA_U32 },
+	[IFA_RT_PRIORITY]	= { .type = NLA_U32 },
 };
 
 #define IN4_ADDR_HSIZE_SHIFT	8
@@ -835,6 +836,9 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
 	else
 		memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
 
+	if (tb[IFA_RT_PRIORITY])
+		ifa->ifa_rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]);
+
 	if (tb[IFA_CACHEINFO]) {
 		struct ifa_cacheinfo *ci;
 
@@ -906,12 +910,20 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
 		return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid,
 					 extack);
 	} else {
+		u32 new_metric = ifa->ifa_rt_priority;
+
 		inet_free_ifa(ifa);
 
 		if (nlh->nlmsg_flags & NLM_F_EXCL ||
 		    !(nlh->nlmsg_flags & NLM_F_REPLACE))
 			return -EEXIST;
 		ifa = ifa_existing;
+
+		if (ifa->ifa_rt_priority != new_metric) {
+			fib_modify_prefix_metric(ifa, new_metric);
+			ifa->ifa_rt_priority = new_metric;
+		}
+
 		set_ifa_lifetime(ifa, valid_lft, prefered_lft);
 		cancel_delayed_work(&check_lifetime_work);
 		queue_delayed_work(system_power_efficient_wq,
@@ -1549,6 +1561,7 @@ static size_t inet_nlmsg_size(void)
 	       + nla_total_size(4) /* IFA_BROADCAST */
 	       + nla_total_size(IFNAMSIZ) /* IFA_LABEL */
 	       + nla_total_size(4)  /* IFA_FLAGS */
+	       + nla_total_size(4)  /* IFA_RT_PRIORITY */
 	       + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */
 }
 
@@ -1618,6 +1631,8 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
 	    (ifa->ifa_label[0] &&
 	     nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) ||
 	    nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) ||
+	    (ifa->ifa_rt_priority &&
+	     nla_put_u32(skb, IFA_RT_PRIORITY, ifa->ifa_rt_priority)) ||
 	    put_cacheinfo(skb, ifa->ifa_cstamp, ifa->ifa_tstamp,
 			  preferred, valid))
 		goto nla_put_failure;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index b69e2824c761..63aa39b3af03 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -847,7 +847,8 @@ out_err:
  * to fib engine. It is legal, because all events occur
  * only when netlink is already locked.
  */
-static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
+static void fib_magic(int cmd, int type, __be32 dst, int dst_len,
+		      struct in_ifaddr *ifa, u32 rt_priority)
 {
 	struct net *net = dev_net(ifa->ifa_dev->dev);
 	u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev);
@@ -857,6 +858,7 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
 		.fc_type = type,
 		.fc_dst = dst,
 		.fc_dst_len = dst_len,
+		.fc_priority = rt_priority,
 		.fc_prefsrc = ifa->ifa_local,
 		.fc_oif = ifa->ifa_dev->dev->ifindex,
 		.fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
@@ -902,31 +904,57 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
 		}
 	}
 
-	fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
+	fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim, 0);
 
 	if (!(dev->flags & IFF_UP))
 		return;
 
 	/* Add broadcast address, if it is explicitly assigned. */
 	if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
-		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
+		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
+			  prim, 0);
 
 	if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
 	    (prefix != addr || ifa->ifa_prefixlen < 32)) {
 		if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
 			fib_magic(RTM_NEWROUTE,
 				  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
-				  prefix, ifa->ifa_prefixlen, prim);
+				  prefix, ifa->ifa_prefixlen, prim,
+				  ifa->ifa_rt_priority);
 
 		/* Add network specific broadcasts, when it takes a sense */
 		if (ifa->ifa_prefixlen < 31) {
-			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
+			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32,
+				  prim, 0);
 			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
-				  32, prim);
+				  32, prim, 0);
 		}
 	}
 }
 
+void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric)
+{
+	__be32 prefix = ifa->ifa_address & ifa->ifa_mask;
+	struct in_device *in_dev = ifa->ifa_dev;
+	struct net_device *dev = in_dev->dev;
+
+	if (!(dev->flags & IFF_UP) ||
+	    ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) ||
+	    ipv4_is_zeronet(prefix) ||
+	    prefix == ifa->ifa_local || ifa->ifa_prefixlen == 32)
+		return;
+
+	/* add the new */
+	fib_magic(RTM_NEWROUTE,
+		  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+		  prefix, ifa->ifa_prefixlen, ifa, new_metric);
+
+	/* delete the old */
+	fib_magic(RTM_DELROUTE,
+		  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+		  prefix, ifa->ifa_prefixlen, ifa, ifa->ifa_rt_priority);
+}
+
 /* Delete primary or secondary address.
  * Optionally, on secondary address promotion consider the addresses
  * from subnet iprim as deleted, even if they are in device list.
@@ -968,7 +996,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
 		if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
 			fib_magic(RTM_DELROUTE,
 				  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
-				  any, ifa->ifa_prefixlen, prim);
+				  any, ifa->ifa_prefixlen, prim, 0);
 		subnet = 1;
 	}
 
@@ -1052,17 +1080,20 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
 
 no_promotions:
 	if (!(ok & BRD_OK))
-		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
+		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
+			  prim, 0);
 	if (subnet && ifa->ifa_prefixlen < 31) {
 		if (!(ok & BRD1_OK))
-			fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
+			fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32,
+				  prim, 0);
 		if (!(ok & BRD0_OK))
-			fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
+			fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32,
+				  prim, 0);
 	}
 	if (!(ok & LOCAL_OK)) {
 		unsigned int addr_type;
 
-		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
+		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim, 0);
 
 		/* Check, that this local address finally disappeared. */
 		addr_type = inet_addr_type_dev_table(dev_net(dev), dev,
-- 
cgit v1.2.3-59-g8ed1b


From 8308f3ff1753d001f7a73f9bb0f02292b5400557 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sun, 27 May 2018 08:09:58 -0700
Subject: net/ipv6: Add support for specifying metric of connected routes

Add support for IFA_RT_PRIORITY to ipv6 addresses.

If the metric is changed on an existing address then the new route
is inserted before removing the old one. Since the metric is one
of the route keys, the prefix route can not be atomically replaced.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/addrconf.h |  1 +
 include/net/if_inet6.h |  1 +
 net/ipv6/addrconf.c    | 93 ++++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 77 insertions(+), 18 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index f766af2cd1a4..5f43f7a70fe6 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -65,6 +65,7 @@ struct ifa6_config {
 
 	const struct in6_addr	*peer_pfx;
 
+	u32			rt_priority;
 	u32			ifa_flags;
 	u32			preferred_lft;
 	u32			valid_lft;
diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index db389253dc2a..d7578cf49c3a 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -42,6 +42,7 @@ enum {
 struct inet6_ifaddr {
 	struct in6_addr		addr;
 	__u32			prefix_len;
+	__u32			rt_priority;
 
 	/* In seconds, relative to tstamp. Expiry is at tstamp + HZ * lft. */
 	__u32			valid_lft;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 1b1e63a4520b..f09afc232da4 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1056,6 +1056,7 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg,
 	INIT_HLIST_NODE(&ifa->addr_lst);
 	ifa->scope = cfg->scope;
 	ifa->prefix_len = cfg->plen;
+	ifa->rt_priority = cfg->rt_priority;
 	ifa->flags = cfg->ifa_flags;
 	/* No need to add the TENTATIVE flag for addresses with NODAD */
 	if (!(cfg->ifa_flags & IFA_F_NODAD))
@@ -1356,6 +1357,7 @@ retry:
 
 	cfg.pfx = &addr;
 	cfg.scope = ipv6_addr_scope(cfg.pfx);
+	cfg.rt_priority = 0;
 
 	ift = ipv6_add_addr(idev, &cfg, block, NULL);
 	if (IS_ERR(ift)) {
@@ -2314,12 +2316,13 @@ static void  ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpad
  */
 
 static void
-addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
-		      unsigned long expires, u32 flags, gfp_t gfp_flags)
+addrconf_prefix_route(struct in6_addr *pfx, int plen, u32 metric,
+		      struct net_device *dev, unsigned long expires,
+		      u32 flags, gfp_t gfp_flags)
 {
 	struct fib6_config cfg = {
 		.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX,
-		.fc_metric = IP6_RT_PRIO_ADDRCONF,
+		.fc_metric = metric ? : IP6_RT_PRIO_ADDRCONF,
 		.fc_ifindex = dev->ifindex,
 		.fc_expires = expires,
 		.fc_dst_len = plen,
@@ -2683,7 +2686,8 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
 				expires = jiffies_to_clock_t(rt_expires);
 			}
 			addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len,
-					      dev, expires, flags, GFP_ATOMIC);
+					      0, dev, expires, flags,
+					      GFP_ATOMIC);
 		}
 		fib6_info_release(rt);
 	}
@@ -2891,8 +2895,9 @@ static int inet6_addr_add(struct net *net, int ifindex,
 	ifp = ipv6_add_addr(idev, cfg, true, extack);
 	if (!IS_ERR(ifp)) {
 		if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) {
-			addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev,
-					      expires, flags, GFP_KERNEL);
+			addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
+					      ifp->rt_priority, dev, expires,
+					      flags, GFP_KERNEL);
 		}
 
 		/* Send a netlink notification if DAD is enabled and
@@ -3056,7 +3061,7 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
 
 	if (addr.s6_addr32[3]) {
 		add_addr(idev, &addr, plen, scope);
-		addrconf_prefix_route(&addr, plen, idev->dev, 0, pflags,
+		addrconf_prefix_route(&addr, plen, 0, idev->dev, 0, pflags,
 				      GFP_ATOMIC);
 		return;
 	}
@@ -3081,8 +3086,8 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
 				}
 
 				add_addr(idev, &addr, plen, flag);
-				addrconf_prefix_route(&addr, plen, idev->dev, 0,
-						      pflags, GFP_ATOMIC);
+				addrconf_prefix_route(&addr, plen, 0, idev->dev,
+						      0, pflags, GFP_ATOMIC);
 			}
 		}
 	}
@@ -3128,7 +3133,7 @@ void addrconf_add_linklocal(struct inet6_dev *idev,
 
 	ifp = ipv6_add_addr(idev, &cfg, true, NULL);
 	if (!IS_ERR(ifp)) {
-		addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev,
+		addrconf_prefix_route(&ifp->addr, ifp->prefix_len, 0, idev->dev,
 				      0, 0, GFP_ATOMIC);
 		addrconf_dad_start(ifp);
 		in6_ifa_put(ifp);
@@ -3244,7 +3249,7 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
 			addrconf_add_linklocal(idev, &addr,
 					       IFA_F_STABLE_PRIVACY);
 		else if (prefix_route)
-			addrconf_prefix_route(&addr, 64, idev->dev,
+			addrconf_prefix_route(&addr, 64, 0, idev->dev,
 					      0, 0, GFP_KERNEL);
 		break;
 	case IN6_ADDR_GEN_MODE_EUI64:
@@ -3255,7 +3260,7 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
 		if (ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) == 0)
 			addrconf_add_linklocal(idev, &addr, 0);
 		else if (prefix_route)
-			addrconf_prefix_route(&addr, 64, idev->dev,
+			addrconf_prefix_route(&addr, 64, 0, idev->dev,
 					      0, 0, GFP_KERNEL);
 		break;
 	case IN6_ADDR_GEN_MODE_NONE:
@@ -3375,7 +3380,8 @@ static int fixup_permanent_addr(struct net *net,
 
 	if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) {
 		addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
-				      idev->dev, 0, 0, GFP_ATOMIC);
+				      ifp->rt_priority, idev->dev, 0, 0,
+				      GFP_ATOMIC);
 	}
 
 	if (ifp->state == INET6_IFADDR_STATE_PREDAD)
@@ -4495,6 +4501,7 @@ static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = {
 	[IFA_LOCAL]		= { .len = sizeof(struct in6_addr) },
 	[IFA_CACHEINFO]		= { .len = sizeof(struct ifa_cacheinfo) },
 	[IFA_FLAGS]		= { .len = sizeof(u32) },
+	[IFA_RT_PRIORITY]	= { .len = sizeof(u32) },
 };
 
 static int
@@ -4527,6 +4534,37 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
 			      ifm->ifa_prefixlen);
 }
 
+static int modify_prefix_route(struct inet6_ifaddr *ifp,
+			       unsigned long expires, u32 flags)
+{
+	struct fib6_info *f6i;
+
+	f6i = addrconf_get_prefix_route(&ifp->addr,
+					ifp->prefix_len,
+					ifp->idev->dev,
+					0, RTF_GATEWAY | RTF_DEFAULT);
+	if (!f6i)
+		return -ENOENT;
+
+	if (f6i->fib6_metric != ifp->rt_priority) {
+		/* add new one */
+		addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
+				      ifp->rt_priority, ifp->idev->dev,
+				      expires, flags, GFP_KERNEL);
+		/* delete old one */
+		ip6_del_rt(dev_net(ifp->idev->dev), f6i);
+	} else {
+		if (!expires)
+			fib6_clean_expires(f6i);
+		else
+			fib6_set_expires(f6i, expires);
+
+		fib6_info_release(f6i);
+	}
+
+	return 0;
+}
+
 static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
 {
 	u32 flags;
@@ -4577,14 +4615,25 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
 	ifp->valid_lft = cfg->valid_lft;
 	ifp->prefered_lft = cfg->preferred_lft;
 
+	if (cfg->rt_priority && cfg->rt_priority != ifp->rt_priority)
+		ifp->rt_priority = cfg->rt_priority;
+
 	spin_unlock_bh(&ifp->lock);
 	if (!(ifp->flags&IFA_F_TENTATIVE))
 		ipv6_ifa_notify(0, ifp);
 
 	if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) {
-		addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
-				      ifp->idev->dev, expires, flags,
-				      GFP_KERNEL);
+		int rc = -ENOENT;
+
+		if (had_prefixroute)
+			rc = modify_prefix_route(ifp, expires, flags);
+
+		/* prefix route could have been deleted; if so restore it */
+		if (rc == -ENOENT) {
+			addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
+					      ifp->rt_priority, ifp->idev->dev,
+					      expires, flags, GFP_KERNEL);
+		}
 	} else if (had_prefixroute) {
 		enum cleanup_prefix_rt_t action;
 		unsigned long rt_expires;
@@ -4643,6 +4692,9 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	cfg.peer_pfx = peer_pfx;
 	cfg.plen = ifm->ifa_prefixlen;
+	if (tb[IFA_RT_PRIORITY])
+		cfg.rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]);
+
 	cfg.valid_lft = INFINITY_LIFE_TIME;
 	cfg.preferred_lft = INFINITY_LIFE_TIME;
 
@@ -4745,7 +4797,8 @@ static inline int inet6_ifaddr_msgsize(void)
 	       + nla_total_size(16) /* IFA_LOCAL */
 	       + nla_total_size(16) /* IFA_ADDRESS */
 	       + nla_total_size(sizeof(struct ifa_cacheinfo))
-	       + nla_total_size(4)  /* IFA_FLAGS */;
+	       + nla_total_size(4)  /* IFA_FLAGS */
+	       + nla_total_size(4)  /* IFA_RT_PRIORITY */;
 }
 
 static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
@@ -4791,6 +4844,10 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
 		if (nla_put_in6_addr(skb, IFA_ADDRESS, &ifa->addr) < 0)
 			goto error;
 
+	if (ifa->rt_priority &&
+	    nla_put_u32(skb, IFA_RT_PRIORITY, ifa->rt_priority))
+		goto error;
+
 	if (put_cacheinfo(skb, ifa->cstamp, ifa->tstamp, preferred, valid) < 0)
 		goto error;
 
@@ -5635,7 +5692,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 		if (ifp->idev->cnf.forwarding)
 			addrconf_join_anycast(ifp);
 		if (!ipv6_addr_any(&ifp->peer_addr))
-			addrconf_prefix_route(&ifp->peer_addr, 128,
+			addrconf_prefix_route(&ifp->peer_addr, 128, 0,
 					      ifp->idev->dev, 0, 0,
 					      GFP_ATOMIC);
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From d69faad76584c4735911d6dbdc8b9ff672b3f763 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sun, 27 May 2018 08:09:59 -0700
Subject: selftests: fib_tests: Add prefix route tests with metric

Add tests verifying prefix routes are inserted with expected metric.

IPv6 prefix route tests
    TEST: Default metric                                      [ OK ]
    TEST: User specified metric on first device               [ OK ]
    TEST: User specified metric on second device              [ OK ]
    TEST: Delete of address on first device                   [ OK ]
    TEST: Modify metric of address                            [ OK ]
    TEST: Prefix route removed on link down                   [ OK ]
    TEST: Prefix route with metric on link up                 [ OK ]

IPv4 prefix route tests
    TEST: Default metric                                      [ OK ]
    TEST: User specified metric on first device               [ OK ]
    TEST: User specified metric on second device              [ OK ]
    TEST: Delete of address on first device                   [ OK ]
    TEST: Modify metric of address                            [ OK ]
    TEST: Prefix route removed on link down                   [ OK ]
    TEST: Prefix route with metric on link up                 [ OK ]

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/fib_tests.sh | 181 ++++++++++++++++++++++++++++++-
 1 file changed, 180 insertions(+), 1 deletion(-)
 mode change 100755 => 100644 tools/testing/selftests/net/fib_tests.sh

diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh
old mode 100755
new mode 100644
index e7d76fbc36e9..9780c5a2d73f
--- a/tools/testing/selftests/net/fib_tests.sh
+++ b/tools/testing/selftests/net/fib_tests.sh
@@ -6,7 +6,8 @@
 
 ret=0
 
-TESTS="unregister down carrier nexthop ipv6_rt ipv4_rt"
+# all tests in this script. Can be overridden with -t option
+TESTS="unregister down carrier nexthop ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric"
 VERBOSE=0
 PAUSE_ON_FAIL=no
 PAUSE=no
@@ -642,6 +643,8 @@ check_route6()
 	local rc=0
 
 	out=$($IP -6 ro ls match ${pfx} | sed -e 's/ pref medium//')
+	[ "${out}" = "${expected}" ] && return 0
+
 	if [ -z "${out}" ]; then
 		if [ "$VERBOSE" = "1" ]; then
 			printf "\nNo route entry found\n"
@@ -911,6 +914,98 @@ ipv6_route_test()
 	route_cleanup
 }
 
+ip_addr_metric_check()
+{
+	ip addr help 2>&1 | grep -q metric
+	if [ $? -ne 0 ]; then
+		echo "iproute2 command does not support metric for addresses. Skipping test"
+		return 1
+	fi
+
+	return 0
+}
+
+ipv6_addr_metric_test()
+{
+	local rc
+
+	echo
+	echo "IPv6 prefix route tests"
+
+	ip_addr_metric_check || return 1
+
+	setup
+
+	set -e
+	$IP li add dummy1 type dummy
+	$IP li add dummy2 type dummy
+	$IP li set dummy1 up
+	$IP li set dummy2 up
+
+	# default entry is metric 256
+	run_cmd "$IP -6 addr add dev dummy1 2001:db8:104::1/64"
+	run_cmd "$IP -6 addr add dev dummy2 2001:db8:104::2/64"
+	set +e
+
+	check_route6 "2001:db8:104::/64 dev dummy1 proto kernel metric 256 2001:db8:104::/64 dev dummy2 proto kernel metric 256"
+	log_test $? 0 "Default metric"
+
+	set -e
+	run_cmd "$IP -6 addr flush dev dummy1"
+	run_cmd "$IP -6 addr add dev dummy1 2001:db8:104::1/64 metric 257"
+	set +e
+
+	check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 256 2001:db8:104::/64 dev dummy1 proto kernel metric 257"
+	log_test $? 0 "User specified metric on first device"
+
+	set -e
+	run_cmd "$IP -6 addr flush dev dummy2"
+	run_cmd "$IP -6 addr add dev dummy2 2001:db8:104::2/64 metric 258"
+	set +e
+
+	check_route6 "2001:db8:104::/64 dev dummy1 proto kernel metric 257 2001:db8:104::/64 dev dummy2 proto kernel metric 258"
+	log_test $? 0 "User specified metric on second device"
+
+	run_cmd "$IP -6 addr del dev dummy1 2001:db8:104::1/64 metric 257"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 258"
+		rc=$?
+	fi
+	log_test $rc 0 "Delete of address on first device"
+
+	run_cmd "$IP -6 addr change dev dummy2 2001:db8:104::2/64 metric 259"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 259"
+		rc=$?
+	fi
+	log_test $rc 0 "Modify metric of address"
+
+	# verify prefix route removed on down
+	run_cmd "ip netns exec testns sysctl -qw net.ipv6.conf.all.keep_addr_on_down=1"
+	run_cmd "$IP li set dev dummy2 down"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		check_route6 ""
+		rc=$?
+	fi
+	log_test $rc 0 "Prefix route removed on link down"
+
+	# verify prefix route re-inserted with assigned metric
+	run_cmd "$IP li set dev dummy2 up"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 259"
+		rc=$?
+	fi
+	log_test $rc 0 "Prefix route with metric on link up"
+
+	$IP li del dummy1
+	$IP li del dummy2
+	cleanup
+}
+
 # add route for a prefix, flushing any existing routes first
 # expected to be the first step of a test
 add_route()
@@ -955,6 +1050,8 @@ check_route()
 	local rc=0
 
 	out=$($IP ro ls match ${pfx})
+	[ "${out}" = "${expected}" ] && return 0
+
 	if [ -z "${out}" ]; then
 		if [ "$VERBOSE" = "1" ]; then
 			printf "\nNo route entry found\n"
@@ -1181,6 +1278,86 @@ ipv4_route_test()
 	route_cleanup
 }
 
+ipv4_addr_metric_test()
+{
+	local rc
+
+	echo
+	echo "IPv4 prefix route tests"
+
+	ip_addr_metric_check || return 1
+
+	setup
+
+	set -e
+	$IP li add dummy1 type dummy
+	$IP li add dummy2 type dummy
+	$IP li set dummy1 up
+	$IP li set dummy2 up
+
+	# default entry is metric 256
+	run_cmd "$IP addr add dev dummy1 172.16.104.1/24"
+	run_cmd "$IP addr add dev dummy2 172.16.104.2/24"
+	set +e
+
+	check_route "172.16.104.0/24 dev dummy1 proto kernel scope link src 172.16.104.1 172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2"
+	log_test $? 0 "Default metric"
+
+	set -e
+	run_cmd "$IP addr flush dev dummy1"
+	run_cmd "$IP addr add dev dummy1 172.16.104.1/24 metric 257"
+	set +e
+
+	check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 172.16.104.0/24 dev dummy1 proto kernel scope link src 172.16.104.1 metric 257"
+	log_test $? 0 "User specified metric on first device"
+
+	set -e
+	run_cmd "$IP addr flush dev dummy2"
+	run_cmd "$IP addr add dev dummy2 172.16.104.2/24 metric 258"
+	set +e
+
+	check_route "172.16.104.0/24 dev dummy1 proto kernel scope link src 172.16.104.1 metric 257 172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 258"
+	log_test $? 0 "User specified metric on second device"
+
+	run_cmd "$IP addr del dev dummy1 172.16.104.1/24 metric 257"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 258"
+		rc=$?
+	fi
+	log_test $rc 0 "Delete of address on first device"
+
+	run_cmd "$IP addr change dev dummy2 172.16.104.2/24 metric 259"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 259"
+		rc=$?
+	fi
+	log_test $rc 0 "Modify metric of address"
+
+	# verify prefix route removed on down
+	run_cmd "$IP li set dev dummy2 down"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		check_route ""
+		rc=$?
+	fi
+	log_test $rc 0 "Prefix route removed on link down"
+
+	# verify prefix route re-inserted with assigned metric
+	run_cmd "$IP li set dev dummy2 up"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 259"
+		rc=$?
+	fi
+	log_test $rc 0 "Prefix route with metric on link up"
+
+	$IP li del dummy1
+	$IP li del dummy2
+	cleanup
+}
+
 ################################################################################
 # usage
 
@@ -1245,6 +1422,8 @@ do
 	fib_nexthop_test|nexthop)	fib_nexthop_test;;
 	ipv6_route_test|ipv6_rt)	ipv6_route_test;;
 	ipv4_route_test|ipv4_rt)	ipv4_route_test;;
+	ipv6_addr_metric)		ipv6_addr_metric_test;;
+	ipv4_addr_metric)		ipv4_addr_metric_test;;
 
 	help) echo "Test names: $TESTS"; exit 0;;
 	esac
-- 
cgit v1.2.3-59-g8ed1b


From a491b9321d486e69040b1ea186ad9b9a3a89f338 Mon Sep 17 00:00:00 2001
From: Atul Gupta <atul.gupta@chelsio.com>
Date: Sun, 27 May 2018 21:15:18 +0530
Subject: crypto:chtls: key len correction

corrected the key length to copy 128b key. Removed 192b and 256b
key as user input supports key of size 128b in gcm_ctx

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Atul Gupta <atul.gupta@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/crypto/chelsio/chtls/chtls_hw.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/crypto/chelsio/chtls/chtls_hw.c b/drivers/crypto/chelsio/chtls/chtls_hw.c
index 54a13aa99121..55d50140f9e5 100644
--- a/drivers/crypto/chelsio/chtls/chtls_hw.c
+++ b/drivers/crypto/chelsio/chtls/chtls_hw.c
@@ -213,7 +213,7 @@ static int chtls_key_info(struct chtls_sock *csk,
 			  struct _key_ctx *kctx,
 			  u32 keylen, u32 optname)
 {
-	unsigned char key[CHCR_KEYCTX_CIPHER_KEY_SIZE_256];
+	unsigned char key[AES_KEYSIZE_128];
 	struct tls12_crypto_info_aes_gcm_128 *gcm_ctx;
 	unsigned char ghash_h[AEAD_H_SIZE];
 	struct crypto_cipher *cipher;
@@ -228,10 +228,6 @@ static int chtls_key_info(struct chtls_sock *csk,
 
 	if (keylen == AES_KEYSIZE_128) {
 		ck_size = CHCR_KEYCTX_CIPHER_KEY_SIZE_128;
-	} else if (keylen == AES_KEYSIZE_192) {
-		ck_size = CHCR_KEYCTX_CIPHER_KEY_SIZE_192;
-	} else if (keylen == AES_KEYSIZE_256) {
-		ck_size = CHCR_KEYCTX_CIPHER_KEY_SIZE_256;
 	} else {
 		pr_err("GCM: Invalid key length %d\n", keylen);
 		return -EINVAL;
-- 
cgit v1.2.3-59-g8ed1b


From b7af47b33008c3558bd107de647be6f3e6c0dfc6 Mon Sep 17 00:00:00 2001
From: Atul Gupta <atul.gupta@chelsio.com>
Date: Sun, 27 May 2018 21:15:19 +0530
Subject: crypto: chtls: wait for memory sendmsg, sendpage

address suspicious code <gustavo@embeddedor.com>

1210       set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1211       }

The issue is that in the code above, set_bit is never reached
due to the 'continue' statement at line 1208.

Also reported by bug report:<dan.carpenter@oracle.com>
1210       set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Not reachable.

Its required to wait for buffer in the send path and takes care of
unaddress and un-handled SOCK_NOSPACE.

v2: use csk_mem_free where appropriate
    proper indent of goto do_nonblock
    replace out with do_rm_wq

Reported-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Atul Gupta <atul.gupta@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/crypto/chelsio/chtls/chtls.h      |  1 +
 drivers/crypto/chelsio/chtls/chtls_io.c   | 90 +++++++++++++++++++++++++++++--
 drivers/crypto/chelsio/chtls/chtls_main.c |  1 +
 3 files changed, 89 insertions(+), 3 deletions(-)

diff --git a/drivers/crypto/chelsio/chtls/chtls.h b/drivers/crypto/chelsio/chtls/chtls.h
index f4b8f1ec0061..778c1946cc11 100644
--- a/drivers/crypto/chelsio/chtls/chtls.h
+++ b/drivers/crypto/chelsio/chtls/chtls.h
@@ -149,6 +149,7 @@ struct chtls_dev {
 	struct list_head rcu_node;
 	struct list_head na_node;
 	unsigned int send_page_order;
+	int max_host_sndbuf;
 	struct key_map kmap;
 };
 
diff --git a/drivers/crypto/chelsio/chtls/chtls_io.c b/drivers/crypto/chelsio/chtls/chtls_io.c
index 5a75be43950f..b6f2607f624e 100644
--- a/drivers/crypto/chelsio/chtls/chtls_io.c
+++ b/drivers/crypto/chelsio/chtls/chtls_io.c
@@ -914,6 +914,78 @@ static u16 tls_header_read(struct tls_hdr *thdr, struct iov_iter *from)
 	return (__force u16)cpu_to_be16(thdr->length);
 }
 
+static int csk_mem_free(struct chtls_dev *cdev, struct sock *sk)
+{
+	return (cdev->max_host_sndbuf - sk->sk_wmem_queued);
+}
+
+static int csk_wait_memory(struct chtls_dev *cdev,
+			   struct sock *sk, long *timeo_p)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	int sndbuf, err = 0;
+	long current_timeo;
+	long vm_wait = 0;
+	bool noblock;
+
+	current_timeo = *timeo_p;
+	noblock = (*timeo_p ? false : true);
+	sndbuf = cdev->max_host_sndbuf;
+	if (csk_mem_free(cdev, sk)) {
+		current_timeo = (prandom_u32() % (HZ / 5)) + 2;
+		vm_wait = (prandom_u32() % (HZ / 5)) + 2;
+	}
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	while (1) {
+		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+
+		if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+			goto do_error;
+		if (!*timeo_p) {
+			if (noblock)
+				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+			goto do_nonblock;
+		}
+		if (signal_pending(current))
+			goto do_interrupted;
+		sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+		if (csk_mem_free(cdev, sk) && !vm_wait)
+			break;
+
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		sk->sk_write_pending++;
+		sk_wait_event(sk, &current_timeo, sk->sk_err ||
+			      (sk->sk_shutdown & SEND_SHUTDOWN) ||
+			      (csk_mem_free(cdev, sk) && !vm_wait), &wait);
+		sk->sk_write_pending--;
+
+		if (vm_wait) {
+			vm_wait -= current_timeo;
+			current_timeo = *timeo_p;
+			if (current_timeo != MAX_SCHEDULE_TIMEOUT) {
+				current_timeo -= vm_wait;
+				if (current_timeo < 0)
+					current_timeo = 0;
+			}
+			vm_wait = 0;
+		}
+		*timeo_p = current_timeo;
+	}
+do_rm_wq:
+	remove_wait_queue(sk_sleep(sk), &wait);
+	return err;
+do_error:
+	err = -EPIPE;
+	goto do_rm_wq;
+do_nonblock:
+	err = -EAGAIN;
+	goto do_rm_wq;
+do_interrupted:
+	err = sock_intr_errno(*timeo_p);
+	goto do_rm_wq;
+}
+
 int chtls_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 {
 	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
@@ -952,6 +1024,8 @@ int chtls_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 			copy = mss - skb->len;
 			skb->ip_summed = CHECKSUM_UNNECESSARY;
 		}
+		if (!csk_mem_free(cdev, sk))
+			goto wait_for_sndbuf;
 
 		if (is_tls_tx(csk) && !csk->tlshws.txleft) {
 			struct tls_hdr hdr;
@@ -1099,8 +1173,10 @@ copy:
 		if (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND)
 			push_frames_if_head(sk);
 		continue;
+wait_for_sndbuf:
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 wait_for_memory:
-		err = sk_stream_wait_memory(sk, &timeo);
+		err = csk_wait_memory(cdev, sk, &timeo);
 		if (err)
 			goto do_error;
 	}
@@ -1131,6 +1207,7 @@ int chtls_sendpage(struct sock *sk, struct page *page,
 		   int offset, size_t size, int flags)
 {
 	struct chtls_sock *csk;
+	struct chtls_dev *cdev;
 	int mss, err, copied;
 	struct tcp_sock *tp;
 	long timeo;
@@ -1138,6 +1215,7 @@ int chtls_sendpage(struct sock *sk, struct page *page,
 	tp = tcp_sk(sk);
 	copied = 0;
 	csk = rcu_dereference_sk_user_data(sk);
+	cdev = csk->cdev;
 	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 
 	err = sk_stream_wait_connect(sk, &timeo);
@@ -1156,6 +1234,8 @@ int chtls_sendpage(struct sock *sk, struct page *page,
 		if (!skb || (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND) ||
 		    copy <= 0) {
 new_buf:
+			if (!csk_mem_free(cdev, sk))
+				goto wait_for_sndbuf;
 
 			if (is_tls_tx(csk)) {
 				skb = get_record_skb(sk,
@@ -1167,7 +1247,7 @@ new_buf:
 				skb = get_tx_skb(sk, 0);
 			}
 			if (!skb)
-				goto do_error;
+				goto wait_for_memory;
 			copy = mss;
 		}
 		if (copy > size)
@@ -1206,8 +1286,12 @@ new_buf:
 		if (unlikely(ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND))
 			push_frames_if_head(sk);
 		continue;
-
+wait_for_sndbuf:
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+		err = csk_wait_memory(cdev, sk, &timeo);
+		if (err)
+			goto do_error;
 	}
 out:
 	csk_reset_flag(csk, CSK_TX_MORE_DATA);
diff --git a/drivers/crypto/chelsio/chtls/chtls_main.c b/drivers/crypto/chelsio/chtls/chtls_main.c
index 007c45c38fc7..f3eafbde2ffc 100644
--- a/drivers/crypto/chelsio/chtls/chtls_main.c
+++ b/drivers/crypto/chelsio/chtls/chtls_main.c
@@ -239,6 +239,7 @@ static void *chtls_uld_add(const struct cxgb4_lld_info *info)
 	spin_lock_init(&cdev->idr_lock);
 	cdev->send_page_order = min_t(uint, get_order(32768),
 				      send_page_order);
+	cdev->max_host_sndbuf = 48 * 1024;
 
 	if (lldi->vr->key.size)
 		if (chtls_init_kmap(cdev, lldi))
-- 
cgit v1.2.3-59-g8ed1b


From 53bcfb68c47dc0f2f1f6a2331dffdc9fbb98b116 Mon Sep 17 00:00:00 2001
From: Atul Gupta <atul.gupta@chelsio.com>
Date: Sun, 27 May 2018 21:15:20 +0530
Subject: crypto: chtls: dereference null variable

skb dereferenced before check in sendpage

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Atul Gupta <atul.gupta@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/crypto/chelsio/chtls/chtls_io.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/crypto/chelsio/chtls/chtls_io.c b/drivers/crypto/chelsio/chtls/chtls_io.c
index b6f2607f624e..45107726308f 100644
--- a/drivers/crypto/chelsio/chtls/chtls_io.c
+++ b/drivers/crypto/chelsio/chtls/chtls_io.c
@@ -1230,9 +1230,8 @@ int chtls_sendpage(struct sock *sk, struct page *page,
 		struct sk_buff *skb = skb_peek_tail(&csk->txq);
 		int copy, i;
 
-		copy = mss - skb->len;
 		if (!skb || (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND) ||
-		    copy <= 0) {
+		    (copy = mss - skb->len) <= 0) {
 new_buf:
 			if (!csk_mem_free(cdev, sk))
 				goto wait_for_sndbuf;
-- 
cgit v1.2.3-59-g8ed1b


From 82a1052679cbd2ac9956e61465e89815d15690e9 Mon Sep 17 00:00:00 2001
From: Atul Gupta <atul.gupta@chelsio.com>
Date: Sun, 27 May 2018 21:15:21 +0530
Subject: crypto: chtls: kbuild warnings

- unindented continue
- check for null page
- signed return

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Atul Gupta <atul.gupta@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/crypto/chelsio/chtls/chtls_io.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/crypto/chelsio/chtls/chtls_io.c b/drivers/crypto/chelsio/chtls/chtls_io.c
index 45107726308f..fdd92186fd2a 100644
--- a/drivers/crypto/chelsio/chtls/chtls_io.c
+++ b/drivers/crypto/chelsio/chtls/chtls_io.c
@@ -907,11 +907,11 @@ static int chtls_skb_copy_to_page_nocache(struct sock *sk,
 }
 
 /* Read TLS header to find content type and data length */
-static u16 tls_header_read(struct tls_hdr *thdr, struct iov_iter *from)
+static int tls_header_read(struct tls_hdr *thdr, struct iov_iter *from)
 {
 	if (copy_from_iter(thdr, sizeof(*thdr), from) != sizeof(*thdr))
 		return -EFAULT;
-	return (__force u16)cpu_to_be16(thdr->length);
+	return (__force int)cpu_to_be16(thdr->length);
 }
 
 static int csk_mem_free(struct chtls_dev *cdev, struct sock *sk)
@@ -1083,9 +1083,10 @@ new_buf:
 			int off = TCP_OFF(sk);
 			bool merge;
 
-			if (page)
-				pg_size <<= compound_order(page);
+			if (!page)
+				goto wait_for_memory;
 
+			pg_size <<= compound_order(page);
 			if (off < pg_size &&
 			    skb_can_coalesce(skb, i, page, off)) {
 				merge = 1;
@@ -1492,7 +1493,7 @@ static int chtls_pt_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 			break;
 		chtls_cleanup_rbuf(sk, copied);
 		sk_wait_data(sk, &timeo, NULL);
-			continue;
+		continue;
 found_ok_skb:
 		if (!skb->len) {
 			skb_dst_set(skb, NULL);
-- 
cgit v1.2.3-59-g8ed1b


From 06c14de5a25d8d8a9b3d1b2ba50040cc39db2ccf Mon Sep 17 00:00:00 2001
From: Atul Gupta <atul.gupta@chelsio.com>
Date: Sun, 27 May 2018 21:15:22 +0530
Subject: crypto: chtls: free beyond end rspq_skb_cache

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Atul Gupta <atul.gupta@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/crypto/chelsio/chtls/chtls_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/crypto/chelsio/chtls/chtls_main.c b/drivers/crypto/chelsio/chtls/chtls_main.c
index f3eafbde2ffc..1a6cbb2a915e 100644
--- a/drivers/crypto/chelsio/chtls/chtls_main.c
+++ b/drivers/crypto/chelsio/chtls/chtls_main.c
@@ -251,7 +251,7 @@ static void *chtls_uld_add(const struct cxgb4_lld_info *info)
 
 	return cdev;
 out_rspq_skb:
-	for (j = 0; j <= i; j++)
+	for (j = 0; j < i; j++)
 		kfree_skb(cdev->rspq_skb_cache[j]);
 	kfree_skb(cdev->askb);
 out_skb:
-- 
cgit v1.2.3-59-g8ed1b


From 2d68c0745a0e8349ff71b04af5ee020d40f78d90 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Fri, 25 May 2018 18:14:05 +0800
Subject: tcp: use data length instead of skb->len in tcp_probe

skb->len is meaningless to user.
data length could be more helpful, with which we can easily filter out
the packet without payload.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/tcp.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index c1a5284713b8..703abb6e11fa 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -236,7 +236,7 @@ TRACE_EVENT(tcp_probe,
 		__field(__u16, sport)
 		__field(__u16, dport)
 		__field(__u32, mark)
-		__field(__u16, length)
+		__field(__u16, data_len)
 		__field(__u32, snd_nxt)
 		__field(__u32, snd_una)
 		__field(__u32, snd_cwnd)
@@ -261,7 +261,7 @@ TRACE_EVENT(tcp_probe,
 		__entry->dport = ntohs(inet->inet_dport);
 		__entry->mark = skb->mark;
 
-		__entry->length = skb->len;
+		__entry->data_len = skb->len - tcp_hdrlen(skb);
 		__entry->snd_nxt = tp->snd_nxt;
 		__entry->snd_una = tp->snd_una;
 		__entry->snd_cwnd = tp->snd_cwnd;
@@ -272,9 +272,9 @@ TRACE_EVENT(tcp_probe,
 		__entry->sock_cookie = sock_gen_cookie(sk);
 	),
 
-	TP_printk("src=%pISpc dest=%pISpc mark=%#x length=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx",
+	TP_printk("src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx",
 		  __entry->saddr, __entry->daddr, __entry->mark,
-		  __entry->length, __entry->snd_nxt, __entry->snd_una,
+		  __entry->data_len, __entry->snd_nxt, __entry->snd_una,
 		  __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd,
 		  __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie)
 );
-- 
cgit v1.2.3-59-g8ed1b


From ae40832e53c33fab2755571dabc1378117bc50d4 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Fri, 25 May 2018 18:17:57 +0800
Subject: bpfilter: fix a build err

gcc-7.3.0 report following err:

  HOSTCC  net/bpfilter/main.o
In file included from net/bpfilter/main.c:9:0:
./include/uapi/linux/bpf.h:12:10: fatal error: linux/bpf_common.h: No such file or directory
 #include <linux/bpf_common.h>

remove it by adding a include path.
Fixes: d2ba09c17a06 ("net: add skeleton of bpfilter kernel module")

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bpfilter/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile
index 2af752c8ef5e..3f3cb87c668f 100644
--- a/net/bpfilter/Makefile
+++ b/net/bpfilter/Makefile
@@ -5,7 +5,7 @@
 
 hostprogs-y := bpfilter_umh
 bpfilter_umh-objs := main.o
-HOSTCFLAGS += -I. -Itools/include/
+HOSTCFLAGS += -I. -Itools/include/ -Itools/include/uapi
 ifeq ($(CONFIG_BPFILTER_UMH), y)
 # builtin bpfilter_umh should be compiled with -static
 # since rootfs isn't mounted at the time of __init
-- 
cgit v1.2.3-59-g8ed1b


From 7a279e93330bd4f0074973293f1cd1aacf7c5fa6 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Tue, 29 May 2018 12:27:44 +0100
Subject: bpf: clean up eBPF helpers documentation

These are minor edits for the eBPF helpers documentation in
include/uapi/linux/bpf.h.

The main fix consists in removing "BPF_FIB_LOOKUP_", because it ends
with a non-escaped underscore that gets interpreted by rst2man and
produces the following message in the resulting manual page:

    DOCUTILS SYSTEM MESSAGES
           System Message: ERROR/3 (/tmp/bpf-helpers.rst:, line 1514)
                  Unknown target name: "bpf_fib_lookup".

Other edits consist in:

- Improving formatting for flag values for "bpf_fib_lookup()" helper.
- Emphasising a parameter name in description of the return value for
  "bpf_get_stack()" helper.
- Removing unnecessary blank lines between "Description" and "Return"
  sections for the few helpers that would use it, for consistency.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h       | 21 ++++++++++-----------
 tools/include/uapi/linux/bpf.h | 21 ++++++++++-----------
 2 files changed, 20 insertions(+), 22 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index cc68787f2d97..3f556b35ac8d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1010,7 +1010,6 @@ union bpf_attr {
  * 		::
  *
  * 			# sysctl kernel.perf_event_max_stack=<new value>
- *
  * 	Return
  * 		The positive or null stack id on success, or a negative error
  * 		in case of failure.
@@ -1821,10 +1820,9 @@ union bpf_attr {
  * 		::
  *
  * 			# sysctl kernel.perf_event_max_stack=<new value>
- *
  * 	Return
- * 		a non-negative value equal to or less than size on success, or
- * 		a negative error in case of failure.
+ * 		A non-negative value equal to or less than *size* on success,
+ * 		or a negative error in case of failure.
  *
  * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header)
  * 	Description
@@ -1845,7 +1843,6 @@ union bpf_attr {
  * 		in socket filters where *skb*\ **->data** does not always point
  * 		to the start of the mac header and where "direct packet access"
  * 		is not available.
- *
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
@@ -1861,16 +1858,18 @@ union bpf_attr {
  *		rt_metric is set to metric from route.
  *
  *             *plen* argument is the size of the passed in struct.
- *             *flags* argument can be one or more BPF_FIB_LOOKUP_ flags:
+ *             *flags* argument can be a combination of one or more of the
+ *             following values:
  *
- *             **BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs
- *             full lookup using FIB rules
- *             **BPF_FIB_LOOKUP_OUTPUT** means do lookup from an egress
- *             perspective (default is ingress)
+ *		**BPF_FIB_LOOKUP_DIRECT**
+ *			Do a direct table lookup vs full lookup using FIB
+ *			rules.
+ *		**BPF_FIB_LOOKUP_OUTPUT**
+ *			Perform lookup from an egress perspective (default is
+ *			ingress).
  *
  *             *ctx* is either **struct xdp_md** for XDP programs or
  *             **struct sk_buff** tc cls_act programs.
- *
  *     Return
  *             Egress device index on success, 0 if packet needs to continue
  *             up the stack for further processing or a negative error in case
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index cc68787f2d97..3f556b35ac8d 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1010,7 +1010,6 @@ union bpf_attr {
  * 		::
  *
  * 			# sysctl kernel.perf_event_max_stack=<new value>
- *
  * 	Return
  * 		The positive or null stack id on success, or a negative error
  * 		in case of failure.
@@ -1821,10 +1820,9 @@ union bpf_attr {
  * 		::
  *
  * 			# sysctl kernel.perf_event_max_stack=<new value>
- *
  * 	Return
- * 		a non-negative value equal to or less than size on success, or
- * 		a negative error in case of failure.
+ * 		A non-negative value equal to or less than *size* on success,
+ * 		or a negative error in case of failure.
  *
  * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header)
  * 	Description
@@ -1845,7 +1843,6 @@ union bpf_attr {
  * 		in socket filters where *skb*\ **->data** does not always point
  * 		to the start of the mac header and where "direct packet access"
  * 		is not available.
- *
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
@@ -1861,16 +1858,18 @@ union bpf_attr {
  *		rt_metric is set to metric from route.
  *
  *             *plen* argument is the size of the passed in struct.
- *             *flags* argument can be one or more BPF_FIB_LOOKUP_ flags:
+ *             *flags* argument can be a combination of one or more of the
+ *             following values:
  *
- *             **BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs
- *             full lookup using FIB rules
- *             **BPF_FIB_LOOKUP_OUTPUT** means do lookup from an egress
- *             perspective (default is ingress)
+ *		**BPF_FIB_LOOKUP_DIRECT**
+ *			Do a direct table lookup vs full lookup using FIB
+ *			rules.
+ *		**BPF_FIB_LOOKUP_OUTPUT**
+ *			Perform lookup from an egress perspective (default is
+ *			ingress).
  *
  *             *ctx* is either **struct xdp_md** for XDP programs or
  *             **struct sk_buff** tc cls_act programs.
- *
  *     Return
  *             Egress device index on success, 0 if packet needs to continue
  *             up the stack for further processing or a negative error in case
-- 
cgit v1.2.3-59-g8ed1b


From 720e7f38bc3c59bf70974af326f7fb871a1ce89c Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Tue, 29 May 2018 10:40:18 +0800
Subject: bpf: hide the unused 'off' variable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The local variable is only used while CONFIG_IPV6 enabled

net/core/filter.c: In function ‘sk_msg_convert_ctx_access’:
net/core/filter.c:6489:6: warning: unused variable ‘off’ [-Wunused-variable]
  int off;
      ^
This puts it into #ifdef.

Fixes: 303def35f64e ("bpf: allow sk_msg programs to read sock fields")
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/core/filter.c b/net/core/filter.c
index 24e6ce8be567..0ce93edefb0e 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6486,7 +6486,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
 				     struct bpf_prog *prog, u32 *target_size)
 {
 	struct bpf_insn *insn = insn_buf;
+#if IS_ENABLED(CONFIG_IPV6)
 	int off;
+#endif
 
 	switch (si->off) {
 	case offsetof(struct sk_msg_md, data):
-- 
cgit v1.2.3-59-g8ed1b


From fa898d769b264ede3c10cc30a537316c6a946956 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 29 May 2018 10:58:07 -0700
Subject: bpf: Drop mpls from bpf_fib_lookup

MPLS support will not be submitted this dev cycle, but in working on it
I do see a few changes are needed to the API. For now, drop mpls from the
API. Since the fields in question are unions, the mpls fields can be added
back later without affecting the uapi.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3f556b35ac8d..33f37eb0b6bf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1852,10 +1852,10 @@ union bpf_attr {
  *		If lookup is successful and result shows packet is to be
  *		forwarded, the neighbor tables are searched for the nexthop.
  *		If successful (ie., FIB lookup shows forwarding and nexthop
- *		is resolved), the nexthop address is returned in ipv4_dst,
- *		ipv6_dst or mpls_out based on family, smac is set to mac
- *		address of egress device, dmac is set to nexthop mac address,
- *		rt_metric is set to metric from route.
+ *		is resolved), the nexthop address is returned in ipv4_dst
+ *		or ipv6_dst based on family, smac is set to mac address of
+ *		egress device, dmac is set to nexthop mac address, rt_metric
+ *		is set to metric from route (IPv4/IPv6 only).
  *
  *             *plen* argument is the size of the passed in struct.
  *             *flags* argument can be a combination of one or more of the
@@ -2537,8 +2537,10 @@ struct bpf_raw_tracepoint_args {
 #define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
 
 struct bpf_fib_lookup {
-	/* input */
-	__u8	family;   /* network family, AF_INET, AF_INET6, AF_MPLS */
+	/* input:  network family for lookup (AF_INET, AF_INET6)
+	 * output: network family of egress nexthop
+	 */
+	__u8	family;
 
 	/* set if lookup is to consider L4 data - e.g., FIB rules */
 	__u8	l4_protocol;
@@ -2554,22 +2556,20 @@ struct bpf_fib_lookup {
 		__u8	tos;		/* AF_INET  */
 		__be32	flowlabel;	/* AF_INET6 */
 
-		/* output: metric of fib result */
-		__u32 rt_metric;
+		/* output: metric of fib result (IPv4/IPv6 only) */
+		__u32	rt_metric;
 	};
 
 	union {
-		__be32		mpls_in;
 		__be32		ipv4_src;
 		__u32		ipv6_src[4];  /* in6_addr; network order */
 	};
 
-	/* input to bpf_fib_lookup, *dst is destination address.
-	 * output: bpf_fib_lookup sets to gateway address
+	/* input to bpf_fib_lookup, ipv{4,6}_dst is destination address in
+	 * network header. output: bpf_fib_lookup sets to gateway address
+	 * if FIB lookup returns gateway route
 	 */
 	union {
-		/* return for MPLS lookups */
-		__be32		mpls_out[4];  /* support up to 4 labels */
 		__be32		ipv4_dst;
 		__u32		ipv6_dst[4];  /* in6_addr; network order */
 	};
-- 
cgit v1.2.3-59-g8ed1b


From 9ce64f192d161acff17c99ceec7d9ce3db9252fa Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 29 May 2018 11:59:13 -0700
Subject: bpf: Verify flags in bpf_fib_lookup

Verify flags argument contains only known flags. Allows programs to probe
for support as more are added.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/core/filter.c b/net/core/filter.c
index 0ce93edefb0e..81bd2e9fe8fc 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4270,6 +4270,9 @@ BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
 	if (plen < sizeof(*params))
 		return -EINVAL;
 
+	if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
+		return -EINVAL;
+
 	switch (params->family) {
 #if IS_ENABLED(CONFIG_INET)
 	case AF_INET:
@@ -4304,6 +4307,9 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
 	if (plen < sizeof(*params))
 		return -EINVAL;
 
+	if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
+		return -EINVAL;
+
 	switch (params->family) {
 #if IS_ENABLED(CONFIG_INET)
 	case AF_INET:
-- 
cgit v1.2.3-59-g8ed1b


From d80c9f88317272b54b0e0ec0dd6332d7aef5deea Mon Sep 17 00:00:00 2001
From: Ilan Tayari <ilant@mellanox.com>
Date: Tue, 25 Jul 2017 08:43:09 +0300
Subject: net/mlx5: FPGA, Add doxygen for access type enum

Add doxygen comments for enum mlx5_fpga_access_type.

Signed-off-by: Ilan Tayari <ilant@mellanox.com>
Signed-off-by: Adi Nissim <adin@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h
index a0573cc2fc9b..656f96be6e20 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h
@@ -44,8 +44,14 @@
 #define SBU_QP_QUEUE_SIZE 8
 #define MLX5_FPGA_CMD_TIMEOUT_MSEC (60 * 1000)
 
+/**
+ * enum mlx5_fpga_access_type - Enumerated the different methods possible for
+ * accessing the device memory address space
+ */
 enum mlx5_fpga_access_type {
+	/** Use the slow CX-FPGA I2C bus */
 	MLX5_FPGA_ACCESS_TYPE_I2C = 0x0,
+	/** Use the fastest available method */
 	MLX5_FPGA_ACCESS_TYPE_DONTCARE = 0x0,
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From d1a15b1a7e75503b178c75476faa689ed86db223 Mon Sep 17 00:00:00 2001
From: Ilan Tayari <ilant@mellanox.com>
Date: Sun, 2 Jul 2017 10:47:24 +0300
Subject: net/mlx5: FPGA, Add device name

Add device name for Mellanox FPGA devices.

Signed-off-by: Ilan Tayari <ilant@mellanox.com>
Signed-off-by: Adi Nissim <adin@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h |  7 +++++++
 .../net/ethernet/mellanox/mlx5/core/fpga/core.c    | 24 +++++++++++++++++++---
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h
index d05233c9b4f6..eb8b0fe0b4e1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h
@@ -35,6 +35,13 @@
 
 #include <linux/mlx5/driver.h>
 
+enum mlx5_fpga_device_id {
+	MLX5_FPGA_DEVICE_UNKNOWN = 0,
+	MLX5_FPGA_DEVICE_KU040 = 1,
+	MLX5_FPGA_DEVICE_KU060 = 2,
+	MLX5_FPGA_DEVICE_KU060_2 = 3,
+};
+
 enum mlx5_fpga_image {
 	MLX5_FPGA_IMAGE_USER = 0,
 	MLX5_FPGA_IMAGE_FACTORY,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
index dc8970346521..8531098a7f19 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
@@ -75,6 +75,21 @@ static const char *mlx5_fpga_image_name(enum mlx5_fpga_image image)
 	}
 }
 
+static const char *mlx5_fpga_device_name(u32 device)
+{
+	switch (device) {
+	case MLX5_FPGA_DEVICE_KU040:
+		return "ku040";
+	case MLX5_FPGA_DEVICE_KU060:
+		return "ku060";
+	case MLX5_FPGA_DEVICE_KU060_2:
+		return "ku060_2";
+	case MLX5_FPGA_DEVICE_UNKNOWN:
+	default:
+		return "unknown";
+	}
+}
+
 static int mlx5_fpga_device_load_check(struct mlx5_fpga_device *fdev)
 {
 	struct mlx5_fpga_query query;
@@ -128,8 +143,9 @@ static int mlx5_fpga_device_brb(struct mlx5_fpga_device *fdev)
 int mlx5_fpga_device_start(struct mlx5_core_dev *mdev)
 {
 	struct mlx5_fpga_device *fdev = mdev->fpga;
-	unsigned long flags;
 	unsigned int max_num_qps;
+	unsigned long flags;
+	u32 fpga_device_id;
 	int err;
 
 	if (!fdev)
@@ -143,8 +159,10 @@ int mlx5_fpga_device_start(struct mlx5_core_dev *mdev)
 	if (err)
 		goto out;
 
-	mlx5_fpga_info(fdev, "device %u; %s image, version %u\n",
-		       MLX5_CAP_FPGA(fdev->mdev, fpga_device),
+	fpga_device_id = MLX5_CAP_FPGA(fdev->mdev, fpga_device);
+	mlx5_fpga_info(fdev, "%s:%u; %s image, version %u\n",
+		       mlx5_fpga_device_name(fpga_device_id),
+		       fpga_device_id,
 		       mlx5_fpga_image_name(fdev->last_oper_image),
 		       MLX5_CAP_FPGA(fdev->mdev, image_version));
 
-- 
cgit v1.2.3-59-g8ed1b


From 98c90f6f103aa72d2ad587b76105834b2b78d031 Mon Sep 17 00:00:00 2001
From: Ilan Tayari <ilant@mellanox.com>
Date: Tue, 4 Jul 2017 12:53:29 +0300
Subject: net/mlx5: FPGA, print SBU identification on init

Add print of the following values on init:
1. ieee vendor id
2. sandbox product id
3. sandbox product version

Signed-off-by: Ilan Tayari <ilant@mellanox.com>
Signed-off-by: Adi Nissim <adin@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
index 8531098a7f19..02319f779a49 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
@@ -160,11 +160,14 @@ int mlx5_fpga_device_start(struct mlx5_core_dev *mdev)
 		goto out;
 
 	fpga_device_id = MLX5_CAP_FPGA(fdev->mdev, fpga_device);
-	mlx5_fpga_info(fdev, "%s:%u; %s image, version %u\n",
+	mlx5_fpga_info(fdev, "%s:%u; %s image, version %u; SBU %06x:%04x version %d\n",
 		       mlx5_fpga_device_name(fpga_device_id),
 		       fpga_device_id,
 		       mlx5_fpga_image_name(fdev->last_oper_image),
-		       MLX5_CAP_FPGA(fdev->mdev, image_version));
+		       MLX5_CAP_FPGA(fdev->mdev, image_version),
+		       MLX5_CAP_FPGA(fdev->mdev, ieee_vendor_id),
+		       MLX5_CAP_FPGA(fdev->mdev, sandbox_product_id),
+		       MLX5_CAP_FPGA(fdev->mdev, sandbox_product_version));
 
 	max_num_qps = MLX5_CAP_FPGA(mdev, shell_caps.max_num_qps);
 	err = mlx5_core_reserve_gids(mdev, max_num_qps);
-- 
cgit v1.2.3-59-g8ed1b


From 90275d809ab982a7fdc1f2ed5b0fe3e4c55a5257 Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@mellanox.com>
Date: Wed, 8 Nov 2017 18:07:17 +0200
Subject: net/mlx5: FPGA, Abort FPGA init if the device reports no QP
 capability

In the case that the reported max number of QPs capability
equals to zero, abort FPGA init.

Signed-off-by: Yevgeny Kliteynik <kliteyn@mellanox.com>
Signed-off-by: Adi Nissim <adin@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
index 02319f779a49..26caa0475985 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
@@ -170,6 +170,12 @@ int mlx5_fpga_device_start(struct mlx5_core_dev *mdev)
 		       MLX5_CAP_FPGA(fdev->mdev, sandbox_product_version));
 
 	max_num_qps = MLX5_CAP_FPGA(mdev, shell_caps.max_num_qps);
+	if (!max_num_qps) {
+		mlx5_fpga_err(fdev, "FPGA reports 0 QPs in SHELL_CAPS\n");
+		err = -ENOTSUPP;
+		goto out;
+	}
+
 	err = mlx5_core_reserve_gids(mdev, max_num_qps);
 	if (err)
 		goto out;
-- 
cgit v1.2.3-59-g8ed1b


From ba869ee0e0f53077f58ce43a9381b4ee0a20cdd6 Mon Sep 17 00:00:00 2001
From: Ilya Lesokhin <ilyal@mellanox.com>
Date: Tue, 14 Nov 2017 10:30:55 +0200
Subject: net/mlx5: FPGA, Properly initialize dma direction on fpga conn send

Properly initialize dma direction on fpga conn send.
Do not rely on dma_dir == 0 (DMA_BIDIRECTIONAL).

Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
index 4e5a5cf25f17..bf84678b21d6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
@@ -181,6 +181,7 @@ int mlx5_fpga_conn_send(struct mlx5_fpga_conn *conn,
 	if (!conn->qp.active)
 		return -ENOTCONN;
 
+	buf->dma_dir = DMA_TO_DEVICE;
 	err = mlx5_fpga_conn_map_buf(conn, buf);
 	if (err)
 		return err;
-- 
cgit v1.2.3-59-g8ed1b


From 36dd4902004c39b7e8a16a7cf1535faa7ecfa921 Mon Sep 17 00:00:00 2001
From: Ilya Lesokhin <ilyal@mellanox.com>
Date: Mon, 28 Aug 2017 09:47:32 +0300
Subject: net/mlx5: FPGA, Call DMA unmap with the right size

When mlx5_fpga_conn_unmap_buf is called buf->sg[0].size
should equal the actual buffer size, not the message size.
Otherwise we will trigger the following dma debug warning
"DMA-API: device driver frees DMA memory with different size"

Fixes: 537a50574175 ('net/mlx5: FPGA, Add high-speed connection routines')
Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
index bf84678b21d6..4138a770ed57 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
@@ -256,8 +256,6 @@ static void mlx5_fpga_conn_rq_cqe(struct mlx5_fpga_conn *conn,
 	ix = be16_to_cpu(cqe->wqe_counter) & (conn->qp.rq.size - 1);
 	buf = conn->qp.rq.bufs[ix];
 	conn->qp.rq.bufs[ix] = NULL;
-	if (!status)
-		buf->sg[0].size = be32_to_cpu(cqe->byte_cnt);
 	conn->qp.rq.cc++;
 
 	if (unlikely(status && (status != MLX5_CQE_SYNDROME_WR_FLUSH_ERR)))
@@ -275,6 +273,7 @@ static void mlx5_fpga_conn_rq_cqe(struct mlx5_fpga_conn *conn,
 		return;
 	}
 
+	buf->sg[0].size = be32_to_cpu(cqe->byte_cnt);
 	mlx5_fpga_dbg(conn->fdev, "Message with %u bytes received successfully\n",
 		      buf->sg[0].size);
 	conn->recv_cb(conn->cb_arg, buf);
-- 
cgit v1.2.3-59-g8ed1b


From 01252a27837a9dd099a6e8cfa3adc4772033a5bf Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Tue, 22 May 2018 20:18:36 +0300
Subject: net/mlx5e: Get the number of offloaded TC rules from the correct
 table

As we keep the offloaded TC rules for NIC and e-switch in two different
places, make sure to return the number of offloaded flows according
to the use-case and not blindly from the priv.

Fixes: 655dc3d2b91b ('net/mlx5e: Use shared table for offloaded TC eswitch flows')
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Reported-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Paul Blakey <paulb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 7 +++++++
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.h | 5 +----
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 9372d914abe5..0edf4751a8ba 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -2876,3 +2876,10 @@ void mlx5e_tc_esw_cleanup(struct rhashtable *tc_ht)
 {
 	rhashtable_free_and_destroy(tc_ht, _mlx5e_tc_del_flow, NULL);
 }
+
+int mlx5e_tc_num_filters(struct mlx5e_priv *priv)
+{
+	struct rhashtable *tc_ht = get_tc_ht(priv);
+
+	return atomic_read(&tc_ht->nelems);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
index 59e52b845beb..49436bf3b80a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
@@ -68,10 +68,7 @@ void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv,
 struct mlx5e_neigh_hash_entry;
 void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe);
 
-static inline int mlx5e_tc_num_filters(struct mlx5e_priv *priv)
-{
-	return atomic_read(&priv->fs.tc.ht.nelems);
-}
+int mlx5e_tc_num_filters(struct mlx5e_priv *priv);
 
 #else /* CONFIG_MLX5_ESWITCH */
 static inline int  mlx5e_tc_nic_init(struct mlx5e_priv *priv) { return 0; }
-- 
cgit v1.2.3-59-g8ed1b


From fabdcc2ecd58500689f4fb73dc77283623d9ab7e Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Mon, 26 Mar 2018 13:07:03 +0300
Subject: iwlwifi: mvm: drop UNKNOWN security type frames

In some cases we may get from FW errored frames with
UNKNOWN security type.

This may happen in unsecured aggregation flow, where
the first packet had a CRC error in the WEP bit, which
was followed by a failure to decrypt and was dropped.

The next frames in the aggregation "inherit" the bad metadata
of the first packet.

Make sure to drop such frames since RADA and other offloads
will not operate correctly which may have unexpected results.

In case of AP it also causes to TX AMSDU frames to the peers,
resulting with assert 0x104B.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/fw/api/rx.h |  1 +
 drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c  | 20 ++++++++++++++++----
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/fw/api/rx.h b/drivers/net/wireless/intel/iwlwifi/fw/api/rx.h
index e7565f37ece9..7e570c4a9df0 100644
--- a/drivers/net/wireless/intel/iwlwifi/fw/api/rx.h
+++ b/drivers/net/wireless/intel/iwlwifi/fw/api/rx.h
@@ -295,6 +295,7 @@ enum iwl_rx_mpdu_status {
 	IWL_RX_MPDU_STATUS_MIC_OK		= BIT(6),
 	IWL_RX_MPDU_RES_STATUS_TTAK_OK		= BIT(7),
 	IWL_RX_MPDU_STATUS_SEC_MASK		= 0x7 << 8,
+	IWL_RX_MPDU_STATUS_SEC_UNKNOWN		= IWL_RX_MPDU_STATUS_SEC_MASK,
 	IWL_RX_MPDU_STATUS_SEC_NONE		= 0x0 << 8,
 	IWL_RX_MPDU_STATUS_SEC_WEP		= 0x1 << 8,
 	IWL_RX_MPDU_STATUS_SEC_CCM		= 0x2 << 8,
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
index bb63e75a9b7f..2b1f0dc73c25 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
@@ -227,12 +227,24 @@ static void iwl_mvm_get_signal_strength(struct iwl_mvm *mvm,
 }
 
 static int iwl_mvm_rx_crypto(struct iwl_mvm *mvm, struct ieee80211_hdr *hdr,
-			     struct ieee80211_rx_status *stats,
-			     struct iwl_rx_mpdu_desc *desc, u32 pkt_flags,
-			     int queue, u8 *crypt_len)
+			     struct ieee80211_rx_status *stats, u16 phy_info,
+			     struct iwl_rx_mpdu_desc *desc,
+			     u32 pkt_flags, int queue, u8 *crypt_len)
 {
 	u16 status = le16_to_cpu(desc->status);
 
+	/*
+	 * Drop UNKNOWN frames in aggregation, unless in monitor mode
+	 * (where we don't have the keys).
+	 * We limit this to aggregation because in TKIP this is a valid
+	 * scenario, since we may not have the (correct) TTAK (phase 1
+	 * key) in the firmware.
+	 */
+	if (phy_info & IWL_RX_MPDU_PHY_AMPDU &&
+	    (status & IWL_RX_MPDU_STATUS_SEC_MASK) ==
+	    IWL_RX_MPDU_STATUS_SEC_UNKNOWN && !mvm->monitor_on)
+		return -1;
+
 	if (!ieee80211_has_protected(hdr->frame_control) ||
 	    (status & IWL_RX_MPDU_STATUS_SEC_MASK) ==
 	    IWL_RX_MPDU_STATUS_SEC_NONE)
@@ -870,7 +882,7 @@ void iwl_mvm_rx_mpdu_mq(struct iwl_mvm *mvm, struct napi_struct *napi,
 
 	rx_status = IEEE80211_SKB_RXCB(skb);
 
-	if (iwl_mvm_rx_crypto(mvm, hdr, rx_status, desc,
+	if (iwl_mvm_rx_crypto(mvm, hdr, rx_status, phy_info, desc,
 			      le32_to_cpu(pkt->len_n_flags), queue,
 			      &crypt_len)) {
 		kfree_skb(skb);
-- 
cgit v1.2.3-59-g8ed1b


From 9960521c44a5d828f29636ceac0600603ecbddbf Mon Sep 17 00:00:00 2001
From: Thierry Escande <thierry.escande@linaro.org>
Date: Tue, 29 May 2018 18:37:16 +0200
Subject: Bluetooth: hci_qca: Fix "Sleep inside atomic section" warning

This patch fixes the following warning during boot:

 do not call blocking ops when !TASK_RUNNING; state=1 set at
 [<(ptrval)>] qca_setup+0x194/0x750 [hci_uart]
 WARNING: CPU: 2 PID: 1878 at kernel/sched/core.c:6135
 __might_sleep+0x7c/0x88

In qca_set_baudrate(), the current task state is set to
TASK_UNINTERRUPTIBLE before going to sleep for 300ms. It was then
restored to TASK_INTERRUPTIBLE. This patch sets the current task state
back to TASK_RUNNING instead.

Signed-off-by: Thierry Escande <thierry.escande@linaro.org>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/hci_qca.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c
index f05382b5a65d..51790dd02afb 100644
--- a/drivers/bluetooth/hci_qca.c
+++ b/drivers/bluetooth/hci_qca.c
@@ -910,7 +910,7 @@ static int qca_set_baudrate(struct hci_dev *hdev, uint8_t baudrate)
 	 */
 	set_current_state(TASK_UNINTERRUPTIBLE);
 	schedule_timeout(msecs_to_jiffies(BAUDRATE_SETTLE_TIMEOUT_MS));
-	set_current_state(TASK_INTERRUPTIBLE);
+	set_current_state(TASK_RUNNING);
 
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 3bf5e97d7bbd175248da02efca2b265d13fb6041 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 29 May 2018 16:33:48 +0300
Subject: Bluetooth: Re-use kstrtobool_from_user()

Re-use kstrtobool_from_user() instead of open coded variant.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/bluetooth/hci_core.c    | 23 +++++++----------------
 net/bluetooth/hci_debugfs.c | 24 ++++++++----------------
 net/bluetooth/smp.c         | 12 ++++--------
 3 files changed, 19 insertions(+), 40 deletions(-)

diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index b0ee9edaae35..1dec33790198 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -76,19 +76,15 @@ static ssize_t dut_mode_write(struct file *file, const char __user *user_buf,
 {
 	struct hci_dev *hdev = file->private_data;
 	struct sk_buff *skb;
-	char buf[32];
-	size_t buf_size = min(count, (sizeof(buf)-1));
 	bool enable;
+	int err;
 
 	if (!test_bit(HCI_UP, &hdev->flags))
 		return -ENETDOWN;
 
-	if (copy_from_user(buf, user_buf, buf_size))
-		return -EFAULT;
-
-	buf[buf_size] = '\0';
-	if (strtobool(buf, &enable))
-		return -EINVAL;
+	err = kstrtobool_from_user(user_buf, count, &enable);
+	if (err)
+		return err;
 
 	if (enable == hci_dev_test_flag(hdev, HCI_DUT_MODE))
 		return -EALREADY;
@@ -135,17 +131,12 @@ static ssize_t vendor_diag_write(struct file *file, const char __user *user_buf,
 				 size_t count, loff_t *ppos)
 {
 	struct hci_dev *hdev = file->private_data;
-	char buf[32];
-	size_t buf_size = min(count, (sizeof(buf)-1));
 	bool enable;
 	int err;
 
-	if (copy_from_user(buf, user_buf, buf_size))
-		return -EFAULT;
-
-	buf[buf_size] = '\0';
-	if (strtobool(buf, &enable))
-		return -EINVAL;
+	err = kstrtobool_from_user(user_buf, count, &enable);
+	if (err)
+		return err;
 
 	/* When the diagnostic flags are not persistent and the transport
 	 * is not active or in user channel operation, then there is no need
diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c
index 418b76e557b0..0d8ab5b3c177 100644
--- a/net/bluetooth/hci_debugfs.c
+++ b/net/bluetooth/hci_debugfs.c
@@ -47,19 +47,15 @@ static ssize_t __name ## _write(struct file *file,			      \
 				 size_t count, loff_t *ppos)		      \
 {									      \
 	struct hci_dev *hdev = file->private_data;			      \
-	char buf[32];							      \
-	size_t buf_size = min(count, (sizeof(buf) - 1));		      \
 	bool enable;							      \
+	int err;							      \
 									      \
 	if (test_bit(HCI_UP, &hdev->flags))				      \
 		return -EBUSY;						      \
 									      \
-	if (copy_from_user(buf, user_buf, buf_size))			      \
-		return -EFAULT;						      \
-									      \
-	buf[buf_size] = '\0';						      \
-	if (strtobool(buf, &enable))					      \
-		return -EINVAL;						      \
+	err = kstrtobool_from_user(user_buf, count, &enable);		      \
+	if (err)							      \
+		return err;						      \
 									      \
 	if (enable == test_bit(__quirk, &hdev->quirks))			      \
 		return -EALREADY;					      \
@@ -658,19 +654,15 @@ static ssize_t force_static_address_write(struct file *file,
 					  size_t count, loff_t *ppos)
 {
 	struct hci_dev *hdev = file->private_data;
-	char buf[32];
-	size_t buf_size = min(count, (sizeof(buf)-1));
 	bool enable;
+	int err;
 
 	if (test_bit(HCI_UP, &hdev->flags))
 		return -EBUSY;
 
-	if (copy_from_user(buf, user_buf, buf_size))
-		return -EFAULT;
-
-	buf[buf_size] = '\0';
-	if (strtobool(buf, &enable))
-		return -EINVAL;
+	err = kstrtobool_from_user(user_buf, count, &enable);
+	if (err)
+		return err;
 
 	if (enable == hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR))
 		return -EALREADY;
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index a2ddae2f37d7..ae91e2d40056 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -3315,16 +3315,12 @@ static ssize_t force_bredr_smp_write(struct file *file,
 				     size_t count, loff_t *ppos)
 {
 	struct hci_dev *hdev = file->private_data;
-	char buf[32];
-	size_t buf_size = min(count, (sizeof(buf)-1));
 	bool enable;
+	int err;
 
-	if (copy_from_user(buf, user_buf, buf_size))
-		return -EFAULT;
-
-	buf[buf_size] = '\0';
-	if (strtobool(buf, &enable))
-		return -EINVAL;
+	err = kstrtobool_from_user(user_buf, count, &enable);
+	if (err)
+		return err;
 
 	if (enable == hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP))
 		return -EALREADY;
-- 
cgit v1.2.3-59-g8ed1b


From c2e7f5dee722a6f6337540b15000bed575c80793 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 29 May 2018 16:33:49 +0300
Subject: Bluetooth: btmrvl: Re-use kstrtol_from_user()

Re-use kstrtol_from_user() instead of open coded variant.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/btmrvl_debugfs.c | 24 +++---------------------
 1 file changed, 3 insertions(+), 21 deletions(-)

diff --git a/drivers/bluetooth/btmrvl_debugfs.c b/drivers/bluetooth/btmrvl_debugfs.c
index 023d35e3c7a7..c4867576be00 100644
--- a/drivers/bluetooth/btmrvl_debugfs.c
+++ b/drivers/bluetooth/btmrvl_debugfs.c
@@ -35,15 +35,9 @@ static ssize_t btmrvl_hscfgcmd_write(struct file *file,
 			const char __user *ubuf, size_t count, loff_t *ppos)
 {
 	struct btmrvl_private *priv = file->private_data;
-	char buf[16];
 	long result, ret;
 
-	memset(buf, 0, sizeof(buf));
-
-	if (copy_from_user(&buf, ubuf, min_t(size_t, sizeof(buf) - 1, count)))
-		return -EFAULT;
-
-	ret = kstrtol(buf, 10, &result);
+	ret = kstrtol_from_user(ubuf, count, 10, &result);
 	if (ret)
 		return ret;
 
@@ -81,15 +75,9 @@ static ssize_t btmrvl_pscmd_write(struct file *file, const char __user *ubuf,
 						size_t count, loff_t *ppos)
 {
 	struct btmrvl_private *priv = file->private_data;
-	char buf[16];
 	long result, ret;
 
-	memset(buf, 0, sizeof(buf));
-
-	if (copy_from_user(&buf, ubuf, min_t(size_t, sizeof(buf) - 1, count)))
-		return -EFAULT;
-
-	ret = kstrtol(buf, 10, &result);
+	ret = kstrtol_from_user(ubuf, count, 10, &result);
 	if (ret)
 		return ret;
 
@@ -127,15 +115,9 @@ static ssize_t btmrvl_hscmd_write(struct file *file, const char __user *ubuf,
 						size_t count, loff_t *ppos)
 {
 	struct btmrvl_private *priv = file->private_data;
-	char buf[16];
 	long result, ret;
 
-	memset(buf, 0, sizeof(buf));
-
-	if (copy_from_user(&buf, ubuf, min_t(size_t, sizeof(buf) - 1, count)))
-		return -EFAULT;
-
-	ret = kstrtol(buf, 10, &result);
+	ret = kstrtol_from_user(ubuf, count, 10, &result);
 	if (ret)
 		return ret;
 
-- 
cgit v1.2.3-59-g8ed1b


From 15c4e33030d172ac0d2a03ebbae4c3687a80ac7d Mon Sep 17 00:00:00 2001
From: Aviya Erenfeld <aviya.erenfeld@intel.com>
Date: Mon, 26 Mar 2018 14:35:32 +0300
Subject: iwlmvm: tdls: Check TDLS channel switch support

Some versions of the FW don't support channel switch in TDLS.
Add a condition that checks it.

Fixes: 307e47235a10 ("iwlwifi: mvm: configure TDLS peers to FW")
Signed-off-by: Aviya Erenfeld <aviya.erenfeld@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/tdls.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tdls.c b/drivers/net/wireless/intel/iwlwifi/mvm/tdls.c
index 3d97436bbdf5..67f360c0d17e 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/tdls.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/tdls.c
@@ -7,6 +7,7 @@
  *
  * Copyright(c) 2014 Intel Mobile Communications GmbH
  * Copyright(c) 2017 Intel Deutschland GmbH
+ * Copyright(C) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -18,9 +19,7 @@
  * General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110,
- * USA
+ * along with this program.
  *
  * The full GNU General Public License is included in this distribution
  * in the file called COPYING.
@@ -33,6 +32,7 @@
  *
  * Copyright(c) 2014 Intel Mobile Communications GmbH
  * Copyright(c) 2017 Intel Deutschland GmbH
+ * Copyright(C) 2018 Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -188,8 +188,14 @@ void iwl_mvm_recalc_tdls_state(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 	if (tdls_sta_cnt == 1 && sta_added)
 		iwl_mvm_power_update_mac(mvm);
 
-	/* configure the FW with TDLS peer info */
-	iwl_mvm_tdls_config(mvm, vif);
+	/* Configure the FW with TDLS peer info only if TDLS channel switch
+	 * capability is set.
+	 * TDLS config data is used currently only in TDLS channel switch code.
+	 * Supposed to serve also TDLS buffer station which is not implemneted
+	 * yet in FW*/
+	if (fw_has_capa(&mvm->fw->ucode_capa,
+			IWL_UCODE_TLV_CAPA_TDLS_CHANNEL_SWITCH))
+		iwl_mvm_tdls_config(mvm, vif);
 
 	/* when the last peer leaves, send a power update last */
 	if (tdls_sta_cnt == 0 && !sta_added)
-- 
cgit v1.2.3-59-g8ed1b


From 0eac9abace1629c20420d9122d6edc80d292b0ec Mon Sep 17 00:00:00 2001
From: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Date: Sun, 8 Apr 2018 12:33:31 +0300
Subject: iwlwifi: mvm: fix TSO with highly fragmented SKBs

Our hardware has a limited amount of buffer descriptors
for each Tx packet. Because of that, there is a short
piece of code that makes sure that that we don't push too
many subframes in an A-MSDU because of subframes needs 2
buffer descriptors. This code also takes into account the
number of fragment of the skb since we also need a buffer
descriptor for each fragment in the skb.

This piece of code though didn't check that the resulting
number of subframes wasn't 0.

A user reported that using NFS client, he could get skbs
that are so fragmented that the code mentioned above
returned 0 for the number of subframes making
skb_gso_segment fail and subconsequently iwlwifi would WARN.

Fix this by make sure that num_subframes is at least 1.

This fixes:
https://bugzilla.kernel.org/show_bug.cgi?id=199209

Fixes: a6d5e32f247c ("iwlwifi: mvm: send large SKBs to the transport")
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/tx.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
index df4c60496f72..0b7f98d00298 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
@@ -843,8 +843,6 @@ static int iwl_mvm_tx_tso(struct iwl_mvm *mvm, struct sk_buff *skb,
 	 * N * subf_len + (N - 1) * pad.
 	 */
 	num_subframes = (max_amsdu_len + pad) / (subf_len + pad);
-	if (num_subframes > 1)
-		*ieee80211_get_qos_ctl(hdr) |= IEEE80211_QOS_CTL_A_MSDU_PRESENT;
 
 	tcp_payload_len = skb_tail_pointer(skb) - skb_transport_header(skb) -
 		tcp_hdrlen(skb) + skb->data_len;
@@ -855,10 +853,12 @@ static int iwl_mvm_tx_tso(struct iwl_mvm *mvm, struct sk_buff *skb,
 	 *	1 more for each fragment
 	 *	1 more for the potential data in the header
 	 */
-	num_subframes =
-		min_t(unsigned int, num_subframes,
-		      (mvm->trans->max_skb_frags - 1 -
-		       skb_shinfo(skb)->nr_frags) / 2);
+	if ((num_subframes * 2 + skb_shinfo(skb)->nr_frags + 1) >
+	    mvm->trans->max_skb_frags)
+		num_subframes = 1;
+
+	if (num_subframes > 1)
+		*ieee80211_get_qos_ctl(hdr) |= IEEE80211_QOS_CTL_A_MSDU_PRESENT;
 
 	/* This skb fits in one single A-MSDU */
 	if (num_subframes * mss >= tcp_payload_len) {
-- 
cgit v1.2.3-59-g8ed1b


From 0f22e40053bd5378ad1e3250e65c574fd61c0cd6 Mon Sep 17 00:00:00 2001
From: Shaul Triebitz <shaul.triebitz@intel.com>
Date: Thu, 22 Mar 2018 14:14:45 +0200
Subject: iwlwifi: pcie: fix race in Rx buffer allocator

Make sure the rx_allocator worker is canceled before running the
rx_init routine.  rx_init frees and re-allocates all rxb's pages.  The
rx_allocator worker also allocates pages for the used rxb's.  Running
rx_init and rx_allocator simultaniously causes a kernel panic.  Fix
that by canceling the work in rx_init.

Signed-off-by: Shaul Triebitz <shaul.triebitz@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/pcie/rx.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c
index f772d70a65e4..d15f5ba2dc77 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c
@@ -902,6 +902,8 @@ static int _iwl_pcie_rx_init(struct iwl_trans *trans)
 	}
 	def_rxq = trans_pcie->rxq;
 
+	cancel_work_sync(&rba->rx_alloc);
+
 	spin_lock(&rba->lock);
 	atomic_set(&rba->req_pending, 0);
 	atomic_set(&rba->req_ready, 0);
-- 
cgit v1.2.3-59-g8ed1b


From 412fe29ffcf9a1ba9f125e72ee3da957cd6a0064 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Sun, 27 May 2018 21:04:51 +0200
Subject: Bluetooth: hci_uart: Restore hci_dev->flush callback on open()

For reasons explained in detail in commit 3611f4d2a5e0 ("hci_ldisc:
fix null pointer deref") the hci_uart_close() functions sets
hci_dev->flush to NULL.

But the device may be re-opened after a close, this commit restores the
hci_dev->flush callback on open().

Note this commit also moves the nearly empty defition of hci_uart_open()
a bit down in the file to avoid the need for forward declaring
hci_uart_flush().

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/hci_ldisc.c  | 20 +++++++++++---------
 drivers/bluetooth/hci_serdev.c | 19 +++++++++++--------
 2 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c
index 954213e5daa5..1c49964f5e60 100644
--- a/drivers/bluetooth/hci_ldisc.c
+++ b/drivers/bluetooth/hci_ldisc.c
@@ -229,15 +229,6 @@ int hci_uart_init_ready(struct hci_uart *hu)
 }
 
 /* ------- Interface to HCI layer ------ */
-/* Initialize device */
-static int hci_uart_open(struct hci_dev *hdev)
-{
-	BT_DBG("%s %p", hdev->name, hdev);
-
-	/* Nothing to do for UART driver */
-	return 0;
-}
-
 /* Reset device */
 static int hci_uart_flush(struct hci_dev *hdev)
 {
@@ -264,6 +255,17 @@ static int hci_uart_flush(struct hci_dev *hdev)
 	return 0;
 }
 
+/* Initialize device */
+static int hci_uart_open(struct hci_dev *hdev)
+{
+	BT_DBG("%s %p", hdev->name, hdev);
+
+	/* Undo clearing this from hci_uart_close() */
+	hdev->flush = hci_uart_flush;
+
+	return 0;
+}
+
 /* Close device */
 static int hci_uart_close(struct hci_dev *hdev)
 {
diff --git a/drivers/bluetooth/hci_serdev.c b/drivers/bluetooth/hci_serdev.c
index a0e939856568..6a713f13f71c 100644
--- a/drivers/bluetooth/hci_serdev.c
+++ b/drivers/bluetooth/hci_serdev.c
@@ -101,14 +101,6 @@ static void hci_uart_write_work(struct work_struct *work)
 
 /* ------- Interface to HCI layer ------ */
 
-/* Initialize device */
-static int hci_uart_open(struct hci_dev *hdev)
-{
-	BT_DBG("%s %p", hdev->name, hdev);
-
-	return 0;
-}
-
 /* Reset device */
 static int hci_uart_flush(struct hci_dev *hdev)
 {
@@ -129,6 +121,17 @@ static int hci_uart_flush(struct hci_dev *hdev)
 	return 0;
 }
 
+/* Initialize device */
+static int hci_uart_open(struct hci_dev *hdev)
+{
+	BT_DBG("%s %p", hdev->name, hdev);
+
+	/* Undo clearing this from hci_uart_close() */
+	hdev->flush = hci_uart_flush;
+
+	return 0;
+}
+
 /* Close device */
 static int hci_uart_close(struct hci_dev *hdev)
 {
-- 
cgit v1.2.3-59-g8ed1b


From e9ca08074ddcdcc3abacbfca888dba3a110e4453 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Sun, 27 May 2018 21:04:52 +0200
Subject: Bluetooth: hci_serdev: Move serdev_device_close/open into common
 hci_serdev code

Make hci_uart_register_device() and hci_uart_unregister_device() call
serdev_device_close()/open() themselves instead of relying on the various
hci_uart drivers to do this for them.

Besides reducing code complexity, this also ensures correct error checking
of serdev_device_open(), which was missing in a few drivers.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/hci_bcm.c    | 10 +---------
 drivers/bluetooth/hci_ll.c     |  3 ---
 drivers/bluetooth/hci_nokia.c  |  3 ---
 drivers/bluetooth/hci_serdev.c |  9 ++++++++-
 4 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/drivers/bluetooth/hci_bcm.c b/drivers/bluetooth/hci_bcm.c
index f06f0f1132fb..ddbd8c6a0ceb 100644
--- a/drivers/bluetooth/hci_bcm.c
+++ b/drivers/bluetooth/hci_bcm.c
@@ -380,10 +380,6 @@ static int bcm_open(struct hci_uart *hu)
 	mutex_lock(&bcm_device_lock);
 
 	if (hu->serdev) {
-		err = serdev_device_open(hu->serdev);
-		if (err)
-			goto err_free;
-
 		bcm->dev = serdev_device_get_drvdata(hu->serdev);
 		goto out;
 	}
@@ -420,13 +416,10 @@ out:
 	return 0;
 
 err_unset_hu:
-	if (hu->serdev)
-		serdev_device_close(hu->serdev);
 #ifdef CONFIG_PM
-	else
+	if (!hu->serdev)
 		bcm->dev->hu = NULL;
 #endif
-err_free:
 	mutex_unlock(&bcm_device_lock);
 	hu->priv = NULL;
 	kfree(bcm);
@@ -445,7 +438,6 @@ static int bcm_close(struct hci_uart *hu)
 	mutex_lock(&bcm_device_lock);
 
 	if (hu->serdev) {
-		serdev_device_close(hu->serdev);
 		bdev = serdev_device_get_drvdata(hu->serdev);
 	} else if (bcm_device_exists(bcm->dev)) {
 		bdev = bcm->dev;
diff --git a/drivers/bluetooth/hci_ll.c b/drivers/bluetooth/hci_ll.c
index 27e414b4e3a2..3e767f245ed5 100644
--- a/drivers/bluetooth/hci_ll.c
+++ b/drivers/bluetooth/hci_ll.c
@@ -141,7 +141,6 @@ static int ll_open(struct hci_uart *hu)
 
 	if (hu->serdev) {
 		struct ll_device *lldev = serdev_device_get_drvdata(hu->serdev);
-		serdev_device_open(hu->serdev);
 		if (!IS_ERR(lldev->ext_clk))
 			clk_prepare_enable(lldev->ext_clk);
 	}
@@ -179,8 +178,6 @@ static int ll_close(struct hci_uart *hu)
 		gpiod_set_value_cansleep(lldev->enable_gpio, 0);
 
 		clk_disable_unprepare(lldev->ext_clk);
-
-		serdev_device_close(hu->serdev);
 	}
 
 	hu->priv = NULL;
diff --git a/drivers/bluetooth/hci_nokia.c b/drivers/bluetooth/hci_nokia.c
index 3539fd03f47e..14d159e2042d 100644
--- a/drivers/bluetooth/hci_nokia.c
+++ b/drivers/bluetooth/hci_nokia.c
@@ -477,8 +477,6 @@ static int nokia_open(struct hci_uart *hu)
 
 	dev_dbg(dev, "protocol open");
 
-	serdev_device_open(hu->serdev);
-
 	pm_runtime_enable(dev);
 
 	return 0;
@@ -513,7 +511,6 @@ static int nokia_close(struct hci_uart *hu)
 	gpiod_set_value(btdev->wakeup_bt, 0);
 
 	pm_runtime_disable(&btdev->serdev->dev);
-	serdev_device_close(btdev->serdev);
 
 	return 0;
 }
diff --git a/drivers/bluetooth/hci_serdev.c b/drivers/bluetooth/hci_serdev.c
index 6a713f13f71c..7a3d6d636192 100644
--- a/drivers/bluetooth/hci_serdev.c
+++ b/drivers/bluetooth/hci_serdev.c
@@ -284,10 +284,14 @@ int hci_uart_register_device(struct hci_uart *hu,
 
 	serdev_device_set_client_ops(hu->serdev, &hci_serdev_client_ops);
 
-	err = p->open(hu);
+	err = serdev_device_open(hu->serdev);
 	if (err)
 		return err;
 
+	err = p->open(hu);
+	if (err)
+		goto err_open;
+
 	hu->proto = p;
 	set_bit(HCI_UART_PROTO_READY, &hu->flags);
 
@@ -353,6 +357,8 @@ err_register:
 err_alloc:
 	clear_bit(HCI_UART_PROTO_READY, &hu->flags);
 	p->close(hu);
+err_open:
+	serdev_device_close(hu->serdev);
 	return err;
 }
 EXPORT_SYMBOL_GPL(hci_uart_register_device);
@@ -367,5 +373,6 @@ void hci_uart_unregister_device(struct hci_uart *hu)
 	cancel_work_sync(&hu->write_work);
 
 	hu->proto->close(hu);
+	serdev_device_close(hu->serdev);
 }
 EXPORT_SYMBOL_GPL(hci_uart_unregister_device);
-- 
cgit v1.2.3-59-g8ed1b


From fdee6d8fc630fa82e06d792f3adadab3bd48666e Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Sun, 27 May 2018 21:04:53 +0200
Subject: Bluetooth: hci_serdev: Fix HCI_UART_INIT_PENDING not working

Init hci_uart->init_ready so that hci_uart_init_ready() works properly.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/hci_ldisc.c  | 2 +-
 drivers/bluetooth/hci_serdev.c | 1 +
 drivers/bluetooth/hci_uart.h   | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c
index 1c49964f5e60..963bb0309e25 100644
--- a/drivers/bluetooth/hci_ldisc.c
+++ b/drivers/bluetooth/hci_ldisc.c
@@ -195,7 +195,7 @@ restart:
 	clear_bit(HCI_UART_SENDING, &hu->tx_state);
 }
 
-static void hci_uart_init_work(struct work_struct *work)
+void hci_uart_init_work(struct work_struct *work)
 {
 	struct hci_uart *hu = container_of(work, struct hci_uart, init_ready);
 	int err;
diff --git a/drivers/bluetooth/hci_serdev.c b/drivers/bluetooth/hci_serdev.c
index 7a3d6d636192..aa2543b3c286 100644
--- a/drivers/bluetooth/hci_serdev.c
+++ b/drivers/bluetooth/hci_serdev.c
@@ -308,6 +308,7 @@ int hci_uart_register_device(struct hci_uart *hu,
 	hdev->bus = HCI_UART;
 	hci_set_drvdata(hdev, hu);
 
+	INIT_WORK(&hu->init_ready, hci_uart_init_work);
 	INIT_WORK(&hu->write_work, hci_uart_write_work);
 	percpu_init_rwsem(&hu->proto_lock);
 
diff --git a/drivers/bluetooth/hci_uart.h b/drivers/bluetooth/hci_uart.h
index 66e8c68e4607..00cab2fd7a1b 100644
--- a/drivers/bluetooth/hci_uart.h
+++ b/drivers/bluetooth/hci_uart.h
@@ -116,6 +116,7 @@ void hci_uart_unregister_device(struct hci_uart *hu);
 
 int hci_uart_tx_wakeup(struct hci_uart *hu);
 int hci_uart_init_ready(struct hci_uart *hu);
+void hci_uart_init_work(struct work_struct *work);
 void hci_uart_set_baudrate(struct hci_uart *hu, unsigned int speed);
 void hci_uart_set_flow_control(struct hci_uart *hu, bool enable);
 void hci_uart_set_speeds(struct hci_uart *hu, unsigned int init_speed,
-- 
cgit v1.2.3-59-g8ed1b


From b38c395f16562423f438065ad8d88c20ad92cab3 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Wed, 11 Apr 2018 14:05:33 +0100
Subject: iwlwifi: mvm: remove division by size of sizeof(struct
 ieee80211_wmm_rule)

The subtraction of two struct ieee80211_wmm_rule pointers leaves a result
that is automatically scaled down by the size of the size of pointed-to
type, hence the division by sizeof(struct ieee80211_wmm_rule) is
bogus and should be removed.

Detected by CoverityScan, CID#1467777 ("Extra sizeof expression")

Fixes: 77e30e10ee28 ("iwlwifi: mvm: query regdb for wmm rule if needed")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
index 0f9d56420c42..b815ba38dbdb 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c
@@ -1025,8 +1025,7 @@ iwl_parse_nvm_mcc_info(struct device *dev, const struct iwl_cfg *cfg,
 			continue;
 
 		copy_rd->reg_rules[i].wmm_rule = d_wmm +
-			(regd->reg_rules[i].wmm_rule - s_wmm) /
-			sizeof(struct ieee80211_wmm_rule);
+			(regd->reg_rules[i].wmm_rule - s_wmm);
 	}
 
 out:
-- 
cgit v1.2.3-59-g8ed1b


From d94c5a820d107fdde711ec72c16848876027713d Mon Sep 17 00:00:00 2001
From: Gregory Greenman <gregory.greenman@intel.com>
Date: Tue, 24 Apr 2018 06:26:41 +0300
Subject: iwlwifi: mvm: open BA session only when sta is authorized

Currently, a BA session is opened when the tx traffic exceeds
10 frames per second. As a result of inter-op problems with some
APs, add a condition to open BA session only when station is
already authorized.

Fixes: 482e48440a0e ("iwlwifi: mvm: change open and close criteria of a BA session")
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c |  8 ++---
 drivers/net/wireless/intel/iwlwifi/mvm/rs.c       | 38 +++++++++++------------
 drivers/net/wireless/intel/iwlwifi/mvm/rs.h       |  7 ++---
 drivers/net/wireless/intel/iwlwifi/mvm/sta.c      |  2 +-
 drivers/net/wireless/intel/iwlwifi/mvm/sta.h      | 10 +++---
 5 files changed, 29 insertions(+), 36 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
index 5f0701c992a4..f3cab26f81b5 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
@@ -2685,7 +2685,7 @@ static int iwl_mvm_mac_sta_state(struct ieee80211_hw *hw,
 
 	mutex_lock(&mvm->mutex);
 	/* track whether or not the station is associated */
-	mvm_sta->associated = new_state >= IEEE80211_STA_ASSOC;
+	mvm_sta->sta_state = new_state;
 
 	if (old_state == IEEE80211_STA_NOTEXIST &&
 	    new_state == IEEE80211_STA_NONE) {
@@ -2737,8 +2737,7 @@ static int iwl_mvm_mac_sta_state(struct ieee80211_hw *hw,
 			iwl_mvm_mac_ctxt_changed(mvm, vif, false, NULL);
 		}
 
-		iwl_mvm_rs_rate_init(mvm, sta, mvmvif->phy_ctxt->channel->band,
-				     true);
+		iwl_mvm_rs_rate_init(mvm, sta, mvmvif->phy_ctxt->channel->band);
 		ret = iwl_mvm_update_sta(mvm, vif, sta);
 	} else if (old_state == IEEE80211_STA_ASSOC &&
 		   new_state == IEEE80211_STA_AUTHORIZED) {
@@ -2754,8 +2753,7 @@ static int iwl_mvm_mac_sta_state(struct ieee80211_hw *hw,
 		/* enable beacon filtering */
 		WARN_ON(iwl_mvm_enable_beacon_filter(mvm, vif, 0));
 
-		iwl_mvm_rs_rate_init(mvm, sta, mvmvif->phy_ctxt->channel->band,
-				     false);
+		iwl_mvm_rs_rate_init(mvm, sta, mvmvif->phy_ctxt->channel->band);
 
 		ret = 0;
 	} else if (old_state == IEEE80211_STA_AUTHORIZED &&
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
index af7bc3b7b0c3..642da10b0b7f 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
@@ -3,6 +3,7 @@
  * Copyright(c) 2005 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2015 Intel Mobile Communications GmbH
  * Copyright(c) 2016 - 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of version 2 of the GNU General Public License as
@@ -13,10 +14,6 @@
  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  * more details.
  *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110, USA
- *
  * The full GNU General Public License is included in this distribution in the
  * file called LICENSE.
  *
@@ -651,9 +648,10 @@ static void rs_tl_turn_on_agg(struct iwl_mvm *mvm, struct iwl_mvm_sta *mvmsta,
 	}
 
 	tid_data = &mvmsta->tid_data[tid];
-	if ((tid_data->state == IWL_AGG_OFF) &&
+	if (mvmsta->sta_state >= IEEE80211_STA_AUTHORIZED &&
+	    tid_data->state == IWL_AGG_OFF &&
 	    (lq_sta->tx_agg_tid_en & BIT(tid)) &&
-	    (tid_data->tx_count_last >= IWL_MVM_RS_AGG_START_THRESHOLD)) {
+	    tid_data->tx_count_last >= IWL_MVM_RS_AGG_START_THRESHOLD) {
 		IWL_DEBUG_RATE(mvm, "try to aggregate tid %d\n", tid);
 		if (rs_tl_turn_on_agg_for_tid(mvm, lq_sta, tid, sta) == 0)
 			tid_data->state = IWL_AGG_QUEUED;
@@ -1257,7 +1255,7 @@ void iwl_mvm_rs_tx_status(struct iwl_mvm *mvm, struct ieee80211_sta *sta,
 		       (unsigned long)(lq_sta->last_tx +
 				       (IWL_MVM_RS_IDLE_TIMEOUT * HZ)))) {
 		IWL_DEBUG_RATE(mvm, "Tx idle for too long. reinit rs\n");
-		iwl_mvm_rs_rate_init(mvm, sta, info->band, false);
+		iwl_mvm_rs_rate_init(mvm, sta, info->band);
 		return;
 	}
 	lq_sta->last_tx = jiffies;
@@ -2690,9 +2688,9 @@ static void rs_get_initial_rate(struct iwl_mvm *mvm,
 				struct ieee80211_sta *sta,
 				struct iwl_lq_sta *lq_sta,
 				enum nl80211_band band,
-				struct rs_rate *rate,
-				bool init)
+				struct rs_rate *rate)
 {
+	struct iwl_mvm_sta *mvmsta = iwl_mvm_sta_from_mac80211(sta);
 	int i, nentries;
 	unsigned long active_rate;
 	s8 best_rssi = S8_MIN;
@@ -2754,7 +2752,8 @@ static void rs_get_initial_rate(struct iwl_mvm *mvm,
 		 * bandwidth rate, and after authorization, when the phy context
 		 * is already up-to-date, re-init rs with the correct bw.
 		 */
-		u32 bw = init ? RATE_MCS_CHAN_WIDTH_20 : rs_bw_from_sta_bw(sta);
+		u32 bw = mvmsta->sta_state < IEEE80211_STA_AUTHORIZED ?
+				RATE_MCS_CHAN_WIDTH_20 : rs_bw_from_sta_bw(sta);
 
 		switch (bw) {
 		case RATE_MCS_CHAN_WIDTH_40:
@@ -2839,9 +2838,9 @@ void rs_update_last_rssi(struct iwl_mvm *mvm,
 static void rs_initialize_lq(struct iwl_mvm *mvm,
 			     struct ieee80211_sta *sta,
 			     struct iwl_lq_sta *lq_sta,
-			     enum nl80211_band band,
-			     bool init)
+			     enum nl80211_band band)
 {
+	struct iwl_mvm_sta *mvmsta = iwl_mvm_sta_from_mac80211(sta);
 	struct iwl_scale_tbl_info *tbl;
 	struct rs_rate *rate;
 	u8 active_tbl = 0;
@@ -2857,7 +2856,7 @@ static void rs_initialize_lq(struct iwl_mvm *mvm,
 	tbl = &(lq_sta->lq_info[active_tbl]);
 	rate = &tbl->rate;
 
-	rs_get_initial_rate(mvm, sta, lq_sta, band, rate, init);
+	rs_get_initial_rate(mvm, sta, lq_sta, band, rate);
 	rs_init_optimal_rate(mvm, sta, lq_sta);
 
 	WARN_ONCE(rate->ant != ANT_A && rate->ant != ANT_B,
@@ -2870,7 +2869,8 @@ static void rs_initialize_lq(struct iwl_mvm *mvm,
 	rs_set_expected_tpt_table(lq_sta, tbl);
 	rs_fill_lq_cmd(mvm, sta, lq_sta, rate);
 	/* TODO restore station should remember the lq cmd */
-	iwl_mvm_send_lq_cmd(mvm, &lq_sta->lq, init);
+	iwl_mvm_send_lq_cmd(mvm, &lq_sta->lq,
+			    mvmsta->sta_state < IEEE80211_STA_AUTHORIZED);
 }
 
 static void rs_drv_get_rate(void *mvm_r, struct ieee80211_sta *sta,
@@ -3123,7 +3123,7 @@ void iwl_mvm_update_frame_stats(struct iwl_mvm *mvm, u32 rate, bool agg)
  * Called after adding a new station to initialize rate scaling
  */
 static void rs_drv_rate_init(struct iwl_mvm *mvm, struct ieee80211_sta *sta,
-			     enum nl80211_band band, bool init)
+			     enum nl80211_band band)
 {
 	int i, j;
 	struct ieee80211_hw *hw = mvm->hw;
@@ -3203,7 +3203,7 @@ static void rs_drv_rate_init(struct iwl_mvm *mvm, struct ieee80211_sta *sta,
 #ifdef CONFIG_IWLWIFI_DEBUGFS
 	iwl_mvm_reset_frame_stats(mvm);
 #endif
-	rs_initialize_lq(mvm, sta, lq_sta, band, init);
+	rs_initialize_lq(mvm, sta, lq_sta, band);
 }
 
 static void rs_drv_rate_update(void *mvm_r,
@@ -3223,7 +3223,7 @@ static void rs_drv_rate_update(void *mvm_r,
 	for (tid = 0; tid < IWL_MAX_TID_COUNT; tid++)
 		ieee80211_stop_tx_ba_session(sta, tid);
 
-	iwl_mvm_rs_rate_init(mvm, sta, sband->band, false);
+	iwl_mvm_rs_rate_init(mvm, sta, sband->band);
 }
 
 #ifdef CONFIG_MAC80211_DEBUGFS
@@ -4069,12 +4069,12 @@ static const struct rate_control_ops rs_mvm_ops_drv = {
 };
 
 void iwl_mvm_rs_rate_init(struct iwl_mvm *mvm, struct ieee80211_sta *sta,
-			  enum nl80211_band band, bool init)
+			  enum nl80211_band band)
 {
 	if (iwl_mvm_has_tlc_offload(mvm))
 		rs_fw_rate_init(mvm, sta, band);
 	else
-		rs_drv_rate_init(mvm, sta, band, init);
+		rs_drv_rate_init(mvm, sta, band);
 }
 
 int iwl_mvm_rate_control_register(void)
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs.h b/drivers/net/wireless/intel/iwlwifi/mvm/rs.h
index 5e89141656c0..cffb8c852934 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rs.h
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs.h
@@ -3,6 +3,7 @@
  * Copyright(c) 2003 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2015 Intel Mobile Communications GmbH
  * Copyright(c) 2017 Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of version 2 of the GNU General Public License as
@@ -13,10 +14,6 @@
  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  * more details.
  *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110, USA
- *
  * The full GNU General Public License is included in this distribution in the
  * file called LICENSE.
  *
@@ -410,7 +407,7 @@ struct iwl_lq_sta {
 
 /* Initialize station's rate scaling information after adding station */
 void iwl_mvm_rs_rate_init(struct iwl_mvm *mvm, struct ieee80211_sta *sta,
-			  enum nl80211_band band, bool init);
+			  enum nl80211_band band);
 
 /* Notify RS about Tx status */
 void iwl_mvm_rs_tx_status(struct iwl_mvm *mvm, struct ieee80211_sta *sta,
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c
index 4ddd2c427b83..9263b9aa8b72 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c
@@ -216,7 +216,7 @@ int iwl_mvm_sta_send_to_fw(struct iwl_mvm *mvm, struct ieee80211_sta *sta,
 		cpu_to_le32(agg_size << STA_FLG_MAX_AGG_SIZE_SHIFT);
 	add_sta_cmd.station_flags |=
 		cpu_to_le32(mpdu_dens << STA_FLG_AGG_MPDU_DENS_SHIFT);
-	if (mvm_sta->associated)
+	if (mvm_sta->sta_state >= IEEE80211_STA_ASSOC)
 		add_sta_cmd.assoc_id = cpu_to_le16(sta->aid);
 
 	if (sta->wme) {
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.h b/drivers/net/wireless/intel/iwlwifi/mvm/sta.h
index 60502c81dfce..1c43ea8dd8cc 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.h
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.h
@@ -8,6 +8,7 @@
  * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2014 Intel Mobile Communications GmbH
  * Copyright(c) 2015 - 2016 Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -18,11 +19,6 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110,
- * USA
- *
  * The full GNU General Public License is included in this distribution
  * in the file called COPYING.
  *
@@ -35,6 +31,7 @@
  * Copyright(c) 2012 - 2014 Intel Corporation. All rights reserved.
  * Copyright(c) 2013 - 2014 Intel Mobile Communications GmbH
  * Copyright(c) 2015 - 2016 Intel Deutschland GmbH
+ * Copyright(c) 2018 Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -376,6 +373,7 @@ struct iwl_mvm_rxq_dup_data {
  *	tid.
  * @max_agg_bufsize: the maximal size of the AGG buffer for this station
  * @sta_type: station type
+ * @sta_state: station state according to enum %ieee80211_sta_state
  * @bt_reduced_txpower: is reduced tx power enabled for this station
  * @next_status_eosp: the next reclaimed packet is a PS-Poll response and
  *	we need to signal the EOSP
@@ -416,6 +414,7 @@ struct iwl_mvm_sta {
 	u16 tid_disable_agg;
 	u8 max_agg_bufsize;
 	enum iwl_sta_type sta_type;
+	enum ieee80211_sta_state sta_state;
 	bool bt_reduced_txpower;
 	bool next_status_eosp;
 	spinlock_t lock;
@@ -441,7 +440,6 @@ struct iwl_mvm_sta {
 	u16 amsdu_enabled;
 	u16 max_amsdu_len;
 	bool sleeping;
-	bool associated;
 	u8 agg_tids;
 	u8 sleep_tx_count;
 	u8 avg_energy;
-- 
cgit v1.2.3-59-g8ed1b


From 5dd9f6c703e8161bf69fb9d004528bfb639548ce Mon Sep 17 00:00:00 2001
From: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Date: Thu, 17 May 2018 10:44:20 +0300
Subject: iwlwifi: mvm: honor the max_amsdu_subframes limit

A peer can limit the number of subframes it can handle in a
single A-MSDU.  Honor this limit.

Note that the smallest limit is 8, and we are very unlikely to reach
that limit. So this isn't really a big deal.

Fixes: a6d5e32f247c ("iwlwifi: mvm: send large SKBs to the transport")
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/tx.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
index 0b7f98d00298..cf2591f2ac23 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
@@ -844,6 +844,10 @@ static int iwl_mvm_tx_tso(struct iwl_mvm *mvm, struct sk_buff *skb,
 	 */
 	num_subframes = (max_amsdu_len + pad) / (subf_len + pad);
 
+	if (sta->max_amsdu_subframes &&
+	    num_subframes > sta->max_amsdu_subframes)
+		num_subframes = sta->max_amsdu_subframes;
+
 	tcp_payload_len = skb_tail_pointer(skb) - skb_transport_header(skb) -
 		tcp_hdrlen(skb) + skb->data_len;
 
-- 
cgit v1.2.3-59-g8ed1b


From 42116705a7b17ddc321b6e9999b8df2dc967c357 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 22 May 2018 15:08:32 +0200
Subject: iwlwifi: mvm: fix race in queue notification wait

Initially in this code, the race didn't matter since it didn't
do anything. Latest with the commit I marked this as fixing it
started to matter as something got done here that needed other
data that got freed as soon as the queue notification wait was
returning.

In the scenario we saw, apparently the IWL_MVM_RXQ_NOTIF_DEL_BA
event was sent to all queues, but processing the last event we
returned from iwl_mvm_sync_rx_queues_internal() and then from
iwl_mvm_free_reorder() and continued some processing before
wl_mvm_del_ba() was even invoked on the other CPU. Thus, when
the latter finally ran, it found that mvm->baid_map[baid] was
no longer valid.

Correct the race by moving the counter decrement and wake_up()
to be done only after all the per-event processing completed.
Note that in the commit I marked as being fixed the wake_up()
didn't exist yet (and the code was otherwise problematic) but
this particular problem already existed in a way.

Fixes: b915c10174fb ("iwlwifi: mvm: add reorder buffer per queue")
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
index 2b1f0dc73c25..129c4c09648d 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
@@ -590,14 +590,10 @@ void iwl_mvm_rx_queue_notif(struct iwl_mvm *mvm, struct iwl_rx_cmd_buffer *rxb,
 	notif = (void *)pkt->data;
 	internal_notif = (void *)notif->payload;
 
-	if (internal_notif->sync) {
-		if (mvm->queue_sync_cookie != internal_notif->cookie) {
-			WARN_ONCE(1,
-				  "Received expired RX queue sync message\n");
-			return;
-		}
-		if (!atomic_dec_return(&mvm->queue_sync_counter))
-			wake_up(&mvm->rx_sync_waitq);
+	if (internal_notif->sync &&
+	    mvm->queue_sync_cookie != internal_notif->cookie) {
+		WARN_ONCE(1, "Received expired RX queue sync message\n");
+		return;
 	}
 
 	switch (internal_notif->type) {
@@ -609,6 +605,10 @@ void iwl_mvm_rx_queue_notif(struct iwl_mvm *mvm, struct iwl_rx_cmd_buffer *rxb,
 	default:
 		WARN_ONCE(1, "Invalid identifier %d", internal_notif->type);
 	}
+
+	if (internal_notif->sync &&
+	    !atomic_dec_return(&mvm->queue_sync_counter))
+		wake_up(&mvm->rx_sync_waitq);
 }
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 506247825c5e877741f33a2c657d351e5c7943ba Mon Sep 17 00:00:00 2001
From: Erel Geron <erelx.geron@intel.com>
Date: Mon, 28 May 2018 17:09:30 +0300
Subject: iwlwifi: fix non_shared_ant for 9000 devices

The non-shared antenna was wrong for 9000 device series.  Fix it to
ANT_B for correct antenna preference by coex in MVM driver.

Fixes: 89374fe60bfb ("iwlwifi: Add new PCI IDs for 9260 and 5165 series")
Signed-off-by: Erel Geron <erelx.geron@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/cfg/9000.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/9000.c b/drivers/net/wireless/intel/iwlwifi/cfg/9000.c
index 9706624911f8..e20c30b29c03 100644
--- a/drivers/net/wireless/intel/iwlwifi/cfg/9000.c
+++ b/drivers/net/wireless/intel/iwlwifi/cfg/9000.c
@@ -137,7 +137,7 @@ static const struct iwl_tt_params iwl9000_tt_params = {
 	.base_params = &iwl9000_base_params,				\
 	.led_mode = IWL_LED_RF_STATE,					\
 	.nvm_hw_section_num = NVM_HW_SECTION_NUM_FAMILY_9000,		\
-	.non_shared_ant = ANT_A,					\
+	.non_shared_ant = ANT_B,					\
 	.dccm_offset = IWL9000_DCCM_OFFSET,				\
 	.dccm_len = IWL9000_DCCM_LEN,					\
 	.dccm2_offset = IWL9000_DCCM2_OFFSET,				\
-- 
cgit v1.2.3-59-g8ed1b


From 170a7e3ea0709eae12c8f944b9f33c54fe80c6c1 Mon Sep 17 00:00:00 2001
From: Sean Young <sean@mess.org>
Date: Sun, 27 May 2018 12:24:08 +0100
Subject: bpf: bpf_prog_array_copy() should return -ENOENT if exclude_prog not
 found

This makes is it possible for bpf prog detach to return -ENOENT.

Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Sean Young <sean@mess.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/core.c        | 11 +++++++++--
 kernel/trace/bpf_trace.c |  2 ++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b574dddc05b8..527587de8a67 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1616,6 +1616,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
 	int new_prog_cnt, carry_prog_cnt = 0;
 	struct bpf_prog **existing_prog;
 	struct bpf_prog_array *array;
+	bool found_exclude = false;
 	int new_prog_idx = 0;
 
 	/* Figure out how many existing progs we need to carry over to
@@ -1624,14 +1625,20 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
 	if (old_array) {
 		existing_prog = old_array->progs;
 		for (; *existing_prog; existing_prog++) {
-			if (*existing_prog != exclude_prog &&
-			    *existing_prog != &dummy_bpf_prog.prog)
+			if (*existing_prog == exclude_prog) {
+				found_exclude = true;
+				continue;
+			}
+			if (*existing_prog != &dummy_bpf_prog.prog)
 				carry_prog_cnt++;
 			if (*existing_prog == include_prog)
 				return -EEXIST;
 		}
 	}
 
+	if (exclude_prog && !found_exclude)
+		return -ENOENT;
+
 	/* How many progs (not NULL) will be in the new array? */
 	new_prog_cnt = carry_prog_cnt;
 	if (include_prog)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 81fdf2fc94ac..af1486d9a0ed 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1006,6 +1006,8 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
 
 	old_array = event->tp_event->prog_array;
 	ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array);
+	if (ret == -ENOENT)
+		goto unlock;
 	if (ret < 0) {
 		bpf_prog_array_delete_safe(old_array, event->prog);
 	} else {
-- 
cgit v1.2.3-59-g8ed1b


From f4364dcfc86df7c1ca47b256eaf6b6d0cdd0d936 Mon Sep 17 00:00:00 2001
From: Sean Young <sean@mess.org>
Date: Sun, 27 May 2018 12:24:09 +0100
Subject: media: rc: introduce BPF_PROG_LIRC_MODE2

Add support for BPF_PROG_LIRC_MODE2. This type of BPF program can call
rc_keydown() to reported decoded IR scancodes, or rc_repeat() to report
that the last key should be repeated.

The bpf program can be attached to using the bpf(BPF_PROG_ATTACH) syscall;
the target_fd must be the /dev/lircN device.

Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Sean Young <sean@mess.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/media/rc/Kconfig        |  13 ++
 drivers/media/rc/Makefile       |   1 +
 drivers/media/rc/bpf-lirc.c     | 313 ++++++++++++++++++++++++++++++++++++++++
 drivers/media/rc/lirc_dev.c     |  30 ++++
 drivers/media/rc/rc-core-priv.h |  21 +++
 drivers/media/rc/rc-ir-raw.c    |  12 +-
 include/linux/bpf_lirc.h        |  29 ++++
 include/linux/bpf_types.h       |   3 +
 include/uapi/linux/bpf.h        |  53 ++++++-
 kernel/bpf/syscall.c            |   7 +
 10 files changed, 479 insertions(+), 3 deletions(-)
 create mode 100644 drivers/media/rc/bpf-lirc.c
 create mode 100644 include/linux/bpf_lirc.h

diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig
index eb2c3b6eca7f..d5b35a6ba899 100644
--- a/drivers/media/rc/Kconfig
+++ b/drivers/media/rc/Kconfig
@@ -25,6 +25,19 @@ config LIRC
 	   passes raw IR to and from userspace, which is needed for
 	   IR transmitting (aka "blasting") and for the lirc daemon.
 
+config BPF_LIRC_MODE2
+	bool "Support for eBPF programs attached to lirc devices"
+	depends on BPF_SYSCALL
+	depends on RC_CORE=y
+	depends on LIRC
+	help
+	   Allow attaching eBPF programs to a lirc device using the bpf(2)
+	   syscall command BPF_PROG_ATTACH. This is supported for raw IR
+	   receivers.
+
+	   These eBPF programs can be used to decode IR into scancodes, for
+	   IR protocols not supported by the kernel decoders.
+
 menuconfig RC_DECODERS
 	bool "Remote controller decoders"
 	depends on RC_CORE
diff --git a/drivers/media/rc/Makefile b/drivers/media/rc/Makefile
index 2e1c87066f6c..e0340d043fe8 100644
--- a/drivers/media/rc/Makefile
+++ b/drivers/media/rc/Makefile
@@ -5,6 +5,7 @@ obj-y += keymaps/
 obj-$(CONFIG_RC_CORE) += rc-core.o
 rc-core-y := rc-main.o rc-ir-raw.o
 rc-core-$(CONFIG_LIRC) += lirc_dev.o
+rc-core-$(CONFIG_BPF_LIRC_MODE2) += bpf-lirc.o
 obj-$(CONFIG_IR_NEC_DECODER) += ir-nec-decoder.o
 obj-$(CONFIG_IR_RC5_DECODER) += ir-rc5-decoder.o
 obj-$(CONFIG_IR_RC6_DECODER) += ir-rc6-decoder.o
diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c
new file mode 100644
index 000000000000..40826bba06b6
--- /dev/null
+++ b/drivers/media/rc/bpf-lirc.c
@@ -0,0 +1,313 @@
+// SPDX-License-Identifier: GPL-2.0
+// bpf-lirc.c - handles bpf
+//
+// Copyright (C) 2018 Sean Young <sean@mess.org>
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/bpf_lirc.h>
+#include "rc-core-priv.h"
+
+/*
+ * BPF interface for raw IR
+ */
+const struct bpf_prog_ops lirc_mode2_prog_ops = {
+};
+
+BPF_CALL_1(bpf_rc_repeat, u32*, sample)
+{
+	struct ir_raw_event_ctrl *ctrl;
+
+	ctrl = container_of(sample, struct ir_raw_event_ctrl, bpf_sample);
+
+	rc_repeat(ctrl->dev);
+
+	return 0;
+}
+
+static const struct bpf_func_proto rc_repeat_proto = {
+	.func	   = bpf_rc_repeat,
+	.gpl_only  = true, /* rc_repeat is EXPORT_SYMBOL_GPL */
+	.ret_type  = RET_INTEGER,
+	.arg1_type = ARG_PTR_TO_CTX,
+};
+
+/*
+ * Currently rc-core does not support 64-bit scancodes, but there are many
+ * known protocols with more than 32 bits. So, define the interface as u64
+ * as a future-proof.
+ */
+BPF_CALL_4(bpf_rc_keydown, u32*, sample, u32, protocol, u64, scancode,
+	   u32, toggle)
+{
+	struct ir_raw_event_ctrl *ctrl;
+
+	ctrl = container_of(sample, struct ir_raw_event_ctrl, bpf_sample);
+
+	rc_keydown(ctrl->dev, protocol, scancode, toggle != 0);
+
+	return 0;
+}
+
+static const struct bpf_func_proto rc_keydown_proto = {
+	.func	   = bpf_rc_keydown,
+	.gpl_only  = true, /* rc_keydown is EXPORT_SYMBOL_GPL */
+	.ret_type  = RET_INTEGER,
+	.arg1_type = ARG_PTR_TO_CTX,
+	.arg2_type = ARG_ANYTHING,
+	.arg3_type = ARG_ANYTHING,
+	.arg4_type = ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *
+lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_rc_repeat:
+		return &rc_repeat_proto;
+	case BPF_FUNC_rc_keydown:
+		return &rc_keydown_proto;
+	case BPF_FUNC_map_lookup_elem:
+		return &bpf_map_lookup_elem_proto;
+	case BPF_FUNC_map_update_elem:
+		return &bpf_map_update_elem_proto;
+	case BPF_FUNC_map_delete_elem:
+		return &bpf_map_delete_elem_proto;
+	case BPF_FUNC_ktime_get_ns:
+		return &bpf_ktime_get_ns_proto;
+	case BPF_FUNC_tail_call:
+		return &bpf_tail_call_proto;
+	case BPF_FUNC_get_prandom_u32:
+		return &bpf_get_prandom_u32_proto;
+	case BPF_FUNC_trace_printk:
+		if (capable(CAP_SYS_ADMIN))
+			return bpf_get_trace_printk_proto();
+		/* fall through */
+	default:
+		return NULL;
+	}
+}
+
+static bool lirc_mode2_is_valid_access(int off, int size,
+				       enum bpf_access_type type,
+				       const struct bpf_prog *prog,
+				       struct bpf_insn_access_aux *info)
+{
+	/* We have one field of u32 */
+	return type == BPF_READ && off == 0 && size == sizeof(u32);
+}
+
+const struct bpf_verifier_ops lirc_mode2_verifier_ops = {
+	.get_func_proto  = lirc_mode2_func_proto,
+	.is_valid_access = lirc_mode2_is_valid_access
+};
+
+#define BPF_MAX_PROGS 64
+
+static int lirc_bpf_attach(struct rc_dev *rcdev, struct bpf_prog *prog)
+{
+	struct bpf_prog_array __rcu *old_array;
+	struct bpf_prog_array *new_array;
+	struct ir_raw_event_ctrl *raw;
+	int ret;
+
+	if (rcdev->driver_type != RC_DRIVER_IR_RAW)
+		return -EINVAL;
+
+	ret = mutex_lock_interruptible(&ir_raw_handler_lock);
+	if (ret)
+		return ret;
+
+	raw = rcdev->raw;
+	if (!raw) {
+		ret = -ENODEV;
+		goto unlock;
+	}
+
+	if (raw->progs && bpf_prog_array_length(raw->progs) >= BPF_MAX_PROGS) {
+		ret = -E2BIG;
+		goto unlock;
+	}
+
+	old_array = raw->progs;
+	ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
+	if (ret < 0)
+		goto unlock;
+
+	rcu_assign_pointer(raw->progs, new_array);
+	bpf_prog_array_free(old_array);
+
+unlock:
+	mutex_unlock(&ir_raw_handler_lock);
+	return ret;
+}
+
+static int lirc_bpf_detach(struct rc_dev *rcdev, struct bpf_prog *prog)
+{
+	struct bpf_prog_array __rcu *old_array;
+	struct bpf_prog_array *new_array;
+	struct ir_raw_event_ctrl *raw;
+	int ret;
+
+	if (rcdev->driver_type != RC_DRIVER_IR_RAW)
+		return -EINVAL;
+
+	ret = mutex_lock_interruptible(&ir_raw_handler_lock);
+	if (ret)
+		return ret;
+
+	raw = rcdev->raw;
+	if (!raw) {
+		ret = -ENODEV;
+		goto unlock;
+	}
+
+	old_array = raw->progs;
+	ret = bpf_prog_array_copy(old_array, prog, NULL, &new_array);
+	/*
+	 * Do not use bpf_prog_array_delete_safe() as we would end up
+	 * with a dummy entry in the array, and the we would free the
+	 * dummy in lirc_bpf_free()
+	 */
+	if (ret)
+		goto unlock;
+
+	rcu_assign_pointer(raw->progs, new_array);
+	bpf_prog_array_free(old_array);
+unlock:
+	mutex_unlock(&ir_raw_handler_lock);
+	return ret;
+}
+
+void lirc_bpf_run(struct rc_dev *rcdev, u32 sample)
+{
+	struct ir_raw_event_ctrl *raw = rcdev->raw;
+
+	raw->bpf_sample = sample;
+
+	if (raw->progs)
+		BPF_PROG_RUN_ARRAY(raw->progs, &raw->bpf_sample, BPF_PROG_RUN);
+}
+
+/*
+ * This should be called once the rc thread has been stopped, so there can be
+ * no concurrent bpf execution.
+ */
+void lirc_bpf_free(struct rc_dev *rcdev)
+{
+	struct bpf_prog **progs;
+
+	if (!rcdev->raw->progs)
+		return;
+
+	progs = rcu_dereference(rcdev->raw->progs)->progs;
+	while (*progs)
+		bpf_prog_put(*progs++);
+
+	bpf_prog_array_free(rcdev->raw->progs);
+}
+
+int lirc_prog_attach(const union bpf_attr *attr)
+{
+	struct bpf_prog *prog;
+	struct rc_dev *rcdev;
+	int ret;
+
+	if (attr->attach_flags)
+		return -EINVAL;
+
+	prog = bpf_prog_get_type(attr->attach_bpf_fd,
+				 BPF_PROG_TYPE_LIRC_MODE2);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	rcdev = rc_dev_get_from_fd(attr->target_fd);
+	if (IS_ERR(rcdev)) {
+		bpf_prog_put(prog);
+		return PTR_ERR(rcdev);
+	}
+
+	ret = lirc_bpf_attach(rcdev, prog);
+	if (ret)
+		bpf_prog_put(prog);
+
+	put_device(&rcdev->dev);
+
+	return ret;
+}
+
+int lirc_prog_detach(const union bpf_attr *attr)
+{
+	struct bpf_prog *prog;
+	struct rc_dev *rcdev;
+	int ret;
+
+	if (attr->attach_flags)
+		return -EINVAL;
+
+	prog = bpf_prog_get_type(attr->attach_bpf_fd,
+				 BPF_PROG_TYPE_LIRC_MODE2);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	rcdev = rc_dev_get_from_fd(attr->target_fd);
+	if (IS_ERR(rcdev)) {
+		bpf_prog_put(prog);
+		return PTR_ERR(rcdev);
+	}
+
+	ret = lirc_bpf_detach(rcdev, prog);
+
+	bpf_prog_put(prog);
+	put_device(&rcdev->dev);
+
+	return ret;
+}
+
+int lirc_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr)
+{
+	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+	struct bpf_prog_array __rcu *progs;
+	struct rc_dev *rcdev;
+	u32 cnt, flags = 0;
+	int ret;
+
+	if (attr->query.query_flags)
+		return -EINVAL;
+
+	rcdev = rc_dev_get_from_fd(attr->query.target_fd);
+	if (IS_ERR(rcdev))
+		return PTR_ERR(rcdev);
+
+	if (rcdev->driver_type != RC_DRIVER_IR_RAW) {
+		ret = -EINVAL;
+		goto put;
+	}
+
+	ret = mutex_lock_interruptible(&ir_raw_handler_lock);
+	if (ret)
+		goto put;
+
+	progs = rcdev->raw->progs;
+	cnt = progs ? bpf_prog_array_length(progs) : 0;
+
+	if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) {
+		ret = -EFAULT;
+		goto unlock;
+	}
+
+	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) {
+		ret = -EFAULT;
+		goto unlock;
+	}
+
+	if (attr->query.prog_cnt != 0 && prog_ids && cnt)
+		ret = bpf_prog_array_copy_to_user(progs, prog_ids, cnt);
+
+unlock:
+	mutex_unlock(&ir_raw_handler_lock);
+put:
+	put_device(&rcdev->dev);
+
+	return ret;
+}
diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c
index 24e9fbb80e81..da7013a12a58 100644
--- a/drivers/media/rc/lirc_dev.c
+++ b/drivers/media/rc/lirc_dev.c
@@ -20,6 +20,7 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/device.h>
+#include <linux/file.h>
 #include <linux/idr.h>
 #include <linux/poll.h>
 #include <linux/sched.h>
@@ -104,6 +105,12 @@ void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev)
 			TO_US(ev.duration), TO_STR(ev.pulse));
 	}
 
+	/*
+	 * bpf does not care about the gap generated above; that exists
+	 * for backwards compatibility
+	 */
+	lirc_bpf_run(dev, sample);
+
 	spin_lock_irqsave(&dev->lirc_fh_lock, flags);
 	list_for_each_entry(fh, &dev->lirc_fh, list) {
 		if (LIRC_IS_TIMEOUT(sample) && !fh->send_timeout_reports)
@@ -816,4 +823,27 @@ void __exit lirc_dev_exit(void)
 	unregister_chrdev_region(lirc_base_dev, RC_DEV_MAX);
 }
 
+struct rc_dev *rc_dev_get_from_fd(int fd)
+{
+	struct fd f = fdget(fd);
+	struct lirc_fh *fh;
+	struct rc_dev *dev;
+
+	if (!f.file)
+		return ERR_PTR(-EBADF);
+
+	if (f.file->f_op != &lirc_fops) {
+		fdput(f);
+		return ERR_PTR(-EINVAL);
+	}
+
+	fh = f.file->private_data;
+	dev = fh->rc;
+
+	get_device(&dev->dev);
+	fdput(f);
+
+	return dev;
+}
+
 MODULE_ALIAS("lirc_dev");
diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h
index e0e6a17460f6..eb004757038b 100644
--- a/drivers/media/rc/rc-core-priv.h
+++ b/drivers/media/rc/rc-core-priv.h
@@ -13,6 +13,7 @@
 #define	MAX_IR_EVENT_SIZE	512
 
 #include <linux/slab.h>
+#include <uapi/linux/bpf.h>
 #include <media/rc-core.h>
 
 /**
@@ -57,6 +58,11 @@ struct ir_raw_event_ctrl {
 	/* raw decoder state follows */
 	struct ir_raw_event prev_ev;
 	struct ir_raw_event this_ev;
+
+#ifdef CONFIG_BPF_LIRC_MODE2
+	u32				bpf_sample;
+	struct bpf_prog_array __rcu	*progs;
+#endif
 	struct nec_dec {
 		int state;
 		unsigned count;
@@ -126,6 +132,9 @@ struct ir_raw_event_ctrl {
 	} imon;
 };
 
+/* Mutex for locking raw IR processing and handler change */
+extern struct mutex ir_raw_handler_lock;
+
 /* macros for IR decoders */
 static inline bool geq_margin(unsigned d1, unsigned d2, unsigned margin)
 {
@@ -288,6 +297,7 @@ void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev);
 void ir_lirc_scancode_event(struct rc_dev *dev, struct lirc_scancode *lsc);
 int ir_lirc_register(struct rc_dev *dev);
 void ir_lirc_unregister(struct rc_dev *dev);
+struct rc_dev *rc_dev_get_from_fd(int fd);
 #else
 static inline int lirc_dev_init(void) { return 0; }
 static inline void lirc_dev_exit(void) {}
@@ -299,4 +309,15 @@ static inline int ir_lirc_register(struct rc_dev *dev) { return 0; }
 static inline void ir_lirc_unregister(struct rc_dev *dev) { }
 #endif
 
+/*
+ * bpf interface
+ */
+#ifdef CONFIG_BPF_LIRC_MODE2
+void lirc_bpf_free(struct rc_dev *dev);
+void lirc_bpf_run(struct rc_dev *dev, u32 sample);
+#else
+static inline void lirc_bpf_free(struct rc_dev *dev) { }
+static inline void lirc_bpf_run(struct rc_dev *dev, u32 sample) { }
+#endif
+
 #endif /* _RC_CORE_PRIV */
diff --git a/drivers/media/rc/rc-ir-raw.c b/drivers/media/rc/rc-ir-raw.c
index 374f83105a23..7675b7ee5bc7 100644
--- a/drivers/media/rc/rc-ir-raw.c
+++ b/drivers/media/rc/rc-ir-raw.c
@@ -14,7 +14,7 @@
 static LIST_HEAD(ir_raw_client_list);
 
 /* Used to handle IR raw handler extensions */
-static DEFINE_MUTEX(ir_raw_handler_lock);
+DEFINE_MUTEX(ir_raw_handler_lock);
 static LIST_HEAD(ir_raw_handler_list);
 static atomic64_t available_protocols = ATOMIC64_INIT(0);
 
@@ -621,9 +621,17 @@ void ir_raw_event_unregister(struct rc_dev *dev)
 	list_for_each_entry(handler, &ir_raw_handler_list, list)
 		if (handler->raw_unregister)
 			handler->raw_unregister(dev);
-	mutex_unlock(&ir_raw_handler_lock);
+
+	lirc_bpf_free(dev);
 
 	ir_raw_event_free(dev);
+
+	/*
+	 * A user can be calling bpf(BPF_PROG_{QUERY|ATTACH|DETACH}), so
+	 * ensure that the raw member is null on unlock; this is how
+	 * "device gone" is checked.
+	 */
+	mutex_unlock(&ir_raw_handler_lock);
 }
 
 /*
diff --git a/include/linux/bpf_lirc.h b/include/linux/bpf_lirc.h
new file mode 100644
index 000000000000..5f8a4283092d
--- /dev/null
+++ b/include/linux/bpf_lirc.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BPF_LIRC_H
+#define _BPF_LIRC_H
+
+#include <uapi/linux/bpf.h>
+
+#ifdef CONFIG_BPF_LIRC_MODE2
+int lirc_prog_attach(const union bpf_attr *attr);
+int lirc_prog_detach(const union bpf_attr *attr);
+int lirc_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr);
+#else
+static inline int lirc_prog_attach(const union bpf_attr *attr)
+{
+	return -EINVAL;
+}
+
+static inline int lirc_prog_detach(const union bpf_attr *attr)
+{
+	return -EINVAL;
+}
+
+static inline int lirc_prog_query(const union bpf_attr *attr,
+				  union bpf_attr __user *uattr)
+{
+	return -EINVAL;
+}
+#endif
+
+#endif /* _BPF_LIRC_H */
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index b161e506dcfc..c5700c2d5549 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -26,6 +26,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint)
 #ifdef CONFIG_CGROUP_BPF
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
 #endif
+#ifdef CONFIG_BPF_LIRC_MODE2
+BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
+#endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 33f37eb0b6bf..64ac0f7a689e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -143,6 +143,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_RAW_TRACEPOINT,
 	BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
 	BPF_PROG_TYPE_LWT_SEG6LOCAL,
+	BPF_PROG_TYPE_LIRC_MODE2,
 };
 
 enum bpf_attach_type {
@@ -162,6 +163,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_INET6_POST_BIND,
 	BPF_CGROUP_UDP4_SENDMSG,
 	BPF_CGROUP_UDP6_SENDMSG,
+	BPF_LIRC_MODE2,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -2005,6 +2007,53 @@ union bpf_attr {
  * 		direct packet access.
  *	Return
  * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle)
+ *	Description
+ *		This helper is used in programs implementing IR decoding, to
+ *		report a successfully decoded key press with *scancode*,
+ *		*toggle* value in the given *protocol*. The scancode will be
+ *		translated to a keycode using the rc keymap, and reported as
+ *		an input key down event. After a period a key up event is
+ *		generated. This period can be extended by calling either
+ *		**bpf_rc_keydown** () again with the same values, or calling
+ *		**bpf_rc_repeat** ().
+ *
+ *		Some protocols include a toggle bit, in case the button	was
+ *		released and pressed again between consecutive scancodes.
+ *
+ *		The *ctx* should point to the lirc sample as passed into
+ *		the program.
+ *
+ *		The *protocol* is the decoded protocol number (see
+ *		**enum rc_proto** for some predefined values).
+ *
+ *		This helper is only available is the kernel was compiled with
+ *		the **CONFIG_BPF_LIRC_MODE2** configuration option set to
+ *		"**y**".
+ *
+ *	Return
+ *		0
+ *
+ * int bpf_rc_repeat(void *ctx)
+ *	Description
+ *		This helper is used in programs implementing IR decoding, to
+ *		report a successfully decoded repeat key message. This delays
+ *		the generation of a key up event for previously generated
+ *		key down event.
+ *
+ *		Some IR protocols like NEC have a special IR message for
+ *		repeating last button, for when a button is held down.
+ *
+ *		The *ctx* should point to the lirc sample as passed into
+ *		the program.
+ *
+ *		This helper is only available is the kernel was compiled with
+ *		the **CONFIG_BPF_LIRC_MODE2** configuration option set to
+ *		"**y**".
+ *
+ *	Return
+ *		0
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2083,7 +2132,9 @@ union bpf_attr {
 	FN(lwt_push_encap),		\
 	FN(lwt_seg6_store_bytes),	\
 	FN(lwt_seg6_adjust_srh),	\
-	FN(lwt_seg6_action),
+	FN(lwt_seg6_action),		\
+	FN(rc_repeat),			\
+	FN(rc_keydown),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e254526d6744..7365d79ae00d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -11,6 +11,7 @@
  */
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
+#include <linux/bpf_lirc.h>
 #include <linux/btf.h>
 #include <linux/syscalls.h>
 #include <linux/slab.h>
@@ -1582,6 +1583,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
 		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true);
+	case BPF_LIRC_MODE2:
+		return lirc_prog_attach(attr);
 	default:
 		return -EINVAL;
 	}
@@ -1654,6 +1657,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
 		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false);
+	case BPF_LIRC_MODE2:
+		return lirc_prog_detach(attr);
 	default:
 		return -EINVAL;
 	}
@@ -1703,6 +1708,8 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_SOCK_OPS:
 	case BPF_CGROUP_DEVICE:
 		break;
+	case BPF_LIRC_MODE2:
+		return lirc_prog_query(attr, uattr);
 	default:
 		return -EINVAL;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 6bdd533cee9aadbcd476af30bfff079abe68fcdb Mon Sep 17 00:00:00 2001
From: Sean Young <sean@mess.org>
Date: Sun, 27 May 2018 12:24:10 +0100
Subject: bpf: add selftest for lirc_mode2 type program

This is simple test over rc-loopback.

Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Sean Young <sean@mess.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/prog.c                           |   1 +
 tools/include/uapi/linux/bpf.h                     |  79 ++++++--
 tools/include/uapi/linux/lirc.h                    | 217 +++++++++++++++++++++
 tools/lib/bpf/libbpf.c                             |   1 +
 tools/testing/selftests/bpf/.gitignore             |   1 +
 tools/testing/selftests/bpf/Makefile               |   7 +-
 tools/testing/selftests/bpf/bpf_helpers.h          |   5 +
 tools/testing/selftests/bpf/test_lirc_mode2.sh     |  28 +++
 tools/testing/selftests/bpf/test_lirc_mode2_kern.c |  23 +++
 tools/testing/selftests/bpf/test_lirc_mode2_user.c | 149 ++++++++++++++
 10 files changed, 494 insertions(+), 17 deletions(-)
 create mode 100644 tools/include/uapi/linux/lirc.h
 create mode 100755 tools/testing/selftests/bpf/test_lirc_mode2.sh
 create mode 100644 tools/testing/selftests/bpf/test_lirc_mode2_kern.c
 create mode 100644 tools/testing/selftests/bpf/test_lirc_mode2_user.c

diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 39b88e760367..a4f435203fef 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -71,6 +71,7 @@ static const char * const prog_type_name[] = {
 	[BPF_PROG_TYPE_SK_MSG]		= "sk_msg",
 	[BPF_PROG_TYPE_RAW_TRACEPOINT]	= "raw_tracepoint",
 	[BPF_PROG_TYPE_CGROUP_SOCK_ADDR] = "cgroup_sock_addr",
+	[BPF_PROG_TYPE_LIRC_MODE2]	= "lirc_mode2",
 };
 
 static void print_boot_time(__u64 nsecs, char *buf, unsigned int size)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 3f556b35ac8d..64ac0f7a689e 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -143,6 +143,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_RAW_TRACEPOINT,
 	BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
 	BPF_PROG_TYPE_LWT_SEG6LOCAL,
+	BPF_PROG_TYPE_LIRC_MODE2,
 };
 
 enum bpf_attach_type {
@@ -162,6 +163,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_INET6_POST_BIND,
 	BPF_CGROUP_UDP4_SENDMSG,
 	BPF_CGROUP_UDP6_SENDMSG,
+	BPF_LIRC_MODE2,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -1852,10 +1854,10 @@ union bpf_attr {
  *		If lookup is successful and result shows packet is to be
  *		forwarded, the neighbor tables are searched for the nexthop.
  *		If successful (ie., FIB lookup shows forwarding and nexthop
- *		is resolved), the nexthop address is returned in ipv4_dst,
- *		ipv6_dst or mpls_out based on family, smac is set to mac
- *		address of egress device, dmac is set to nexthop mac address,
- *		rt_metric is set to metric from route.
+ *		is resolved), the nexthop address is returned in ipv4_dst
+ *		or ipv6_dst based on family, smac is set to mac address of
+ *		egress device, dmac is set to nexthop mac address, rt_metric
+ *		is set to metric from route (IPv4/IPv6 only).
  *
  *             *plen* argument is the size of the passed in struct.
  *             *flags* argument can be a combination of one or more of the
@@ -2005,6 +2007,53 @@ union bpf_attr {
  * 		direct packet access.
  *	Return
  * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle)
+ *	Description
+ *		This helper is used in programs implementing IR decoding, to
+ *		report a successfully decoded key press with *scancode*,
+ *		*toggle* value in the given *protocol*. The scancode will be
+ *		translated to a keycode using the rc keymap, and reported as
+ *		an input key down event. After a period a key up event is
+ *		generated. This period can be extended by calling either
+ *		**bpf_rc_keydown** () again with the same values, or calling
+ *		**bpf_rc_repeat** ().
+ *
+ *		Some protocols include a toggle bit, in case the button	was
+ *		released and pressed again between consecutive scancodes.
+ *
+ *		The *ctx* should point to the lirc sample as passed into
+ *		the program.
+ *
+ *		The *protocol* is the decoded protocol number (see
+ *		**enum rc_proto** for some predefined values).
+ *
+ *		This helper is only available is the kernel was compiled with
+ *		the **CONFIG_BPF_LIRC_MODE2** configuration option set to
+ *		"**y**".
+ *
+ *	Return
+ *		0
+ *
+ * int bpf_rc_repeat(void *ctx)
+ *	Description
+ *		This helper is used in programs implementing IR decoding, to
+ *		report a successfully decoded repeat key message. This delays
+ *		the generation of a key up event for previously generated
+ *		key down event.
+ *
+ *		Some IR protocols like NEC have a special IR message for
+ *		repeating last button, for when a button is held down.
+ *
+ *		The *ctx* should point to the lirc sample as passed into
+ *		the program.
+ *
+ *		This helper is only available is the kernel was compiled with
+ *		the **CONFIG_BPF_LIRC_MODE2** configuration option set to
+ *		"**y**".
+ *
+ *	Return
+ *		0
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2083,7 +2132,9 @@ union bpf_attr {
 	FN(lwt_push_encap),		\
 	FN(lwt_seg6_store_bytes),	\
 	FN(lwt_seg6_adjust_srh),	\
-	FN(lwt_seg6_action),
+	FN(lwt_seg6_action),		\
+	FN(rc_repeat),			\
+	FN(rc_keydown),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2537,8 +2588,10 @@ struct bpf_raw_tracepoint_args {
 #define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
 
 struct bpf_fib_lookup {
-	/* input */
-	__u8	family;   /* network family, AF_INET, AF_INET6, AF_MPLS */
+	/* input:  network family for lookup (AF_INET, AF_INET6)
+	 * output: network family of egress nexthop
+	 */
+	__u8	family;
 
 	/* set if lookup is to consider L4 data - e.g., FIB rules */
 	__u8	l4_protocol;
@@ -2554,22 +2607,20 @@ struct bpf_fib_lookup {
 		__u8	tos;		/* AF_INET  */
 		__be32	flowlabel;	/* AF_INET6 */
 
-		/* output: metric of fib result */
-		__u32 rt_metric;
+		/* output: metric of fib result (IPv4/IPv6 only) */
+		__u32	rt_metric;
 	};
 
 	union {
-		__be32		mpls_in;
 		__be32		ipv4_src;
 		__u32		ipv6_src[4];  /* in6_addr; network order */
 	};
 
-	/* input to bpf_fib_lookup, *dst is destination address.
-	 * output: bpf_fib_lookup sets to gateway address
+	/* input to bpf_fib_lookup, ipv{4,6}_dst is destination address in
+	 * network header. output: bpf_fib_lookup sets to gateway address
+	 * if FIB lookup returns gateway route
 	 */
 	union {
-		/* return for MPLS lookups */
-		__be32		mpls_out[4];  /* support up to 4 labels */
 		__be32		ipv4_dst;
 		__u32		ipv6_dst[4];  /* in6_addr; network order */
 	};
diff --git a/tools/include/uapi/linux/lirc.h b/tools/include/uapi/linux/lirc.h
new file mode 100644
index 000000000000..f189931042a7
--- /dev/null
+++ b/tools/include/uapi/linux/lirc.h
@@ -0,0 +1,217 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * lirc.h - linux infrared remote control header file
+ * last modified 2010/07/13 by Jarod Wilson
+ */
+
+#ifndef _LINUX_LIRC_H
+#define _LINUX_LIRC_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define PULSE_BIT       0x01000000
+#define PULSE_MASK      0x00FFFFFF
+
+#define LIRC_MODE2_SPACE     0x00000000
+#define LIRC_MODE2_PULSE     0x01000000
+#define LIRC_MODE2_FREQUENCY 0x02000000
+#define LIRC_MODE2_TIMEOUT   0x03000000
+
+#define LIRC_VALUE_MASK      0x00FFFFFF
+#define LIRC_MODE2_MASK      0xFF000000
+
+#define LIRC_SPACE(val) (((val)&LIRC_VALUE_MASK) | LIRC_MODE2_SPACE)
+#define LIRC_PULSE(val) (((val)&LIRC_VALUE_MASK) | LIRC_MODE2_PULSE)
+#define LIRC_FREQUENCY(val) (((val)&LIRC_VALUE_MASK) | LIRC_MODE2_FREQUENCY)
+#define LIRC_TIMEOUT(val) (((val)&LIRC_VALUE_MASK) | LIRC_MODE2_TIMEOUT)
+
+#define LIRC_VALUE(val) ((val)&LIRC_VALUE_MASK)
+#define LIRC_MODE2(val) ((val)&LIRC_MODE2_MASK)
+
+#define LIRC_IS_SPACE(val) (LIRC_MODE2(val) == LIRC_MODE2_SPACE)
+#define LIRC_IS_PULSE(val) (LIRC_MODE2(val) == LIRC_MODE2_PULSE)
+#define LIRC_IS_FREQUENCY(val) (LIRC_MODE2(val) == LIRC_MODE2_FREQUENCY)
+#define LIRC_IS_TIMEOUT(val) (LIRC_MODE2(val) == LIRC_MODE2_TIMEOUT)
+
+/* used heavily by lirc userspace */
+#define lirc_t int
+
+/*** lirc compatible hardware features ***/
+
+#define LIRC_MODE2SEND(x) (x)
+#define LIRC_SEND2MODE(x) (x)
+#define LIRC_MODE2REC(x) ((x) << 16)
+#define LIRC_REC2MODE(x) ((x) >> 16)
+
+#define LIRC_MODE_RAW                  0x00000001
+#define LIRC_MODE_PULSE                0x00000002
+#define LIRC_MODE_MODE2                0x00000004
+#define LIRC_MODE_SCANCODE             0x00000008
+#define LIRC_MODE_LIRCCODE             0x00000010
+
+
+#define LIRC_CAN_SEND_RAW              LIRC_MODE2SEND(LIRC_MODE_RAW)
+#define LIRC_CAN_SEND_PULSE            LIRC_MODE2SEND(LIRC_MODE_PULSE)
+#define LIRC_CAN_SEND_MODE2            LIRC_MODE2SEND(LIRC_MODE_MODE2)
+#define LIRC_CAN_SEND_LIRCCODE         LIRC_MODE2SEND(LIRC_MODE_LIRCCODE)
+
+#define LIRC_CAN_SEND_MASK             0x0000003f
+
+#define LIRC_CAN_SET_SEND_CARRIER      0x00000100
+#define LIRC_CAN_SET_SEND_DUTY_CYCLE   0x00000200
+#define LIRC_CAN_SET_TRANSMITTER_MASK  0x00000400
+
+#define LIRC_CAN_REC_RAW               LIRC_MODE2REC(LIRC_MODE_RAW)
+#define LIRC_CAN_REC_PULSE             LIRC_MODE2REC(LIRC_MODE_PULSE)
+#define LIRC_CAN_REC_MODE2             LIRC_MODE2REC(LIRC_MODE_MODE2)
+#define LIRC_CAN_REC_SCANCODE          LIRC_MODE2REC(LIRC_MODE_SCANCODE)
+#define LIRC_CAN_REC_LIRCCODE          LIRC_MODE2REC(LIRC_MODE_LIRCCODE)
+
+#define LIRC_CAN_REC_MASK              LIRC_MODE2REC(LIRC_CAN_SEND_MASK)
+
+#define LIRC_CAN_SET_REC_CARRIER       (LIRC_CAN_SET_SEND_CARRIER << 16)
+#define LIRC_CAN_SET_REC_DUTY_CYCLE    (LIRC_CAN_SET_SEND_DUTY_CYCLE << 16)
+
+#define LIRC_CAN_SET_REC_DUTY_CYCLE_RANGE 0x40000000
+#define LIRC_CAN_SET_REC_CARRIER_RANGE    0x80000000
+#define LIRC_CAN_GET_REC_RESOLUTION       0x20000000
+#define LIRC_CAN_SET_REC_TIMEOUT          0x10000000
+#define LIRC_CAN_SET_REC_FILTER           0x08000000
+
+#define LIRC_CAN_MEASURE_CARRIER          0x02000000
+#define LIRC_CAN_USE_WIDEBAND_RECEIVER    0x04000000
+
+#define LIRC_CAN_SEND(x) ((x)&LIRC_CAN_SEND_MASK)
+#define LIRC_CAN_REC(x) ((x)&LIRC_CAN_REC_MASK)
+
+#define LIRC_CAN_NOTIFY_DECODE            0x01000000
+
+/*** IOCTL commands for lirc driver ***/
+
+#define LIRC_GET_FEATURES              _IOR('i', 0x00000000, __u32)
+
+#define LIRC_GET_SEND_MODE             _IOR('i', 0x00000001, __u32)
+#define LIRC_GET_REC_MODE              _IOR('i', 0x00000002, __u32)
+#define LIRC_GET_REC_RESOLUTION        _IOR('i', 0x00000007, __u32)
+
+#define LIRC_GET_MIN_TIMEOUT           _IOR('i', 0x00000008, __u32)
+#define LIRC_GET_MAX_TIMEOUT           _IOR('i', 0x00000009, __u32)
+
+/* code length in bits, currently only for LIRC_MODE_LIRCCODE */
+#define LIRC_GET_LENGTH                _IOR('i', 0x0000000f, __u32)
+
+#define LIRC_SET_SEND_MODE             _IOW('i', 0x00000011, __u32)
+#define LIRC_SET_REC_MODE              _IOW('i', 0x00000012, __u32)
+/* Note: these can reset the according pulse_width */
+#define LIRC_SET_SEND_CARRIER          _IOW('i', 0x00000013, __u32)
+#define LIRC_SET_REC_CARRIER           _IOW('i', 0x00000014, __u32)
+#define LIRC_SET_SEND_DUTY_CYCLE       _IOW('i', 0x00000015, __u32)
+#define LIRC_SET_TRANSMITTER_MASK      _IOW('i', 0x00000017, __u32)
+
+/*
+ * when a timeout != 0 is set the driver will send a
+ * LIRC_MODE2_TIMEOUT data packet, otherwise LIRC_MODE2_TIMEOUT is
+ * never sent, timeout is disabled by default
+ */
+#define LIRC_SET_REC_TIMEOUT           _IOW('i', 0x00000018, __u32)
+
+/* 1 enables, 0 disables timeout reports in MODE2 */
+#define LIRC_SET_REC_TIMEOUT_REPORTS   _IOW('i', 0x00000019, __u32)
+
+/*
+ * if enabled from the next key press on the driver will send
+ * LIRC_MODE2_FREQUENCY packets
+ */
+#define LIRC_SET_MEASURE_CARRIER_MODE	_IOW('i', 0x0000001d, __u32)
+
+/*
+ * to set a range use LIRC_SET_REC_CARRIER_RANGE with the
+ * lower bound first and later LIRC_SET_REC_CARRIER with the upper bound
+ */
+#define LIRC_SET_REC_CARRIER_RANGE     _IOW('i', 0x0000001f, __u32)
+
+#define LIRC_SET_WIDEBAND_RECEIVER     _IOW('i', 0x00000023, __u32)
+
+/*
+ * struct lirc_scancode - decoded scancode with protocol for use with
+ *	LIRC_MODE_SCANCODE
+ *
+ * @timestamp: Timestamp in nanoseconds using CLOCK_MONOTONIC when IR
+ *	was decoded.
+ * @flags: should be 0 for transmit. When receiving scancodes,
+ *	LIRC_SCANCODE_FLAG_TOGGLE or LIRC_SCANCODE_FLAG_REPEAT can be set
+ *	depending on the protocol
+ * @rc_proto: see enum rc_proto
+ * @keycode: the translated keycode. Set to 0 for transmit.
+ * @scancode: the scancode received or to be sent
+ */
+struct lirc_scancode {
+	__u64	timestamp;
+	__u16	flags;
+	__u16	rc_proto;
+	__u32	keycode;
+	__u64	scancode;
+};
+
+/* Set if the toggle bit of rc-5 or rc-6 is enabled */
+#define LIRC_SCANCODE_FLAG_TOGGLE	1
+/* Set if this is a nec or sanyo repeat */
+#define LIRC_SCANCODE_FLAG_REPEAT	2
+
+/**
+ * enum rc_proto - the Remote Controller protocol
+ *
+ * @RC_PROTO_UNKNOWN: Protocol not known
+ * @RC_PROTO_OTHER: Protocol known but proprietary
+ * @RC_PROTO_RC5: Philips RC5 protocol
+ * @RC_PROTO_RC5X_20: Philips RC5x 20 bit protocol
+ * @RC_PROTO_RC5_SZ: StreamZap variant of RC5
+ * @RC_PROTO_JVC: JVC protocol
+ * @RC_PROTO_SONY12: Sony 12 bit protocol
+ * @RC_PROTO_SONY15: Sony 15 bit protocol
+ * @RC_PROTO_SONY20: Sony 20 bit protocol
+ * @RC_PROTO_NEC: NEC protocol
+ * @RC_PROTO_NECX: Extended NEC protocol
+ * @RC_PROTO_NEC32: NEC 32 bit protocol
+ * @RC_PROTO_SANYO: Sanyo protocol
+ * @RC_PROTO_MCIR2_KBD: RC6-ish MCE keyboard
+ * @RC_PROTO_MCIR2_MSE: RC6-ish MCE mouse
+ * @RC_PROTO_RC6_0: Philips RC6-0-16 protocol
+ * @RC_PROTO_RC6_6A_20: Philips RC6-6A-20 protocol
+ * @RC_PROTO_RC6_6A_24: Philips RC6-6A-24 protocol
+ * @RC_PROTO_RC6_6A_32: Philips RC6-6A-32 protocol
+ * @RC_PROTO_RC6_MCE: MCE (Philips RC6-6A-32 subtype) protocol
+ * @RC_PROTO_SHARP: Sharp protocol
+ * @RC_PROTO_XMP: XMP protocol
+ * @RC_PROTO_CEC: CEC protocol
+ * @RC_PROTO_IMON: iMon Pad protocol
+ */
+enum rc_proto {
+	RC_PROTO_UNKNOWN	= 0,
+	RC_PROTO_OTHER		= 1,
+	RC_PROTO_RC5		= 2,
+	RC_PROTO_RC5X_20	= 3,
+	RC_PROTO_RC5_SZ		= 4,
+	RC_PROTO_JVC		= 5,
+	RC_PROTO_SONY12		= 6,
+	RC_PROTO_SONY15		= 7,
+	RC_PROTO_SONY20		= 8,
+	RC_PROTO_NEC		= 9,
+	RC_PROTO_NECX		= 10,
+	RC_PROTO_NEC32		= 11,
+	RC_PROTO_SANYO		= 12,
+	RC_PROTO_MCIR2_KBD	= 13,
+	RC_PROTO_MCIR2_MSE	= 14,
+	RC_PROTO_RC6_0		= 15,
+	RC_PROTO_RC6_6A_20	= 16,
+	RC_PROTO_RC6_6A_24	= 17,
+	RC_PROTO_RC6_6A_32	= 18,
+	RC_PROTO_RC6_MCE	= 19,
+	RC_PROTO_SHARP		= 20,
+	RC_PROTO_XMP		= 21,
+	RC_PROTO_CEC		= 22,
+	RC_PROTO_IMON		= 23,
+};
+
+#endif
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index b1a60ac8424e..a1e96b5de5ff 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1462,6 +1462,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type)
 	case BPF_PROG_TYPE_CGROUP_DEVICE:
 	case BPF_PROG_TYPE_SK_MSG:
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+	case BPF_PROG_TYPE_LIRC_MODE2:
 		return false;
 	case BPF_PROG_TYPE_UNSPEC:
 	case BPF_PROG_TYPE_KPROBE:
diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index adc8e5474b66..6ea835982464 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -17,3 +17,4 @@ test_sock_addr
 urandom_read
 test_btf
 test_sockmap
+test_lirc_mode2_user
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index a1b66da965d0..553d1816b77a 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -24,7 +24,7 @@ urandom_read: urandom_read.c
 # Order correspond to 'make run_tests' order
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
 	test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \
-	test_sock test_btf test_sockmap
+	test_sock test_btf test_sockmap test_lirc_mode2_user
 
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
 	test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
@@ -34,7 +34,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \
 	test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \
 	test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \
-	test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o
+	test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
@@ -44,7 +44,8 @@ TEST_PROGS := test_kmod.sh \
 	test_offload.py \
 	test_sock_addr.sh \
 	test_tunnel.sh \
-	test_lwt_seg6local.sh
+	test_lwt_seg6local.sh \
+	test_lirc_mode2.sh
 
 # Compile but not part of 'make run_tests'
 TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 334d3e8c5e89..a66a9d91acf4 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -126,6 +126,11 @@ static int (*bpf_lwt_seg6_action)(void *ctx, unsigned int action, void *param,
 static int (*bpf_lwt_seg6_adjust_srh)(void *ctx, unsigned int offset,
 				      unsigned int len) =
 	(void *) BPF_FUNC_lwt_seg6_adjust_srh;
+static int (*bpf_rc_repeat)(void *ctx) =
+	(void *) BPF_FUNC_rc_repeat;
+static int (*bpf_rc_keydown)(void *ctx, unsigned int protocol,
+			     unsigned long long scancode, unsigned int toggle) =
+	(void *) BPF_FUNC_rc_keydown;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/tools/testing/selftests/bpf/test_lirc_mode2.sh b/tools/testing/selftests/bpf/test_lirc_mode2.sh
new file mode 100755
index 000000000000..ce2e15e4f976
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lirc_mode2.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+GREEN='\033[0;92m'
+RED='\033[0;31m'
+NC='\033[0m' # No Color
+
+modprobe rc-loopback
+
+for i in /sys/class/rc/rc*
+do
+	if grep -q DRV_NAME=rc-loopback $i/uevent
+	then
+		LIRCDEV=$(grep DEVNAME= $i/lirc*/uevent | sed sQDEVNAME=Q/dev/Q)
+	fi
+done
+
+if [ -n $LIRCDEV ];
+then
+	TYPE=lirc_mode2
+	./test_lirc_mode2_user $LIRCDEV
+	ret=$?
+	if [ $ret -ne 0 ]; then
+		echo -e ${RED}"FAIL: $TYPE"${NC}
+	else
+		echo -e ${GREEN}"PASS: $TYPE"${NC}
+	fi
+fi
diff --git a/tools/testing/selftests/bpf/test_lirc_mode2_kern.c b/tools/testing/selftests/bpf/test_lirc_mode2_kern.c
new file mode 100644
index 000000000000..ba26855563a5
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lirc_mode2_kern.c
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0
+// test ir decoder
+//
+// Copyright (C) 2018 Sean Young <sean@mess.org>
+
+#include <linux/bpf.h>
+#include <linux/lirc.h>
+#include "bpf_helpers.h"
+
+SEC("lirc_mode2")
+int bpf_decoder(unsigned int *sample)
+{
+	if (LIRC_IS_PULSE(*sample)) {
+		unsigned int duration = LIRC_VALUE(*sample);
+
+		if (duration & 0x10000)
+			bpf_rc_keydown(sample, 0x40, duration & 0xffff, 0);
+	}
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_lirc_mode2_user.c b/tools/testing/selftests/bpf/test_lirc_mode2_user.c
new file mode 100644
index 000000000000..d470d63c33db
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lirc_mode2_user.c
@@ -0,0 +1,149 @@
+// SPDX-License-Identifier: GPL-2.0
+// test ir decoder
+//
+// Copyright (C) 2018 Sean Young <sean@mess.org>
+
+// A lirc chardev is a device representing a consumer IR (cir) device which
+// can receive infrared signals from remote control and/or transmit IR.
+//
+// IR is sent as a series of pulses and space somewhat like morse code. The
+// BPF program can decode this into scancodes so that rc-core can translate
+// this into input key codes using the rc keymap.
+//
+// This test works by sending IR over rc-loopback, so the IR is processed by
+// BPF and then decoded into scancodes. The lirc chardev must be the one
+// associated with rc-loopback, see the output of ir-keytable(1).
+//
+// The following CONFIG options must be enabled for the test to succeed:
+// CONFIG_RC_CORE=y
+// CONFIG_BPF_RAWIR_EVENT=y
+// CONFIG_RC_LOOPBACK=y
+
+// Steps:
+// 1. Open the /dev/lircN device for rc-loopback (given on command line)
+// 2. Attach bpf_lirc_mode2 program which decodes some IR.
+// 3. Send some IR to the same IR device; since it is loopback, this will
+//    end up in the bpf program
+// 4. bpf program should decode IR and report keycode
+// 5. We can read keycode from same /dev/lirc device
+
+#include <linux/bpf.h>
+#include <linux/lirc.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <poll.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include "bpf_util.h"
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+int main(int argc, char **argv)
+{
+	struct bpf_object *obj;
+	int ret, lircfd, progfd, mode;
+	int testir = 0x1dead;
+	u32 prog_ids[10], prog_flags[10], prog_cnt;
+
+	if (argc != 2) {
+		printf("Usage: %s /dev/lircN\n", argv[0]);
+		return 2;
+	}
+
+	ret = bpf_prog_load("test_lirc_mode2_kern.o",
+			    BPF_PROG_TYPE_LIRC_MODE2, &obj, &progfd);
+	if (ret) {
+		printf("Failed to load bpf program\n");
+		return 1;
+	}
+
+	lircfd = open(argv[1], O_RDWR | O_NONBLOCK);
+	if (lircfd == -1) {
+		printf("failed to open lirc device %s: %m\n", argv[1]);
+		return 1;
+	}
+
+	/* Let's try detach it before it was ever attached */
+	ret = bpf_prog_detach2(progfd, lircfd, BPF_LIRC_MODE2);
+	if (ret != -1 || errno != ENOENT) {
+		printf("bpf_prog_detach2 not attached should fail: %m\n");
+		return 1;
+	}
+
+	mode = LIRC_MODE_SCANCODE;
+	if (ioctl(lircfd, LIRC_SET_REC_MODE, &mode)) {
+		printf("failed to set rec mode: %m\n");
+		return 1;
+	}
+
+	prog_cnt = 10;
+	ret = bpf_prog_query(lircfd, BPF_LIRC_MODE2, 0, prog_flags, prog_ids,
+			     &prog_cnt);
+	if (ret) {
+		printf("Failed to query bpf programs on lirc device: %m\n");
+		return 1;
+	}
+
+	if (prog_cnt != 0) {
+		printf("Expected nothing to be attached\n");
+		return 1;
+	}
+
+	ret = bpf_prog_attach(progfd, lircfd, BPF_LIRC_MODE2, 0);
+	if (ret) {
+		printf("Failed to attach bpf to lirc device: %m\n");
+		return 1;
+	}
+
+	/* Write raw IR */
+	ret = write(lircfd, &testir, sizeof(testir));
+	if (ret != sizeof(testir)) {
+		printf("Failed to send test IR message: %m\n");
+		return 1;
+	}
+
+	struct pollfd pfd = { .fd = lircfd, .events = POLLIN };
+	struct lirc_scancode lsc;
+
+	poll(&pfd, 1, 100);
+
+	/* Read decoded IR */
+	ret = read(lircfd, &lsc, sizeof(lsc));
+	if (ret != sizeof(lsc)) {
+		printf("Failed to read decoded IR: %m\n");
+		return 1;
+	}
+
+	if (lsc.scancode != 0xdead || lsc.rc_proto != 64) {
+		printf("Incorrect scancode decoded\n");
+		return 1;
+	}
+
+	prog_cnt = 10;
+	ret = bpf_prog_query(lircfd, BPF_LIRC_MODE2, 0, prog_flags, prog_ids,
+			     &prog_cnt);
+	if (ret) {
+		printf("Failed to query bpf programs on lirc device: %m\n");
+		return 1;
+	}
+
+	if (prog_cnt != 1) {
+		printf("Expected one program to be attached\n");
+		return 1;
+	}
+
+	/* Let's try detaching it now it is actually attached */
+	ret = bpf_prog_detach2(progfd, lircfd, BPF_LIRC_MODE2);
+	if (ret) {
+		printf("bpf_prog_detach2: returned %m\n");
+		return 1;
+	}
+
+	return 0;
+}
-- 
cgit v1.2.3-59-g8ed1b


From 13a370b9d275959ac75e92dc14e43eeae75804f8 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Tue, 29 May 2018 13:29:31 -0700
Subject: bpftool: Support sendmsg{4,6} attach types

Add support for recently added BPF_CGROUP_UDP4_SENDMSG and
BPF_CGROUP_UDP6_SENDMSG attach types to bpftool, update documentation
and bash completion.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/Documentation/bpftool-cgroup.rst | 9 +++++++--
 tools/bpf/bpftool/bash-completion/bpftool          | 5 +++--
 tools/bpf/bpftool/cgroup.c                         | 4 +++-
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
index d004f6382dab..7b0e6d453e92 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
@@ -27,7 +27,8 @@ MAP COMMANDS
 |
 |	*PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
 |	*ATTACH_TYPE* := { **ingress** | **egress** | **sock_create** | **sock_ops** | **device** |
-|		**bind4** | **bind6** | **post_bind4** | **post_bind6** | **connect4** | **connect6** }
+|		**bind4** | **bind6** | **post_bind4** | **post_bind6** | **connect4** | **connect6** |
+|               **sendmsg4** | **sendmsg6** }
 |	*ATTACH_FLAGS* := { **multi** | **override** }
 
 DESCRIPTION
@@ -70,7 +71,11 @@ DESCRIPTION
 		  **post_bind4** return from bind(2) for an inet4 socket (since 4.17);
 		  **post_bind6** return from bind(2) for an inet6 socket (since 4.17);
 		  **connect4** call to connect(2) for an inet4 socket (since 4.17);
-		  **connect6** call to connect(2) for an inet6 socket (since 4.17).
+		  **connect6** call to connect(2) for an inet6 socket (since 4.17);
+		  **sendmsg4** call to sendto(2), sendmsg(2), sendmmsg(2) for an
+		  unconnected udp4 socket (since 4.18);
+		  **sendmsg6** call to sendto(2), sendmsg(2), sendmmsg(2) for an
+		  unconnected udp6 socket (since 4.18).
 
 	**bpftool cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG*
 		  Detach *PROG* from the cgroup *CGROUP* and attach type
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index 7bc198d60de2..1e1083321643 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -407,7 +407,7 @@ _bpftool()
                 attach|detach)
                     local ATTACH_TYPES='ingress egress sock_create sock_ops \
                         device bind4 bind6 post_bind4 post_bind6 connect4 \
-                        connect6'
+                        connect6 sendmsg4 sendmsg6'
                     local ATTACH_FLAGS='multi override'
                     local PROG_TYPE='id pinned tag'
                     case $prev in
@@ -416,7 +416,8 @@ _bpftool()
                             return 0
                             ;;
                         ingress|egress|sock_create|sock_ops|device|bind4|bind6|\
-                        post_bind4|post_bind6|connect4|connect6)
+                        post_bind4|post_bind6|connect4|connect6|sendmsg4|\
+                        sendmsg6)
                             COMPREPLY=( $( compgen -W "$PROG_TYPE" -- \
                                 "$cur" ) )
                             return 0
diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c
index 1351bd6b5aa7..16bee011e16c 100644
--- a/tools/bpf/bpftool/cgroup.c
+++ b/tools/bpf/bpftool/cgroup.c
@@ -20,7 +20,7 @@
 	"       ATTACH_TYPE := { ingress | egress | sock_create |\n"	       \
 	"                        sock_ops | device | bind4 | bind6 |\n"	       \
 	"                        post_bind4 | post_bind6 | connect4 |\n"       \
-	"                        connect6 }"
+	"                        connect6 | sendmsg4 | sendmsg6 }"
 
 static const char * const attach_type_strings[] = {
 	[BPF_CGROUP_INET_INGRESS] = "ingress",
@@ -34,6 +34,8 @@ static const char * const attach_type_strings[] = {
 	[BPF_CGROUP_INET6_CONNECT] = "connect6",
 	[BPF_CGROUP_INET4_POST_BIND] = "post_bind4",
 	[BPF_CGROUP_INET6_POST_BIND] = "post_bind6",
+	[BPF_CGROUP_UDP4_SENDMSG] = "sendmsg4",
+	[BPF_CGROUP_UDP6_SENDMSG] = "sendmsg6",
 	[__MAX_BPF_ATTACH_TYPE] = NULL,
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 1cd2fabf4bdcf95eda6a1bcebc4a0a965509da36 Mon Sep 17 00:00:00 2001
From: Artiom Vaskov <velemas@gmail.com>
Date: Wed, 30 May 2018 11:23:00 +0300
Subject: Bluetooth: btusb: Add additional device ID for RTL8822BE

The Asus ROG GL702ZC laptop contains a Realtek RTL8822BE device with
an associated BT chip using a USB ID of 13d3:3526. This ID is added
to the driver.

The /sys/kernel/debug/usb/devices portion for this device is:

T:  Bus=01 Lev=01 Prnt=01 Port=09 Cnt=04 Dev#=  5 Spd=12   MxCh= 0
D:  Ver= 1.10 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs=  1
P:  Vendor=13d3 ProdID=3526 Rev= 1.10
S:  Manufacturer=Realtek
S:  Product=Bluetooth Radio
S:  SerialNumber=00e04c000001
C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=500mA
I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=81(I) Atr=03(Int.) MxPS=  16 Ivl=1ms
E:  Ad=02(O) Atr=02(Bulk) MxPS=  64 Ivl=0ms
E:  Ad=82(I) Atr=02(Bulk) MxPS=  64 Ivl=0ms
I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=   0 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=   0 Ivl=1ms
I:  If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=   9 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=   9 Ivl=1ms
I:  If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  17 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  17 Ivl=1ms
I:  If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  25 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  25 Ivl=1ms
I:  If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  33 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  33 Ivl=1ms
I:  If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  49 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  49 Ivl=1ms

Signed-off-by: Artiom Vaskov <velemas@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/btusb.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index 3a477b6b3ce6..f73a27ea28cc 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -384,6 +384,7 @@ static const struct usb_device_id blacklist_table[] = {
 	{ USB_DEVICE(0x13d3, 0x3462), .driver_info = BTUSB_REALTEK },
 
 	/* Additional Realtek 8822BE Bluetooth devices */
+	{ USB_DEVICE(0x13d3, 0x3526), .driver_info = BTUSB_REALTEK },
 	{ USB_DEVICE(0x0b05, 0x185c), .driver_info = BTUSB_REALTEK },
 
 	/* Silicon Wave based devices */
-- 
cgit v1.2.3-59-g8ed1b


From 3ded9f2b35f02ca00523c79f88f6cf5050622384 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 28 May 2018 17:49:46 +0200
Subject: net: ethernet: freescale: fix false-positive string overflow warning

While compile-testing on arm64 with gcc-8.1, I ran into a build diagnostic:

drivers/net/ethernet/freescale/fec_main.c: In function 'fec_probe':
drivers/net/ethernet/freescale/fec_main.c:3517:25: error: '%d' directive writing between 1 and 10 bytes into a region of size 5 [-Werror=format-overflow=]
   sprintf(irq_name, "int%d", i);
                         ^~
drivers/net/ethernet/freescale/fec_main.c:3517:21: note: directive argument in the range [0, 2147483646]
   sprintf(irq_name, "int%d", i);
                     ^~~~~~~
drivers/net/ethernet/freescale/fec_main.c:3517:3: note: 'sprintf' output between 5 and 14 bytes into a destination of size 8
   sprintf(irq_name, "int%d", i);
   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~

It appears this has never shown on ppc32 or arm32 for an unknown reason, but
now gcc fails to identify that the 'irq_cnt' loop index has an upper bound
of 3, and instead uses a bogus range.

To work around the warning, this changes the sprintf to snprintf with the
correct buffer length.

Fixes: 78cc6e7ef957 ("net: ethernet: freescale: Allow FEC with COMPILE_TEST")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Fugang Duan <fugang.duan@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/freescale/fec_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index ab7521c04eb2..c729665107f5 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -3514,7 +3514,7 @@ fec_probe(struct platform_device *pdev)
 		goto failed_init;
 
 	for (i = 0; i < irq_cnt; i++) {
-		sprintf(irq_name, "int%d", i);
+		snprintf(irq_name, sizeof(irq_name), "int%d", i);
 		irq = platform_get_irq_byname(pdev, irq_name);
 		if (irq < 0)
 			irq = platform_get_irq(pdev, i);
-- 
cgit v1.2.3-59-g8ed1b


From 37c9102f216c2ecd4296de1027fbfc4fa7d73877 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 28 May 2018 17:50:20 +0200
Subject: net: davinci: fix building davinci mdio code without CONFIG_OF

Test-building this driver on targets without CONFIG_OF revealed a build
failure:

drivers/net/ethernet/ti/davinci_mdio.c: In function 'davinci_mdio_probe':
drivers/net/ethernet/ti/davinci_mdio.c:380:9: error: implicit declaration of function 'davinci_mdio_probe_dt'; did you mean 'davinci_mdio_probe'? [-Werror=implicit-function-declaration]

This adjusts the #ifdef logic in the driver to make it build in
all configurations.

Fixes: 2652113ff043 ("net: ethernet: ti: Allow most drivers with COMPILE_TEST")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Sekhar Nori <nsekhar@ti.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/davinci_mdio.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/ti/davinci_mdio.c b/drivers/net/ethernet/ti/davinci_mdio.c
index 8ac72831af05..a98aedae1b41 100644
--- a/drivers/net/ethernet/ti/davinci_mdio.c
+++ b/drivers/net/ethernet/ti/davinci_mdio.c
@@ -321,7 +321,6 @@ static int davinci_mdio_write(struct mii_bus *bus, int phy_id,
 	return ret;
 }
 
-#if IS_ENABLED(CONFIG_OF)
 static int davinci_mdio_probe_dt(struct mdio_platform_data *data,
 			 struct platform_device *pdev)
 {
@@ -339,7 +338,6 @@ static int davinci_mdio_probe_dt(struct mdio_platform_data *data,
 
 	return 0;
 }
-#endif
 
 #if IS_ENABLED(CONFIG_OF)
 static const struct davinci_mdio_of_param of_cpsw_mdio_data = {
@@ -374,7 +372,7 @@ static int davinci_mdio_probe(struct platform_device *pdev)
 		return -ENOMEM;
 	}
 
-	if (dev->of_node) {
+	if (IS_ENABLED(CONFIG_OF) && dev->of_node) {
 		const struct of_device_id	*of_id;
 
 		ret = davinci_mdio_probe_dt(&data->pdata, pdev);
-- 
cgit v1.2.3-59-g8ed1b


From d602de8e7e7fc25fb3a2112ce4285962f15aa549 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Mon, 28 May 2018 19:51:57 -0700
Subject: drivers/net: Fix various unnecessary characters after logging
 newlines

Remove and coalesce formats when there is an unnecessary
character after a logging newline.  These extra characters
cause logging defects.

Miscellanea:

o Coalesce formats

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/cavium/liquidio/lio_main.c             | 2 +-
 drivers/net/ethernet/qlogic/netxen/netxen_nic_ctx.c         | 6 ++----
 drivers/net/ethernet/qlogic/qed/qed_dev.c                   | 2 +-
 drivers/net/ethernet/qlogic/qlge/qlge_main.c                | 4 ++--
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c | 2 +-
 drivers/net/wireless/intel/ipw2x00/ipw2200.c                | 3 +--
 drivers/net/wireless/intersil/prism54/islpci_eth.c          | 6 +++---
 drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192e.c      | 4 ++--
 drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8723b.c      | 4 ++--
 drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c       | 4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8821ae/dm.c         | 2 +-
 11 files changed, 18 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index e500528ad751..8a815bb57177 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -1766,7 +1766,7 @@ static int load_firmware(struct octeon_device *oct)
 
 	ret = request_firmware(&fw, fw_name, &oct->pci_dev->dev);
 	if (ret) {
-		dev_err(&oct->pci_dev->dev, "Request firmware failed. Could not find file %s.\n.",
+		dev_err(&oct->pci_dev->dev, "Request firmware failed. Could not find file %s.\n",
 			fw_name);
 		release_firmware(fw);
 		return ret;
diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_ctx.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_ctx.c
index 6cec2a6a3dcc..7503aa222392 100644
--- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_ctx.c
+++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_ctx.c
@@ -146,8 +146,7 @@ netxen_get_minidump_template(struct netxen_adapter *adapter)
 	if ((cmd.rsp.cmd == NX_RCODE_SUCCESS) && (size == cmd.rsp.arg2)) {
 		memcpy(adapter->mdump.md_template, addr, size);
 	} else {
-		dev_err(&adapter->pdev->dev, "Failed to get minidump template, "
-			"err_code : %d, requested_size : %d, actual_size : %d\n ",
+		dev_err(&adapter->pdev->dev, "Failed to get minidump template, err_code : %d, requested_size : %d, actual_size : %d\n",
 			cmd.rsp.cmd, size, cmd.rsp.arg2);
 	}
 	pci_free_consistent(adapter->pdev, size, addr, md_template_addr);
@@ -180,8 +179,7 @@ netxen_setup_minidump(struct netxen_adapter *adapter)
 		if ((err == NX_RCODE_CMD_INVALID) ||
 			(err == NX_RCODE_CMD_NOT_IMPL)) {
 			dev_info(&adapter->pdev->dev,
-				"Flashed firmware version does not support minidump, "
-				"minimum version required is [ %u.%u.%u ].\n ",
+				"Flashed firmware version does not support minidump, minimum version required is [ %u.%u.%u ]\n",
 				NX_MD_SUPPORT_MAJOR, NX_MD_SUPPORT_MINOR,
 				NX_MD_SUPPORT_SUBVERSION);
 		}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 560528962658..fde20fd9942c 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -1098,7 +1098,7 @@ int qed_final_cleanup(struct qed_hwfn *p_hwfn,
 	}
 
 	DP_VERBOSE(p_hwfn, QED_MSG_IOV,
-		   "Sending final cleanup for PFVF[%d] [Command %08x\n]",
+		   "Sending final cleanup for PFVF[%d] [Command %08x]\n",
 		   id, command);
 
 	qed_wr(p_hwfn, p_ptt, XSDM_REG_OPERATION_GEN, command);
diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_main.c b/drivers/net/ethernet/qlogic/qlge/qlge_main.c
index 8293c2028002..70de062b72a1 100644
--- a/drivers/net/ethernet/qlogic/qlge/qlge_main.c
+++ b/drivers/net/ethernet/qlogic/qlge/qlge_main.c
@@ -2211,7 +2211,7 @@ static int ql_clean_outbound_rx_ring(struct rx_ring *rx_ring)
 	while (prod != rx_ring->cnsmr_idx) {
 
 		netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev,
-			     "cq_id = %d, prod = %d, cnsmr = %d.\n.",
+			     "cq_id = %d, prod = %d, cnsmr = %d\n",
 			     rx_ring->cq_id, prod, rx_ring->cnsmr_idx);
 
 		net_rsp = (struct ob_mac_iocb_rsp *)rx_ring->curr_entry;
@@ -2258,7 +2258,7 @@ static int ql_clean_inbound_rx_ring(struct rx_ring *rx_ring, int budget)
 	while (prod != rx_ring->cnsmr_idx) {
 
 		netif_printk(qdev, rx_status, KERN_DEBUG, qdev->ndev,
-			     "cq_id = %d, prod = %d, cnsmr = %d.\n.",
+			     "cq_id = %d, prod = %d, cnsmr = %d\n",
 			     rx_ring->cq_id, prod, rx_ring->cnsmr_idx);
 
 		net_rsp = rx_ring->curr_entry;
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
index f5b405c98047..b6122aad639e 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
@@ -1264,7 +1264,7 @@ static void brcmf_link_down(struct brcmf_cfg80211_vif *vif, u16 reason)
 	brcmf_dbg(TRACE, "Enter\n");
 
 	if (test_and_clear_bit(BRCMF_VIF_STATUS_CONNECTED, &vif->sme_state)) {
-		brcmf_dbg(INFO, "Call WLC_DISASSOC to stop excess roaming\n ");
+		brcmf_dbg(INFO, "Call WLC_DISASSOC to stop excess roaming\n");
 		err = brcmf_fil_cmd_data_set(vif->ifp,
 					     BRCMF_C_DISASSOC, NULL, 0);
 		if (err) {
diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2200.c b/drivers/net/wireless/intel/ipw2x00/ipw2200.c
index ba3fb1d2ddb4..f26beeb6c5ff 100644
--- a/drivers/net/wireless/intel/ipw2x00/ipw2200.c
+++ b/drivers/net/wireless/intel/ipw2x00/ipw2200.c
@@ -7557,8 +7557,7 @@ static int ipw_associate(void *data)
 	}
 
 	if (priv->status & STATUS_DISASSOCIATING) {
-		IPW_DEBUG_ASSOC("Not attempting association (in "
-				"disassociating)\n ");
+		IPW_DEBUG_ASSOC("Not attempting association (in disassociating)\n");
 		schedule_work(&priv->associate);
 		return 0;
 	}
diff --git a/drivers/net/wireless/intersil/prism54/islpci_eth.c b/drivers/net/wireless/intersil/prism54/islpci_eth.c
index 9b0ded733294..b277113b33d3 100644
--- a/drivers/net/wireless/intersil/prism54/islpci_eth.c
+++ b/drivers/net/wireless/intersil/prism54/islpci_eth.c
@@ -57,7 +57,7 @@ islpci_eth_cleanup_transmit(islpci_private *priv,
 
 #if VERBOSE > SHOW_ERROR_MESSAGES
 			DEBUG(SHOW_TRACING,
-			      "cleanup skb %p skb->data %p skb->len %u truesize %u\n ",
+			      "cleanup skb %p skb->data %p skb->len %u truesize %u\n",
 			      skb, skb->data, skb->len, skb->truesize);
 #endif
 
@@ -328,7 +328,7 @@ islpci_eth_receive(islpci_private *priv)
 
 #if VERBOSE > SHOW_ERROR_MESSAGES
 	DEBUG(SHOW_TRACING,
-	      "frq->addr %x skb->data %p skb->len %u offset %u truesize %u\n ",
+	      "frq->addr %x skb->data %p skb->len %u offset %u truesize %u\n",
 	      control_block->rx_data_low[priv->free_data_rx].address, skb->data,
 	      skb->len, offset, skb->truesize);
 #endif
@@ -436,7 +436,7 @@ islpci_eth_receive(islpci_private *priv)
 
 #if VERBOSE > SHOW_ERROR_MESSAGES
 		DEBUG(SHOW_TRACING,
-		      "new alloc skb %p skb->data %p skb->len %u index %u truesize %u\n ",
+		      "new alloc skb %p skb->data %p skb->len %u index %u truesize %u\n",
 		      skb, skb->data, skb->len, index, skb->truesize);
 #endif
 
diff --git a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192e.c b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192e.c
index 38b2ba1ac6f8..380e86f9e00b 100644
--- a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192e.c
+++ b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8192e.c
@@ -1267,8 +1267,8 @@ static void rtl8192eu_phy_iq_calibrate(struct rtl8xxxu_priv *priv)
 		reg_ecc = result[candidate][7];
 		dev_dbg(dev, "%s: candidate is %x\n", __func__, candidate);
 		dev_dbg(dev,
-			"%s: e94 =%x e9c=%x ea4=%x eac=%x eb4=%x ebc=%x ec4=%x "
-			"ecc=%x\n ", __func__, reg_e94, reg_e9c,
+			"%s: e94 =%x e9c=%x ea4=%x eac=%x eb4=%x ebc=%x ec4=%x ecc=%x\n",
+			__func__, reg_e94, reg_e9c,
 			reg_ea4, reg_eac, reg_eb4, reg_ebc, reg_ec4, reg_ecc);
 		path_a_ok = true;
 		path_b_ok = true;
diff --git a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8723b.c b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8723b.c
index c4b86a84a721..26b674aca125 100644
--- a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8723b.c
+++ b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_8723b.c
@@ -1175,8 +1175,8 @@ static void rtl8723bu_phy_iq_calibrate(struct rtl8xxxu_priv *priv)
 		reg_ecc = result[candidate][7];
 		dev_dbg(dev, "%s: candidate is %x\n", __func__, candidate);
 		dev_dbg(dev,
-			"%s: e94 =%x e9c=%x ea4=%x eac=%x eb4=%x ebc=%x ec4=%x "
-			"ecc=%x\n ", __func__, reg_e94, reg_e9c,
+			"%s: e94 =%x e9c=%x ea4=%x eac=%x eb4=%x ebc=%x ec4=%x ecc=%x\n",
+			__func__, reg_e94, reg_e9c,
 			reg_ea4, reg_eac, reg_eb4, reg_ebc, reg_ec4, reg_ecc);
 		path_a_ok = true;
 		path_b_ok = true;
diff --git a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c
index 718a73c623a7..505ab1b055ff 100644
--- a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c
+++ b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c
@@ -3406,8 +3406,8 @@ void rtl8xxxu_gen1_phy_iq_calibrate(struct rtl8xxxu_priv *priv)
 		reg_ecc = result[candidate][7];
 		dev_dbg(dev, "%s: candidate is %x\n", __func__, candidate);
 		dev_dbg(dev,
-			"%s: e94 =%x e9c=%x ea4=%x eac=%x eb4=%x ebc=%x ec4=%x "
-			"ecc=%x\n ", __func__, reg_e94, reg_e9c,
+			"%s: e94 =%x e9c=%x ea4=%x eac=%x eb4=%x ebc=%x ec4=%x ecc=%x\n",
+			__func__, reg_e94, reg_e9c,
 			reg_ea4, reg_eac, reg_eb4, reg_ebc, reg_ec4, reg_ecc);
 		path_a_ok = true;
 		path_b_ok = true;
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/dm.c b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/dm.c
index 9111ba7ff0a1..3be8c88971e2 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/dm.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/dm.c
@@ -2642,7 +2642,7 @@ static void rtl8821ae_dm_edca_choose_traffic_idx(
 		if (cur_tx_bytes > (cur_rx_bytes*4)) {
 			*pb_is_cur_rdl_state = false;
 			RT_TRACE(rtlpriv, COMP_TURBO, DBG_LOUD,
-				 "Uplink Traffic\n ");
+				 "Uplink Traffic\n");
 		} else {
 			*pb_is_cur_rdl_state = true;
 			RT_TRACE(rtlpriv, COMP_TURBO, DBG_LOUD,
-- 
cgit v1.2.3-59-g8ed1b


From 0c3a4cf84fb9ab17a4c9d3b34cfd6098887dd998 Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Tue, 29 May 2018 11:47:44 +0800
Subject: MAINTAINERS: add myself as maintainer for QorIQ PTP clock driver

Added myself as maintainer for QorIQ PTP clock driver.
Since gianfar_ptp.c was renamed to ptp_qoriq.c, let's
maintain it under QorIQ PTP clock driver.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 1831ff5863a1..e7396119ce58 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5628,7 +5628,6 @@ M:	Claudiu Manoil <claudiu.manoil@nxp.com>
 L:	netdev@vger.kernel.org
 S:	Maintained
 F:	drivers/net/ethernet/freescale/gianfar*
-X:	drivers/net/ethernet/freescale/gianfar_ptp.c
 F:	Documentation/devicetree/bindings/net/fsl-tsec-phy.txt
 
 FREESCALE GPMI NAND DRIVER
@@ -5675,6 +5674,14 @@ S:	Maintained
 F:	drivers/net/ethernet/freescale/fman
 F:	Documentation/devicetree/bindings/powerpc/fsl/fman.txt
 
+FREESCALE QORIQ PTP CLOCK DRIVER
+M:	Yangbo Lu <yangbo.lu@nxp.com>
+L:	netdev@vger.kernel.org
+S:	Maintained
+F:	drivers/ptp/ptp_qoriq.c
+F:	include/linux/fsl/ptp_qoriq.h
+F:	Documentation/devicetree/bindings/ptp/ptp-qoriq.txt
+
 FREESCALE QUAD SPI DRIVER
 M:	Han Xu <han.xu@nxp.com>
 L:	linux-mtd@lists.infradead.org
@@ -11421,7 +11428,6 @@ S:	Maintained
 W:	http://linuxptp.sourceforge.net/
 F:	Documentation/ABI/testing/sysfs-ptp
 F:	Documentation/ptp/*
-F:	drivers/net/ethernet/freescale/gianfar_ptp.c
 F:	drivers/net/phy/dp83640*
 F:	drivers/ptp/*
 F:	include/linux/ptp_cl*
-- 
cgit v1.2.3-59-g8ed1b


From 6d89265d78417174f344c4ec4c13347f178ea0da Mon Sep 17 00:00:00 2001
From: Andrey Shevchenko <ashevchenko@quantenna.com>
Date: Tue, 29 May 2018 14:59:57 +0300
Subject: qtnfmac: remove unused function declarations

Functions qtnf_cmd_resp_parse and qtnf_cmd_resp_check have
been removed. Remove their declarations as well.

Signed-off-by: Andrey Shevchenko <ashevchenko@quantenna.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/quantenna/qtnfmac/commands.h | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/drivers/net/wireless/quantenna/qtnfmac/commands.h b/drivers/net/wireless/quantenna/qtnfmac/commands.h
index 69a7d56f7e58..cf9274add26d 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/commands.h
+++ b/drivers/net/wireless/quantenna/qtnfmac/commands.h
@@ -58,11 +58,6 @@ int qtnf_cmd_send_change_sta(struct qtnf_vif *vif, const u8 *mac,
 			     struct station_parameters *params);
 int qtnf_cmd_send_del_sta(struct qtnf_vif *vif,
 			  struct station_del_parameters *params);
-
-int qtnf_cmd_resp_parse(struct qtnf_bus *bus, struct sk_buff *resp_skb);
-int qtnf_cmd_resp_check(const struct qtnf_vif *vif,
-			const struct sk_buff *resp_skb, u16 cmd_id,
-			u16 *result, const u8 **payload, size_t *payload_size);
 int qtnf_cmd_send_scan(struct qtnf_wmac *mac);
 int qtnf_cmd_send_connect(struct qtnf_vif *vif,
 			  struct cfg80211_connect_params *sme);
-- 
cgit v1.2.3-59-g8ed1b


From d62b622ca4099517878db5b7bfb06bc449513365 Mon Sep 17 00:00:00 2001
From: Sergey Matyukevich <sergey.matyukevich.os@quantenna.com>
Date: Tue, 29 May 2018 14:59:58 +0300
Subject: qtnfmac: simplify notation

Shorten line lengths using a more compact notation to access mac info.

Signed-off-by: Sergey Matyukevich <sergey.matyukevich.os@quantenna.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/quantenna/qtnfmac/cfg80211.c | 25 ++++++++++++-----------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
index 5122dc798064..bf624d975953 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
@@ -955,6 +955,7 @@ qtnf_wiphy_setup_if_comb(struct wiphy *wiphy, struct qtnf_mac_info *mac_info)
 int qtnf_wiphy_register(struct qtnf_hw_info *hw_info, struct qtnf_wmac *mac)
 {
 	struct wiphy *wiphy = priv_to_wiphy(mac);
+	struct qtnf_mac_info *macinfo = &mac->macinfo;
 	int ret;
 
 	if (!wiphy) {
@@ -962,20 +963,20 @@ int qtnf_wiphy_register(struct qtnf_hw_info *hw_info, struct qtnf_wmac *mac)
 		return -EFAULT;
 	}
 
-	wiphy->frag_threshold = mac->macinfo.frag_thr;
-	wiphy->rts_threshold = mac->macinfo.rts_thr;
-	wiphy->retry_short = mac->macinfo.sretry_limit;
-	wiphy->retry_long = mac->macinfo.lretry_limit;
-	wiphy->coverage_class = mac->macinfo.coverage_class;
+	wiphy->frag_threshold = macinfo->frag_thr;
+	wiphy->rts_threshold = macinfo->rts_thr;
+	wiphy->retry_short = macinfo->sretry_limit;
+	wiphy->retry_long = macinfo->lretry_limit;
+	wiphy->coverage_class = macinfo->coverage_class;
 
 	wiphy->max_scan_ssids = QTNF_MAX_SSID_LIST_LENGTH;
 	wiphy->max_scan_ie_len = QTNF_MAX_VSIE_LEN;
 	wiphy->mgmt_stypes = qtnf_mgmt_stypes;
 	wiphy->max_remain_on_channel_duration = 5000;
-	wiphy->max_acl_mac_addrs = mac->macinfo.max_acl_mac_addrs;
+	wiphy->max_acl_mac_addrs = macinfo->max_acl_mac_addrs;
 	wiphy->max_num_csa_counters = 2;
 
-	ret = qtnf_wiphy_setup_if_comb(wiphy, &mac->macinfo);
+	ret = qtnf_wiphy_setup_if_comb(wiphy, macinfo);
 	if (ret)
 		goto out;
 
@@ -994,12 +995,12 @@ int qtnf_wiphy_register(struct qtnf_hw_info *hw_info, struct qtnf_wmac *mac)
 	wiphy->probe_resp_offload = NL80211_PROBE_RESP_OFFLOAD_SUPPORT_WPS |
 				    NL80211_PROBE_RESP_OFFLOAD_SUPPORT_WPS2;
 
-	wiphy->available_antennas_tx = mac->macinfo.num_tx_chain;
-	wiphy->available_antennas_rx = mac->macinfo.num_rx_chain;
+	wiphy->available_antennas_tx = macinfo->num_tx_chain;
+	wiphy->available_antennas_rx = macinfo->num_rx_chain;
 
-	wiphy->max_ap_assoc_sta = mac->macinfo.max_ap_assoc_sta;
-	wiphy->ht_capa_mod_mask = &mac->macinfo.ht_cap_mod_mask;
-	wiphy->vht_capa_mod_mask = &mac->macinfo.vht_cap_mod_mask;
+	wiphy->max_ap_assoc_sta = macinfo->max_ap_assoc_sta;
+	wiphy->ht_capa_mod_mask = &macinfo->ht_cap_mod_mask;
+	wiphy->vht_capa_mod_mask = &macinfo->vht_cap_mod_mask;
 
 	ether_addr_copy(wiphy->perm_addr, mac->macaddr);
 
-- 
cgit v1.2.3-59-g8ed1b


From 36e8c538b374d07b69961cb1980f8bb39061b822 Mon Sep 17 00:00:00 2001
From: Igor Mitsyanko <igor.mitsyanko.os@quantenna.com>
Date: Tue, 29 May 2018 14:59:59 +0300
Subject: qtnfmac: decode error codes from firmware replies

Introduce a function that will map an error code reported in reply
to a firmware command, into one of standard errno codes.
Use additional error codes to improve error reporting
for MAC address changes.

Signed-off-by: Igor Mitsyanko <igor.mitsyanko.os@quantenna.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/quantenna/qtnfmac/commands.c | 26 +++++++++++++++++++++--
 drivers/net/wireless/quantenna/qtnfmac/qlink.h    |  2 ++
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/quantenna/qtnfmac/commands.c b/drivers/net/wireless/quantenna/qtnfmac/commands.c
index deca0060eb27..9dc4560be5d8 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/commands.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/commands.c
@@ -55,6 +55,28 @@ static int qtnf_cmd_check_reply_header(const struct qlink_resp *resp,
 	return 0;
 }
 
+static int qtnf_cmd_resp_result_decode(enum qlink_cmd_result qcode)
+{
+	switch (qcode) {
+	case QLINK_CMD_RESULT_OK:
+		return 0;
+	case QLINK_CMD_RESULT_INVALID:
+		return -EINVAL;
+	case QLINK_CMD_RESULT_ENOTSUPP:
+		return -ENOTSUPP;
+	case QLINK_CMD_RESULT_ENOTFOUND:
+		return -ENOENT;
+	case QLINK_CMD_RESULT_EALREADY:
+		return -EALREADY;
+	case QLINK_CMD_RESULT_EADDRINUSE:
+		return -EADDRINUSE;
+	case QLINK_CMD_RESULT_EADDRNOTAVAIL:
+		return -EADDRNOTAVAIL;
+	default:
+		return -EFAULT;
+	}
+}
+
 static int qtnf_cmd_send_with_reply(struct qtnf_bus *bus,
 				    struct sk_buff *cmd_skb,
 				    struct sk_buff **response_skb,
@@ -810,10 +832,10 @@ static int qtnf_cmd_send_add_change_intf(struct qtnf_vif *vif,
 	if (unlikely(ret))
 		goto out;
 
-	if (unlikely(res_code != QLINK_CMD_RESULT_OK)) {
+	ret = qtnf_cmd_resp_result_decode(res_code);
+	if (ret) {
 		pr_err("VIF%u.%u: CMD %d failed: %u\n", vif->mac->macid,
 		       vif->vifid, cmd_type, res_code);
-		ret = -EFAULT;
 		goto out;
 	}
 
diff --git a/drivers/net/wireless/quantenna/qtnfmac/qlink.h b/drivers/net/wireless/quantenna/qtnfmac/qlink.h
index 9ab27e158023..f85deda703fb 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/qlink.h
+++ b/drivers/net/wireless/quantenna/qtnfmac/qlink.h
@@ -674,6 +674,8 @@ enum qlink_cmd_result {
 	QLINK_CMD_RESULT_ENOTSUPP,
 	QLINK_CMD_RESULT_ENOTFOUND,
 	QLINK_CMD_RESULT_EALREADY,
+	QLINK_CMD_RESULT_EADDRINUSE,
+	QLINK_CMD_RESULT_EADDRNOTAVAIL,
 };
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 9a3beeb5b73a55d8d4ada9cf17bec24ad1f31394 Mon Sep 17 00:00:00 2001
From: Sergey Matyukevich <sergey.matyukevich.os@quantenna.com>
Date: Tue, 29 May 2018 15:00:00 +0300
Subject: qtnfmac: cleanup wdev structure between its uses

Driver uses statically allocated wdev structures for each virtual
interface. However wdev structure is not properly cleaned up between
its uses. As a result, various bugs appear when userspace tools
like hostapd were not gracefully stopped.

In particular, this commit fixes the following issue:
- start hostapd with more than 2 mBSS
- kill hostapd using SIGKILL
- start again hostapd with more than 2 mBSS
However only two mBSS entities will be started: primary
and the last BSS listed in hostapd config.

Signed-off-by: Sergey Matyukevich <sergey.matyukevich.os@quantenna.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/quantenna/qtnfmac/cfg80211.c | 8 +++-----
 drivers/net/wireless/quantenna/qtnfmac/core.c     | 1 -
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
index bf624d975953..2089cb095283 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
@@ -177,8 +177,6 @@ int qtnf_del_virtual_intf(struct wiphy *wiphy, struct wireless_dev *wdev)
 	vif->netdev->ieee80211_ptr = NULL;
 	vif->netdev = NULL;
 	vif->wdev.iftype = NL80211_IFTYPE_UNSPECIFIED;
-	eth_zero_addr(vif->mac_addr);
-	eth_zero_addr(vif->bssid);
 
 	return 0;
 }
@@ -216,10 +214,12 @@ static struct wireless_dev *qtnf_add_virtual_intf(struct wiphy *wiphy,
 		}
 
 		eth_zero_addr(vif->mac_addr);
+		eth_zero_addr(vif->bssid);
 		vif->bss_priority = QTNF_DEF_BSS_PRIORITY;
+		vif->sta_state = QTNF_STA_DISCONNECTED;
+		memset(&vif->wdev, 0, sizeof(vif->wdev));
 		vif->wdev.wiphy = wiphy;
 		vif->wdev.iftype = type;
-		vif->sta_state = QTNF_STA_DISCONNECTED;
 		break;
 	default:
 		pr_err("MAC%u: unsupported IF type %d\n", mac->macid, type);
@@ -255,8 +255,6 @@ err_mac:
 	qtnf_cmd_send_del_intf(vif);
 err_cmd:
 	vif->wdev.iftype = NL80211_IFTYPE_UNSPECIFIED;
-	eth_zero_addr(vif->mac_addr);
-	eth_zero_addr(vif->bssid);
 
 	return ERR_PTR(-EFAULT);
 }
diff --git a/drivers/net/wireless/quantenna/qtnfmac/core.c b/drivers/net/wireless/quantenna/qtnfmac/core.c
index b3bfb4faa918..3ccbc427cf56 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/core.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/core.c
@@ -394,7 +394,6 @@ int qtnf_core_net_attach(struct qtnf_wmac *mac, struct qtnf_vif *vif,
 	dev = alloc_netdev_mqs(sizeof(struct qtnf_vif *), name,
 			       name_assign_type, ether_setup, 1, 1);
 	if (!dev) {
-		memset(&vif->wdev, 0, sizeof(vif->wdev));
 		vif->wdev.iftype = NL80211_IFTYPE_UNSPECIFIED;
 		return -ENOMEM;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 9e33e7fb47728cb53dbba09415d95d3ce0ea1ce4 Mon Sep 17 00:00:00 2001
From: Dmitry Lebed <dlebed@quantenna.com>
Date: Tue, 29 May 2018 15:00:01 +0300
Subject: qtnfmac: improve control path timeout handling

Control path will not be operational after firmware failure. Change bus
state to QTNF_FW_STATE_EP_DEAD after the control path timeout.
Don't wait for timeout if control path is already dead.

Signed-off-by: Dmitry Lebed <dlebed@quantenna.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/quantenna/qtnfmac/bus.h        |  3 ++-
 drivers/net/wireless/quantenna/qtnfmac/core.c       |  2 +-
 drivers/net/wireless/quantenna/qtnfmac/pearl/pcie.c | 15 ++++++++++++---
 3 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/drivers/net/wireless/quantenna/qtnfmac/bus.h b/drivers/net/wireless/quantenna/qtnfmac/bus.h
index 0a1604683bab..323e47cea1e2 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/bus.h
+++ b/drivers/net/wireless/quantenna/qtnfmac/bus.h
@@ -27,7 +27,8 @@ enum qtnf_fw_state {
 	QTNF_FW_STATE_FW_DNLD_DONE,
 	QTNF_FW_STATE_BOOT_DONE,
 	QTNF_FW_STATE_ACTIVE,
-	QTNF_FW_STATE_DEAD,
+	QTNF_FW_STATE_DETACHED,
+	QTNF_FW_STATE_EP_DEAD,
 };
 
 struct qtnf_bus;
diff --git a/drivers/net/wireless/quantenna/qtnfmac/core.c b/drivers/net/wireless/quantenna/qtnfmac/core.c
index 3ccbc427cf56..a6a450984f9a 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/core.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/core.c
@@ -628,7 +628,7 @@ void qtnf_core_detach(struct qtnf_bus *bus)
 	if (bus->fw_state == QTNF_FW_STATE_ACTIVE)
 		qtnf_cmd_send_deinit_fw(bus);
 
-	bus->fw_state = QTNF_FW_STATE_DEAD;
+	bus->fw_state = QTNF_FW_STATE_DETACHED;
 
 	if (bus->workqueue) {
 		flush_workqueue(bus->workqueue);
diff --git a/drivers/net/wireless/quantenna/qtnfmac/pearl/pcie.c b/drivers/net/wireless/quantenna/qtnfmac/pearl/pcie.c
index 6c1e139bb8f7..3120d49df565 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/pearl/pcie.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/pearl/pcie.c
@@ -751,8 +751,16 @@ tx_done:
 static int qtnf_pcie_control_tx(struct qtnf_bus *bus, struct sk_buff *skb)
 {
 	struct qtnf_pcie_bus_priv *priv = (void *)get_bus_priv(bus);
+	int ret;
+
+	ret = qtnf_shm_ipc_send(&priv->shm_ipc_ep_in, skb->data, skb->len);
 
-	return qtnf_shm_ipc_send(&priv->shm_ipc_ep_in, skb->data, skb->len);
+	if (ret == -ETIMEDOUT) {
+		pr_err("EP firmware is dead\n");
+		bus->fw_state = QTNF_FW_STATE_EP_DEAD;
+	}
+
+	return ret;
 }
 
 static irqreturn_t qtnf_interrupt(int irq, void *data)
@@ -1238,7 +1246,7 @@ static void qtnf_fw_work_handler(struct work_struct *work)
 	goto fw_load_exit;
 
 fw_load_fail:
-	bus->fw_state = QTNF_FW_STATE_DEAD;
+	bus->fw_state = QTNF_FW_STATE_DETACHED;
 
 fw_load_exit:
 	complete(&bus->firmware_init_complete);
@@ -1408,7 +1416,8 @@ static void qtnf_pcie_remove(struct pci_dev *pdev)
 
 	wait_for_completion(&bus->firmware_init_complete);
 
-	if (bus->fw_state == QTNF_FW_STATE_ACTIVE)
+	if (bus->fw_state == QTNF_FW_STATE_ACTIVE ||
+	    bus->fw_state == QTNF_FW_STATE_EP_DEAD)
 		qtnf_core_detach(bus);
 
 	priv = get_bus_priv(bus);
-- 
cgit v1.2.3-59-g8ed1b


From b60769e2dff0b168dfe3f03fe0d1950963c3c3da Mon Sep 17 00:00:00 2001
From: Dmitry Lebed <dlebed@quantenna.com>
Date: Tue, 29 May 2018 15:00:02 +0300
Subject: qtnfmac: fix firmware command error path

Free command skb if bus state is not QTNF_FW_STATE_ACTIVE.

Signed-off-by: Dmitry Lebed <dlebed@quantenna.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/quantenna/qtnfmac/commands.c | 1 +
 drivers/net/wireless/quantenna/qtnfmac/trans.c    | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/quantenna/qtnfmac/commands.c b/drivers/net/wireless/quantenna/qtnfmac/commands.c
index 9dc4560be5d8..e2fc57be1cdd 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/commands.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/commands.c
@@ -102,6 +102,7 @@ static int qtnf_cmd_send_with_reply(struct qtnf_bus *bus,
 		pr_warn("VIF%u.%u: drop cmd 0x%.4X in fw state %d\n",
 			mac_id, vif_id, le16_to_cpu(cmd->cmd_id),
 			bus->fw_state);
+		dev_kfree_skb(cmd_skb);
 		return -ENODEV;
 	}
 
diff --git a/drivers/net/wireless/quantenna/qtnfmac/trans.c b/drivers/net/wireless/quantenna/qtnfmac/trans.c
index ccddfebc508a..345f34ec9750 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/trans.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/trans.c
@@ -35,8 +35,10 @@ int qtnf_trans_send_cmd_with_resp(struct qtnf_bus *bus, struct sk_buff *cmd_skb,
 	bool resp_not_handled = true;
 	struct sk_buff *resp_skb = NULL;
 
-	if (unlikely(!response_skb))
+	if (unlikely(!response_skb)) {
+		dev_kfree_skb(cmd_skb);
 		return -EFAULT;
+	}
 
 	spin_lock(&ctl_node->resp_lock);
 	ctl_node->seq_num++;
-- 
cgit v1.2.3-59-g8ed1b


From f5d2ff43b9ed9f56e7ff67640aa593fcb8fa56c1 Mon Sep 17 00:00:00 2001
From: Andrey Shevchenko <ashevchenko@quantenna.com>
Date: Tue, 29 May 2018 15:00:03 +0300
Subject: qtnfmac: fix bg_scan_period parameter processing

Do not process bg_scan_period parameter in qtnfmac driver.
Pass correct values as is. In the case of invalid values
pass default value. Leave further processing to firmware.

Signed-off-by: Andrey Shevchenko <ashevchenko@quantenna.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/quantenna/qtnfmac/commands.c | 8 +++-----
 drivers/net/wireless/quantenna/qtnfmac/core.h     | 2 --
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/net/wireless/quantenna/qtnfmac/commands.c b/drivers/net/wireless/quantenna/qtnfmac/commands.c
index e2fc57be1cdd..5eb143667539 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/commands.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/commands.c
@@ -2339,13 +2339,11 @@ int qtnf_cmd_send_connect(struct qtnf_vif *vif,
 	else
 		eth_zero_addr(cmd->prev_bssid);
 
-	if ((sme->bg_scan_period > 0) &&
-	    (sme->bg_scan_period <= QTNF_MAX_BG_SCAN_PERIOD))
+	if ((sme->bg_scan_period >= 0) &&
+	    (sme->bg_scan_period <= SHRT_MAX))
 		cmd->bg_scan_period = cpu_to_le16(sme->bg_scan_period);
-	else if (sme->bg_scan_period == -1)
-		cmd->bg_scan_period = cpu_to_le16(QTNF_DEFAULT_BG_SCAN_PERIOD);
 	else
-		cmd->bg_scan_period = 0; /* disabled */
+		cmd->bg_scan_period = cpu_to_le16(-1); /* use default value */
 
 	if (sme->flags & ASSOC_REQ_DISABLE_HT)
 		connect_flags |= QLINK_STA_CONNECT_DISABLE_HT;
diff --git a/drivers/net/wireless/quantenna/qtnfmac/core.h b/drivers/net/wireless/quantenna/qtnfmac/core.h
index 3b884c80b6ab..214435448335 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/core.h
+++ b/drivers/net/wireless/quantenna/qtnfmac/core.h
@@ -44,8 +44,6 @@
 #define QTNF_MAX_VSIE_LEN		255
 #define QTNF_MAX_INTF			8
 #define QTNF_MAX_EVENT_QUEUE_LEN	255
-#define QTNF_DEFAULT_BG_SCAN_PERIOD	300
-#define QTNF_MAX_BG_SCAN_PERIOD		0xffff
 #define QTNF_SCAN_TIMEOUT_SEC		15
 
 #define QTNF_DEF_BSS_PRIORITY		0
-- 
cgit v1.2.3-59-g8ed1b


From 40d68dbb986d2116c15e70f63c419a5481d7b91c Mon Sep 17 00:00:00 2001
From: Andrey Shevchenko <ashevchenko@quantenna.com>
Date: Tue, 29 May 2018 15:00:04 +0300
Subject: qtnfmac: cancel scan on disconnect

Cancel scan operation on STA disconnect.

Signed-off-by: Andrey Shevchenko <ashevchenko@quantenna.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/quantenna/qtnfmac/cfg80211.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
index 2089cb095283..1fcd94bf7c59 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
@@ -660,6 +660,8 @@ qtnf_disconnect(struct wiphy *wiphy, struct net_device *dev,
 	if (vif->wdev.iftype != NL80211_IFTYPE_STATION)
 		return -EOPNOTSUPP;
 
+	qtnf_scan_done(mac, true);
+
 	if (vif->sta_state == QTNF_STA_DISCONNECTED)
 		return 0;
 
-- 
cgit v1.2.3-59-g8ed1b


From 480daa9cb62c14bbd1b87a01cd9bc10cc56dbf32 Mon Sep 17 00:00:00 2001
From: Sergey Matyukevich <sergey.matyukevich.os@quantenna.com>
Date: Tue, 29 May 2018 15:00:05 +0300
Subject: qtnfmac: fix invalid STA state on EAPOL failure

Driver switches vif sta_state into QTNF_STA_CONNECTING when cfg80211
core initiates connect procedure. Further this state is changed either
to QTNF_STA_CONNECTED or to QTNF_STA_DISCONNECTED by BSS_JOIN and
BSS_LEAVE events from firmware. However it is possible that no such
events will be sent by firmware, e.g. if EAPOL timed out.

In this case vif sta_mode will remain in QTNF_STA_CONNECTING state and
all subsequent connection attempts will fail with -EBUSY error code.
Fix this by perfroming STA state transition from QTNF_STA_CONNECTING
to QTNF_STA_DISCONNECTED in cfg80211 disconnect callback.
No need to rely upon firmware events in this case.

Signed-off-by: Sergey Matyukevich <sergey.matyukevich.os@quantenna.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/quantenna/qtnfmac/cfg80211.c | 21 ++++++++++++++-------
 drivers/net/wireless/quantenna/qtnfmac/event.c    |  8 +++-----
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
index 1fcd94bf7c59..220e2b710208 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
@@ -649,30 +649,37 @@ qtnf_disconnect(struct wiphy *wiphy, struct net_device *dev,
 {
 	struct qtnf_wmac *mac = wiphy_priv(wiphy);
 	struct qtnf_vif *vif;
-	int ret;
+	int ret = 0;
 
 	vif = qtnf_mac_get_base_vif(mac);
 	if (!vif) {
 		pr_err("MAC%u: primary VIF is not configured\n", mac->macid);
-		return -EFAULT;
+		ret = -EFAULT;
+		goto out;
 	}
 
-	if (vif->wdev.iftype != NL80211_IFTYPE_STATION)
-		return -EOPNOTSUPP;
+	if (vif->wdev.iftype != NL80211_IFTYPE_STATION) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
 
 	qtnf_scan_done(mac, true);
 
 	if (vif->sta_state == QTNF_STA_DISCONNECTED)
-		return 0;
+		goto out;
 
 	ret = qtnf_cmd_send_disconnect(vif, reason_code);
 	if (ret) {
 		pr_err("VIF%u.%u: failed to disconnect\n", mac->macid,
 		       vif->vifid);
-		return ret;
+		goto out;
 	}
 
-	return 0;
+out:
+	if (vif->sta_state == QTNF_STA_CONNECTING)
+		vif->sta_state = QTNF_STA_DISCONNECTED;
+
+	return ret;
 }
 
 static int
diff --git a/drivers/net/wireless/quantenna/qtnfmac/event.c b/drivers/net/wireless/quantenna/qtnfmac/event.c
index 16617c44f81b..68da81bec4e9 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/event.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/event.c
@@ -211,11 +211,9 @@ qtnf_event_handle_bss_leave(struct qtnf_vif *vif,
 		return -EPROTO;
 	}
 
-	if (vif->sta_state != QTNF_STA_CONNECTED) {
-		pr_err("VIF%u.%u: BSS_LEAVE event when STA is not connected\n",
-		       vif->mac->macid, vif->vifid);
-		return -EPROTO;
-	}
+	if (vif->sta_state != QTNF_STA_CONNECTED)
+		pr_warn("VIF%u.%u: BSS_LEAVE event when STA is not connected\n",
+			vif->mac->macid, vif->vifid);
 
 	pr_debug("VIF%u.%u: disconnected\n", vif->mac->macid, vif->vifid);
 
-- 
cgit v1.2.3-59-g8ed1b


From 71b2c87df3ac37f5f83e166db136b0c1d065a781 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Wed, 30 May 2018 16:09:16 +0100
Subject: bpf: devmap: remove redundant assignment of dev = dev

The assignment dev = dev is redundant and should be removed.

Detected by CoverityScan, CID#1469486 ("Evaluation order violation")

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/devmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index ae16d0c373ef..1fe3fe60508a 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -352,7 +352,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
 static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
-	struct net_device *dev = dev = obj ? obj->dev : NULL;
+	struct net_device *dev = obj ? obj->dev : NULL;
 
 	return dev ? &dev->ifindex : NULL;
 }
-- 
cgit v1.2.3-59-g8ed1b


From bcece5dc40b9e00d0a9a91fc0acc9a825c23362b Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 30 May 2018 12:24:17 -0700
Subject: bpf: Change bpf_fib_lookup to return -EAFNOSUPPORT for unsupported
 address families

Update bpf_fib_lookup to return -EAFNOSUPPORT for unsupported address
families. Allows userspace to probe for support as more are added
(e.g., AF_MPLS).

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 81bd2e9fe8fc..885fb0e2f264 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4285,7 +4285,7 @@ BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
 					   flags, true);
 #endif
 	}
-	return 0;
+	return -EAFNOSUPPORT;
 }
 
 static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
@@ -4302,7 +4302,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
 	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
 {
 	struct net *net = dev_net(skb->dev);
-	int index = 0;
+	int index = -EAFNOSUPPORT;
 
 	if (plen < sizeof(*params))
 		return -EINVAL;
-- 
cgit v1.2.3-59-g8ed1b


From 3d97d88e8091f3501e016f6b4ce45a32c4b8f2f6 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Tue, 29 May 2018 23:27:31 +0800
Subject: tcp: minor optimization around tcp_hdr() usage in receive path

This is additional to the
commit ea1627c20c34 ("tcp: minor optimizations around tcp_hdr() usage").
At this point, skb->data is same with tcp_hdr() as tcp header has not
been pulled yet. So use the less expensive one to get the tcp header.

Remove the third parameter of tcp_rcv_established() and put it into
the function body.

Furthermore, the local variables are listed as a reverse christmas tree :)

Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h          | 3 +--
 include/trace/events/tcp.h | 5 +++--
 net/ipv4/tcp_input.c       | 6 +++---
 net/ipv4/tcp_ipv4.c        | 2 +-
 net/ipv6/tcp_ipv6.c        | 2 +-
 5 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 952d842a604a..029a51b91742 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -334,8 +334,7 @@ void tcp_write_timer_handler(struct sock *sk);
 void tcp_delack_timer_handler(struct sock *sk);
 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
-void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
-			 const struct tcphdr *th);
+void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
 void tcp_rcv_space_adjust(struct sock *sk);
 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
 void tcp_twsk_destructor(struct sock *sk);
diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 703abb6e11fa..ac55b328d61b 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -248,8 +248,9 @@ TRACE_EVENT(tcp_probe,
 	),
 
 	TP_fast_assign(
-		const struct tcp_sock *tp = tcp_sk(sk);
+		const struct tcphdr *th = (const struct tcphdr *)skb->data;
 		const struct inet_sock *inet = inet_sk(sk);
+		const struct tcp_sock *tp = tcp_sk(sk);
 
 		memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
 		memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
@@ -261,7 +262,7 @@ TRACE_EVENT(tcp_probe,
 		__entry->dport = ntohs(inet->inet_dport);
 		__entry->mark = skb->mark;
 
-		__entry->data_len = skb->len - tcp_hdrlen(skb);
+		__entry->data_len = skb->len - __tcp_hdrlen(th);
 		__entry->snd_nxt = tp->snd_nxt;
 		__entry->snd_una = tp->snd_una;
 		__entry->snd_cwnd = tp->snd_cwnd;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1191cac72109..d5ffb573ca4d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5390,11 +5390,11 @@ discard:
  *	the rest is checked inline. Fast processing is turned on in
  *	tcp_data_queue when everything is OK.
  */
-void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
-			 const struct tcphdr *th)
+void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
 {
-	unsigned int len = skb->len;
+	const struct tcphdr *th = (const struct tcphdr *)skb->data;
 	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned int len = skb->len;
 
 	/* TCP congestion window tracking */
 	trace_tcp_probe(sk, skb);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index adbdb503db0c..749b0ef9f405 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1486,7 +1486,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 				sk->sk_rx_dst = NULL;
 			}
 		}
-		tcp_rcv_established(sk, skb, tcp_hdr(skb));
+		tcp_rcv_established(sk, skb);
 		return 0;
 	}
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 7d47c2b550a9..8764a63abd91 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1322,7 +1322,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 			}
 		}
 
-		tcp_rcv_established(sk, skb, tcp_hdr(skb));
+		tcp_rcv_established(sk, skb);
 		if (opt_skb)
 			goto ipv6_pktoptions;
 		return 0;
-- 
cgit v1.2.3-59-g8ed1b


From 4341f8308d53f95a4a1ab54f45fa9af2c30ff596 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Tue, 29 May 2018 10:03:21 -0700
Subject: net: remove bypassed check in sch_direct_xmit()

Checking netif_xmit_frozen_or_stopped() at the end of sch_direct_xmit()
is being bypassed. This is because "ret" from sch_direct_xmit() will be
either NETDEV_TX_OK or NETDEV_TX_BUSY, and only ret == NETDEV_TX_OK == 0
will reach the condition:

    if (ret && netif_xmit_frozen_or_stopped(txq))
        return false;

This patch cleans up the code by removing the whole condition.

For more discussion about this, please refer to
   https://marc.info/?t=152727195700008

Signed-off-by: Song Liu <songliubraving@fb.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_generic.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 760ab1b09f8b..69078c82963e 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -346,9 +346,6 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 		return false;
 	}
 
-	if (ret && netif_xmit_frozen_or_stopped(txq))
-		return false;
-
 	return true;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 32d26a685c1802a0e485bd674e7dd038e88019f7 Mon Sep 17 00:00:00 2001
From: Sudarsana Reddy Kalluru <sudarsana.kalluru@cavium.com>
Date: Tue, 29 May 2018 02:31:24 -0700
Subject: qed*: Add link change count value to ethtool statistics display.

This patch adds driver changes for capturing the link change count in
ethtool statistics display.

Please consider applying this to "net-next".

Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_l2.c        | 12 ++++++++++--
 drivers/net/ethernet/qlogic/qede/qede.h         |  1 +
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c |  2 ++
 drivers/net/ethernet/qlogic/qede/qede_main.c    |  1 +
 include/linux/qed/qed_if.h                      |  1 +
 5 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 1c0d0c217936..eed4725761f5 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -1854,6 +1854,11 @@ static void __qed_get_vport_port_stats(struct qed_hwfn *p_hwfn,
 		p_ah->tx_1519_to_max_byte_packets =
 		    port_stats.eth.u1.ah1.t1519_to_max;
 	}
+
+	p_common->link_change_count = qed_rd(p_hwfn, p_ptt,
+					     p_hwfn->mcp_info->port_addr +
+					     offsetof(struct public_port,
+						      link_change_count));
 }
 
 static void __qed_get_vport_stats(struct qed_hwfn *p_hwfn,
@@ -1961,11 +1966,14 @@ void qed_reset_vport_stats(struct qed_dev *cdev)
 
 	/* PORT statistics are not necessarily reset, so we need to
 	 * read and create a baseline for future statistics.
+	 * Link change stat is maintained by MFW, return its value as is.
 	 */
-	if (!cdev->reset_stats)
+	if (!cdev->reset_stats) {
 		DP_INFO(cdev, "Reset stats not allocated\n");
-	else
+	} else {
 		_qed_get_vport_stats(cdev, cdev->reset_stats);
+		cdev->reset_stats->common.link_change_count = 0;
+	}
 }
 
 static enum gft_profile_type
diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index 81c5c8dfa2ef..d7ed0d3dbf71 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -88,6 +88,7 @@ struct qede_stats_common {
 	u64 coalesced_aborts_num;
 	u64 non_coalesced_pkts;
 	u64 coalesced_bytes;
+	u64 link_change_count;
 
 	/* port */
 	u64 rx_64_byte_packets;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index 6906e04b609e..f4a0f8ff8261 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -171,6 +171,8 @@ static const struct {
 	QEDE_STAT(coalesced_aborts_num),
 	QEDE_STAT(non_coalesced_pkts),
 	QEDE_STAT(coalesced_bytes),
+
+	QEDE_STAT(link_change_count),
 };
 
 #define QEDE_NUM_STATS	ARRAY_SIZE(qede_stats_arr)
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index d118771e1a7b..6a796040a32c 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -399,6 +399,7 @@ void qede_fill_by_demand_stats(struct qede_dev *edev)
 	p_common->brb_truncates = stats.common.brb_truncates;
 	p_common->brb_discards = stats.common.brb_discards;
 	p_common->tx_mac_ctrl_frames = stats.common.tx_mac_ctrl_frames;
+	p_common->link_change_count = stats.common.link_change_count;
 
 	if (QEDE_IS_BB(edev)) {
 		struct qede_stats_bb *p_bb = &edev->stats.bb;
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index ac991a3b8f03..b4040023cbfb 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -1180,6 +1180,7 @@ struct qed_eth_stats_common {
 	u64	tx_mac_mc_packets;
 	u64	tx_mac_bc_packets;
 	u64	tx_mac_ctrl_frames;
+	u64	link_change_count;
 };
 
 struct qed_eth_stats_bb {
-- 
cgit v1.2.3-59-g8ed1b


From d66e4348969de33cccdd2bc3920b49fb0114b306 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Wed, 30 May 2018 02:56:03 +0200
Subject: net: bridge: Extract boilerplate around switchdev_port_obj_*()

A call to switchdev_port_obj_add() or switchdev_port_obj_del() involves
initializing a struct switchdev_obj_port_vlan, a piece of code that
repeats on each call site almost verbatim. While in the current codebase
there is just one duplicated add call, the follow-up patches add more of
both add and del calls.

Thus to remove the duplication, extract the repetition into named
functions and reuse.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_private.h   | 13 +++++++++++++
 net/bridge/br_switchdev.c | 25 +++++++++++++++++++++++++
 net/bridge/br_vlan.c      | 26 +++-----------------------
 3 files changed, 41 insertions(+), 23 deletions(-)

diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 11520ed528b0..5216a524b537 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -1139,6 +1139,8 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p,
 			       unsigned long mask);
 void br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb,
 			     int type);
+int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags);
+int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid);
 
 static inline void br_switchdev_frame_unmark(struct sk_buff *skb)
 {
@@ -1168,6 +1170,17 @@ static inline int br_switchdev_set_port_flag(struct net_bridge_port *p,
 	return 0;
 }
 
+static inline int br_switchdev_port_vlan_add(struct net_device *dev,
+					     u16 vid, u16 flags)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline void
 br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
 {
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index 35474d49555d..d77f807420c4 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -136,3 +136,28 @@ br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
 		break;
 	}
 }
+
+int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags)
+{
+	struct switchdev_obj_port_vlan v = {
+		.obj.orig_dev = dev,
+		.obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
+		.flags = flags,
+		.vid_begin = vid,
+		.vid_end = vid,
+	};
+
+	return switchdev_port_obj_add(dev, &v.obj);
+}
+
+int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid)
+{
+	struct switchdev_obj_port_vlan v = {
+		.obj.orig_dev = dev,
+		.obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
+		.vid_begin = vid,
+		.vid_end = vid,
+	};
+
+	return switchdev_port_obj_del(dev, &v.obj);
+}
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index dc832c0934c6..e3b14c0b2060 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -82,19 +82,12 @@ static bool __vlan_add_flags(struct net_bridge_vlan *v, u16 flags)
 static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br,
 			  u16 vid, u16 flags)
 {
-	struct switchdev_obj_port_vlan v = {
-		.obj.orig_dev = dev,
-		.obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
-		.flags = flags,
-		.vid_begin = vid,
-		.vid_end = vid,
-	};
 	int err;
 
 	/* Try switchdev op first. In case it is not supported, fallback to
 	 * 8021q add.
 	 */
-	err = switchdev_port_obj_add(dev, &v.obj);
+	err = br_switchdev_port_vlan_add(dev, vid, flags);
 	if (err == -EOPNOTSUPP)
 		return vlan_vid_add(dev, br->vlan_proto, vid);
 	return err;
@@ -130,18 +123,12 @@ static void __vlan_del_list(struct net_bridge_vlan *v)
 static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br,
 			  u16 vid)
 {
-	struct switchdev_obj_port_vlan v = {
-		.obj.orig_dev = dev,
-		.obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
-		.vid_begin = vid,
-		.vid_end = vid,
-	};
 	int err;
 
 	/* Try switchdev op first. In case it is not supported, fallback to
 	 * 8021q del.
 	 */
-	err = switchdev_port_obj_del(dev, &v.obj);
+	err = br_switchdev_port_vlan_del(dev, vid);
 	if (err == -EOPNOTSUPP) {
 		vlan_vid_del(dev, br->vlan_proto, vid);
 		return 0;
@@ -1053,13 +1040,6 @@ err_vlan_enabled:
 int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags,
 		 bool *changed)
 {
-	struct switchdev_obj_port_vlan v = {
-		.obj.orig_dev = port->dev,
-		.obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
-		.flags = flags,
-		.vid_begin = vid,
-		.vid_end = vid,
-	};
 	struct net_bridge_vlan *vlan;
 	int ret;
 
@@ -1069,7 +1049,7 @@ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags,
 	vlan = br_vlan_find(nbp_vlan_group(port), vid);
 	if (vlan) {
 		/* Pass the flags to the hardware bridge */
-		ret = switchdev_port_obj_add(port->dev, &v.obj);
+		ret = br_switchdev_port_vlan_add(port->dev, vid, flags);
 		if (ret && ret != -EOPNOTSUPP)
 			return ret;
 		*changed = __vlan_add_flags(vlan, flags);
-- 
cgit v1.2.3-59-g8ed1b


From dbd6dc752c8a6de428eacad47d4fd37e52f98183 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Wed, 30 May 2018 02:56:55 +0200
Subject: net: bridge: Extract br_vlan_add_existing()

Extract the code that deals with adding a preexisting VLAN to bridge CPU
port to a separate function. A follow-up patch introduces a need to roll
back operations in this block due to an error, and this split will make
the error-handling code clearer.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_vlan.c | 55 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 33 insertions(+), 22 deletions(-)

diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index e3b14c0b2060..602c869c9821 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -551,6 +551,37 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid)
 	return false;
 }
 
+static int br_vlan_add_existing(struct net_bridge *br,
+				struct net_bridge_vlan_group *vg,
+				struct net_bridge_vlan *vlan,
+				u16 flags, bool *changed)
+{
+	int err;
+
+	if (!br_vlan_is_brentry(vlan)) {
+		/* Trying to change flags of non-existent bridge vlan */
+		if (!(flags & BRIDGE_VLAN_INFO_BRENTRY))
+			return -EINVAL;
+		/* It was only kept for port vlans, now make it real */
+		err = br_fdb_insert(br, NULL, br->dev->dev_addr,
+				    vlan->vid);
+		if (err) {
+			br_err(br, "failed to insert local address into bridge forwarding table\n");
+			return err;
+		}
+
+		refcount_inc(&vlan->refcnt);
+		vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY;
+		vg->num_vlans++;
+		*changed = true;
+	}
+
+	if (__vlan_add_flags(vlan, flags))
+		*changed = true;
+
+	return 0;
+}
+
 /* Must be protected by RTNL.
  * Must be called with vid in range from 1 to 4094 inclusive.
  * changed must be true only if the vlan was created or updated
@@ -566,28 +597,8 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed)
 	*changed = false;
 	vg = br_vlan_group(br);
 	vlan = br_vlan_find(vg, vid);
-	if (vlan) {
-		if (!br_vlan_is_brentry(vlan)) {
-			/* Trying to change flags of non-existent bridge vlan */
-			if (!(flags & BRIDGE_VLAN_INFO_BRENTRY))
-				return -EINVAL;
-			/* It was only kept for port vlans, now make it real */
-			ret = br_fdb_insert(br, NULL, br->dev->dev_addr,
-					    vlan->vid);
-			if (ret) {
-				br_err(br, "failed insert local address into bridge forwarding table\n");
-				return ret;
-			}
-			refcount_inc(&vlan->refcnt);
-			vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY;
-			vg->num_vlans++;
-			*changed = true;
-		}
-		if (__vlan_add_flags(vlan, flags))
-			*changed = true;
-
-		return 0;
-	}
+	if (vlan)
+		return br_vlan_add_existing(br, vg, vlan, flags, changed);
 
 	vlan = kzalloc(sizeof(*vlan), GFP_KERNEL);
 	if (!vlan)
-- 
cgit v1.2.3-59-g8ed1b


From ea4721751977e14bf2ba5bf6c764afb4c21354dd Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Wed, 30 May 2018 02:57:46 +0200
Subject: mlxsw: spectrum_switchdev: Ignore bridge VLAN events

A follow-up patch enables emitting VLAN notifications for the bridge CPU
port in addition to the existing slave port notifications. These
notifications have orig_dev set to the bridge in question.

Because there's no specific support for these VLANs, just ignore the
notifications to maintain the current behavior.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index 8c9cf8ee9398..cbc8fabc90c5 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -1144,6 +1144,9 @@ static int mlxsw_sp_port_vlans_add(struct mlxsw_sp_port *mlxsw_sp_port,
 	struct mlxsw_sp_bridge_port *bridge_port;
 	u16 vid;
 
+	if (netif_is_bridge_master(orig_dev))
+		return -EOPNOTSUPP;
+
 	if (switchdev_trans_ph_prepare(trans))
 		return 0;
 
@@ -1741,6 +1744,9 @@ static int mlxsw_sp_port_vlans_del(struct mlxsw_sp_port *mlxsw_sp_port,
 	struct mlxsw_sp_bridge_port *bridge_port;
 	u16 vid;
 
+	if (netif_is_bridge_master(orig_dev))
+		return -EOPNOTSUPP;
+
 	bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp->bridge, orig_dev);
 	if (WARN_ON(!bridge_port))
 		return -EINVAL;
-- 
cgit v1.2.3-59-g8ed1b


From 2855118fdafa753d284b80abd7835a530b5a6fcc Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Wed, 30 May 2018 02:58:36 +0200
Subject: rocker: rocker_main: Ignore bridge VLAN events

A follow-up patch enables emitting VLAN notifications for the bridge CPU
port in addition to the existing slave port notifications. These
notifications have orig_dev set to the bridge in question.

Because there's no specific support for these VLANs, just ignore the
notifications to maintain the current behavior.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/rocker/rocker_main.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c
index e73e4febeedb..aeafdb9ac015 100644
--- a/drivers/net/ethernet/rocker/rocker_main.c
+++ b/drivers/net/ethernet/rocker/rocker_main.c
@@ -1632,6 +1632,9 @@ rocker_world_port_obj_vlan_add(struct rocker_port *rocker_port,
 {
 	struct rocker_world_ops *wops = rocker_port->rocker->wops;
 
+	if (netif_is_bridge_master(vlan->obj.orig_dev))
+		return -EOPNOTSUPP;
+
 	if (!wops->port_obj_vlan_add)
 		return -EOPNOTSUPP;
 
@@ -1647,6 +1650,9 @@ rocker_world_port_obj_vlan_del(struct rocker_port *rocker_port,
 {
 	struct rocker_world_ops *wops = rocker_port->rocker->wops;
 
+	if (netif_is_bridge_master(vlan->obj.orig_dev))
+		return -EOPNOTSUPP;
+
 	if (!wops->port_obj_vlan_del)
 		return -EOPNOTSUPP;
 	return wops->port_obj_vlan_del(rocker_port, vlan);
-- 
cgit v1.2.3-59-g8ed1b


From da0efa8888ee268ff7db74389a5e4fc32ade457a Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Wed, 30 May 2018 02:59:26 +0200
Subject: dsa: port: Ignore bridge VLAN events

A follow-up patch enables emitting VLAN notifications for the bridge CPU
port in addition to the existing slave port notifications. These
notifications have orig_dev set to the bridge in question.

Because there's no specific support for these VLANs, just ignore the
notifications to maintain the current behavior.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/port.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/dsa/port.c b/net/dsa/port.c
index 2413beb995be..ed0595459df1 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -252,6 +252,9 @@ int dsa_port_vlan_add(struct dsa_port *dp,
 		.vlan = vlan,
 	};
 
+	if (netif_is_bridge_master(vlan->obj.orig_dev))
+		return -EOPNOTSUPP;
+
 	if (br_vlan_enabled(dp->bridge_dev))
 		return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info);
 
@@ -267,6 +270,9 @@ int dsa_port_vlan_del(struct dsa_port *dp,
 		.vlan = vlan,
 	};
 
+	if (netif_is_bridge_master(vlan->obj.orig_dev))
+		return -EOPNOTSUPP;
+
 	if (br_vlan_enabled(dp->bridge_dev))
 		return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info);
 
-- 
cgit v1.2.3-59-g8ed1b


From a73bceb86df8f1e007b7f7691d7e38bb29601943 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Wed, 30 May 2018 03:00:07 +0200
Subject: staging: fsl-dpaa2: ethsw: Ignore bridge VLAN events

A follow-up patch enables emitting VLAN notifications for the bridge CPU
port in addition to the existing slave port notifications. These
notifications have orig_dev set to the bridge in question.

Because there's no specific support for these VLANs, just ignore the
notifications to maintain the current behavior.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/staging/fsl-dpaa2/ethsw/ethsw.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/staging/fsl-dpaa2/ethsw/ethsw.c b/drivers/staging/fsl-dpaa2/ethsw/ethsw.c
index c723a04bc3d6..a17dd2972ccd 100644
--- a/drivers/staging/fsl-dpaa2/ethsw/ethsw.c
+++ b/drivers/staging/fsl-dpaa2/ethsw/ethsw.c
@@ -719,6 +719,9 @@ static int port_vlans_add(struct net_device *netdev,
 	struct ethsw_port_priv *port_priv = netdev_priv(netdev);
 	int vid, err;
 
+	if (netif_is_bridge_master(vlan->obj.orig_dev))
+		return -EOPNOTSUPP;
+
 	if (switchdev_trans_ph_prepare(trans))
 		return 0;
 
@@ -873,6 +876,9 @@ static int port_vlans_del(struct net_device *netdev,
 	struct ethsw_port_priv *port_priv = netdev_priv(netdev);
 	int vid, err;
 
+	if (netif_is_bridge_master(vlan->obj.orig_dev))
+		return -EOPNOTSUPP;
+
 	for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) {
 		err = ethsw_port_del_vlan(port_priv, vid);
 		if (err)
-- 
cgit v1.2.3-59-g8ed1b


From 9c86ce2c1ae337fc10568a12aea812ed03de8319 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Wed, 30 May 2018 03:00:45 +0200
Subject: net: bridge: Notify about bridge VLANs

A driver might need to react to changes in settings of brentry VLANs.
Therefore send switchdev port notifications for these as well. Reuse
SWITCHDEV_OBJ_ID_PORT_VLAN for this purpose. Listeners should use
netif_is_bridge_master() on orig_dev to determine whether the
notification is about a bridge port or a bridge.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_vlan.c | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 602c869c9821..7df269092103 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -246,6 +246,10 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags)
 			goto out_filt;
 		v->brvlan = masterv;
 		v->stats = masterv->stats;
+	} else {
+		err = br_switchdev_port_vlan_add(dev, v->vid, flags);
+		if (err && err != -EOPNOTSUPP)
+			goto out;
 	}
 
 	/* Add the dev mac and count the vlan only if it's usable */
@@ -281,6 +285,8 @@ out_filt:
 			br_vlan_put_master(masterv);
 			v->brvlan = NULL;
 		}
+	} else {
+		br_switchdev_port_vlan_del(dev, v->vid);
 	}
 
 	goto out;
@@ -306,6 +312,11 @@ static int __vlan_del(struct net_bridge_vlan *v)
 		err = __vlan_vid_del(p->dev, p->br, v->vid);
 		if (err)
 			goto out;
+	} else {
+		err = br_switchdev_port_vlan_del(v->br->dev, v->vid);
+		if (err && err != -EOPNOTSUPP)
+			goto out;
+		err = 0;
 	}
 
 	if (br_vlan_should_use(v)) {
@@ -558,16 +569,22 @@ static int br_vlan_add_existing(struct net_bridge *br,
 {
 	int err;
 
+	err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags);
+	if (err && err != -EOPNOTSUPP)
+		return err;
+
 	if (!br_vlan_is_brentry(vlan)) {
 		/* Trying to change flags of non-existent bridge vlan */
-		if (!(flags & BRIDGE_VLAN_INFO_BRENTRY))
-			return -EINVAL;
+		if (!(flags & BRIDGE_VLAN_INFO_BRENTRY)) {
+			err = -EINVAL;
+			goto err_flags;
+		}
 		/* It was only kept for port vlans, now make it real */
 		err = br_fdb_insert(br, NULL, br->dev->dev_addr,
 				    vlan->vid);
 		if (err) {
 			br_err(br, "failed to insert local address into bridge forwarding table\n");
-			return err;
+			goto err_fdb_insert;
 		}
 
 		refcount_inc(&vlan->refcnt);
@@ -580,6 +597,11 @@ static int br_vlan_add_existing(struct net_bridge *br,
 		*changed = true;
 
 	return 0;
+
+err_fdb_insert:
+err_flags:
+	br_switchdev_port_vlan_del(br->dev, vlan->vid);
+	return err;
 }
 
 /* Must be protected by RTNL.
-- 
cgit v1.2.3-59-g8ed1b


From 7edcb8ecbe6a285fc4608c22ad5d5c35b8809ff7 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Wed, 30 May 2018 03:00:49 +0200
Subject: mlxsw: spectrum_switchdev: Schedule respin during trans prepare

Since there's no special support for the bridge events, the driver
returns -EOPNOTSUPP, and thus the commit never happens. Therefore
schedule respin during the prepare stage: there's no real difference one
way or another.

This fixes the problem that mirror-to-gretap offload wouldn't adapt to
changes in bridge vlan configuration right away and another notification
would have to arrive for mlxsw to catch up.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index cbc8fabc90c5..8a15ac49cb5a 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -1697,7 +1697,7 @@ static int mlxsw_sp_port_obj_add(struct net_device *dev,
 		vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
 		err = mlxsw_sp_port_vlans_add(mlxsw_sp_port, vlan, trans);
 
-		if (switchdev_trans_ph_commit(trans)) {
+		if (switchdev_trans_ph_prepare(trans)) {
 			/* The event is emitted before the changes are actually
 			 * applied to the bridge. Therefore schedule the respin
 			 * call for later, so that the respin logic sees the
-- 
cgit v1.2.3-59-g8ed1b


From c5f732d7deb81a3c76c6d9b1045da800b22b196b Mon Sep 17 00:00:00 2001
From: Ganesh Goudar <ganeshgr@chelsio.com>
Date: Wed, 30 May 2018 17:15:50 +0530
Subject: cxgb4: Add FORCE_PAUSE bit to 32 bit port caps

Add FORCE_PAUSE bit to force local pause settings instead
of using auto negotiated values.

Signed-off-by: Santosh Rastapur <santosh@chelsio.com>
Signed-off-by: Casey Leedom <leedom@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c    | 10 +++++++++-
 drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h |  5 +++--
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index 39da7e3c804b..974a868a4824 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -3941,6 +3941,7 @@ static fw_port_cap32_t fwcaps16_to_caps32(fw_port_cap16_t caps16)
 	CAP16_TO_CAP32(FC_RX);
 	CAP16_TO_CAP32(FC_TX);
 	CAP16_TO_CAP32(ANEG);
+	CAP16_TO_CAP32(FORCE_PAUSE);
 	CAP16_TO_CAP32(MDIAUTO);
 	CAP16_TO_CAP32(MDISTRAIGHT);
 	CAP16_TO_CAP32(FEC_RS);
@@ -3982,6 +3983,7 @@ static fw_port_cap16_t fwcaps32_to_caps16(fw_port_cap32_t caps32)
 	CAP32_TO_CAP16(802_3_PAUSE);
 	CAP32_TO_CAP16(802_3_ASM_DIR);
 	CAP32_TO_CAP16(ANEG);
+	CAP32_TO_CAP16(FORCE_PAUSE);
 	CAP32_TO_CAP16(MDIAUTO);
 	CAP32_TO_CAP16(MDISTRAIGHT);
 	CAP32_TO_CAP16(FEC_RS);
@@ -4014,6 +4016,8 @@ static inline fw_port_cap32_t cc_to_fwcap_pause(enum cc_pause cc_pause)
 		fw_pause |= FW_PORT_CAP32_FC_RX;
 	if (cc_pause & PAUSE_TX)
 		fw_pause |= FW_PORT_CAP32_FC_TX;
+	if (!(cc_pause & PAUSE_AUTONEG))
+		fw_pause |= FW_PORT_CAP32_FORCE_PAUSE;
 
 	return fw_pause;
 }
@@ -4101,7 +4105,11 @@ int t4_link_l1cfg_core(struct adapter *adapter, unsigned int mbox,
 		rcap = lc->acaps | fw_fc | fw_fec | fw_mdi;
 	}
 
-	if (rcap & ~lc->pcaps) {
+	/* Note that older Firmware doesn't have FW_PORT_CAP32_FORCE_PAUSE, so
+	 * we need to exclude this from this check in order to maintain
+	 * compatibility ...
+	 */
+	if ((rcap & ~lc->pcaps) & ~FW_PORT_CAP32_FORCE_PAUSE) {
 		dev_err(adapter->pdev_dev,
 			"Requested Port Capabilities %#x exceed Physical Port Capabilities %#x\n",
 			rcap, lc->pcaps);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
index 2d91480a5a0e..f1967cf6d43c 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
@@ -2475,7 +2475,7 @@ enum fw_port_cap {
 	FW_PORT_CAP_MDISTRAIGHT		= 0x0400,
 	FW_PORT_CAP_FEC_RS		= 0x0800,
 	FW_PORT_CAP_FEC_BASER_RS	= 0x1000,
-	FW_PORT_CAP_FEC_RESERVED	= 0x2000,
+	FW_PORT_CAP_FORCE_PAUSE		= 0x2000,
 	FW_PORT_CAP_802_3_PAUSE		= 0x4000,
 	FW_PORT_CAP_802_3_ASM_DIR	= 0x8000,
 };
@@ -2522,7 +2522,8 @@ enum fw_port_mdi {
 #define	FW_PORT_CAP32_FEC_RESERVED1	0x02000000UL
 #define	FW_PORT_CAP32_FEC_RESERVED2	0x04000000UL
 #define	FW_PORT_CAP32_FEC_RESERVED3	0x08000000UL
-#define	FW_PORT_CAP32_RESERVED2		0xf0000000UL
+#define FW_PORT_CAP32_FORCE_PAUSE	0x10000000UL
+#define FW_PORT_CAP32_RESERVED2		0xe0000000UL
 
 #define FW_PORT_CAP32_SPEED_S	0
 #define FW_PORT_CAP32_SPEED_M	0xfff
-- 
cgit v1.2.3-59-g8ed1b


From 35aada99b5bfa7a9360f9653377688556f71edcb Mon Sep 17 00:00:00 2001
From: Donald Sharp <sharpd@cumulusnetworks.com>
Date: Wed, 30 May 2018 08:27:32 -0400
Subject: rtnetlink: Add more well known protocol values

FRRouting installs routes into the kernel associated with
the originating protocol.  Add these values to the well
known values in rtnetlink.h.

Signed-off-by: Donald Sharp <sharpd@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/rtnetlink.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index cabb210c93af..7d8502313c99 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -254,6 +254,11 @@ enum {
 #define RTPROT_DHCP	16      /* DHCP client */
 #define RTPROT_MROUTED	17      /* Multicast daemon */
 #define RTPROT_BABEL	42      /* Babel daemon */
+#define RTPROT_BGP	186     /* BGP Routes */
+#define RTPROT_ISIS	187     /* ISIS Routes */
+#define RTPROT_OSPF	188     /* OSPF Routes */
+#define RTPROT_RIP	189     /* RIP Routes */
+#define RTPROT_EIGRP	192     /* EIGRP Routes */
 
 /* rtm_scope
 
-- 
cgit v1.2.3-59-g8ed1b


From 1865ea9adbfaf341c5cd5d8f7d384f19948b2fe9 Mon Sep 17 00:00:00 2001
From: Ilan Tayari <ilant@mellanox.com>
Date: Wed, 30 May 2018 10:59:49 -0700
Subject: net/mlx5: Add temperature warning event to log

Temperature warning event is sent by FW to indicate high temperature
as detected by one of the sensors on the board.
Add handling of this event by writing the numbers of the alert sensors
to the kernel log.

Signed-off-by: Ilan Tayari <ilant@mellanox.com>
Signed-off-by: Adi Nissim <adin@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c | 23 +++++++++++++++++++++++
 include/linux/mlx5/device.h                  |  7 +++++++
 include/linux/mlx5/mlx5_ifc.h                |  2 +-
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 1814f803bd2c..1a3a2b9a7232 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -144,6 +144,8 @@ static const char *eqe_type_str(u8 type)
 		return "MLX5_EVENT_TYPE_GPIO_EVENT";
 	case MLX5_EVENT_TYPE_PORT_MODULE_EVENT:
 		return "MLX5_EVENT_TYPE_PORT_MODULE_EVENT";
+	case MLX5_EVENT_TYPE_TEMP_WARN_EVENT:
+		return "MLX5_EVENT_TYPE_TEMP_WARN_EVENT";
 	case MLX5_EVENT_TYPE_REMOTE_CONFIG:
 		return "MLX5_EVENT_TYPE_REMOTE_CONFIG";
 	case MLX5_EVENT_TYPE_DB_BF_CONGESTION:
@@ -396,6 +398,20 @@ static void general_event_handler(struct mlx5_core_dev *dev,
 	}
 }
 
+static void mlx5_temp_warning_event(struct mlx5_core_dev *dev,
+				    struct mlx5_eqe *eqe)
+{
+	u64 value_lsb;
+	u64 value_msb;
+
+	value_lsb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_lsb);
+	value_msb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_msb);
+
+	mlx5_core_warn(dev,
+		       "High temperature on sensors with bit set %llx %llx",
+		       value_msb, value_lsb);
+}
+
 /* caller must eventually call mlx5_cq_put on the returned cq */
 static struct mlx5_core_cq *mlx5_eq_cq_get(struct mlx5_eq *eq, u32 cqn)
 {
@@ -550,6 +566,10 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr)
 			mlx5_fpga_event(dev, eqe->type, &eqe->data.raw);
 			break;
 
+		case MLX5_EVENT_TYPE_TEMP_WARN_EVENT:
+			mlx5_temp_warning_event(dev, eqe);
+			break;
+
 		case MLX5_EVENT_TYPE_GENERAL_EVENT:
 			general_event_handler(dev, eqe);
 			break;
@@ -827,6 +847,9 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
 		async_event_mask |= (1ull << MLX5_EVENT_TYPE_DCT_DRAINED);
 
 
+	if (MLX5_CAP_GEN(dev, temp_warn_event))
+		async_event_mask |= (1ull << MLX5_EVENT_TYPE_TEMP_WARN_EVENT);
+
 	err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD,
 				 MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD,
 				 "mlx5_cmd_eq", MLX5_EQ_TYPE_ASYNC);
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index db0332a6d23c..c1095e08f0c8 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -314,6 +314,7 @@ enum mlx5_event {
 	MLX5_EVENT_TYPE_PORT_CHANGE	   = 0x09,
 	MLX5_EVENT_TYPE_GPIO_EVENT	   = 0x15,
 	MLX5_EVENT_TYPE_PORT_MODULE_EVENT  = 0x16,
+	MLX5_EVENT_TYPE_TEMP_WARN_EVENT    = 0x17,
 	MLX5_EVENT_TYPE_REMOTE_CONFIG	   = 0x19,
 	MLX5_EVENT_TYPE_GENERAL_EVENT	   = 0x22,
 	MLX5_EVENT_TYPE_PPS_EVENT          = 0x25,
@@ -626,6 +627,11 @@ struct mlx5_eqe_dct {
 	__be32  dctn;
 };
 
+struct mlx5_eqe_temp_warning {
+	__be64 sensor_warning_msb;
+	__be64 sensor_warning_lsb;
+} __packed;
+
 union ev_data {
 	__be32				raw[7];
 	struct mlx5_eqe_cmd		cmd;
@@ -642,6 +648,7 @@ union ev_data {
 	struct mlx5_eqe_port_module	port_module;
 	struct mlx5_eqe_pps		pps;
 	struct mlx5_eqe_dct             dct;
+	struct mlx5_eqe_temp_warning	temp_warning;
 } __packed;
 
 struct mlx5_eqe {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 05b480fae27d..ca6c0dfb5ffe 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -912,7 +912,7 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         log_max_msg[0x5];
 	u8         reserved_at_1c8[0x4];
 	u8         max_tc[0x4];
-	u8         reserved_at_1d0[0x1];
+	u8         temp_warn_event[0x1];
 	u8         dcbx[0x1];
 	u8         general_notification_event[0x1];
 	u8         reserved_at_1d3[0x2];
-- 
cgit v1.2.3-59-g8ed1b


From 1f0cf89b09305743fca5898660a7be83aab38a74 Mon Sep 17 00:00:00 2001
From: Ilan Tayari <ilant@mellanox.com>
Date: Wed, 30 May 2018 10:59:50 -0700
Subject: net/mlx5: Add FPGA QP error event

The FPGA queue pair (QP) event fires whenever a QP on the FPGA
transitions to the error state.

At this stage, this event is unrecoverable, it may become recoverable
in the future.

Signed-off-by: Ilan Tayari <ilant@mellanox.com>
Signed-off-by: Adi Nissim <adin@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c |  7 +++++--
 include/linux/mlx5/device.h                  |  1 +
 include/linux/mlx5/mlx5_ifc.h                |  1 +
 include/linux/mlx5/mlx5_ifc_fpga.h           | 16 ++++++++++++++++
 4 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 1a3a2b9a7232..406c23862f5f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -164,6 +164,8 @@ static const char *eqe_type_str(u8 type)
 		return "MLX5_EVENT_TYPE_NIC_VPORT_CHANGE";
 	case MLX5_EVENT_TYPE_FPGA_ERROR:
 		return "MLX5_EVENT_TYPE_FPGA_ERROR";
+	case MLX5_EVENT_TYPE_FPGA_QP_ERROR:
+		return "MLX5_EVENT_TYPE_FPGA_QP_ERROR";
 	case MLX5_EVENT_TYPE_GENERAL_EVENT:
 		return "MLX5_EVENT_TYPE_GENERAL_EVENT";
 	default:
@@ -563,6 +565,7 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr)
 			break;
 
 		case MLX5_EVENT_TYPE_FPGA_ERROR:
+		case MLX5_EVENT_TYPE_FPGA_QP_ERROR:
 			mlx5_fpga_event(dev, eqe->type, &eqe->data.raw);
 			break;
 
@@ -842,11 +845,11 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
 		async_event_mask |= (1ull << MLX5_EVENT_TYPE_PPS_EVENT);
 
 	if (MLX5_CAP_GEN(dev, fpga))
-		async_event_mask |= (1ull << MLX5_EVENT_TYPE_FPGA_ERROR);
+		async_event_mask |= (1ull << MLX5_EVENT_TYPE_FPGA_ERROR) |
+				    (1ull << MLX5_EVENT_TYPE_FPGA_QP_ERROR);
 	if (MLX5_CAP_GEN_MAX(dev, dct))
 		async_event_mask |= (1ull << MLX5_EVENT_TYPE_DCT_DRAINED);
 
-
 	if (MLX5_CAP_GEN(dev, temp_warn_event))
 		async_event_mask |= (1ull << MLX5_EVENT_TYPE_TEMP_WARN_EVENT);
 
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index c1095e08f0c8..0f006cf8343d 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -331,6 +331,7 @@ enum mlx5_event {
 	MLX5_EVENT_TYPE_DCT_DRAINED        = 0x1c,
 
 	MLX5_EVENT_TYPE_FPGA_ERROR         = 0x20,
+	MLX5_EVENT_TYPE_FPGA_QP_ERROR      = 0x21,
 };
 
 enum {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index ca6c0dfb5ffe..8e0b8865f91e 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -60,6 +60,7 @@ enum {
 	MLX5_EVENT_TYPE_CODING_COMMAND_INTERFACE_COMPLETION        = 0xa,
 	MLX5_EVENT_TYPE_CODING_PAGE_REQUEST                        = 0xb,
 	MLX5_EVENT_TYPE_CODING_FPGA_ERROR                          = 0x20,
+	MLX5_EVENT_TYPE_CODING_FPGA_QP_ERROR                       = 0x21
 };
 
 enum {
diff --git a/include/linux/mlx5/mlx5_ifc_fpga.h b/include/linux/mlx5/mlx5_ifc_fpga.h
index 193091537cb6..64d0f40d4cc3 100644
--- a/include/linux/mlx5/mlx5_ifc_fpga.h
+++ b/include/linux/mlx5/mlx5_ifc_fpga.h
@@ -470,6 +470,22 @@ struct mlx5_ifc_ipsec_counters_bits {
 	u8         dropped_cmd[0x40];
 };
 
+enum {
+	MLX5_FPGA_QP_ERROR_EVENT_SYNDROME_RETRY_COUNTER_EXPIRED  = 0x1,
+	MLX5_FPGA_QP_ERROR_EVENT_SYNDROME_RNR_EXPIRED            = 0x2,
+};
+
+struct mlx5_ifc_fpga_qp_error_event_bits {
+	u8         reserved_at_0[0x40];
+
+	u8         reserved_at_40[0x18];
+	u8         syndrome[0x8];
+
+	u8         reserved_at_60[0x60];
+
+	u8         reserved_at_c0[0x8];
+	u8         fpga_qpn[0x18];
+};
 enum mlx5_ifc_fpga_ipsec_response_syndrome {
 	MLX5_FPGA_IPSEC_RESPONSE_SUCCESS = 0,
 	MLX5_FPGA_IPSEC_RESPONSE_ILLEGAL_REQUEST = 1,
-- 
cgit v1.2.3-59-g8ed1b


From e2b3e493784fe07eac385611acbaaf7bff86c496 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 30 May 2018 23:51:54 +0200
Subject: net: ti: cpsw: include gpio/consumer.h

On platforms that don't always enable CONFIG_GPIOLIB, we run into
a build failure:

drivers/net/ethernet/ti/cpsw.c: In function 'cpsw_probe':
drivers/net/ethernet/ti/cpsw.c:3006:9: error: implicit declaration of function 'devm_gpiod_get_array_optional' [-Werror=implicit-function-declaration]
  mode = devm_gpiod_get_array_optional(&pdev->dev, "mode", GPIOD_OUT_LOW);
         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
drivers/net/ethernet/ti/cpsw.c:3006:59: error: 'GPIOD_OUT_LOW' undeclared (first use in this function); did you mean 'GPIOF_INIT_LOW'?
  mode = devm_gpiod_get_array_optional(&pdev->dev, "mode", GPIOD_OUT_LOW);

Since we cannot rely on this to be visible from gpio.h, we have to include
gpio/consumer.h directly.

Fixes: 2652113ff043 ("net: ethernet: ti: Allow most drivers with COMPILE_TEST")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/cpsw.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 643cd2d9dfb6..534596ce00d3 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -29,7 +29,7 @@
 #include <linux/workqueue.h>
 #include <linux/delay.h>
 #include <linux/pm_runtime.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/of.h>
 #include <linux/of_mdio.h>
 #include <linux/of_net.h>
-- 
cgit v1.2.3-59-g8ed1b


From 7bb8c9969d919517fc379cacebd8fa93704173db Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Thu, 31 May 2018 00:15:42 +0200
Subject: net: dsa: mv88e6xxx: Be explicit about DT or pdata

Make it explicit that either device tree is used or platform data.  If
neither is available, abort the probe.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Fixes: 877b7cb0b6f2 ("net: dsa: mv88e6xxx: Add minimal platform_data support")
Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/chip.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 12df00f593b7..437cd6eb4faa 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -4389,6 +4389,9 @@ static int mv88e6xxx_probe(struct mdio_device *mdiodev)
 	int port;
 	int err;
 
+	if (!np && !pdata)
+		return -EINVAL;
+
 	if (np)
 		compat_info = of_device_get_match_data(dev);
 
-- 
cgit v1.2.3-59-g8ed1b


From ccf8dbcd062a930e64741c939ca784d15316aa0c Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 30 May 2018 15:20:52 -0700
Subject: rtnetlink: Remove VLA usage

In the quest to remove all stack VLA usage from the kernel[1], this
allocates the maximum size expected for all possible types and adds
sanity-checks at both registration and usage to make sure nothing gets
out of sync. This matches the proposed VLA solution for nfnetlink[2]. The
values chosen here were based on finding assignments for .maxtype and
.slave_maxtype and manually counting the enums:

slave_maxtype (max 33):
	IFLA_BRPORT_MAX     33
	IFLA_BOND_SLAVE_MAX  9

maxtype (max 45):
	IFLA_BOND_MAX       28
	IFLA_BR_MAX         45
	__IFLA_CAIF_HSI_MAX  8
	IFLA_CAIF_MAX        4
	IFLA_CAN_MAX        16
	IFLA_GENEVE_MAX     12
	IFLA_GRE_MAX        25
	IFLA_GTP_MAX         5
	IFLA_HSR_MAX         7
	IFLA_IPOIB_MAX       4
	IFLA_IPTUN_MAX      21
	IFLA_IPVLAN_MAX      3
	IFLA_MACSEC_MAX     15
	IFLA_MACVLAN_MAX     7
	IFLA_PPP_MAX         2
	__IFLA_RMNET_MAX     4
	IFLA_VLAN_MAX        6
	IFLA_VRF_MAX         2
	IFLA_VTI_MAX         7
	IFLA_VXLAN_MAX      28
	VETH_INFO_MAX        2
	VXCAN_INFO_MAX       2

This additionally changes maxtype and slave_maxtype fields to unsigned,
since they're only ever using positive values.

[1] https://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com
[2] https://patchwork.kernel.org/patch/10439647/

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/rtnetlink.h |  4 ++--
 net/core/rtnetlink.c    | 18 ++++++++++++++++--
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index 14b6b3af8918..0bbaa5488423 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -64,7 +64,7 @@ struct rtnl_link_ops {
 	size_t			priv_size;
 	void			(*setup)(struct net_device *dev);
 
-	int			maxtype;
+	unsigned int		maxtype;
 	const struct nla_policy	*policy;
 	int			(*validate)(struct nlattr *tb[],
 					    struct nlattr *data[],
@@ -92,7 +92,7 @@ struct rtnl_link_ops {
 	unsigned int		(*get_num_tx_queues)(void);
 	unsigned int		(*get_num_rx_queues)(void);
 
-	int			slave_maxtype;
+	unsigned int		slave_maxtype;
 	const struct nla_policy	*slave_policy;
 	int			(*slave_changelink)(struct net_device *dev,
 						    struct net_device *slave_dev,
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 80802546c279..8ca49a0e13fb 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -59,6 +59,9 @@
 #include <net/rtnetlink.h>
 #include <net/net_namespace.h>
 
+#define RTNL_MAX_TYPE		48
+#define RTNL_SLAVE_MAX_TYPE	36
+
 struct rtnl_link {
 	rtnl_doit_func		doit;
 	rtnl_dumpit_func	dumpit;
@@ -389,6 +392,11 @@ int rtnl_link_register(struct rtnl_link_ops *ops)
 {
 	int err;
 
+	/* Sanity-check max sizes to avoid stack buffer overflow. */
+	if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE ||
+		    ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE))
+		return -EINVAL;
+
 	rtnl_lock();
 	err = __rtnl_link_register(ops);
 	rtnl_unlock();
@@ -2902,13 +2910,16 @@ replay:
 	}
 
 	if (1) {
-		struct nlattr *attr[ops ? ops->maxtype + 1 : 1];
-		struct nlattr *slave_attr[m_ops ? m_ops->slave_maxtype + 1 : 1];
+		struct nlattr *attr[RTNL_MAX_TYPE + 1];
+		struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1];
 		struct nlattr **data = NULL;
 		struct nlattr **slave_data = NULL;
 		struct net *dest_net, *link_net = NULL;
 
 		if (ops) {
+			if (ops->maxtype > RTNL_MAX_TYPE)
+				return -EINVAL;
+
 			if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
 				err = nla_parse_nested(attr, ops->maxtype,
 						       linkinfo[IFLA_INFO_DATA],
@@ -2925,6 +2936,9 @@ replay:
 		}
 
 		if (m_ops) {
+			if (ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)
+				return -EINVAL;
+
 			if (m_ops->slave_maxtype &&
 			    linkinfo[IFLA_INFO_SLAVE_DATA]) {
 				err = nla_parse_nested(slave_attr,
-- 
cgit v1.2.3-59-g8ed1b


From 4b8e6ac41a594ea67ded6af6af5935f03221ea4c Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Thu, 31 May 2018 02:05:07 +0000
Subject: virtio_net: fix error return code in virtnet_probe()

Fix to return a negative error code from the failover create fail error
handling case instead of 0, as done elsewhere in this function.

Fixes: ba5e4426e80e ("virtio_net: Extend virtio to use VF datapath when available")
Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 8f08a3e1bbaa..2d55e2af7cb6 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2935,8 +2935,10 @@ static int virtnet_probe(struct virtio_device *vdev)
 
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_STANDBY)) {
 		vi->failover = net_failover_create(vi->dev);
-		if (IS_ERR(vi->failover))
+		if (IS_ERR(vi->failover)) {
+			err = PTR_ERR(vi->failover);
 			goto free_vqs;
+		}
 	}
 
 	err = register_netdev(dev);
-- 
cgit v1.2.3-59-g8ed1b


From 7849958b51aa392e3592b6b8181db0baad979b0b Mon Sep 17 00:00:00 2001
From: kbuild test robot <fengguang.wu@intel.com>
Date: Wed, 23 May 2018 18:53:48 +0800
Subject: netfilter: fix ptr_ret.cocci warnings

net/netfilter/nft_numgen.c:117:1-3: WARNING: PTR_ERR_OR_ZERO can be used
net/netfilter/nft_hash.c:180:1-3: WARNING: PTR_ERR_OR_ZERO can be used
net/netfilter/nft_hash.c:223:1-3: WARNING: PTR_ERR_OR_ZERO can be used

 Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR

Generated by: scripts/coccinelle/api/ptr_ret.cocci

Fixes: b9ccc07e3f31 ("netfilter: nft_hash: add map lookups for hashing operations")
Fixes: d734a2888922 ("netfilter: nft_numgen: add map lookups for numgen statements")
CC: Laura Garcia Liebana <nevola@gmail.com>
Signed-off-by: kbuild test robot <fengguang.wu@intel.com>
Acked-by: Laura Garcia Liebana <nevola@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_hash.c   | 10 ++--------
 net/netfilter/nft_numgen.c |  5 +----
 2 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index f0fc21f88775..c2d237144f74 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -177,10 +177,7 @@ static int nft_jhash_map_init(const struct nft_ctx *ctx,
 	priv->map = nft_set_lookup_global(ctx->net, ctx->table,
 					  tb[NFTA_HASH_SET_NAME],
 					  tb[NFTA_HASH_SET_ID], genmask);
-	if (IS_ERR(priv->map))
-		return PTR_ERR(priv->map);
-
-	return 0;
+	return PTR_ERR_OR_ZERO(priv->map);
 }
 
 static int nft_symhash_init(const struct nft_ctx *ctx,
@@ -220,10 +217,7 @@ static int nft_symhash_map_init(const struct nft_ctx *ctx,
 	priv->map = nft_set_lookup_global(ctx->net, ctx->table,
 					  tb[NFTA_HASH_SET_NAME],
 					  tb[NFTA_HASH_SET_ID], genmask);
-	if (IS_ERR(priv->map))
-		return PTR_ERR(priv->map);
-
-	return 0;
+	return PTR_ERR_OR_ZERO(priv->map);
 }
 
 static int nft_jhash_dump(struct sk_buff *skb,
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index cdbc62a53933..1f4d0854cf70 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -114,10 +114,7 @@ static int nft_ng_inc_map_init(const struct nft_ctx *ctx,
 					  tb[NFTA_NG_SET_NAME],
 					  tb[NFTA_NG_SET_ID], genmask);
 
-	if (IS_ERR(priv->map))
-		return PTR_ERR(priv->map);
-
-	return 0;
+	return PTR_ERR_OR_ZERO(priv->map);
 }
 
 static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg,
-- 
cgit v1.2.3-59-g8ed1b


From 554ced0a6e2946562c20d9fffdbaf2aa7da36b1b Mon Sep 17 00:00:00 2001
From: Máté Eckl <ecklm94@gmail.com>
Date: Mon, 28 May 2018 09:15:33 +0200
Subject: netfilter: nf_tables: add support for native socket matching
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now it can only match the transparent flag of an ip/ipv6 socket.

Signed-off-by: Máté Eckl <ecklm94@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  25 ++++++
 net/netfilter/Kconfig                    |   9 ++
 net/netfilter/Makefile                   |   1 +
 net/netfilter/nft_socket.c               | 143 +++++++++++++++++++++++++++++++
 4 files changed, 178 insertions(+)
 create mode 100644 net/netfilter/nft_socket.c

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 9c71f024f9cc..3d46c82a5ebd 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -904,6 +904,31 @@ enum nft_rt_attributes {
 };
 #define NFTA_RT_MAX		(__NFTA_RT_MAX - 1)
 
+/**
+ * enum nft_socket_attributes - nf_tables socket expression netlink attributes
+ *
+ * @NFTA_SOCKET_KEY: socket key to match
+ * @NFTA_SOCKET_DREG: destination register
+ */
+enum nft_socket_attributes {
+	NFTA_SOCKET_UNSPEC,
+	NFTA_SOCKET_KEY,
+	NFTA_SOCKET_DREG,
+	__NFTA_SOCKET_MAX
+};
+#define NFTA_SOCKET_MAX		(__NFTA_SOCKET_MAX - 1)
+
+/*
+ * enum nft_socket_keys - nf_tables socket expression keys
+ *
+ * @NFT_SOCKET_TRANSPARENT: Value of the IP(V6)_TRANSPARENT socket option_
+ */
+enum nft_socket_keys {
+	NFT_SOCKET_TRANSPARENT,
+	__NFT_SOCKET_MAX
+};
+#define NFT_SOCKET_MAX	(__NFT_SOCKET_MAX - 1)
+
 /**
  * enum nft_ct_keys - nf_tables ct expression keys
  *
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 3ec8886850b2..276e1e32f44e 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -613,6 +613,15 @@ config NFT_FIB_INET
 	  The lookup will be delegated to the IPv4 or IPv6 FIB depending
 	  on the protocol of the packet.
 
+config NFT_SOCKET
+	tristate "Netfilter nf_tables socket match support"
+	depends on IPV6 || IPV6=n
+	select NF_SOCKET_IPV4
+	select NF_SOCKET_IPV6 if IPV6
+	help
+	  This option allows matching for the presence or absence of a
+	  corresponding socket and its attributes.
+
 if NF_TABLES_NETDEV
 
 config NF_DUP_NETDEV
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 9b3434360d49..eec169555731 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -102,6 +102,7 @@ obj-$(CONFIG_NFT_FIB)		+= nft_fib.o
 obj-$(CONFIG_NFT_FIB_INET)	+= nft_fib_inet.o
 obj-$(CONFIG_NFT_FIB_NETDEV)	+= nft_fib_netdev.o
 obj-$(CONFIG_NF_OSF)		+= nf_osf.o
+obj-$(CONFIG_NFT_SOCKET)	+= nft_socket.o
 
 # nf_tables netdev
 obj-$(CONFIG_NFT_DUP_NETDEV)	+= nft_dup_netdev.o
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
new file mode 100644
index 000000000000..d86337068ecb
--- /dev/null
+++ b/net/netfilter/nft_socket.c
@@ -0,0 +1,143 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/module.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_socket.h>
+#include <net/inet_sock.h>
+
+struct nft_socket {
+	enum nft_socket_keys		key:8;
+	union {
+		enum nft_registers	dreg:8;
+	};
+};
+
+static void nft_socket_eval(const struct nft_expr *expr,
+			    struct nft_regs *regs,
+			    const struct nft_pktinfo *pkt)
+{
+	const struct nft_socket *priv = nft_expr_priv(expr);
+	struct sk_buff *skb = pkt->skb;
+	struct sock *sk = skb->sk;
+	u32 *dest = &regs->data[priv->dreg];
+
+	if (!sk)
+		switch(nft_pf(pkt)) {
+		case NFPROTO_IPV4:
+			sk = nf_sk_lookup_slow_v4(nft_net(pkt), skb, nft_in(pkt));
+			break;
+#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
+		case NFPROTO_IPV6:
+			sk = nf_sk_lookup_slow_v6(nft_net(pkt), skb, nft_in(pkt));
+			break;
+#endif
+		default:
+			WARN_ON_ONCE(1);
+			regs->verdict.code = NFT_BREAK;
+			return;
+		}
+
+	if(!sk) {
+		nft_reg_store8(dest, 0);
+		return;
+	}
+
+	/* So that subsequent socket matching not to require other lookups. */
+	skb->sk = sk;
+
+	switch(priv->key) {
+	case NFT_SOCKET_TRANSPARENT:
+		nft_reg_store8(dest, nf_sk_is_transparent(sk));
+		break;
+	default:
+		WARN_ON(1);
+		regs->verdict.code = NFT_BREAK;
+	}
+}
+
+static const struct nla_policy nft_socket_policy[NFTA_SOCKET_MAX + 1] = {
+	[NFTA_SOCKET_KEY]		= { .type = NLA_U32 },
+	[NFTA_SOCKET_DREG]		= { .type = NLA_U32 },
+};
+
+static int nft_socket_init(const struct nft_ctx *ctx,
+			   const struct nft_expr *expr,
+			   const struct nlattr * const tb[])
+{
+	struct nft_socket *priv = nft_expr_priv(expr);
+	unsigned int len;
+
+	if (!tb[NFTA_SOCKET_DREG] || !tb[NFTA_SOCKET_KEY])
+		return -EINVAL;
+
+	switch(ctx->family) {
+	case NFPROTO_IPV4:
+#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
+	case NFPROTO_IPV6:
+#endif
+	case NFPROTO_INET:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	priv->key = ntohl(nla_get_u32(tb[NFTA_SOCKET_KEY]));
+	switch(priv->key) {
+	case NFT_SOCKET_TRANSPARENT:
+		len = sizeof(u8);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	priv->dreg = nft_parse_register(tb[NFTA_SOCKET_DREG]);
+	return nft_validate_register_store(ctx, priv->dreg, NULL,
+					   NFT_DATA_VALUE, len);
+}
+
+static int nft_socket_dump(struct sk_buff *skb,
+			   const struct nft_expr *expr)
+{
+	const struct nft_socket *priv = nft_expr_priv(expr);
+
+	if (nla_put_u32(skb, NFTA_SOCKET_KEY, htonl(priv->key)))
+		return -1;
+	if (nft_dump_register(skb, NFTA_SOCKET_DREG, priv->dreg))
+		return -1;
+	return 0;
+}
+
+static struct nft_expr_type nft_socket_type;
+static const struct nft_expr_ops nft_socket_ops = {
+	.type		= &nft_socket_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_socket)),
+	.eval		= nft_socket_eval,
+	.init		= nft_socket_init,
+	.dump		= nft_socket_dump,
+};
+
+static struct nft_expr_type nft_socket_type __read_mostly = {
+	.name		= "socket",
+	.ops		= &nft_socket_ops,
+	.policy		= nft_socket_policy,
+	.maxattr	= NFTA_SOCKET_MAX,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nft_socket_module_init(void)
+{
+	return nft_register_expr(&nft_socket_type);
+}
+
+static void __exit nft_socket_module_exit(void)
+{
+	nft_unregister_expr(&nft_socket_type);
+}
+
+module_init(nft_socket_module_init);
+module_exit(nft_socket_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Máté Eckl");
+MODULE_DESCRIPTION("nf_tables socket match module");
-- 
cgit v1.2.3-59-g8ed1b


From 1a893b44de4528887e7dabcdce7151ca2a8ee238 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Wed, 30 May 2018 11:06:22 +0200
Subject: netfilter: nf_tables: Add audit support to log statement

This extends log statement to support the behaviour achieved with
AUDIT target in iptables.

Audit logging is enabled via a pseudo log level 8. In this case any
other settings like log prefix are ignored since audit log format is
fixed.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  5 ++
 net/netfilter/nft_log.c                  | 92 +++++++++++++++++++++++++++++++-
 2 files changed, 96 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 3d46c82a5ebd..5c7eb9b9f6d6 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1080,6 +1080,11 @@ enum nft_log_attributes {
 };
 #define NFTA_LOG_MAX		(__NFTA_LOG_MAX - 1)
 
+/**
+ * LOGLEVEL_AUDIT - a pseudo log level enabling audit logging
+ */
+#define LOGLEVEL_AUDIT		8
+
 /**
  * enum nft_queue_attributes - nf_tables queue expression netlink attributes
  *
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index a27be36dc0af..7eef1cffbf1b 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -9,12 +9,15 @@
  * Development of this code funded by Astaro AG (http://www.astaro.com/)
  */
 
+#include <linux/audit.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/netlink.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter/nf_tables.h>
+#include <net/ipv6.h>
+#include <net/ip.h>
 #include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nf_log.h>
 #include <linux/netdevice.h>
@@ -26,12 +29,93 @@ struct nft_log {
 	char			*prefix;
 };
 
+static bool audit_ip4(struct audit_buffer *ab, struct sk_buff *skb)
+{
+	struct iphdr _iph;
+	const struct iphdr *ih;
+
+	ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_iph), &_iph);
+	if (!ih)
+		return false;
+
+	audit_log_format(ab, " saddr=%pI4 daddr=%pI4 proto=%hhu",
+			 &ih->saddr, &ih->daddr, ih->protocol);
+
+	return true;
+}
+
+static bool audit_ip6(struct audit_buffer *ab, struct sk_buff *skb)
+{
+	struct ipv6hdr _ip6h;
+	const struct ipv6hdr *ih;
+	u8 nexthdr;
+	__be16 frag_off;
+
+	ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h);
+	if (!ih)
+		return false;
+
+	nexthdr = ih->nexthdr;
+	ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h), &nexthdr, &frag_off);
+
+	audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu",
+			 &ih->saddr, &ih->daddr, nexthdr);
+
+	return true;
+}
+
+static void nft_log_eval_audit(const struct nft_pktinfo *pkt)
+{
+	struct sk_buff *skb = pkt->skb;
+	struct audit_buffer *ab;
+	int fam = -1;
+
+	if (!audit_enabled)
+		return;
+
+	ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT);
+	if (!ab)
+		return;
+
+	audit_log_format(ab, "mark=%#x", skb->mark);
+
+	switch (nft_pf(pkt)) {
+	case NFPROTO_BRIDGE:
+		switch (eth_hdr(skb)->h_proto) {
+		case htons(ETH_P_IP):
+			fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1;
+			break;
+		case htons(ETH_P_IPV6):
+			fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1;
+			break;
+		}
+		break;
+	case NFPROTO_IPV4:
+		fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1;
+		break;
+	case NFPROTO_IPV6:
+		fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1;
+		break;
+	}
+
+	if (fam == -1)
+		audit_log_format(ab, " saddr=? daddr=? proto=-1");
+
+	audit_log_end(ab);
+}
+
 static void nft_log_eval(const struct nft_expr *expr,
 			 struct nft_regs *regs,
 			 const struct nft_pktinfo *pkt)
 {
 	const struct nft_log *priv = nft_expr_priv(expr);
 
+	if (priv->loginfo.type == NF_LOG_TYPE_LOG &&
+	    priv->loginfo.u.log.level == LOGLEVEL_AUDIT) {
+		nft_log_eval_audit(pkt);
+		return;
+	}
+
 	nf_log_packet(nft_net(pkt), nft_pf(pkt), nft_hook(pkt), pkt->skb,
 		      nft_in(pkt), nft_out(pkt), &priv->loginfo, "%s",
 		      priv->prefix);
@@ -84,7 +168,7 @@ static int nft_log_init(const struct nft_ctx *ctx,
 		} else {
 			li->u.log.level = LOGLEVEL_WARNING;
 		}
-		if (li->u.log.level > LOGLEVEL_DEBUG) {
+		if (li->u.log.level > LOGLEVEL_AUDIT) {
 			err = -EINVAL;
 			goto err1;
 		}
@@ -112,6 +196,9 @@ static int nft_log_init(const struct nft_ctx *ctx,
 		break;
 	}
 
+	if (li->u.log.level == LOGLEVEL_AUDIT)
+		return 0;
+
 	err = nf_logger_find_get(ctx->family, li->type);
 	if (err < 0)
 		goto err1;
@@ -133,6 +220,9 @@ static void nft_log_destroy(const struct nft_ctx *ctx,
 	if (priv->prefix != nft_log_null_prefix)
 		kfree(priv->prefix);
 
+	if (li->u.log.level == LOGLEVEL_AUDIT)
+		return;
+
 	nf_logger_put(ctx->family, li->type);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From a654de8fdc1815676ab750e70cab231fc814c29f Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 30 May 2018 20:18:57 +0200
Subject: netfilter: nf_tables: fix chain dependency validation

The following ruleset:

 add table ip filter
 add chain ip filter input { type filter hook input priority 4; }
 add chain ip filter ap
 add rule ip filter input jump ap
 add rule ip filter ap masquerade

results in a panic, because the masquerade extension should be rejected
from the filter chain. The existing validation is missing a chain
dependency check when the rule is added to the non-base chain.

This patch fixes the problem by walking down the rules from the
basechains, searching for either immediate or lookup expressions, then
jumping to non-base chains and again walking down the rules to perform
the expression validation, so we make sure the full ruleset graph is
validated. This is done only once from the commit phase, in case of
problem, we abort the transaction and perform fine grain validation for
error reporting. This patch requires 003087911af2 ("netfilter:
nfnetlink: allow commit to fail") to achieve this behaviour.

This patch also adds a cleanup callback to nfnl batch interface to reset
the validate state from the exit path.

As a result of this patch, nf_tables_check_loops() doesn't use
->validate to check for loops, instead it just checks for immediate
expressions.

Reported-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nfnetlink.h    |   1 +
 include/net/netfilter/nf_tables.h      |   2 +
 include/net/netfilter/nf_tables_core.h |   8 ++
 include/net/netns/nftables.h           |   1 +
 net/netfilter/nf_tables_api.c          | 152 ++++++++++++++++++++++++++++-----
 net/netfilter/nfnetlink.c              |   2 +
 net/netfilter/nft_immediate.c          |  27 ++++--
 net/netfilter/nft_lookup.c             |  47 ++++++++++
 8 files changed, 208 insertions(+), 32 deletions(-)

diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index 34551f8aaf9d..3ecc3050be0e 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -31,6 +31,7 @@ struct nfnetlink_subsystem {
 	const struct nfnl_callback *cb;	/* callback for individual types */
 	int (*commit)(struct net *net, struct sk_buff *skb);
 	int (*abort)(struct net *net, struct sk_buff *skb);
+	void (*cleanup)(struct net *net);
 	bool (*valid_genid)(struct net *net, u32 genid);
 };
 
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 6b9ddb99f3a8..435c32d8a995 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -874,6 +874,8 @@ struct nft_chain {
 	struct nft_rule			**rules_next;
 };
 
+int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain);
+
 enum nft_chain_types {
 	NFT_CHAIN_T_DEFAULT = 0,
 	NFT_CHAIN_T_ROUTE,
diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index cd6915b6c054..e0c0c2558ec4 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -2,6 +2,8 @@
 #ifndef _NET_NF_TABLES_CORE_H
 #define _NET_NF_TABLES_CORE_H
 
+#include <net/netfilter/nf_tables.h>
+
 extern struct nft_expr_type nft_imm_type;
 extern struct nft_expr_type nft_cmp_type;
 extern struct nft_expr_type nft_lookup_type;
@@ -23,6 +25,12 @@ struct nft_cmp_fast_expr {
 	u8			len;
 };
 
+struct nft_immediate_expr {
+	struct nft_data		data;
+	enum nft_registers	dreg:8;
+	u8			dlen;
+};
+
 /* Calculate the mask for the nft_cmp_fast expression. On big endian the
  * mask needs to include the *upper* bytes when interpreting that data as
  * something smaller than the full u32, therefore a cpu_to_le32 is done.
diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h
index 29c3851b486a..94767ea3a490 100644
--- a/include/net/netns/nftables.h
+++ b/include/net/netns/nftables.h
@@ -9,6 +9,7 @@ struct netns_nftables {
 	struct list_head	commit_list;
 	unsigned int		base_seq;
 	u8			gencursor;
+	u8			validate_state;
 };
 
 #endif
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 3b2ad96a9a05..c785bc5a66f1 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -28,6 +28,28 @@ static LIST_HEAD(nf_tables_objects);
 static LIST_HEAD(nf_tables_flowtables);
 static u64 table_handle;
 
+enum {
+	NFT_VALIDATE_SKIP	= 0,
+	NFT_VALIDATE_NEED,
+	NFT_VALIDATE_DO,
+};
+
+static void nft_validate_state_update(struct net *net, u8 new_validate_state)
+{
+	switch (net->nft.validate_state) {
+	case NFT_VALIDATE_SKIP:
+		WARN_ON_ONCE(new_validate_state == NFT_VALIDATE_DO);
+		break;
+	case NFT_VALIDATE_NEED:
+		break;
+	case NFT_VALIDATE_DO:
+		if (new_validate_state == NFT_VALIDATE_NEED)
+			return;
+	}
+
+	net->nft.validate_state = new_validate_state;
+}
+
 static void nft_ctx_init(struct nft_ctx *ctx,
 			 struct net *net,
 			 const struct sk_buff *skb,
@@ -1921,19 +1943,7 @@ static int nf_tables_newexpr(const struct nft_ctx *ctx,
 			goto err1;
 	}
 
-	if (ops->validate) {
-		const struct nft_data *data = NULL;
-
-		err = ops->validate(ctx, expr, &data);
-		if (err < 0)
-			goto err2;
-	}
-
 	return 0;
-
-err2:
-	if (ops->destroy)
-		ops->destroy(ctx, expr);
 err1:
 	expr->ops = NULL;
 	return err;
@@ -2299,6 +2309,53 @@ static void nf_tables_rule_release(const struct nft_ctx *ctx,
 	nf_tables_rule_destroy(ctx, rule);
 }
 
+int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
+{
+	struct nft_expr *expr, *last;
+	const struct nft_data *data;
+	struct nft_rule *rule;
+	int err;
+
+	list_for_each_entry(rule, &chain->rules, list) {
+		if (!nft_is_active_next(ctx->net, rule))
+			continue;
+
+		nft_rule_for_each_expr(expr, last, rule) {
+			if (!expr->ops->validate)
+				continue;
+
+			err = expr->ops->validate(ctx, expr, &data);
+			if (err < 0)
+				return err;
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nft_chain_validate);
+
+static int nft_table_validate(struct net *net, const struct nft_table *table)
+{
+	struct nft_chain *chain;
+	struct nft_ctx ctx = {
+		.net	= net,
+		.family	= table->family,
+	};
+	int err;
+
+	list_for_each_entry(chain, &table->chains, list) {
+		if (!nft_is_base_chain(chain))
+			continue;
+
+		ctx.chain = chain;
+		err = nft_chain_validate(&ctx, chain);
+		if (err < 0)
+			return err;
+	}
+
+	return 0;
+}
+
 #define NFT_RULE_MAXEXPRS	128
 
 static struct nft_expr_info *info;
@@ -2426,6 +2483,10 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 		err = nf_tables_newexpr(&ctx, &info[i], expr);
 		if (err < 0)
 			goto err2;
+
+		if (info[i].ops->validate)
+			nft_validate_state_update(net, NFT_VALIDATE_NEED);
+
 		info[i].ops = NULL;
 		expr = nft_expr_next(expr);
 	}
@@ -2469,8 +2530,11 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 		}
 	}
 	chain->use++;
-	return 0;
 
+	if (net->nft.validate_state == NFT_VALIDATE_DO)
+		return nft_table_validate(net, table);
+
+	return 0;
 err2:
 	nf_tables_rule_release(&ctx, rule);
 err1:
@@ -4112,6 +4176,12 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 							  d2.type, d2.len);
 			if (err < 0)
 				goto err3;
+
+			if (d2.type == NFT_DATA_VERDICT &&
+			    (data.verdict.code == NFT_GOTO ||
+			     data.verdict.code == NFT_JUMP))
+				nft_validate_state_update(ctx->net,
+							  NFT_VALIDATE_NEED);
 		}
 
 		nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, d2.len);
@@ -4211,7 +4281,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
 	const struct nlattr *attr;
 	struct nft_set *set;
 	struct nft_ctx ctx;
-	int rem, err = 0;
+	int rem, err;
 
 	if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
 		return -EINVAL;
@@ -4232,9 +4302,13 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
 	nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
 		err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags);
 		if (err < 0)
-			break;
+			return err;
 	}
-	return err;
+
+	if (net->nft.validate_state == NFT_VALIDATE_DO)
+		return nft_table_validate(net, ctx.table);
+
+	return 0;
 }
 
 /**
@@ -5867,6 +5941,27 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 	},
 };
 
+static int nf_tables_validate(struct net *net)
+{
+	struct nft_table *table;
+
+	switch (net->nft.validate_state) {
+	case NFT_VALIDATE_SKIP:
+		break;
+	case NFT_VALIDATE_NEED:
+		nft_validate_state_update(net, NFT_VALIDATE_DO);
+		/* fall through */
+	case NFT_VALIDATE_DO:
+		list_for_each_entry(table, &net->nft.tables, list) {
+			if (nft_table_validate(net, table) < 0)
+				return -EAGAIN;
+		}
+		break;
+	}
+
+	return 0;
+}
+
 static void nft_chain_commit_update(struct nft_trans *trans)
 {
 	struct nft_base_chain *basechain;
@@ -6055,6 +6150,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 	struct nft_chain *chain;
 	struct nft_table *table;
 
+	/* 0. Validate ruleset, otherwise roll back for error reporting. */
+	if (nf_tables_validate(net) < 0)
+		return -EAGAIN;
+
 	/* 1.  Allocate space for next generation rules_gen_X[] */
 	list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
 		int ret;
@@ -6349,6 +6448,11 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
 	return 0;
 }
 
+static void nf_tables_cleanup(struct net *net)
+{
+	nft_validate_state_update(net, NFT_VALIDATE_SKIP);
+}
+
 static bool nf_tables_valid_genid(struct net *net, u32 genid)
 {
 	return net->nft.base_seq == genid;
@@ -6361,6 +6465,7 @@ static const struct nfnetlink_subsystem nf_tables_subsys = {
 	.cb		= nf_tables_cb,
 	.commit		= nf_tables_commit,
 	.abort		= nf_tables_abort,
+	.cleanup	= nf_tables_cleanup,
 	.valid_genid	= nf_tables_valid_genid,
 };
 
@@ -6444,19 +6549,18 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
 
 	list_for_each_entry(rule, &chain->rules, list) {
 		nft_rule_for_each_expr(expr, last, rule) {
-			const struct nft_data *data = NULL;
+			struct nft_immediate_expr *priv;
+			const struct nft_data *data;
 			int err;
 
-			if (!expr->ops->validate)
+			if (strcmp(expr->ops->type->name, "immediate"))
 				continue;
 
-			err = expr->ops->validate(ctx, expr, &data);
-			if (err < 0)
-				return err;
-
-			if (data == NULL)
+			priv = nft_expr_priv(expr);
+			if (priv->dreg != NFT_REG_VERDICT)
 				continue;
 
+			data = &priv->data;
 			switch (data->verdict.code) {
 			case NFT_JUMP:
 			case NFT_GOTO:
@@ -6936,6 +7040,8 @@ static int __net_init nf_tables_init_net(struct net *net)
 	INIT_LIST_HEAD(&net->nft.tables);
 	INIT_LIST_HEAD(&net->nft.commit_list);
 	net->nft.base_seq = 1;
+	net->nft.validate_state = NFT_VALIDATE_SKIP;
+
 	return 0;
 }
 
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 5a1bd23af1a3..261820627711 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -460,6 +460,8 @@ done:
 	} else {
 		ss->abort(net, oskb);
 	}
+	if (ss->cleanup)
+		ss->cleanup(net);
 
 	nfnl_err_deliver(&err_list, oskb);
 	nfnl_unlock(subsys_id);
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index aa87ff8beae8..15adf8ca82c3 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -17,12 +17,6 @@
 #include <net/netfilter/nf_tables_core.h>
 #include <net/netfilter/nf_tables.h>
 
-struct nft_immediate_expr {
-	struct nft_data		data;
-	enum nft_registers	dreg:8;
-	u8			dlen;
-};
-
 static void nft_immediate_eval(const struct nft_expr *expr,
 			       struct nft_regs *regs,
 			       const struct nft_pktinfo *pkt)
@@ -101,12 +95,27 @@ nla_put_failure:
 
 static int nft_immediate_validate(const struct nft_ctx *ctx,
 				  const struct nft_expr *expr,
-				  const struct nft_data **data)
+				  const struct nft_data **d)
 {
 	const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+	const struct nft_data *data;
+	int err;
 
-	if (priv->dreg == NFT_REG_VERDICT)
-		*data = &priv->data;
+	if (priv->dreg != NFT_REG_VERDICT)
+		return 0;
+
+	data = &priv->data;
+
+	switch (data->verdict.code) {
+	case NFT_JUMP:
+	case NFT_GOTO:
+		err = nft_chain_validate(ctx, data->verdict.chain);
+		if (err < 0)
+			return err;
+		break;
+	default:
+		break;
+	}
 
 	return 0;
 }
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index f52da5e2199f..42e6fadf1417 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -149,6 +149,52 @@ nla_put_failure:
 	return -1;
 }
 
+static int nft_lookup_validate_setelem(const struct nft_ctx *ctx,
+				       struct nft_set *set,
+				       const struct nft_set_iter *iter,
+				       struct nft_set_elem *elem)
+{
+	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+	const struct nft_data *data;
+
+	if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
+	    *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
+		return 0;
+
+	data = nft_set_ext_data(ext);
+	switch (data->verdict.code) {
+	case NFT_JUMP:
+	case NFT_GOTO:
+		return nft_chain_validate(ctx, data->verdict.chain);
+	default:
+		return 0;
+	}
+}
+
+static int nft_lookup_validate(const struct nft_ctx *ctx,
+			       const struct nft_expr *expr,
+			       const struct nft_data **d)
+{
+	const struct nft_lookup *priv = nft_expr_priv(expr);
+	struct nft_set_iter iter;
+
+	if (!(priv->set->flags & NFT_SET_MAP) ||
+	    priv->set->dtype != NFT_DATA_VERDICT)
+		return 0;
+
+	iter.genmask	= nft_genmask_next(ctx->net);
+	iter.skip	= 0;
+	iter.count	= 0;
+	iter.err	= 0;
+	iter.fn		= nft_lookup_validate_setelem;
+
+	priv->set->ops->walk(ctx, priv->set, &iter);
+	if (iter.err < 0)
+		return iter.err;
+
+	return 0;
+}
+
 static const struct nft_expr_ops nft_lookup_ops = {
 	.type		= &nft_lookup_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_lookup)),
@@ -156,6 +202,7 @@ static const struct nft_expr_ops nft_lookup_ops = {
 	.init		= nft_lookup_init,
 	.destroy	= nft_lookup_destroy,
 	.dump		= nft_lookup_dump,
+	.validate	= nft_lookup_validate,
 };
 
 struct nft_expr_type nft_lookup_type __read_mostly = {
-- 
cgit v1.2.3-59-g8ed1b


From 2a79fd3908acd88e6cb0e620c314d7b1fee56a02 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Wed, 30 May 2018 20:43:15 +0200
Subject: netfilter: nf_flow_table: attach dst to skbs

Some drivers, such as vxlan and wireguard, use the skb's dst in order to
determine things like PMTU. They therefore loose functionality when flow
offloading is enabled. So, we ensure the skb has it before xmit'ing it
in the offloading path.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_flow_table_ip.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 82451b7e0acb..15ed91309992 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -220,7 +220,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 	enum flow_offload_tuple_dir dir;
 	struct flow_offload *flow;
 	struct net_device *outdev;
-	const struct rtable *rt;
+	struct rtable *rt;
 	unsigned int thoff;
 	struct iphdr *iph;
 	__be32 nexthop;
@@ -241,7 +241,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 
 	dir = tuplehash->tuple.dir;
 	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
-	rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
+	rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
 
 	if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)) &&
 	    (ip_hdr(skb)->frag_off & htons(IP_DF)) != 0)
@@ -264,6 +264,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 
 	skb->dev = outdev;
 	nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
+	skb_dst_set_noref(skb, &rt->dst);
 	neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
 
 	return NF_STOLEN;
@@ -480,6 +481,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 
 	skb->dev = outdev;
 	nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
+	skb_dst_set_noref(skb, &rt->dst);
 	neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
 
 	return NF_STOLEN;
-- 
cgit v1.2.3-59-g8ed1b


From 7b7744e2aa93864b2a490fb3533e9417d21cadc0 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 30 May 2018 12:17:56 -0700
Subject: netfilter: nfnetlink: Remove VLA usage

In the quest to remove all stack VLA usage from the kernel[1], this
allocates the maximum size expected for all possible attrs and adds
sanity-checks at both registration and usage to make sure nothing
gets out of sync.

[1] https://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink.c | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 261820627711..4d0da7042aff 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -38,6 +38,8 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
 	rcu_dereference_protected(table[(id)].subsys, \
 				  lockdep_nfnl_is_held((id)))
 
+#define NFNL_MAX_ATTR_COUNT	32
+
 static struct {
 	struct mutex				mutex;
 	const struct nfnetlink_subsystem __rcu	*subsys;
@@ -77,6 +79,13 @@ EXPORT_SYMBOL_GPL(lockdep_nfnl_is_held);
 
 int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n)
 {
+	u8 cb_id;
+
+	/* Sanity-check attr_count size to avoid stack buffer overflow. */
+	for (cb_id = 0; cb_id < n->cb_count; cb_id++)
+		if (WARN_ON(n->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT))
+			return -EINVAL;
+
 	nfnl_lock(n->subsys_id);
 	if (table[n->subsys_id].subsys) {
 		nfnl_unlock(n->subsys_id);
@@ -186,11 +195,17 @@ replay:
 	{
 		int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
 		u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
-		struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
+		struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1];
 		struct nlattr *attr = (void *)nlh + min_len;
 		int attrlen = nlh->nlmsg_len - min_len;
 		__u8 subsys_id = NFNL_SUBSYS_ID(type);
 
+		/* Sanity-check NFNL_MAX_ATTR_COUNT */
+		if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) {
+			rcu_read_unlock();
+			return -ENOMEM;
+		}
+
 		err = nla_parse(cda, ss->cb[cb_id].attr_count, attr, attrlen,
 				ss->cb[cb_id].policy, extack);
 		if (err < 0) {
@@ -387,10 +402,16 @@ replay:
 		{
 			int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
 			u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
-			struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
+			struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1];
 			struct nlattr *attr = (void *)nlh + min_len;
 			int attrlen = nlh->nlmsg_len - min_len;
 
+			/* Sanity-check NFTA_MAX_ATTR */
+			if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) {
+				err = -ENOMEM;
+				goto ack;
+			}
+
 			err = nla_parse(cda, ss->cb[cb_id].attr_count, attr,
 					attrlen, ss->cb[cb_id].policy, NULL);
 			if (err < 0)
-- 
cgit v1.2.3-59-g8ed1b


From d32de98ea70fe7cf606f3809f0970b31c115764b Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 31 May 2018 01:58:00 +0200
Subject: netfilter: nft_fwd_netdev: allow to forward packets via neighbour
 layer

This allows us to forward packets from the netdev family via neighbour
layer, so you don't need an explicit link-layer destination when using
this expression from rules. The ttl/hop_limit field is decremented.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |   4 +
 net/netfilter/nft_fwd_netdev.c           | 146 ++++++++++++++++++++++++++++++-
 2 files changed, 149 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 5c7eb9b9f6d6..a089af092a29 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1260,10 +1260,14 @@ enum nft_dup_attributes {
  * enum nft_fwd_attributes - nf_tables fwd expression netlink attributes
  *
  * @NFTA_FWD_SREG_DEV: source register of output interface (NLA_U32: nft_register)
+ * @NFTA_FWD_SREG_ADDR: source register of destination address (NLA_U32: nft_register)
+ * @NFTA_FWD_NFPROTO: layer 3 family of source register address (NLA_U32: enum nfproto)
  */
 enum nft_fwd_attributes {
 	NFTA_FWD_UNSPEC,
 	NFTA_FWD_SREG_DEV,
+	NFTA_FWD_SREG_ADDR,
+	NFTA_FWD_NFPROTO,
 	__NFTA_FWD_MAX
 };
 #define NFTA_FWD_MAX	(__NFTA_FWD_MAX - 1)
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index ce13a50b9189..8abb9891cdf2 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -12,8 +12,12 @@
 #include <linux/netlink.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter/nf_tables.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
 #include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nf_dup_netdev.h>
+#include <net/neighbour.h>
+#include <net/ip.h>
 
 struct nft_fwd_netdev {
 	enum nft_registers	sreg_dev:8;
@@ -32,6 +36,8 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr,
 
 static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = {
 	[NFTA_FWD_SREG_DEV]	= { .type = NLA_U32 },
+	[NFTA_FWD_SREG_ADDR]	= { .type = NLA_U32 },
+	[NFTA_FWD_NFPROTO]	= { .type = NLA_U32 },
 };
 
 static int nft_fwd_netdev_init(const struct nft_ctx *ctx,
@@ -62,7 +68,133 @@ nla_put_failure:
 	return -1;
 }
 
+struct nft_fwd_neigh {
+	enum nft_registers	sreg_dev:8;
+	enum nft_registers	sreg_addr:8;
+	u8			nfproto;
+};
+
+static void nft_fwd_neigh_eval(const struct nft_expr *expr,
+			      struct nft_regs *regs,
+			      const struct nft_pktinfo *pkt)
+{
+	struct nft_fwd_neigh *priv = nft_expr_priv(expr);
+	void *addr = &regs->data[priv->sreg_addr];
+	int oif = regs->data[priv->sreg_dev];
+	unsigned int verdict = NF_STOLEN;
+	struct sk_buff *skb = pkt->skb;
+	struct net_device *dev;
+	int neigh_table;
+
+	switch (priv->nfproto) {
+	case NFPROTO_IPV4: {
+		struct iphdr *iph;
+
+		if (skb->protocol != htons(ETH_P_IP)) {
+			verdict = NFT_BREAK;
+			goto out;
+		}
+		if (skb_try_make_writable(skb, sizeof(*iph))) {
+			verdict = NF_DROP;
+			goto out;
+		}
+		iph = ip_hdr(skb);
+		ip_decrease_ttl(iph);
+		neigh_table = NEIGH_ARP_TABLE;
+		break;
+		}
+	case NFPROTO_IPV6: {
+		struct ipv6hdr *ip6h;
+
+		if (skb->protocol != htons(ETH_P_IPV6)) {
+			verdict = NFT_BREAK;
+			goto out;
+		}
+		if (skb_try_make_writable(skb, sizeof(*ip6h))) {
+			verdict = NF_DROP;
+			goto out;
+		}
+		ip6h = ipv6_hdr(skb);
+		ip6h->hop_limit--;
+		neigh_table = NEIGH_ND_TABLE;
+		break;
+		}
+	default:
+		verdict = NFT_BREAK;
+		goto out;
+	}
+
+	dev = dev_get_by_index_rcu(nft_net(pkt), oif);
+	if (dev == NULL)
+		return;
+
+	skb->dev = dev;
+	neigh_xmit(neigh_table, dev, addr, skb);
+out:
+	regs->verdict.code = verdict;
+}
+
+static int nft_fwd_neigh_init(const struct nft_ctx *ctx,
+			      const struct nft_expr *expr,
+			      const struct nlattr * const tb[])
+{
+	struct nft_fwd_neigh *priv = nft_expr_priv(expr);
+	unsigned int addr_len;
+	int err;
+
+	if (!tb[NFTA_FWD_SREG_DEV] ||
+	    !tb[NFTA_FWD_SREG_ADDR] ||
+	    !tb[NFTA_FWD_NFPROTO])
+		return -EINVAL;
+
+	priv->sreg_dev = nft_parse_register(tb[NFTA_FWD_SREG_DEV]);
+	priv->sreg_addr = nft_parse_register(tb[NFTA_FWD_SREG_ADDR]);
+	priv->nfproto = ntohl(nla_get_be32(tb[NFTA_FWD_NFPROTO]));
+
+	switch (priv->nfproto) {
+	case NFPROTO_IPV4:
+		addr_len = sizeof(struct in_addr);
+		break;
+	case NFPROTO_IPV6:
+		addr_len = sizeof(struct in6_addr);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	err = nft_validate_register_load(priv->sreg_dev, sizeof(int));
+	if (err < 0)
+		return err;
+
+	return nft_validate_register_load(priv->sreg_addr, addr_len);
+}
+
+static const struct nft_expr_ops nft_fwd_netdev_ingress_ops;
+
+static int nft_fwd_neigh_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	struct nft_fwd_neigh *priv = nft_expr_priv(expr);
+
+	if (nft_dump_register(skb, NFTA_FWD_SREG_DEV, priv->sreg_dev) ||
+	    nft_dump_register(skb, NFTA_FWD_SREG_ADDR, priv->sreg_addr) ||
+	    nla_put_be32(skb, NFTA_FWD_NFPROTO, htonl(priv->nfproto)))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
 static struct nft_expr_type nft_fwd_netdev_type;
+static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = {
+	.type		= &nft_fwd_netdev_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_fwd_neigh)),
+	.eval		= nft_fwd_neigh_eval,
+	.init		= nft_fwd_neigh_init,
+	.dump		= nft_fwd_neigh_dump,
+};
+
 static const struct nft_expr_ops nft_fwd_netdev_ops = {
 	.type		= &nft_fwd_netdev_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_fwd_netdev)),
@@ -71,10 +203,22 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = {
 	.dump		= nft_fwd_netdev_dump,
 };
 
+static const struct nft_expr_ops *
+nft_fwd_select_ops(const struct nft_ctx *ctx,
+		   const struct nlattr * const tb[])
+{
+	if (tb[NFTA_FWD_SREG_ADDR])
+		return &nft_fwd_neigh_netdev_ops;
+	if (tb[NFTA_FWD_SREG_DEV])
+		return &nft_fwd_netdev_ops;
+
+        return ERR_PTR(-EOPNOTSUPP);
+}
+
 static struct nft_expr_type nft_fwd_netdev_type __read_mostly = {
 	.family		= NFPROTO_NETDEV,
 	.name		= "fwd",
-	.ops		= &nft_fwd_netdev_ops,
+	.select_ops	= nft_fwd_select_ops,
 	.policy		= nft_fwd_netdev_policy,
 	.maxattr	= NFTA_FWD_MAX,
 	.owner		= THIS_MODULE,
-- 
cgit v1.2.3-59-g8ed1b


From 0cfceb9ff9ad84877f13e8cdf5a8b971d7d34dd3 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 25 May 2018 22:06:24 +0300
Subject: ipvs: add full ipv6 support to nfct

Prepare NFCT to support IPv6 for FTP:

- Do not restrict the expectation callback to PF_INET

- Split the debug messages, so that the 160-byte limitation
in IP_VS_DBG_BUF is not exceeded when printing many IPv6
addresses. This means no more than 3 addresses in one message,
i.e. 1 tuple with 2 addresses or 1 connection with 3 addresses.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipvs/ip_vs_nfct.c | 101 +++++++++++++++++++---------------------
 1 file changed, 49 insertions(+), 52 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
index 6cf3fd81a5ec..eb8b9c883889 100644
--- a/net/netfilter/ipvs/ip_vs_nfct.c
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -67,15 +67,20 @@
 #include <net/netfilter/nf_conntrack_zones.h>
 
 
-#define FMT_TUPLE	"%pI4:%u->%pI4:%u/%u"
-#define ARG_TUPLE(T)	&(T)->src.u3.ip, ntohs((T)->src.u.all), \
-			&(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
+#define FMT_TUPLE	"%s:%u->%s:%u/%u"
+#define ARG_TUPLE(T)	IP_VS_DBG_ADDR((T)->src.l3num, &(T)->src.u3),	\
+			ntohs((T)->src.u.all),				\
+			IP_VS_DBG_ADDR((T)->src.l3num, &(T)->dst.u3),	\
+			ntohs((T)->dst.u.all),				\
 			(T)->dst.protonum
 
-#define FMT_CONN	"%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
-#define ARG_CONN(C)	&((C)->caddr.ip), ntohs((C)->cport), \
-			&((C)->vaddr.ip), ntohs((C)->vport), \
-			&((C)->daddr.ip), ntohs((C)->dport), \
+#define FMT_CONN	"%s:%u->%s:%u->%s:%u/%u:%u"
+#define ARG_CONN(C)	IP_VS_DBG_ADDR((C)->af, &((C)->caddr)),		\
+			ntohs((C)->cport),				\
+			IP_VS_DBG_ADDR((C)->af, &((C)->vaddr)),		\
+			ntohs((C)->vport),				\
+			IP_VS_DBG_ADDR((C)->daf, &((C)->daddr)),	\
+			ntohs((C)->dport),				\
 			(C)->protocol, (C)->state
 
 void
@@ -127,13 +132,17 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
 		    new_tuple.dst.protonum != IPPROTO_ICMPV6)
 			new_tuple.dst.u.tcp.port = cp->vport;
 	}
-	IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
-		  "ctinfo=%d, old reply=" FMT_TUPLE
-		  ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n",
-		  __func__, ct, ct->status, ctinfo,
-		  ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple),
-		  ARG_TUPLE(&new_tuple), ARG_CONN(cp));
+	IP_VS_DBG_BUF(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
+		      "ctinfo=%d, old reply=" FMT_TUPLE "\n",
+		      __func__, ct, ct->status, ctinfo,
+		      ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple));
+	IP_VS_DBG_BUF(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
+		      "ctinfo=%d, new reply=" FMT_TUPLE "\n",
+		      __func__, ct, ct->status, ctinfo,
+		      ARG_TUPLE(&new_tuple));
 	nf_conntrack_alter_reply(ct, &new_tuple);
+	IP_VS_DBG_BUF(7, "%s: Updated conntrack ct=%p for cp=" FMT_CONN "\n",
+		      __func__, ct, ARG_CONN(cp));
 }
 
 int ip_vs_confirm_conntrack(struct sk_buff *skb)
@@ -152,9 +161,6 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
 	struct ip_vs_conn_param p;
 	struct net *net = nf_ct_net(ct);
 
-	if (exp->tuple.src.l3num != PF_INET)
-		return;
-
 	/*
 	 * We assume that no NF locks are held before this callback.
 	 * ip_vs_conn_out_get and ip_vs_conn_in_get should match their
@@ -171,19 +177,15 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
 	cp = ip_vs_conn_out_get(&p);
 	if (cp) {
 		/* Change reply CLIENT->RS to CLIENT->VS */
+		IP_VS_DBG_BUF(7, "%s: for ct=%p, status=0x%lX found inout cp="
+			      FMT_CONN "\n",
+			      __func__, ct, ct->status, ARG_CONN(cp));
 		new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
-		IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
-			  FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
-			  __func__, ct, ct->status,
-			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-			  ARG_CONN(cp));
+		IP_VS_DBG_BUF(7, "%s: ct=%p before alter: reply tuple="
+			      FMT_TUPLE "\n",
+			      __func__, ct, ARG_TUPLE(&new_reply));
 		new_reply.dst.u3 = cp->vaddr;
 		new_reply.dst.u.tcp.port = cp->vport;
-		IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
-			  ", inout cp=" FMT_CONN "\n",
-			  __func__, ct,
-			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-			  ARG_CONN(cp));
 		goto alter;
 	}
 
@@ -191,25 +193,21 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
 	cp = ip_vs_conn_in_get(&p);
 	if (cp) {
 		/* Change reply VS->CLIENT to RS->CLIENT */
+		IP_VS_DBG_BUF(7, "%s: for ct=%p, status=0x%lX found outin cp="
+			      FMT_CONN "\n",
+			      __func__, ct, ct->status, ARG_CONN(cp));
 		new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
-		IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
-			  FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
-			  __func__, ct, ct->status,
-			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-			  ARG_CONN(cp));
+		IP_VS_DBG_BUF(7, "%s: ct=%p before alter: reply tuple="
+			      FMT_TUPLE "\n",
+			      __func__, ct, ARG_TUPLE(&new_reply));
 		new_reply.src.u3 = cp->daddr;
 		new_reply.src.u.tcp.port = cp->dport;
-		IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", "
-			  FMT_TUPLE ", outin cp=" FMT_CONN "\n",
-			  __func__, ct,
-			  ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-			  ARG_CONN(cp));
 		goto alter;
 	}
 
-	IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE
-		  " - unknown expect\n",
-		  __func__, ct, ct->status, ARG_TUPLE(orig));
+	IP_VS_DBG_BUF(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE
+		      " - unknown expect\n",
+		      __func__, ct, ct->status, ARG_TUPLE(orig));
 	return;
 
 alter:
@@ -247,8 +245,8 @@ void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct,
 
 	exp->expectfn = ip_vs_nfct_expect_callback;
 
-	IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n",
-		__func__, ct, ARG_TUPLE(&exp->tuple));
+	IP_VS_DBG_BUF(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n",
+		      __func__, ct, ARG_TUPLE(&exp->tuple));
 	nf_ct_expect_related(exp);
 	nf_ct_expect_put(exp);
 }
@@ -274,26 +272,25 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
 	tuple.dst.u3 = cp->vaddr;
 	tuple.dst.u.all = cp->vport;
 
-	IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE
-		" for conn " FMT_CONN "\n",
-		__func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
+	IP_VS_DBG_BUF(7, "%s: dropping conntrack for conn " FMT_CONN "\n",
+		      __func__, ARG_CONN(cp));
 
 	h = nf_conntrack_find_get(cp->ipvs->net, &nf_ct_zone_dflt, &tuple);
 	if (h) {
 		ct = nf_ct_tuplehash_to_ctrack(h);
 		if (nf_ct_kill(ct)) {
-			IP_VS_DBG(7, "%s: ct=%p, deleted conntrack for tuple="
-				FMT_TUPLE "\n",
-				__func__, ct, ARG_TUPLE(&tuple));
+			IP_VS_DBG_BUF(7, "%s: ct=%p deleted for tuple="
+				      FMT_TUPLE "\n",
+				      __func__, ct, ARG_TUPLE(&tuple));
 		} else {
-			IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple="
-				FMT_TUPLE "\n",
-				__func__, ct, ARG_TUPLE(&tuple));
+			IP_VS_DBG_BUF(7, "%s: ct=%p, no conntrack for tuple="
+				      FMT_TUPLE "\n",
+				      __func__, ct, ARG_TUPLE(&tuple));
 		}
 		nf_ct_put(ct);
 	} else {
-		IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n",
-			__func__, ARG_TUPLE(&tuple));
+		IP_VS_DBG_BUF(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n",
+			      __func__, ARG_TUPLE(&tuple));
 	}
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From d12e12299a6915fc10131602cca41170e46ae755 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 25 May 2018 22:06:25 +0300
Subject: ipvs: add ipv6 support to ftp

Add support for FTP commands with extended format (RFC 2428):

- FTP EPRT: IPv4 and IPv6, active mode, similar to PORT
- FTP EPSV: IPv4 and IPv6, passive mode, similar to PASV.
EPSV response usually contains only port but we allow real
server to provide different address

We restrict control and data connection to be from same
address family.

Allow the "(" and ")" to be optional in PASV response.

Also, add ipvsh argument to the pkt_in/pkt_out handlers to better
access the payload after transport header.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/ip_vs.h                   |  10 +-
 net/netfilter/ipvs/ip_vs_app.c        |  24 +-
 net/netfilter/ipvs/ip_vs_ftp.c        | 467 ++++++++++++++++++++++------------
 net/netfilter/ipvs/ip_vs_proto_sctp.c |   4 +-
 net/netfilter/ipvs/ip_vs_proto_tcp.c  |   4 +-
 net/netfilter/ipvs/ip_vs_proto_udp.c  |   4 +-
 6 files changed, 331 insertions(+), 182 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 0ac795b41ab8..03f567eb9536 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -763,14 +763,14 @@ struct ip_vs_app {
 	 *	   2=Mangled but checksum was not updated
 	 */
 	int (*pkt_out)(struct ip_vs_app *, struct ip_vs_conn *,
-		       struct sk_buff *, int *diff);
+		       struct sk_buff *, int *diff, struct ip_vs_iphdr *ipvsh);
 
 	/* input hook: Process packet in outin direction, diff set for TCP.
 	 * Return: 0=Error, 1=Payload Not Mangled/Mangled but checksum is ok,
 	 *	   2=Mangled but checksum was not updated
 	 */
 	int (*pkt_in)(struct ip_vs_app *, struct ip_vs_conn *,
-		      struct sk_buff *, int *diff);
+		      struct sk_buff *, int *diff, struct ip_vs_iphdr *ipvsh);
 
 	/* ip_vs_app initializer */
 	int (*init_conn)(struct ip_vs_app *, struct ip_vs_conn *);
@@ -1328,8 +1328,10 @@ int register_ip_vs_app_inc(struct netns_ipvs *ipvs, struct ip_vs_app *app, __u16
 int ip_vs_app_inc_get(struct ip_vs_app *inc);
 void ip_vs_app_inc_put(struct ip_vs_app *inc);
 
-int ip_vs_app_pkt_out(struct ip_vs_conn *, struct sk_buff *skb);
-int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff *skb);
+int ip_vs_app_pkt_out(struct ip_vs_conn *, struct sk_buff *skb,
+		      struct ip_vs_iphdr *ipvsh);
+int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff *skb,
+		     struct ip_vs_iphdr *ipvsh);
 
 int register_ip_vs_pe(struct ip_vs_pe *pe);
 int unregister_ip_vs_pe(struct ip_vs_pe *pe);
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index 1c98c907bc63..12d74896556a 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -355,7 +355,8 @@ static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
 }
 
 static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
-				  struct ip_vs_app *app)
+				  struct ip_vs_app *app,
+				  struct ip_vs_iphdr *ipvsh)
 {
 	int diff;
 	const unsigned int tcp_offset = ip_hdrlen(skb);
@@ -386,7 +387,7 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
 	if (app->pkt_out == NULL)
 		return 1;
 
-	if (!app->pkt_out(app, cp, skb, &diff))
+	if (!app->pkt_out(app, cp, skb, &diff, ipvsh))
 		return 0;
 
 	/*
@@ -404,7 +405,8 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
  *	called by ipvs packet handler, assumes previously checked cp!=NULL
  *	returns false if it can't handle packet (oom)
  */
-int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
+int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
+		      struct ip_vs_iphdr *ipvsh)
 {
 	struct ip_vs_app *app;
 
@@ -417,7 +419,7 @@ int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
 
 	/* TCP is complicated */
 	if (cp->protocol == IPPROTO_TCP)
-		return app_tcp_pkt_out(cp, skb, app);
+		return app_tcp_pkt_out(cp, skb, app, ipvsh);
 
 	/*
 	 *	Call private output hook function
@@ -425,12 +427,13 @@ int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
 	if (app->pkt_out == NULL)
 		return 1;
 
-	return app->pkt_out(app, cp, skb, NULL);
+	return app->pkt_out(app, cp, skb, NULL, ipvsh);
 }
 
 
 static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
-				 struct ip_vs_app *app)
+				 struct ip_vs_app *app,
+				 struct ip_vs_iphdr *ipvsh)
 {
 	int diff;
 	const unsigned int tcp_offset = ip_hdrlen(skb);
@@ -461,7 +464,7 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
 	if (app->pkt_in == NULL)
 		return 1;
 
-	if (!app->pkt_in(app, cp, skb, &diff))
+	if (!app->pkt_in(app, cp, skb, &diff, ipvsh))
 		return 0;
 
 	/*
@@ -479,7 +482,8 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
  *	called by ipvs packet handler, assumes previously checked cp!=NULL.
  *	returns false if can't handle packet (oom).
  */
-int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
+int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
+		     struct ip_vs_iphdr *ipvsh)
 {
 	struct ip_vs_app *app;
 
@@ -492,7 +496,7 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
 
 	/* TCP is complicated */
 	if (cp->protocol == IPPROTO_TCP)
-		return app_tcp_pkt_in(cp, skb, app);
+		return app_tcp_pkt_in(cp, skb, app, ipvsh);
 
 	/*
 	 *	Call private input hook function
@@ -500,7 +504,7 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
 	if (app->pkt_in == NULL)
 		return 1;
 
-	return app->pkt_in(app, cp, skb, NULL);
+	return app->pkt_in(app, cp, skb, NULL, ipvsh);
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 58d5d05aec24..4398a72edec5 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -29,6 +29,8 @@
 #include <linux/moduleparam.h>
 #include <linux/kernel.h>
 #include <linux/skbuff.h>
+#include <linux/ctype.h>
+#include <linux/inet.h>
 #include <linux/in.h>
 #include <linux/ip.h>
 #include <linux/netfilter.h>
@@ -44,9 +46,18 @@
 #include <net/ip_vs.h>
 
 
-#define SERVER_STRING "227 "
-#define CLIENT_STRING "PORT"
+#define SERVER_STRING_PASV "227 "
+#define CLIENT_STRING_PORT "PORT"
+#define SERVER_STRING_EPSV "229 "
+#define CLIENT_STRING_EPRT "EPRT"
 
+enum {
+	IP_VS_FTP_ACTIVE = 0,
+	IP_VS_FTP_PORT = 0,
+	IP_VS_FTP_PASV,
+	IP_VS_FTP_EPRT,
+	IP_VS_FTP_EPSV,
+};
 
 /*
  * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
@@ -58,9 +69,15 @@ module_param_array(ports, ushort, &ports_count, 0444);
 MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands");
 
 
-/*	Dummy variable */
-static int ip_vs_ftp_pasv;
+static char *ip_vs_ftp_data_ptr(struct sk_buff *skb, struct ip_vs_iphdr *ipvsh)
+{
+	struct tcphdr *th = (struct tcphdr *)((char *)skb->data + ipvsh->len);
+
+	if ((th->doff << 2) < sizeof(struct tcphdr))
+		return NULL;
 
+	return (char *)th + (th->doff << 2);
+}
 
 static int
 ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
@@ -78,20 +95,20 @@ ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
 }
 
 
-/*
- * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started
- * with the "pattern", ignoring before "skip" and terminated with
- * the "term" character.
- * <addr,port> is in network order.
+/* Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started
+ * with the "pattern". <addr,port> is in network order.
+ * Parse extended format depending on ext. In this case addr can be pre-set.
  */
 static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
 				  const char *pattern, size_t plen,
-				  char skip, char term,
-				  __be32 *addr, __be16 *port,
-				  char **start, char **end)
+				  char skip, bool ext, int mode,
+				  union nf_inet_addr *addr, __be16 *port,
+				  __u16 af, char **start, char **end)
 {
 	char *s, c;
 	unsigned char p[6];
+	char edelim;
+	__u16 hport;
 	int i = 0;
 
 	if (data_limit - data < plen) {
@@ -113,6 +130,11 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
 			if (s == data_limit)
 				return -1;
 			if (!found) {
+				/* "(" is optional for non-extended format,
+				 * so catch the start of IPv4 address
+				 */
+				if (!ext && isdigit(*s))
+					break;
 				if (*s == skip)
 					found = 1;
 			} else if (*s != skip) {
@@ -120,41 +142,102 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
 			}
 		}
 	}
+	/* Old IPv4-only format? */
+	if (!ext) {
+		p[0] = 0;
+		for (data = s; ; data++) {
+			if (data == data_limit)
+				return -1;
+			c = *data;
+			if (isdigit(c)) {
+				p[i] = p[i]*10 + c - '0';
+			} else if (c == ',' && i < 5) {
+				i++;
+				p[i] = 0;
+			} else {
+				/* unexpected character or terminator */
+				break;
+			}
+		}
 
-	for (data = s; ; data++) {
-		if (data == data_limit)
+		if (i != 5)
 			return -1;
-		if (*data == term)
-			break;
+
+		*start = s;
+		*end = data;
+		addr->ip = get_unaligned((__be32 *) p);
+		*port = get_unaligned((__be16 *) (p + 4));
+		return 1;
 	}
-	*end = data;
+	if (s == data_limit)
+		return -1;
+	*start = s;
+	edelim = *s++;
+	if (edelim < 33 || edelim > 126)
+		return -1;
+	if (s == data_limit)
+		return -1;
+	if (*s == edelim) {
+		/* Address family is usually missing for EPSV response */
+		if (mode != IP_VS_FTP_EPSV)
+			return -1;
+		s++;
+		if (s == data_limit)
+			return -1;
+		/* Then address should be missing too */
+		if (*s != edelim)
+			return -1;
+		/* Caller can pre-set addr, if needed */
+		s++;
+	} else {
+		const char *ep;
 
-	memset(p, 0, sizeof(p));
-	for (data = s; ; data++) {
-		c = *data;
-		if (c == term)
-			break;
-		if (c >= '0' && c <= '9') {
-			p[i] = p[i]*10 + c - '0';
-		} else if (c == ',' && i < 5) {
-			i++;
-		} else {
-			/* unexpected character */
+		/* We allow address only from same family */
+		if (af == AF_INET6 && *s != '2')
 			return -1;
+		if (af == AF_INET && *s != '1')
+			return -1;
+		s++;
+		if (s == data_limit)
+			return -1;
+		if (*s != edelim)
+			return -1;
+		s++;
+		if (s == data_limit)
+			return -1;
+		if (af == AF_INET6) {
+			if (in6_pton(s, data_limit - s, (u8 *)addr, edelim,
+				     &ep) <= 0)
+				return -1;
+		} else {
+			if (in4_pton(s, data_limit - s, (u8 *)addr, edelim,
+				     &ep) <= 0)
+				return -1;
 		}
+		s = (char *) ep;
+		if (s == data_limit)
+			return -1;
+		if (*s != edelim)
+			return -1;
+		s++;
 	}
-
-	if (i != 5)
+	for (hport = 0; ; s++)
+	{
+		if (s == data_limit)
+			return -1;
+		if (!isdigit(*s))
+			break;
+		hport = hport * 10 + *s - '0';
+	}
+	if (s == data_limit || !hport || *s != edelim)
 		return -1;
-
-	*start = s;
-	*addr = get_unaligned((__be32 *) p);
-	*port = get_unaligned((__be16 *) (p + 4));
+	s++;
+	*end = s;
+	*port = htons(hport);
 	return 1;
 }
 
-/*
- * Look at outgoing ftp packets to catch the response to a PASV command
+/* Look at outgoing ftp packets to catch the response to a PASV/EPSV command
  * from the server (inside-to-outside).
  * When we see one, we build a connection entry with the client address,
  * client port 0 (unknown at the moment), the server address and the
@@ -165,12 +248,13 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
  * The outgoing packet should be something like
  *   "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
  * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
+ * The extended format for EPSV response provides usually only port:
+ *   "229 Entering Extended Passive Mode (|||ppp|)"
  */
 static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
-			 struct sk_buff *skb, int *diff)
+			 struct sk_buff *skb, int *diff,
+			 struct ip_vs_iphdr *ipvsh)
 {
-	struct iphdr *iph;
-	struct tcphdr *th;
 	char *data, *data_limit;
 	char *start, *end;
 	union nf_inet_addr from;
@@ -184,14 +268,6 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 
 	*diff = 0;
 
-#ifdef CONFIG_IP_VS_IPV6
-	/* This application helper doesn't work with IPv6 yet,
-	 * so turn this into a no-op for IPv6 packets
-	 */
-	if (cp->af == AF_INET6)
-		return 1;
-#endif
-
 	/* Only useful for established sessions */
 	if (cp->state != IP_VS_TCP_S_ESTABLISHED)
 		return 1;
@@ -200,53 +276,77 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	if (!skb_make_writable(skb, skb->len))
 		return 0;
 
-	if (cp->app_data == &ip_vs_ftp_pasv) {
-		iph = ip_hdr(skb);
-		th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
-		data = (char *)th + (th->doff << 2);
+	if (cp->app_data == (void *) IP_VS_FTP_PASV) {
+		data = ip_vs_ftp_data_ptr(skb, ipvsh);
 		data_limit = skb_tail_pointer(skb);
 
+		if (!data || data >= data_limit)
+			return 1;
+
 		if (ip_vs_ftp_get_addrport(data, data_limit,
-					   SERVER_STRING,
-					   sizeof(SERVER_STRING)-1,
-					   '(', ')',
-					   &from.ip, &port,
+					   SERVER_STRING_PASV,
+					   sizeof(SERVER_STRING_PASV)-1,
+					   '(', false, IP_VS_FTP_PASV,
+					   &from, &port, cp->af,
 					   &start, &end) != 1)
 			return 1;
 
-		IP_VS_DBG(7, "PASV response (%pI4:%d) -> %pI4:%d detected\n",
+		IP_VS_DBG(7, "PASV response (%pI4:%u) -> %pI4:%u detected\n",
 			  &from.ip, ntohs(port), &cp->caddr.ip, 0);
+	} else if (cp->app_data == (void *) IP_VS_FTP_EPSV) {
+		data = ip_vs_ftp_data_ptr(skb, ipvsh);
+		data_limit = skb_tail_pointer(skb);
 
-		/*
-		 * Now update or create an connection entry for it
+		if (!data || data >= data_limit)
+			return 1;
+
+		/* Usually, data address is not specified but
+		 * we support different address, so pre-set it.
 		 */
-		{
-			struct ip_vs_conn_param p;
-			ip_vs_conn_fill_param(cp->ipvs, AF_INET,
-					      iph->protocol, &from, port,
-					      &cp->caddr, 0, &p);
-			n_cp = ip_vs_conn_out_get(&p);
-		}
-		if (!n_cp) {
-			struct ip_vs_conn_param p;
-			ip_vs_conn_fill_param(cp->ipvs,
-					      AF_INET, IPPROTO_TCP, &cp->caddr,
-					      0, &cp->vaddr, port, &p);
-			/* As above, this is ipv4 only */
-			n_cp = ip_vs_conn_new(&p, AF_INET, &from, port,
-					      IP_VS_CONN_F_NO_CPORT |
-					      IP_VS_CONN_F_NFCT,
-					      cp->dest, skb->mark);
-			if (!n_cp)
-				return 0;
+		from = cp->daddr;
+		if (ip_vs_ftp_get_addrport(data, data_limit,
+					   SERVER_STRING_EPSV,
+					   sizeof(SERVER_STRING_EPSV)-1,
+					   '(', true, IP_VS_FTP_EPSV,
+					   &from, &port, cp->af,
+					   &start, &end) != 1)
+			return 1;
 
-			/* add its controller */
-			ip_vs_control_add(n_cp, cp);
-		}
+		IP_VS_DBG_BUF(7, "EPSV response (%s:%u) -> %s:%u detected\n",
+			      IP_VS_DBG_ADDR(cp->af, &from), ntohs(port),
+			      IP_VS_DBG_ADDR(cp->af, &cp->caddr), 0);
+	} else {
+		return 1;
+	}
 
-		/*
-		 * Replace the old passive address with the new one
-		 */
+	/* Now update or create a connection entry for it */
+	{
+		struct ip_vs_conn_param p;
+
+		ip_vs_conn_fill_param(cp->ipvs, cp->af,
+				      ipvsh->protocol, &from, port,
+				      &cp->caddr, 0, &p);
+		n_cp = ip_vs_conn_out_get(&p);
+	}
+	if (!n_cp) {
+		struct ip_vs_conn_param p;
+
+		ip_vs_conn_fill_param(cp->ipvs,
+				      cp->af, ipvsh->protocol, &cp->caddr,
+				      0, &cp->vaddr, port, &p);
+		n_cp = ip_vs_conn_new(&p, cp->af, &from, port,
+				      IP_VS_CONN_F_NO_CPORT |
+				      IP_VS_CONN_F_NFCT,
+				      cp->dest, skb->mark);
+		if (!n_cp)
+			return 0;
+
+		/* add its controller */
+		ip_vs_control_add(n_cp, cp);
+	}
+
+	/* Replace the old passive address with the new one */
+	if (cp->app_data == (void *) IP_VS_FTP_PASV) {
 		from.ip = n_cp->vaddr.ip;
 		port = n_cp->vport;
 		snprintf(buf, sizeof(buf), "%u,%u,%u,%u,%u,%u",
@@ -256,50 +356,54 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 			 ((unsigned char *)&from.ip)[3],
 			 ntohs(port) >> 8,
 			 ntohs(port) & 0xFF);
+	} else if (cp->app_data == (void *) IP_VS_FTP_EPSV) {
+		from = n_cp->vaddr;
+		port = n_cp->vport;
+		/* Only port, client will use VIP for the data connection */
+		snprintf(buf, sizeof(buf), "|||%u|",
+			 ntohs(port));
+	} else {
+		*buf = 0;
+	}
+	buf_len = strlen(buf);
 
-		buf_len = strlen(buf);
-
-		ct = nf_ct_get(skb, &ctinfo);
-		if (ct) {
-			bool mangled;
-
-			/* If mangling fails this function will return 0
-			 * which will cause the packet to be dropped.
-			 * Mangling can only fail under memory pressure,
-			 * hopefully it will succeed on the retransmitted
-			 * packet.
-			 */
-			mangled = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
-							   iph->ihl * 4,
-							   start - data,
-							   end - start,
-							   buf, buf_len);
-			if (mangled) {
-				ip_vs_nfct_expect_related(skb, ct, n_cp,
-							  IPPROTO_TCP, 0, 0);
-				if (skb->ip_summed == CHECKSUM_COMPLETE)
-					skb->ip_summed = CHECKSUM_UNNECESSARY;
-				/* csum is updated */
-				ret = 1;
-			}
-		}
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct) {
+		bool mangled;
 
-		/*
-		 * Not setting 'diff' is intentional, otherwise the sequence
-		 * would be adjusted twice.
+		/* If mangling fails this function will return 0
+		 * which will cause the packet to be dropped.
+		 * Mangling can only fail under memory pressure,
+		 * hopefully it will succeed on the retransmitted
+		 * packet.
 		 */
-
-		cp->app_data = NULL;
-		ip_vs_tcp_conn_listen(n_cp);
-		ip_vs_conn_put(n_cp);
-		return ret;
+		mangled = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+						   ipvsh->len,
+						   start - data,
+						   end - start,
+						   buf, buf_len);
+		if (mangled) {
+			ip_vs_nfct_expect_related(skb, ct, n_cp,
+						  ipvsh->protocol, 0, 0);
+			if (skb->ip_summed == CHECKSUM_COMPLETE)
+				skb->ip_summed = CHECKSUM_UNNECESSARY;
+			/* csum is updated */
+			ret = 1;
+		}
 	}
-	return 1;
+
+	/* Not setting 'diff' is intentional, otherwise the sequence
+	 * would be adjusted twice.
+	 */
+
+	cp->app_data = (void *) IP_VS_FTP_ACTIVE;
+	ip_vs_tcp_conn_listen(n_cp);
+	ip_vs_conn_put(n_cp);
+	return ret;
 }
 
 
-/*
- * Look at incoming ftp packets to catch the PASV/PORT command
+/* Look at incoming ftp packets to catch the PASV/PORT/EPRT/EPSV command
  * (outside-to-inside).
  *
  * The incoming packet having the PORT command should be something like
@@ -308,12 +412,19 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
  * In this case, we create a connection entry using the client address and
  * port, so that the active ftp data connection from the server can reach
  * the client.
+ * Extended format:
+ *	"EPSV\r\n" when client requests server address from same family
+ *	"EPSV 1\r\n" when client requests IPv4 server address
+ *	"EPSV 2\r\n" when client requests IPv6 server address
+ *	"EPSV ALL\r\n" - not supported
+ *	EPRT with specified delimiter (ASCII 33..126), "|" by default:
+ *	"EPRT |1|IPv4ADDR|PORT|\r\n" when client provides IPv4 addrport
+ *	"EPRT |2|IPv6ADDR|PORT|\r\n" when client provides IPv6 addrport
  */
 static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
-			struct sk_buff *skb, int *diff)
+			struct sk_buff *skb, int *diff,
+			struct ip_vs_iphdr *ipvsh)
 {
-	struct iphdr *iph;
-	struct tcphdr *th;
 	char *data, *data_start, *data_limit;
 	char *start, *end;
 	union nf_inet_addr to;
@@ -323,14 +434,6 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	/* no diff required for incoming packets */
 	*diff = 0;
 
-#ifdef CONFIG_IP_VS_IPV6
-	/* This application helper doesn't work with IPv6 yet,
-	 * so turn this into a no-op for IPv6 packets
-	 */
-	if (cp->af == AF_INET6)
-		return 1;
-#endif
-
 	/* Only useful for established sessions */
 	if (cp->state != IP_VS_TCP_S_ESTABLISHED)
 		return 1;
@@ -339,27 +442,48 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	if (!skb_make_writable(skb, skb->len))
 		return 0;
 
-	/*
-	 * Detecting whether it is passive
-	 */
-	iph = ip_hdr(skb);
-	th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
-
-	/* Since there may be OPTIONS in the TCP packet and the HLEN is
-	   the length of the header in 32-bit multiples, it is accurate
-	   to calculate data address by th+HLEN*4 */
-	data = data_start = (char *)th + (th->doff << 2);
+	data = data_start = ip_vs_ftp_data_ptr(skb, ipvsh);
 	data_limit = skb_tail_pointer(skb);
+	if (!data || data >= data_limit)
+		return 1;
 
 	while (data <= data_limit - 6) {
-		if (strncasecmp(data, "PASV\r\n", 6) == 0) {
+		if (cp->af == AF_INET &&
+		    strncasecmp(data, "PASV\r\n", 6) == 0) {
 			/* Passive mode on */
 			IP_VS_DBG(7, "got PASV at %td of %td\n",
 				  data - data_start,
 				  data_limit - data_start);
-			cp->app_data = &ip_vs_ftp_pasv;
+			cp->app_data = (void *) IP_VS_FTP_PASV;
 			return 1;
 		}
+
+		/* EPSV or EPSV<space><net-prt> */
+		if (strncasecmp(data, "EPSV", 4) == 0 &&
+		    (data[4] == ' ' || data[4] == '\r')) {
+			if (data[4] == ' ') {
+				char proto = data[5];
+
+				if (data > data_limit - 7 || data[6] != '\r')
+					return 1;
+
+#ifdef CONFIG_IP_VS_IPV6
+				if (cp->af == AF_INET6 && proto == '2') {
+				} else
+#endif
+				if (cp->af == AF_INET && proto == '1') {
+				} else {
+					return 1;
+				}
+			}
+			/* Extended Passive mode on */
+			IP_VS_DBG(7, "got EPSV at %td of %td\n",
+				  data - data_start,
+				  data_limit - data_start);
+			cp->app_data = (void *) IP_VS_FTP_EPSV;
+			return 1;
+		}
+
 		data++;
 	}
 
@@ -370,33 +494,52 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	 * then create a new connection entry for the coming data
 	 * connection.
 	 */
-	if (ip_vs_ftp_get_addrport(data_start, data_limit,
-				   CLIENT_STRING, sizeof(CLIENT_STRING)-1,
-				   ' ', '\r', &to.ip, &port,
-				   &start, &end) != 1)
+	if (cp->af == AF_INET &&
+	    ip_vs_ftp_get_addrport(data_start, data_limit,
+				   CLIENT_STRING_PORT,
+				   sizeof(CLIENT_STRING_PORT)-1,
+				   ' ', false, IP_VS_FTP_PORT,
+				   &to, &port, cp->af,
+				   &start, &end) == 1) {
+
+		IP_VS_DBG(7, "PORT %pI4:%u detected\n", &to.ip, ntohs(port));
+
+		/* Now update or create a connection entry for it */
+		IP_VS_DBG(7, "protocol %s %pI4:%u %pI4:%u\n",
+			  ip_vs_proto_name(ipvsh->protocol),
+			  &to.ip, ntohs(port), &cp->vaddr.ip,
+			  ntohs(cp->vport)-1);
+	} else if (ip_vs_ftp_get_addrport(data_start, data_limit,
+					  CLIENT_STRING_EPRT,
+					  sizeof(CLIENT_STRING_EPRT)-1,
+					  ' ', true, IP_VS_FTP_EPRT,
+					  &to, &port, cp->af,
+					  &start, &end) == 1) {
+
+		IP_VS_DBG_BUF(7, "EPRT %s:%u detected\n",
+			      IP_VS_DBG_ADDR(cp->af, &to), ntohs(port));
+
+		/* Now update or create a connection entry for it */
+		IP_VS_DBG_BUF(7, "protocol %s %s:%u %s:%u\n",
+			      ip_vs_proto_name(ipvsh->protocol),
+			      IP_VS_DBG_ADDR(cp->af, &to), ntohs(port),
+			      IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+			      ntohs(cp->vport)-1);
+	} else {
 		return 1;
-
-	IP_VS_DBG(7, "PORT %pI4:%d detected\n", &to.ip, ntohs(port));
+	}
 
 	/* Passive mode off */
-	cp->app_data = NULL;
-
-	/*
-	 * Now update or create a connection entry for it
-	 */
-	IP_VS_DBG(7, "protocol %s %pI4:%d %pI4:%d\n",
-		  ip_vs_proto_name(iph->protocol),
-		  &to.ip, ntohs(port), &cp->vaddr.ip, 0);
+	cp->app_data = (void *) IP_VS_FTP_ACTIVE;
 
 	{
 		struct ip_vs_conn_param p;
-		ip_vs_conn_fill_param(cp->ipvs, AF_INET,
-				      iph->protocol, &to, port, &cp->vaddr,
+		ip_vs_conn_fill_param(cp->ipvs, cp->af,
+				      ipvsh->protocol, &to, port, &cp->vaddr,
 				      htons(ntohs(cp->vport)-1), &p);
 		n_cp = ip_vs_conn_in_get(&p);
 		if (!n_cp) {
-			/* This is ipv4 only */
-			n_cp = ip_vs_conn_new(&p, AF_INET, &cp->daddr,
+			n_cp = ip_vs_conn_new(&p, cp->af, &cp->daddr,
 					      htons(ntohs(cp->dport)-1),
 					      IP_VS_CONN_F_NFCT, cp->dest,
 					      skb->mark);
@@ -454,7 +597,7 @@ static int __net_init __ip_vs_ftp_init(struct net *net)
 		ret = register_ip_vs_app_inc(ipvs, app, app->protocol, ports[i]);
 		if (ret)
 			goto err_unreg;
-		pr_info("%s: loaded support on port[%d] = %d\n",
+		pr_info("%s: loaded support on port[%d] = %u\n",
 			app->name, i, ports[i]);
 	}
 	return 0;
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index eff7569824e5..3250c4a1111e 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -109,7 +109,7 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
 			return 0;
 
 		/* Call application helper if needed */
-		ret = ip_vs_app_pkt_out(cp, skb);
+		ret = ip_vs_app_pkt_out(cp, skb, iph);
 		if (ret == 0)
 			return 0;
 		/* ret=2: csum update is needed after payload mangling */
@@ -156,7 +156,7 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
 			return 0;
 
 		/* Call application helper if needed */
-		ret = ip_vs_app_pkt_in(cp, skb);
+		ret = ip_vs_app_pkt_in(cp, skb, iph);
 		if (ret == 0)
 			return 0;
 		/* ret=2: csum update is needed after payload mangling */
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 569631d2b2a1..80d10ad12a15 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -170,7 +170,7 @@ tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
 			return 0;
 
 		/* Call application helper if needed */
-		if (!(ret = ip_vs_app_pkt_out(cp, skb)))
+		if (!(ret = ip_vs_app_pkt_out(cp, skb, iph)))
 			return 0;
 		/* ret=2: csum update is needed after payload mangling */
 		if (ret == 1)
@@ -251,7 +251,7 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
 		 *	Attempt ip_vs_app call.
 		 *	It will fix ip_vs_conn and iph ack_seq stuff
 		 */
-		if (!(ret = ip_vs_app_pkt_in(cp, skb)))
+		if (!(ret = ip_vs_app_pkt_in(cp, skb, iph)))
 			return 0;
 		/* ret=2: csum update is needed after payload mangling */
 		if (ret == 1)
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index c15ef7c2a1fa..e0ef11c3691e 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -162,7 +162,7 @@ udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
 		/*
 		 *	Call application helper if needed
 		 */
-		if (!(ret = ip_vs_app_pkt_out(cp, skb)))
+		if (!(ret = ip_vs_app_pkt_out(cp, skb, iph)))
 			return 0;
 		/* ret=2: csum update is needed after payload mangling */
 		if (ret == 1)
@@ -246,7 +246,7 @@ udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
 		 *	Attempt ip_vs_app call.
 		 *	It will fix ip_vs_conn
 		 */
-		if (!(ret = ip_vs_app_pkt_in(cp, skb)))
+		if (!(ret = ip_vs_app_pkt_in(cp, skb, iph)))
 			return 0;
 		/* ret=2: csum update is needed after payload mangling */
 		if (ret == 1)
-- 
cgit v1.2.3-59-g8ed1b


From af066ed3d45551e703d95d7e6c67d36a63809bc2 Mon Sep 17 00:00:00 2001
From: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Date: Fri, 1 Jun 2018 17:16:58 +0900
Subject: rtnetlink: Fix null-ptr-deref in rtnl_newlink

In rtnl_newlink(), NULL check is performed on m_ops however member of
ops is accessed. Fixed by accessing member of m_ops instead of ops.

[  345.432629] BUG: KASAN: null-ptr-deref in rtnl_newlink+0x400/0x1110
[  345.432629] Read of size 4 at addr 0000000000000088 by task ip/986
[  345.432629]
[  345.432629] CPU: 1 PID: 986 Comm: ip Not tainted 4.17.0-rc6+ #9
[  345.432629] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
[  345.432629] Call Trace:
[  345.432629]  dump_stack+0xc6/0x150
[  345.432629]  ? dump_stack_print_info.cold.0+0x1b/0x1b
[  345.432629]  ? kasan_report+0xb4/0x410
[  345.432629]  kasan_report.cold.4+0x8f/0x91
[  345.432629]  ? rtnl_newlink+0x400/0x1110
[  345.432629]  rtnl_newlink+0x400/0x1110
[...]

Fixes: ccf8dbcd062a ("rtnetlink: Remove VLA usage")
Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Tested-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 8ca49a0e13fb..9a1ba2015ad8 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2936,7 +2936,7 @@ replay:
 		}
 
 		if (m_ops) {
-			if (ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)
+			if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)
 				return -EINVAL;
 
 			if (m_ops->slave_maxtype &&
-- 
cgit v1.2.3-59-g8ed1b


From c431f89b18a2bd62f74174829565e6433fc0c109 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@mellanox.com>
Date: Thu, 31 May 2018 09:52:53 +0300
Subject: net: sched: split tc_ctl_tfilter into three handlers

tc_ctl_tfilter handles three netlink message types: RTM_NEWTFILTER,
RTM_DELTFILTER, RTM_GETTFILTER. However, implementation of this function
involves a lot of branching on specific message type because most of the
code is message-specific. This significantly complicates adding new
functionality and doesn't provide much benefit of code reuse.

Split tc_ctl_tfilter to three standalone functions that handle filter new,
delete and get requests.

The only truly protocol independent part of tc_ctl_tfilter is code that
looks up queue, class, and block. Refactor this code to standalone
tcf_block_find function that is used by all three new handlers.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 438 +++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 293 insertions(+), 145 deletions(-)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 76303c45db19..c06585fb2dc6 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -437,6 +437,78 @@ static struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index)
 	return idr_find(&tn->idr, block_index);
 }
 
+/* Find tcf block.
+ * Set q, parent, cl when appropriate.
+ */
+
+static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q,
+					u32 *parent, unsigned long *cl,
+					int ifindex, u32 block_index,
+					struct netlink_ext_ack *extack)
+{
+	struct tcf_block *block;
+
+	if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
+		block = tcf_block_lookup(net, block_index);
+		if (!block) {
+			NL_SET_ERR_MSG(extack, "Block of given index was not found");
+			return ERR_PTR(-EINVAL);
+		}
+	} else {
+		const struct Qdisc_class_ops *cops;
+		struct net_device *dev;
+
+		/* Find link */
+		dev = __dev_get_by_index(net, ifindex);
+		if (!dev)
+			return ERR_PTR(-ENODEV);
+
+		/* Find qdisc */
+		if (!*parent) {
+			*q = dev->qdisc;
+			*parent = (*q)->handle;
+		} else {
+			*q = qdisc_lookup(dev, TC_H_MAJ(*parent));
+			if (!*q) {
+				NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists");
+				return ERR_PTR(-EINVAL);
+			}
+		}
+
+		/* Is it classful? */
+		cops = (*q)->ops->cl_ops;
+		if (!cops) {
+			NL_SET_ERR_MSG(extack, "Qdisc not classful");
+			return ERR_PTR(-EINVAL);
+		}
+
+		if (!cops->tcf_block) {
+			NL_SET_ERR_MSG(extack, "Class doesn't support blocks");
+			return ERR_PTR(-EOPNOTSUPP);
+		}
+
+		/* Do we search for filter, attached to class? */
+		if (TC_H_MIN(*parent)) {
+			*cl = cops->find(*q, *parent);
+			if (*cl == 0) {
+				NL_SET_ERR_MSG(extack, "Specified class doesn't exist");
+				return ERR_PTR(-ENOENT);
+			}
+		}
+
+		/* And the last stroke */
+		block = cops->tcf_block(*q, *cl, extack);
+		if (!block)
+			return ERR_PTR(-EINVAL);
+		if (tcf_block_shared(block)) {
+			NL_SET_ERR_MSG(extack, "This filter block is shared. Please use the block index to manipulate the filters");
+			return ERR_PTR(-EOPNOTSUPP);
+		}
+	}
+
+	return block;
+}
+
 static struct tcf_chain *tcf_block_chain_zero(struct tcf_block *block)
 {
 	return list_first_entry(&block->chain_list, struct tcf_chain, list);
@@ -984,9 +1056,7 @@ static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
 			       q, parent, 0, event, false);
 }
 
-/* Add/change/delete/get a filter node */
-
-static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
+static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
 			  struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
@@ -1007,8 +1077,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
 	int err;
 	int tp_created;
 
-	if ((n->nlmsg_type != RTM_GETTFILTER) &&
-	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 replay:
@@ -1026,24 +1095,13 @@ replay:
 	cl = 0;
 
 	if (prio == 0) {
-		switch (n->nlmsg_type) {
-		case RTM_DELTFILTER:
-			if (protocol || t->tcm_handle || tca[TCA_KIND]) {
-				NL_SET_ERR_MSG(extack, "Cannot flush filters with protocol, handle or kind set");
-				return -ENOENT;
-			}
-			break;
-		case RTM_NEWTFILTER:
-			/* If no priority is provided by the user,
-			 * we allocate one.
-			 */
-			if (n->nlmsg_flags & NLM_F_CREATE) {
-				prio = TC_H_MAKE(0x80000000U, 0U);
-				prio_allocate = true;
-				break;
-			}
-			/* fall-through */
-		default:
+		/* If no priority is provided by the user,
+		 * we allocate one.
+		 */
+		if (n->nlmsg_flags & NLM_F_CREATE) {
+			prio = TC_H_MAKE(0x80000000U, 0U);
+			prio_allocate = true;
+		} else {
 			NL_SET_ERR_MSG(extack, "Invalid filter command with priority of zero");
 			return -ENOENT;
 		}
@@ -1051,66 +1109,11 @@ replay:
 
 	/* Find head of filter chain. */
 
-	if (t->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
-		block = tcf_block_lookup(net, t->tcm_block_index);
-		if (!block) {
-			NL_SET_ERR_MSG(extack, "Block of given index was not found");
-			err = -EINVAL;
-			goto errout;
-		}
-	} else {
-		const struct Qdisc_class_ops *cops;
-		struct net_device *dev;
-
-		/* Find link */
-		dev = __dev_get_by_index(net, t->tcm_ifindex);
-		if (!dev)
-			return -ENODEV;
-
-		/* Find qdisc */
-		if (!parent) {
-			q = dev->qdisc;
-			parent = q->handle;
-		} else {
-			q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent));
-			if (!q) {
-				NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists");
-				return -EINVAL;
-			}
-		}
-
-		/* Is it classful? */
-		cops = q->ops->cl_ops;
-		if (!cops) {
-			NL_SET_ERR_MSG(extack, "Qdisc not classful");
-			return -EINVAL;
-		}
-
-		if (!cops->tcf_block) {
-			NL_SET_ERR_MSG(extack, "Class doesn't support blocks");
-			return -EOPNOTSUPP;
-		}
-
-		/* Do we search for filter, attached to class? */
-		if (TC_H_MIN(parent)) {
-			cl = cops->find(q, parent);
-			if (cl == 0) {
-				NL_SET_ERR_MSG(extack, "Specified class doesn't exist");
-				return -ENOENT;
-			}
-		}
-
-		/* And the last stroke */
-		block = cops->tcf_block(q, cl, extack);
-		if (!block) {
-			err = -EINVAL;
-			goto errout;
-		}
-		if (tcf_block_shared(block)) {
-			NL_SET_ERR_MSG(extack, "This filter block is shared. Please use the block index to manipulate the filters");
-			err = -EOPNOTSUPP;
-			goto errout;
-		}
+	block = tcf_block_find(net, &q, &parent, &cl,
+			       t->tcm_ifindex, t->tcm_block_index, extack);
+	if (IS_ERR(block)) {
+		err = PTR_ERR(block);
+		goto errout;
 	}
 
 	chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;
@@ -1119,19 +1122,10 @@ replay:
 		err = -EINVAL;
 		goto errout;
 	}
-	chain = tcf_chain_get(block, chain_index,
-			      n->nlmsg_type == RTM_NEWTFILTER);
+	chain = tcf_chain_get(block, chain_index, true);
 	if (!chain) {
 		NL_SET_ERR_MSG(extack, "Cannot find specified filter chain");
-		err = n->nlmsg_type == RTM_NEWTFILTER ? -ENOMEM : -EINVAL;
-		goto errout;
-	}
-
-	if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) {
-		tfilter_notify_chain(net, skb, block, q, parent, n,
-				     chain, RTM_DELTFILTER);
-		tcf_chain_flush(chain);
-		err = 0;
+		err = -ENOMEM;
 		goto errout;
 	}
 
@@ -1152,8 +1146,7 @@ replay:
 			goto errout;
 		}
 
-		if (n->nlmsg_type != RTM_NEWTFILTER ||
-		    !(n->nlmsg_flags & NLM_F_CREATE)) {
+		if (!(n->nlmsg_flags & NLM_F_CREATE)) {
 			NL_SET_ERR_MSG(extack, "Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter");
 			err = -ENOENT;
 			goto errout;
@@ -1178,56 +1171,15 @@ replay:
 	fh = tp->ops->get(tp, t->tcm_handle);
 
 	if (!fh) {
-		if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
-			tcf_chain_tp_remove(chain, &chain_info, tp);
-			tfilter_notify(net, skb, n, tp, block, q, parent, fh,
-				       RTM_DELTFILTER, false);
-			tcf_proto_destroy(tp, extack);
-			err = 0;
-			goto errout;
-		}
-
-		if (n->nlmsg_type != RTM_NEWTFILTER ||
-		    !(n->nlmsg_flags & NLM_F_CREATE)) {
+		if (!(n->nlmsg_flags & NLM_F_CREATE)) {
 			NL_SET_ERR_MSG(extack, "Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter");
 			err = -ENOENT;
 			goto errout;
 		}
-	} else {
-		bool last;
-
-		switch (n->nlmsg_type) {
-		case RTM_NEWTFILTER:
-			if (n->nlmsg_flags & NLM_F_EXCL) {
-				if (tp_created)
-					tcf_proto_destroy(tp, NULL);
-				NL_SET_ERR_MSG(extack, "Filter already exists");
-				err = -EEXIST;
-				goto errout;
-			}
-			break;
-		case RTM_DELTFILTER:
-			err = tfilter_del_notify(net, skb, n, tp, block,
-						 q, parent, fh, false, &last,
-						 extack);
-			if (err)
-				goto errout;
-			if (last) {
-				tcf_chain_tp_remove(chain, &chain_info, tp);
-				tcf_proto_destroy(tp, extack);
-			}
-			goto errout;
-		case RTM_GETTFILTER:
-			err = tfilter_notify(net, skb, n, tp, block, q, parent,
-					     fh, RTM_NEWTFILTER, true);
-			if (err < 0)
-				NL_SET_ERR_MSG(extack, "Failed to send filter notify message");
-			goto errout;
-		default:
-			NL_SET_ERR_MSG(extack, "Invalid netlink message type");
-			err = -EINVAL;
-			goto errout;
-		}
+	} else if (n->nlmsg_flags & NLM_F_EXCL) {
+		NL_SET_ERR_MSG(extack, "Filter already exists");
+		err = -EEXIST;
+		goto errout;
 	}
 
 	err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh,
@@ -1252,6 +1204,202 @@ errout:
 	return err;
 }
 
+static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
+			  struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *tca[TCA_MAX + 1];
+	struct tcmsg *t;
+	u32 protocol;
+	u32 prio;
+	u32 parent;
+	u32 chain_index;
+	struct Qdisc *q = NULL;
+	struct tcf_chain_info chain_info;
+	struct tcf_chain *chain = NULL;
+	struct tcf_block *block;
+	struct tcf_proto *tp = NULL;
+	unsigned long cl = 0;
+	void *fh = NULL;
+	int err;
+
+	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+		return -EPERM;
+
+	err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack);
+	if (err < 0)
+		return err;
+
+	t = nlmsg_data(n);
+	protocol = TC_H_MIN(t->tcm_info);
+	prio = TC_H_MAJ(t->tcm_info);
+	parent = t->tcm_parent;
+
+	if (prio == 0 && (protocol || t->tcm_handle || tca[TCA_KIND])) {
+		NL_SET_ERR_MSG(extack, "Cannot flush filters with protocol, handle or kind set");
+		return -ENOENT;
+	}
+
+	/* Find head of filter chain. */
+
+	block = tcf_block_find(net, &q, &parent, &cl,
+			       t->tcm_ifindex, t->tcm_block_index, extack);
+	if (IS_ERR(block)) {
+		err = PTR_ERR(block);
+		goto errout;
+	}
+
+	chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;
+	if (chain_index > TC_ACT_EXT_VAL_MASK) {
+		NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit");
+		err = -EINVAL;
+		goto errout;
+	}
+	chain = tcf_chain_get(block, chain_index, false);
+	if (!chain) {
+		NL_SET_ERR_MSG(extack, "Cannot find specified filter chain");
+		err = -EINVAL;
+		goto errout;
+	}
+
+	if (prio == 0) {
+		tfilter_notify_chain(net, skb, block, q, parent, n,
+				     chain, RTM_DELTFILTER);
+		tcf_chain_flush(chain);
+		err = 0;
+		goto errout;
+	}
+
+	tp = tcf_chain_tp_find(chain, &chain_info, protocol,
+			       prio, false);
+	if (!tp || IS_ERR(tp)) {
+		NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found");
+		err = PTR_ERR(tp);
+		goto errout;
+	} else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) {
+		NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one");
+		err = -EINVAL;
+		goto errout;
+	}
+
+	fh = tp->ops->get(tp, t->tcm_handle);
+
+	if (!fh) {
+		if (t->tcm_handle == 0) {
+			tcf_chain_tp_remove(chain, &chain_info, tp);
+			tfilter_notify(net, skb, n, tp, block, q, parent, fh,
+				       RTM_DELTFILTER, false);
+			tcf_proto_destroy(tp, extack);
+			err = 0;
+		} else {
+			NL_SET_ERR_MSG(extack, "Specified filter handle not found");
+			err = -ENOENT;
+		}
+	} else {
+		bool last;
+
+		err = tfilter_del_notify(net, skb, n, tp, block,
+					 q, parent, fh, false, &last,
+					 extack);
+		if (err)
+			goto errout;
+		if (last) {
+			tcf_chain_tp_remove(chain, &chain_info, tp);
+			tcf_proto_destroy(tp, extack);
+		}
+	}
+
+errout:
+	if (chain)
+		tcf_chain_put(chain);
+	return err;
+}
+
+static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
+			  struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *tca[TCA_MAX + 1];
+	struct tcmsg *t;
+	u32 protocol;
+	u32 prio;
+	u32 parent;
+	u32 chain_index;
+	struct Qdisc *q = NULL;
+	struct tcf_chain_info chain_info;
+	struct tcf_chain *chain = NULL;
+	struct tcf_block *block;
+	struct tcf_proto *tp = NULL;
+	unsigned long cl = 0;
+	void *fh = NULL;
+	int err;
+
+	err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack);
+	if (err < 0)
+		return err;
+
+	t = nlmsg_data(n);
+	protocol = TC_H_MIN(t->tcm_info);
+	prio = TC_H_MAJ(t->tcm_info);
+	parent = t->tcm_parent;
+
+	if (prio == 0) {
+		NL_SET_ERR_MSG(extack, "Invalid filter command with priority of zero");
+		return -ENOENT;
+	}
+
+	/* Find head of filter chain. */
+
+	block = tcf_block_find(net, &q, &parent, &cl,
+			       t->tcm_ifindex, t->tcm_block_index, extack);
+	if (IS_ERR(block)) {
+		err = PTR_ERR(block);
+		goto errout;
+	}
+
+	chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;
+	if (chain_index > TC_ACT_EXT_VAL_MASK) {
+		NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit");
+		err = -EINVAL;
+		goto errout;
+	}
+	chain = tcf_chain_get(block, chain_index, false);
+	if (!chain) {
+		NL_SET_ERR_MSG(extack, "Cannot find specified filter chain");
+		err = -EINVAL;
+		goto errout;
+	}
+
+	tp = tcf_chain_tp_find(chain, &chain_info, protocol,
+			       prio, false);
+	if (!tp || IS_ERR(tp)) {
+		NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found");
+		err = PTR_ERR(tp);
+		goto errout;
+	} else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) {
+		NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one");
+		err = -EINVAL;
+		goto errout;
+	}
+
+	fh = tp->ops->get(tp, t->tcm_handle);
+
+	if (!fh) {
+		NL_SET_ERR_MSG(extack, "Specified filter handle not found");
+		err = -ENOENT;
+	} else {
+		err = tfilter_notify(net, skb, n, tp, block, q, parent,
+				     fh, RTM_NEWTFILTER, true);
+		if (err < 0)
+			NL_SET_ERR_MSG(extack, "Failed to send filter notify message");
+	}
+
+errout:
+	if (chain)
+		tcf_chain_put(chain);
+	return err;
+}
+
 struct tcf_dump_args {
 	struct tcf_walker w;
 	struct sk_buff *skb;
@@ -1634,9 +1782,9 @@ static int __init tc_filter_init(void)
 	if (err)
 		goto err_register_pernet_subsys;
 
-	rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, 0);
-	rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, 0);
-	rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter,
+	rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL, 0);
+	rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, 0);
+	rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_get_tfilter,
 		      tc_dump_tfilter, 0);
 
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From db9d7d36eecc8926f03a8f2e46781887577b3353 Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Thu, 31 May 2018 10:07:43 +0200
Subject: net: mvpp2: Split the PPv2 driver to a dedicated directory

As the mvpp2 driver is growing, move this driver to a dedicated
directory and split it into several files.

Since this driver has a lot of register defines and structure
definitions, it can benefit from having all of this into a dedicated
header file, named mvpp2.h.

A good chunk of the mvpp2 code is dedicated to Header Parser handling, so
we introduce mvpp2_prs.h where all Header Parser definitions are located,
and mvpp2_prs.c containing the related code.

In the same way, mvpp2_cls.h and mvpp2_cls.c are created to contain
Classifier and RSS related code.

The former 'mvpp2.c' file is renamed 'mvpp2_main.c' so that we can keep
the driver binary named 'mvpp2'.

This commit is only about spliting the driver into multiple files and
doesn't introduce any new function, feature or fix besides removing
'static' keywords when needed.

Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Antoine Tenart <antoine.tenart@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/Makefile           |    2 +-
 drivers/net/ethernet/marvell/mvpp2.c            | 9159 -----------------------
 drivers/net/ethernet/marvell/mvpp2/Makefile     |    7 +
 drivers/net/ethernet/marvell/mvpp2/mvpp2.h      | 1046 +++
 drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c  |  141 +
 drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.h  |   44 +
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 5281 +++++++++++++
 drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c  | 2467 ++++++
 drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.h  |  314 +
 9 files changed, 9301 insertions(+), 9160 deletions(-)
 delete mode 100644 drivers/net/ethernet/marvell/mvpp2.c
 create mode 100644 drivers/net/ethernet/marvell/mvpp2/Makefile
 create mode 100644 drivers/net/ethernet/marvell/mvpp2/mvpp2.h
 create mode 100644 drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c
 create mode 100644 drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.h
 create mode 100644 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
 create mode 100644 drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c
 create mode 100644 drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.h

diff --git a/drivers/net/ethernet/marvell/Makefile b/drivers/net/ethernet/marvell/Makefile
index 9498ed26dbe5..55d4d10aa7d3 100644
--- a/drivers/net/ethernet/marvell/Makefile
+++ b/drivers/net/ethernet/marvell/Makefile
@@ -7,7 +7,7 @@ obj-$(CONFIG_MVMDIO) += mvmdio.o
 obj-$(CONFIG_MV643XX_ETH) += mv643xx_eth.o
 obj-$(CONFIG_MVNETA_BM) += mvneta_bm.o
 obj-$(CONFIG_MVNETA) += mvneta.o
-obj-$(CONFIG_MVPP2) += mvpp2.o
+obj-$(CONFIG_MVPP2) += mvpp2/
 obj-$(CONFIG_PXA168_ETH) += pxa168_eth.o
 obj-$(CONFIG_SKGE) += skge.o
 obj-$(CONFIG_SKY2) += sky2.o
diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c
deleted file mode 100644
index 6847cd431aa0..000000000000
--- a/drivers/net/ethernet/marvell/mvpp2.c
+++ /dev/null
@@ -1,9159 +0,0 @@
-/*
- * Driver for Marvell PPv2 network controller for Armada 375 SoC.
- *
- * Copyright (C) 2014 Marvell
- *
- * Marcin Wojtas <mw@semihalf.com>
- *
- * This file is licensed under the terms of the GNU General Public
- * License version 2. This program is licensed "as is" without any
- * warranty of any kind, whether express or implied.
- */
-
-#include <linux/acpi.h>
-#include <linux/kernel.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/platform_device.h>
-#include <linux/skbuff.h>
-#include <linux/inetdevice.h>
-#include <linux/mbus.h>
-#include <linux/module.h>
-#include <linux/mfd/syscon.h>
-#include <linux/interrupt.h>
-#include <linux/cpumask.h>
-#include <linux/of.h>
-#include <linux/of_irq.h>
-#include <linux/of_mdio.h>
-#include <linux/of_net.h>
-#include <linux/of_address.h>
-#include <linux/of_device.h>
-#include <linux/phy.h>
-#include <linux/phylink.h>
-#include <linux/phy/phy.h>
-#include <linux/clk.h>
-#include <linux/hrtimer.h>
-#include <linux/ktime.h>
-#include <linux/regmap.h>
-#include <uapi/linux/ppp_defs.h>
-#include <net/ip.h>
-#include <net/ipv6.h>
-#include <net/tso.h>
-
-/* Fifo Registers */
-#define MVPP2_RX_DATA_FIFO_SIZE_REG(port)	(0x00 + 4 * (port))
-#define MVPP2_RX_ATTR_FIFO_SIZE_REG(port)	(0x20 + 4 * (port))
-#define MVPP2_RX_MIN_PKT_SIZE_REG		0x60
-#define MVPP2_RX_FIFO_INIT_REG			0x64
-#define MVPP22_TX_FIFO_THRESH_REG(port)		(0x8840 + 4 * (port))
-#define MVPP22_TX_FIFO_SIZE_REG(port)		(0x8860 + 4 * (port))
-
-/* RX DMA Top Registers */
-#define MVPP2_RX_CTRL_REG(port)			(0x140 + 4 * (port))
-#define     MVPP2_RX_LOW_LATENCY_PKT_SIZE(s)	(((s) & 0xfff) << 16)
-#define     MVPP2_RX_USE_PSEUDO_FOR_CSUM_MASK	BIT(31)
-#define MVPP2_POOL_BUF_SIZE_REG(pool)		(0x180 + 4 * (pool))
-#define     MVPP2_POOL_BUF_SIZE_OFFSET		5
-#define MVPP2_RXQ_CONFIG_REG(rxq)		(0x800 + 4 * (rxq))
-#define     MVPP2_SNOOP_PKT_SIZE_MASK		0x1ff
-#define     MVPP2_SNOOP_BUF_HDR_MASK		BIT(9)
-#define     MVPP2_RXQ_POOL_SHORT_OFFS		20
-#define     MVPP21_RXQ_POOL_SHORT_MASK		0x700000
-#define     MVPP22_RXQ_POOL_SHORT_MASK		0xf00000
-#define     MVPP2_RXQ_POOL_LONG_OFFS		24
-#define     MVPP21_RXQ_POOL_LONG_MASK		0x7000000
-#define     MVPP22_RXQ_POOL_LONG_MASK		0xf000000
-#define     MVPP2_RXQ_PACKET_OFFSET_OFFS	28
-#define     MVPP2_RXQ_PACKET_OFFSET_MASK	0x70000000
-#define     MVPP2_RXQ_DISABLE_MASK		BIT(31)
-
-/* Top Registers */
-#define MVPP2_MH_REG(port)			(0x5040 + 4 * (port))
-#define MVPP2_DSA_EXTENDED			BIT(5)
-
-/* Parser Registers */
-#define MVPP2_PRS_INIT_LOOKUP_REG		0x1000
-#define     MVPP2_PRS_PORT_LU_MAX		0xf
-#define     MVPP2_PRS_PORT_LU_MASK(port)	(0xff << ((port) * 4))
-#define     MVPP2_PRS_PORT_LU_VAL(port, val)	((val) << ((port) * 4))
-#define MVPP2_PRS_INIT_OFFS_REG(port)		(0x1004 + ((port) & 4))
-#define     MVPP2_PRS_INIT_OFF_MASK(port)	(0x3f << (((port) % 4) * 8))
-#define     MVPP2_PRS_INIT_OFF_VAL(port, val)	((val) << (((port) % 4) * 8))
-#define MVPP2_PRS_MAX_LOOP_REG(port)		(0x100c + ((port) & 4))
-#define     MVPP2_PRS_MAX_LOOP_MASK(port)	(0xff << (((port) % 4) * 8))
-#define     MVPP2_PRS_MAX_LOOP_VAL(port, val)	((val) << (((port) % 4) * 8))
-#define MVPP2_PRS_TCAM_IDX_REG			0x1100
-#define MVPP2_PRS_TCAM_DATA_REG(idx)		(0x1104 + (idx) * 4)
-#define     MVPP2_PRS_TCAM_INV_MASK		BIT(31)
-#define MVPP2_PRS_SRAM_IDX_REG			0x1200
-#define MVPP2_PRS_SRAM_DATA_REG(idx)		(0x1204 + (idx) * 4)
-#define MVPP2_PRS_TCAM_CTRL_REG			0x1230
-#define     MVPP2_PRS_TCAM_EN_MASK		BIT(0)
-
-/* RSS Registers */
-#define MVPP22_RSS_INDEX			0x1500
-#define     MVPP22_RSS_INDEX_TABLE_ENTRY(idx)	(idx)
-#define     MVPP22_RSS_INDEX_TABLE(idx)		((idx) << 8)
-#define     MVPP22_RSS_INDEX_QUEUE(idx)		((idx) << 16)
-#define MVPP22_RSS_TABLE_ENTRY			0x1508
-#define MVPP22_RSS_TABLE			0x1510
-#define     MVPP22_RSS_TABLE_POINTER(p)		(p)
-#define MVPP22_RSS_WIDTH			0x150c
-
-/* Classifier Registers */
-#define MVPP2_CLS_MODE_REG			0x1800
-#define     MVPP2_CLS_MODE_ACTIVE_MASK		BIT(0)
-#define MVPP2_CLS_PORT_WAY_REG			0x1810
-#define     MVPP2_CLS_PORT_WAY_MASK(port)	(1 << (port))
-#define MVPP2_CLS_LKP_INDEX_REG			0x1814
-#define     MVPP2_CLS_LKP_INDEX_WAY_OFFS	6
-#define MVPP2_CLS_LKP_TBL_REG			0x1818
-#define     MVPP2_CLS_LKP_TBL_RXQ_MASK		0xff
-#define     MVPP2_CLS_LKP_TBL_LOOKUP_EN_MASK	BIT(25)
-#define MVPP2_CLS_FLOW_INDEX_REG		0x1820
-#define MVPP2_CLS_FLOW_TBL0_REG			0x1824
-#define MVPP2_CLS_FLOW_TBL1_REG			0x1828
-#define MVPP2_CLS_FLOW_TBL2_REG			0x182c
-#define MVPP2_CLS_OVERSIZE_RXQ_LOW_REG(port)	(0x1980 + ((port) * 4))
-#define     MVPP2_CLS_OVERSIZE_RXQ_LOW_BITS	3
-#define     MVPP2_CLS_OVERSIZE_RXQ_LOW_MASK	0x7
-#define MVPP2_CLS_SWFWD_P2HQ_REG(port)		(0x19b0 + ((port) * 4))
-#define MVPP2_CLS_SWFWD_PCTRL_REG		0x19d0
-#define     MVPP2_CLS_SWFWD_PCTRL_MASK(port)	(1 << (port))
-
-/* Descriptor Manager Top Registers */
-#define MVPP2_RXQ_NUM_REG			0x2040
-#define MVPP2_RXQ_DESC_ADDR_REG			0x2044
-#define     MVPP22_DESC_ADDR_OFFS		8
-#define MVPP2_RXQ_DESC_SIZE_REG			0x2048
-#define     MVPP2_RXQ_DESC_SIZE_MASK		0x3ff0
-#define MVPP2_RXQ_STATUS_UPDATE_REG(rxq)	(0x3000 + 4 * (rxq))
-#define     MVPP2_RXQ_NUM_PROCESSED_OFFSET	0
-#define     MVPP2_RXQ_NUM_NEW_OFFSET		16
-#define MVPP2_RXQ_STATUS_REG(rxq)		(0x3400 + 4 * (rxq))
-#define     MVPP2_RXQ_OCCUPIED_MASK		0x3fff
-#define     MVPP2_RXQ_NON_OCCUPIED_OFFSET	16
-#define     MVPP2_RXQ_NON_OCCUPIED_MASK		0x3fff0000
-#define MVPP2_RXQ_THRESH_REG			0x204c
-#define     MVPP2_OCCUPIED_THRESH_OFFSET	0
-#define     MVPP2_OCCUPIED_THRESH_MASK		0x3fff
-#define MVPP2_RXQ_INDEX_REG			0x2050
-#define MVPP2_TXQ_NUM_REG			0x2080
-#define MVPP2_TXQ_DESC_ADDR_REG			0x2084
-#define MVPP2_TXQ_DESC_SIZE_REG			0x2088
-#define     MVPP2_TXQ_DESC_SIZE_MASK		0x3ff0
-#define MVPP2_TXQ_THRESH_REG			0x2094
-#define	    MVPP2_TXQ_THRESH_OFFSET		16
-#define	    MVPP2_TXQ_THRESH_MASK		0x3fff
-#define MVPP2_AGGR_TXQ_UPDATE_REG		0x2090
-#define MVPP2_TXQ_INDEX_REG			0x2098
-#define MVPP2_TXQ_PREF_BUF_REG			0x209c
-#define     MVPP2_PREF_BUF_PTR(desc)		((desc) & 0xfff)
-#define     MVPP2_PREF_BUF_SIZE_4		(BIT(12) | BIT(13))
-#define     MVPP2_PREF_BUF_SIZE_16		(BIT(12) | BIT(14))
-#define     MVPP2_PREF_BUF_THRESH(val)		((val) << 17)
-#define     MVPP2_TXQ_DRAIN_EN_MASK		BIT(31)
-#define MVPP2_TXQ_PENDING_REG			0x20a0
-#define     MVPP2_TXQ_PENDING_MASK		0x3fff
-#define MVPP2_TXQ_INT_STATUS_REG		0x20a4
-#define MVPP2_TXQ_SENT_REG(txq)			(0x3c00 + 4 * (txq))
-#define     MVPP2_TRANSMITTED_COUNT_OFFSET	16
-#define     MVPP2_TRANSMITTED_COUNT_MASK	0x3fff0000
-#define MVPP2_TXQ_RSVD_REQ_REG			0x20b0
-#define     MVPP2_TXQ_RSVD_REQ_Q_OFFSET		16
-#define MVPP2_TXQ_RSVD_RSLT_REG			0x20b4
-#define     MVPP2_TXQ_RSVD_RSLT_MASK		0x3fff
-#define MVPP2_TXQ_RSVD_CLR_REG			0x20b8
-#define     MVPP2_TXQ_RSVD_CLR_OFFSET		16
-#define MVPP2_AGGR_TXQ_DESC_ADDR_REG(cpu)	(0x2100 + 4 * (cpu))
-#define     MVPP22_AGGR_TXQ_DESC_ADDR_OFFS	8
-#define MVPP2_AGGR_TXQ_DESC_SIZE_REG(cpu)	(0x2140 + 4 * (cpu))
-#define     MVPP2_AGGR_TXQ_DESC_SIZE_MASK	0x3ff0
-#define MVPP2_AGGR_TXQ_STATUS_REG(cpu)		(0x2180 + 4 * (cpu))
-#define     MVPP2_AGGR_TXQ_PENDING_MASK		0x3fff
-#define MVPP2_AGGR_TXQ_INDEX_REG(cpu)		(0x21c0 + 4 * (cpu))
-
-/* MBUS bridge registers */
-#define MVPP2_WIN_BASE(w)			(0x4000 + ((w) << 2))
-#define MVPP2_WIN_SIZE(w)			(0x4020 + ((w) << 2))
-#define MVPP2_WIN_REMAP(w)			(0x4040 + ((w) << 2))
-#define MVPP2_BASE_ADDR_ENABLE			0x4060
-
-/* AXI Bridge Registers */
-#define MVPP22_AXI_BM_WR_ATTR_REG		0x4100
-#define MVPP22_AXI_BM_RD_ATTR_REG		0x4104
-#define MVPP22_AXI_AGGRQ_DESCR_RD_ATTR_REG	0x4110
-#define MVPP22_AXI_TXQ_DESCR_WR_ATTR_REG	0x4114
-#define MVPP22_AXI_TXQ_DESCR_RD_ATTR_REG	0x4118
-#define MVPP22_AXI_RXQ_DESCR_WR_ATTR_REG	0x411c
-#define MVPP22_AXI_RX_DATA_WR_ATTR_REG		0x4120
-#define MVPP22_AXI_TX_DATA_RD_ATTR_REG		0x4130
-#define MVPP22_AXI_RD_NORMAL_CODE_REG		0x4150
-#define MVPP22_AXI_RD_SNOOP_CODE_REG		0x4154
-#define MVPP22_AXI_WR_NORMAL_CODE_REG		0x4160
-#define MVPP22_AXI_WR_SNOOP_CODE_REG		0x4164
-
-/* Values for AXI Bridge registers */
-#define MVPP22_AXI_ATTR_CACHE_OFFS		0
-#define MVPP22_AXI_ATTR_DOMAIN_OFFS		12
-
-#define MVPP22_AXI_CODE_CACHE_OFFS		0
-#define MVPP22_AXI_CODE_DOMAIN_OFFS		4
-
-#define MVPP22_AXI_CODE_CACHE_NON_CACHE		0x3
-#define MVPP22_AXI_CODE_CACHE_WR_CACHE		0x7
-#define MVPP22_AXI_CODE_CACHE_RD_CACHE		0xb
-
-#define MVPP22_AXI_CODE_DOMAIN_OUTER_DOM	2
-#define MVPP22_AXI_CODE_DOMAIN_SYSTEM		3
-
-/* Interrupt Cause and Mask registers */
-#define MVPP2_ISR_TX_THRESHOLD_REG(port)	(0x5140 + 4 * (port))
-#define     MVPP2_MAX_ISR_TX_THRESHOLD		0xfffff0
-
-#define MVPP2_ISR_RX_THRESHOLD_REG(rxq)		(0x5200 + 4 * (rxq))
-#define     MVPP2_MAX_ISR_RX_THRESHOLD		0xfffff0
-#define MVPP21_ISR_RXQ_GROUP_REG(port)		(0x5400 + 4 * (port))
-
-#define MVPP22_ISR_RXQ_GROUP_INDEX_REG		0x5400
-#define MVPP22_ISR_RXQ_GROUP_INDEX_SUBGROUP_MASK 0xf
-#define MVPP22_ISR_RXQ_GROUP_INDEX_GROUP_MASK	0x380
-#define MVPP22_ISR_RXQ_GROUP_INDEX_GROUP_OFFSET	7
-
-#define MVPP22_ISR_RXQ_GROUP_INDEX_SUBGROUP_MASK 0xf
-#define MVPP22_ISR_RXQ_GROUP_INDEX_GROUP_MASK	0x380
-
-#define MVPP22_ISR_RXQ_SUB_GROUP_CONFIG_REG	0x5404
-#define MVPP22_ISR_RXQ_SUB_GROUP_STARTQ_MASK	0x1f
-#define MVPP22_ISR_RXQ_SUB_GROUP_SIZE_MASK	0xf00
-#define MVPP22_ISR_RXQ_SUB_GROUP_SIZE_OFFSET	8
-
-#define MVPP2_ISR_ENABLE_REG(port)		(0x5420 + 4 * (port))
-#define     MVPP2_ISR_ENABLE_INTERRUPT(mask)	((mask) & 0xffff)
-#define     MVPP2_ISR_DISABLE_INTERRUPT(mask)	(((mask) << 16) & 0xffff0000)
-#define MVPP2_ISR_RX_TX_CAUSE_REG(port)		(0x5480 + 4 * (port))
-#define     MVPP2_CAUSE_RXQ_OCCUP_DESC_ALL_MASK	0xffff
-#define     MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_MASK	0xff0000
-#define     MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_OFFSET	16
-#define     MVPP2_CAUSE_RX_FIFO_OVERRUN_MASK	BIT(24)
-#define     MVPP2_CAUSE_FCS_ERR_MASK		BIT(25)
-#define     MVPP2_CAUSE_TX_FIFO_UNDERRUN_MASK	BIT(26)
-#define     MVPP2_CAUSE_TX_EXCEPTION_SUM_MASK	BIT(29)
-#define     MVPP2_CAUSE_RX_EXCEPTION_SUM_MASK	BIT(30)
-#define     MVPP2_CAUSE_MISC_SUM_MASK		BIT(31)
-#define MVPP2_ISR_RX_TX_MASK_REG(port)		(0x54a0 + 4 * (port))
-#define MVPP2_ISR_PON_RX_TX_MASK_REG		0x54bc
-#define     MVPP2_PON_CAUSE_RXQ_OCCUP_DESC_ALL_MASK	0xffff
-#define     MVPP2_PON_CAUSE_TXP_OCCUP_DESC_ALL_MASK	0x3fc00000
-#define     MVPP2_PON_CAUSE_MISC_SUM_MASK		BIT(31)
-#define MVPP2_ISR_MISC_CAUSE_REG		0x55b0
-
-/* Buffer Manager registers */
-#define MVPP2_BM_POOL_BASE_REG(pool)		(0x6000 + ((pool) * 4))
-#define     MVPP2_BM_POOL_BASE_ADDR_MASK	0xfffff80
-#define MVPP2_BM_POOL_SIZE_REG(pool)		(0x6040 + ((pool) * 4))
-#define     MVPP2_BM_POOL_SIZE_MASK		0xfff0
-#define MVPP2_BM_POOL_READ_PTR_REG(pool)	(0x6080 + ((pool) * 4))
-#define     MVPP2_BM_POOL_GET_READ_PTR_MASK	0xfff0
-#define MVPP2_BM_POOL_PTRS_NUM_REG(pool)	(0x60c0 + ((pool) * 4))
-#define     MVPP2_BM_POOL_PTRS_NUM_MASK		0xfff0
-#define MVPP2_BM_BPPI_READ_PTR_REG(pool)	(0x6100 + ((pool) * 4))
-#define MVPP2_BM_BPPI_PTRS_NUM_REG(pool)	(0x6140 + ((pool) * 4))
-#define     MVPP2_BM_BPPI_PTR_NUM_MASK		0x7ff
-#define MVPP22_BM_POOL_PTRS_NUM_MASK		0xfff8
-#define     MVPP2_BM_BPPI_PREFETCH_FULL_MASK	BIT(16)
-#define MVPP2_BM_POOL_CTRL_REG(pool)		(0x6200 + ((pool) * 4))
-#define     MVPP2_BM_START_MASK			BIT(0)
-#define     MVPP2_BM_STOP_MASK			BIT(1)
-#define     MVPP2_BM_STATE_MASK			BIT(4)
-#define     MVPP2_BM_LOW_THRESH_OFFS		8
-#define     MVPP2_BM_LOW_THRESH_MASK		0x7f00
-#define     MVPP2_BM_LOW_THRESH_VALUE(val)	((val) << \
-						MVPP2_BM_LOW_THRESH_OFFS)
-#define     MVPP2_BM_HIGH_THRESH_OFFS		16
-#define     MVPP2_BM_HIGH_THRESH_MASK		0x7f0000
-#define     MVPP2_BM_HIGH_THRESH_VALUE(val)	((val) << \
-						MVPP2_BM_HIGH_THRESH_OFFS)
-#define MVPP2_BM_INTR_CAUSE_REG(pool)		(0x6240 + ((pool) * 4))
-#define     MVPP2_BM_RELEASED_DELAY_MASK	BIT(0)
-#define     MVPP2_BM_ALLOC_FAILED_MASK		BIT(1)
-#define     MVPP2_BM_BPPE_EMPTY_MASK		BIT(2)
-#define     MVPP2_BM_BPPE_FULL_MASK		BIT(3)
-#define     MVPP2_BM_AVAILABLE_BP_LOW_MASK	BIT(4)
-#define MVPP2_BM_INTR_MASK_REG(pool)		(0x6280 + ((pool) * 4))
-#define MVPP2_BM_PHY_ALLOC_REG(pool)		(0x6400 + ((pool) * 4))
-#define     MVPP2_BM_PHY_ALLOC_GRNTD_MASK	BIT(0)
-#define MVPP2_BM_VIRT_ALLOC_REG			0x6440
-#define MVPP22_BM_ADDR_HIGH_ALLOC		0x6444
-#define     MVPP22_BM_ADDR_HIGH_PHYS_MASK	0xff
-#define     MVPP22_BM_ADDR_HIGH_VIRT_MASK	0xff00
-#define     MVPP22_BM_ADDR_HIGH_VIRT_SHIFT	8
-#define MVPP2_BM_PHY_RLS_REG(pool)		(0x6480 + ((pool) * 4))
-#define     MVPP2_BM_PHY_RLS_MC_BUFF_MASK	BIT(0)
-#define     MVPP2_BM_PHY_RLS_PRIO_EN_MASK	BIT(1)
-#define     MVPP2_BM_PHY_RLS_GRNTD_MASK		BIT(2)
-#define MVPP2_BM_VIRT_RLS_REG			0x64c0
-#define MVPP22_BM_ADDR_HIGH_RLS_REG		0x64c4
-#define     MVPP22_BM_ADDR_HIGH_PHYS_RLS_MASK	0xff
-#define     MVPP22_BM_ADDR_HIGH_VIRT_RLS_MASK	0xff00
-#define     MVPP22_BM_ADDR_HIGH_VIRT_RLS_SHIFT	8
-
-/* TX Scheduler registers */
-#define MVPP2_TXP_SCHED_PORT_INDEX_REG		0x8000
-#define MVPP2_TXP_SCHED_Q_CMD_REG		0x8004
-#define     MVPP2_TXP_SCHED_ENQ_MASK		0xff
-#define     MVPP2_TXP_SCHED_DISQ_OFFSET		8
-#define MVPP2_TXP_SCHED_CMD_1_REG		0x8010
-#define MVPP2_TXP_SCHED_PERIOD_REG		0x8018
-#define MVPP2_TXP_SCHED_MTU_REG			0x801c
-#define     MVPP2_TXP_MTU_MAX			0x7FFFF
-#define MVPP2_TXP_SCHED_REFILL_REG		0x8020
-#define     MVPP2_TXP_REFILL_TOKENS_ALL_MASK	0x7ffff
-#define     MVPP2_TXP_REFILL_PERIOD_ALL_MASK	0x3ff00000
-#define     MVPP2_TXP_REFILL_PERIOD_MASK(v)	((v) << 20)
-#define MVPP2_TXP_SCHED_TOKEN_SIZE_REG		0x8024
-#define     MVPP2_TXP_TOKEN_SIZE_MAX		0xffffffff
-#define MVPP2_TXQ_SCHED_REFILL_REG(q)		(0x8040 + ((q) << 2))
-#define     MVPP2_TXQ_REFILL_TOKENS_ALL_MASK	0x7ffff
-#define     MVPP2_TXQ_REFILL_PERIOD_ALL_MASK	0x3ff00000
-#define     MVPP2_TXQ_REFILL_PERIOD_MASK(v)	((v) << 20)
-#define MVPP2_TXQ_SCHED_TOKEN_SIZE_REG(q)	(0x8060 + ((q) << 2))
-#define     MVPP2_TXQ_TOKEN_SIZE_MAX		0x7fffffff
-#define MVPP2_TXQ_SCHED_TOKEN_CNTR_REG(q)	(0x8080 + ((q) << 2))
-#define     MVPP2_TXQ_TOKEN_CNTR_MAX		0xffffffff
-
-/* TX general registers */
-#define MVPP2_TX_SNOOP_REG			0x8800
-#define MVPP2_TX_PORT_FLUSH_REG			0x8810
-#define     MVPP2_TX_PORT_FLUSH_MASK(port)	(1 << (port))
-
-/* LMS registers */
-#define MVPP2_SRC_ADDR_MIDDLE			0x24
-#define MVPP2_SRC_ADDR_HIGH			0x28
-#define MVPP2_PHY_AN_CFG0_REG			0x34
-#define     MVPP2_PHY_AN_STOP_SMI0_MASK		BIT(7)
-#define MVPP2_MNG_EXTENDED_GLOBAL_CTRL_REG	0x305c
-#define     MVPP2_EXT_GLOBAL_CTRL_DEFAULT	0x27
-
-/* Per-port registers */
-#define MVPP2_GMAC_CTRL_0_REG			0x0
-#define     MVPP2_GMAC_PORT_EN_MASK		BIT(0)
-#define     MVPP2_GMAC_PORT_TYPE_MASK		BIT(1)
-#define     MVPP2_GMAC_MAX_RX_SIZE_OFFS		2
-#define     MVPP2_GMAC_MAX_RX_SIZE_MASK		0x7ffc
-#define     MVPP2_GMAC_MIB_CNTR_EN_MASK		BIT(15)
-#define MVPP2_GMAC_CTRL_1_REG			0x4
-#define     MVPP2_GMAC_PERIODIC_XON_EN_MASK	BIT(1)
-#define     MVPP2_GMAC_GMII_LB_EN_MASK		BIT(5)
-#define     MVPP2_GMAC_PCS_LB_EN_BIT		6
-#define     MVPP2_GMAC_PCS_LB_EN_MASK		BIT(6)
-#define     MVPP2_GMAC_SA_LOW_OFFS		7
-#define MVPP2_GMAC_CTRL_2_REG			0x8
-#define     MVPP2_GMAC_INBAND_AN_MASK		BIT(0)
-#define     MVPP2_GMAC_FLOW_CTRL_MASK		GENMASK(2, 1)
-#define     MVPP2_GMAC_PCS_ENABLE_MASK		BIT(3)
-#define     MVPP2_GMAC_INTERNAL_CLK_MASK	BIT(4)
-#define     MVPP2_GMAC_DISABLE_PADDING		BIT(5)
-#define     MVPP2_GMAC_PORT_RESET_MASK		BIT(6)
-#define MVPP2_GMAC_AUTONEG_CONFIG		0xc
-#define     MVPP2_GMAC_FORCE_LINK_DOWN		BIT(0)
-#define     MVPP2_GMAC_FORCE_LINK_PASS		BIT(1)
-#define     MVPP2_GMAC_IN_BAND_AUTONEG		BIT(2)
-#define     MVPP2_GMAC_IN_BAND_AUTONEG_BYPASS	BIT(3)
-#define     MVPP2_GMAC_IN_BAND_RESTART_AN	BIT(4)
-#define     MVPP2_GMAC_CONFIG_MII_SPEED	BIT(5)
-#define     MVPP2_GMAC_CONFIG_GMII_SPEED	BIT(6)
-#define     MVPP2_GMAC_AN_SPEED_EN		BIT(7)
-#define     MVPP2_GMAC_FC_ADV_EN		BIT(9)
-#define     MVPP2_GMAC_FC_ADV_ASM_EN		BIT(10)
-#define     MVPP2_GMAC_FLOW_CTRL_AUTONEG	BIT(11)
-#define     MVPP2_GMAC_CONFIG_FULL_DUPLEX	BIT(12)
-#define     MVPP2_GMAC_AN_DUPLEX_EN		BIT(13)
-#define MVPP2_GMAC_STATUS0			0x10
-#define     MVPP2_GMAC_STATUS0_LINK_UP		BIT(0)
-#define     MVPP2_GMAC_STATUS0_GMII_SPEED	BIT(1)
-#define     MVPP2_GMAC_STATUS0_MII_SPEED	BIT(2)
-#define     MVPP2_GMAC_STATUS0_FULL_DUPLEX	BIT(3)
-#define     MVPP2_GMAC_STATUS0_RX_PAUSE		BIT(6)
-#define     MVPP2_GMAC_STATUS0_TX_PAUSE		BIT(7)
-#define     MVPP2_GMAC_STATUS0_AN_COMPLETE	BIT(11)
-#define MVPP2_GMAC_PORT_FIFO_CFG_1_REG		0x1c
-#define     MVPP2_GMAC_TX_FIFO_MIN_TH_OFFS	6
-#define     MVPP2_GMAC_TX_FIFO_MIN_TH_ALL_MASK	0x1fc0
-#define     MVPP2_GMAC_TX_FIFO_MIN_TH_MASK(v)	(((v) << 6) & \
-					MVPP2_GMAC_TX_FIFO_MIN_TH_ALL_MASK)
-#define MVPP22_GMAC_INT_STAT			0x20
-#define     MVPP22_GMAC_INT_STAT_LINK		BIT(1)
-#define MVPP22_GMAC_INT_MASK			0x24
-#define     MVPP22_GMAC_INT_MASK_LINK_STAT	BIT(1)
-#define MVPP22_GMAC_CTRL_4_REG			0x90
-#define     MVPP22_CTRL4_EXT_PIN_GMII_SEL	BIT(0)
-#define     MVPP22_CTRL4_RX_FC_EN		BIT(3)
-#define     MVPP22_CTRL4_TX_FC_EN		BIT(4)
-#define     MVPP22_CTRL4_DP_CLK_SEL		BIT(5)
-#define     MVPP22_CTRL4_SYNC_BYPASS_DIS	BIT(6)
-#define     MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE	BIT(7)
-#define MVPP22_GMAC_INT_SUM_MASK		0xa4
-#define     MVPP22_GMAC_INT_SUM_MASK_LINK_STAT	BIT(1)
-
-/* Per-port XGMAC registers. PPv2.2 only, only for GOP port 0,
- * relative to port->base.
- */
-#define MVPP22_XLG_CTRL0_REG			0x100
-#define     MVPP22_XLG_CTRL0_PORT_EN		BIT(0)
-#define     MVPP22_XLG_CTRL0_MAC_RESET_DIS	BIT(1)
-#define     MVPP22_XLG_CTRL0_RX_FLOW_CTRL_EN	BIT(7)
-#define     MVPP22_XLG_CTRL0_TX_FLOW_CTRL_EN	BIT(8)
-#define     MVPP22_XLG_CTRL0_MIB_CNT_DIS	BIT(14)
-#define MVPP22_XLG_CTRL1_REG			0x104
-#define     MVPP22_XLG_CTRL1_FRAMESIZELIMIT_OFFS	0
-#define     MVPP22_XLG_CTRL1_FRAMESIZELIMIT_MASK	0x1fff
-#define MVPP22_XLG_STATUS			0x10c
-#define     MVPP22_XLG_STATUS_LINK_UP		BIT(0)
-#define MVPP22_XLG_INT_STAT			0x114
-#define     MVPP22_XLG_INT_STAT_LINK		BIT(1)
-#define MVPP22_XLG_INT_MASK			0x118
-#define     MVPP22_XLG_INT_MASK_LINK		BIT(1)
-#define MVPP22_XLG_CTRL3_REG			0x11c
-#define     MVPP22_XLG_CTRL3_MACMODESELECT_MASK	(7 << 13)
-#define     MVPP22_XLG_CTRL3_MACMODESELECT_GMAC	(0 << 13)
-#define     MVPP22_XLG_CTRL3_MACMODESELECT_10G	(1 << 13)
-#define MVPP22_XLG_EXT_INT_MASK			0x15c
-#define     MVPP22_XLG_EXT_INT_MASK_XLG		BIT(1)
-#define     MVPP22_XLG_EXT_INT_MASK_GIG		BIT(2)
-#define MVPP22_XLG_CTRL4_REG			0x184
-#define     MVPP22_XLG_CTRL4_FWD_FC		BIT(5)
-#define     MVPP22_XLG_CTRL4_FWD_PFC		BIT(6)
-#define     MVPP22_XLG_CTRL4_MACMODSELECT_GMAC	BIT(12)
-#define     MVPP22_XLG_CTRL4_EN_IDLE_CHECK	BIT(14)
-
-/* SMI registers. PPv2.2 only, relative to priv->iface_base. */
-#define MVPP22_SMI_MISC_CFG_REG			0x1204
-#define     MVPP22_SMI_POLLING_EN		BIT(10)
-
-#define MVPP22_GMAC_BASE(port)		(0x7000 + (port) * 0x1000 + 0xe00)
-
-#define MVPP2_CAUSE_TXQ_SENT_DESC_ALL_MASK	0xff
-
-/* Descriptor ring Macros */
-#define MVPP2_QUEUE_NEXT_DESC(q, index) \
-	(((index) < (q)->last_desc) ? ((index) + 1) : 0)
-
-/* XPCS registers. PPv2.2 only */
-#define MVPP22_MPCS_BASE(port)			(0x7000 + (port) * 0x1000)
-#define MVPP22_MPCS_CTRL			0x14
-#define     MVPP22_MPCS_CTRL_FWD_ERR_CONN	BIT(10)
-#define MVPP22_MPCS_CLK_RESET			0x14c
-#define     MAC_CLK_RESET_SD_TX			BIT(0)
-#define     MAC_CLK_RESET_SD_RX			BIT(1)
-#define     MAC_CLK_RESET_MAC			BIT(2)
-#define     MVPP22_MPCS_CLK_RESET_DIV_RATIO(n)	((n) << 4)
-#define     MVPP22_MPCS_CLK_RESET_DIV_SET	BIT(11)
-
-/* XPCS registers. PPv2.2 only */
-#define MVPP22_XPCS_BASE(port)			(0x7400 + (port) * 0x1000)
-#define MVPP22_XPCS_CFG0			0x0
-#define     MVPP22_XPCS_CFG0_PCS_MODE(n)	((n) << 3)
-#define     MVPP22_XPCS_CFG0_ACTIVE_LANE(n)	((n) << 5)
-
-/* System controller registers. Accessed through a regmap. */
-#define GENCONF_SOFT_RESET1				0x1108
-#define     GENCONF_SOFT_RESET1_GOP			BIT(6)
-#define GENCONF_PORT_CTRL0				0x1110
-#define     GENCONF_PORT_CTRL0_BUS_WIDTH_SELECT		BIT(1)
-#define     GENCONF_PORT_CTRL0_RX_DATA_SAMPLE		BIT(29)
-#define     GENCONF_PORT_CTRL0_CLK_DIV_PHASE_CLR	BIT(31)
-#define GENCONF_PORT_CTRL1				0x1114
-#define     GENCONF_PORT_CTRL1_EN(p)			BIT(p)
-#define     GENCONF_PORT_CTRL1_RESET(p)			(BIT(p) << 28)
-#define GENCONF_CTRL0					0x1120
-#define     GENCONF_CTRL0_PORT0_RGMII			BIT(0)
-#define     GENCONF_CTRL0_PORT1_RGMII_MII		BIT(1)
-#define     GENCONF_CTRL0_PORT1_RGMII			BIT(2)
-
-/* Various constants */
-
-/* Coalescing */
-#define MVPP2_TXDONE_COAL_PKTS_THRESH	64
-#define MVPP2_TXDONE_HRTIMER_PERIOD_NS	1000000UL
-#define MVPP2_TXDONE_COAL_USEC		1000
-#define MVPP2_RX_COAL_PKTS		32
-#define MVPP2_RX_COAL_USEC		64
-
-/* The two bytes Marvell header. Either contains a special value used
- * by Marvell switches when a specific hardware mode is enabled (not
- * supported by this driver) or is filled automatically by zeroes on
- * the RX side. Those two bytes being at the front of the Ethernet
- * header, they allow to have the IP header aligned on a 4 bytes
- * boundary automatically: the hardware skips those two bytes on its
- * own.
- */
-#define MVPP2_MH_SIZE			2
-#define MVPP2_ETH_TYPE_LEN		2
-#define MVPP2_PPPOE_HDR_SIZE		8
-#define MVPP2_VLAN_TAG_LEN		4
-#define MVPP2_VLAN_TAG_EDSA_LEN		8
-
-/* Lbtd 802.3 type */
-#define MVPP2_IP_LBDT_TYPE		0xfffa
-
-#define MVPP2_TX_CSUM_MAX_SIZE		9800
-
-/* Timeout constants */
-#define MVPP2_TX_DISABLE_TIMEOUT_MSEC	1000
-#define MVPP2_TX_PENDING_TIMEOUT_MSEC	1000
-
-#define MVPP2_TX_MTU_MAX		0x7ffff
-
-/* Maximum number of T-CONTs of PON port */
-#define MVPP2_MAX_TCONT			16
-
-/* Maximum number of supported ports */
-#define MVPP2_MAX_PORTS			4
-
-/* Maximum number of TXQs used by single port */
-#define MVPP2_MAX_TXQ			8
-
-/* MVPP2_MAX_TSO_SEGS is the maximum number of fragments to allow in the GSO
- * skb. As we need a maxium of two descriptors per fragments (1 header, 1 data),
- * multiply this value by two to count the maximum number of skb descs needed.
- */
-#define MVPP2_MAX_TSO_SEGS		300
-#define MVPP2_MAX_SKB_DESCS		(MVPP2_MAX_TSO_SEGS * 2 + MAX_SKB_FRAGS)
-
-/* Dfault number of RXQs in use */
-#define MVPP2_DEFAULT_RXQ		4
-
-/* Max number of Rx descriptors */
-#define MVPP2_MAX_RXD_MAX		1024
-#define MVPP2_MAX_RXD_DFLT		128
-
-/* Max number of Tx descriptors */
-#define MVPP2_MAX_TXD_MAX		2048
-#define MVPP2_MAX_TXD_DFLT		1024
-
-/* Amount of Tx descriptors that can be reserved at once by CPU */
-#define MVPP2_CPU_DESC_CHUNK		64
-
-/* Max number of Tx descriptors in each aggregated queue */
-#define MVPP2_AGGR_TXQ_SIZE		256
-
-/* Descriptor aligned size */
-#define MVPP2_DESC_ALIGNED_SIZE		32
-
-/* Descriptor alignment mask */
-#define MVPP2_TX_DESC_ALIGN		(MVPP2_DESC_ALIGNED_SIZE - 1)
-
-/* RX FIFO constants */
-#define MVPP2_RX_FIFO_PORT_DATA_SIZE_32KB	0x8000
-#define MVPP2_RX_FIFO_PORT_DATA_SIZE_8KB	0x2000
-#define MVPP2_RX_FIFO_PORT_DATA_SIZE_4KB	0x1000
-#define MVPP2_RX_FIFO_PORT_ATTR_SIZE_32KB	0x200
-#define MVPP2_RX_FIFO_PORT_ATTR_SIZE_8KB	0x80
-#define MVPP2_RX_FIFO_PORT_ATTR_SIZE_4KB	0x40
-#define MVPP2_RX_FIFO_PORT_MIN_PKT		0x80
-
-/* TX FIFO constants */
-#define MVPP22_TX_FIFO_DATA_SIZE_10KB		0xa
-#define MVPP22_TX_FIFO_DATA_SIZE_3KB		0x3
-#define MVPP2_TX_FIFO_THRESHOLD_MIN		256
-#define MVPP2_TX_FIFO_THRESHOLD_10KB	\
-	(MVPP22_TX_FIFO_DATA_SIZE_10KB * 1024 - MVPP2_TX_FIFO_THRESHOLD_MIN)
-#define MVPP2_TX_FIFO_THRESHOLD_3KB	\
-	(MVPP22_TX_FIFO_DATA_SIZE_3KB * 1024 - MVPP2_TX_FIFO_THRESHOLD_MIN)
-
-/* RX buffer constants */
-#define MVPP2_SKB_SHINFO_SIZE \
-	SKB_DATA_ALIGN(sizeof(struct skb_shared_info))
-
-#define MVPP2_RX_PKT_SIZE(mtu) \
-	ALIGN((mtu) + MVPP2_MH_SIZE + MVPP2_VLAN_TAG_LEN + \
-	      ETH_HLEN + ETH_FCS_LEN, cache_line_size())
-
-#define MVPP2_RX_BUF_SIZE(pkt_size)	((pkt_size) + NET_SKB_PAD)
-#define MVPP2_RX_TOTAL_SIZE(buf_size)	((buf_size) + MVPP2_SKB_SHINFO_SIZE)
-#define MVPP2_RX_MAX_PKT_SIZE(total_size) \
-	((total_size) - NET_SKB_PAD - MVPP2_SKB_SHINFO_SIZE)
-
-#define MVPP2_BIT_TO_BYTE(bit)		((bit) / 8)
-
-/* IPv6 max L3 address size */
-#define MVPP2_MAX_L3_ADDR_SIZE		16
-
-/* Port flags */
-#define MVPP2_F_LOOPBACK		BIT(0)
-
-/* Marvell tag types */
-enum mvpp2_tag_type {
-	MVPP2_TAG_TYPE_NONE = 0,
-	MVPP2_TAG_TYPE_MH   = 1,
-	MVPP2_TAG_TYPE_DSA  = 2,
-	MVPP2_TAG_TYPE_EDSA = 3,
-	MVPP2_TAG_TYPE_VLAN = 4,
-	MVPP2_TAG_TYPE_LAST = 5
-};
-
-/* Parser constants */
-#define MVPP2_PRS_TCAM_SRAM_SIZE	256
-#define MVPP2_PRS_TCAM_WORDS		6
-#define MVPP2_PRS_SRAM_WORDS		4
-#define MVPP2_PRS_FLOW_ID_SIZE		64
-#define MVPP2_PRS_FLOW_ID_MASK		0x3f
-#define MVPP2_PRS_TCAM_ENTRY_INVALID	1
-#define MVPP2_PRS_TCAM_DSA_TAGGED_BIT	BIT(5)
-#define MVPP2_PRS_IPV4_HEAD		0x40
-#define MVPP2_PRS_IPV4_HEAD_MASK	0xf0
-#define MVPP2_PRS_IPV4_MC		0xe0
-#define MVPP2_PRS_IPV4_MC_MASK		0xf0
-#define MVPP2_PRS_IPV4_BC_MASK		0xff
-#define MVPP2_PRS_IPV4_IHL		0x5
-#define MVPP2_PRS_IPV4_IHL_MASK		0xf
-#define MVPP2_PRS_IPV6_MC		0xff
-#define MVPP2_PRS_IPV6_MC_MASK		0xff
-#define MVPP2_PRS_IPV6_HOP_MASK		0xff
-#define MVPP2_PRS_TCAM_PROTO_MASK	0xff
-#define MVPP2_PRS_TCAM_PROTO_MASK_L	0x3f
-#define MVPP2_PRS_DBL_VLANS_MAX		100
-#define MVPP2_PRS_CAST_MASK		BIT(0)
-#define MVPP2_PRS_MCAST_VAL		BIT(0)
-#define MVPP2_PRS_UCAST_VAL		0x0
-
-/* Tcam structure:
- * - lookup ID - 4 bits
- * - port ID - 1 byte
- * - additional information - 1 byte
- * - header data - 8 bytes
- * The fields are represented by MVPP2_PRS_TCAM_DATA_REG(5)->(0).
- */
-#define MVPP2_PRS_AI_BITS			8
-#define MVPP2_PRS_PORT_MASK			0xff
-#define MVPP2_PRS_LU_MASK			0xf
-#define MVPP2_PRS_TCAM_DATA_BYTE(offs)		\
-				    (((offs) - ((offs) % 2)) * 2 + ((offs) % 2))
-#define MVPP2_PRS_TCAM_DATA_BYTE_EN(offs)	\
-					      (((offs) * 2) - ((offs) % 2)  + 2)
-#define MVPP2_PRS_TCAM_AI_BYTE			16
-#define MVPP2_PRS_TCAM_PORT_BYTE		17
-#define MVPP2_PRS_TCAM_LU_BYTE			20
-#define MVPP2_PRS_TCAM_EN_OFFS(offs)		((offs) + 2)
-#define MVPP2_PRS_TCAM_INV_WORD			5
-
-#define MVPP2_PRS_VID_TCAM_BYTE         2
-
-/* TCAM range for unicast and multicast filtering. We have 25 entries per port,
- * with 4 dedicated to UC filtering and the rest to multicast filtering.
- * Additionnally we reserve one entry for the broadcast address, and one for
- * each port's own address.
- */
-#define MVPP2_PRS_MAC_UC_MC_FILT_MAX	25
-#define MVPP2_PRS_MAC_RANGE_SIZE	80
-
-/* Number of entries per port dedicated to UC and MC filtering */
-#define MVPP2_PRS_MAC_UC_FILT_MAX	4
-#define MVPP2_PRS_MAC_MC_FILT_MAX	(MVPP2_PRS_MAC_UC_MC_FILT_MAX - \
-					 MVPP2_PRS_MAC_UC_FILT_MAX)
-
-/* There is a TCAM range reserved for VLAN filtering entries, range size is 33
- * 10 VLAN ID filter entries per port
- * 1 default VLAN filter entry per port
- * It is assumed that there are 3 ports for filter, not including loopback port
- */
-#define MVPP2_PRS_VLAN_FILT_MAX		11
-#define MVPP2_PRS_VLAN_FILT_RANGE_SIZE	33
-
-#define MVPP2_PRS_VLAN_FILT_MAX_ENTRY   (MVPP2_PRS_VLAN_FILT_MAX - 2)
-#define MVPP2_PRS_VLAN_FILT_DFLT_ENTRY  (MVPP2_PRS_VLAN_FILT_MAX - 1)
-
-/* Tcam entries ID */
-#define MVPP2_PE_DROP_ALL		0
-#define MVPP2_PE_FIRST_FREE_TID		1
-
-/* MAC filtering range */
-#define MVPP2_PE_MAC_RANGE_END		(MVPP2_PE_VID_FILT_RANGE_START - 1)
-#define MVPP2_PE_MAC_RANGE_START	(MVPP2_PE_MAC_RANGE_END - \
-						MVPP2_PRS_MAC_RANGE_SIZE + 1)
-/* VLAN filtering range */
-#define MVPP2_PE_VID_FILT_RANGE_END     (MVPP2_PRS_TCAM_SRAM_SIZE - 31)
-#define MVPP2_PE_VID_FILT_RANGE_START   (MVPP2_PE_VID_FILT_RANGE_END - \
-					 MVPP2_PRS_VLAN_FILT_RANGE_SIZE + 1)
-#define MVPP2_PE_LAST_FREE_TID          (MVPP2_PE_MAC_RANGE_START - 1)
-#define MVPP2_PE_IP6_EXT_PROTO_UN	(MVPP2_PRS_TCAM_SRAM_SIZE - 30)
-#define MVPP2_PE_IP6_ADDR_UN		(MVPP2_PRS_TCAM_SRAM_SIZE - 29)
-#define MVPP2_PE_IP4_ADDR_UN		(MVPP2_PRS_TCAM_SRAM_SIZE - 28)
-#define MVPP2_PE_LAST_DEFAULT_FLOW	(MVPP2_PRS_TCAM_SRAM_SIZE - 27)
-#define MVPP2_PE_FIRST_DEFAULT_FLOW	(MVPP2_PRS_TCAM_SRAM_SIZE - 22)
-#define MVPP2_PE_EDSA_TAGGED		(MVPP2_PRS_TCAM_SRAM_SIZE - 21)
-#define MVPP2_PE_EDSA_UNTAGGED		(MVPP2_PRS_TCAM_SRAM_SIZE - 20)
-#define MVPP2_PE_DSA_TAGGED		(MVPP2_PRS_TCAM_SRAM_SIZE - 19)
-#define MVPP2_PE_DSA_UNTAGGED		(MVPP2_PRS_TCAM_SRAM_SIZE - 18)
-#define MVPP2_PE_ETYPE_EDSA_TAGGED	(MVPP2_PRS_TCAM_SRAM_SIZE - 17)
-#define MVPP2_PE_ETYPE_EDSA_UNTAGGED	(MVPP2_PRS_TCAM_SRAM_SIZE - 16)
-#define MVPP2_PE_ETYPE_DSA_TAGGED	(MVPP2_PRS_TCAM_SRAM_SIZE - 15)
-#define MVPP2_PE_ETYPE_DSA_UNTAGGED	(MVPP2_PRS_TCAM_SRAM_SIZE - 14)
-#define MVPP2_PE_MH_DEFAULT		(MVPP2_PRS_TCAM_SRAM_SIZE - 13)
-#define MVPP2_PE_DSA_DEFAULT		(MVPP2_PRS_TCAM_SRAM_SIZE - 12)
-#define MVPP2_PE_IP6_PROTO_UN		(MVPP2_PRS_TCAM_SRAM_SIZE - 11)
-#define MVPP2_PE_IP4_PROTO_UN		(MVPP2_PRS_TCAM_SRAM_SIZE - 10)
-#define MVPP2_PE_ETH_TYPE_UN		(MVPP2_PRS_TCAM_SRAM_SIZE - 9)
-#define MVPP2_PE_VID_FLTR_DEFAULT	(MVPP2_PRS_TCAM_SRAM_SIZE - 8)
-#define MVPP2_PE_VID_EDSA_FLTR_DEFAULT	(MVPP2_PRS_TCAM_SRAM_SIZE - 7)
-#define MVPP2_PE_VLAN_DBL		(MVPP2_PRS_TCAM_SRAM_SIZE - 6)
-#define MVPP2_PE_VLAN_NONE		(MVPP2_PRS_TCAM_SRAM_SIZE - 5)
-/* reserved */
-#define MVPP2_PE_MAC_MC_PROMISCUOUS	(MVPP2_PRS_TCAM_SRAM_SIZE - 3)
-#define MVPP2_PE_MAC_UC_PROMISCUOUS	(MVPP2_PRS_TCAM_SRAM_SIZE - 2)
-#define MVPP2_PE_MAC_NON_PROMISCUOUS	(MVPP2_PRS_TCAM_SRAM_SIZE - 1)
-
-#define MVPP2_PRS_VID_PORT_FIRST(port)	(MVPP2_PE_VID_FILT_RANGE_START + \
-					 ((port) * MVPP2_PRS_VLAN_FILT_MAX))
-#define MVPP2_PRS_VID_PORT_LAST(port)	(MVPP2_PRS_VID_PORT_FIRST(port) \
-					 + MVPP2_PRS_VLAN_FILT_MAX_ENTRY)
-/* Index of default vid filter for given port */
-#define MVPP2_PRS_VID_PORT_DFLT(port)	(MVPP2_PRS_VID_PORT_FIRST(port) \
-					 + MVPP2_PRS_VLAN_FILT_DFLT_ENTRY)
-
-/* Sram structure
- * The fields are represented by MVPP2_PRS_TCAM_DATA_REG(3)->(0).
- */
-#define MVPP2_PRS_SRAM_RI_OFFS			0
-#define MVPP2_PRS_SRAM_RI_WORD			0
-#define MVPP2_PRS_SRAM_RI_CTRL_OFFS		32
-#define MVPP2_PRS_SRAM_RI_CTRL_WORD		1
-#define MVPP2_PRS_SRAM_RI_CTRL_BITS		32
-#define MVPP2_PRS_SRAM_SHIFT_OFFS		64
-#define MVPP2_PRS_SRAM_SHIFT_SIGN_BIT		72
-#define MVPP2_PRS_SRAM_UDF_OFFS			73
-#define MVPP2_PRS_SRAM_UDF_BITS			8
-#define MVPP2_PRS_SRAM_UDF_MASK			0xff
-#define MVPP2_PRS_SRAM_UDF_SIGN_BIT		81
-#define MVPP2_PRS_SRAM_UDF_TYPE_OFFS		82
-#define MVPP2_PRS_SRAM_UDF_TYPE_MASK		0x7
-#define MVPP2_PRS_SRAM_UDF_TYPE_L3		1
-#define MVPP2_PRS_SRAM_UDF_TYPE_L4		4
-#define MVPP2_PRS_SRAM_OP_SEL_SHIFT_OFFS	85
-#define MVPP2_PRS_SRAM_OP_SEL_SHIFT_MASK	0x3
-#define MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD		1
-#define MVPP2_PRS_SRAM_OP_SEL_SHIFT_IP4_ADD	2
-#define MVPP2_PRS_SRAM_OP_SEL_SHIFT_IP6_ADD	3
-#define MVPP2_PRS_SRAM_OP_SEL_UDF_OFFS		87
-#define MVPP2_PRS_SRAM_OP_SEL_UDF_BITS		2
-#define MVPP2_PRS_SRAM_OP_SEL_UDF_MASK		0x3
-#define MVPP2_PRS_SRAM_OP_SEL_UDF_ADD		0
-#define MVPP2_PRS_SRAM_OP_SEL_UDF_IP4_ADD	2
-#define MVPP2_PRS_SRAM_OP_SEL_UDF_IP6_ADD	3
-#define MVPP2_PRS_SRAM_OP_SEL_BASE_OFFS		89
-#define MVPP2_PRS_SRAM_AI_OFFS			90
-#define MVPP2_PRS_SRAM_AI_CTRL_OFFS		98
-#define MVPP2_PRS_SRAM_AI_CTRL_BITS		8
-#define MVPP2_PRS_SRAM_AI_MASK			0xff
-#define MVPP2_PRS_SRAM_NEXT_LU_OFFS		106
-#define MVPP2_PRS_SRAM_NEXT_LU_MASK		0xf
-#define MVPP2_PRS_SRAM_LU_DONE_BIT		110
-#define MVPP2_PRS_SRAM_LU_GEN_BIT		111
-
-/* Sram result info bits assignment */
-#define MVPP2_PRS_RI_MAC_ME_MASK		0x1
-#define MVPP2_PRS_RI_DSA_MASK			0x2
-#define MVPP2_PRS_RI_VLAN_MASK			(BIT(2) | BIT(3))
-#define MVPP2_PRS_RI_VLAN_NONE			0x0
-#define MVPP2_PRS_RI_VLAN_SINGLE		BIT(2)
-#define MVPP2_PRS_RI_VLAN_DOUBLE		BIT(3)
-#define MVPP2_PRS_RI_VLAN_TRIPLE		(BIT(2) | BIT(3))
-#define MVPP2_PRS_RI_CPU_CODE_MASK		0x70
-#define MVPP2_PRS_RI_CPU_CODE_RX_SPEC		BIT(4)
-#define MVPP2_PRS_RI_L2_CAST_MASK		(BIT(9) | BIT(10))
-#define MVPP2_PRS_RI_L2_UCAST			0x0
-#define MVPP2_PRS_RI_L2_MCAST			BIT(9)
-#define MVPP2_PRS_RI_L2_BCAST			BIT(10)
-#define MVPP2_PRS_RI_PPPOE_MASK			0x800
-#define MVPP2_PRS_RI_L3_PROTO_MASK		(BIT(12) | BIT(13) | BIT(14))
-#define MVPP2_PRS_RI_L3_UN			0x0
-#define MVPP2_PRS_RI_L3_IP4			BIT(12)
-#define MVPP2_PRS_RI_L3_IP4_OPT			BIT(13)
-#define MVPP2_PRS_RI_L3_IP4_OTHER		(BIT(12) | BIT(13))
-#define MVPP2_PRS_RI_L3_IP6			BIT(14)
-#define MVPP2_PRS_RI_L3_IP6_EXT			(BIT(12) | BIT(14))
-#define MVPP2_PRS_RI_L3_ARP			(BIT(13) | BIT(14))
-#define MVPP2_PRS_RI_L3_ADDR_MASK		(BIT(15) | BIT(16))
-#define MVPP2_PRS_RI_L3_UCAST			0x0
-#define MVPP2_PRS_RI_L3_MCAST			BIT(15)
-#define MVPP2_PRS_RI_L3_BCAST			(BIT(15) | BIT(16))
-#define MVPP2_PRS_RI_IP_FRAG_MASK		0x20000
-#define MVPP2_PRS_RI_IP_FRAG_TRUE		BIT(17)
-#define MVPP2_PRS_RI_UDF3_MASK			0x300000
-#define MVPP2_PRS_RI_UDF3_RX_SPECIAL		BIT(21)
-#define MVPP2_PRS_RI_L4_PROTO_MASK		0x1c00000
-#define MVPP2_PRS_RI_L4_TCP			BIT(22)
-#define MVPP2_PRS_RI_L4_UDP			BIT(23)
-#define MVPP2_PRS_RI_L4_OTHER			(BIT(22) | BIT(23))
-#define MVPP2_PRS_RI_UDF7_MASK			0x60000000
-#define MVPP2_PRS_RI_UDF7_IP6_LITE		BIT(29)
-#define MVPP2_PRS_RI_DROP_MASK			0x80000000
-
-/* Sram additional info bits assignment */
-#define MVPP2_PRS_IPV4_DIP_AI_BIT		BIT(0)
-#define MVPP2_PRS_IPV6_NO_EXT_AI_BIT		BIT(0)
-#define MVPP2_PRS_IPV6_EXT_AI_BIT		BIT(1)
-#define MVPP2_PRS_IPV6_EXT_AH_AI_BIT		BIT(2)
-#define MVPP2_PRS_IPV6_EXT_AH_LEN_AI_BIT	BIT(3)
-#define MVPP2_PRS_IPV6_EXT_AH_L4_AI_BIT		BIT(4)
-#define MVPP2_PRS_SINGLE_VLAN_AI		0
-#define MVPP2_PRS_DBL_VLAN_AI_BIT		BIT(7)
-#define MVPP2_PRS_EDSA_VID_AI_BIT		BIT(0)
-
-/* DSA/EDSA type */
-#define MVPP2_PRS_TAGGED		true
-#define MVPP2_PRS_UNTAGGED		false
-#define MVPP2_PRS_EDSA			true
-#define MVPP2_PRS_DSA			false
-
-/* MAC entries, shadow udf */
-enum mvpp2_prs_udf {
-	MVPP2_PRS_UDF_MAC_DEF,
-	MVPP2_PRS_UDF_MAC_RANGE,
-	MVPP2_PRS_UDF_L2_DEF,
-	MVPP2_PRS_UDF_L2_DEF_COPY,
-	MVPP2_PRS_UDF_L2_USER,
-};
-
-/* Lookup ID */
-enum mvpp2_prs_lookup {
-	MVPP2_PRS_LU_MH,
-	MVPP2_PRS_LU_MAC,
-	MVPP2_PRS_LU_DSA,
-	MVPP2_PRS_LU_VLAN,
-	MVPP2_PRS_LU_VID,
-	MVPP2_PRS_LU_L2,
-	MVPP2_PRS_LU_PPPOE,
-	MVPP2_PRS_LU_IP4,
-	MVPP2_PRS_LU_IP6,
-	MVPP2_PRS_LU_FLOWS,
-	MVPP2_PRS_LU_LAST,
-};
-
-/* L2 cast enum */
-enum mvpp2_prs_l2_cast {
-	MVPP2_PRS_L2_UNI_CAST,
-	MVPP2_PRS_L2_MULTI_CAST,
-};
-
-/* L3 cast enum */
-enum mvpp2_prs_l3_cast {
-	MVPP2_PRS_L3_UNI_CAST,
-	MVPP2_PRS_L3_MULTI_CAST,
-	MVPP2_PRS_L3_BROAD_CAST
-};
-
-/* Classifier constants */
-#define MVPP2_CLS_FLOWS_TBL_SIZE	512
-#define MVPP2_CLS_FLOWS_TBL_DATA_WORDS	3
-#define MVPP2_CLS_LKP_TBL_SIZE		64
-#define MVPP2_CLS_RX_QUEUES		256
-
-/* RSS constants */
-#define MVPP22_RSS_TABLE_ENTRIES	32
-
-/* BM constants */
-#define MVPP2_BM_JUMBO_BUF_NUM		512
-#define MVPP2_BM_LONG_BUF_NUM		1024
-#define MVPP2_BM_SHORT_BUF_NUM		2048
-#define MVPP2_BM_POOL_SIZE_MAX		(16*1024 - MVPP2_BM_POOL_PTR_ALIGN/4)
-#define MVPP2_BM_POOL_PTR_ALIGN		128
-
-/* BM cookie (32 bits) definition */
-#define MVPP2_BM_COOKIE_POOL_OFFS	8
-#define MVPP2_BM_COOKIE_CPU_OFFS	24
-
-#define MVPP2_BM_SHORT_FRAME_SIZE		512
-#define MVPP2_BM_LONG_FRAME_SIZE		2048
-#define MVPP2_BM_JUMBO_FRAME_SIZE		10240
-/* BM short pool packet size
- * These value assure that for SWF the total number
- * of bytes allocated for each buffer will be 512
- */
-#define MVPP2_BM_SHORT_PKT_SIZE	MVPP2_RX_MAX_PKT_SIZE(MVPP2_BM_SHORT_FRAME_SIZE)
-#define MVPP2_BM_LONG_PKT_SIZE	MVPP2_RX_MAX_PKT_SIZE(MVPP2_BM_LONG_FRAME_SIZE)
-#define MVPP2_BM_JUMBO_PKT_SIZE	MVPP2_RX_MAX_PKT_SIZE(MVPP2_BM_JUMBO_FRAME_SIZE)
-
-#define MVPP21_ADDR_SPACE_SZ		0
-#define MVPP22_ADDR_SPACE_SZ		SZ_64K
-
-#define MVPP2_MAX_THREADS		8
-#define MVPP2_MAX_QVECS			MVPP2_MAX_THREADS
-
-enum mvpp2_bm_pool_log_num {
-	MVPP2_BM_SHORT,
-	MVPP2_BM_LONG,
-	MVPP2_BM_JUMBO,
-	MVPP2_BM_POOLS_NUM
-};
-
-static struct {
-	int pkt_size;
-	int buf_num;
-} mvpp2_pools[MVPP2_BM_POOLS_NUM];
-
-/* GMAC MIB Counters register definitions */
-#define MVPP21_MIB_COUNTERS_OFFSET		0x1000
-#define MVPP21_MIB_COUNTERS_PORT_SZ		0x400
-#define MVPP22_MIB_COUNTERS_OFFSET		0x0
-#define MVPP22_MIB_COUNTERS_PORT_SZ		0x100
-
-#define MVPP2_MIB_GOOD_OCTETS_RCVD		0x0
-#define MVPP2_MIB_BAD_OCTETS_RCVD		0x8
-#define MVPP2_MIB_CRC_ERRORS_SENT		0xc
-#define MVPP2_MIB_UNICAST_FRAMES_RCVD		0x10
-#define MVPP2_MIB_BROADCAST_FRAMES_RCVD		0x18
-#define MVPP2_MIB_MULTICAST_FRAMES_RCVD		0x1c
-#define MVPP2_MIB_FRAMES_64_OCTETS		0x20
-#define MVPP2_MIB_FRAMES_65_TO_127_OCTETS	0x24
-#define MVPP2_MIB_FRAMES_128_TO_255_OCTETS	0x28
-#define MVPP2_MIB_FRAMES_256_TO_511_OCTETS	0x2c
-#define MVPP2_MIB_FRAMES_512_TO_1023_OCTETS	0x30
-#define MVPP2_MIB_FRAMES_1024_TO_MAX_OCTETS	0x34
-#define MVPP2_MIB_GOOD_OCTETS_SENT		0x38
-#define MVPP2_MIB_UNICAST_FRAMES_SENT		0x40
-#define MVPP2_MIB_MULTICAST_FRAMES_SENT		0x48
-#define MVPP2_MIB_BROADCAST_FRAMES_SENT		0x4c
-#define MVPP2_MIB_FC_SENT			0x54
-#define MVPP2_MIB_FC_RCVD			0x58
-#define MVPP2_MIB_RX_FIFO_OVERRUN		0x5c
-#define MVPP2_MIB_UNDERSIZE_RCVD		0x60
-#define MVPP2_MIB_FRAGMENTS_RCVD		0x64
-#define MVPP2_MIB_OVERSIZE_RCVD			0x68
-#define MVPP2_MIB_JABBER_RCVD			0x6c
-#define MVPP2_MIB_MAC_RCV_ERROR			0x70
-#define MVPP2_MIB_BAD_CRC_EVENT			0x74
-#define MVPP2_MIB_COLLISION			0x78
-#define MVPP2_MIB_LATE_COLLISION		0x7c
-
-#define MVPP2_MIB_COUNTERS_STATS_DELAY		(1 * HZ)
-
-#define MVPP2_DESC_DMA_MASK	DMA_BIT_MASK(40)
-
-/* Definitions */
-
-/* Shared Packet Processor resources */
-struct mvpp2 {
-	/* Shared registers' base addresses */
-	void __iomem *lms_base;
-	void __iomem *iface_base;
-
-	/* On PPv2.2, each "software thread" can access the base
-	 * register through a separate address space, each 64 KB apart
-	 * from each other. Typically, such address spaces will be
-	 * used per CPU.
-	 */
-	void __iomem *swth_base[MVPP2_MAX_THREADS];
-
-	/* On PPv2.2, some port control registers are located into the system
-	 * controller space. These registers are accessible through a regmap.
-	 */
-	struct regmap *sysctrl_base;
-
-	/* Common clocks */
-	struct clk *pp_clk;
-	struct clk *gop_clk;
-	struct clk *mg_clk;
-	struct clk *mg_core_clk;
-	struct clk *axi_clk;
-
-	/* List of pointers to port structures */
-	int port_count;
-	struct mvpp2_port *port_list[MVPP2_MAX_PORTS];
-
-	/* Aggregated TXQs */
-	struct mvpp2_tx_queue *aggr_txqs;
-
-	/* BM pools */
-	struct mvpp2_bm_pool *bm_pools;
-
-	/* PRS shadow table */
-	struct mvpp2_prs_shadow *prs_shadow;
-	/* PRS auxiliary table for double vlan entries control */
-	bool *prs_double_vlans;
-
-	/* Tclk value */
-	u32 tclk;
-
-	/* HW version */
-	enum { MVPP21, MVPP22 } hw_version;
-
-	/* Maximum number of RXQs per port */
-	unsigned int max_port_rxqs;
-
-	/* Workqueue to gather hardware statistics */
-	char queue_name[30];
-	struct workqueue_struct *stats_queue;
-};
-
-struct mvpp2_pcpu_stats {
-	struct	u64_stats_sync syncp;
-	u64	rx_packets;
-	u64	rx_bytes;
-	u64	tx_packets;
-	u64	tx_bytes;
-};
-
-/* Per-CPU port control */
-struct mvpp2_port_pcpu {
-	struct hrtimer tx_done_timer;
-	bool timer_scheduled;
-	/* Tasklet for egress finalization */
-	struct tasklet_struct tx_done_tasklet;
-};
-
-struct mvpp2_queue_vector {
-	int irq;
-	struct napi_struct napi;
-	enum { MVPP2_QUEUE_VECTOR_SHARED, MVPP2_QUEUE_VECTOR_PRIVATE } type;
-	int sw_thread_id;
-	u16 sw_thread_mask;
-	int first_rxq;
-	int nrxqs;
-	u32 pending_cause_rx;
-	struct mvpp2_port *port;
-};
-
-struct mvpp2_port {
-	u8 id;
-
-	/* Index of the port from the "group of ports" complex point
-	 * of view
-	 */
-	int gop_id;
-
-	int link_irq;
-
-	struct mvpp2 *priv;
-
-	/* Firmware node associated to the port */
-	struct fwnode_handle *fwnode;
-
-	/* Is a PHY always connected to the port */
-	bool has_phy;
-
-	/* Per-port registers' base address */
-	void __iomem *base;
-	void __iomem *stats_base;
-
-	struct mvpp2_rx_queue **rxqs;
-	unsigned int nrxqs;
-	struct mvpp2_tx_queue **txqs;
-	unsigned int ntxqs;
-	struct net_device *dev;
-
-	int pkt_size;
-
-	/* Per-CPU port control */
-	struct mvpp2_port_pcpu __percpu *pcpu;
-
-	/* Flags */
-	unsigned long flags;
-
-	u16 tx_ring_size;
-	u16 rx_ring_size;
-	struct mvpp2_pcpu_stats __percpu *stats;
-	u64 *ethtool_stats;
-
-	/* Per-port work and its lock to gather hardware statistics */
-	struct mutex gather_stats_lock;
-	struct delayed_work stats_work;
-
-	struct device_node *of_node;
-
-	phy_interface_t phy_interface;
-	struct phylink *phylink;
-	struct phy *comphy;
-
-	struct mvpp2_bm_pool *pool_long;
-	struct mvpp2_bm_pool *pool_short;
-
-	/* Index of first port's physical RXQ */
-	u8 first_rxq;
-
-	struct mvpp2_queue_vector qvecs[MVPP2_MAX_QVECS];
-	unsigned int nqvecs;
-	bool has_tx_irqs;
-
-	u32 tx_time_coal;
-};
-
-/* The mvpp2_tx_desc and mvpp2_rx_desc structures describe the
- * layout of the transmit and reception DMA descriptors, and their
- * layout is therefore defined by the hardware design
- */
-
-#define MVPP2_TXD_L3_OFF_SHIFT		0
-#define MVPP2_TXD_IP_HLEN_SHIFT		8
-#define MVPP2_TXD_L4_CSUM_FRAG		BIT(13)
-#define MVPP2_TXD_L4_CSUM_NOT		BIT(14)
-#define MVPP2_TXD_IP_CSUM_DISABLE	BIT(15)
-#define MVPP2_TXD_PADDING_DISABLE	BIT(23)
-#define MVPP2_TXD_L4_UDP		BIT(24)
-#define MVPP2_TXD_L3_IP6		BIT(26)
-#define MVPP2_TXD_L_DESC		BIT(28)
-#define MVPP2_TXD_F_DESC		BIT(29)
-
-#define MVPP2_RXD_ERR_SUMMARY		BIT(15)
-#define MVPP2_RXD_ERR_CODE_MASK		(BIT(13) | BIT(14))
-#define MVPP2_RXD_ERR_CRC		0x0
-#define MVPP2_RXD_ERR_OVERRUN		BIT(13)
-#define MVPP2_RXD_ERR_RESOURCE		(BIT(13) | BIT(14))
-#define MVPP2_RXD_BM_POOL_ID_OFFS	16
-#define MVPP2_RXD_BM_POOL_ID_MASK	(BIT(16) | BIT(17) | BIT(18))
-#define MVPP2_RXD_HWF_SYNC		BIT(21)
-#define MVPP2_RXD_L4_CSUM_OK		BIT(22)
-#define MVPP2_RXD_IP4_HEADER_ERR	BIT(24)
-#define MVPP2_RXD_L4_TCP		BIT(25)
-#define MVPP2_RXD_L4_UDP		BIT(26)
-#define MVPP2_RXD_L3_IP4		BIT(28)
-#define MVPP2_RXD_L3_IP6		BIT(30)
-#define MVPP2_RXD_BUF_HDR		BIT(31)
-
-/* HW TX descriptor for PPv2.1 */
-struct mvpp21_tx_desc {
-	u32 command;		/* Options used by HW for packet transmitting.*/
-	u8  packet_offset;	/* the offset from the buffer beginning	*/
-	u8  phys_txq;		/* destination queue ID			*/
-	u16 data_size;		/* data size of transmitted packet in bytes */
-	u32 buf_dma_addr;	/* physical addr of transmitted buffer	*/
-	u32 buf_cookie;		/* cookie for access to TX buffer in tx path */
-	u32 reserved1[3];	/* hw_cmd (for future use, BM, PON, PNC) */
-	u32 reserved2;		/* reserved (for future use)		*/
-};
-
-/* HW RX descriptor for PPv2.1 */
-struct mvpp21_rx_desc {
-	u32 status;		/* info about received packet		*/
-	u16 reserved1;		/* parser_info (for future use, PnC)	*/
-	u16 data_size;		/* size of received packet in bytes	*/
-	u32 buf_dma_addr;	/* physical address of the buffer	*/
-	u32 buf_cookie;		/* cookie for access to RX buffer in rx path */
-	u16 reserved2;		/* gem_port_id (for future use, PON)	*/
-	u16 reserved3;		/* csum_l4 (for future use, PnC)	*/
-	u8  reserved4;		/* bm_qset (for future use, BM)		*/
-	u8  reserved5;
-	u16 reserved6;		/* classify_info (for future use, PnC)	*/
-	u32 reserved7;		/* flow_id (for future use, PnC) */
-	u32 reserved8;
-};
-
-/* HW TX descriptor for PPv2.2 */
-struct mvpp22_tx_desc {
-	u32 command;
-	u8  packet_offset;
-	u8  phys_txq;
-	u16 data_size;
-	u64 reserved1;
-	u64 buf_dma_addr_ptp;
-	u64 buf_cookie_misc;
-};
-
-/* HW RX descriptor for PPv2.2 */
-struct mvpp22_rx_desc {
-	u32 status;
-	u16 reserved1;
-	u16 data_size;
-	u32 reserved2;
-	u32 reserved3;
-	u64 buf_dma_addr_key_hash;
-	u64 buf_cookie_misc;
-};
-
-/* Opaque type used by the driver to manipulate the HW TX and RX
- * descriptors
- */
-struct mvpp2_tx_desc {
-	union {
-		struct mvpp21_tx_desc pp21;
-		struct mvpp22_tx_desc pp22;
-	};
-};
-
-struct mvpp2_rx_desc {
-	union {
-		struct mvpp21_rx_desc pp21;
-		struct mvpp22_rx_desc pp22;
-	};
-};
-
-struct mvpp2_txq_pcpu_buf {
-	/* Transmitted SKB */
-	struct sk_buff *skb;
-
-	/* Physical address of transmitted buffer */
-	dma_addr_t dma;
-
-	/* Size transmitted */
-	size_t size;
-};
-
-/* Per-CPU Tx queue control */
-struct mvpp2_txq_pcpu {
-	int cpu;
-
-	/* Number of Tx DMA descriptors in the descriptor ring */
-	int size;
-
-	/* Number of currently used Tx DMA descriptor in the
-	 * descriptor ring
-	 */
-	int count;
-
-	int wake_threshold;
-	int stop_threshold;
-
-	/* Number of Tx DMA descriptors reserved for each CPU */
-	int reserved_num;
-
-	/* Infos about transmitted buffers */
-	struct mvpp2_txq_pcpu_buf *buffs;
-
-	/* Index of last TX DMA descriptor that was inserted */
-	int txq_put_index;
-
-	/* Index of the TX DMA descriptor to be cleaned up */
-	int txq_get_index;
-
-	/* DMA buffer for TSO headers */
-	char *tso_headers;
-	dma_addr_t tso_headers_dma;
-};
-
-struct mvpp2_tx_queue {
-	/* Physical number of this Tx queue */
-	u8 id;
-
-	/* Logical number of this Tx queue */
-	u8 log_id;
-
-	/* Number of Tx DMA descriptors in the descriptor ring */
-	int size;
-
-	/* Number of currently used Tx DMA descriptor in the descriptor ring */
-	int count;
-
-	/* Per-CPU control of physical Tx queues */
-	struct mvpp2_txq_pcpu __percpu *pcpu;
-
-	u32 done_pkts_coal;
-
-	/* Virtual address of thex Tx DMA descriptors array */
-	struct mvpp2_tx_desc *descs;
-
-	/* DMA address of the Tx DMA descriptors array */
-	dma_addr_t descs_dma;
-
-	/* Index of the last Tx DMA descriptor */
-	int last_desc;
-
-	/* Index of the next Tx DMA descriptor to process */
-	int next_desc_to_proc;
-};
-
-struct mvpp2_rx_queue {
-	/* RX queue number, in the range 0-31 for physical RXQs */
-	u8 id;
-
-	/* Num of rx descriptors in the rx descriptor ring */
-	int size;
-
-	u32 pkts_coal;
-	u32 time_coal;
-
-	/* Virtual address of the RX DMA descriptors array */
-	struct mvpp2_rx_desc *descs;
-
-	/* DMA address of the RX DMA descriptors array */
-	dma_addr_t descs_dma;
-
-	/* Index of the last RX DMA descriptor */
-	int last_desc;
-
-	/* Index of the next RX DMA descriptor to process */
-	int next_desc_to_proc;
-
-	/* ID of port to which physical RXQ is mapped */
-	int port;
-
-	/* Port's logic RXQ number to which physical RXQ is mapped */
-	int logic_rxq;
-};
-
-union mvpp2_prs_tcam_entry {
-	u32 word[MVPP2_PRS_TCAM_WORDS];
-	u8  byte[MVPP2_PRS_TCAM_WORDS * 4];
-};
-
-union mvpp2_prs_sram_entry {
-	u32 word[MVPP2_PRS_SRAM_WORDS];
-	u8  byte[MVPP2_PRS_SRAM_WORDS * 4];
-};
-
-struct mvpp2_prs_entry {
-	u32 index;
-	union mvpp2_prs_tcam_entry tcam;
-	union mvpp2_prs_sram_entry sram;
-};
-
-struct mvpp2_prs_shadow {
-	bool valid;
-	bool finish;
-
-	/* Lookup ID */
-	int lu;
-
-	/* User defined offset */
-	int udf;
-
-	/* Result info */
-	u32 ri;
-	u32 ri_mask;
-};
-
-struct mvpp2_cls_flow_entry {
-	u32 index;
-	u32 data[MVPP2_CLS_FLOWS_TBL_DATA_WORDS];
-};
-
-struct mvpp2_cls_lookup_entry {
-	u32 lkpid;
-	u32 way;
-	u32 data;
-};
-
-struct mvpp2_bm_pool {
-	/* Pool number in the range 0-7 */
-	int id;
-
-	/* Buffer Pointers Pool External (BPPE) size */
-	int size;
-	/* BPPE size in bytes */
-	int size_bytes;
-	/* Number of buffers for this pool */
-	int buf_num;
-	/* Pool buffer size */
-	int buf_size;
-	/* Packet size */
-	int pkt_size;
-	int frag_size;
-
-	/* BPPE virtual base address */
-	u32 *virt_addr;
-	/* BPPE DMA base address */
-	dma_addr_t dma_addr;
-
-	/* Ports using BM pool */
-	u32 port_map;
-};
-
-#define IS_TSO_HEADER(txq_pcpu, addr) \
-	((addr) >= (txq_pcpu)->tso_headers_dma && \
-	 (addr) < (txq_pcpu)->tso_headers_dma + \
-	 (txq_pcpu)->size * TSO_HEADER_SIZE)
-
-/* The prototype is added here to be used in start_dev when using ACPI. This
- * will be removed once phylink is used for all modes (dt+ACPI).
- */
-static void mvpp2_mac_config(struct net_device *dev, unsigned int mode,
-			     const struct phylink_link_state *state);
-
-/* Queue modes */
-#define MVPP2_QDIST_SINGLE_MODE	0
-#define MVPP2_QDIST_MULTI_MODE	1
-
-static int queue_mode = MVPP2_QDIST_SINGLE_MODE;
-
-module_param(queue_mode, int, 0444);
-MODULE_PARM_DESC(queue_mode, "Set queue_mode (single=0, multi=1)");
-
-#define MVPP2_DRIVER_NAME "mvpp2"
-#define MVPP2_DRIVER_VERSION "1.0"
-
-/* Utility/helper methods */
-
-static void mvpp2_write(struct mvpp2 *priv, u32 offset, u32 data)
-{
-	writel(data, priv->swth_base[0] + offset);
-}
-
-static u32 mvpp2_read(struct mvpp2 *priv, u32 offset)
-{
-	return readl(priv->swth_base[0] + offset);
-}
-
-static u32 mvpp2_read_relaxed(struct mvpp2 *priv, u32 offset)
-{
-	return readl_relaxed(priv->swth_base[0] + offset);
-}
-/* These accessors should be used to access:
- *
- * - per-CPU registers, where each CPU has its own copy of the
- *   register.
- *
- *   MVPP2_BM_VIRT_ALLOC_REG
- *   MVPP2_BM_ADDR_HIGH_ALLOC
- *   MVPP22_BM_ADDR_HIGH_RLS_REG
- *   MVPP2_BM_VIRT_RLS_REG
- *   MVPP2_ISR_RX_TX_CAUSE_REG
- *   MVPP2_ISR_RX_TX_MASK_REG
- *   MVPP2_TXQ_NUM_REG
- *   MVPP2_AGGR_TXQ_UPDATE_REG
- *   MVPP2_TXQ_RSVD_REQ_REG
- *   MVPP2_TXQ_RSVD_RSLT_REG
- *   MVPP2_TXQ_SENT_REG
- *   MVPP2_RXQ_NUM_REG
- *
- * - global registers that must be accessed through a specific CPU
- *   window, because they are related to an access to a per-CPU
- *   register
- *
- *   MVPP2_BM_PHY_ALLOC_REG    (related to MVPP2_BM_VIRT_ALLOC_REG)
- *   MVPP2_BM_PHY_RLS_REG      (related to MVPP2_BM_VIRT_RLS_REG)
- *   MVPP2_RXQ_THRESH_REG      (related to MVPP2_RXQ_NUM_REG)
- *   MVPP2_RXQ_DESC_ADDR_REG   (related to MVPP2_RXQ_NUM_REG)
- *   MVPP2_RXQ_DESC_SIZE_REG   (related to MVPP2_RXQ_NUM_REG)
- *   MVPP2_RXQ_INDEX_REG       (related to MVPP2_RXQ_NUM_REG)
- *   MVPP2_TXQ_PENDING_REG     (related to MVPP2_TXQ_NUM_REG)
- *   MVPP2_TXQ_DESC_ADDR_REG   (related to MVPP2_TXQ_NUM_REG)
- *   MVPP2_TXQ_DESC_SIZE_REG   (related to MVPP2_TXQ_NUM_REG)
- *   MVPP2_TXQ_INDEX_REG       (related to MVPP2_TXQ_NUM_REG)
- *   MVPP2_TXQ_PENDING_REG     (related to MVPP2_TXQ_NUM_REG)
- *   MVPP2_TXQ_PREF_BUF_REG    (related to MVPP2_TXQ_NUM_REG)
- *   MVPP2_TXQ_PREF_BUF_REG    (related to MVPP2_TXQ_NUM_REG)
- */
-static void mvpp2_percpu_write(struct mvpp2 *priv, int cpu,
-			       u32 offset, u32 data)
-{
-	writel(data, priv->swth_base[cpu] + offset);
-}
-
-static u32 mvpp2_percpu_read(struct mvpp2 *priv, int cpu,
-			     u32 offset)
-{
-	return readl(priv->swth_base[cpu] + offset);
-}
-
-static void mvpp2_percpu_write_relaxed(struct mvpp2 *priv, int cpu,
-				       u32 offset, u32 data)
-{
-	writel_relaxed(data, priv->swth_base[cpu] + offset);
-}
-
-static u32 mvpp2_percpu_read_relaxed(struct mvpp2 *priv, int cpu,
-				     u32 offset)
-{
-	return readl_relaxed(priv->swth_base[cpu] + offset);
-}
-
-static dma_addr_t mvpp2_txdesc_dma_addr_get(struct mvpp2_port *port,
-					    struct mvpp2_tx_desc *tx_desc)
-{
-	if (port->priv->hw_version == MVPP21)
-		return tx_desc->pp21.buf_dma_addr;
-	else
-		return tx_desc->pp22.buf_dma_addr_ptp & MVPP2_DESC_DMA_MASK;
-}
-
-static void mvpp2_txdesc_dma_addr_set(struct mvpp2_port *port,
-				      struct mvpp2_tx_desc *tx_desc,
-				      dma_addr_t dma_addr)
-{
-	dma_addr_t addr, offset;
-
-	addr = dma_addr & ~MVPP2_TX_DESC_ALIGN;
-	offset = dma_addr & MVPP2_TX_DESC_ALIGN;
-
-	if (port->priv->hw_version == MVPP21) {
-		tx_desc->pp21.buf_dma_addr = addr;
-		tx_desc->pp21.packet_offset = offset;
-	} else {
-		u64 val = (u64)addr;
-
-		tx_desc->pp22.buf_dma_addr_ptp &= ~MVPP2_DESC_DMA_MASK;
-		tx_desc->pp22.buf_dma_addr_ptp |= val;
-		tx_desc->pp22.packet_offset = offset;
-	}
-}
-
-static size_t mvpp2_txdesc_size_get(struct mvpp2_port *port,
-				    struct mvpp2_tx_desc *tx_desc)
-{
-	if (port->priv->hw_version == MVPP21)
-		return tx_desc->pp21.data_size;
-	else
-		return tx_desc->pp22.data_size;
-}
-
-static void mvpp2_txdesc_size_set(struct mvpp2_port *port,
-				  struct mvpp2_tx_desc *tx_desc,
-				  size_t size)
-{
-	if (port->priv->hw_version == MVPP21)
-		tx_desc->pp21.data_size = size;
-	else
-		tx_desc->pp22.data_size = size;
-}
-
-static void mvpp2_txdesc_txq_set(struct mvpp2_port *port,
-				 struct mvpp2_tx_desc *tx_desc,
-				 unsigned int txq)
-{
-	if (port->priv->hw_version == MVPP21)
-		tx_desc->pp21.phys_txq = txq;
-	else
-		tx_desc->pp22.phys_txq = txq;
-}
-
-static void mvpp2_txdesc_cmd_set(struct mvpp2_port *port,
-				 struct mvpp2_tx_desc *tx_desc,
-				 unsigned int command)
-{
-	if (port->priv->hw_version == MVPP21)
-		tx_desc->pp21.command = command;
-	else
-		tx_desc->pp22.command = command;
-}
-
-static unsigned int mvpp2_txdesc_offset_get(struct mvpp2_port *port,
-					    struct mvpp2_tx_desc *tx_desc)
-{
-	if (port->priv->hw_version == MVPP21)
-		return tx_desc->pp21.packet_offset;
-	else
-		return tx_desc->pp22.packet_offset;
-}
-
-static dma_addr_t mvpp2_rxdesc_dma_addr_get(struct mvpp2_port *port,
-					    struct mvpp2_rx_desc *rx_desc)
-{
-	if (port->priv->hw_version == MVPP21)
-		return rx_desc->pp21.buf_dma_addr;
-	else
-		return rx_desc->pp22.buf_dma_addr_key_hash & MVPP2_DESC_DMA_MASK;
-}
-
-static unsigned long mvpp2_rxdesc_cookie_get(struct mvpp2_port *port,
-					     struct mvpp2_rx_desc *rx_desc)
-{
-	if (port->priv->hw_version == MVPP21)
-		return rx_desc->pp21.buf_cookie;
-	else
-		return rx_desc->pp22.buf_cookie_misc & MVPP2_DESC_DMA_MASK;
-}
-
-static size_t mvpp2_rxdesc_size_get(struct mvpp2_port *port,
-				    struct mvpp2_rx_desc *rx_desc)
-{
-	if (port->priv->hw_version == MVPP21)
-		return rx_desc->pp21.data_size;
-	else
-		return rx_desc->pp22.data_size;
-}
-
-static u32 mvpp2_rxdesc_status_get(struct mvpp2_port *port,
-				   struct mvpp2_rx_desc *rx_desc)
-{
-	if (port->priv->hw_version == MVPP21)
-		return rx_desc->pp21.status;
-	else
-		return rx_desc->pp22.status;
-}
-
-static void mvpp2_txq_inc_get(struct mvpp2_txq_pcpu *txq_pcpu)
-{
-	txq_pcpu->txq_get_index++;
-	if (txq_pcpu->txq_get_index == txq_pcpu->size)
-		txq_pcpu->txq_get_index = 0;
-}
-
-static void mvpp2_txq_inc_put(struct mvpp2_port *port,
-			      struct mvpp2_txq_pcpu *txq_pcpu,
-			      struct sk_buff *skb,
-			      struct mvpp2_tx_desc *tx_desc)
-{
-	struct mvpp2_txq_pcpu_buf *tx_buf =
-		txq_pcpu->buffs + txq_pcpu->txq_put_index;
-	tx_buf->skb = skb;
-	tx_buf->size = mvpp2_txdesc_size_get(port, tx_desc);
-	tx_buf->dma = mvpp2_txdesc_dma_addr_get(port, tx_desc) +
-		mvpp2_txdesc_offset_get(port, tx_desc);
-	txq_pcpu->txq_put_index++;
-	if (txq_pcpu->txq_put_index == txq_pcpu->size)
-		txq_pcpu->txq_put_index = 0;
-}
-
-/* Get number of physical egress port */
-static inline int mvpp2_egress_port(struct mvpp2_port *port)
-{
-	return MVPP2_MAX_TCONT + port->id;
-}
-
-/* Get number of physical TXQ */
-static inline int mvpp2_txq_phys(int port, int txq)
-{
-	return (MVPP2_MAX_TCONT + port) * MVPP2_MAX_TXQ + txq;
-}
-
-/* Parser configuration routines */
-
-/* Update parser tcam and sram hw entries */
-static int mvpp2_prs_hw_write(struct mvpp2 *priv, struct mvpp2_prs_entry *pe)
-{
-	int i;
-
-	if (pe->index > MVPP2_PRS_TCAM_SRAM_SIZE - 1)
-		return -EINVAL;
-
-	/* Clear entry invalidation bit */
-	pe->tcam.word[MVPP2_PRS_TCAM_INV_WORD] &= ~MVPP2_PRS_TCAM_INV_MASK;
-
-	/* Write tcam index - indirect access */
-	mvpp2_write(priv, MVPP2_PRS_TCAM_IDX_REG, pe->index);
-	for (i = 0; i < MVPP2_PRS_TCAM_WORDS; i++)
-		mvpp2_write(priv, MVPP2_PRS_TCAM_DATA_REG(i), pe->tcam.word[i]);
-
-	/* Write sram index - indirect access */
-	mvpp2_write(priv, MVPP2_PRS_SRAM_IDX_REG, pe->index);
-	for (i = 0; i < MVPP2_PRS_SRAM_WORDS; i++)
-		mvpp2_write(priv, MVPP2_PRS_SRAM_DATA_REG(i), pe->sram.word[i]);
-
-	return 0;
-}
-
-/* Initialize tcam entry from hw */
-static int mvpp2_prs_init_from_hw(struct mvpp2 *priv,
-				  struct mvpp2_prs_entry *pe, int tid)
-{
-	int i;
-
-	if (tid > MVPP2_PRS_TCAM_SRAM_SIZE - 1)
-		return -EINVAL;
-
-	memset(pe, 0, sizeof(*pe));
-	pe->index = tid;
-
-	/* Write tcam index - indirect access */
-	mvpp2_write(priv, MVPP2_PRS_TCAM_IDX_REG, pe->index);
-
-	pe->tcam.word[MVPP2_PRS_TCAM_INV_WORD] = mvpp2_read(priv,
-			      MVPP2_PRS_TCAM_DATA_REG(MVPP2_PRS_TCAM_INV_WORD));
-	if (pe->tcam.word[MVPP2_PRS_TCAM_INV_WORD] & MVPP2_PRS_TCAM_INV_MASK)
-		return MVPP2_PRS_TCAM_ENTRY_INVALID;
-
-	for (i = 0; i < MVPP2_PRS_TCAM_WORDS; i++)
-		pe->tcam.word[i] = mvpp2_read(priv, MVPP2_PRS_TCAM_DATA_REG(i));
-
-	/* Write sram index - indirect access */
-	mvpp2_write(priv, MVPP2_PRS_SRAM_IDX_REG, pe->index);
-	for (i = 0; i < MVPP2_PRS_SRAM_WORDS; i++)
-		pe->sram.word[i] = mvpp2_read(priv, MVPP2_PRS_SRAM_DATA_REG(i));
-
-	return 0;
-}
-
-/* Invalidate tcam hw entry */
-static void mvpp2_prs_hw_inv(struct mvpp2 *priv, int index)
-{
-	/* Write index - indirect access */
-	mvpp2_write(priv, MVPP2_PRS_TCAM_IDX_REG, index);
-	mvpp2_write(priv, MVPP2_PRS_TCAM_DATA_REG(MVPP2_PRS_TCAM_INV_WORD),
-		    MVPP2_PRS_TCAM_INV_MASK);
-}
-
-/* Enable shadow table entry and set its lookup ID */
-static void mvpp2_prs_shadow_set(struct mvpp2 *priv, int index, int lu)
-{
-	priv->prs_shadow[index].valid = true;
-	priv->prs_shadow[index].lu = lu;
-}
-
-/* Update ri fields in shadow table entry */
-static void mvpp2_prs_shadow_ri_set(struct mvpp2 *priv, int index,
-				    unsigned int ri, unsigned int ri_mask)
-{
-	priv->prs_shadow[index].ri_mask = ri_mask;
-	priv->prs_shadow[index].ri = ri;
-}
-
-/* Update lookup field in tcam sw entry */
-static void mvpp2_prs_tcam_lu_set(struct mvpp2_prs_entry *pe, unsigned int lu)
-{
-	int enable_off = MVPP2_PRS_TCAM_EN_OFFS(MVPP2_PRS_TCAM_LU_BYTE);
-
-	pe->tcam.byte[MVPP2_PRS_TCAM_LU_BYTE] = lu;
-	pe->tcam.byte[enable_off] = MVPP2_PRS_LU_MASK;
-}
-
-/* Update mask for single port in tcam sw entry */
-static void mvpp2_prs_tcam_port_set(struct mvpp2_prs_entry *pe,
-				    unsigned int port, bool add)
-{
-	int enable_off = MVPP2_PRS_TCAM_EN_OFFS(MVPP2_PRS_TCAM_PORT_BYTE);
-
-	if (add)
-		pe->tcam.byte[enable_off] &= ~(1 << port);
-	else
-		pe->tcam.byte[enable_off] |= 1 << port;
-}
-
-/* Update port map in tcam sw entry */
-static void mvpp2_prs_tcam_port_map_set(struct mvpp2_prs_entry *pe,
-					unsigned int ports)
-{
-	unsigned char port_mask = MVPP2_PRS_PORT_MASK;
-	int enable_off = MVPP2_PRS_TCAM_EN_OFFS(MVPP2_PRS_TCAM_PORT_BYTE);
-
-	pe->tcam.byte[MVPP2_PRS_TCAM_PORT_BYTE] = 0;
-	pe->tcam.byte[enable_off] &= ~port_mask;
-	pe->tcam.byte[enable_off] |= ~ports & MVPP2_PRS_PORT_MASK;
-}
-
-/* Obtain port map from tcam sw entry */
-static unsigned int mvpp2_prs_tcam_port_map_get(struct mvpp2_prs_entry *pe)
-{
-	int enable_off = MVPP2_PRS_TCAM_EN_OFFS(MVPP2_PRS_TCAM_PORT_BYTE);
-
-	return ~(pe->tcam.byte[enable_off]) & MVPP2_PRS_PORT_MASK;
-}
-
-/* Set byte of data and its enable bits in tcam sw entry */
-static void mvpp2_prs_tcam_data_byte_set(struct mvpp2_prs_entry *pe,
-					 unsigned int offs, unsigned char byte,
-					 unsigned char enable)
-{
-	pe->tcam.byte[MVPP2_PRS_TCAM_DATA_BYTE(offs)] = byte;
-	pe->tcam.byte[MVPP2_PRS_TCAM_DATA_BYTE_EN(offs)] = enable;
-}
-
-/* Get byte of data and its enable bits from tcam sw entry */
-static void mvpp2_prs_tcam_data_byte_get(struct mvpp2_prs_entry *pe,
-					 unsigned int offs, unsigned char *byte,
-					 unsigned char *enable)
-{
-	*byte = pe->tcam.byte[MVPP2_PRS_TCAM_DATA_BYTE(offs)];
-	*enable = pe->tcam.byte[MVPP2_PRS_TCAM_DATA_BYTE_EN(offs)];
-}
-
-/* Compare tcam data bytes with a pattern */
-static bool mvpp2_prs_tcam_data_cmp(struct mvpp2_prs_entry *pe, int offs,
-				    u16 data)
-{
-	int off = MVPP2_PRS_TCAM_DATA_BYTE(offs);
-	u16 tcam_data;
-
-	tcam_data = (pe->tcam.byte[off + 1] << 8) | pe->tcam.byte[off];
-	if (tcam_data != data)
-		return false;
-	return true;
-}
-
-/* Update ai bits in tcam sw entry */
-static void mvpp2_prs_tcam_ai_update(struct mvpp2_prs_entry *pe,
-				     unsigned int bits, unsigned int enable)
-{
-	int i, ai_idx = MVPP2_PRS_TCAM_AI_BYTE;
-
-	for (i = 0; i < MVPP2_PRS_AI_BITS; i++) {
-		if (!(enable & BIT(i)))
-			continue;
-
-		if (bits & BIT(i))
-			pe->tcam.byte[ai_idx] |= 1 << i;
-		else
-			pe->tcam.byte[ai_idx] &= ~(1 << i);
-	}
-
-	pe->tcam.byte[MVPP2_PRS_TCAM_EN_OFFS(ai_idx)] |= enable;
-}
-
-/* Get ai bits from tcam sw entry */
-static int mvpp2_prs_tcam_ai_get(struct mvpp2_prs_entry *pe)
-{
-	return pe->tcam.byte[MVPP2_PRS_TCAM_AI_BYTE];
-}
-
-/* Set ethertype in tcam sw entry */
-static void mvpp2_prs_match_etype(struct mvpp2_prs_entry *pe, int offset,
-				  unsigned short ethertype)
-{
-	mvpp2_prs_tcam_data_byte_set(pe, offset + 0, ethertype >> 8, 0xff);
-	mvpp2_prs_tcam_data_byte_set(pe, offset + 1, ethertype & 0xff, 0xff);
-}
-
-/* Set vid in tcam sw entry */
-static void mvpp2_prs_match_vid(struct mvpp2_prs_entry *pe, int offset,
-				unsigned short vid)
-{
-	mvpp2_prs_tcam_data_byte_set(pe, offset + 0, (vid & 0xf00) >> 8, 0xf);
-	mvpp2_prs_tcam_data_byte_set(pe, offset + 1, vid & 0xff, 0xff);
-}
-
-/* Set bits in sram sw entry */
-static void mvpp2_prs_sram_bits_set(struct mvpp2_prs_entry *pe, int bit_num,
-				    int val)
-{
-	pe->sram.byte[MVPP2_BIT_TO_BYTE(bit_num)] |= (val << (bit_num % 8));
-}
-
-/* Clear bits in sram sw entry */
-static void mvpp2_prs_sram_bits_clear(struct mvpp2_prs_entry *pe, int bit_num,
-				      int val)
-{
-	pe->sram.byte[MVPP2_BIT_TO_BYTE(bit_num)] &= ~(val << (bit_num % 8));
-}
-
-/* Update ri bits in sram sw entry */
-static void mvpp2_prs_sram_ri_update(struct mvpp2_prs_entry *pe,
-				     unsigned int bits, unsigned int mask)
-{
-	unsigned int i;
-
-	for (i = 0; i < MVPP2_PRS_SRAM_RI_CTRL_BITS; i++) {
-		int ri_off = MVPP2_PRS_SRAM_RI_OFFS;
-
-		if (!(mask & BIT(i)))
-			continue;
-
-		if (bits & BIT(i))
-			mvpp2_prs_sram_bits_set(pe, ri_off + i, 1);
-		else
-			mvpp2_prs_sram_bits_clear(pe, ri_off + i, 1);
-
-		mvpp2_prs_sram_bits_set(pe, MVPP2_PRS_SRAM_RI_CTRL_OFFS + i, 1);
-	}
-}
-
-/* Obtain ri bits from sram sw entry */
-static int mvpp2_prs_sram_ri_get(struct mvpp2_prs_entry *pe)
-{
-	return pe->sram.word[MVPP2_PRS_SRAM_RI_WORD];
-}
-
-/* Update ai bits in sram sw entry */
-static void mvpp2_prs_sram_ai_update(struct mvpp2_prs_entry *pe,
-				     unsigned int bits, unsigned int mask)
-{
-	unsigned int i;
-	int ai_off = MVPP2_PRS_SRAM_AI_OFFS;
-
-	for (i = 0; i < MVPP2_PRS_SRAM_AI_CTRL_BITS; i++) {
-		if (!(mask & BIT(i)))
-			continue;
-
-		if (bits & BIT(i))
-			mvpp2_prs_sram_bits_set(pe, ai_off + i, 1);
-		else
-			mvpp2_prs_sram_bits_clear(pe, ai_off + i, 1);
-
-		mvpp2_prs_sram_bits_set(pe, MVPP2_PRS_SRAM_AI_CTRL_OFFS + i, 1);
-	}
-}
-
-/* Read ai bits from sram sw entry */
-static int mvpp2_prs_sram_ai_get(struct mvpp2_prs_entry *pe)
-{
-	u8 bits;
-	int ai_off = MVPP2_BIT_TO_BYTE(MVPP2_PRS_SRAM_AI_OFFS);
-	int ai_en_off = ai_off + 1;
-	int ai_shift = MVPP2_PRS_SRAM_AI_OFFS % 8;
-
-	bits = (pe->sram.byte[ai_off] >> ai_shift) |
-	       (pe->sram.byte[ai_en_off] << (8 - ai_shift));
-
-	return bits;
-}
-
-/* In sram sw entry set lookup ID field of the tcam key to be used in the next
- * lookup interation
- */
-static void mvpp2_prs_sram_next_lu_set(struct mvpp2_prs_entry *pe,
-				       unsigned int lu)
-{
-	int sram_next_off = MVPP2_PRS_SRAM_NEXT_LU_OFFS;
-
-	mvpp2_prs_sram_bits_clear(pe, sram_next_off,
-				  MVPP2_PRS_SRAM_NEXT_LU_MASK);
-	mvpp2_prs_sram_bits_set(pe, sram_next_off, lu);
-}
-
-/* In the sram sw entry set sign and value of the next lookup offset
- * and the offset value generated to the classifier
- */
-static void mvpp2_prs_sram_shift_set(struct mvpp2_prs_entry *pe, int shift,
-				     unsigned int op)
-{
-	/* Set sign */
-	if (shift < 0) {
-		mvpp2_prs_sram_bits_set(pe, MVPP2_PRS_SRAM_SHIFT_SIGN_BIT, 1);
-		shift = 0 - shift;
-	} else {
-		mvpp2_prs_sram_bits_clear(pe, MVPP2_PRS_SRAM_SHIFT_SIGN_BIT, 1);
-	}
-
-	/* Set value */
-	pe->sram.byte[MVPP2_BIT_TO_BYTE(MVPP2_PRS_SRAM_SHIFT_OFFS)] =
-							   (unsigned char)shift;
-
-	/* Reset and set operation */
-	mvpp2_prs_sram_bits_clear(pe, MVPP2_PRS_SRAM_OP_SEL_SHIFT_OFFS,
-				  MVPP2_PRS_SRAM_OP_SEL_SHIFT_MASK);
-	mvpp2_prs_sram_bits_set(pe, MVPP2_PRS_SRAM_OP_SEL_SHIFT_OFFS, op);
-
-	/* Set base offset as current */
-	mvpp2_prs_sram_bits_clear(pe, MVPP2_PRS_SRAM_OP_SEL_BASE_OFFS, 1);
-}
-
-/* In the sram sw entry set sign and value of the user defined offset
- * generated to the classifier
- */
-static void mvpp2_prs_sram_offset_set(struct mvpp2_prs_entry *pe,
-				      unsigned int type, int offset,
-				      unsigned int op)
-{
-	/* Set sign */
-	if (offset < 0) {
-		mvpp2_prs_sram_bits_set(pe, MVPP2_PRS_SRAM_UDF_SIGN_BIT, 1);
-		offset = 0 - offset;
-	} else {
-		mvpp2_prs_sram_bits_clear(pe, MVPP2_PRS_SRAM_UDF_SIGN_BIT, 1);
-	}
-
-	/* Set value */
-	mvpp2_prs_sram_bits_clear(pe, MVPP2_PRS_SRAM_UDF_OFFS,
-				  MVPP2_PRS_SRAM_UDF_MASK);
-	mvpp2_prs_sram_bits_set(pe, MVPP2_PRS_SRAM_UDF_OFFS, offset);
-	pe->sram.byte[MVPP2_BIT_TO_BYTE(MVPP2_PRS_SRAM_UDF_OFFS +
-					MVPP2_PRS_SRAM_UDF_BITS)] &=
-	      ~(MVPP2_PRS_SRAM_UDF_MASK >> (8 - (MVPP2_PRS_SRAM_UDF_OFFS % 8)));
-	pe->sram.byte[MVPP2_BIT_TO_BYTE(MVPP2_PRS_SRAM_UDF_OFFS +
-					MVPP2_PRS_SRAM_UDF_BITS)] |=
-				(offset >> (8 - (MVPP2_PRS_SRAM_UDF_OFFS % 8)));
-
-	/* Set offset type */
-	mvpp2_prs_sram_bits_clear(pe, MVPP2_PRS_SRAM_UDF_TYPE_OFFS,
-				  MVPP2_PRS_SRAM_UDF_TYPE_MASK);
-	mvpp2_prs_sram_bits_set(pe, MVPP2_PRS_SRAM_UDF_TYPE_OFFS, type);
-
-	/* Set offset operation */
-	mvpp2_prs_sram_bits_clear(pe, MVPP2_PRS_SRAM_OP_SEL_UDF_OFFS,
-				  MVPP2_PRS_SRAM_OP_SEL_UDF_MASK);
-	mvpp2_prs_sram_bits_set(pe, MVPP2_PRS_SRAM_OP_SEL_UDF_OFFS, op);
-
-	pe->sram.byte[MVPP2_BIT_TO_BYTE(MVPP2_PRS_SRAM_OP_SEL_UDF_OFFS +
-					MVPP2_PRS_SRAM_OP_SEL_UDF_BITS)] &=
-					     ~(MVPP2_PRS_SRAM_OP_SEL_UDF_MASK >>
-				    (8 - (MVPP2_PRS_SRAM_OP_SEL_UDF_OFFS % 8)));
-
-	pe->sram.byte[MVPP2_BIT_TO_BYTE(MVPP2_PRS_SRAM_OP_SEL_UDF_OFFS +
-					MVPP2_PRS_SRAM_OP_SEL_UDF_BITS)] |=
-			     (op >> (8 - (MVPP2_PRS_SRAM_OP_SEL_UDF_OFFS % 8)));
-
-	/* Set base offset as current */
-	mvpp2_prs_sram_bits_clear(pe, MVPP2_PRS_SRAM_OP_SEL_BASE_OFFS, 1);
-}
-
-/* Find parser flow entry */
-static int mvpp2_prs_flow_find(struct mvpp2 *priv, int flow)
-{
-	struct mvpp2_prs_entry pe;
-	int tid;
-
-	/* Go through the all entires with MVPP2_PRS_LU_FLOWS */
-	for (tid = MVPP2_PRS_TCAM_SRAM_SIZE - 1; tid >= 0; tid--) {
-		u8 bits;
-
-		if (!priv->prs_shadow[tid].valid ||
-		    priv->prs_shadow[tid].lu != MVPP2_PRS_LU_FLOWS)
-			continue;
-
-		mvpp2_prs_init_from_hw(priv, &pe, tid);
-		bits = mvpp2_prs_sram_ai_get(&pe);
-
-		/* Sram store classification lookup ID in AI bits [5:0] */
-		if ((bits & MVPP2_PRS_FLOW_ID_MASK) == flow)
-			return tid;
-	}
-
-	return -ENOENT;
-}
-
-/* Return first free tcam index, seeking from start to end */
-static int mvpp2_prs_tcam_first_free(struct mvpp2 *priv, unsigned char start,
-				     unsigned char end)
-{
-	int tid;
-
-	if (start > end)
-		swap(start, end);
-
-	if (end >= MVPP2_PRS_TCAM_SRAM_SIZE)
-		end = MVPP2_PRS_TCAM_SRAM_SIZE - 1;
-
-	for (tid = start; tid <= end; tid++) {
-		if (!priv->prs_shadow[tid].valid)
-			return tid;
-	}
-
-	return -EINVAL;
-}
-
-/* Enable/disable dropping all mac da's */
-static void mvpp2_prs_mac_drop_all_set(struct mvpp2 *priv, int port, bool add)
-{
-	struct mvpp2_prs_entry pe;
-
-	if (priv->prs_shadow[MVPP2_PE_DROP_ALL].valid) {
-		/* Entry exist - update port only */
-		mvpp2_prs_init_from_hw(priv, &pe, MVPP2_PE_DROP_ALL);
-	} else {
-		/* Entry doesn't exist - create new */
-		memset(&pe, 0, sizeof(pe));
-		mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_MAC);
-		pe.index = MVPP2_PE_DROP_ALL;
-
-		/* Non-promiscuous mode for all ports - DROP unknown packets */
-		mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_DROP_MASK,
-					 MVPP2_PRS_RI_DROP_MASK);
-
-		mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_GEN_BIT, 1);
-		mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
-
-		/* Update shadow table */
-		mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_MAC);
-
-		/* Mask all ports */
-		mvpp2_prs_tcam_port_map_set(&pe, 0);
-	}
-
-	/* Update port mask */
-	mvpp2_prs_tcam_port_set(&pe, port, add);
-
-	mvpp2_prs_hw_write(priv, &pe);
-}
-
-/* Set port to unicast or multicast promiscuous mode */
-static void mvpp2_prs_mac_promisc_set(struct mvpp2 *priv, int port,
-				      enum mvpp2_prs_l2_cast l2_cast, bool add)
-{
-	struct mvpp2_prs_entry pe;
-	unsigned char cast_match;
-	unsigned int ri;
-	int tid;
-
-	if (l2_cast == MVPP2_PRS_L2_UNI_CAST) {
-		cast_match = MVPP2_PRS_UCAST_VAL;
-		tid = MVPP2_PE_MAC_UC_PROMISCUOUS;
-		ri = MVPP2_PRS_RI_L2_UCAST;
-	} else {
-		cast_match = MVPP2_PRS_MCAST_VAL;
-		tid = MVPP2_PE_MAC_MC_PROMISCUOUS;
-		ri = MVPP2_PRS_RI_L2_MCAST;
-	}
-
-	/* promiscuous mode - Accept unknown unicast or multicast packets */
-	if (priv->prs_shadow[tid].valid) {
-		mvpp2_prs_init_from_hw(priv, &pe, tid);
-	} else {
-		memset(&pe, 0, sizeof(pe));
-		mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_MAC);
-		pe.index = tid;
-
-		/* Continue - set next lookup */
-		mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_DSA);
-
-		/* Set result info bits */
-		mvpp2_prs_sram_ri_update(&pe, ri, MVPP2_PRS_RI_L2_CAST_MASK);
-
-		/* Match UC or MC addresses */
-		mvpp2_prs_tcam_data_byte_set(&pe, 0, cast_match,
-					     MVPP2_PRS_CAST_MASK);
-
-		/* Shift to ethertype */
-		mvpp2_prs_sram_shift_set(&pe, 2 * ETH_ALEN,
-					 MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
-
-		/* Mask all ports */
-		mvpp2_prs_tcam_port_map_set(&pe, 0);
-
-		/* Update shadow table */
-		mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_MAC);
-	}
-
-	/* Update port mask */
-	mvpp2_prs_tcam_port_set(&pe, port, add);
-
-	mvpp2_prs_hw_write(priv, &pe);
-}
-
-/* Set entry for dsa packets */
-static void mvpp2_prs_dsa_tag_set(struct mvpp2 *priv, int port, bool add,
-				  bool tagged, bool extend)
-{
-	struct mvpp2_prs_entry pe;
-	int tid, shift;
-
-	if (extend) {
-		tid = tagged ? MVPP2_PE_EDSA_TAGGED : MVPP2_PE_EDSA_UNTAGGED;
-		shift = 8;
-	} else {
-		tid = tagged ? MVPP2_PE_DSA_TAGGED : MVPP2_PE_DSA_UNTAGGED;
-		shift = 4;
-	}
-
-	if (priv->prs_shadow[tid].valid) {
-		/* Entry exist - update port only */
-		mvpp2_prs_init_from_hw(priv, &pe, tid);
-	} else {
-		/* Entry doesn't exist - create new */
-		memset(&pe, 0, sizeof(pe));
-		mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_DSA);
-		pe.index = tid;
-
-		/* Update shadow table */
-		mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_DSA);
-
-		if (tagged) {
-			/* Set tagged bit in DSA tag */
-			mvpp2_prs_tcam_data_byte_set(&pe, 0,
-					     MVPP2_PRS_TCAM_DSA_TAGGED_BIT,
-					     MVPP2_PRS_TCAM_DSA_TAGGED_BIT);
-
-			/* Set ai bits for next iteration */
-			if (extend)
-				mvpp2_prs_sram_ai_update(&pe, 1,
-							MVPP2_PRS_SRAM_AI_MASK);
-			else
-				mvpp2_prs_sram_ai_update(&pe, 0,
-							MVPP2_PRS_SRAM_AI_MASK);
-
-			/* Set result info bits to 'single vlan' */
-			mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_VLAN_SINGLE,
-						 MVPP2_PRS_RI_VLAN_MASK);
-			/* If packet is tagged continue check vid filtering */
-			mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_VID);
-		} else {
-			/* Shift 4 bytes for DSA tag or 8 bytes for EDSA tag*/
-			mvpp2_prs_sram_shift_set(&pe, shift,
-					MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
-
-			/* Set result info bits to 'no vlans' */
-			mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_VLAN_NONE,
-						 MVPP2_PRS_RI_VLAN_MASK);
-			mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_L2);
-		}
-
-		/* Mask all ports */
-		mvpp2_prs_tcam_port_map_set(&pe, 0);
-	}
-
-	/* Update port mask */
-	mvpp2_prs_tcam_port_set(&pe, port, add);
-
-	mvpp2_prs_hw_write(priv, &pe);
-}
-
-/* Set entry for dsa ethertype */
-static void mvpp2_prs_dsa_tag_ethertype_set(struct mvpp2 *priv, int port,
-					    bool add, bool tagged, bool extend)
-{
-	struct mvpp2_prs_entry pe;
-	int tid, shift, port_mask;
-
-	if (extend) {
-		tid = tagged ? MVPP2_PE_ETYPE_EDSA_TAGGED :
-		      MVPP2_PE_ETYPE_EDSA_UNTAGGED;
-		port_mask = 0;
-		shift = 8;
-	} else {
-		tid = tagged ? MVPP2_PE_ETYPE_DSA_TAGGED :
-		      MVPP2_PE_ETYPE_DSA_UNTAGGED;
-		port_mask = MVPP2_PRS_PORT_MASK;
-		shift = 4;
-	}
-
-	if (priv->prs_shadow[tid].valid) {
-		/* Entry exist - update port only */
-		mvpp2_prs_init_from_hw(priv, &pe, tid);
-	} else {
-		/* Entry doesn't exist - create new */
-		memset(&pe, 0, sizeof(pe));
-		mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_DSA);
-		pe.index = tid;
-
-		/* Set ethertype */
-		mvpp2_prs_match_etype(&pe, 0, ETH_P_EDSA);
-		mvpp2_prs_match_etype(&pe, 2, 0);
-
-		mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_DSA_MASK,
-					 MVPP2_PRS_RI_DSA_MASK);
-		/* Shift ethertype + 2 byte reserved + tag*/
-		mvpp2_prs_sram_shift_set(&pe, 2 + MVPP2_ETH_TYPE_LEN + shift,
-					 MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
-
-		/* Update shadow table */
-		mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_DSA);
-
-		if (tagged) {
-			/* Set tagged bit in DSA tag */
-			mvpp2_prs_tcam_data_byte_set(&pe,
-						     MVPP2_ETH_TYPE_LEN + 2 + 3,
-						 MVPP2_PRS_TCAM_DSA_TAGGED_BIT,
-						 MVPP2_PRS_TCAM_DSA_TAGGED_BIT);
-			/* Clear all ai bits for next iteration */
-			mvpp2_prs_sram_ai_update(&pe, 0,
-						 MVPP2_PRS_SRAM_AI_MASK);
-			/* If packet is tagged continue check vlans */
-			mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_VLAN);
-		} else {
-			/* Set result info bits to 'no vlans' */
-			mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_VLAN_NONE,
-						 MVPP2_PRS_RI_VLAN_MASK);
-			mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_L2);
-		}
-		/* Mask/unmask all ports, depending on dsa type */
-		mvpp2_prs_tcam_port_map_set(&pe, port_mask);
-	}
-
-	/* Update port mask */
-	mvpp2_prs_tcam_port_set(&pe, port, add);
-
-	mvpp2_prs_hw_write(priv, &pe);
-}
-
-/* Search for existing single/triple vlan entry */
-static int mvpp2_prs_vlan_find(struct mvpp2 *priv, unsigned short tpid, int ai)
-{
-	struct mvpp2_prs_entry pe;
-	int tid;
-
-	/* Go through the all entries with MVPP2_PRS_LU_VLAN */
-	for (tid = MVPP2_PE_FIRST_FREE_TID;
-	     tid <= MVPP2_PE_LAST_FREE_TID; tid++) {
-		unsigned int ri_bits, ai_bits;
-		bool match;
-
-		if (!priv->prs_shadow[tid].valid ||
-		    priv->prs_shadow[tid].lu != MVPP2_PRS_LU_VLAN)
-			continue;
-
-		mvpp2_prs_init_from_hw(priv, &pe, tid);
-		match = mvpp2_prs_tcam_data_cmp(&pe, 0, swab16(tpid));
-		if (!match)
-			continue;
-
-		/* Get vlan type */
-		ri_bits = mvpp2_prs_sram_ri_get(&pe);
-		ri_bits &= MVPP2_PRS_RI_VLAN_MASK;
-
-		/* Get current ai value from tcam */
-		ai_bits = mvpp2_prs_tcam_ai_get(&pe);
-		/* Clear double vlan bit */
-		ai_bits &= ~MVPP2_PRS_DBL_VLAN_AI_BIT;
-
-		if (ai != ai_bits)
-			continue;
-
-		if (ri_bits == MVPP2_PRS_RI_VLAN_SINGLE ||
-		    ri_bits == MVPP2_PRS_RI_VLAN_TRIPLE)
-			return tid;
-	}
-
-	return -ENOENT;
-}
-
-/* Add/update single/triple vlan entry */
-static int mvpp2_prs_vlan_add(struct mvpp2 *priv, unsigned short tpid, int ai,
-			      unsigned int port_map)
-{
-	struct mvpp2_prs_entry pe;
-	int tid_aux, tid;
-	int ret = 0;
-
-	memset(&pe, 0, sizeof(pe));
-
-	tid = mvpp2_prs_vlan_find(priv, tpid, ai);
-
-	if (tid < 0) {
-		/* Create new tcam entry */
-		tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_LAST_FREE_TID,
-						MVPP2_PE_FIRST_FREE_TID);
-		if (tid < 0)
-			return tid;
-
-		/* Get last double vlan tid */
-		for (tid_aux = MVPP2_PE_LAST_FREE_TID;
-		     tid_aux >= MVPP2_PE_FIRST_FREE_TID; tid_aux--) {
-			unsigned int ri_bits;
-
-			if (!priv->prs_shadow[tid_aux].valid ||
-			    priv->prs_shadow[tid_aux].lu != MVPP2_PRS_LU_VLAN)
-				continue;
-
-			mvpp2_prs_init_from_hw(priv, &pe, tid_aux);
-			ri_bits = mvpp2_prs_sram_ri_get(&pe);
-			if ((ri_bits & MVPP2_PRS_RI_VLAN_MASK) ==
-			    MVPP2_PRS_RI_VLAN_DOUBLE)
-				break;
-		}
-
-		if (tid <= tid_aux)
-			return -EINVAL;
-
-		memset(&pe, 0, sizeof(pe));
-		pe.index = tid;
-		mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_VLAN);
-
-		mvpp2_prs_match_etype(&pe, 0, tpid);
-
-		/* VLAN tag detected, proceed with VID filtering */
-		mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_VID);
-
-		/* Clear all ai bits for next iteration */
-		mvpp2_prs_sram_ai_update(&pe, 0, MVPP2_PRS_SRAM_AI_MASK);
-
-		if (ai == MVPP2_PRS_SINGLE_VLAN_AI) {
-			mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_VLAN_SINGLE,
-						 MVPP2_PRS_RI_VLAN_MASK);
-		} else {
-			ai |= MVPP2_PRS_DBL_VLAN_AI_BIT;
-			mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_VLAN_TRIPLE,
-						 MVPP2_PRS_RI_VLAN_MASK);
-		}
-		mvpp2_prs_tcam_ai_update(&pe, ai, MVPP2_PRS_SRAM_AI_MASK);
-
-		mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_VLAN);
-	} else {
-		mvpp2_prs_init_from_hw(priv, &pe, tid);
-	}
-	/* Update ports' mask */
-	mvpp2_prs_tcam_port_map_set(&pe, port_map);
-
-	mvpp2_prs_hw_write(priv, &pe);
-
-	return ret;
-}
-
-/* Get first free double vlan ai number */
-static int mvpp2_prs_double_vlan_ai_free_get(struct mvpp2 *priv)
-{
-	int i;
-
-	for (i = 1; i < MVPP2_PRS_DBL_VLANS_MAX; i++) {
-		if (!priv->prs_double_vlans[i])
-			return i;
-	}
-
-	return -EINVAL;
-}
-
-/* Search for existing double vlan entry */
-static int mvpp2_prs_double_vlan_find(struct mvpp2 *priv, unsigned short tpid1,
-				      unsigned short tpid2)
-{
-	struct mvpp2_prs_entry pe;
-	int tid;
-
-	/* Go through the all entries with MVPP2_PRS_LU_VLAN */
-	for (tid = MVPP2_PE_FIRST_FREE_TID;
-	     tid <= MVPP2_PE_LAST_FREE_TID; tid++) {
-		unsigned int ri_mask;
-		bool match;
-
-		if (!priv->prs_shadow[tid].valid ||
-		    priv->prs_shadow[tid].lu != MVPP2_PRS_LU_VLAN)
-			continue;
-
-		mvpp2_prs_init_from_hw(priv, &pe, tid);
-
-		match = mvpp2_prs_tcam_data_cmp(&pe, 0, swab16(tpid1)) &&
-			mvpp2_prs_tcam_data_cmp(&pe, 4, swab16(tpid2));
-
-		if (!match)
-			continue;
-
-		ri_mask = mvpp2_prs_sram_ri_get(&pe) & MVPP2_PRS_RI_VLAN_MASK;
-		if (ri_mask == MVPP2_PRS_RI_VLAN_DOUBLE)
-			return tid;
-	}
-
-	return -ENOENT;
-}
-
-/* Add or update double vlan entry */
-static int mvpp2_prs_double_vlan_add(struct mvpp2 *priv, unsigned short tpid1,
-				     unsigned short tpid2,
-				     unsigned int port_map)
-{
-	int tid_aux, tid, ai, ret = 0;
-	struct mvpp2_prs_entry pe;
-
-	memset(&pe, 0, sizeof(pe));
-
-	tid = mvpp2_prs_double_vlan_find(priv, tpid1, tpid2);
-
-	if (tid < 0) {
-		/* Create new tcam entry */
-		tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
-				MVPP2_PE_LAST_FREE_TID);
-		if (tid < 0)
-			return tid;
-
-		/* Set ai value for new double vlan entry */
-		ai = mvpp2_prs_double_vlan_ai_free_get(priv);
-		if (ai < 0)
-			return ai;
-
-		/* Get first single/triple vlan tid */
-		for (tid_aux = MVPP2_PE_FIRST_FREE_TID;
-		     tid_aux <= MVPP2_PE_LAST_FREE_TID; tid_aux++) {
-			unsigned int ri_bits;
-
-			if (!priv->prs_shadow[tid_aux].valid ||
-			    priv->prs_shadow[tid_aux].lu != MVPP2_PRS_LU_VLAN)
-				continue;
-
-			mvpp2_prs_init_from_hw(priv, &pe, tid_aux);
-			ri_bits = mvpp2_prs_sram_ri_get(&pe);
-			ri_bits &= MVPP2_PRS_RI_VLAN_MASK;
-			if (ri_bits == MVPP2_PRS_RI_VLAN_SINGLE ||
-			    ri_bits == MVPP2_PRS_RI_VLAN_TRIPLE)
-				break;
-		}
-
-		if (tid >= tid_aux)
-			return -ERANGE;
-
-		memset(&pe, 0, sizeof(pe));
-		mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_VLAN);
-		pe.index = tid;
-
-		priv->prs_double_vlans[ai] = true;
-
-		mvpp2_prs_match_etype(&pe, 0, tpid1);
-		mvpp2_prs_match_etype(&pe, 4, tpid2);
-
-		mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_VLAN);
-		/* Shift 4 bytes - skip outer vlan tag */
-		mvpp2_prs_sram_shift_set(&pe, MVPP2_VLAN_TAG_LEN,
-					 MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
-		mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_VLAN_DOUBLE,
-					 MVPP2_PRS_RI_VLAN_MASK);
-		mvpp2_prs_sram_ai_update(&pe, ai | MVPP2_PRS_DBL_VLAN_AI_BIT,
-					 MVPP2_PRS_SRAM_AI_MASK);
-
-		mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_VLAN);
-	} else {
-		mvpp2_prs_init_from_hw(priv, &pe, tid);
-	}
-
-	/* Update ports' mask */
-	mvpp2_prs_tcam_port_map_set(&pe, port_map);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	return ret;
-}
-
-/* IPv4 header parsing for fragmentation and L4 offset */
-static int mvpp2_prs_ip4_proto(struct mvpp2 *priv, unsigned short proto,
-			       unsigned int ri, unsigned int ri_mask)
-{
-	struct mvpp2_prs_entry pe;
-	int tid;
-
-	if ((proto != IPPROTO_TCP) && (proto != IPPROTO_UDP) &&
-	    (proto != IPPROTO_IGMP))
-		return -EINVAL;
-
-	/* Not fragmented packet */
-	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
-					MVPP2_PE_LAST_FREE_TID);
-	if (tid < 0)
-		return tid;
-
-	memset(&pe, 0, sizeof(pe));
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_IP4);
-	pe.index = tid;
-
-	/* Set next lu to IPv4 */
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_IP4);
-	mvpp2_prs_sram_shift_set(&pe, 12, MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
-	/* Set L4 offset */
-	mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L4,
-				  sizeof(struct iphdr) - 4,
-				  MVPP2_PRS_SRAM_OP_SEL_UDF_ADD);
-	mvpp2_prs_sram_ai_update(&pe, MVPP2_PRS_IPV4_DIP_AI_BIT,
-				 MVPP2_PRS_IPV4_DIP_AI_BIT);
-	mvpp2_prs_sram_ri_update(&pe, ri, ri_mask | MVPP2_PRS_RI_IP_FRAG_MASK);
-
-	mvpp2_prs_tcam_data_byte_set(&pe, 2, 0x00,
-				     MVPP2_PRS_TCAM_PROTO_MASK_L);
-	mvpp2_prs_tcam_data_byte_set(&pe, 3, 0x00,
-				     MVPP2_PRS_TCAM_PROTO_MASK);
-
-	mvpp2_prs_tcam_data_byte_set(&pe, 5, proto, MVPP2_PRS_TCAM_PROTO_MASK);
-	mvpp2_prs_tcam_ai_update(&pe, 0, MVPP2_PRS_IPV4_DIP_AI_BIT);
-	/* Unmask all ports */
-	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
-
-	/* Update shadow table and hw entry */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP4);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	/* Fragmented packet */
-	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
-					MVPP2_PE_LAST_FREE_TID);
-	if (tid < 0)
-		return tid;
-
-	pe.index = tid;
-	/* Clear ri before updating */
-	pe.sram.word[MVPP2_PRS_SRAM_RI_WORD] = 0x0;
-	pe.sram.word[MVPP2_PRS_SRAM_RI_CTRL_WORD] = 0x0;
-	mvpp2_prs_sram_ri_update(&pe, ri, ri_mask);
-
-	mvpp2_prs_sram_ri_update(&pe, ri | MVPP2_PRS_RI_IP_FRAG_TRUE,
-				 ri_mask | MVPP2_PRS_RI_IP_FRAG_MASK);
-
-	mvpp2_prs_tcam_data_byte_set(&pe, 2, 0x00, 0x0);
-	mvpp2_prs_tcam_data_byte_set(&pe, 3, 0x00, 0x0);
-
-	/* Update shadow table and hw entry */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP4);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	return 0;
-}
-
-/* IPv4 L3 multicast or broadcast */
-static int mvpp2_prs_ip4_cast(struct mvpp2 *priv, unsigned short l3_cast)
-{
-	struct mvpp2_prs_entry pe;
-	int mask, tid;
-
-	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
-					MVPP2_PE_LAST_FREE_TID);
-	if (tid < 0)
-		return tid;
-
-	memset(&pe, 0, sizeof(pe));
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_IP4);
-	pe.index = tid;
-
-	switch (l3_cast) {
-	case MVPP2_PRS_L3_MULTI_CAST:
-		mvpp2_prs_tcam_data_byte_set(&pe, 0, MVPP2_PRS_IPV4_MC,
-					     MVPP2_PRS_IPV4_MC_MASK);
-		mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_MCAST,
-					 MVPP2_PRS_RI_L3_ADDR_MASK);
-		break;
-	case  MVPP2_PRS_L3_BROAD_CAST:
-		mask = MVPP2_PRS_IPV4_BC_MASK;
-		mvpp2_prs_tcam_data_byte_set(&pe, 0, mask, mask);
-		mvpp2_prs_tcam_data_byte_set(&pe, 1, mask, mask);
-		mvpp2_prs_tcam_data_byte_set(&pe, 2, mask, mask);
-		mvpp2_prs_tcam_data_byte_set(&pe, 3, mask, mask);
-		mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_BCAST,
-					 MVPP2_PRS_RI_L3_ADDR_MASK);
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	/* Finished: go to flowid generation */
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
-	mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_GEN_BIT, 1);
-
-	mvpp2_prs_tcam_ai_update(&pe, MVPP2_PRS_IPV4_DIP_AI_BIT,
-				 MVPP2_PRS_IPV4_DIP_AI_BIT);
-	/* Unmask all ports */
-	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
-
-	/* Update shadow table and hw entry */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP4);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	return 0;
-}
-
-/* Set entries for protocols over IPv6  */
-static int mvpp2_prs_ip6_proto(struct mvpp2 *priv, unsigned short proto,
-			       unsigned int ri, unsigned int ri_mask)
-{
-	struct mvpp2_prs_entry pe;
-	int tid;
-
-	if ((proto != IPPROTO_TCP) && (proto != IPPROTO_UDP) &&
-	    (proto != IPPROTO_ICMPV6) && (proto != IPPROTO_IPIP))
-		return -EINVAL;
-
-	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
-					MVPP2_PE_LAST_FREE_TID);
-	if (tid < 0)
-		return tid;
-
-	memset(&pe, 0, sizeof(pe));
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_IP6);
-	pe.index = tid;
-
-	/* Finished: go to flowid generation */
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
-	mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_GEN_BIT, 1);
-	mvpp2_prs_sram_ri_update(&pe, ri, ri_mask);
-	mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L4,
-				  sizeof(struct ipv6hdr) - 6,
-				  MVPP2_PRS_SRAM_OP_SEL_UDF_ADD);
-
-	mvpp2_prs_tcam_data_byte_set(&pe, 0, proto, MVPP2_PRS_TCAM_PROTO_MASK);
-	mvpp2_prs_tcam_ai_update(&pe, MVPP2_PRS_IPV6_NO_EXT_AI_BIT,
-				 MVPP2_PRS_IPV6_NO_EXT_AI_BIT);
-	/* Unmask all ports */
-	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
-
-	/* Write HW */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP6);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	return 0;
-}
-
-/* IPv6 L3 multicast entry */
-static int mvpp2_prs_ip6_cast(struct mvpp2 *priv, unsigned short l3_cast)
-{
-	struct mvpp2_prs_entry pe;
-	int tid;
-
-	if (l3_cast != MVPP2_PRS_L3_MULTI_CAST)
-		return -EINVAL;
-
-	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
-					MVPP2_PE_LAST_FREE_TID);
-	if (tid < 0)
-		return tid;
-
-	memset(&pe, 0, sizeof(pe));
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_IP6);
-	pe.index = tid;
-
-	/* Finished: go to flowid generation */
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_IP6);
-	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_MCAST,
-				 MVPP2_PRS_RI_L3_ADDR_MASK);
-	mvpp2_prs_sram_ai_update(&pe, MVPP2_PRS_IPV6_NO_EXT_AI_BIT,
-				 MVPP2_PRS_IPV6_NO_EXT_AI_BIT);
-	/* Shift back to IPv6 NH */
-	mvpp2_prs_sram_shift_set(&pe, -18, MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
-
-	mvpp2_prs_tcam_data_byte_set(&pe, 0, MVPP2_PRS_IPV6_MC,
-				     MVPP2_PRS_IPV6_MC_MASK);
-	mvpp2_prs_tcam_ai_update(&pe, 0, MVPP2_PRS_IPV6_NO_EXT_AI_BIT);
-	/* Unmask all ports */
-	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
-
-	/* Update shadow table and hw entry */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP6);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	return 0;
-}
-
-/* Parser per-port initialization */
-static void mvpp2_prs_hw_port_init(struct mvpp2 *priv, int port, int lu_first,
-				   int lu_max, int offset)
-{
-	u32 val;
-
-	/* Set lookup ID */
-	val = mvpp2_read(priv, MVPP2_PRS_INIT_LOOKUP_REG);
-	val &= ~MVPP2_PRS_PORT_LU_MASK(port);
-	val |=  MVPP2_PRS_PORT_LU_VAL(port, lu_first);
-	mvpp2_write(priv, MVPP2_PRS_INIT_LOOKUP_REG, val);
-
-	/* Set maximum number of loops for packet received from port */
-	val = mvpp2_read(priv, MVPP2_PRS_MAX_LOOP_REG(port));
-	val &= ~MVPP2_PRS_MAX_LOOP_MASK(port);
-	val |= MVPP2_PRS_MAX_LOOP_VAL(port, lu_max);
-	mvpp2_write(priv, MVPP2_PRS_MAX_LOOP_REG(port), val);
-
-	/* Set initial offset for packet header extraction for the first
-	 * searching loop
-	 */
-	val = mvpp2_read(priv, MVPP2_PRS_INIT_OFFS_REG(port));
-	val &= ~MVPP2_PRS_INIT_OFF_MASK(port);
-	val |= MVPP2_PRS_INIT_OFF_VAL(port, offset);
-	mvpp2_write(priv, MVPP2_PRS_INIT_OFFS_REG(port), val);
-}
-
-/* Default flow entries initialization for all ports */
-static void mvpp2_prs_def_flow_init(struct mvpp2 *priv)
-{
-	struct mvpp2_prs_entry pe;
-	int port;
-
-	for (port = 0; port < MVPP2_MAX_PORTS; port++) {
-		memset(&pe, 0, sizeof(pe));
-		mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
-		pe.index = MVPP2_PE_FIRST_DEFAULT_FLOW - port;
-
-		/* Mask all ports */
-		mvpp2_prs_tcam_port_map_set(&pe, 0);
-
-		/* Set flow ID*/
-		mvpp2_prs_sram_ai_update(&pe, port, MVPP2_PRS_FLOW_ID_MASK);
-		mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_DONE_BIT, 1);
-
-		/* Update shadow table and hw entry */
-		mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_FLOWS);
-		mvpp2_prs_hw_write(priv, &pe);
-	}
-}
-
-/* Set default entry for Marvell Header field */
-static void mvpp2_prs_mh_init(struct mvpp2 *priv)
-{
-	struct mvpp2_prs_entry pe;
-
-	memset(&pe, 0, sizeof(pe));
-
-	pe.index = MVPP2_PE_MH_DEFAULT;
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_MH);
-	mvpp2_prs_sram_shift_set(&pe, MVPP2_MH_SIZE,
-				 MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_MAC);
-
-	/* Unmask all ports */
-	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
-
-	/* Update shadow table and hw entry */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_MH);
-	mvpp2_prs_hw_write(priv, &pe);
-}
-
-/* Set default entires (place holder) for promiscuous, non-promiscuous and
- * multicast MAC addresses
- */
-static void mvpp2_prs_mac_init(struct mvpp2 *priv)
-{
-	struct mvpp2_prs_entry pe;
-
-	memset(&pe, 0, sizeof(pe));
-
-	/* Non-promiscuous mode for all ports - DROP unknown packets */
-	pe.index = MVPP2_PE_MAC_NON_PROMISCUOUS;
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_MAC);
-
-	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_DROP_MASK,
-				 MVPP2_PRS_RI_DROP_MASK);
-	mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_GEN_BIT, 1);
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
-
-	/* Unmask all ports */
-	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
-
-	/* Update shadow table and hw entry */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_MAC);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	/* Create dummy entries for drop all and promiscuous modes */
-	mvpp2_prs_mac_drop_all_set(priv, 0, false);
-	mvpp2_prs_mac_promisc_set(priv, 0, MVPP2_PRS_L2_UNI_CAST, false);
-	mvpp2_prs_mac_promisc_set(priv, 0, MVPP2_PRS_L2_MULTI_CAST, false);
-}
-
-/* Set default entries for various types of dsa packets */
-static void mvpp2_prs_dsa_init(struct mvpp2 *priv)
-{
-	struct mvpp2_prs_entry pe;
-
-	/* None tagged EDSA entry - place holder */
-	mvpp2_prs_dsa_tag_set(priv, 0, false, MVPP2_PRS_UNTAGGED,
-			      MVPP2_PRS_EDSA);
-
-	/* Tagged EDSA entry - place holder */
-	mvpp2_prs_dsa_tag_set(priv, 0, false, MVPP2_PRS_TAGGED, MVPP2_PRS_EDSA);
-
-	/* None tagged DSA entry - place holder */
-	mvpp2_prs_dsa_tag_set(priv, 0, false, MVPP2_PRS_UNTAGGED,
-			      MVPP2_PRS_DSA);
-
-	/* Tagged DSA entry - place holder */
-	mvpp2_prs_dsa_tag_set(priv, 0, false, MVPP2_PRS_TAGGED, MVPP2_PRS_DSA);
-
-	/* None tagged EDSA ethertype entry - place holder*/
-	mvpp2_prs_dsa_tag_ethertype_set(priv, 0, false,
-					MVPP2_PRS_UNTAGGED, MVPP2_PRS_EDSA);
-
-	/* Tagged EDSA ethertype entry - place holder*/
-	mvpp2_prs_dsa_tag_ethertype_set(priv, 0, false,
-					MVPP2_PRS_TAGGED, MVPP2_PRS_EDSA);
-
-	/* None tagged DSA ethertype entry */
-	mvpp2_prs_dsa_tag_ethertype_set(priv, 0, true,
-					MVPP2_PRS_UNTAGGED, MVPP2_PRS_DSA);
-
-	/* Tagged DSA ethertype entry */
-	mvpp2_prs_dsa_tag_ethertype_set(priv, 0, true,
-					MVPP2_PRS_TAGGED, MVPP2_PRS_DSA);
-
-	/* Set default entry, in case DSA or EDSA tag not found */
-	memset(&pe, 0, sizeof(pe));
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_DSA);
-	pe.index = MVPP2_PE_DSA_DEFAULT;
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_VLAN);
-
-	/* Shift 0 bytes */
-	mvpp2_prs_sram_shift_set(&pe, 0, MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_MAC);
-
-	/* Clear all sram ai bits for next iteration */
-	mvpp2_prs_sram_ai_update(&pe, 0, MVPP2_PRS_SRAM_AI_MASK);
-
-	/* Unmask all ports */
-	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
-
-	mvpp2_prs_hw_write(priv, &pe);
-}
-
-/* Initialize parser entries for VID filtering */
-static void mvpp2_prs_vid_init(struct mvpp2 *priv)
-{
-	struct mvpp2_prs_entry pe;
-
-	memset(&pe, 0, sizeof(pe));
-
-	/* Set default vid entry */
-	pe.index = MVPP2_PE_VID_FLTR_DEFAULT;
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_VID);
-
-	mvpp2_prs_tcam_ai_update(&pe, 0, MVPP2_PRS_EDSA_VID_AI_BIT);
-
-	/* Skip VLAN header - Set offset to 4 bytes */
-	mvpp2_prs_sram_shift_set(&pe, MVPP2_VLAN_TAG_LEN,
-				 MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
-
-	/* Clear all ai bits for next iteration */
-	mvpp2_prs_sram_ai_update(&pe, 0, MVPP2_PRS_SRAM_AI_MASK);
-
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_L2);
-
-	/* Unmask all ports */
-	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
-
-	/* Update shadow table and hw entry */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_VID);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	/* Set default vid entry for extended DSA*/
-	memset(&pe, 0, sizeof(pe));
-
-	/* Set default vid entry */
-	pe.index = MVPP2_PE_VID_EDSA_FLTR_DEFAULT;
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_VID);
-
-	mvpp2_prs_tcam_ai_update(&pe, MVPP2_PRS_EDSA_VID_AI_BIT,
-				 MVPP2_PRS_EDSA_VID_AI_BIT);
-
-	/* Skip VLAN header - Set offset to 8 bytes */
-	mvpp2_prs_sram_shift_set(&pe, MVPP2_VLAN_TAG_EDSA_LEN,
-				 MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
-
-	/* Clear all ai bits for next iteration */
-	mvpp2_prs_sram_ai_update(&pe, 0, MVPP2_PRS_SRAM_AI_MASK);
-
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_L2);
-
-	/* Unmask all ports */
-	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
-
-	/* Update shadow table and hw entry */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_VID);
-	mvpp2_prs_hw_write(priv, &pe);
-}
-
-/* Match basic ethertypes */
-static int mvpp2_prs_etype_init(struct mvpp2 *priv)
-{
-	struct mvpp2_prs_entry pe;
-	int tid;
-
-	/* Ethertype: PPPoE */
-	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
-					MVPP2_PE_LAST_FREE_TID);
-	if (tid < 0)
-		return tid;
-
-	memset(&pe, 0, sizeof(pe));
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_L2);
-	pe.index = tid;
-
-	mvpp2_prs_match_etype(&pe, 0, ETH_P_PPP_SES);
-
-	mvpp2_prs_sram_shift_set(&pe, MVPP2_PPPOE_HDR_SIZE,
-				 MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_PPPOE);
-	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_PPPOE_MASK,
-				 MVPP2_PRS_RI_PPPOE_MASK);
-
-	/* Update shadow table and hw entry */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_L2);
-	priv->prs_shadow[pe.index].udf = MVPP2_PRS_UDF_L2_DEF;
-	priv->prs_shadow[pe.index].finish = false;
-	mvpp2_prs_shadow_ri_set(priv, pe.index, MVPP2_PRS_RI_PPPOE_MASK,
-				MVPP2_PRS_RI_PPPOE_MASK);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	/* Ethertype: ARP */
-	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
-					MVPP2_PE_LAST_FREE_TID);
-	if (tid < 0)
-		return tid;
-
-	memset(&pe, 0, sizeof(pe));
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_L2);
-	pe.index = tid;
-
-	mvpp2_prs_match_etype(&pe, 0, ETH_P_ARP);
-
-	/* Generate flow in the next iteration*/
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
-	mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_GEN_BIT, 1);
-	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_ARP,
-				 MVPP2_PRS_RI_L3_PROTO_MASK);
-	/* Set L3 offset */
-	mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L3,
-				  MVPP2_ETH_TYPE_LEN,
-				  MVPP2_PRS_SRAM_OP_SEL_UDF_ADD);
-
-	/* Update shadow table and hw entry */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_L2);
-	priv->prs_shadow[pe.index].udf = MVPP2_PRS_UDF_L2_DEF;
-	priv->prs_shadow[pe.index].finish = true;
-	mvpp2_prs_shadow_ri_set(priv, pe.index, MVPP2_PRS_RI_L3_ARP,
-				MVPP2_PRS_RI_L3_PROTO_MASK);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	/* Ethertype: LBTD */
-	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
-					MVPP2_PE_LAST_FREE_TID);
-	if (tid < 0)
-		return tid;
-
-	memset(&pe, 0, sizeof(pe));
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_L2);
-	pe.index = tid;
-
-	mvpp2_prs_match_etype(&pe, 0, MVPP2_IP_LBDT_TYPE);
-
-	/* Generate flow in the next iteration*/
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
-	mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_GEN_BIT, 1);
-	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_CPU_CODE_RX_SPEC |
-				 MVPP2_PRS_RI_UDF3_RX_SPECIAL,
-				 MVPP2_PRS_RI_CPU_CODE_MASK |
-				 MVPP2_PRS_RI_UDF3_MASK);
-	/* Set L3 offset */
-	mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L3,
-				  MVPP2_ETH_TYPE_LEN,
-				  MVPP2_PRS_SRAM_OP_SEL_UDF_ADD);
-
-	/* Update shadow table and hw entry */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_L2);
-	priv->prs_shadow[pe.index].udf = MVPP2_PRS_UDF_L2_DEF;
-	priv->prs_shadow[pe.index].finish = true;
-	mvpp2_prs_shadow_ri_set(priv, pe.index, MVPP2_PRS_RI_CPU_CODE_RX_SPEC |
-				MVPP2_PRS_RI_UDF3_RX_SPECIAL,
-				MVPP2_PRS_RI_CPU_CODE_MASK |
-				MVPP2_PRS_RI_UDF3_MASK);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	/* Ethertype: IPv4 without options */
-	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
-					MVPP2_PE_LAST_FREE_TID);
-	if (tid < 0)
-		return tid;
-
-	memset(&pe, 0, sizeof(pe));
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_L2);
-	pe.index = tid;
-
-	mvpp2_prs_match_etype(&pe, 0, ETH_P_IP);
-	mvpp2_prs_tcam_data_byte_set(&pe, MVPP2_ETH_TYPE_LEN,
-				     MVPP2_PRS_IPV4_HEAD | MVPP2_PRS_IPV4_IHL,
-				     MVPP2_PRS_IPV4_HEAD_MASK |
-				     MVPP2_PRS_IPV4_IHL_MASK);
-
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_IP4);
-	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_IP4,
-				 MVPP2_PRS_RI_L3_PROTO_MASK);
-	/* Skip eth_type + 4 bytes of IP header */
-	mvpp2_prs_sram_shift_set(&pe, MVPP2_ETH_TYPE_LEN + 4,
-				 MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
-	/* Set L3 offset */
-	mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L3,
-				  MVPP2_ETH_TYPE_LEN,
-				  MVPP2_PRS_SRAM_OP_SEL_UDF_ADD);
-
-	/* Update shadow table and hw entry */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_L2);
-	priv->prs_shadow[pe.index].udf = MVPP2_PRS_UDF_L2_DEF;
-	priv->prs_shadow[pe.index].finish = false;
-	mvpp2_prs_shadow_ri_set(priv, pe.index, MVPP2_PRS_RI_L3_IP4,
-				MVPP2_PRS_RI_L3_PROTO_MASK);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	/* Ethertype: IPv4 with options */
-	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
-					MVPP2_PE_LAST_FREE_TID);
-	if (tid < 0)
-		return tid;
-
-	pe.index = tid;
-
-	/* Clear tcam data before updating */
-	pe.tcam.byte[MVPP2_PRS_TCAM_DATA_BYTE(MVPP2_ETH_TYPE_LEN)] = 0x0;
-	pe.tcam.byte[MVPP2_PRS_TCAM_DATA_BYTE_EN(MVPP2_ETH_TYPE_LEN)] = 0x0;
-
-	mvpp2_prs_tcam_data_byte_set(&pe, MVPP2_ETH_TYPE_LEN,
-				     MVPP2_PRS_IPV4_HEAD,
-				     MVPP2_PRS_IPV4_HEAD_MASK);
-
-	/* Clear ri before updating */
-	pe.sram.word[MVPP2_PRS_SRAM_RI_WORD] = 0x0;
-	pe.sram.word[MVPP2_PRS_SRAM_RI_CTRL_WORD] = 0x0;
-	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_IP4_OPT,
-				 MVPP2_PRS_RI_L3_PROTO_MASK);
-
-	/* Update shadow table and hw entry */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_L2);
-	priv->prs_shadow[pe.index].udf = MVPP2_PRS_UDF_L2_DEF;
-	priv->prs_shadow[pe.index].finish = false;
-	mvpp2_prs_shadow_ri_set(priv, pe.index, MVPP2_PRS_RI_L3_IP4_OPT,
-				MVPP2_PRS_RI_L3_PROTO_MASK);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	/* Ethertype: IPv6 without options */
-	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
-					MVPP2_PE_LAST_FREE_TID);
-	if (tid < 0)
-		return tid;
-
-	memset(&pe, 0, sizeof(pe));
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_L2);
-	pe.index = tid;
-
-	mvpp2_prs_match_etype(&pe, 0, ETH_P_IPV6);
-
-	/* Skip DIP of IPV6 header */
-	mvpp2_prs_sram_shift_set(&pe, MVPP2_ETH_TYPE_LEN + 8 +
-				 MVPP2_MAX_L3_ADDR_SIZE,
-				 MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_IP6);
-	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_IP6,
-				 MVPP2_PRS_RI_L3_PROTO_MASK);
-	/* Set L3 offset */
-	mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L3,
-				  MVPP2_ETH_TYPE_LEN,
-				  MVPP2_PRS_SRAM_OP_SEL_UDF_ADD);
-
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_L2);
-	priv->prs_shadow[pe.index].udf = MVPP2_PRS_UDF_L2_DEF;
-	priv->prs_shadow[pe.index].finish = false;
-	mvpp2_prs_shadow_ri_set(priv, pe.index, MVPP2_PRS_RI_L3_IP6,
-				MVPP2_PRS_RI_L3_PROTO_MASK);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	/* Default entry for MVPP2_PRS_LU_L2 - Unknown ethtype */
-	memset(&pe, 0, sizeof(struct mvpp2_prs_entry));
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_L2);
-	pe.index = MVPP2_PE_ETH_TYPE_UN;
-
-	/* Unmask all ports */
-	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
-
-	/* Generate flow in the next iteration*/
-	mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_GEN_BIT, 1);
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
-	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_UN,
-				 MVPP2_PRS_RI_L3_PROTO_MASK);
-	/* Set L3 offset even it's unknown L3 */
-	mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L3,
-				  MVPP2_ETH_TYPE_LEN,
-				  MVPP2_PRS_SRAM_OP_SEL_UDF_ADD);
-
-	/* Update shadow table and hw entry */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_L2);
-	priv->prs_shadow[pe.index].udf = MVPP2_PRS_UDF_L2_DEF;
-	priv->prs_shadow[pe.index].finish = true;
-	mvpp2_prs_shadow_ri_set(priv, pe.index, MVPP2_PRS_RI_L3_UN,
-				MVPP2_PRS_RI_L3_PROTO_MASK);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	return 0;
-}
-
-/* Configure vlan entries and detect up to 2 successive VLAN tags.
- * Possible options:
- * 0x8100, 0x88A8
- * 0x8100, 0x8100
- * 0x8100
- * 0x88A8
- */
-static int mvpp2_prs_vlan_init(struct platform_device *pdev, struct mvpp2 *priv)
-{
-	struct mvpp2_prs_entry pe;
-	int err;
-
-	priv->prs_double_vlans = devm_kcalloc(&pdev->dev, sizeof(bool),
-					      MVPP2_PRS_DBL_VLANS_MAX,
-					      GFP_KERNEL);
-	if (!priv->prs_double_vlans)
-		return -ENOMEM;
-
-	/* Double VLAN: 0x8100, 0x88A8 */
-	err = mvpp2_prs_double_vlan_add(priv, ETH_P_8021Q, ETH_P_8021AD,
-					MVPP2_PRS_PORT_MASK);
-	if (err)
-		return err;
-
-	/* Double VLAN: 0x8100, 0x8100 */
-	err = mvpp2_prs_double_vlan_add(priv, ETH_P_8021Q, ETH_P_8021Q,
-					MVPP2_PRS_PORT_MASK);
-	if (err)
-		return err;
-
-	/* Single VLAN: 0x88a8 */
-	err = mvpp2_prs_vlan_add(priv, ETH_P_8021AD, MVPP2_PRS_SINGLE_VLAN_AI,
-				 MVPP2_PRS_PORT_MASK);
-	if (err)
-		return err;
-
-	/* Single VLAN: 0x8100 */
-	err = mvpp2_prs_vlan_add(priv, ETH_P_8021Q, MVPP2_PRS_SINGLE_VLAN_AI,
-				 MVPP2_PRS_PORT_MASK);
-	if (err)
-		return err;
-
-	/* Set default double vlan entry */
-	memset(&pe, 0, sizeof(pe));
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_VLAN);
-	pe.index = MVPP2_PE_VLAN_DBL;
-
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_VID);
-
-	/* Clear ai for next iterations */
-	mvpp2_prs_sram_ai_update(&pe, 0, MVPP2_PRS_SRAM_AI_MASK);
-	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_VLAN_DOUBLE,
-				 MVPP2_PRS_RI_VLAN_MASK);
-
-	mvpp2_prs_tcam_ai_update(&pe, MVPP2_PRS_DBL_VLAN_AI_BIT,
-				 MVPP2_PRS_DBL_VLAN_AI_BIT);
-	/* Unmask all ports */
-	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
-
-	/* Update shadow table and hw entry */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_VLAN);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	/* Set default vlan none entry */
-	memset(&pe, 0, sizeof(pe));
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_VLAN);
-	pe.index = MVPP2_PE_VLAN_NONE;
-
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_L2);
-	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_VLAN_NONE,
-				 MVPP2_PRS_RI_VLAN_MASK);
-
-	/* Unmask all ports */
-	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
-
-	/* Update shadow table and hw entry */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_VLAN);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	return 0;
-}
-
-/* Set entries for PPPoE ethertype */
-static int mvpp2_prs_pppoe_init(struct mvpp2 *priv)
-{
-	struct mvpp2_prs_entry pe;
-	int tid;
-
-	/* IPv4 over PPPoE with options */
-	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
-					MVPP2_PE_LAST_FREE_TID);
-	if (tid < 0)
-		return tid;
-
-	memset(&pe, 0, sizeof(pe));
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_PPPOE);
-	pe.index = tid;
-
-	mvpp2_prs_match_etype(&pe, 0, PPP_IP);
-
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_IP4);
-	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_IP4_OPT,
-				 MVPP2_PRS_RI_L3_PROTO_MASK);
-	/* Skip eth_type + 4 bytes of IP header */
-	mvpp2_prs_sram_shift_set(&pe, MVPP2_ETH_TYPE_LEN + 4,
-				 MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
-	/* Set L3 offset */
-	mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L3,
-				  MVPP2_ETH_TYPE_LEN,
-				  MVPP2_PRS_SRAM_OP_SEL_UDF_ADD);
-
-	/* Update shadow table and hw entry */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_PPPOE);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	/* IPv4 over PPPoE without options */
-	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
-					MVPP2_PE_LAST_FREE_TID);
-	if (tid < 0)
-		return tid;
-
-	pe.index = tid;
-
-	mvpp2_prs_tcam_data_byte_set(&pe, MVPP2_ETH_TYPE_LEN,
-				     MVPP2_PRS_IPV4_HEAD | MVPP2_PRS_IPV4_IHL,
-				     MVPP2_PRS_IPV4_HEAD_MASK |
-				     MVPP2_PRS_IPV4_IHL_MASK);
-
-	/* Clear ri before updating */
-	pe.sram.word[MVPP2_PRS_SRAM_RI_WORD] = 0x0;
-	pe.sram.word[MVPP2_PRS_SRAM_RI_CTRL_WORD] = 0x0;
-	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_IP4,
-				 MVPP2_PRS_RI_L3_PROTO_MASK);
-
-	/* Update shadow table and hw entry */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_PPPOE);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	/* IPv6 over PPPoE */
-	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
-					MVPP2_PE_LAST_FREE_TID);
-	if (tid < 0)
-		return tid;
-
-	memset(&pe, 0, sizeof(pe));
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_PPPOE);
-	pe.index = tid;
-
-	mvpp2_prs_match_etype(&pe, 0, PPP_IPV6);
-
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_IP6);
-	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_IP6,
-				 MVPP2_PRS_RI_L3_PROTO_MASK);
-	/* Skip eth_type + 4 bytes of IPv6 header */
-	mvpp2_prs_sram_shift_set(&pe, MVPP2_ETH_TYPE_LEN + 4,
-				 MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
-	/* Set L3 offset */
-	mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L3,
-				  MVPP2_ETH_TYPE_LEN,
-				  MVPP2_PRS_SRAM_OP_SEL_UDF_ADD);
-
-	/* Update shadow table and hw entry */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_PPPOE);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	/* Non-IP over PPPoE */
-	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
-					MVPP2_PE_LAST_FREE_TID);
-	if (tid < 0)
-		return tid;
-
-	memset(&pe, 0, sizeof(pe));
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_PPPOE);
-	pe.index = tid;
-
-	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_UN,
-				 MVPP2_PRS_RI_L3_PROTO_MASK);
-
-	/* Finished: go to flowid generation */
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
-	mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_GEN_BIT, 1);
-	/* Set L3 offset even if it's unknown L3 */
-	mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L3,
-				  MVPP2_ETH_TYPE_LEN,
-				  MVPP2_PRS_SRAM_OP_SEL_UDF_ADD);
-
-	/* Update shadow table and hw entry */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_PPPOE);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	return 0;
-}
-
-/* Initialize entries for IPv4 */
-static int mvpp2_prs_ip4_init(struct mvpp2 *priv)
-{
-	struct mvpp2_prs_entry pe;
-	int err;
-
-	/* Set entries for TCP, UDP and IGMP over IPv4 */
-	err = mvpp2_prs_ip4_proto(priv, IPPROTO_TCP, MVPP2_PRS_RI_L4_TCP,
-				  MVPP2_PRS_RI_L4_PROTO_MASK);
-	if (err)
-		return err;
-
-	err = mvpp2_prs_ip4_proto(priv, IPPROTO_UDP, MVPP2_PRS_RI_L4_UDP,
-				  MVPP2_PRS_RI_L4_PROTO_MASK);
-	if (err)
-		return err;
-
-	err = mvpp2_prs_ip4_proto(priv, IPPROTO_IGMP,
-				  MVPP2_PRS_RI_CPU_CODE_RX_SPEC |
-				  MVPP2_PRS_RI_UDF3_RX_SPECIAL,
-				  MVPP2_PRS_RI_CPU_CODE_MASK |
-				  MVPP2_PRS_RI_UDF3_MASK);
-	if (err)
-		return err;
-
-	/* IPv4 Broadcast */
-	err = mvpp2_prs_ip4_cast(priv, MVPP2_PRS_L3_BROAD_CAST);
-	if (err)
-		return err;
-
-	/* IPv4 Multicast */
-	err = mvpp2_prs_ip4_cast(priv, MVPP2_PRS_L3_MULTI_CAST);
-	if (err)
-		return err;
-
-	/* Default IPv4 entry for unknown protocols */
-	memset(&pe, 0, sizeof(pe));
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_IP4);
-	pe.index = MVPP2_PE_IP4_PROTO_UN;
-
-	/* Set next lu to IPv4 */
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_IP4);
-	mvpp2_prs_sram_shift_set(&pe, 12, MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
-	/* Set L4 offset */
-	mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L4,
-				  sizeof(struct iphdr) - 4,
-				  MVPP2_PRS_SRAM_OP_SEL_UDF_ADD);
-	mvpp2_prs_sram_ai_update(&pe, MVPP2_PRS_IPV4_DIP_AI_BIT,
-				 MVPP2_PRS_IPV4_DIP_AI_BIT);
-	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L4_OTHER,
-				 MVPP2_PRS_RI_L4_PROTO_MASK);
-
-	mvpp2_prs_tcam_ai_update(&pe, 0, MVPP2_PRS_IPV4_DIP_AI_BIT);
-	/* Unmask all ports */
-	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
-
-	/* Update shadow table and hw entry */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP4);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	/* Default IPv4 entry for unicast address */
-	memset(&pe, 0, sizeof(pe));
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_IP4);
-	pe.index = MVPP2_PE_IP4_ADDR_UN;
-
-	/* Finished: go to flowid generation */
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
-	mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_GEN_BIT, 1);
-	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_UCAST,
-				 MVPP2_PRS_RI_L3_ADDR_MASK);
-
-	mvpp2_prs_tcam_ai_update(&pe, MVPP2_PRS_IPV4_DIP_AI_BIT,
-				 MVPP2_PRS_IPV4_DIP_AI_BIT);
-	/* Unmask all ports */
-	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
-
-	/* Update shadow table and hw entry */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP4);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	return 0;
-}
-
-/* Initialize entries for IPv6 */
-static int mvpp2_prs_ip6_init(struct mvpp2 *priv)
-{
-	struct mvpp2_prs_entry pe;
-	int tid, err;
-
-	/* Set entries for TCP, UDP and ICMP over IPv6 */
-	err = mvpp2_prs_ip6_proto(priv, IPPROTO_TCP,
-				  MVPP2_PRS_RI_L4_TCP,
-				  MVPP2_PRS_RI_L4_PROTO_MASK);
-	if (err)
-		return err;
-
-	err = mvpp2_prs_ip6_proto(priv, IPPROTO_UDP,
-				  MVPP2_PRS_RI_L4_UDP,
-				  MVPP2_PRS_RI_L4_PROTO_MASK);
-	if (err)
-		return err;
-
-	err = mvpp2_prs_ip6_proto(priv, IPPROTO_ICMPV6,
-				  MVPP2_PRS_RI_CPU_CODE_RX_SPEC |
-				  MVPP2_PRS_RI_UDF3_RX_SPECIAL,
-				  MVPP2_PRS_RI_CPU_CODE_MASK |
-				  MVPP2_PRS_RI_UDF3_MASK);
-	if (err)
-		return err;
-
-	/* IPv4 is the last header. This is similar case as 6-TCP or 17-UDP */
-	/* Result Info: UDF7=1, DS lite */
-	err = mvpp2_prs_ip6_proto(priv, IPPROTO_IPIP,
-				  MVPP2_PRS_RI_UDF7_IP6_LITE,
-				  MVPP2_PRS_RI_UDF7_MASK);
-	if (err)
-		return err;
-
-	/* IPv6 multicast */
-	err = mvpp2_prs_ip6_cast(priv, MVPP2_PRS_L3_MULTI_CAST);
-	if (err)
-		return err;
-
-	/* Entry for checking hop limit */
-	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
-					MVPP2_PE_LAST_FREE_TID);
-	if (tid < 0)
-		return tid;
-
-	memset(&pe, 0, sizeof(pe));
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_IP6);
-	pe.index = tid;
-
-	/* Finished: go to flowid generation */
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
-	mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_GEN_BIT, 1);
-	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_UN |
-				 MVPP2_PRS_RI_DROP_MASK,
-				 MVPP2_PRS_RI_L3_PROTO_MASK |
-				 MVPP2_PRS_RI_DROP_MASK);
-
-	mvpp2_prs_tcam_data_byte_set(&pe, 1, 0x00, MVPP2_PRS_IPV6_HOP_MASK);
-	mvpp2_prs_tcam_ai_update(&pe, MVPP2_PRS_IPV6_NO_EXT_AI_BIT,
-				 MVPP2_PRS_IPV6_NO_EXT_AI_BIT);
-
-	/* Update shadow table and hw entry */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP4);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	/* Default IPv6 entry for unknown protocols */
-	memset(&pe, 0, sizeof(pe));
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_IP6);
-	pe.index = MVPP2_PE_IP6_PROTO_UN;
-
-	/* Finished: go to flowid generation */
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
-	mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_GEN_BIT, 1);
-	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L4_OTHER,
-				 MVPP2_PRS_RI_L4_PROTO_MASK);
-	/* Set L4 offset relatively to our current place */
-	mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L4,
-				  sizeof(struct ipv6hdr) - 4,
-				  MVPP2_PRS_SRAM_OP_SEL_UDF_ADD);
-
-	mvpp2_prs_tcam_ai_update(&pe, MVPP2_PRS_IPV6_NO_EXT_AI_BIT,
-				 MVPP2_PRS_IPV6_NO_EXT_AI_BIT);
-	/* Unmask all ports */
-	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
-
-	/* Update shadow table and hw entry */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP4);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	/* Default IPv6 entry for unknown ext protocols */
-	memset(&pe, 0, sizeof(struct mvpp2_prs_entry));
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_IP6);
-	pe.index = MVPP2_PE_IP6_EXT_PROTO_UN;
-
-	/* Finished: go to flowid generation */
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
-	mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_GEN_BIT, 1);
-	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L4_OTHER,
-				 MVPP2_PRS_RI_L4_PROTO_MASK);
-
-	mvpp2_prs_tcam_ai_update(&pe, MVPP2_PRS_IPV6_EXT_AI_BIT,
-				 MVPP2_PRS_IPV6_EXT_AI_BIT);
-	/* Unmask all ports */
-	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
-
-	/* Update shadow table and hw entry */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP4);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	/* Default IPv6 entry for unicast address */
-	memset(&pe, 0, sizeof(struct mvpp2_prs_entry));
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_IP6);
-	pe.index = MVPP2_PE_IP6_ADDR_UN;
-
-	/* Finished: go to IPv6 again */
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_IP6);
-	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_UCAST,
-				 MVPP2_PRS_RI_L3_ADDR_MASK);
-	mvpp2_prs_sram_ai_update(&pe, MVPP2_PRS_IPV6_NO_EXT_AI_BIT,
-				 MVPP2_PRS_IPV6_NO_EXT_AI_BIT);
-	/* Shift back to IPV6 NH */
-	mvpp2_prs_sram_shift_set(&pe, -18, MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
-
-	mvpp2_prs_tcam_ai_update(&pe, 0, MVPP2_PRS_IPV6_NO_EXT_AI_BIT);
-	/* Unmask all ports */
-	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
-
-	/* Update shadow table and hw entry */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP6);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	return 0;
-}
-
-/* Find tcam entry with matched pair <vid,port> */
-static int mvpp2_prs_vid_range_find(struct mvpp2 *priv, int pmap, u16 vid,
-				    u16 mask)
-{
-	unsigned char byte[2], enable[2];
-	struct mvpp2_prs_entry pe;
-	u16 rvid, rmask;
-	int tid;
-
-	/* Go through the all entries with MVPP2_PRS_LU_VID */
-	for (tid = MVPP2_PE_VID_FILT_RANGE_START;
-	     tid <= MVPP2_PE_VID_FILT_RANGE_END; tid++) {
-		if (!priv->prs_shadow[tid].valid ||
-		    priv->prs_shadow[tid].lu != MVPP2_PRS_LU_VID)
-			continue;
-
-		mvpp2_prs_init_from_hw(priv, &pe, tid);
-
-		mvpp2_prs_tcam_data_byte_get(&pe, 2, &byte[0], &enable[0]);
-		mvpp2_prs_tcam_data_byte_get(&pe, 3, &byte[1], &enable[1]);
-
-		rvid = ((byte[0] & 0xf) << 8) + byte[1];
-		rmask = ((enable[0] & 0xf) << 8) + enable[1];
-
-		if (rvid != vid || rmask != mask)
-			continue;
-
-		return tid;
-	}
-
-	return -ENOENT;
-}
-
-/* Write parser entry for VID filtering */
-static int mvpp2_prs_vid_entry_add(struct mvpp2_port *port, u16 vid)
-{
-	unsigned int vid_start = MVPP2_PE_VID_FILT_RANGE_START +
-				 port->id * MVPP2_PRS_VLAN_FILT_MAX;
-	unsigned int mask = 0xfff, reg_val, shift;
-	struct mvpp2 *priv = port->priv;
-	struct mvpp2_prs_entry pe;
-	int tid;
-
-	memset(&pe, 0, sizeof(pe));
-
-	/* Scan TCAM and see if entry with this <vid,port> already exist */
-	tid = mvpp2_prs_vid_range_find(priv, (1 << port->id), vid, mask);
-
-	reg_val = mvpp2_read(priv, MVPP2_MH_REG(port->id));
-	if (reg_val & MVPP2_DSA_EXTENDED)
-		shift = MVPP2_VLAN_TAG_EDSA_LEN;
-	else
-		shift = MVPP2_VLAN_TAG_LEN;
-
-	/* No such entry */
-	if (tid < 0) {
-
-		/* Go through all entries from first to last in vlan range */
-		tid = mvpp2_prs_tcam_first_free(priv, vid_start,
-						vid_start +
-						MVPP2_PRS_VLAN_FILT_MAX_ENTRY);
-
-		/* There isn't room for a new VID filter */
-		if (tid < 0)
-			return tid;
-
-		mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_VID);
-		pe.index = tid;
-
-		/* Mask all ports */
-		mvpp2_prs_tcam_port_map_set(&pe, 0);
-	} else {
-		mvpp2_prs_init_from_hw(priv, &pe, tid);
-	}
-
-	/* Enable the current port */
-	mvpp2_prs_tcam_port_set(&pe, port->id, true);
-
-	/* Continue - set next lookup */
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_L2);
-
-	/* Skip VLAN header - Set offset to 4 or 8 bytes */
-	mvpp2_prs_sram_shift_set(&pe, shift, MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
-
-	/* Set match on VID */
-	mvpp2_prs_match_vid(&pe, MVPP2_PRS_VID_TCAM_BYTE, vid);
-
-	/* Clear all ai bits for next iteration */
-	mvpp2_prs_sram_ai_update(&pe, 0, MVPP2_PRS_SRAM_AI_MASK);
-
-	/* Update shadow table */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_VID);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	return 0;
-}
-
-/* Write parser entry for VID filtering */
-static void mvpp2_prs_vid_entry_remove(struct mvpp2_port *port, u16 vid)
-{
-	struct mvpp2 *priv = port->priv;
-	int tid;
-
-	/* Scan TCAM and see if entry with this <vid,port> already exist */
-	tid = mvpp2_prs_vid_range_find(priv, (1 << port->id), vid, 0xfff);
-
-	/* No such entry */
-	if (tid < 0)
-		return;
-
-	mvpp2_prs_hw_inv(priv, tid);
-	priv->prs_shadow[tid].valid = false;
-}
-
-/* Remove all existing VID filters on this port */
-static void mvpp2_prs_vid_remove_all(struct mvpp2_port *port)
-{
-	struct mvpp2 *priv = port->priv;
-	int tid;
-
-	for (tid = MVPP2_PRS_VID_PORT_FIRST(port->id);
-	     tid <= MVPP2_PRS_VID_PORT_LAST(port->id); tid++) {
-		if (priv->prs_shadow[tid].valid)
-			mvpp2_prs_vid_entry_remove(port, tid);
-	}
-}
-
-/* Remove VID filering entry for this port */
-static void mvpp2_prs_vid_disable_filtering(struct mvpp2_port *port)
-{
-	unsigned int tid = MVPP2_PRS_VID_PORT_DFLT(port->id);
-	struct mvpp2 *priv = port->priv;
-
-	/* Invalidate the guard entry */
-	mvpp2_prs_hw_inv(priv, tid);
-
-	priv->prs_shadow[tid].valid = false;
-}
-
-/* Add guard entry that drops packets when no VID is matched on this port */
-static void mvpp2_prs_vid_enable_filtering(struct mvpp2_port *port)
-{
-	unsigned int tid = MVPP2_PRS_VID_PORT_DFLT(port->id);
-	struct mvpp2 *priv = port->priv;
-	unsigned int reg_val, shift;
-	struct mvpp2_prs_entry pe;
-
-	if (priv->prs_shadow[tid].valid)
-		return;
-
-	memset(&pe, 0, sizeof(pe));
-
-	pe.index = tid;
-
-	reg_val = mvpp2_read(priv, MVPP2_MH_REG(port->id));
-	if (reg_val & MVPP2_DSA_EXTENDED)
-		shift = MVPP2_VLAN_TAG_EDSA_LEN;
-	else
-		shift = MVPP2_VLAN_TAG_LEN;
-
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_VID);
-
-	/* Mask all ports */
-	mvpp2_prs_tcam_port_map_set(&pe, 0);
-
-	/* Update port mask */
-	mvpp2_prs_tcam_port_set(&pe, port->id, true);
-
-	/* Continue - set next lookup */
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_L2);
-
-	/* Skip VLAN header - Set offset to 4 or 8 bytes */
-	mvpp2_prs_sram_shift_set(&pe, shift, MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
-
-	/* Drop VLAN packets that don't belong to any VIDs on this port */
-	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_DROP_MASK,
-				 MVPP2_PRS_RI_DROP_MASK);
-
-	/* Clear all ai bits for next iteration */
-	mvpp2_prs_sram_ai_update(&pe, 0, MVPP2_PRS_SRAM_AI_MASK);
-
-	/* Update shadow table */
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_VID);
-	mvpp2_prs_hw_write(priv, &pe);
-}
-
-/* Parser default initialization */
-static int mvpp2_prs_default_init(struct platform_device *pdev,
-				  struct mvpp2 *priv)
-{
-	int err, index, i;
-
-	/* Enable tcam table */
-	mvpp2_write(priv, MVPP2_PRS_TCAM_CTRL_REG, MVPP2_PRS_TCAM_EN_MASK);
-
-	/* Clear all tcam and sram entries */
-	for (index = 0; index < MVPP2_PRS_TCAM_SRAM_SIZE; index++) {
-		mvpp2_write(priv, MVPP2_PRS_TCAM_IDX_REG, index);
-		for (i = 0; i < MVPP2_PRS_TCAM_WORDS; i++)
-			mvpp2_write(priv, MVPP2_PRS_TCAM_DATA_REG(i), 0);
-
-		mvpp2_write(priv, MVPP2_PRS_SRAM_IDX_REG, index);
-		for (i = 0; i < MVPP2_PRS_SRAM_WORDS; i++)
-			mvpp2_write(priv, MVPP2_PRS_SRAM_DATA_REG(i), 0);
-	}
-
-	/* Invalidate all tcam entries */
-	for (index = 0; index < MVPP2_PRS_TCAM_SRAM_SIZE; index++)
-		mvpp2_prs_hw_inv(priv, index);
-
-	priv->prs_shadow = devm_kcalloc(&pdev->dev, MVPP2_PRS_TCAM_SRAM_SIZE,
-					sizeof(*priv->prs_shadow),
-					GFP_KERNEL);
-	if (!priv->prs_shadow)
-		return -ENOMEM;
-
-	/* Always start from lookup = 0 */
-	for (index = 0; index < MVPP2_MAX_PORTS; index++)
-		mvpp2_prs_hw_port_init(priv, index, MVPP2_PRS_LU_MH,
-				       MVPP2_PRS_PORT_LU_MAX, 0);
-
-	mvpp2_prs_def_flow_init(priv);
-
-	mvpp2_prs_mh_init(priv);
-
-	mvpp2_prs_mac_init(priv);
-
-	mvpp2_prs_dsa_init(priv);
-
-	mvpp2_prs_vid_init(priv);
-
-	err = mvpp2_prs_etype_init(priv);
-	if (err)
-		return err;
-
-	err = mvpp2_prs_vlan_init(pdev, priv);
-	if (err)
-		return err;
-
-	err = mvpp2_prs_pppoe_init(priv);
-	if (err)
-		return err;
-
-	err = mvpp2_prs_ip6_init(priv);
-	if (err)
-		return err;
-
-	err = mvpp2_prs_ip4_init(priv);
-	if (err)
-		return err;
-
-	return 0;
-}
-
-/* Compare MAC DA with tcam entry data */
-static bool mvpp2_prs_mac_range_equals(struct mvpp2_prs_entry *pe,
-				       const u8 *da, unsigned char *mask)
-{
-	unsigned char tcam_byte, tcam_mask;
-	int index;
-
-	for (index = 0; index < ETH_ALEN; index++) {
-		mvpp2_prs_tcam_data_byte_get(pe, index, &tcam_byte, &tcam_mask);
-		if (tcam_mask != mask[index])
-			return false;
-
-		if ((tcam_mask & tcam_byte) != (da[index] & mask[index]))
-			return false;
-	}
-
-	return true;
-}
-
-/* Find tcam entry with matched pair <MAC DA, port> */
-static int
-mvpp2_prs_mac_da_range_find(struct mvpp2 *priv, int pmap, const u8 *da,
-			    unsigned char *mask, int udf_type)
-{
-	struct mvpp2_prs_entry pe;
-	int tid;
-
-	/* Go through the all entires with MVPP2_PRS_LU_MAC */
-	for (tid = MVPP2_PE_MAC_RANGE_START;
-	     tid <= MVPP2_PE_MAC_RANGE_END; tid++) {
-		unsigned int entry_pmap;
-
-		if (!priv->prs_shadow[tid].valid ||
-		    (priv->prs_shadow[tid].lu != MVPP2_PRS_LU_MAC) ||
-		    (priv->prs_shadow[tid].udf != udf_type))
-			continue;
-
-		mvpp2_prs_init_from_hw(priv, &pe, tid);
-		entry_pmap = mvpp2_prs_tcam_port_map_get(&pe);
-
-		if (mvpp2_prs_mac_range_equals(&pe, da, mask) &&
-		    entry_pmap == pmap)
-			return tid;
-	}
-
-	return -ENOENT;
-}
-
-/* Update parser's mac da entry */
-static int mvpp2_prs_mac_da_accept(struct mvpp2_port *port, const u8 *da,
-				   bool add)
-{
-	unsigned char mask[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
-	struct mvpp2 *priv = port->priv;
-	unsigned int pmap, len, ri;
-	struct mvpp2_prs_entry pe;
-	int tid;
-
-	memset(&pe, 0, sizeof(pe));
-
-	/* Scan TCAM and see if entry with this <MAC DA, port> already exist */
-	tid = mvpp2_prs_mac_da_range_find(priv, BIT(port->id), da, mask,
-					  MVPP2_PRS_UDF_MAC_DEF);
-
-	/* No such entry */
-	if (tid < 0) {
-		if (!add)
-			return 0;
-
-		/* Create new TCAM entry */
-		/* Go through the all entries from first to last */
-		tid = mvpp2_prs_tcam_first_free(priv,
-						MVPP2_PE_MAC_RANGE_START,
-						MVPP2_PE_MAC_RANGE_END);
-		if (tid < 0)
-			return tid;
-
-		pe.index = tid;
-
-		/* Mask all ports */
-		mvpp2_prs_tcam_port_map_set(&pe, 0);
-	} else {
-		mvpp2_prs_init_from_hw(priv, &pe, tid);
-	}
-
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_MAC);
-
-	/* Update port mask */
-	mvpp2_prs_tcam_port_set(&pe, port->id, add);
-
-	/* Invalidate the entry if no ports are left enabled */
-	pmap = mvpp2_prs_tcam_port_map_get(&pe);
-	if (pmap == 0) {
-		if (add)
-			return -EINVAL;
-
-		mvpp2_prs_hw_inv(priv, pe.index);
-		priv->prs_shadow[pe.index].valid = false;
-		return 0;
-	}
-
-	/* Continue - set next lookup */
-	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_DSA);
-
-	/* Set match on DA */
-	len = ETH_ALEN;
-	while (len--)
-		mvpp2_prs_tcam_data_byte_set(&pe, len, da[len], 0xff);
-
-	/* Set result info bits */
-	if (is_broadcast_ether_addr(da)) {
-		ri = MVPP2_PRS_RI_L2_BCAST;
-	} else if (is_multicast_ether_addr(da)) {
-		ri = MVPP2_PRS_RI_L2_MCAST;
-	} else {
-		ri = MVPP2_PRS_RI_L2_UCAST;
-
-		if (ether_addr_equal(da, port->dev->dev_addr))
-			ri |= MVPP2_PRS_RI_MAC_ME_MASK;
-	}
-
-	mvpp2_prs_sram_ri_update(&pe, ri, MVPP2_PRS_RI_L2_CAST_MASK |
-				 MVPP2_PRS_RI_MAC_ME_MASK);
-	mvpp2_prs_shadow_ri_set(priv, pe.index, ri, MVPP2_PRS_RI_L2_CAST_MASK |
-				MVPP2_PRS_RI_MAC_ME_MASK);
-
-	/* Shift to ethertype */
-	mvpp2_prs_sram_shift_set(&pe, 2 * ETH_ALEN,
-				 MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
-
-	/* Update shadow table and hw entry */
-	priv->prs_shadow[pe.index].udf = MVPP2_PRS_UDF_MAC_DEF;
-	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_MAC);
-	mvpp2_prs_hw_write(priv, &pe);
-
-	return 0;
-}
-
-static int mvpp2_prs_update_mac_da(struct net_device *dev, const u8 *da)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-	int err;
-
-	/* Remove old parser entry */
-	err = mvpp2_prs_mac_da_accept(port, dev->dev_addr, false);
-	if (err)
-		return err;
-
-	/* Add new parser entry */
-	err = mvpp2_prs_mac_da_accept(port, da, true);
-	if (err)
-		return err;
-
-	/* Set addr in the device */
-	ether_addr_copy(dev->dev_addr, da);
-
-	return 0;
-}
-
-static void mvpp2_prs_mac_del_all(struct mvpp2_port *port)
-{
-	struct mvpp2 *priv = port->priv;
-	struct mvpp2_prs_entry pe;
-	unsigned long pmap;
-	int index, tid;
-
-	for (tid = MVPP2_PE_MAC_RANGE_START;
-	     tid <= MVPP2_PE_MAC_RANGE_END; tid++) {
-		unsigned char da[ETH_ALEN], da_mask[ETH_ALEN];
-
-		if (!priv->prs_shadow[tid].valid ||
-		    (priv->prs_shadow[tid].lu != MVPP2_PRS_LU_MAC) ||
-		    (priv->prs_shadow[tid].udf != MVPP2_PRS_UDF_MAC_DEF))
-			continue;
-
-		mvpp2_prs_init_from_hw(priv, &pe, tid);
-
-		pmap = mvpp2_prs_tcam_port_map_get(&pe);
-
-		/* We only want entries active on this port */
-		if (!test_bit(port->id, &pmap))
-			continue;
-
-		/* Read mac addr from entry */
-		for (index = 0; index < ETH_ALEN; index++)
-			mvpp2_prs_tcam_data_byte_get(&pe, index, &da[index],
-						     &da_mask[index]);
-
-		/* Special cases : Don't remove broadcast and port's own
-		 * address
-		 */
-		if (is_broadcast_ether_addr(da) ||
-		    ether_addr_equal(da, port->dev->dev_addr))
-			continue;
-
-		/* Remove entry from TCAM */
-		mvpp2_prs_mac_da_accept(port, da, false);
-	}
-}
-
-static int mvpp2_prs_tag_mode_set(struct mvpp2 *priv, int port, int type)
-{
-	switch (type) {
-	case MVPP2_TAG_TYPE_EDSA:
-		/* Add port to EDSA entries */
-		mvpp2_prs_dsa_tag_set(priv, port, true,
-				      MVPP2_PRS_TAGGED, MVPP2_PRS_EDSA);
-		mvpp2_prs_dsa_tag_set(priv, port, true,
-				      MVPP2_PRS_UNTAGGED, MVPP2_PRS_EDSA);
-		/* Remove port from DSA entries */
-		mvpp2_prs_dsa_tag_set(priv, port, false,
-				      MVPP2_PRS_TAGGED, MVPP2_PRS_DSA);
-		mvpp2_prs_dsa_tag_set(priv, port, false,
-				      MVPP2_PRS_UNTAGGED, MVPP2_PRS_DSA);
-		break;
-
-	case MVPP2_TAG_TYPE_DSA:
-		/* Add port to DSA entries */
-		mvpp2_prs_dsa_tag_set(priv, port, true,
-				      MVPP2_PRS_TAGGED, MVPP2_PRS_DSA);
-		mvpp2_prs_dsa_tag_set(priv, port, true,
-				      MVPP2_PRS_UNTAGGED, MVPP2_PRS_DSA);
-		/* Remove port from EDSA entries */
-		mvpp2_prs_dsa_tag_set(priv, port, false,
-				      MVPP2_PRS_TAGGED, MVPP2_PRS_EDSA);
-		mvpp2_prs_dsa_tag_set(priv, port, false,
-				      MVPP2_PRS_UNTAGGED, MVPP2_PRS_EDSA);
-		break;
-
-	case MVPP2_TAG_TYPE_MH:
-	case MVPP2_TAG_TYPE_NONE:
-		/* Remove port form EDSA and DSA entries */
-		mvpp2_prs_dsa_tag_set(priv, port, false,
-				      MVPP2_PRS_TAGGED, MVPP2_PRS_DSA);
-		mvpp2_prs_dsa_tag_set(priv, port, false,
-				      MVPP2_PRS_UNTAGGED, MVPP2_PRS_DSA);
-		mvpp2_prs_dsa_tag_set(priv, port, false,
-				      MVPP2_PRS_TAGGED, MVPP2_PRS_EDSA);
-		mvpp2_prs_dsa_tag_set(priv, port, false,
-				      MVPP2_PRS_UNTAGGED, MVPP2_PRS_EDSA);
-		break;
-
-	default:
-		if ((type < 0) || (type > MVPP2_TAG_TYPE_EDSA))
-			return -EINVAL;
-	}
-
-	return 0;
-}
-
-/* Set prs flow for the port */
-static int mvpp2_prs_def_flow(struct mvpp2_port *port)
-{
-	struct mvpp2_prs_entry pe;
-	int tid;
-
-	memset(&pe, 0, sizeof(pe));
-
-	tid = mvpp2_prs_flow_find(port->priv, port->id);
-
-	/* Such entry not exist */
-	if (tid < 0) {
-		/* Go through the all entires from last to first */
-		tid = mvpp2_prs_tcam_first_free(port->priv,
-						MVPP2_PE_LAST_FREE_TID,
-					       MVPP2_PE_FIRST_FREE_TID);
-		if (tid < 0)
-			return tid;
-
-		pe.index = tid;
-
-		/* Set flow ID*/
-		mvpp2_prs_sram_ai_update(&pe, port->id, MVPP2_PRS_FLOW_ID_MASK);
-		mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_DONE_BIT, 1);
-
-		/* Update shadow table */
-		mvpp2_prs_shadow_set(port->priv, pe.index, MVPP2_PRS_LU_FLOWS);
-	} else {
-		mvpp2_prs_init_from_hw(port->priv, &pe, tid);
-	}
-
-	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
-	mvpp2_prs_tcam_port_map_set(&pe, (1 << port->id));
-	mvpp2_prs_hw_write(port->priv, &pe);
-
-	return 0;
-}
-
-/* Classifier configuration routines */
-
-/* Update classification flow table registers */
-static void mvpp2_cls_flow_write(struct mvpp2 *priv,
-				 struct mvpp2_cls_flow_entry *fe)
-{
-	mvpp2_write(priv, MVPP2_CLS_FLOW_INDEX_REG, fe->index);
-	mvpp2_write(priv, MVPP2_CLS_FLOW_TBL0_REG,  fe->data[0]);
-	mvpp2_write(priv, MVPP2_CLS_FLOW_TBL1_REG,  fe->data[1]);
-	mvpp2_write(priv, MVPP2_CLS_FLOW_TBL2_REG,  fe->data[2]);
-}
-
-/* Update classification lookup table register */
-static void mvpp2_cls_lookup_write(struct mvpp2 *priv,
-				   struct mvpp2_cls_lookup_entry *le)
-{
-	u32 val;
-
-	val = (le->way << MVPP2_CLS_LKP_INDEX_WAY_OFFS) | le->lkpid;
-	mvpp2_write(priv, MVPP2_CLS_LKP_INDEX_REG, val);
-	mvpp2_write(priv, MVPP2_CLS_LKP_TBL_REG, le->data);
-}
-
-/* Classifier default initialization */
-static void mvpp2_cls_init(struct mvpp2 *priv)
-{
-	struct mvpp2_cls_lookup_entry le;
-	struct mvpp2_cls_flow_entry fe;
-	int index;
-
-	/* Enable classifier */
-	mvpp2_write(priv, MVPP2_CLS_MODE_REG, MVPP2_CLS_MODE_ACTIVE_MASK);
-
-	/* Clear classifier flow table */
-	memset(&fe.data, 0, sizeof(fe.data));
-	for (index = 0; index < MVPP2_CLS_FLOWS_TBL_SIZE; index++) {
-		fe.index = index;
-		mvpp2_cls_flow_write(priv, &fe);
-	}
-
-	/* Clear classifier lookup table */
-	le.data = 0;
-	for (index = 0; index < MVPP2_CLS_LKP_TBL_SIZE; index++) {
-		le.lkpid = index;
-		le.way = 0;
-		mvpp2_cls_lookup_write(priv, &le);
-
-		le.way = 1;
-		mvpp2_cls_lookup_write(priv, &le);
-	}
-}
-
-static void mvpp2_cls_port_config(struct mvpp2_port *port)
-{
-	struct mvpp2_cls_lookup_entry le;
-	u32 val;
-
-	/* Set way for the port */
-	val = mvpp2_read(port->priv, MVPP2_CLS_PORT_WAY_REG);
-	val &= ~MVPP2_CLS_PORT_WAY_MASK(port->id);
-	mvpp2_write(port->priv, MVPP2_CLS_PORT_WAY_REG, val);
-
-	/* Pick the entry to be accessed in lookup ID decoding table
-	 * according to the way and lkpid.
-	 */
-	le.lkpid = port->id;
-	le.way = 0;
-	le.data = 0;
-
-	/* Set initial CPU queue for receiving packets */
-	le.data &= ~MVPP2_CLS_LKP_TBL_RXQ_MASK;
-	le.data |= port->first_rxq;
-
-	/* Disable classification engines */
-	le.data &= ~MVPP2_CLS_LKP_TBL_LOOKUP_EN_MASK;
-
-	/* Update lookup ID table entry */
-	mvpp2_cls_lookup_write(port->priv, &le);
-}
-
-/* Set CPU queue number for oversize packets */
-static void mvpp2_cls_oversize_rxq_set(struct mvpp2_port *port)
-{
-	u32 val;
-
-	mvpp2_write(port->priv, MVPP2_CLS_OVERSIZE_RXQ_LOW_REG(port->id),
-		    port->first_rxq & MVPP2_CLS_OVERSIZE_RXQ_LOW_MASK);
-
-	mvpp2_write(port->priv, MVPP2_CLS_SWFWD_P2HQ_REG(port->id),
-		    (port->first_rxq >> MVPP2_CLS_OVERSIZE_RXQ_LOW_BITS));
-
-	val = mvpp2_read(port->priv, MVPP2_CLS_SWFWD_PCTRL_REG);
-	val |= MVPP2_CLS_SWFWD_PCTRL_MASK(port->id);
-	mvpp2_write(port->priv, MVPP2_CLS_SWFWD_PCTRL_REG, val);
-}
-
-static void *mvpp2_frag_alloc(const struct mvpp2_bm_pool *pool)
-{
-	if (likely(pool->frag_size <= PAGE_SIZE))
-		return netdev_alloc_frag(pool->frag_size);
-	else
-		return kmalloc(pool->frag_size, GFP_ATOMIC);
-}
-
-static void mvpp2_frag_free(const struct mvpp2_bm_pool *pool, void *data)
-{
-	if (likely(pool->frag_size <= PAGE_SIZE))
-		skb_free_frag(data);
-	else
-		kfree(data);
-}
-
-/* Buffer Manager configuration routines */
-
-/* Create pool */
-static int mvpp2_bm_pool_create(struct platform_device *pdev,
-				struct mvpp2 *priv,
-				struct mvpp2_bm_pool *bm_pool, int size)
-{
-	u32 val;
-
-	/* Number of buffer pointers must be a multiple of 16, as per
-	 * hardware constraints
-	 */
-	if (!IS_ALIGNED(size, 16))
-		return -EINVAL;
-
-	/* PPv2.1 needs 8 bytes per buffer pointer, PPv2.2 needs 16
-	 * bytes per buffer pointer
-	 */
-	if (priv->hw_version == MVPP21)
-		bm_pool->size_bytes = 2 * sizeof(u32) * size;
-	else
-		bm_pool->size_bytes = 2 * sizeof(u64) * size;
-
-	bm_pool->virt_addr = dma_alloc_coherent(&pdev->dev, bm_pool->size_bytes,
-						&bm_pool->dma_addr,
-						GFP_KERNEL);
-	if (!bm_pool->virt_addr)
-		return -ENOMEM;
-
-	if (!IS_ALIGNED((unsigned long)bm_pool->virt_addr,
-			MVPP2_BM_POOL_PTR_ALIGN)) {
-		dma_free_coherent(&pdev->dev, bm_pool->size_bytes,
-				  bm_pool->virt_addr, bm_pool->dma_addr);
-		dev_err(&pdev->dev, "BM pool %d is not %d bytes aligned\n",
-			bm_pool->id, MVPP2_BM_POOL_PTR_ALIGN);
-		return -ENOMEM;
-	}
-
-	mvpp2_write(priv, MVPP2_BM_POOL_BASE_REG(bm_pool->id),
-		    lower_32_bits(bm_pool->dma_addr));
-	mvpp2_write(priv, MVPP2_BM_POOL_SIZE_REG(bm_pool->id), size);
-
-	val = mvpp2_read(priv, MVPP2_BM_POOL_CTRL_REG(bm_pool->id));
-	val |= MVPP2_BM_START_MASK;
-	mvpp2_write(priv, MVPP2_BM_POOL_CTRL_REG(bm_pool->id), val);
-
-	bm_pool->size = size;
-	bm_pool->pkt_size = 0;
-	bm_pool->buf_num = 0;
-
-	return 0;
-}
-
-/* Set pool buffer size */
-static void mvpp2_bm_pool_bufsize_set(struct mvpp2 *priv,
-				      struct mvpp2_bm_pool *bm_pool,
-				      int buf_size)
-{
-	u32 val;
-
-	bm_pool->buf_size = buf_size;
-
-	val = ALIGN(buf_size, 1 << MVPP2_POOL_BUF_SIZE_OFFSET);
-	mvpp2_write(priv, MVPP2_POOL_BUF_SIZE_REG(bm_pool->id), val);
-}
-
-static void mvpp2_bm_bufs_get_addrs(struct device *dev, struct mvpp2 *priv,
-				    struct mvpp2_bm_pool *bm_pool,
-				    dma_addr_t *dma_addr,
-				    phys_addr_t *phys_addr)
-{
-	int cpu = get_cpu();
-
-	*dma_addr = mvpp2_percpu_read(priv, cpu,
-				      MVPP2_BM_PHY_ALLOC_REG(bm_pool->id));
-	*phys_addr = mvpp2_percpu_read(priv, cpu, MVPP2_BM_VIRT_ALLOC_REG);
-
-	if (priv->hw_version == MVPP22) {
-		u32 val;
-		u32 dma_addr_highbits, phys_addr_highbits;
-
-		val = mvpp2_percpu_read(priv, cpu, MVPP22_BM_ADDR_HIGH_ALLOC);
-		dma_addr_highbits = (val & MVPP22_BM_ADDR_HIGH_PHYS_MASK);
-		phys_addr_highbits = (val & MVPP22_BM_ADDR_HIGH_VIRT_MASK) >>
-			MVPP22_BM_ADDR_HIGH_VIRT_SHIFT;
-
-		if (sizeof(dma_addr_t) == 8)
-			*dma_addr |= (u64)dma_addr_highbits << 32;
-
-		if (sizeof(phys_addr_t) == 8)
-			*phys_addr |= (u64)phys_addr_highbits << 32;
-	}
-
-	put_cpu();
-}
-
-/* Free all buffers from the pool */
-static void mvpp2_bm_bufs_free(struct device *dev, struct mvpp2 *priv,
-			       struct mvpp2_bm_pool *bm_pool, int buf_num)
-{
-	int i;
-
-	if (buf_num > bm_pool->buf_num) {
-		WARN(1, "Pool does not have so many bufs pool(%d) bufs(%d)\n",
-		     bm_pool->id, buf_num);
-		buf_num = bm_pool->buf_num;
-	}
-
-	for (i = 0; i < buf_num; i++) {
-		dma_addr_t buf_dma_addr;
-		phys_addr_t buf_phys_addr;
-		void *data;
-
-		mvpp2_bm_bufs_get_addrs(dev, priv, bm_pool,
-					&buf_dma_addr, &buf_phys_addr);
-
-		dma_unmap_single(dev, buf_dma_addr,
-				 bm_pool->buf_size, DMA_FROM_DEVICE);
-
-		data = (void *)phys_to_virt(buf_phys_addr);
-		if (!data)
-			break;
-
-		mvpp2_frag_free(bm_pool, data);
-	}
-
-	/* Update BM driver with number of buffers removed from pool */
-	bm_pool->buf_num -= i;
-}
-
-/* Check number of buffers in BM pool */
-static int mvpp2_check_hw_buf_num(struct mvpp2 *priv, struct mvpp2_bm_pool *bm_pool)
-{
-	int buf_num = 0;
-
-	buf_num += mvpp2_read(priv, MVPP2_BM_POOL_PTRS_NUM_REG(bm_pool->id)) &
-				    MVPP22_BM_POOL_PTRS_NUM_MASK;
-	buf_num += mvpp2_read(priv, MVPP2_BM_BPPI_PTRS_NUM_REG(bm_pool->id)) &
-				    MVPP2_BM_BPPI_PTR_NUM_MASK;
-
-	/* HW has one buffer ready which is not reflected in the counters */
-	if (buf_num)
-		buf_num += 1;
-
-	return buf_num;
-}
-
-/* Cleanup pool */
-static int mvpp2_bm_pool_destroy(struct platform_device *pdev,
-				 struct mvpp2 *priv,
-				 struct mvpp2_bm_pool *bm_pool)
-{
-	int buf_num;
-	u32 val;
-
-	buf_num = mvpp2_check_hw_buf_num(priv, bm_pool);
-	mvpp2_bm_bufs_free(&pdev->dev, priv, bm_pool, buf_num);
-
-	/* Check buffer counters after free */
-	buf_num = mvpp2_check_hw_buf_num(priv, bm_pool);
-	if (buf_num) {
-		WARN(1, "cannot free all buffers in pool %d, buf_num left %d\n",
-		     bm_pool->id, bm_pool->buf_num);
-		return 0;
-	}
-
-	val = mvpp2_read(priv, MVPP2_BM_POOL_CTRL_REG(bm_pool->id));
-	val |= MVPP2_BM_STOP_MASK;
-	mvpp2_write(priv, MVPP2_BM_POOL_CTRL_REG(bm_pool->id), val);
-
-	dma_free_coherent(&pdev->dev, bm_pool->size_bytes,
-			  bm_pool->virt_addr,
-			  bm_pool->dma_addr);
-	return 0;
-}
-
-static int mvpp2_bm_pools_init(struct platform_device *pdev,
-			       struct mvpp2 *priv)
-{
-	int i, err, size;
-	struct mvpp2_bm_pool *bm_pool;
-
-	/* Create all pools with maximum size */
-	size = MVPP2_BM_POOL_SIZE_MAX;
-	for (i = 0; i < MVPP2_BM_POOLS_NUM; i++) {
-		bm_pool = &priv->bm_pools[i];
-		bm_pool->id = i;
-		err = mvpp2_bm_pool_create(pdev, priv, bm_pool, size);
-		if (err)
-			goto err_unroll_pools;
-		mvpp2_bm_pool_bufsize_set(priv, bm_pool, 0);
-	}
-	return 0;
-
-err_unroll_pools:
-	dev_err(&pdev->dev, "failed to create BM pool %d, size %d\n", i, size);
-	for (i = i - 1; i >= 0; i--)
-		mvpp2_bm_pool_destroy(pdev, priv, &priv->bm_pools[i]);
-	return err;
-}
-
-static int mvpp2_bm_init(struct platform_device *pdev, struct mvpp2 *priv)
-{
-	int i, err;
-
-	for (i = 0; i < MVPP2_BM_POOLS_NUM; i++) {
-		/* Mask BM all interrupts */
-		mvpp2_write(priv, MVPP2_BM_INTR_MASK_REG(i), 0);
-		/* Clear BM cause register */
-		mvpp2_write(priv, MVPP2_BM_INTR_CAUSE_REG(i), 0);
-	}
-
-	/* Allocate and initialize BM pools */
-	priv->bm_pools = devm_kcalloc(&pdev->dev, MVPP2_BM_POOLS_NUM,
-				      sizeof(*priv->bm_pools), GFP_KERNEL);
-	if (!priv->bm_pools)
-		return -ENOMEM;
-
-	err = mvpp2_bm_pools_init(pdev, priv);
-	if (err < 0)
-		return err;
-	return 0;
-}
-
-static void mvpp2_setup_bm_pool(void)
-{
-	/* Short pool */
-	mvpp2_pools[MVPP2_BM_SHORT].buf_num  = MVPP2_BM_SHORT_BUF_NUM;
-	mvpp2_pools[MVPP2_BM_SHORT].pkt_size = MVPP2_BM_SHORT_PKT_SIZE;
-
-	/* Long pool */
-	mvpp2_pools[MVPP2_BM_LONG].buf_num  = MVPP2_BM_LONG_BUF_NUM;
-	mvpp2_pools[MVPP2_BM_LONG].pkt_size = MVPP2_BM_LONG_PKT_SIZE;
-
-	/* Jumbo pool */
-	mvpp2_pools[MVPP2_BM_JUMBO].buf_num  = MVPP2_BM_JUMBO_BUF_NUM;
-	mvpp2_pools[MVPP2_BM_JUMBO].pkt_size = MVPP2_BM_JUMBO_PKT_SIZE;
-}
-
-/* Attach long pool to rxq */
-static void mvpp2_rxq_long_pool_set(struct mvpp2_port *port,
-				    int lrxq, int long_pool)
-{
-	u32 val, mask;
-	int prxq;
-
-	/* Get queue physical ID */
-	prxq = port->rxqs[lrxq]->id;
-
-	if (port->priv->hw_version == MVPP21)
-		mask = MVPP21_RXQ_POOL_LONG_MASK;
-	else
-		mask = MVPP22_RXQ_POOL_LONG_MASK;
-
-	val = mvpp2_read(port->priv, MVPP2_RXQ_CONFIG_REG(prxq));
-	val &= ~mask;
-	val |= (long_pool << MVPP2_RXQ_POOL_LONG_OFFS) & mask;
-	mvpp2_write(port->priv, MVPP2_RXQ_CONFIG_REG(prxq), val);
-}
-
-/* Attach short pool to rxq */
-static void mvpp2_rxq_short_pool_set(struct mvpp2_port *port,
-				     int lrxq, int short_pool)
-{
-	u32 val, mask;
-	int prxq;
-
-	/* Get queue physical ID */
-	prxq = port->rxqs[lrxq]->id;
-
-	if (port->priv->hw_version == MVPP21)
-		mask = MVPP21_RXQ_POOL_SHORT_MASK;
-	else
-		mask = MVPP22_RXQ_POOL_SHORT_MASK;
-
-	val = mvpp2_read(port->priv, MVPP2_RXQ_CONFIG_REG(prxq));
-	val &= ~mask;
-	val |= (short_pool << MVPP2_RXQ_POOL_SHORT_OFFS) & mask;
-	mvpp2_write(port->priv, MVPP2_RXQ_CONFIG_REG(prxq), val);
-}
-
-static void *mvpp2_buf_alloc(struct mvpp2_port *port,
-			     struct mvpp2_bm_pool *bm_pool,
-			     dma_addr_t *buf_dma_addr,
-			     phys_addr_t *buf_phys_addr,
-			     gfp_t gfp_mask)
-{
-	dma_addr_t dma_addr;
-	void *data;
-
-	data = mvpp2_frag_alloc(bm_pool);
-	if (!data)
-		return NULL;
-
-	dma_addr = dma_map_single(port->dev->dev.parent, data,
-				  MVPP2_RX_BUF_SIZE(bm_pool->pkt_size),
-				  DMA_FROM_DEVICE);
-	if (unlikely(dma_mapping_error(port->dev->dev.parent, dma_addr))) {
-		mvpp2_frag_free(bm_pool, data);
-		return NULL;
-	}
-	*buf_dma_addr = dma_addr;
-	*buf_phys_addr = virt_to_phys(data);
-
-	return data;
-}
-
-/* Release buffer to BM */
-static inline void mvpp2_bm_pool_put(struct mvpp2_port *port, int pool,
-				     dma_addr_t buf_dma_addr,
-				     phys_addr_t buf_phys_addr)
-{
-	int cpu = get_cpu();
-
-	if (port->priv->hw_version == MVPP22) {
-		u32 val = 0;
-
-		if (sizeof(dma_addr_t) == 8)
-			val |= upper_32_bits(buf_dma_addr) &
-				MVPP22_BM_ADDR_HIGH_PHYS_RLS_MASK;
-
-		if (sizeof(phys_addr_t) == 8)
-			val |= (upper_32_bits(buf_phys_addr)
-				<< MVPP22_BM_ADDR_HIGH_VIRT_RLS_SHIFT) &
-				MVPP22_BM_ADDR_HIGH_VIRT_RLS_MASK;
-
-		mvpp2_percpu_write_relaxed(port->priv, cpu,
-					   MVPP22_BM_ADDR_HIGH_RLS_REG, val);
-	}
-
-	/* MVPP2_BM_VIRT_RLS_REG is not interpreted by HW, and simply
-	 * returned in the "cookie" field of the RX
-	 * descriptor. Instead of storing the virtual address, we
-	 * store the physical address
-	 */
-	mvpp2_percpu_write_relaxed(port->priv, cpu,
-				   MVPP2_BM_VIRT_RLS_REG, buf_phys_addr);
-	mvpp2_percpu_write_relaxed(port->priv, cpu,
-				   MVPP2_BM_PHY_RLS_REG(pool), buf_dma_addr);
-
-	put_cpu();
-}
-
-/* Allocate buffers for the pool */
-static int mvpp2_bm_bufs_add(struct mvpp2_port *port,
-			     struct mvpp2_bm_pool *bm_pool, int buf_num)
-{
-	int i, buf_size, total_size;
-	dma_addr_t dma_addr;
-	phys_addr_t phys_addr;
-	void *buf;
-
-	buf_size = MVPP2_RX_BUF_SIZE(bm_pool->pkt_size);
-	total_size = MVPP2_RX_TOTAL_SIZE(buf_size);
-
-	if (buf_num < 0 ||
-	    (buf_num + bm_pool->buf_num > bm_pool->size)) {
-		netdev_err(port->dev,
-			   "cannot allocate %d buffers for pool %d\n",
-			   buf_num, bm_pool->id);
-		return 0;
-	}
-
-	for (i = 0; i < buf_num; i++) {
-		buf = mvpp2_buf_alloc(port, bm_pool, &dma_addr,
-				      &phys_addr, GFP_KERNEL);
-		if (!buf)
-			break;
-
-		mvpp2_bm_pool_put(port, bm_pool->id, dma_addr,
-				  phys_addr);
-	}
-
-	/* Update BM driver with number of buffers added to pool */
-	bm_pool->buf_num += i;
-
-	netdev_dbg(port->dev,
-		   "pool %d: pkt_size=%4d, buf_size=%4d, total_size=%4d\n",
-		   bm_pool->id, bm_pool->pkt_size, buf_size, total_size);
-
-	netdev_dbg(port->dev,
-		   "pool %d: %d of %d buffers added\n",
-		   bm_pool->id, i, buf_num);
-	return i;
-}
-
-/* Notify the driver that BM pool is being used as specific type and return the
- * pool pointer on success
- */
-static struct mvpp2_bm_pool *
-mvpp2_bm_pool_use(struct mvpp2_port *port, unsigned pool, int pkt_size)
-{
-	struct mvpp2_bm_pool *new_pool = &port->priv->bm_pools[pool];
-	int num;
-
-	if (pool >= MVPP2_BM_POOLS_NUM) {
-		netdev_err(port->dev, "Invalid pool %d\n", pool);
-		return NULL;
-	}
-
-	/* Allocate buffers in case BM pool is used as long pool, but packet
-	 * size doesn't match MTU or BM pool hasn't being used yet
-	 */
-	if (new_pool->pkt_size == 0) {
-		int pkts_num;
-
-		/* Set default buffer number or free all the buffers in case
-		 * the pool is not empty
-		 */
-		pkts_num = new_pool->buf_num;
-		if (pkts_num == 0)
-			pkts_num = mvpp2_pools[pool].buf_num;
-		else
-			mvpp2_bm_bufs_free(port->dev->dev.parent,
-					   port->priv, new_pool, pkts_num);
-
-		new_pool->pkt_size = pkt_size;
-		new_pool->frag_size =
-			SKB_DATA_ALIGN(MVPP2_RX_BUF_SIZE(pkt_size)) +
-			MVPP2_SKB_SHINFO_SIZE;
-
-		/* Allocate buffers for this pool */
-		num = mvpp2_bm_bufs_add(port, new_pool, pkts_num);
-		if (num != pkts_num) {
-			WARN(1, "pool %d: %d of %d allocated\n",
-			     new_pool->id, num, pkts_num);
-			return NULL;
-		}
-	}
-
-	mvpp2_bm_pool_bufsize_set(port->priv, new_pool,
-				  MVPP2_RX_BUF_SIZE(new_pool->pkt_size));
-
-	return new_pool;
-}
-
-/* Initialize pools for swf */
-static int mvpp2_swf_bm_pool_init(struct mvpp2_port *port)
-{
-	int rxq;
-	enum mvpp2_bm_pool_log_num long_log_pool, short_log_pool;
-
-	/* If port pkt_size is higher than 1518B:
-	 * HW Long pool - SW Jumbo pool, HW Short pool - SW Long pool
-	 * else: HW Long pool - SW Long pool, HW Short pool - SW Short pool
-	 */
-	if (port->pkt_size > MVPP2_BM_LONG_PKT_SIZE) {
-		long_log_pool = MVPP2_BM_JUMBO;
-		short_log_pool = MVPP2_BM_LONG;
-	} else {
-		long_log_pool = MVPP2_BM_LONG;
-		short_log_pool = MVPP2_BM_SHORT;
-	}
-
-	if (!port->pool_long) {
-		port->pool_long =
-			mvpp2_bm_pool_use(port, long_log_pool,
-					  mvpp2_pools[long_log_pool].pkt_size);
-		if (!port->pool_long)
-			return -ENOMEM;
-
-		port->pool_long->port_map |= BIT(port->id);
-
-		for (rxq = 0; rxq < port->nrxqs; rxq++)
-			mvpp2_rxq_long_pool_set(port, rxq, port->pool_long->id);
-	}
-
-	if (!port->pool_short) {
-		port->pool_short =
-			mvpp2_bm_pool_use(port, short_log_pool,
-					  mvpp2_pools[short_log_pool].pkt_size);
-		if (!port->pool_short)
-			return -ENOMEM;
-
-		port->pool_short->port_map |= BIT(port->id);
-
-		for (rxq = 0; rxq < port->nrxqs; rxq++)
-			mvpp2_rxq_short_pool_set(port, rxq,
-						 port->pool_short->id);
-	}
-
-	return 0;
-}
-
-static int mvpp2_bm_update_mtu(struct net_device *dev, int mtu)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-	enum mvpp2_bm_pool_log_num new_long_pool;
-	int pkt_size = MVPP2_RX_PKT_SIZE(mtu);
-
-	/* If port MTU is higher than 1518B:
-	 * HW Long pool - SW Jumbo pool, HW Short pool - SW Long pool
-	 * else: HW Long pool - SW Long pool, HW Short pool - SW Short pool
-	 */
-	if (pkt_size > MVPP2_BM_LONG_PKT_SIZE)
-		new_long_pool = MVPP2_BM_JUMBO;
-	else
-		new_long_pool = MVPP2_BM_LONG;
-
-	if (new_long_pool != port->pool_long->id) {
-		/* Remove port from old short & long pool */
-		port->pool_long = mvpp2_bm_pool_use(port, port->pool_long->id,
-						    port->pool_long->pkt_size);
-		port->pool_long->port_map &= ~BIT(port->id);
-		port->pool_long = NULL;
-
-		port->pool_short = mvpp2_bm_pool_use(port, port->pool_short->id,
-						     port->pool_short->pkt_size);
-		port->pool_short->port_map &= ~BIT(port->id);
-		port->pool_short = NULL;
-
-		port->pkt_size =  pkt_size;
-
-		/* Add port to new short & long pool */
-		mvpp2_swf_bm_pool_init(port);
-
-		/* Update L4 checksum when jumbo enable/disable on port */
-		if (new_long_pool == MVPP2_BM_JUMBO && port->id != 0) {
-			dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
-			dev->hw_features &= ~(NETIF_F_IP_CSUM |
-					      NETIF_F_IPV6_CSUM);
-		} else {
-			dev->features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
-			dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
-		}
-	}
-
-	dev->mtu = mtu;
-	dev->wanted_features = dev->features;
-
-	netdev_update_features(dev);
-	return 0;
-}
-
-static inline void mvpp2_interrupts_enable(struct mvpp2_port *port)
-{
-	int i, sw_thread_mask = 0;
-
-	for (i = 0; i < port->nqvecs; i++)
-		sw_thread_mask |= port->qvecs[i].sw_thread_mask;
-
-	mvpp2_write(port->priv, MVPP2_ISR_ENABLE_REG(port->id),
-		    MVPP2_ISR_ENABLE_INTERRUPT(sw_thread_mask));
-}
-
-static inline void mvpp2_interrupts_disable(struct mvpp2_port *port)
-{
-	int i, sw_thread_mask = 0;
-
-	for (i = 0; i < port->nqvecs; i++)
-		sw_thread_mask |= port->qvecs[i].sw_thread_mask;
-
-	mvpp2_write(port->priv, MVPP2_ISR_ENABLE_REG(port->id),
-		    MVPP2_ISR_DISABLE_INTERRUPT(sw_thread_mask));
-}
-
-static inline void mvpp2_qvec_interrupt_enable(struct mvpp2_queue_vector *qvec)
-{
-	struct mvpp2_port *port = qvec->port;
-
-	mvpp2_write(port->priv, MVPP2_ISR_ENABLE_REG(port->id),
-		    MVPP2_ISR_ENABLE_INTERRUPT(qvec->sw_thread_mask));
-}
-
-static inline void mvpp2_qvec_interrupt_disable(struct mvpp2_queue_vector *qvec)
-{
-	struct mvpp2_port *port = qvec->port;
-
-	mvpp2_write(port->priv, MVPP2_ISR_ENABLE_REG(port->id),
-		    MVPP2_ISR_DISABLE_INTERRUPT(qvec->sw_thread_mask));
-}
-
-/* Mask the current CPU's Rx/Tx interrupts
- * Called by on_each_cpu(), guaranteed to run with migration disabled,
- * using smp_processor_id() is OK.
- */
-static void mvpp2_interrupts_mask(void *arg)
-{
-	struct mvpp2_port *port = arg;
-
-	mvpp2_percpu_write(port->priv, smp_processor_id(),
-			   MVPP2_ISR_RX_TX_MASK_REG(port->id), 0);
-}
-
-/* Unmask the current CPU's Rx/Tx interrupts.
- * Called by on_each_cpu(), guaranteed to run with migration disabled,
- * using smp_processor_id() is OK.
- */
-static void mvpp2_interrupts_unmask(void *arg)
-{
-	struct mvpp2_port *port = arg;
-	u32 val;
-
-	val = MVPP2_CAUSE_MISC_SUM_MASK |
-		MVPP2_CAUSE_RXQ_OCCUP_DESC_ALL_MASK;
-	if (port->has_tx_irqs)
-		val |= MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_MASK;
-
-	mvpp2_percpu_write(port->priv, smp_processor_id(),
-			   MVPP2_ISR_RX_TX_MASK_REG(port->id), val);
-}
-
-static void
-mvpp2_shared_interrupt_mask_unmask(struct mvpp2_port *port, bool mask)
-{
-	u32 val;
-	int i;
-
-	if (port->priv->hw_version != MVPP22)
-		return;
-
-	if (mask)
-		val = 0;
-	else
-		val = MVPP2_CAUSE_RXQ_OCCUP_DESC_ALL_MASK;
-
-	for (i = 0; i < port->nqvecs; i++) {
-		struct mvpp2_queue_vector *v = port->qvecs + i;
-
-		if (v->type != MVPP2_QUEUE_VECTOR_SHARED)
-			continue;
-
-		mvpp2_percpu_write(port->priv, v->sw_thread_id,
-				   MVPP2_ISR_RX_TX_MASK_REG(port->id), val);
-	}
-}
-
-/* Port configuration routines */
-
-static void mvpp22_gop_init_rgmii(struct mvpp2_port *port)
-{
-	struct mvpp2 *priv = port->priv;
-	u32 val;
-
-	regmap_read(priv->sysctrl_base, GENCONF_PORT_CTRL0, &val);
-	val |= GENCONF_PORT_CTRL0_BUS_WIDTH_SELECT;
-	regmap_write(priv->sysctrl_base, GENCONF_PORT_CTRL0, val);
-
-	regmap_read(priv->sysctrl_base, GENCONF_CTRL0, &val);
-	if (port->gop_id == 2)
-		val |= GENCONF_CTRL0_PORT0_RGMII | GENCONF_CTRL0_PORT1_RGMII;
-	else if (port->gop_id == 3)
-		val |= GENCONF_CTRL0_PORT1_RGMII_MII;
-	regmap_write(priv->sysctrl_base, GENCONF_CTRL0, val);
-}
-
-static void mvpp22_gop_init_sgmii(struct mvpp2_port *port)
-{
-	struct mvpp2 *priv = port->priv;
-	u32 val;
-
-	regmap_read(priv->sysctrl_base, GENCONF_PORT_CTRL0, &val);
-	val |= GENCONF_PORT_CTRL0_BUS_WIDTH_SELECT |
-	       GENCONF_PORT_CTRL0_RX_DATA_SAMPLE;
-	regmap_write(priv->sysctrl_base, GENCONF_PORT_CTRL0, val);
-
-	if (port->gop_id > 1) {
-		regmap_read(priv->sysctrl_base, GENCONF_CTRL0, &val);
-		if (port->gop_id == 2)
-			val &= ~GENCONF_CTRL0_PORT0_RGMII;
-		else if (port->gop_id == 3)
-			val &= ~GENCONF_CTRL0_PORT1_RGMII_MII;
-		regmap_write(priv->sysctrl_base, GENCONF_CTRL0, val);
-	}
-}
-
-static void mvpp22_gop_init_10gkr(struct mvpp2_port *port)
-{
-	struct mvpp2 *priv = port->priv;
-	void __iomem *mpcs = priv->iface_base + MVPP22_MPCS_BASE(port->gop_id);
-	void __iomem *xpcs = priv->iface_base + MVPP22_XPCS_BASE(port->gop_id);
-	u32 val;
-
-	/* XPCS */
-	val = readl(xpcs + MVPP22_XPCS_CFG0);
-	val &= ~(MVPP22_XPCS_CFG0_PCS_MODE(0x3) |
-		 MVPP22_XPCS_CFG0_ACTIVE_LANE(0x3));
-	val |= MVPP22_XPCS_CFG0_ACTIVE_LANE(2);
-	writel(val, xpcs + MVPP22_XPCS_CFG0);
-
-	/* MPCS */
-	val = readl(mpcs + MVPP22_MPCS_CTRL);
-	val &= ~MVPP22_MPCS_CTRL_FWD_ERR_CONN;
-	writel(val, mpcs + MVPP22_MPCS_CTRL);
-
-	val = readl(mpcs + MVPP22_MPCS_CLK_RESET);
-	val &= ~(MVPP22_MPCS_CLK_RESET_DIV_RATIO(0x7) | MAC_CLK_RESET_MAC |
-		 MAC_CLK_RESET_SD_RX | MAC_CLK_RESET_SD_TX);
-	val |= MVPP22_MPCS_CLK_RESET_DIV_RATIO(1);
-	writel(val, mpcs + MVPP22_MPCS_CLK_RESET);
-
-	val &= ~MVPP22_MPCS_CLK_RESET_DIV_SET;
-	val |= MAC_CLK_RESET_MAC | MAC_CLK_RESET_SD_RX | MAC_CLK_RESET_SD_TX;
-	writel(val, mpcs + MVPP22_MPCS_CLK_RESET);
-}
-
-static int mvpp22_gop_init(struct mvpp2_port *port)
-{
-	struct mvpp2 *priv = port->priv;
-	u32 val;
-
-	if (!priv->sysctrl_base)
-		return 0;
-
-	switch (port->phy_interface) {
-	case PHY_INTERFACE_MODE_RGMII:
-	case PHY_INTERFACE_MODE_RGMII_ID:
-	case PHY_INTERFACE_MODE_RGMII_RXID:
-	case PHY_INTERFACE_MODE_RGMII_TXID:
-		if (port->gop_id == 0)
-			goto invalid_conf;
-		mvpp22_gop_init_rgmii(port);
-		break;
-	case PHY_INTERFACE_MODE_SGMII:
-	case PHY_INTERFACE_MODE_1000BASEX:
-	case PHY_INTERFACE_MODE_2500BASEX:
-		mvpp22_gop_init_sgmii(port);
-		break;
-	case PHY_INTERFACE_MODE_10GKR:
-		if (port->gop_id != 0)
-			goto invalid_conf;
-		mvpp22_gop_init_10gkr(port);
-		break;
-	default:
-		goto unsupported_conf;
-	}
-
-	regmap_read(priv->sysctrl_base, GENCONF_PORT_CTRL1, &val);
-	val |= GENCONF_PORT_CTRL1_RESET(port->gop_id) |
-	       GENCONF_PORT_CTRL1_EN(port->gop_id);
-	regmap_write(priv->sysctrl_base, GENCONF_PORT_CTRL1, val);
-
-	regmap_read(priv->sysctrl_base, GENCONF_PORT_CTRL0, &val);
-	val |= GENCONF_PORT_CTRL0_CLK_DIV_PHASE_CLR;
-	regmap_write(priv->sysctrl_base, GENCONF_PORT_CTRL0, val);
-
-	regmap_read(priv->sysctrl_base, GENCONF_SOFT_RESET1, &val);
-	val |= GENCONF_SOFT_RESET1_GOP;
-	regmap_write(priv->sysctrl_base, GENCONF_SOFT_RESET1, val);
-
-unsupported_conf:
-	return 0;
-
-invalid_conf:
-	netdev_err(port->dev, "Invalid port configuration\n");
-	return -EINVAL;
-}
-
-static void mvpp22_gop_unmask_irq(struct mvpp2_port *port)
-{
-	u32 val;
-
-	if (phy_interface_mode_is_rgmii(port->phy_interface) ||
-	    port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
-	    port->phy_interface == PHY_INTERFACE_MODE_1000BASEX ||
-	    port->phy_interface == PHY_INTERFACE_MODE_2500BASEX) {
-		/* Enable the GMAC link status irq for this port */
-		val = readl(port->base + MVPP22_GMAC_INT_SUM_MASK);
-		val |= MVPP22_GMAC_INT_SUM_MASK_LINK_STAT;
-		writel(val, port->base + MVPP22_GMAC_INT_SUM_MASK);
-	}
-
-	if (port->gop_id == 0) {
-		/* Enable the XLG/GIG irqs for this port */
-		val = readl(port->base + MVPP22_XLG_EXT_INT_MASK);
-		if (port->phy_interface == PHY_INTERFACE_MODE_10GKR)
-			val |= MVPP22_XLG_EXT_INT_MASK_XLG;
-		else
-			val |= MVPP22_XLG_EXT_INT_MASK_GIG;
-		writel(val, port->base + MVPP22_XLG_EXT_INT_MASK);
-	}
-}
-
-static void mvpp22_gop_mask_irq(struct mvpp2_port *port)
-{
-	u32 val;
-
-	if (port->gop_id == 0) {
-		val = readl(port->base + MVPP22_XLG_EXT_INT_MASK);
-		val &= ~(MVPP22_XLG_EXT_INT_MASK_XLG |
-			 MVPP22_XLG_EXT_INT_MASK_GIG);
-		writel(val, port->base + MVPP22_XLG_EXT_INT_MASK);
-	}
-
-	if (phy_interface_mode_is_rgmii(port->phy_interface) ||
-	    port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
-	    port->phy_interface == PHY_INTERFACE_MODE_1000BASEX ||
-	    port->phy_interface == PHY_INTERFACE_MODE_2500BASEX) {
-		val = readl(port->base + MVPP22_GMAC_INT_SUM_MASK);
-		val &= ~MVPP22_GMAC_INT_SUM_MASK_LINK_STAT;
-		writel(val, port->base + MVPP22_GMAC_INT_SUM_MASK);
-	}
-}
-
-static void mvpp22_gop_setup_irq(struct mvpp2_port *port)
-{
-	u32 val;
-
-	if (phy_interface_mode_is_rgmii(port->phy_interface) ||
-	    port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
-	    port->phy_interface == PHY_INTERFACE_MODE_1000BASEX ||
-	    port->phy_interface == PHY_INTERFACE_MODE_2500BASEX) {
-		val = readl(port->base + MVPP22_GMAC_INT_MASK);
-		val |= MVPP22_GMAC_INT_MASK_LINK_STAT;
-		writel(val, port->base + MVPP22_GMAC_INT_MASK);
-	}
-
-	if (port->gop_id == 0) {
-		val = readl(port->base + MVPP22_XLG_INT_MASK);
-		val |= MVPP22_XLG_INT_MASK_LINK;
-		writel(val, port->base + MVPP22_XLG_INT_MASK);
-	}
-
-	mvpp22_gop_unmask_irq(port);
-}
-
-/* Sets the PHY mode of the COMPHY (which configures the serdes lanes).
- *
- * The PHY mode used by the PPv2 driver comes from the network subsystem, while
- * the one given to the COMPHY comes from the generic PHY subsystem. Hence they
- * differ.
- *
- * The COMPHY configures the serdes lanes regardless of the actual use of the
- * lanes by the physical layer. This is why configurations like
- * "PPv2 (2500BaseX) - COMPHY (2500SGMII)" are valid.
- */
-static int mvpp22_comphy_init(struct mvpp2_port *port)
-{
-	enum phy_mode mode;
-	int ret;
-
-	if (!port->comphy)
-		return 0;
-
-	switch (port->phy_interface) {
-	case PHY_INTERFACE_MODE_SGMII:
-	case PHY_INTERFACE_MODE_1000BASEX:
-		mode = PHY_MODE_SGMII;
-		break;
-	case PHY_INTERFACE_MODE_2500BASEX:
-		mode = PHY_MODE_2500SGMII;
-		break;
-	case PHY_INTERFACE_MODE_10GKR:
-		mode = PHY_MODE_10GKR;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	ret = phy_set_mode(port->comphy, mode);
-	if (ret)
-		return ret;
-
-	return phy_power_on(port->comphy);
-}
-
-static void mvpp2_port_enable(struct mvpp2_port *port)
-{
-	u32 val;
-
-	/* Only GOP port 0 has an XLG MAC */
-	if (port->gop_id == 0 &&
-	    (port->phy_interface == PHY_INTERFACE_MODE_XAUI ||
-	     port->phy_interface == PHY_INTERFACE_MODE_10GKR)) {
-		val = readl(port->base + MVPP22_XLG_CTRL0_REG);
-		val |= MVPP22_XLG_CTRL0_PORT_EN |
-		       MVPP22_XLG_CTRL0_MAC_RESET_DIS;
-		val &= ~MVPP22_XLG_CTRL0_MIB_CNT_DIS;
-		writel(val, port->base + MVPP22_XLG_CTRL0_REG);
-	} else {
-		val = readl(port->base + MVPP2_GMAC_CTRL_0_REG);
-		val |= MVPP2_GMAC_PORT_EN_MASK;
-		val |= MVPP2_GMAC_MIB_CNTR_EN_MASK;
-		writel(val, port->base + MVPP2_GMAC_CTRL_0_REG);
-	}
-}
-
-static void mvpp2_port_disable(struct mvpp2_port *port)
-{
-	u32 val;
-
-	/* Only GOP port 0 has an XLG MAC */
-	if (port->gop_id == 0 &&
-	    (port->phy_interface == PHY_INTERFACE_MODE_XAUI ||
-	     port->phy_interface == PHY_INTERFACE_MODE_10GKR)) {
-		val = readl(port->base + MVPP22_XLG_CTRL0_REG);
-		val &= ~MVPP22_XLG_CTRL0_PORT_EN;
-		writel(val, port->base + MVPP22_XLG_CTRL0_REG);
-
-		/* Disable & reset should be done separately */
-		val &= ~MVPP22_XLG_CTRL0_MAC_RESET_DIS;
-		writel(val, port->base + MVPP22_XLG_CTRL0_REG);
-	} else {
-		val = readl(port->base + MVPP2_GMAC_CTRL_0_REG);
-		val &= ~(MVPP2_GMAC_PORT_EN_MASK);
-		writel(val, port->base + MVPP2_GMAC_CTRL_0_REG);
-	}
-}
-
-/* Set IEEE 802.3x Flow Control Xon Packet Transmission Mode */
-static void mvpp2_port_periodic_xon_disable(struct mvpp2_port *port)
-{
-	u32 val;
-
-	val = readl(port->base + MVPP2_GMAC_CTRL_1_REG) &
-		    ~MVPP2_GMAC_PERIODIC_XON_EN_MASK;
-	writel(val, port->base + MVPP2_GMAC_CTRL_1_REG);
-}
-
-/* Configure loopback port */
-static void mvpp2_port_loopback_set(struct mvpp2_port *port,
-				    const struct phylink_link_state *state)
-{
-	u32 val;
-
-	val = readl(port->base + MVPP2_GMAC_CTRL_1_REG);
-
-	if (state->speed == 1000)
-		val |= MVPP2_GMAC_GMII_LB_EN_MASK;
-	else
-		val &= ~MVPP2_GMAC_GMII_LB_EN_MASK;
-
-	if (port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
-	    port->phy_interface == PHY_INTERFACE_MODE_1000BASEX ||
-	    port->phy_interface == PHY_INTERFACE_MODE_2500BASEX)
-		val |= MVPP2_GMAC_PCS_LB_EN_MASK;
-	else
-		val &= ~MVPP2_GMAC_PCS_LB_EN_MASK;
-
-	writel(val, port->base + MVPP2_GMAC_CTRL_1_REG);
-}
-
-struct mvpp2_ethtool_counter {
-	unsigned int offset;
-	const char string[ETH_GSTRING_LEN];
-	bool reg_is_64b;
-};
-
-static u64 mvpp2_read_count(struct mvpp2_port *port,
-			    const struct mvpp2_ethtool_counter *counter)
-{
-	u64 val;
-
-	val = readl(port->stats_base + counter->offset);
-	if (counter->reg_is_64b)
-		val += (u64)readl(port->stats_base + counter->offset + 4) << 32;
-
-	return val;
-}
-
-/* Due to the fact that software statistics and hardware statistics are, by
- * design, incremented at different moments in the chain of packet processing,
- * it is very likely that incoming packets could have been dropped after being
- * counted by hardware but before reaching software statistics (most probably
- * multicast packets), and in the oppposite way, during transmission, FCS bytes
- * are added in between as well as TSO skb will be split and header bytes added.
- * Hence, statistics gathered from userspace with ifconfig (software) and
- * ethtool (hardware) cannot be compared.
- */
-static const struct mvpp2_ethtool_counter mvpp2_ethtool_regs[] = {
-	{ MVPP2_MIB_GOOD_OCTETS_RCVD, "good_octets_received", true },
-	{ MVPP2_MIB_BAD_OCTETS_RCVD, "bad_octets_received" },
-	{ MVPP2_MIB_CRC_ERRORS_SENT, "crc_errors_sent" },
-	{ MVPP2_MIB_UNICAST_FRAMES_RCVD, "unicast_frames_received" },
-	{ MVPP2_MIB_BROADCAST_FRAMES_RCVD, "broadcast_frames_received" },
-	{ MVPP2_MIB_MULTICAST_FRAMES_RCVD, "multicast_frames_received" },
-	{ MVPP2_MIB_FRAMES_64_OCTETS, "frames_64_octets" },
-	{ MVPP2_MIB_FRAMES_65_TO_127_OCTETS, "frames_65_to_127_octet" },
-	{ MVPP2_MIB_FRAMES_128_TO_255_OCTETS, "frames_128_to_255_octet" },
-	{ MVPP2_MIB_FRAMES_256_TO_511_OCTETS, "frames_256_to_511_octet" },
-	{ MVPP2_MIB_FRAMES_512_TO_1023_OCTETS, "frames_512_to_1023_octet" },
-	{ MVPP2_MIB_FRAMES_1024_TO_MAX_OCTETS, "frames_1024_to_max_octet" },
-	{ MVPP2_MIB_GOOD_OCTETS_SENT, "good_octets_sent", true },
-	{ MVPP2_MIB_UNICAST_FRAMES_SENT, "unicast_frames_sent" },
-	{ MVPP2_MIB_MULTICAST_FRAMES_SENT, "multicast_frames_sent" },
-	{ MVPP2_MIB_BROADCAST_FRAMES_SENT, "broadcast_frames_sent" },
-	{ MVPP2_MIB_FC_SENT, "fc_sent" },
-	{ MVPP2_MIB_FC_RCVD, "fc_received" },
-	{ MVPP2_MIB_RX_FIFO_OVERRUN, "rx_fifo_overrun" },
-	{ MVPP2_MIB_UNDERSIZE_RCVD, "undersize_received" },
-	{ MVPP2_MIB_FRAGMENTS_RCVD, "fragments_received" },
-	{ MVPP2_MIB_OVERSIZE_RCVD, "oversize_received" },
-	{ MVPP2_MIB_JABBER_RCVD, "jabber_received" },
-	{ MVPP2_MIB_MAC_RCV_ERROR, "mac_receive_error" },
-	{ MVPP2_MIB_BAD_CRC_EVENT, "bad_crc_event" },
-	{ MVPP2_MIB_COLLISION, "collision" },
-	{ MVPP2_MIB_LATE_COLLISION, "late_collision" },
-};
-
-static void mvpp2_ethtool_get_strings(struct net_device *netdev, u32 sset,
-				      u8 *data)
-{
-	if (sset == ETH_SS_STATS) {
-		int i;
-
-		for (i = 0; i < ARRAY_SIZE(mvpp2_ethtool_regs); i++)
-			memcpy(data + i * ETH_GSTRING_LEN,
-			       &mvpp2_ethtool_regs[i].string, ETH_GSTRING_LEN);
-	}
-}
-
-static void mvpp2_gather_hw_statistics(struct work_struct *work)
-{
-	struct delayed_work *del_work = to_delayed_work(work);
-	struct mvpp2_port *port = container_of(del_work, struct mvpp2_port,
-					       stats_work);
-	u64 *pstats;
-	int i;
-
-	mutex_lock(&port->gather_stats_lock);
-
-	pstats = port->ethtool_stats;
-	for (i = 0; i < ARRAY_SIZE(mvpp2_ethtool_regs); i++)
-		*pstats++ += mvpp2_read_count(port, &mvpp2_ethtool_regs[i]);
-
-	/* No need to read again the counters right after this function if it
-	 * was called asynchronously by the user (ie. use of ethtool).
-	 */
-	cancel_delayed_work(&port->stats_work);
-	queue_delayed_work(port->priv->stats_queue, &port->stats_work,
-			   MVPP2_MIB_COUNTERS_STATS_DELAY);
-
-	mutex_unlock(&port->gather_stats_lock);
-}
-
-static void mvpp2_ethtool_get_stats(struct net_device *dev,
-				    struct ethtool_stats *stats, u64 *data)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-
-	/* Update statistics for the given port, then take the lock to avoid
-	 * concurrent accesses on the ethtool_stats structure during its copy.
-	 */
-	mvpp2_gather_hw_statistics(&port->stats_work.work);
-
-	mutex_lock(&port->gather_stats_lock);
-	memcpy(data, port->ethtool_stats,
-	       sizeof(u64) * ARRAY_SIZE(mvpp2_ethtool_regs));
-	mutex_unlock(&port->gather_stats_lock);
-}
-
-static int mvpp2_ethtool_get_sset_count(struct net_device *dev, int sset)
-{
-	if (sset == ETH_SS_STATS)
-		return ARRAY_SIZE(mvpp2_ethtool_regs);
-
-	return -EOPNOTSUPP;
-}
-
-static void mvpp2_port_reset(struct mvpp2_port *port)
-{
-	u32 val;
-	unsigned int i;
-
-	/* Read the GOP statistics to reset the hardware counters */
-	for (i = 0; i < ARRAY_SIZE(mvpp2_ethtool_regs); i++)
-		mvpp2_read_count(port, &mvpp2_ethtool_regs[i]);
-
-	val = readl(port->base + MVPP2_GMAC_CTRL_2_REG) &
-		    ~MVPP2_GMAC_PORT_RESET_MASK;
-	writel(val, port->base + MVPP2_GMAC_CTRL_2_REG);
-
-	while (readl(port->base + MVPP2_GMAC_CTRL_2_REG) &
-	       MVPP2_GMAC_PORT_RESET_MASK)
-		continue;
-}
-
-/* Change maximum receive size of the port */
-static inline void mvpp2_gmac_max_rx_size_set(struct mvpp2_port *port)
-{
-	u32 val;
-
-	val = readl(port->base + MVPP2_GMAC_CTRL_0_REG);
-	val &= ~MVPP2_GMAC_MAX_RX_SIZE_MASK;
-	val |= (((port->pkt_size - MVPP2_MH_SIZE) / 2) <<
-		    MVPP2_GMAC_MAX_RX_SIZE_OFFS);
-	writel(val, port->base + MVPP2_GMAC_CTRL_0_REG);
-}
-
-/* Change maximum receive size of the port */
-static inline void mvpp2_xlg_max_rx_size_set(struct mvpp2_port *port)
-{
-	u32 val;
-
-	val =  readl(port->base + MVPP22_XLG_CTRL1_REG);
-	val &= ~MVPP22_XLG_CTRL1_FRAMESIZELIMIT_MASK;
-	val |= ((port->pkt_size - MVPP2_MH_SIZE) / 2) <<
-	       MVPP22_XLG_CTRL1_FRAMESIZELIMIT_OFFS;
-	writel(val, port->base + MVPP22_XLG_CTRL1_REG);
-}
-
-/* Set defaults to the MVPP2 port */
-static void mvpp2_defaults_set(struct mvpp2_port *port)
-{
-	int tx_port_num, val, queue, ptxq, lrxq;
-
-	if (port->priv->hw_version == MVPP21) {
-		/* Update TX FIFO MIN Threshold */
-		val = readl(port->base + MVPP2_GMAC_PORT_FIFO_CFG_1_REG);
-		val &= ~MVPP2_GMAC_TX_FIFO_MIN_TH_ALL_MASK;
-		/* Min. TX threshold must be less than minimal packet length */
-		val |= MVPP2_GMAC_TX_FIFO_MIN_TH_MASK(64 - 4 - 2);
-		writel(val, port->base + MVPP2_GMAC_PORT_FIFO_CFG_1_REG);
-	}
-
-	/* Disable Legacy WRR, Disable EJP, Release from reset */
-	tx_port_num = mvpp2_egress_port(port);
-	mvpp2_write(port->priv, MVPP2_TXP_SCHED_PORT_INDEX_REG,
-		    tx_port_num);
-	mvpp2_write(port->priv, MVPP2_TXP_SCHED_CMD_1_REG, 0);
-
-	/* Close bandwidth for all queues */
-	for (queue = 0; queue < MVPP2_MAX_TXQ; queue++) {
-		ptxq = mvpp2_txq_phys(port->id, queue);
-		mvpp2_write(port->priv,
-			    MVPP2_TXQ_SCHED_TOKEN_CNTR_REG(ptxq), 0);
-	}
-
-	/* Set refill period to 1 usec, refill tokens
-	 * and bucket size to maximum
-	 */
-	mvpp2_write(port->priv, MVPP2_TXP_SCHED_PERIOD_REG,
-		    port->priv->tclk / USEC_PER_SEC);
-	val = mvpp2_read(port->priv, MVPP2_TXP_SCHED_REFILL_REG);
-	val &= ~MVPP2_TXP_REFILL_PERIOD_ALL_MASK;
-	val |= MVPP2_TXP_REFILL_PERIOD_MASK(1);
-	val |= MVPP2_TXP_REFILL_TOKENS_ALL_MASK;
-	mvpp2_write(port->priv, MVPP2_TXP_SCHED_REFILL_REG, val);
-	val = MVPP2_TXP_TOKEN_SIZE_MAX;
-	mvpp2_write(port->priv, MVPP2_TXP_SCHED_TOKEN_SIZE_REG, val);
-
-	/* Set MaximumLowLatencyPacketSize value to 256 */
-	mvpp2_write(port->priv, MVPP2_RX_CTRL_REG(port->id),
-		    MVPP2_RX_USE_PSEUDO_FOR_CSUM_MASK |
-		    MVPP2_RX_LOW_LATENCY_PKT_SIZE(256));
-
-	/* Enable Rx cache snoop */
-	for (lrxq = 0; lrxq < port->nrxqs; lrxq++) {
-		queue = port->rxqs[lrxq]->id;
-		val = mvpp2_read(port->priv, MVPP2_RXQ_CONFIG_REG(queue));
-		val |= MVPP2_SNOOP_PKT_SIZE_MASK |
-			   MVPP2_SNOOP_BUF_HDR_MASK;
-		mvpp2_write(port->priv, MVPP2_RXQ_CONFIG_REG(queue), val);
-	}
-
-	/* At default, mask all interrupts to all present cpus */
-	mvpp2_interrupts_disable(port);
-}
-
-/* Enable/disable receiving packets */
-static void mvpp2_ingress_enable(struct mvpp2_port *port)
-{
-	u32 val;
-	int lrxq, queue;
-
-	for (lrxq = 0; lrxq < port->nrxqs; lrxq++) {
-		queue = port->rxqs[lrxq]->id;
-		val = mvpp2_read(port->priv, MVPP2_RXQ_CONFIG_REG(queue));
-		val &= ~MVPP2_RXQ_DISABLE_MASK;
-		mvpp2_write(port->priv, MVPP2_RXQ_CONFIG_REG(queue), val);
-	}
-}
-
-static void mvpp2_ingress_disable(struct mvpp2_port *port)
-{
-	u32 val;
-	int lrxq, queue;
-
-	for (lrxq = 0; lrxq < port->nrxqs; lrxq++) {
-		queue = port->rxqs[lrxq]->id;
-		val = mvpp2_read(port->priv, MVPP2_RXQ_CONFIG_REG(queue));
-		val |= MVPP2_RXQ_DISABLE_MASK;
-		mvpp2_write(port->priv, MVPP2_RXQ_CONFIG_REG(queue), val);
-	}
-}
-
-/* Enable transmit via physical egress queue
- * - HW starts take descriptors from DRAM
- */
-static void mvpp2_egress_enable(struct mvpp2_port *port)
-{
-	u32 qmap;
-	int queue;
-	int tx_port_num = mvpp2_egress_port(port);
-
-	/* Enable all initialized TXs. */
-	qmap = 0;
-	for (queue = 0; queue < port->ntxqs; queue++) {
-		struct mvpp2_tx_queue *txq = port->txqs[queue];
-
-		if (txq->descs)
-			qmap |= (1 << queue);
-	}
-
-	mvpp2_write(port->priv, MVPP2_TXP_SCHED_PORT_INDEX_REG, tx_port_num);
-	mvpp2_write(port->priv, MVPP2_TXP_SCHED_Q_CMD_REG, qmap);
-}
-
-/* Disable transmit via physical egress queue
- * - HW doesn't take descriptors from DRAM
- */
-static void mvpp2_egress_disable(struct mvpp2_port *port)
-{
-	u32 reg_data;
-	int delay;
-	int tx_port_num = mvpp2_egress_port(port);
-
-	/* Issue stop command for active channels only */
-	mvpp2_write(port->priv, MVPP2_TXP_SCHED_PORT_INDEX_REG, tx_port_num);
-	reg_data = (mvpp2_read(port->priv, MVPP2_TXP_SCHED_Q_CMD_REG)) &
-		    MVPP2_TXP_SCHED_ENQ_MASK;
-	if (reg_data != 0)
-		mvpp2_write(port->priv, MVPP2_TXP_SCHED_Q_CMD_REG,
-			    (reg_data << MVPP2_TXP_SCHED_DISQ_OFFSET));
-
-	/* Wait for all Tx activity to terminate. */
-	delay = 0;
-	do {
-		if (delay >= MVPP2_TX_DISABLE_TIMEOUT_MSEC) {
-			netdev_warn(port->dev,
-				    "Tx stop timed out, status=0x%08x\n",
-				    reg_data);
-			break;
-		}
-		mdelay(1);
-		delay++;
-
-		/* Check port TX Command register that all
-		 * Tx queues are stopped
-		 */
-		reg_data = mvpp2_read(port->priv, MVPP2_TXP_SCHED_Q_CMD_REG);
-	} while (reg_data & MVPP2_TXP_SCHED_ENQ_MASK);
-}
-
-/* Rx descriptors helper methods */
-
-/* Get number of Rx descriptors occupied by received packets */
-static inline int
-mvpp2_rxq_received(struct mvpp2_port *port, int rxq_id)
-{
-	u32 val = mvpp2_read(port->priv, MVPP2_RXQ_STATUS_REG(rxq_id));
-
-	return val & MVPP2_RXQ_OCCUPIED_MASK;
-}
-
-/* Update Rx queue status with the number of occupied and available
- * Rx descriptor slots.
- */
-static inline void
-mvpp2_rxq_status_update(struct mvpp2_port *port, int rxq_id,
-			int used_count, int free_count)
-{
-	/* Decrement the number of used descriptors and increment count
-	 * increment the number of free descriptors.
-	 */
-	u32 val = used_count | (free_count << MVPP2_RXQ_NUM_NEW_OFFSET);
-
-	mvpp2_write(port->priv, MVPP2_RXQ_STATUS_UPDATE_REG(rxq_id), val);
-}
-
-/* Get pointer to next RX descriptor to be processed by SW */
-static inline struct mvpp2_rx_desc *
-mvpp2_rxq_next_desc_get(struct mvpp2_rx_queue *rxq)
-{
-	int rx_desc = rxq->next_desc_to_proc;
-
-	rxq->next_desc_to_proc = MVPP2_QUEUE_NEXT_DESC(rxq, rx_desc);
-	prefetch(rxq->descs + rxq->next_desc_to_proc);
-	return rxq->descs + rx_desc;
-}
-
-/* Set rx queue offset */
-static void mvpp2_rxq_offset_set(struct mvpp2_port *port,
-				 int prxq, int offset)
-{
-	u32 val;
-
-	/* Convert offset from bytes to units of 32 bytes */
-	offset = offset >> 5;
-
-	val = mvpp2_read(port->priv, MVPP2_RXQ_CONFIG_REG(prxq));
-	val &= ~MVPP2_RXQ_PACKET_OFFSET_MASK;
-
-	/* Offset is in */
-	val |= ((offset << MVPP2_RXQ_PACKET_OFFSET_OFFS) &
-		    MVPP2_RXQ_PACKET_OFFSET_MASK);
-
-	mvpp2_write(port->priv, MVPP2_RXQ_CONFIG_REG(prxq), val);
-}
-
-/* Tx descriptors helper methods */
-
-/* Get pointer to next Tx descriptor to be processed (send) by HW */
-static struct mvpp2_tx_desc *
-mvpp2_txq_next_desc_get(struct mvpp2_tx_queue *txq)
-{
-	int tx_desc = txq->next_desc_to_proc;
-
-	txq->next_desc_to_proc = MVPP2_QUEUE_NEXT_DESC(txq, tx_desc);
-	return txq->descs + tx_desc;
-}
-
-/* Update HW with number of aggregated Tx descriptors to be sent
- *
- * Called only from mvpp2_tx(), so migration is disabled, using
- * smp_processor_id() is OK.
- */
-static void mvpp2_aggr_txq_pend_desc_add(struct mvpp2_port *port, int pending)
-{
-	/* aggregated access - relevant TXQ number is written in TX desc */
-	mvpp2_percpu_write(port->priv, smp_processor_id(),
-			   MVPP2_AGGR_TXQ_UPDATE_REG, pending);
-}
-
-/* Check if there are enough free descriptors in aggregated txq.
- * If not, update the number of occupied descriptors and repeat the check.
- *
- * Called only from mvpp2_tx(), so migration is disabled, using
- * smp_processor_id() is OK.
- */
-static int mvpp2_aggr_desc_num_check(struct mvpp2 *priv,
-				     struct mvpp2_tx_queue *aggr_txq, int num)
-{
-	if ((aggr_txq->count + num) > MVPP2_AGGR_TXQ_SIZE) {
-		/* Update number of occupied aggregated Tx descriptors */
-		int cpu = smp_processor_id();
-		u32 val = mvpp2_read_relaxed(priv,
-					     MVPP2_AGGR_TXQ_STATUS_REG(cpu));
-
-		aggr_txq->count = val & MVPP2_AGGR_TXQ_PENDING_MASK;
-
-		if ((aggr_txq->count + num) > MVPP2_AGGR_TXQ_SIZE)
-			return -ENOMEM;
-	}
-	return 0;
-}
-
-/* Reserved Tx descriptors allocation request
- *
- * Called only from mvpp2_txq_reserved_desc_num_proc(), itself called
- * only by mvpp2_tx(), so migration is disabled, using
- * smp_processor_id() is OK.
- */
-static int mvpp2_txq_alloc_reserved_desc(struct mvpp2 *priv,
-					 struct mvpp2_tx_queue *txq, int num)
-{
-	u32 val;
-	int cpu = smp_processor_id();
-
-	val = (txq->id << MVPP2_TXQ_RSVD_REQ_Q_OFFSET) | num;
-	mvpp2_percpu_write_relaxed(priv, cpu, MVPP2_TXQ_RSVD_REQ_REG, val);
-
-	val = mvpp2_percpu_read_relaxed(priv, cpu, MVPP2_TXQ_RSVD_RSLT_REG);
-
-	return val & MVPP2_TXQ_RSVD_RSLT_MASK;
-}
-
-/* Check if there are enough reserved descriptors for transmission.
- * If not, request chunk of reserved descriptors and check again.
- */
-static int mvpp2_txq_reserved_desc_num_proc(struct mvpp2 *priv,
-					    struct mvpp2_tx_queue *txq,
-					    struct mvpp2_txq_pcpu *txq_pcpu,
-					    int num)
-{
-	int req, cpu, desc_count;
-
-	if (txq_pcpu->reserved_num >= num)
-		return 0;
-
-	/* Not enough descriptors reserved! Update the reserved descriptor
-	 * count and check again.
-	 */
-
-	desc_count = 0;
-	/* Compute total of used descriptors */
-	for_each_present_cpu(cpu) {
-		struct mvpp2_txq_pcpu *txq_pcpu_aux;
-
-		txq_pcpu_aux = per_cpu_ptr(txq->pcpu, cpu);
-		desc_count += txq_pcpu_aux->count;
-		desc_count += txq_pcpu_aux->reserved_num;
-	}
-
-	req = max(MVPP2_CPU_DESC_CHUNK, num - txq_pcpu->reserved_num);
-	desc_count += req;
-
-	if (desc_count >
-	   (txq->size - (num_present_cpus() * MVPP2_CPU_DESC_CHUNK)))
-		return -ENOMEM;
-
-	txq_pcpu->reserved_num += mvpp2_txq_alloc_reserved_desc(priv, txq, req);
-
-	/* OK, the descriptor could have been updated: check again. */
-	if (txq_pcpu->reserved_num < num)
-		return -ENOMEM;
-	return 0;
-}
-
-/* Release the last allocated Tx descriptor. Useful to handle DMA
- * mapping failures in the Tx path.
- */
-static void mvpp2_txq_desc_put(struct mvpp2_tx_queue *txq)
-{
-	if (txq->next_desc_to_proc == 0)
-		txq->next_desc_to_proc = txq->last_desc - 1;
-	else
-		txq->next_desc_to_proc--;
-}
-
-/* Set Tx descriptors fields relevant for CSUM calculation */
-static u32 mvpp2_txq_desc_csum(int l3_offs, int l3_proto,
-			       int ip_hdr_len, int l4_proto)
-{
-	u32 command;
-
-	/* fields: L3_offset, IP_hdrlen, L3_type, G_IPv4_chk,
-	 * G_L4_chk, L4_type required only for checksum calculation
-	 */
-	command = (l3_offs << MVPP2_TXD_L3_OFF_SHIFT);
-	command |= (ip_hdr_len << MVPP2_TXD_IP_HLEN_SHIFT);
-	command |= MVPP2_TXD_IP_CSUM_DISABLE;
-
-	if (l3_proto == swab16(ETH_P_IP)) {
-		command &= ~MVPP2_TXD_IP_CSUM_DISABLE;	/* enable IPv4 csum */
-		command &= ~MVPP2_TXD_L3_IP6;		/* enable IPv4 */
-	} else {
-		command |= MVPP2_TXD_L3_IP6;		/* enable IPv6 */
-	}
-
-	if (l4_proto == IPPROTO_TCP) {
-		command &= ~MVPP2_TXD_L4_UDP;		/* enable TCP */
-		command &= ~MVPP2_TXD_L4_CSUM_FRAG;	/* generate L4 csum */
-	} else if (l4_proto == IPPROTO_UDP) {
-		command |= MVPP2_TXD_L4_UDP;		/* enable UDP */
-		command &= ~MVPP2_TXD_L4_CSUM_FRAG;	/* generate L4 csum */
-	} else {
-		command |= MVPP2_TXD_L4_CSUM_NOT;
-	}
-
-	return command;
-}
-
-/* Get number of sent descriptors and decrement counter.
- * The number of sent descriptors is returned.
- * Per-CPU access
- *
- * Called only from mvpp2_txq_done(), called from mvpp2_tx()
- * (migration disabled) and from the TX completion tasklet (migration
- * disabled) so using smp_processor_id() is OK.
- */
-static inline int mvpp2_txq_sent_desc_proc(struct mvpp2_port *port,
-					   struct mvpp2_tx_queue *txq)
-{
-	u32 val;
-
-	/* Reading status reg resets transmitted descriptor counter */
-	val = mvpp2_percpu_read_relaxed(port->priv, smp_processor_id(),
-					MVPP2_TXQ_SENT_REG(txq->id));
-
-	return (val & MVPP2_TRANSMITTED_COUNT_MASK) >>
-		MVPP2_TRANSMITTED_COUNT_OFFSET;
-}
-
-/* Called through on_each_cpu(), so runs on all CPUs, with migration
- * disabled, therefore using smp_processor_id() is OK.
- */
-static void mvpp2_txq_sent_counter_clear(void *arg)
-{
-	struct mvpp2_port *port = arg;
-	int queue;
-
-	for (queue = 0; queue < port->ntxqs; queue++) {
-		int id = port->txqs[queue]->id;
-
-		mvpp2_percpu_read(port->priv, smp_processor_id(),
-				  MVPP2_TXQ_SENT_REG(id));
-	}
-}
-
-/* Set max sizes for Tx queues */
-static void mvpp2_txp_max_tx_size_set(struct mvpp2_port *port)
-{
-	u32	val, size, mtu;
-	int	txq, tx_port_num;
-
-	mtu = port->pkt_size * 8;
-	if (mtu > MVPP2_TXP_MTU_MAX)
-		mtu = MVPP2_TXP_MTU_MAX;
-
-	/* WA for wrong Token bucket update: Set MTU value = 3*real MTU value */
-	mtu = 3 * mtu;
-
-	/* Indirect access to registers */
-	tx_port_num = mvpp2_egress_port(port);
-	mvpp2_write(port->priv, MVPP2_TXP_SCHED_PORT_INDEX_REG, tx_port_num);
-
-	/* Set MTU */
-	val = mvpp2_read(port->priv, MVPP2_TXP_SCHED_MTU_REG);
-	val &= ~MVPP2_TXP_MTU_MAX;
-	val |= mtu;
-	mvpp2_write(port->priv, MVPP2_TXP_SCHED_MTU_REG, val);
-
-	/* TXP token size and all TXQs token size must be larger that MTU */
-	val = mvpp2_read(port->priv, MVPP2_TXP_SCHED_TOKEN_SIZE_REG);
-	size = val & MVPP2_TXP_TOKEN_SIZE_MAX;
-	if (size < mtu) {
-		size = mtu;
-		val &= ~MVPP2_TXP_TOKEN_SIZE_MAX;
-		val |= size;
-		mvpp2_write(port->priv, MVPP2_TXP_SCHED_TOKEN_SIZE_REG, val);
-	}
-
-	for (txq = 0; txq < port->ntxqs; txq++) {
-		val = mvpp2_read(port->priv,
-				 MVPP2_TXQ_SCHED_TOKEN_SIZE_REG(txq));
-		size = val & MVPP2_TXQ_TOKEN_SIZE_MAX;
-
-		if (size < mtu) {
-			size = mtu;
-			val &= ~MVPP2_TXQ_TOKEN_SIZE_MAX;
-			val |= size;
-			mvpp2_write(port->priv,
-				    MVPP2_TXQ_SCHED_TOKEN_SIZE_REG(txq),
-				    val);
-		}
-	}
-}
-
-/* Set the number of packets that will be received before Rx interrupt
- * will be generated by HW.
- */
-static void mvpp2_rx_pkts_coal_set(struct mvpp2_port *port,
-				   struct mvpp2_rx_queue *rxq)
-{
-	int cpu = get_cpu();
-
-	if (rxq->pkts_coal > MVPP2_OCCUPIED_THRESH_MASK)
-		rxq->pkts_coal = MVPP2_OCCUPIED_THRESH_MASK;
-
-	mvpp2_percpu_write(port->priv, cpu, MVPP2_RXQ_NUM_REG, rxq->id);
-	mvpp2_percpu_write(port->priv, cpu, MVPP2_RXQ_THRESH_REG,
-			   rxq->pkts_coal);
-
-	put_cpu();
-}
-
-/* For some reason in the LSP this is done on each CPU. Why ? */
-static void mvpp2_tx_pkts_coal_set(struct mvpp2_port *port,
-				   struct mvpp2_tx_queue *txq)
-{
-	int cpu = get_cpu();
-	u32 val;
-
-	if (txq->done_pkts_coal > MVPP2_TXQ_THRESH_MASK)
-		txq->done_pkts_coal = MVPP2_TXQ_THRESH_MASK;
-
-	val = (txq->done_pkts_coal << MVPP2_TXQ_THRESH_OFFSET);
-	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_NUM_REG, txq->id);
-	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_THRESH_REG, val);
-
-	put_cpu();
-}
-
-static u32 mvpp2_usec_to_cycles(u32 usec, unsigned long clk_hz)
-{
-	u64 tmp = (u64)clk_hz * usec;
-
-	do_div(tmp, USEC_PER_SEC);
-
-	return tmp > U32_MAX ? U32_MAX : tmp;
-}
-
-static u32 mvpp2_cycles_to_usec(u32 cycles, unsigned long clk_hz)
-{
-	u64 tmp = (u64)cycles * USEC_PER_SEC;
-
-	do_div(tmp, clk_hz);
-
-	return tmp > U32_MAX ? U32_MAX : tmp;
-}
-
-/* Set the time delay in usec before Rx interrupt */
-static void mvpp2_rx_time_coal_set(struct mvpp2_port *port,
-				   struct mvpp2_rx_queue *rxq)
-{
-	unsigned long freq = port->priv->tclk;
-	u32 val = mvpp2_usec_to_cycles(rxq->time_coal, freq);
-
-	if (val > MVPP2_MAX_ISR_RX_THRESHOLD) {
-		rxq->time_coal =
-			mvpp2_cycles_to_usec(MVPP2_MAX_ISR_RX_THRESHOLD, freq);
-
-		/* re-evaluate to get actual register value */
-		val = mvpp2_usec_to_cycles(rxq->time_coal, freq);
-	}
-
-	mvpp2_write(port->priv, MVPP2_ISR_RX_THRESHOLD_REG(rxq->id), val);
-}
-
-static void mvpp2_tx_time_coal_set(struct mvpp2_port *port)
-{
-	unsigned long freq = port->priv->tclk;
-	u32 val = mvpp2_usec_to_cycles(port->tx_time_coal, freq);
-
-	if (val > MVPP2_MAX_ISR_TX_THRESHOLD) {
-		port->tx_time_coal =
-			mvpp2_cycles_to_usec(MVPP2_MAX_ISR_TX_THRESHOLD, freq);
-
-		/* re-evaluate to get actual register value */
-		val = mvpp2_usec_to_cycles(port->tx_time_coal, freq);
-	}
-
-	mvpp2_write(port->priv, MVPP2_ISR_TX_THRESHOLD_REG(port->id), val);
-}
-
-/* Free Tx queue skbuffs */
-static void mvpp2_txq_bufs_free(struct mvpp2_port *port,
-				struct mvpp2_tx_queue *txq,
-				struct mvpp2_txq_pcpu *txq_pcpu, int num)
-{
-	int i;
-
-	for (i = 0; i < num; i++) {
-		struct mvpp2_txq_pcpu_buf *tx_buf =
-			txq_pcpu->buffs + txq_pcpu->txq_get_index;
-
-		if (!IS_TSO_HEADER(txq_pcpu, tx_buf->dma))
-			dma_unmap_single(port->dev->dev.parent, tx_buf->dma,
-					 tx_buf->size, DMA_TO_DEVICE);
-		if (tx_buf->skb)
-			dev_kfree_skb_any(tx_buf->skb);
-
-		mvpp2_txq_inc_get(txq_pcpu);
-	}
-}
-
-static inline struct mvpp2_rx_queue *mvpp2_get_rx_queue(struct mvpp2_port *port,
-							u32 cause)
-{
-	int queue = fls(cause) - 1;
-
-	return port->rxqs[queue];
-}
-
-static inline struct mvpp2_tx_queue *mvpp2_get_tx_queue(struct mvpp2_port *port,
-							u32 cause)
-{
-	int queue = fls(cause) - 1;
-
-	return port->txqs[queue];
-}
-
-/* Handle end of transmission */
-static void mvpp2_txq_done(struct mvpp2_port *port, struct mvpp2_tx_queue *txq,
-			   struct mvpp2_txq_pcpu *txq_pcpu)
-{
-	struct netdev_queue *nq = netdev_get_tx_queue(port->dev, txq->log_id);
-	int tx_done;
-
-	if (txq_pcpu->cpu != smp_processor_id())
-		netdev_err(port->dev, "wrong cpu on the end of Tx processing\n");
-
-	tx_done = mvpp2_txq_sent_desc_proc(port, txq);
-	if (!tx_done)
-		return;
-	mvpp2_txq_bufs_free(port, txq, txq_pcpu, tx_done);
-
-	txq_pcpu->count -= tx_done;
-
-	if (netif_tx_queue_stopped(nq))
-		if (txq_pcpu->count <= txq_pcpu->wake_threshold)
-			netif_tx_wake_queue(nq);
-}
-
-static unsigned int mvpp2_tx_done(struct mvpp2_port *port, u32 cause,
-				  int cpu)
-{
-	struct mvpp2_tx_queue *txq;
-	struct mvpp2_txq_pcpu *txq_pcpu;
-	unsigned int tx_todo = 0;
-
-	while (cause) {
-		txq = mvpp2_get_tx_queue(port, cause);
-		if (!txq)
-			break;
-
-		txq_pcpu = per_cpu_ptr(txq->pcpu, cpu);
-
-		if (txq_pcpu->count) {
-			mvpp2_txq_done(port, txq, txq_pcpu);
-			tx_todo += txq_pcpu->count;
-		}
-
-		cause &= ~(1 << txq->log_id);
-	}
-	return tx_todo;
-}
-
-/* Rx/Tx queue initialization/cleanup methods */
-
-/* Allocate and initialize descriptors for aggr TXQ */
-static int mvpp2_aggr_txq_init(struct platform_device *pdev,
-			       struct mvpp2_tx_queue *aggr_txq, int cpu,
-			       struct mvpp2 *priv)
-{
-	u32 txq_dma;
-
-	/* Allocate memory for TX descriptors */
-	aggr_txq->descs = dma_zalloc_coherent(&pdev->dev,
-				MVPP2_AGGR_TXQ_SIZE * MVPP2_DESC_ALIGNED_SIZE,
-				&aggr_txq->descs_dma, GFP_KERNEL);
-	if (!aggr_txq->descs)
-		return -ENOMEM;
-
-	aggr_txq->last_desc = MVPP2_AGGR_TXQ_SIZE - 1;
-
-	/* Aggr TXQ no reset WA */
-	aggr_txq->next_desc_to_proc = mvpp2_read(priv,
-						 MVPP2_AGGR_TXQ_INDEX_REG(cpu));
-
-	/* Set Tx descriptors queue starting address indirect
-	 * access
-	 */
-	if (priv->hw_version == MVPP21)
-		txq_dma = aggr_txq->descs_dma;
-	else
-		txq_dma = aggr_txq->descs_dma >>
-			MVPP22_AGGR_TXQ_DESC_ADDR_OFFS;
-
-	mvpp2_write(priv, MVPP2_AGGR_TXQ_DESC_ADDR_REG(cpu), txq_dma);
-	mvpp2_write(priv, MVPP2_AGGR_TXQ_DESC_SIZE_REG(cpu),
-		    MVPP2_AGGR_TXQ_SIZE);
-
-	return 0;
-}
-
-/* Create a specified Rx queue */
-static int mvpp2_rxq_init(struct mvpp2_port *port,
-			  struct mvpp2_rx_queue *rxq)
-
-{
-	u32 rxq_dma;
-	int cpu;
-
-	rxq->size = port->rx_ring_size;
-
-	/* Allocate memory for RX descriptors */
-	rxq->descs = dma_alloc_coherent(port->dev->dev.parent,
-					rxq->size * MVPP2_DESC_ALIGNED_SIZE,
-					&rxq->descs_dma, GFP_KERNEL);
-	if (!rxq->descs)
-		return -ENOMEM;
-
-	rxq->last_desc = rxq->size - 1;
-
-	/* Zero occupied and non-occupied counters - direct access */
-	mvpp2_write(port->priv, MVPP2_RXQ_STATUS_REG(rxq->id), 0);
-
-	/* Set Rx descriptors queue starting address - indirect access */
-	cpu = get_cpu();
-	mvpp2_percpu_write(port->priv, cpu, MVPP2_RXQ_NUM_REG, rxq->id);
-	if (port->priv->hw_version == MVPP21)
-		rxq_dma = rxq->descs_dma;
-	else
-		rxq_dma = rxq->descs_dma >> MVPP22_DESC_ADDR_OFFS;
-	mvpp2_percpu_write(port->priv, cpu, MVPP2_RXQ_DESC_ADDR_REG, rxq_dma);
-	mvpp2_percpu_write(port->priv, cpu, MVPP2_RXQ_DESC_SIZE_REG, rxq->size);
-	mvpp2_percpu_write(port->priv, cpu, MVPP2_RXQ_INDEX_REG, 0);
-	put_cpu();
-
-	/* Set Offset */
-	mvpp2_rxq_offset_set(port, rxq->id, NET_SKB_PAD);
-
-	/* Set coalescing pkts and time */
-	mvpp2_rx_pkts_coal_set(port, rxq);
-	mvpp2_rx_time_coal_set(port, rxq);
-
-	/* Add number of descriptors ready for receiving packets */
-	mvpp2_rxq_status_update(port, rxq->id, 0, rxq->size);
-
-	return 0;
-}
-
-/* Push packets received by the RXQ to BM pool */
-static void mvpp2_rxq_drop_pkts(struct mvpp2_port *port,
-				struct mvpp2_rx_queue *rxq)
-{
-	int rx_received, i;
-
-	rx_received = mvpp2_rxq_received(port, rxq->id);
-	if (!rx_received)
-		return;
-
-	for (i = 0; i < rx_received; i++) {
-		struct mvpp2_rx_desc *rx_desc = mvpp2_rxq_next_desc_get(rxq);
-		u32 status = mvpp2_rxdesc_status_get(port, rx_desc);
-		int pool;
-
-		pool = (status & MVPP2_RXD_BM_POOL_ID_MASK) >>
-			MVPP2_RXD_BM_POOL_ID_OFFS;
-
-		mvpp2_bm_pool_put(port, pool,
-				  mvpp2_rxdesc_dma_addr_get(port, rx_desc),
-				  mvpp2_rxdesc_cookie_get(port, rx_desc));
-	}
-	mvpp2_rxq_status_update(port, rxq->id, rx_received, rx_received);
-}
-
-/* Cleanup Rx queue */
-static void mvpp2_rxq_deinit(struct mvpp2_port *port,
-			     struct mvpp2_rx_queue *rxq)
-{
-	int cpu;
-
-	mvpp2_rxq_drop_pkts(port, rxq);
-
-	if (rxq->descs)
-		dma_free_coherent(port->dev->dev.parent,
-				  rxq->size * MVPP2_DESC_ALIGNED_SIZE,
-				  rxq->descs,
-				  rxq->descs_dma);
-
-	rxq->descs             = NULL;
-	rxq->last_desc         = 0;
-	rxq->next_desc_to_proc = 0;
-	rxq->descs_dma         = 0;
-
-	/* Clear Rx descriptors queue starting address and size;
-	 * free descriptor number
-	 */
-	mvpp2_write(port->priv, MVPP2_RXQ_STATUS_REG(rxq->id), 0);
-	cpu = get_cpu();
-	mvpp2_percpu_write(port->priv, cpu, MVPP2_RXQ_NUM_REG, rxq->id);
-	mvpp2_percpu_write(port->priv, cpu, MVPP2_RXQ_DESC_ADDR_REG, 0);
-	mvpp2_percpu_write(port->priv, cpu, MVPP2_RXQ_DESC_SIZE_REG, 0);
-	put_cpu();
-}
-
-/* Create and initialize a Tx queue */
-static int mvpp2_txq_init(struct mvpp2_port *port,
-			  struct mvpp2_tx_queue *txq)
-{
-	u32 val;
-	int cpu, desc, desc_per_txq, tx_port_num;
-	struct mvpp2_txq_pcpu *txq_pcpu;
-
-	txq->size = port->tx_ring_size;
-
-	/* Allocate memory for Tx descriptors */
-	txq->descs = dma_alloc_coherent(port->dev->dev.parent,
-				txq->size * MVPP2_DESC_ALIGNED_SIZE,
-				&txq->descs_dma, GFP_KERNEL);
-	if (!txq->descs)
-		return -ENOMEM;
-
-	txq->last_desc = txq->size - 1;
-
-	/* Set Tx descriptors queue starting address - indirect access */
-	cpu = get_cpu();
-	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_NUM_REG, txq->id);
-	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_DESC_ADDR_REG,
-			   txq->descs_dma);
-	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_DESC_SIZE_REG,
-			   txq->size & MVPP2_TXQ_DESC_SIZE_MASK);
-	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_INDEX_REG, 0);
-	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_RSVD_CLR_REG,
-			   txq->id << MVPP2_TXQ_RSVD_CLR_OFFSET);
-	val = mvpp2_percpu_read(port->priv, cpu, MVPP2_TXQ_PENDING_REG);
-	val &= ~MVPP2_TXQ_PENDING_MASK;
-	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_PENDING_REG, val);
-
-	/* Calculate base address in prefetch buffer. We reserve 16 descriptors
-	 * for each existing TXQ.
-	 * TCONTS for PON port must be continuous from 0 to MVPP2_MAX_TCONT
-	 * GBE ports assumed to be continuous from 0 to MVPP2_MAX_PORTS
-	 */
-	desc_per_txq = 16;
-	desc = (port->id * MVPP2_MAX_TXQ * desc_per_txq) +
-	       (txq->log_id * desc_per_txq);
-
-	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_PREF_BUF_REG,
-			   MVPP2_PREF_BUF_PTR(desc) | MVPP2_PREF_BUF_SIZE_16 |
-			   MVPP2_PREF_BUF_THRESH(desc_per_txq / 2));
-	put_cpu();
-
-	/* WRR / EJP configuration - indirect access */
-	tx_port_num = mvpp2_egress_port(port);
-	mvpp2_write(port->priv, MVPP2_TXP_SCHED_PORT_INDEX_REG, tx_port_num);
-
-	val = mvpp2_read(port->priv, MVPP2_TXQ_SCHED_REFILL_REG(txq->log_id));
-	val &= ~MVPP2_TXQ_REFILL_PERIOD_ALL_MASK;
-	val |= MVPP2_TXQ_REFILL_PERIOD_MASK(1);
-	val |= MVPP2_TXQ_REFILL_TOKENS_ALL_MASK;
-	mvpp2_write(port->priv, MVPP2_TXQ_SCHED_REFILL_REG(txq->log_id), val);
-
-	val = MVPP2_TXQ_TOKEN_SIZE_MAX;
-	mvpp2_write(port->priv, MVPP2_TXQ_SCHED_TOKEN_SIZE_REG(txq->log_id),
-		    val);
-
-	for_each_present_cpu(cpu) {
-		txq_pcpu = per_cpu_ptr(txq->pcpu, cpu);
-		txq_pcpu->size = txq->size;
-		txq_pcpu->buffs = kmalloc_array(txq_pcpu->size,
-						sizeof(*txq_pcpu->buffs),
-						GFP_KERNEL);
-		if (!txq_pcpu->buffs)
-			return -ENOMEM;
-
-		txq_pcpu->count = 0;
-		txq_pcpu->reserved_num = 0;
-		txq_pcpu->txq_put_index = 0;
-		txq_pcpu->txq_get_index = 0;
-		txq_pcpu->tso_headers = NULL;
-
-		txq_pcpu->stop_threshold = txq->size - MVPP2_MAX_SKB_DESCS;
-		txq_pcpu->wake_threshold = txq_pcpu->stop_threshold / 2;
-
-		txq_pcpu->tso_headers =
-			dma_alloc_coherent(port->dev->dev.parent,
-					   txq_pcpu->size * TSO_HEADER_SIZE,
-					   &txq_pcpu->tso_headers_dma,
-					   GFP_KERNEL);
-		if (!txq_pcpu->tso_headers)
-			return -ENOMEM;
-	}
-
-	return 0;
-}
-
-/* Free allocated TXQ resources */
-static void mvpp2_txq_deinit(struct mvpp2_port *port,
-			     struct mvpp2_tx_queue *txq)
-{
-	struct mvpp2_txq_pcpu *txq_pcpu;
-	int cpu;
-
-	for_each_present_cpu(cpu) {
-		txq_pcpu = per_cpu_ptr(txq->pcpu, cpu);
-		kfree(txq_pcpu->buffs);
-
-		if (txq_pcpu->tso_headers)
-			dma_free_coherent(port->dev->dev.parent,
-					  txq_pcpu->size * TSO_HEADER_SIZE,
-					  txq_pcpu->tso_headers,
-					  txq_pcpu->tso_headers_dma);
-
-		txq_pcpu->tso_headers = NULL;
-	}
-
-	if (txq->descs)
-		dma_free_coherent(port->dev->dev.parent,
-				  txq->size * MVPP2_DESC_ALIGNED_SIZE,
-				  txq->descs, txq->descs_dma);
-
-	txq->descs             = NULL;
-	txq->last_desc         = 0;
-	txq->next_desc_to_proc = 0;
-	txq->descs_dma         = 0;
-
-	/* Set minimum bandwidth for disabled TXQs */
-	mvpp2_write(port->priv, MVPP2_TXQ_SCHED_TOKEN_CNTR_REG(txq->id), 0);
-
-	/* Set Tx descriptors queue starting address and size */
-	cpu = get_cpu();
-	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_NUM_REG, txq->id);
-	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_DESC_ADDR_REG, 0);
-	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_DESC_SIZE_REG, 0);
-	put_cpu();
-}
-
-/* Cleanup Tx ports */
-static void mvpp2_txq_clean(struct mvpp2_port *port, struct mvpp2_tx_queue *txq)
-{
-	struct mvpp2_txq_pcpu *txq_pcpu;
-	int delay, pending, cpu;
-	u32 val;
-
-	cpu = get_cpu();
-	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_NUM_REG, txq->id);
-	val = mvpp2_percpu_read(port->priv, cpu, MVPP2_TXQ_PREF_BUF_REG);
-	val |= MVPP2_TXQ_DRAIN_EN_MASK;
-	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_PREF_BUF_REG, val);
-
-	/* The napi queue has been stopped so wait for all packets
-	 * to be transmitted.
-	 */
-	delay = 0;
-	do {
-		if (delay >= MVPP2_TX_PENDING_TIMEOUT_MSEC) {
-			netdev_warn(port->dev,
-				    "port %d: cleaning queue %d timed out\n",
-				    port->id, txq->log_id);
-			break;
-		}
-		mdelay(1);
-		delay++;
-
-		pending = mvpp2_percpu_read(port->priv, cpu,
-					    MVPP2_TXQ_PENDING_REG);
-		pending &= MVPP2_TXQ_PENDING_MASK;
-	} while (pending);
-
-	val &= ~MVPP2_TXQ_DRAIN_EN_MASK;
-	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_PREF_BUF_REG, val);
-	put_cpu();
-
-	for_each_present_cpu(cpu) {
-		txq_pcpu = per_cpu_ptr(txq->pcpu, cpu);
-
-		/* Release all packets */
-		mvpp2_txq_bufs_free(port, txq, txq_pcpu, txq_pcpu->count);
-
-		/* Reset queue */
-		txq_pcpu->count = 0;
-		txq_pcpu->txq_put_index = 0;
-		txq_pcpu->txq_get_index = 0;
-	}
-}
-
-/* Cleanup all Tx queues */
-static void mvpp2_cleanup_txqs(struct mvpp2_port *port)
-{
-	struct mvpp2_tx_queue *txq;
-	int queue;
-	u32 val;
-
-	val = mvpp2_read(port->priv, MVPP2_TX_PORT_FLUSH_REG);
-
-	/* Reset Tx ports and delete Tx queues */
-	val |= MVPP2_TX_PORT_FLUSH_MASK(port->id);
-	mvpp2_write(port->priv, MVPP2_TX_PORT_FLUSH_REG, val);
-
-	for (queue = 0; queue < port->ntxqs; queue++) {
-		txq = port->txqs[queue];
-		mvpp2_txq_clean(port, txq);
-		mvpp2_txq_deinit(port, txq);
-	}
-
-	on_each_cpu(mvpp2_txq_sent_counter_clear, port, 1);
-
-	val &= ~MVPP2_TX_PORT_FLUSH_MASK(port->id);
-	mvpp2_write(port->priv, MVPP2_TX_PORT_FLUSH_REG, val);
-}
-
-/* Cleanup all Rx queues */
-static void mvpp2_cleanup_rxqs(struct mvpp2_port *port)
-{
-	int queue;
-
-	for (queue = 0; queue < port->nrxqs; queue++)
-		mvpp2_rxq_deinit(port, port->rxqs[queue]);
-}
-
-/* Init all Rx queues for port */
-static int mvpp2_setup_rxqs(struct mvpp2_port *port)
-{
-	int queue, err;
-
-	for (queue = 0; queue < port->nrxqs; queue++) {
-		err = mvpp2_rxq_init(port, port->rxqs[queue]);
-		if (err)
-			goto err_cleanup;
-	}
-	return 0;
-
-err_cleanup:
-	mvpp2_cleanup_rxqs(port);
-	return err;
-}
-
-/* Init all tx queues for port */
-static int mvpp2_setup_txqs(struct mvpp2_port *port)
-{
-	struct mvpp2_tx_queue *txq;
-	int queue, err;
-
-	for (queue = 0; queue < port->ntxqs; queue++) {
-		txq = port->txqs[queue];
-		err = mvpp2_txq_init(port, txq);
-		if (err)
-			goto err_cleanup;
-	}
-
-	if (port->has_tx_irqs) {
-		mvpp2_tx_time_coal_set(port);
-		for (queue = 0; queue < port->ntxqs; queue++) {
-			txq = port->txqs[queue];
-			mvpp2_tx_pkts_coal_set(port, txq);
-		}
-	}
-
-	on_each_cpu(mvpp2_txq_sent_counter_clear, port, 1);
-	return 0;
-
-err_cleanup:
-	mvpp2_cleanup_txqs(port);
-	return err;
-}
-
-/* The callback for per-port interrupt */
-static irqreturn_t mvpp2_isr(int irq, void *dev_id)
-{
-	struct mvpp2_queue_vector *qv = dev_id;
-
-	mvpp2_qvec_interrupt_disable(qv);
-
-	napi_schedule(&qv->napi);
-
-	return IRQ_HANDLED;
-}
-
-/* Per-port interrupt for link status changes */
-static irqreturn_t mvpp2_link_status_isr(int irq, void *dev_id)
-{
-	struct mvpp2_port *port = (struct mvpp2_port *)dev_id;
-	struct net_device *dev = port->dev;
-	bool event = false, link = false;
-	u32 val;
-
-	mvpp22_gop_mask_irq(port);
-
-	if (port->gop_id == 0 &&
-	    port->phy_interface == PHY_INTERFACE_MODE_10GKR) {
-		val = readl(port->base + MVPP22_XLG_INT_STAT);
-		if (val & MVPP22_XLG_INT_STAT_LINK) {
-			event = true;
-			val = readl(port->base + MVPP22_XLG_STATUS);
-			if (val & MVPP22_XLG_STATUS_LINK_UP)
-				link = true;
-		}
-	} else if (phy_interface_mode_is_rgmii(port->phy_interface) ||
-		   port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
-		   port->phy_interface == PHY_INTERFACE_MODE_1000BASEX ||
-		   port->phy_interface == PHY_INTERFACE_MODE_2500BASEX) {
-		val = readl(port->base + MVPP22_GMAC_INT_STAT);
-		if (val & MVPP22_GMAC_INT_STAT_LINK) {
-			event = true;
-			val = readl(port->base + MVPP2_GMAC_STATUS0);
-			if (val & MVPP2_GMAC_STATUS0_LINK_UP)
-				link = true;
-		}
-	}
-
-	if (port->phylink) {
-		phylink_mac_change(port->phylink, link);
-		goto handled;
-	}
-
-	if (!netif_running(dev) || !event)
-		goto handled;
-
-	if (link) {
-		mvpp2_interrupts_enable(port);
-
-		mvpp2_egress_enable(port);
-		mvpp2_ingress_enable(port);
-		netif_carrier_on(dev);
-		netif_tx_wake_all_queues(dev);
-	} else {
-		netif_tx_stop_all_queues(dev);
-		netif_carrier_off(dev);
-		mvpp2_ingress_disable(port);
-		mvpp2_egress_disable(port);
-
-		mvpp2_interrupts_disable(port);
-	}
-
-handled:
-	mvpp22_gop_unmask_irq(port);
-	return IRQ_HANDLED;
-}
-
-static void mvpp2_timer_set(struct mvpp2_port_pcpu *port_pcpu)
-{
-	ktime_t interval;
-
-	if (!port_pcpu->timer_scheduled) {
-		port_pcpu->timer_scheduled = true;
-		interval = MVPP2_TXDONE_HRTIMER_PERIOD_NS;
-		hrtimer_start(&port_pcpu->tx_done_timer, interval,
-			      HRTIMER_MODE_REL_PINNED);
-	}
-}
-
-static void mvpp2_tx_proc_cb(unsigned long data)
-{
-	struct net_device *dev = (struct net_device *)data;
-	struct mvpp2_port *port = netdev_priv(dev);
-	struct mvpp2_port_pcpu *port_pcpu = this_cpu_ptr(port->pcpu);
-	unsigned int tx_todo, cause;
-
-	if (!netif_running(dev))
-		return;
-	port_pcpu->timer_scheduled = false;
-
-	/* Process all the Tx queues */
-	cause = (1 << port->ntxqs) - 1;
-	tx_todo = mvpp2_tx_done(port, cause, smp_processor_id());
-
-	/* Set the timer in case not all the packets were processed */
-	if (tx_todo)
-		mvpp2_timer_set(port_pcpu);
-}
-
-static enum hrtimer_restart mvpp2_hr_timer_cb(struct hrtimer *timer)
-{
-	struct mvpp2_port_pcpu *port_pcpu = container_of(timer,
-							 struct mvpp2_port_pcpu,
-							 tx_done_timer);
-
-	tasklet_schedule(&port_pcpu->tx_done_tasklet);
-
-	return HRTIMER_NORESTART;
-}
-
-/* Main RX/TX processing routines */
-
-/* Display more error info */
-static void mvpp2_rx_error(struct mvpp2_port *port,
-			   struct mvpp2_rx_desc *rx_desc)
-{
-	u32 status = mvpp2_rxdesc_status_get(port, rx_desc);
-	size_t sz = mvpp2_rxdesc_size_get(port, rx_desc);
-	char *err_str = NULL;
-
-	switch (status & MVPP2_RXD_ERR_CODE_MASK) {
-	case MVPP2_RXD_ERR_CRC:
-		err_str = "crc";
-		break;
-	case MVPP2_RXD_ERR_OVERRUN:
-		err_str = "overrun";
-		break;
-	case MVPP2_RXD_ERR_RESOURCE:
-		err_str = "resource";
-		break;
-	}
-	if (err_str && net_ratelimit())
-		netdev_err(port->dev,
-			   "bad rx status %08x (%s error), size=%zu\n",
-			   status, err_str, sz);
-}
-
-/* Handle RX checksum offload */
-static void mvpp2_rx_csum(struct mvpp2_port *port, u32 status,
-			  struct sk_buff *skb)
-{
-	if (((status & MVPP2_RXD_L3_IP4) &&
-	     !(status & MVPP2_RXD_IP4_HEADER_ERR)) ||
-	    (status & MVPP2_RXD_L3_IP6))
-		if (((status & MVPP2_RXD_L4_UDP) ||
-		     (status & MVPP2_RXD_L4_TCP)) &&
-		     (status & MVPP2_RXD_L4_CSUM_OK)) {
-			skb->csum = 0;
-			skb->ip_summed = CHECKSUM_UNNECESSARY;
-			return;
-		}
-
-	skb->ip_summed = CHECKSUM_NONE;
-}
-
-/* Reuse skb if possible, or allocate a new skb and add it to BM pool */
-static int mvpp2_rx_refill(struct mvpp2_port *port,
-			   struct mvpp2_bm_pool *bm_pool, int pool)
-{
-	dma_addr_t dma_addr;
-	phys_addr_t phys_addr;
-	void *buf;
-
-	/* No recycle or too many buffers are in use, so allocate a new skb */
-	buf = mvpp2_buf_alloc(port, bm_pool, &dma_addr, &phys_addr,
-			      GFP_ATOMIC);
-	if (!buf)
-		return -ENOMEM;
-
-	mvpp2_bm_pool_put(port, pool, dma_addr, phys_addr);
-
-	return 0;
-}
-
-/* Handle tx checksum */
-static u32 mvpp2_skb_tx_csum(struct mvpp2_port *port, struct sk_buff *skb)
-{
-	if (skb->ip_summed == CHECKSUM_PARTIAL) {
-		int ip_hdr_len = 0;
-		u8 l4_proto;
-
-		if (skb->protocol == htons(ETH_P_IP)) {
-			struct iphdr *ip4h = ip_hdr(skb);
-
-			/* Calculate IPv4 checksum and L4 checksum */
-			ip_hdr_len = ip4h->ihl;
-			l4_proto = ip4h->protocol;
-		} else if (skb->protocol == htons(ETH_P_IPV6)) {
-			struct ipv6hdr *ip6h = ipv6_hdr(skb);
-
-			/* Read l4_protocol from one of IPv6 extra headers */
-			if (skb_network_header_len(skb) > 0)
-				ip_hdr_len = (skb_network_header_len(skb) >> 2);
-			l4_proto = ip6h->nexthdr;
-		} else {
-			return MVPP2_TXD_L4_CSUM_NOT;
-		}
-
-		return mvpp2_txq_desc_csum(skb_network_offset(skb),
-				skb->protocol, ip_hdr_len, l4_proto);
-	}
-
-	return MVPP2_TXD_L4_CSUM_NOT | MVPP2_TXD_IP_CSUM_DISABLE;
-}
-
-/* Main rx processing */
-static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi,
-		    int rx_todo, struct mvpp2_rx_queue *rxq)
-{
-	struct net_device *dev = port->dev;
-	int rx_received;
-	int rx_done = 0;
-	u32 rcvd_pkts = 0;
-	u32 rcvd_bytes = 0;
-
-	/* Get number of received packets and clamp the to-do */
-	rx_received = mvpp2_rxq_received(port, rxq->id);
-	if (rx_todo > rx_received)
-		rx_todo = rx_received;
-
-	while (rx_done < rx_todo) {
-		struct mvpp2_rx_desc *rx_desc = mvpp2_rxq_next_desc_get(rxq);
-		struct mvpp2_bm_pool *bm_pool;
-		struct sk_buff *skb;
-		unsigned int frag_size;
-		dma_addr_t dma_addr;
-		phys_addr_t phys_addr;
-		u32 rx_status;
-		int pool, rx_bytes, err;
-		void *data;
-
-		rx_done++;
-		rx_status = mvpp2_rxdesc_status_get(port, rx_desc);
-		rx_bytes = mvpp2_rxdesc_size_get(port, rx_desc);
-		rx_bytes -= MVPP2_MH_SIZE;
-		dma_addr = mvpp2_rxdesc_dma_addr_get(port, rx_desc);
-		phys_addr = mvpp2_rxdesc_cookie_get(port, rx_desc);
-		data = (void *)phys_to_virt(phys_addr);
-
-		pool = (rx_status & MVPP2_RXD_BM_POOL_ID_MASK) >>
-			MVPP2_RXD_BM_POOL_ID_OFFS;
-		bm_pool = &port->priv->bm_pools[pool];
-
-		/* In case of an error, release the requested buffer pointer
-		 * to the Buffer Manager. This request process is controlled
-		 * by the hardware, and the information about the buffer is
-		 * comprised by the RX descriptor.
-		 */
-		if (rx_status & MVPP2_RXD_ERR_SUMMARY) {
-err_drop_frame:
-			dev->stats.rx_errors++;
-			mvpp2_rx_error(port, rx_desc);
-			/* Return the buffer to the pool */
-			mvpp2_bm_pool_put(port, pool, dma_addr, phys_addr);
-			continue;
-		}
-
-		if (bm_pool->frag_size > PAGE_SIZE)
-			frag_size = 0;
-		else
-			frag_size = bm_pool->frag_size;
-
-		skb = build_skb(data, frag_size);
-		if (!skb) {
-			netdev_warn(port->dev, "skb build failed\n");
-			goto err_drop_frame;
-		}
-
-		err = mvpp2_rx_refill(port, bm_pool, pool);
-		if (err) {
-			netdev_err(port->dev, "failed to refill BM pools\n");
-			goto err_drop_frame;
-		}
-
-		dma_unmap_single(dev->dev.parent, dma_addr,
-				 bm_pool->buf_size, DMA_FROM_DEVICE);
-
-		rcvd_pkts++;
-		rcvd_bytes += rx_bytes;
-
-		skb_reserve(skb, MVPP2_MH_SIZE + NET_SKB_PAD);
-		skb_put(skb, rx_bytes);
-		skb->protocol = eth_type_trans(skb, dev);
-		mvpp2_rx_csum(port, rx_status, skb);
-
-		napi_gro_receive(napi, skb);
-	}
-
-	if (rcvd_pkts) {
-		struct mvpp2_pcpu_stats *stats = this_cpu_ptr(port->stats);
-
-		u64_stats_update_begin(&stats->syncp);
-		stats->rx_packets += rcvd_pkts;
-		stats->rx_bytes   += rcvd_bytes;
-		u64_stats_update_end(&stats->syncp);
-	}
-
-	/* Update Rx queue management counters */
-	wmb();
-	mvpp2_rxq_status_update(port, rxq->id, rx_done, rx_done);
-
-	return rx_todo;
-}
-
-static inline void
-tx_desc_unmap_put(struct mvpp2_port *port, struct mvpp2_tx_queue *txq,
-		  struct mvpp2_tx_desc *desc)
-{
-	struct mvpp2_txq_pcpu *txq_pcpu = this_cpu_ptr(txq->pcpu);
-
-	dma_addr_t buf_dma_addr =
-		mvpp2_txdesc_dma_addr_get(port, desc);
-	size_t buf_sz =
-		mvpp2_txdesc_size_get(port, desc);
-	if (!IS_TSO_HEADER(txq_pcpu, buf_dma_addr))
-		dma_unmap_single(port->dev->dev.parent, buf_dma_addr,
-				 buf_sz, DMA_TO_DEVICE);
-	mvpp2_txq_desc_put(txq);
-}
-
-/* Handle tx fragmentation processing */
-static int mvpp2_tx_frag_process(struct mvpp2_port *port, struct sk_buff *skb,
-				 struct mvpp2_tx_queue *aggr_txq,
-				 struct mvpp2_tx_queue *txq)
-{
-	struct mvpp2_txq_pcpu *txq_pcpu = this_cpu_ptr(txq->pcpu);
-	struct mvpp2_tx_desc *tx_desc;
-	int i;
-	dma_addr_t buf_dma_addr;
-
-	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-		void *addr = page_address(frag->page.p) + frag->page_offset;
-
-		tx_desc = mvpp2_txq_next_desc_get(aggr_txq);
-		mvpp2_txdesc_txq_set(port, tx_desc, txq->id);
-		mvpp2_txdesc_size_set(port, tx_desc, frag->size);
-
-		buf_dma_addr = dma_map_single(port->dev->dev.parent, addr,
-					      frag->size, DMA_TO_DEVICE);
-		if (dma_mapping_error(port->dev->dev.parent, buf_dma_addr)) {
-			mvpp2_txq_desc_put(txq);
-			goto cleanup;
-		}
-
-		mvpp2_txdesc_dma_addr_set(port, tx_desc, buf_dma_addr);
-
-		if (i == (skb_shinfo(skb)->nr_frags - 1)) {
-			/* Last descriptor */
-			mvpp2_txdesc_cmd_set(port, tx_desc,
-					     MVPP2_TXD_L_DESC);
-			mvpp2_txq_inc_put(port, txq_pcpu, skb, tx_desc);
-		} else {
-			/* Descriptor in the middle: Not First, Not Last */
-			mvpp2_txdesc_cmd_set(port, tx_desc, 0);
-			mvpp2_txq_inc_put(port, txq_pcpu, NULL, tx_desc);
-		}
-	}
-
-	return 0;
-cleanup:
-	/* Release all descriptors that were used to map fragments of
-	 * this packet, as well as the corresponding DMA mappings
-	 */
-	for (i = i - 1; i >= 0; i--) {
-		tx_desc = txq->descs + i;
-		tx_desc_unmap_put(port, txq, tx_desc);
-	}
-
-	return -ENOMEM;
-}
-
-static inline void mvpp2_tso_put_hdr(struct sk_buff *skb,
-				     struct net_device *dev,
-				     struct mvpp2_tx_queue *txq,
-				     struct mvpp2_tx_queue *aggr_txq,
-				     struct mvpp2_txq_pcpu *txq_pcpu,
-				     int hdr_sz)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-	struct mvpp2_tx_desc *tx_desc = mvpp2_txq_next_desc_get(aggr_txq);
-	dma_addr_t addr;
-
-	mvpp2_txdesc_txq_set(port, tx_desc, txq->id);
-	mvpp2_txdesc_size_set(port, tx_desc, hdr_sz);
-
-	addr = txq_pcpu->tso_headers_dma +
-	       txq_pcpu->txq_put_index * TSO_HEADER_SIZE;
-	mvpp2_txdesc_dma_addr_set(port, tx_desc, addr);
-
-	mvpp2_txdesc_cmd_set(port, tx_desc, mvpp2_skb_tx_csum(port, skb) |
-					    MVPP2_TXD_F_DESC |
-					    MVPP2_TXD_PADDING_DISABLE);
-	mvpp2_txq_inc_put(port, txq_pcpu, NULL, tx_desc);
-}
-
-static inline int mvpp2_tso_put_data(struct sk_buff *skb,
-				     struct net_device *dev, struct tso_t *tso,
-				     struct mvpp2_tx_queue *txq,
-				     struct mvpp2_tx_queue *aggr_txq,
-				     struct mvpp2_txq_pcpu *txq_pcpu,
-				     int sz, bool left, bool last)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-	struct mvpp2_tx_desc *tx_desc = mvpp2_txq_next_desc_get(aggr_txq);
-	dma_addr_t buf_dma_addr;
-
-	mvpp2_txdesc_txq_set(port, tx_desc, txq->id);
-	mvpp2_txdesc_size_set(port, tx_desc, sz);
-
-	buf_dma_addr = dma_map_single(dev->dev.parent, tso->data, sz,
-				      DMA_TO_DEVICE);
-	if (unlikely(dma_mapping_error(dev->dev.parent, buf_dma_addr))) {
-		mvpp2_txq_desc_put(txq);
-		return -ENOMEM;
-	}
-
-	mvpp2_txdesc_dma_addr_set(port, tx_desc, buf_dma_addr);
-
-	if (!left) {
-		mvpp2_txdesc_cmd_set(port, tx_desc, MVPP2_TXD_L_DESC);
-		if (last) {
-			mvpp2_txq_inc_put(port, txq_pcpu, skb, tx_desc);
-			return 0;
-		}
-	} else {
-		mvpp2_txdesc_cmd_set(port, tx_desc, 0);
-	}
-
-	mvpp2_txq_inc_put(port, txq_pcpu, NULL, tx_desc);
-	return 0;
-}
-
-static int mvpp2_tx_tso(struct sk_buff *skb, struct net_device *dev,
-			struct mvpp2_tx_queue *txq,
-			struct mvpp2_tx_queue *aggr_txq,
-			struct mvpp2_txq_pcpu *txq_pcpu)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-	struct tso_t tso;
-	int hdr_sz = skb_transport_offset(skb) + tcp_hdrlen(skb);
-	int i, len, descs = 0;
-
-	/* Check number of available descriptors */
-	if (mvpp2_aggr_desc_num_check(port->priv, aggr_txq,
-				      tso_count_descs(skb)) ||
-	    mvpp2_txq_reserved_desc_num_proc(port->priv, txq, txq_pcpu,
-					     tso_count_descs(skb)))
-		return 0;
-
-	tso_start(skb, &tso);
-	len = skb->len - hdr_sz;
-	while (len > 0) {
-		int left = min_t(int, skb_shinfo(skb)->gso_size, len);
-		char *hdr = txq_pcpu->tso_headers +
-			    txq_pcpu->txq_put_index * TSO_HEADER_SIZE;
-
-		len -= left;
-		descs++;
-
-		tso_build_hdr(skb, hdr, &tso, left, len == 0);
-		mvpp2_tso_put_hdr(skb, dev, txq, aggr_txq, txq_pcpu, hdr_sz);
-
-		while (left > 0) {
-			int sz = min_t(int, tso.size, left);
-			left -= sz;
-			descs++;
-
-			if (mvpp2_tso_put_data(skb, dev, &tso, txq, aggr_txq,
-					       txq_pcpu, sz, left, len == 0))
-				goto release;
-			tso_build_data(skb, &tso, sz);
-		}
-	}
-
-	return descs;
-
-release:
-	for (i = descs - 1; i >= 0; i--) {
-		struct mvpp2_tx_desc *tx_desc = txq->descs + i;
-		tx_desc_unmap_put(port, txq, tx_desc);
-	}
-	return 0;
-}
-
-/* Main tx processing */
-static int mvpp2_tx(struct sk_buff *skb, struct net_device *dev)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-	struct mvpp2_tx_queue *txq, *aggr_txq;
-	struct mvpp2_txq_pcpu *txq_pcpu;
-	struct mvpp2_tx_desc *tx_desc;
-	dma_addr_t buf_dma_addr;
-	int frags = 0;
-	u16 txq_id;
-	u32 tx_cmd;
-
-	txq_id = skb_get_queue_mapping(skb);
-	txq = port->txqs[txq_id];
-	txq_pcpu = this_cpu_ptr(txq->pcpu);
-	aggr_txq = &port->priv->aggr_txqs[smp_processor_id()];
-
-	if (skb_is_gso(skb)) {
-		frags = mvpp2_tx_tso(skb, dev, txq, aggr_txq, txq_pcpu);
-		goto out;
-	}
-	frags = skb_shinfo(skb)->nr_frags + 1;
-
-	/* Check number of available descriptors */
-	if (mvpp2_aggr_desc_num_check(port->priv, aggr_txq, frags) ||
-	    mvpp2_txq_reserved_desc_num_proc(port->priv, txq,
-					     txq_pcpu, frags)) {
-		frags = 0;
-		goto out;
-	}
-
-	/* Get a descriptor for the first part of the packet */
-	tx_desc = mvpp2_txq_next_desc_get(aggr_txq);
-	mvpp2_txdesc_txq_set(port, tx_desc, txq->id);
-	mvpp2_txdesc_size_set(port, tx_desc, skb_headlen(skb));
-
-	buf_dma_addr = dma_map_single(dev->dev.parent, skb->data,
-				      skb_headlen(skb), DMA_TO_DEVICE);
-	if (unlikely(dma_mapping_error(dev->dev.parent, buf_dma_addr))) {
-		mvpp2_txq_desc_put(txq);
-		frags = 0;
-		goto out;
-	}
-
-	mvpp2_txdesc_dma_addr_set(port, tx_desc, buf_dma_addr);
-
-	tx_cmd = mvpp2_skb_tx_csum(port, skb);
-
-	if (frags == 1) {
-		/* First and Last descriptor */
-		tx_cmd |= MVPP2_TXD_F_DESC | MVPP2_TXD_L_DESC;
-		mvpp2_txdesc_cmd_set(port, tx_desc, tx_cmd);
-		mvpp2_txq_inc_put(port, txq_pcpu, skb, tx_desc);
-	} else {
-		/* First but not Last */
-		tx_cmd |= MVPP2_TXD_F_DESC | MVPP2_TXD_PADDING_DISABLE;
-		mvpp2_txdesc_cmd_set(port, tx_desc, tx_cmd);
-		mvpp2_txq_inc_put(port, txq_pcpu, NULL, tx_desc);
-
-		/* Continue with other skb fragments */
-		if (mvpp2_tx_frag_process(port, skb, aggr_txq, txq)) {
-			tx_desc_unmap_put(port, txq, tx_desc);
-			frags = 0;
-		}
-	}
-
-out:
-	if (frags > 0) {
-		struct mvpp2_pcpu_stats *stats = this_cpu_ptr(port->stats);
-		struct netdev_queue *nq = netdev_get_tx_queue(dev, txq_id);
-
-		txq_pcpu->reserved_num -= frags;
-		txq_pcpu->count += frags;
-		aggr_txq->count += frags;
-
-		/* Enable transmit */
-		wmb();
-		mvpp2_aggr_txq_pend_desc_add(port, frags);
-
-		if (txq_pcpu->count >= txq_pcpu->stop_threshold)
-			netif_tx_stop_queue(nq);
-
-		u64_stats_update_begin(&stats->syncp);
-		stats->tx_packets++;
-		stats->tx_bytes += skb->len;
-		u64_stats_update_end(&stats->syncp);
-	} else {
-		dev->stats.tx_dropped++;
-		dev_kfree_skb_any(skb);
-	}
-
-	/* Finalize TX processing */
-	if (!port->has_tx_irqs && txq_pcpu->count >= txq->done_pkts_coal)
-		mvpp2_txq_done(port, txq, txq_pcpu);
-
-	/* Set the timer in case not all frags were processed */
-	if (!port->has_tx_irqs && txq_pcpu->count <= frags &&
-	    txq_pcpu->count > 0) {
-		struct mvpp2_port_pcpu *port_pcpu = this_cpu_ptr(port->pcpu);
-
-		mvpp2_timer_set(port_pcpu);
-	}
-
-	return NETDEV_TX_OK;
-}
-
-static inline void mvpp2_cause_error(struct net_device *dev, int cause)
-{
-	if (cause & MVPP2_CAUSE_FCS_ERR_MASK)
-		netdev_err(dev, "FCS error\n");
-	if (cause & MVPP2_CAUSE_RX_FIFO_OVERRUN_MASK)
-		netdev_err(dev, "rx fifo overrun error\n");
-	if (cause & MVPP2_CAUSE_TX_FIFO_UNDERRUN_MASK)
-		netdev_err(dev, "tx fifo underrun error\n");
-}
-
-static int mvpp2_poll(struct napi_struct *napi, int budget)
-{
-	u32 cause_rx_tx, cause_rx, cause_tx, cause_misc;
-	int rx_done = 0;
-	struct mvpp2_port *port = netdev_priv(napi->dev);
-	struct mvpp2_queue_vector *qv;
-	int cpu = smp_processor_id();
-
-	qv = container_of(napi, struct mvpp2_queue_vector, napi);
-
-	/* Rx/Tx cause register
-	 *
-	 * Bits 0-15: each bit indicates received packets on the Rx queue
-	 * (bit 0 is for Rx queue 0).
-	 *
-	 * Bits 16-23: each bit indicates transmitted packets on the Tx queue
-	 * (bit 16 is for Tx queue 0).
-	 *
-	 * Each CPU has its own Rx/Tx cause register
-	 */
-	cause_rx_tx = mvpp2_percpu_read_relaxed(port->priv, qv->sw_thread_id,
-						MVPP2_ISR_RX_TX_CAUSE_REG(port->id));
-
-	cause_misc = cause_rx_tx & MVPP2_CAUSE_MISC_SUM_MASK;
-	if (cause_misc) {
-		mvpp2_cause_error(port->dev, cause_misc);
-
-		/* Clear the cause register */
-		mvpp2_write(port->priv, MVPP2_ISR_MISC_CAUSE_REG, 0);
-		mvpp2_percpu_write(port->priv, cpu,
-				   MVPP2_ISR_RX_TX_CAUSE_REG(port->id),
-				   cause_rx_tx & ~MVPP2_CAUSE_MISC_SUM_MASK);
-	}
-
-	cause_tx = cause_rx_tx & MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_MASK;
-	if (cause_tx) {
-		cause_tx >>= MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_OFFSET;
-		mvpp2_tx_done(port, cause_tx, qv->sw_thread_id);
-	}
-
-	/* Process RX packets */
-	cause_rx = cause_rx_tx & MVPP2_CAUSE_RXQ_OCCUP_DESC_ALL_MASK;
-	cause_rx <<= qv->first_rxq;
-	cause_rx |= qv->pending_cause_rx;
-	while (cause_rx && budget > 0) {
-		int count;
-		struct mvpp2_rx_queue *rxq;
-
-		rxq = mvpp2_get_rx_queue(port, cause_rx);
-		if (!rxq)
-			break;
-
-		count = mvpp2_rx(port, napi, budget, rxq);
-		rx_done += count;
-		budget -= count;
-		if (budget > 0) {
-			/* Clear the bit associated to this Rx queue
-			 * so that next iteration will continue from
-			 * the next Rx queue.
-			 */
-			cause_rx &= ~(1 << rxq->logic_rxq);
-		}
-	}
-
-	if (budget > 0) {
-		cause_rx = 0;
-		napi_complete_done(napi, rx_done);
-
-		mvpp2_qvec_interrupt_enable(qv);
-	}
-	qv->pending_cause_rx = cause_rx;
-	return rx_done;
-}
-
-static void mvpp22_mode_reconfigure(struct mvpp2_port *port)
-{
-	u32 ctrl3;
-
-	/* comphy reconfiguration */
-	mvpp22_comphy_init(port);
-
-	/* gop reconfiguration */
-	mvpp22_gop_init(port);
-
-	/* Only GOP port 0 has an XLG MAC */
-	if (port->gop_id == 0) {
-		ctrl3 = readl(port->base + MVPP22_XLG_CTRL3_REG);
-		ctrl3 &= ~MVPP22_XLG_CTRL3_MACMODESELECT_MASK;
-
-		if (port->phy_interface == PHY_INTERFACE_MODE_XAUI ||
-		    port->phy_interface == PHY_INTERFACE_MODE_10GKR)
-			ctrl3 |= MVPP22_XLG_CTRL3_MACMODESELECT_10G;
-		else
-			ctrl3 |= MVPP22_XLG_CTRL3_MACMODESELECT_GMAC;
-
-		writel(ctrl3, port->base + MVPP22_XLG_CTRL3_REG);
-	}
-
-	if (port->gop_id == 0 &&
-	    (port->phy_interface == PHY_INTERFACE_MODE_XAUI ||
-	     port->phy_interface == PHY_INTERFACE_MODE_10GKR))
-		mvpp2_xlg_max_rx_size_set(port);
-	else
-		mvpp2_gmac_max_rx_size_set(port);
-}
-
-/* Set hw internals when starting port */
-static void mvpp2_start_dev(struct mvpp2_port *port)
-{
-	int i;
-
-	mvpp2_txp_max_tx_size_set(port);
-
-	for (i = 0; i < port->nqvecs; i++)
-		napi_enable(&port->qvecs[i].napi);
-
-	/* Enable interrupts on all CPUs */
-	mvpp2_interrupts_enable(port);
-
-	if (port->priv->hw_version == MVPP22)
-		mvpp22_mode_reconfigure(port);
-
-	if (port->phylink) {
-		phylink_start(port->phylink);
-	} else {
-		/* Phylink isn't used as of now for ACPI, so the MAC has to be
-		 * configured manually when the interface is started. This will
-		 * be removed as soon as the phylink ACPI support lands in.
-		 */
-		struct phylink_link_state state = {
-			.interface = port->phy_interface,
-			.link = 1,
-		};
-		mvpp2_mac_config(port->dev, MLO_AN_INBAND, &state);
-	}
-
-	netif_tx_start_all_queues(port->dev);
-}
-
-/* Set hw internals when stopping port */
-static void mvpp2_stop_dev(struct mvpp2_port *port)
-{
-	int i;
-
-	/* Disable interrupts on all CPUs */
-	mvpp2_interrupts_disable(port);
-
-	for (i = 0; i < port->nqvecs; i++)
-		napi_disable(&port->qvecs[i].napi);
-
-	if (port->phylink)
-		phylink_stop(port->phylink);
-	phy_power_off(port->comphy);
-}
-
-static int mvpp2_check_ringparam_valid(struct net_device *dev,
-				       struct ethtool_ringparam *ring)
-{
-	u16 new_rx_pending = ring->rx_pending;
-	u16 new_tx_pending = ring->tx_pending;
-
-	if (ring->rx_pending == 0 || ring->tx_pending == 0)
-		return -EINVAL;
-
-	if (ring->rx_pending > MVPP2_MAX_RXD_MAX)
-		new_rx_pending = MVPP2_MAX_RXD_MAX;
-	else if (!IS_ALIGNED(ring->rx_pending, 16))
-		new_rx_pending = ALIGN(ring->rx_pending, 16);
-
-	if (ring->tx_pending > MVPP2_MAX_TXD_MAX)
-		new_tx_pending = MVPP2_MAX_TXD_MAX;
-	else if (!IS_ALIGNED(ring->tx_pending, 32))
-		new_tx_pending = ALIGN(ring->tx_pending, 32);
-
-	/* The Tx ring size cannot be smaller than the minimum number of
-	 * descriptors needed for TSO.
-	 */
-	if (new_tx_pending < MVPP2_MAX_SKB_DESCS)
-		new_tx_pending = ALIGN(MVPP2_MAX_SKB_DESCS, 32);
-
-	if (ring->rx_pending != new_rx_pending) {
-		netdev_info(dev, "illegal Rx ring size value %d, round to %d\n",
-			    ring->rx_pending, new_rx_pending);
-		ring->rx_pending = new_rx_pending;
-	}
-
-	if (ring->tx_pending != new_tx_pending) {
-		netdev_info(dev, "illegal Tx ring size value %d, round to %d\n",
-			    ring->tx_pending, new_tx_pending);
-		ring->tx_pending = new_tx_pending;
-	}
-
-	return 0;
-}
-
-static void mvpp21_get_mac_address(struct mvpp2_port *port, unsigned char *addr)
-{
-	u32 mac_addr_l, mac_addr_m, mac_addr_h;
-
-	mac_addr_l = readl(port->base + MVPP2_GMAC_CTRL_1_REG);
-	mac_addr_m = readl(port->priv->lms_base + MVPP2_SRC_ADDR_MIDDLE);
-	mac_addr_h = readl(port->priv->lms_base + MVPP2_SRC_ADDR_HIGH);
-	addr[0] = (mac_addr_h >> 24) & 0xFF;
-	addr[1] = (mac_addr_h >> 16) & 0xFF;
-	addr[2] = (mac_addr_h >> 8) & 0xFF;
-	addr[3] = mac_addr_h & 0xFF;
-	addr[4] = mac_addr_m & 0xFF;
-	addr[5] = (mac_addr_l >> MVPP2_GMAC_SA_LOW_OFFS) & 0xFF;
-}
-
-static int mvpp2_irqs_init(struct mvpp2_port *port)
-{
-	int err, i;
-
-	for (i = 0; i < port->nqvecs; i++) {
-		struct mvpp2_queue_vector *qv = port->qvecs + i;
-
-		if (qv->type == MVPP2_QUEUE_VECTOR_PRIVATE)
-			irq_set_status_flags(qv->irq, IRQ_NO_BALANCING);
-
-		err = request_irq(qv->irq, mvpp2_isr, 0, port->dev->name, qv);
-		if (err)
-			goto err;
-
-		if (qv->type == MVPP2_QUEUE_VECTOR_PRIVATE)
-			irq_set_affinity_hint(qv->irq,
-					      cpumask_of(qv->sw_thread_id));
-	}
-
-	return 0;
-err:
-	for (i = 0; i < port->nqvecs; i++) {
-		struct mvpp2_queue_vector *qv = port->qvecs + i;
-
-		irq_set_affinity_hint(qv->irq, NULL);
-		free_irq(qv->irq, qv);
-	}
-
-	return err;
-}
-
-static void mvpp2_irqs_deinit(struct mvpp2_port *port)
-{
-	int i;
-
-	for (i = 0; i < port->nqvecs; i++) {
-		struct mvpp2_queue_vector *qv = port->qvecs + i;
-
-		irq_set_affinity_hint(qv->irq, NULL);
-		irq_clear_status_flags(qv->irq, IRQ_NO_BALANCING);
-		free_irq(qv->irq, qv);
-	}
-}
-
-static void mvpp22_init_rss(struct mvpp2_port *port)
-{
-	struct mvpp2 *priv = port->priv;
-	int i;
-
-	/* Set the table width: replace the whole classifier Rx queue number
-	 * with the ones configured in RSS table entries.
-	 */
-	mvpp2_write(priv, MVPP22_RSS_INDEX, MVPP22_RSS_INDEX_TABLE(0));
-	mvpp2_write(priv, MVPP22_RSS_WIDTH, 8);
-
-	/* Loop through the classifier Rx Queues and map them to a RSS table.
-	 * Map them all to the first table (0) by default.
-	 */
-	for (i = 0; i < MVPP2_CLS_RX_QUEUES; i++) {
-		mvpp2_write(priv, MVPP22_RSS_INDEX, MVPP22_RSS_INDEX_QUEUE(i));
-		mvpp2_write(priv, MVPP22_RSS_TABLE,
-			    MVPP22_RSS_TABLE_POINTER(0));
-	}
-
-	/* Configure the first table to evenly distribute the packets across
-	 * real Rx Queues. The table entries map a hash to an port Rx Queue.
-	 */
-	for (i = 0; i < MVPP22_RSS_TABLE_ENTRIES; i++) {
-		u32 sel = MVPP22_RSS_INDEX_TABLE(0) |
-			  MVPP22_RSS_INDEX_TABLE_ENTRY(i);
-		mvpp2_write(priv, MVPP22_RSS_INDEX, sel);
-
-		mvpp2_write(priv, MVPP22_RSS_TABLE_ENTRY, i % port->nrxqs);
-	}
-
-}
-
-static int mvpp2_open(struct net_device *dev)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-	struct mvpp2 *priv = port->priv;
-	unsigned char mac_bcast[ETH_ALEN] = {
-			0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
-	bool valid = false;
-	int err;
-
-	err = mvpp2_prs_mac_da_accept(port, mac_bcast, true);
-	if (err) {
-		netdev_err(dev, "mvpp2_prs_mac_da_accept BC failed\n");
-		return err;
-	}
-	err = mvpp2_prs_mac_da_accept(port, dev->dev_addr, true);
-	if (err) {
-		netdev_err(dev, "mvpp2_prs_mac_da_accept own addr failed\n");
-		return err;
-	}
-	err = mvpp2_prs_tag_mode_set(port->priv, port->id, MVPP2_TAG_TYPE_MH);
-	if (err) {
-		netdev_err(dev, "mvpp2_prs_tag_mode_set failed\n");
-		return err;
-	}
-	err = mvpp2_prs_def_flow(port);
-	if (err) {
-		netdev_err(dev, "mvpp2_prs_def_flow failed\n");
-		return err;
-	}
-
-	/* Allocate the Rx/Tx queues */
-	err = mvpp2_setup_rxqs(port);
-	if (err) {
-		netdev_err(port->dev, "cannot allocate Rx queues\n");
-		return err;
-	}
-
-	err = mvpp2_setup_txqs(port);
-	if (err) {
-		netdev_err(port->dev, "cannot allocate Tx queues\n");
-		goto err_cleanup_rxqs;
-	}
-
-	err = mvpp2_irqs_init(port);
-	if (err) {
-		netdev_err(port->dev, "cannot init IRQs\n");
-		goto err_cleanup_txqs;
-	}
-
-	/* Phylink isn't supported yet in ACPI mode */
-	if (port->of_node) {
-		err = phylink_of_phy_connect(port->phylink, port->of_node, 0);
-		if (err) {
-			netdev_err(port->dev, "could not attach PHY (%d)\n",
-				   err);
-			goto err_free_irq;
-		}
-
-		valid = true;
-	}
-
-	if (priv->hw_version == MVPP22 && port->link_irq && !port->phylink) {
-		err = request_irq(port->link_irq, mvpp2_link_status_isr, 0,
-				  dev->name, port);
-		if (err) {
-			netdev_err(port->dev, "cannot request link IRQ %d\n",
-				   port->link_irq);
-			goto err_free_irq;
-		}
-
-		mvpp22_gop_setup_irq(port);
-
-		/* In default link is down */
-		netif_carrier_off(port->dev);
-
-		valid = true;
-	} else {
-		port->link_irq = 0;
-	}
-
-	if (!valid) {
-		netdev_err(port->dev,
-			   "invalid configuration: no dt or link IRQ");
-		goto err_free_irq;
-	}
-
-	/* Unmask interrupts on all CPUs */
-	on_each_cpu(mvpp2_interrupts_unmask, port, 1);
-	mvpp2_shared_interrupt_mask_unmask(port, false);
-
-	mvpp2_start_dev(port);
-
-	if (priv->hw_version == MVPP22)
-		mvpp22_init_rss(port);
-
-	/* Start hardware statistics gathering */
-	queue_delayed_work(priv->stats_queue, &port->stats_work,
-			   MVPP2_MIB_COUNTERS_STATS_DELAY);
-
-	return 0;
-
-err_free_irq:
-	mvpp2_irqs_deinit(port);
-err_cleanup_txqs:
-	mvpp2_cleanup_txqs(port);
-err_cleanup_rxqs:
-	mvpp2_cleanup_rxqs(port);
-	return err;
-}
-
-static int mvpp2_stop(struct net_device *dev)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-	struct mvpp2_port_pcpu *port_pcpu;
-	int cpu;
-
-	mvpp2_stop_dev(port);
-
-	/* Mask interrupts on all CPUs */
-	on_each_cpu(mvpp2_interrupts_mask, port, 1);
-	mvpp2_shared_interrupt_mask_unmask(port, true);
-
-	if (port->phylink)
-		phylink_disconnect_phy(port->phylink);
-	if (port->link_irq)
-		free_irq(port->link_irq, port);
-
-	mvpp2_irqs_deinit(port);
-	if (!port->has_tx_irqs) {
-		for_each_present_cpu(cpu) {
-			port_pcpu = per_cpu_ptr(port->pcpu, cpu);
-
-			hrtimer_cancel(&port_pcpu->tx_done_timer);
-			port_pcpu->timer_scheduled = false;
-			tasklet_kill(&port_pcpu->tx_done_tasklet);
-		}
-	}
-	mvpp2_cleanup_rxqs(port);
-	mvpp2_cleanup_txqs(port);
-
-	cancel_delayed_work_sync(&port->stats_work);
-
-	return 0;
-}
-
-static int mvpp2_prs_mac_da_accept_list(struct mvpp2_port *port,
-					struct netdev_hw_addr_list *list)
-{
-	struct netdev_hw_addr *ha;
-	int ret;
-
-	netdev_hw_addr_list_for_each(ha, list) {
-		ret = mvpp2_prs_mac_da_accept(port, ha->addr, true);
-		if (ret)
-			return ret;
-	}
-
-	return 0;
-}
-
-static void mvpp2_set_rx_promisc(struct mvpp2_port *port, bool enable)
-{
-	if (!enable && (port->dev->features & NETIF_F_HW_VLAN_CTAG_FILTER))
-		mvpp2_prs_vid_enable_filtering(port);
-	else
-		mvpp2_prs_vid_disable_filtering(port);
-
-	mvpp2_prs_mac_promisc_set(port->priv, port->id,
-				  MVPP2_PRS_L2_UNI_CAST, enable);
-
-	mvpp2_prs_mac_promisc_set(port->priv, port->id,
-				  MVPP2_PRS_L2_MULTI_CAST, enable);
-}
-
-static void mvpp2_set_rx_mode(struct net_device *dev)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-
-	/* Clear the whole UC and MC list */
-	mvpp2_prs_mac_del_all(port);
-
-	if (dev->flags & IFF_PROMISC) {
-		mvpp2_set_rx_promisc(port, true);
-		return;
-	}
-
-	mvpp2_set_rx_promisc(port, false);
-
-	if (netdev_uc_count(dev) > MVPP2_PRS_MAC_UC_FILT_MAX ||
-	    mvpp2_prs_mac_da_accept_list(port, &dev->uc))
-		mvpp2_prs_mac_promisc_set(port->priv, port->id,
-					  MVPP2_PRS_L2_UNI_CAST, true);
-
-	if (dev->flags & IFF_ALLMULTI) {
-		mvpp2_prs_mac_promisc_set(port->priv, port->id,
-					  MVPP2_PRS_L2_MULTI_CAST, true);
-		return;
-	}
-
-	if (netdev_mc_count(dev) > MVPP2_PRS_MAC_MC_FILT_MAX ||
-	    mvpp2_prs_mac_da_accept_list(port, &dev->mc))
-		mvpp2_prs_mac_promisc_set(port->priv, port->id,
-					  MVPP2_PRS_L2_MULTI_CAST, true);
-}
-
-static int mvpp2_set_mac_address(struct net_device *dev, void *p)
-{
-	const struct sockaddr *addr = p;
-	int err;
-
-	if (!is_valid_ether_addr(addr->sa_data))
-		return -EADDRNOTAVAIL;
-
-	err = mvpp2_prs_update_mac_da(dev, addr->sa_data);
-	if (err) {
-		/* Reconfigure parser accept the original MAC address */
-		mvpp2_prs_update_mac_da(dev, dev->dev_addr);
-		netdev_err(dev, "failed to change MAC address\n");
-	}
-	return err;
-}
-
-static int mvpp2_change_mtu(struct net_device *dev, int mtu)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-	int err;
-
-	if (!IS_ALIGNED(MVPP2_RX_PKT_SIZE(mtu), 8)) {
-		netdev_info(dev, "illegal MTU value %d, round to %d\n", mtu,
-			    ALIGN(MVPP2_RX_PKT_SIZE(mtu), 8));
-		mtu = ALIGN(MVPP2_RX_PKT_SIZE(mtu), 8);
-	}
-
-	if (!netif_running(dev)) {
-		err = mvpp2_bm_update_mtu(dev, mtu);
-		if (!err) {
-			port->pkt_size =  MVPP2_RX_PKT_SIZE(mtu);
-			return 0;
-		}
-
-		/* Reconfigure BM to the original MTU */
-		err = mvpp2_bm_update_mtu(dev, dev->mtu);
-		if (err)
-			goto log_error;
-	}
-
-	mvpp2_stop_dev(port);
-
-	err = mvpp2_bm_update_mtu(dev, mtu);
-	if (!err) {
-		port->pkt_size =  MVPP2_RX_PKT_SIZE(mtu);
-		goto out_start;
-	}
-
-	/* Reconfigure BM to the original MTU */
-	err = mvpp2_bm_update_mtu(dev, dev->mtu);
-	if (err)
-		goto log_error;
-
-out_start:
-	mvpp2_start_dev(port);
-	mvpp2_egress_enable(port);
-	mvpp2_ingress_enable(port);
-
-	return 0;
-log_error:
-	netdev_err(dev, "failed to change MTU\n");
-	return err;
-}
-
-static void
-mvpp2_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-	unsigned int start;
-	int cpu;
-
-	for_each_possible_cpu(cpu) {
-		struct mvpp2_pcpu_stats *cpu_stats;
-		u64 rx_packets;
-		u64 rx_bytes;
-		u64 tx_packets;
-		u64 tx_bytes;
-
-		cpu_stats = per_cpu_ptr(port->stats, cpu);
-		do {
-			start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
-			rx_packets = cpu_stats->rx_packets;
-			rx_bytes   = cpu_stats->rx_bytes;
-			tx_packets = cpu_stats->tx_packets;
-			tx_bytes   = cpu_stats->tx_bytes;
-		} while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));
-
-		stats->rx_packets += rx_packets;
-		stats->rx_bytes   += rx_bytes;
-		stats->tx_packets += tx_packets;
-		stats->tx_bytes   += tx_bytes;
-	}
-
-	stats->rx_errors	= dev->stats.rx_errors;
-	stats->rx_dropped	= dev->stats.rx_dropped;
-	stats->tx_dropped	= dev->stats.tx_dropped;
-}
-
-static int mvpp2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-
-	if (!port->phylink)
-		return -ENOTSUPP;
-
-	return phylink_mii_ioctl(port->phylink, ifr, cmd);
-}
-
-static int mvpp2_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-	int ret;
-
-	ret = mvpp2_prs_vid_entry_add(port, vid);
-	if (ret)
-		netdev_err(dev, "rx-vlan-filter offloading cannot accept more than %d VIDs per port\n",
-			   MVPP2_PRS_VLAN_FILT_MAX - 1);
-	return ret;
-}
-
-static int mvpp2_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-
-	mvpp2_prs_vid_entry_remove(port, vid);
-	return 0;
-}
-
-static int mvpp2_set_features(struct net_device *dev,
-			      netdev_features_t features)
-{
-	netdev_features_t changed = dev->features ^ features;
-	struct mvpp2_port *port = netdev_priv(dev);
-
-	if (changed & NETIF_F_HW_VLAN_CTAG_FILTER) {
-		if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
-			mvpp2_prs_vid_enable_filtering(port);
-		} else {
-			/* Invalidate all registered VID filters for this
-			 * port
-			 */
-			mvpp2_prs_vid_remove_all(port);
-
-			mvpp2_prs_vid_disable_filtering(port);
-		}
-	}
-
-	return 0;
-}
-
-/* Ethtool methods */
-
-static int mvpp2_ethtool_nway_reset(struct net_device *dev)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-
-	if (!port->phylink)
-		return -ENOTSUPP;
-
-	return phylink_ethtool_nway_reset(port->phylink);
-}
-
-/* Set interrupt coalescing for ethtools */
-static int mvpp2_ethtool_set_coalesce(struct net_device *dev,
-				      struct ethtool_coalesce *c)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-	int queue;
-
-	for (queue = 0; queue < port->nrxqs; queue++) {
-		struct mvpp2_rx_queue *rxq = port->rxqs[queue];
-
-		rxq->time_coal = c->rx_coalesce_usecs;
-		rxq->pkts_coal = c->rx_max_coalesced_frames;
-		mvpp2_rx_pkts_coal_set(port, rxq);
-		mvpp2_rx_time_coal_set(port, rxq);
-	}
-
-	if (port->has_tx_irqs) {
-		port->tx_time_coal = c->tx_coalesce_usecs;
-		mvpp2_tx_time_coal_set(port);
-	}
-
-	for (queue = 0; queue < port->ntxqs; queue++) {
-		struct mvpp2_tx_queue *txq = port->txqs[queue];
-
-		txq->done_pkts_coal = c->tx_max_coalesced_frames;
-
-		if (port->has_tx_irqs)
-			mvpp2_tx_pkts_coal_set(port, txq);
-	}
-
-	return 0;
-}
-
-/* get coalescing for ethtools */
-static int mvpp2_ethtool_get_coalesce(struct net_device *dev,
-				      struct ethtool_coalesce *c)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-
-	c->rx_coalesce_usecs       = port->rxqs[0]->time_coal;
-	c->rx_max_coalesced_frames = port->rxqs[0]->pkts_coal;
-	c->tx_max_coalesced_frames = port->txqs[0]->done_pkts_coal;
-	c->tx_coalesce_usecs       = port->tx_time_coal;
-	return 0;
-}
-
-static void mvpp2_ethtool_get_drvinfo(struct net_device *dev,
-				      struct ethtool_drvinfo *drvinfo)
-{
-	strlcpy(drvinfo->driver, MVPP2_DRIVER_NAME,
-		sizeof(drvinfo->driver));
-	strlcpy(drvinfo->version, MVPP2_DRIVER_VERSION,
-		sizeof(drvinfo->version));
-	strlcpy(drvinfo->bus_info, dev_name(&dev->dev),
-		sizeof(drvinfo->bus_info));
-}
-
-static void mvpp2_ethtool_get_ringparam(struct net_device *dev,
-					struct ethtool_ringparam *ring)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-
-	ring->rx_max_pending = MVPP2_MAX_RXD_MAX;
-	ring->tx_max_pending = MVPP2_MAX_TXD_MAX;
-	ring->rx_pending = port->rx_ring_size;
-	ring->tx_pending = port->tx_ring_size;
-}
-
-static int mvpp2_ethtool_set_ringparam(struct net_device *dev,
-				       struct ethtool_ringparam *ring)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-	u16 prev_rx_ring_size = port->rx_ring_size;
-	u16 prev_tx_ring_size = port->tx_ring_size;
-	int err;
-
-	err = mvpp2_check_ringparam_valid(dev, ring);
-	if (err)
-		return err;
-
-	if (!netif_running(dev)) {
-		port->rx_ring_size = ring->rx_pending;
-		port->tx_ring_size = ring->tx_pending;
-		return 0;
-	}
-
-	/* The interface is running, so we have to force a
-	 * reallocation of the queues
-	 */
-	mvpp2_stop_dev(port);
-	mvpp2_cleanup_rxqs(port);
-	mvpp2_cleanup_txqs(port);
-
-	port->rx_ring_size = ring->rx_pending;
-	port->tx_ring_size = ring->tx_pending;
-
-	err = mvpp2_setup_rxqs(port);
-	if (err) {
-		/* Reallocate Rx queues with the original ring size */
-		port->rx_ring_size = prev_rx_ring_size;
-		ring->rx_pending = prev_rx_ring_size;
-		err = mvpp2_setup_rxqs(port);
-		if (err)
-			goto err_out;
-	}
-	err = mvpp2_setup_txqs(port);
-	if (err) {
-		/* Reallocate Tx queues with the original ring size */
-		port->tx_ring_size = prev_tx_ring_size;
-		ring->tx_pending = prev_tx_ring_size;
-		err = mvpp2_setup_txqs(port);
-		if (err)
-			goto err_clean_rxqs;
-	}
-
-	mvpp2_start_dev(port);
-	mvpp2_egress_enable(port);
-	mvpp2_ingress_enable(port);
-
-	return 0;
-
-err_clean_rxqs:
-	mvpp2_cleanup_rxqs(port);
-err_out:
-	netdev_err(dev, "failed to change ring parameters");
-	return err;
-}
-
-static void mvpp2_ethtool_get_pause_param(struct net_device *dev,
-					  struct ethtool_pauseparam *pause)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-
-	if (!port->phylink)
-		return;
-
-	phylink_ethtool_get_pauseparam(port->phylink, pause);
-}
-
-static int mvpp2_ethtool_set_pause_param(struct net_device *dev,
-					 struct ethtool_pauseparam *pause)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-
-	if (!port->phylink)
-		return -ENOTSUPP;
-
-	return phylink_ethtool_set_pauseparam(port->phylink, pause);
-}
-
-static int mvpp2_ethtool_get_link_ksettings(struct net_device *dev,
-					    struct ethtool_link_ksettings *cmd)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-
-	if (!port->phylink)
-		return -ENOTSUPP;
-
-	return phylink_ethtool_ksettings_get(port->phylink, cmd);
-}
-
-static int mvpp2_ethtool_set_link_ksettings(struct net_device *dev,
-					    const struct ethtool_link_ksettings *cmd)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-
-	if (!port->phylink)
-		return -ENOTSUPP;
-
-	return phylink_ethtool_ksettings_set(port->phylink, cmd);
-}
-
-/* Device ops */
-
-static const struct net_device_ops mvpp2_netdev_ops = {
-	.ndo_open		= mvpp2_open,
-	.ndo_stop		= mvpp2_stop,
-	.ndo_start_xmit		= mvpp2_tx,
-	.ndo_set_rx_mode	= mvpp2_set_rx_mode,
-	.ndo_set_mac_address	= mvpp2_set_mac_address,
-	.ndo_change_mtu		= mvpp2_change_mtu,
-	.ndo_get_stats64	= mvpp2_get_stats64,
-	.ndo_do_ioctl		= mvpp2_ioctl,
-	.ndo_vlan_rx_add_vid	= mvpp2_vlan_rx_add_vid,
-	.ndo_vlan_rx_kill_vid	= mvpp2_vlan_rx_kill_vid,
-	.ndo_set_features	= mvpp2_set_features,
-};
-
-static const struct ethtool_ops mvpp2_eth_tool_ops = {
-	.nway_reset		= mvpp2_ethtool_nway_reset,
-	.get_link		= ethtool_op_get_link,
-	.set_coalesce		= mvpp2_ethtool_set_coalesce,
-	.get_coalesce		= mvpp2_ethtool_get_coalesce,
-	.get_drvinfo		= mvpp2_ethtool_get_drvinfo,
-	.get_ringparam		= mvpp2_ethtool_get_ringparam,
-	.set_ringparam		= mvpp2_ethtool_set_ringparam,
-	.get_strings		= mvpp2_ethtool_get_strings,
-	.get_ethtool_stats	= mvpp2_ethtool_get_stats,
-	.get_sset_count		= mvpp2_ethtool_get_sset_count,
-	.get_pauseparam		= mvpp2_ethtool_get_pause_param,
-	.set_pauseparam		= mvpp2_ethtool_set_pause_param,
-	.get_link_ksettings	= mvpp2_ethtool_get_link_ksettings,
-	.set_link_ksettings	= mvpp2_ethtool_set_link_ksettings,
-};
-
-/* Used for PPv2.1, or PPv2.2 with the old Device Tree binding that
- * had a single IRQ defined per-port.
- */
-static int mvpp2_simple_queue_vectors_init(struct mvpp2_port *port,
-					   struct device_node *port_node)
-{
-	struct mvpp2_queue_vector *v = &port->qvecs[0];
-
-	v->first_rxq = 0;
-	v->nrxqs = port->nrxqs;
-	v->type = MVPP2_QUEUE_VECTOR_SHARED;
-	v->sw_thread_id = 0;
-	v->sw_thread_mask = *cpumask_bits(cpu_online_mask);
-	v->port = port;
-	v->irq = irq_of_parse_and_map(port_node, 0);
-	if (v->irq <= 0)
-		return -EINVAL;
-	netif_napi_add(port->dev, &v->napi, mvpp2_poll,
-		       NAPI_POLL_WEIGHT);
-
-	port->nqvecs = 1;
-
-	return 0;
-}
-
-static int mvpp2_multi_queue_vectors_init(struct mvpp2_port *port,
-					  struct device_node *port_node)
-{
-	struct mvpp2_queue_vector *v;
-	int i, ret;
-
-	port->nqvecs = num_possible_cpus();
-	if (queue_mode == MVPP2_QDIST_SINGLE_MODE)
-		port->nqvecs += 1;
-
-	for (i = 0; i < port->nqvecs; i++) {
-		char irqname[16];
-
-		v = port->qvecs + i;
-
-		v->port = port;
-		v->type = MVPP2_QUEUE_VECTOR_PRIVATE;
-		v->sw_thread_id = i;
-		v->sw_thread_mask = BIT(i);
-
-		snprintf(irqname, sizeof(irqname), "tx-cpu%d", i);
-
-		if (queue_mode == MVPP2_QDIST_MULTI_MODE) {
-			v->first_rxq = i * MVPP2_DEFAULT_RXQ;
-			v->nrxqs = MVPP2_DEFAULT_RXQ;
-		} else if (queue_mode == MVPP2_QDIST_SINGLE_MODE &&
-			   i == (port->nqvecs - 1)) {
-			v->first_rxq = 0;
-			v->nrxqs = port->nrxqs;
-			v->type = MVPP2_QUEUE_VECTOR_SHARED;
-			strncpy(irqname, "rx-shared", sizeof(irqname));
-		}
-
-		if (port_node)
-			v->irq = of_irq_get_byname(port_node, irqname);
-		else
-			v->irq = fwnode_irq_get(port->fwnode, i);
-		if (v->irq <= 0) {
-			ret = -EINVAL;
-			goto err;
-		}
-
-		netif_napi_add(port->dev, &v->napi, mvpp2_poll,
-			       NAPI_POLL_WEIGHT);
-	}
-
-	return 0;
-
-err:
-	for (i = 0; i < port->nqvecs; i++)
-		irq_dispose_mapping(port->qvecs[i].irq);
-	return ret;
-}
-
-static int mvpp2_queue_vectors_init(struct mvpp2_port *port,
-				    struct device_node *port_node)
-{
-	if (port->has_tx_irqs)
-		return mvpp2_multi_queue_vectors_init(port, port_node);
-	else
-		return mvpp2_simple_queue_vectors_init(port, port_node);
-}
-
-static void mvpp2_queue_vectors_deinit(struct mvpp2_port *port)
-{
-	int i;
-
-	for (i = 0; i < port->nqvecs; i++)
-		irq_dispose_mapping(port->qvecs[i].irq);
-}
-
-/* Configure Rx queue group interrupt for this port */
-static void mvpp2_rx_irqs_setup(struct mvpp2_port *port)
-{
-	struct mvpp2 *priv = port->priv;
-	u32 val;
-	int i;
-
-	if (priv->hw_version == MVPP21) {
-		mvpp2_write(priv, MVPP21_ISR_RXQ_GROUP_REG(port->id),
-			    port->nrxqs);
-		return;
-	}
-
-	/* Handle the more complicated PPv2.2 case */
-	for (i = 0; i < port->nqvecs; i++) {
-		struct mvpp2_queue_vector *qv = port->qvecs + i;
-
-		if (!qv->nrxqs)
-			continue;
-
-		val = qv->sw_thread_id;
-		val |= port->id << MVPP22_ISR_RXQ_GROUP_INDEX_GROUP_OFFSET;
-		mvpp2_write(priv, MVPP22_ISR_RXQ_GROUP_INDEX_REG, val);
-
-		val = qv->first_rxq;
-		val |= qv->nrxqs << MVPP22_ISR_RXQ_SUB_GROUP_SIZE_OFFSET;
-		mvpp2_write(priv, MVPP22_ISR_RXQ_SUB_GROUP_CONFIG_REG, val);
-	}
-}
-
-/* Initialize port HW */
-static int mvpp2_port_init(struct mvpp2_port *port)
-{
-	struct device *dev = port->dev->dev.parent;
-	struct mvpp2 *priv = port->priv;
-	struct mvpp2_txq_pcpu *txq_pcpu;
-	int queue, cpu, err;
-
-	/* Checks for hardware constraints */
-	if (port->first_rxq + port->nrxqs >
-	    MVPP2_MAX_PORTS * priv->max_port_rxqs)
-		return -EINVAL;
-
-	if (port->nrxqs % 4 || (port->nrxqs > priv->max_port_rxqs) ||
-	    (port->ntxqs > MVPP2_MAX_TXQ))
-		return -EINVAL;
-
-	/* Disable port */
-	mvpp2_egress_disable(port);
-	mvpp2_port_disable(port);
-
-	port->tx_time_coal = MVPP2_TXDONE_COAL_USEC;
-
-	port->txqs = devm_kcalloc(dev, port->ntxqs, sizeof(*port->txqs),
-				  GFP_KERNEL);
-	if (!port->txqs)
-		return -ENOMEM;
-
-	/* Associate physical Tx queues to this port and initialize.
-	 * The mapping is predefined.
-	 */
-	for (queue = 0; queue < port->ntxqs; queue++) {
-		int queue_phy_id = mvpp2_txq_phys(port->id, queue);
-		struct mvpp2_tx_queue *txq;
-
-		txq = devm_kzalloc(dev, sizeof(*txq), GFP_KERNEL);
-		if (!txq) {
-			err = -ENOMEM;
-			goto err_free_percpu;
-		}
-
-		txq->pcpu = alloc_percpu(struct mvpp2_txq_pcpu);
-		if (!txq->pcpu) {
-			err = -ENOMEM;
-			goto err_free_percpu;
-		}
-
-		txq->id = queue_phy_id;
-		txq->log_id = queue;
-		txq->done_pkts_coal = MVPP2_TXDONE_COAL_PKTS_THRESH;
-		for_each_present_cpu(cpu) {
-			txq_pcpu = per_cpu_ptr(txq->pcpu, cpu);
-			txq_pcpu->cpu = cpu;
-		}
-
-		port->txqs[queue] = txq;
-	}
-
-	port->rxqs = devm_kcalloc(dev, port->nrxqs, sizeof(*port->rxqs),
-				  GFP_KERNEL);
-	if (!port->rxqs) {
-		err = -ENOMEM;
-		goto err_free_percpu;
-	}
-
-	/* Allocate and initialize Rx queue for this port */
-	for (queue = 0; queue < port->nrxqs; queue++) {
-		struct mvpp2_rx_queue *rxq;
-
-		/* Map physical Rx queue to port's logical Rx queue */
-		rxq = devm_kzalloc(dev, sizeof(*rxq), GFP_KERNEL);
-		if (!rxq) {
-			err = -ENOMEM;
-			goto err_free_percpu;
-		}
-		/* Map this Rx queue to a physical queue */
-		rxq->id = port->first_rxq + queue;
-		rxq->port = port->id;
-		rxq->logic_rxq = queue;
-
-		port->rxqs[queue] = rxq;
-	}
-
-	mvpp2_rx_irqs_setup(port);
-
-	/* Create Rx descriptor rings */
-	for (queue = 0; queue < port->nrxqs; queue++) {
-		struct mvpp2_rx_queue *rxq = port->rxqs[queue];
-
-		rxq->size = port->rx_ring_size;
-		rxq->pkts_coal = MVPP2_RX_COAL_PKTS;
-		rxq->time_coal = MVPP2_RX_COAL_USEC;
-	}
-
-	mvpp2_ingress_disable(port);
-
-	/* Port default configuration */
-	mvpp2_defaults_set(port);
-
-	/* Port's classifier configuration */
-	mvpp2_cls_oversize_rxq_set(port);
-	mvpp2_cls_port_config(port);
-
-	/* Provide an initial Rx packet size */
-	port->pkt_size = MVPP2_RX_PKT_SIZE(port->dev->mtu);
-
-	/* Initialize pools for swf */
-	err = mvpp2_swf_bm_pool_init(port);
-	if (err)
-		goto err_free_percpu;
-
-	return 0;
-
-err_free_percpu:
-	for (queue = 0; queue < port->ntxqs; queue++) {
-		if (!port->txqs[queue])
-			continue;
-		free_percpu(port->txqs[queue]->pcpu);
-	}
-	return err;
-}
-
-/* Checks if the port DT description has the TX interrupts
- * described. On PPv2.1, there are no such interrupts. On PPv2.2,
- * there are available, but we need to keep support for old DTs.
- */
-static bool mvpp2_port_has_tx_irqs(struct mvpp2 *priv,
-				   struct device_node *port_node)
-{
-	char *irqs[5] = { "rx-shared", "tx-cpu0", "tx-cpu1",
-			  "tx-cpu2", "tx-cpu3" };
-	int ret, i;
-
-	if (priv->hw_version == MVPP21)
-		return false;
-
-	for (i = 0; i < 5; i++) {
-		ret = of_property_match_string(port_node, "interrupt-names",
-					       irqs[i]);
-		if (ret < 0)
-			return false;
-	}
-
-	return true;
-}
-
-static void mvpp2_port_copy_mac_addr(struct net_device *dev, struct mvpp2 *priv,
-				     struct fwnode_handle *fwnode,
-				     char **mac_from)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-	char hw_mac_addr[ETH_ALEN] = {0};
-	char fw_mac_addr[ETH_ALEN];
-
-	if (fwnode_get_mac_address(fwnode, fw_mac_addr, ETH_ALEN)) {
-		*mac_from = "firmware node";
-		ether_addr_copy(dev->dev_addr, fw_mac_addr);
-		return;
-	}
-
-	if (priv->hw_version == MVPP21) {
-		mvpp21_get_mac_address(port, hw_mac_addr);
-		if (is_valid_ether_addr(hw_mac_addr)) {
-			*mac_from = "hardware";
-			ether_addr_copy(dev->dev_addr, hw_mac_addr);
-			return;
-		}
-	}
-
-	*mac_from = "random";
-	eth_hw_addr_random(dev);
-}
-
-static void mvpp2_phylink_validate(struct net_device *dev,
-				   unsigned long *supported,
-				   struct phylink_link_state *state)
-{
-	__ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
-
-	phylink_set(mask, Autoneg);
-	phylink_set_port_modes(mask);
-	phylink_set(mask, Pause);
-	phylink_set(mask, Asym_Pause);
-
-	switch (state->interface) {
-	case PHY_INTERFACE_MODE_10GKR:
-		phylink_set(mask, 10000baseCR_Full);
-		phylink_set(mask, 10000baseSR_Full);
-		phylink_set(mask, 10000baseLR_Full);
-		phylink_set(mask, 10000baseLRM_Full);
-		phylink_set(mask, 10000baseER_Full);
-		phylink_set(mask, 10000baseKR_Full);
-		/* Fall-through */
-	default:
-		phylink_set(mask, 10baseT_Half);
-		phylink_set(mask, 10baseT_Full);
-		phylink_set(mask, 100baseT_Half);
-		phylink_set(mask, 100baseT_Full);
-		phylink_set(mask, 10000baseT_Full);
-		/* Fall-through */
-	case PHY_INTERFACE_MODE_1000BASEX:
-	case PHY_INTERFACE_MODE_2500BASEX:
-		phylink_set(mask, 1000baseT_Full);
-		phylink_set(mask, 1000baseX_Full);
-		phylink_set(mask, 2500baseX_Full);
-	}
-
-	bitmap_and(supported, supported, mask, __ETHTOOL_LINK_MODE_MASK_NBITS);
-	bitmap_and(state->advertising, state->advertising, mask,
-		   __ETHTOOL_LINK_MODE_MASK_NBITS);
-}
-
-static void mvpp22_xlg_link_state(struct mvpp2_port *port,
-				  struct phylink_link_state *state)
-{
-	u32 val;
-
-	state->speed = SPEED_10000;
-	state->duplex = 1;
-	state->an_complete = 1;
-
-	val = readl(port->base + MVPP22_XLG_STATUS);
-	state->link = !!(val & MVPP22_XLG_STATUS_LINK_UP);
-
-	state->pause = 0;
-	val = readl(port->base + MVPP22_XLG_CTRL0_REG);
-	if (val & MVPP22_XLG_CTRL0_TX_FLOW_CTRL_EN)
-		state->pause |= MLO_PAUSE_TX;
-	if (val & MVPP22_XLG_CTRL0_RX_FLOW_CTRL_EN)
-		state->pause |= MLO_PAUSE_RX;
-}
-
-static void mvpp2_gmac_link_state(struct mvpp2_port *port,
-				  struct phylink_link_state *state)
-{
-	u32 val;
-
-	val = readl(port->base + MVPP2_GMAC_STATUS0);
-
-	state->an_complete = !!(val & MVPP2_GMAC_STATUS0_AN_COMPLETE);
-	state->link = !!(val & MVPP2_GMAC_STATUS0_LINK_UP);
-	state->duplex = !!(val & MVPP2_GMAC_STATUS0_FULL_DUPLEX);
-
-	switch (port->phy_interface) {
-	case PHY_INTERFACE_MODE_1000BASEX:
-		state->speed = SPEED_1000;
-		break;
-	case PHY_INTERFACE_MODE_2500BASEX:
-		state->speed = SPEED_2500;
-		break;
-	default:
-		if (val & MVPP2_GMAC_STATUS0_GMII_SPEED)
-			state->speed = SPEED_1000;
-		else if (val & MVPP2_GMAC_STATUS0_MII_SPEED)
-			state->speed = SPEED_100;
-		else
-			state->speed = SPEED_10;
-	}
-
-	state->pause = 0;
-	if (val & MVPP2_GMAC_STATUS0_RX_PAUSE)
-		state->pause |= MLO_PAUSE_RX;
-	if (val & MVPP2_GMAC_STATUS0_TX_PAUSE)
-		state->pause |= MLO_PAUSE_TX;
-}
-
-static int mvpp2_phylink_mac_link_state(struct net_device *dev,
-					struct phylink_link_state *state)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-
-	if (port->priv->hw_version == MVPP22 && port->gop_id == 0) {
-		u32 mode = readl(port->base + MVPP22_XLG_CTRL3_REG);
-		mode &= MVPP22_XLG_CTRL3_MACMODESELECT_MASK;
-
-		if (mode == MVPP22_XLG_CTRL3_MACMODESELECT_10G) {
-			mvpp22_xlg_link_state(port, state);
-			return 1;
-		}
-	}
-
-	mvpp2_gmac_link_state(port, state);
-	return 1;
-}
-
-static void mvpp2_mac_an_restart(struct net_device *dev)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-	u32 val;
-
-	if (port->phy_interface != PHY_INTERFACE_MODE_SGMII)
-		return;
-
-	val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
-	/* The RESTART_AN bit is cleared by the h/w after restarting the AN
-	 * process.
-	 */
-	val |= MVPP2_GMAC_IN_BAND_RESTART_AN | MVPP2_GMAC_IN_BAND_AUTONEG;
-	writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
-}
-
-static void mvpp2_xlg_config(struct mvpp2_port *port, unsigned int mode,
-			     const struct phylink_link_state *state)
-{
-	u32 ctrl0, ctrl4;
-
-	ctrl0 = readl(port->base + MVPP22_XLG_CTRL0_REG);
-	ctrl4 = readl(port->base + MVPP22_XLG_CTRL4_REG);
-
-	if (state->pause & MLO_PAUSE_TX)
-		ctrl0 |= MVPP22_XLG_CTRL0_TX_FLOW_CTRL_EN;
-	if (state->pause & MLO_PAUSE_RX)
-		ctrl0 |= MVPP22_XLG_CTRL0_RX_FLOW_CTRL_EN;
-
-	ctrl4 &= ~MVPP22_XLG_CTRL4_MACMODSELECT_GMAC;
-	ctrl4 |= MVPP22_XLG_CTRL4_FWD_FC | MVPP22_XLG_CTRL4_FWD_PFC |
-		 MVPP22_XLG_CTRL4_EN_IDLE_CHECK;
-
-	writel(ctrl0, port->base + MVPP22_XLG_CTRL0_REG);
-	writel(ctrl4, port->base + MVPP22_XLG_CTRL4_REG);
-}
-
-static void mvpp2_gmac_config(struct mvpp2_port *port, unsigned int mode,
-			      const struct phylink_link_state *state)
-{
-	u32 an, ctrl0, ctrl2, ctrl4;
-
-	an = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
-	ctrl0 = readl(port->base + MVPP2_GMAC_CTRL_0_REG);
-	ctrl2 = readl(port->base + MVPP2_GMAC_CTRL_2_REG);
-	ctrl4 = readl(port->base + MVPP22_GMAC_CTRL_4_REG);
-
-	/* Force link down */
-	an &= ~MVPP2_GMAC_FORCE_LINK_PASS;
-	an |= MVPP2_GMAC_FORCE_LINK_DOWN;
-	writel(an, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
-
-	/* Set the GMAC in a reset state */
-	ctrl2 |= MVPP2_GMAC_PORT_RESET_MASK;
-	writel(ctrl2, port->base + MVPP2_GMAC_CTRL_2_REG);
-
-	an &= ~(MVPP2_GMAC_CONFIG_MII_SPEED | MVPP2_GMAC_CONFIG_GMII_SPEED |
-		MVPP2_GMAC_AN_SPEED_EN | MVPP2_GMAC_FC_ADV_EN |
-		MVPP2_GMAC_FC_ADV_ASM_EN | MVPP2_GMAC_FLOW_CTRL_AUTONEG |
-		MVPP2_GMAC_CONFIG_FULL_DUPLEX | MVPP2_GMAC_AN_DUPLEX_EN |
-		MVPP2_GMAC_FORCE_LINK_DOWN);
-	ctrl0 &= ~MVPP2_GMAC_PORT_TYPE_MASK;
-	ctrl2 &= ~(MVPP2_GMAC_PORT_RESET_MASK | MVPP2_GMAC_PCS_ENABLE_MASK);
-
-	if (state->interface == PHY_INTERFACE_MODE_1000BASEX ||
-	    state->interface == PHY_INTERFACE_MODE_2500BASEX) {
-		/* 1000BaseX and 2500BaseX ports cannot negotiate speed nor can
-		 * they negotiate duplex: they are always operating with a fixed
-		 * speed of 1000/2500Mbps in full duplex, so force 1000/2500
-		 * speed and full duplex here.
-		 */
-		ctrl0 |= MVPP2_GMAC_PORT_TYPE_MASK;
-		an |= MVPP2_GMAC_CONFIG_GMII_SPEED |
-		      MVPP2_GMAC_CONFIG_FULL_DUPLEX;
-	} else if (!phy_interface_mode_is_rgmii(state->interface)) {
-		an |= MVPP2_GMAC_AN_SPEED_EN | MVPP2_GMAC_FLOW_CTRL_AUTONEG;
-	}
-
-	if (state->duplex)
-		an |= MVPP2_GMAC_CONFIG_FULL_DUPLEX;
-	if (phylink_test(state->advertising, Pause))
-		an |= MVPP2_GMAC_FC_ADV_EN;
-	if (phylink_test(state->advertising, Asym_Pause))
-		an |= MVPP2_GMAC_FC_ADV_ASM_EN;
-
-	if (state->interface == PHY_INTERFACE_MODE_SGMII ||
-	    state->interface == PHY_INTERFACE_MODE_1000BASEX ||
-	    state->interface == PHY_INTERFACE_MODE_2500BASEX) {
-		an |= MVPP2_GMAC_IN_BAND_AUTONEG;
-		ctrl2 |= MVPP2_GMAC_INBAND_AN_MASK | MVPP2_GMAC_PCS_ENABLE_MASK;
-
-		ctrl4 &= ~(MVPP22_CTRL4_EXT_PIN_GMII_SEL |
-			   MVPP22_CTRL4_RX_FC_EN | MVPP22_CTRL4_TX_FC_EN);
-		ctrl4 |= MVPP22_CTRL4_SYNC_BYPASS_DIS |
-			 MVPP22_CTRL4_DP_CLK_SEL |
-			 MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE;
-
-		if (state->pause & MLO_PAUSE_TX)
-			ctrl4 |= MVPP22_CTRL4_TX_FC_EN;
-		if (state->pause & MLO_PAUSE_RX)
-			ctrl4 |= MVPP22_CTRL4_RX_FC_EN;
-	} else if (phy_interface_mode_is_rgmii(state->interface)) {
-		an |= MVPP2_GMAC_IN_BAND_AUTONEG_BYPASS;
-
-		if (state->speed == SPEED_1000)
-			an |= MVPP2_GMAC_CONFIG_GMII_SPEED;
-		else if (state->speed == SPEED_100)
-			an |= MVPP2_GMAC_CONFIG_MII_SPEED;
-
-		ctrl4 &= ~MVPP22_CTRL4_DP_CLK_SEL;
-		ctrl4 |= MVPP22_CTRL4_EXT_PIN_GMII_SEL |
-			 MVPP22_CTRL4_SYNC_BYPASS_DIS |
-			 MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE;
-	}
-
-	writel(ctrl0, port->base + MVPP2_GMAC_CTRL_0_REG);
-	writel(ctrl2, port->base + MVPP2_GMAC_CTRL_2_REG);
-	writel(ctrl4, port->base + MVPP22_GMAC_CTRL_4_REG);
-	writel(an, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
-}
-
-static void mvpp2_mac_config(struct net_device *dev, unsigned int mode,
-			     const struct phylink_link_state *state)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-
-	/* Check for invalid configuration */
-	if (state->interface == PHY_INTERFACE_MODE_10GKR && port->gop_id != 0) {
-		netdev_err(dev, "Invalid mode on %s\n", dev->name);
-		return;
-	}
-
-	netif_tx_stop_all_queues(port->dev);
-	if (!port->has_phy)
-		netif_carrier_off(port->dev);
-
-	/* Make sure the port is disabled when reconfiguring the mode */
-	mvpp2_port_disable(port);
-
-	if (port->priv->hw_version == MVPP22 &&
-	    port->phy_interface != state->interface) {
-		port->phy_interface = state->interface;
-
-		/* Reconfigure the serdes lanes */
-		phy_power_off(port->comphy);
-		mvpp22_mode_reconfigure(port);
-	}
-
-	/* mac (re)configuration */
-	if (state->interface == PHY_INTERFACE_MODE_10GKR)
-		mvpp2_xlg_config(port, mode, state);
-	else if (phy_interface_mode_is_rgmii(state->interface) ||
-		 state->interface == PHY_INTERFACE_MODE_SGMII ||
-		 state->interface == PHY_INTERFACE_MODE_1000BASEX ||
-		 state->interface == PHY_INTERFACE_MODE_2500BASEX)
-		mvpp2_gmac_config(port, mode, state);
-
-	if (port->priv->hw_version == MVPP21 && port->flags & MVPP2_F_LOOPBACK)
-		mvpp2_port_loopback_set(port, state);
-
-	/* If the port already was up, make sure it's still in the same state */
-	if (state->link || !port->has_phy) {
-		mvpp2_port_enable(port);
-
-		mvpp2_egress_enable(port);
-		mvpp2_ingress_enable(port);
-		if (!port->has_phy)
-			netif_carrier_on(dev);
-		netif_tx_wake_all_queues(dev);
-	}
-}
-
-static void mvpp2_mac_link_up(struct net_device *dev, unsigned int mode,
-			      phy_interface_t interface, struct phy_device *phy)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-	u32 val;
-
-	if (!phylink_autoneg_inband(mode) &&
-	    interface != PHY_INTERFACE_MODE_10GKR) {
-		val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
-		val &= ~MVPP2_GMAC_FORCE_LINK_DOWN;
-		if (phy_interface_mode_is_rgmii(interface))
-			val |= MVPP2_GMAC_FORCE_LINK_PASS;
-		writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
-	}
-
-	mvpp2_port_enable(port);
-
-	mvpp2_egress_enable(port);
-	mvpp2_ingress_enable(port);
-	netif_tx_wake_all_queues(dev);
-}
-
-static void mvpp2_mac_link_down(struct net_device *dev, unsigned int mode,
-				phy_interface_t interface)
-{
-	struct mvpp2_port *port = netdev_priv(dev);
-	u32 val;
-
-	if (!phylink_autoneg_inband(mode) &&
-	    interface != PHY_INTERFACE_MODE_10GKR) {
-		val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
-		val &= ~MVPP2_GMAC_FORCE_LINK_PASS;
-		val |= MVPP2_GMAC_FORCE_LINK_DOWN;
-		writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
-	}
-
-	netif_tx_stop_all_queues(dev);
-	mvpp2_egress_disable(port);
-	mvpp2_ingress_disable(port);
-
-	/* When using link interrupts to notify phylink of a MAC state change,
-	 * we do not want the port to be disabled (we want to receive further
-	 * interrupts, to be notified when the port will have a link later).
-	 */
-	if (!port->has_phy)
-		return;
-
-	mvpp2_port_disable(port);
-}
-
-static const struct phylink_mac_ops mvpp2_phylink_ops = {
-	.validate = mvpp2_phylink_validate,
-	.mac_link_state = mvpp2_phylink_mac_link_state,
-	.mac_an_restart = mvpp2_mac_an_restart,
-	.mac_config = mvpp2_mac_config,
-	.mac_link_up = mvpp2_mac_link_up,
-	.mac_link_down = mvpp2_mac_link_down,
-};
-
-/* Ports initialization */
-static int mvpp2_port_probe(struct platform_device *pdev,
-			    struct fwnode_handle *port_fwnode,
-			    struct mvpp2 *priv)
-{
-	struct phy *comphy = NULL;
-	struct mvpp2_port *port;
-	struct mvpp2_port_pcpu *port_pcpu;
-	struct device_node *port_node = to_of_node(port_fwnode);
-	struct net_device *dev;
-	struct resource *res;
-	struct phylink *phylink;
-	char *mac_from = "";
-	unsigned int ntxqs, nrxqs;
-	bool has_tx_irqs;
-	u32 id;
-	int features;
-	int phy_mode;
-	int err, i, cpu;
-
-	if (port_node) {
-		has_tx_irqs = mvpp2_port_has_tx_irqs(priv, port_node);
-	} else {
-		has_tx_irqs = true;
-		queue_mode = MVPP2_QDIST_MULTI_MODE;
-	}
-
-	if (!has_tx_irqs)
-		queue_mode = MVPP2_QDIST_SINGLE_MODE;
-
-	ntxqs = MVPP2_MAX_TXQ;
-	if (priv->hw_version == MVPP22 && queue_mode == MVPP2_QDIST_MULTI_MODE)
-		nrxqs = MVPP2_DEFAULT_RXQ * num_possible_cpus();
-	else
-		nrxqs = MVPP2_DEFAULT_RXQ;
-
-	dev = alloc_etherdev_mqs(sizeof(*port), ntxqs, nrxqs);
-	if (!dev)
-		return -ENOMEM;
-
-	phy_mode = fwnode_get_phy_mode(port_fwnode);
-	if (phy_mode < 0) {
-		dev_err(&pdev->dev, "incorrect phy mode\n");
-		err = phy_mode;
-		goto err_free_netdev;
-	}
-
-	if (port_node) {
-		comphy = devm_of_phy_get(&pdev->dev, port_node, NULL);
-		if (IS_ERR(comphy)) {
-			if (PTR_ERR(comphy) == -EPROBE_DEFER) {
-				err = -EPROBE_DEFER;
-				goto err_free_netdev;
-			}
-			comphy = NULL;
-		}
-	}
-
-	if (fwnode_property_read_u32(port_fwnode, "port-id", &id)) {
-		err = -EINVAL;
-		dev_err(&pdev->dev, "missing port-id value\n");
-		goto err_free_netdev;
-	}
-
-	dev->tx_queue_len = MVPP2_MAX_TXD_MAX;
-	dev->watchdog_timeo = 5 * HZ;
-	dev->netdev_ops = &mvpp2_netdev_ops;
-	dev->ethtool_ops = &mvpp2_eth_tool_ops;
-
-	port = netdev_priv(dev);
-	port->dev = dev;
-	port->fwnode = port_fwnode;
-	port->has_phy = !!of_find_property(port_node, "phy", NULL);
-	port->ntxqs = ntxqs;
-	port->nrxqs = nrxqs;
-	port->priv = priv;
-	port->has_tx_irqs = has_tx_irqs;
-
-	err = mvpp2_queue_vectors_init(port, port_node);
-	if (err)
-		goto err_free_netdev;
-
-	if (port_node)
-		port->link_irq = of_irq_get_byname(port_node, "link");
-	else
-		port->link_irq = fwnode_irq_get(port_fwnode, port->nqvecs + 1);
-	if (port->link_irq == -EPROBE_DEFER) {
-		err = -EPROBE_DEFER;
-		goto err_deinit_qvecs;
-	}
-	if (port->link_irq <= 0)
-		/* the link irq is optional */
-		port->link_irq = 0;
-
-	if (fwnode_property_read_bool(port_fwnode, "marvell,loopback"))
-		port->flags |= MVPP2_F_LOOPBACK;
-
-	port->id = id;
-	if (priv->hw_version == MVPP21)
-		port->first_rxq = port->id * port->nrxqs;
-	else
-		port->first_rxq = port->id * priv->max_port_rxqs;
-
-	port->of_node = port_node;
-	port->phy_interface = phy_mode;
-	port->comphy = comphy;
-
-	if (priv->hw_version == MVPP21) {
-		res = platform_get_resource(pdev, IORESOURCE_MEM, 2 + id);
-		port->base = devm_ioremap_resource(&pdev->dev, res);
-		if (IS_ERR(port->base)) {
-			err = PTR_ERR(port->base);
-			goto err_free_irq;
-		}
-
-		port->stats_base = port->priv->lms_base +
-				   MVPP21_MIB_COUNTERS_OFFSET +
-				   port->gop_id * MVPP21_MIB_COUNTERS_PORT_SZ;
-	} else {
-		if (fwnode_property_read_u32(port_fwnode, "gop-port-id",
-					     &port->gop_id)) {
-			err = -EINVAL;
-			dev_err(&pdev->dev, "missing gop-port-id value\n");
-			goto err_deinit_qvecs;
-		}
-
-		port->base = priv->iface_base + MVPP22_GMAC_BASE(port->gop_id);
-		port->stats_base = port->priv->iface_base +
-				   MVPP22_MIB_COUNTERS_OFFSET +
-				   port->gop_id * MVPP22_MIB_COUNTERS_PORT_SZ;
-	}
-
-	/* Alloc per-cpu and ethtool stats */
-	port->stats = netdev_alloc_pcpu_stats(struct mvpp2_pcpu_stats);
-	if (!port->stats) {
-		err = -ENOMEM;
-		goto err_free_irq;
-	}
-
-	port->ethtool_stats = devm_kcalloc(&pdev->dev,
-					   ARRAY_SIZE(mvpp2_ethtool_regs),
-					   sizeof(u64), GFP_KERNEL);
-	if (!port->ethtool_stats) {
-		err = -ENOMEM;
-		goto err_free_stats;
-	}
-
-	mutex_init(&port->gather_stats_lock);
-	INIT_DELAYED_WORK(&port->stats_work, mvpp2_gather_hw_statistics);
-
-	mvpp2_port_copy_mac_addr(dev, priv, port_fwnode, &mac_from);
-
-	port->tx_ring_size = MVPP2_MAX_TXD_DFLT;
-	port->rx_ring_size = MVPP2_MAX_RXD_DFLT;
-	SET_NETDEV_DEV(dev, &pdev->dev);
-
-	err = mvpp2_port_init(port);
-	if (err < 0) {
-		dev_err(&pdev->dev, "failed to init port %d\n", id);
-		goto err_free_stats;
-	}
-
-	mvpp2_port_periodic_xon_disable(port);
-
-	mvpp2_port_reset(port);
-
-	port->pcpu = alloc_percpu(struct mvpp2_port_pcpu);
-	if (!port->pcpu) {
-		err = -ENOMEM;
-		goto err_free_txq_pcpu;
-	}
-
-	if (!port->has_tx_irqs) {
-		for_each_present_cpu(cpu) {
-			port_pcpu = per_cpu_ptr(port->pcpu, cpu);
-
-			hrtimer_init(&port_pcpu->tx_done_timer, CLOCK_MONOTONIC,
-				     HRTIMER_MODE_REL_PINNED);
-			port_pcpu->tx_done_timer.function = mvpp2_hr_timer_cb;
-			port_pcpu->timer_scheduled = false;
-
-			tasklet_init(&port_pcpu->tx_done_tasklet,
-				     mvpp2_tx_proc_cb,
-				     (unsigned long)dev);
-		}
-	}
-
-	features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
-		   NETIF_F_TSO;
-	dev->features = features | NETIF_F_RXCSUM;
-	dev->hw_features |= features | NETIF_F_RXCSUM | NETIF_F_GRO |
-			    NETIF_F_HW_VLAN_CTAG_FILTER;
-
-	if (port->pool_long->id == MVPP2_BM_JUMBO && port->id != 0) {
-		dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
-		dev->hw_features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
-	}
-
-	dev->vlan_features |= features;
-	dev->gso_max_segs = MVPP2_MAX_TSO_SEGS;
-	dev->priv_flags |= IFF_UNICAST_FLT;
-
-	/* MTU range: 68 - 9704 */
-	dev->min_mtu = ETH_MIN_MTU;
-	/* 9704 == 9728 - 20 and rounding to 8 */
-	dev->max_mtu = MVPP2_BM_JUMBO_PKT_SIZE;
-
-	/* Phylink isn't used w/ ACPI as of now */
-	if (port_node) {
-		phylink = phylink_create(dev, port_fwnode, phy_mode,
-					 &mvpp2_phylink_ops);
-		if (IS_ERR(phylink)) {
-			err = PTR_ERR(phylink);
-			goto err_free_port_pcpu;
-		}
-		port->phylink = phylink;
-	} else {
-		port->phylink = NULL;
-	}
-
-	err = register_netdev(dev);
-	if (err < 0) {
-		dev_err(&pdev->dev, "failed to register netdev\n");
-		goto err_phylink;
-	}
-	netdev_info(dev, "Using %s mac address %pM\n", mac_from, dev->dev_addr);
-
-	priv->port_list[priv->port_count++] = port;
-
-	return 0;
-
-err_phylink:
-	if (port->phylink)
-		phylink_destroy(port->phylink);
-err_free_port_pcpu:
-	free_percpu(port->pcpu);
-err_free_txq_pcpu:
-	for (i = 0; i < port->ntxqs; i++)
-		free_percpu(port->txqs[i]->pcpu);
-err_free_stats:
-	free_percpu(port->stats);
-err_free_irq:
-	if (port->link_irq)
-		irq_dispose_mapping(port->link_irq);
-err_deinit_qvecs:
-	mvpp2_queue_vectors_deinit(port);
-err_free_netdev:
-	free_netdev(dev);
-	return err;
-}
-
-/* Ports removal routine */
-static void mvpp2_port_remove(struct mvpp2_port *port)
-{
-	int i;
-
-	unregister_netdev(port->dev);
-	if (port->phylink)
-		phylink_destroy(port->phylink);
-	free_percpu(port->pcpu);
-	free_percpu(port->stats);
-	for (i = 0; i < port->ntxqs; i++)
-		free_percpu(port->txqs[i]->pcpu);
-	mvpp2_queue_vectors_deinit(port);
-	if (port->link_irq)
-		irq_dispose_mapping(port->link_irq);
-	free_netdev(port->dev);
-}
-
-/* Initialize decoding windows */
-static void mvpp2_conf_mbus_windows(const struct mbus_dram_target_info *dram,
-				    struct mvpp2 *priv)
-{
-	u32 win_enable;
-	int i;
-
-	for (i = 0; i < 6; i++) {
-		mvpp2_write(priv, MVPP2_WIN_BASE(i), 0);
-		mvpp2_write(priv, MVPP2_WIN_SIZE(i), 0);
-
-		if (i < 4)
-			mvpp2_write(priv, MVPP2_WIN_REMAP(i), 0);
-	}
-
-	win_enable = 0;
-
-	for (i = 0; i < dram->num_cs; i++) {
-		const struct mbus_dram_window *cs = dram->cs + i;
-
-		mvpp2_write(priv, MVPP2_WIN_BASE(i),
-			    (cs->base & 0xffff0000) | (cs->mbus_attr << 8) |
-			    dram->mbus_dram_target_id);
-
-		mvpp2_write(priv, MVPP2_WIN_SIZE(i),
-			    (cs->size - 1) & 0xffff0000);
-
-		win_enable |= (1 << i);
-	}
-
-	mvpp2_write(priv, MVPP2_BASE_ADDR_ENABLE, win_enable);
-}
-
-/* Initialize Rx FIFO's */
-static void mvpp2_rx_fifo_init(struct mvpp2 *priv)
-{
-	int port;
-
-	for (port = 0; port < MVPP2_MAX_PORTS; port++) {
-		mvpp2_write(priv, MVPP2_RX_DATA_FIFO_SIZE_REG(port),
-			    MVPP2_RX_FIFO_PORT_DATA_SIZE_4KB);
-		mvpp2_write(priv, MVPP2_RX_ATTR_FIFO_SIZE_REG(port),
-			    MVPP2_RX_FIFO_PORT_ATTR_SIZE_4KB);
-	}
-
-	mvpp2_write(priv, MVPP2_RX_MIN_PKT_SIZE_REG,
-		    MVPP2_RX_FIFO_PORT_MIN_PKT);
-	mvpp2_write(priv, MVPP2_RX_FIFO_INIT_REG, 0x1);
-}
-
-static void mvpp22_rx_fifo_init(struct mvpp2 *priv)
-{
-	int port;
-
-	/* The FIFO size parameters are set depending on the maximum speed a
-	 * given port can handle:
-	 * - Port 0: 10Gbps
-	 * - Port 1: 2.5Gbps
-	 * - Ports 2 and 3: 1Gbps
-	 */
-
-	mvpp2_write(priv, MVPP2_RX_DATA_FIFO_SIZE_REG(0),
-		    MVPP2_RX_FIFO_PORT_DATA_SIZE_32KB);
-	mvpp2_write(priv, MVPP2_RX_ATTR_FIFO_SIZE_REG(0),
-		    MVPP2_RX_FIFO_PORT_ATTR_SIZE_32KB);
-
-	mvpp2_write(priv, MVPP2_RX_DATA_FIFO_SIZE_REG(1),
-		    MVPP2_RX_FIFO_PORT_DATA_SIZE_8KB);
-	mvpp2_write(priv, MVPP2_RX_ATTR_FIFO_SIZE_REG(1),
-		    MVPP2_RX_FIFO_PORT_ATTR_SIZE_8KB);
-
-	for (port = 2; port < MVPP2_MAX_PORTS; port++) {
-		mvpp2_write(priv, MVPP2_RX_DATA_FIFO_SIZE_REG(port),
-			    MVPP2_RX_FIFO_PORT_DATA_SIZE_4KB);
-		mvpp2_write(priv, MVPP2_RX_ATTR_FIFO_SIZE_REG(port),
-			    MVPP2_RX_FIFO_PORT_ATTR_SIZE_4KB);
-	}
-
-	mvpp2_write(priv, MVPP2_RX_MIN_PKT_SIZE_REG,
-		    MVPP2_RX_FIFO_PORT_MIN_PKT);
-	mvpp2_write(priv, MVPP2_RX_FIFO_INIT_REG, 0x1);
-}
-
-/* Initialize Tx FIFO's: the total FIFO size is 19kB on PPv2.2 and 10G
- * interfaces must have a Tx FIFO size of 10kB. As only port 0 can do 10G,
- * configure its Tx FIFO size to 10kB and the others ports Tx FIFO size to 3kB.
- */
-static void mvpp22_tx_fifo_init(struct mvpp2 *priv)
-{
-	int port, size, thrs;
-
-	for (port = 0; port < MVPP2_MAX_PORTS; port++) {
-		if (port == 0) {
-			size = MVPP22_TX_FIFO_DATA_SIZE_10KB;
-			thrs = MVPP2_TX_FIFO_THRESHOLD_10KB;
-		} else {
-			size = MVPP22_TX_FIFO_DATA_SIZE_3KB;
-			thrs = MVPP2_TX_FIFO_THRESHOLD_3KB;
-		}
-		mvpp2_write(priv, MVPP22_TX_FIFO_SIZE_REG(port), size);
-		mvpp2_write(priv, MVPP22_TX_FIFO_THRESH_REG(port), thrs);
-	}
-}
-
-static void mvpp2_axi_init(struct mvpp2 *priv)
-{
-	u32 val, rdval, wrval;
-
-	mvpp2_write(priv, MVPP22_BM_ADDR_HIGH_RLS_REG, 0x0);
-
-	/* AXI Bridge Configuration */
-
-	rdval = MVPP22_AXI_CODE_CACHE_RD_CACHE
-		<< MVPP22_AXI_ATTR_CACHE_OFFS;
-	rdval |= MVPP22_AXI_CODE_DOMAIN_OUTER_DOM
-		<< MVPP22_AXI_ATTR_DOMAIN_OFFS;
-
-	wrval = MVPP22_AXI_CODE_CACHE_WR_CACHE
-		<< MVPP22_AXI_ATTR_CACHE_OFFS;
-	wrval |= MVPP22_AXI_CODE_DOMAIN_OUTER_DOM
-		<< MVPP22_AXI_ATTR_DOMAIN_OFFS;
-
-	/* BM */
-	mvpp2_write(priv, MVPP22_AXI_BM_WR_ATTR_REG, wrval);
-	mvpp2_write(priv, MVPP22_AXI_BM_RD_ATTR_REG, rdval);
-
-	/* Descriptors */
-	mvpp2_write(priv, MVPP22_AXI_AGGRQ_DESCR_RD_ATTR_REG, rdval);
-	mvpp2_write(priv, MVPP22_AXI_TXQ_DESCR_WR_ATTR_REG, wrval);
-	mvpp2_write(priv, MVPP22_AXI_TXQ_DESCR_RD_ATTR_REG, rdval);
-	mvpp2_write(priv, MVPP22_AXI_RXQ_DESCR_WR_ATTR_REG, wrval);
-
-	/* Buffer Data */
-	mvpp2_write(priv, MVPP22_AXI_TX_DATA_RD_ATTR_REG, rdval);
-	mvpp2_write(priv, MVPP22_AXI_RX_DATA_WR_ATTR_REG, wrval);
-
-	val = MVPP22_AXI_CODE_CACHE_NON_CACHE
-		<< MVPP22_AXI_CODE_CACHE_OFFS;
-	val |= MVPP22_AXI_CODE_DOMAIN_SYSTEM
-		<< MVPP22_AXI_CODE_DOMAIN_OFFS;
-	mvpp2_write(priv, MVPP22_AXI_RD_NORMAL_CODE_REG, val);
-	mvpp2_write(priv, MVPP22_AXI_WR_NORMAL_CODE_REG, val);
-
-	val = MVPP22_AXI_CODE_CACHE_RD_CACHE
-		<< MVPP22_AXI_CODE_CACHE_OFFS;
-	val |= MVPP22_AXI_CODE_DOMAIN_OUTER_DOM
-		<< MVPP22_AXI_CODE_DOMAIN_OFFS;
-
-	mvpp2_write(priv, MVPP22_AXI_RD_SNOOP_CODE_REG, val);
-
-	val = MVPP22_AXI_CODE_CACHE_WR_CACHE
-		<< MVPP22_AXI_CODE_CACHE_OFFS;
-	val |= MVPP22_AXI_CODE_DOMAIN_OUTER_DOM
-		<< MVPP22_AXI_CODE_DOMAIN_OFFS;
-
-	mvpp2_write(priv, MVPP22_AXI_WR_SNOOP_CODE_REG, val);
-}
-
-/* Initialize network controller common part HW */
-static int mvpp2_init(struct platform_device *pdev, struct mvpp2 *priv)
-{
-	const struct mbus_dram_target_info *dram_target_info;
-	int err, i;
-	u32 val;
-
-	/* MBUS windows configuration */
-	dram_target_info = mv_mbus_dram_info();
-	if (dram_target_info)
-		mvpp2_conf_mbus_windows(dram_target_info, priv);
-
-	if (priv->hw_version == MVPP22)
-		mvpp2_axi_init(priv);
-
-	/* Disable HW PHY polling */
-	if (priv->hw_version == MVPP21) {
-		val = readl(priv->lms_base + MVPP2_PHY_AN_CFG0_REG);
-		val |= MVPP2_PHY_AN_STOP_SMI0_MASK;
-		writel(val, priv->lms_base + MVPP2_PHY_AN_CFG0_REG);
-	} else {
-		val = readl(priv->iface_base + MVPP22_SMI_MISC_CFG_REG);
-		val &= ~MVPP22_SMI_POLLING_EN;
-		writel(val, priv->iface_base + MVPP22_SMI_MISC_CFG_REG);
-	}
-
-	/* Allocate and initialize aggregated TXQs */
-	priv->aggr_txqs = devm_kcalloc(&pdev->dev, num_present_cpus(),
-				       sizeof(*priv->aggr_txqs),
-				       GFP_KERNEL);
-	if (!priv->aggr_txqs)
-		return -ENOMEM;
-
-	for_each_present_cpu(i) {
-		priv->aggr_txqs[i].id = i;
-		priv->aggr_txqs[i].size = MVPP2_AGGR_TXQ_SIZE;
-		err = mvpp2_aggr_txq_init(pdev, &priv->aggr_txqs[i], i, priv);
-		if (err < 0)
-			return err;
-	}
-
-	/* Fifo Init */
-	if (priv->hw_version == MVPP21) {
-		mvpp2_rx_fifo_init(priv);
-	} else {
-		mvpp22_rx_fifo_init(priv);
-		mvpp22_tx_fifo_init(priv);
-	}
-
-	if (priv->hw_version == MVPP21)
-		writel(MVPP2_EXT_GLOBAL_CTRL_DEFAULT,
-		       priv->lms_base + MVPP2_MNG_EXTENDED_GLOBAL_CTRL_REG);
-
-	/* Allow cache snoop when transmiting packets */
-	mvpp2_write(priv, MVPP2_TX_SNOOP_REG, 0x1);
-
-	/* Buffer Manager initialization */
-	err = mvpp2_bm_init(pdev, priv);
-	if (err < 0)
-		return err;
-
-	/* Parser default initialization */
-	err = mvpp2_prs_default_init(pdev, priv);
-	if (err < 0)
-		return err;
-
-	/* Classifier default initialization */
-	mvpp2_cls_init(priv);
-
-	return 0;
-}
-
-static int mvpp2_probe(struct platform_device *pdev)
-{
-	const struct acpi_device_id *acpi_id;
-	struct fwnode_handle *fwnode = pdev->dev.fwnode;
-	struct fwnode_handle *port_fwnode;
-	struct mvpp2 *priv;
-	struct resource *res;
-	void __iomem *base;
-	int i;
-	int err;
-
-	priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
-	if (!priv)
-		return -ENOMEM;
-
-	if (has_acpi_companion(&pdev->dev)) {
-		acpi_id = acpi_match_device(pdev->dev.driver->acpi_match_table,
-					    &pdev->dev);
-		priv->hw_version = (unsigned long)acpi_id->driver_data;
-	} else {
-		priv->hw_version =
-			(unsigned long)of_device_get_match_data(&pdev->dev);
-	}
-
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	base = devm_ioremap_resource(&pdev->dev, res);
-	if (IS_ERR(base))
-		return PTR_ERR(base);
-
-	if (priv->hw_version == MVPP21) {
-		res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
-		priv->lms_base = devm_ioremap_resource(&pdev->dev, res);
-		if (IS_ERR(priv->lms_base))
-			return PTR_ERR(priv->lms_base);
-	} else {
-		res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
-		if (has_acpi_companion(&pdev->dev)) {
-			/* In case the MDIO memory region is declared in
-			 * the ACPI, it can already appear as 'in-use'
-			 * in the OS. Because it is overlapped by second
-			 * region of the network controller, make
-			 * sure it is released, before requesting it again.
-			 * The care is taken by mvpp2 driver to avoid
-			 * concurrent access to this memory region.
-			 */
-			release_resource(res);
-		}
-		priv->iface_base = devm_ioremap_resource(&pdev->dev, res);
-		if (IS_ERR(priv->iface_base))
-			return PTR_ERR(priv->iface_base);
-	}
-
-	if (priv->hw_version == MVPP22 && dev_of_node(&pdev->dev)) {
-		priv->sysctrl_base =
-			syscon_regmap_lookup_by_phandle(pdev->dev.of_node,
-							"marvell,system-controller");
-		if (IS_ERR(priv->sysctrl_base))
-			/* The system controller regmap is optional for dt
-			 * compatibility reasons. When not provided, the
-			 * configuration of the GoP relies on the
-			 * firmware/bootloader.
-			 */
-			priv->sysctrl_base = NULL;
-	}
-
-	mvpp2_setup_bm_pool();
-
-	for (i = 0; i < MVPP2_MAX_THREADS; i++) {
-		u32 addr_space_sz;
-
-		addr_space_sz = (priv->hw_version == MVPP21 ?
-				 MVPP21_ADDR_SPACE_SZ : MVPP22_ADDR_SPACE_SZ);
-		priv->swth_base[i] = base + i * addr_space_sz;
-	}
-
-	if (priv->hw_version == MVPP21)
-		priv->max_port_rxqs = 8;
-	else
-		priv->max_port_rxqs = 32;
-
-	if (dev_of_node(&pdev->dev)) {
-		priv->pp_clk = devm_clk_get(&pdev->dev, "pp_clk");
-		if (IS_ERR(priv->pp_clk))
-			return PTR_ERR(priv->pp_clk);
-		err = clk_prepare_enable(priv->pp_clk);
-		if (err < 0)
-			return err;
-
-		priv->gop_clk = devm_clk_get(&pdev->dev, "gop_clk");
-		if (IS_ERR(priv->gop_clk)) {
-			err = PTR_ERR(priv->gop_clk);
-			goto err_pp_clk;
-		}
-		err = clk_prepare_enable(priv->gop_clk);
-		if (err < 0)
-			goto err_pp_clk;
-
-		if (priv->hw_version == MVPP22) {
-			priv->mg_clk = devm_clk_get(&pdev->dev, "mg_clk");
-			if (IS_ERR(priv->mg_clk)) {
-				err = PTR_ERR(priv->mg_clk);
-				goto err_gop_clk;
-			}
-
-			err = clk_prepare_enable(priv->mg_clk);
-			if (err < 0)
-				goto err_gop_clk;
-
-			priv->mg_core_clk = devm_clk_get(&pdev->dev, "mg_core_clk");
-			if (IS_ERR(priv->mg_core_clk)) {
-				priv->mg_core_clk = NULL;
-			} else {
-				err = clk_prepare_enable(priv->mg_core_clk);
-				if (err < 0)
-					goto err_mg_clk;
-			}
-		}
-
-		priv->axi_clk = devm_clk_get(&pdev->dev, "axi_clk");
-		if (IS_ERR(priv->axi_clk)) {
-			err = PTR_ERR(priv->axi_clk);
-			if (err == -EPROBE_DEFER)
-				goto err_mg_core_clk;
-			priv->axi_clk = NULL;
-		} else {
-			err = clk_prepare_enable(priv->axi_clk);
-			if (err < 0)
-				goto err_mg_core_clk;
-		}
-
-		/* Get system's tclk rate */
-		priv->tclk = clk_get_rate(priv->pp_clk);
-	} else if (device_property_read_u32(&pdev->dev, "clock-frequency",
-					    &priv->tclk)) {
-		dev_err(&pdev->dev, "missing clock-frequency value\n");
-		return -EINVAL;
-	}
-
-	if (priv->hw_version == MVPP22) {
-		err = dma_set_mask(&pdev->dev, MVPP2_DESC_DMA_MASK);
-		if (err)
-			goto err_axi_clk;
-		/* Sadly, the BM pools all share the same register to
-		 * store the high 32 bits of their address. So they
-		 * must all have the same high 32 bits, which forces
-		 * us to restrict coherent memory to DMA_BIT_MASK(32).
-		 */
-		err = dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32));
-		if (err)
-			goto err_axi_clk;
-	}
-
-	/* Initialize network controller */
-	err = mvpp2_init(pdev, priv);
-	if (err < 0) {
-		dev_err(&pdev->dev, "failed to initialize controller\n");
-		goto err_axi_clk;
-	}
-
-	/* Initialize ports */
-	fwnode_for_each_available_child_node(fwnode, port_fwnode) {
-		err = mvpp2_port_probe(pdev, port_fwnode, priv);
-		if (err < 0)
-			goto err_port_probe;
-	}
-
-	if (priv->port_count == 0) {
-		dev_err(&pdev->dev, "no ports enabled\n");
-		err = -ENODEV;
-		goto err_axi_clk;
-	}
-
-	/* Statistics must be gathered regularly because some of them (like
-	 * packets counters) are 32-bit registers and could overflow quite
-	 * quickly. For instance, a 10Gb link used at full bandwidth with the
-	 * smallest packets (64B) will overflow a 32-bit counter in less than
-	 * 30 seconds. Then, use a workqueue to fill 64-bit counters.
-	 */
-	snprintf(priv->queue_name, sizeof(priv->queue_name),
-		 "stats-wq-%s%s", netdev_name(priv->port_list[0]->dev),
-		 priv->port_count > 1 ? "+" : "");
-	priv->stats_queue = create_singlethread_workqueue(priv->queue_name);
-	if (!priv->stats_queue) {
-		err = -ENOMEM;
-		goto err_port_probe;
-	}
-
-	platform_set_drvdata(pdev, priv);
-	return 0;
-
-err_port_probe:
-	i = 0;
-	fwnode_for_each_available_child_node(fwnode, port_fwnode) {
-		if (priv->port_list[i])
-			mvpp2_port_remove(priv->port_list[i]);
-		i++;
-	}
-err_axi_clk:
-	clk_disable_unprepare(priv->axi_clk);
-
-err_mg_core_clk:
-	if (priv->hw_version == MVPP22)
-		clk_disable_unprepare(priv->mg_core_clk);
-err_mg_clk:
-	if (priv->hw_version == MVPP22)
-		clk_disable_unprepare(priv->mg_clk);
-err_gop_clk:
-	clk_disable_unprepare(priv->gop_clk);
-err_pp_clk:
-	clk_disable_unprepare(priv->pp_clk);
-	return err;
-}
-
-static int mvpp2_remove(struct platform_device *pdev)
-{
-	struct mvpp2 *priv = platform_get_drvdata(pdev);
-	struct fwnode_handle *fwnode = pdev->dev.fwnode;
-	struct fwnode_handle *port_fwnode;
-	int i = 0;
-
-	flush_workqueue(priv->stats_queue);
-	destroy_workqueue(priv->stats_queue);
-
-	fwnode_for_each_available_child_node(fwnode, port_fwnode) {
-		if (priv->port_list[i]) {
-			mutex_destroy(&priv->port_list[i]->gather_stats_lock);
-			mvpp2_port_remove(priv->port_list[i]);
-		}
-		i++;
-	}
-
-	for (i = 0; i < MVPP2_BM_POOLS_NUM; i++) {
-		struct mvpp2_bm_pool *bm_pool = &priv->bm_pools[i];
-
-		mvpp2_bm_pool_destroy(pdev, priv, bm_pool);
-	}
-
-	for_each_present_cpu(i) {
-		struct mvpp2_tx_queue *aggr_txq = &priv->aggr_txqs[i];
-
-		dma_free_coherent(&pdev->dev,
-				  MVPP2_AGGR_TXQ_SIZE * MVPP2_DESC_ALIGNED_SIZE,
-				  aggr_txq->descs,
-				  aggr_txq->descs_dma);
-	}
-
-	if (is_acpi_node(port_fwnode))
-		return 0;
-
-	clk_disable_unprepare(priv->axi_clk);
-	clk_disable_unprepare(priv->mg_core_clk);
-	clk_disable_unprepare(priv->mg_clk);
-	clk_disable_unprepare(priv->pp_clk);
-	clk_disable_unprepare(priv->gop_clk);
-
-	return 0;
-}
-
-static const struct of_device_id mvpp2_match[] = {
-	{
-		.compatible = "marvell,armada-375-pp2",
-		.data = (void *)MVPP21,
-	},
-	{
-		.compatible = "marvell,armada-7k-pp22",
-		.data = (void *)MVPP22,
-	},
-	{ }
-};
-MODULE_DEVICE_TABLE(of, mvpp2_match);
-
-static const struct acpi_device_id mvpp2_acpi_match[] = {
-	{ "MRVL0110", MVPP22 },
-	{ },
-};
-MODULE_DEVICE_TABLE(acpi, mvpp2_acpi_match);
-
-static struct platform_driver mvpp2_driver = {
-	.probe = mvpp2_probe,
-	.remove = mvpp2_remove,
-	.driver = {
-		.name = MVPP2_DRIVER_NAME,
-		.of_match_table = mvpp2_match,
-		.acpi_match_table = ACPI_PTR(mvpp2_acpi_match),
-	},
-};
-
-module_platform_driver(mvpp2_driver);
-
-MODULE_DESCRIPTION("Marvell PPv2 Ethernet Driver - www.marvell.com");
-MODULE_AUTHOR("Marcin Wojtas <mw@semihalf.com>");
-MODULE_LICENSE("GPL v2");
diff --git a/drivers/net/ethernet/marvell/mvpp2/Makefile b/drivers/net/ethernet/marvell/mvpp2/Makefile
new file mode 100644
index 000000000000..4d11dd9e3246
--- /dev/null
+++ b/drivers/net/ethernet/marvell/mvpp2/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the Marvell PPv2 driver.
+#
+obj-$(CONFIG_MVPP2) := mvpp2.o
+
+mvpp2-objs := mvpp2_main.o mvpp2_prs.o mvpp2_cls.o
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
new file mode 100644
index 000000000000..def00dc3eb4e
--- /dev/null
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
@@ -0,0 +1,1046 @@
+/*
+ * Definitions for Marvell PPv2 network controller for Armada 375 SoC.
+ *
+ * Copyright (C) 2014 Marvell
+ *
+ * Marcin Wojtas <mw@semihalf.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+#ifndef _MVPP2_H_
+#define _MVPP2_H_
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/phy.h>
+#include <linux/phylink.h>
+
+/* Fifo Registers */
+#define MVPP2_RX_DATA_FIFO_SIZE_REG(port)	(0x00 + 4 * (port))
+#define MVPP2_RX_ATTR_FIFO_SIZE_REG(port)	(0x20 + 4 * (port))
+#define MVPP2_RX_MIN_PKT_SIZE_REG		0x60
+#define MVPP2_RX_FIFO_INIT_REG			0x64
+#define MVPP22_TX_FIFO_THRESH_REG(port)		(0x8840 + 4 * (port))
+#define MVPP22_TX_FIFO_SIZE_REG(port)		(0x8860 + 4 * (port))
+
+/* RX DMA Top Registers */
+#define MVPP2_RX_CTRL_REG(port)			(0x140 + 4 * (port))
+#define     MVPP2_RX_LOW_LATENCY_PKT_SIZE(s)	(((s) & 0xfff) << 16)
+#define     MVPP2_RX_USE_PSEUDO_FOR_CSUM_MASK	BIT(31)
+#define MVPP2_POOL_BUF_SIZE_REG(pool)		(0x180 + 4 * (pool))
+#define     MVPP2_POOL_BUF_SIZE_OFFSET		5
+#define MVPP2_RXQ_CONFIG_REG(rxq)		(0x800 + 4 * (rxq))
+#define     MVPP2_SNOOP_PKT_SIZE_MASK		0x1ff
+#define     MVPP2_SNOOP_BUF_HDR_MASK		BIT(9)
+#define     MVPP2_RXQ_POOL_SHORT_OFFS		20
+#define     MVPP21_RXQ_POOL_SHORT_MASK		0x700000
+#define     MVPP22_RXQ_POOL_SHORT_MASK		0xf00000
+#define     MVPP2_RXQ_POOL_LONG_OFFS		24
+#define     MVPP21_RXQ_POOL_LONG_MASK		0x7000000
+#define     MVPP22_RXQ_POOL_LONG_MASK		0xf000000
+#define     MVPP2_RXQ_PACKET_OFFSET_OFFS	28
+#define     MVPP2_RXQ_PACKET_OFFSET_MASK	0x70000000
+#define     MVPP2_RXQ_DISABLE_MASK		BIT(31)
+
+/* Top Registers */
+#define MVPP2_MH_REG(port)			(0x5040 + 4 * (port))
+#define MVPP2_DSA_EXTENDED			BIT(5)
+
+/* Parser Registers */
+#define MVPP2_PRS_INIT_LOOKUP_REG		0x1000
+#define     MVPP2_PRS_PORT_LU_MAX		0xf
+#define     MVPP2_PRS_PORT_LU_MASK(port)	(0xff << ((port) * 4))
+#define     MVPP2_PRS_PORT_LU_VAL(port, val)	((val) << ((port) * 4))
+#define MVPP2_PRS_INIT_OFFS_REG(port)		(0x1004 + ((port) & 4))
+#define     MVPP2_PRS_INIT_OFF_MASK(port)	(0x3f << (((port) % 4) * 8))
+#define     MVPP2_PRS_INIT_OFF_VAL(port, val)	((val) << (((port) % 4) * 8))
+#define MVPP2_PRS_MAX_LOOP_REG(port)		(0x100c + ((port) & 4))
+#define     MVPP2_PRS_MAX_LOOP_MASK(port)	(0xff << (((port) % 4) * 8))
+#define     MVPP2_PRS_MAX_LOOP_VAL(port, val)	((val) << (((port) % 4) * 8))
+#define MVPP2_PRS_TCAM_IDX_REG			0x1100
+#define MVPP2_PRS_TCAM_DATA_REG(idx)		(0x1104 + (idx) * 4)
+#define     MVPP2_PRS_TCAM_INV_MASK		BIT(31)
+#define MVPP2_PRS_SRAM_IDX_REG			0x1200
+#define MVPP2_PRS_SRAM_DATA_REG(idx)		(0x1204 + (idx) * 4)
+#define MVPP2_PRS_TCAM_CTRL_REG			0x1230
+#define     MVPP2_PRS_TCAM_EN_MASK		BIT(0)
+
+/* RSS Registers */
+#define MVPP22_RSS_INDEX			0x1500
+#define     MVPP22_RSS_INDEX_TABLE_ENTRY(idx)	(idx)
+#define     MVPP22_RSS_INDEX_TABLE(idx)		((idx) << 8)
+#define     MVPP22_RSS_INDEX_QUEUE(idx)		((idx) << 16)
+#define MVPP22_RSS_TABLE_ENTRY			0x1508
+#define MVPP22_RSS_TABLE			0x1510
+#define     MVPP22_RSS_TABLE_POINTER(p)		(p)
+#define MVPP22_RSS_WIDTH			0x150c
+
+/* Classifier Registers */
+#define MVPP2_CLS_MODE_REG			0x1800
+#define     MVPP2_CLS_MODE_ACTIVE_MASK		BIT(0)
+#define MVPP2_CLS_PORT_WAY_REG			0x1810
+#define     MVPP2_CLS_PORT_WAY_MASK(port)	(1 << (port))
+#define MVPP2_CLS_LKP_INDEX_REG			0x1814
+#define     MVPP2_CLS_LKP_INDEX_WAY_OFFS	6
+#define MVPP2_CLS_LKP_TBL_REG			0x1818
+#define     MVPP2_CLS_LKP_TBL_RXQ_MASK		0xff
+#define     MVPP2_CLS_LKP_TBL_LOOKUP_EN_MASK	BIT(25)
+#define MVPP2_CLS_FLOW_INDEX_REG		0x1820
+#define MVPP2_CLS_FLOW_TBL0_REG			0x1824
+#define MVPP2_CLS_FLOW_TBL1_REG			0x1828
+#define MVPP2_CLS_FLOW_TBL2_REG			0x182c
+#define MVPP2_CLS_OVERSIZE_RXQ_LOW_REG(port)	(0x1980 + ((port) * 4))
+#define     MVPP2_CLS_OVERSIZE_RXQ_LOW_BITS	3
+#define     MVPP2_CLS_OVERSIZE_RXQ_LOW_MASK	0x7
+#define MVPP2_CLS_SWFWD_P2HQ_REG(port)		(0x19b0 + ((port) * 4))
+#define MVPP2_CLS_SWFWD_PCTRL_REG		0x19d0
+#define     MVPP2_CLS_SWFWD_PCTRL_MASK(port)	(1 << (port))
+
+/* Descriptor Manager Top Registers */
+#define MVPP2_RXQ_NUM_REG			0x2040
+#define MVPP2_RXQ_DESC_ADDR_REG			0x2044
+#define     MVPP22_DESC_ADDR_OFFS		8
+#define MVPP2_RXQ_DESC_SIZE_REG			0x2048
+#define     MVPP2_RXQ_DESC_SIZE_MASK		0x3ff0
+#define MVPP2_RXQ_STATUS_UPDATE_REG(rxq)	(0x3000 + 4 * (rxq))
+#define     MVPP2_RXQ_NUM_PROCESSED_OFFSET	0
+#define     MVPP2_RXQ_NUM_NEW_OFFSET		16
+#define MVPP2_RXQ_STATUS_REG(rxq)		(0x3400 + 4 * (rxq))
+#define     MVPP2_RXQ_OCCUPIED_MASK		0x3fff
+#define     MVPP2_RXQ_NON_OCCUPIED_OFFSET	16
+#define     MVPP2_RXQ_NON_OCCUPIED_MASK		0x3fff0000
+#define MVPP2_RXQ_THRESH_REG			0x204c
+#define     MVPP2_OCCUPIED_THRESH_OFFSET	0
+#define     MVPP2_OCCUPIED_THRESH_MASK		0x3fff
+#define MVPP2_RXQ_INDEX_REG			0x2050
+#define MVPP2_TXQ_NUM_REG			0x2080
+#define MVPP2_TXQ_DESC_ADDR_REG			0x2084
+#define MVPP2_TXQ_DESC_SIZE_REG			0x2088
+#define     MVPP2_TXQ_DESC_SIZE_MASK		0x3ff0
+#define MVPP2_TXQ_THRESH_REG			0x2094
+#define	    MVPP2_TXQ_THRESH_OFFSET		16
+#define	    MVPP2_TXQ_THRESH_MASK		0x3fff
+#define MVPP2_AGGR_TXQ_UPDATE_REG		0x2090
+#define MVPP2_TXQ_INDEX_REG			0x2098
+#define MVPP2_TXQ_PREF_BUF_REG			0x209c
+#define     MVPP2_PREF_BUF_PTR(desc)		((desc) & 0xfff)
+#define     MVPP2_PREF_BUF_SIZE_4		(BIT(12) | BIT(13))
+#define     MVPP2_PREF_BUF_SIZE_16		(BIT(12) | BIT(14))
+#define     MVPP2_PREF_BUF_THRESH(val)		((val) << 17)
+#define     MVPP2_TXQ_DRAIN_EN_MASK		BIT(31)
+#define MVPP2_TXQ_PENDING_REG			0x20a0
+#define     MVPP2_TXQ_PENDING_MASK		0x3fff
+#define MVPP2_TXQ_INT_STATUS_REG		0x20a4
+#define MVPP2_TXQ_SENT_REG(txq)			(0x3c00 + 4 * (txq))
+#define     MVPP2_TRANSMITTED_COUNT_OFFSET	16
+#define     MVPP2_TRANSMITTED_COUNT_MASK	0x3fff0000
+#define MVPP2_TXQ_RSVD_REQ_REG			0x20b0
+#define     MVPP2_TXQ_RSVD_REQ_Q_OFFSET		16
+#define MVPP2_TXQ_RSVD_RSLT_REG			0x20b4
+#define     MVPP2_TXQ_RSVD_RSLT_MASK		0x3fff
+#define MVPP2_TXQ_RSVD_CLR_REG			0x20b8
+#define     MVPP2_TXQ_RSVD_CLR_OFFSET		16
+#define MVPP2_AGGR_TXQ_DESC_ADDR_REG(cpu)	(0x2100 + 4 * (cpu))
+#define     MVPP22_AGGR_TXQ_DESC_ADDR_OFFS	8
+#define MVPP2_AGGR_TXQ_DESC_SIZE_REG(cpu)	(0x2140 + 4 * (cpu))
+#define     MVPP2_AGGR_TXQ_DESC_SIZE_MASK	0x3ff0
+#define MVPP2_AGGR_TXQ_STATUS_REG(cpu)		(0x2180 + 4 * (cpu))
+#define     MVPP2_AGGR_TXQ_PENDING_MASK		0x3fff
+#define MVPP2_AGGR_TXQ_INDEX_REG(cpu)		(0x21c0 + 4 * (cpu))
+
+/* MBUS bridge registers */
+#define MVPP2_WIN_BASE(w)			(0x4000 + ((w) << 2))
+#define MVPP2_WIN_SIZE(w)			(0x4020 + ((w) << 2))
+#define MVPP2_WIN_REMAP(w)			(0x4040 + ((w) << 2))
+#define MVPP2_BASE_ADDR_ENABLE			0x4060
+
+/* AXI Bridge Registers */
+#define MVPP22_AXI_BM_WR_ATTR_REG		0x4100
+#define MVPP22_AXI_BM_RD_ATTR_REG		0x4104
+#define MVPP22_AXI_AGGRQ_DESCR_RD_ATTR_REG	0x4110
+#define MVPP22_AXI_TXQ_DESCR_WR_ATTR_REG	0x4114
+#define MVPP22_AXI_TXQ_DESCR_RD_ATTR_REG	0x4118
+#define MVPP22_AXI_RXQ_DESCR_WR_ATTR_REG	0x411c
+#define MVPP22_AXI_RX_DATA_WR_ATTR_REG		0x4120
+#define MVPP22_AXI_TX_DATA_RD_ATTR_REG		0x4130
+#define MVPP22_AXI_RD_NORMAL_CODE_REG		0x4150
+#define MVPP22_AXI_RD_SNOOP_CODE_REG		0x4154
+#define MVPP22_AXI_WR_NORMAL_CODE_REG		0x4160
+#define MVPP22_AXI_WR_SNOOP_CODE_REG		0x4164
+
+/* Values for AXI Bridge registers */
+#define MVPP22_AXI_ATTR_CACHE_OFFS		0
+#define MVPP22_AXI_ATTR_DOMAIN_OFFS		12
+
+#define MVPP22_AXI_CODE_CACHE_OFFS		0
+#define MVPP22_AXI_CODE_DOMAIN_OFFS		4
+
+#define MVPP22_AXI_CODE_CACHE_NON_CACHE		0x3
+#define MVPP22_AXI_CODE_CACHE_WR_CACHE		0x7
+#define MVPP22_AXI_CODE_CACHE_RD_CACHE		0xb
+
+#define MVPP22_AXI_CODE_DOMAIN_OUTER_DOM	2
+#define MVPP22_AXI_CODE_DOMAIN_SYSTEM		3
+
+/* Interrupt Cause and Mask registers */
+#define MVPP2_ISR_TX_THRESHOLD_REG(port)	(0x5140 + 4 * (port))
+#define     MVPP2_MAX_ISR_TX_THRESHOLD		0xfffff0
+
+#define MVPP2_ISR_RX_THRESHOLD_REG(rxq)		(0x5200 + 4 * (rxq))
+#define     MVPP2_MAX_ISR_RX_THRESHOLD		0xfffff0
+#define MVPP21_ISR_RXQ_GROUP_REG(port)		(0x5400 + 4 * (port))
+
+#define MVPP22_ISR_RXQ_GROUP_INDEX_REG		0x5400
+#define MVPP22_ISR_RXQ_GROUP_INDEX_SUBGROUP_MASK 0xf
+#define MVPP22_ISR_RXQ_GROUP_INDEX_GROUP_MASK	0x380
+#define MVPP22_ISR_RXQ_GROUP_INDEX_GROUP_OFFSET	7
+
+#define MVPP22_ISR_RXQ_GROUP_INDEX_SUBGROUP_MASK 0xf
+#define MVPP22_ISR_RXQ_GROUP_INDEX_GROUP_MASK	0x380
+
+#define MVPP22_ISR_RXQ_SUB_GROUP_CONFIG_REG	0x5404
+#define MVPP22_ISR_RXQ_SUB_GROUP_STARTQ_MASK	0x1f
+#define MVPP22_ISR_RXQ_SUB_GROUP_SIZE_MASK	0xf00
+#define MVPP22_ISR_RXQ_SUB_GROUP_SIZE_OFFSET	8
+
+#define MVPP2_ISR_ENABLE_REG(port)		(0x5420 + 4 * (port))
+#define     MVPP2_ISR_ENABLE_INTERRUPT(mask)	((mask) & 0xffff)
+#define     MVPP2_ISR_DISABLE_INTERRUPT(mask)	(((mask) << 16) & 0xffff0000)
+#define MVPP2_ISR_RX_TX_CAUSE_REG(port)		(0x5480 + 4 * (port))
+#define     MVPP2_CAUSE_RXQ_OCCUP_DESC_ALL_MASK	0xffff
+#define     MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_MASK	0xff0000
+#define     MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_OFFSET	16
+#define     MVPP2_CAUSE_RX_FIFO_OVERRUN_MASK	BIT(24)
+#define     MVPP2_CAUSE_FCS_ERR_MASK		BIT(25)
+#define     MVPP2_CAUSE_TX_FIFO_UNDERRUN_MASK	BIT(26)
+#define     MVPP2_CAUSE_TX_EXCEPTION_SUM_MASK	BIT(29)
+#define     MVPP2_CAUSE_RX_EXCEPTION_SUM_MASK	BIT(30)
+#define     MVPP2_CAUSE_MISC_SUM_MASK		BIT(31)
+#define MVPP2_ISR_RX_TX_MASK_REG(port)		(0x54a0 + 4 * (port))
+#define MVPP2_ISR_PON_RX_TX_MASK_REG		0x54bc
+#define     MVPP2_PON_CAUSE_RXQ_OCCUP_DESC_ALL_MASK	0xffff
+#define     MVPP2_PON_CAUSE_TXP_OCCUP_DESC_ALL_MASK	0x3fc00000
+#define     MVPP2_PON_CAUSE_MISC_SUM_MASK		BIT(31)
+#define MVPP2_ISR_MISC_CAUSE_REG		0x55b0
+
+/* Buffer Manager registers */
+#define MVPP2_BM_POOL_BASE_REG(pool)		(0x6000 + ((pool) * 4))
+#define     MVPP2_BM_POOL_BASE_ADDR_MASK	0xfffff80
+#define MVPP2_BM_POOL_SIZE_REG(pool)		(0x6040 + ((pool) * 4))
+#define     MVPP2_BM_POOL_SIZE_MASK		0xfff0
+#define MVPP2_BM_POOL_READ_PTR_REG(pool)	(0x6080 + ((pool) * 4))
+#define     MVPP2_BM_POOL_GET_READ_PTR_MASK	0xfff0
+#define MVPP2_BM_POOL_PTRS_NUM_REG(pool)	(0x60c0 + ((pool) * 4))
+#define     MVPP2_BM_POOL_PTRS_NUM_MASK		0xfff0
+#define MVPP2_BM_BPPI_READ_PTR_REG(pool)	(0x6100 + ((pool) * 4))
+#define MVPP2_BM_BPPI_PTRS_NUM_REG(pool)	(0x6140 + ((pool) * 4))
+#define     MVPP2_BM_BPPI_PTR_NUM_MASK		0x7ff
+#define MVPP22_BM_POOL_PTRS_NUM_MASK		0xfff8
+#define     MVPP2_BM_BPPI_PREFETCH_FULL_MASK	BIT(16)
+#define MVPP2_BM_POOL_CTRL_REG(pool)		(0x6200 + ((pool) * 4))
+#define     MVPP2_BM_START_MASK			BIT(0)
+#define     MVPP2_BM_STOP_MASK			BIT(1)
+#define     MVPP2_BM_STATE_MASK			BIT(4)
+#define     MVPP2_BM_LOW_THRESH_OFFS		8
+#define     MVPP2_BM_LOW_THRESH_MASK		0x7f00
+#define     MVPP2_BM_LOW_THRESH_VALUE(val)	((val) << \
+						MVPP2_BM_LOW_THRESH_OFFS)
+#define     MVPP2_BM_HIGH_THRESH_OFFS		16
+#define     MVPP2_BM_HIGH_THRESH_MASK		0x7f0000
+#define     MVPP2_BM_HIGH_THRESH_VALUE(val)	((val) << \
+						MVPP2_BM_HIGH_THRESH_OFFS)
+#define MVPP2_BM_INTR_CAUSE_REG(pool)		(0x6240 + ((pool) * 4))
+#define     MVPP2_BM_RELEASED_DELAY_MASK	BIT(0)
+#define     MVPP2_BM_ALLOC_FAILED_MASK		BIT(1)
+#define     MVPP2_BM_BPPE_EMPTY_MASK		BIT(2)
+#define     MVPP2_BM_BPPE_FULL_MASK		BIT(3)
+#define     MVPP2_BM_AVAILABLE_BP_LOW_MASK	BIT(4)
+#define MVPP2_BM_INTR_MASK_REG(pool)		(0x6280 + ((pool) * 4))
+#define MVPP2_BM_PHY_ALLOC_REG(pool)		(0x6400 + ((pool) * 4))
+#define     MVPP2_BM_PHY_ALLOC_GRNTD_MASK	BIT(0)
+#define MVPP2_BM_VIRT_ALLOC_REG			0x6440
+#define MVPP22_BM_ADDR_HIGH_ALLOC		0x6444
+#define     MVPP22_BM_ADDR_HIGH_PHYS_MASK	0xff
+#define     MVPP22_BM_ADDR_HIGH_VIRT_MASK	0xff00
+#define     MVPP22_BM_ADDR_HIGH_VIRT_SHIFT	8
+#define MVPP2_BM_PHY_RLS_REG(pool)		(0x6480 + ((pool) * 4))
+#define     MVPP2_BM_PHY_RLS_MC_BUFF_MASK	BIT(0)
+#define     MVPP2_BM_PHY_RLS_PRIO_EN_MASK	BIT(1)
+#define     MVPP2_BM_PHY_RLS_GRNTD_MASK		BIT(2)
+#define MVPP2_BM_VIRT_RLS_REG			0x64c0
+#define MVPP22_BM_ADDR_HIGH_RLS_REG		0x64c4
+#define     MVPP22_BM_ADDR_HIGH_PHYS_RLS_MASK	0xff
+#define     MVPP22_BM_ADDR_HIGH_VIRT_RLS_MASK	0xff00
+#define     MVPP22_BM_ADDR_HIGH_VIRT_RLS_SHIFT	8
+
+/* TX Scheduler registers */
+#define MVPP2_TXP_SCHED_PORT_INDEX_REG		0x8000
+#define MVPP2_TXP_SCHED_Q_CMD_REG		0x8004
+#define     MVPP2_TXP_SCHED_ENQ_MASK		0xff
+#define     MVPP2_TXP_SCHED_DISQ_OFFSET		8
+#define MVPP2_TXP_SCHED_CMD_1_REG		0x8010
+#define MVPP2_TXP_SCHED_PERIOD_REG		0x8018
+#define MVPP2_TXP_SCHED_MTU_REG			0x801c
+#define     MVPP2_TXP_MTU_MAX			0x7FFFF
+#define MVPP2_TXP_SCHED_REFILL_REG		0x8020
+#define     MVPP2_TXP_REFILL_TOKENS_ALL_MASK	0x7ffff
+#define     MVPP2_TXP_REFILL_PERIOD_ALL_MASK	0x3ff00000
+#define     MVPP2_TXP_REFILL_PERIOD_MASK(v)	((v) << 20)
+#define MVPP2_TXP_SCHED_TOKEN_SIZE_REG		0x8024
+#define     MVPP2_TXP_TOKEN_SIZE_MAX		0xffffffff
+#define MVPP2_TXQ_SCHED_REFILL_REG(q)		(0x8040 + ((q) << 2))
+#define     MVPP2_TXQ_REFILL_TOKENS_ALL_MASK	0x7ffff
+#define     MVPP2_TXQ_REFILL_PERIOD_ALL_MASK	0x3ff00000
+#define     MVPP2_TXQ_REFILL_PERIOD_MASK(v)	((v) << 20)
+#define MVPP2_TXQ_SCHED_TOKEN_SIZE_REG(q)	(0x8060 + ((q) << 2))
+#define     MVPP2_TXQ_TOKEN_SIZE_MAX		0x7fffffff
+#define MVPP2_TXQ_SCHED_TOKEN_CNTR_REG(q)	(0x8080 + ((q) << 2))
+#define     MVPP2_TXQ_TOKEN_CNTR_MAX		0xffffffff
+
+/* TX general registers */
+#define MVPP2_TX_SNOOP_REG			0x8800
+#define MVPP2_TX_PORT_FLUSH_REG			0x8810
+#define     MVPP2_TX_PORT_FLUSH_MASK(port)	(1 << (port))
+
+/* LMS registers */
+#define MVPP2_SRC_ADDR_MIDDLE			0x24
+#define MVPP2_SRC_ADDR_HIGH			0x28
+#define MVPP2_PHY_AN_CFG0_REG			0x34
+#define     MVPP2_PHY_AN_STOP_SMI0_MASK		BIT(7)
+#define MVPP2_MNG_EXTENDED_GLOBAL_CTRL_REG	0x305c
+#define     MVPP2_EXT_GLOBAL_CTRL_DEFAULT	0x27
+
+/* Per-port registers */
+#define MVPP2_GMAC_CTRL_0_REG			0x0
+#define     MVPP2_GMAC_PORT_EN_MASK		BIT(0)
+#define     MVPP2_GMAC_PORT_TYPE_MASK		BIT(1)
+#define     MVPP2_GMAC_MAX_RX_SIZE_OFFS		2
+#define     MVPP2_GMAC_MAX_RX_SIZE_MASK		0x7ffc
+#define     MVPP2_GMAC_MIB_CNTR_EN_MASK		BIT(15)
+#define MVPP2_GMAC_CTRL_1_REG			0x4
+#define     MVPP2_GMAC_PERIODIC_XON_EN_MASK	BIT(1)
+#define     MVPP2_GMAC_GMII_LB_EN_MASK		BIT(5)
+#define     MVPP2_GMAC_PCS_LB_EN_BIT		6
+#define     MVPP2_GMAC_PCS_LB_EN_MASK		BIT(6)
+#define     MVPP2_GMAC_SA_LOW_OFFS		7
+#define MVPP2_GMAC_CTRL_2_REG			0x8
+#define     MVPP2_GMAC_INBAND_AN_MASK		BIT(0)
+#define     MVPP2_GMAC_FLOW_CTRL_MASK		GENMASK(2, 1)
+#define     MVPP2_GMAC_PCS_ENABLE_MASK		BIT(3)
+#define     MVPP2_GMAC_INTERNAL_CLK_MASK	BIT(4)
+#define     MVPP2_GMAC_DISABLE_PADDING		BIT(5)
+#define     MVPP2_GMAC_PORT_RESET_MASK		BIT(6)
+#define MVPP2_GMAC_AUTONEG_CONFIG		0xc
+#define     MVPP2_GMAC_FORCE_LINK_DOWN		BIT(0)
+#define     MVPP2_GMAC_FORCE_LINK_PASS		BIT(1)
+#define     MVPP2_GMAC_IN_BAND_AUTONEG		BIT(2)
+#define     MVPP2_GMAC_IN_BAND_AUTONEG_BYPASS	BIT(3)
+#define     MVPP2_GMAC_IN_BAND_RESTART_AN	BIT(4)
+#define     MVPP2_GMAC_CONFIG_MII_SPEED	BIT(5)
+#define     MVPP2_GMAC_CONFIG_GMII_SPEED	BIT(6)
+#define     MVPP2_GMAC_AN_SPEED_EN		BIT(7)
+#define     MVPP2_GMAC_FC_ADV_EN		BIT(9)
+#define     MVPP2_GMAC_FC_ADV_ASM_EN		BIT(10)
+#define     MVPP2_GMAC_FLOW_CTRL_AUTONEG	BIT(11)
+#define     MVPP2_GMAC_CONFIG_FULL_DUPLEX	BIT(12)
+#define     MVPP2_GMAC_AN_DUPLEX_EN		BIT(13)
+#define MVPP2_GMAC_STATUS0			0x10
+#define     MVPP2_GMAC_STATUS0_LINK_UP		BIT(0)
+#define     MVPP2_GMAC_STATUS0_GMII_SPEED	BIT(1)
+#define     MVPP2_GMAC_STATUS0_MII_SPEED	BIT(2)
+#define     MVPP2_GMAC_STATUS0_FULL_DUPLEX	BIT(3)
+#define     MVPP2_GMAC_STATUS0_RX_PAUSE		BIT(6)
+#define     MVPP2_GMAC_STATUS0_TX_PAUSE		BIT(7)
+#define     MVPP2_GMAC_STATUS0_AN_COMPLETE	BIT(11)
+#define MVPP2_GMAC_PORT_FIFO_CFG_1_REG		0x1c
+#define     MVPP2_GMAC_TX_FIFO_MIN_TH_OFFS	6
+#define     MVPP2_GMAC_TX_FIFO_MIN_TH_ALL_MASK	0x1fc0
+#define     MVPP2_GMAC_TX_FIFO_MIN_TH_MASK(v)	(((v) << 6) & \
+					MVPP2_GMAC_TX_FIFO_MIN_TH_ALL_MASK)
+#define MVPP22_GMAC_INT_STAT			0x20
+#define     MVPP22_GMAC_INT_STAT_LINK		BIT(1)
+#define MVPP22_GMAC_INT_MASK			0x24
+#define     MVPP22_GMAC_INT_MASK_LINK_STAT	BIT(1)
+#define MVPP22_GMAC_CTRL_4_REG			0x90
+#define     MVPP22_CTRL4_EXT_PIN_GMII_SEL	BIT(0)
+#define     MVPP22_CTRL4_RX_FC_EN		BIT(3)
+#define     MVPP22_CTRL4_TX_FC_EN		BIT(4)
+#define     MVPP22_CTRL4_DP_CLK_SEL		BIT(5)
+#define     MVPP22_CTRL4_SYNC_BYPASS_DIS	BIT(6)
+#define     MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE	BIT(7)
+#define MVPP22_GMAC_INT_SUM_MASK		0xa4
+#define     MVPP22_GMAC_INT_SUM_MASK_LINK_STAT	BIT(1)
+
+/* Per-port XGMAC registers. PPv2.2 only, only for GOP port 0,
+ * relative to port->base.
+ */
+#define MVPP22_XLG_CTRL0_REG			0x100
+#define     MVPP22_XLG_CTRL0_PORT_EN		BIT(0)
+#define     MVPP22_XLG_CTRL0_MAC_RESET_DIS	BIT(1)
+#define     MVPP22_XLG_CTRL0_RX_FLOW_CTRL_EN	BIT(7)
+#define     MVPP22_XLG_CTRL0_TX_FLOW_CTRL_EN	BIT(8)
+#define     MVPP22_XLG_CTRL0_MIB_CNT_DIS	BIT(14)
+#define MVPP22_XLG_CTRL1_REG			0x104
+#define     MVPP22_XLG_CTRL1_FRAMESIZELIMIT_OFFS	0
+#define     MVPP22_XLG_CTRL1_FRAMESIZELIMIT_MASK	0x1fff
+#define MVPP22_XLG_STATUS			0x10c
+#define     MVPP22_XLG_STATUS_LINK_UP		BIT(0)
+#define MVPP22_XLG_INT_STAT			0x114
+#define     MVPP22_XLG_INT_STAT_LINK		BIT(1)
+#define MVPP22_XLG_INT_MASK			0x118
+#define     MVPP22_XLG_INT_MASK_LINK		BIT(1)
+#define MVPP22_XLG_CTRL3_REG			0x11c
+#define     MVPP22_XLG_CTRL3_MACMODESELECT_MASK	(7 << 13)
+#define     MVPP22_XLG_CTRL3_MACMODESELECT_GMAC	(0 << 13)
+#define     MVPP22_XLG_CTRL3_MACMODESELECT_10G	(1 << 13)
+#define MVPP22_XLG_EXT_INT_MASK			0x15c
+#define     MVPP22_XLG_EXT_INT_MASK_XLG		BIT(1)
+#define     MVPP22_XLG_EXT_INT_MASK_GIG		BIT(2)
+#define MVPP22_XLG_CTRL4_REG			0x184
+#define     MVPP22_XLG_CTRL4_FWD_FC		BIT(5)
+#define     MVPP22_XLG_CTRL4_FWD_PFC		BIT(6)
+#define     MVPP22_XLG_CTRL4_MACMODSELECT_GMAC	BIT(12)
+#define     MVPP22_XLG_CTRL4_EN_IDLE_CHECK	BIT(14)
+
+/* SMI registers. PPv2.2 only, relative to priv->iface_base. */
+#define MVPP22_SMI_MISC_CFG_REG			0x1204
+#define     MVPP22_SMI_POLLING_EN		BIT(10)
+
+#define MVPP22_GMAC_BASE(port)		(0x7000 + (port) * 0x1000 + 0xe00)
+
+#define MVPP2_CAUSE_TXQ_SENT_DESC_ALL_MASK	0xff
+
+/* Descriptor ring Macros */
+#define MVPP2_QUEUE_NEXT_DESC(q, index) \
+	(((index) < (q)->last_desc) ? ((index) + 1) : 0)
+
+/* XPCS registers. PPv2.2 only */
+#define MVPP22_MPCS_BASE(port)			(0x7000 + (port) * 0x1000)
+#define MVPP22_MPCS_CTRL			0x14
+#define     MVPP22_MPCS_CTRL_FWD_ERR_CONN	BIT(10)
+#define MVPP22_MPCS_CLK_RESET			0x14c
+#define     MAC_CLK_RESET_SD_TX			BIT(0)
+#define     MAC_CLK_RESET_SD_RX			BIT(1)
+#define     MAC_CLK_RESET_MAC			BIT(2)
+#define     MVPP22_MPCS_CLK_RESET_DIV_RATIO(n)	((n) << 4)
+#define     MVPP22_MPCS_CLK_RESET_DIV_SET	BIT(11)
+
+/* XPCS registers. PPv2.2 only */
+#define MVPP22_XPCS_BASE(port)			(0x7400 + (port) * 0x1000)
+#define MVPP22_XPCS_CFG0			0x0
+#define     MVPP22_XPCS_CFG0_PCS_MODE(n)	((n) << 3)
+#define     MVPP22_XPCS_CFG0_ACTIVE_LANE(n)	((n) << 5)
+
+/* System controller registers. Accessed through a regmap. */
+#define GENCONF_SOFT_RESET1				0x1108
+#define     GENCONF_SOFT_RESET1_GOP			BIT(6)
+#define GENCONF_PORT_CTRL0				0x1110
+#define     GENCONF_PORT_CTRL0_BUS_WIDTH_SELECT		BIT(1)
+#define     GENCONF_PORT_CTRL0_RX_DATA_SAMPLE		BIT(29)
+#define     GENCONF_PORT_CTRL0_CLK_DIV_PHASE_CLR	BIT(31)
+#define GENCONF_PORT_CTRL1				0x1114
+#define     GENCONF_PORT_CTRL1_EN(p)			BIT(p)
+#define     GENCONF_PORT_CTRL1_RESET(p)			(BIT(p) << 28)
+#define GENCONF_CTRL0					0x1120
+#define     GENCONF_CTRL0_PORT0_RGMII			BIT(0)
+#define     GENCONF_CTRL0_PORT1_RGMII_MII		BIT(1)
+#define     GENCONF_CTRL0_PORT1_RGMII			BIT(2)
+
+/* Various constants */
+
+/* Coalescing */
+#define MVPP2_TXDONE_COAL_PKTS_THRESH	64
+#define MVPP2_TXDONE_HRTIMER_PERIOD_NS	1000000UL
+#define MVPP2_TXDONE_COAL_USEC		1000
+#define MVPP2_RX_COAL_PKTS		32
+#define MVPP2_RX_COAL_USEC		64
+
+/* The two bytes Marvell header. Either contains a special value used
+ * by Marvell switches when a specific hardware mode is enabled (not
+ * supported by this driver) or is filled automatically by zeroes on
+ * the RX side. Those two bytes being at the front of the Ethernet
+ * header, they allow to have the IP header aligned on a 4 bytes
+ * boundary automatically: the hardware skips those two bytes on its
+ * own.
+ */
+#define MVPP2_MH_SIZE			2
+#define MVPP2_ETH_TYPE_LEN		2
+#define MVPP2_PPPOE_HDR_SIZE		8
+#define MVPP2_VLAN_TAG_LEN		4
+#define MVPP2_VLAN_TAG_EDSA_LEN		8
+
+/* Lbtd 802.3 type */
+#define MVPP2_IP_LBDT_TYPE		0xfffa
+
+#define MVPP2_TX_CSUM_MAX_SIZE		9800
+
+/* Timeout constants */
+#define MVPP2_TX_DISABLE_TIMEOUT_MSEC	1000
+#define MVPP2_TX_PENDING_TIMEOUT_MSEC	1000
+
+#define MVPP2_TX_MTU_MAX		0x7ffff
+
+/* Maximum number of T-CONTs of PON port */
+#define MVPP2_MAX_TCONT			16
+
+/* Maximum number of supported ports */
+#define MVPP2_MAX_PORTS			4
+
+/* Maximum number of TXQs used by single port */
+#define MVPP2_MAX_TXQ			8
+
+/* MVPP2_MAX_TSO_SEGS is the maximum number of fragments to allow in the GSO
+ * skb. As we need a maxium of two descriptors per fragments (1 header, 1 data),
+ * multiply this value by two to count the maximum number of skb descs needed.
+ */
+#define MVPP2_MAX_TSO_SEGS		300
+#define MVPP2_MAX_SKB_DESCS		(MVPP2_MAX_TSO_SEGS * 2 + MAX_SKB_FRAGS)
+
+/* Dfault number of RXQs in use */
+#define MVPP2_DEFAULT_RXQ		4
+
+/* Max number of Rx descriptors */
+#define MVPP2_MAX_RXD_MAX		1024
+#define MVPP2_MAX_RXD_DFLT		128
+
+/* Max number of Tx descriptors */
+#define MVPP2_MAX_TXD_MAX		2048
+#define MVPP2_MAX_TXD_DFLT		1024
+
+/* Amount of Tx descriptors that can be reserved at once by CPU */
+#define MVPP2_CPU_DESC_CHUNK		64
+
+/* Max number of Tx descriptors in each aggregated queue */
+#define MVPP2_AGGR_TXQ_SIZE		256
+
+/* Descriptor aligned size */
+#define MVPP2_DESC_ALIGNED_SIZE		32
+
+/* Descriptor alignment mask */
+#define MVPP2_TX_DESC_ALIGN		(MVPP2_DESC_ALIGNED_SIZE - 1)
+
+/* RX FIFO constants */
+#define MVPP2_RX_FIFO_PORT_DATA_SIZE_32KB	0x8000
+#define MVPP2_RX_FIFO_PORT_DATA_SIZE_8KB	0x2000
+#define MVPP2_RX_FIFO_PORT_DATA_SIZE_4KB	0x1000
+#define MVPP2_RX_FIFO_PORT_ATTR_SIZE_32KB	0x200
+#define MVPP2_RX_FIFO_PORT_ATTR_SIZE_8KB	0x80
+#define MVPP2_RX_FIFO_PORT_ATTR_SIZE_4KB	0x40
+#define MVPP2_RX_FIFO_PORT_MIN_PKT		0x80
+
+/* TX FIFO constants */
+#define MVPP22_TX_FIFO_DATA_SIZE_10KB		0xa
+#define MVPP22_TX_FIFO_DATA_SIZE_3KB		0x3
+#define MVPP2_TX_FIFO_THRESHOLD_MIN		256
+#define MVPP2_TX_FIFO_THRESHOLD_10KB	\
+	(MVPP22_TX_FIFO_DATA_SIZE_10KB * 1024 - MVPP2_TX_FIFO_THRESHOLD_MIN)
+#define MVPP2_TX_FIFO_THRESHOLD_3KB	\
+	(MVPP22_TX_FIFO_DATA_SIZE_3KB * 1024 - MVPP2_TX_FIFO_THRESHOLD_MIN)
+
+/* RX buffer constants */
+#define MVPP2_SKB_SHINFO_SIZE \
+	SKB_DATA_ALIGN(sizeof(struct skb_shared_info))
+
+#define MVPP2_RX_PKT_SIZE(mtu) \
+	ALIGN((mtu) + MVPP2_MH_SIZE + MVPP2_VLAN_TAG_LEN + \
+	      ETH_HLEN + ETH_FCS_LEN, cache_line_size())
+
+#define MVPP2_RX_BUF_SIZE(pkt_size)	((pkt_size) + NET_SKB_PAD)
+#define MVPP2_RX_TOTAL_SIZE(buf_size)	((buf_size) + MVPP2_SKB_SHINFO_SIZE)
+#define MVPP2_RX_MAX_PKT_SIZE(total_size) \
+	((total_size) - NET_SKB_PAD - MVPP2_SKB_SHINFO_SIZE)
+
+#define MVPP2_BIT_TO_BYTE(bit)		((bit) / 8)
+
+/* IPv6 max L3 address size */
+#define MVPP2_MAX_L3_ADDR_SIZE		16
+
+/* Port flags */
+#define MVPP2_F_LOOPBACK		BIT(0)
+
+/* Marvell tag types */
+enum mvpp2_tag_type {
+	MVPP2_TAG_TYPE_NONE = 0,
+	MVPP2_TAG_TYPE_MH   = 1,
+	MVPP2_TAG_TYPE_DSA  = 2,
+	MVPP2_TAG_TYPE_EDSA = 3,
+	MVPP2_TAG_TYPE_VLAN = 4,
+	MVPP2_TAG_TYPE_LAST = 5
+};
+
+/* L2 cast enum */
+enum mvpp2_prs_l2_cast {
+	MVPP2_PRS_L2_UNI_CAST,
+	MVPP2_PRS_L2_MULTI_CAST,
+};
+
+/* L3 cast enum */
+enum mvpp2_prs_l3_cast {
+	MVPP2_PRS_L3_UNI_CAST,
+	MVPP2_PRS_L3_MULTI_CAST,
+	MVPP2_PRS_L3_BROAD_CAST
+};
+
+/* BM constants */
+#define MVPP2_BM_JUMBO_BUF_NUM		512
+#define MVPP2_BM_LONG_BUF_NUM		1024
+#define MVPP2_BM_SHORT_BUF_NUM		2048
+#define MVPP2_BM_POOL_SIZE_MAX		(16*1024 - MVPP2_BM_POOL_PTR_ALIGN/4)
+#define MVPP2_BM_POOL_PTR_ALIGN		128
+
+/* BM cookie (32 bits) definition */
+#define MVPP2_BM_COOKIE_POOL_OFFS	8
+#define MVPP2_BM_COOKIE_CPU_OFFS	24
+
+#define MVPP2_BM_SHORT_FRAME_SIZE		512
+#define MVPP2_BM_LONG_FRAME_SIZE		2048
+#define MVPP2_BM_JUMBO_FRAME_SIZE		10240
+/* BM short pool packet size
+ * These value assure that for SWF the total number
+ * of bytes allocated for each buffer will be 512
+ */
+#define MVPP2_BM_SHORT_PKT_SIZE	MVPP2_RX_MAX_PKT_SIZE(MVPP2_BM_SHORT_FRAME_SIZE)
+#define MVPP2_BM_LONG_PKT_SIZE	MVPP2_RX_MAX_PKT_SIZE(MVPP2_BM_LONG_FRAME_SIZE)
+#define MVPP2_BM_JUMBO_PKT_SIZE	MVPP2_RX_MAX_PKT_SIZE(MVPP2_BM_JUMBO_FRAME_SIZE)
+
+#define MVPP21_ADDR_SPACE_SZ		0
+#define MVPP22_ADDR_SPACE_SZ		SZ_64K
+
+#define MVPP2_MAX_THREADS		8
+#define MVPP2_MAX_QVECS			MVPP2_MAX_THREADS
+
+/* GMAC MIB Counters register definitions */
+#define MVPP21_MIB_COUNTERS_OFFSET		0x1000
+#define MVPP21_MIB_COUNTERS_PORT_SZ		0x400
+#define MVPP22_MIB_COUNTERS_OFFSET		0x0
+#define MVPP22_MIB_COUNTERS_PORT_SZ		0x100
+
+#define MVPP2_MIB_GOOD_OCTETS_RCVD		0x0
+#define MVPP2_MIB_BAD_OCTETS_RCVD		0x8
+#define MVPP2_MIB_CRC_ERRORS_SENT		0xc
+#define MVPP2_MIB_UNICAST_FRAMES_RCVD		0x10
+#define MVPP2_MIB_BROADCAST_FRAMES_RCVD		0x18
+#define MVPP2_MIB_MULTICAST_FRAMES_RCVD		0x1c
+#define MVPP2_MIB_FRAMES_64_OCTETS		0x20
+#define MVPP2_MIB_FRAMES_65_TO_127_OCTETS	0x24
+#define MVPP2_MIB_FRAMES_128_TO_255_OCTETS	0x28
+#define MVPP2_MIB_FRAMES_256_TO_511_OCTETS	0x2c
+#define MVPP2_MIB_FRAMES_512_TO_1023_OCTETS	0x30
+#define MVPP2_MIB_FRAMES_1024_TO_MAX_OCTETS	0x34
+#define MVPP2_MIB_GOOD_OCTETS_SENT		0x38
+#define MVPP2_MIB_UNICAST_FRAMES_SENT		0x40
+#define MVPP2_MIB_MULTICAST_FRAMES_SENT		0x48
+#define MVPP2_MIB_BROADCAST_FRAMES_SENT		0x4c
+#define MVPP2_MIB_FC_SENT			0x54
+#define MVPP2_MIB_FC_RCVD			0x58
+#define MVPP2_MIB_RX_FIFO_OVERRUN		0x5c
+#define MVPP2_MIB_UNDERSIZE_RCVD		0x60
+#define MVPP2_MIB_FRAGMENTS_RCVD		0x64
+#define MVPP2_MIB_OVERSIZE_RCVD			0x68
+#define MVPP2_MIB_JABBER_RCVD			0x6c
+#define MVPP2_MIB_MAC_RCV_ERROR			0x70
+#define MVPP2_MIB_BAD_CRC_EVENT			0x74
+#define MVPP2_MIB_COLLISION			0x78
+#define MVPP2_MIB_LATE_COLLISION		0x7c
+
+#define MVPP2_MIB_COUNTERS_STATS_DELAY		(1 * HZ)
+
+#define MVPP2_DESC_DMA_MASK	DMA_BIT_MASK(40)
+
+/* Definitions */
+
+/* Shared Packet Processor resources */
+struct mvpp2 {
+	/* Shared registers' base addresses */
+	void __iomem *lms_base;
+	void __iomem *iface_base;
+
+	/* On PPv2.2, each "software thread" can access the base
+	 * register through a separate address space, each 64 KB apart
+	 * from each other. Typically, such address spaces will be
+	 * used per CPU.
+	 */
+	void __iomem *swth_base[MVPP2_MAX_THREADS];
+
+	/* On PPv2.2, some port control registers are located into the system
+	 * controller space. These registers are accessible through a regmap.
+	 */
+	struct regmap *sysctrl_base;
+
+	/* Common clocks */
+	struct clk *pp_clk;
+	struct clk *gop_clk;
+	struct clk *mg_clk;
+	struct clk *mg_core_clk;
+	struct clk *axi_clk;
+
+	/* List of pointers to port structures */
+	int port_count;
+	struct mvpp2_port *port_list[MVPP2_MAX_PORTS];
+
+	/* Aggregated TXQs */
+	struct mvpp2_tx_queue *aggr_txqs;
+
+	/* BM pools */
+	struct mvpp2_bm_pool *bm_pools;
+
+	/* PRS shadow table */
+	struct mvpp2_prs_shadow *prs_shadow;
+	/* PRS auxiliary table for double vlan entries control */
+	bool *prs_double_vlans;
+
+	/* Tclk value */
+	u32 tclk;
+
+	/* HW version */
+	enum { MVPP21, MVPP22 } hw_version;
+
+	/* Maximum number of RXQs per port */
+	unsigned int max_port_rxqs;
+
+	/* Workqueue to gather hardware statistics */
+	char queue_name[30];
+	struct workqueue_struct *stats_queue;
+};
+
+struct mvpp2_pcpu_stats {
+	struct	u64_stats_sync syncp;
+	u64	rx_packets;
+	u64	rx_bytes;
+	u64	tx_packets;
+	u64	tx_bytes;
+};
+
+/* Per-CPU port control */
+struct mvpp2_port_pcpu {
+	struct hrtimer tx_done_timer;
+	bool timer_scheduled;
+	/* Tasklet for egress finalization */
+	struct tasklet_struct tx_done_tasklet;
+};
+
+struct mvpp2_queue_vector {
+	int irq;
+	struct napi_struct napi;
+	enum { MVPP2_QUEUE_VECTOR_SHARED, MVPP2_QUEUE_VECTOR_PRIVATE } type;
+	int sw_thread_id;
+	u16 sw_thread_mask;
+	int first_rxq;
+	int nrxqs;
+	u32 pending_cause_rx;
+	struct mvpp2_port *port;
+};
+
+struct mvpp2_port {
+	u8 id;
+
+	/* Index of the port from the "group of ports" complex point
+	 * of view
+	 */
+	int gop_id;
+
+	int link_irq;
+
+	struct mvpp2 *priv;
+
+	/* Firmware node associated to the port */
+	struct fwnode_handle *fwnode;
+
+	/* Is a PHY always connected to the port */
+	bool has_phy;
+
+	/* Per-port registers' base address */
+	void __iomem *base;
+	void __iomem *stats_base;
+
+	struct mvpp2_rx_queue **rxqs;
+	unsigned int nrxqs;
+	struct mvpp2_tx_queue **txqs;
+	unsigned int ntxqs;
+	struct net_device *dev;
+
+	int pkt_size;
+
+	/* Per-CPU port control */
+	struct mvpp2_port_pcpu __percpu *pcpu;
+
+	/* Flags */
+	unsigned long flags;
+
+	u16 tx_ring_size;
+	u16 rx_ring_size;
+	struct mvpp2_pcpu_stats __percpu *stats;
+	u64 *ethtool_stats;
+
+	/* Per-port work and its lock to gather hardware statistics */
+	struct mutex gather_stats_lock;
+	struct delayed_work stats_work;
+
+	struct device_node *of_node;
+
+	phy_interface_t phy_interface;
+	struct phylink *phylink;
+	struct phy *comphy;
+
+	struct mvpp2_bm_pool *pool_long;
+	struct mvpp2_bm_pool *pool_short;
+
+	/* Index of first port's physical RXQ */
+	u8 first_rxq;
+
+	struct mvpp2_queue_vector qvecs[MVPP2_MAX_QVECS];
+	unsigned int nqvecs;
+	bool has_tx_irqs;
+
+	u32 tx_time_coal;
+};
+
+/* The mvpp2_tx_desc and mvpp2_rx_desc structures describe the
+ * layout of the transmit and reception DMA descriptors, and their
+ * layout is therefore defined by the hardware design
+ */
+
+#define MVPP2_TXD_L3_OFF_SHIFT		0
+#define MVPP2_TXD_IP_HLEN_SHIFT		8
+#define MVPP2_TXD_L4_CSUM_FRAG		BIT(13)
+#define MVPP2_TXD_L4_CSUM_NOT		BIT(14)
+#define MVPP2_TXD_IP_CSUM_DISABLE	BIT(15)
+#define MVPP2_TXD_PADDING_DISABLE	BIT(23)
+#define MVPP2_TXD_L4_UDP		BIT(24)
+#define MVPP2_TXD_L3_IP6		BIT(26)
+#define MVPP2_TXD_L_DESC		BIT(28)
+#define MVPP2_TXD_F_DESC		BIT(29)
+
+#define MVPP2_RXD_ERR_SUMMARY		BIT(15)
+#define MVPP2_RXD_ERR_CODE_MASK		(BIT(13) | BIT(14))
+#define MVPP2_RXD_ERR_CRC		0x0
+#define MVPP2_RXD_ERR_OVERRUN		BIT(13)
+#define MVPP2_RXD_ERR_RESOURCE		(BIT(13) | BIT(14))
+#define MVPP2_RXD_BM_POOL_ID_OFFS	16
+#define MVPP2_RXD_BM_POOL_ID_MASK	(BIT(16) | BIT(17) | BIT(18))
+#define MVPP2_RXD_HWF_SYNC		BIT(21)
+#define MVPP2_RXD_L4_CSUM_OK		BIT(22)
+#define MVPP2_RXD_IP4_HEADER_ERR	BIT(24)
+#define MVPP2_RXD_L4_TCP		BIT(25)
+#define MVPP2_RXD_L4_UDP		BIT(26)
+#define MVPP2_RXD_L3_IP4		BIT(28)
+#define MVPP2_RXD_L3_IP6		BIT(30)
+#define MVPP2_RXD_BUF_HDR		BIT(31)
+
+/* HW TX descriptor for PPv2.1 */
+struct mvpp21_tx_desc {
+	u32 command;		/* Options used by HW for packet transmitting.*/
+	u8  packet_offset;	/* the offset from the buffer beginning	*/
+	u8  phys_txq;		/* destination queue ID			*/
+	u16 data_size;		/* data size of transmitted packet in bytes */
+	u32 buf_dma_addr;	/* physical addr of transmitted buffer	*/
+	u32 buf_cookie;		/* cookie for access to TX buffer in tx path */
+	u32 reserved1[3];	/* hw_cmd (for future use, BM, PON, PNC) */
+	u32 reserved2;		/* reserved (for future use)		*/
+};
+
+/* HW RX descriptor for PPv2.1 */
+struct mvpp21_rx_desc {
+	u32 status;		/* info about received packet		*/
+	u16 reserved1;		/* parser_info (for future use, PnC)	*/
+	u16 data_size;		/* size of received packet in bytes	*/
+	u32 buf_dma_addr;	/* physical address of the buffer	*/
+	u32 buf_cookie;		/* cookie for access to RX buffer in rx path */
+	u16 reserved2;		/* gem_port_id (for future use, PON)	*/
+	u16 reserved3;		/* csum_l4 (for future use, PnC)	*/
+	u8  reserved4;		/* bm_qset (for future use, BM)		*/
+	u8  reserved5;
+	u16 reserved6;		/* classify_info (for future use, PnC)	*/
+	u32 reserved7;		/* flow_id (for future use, PnC) */
+	u32 reserved8;
+};
+
+/* HW TX descriptor for PPv2.2 */
+struct mvpp22_tx_desc {
+	u32 command;
+	u8  packet_offset;
+	u8  phys_txq;
+	u16 data_size;
+	u64 reserved1;
+	u64 buf_dma_addr_ptp;
+	u64 buf_cookie_misc;
+};
+
+/* HW RX descriptor for PPv2.2 */
+struct mvpp22_rx_desc {
+	u32 status;
+	u16 reserved1;
+	u16 data_size;
+	u32 reserved2;
+	u32 reserved3;
+	u64 buf_dma_addr_key_hash;
+	u64 buf_cookie_misc;
+};
+
+/* Opaque type used by the driver to manipulate the HW TX and RX
+ * descriptors
+ */
+struct mvpp2_tx_desc {
+	union {
+		struct mvpp21_tx_desc pp21;
+		struct mvpp22_tx_desc pp22;
+	};
+};
+
+struct mvpp2_rx_desc {
+	union {
+		struct mvpp21_rx_desc pp21;
+		struct mvpp22_rx_desc pp22;
+	};
+};
+
+struct mvpp2_txq_pcpu_buf {
+	/* Transmitted SKB */
+	struct sk_buff *skb;
+
+	/* Physical address of transmitted buffer */
+	dma_addr_t dma;
+
+	/* Size transmitted */
+	size_t size;
+};
+
+/* Per-CPU Tx queue control */
+struct mvpp2_txq_pcpu {
+	int cpu;
+
+	/* Number of Tx DMA descriptors in the descriptor ring */
+	int size;
+
+	/* Number of currently used Tx DMA descriptor in the
+	 * descriptor ring
+	 */
+	int count;
+
+	int wake_threshold;
+	int stop_threshold;
+
+	/* Number of Tx DMA descriptors reserved for each CPU */
+	int reserved_num;
+
+	/* Infos about transmitted buffers */
+	struct mvpp2_txq_pcpu_buf *buffs;
+
+	/* Index of last TX DMA descriptor that was inserted */
+	int txq_put_index;
+
+	/* Index of the TX DMA descriptor to be cleaned up */
+	int txq_get_index;
+
+	/* DMA buffer for TSO headers */
+	char *tso_headers;
+	dma_addr_t tso_headers_dma;
+};
+
+struct mvpp2_tx_queue {
+	/* Physical number of this Tx queue */
+	u8 id;
+
+	/* Logical number of this Tx queue */
+	u8 log_id;
+
+	/* Number of Tx DMA descriptors in the descriptor ring */
+	int size;
+
+	/* Number of currently used Tx DMA descriptor in the descriptor ring */
+	int count;
+
+	/* Per-CPU control of physical Tx queues */
+	struct mvpp2_txq_pcpu __percpu *pcpu;
+
+	u32 done_pkts_coal;
+
+	/* Virtual address of thex Tx DMA descriptors array */
+	struct mvpp2_tx_desc *descs;
+
+	/* DMA address of the Tx DMA descriptors array */
+	dma_addr_t descs_dma;
+
+	/* Index of the last Tx DMA descriptor */
+	int last_desc;
+
+	/* Index of the next Tx DMA descriptor to process */
+	int next_desc_to_proc;
+};
+
+struct mvpp2_rx_queue {
+	/* RX queue number, in the range 0-31 for physical RXQs */
+	u8 id;
+
+	/* Num of rx descriptors in the rx descriptor ring */
+	int size;
+
+	u32 pkts_coal;
+	u32 time_coal;
+
+	/* Virtual address of the RX DMA descriptors array */
+	struct mvpp2_rx_desc *descs;
+
+	/* DMA address of the RX DMA descriptors array */
+	dma_addr_t descs_dma;
+
+	/* Index of the last RX DMA descriptor */
+	int last_desc;
+
+	/* Index of the next RX DMA descriptor to process */
+	int next_desc_to_proc;
+
+	/* ID of port to which physical RXQ is mapped */
+	int port;
+
+	/* Port's logic RXQ number to which physical RXQ is mapped */
+	int logic_rxq;
+};
+
+struct mvpp2_bm_pool {
+	/* Pool number in the range 0-7 */
+	int id;
+
+	/* Buffer Pointers Pool External (BPPE) size */
+	int size;
+	/* BPPE size in bytes */
+	int size_bytes;
+	/* Number of buffers for this pool */
+	int buf_num;
+	/* Pool buffer size */
+	int buf_size;
+	/* Packet size */
+	int pkt_size;
+	int frag_size;
+
+	/* BPPE virtual base address */
+	u32 *virt_addr;
+	/* BPPE DMA base address */
+	dma_addr_t dma_addr;
+
+	/* Ports using BM pool */
+	u32 port_map;
+};
+
+#define IS_TSO_HEADER(txq_pcpu, addr) \
+	((addr) >= (txq_pcpu)->tso_headers_dma && \
+	 (addr) < (txq_pcpu)->tso_headers_dma + \
+	 (txq_pcpu)->size * TSO_HEADER_SIZE)
+
+#define MVPP2_DRIVER_NAME "mvpp2"
+#define MVPP2_DRIVER_VERSION "1.0"
+
+void mvpp2_write(struct mvpp2 *priv, u32 offset, u32 data);
+u32 mvpp2_read(struct mvpp2 *priv, u32 offset);
+
+u32 mvpp2_read_relaxed(struct mvpp2 *priv, u32 offset);
+
+void mvpp2_percpu_write(struct mvpp2 *priv, int cpu, u32 offset, u32 data);
+u32 mvpp2_percpu_read(struct mvpp2 *priv, int cpu, u32 offset);
+
+void mvpp2_percpu_write_relaxed(struct mvpp2 *priv, int cpu, u32 offset,
+				u32 data);
+
+#endif
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c
new file mode 100644
index 000000000000..8581d5b17dd5
--- /dev/null
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c
@@ -0,0 +1,141 @@
+/*
+ * RSS and Classifier helpers for Marvell PPv2 Network Controller
+ *
+ * Copyright (C) 2014 Marvell
+ *
+ * Marcin Wojtas <mw@semihalf.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include "mvpp2.h"
+#include "mvpp2_cls.h"
+
+/* Update classification flow table registers */
+static void mvpp2_cls_flow_write(struct mvpp2 *priv,
+				 struct mvpp2_cls_flow_entry *fe)
+{
+	mvpp2_write(priv, MVPP2_CLS_FLOW_INDEX_REG, fe->index);
+	mvpp2_write(priv, MVPP2_CLS_FLOW_TBL0_REG,  fe->data[0]);
+	mvpp2_write(priv, MVPP2_CLS_FLOW_TBL1_REG,  fe->data[1]);
+	mvpp2_write(priv, MVPP2_CLS_FLOW_TBL2_REG,  fe->data[2]);
+}
+
+/* Update classification lookup table register */
+static void mvpp2_cls_lookup_write(struct mvpp2 *priv,
+				   struct mvpp2_cls_lookup_entry *le)
+{
+	u32 val;
+
+	val = (le->way << MVPP2_CLS_LKP_INDEX_WAY_OFFS) | le->lkpid;
+	mvpp2_write(priv, MVPP2_CLS_LKP_INDEX_REG, val);
+	mvpp2_write(priv, MVPP2_CLS_LKP_TBL_REG, le->data);
+}
+
+/* Classifier default initialization */
+void mvpp2_cls_init(struct mvpp2 *priv)
+{
+	struct mvpp2_cls_lookup_entry le;
+	struct mvpp2_cls_flow_entry fe;
+	int index;
+
+	/* Enable classifier */
+	mvpp2_write(priv, MVPP2_CLS_MODE_REG, MVPP2_CLS_MODE_ACTIVE_MASK);
+
+	/* Clear classifier flow table */
+	memset(&fe.data, 0, sizeof(fe.data));
+	for (index = 0; index < MVPP2_CLS_FLOWS_TBL_SIZE; index++) {
+		fe.index = index;
+		mvpp2_cls_flow_write(priv, &fe);
+	}
+
+	/* Clear classifier lookup table */
+	le.data = 0;
+	for (index = 0; index < MVPP2_CLS_LKP_TBL_SIZE; index++) {
+		le.lkpid = index;
+		le.way = 0;
+		mvpp2_cls_lookup_write(priv, &le);
+
+		le.way = 1;
+		mvpp2_cls_lookup_write(priv, &le);
+	}
+}
+
+void mvpp2_cls_port_config(struct mvpp2_port *port)
+{
+	struct mvpp2_cls_lookup_entry le;
+	u32 val;
+
+	/* Set way for the port */
+	val = mvpp2_read(port->priv, MVPP2_CLS_PORT_WAY_REG);
+	val &= ~MVPP2_CLS_PORT_WAY_MASK(port->id);
+	mvpp2_write(port->priv, MVPP2_CLS_PORT_WAY_REG, val);
+
+	/* Pick the entry to be accessed in lookup ID decoding table
+	 * according to the way and lkpid.
+	 */
+	le.lkpid = port->id;
+	le.way = 0;
+	le.data = 0;
+
+	/* Set initial CPU queue for receiving packets */
+	le.data &= ~MVPP2_CLS_LKP_TBL_RXQ_MASK;
+	le.data |= port->first_rxq;
+
+	/* Disable classification engines */
+	le.data &= ~MVPP2_CLS_LKP_TBL_LOOKUP_EN_MASK;
+
+	/* Update lookup ID table entry */
+	mvpp2_cls_lookup_write(port->priv, &le);
+}
+
+/* Set CPU queue number for oversize packets */
+void mvpp2_cls_oversize_rxq_set(struct mvpp2_port *port)
+{
+	u32 val;
+
+	mvpp2_write(port->priv, MVPP2_CLS_OVERSIZE_RXQ_LOW_REG(port->id),
+		    port->first_rxq & MVPP2_CLS_OVERSIZE_RXQ_LOW_MASK);
+
+	mvpp2_write(port->priv, MVPP2_CLS_SWFWD_P2HQ_REG(port->id),
+		    (port->first_rxq >> MVPP2_CLS_OVERSIZE_RXQ_LOW_BITS));
+
+	val = mvpp2_read(port->priv, MVPP2_CLS_SWFWD_PCTRL_REG);
+	val |= MVPP2_CLS_SWFWD_PCTRL_MASK(port->id);
+	mvpp2_write(port->priv, MVPP2_CLS_SWFWD_PCTRL_REG, val);
+}
+
+void mvpp22_init_rss(struct mvpp2_port *port)
+{
+	struct mvpp2 *priv = port->priv;
+	int i;
+
+	/* Set the table width: replace the whole classifier Rx queue number
+	 * with the ones configured in RSS table entries.
+	 */
+	mvpp2_write(priv, MVPP22_RSS_INDEX, MVPP22_RSS_INDEX_TABLE(0));
+	mvpp2_write(priv, MVPP22_RSS_WIDTH, 8);
+
+	/* Loop through the classifier Rx Queues and map them to a RSS table.
+	 * Map them all to the first table (0) by default.
+	 */
+	for (i = 0; i < MVPP2_CLS_RX_QUEUES; i++) {
+		mvpp2_write(priv, MVPP22_RSS_INDEX, MVPP22_RSS_INDEX_QUEUE(i));
+		mvpp2_write(priv, MVPP22_RSS_TABLE,
+			    MVPP22_RSS_TABLE_POINTER(0));
+	}
+
+	/* Configure the first table to evenly distribute the packets across
+	 * real Rx Queues. The table entries map a hash to an port Rx Queue.
+	 */
+	for (i = 0; i < MVPP22_RSS_TABLE_ENTRIES; i++) {
+		u32 sel = MVPP22_RSS_INDEX_TABLE(0) |
+			  MVPP22_RSS_INDEX_TABLE_ENTRY(i);
+		mvpp2_write(priv, MVPP22_RSS_INDEX, sel);
+
+		mvpp2_write(priv, MVPP22_RSS_TABLE_ENTRY, i % port->nrxqs);
+	}
+
+}
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.h b/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.h
new file mode 100644
index 000000000000..8e1d7f9ffa0b
--- /dev/null
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.h
@@ -0,0 +1,44 @@
+/*
+ * RSS and Classifier definitions for Marvell PPv2 Network Controller
+ *
+ * Copyright (C) 2014 Marvell
+ *
+ * Marcin Wojtas <mw@semihalf.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#ifndef _MVPP2_CLS_H_
+#define _MVPP2_CLS_H_
+
+/* Classifier constants */
+#define MVPP2_CLS_FLOWS_TBL_SIZE	512
+#define MVPP2_CLS_FLOWS_TBL_DATA_WORDS	3
+#define MVPP2_CLS_LKP_TBL_SIZE		64
+#define MVPP2_CLS_RX_QUEUES		256
+
+/* RSS constants */
+#define MVPP22_RSS_TABLE_ENTRIES	32
+
+struct mvpp2_cls_flow_entry {
+	u32 index;
+	u32 data[MVPP2_CLS_FLOWS_TBL_DATA_WORDS];
+};
+
+struct mvpp2_cls_lookup_entry {
+	u32 lkpid;
+	u32 way;
+	u32 data;
+};
+
+void mvpp22_init_rss(struct mvpp2_port *port);
+
+void mvpp2_cls_init(struct mvpp2 *priv);
+
+void mvpp2_cls_port_config(struct mvpp2_port *port);
+
+void mvpp2_cls_oversize_rxq_set(struct mvpp2_port *port);
+
+#endif
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
new file mode 100644
index 000000000000..45622bffd81a
--- /dev/null
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -0,0 +1,5281 @@
+/*
+ * Driver for Marvell PPv2 network controller for Armada 375 SoC.
+ *
+ * Copyright (C) 2014 Marvell
+ *
+ * Marcin Wojtas <mw@semihalf.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/acpi.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/platform_device.h>
+#include <linux/skbuff.h>
+#include <linux/inetdevice.h>
+#include <linux/mbus.h>
+#include <linux/module.h>
+#include <linux/mfd/syscon.h>
+#include <linux/interrupt.h>
+#include <linux/cpumask.h>
+#include <linux/of.h>
+#include <linux/of_irq.h>
+#include <linux/of_mdio.h>
+#include <linux/of_net.h>
+#include <linux/of_address.h>
+#include <linux/of_device.h>
+#include <linux/phy.h>
+#include <linux/phylink.h>
+#include <linux/phy/phy.h>
+#include <linux/clk.h>
+#include <linux/hrtimer.h>
+#include <linux/ktime.h>
+#include <linux/regmap.h>
+#include <uapi/linux/ppp_defs.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/tso.h>
+
+#include "mvpp2.h"
+#include "mvpp2_prs.h"
+#include "mvpp2_cls.h"
+
+enum mvpp2_bm_pool_log_num {
+	MVPP2_BM_SHORT,
+	MVPP2_BM_LONG,
+	MVPP2_BM_JUMBO,
+	MVPP2_BM_POOLS_NUM
+};
+
+static struct {
+	int pkt_size;
+	int buf_num;
+} mvpp2_pools[MVPP2_BM_POOLS_NUM];
+
+/* The prototype is added here to be used in start_dev when using ACPI. This
+ * will be removed once phylink is used for all modes (dt+ACPI).
+ */
+static void mvpp2_mac_config(struct net_device *dev, unsigned int mode,
+			     const struct phylink_link_state *state);
+
+/* Queue modes */
+#define MVPP2_QDIST_SINGLE_MODE	0
+#define MVPP2_QDIST_MULTI_MODE	1
+
+static int queue_mode = MVPP2_QDIST_SINGLE_MODE;
+
+module_param(queue_mode, int, 0444);
+MODULE_PARM_DESC(queue_mode, "Set queue_mode (single=0, multi=1)");
+
+/* Utility/helper methods */
+
+void mvpp2_write(struct mvpp2 *priv, u32 offset, u32 data)
+{
+	writel(data, priv->swth_base[0] + offset);
+}
+
+u32 mvpp2_read(struct mvpp2 *priv, u32 offset)
+{
+	return readl(priv->swth_base[0] + offset);
+}
+
+u32 mvpp2_read_relaxed(struct mvpp2 *priv, u32 offset)
+{
+	return readl_relaxed(priv->swth_base[0] + offset);
+}
+/* These accessors should be used to access:
+ *
+ * - per-CPU registers, where each CPU has its own copy of the
+ *   register.
+ *
+ *   MVPP2_BM_VIRT_ALLOC_REG
+ *   MVPP2_BM_ADDR_HIGH_ALLOC
+ *   MVPP22_BM_ADDR_HIGH_RLS_REG
+ *   MVPP2_BM_VIRT_RLS_REG
+ *   MVPP2_ISR_RX_TX_CAUSE_REG
+ *   MVPP2_ISR_RX_TX_MASK_REG
+ *   MVPP2_TXQ_NUM_REG
+ *   MVPP2_AGGR_TXQ_UPDATE_REG
+ *   MVPP2_TXQ_RSVD_REQ_REG
+ *   MVPP2_TXQ_RSVD_RSLT_REG
+ *   MVPP2_TXQ_SENT_REG
+ *   MVPP2_RXQ_NUM_REG
+ *
+ * - global registers that must be accessed through a specific CPU
+ *   window, because they are related to an access to a per-CPU
+ *   register
+ *
+ *   MVPP2_BM_PHY_ALLOC_REG    (related to MVPP2_BM_VIRT_ALLOC_REG)
+ *   MVPP2_BM_PHY_RLS_REG      (related to MVPP2_BM_VIRT_RLS_REG)
+ *   MVPP2_RXQ_THRESH_REG      (related to MVPP2_RXQ_NUM_REG)
+ *   MVPP2_RXQ_DESC_ADDR_REG   (related to MVPP2_RXQ_NUM_REG)
+ *   MVPP2_RXQ_DESC_SIZE_REG   (related to MVPP2_RXQ_NUM_REG)
+ *   MVPP2_RXQ_INDEX_REG       (related to MVPP2_RXQ_NUM_REG)
+ *   MVPP2_TXQ_PENDING_REG     (related to MVPP2_TXQ_NUM_REG)
+ *   MVPP2_TXQ_DESC_ADDR_REG   (related to MVPP2_TXQ_NUM_REG)
+ *   MVPP2_TXQ_DESC_SIZE_REG   (related to MVPP2_TXQ_NUM_REG)
+ *   MVPP2_TXQ_INDEX_REG       (related to MVPP2_TXQ_NUM_REG)
+ *   MVPP2_TXQ_PENDING_REG     (related to MVPP2_TXQ_NUM_REG)
+ *   MVPP2_TXQ_PREF_BUF_REG    (related to MVPP2_TXQ_NUM_REG)
+ *   MVPP2_TXQ_PREF_BUF_REG    (related to MVPP2_TXQ_NUM_REG)
+ */
+void mvpp2_percpu_write(struct mvpp2 *priv, int cpu,
+			       u32 offset, u32 data)
+{
+	writel(data, priv->swth_base[cpu] + offset);
+}
+
+u32 mvpp2_percpu_read(struct mvpp2 *priv, int cpu,
+			     u32 offset)
+{
+	return readl(priv->swth_base[cpu] + offset);
+}
+
+void mvpp2_percpu_write_relaxed(struct mvpp2 *priv, int cpu,
+				       u32 offset, u32 data)
+{
+	writel_relaxed(data, priv->swth_base[cpu] + offset);
+}
+
+u32 mvpp2_percpu_read_relaxed(struct mvpp2 *priv, int cpu,
+				     u32 offset)
+{
+	return readl_relaxed(priv->swth_base[cpu] + offset);
+}
+
+static dma_addr_t mvpp2_txdesc_dma_addr_get(struct mvpp2_port *port,
+					    struct mvpp2_tx_desc *tx_desc)
+{
+	if (port->priv->hw_version == MVPP21)
+		return tx_desc->pp21.buf_dma_addr;
+	else
+		return tx_desc->pp22.buf_dma_addr_ptp & MVPP2_DESC_DMA_MASK;
+}
+
+static void mvpp2_txdesc_dma_addr_set(struct mvpp2_port *port,
+				      struct mvpp2_tx_desc *tx_desc,
+				      dma_addr_t dma_addr)
+{
+	dma_addr_t addr, offset;
+
+	addr = dma_addr & ~MVPP2_TX_DESC_ALIGN;
+	offset = dma_addr & MVPP2_TX_DESC_ALIGN;
+
+	if (port->priv->hw_version == MVPP21) {
+		tx_desc->pp21.buf_dma_addr = addr;
+		tx_desc->pp21.packet_offset = offset;
+	} else {
+		u64 val = (u64)addr;
+
+		tx_desc->pp22.buf_dma_addr_ptp &= ~MVPP2_DESC_DMA_MASK;
+		tx_desc->pp22.buf_dma_addr_ptp |= val;
+		tx_desc->pp22.packet_offset = offset;
+	}
+}
+
+static size_t mvpp2_txdesc_size_get(struct mvpp2_port *port,
+				    struct mvpp2_tx_desc *tx_desc)
+{
+	if (port->priv->hw_version == MVPP21)
+		return tx_desc->pp21.data_size;
+	else
+		return tx_desc->pp22.data_size;
+}
+
+static void mvpp2_txdesc_size_set(struct mvpp2_port *port,
+				  struct mvpp2_tx_desc *tx_desc,
+				  size_t size)
+{
+	if (port->priv->hw_version == MVPP21)
+		tx_desc->pp21.data_size = size;
+	else
+		tx_desc->pp22.data_size = size;
+}
+
+static void mvpp2_txdesc_txq_set(struct mvpp2_port *port,
+				 struct mvpp2_tx_desc *tx_desc,
+				 unsigned int txq)
+{
+	if (port->priv->hw_version == MVPP21)
+		tx_desc->pp21.phys_txq = txq;
+	else
+		tx_desc->pp22.phys_txq = txq;
+}
+
+static void mvpp2_txdesc_cmd_set(struct mvpp2_port *port,
+				 struct mvpp2_tx_desc *tx_desc,
+				 unsigned int command)
+{
+	if (port->priv->hw_version == MVPP21)
+		tx_desc->pp21.command = command;
+	else
+		tx_desc->pp22.command = command;
+}
+
+static unsigned int mvpp2_txdesc_offset_get(struct mvpp2_port *port,
+					    struct mvpp2_tx_desc *tx_desc)
+{
+	if (port->priv->hw_version == MVPP21)
+		return tx_desc->pp21.packet_offset;
+	else
+		return tx_desc->pp22.packet_offset;
+}
+
+static dma_addr_t mvpp2_rxdesc_dma_addr_get(struct mvpp2_port *port,
+					    struct mvpp2_rx_desc *rx_desc)
+{
+	if (port->priv->hw_version == MVPP21)
+		return rx_desc->pp21.buf_dma_addr;
+	else
+		return rx_desc->pp22.buf_dma_addr_key_hash & MVPP2_DESC_DMA_MASK;
+}
+
+static unsigned long mvpp2_rxdesc_cookie_get(struct mvpp2_port *port,
+					     struct mvpp2_rx_desc *rx_desc)
+{
+	if (port->priv->hw_version == MVPP21)
+		return rx_desc->pp21.buf_cookie;
+	else
+		return rx_desc->pp22.buf_cookie_misc & MVPP2_DESC_DMA_MASK;
+}
+
+static size_t mvpp2_rxdesc_size_get(struct mvpp2_port *port,
+				    struct mvpp2_rx_desc *rx_desc)
+{
+	if (port->priv->hw_version == MVPP21)
+		return rx_desc->pp21.data_size;
+	else
+		return rx_desc->pp22.data_size;
+}
+
+static u32 mvpp2_rxdesc_status_get(struct mvpp2_port *port,
+				   struct mvpp2_rx_desc *rx_desc)
+{
+	if (port->priv->hw_version == MVPP21)
+		return rx_desc->pp21.status;
+	else
+		return rx_desc->pp22.status;
+}
+
+static void mvpp2_txq_inc_get(struct mvpp2_txq_pcpu *txq_pcpu)
+{
+	txq_pcpu->txq_get_index++;
+	if (txq_pcpu->txq_get_index == txq_pcpu->size)
+		txq_pcpu->txq_get_index = 0;
+}
+
+static void mvpp2_txq_inc_put(struct mvpp2_port *port,
+			      struct mvpp2_txq_pcpu *txq_pcpu,
+			      struct sk_buff *skb,
+			      struct mvpp2_tx_desc *tx_desc)
+{
+	struct mvpp2_txq_pcpu_buf *tx_buf =
+		txq_pcpu->buffs + txq_pcpu->txq_put_index;
+	tx_buf->skb = skb;
+	tx_buf->size = mvpp2_txdesc_size_get(port, tx_desc);
+	tx_buf->dma = mvpp2_txdesc_dma_addr_get(port, tx_desc) +
+		mvpp2_txdesc_offset_get(port, tx_desc);
+	txq_pcpu->txq_put_index++;
+	if (txq_pcpu->txq_put_index == txq_pcpu->size)
+		txq_pcpu->txq_put_index = 0;
+}
+
+/* Get number of physical egress port */
+static inline int mvpp2_egress_port(struct mvpp2_port *port)
+{
+	return MVPP2_MAX_TCONT + port->id;
+}
+
+/* Get number of physical TXQ */
+static inline int mvpp2_txq_phys(int port, int txq)
+{
+	return (MVPP2_MAX_TCONT + port) * MVPP2_MAX_TXQ + txq;
+}
+
+static void *mvpp2_frag_alloc(const struct mvpp2_bm_pool *pool)
+{
+	if (likely(pool->frag_size <= PAGE_SIZE))
+		return netdev_alloc_frag(pool->frag_size);
+	else
+		return kmalloc(pool->frag_size, GFP_ATOMIC);
+}
+
+static void mvpp2_frag_free(const struct mvpp2_bm_pool *pool, void *data)
+{
+	if (likely(pool->frag_size <= PAGE_SIZE))
+		skb_free_frag(data);
+	else
+		kfree(data);
+}
+
+/* Buffer Manager configuration routines */
+
+/* Create pool */
+static int mvpp2_bm_pool_create(struct platform_device *pdev,
+				struct mvpp2 *priv,
+				struct mvpp2_bm_pool *bm_pool, int size)
+{
+	u32 val;
+
+	/* Number of buffer pointers must be a multiple of 16, as per
+	 * hardware constraints
+	 */
+	if (!IS_ALIGNED(size, 16))
+		return -EINVAL;
+
+	/* PPv2.1 needs 8 bytes per buffer pointer, PPv2.2 needs 16
+	 * bytes per buffer pointer
+	 */
+	if (priv->hw_version == MVPP21)
+		bm_pool->size_bytes = 2 * sizeof(u32) * size;
+	else
+		bm_pool->size_bytes = 2 * sizeof(u64) * size;
+
+	bm_pool->virt_addr = dma_alloc_coherent(&pdev->dev, bm_pool->size_bytes,
+						&bm_pool->dma_addr,
+						GFP_KERNEL);
+	if (!bm_pool->virt_addr)
+		return -ENOMEM;
+
+	if (!IS_ALIGNED((unsigned long)bm_pool->virt_addr,
+			MVPP2_BM_POOL_PTR_ALIGN)) {
+		dma_free_coherent(&pdev->dev, bm_pool->size_bytes,
+				  bm_pool->virt_addr, bm_pool->dma_addr);
+		dev_err(&pdev->dev, "BM pool %d is not %d bytes aligned\n",
+			bm_pool->id, MVPP2_BM_POOL_PTR_ALIGN);
+		return -ENOMEM;
+	}
+
+	mvpp2_write(priv, MVPP2_BM_POOL_BASE_REG(bm_pool->id),
+		    lower_32_bits(bm_pool->dma_addr));
+	mvpp2_write(priv, MVPP2_BM_POOL_SIZE_REG(bm_pool->id), size);
+
+	val = mvpp2_read(priv, MVPP2_BM_POOL_CTRL_REG(bm_pool->id));
+	val |= MVPP2_BM_START_MASK;
+	mvpp2_write(priv, MVPP2_BM_POOL_CTRL_REG(bm_pool->id), val);
+
+	bm_pool->size = size;
+	bm_pool->pkt_size = 0;
+	bm_pool->buf_num = 0;
+
+	return 0;
+}
+
+/* Set pool buffer size */
+static void mvpp2_bm_pool_bufsize_set(struct mvpp2 *priv,
+				      struct mvpp2_bm_pool *bm_pool,
+				      int buf_size)
+{
+	u32 val;
+
+	bm_pool->buf_size = buf_size;
+
+	val = ALIGN(buf_size, 1 << MVPP2_POOL_BUF_SIZE_OFFSET);
+	mvpp2_write(priv, MVPP2_POOL_BUF_SIZE_REG(bm_pool->id), val);
+}
+
+static void mvpp2_bm_bufs_get_addrs(struct device *dev, struct mvpp2 *priv,
+				    struct mvpp2_bm_pool *bm_pool,
+				    dma_addr_t *dma_addr,
+				    phys_addr_t *phys_addr)
+{
+	int cpu = get_cpu();
+
+	*dma_addr = mvpp2_percpu_read(priv, cpu,
+				      MVPP2_BM_PHY_ALLOC_REG(bm_pool->id));
+	*phys_addr = mvpp2_percpu_read(priv, cpu, MVPP2_BM_VIRT_ALLOC_REG);
+
+	if (priv->hw_version == MVPP22) {
+		u32 val;
+		u32 dma_addr_highbits, phys_addr_highbits;
+
+		val = mvpp2_percpu_read(priv, cpu, MVPP22_BM_ADDR_HIGH_ALLOC);
+		dma_addr_highbits = (val & MVPP22_BM_ADDR_HIGH_PHYS_MASK);
+		phys_addr_highbits = (val & MVPP22_BM_ADDR_HIGH_VIRT_MASK) >>
+			MVPP22_BM_ADDR_HIGH_VIRT_SHIFT;
+
+		if (sizeof(dma_addr_t) == 8)
+			*dma_addr |= (u64)dma_addr_highbits << 32;
+
+		if (sizeof(phys_addr_t) == 8)
+			*phys_addr |= (u64)phys_addr_highbits << 32;
+	}
+
+	put_cpu();
+}
+
+/* Free all buffers from the pool */
+static void mvpp2_bm_bufs_free(struct device *dev, struct mvpp2 *priv,
+			       struct mvpp2_bm_pool *bm_pool, int buf_num)
+{
+	int i;
+
+	if (buf_num > bm_pool->buf_num) {
+		WARN(1, "Pool does not have so many bufs pool(%d) bufs(%d)\n",
+		     bm_pool->id, buf_num);
+		buf_num = bm_pool->buf_num;
+	}
+
+	for (i = 0; i < buf_num; i++) {
+		dma_addr_t buf_dma_addr;
+		phys_addr_t buf_phys_addr;
+		void *data;
+
+		mvpp2_bm_bufs_get_addrs(dev, priv, bm_pool,
+					&buf_dma_addr, &buf_phys_addr);
+
+		dma_unmap_single(dev, buf_dma_addr,
+				 bm_pool->buf_size, DMA_FROM_DEVICE);
+
+		data = (void *)phys_to_virt(buf_phys_addr);
+		if (!data)
+			break;
+
+		mvpp2_frag_free(bm_pool, data);
+	}
+
+	/* Update BM driver with number of buffers removed from pool */
+	bm_pool->buf_num -= i;
+}
+
+/* Check number of buffers in BM pool */
+static int mvpp2_check_hw_buf_num(struct mvpp2 *priv, struct mvpp2_bm_pool *bm_pool)
+{
+	int buf_num = 0;
+
+	buf_num += mvpp2_read(priv, MVPP2_BM_POOL_PTRS_NUM_REG(bm_pool->id)) &
+				    MVPP22_BM_POOL_PTRS_NUM_MASK;
+	buf_num += mvpp2_read(priv, MVPP2_BM_BPPI_PTRS_NUM_REG(bm_pool->id)) &
+				    MVPP2_BM_BPPI_PTR_NUM_MASK;
+
+	/* HW has one buffer ready which is not reflected in the counters */
+	if (buf_num)
+		buf_num += 1;
+
+	return buf_num;
+}
+
+/* Cleanup pool */
+static int mvpp2_bm_pool_destroy(struct platform_device *pdev,
+				 struct mvpp2 *priv,
+				 struct mvpp2_bm_pool *bm_pool)
+{
+	int buf_num;
+	u32 val;
+
+	buf_num = mvpp2_check_hw_buf_num(priv, bm_pool);
+	mvpp2_bm_bufs_free(&pdev->dev, priv, bm_pool, buf_num);
+
+	/* Check buffer counters after free */
+	buf_num = mvpp2_check_hw_buf_num(priv, bm_pool);
+	if (buf_num) {
+		WARN(1, "cannot free all buffers in pool %d, buf_num left %d\n",
+		     bm_pool->id, bm_pool->buf_num);
+		return 0;
+	}
+
+	val = mvpp2_read(priv, MVPP2_BM_POOL_CTRL_REG(bm_pool->id));
+	val |= MVPP2_BM_STOP_MASK;
+	mvpp2_write(priv, MVPP2_BM_POOL_CTRL_REG(bm_pool->id), val);
+
+	dma_free_coherent(&pdev->dev, bm_pool->size_bytes,
+			  bm_pool->virt_addr,
+			  bm_pool->dma_addr);
+	return 0;
+}
+
+static int mvpp2_bm_pools_init(struct platform_device *pdev,
+			       struct mvpp2 *priv)
+{
+	int i, err, size;
+	struct mvpp2_bm_pool *bm_pool;
+
+	/* Create all pools with maximum size */
+	size = MVPP2_BM_POOL_SIZE_MAX;
+	for (i = 0; i < MVPP2_BM_POOLS_NUM; i++) {
+		bm_pool = &priv->bm_pools[i];
+		bm_pool->id = i;
+		err = mvpp2_bm_pool_create(pdev, priv, bm_pool, size);
+		if (err)
+			goto err_unroll_pools;
+		mvpp2_bm_pool_bufsize_set(priv, bm_pool, 0);
+	}
+	return 0;
+
+err_unroll_pools:
+	dev_err(&pdev->dev, "failed to create BM pool %d, size %d\n", i, size);
+	for (i = i - 1; i >= 0; i--)
+		mvpp2_bm_pool_destroy(pdev, priv, &priv->bm_pools[i]);
+	return err;
+}
+
+static int mvpp2_bm_init(struct platform_device *pdev, struct mvpp2 *priv)
+{
+	int i, err;
+
+	for (i = 0; i < MVPP2_BM_POOLS_NUM; i++) {
+		/* Mask BM all interrupts */
+		mvpp2_write(priv, MVPP2_BM_INTR_MASK_REG(i), 0);
+		/* Clear BM cause register */
+		mvpp2_write(priv, MVPP2_BM_INTR_CAUSE_REG(i), 0);
+	}
+
+	/* Allocate and initialize BM pools */
+	priv->bm_pools = devm_kcalloc(&pdev->dev, MVPP2_BM_POOLS_NUM,
+				      sizeof(*priv->bm_pools), GFP_KERNEL);
+	if (!priv->bm_pools)
+		return -ENOMEM;
+
+	err = mvpp2_bm_pools_init(pdev, priv);
+	if (err < 0)
+		return err;
+	return 0;
+}
+
+static void mvpp2_setup_bm_pool(void)
+{
+	/* Short pool */
+	mvpp2_pools[MVPP2_BM_SHORT].buf_num  = MVPP2_BM_SHORT_BUF_NUM;
+	mvpp2_pools[MVPP2_BM_SHORT].pkt_size = MVPP2_BM_SHORT_PKT_SIZE;
+
+	/* Long pool */
+	mvpp2_pools[MVPP2_BM_LONG].buf_num  = MVPP2_BM_LONG_BUF_NUM;
+	mvpp2_pools[MVPP2_BM_LONG].pkt_size = MVPP2_BM_LONG_PKT_SIZE;
+
+	/* Jumbo pool */
+	mvpp2_pools[MVPP2_BM_JUMBO].buf_num  = MVPP2_BM_JUMBO_BUF_NUM;
+	mvpp2_pools[MVPP2_BM_JUMBO].pkt_size = MVPP2_BM_JUMBO_PKT_SIZE;
+}
+
+/* Attach long pool to rxq */
+static void mvpp2_rxq_long_pool_set(struct mvpp2_port *port,
+				    int lrxq, int long_pool)
+{
+	u32 val, mask;
+	int prxq;
+
+	/* Get queue physical ID */
+	prxq = port->rxqs[lrxq]->id;
+
+	if (port->priv->hw_version == MVPP21)
+		mask = MVPP21_RXQ_POOL_LONG_MASK;
+	else
+		mask = MVPP22_RXQ_POOL_LONG_MASK;
+
+	val = mvpp2_read(port->priv, MVPP2_RXQ_CONFIG_REG(prxq));
+	val &= ~mask;
+	val |= (long_pool << MVPP2_RXQ_POOL_LONG_OFFS) & mask;
+	mvpp2_write(port->priv, MVPP2_RXQ_CONFIG_REG(prxq), val);
+}
+
+/* Attach short pool to rxq */
+static void mvpp2_rxq_short_pool_set(struct mvpp2_port *port,
+				     int lrxq, int short_pool)
+{
+	u32 val, mask;
+	int prxq;
+
+	/* Get queue physical ID */
+	prxq = port->rxqs[lrxq]->id;
+
+	if (port->priv->hw_version == MVPP21)
+		mask = MVPP21_RXQ_POOL_SHORT_MASK;
+	else
+		mask = MVPP22_RXQ_POOL_SHORT_MASK;
+
+	val = mvpp2_read(port->priv, MVPP2_RXQ_CONFIG_REG(prxq));
+	val &= ~mask;
+	val |= (short_pool << MVPP2_RXQ_POOL_SHORT_OFFS) & mask;
+	mvpp2_write(port->priv, MVPP2_RXQ_CONFIG_REG(prxq), val);
+}
+
+static void *mvpp2_buf_alloc(struct mvpp2_port *port,
+			     struct mvpp2_bm_pool *bm_pool,
+			     dma_addr_t *buf_dma_addr,
+			     phys_addr_t *buf_phys_addr,
+			     gfp_t gfp_mask)
+{
+	dma_addr_t dma_addr;
+	void *data;
+
+	data = mvpp2_frag_alloc(bm_pool);
+	if (!data)
+		return NULL;
+
+	dma_addr = dma_map_single(port->dev->dev.parent, data,
+				  MVPP2_RX_BUF_SIZE(bm_pool->pkt_size),
+				  DMA_FROM_DEVICE);
+	if (unlikely(dma_mapping_error(port->dev->dev.parent, dma_addr))) {
+		mvpp2_frag_free(bm_pool, data);
+		return NULL;
+	}
+	*buf_dma_addr = dma_addr;
+	*buf_phys_addr = virt_to_phys(data);
+
+	return data;
+}
+
+/* Release buffer to BM */
+static inline void mvpp2_bm_pool_put(struct mvpp2_port *port, int pool,
+				     dma_addr_t buf_dma_addr,
+				     phys_addr_t buf_phys_addr)
+{
+	int cpu = get_cpu();
+
+	if (port->priv->hw_version == MVPP22) {
+		u32 val = 0;
+
+		if (sizeof(dma_addr_t) == 8)
+			val |= upper_32_bits(buf_dma_addr) &
+				MVPP22_BM_ADDR_HIGH_PHYS_RLS_MASK;
+
+		if (sizeof(phys_addr_t) == 8)
+			val |= (upper_32_bits(buf_phys_addr)
+				<< MVPP22_BM_ADDR_HIGH_VIRT_RLS_SHIFT) &
+				MVPP22_BM_ADDR_HIGH_VIRT_RLS_MASK;
+
+		mvpp2_percpu_write_relaxed(port->priv, cpu,
+					   MVPP22_BM_ADDR_HIGH_RLS_REG, val);
+	}
+
+	/* MVPP2_BM_VIRT_RLS_REG is not interpreted by HW, and simply
+	 * returned in the "cookie" field of the RX
+	 * descriptor. Instead of storing the virtual address, we
+	 * store the physical address
+	 */
+	mvpp2_percpu_write_relaxed(port->priv, cpu,
+				   MVPP2_BM_VIRT_RLS_REG, buf_phys_addr);
+	mvpp2_percpu_write_relaxed(port->priv, cpu,
+				   MVPP2_BM_PHY_RLS_REG(pool), buf_dma_addr);
+
+	put_cpu();
+}
+
+/* Allocate buffers for the pool */
+static int mvpp2_bm_bufs_add(struct mvpp2_port *port,
+			     struct mvpp2_bm_pool *bm_pool, int buf_num)
+{
+	int i, buf_size, total_size;
+	dma_addr_t dma_addr;
+	phys_addr_t phys_addr;
+	void *buf;
+
+	buf_size = MVPP2_RX_BUF_SIZE(bm_pool->pkt_size);
+	total_size = MVPP2_RX_TOTAL_SIZE(buf_size);
+
+	if (buf_num < 0 ||
+	    (buf_num + bm_pool->buf_num > bm_pool->size)) {
+		netdev_err(port->dev,
+			   "cannot allocate %d buffers for pool %d\n",
+			   buf_num, bm_pool->id);
+		return 0;
+	}
+
+	for (i = 0; i < buf_num; i++) {
+		buf = mvpp2_buf_alloc(port, bm_pool, &dma_addr,
+				      &phys_addr, GFP_KERNEL);
+		if (!buf)
+			break;
+
+		mvpp2_bm_pool_put(port, bm_pool->id, dma_addr,
+				  phys_addr);
+	}
+
+	/* Update BM driver with number of buffers added to pool */
+	bm_pool->buf_num += i;
+
+	netdev_dbg(port->dev,
+		   "pool %d: pkt_size=%4d, buf_size=%4d, total_size=%4d\n",
+		   bm_pool->id, bm_pool->pkt_size, buf_size, total_size);
+
+	netdev_dbg(port->dev,
+		   "pool %d: %d of %d buffers added\n",
+		   bm_pool->id, i, buf_num);
+	return i;
+}
+
+/* Notify the driver that BM pool is being used as specific type and return the
+ * pool pointer on success
+ */
+static struct mvpp2_bm_pool *
+mvpp2_bm_pool_use(struct mvpp2_port *port, unsigned pool, int pkt_size)
+{
+	struct mvpp2_bm_pool *new_pool = &port->priv->bm_pools[pool];
+	int num;
+
+	if (pool >= MVPP2_BM_POOLS_NUM) {
+		netdev_err(port->dev, "Invalid pool %d\n", pool);
+		return NULL;
+	}
+
+	/* Allocate buffers in case BM pool is used as long pool, but packet
+	 * size doesn't match MTU or BM pool hasn't being used yet
+	 */
+	if (new_pool->pkt_size == 0) {
+		int pkts_num;
+
+		/* Set default buffer number or free all the buffers in case
+		 * the pool is not empty
+		 */
+		pkts_num = new_pool->buf_num;
+		if (pkts_num == 0)
+			pkts_num = mvpp2_pools[pool].buf_num;
+		else
+			mvpp2_bm_bufs_free(port->dev->dev.parent,
+					   port->priv, new_pool, pkts_num);
+
+		new_pool->pkt_size = pkt_size;
+		new_pool->frag_size =
+			SKB_DATA_ALIGN(MVPP2_RX_BUF_SIZE(pkt_size)) +
+			MVPP2_SKB_SHINFO_SIZE;
+
+		/* Allocate buffers for this pool */
+		num = mvpp2_bm_bufs_add(port, new_pool, pkts_num);
+		if (num != pkts_num) {
+			WARN(1, "pool %d: %d of %d allocated\n",
+			     new_pool->id, num, pkts_num);
+			return NULL;
+		}
+	}
+
+	mvpp2_bm_pool_bufsize_set(port->priv, new_pool,
+				  MVPP2_RX_BUF_SIZE(new_pool->pkt_size));
+
+	return new_pool;
+}
+
+/* Initialize pools for swf */
+static int mvpp2_swf_bm_pool_init(struct mvpp2_port *port)
+{
+	int rxq;
+	enum mvpp2_bm_pool_log_num long_log_pool, short_log_pool;
+
+	/* If port pkt_size is higher than 1518B:
+	 * HW Long pool - SW Jumbo pool, HW Short pool - SW Long pool
+	 * else: HW Long pool - SW Long pool, HW Short pool - SW Short pool
+	 */
+	if (port->pkt_size > MVPP2_BM_LONG_PKT_SIZE) {
+		long_log_pool = MVPP2_BM_JUMBO;
+		short_log_pool = MVPP2_BM_LONG;
+	} else {
+		long_log_pool = MVPP2_BM_LONG;
+		short_log_pool = MVPP2_BM_SHORT;
+	}
+
+	if (!port->pool_long) {
+		port->pool_long =
+			mvpp2_bm_pool_use(port, long_log_pool,
+					  mvpp2_pools[long_log_pool].pkt_size);
+		if (!port->pool_long)
+			return -ENOMEM;
+
+		port->pool_long->port_map |= BIT(port->id);
+
+		for (rxq = 0; rxq < port->nrxqs; rxq++)
+			mvpp2_rxq_long_pool_set(port, rxq, port->pool_long->id);
+	}
+
+	if (!port->pool_short) {
+		port->pool_short =
+			mvpp2_bm_pool_use(port, short_log_pool,
+					  mvpp2_pools[short_log_pool].pkt_size);
+		if (!port->pool_short)
+			return -ENOMEM;
+
+		port->pool_short->port_map |= BIT(port->id);
+
+		for (rxq = 0; rxq < port->nrxqs; rxq++)
+			mvpp2_rxq_short_pool_set(port, rxq,
+						 port->pool_short->id);
+	}
+
+	return 0;
+}
+
+static int mvpp2_bm_update_mtu(struct net_device *dev, int mtu)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+	enum mvpp2_bm_pool_log_num new_long_pool;
+	int pkt_size = MVPP2_RX_PKT_SIZE(mtu);
+
+	/* If port MTU is higher than 1518B:
+	 * HW Long pool - SW Jumbo pool, HW Short pool - SW Long pool
+	 * else: HW Long pool - SW Long pool, HW Short pool - SW Short pool
+	 */
+	if (pkt_size > MVPP2_BM_LONG_PKT_SIZE)
+		new_long_pool = MVPP2_BM_JUMBO;
+	else
+		new_long_pool = MVPP2_BM_LONG;
+
+	if (new_long_pool != port->pool_long->id) {
+		/* Remove port from old short & long pool */
+		port->pool_long = mvpp2_bm_pool_use(port, port->pool_long->id,
+						    port->pool_long->pkt_size);
+		port->pool_long->port_map &= ~BIT(port->id);
+		port->pool_long = NULL;
+
+		port->pool_short = mvpp2_bm_pool_use(port, port->pool_short->id,
+						     port->pool_short->pkt_size);
+		port->pool_short->port_map &= ~BIT(port->id);
+		port->pool_short = NULL;
+
+		port->pkt_size =  pkt_size;
+
+		/* Add port to new short & long pool */
+		mvpp2_swf_bm_pool_init(port);
+
+		/* Update L4 checksum when jumbo enable/disable on port */
+		if (new_long_pool == MVPP2_BM_JUMBO && port->id != 0) {
+			dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
+			dev->hw_features &= ~(NETIF_F_IP_CSUM |
+					      NETIF_F_IPV6_CSUM);
+		} else {
+			dev->features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+			dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+		}
+	}
+
+	dev->mtu = mtu;
+	dev->wanted_features = dev->features;
+
+	netdev_update_features(dev);
+	return 0;
+}
+
+static inline void mvpp2_interrupts_enable(struct mvpp2_port *port)
+{
+	int i, sw_thread_mask = 0;
+
+	for (i = 0; i < port->nqvecs; i++)
+		sw_thread_mask |= port->qvecs[i].sw_thread_mask;
+
+	mvpp2_write(port->priv, MVPP2_ISR_ENABLE_REG(port->id),
+		    MVPP2_ISR_ENABLE_INTERRUPT(sw_thread_mask));
+}
+
+static inline void mvpp2_interrupts_disable(struct mvpp2_port *port)
+{
+	int i, sw_thread_mask = 0;
+
+	for (i = 0; i < port->nqvecs; i++)
+		sw_thread_mask |= port->qvecs[i].sw_thread_mask;
+
+	mvpp2_write(port->priv, MVPP2_ISR_ENABLE_REG(port->id),
+		    MVPP2_ISR_DISABLE_INTERRUPT(sw_thread_mask));
+}
+
+static inline void mvpp2_qvec_interrupt_enable(struct mvpp2_queue_vector *qvec)
+{
+	struct mvpp2_port *port = qvec->port;
+
+	mvpp2_write(port->priv, MVPP2_ISR_ENABLE_REG(port->id),
+		    MVPP2_ISR_ENABLE_INTERRUPT(qvec->sw_thread_mask));
+}
+
+static inline void mvpp2_qvec_interrupt_disable(struct mvpp2_queue_vector *qvec)
+{
+	struct mvpp2_port *port = qvec->port;
+
+	mvpp2_write(port->priv, MVPP2_ISR_ENABLE_REG(port->id),
+		    MVPP2_ISR_DISABLE_INTERRUPT(qvec->sw_thread_mask));
+}
+
+/* Mask the current CPU's Rx/Tx interrupts
+ * Called by on_each_cpu(), guaranteed to run with migration disabled,
+ * using smp_processor_id() is OK.
+ */
+static void mvpp2_interrupts_mask(void *arg)
+{
+	struct mvpp2_port *port = arg;
+
+	mvpp2_percpu_write(port->priv, smp_processor_id(),
+			   MVPP2_ISR_RX_TX_MASK_REG(port->id), 0);
+}
+
+/* Unmask the current CPU's Rx/Tx interrupts.
+ * Called by on_each_cpu(), guaranteed to run with migration disabled,
+ * using smp_processor_id() is OK.
+ */
+static void mvpp2_interrupts_unmask(void *arg)
+{
+	struct mvpp2_port *port = arg;
+	u32 val;
+
+	val = MVPP2_CAUSE_MISC_SUM_MASK |
+		MVPP2_CAUSE_RXQ_OCCUP_DESC_ALL_MASK;
+	if (port->has_tx_irqs)
+		val |= MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_MASK;
+
+	mvpp2_percpu_write(port->priv, smp_processor_id(),
+			   MVPP2_ISR_RX_TX_MASK_REG(port->id), val);
+}
+
+static void
+mvpp2_shared_interrupt_mask_unmask(struct mvpp2_port *port, bool mask)
+{
+	u32 val;
+	int i;
+
+	if (port->priv->hw_version != MVPP22)
+		return;
+
+	if (mask)
+		val = 0;
+	else
+		val = MVPP2_CAUSE_RXQ_OCCUP_DESC_ALL_MASK;
+
+	for (i = 0; i < port->nqvecs; i++) {
+		struct mvpp2_queue_vector *v = port->qvecs + i;
+
+		if (v->type != MVPP2_QUEUE_VECTOR_SHARED)
+			continue;
+
+		mvpp2_percpu_write(port->priv, v->sw_thread_id,
+				   MVPP2_ISR_RX_TX_MASK_REG(port->id), val);
+	}
+}
+
+/* Port configuration routines */
+
+static void mvpp22_gop_init_rgmii(struct mvpp2_port *port)
+{
+	struct mvpp2 *priv = port->priv;
+	u32 val;
+
+	regmap_read(priv->sysctrl_base, GENCONF_PORT_CTRL0, &val);
+	val |= GENCONF_PORT_CTRL0_BUS_WIDTH_SELECT;
+	regmap_write(priv->sysctrl_base, GENCONF_PORT_CTRL0, val);
+
+	regmap_read(priv->sysctrl_base, GENCONF_CTRL0, &val);
+	if (port->gop_id == 2)
+		val |= GENCONF_CTRL0_PORT0_RGMII | GENCONF_CTRL0_PORT1_RGMII;
+	else if (port->gop_id == 3)
+		val |= GENCONF_CTRL0_PORT1_RGMII_MII;
+	regmap_write(priv->sysctrl_base, GENCONF_CTRL0, val);
+}
+
+static void mvpp22_gop_init_sgmii(struct mvpp2_port *port)
+{
+	struct mvpp2 *priv = port->priv;
+	u32 val;
+
+	regmap_read(priv->sysctrl_base, GENCONF_PORT_CTRL0, &val);
+	val |= GENCONF_PORT_CTRL0_BUS_WIDTH_SELECT |
+	       GENCONF_PORT_CTRL0_RX_DATA_SAMPLE;
+	regmap_write(priv->sysctrl_base, GENCONF_PORT_CTRL0, val);
+
+	if (port->gop_id > 1) {
+		regmap_read(priv->sysctrl_base, GENCONF_CTRL0, &val);
+		if (port->gop_id == 2)
+			val &= ~GENCONF_CTRL0_PORT0_RGMII;
+		else if (port->gop_id == 3)
+			val &= ~GENCONF_CTRL0_PORT1_RGMII_MII;
+		regmap_write(priv->sysctrl_base, GENCONF_CTRL0, val);
+	}
+}
+
+static void mvpp22_gop_init_10gkr(struct mvpp2_port *port)
+{
+	struct mvpp2 *priv = port->priv;
+	void __iomem *mpcs = priv->iface_base + MVPP22_MPCS_BASE(port->gop_id);
+	void __iomem *xpcs = priv->iface_base + MVPP22_XPCS_BASE(port->gop_id);
+	u32 val;
+
+	/* XPCS */
+	val = readl(xpcs + MVPP22_XPCS_CFG0);
+	val &= ~(MVPP22_XPCS_CFG0_PCS_MODE(0x3) |
+		 MVPP22_XPCS_CFG0_ACTIVE_LANE(0x3));
+	val |= MVPP22_XPCS_CFG0_ACTIVE_LANE(2);
+	writel(val, xpcs + MVPP22_XPCS_CFG0);
+
+	/* MPCS */
+	val = readl(mpcs + MVPP22_MPCS_CTRL);
+	val &= ~MVPP22_MPCS_CTRL_FWD_ERR_CONN;
+	writel(val, mpcs + MVPP22_MPCS_CTRL);
+
+	val = readl(mpcs + MVPP22_MPCS_CLK_RESET);
+	val &= ~(MVPP22_MPCS_CLK_RESET_DIV_RATIO(0x7) | MAC_CLK_RESET_MAC |
+		 MAC_CLK_RESET_SD_RX | MAC_CLK_RESET_SD_TX);
+	val |= MVPP22_MPCS_CLK_RESET_DIV_RATIO(1);
+	writel(val, mpcs + MVPP22_MPCS_CLK_RESET);
+
+	val &= ~MVPP22_MPCS_CLK_RESET_DIV_SET;
+	val |= MAC_CLK_RESET_MAC | MAC_CLK_RESET_SD_RX | MAC_CLK_RESET_SD_TX;
+	writel(val, mpcs + MVPP22_MPCS_CLK_RESET);
+}
+
+static int mvpp22_gop_init(struct mvpp2_port *port)
+{
+	struct mvpp2 *priv = port->priv;
+	u32 val;
+
+	if (!priv->sysctrl_base)
+		return 0;
+
+	switch (port->phy_interface) {
+	case PHY_INTERFACE_MODE_RGMII:
+	case PHY_INTERFACE_MODE_RGMII_ID:
+	case PHY_INTERFACE_MODE_RGMII_RXID:
+	case PHY_INTERFACE_MODE_RGMII_TXID:
+		if (port->gop_id == 0)
+			goto invalid_conf;
+		mvpp22_gop_init_rgmii(port);
+		break;
+	case PHY_INTERFACE_MODE_SGMII:
+	case PHY_INTERFACE_MODE_1000BASEX:
+	case PHY_INTERFACE_MODE_2500BASEX:
+		mvpp22_gop_init_sgmii(port);
+		break;
+	case PHY_INTERFACE_MODE_10GKR:
+		if (port->gop_id != 0)
+			goto invalid_conf;
+		mvpp22_gop_init_10gkr(port);
+		break;
+	default:
+		goto unsupported_conf;
+	}
+
+	regmap_read(priv->sysctrl_base, GENCONF_PORT_CTRL1, &val);
+	val |= GENCONF_PORT_CTRL1_RESET(port->gop_id) |
+	       GENCONF_PORT_CTRL1_EN(port->gop_id);
+	regmap_write(priv->sysctrl_base, GENCONF_PORT_CTRL1, val);
+
+	regmap_read(priv->sysctrl_base, GENCONF_PORT_CTRL0, &val);
+	val |= GENCONF_PORT_CTRL0_CLK_DIV_PHASE_CLR;
+	regmap_write(priv->sysctrl_base, GENCONF_PORT_CTRL0, val);
+
+	regmap_read(priv->sysctrl_base, GENCONF_SOFT_RESET1, &val);
+	val |= GENCONF_SOFT_RESET1_GOP;
+	regmap_write(priv->sysctrl_base, GENCONF_SOFT_RESET1, val);
+
+unsupported_conf:
+	return 0;
+
+invalid_conf:
+	netdev_err(port->dev, "Invalid port configuration\n");
+	return -EINVAL;
+}
+
+static void mvpp22_gop_unmask_irq(struct mvpp2_port *port)
+{
+	u32 val;
+
+	if (phy_interface_mode_is_rgmii(port->phy_interface) ||
+	    port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
+	    port->phy_interface == PHY_INTERFACE_MODE_1000BASEX ||
+	    port->phy_interface == PHY_INTERFACE_MODE_2500BASEX) {
+		/* Enable the GMAC link status irq for this port */
+		val = readl(port->base + MVPP22_GMAC_INT_SUM_MASK);
+		val |= MVPP22_GMAC_INT_SUM_MASK_LINK_STAT;
+		writel(val, port->base + MVPP22_GMAC_INT_SUM_MASK);
+	}
+
+	if (port->gop_id == 0) {
+		/* Enable the XLG/GIG irqs for this port */
+		val = readl(port->base + MVPP22_XLG_EXT_INT_MASK);
+		if (port->phy_interface == PHY_INTERFACE_MODE_10GKR)
+			val |= MVPP22_XLG_EXT_INT_MASK_XLG;
+		else
+			val |= MVPP22_XLG_EXT_INT_MASK_GIG;
+		writel(val, port->base + MVPP22_XLG_EXT_INT_MASK);
+	}
+}
+
+static void mvpp22_gop_mask_irq(struct mvpp2_port *port)
+{
+	u32 val;
+
+	if (port->gop_id == 0) {
+		val = readl(port->base + MVPP22_XLG_EXT_INT_MASK);
+		val &= ~(MVPP22_XLG_EXT_INT_MASK_XLG |
+			 MVPP22_XLG_EXT_INT_MASK_GIG);
+		writel(val, port->base + MVPP22_XLG_EXT_INT_MASK);
+	}
+
+	if (phy_interface_mode_is_rgmii(port->phy_interface) ||
+	    port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
+	    port->phy_interface == PHY_INTERFACE_MODE_1000BASEX ||
+	    port->phy_interface == PHY_INTERFACE_MODE_2500BASEX) {
+		val = readl(port->base + MVPP22_GMAC_INT_SUM_MASK);
+		val &= ~MVPP22_GMAC_INT_SUM_MASK_LINK_STAT;
+		writel(val, port->base + MVPP22_GMAC_INT_SUM_MASK);
+	}
+}
+
+static void mvpp22_gop_setup_irq(struct mvpp2_port *port)
+{
+	u32 val;
+
+	if (phy_interface_mode_is_rgmii(port->phy_interface) ||
+	    port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
+	    port->phy_interface == PHY_INTERFACE_MODE_1000BASEX ||
+	    port->phy_interface == PHY_INTERFACE_MODE_2500BASEX) {
+		val = readl(port->base + MVPP22_GMAC_INT_MASK);
+		val |= MVPP22_GMAC_INT_MASK_LINK_STAT;
+		writel(val, port->base + MVPP22_GMAC_INT_MASK);
+	}
+
+	if (port->gop_id == 0) {
+		val = readl(port->base + MVPP22_XLG_INT_MASK);
+		val |= MVPP22_XLG_INT_MASK_LINK;
+		writel(val, port->base + MVPP22_XLG_INT_MASK);
+	}
+
+	mvpp22_gop_unmask_irq(port);
+}
+
+/* Sets the PHY mode of the COMPHY (which configures the serdes lanes).
+ *
+ * The PHY mode used by the PPv2 driver comes from the network subsystem, while
+ * the one given to the COMPHY comes from the generic PHY subsystem. Hence they
+ * differ.
+ *
+ * The COMPHY configures the serdes lanes regardless of the actual use of the
+ * lanes by the physical layer. This is why configurations like
+ * "PPv2 (2500BaseX) - COMPHY (2500SGMII)" are valid.
+ */
+static int mvpp22_comphy_init(struct mvpp2_port *port)
+{
+	enum phy_mode mode;
+	int ret;
+
+	if (!port->comphy)
+		return 0;
+
+	switch (port->phy_interface) {
+	case PHY_INTERFACE_MODE_SGMII:
+	case PHY_INTERFACE_MODE_1000BASEX:
+		mode = PHY_MODE_SGMII;
+		break;
+	case PHY_INTERFACE_MODE_2500BASEX:
+		mode = PHY_MODE_2500SGMII;
+		break;
+	case PHY_INTERFACE_MODE_10GKR:
+		mode = PHY_MODE_10GKR;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	ret = phy_set_mode(port->comphy, mode);
+	if (ret)
+		return ret;
+
+	return phy_power_on(port->comphy);
+}
+
+static void mvpp2_port_enable(struct mvpp2_port *port)
+{
+	u32 val;
+
+	/* Only GOP port 0 has an XLG MAC */
+	if (port->gop_id == 0 &&
+	    (port->phy_interface == PHY_INTERFACE_MODE_XAUI ||
+	     port->phy_interface == PHY_INTERFACE_MODE_10GKR)) {
+		val = readl(port->base + MVPP22_XLG_CTRL0_REG);
+		val |= MVPP22_XLG_CTRL0_PORT_EN |
+		       MVPP22_XLG_CTRL0_MAC_RESET_DIS;
+		val &= ~MVPP22_XLG_CTRL0_MIB_CNT_DIS;
+		writel(val, port->base + MVPP22_XLG_CTRL0_REG);
+	} else {
+		val = readl(port->base + MVPP2_GMAC_CTRL_0_REG);
+		val |= MVPP2_GMAC_PORT_EN_MASK;
+		val |= MVPP2_GMAC_MIB_CNTR_EN_MASK;
+		writel(val, port->base + MVPP2_GMAC_CTRL_0_REG);
+	}
+}
+
+static void mvpp2_port_disable(struct mvpp2_port *port)
+{
+	u32 val;
+
+	/* Only GOP port 0 has an XLG MAC */
+	if (port->gop_id == 0 &&
+	    (port->phy_interface == PHY_INTERFACE_MODE_XAUI ||
+	     port->phy_interface == PHY_INTERFACE_MODE_10GKR)) {
+		val = readl(port->base + MVPP22_XLG_CTRL0_REG);
+		val &= ~MVPP22_XLG_CTRL0_PORT_EN;
+		writel(val, port->base + MVPP22_XLG_CTRL0_REG);
+
+		/* Disable & reset should be done separately */
+		val &= ~MVPP22_XLG_CTRL0_MAC_RESET_DIS;
+		writel(val, port->base + MVPP22_XLG_CTRL0_REG);
+	} else {
+		val = readl(port->base + MVPP2_GMAC_CTRL_0_REG);
+		val &= ~(MVPP2_GMAC_PORT_EN_MASK);
+		writel(val, port->base + MVPP2_GMAC_CTRL_0_REG);
+	}
+}
+
+/* Set IEEE 802.3x Flow Control Xon Packet Transmission Mode */
+static void mvpp2_port_periodic_xon_disable(struct mvpp2_port *port)
+{
+	u32 val;
+
+	val = readl(port->base + MVPP2_GMAC_CTRL_1_REG) &
+		    ~MVPP2_GMAC_PERIODIC_XON_EN_MASK;
+	writel(val, port->base + MVPP2_GMAC_CTRL_1_REG);
+}
+
+/* Configure loopback port */
+static void mvpp2_port_loopback_set(struct mvpp2_port *port,
+				    const struct phylink_link_state *state)
+{
+	u32 val;
+
+	val = readl(port->base + MVPP2_GMAC_CTRL_1_REG);
+
+	if (state->speed == 1000)
+		val |= MVPP2_GMAC_GMII_LB_EN_MASK;
+	else
+		val &= ~MVPP2_GMAC_GMII_LB_EN_MASK;
+
+	if (port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
+	    port->phy_interface == PHY_INTERFACE_MODE_1000BASEX ||
+	    port->phy_interface == PHY_INTERFACE_MODE_2500BASEX)
+		val |= MVPP2_GMAC_PCS_LB_EN_MASK;
+	else
+		val &= ~MVPP2_GMAC_PCS_LB_EN_MASK;
+
+	writel(val, port->base + MVPP2_GMAC_CTRL_1_REG);
+}
+
+struct mvpp2_ethtool_counter {
+	unsigned int offset;
+	const char string[ETH_GSTRING_LEN];
+	bool reg_is_64b;
+};
+
+static u64 mvpp2_read_count(struct mvpp2_port *port,
+			    const struct mvpp2_ethtool_counter *counter)
+{
+	u64 val;
+
+	val = readl(port->stats_base + counter->offset);
+	if (counter->reg_is_64b)
+		val += (u64)readl(port->stats_base + counter->offset + 4) << 32;
+
+	return val;
+}
+
+/* Due to the fact that software statistics and hardware statistics are, by
+ * design, incremented at different moments in the chain of packet processing,
+ * it is very likely that incoming packets could have been dropped after being
+ * counted by hardware but before reaching software statistics (most probably
+ * multicast packets), and in the oppposite way, during transmission, FCS bytes
+ * are added in between as well as TSO skb will be split and header bytes added.
+ * Hence, statistics gathered from userspace with ifconfig (software) and
+ * ethtool (hardware) cannot be compared.
+ */
+static const struct mvpp2_ethtool_counter mvpp2_ethtool_regs[] = {
+	{ MVPP2_MIB_GOOD_OCTETS_RCVD, "good_octets_received", true },
+	{ MVPP2_MIB_BAD_OCTETS_RCVD, "bad_octets_received" },
+	{ MVPP2_MIB_CRC_ERRORS_SENT, "crc_errors_sent" },
+	{ MVPP2_MIB_UNICAST_FRAMES_RCVD, "unicast_frames_received" },
+	{ MVPP2_MIB_BROADCAST_FRAMES_RCVD, "broadcast_frames_received" },
+	{ MVPP2_MIB_MULTICAST_FRAMES_RCVD, "multicast_frames_received" },
+	{ MVPP2_MIB_FRAMES_64_OCTETS, "frames_64_octets" },
+	{ MVPP2_MIB_FRAMES_65_TO_127_OCTETS, "frames_65_to_127_octet" },
+	{ MVPP2_MIB_FRAMES_128_TO_255_OCTETS, "frames_128_to_255_octet" },
+	{ MVPP2_MIB_FRAMES_256_TO_511_OCTETS, "frames_256_to_511_octet" },
+	{ MVPP2_MIB_FRAMES_512_TO_1023_OCTETS, "frames_512_to_1023_octet" },
+	{ MVPP2_MIB_FRAMES_1024_TO_MAX_OCTETS, "frames_1024_to_max_octet" },
+	{ MVPP2_MIB_GOOD_OCTETS_SENT, "good_octets_sent", true },
+	{ MVPP2_MIB_UNICAST_FRAMES_SENT, "unicast_frames_sent" },
+	{ MVPP2_MIB_MULTICAST_FRAMES_SENT, "multicast_frames_sent" },
+	{ MVPP2_MIB_BROADCAST_FRAMES_SENT, "broadcast_frames_sent" },
+	{ MVPP2_MIB_FC_SENT, "fc_sent" },
+	{ MVPP2_MIB_FC_RCVD, "fc_received" },
+	{ MVPP2_MIB_RX_FIFO_OVERRUN, "rx_fifo_overrun" },
+	{ MVPP2_MIB_UNDERSIZE_RCVD, "undersize_received" },
+	{ MVPP2_MIB_FRAGMENTS_RCVD, "fragments_received" },
+	{ MVPP2_MIB_OVERSIZE_RCVD, "oversize_received" },
+	{ MVPP2_MIB_JABBER_RCVD, "jabber_received" },
+	{ MVPP2_MIB_MAC_RCV_ERROR, "mac_receive_error" },
+	{ MVPP2_MIB_BAD_CRC_EVENT, "bad_crc_event" },
+	{ MVPP2_MIB_COLLISION, "collision" },
+	{ MVPP2_MIB_LATE_COLLISION, "late_collision" },
+};
+
+static void mvpp2_ethtool_get_strings(struct net_device *netdev, u32 sset,
+				      u8 *data)
+{
+	if (sset == ETH_SS_STATS) {
+		int i;
+
+		for (i = 0; i < ARRAY_SIZE(mvpp2_ethtool_regs); i++)
+			memcpy(data + i * ETH_GSTRING_LEN,
+			       &mvpp2_ethtool_regs[i].string, ETH_GSTRING_LEN);
+	}
+}
+
+static void mvpp2_gather_hw_statistics(struct work_struct *work)
+{
+	struct delayed_work *del_work = to_delayed_work(work);
+	struct mvpp2_port *port = container_of(del_work, struct mvpp2_port,
+					       stats_work);
+	u64 *pstats;
+	int i;
+
+	mutex_lock(&port->gather_stats_lock);
+
+	pstats = port->ethtool_stats;
+	for (i = 0; i < ARRAY_SIZE(mvpp2_ethtool_regs); i++)
+		*pstats++ += mvpp2_read_count(port, &mvpp2_ethtool_regs[i]);
+
+	/* No need to read again the counters right after this function if it
+	 * was called asynchronously by the user (ie. use of ethtool).
+	 */
+	cancel_delayed_work(&port->stats_work);
+	queue_delayed_work(port->priv->stats_queue, &port->stats_work,
+			   MVPP2_MIB_COUNTERS_STATS_DELAY);
+
+	mutex_unlock(&port->gather_stats_lock);
+}
+
+static void mvpp2_ethtool_get_stats(struct net_device *dev,
+				    struct ethtool_stats *stats, u64 *data)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+
+	/* Update statistics for the given port, then take the lock to avoid
+	 * concurrent accesses on the ethtool_stats structure during its copy.
+	 */
+	mvpp2_gather_hw_statistics(&port->stats_work.work);
+
+	mutex_lock(&port->gather_stats_lock);
+	memcpy(data, port->ethtool_stats,
+	       sizeof(u64) * ARRAY_SIZE(mvpp2_ethtool_regs));
+	mutex_unlock(&port->gather_stats_lock);
+}
+
+static int mvpp2_ethtool_get_sset_count(struct net_device *dev, int sset)
+{
+	if (sset == ETH_SS_STATS)
+		return ARRAY_SIZE(mvpp2_ethtool_regs);
+
+	return -EOPNOTSUPP;
+}
+
+static void mvpp2_port_reset(struct mvpp2_port *port)
+{
+	u32 val;
+	unsigned int i;
+
+	/* Read the GOP statistics to reset the hardware counters */
+	for (i = 0; i < ARRAY_SIZE(mvpp2_ethtool_regs); i++)
+		mvpp2_read_count(port, &mvpp2_ethtool_regs[i]);
+
+	val = readl(port->base + MVPP2_GMAC_CTRL_2_REG) &
+		    ~MVPP2_GMAC_PORT_RESET_MASK;
+	writel(val, port->base + MVPP2_GMAC_CTRL_2_REG);
+
+	while (readl(port->base + MVPP2_GMAC_CTRL_2_REG) &
+	       MVPP2_GMAC_PORT_RESET_MASK)
+		continue;
+}
+
+/* Change maximum receive size of the port */
+static inline void mvpp2_gmac_max_rx_size_set(struct mvpp2_port *port)
+{
+	u32 val;
+
+	val = readl(port->base + MVPP2_GMAC_CTRL_0_REG);
+	val &= ~MVPP2_GMAC_MAX_RX_SIZE_MASK;
+	val |= (((port->pkt_size - MVPP2_MH_SIZE) / 2) <<
+		    MVPP2_GMAC_MAX_RX_SIZE_OFFS);
+	writel(val, port->base + MVPP2_GMAC_CTRL_0_REG);
+}
+
+/* Change maximum receive size of the port */
+static inline void mvpp2_xlg_max_rx_size_set(struct mvpp2_port *port)
+{
+	u32 val;
+
+	val =  readl(port->base + MVPP22_XLG_CTRL1_REG);
+	val &= ~MVPP22_XLG_CTRL1_FRAMESIZELIMIT_MASK;
+	val |= ((port->pkt_size - MVPP2_MH_SIZE) / 2) <<
+	       MVPP22_XLG_CTRL1_FRAMESIZELIMIT_OFFS;
+	writel(val, port->base + MVPP22_XLG_CTRL1_REG);
+}
+
+/* Set defaults to the MVPP2 port */
+static void mvpp2_defaults_set(struct mvpp2_port *port)
+{
+	int tx_port_num, val, queue, ptxq, lrxq;
+
+	if (port->priv->hw_version == MVPP21) {
+		/* Update TX FIFO MIN Threshold */
+		val = readl(port->base + MVPP2_GMAC_PORT_FIFO_CFG_1_REG);
+		val &= ~MVPP2_GMAC_TX_FIFO_MIN_TH_ALL_MASK;
+		/* Min. TX threshold must be less than minimal packet length */
+		val |= MVPP2_GMAC_TX_FIFO_MIN_TH_MASK(64 - 4 - 2);
+		writel(val, port->base + MVPP2_GMAC_PORT_FIFO_CFG_1_REG);
+	}
+
+	/* Disable Legacy WRR, Disable EJP, Release from reset */
+	tx_port_num = mvpp2_egress_port(port);
+	mvpp2_write(port->priv, MVPP2_TXP_SCHED_PORT_INDEX_REG,
+		    tx_port_num);
+	mvpp2_write(port->priv, MVPP2_TXP_SCHED_CMD_1_REG, 0);
+
+	/* Close bandwidth for all queues */
+	for (queue = 0; queue < MVPP2_MAX_TXQ; queue++) {
+		ptxq = mvpp2_txq_phys(port->id, queue);
+		mvpp2_write(port->priv,
+			    MVPP2_TXQ_SCHED_TOKEN_CNTR_REG(ptxq), 0);
+	}
+
+	/* Set refill period to 1 usec, refill tokens
+	 * and bucket size to maximum
+	 */
+	mvpp2_write(port->priv, MVPP2_TXP_SCHED_PERIOD_REG,
+		    port->priv->tclk / USEC_PER_SEC);
+	val = mvpp2_read(port->priv, MVPP2_TXP_SCHED_REFILL_REG);
+	val &= ~MVPP2_TXP_REFILL_PERIOD_ALL_MASK;
+	val |= MVPP2_TXP_REFILL_PERIOD_MASK(1);
+	val |= MVPP2_TXP_REFILL_TOKENS_ALL_MASK;
+	mvpp2_write(port->priv, MVPP2_TXP_SCHED_REFILL_REG, val);
+	val = MVPP2_TXP_TOKEN_SIZE_MAX;
+	mvpp2_write(port->priv, MVPP2_TXP_SCHED_TOKEN_SIZE_REG, val);
+
+	/* Set MaximumLowLatencyPacketSize value to 256 */
+	mvpp2_write(port->priv, MVPP2_RX_CTRL_REG(port->id),
+		    MVPP2_RX_USE_PSEUDO_FOR_CSUM_MASK |
+		    MVPP2_RX_LOW_LATENCY_PKT_SIZE(256));
+
+	/* Enable Rx cache snoop */
+	for (lrxq = 0; lrxq < port->nrxqs; lrxq++) {
+		queue = port->rxqs[lrxq]->id;
+		val = mvpp2_read(port->priv, MVPP2_RXQ_CONFIG_REG(queue));
+		val |= MVPP2_SNOOP_PKT_SIZE_MASK |
+			   MVPP2_SNOOP_BUF_HDR_MASK;
+		mvpp2_write(port->priv, MVPP2_RXQ_CONFIG_REG(queue), val);
+	}
+
+	/* At default, mask all interrupts to all present cpus */
+	mvpp2_interrupts_disable(port);
+}
+
+/* Enable/disable receiving packets */
+static void mvpp2_ingress_enable(struct mvpp2_port *port)
+{
+	u32 val;
+	int lrxq, queue;
+
+	for (lrxq = 0; lrxq < port->nrxqs; lrxq++) {
+		queue = port->rxqs[lrxq]->id;
+		val = mvpp2_read(port->priv, MVPP2_RXQ_CONFIG_REG(queue));
+		val &= ~MVPP2_RXQ_DISABLE_MASK;
+		mvpp2_write(port->priv, MVPP2_RXQ_CONFIG_REG(queue), val);
+	}
+}
+
+static void mvpp2_ingress_disable(struct mvpp2_port *port)
+{
+	u32 val;
+	int lrxq, queue;
+
+	for (lrxq = 0; lrxq < port->nrxqs; lrxq++) {
+		queue = port->rxqs[lrxq]->id;
+		val = mvpp2_read(port->priv, MVPP2_RXQ_CONFIG_REG(queue));
+		val |= MVPP2_RXQ_DISABLE_MASK;
+		mvpp2_write(port->priv, MVPP2_RXQ_CONFIG_REG(queue), val);
+	}
+}
+
+/* Enable transmit via physical egress queue
+ * - HW starts take descriptors from DRAM
+ */
+static void mvpp2_egress_enable(struct mvpp2_port *port)
+{
+	u32 qmap;
+	int queue;
+	int tx_port_num = mvpp2_egress_port(port);
+
+	/* Enable all initialized TXs. */
+	qmap = 0;
+	for (queue = 0; queue < port->ntxqs; queue++) {
+		struct mvpp2_tx_queue *txq = port->txqs[queue];
+
+		if (txq->descs)
+			qmap |= (1 << queue);
+	}
+
+	mvpp2_write(port->priv, MVPP2_TXP_SCHED_PORT_INDEX_REG, tx_port_num);
+	mvpp2_write(port->priv, MVPP2_TXP_SCHED_Q_CMD_REG, qmap);
+}
+
+/* Disable transmit via physical egress queue
+ * - HW doesn't take descriptors from DRAM
+ */
+static void mvpp2_egress_disable(struct mvpp2_port *port)
+{
+	u32 reg_data;
+	int delay;
+	int tx_port_num = mvpp2_egress_port(port);
+
+	/* Issue stop command for active channels only */
+	mvpp2_write(port->priv, MVPP2_TXP_SCHED_PORT_INDEX_REG, tx_port_num);
+	reg_data = (mvpp2_read(port->priv, MVPP2_TXP_SCHED_Q_CMD_REG)) &
+		    MVPP2_TXP_SCHED_ENQ_MASK;
+	if (reg_data != 0)
+		mvpp2_write(port->priv, MVPP2_TXP_SCHED_Q_CMD_REG,
+			    (reg_data << MVPP2_TXP_SCHED_DISQ_OFFSET));
+
+	/* Wait for all Tx activity to terminate. */
+	delay = 0;
+	do {
+		if (delay >= MVPP2_TX_DISABLE_TIMEOUT_MSEC) {
+			netdev_warn(port->dev,
+				    "Tx stop timed out, status=0x%08x\n",
+				    reg_data);
+			break;
+		}
+		mdelay(1);
+		delay++;
+
+		/* Check port TX Command register that all
+		 * Tx queues are stopped
+		 */
+		reg_data = mvpp2_read(port->priv, MVPP2_TXP_SCHED_Q_CMD_REG);
+	} while (reg_data & MVPP2_TXP_SCHED_ENQ_MASK);
+}
+
+/* Rx descriptors helper methods */
+
+/* Get number of Rx descriptors occupied by received packets */
+static inline int
+mvpp2_rxq_received(struct mvpp2_port *port, int rxq_id)
+{
+	u32 val = mvpp2_read(port->priv, MVPP2_RXQ_STATUS_REG(rxq_id));
+
+	return val & MVPP2_RXQ_OCCUPIED_MASK;
+}
+
+/* Update Rx queue status with the number of occupied and available
+ * Rx descriptor slots.
+ */
+static inline void
+mvpp2_rxq_status_update(struct mvpp2_port *port, int rxq_id,
+			int used_count, int free_count)
+{
+	/* Decrement the number of used descriptors and increment count
+	 * increment the number of free descriptors.
+	 */
+	u32 val = used_count | (free_count << MVPP2_RXQ_NUM_NEW_OFFSET);
+
+	mvpp2_write(port->priv, MVPP2_RXQ_STATUS_UPDATE_REG(rxq_id), val);
+}
+
+/* Get pointer to next RX descriptor to be processed by SW */
+static inline struct mvpp2_rx_desc *
+mvpp2_rxq_next_desc_get(struct mvpp2_rx_queue *rxq)
+{
+	int rx_desc = rxq->next_desc_to_proc;
+
+	rxq->next_desc_to_proc = MVPP2_QUEUE_NEXT_DESC(rxq, rx_desc);
+	prefetch(rxq->descs + rxq->next_desc_to_proc);
+	return rxq->descs + rx_desc;
+}
+
+/* Set rx queue offset */
+static void mvpp2_rxq_offset_set(struct mvpp2_port *port,
+				 int prxq, int offset)
+{
+	u32 val;
+
+	/* Convert offset from bytes to units of 32 bytes */
+	offset = offset >> 5;
+
+	val = mvpp2_read(port->priv, MVPP2_RXQ_CONFIG_REG(prxq));
+	val &= ~MVPP2_RXQ_PACKET_OFFSET_MASK;
+
+	/* Offset is in */
+	val |= ((offset << MVPP2_RXQ_PACKET_OFFSET_OFFS) &
+		    MVPP2_RXQ_PACKET_OFFSET_MASK);
+
+	mvpp2_write(port->priv, MVPP2_RXQ_CONFIG_REG(prxq), val);
+}
+
+/* Tx descriptors helper methods */
+
+/* Get pointer to next Tx descriptor to be processed (send) by HW */
+static struct mvpp2_tx_desc *
+mvpp2_txq_next_desc_get(struct mvpp2_tx_queue *txq)
+{
+	int tx_desc = txq->next_desc_to_proc;
+
+	txq->next_desc_to_proc = MVPP2_QUEUE_NEXT_DESC(txq, tx_desc);
+	return txq->descs + tx_desc;
+}
+
+/* Update HW with number of aggregated Tx descriptors to be sent
+ *
+ * Called only from mvpp2_tx(), so migration is disabled, using
+ * smp_processor_id() is OK.
+ */
+static void mvpp2_aggr_txq_pend_desc_add(struct mvpp2_port *port, int pending)
+{
+	/* aggregated access - relevant TXQ number is written in TX desc */
+	mvpp2_percpu_write(port->priv, smp_processor_id(),
+			   MVPP2_AGGR_TXQ_UPDATE_REG, pending);
+}
+
+/* Check if there are enough free descriptors in aggregated txq.
+ * If not, update the number of occupied descriptors and repeat the check.
+ *
+ * Called only from mvpp2_tx(), so migration is disabled, using
+ * smp_processor_id() is OK.
+ */
+static int mvpp2_aggr_desc_num_check(struct mvpp2 *priv,
+				     struct mvpp2_tx_queue *aggr_txq, int num)
+{
+	if ((aggr_txq->count + num) > MVPP2_AGGR_TXQ_SIZE) {
+		/* Update number of occupied aggregated Tx descriptors */
+		int cpu = smp_processor_id();
+		u32 val = mvpp2_read_relaxed(priv,
+					     MVPP2_AGGR_TXQ_STATUS_REG(cpu));
+
+		aggr_txq->count = val & MVPP2_AGGR_TXQ_PENDING_MASK;
+
+		if ((aggr_txq->count + num) > MVPP2_AGGR_TXQ_SIZE)
+			return -ENOMEM;
+	}
+	return 0;
+}
+
+/* Reserved Tx descriptors allocation request
+ *
+ * Called only from mvpp2_txq_reserved_desc_num_proc(), itself called
+ * only by mvpp2_tx(), so migration is disabled, using
+ * smp_processor_id() is OK.
+ */
+static int mvpp2_txq_alloc_reserved_desc(struct mvpp2 *priv,
+					 struct mvpp2_tx_queue *txq, int num)
+{
+	u32 val;
+	int cpu = smp_processor_id();
+
+	val = (txq->id << MVPP2_TXQ_RSVD_REQ_Q_OFFSET) | num;
+	mvpp2_percpu_write_relaxed(priv, cpu, MVPP2_TXQ_RSVD_REQ_REG, val);
+
+	val = mvpp2_percpu_read_relaxed(priv, cpu, MVPP2_TXQ_RSVD_RSLT_REG);
+
+	return val & MVPP2_TXQ_RSVD_RSLT_MASK;
+}
+
+/* Check if there are enough reserved descriptors for transmission.
+ * If not, request chunk of reserved descriptors and check again.
+ */
+static int mvpp2_txq_reserved_desc_num_proc(struct mvpp2 *priv,
+					    struct mvpp2_tx_queue *txq,
+					    struct mvpp2_txq_pcpu *txq_pcpu,
+					    int num)
+{
+	int req, cpu, desc_count;
+
+	if (txq_pcpu->reserved_num >= num)
+		return 0;
+
+	/* Not enough descriptors reserved! Update the reserved descriptor
+	 * count and check again.
+	 */
+
+	desc_count = 0;
+	/* Compute total of used descriptors */
+	for_each_present_cpu(cpu) {
+		struct mvpp2_txq_pcpu *txq_pcpu_aux;
+
+		txq_pcpu_aux = per_cpu_ptr(txq->pcpu, cpu);
+		desc_count += txq_pcpu_aux->count;
+		desc_count += txq_pcpu_aux->reserved_num;
+	}
+
+	req = max(MVPP2_CPU_DESC_CHUNK, num - txq_pcpu->reserved_num);
+	desc_count += req;
+
+	if (desc_count >
+	   (txq->size - (num_present_cpus() * MVPP2_CPU_DESC_CHUNK)))
+		return -ENOMEM;
+
+	txq_pcpu->reserved_num += mvpp2_txq_alloc_reserved_desc(priv, txq, req);
+
+	/* OK, the descriptor could have been updated: check again. */
+	if (txq_pcpu->reserved_num < num)
+		return -ENOMEM;
+	return 0;
+}
+
+/* Release the last allocated Tx descriptor. Useful to handle DMA
+ * mapping failures in the Tx path.
+ */
+static void mvpp2_txq_desc_put(struct mvpp2_tx_queue *txq)
+{
+	if (txq->next_desc_to_proc == 0)
+		txq->next_desc_to_proc = txq->last_desc - 1;
+	else
+		txq->next_desc_to_proc--;
+}
+
+/* Set Tx descriptors fields relevant for CSUM calculation */
+static u32 mvpp2_txq_desc_csum(int l3_offs, int l3_proto,
+			       int ip_hdr_len, int l4_proto)
+{
+	u32 command;
+
+	/* fields: L3_offset, IP_hdrlen, L3_type, G_IPv4_chk,
+	 * G_L4_chk, L4_type required only for checksum calculation
+	 */
+	command = (l3_offs << MVPP2_TXD_L3_OFF_SHIFT);
+	command |= (ip_hdr_len << MVPP2_TXD_IP_HLEN_SHIFT);
+	command |= MVPP2_TXD_IP_CSUM_DISABLE;
+
+	if (l3_proto == swab16(ETH_P_IP)) {
+		command &= ~MVPP2_TXD_IP_CSUM_DISABLE;	/* enable IPv4 csum */
+		command &= ~MVPP2_TXD_L3_IP6;		/* enable IPv4 */
+	} else {
+		command |= MVPP2_TXD_L3_IP6;		/* enable IPv6 */
+	}
+
+	if (l4_proto == IPPROTO_TCP) {
+		command &= ~MVPP2_TXD_L4_UDP;		/* enable TCP */
+		command &= ~MVPP2_TXD_L4_CSUM_FRAG;	/* generate L4 csum */
+	} else if (l4_proto == IPPROTO_UDP) {
+		command |= MVPP2_TXD_L4_UDP;		/* enable UDP */
+		command &= ~MVPP2_TXD_L4_CSUM_FRAG;	/* generate L4 csum */
+	} else {
+		command |= MVPP2_TXD_L4_CSUM_NOT;
+	}
+
+	return command;
+}
+
+/* Get number of sent descriptors and decrement counter.
+ * The number of sent descriptors is returned.
+ * Per-CPU access
+ *
+ * Called only from mvpp2_txq_done(), called from mvpp2_tx()
+ * (migration disabled) and from the TX completion tasklet (migration
+ * disabled) so using smp_processor_id() is OK.
+ */
+static inline int mvpp2_txq_sent_desc_proc(struct mvpp2_port *port,
+					   struct mvpp2_tx_queue *txq)
+{
+	u32 val;
+
+	/* Reading status reg resets transmitted descriptor counter */
+	val = mvpp2_percpu_read_relaxed(port->priv, smp_processor_id(),
+					MVPP2_TXQ_SENT_REG(txq->id));
+
+	return (val & MVPP2_TRANSMITTED_COUNT_MASK) >>
+		MVPP2_TRANSMITTED_COUNT_OFFSET;
+}
+
+/* Called through on_each_cpu(), so runs on all CPUs, with migration
+ * disabled, therefore using smp_processor_id() is OK.
+ */
+static void mvpp2_txq_sent_counter_clear(void *arg)
+{
+	struct mvpp2_port *port = arg;
+	int queue;
+
+	for (queue = 0; queue < port->ntxqs; queue++) {
+		int id = port->txqs[queue]->id;
+
+		mvpp2_percpu_read(port->priv, smp_processor_id(),
+				  MVPP2_TXQ_SENT_REG(id));
+	}
+}
+
+/* Set max sizes for Tx queues */
+static void mvpp2_txp_max_tx_size_set(struct mvpp2_port *port)
+{
+	u32	val, size, mtu;
+	int	txq, tx_port_num;
+
+	mtu = port->pkt_size * 8;
+	if (mtu > MVPP2_TXP_MTU_MAX)
+		mtu = MVPP2_TXP_MTU_MAX;
+
+	/* WA for wrong Token bucket update: Set MTU value = 3*real MTU value */
+	mtu = 3 * mtu;
+
+	/* Indirect access to registers */
+	tx_port_num = mvpp2_egress_port(port);
+	mvpp2_write(port->priv, MVPP2_TXP_SCHED_PORT_INDEX_REG, tx_port_num);
+
+	/* Set MTU */
+	val = mvpp2_read(port->priv, MVPP2_TXP_SCHED_MTU_REG);
+	val &= ~MVPP2_TXP_MTU_MAX;
+	val |= mtu;
+	mvpp2_write(port->priv, MVPP2_TXP_SCHED_MTU_REG, val);
+
+	/* TXP token size and all TXQs token size must be larger that MTU */
+	val = mvpp2_read(port->priv, MVPP2_TXP_SCHED_TOKEN_SIZE_REG);
+	size = val & MVPP2_TXP_TOKEN_SIZE_MAX;
+	if (size < mtu) {
+		size = mtu;
+		val &= ~MVPP2_TXP_TOKEN_SIZE_MAX;
+		val |= size;
+		mvpp2_write(port->priv, MVPP2_TXP_SCHED_TOKEN_SIZE_REG, val);
+	}
+
+	for (txq = 0; txq < port->ntxqs; txq++) {
+		val = mvpp2_read(port->priv,
+				 MVPP2_TXQ_SCHED_TOKEN_SIZE_REG(txq));
+		size = val & MVPP2_TXQ_TOKEN_SIZE_MAX;
+
+		if (size < mtu) {
+			size = mtu;
+			val &= ~MVPP2_TXQ_TOKEN_SIZE_MAX;
+			val |= size;
+			mvpp2_write(port->priv,
+				    MVPP2_TXQ_SCHED_TOKEN_SIZE_REG(txq),
+				    val);
+		}
+	}
+}
+
+/* Set the number of packets that will be received before Rx interrupt
+ * will be generated by HW.
+ */
+static void mvpp2_rx_pkts_coal_set(struct mvpp2_port *port,
+				   struct mvpp2_rx_queue *rxq)
+{
+	int cpu = get_cpu();
+
+	if (rxq->pkts_coal > MVPP2_OCCUPIED_THRESH_MASK)
+		rxq->pkts_coal = MVPP2_OCCUPIED_THRESH_MASK;
+
+	mvpp2_percpu_write(port->priv, cpu, MVPP2_RXQ_NUM_REG, rxq->id);
+	mvpp2_percpu_write(port->priv, cpu, MVPP2_RXQ_THRESH_REG,
+			   rxq->pkts_coal);
+
+	put_cpu();
+}
+
+/* For some reason in the LSP this is done on each CPU. Why ? */
+static void mvpp2_tx_pkts_coal_set(struct mvpp2_port *port,
+				   struct mvpp2_tx_queue *txq)
+{
+	int cpu = get_cpu();
+	u32 val;
+
+	if (txq->done_pkts_coal > MVPP2_TXQ_THRESH_MASK)
+		txq->done_pkts_coal = MVPP2_TXQ_THRESH_MASK;
+
+	val = (txq->done_pkts_coal << MVPP2_TXQ_THRESH_OFFSET);
+	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_NUM_REG, txq->id);
+	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_THRESH_REG, val);
+
+	put_cpu();
+}
+
+static u32 mvpp2_usec_to_cycles(u32 usec, unsigned long clk_hz)
+{
+	u64 tmp = (u64)clk_hz * usec;
+
+	do_div(tmp, USEC_PER_SEC);
+
+	return tmp > U32_MAX ? U32_MAX : tmp;
+}
+
+static u32 mvpp2_cycles_to_usec(u32 cycles, unsigned long clk_hz)
+{
+	u64 tmp = (u64)cycles * USEC_PER_SEC;
+
+	do_div(tmp, clk_hz);
+
+	return tmp > U32_MAX ? U32_MAX : tmp;
+}
+
+/* Set the time delay in usec before Rx interrupt */
+static void mvpp2_rx_time_coal_set(struct mvpp2_port *port,
+				   struct mvpp2_rx_queue *rxq)
+{
+	unsigned long freq = port->priv->tclk;
+	u32 val = mvpp2_usec_to_cycles(rxq->time_coal, freq);
+
+	if (val > MVPP2_MAX_ISR_RX_THRESHOLD) {
+		rxq->time_coal =
+			mvpp2_cycles_to_usec(MVPP2_MAX_ISR_RX_THRESHOLD, freq);
+
+		/* re-evaluate to get actual register value */
+		val = mvpp2_usec_to_cycles(rxq->time_coal, freq);
+	}
+
+	mvpp2_write(port->priv, MVPP2_ISR_RX_THRESHOLD_REG(rxq->id), val);
+}
+
+static void mvpp2_tx_time_coal_set(struct mvpp2_port *port)
+{
+	unsigned long freq = port->priv->tclk;
+	u32 val = mvpp2_usec_to_cycles(port->tx_time_coal, freq);
+
+	if (val > MVPP2_MAX_ISR_TX_THRESHOLD) {
+		port->tx_time_coal =
+			mvpp2_cycles_to_usec(MVPP2_MAX_ISR_TX_THRESHOLD, freq);
+
+		/* re-evaluate to get actual register value */
+		val = mvpp2_usec_to_cycles(port->tx_time_coal, freq);
+	}
+
+	mvpp2_write(port->priv, MVPP2_ISR_TX_THRESHOLD_REG(port->id), val);
+}
+
+/* Free Tx queue skbuffs */
+static void mvpp2_txq_bufs_free(struct mvpp2_port *port,
+				struct mvpp2_tx_queue *txq,
+				struct mvpp2_txq_pcpu *txq_pcpu, int num)
+{
+	int i;
+
+	for (i = 0; i < num; i++) {
+		struct mvpp2_txq_pcpu_buf *tx_buf =
+			txq_pcpu->buffs + txq_pcpu->txq_get_index;
+
+		if (!IS_TSO_HEADER(txq_pcpu, tx_buf->dma))
+			dma_unmap_single(port->dev->dev.parent, tx_buf->dma,
+					 tx_buf->size, DMA_TO_DEVICE);
+		if (tx_buf->skb)
+			dev_kfree_skb_any(tx_buf->skb);
+
+		mvpp2_txq_inc_get(txq_pcpu);
+	}
+}
+
+static inline struct mvpp2_rx_queue *mvpp2_get_rx_queue(struct mvpp2_port *port,
+							u32 cause)
+{
+	int queue = fls(cause) - 1;
+
+	return port->rxqs[queue];
+}
+
+static inline struct mvpp2_tx_queue *mvpp2_get_tx_queue(struct mvpp2_port *port,
+							u32 cause)
+{
+	int queue = fls(cause) - 1;
+
+	return port->txqs[queue];
+}
+
+/* Handle end of transmission */
+static void mvpp2_txq_done(struct mvpp2_port *port, struct mvpp2_tx_queue *txq,
+			   struct mvpp2_txq_pcpu *txq_pcpu)
+{
+	struct netdev_queue *nq = netdev_get_tx_queue(port->dev, txq->log_id);
+	int tx_done;
+
+	if (txq_pcpu->cpu != smp_processor_id())
+		netdev_err(port->dev, "wrong cpu on the end of Tx processing\n");
+
+	tx_done = mvpp2_txq_sent_desc_proc(port, txq);
+	if (!tx_done)
+		return;
+	mvpp2_txq_bufs_free(port, txq, txq_pcpu, tx_done);
+
+	txq_pcpu->count -= tx_done;
+
+	if (netif_tx_queue_stopped(nq))
+		if (txq_pcpu->count <= txq_pcpu->wake_threshold)
+			netif_tx_wake_queue(nq);
+}
+
+static unsigned int mvpp2_tx_done(struct mvpp2_port *port, u32 cause,
+				  int cpu)
+{
+	struct mvpp2_tx_queue *txq;
+	struct mvpp2_txq_pcpu *txq_pcpu;
+	unsigned int tx_todo = 0;
+
+	while (cause) {
+		txq = mvpp2_get_tx_queue(port, cause);
+		if (!txq)
+			break;
+
+		txq_pcpu = per_cpu_ptr(txq->pcpu, cpu);
+
+		if (txq_pcpu->count) {
+			mvpp2_txq_done(port, txq, txq_pcpu);
+			tx_todo += txq_pcpu->count;
+		}
+
+		cause &= ~(1 << txq->log_id);
+	}
+	return tx_todo;
+}
+
+/* Rx/Tx queue initialization/cleanup methods */
+
+/* Allocate and initialize descriptors for aggr TXQ */
+static int mvpp2_aggr_txq_init(struct platform_device *pdev,
+			       struct mvpp2_tx_queue *aggr_txq, int cpu,
+			       struct mvpp2 *priv)
+{
+	u32 txq_dma;
+
+	/* Allocate memory for TX descriptors */
+	aggr_txq->descs = dma_zalloc_coherent(&pdev->dev,
+				MVPP2_AGGR_TXQ_SIZE * MVPP2_DESC_ALIGNED_SIZE,
+				&aggr_txq->descs_dma, GFP_KERNEL);
+	if (!aggr_txq->descs)
+		return -ENOMEM;
+
+	aggr_txq->last_desc = MVPP2_AGGR_TXQ_SIZE - 1;
+
+	/* Aggr TXQ no reset WA */
+	aggr_txq->next_desc_to_proc = mvpp2_read(priv,
+						 MVPP2_AGGR_TXQ_INDEX_REG(cpu));
+
+	/* Set Tx descriptors queue starting address indirect
+	 * access
+	 */
+	if (priv->hw_version == MVPP21)
+		txq_dma = aggr_txq->descs_dma;
+	else
+		txq_dma = aggr_txq->descs_dma >>
+			MVPP22_AGGR_TXQ_DESC_ADDR_OFFS;
+
+	mvpp2_write(priv, MVPP2_AGGR_TXQ_DESC_ADDR_REG(cpu), txq_dma);
+	mvpp2_write(priv, MVPP2_AGGR_TXQ_DESC_SIZE_REG(cpu),
+		    MVPP2_AGGR_TXQ_SIZE);
+
+	return 0;
+}
+
+/* Create a specified Rx queue */
+static int mvpp2_rxq_init(struct mvpp2_port *port,
+			  struct mvpp2_rx_queue *rxq)
+
+{
+	u32 rxq_dma;
+	int cpu;
+
+	rxq->size = port->rx_ring_size;
+
+	/* Allocate memory for RX descriptors */
+	rxq->descs = dma_alloc_coherent(port->dev->dev.parent,
+					rxq->size * MVPP2_DESC_ALIGNED_SIZE,
+					&rxq->descs_dma, GFP_KERNEL);
+	if (!rxq->descs)
+		return -ENOMEM;
+
+	rxq->last_desc = rxq->size - 1;
+
+	/* Zero occupied and non-occupied counters - direct access */
+	mvpp2_write(port->priv, MVPP2_RXQ_STATUS_REG(rxq->id), 0);
+
+	/* Set Rx descriptors queue starting address - indirect access */
+	cpu = get_cpu();
+	mvpp2_percpu_write(port->priv, cpu, MVPP2_RXQ_NUM_REG, rxq->id);
+	if (port->priv->hw_version == MVPP21)
+		rxq_dma = rxq->descs_dma;
+	else
+		rxq_dma = rxq->descs_dma >> MVPP22_DESC_ADDR_OFFS;
+	mvpp2_percpu_write(port->priv, cpu, MVPP2_RXQ_DESC_ADDR_REG, rxq_dma);
+	mvpp2_percpu_write(port->priv, cpu, MVPP2_RXQ_DESC_SIZE_REG, rxq->size);
+	mvpp2_percpu_write(port->priv, cpu, MVPP2_RXQ_INDEX_REG, 0);
+	put_cpu();
+
+	/* Set Offset */
+	mvpp2_rxq_offset_set(port, rxq->id, NET_SKB_PAD);
+
+	/* Set coalescing pkts and time */
+	mvpp2_rx_pkts_coal_set(port, rxq);
+	mvpp2_rx_time_coal_set(port, rxq);
+
+	/* Add number of descriptors ready for receiving packets */
+	mvpp2_rxq_status_update(port, rxq->id, 0, rxq->size);
+
+	return 0;
+}
+
+/* Push packets received by the RXQ to BM pool */
+static void mvpp2_rxq_drop_pkts(struct mvpp2_port *port,
+				struct mvpp2_rx_queue *rxq)
+{
+	int rx_received, i;
+
+	rx_received = mvpp2_rxq_received(port, rxq->id);
+	if (!rx_received)
+		return;
+
+	for (i = 0; i < rx_received; i++) {
+		struct mvpp2_rx_desc *rx_desc = mvpp2_rxq_next_desc_get(rxq);
+		u32 status = mvpp2_rxdesc_status_get(port, rx_desc);
+		int pool;
+
+		pool = (status & MVPP2_RXD_BM_POOL_ID_MASK) >>
+			MVPP2_RXD_BM_POOL_ID_OFFS;
+
+		mvpp2_bm_pool_put(port, pool,
+				  mvpp2_rxdesc_dma_addr_get(port, rx_desc),
+				  mvpp2_rxdesc_cookie_get(port, rx_desc));
+	}
+	mvpp2_rxq_status_update(port, rxq->id, rx_received, rx_received);
+}
+
+/* Cleanup Rx queue */
+static void mvpp2_rxq_deinit(struct mvpp2_port *port,
+			     struct mvpp2_rx_queue *rxq)
+{
+	int cpu;
+
+	mvpp2_rxq_drop_pkts(port, rxq);
+
+	if (rxq->descs)
+		dma_free_coherent(port->dev->dev.parent,
+				  rxq->size * MVPP2_DESC_ALIGNED_SIZE,
+				  rxq->descs,
+				  rxq->descs_dma);
+
+	rxq->descs             = NULL;
+	rxq->last_desc         = 0;
+	rxq->next_desc_to_proc = 0;
+	rxq->descs_dma         = 0;
+
+	/* Clear Rx descriptors queue starting address and size;
+	 * free descriptor number
+	 */
+	mvpp2_write(port->priv, MVPP2_RXQ_STATUS_REG(rxq->id), 0);
+	cpu = get_cpu();
+	mvpp2_percpu_write(port->priv, cpu, MVPP2_RXQ_NUM_REG, rxq->id);
+	mvpp2_percpu_write(port->priv, cpu, MVPP2_RXQ_DESC_ADDR_REG, 0);
+	mvpp2_percpu_write(port->priv, cpu, MVPP2_RXQ_DESC_SIZE_REG, 0);
+	put_cpu();
+}
+
+/* Create and initialize a Tx queue */
+static int mvpp2_txq_init(struct mvpp2_port *port,
+			  struct mvpp2_tx_queue *txq)
+{
+	u32 val;
+	int cpu, desc, desc_per_txq, tx_port_num;
+	struct mvpp2_txq_pcpu *txq_pcpu;
+
+	txq->size = port->tx_ring_size;
+
+	/* Allocate memory for Tx descriptors */
+	txq->descs = dma_alloc_coherent(port->dev->dev.parent,
+				txq->size * MVPP2_DESC_ALIGNED_SIZE,
+				&txq->descs_dma, GFP_KERNEL);
+	if (!txq->descs)
+		return -ENOMEM;
+
+	txq->last_desc = txq->size - 1;
+
+	/* Set Tx descriptors queue starting address - indirect access */
+	cpu = get_cpu();
+	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_NUM_REG, txq->id);
+	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_DESC_ADDR_REG,
+			   txq->descs_dma);
+	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_DESC_SIZE_REG,
+			   txq->size & MVPP2_TXQ_DESC_SIZE_MASK);
+	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_INDEX_REG, 0);
+	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_RSVD_CLR_REG,
+			   txq->id << MVPP2_TXQ_RSVD_CLR_OFFSET);
+	val = mvpp2_percpu_read(port->priv, cpu, MVPP2_TXQ_PENDING_REG);
+	val &= ~MVPP2_TXQ_PENDING_MASK;
+	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_PENDING_REG, val);
+
+	/* Calculate base address in prefetch buffer. We reserve 16 descriptors
+	 * for each existing TXQ.
+	 * TCONTS for PON port must be continuous from 0 to MVPP2_MAX_TCONT
+	 * GBE ports assumed to be continuous from 0 to MVPP2_MAX_PORTS
+	 */
+	desc_per_txq = 16;
+	desc = (port->id * MVPP2_MAX_TXQ * desc_per_txq) +
+	       (txq->log_id * desc_per_txq);
+
+	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_PREF_BUF_REG,
+			   MVPP2_PREF_BUF_PTR(desc) | MVPP2_PREF_BUF_SIZE_16 |
+			   MVPP2_PREF_BUF_THRESH(desc_per_txq / 2));
+	put_cpu();
+
+	/* WRR / EJP configuration - indirect access */
+	tx_port_num = mvpp2_egress_port(port);
+	mvpp2_write(port->priv, MVPP2_TXP_SCHED_PORT_INDEX_REG, tx_port_num);
+
+	val = mvpp2_read(port->priv, MVPP2_TXQ_SCHED_REFILL_REG(txq->log_id));
+	val &= ~MVPP2_TXQ_REFILL_PERIOD_ALL_MASK;
+	val |= MVPP2_TXQ_REFILL_PERIOD_MASK(1);
+	val |= MVPP2_TXQ_REFILL_TOKENS_ALL_MASK;
+	mvpp2_write(port->priv, MVPP2_TXQ_SCHED_REFILL_REG(txq->log_id), val);
+
+	val = MVPP2_TXQ_TOKEN_SIZE_MAX;
+	mvpp2_write(port->priv, MVPP2_TXQ_SCHED_TOKEN_SIZE_REG(txq->log_id),
+		    val);
+
+	for_each_present_cpu(cpu) {
+		txq_pcpu = per_cpu_ptr(txq->pcpu, cpu);
+		txq_pcpu->size = txq->size;
+		txq_pcpu->buffs = kmalloc_array(txq_pcpu->size,
+						sizeof(*txq_pcpu->buffs),
+						GFP_KERNEL);
+		if (!txq_pcpu->buffs)
+			return -ENOMEM;
+
+		txq_pcpu->count = 0;
+		txq_pcpu->reserved_num = 0;
+		txq_pcpu->txq_put_index = 0;
+		txq_pcpu->txq_get_index = 0;
+		txq_pcpu->tso_headers = NULL;
+
+		txq_pcpu->stop_threshold = txq->size - MVPP2_MAX_SKB_DESCS;
+		txq_pcpu->wake_threshold = txq_pcpu->stop_threshold / 2;
+
+		txq_pcpu->tso_headers =
+			dma_alloc_coherent(port->dev->dev.parent,
+					   txq_pcpu->size * TSO_HEADER_SIZE,
+					   &txq_pcpu->tso_headers_dma,
+					   GFP_KERNEL);
+		if (!txq_pcpu->tso_headers)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/* Free allocated TXQ resources */
+static void mvpp2_txq_deinit(struct mvpp2_port *port,
+			     struct mvpp2_tx_queue *txq)
+{
+	struct mvpp2_txq_pcpu *txq_pcpu;
+	int cpu;
+
+	for_each_present_cpu(cpu) {
+		txq_pcpu = per_cpu_ptr(txq->pcpu, cpu);
+		kfree(txq_pcpu->buffs);
+
+		if (txq_pcpu->tso_headers)
+			dma_free_coherent(port->dev->dev.parent,
+					  txq_pcpu->size * TSO_HEADER_SIZE,
+					  txq_pcpu->tso_headers,
+					  txq_pcpu->tso_headers_dma);
+
+		txq_pcpu->tso_headers = NULL;
+	}
+
+	if (txq->descs)
+		dma_free_coherent(port->dev->dev.parent,
+				  txq->size * MVPP2_DESC_ALIGNED_SIZE,
+				  txq->descs, txq->descs_dma);
+
+	txq->descs             = NULL;
+	txq->last_desc         = 0;
+	txq->next_desc_to_proc = 0;
+	txq->descs_dma         = 0;
+
+	/* Set minimum bandwidth for disabled TXQs */
+	mvpp2_write(port->priv, MVPP2_TXQ_SCHED_TOKEN_CNTR_REG(txq->id), 0);
+
+	/* Set Tx descriptors queue starting address and size */
+	cpu = get_cpu();
+	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_NUM_REG, txq->id);
+	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_DESC_ADDR_REG, 0);
+	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_DESC_SIZE_REG, 0);
+	put_cpu();
+}
+
+/* Cleanup Tx ports */
+static void mvpp2_txq_clean(struct mvpp2_port *port, struct mvpp2_tx_queue *txq)
+{
+	struct mvpp2_txq_pcpu *txq_pcpu;
+	int delay, pending, cpu;
+	u32 val;
+
+	cpu = get_cpu();
+	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_NUM_REG, txq->id);
+	val = mvpp2_percpu_read(port->priv, cpu, MVPP2_TXQ_PREF_BUF_REG);
+	val |= MVPP2_TXQ_DRAIN_EN_MASK;
+	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_PREF_BUF_REG, val);
+
+	/* The napi queue has been stopped so wait for all packets
+	 * to be transmitted.
+	 */
+	delay = 0;
+	do {
+		if (delay >= MVPP2_TX_PENDING_TIMEOUT_MSEC) {
+			netdev_warn(port->dev,
+				    "port %d: cleaning queue %d timed out\n",
+				    port->id, txq->log_id);
+			break;
+		}
+		mdelay(1);
+		delay++;
+
+		pending = mvpp2_percpu_read(port->priv, cpu,
+					    MVPP2_TXQ_PENDING_REG);
+		pending &= MVPP2_TXQ_PENDING_MASK;
+	} while (pending);
+
+	val &= ~MVPP2_TXQ_DRAIN_EN_MASK;
+	mvpp2_percpu_write(port->priv, cpu, MVPP2_TXQ_PREF_BUF_REG, val);
+	put_cpu();
+
+	for_each_present_cpu(cpu) {
+		txq_pcpu = per_cpu_ptr(txq->pcpu, cpu);
+
+		/* Release all packets */
+		mvpp2_txq_bufs_free(port, txq, txq_pcpu, txq_pcpu->count);
+
+		/* Reset queue */
+		txq_pcpu->count = 0;
+		txq_pcpu->txq_put_index = 0;
+		txq_pcpu->txq_get_index = 0;
+	}
+}
+
+/* Cleanup all Tx queues */
+static void mvpp2_cleanup_txqs(struct mvpp2_port *port)
+{
+	struct mvpp2_tx_queue *txq;
+	int queue;
+	u32 val;
+
+	val = mvpp2_read(port->priv, MVPP2_TX_PORT_FLUSH_REG);
+
+	/* Reset Tx ports and delete Tx queues */
+	val |= MVPP2_TX_PORT_FLUSH_MASK(port->id);
+	mvpp2_write(port->priv, MVPP2_TX_PORT_FLUSH_REG, val);
+
+	for (queue = 0; queue < port->ntxqs; queue++) {
+		txq = port->txqs[queue];
+		mvpp2_txq_clean(port, txq);
+		mvpp2_txq_deinit(port, txq);
+	}
+
+	on_each_cpu(mvpp2_txq_sent_counter_clear, port, 1);
+
+	val &= ~MVPP2_TX_PORT_FLUSH_MASK(port->id);
+	mvpp2_write(port->priv, MVPP2_TX_PORT_FLUSH_REG, val);
+}
+
+/* Cleanup all Rx queues */
+static void mvpp2_cleanup_rxqs(struct mvpp2_port *port)
+{
+	int queue;
+
+	for (queue = 0; queue < port->nrxqs; queue++)
+		mvpp2_rxq_deinit(port, port->rxqs[queue]);
+}
+
+/* Init all Rx queues for port */
+static int mvpp2_setup_rxqs(struct mvpp2_port *port)
+{
+	int queue, err;
+
+	for (queue = 0; queue < port->nrxqs; queue++) {
+		err = mvpp2_rxq_init(port, port->rxqs[queue]);
+		if (err)
+			goto err_cleanup;
+	}
+	return 0;
+
+err_cleanup:
+	mvpp2_cleanup_rxqs(port);
+	return err;
+}
+
+/* Init all tx queues for port */
+static int mvpp2_setup_txqs(struct mvpp2_port *port)
+{
+	struct mvpp2_tx_queue *txq;
+	int queue, err;
+
+	for (queue = 0; queue < port->ntxqs; queue++) {
+		txq = port->txqs[queue];
+		err = mvpp2_txq_init(port, txq);
+		if (err)
+			goto err_cleanup;
+	}
+
+	if (port->has_tx_irqs) {
+		mvpp2_tx_time_coal_set(port);
+		for (queue = 0; queue < port->ntxqs; queue++) {
+			txq = port->txqs[queue];
+			mvpp2_tx_pkts_coal_set(port, txq);
+		}
+	}
+
+	on_each_cpu(mvpp2_txq_sent_counter_clear, port, 1);
+	return 0;
+
+err_cleanup:
+	mvpp2_cleanup_txqs(port);
+	return err;
+}
+
+/* The callback for per-port interrupt */
+static irqreturn_t mvpp2_isr(int irq, void *dev_id)
+{
+	struct mvpp2_queue_vector *qv = dev_id;
+
+	mvpp2_qvec_interrupt_disable(qv);
+
+	napi_schedule(&qv->napi);
+
+	return IRQ_HANDLED;
+}
+
+/* Per-port interrupt for link status changes */
+static irqreturn_t mvpp2_link_status_isr(int irq, void *dev_id)
+{
+	struct mvpp2_port *port = (struct mvpp2_port *)dev_id;
+	struct net_device *dev = port->dev;
+	bool event = false, link = false;
+	u32 val;
+
+	mvpp22_gop_mask_irq(port);
+
+	if (port->gop_id == 0 &&
+	    port->phy_interface == PHY_INTERFACE_MODE_10GKR) {
+		val = readl(port->base + MVPP22_XLG_INT_STAT);
+		if (val & MVPP22_XLG_INT_STAT_LINK) {
+			event = true;
+			val = readl(port->base + MVPP22_XLG_STATUS);
+			if (val & MVPP22_XLG_STATUS_LINK_UP)
+				link = true;
+		}
+	} else if (phy_interface_mode_is_rgmii(port->phy_interface) ||
+		   port->phy_interface == PHY_INTERFACE_MODE_SGMII ||
+		   port->phy_interface == PHY_INTERFACE_MODE_1000BASEX ||
+		   port->phy_interface == PHY_INTERFACE_MODE_2500BASEX) {
+		val = readl(port->base + MVPP22_GMAC_INT_STAT);
+		if (val & MVPP22_GMAC_INT_STAT_LINK) {
+			event = true;
+			val = readl(port->base + MVPP2_GMAC_STATUS0);
+			if (val & MVPP2_GMAC_STATUS0_LINK_UP)
+				link = true;
+		}
+	}
+
+	if (port->phylink) {
+		phylink_mac_change(port->phylink, link);
+		goto handled;
+	}
+
+	if (!netif_running(dev) || !event)
+		goto handled;
+
+	if (link) {
+		mvpp2_interrupts_enable(port);
+
+		mvpp2_egress_enable(port);
+		mvpp2_ingress_enable(port);
+		netif_carrier_on(dev);
+		netif_tx_wake_all_queues(dev);
+	} else {
+		netif_tx_stop_all_queues(dev);
+		netif_carrier_off(dev);
+		mvpp2_ingress_disable(port);
+		mvpp2_egress_disable(port);
+
+		mvpp2_interrupts_disable(port);
+	}
+
+handled:
+	mvpp22_gop_unmask_irq(port);
+	return IRQ_HANDLED;
+}
+
+static void mvpp2_timer_set(struct mvpp2_port_pcpu *port_pcpu)
+{
+	ktime_t interval;
+
+	if (!port_pcpu->timer_scheduled) {
+		port_pcpu->timer_scheduled = true;
+		interval = MVPP2_TXDONE_HRTIMER_PERIOD_NS;
+		hrtimer_start(&port_pcpu->tx_done_timer, interval,
+			      HRTIMER_MODE_REL_PINNED);
+	}
+}
+
+static void mvpp2_tx_proc_cb(unsigned long data)
+{
+	struct net_device *dev = (struct net_device *)data;
+	struct mvpp2_port *port = netdev_priv(dev);
+	struct mvpp2_port_pcpu *port_pcpu = this_cpu_ptr(port->pcpu);
+	unsigned int tx_todo, cause;
+
+	if (!netif_running(dev))
+		return;
+	port_pcpu->timer_scheduled = false;
+
+	/* Process all the Tx queues */
+	cause = (1 << port->ntxqs) - 1;
+	tx_todo = mvpp2_tx_done(port, cause, smp_processor_id());
+
+	/* Set the timer in case not all the packets were processed */
+	if (tx_todo)
+		mvpp2_timer_set(port_pcpu);
+}
+
+static enum hrtimer_restart mvpp2_hr_timer_cb(struct hrtimer *timer)
+{
+	struct mvpp2_port_pcpu *port_pcpu = container_of(timer,
+							 struct mvpp2_port_pcpu,
+							 tx_done_timer);
+
+	tasklet_schedule(&port_pcpu->tx_done_tasklet);
+
+	return HRTIMER_NORESTART;
+}
+
+/* Main RX/TX processing routines */
+
+/* Display more error info */
+static void mvpp2_rx_error(struct mvpp2_port *port,
+			   struct mvpp2_rx_desc *rx_desc)
+{
+	u32 status = mvpp2_rxdesc_status_get(port, rx_desc);
+	size_t sz = mvpp2_rxdesc_size_get(port, rx_desc);
+	char *err_str = NULL;
+
+	switch (status & MVPP2_RXD_ERR_CODE_MASK) {
+	case MVPP2_RXD_ERR_CRC:
+		err_str = "crc";
+		break;
+	case MVPP2_RXD_ERR_OVERRUN:
+		err_str = "overrun";
+		break;
+	case MVPP2_RXD_ERR_RESOURCE:
+		err_str = "resource";
+		break;
+	}
+	if (err_str && net_ratelimit())
+		netdev_err(port->dev,
+			   "bad rx status %08x (%s error), size=%zu\n",
+			   status, err_str, sz);
+}
+
+/* Handle RX checksum offload */
+static void mvpp2_rx_csum(struct mvpp2_port *port, u32 status,
+			  struct sk_buff *skb)
+{
+	if (((status & MVPP2_RXD_L3_IP4) &&
+	     !(status & MVPP2_RXD_IP4_HEADER_ERR)) ||
+	    (status & MVPP2_RXD_L3_IP6))
+		if (((status & MVPP2_RXD_L4_UDP) ||
+		     (status & MVPP2_RXD_L4_TCP)) &&
+		     (status & MVPP2_RXD_L4_CSUM_OK)) {
+			skb->csum = 0;
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+			return;
+		}
+
+	skb->ip_summed = CHECKSUM_NONE;
+}
+
+/* Reuse skb if possible, or allocate a new skb and add it to BM pool */
+static int mvpp2_rx_refill(struct mvpp2_port *port,
+			   struct mvpp2_bm_pool *bm_pool, int pool)
+{
+	dma_addr_t dma_addr;
+	phys_addr_t phys_addr;
+	void *buf;
+
+	/* No recycle or too many buffers are in use, so allocate a new skb */
+	buf = mvpp2_buf_alloc(port, bm_pool, &dma_addr, &phys_addr,
+			      GFP_ATOMIC);
+	if (!buf)
+		return -ENOMEM;
+
+	mvpp2_bm_pool_put(port, pool, dma_addr, phys_addr);
+
+	return 0;
+}
+
+/* Handle tx checksum */
+static u32 mvpp2_skb_tx_csum(struct mvpp2_port *port, struct sk_buff *skb)
+{
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		int ip_hdr_len = 0;
+		u8 l4_proto;
+
+		if (skb->protocol == htons(ETH_P_IP)) {
+			struct iphdr *ip4h = ip_hdr(skb);
+
+			/* Calculate IPv4 checksum and L4 checksum */
+			ip_hdr_len = ip4h->ihl;
+			l4_proto = ip4h->protocol;
+		} else if (skb->protocol == htons(ETH_P_IPV6)) {
+			struct ipv6hdr *ip6h = ipv6_hdr(skb);
+
+			/* Read l4_protocol from one of IPv6 extra headers */
+			if (skb_network_header_len(skb) > 0)
+				ip_hdr_len = (skb_network_header_len(skb) >> 2);
+			l4_proto = ip6h->nexthdr;
+		} else {
+			return MVPP2_TXD_L4_CSUM_NOT;
+		}
+
+		return mvpp2_txq_desc_csum(skb_network_offset(skb),
+				skb->protocol, ip_hdr_len, l4_proto);
+	}
+
+	return MVPP2_TXD_L4_CSUM_NOT | MVPP2_TXD_IP_CSUM_DISABLE;
+}
+
+/* Main rx processing */
+static int mvpp2_rx(struct mvpp2_port *port, struct napi_struct *napi,
+		    int rx_todo, struct mvpp2_rx_queue *rxq)
+{
+	struct net_device *dev = port->dev;
+	int rx_received;
+	int rx_done = 0;
+	u32 rcvd_pkts = 0;
+	u32 rcvd_bytes = 0;
+
+	/* Get number of received packets and clamp the to-do */
+	rx_received = mvpp2_rxq_received(port, rxq->id);
+	if (rx_todo > rx_received)
+		rx_todo = rx_received;
+
+	while (rx_done < rx_todo) {
+		struct mvpp2_rx_desc *rx_desc = mvpp2_rxq_next_desc_get(rxq);
+		struct mvpp2_bm_pool *bm_pool;
+		struct sk_buff *skb;
+		unsigned int frag_size;
+		dma_addr_t dma_addr;
+		phys_addr_t phys_addr;
+		u32 rx_status;
+		int pool, rx_bytes, err;
+		void *data;
+
+		rx_done++;
+		rx_status = mvpp2_rxdesc_status_get(port, rx_desc);
+		rx_bytes = mvpp2_rxdesc_size_get(port, rx_desc);
+		rx_bytes -= MVPP2_MH_SIZE;
+		dma_addr = mvpp2_rxdesc_dma_addr_get(port, rx_desc);
+		phys_addr = mvpp2_rxdesc_cookie_get(port, rx_desc);
+		data = (void *)phys_to_virt(phys_addr);
+
+		pool = (rx_status & MVPP2_RXD_BM_POOL_ID_MASK) >>
+			MVPP2_RXD_BM_POOL_ID_OFFS;
+		bm_pool = &port->priv->bm_pools[pool];
+
+		/* In case of an error, release the requested buffer pointer
+		 * to the Buffer Manager. This request process is controlled
+		 * by the hardware, and the information about the buffer is
+		 * comprised by the RX descriptor.
+		 */
+		if (rx_status & MVPP2_RXD_ERR_SUMMARY) {
+err_drop_frame:
+			dev->stats.rx_errors++;
+			mvpp2_rx_error(port, rx_desc);
+			/* Return the buffer to the pool */
+			mvpp2_bm_pool_put(port, pool, dma_addr, phys_addr);
+			continue;
+		}
+
+		if (bm_pool->frag_size > PAGE_SIZE)
+			frag_size = 0;
+		else
+			frag_size = bm_pool->frag_size;
+
+		skb = build_skb(data, frag_size);
+		if (!skb) {
+			netdev_warn(port->dev, "skb build failed\n");
+			goto err_drop_frame;
+		}
+
+		err = mvpp2_rx_refill(port, bm_pool, pool);
+		if (err) {
+			netdev_err(port->dev, "failed to refill BM pools\n");
+			goto err_drop_frame;
+		}
+
+		dma_unmap_single(dev->dev.parent, dma_addr,
+				 bm_pool->buf_size, DMA_FROM_DEVICE);
+
+		rcvd_pkts++;
+		rcvd_bytes += rx_bytes;
+
+		skb_reserve(skb, MVPP2_MH_SIZE + NET_SKB_PAD);
+		skb_put(skb, rx_bytes);
+		skb->protocol = eth_type_trans(skb, dev);
+		mvpp2_rx_csum(port, rx_status, skb);
+
+		napi_gro_receive(napi, skb);
+	}
+
+	if (rcvd_pkts) {
+		struct mvpp2_pcpu_stats *stats = this_cpu_ptr(port->stats);
+
+		u64_stats_update_begin(&stats->syncp);
+		stats->rx_packets += rcvd_pkts;
+		stats->rx_bytes   += rcvd_bytes;
+		u64_stats_update_end(&stats->syncp);
+	}
+
+	/* Update Rx queue management counters */
+	wmb();
+	mvpp2_rxq_status_update(port, rxq->id, rx_done, rx_done);
+
+	return rx_todo;
+}
+
+static inline void
+tx_desc_unmap_put(struct mvpp2_port *port, struct mvpp2_tx_queue *txq,
+		  struct mvpp2_tx_desc *desc)
+{
+	struct mvpp2_txq_pcpu *txq_pcpu = this_cpu_ptr(txq->pcpu);
+
+	dma_addr_t buf_dma_addr =
+		mvpp2_txdesc_dma_addr_get(port, desc);
+	size_t buf_sz =
+		mvpp2_txdesc_size_get(port, desc);
+	if (!IS_TSO_HEADER(txq_pcpu, buf_dma_addr))
+		dma_unmap_single(port->dev->dev.parent, buf_dma_addr,
+				 buf_sz, DMA_TO_DEVICE);
+	mvpp2_txq_desc_put(txq);
+}
+
+/* Handle tx fragmentation processing */
+static int mvpp2_tx_frag_process(struct mvpp2_port *port, struct sk_buff *skb,
+				 struct mvpp2_tx_queue *aggr_txq,
+				 struct mvpp2_tx_queue *txq)
+{
+	struct mvpp2_txq_pcpu *txq_pcpu = this_cpu_ptr(txq->pcpu);
+	struct mvpp2_tx_desc *tx_desc;
+	int i;
+	dma_addr_t buf_dma_addr;
+
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+		void *addr = page_address(frag->page.p) + frag->page_offset;
+
+		tx_desc = mvpp2_txq_next_desc_get(aggr_txq);
+		mvpp2_txdesc_txq_set(port, tx_desc, txq->id);
+		mvpp2_txdesc_size_set(port, tx_desc, frag->size);
+
+		buf_dma_addr = dma_map_single(port->dev->dev.parent, addr,
+					      frag->size, DMA_TO_DEVICE);
+		if (dma_mapping_error(port->dev->dev.parent, buf_dma_addr)) {
+			mvpp2_txq_desc_put(txq);
+			goto cleanup;
+		}
+
+		mvpp2_txdesc_dma_addr_set(port, tx_desc, buf_dma_addr);
+
+		if (i == (skb_shinfo(skb)->nr_frags - 1)) {
+			/* Last descriptor */
+			mvpp2_txdesc_cmd_set(port, tx_desc,
+					     MVPP2_TXD_L_DESC);
+			mvpp2_txq_inc_put(port, txq_pcpu, skb, tx_desc);
+		} else {
+			/* Descriptor in the middle: Not First, Not Last */
+			mvpp2_txdesc_cmd_set(port, tx_desc, 0);
+			mvpp2_txq_inc_put(port, txq_pcpu, NULL, tx_desc);
+		}
+	}
+
+	return 0;
+cleanup:
+	/* Release all descriptors that were used to map fragments of
+	 * this packet, as well as the corresponding DMA mappings
+	 */
+	for (i = i - 1; i >= 0; i--) {
+		tx_desc = txq->descs + i;
+		tx_desc_unmap_put(port, txq, tx_desc);
+	}
+
+	return -ENOMEM;
+}
+
+static inline void mvpp2_tso_put_hdr(struct sk_buff *skb,
+				     struct net_device *dev,
+				     struct mvpp2_tx_queue *txq,
+				     struct mvpp2_tx_queue *aggr_txq,
+				     struct mvpp2_txq_pcpu *txq_pcpu,
+				     int hdr_sz)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+	struct mvpp2_tx_desc *tx_desc = mvpp2_txq_next_desc_get(aggr_txq);
+	dma_addr_t addr;
+
+	mvpp2_txdesc_txq_set(port, tx_desc, txq->id);
+	mvpp2_txdesc_size_set(port, tx_desc, hdr_sz);
+
+	addr = txq_pcpu->tso_headers_dma +
+	       txq_pcpu->txq_put_index * TSO_HEADER_SIZE;
+	mvpp2_txdesc_dma_addr_set(port, tx_desc, addr);
+
+	mvpp2_txdesc_cmd_set(port, tx_desc, mvpp2_skb_tx_csum(port, skb) |
+					    MVPP2_TXD_F_DESC |
+					    MVPP2_TXD_PADDING_DISABLE);
+	mvpp2_txq_inc_put(port, txq_pcpu, NULL, tx_desc);
+}
+
+static inline int mvpp2_tso_put_data(struct sk_buff *skb,
+				     struct net_device *dev, struct tso_t *tso,
+				     struct mvpp2_tx_queue *txq,
+				     struct mvpp2_tx_queue *aggr_txq,
+				     struct mvpp2_txq_pcpu *txq_pcpu,
+				     int sz, bool left, bool last)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+	struct mvpp2_tx_desc *tx_desc = mvpp2_txq_next_desc_get(aggr_txq);
+	dma_addr_t buf_dma_addr;
+
+	mvpp2_txdesc_txq_set(port, tx_desc, txq->id);
+	mvpp2_txdesc_size_set(port, tx_desc, sz);
+
+	buf_dma_addr = dma_map_single(dev->dev.parent, tso->data, sz,
+				      DMA_TO_DEVICE);
+	if (unlikely(dma_mapping_error(dev->dev.parent, buf_dma_addr))) {
+		mvpp2_txq_desc_put(txq);
+		return -ENOMEM;
+	}
+
+	mvpp2_txdesc_dma_addr_set(port, tx_desc, buf_dma_addr);
+
+	if (!left) {
+		mvpp2_txdesc_cmd_set(port, tx_desc, MVPP2_TXD_L_DESC);
+		if (last) {
+			mvpp2_txq_inc_put(port, txq_pcpu, skb, tx_desc);
+			return 0;
+		}
+	} else {
+		mvpp2_txdesc_cmd_set(port, tx_desc, 0);
+	}
+
+	mvpp2_txq_inc_put(port, txq_pcpu, NULL, tx_desc);
+	return 0;
+}
+
+static int mvpp2_tx_tso(struct sk_buff *skb, struct net_device *dev,
+			struct mvpp2_tx_queue *txq,
+			struct mvpp2_tx_queue *aggr_txq,
+			struct mvpp2_txq_pcpu *txq_pcpu)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+	struct tso_t tso;
+	int hdr_sz = skb_transport_offset(skb) + tcp_hdrlen(skb);
+	int i, len, descs = 0;
+
+	/* Check number of available descriptors */
+	if (mvpp2_aggr_desc_num_check(port->priv, aggr_txq,
+				      tso_count_descs(skb)) ||
+	    mvpp2_txq_reserved_desc_num_proc(port->priv, txq, txq_pcpu,
+					     tso_count_descs(skb)))
+		return 0;
+
+	tso_start(skb, &tso);
+	len = skb->len - hdr_sz;
+	while (len > 0) {
+		int left = min_t(int, skb_shinfo(skb)->gso_size, len);
+		char *hdr = txq_pcpu->tso_headers +
+			    txq_pcpu->txq_put_index * TSO_HEADER_SIZE;
+
+		len -= left;
+		descs++;
+
+		tso_build_hdr(skb, hdr, &tso, left, len == 0);
+		mvpp2_tso_put_hdr(skb, dev, txq, aggr_txq, txq_pcpu, hdr_sz);
+
+		while (left > 0) {
+			int sz = min_t(int, tso.size, left);
+			left -= sz;
+			descs++;
+
+			if (mvpp2_tso_put_data(skb, dev, &tso, txq, aggr_txq,
+					       txq_pcpu, sz, left, len == 0))
+				goto release;
+			tso_build_data(skb, &tso, sz);
+		}
+	}
+
+	return descs;
+
+release:
+	for (i = descs - 1; i >= 0; i--) {
+		struct mvpp2_tx_desc *tx_desc = txq->descs + i;
+		tx_desc_unmap_put(port, txq, tx_desc);
+	}
+	return 0;
+}
+
+/* Main tx processing */
+static int mvpp2_tx(struct sk_buff *skb, struct net_device *dev)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+	struct mvpp2_tx_queue *txq, *aggr_txq;
+	struct mvpp2_txq_pcpu *txq_pcpu;
+	struct mvpp2_tx_desc *tx_desc;
+	dma_addr_t buf_dma_addr;
+	int frags = 0;
+	u16 txq_id;
+	u32 tx_cmd;
+
+	txq_id = skb_get_queue_mapping(skb);
+	txq = port->txqs[txq_id];
+	txq_pcpu = this_cpu_ptr(txq->pcpu);
+	aggr_txq = &port->priv->aggr_txqs[smp_processor_id()];
+
+	if (skb_is_gso(skb)) {
+		frags = mvpp2_tx_tso(skb, dev, txq, aggr_txq, txq_pcpu);
+		goto out;
+	}
+	frags = skb_shinfo(skb)->nr_frags + 1;
+
+	/* Check number of available descriptors */
+	if (mvpp2_aggr_desc_num_check(port->priv, aggr_txq, frags) ||
+	    mvpp2_txq_reserved_desc_num_proc(port->priv, txq,
+					     txq_pcpu, frags)) {
+		frags = 0;
+		goto out;
+	}
+
+	/* Get a descriptor for the first part of the packet */
+	tx_desc = mvpp2_txq_next_desc_get(aggr_txq);
+	mvpp2_txdesc_txq_set(port, tx_desc, txq->id);
+	mvpp2_txdesc_size_set(port, tx_desc, skb_headlen(skb));
+
+	buf_dma_addr = dma_map_single(dev->dev.parent, skb->data,
+				      skb_headlen(skb), DMA_TO_DEVICE);
+	if (unlikely(dma_mapping_error(dev->dev.parent, buf_dma_addr))) {
+		mvpp2_txq_desc_put(txq);
+		frags = 0;
+		goto out;
+	}
+
+	mvpp2_txdesc_dma_addr_set(port, tx_desc, buf_dma_addr);
+
+	tx_cmd = mvpp2_skb_tx_csum(port, skb);
+
+	if (frags == 1) {
+		/* First and Last descriptor */
+		tx_cmd |= MVPP2_TXD_F_DESC | MVPP2_TXD_L_DESC;
+		mvpp2_txdesc_cmd_set(port, tx_desc, tx_cmd);
+		mvpp2_txq_inc_put(port, txq_pcpu, skb, tx_desc);
+	} else {
+		/* First but not Last */
+		tx_cmd |= MVPP2_TXD_F_DESC | MVPP2_TXD_PADDING_DISABLE;
+		mvpp2_txdesc_cmd_set(port, tx_desc, tx_cmd);
+		mvpp2_txq_inc_put(port, txq_pcpu, NULL, tx_desc);
+
+		/* Continue with other skb fragments */
+		if (mvpp2_tx_frag_process(port, skb, aggr_txq, txq)) {
+			tx_desc_unmap_put(port, txq, tx_desc);
+			frags = 0;
+		}
+	}
+
+out:
+	if (frags > 0) {
+		struct mvpp2_pcpu_stats *stats = this_cpu_ptr(port->stats);
+		struct netdev_queue *nq = netdev_get_tx_queue(dev, txq_id);
+
+		txq_pcpu->reserved_num -= frags;
+		txq_pcpu->count += frags;
+		aggr_txq->count += frags;
+
+		/* Enable transmit */
+		wmb();
+		mvpp2_aggr_txq_pend_desc_add(port, frags);
+
+		if (txq_pcpu->count >= txq_pcpu->stop_threshold)
+			netif_tx_stop_queue(nq);
+
+		u64_stats_update_begin(&stats->syncp);
+		stats->tx_packets++;
+		stats->tx_bytes += skb->len;
+		u64_stats_update_end(&stats->syncp);
+	} else {
+		dev->stats.tx_dropped++;
+		dev_kfree_skb_any(skb);
+	}
+
+	/* Finalize TX processing */
+	if (!port->has_tx_irqs && txq_pcpu->count >= txq->done_pkts_coal)
+		mvpp2_txq_done(port, txq, txq_pcpu);
+
+	/* Set the timer in case not all frags were processed */
+	if (!port->has_tx_irqs && txq_pcpu->count <= frags &&
+	    txq_pcpu->count > 0) {
+		struct mvpp2_port_pcpu *port_pcpu = this_cpu_ptr(port->pcpu);
+
+		mvpp2_timer_set(port_pcpu);
+	}
+
+	return NETDEV_TX_OK;
+}
+
+static inline void mvpp2_cause_error(struct net_device *dev, int cause)
+{
+	if (cause & MVPP2_CAUSE_FCS_ERR_MASK)
+		netdev_err(dev, "FCS error\n");
+	if (cause & MVPP2_CAUSE_RX_FIFO_OVERRUN_MASK)
+		netdev_err(dev, "rx fifo overrun error\n");
+	if (cause & MVPP2_CAUSE_TX_FIFO_UNDERRUN_MASK)
+		netdev_err(dev, "tx fifo underrun error\n");
+}
+
+static int mvpp2_poll(struct napi_struct *napi, int budget)
+{
+	u32 cause_rx_tx, cause_rx, cause_tx, cause_misc;
+	int rx_done = 0;
+	struct mvpp2_port *port = netdev_priv(napi->dev);
+	struct mvpp2_queue_vector *qv;
+	int cpu = smp_processor_id();
+
+	qv = container_of(napi, struct mvpp2_queue_vector, napi);
+
+	/* Rx/Tx cause register
+	 *
+	 * Bits 0-15: each bit indicates received packets on the Rx queue
+	 * (bit 0 is for Rx queue 0).
+	 *
+	 * Bits 16-23: each bit indicates transmitted packets on the Tx queue
+	 * (bit 16 is for Tx queue 0).
+	 *
+	 * Each CPU has its own Rx/Tx cause register
+	 */
+	cause_rx_tx = mvpp2_percpu_read_relaxed(port->priv, qv->sw_thread_id,
+						MVPP2_ISR_RX_TX_CAUSE_REG(port->id));
+
+	cause_misc = cause_rx_tx & MVPP2_CAUSE_MISC_SUM_MASK;
+	if (cause_misc) {
+		mvpp2_cause_error(port->dev, cause_misc);
+
+		/* Clear the cause register */
+		mvpp2_write(port->priv, MVPP2_ISR_MISC_CAUSE_REG, 0);
+		mvpp2_percpu_write(port->priv, cpu,
+				   MVPP2_ISR_RX_TX_CAUSE_REG(port->id),
+				   cause_rx_tx & ~MVPP2_CAUSE_MISC_SUM_MASK);
+	}
+
+	cause_tx = cause_rx_tx & MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_MASK;
+	if (cause_tx) {
+		cause_tx >>= MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_OFFSET;
+		mvpp2_tx_done(port, cause_tx, qv->sw_thread_id);
+	}
+
+	/* Process RX packets */
+	cause_rx = cause_rx_tx & MVPP2_CAUSE_RXQ_OCCUP_DESC_ALL_MASK;
+	cause_rx <<= qv->first_rxq;
+	cause_rx |= qv->pending_cause_rx;
+	while (cause_rx && budget > 0) {
+		int count;
+		struct mvpp2_rx_queue *rxq;
+
+		rxq = mvpp2_get_rx_queue(port, cause_rx);
+		if (!rxq)
+			break;
+
+		count = mvpp2_rx(port, napi, budget, rxq);
+		rx_done += count;
+		budget -= count;
+		if (budget > 0) {
+			/* Clear the bit associated to this Rx queue
+			 * so that next iteration will continue from
+			 * the next Rx queue.
+			 */
+			cause_rx &= ~(1 << rxq->logic_rxq);
+		}
+	}
+
+	if (budget > 0) {
+		cause_rx = 0;
+		napi_complete_done(napi, rx_done);
+
+		mvpp2_qvec_interrupt_enable(qv);
+	}
+	qv->pending_cause_rx = cause_rx;
+	return rx_done;
+}
+
+static void mvpp22_mode_reconfigure(struct mvpp2_port *port)
+{
+	u32 ctrl3;
+
+	/* comphy reconfiguration */
+	mvpp22_comphy_init(port);
+
+	/* gop reconfiguration */
+	mvpp22_gop_init(port);
+
+	/* Only GOP port 0 has an XLG MAC */
+	if (port->gop_id == 0) {
+		ctrl3 = readl(port->base + MVPP22_XLG_CTRL3_REG);
+		ctrl3 &= ~MVPP22_XLG_CTRL3_MACMODESELECT_MASK;
+
+		if (port->phy_interface == PHY_INTERFACE_MODE_XAUI ||
+		    port->phy_interface == PHY_INTERFACE_MODE_10GKR)
+			ctrl3 |= MVPP22_XLG_CTRL3_MACMODESELECT_10G;
+		else
+			ctrl3 |= MVPP22_XLG_CTRL3_MACMODESELECT_GMAC;
+
+		writel(ctrl3, port->base + MVPP22_XLG_CTRL3_REG);
+	}
+
+	if (port->gop_id == 0 &&
+	    (port->phy_interface == PHY_INTERFACE_MODE_XAUI ||
+	     port->phy_interface == PHY_INTERFACE_MODE_10GKR))
+		mvpp2_xlg_max_rx_size_set(port);
+	else
+		mvpp2_gmac_max_rx_size_set(port);
+}
+
+/* Set hw internals when starting port */
+static void mvpp2_start_dev(struct mvpp2_port *port)
+{
+	int i;
+
+	mvpp2_txp_max_tx_size_set(port);
+
+	for (i = 0; i < port->nqvecs; i++)
+		napi_enable(&port->qvecs[i].napi);
+
+	/* Enable interrupts on all CPUs */
+	mvpp2_interrupts_enable(port);
+
+	if (port->priv->hw_version == MVPP22)
+		mvpp22_mode_reconfigure(port);
+
+	if (port->phylink) {
+		phylink_start(port->phylink);
+	} else {
+		/* Phylink isn't used as of now for ACPI, so the MAC has to be
+		 * configured manually when the interface is started. This will
+		 * be removed as soon as the phylink ACPI support lands in.
+		 */
+		struct phylink_link_state state = {
+			.interface = port->phy_interface,
+			.link = 1,
+		};
+		mvpp2_mac_config(port->dev, MLO_AN_INBAND, &state);
+	}
+
+	netif_tx_start_all_queues(port->dev);
+}
+
+/* Set hw internals when stopping port */
+static void mvpp2_stop_dev(struct mvpp2_port *port)
+{
+	int i;
+
+	/* Disable interrupts on all CPUs */
+	mvpp2_interrupts_disable(port);
+
+	for (i = 0; i < port->nqvecs; i++)
+		napi_disable(&port->qvecs[i].napi);
+
+	if (port->phylink)
+		phylink_stop(port->phylink);
+	phy_power_off(port->comphy);
+}
+
+static int mvpp2_check_ringparam_valid(struct net_device *dev,
+				       struct ethtool_ringparam *ring)
+{
+	u16 new_rx_pending = ring->rx_pending;
+	u16 new_tx_pending = ring->tx_pending;
+
+	if (ring->rx_pending == 0 || ring->tx_pending == 0)
+		return -EINVAL;
+
+	if (ring->rx_pending > MVPP2_MAX_RXD_MAX)
+		new_rx_pending = MVPP2_MAX_RXD_MAX;
+	else if (!IS_ALIGNED(ring->rx_pending, 16))
+		new_rx_pending = ALIGN(ring->rx_pending, 16);
+
+	if (ring->tx_pending > MVPP2_MAX_TXD_MAX)
+		new_tx_pending = MVPP2_MAX_TXD_MAX;
+	else if (!IS_ALIGNED(ring->tx_pending, 32))
+		new_tx_pending = ALIGN(ring->tx_pending, 32);
+
+	/* The Tx ring size cannot be smaller than the minimum number of
+	 * descriptors needed for TSO.
+	 */
+	if (new_tx_pending < MVPP2_MAX_SKB_DESCS)
+		new_tx_pending = ALIGN(MVPP2_MAX_SKB_DESCS, 32);
+
+	if (ring->rx_pending != new_rx_pending) {
+		netdev_info(dev, "illegal Rx ring size value %d, round to %d\n",
+			    ring->rx_pending, new_rx_pending);
+		ring->rx_pending = new_rx_pending;
+	}
+
+	if (ring->tx_pending != new_tx_pending) {
+		netdev_info(dev, "illegal Tx ring size value %d, round to %d\n",
+			    ring->tx_pending, new_tx_pending);
+		ring->tx_pending = new_tx_pending;
+	}
+
+	return 0;
+}
+
+static void mvpp21_get_mac_address(struct mvpp2_port *port, unsigned char *addr)
+{
+	u32 mac_addr_l, mac_addr_m, mac_addr_h;
+
+	mac_addr_l = readl(port->base + MVPP2_GMAC_CTRL_1_REG);
+	mac_addr_m = readl(port->priv->lms_base + MVPP2_SRC_ADDR_MIDDLE);
+	mac_addr_h = readl(port->priv->lms_base + MVPP2_SRC_ADDR_HIGH);
+	addr[0] = (mac_addr_h >> 24) & 0xFF;
+	addr[1] = (mac_addr_h >> 16) & 0xFF;
+	addr[2] = (mac_addr_h >> 8) & 0xFF;
+	addr[3] = mac_addr_h & 0xFF;
+	addr[4] = mac_addr_m & 0xFF;
+	addr[5] = (mac_addr_l >> MVPP2_GMAC_SA_LOW_OFFS) & 0xFF;
+}
+
+static int mvpp2_irqs_init(struct mvpp2_port *port)
+{
+	int err, i;
+
+	for (i = 0; i < port->nqvecs; i++) {
+		struct mvpp2_queue_vector *qv = port->qvecs + i;
+
+		if (qv->type == MVPP2_QUEUE_VECTOR_PRIVATE)
+			irq_set_status_flags(qv->irq, IRQ_NO_BALANCING);
+
+		err = request_irq(qv->irq, mvpp2_isr, 0, port->dev->name, qv);
+		if (err)
+			goto err;
+
+		if (qv->type == MVPP2_QUEUE_VECTOR_PRIVATE)
+			irq_set_affinity_hint(qv->irq,
+					      cpumask_of(qv->sw_thread_id));
+	}
+
+	return 0;
+err:
+	for (i = 0; i < port->nqvecs; i++) {
+		struct mvpp2_queue_vector *qv = port->qvecs + i;
+
+		irq_set_affinity_hint(qv->irq, NULL);
+		free_irq(qv->irq, qv);
+	}
+
+	return err;
+}
+
+static void mvpp2_irqs_deinit(struct mvpp2_port *port)
+{
+	int i;
+
+	for (i = 0; i < port->nqvecs; i++) {
+		struct mvpp2_queue_vector *qv = port->qvecs + i;
+
+		irq_set_affinity_hint(qv->irq, NULL);
+		irq_clear_status_flags(qv->irq, IRQ_NO_BALANCING);
+		free_irq(qv->irq, qv);
+	}
+}
+
+static int mvpp2_open(struct net_device *dev)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+	struct mvpp2 *priv = port->priv;
+	unsigned char mac_bcast[ETH_ALEN] = {
+			0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+	bool valid = false;
+	int err;
+
+	err = mvpp2_prs_mac_da_accept(port, mac_bcast, true);
+	if (err) {
+		netdev_err(dev, "mvpp2_prs_mac_da_accept BC failed\n");
+		return err;
+	}
+	err = mvpp2_prs_mac_da_accept(port, dev->dev_addr, true);
+	if (err) {
+		netdev_err(dev, "mvpp2_prs_mac_da_accept own addr failed\n");
+		return err;
+	}
+	err = mvpp2_prs_tag_mode_set(port->priv, port->id, MVPP2_TAG_TYPE_MH);
+	if (err) {
+		netdev_err(dev, "mvpp2_prs_tag_mode_set failed\n");
+		return err;
+	}
+	err = mvpp2_prs_def_flow(port);
+	if (err) {
+		netdev_err(dev, "mvpp2_prs_def_flow failed\n");
+		return err;
+	}
+
+	/* Allocate the Rx/Tx queues */
+	err = mvpp2_setup_rxqs(port);
+	if (err) {
+		netdev_err(port->dev, "cannot allocate Rx queues\n");
+		return err;
+	}
+
+	err = mvpp2_setup_txqs(port);
+	if (err) {
+		netdev_err(port->dev, "cannot allocate Tx queues\n");
+		goto err_cleanup_rxqs;
+	}
+
+	err = mvpp2_irqs_init(port);
+	if (err) {
+		netdev_err(port->dev, "cannot init IRQs\n");
+		goto err_cleanup_txqs;
+	}
+
+	/* Phylink isn't supported yet in ACPI mode */
+	if (port->of_node) {
+		err = phylink_of_phy_connect(port->phylink, port->of_node, 0);
+		if (err) {
+			netdev_err(port->dev, "could not attach PHY (%d)\n",
+				   err);
+			goto err_free_irq;
+		}
+
+		valid = true;
+	}
+
+	if (priv->hw_version == MVPP22 && port->link_irq && !port->phylink) {
+		err = request_irq(port->link_irq, mvpp2_link_status_isr, 0,
+				  dev->name, port);
+		if (err) {
+			netdev_err(port->dev, "cannot request link IRQ %d\n",
+				   port->link_irq);
+			goto err_free_irq;
+		}
+
+		mvpp22_gop_setup_irq(port);
+
+		/* In default link is down */
+		netif_carrier_off(port->dev);
+
+		valid = true;
+	} else {
+		port->link_irq = 0;
+	}
+
+	if (!valid) {
+		netdev_err(port->dev,
+			   "invalid configuration: no dt or link IRQ");
+		goto err_free_irq;
+	}
+
+	/* Unmask interrupts on all CPUs */
+	on_each_cpu(mvpp2_interrupts_unmask, port, 1);
+	mvpp2_shared_interrupt_mask_unmask(port, false);
+
+	mvpp2_start_dev(port);
+
+	if (priv->hw_version == MVPP22)
+		mvpp22_init_rss(port);
+
+	/* Start hardware statistics gathering */
+	queue_delayed_work(priv->stats_queue, &port->stats_work,
+			   MVPP2_MIB_COUNTERS_STATS_DELAY);
+
+	return 0;
+
+err_free_irq:
+	mvpp2_irqs_deinit(port);
+err_cleanup_txqs:
+	mvpp2_cleanup_txqs(port);
+err_cleanup_rxqs:
+	mvpp2_cleanup_rxqs(port);
+	return err;
+}
+
+static int mvpp2_stop(struct net_device *dev)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+	struct mvpp2_port_pcpu *port_pcpu;
+	int cpu;
+
+	mvpp2_stop_dev(port);
+
+	/* Mask interrupts on all CPUs */
+	on_each_cpu(mvpp2_interrupts_mask, port, 1);
+	mvpp2_shared_interrupt_mask_unmask(port, true);
+
+	if (port->phylink)
+		phylink_disconnect_phy(port->phylink);
+	if (port->link_irq)
+		free_irq(port->link_irq, port);
+
+	mvpp2_irqs_deinit(port);
+	if (!port->has_tx_irqs) {
+		for_each_present_cpu(cpu) {
+			port_pcpu = per_cpu_ptr(port->pcpu, cpu);
+
+			hrtimer_cancel(&port_pcpu->tx_done_timer);
+			port_pcpu->timer_scheduled = false;
+			tasklet_kill(&port_pcpu->tx_done_tasklet);
+		}
+	}
+	mvpp2_cleanup_rxqs(port);
+	mvpp2_cleanup_txqs(port);
+
+	cancel_delayed_work_sync(&port->stats_work);
+
+	return 0;
+}
+
+static int mvpp2_prs_mac_da_accept_list(struct mvpp2_port *port,
+					struct netdev_hw_addr_list *list)
+{
+	struct netdev_hw_addr *ha;
+	int ret;
+
+	netdev_hw_addr_list_for_each(ha, list) {
+		ret = mvpp2_prs_mac_da_accept(port, ha->addr, true);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static void mvpp2_set_rx_promisc(struct mvpp2_port *port, bool enable)
+{
+	if (!enable && (port->dev->features & NETIF_F_HW_VLAN_CTAG_FILTER))
+		mvpp2_prs_vid_enable_filtering(port);
+	else
+		mvpp2_prs_vid_disable_filtering(port);
+
+	mvpp2_prs_mac_promisc_set(port->priv, port->id,
+				  MVPP2_PRS_L2_UNI_CAST, enable);
+
+	mvpp2_prs_mac_promisc_set(port->priv, port->id,
+				  MVPP2_PRS_L2_MULTI_CAST, enable);
+}
+
+static void mvpp2_set_rx_mode(struct net_device *dev)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+
+	/* Clear the whole UC and MC list */
+	mvpp2_prs_mac_del_all(port);
+
+	if (dev->flags & IFF_PROMISC) {
+		mvpp2_set_rx_promisc(port, true);
+		return;
+	}
+
+	mvpp2_set_rx_promisc(port, false);
+
+	if (netdev_uc_count(dev) > MVPP2_PRS_MAC_UC_FILT_MAX ||
+	    mvpp2_prs_mac_da_accept_list(port, &dev->uc))
+		mvpp2_prs_mac_promisc_set(port->priv, port->id,
+					  MVPP2_PRS_L2_UNI_CAST, true);
+
+	if (dev->flags & IFF_ALLMULTI) {
+		mvpp2_prs_mac_promisc_set(port->priv, port->id,
+					  MVPP2_PRS_L2_MULTI_CAST, true);
+		return;
+	}
+
+	if (netdev_mc_count(dev) > MVPP2_PRS_MAC_MC_FILT_MAX ||
+	    mvpp2_prs_mac_da_accept_list(port, &dev->mc))
+		mvpp2_prs_mac_promisc_set(port->priv, port->id,
+					  MVPP2_PRS_L2_MULTI_CAST, true);
+}
+
+static int mvpp2_set_mac_address(struct net_device *dev, void *p)
+{
+	const struct sockaddr *addr = p;
+	int err;
+
+	if (!is_valid_ether_addr(addr->sa_data))
+		return -EADDRNOTAVAIL;
+
+	err = mvpp2_prs_update_mac_da(dev, addr->sa_data);
+	if (err) {
+		/* Reconfigure parser accept the original MAC address */
+		mvpp2_prs_update_mac_da(dev, dev->dev_addr);
+		netdev_err(dev, "failed to change MAC address\n");
+	}
+	return err;
+}
+
+static int mvpp2_change_mtu(struct net_device *dev, int mtu)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+	int err;
+
+	if (!IS_ALIGNED(MVPP2_RX_PKT_SIZE(mtu), 8)) {
+		netdev_info(dev, "illegal MTU value %d, round to %d\n", mtu,
+			    ALIGN(MVPP2_RX_PKT_SIZE(mtu), 8));
+		mtu = ALIGN(MVPP2_RX_PKT_SIZE(mtu), 8);
+	}
+
+	if (!netif_running(dev)) {
+		err = mvpp2_bm_update_mtu(dev, mtu);
+		if (!err) {
+			port->pkt_size =  MVPP2_RX_PKT_SIZE(mtu);
+			return 0;
+		}
+
+		/* Reconfigure BM to the original MTU */
+		err = mvpp2_bm_update_mtu(dev, dev->mtu);
+		if (err)
+			goto log_error;
+	}
+
+	mvpp2_stop_dev(port);
+
+	err = mvpp2_bm_update_mtu(dev, mtu);
+	if (!err) {
+		port->pkt_size =  MVPP2_RX_PKT_SIZE(mtu);
+		goto out_start;
+	}
+
+	/* Reconfigure BM to the original MTU */
+	err = mvpp2_bm_update_mtu(dev, dev->mtu);
+	if (err)
+		goto log_error;
+
+out_start:
+	mvpp2_start_dev(port);
+	mvpp2_egress_enable(port);
+	mvpp2_ingress_enable(port);
+
+	return 0;
+log_error:
+	netdev_err(dev, "failed to change MTU\n");
+	return err;
+}
+
+static void
+mvpp2_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+	unsigned int start;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct mvpp2_pcpu_stats *cpu_stats;
+		u64 rx_packets;
+		u64 rx_bytes;
+		u64 tx_packets;
+		u64 tx_bytes;
+
+		cpu_stats = per_cpu_ptr(port->stats, cpu);
+		do {
+			start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+			rx_packets = cpu_stats->rx_packets;
+			rx_bytes   = cpu_stats->rx_bytes;
+			tx_packets = cpu_stats->tx_packets;
+			tx_bytes   = cpu_stats->tx_bytes;
+		} while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));
+
+		stats->rx_packets += rx_packets;
+		stats->rx_bytes   += rx_bytes;
+		stats->tx_packets += tx_packets;
+		stats->tx_bytes   += tx_bytes;
+	}
+
+	stats->rx_errors	= dev->stats.rx_errors;
+	stats->rx_dropped	= dev->stats.rx_dropped;
+	stats->tx_dropped	= dev->stats.tx_dropped;
+}
+
+static int mvpp2_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+
+	if (!port->phylink)
+		return -ENOTSUPP;
+
+	return phylink_mii_ioctl(port->phylink, ifr, cmd);
+}
+
+static int mvpp2_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+	int ret;
+
+	ret = mvpp2_prs_vid_entry_add(port, vid);
+	if (ret)
+		netdev_err(dev, "rx-vlan-filter offloading cannot accept more than %d VIDs per port\n",
+			   MVPP2_PRS_VLAN_FILT_MAX - 1);
+	return ret;
+}
+
+static int mvpp2_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+
+	mvpp2_prs_vid_entry_remove(port, vid);
+	return 0;
+}
+
+static int mvpp2_set_features(struct net_device *dev,
+			      netdev_features_t features)
+{
+	netdev_features_t changed = dev->features ^ features;
+	struct mvpp2_port *port = netdev_priv(dev);
+
+	if (changed & NETIF_F_HW_VLAN_CTAG_FILTER) {
+		if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
+			mvpp2_prs_vid_enable_filtering(port);
+		} else {
+			/* Invalidate all registered VID filters for this
+			 * port
+			 */
+			mvpp2_prs_vid_remove_all(port);
+
+			mvpp2_prs_vid_disable_filtering(port);
+		}
+	}
+
+	return 0;
+}
+
+/* Ethtool methods */
+
+static int mvpp2_ethtool_nway_reset(struct net_device *dev)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+
+	if (!port->phylink)
+		return -ENOTSUPP;
+
+	return phylink_ethtool_nway_reset(port->phylink);
+}
+
+/* Set interrupt coalescing for ethtools */
+static int mvpp2_ethtool_set_coalesce(struct net_device *dev,
+				      struct ethtool_coalesce *c)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+	int queue;
+
+	for (queue = 0; queue < port->nrxqs; queue++) {
+		struct mvpp2_rx_queue *rxq = port->rxqs[queue];
+
+		rxq->time_coal = c->rx_coalesce_usecs;
+		rxq->pkts_coal = c->rx_max_coalesced_frames;
+		mvpp2_rx_pkts_coal_set(port, rxq);
+		mvpp2_rx_time_coal_set(port, rxq);
+	}
+
+	if (port->has_tx_irqs) {
+		port->tx_time_coal = c->tx_coalesce_usecs;
+		mvpp2_tx_time_coal_set(port);
+	}
+
+	for (queue = 0; queue < port->ntxqs; queue++) {
+		struct mvpp2_tx_queue *txq = port->txqs[queue];
+
+		txq->done_pkts_coal = c->tx_max_coalesced_frames;
+
+		if (port->has_tx_irqs)
+			mvpp2_tx_pkts_coal_set(port, txq);
+	}
+
+	return 0;
+}
+
+/* get coalescing for ethtools */
+static int mvpp2_ethtool_get_coalesce(struct net_device *dev,
+				      struct ethtool_coalesce *c)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+
+	c->rx_coalesce_usecs       = port->rxqs[0]->time_coal;
+	c->rx_max_coalesced_frames = port->rxqs[0]->pkts_coal;
+	c->tx_max_coalesced_frames = port->txqs[0]->done_pkts_coal;
+	c->tx_coalesce_usecs       = port->tx_time_coal;
+	return 0;
+}
+
+static void mvpp2_ethtool_get_drvinfo(struct net_device *dev,
+				      struct ethtool_drvinfo *drvinfo)
+{
+	strlcpy(drvinfo->driver, MVPP2_DRIVER_NAME,
+		sizeof(drvinfo->driver));
+	strlcpy(drvinfo->version, MVPP2_DRIVER_VERSION,
+		sizeof(drvinfo->version));
+	strlcpy(drvinfo->bus_info, dev_name(&dev->dev),
+		sizeof(drvinfo->bus_info));
+}
+
+static void mvpp2_ethtool_get_ringparam(struct net_device *dev,
+					struct ethtool_ringparam *ring)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+
+	ring->rx_max_pending = MVPP2_MAX_RXD_MAX;
+	ring->tx_max_pending = MVPP2_MAX_TXD_MAX;
+	ring->rx_pending = port->rx_ring_size;
+	ring->tx_pending = port->tx_ring_size;
+}
+
+static int mvpp2_ethtool_set_ringparam(struct net_device *dev,
+				       struct ethtool_ringparam *ring)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+	u16 prev_rx_ring_size = port->rx_ring_size;
+	u16 prev_tx_ring_size = port->tx_ring_size;
+	int err;
+
+	err = mvpp2_check_ringparam_valid(dev, ring);
+	if (err)
+		return err;
+
+	if (!netif_running(dev)) {
+		port->rx_ring_size = ring->rx_pending;
+		port->tx_ring_size = ring->tx_pending;
+		return 0;
+	}
+
+	/* The interface is running, so we have to force a
+	 * reallocation of the queues
+	 */
+	mvpp2_stop_dev(port);
+	mvpp2_cleanup_rxqs(port);
+	mvpp2_cleanup_txqs(port);
+
+	port->rx_ring_size = ring->rx_pending;
+	port->tx_ring_size = ring->tx_pending;
+
+	err = mvpp2_setup_rxqs(port);
+	if (err) {
+		/* Reallocate Rx queues with the original ring size */
+		port->rx_ring_size = prev_rx_ring_size;
+		ring->rx_pending = prev_rx_ring_size;
+		err = mvpp2_setup_rxqs(port);
+		if (err)
+			goto err_out;
+	}
+	err = mvpp2_setup_txqs(port);
+	if (err) {
+		/* Reallocate Tx queues with the original ring size */
+		port->tx_ring_size = prev_tx_ring_size;
+		ring->tx_pending = prev_tx_ring_size;
+		err = mvpp2_setup_txqs(port);
+		if (err)
+			goto err_clean_rxqs;
+	}
+
+	mvpp2_start_dev(port);
+	mvpp2_egress_enable(port);
+	mvpp2_ingress_enable(port);
+
+	return 0;
+
+err_clean_rxqs:
+	mvpp2_cleanup_rxqs(port);
+err_out:
+	netdev_err(dev, "failed to change ring parameters");
+	return err;
+}
+
+static void mvpp2_ethtool_get_pause_param(struct net_device *dev,
+					  struct ethtool_pauseparam *pause)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+
+	if (!port->phylink)
+		return;
+
+	phylink_ethtool_get_pauseparam(port->phylink, pause);
+}
+
+static int mvpp2_ethtool_set_pause_param(struct net_device *dev,
+					 struct ethtool_pauseparam *pause)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+
+	if (!port->phylink)
+		return -ENOTSUPP;
+
+	return phylink_ethtool_set_pauseparam(port->phylink, pause);
+}
+
+static int mvpp2_ethtool_get_link_ksettings(struct net_device *dev,
+					    struct ethtool_link_ksettings *cmd)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+
+	if (!port->phylink)
+		return -ENOTSUPP;
+
+	return phylink_ethtool_ksettings_get(port->phylink, cmd);
+}
+
+static int mvpp2_ethtool_set_link_ksettings(struct net_device *dev,
+					    const struct ethtool_link_ksettings *cmd)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+
+	if (!port->phylink)
+		return -ENOTSUPP;
+
+	return phylink_ethtool_ksettings_set(port->phylink, cmd);
+}
+
+/* Device ops */
+
+static const struct net_device_ops mvpp2_netdev_ops = {
+	.ndo_open		= mvpp2_open,
+	.ndo_stop		= mvpp2_stop,
+	.ndo_start_xmit		= mvpp2_tx,
+	.ndo_set_rx_mode	= mvpp2_set_rx_mode,
+	.ndo_set_mac_address	= mvpp2_set_mac_address,
+	.ndo_change_mtu		= mvpp2_change_mtu,
+	.ndo_get_stats64	= mvpp2_get_stats64,
+	.ndo_do_ioctl		= mvpp2_ioctl,
+	.ndo_vlan_rx_add_vid	= mvpp2_vlan_rx_add_vid,
+	.ndo_vlan_rx_kill_vid	= mvpp2_vlan_rx_kill_vid,
+	.ndo_set_features	= mvpp2_set_features,
+};
+
+static const struct ethtool_ops mvpp2_eth_tool_ops = {
+	.nway_reset		= mvpp2_ethtool_nway_reset,
+	.get_link		= ethtool_op_get_link,
+	.set_coalesce		= mvpp2_ethtool_set_coalesce,
+	.get_coalesce		= mvpp2_ethtool_get_coalesce,
+	.get_drvinfo		= mvpp2_ethtool_get_drvinfo,
+	.get_ringparam		= mvpp2_ethtool_get_ringparam,
+	.set_ringparam		= mvpp2_ethtool_set_ringparam,
+	.get_strings		= mvpp2_ethtool_get_strings,
+	.get_ethtool_stats	= mvpp2_ethtool_get_stats,
+	.get_sset_count		= mvpp2_ethtool_get_sset_count,
+	.get_pauseparam		= mvpp2_ethtool_get_pause_param,
+	.set_pauseparam		= mvpp2_ethtool_set_pause_param,
+	.get_link_ksettings	= mvpp2_ethtool_get_link_ksettings,
+	.set_link_ksettings	= mvpp2_ethtool_set_link_ksettings,
+};
+
+/* Used for PPv2.1, or PPv2.2 with the old Device Tree binding that
+ * had a single IRQ defined per-port.
+ */
+static int mvpp2_simple_queue_vectors_init(struct mvpp2_port *port,
+					   struct device_node *port_node)
+{
+	struct mvpp2_queue_vector *v = &port->qvecs[0];
+
+	v->first_rxq = 0;
+	v->nrxqs = port->nrxqs;
+	v->type = MVPP2_QUEUE_VECTOR_SHARED;
+	v->sw_thread_id = 0;
+	v->sw_thread_mask = *cpumask_bits(cpu_online_mask);
+	v->port = port;
+	v->irq = irq_of_parse_and_map(port_node, 0);
+	if (v->irq <= 0)
+		return -EINVAL;
+	netif_napi_add(port->dev, &v->napi, mvpp2_poll,
+		       NAPI_POLL_WEIGHT);
+
+	port->nqvecs = 1;
+
+	return 0;
+}
+
+static int mvpp2_multi_queue_vectors_init(struct mvpp2_port *port,
+					  struct device_node *port_node)
+{
+	struct mvpp2_queue_vector *v;
+	int i, ret;
+
+	port->nqvecs = num_possible_cpus();
+	if (queue_mode == MVPP2_QDIST_SINGLE_MODE)
+		port->nqvecs += 1;
+
+	for (i = 0; i < port->nqvecs; i++) {
+		char irqname[16];
+
+		v = port->qvecs + i;
+
+		v->port = port;
+		v->type = MVPP2_QUEUE_VECTOR_PRIVATE;
+		v->sw_thread_id = i;
+		v->sw_thread_mask = BIT(i);
+
+		snprintf(irqname, sizeof(irqname), "tx-cpu%d", i);
+
+		if (queue_mode == MVPP2_QDIST_MULTI_MODE) {
+			v->first_rxq = i * MVPP2_DEFAULT_RXQ;
+			v->nrxqs = MVPP2_DEFAULT_RXQ;
+		} else if (queue_mode == MVPP2_QDIST_SINGLE_MODE &&
+			   i == (port->nqvecs - 1)) {
+			v->first_rxq = 0;
+			v->nrxqs = port->nrxqs;
+			v->type = MVPP2_QUEUE_VECTOR_SHARED;
+			strncpy(irqname, "rx-shared", sizeof(irqname));
+		}
+
+		if (port_node)
+			v->irq = of_irq_get_byname(port_node, irqname);
+		else
+			v->irq = fwnode_irq_get(port->fwnode, i);
+		if (v->irq <= 0) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		netif_napi_add(port->dev, &v->napi, mvpp2_poll,
+			       NAPI_POLL_WEIGHT);
+	}
+
+	return 0;
+
+err:
+	for (i = 0; i < port->nqvecs; i++)
+		irq_dispose_mapping(port->qvecs[i].irq);
+	return ret;
+}
+
+static int mvpp2_queue_vectors_init(struct mvpp2_port *port,
+				    struct device_node *port_node)
+{
+	if (port->has_tx_irqs)
+		return mvpp2_multi_queue_vectors_init(port, port_node);
+	else
+		return mvpp2_simple_queue_vectors_init(port, port_node);
+}
+
+static void mvpp2_queue_vectors_deinit(struct mvpp2_port *port)
+{
+	int i;
+
+	for (i = 0; i < port->nqvecs; i++)
+		irq_dispose_mapping(port->qvecs[i].irq);
+}
+
+/* Configure Rx queue group interrupt for this port */
+static void mvpp2_rx_irqs_setup(struct mvpp2_port *port)
+{
+	struct mvpp2 *priv = port->priv;
+	u32 val;
+	int i;
+
+	if (priv->hw_version == MVPP21) {
+		mvpp2_write(priv, MVPP21_ISR_RXQ_GROUP_REG(port->id),
+			    port->nrxqs);
+		return;
+	}
+
+	/* Handle the more complicated PPv2.2 case */
+	for (i = 0; i < port->nqvecs; i++) {
+		struct mvpp2_queue_vector *qv = port->qvecs + i;
+
+		if (!qv->nrxqs)
+			continue;
+
+		val = qv->sw_thread_id;
+		val |= port->id << MVPP22_ISR_RXQ_GROUP_INDEX_GROUP_OFFSET;
+		mvpp2_write(priv, MVPP22_ISR_RXQ_GROUP_INDEX_REG, val);
+
+		val = qv->first_rxq;
+		val |= qv->nrxqs << MVPP22_ISR_RXQ_SUB_GROUP_SIZE_OFFSET;
+		mvpp2_write(priv, MVPP22_ISR_RXQ_SUB_GROUP_CONFIG_REG, val);
+	}
+}
+
+/* Initialize port HW */
+static int mvpp2_port_init(struct mvpp2_port *port)
+{
+	struct device *dev = port->dev->dev.parent;
+	struct mvpp2 *priv = port->priv;
+	struct mvpp2_txq_pcpu *txq_pcpu;
+	int queue, cpu, err;
+
+	/* Checks for hardware constraints */
+	if (port->first_rxq + port->nrxqs >
+	    MVPP2_MAX_PORTS * priv->max_port_rxqs)
+		return -EINVAL;
+
+	if (port->nrxqs % 4 || (port->nrxqs > priv->max_port_rxqs) ||
+	    (port->ntxqs > MVPP2_MAX_TXQ))
+		return -EINVAL;
+
+	/* Disable port */
+	mvpp2_egress_disable(port);
+	mvpp2_port_disable(port);
+
+	port->tx_time_coal = MVPP2_TXDONE_COAL_USEC;
+
+	port->txqs = devm_kcalloc(dev, port->ntxqs, sizeof(*port->txqs),
+				  GFP_KERNEL);
+	if (!port->txqs)
+		return -ENOMEM;
+
+	/* Associate physical Tx queues to this port and initialize.
+	 * The mapping is predefined.
+	 */
+	for (queue = 0; queue < port->ntxqs; queue++) {
+		int queue_phy_id = mvpp2_txq_phys(port->id, queue);
+		struct mvpp2_tx_queue *txq;
+
+		txq = devm_kzalloc(dev, sizeof(*txq), GFP_KERNEL);
+		if (!txq) {
+			err = -ENOMEM;
+			goto err_free_percpu;
+		}
+
+		txq->pcpu = alloc_percpu(struct mvpp2_txq_pcpu);
+		if (!txq->pcpu) {
+			err = -ENOMEM;
+			goto err_free_percpu;
+		}
+
+		txq->id = queue_phy_id;
+		txq->log_id = queue;
+		txq->done_pkts_coal = MVPP2_TXDONE_COAL_PKTS_THRESH;
+		for_each_present_cpu(cpu) {
+			txq_pcpu = per_cpu_ptr(txq->pcpu, cpu);
+			txq_pcpu->cpu = cpu;
+		}
+
+		port->txqs[queue] = txq;
+	}
+
+	port->rxqs = devm_kcalloc(dev, port->nrxqs, sizeof(*port->rxqs),
+				  GFP_KERNEL);
+	if (!port->rxqs) {
+		err = -ENOMEM;
+		goto err_free_percpu;
+	}
+
+	/* Allocate and initialize Rx queue for this port */
+	for (queue = 0; queue < port->nrxqs; queue++) {
+		struct mvpp2_rx_queue *rxq;
+
+		/* Map physical Rx queue to port's logical Rx queue */
+		rxq = devm_kzalloc(dev, sizeof(*rxq), GFP_KERNEL);
+		if (!rxq) {
+			err = -ENOMEM;
+			goto err_free_percpu;
+		}
+		/* Map this Rx queue to a physical queue */
+		rxq->id = port->first_rxq + queue;
+		rxq->port = port->id;
+		rxq->logic_rxq = queue;
+
+		port->rxqs[queue] = rxq;
+	}
+
+	mvpp2_rx_irqs_setup(port);
+
+	/* Create Rx descriptor rings */
+	for (queue = 0; queue < port->nrxqs; queue++) {
+		struct mvpp2_rx_queue *rxq = port->rxqs[queue];
+
+		rxq->size = port->rx_ring_size;
+		rxq->pkts_coal = MVPP2_RX_COAL_PKTS;
+		rxq->time_coal = MVPP2_RX_COAL_USEC;
+	}
+
+	mvpp2_ingress_disable(port);
+
+	/* Port default configuration */
+	mvpp2_defaults_set(port);
+
+	/* Port's classifier configuration */
+	mvpp2_cls_oversize_rxq_set(port);
+	mvpp2_cls_port_config(port);
+
+	/* Provide an initial Rx packet size */
+	port->pkt_size = MVPP2_RX_PKT_SIZE(port->dev->mtu);
+
+	/* Initialize pools for swf */
+	err = mvpp2_swf_bm_pool_init(port);
+	if (err)
+		goto err_free_percpu;
+
+	return 0;
+
+err_free_percpu:
+	for (queue = 0; queue < port->ntxqs; queue++) {
+		if (!port->txqs[queue])
+			continue;
+		free_percpu(port->txqs[queue]->pcpu);
+	}
+	return err;
+}
+
+/* Checks if the port DT description has the TX interrupts
+ * described. On PPv2.1, there are no such interrupts. On PPv2.2,
+ * there are available, but we need to keep support for old DTs.
+ */
+static bool mvpp2_port_has_tx_irqs(struct mvpp2 *priv,
+				   struct device_node *port_node)
+{
+	char *irqs[5] = { "rx-shared", "tx-cpu0", "tx-cpu1",
+			  "tx-cpu2", "tx-cpu3" };
+	int ret, i;
+
+	if (priv->hw_version == MVPP21)
+		return false;
+
+	for (i = 0; i < 5; i++) {
+		ret = of_property_match_string(port_node, "interrupt-names",
+					       irqs[i]);
+		if (ret < 0)
+			return false;
+	}
+
+	return true;
+}
+
+static void mvpp2_port_copy_mac_addr(struct net_device *dev, struct mvpp2 *priv,
+				     struct fwnode_handle *fwnode,
+				     char **mac_from)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+	char hw_mac_addr[ETH_ALEN] = {0};
+	char fw_mac_addr[ETH_ALEN];
+
+	if (fwnode_get_mac_address(fwnode, fw_mac_addr, ETH_ALEN)) {
+		*mac_from = "firmware node";
+		ether_addr_copy(dev->dev_addr, fw_mac_addr);
+		return;
+	}
+
+	if (priv->hw_version == MVPP21) {
+		mvpp21_get_mac_address(port, hw_mac_addr);
+		if (is_valid_ether_addr(hw_mac_addr)) {
+			*mac_from = "hardware";
+			ether_addr_copy(dev->dev_addr, hw_mac_addr);
+			return;
+		}
+	}
+
+	*mac_from = "random";
+	eth_hw_addr_random(dev);
+}
+
+static void mvpp2_phylink_validate(struct net_device *dev,
+				   unsigned long *supported,
+				   struct phylink_link_state *state)
+{
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
+
+	phylink_set(mask, Autoneg);
+	phylink_set_port_modes(mask);
+	phylink_set(mask, Pause);
+	phylink_set(mask, Asym_Pause);
+
+	switch (state->interface) {
+	case PHY_INTERFACE_MODE_10GKR:
+		phylink_set(mask, 10000baseCR_Full);
+		phylink_set(mask, 10000baseSR_Full);
+		phylink_set(mask, 10000baseLR_Full);
+		phylink_set(mask, 10000baseLRM_Full);
+		phylink_set(mask, 10000baseER_Full);
+		phylink_set(mask, 10000baseKR_Full);
+		/* Fall-through */
+	default:
+		phylink_set(mask, 10baseT_Half);
+		phylink_set(mask, 10baseT_Full);
+		phylink_set(mask, 100baseT_Half);
+		phylink_set(mask, 100baseT_Full);
+		phylink_set(mask, 10000baseT_Full);
+		/* Fall-through */
+	case PHY_INTERFACE_MODE_1000BASEX:
+	case PHY_INTERFACE_MODE_2500BASEX:
+		phylink_set(mask, 1000baseT_Full);
+		phylink_set(mask, 1000baseX_Full);
+		phylink_set(mask, 2500baseX_Full);
+	}
+
+	bitmap_and(supported, supported, mask, __ETHTOOL_LINK_MODE_MASK_NBITS);
+	bitmap_and(state->advertising, state->advertising, mask,
+		   __ETHTOOL_LINK_MODE_MASK_NBITS);
+}
+
+static void mvpp22_xlg_link_state(struct mvpp2_port *port,
+				  struct phylink_link_state *state)
+{
+	u32 val;
+
+	state->speed = SPEED_10000;
+	state->duplex = 1;
+	state->an_complete = 1;
+
+	val = readl(port->base + MVPP22_XLG_STATUS);
+	state->link = !!(val & MVPP22_XLG_STATUS_LINK_UP);
+
+	state->pause = 0;
+	val = readl(port->base + MVPP22_XLG_CTRL0_REG);
+	if (val & MVPP22_XLG_CTRL0_TX_FLOW_CTRL_EN)
+		state->pause |= MLO_PAUSE_TX;
+	if (val & MVPP22_XLG_CTRL0_RX_FLOW_CTRL_EN)
+		state->pause |= MLO_PAUSE_RX;
+}
+
+static void mvpp2_gmac_link_state(struct mvpp2_port *port,
+				  struct phylink_link_state *state)
+{
+	u32 val;
+
+	val = readl(port->base + MVPP2_GMAC_STATUS0);
+
+	state->an_complete = !!(val & MVPP2_GMAC_STATUS0_AN_COMPLETE);
+	state->link = !!(val & MVPP2_GMAC_STATUS0_LINK_UP);
+	state->duplex = !!(val & MVPP2_GMAC_STATUS0_FULL_DUPLEX);
+
+	switch (port->phy_interface) {
+	case PHY_INTERFACE_MODE_1000BASEX:
+		state->speed = SPEED_1000;
+		break;
+	case PHY_INTERFACE_MODE_2500BASEX:
+		state->speed = SPEED_2500;
+		break;
+	default:
+		if (val & MVPP2_GMAC_STATUS0_GMII_SPEED)
+			state->speed = SPEED_1000;
+		else if (val & MVPP2_GMAC_STATUS0_MII_SPEED)
+			state->speed = SPEED_100;
+		else
+			state->speed = SPEED_10;
+	}
+
+	state->pause = 0;
+	if (val & MVPP2_GMAC_STATUS0_RX_PAUSE)
+		state->pause |= MLO_PAUSE_RX;
+	if (val & MVPP2_GMAC_STATUS0_TX_PAUSE)
+		state->pause |= MLO_PAUSE_TX;
+}
+
+static int mvpp2_phylink_mac_link_state(struct net_device *dev,
+					struct phylink_link_state *state)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+
+	if (port->priv->hw_version == MVPP22 && port->gop_id == 0) {
+		u32 mode = readl(port->base + MVPP22_XLG_CTRL3_REG);
+		mode &= MVPP22_XLG_CTRL3_MACMODESELECT_MASK;
+
+		if (mode == MVPP22_XLG_CTRL3_MACMODESELECT_10G) {
+			mvpp22_xlg_link_state(port, state);
+			return 1;
+		}
+	}
+
+	mvpp2_gmac_link_state(port, state);
+	return 1;
+}
+
+static void mvpp2_mac_an_restart(struct net_device *dev)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+	u32 val;
+
+	if (port->phy_interface != PHY_INTERFACE_MODE_SGMII)
+		return;
+
+	val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+	/* The RESTART_AN bit is cleared by the h/w after restarting the AN
+	 * process.
+	 */
+	val |= MVPP2_GMAC_IN_BAND_RESTART_AN | MVPP2_GMAC_IN_BAND_AUTONEG;
+	writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+}
+
+static void mvpp2_xlg_config(struct mvpp2_port *port, unsigned int mode,
+			     const struct phylink_link_state *state)
+{
+	u32 ctrl0, ctrl4;
+
+	ctrl0 = readl(port->base + MVPP22_XLG_CTRL0_REG);
+	ctrl4 = readl(port->base + MVPP22_XLG_CTRL4_REG);
+
+	if (state->pause & MLO_PAUSE_TX)
+		ctrl0 |= MVPP22_XLG_CTRL0_TX_FLOW_CTRL_EN;
+	if (state->pause & MLO_PAUSE_RX)
+		ctrl0 |= MVPP22_XLG_CTRL0_RX_FLOW_CTRL_EN;
+
+	ctrl4 &= ~MVPP22_XLG_CTRL4_MACMODSELECT_GMAC;
+	ctrl4 |= MVPP22_XLG_CTRL4_FWD_FC | MVPP22_XLG_CTRL4_FWD_PFC |
+		 MVPP22_XLG_CTRL4_EN_IDLE_CHECK;
+
+	writel(ctrl0, port->base + MVPP22_XLG_CTRL0_REG);
+	writel(ctrl4, port->base + MVPP22_XLG_CTRL4_REG);
+}
+
+static void mvpp2_gmac_config(struct mvpp2_port *port, unsigned int mode,
+			      const struct phylink_link_state *state)
+{
+	u32 an, ctrl0, ctrl2, ctrl4;
+
+	an = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+	ctrl0 = readl(port->base + MVPP2_GMAC_CTRL_0_REG);
+	ctrl2 = readl(port->base + MVPP2_GMAC_CTRL_2_REG);
+	ctrl4 = readl(port->base + MVPP22_GMAC_CTRL_4_REG);
+
+	/* Force link down */
+	an &= ~MVPP2_GMAC_FORCE_LINK_PASS;
+	an |= MVPP2_GMAC_FORCE_LINK_DOWN;
+	writel(an, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+
+	/* Set the GMAC in a reset state */
+	ctrl2 |= MVPP2_GMAC_PORT_RESET_MASK;
+	writel(ctrl2, port->base + MVPP2_GMAC_CTRL_2_REG);
+
+	an &= ~(MVPP2_GMAC_CONFIG_MII_SPEED | MVPP2_GMAC_CONFIG_GMII_SPEED |
+		MVPP2_GMAC_AN_SPEED_EN | MVPP2_GMAC_FC_ADV_EN |
+		MVPP2_GMAC_FC_ADV_ASM_EN | MVPP2_GMAC_FLOW_CTRL_AUTONEG |
+		MVPP2_GMAC_CONFIG_FULL_DUPLEX | MVPP2_GMAC_AN_DUPLEX_EN |
+		MVPP2_GMAC_FORCE_LINK_DOWN);
+	ctrl0 &= ~MVPP2_GMAC_PORT_TYPE_MASK;
+	ctrl2 &= ~(MVPP2_GMAC_PORT_RESET_MASK | MVPP2_GMAC_PCS_ENABLE_MASK);
+
+	if (state->interface == PHY_INTERFACE_MODE_1000BASEX ||
+	    state->interface == PHY_INTERFACE_MODE_2500BASEX) {
+		/* 1000BaseX and 2500BaseX ports cannot negotiate speed nor can
+		 * they negotiate duplex: they are always operating with a fixed
+		 * speed of 1000/2500Mbps in full duplex, so force 1000/2500
+		 * speed and full duplex here.
+		 */
+		ctrl0 |= MVPP2_GMAC_PORT_TYPE_MASK;
+		an |= MVPP2_GMAC_CONFIG_GMII_SPEED |
+		      MVPP2_GMAC_CONFIG_FULL_DUPLEX;
+	} else if (!phy_interface_mode_is_rgmii(state->interface)) {
+		an |= MVPP2_GMAC_AN_SPEED_EN | MVPP2_GMAC_FLOW_CTRL_AUTONEG;
+	}
+
+	if (state->duplex)
+		an |= MVPP2_GMAC_CONFIG_FULL_DUPLEX;
+	if (phylink_test(state->advertising, Pause))
+		an |= MVPP2_GMAC_FC_ADV_EN;
+	if (phylink_test(state->advertising, Asym_Pause))
+		an |= MVPP2_GMAC_FC_ADV_ASM_EN;
+
+	if (state->interface == PHY_INTERFACE_MODE_SGMII ||
+	    state->interface == PHY_INTERFACE_MODE_1000BASEX ||
+	    state->interface == PHY_INTERFACE_MODE_2500BASEX) {
+		an |= MVPP2_GMAC_IN_BAND_AUTONEG;
+		ctrl2 |= MVPP2_GMAC_INBAND_AN_MASK | MVPP2_GMAC_PCS_ENABLE_MASK;
+
+		ctrl4 &= ~(MVPP22_CTRL4_EXT_PIN_GMII_SEL |
+			   MVPP22_CTRL4_RX_FC_EN | MVPP22_CTRL4_TX_FC_EN);
+		ctrl4 |= MVPP22_CTRL4_SYNC_BYPASS_DIS |
+			 MVPP22_CTRL4_DP_CLK_SEL |
+			 MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE;
+
+		if (state->pause & MLO_PAUSE_TX)
+			ctrl4 |= MVPP22_CTRL4_TX_FC_EN;
+		if (state->pause & MLO_PAUSE_RX)
+			ctrl4 |= MVPP22_CTRL4_RX_FC_EN;
+	} else if (phy_interface_mode_is_rgmii(state->interface)) {
+		an |= MVPP2_GMAC_IN_BAND_AUTONEG_BYPASS;
+
+		if (state->speed == SPEED_1000)
+			an |= MVPP2_GMAC_CONFIG_GMII_SPEED;
+		else if (state->speed == SPEED_100)
+			an |= MVPP2_GMAC_CONFIG_MII_SPEED;
+
+		ctrl4 &= ~MVPP22_CTRL4_DP_CLK_SEL;
+		ctrl4 |= MVPP22_CTRL4_EXT_PIN_GMII_SEL |
+			 MVPP22_CTRL4_SYNC_BYPASS_DIS |
+			 MVPP22_CTRL4_QSGMII_BYPASS_ACTIVE;
+	}
+
+	writel(ctrl0, port->base + MVPP2_GMAC_CTRL_0_REG);
+	writel(ctrl2, port->base + MVPP2_GMAC_CTRL_2_REG);
+	writel(ctrl4, port->base + MVPP22_GMAC_CTRL_4_REG);
+	writel(an, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+}
+
+static void mvpp2_mac_config(struct net_device *dev, unsigned int mode,
+			     const struct phylink_link_state *state)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+
+	/* Check for invalid configuration */
+	if (state->interface == PHY_INTERFACE_MODE_10GKR && port->gop_id != 0) {
+		netdev_err(dev, "Invalid mode on %s\n", dev->name);
+		return;
+	}
+
+	netif_tx_stop_all_queues(port->dev);
+	if (!port->has_phy)
+		netif_carrier_off(port->dev);
+
+	/* Make sure the port is disabled when reconfiguring the mode */
+	mvpp2_port_disable(port);
+
+	if (port->priv->hw_version == MVPP22 &&
+	    port->phy_interface != state->interface) {
+		port->phy_interface = state->interface;
+
+		/* Reconfigure the serdes lanes */
+		phy_power_off(port->comphy);
+		mvpp22_mode_reconfigure(port);
+	}
+
+	/* mac (re)configuration */
+	if (state->interface == PHY_INTERFACE_MODE_10GKR)
+		mvpp2_xlg_config(port, mode, state);
+	else if (phy_interface_mode_is_rgmii(state->interface) ||
+		 state->interface == PHY_INTERFACE_MODE_SGMII ||
+		 state->interface == PHY_INTERFACE_MODE_1000BASEX ||
+		 state->interface == PHY_INTERFACE_MODE_2500BASEX)
+		mvpp2_gmac_config(port, mode, state);
+
+	if (port->priv->hw_version == MVPP21 && port->flags & MVPP2_F_LOOPBACK)
+		mvpp2_port_loopback_set(port, state);
+
+	/* If the port already was up, make sure it's still in the same state */
+	if (state->link || !port->has_phy) {
+		mvpp2_port_enable(port);
+
+		mvpp2_egress_enable(port);
+		mvpp2_ingress_enable(port);
+		if (!port->has_phy)
+			netif_carrier_on(dev);
+		netif_tx_wake_all_queues(dev);
+	}
+}
+
+static void mvpp2_mac_link_up(struct net_device *dev, unsigned int mode,
+			      phy_interface_t interface, struct phy_device *phy)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+	u32 val;
+
+	if (!phylink_autoneg_inband(mode) &&
+	    interface != PHY_INTERFACE_MODE_10GKR) {
+		val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+		val &= ~MVPP2_GMAC_FORCE_LINK_DOWN;
+		if (phy_interface_mode_is_rgmii(interface))
+			val |= MVPP2_GMAC_FORCE_LINK_PASS;
+		writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+	}
+
+	mvpp2_port_enable(port);
+
+	mvpp2_egress_enable(port);
+	mvpp2_ingress_enable(port);
+	netif_tx_wake_all_queues(dev);
+}
+
+static void mvpp2_mac_link_down(struct net_device *dev, unsigned int mode,
+				phy_interface_t interface)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+	u32 val;
+
+	if (!phylink_autoneg_inband(mode) &&
+	    interface != PHY_INTERFACE_MODE_10GKR) {
+		val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+		val &= ~MVPP2_GMAC_FORCE_LINK_PASS;
+		val |= MVPP2_GMAC_FORCE_LINK_DOWN;
+		writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG);
+	}
+
+	netif_tx_stop_all_queues(dev);
+	mvpp2_egress_disable(port);
+	mvpp2_ingress_disable(port);
+
+	/* When using link interrupts to notify phylink of a MAC state change,
+	 * we do not want the port to be disabled (we want to receive further
+	 * interrupts, to be notified when the port will have a link later).
+	 */
+	if (!port->has_phy)
+		return;
+
+	mvpp2_port_disable(port);
+}
+
+static const struct phylink_mac_ops mvpp2_phylink_ops = {
+	.validate = mvpp2_phylink_validate,
+	.mac_link_state = mvpp2_phylink_mac_link_state,
+	.mac_an_restart = mvpp2_mac_an_restart,
+	.mac_config = mvpp2_mac_config,
+	.mac_link_up = mvpp2_mac_link_up,
+	.mac_link_down = mvpp2_mac_link_down,
+};
+
+/* Ports initialization */
+static int mvpp2_port_probe(struct platform_device *pdev,
+			    struct fwnode_handle *port_fwnode,
+			    struct mvpp2 *priv)
+{
+	struct phy *comphy = NULL;
+	struct mvpp2_port *port;
+	struct mvpp2_port_pcpu *port_pcpu;
+	struct device_node *port_node = to_of_node(port_fwnode);
+	struct net_device *dev;
+	struct resource *res;
+	struct phylink *phylink;
+	char *mac_from = "";
+	unsigned int ntxqs, nrxqs;
+	bool has_tx_irqs;
+	u32 id;
+	int features;
+	int phy_mode;
+	int err, i, cpu;
+
+	if (port_node) {
+		has_tx_irqs = mvpp2_port_has_tx_irqs(priv, port_node);
+	} else {
+		has_tx_irqs = true;
+		queue_mode = MVPP2_QDIST_MULTI_MODE;
+	}
+
+	if (!has_tx_irqs)
+		queue_mode = MVPP2_QDIST_SINGLE_MODE;
+
+	ntxqs = MVPP2_MAX_TXQ;
+	if (priv->hw_version == MVPP22 && queue_mode == MVPP2_QDIST_MULTI_MODE)
+		nrxqs = MVPP2_DEFAULT_RXQ * num_possible_cpus();
+	else
+		nrxqs = MVPP2_DEFAULT_RXQ;
+
+	dev = alloc_etherdev_mqs(sizeof(*port), ntxqs, nrxqs);
+	if (!dev)
+		return -ENOMEM;
+
+	phy_mode = fwnode_get_phy_mode(port_fwnode);
+	if (phy_mode < 0) {
+		dev_err(&pdev->dev, "incorrect phy mode\n");
+		err = phy_mode;
+		goto err_free_netdev;
+	}
+
+	if (port_node) {
+		comphy = devm_of_phy_get(&pdev->dev, port_node, NULL);
+		if (IS_ERR(comphy)) {
+			if (PTR_ERR(comphy) == -EPROBE_DEFER) {
+				err = -EPROBE_DEFER;
+				goto err_free_netdev;
+			}
+			comphy = NULL;
+		}
+	}
+
+	if (fwnode_property_read_u32(port_fwnode, "port-id", &id)) {
+		err = -EINVAL;
+		dev_err(&pdev->dev, "missing port-id value\n");
+		goto err_free_netdev;
+	}
+
+	dev->tx_queue_len = MVPP2_MAX_TXD_MAX;
+	dev->watchdog_timeo = 5 * HZ;
+	dev->netdev_ops = &mvpp2_netdev_ops;
+	dev->ethtool_ops = &mvpp2_eth_tool_ops;
+
+	port = netdev_priv(dev);
+	port->dev = dev;
+	port->fwnode = port_fwnode;
+	port->has_phy = !!of_find_property(port_node, "phy", NULL);
+	port->ntxqs = ntxqs;
+	port->nrxqs = nrxqs;
+	port->priv = priv;
+	port->has_tx_irqs = has_tx_irqs;
+
+	err = mvpp2_queue_vectors_init(port, port_node);
+	if (err)
+		goto err_free_netdev;
+
+	if (port_node)
+		port->link_irq = of_irq_get_byname(port_node, "link");
+	else
+		port->link_irq = fwnode_irq_get(port_fwnode, port->nqvecs + 1);
+	if (port->link_irq == -EPROBE_DEFER) {
+		err = -EPROBE_DEFER;
+		goto err_deinit_qvecs;
+	}
+	if (port->link_irq <= 0)
+		/* the link irq is optional */
+		port->link_irq = 0;
+
+	if (fwnode_property_read_bool(port_fwnode, "marvell,loopback"))
+		port->flags |= MVPP2_F_LOOPBACK;
+
+	port->id = id;
+	if (priv->hw_version == MVPP21)
+		port->first_rxq = port->id * port->nrxqs;
+	else
+		port->first_rxq = port->id * priv->max_port_rxqs;
+
+	port->of_node = port_node;
+	port->phy_interface = phy_mode;
+	port->comphy = comphy;
+
+	if (priv->hw_version == MVPP21) {
+		res = platform_get_resource(pdev, IORESOURCE_MEM, 2 + id);
+		port->base = devm_ioremap_resource(&pdev->dev, res);
+		if (IS_ERR(port->base)) {
+			err = PTR_ERR(port->base);
+			goto err_free_irq;
+		}
+
+		port->stats_base = port->priv->lms_base +
+				   MVPP21_MIB_COUNTERS_OFFSET +
+				   port->gop_id * MVPP21_MIB_COUNTERS_PORT_SZ;
+	} else {
+		if (fwnode_property_read_u32(port_fwnode, "gop-port-id",
+					     &port->gop_id)) {
+			err = -EINVAL;
+			dev_err(&pdev->dev, "missing gop-port-id value\n");
+			goto err_deinit_qvecs;
+		}
+
+		port->base = priv->iface_base + MVPP22_GMAC_BASE(port->gop_id);
+		port->stats_base = port->priv->iface_base +
+				   MVPP22_MIB_COUNTERS_OFFSET +
+				   port->gop_id * MVPP22_MIB_COUNTERS_PORT_SZ;
+	}
+
+	/* Alloc per-cpu and ethtool stats */
+	port->stats = netdev_alloc_pcpu_stats(struct mvpp2_pcpu_stats);
+	if (!port->stats) {
+		err = -ENOMEM;
+		goto err_free_irq;
+	}
+
+	port->ethtool_stats = devm_kcalloc(&pdev->dev,
+					   ARRAY_SIZE(mvpp2_ethtool_regs),
+					   sizeof(u64), GFP_KERNEL);
+	if (!port->ethtool_stats) {
+		err = -ENOMEM;
+		goto err_free_stats;
+	}
+
+	mutex_init(&port->gather_stats_lock);
+	INIT_DELAYED_WORK(&port->stats_work, mvpp2_gather_hw_statistics);
+
+	mvpp2_port_copy_mac_addr(dev, priv, port_fwnode, &mac_from);
+
+	port->tx_ring_size = MVPP2_MAX_TXD_DFLT;
+	port->rx_ring_size = MVPP2_MAX_RXD_DFLT;
+	SET_NETDEV_DEV(dev, &pdev->dev);
+
+	err = mvpp2_port_init(port);
+	if (err < 0) {
+		dev_err(&pdev->dev, "failed to init port %d\n", id);
+		goto err_free_stats;
+	}
+
+	mvpp2_port_periodic_xon_disable(port);
+
+	mvpp2_port_reset(port);
+
+	port->pcpu = alloc_percpu(struct mvpp2_port_pcpu);
+	if (!port->pcpu) {
+		err = -ENOMEM;
+		goto err_free_txq_pcpu;
+	}
+
+	if (!port->has_tx_irqs) {
+		for_each_present_cpu(cpu) {
+			port_pcpu = per_cpu_ptr(port->pcpu, cpu);
+
+			hrtimer_init(&port_pcpu->tx_done_timer, CLOCK_MONOTONIC,
+				     HRTIMER_MODE_REL_PINNED);
+			port_pcpu->tx_done_timer.function = mvpp2_hr_timer_cb;
+			port_pcpu->timer_scheduled = false;
+
+			tasklet_init(&port_pcpu->tx_done_tasklet,
+				     mvpp2_tx_proc_cb,
+				     (unsigned long)dev);
+		}
+	}
+
+	features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
+		   NETIF_F_TSO;
+	dev->features = features | NETIF_F_RXCSUM;
+	dev->hw_features |= features | NETIF_F_RXCSUM | NETIF_F_GRO |
+			    NETIF_F_HW_VLAN_CTAG_FILTER;
+
+	if (port->pool_long->id == MVPP2_BM_JUMBO && port->id != 0) {
+		dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
+		dev->hw_features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
+	}
+
+	dev->vlan_features |= features;
+	dev->gso_max_segs = MVPP2_MAX_TSO_SEGS;
+	dev->priv_flags |= IFF_UNICAST_FLT;
+
+	/* MTU range: 68 - 9704 */
+	dev->min_mtu = ETH_MIN_MTU;
+	/* 9704 == 9728 - 20 and rounding to 8 */
+	dev->max_mtu = MVPP2_BM_JUMBO_PKT_SIZE;
+
+	/* Phylink isn't used w/ ACPI as of now */
+	if (port_node) {
+		phylink = phylink_create(dev, port_fwnode, phy_mode,
+					 &mvpp2_phylink_ops);
+		if (IS_ERR(phylink)) {
+			err = PTR_ERR(phylink);
+			goto err_free_port_pcpu;
+		}
+		port->phylink = phylink;
+	} else {
+		port->phylink = NULL;
+	}
+
+	err = register_netdev(dev);
+	if (err < 0) {
+		dev_err(&pdev->dev, "failed to register netdev\n");
+		goto err_phylink;
+	}
+	netdev_info(dev, "Using %s mac address %pM\n", mac_from, dev->dev_addr);
+
+	priv->port_list[priv->port_count++] = port;
+
+	return 0;
+
+err_phylink:
+	if (port->phylink)
+		phylink_destroy(port->phylink);
+err_free_port_pcpu:
+	free_percpu(port->pcpu);
+err_free_txq_pcpu:
+	for (i = 0; i < port->ntxqs; i++)
+		free_percpu(port->txqs[i]->pcpu);
+err_free_stats:
+	free_percpu(port->stats);
+err_free_irq:
+	if (port->link_irq)
+		irq_dispose_mapping(port->link_irq);
+err_deinit_qvecs:
+	mvpp2_queue_vectors_deinit(port);
+err_free_netdev:
+	free_netdev(dev);
+	return err;
+}
+
+/* Ports removal routine */
+static void mvpp2_port_remove(struct mvpp2_port *port)
+{
+	int i;
+
+	unregister_netdev(port->dev);
+	if (port->phylink)
+		phylink_destroy(port->phylink);
+	free_percpu(port->pcpu);
+	free_percpu(port->stats);
+	for (i = 0; i < port->ntxqs; i++)
+		free_percpu(port->txqs[i]->pcpu);
+	mvpp2_queue_vectors_deinit(port);
+	if (port->link_irq)
+		irq_dispose_mapping(port->link_irq);
+	free_netdev(port->dev);
+}
+
+/* Initialize decoding windows */
+static void mvpp2_conf_mbus_windows(const struct mbus_dram_target_info *dram,
+				    struct mvpp2 *priv)
+{
+	u32 win_enable;
+	int i;
+
+	for (i = 0; i < 6; i++) {
+		mvpp2_write(priv, MVPP2_WIN_BASE(i), 0);
+		mvpp2_write(priv, MVPP2_WIN_SIZE(i), 0);
+
+		if (i < 4)
+			mvpp2_write(priv, MVPP2_WIN_REMAP(i), 0);
+	}
+
+	win_enable = 0;
+
+	for (i = 0; i < dram->num_cs; i++) {
+		const struct mbus_dram_window *cs = dram->cs + i;
+
+		mvpp2_write(priv, MVPP2_WIN_BASE(i),
+			    (cs->base & 0xffff0000) | (cs->mbus_attr << 8) |
+			    dram->mbus_dram_target_id);
+
+		mvpp2_write(priv, MVPP2_WIN_SIZE(i),
+			    (cs->size - 1) & 0xffff0000);
+
+		win_enable |= (1 << i);
+	}
+
+	mvpp2_write(priv, MVPP2_BASE_ADDR_ENABLE, win_enable);
+}
+
+/* Initialize Rx FIFO's */
+static void mvpp2_rx_fifo_init(struct mvpp2 *priv)
+{
+	int port;
+
+	for (port = 0; port < MVPP2_MAX_PORTS; port++) {
+		mvpp2_write(priv, MVPP2_RX_DATA_FIFO_SIZE_REG(port),
+			    MVPP2_RX_FIFO_PORT_DATA_SIZE_4KB);
+		mvpp2_write(priv, MVPP2_RX_ATTR_FIFO_SIZE_REG(port),
+			    MVPP2_RX_FIFO_PORT_ATTR_SIZE_4KB);
+	}
+
+	mvpp2_write(priv, MVPP2_RX_MIN_PKT_SIZE_REG,
+		    MVPP2_RX_FIFO_PORT_MIN_PKT);
+	mvpp2_write(priv, MVPP2_RX_FIFO_INIT_REG, 0x1);
+}
+
+static void mvpp22_rx_fifo_init(struct mvpp2 *priv)
+{
+	int port;
+
+	/* The FIFO size parameters are set depending on the maximum speed a
+	 * given port can handle:
+	 * - Port 0: 10Gbps
+	 * - Port 1: 2.5Gbps
+	 * - Ports 2 and 3: 1Gbps
+	 */
+
+	mvpp2_write(priv, MVPP2_RX_DATA_FIFO_SIZE_REG(0),
+		    MVPP2_RX_FIFO_PORT_DATA_SIZE_32KB);
+	mvpp2_write(priv, MVPP2_RX_ATTR_FIFO_SIZE_REG(0),
+		    MVPP2_RX_FIFO_PORT_ATTR_SIZE_32KB);
+
+	mvpp2_write(priv, MVPP2_RX_DATA_FIFO_SIZE_REG(1),
+		    MVPP2_RX_FIFO_PORT_DATA_SIZE_8KB);
+	mvpp2_write(priv, MVPP2_RX_ATTR_FIFO_SIZE_REG(1),
+		    MVPP2_RX_FIFO_PORT_ATTR_SIZE_8KB);
+
+	for (port = 2; port < MVPP2_MAX_PORTS; port++) {
+		mvpp2_write(priv, MVPP2_RX_DATA_FIFO_SIZE_REG(port),
+			    MVPP2_RX_FIFO_PORT_DATA_SIZE_4KB);
+		mvpp2_write(priv, MVPP2_RX_ATTR_FIFO_SIZE_REG(port),
+			    MVPP2_RX_FIFO_PORT_ATTR_SIZE_4KB);
+	}
+
+	mvpp2_write(priv, MVPP2_RX_MIN_PKT_SIZE_REG,
+		    MVPP2_RX_FIFO_PORT_MIN_PKT);
+	mvpp2_write(priv, MVPP2_RX_FIFO_INIT_REG, 0x1);
+}
+
+/* Initialize Tx FIFO's: the total FIFO size is 19kB on PPv2.2 and 10G
+ * interfaces must have a Tx FIFO size of 10kB. As only port 0 can do 10G,
+ * configure its Tx FIFO size to 10kB and the others ports Tx FIFO size to 3kB.
+ */
+static void mvpp22_tx_fifo_init(struct mvpp2 *priv)
+{
+	int port, size, thrs;
+
+	for (port = 0; port < MVPP2_MAX_PORTS; port++) {
+		if (port == 0) {
+			size = MVPP22_TX_FIFO_DATA_SIZE_10KB;
+			thrs = MVPP2_TX_FIFO_THRESHOLD_10KB;
+		} else {
+			size = MVPP22_TX_FIFO_DATA_SIZE_3KB;
+			thrs = MVPP2_TX_FIFO_THRESHOLD_3KB;
+		}
+		mvpp2_write(priv, MVPP22_TX_FIFO_SIZE_REG(port), size);
+		mvpp2_write(priv, MVPP22_TX_FIFO_THRESH_REG(port), thrs);
+	}
+}
+
+static void mvpp2_axi_init(struct mvpp2 *priv)
+{
+	u32 val, rdval, wrval;
+
+	mvpp2_write(priv, MVPP22_BM_ADDR_HIGH_RLS_REG, 0x0);
+
+	/* AXI Bridge Configuration */
+
+	rdval = MVPP22_AXI_CODE_CACHE_RD_CACHE
+		<< MVPP22_AXI_ATTR_CACHE_OFFS;
+	rdval |= MVPP22_AXI_CODE_DOMAIN_OUTER_DOM
+		<< MVPP22_AXI_ATTR_DOMAIN_OFFS;
+
+	wrval = MVPP22_AXI_CODE_CACHE_WR_CACHE
+		<< MVPP22_AXI_ATTR_CACHE_OFFS;
+	wrval |= MVPP22_AXI_CODE_DOMAIN_OUTER_DOM
+		<< MVPP22_AXI_ATTR_DOMAIN_OFFS;
+
+	/* BM */
+	mvpp2_write(priv, MVPP22_AXI_BM_WR_ATTR_REG, wrval);
+	mvpp2_write(priv, MVPP22_AXI_BM_RD_ATTR_REG, rdval);
+
+	/* Descriptors */
+	mvpp2_write(priv, MVPP22_AXI_AGGRQ_DESCR_RD_ATTR_REG, rdval);
+	mvpp2_write(priv, MVPP22_AXI_TXQ_DESCR_WR_ATTR_REG, wrval);
+	mvpp2_write(priv, MVPP22_AXI_TXQ_DESCR_RD_ATTR_REG, rdval);
+	mvpp2_write(priv, MVPP22_AXI_RXQ_DESCR_WR_ATTR_REG, wrval);
+
+	/* Buffer Data */
+	mvpp2_write(priv, MVPP22_AXI_TX_DATA_RD_ATTR_REG, rdval);
+	mvpp2_write(priv, MVPP22_AXI_RX_DATA_WR_ATTR_REG, wrval);
+
+	val = MVPP22_AXI_CODE_CACHE_NON_CACHE
+		<< MVPP22_AXI_CODE_CACHE_OFFS;
+	val |= MVPP22_AXI_CODE_DOMAIN_SYSTEM
+		<< MVPP22_AXI_CODE_DOMAIN_OFFS;
+	mvpp2_write(priv, MVPP22_AXI_RD_NORMAL_CODE_REG, val);
+	mvpp2_write(priv, MVPP22_AXI_WR_NORMAL_CODE_REG, val);
+
+	val = MVPP22_AXI_CODE_CACHE_RD_CACHE
+		<< MVPP22_AXI_CODE_CACHE_OFFS;
+	val |= MVPP22_AXI_CODE_DOMAIN_OUTER_DOM
+		<< MVPP22_AXI_CODE_DOMAIN_OFFS;
+
+	mvpp2_write(priv, MVPP22_AXI_RD_SNOOP_CODE_REG, val);
+
+	val = MVPP22_AXI_CODE_CACHE_WR_CACHE
+		<< MVPP22_AXI_CODE_CACHE_OFFS;
+	val |= MVPP22_AXI_CODE_DOMAIN_OUTER_DOM
+		<< MVPP22_AXI_CODE_DOMAIN_OFFS;
+
+	mvpp2_write(priv, MVPP22_AXI_WR_SNOOP_CODE_REG, val);
+}
+
+/* Initialize network controller common part HW */
+static int mvpp2_init(struct platform_device *pdev, struct mvpp2 *priv)
+{
+	const struct mbus_dram_target_info *dram_target_info;
+	int err, i;
+	u32 val;
+
+	/* MBUS windows configuration */
+	dram_target_info = mv_mbus_dram_info();
+	if (dram_target_info)
+		mvpp2_conf_mbus_windows(dram_target_info, priv);
+
+	if (priv->hw_version == MVPP22)
+		mvpp2_axi_init(priv);
+
+	/* Disable HW PHY polling */
+	if (priv->hw_version == MVPP21) {
+		val = readl(priv->lms_base + MVPP2_PHY_AN_CFG0_REG);
+		val |= MVPP2_PHY_AN_STOP_SMI0_MASK;
+		writel(val, priv->lms_base + MVPP2_PHY_AN_CFG0_REG);
+	} else {
+		val = readl(priv->iface_base + MVPP22_SMI_MISC_CFG_REG);
+		val &= ~MVPP22_SMI_POLLING_EN;
+		writel(val, priv->iface_base + MVPP22_SMI_MISC_CFG_REG);
+	}
+
+	/* Allocate and initialize aggregated TXQs */
+	priv->aggr_txqs = devm_kcalloc(&pdev->dev, num_present_cpus(),
+				       sizeof(*priv->aggr_txqs),
+				       GFP_KERNEL);
+	if (!priv->aggr_txqs)
+		return -ENOMEM;
+
+	for_each_present_cpu(i) {
+		priv->aggr_txqs[i].id = i;
+		priv->aggr_txqs[i].size = MVPP2_AGGR_TXQ_SIZE;
+		err = mvpp2_aggr_txq_init(pdev, &priv->aggr_txqs[i], i, priv);
+		if (err < 0)
+			return err;
+	}
+
+	/* Fifo Init */
+	if (priv->hw_version == MVPP21) {
+		mvpp2_rx_fifo_init(priv);
+	} else {
+		mvpp22_rx_fifo_init(priv);
+		mvpp22_tx_fifo_init(priv);
+	}
+
+	if (priv->hw_version == MVPP21)
+		writel(MVPP2_EXT_GLOBAL_CTRL_DEFAULT,
+		       priv->lms_base + MVPP2_MNG_EXTENDED_GLOBAL_CTRL_REG);
+
+	/* Allow cache snoop when transmiting packets */
+	mvpp2_write(priv, MVPP2_TX_SNOOP_REG, 0x1);
+
+	/* Buffer Manager initialization */
+	err = mvpp2_bm_init(pdev, priv);
+	if (err < 0)
+		return err;
+
+	/* Parser default initialization */
+	err = mvpp2_prs_default_init(pdev, priv);
+	if (err < 0)
+		return err;
+
+	/* Classifier default initialization */
+	mvpp2_cls_init(priv);
+
+	return 0;
+}
+
+static int mvpp2_probe(struct platform_device *pdev)
+{
+	const struct acpi_device_id *acpi_id;
+	struct fwnode_handle *fwnode = pdev->dev.fwnode;
+	struct fwnode_handle *port_fwnode;
+	struct mvpp2 *priv;
+	struct resource *res;
+	void __iomem *base;
+	int i;
+	int err;
+
+	priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	if (has_acpi_companion(&pdev->dev)) {
+		acpi_id = acpi_match_device(pdev->dev.driver->acpi_match_table,
+					    &pdev->dev);
+		priv->hw_version = (unsigned long)acpi_id->driver_data;
+	} else {
+		priv->hw_version =
+			(unsigned long)of_device_get_match_data(&pdev->dev);
+	}
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	base = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(base))
+		return PTR_ERR(base);
+
+	if (priv->hw_version == MVPP21) {
+		res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+		priv->lms_base = devm_ioremap_resource(&pdev->dev, res);
+		if (IS_ERR(priv->lms_base))
+			return PTR_ERR(priv->lms_base);
+	} else {
+		res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+		if (has_acpi_companion(&pdev->dev)) {
+			/* In case the MDIO memory region is declared in
+			 * the ACPI, it can already appear as 'in-use'
+			 * in the OS. Because it is overlapped by second
+			 * region of the network controller, make
+			 * sure it is released, before requesting it again.
+			 * The care is taken by mvpp2 driver to avoid
+			 * concurrent access to this memory region.
+			 */
+			release_resource(res);
+		}
+		priv->iface_base = devm_ioremap_resource(&pdev->dev, res);
+		if (IS_ERR(priv->iface_base))
+			return PTR_ERR(priv->iface_base);
+	}
+
+	if (priv->hw_version == MVPP22 && dev_of_node(&pdev->dev)) {
+		priv->sysctrl_base =
+			syscon_regmap_lookup_by_phandle(pdev->dev.of_node,
+							"marvell,system-controller");
+		if (IS_ERR(priv->sysctrl_base))
+			/* The system controller regmap is optional for dt
+			 * compatibility reasons. When not provided, the
+			 * configuration of the GoP relies on the
+			 * firmware/bootloader.
+			 */
+			priv->sysctrl_base = NULL;
+	}
+
+	mvpp2_setup_bm_pool();
+
+	for (i = 0; i < MVPP2_MAX_THREADS; i++) {
+		u32 addr_space_sz;
+
+		addr_space_sz = (priv->hw_version == MVPP21 ?
+				 MVPP21_ADDR_SPACE_SZ : MVPP22_ADDR_SPACE_SZ);
+		priv->swth_base[i] = base + i * addr_space_sz;
+	}
+
+	if (priv->hw_version == MVPP21)
+		priv->max_port_rxqs = 8;
+	else
+		priv->max_port_rxqs = 32;
+
+	if (dev_of_node(&pdev->dev)) {
+		priv->pp_clk = devm_clk_get(&pdev->dev, "pp_clk");
+		if (IS_ERR(priv->pp_clk))
+			return PTR_ERR(priv->pp_clk);
+		err = clk_prepare_enable(priv->pp_clk);
+		if (err < 0)
+			return err;
+
+		priv->gop_clk = devm_clk_get(&pdev->dev, "gop_clk");
+		if (IS_ERR(priv->gop_clk)) {
+			err = PTR_ERR(priv->gop_clk);
+			goto err_pp_clk;
+		}
+		err = clk_prepare_enable(priv->gop_clk);
+		if (err < 0)
+			goto err_pp_clk;
+
+		if (priv->hw_version == MVPP22) {
+			priv->mg_clk = devm_clk_get(&pdev->dev, "mg_clk");
+			if (IS_ERR(priv->mg_clk)) {
+				err = PTR_ERR(priv->mg_clk);
+				goto err_gop_clk;
+			}
+
+			err = clk_prepare_enable(priv->mg_clk);
+			if (err < 0)
+				goto err_gop_clk;
+
+			priv->mg_core_clk = devm_clk_get(&pdev->dev, "mg_core_clk");
+			if (IS_ERR(priv->mg_core_clk)) {
+				priv->mg_core_clk = NULL;
+			} else {
+				err = clk_prepare_enable(priv->mg_core_clk);
+				if (err < 0)
+					goto err_mg_clk;
+			}
+		}
+
+		priv->axi_clk = devm_clk_get(&pdev->dev, "axi_clk");
+		if (IS_ERR(priv->axi_clk)) {
+			err = PTR_ERR(priv->axi_clk);
+			if (err == -EPROBE_DEFER)
+				goto err_mg_core_clk;
+			priv->axi_clk = NULL;
+		} else {
+			err = clk_prepare_enable(priv->axi_clk);
+			if (err < 0)
+				goto err_mg_core_clk;
+		}
+
+		/* Get system's tclk rate */
+		priv->tclk = clk_get_rate(priv->pp_clk);
+	} else if (device_property_read_u32(&pdev->dev, "clock-frequency",
+					    &priv->tclk)) {
+		dev_err(&pdev->dev, "missing clock-frequency value\n");
+		return -EINVAL;
+	}
+
+	if (priv->hw_version == MVPP22) {
+		err = dma_set_mask(&pdev->dev, MVPP2_DESC_DMA_MASK);
+		if (err)
+			goto err_axi_clk;
+		/* Sadly, the BM pools all share the same register to
+		 * store the high 32 bits of their address. So they
+		 * must all have the same high 32 bits, which forces
+		 * us to restrict coherent memory to DMA_BIT_MASK(32).
+		 */
+		err = dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32));
+		if (err)
+			goto err_axi_clk;
+	}
+
+	/* Initialize network controller */
+	err = mvpp2_init(pdev, priv);
+	if (err < 0) {
+		dev_err(&pdev->dev, "failed to initialize controller\n");
+		goto err_axi_clk;
+	}
+
+	/* Initialize ports */
+	fwnode_for_each_available_child_node(fwnode, port_fwnode) {
+		err = mvpp2_port_probe(pdev, port_fwnode, priv);
+		if (err < 0)
+			goto err_port_probe;
+	}
+
+	if (priv->port_count == 0) {
+		dev_err(&pdev->dev, "no ports enabled\n");
+		err = -ENODEV;
+		goto err_axi_clk;
+	}
+
+	/* Statistics must be gathered regularly because some of them (like
+	 * packets counters) are 32-bit registers and could overflow quite
+	 * quickly. For instance, a 10Gb link used at full bandwidth with the
+	 * smallest packets (64B) will overflow a 32-bit counter in less than
+	 * 30 seconds. Then, use a workqueue to fill 64-bit counters.
+	 */
+	snprintf(priv->queue_name, sizeof(priv->queue_name),
+		 "stats-wq-%s%s", netdev_name(priv->port_list[0]->dev),
+		 priv->port_count > 1 ? "+" : "");
+	priv->stats_queue = create_singlethread_workqueue(priv->queue_name);
+	if (!priv->stats_queue) {
+		err = -ENOMEM;
+		goto err_port_probe;
+	}
+
+	platform_set_drvdata(pdev, priv);
+	return 0;
+
+err_port_probe:
+	i = 0;
+	fwnode_for_each_available_child_node(fwnode, port_fwnode) {
+		if (priv->port_list[i])
+			mvpp2_port_remove(priv->port_list[i]);
+		i++;
+	}
+err_axi_clk:
+	clk_disable_unprepare(priv->axi_clk);
+
+err_mg_core_clk:
+	if (priv->hw_version == MVPP22)
+		clk_disable_unprepare(priv->mg_core_clk);
+err_mg_clk:
+	if (priv->hw_version == MVPP22)
+		clk_disable_unprepare(priv->mg_clk);
+err_gop_clk:
+	clk_disable_unprepare(priv->gop_clk);
+err_pp_clk:
+	clk_disable_unprepare(priv->pp_clk);
+	return err;
+}
+
+static int mvpp2_remove(struct platform_device *pdev)
+{
+	struct mvpp2 *priv = platform_get_drvdata(pdev);
+	struct fwnode_handle *fwnode = pdev->dev.fwnode;
+	struct fwnode_handle *port_fwnode;
+	int i = 0;
+
+	flush_workqueue(priv->stats_queue);
+	destroy_workqueue(priv->stats_queue);
+
+	fwnode_for_each_available_child_node(fwnode, port_fwnode) {
+		if (priv->port_list[i]) {
+			mutex_destroy(&priv->port_list[i]->gather_stats_lock);
+			mvpp2_port_remove(priv->port_list[i]);
+		}
+		i++;
+	}
+
+	for (i = 0; i < MVPP2_BM_POOLS_NUM; i++) {
+		struct mvpp2_bm_pool *bm_pool = &priv->bm_pools[i];
+
+		mvpp2_bm_pool_destroy(pdev, priv, bm_pool);
+	}
+
+	for_each_present_cpu(i) {
+		struct mvpp2_tx_queue *aggr_txq = &priv->aggr_txqs[i];
+
+		dma_free_coherent(&pdev->dev,
+				  MVPP2_AGGR_TXQ_SIZE * MVPP2_DESC_ALIGNED_SIZE,
+				  aggr_txq->descs,
+				  aggr_txq->descs_dma);
+	}
+
+	if (is_acpi_node(port_fwnode))
+		return 0;
+
+	clk_disable_unprepare(priv->axi_clk);
+	clk_disable_unprepare(priv->mg_core_clk);
+	clk_disable_unprepare(priv->mg_clk);
+	clk_disable_unprepare(priv->pp_clk);
+	clk_disable_unprepare(priv->gop_clk);
+
+	return 0;
+}
+
+static const struct of_device_id mvpp2_match[] = {
+	{
+		.compatible = "marvell,armada-375-pp2",
+		.data = (void *)MVPP21,
+	},
+	{
+		.compatible = "marvell,armada-7k-pp22",
+		.data = (void *)MVPP22,
+	},
+	{ }
+};
+MODULE_DEVICE_TABLE(of, mvpp2_match);
+
+static const struct acpi_device_id mvpp2_acpi_match[] = {
+	{ "MRVL0110", MVPP22 },
+	{ },
+};
+MODULE_DEVICE_TABLE(acpi, mvpp2_acpi_match);
+
+static struct platform_driver mvpp2_driver = {
+	.probe = mvpp2_probe,
+	.remove = mvpp2_remove,
+	.driver = {
+		.name = MVPP2_DRIVER_NAME,
+		.of_match_table = mvpp2_match,
+		.acpi_match_table = ACPI_PTR(mvpp2_acpi_match),
+	},
+};
+
+module_platform_driver(mvpp2_driver);
+
+MODULE_DESCRIPTION("Marvell PPv2 Ethernet Driver - www.marvell.com");
+MODULE_AUTHOR("Marcin Wojtas <mw@semihalf.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c
new file mode 100644
index 000000000000..6bb69f086794
--- /dev/null
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c
@@ -0,0 +1,2467 @@
+/*
+ * Header Parser helpers for Marvell PPv2 Network Controller
+ *
+ * Copyright (C) 2014 Marvell
+ *
+ * Marcin Wojtas <mw@semihalf.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/platform_device.h>
+#include <uapi/linux/ppp_defs.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+
+#include "mvpp2.h"
+#include "mvpp2_prs.h"
+
+/* Update parser tcam and sram hw entries */
+static int mvpp2_prs_hw_write(struct mvpp2 *priv, struct mvpp2_prs_entry *pe)
+{
+	int i;
+
+	if (pe->index > MVPP2_PRS_TCAM_SRAM_SIZE - 1)
+		return -EINVAL;
+
+	/* Clear entry invalidation bit */
+	pe->tcam.word[MVPP2_PRS_TCAM_INV_WORD] &= ~MVPP2_PRS_TCAM_INV_MASK;
+
+	/* Write tcam index - indirect access */
+	mvpp2_write(priv, MVPP2_PRS_TCAM_IDX_REG, pe->index);
+	for (i = 0; i < MVPP2_PRS_TCAM_WORDS; i++)
+		mvpp2_write(priv, MVPP2_PRS_TCAM_DATA_REG(i), pe->tcam.word[i]);
+
+	/* Write sram index - indirect access */
+	mvpp2_write(priv, MVPP2_PRS_SRAM_IDX_REG, pe->index);
+	for (i = 0; i < MVPP2_PRS_SRAM_WORDS; i++)
+		mvpp2_write(priv, MVPP2_PRS_SRAM_DATA_REG(i), pe->sram.word[i]);
+
+	return 0;
+}
+
+/* Initialize tcam entry from hw */
+static int mvpp2_prs_init_from_hw(struct mvpp2 *priv,
+				  struct mvpp2_prs_entry *pe, int tid)
+{
+	int i;
+
+	if (tid > MVPP2_PRS_TCAM_SRAM_SIZE - 1)
+		return -EINVAL;
+
+	memset(pe, 0, sizeof(*pe));
+	pe->index = tid;
+
+	/* Write tcam index - indirect access */
+	mvpp2_write(priv, MVPP2_PRS_TCAM_IDX_REG, pe->index);
+
+	pe->tcam.word[MVPP2_PRS_TCAM_INV_WORD] = mvpp2_read(priv,
+			      MVPP2_PRS_TCAM_DATA_REG(MVPP2_PRS_TCAM_INV_WORD));
+	if (pe->tcam.word[MVPP2_PRS_TCAM_INV_WORD] & MVPP2_PRS_TCAM_INV_MASK)
+		return MVPP2_PRS_TCAM_ENTRY_INVALID;
+
+	for (i = 0; i < MVPP2_PRS_TCAM_WORDS; i++)
+		pe->tcam.word[i] = mvpp2_read(priv, MVPP2_PRS_TCAM_DATA_REG(i));
+
+	/* Write sram index - indirect access */
+	mvpp2_write(priv, MVPP2_PRS_SRAM_IDX_REG, pe->index);
+	for (i = 0; i < MVPP2_PRS_SRAM_WORDS; i++)
+		pe->sram.word[i] = mvpp2_read(priv, MVPP2_PRS_SRAM_DATA_REG(i));
+
+	return 0;
+}
+
+/* Invalidate tcam hw entry */
+static void mvpp2_prs_hw_inv(struct mvpp2 *priv, int index)
+{
+	/* Write index - indirect access */
+	mvpp2_write(priv, MVPP2_PRS_TCAM_IDX_REG, index);
+	mvpp2_write(priv, MVPP2_PRS_TCAM_DATA_REG(MVPP2_PRS_TCAM_INV_WORD),
+		    MVPP2_PRS_TCAM_INV_MASK);
+}
+
+/* Enable shadow table entry and set its lookup ID */
+static void mvpp2_prs_shadow_set(struct mvpp2 *priv, int index, int lu)
+{
+	priv->prs_shadow[index].valid = true;
+	priv->prs_shadow[index].lu = lu;
+}
+
+/* Update ri fields in shadow table entry */
+static void mvpp2_prs_shadow_ri_set(struct mvpp2 *priv, int index,
+				    unsigned int ri, unsigned int ri_mask)
+{
+	priv->prs_shadow[index].ri_mask = ri_mask;
+	priv->prs_shadow[index].ri = ri;
+}
+
+/* Update lookup field in tcam sw entry */
+static void mvpp2_prs_tcam_lu_set(struct mvpp2_prs_entry *pe, unsigned int lu)
+{
+	int enable_off = MVPP2_PRS_TCAM_EN_OFFS(MVPP2_PRS_TCAM_LU_BYTE);
+
+	pe->tcam.byte[MVPP2_PRS_TCAM_LU_BYTE] = lu;
+	pe->tcam.byte[enable_off] = MVPP2_PRS_LU_MASK;
+}
+
+/* Update mask for single port in tcam sw entry */
+static void mvpp2_prs_tcam_port_set(struct mvpp2_prs_entry *pe,
+				    unsigned int port, bool add)
+{
+	int enable_off = MVPP2_PRS_TCAM_EN_OFFS(MVPP2_PRS_TCAM_PORT_BYTE);
+
+	if (add)
+		pe->tcam.byte[enable_off] &= ~(1 << port);
+	else
+		pe->tcam.byte[enable_off] |= 1 << port;
+}
+
+/* Update port map in tcam sw entry */
+static void mvpp2_prs_tcam_port_map_set(struct mvpp2_prs_entry *pe,
+					unsigned int ports)
+{
+	unsigned char port_mask = MVPP2_PRS_PORT_MASK;
+	int enable_off = MVPP2_PRS_TCAM_EN_OFFS(MVPP2_PRS_TCAM_PORT_BYTE);
+
+	pe->tcam.byte[MVPP2_PRS_TCAM_PORT_BYTE] = 0;
+	pe->tcam.byte[enable_off] &= ~port_mask;
+	pe->tcam.byte[enable_off] |= ~ports & MVPP2_PRS_PORT_MASK;
+}
+
+/* Obtain port map from tcam sw entry */
+static unsigned int mvpp2_prs_tcam_port_map_get(struct mvpp2_prs_entry *pe)
+{
+	int enable_off = MVPP2_PRS_TCAM_EN_OFFS(MVPP2_PRS_TCAM_PORT_BYTE);
+
+	return ~(pe->tcam.byte[enable_off]) & MVPP2_PRS_PORT_MASK;
+}
+
+/* Set byte of data and its enable bits in tcam sw entry */
+static void mvpp2_prs_tcam_data_byte_set(struct mvpp2_prs_entry *pe,
+					 unsigned int offs, unsigned char byte,
+					 unsigned char enable)
+{
+	pe->tcam.byte[MVPP2_PRS_TCAM_DATA_BYTE(offs)] = byte;
+	pe->tcam.byte[MVPP2_PRS_TCAM_DATA_BYTE_EN(offs)] = enable;
+}
+
+/* Get byte of data and its enable bits from tcam sw entry */
+static void mvpp2_prs_tcam_data_byte_get(struct mvpp2_prs_entry *pe,
+					 unsigned int offs, unsigned char *byte,
+					 unsigned char *enable)
+{
+	*byte = pe->tcam.byte[MVPP2_PRS_TCAM_DATA_BYTE(offs)];
+	*enable = pe->tcam.byte[MVPP2_PRS_TCAM_DATA_BYTE_EN(offs)];
+}
+
+/* Compare tcam data bytes with a pattern */
+static bool mvpp2_prs_tcam_data_cmp(struct mvpp2_prs_entry *pe, int offs,
+				    u16 data)
+{
+	int off = MVPP2_PRS_TCAM_DATA_BYTE(offs);
+	u16 tcam_data;
+
+	tcam_data = (pe->tcam.byte[off + 1] << 8) | pe->tcam.byte[off];
+	if (tcam_data != data)
+		return false;
+	return true;
+}
+
+/* Update ai bits in tcam sw entry */
+static void mvpp2_prs_tcam_ai_update(struct mvpp2_prs_entry *pe,
+				     unsigned int bits, unsigned int enable)
+{
+	int i, ai_idx = MVPP2_PRS_TCAM_AI_BYTE;
+
+	for (i = 0; i < MVPP2_PRS_AI_BITS; i++) {
+		if (!(enable & BIT(i)))
+			continue;
+
+		if (bits & BIT(i))
+			pe->tcam.byte[ai_idx] |= 1 << i;
+		else
+			pe->tcam.byte[ai_idx] &= ~(1 << i);
+	}
+
+	pe->tcam.byte[MVPP2_PRS_TCAM_EN_OFFS(ai_idx)] |= enable;
+}
+
+/* Get ai bits from tcam sw entry */
+static int mvpp2_prs_tcam_ai_get(struct mvpp2_prs_entry *pe)
+{
+	return pe->tcam.byte[MVPP2_PRS_TCAM_AI_BYTE];
+}
+
+/* Set ethertype in tcam sw entry */
+static void mvpp2_prs_match_etype(struct mvpp2_prs_entry *pe, int offset,
+				  unsigned short ethertype)
+{
+	mvpp2_prs_tcam_data_byte_set(pe, offset + 0, ethertype >> 8, 0xff);
+	mvpp2_prs_tcam_data_byte_set(pe, offset + 1, ethertype & 0xff, 0xff);
+}
+
+/* Set vid in tcam sw entry */
+static void mvpp2_prs_match_vid(struct mvpp2_prs_entry *pe, int offset,
+				unsigned short vid)
+{
+	mvpp2_prs_tcam_data_byte_set(pe, offset + 0, (vid & 0xf00) >> 8, 0xf);
+	mvpp2_prs_tcam_data_byte_set(pe, offset + 1, vid & 0xff, 0xff);
+}
+
+/* Set bits in sram sw entry */
+static void mvpp2_prs_sram_bits_set(struct mvpp2_prs_entry *pe, int bit_num,
+				    int val)
+{
+	pe->sram.byte[MVPP2_BIT_TO_BYTE(bit_num)] |= (val << (bit_num % 8));
+}
+
+/* Clear bits in sram sw entry */
+static void mvpp2_prs_sram_bits_clear(struct mvpp2_prs_entry *pe, int bit_num,
+				      int val)
+{
+	pe->sram.byte[MVPP2_BIT_TO_BYTE(bit_num)] &= ~(val << (bit_num % 8));
+}
+
+/* Update ri bits in sram sw entry */
+static void mvpp2_prs_sram_ri_update(struct mvpp2_prs_entry *pe,
+				     unsigned int bits, unsigned int mask)
+{
+	unsigned int i;
+
+	for (i = 0; i < MVPP2_PRS_SRAM_RI_CTRL_BITS; i++) {
+		int ri_off = MVPP2_PRS_SRAM_RI_OFFS;
+
+		if (!(mask & BIT(i)))
+			continue;
+
+		if (bits & BIT(i))
+			mvpp2_prs_sram_bits_set(pe, ri_off + i, 1);
+		else
+			mvpp2_prs_sram_bits_clear(pe, ri_off + i, 1);
+
+		mvpp2_prs_sram_bits_set(pe, MVPP2_PRS_SRAM_RI_CTRL_OFFS + i, 1);
+	}
+}
+
+/* Obtain ri bits from sram sw entry */
+static int mvpp2_prs_sram_ri_get(struct mvpp2_prs_entry *pe)
+{
+	return pe->sram.word[MVPP2_PRS_SRAM_RI_WORD];
+}
+
+/* Update ai bits in sram sw entry */
+static void mvpp2_prs_sram_ai_update(struct mvpp2_prs_entry *pe,
+				     unsigned int bits, unsigned int mask)
+{
+	unsigned int i;
+	int ai_off = MVPP2_PRS_SRAM_AI_OFFS;
+
+	for (i = 0; i < MVPP2_PRS_SRAM_AI_CTRL_BITS; i++) {
+		if (!(mask & BIT(i)))
+			continue;
+
+		if (bits & BIT(i))
+			mvpp2_prs_sram_bits_set(pe, ai_off + i, 1);
+		else
+			mvpp2_prs_sram_bits_clear(pe, ai_off + i, 1);
+
+		mvpp2_prs_sram_bits_set(pe, MVPP2_PRS_SRAM_AI_CTRL_OFFS + i, 1);
+	}
+}
+
+/* Read ai bits from sram sw entry */
+static int mvpp2_prs_sram_ai_get(struct mvpp2_prs_entry *pe)
+{
+	u8 bits;
+	int ai_off = MVPP2_BIT_TO_BYTE(MVPP2_PRS_SRAM_AI_OFFS);
+	int ai_en_off = ai_off + 1;
+	int ai_shift = MVPP2_PRS_SRAM_AI_OFFS % 8;
+
+	bits = (pe->sram.byte[ai_off] >> ai_shift) |
+	       (pe->sram.byte[ai_en_off] << (8 - ai_shift));
+
+	return bits;
+}
+
+/* In sram sw entry set lookup ID field of the tcam key to be used in the next
+ * lookup interation
+ */
+static void mvpp2_prs_sram_next_lu_set(struct mvpp2_prs_entry *pe,
+				       unsigned int lu)
+{
+	int sram_next_off = MVPP2_PRS_SRAM_NEXT_LU_OFFS;
+
+	mvpp2_prs_sram_bits_clear(pe, sram_next_off,
+				  MVPP2_PRS_SRAM_NEXT_LU_MASK);
+	mvpp2_prs_sram_bits_set(pe, sram_next_off, lu);
+}
+
+/* In the sram sw entry set sign and value of the next lookup offset
+ * and the offset value generated to the classifier
+ */
+static void mvpp2_prs_sram_shift_set(struct mvpp2_prs_entry *pe, int shift,
+				     unsigned int op)
+{
+	/* Set sign */
+	if (shift < 0) {
+		mvpp2_prs_sram_bits_set(pe, MVPP2_PRS_SRAM_SHIFT_SIGN_BIT, 1);
+		shift = 0 - shift;
+	} else {
+		mvpp2_prs_sram_bits_clear(pe, MVPP2_PRS_SRAM_SHIFT_SIGN_BIT, 1);
+	}
+
+	/* Set value */
+	pe->sram.byte[MVPP2_BIT_TO_BYTE(MVPP2_PRS_SRAM_SHIFT_OFFS)] =
+							   (unsigned char)shift;
+
+	/* Reset and set operation */
+	mvpp2_prs_sram_bits_clear(pe, MVPP2_PRS_SRAM_OP_SEL_SHIFT_OFFS,
+				  MVPP2_PRS_SRAM_OP_SEL_SHIFT_MASK);
+	mvpp2_prs_sram_bits_set(pe, MVPP2_PRS_SRAM_OP_SEL_SHIFT_OFFS, op);
+
+	/* Set base offset as current */
+	mvpp2_prs_sram_bits_clear(pe, MVPP2_PRS_SRAM_OP_SEL_BASE_OFFS, 1);
+}
+
+/* In the sram sw entry set sign and value of the user defined offset
+ * generated to the classifier
+ */
+static void mvpp2_prs_sram_offset_set(struct mvpp2_prs_entry *pe,
+				      unsigned int type, int offset,
+				      unsigned int op)
+{
+	/* Set sign */
+	if (offset < 0) {
+		mvpp2_prs_sram_bits_set(pe, MVPP2_PRS_SRAM_UDF_SIGN_BIT, 1);
+		offset = 0 - offset;
+	} else {
+		mvpp2_prs_sram_bits_clear(pe, MVPP2_PRS_SRAM_UDF_SIGN_BIT, 1);
+	}
+
+	/* Set value */
+	mvpp2_prs_sram_bits_clear(pe, MVPP2_PRS_SRAM_UDF_OFFS,
+				  MVPP2_PRS_SRAM_UDF_MASK);
+	mvpp2_prs_sram_bits_set(pe, MVPP2_PRS_SRAM_UDF_OFFS, offset);
+	pe->sram.byte[MVPP2_BIT_TO_BYTE(MVPP2_PRS_SRAM_UDF_OFFS +
+					MVPP2_PRS_SRAM_UDF_BITS)] &=
+	      ~(MVPP2_PRS_SRAM_UDF_MASK >> (8 - (MVPP2_PRS_SRAM_UDF_OFFS % 8)));
+	pe->sram.byte[MVPP2_BIT_TO_BYTE(MVPP2_PRS_SRAM_UDF_OFFS +
+					MVPP2_PRS_SRAM_UDF_BITS)] |=
+				(offset >> (8 - (MVPP2_PRS_SRAM_UDF_OFFS % 8)));
+
+	/* Set offset type */
+	mvpp2_prs_sram_bits_clear(pe, MVPP2_PRS_SRAM_UDF_TYPE_OFFS,
+				  MVPP2_PRS_SRAM_UDF_TYPE_MASK);
+	mvpp2_prs_sram_bits_set(pe, MVPP2_PRS_SRAM_UDF_TYPE_OFFS, type);
+
+	/* Set offset operation */
+	mvpp2_prs_sram_bits_clear(pe, MVPP2_PRS_SRAM_OP_SEL_UDF_OFFS,
+				  MVPP2_PRS_SRAM_OP_SEL_UDF_MASK);
+	mvpp2_prs_sram_bits_set(pe, MVPP2_PRS_SRAM_OP_SEL_UDF_OFFS, op);
+
+	pe->sram.byte[MVPP2_BIT_TO_BYTE(MVPP2_PRS_SRAM_OP_SEL_UDF_OFFS +
+					MVPP2_PRS_SRAM_OP_SEL_UDF_BITS)] &=
+					     ~(MVPP2_PRS_SRAM_OP_SEL_UDF_MASK >>
+				    (8 - (MVPP2_PRS_SRAM_OP_SEL_UDF_OFFS % 8)));
+
+	pe->sram.byte[MVPP2_BIT_TO_BYTE(MVPP2_PRS_SRAM_OP_SEL_UDF_OFFS +
+					MVPP2_PRS_SRAM_OP_SEL_UDF_BITS)] |=
+			     (op >> (8 - (MVPP2_PRS_SRAM_OP_SEL_UDF_OFFS % 8)));
+
+	/* Set base offset as current */
+	mvpp2_prs_sram_bits_clear(pe, MVPP2_PRS_SRAM_OP_SEL_BASE_OFFS, 1);
+}
+
+/* Find parser flow entry */
+static int mvpp2_prs_flow_find(struct mvpp2 *priv, int flow)
+{
+	struct mvpp2_prs_entry pe;
+	int tid;
+
+	/* Go through the all entires with MVPP2_PRS_LU_FLOWS */
+	for (tid = MVPP2_PRS_TCAM_SRAM_SIZE - 1; tid >= 0; tid--) {
+		u8 bits;
+
+		if (!priv->prs_shadow[tid].valid ||
+		    priv->prs_shadow[tid].lu != MVPP2_PRS_LU_FLOWS)
+			continue;
+
+		mvpp2_prs_init_from_hw(priv, &pe, tid);
+		bits = mvpp2_prs_sram_ai_get(&pe);
+
+		/* Sram store classification lookup ID in AI bits [5:0] */
+		if ((bits & MVPP2_PRS_FLOW_ID_MASK) == flow)
+			return tid;
+	}
+
+	return -ENOENT;
+}
+
+/* Return first free tcam index, seeking from start to end */
+static int mvpp2_prs_tcam_first_free(struct mvpp2 *priv, unsigned char start,
+				     unsigned char end)
+{
+	int tid;
+
+	if (start > end)
+		swap(start, end);
+
+	if (end >= MVPP2_PRS_TCAM_SRAM_SIZE)
+		end = MVPP2_PRS_TCAM_SRAM_SIZE - 1;
+
+	for (tid = start; tid <= end; tid++) {
+		if (!priv->prs_shadow[tid].valid)
+			return tid;
+	}
+
+	return -EINVAL;
+}
+
+/* Enable/disable dropping all mac da's */
+static void mvpp2_prs_mac_drop_all_set(struct mvpp2 *priv, int port, bool add)
+{
+	struct mvpp2_prs_entry pe;
+
+	if (priv->prs_shadow[MVPP2_PE_DROP_ALL].valid) {
+		/* Entry exist - update port only */
+		mvpp2_prs_init_from_hw(priv, &pe, MVPP2_PE_DROP_ALL);
+	} else {
+		/* Entry doesn't exist - create new */
+		memset(&pe, 0, sizeof(pe));
+		mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_MAC);
+		pe.index = MVPP2_PE_DROP_ALL;
+
+		/* Non-promiscuous mode for all ports - DROP unknown packets */
+		mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_DROP_MASK,
+					 MVPP2_PRS_RI_DROP_MASK);
+
+		mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_GEN_BIT, 1);
+		mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
+
+		/* Update shadow table */
+		mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_MAC);
+
+		/* Mask all ports */
+		mvpp2_prs_tcam_port_map_set(&pe, 0);
+	}
+
+	/* Update port mask */
+	mvpp2_prs_tcam_port_set(&pe, port, add);
+
+	mvpp2_prs_hw_write(priv, &pe);
+}
+
+/* Set port to unicast or multicast promiscuous mode */
+void mvpp2_prs_mac_promisc_set(struct mvpp2 *priv, int port,
+			       enum mvpp2_prs_l2_cast l2_cast, bool add)
+{
+	struct mvpp2_prs_entry pe;
+	unsigned char cast_match;
+	unsigned int ri;
+	int tid;
+
+	if (l2_cast == MVPP2_PRS_L2_UNI_CAST) {
+		cast_match = MVPP2_PRS_UCAST_VAL;
+		tid = MVPP2_PE_MAC_UC_PROMISCUOUS;
+		ri = MVPP2_PRS_RI_L2_UCAST;
+	} else {
+		cast_match = MVPP2_PRS_MCAST_VAL;
+		tid = MVPP2_PE_MAC_MC_PROMISCUOUS;
+		ri = MVPP2_PRS_RI_L2_MCAST;
+	}
+
+	/* promiscuous mode - Accept unknown unicast or multicast packets */
+	if (priv->prs_shadow[tid].valid) {
+		mvpp2_prs_init_from_hw(priv, &pe, tid);
+	} else {
+		memset(&pe, 0, sizeof(pe));
+		mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_MAC);
+		pe.index = tid;
+
+		/* Continue - set next lookup */
+		mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_DSA);
+
+		/* Set result info bits */
+		mvpp2_prs_sram_ri_update(&pe, ri, MVPP2_PRS_RI_L2_CAST_MASK);
+
+		/* Match UC or MC addresses */
+		mvpp2_prs_tcam_data_byte_set(&pe, 0, cast_match,
+					     MVPP2_PRS_CAST_MASK);
+
+		/* Shift to ethertype */
+		mvpp2_prs_sram_shift_set(&pe, 2 * ETH_ALEN,
+					 MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
+
+		/* Mask all ports */
+		mvpp2_prs_tcam_port_map_set(&pe, 0);
+
+		/* Update shadow table */
+		mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_MAC);
+	}
+
+	/* Update port mask */
+	mvpp2_prs_tcam_port_set(&pe, port, add);
+
+	mvpp2_prs_hw_write(priv, &pe);
+}
+
+/* Set entry for dsa packets */
+static void mvpp2_prs_dsa_tag_set(struct mvpp2 *priv, int port, bool add,
+				  bool tagged, bool extend)
+{
+	struct mvpp2_prs_entry pe;
+	int tid, shift;
+
+	if (extend) {
+		tid = tagged ? MVPP2_PE_EDSA_TAGGED : MVPP2_PE_EDSA_UNTAGGED;
+		shift = 8;
+	} else {
+		tid = tagged ? MVPP2_PE_DSA_TAGGED : MVPP2_PE_DSA_UNTAGGED;
+		shift = 4;
+	}
+
+	if (priv->prs_shadow[tid].valid) {
+		/* Entry exist - update port only */
+		mvpp2_prs_init_from_hw(priv, &pe, tid);
+	} else {
+		/* Entry doesn't exist - create new */
+		memset(&pe, 0, sizeof(pe));
+		mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_DSA);
+		pe.index = tid;
+
+		/* Update shadow table */
+		mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_DSA);
+
+		if (tagged) {
+			/* Set tagged bit in DSA tag */
+			mvpp2_prs_tcam_data_byte_set(&pe, 0,
+					     MVPP2_PRS_TCAM_DSA_TAGGED_BIT,
+					     MVPP2_PRS_TCAM_DSA_TAGGED_BIT);
+
+			/* Set ai bits for next iteration */
+			if (extend)
+				mvpp2_prs_sram_ai_update(&pe, 1,
+							MVPP2_PRS_SRAM_AI_MASK);
+			else
+				mvpp2_prs_sram_ai_update(&pe, 0,
+							MVPP2_PRS_SRAM_AI_MASK);
+
+			/* Set result info bits to 'single vlan' */
+			mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_VLAN_SINGLE,
+						 MVPP2_PRS_RI_VLAN_MASK);
+			/* If packet is tagged continue check vid filtering */
+			mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_VID);
+		} else {
+			/* Shift 4 bytes for DSA tag or 8 bytes for EDSA tag*/
+			mvpp2_prs_sram_shift_set(&pe, shift,
+					MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
+
+			/* Set result info bits to 'no vlans' */
+			mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_VLAN_NONE,
+						 MVPP2_PRS_RI_VLAN_MASK);
+			mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_L2);
+		}
+
+		/* Mask all ports */
+		mvpp2_prs_tcam_port_map_set(&pe, 0);
+	}
+
+	/* Update port mask */
+	mvpp2_prs_tcam_port_set(&pe, port, add);
+
+	mvpp2_prs_hw_write(priv, &pe);
+}
+
+/* Set entry for dsa ethertype */
+static void mvpp2_prs_dsa_tag_ethertype_set(struct mvpp2 *priv, int port,
+					    bool add, bool tagged, bool extend)
+{
+	struct mvpp2_prs_entry pe;
+	int tid, shift, port_mask;
+
+	if (extend) {
+		tid = tagged ? MVPP2_PE_ETYPE_EDSA_TAGGED :
+		      MVPP2_PE_ETYPE_EDSA_UNTAGGED;
+		port_mask = 0;
+		shift = 8;
+	} else {
+		tid = tagged ? MVPP2_PE_ETYPE_DSA_TAGGED :
+		      MVPP2_PE_ETYPE_DSA_UNTAGGED;
+		port_mask = MVPP2_PRS_PORT_MASK;
+		shift = 4;
+	}
+
+	if (priv->prs_shadow[tid].valid) {
+		/* Entry exist - update port only */
+		mvpp2_prs_init_from_hw(priv, &pe, tid);
+	} else {
+		/* Entry doesn't exist - create new */
+		memset(&pe, 0, sizeof(pe));
+		mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_DSA);
+		pe.index = tid;
+
+		/* Set ethertype */
+		mvpp2_prs_match_etype(&pe, 0, ETH_P_EDSA);
+		mvpp2_prs_match_etype(&pe, 2, 0);
+
+		mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_DSA_MASK,
+					 MVPP2_PRS_RI_DSA_MASK);
+		/* Shift ethertype + 2 byte reserved + tag*/
+		mvpp2_prs_sram_shift_set(&pe, 2 + MVPP2_ETH_TYPE_LEN + shift,
+					 MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
+
+		/* Update shadow table */
+		mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_DSA);
+
+		if (tagged) {
+			/* Set tagged bit in DSA tag */
+			mvpp2_prs_tcam_data_byte_set(&pe,
+						     MVPP2_ETH_TYPE_LEN + 2 + 3,
+						 MVPP2_PRS_TCAM_DSA_TAGGED_BIT,
+						 MVPP2_PRS_TCAM_DSA_TAGGED_BIT);
+			/* Clear all ai bits for next iteration */
+			mvpp2_prs_sram_ai_update(&pe, 0,
+						 MVPP2_PRS_SRAM_AI_MASK);
+			/* If packet is tagged continue check vlans */
+			mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_VLAN);
+		} else {
+			/* Set result info bits to 'no vlans' */
+			mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_VLAN_NONE,
+						 MVPP2_PRS_RI_VLAN_MASK);
+			mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_L2);
+		}
+		/* Mask/unmask all ports, depending on dsa type */
+		mvpp2_prs_tcam_port_map_set(&pe, port_mask);
+	}
+
+	/* Update port mask */
+	mvpp2_prs_tcam_port_set(&pe, port, add);
+
+	mvpp2_prs_hw_write(priv, &pe);
+}
+
+/* Search for existing single/triple vlan entry */
+static int mvpp2_prs_vlan_find(struct mvpp2 *priv, unsigned short tpid, int ai)
+{
+	struct mvpp2_prs_entry pe;
+	int tid;
+
+	/* Go through the all entries with MVPP2_PRS_LU_VLAN */
+	for (tid = MVPP2_PE_FIRST_FREE_TID;
+	     tid <= MVPP2_PE_LAST_FREE_TID; tid++) {
+		unsigned int ri_bits, ai_bits;
+		bool match;
+
+		if (!priv->prs_shadow[tid].valid ||
+		    priv->prs_shadow[tid].lu != MVPP2_PRS_LU_VLAN)
+			continue;
+
+		mvpp2_prs_init_from_hw(priv, &pe, tid);
+		match = mvpp2_prs_tcam_data_cmp(&pe, 0, swab16(tpid));
+		if (!match)
+			continue;
+
+		/* Get vlan type */
+		ri_bits = mvpp2_prs_sram_ri_get(&pe);
+		ri_bits &= MVPP2_PRS_RI_VLAN_MASK;
+
+		/* Get current ai value from tcam */
+		ai_bits = mvpp2_prs_tcam_ai_get(&pe);
+		/* Clear double vlan bit */
+		ai_bits &= ~MVPP2_PRS_DBL_VLAN_AI_BIT;
+
+		if (ai != ai_bits)
+			continue;
+
+		if (ri_bits == MVPP2_PRS_RI_VLAN_SINGLE ||
+		    ri_bits == MVPP2_PRS_RI_VLAN_TRIPLE)
+			return tid;
+	}
+
+	return -ENOENT;
+}
+
+/* Add/update single/triple vlan entry */
+static int mvpp2_prs_vlan_add(struct mvpp2 *priv, unsigned short tpid, int ai,
+			      unsigned int port_map)
+{
+	struct mvpp2_prs_entry pe;
+	int tid_aux, tid;
+	int ret = 0;
+
+	memset(&pe, 0, sizeof(pe));
+
+	tid = mvpp2_prs_vlan_find(priv, tpid, ai);
+
+	if (tid < 0) {
+		/* Create new tcam entry */
+		tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_LAST_FREE_TID,
+						MVPP2_PE_FIRST_FREE_TID);
+		if (tid < 0)
+			return tid;
+
+		/* Get last double vlan tid */
+		for (tid_aux = MVPP2_PE_LAST_FREE_TID;
+		     tid_aux >= MVPP2_PE_FIRST_FREE_TID; tid_aux--) {
+			unsigned int ri_bits;
+
+			if (!priv->prs_shadow[tid_aux].valid ||
+			    priv->prs_shadow[tid_aux].lu != MVPP2_PRS_LU_VLAN)
+				continue;
+
+			mvpp2_prs_init_from_hw(priv, &pe, tid_aux);
+			ri_bits = mvpp2_prs_sram_ri_get(&pe);
+			if ((ri_bits & MVPP2_PRS_RI_VLAN_MASK) ==
+			    MVPP2_PRS_RI_VLAN_DOUBLE)
+				break;
+		}
+
+		if (tid <= tid_aux)
+			return -EINVAL;
+
+		memset(&pe, 0, sizeof(pe));
+		pe.index = tid;
+		mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_VLAN);
+
+		mvpp2_prs_match_etype(&pe, 0, tpid);
+
+		/* VLAN tag detected, proceed with VID filtering */
+		mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_VID);
+
+		/* Clear all ai bits for next iteration */
+		mvpp2_prs_sram_ai_update(&pe, 0, MVPP2_PRS_SRAM_AI_MASK);
+
+		if (ai == MVPP2_PRS_SINGLE_VLAN_AI) {
+			mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_VLAN_SINGLE,
+						 MVPP2_PRS_RI_VLAN_MASK);
+		} else {
+			ai |= MVPP2_PRS_DBL_VLAN_AI_BIT;
+			mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_VLAN_TRIPLE,
+						 MVPP2_PRS_RI_VLAN_MASK);
+		}
+		mvpp2_prs_tcam_ai_update(&pe, ai, MVPP2_PRS_SRAM_AI_MASK);
+
+		mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_VLAN);
+	} else {
+		mvpp2_prs_init_from_hw(priv, &pe, tid);
+	}
+	/* Update ports' mask */
+	mvpp2_prs_tcam_port_map_set(&pe, port_map);
+
+	mvpp2_prs_hw_write(priv, &pe);
+
+	return ret;
+}
+
+/* Get first free double vlan ai number */
+static int mvpp2_prs_double_vlan_ai_free_get(struct mvpp2 *priv)
+{
+	int i;
+
+	for (i = 1; i < MVPP2_PRS_DBL_VLANS_MAX; i++) {
+		if (!priv->prs_double_vlans[i])
+			return i;
+	}
+
+	return -EINVAL;
+}
+
+/* Search for existing double vlan entry */
+static int mvpp2_prs_double_vlan_find(struct mvpp2 *priv, unsigned short tpid1,
+				      unsigned short tpid2)
+{
+	struct mvpp2_prs_entry pe;
+	int tid;
+
+	/* Go through the all entries with MVPP2_PRS_LU_VLAN */
+	for (tid = MVPP2_PE_FIRST_FREE_TID;
+	     tid <= MVPP2_PE_LAST_FREE_TID; tid++) {
+		unsigned int ri_mask;
+		bool match;
+
+		if (!priv->prs_shadow[tid].valid ||
+		    priv->prs_shadow[tid].lu != MVPP2_PRS_LU_VLAN)
+			continue;
+
+		mvpp2_prs_init_from_hw(priv, &pe, tid);
+
+		match = mvpp2_prs_tcam_data_cmp(&pe, 0, swab16(tpid1)) &&
+			mvpp2_prs_tcam_data_cmp(&pe, 4, swab16(tpid2));
+
+		if (!match)
+			continue;
+
+		ri_mask = mvpp2_prs_sram_ri_get(&pe) & MVPP2_PRS_RI_VLAN_MASK;
+		if (ri_mask == MVPP2_PRS_RI_VLAN_DOUBLE)
+			return tid;
+	}
+
+	return -ENOENT;
+}
+
+/* Add or update double vlan entry */
+static int mvpp2_prs_double_vlan_add(struct mvpp2 *priv, unsigned short tpid1,
+				     unsigned short tpid2,
+				     unsigned int port_map)
+{
+	int tid_aux, tid, ai, ret = 0;
+	struct mvpp2_prs_entry pe;
+
+	memset(&pe, 0, sizeof(pe));
+
+	tid = mvpp2_prs_double_vlan_find(priv, tpid1, tpid2);
+
+	if (tid < 0) {
+		/* Create new tcam entry */
+		tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
+				MVPP2_PE_LAST_FREE_TID);
+		if (tid < 0)
+			return tid;
+
+		/* Set ai value for new double vlan entry */
+		ai = mvpp2_prs_double_vlan_ai_free_get(priv);
+		if (ai < 0)
+			return ai;
+
+		/* Get first single/triple vlan tid */
+		for (tid_aux = MVPP2_PE_FIRST_FREE_TID;
+		     tid_aux <= MVPP2_PE_LAST_FREE_TID; tid_aux++) {
+			unsigned int ri_bits;
+
+			if (!priv->prs_shadow[tid_aux].valid ||
+			    priv->prs_shadow[tid_aux].lu != MVPP2_PRS_LU_VLAN)
+				continue;
+
+			mvpp2_prs_init_from_hw(priv, &pe, tid_aux);
+			ri_bits = mvpp2_prs_sram_ri_get(&pe);
+			ri_bits &= MVPP2_PRS_RI_VLAN_MASK;
+			if (ri_bits == MVPP2_PRS_RI_VLAN_SINGLE ||
+			    ri_bits == MVPP2_PRS_RI_VLAN_TRIPLE)
+				break;
+		}
+
+		if (tid >= tid_aux)
+			return -ERANGE;
+
+		memset(&pe, 0, sizeof(pe));
+		mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_VLAN);
+		pe.index = tid;
+
+		priv->prs_double_vlans[ai] = true;
+
+		mvpp2_prs_match_etype(&pe, 0, tpid1);
+		mvpp2_prs_match_etype(&pe, 4, tpid2);
+
+		mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_VLAN);
+		/* Shift 4 bytes - skip outer vlan tag */
+		mvpp2_prs_sram_shift_set(&pe, MVPP2_VLAN_TAG_LEN,
+					 MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
+		mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_VLAN_DOUBLE,
+					 MVPP2_PRS_RI_VLAN_MASK);
+		mvpp2_prs_sram_ai_update(&pe, ai | MVPP2_PRS_DBL_VLAN_AI_BIT,
+					 MVPP2_PRS_SRAM_AI_MASK);
+
+		mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_VLAN);
+	} else {
+		mvpp2_prs_init_from_hw(priv, &pe, tid);
+	}
+
+	/* Update ports' mask */
+	mvpp2_prs_tcam_port_map_set(&pe, port_map);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	return ret;
+}
+
+/* IPv4 header parsing for fragmentation and L4 offset */
+static int mvpp2_prs_ip4_proto(struct mvpp2 *priv, unsigned short proto,
+			       unsigned int ri, unsigned int ri_mask)
+{
+	struct mvpp2_prs_entry pe;
+	int tid;
+
+	if ((proto != IPPROTO_TCP) && (proto != IPPROTO_UDP) &&
+	    (proto != IPPROTO_IGMP))
+		return -EINVAL;
+
+	/* Not fragmented packet */
+	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
+					MVPP2_PE_LAST_FREE_TID);
+	if (tid < 0)
+		return tid;
+
+	memset(&pe, 0, sizeof(pe));
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_IP4);
+	pe.index = tid;
+
+	/* Set next lu to IPv4 */
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_IP4);
+	mvpp2_prs_sram_shift_set(&pe, 12, MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
+	/* Set L4 offset */
+	mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L4,
+				  sizeof(struct iphdr) - 4,
+				  MVPP2_PRS_SRAM_OP_SEL_UDF_ADD);
+	mvpp2_prs_sram_ai_update(&pe, MVPP2_PRS_IPV4_DIP_AI_BIT,
+				 MVPP2_PRS_IPV4_DIP_AI_BIT);
+	mvpp2_prs_sram_ri_update(&pe, ri, ri_mask | MVPP2_PRS_RI_IP_FRAG_MASK);
+
+	mvpp2_prs_tcam_data_byte_set(&pe, 2, 0x00,
+				     MVPP2_PRS_TCAM_PROTO_MASK_L);
+	mvpp2_prs_tcam_data_byte_set(&pe, 3, 0x00,
+				     MVPP2_PRS_TCAM_PROTO_MASK);
+
+	mvpp2_prs_tcam_data_byte_set(&pe, 5, proto, MVPP2_PRS_TCAM_PROTO_MASK);
+	mvpp2_prs_tcam_ai_update(&pe, 0, MVPP2_PRS_IPV4_DIP_AI_BIT);
+	/* Unmask all ports */
+	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
+
+	/* Update shadow table and hw entry */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP4);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	/* Fragmented packet */
+	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
+					MVPP2_PE_LAST_FREE_TID);
+	if (tid < 0)
+		return tid;
+
+	pe.index = tid;
+	/* Clear ri before updating */
+	pe.sram.word[MVPP2_PRS_SRAM_RI_WORD] = 0x0;
+	pe.sram.word[MVPP2_PRS_SRAM_RI_CTRL_WORD] = 0x0;
+	mvpp2_prs_sram_ri_update(&pe, ri, ri_mask);
+
+	mvpp2_prs_sram_ri_update(&pe, ri | MVPP2_PRS_RI_IP_FRAG_TRUE,
+				 ri_mask | MVPP2_PRS_RI_IP_FRAG_MASK);
+
+	mvpp2_prs_tcam_data_byte_set(&pe, 2, 0x00, 0x0);
+	mvpp2_prs_tcam_data_byte_set(&pe, 3, 0x00, 0x0);
+
+	/* Update shadow table and hw entry */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP4);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	return 0;
+}
+
+/* IPv4 L3 multicast or broadcast */
+static int mvpp2_prs_ip4_cast(struct mvpp2 *priv, unsigned short l3_cast)
+{
+	struct mvpp2_prs_entry pe;
+	int mask, tid;
+
+	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
+					MVPP2_PE_LAST_FREE_TID);
+	if (tid < 0)
+		return tid;
+
+	memset(&pe, 0, sizeof(pe));
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_IP4);
+	pe.index = tid;
+
+	switch (l3_cast) {
+	case MVPP2_PRS_L3_MULTI_CAST:
+		mvpp2_prs_tcam_data_byte_set(&pe, 0, MVPP2_PRS_IPV4_MC,
+					     MVPP2_PRS_IPV4_MC_MASK);
+		mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_MCAST,
+					 MVPP2_PRS_RI_L3_ADDR_MASK);
+		break;
+	case  MVPP2_PRS_L3_BROAD_CAST:
+		mask = MVPP2_PRS_IPV4_BC_MASK;
+		mvpp2_prs_tcam_data_byte_set(&pe, 0, mask, mask);
+		mvpp2_prs_tcam_data_byte_set(&pe, 1, mask, mask);
+		mvpp2_prs_tcam_data_byte_set(&pe, 2, mask, mask);
+		mvpp2_prs_tcam_data_byte_set(&pe, 3, mask, mask);
+		mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_BCAST,
+					 MVPP2_PRS_RI_L3_ADDR_MASK);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* Finished: go to flowid generation */
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
+	mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_GEN_BIT, 1);
+
+	mvpp2_prs_tcam_ai_update(&pe, MVPP2_PRS_IPV4_DIP_AI_BIT,
+				 MVPP2_PRS_IPV4_DIP_AI_BIT);
+	/* Unmask all ports */
+	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
+
+	/* Update shadow table and hw entry */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP4);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	return 0;
+}
+
+/* Set entries for protocols over IPv6  */
+static int mvpp2_prs_ip6_proto(struct mvpp2 *priv, unsigned short proto,
+			       unsigned int ri, unsigned int ri_mask)
+{
+	struct mvpp2_prs_entry pe;
+	int tid;
+
+	if ((proto != IPPROTO_TCP) && (proto != IPPROTO_UDP) &&
+	    (proto != IPPROTO_ICMPV6) && (proto != IPPROTO_IPIP))
+		return -EINVAL;
+
+	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
+					MVPP2_PE_LAST_FREE_TID);
+	if (tid < 0)
+		return tid;
+
+	memset(&pe, 0, sizeof(pe));
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_IP6);
+	pe.index = tid;
+
+	/* Finished: go to flowid generation */
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
+	mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_GEN_BIT, 1);
+	mvpp2_prs_sram_ri_update(&pe, ri, ri_mask);
+	mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L4,
+				  sizeof(struct ipv6hdr) - 6,
+				  MVPP2_PRS_SRAM_OP_SEL_UDF_ADD);
+
+	mvpp2_prs_tcam_data_byte_set(&pe, 0, proto, MVPP2_PRS_TCAM_PROTO_MASK);
+	mvpp2_prs_tcam_ai_update(&pe, MVPP2_PRS_IPV6_NO_EXT_AI_BIT,
+				 MVPP2_PRS_IPV6_NO_EXT_AI_BIT);
+	/* Unmask all ports */
+	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
+
+	/* Write HW */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP6);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	return 0;
+}
+
+/* IPv6 L3 multicast entry */
+static int mvpp2_prs_ip6_cast(struct mvpp2 *priv, unsigned short l3_cast)
+{
+	struct mvpp2_prs_entry pe;
+	int tid;
+
+	if (l3_cast != MVPP2_PRS_L3_MULTI_CAST)
+		return -EINVAL;
+
+	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
+					MVPP2_PE_LAST_FREE_TID);
+	if (tid < 0)
+		return tid;
+
+	memset(&pe, 0, sizeof(pe));
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_IP6);
+	pe.index = tid;
+
+	/* Finished: go to flowid generation */
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_IP6);
+	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_MCAST,
+				 MVPP2_PRS_RI_L3_ADDR_MASK);
+	mvpp2_prs_sram_ai_update(&pe, MVPP2_PRS_IPV6_NO_EXT_AI_BIT,
+				 MVPP2_PRS_IPV6_NO_EXT_AI_BIT);
+	/* Shift back to IPv6 NH */
+	mvpp2_prs_sram_shift_set(&pe, -18, MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
+
+	mvpp2_prs_tcam_data_byte_set(&pe, 0, MVPP2_PRS_IPV6_MC,
+				     MVPP2_PRS_IPV6_MC_MASK);
+	mvpp2_prs_tcam_ai_update(&pe, 0, MVPP2_PRS_IPV6_NO_EXT_AI_BIT);
+	/* Unmask all ports */
+	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
+
+	/* Update shadow table and hw entry */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP6);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	return 0;
+}
+
+/* Parser per-port initialization */
+static void mvpp2_prs_hw_port_init(struct mvpp2 *priv, int port, int lu_first,
+				   int lu_max, int offset)
+{
+	u32 val;
+
+	/* Set lookup ID */
+	val = mvpp2_read(priv, MVPP2_PRS_INIT_LOOKUP_REG);
+	val &= ~MVPP2_PRS_PORT_LU_MASK(port);
+	val |=  MVPP2_PRS_PORT_LU_VAL(port, lu_first);
+	mvpp2_write(priv, MVPP2_PRS_INIT_LOOKUP_REG, val);
+
+	/* Set maximum number of loops for packet received from port */
+	val = mvpp2_read(priv, MVPP2_PRS_MAX_LOOP_REG(port));
+	val &= ~MVPP2_PRS_MAX_LOOP_MASK(port);
+	val |= MVPP2_PRS_MAX_LOOP_VAL(port, lu_max);
+	mvpp2_write(priv, MVPP2_PRS_MAX_LOOP_REG(port), val);
+
+	/* Set initial offset for packet header extraction for the first
+	 * searching loop
+	 */
+	val = mvpp2_read(priv, MVPP2_PRS_INIT_OFFS_REG(port));
+	val &= ~MVPP2_PRS_INIT_OFF_MASK(port);
+	val |= MVPP2_PRS_INIT_OFF_VAL(port, offset);
+	mvpp2_write(priv, MVPP2_PRS_INIT_OFFS_REG(port), val);
+}
+
+/* Default flow entries initialization for all ports */
+static void mvpp2_prs_def_flow_init(struct mvpp2 *priv)
+{
+	struct mvpp2_prs_entry pe;
+	int port;
+
+	for (port = 0; port < MVPP2_MAX_PORTS; port++) {
+		memset(&pe, 0, sizeof(pe));
+		mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
+		pe.index = MVPP2_PE_FIRST_DEFAULT_FLOW - port;
+
+		/* Mask all ports */
+		mvpp2_prs_tcam_port_map_set(&pe, 0);
+
+		/* Set flow ID*/
+		mvpp2_prs_sram_ai_update(&pe, port, MVPP2_PRS_FLOW_ID_MASK);
+		mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_DONE_BIT, 1);
+
+		/* Update shadow table and hw entry */
+		mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_FLOWS);
+		mvpp2_prs_hw_write(priv, &pe);
+	}
+}
+
+/* Set default entry for Marvell Header field */
+static void mvpp2_prs_mh_init(struct mvpp2 *priv)
+{
+	struct mvpp2_prs_entry pe;
+
+	memset(&pe, 0, sizeof(pe));
+
+	pe.index = MVPP2_PE_MH_DEFAULT;
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_MH);
+	mvpp2_prs_sram_shift_set(&pe, MVPP2_MH_SIZE,
+				 MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_MAC);
+
+	/* Unmask all ports */
+	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
+
+	/* Update shadow table and hw entry */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_MH);
+	mvpp2_prs_hw_write(priv, &pe);
+}
+
+/* Set default entires (place holder) for promiscuous, non-promiscuous and
+ * multicast MAC addresses
+ */
+static void mvpp2_prs_mac_init(struct mvpp2 *priv)
+{
+	struct mvpp2_prs_entry pe;
+
+	memset(&pe, 0, sizeof(pe));
+
+	/* Non-promiscuous mode for all ports - DROP unknown packets */
+	pe.index = MVPP2_PE_MAC_NON_PROMISCUOUS;
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_MAC);
+
+	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_DROP_MASK,
+				 MVPP2_PRS_RI_DROP_MASK);
+	mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_GEN_BIT, 1);
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
+
+	/* Unmask all ports */
+	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
+
+	/* Update shadow table and hw entry */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_MAC);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	/* Create dummy entries for drop all and promiscuous modes */
+	mvpp2_prs_mac_drop_all_set(priv, 0, false);
+	mvpp2_prs_mac_promisc_set(priv, 0, MVPP2_PRS_L2_UNI_CAST, false);
+	mvpp2_prs_mac_promisc_set(priv, 0, MVPP2_PRS_L2_MULTI_CAST, false);
+}
+
+/* Set default entries for various types of dsa packets */
+static void mvpp2_prs_dsa_init(struct mvpp2 *priv)
+{
+	struct mvpp2_prs_entry pe;
+
+	/* None tagged EDSA entry - place holder */
+	mvpp2_prs_dsa_tag_set(priv, 0, false, MVPP2_PRS_UNTAGGED,
+			      MVPP2_PRS_EDSA);
+
+	/* Tagged EDSA entry - place holder */
+	mvpp2_prs_dsa_tag_set(priv, 0, false, MVPP2_PRS_TAGGED, MVPP2_PRS_EDSA);
+
+	/* None tagged DSA entry - place holder */
+	mvpp2_prs_dsa_tag_set(priv, 0, false, MVPP2_PRS_UNTAGGED,
+			      MVPP2_PRS_DSA);
+
+	/* Tagged DSA entry - place holder */
+	mvpp2_prs_dsa_tag_set(priv, 0, false, MVPP2_PRS_TAGGED, MVPP2_PRS_DSA);
+
+	/* None tagged EDSA ethertype entry - place holder*/
+	mvpp2_prs_dsa_tag_ethertype_set(priv, 0, false,
+					MVPP2_PRS_UNTAGGED, MVPP2_PRS_EDSA);
+
+	/* Tagged EDSA ethertype entry - place holder*/
+	mvpp2_prs_dsa_tag_ethertype_set(priv, 0, false,
+					MVPP2_PRS_TAGGED, MVPP2_PRS_EDSA);
+
+	/* None tagged DSA ethertype entry */
+	mvpp2_prs_dsa_tag_ethertype_set(priv, 0, true,
+					MVPP2_PRS_UNTAGGED, MVPP2_PRS_DSA);
+
+	/* Tagged DSA ethertype entry */
+	mvpp2_prs_dsa_tag_ethertype_set(priv, 0, true,
+					MVPP2_PRS_TAGGED, MVPP2_PRS_DSA);
+
+	/* Set default entry, in case DSA or EDSA tag not found */
+	memset(&pe, 0, sizeof(pe));
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_DSA);
+	pe.index = MVPP2_PE_DSA_DEFAULT;
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_VLAN);
+
+	/* Shift 0 bytes */
+	mvpp2_prs_sram_shift_set(&pe, 0, MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_MAC);
+
+	/* Clear all sram ai bits for next iteration */
+	mvpp2_prs_sram_ai_update(&pe, 0, MVPP2_PRS_SRAM_AI_MASK);
+
+	/* Unmask all ports */
+	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
+
+	mvpp2_prs_hw_write(priv, &pe);
+}
+
+/* Initialize parser entries for VID filtering */
+static void mvpp2_prs_vid_init(struct mvpp2 *priv)
+{
+	struct mvpp2_prs_entry pe;
+
+	memset(&pe, 0, sizeof(pe));
+
+	/* Set default vid entry */
+	pe.index = MVPP2_PE_VID_FLTR_DEFAULT;
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_VID);
+
+	mvpp2_prs_tcam_ai_update(&pe, 0, MVPP2_PRS_EDSA_VID_AI_BIT);
+
+	/* Skip VLAN header - Set offset to 4 bytes */
+	mvpp2_prs_sram_shift_set(&pe, MVPP2_VLAN_TAG_LEN,
+				 MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
+
+	/* Clear all ai bits for next iteration */
+	mvpp2_prs_sram_ai_update(&pe, 0, MVPP2_PRS_SRAM_AI_MASK);
+
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_L2);
+
+	/* Unmask all ports */
+	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
+
+	/* Update shadow table and hw entry */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_VID);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	/* Set default vid entry for extended DSA*/
+	memset(&pe, 0, sizeof(pe));
+
+	/* Set default vid entry */
+	pe.index = MVPP2_PE_VID_EDSA_FLTR_DEFAULT;
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_VID);
+
+	mvpp2_prs_tcam_ai_update(&pe, MVPP2_PRS_EDSA_VID_AI_BIT,
+				 MVPP2_PRS_EDSA_VID_AI_BIT);
+
+	/* Skip VLAN header - Set offset to 8 bytes */
+	mvpp2_prs_sram_shift_set(&pe, MVPP2_VLAN_TAG_EDSA_LEN,
+				 MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
+
+	/* Clear all ai bits for next iteration */
+	mvpp2_prs_sram_ai_update(&pe, 0, MVPP2_PRS_SRAM_AI_MASK);
+
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_L2);
+
+	/* Unmask all ports */
+	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
+
+	/* Update shadow table and hw entry */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_VID);
+	mvpp2_prs_hw_write(priv, &pe);
+}
+
+/* Match basic ethertypes */
+static int mvpp2_prs_etype_init(struct mvpp2 *priv)
+{
+	struct mvpp2_prs_entry pe;
+	int tid;
+
+	/* Ethertype: PPPoE */
+	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
+					MVPP2_PE_LAST_FREE_TID);
+	if (tid < 0)
+		return tid;
+
+	memset(&pe, 0, sizeof(pe));
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_L2);
+	pe.index = tid;
+
+	mvpp2_prs_match_etype(&pe, 0, ETH_P_PPP_SES);
+
+	mvpp2_prs_sram_shift_set(&pe, MVPP2_PPPOE_HDR_SIZE,
+				 MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_PPPOE);
+	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_PPPOE_MASK,
+				 MVPP2_PRS_RI_PPPOE_MASK);
+
+	/* Update shadow table and hw entry */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_L2);
+	priv->prs_shadow[pe.index].udf = MVPP2_PRS_UDF_L2_DEF;
+	priv->prs_shadow[pe.index].finish = false;
+	mvpp2_prs_shadow_ri_set(priv, pe.index, MVPP2_PRS_RI_PPPOE_MASK,
+				MVPP2_PRS_RI_PPPOE_MASK);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	/* Ethertype: ARP */
+	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
+					MVPP2_PE_LAST_FREE_TID);
+	if (tid < 0)
+		return tid;
+
+	memset(&pe, 0, sizeof(pe));
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_L2);
+	pe.index = tid;
+
+	mvpp2_prs_match_etype(&pe, 0, ETH_P_ARP);
+
+	/* Generate flow in the next iteration*/
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
+	mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_GEN_BIT, 1);
+	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_ARP,
+				 MVPP2_PRS_RI_L3_PROTO_MASK);
+	/* Set L3 offset */
+	mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L3,
+				  MVPP2_ETH_TYPE_LEN,
+				  MVPP2_PRS_SRAM_OP_SEL_UDF_ADD);
+
+	/* Update shadow table and hw entry */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_L2);
+	priv->prs_shadow[pe.index].udf = MVPP2_PRS_UDF_L2_DEF;
+	priv->prs_shadow[pe.index].finish = true;
+	mvpp2_prs_shadow_ri_set(priv, pe.index, MVPP2_PRS_RI_L3_ARP,
+				MVPP2_PRS_RI_L3_PROTO_MASK);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	/* Ethertype: LBTD */
+	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
+					MVPP2_PE_LAST_FREE_TID);
+	if (tid < 0)
+		return tid;
+
+	memset(&pe, 0, sizeof(pe));
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_L2);
+	pe.index = tid;
+
+	mvpp2_prs_match_etype(&pe, 0, MVPP2_IP_LBDT_TYPE);
+
+	/* Generate flow in the next iteration*/
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
+	mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_GEN_BIT, 1);
+	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_CPU_CODE_RX_SPEC |
+				 MVPP2_PRS_RI_UDF3_RX_SPECIAL,
+				 MVPP2_PRS_RI_CPU_CODE_MASK |
+				 MVPP2_PRS_RI_UDF3_MASK);
+	/* Set L3 offset */
+	mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L3,
+				  MVPP2_ETH_TYPE_LEN,
+				  MVPP2_PRS_SRAM_OP_SEL_UDF_ADD);
+
+	/* Update shadow table and hw entry */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_L2);
+	priv->prs_shadow[pe.index].udf = MVPP2_PRS_UDF_L2_DEF;
+	priv->prs_shadow[pe.index].finish = true;
+	mvpp2_prs_shadow_ri_set(priv, pe.index, MVPP2_PRS_RI_CPU_CODE_RX_SPEC |
+				MVPP2_PRS_RI_UDF3_RX_SPECIAL,
+				MVPP2_PRS_RI_CPU_CODE_MASK |
+				MVPP2_PRS_RI_UDF3_MASK);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	/* Ethertype: IPv4 without options */
+	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
+					MVPP2_PE_LAST_FREE_TID);
+	if (tid < 0)
+		return tid;
+
+	memset(&pe, 0, sizeof(pe));
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_L2);
+	pe.index = tid;
+
+	mvpp2_prs_match_etype(&pe, 0, ETH_P_IP);
+	mvpp2_prs_tcam_data_byte_set(&pe, MVPP2_ETH_TYPE_LEN,
+				     MVPP2_PRS_IPV4_HEAD | MVPP2_PRS_IPV4_IHL,
+				     MVPP2_PRS_IPV4_HEAD_MASK |
+				     MVPP2_PRS_IPV4_IHL_MASK);
+
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_IP4);
+	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_IP4,
+				 MVPP2_PRS_RI_L3_PROTO_MASK);
+	/* Skip eth_type + 4 bytes of IP header */
+	mvpp2_prs_sram_shift_set(&pe, MVPP2_ETH_TYPE_LEN + 4,
+				 MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
+	/* Set L3 offset */
+	mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L3,
+				  MVPP2_ETH_TYPE_LEN,
+				  MVPP2_PRS_SRAM_OP_SEL_UDF_ADD);
+
+	/* Update shadow table and hw entry */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_L2);
+	priv->prs_shadow[pe.index].udf = MVPP2_PRS_UDF_L2_DEF;
+	priv->prs_shadow[pe.index].finish = false;
+	mvpp2_prs_shadow_ri_set(priv, pe.index, MVPP2_PRS_RI_L3_IP4,
+				MVPP2_PRS_RI_L3_PROTO_MASK);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	/* Ethertype: IPv4 with options */
+	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
+					MVPP2_PE_LAST_FREE_TID);
+	if (tid < 0)
+		return tid;
+
+	pe.index = tid;
+
+	/* Clear tcam data before updating */
+	pe.tcam.byte[MVPP2_PRS_TCAM_DATA_BYTE(MVPP2_ETH_TYPE_LEN)] = 0x0;
+	pe.tcam.byte[MVPP2_PRS_TCAM_DATA_BYTE_EN(MVPP2_ETH_TYPE_LEN)] = 0x0;
+
+	mvpp2_prs_tcam_data_byte_set(&pe, MVPP2_ETH_TYPE_LEN,
+				     MVPP2_PRS_IPV4_HEAD,
+				     MVPP2_PRS_IPV4_HEAD_MASK);
+
+	/* Clear ri before updating */
+	pe.sram.word[MVPP2_PRS_SRAM_RI_WORD] = 0x0;
+	pe.sram.word[MVPP2_PRS_SRAM_RI_CTRL_WORD] = 0x0;
+	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_IP4_OPT,
+				 MVPP2_PRS_RI_L3_PROTO_MASK);
+
+	/* Update shadow table and hw entry */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_L2);
+	priv->prs_shadow[pe.index].udf = MVPP2_PRS_UDF_L2_DEF;
+	priv->prs_shadow[pe.index].finish = false;
+	mvpp2_prs_shadow_ri_set(priv, pe.index, MVPP2_PRS_RI_L3_IP4_OPT,
+				MVPP2_PRS_RI_L3_PROTO_MASK);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	/* Ethertype: IPv6 without options */
+	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
+					MVPP2_PE_LAST_FREE_TID);
+	if (tid < 0)
+		return tid;
+
+	memset(&pe, 0, sizeof(pe));
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_L2);
+	pe.index = tid;
+
+	mvpp2_prs_match_etype(&pe, 0, ETH_P_IPV6);
+
+	/* Skip DIP of IPV6 header */
+	mvpp2_prs_sram_shift_set(&pe, MVPP2_ETH_TYPE_LEN + 8 +
+				 MVPP2_MAX_L3_ADDR_SIZE,
+				 MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_IP6);
+	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_IP6,
+				 MVPP2_PRS_RI_L3_PROTO_MASK);
+	/* Set L3 offset */
+	mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L3,
+				  MVPP2_ETH_TYPE_LEN,
+				  MVPP2_PRS_SRAM_OP_SEL_UDF_ADD);
+
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_L2);
+	priv->prs_shadow[pe.index].udf = MVPP2_PRS_UDF_L2_DEF;
+	priv->prs_shadow[pe.index].finish = false;
+	mvpp2_prs_shadow_ri_set(priv, pe.index, MVPP2_PRS_RI_L3_IP6,
+				MVPP2_PRS_RI_L3_PROTO_MASK);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	/* Default entry for MVPP2_PRS_LU_L2 - Unknown ethtype */
+	memset(&pe, 0, sizeof(struct mvpp2_prs_entry));
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_L2);
+	pe.index = MVPP2_PE_ETH_TYPE_UN;
+
+	/* Unmask all ports */
+	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
+
+	/* Generate flow in the next iteration*/
+	mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_GEN_BIT, 1);
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
+	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_UN,
+				 MVPP2_PRS_RI_L3_PROTO_MASK);
+	/* Set L3 offset even it's unknown L3 */
+	mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L3,
+				  MVPP2_ETH_TYPE_LEN,
+				  MVPP2_PRS_SRAM_OP_SEL_UDF_ADD);
+
+	/* Update shadow table and hw entry */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_L2);
+	priv->prs_shadow[pe.index].udf = MVPP2_PRS_UDF_L2_DEF;
+	priv->prs_shadow[pe.index].finish = true;
+	mvpp2_prs_shadow_ri_set(priv, pe.index, MVPP2_PRS_RI_L3_UN,
+				MVPP2_PRS_RI_L3_PROTO_MASK);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	return 0;
+}
+
+/* Configure vlan entries and detect up to 2 successive VLAN tags.
+ * Possible options:
+ * 0x8100, 0x88A8
+ * 0x8100, 0x8100
+ * 0x8100
+ * 0x88A8
+ */
+static int mvpp2_prs_vlan_init(struct platform_device *pdev, struct mvpp2 *priv)
+{
+	struct mvpp2_prs_entry pe;
+	int err;
+
+	priv->prs_double_vlans = devm_kcalloc(&pdev->dev, sizeof(bool),
+					      MVPP2_PRS_DBL_VLANS_MAX,
+					      GFP_KERNEL);
+	if (!priv->prs_double_vlans)
+		return -ENOMEM;
+
+	/* Double VLAN: 0x8100, 0x88A8 */
+	err = mvpp2_prs_double_vlan_add(priv, ETH_P_8021Q, ETH_P_8021AD,
+					MVPP2_PRS_PORT_MASK);
+	if (err)
+		return err;
+
+	/* Double VLAN: 0x8100, 0x8100 */
+	err = mvpp2_prs_double_vlan_add(priv, ETH_P_8021Q, ETH_P_8021Q,
+					MVPP2_PRS_PORT_MASK);
+	if (err)
+		return err;
+
+	/* Single VLAN: 0x88a8 */
+	err = mvpp2_prs_vlan_add(priv, ETH_P_8021AD, MVPP2_PRS_SINGLE_VLAN_AI,
+				 MVPP2_PRS_PORT_MASK);
+	if (err)
+		return err;
+
+	/* Single VLAN: 0x8100 */
+	err = mvpp2_prs_vlan_add(priv, ETH_P_8021Q, MVPP2_PRS_SINGLE_VLAN_AI,
+				 MVPP2_PRS_PORT_MASK);
+	if (err)
+		return err;
+
+	/* Set default double vlan entry */
+	memset(&pe, 0, sizeof(pe));
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_VLAN);
+	pe.index = MVPP2_PE_VLAN_DBL;
+
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_VID);
+
+	/* Clear ai for next iterations */
+	mvpp2_prs_sram_ai_update(&pe, 0, MVPP2_PRS_SRAM_AI_MASK);
+	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_VLAN_DOUBLE,
+				 MVPP2_PRS_RI_VLAN_MASK);
+
+	mvpp2_prs_tcam_ai_update(&pe, MVPP2_PRS_DBL_VLAN_AI_BIT,
+				 MVPP2_PRS_DBL_VLAN_AI_BIT);
+	/* Unmask all ports */
+	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
+
+	/* Update shadow table and hw entry */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_VLAN);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	/* Set default vlan none entry */
+	memset(&pe, 0, sizeof(pe));
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_VLAN);
+	pe.index = MVPP2_PE_VLAN_NONE;
+
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_L2);
+	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_VLAN_NONE,
+				 MVPP2_PRS_RI_VLAN_MASK);
+
+	/* Unmask all ports */
+	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
+
+	/* Update shadow table and hw entry */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_VLAN);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	return 0;
+}
+
+/* Set entries for PPPoE ethertype */
+static int mvpp2_prs_pppoe_init(struct mvpp2 *priv)
+{
+	struct mvpp2_prs_entry pe;
+	int tid;
+
+	/* IPv4 over PPPoE with options */
+	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
+					MVPP2_PE_LAST_FREE_TID);
+	if (tid < 0)
+		return tid;
+
+	memset(&pe, 0, sizeof(pe));
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_PPPOE);
+	pe.index = tid;
+
+	mvpp2_prs_match_etype(&pe, 0, PPP_IP);
+
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_IP4);
+	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_IP4_OPT,
+				 MVPP2_PRS_RI_L3_PROTO_MASK);
+	/* Skip eth_type + 4 bytes of IP header */
+	mvpp2_prs_sram_shift_set(&pe, MVPP2_ETH_TYPE_LEN + 4,
+				 MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
+	/* Set L3 offset */
+	mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L3,
+				  MVPP2_ETH_TYPE_LEN,
+				  MVPP2_PRS_SRAM_OP_SEL_UDF_ADD);
+
+	/* Update shadow table and hw entry */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_PPPOE);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	/* IPv4 over PPPoE without options */
+	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
+					MVPP2_PE_LAST_FREE_TID);
+	if (tid < 0)
+		return tid;
+
+	pe.index = tid;
+
+	mvpp2_prs_tcam_data_byte_set(&pe, MVPP2_ETH_TYPE_LEN,
+				     MVPP2_PRS_IPV4_HEAD | MVPP2_PRS_IPV4_IHL,
+				     MVPP2_PRS_IPV4_HEAD_MASK |
+				     MVPP2_PRS_IPV4_IHL_MASK);
+
+	/* Clear ri before updating */
+	pe.sram.word[MVPP2_PRS_SRAM_RI_WORD] = 0x0;
+	pe.sram.word[MVPP2_PRS_SRAM_RI_CTRL_WORD] = 0x0;
+	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_IP4,
+				 MVPP2_PRS_RI_L3_PROTO_MASK);
+
+	/* Update shadow table and hw entry */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_PPPOE);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	/* IPv6 over PPPoE */
+	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
+					MVPP2_PE_LAST_FREE_TID);
+	if (tid < 0)
+		return tid;
+
+	memset(&pe, 0, sizeof(pe));
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_PPPOE);
+	pe.index = tid;
+
+	mvpp2_prs_match_etype(&pe, 0, PPP_IPV6);
+
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_IP6);
+	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_IP6,
+				 MVPP2_PRS_RI_L3_PROTO_MASK);
+	/* Skip eth_type + 4 bytes of IPv6 header */
+	mvpp2_prs_sram_shift_set(&pe, MVPP2_ETH_TYPE_LEN + 4,
+				 MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
+	/* Set L3 offset */
+	mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L3,
+				  MVPP2_ETH_TYPE_LEN,
+				  MVPP2_PRS_SRAM_OP_SEL_UDF_ADD);
+
+	/* Update shadow table and hw entry */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_PPPOE);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	/* Non-IP over PPPoE */
+	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
+					MVPP2_PE_LAST_FREE_TID);
+	if (tid < 0)
+		return tid;
+
+	memset(&pe, 0, sizeof(pe));
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_PPPOE);
+	pe.index = tid;
+
+	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_UN,
+				 MVPP2_PRS_RI_L3_PROTO_MASK);
+
+	/* Finished: go to flowid generation */
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
+	mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_GEN_BIT, 1);
+	/* Set L3 offset even if it's unknown L3 */
+	mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L3,
+				  MVPP2_ETH_TYPE_LEN,
+				  MVPP2_PRS_SRAM_OP_SEL_UDF_ADD);
+
+	/* Update shadow table and hw entry */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_PPPOE);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	return 0;
+}
+
+/* Initialize entries for IPv4 */
+static int mvpp2_prs_ip4_init(struct mvpp2 *priv)
+{
+	struct mvpp2_prs_entry pe;
+	int err;
+
+	/* Set entries for TCP, UDP and IGMP over IPv4 */
+	err = mvpp2_prs_ip4_proto(priv, IPPROTO_TCP, MVPP2_PRS_RI_L4_TCP,
+				  MVPP2_PRS_RI_L4_PROTO_MASK);
+	if (err)
+		return err;
+
+	err = mvpp2_prs_ip4_proto(priv, IPPROTO_UDP, MVPP2_PRS_RI_L4_UDP,
+				  MVPP2_PRS_RI_L4_PROTO_MASK);
+	if (err)
+		return err;
+
+	err = mvpp2_prs_ip4_proto(priv, IPPROTO_IGMP,
+				  MVPP2_PRS_RI_CPU_CODE_RX_SPEC |
+				  MVPP2_PRS_RI_UDF3_RX_SPECIAL,
+				  MVPP2_PRS_RI_CPU_CODE_MASK |
+				  MVPP2_PRS_RI_UDF3_MASK);
+	if (err)
+		return err;
+
+	/* IPv4 Broadcast */
+	err = mvpp2_prs_ip4_cast(priv, MVPP2_PRS_L3_BROAD_CAST);
+	if (err)
+		return err;
+
+	/* IPv4 Multicast */
+	err = mvpp2_prs_ip4_cast(priv, MVPP2_PRS_L3_MULTI_CAST);
+	if (err)
+		return err;
+
+	/* Default IPv4 entry for unknown protocols */
+	memset(&pe, 0, sizeof(pe));
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_IP4);
+	pe.index = MVPP2_PE_IP4_PROTO_UN;
+
+	/* Set next lu to IPv4 */
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_IP4);
+	mvpp2_prs_sram_shift_set(&pe, 12, MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
+	/* Set L4 offset */
+	mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L4,
+				  sizeof(struct iphdr) - 4,
+				  MVPP2_PRS_SRAM_OP_SEL_UDF_ADD);
+	mvpp2_prs_sram_ai_update(&pe, MVPP2_PRS_IPV4_DIP_AI_BIT,
+				 MVPP2_PRS_IPV4_DIP_AI_BIT);
+	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L4_OTHER,
+				 MVPP2_PRS_RI_L4_PROTO_MASK);
+
+	mvpp2_prs_tcam_ai_update(&pe, 0, MVPP2_PRS_IPV4_DIP_AI_BIT);
+	/* Unmask all ports */
+	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
+
+	/* Update shadow table and hw entry */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP4);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	/* Default IPv4 entry for unicast address */
+	memset(&pe, 0, sizeof(pe));
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_IP4);
+	pe.index = MVPP2_PE_IP4_ADDR_UN;
+
+	/* Finished: go to flowid generation */
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
+	mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_GEN_BIT, 1);
+	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_UCAST,
+				 MVPP2_PRS_RI_L3_ADDR_MASK);
+
+	mvpp2_prs_tcam_ai_update(&pe, MVPP2_PRS_IPV4_DIP_AI_BIT,
+				 MVPP2_PRS_IPV4_DIP_AI_BIT);
+	/* Unmask all ports */
+	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
+
+	/* Update shadow table and hw entry */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP4);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	return 0;
+}
+
+/* Initialize entries for IPv6 */
+static int mvpp2_prs_ip6_init(struct mvpp2 *priv)
+{
+	struct mvpp2_prs_entry pe;
+	int tid, err;
+
+	/* Set entries for TCP, UDP and ICMP over IPv6 */
+	err = mvpp2_prs_ip6_proto(priv, IPPROTO_TCP,
+				  MVPP2_PRS_RI_L4_TCP,
+				  MVPP2_PRS_RI_L4_PROTO_MASK);
+	if (err)
+		return err;
+
+	err = mvpp2_prs_ip6_proto(priv, IPPROTO_UDP,
+				  MVPP2_PRS_RI_L4_UDP,
+				  MVPP2_PRS_RI_L4_PROTO_MASK);
+	if (err)
+		return err;
+
+	err = mvpp2_prs_ip6_proto(priv, IPPROTO_ICMPV6,
+				  MVPP2_PRS_RI_CPU_CODE_RX_SPEC |
+				  MVPP2_PRS_RI_UDF3_RX_SPECIAL,
+				  MVPP2_PRS_RI_CPU_CODE_MASK |
+				  MVPP2_PRS_RI_UDF3_MASK);
+	if (err)
+		return err;
+
+	/* IPv4 is the last header. This is similar case as 6-TCP or 17-UDP */
+	/* Result Info: UDF7=1, DS lite */
+	err = mvpp2_prs_ip6_proto(priv, IPPROTO_IPIP,
+				  MVPP2_PRS_RI_UDF7_IP6_LITE,
+				  MVPP2_PRS_RI_UDF7_MASK);
+	if (err)
+		return err;
+
+	/* IPv6 multicast */
+	err = mvpp2_prs_ip6_cast(priv, MVPP2_PRS_L3_MULTI_CAST);
+	if (err)
+		return err;
+
+	/* Entry for checking hop limit */
+	tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID,
+					MVPP2_PE_LAST_FREE_TID);
+	if (tid < 0)
+		return tid;
+
+	memset(&pe, 0, sizeof(pe));
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_IP6);
+	pe.index = tid;
+
+	/* Finished: go to flowid generation */
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
+	mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_GEN_BIT, 1);
+	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_UN |
+				 MVPP2_PRS_RI_DROP_MASK,
+				 MVPP2_PRS_RI_L3_PROTO_MASK |
+				 MVPP2_PRS_RI_DROP_MASK);
+
+	mvpp2_prs_tcam_data_byte_set(&pe, 1, 0x00, MVPP2_PRS_IPV6_HOP_MASK);
+	mvpp2_prs_tcam_ai_update(&pe, MVPP2_PRS_IPV6_NO_EXT_AI_BIT,
+				 MVPP2_PRS_IPV6_NO_EXT_AI_BIT);
+
+	/* Update shadow table and hw entry */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP4);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	/* Default IPv6 entry for unknown protocols */
+	memset(&pe, 0, sizeof(pe));
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_IP6);
+	pe.index = MVPP2_PE_IP6_PROTO_UN;
+
+	/* Finished: go to flowid generation */
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
+	mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_GEN_BIT, 1);
+	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L4_OTHER,
+				 MVPP2_PRS_RI_L4_PROTO_MASK);
+	/* Set L4 offset relatively to our current place */
+	mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L4,
+				  sizeof(struct ipv6hdr) - 4,
+				  MVPP2_PRS_SRAM_OP_SEL_UDF_ADD);
+
+	mvpp2_prs_tcam_ai_update(&pe, MVPP2_PRS_IPV6_NO_EXT_AI_BIT,
+				 MVPP2_PRS_IPV6_NO_EXT_AI_BIT);
+	/* Unmask all ports */
+	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
+
+	/* Update shadow table and hw entry */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP4);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	/* Default IPv6 entry for unknown ext protocols */
+	memset(&pe, 0, sizeof(struct mvpp2_prs_entry));
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_IP6);
+	pe.index = MVPP2_PE_IP6_EXT_PROTO_UN;
+
+	/* Finished: go to flowid generation */
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
+	mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_GEN_BIT, 1);
+	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L4_OTHER,
+				 MVPP2_PRS_RI_L4_PROTO_MASK);
+
+	mvpp2_prs_tcam_ai_update(&pe, MVPP2_PRS_IPV6_EXT_AI_BIT,
+				 MVPP2_PRS_IPV6_EXT_AI_BIT);
+	/* Unmask all ports */
+	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
+
+	/* Update shadow table and hw entry */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP4);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	/* Default IPv6 entry for unicast address */
+	memset(&pe, 0, sizeof(struct mvpp2_prs_entry));
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_IP6);
+	pe.index = MVPP2_PE_IP6_ADDR_UN;
+
+	/* Finished: go to IPv6 again */
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_IP6);
+	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_UCAST,
+				 MVPP2_PRS_RI_L3_ADDR_MASK);
+	mvpp2_prs_sram_ai_update(&pe, MVPP2_PRS_IPV6_NO_EXT_AI_BIT,
+				 MVPP2_PRS_IPV6_NO_EXT_AI_BIT);
+	/* Shift back to IPV6 NH */
+	mvpp2_prs_sram_shift_set(&pe, -18, MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
+
+	mvpp2_prs_tcam_ai_update(&pe, 0, MVPP2_PRS_IPV6_NO_EXT_AI_BIT);
+	/* Unmask all ports */
+	mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK);
+
+	/* Update shadow table and hw entry */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_IP6);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	return 0;
+}
+
+/* Find tcam entry with matched pair <vid,port> */
+static int mvpp2_prs_vid_range_find(struct mvpp2 *priv, int pmap, u16 vid,
+				    u16 mask)
+{
+	unsigned char byte[2], enable[2];
+	struct mvpp2_prs_entry pe;
+	u16 rvid, rmask;
+	int tid;
+
+	/* Go through the all entries with MVPP2_PRS_LU_VID */
+	for (tid = MVPP2_PE_VID_FILT_RANGE_START;
+	     tid <= MVPP2_PE_VID_FILT_RANGE_END; tid++) {
+		if (!priv->prs_shadow[tid].valid ||
+		    priv->prs_shadow[tid].lu != MVPP2_PRS_LU_VID)
+			continue;
+
+		mvpp2_prs_init_from_hw(priv, &pe, tid);
+
+		mvpp2_prs_tcam_data_byte_get(&pe, 2, &byte[0], &enable[0]);
+		mvpp2_prs_tcam_data_byte_get(&pe, 3, &byte[1], &enable[1]);
+
+		rvid = ((byte[0] & 0xf) << 8) + byte[1];
+		rmask = ((enable[0] & 0xf) << 8) + enable[1];
+
+		if (rvid != vid || rmask != mask)
+			continue;
+
+		return tid;
+	}
+
+	return -ENOENT;
+}
+
+/* Write parser entry for VID filtering */
+int mvpp2_prs_vid_entry_add(struct mvpp2_port *port, u16 vid)
+{
+	unsigned int vid_start = MVPP2_PE_VID_FILT_RANGE_START +
+				 port->id * MVPP2_PRS_VLAN_FILT_MAX;
+	unsigned int mask = 0xfff, reg_val, shift;
+	struct mvpp2 *priv = port->priv;
+	struct mvpp2_prs_entry pe;
+	int tid;
+
+	memset(&pe, 0, sizeof(pe));
+
+	/* Scan TCAM and see if entry with this <vid,port> already exist */
+	tid = mvpp2_prs_vid_range_find(priv, (1 << port->id), vid, mask);
+
+	reg_val = mvpp2_read(priv, MVPP2_MH_REG(port->id));
+	if (reg_val & MVPP2_DSA_EXTENDED)
+		shift = MVPP2_VLAN_TAG_EDSA_LEN;
+	else
+		shift = MVPP2_VLAN_TAG_LEN;
+
+	/* No such entry */
+	if (tid < 0) {
+
+		/* Go through all entries from first to last in vlan range */
+		tid = mvpp2_prs_tcam_first_free(priv, vid_start,
+						vid_start +
+						MVPP2_PRS_VLAN_FILT_MAX_ENTRY);
+
+		/* There isn't room for a new VID filter */
+		if (tid < 0)
+			return tid;
+
+		mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_VID);
+		pe.index = tid;
+
+		/* Mask all ports */
+		mvpp2_prs_tcam_port_map_set(&pe, 0);
+	} else {
+		mvpp2_prs_init_from_hw(priv, &pe, tid);
+	}
+
+	/* Enable the current port */
+	mvpp2_prs_tcam_port_set(&pe, port->id, true);
+
+	/* Continue - set next lookup */
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_L2);
+
+	/* Skip VLAN header - Set offset to 4 or 8 bytes */
+	mvpp2_prs_sram_shift_set(&pe, shift, MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
+
+	/* Set match on VID */
+	mvpp2_prs_match_vid(&pe, MVPP2_PRS_VID_TCAM_BYTE, vid);
+
+	/* Clear all ai bits for next iteration */
+	mvpp2_prs_sram_ai_update(&pe, 0, MVPP2_PRS_SRAM_AI_MASK);
+
+	/* Update shadow table */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_VID);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	return 0;
+}
+
+/* Write parser entry for VID filtering */
+void mvpp2_prs_vid_entry_remove(struct mvpp2_port *port, u16 vid)
+{
+	struct mvpp2 *priv = port->priv;
+	int tid;
+
+	/* Scan TCAM and see if entry with this <vid,port> already exist */
+	tid = mvpp2_prs_vid_range_find(priv, (1 << port->id), vid, 0xfff);
+
+	/* No such entry */
+	if (tid < 0)
+		return;
+
+	mvpp2_prs_hw_inv(priv, tid);
+	priv->prs_shadow[tid].valid = false;
+}
+
+/* Remove all existing VID filters on this port */
+void mvpp2_prs_vid_remove_all(struct mvpp2_port *port)
+{
+	struct mvpp2 *priv = port->priv;
+	int tid;
+
+	for (tid = MVPP2_PRS_VID_PORT_FIRST(port->id);
+	     tid <= MVPP2_PRS_VID_PORT_LAST(port->id); tid++) {
+		if (priv->prs_shadow[tid].valid)
+			mvpp2_prs_vid_entry_remove(port, tid);
+	}
+}
+
+/* Remove VID filering entry for this port */
+void mvpp2_prs_vid_disable_filtering(struct mvpp2_port *port)
+{
+	unsigned int tid = MVPP2_PRS_VID_PORT_DFLT(port->id);
+	struct mvpp2 *priv = port->priv;
+
+	/* Invalidate the guard entry */
+	mvpp2_prs_hw_inv(priv, tid);
+
+	priv->prs_shadow[tid].valid = false;
+}
+
+/* Add guard entry that drops packets when no VID is matched on this port */
+void mvpp2_prs_vid_enable_filtering(struct mvpp2_port *port)
+{
+	unsigned int tid = MVPP2_PRS_VID_PORT_DFLT(port->id);
+	struct mvpp2 *priv = port->priv;
+	unsigned int reg_val, shift;
+	struct mvpp2_prs_entry pe;
+
+	if (priv->prs_shadow[tid].valid)
+		return;
+
+	memset(&pe, 0, sizeof(pe));
+
+	pe.index = tid;
+
+	reg_val = mvpp2_read(priv, MVPP2_MH_REG(port->id));
+	if (reg_val & MVPP2_DSA_EXTENDED)
+		shift = MVPP2_VLAN_TAG_EDSA_LEN;
+	else
+		shift = MVPP2_VLAN_TAG_LEN;
+
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_VID);
+
+	/* Mask all ports */
+	mvpp2_prs_tcam_port_map_set(&pe, 0);
+
+	/* Update port mask */
+	mvpp2_prs_tcam_port_set(&pe, port->id, true);
+
+	/* Continue - set next lookup */
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_L2);
+
+	/* Skip VLAN header - Set offset to 4 or 8 bytes */
+	mvpp2_prs_sram_shift_set(&pe, shift, MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
+
+	/* Drop VLAN packets that don't belong to any VIDs on this port */
+	mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_DROP_MASK,
+				 MVPP2_PRS_RI_DROP_MASK);
+
+	/* Clear all ai bits for next iteration */
+	mvpp2_prs_sram_ai_update(&pe, 0, MVPP2_PRS_SRAM_AI_MASK);
+
+	/* Update shadow table */
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_VID);
+	mvpp2_prs_hw_write(priv, &pe);
+}
+
+/* Parser default initialization */
+int mvpp2_prs_default_init(struct platform_device *pdev, struct mvpp2 *priv)
+{
+	int err, index, i;
+
+	/* Enable tcam table */
+	mvpp2_write(priv, MVPP2_PRS_TCAM_CTRL_REG, MVPP2_PRS_TCAM_EN_MASK);
+
+	/* Clear all tcam and sram entries */
+	for (index = 0; index < MVPP2_PRS_TCAM_SRAM_SIZE; index++) {
+		mvpp2_write(priv, MVPP2_PRS_TCAM_IDX_REG, index);
+		for (i = 0; i < MVPP2_PRS_TCAM_WORDS; i++)
+			mvpp2_write(priv, MVPP2_PRS_TCAM_DATA_REG(i), 0);
+
+		mvpp2_write(priv, MVPP2_PRS_SRAM_IDX_REG, index);
+		for (i = 0; i < MVPP2_PRS_SRAM_WORDS; i++)
+			mvpp2_write(priv, MVPP2_PRS_SRAM_DATA_REG(i), 0);
+	}
+
+	/* Invalidate all tcam entries */
+	for (index = 0; index < MVPP2_PRS_TCAM_SRAM_SIZE; index++)
+		mvpp2_prs_hw_inv(priv, index);
+
+	priv->prs_shadow = devm_kcalloc(&pdev->dev, MVPP2_PRS_TCAM_SRAM_SIZE,
+					sizeof(*priv->prs_shadow),
+					GFP_KERNEL);
+	if (!priv->prs_shadow)
+		return -ENOMEM;
+
+	/* Always start from lookup = 0 */
+	for (index = 0; index < MVPP2_MAX_PORTS; index++)
+		mvpp2_prs_hw_port_init(priv, index, MVPP2_PRS_LU_MH,
+				       MVPP2_PRS_PORT_LU_MAX, 0);
+
+	mvpp2_prs_def_flow_init(priv);
+
+	mvpp2_prs_mh_init(priv);
+
+	mvpp2_prs_mac_init(priv);
+
+	mvpp2_prs_dsa_init(priv);
+
+	mvpp2_prs_vid_init(priv);
+
+	err = mvpp2_prs_etype_init(priv);
+	if (err)
+		return err;
+
+	err = mvpp2_prs_vlan_init(pdev, priv);
+	if (err)
+		return err;
+
+	err = mvpp2_prs_pppoe_init(priv);
+	if (err)
+		return err;
+
+	err = mvpp2_prs_ip6_init(priv);
+	if (err)
+		return err;
+
+	err = mvpp2_prs_ip4_init(priv);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+/* Compare MAC DA with tcam entry data */
+static bool mvpp2_prs_mac_range_equals(struct mvpp2_prs_entry *pe,
+				       const u8 *da, unsigned char *mask)
+{
+	unsigned char tcam_byte, tcam_mask;
+	int index;
+
+	for (index = 0; index < ETH_ALEN; index++) {
+		mvpp2_prs_tcam_data_byte_get(pe, index, &tcam_byte, &tcam_mask);
+		if (tcam_mask != mask[index])
+			return false;
+
+		if ((tcam_mask & tcam_byte) != (da[index] & mask[index]))
+			return false;
+	}
+
+	return true;
+}
+
+/* Find tcam entry with matched pair <MAC DA, port> */
+static int
+mvpp2_prs_mac_da_range_find(struct mvpp2 *priv, int pmap, const u8 *da,
+			    unsigned char *mask, int udf_type)
+{
+	struct mvpp2_prs_entry pe;
+	int tid;
+
+	/* Go through the all entires with MVPP2_PRS_LU_MAC */
+	for (tid = MVPP2_PE_MAC_RANGE_START;
+	     tid <= MVPP2_PE_MAC_RANGE_END; tid++) {
+		unsigned int entry_pmap;
+
+		if (!priv->prs_shadow[tid].valid ||
+		    (priv->prs_shadow[tid].lu != MVPP2_PRS_LU_MAC) ||
+		    (priv->prs_shadow[tid].udf != udf_type))
+			continue;
+
+		mvpp2_prs_init_from_hw(priv, &pe, tid);
+		entry_pmap = mvpp2_prs_tcam_port_map_get(&pe);
+
+		if (mvpp2_prs_mac_range_equals(&pe, da, mask) &&
+		    entry_pmap == pmap)
+			return tid;
+	}
+
+	return -ENOENT;
+}
+
+/* Update parser's mac da entry */
+int mvpp2_prs_mac_da_accept(struct mvpp2_port *port, const u8 *da, bool add)
+{
+	unsigned char mask[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+	struct mvpp2 *priv = port->priv;
+	unsigned int pmap, len, ri;
+	struct mvpp2_prs_entry pe;
+	int tid;
+
+	memset(&pe, 0, sizeof(pe));
+
+	/* Scan TCAM and see if entry with this <MAC DA, port> already exist */
+	tid = mvpp2_prs_mac_da_range_find(priv, BIT(port->id), da, mask,
+					  MVPP2_PRS_UDF_MAC_DEF);
+
+	/* No such entry */
+	if (tid < 0) {
+		if (!add)
+			return 0;
+
+		/* Create new TCAM entry */
+		/* Go through the all entries from first to last */
+		tid = mvpp2_prs_tcam_first_free(priv,
+						MVPP2_PE_MAC_RANGE_START,
+						MVPP2_PE_MAC_RANGE_END);
+		if (tid < 0)
+			return tid;
+
+		pe.index = tid;
+
+		/* Mask all ports */
+		mvpp2_prs_tcam_port_map_set(&pe, 0);
+	} else {
+		mvpp2_prs_init_from_hw(priv, &pe, tid);
+	}
+
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_MAC);
+
+	/* Update port mask */
+	mvpp2_prs_tcam_port_set(&pe, port->id, add);
+
+	/* Invalidate the entry if no ports are left enabled */
+	pmap = mvpp2_prs_tcam_port_map_get(&pe);
+	if (pmap == 0) {
+		if (add)
+			return -EINVAL;
+
+		mvpp2_prs_hw_inv(priv, pe.index);
+		priv->prs_shadow[pe.index].valid = false;
+		return 0;
+	}
+
+	/* Continue - set next lookup */
+	mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_DSA);
+
+	/* Set match on DA */
+	len = ETH_ALEN;
+	while (len--)
+		mvpp2_prs_tcam_data_byte_set(&pe, len, da[len], 0xff);
+
+	/* Set result info bits */
+	if (is_broadcast_ether_addr(da)) {
+		ri = MVPP2_PRS_RI_L2_BCAST;
+	} else if (is_multicast_ether_addr(da)) {
+		ri = MVPP2_PRS_RI_L2_MCAST;
+	} else {
+		ri = MVPP2_PRS_RI_L2_UCAST;
+
+		if (ether_addr_equal(da, port->dev->dev_addr))
+			ri |= MVPP2_PRS_RI_MAC_ME_MASK;
+	}
+
+	mvpp2_prs_sram_ri_update(&pe, ri, MVPP2_PRS_RI_L2_CAST_MASK |
+				 MVPP2_PRS_RI_MAC_ME_MASK);
+	mvpp2_prs_shadow_ri_set(priv, pe.index, ri, MVPP2_PRS_RI_L2_CAST_MASK |
+				MVPP2_PRS_RI_MAC_ME_MASK);
+
+	/* Shift to ethertype */
+	mvpp2_prs_sram_shift_set(&pe, 2 * ETH_ALEN,
+				 MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD);
+
+	/* Update shadow table and hw entry */
+	priv->prs_shadow[pe.index].udf = MVPP2_PRS_UDF_MAC_DEF;
+	mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_MAC);
+	mvpp2_prs_hw_write(priv, &pe);
+
+	return 0;
+}
+
+int mvpp2_prs_update_mac_da(struct net_device *dev, const u8 *da)
+{
+	struct mvpp2_port *port = netdev_priv(dev);
+	int err;
+
+	/* Remove old parser entry */
+	err = mvpp2_prs_mac_da_accept(port, dev->dev_addr, false);
+	if (err)
+		return err;
+
+	/* Add new parser entry */
+	err = mvpp2_prs_mac_da_accept(port, da, true);
+	if (err)
+		return err;
+
+	/* Set addr in the device */
+	ether_addr_copy(dev->dev_addr, da);
+
+	return 0;
+}
+
+void mvpp2_prs_mac_del_all(struct mvpp2_port *port)
+{
+	struct mvpp2 *priv = port->priv;
+	struct mvpp2_prs_entry pe;
+	unsigned long pmap;
+	int index, tid;
+
+	for (tid = MVPP2_PE_MAC_RANGE_START;
+	     tid <= MVPP2_PE_MAC_RANGE_END; tid++) {
+		unsigned char da[ETH_ALEN], da_mask[ETH_ALEN];
+
+		if (!priv->prs_shadow[tid].valid ||
+		    (priv->prs_shadow[tid].lu != MVPP2_PRS_LU_MAC) ||
+		    (priv->prs_shadow[tid].udf != MVPP2_PRS_UDF_MAC_DEF))
+			continue;
+
+		mvpp2_prs_init_from_hw(priv, &pe, tid);
+
+		pmap = mvpp2_prs_tcam_port_map_get(&pe);
+
+		/* We only want entries active on this port */
+		if (!test_bit(port->id, &pmap))
+			continue;
+
+		/* Read mac addr from entry */
+		for (index = 0; index < ETH_ALEN; index++)
+			mvpp2_prs_tcam_data_byte_get(&pe, index, &da[index],
+						     &da_mask[index]);
+
+		/* Special cases : Don't remove broadcast and port's own
+		 * address
+		 */
+		if (is_broadcast_ether_addr(da) ||
+		    ether_addr_equal(da, port->dev->dev_addr))
+			continue;
+
+		/* Remove entry from TCAM */
+		mvpp2_prs_mac_da_accept(port, da, false);
+	}
+}
+
+int mvpp2_prs_tag_mode_set(struct mvpp2 *priv, int port, int type)
+{
+	switch (type) {
+	case MVPP2_TAG_TYPE_EDSA:
+		/* Add port to EDSA entries */
+		mvpp2_prs_dsa_tag_set(priv, port, true,
+				      MVPP2_PRS_TAGGED, MVPP2_PRS_EDSA);
+		mvpp2_prs_dsa_tag_set(priv, port, true,
+				      MVPP2_PRS_UNTAGGED, MVPP2_PRS_EDSA);
+		/* Remove port from DSA entries */
+		mvpp2_prs_dsa_tag_set(priv, port, false,
+				      MVPP2_PRS_TAGGED, MVPP2_PRS_DSA);
+		mvpp2_prs_dsa_tag_set(priv, port, false,
+				      MVPP2_PRS_UNTAGGED, MVPP2_PRS_DSA);
+		break;
+
+	case MVPP2_TAG_TYPE_DSA:
+		/* Add port to DSA entries */
+		mvpp2_prs_dsa_tag_set(priv, port, true,
+				      MVPP2_PRS_TAGGED, MVPP2_PRS_DSA);
+		mvpp2_prs_dsa_tag_set(priv, port, true,
+				      MVPP2_PRS_UNTAGGED, MVPP2_PRS_DSA);
+		/* Remove port from EDSA entries */
+		mvpp2_prs_dsa_tag_set(priv, port, false,
+				      MVPP2_PRS_TAGGED, MVPP2_PRS_EDSA);
+		mvpp2_prs_dsa_tag_set(priv, port, false,
+				      MVPP2_PRS_UNTAGGED, MVPP2_PRS_EDSA);
+		break;
+
+	case MVPP2_TAG_TYPE_MH:
+	case MVPP2_TAG_TYPE_NONE:
+		/* Remove port form EDSA and DSA entries */
+		mvpp2_prs_dsa_tag_set(priv, port, false,
+				      MVPP2_PRS_TAGGED, MVPP2_PRS_DSA);
+		mvpp2_prs_dsa_tag_set(priv, port, false,
+				      MVPP2_PRS_UNTAGGED, MVPP2_PRS_DSA);
+		mvpp2_prs_dsa_tag_set(priv, port, false,
+				      MVPP2_PRS_TAGGED, MVPP2_PRS_EDSA);
+		mvpp2_prs_dsa_tag_set(priv, port, false,
+				      MVPP2_PRS_UNTAGGED, MVPP2_PRS_EDSA);
+		break;
+
+	default:
+		if ((type < 0) || (type > MVPP2_TAG_TYPE_EDSA))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Set prs flow for the port */
+int mvpp2_prs_def_flow(struct mvpp2_port *port)
+{
+	struct mvpp2_prs_entry pe;
+	int tid;
+
+	memset(&pe, 0, sizeof(pe));
+
+	tid = mvpp2_prs_flow_find(port->priv, port->id);
+
+	/* Such entry not exist */
+	if (tid < 0) {
+		/* Go through the all entires from last to first */
+		tid = mvpp2_prs_tcam_first_free(port->priv,
+						MVPP2_PE_LAST_FREE_TID,
+					       MVPP2_PE_FIRST_FREE_TID);
+		if (tid < 0)
+			return tid;
+
+		pe.index = tid;
+
+		/* Set flow ID*/
+		mvpp2_prs_sram_ai_update(&pe, port->id, MVPP2_PRS_FLOW_ID_MASK);
+		mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_DONE_BIT, 1);
+
+		/* Update shadow table */
+		mvpp2_prs_shadow_set(port->priv, pe.index, MVPP2_PRS_LU_FLOWS);
+	} else {
+		mvpp2_prs_init_from_hw(port->priv, &pe, tid);
+	}
+
+	mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_FLOWS);
+	mvpp2_prs_tcam_port_map_set(&pe, (1 << port->id));
+	mvpp2_prs_hw_write(port->priv, &pe);
+
+	return 0;
+}
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.h b/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.h
new file mode 100644
index 000000000000..22fbbc4c8b28
--- /dev/null
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.h
@@ -0,0 +1,314 @@
+/*
+ * Header Parser definitions for Marvell PPv2 Network Controller
+ *
+ * Copyright (C) 2014 Marvell
+ *
+ * Marcin Wojtas <mw@semihalf.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+
+#include "mvpp2.h"
+
+#ifndef _MVPP2_PRS_H_
+#define _MVPP2_PRS_H_
+
+/* Parser constants */
+#define MVPP2_PRS_TCAM_SRAM_SIZE	256
+#define MVPP2_PRS_TCAM_WORDS		6
+#define MVPP2_PRS_SRAM_WORDS		4
+#define MVPP2_PRS_FLOW_ID_SIZE		64
+#define MVPP2_PRS_FLOW_ID_MASK		0x3f
+#define MVPP2_PRS_TCAM_ENTRY_INVALID	1
+#define MVPP2_PRS_TCAM_DSA_TAGGED_BIT	BIT(5)
+#define MVPP2_PRS_IPV4_HEAD		0x40
+#define MVPP2_PRS_IPV4_HEAD_MASK	0xf0
+#define MVPP2_PRS_IPV4_MC		0xe0
+#define MVPP2_PRS_IPV4_MC_MASK		0xf0
+#define MVPP2_PRS_IPV4_BC_MASK		0xff
+#define MVPP2_PRS_IPV4_IHL		0x5
+#define MVPP2_PRS_IPV4_IHL_MASK		0xf
+#define MVPP2_PRS_IPV6_MC		0xff
+#define MVPP2_PRS_IPV6_MC_MASK		0xff
+#define MVPP2_PRS_IPV6_HOP_MASK		0xff
+#define MVPP2_PRS_TCAM_PROTO_MASK	0xff
+#define MVPP2_PRS_TCAM_PROTO_MASK_L	0x3f
+#define MVPP2_PRS_DBL_VLANS_MAX		100
+#define MVPP2_PRS_CAST_MASK		BIT(0)
+#define MVPP2_PRS_MCAST_VAL		BIT(0)
+#define MVPP2_PRS_UCAST_VAL		0x0
+
+/* Tcam structure:
+ * - lookup ID - 4 bits
+ * - port ID - 1 byte
+ * - additional information - 1 byte
+ * - header data - 8 bytes
+ * The fields are represented by MVPP2_PRS_TCAM_DATA_REG(5)->(0).
+ */
+#define MVPP2_PRS_AI_BITS			8
+#define MVPP2_PRS_PORT_MASK			0xff
+#define MVPP2_PRS_LU_MASK			0xf
+#define MVPP2_PRS_TCAM_DATA_BYTE(offs)		\
+				    (((offs) - ((offs) % 2)) * 2 + ((offs) % 2))
+#define MVPP2_PRS_TCAM_DATA_BYTE_EN(offs)	\
+					      (((offs) * 2) - ((offs) % 2)  + 2)
+#define MVPP2_PRS_TCAM_AI_BYTE			16
+#define MVPP2_PRS_TCAM_PORT_BYTE		17
+#define MVPP2_PRS_TCAM_LU_BYTE			20
+#define MVPP2_PRS_TCAM_EN_OFFS(offs)		((offs) + 2)
+#define MVPP2_PRS_TCAM_INV_WORD			5
+
+#define MVPP2_PRS_VID_TCAM_BYTE         2
+
+/* TCAM range for unicast and multicast filtering. We have 25 entries per port,
+ * with 4 dedicated to UC filtering and the rest to multicast filtering.
+ * Additionnally we reserve one entry for the broadcast address, and one for
+ * each port's own address.
+ */
+#define MVPP2_PRS_MAC_UC_MC_FILT_MAX	25
+#define MVPP2_PRS_MAC_RANGE_SIZE	80
+
+/* Number of entries per port dedicated to UC and MC filtering */
+#define MVPP2_PRS_MAC_UC_FILT_MAX	4
+#define MVPP2_PRS_MAC_MC_FILT_MAX	(MVPP2_PRS_MAC_UC_MC_FILT_MAX - \
+					 MVPP2_PRS_MAC_UC_FILT_MAX)
+
+/* There is a TCAM range reserved for VLAN filtering entries, range size is 33
+ * 10 VLAN ID filter entries per port
+ * 1 default VLAN filter entry per port
+ * It is assumed that there are 3 ports for filter, not including loopback port
+ */
+#define MVPP2_PRS_VLAN_FILT_MAX		11
+#define MVPP2_PRS_VLAN_FILT_RANGE_SIZE	33
+
+#define MVPP2_PRS_VLAN_FILT_MAX_ENTRY   (MVPP2_PRS_VLAN_FILT_MAX - 2)
+#define MVPP2_PRS_VLAN_FILT_DFLT_ENTRY  (MVPP2_PRS_VLAN_FILT_MAX - 1)
+
+/* Tcam entries ID */
+#define MVPP2_PE_DROP_ALL		0
+#define MVPP2_PE_FIRST_FREE_TID		1
+
+/* MAC filtering range */
+#define MVPP2_PE_MAC_RANGE_END		(MVPP2_PE_VID_FILT_RANGE_START - 1)
+#define MVPP2_PE_MAC_RANGE_START	(MVPP2_PE_MAC_RANGE_END - \
+						MVPP2_PRS_MAC_RANGE_SIZE + 1)
+/* VLAN filtering range */
+#define MVPP2_PE_VID_FILT_RANGE_END     (MVPP2_PRS_TCAM_SRAM_SIZE - 31)
+#define MVPP2_PE_VID_FILT_RANGE_START   (MVPP2_PE_VID_FILT_RANGE_END - \
+					 MVPP2_PRS_VLAN_FILT_RANGE_SIZE + 1)
+#define MVPP2_PE_LAST_FREE_TID          (MVPP2_PE_MAC_RANGE_START - 1)
+#define MVPP2_PE_IP6_EXT_PROTO_UN	(MVPP2_PRS_TCAM_SRAM_SIZE - 30)
+#define MVPP2_PE_IP6_ADDR_UN		(MVPP2_PRS_TCAM_SRAM_SIZE - 29)
+#define MVPP2_PE_IP4_ADDR_UN		(MVPP2_PRS_TCAM_SRAM_SIZE - 28)
+#define MVPP2_PE_LAST_DEFAULT_FLOW	(MVPP2_PRS_TCAM_SRAM_SIZE - 27)
+#define MVPP2_PE_FIRST_DEFAULT_FLOW	(MVPP2_PRS_TCAM_SRAM_SIZE - 22)
+#define MVPP2_PE_EDSA_TAGGED		(MVPP2_PRS_TCAM_SRAM_SIZE - 21)
+#define MVPP2_PE_EDSA_UNTAGGED		(MVPP2_PRS_TCAM_SRAM_SIZE - 20)
+#define MVPP2_PE_DSA_TAGGED		(MVPP2_PRS_TCAM_SRAM_SIZE - 19)
+#define MVPP2_PE_DSA_UNTAGGED		(MVPP2_PRS_TCAM_SRAM_SIZE - 18)
+#define MVPP2_PE_ETYPE_EDSA_TAGGED	(MVPP2_PRS_TCAM_SRAM_SIZE - 17)
+#define MVPP2_PE_ETYPE_EDSA_UNTAGGED	(MVPP2_PRS_TCAM_SRAM_SIZE - 16)
+#define MVPP2_PE_ETYPE_DSA_TAGGED	(MVPP2_PRS_TCAM_SRAM_SIZE - 15)
+#define MVPP2_PE_ETYPE_DSA_UNTAGGED	(MVPP2_PRS_TCAM_SRAM_SIZE - 14)
+#define MVPP2_PE_MH_DEFAULT		(MVPP2_PRS_TCAM_SRAM_SIZE - 13)
+#define MVPP2_PE_DSA_DEFAULT		(MVPP2_PRS_TCAM_SRAM_SIZE - 12)
+#define MVPP2_PE_IP6_PROTO_UN		(MVPP2_PRS_TCAM_SRAM_SIZE - 11)
+#define MVPP2_PE_IP4_PROTO_UN		(MVPP2_PRS_TCAM_SRAM_SIZE - 10)
+#define MVPP2_PE_ETH_TYPE_UN		(MVPP2_PRS_TCAM_SRAM_SIZE - 9)
+#define MVPP2_PE_VID_FLTR_DEFAULT	(MVPP2_PRS_TCAM_SRAM_SIZE - 8)
+#define MVPP2_PE_VID_EDSA_FLTR_DEFAULT	(MVPP2_PRS_TCAM_SRAM_SIZE - 7)
+#define MVPP2_PE_VLAN_DBL		(MVPP2_PRS_TCAM_SRAM_SIZE - 6)
+#define MVPP2_PE_VLAN_NONE		(MVPP2_PRS_TCAM_SRAM_SIZE - 5)
+/* reserved */
+#define MVPP2_PE_MAC_MC_PROMISCUOUS	(MVPP2_PRS_TCAM_SRAM_SIZE - 3)
+#define MVPP2_PE_MAC_UC_PROMISCUOUS	(MVPP2_PRS_TCAM_SRAM_SIZE - 2)
+#define MVPP2_PE_MAC_NON_PROMISCUOUS	(MVPP2_PRS_TCAM_SRAM_SIZE - 1)
+
+#define MVPP2_PRS_VID_PORT_FIRST(port)	(MVPP2_PE_VID_FILT_RANGE_START + \
+					 ((port) * MVPP2_PRS_VLAN_FILT_MAX))
+#define MVPP2_PRS_VID_PORT_LAST(port)	(MVPP2_PRS_VID_PORT_FIRST(port) \
+					 + MVPP2_PRS_VLAN_FILT_MAX_ENTRY)
+/* Index of default vid filter for given port */
+#define MVPP2_PRS_VID_PORT_DFLT(port)	(MVPP2_PRS_VID_PORT_FIRST(port) \
+					 + MVPP2_PRS_VLAN_FILT_DFLT_ENTRY)
+
+/* Sram structure
+ * The fields are represented by MVPP2_PRS_TCAM_DATA_REG(3)->(0).
+ */
+#define MVPP2_PRS_SRAM_RI_OFFS			0
+#define MVPP2_PRS_SRAM_RI_WORD			0
+#define MVPP2_PRS_SRAM_RI_CTRL_OFFS		32
+#define MVPP2_PRS_SRAM_RI_CTRL_WORD		1
+#define MVPP2_PRS_SRAM_RI_CTRL_BITS		32
+#define MVPP2_PRS_SRAM_SHIFT_OFFS		64
+#define MVPP2_PRS_SRAM_SHIFT_SIGN_BIT		72
+#define MVPP2_PRS_SRAM_UDF_OFFS			73
+#define MVPP2_PRS_SRAM_UDF_BITS			8
+#define MVPP2_PRS_SRAM_UDF_MASK			0xff
+#define MVPP2_PRS_SRAM_UDF_SIGN_BIT		81
+#define MVPP2_PRS_SRAM_UDF_TYPE_OFFS		82
+#define MVPP2_PRS_SRAM_UDF_TYPE_MASK		0x7
+#define MVPP2_PRS_SRAM_UDF_TYPE_L3		1
+#define MVPP2_PRS_SRAM_UDF_TYPE_L4		4
+#define MVPP2_PRS_SRAM_OP_SEL_SHIFT_OFFS	85
+#define MVPP2_PRS_SRAM_OP_SEL_SHIFT_MASK	0x3
+#define MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD		1
+#define MVPP2_PRS_SRAM_OP_SEL_SHIFT_IP4_ADD	2
+#define MVPP2_PRS_SRAM_OP_SEL_SHIFT_IP6_ADD	3
+#define MVPP2_PRS_SRAM_OP_SEL_UDF_OFFS		87
+#define MVPP2_PRS_SRAM_OP_SEL_UDF_BITS		2
+#define MVPP2_PRS_SRAM_OP_SEL_UDF_MASK		0x3
+#define MVPP2_PRS_SRAM_OP_SEL_UDF_ADD		0
+#define MVPP2_PRS_SRAM_OP_SEL_UDF_IP4_ADD	2
+#define MVPP2_PRS_SRAM_OP_SEL_UDF_IP6_ADD	3
+#define MVPP2_PRS_SRAM_OP_SEL_BASE_OFFS		89
+#define MVPP2_PRS_SRAM_AI_OFFS			90
+#define MVPP2_PRS_SRAM_AI_CTRL_OFFS		98
+#define MVPP2_PRS_SRAM_AI_CTRL_BITS		8
+#define MVPP2_PRS_SRAM_AI_MASK			0xff
+#define MVPP2_PRS_SRAM_NEXT_LU_OFFS		106
+#define MVPP2_PRS_SRAM_NEXT_LU_MASK		0xf
+#define MVPP2_PRS_SRAM_LU_DONE_BIT		110
+#define MVPP2_PRS_SRAM_LU_GEN_BIT		111
+
+/* Sram result info bits assignment */
+#define MVPP2_PRS_RI_MAC_ME_MASK		0x1
+#define MVPP2_PRS_RI_DSA_MASK			0x2
+#define MVPP2_PRS_RI_VLAN_MASK			(BIT(2) | BIT(3))
+#define MVPP2_PRS_RI_VLAN_NONE			0x0
+#define MVPP2_PRS_RI_VLAN_SINGLE		BIT(2)
+#define MVPP2_PRS_RI_VLAN_DOUBLE		BIT(3)
+#define MVPP2_PRS_RI_VLAN_TRIPLE		(BIT(2) | BIT(3))
+#define MVPP2_PRS_RI_CPU_CODE_MASK		0x70
+#define MVPP2_PRS_RI_CPU_CODE_RX_SPEC		BIT(4)
+#define MVPP2_PRS_RI_L2_CAST_MASK		(BIT(9) | BIT(10))
+#define MVPP2_PRS_RI_L2_UCAST			0x0
+#define MVPP2_PRS_RI_L2_MCAST			BIT(9)
+#define MVPP2_PRS_RI_L2_BCAST			BIT(10)
+#define MVPP2_PRS_RI_PPPOE_MASK			0x800
+#define MVPP2_PRS_RI_L3_PROTO_MASK		(BIT(12) | BIT(13) | BIT(14))
+#define MVPP2_PRS_RI_L3_UN			0x0
+#define MVPP2_PRS_RI_L3_IP4			BIT(12)
+#define MVPP2_PRS_RI_L3_IP4_OPT			BIT(13)
+#define MVPP2_PRS_RI_L3_IP4_OTHER		(BIT(12) | BIT(13))
+#define MVPP2_PRS_RI_L3_IP6			BIT(14)
+#define MVPP2_PRS_RI_L3_IP6_EXT			(BIT(12) | BIT(14))
+#define MVPP2_PRS_RI_L3_ARP			(BIT(13) | BIT(14))
+#define MVPP2_PRS_RI_L3_ADDR_MASK		(BIT(15) | BIT(16))
+#define MVPP2_PRS_RI_L3_UCAST			0x0
+#define MVPP2_PRS_RI_L3_MCAST			BIT(15)
+#define MVPP2_PRS_RI_L3_BCAST			(BIT(15) | BIT(16))
+#define MVPP2_PRS_RI_IP_FRAG_MASK		0x20000
+#define MVPP2_PRS_RI_IP_FRAG_TRUE		BIT(17)
+#define MVPP2_PRS_RI_UDF3_MASK			0x300000
+#define MVPP2_PRS_RI_UDF3_RX_SPECIAL		BIT(21)
+#define MVPP2_PRS_RI_L4_PROTO_MASK		0x1c00000
+#define MVPP2_PRS_RI_L4_TCP			BIT(22)
+#define MVPP2_PRS_RI_L4_UDP			BIT(23)
+#define MVPP2_PRS_RI_L4_OTHER			(BIT(22) | BIT(23))
+#define MVPP2_PRS_RI_UDF7_MASK			0x60000000
+#define MVPP2_PRS_RI_UDF7_IP6_LITE		BIT(29)
+#define MVPP2_PRS_RI_DROP_MASK			0x80000000
+
+/* Sram additional info bits assignment */
+#define MVPP2_PRS_IPV4_DIP_AI_BIT		BIT(0)
+#define MVPP2_PRS_IPV6_NO_EXT_AI_BIT		BIT(0)
+#define MVPP2_PRS_IPV6_EXT_AI_BIT		BIT(1)
+#define MVPP2_PRS_IPV6_EXT_AH_AI_BIT		BIT(2)
+#define MVPP2_PRS_IPV6_EXT_AH_LEN_AI_BIT	BIT(3)
+#define MVPP2_PRS_IPV6_EXT_AH_L4_AI_BIT		BIT(4)
+#define MVPP2_PRS_SINGLE_VLAN_AI		0
+#define MVPP2_PRS_DBL_VLAN_AI_BIT		BIT(7)
+#define MVPP2_PRS_EDSA_VID_AI_BIT		BIT(0)
+
+/* DSA/EDSA type */
+#define MVPP2_PRS_TAGGED		true
+#define MVPP2_PRS_UNTAGGED		false
+#define MVPP2_PRS_EDSA			true
+#define MVPP2_PRS_DSA			false
+
+/* MAC entries, shadow udf */
+enum mvpp2_prs_udf {
+	MVPP2_PRS_UDF_MAC_DEF,
+	MVPP2_PRS_UDF_MAC_RANGE,
+	MVPP2_PRS_UDF_L2_DEF,
+	MVPP2_PRS_UDF_L2_DEF_COPY,
+	MVPP2_PRS_UDF_L2_USER,
+};
+
+/* Lookup ID */
+enum mvpp2_prs_lookup {
+	MVPP2_PRS_LU_MH,
+	MVPP2_PRS_LU_MAC,
+	MVPP2_PRS_LU_DSA,
+	MVPP2_PRS_LU_VLAN,
+	MVPP2_PRS_LU_VID,
+	MVPP2_PRS_LU_L2,
+	MVPP2_PRS_LU_PPPOE,
+	MVPP2_PRS_LU_IP4,
+	MVPP2_PRS_LU_IP6,
+	MVPP2_PRS_LU_FLOWS,
+	MVPP2_PRS_LU_LAST,
+};
+
+union mvpp2_prs_tcam_entry {
+	u32 word[MVPP2_PRS_TCAM_WORDS];
+	u8  byte[MVPP2_PRS_TCAM_WORDS * 4];
+};
+
+union mvpp2_prs_sram_entry {
+	u32 word[MVPP2_PRS_SRAM_WORDS];
+	u8  byte[MVPP2_PRS_SRAM_WORDS * 4];
+};
+
+struct mvpp2_prs_entry {
+	u32 index;
+	union mvpp2_prs_tcam_entry tcam;
+	union mvpp2_prs_sram_entry sram;
+};
+
+struct mvpp2_prs_shadow {
+	bool valid;
+	bool finish;
+
+	/* Lookup ID */
+	int lu;
+
+	/* User defined offset */
+	int udf;
+
+	/* Result info */
+	u32 ri;
+	u32 ri_mask;
+};
+
+int mvpp2_prs_default_init(struct platform_device *pdev, struct mvpp2 *priv);
+
+int mvpp2_prs_mac_da_accept(struct mvpp2_port *port, const u8 *da, bool add);
+
+int mvpp2_prs_tag_mode_set(struct mvpp2 *priv, int port, int type);
+
+int mvpp2_prs_def_flow(struct mvpp2_port *port);
+
+void mvpp2_prs_vid_enable_filtering(struct mvpp2_port *port);
+
+void mvpp2_prs_vid_disable_filtering(struct mvpp2_port *port);
+
+int mvpp2_prs_vid_entry_add(struct mvpp2_port *port, u16 vid);
+
+void mvpp2_prs_vid_entry_remove(struct mvpp2_port *port, u16 vid);
+
+void mvpp2_prs_vid_remove_all(struct mvpp2_port *port);
+
+void mvpp2_prs_mac_promisc_set(struct mvpp2 *priv, int port,
+			       enum mvpp2_prs_l2_cast l2_cast, bool add);
+
+void mvpp2_prs_mac_del_all(struct mvpp2_port *port);
+
+int mvpp2_prs_update_mac_da(struct net_device *dev, const u8 *da);
+
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From 2004a9bcb8ff2ceeea722441d445cd6e0755c2c4 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 31 May 2018 19:52:02 +0200
Subject: selftests: forwarding: lib: Move here vlan_capture_{, un}install()

Move vlan_capture_install() and vlan_capture_uninstall() from
mirror_vlan.sh test to lib.sh so that it can be reused in other tests.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/forwarding/lib.sh      | 23 ++++++++++++++++++++++
 .../selftests/net/forwarding/mirror_vlan.sh        | 23 ----------------------
 2 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index 89ba4cdd4392..7b18a53aa556 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -514,6 +514,29 @@ icmp6_capture_uninstall()
 	__icmp_capture_add_del del 100 v6 "$@"
 }
 
+__vlan_capture_add_del()
+{
+	local add_del=$1; shift
+	local pref=$1; shift
+	local dev=$1; shift
+	local filter=$1; shift
+
+	tc filter $add_del dev "$dev" ingress \
+	   proto 802.1q pref $pref \
+	   flower $filter \
+	   action pass
+}
+
+vlan_capture_install()
+{
+	__vlan_capture_add_del add 100 "$@"
+}
+
+vlan_capture_uninstall()
+{
+	__vlan_capture_add_del del 100 "$@"
+}
+
 matchall_sink_create()
 {
 	local dev=$1; shift
diff --git a/tools/testing/selftests/net/forwarding/mirror_vlan.sh b/tools/testing/selftests/net/forwarding/mirror_vlan.sh
index 1e10520dccf4..758b6d027971 100755
--- a/tools/testing/selftests/net/forwarding/mirror_vlan.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_vlan.sh
@@ -76,29 +76,6 @@ test_vlan()
 	test_vlan_dir egress 0 8
 }
 
-vlan_capture_add_del()
-{
-	local add_del=$1; shift
-	local pref=$1; shift
-	local dev=$1; shift
-	local filter=$1; shift
-
-	tc filter $add_del dev "$dev" ingress \
-	   proto 802.1q pref $pref \
-	   flower $filter \
-	   action pass
-}
-
-vlan_capture_install()
-{
-	vlan_capture_add_del add 100 "$@"
-}
-
-vlan_capture_uninstall()
-{
-	vlan_capture_add_del del 100 "$@"
-}
-
 do_test_span_vlan_dir_ips()
 {
 	local expect=$1; shift
-- 
cgit v1.2.3-59-g8ed1b


From 900530f3f8731606a8dfb847b7e0d467250f9665 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 31 May 2018 19:52:09 +0200
Subject: selftests: forwarding: mirror_lib: Move here
 do_test_span_vlan_dir_ips()

Move the function do_test_span_vlan_dir_ips() from mirror_vlan.sh test
to a library file mirror_lib.sh to allow reuse. Fill in other entry
points similar to other testing functions in mirror_lib.sh, they will be
useful in following patches.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../testing/selftests/net/forwarding/mirror_lib.sh | 35 ++++++++++++++++++++++
 .../selftests/net/forwarding/mirror_vlan.sh        | 15 ----------
 2 files changed, 35 insertions(+), 15 deletions(-)

diff --git a/tools/testing/selftests/net/forwarding/mirror_lib.sh b/tools/testing/selftests/net/forwarding/mirror_lib.sh
index 04cbc38e71a2..67efe2556a4d 100644
--- a/tools/testing/selftests/net/forwarding/mirror_lib.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_lib.sh
@@ -92,3 +92,38 @@ test_span_dir()
 {
 	test_span_dir_ips "$@" 192.0.2.1 192.0.2.2
 }
+
+do_test_span_vlan_dir_ips()
+{
+	local expect=$1; shift
+	local dev=$1; shift
+	local vid=$1; shift
+	local direction=$1; shift
+	local ip1=$1; shift
+	local ip2=$1; shift
+
+	vlan_capture_install $dev "vlan_id $vid"
+	mirror_test v$h1 $ip1 $ip2 $dev 100 $expect
+	mirror_test v$h2 $ip2 $ip1 $dev 100 $expect
+	vlan_capture_uninstall $dev
+}
+
+quick_test_span_vlan_dir_ips()
+{
+	do_test_span_vlan_dir_ips 10 "$@"
+}
+
+fail_test_span_vlan_dir_ips()
+{
+	do_test_span_vlan_dir_ips 0 "$@"
+}
+
+quick_test_span_vlan_dir()
+{
+	quick_test_span_vlan_dir_ips "$@" 192.0.2.1 192.0.2.2
+}
+
+fail_test_span_vlan_dir()
+{
+	fail_test_span_vlan_dir_ips "$@" 192.0.2.1 192.0.2.2
+}
diff --git a/tools/testing/selftests/net/forwarding/mirror_vlan.sh b/tools/testing/selftests/net/forwarding/mirror_vlan.sh
index 758b6d027971..20b37a53657e 100755
--- a/tools/testing/selftests/net/forwarding/mirror_vlan.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_vlan.sh
@@ -76,21 +76,6 @@ test_vlan()
 	test_vlan_dir egress 0 8
 }
 
-do_test_span_vlan_dir_ips()
-{
-	local expect=$1; shift
-	local dev=$1; shift
-	local vid=$1; shift
-	local direction=$1; shift
-	local ip1=$1; shift
-	local ip2=$1; shift
-
-	vlan_capture_install $dev "vlan_id $vid"
-	mirror_test v$h1 $ip1 $ip2 $dev 100 $expect
-	mirror_test v$h2 $ip2 $ip1 $dev 100 $expect
-	vlan_capture_uninstall $dev
-}
-
 test_tagged_vlan_dir()
 {
 	local direction=$1; shift
-- 
cgit v1.2.3-59-g8ed1b


From 275225fb4e3981a55b19ad0caad2eeb2821ae3d5 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 31 May 2018 19:52:15 +0200
Subject: selftests: forwarding: mirror_lib: skip_hw the VLAN capture

When the VLAN capture is installed on a front panel device and not a
soft device, the packets are counted twice: once in fast path, and once
after they are trapped to the kernel. Resolve the problem by passing
skip_hw flag to vlan_capture_install().

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/forwarding/mirror_lib.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/forwarding/mirror_lib.sh b/tools/testing/selftests/net/forwarding/mirror_lib.sh
index 67efe2556a4d..d36dc26c6c51 100644
--- a/tools/testing/selftests/net/forwarding/mirror_lib.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_lib.sh
@@ -102,7 +102,10 @@ do_test_span_vlan_dir_ips()
 	local ip1=$1; shift
 	local ip2=$1; shift
 
-	vlan_capture_install $dev "vlan_id $vid"
+	# Install the capture as skip_hw to avoid double-counting of packets.
+	# The traffic is meant for local box anyway, so will be trapped to
+	# kernel.
+	vlan_capture_install $dev "skip_hw vlan_id $vid"
 	mirror_test v$h1 $ip1 $ip2 $dev 100 $expect
 	mirror_test v$h2 $ip2 $ip1 $dev 100 $expect
 	vlan_capture_uninstall $dev
-- 
cgit v1.2.3-59-g8ed1b


From f52f460ca9714c1aed3a4ec73b4dbd502bfba10c Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 31 May 2018 19:52:20 +0200
Subject: selftests: forwarding: mirror_gre_lib: Add STP test

Add a reusable full test that toggles STP state of a given bridge port
and checks that the mirroring reacts appropriately. The test will be
used by bridge tests in follow-up patches.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../selftests/net/forwarding/mirror_gre_lib.sh     | 32 ++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh b/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh
index 92ef6dde98bd..619b469365be 100644
--- a/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh
@@ -96,3 +96,35 @@ full_test_span_gre_dir_vlan()
 {
 	full_test_span_gre_dir_vlan_ips "$@" 192.0.2.1 192.0.2.2
 }
+
+full_test_span_gre_stp_ips()
+{
+	local tundev=$1; shift
+	local nbpdev=$1; shift
+	local what=$1; shift
+	local ip1=$1; shift
+	local ip2=$1; shift
+	local h3mac=$(mac_get $h3)
+
+	RET=0
+
+	mirror_install $swp1 ingress $tundev "matchall $tcflags"
+	quick_test_span_gre_dir_ips $tundev ingress $ip1 $ip2
+
+	bridge link set dev $nbpdev state disabled
+	sleep 1
+	fail_test_span_gre_dir_ips $tundev ingress $ip1 $ip2
+
+	bridge link set dev $nbpdev state forwarding
+	sleep 1
+	quick_test_span_gre_dir_ips $tundev ingress $ip1 $ip2
+
+	mirror_uninstall $swp1 ingress
+
+	log_test "$what: STP state ($tcflags)"
+}
+
+full_test_span_gre_stp()
+{
+	full_test_span_gre_stp_ips "$@" 192.0.2.1 192.0.2.2
+}
-- 
cgit v1.2.3-59-g8ed1b


From b996078ea969dcd4fe6d04dc0b338319fd593460 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 31 May 2018 19:52:26 +0200
Subject: selftests: forwarding: mirror_gre_vlan_bridge_1q: Fix tunnel name

The "ip6gretap" in the test name refers to the tunnel device type that
the test is supposed to be testing. However test_ip6gretap_forbidden()
tests, due to a typo, a gretap tunnel. Fix the typo.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh b/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh
index 01ec28ac2e4a..29fde7332c88 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh
@@ -108,7 +108,7 @@ test_gretap_forbidden()
 
 test_ip6gretap_forbidden()
 {
-	test_span_gre_forbidden gt4 "mirror to ip6gretap"
+	test_span_gre_forbidden gt6 "mirror to ip6gretap"
 }
 
 test_all()
-- 
cgit v1.2.3-59-g8ed1b


From a6f3282e2fabcfa6a3433bbe708fc65b7f2d6a72 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 31 May 2018 19:52:32 +0200
Subject: selftests: forwarding: mirror_gre_vlan_bridge_1q: Test final config

After the final change reestablishes the original configuration, make
sure the traffic flows again as it should.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh b/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh
index 29fde7332c88..0a3bac96f883 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh
@@ -91,12 +91,13 @@ test_span_gre_forbidden()
 	# Now forbid the VLAN at the bridge and see it fail.
 	bridge vlan del dev br1 vid 555 self
 	sleep 1
-
 	fail_test_span_gre_dir $tundev ingress
-	mirror_uninstall $swp1 ingress
 
 	bridge vlan add dev br1 vid 555 self
 	sleep 1
+	quick_test_span_gre_dir $tundev ingress
+
+	mirror_uninstall $swp1 ingress
 
 	log_test "$what: vlan forbidden at a bridge ($tcflags)"
 }
-- 
cgit v1.2.3-59-g8ed1b


From 683680165c5b71ae6abf6745fed6a29a683813e3 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 31 May 2018 19:52:37 +0200
Subject: selftests: forwarding: mirror_gre_vlan_bridge_1q: Rename two tests

Rename test_gretap_forbidden() and test_ip6gretap_forbidden() to a more
specific test_gretap_forbidden_cpu() and test_ip6gretap_forbidden_cpu().
This will make it clearer which is which when further down a patch is
introduced that forbids a VLAN on regular bridge port.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh  | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh b/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh
index 0a3bac96f883..d91b3475e6df 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh
@@ -10,8 +10,8 @@
 ALL_TESTS="
 	test_gretap
 	test_ip6gretap
-	test_gretap_forbidden
-	test_ip6gretap_forbidden
+	test_gretap_forbidden_cpu
+	test_ip6gretap_forbidden_cpu
 "
 
 NUM_NETIFS=6
@@ -77,7 +77,7 @@ test_ip6gretap()
 	test_vlan_match gt6 'vlan_id 555 vlan_ethtype ipv6' "mirror to ip6gretap"
 }
 
-test_span_gre_forbidden()
+test_span_gre_forbidden_cpu()
 {
 	local tundev=$1; shift
 	local what=$1; shift
@@ -102,14 +102,14 @@ test_span_gre_forbidden()
 	log_test "$what: vlan forbidden at a bridge ($tcflags)"
 }
 
-test_gretap_forbidden()
+test_gretap_forbidden_cpu()
 {
-	test_span_gre_forbidden gt4 "mirror to gretap"
+	test_span_gre_forbidden_cpu gt4 "mirror to gretap"
 }
 
-test_ip6gretap_forbidden()
+test_ip6gretap_forbidden_cpu()
 {
-	test_span_gre_forbidden gt6 "mirror to ip6gretap"
+	test_span_gre_forbidden_cpu gt6 "mirror to ip6gretap"
 }
 
 test_all()
-- 
cgit v1.2.3-59-g8ed1b


From 9c7c8a82442c85bcb70abf572127343dcc7dd038 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 31 May 2018 19:52:42 +0200
Subject: selftests: forwarding: mirror_gre_vlan_bridge_1q: Add more tests

Offloading of mirror-to-gretap in mlxsw is tricky especially in cases
when the gretap underlay involves bridges. Add more tests that exercise
the bridge handling code:

- forbidden_egress tests that check vlan removal on bridge port in the
  underlay packet path
- untagged_egress tests that similarly check "egress untagged"
- fdb_roaming tests that check whether learning FDB on a different port
  is reflected
- stp tests for handling port STP status of bridge egress port

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/forwarding/mirror_gre_vlan_bridge_1q.sh    | 129 +++++++++++++++++++++
 1 file changed, 129 insertions(+)

diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh b/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh
index d91b3475e6df..5dbc7a08f4bd 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh
@@ -12,6 +12,14 @@ ALL_TESTS="
 	test_ip6gretap
 	test_gretap_forbidden_cpu
 	test_ip6gretap_forbidden_cpu
+	test_gretap_forbidden_egress
+	test_ip6gretap_forbidden_egress
+	test_gretap_untagged_egress
+	test_ip6gretap_untagged_egress
+	test_gretap_fdb_roaming
+	test_ip6gretap_fdb_roaming
+	test_gretap_stp
+	test_ip6gretap_stp
 "
 
 NUM_NETIFS=6
@@ -43,12 +51,14 @@ setup_prepare()
 
 	ip link set dev $swp3 master br1
 	bridge vlan add dev $swp3 vid 555
+	bridge vlan add dev $swp2 vid 555
 }
 
 cleanup()
 {
 	pre_cleanup
 
+	ip link set dev $swp2 nomaster
 	ip link set dev $swp3 nomaster
 	vlan_destroy $h3 555
 	vlan_destroy br1 555
@@ -112,6 +122,125 @@ test_ip6gretap_forbidden_cpu()
 	test_span_gre_forbidden_cpu gt6 "mirror to ip6gretap"
 }
 
+test_span_gre_forbidden_egress()
+{
+	local tundev=$1; shift
+	local what=$1; shift
+
+	RET=0
+
+	mirror_install $swp1 ingress $tundev "matchall $tcflags"
+	quick_test_span_gre_dir $tundev ingress
+
+	bridge vlan del dev $swp3 vid 555
+	sleep 1
+	fail_test_span_gre_dir $tundev ingress
+
+	bridge vlan add dev $swp3 vid 555
+	# Re-prime FDB
+	arping -I br1.555 192.0.2.130 -fqc 1
+	sleep 1
+	quick_test_span_gre_dir $tundev ingress
+
+	mirror_uninstall $swp1 ingress
+
+	log_test "$what: vlan forbidden at a bridge egress ($tcflags)"
+}
+
+test_gretap_forbidden_egress()
+{
+	test_span_gre_forbidden_egress gt4 "mirror to gretap"
+}
+
+test_ip6gretap_forbidden_egress()
+{
+	test_span_gre_forbidden_egress gt6 "mirror to ip6gretap"
+}
+
+test_span_gre_untagged_egress()
+{
+	local tundev=$1; shift
+	local what=$1; shift
+
+	RET=0
+
+	mirror_install $swp1 ingress $tundev "matchall $tcflags"
+
+	quick_test_span_gre_dir $tundev ingress
+	quick_test_span_vlan_dir $h3 555 ingress
+
+	bridge vlan add dev $swp3 vid 555 pvid untagged
+	sleep 1
+	quick_test_span_gre_dir $tundev ingress
+	fail_test_span_vlan_dir $h3 555 ingress
+
+	bridge vlan add dev $swp3 vid 555
+	sleep 1
+	quick_test_span_gre_dir $tundev ingress
+	quick_test_span_vlan_dir $h3 555 ingress
+
+	mirror_uninstall $swp1 ingress
+
+	log_test "$what: vlan untagged at a bridge egress ($tcflags)"
+}
+
+test_gretap_untagged_egress()
+{
+	test_span_gre_untagged_egress gt4 "mirror to gretap"
+}
+
+test_ip6gretap_untagged_egress()
+{
+	test_span_gre_untagged_egress gt6 "mirror to ip6gretap"
+}
+
+test_span_gre_fdb_roaming()
+{
+	local tundev=$1; shift
+	local what=$1; shift
+	local h3mac=$(mac_get $h3)
+
+	RET=0
+
+	mirror_install $swp1 ingress $tundev "matchall $tcflags"
+	quick_test_span_gre_dir $tundev ingress
+
+	bridge fdb del dev $swp3 $h3mac vlan 555 master
+	bridge fdb add dev $swp2 $h3mac vlan 555 master
+	sleep 1
+	fail_test_span_gre_dir $tundev ingress
+
+	bridge fdb del dev $swp2 $h3mac vlan 555 master
+	# Re-prime FDB
+	arping -I br1.555 192.0.2.130 -fqc 1
+	sleep 1
+	quick_test_span_gre_dir $tundev ingress
+
+	mirror_uninstall $swp1 ingress
+
+	log_test "$what: MAC roaming ($tcflags)"
+}
+
+test_gretap_fdb_roaming()
+{
+	test_span_gre_fdb_roaming gt4 "mirror to gretap"
+}
+
+test_ip6gretap_fdb_roaming()
+{
+	test_span_gre_fdb_roaming gt6 "mirror to ip6gretap"
+}
+
+test_gretap_stp()
+{
+	full_test_span_gre_stp gt4 $swp3 "mirror to gretap"
+}
+
+test_ip6gretap_stp()
+{
+	full_test_span_gre_stp gt6 $swp3 "mirror to ip6gretap"
+}
+
 test_all()
 {
 	slow_path_trap_install $swp1 ingress
-- 
cgit v1.2.3-59-g8ed1b


From b5b029399fa6d64516e8fcd637f207570c569353 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Thu, 31 May 2018 19:52:47 +0200
Subject: selftests: forwarding: mirror_gre_bridge_1d_vlan: Add STP test

To test offloading of mirror-to-gretap in mlxsw for cases that a
VLAN-unaware bridge is in underlay packet path, test that the STP status
of bridge egress port is reflected.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh    | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh
index 3d47afcf7fb9..3bb4c2ba7b14 100755
--- a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh
@@ -11,6 +11,8 @@
 ALL_TESTS="
 	test_gretap
 	test_ip6gretap
+	test_gretap_stp
+	test_ip6gretap_stp
 "
 
 NUM_NETIFS=6
@@ -80,6 +82,16 @@ test_ip6gretap()
 	test_vlan_match gt6 'vlan_id 555 vlan_ethtype ipv6' "mirror to ip6gretap"
 }
 
+test_gretap_stp()
+{
+	full_test_span_gre_stp gt4 $swp3.555 "mirror to gretap"
+}
+
+test_ip6gretap_stp()
+{
+	full_test_span_gre_stp gt6 $swp3.555 "mirror to ip6gretap"
+}
+
 test_all()
 {
 	slow_path_trap_install $swp1 ingress
-- 
cgit v1.2.3-59-g8ed1b


From 6c251711b37ff14e2507bbc2401ac3ef0935ceb1 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Fri, 1 Jun 2018 17:52:01 +0100
Subject: net: hns3: Disable vf vlan filter when vf vlan table is full

This is only 128 entries for hardware's vf vlan table, when
the vf table is full, the firmware will disable the vf vlan
filter and return a resp_code of HCLGE_VF_VLAN_NO_ENTRY to
driver.

This patch checks the if resp_code from firmware is
HCLGE_VF_VLAN_NO_ENTRY, if yes, then print a warning and
return ok to the caller.

Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support")
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 69166858a6dc..4ca53189d48d 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -4525,9 +4525,16 @@ static int hclge_set_vf_vlan_common(struct hclge_dev *hdev, int vfid,
 	}
 
 	if (!is_kill) {
+#define HCLGE_VF_VLAN_NO_ENTRY	2
 		if (!req0->resp_code || req0->resp_code == 1)
 			return 0;
 
+		if (req0->resp_code == HCLGE_VF_VLAN_NO_ENTRY) {
+			dev_warn(&hdev->pdev->dev,
+				 "vf vlan table is full, vf vlan filter is disabled\n");
+			return 0;
+		}
+
 		dev_err(&hdev->pdev->dev,
 			"Add vf vlan filter fail, ret =%d.\n",
 			req0->resp_code);
-- 
cgit v1.2.3-59-g8ed1b


From 3b75c3df599d0068b382ef1f22396dc5d48c5a74 Mon Sep 17 00:00:00 2001
From: Peng Li <lipeng321@huawei.com>
Date: Fri, 1 Jun 2018 17:52:02 +0100
Subject: net: hns3: Add support for IFF_ALLMULTI flag

This patch adds support for IFF_ALLMULTI flag to HNS3 PF and VF
driver.

Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hnae3.h               |  3 ++-
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c           |  6 ++++--
 drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c        |  2 +-
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c   |  6 ++++--
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c    |  5 +++--
 drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 11 +++++++----
 6 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
index f250c592a218..e8244a51ce71 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -316,7 +316,8 @@ struct hnae3_ae_ops {
 	int (*set_loopback)(struct hnae3_handle *handle,
 			    enum hnae3_loop loop_mode, bool en);
 
-	void (*set_promisc_mode)(struct hnae3_handle *handle, u32 en);
+	void (*set_promisc_mode)(struct hnae3_handle *handle, bool en_uc_pmc,
+				 bool en_mc_pmc);
 	int (*set_mtu)(struct hnae3_handle *handle, int new_mtu);
 
 	void (*get_pauseparam)(struct hnae3_handle *handle,
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 05290129793f..0a6876a9b6d4 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -415,9 +415,11 @@ static void hns3_nic_set_rx_mode(struct net_device *netdev)
 
 	if (h->ae_algo->ops->set_promisc_mode) {
 		if (netdev->flags & IFF_PROMISC)
-			h->ae_algo->ops->set_promisc_mode(h, 1);
+			h->ae_algo->ops->set_promisc_mode(h, true, true);
+		else if (netdev->flags & IFF_ALLMULTI)
+			h->ae_algo->ops->set_promisc_mode(h, false, true);
 		else
-			h->ae_algo->ops->set_promisc_mode(h, 0);
+			h->ae_algo->ops->set_promisc_mode(h, false, false);
 	}
 	if (__dev_uc_sync(netdev, hns3_nic_uc_sync, hns3_nic_uc_unsync))
 		netdev_err(netdev, "sync uc address fail\n");
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
index 8f8cc2413656..40c0425b4023 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
@@ -95,7 +95,7 @@ static int hns3_lp_setup(struct net_device *ndev, enum hnae3_loop loop, bool en)
 	if (ret)
 		return ret;
 
-	h->ae_algo->ops->set_promisc_mode(h, en);
+	h->ae_algo->ops->set_promisc_mode(h, en, en);
 
 	return ret;
 }
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 4ca53189d48d..18027bc543c1 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -3580,13 +3580,15 @@ void hclge_promisc_param_init(struct hclge_promisc_param *param, bool en_uc,
 	param->vf_id = vport_id;
 }
 
-static void hclge_set_promisc_mode(struct hnae3_handle *handle, u32 en)
+static void hclge_set_promisc_mode(struct hnae3_handle *handle, bool en_uc_pmc,
+				   bool en_mc_pmc)
 {
 	struct hclge_vport *vport = hclge_get_vport(handle);
 	struct hclge_dev *hdev = vport->back;
 	struct hclge_promisc_param param;
 
-	hclge_promisc_param_init(&param, en, en, true, vport->vport_id);
+	hclge_promisc_param_init(&param, en_uc_pmc, en_mc_pmc, true,
+				 vport->vport_id);
 	hclge_cmd_set_promisc_mode(hdev, &param);
 }
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
index 31f3d9a43d8d..d299805c430b 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
@@ -190,11 +190,12 @@ static int hclge_map_unmap_ring_to_vf_vector(struct hclge_vport *vport, bool en,
 static int hclge_set_vf_promisc_mode(struct hclge_vport *vport,
 				     struct hclge_mbx_vf_to_pf_cmd *req)
 {
-	bool en = req->msg[1] ? true : false;
+	bool en_uc = req->msg[1] ? true : false;
+	bool en_mc = req->msg[2] ? true : false;
 	struct hclge_promisc_param param;
 
 	/* always enable broadcast promisc bit */
-	hclge_promisc_param_init(&param, en, en, true, vport->vport_id);
+	hclge_promisc_param_init(&param, en_uc, en_mc, true, vport->vport_id);
 	return hclge_cmd_set_promisc_mode(vport->back, &param);
 }
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
index 266cdcba506e..e28e0db0db5f 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
@@ -654,7 +654,8 @@ static int hclgevf_put_vector(struct hnae3_handle *handle, int vector)
 	return 0;
 }
 
-static int hclgevf_cmd_set_promisc_mode(struct hclgevf_dev *hdev, u32 en)
+static int hclgevf_cmd_set_promisc_mode(struct hclgevf_dev *hdev,
+					bool en_uc_pmc, bool en_mc_pmc)
 {
 	struct hclge_mbx_vf_to_pf_cmd *req;
 	struct hclgevf_desc desc;
@@ -664,7 +665,8 @@ static int hclgevf_cmd_set_promisc_mode(struct hclgevf_dev *hdev, u32 en)
 
 	hclgevf_cmd_setup_basic_desc(&desc, HCLGEVF_OPC_MBX_VF_TO_PF, false);
 	req->msg[0] = HCLGE_MBX_SET_PROMISC_MODE;
-	req->msg[1] = en;
+	req->msg[1] = en_uc_pmc ? 1 : 0;
+	req->msg[2] = en_mc_pmc ? 1 : 0;
 
 	status = hclgevf_cmd_send(&hdev->hw, &desc, 1);
 	if (status)
@@ -674,11 +676,12 @@ static int hclgevf_cmd_set_promisc_mode(struct hclgevf_dev *hdev, u32 en)
 	return status;
 }
 
-static void hclgevf_set_promisc_mode(struct hnae3_handle *handle, u32 en)
+static void hclgevf_set_promisc_mode(struct hnae3_handle *handle,
+				     bool en_uc_pmc, bool en_mc_pmc)
 {
 	struct hclgevf_dev *hdev = hclgevf_ae_get_hdev(handle);
 
-	hclgevf_cmd_set_promisc_mode(hdev, en);
+	hclgevf_cmd_set_promisc_mode(hdev, en_uc_pmc, en_mc_pmc);
 }
 
 static int hclgevf_tqp_enable(struct hclgevf_dev *hdev, int tqp_id,
-- 
cgit v1.2.3-59-g8ed1b


From 5ec2a51ef8e2cb1fd1f04a8a7170001ab94d36fe Mon Sep 17 00:00:00 2001
From: Jian Shen <shenjian15@huawei.com>
Date: Fri, 1 Jun 2018 17:52:03 +0100
Subject: net: hns3: Add repeat address checking for setting mac address

Add checking for new mac address. It doesn't need to config
the mac vlan table if it's already in use.

Signed-off-by: Jian Shen <shenjian15@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 0a6876a9b6d4..fe54564abdd5 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -1132,6 +1132,12 @@ static int hns3_nic_net_set_mac_address(struct net_device *netdev, void *p)
 	if (!mac_addr || !is_valid_ether_addr((const u8 *)mac_addr->sa_data))
 		return -EADDRNOTAVAIL;
 
+	if (ether_addr_equal(netdev->dev_addr, mac_addr->sa_data)) {
+		netdev_info(netdev, "already using mac address %pM\n",
+			    mac_addr->sa_data);
+		return 0;
+	}
+
 	ret = h->ae_algo->ops->set_mac_addr(h, mac_addr->sa_data, false);
 	if (ret) {
 		netdev_err(netdev, "set_mac_address fail, ret=%d!\n", ret);
-- 
cgit v1.2.3-59-g8ed1b


From c7fc8fb6193428e107a4d51333c158bdc226b26c Mon Sep 17 00:00:00 2001
From: Jian Shen <shenjian15@huawei.com>
Date: Fri, 1 Jun 2018 17:52:04 +0100
Subject: net: hns3: Fix setting mac address error

When doing function reset or insmod hns3 dirver after rmmod,
the entries of mac vlan table are not cleared, which may cause
init mac address failed. This patch fixes it by clearing the
old mac address when doing function reset or rmmod hns3 driver.

Fixes: 76ad4f0ee747 ("net: hns3: Add support of HNS3 Ethernet Driver for hip08 SoC")
Signed-off-by: Jian Shen <shenjian15@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index fe54564abdd5..235eea1393bb 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -3007,6 +3007,15 @@ static void hns3_init_mac_addr(struct net_device *netdev, bool init)
 
 }
 
+static void hns3_uninit_mac_addr(struct net_device *netdev)
+{
+	struct hns3_nic_priv *priv = netdev_priv(netdev);
+	struct hnae3_handle *h = priv->ae_handle;
+
+	if (h->ae_algo->ops->rm_uc_addr)
+		h->ae_algo->ops->rm_uc_addr(h, netdev->dev_addr);
+}
+
 static void hns3_nic_set_priv_ops(struct net_device *netdev)
 {
 	struct hns3_nic_priv *priv = netdev_priv(netdev);
@@ -3135,6 +3144,8 @@ static void hns3_client_uninit(struct hnae3_handle *handle, bool reset)
 
 	priv->ring_data = NULL;
 
+	hns3_uninit_mac_addr(netdev);
+
 	free_netdev(netdev);
 }
 
@@ -3451,6 +3462,8 @@ static int hns3_reset_notify_uninit_enet(struct hnae3_handle *handle)
 
 	priv->ring_data = NULL;
 
+	hns3_uninit_mac_addr(netdev);
+
 	return ret;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From f5be79673fc4c925708c99ec37d77e0a2c3cd30b Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Fri, 1 Jun 2018 17:52:05 +0100
Subject: net: hns3: Fix for service_task not running problem after resetting

When hclge_ae_stop is called during resetting, it will cancel the
service_task by calling cancel_work_sync, which may cause the
service_task to exit without clearing HCLGE_STATE_SERVICE_SCHED
bit. If this happens, the service_task will never run again.

This patch fixes this problem by clearing it after calling
cancel_work_sync in hclge_ae_stop.

Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support")
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c   | 1 +
 drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 18027bc543c1..746987f4ebbc 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -3748,6 +3748,7 @@ static void hclge_ae_stop(struct hnae3_handle *handle)
 
 	del_timer_sync(&hdev->service_timer);
 	cancel_work_sync(&hdev->service_task);
+	clear_bit(HCLGE_STATE_SERVICE_SCHED, &hdev->state);
 
 	if (test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state))
 		return;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
index e28e0db0db5f..5d28052b3ea5 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
@@ -1337,6 +1337,7 @@ static void hclgevf_ae_stop(struct hnae3_handle *handle)
 	hclgevf_reset_tqp_stats(handle);
 	del_timer_sync(&hdev->service_timer);
 	cancel_work_sync(&hdev->service_task);
+	clear_bit(HCLGEVF_STATE_SERVICE_SCHED, &hdev->state);
 	hclgevf_update_link_status(hdev, 0);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From cd8c5c269b1d807028e939293002c989e7c07e51 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Fri, 1 Jun 2018 17:52:06 +0100
Subject: net: hns3: Fix for hclge_reset running repeatly problem

When hardware sends the HCLGE_VECTOR0_EVENT_RST event through
hclge_misc_irq_handle, currently driver enables misc_vector in
the interrupt handle, and hardware generates the same interrupt
for the same reset event again and again until the reset is
complete, which causes hclge_reset running repeatly problem.

This patch fixes by enabling the misc_vector after reset is
complete.

Fixes: 4ed340ab8f49 ("net: hns3: Add reset process in hclge_main")
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    | 40 ++++++++++++++++++----
 1 file changed, 34 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 746987f4ebbc..fb44b1ec4669 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -2587,9 +2587,11 @@ static irqreturn_t hclge_misc_irq_handle(int irq, void *data)
 		break;
 	}
 
-	/* we should clear the source of interrupt */
-	hclge_clear_event_cause(hdev, event_cause, clearval);
-	hclge_enable_vector(&hdev->misc_vector, true);
+	/* clear the source of interrupt if it is not cause by reset */
+	if (event_cause != HCLGE_VECTOR0_EVENT_RST) {
+		hclge_clear_event_cause(hdev, event_cause, clearval);
+		hclge_enable_vector(&hdev->misc_vector, true);
+	}
 
 	return IRQ_HANDLED;
 }
@@ -2777,6 +2779,33 @@ static enum hnae3_reset_type hclge_get_reset_level(struct hclge_dev *hdev,
 	return rst_level;
 }
 
+static void hclge_clear_reset_cause(struct hclge_dev *hdev)
+{
+	u32 clearval = 0;
+
+	switch (hdev->reset_type) {
+	case HNAE3_IMP_RESET:
+		clearval = BIT(HCLGE_VECTOR0_IMPRESET_INT_B);
+		break;
+	case HNAE3_GLOBAL_RESET:
+		clearval = BIT(HCLGE_VECTOR0_GLOBALRESET_INT_B);
+		break;
+	case HNAE3_CORE_RESET:
+		clearval = BIT(HCLGE_VECTOR0_CORERESET_INT_B);
+		break;
+	default:
+		dev_warn(&hdev->pdev->dev, "Unsupported reset event to clear:%d",
+			 hdev->reset_type);
+		break;
+	}
+
+	if (!clearval)
+		return;
+
+	hclge_write_dev(&hdev->hw, HCLGE_MISC_RESET_STS_REG, clearval);
+	hclge_enable_vector(&hdev->misc_vector, true);
+}
+
 static void hclge_reset(struct hclge_dev *hdev)
 {
 	/* perform reset of the stack & ae device for a client */
@@ -2789,6 +2818,8 @@ static void hclge_reset(struct hclge_dev *hdev)
 		hclge_reset_ae_dev(hdev->ae_dev);
 		hclge_notify_client(hdev, HNAE3_INIT_CLIENT);
 		rtnl_unlock();
+
+		hclge_clear_reset_cause(hdev);
 	} else {
 		/* schedule again to check pending resets later */
 		set_bit(hdev->reset_type, &hdev->reset_pending);
@@ -5661,9 +5692,6 @@ static int hclge_reset_ae_dev(struct hnae3_ae_dev *ae_dev)
 		return ret;
 	}
 
-	/* Enable MISC vector(vector0) */
-	hclge_enable_vector(&hdev->misc_vector, true);
-
 	dev_info(&pdev->dev, "Reset done, %s driver initialization finished.\n",
 		 HCLGE_DRIVER_NAME);
 
-- 
cgit v1.2.3-59-g8ed1b


From 9617f66867b09b326cc932416be2431c5b91c8d8 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Fri, 1 Jun 2018 17:52:07 +0100
Subject: net: hns3: Fix for phy not link up problem after resetting

When resetting, phy_state_machine may be accessing the phy through
firmware if the phy is not stopped or disconnected, which will
cause firemware timeout problem because the firmware is busy
processing the reset request.

This patch fixes it by disabling the phy when resetting.

Fixes: b940aeae0ed6 ("net: hns3: never send command queue message to IMP when reset")
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index fb44b1ec4669..58fef5e56831 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -3761,9 +3761,6 @@ static int hclge_ae_start(struct hnae3_handle *handle)
 	/* reset tqp stats */
 	hclge_reset_tqp_stats(handle);
 
-	if (test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state))
-		return 0;
-
 	ret = hclge_mac_start_phy(hdev);
 	if (ret)
 		return ret;
@@ -3781,8 +3778,10 @@ static void hclge_ae_stop(struct hnae3_handle *handle)
 	cancel_work_sync(&hdev->service_task);
 	clear_bit(HCLGE_STATE_SERVICE_SCHED, &hdev->state);
 
-	if (test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state))
+	if (test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state)) {
+		hclge_mac_stop_phy(hdev);
 		return;
+	}
 
 	for (i = 0; i < vport->alloc_tqps; i++)
 		hclge_tqp_enable(hdev, i, 0, false);
-- 
cgit v1.2.3-59-g8ed1b


From f0ad97ac12f0e91cf3846025256ca97845bcfccd Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Fri, 1 Jun 2018 17:52:08 +0100
Subject: net: hns3: Add missing break in misc_irq_handle

There is a break missing in the switch/case handling in
hclge_misc_irq_handle, which causes the log to output
uncorrectly.

This patch adds the missing break, and change the dev_dbg
to dev_warn in order to better catch the error.

Fixes: c1a81619d73a ("net: hns3: Add mailbox interrupt handling to PF driver")
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 58fef5e56831..19e56896887d 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -2580,10 +2580,10 @@ static irqreturn_t hclge_misc_irq_handle(int irq, void *data)
 		 * mbx messages reported by this interrupt.
 		 */
 		hclge_mbx_task_schedule(hdev);
-
+		break;
 	default:
-		dev_dbg(&hdev->pdev->dev,
-			"received unknown or unhandled event of vector0\n");
+		dev_warn(&hdev->pdev->dev,
+			 "received unknown or unhandled event of vector0\n");
 		break;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 3db084d28dc04e6ebe9123e0d4f6e8ef5a775ff0 Mon Sep 17 00:00:00 2001
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Fri, 1 Jun 2018 17:52:09 +0100
Subject: net: hns3: Fix for vxlan tx checksum bug

when skb->encapsulation is 0, skb->ip_summed is CHECKSUM_PARTIAL
and it is udp packet, which has a dest port as the IANA assigned.
the hardware is expected to do the checksum offload, but the
hardware will not do the checksum offload when udp dest port is
4789.

This patch fixes it by doing the checksum in software.

Fixes: 76ad4f0ee747 ("net: hns3: Add support of HNS3 Ethernet Driver for hip08 SoC")
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 29 +++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 235eea1393bb..e572804169cd 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -655,6 +655,32 @@ static void hns3_set_l2l3l4_len(struct sk_buff *skb, u8 ol4_proto,
 	}
 }
 
+/* when skb->encapsulation is 0, skb->ip_summed is CHECKSUM_PARTIAL
+ * and it is udp packet, which has a dest port as the IANA assigned.
+ * the hardware is expected to do the checksum offload, but the
+ * hardware will not do the checksum offload when udp dest port is
+ * 4789.
+ */
+static bool hns3_tunnel_csum_bug(struct sk_buff *skb)
+{
+#define IANA_VXLAN_PORT	4789
+	union {
+		struct tcphdr *tcp;
+		struct udphdr *udp;
+		struct gre_base_hdr *gre;
+		unsigned char *hdr;
+	} l4;
+
+	l4.hdr = skb_transport_header(skb);
+
+	if (!(!skb->encapsulation && l4.udp->dest == htons(IANA_VXLAN_PORT)))
+		return false;
+
+	skb_checksum_help(skb);
+
+	return true;
+}
+
 static int hns3_set_l3l4_type_csum(struct sk_buff *skb, u8 ol4_proto,
 				   u8 il4_proto, u32 *type_cs_vlan_tso,
 				   u32 *ol_type_vlan_len_msec)
@@ -743,6 +769,9 @@ static int hns3_set_l3l4_type_csum(struct sk_buff *skb, u8 ol4_proto,
 			       HNS3_L4T_TCP);
 		break;
 	case IPPROTO_UDP:
+		if (hns3_tunnel_csum_bug(skb))
+			break;
+
 		hnae_set_field(*type_cs_vlan_tso,
 			       HNS3_TXD_L4T_M,
 			       HNS3_TXD_L4T_S,
-- 
cgit v1.2.3-59-g8ed1b


From 40cca1c587c1c39fcc7fa1b2c5d315d72361dfe1 Mon Sep 17 00:00:00 2001
From: Xi Wang <wangxi11@huawei.com>
Date: Fri, 1 Jun 2018 17:52:10 +0100
Subject: net: hns3: Optimize the PF's process of updating multicast MAC

In the current process, the multicast MAC is added to both MAC_VLAN
table and MTA table, this will reduce the utilization of the resource.

This patch improves the process of adding multicast MAC address, the
new process starts using the MTA table to add multicast MAC after the
MAC_VLAN table is full, and the MTA is disable if it is no longer used.

Signed-off-by: Xi Wang <wangxi11@huawei.com>
Reviewed-by: Jian Shen <shenjian15@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hnae3.h        |   1 +
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c    |   6 +-
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c    | 138 ++++++++++++++++++---
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h    |  12 +-
 4 files changed, 136 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
index e8244a51ce71..8acb1d116a02 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -353,6 +353,7 @@ struct hnae3_ae_ops {
 			   const unsigned char *addr);
 	int (*rm_mc_addr)(struct hnae3_handle *handle,
 			  const unsigned char *addr);
+	int (*update_mta_status)(struct hnae3_handle *handle);
 
 	void (*set_tso_stats)(struct hnae3_handle *handle, int enable);
 	void (*update_stats)(struct hnae3_handle *handle,
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index e572804169cd..f2b31d278bc9 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -423,9 +423,13 @@ static void hns3_nic_set_rx_mode(struct net_device *netdev)
 	}
 	if (__dev_uc_sync(netdev, hns3_nic_uc_sync, hns3_nic_uc_unsync))
 		netdev_err(netdev, "sync uc address fail\n");
-	if (netdev->flags & IFF_MULTICAST)
+	if (netdev->flags & IFF_MULTICAST) {
 		if (__dev_mc_sync(netdev, hns3_nic_mc_sync, hns3_nic_mc_unsync))
 			netdev_err(netdev, "sync mc address fail\n");
+
+		if (h->ae_algo->ops->update_mta_status)
+			h->ae_algo->ops->update_mta_status(h);
+	}
 }
 
 static int hns3_set_tso(struct sk_buff *skb, u32 *paylen,
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 19e56896887d..2a801344eafb 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -2288,8 +2288,10 @@ static int hclge_mac_init(struct hclge_dev *hdev)
 	struct net_device *netdev = handle->kinfo.netdev;
 	struct hclge_mac *mac = &hdev->hw.mac;
 	u8 mac_mask[ETH_ALEN] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+	struct hclge_vport *vport;
 	int mtu;
 	int ret;
+	int i;
 
 	ret = hclge_cfg_mac_speed_dup(hdev, hdev->hw.mac.speed, HCLGE_MAC_FULL);
 	if (ret) {
@@ -2301,7 +2303,6 @@ static int hclge_mac_init(struct hclge_dev *hdev)
 	mac->link = 0;
 
 	/* Initialize the MTA table work mode */
-	hdev->accept_mta_mc	= true;
 	hdev->enable_mta	= true;
 	hdev->mta_mac_sel_type	= HCLGE_MAC_ADDR_47_36;
 
@@ -2314,11 +2315,17 @@ static int hclge_mac_init(struct hclge_dev *hdev)
 		return ret;
 	}
 
-	ret = hclge_cfg_func_mta_filter(hdev, 0, hdev->accept_mta_mc);
-	if (ret) {
-		dev_err(&hdev->pdev->dev,
-			"set mta filter mode fail ret=%d\n", ret);
-		return ret;
+	for (i = 0; i < hdev->num_alloc_vport; i++) {
+		vport = &hdev->vport[i];
+		vport->accept_mta_mc = false;
+
+		memset(vport->mta_shadow, 0, sizeof(vport->mta_shadow));
+		ret = hclge_cfg_func_mta_filter(hdev, vport->vport_id, false);
+		if (ret) {
+			dev_err(&hdev->pdev->dev,
+				"set mta filter mode fail ret=%d\n", ret);
+			return ret;
+		}
 	}
 
 	ret = hclge_set_default_mac_vlan_mask(hdev, true, mac_mask);
@@ -4005,9 +4012,88 @@ static int hclge_set_mta_table_item(struct hclge_vport *vport,
 		return ret;
 	}
 
+	if (enable)
+		set_bit(idx, vport->mta_shadow);
+	else
+		clear_bit(idx, vport->mta_shadow);
+
 	return 0;
 }
 
+static int hclge_update_mta_status(struct hnae3_handle *handle)
+{
+	unsigned long mta_status[BITS_TO_LONGS(HCLGE_MTA_TBL_SIZE)];
+	struct hclge_vport *vport = hclge_get_vport(handle);
+	struct net_device *netdev = handle->kinfo.netdev;
+	struct netdev_hw_addr *ha;
+	u16 tbl_idx;
+
+	memset(mta_status, 0, sizeof(mta_status));
+
+	/* update mta_status from mc addr list */
+	netdev_for_each_mc_addr(ha, netdev) {
+		tbl_idx = hclge_get_mac_addr_to_mta_index(vport, ha->addr);
+		set_bit(tbl_idx, mta_status);
+	}
+
+	return hclge_update_mta_status_common(vport, mta_status,
+					0, HCLGE_MTA_TBL_SIZE, true);
+}
+
+int hclge_update_mta_status_common(struct hclge_vport *vport,
+				   unsigned long *status,
+				   u16 idx,
+				   u16 count,
+				   bool update_filter)
+{
+	struct hclge_dev *hdev = vport->back;
+	u16 update_max = idx + count;
+	u16 check_max;
+	int ret = 0;
+	bool used;
+	u16 i;
+
+	/* setup mta check range */
+	if (update_filter) {
+		i = 0;
+		check_max = HCLGE_MTA_TBL_SIZE;
+	} else {
+		i = idx;
+		check_max = update_max;
+	}
+
+	used = false;
+	/* check and update all mta item */
+	for (; i < check_max; i++) {
+		/* ignore unused item */
+		if (!test_bit(i, vport->mta_shadow))
+			continue;
+
+		/* if i in update range then update it */
+		if (i >= idx && i < update_max)
+			if (!test_bit(i - idx, status))
+				hclge_set_mta_table_item(vport, i, false);
+
+		if (!used && test_bit(i, vport->mta_shadow))
+			used = true;
+	}
+
+	/* no longer use mta, disable it */
+	if (vport->accept_mta_mc && update_filter && !used) {
+		ret = hclge_cfg_func_mta_filter(hdev,
+						vport->vport_id,
+						false);
+		if (ret)
+			dev_err(&hdev->pdev->dev,
+				"disable func mta filter fail ret=%d\n",
+				ret);
+		else
+			vport->accept_mta_mc = false;
+	}
+
+	return ret;
+}
+
 static int hclge_remove_mac_vlan_tbl(struct hclge_vport *vport,
 				     struct hclge_mac_vlan_tbl_entry_cmd *req)
 {
@@ -4275,9 +4361,25 @@ int hclge_add_mc_addr_common(struct hclge_vport *vport,
 		status = hclge_add_mac_vlan_tbl(vport, &req, desc);
 	}
 
-	/* Set MTA table for this MAC address */
-	tbl_idx = hclge_get_mac_addr_to_mta_index(vport, addr);
-	status = hclge_set_mta_table_item(vport, tbl_idx, true);
+	/* If mc mac vlan table is full, use MTA table */
+	if (status == -ENOSPC) {
+		if (!vport->accept_mta_mc) {
+			status = hclge_cfg_func_mta_filter(hdev,
+							   vport->vport_id,
+							   true);
+			if (status) {
+				dev_err(&hdev->pdev->dev,
+					"set mta filter mode fail ret=%d\n",
+					status);
+				return status;
+			}
+			vport->accept_mta_mc = true;
+		}
+
+		/* Set MTA table for this MAC address */
+		tbl_idx = hclge_get_mac_addr_to_mta_index(vport, addr);
+		status = hclge_set_mta_table_item(vport, tbl_idx, true);
+	}
 
 	return status;
 }
@@ -4297,7 +4399,6 @@ int hclge_rm_mc_addr_common(struct hclge_vport *vport,
 	struct hclge_mac_vlan_tbl_entry_cmd req;
 	enum hclge_cmd_status status;
 	struct hclge_desc desc[3];
-	u16 tbl_idx;
 
 	/* mac addr check */
 	if (!is_multicast_ether_addr(addr)) {
@@ -4326,17 +4427,15 @@ int hclge_rm_mc_addr_common(struct hclge_vport *vport,
 			status = hclge_add_mac_vlan_tbl(vport, &req, desc);
 
 	} else {
-		/* This mac addr do not exist, can't delete it */
-		dev_err(&hdev->pdev->dev,
-			"Rm multicast mac addr failed, ret = %d.\n",
-			status);
-		return -EIO;
+		/* Maybe this mac address is in mta table, but it cannot be
+		 * deleted here because an entry of mta represents an address
+		 * range rather than a specific address. the delete action to
+		 * all entries will take effect in update_mta_status called by
+		 * hns3_nic_set_rx_mode.
+		 */
+		status = 0;
 	}
 
-	/* Set MTB table for this MAC address */
-	tbl_idx = hclge_get_mac_addr_to_mta_index(vport, addr);
-	status = hclge_set_mta_table_item(vport, tbl_idx, false);
-
 	return status;
 }
 
@@ -6137,6 +6236,7 @@ static const struct hnae3_ae_ops hclge_ops = {
 	.rm_uc_addr = hclge_rm_uc_addr,
 	.add_mc_addr = hclge_add_mc_addr,
 	.rm_mc_addr = hclge_rm_mc_addr,
+	.update_mta_status = hclge_update_mta_status,
 	.set_autoneg = hclge_set_autoneg,
 	.get_autoneg = hclge_get_autoneg,
 	.get_pauseparam = hclge_get_pauseparam,
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
index 7fcabdeb4ce9..7488534528cd 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
@@ -61,6 +61,8 @@
 #define HCLGE_RSS_TC_SIZE_6		64
 #define HCLGE_RSS_TC_SIZE_7		128
 
+#define HCLGE_MTA_TBL_SIZE		4096
+
 #define HCLGE_TQP_RESET_TRY_TIMES	10
 
 #define HCLGE_PHY_PAGE_MDIX		0
@@ -559,7 +561,6 @@ struct hclge_dev {
 
 	enum hclge_mta_dmac_sel_type mta_mac_sel_type;
 	bool enable_mta; /* Mutilcast filter enable */
-	bool accept_mta_mc; /* Whether accept mta filter multicast */
 
 	struct hclge_vlan_type_cfg vlan_type_cfg;
 
@@ -620,6 +621,9 @@ struct hclge_vport {
 	struct hclge_dev *back;  /* Back reference to associated dev */
 	struct hnae3_handle nic;
 	struct hnae3_handle roce;
+
+	bool accept_mta_mc; /* whether to accept mta filter multicast */
+	unsigned long mta_shadow[BITS_TO_LONGS(HCLGE_MTA_TBL_SIZE)];
 };
 
 void hclge_promisc_param_init(struct hclge_promisc_param *param, bool en_uc,
@@ -637,6 +641,12 @@ int hclge_rm_mc_addr_common(struct hclge_vport *vport,
 int hclge_cfg_func_mta_filter(struct hclge_dev *hdev,
 			      u8 func_id,
 			      bool enable);
+int hclge_update_mta_status_common(struct hclge_vport *vport,
+				   unsigned long *status,
+				   u16 idx,
+				   u16 count,
+				   bool update_filter);
+
 struct hclge_vport *hclge_get_vport(struct hnae3_handle *handle);
 int hclge_bind_ring_with_vector(struct hclge_vport *vport,
 				int vector_id, bool en,
-- 
cgit v1.2.3-59-g8ed1b


From 3a678b5806e66d0b75086bf423ecaf80ff0237c7 Mon Sep 17 00:00:00 2001
From: Xi Wang <wangxi11@huawei.com>
Date: Fri, 1 Jun 2018 17:52:11 +0100
Subject: net: hns3: Optimize the VF's process of updating multicast MAC

In the update flow of the new PF driver, if a multicast address is in mta
table, the VF deletion action will not take effect.

This patch adds the VF adaptation according to the new flow of PF'driver.

Signed-off-by: Xi Wang <wangxi11@huawei.com>
Reviewed-by: Jian Shen <shenjian15@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hclge_mbx.h    |   2 +
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c |  58 +++++++++-
 .../ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c  | 128 ++++++++++++++++++++-
 .../ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h  |   4 +
 4 files changed, 187 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hclge_mbx.h b/drivers/net/ethernet/hisilicon/hns3/hclge_mbx.h
index 519e2bd6aa60..be9dc08ccf67 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hclge_mbx.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hclge_mbx.h
@@ -47,6 +47,8 @@ enum hclge_mbx_mac_vlan_subcode {
 	HCLGE_MBX_MAC_VLAN_MC_ADD,		/* add new MC mac addr */
 	HCLGE_MBX_MAC_VLAN_MC_REMOVE,		/* remove MC mac addr */
 	HCLGE_MBX_MAC_VLAN_MC_FUNC_MTA_ENABLE,	/* config func MTA enable */
+	HCLGE_MBX_MAC_VLAN_MTA_TYPE_READ,	/* read func MTA type */
+	HCLGE_MBX_MAC_VLAN_MTA_STATUS_UPDATE,	/* update MTA status */
 };
 
 /* below are per-VF vlan cfg subcodes */
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
index d299805c430b..7541cb9b71ce 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
@@ -231,12 +231,51 @@ static int hclge_set_vf_uc_mac_addr(struct hclge_vport *vport,
 	return 0;
 }
 
+static int hclge_set_vf_mc_mta_status(struct hclge_vport *vport,
+				      u8 *msg, u8 idx, bool is_end)
+{
+#define HCLGE_MTA_STATUS_MSG_SIZE 13
+#define HCLGE_MTA_STATUS_MSG_BITS \
+				(HCLGE_MTA_STATUS_MSG_SIZE * BITS_PER_BYTE)
+#define HCLGE_MTA_STATUS_MSG_END_BITS \
+				(HCLGE_MTA_TBL_SIZE % HCLGE_MTA_STATUS_MSG_BITS)
+	unsigned long status[BITS_TO_LONGS(HCLGE_MTA_STATUS_MSG_BITS)];
+	u16 tbl_cnt;
+	u16 tbl_idx;
+	u8 msg_ofs;
+	u8 msg_bit;
+
+	tbl_cnt = is_end ? HCLGE_MTA_STATUS_MSG_END_BITS :
+			HCLGE_MTA_STATUS_MSG_BITS;
+
+	/* set msg field */
+	msg_ofs = 0;
+	msg_bit = 0;
+	memset(status, 0, sizeof(status));
+	for (tbl_idx = 0; tbl_idx < tbl_cnt; tbl_idx++) {
+		if (msg[msg_ofs] & BIT(msg_bit))
+			set_bit(tbl_idx, status);
+
+		msg_bit++;
+		if (msg_bit == BITS_PER_BYTE) {
+			msg_bit = 0;
+			msg_ofs++;
+		}
+	}
+
+	return hclge_update_mta_status_common(vport,
+					status, idx * HCLGE_MTA_STATUS_MSG_BITS,
+					tbl_cnt, is_end);
+}
+
 static int hclge_set_vf_mc_mac_addr(struct hclge_vport *vport,
 				    struct hclge_mbx_vf_to_pf_cmd *mbx_req,
 				    bool gen_resp)
 {
 	const u8 *mac_addr = (const u8 *)(&mbx_req->msg[2]);
 	struct hclge_dev *hdev = vport->back;
+	u8 resp_len = 0;
+	u8 resp_data;
 	int status;
 
 	if (mbx_req->msg[1] == HCLGE_MBX_MAC_VLAN_MC_ADD) {
@@ -248,6 +287,22 @@ static int hclge_set_vf_mc_mac_addr(struct hclge_vport *vport,
 		bool enable = mbx_req->msg[2];
 
 		status = hclge_cfg_func_mta_filter(hdev, func_id, enable);
+	} else if (mbx_req->msg[1] == HCLGE_MBX_MAC_VLAN_MTA_TYPE_READ) {
+		resp_data = hdev->mta_mac_sel_type;
+		resp_len = sizeof(u8);
+		gen_resp = true;
+		status = 0;
+	} else if (mbx_req->msg[1] == HCLGE_MBX_MAC_VLAN_MTA_STATUS_UPDATE) {
+		/* mta status update msg format
+		 * msg[2.6 : 2.0]  msg index
+		 * msg[2.7]        msg is end
+		 * msg[15 : 3]     mta status bits[103 : 0]
+		 */
+		bool is_end = (mbx_req->msg[2] & 0x80) ? true : false;
+
+		status = hclge_set_vf_mc_mta_status(vport, &mbx_req->msg[3],
+						    mbx_req->msg[2] & 0x7F,
+						    is_end);
 	} else {
 		dev_err(&hdev->pdev->dev,
 			"failed to set mcast mac addr, unknown subcode %d\n",
@@ -256,7 +311,8 @@ static int hclge_set_vf_mc_mac_addr(struct hclge_vport *vport,
 	}
 
 	if (gen_resp)
-		hclge_gen_resp_to_vf(vport, mbx_req, status, NULL, 0);
+		hclge_gen_resp_to_vf(vport, mbx_req, status,
+				     &resp_data, resp_len);
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
index 5d28052b3ea5..dd8e8e6718dc 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
@@ -739,6 +739,126 @@ static int hclgevf_cfg_func_mta_filter(struct hnae3_handle *handle, bool en)
 				    msg, 1, false, NULL, 0);
 }
 
+static int hclgevf_cfg_func_mta_type(struct hclgevf_dev *hdev)
+{
+	u8 resp_msg = HCLGEVF_MTA_TYPE_SEL_MAX;
+	int ret;
+
+	ret = hclgevf_send_mbx_msg(hdev, HCLGE_MBX_SET_MULTICAST,
+				   HCLGE_MBX_MAC_VLAN_MTA_TYPE_READ,
+				   NULL, 0, true, &resp_msg, sizeof(u8));
+
+	if (ret) {
+		dev_err(&hdev->pdev->dev,
+			"Read mta type fail, ret=%d.\n", ret);
+		return ret;
+	}
+
+	if (resp_msg > HCLGEVF_MTA_TYPE_SEL_MAX) {
+		dev_err(&hdev->pdev->dev,
+			"Read mta type invalid, resp=%d.\n", resp_msg);
+		return -EINVAL;
+	}
+
+	hdev->mta_mac_sel_type = resp_msg;
+
+	return 0;
+}
+
+static u16 hclgevf_get_mac_addr_to_mta_index(struct hclgevf_dev *hdev,
+					     const u8 *addr)
+{
+	u32 rsh = HCLGEVF_MTA_TYPE_SEL_MAX - hdev->mta_mac_sel_type;
+	u16 high_val = addr[1] | (addr[0] << 8);
+
+	return (high_val >> rsh) & 0xfff;
+}
+
+static int hclgevf_do_update_mta_status(struct hclgevf_dev *hdev,
+					unsigned long *status)
+{
+#define HCLGEVF_MTA_STATUS_MSG_SIZE 13
+#define HCLGEVF_MTA_STATUS_MSG_BITS \
+			(HCLGEVF_MTA_STATUS_MSG_SIZE * BITS_PER_BYTE)
+#define HCLGEVF_MTA_STATUS_MSG_END_BITS \
+			(HCLGEVF_MTA_TBL_SIZE % HCLGEVF_MTA_STATUS_MSG_BITS)
+	u16 tbl_cnt;
+	u16 tbl_idx;
+	u8 msg_cnt;
+	u8 msg_idx;
+	int ret;
+
+	msg_cnt = DIV_ROUND_UP(HCLGEVF_MTA_TBL_SIZE,
+			       HCLGEVF_MTA_STATUS_MSG_BITS);
+	tbl_idx = 0;
+	msg_idx = 0;
+	while (msg_cnt--) {
+		u8 msg[HCLGEVF_MTA_STATUS_MSG_SIZE + 1];
+		u8 *p = &msg[1];
+		u8 msg_ofs;
+		u8 msg_bit;
+
+		memset(msg, 0, sizeof(msg));
+
+		/* set index field */
+		msg[0] = 0x7F & msg_idx;
+
+		/* set end flag field */
+		if (msg_cnt == 0) {
+			msg[0] |= 0x80;
+			tbl_cnt = HCLGEVF_MTA_STATUS_MSG_END_BITS;
+		} else {
+			tbl_cnt = HCLGEVF_MTA_STATUS_MSG_BITS;
+		}
+
+		/* set status field */
+		msg_ofs = 0;
+		msg_bit = 0;
+		while (tbl_cnt--) {
+			if (test_bit(tbl_idx, status))
+				p[msg_ofs] |= BIT(msg_bit);
+
+			tbl_idx++;
+
+			msg_bit++;
+			if (msg_bit == BITS_PER_BYTE) {
+				msg_bit = 0;
+				msg_ofs++;
+			}
+		}
+
+		ret = hclgevf_send_mbx_msg(hdev, HCLGE_MBX_SET_MULTICAST,
+					   HCLGE_MBX_MAC_VLAN_MTA_STATUS_UPDATE,
+					   msg, sizeof(msg), false, NULL, 0);
+		if (ret)
+			break;
+
+		msg_idx++;
+	}
+
+	return ret;
+}
+
+static int hclgevf_update_mta_status(struct hnae3_handle *handle)
+{
+	unsigned long mta_status[BITS_TO_LONGS(HCLGEVF_MTA_TBL_SIZE)];
+	struct hclgevf_dev *hdev = hclgevf_ae_get_hdev(handle);
+	struct net_device *netdev = hdev->nic.kinfo.netdev;
+	struct netdev_hw_addr *ha;
+	u16 tbl_idx;
+
+	/* clear status */
+	memset(mta_status, 0, sizeof(mta_status));
+
+	/* update status from mc addr list */
+	netdev_for_each_mc_addr(ha, netdev) {
+		tbl_idx = hclgevf_get_mac_addr_to_mta_index(hdev, ha->addr);
+		set_bit(tbl_idx, mta_status);
+	}
+
+	return hclgevf_do_update_mta_status(hdev, mta_status);
+}
+
 static void hclgevf_get_mac_addr(struct hnae3_handle *handle, u8 *p)
 {
 	struct hclgevf_dev *hdev = hclgevf_ae_get_hdev(handle);
@@ -1669,12 +1789,11 @@ static int hclgevf_init_hdev(struct hclgevf_dev *hdev)
 		goto err_config;
 	}
 
-	/* Initialize VF's MTA */
-	hdev->accept_mta_mc = true;
-	ret = hclgevf_cfg_func_mta_filter(&hdev->nic, hdev->accept_mta_mc);
+	/* Initialize mta type for this VF */
+	ret = hclgevf_cfg_func_mta_type(hdev);
 	if (ret) {
 		dev_err(&hdev->pdev->dev,
-			"failed(%d) to set mta filter mode\n", ret);
+			"failed(%d) to initialize MTA type\n", ret);
 		goto err_config;
 	}
 
@@ -1829,6 +1948,7 @@ static const struct hnae3_ae_ops hclgevf_ops = {
 	.rm_uc_addr = hclgevf_rm_uc_addr,
 	.add_mc_addr = hclgevf_add_mc_addr,
 	.rm_mc_addr = hclgevf_rm_mc_addr,
+	.update_mta_status = hclgevf_update_mta_status,
 	.get_stats = hclgevf_get_stats,
 	.update_stats = hclgevf_update_stats,
 	.get_strings = hclgevf_get_strings,
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h
index 9763e742e6fb..0656e8e5c5f0 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h
@@ -48,6 +48,9 @@
 #define HCLGEVF_RSS_CFG_TBL_NUM \
 	(HCLGEVF_RSS_IND_TBL_SIZE / HCLGEVF_RSS_CFG_TBL_SIZE)
 
+#define HCLGEVF_MTA_TBL_SIZE		4096
+#define HCLGEVF_MTA_TYPE_SEL_MAX	4
+
 /* states of hclgevf device & tasks */
 enum hclgevf_states {
 	/* device states */
@@ -152,6 +155,7 @@ struct hclgevf_dev {
 	int *vector_irq;
 
 	bool accept_mta_mc; /* whether to accept mta filter multicast */
+	u8 mta_mac_sel_type;
 	bool mbx_event_pending;
 	struct hclgevf_mbx_resp_status mbx_resp; /* mailbox response */
 	struct hclgevf_mbx_arq_ring arq; /* mailbox async rx queue */
-- 
cgit v1.2.3-59-g8ed1b


From 4b3e85a52ae4ea516fe297acad32872bc13bf620 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Thu, 31 May 2018 18:01:31 +0300
Subject: net/mlx5e: IPOIB, Fix overflowing SQ WQE memset

IPoIB WQE size is larger than a single WQEBB.  Must not fetch the WQE,
and surely not memset it, until it is guaranteed that there are enough
WQEBBs available before getting to SQ/frag edge.

Fixes: 043dc78ecf07 ("net/mlx5e: TX, Use actual WQE size for SQ edge fill")
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index aafd75257fd0..9829ee02de31 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -653,8 +653,6 @@ netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 	int num_dma;
 	__be16 mss;
 
-	mlx5i_sq_fetch_wqe(sq, &wqe, &pi);
-
 	/* Calc ihs and ds cnt, no writes to wqe yet */
 	ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS;
 	if (skb_is_gso(skb)) {
@@ -686,10 +684,12 @@ netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 	num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
 	frag_pi = mlx5_wq_cyc_ctr2fragix(wq, sq->pc);
 	if (unlikely(frag_pi + num_wqebbs > mlx5_wq_cyc_get_frag_size(wq))) {
+		pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
 		mlx5e_fill_sq_frag_edge(sq, wq, pi, frag_pi);
-		mlx5i_sq_fetch_wqe(sq, &wqe, &pi);
 	}
 
+	mlx5i_sq_fetch_wqe(sq, &wqe, &pi);
+
 	/* fill wqe */
 	wi       = &sq->db.wqe_info[pi];
 	cseg     = &wqe->ctrl;
-- 
cgit v1.2.3-59-g8ed1b


From c90262f8468acfed573e195de9443007dad27451 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Thu, 31 May 2018 18:04:23 +0300
Subject: net/mlx5e: IPOIB, Add a missing skb_pull

A call to mlx5e_tx_skb_pull_inline was mistakenly dropped
in the cited patch. Get it back.

Fixes: 043dc78ecf07 ("net/mlx5e: TX, Use actual WQE size for SQ edge fill")
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 9829ee02de31..725c06221e95 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -705,6 +705,7 @@ netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 
 	if (ihs) {
 		memcpy(eseg->inline_hdr.start, skb_data, ihs);
+		mlx5e_tx_skb_pull_inline(&skb_data, &skb_len, ihs);
 		eseg->inline_hdr.sz = cpu_to_be16(ihs);
 		dseg += ds_cnt_inl;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 8bfaf07f780683948a3183d3b14a76dfd50bf899 Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Tue, 29 May 2018 10:54:47 +0300
Subject: net/mlx5e: Present SW stats when state is not opened

The driver can present all SW stats even when the state not opened.
Fixed get strings, count and stats to support it.

In addition, fix tc2txq to hold a static mapping which doesn't depend on
the amount of open channels, and cannot have the same value on two
different cells  while moving between configurations.
Example:
- OOB 16 channels
- Change to 2 channels, 8 TCs
- tc2txq[15][0] == tc2txq[1][7] == 15
This will cause multiple appearances of the same TX index in statistics
output.

Fixes: 76c3810bade3 ("net/mlx5e: Avoid reset netdev stats on configuration changes")
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 19 +++++++++++++------
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 10 ----------
 2 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 9b19863b059d..adc55de6d4f4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2633,15 +2633,21 @@ static void mlx5e_netdev_set_tcs(struct net_device *netdev)
 		netdev_set_tc_queue(netdev, tc, nch, 0);
 }
 
-static void mlx5e_build_channels_tx_maps(struct mlx5e_priv *priv)
+static void mlx5e_build_tc2txq_maps(struct mlx5e_priv *priv)
 {
-	struct mlx5e_channel *c;
-	struct mlx5e_txqsq *sq;
+	int max_nch = priv->profile->max_nch(priv->mdev);
 	int i, tc;
 
-	for (i = 0; i < priv->profile->max_nch(priv->mdev); i++)
+	for (i = 0; i < max_nch; i++)
 		for (tc = 0; tc < priv->profile->max_tc; tc++)
-			priv->channel_tc2txq[i][tc] = i + tc * priv->channels.num;
+			priv->channel_tc2txq[i][tc] = i + tc * max_nch;
+}
+
+static void mlx5e_build_tx2sq_maps(struct mlx5e_priv *priv)
+{
+	struct mlx5e_channel *c;
+	struct mlx5e_txqsq *sq;
+	int i, tc;
 
 	for (i = 0; i < priv->channels.num; i++) {
 		c = priv->channels.c[i];
@@ -2661,7 +2667,7 @@ void mlx5e_activate_priv_channels(struct mlx5e_priv *priv)
 	netif_set_real_num_tx_queues(netdev, num_txqs);
 	netif_set_real_num_rx_queues(netdev, priv->channels.num);
 
-	mlx5e_build_channels_tx_maps(priv);
+	mlx5e_build_tx2sq_maps(priv);
 	mlx5e_activate_channels(&priv->channels);
 	write_lock(&priv->stats_lock);
 	priv->channels_active = true;
@@ -4446,6 +4452,7 @@ static void mlx5e_nic_init(struct mlx5_core_dev *mdev,
 	if (err)
 		mlx5_core_err(mdev, "TLS initialization failed, %d\n", err);
 	mlx5e_build_nic_netdev(netdev);
+	mlx5e_build_tc2txq_maps(priv);
 	mlx5e_vxlan_init(priv);
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
index 776b4d68e156..3b2aed43f660 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
@@ -1161,9 +1161,6 @@ static int mlx5e_grp_channels_get_num_stats(struct mlx5e_priv *priv)
 {
 	int max_nch = priv->profile->max_nch(priv->mdev);
 
-	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
-		return 0;
-
 	return (NUM_RQ_STATS * max_nch) +
 	       (NUM_CH_STATS * max_nch) +
 	       (NUM_SQ_STATS * max_nch * priv->max_opened_tc);
@@ -1175,9 +1172,6 @@ static int mlx5e_grp_channels_fill_strings(struct mlx5e_priv *priv, u8 *data,
 	int max_nch = priv->profile->max_nch(priv->mdev);
 	int i, j, tc;
 
-	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
-		return idx;
-
 	for (i = 0; i < max_nch; i++)
 		for (j = 0; j < NUM_CH_STATS; j++)
 			sprintf(data + (idx++) * ETH_GSTRING_LEN,
@@ -1187,7 +1181,6 @@ static int mlx5e_grp_channels_fill_strings(struct mlx5e_priv *priv, u8 *data,
 		for (j = 0; j < NUM_RQ_STATS; j++)
 			sprintf(data + (idx++) * ETH_GSTRING_LEN, rq_stats_desc[j].format, i);
 
-	/* priv->channel_tc2txq[i][tc] is valid only when device is open */
 	for (tc = 0; tc < priv->max_opened_tc; tc++)
 		for (i = 0; i < max_nch; i++)
 			for (j = 0; j < NUM_SQ_STATS; j++)
@@ -1204,9 +1197,6 @@ static int mlx5e_grp_channels_fill_stats(struct mlx5e_priv *priv, u64 *data,
 	int max_nch = priv->profile->max_nch(priv->mdev);
 	int i, j, tc;
 
-	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
-		return idx;
-
 	for (i = 0; i < max_nch; i++)
 		for (j = 0; j < NUM_CH_STATS; j++)
 			data[idx++] =
-- 
cgit v1.2.3-59-g8ed1b


From 6c63efe4cfabf230a8ed4b1d880249875ffdac13 Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Tue, 29 May 2018 11:06:31 +0300
Subject: net/mlx5e: Remove redundant active_channels indication

Now, when all channels stats are saved regardless of the channel's state
{open, closed}, we can safely remove this indication and the stats spin
lock which protects it.

Fixes: 76c3810bade3 ("net/mlx5e: Avoid reset netdev stats on configuration changes")
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       | 2 --
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 7 -------
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   | 6 ------
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 5 -----
 4 files changed, 20 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 1c04df043e07..372cdf8a496c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -777,8 +777,6 @@ struct mlx5e_priv {
 	struct mutex               state_lock; /* Protects Interface state */
 	struct mlx5e_rq            drop_rq;
 
-	rwlock_t                   stats_lock; /* Protects channels SW stats updates */
-	bool                       channels_active;
 	struct mlx5e_channels      channels;
 	u32                        tisn[MLX5E_MAX_NUM_TC];
 	struct mlx5e_rqt           indir_rqt;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index adc55de6d4f4..dd119bad102d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2669,9 +2669,6 @@ void mlx5e_activate_priv_channels(struct mlx5e_priv *priv)
 
 	mlx5e_build_tx2sq_maps(priv);
 	mlx5e_activate_channels(&priv->channels);
-	write_lock(&priv->stats_lock);
-	priv->channels_active = true;
-	write_unlock(&priv->stats_lock);
 	netif_tx_start_all_queues(priv->netdev);
 
 	if (MLX5_VPORT_MANAGER(priv->mdev))
@@ -2693,9 +2690,6 @@ void mlx5e_deactivate_priv_channels(struct mlx5e_priv *priv)
 	 */
 	netif_tx_stop_all_queues(priv->netdev);
 	netif_tx_disable(priv->netdev);
-	write_lock(&priv->stats_lock);
-	priv->channels_active = false;
-	write_unlock(&priv->stats_lock);
 	mlx5e_deactivate_channels(&priv->channels);
 }
 
@@ -4269,7 +4263,6 @@ static void mlx5e_build_nic_netdev_priv(struct mlx5_core_dev *mdev,
 			       profile->max_nch(mdev), netdev->mtu);
 
 	mutex_init(&priv->state_lock);
-	rwlock_init(&priv->stats_lock);
 
 	INIT_WORK(&priv->update_carrier_work, mlx5e_update_carrier_work);
 	INIT_WORK(&priv->set_rx_mode_work, mlx5e_set_rx_mode_work);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index de6364125f0f..1fb4835eac72 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -130,10 +130,6 @@ static void mlx5e_rep_update_sw_counters(struct mlx5e_priv *priv)
 	struct mlx5e_sq_stats *sq_stats;
 	int i, j;
 
-	read_lock(&priv->stats_lock);
-	if (!priv->channels_active)
-	        goto out;
-
 	memset(s, 0, sizeof(*s));
 	for (i = 0; i < priv->channels.num; i++) {
 		struct mlx5e_channel *c = priv->channels.c[i];
@@ -150,8 +146,6 @@ static void mlx5e_rep_update_sw_counters(struct mlx5e_priv *priv)
 			s->tx_bytes		+= sq_stats->bytes;
 		}
 	}
-out:
-	read_unlock(&priv->stats_lock);
 }
 
 static void mlx5e_rep_get_ethtool_stats(struct net_device *dev,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
index 3b2aed43f660..697dc7397ba2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
@@ -114,9 +114,6 @@ void mlx5e_grp_sw_update_stats(struct mlx5e_priv *priv)
 	int i;
 
 	memset(s, 0, sizeof(*s));
-	read_lock(&priv->stats_lock);
-	if (!priv->channels_active)
-		goto out;
 
 	for (i = 0; i < priv->profile->max_nch(priv->mdev); i++) {
 		struct mlx5e_channel_stats *channel_stats =
@@ -177,8 +174,6 @@ void mlx5e_grp_sw_update_stats(struct mlx5e_priv *priv)
 	}
 
 	memcpy(&priv->stats.sw, s, sizeof(*s));
-out:
-	read_unlock(&priv->stats_lock);
 }
 
 static const struct counter_desc q_stats_desc[] = {
-- 
cgit v1.2.3-59-g8ed1b


From 93edcb3a75897a60ec294851043088f59f4475e1 Mon Sep 17 00:00:00 2001
From: Maor Gottlieb <maorg@mellanox.com>
Date: Thu, 3 May 2018 12:40:30 +0300
Subject: net/mlx5e: Increase aRFS flow tables size

Increase the aRFS flow table size to 64k so it could contain up to 64k
different streams.

Signed-off-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
index f64b5e78519b..75e4308ba786 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
@@ -213,7 +213,7 @@ out:
 }
 
 #define MLX5E_ARFS_NUM_GROUPS	2
-#define MLX5E_ARFS_GROUP1_SIZE	BIT(12)
+#define MLX5E_ARFS_GROUP1_SIZE	(BIT(16) - 1)
 #define MLX5E_ARFS_GROUP2_SIZE	BIT(0)
 #define MLX5E_ARFS_TABLE_SIZE	(MLX5E_ARFS_GROUP1_SIZE +\
 				 MLX5E_ARFS_GROUP2_SIZE)
-- 
cgit v1.2.3-59-g8ed1b


From 250a42b6a764a7954a7a7a137bc71883f05657cb Mon Sep 17 00:00:00 2001
From: Adi Nissim <adin@mellanox.com>
Date: Sun, 1 Apr 2018 16:54:27 +0300
Subject: net/mlx5e: Support configurable MTU for vport representors

The representor MTU was hard coded to 1500 bytes.
Allow setting arbitrary MTU values up to the max supported by the FW.

Signed-off-by: Adi Nissim <adin@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |  4 ++++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 14 ++++++++++----
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c  | 14 ++++++++++++++
 3 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 372cdf8a496c..a23fa47fbbcc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -1102,6 +1102,10 @@ void mlx5e_update_stats_work(struct work_struct *work);
 
 int mlx5e_bits_invert(unsigned long a, int size);
 
+typedef int (*change_hw_mtu_cb)(struct mlx5e_priv *priv);
+int mlx5e_change_mtu(struct net_device *netdev, int new_mtu,
+		     change_hw_mtu_cb set_mtu_cb);
+
 /* ethtool helpers */
 void mlx5e_ethtool_get_drvinfo(struct mlx5e_priv *priv,
 			       struct ethtool_drvinfo *drvinfo);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index dd119bad102d..ab7b2a4e6edc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3495,7 +3495,8 @@ static netdev_features_t mlx5e_fix_features(struct net_device *netdev,
 	return features;
 }
 
-static int mlx5e_change_mtu(struct net_device *netdev, int new_mtu)
+int mlx5e_change_mtu(struct net_device *netdev, int new_mtu,
+		     change_hw_mtu_cb set_mtu_cb)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 	struct mlx5e_channels new_channels = {};
@@ -3522,7 +3523,7 @@ static int mlx5e_change_mtu(struct net_device *netdev, int new_mtu)
 
 	if (!reset) {
 		params->sw_mtu = new_mtu;
-		mlx5e_set_dev_port_mtu(priv);
+		set_mtu_cb(priv);
 		netdev->mtu = params->sw_mtu;
 		goto out;
 	}
@@ -3531,7 +3532,7 @@ static int mlx5e_change_mtu(struct net_device *netdev, int new_mtu)
 	if (err)
 		goto out;
 
-	mlx5e_switch_priv_channels(priv, &new_channels, mlx5e_set_dev_port_mtu);
+	mlx5e_switch_priv_channels(priv, &new_channels, set_mtu_cb);
 	netdev->mtu = new_channels.params.sw_mtu;
 
 out:
@@ -3539,6 +3540,11 @@ out:
 	return err;
 }
 
+static int mlx5e_change_nic_mtu(struct net_device *netdev, int new_mtu)
+{
+	return mlx5e_change_mtu(netdev, new_mtu, mlx5e_set_dev_port_mtu);
+}
+
 int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr)
 {
 	struct hwtstamp_config config;
@@ -4033,7 +4039,7 @@ static const struct net_device_ops mlx5e_netdev_ops = {
 	.ndo_vlan_rx_kill_vid    = mlx5e_vlan_rx_kill_vid,
 	.ndo_set_features        = mlx5e_set_features,
 	.ndo_fix_features        = mlx5e_fix_features,
-	.ndo_change_mtu          = mlx5e_change_mtu,
+	.ndo_change_mtu          = mlx5e_change_nic_mtu,
 	.ndo_do_ioctl            = mlx5e_ioctl,
 	.ndo_set_tx_maxrate      = mlx5e_set_tx_maxrate,
 	.ndo_udp_tunnel_add      = mlx5e_add_vxlan_port,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 1fb4835eac72..8ab4c96b7f7c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -900,6 +900,11 @@ static const struct switchdev_ops mlx5e_rep_switchdev_ops = {
 	.switchdev_port_attr_get	= mlx5e_attr_get,
 };
 
+int mlx5e_change_rep_mtu(struct net_device *netdev, int new_mtu)
+{
+	return mlx5e_change_mtu(netdev, new_mtu, NULL);
+}
+
 static const struct net_device_ops mlx5e_netdev_ops_rep = {
 	.ndo_open                = mlx5e_rep_open,
 	.ndo_stop                = mlx5e_rep_close,
@@ -909,6 +914,7 @@ static const struct net_device_ops mlx5e_netdev_ops_rep = {
 	.ndo_get_stats64         = mlx5e_rep_get_stats,
 	.ndo_has_offload_stats	 = mlx5e_has_offload_stats,
 	.ndo_get_offload_stats	 = mlx5e_get_offload_stats,
+	.ndo_change_mtu          = mlx5e_change_rep_mtu,
 };
 
 static void mlx5e_build_rep_params(struct mlx5_core_dev *mdev,
@@ -935,6 +941,10 @@ static void mlx5e_build_rep_params(struct mlx5_core_dev *mdev,
 
 static void mlx5e_build_rep_netdev(struct net_device *netdev)
 {
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u16 max_mtu;
+
 	netdev->netdev_ops = &mlx5e_netdev_ops_rep;
 
 	netdev->watchdog_timeo    = 15 * HZ;
@@ -947,6 +957,10 @@ static void mlx5e_build_rep_netdev(struct net_device *netdev)
 	netdev->hw_features      |= NETIF_F_HW_TC;
 
 	eth_hw_addr_random(netdev);
+
+	netdev->min_mtu = ETH_MIN_MTU;
+	mlx5_query_port_max_mtu(mdev, &max_mtu, 1);
+	netdev->max_mtu = MLX5E_HW2SW_MTU(&priv->channels.params, max_mtu);
 }
 
 static void mlx5e_init_rep(struct mlx5_core_dev *mdev,
-- 
cgit v1.2.3-59-g8ed1b


From 98db16bab59fed3834cd0c0f4a3dbca57a61fad5 Mon Sep 17 00:00:00 2001
From: Ilan Tayari <ilant@mellanox.com>
Date: Tue, 29 May 2018 16:39:04 -0700
Subject: net/mlx5: FPGA, Handle QP error event

Add handlers for this event to perform graceful teardown of the device.

Signed-off-by: Ilan Tayari <ilant@mellanox.com>
Signed-off-by: Adi Nissim <adin@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/fpga/core.c    | 28 ++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
index 26caa0475985..436a8136f26f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
@@ -50,6 +50,11 @@ static const char *const mlx5_fpga_error_strings[] = {
 	"Temperature Critical",
 };
 
+static const char * const mlx5_fpga_qp_error_strings[] = {
+	"Null Syndrome",
+	"Retry Counter Expired",
+	"RNR Expired",
+};
 static struct mlx5_fpga_device *mlx5_fpga_device_alloc(void)
 {
 	struct mlx5_fpga_device *fdev = NULL;
@@ -271,23 +276,38 @@ static const char *mlx5_fpga_syndrome_to_string(u8 syndrome)
 	return "Unknown";
 }
 
+static const char *mlx5_fpga_qp_syndrome_to_string(u8 syndrome)
+{
+	if (syndrome < ARRAY_SIZE(mlx5_fpga_qp_error_strings))
+		return mlx5_fpga_qp_error_strings[syndrome];
+	return "Unknown";
+}
+
 void mlx5_fpga_event(struct mlx5_core_dev *mdev, u8 event, void *data)
 {
 	struct mlx5_fpga_device *fdev = mdev->fpga;
 	const char *event_name;
 	bool teardown = false;
 	unsigned long flags;
+	u32 fpga_qpn;
 	u8 syndrome;
 
-	if (event != MLX5_EVENT_TYPE_FPGA_ERROR) {
+	switch (event) {
+	case MLX5_EVENT_TYPE_FPGA_ERROR:
+		syndrome = MLX5_GET(fpga_error_event, data, syndrome);
+		event_name = mlx5_fpga_syndrome_to_string(syndrome);
+		break;
+	case MLX5_EVENT_TYPE_FPGA_QP_ERROR:
+		syndrome = MLX5_GET(fpga_qp_error_event, data, syndrome);
+		event_name = mlx5_fpga_qp_syndrome_to_string(syndrome);
+		fpga_qpn = MLX5_GET(fpga_qp_error_event, data, fpga_qpn);
+		break;
+	default:
 		mlx5_fpga_warn_ratelimited(fdev, "Unexpected event %u\n",
 					   event);
 		return;
 	}
 
-	syndrome = MLX5_GET(fpga_error_event, data, syndrome);
-	event_name = mlx5_fpga_syndrome_to_string(syndrome);
-
 	spin_lock_irqsave(&fdev->state_lock, flags);
 	switch (fdev->state) {
 	case MLX5_FPGA_STATUS_SUCCESS:
-- 
cgit v1.2.3-59-g8ed1b


From 5e7d77a9c5738457cdaa6bc230799c8f80733481 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Thu, 24 May 2018 13:44:24 +0300
Subject: net/mlx5e: TX, Obsolete maintaining local copies of skb->len/data

Instead of maintaining a local copy of skb->len/data and updating
it upon every copy to the WQE inline part, just calculate it once
when needed, using the ihs.

This obsoletes the function mlx5e_tx_skb_pull_inline.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 42 +++++++------------------
 1 file changed, 12 insertions(+), 30 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 725c06221e95..f29deb44bf3b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -188,28 +188,16 @@ static inline u16 mlx5e_calc_min_inline(enum mlx5_inline_modes mode,
 	return min_t(u16, hlen, skb_headlen(skb));
 }
 
-static inline void mlx5e_tx_skb_pull_inline(unsigned char **skb_data,
-					    unsigned int *skb_len,
-					    unsigned int len)
-{
-	*skb_len -= len;
-	*skb_data += len;
-}
-
-static inline void mlx5e_insert_vlan(void *start, struct sk_buff *skb, u16 ihs,
-				     unsigned char **skb_data,
-				     unsigned int *skb_len)
+static inline void mlx5e_insert_vlan(void *start, struct sk_buff *skb, u16 ihs)
 {
 	struct vlan_ethhdr *vhdr = (struct vlan_ethhdr *)start;
 	int cpy1_sz = 2 * ETH_ALEN;
 	int cpy2_sz = ihs - cpy1_sz;
 
-	memcpy(vhdr, *skb_data, cpy1_sz);
-	mlx5e_tx_skb_pull_inline(skb_data, skb_len, cpy1_sz);
+	memcpy(vhdr, skb->data, cpy1_sz);
 	vhdr->h_vlan_proto = skb->vlan_proto;
 	vhdr->h_vlan_TCI = cpu_to_be16(skb_vlan_tag_get(skb));
-	memcpy(&vhdr->h_vlan_encapsulated_proto, *skb_data, cpy2_sz);
-	mlx5e_tx_skb_pull_inline(skb_data, skb_len, cpy2_sz);
+	memcpy(&vhdr->h_vlan_encapsulated_proto, skb->data + cpy1_sz, cpy2_sz);
 }
 
 static inline void
@@ -357,8 +345,6 @@ netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 	struct mlx5e_tx_wqe_info *wi;
 
 	struct mlx5e_sq_stats *stats = sq->stats;
-	unsigned char *skb_data = skb->data;
-	unsigned int skb_len = skb->len;
 	u16 ds_cnt, ds_cnt_inl = 0;
 	u16 headlen, ihs, frag_pi;
 	u8 num_wqebbs, opcode;
@@ -385,7 +371,7 @@ netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 	stats->bytes     += num_bytes;
 	stats->xmit_more += skb->xmit_more;
 
-	headlen = skb_len - ihs - skb->data_len;
+	headlen = skb->len - ihs - skb->data_len;
 	ds_cnt += !!headlen;
 	ds_cnt += skb_shinfo(skb)->nr_frags;
 
@@ -414,15 +400,14 @@ netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 	eseg->mss = mss;
 
 	if (ihs) {
+		eseg->inline_hdr.sz = cpu_to_be16(ihs);
 		if (skb_vlan_tag_present(skb)) {
-			mlx5e_insert_vlan(eseg->inline_hdr.start, skb,
-					  ihs - VLAN_HLEN, &skb_data, &skb_len);
+			ihs -= VLAN_HLEN;
+			mlx5e_insert_vlan(eseg->inline_hdr.start, skb, ihs);
 			stats->added_vlan_packets++;
 		} else {
-			memcpy(eseg->inline_hdr.start, skb_data, ihs);
-			mlx5e_tx_skb_pull_inline(&skb_data, &skb_len, ihs);
+			memcpy(eseg->inline_hdr.start, skb->data, ihs);
 		}
-		eseg->inline_hdr.sz = cpu_to_be16(ihs);
 		dseg += ds_cnt_inl;
 	} else if (skb_vlan_tag_present(skb)) {
 		eseg->insert.type = cpu_to_be16(MLX5_ETH_WQE_INSERT_VLAN);
@@ -432,7 +417,7 @@ netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 		stats->added_vlan_packets++;
 	}
 
-	num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb_data, headlen, dseg);
+	num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb->data + ihs, headlen, dseg);
 	if (unlikely(num_dma < 0))
 		goto err_drop;
 
@@ -644,8 +629,6 @@ netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 	struct mlx5e_tx_wqe_info *wi;
 
 	struct mlx5e_sq_stats *stats = sq->stats;
-	unsigned char *skb_data = skb->data;
-	unsigned int skb_len = skb->len;
 	u16 headlen, ihs, pi, frag_pi;
 	u16 ds_cnt, ds_cnt_inl = 0;
 	u8 num_wqebbs, opcode;
@@ -672,7 +655,7 @@ netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 	stats->bytes     += num_bytes;
 	stats->xmit_more += skb->xmit_more;
 
-	headlen = skb_len - ihs - skb->data_len;
+	headlen = skb->len - ihs - skb->data_len;
 	ds_cnt += !!headlen;
 	ds_cnt += skb_shinfo(skb)->nr_frags;
 
@@ -704,13 +687,12 @@ netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 	eseg->mss = mss;
 
 	if (ihs) {
-		memcpy(eseg->inline_hdr.start, skb_data, ihs);
-		mlx5e_tx_skb_pull_inline(&skb_data, &skb_len, ihs);
+		memcpy(eseg->inline_hdr.start, skb->data, ihs);
 		eseg->inline_hdr.sz = cpu_to_be16(ihs);
 		dseg += ds_cnt_inl;
 	}
 
-	num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb_data, headlen, dseg);
+	num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb->data + ihs, headlen, dseg);
 	if (unlikely(num_dma < 0))
 		goto err_drop;
 
-- 
cgit v1.2.3-59-g8ed1b


From 75aa889fb9112874171dcb55b97302293dcd581e Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Mon, 2 Apr 2018 14:30:34 +0300
Subject: net/mlx5e: RX, Generalise name of non-linear SKB head size

Make name more generic by dropping MPWRQ from it, as it will be
used also in Legacy RQ in a downstream patch.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h          | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c       | 4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index a23fa47fbbcc..9b4ed83783e4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -112,7 +112,7 @@ struct page_pool;
 
 #define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW            0x2
 
-#define MLX5_MPWRQ_SMALL_PACKET_THRESHOLD	(256)
+#define MLX5E_RX_MAX_HEAD (256)
 
 #define MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ                 (64 * 1024)
 #define MLX5E_DEFAULT_LRO_TIMEOUT                       32
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 5551e4e22ad0..61a76faa2779 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -1070,7 +1070,7 @@ struct sk_buff *
 mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
 				   u16 cqe_bcnt, u32 head_offset, u32 page_idx)
 {
-	u16 headlen = min_t(u16, MLX5_MPWRQ_SMALL_PACKET_THRESHOLD, cqe_bcnt);
+	u16 headlen = min_t(u16, MLX5E_RX_MAX_HEAD, cqe_bcnt);
 	struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx];
 	u32 frag_offset    = head_offset + headlen;
 	u32 byte_cnt       = cqe_bcnt - headlen;
@@ -1078,7 +1078,7 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
 	struct sk_buff *skb;
 
 	skb = napi_alloc_skb(rq->cq.napi,
-			     ALIGN(MLX5_MPWRQ_SMALL_PACKET_THRESHOLD, sizeof(long)));
+			     ALIGN(MLX5E_RX_MAX_HEAD, sizeof(long)));
 	if (unlikely(!skb)) {
 		rq->stats->buff_alloc_err++;
 		return NULL;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
index 027f54ac1ca2..4d316cc9b008 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
@@ -100,7 +100,7 @@ static int mlx5e_test_link_speed(struct mlx5e_priv *priv)
 
 #ifdef CONFIG_INET
 /* loopback test */
-#define MLX5E_TEST_PKT_SIZE (MLX5_MPWRQ_SMALL_PACKET_THRESHOLD - NET_IP_ALIGN)
+#define MLX5E_TEST_PKT_SIZE (MLX5E_RX_MAX_HEAD - NET_IP_ALIGN)
 static const char mlx5e_test_text[ETH_GSTRING_LEN] = "MLX5E SELF TEST";
 #define MLX5E_TEST_MAGIC 0x5AEED15C001ULL
 
-- 
cgit v1.2.3-59-g8ed1b


From fa698366b79a07b7cb0e16d3f4b9fd9d89acfd40 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Mon, 2 Apr 2018 14:30:34 +0300
Subject: net/mlx5e: RX, Generalise function of SKB frag addition

Rename it and pass truesize as an extra argument, as it will be used also
in Legacy RQ in a downstream patch.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 61a76faa2779..546e3bc46cd1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -311,13 +311,11 @@ void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix)
 		mlx5e_free_rx_wqe(rq, wi);
 }
 
-static inline void mlx5e_add_skb_frag_mpwqe(struct mlx5e_rq *rq,
-					    struct sk_buff *skb,
-					    struct mlx5e_dma_info *di,
-					    u32 frag_offset, u32 len)
+static inline void
+mlx5e_add_skb_frag(struct mlx5e_rq *rq, struct sk_buff *skb,
+		   struct mlx5e_dma_info *di, u32 frag_offset, u32 len,
+		   unsigned int truesize)
 {
-	unsigned int truesize = ALIGN(len, BIT(rq->mpwqe.log_stride_sz));
-
 	dma_sync_single_for_cpu(rq->pdev,
 				di->addr + frag_offset,
 				len, DMA_FROM_DEVICE);
@@ -1094,9 +1092,11 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
 	while (byte_cnt) {
 		u32 pg_consumed_bytes =
 			min_t(u32, PAGE_SIZE - frag_offset, byte_cnt);
+		unsigned int truesize =
+			ALIGN(pg_consumed_bytes, BIT(rq->mpwqe.log_stride_sz));
 
-		mlx5e_add_skb_frag_mpwqe(rq, skb, di, frag_offset,
-					 pg_consumed_bytes);
+		mlx5e_add_skb_frag(rq, skb, di, frag_offset,
+				   pg_consumed_bytes, truesize);
 		byte_cnt -= pg_consumed_bytes;
 		frag_offset = 0;
 		di++;
-- 
cgit v1.2.3-59-g8ed1b


From 386471f16b73ddc52d1f711d63c7dba5559c31f6 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Mon, 16 Apr 2018 17:11:07 +0300
Subject: net/mlx5e: RX, Dedicate a function for copying SKB header

Get the logic of copying the packet header into the SKB linear part
into a generic function. Function does copy length alignment
and dma buffer sync.

It is currently called only within the MPWQE flow.
In a downstream patch, it will be called within the legacy RQ flow
as well.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 30 ++++++++++++++-----------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 546e3bc46cd1..634540afdcfc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -324,6 +324,20 @@ mlx5e_add_skb_frag(struct mlx5e_rq *rq, struct sk_buff *skb,
 			di->page, frag_offset, len, truesize);
 }
 
+static inline void
+mlx5e_copy_skb_header(struct device *pdev, struct sk_buff *skb,
+		      struct mlx5e_dma_info *dma_info,
+		      int offset_from, int offset_to, u32 headlen)
+{
+	const void *from = page_address(dma_info->page) + offset_from;
+	/* Aligning len to sizeof(long) optimizes memcpy performance */
+	unsigned int len = ALIGN(headlen, sizeof(long));
+
+	dma_sync_single_for_cpu(pdev, dma_info->addr + offset_from, len,
+				DMA_FROM_DEVICE);
+	skb_copy_to_linear_data_offset(skb, offset_to, from, len);
+}
+
 static inline void
 mlx5e_copy_skb_header_mpwqe(struct device *pdev,
 			    struct sk_buff *skb,
@@ -331,23 +345,13 @@ mlx5e_copy_skb_header_mpwqe(struct device *pdev,
 			    u32 offset, u32 headlen)
 {
 	u16 headlen_pg = min_t(u32, headlen, PAGE_SIZE - offset);
-	unsigned int len;
 
-	 /* Aligning len to sizeof(long) optimizes memcpy performance */
-	len = ALIGN(headlen_pg, sizeof(long));
-	dma_sync_single_for_cpu(pdev, dma_info->addr + offset, len,
-				DMA_FROM_DEVICE);
-	skb_copy_to_linear_data(skb, page_address(dma_info->page) + offset, len);
+	mlx5e_copy_skb_header(pdev, skb, dma_info, offset, 0, headlen_pg);
 
 	if (unlikely(offset + headlen > PAGE_SIZE)) {
 		dma_info++;
-		headlen_pg = len;
-		len = ALIGN(headlen - headlen_pg, sizeof(long));
-		dma_sync_single_for_cpu(pdev, dma_info->addr, len,
-					DMA_FROM_DEVICE);
-		skb_copy_to_linear_data_offset(skb, headlen_pg,
-					       page_address(dma_info->page),
-					       len);
+		mlx5e_copy_skb_header(pdev, skb, dma_info, 0, headlen_pg,
+				      headlen - headlen_pg);
 	}
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 6c3a823e1e9c645f30c5b03fefe87fea8881060b Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Mon, 2 Apr 2018 16:28:10 +0300
Subject: net/mlx5e: RX, Remove HW LRO support in legacy RQ

Current LRO implementation in Legacy RQ uses high-order pages.
In downstream patches of this series we complete the transition
to using only order-0 pages in RX datapath (which was already done
in Striding RQ).

Unlike the more advanced Striding RQ, Legacy RQ does not make reuse
of any non-consumed buffers of non-full LRO sessions, and combining
it with order-0 pages has many performance drawbacks.

Hence, here we totally remove LRO support in Legacy RQ.
This guarantees having no out-of-order completions, which allows using
a cyclic work queue (instead of a linked-list) in a downstream patch.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |  7 +++++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 33 +++++++++++++---------
 2 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 42bd256e680d..fffe514ba855 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -1515,6 +1515,9 @@ static int set_pflag_rx_striding_rq(struct net_device *netdev, bool enable)
 			return -EOPNOTSUPP;
 		if (!mlx5e_striding_rq_possible(mdev, &priv->channels.params))
 			return -EINVAL;
+	} else if (priv->channels.params.lro_en) {
+		netdev_warn(netdev, "Can't set legacy RQ with LRO, disable LRO first\n");
+		return -EINVAL;
 	}
 
 	new_channels.params = priv->channels.params;
@@ -1589,6 +1592,10 @@ static int mlx5e_set_priv_flags(struct net_device *netdev, u32 pflags)
 
 out:
 	mutex_unlock(&priv->state_lock);
+
+	/* Need to fix some features.. */
+	netdev_update_features(netdev);
+
 	return err;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index ab7b2a4e6edc..3f1f0552843c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -182,14 +182,6 @@ void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev,
 	params->log_rq_mtu_frames = is_kdump_kernel() ?
 		MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE :
 		MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE;
-	switch (params->rq_wq_type) {
-	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
-		break;
-	default: /* MLX5_WQ_TYPE_LINKED_LIST */
-		/* Extra room needed for build_skb */
-		params->lro_wqe_sz -= mlx5e_get_rq_headroom(mdev, params) +
-			SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	}
 
 	mlx5_core_info(mdev, "MLX5E: StrdRq(%d) RqSz(%ld) StrdSz(%ld) RxCqeCmprss(%d)\n",
 		       params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ,
@@ -503,14 +495,12 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 			goto err_rq_wq_destroy;
 		}
 
-		byte_count = params->lro_en  ?
-				params->lro_wqe_sz :
-				MLX5E_SW2HW_MTU(params, params->sw_mtu);
+		byte_count = MLX5E_SW2HW_MTU(params, params->sw_mtu);
 #ifdef CONFIG_MLX5_EN_IPSEC
 		if (MLX5_IPSEC_DEV(mdev))
 			byte_count += MLX5E_METADATA_ETHER_LEN;
 #endif
-		rq->wqe.page_reuse = !params->xdp_prog && !params->lro_en;
+		rq->wqe.page_reuse = !params->xdp_prog;
 
 		/* calc the required page order */
 		rq->wqe.frag_sz = MLX5_SKB_FRAG_SZ(rq->buff.headroom + byte_count);
@@ -3311,6 +3301,12 @@ static int set_feature_lro(struct net_device *netdev, bool enable)
 	mutex_lock(&priv->state_lock);
 
 	old_params = &priv->channels.params;
+	if (enable && !MLX5E_GET_PFLAG(old_params, MLX5E_PFLAG_RX_STRIDING_RQ)) {
+		netdev_warn(netdev, "can't set LRO with legacy RQ\n");
+		err = -EINVAL;
+		goto out;
+	}
+
 	reset = test_bit(MLX5E_STATE_OPENED, &priv->state);
 
 	new_channels.params = *old_params;
@@ -3480,16 +3476,24 @@ static netdev_features_t mlx5e_fix_features(struct net_device *netdev,
 					    netdev_features_t features)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5e_params *params;
 
 	mutex_lock(&priv->state_lock);
+	params = &priv->channels.params;
 	if (!bitmap_empty(priv->fs.vlan.active_svlans, VLAN_N_VID)) {
 		/* HW strips the outer C-tag header, this is a problem
 		 * for S-tag traffic.
 		 */
 		features &= ~NETIF_F_HW_VLAN_CTAG_RX;
-		if (!priv->channels.params.vlan_strip_disable)
+		if (!params->vlan_strip_disable)
 			netdev_warn(netdev, "Dropping C-tag vlan stripping offload due to S-tag vlan\n");
 	}
+	if (!MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ)) {
+		features &= ~NETIF_F_LRO;
+		if (params->lro_en)
+			netdev_warn(netdev, "Disabling LRO, not supported in legacy RQ\n");
+	}
+
 	mutex_unlock(&priv->state_lock);
 
 	return features;
@@ -4328,7 +4332,8 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
 	netdev->hw_enc_features  |= NETIF_F_HW_VLAN_CTAG_TX;
 	netdev->hw_enc_features  |= NETIF_F_HW_VLAN_CTAG_RX;
 
-	if (!!MLX5_CAP_ETH(mdev, lro_cap))
+	if (!!MLX5_CAP_ETH(mdev, lro_cap) &&
+	    mlx5e_check_fragmented_striding_rq_cap(mdev))
 		netdev->vlan_features    |= NETIF_F_LRO;
 
 	netdev->hw_features       = netdev->vlan_features;
-- 
cgit v1.2.3-59-g8ed1b


From 422d4c401edd186722d7530ffdaf11a6bb542cab Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Mon, 2 Apr 2018 17:23:14 +0300
Subject: net/mlx5e: RX, Split WQ objects for different RQ types

Replace the common RQ WQ object with two separate ones for the
different RQ types.
This is in preparation for switching to using a cyclic WQ type
in Legacy RQ.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |   4 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 128 +++++++++++++++-------
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   |  35 +++---
 3 files changed, 110 insertions(+), 57 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 9b4ed83783e4..f2f2dcf6b23c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -498,10 +498,9 @@ enum mlx5e_rq_flag {
 
 struct mlx5e_rq {
 	/* data path */
-	struct mlx5_wq_ll      wq;
-
 	union {
 		struct {
+			struct mlx5_wq_ll      wq;
 			struct mlx5e_wqe_frag_info *frag_info;
 			u32 frag_sz;	/* max possible skb frag_sz */
 			union {
@@ -509,6 +508,7 @@ struct mlx5e_rq {
 			};
 		} wqe;
 		struct {
+			struct mlx5_wq_ll      wq;
 			struct mlx5e_umr_wqe   umr_wqe;
 			struct mlx5e_mpw_info *info;
 			mlx5e_fp_skb_from_cqe_mpwrq skb_from_cqe_mpwrq;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 3f1f0552843c..3a007717cba5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -319,10 +319,30 @@ static inline void mlx5e_build_umr_wqe(struct mlx5e_rq *rq,
 	ucseg->mkey_mask     = cpu_to_be64(MLX5_MKEY_MASK_FREE);
 }
 
+static u32 mlx5e_rqwq_get_size(struct mlx5e_rq *rq)
+{
+	switch (rq->wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		return mlx5_wq_ll_get_size(&rq->mpwqe.wq);
+	default:
+		return mlx5_wq_ll_get_size(&rq->wqe.wq);
+	}
+}
+
+static u32 mlx5e_rqwq_get_cur_sz(struct mlx5e_rq *rq)
+{
+	switch (rq->wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		return rq->mpwqe.wq.cur_sz;
+	default:
+		return rq->wqe.wq.cur_sz;
+	}
+}
+
 static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq,
 				     struct mlx5e_channel *c)
 {
-	int wq_sz = mlx5_wq_ll_get_size(&rq->wq);
+	int wq_sz = mlx5_wq_ll_get_size(&rq->mpwqe.wq);
 
 	rq->mpwqe.info = kzalloc_node(wq_sz * sizeof(*rq->mpwqe.info),
 				      GFP_KERNEL, cpu_to_node(c->cpu));
@@ -370,7 +390,7 @@ static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
 
 static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev *mdev, struct mlx5e_rq *rq)
 {
-	u64 num_mtts = MLX5E_REQUIRED_MTTS(mlx5_wq_ll_get_size(&rq->wq));
+	u64 num_mtts = MLX5E_REQUIRED_MTTS(mlx5_wq_ll_get_size(&rq->mpwqe.wq));
 
 	return mlx5e_create_umr_mkey(mdev, num_mtts, PAGE_SHIFT, &rq->umr_mkey);
 }
@@ -397,15 +417,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
 	rqp->wq.db_numa_node = cpu_to_node(c->cpu);
 
-	err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wq,
-				&rq->wq_ctrl);
-	if (err)
-		return err;
-
-	rq->wq.db = &rq->wq.db[MLX5_RCV_DBR];
-
-	wq_sz = mlx5_wq_ll_get_size(&rq->wq);
-
 	rq->wq_type = params->rq_wq_type;
 	rq->pdev    = c->pdev;
 	rq->netdev  = c->netdev;
@@ -434,8 +445,17 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
 	switch (rq->wq_type) {
 	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->mpwqe.wq,
+					&rq->wq_ctrl);
+		if (err)
+			return err;
+
+		rq->mpwqe.wq.db = &rq->mpwqe.wq.db[MLX5_RCV_DBR];
+
+		wq_sz = mlx5_wq_ll_get_size(&rq->mpwqe.wq);
 
 		pool_size = MLX5_MPWRQ_PAGES_PER_WQE << mlx5e_mpwqe_get_log_rq_size(params);
+
 		rq->post_wqes = mlx5e_post_rx_mpwqes;
 		rq->dealloc_wqe = mlx5e_dealloc_rx_mpwqe;
 
@@ -472,6 +492,15 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 			goto err_destroy_umr_mkey;
 		break;
 	default: /* MLX5_WQ_TYPE_LINKED_LIST */
+		err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wqe.wq,
+					&rq->wq_ctrl);
+		if (err)
+			return err;
+
+		rq->wqe.wq.db = &rq->wqe.wq.db[MLX5_RCV_DBR];
+
+		wq_sz = mlx5_wq_ll_get_size(&rq->wqe.wq);
+
 		rq->wqe.frag_info =
 			kzalloc_node(wq_sz * sizeof(*rq->wqe.frag_info),
 				     GFP_KERNEL, cpu_to_node(c->cpu));
@@ -538,16 +567,21 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 		goto err_rq_wq_destroy;
 
 	for (i = 0; i < wq_sz; i++) {
-		struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
-
 		if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
+			struct mlx5e_rx_wqe *wqe =
+				mlx5_wq_ll_get_wqe(&rq->mpwqe.wq, i);
 			u64 dma_offset = mlx5e_get_mpwqe_offset(rq, i);
 
 			wqe->data.addr = cpu_to_be64(dma_offset + rq->buff.headroom);
-		}
+			wqe->data.byte_count = cpu_to_be32(byte_count);
+			wqe->data.lkey = rq->mkey_be;
+		} else {
+			struct mlx5e_rx_wqe *wqe =
+				mlx5_wq_ll_get_wqe(&rq->wqe.wq, i);
 
-		wqe->data.byte_count = cpu_to_be32(byte_count);
-		wqe->data.lkey = rq->mkey_be;
+			wqe->data.byte_count = cpu_to_be32(byte_count);
+			wqe->data.lkey = rq->mkey_be;
+		}
 	}
 
 	INIT_WORK(&rq->dim.work, mlx5e_rx_dim_work);
@@ -744,51 +778,65 @@ static int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq, int wait_time)
 	unsigned long exp_time = jiffies + msecs_to_jiffies(wait_time);
 	struct mlx5e_channel *c = rq->channel;
 
-	struct mlx5_wq_ll *wq = &rq->wq;
-	u16 min_wqes = mlx5_min_rx_wqes(rq->wq_type, mlx5_wq_ll_get_size(wq));
+	u16 min_wqes = mlx5_min_rx_wqes(rq->wq_type, mlx5e_rqwq_get_size(rq));
 
 	do {
-		if (wq->cur_sz >= min_wqes)
+		if (mlx5e_rqwq_get_cur_sz(rq) >= min_wqes)
 			return 0;
 
 		msleep(20);
 	} while (time_before(jiffies, exp_time));
 
 	netdev_warn(c->netdev, "Failed to get min RX wqes on Channel[%d] RQN[0x%x] wq cur_sz(%d) min_rx_wqes(%d)\n",
-		    c->ix, rq->rqn, wq->cur_sz, min_wqes);
+		    c->ix, rq->rqn, mlx5e_rqwq_get_cur_sz(rq), min_wqes);
 
 	return -ETIMEDOUT;
 }
 
 static void mlx5e_free_rx_descs(struct mlx5e_rq *rq)
 {
-	struct mlx5_wq_ll *wq = &rq->wq;
-	struct mlx5e_rx_wqe *wqe;
 	__be16 wqe_ix_be;
 	u16 wqe_ix;
 
-	/* UMR WQE (if in progress) is always at wq->head */
-	if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ &&
-	    rq->mpwqe.umr_in_progress)
-		mlx5e_free_rx_mpwqe(rq, &rq->mpwqe.info[wq->head]);
-
-	while (!mlx5_wq_ll_is_empty(wq)) {
-		wqe_ix_be = *wq->tail_next;
-		wqe_ix    = be16_to_cpu(wqe_ix_be);
-		wqe       = mlx5_wq_ll_get_wqe(&rq->wq, wqe_ix);
-		rq->dealloc_wqe(rq, wqe_ix);
-		mlx5_wq_ll_pop(&rq->wq, wqe_ix_be,
-			       &wqe->next.next_wqe_index);
-	}
+	if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
+		struct mlx5_wq_ll *wq = &rq->mpwqe.wq;
+
+		if (rq->mpwqe.umr_in_progress)
+			mlx5e_free_rx_mpwqe(rq, &rq->mpwqe.info[wq->head]);
+
+		while (!mlx5_wq_ll_is_empty(wq)) {
+			struct mlx5e_rx_wqe *wqe;
+
+			wqe_ix_be = *wq->tail_next;
+			wqe_ix    = be16_to_cpu(wqe_ix_be);
+			wqe       = mlx5_wq_ll_get_wqe(wq, wqe_ix);
+			rq->dealloc_wqe(rq, wqe_ix);
+			mlx5_wq_ll_pop(wq, wqe_ix_be,
+				       &wqe->next.next_wqe_index);
+		}
+	} else {
+		struct mlx5_wq_ll *wq = &rq->wqe.wq;
+
+		while (!mlx5_wq_ll_is_empty(wq)) {
+			struct mlx5e_rx_wqe *wqe;
+
+			wqe_ix_be = *wq->tail_next;
+			wqe_ix    = be16_to_cpu(wqe_ix_be);
+			wqe       = mlx5_wq_ll_get_wqe(wq, wqe_ix);
+			rq->dealloc_wqe(rq, wqe_ix);
+			mlx5_wq_ll_pop(wq, wqe_ix_be,
+				       &wqe->next.next_wqe_index);
+		}
 
-	if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST && rq->wqe.page_reuse) {
 		/* Clean outstanding pages on handled WQEs that decided to do page-reuse,
 		 * but yet to be re-posted.
 		 */
-		int wq_sz = mlx5_wq_ll_get_size(&rq->wq);
+		if (rq->wqe.page_reuse) {
+			int wq_sz = mlx5_wq_ll_get_size(wq);
 
-		for (wqe_ix = 0; wqe_ix < wq_sz; wqe_ix++)
-			rq->dealloc_wqe(rq, wqe_ix);
+			for (wqe_ix = 0; wqe_ix < wq_sz; wqe_ix++)
+				rq->dealloc_wqe(rq, wqe_ix);
+		}
 	}
 }
 
@@ -2809,7 +2857,7 @@ static int mlx5e_alloc_drop_rq(struct mlx5_core_dev *mdev,
 
 	param->wq.db_numa_node = param->wq.buf_numa_node;
 
-	err = mlx5_wq_ll_create(mdev, &param->wq, rqc_wq, &rq->wq,
+	err = mlx5_wq_ll_create(mdev, &param->wq, rqc_wq, &rq->wqe.wq,
 				&rq->wq_ctrl);
 	if (err)
 		return err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 634540afdcfc..3b12d4de5b98 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -113,7 +113,7 @@ static inline void mlx5e_decompress_cqe(struct mlx5e_rq *rq,
 			mpwrq_get_cqe_consumed_strides(&cq->title);
 	else
 		cq->decmprs_wqe_counter =
-			mlx5_wq_ll_ctr2ix(&rq->wq, cq->decmprs_wqe_counter + 1);
+			mlx5_wq_ll_ctr2ix(&rq->wqe.wq, cq->decmprs_wqe_counter + 1);
 }
 
 static inline void mlx5e_decompress_cqe_no_hash(struct mlx5e_rq *rq,
@@ -369,7 +369,7 @@ void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi)
 
 static void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq)
 {
-	struct mlx5_wq_ll *wq = &rq->wq;
+	struct mlx5_wq_ll *wq = &rq->mpwqe.wq;
 	struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(wq, wq->head);
 
 	rq->mpwqe.umr_in_progress = false;
@@ -470,7 +470,7 @@ void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 
 bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 {
-	struct mlx5_wq_ll *wq = &rq->wq;
+	struct mlx5_wq_ll *wq = &rq->wqe.wq;
 	int err;
 
 	if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state)))
@@ -546,7 +546,7 @@ static void mlx5e_poll_ico_cq(struct mlx5e_cq *cq, struct mlx5e_rq *rq)
 
 bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq)
 {
-	struct mlx5_wq_ll *wq = &rq->wq;
+	struct mlx5_wq_ll *wq = &rq->mpwqe.wq;
 
 	if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state)))
 		return false;
@@ -987,6 +987,7 @@ struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
 
 void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
+	struct mlx5_wq_ll *wq = &rq->wqe.wq;
 	struct mlx5e_wqe_frag_info *wi;
 	struct mlx5e_rx_wqe *wqe;
 	__be16 wqe_counter_be;
@@ -996,7 +997,7 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 	wqe_counter_be = cqe->wqe_counter;
 	wqe_counter    = be16_to_cpu(wqe_counter_be);
-	wqe            = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
+	wqe            = mlx5_wq_ll_get_wqe(wq, wqe_counter);
 	wi             = &rq->wqe.frag_info[wqe_counter];
 	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
 
@@ -1018,7 +1019,7 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 	mlx5e_free_rx_wqe_reuse(rq, wi);
 wq_ll_pop:
-	mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
+	mlx5_wq_ll_pop(wq, wqe_counter_be,
 		       &wqe->next.next_wqe_index);
 }
 
@@ -1029,6 +1030,7 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 	struct mlx5e_rep_priv *rpriv  = priv->ppriv;
 	struct mlx5_eswitch_rep *rep = rpriv->rep;
+	struct mlx5_wq_ll *wq = &rq->wqe.wq;
 	struct mlx5e_wqe_frag_info *wi;
 	struct mlx5e_rx_wqe *wqe;
 	struct sk_buff *skb;
@@ -1038,7 +1040,7 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 	wqe_counter_be = cqe->wqe_counter;
 	wqe_counter    = be16_to_cpu(wqe_counter_be);
-	wqe            = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
+	wqe            = mlx5_wq_ll_get_wqe(wq, wqe_counter);
 	wi             = &rq->wqe.frag_info[wqe_counter];
 	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
 
@@ -1063,7 +1065,7 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 	mlx5e_free_rx_wqe_reuse(rq, wi);
 wq_ll_pop:
-	mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
+	mlx5_wq_ll_pop(wq, wqe_counter_be,
 		       &wqe->next.next_wqe_index);
 }
 #endif
@@ -1164,6 +1166,7 @@ void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	u32 head_offset    = wqe_offset & (PAGE_SIZE - 1);
 	u32 page_idx       = wqe_offset >> PAGE_SHIFT;
 	struct mlx5e_rx_wqe *wqe;
+	struct mlx5_wq_ll *wq;
 	struct sk_buff *skb;
 	u16 cqe_bcnt;
 
@@ -1193,9 +1196,10 @@ mpwrq_cqe_out:
 	if (likely(wi->consumed_strides < rq->mpwqe.num_strides))
 		return;
 
-	wqe = mlx5_wq_ll_get_wqe(&rq->wq, wqe_id);
+	wq  = &rq->mpwqe.wq;
+	wqe = mlx5_wq_ll_get_wqe(wq, wqe_id);
 	mlx5e_free_rx_mpwqe(rq, wi);
-	mlx5_wq_ll_pop(&rq->wq, cqe->wqe_id, &wqe->next.next_wqe_index);
+	mlx5_wq_ll_pop(wq, cqe->wqe_id, &wqe->next.next_wqe_index);
 }
 
 int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
@@ -1399,6 +1403,7 @@ static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq,
 
 void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
+	struct mlx5_wq_ll *wq = &rq->wqe.wq;
 	struct mlx5e_wqe_frag_info *wi;
 	struct mlx5e_rx_wqe *wqe;
 	__be16 wqe_counter_be;
@@ -1408,7 +1413,7 @@ void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 	wqe_counter_be = cqe->wqe_counter;
 	wqe_counter    = be16_to_cpu(wqe_counter_be);
-	wqe            = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
+	wqe            = mlx5_wq_ll_get_wqe(wq, wqe_counter);
 	wi             = &rq->wqe.frag_info[wqe_counter];
 	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
 
@@ -1425,7 +1430,7 @@ void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 wq_free_wqe:
 	mlx5e_free_rx_wqe_reuse(rq, wi);
-	mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
+	mlx5_wq_ll_pop(wq, wqe_counter_be,
 		       &wqe->next.next_wqe_index);
 }
 
@@ -1435,6 +1440,7 @@ wq_free_wqe:
 
 void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
+	struct mlx5_wq_ll *wq = &rq->wqe.wq;
 	struct mlx5e_wqe_frag_info *wi;
 	struct mlx5e_rx_wqe *wqe;
 	__be16 wqe_counter_be;
@@ -1444,7 +1450,7 @@ void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 	wqe_counter_be = cqe->wqe_counter;
 	wqe_counter    = be16_to_cpu(wqe_counter_be);
-	wqe            = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
+	wqe            = mlx5_wq_ll_get_wqe(wq, wqe_counter);
 	wi             = &rq->wqe.frag_info[wqe_counter];
 	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
 
@@ -1465,8 +1471,7 @@ void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 	mlx5e_free_rx_wqe_reuse(rq, wi);
 wq_ll_pop:
-	mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
-		       &wqe->next.next_wqe_index);
+	mlx5_wq_ll_pop(wq, wqe_counter_be, &wqe->next.next_wqe_index);
 }
 
 #endif /* CONFIG_MLX5_EN_IPSEC */
-- 
cgit v1.2.3-59-g8ed1b


From 99cbfa93a6122b1e9011d3f4e94b58e10d2f5cd0 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Mon, 2 Apr 2018 17:31:31 +0300
Subject: net/mlx5e: RX, Use cyclic WQ in legacy RQ

Now that LRO is not supported for Legacy RQ, there is no source of
out-of-order completions in the WQ, and we can use a cyclic one.
This has multiple advantages:
- reduces the WQE size (smaller PCI transactions).
- lower overhead in datapath (no handling of 'next' pointers).
- no reserved WQE for the WQ head (was need in linked-list).
- allows using a constant map between frag and dma_info struct, in downstream patch.

Performance tests:
ConnectX-4, single core, single RX ring.
Major gain in packet rate of single ring XDP drop.
Bottleneck is shifted form HW (at 16Mpps) to SW (at 20Mpps).

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |  10 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |  89 ++++++++++-------
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c  |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   | 115 ++++++++++------------
 drivers/net/ethernet/mellanox/mlx5/core/wq.c      |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/wq.h      |  55 ++++++++++-
 6 files changed, 161 insertions(+), 111 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index f2f2dcf6b23c..af521dd52993 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -186,9 +186,13 @@ struct mlx5e_tx_wqe {
 	struct mlx5_wqe_data_seg data[0];
 };
 
-struct mlx5e_rx_wqe {
+struct mlx5e_rx_wqe_ll {
 	struct mlx5_wqe_srq_next_seg  next;
-	struct mlx5_wqe_data_seg      data;
+	struct mlx5_wqe_data_seg      data[0];
+};
+
+struct mlx5e_rx_wqe_cyc {
+	struct mlx5_wqe_data_seg      data[0];
 };
 
 struct mlx5e_umr_wqe {
@@ -500,7 +504,7 @@ struct mlx5e_rq {
 	/* data path */
 	union {
 		struct {
-			struct mlx5_wq_ll      wq;
+			struct mlx5_wq_cyc     wq;
 			struct mlx5e_wqe_frag_info *frag_info;
 			u32 frag_sz;	/* max possible skb frag_sz */
 			union {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 3a007717cba5..7fd2d736fbb1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -166,7 +166,7 @@ static u16 mlx5e_get_rq_headroom(struct mlx5_core_dev *mdev,
 
 	linear_rq_headroom += NET_IP_ALIGN;
 
-	if (params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST)
+	if (params->rq_wq_type == MLX5_WQ_TYPE_CYCLIC)
 		return linear_rq_headroom;
 
 	if (mlx5e_rx_mpwqe_is_linear_skb(mdev, params))
@@ -205,7 +205,7 @@ void mlx5e_set_rq_type(struct mlx5_core_dev *mdev, struct mlx5e_params *params)
 	params->rq_wq_type = mlx5e_striding_rq_possible(mdev, params) &&
 		MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ) ?
 		MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ :
-		MLX5_WQ_TYPE_LINKED_LIST;
+		MLX5_WQ_TYPE_CYCLIC;
 }
 
 static void mlx5e_update_carrier(struct mlx5e_priv *priv)
@@ -325,7 +325,7 @@ static u32 mlx5e_rqwq_get_size(struct mlx5e_rq *rq)
 	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
 		return mlx5_wq_ll_get_size(&rq->mpwqe.wq);
 	default:
-		return mlx5_wq_ll_get_size(&rq->wqe.wq);
+		return mlx5_wq_cyc_get_size(&rq->wqe.wq);
 	}
 }
 
@@ -491,15 +491,15 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 		if (err)
 			goto err_destroy_umr_mkey;
 		break;
-	default: /* MLX5_WQ_TYPE_LINKED_LIST */
-		err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wqe.wq,
-					&rq->wq_ctrl);
+	default: /* MLX5_WQ_TYPE_CYCLIC */
+		err = mlx5_wq_cyc_create(mdev, &rqp->wq, rqc_wq, &rq->wqe.wq,
+					 &rq->wq_ctrl);
 		if (err)
 			return err;
 
 		rq->wqe.wq.db = &rq->wqe.wq.db[MLX5_RCV_DBR];
 
-		wq_sz = mlx5_wq_ll_get_size(&rq->wqe.wq);
+		wq_sz = mlx5_wq_cyc_get_size(&rq->wqe.wq);
 
 		rq->wqe.frag_info =
 			kzalloc_node(wq_sz * sizeof(*rq->wqe.frag_info),
@@ -568,19 +568,19 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
 	for (i = 0; i < wq_sz; i++) {
 		if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
-			struct mlx5e_rx_wqe *wqe =
+			struct mlx5e_rx_wqe_ll *wqe =
 				mlx5_wq_ll_get_wqe(&rq->mpwqe.wq, i);
 			u64 dma_offset = mlx5e_get_mpwqe_offset(rq, i);
 
-			wqe->data.addr = cpu_to_be64(dma_offset + rq->buff.headroom);
-			wqe->data.byte_count = cpu_to_be32(byte_count);
-			wqe->data.lkey = rq->mkey_be;
+			wqe->data[0].addr = cpu_to_be64(dma_offset + rq->buff.headroom);
+			wqe->data[0].byte_count = cpu_to_be32(byte_count);
+			wqe->data[0].lkey = rq->mkey_be;
 		} else {
-			struct mlx5e_rx_wqe *wqe =
-				mlx5_wq_ll_get_wqe(&rq->wqe.wq, i);
+			struct mlx5e_rx_wqe_cyc *wqe =
+				mlx5_wq_cyc_get_wqe(&rq->wqe.wq, i);
 
-			wqe->data.byte_count = cpu_to_be32(byte_count);
-			wqe->data.lkey = rq->mkey_be;
+			wqe->data[0].byte_count = cpu_to_be32(byte_count);
+			wqe->data[0].lkey = rq->mkey_be;
 		}
 	}
 
@@ -630,7 +630,7 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
 		kfree(rq->mpwqe.info);
 		mlx5_core_destroy_mkey(rq->mdev, &rq->umr_mkey);
 		break;
-	default: /* MLX5_WQ_TYPE_LINKED_LIST */
+	default: /* MLX5_WQ_TYPE_CYCLIC */
 		kfree(rq->wqe.frag_info);
 	}
 
@@ -801,11 +801,12 @@ static void mlx5e_free_rx_descs(struct mlx5e_rq *rq)
 	if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
 		struct mlx5_wq_ll *wq = &rq->mpwqe.wq;
 
+		/* UMR WQE (if in progress) is always at wq->head */
 		if (rq->mpwqe.umr_in_progress)
 			mlx5e_free_rx_mpwqe(rq, &rq->mpwqe.info[wq->head]);
 
 		while (!mlx5_wq_ll_is_empty(wq)) {
-			struct mlx5e_rx_wqe *wqe;
+			struct mlx5e_rx_wqe_ll *wqe;
 
 			wqe_ix_be = *wq->tail_next;
 			wqe_ix    = be16_to_cpu(wqe_ix_be);
@@ -815,24 +816,19 @@ static void mlx5e_free_rx_descs(struct mlx5e_rq *rq)
 				       &wqe->next.next_wqe_index);
 		}
 	} else {
-		struct mlx5_wq_ll *wq = &rq->wqe.wq;
-
-		while (!mlx5_wq_ll_is_empty(wq)) {
-			struct mlx5e_rx_wqe *wqe;
+		struct mlx5_wq_cyc *wq = &rq->wqe.wq;
 
-			wqe_ix_be = *wq->tail_next;
-			wqe_ix    = be16_to_cpu(wqe_ix_be);
-			wqe       = mlx5_wq_ll_get_wqe(wq, wqe_ix);
+		while (!mlx5_wq_cyc_is_empty(wq)) {
+			wqe_ix = mlx5_wq_cyc_get_tail(wq);
 			rq->dealloc_wqe(rq, wqe_ix);
-			mlx5_wq_ll_pop(wq, wqe_ix_be,
-				       &wqe->next.next_wqe_index);
+			mlx5_wq_cyc_pop(wq);
 		}
 
 		/* Clean outstanding pages on handled WQEs that decided to do page-reuse,
 		 * but yet to be re-posted.
 		 */
 		if (rq->wqe.page_reuse) {
-			int wq_sz = mlx5_wq_ll_get_size(wq);
+			int wq_sz = mlx5_wq_cyc_get_size(wq);
 
 			for (wqe_ix = 0; wqe_ix < wq_sz; wqe_ix++)
 				rq->dealloc_wqe(rq, wqe_ix);
@@ -1958,6 +1954,21 @@ static void mlx5e_close_channel(struct mlx5e_channel *c)
 	kfree(c);
 }
 
+static inline u8 mlx5e_get_rqwq_log_stride(u8 wq_type, int ndsegs)
+{
+	int sz = sizeof(struct mlx5_wqe_data_seg) * ndsegs;
+
+	switch (wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		sz += sizeof(struct mlx5e_rx_wqe_ll);
+		break;
+	default: /* MLX5_WQ_TYPE_CYCLIC */
+		sz += sizeof(struct mlx5e_rx_wqe_cyc);
+	}
+
+	return order_base_2(sz);
+}
+
 static void mlx5e_build_rq_param(struct mlx5e_priv *priv,
 				 struct mlx5e_params *params,
 				 struct mlx5e_rq_param *param)
@@ -1965,6 +1976,7 @@ static void mlx5e_build_rq_param(struct mlx5e_priv *priv,
 	struct mlx5_core_dev *mdev = priv->mdev;
 	void *rqc = param->rqc;
 	void *wq = MLX5_ADDR_OF(rqc, rqc, wq);
+	int ndsegs = 1;
 
 	switch (params->rq_wq_type) {
 	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
@@ -1974,16 +1986,16 @@ static void mlx5e_build_rq_param(struct mlx5e_priv *priv,
 		MLX5_SET(wq, wq, log_wqe_stride_size,
 			 mlx5e_mpwqe_get_log_stride_size(mdev, params) -
 			 MLX5_MPWQE_LOG_STRIDE_SZ_BASE);
-		MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ);
 		MLX5_SET(wq, wq, log_wq_sz, mlx5e_mpwqe_get_log_rq_size(params));
 		break;
-	default: /* MLX5_WQ_TYPE_LINKED_LIST */
-		MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST);
+	default: /* MLX5_WQ_TYPE_CYCLIC */
 		MLX5_SET(wq, wq, log_wq_sz, params->log_rq_mtu_frames);
 	}
 
+	MLX5_SET(wq, wq, wq_type,          params->rq_wq_type);
 	MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
-	MLX5_SET(wq, wq, log_wq_stride,    ilog2(sizeof(struct mlx5e_rx_wqe)));
+	MLX5_SET(wq, wq, log_wq_stride,
+		 mlx5e_get_rqwq_log_stride(params->rq_wq_type, ndsegs));
 	MLX5_SET(wq, wq, pd,               mdev->mlx5e_res.pdn);
 	MLX5_SET(rqc, rqc, counter_set_id, priv->q_counter);
 	MLX5_SET(rqc, rqc, vsd,            params->vlan_strip_disable);
@@ -1999,8 +2011,9 @@ static void mlx5e_build_drop_rq_param(struct mlx5e_priv *priv,
 	void *rqc = param->rqc;
 	void *wq = MLX5_ADDR_OF(rqc, rqc, wq);
 
-	MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST);
-	MLX5_SET(wq, wq, log_wq_stride,    ilog2(sizeof(struct mlx5e_rx_wqe)));
+	MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC);
+	MLX5_SET(wq, wq, log_wq_stride,
+		 mlx5e_get_rqwq_log_stride(MLX5_WQ_TYPE_CYCLIC, 1));
 	MLX5_SET(rqc, rqc, counter_set_id, priv->drop_rq_q_counter);
 
 	param->wq.buf_numa_node = dev_to_node(&mdev->pdev->dev);
@@ -2051,7 +2064,7 @@ static void mlx5e_build_rx_cq_param(struct mlx5e_priv *priv,
 		log_cq_size = mlx5e_mpwqe_get_log_rq_size(params) +
 			mlx5e_mpwqe_get_log_num_strides(mdev, params);
 		break;
-	default: /* MLX5_WQ_TYPE_LINKED_LIST */
+	default: /* MLX5_WQ_TYPE_CYCLIC */
 		log_cq_size = params->log_rq_mtu_frames;
 	}
 
@@ -2857,8 +2870,8 @@ static int mlx5e_alloc_drop_rq(struct mlx5_core_dev *mdev,
 
 	param->wq.db_numa_node = param->wq.buf_numa_node;
 
-	err = mlx5_wq_ll_create(mdev, &param->wq, rqc_wq, &rq->wqe.wq,
-				&rq->wq_ctrl);
+	err = mlx5_wq_cyc_create(mdev, &param->wq, rqc_wq, &rq->wqe.wq,
+				 &rq->wq_ctrl);
 	if (err)
 		return err;
 
@@ -3360,7 +3373,7 @@ static int set_feature_lro(struct net_device *netdev, bool enable)
 	new_channels.params = *old_params;
 	new_channels.params.lro_en = enable;
 
-	if (old_params->rq_wq_type != MLX5_WQ_TYPE_LINKED_LIST) {
+	if (old_params->rq_wq_type != MLX5_WQ_TYPE_CYCLIC) {
 		if (mlx5e_rx_mpwqe_is_linear_skb(mdev, old_params) ==
 		    mlx5e_rx_mpwqe_is_linear_skb(mdev, &new_channels.params))
 			reset = false;
@@ -3566,7 +3579,7 @@ int mlx5e_change_mtu(struct net_device *netdev, int new_mtu,
 	new_channels.params = *params;
 	new_channels.params.sw_mtu = new_mtu;
 
-	if (params->rq_wq_type != MLX5_WQ_TYPE_LINKED_LIST) {
+	if (params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
 		u8 ppw_old = mlx5e_mpwqe_log_pkts_per_wqe(params);
 		u8 ppw_new = mlx5e_mpwqe_log_pkts_per_wqe(&new_channels.params);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 8ab4c96b7f7c..3857f22b5500 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -927,7 +927,7 @@ static void mlx5e_build_rep_params(struct mlx5_core_dev *mdev,
 	params->hard_mtu    = MLX5E_ETH_HARD_MTU;
 	params->sw_mtu      = mtu;
 	params->log_sq_size = MLX5E_REP_PARAMS_LOG_SQ_SIZE;
-	params->rq_wq_type  = MLX5_WQ_TYPE_LINKED_LIST;
+	params->rq_wq_type  = MLX5_WQ_TYPE_CYCLIC;
 	params->log_rq_mtu_frames = MLX5E_REP_PARAMS_LOG_RQ_SIZE;
 
 	params->rx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 3b12d4de5b98..3cdf2c097356 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -113,7 +113,7 @@ static inline void mlx5e_decompress_cqe(struct mlx5e_rq *rq,
 			mpwrq_get_cqe_consumed_strides(&cq->title);
 	else
 		cq->decmprs_wqe_counter =
-			mlx5_wq_ll_ctr2ix(&rq->wqe.wq, cq->decmprs_wqe_counter + 1);
+			mlx5_wq_cyc_ctr2ix(&rq->wqe.wq, cq->decmprs_wqe_counter + 1);
 }
 
 static inline void mlx5e_decompress_cqe_no_hash(struct mlx5e_rq *rq,
@@ -270,7 +270,12 @@ static inline bool mlx5e_page_reuse(struct mlx5e_rq *rq,
 		!mlx5e_page_is_reserved(wi->di.page);
 }
 
-static int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix)
+static inline struct mlx5e_wqe_frag_info *get_frag(struct mlx5e_rq *rq, u16 ix)
+{
+	return &rq->wqe.frag_info[ix];
+}
+
+static int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe_cyc *wqe, u16 ix)
 {
 	struct mlx5e_wqe_frag_info *wi = &rq->wqe.frag_info[ix];
 
@@ -281,7 +286,7 @@ static int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16
 		wi->offset = 0;
 	}
 
-	wqe->data.addr = cpu_to_be64(wi->di.addr + wi->offset + rq->buff.headroom);
+	wqe->data[0].addr = cpu_to_be64(wi->di.addr + wi->offset + rq->buff.headroom);
 	return 0;
 }
 
@@ -370,7 +375,7 @@ void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi)
 static void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq)
 {
 	struct mlx5_wq_ll *wq = &rq->mpwqe.wq;
-	struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(wq, wq->head);
+	struct mlx5e_rx_wqe_ll *wqe = mlx5_wq_ll_get_wqe(wq, wq->head);
 
 	rq->mpwqe.umr_in_progress = false;
 
@@ -470,31 +475,32 @@ void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 
 bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 {
-	struct mlx5_wq_ll *wq = &rq->wqe.wq;
+	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
 	int err;
 
 	if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state)))
 		return false;
 
-	if (mlx5_wq_ll_is_full(wq))
+	if (mlx5_wq_cyc_is_full(wq))
 		return false;
 
 	do {
-		struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(wq, wq->head);
+		u16 head = mlx5_wq_cyc_get_head(wq);
+		struct mlx5e_rx_wqe_cyc *wqe = mlx5_wq_cyc_get_wqe(wq, head);
 
-		err = mlx5e_alloc_rx_wqe(rq, wqe, wq->head);
+		err = mlx5e_alloc_rx_wqe(rq, wqe, head);
 		if (unlikely(err)) {
 			rq->stats->buff_alloc_err++;
 			break;
 		}
 
-		mlx5_wq_ll_push(wq, be16_to_cpu(wqe->next.next_wqe_index));
-	} while (!mlx5_wq_ll_is_full(wq));
+		mlx5_wq_cyc_push(wq);
+	} while (!mlx5_wq_cyc_is_full(wq));
 
 	/* ensure wqes are visible to device before updating doorbell record */
 	dma_wmb();
 
-	mlx5_wq_ll_update_db_record(wq);
+	mlx5_wq_cyc_update_db_record(wq);
 
 	return !!err;
 }
@@ -987,19 +993,15 @@ struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
 
 void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
-	struct mlx5_wq_ll *wq = &rq->wqe.wq;
+	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
 	struct mlx5e_wqe_frag_info *wi;
-	struct mlx5e_rx_wqe *wqe;
-	__be16 wqe_counter_be;
 	struct sk_buff *skb;
-	u16 wqe_counter;
 	u32 cqe_bcnt;
+	u16 ci;
 
-	wqe_counter_be = cqe->wqe_counter;
-	wqe_counter    = be16_to_cpu(wqe_counter_be);
-	wqe            = mlx5_wq_ll_get_wqe(wq, wqe_counter);
-	wi             = &rq->wqe.frag_info[wqe_counter];
-	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
+	ci       = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter));
+	wi       = get_frag(rq, ci);
+	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
 	skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
 	if (!skb) {
@@ -1007,20 +1009,19 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 		if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) {
 			wi->di.page = NULL;
 			/* do not return page to cache, it will be returned on XDP_TX completion */
-			goto wq_ll_pop;
+			goto wq_cyc_pop;
 		}
 		/* probably an XDP_DROP, save the page-reuse checks */
 		mlx5e_free_rx_wqe(rq, wi);
-		goto wq_ll_pop;
+		goto wq_cyc_pop;
 	}
 
 	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
 	napi_gro_receive(rq->cq.napi, skb);
 
 	mlx5e_free_rx_wqe_reuse(rq, wi);
-wq_ll_pop:
-	mlx5_wq_ll_pop(wq, wqe_counter_be,
-		       &wqe->next.next_wqe_index);
+wq_cyc_pop:
+	mlx5_wq_cyc_pop(wq);
 }
 
 #ifdef CONFIG_MLX5_ESWITCH
@@ -1030,30 +1031,26 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 	struct mlx5e_rep_priv *rpriv  = priv->ppriv;
 	struct mlx5_eswitch_rep *rep = rpriv->rep;
-	struct mlx5_wq_ll *wq = &rq->wqe.wq;
+	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
 	struct mlx5e_wqe_frag_info *wi;
-	struct mlx5e_rx_wqe *wqe;
 	struct sk_buff *skb;
-	__be16 wqe_counter_be;
-	u16 wqe_counter;
 	u32 cqe_bcnt;
+	u16 ci;
 
-	wqe_counter_be = cqe->wqe_counter;
-	wqe_counter    = be16_to_cpu(wqe_counter_be);
-	wqe            = mlx5_wq_ll_get_wqe(wq, wqe_counter);
-	wi             = &rq->wqe.frag_info[wqe_counter];
-	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
+	ci       = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter));
+	wi       = get_frag(rq, ci);
+	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
 	skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
 	if (!skb) {
 		if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) {
 			wi->di.page = NULL;
 			/* do not return page to cache, it will be returned on XDP_TX completion */
-			goto wq_ll_pop;
+			goto wq_cyc_pop;
 		}
 		/* probably an XDP_DROP, save the page-reuse checks */
 		mlx5e_free_rx_wqe(rq, wi);
-		goto wq_ll_pop;
+		goto wq_cyc_pop;
 	}
 
 	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
@@ -1064,9 +1061,8 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	napi_gro_receive(rq->cq.napi, skb);
 
 	mlx5e_free_rx_wqe_reuse(rq, wi);
-wq_ll_pop:
-	mlx5_wq_ll_pop(wq, wqe_counter_be,
-		       &wqe->next.next_wqe_index);
+wq_cyc_pop:
+	mlx5_wq_cyc_pop(wq);
 }
 #endif
 
@@ -1165,7 +1161,7 @@ void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	u32 wqe_offset     = stride_ix << rq->mpwqe.log_stride_sz;
 	u32 head_offset    = wqe_offset & (PAGE_SIZE - 1);
 	u32 page_idx       = wqe_offset >> PAGE_SHIFT;
-	struct mlx5e_rx_wqe *wqe;
+	struct mlx5e_rx_wqe_ll *wqe;
 	struct mlx5_wq_ll *wq;
 	struct sk_buff *skb;
 	u16 cqe_bcnt;
@@ -1403,19 +1399,15 @@ static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq,
 
 void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
-	struct mlx5_wq_ll *wq = &rq->wqe.wq;
+	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
 	struct mlx5e_wqe_frag_info *wi;
-	struct mlx5e_rx_wqe *wqe;
-	__be16 wqe_counter_be;
 	struct sk_buff *skb;
-	u16 wqe_counter;
 	u32 cqe_bcnt;
+	u16 ci;
 
-	wqe_counter_be = cqe->wqe_counter;
-	wqe_counter    = be16_to_cpu(wqe_counter_be);
-	wqe            = mlx5_wq_ll_get_wqe(wq, wqe_counter);
-	wi             = &rq->wqe.frag_info[wqe_counter];
-	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
+	ci       = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter));
+	wi       = get_frag(rq, ci);
+	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
 	skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
 	if (!skb)
@@ -1430,8 +1422,7 @@ void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 wq_free_wqe:
 	mlx5e_free_rx_wqe_reuse(rq, wi);
-	mlx5_wq_ll_pop(wq, wqe_counter_be,
-		       &wqe->next.next_wqe_index);
+	mlx5_wq_cyc_pop(wq);
 }
 
 #endif /* CONFIG_MLX5_CORE_IPOIB */
@@ -1440,38 +1431,34 @@ wq_free_wqe:
 
 void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
-	struct mlx5_wq_ll *wq = &rq->wqe.wq;
+	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
 	struct mlx5e_wqe_frag_info *wi;
-	struct mlx5e_rx_wqe *wqe;
-	__be16 wqe_counter_be;
 	struct sk_buff *skb;
-	u16 wqe_counter;
 	u32 cqe_bcnt;
+	u16 ci;
 
-	wqe_counter_be = cqe->wqe_counter;
-	wqe_counter    = be16_to_cpu(wqe_counter_be);
-	wqe            = mlx5_wq_ll_get_wqe(wq, wqe_counter);
-	wi             = &rq->wqe.frag_info[wqe_counter];
-	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
+	ci       = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter));
+	wi       = get_frag(rq, ci);
+	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
 	skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
 	if (unlikely(!skb)) {
 		/* a DROP, save the page-reuse checks */
 		mlx5e_free_rx_wqe(rq, wi);
-		goto wq_ll_pop;
+		goto wq_cyc_pop;
 	}
 	skb = mlx5e_ipsec_handle_rx_skb(rq->netdev, skb);
 	if (unlikely(!skb)) {
 		mlx5e_free_rx_wqe(rq, wi);
-		goto wq_ll_pop;
+		goto wq_cyc_pop;
 	}
 
 	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
 	napi_gro_receive(rq->cq.napi, skb);
 
 	mlx5e_free_rx_wqe_reuse(rq, wi);
-wq_ll_pop:
-	mlx5_wq_ll_pop(wq, wqe_counter_be, &wqe->next.next_wqe_index);
+wq_cyc_pop:
+	mlx5_wq_cyc_pop(wq);
 }
 
 #endif /* CONFIG_MLX5_EN_IPSEC */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.c b/drivers/net/ethernet/mellanox/mlx5/core/wq.c
index 5b8b35392025..b97bb72b4db4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/wq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.c
@@ -85,6 +85,7 @@ int mlx5_wq_cyc_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 	mlx5_fill_fbc(MLX5_GET(wq, wqc, log_wq_stride),
 		      MLX5_GET(wq, wqc, log_wq_sz),
 		      fbc);
+	wq->sz    = wq->fbc.sz_m1 + 1;
 
 	err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node);
 	if (err) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.h b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
index b9d7c01fc7cb..0b47126815b6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/wq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
@@ -51,6 +51,9 @@ struct mlx5_wq_ctrl {
 struct mlx5_wq_cyc {
 	struct mlx5_frag_buf_ctrl fbc;
 	__be32			*db;
+	u16			sz;
+	u16			wqe_ctr;
+	u16			cur_sz;
 };
 
 struct mlx5_wq_qp {
@@ -95,6 +98,43 @@ u32 mlx5_wq_ll_get_size(struct mlx5_wq_ll *wq);
 
 void mlx5_wq_destroy(struct mlx5_wq_ctrl *wq_ctrl);
 
+static inline int mlx5_wq_cyc_is_full(struct mlx5_wq_cyc *wq)
+{
+	return wq->cur_sz == wq->sz;
+}
+
+static inline int mlx5_wq_cyc_missing(struct mlx5_wq_cyc *wq)
+{
+	return wq->sz - wq->cur_sz;
+}
+
+static inline int mlx5_wq_cyc_is_empty(struct mlx5_wq_cyc *wq)
+{
+	return !wq->cur_sz;
+}
+
+static inline void mlx5_wq_cyc_push(struct mlx5_wq_cyc *wq)
+{
+	wq->wqe_ctr++;
+	wq->cur_sz++;
+}
+
+static inline void mlx5_wq_cyc_push_n(struct mlx5_wq_cyc *wq, u8 n)
+{
+	wq->wqe_ctr += n;
+	wq->cur_sz += n;
+}
+
+static inline void mlx5_wq_cyc_pop(struct mlx5_wq_cyc *wq)
+{
+	wq->cur_sz--;
+}
+
+static inline void mlx5_wq_cyc_update_db_record(struct mlx5_wq_cyc *wq)
+{
+	*wq->db = cpu_to_be32(wq->wqe_ctr);
+}
+
 static inline u16 mlx5_wq_cyc_ctr2ix(struct mlx5_wq_cyc *wq, u16 ctr)
 {
 	return ctr & wq->fbc.sz_m1;
@@ -105,6 +145,16 @@ static inline u16 mlx5_wq_cyc_ctr2fragix(struct mlx5_wq_cyc *wq, u16 ctr)
 	return ctr & wq->fbc.frag_sz_m1;
 }
 
+static inline u16 mlx5_wq_cyc_get_head(struct mlx5_wq_cyc *wq)
+{
+	return mlx5_wq_cyc_ctr2ix(wq, wq->wqe_ctr);
+}
+
+static inline u16 mlx5_wq_cyc_get_tail(struct mlx5_wq_cyc *wq)
+{
+	return mlx5_wq_cyc_ctr2ix(wq, wq->wqe_ctr - wq->cur_sz);
+}
+
 static inline void *mlx5_wq_cyc_get_wqe(struct mlx5_wq_cyc *wq, u16 ix)
 {
 	return mlx5_frag_buf_get_wqe(&wq->fbc, ix);
@@ -179,11 +229,6 @@ static inline int mlx5_wq_ll_is_empty(struct mlx5_wq_ll *wq)
 	return !wq->cur_sz;
 }
 
-static inline u16 mlx5_wq_ll_ctr2ix(struct mlx5_wq_ll *wq, u16 ctr)
-{
-	return ctr & wq->fbc.sz_m1;
-}
-
 static inline void *mlx5_wq_ll_get_wqe(struct mlx5_wq_ll *wq, u16 ix)
 {
 	return mlx5_frag_buf_get_wqe(&wq->fbc, ix);
-- 
cgit v1.2.3-59-g8ed1b


From 069d11465a802a2b58ce530312b7bb6e1e0b61a9 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Wed, 2 May 2018 18:23:58 +0300
Subject: net/mlx5e: RX, Enhance legacy Receive Queue memory scheme

Enhance the memory scheme of the legacy RQ, such that
only order-0 pages are used.

Whenever possible, prefer using a linear SKB, and build it
wrapping the WQE buffer.

Otherwise (for example, jumbo frames on x86), use non-linear SKB,
with as many frags as needed. In this case, multiple WQE
scatter entries are used, up to a maximum of 4 frags and 10KB of MTU.

This implied to remove support of HW LRO in legacy RQ, as it would
require large number of page allocations and scatter entries per WQE
on archs with PAGE_SIZE = 4KB, yielding bad performance.

In earlier patches, we guaranteed that all completions are in-order,
and that we use a cyclic WQ.
This creates an oppurtunity for a performance optimization:
The mapping between a "struct mlx5e_dma_info", and the
WQEs (struct mlx5e_wqe_frag_info) pointing to it, is constant
across different cycles of a WQ. This allows initializing
the mapping in the time of RQ creation, and not handle it
in datapath.

A struct mlx5e_dma_info that is shared between different WQEs
is allocated by the first WQE, and freed by the last one.
This implies an important requirement: WQEs that share the same
struct mlx5e_dma_info must be posted within the same NAPI.
Otherwise, upon completion, struct mlx5e_wqe_frag_info would mistakenly
point to the new struct mlx5e_dma_info, not the one that was posted
(and the HW wrote to).
This bulking requirement is actually good also for performance reasons,
hence we extend the bulk beyong the minimal requirement above.

With this memory scheme, the RQs memory footprint is reduce by a
factor of 2 on x86, and by a factor of 32 on PowerPC.
Same factors apply for the number of pages in a GRO session.

Performance tests:
ConnectX-4, single core, single RX ring, default MTU.

x86:
CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz

Packet rate (early drop in TC): no degradation
TCP streams: ~5% improvement

PowerPC:
CPU: POWER8 (raw), altivec supported

Packet rate (early drop in TC): 20% gain
TCP streams: 25% gain

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |  44 +++-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 238 ++++++++++++++++------
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   | 207 +++++++++++++------
 3 files changed, 362 insertions(+), 127 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index af521dd52993..eb9eb7aa953a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -101,11 +101,15 @@ struct page_pool;
 	(MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE_MPW + \
 	 (MLX5_MPWRQ_LOG_WQE_SZ - MLX5E_ORDER2_MAX_PACKET_MTU))
 
+#define MLX5E_MIN_SKB_FRAG_SZ		(MLX5_SKB_FRAG_SZ(MLX5_RX_HEADROOM))
+#define MLX5E_LOG_MAX_RX_WQE_BULK	\
+	(ilog2(PAGE_SIZE / roundup_pow_of_two(MLX5E_MIN_SKB_FRAG_SZ)))
+
 #define MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE                0x6
 #define MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE                0xa
 #define MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE                0xd
 
-#define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE                0x1
+#define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE (1 + MLX5E_LOG_MAX_RX_WQE_BULK)
 #define MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE                0xa
 #define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE min_t(u8, 0xd,	\
 					       MLX5E_LOG_MAX_RQ_NUM_PACKETS_MPW)
@@ -462,8 +466,9 @@ struct mlx5e_dma_info {
 };
 
 struct mlx5e_wqe_frag_info {
-	struct mlx5e_dma_info di;
+	struct mlx5e_dma_info *di;
 	u32 offset;
+	bool last_in_page;
 };
 
 struct mlx5e_umr_dma_info {
@@ -476,6 +481,8 @@ struct mlx5e_mpw_info {
 	DECLARE_BITMAP(xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE);
 };
 
+#define MLX5E_MAX_RX_FRAGS 4
+
 /* a single cache unit is capable to serve one napi call (for non-striding rq)
  * or a MPWQE (for striding rq).
  */
@@ -493,6 +500,9 @@ typedef void (*mlx5e_fp_handle_rx_cqe)(struct mlx5e_rq*, struct mlx5_cqe64*);
 typedef struct sk_buff *
 (*mlx5e_fp_skb_from_cqe_mpwrq)(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
 			       u16 cqe_bcnt, u32 head_offset, u32 page_idx);
+typedef struct sk_buff *
+(*mlx5e_fp_skb_from_cqe)(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+			 struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt);
 typedef bool (*mlx5e_fp_post_rx_wqes)(struct mlx5e_rq *rq);
 typedef void (*mlx5e_fp_dealloc_wqe)(struct mlx5e_rq*, u16);
 
@@ -500,16 +510,27 @@ enum mlx5e_rq_flag {
 	MLX5E_RQ_FLAG_XDP_XMIT = BIT(0),
 };
 
+struct mlx5e_rq_frag_info {
+	int frag_size;
+	int frag_stride;
+};
+
+struct mlx5e_rq_frags_info {
+	struct mlx5e_rq_frag_info arr[MLX5E_MAX_RX_FRAGS];
+	u8 num_frags;
+	u8 log_num_frags;
+	u8 wqe_bulk;
+};
+
 struct mlx5e_rq {
 	/* data path */
 	union {
 		struct {
-			struct mlx5_wq_cyc     wq;
-			struct mlx5e_wqe_frag_info *frag_info;
-			u32 frag_sz;	/* max possible skb frag_sz */
-			union {
-				bool page_reuse;
-			};
+			struct mlx5_wq_cyc          wq;
+			struct mlx5e_wqe_frag_info *frags;
+			struct mlx5e_dma_info      *di;
+			struct mlx5e_rq_frags_info  info;
+			mlx5e_fp_skb_from_cqe       skb_from_cqe;
 		} wqe;
 		struct {
 			struct mlx5_wq_ll      wq;
@@ -523,7 +544,6 @@ struct mlx5e_rq {
 	};
 	struct {
 		u16            headroom;
-		u8             page_order;
 		u8             map_dir;   /* dma map direction */
 	} buff;
 
@@ -879,6 +899,12 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
 struct sk_buff *
 mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
 				   u16 cqe_bcnt, u32 head_offset, u32 page_idx);
+struct sk_buff *
+mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+			  struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt);
+struct sk_buff *
+mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+			     struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt);
 
 void mlx5e_update_stats(struct mlx5e_priv *priv);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 7fd2d736fbb1..2c634e50d051 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -51,6 +51,7 @@
 struct mlx5e_rq_param {
 	u32			rqc[MLX5_ST_SZ_DW(rqc)];
 	struct mlx5_wq_param	wq;
+	struct mlx5e_rq_frags_info frags_info;
 };
 
 struct mlx5e_sq_param {
@@ -93,7 +94,7 @@ bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev)
 	return true;
 }
 
-static u32 mlx5e_mpwqe_get_linear_frag_sz(struct mlx5e_params *params)
+static u32 mlx5e_rx_get_linear_frag_sz(struct mlx5e_params *params)
 {
 	if (!params->xdp_prog) {
 		u16 hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu);
@@ -107,19 +108,27 @@ static u32 mlx5e_mpwqe_get_linear_frag_sz(struct mlx5e_params *params)
 
 static u8 mlx5e_mpwqe_log_pkts_per_wqe(struct mlx5e_params *params)
 {
-	u32 linear_frag_sz = mlx5e_mpwqe_get_linear_frag_sz(params);
+	u32 linear_frag_sz = mlx5e_rx_get_linear_frag_sz(params);
 
 	return MLX5_MPWRQ_LOG_WQE_SZ - order_base_2(linear_frag_sz);
 }
 
+static bool mlx5e_rx_is_linear_skb(struct mlx5_core_dev *mdev,
+				   struct mlx5e_params *params)
+{
+	u32 frag_sz = mlx5e_rx_get_linear_frag_sz(params);
+
+	return !params->lro_en && frag_sz <= PAGE_SIZE;
+}
+
 static bool mlx5e_rx_mpwqe_is_linear_skb(struct mlx5_core_dev *mdev,
 					 struct mlx5e_params *params)
 {
-	u32 frag_sz = mlx5e_mpwqe_get_linear_frag_sz(params);
+	u32 frag_sz = mlx5e_rx_get_linear_frag_sz(params);
 	s8 signed_log_num_strides_param;
 	u8 log_num_strides;
 
-	if (params->lro_en || frag_sz > PAGE_SIZE)
+	if (!mlx5e_rx_is_linear_skb(mdev, params))
 		return false;
 
 	if (MLX5_CAP_GEN(mdev, ext_stride_num_range))
@@ -145,7 +154,7 @@ static u8 mlx5e_mpwqe_get_log_stride_size(struct mlx5_core_dev *mdev,
 					  struct mlx5e_params *params)
 {
 	if (mlx5e_rx_mpwqe_is_linear_skb(mdev, params))
-		return order_base_2(mlx5e_mpwqe_get_linear_frag_sz(params));
+		return order_base_2(mlx5e_rx_get_linear_frag_sz(params));
 
 	return MLX5E_MPWQE_STRIDE_SZ(mdev,
 		MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS));
@@ -163,16 +172,15 @@ static u16 mlx5e_get_rq_headroom(struct mlx5_core_dev *mdev,
 {
 	u16 linear_rq_headroom = params->xdp_prog ?
 		XDP_PACKET_HEADROOM : MLX5_RX_HEADROOM;
+	bool is_linear_skb;
 
 	linear_rq_headroom += NET_IP_ALIGN;
 
-	if (params->rq_wq_type == MLX5_WQ_TYPE_CYCLIC)
-		return linear_rq_headroom;
-
-	if (mlx5e_rx_mpwqe_is_linear_skb(mdev, params))
-		return linear_rq_headroom;
+	is_linear_skb = (params->rq_wq_type == MLX5_WQ_TYPE_CYCLIC) ?
+		mlx5e_rx_is_linear_skb(mdev, params) :
+		mlx5e_rx_mpwqe_is_linear_skb(mdev, params);
 
-	return 0;
+	return is_linear_skb ? linear_rq_headroom : 0;
 }
 
 void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev,
@@ -400,6 +408,61 @@ static inline u64 mlx5e_get_mpwqe_offset(struct mlx5e_rq *rq, u16 wqe_ix)
 	return (wqe_ix << MLX5E_LOG_ALIGNED_MPWQE_PPW) << PAGE_SHIFT;
 }
 
+static void mlx5e_init_frags_partition(struct mlx5e_rq *rq)
+{
+	struct mlx5e_wqe_frag_info next_frag, *prev;
+	int i;
+
+	next_frag.di = &rq->wqe.di[0];
+	next_frag.offset = 0;
+	prev = NULL;
+
+	for (i = 0; i < mlx5_wq_cyc_get_size(&rq->wqe.wq); i++) {
+		struct mlx5e_rq_frag_info *frag_info = &rq->wqe.info.arr[0];
+		struct mlx5e_wqe_frag_info *frag =
+			&rq->wqe.frags[i << rq->wqe.info.log_num_frags];
+		int f;
+
+		for (f = 0; f < rq->wqe.info.num_frags; f++, frag++) {
+			if (next_frag.offset + frag_info[f].frag_stride > PAGE_SIZE) {
+				next_frag.di++;
+				next_frag.offset = 0;
+				if (prev)
+					prev->last_in_page = true;
+			}
+			*frag = next_frag;
+
+			/* prepare next */
+			next_frag.offset += frag_info[f].frag_stride;
+			prev = frag;
+		}
+	}
+
+	if (prev)
+		prev->last_in_page = true;
+}
+
+static int mlx5e_init_di_list(struct mlx5e_rq *rq,
+			      struct mlx5e_params *params,
+			      int wq_sz, int cpu)
+{
+	int len = wq_sz << rq->wqe.info.log_num_frags;
+
+	rq->wqe.di = kvzalloc_node(len * sizeof(*rq->wqe.di),
+				   GFP_KERNEL, cpu_to_node(cpu));
+	if (!rq->wqe.di)
+		return -ENOMEM;
+
+	mlx5e_init_frags_partition(rq);
+
+	return 0;
+}
+
+static void mlx5e_free_di_list(struct mlx5e_rq *rq)
+{
+	kvfree(rq->wqe.di);
+}
+
 static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 			  struct mlx5e_params *params,
 			  struct mlx5e_rq_param *rqp,
@@ -409,8 +472,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 	struct mlx5_core_dev *mdev = c->mdev;
 	void *rqc = rqp->rqc;
 	void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
-	u32 byte_count, pool_size;
-	int npages;
+	u32 pool_size;
 	int wq_sz;
 	int err;
 	int i;
@@ -480,8 +542,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 		rq->mpwqe.log_stride_sz = mlx5e_mpwqe_get_log_stride_size(mdev, params);
 		rq->mpwqe.num_strides = BIT(mlx5e_mpwqe_get_log_num_strides(mdev, params));
 
-		byte_count = rq->mpwqe.num_strides << rq->mpwqe.log_stride_sz;
-
 		err = mlx5e_create_rq_umr_mkey(mdev, rq);
 		if (err)
 			goto err_rq_wq_destroy;
@@ -489,7 +549,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
 		err = mlx5e_rq_alloc_mpwqe_info(rq, c);
 		if (err)
-			goto err_destroy_umr_mkey;
+			goto err_free;
 		break;
 	default: /* MLX5_WQ_TYPE_CYCLIC */
 		err = mlx5_wq_cyc_create(mdev, &rqp->wq, rqc_wq, &rq->wqe.wq,
@@ -501,13 +561,17 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
 		wq_sz = mlx5_wq_cyc_get_size(&rq->wqe.wq);
 
-		rq->wqe.frag_info =
-			kzalloc_node(wq_sz * sizeof(*rq->wqe.frag_info),
-				     GFP_KERNEL, cpu_to_node(c->cpu));
-		if (!rq->wqe.frag_info) {
-			err = -ENOMEM;
-			goto err_rq_wq_destroy;
-		}
+		rq->wqe.info = rqp->frags_info;
+		rq->wqe.frags =
+			kvzalloc_node((wq_sz << rq->wqe.info.log_num_frags) *
+				      sizeof(*rq->wqe.frags),
+				      GFP_KERNEL, cpu_to_node(c->cpu));
+		if (!rq->wqe.frags)
+			goto err_free;
+
+		err = mlx5e_init_di_list(rq, params, wq_sz, c->cpu);
+		if (err)
+			goto err_free;
 		rq->post_wqes = mlx5e_post_rx_wqes;
 		rq->dealloc_wqe = mlx5e_dealloc_rx_wqe;
 
@@ -518,30 +582,19 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 #endif
 			rq->handle_rx_cqe = c->priv->profile->rx_handlers.handle_rx_cqe;
 		if (!rq->handle_rx_cqe) {
-			kfree(rq->wqe.frag_info);
 			err = -EINVAL;
 			netdev_err(c->netdev, "RX handler of RQ is not set, err %d\n", err);
-			goto err_rq_wq_destroy;
+			goto err_free;
 		}
 
-		byte_count = MLX5E_SW2HW_MTU(params, params->sw_mtu);
-#ifdef CONFIG_MLX5_EN_IPSEC
-		if (MLX5_IPSEC_DEV(mdev))
-			byte_count += MLX5E_METADATA_ETHER_LEN;
-#endif
-		rq->wqe.page_reuse = !params->xdp_prog;
-
-		/* calc the required page order */
-		rq->wqe.frag_sz = MLX5_SKB_FRAG_SZ(rq->buff.headroom + byte_count);
-		npages = DIV_ROUND_UP(rq->wqe.frag_sz, PAGE_SIZE);
-		rq->buff.page_order = order_base_2(npages);
-
-		byte_count |= MLX5_HW_START_PADDING;
+		rq->wqe.skb_from_cqe = mlx5e_rx_is_linear_skb(mdev, params) ?
+			mlx5e_skb_from_cqe_linear :
+			mlx5e_skb_from_cqe_nonlinear;
 		rq->mkey_be = c->mkey_be;
 	}
 
 	/* Create a page_pool and register it with rxq */
-	pp_params.order     = rq->buff.page_order;
+	pp_params.order     = 0;
 	pp_params.flags     = 0; /* No-internal DMA mapping in page_pool */
 	pp_params.pool_size = pool_size;
 	pp_params.nid       = cpu_to_node(c->cpu);
@@ -555,21 +608,21 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 	 */
 	rq->page_pool = page_pool_create(&pp_params);
 	if (IS_ERR(rq->page_pool)) {
-		if (rq->wq_type != MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ)
-			kfree(rq->wqe.frag_info);
 		err = PTR_ERR(rq->page_pool);
 		rq->page_pool = NULL;
-		goto err_rq_wq_destroy;
+		goto err_free;
 	}
 	err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
 					 MEM_TYPE_PAGE_POOL, rq->page_pool);
 	if (err)
-		goto err_rq_wq_destroy;
+		goto err_free;
 
 	for (i = 0; i < wq_sz; i++) {
 		if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
 			struct mlx5e_rx_wqe_ll *wqe =
 				mlx5_wq_ll_get_wqe(&rq->mpwqe.wq, i);
+			u32 byte_count =
+				rq->mpwqe.num_strides << rq->mpwqe.log_stride_sz;
 			u64 dma_offset = mlx5e_get_mpwqe_offset(rq, i);
 
 			wqe->data[0].addr = cpu_to_be64(dma_offset + rq->buff.headroom);
@@ -578,9 +631,21 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 		} else {
 			struct mlx5e_rx_wqe_cyc *wqe =
 				mlx5_wq_cyc_get_wqe(&rq->wqe.wq, i);
+			int f;
 
-			wqe->data[0].byte_count = cpu_to_be32(byte_count);
-			wqe->data[0].lkey = rq->mkey_be;
+			for (f = 0; f < rq->wqe.info.num_frags; f++) {
+				u32 frag_size = rq->wqe.info.arr[f].frag_size |
+					MLX5_HW_START_PADDING;
+
+				wqe->data[f].byte_count = cpu_to_be32(frag_size);
+				wqe->data[f].lkey = rq->mkey_be;
+			}
+			/* check if num_frags is not a pow of two */
+			if (rq->wqe.info.num_frags < (1 << rq->wqe.info.log_num_frags)) {
+				wqe->data[f].byte_count = 0;
+				wqe->data[f].lkey = cpu_to_be32(MLX5_INVALID_LKEY);
+				wqe->data[f].addr = 0;
+			}
 		}
 	}
 
@@ -600,8 +665,16 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
 	return 0;
 
-err_destroy_umr_mkey:
-	mlx5_core_destroy_mkey(mdev, &rq->umr_mkey);
+err_free:
+	switch (rq->wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		kfree(rq->mpwqe.info);
+		mlx5_core_destroy_mkey(mdev, &rq->umr_mkey);
+		break;
+	default: /* MLX5_WQ_TYPE_CYCLIC */
+		kvfree(rq->wqe.frags);
+		mlx5e_free_di_list(rq);
+	}
 
 err_rq_wq_destroy:
 	if (rq->xdp_prog)
@@ -631,7 +704,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
 		mlx5_core_destroy_mkey(rq->mdev, &rq->umr_mkey);
 		break;
 	default: /* MLX5_WQ_TYPE_CYCLIC */
-		kfree(rq->wqe.frag_info);
+		kvfree(rq->wqe.frags);
+		mlx5e_free_di_list(rq);
 	}
 
 	for (i = rq->page_cache.head; i != rq->page_cache.tail;
@@ -823,17 +897,8 @@ static void mlx5e_free_rx_descs(struct mlx5e_rq *rq)
 			rq->dealloc_wqe(rq, wqe_ix);
 			mlx5_wq_cyc_pop(wq);
 		}
-
-		/* Clean outstanding pages on handled WQEs that decided to do page-reuse,
-		 * but yet to be re-posted.
-		 */
-		if (rq->wqe.page_reuse) {
-			int wq_sz = mlx5_wq_cyc_get_size(wq);
-
-			for (wqe_ix = 0; wqe_ix < wq_sz; wqe_ix++)
-				rq->dealloc_wqe(rq, wqe_ix);
-		}
 	}
+
 }
 
 static int mlx5e_open_rq(struct mlx5e_channel *c,
@@ -1954,6 +2019,61 @@ static void mlx5e_close_channel(struct mlx5e_channel *c)
 	kfree(c);
 }
 
+#define DEFAULT_FRAG_SIZE (2048)
+
+static void mlx5e_build_rq_frags_info(struct mlx5_core_dev *mdev,
+				      struct mlx5e_params *params,
+				      struct mlx5e_rq_frags_info *info)
+{
+	u32 byte_count = MLX5E_SW2HW_MTU(params, params->sw_mtu);
+	int frag_size_max = DEFAULT_FRAG_SIZE;
+	u32 buf_size = 0;
+	int i;
+
+#ifdef CONFIG_MLX5_EN_IPSEC
+	if (MLX5_IPSEC_DEV(mdev))
+		byte_count += MLX5E_METADATA_ETHER_LEN;
+#endif
+
+	if (mlx5e_rx_is_linear_skb(mdev, params)) {
+		int frag_stride;
+
+		frag_stride = mlx5e_rx_get_linear_frag_sz(params);
+		frag_stride = roundup_pow_of_two(frag_stride);
+
+		info->arr[0].frag_size = byte_count;
+		info->arr[0].frag_stride = frag_stride;
+		info->num_frags = 1;
+		info->wqe_bulk = PAGE_SIZE / frag_stride;
+		goto out;
+	}
+
+	if (byte_count > PAGE_SIZE +
+	    (MLX5E_MAX_RX_FRAGS - 1) * frag_size_max)
+		frag_size_max = PAGE_SIZE;
+
+	i = 0;
+	while (buf_size < byte_count) {
+		int frag_size = byte_count - buf_size;
+
+		if (i < MLX5E_MAX_RX_FRAGS - 1)
+			frag_size = min(frag_size, frag_size_max);
+
+		info->arr[i].frag_size = frag_size;
+		info->arr[i].frag_stride = roundup_pow_of_two(frag_size);
+
+		buf_size += frag_size;
+		i++;
+	}
+	info->num_frags = i;
+	/* number of different wqes sharing a page */
+	info->wqe_bulk = 1 + (info->num_frags % 2);
+
+out:
+	info->wqe_bulk = max_t(u8, info->wqe_bulk, 8);
+	info->log_num_frags = order_base_2(info->num_frags);
+}
+
 static inline u8 mlx5e_get_rqwq_log_stride(u8 wq_type, int ndsegs)
 {
 	int sz = sizeof(struct mlx5_wqe_data_seg) * ndsegs;
@@ -1990,6 +2110,8 @@ static void mlx5e_build_rq_param(struct mlx5e_priv *priv,
 		break;
 	default: /* MLX5_WQ_TYPE_CYCLIC */
 		MLX5_SET(wq, wq, log_wq_sz, params->log_rq_mtu_frames);
+		mlx5e_build_rq_frags_info(mdev, params, &param->frags_info);
+		ndsegs = param->frags_info.num_frags;
 	}
 
 	MLX5_SET(wq, wq, wq_type,          params->rq_wq_type);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 3cdf2c097356..d3a1dd20e41d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -164,8 +164,6 @@ static inline u32 mlx5e_decompress_cqes_start(struct mlx5e_rq *rq,
 	return mlx5e_decompress_cqes_cont(rq, cq, 1, budget_rem) - 1;
 }
 
-#define RQ_PAGE_SIZE(rq) ((1 << rq->buff.page_order) << PAGE_SHIFT)
-
 static inline bool mlx5e_page_is_reserved(struct page *page)
 {
 	return page_is_pfmemalloc(page) || page_to_nid(page) != numa_mem_id();
@@ -214,7 +212,7 @@ static inline bool mlx5e_rx_cache_get(struct mlx5e_rq *rq,
 	stats->cache_reuse++;
 
 	dma_sync_single_for_device(rq->pdev, dma_info->addr,
-				   RQ_PAGE_SIZE(rq),
+				   PAGE_SIZE,
 				   DMA_FROM_DEVICE);
 	return true;
 }
@@ -230,7 +228,7 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
 		return -ENOMEM;
 
 	dma_info->addr = dma_map_page(rq->pdev, dma_info->page, 0,
-				      RQ_PAGE_SIZE(rq), rq->buff.map_dir);
+				      PAGE_SIZE, rq->buff.map_dir);
 	if (unlikely(dma_mapping_error(rq->pdev, dma_info->addr))) {
 		put_page(dma_info->page);
 		dma_info->page = NULL;
@@ -243,8 +241,7 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
 static void mlx5e_page_dma_unmap(struct mlx5e_rq *rq,
 					struct mlx5e_dma_info *dma_info)
 {
-	dma_unmap_page(rq->pdev, dma_info->addr, RQ_PAGE_SIZE(rq),
-		       rq->buff.map_dir);
+	dma_unmap_page(rq->pdev, dma_info->addr, PAGE_SIZE, rq->buff.map_dir);
 }
 
 void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
@@ -262,58 +259,96 @@ void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
 	}
 }
 
-static inline bool mlx5e_page_reuse(struct mlx5e_rq *rq,
-				    struct mlx5e_wqe_frag_info *wi)
+static inline int mlx5e_get_rx_frag(struct mlx5e_rq *rq,
+				    struct mlx5e_wqe_frag_info *frag)
+{
+	int err = 0;
+
+	if (!frag->offset)
+		/* On first frag (offset == 0), replenish page (dma_info actually).
+		 * Other frags that point to the same dma_info (with a different
+		 * offset) should just use the new one without replenishing again
+		 * by themselves.
+		 */
+		err = mlx5e_page_alloc_mapped(rq, frag->di);
+
+	return err;
+}
+
+static inline void mlx5e_put_rx_frag(struct mlx5e_rq *rq,
+				     struct mlx5e_wqe_frag_info *frag)
 {
-	return rq->wqe.page_reuse && wi->di.page &&
-		(wi->offset + rq->wqe.frag_sz <= RQ_PAGE_SIZE(rq)) &&
-		!mlx5e_page_is_reserved(wi->di.page);
+	if (frag->last_in_page)
+		mlx5e_page_release(rq, frag->di, true);
 }
 
 static inline struct mlx5e_wqe_frag_info *get_frag(struct mlx5e_rq *rq, u16 ix)
 {
-	return &rq->wqe.frag_info[ix];
+	return &rq->wqe.frags[ix << rq->wqe.info.log_num_frags];
 }
 
-static int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe_cyc *wqe, u16 ix)
+static int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe_cyc *wqe,
+			      u16 ix)
 {
-	struct mlx5e_wqe_frag_info *wi = &rq->wqe.frag_info[ix];
+	struct mlx5e_wqe_frag_info *frag = get_frag(rq, ix);
+	int err;
+	int i;
 
-	/* check if page exists, hence can be reused */
-	if (!wi->di.page) {
-		if (unlikely(mlx5e_page_alloc_mapped(rq, &wi->di)))
-			return -ENOMEM;
-		wi->offset = 0;
+	for (i = 0; i < rq->wqe.info.num_frags; i++, frag++) {
+		err = mlx5e_get_rx_frag(rq, frag);
+		if (unlikely(err))
+			goto free_frags;
+
+		wqe->data[i].addr = cpu_to_be64(frag->di->addr +
+						frag->offset + rq->buff.headroom);
 	}
 
-	wqe->data[0].addr = cpu_to_be64(wi->di.addr + wi->offset + rq->buff.headroom);
 	return 0;
+
+free_frags:
+	while (--i >= 0)
+		mlx5e_put_rx_frag(rq, --frag);
+
+	return err;
 }
 
 static inline void mlx5e_free_rx_wqe(struct mlx5e_rq *rq,
 				     struct mlx5e_wqe_frag_info *wi)
 {
-	mlx5e_page_release(rq, &wi->di, true);
-	wi->di.page = NULL;
+	int i;
+
+	for (i = 0; i < rq->wqe.info.num_frags; i++, wi++)
+		mlx5e_put_rx_frag(rq, wi);
 }
 
-static inline void mlx5e_free_rx_wqe_reuse(struct mlx5e_rq *rq,
-					   struct mlx5e_wqe_frag_info *wi)
+void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix)
 {
-	if (mlx5e_page_reuse(rq, wi)) {
-		rq->stats->page_reuse++;
-		return;
-	}
+	struct mlx5e_wqe_frag_info *wi = get_frag(rq, ix);
 
 	mlx5e_free_rx_wqe(rq, wi);
 }
 
-void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix)
+static int mlx5e_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, u8 wqe_bulk)
 {
-	struct mlx5e_wqe_frag_info *wi = &rq->wqe.frag_info[ix];
+	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
+	int err;
+	int i;
 
-	if (wi->di.page)
-		mlx5e_free_rx_wqe(rq, wi);
+	for (i = 0; i < wqe_bulk; i++) {
+		struct mlx5e_rx_wqe_cyc *wqe = mlx5_wq_cyc_get_wqe(wq, ix + i);
+
+		err = mlx5e_alloc_rx_wqe(rq, wqe, ix + i);
+		if (unlikely(err))
+			goto free_wqes;
+	}
+
+	return 0;
+
+free_wqes:
+	while (--i >= 0)
+		mlx5e_dealloc_rx_wqe(rq, ix + i);
+
+	return err;
 }
 
 static inline void
@@ -476,26 +511,28 @@ void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 {
 	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
+	u8 wqe_bulk;
 	int err;
 
 	if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state)))
 		return false;
 
-	if (mlx5_wq_cyc_is_full(wq))
+	wqe_bulk = rq->wqe.info.wqe_bulk;
+
+	if (mlx5_wq_cyc_missing(wq) < wqe_bulk)
 		return false;
 
 	do {
 		u16 head = mlx5_wq_cyc_get_head(wq);
-		struct mlx5e_rx_wqe_cyc *wqe = mlx5_wq_cyc_get_wqe(wq, head);
 
-		err = mlx5e_alloc_rx_wqe(rq, wqe, head);
+		err = mlx5e_alloc_rx_wqes(rq, head, wqe_bulk);
 		if (unlikely(err)) {
 			rq->stats->buff_alloc_err++;
 			break;
 		}
 
-		mlx5_wq_cyc_push(wq);
-	} while (!mlx5_wq_cyc_is_full(wq));
+		mlx5_wq_cyc_push_n(wq, wqe_bulk);
+	} while (mlx5_wq_cyc_missing(wq) >= wqe_bulk);
 
 	/* ensure wqes are visible to device before updating doorbell record */
 	dma_wmb();
@@ -949,11 +986,11 @@ struct sk_buff *mlx5e_build_linear_skb(struct mlx5e_rq *rq, void *va,
 	return skb;
 }
 
-static inline
-struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
-			     struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt)
+struct sk_buff *
+mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+			  struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt)
 {
-	struct mlx5e_dma_info *di = &wi->di;
+	struct mlx5e_dma_info *di = wi->di;
 	u16 rx_headroom = rq->buff.headroom;
 	struct sk_buff *skb;
 	void *va, *data;
@@ -968,7 +1005,6 @@ struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
 				      frag_size, DMA_FROM_DEVICE);
 	prefetchw(va); /* xdp_frame data area */
 	prefetch(data);
-	wi->offset += frag_size;
 
 	if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) {
 		rq->stats->wqe_err++;
@@ -991,6 +1027,56 @@ struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
 	return skb;
 }
 
+struct sk_buff *
+mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+			     struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt)
+{
+	struct mlx5e_rq_frag_info *frag_info = &rq->wqe.info.arr[0];
+	struct mlx5e_wqe_frag_info *head_wi = wi;
+	u16 headlen      = min_t(u32, MLX5E_RX_MAX_HEAD, cqe_bcnt);
+	u16 frag_headlen = headlen;
+	u16 byte_cnt     = cqe_bcnt - headlen;
+	struct sk_buff *skb;
+
+	if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) {
+		rq->stats->wqe_err++;
+		return NULL;
+	}
+
+	/* XDP is not supported in this configuration, as incoming packets
+	 * might spread among multiple pages.
+	 */
+	skb = napi_alloc_skb(rq->cq.napi,
+			     ALIGN(MLX5E_RX_MAX_HEAD, sizeof(long)));
+	if (unlikely(!skb)) {
+		rq->stats->buff_alloc_err++;
+		return NULL;
+	}
+
+	prefetchw(skb->data);
+
+	while (byte_cnt) {
+		u16 frag_consumed_bytes =
+			min_t(u16, frag_info->frag_size - frag_headlen, byte_cnt);
+
+		mlx5e_add_skb_frag(rq, skb, wi->di, wi->offset + frag_headlen,
+				   frag_consumed_bytes, frag_info->frag_stride);
+		byte_cnt -= frag_consumed_bytes;
+		frag_headlen = 0;
+		frag_info++;
+		wi++;
+	}
+
+	/* copy header */
+	mlx5e_copy_skb_header(rq->pdev, skb, head_wi->di, head_wi->offset,
+			      0, headlen);
+	/* skb linear part was allocated with headlen and aligned to long */
+	skb->tail += headlen;
+	skb->len  += headlen;
+
+	return skb;
+}
+
 void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
 	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
@@ -1003,23 +1089,23 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	wi       = get_frag(rq, ci);
 	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
-	skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
+	skb = rq->wqe.skb_from_cqe(rq, cqe, wi, cqe_bcnt);
 	if (!skb) {
 		/* probably for XDP */
 		if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) {
-			wi->di.page = NULL;
-			/* do not return page to cache, it will be returned on XDP_TX completion */
+			/* do not return page to cache,
+			 * it will be returned on XDP_TX completion.
+			 */
 			goto wq_cyc_pop;
 		}
-		/* probably an XDP_DROP, save the page-reuse checks */
-		mlx5e_free_rx_wqe(rq, wi);
-		goto wq_cyc_pop;
+		goto free_wqe;
 	}
 
 	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
 	napi_gro_receive(rq->cq.napi, skb);
 
-	mlx5e_free_rx_wqe_reuse(rq, wi);
+free_wqe:
+	mlx5e_free_rx_wqe(rq, wi);
 wq_cyc_pop:
 	mlx5_wq_cyc_pop(wq);
 }
@@ -1041,16 +1127,16 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	wi       = get_frag(rq, ci);
 	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
-	skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
+	skb = rq->wqe.skb_from_cqe(rq, cqe, wi, cqe_bcnt);
 	if (!skb) {
+		/* probably for XDP */
 		if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) {
-			wi->di.page = NULL;
-			/* do not return page to cache, it will be returned on XDP_TX completion */
+			/* do not return page to cache,
+			 * it will be returned on XDP_TX completion.
+			 */
 			goto wq_cyc_pop;
 		}
-		/* probably an XDP_DROP, save the page-reuse checks */
-		mlx5e_free_rx_wqe(rq, wi);
-		goto wq_cyc_pop;
+		goto free_wqe;
 	}
 
 	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
@@ -1060,7 +1146,8 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 	napi_gro_receive(rq->cq.napi, skb);
 
-	mlx5e_free_rx_wqe_reuse(rq, wi);
+free_wqe:
+	mlx5e_free_rx_wqe(rq, wi);
 wq_cyc_pop:
 	mlx5_wq_cyc_pop(wq);
 }
@@ -1409,7 +1496,7 @@ void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	wi       = get_frag(rq, ci);
 	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
-	skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
+	skb = rq->wqe.skb_from_cqe(rq, cqe, wi, cqe_bcnt);
 	if (!skb)
 		goto wq_free_wqe;
 
@@ -1421,7 +1508,7 @@ void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	napi_gro_receive(rq->cq.napi, skb);
 
 wq_free_wqe:
-	mlx5e_free_rx_wqe_reuse(rq, wi);
+	mlx5e_free_rx_wqe(rq, wi);
 	mlx5_wq_cyc_pop(wq);
 }
 
@@ -1441,7 +1528,7 @@ void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	wi       = get_frag(rq, ci);
 	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
-	skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
+	skb = rq->wqe.skb_from_cqe(rq, cqe, wi, cqe_bcnt);
 	if (unlikely(!skb)) {
 		/* a DROP, save the page-reuse checks */
 		mlx5e_free_rx_wqe(rq, wi);
@@ -1456,7 +1543,7 @@ void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
 	napi_gro_receive(rq->cq.napi, skb);
 
-	mlx5e_free_rx_wqe_reuse(rq, wi);
+	mlx5e_free_rx_wqe(rq, wi);
 wq_cyc_pop:
 	mlx5_wq_cyc_pop(wq);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 5ffd81943d7a57423f204cd5844bf430b5634472 Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Tue, 20 Feb 2018 15:17:54 +0200
Subject: net/mlx5e: RX, Always prefer Linear SKB configuration

Prefer the linear SKB configuration of Legacy RQ over the
non-linear one of Striding RQ.

This implies that ConnectX-4 LX now uses legacy RQ by default,
as it does not support the linear configuration of Striding RQ.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 2c634e50d051..333d4ed52b94 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -4405,9 +4405,16 @@ void mlx5e_build_nic_params(struct mlx5_core_dev *mdev,
 	MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS, params->rx_cqe_compress_def);
 
 	/* RQ */
-	if (mlx5e_striding_rq_possible(mdev, params))
-		MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ,
-				!slow_pci_heuristic(mdev));
+	/* Prefer Striding RQ, unless any of the following holds:
+	 * - Striding RQ configuration is not possible/supported.
+	 * - Slow PCI heuristic.
+	 * - Legacy RQ would use linear SKB while Striding RQ would use non-linear.
+	 */
+	if (!slow_pci_heuristic(mdev) &&
+	    mlx5e_striding_rq_possible(mdev, params) &&
+	    (mlx5e_rx_mpwqe_is_linear_skb(mdev, params) ||
+	     !mlx5e_rx_is_linear_skb(mdev, params)))
+		MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ, true);
 	mlx5e_set_rq_type(mdev, params);
 	mlx5e_init_rq_type_params(mdev, params);
 
-- 
cgit v1.2.3-59-g8ed1b


From f65a59ffbcc26135e225058b2e6cd49ab9f9f13f Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Wed, 18 Apr 2018 13:29:11 +0300
Subject: net/mlx5e: TX, Separate cachelines of xmit and completion stats

Avoid false sharing of cachelines by separating the cachelines of
TX stats that are dertied in xmit flow and in completion flow.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 8 ++++----
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h | 9 +++++----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
index 697dc7397ba2..1646859974ce 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
@@ -64,11 +64,11 @@ static const struct counter_desc sw_stats_desc[] = {
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_partial) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_partial_inner) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_stopped) },
-	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_wake) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_dropped) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xmit_more) },
-	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_cqe_err) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_recover) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_wake) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_cqe_err) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_wqe_err) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_mpwqe_filler) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_buff_alloc_err) },
@@ -1137,11 +1137,11 @@ static const struct counter_desc sq_stats_desc[] = {
 	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, nop) },
 	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, csum_none) },
 	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, stopped) },
-	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, wake) },
 	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, dropped) },
 	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, xmit_more) },
-	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, cqe_err) },
 	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, recover) },
+	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, wake) },
+	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, cqe_err) },
 };
 
 static const struct counter_desc ch_stats_desc[] = {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
index 390c7afa5188..643153bb3607 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
@@ -75,11 +75,11 @@ struct mlx5e_sw_stats {
 	u64 tx_csum_partial;
 	u64 tx_csum_partial_inner;
 	u64 tx_queue_stopped;
-	u64 tx_queue_wake;
 	u64 tx_queue_dropped;
 	u64 tx_xmit_more;
-	u64 tx_cqe_err;
 	u64 tx_recover;
+	u64 tx_queue_wake;
+	u64 tx_cqe_err;
 	u64 rx_wqe_err;
 	u64 rx_mpwqe_filler;
 	u64 rx_buff_alloc_err;
@@ -203,10 +203,11 @@ struct mlx5e_sq_stats {
 	/* less likely accessed in data path */
 	u64 csum_none;
 	u64 stopped;
-	u64 wake;
 	u64 dropped;
-	u64 cqe_err;
 	u64 recover;
+	/* dirtied @completion */
+	u64 wake ____cacheline_aligned_in_smp;
+	u64 cqe_err;
 };
 
 struct mlx5e_ch_stats {
-- 
cgit v1.2.3-59-g8ed1b


From 16edddfe3c5d192f097c27ff60cae05daa29f782 Mon Sep 17 00:00:00 2001
From: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Date: Thu, 31 May 2018 13:42:36 +0900
Subject: selftests/bpf: test_sockmap, check test failure

Test failures are not identified because exit code of RX/TX threads
is not checked. Also threads are not returning correct exit code.

- Return exit code from threads depending on test execution status
- In main thread, check the exit code of RX/TX threads
- Skip error checking for corked tests as they are expected to timeout

Fixes: 16962b2404ac ("bpf: sockmap, add selftests")
Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_sockmap.c | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index eb17fae458e6..7b2008a144cb 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -429,8 +429,8 @@ static int sendmsg_test(struct sockmap_options *opt)
 	struct msg_stats s = {0};
 	int iov_count = opt->iov_count;
 	int iov_buf = opt->iov_length;
+	int rx_status, tx_status;
 	int cnt = opt->rate;
-	int status;
 
 	errno = 0;
 
@@ -442,7 +442,7 @@ static int sendmsg_test(struct sockmap_options *opt)
 	rxpid = fork();
 	if (rxpid == 0) {
 		if (opt->drop_expected)
-			exit(1);
+			exit(0);
 
 		if (opt->sendpage)
 			iov_count = 1;
@@ -463,7 +463,9 @@ static int sendmsg_test(struct sockmap_options *opt)
 				"rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s\n",
 				s.bytes_sent, sent_Bps, sent_Bps/giga,
 				s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
-		exit(1);
+		if (err && txmsg_cork)
+			err = 0;
+		exit(err ? 1 : 0);
 	} else if (rxpid == -1) {
 		perror("msg_loop_rx: ");
 		return errno;
@@ -491,14 +493,27 @@ static int sendmsg_test(struct sockmap_options *opt)
 				"tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB %fB/s %fGB/s\n",
 				s.bytes_sent, sent_Bps, sent_Bps/giga,
 				s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
-		exit(1);
+		exit(err ? 1 : 0);
 	} else if (txpid == -1) {
 		perror("msg_loop_tx: ");
 		return errno;
 	}
 
-	assert(waitpid(rxpid, &status, 0) == rxpid);
-	assert(waitpid(txpid, &status, 0) == txpid);
+	assert(waitpid(rxpid, &rx_status, 0) == rxpid);
+	assert(waitpid(txpid, &tx_status, 0) == txpid);
+	if (WIFEXITED(rx_status)) {
+		err = WEXITSTATUS(rx_status);
+		if (err) {
+			fprintf(stderr, "rx thread exited with err %d. ", err);
+			goto out;
+		}
+	}
+	if (WIFEXITED(tx_status)) {
+		err = WEXITSTATUS(tx_status);
+		if (err)
+			fprintf(stderr, "tx thread exited with err %d. ", err);
+	}
+out:
 	return err;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 035b37ff2c6e3577b8ad7d5c3cc4619ab74559bb Mon Sep 17 00:00:00 2001
From: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Date: Thu, 31 May 2018 13:42:37 +0900
Subject: selftests/bpf: test_sockmap, join cgroup in selftest mode

In case of selftest mode, temporary cgroup environment is created but
cgroup is not joined. It causes test failures. Fixed by joining the
cgroup

Fixes: 16962b2404ac ("bpf: sockmap, add selftests")
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_sockmap.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 7b2008a144cb..7f9ca79aadbc 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -1344,6 +1344,11 @@ static int __test_suite(char *bpf_file)
 		return cg_fd;
 	}
 
+	if (join_cgroup(CG_PATH)) {
+		fprintf(stderr, "ERROR: failed to join cgroup\n");
+		return -EINVAL;
+	}
+
 	/* Tests basic commands and APIs with range of iov values */
 	txmsg_start = txmsg_end = 0;
 	err = test_txmsg(cg_fd);
-- 
cgit v1.2.3-59-g8ed1b


From a009f1f396d04a700971586ce8a142ea606cbb61 Mon Sep 17 00:00:00 2001
From: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Date: Thu, 31 May 2018 13:42:38 +0900
Subject: selftests/bpf: test_sockmap, timing improvements

Currently 10us delay is too low for many tests to succeed. It needs to
be increased. Also, many corked tests are expected to hit rx timeout
irrespective of timeout value.

- This patch sets 1000usec timeout value for corked tests because less
than that causes broken-pipe error in tx thread. Also sets 1 second
timeout for all other tests because less than that results in RX
timeout
- tests with apply=1 and higher number of iterations were taking lot
of time. This patch reduces test run time by reducing iterations.

real    0m12.968s
user    0m0.219s
sys     0m14.337s

Fixes: a18fda1a62c3 ("bpf: reduce runtime of test_sockmap tests")
Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_sockmap.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 7f9ca79aadbc..5cd0550af595 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -345,8 +345,13 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 		if (err < 0)
 			perror("recv start time: ");
 		while (s->bytes_recvd < total_bytes) {
-			timeout.tv_sec = 0;
-			timeout.tv_usec = 10;
+			if (txmsg_cork) {
+				timeout.tv_sec = 0;
+				timeout.tv_usec = 1000;
+			} else {
+				timeout.tv_sec = 1;
+				timeout.tv_usec = 0;
+			}
 
 			/* FD sets */
 			FD_ZERO(&w);
@@ -1025,14 +1030,14 @@ static int test_send(struct sockmap_options *opt, int cgrp)
 
 	opt->iov_length = 1;
 	opt->iov_count = 1;
-	opt->rate = 1024;
+	opt->rate = 512;
 	err = test_exec(cgrp, opt);
 	if (err)
 		goto out;
 
 	opt->iov_length = 256;
 	opt->iov_count = 1024;
-	opt->rate = 10;
+	opt->rate = 2;
 	err = test_exec(cgrp, opt);
 	if (err)
 		goto out;
-- 
cgit v1.2.3-59-g8ed1b


From d825e12f084650649b61209d0cd514c0e57bb0de Mon Sep 17 00:00:00 2001
From: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Date: Thu, 31 May 2018 13:42:39 +0900
Subject: selftests/bpf: test_sockmap, fix data verification

When data verification is enabled, some tests fail because verification is done
incorrectly. Following changes fix it.

- Identify the size of data block to be verified
- Reset verification counter when data block size is reached
- Fixed the value printed in case of verfication failure

Fixes: 16962b2404ac ("bpf: sockmap, add selftests")
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_sockmap.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 5cd0550af595..2bc70e1ab2eb 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -337,8 +337,15 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 		int fd_flags = O_NONBLOCK;
 		struct timeval timeout;
 		float total_bytes;
+		int bytes_cnt = 0;
+		int chunk_sz;
 		fd_set w;
 
+		if (opt->sendpage)
+			chunk_sz = iov_length * cnt;
+		else
+			chunk_sz = iov_length * iov_count;
+
 		fcntl(fd, fd_flags);
 		total_bytes = (float)iov_count * (float)iov_length * (float)cnt;
 		err = clock_gettime(CLOCK_MONOTONIC, &s->start);
@@ -393,9 +400,14 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 							errno = -EIO;
 							fprintf(stderr,
 								"detected data corruption @iov[%i]:%i %02x != %02x, %02x ?= %02x\n",
-								i, j, d[j], k - 1, d[j+1], k + 1);
+								i, j, d[j], k - 1, d[j+1], k);
 							goto out_errno;
 						}
+						bytes_cnt++;
+						if (bytes_cnt == chunk_sz) {
+							k = 0;
+							bytes_cnt = 0;
+						}
 						recv--;
 					}
 				}
-- 
cgit v1.2.3-59-g8ed1b


From 73563aa3d9773bc70a2f171643cce18617a0cc6f Mon Sep 17 00:00:00 2001
From: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Date: Thu, 31 May 2018 13:42:40 +0900
Subject: selftests/bpf: test_sockmap, print additional test options

Print values of test options like apply, cork, start, end so that
individual failed tests can be identified for manual run

Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_sockmap.c | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 2bc70e1ab2eb..05c8cb71724a 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -876,6 +876,8 @@ static char *test_to_str(int test)
 #define OPTSTRING 60
 static void test_options(char *options)
 {
+	char tstr[OPTSTRING];
+
 	memset(options, 0, OPTSTRING);
 
 	if (txmsg_pass)
@@ -888,14 +890,22 @@ static void test_options(char *options)
 		strncat(options, "redir_noisy,", OPTSTRING);
 	if (txmsg_drop)
 		strncat(options, "drop,", OPTSTRING);
-	if (txmsg_apply)
-		strncat(options, "apply,", OPTSTRING);
-	if (txmsg_cork)
-		strncat(options, "cork,", OPTSTRING);
-	if (txmsg_start)
-		strncat(options, "start,", OPTSTRING);
-	if (txmsg_end)
-		strncat(options, "end,", OPTSTRING);
+	if (txmsg_apply) {
+		snprintf(tstr, OPTSTRING, "apply %d,", txmsg_apply);
+		strncat(options, tstr, OPTSTRING);
+	}
+	if (txmsg_cork) {
+		snprintf(tstr, OPTSTRING, "cork %d,", txmsg_cork);
+		strncat(options, tstr, OPTSTRING);
+	}
+	if (txmsg_start) {
+		snprintf(tstr, OPTSTRING, "start %d,", txmsg_start);
+		strncat(options, tstr, OPTSTRING);
+	}
+	if (txmsg_end) {
+		snprintf(tstr, OPTSTRING, "end %d,", txmsg_end);
+		strncat(options, tstr, OPTSTRING);
+	}
 	if (txmsg_ingress)
 		strncat(options, "ingress,", OPTSTRING);
 	if (txmsg_skb)
@@ -904,7 +914,7 @@ static void test_options(char *options)
 
 static int __test_exec(int cgrp, int test, struct sockmap_options *opt)
 {
-	char *options = calloc(60, sizeof(char));
+	char *options = calloc(OPTSTRING, sizeof(char));
 	int err;
 
 	if (test == SENDPAGE)
-- 
cgit v1.2.3-59-g8ed1b


From b9308ae696b2c35e862636eec631d95ff958c33d Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Sat, 2 Jun 2018 09:06:50 -0700
Subject: bpf: btf: Check array t->size

This patch ensures array's t->size is 0.

The array size is decided by its individual elem's size and the
number of elements.  Hence, t->size is not used and
it must be 0.

A test case is added to test_btf.c

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/btf.c                       |  5 +++++
 tools/testing/selftests/bpf/test_btf.c | 23 +++++++++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 3d20aa1f4b54..84ad532f2854 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1342,6 +1342,11 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env,
 		return -EINVAL;
 	}
 
+	if (t->size) {
+		btf_verifier_log_type(env, t, "size != 0");
+		return -EINVAL;
+	}
+
 	/* Array elem type and index type cannot be in type void,
 	 * so !array->type and !array->index_type are not allowed.
 	 */
diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c
index 35064df688c1..fd8246e84149 100644
--- a/tools/testing/selftests/bpf/test_btf.c
+++ b/tools/testing/selftests/bpf/test_btf.c
@@ -1178,6 +1178,29 @@ static struct btf_raw_test raw_tests[] = {
 	.err_str = "Invalid index",
 },
 
+{
+	.descr = "array test. t->size != 0\"",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* int[16] */				/* [2] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_ARRAY, 0, 0), 1),
+		BTF_ARRAY_ENC(1, 1, 16),
+		BTF_END_RAW,
+	},
+	.str_sec = "",
+	.str_sec_size = sizeof(""),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "array_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_type_id = 1,
+	.value_type_id = 1,
+	.max_entries = 4,
+	.btf_load_err = true,
+	.err_str = "size != 0",
+},
+
 {
 	.descr = "int test. invalid int_data",
 	.raw_types = {
-- 
cgit v1.2.3-59-g8ed1b


From 8175383f2320dbc1b4e803d857ed499ed3e76199 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Sat, 2 Jun 2018 09:06:51 -0700
Subject: bpf: btf: Ensure t->type == 0 for BTF_KIND_FWD

The t->type in BTF_KIND_FWD is not used.  It must be 0.
This patch ensures that and also adds a test case in test_btf.c

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/btf.c                       | 21 ++++++++++++++++++++-
 tools/testing/selftests/bpf/test_btf.c | 22 ++++++++++++++++++++++
 2 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 84ad532f2854..8653ab004c73 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1286,8 +1286,27 @@ static struct btf_kind_operations ptr_ops = {
 	.seq_show = btf_ptr_seq_show,
 };
 
+static s32 btf_fwd_check_meta(struct btf_verifier_env *env,
+			      const struct btf_type *t,
+			      u32 meta_left)
+{
+	if (btf_type_vlen(t)) {
+		btf_verifier_log_type(env, t, "vlen != 0");
+		return -EINVAL;
+	}
+
+	if (t->type) {
+		btf_verifier_log_type(env, t, "type != 0");
+		return -EINVAL;
+	}
+
+	btf_verifier_log_type(env, t, NULL);
+
+	return 0;
+}
+
 static struct btf_kind_operations fwd_ops = {
-	.check_meta = btf_ref_type_check_meta,
+	.check_meta = btf_fwd_check_meta,
 	.resolve = btf_df_resolve,
 	.check_member = btf_df_check_member,
 	.log_details = btf_ref_type_log,
diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c
index fd8246e84149..3619f3023088 100644
--- a/tools/testing/selftests/bpf/test_btf.c
+++ b/tools/testing/selftests/bpf/test_btf.c
@@ -1242,6 +1242,28 @@ static struct btf_raw_test raw_tests[] = {
 	.err_str = "Invalid btf_info",
 },
 
+{
+	.descr = "fwd test. t->type != 0\"",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* fwd type */				/* [2] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FWD, 0, 0), 1),
+		BTF_END_RAW,
+	},
+	.str_sec = "",
+	.str_sec_size = sizeof(""),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "fwd_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_type_id = 1,
+	.value_type_id = 1,
+	.max_entries = 4,
+	.btf_load_err = true,
+	.err_str = "type != 0",
+},
+
 }; /* struct btf_raw_test raw_tests[] */
 
 static const char *get_next_str(const char *start, const char *end)
-- 
cgit v1.2.3-59-g8ed1b


From 8d6e555773690e6fdefd99723fcd0a7e432c0c90 Mon Sep 17 00:00:00 2001
From: Máté Eckl <ecklm94@gmail.com>
Date: Fri, 1 Jun 2018 14:54:07 +0200
Subject: netfilter: Decrease code duplication regarding transparent socket
 option
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is a function in include/net/netfilter/nf_socket.h to decide if a
socket has IP(V6)_TRANSPARENT socket option set or not. However this
does the same as inet_sk_transparent() in include/net/tcp.h

include/net/tcp.h:1733
/* This helper checks if socket has IP_TRANSPARENT set */
static inline bool inet_sk_transparent(const struct sock *sk)
{
	switch (sk->sk_state) {
	case TCP_TIME_WAIT:
		return inet_twsk(sk)->tw_transparent;
	case TCP_NEW_SYN_RECV:
		return inet_rsk(inet_reqsk(sk))->no_srccheck;
	}
	return inet_sk(sk)->transparent;
}

tproxy_sk_is_transparent has also been refactored to use this function
instead of reimplementing it.

Signed-off-by: Máté Eckl <ecklm94@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_socket.h | 13 -------------
 net/netfilter/nft_socket.c        |  3 ++-
 net/netfilter/xt_TPROXY.c         | 15 ++-------------
 net/netfilter/xt_socket.c         |  4 ++--
 4 files changed, 6 insertions(+), 29 deletions(-)

diff --git a/include/net/netfilter/nf_socket.h b/include/net/netfilter/nf_socket.h
index 29b6313f0557..f9d7bee9bd4e 100644
--- a/include/net/netfilter/nf_socket.h
+++ b/include/net/netfilter/nf_socket.h
@@ -3,19 +3,6 @@
 #define _NF_SOCK_H_
 
 #include <net/sock.h>
-#include <net/inet_timewait_sock.h>
-
-static inline bool nf_sk_is_transparent(struct sock *sk)
-{
-	switch (sk->sk_state) {
-	case TCP_TIME_WAIT:
-		return inet_twsk(sk)->tw_transparent;
-	case TCP_NEW_SYN_RECV:
-		return inet_rsk(inet_reqsk(sk))->no_srccheck;
-	default:
-		return inet_sk(sk)->transparent;
-	}
-}
 
 struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb,
 				  const struct net_device *indev);
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index d86337068ecb..f28a0b944087 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -5,6 +5,7 @@
 #include <net/netfilter/nf_tables_core.h>
 #include <net/netfilter/nf_socket.h>
 #include <net/inet_sock.h>
+#include <net/tcp.h>
 
 struct nft_socket {
 	enum nft_socket_keys		key:8;
@@ -48,7 +49,7 @@ static void nft_socket_eval(const struct nft_expr *expr,
 
 	switch(priv->key) {
 	case NFT_SOCKET_TRANSPARENT:
-		nft_reg_store8(dest, nf_sk_is_transparent(sk));
+		nft_reg_store8(dest, inet_sk_transparent(sk));
 		break;
 	default:
 		WARN_ON(1);
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 8c89323c06af..74df7977c17c 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -42,19 +42,8 @@ enum nf_tproxy_lookup_t {
 
 static bool tproxy_sk_is_transparent(struct sock *sk)
 {
-	switch (sk->sk_state) {
-	case TCP_TIME_WAIT:
-		if (inet_twsk(sk)->tw_transparent)
-			return true;
-		break;
-	case TCP_NEW_SYN_RECV:
-		if (inet_rsk(inet_reqsk(sk))->no_srccheck)
-			return true;
-		break;
-	default:
-		if (inet_sk(sk)->transparent)
-			return true;
-	}
+	if (inet_sk_transparent(sk))
+		return true;
 
 	sock_gen_put(sk);
 	return false;
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 2ac7f674d19b..5c0779c4fa3c 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -73,7 +73,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
 		 * if XT_SOCKET_TRANSPARENT is used
 		 */
 		if (info->flags & XT_SOCKET_TRANSPARENT)
-			transparent = nf_sk_is_transparent(sk);
+			transparent = inet_sk_transparent(sk);
 
 		if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
 		    transparent && sk_fullsock(sk))
@@ -130,7 +130,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
 		 * if XT_SOCKET_TRANSPARENT is used
 		 */
 		if (info->flags & XT_SOCKET_TRANSPARENT)
-			transparent = nf_sk_is_transparent(sk);
+			transparent = inet_sk_transparent(sk);
 
 		if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
 		    transparent && sk_fullsock(sk))
-- 
cgit v1.2.3-59-g8ed1b


From 45ca4e0cf2734f8cc14b43e47c23618215abf1b8 Mon Sep 17 00:00:00 2001
From: Máté Eckl <ecklm94@gmail.com>
Date: Fri, 1 Jun 2018 20:44:56 +0200
Subject: netfilter: Libify xt_TPROXY
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The extracted functions will likely be usefull to implement tproxy
support in nf_tables.

Extrancted functions:
	- nf_tproxy_sk_is_transparent
	- nf_tproxy_laddr4
	- nf_tproxy_handle_time_wait4
	- nf_tproxy_get_sock_v4
	- nf_tproxy_laddr6
	- nf_tproxy_handle_time_wait6
	- nf_tproxy_get_sock_v6

(nf_)tproxy_handle_time_wait6 also needed some refactor as its current
implementation was xtables-specific.

Signed-off-by: Máté Eckl <ecklm94@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tproxy.h   | 113 ++++++++++++
 net/ipv4/netfilter/Kconfig          |   5 +-
 net/ipv4/netfilter/Makefile         |   1 +
 net/ipv4/netfilter/nf_tproxy_ipv4.c | 147 +++++++++++++++
 net/ipv6/netfilter/Kconfig          |   5 +-
 net/ipv6/netfilter/Makefile         |   1 +
 net/ipv6/netfilter/nf_tproxy_ipv6.c | 146 +++++++++++++++
 net/netfilter/Kconfig               |   2 +
 net/netfilter/xt_TPROXY.c           | 355 ++----------------------------------
 9 files changed, 436 insertions(+), 339 deletions(-)
 create mode 100644 include/net/netfilter/nf_tproxy.h
 create mode 100644 net/ipv4/netfilter/nf_tproxy_ipv4.c
 create mode 100644 net/ipv6/netfilter/nf_tproxy_ipv6.c

diff --git a/include/net/netfilter/nf_tproxy.h b/include/net/netfilter/nf_tproxy.h
new file mode 100644
index 000000000000..9754a50ecde9
--- /dev/null
+++ b/include/net/netfilter/nf_tproxy.h
@@ -0,0 +1,113 @@
+#ifndef _NF_TPROXY_H_
+#define _NF_TPROXY_H_
+
+#include <net/tcp.h>
+
+enum nf_tproxy_lookup_t {
+	 NF_TPROXY_LOOKUP_LISTENER,
+	 NF_TPROXY_LOOKUP_ESTABLISHED,
+};
+
+static inline bool nf_tproxy_sk_is_transparent(struct sock *sk)
+{
+	if (inet_sk_transparent(sk))
+		return true;
+
+	sock_gen_put(sk);
+	return false;
+}
+
+__be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr);
+
+/**
+ * nf_tproxy_handle_time_wait4 - handle IPv4 TCP TIME_WAIT reopen redirections
+ * @skb:	The skb being processed.
+ * @laddr:	IPv4 address to redirect to or zero.
+ * @lport:	TCP port to redirect to or zero.
+ * @sk:		The TIME_WAIT TCP socket found by the lookup.
+ *
+ * We have to handle SYN packets arriving to TIME_WAIT sockets
+ * differently: instead of reopening the connection we should rather
+ * redirect the new connection to the proxy if there's a listener
+ * socket present.
+ *
+ * nf_tproxy_handle_time_wait4() consumes the socket reference passed in.
+ *
+ * Returns the listener socket if there's one, the TIME_WAIT socket if
+ * no such listener is found, or NULL if the TCP header is incomplete.
+ */
+struct sock *
+nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
+			    __be32 laddr, __be16 lport, struct sock *sk);
+
+/*
+ * This is used when the user wants to intercept a connection matching
+ * an explicit iptables rule. In this case the sockets are assumed
+ * matching in preference order:
+ *
+ *   - match: if there's a fully established connection matching the
+ *     _packet_ tuple, it is returned, assuming the redirection
+ *     already took place and we process a packet belonging to an
+ *     established connection
+ *
+ *   - match: if there's a listening socket matching the redirection
+ *     (e.g. on-port & on-ip of the connection), it is returned,
+ *     regardless if it was bound to 0.0.0.0 or an explicit
+ *     address. The reasoning is that if there's an explicit rule, it
+ *     does not really matter if the listener is bound to an interface
+ *     or to 0. The user already stated that he wants redirection
+ *     (since he added the rule).
+ *
+ * Please note that there's an overlap between what a TPROXY target
+ * and a socket match will match. Normally if you have both rules the
+ * "socket" match will be the first one, effectively all packets
+ * belonging to established connections going through that one.
+ */
+struct sock *
+nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
+		      const u8 protocol,
+		      const __be32 saddr, const __be32 daddr,
+		      const __be16 sport, const __be16 dport,
+		      const struct net_device *in,
+		      const enum nf_tproxy_lookup_t lookup_type);
+
+const struct in6_addr *
+nf_tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
+		 const struct in6_addr *daddr);
+
+/**
+ * nf_tproxy_handle_time_wait6 - handle IPv6 TCP TIME_WAIT reopen redirections
+ * @skb:	The skb being processed.
+ * @tproto:	Transport protocol.
+ * @thoff:	Transport protocol header offset.
+ * @net:	Network namespace.
+ * @laddr:	IPv6 address to redirect to.
+ * @lport:	TCP port to redirect to or zero.
+ * @sk:		The TIME_WAIT TCP socket found by the lookup.
+ *
+ * We have to handle SYN packets arriving to TIME_WAIT sockets
+ * differently: instead of reopening the connection we should rather
+ * redirect the new connection to the proxy if there's a listener
+ * socket present.
+ *
+ * nf_tproxy_handle_time_wait6() consumes the socket reference passed in.
+ *
+ * Returns the listener socket if there's one, the TIME_WAIT socket if
+ * no such listener is found, or NULL if the TCP header is incomplete.
+ */
+struct sock *
+nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
+			    struct net *net,
+			    const struct in6_addr *laddr,
+			    const __be16 lport,
+			    struct sock *sk);
+
+struct sock *
+nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
+		      const u8 protocol,
+		      const struct in6_addr *saddr, const struct in6_addr *daddr,
+		      const __be16 sport, const __be16 dport,
+		      const struct net_device *in,
+		      const enum nf_tproxy_lookup_t lookup_type);
+
+#endif /* _NF_TPROXY_H_ */
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index d03bc5a01a70..bbfc356cb1b5 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -29,7 +29,10 @@ config NF_SOCKET_IPV4
 	tristate "IPv4 socket lookup support"
 	help
 	  This option enables the IPv4 socket lookup infrastructure. This is
-	  is required by the iptables socket match.
+	  is required by the {ip,nf}tables socket match.
+
+config NF_TPROXY_IPV4
+	tristate "IPv4 tproxy support"
 
 if NF_TABLES
 
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index c4b05b174091..8394c17c269f 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
 obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
 
 obj-$(CONFIG_NF_SOCKET_IPV4) += nf_socket_ipv4.o
+obj-$(CONFIG_NF_TPROXY_IPV4) += nf_tproxy_ipv4.o
 
 # logging
 obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o
diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c
new file mode 100644
index 000000000000..805e83ec3ad9
--- /dev/null
+++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (C) 2007-2008 BalaBit IT Ltd.
+ * Author: Krisztian Kovacs
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <net/netfilter/nf_tproxy.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/inet_sock.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <linux/inetdevice.h>
+
+struct sock *
+nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
+			 __be32 laddr, __be16 lport, struct sock *sk)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct tcphdr _hdr, *hp;
+
+	hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
+	if (hp == NULL) {
+		inet_twsk_put(inet_twsk(sk));
+		return NULL;
+	}
+
+	if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
+		/* SYN to a TIME_WAIT socket, we'd rather redirect it
+		 * to a listener socket if there's one */
+		struct sock *sk2;
+
+		sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
+					    iph->saddr, laddr ? laddr : iph->daddr,
+					    hp->source, lport ? lport : hp->dest,
+					    skb->dev, NF_TPROXY_LOOKUP_LISTENER);
+		if (sk2) {
+			inet_twsk_deschedule_put(inet_twsk(sk));
+			sk = sk2;
+		}
+	}
+
+	return sk;
+}
+EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait4);
+
+__be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
+{
+	struct in_device *indev;
+	__be32 laddr;
+
+	if (user_laddr)
+		return user_laddr;
+
+	laddr = 0;
+	indev = __in_dev_get_rcu(skb->dev);
+	for_primary_ifa(indev) {
+		laddr = ifa->ifa_local;
+		break;
+	} endfor_ifa(indev);
+
+	return laddr ? laddr : daddr;
+}
+EXPORT_SYMBOL_GPL(nf_tproxy_laddr4);
+
+struct sock *
+nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
+		      const u8 protocol,
+		      const __be32 saddr, const __be32 daddr,
+		      const __be16 sport, const __be16 dport,
+		      const struct net_device *in,
+		      const enum nf_tproxy_lookup_t lookup_type)
+{
+	struct sock *sk;
+	struct tcphdr *tcph;
+
+	switch (protocol) {
+	case IPPROTO_TCP:
+		switch (lookup_type) {
+		case NF_TPROXY_LOOKUP_LISTENER:
+			tcph = hp;
+			sk = inet_lookup_listener(net, &tcp_hashinfo, skb,
+						    ip_hdrlen(skb) +
+						      __tcp_hdrlen(tcph),
+						    saddr, sport,
+						    daddr, dport,
+						    in->ifindex, 0);
+
+			if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
+				sk = NULL;
+			/* NOTE: we return listeners even if bound to
+			 * 0.0.0.0, those are filtered out in
+			 * xt_socket, since xt_TPROXY needs 0 bound
+			 * listeners too
+			 */
+			break;
+		case NF_TPROXY_LOOKUP_ESTABLISHED:
+			sk = inet_lookup_established(net, &tcp_hashinfo,
+						    saddr, sport, daddr, dport,
+						    in->ifindex);
+			break;
+		default:
+			BUG();
+		}
+		break;
+	case IPPROTO_UDP:
+		sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
+				     in->ifindex);
+		if (sk) {
+			int connected = (sk->sk_state == TCP_ESTABLISHED);
+			int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0);
+
+			/* NOTE: we return listeners even if bound to
+			 * 0.0.0.0, those are filtered out in
+			 * xt_socket, since xt_TPROXY needs 0 bound
+			 * listeners too
+			 */
+			if ((lookup_type == NF_TPROXY_LOOKUP_ESTABLISHED &&
+			      (!connected || wildcard)) ||
+			    (lookup_type == NF_TPROXY_LOOKUP_LISTENER && connected)) {
+				sock_put(sk);
+				sk = NULL;
+			}
+		}
+		break;
+	default:
+		WARN_ON(1);
+		sk = NULL;
+	}
+
+	pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n",
+		 protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk);
+
+	return sk;
+}
+EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v4);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs");
+MODULE_DESCRIPTION("Netfilter IPv4 transparent proxy support");
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 9f5b00a39adf..37b14dc9d863 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -29,7 +29,10 @@ config NF_SOCKET_IPV6
 	tristate "IPv6 socket lookup support"
 	help
 	  This option enables the IPv6 socket lookup infrastructure. This
-	  is used by the ip6tables socket match.
+	  is used by the {ip6,nf}tables socket match.
+
+config NF_TPROXY_IPV6
+	tristate "IPv6 tproxy support"
 
 if NF_TABLES
 
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 71518f22ae39..10a5a1c87320 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -26,6 +26,7 @@ nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o
 obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o
 
 obj-$(CONFIG_NF_SOCKET_IPV6) += nf_socket_ipv6.o
+obj-$(CONFIG_NF_TPROXY_IPV6) += nf_tproxy_ipv6.o
 
 # logging
 obj-$(CONFIG_NF_LOG_IPV6) += nf_log_ipv6.o
diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c
new file mode 100644
index 000000000000..bf1d6c421e3b
--- /dev/null
+++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c
@@ -0,0 +1,146 @@
+#include <net/netfilter/nf_tproxy.h>
+#include <linux/module.h>
+#include <net/inet6_hashtables.h>
+#include <net/addrconf.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+
+const struct in6_addr *
+nf_tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
+	      const struct in6_addr *daddr)
+{
+	struct inet6_dev *indev;
+	struct inet6_ifaddr *ifa;
+	struct in6_addr *laddr;
+
+	if (!ipv6_addr_any(user_laddr))
+		return user_laddr;
+	laddr = NULL;
+
+	indev = __in6_dev_get(skb->dev);
+	if (indev) {
+		read_lock_bh(&indev->lock);
+		list_for_each_entry(ifa, &indev->addr_list, if_list) {
+			if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED))
+				continue;
+
+			laddr = &ifa->addr;
+			break;
+		}
+		read_unlock_bh(&indev->lock);
+	}
+
+	return laddr ? laddr : daddr;
+}
+EXPORT_SYMBOL_GPL(nf_tproxy_laddr6);
+
+struct sock *
+nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
+			 struct net *net,
+			 const struct in6_addr *laddr,
+			 const __be16 lport,
+			 struct sock *sk)
+{
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct tcphdr _hdr, *hp;
+
+	hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+	if (hp == NULL) {
+		inet_twsk_put(inet_twsk(sk));
+		return NULL;
+	}
+
+	if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
+		/* SYN to a TIME_WAIT socket, we'd rather redirect it
+		 * to a listener socket if there's one */
+		struct sock *sk2;
+
+		sk2 = nf_tproxy_get_sock_v6(net, skb, thoff, hp, tproto,
+					    &iph->saddr,
+					    nf_tproxy_laddr6(skb, laddr, &iph->daddr),
+					    hp->source,
+					    lport ? lport : hp->dest,
+					    skb->dev, NF_TPROXY_LOOKUP_LISTENER);
+		if (sk2) {
+			inet_twsk_deschedule_put(inet_twsk(sk));
+			sk = sk2;
+		}
+	}
+
+	return sk;
+}
+EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait6);
+
+struct sock *
+nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
+		      const u8 protocol,
+		      const struct in6_addr *saddr, const struct in6_addr *daddr,
+		      const __be16 sport, const __be16 dport,
+		      const struct net_device *in,
+		      const enum nf_tproxy_lookup_t lookup_type)
+{
+	struct sock *sk;
+	struct tcphdr *tcph;
+
+	switch (protocol) {
+	case IPPROTO_TCP:
+		switch (lookup_type) {
+		case NF_TPROXY_LOOKUP_LISTENER:
+			tcph = hp;
+			sk = inet6_lookup_listener(net, &tcp_hashinfo, skb,
+						   thoff + __tcp_hdrlen(tcph),
+						   saddr, sport,
+						   daddr, ntohs(dport),
+						   in->ifindex, 0);
+
+			if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
+				sk = NULL;
+			/* NOTE: we return listeners even if bound to
+			 * 0.0.0.0, those are filtered out in
+			 * xt_socket, since xt_TPROXY needs 0 bound
+			 * listeners too
+			 */
+			break;
+		case NF_TPROXY_LOOKUP_ESTABLISHED:
+			sk = __inet6_lookup_established(net, &tcp_hashinfo,
+							saddr, sport, daddr, ntohs(dport),
+							in->ifindex, 0);
+			break;
+		default:
+			BUG();
+		}
+		break;
+	case IPPROTO_UDP:
+		sk = udp6_lib_lookup(net, saddr, sport, daddr, dport,
+				     in->ifindex);
+		if (sk) {
+			int connected = (sk->sk_state == TCP_ESTABLISHED);
+			int wildcard = ipv6_addr_any(&sk->sk_v6_rcv_saddr);
+
+			/* NOTE: we return listeners even if bound to
+			 * 0.0.0.0, those are filtered out in
+			 * xt_socket, since xt_TPROXY needs 0 bound
+			 * listeners too
+			 */
+			if ((lookup_type == NF_TPROXY_LOOKUP_ESTABLISHED && (!connected || wildcard)) ||
+			    (lookup_type == NF_TPROXY_LOOKUP_LISTENER && connected)) {
+				sock_put(sk);
+				sk = NULL;
+			}
+		}
+		break;
+	default:
+		WARN_ON(1);
+		sk = NULL;
+	}
+
+	pr_debug("tproxy socket lookup: proto %u %pI6:%u -> %pI6:%u, lookup type: %d, sock %p\n",
+		 protocol, saddr, ntohs(sport), daddr, ntohs(dport), lookup_type, sk);
+
+	return sk;
+}
+EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v6);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs");
+MODULE_DESCRIPTION("Netfilter IPv4 transparent proxy support");
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 276e1e32f44e..41240abd755f 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -989,6 +989,8 @@ config NETFILTER_XT_TARGET_TPROXY
 	depends on IP_NF_MANGLE
 	select NF_DEFRAG_IPV4
 	select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n
+	select NF_TPROXY_IPV4
+	select NF_TPROXY_IPV6 if IP6_NF_IPTABLES
 	help
 	  This option adds a `TPROXY' target, which is somewhat similar to
 	  REDIRECT.  It can only be used in the mangle table and is useful
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 74df7977c17c..58fce4e749a9 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -33,253 +33,9 @@
 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
 #endif
 
+#include <net/netfilter/nf_tproxy.h>
 #include <linux/netfilter/xt_TPROXY.h>
 
-enum nf_tproxy_lookup_t {
-	 NFT_LOOKUP_LISTENER,
-	 NFT_LOOKUP_ESTABLISHED,
-};
-
-static bool tproxy_sk_is_transparent(struct sock *sk)
-{
-	if (inet_sk_transparent(sk))
-		return true;
-
-	sock_gen_put(sk);
-	return false;
-}
-
-static inline __be32
-tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
-{
-	struct in_device *indev;
-	__be32 laddr;
-
-	if (user_laddr)
-		return user_laddr;
-
-	laddr = 0;
-	indev = __in_dev_get_rcu(skb->dev);
-	for_primary_ifa(indev) {
-		laddr = ifa->ifa_local;
-		break;
-	} endfor_ifa(indev);
-
-	return laddr ? laddr : daddr;
-}
-
-/*
- * This is used when the user wants to intercept a connection matching
- * an explicit iptables rule. In this case the sockets are assumed
- * matching in preference order:
- *
- *   - match: if there's a fully established connection matching the
- *     _packet_ tuple, it is returned, assuming the redirection
- *     already took place and we process a packet belonging to an
- *     established connection
- *
- *   - match: if there's a listening socket matching the redirection
- *     (e.g. on-port & on-ip of the connection), it is returned,
- *     regardless if it was bound to 0.0.0.0 or an explicit
- *     address. The reasoning is that if there's an explicit rule, it
- *     does not really matter if the listener is bound to an interface
- *     or to 0. The user already stated that he wants redirection
- *     (since he added the rule).
- *
- * Please note that there's an overlap between what a TPROXY target
- * and a socket match will match. Normally if you have both rules the
- * "socket" match will be the first one, effectively all packets
- * belonging to established connections going through that one.
- */
-static inline struct sock *
-nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
-		      const u8 protocol,
-		      const __be32 saddr, const __be32 daddr,
-		      const __be16 sport, const __be16 dport,
-		      const struct net_device *in,
-		      const enum nf_tproxy_lookup_t lookup_type)
-{
-	struct sock *sk;
-	struct tcphdr *tcph;
-
-	switch (protocol) {
-	case IPPROTO_TCP:
-		switch (lookup_type) {
-		case NFT_LOOKUP_LISTENER:
-			tcph = hp;
-			sk = inet_lookup_listener(net, &tcp_hashinfo, skb,
-						    ip_hdrlen(skb) +
-						      __tcp_hdrlen(tcph),
-						    saddr, sport,
-						    daddr, dport,
-						    in->ifindex, 0);
-
-			if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
-				sk = NULL;
-			/* NOTE: we return listeners even if bound to
-			 * 0.0.0.0, those are filtered out in
-			 * xt_socket, since xt_TPROXY needs 0 bound
-			 * listeners too
-			 */
-			break;
-		case NFT_LOOKUP_ESTABLISHED:
-			sk = inet_lookup_established(net, &tcp_hashinfo,
-						    saddr, sport, daddr, dport,
-						    in->ifindex);
-			break;
-		default:
-			BUG();
-		}
-		break;
-	case IPPROTO_UDP:
-		sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
-				     in->ifindex);
-		if (sk) {
-			int connected = (sk->sk_state == TCP_ESTABLISHED);
-			int wildcard = (inet_sk(sk)->inet_rcv_saddr == 0);
-
-			/* NOTE: we return listeners even if bound to
-			 * 0.0.0.0, those are filtered out in
-			 * xt_socket, since xt_TPROXY needs 0 bound
-			 * listeners too
-			 */
-			if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) ||
-			    (lookup_type == NFT_LOOKUP_LISTENER && connected)) {
-				sock_put(sk);
-				sk = NULL;
-			}
-		}
-		break;
-	default:
-		WARN_ON(1);
-		sk = NULL;
-	}
-
-	pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n",
-		 protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk);
-
-	return sk;
-}
-
-#ifdef XT_TPROXY_HAVE_IPV6
-static inline struct sock *
-nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
-		      const u8 protocol,
-		      const struct in6_addr *saddr, const struct in6_addr *daddr,
-		      const __be16 sport, const __be16 dport,
-		      const struct net_device *in,
-		      const enum nf_tproxy_lookup_t lookup_type)
-{
-	struct sock *sk;
-	struct tcphdr *tcph;
-
-	switch (protocol) {
-	case IPPROTO_TCP:
-		switch (lookup_type) {
-		case NFT_LOOKUP_LISTENER:
-			tcph = hp;
-			sk = inet6_lookup_listener(net, &tcp_hashinfo, skb,
-						   thoff + __tcp_hdrlen(tcph),
-						   saddr, sport,
-						   daddr, ntohs(dport),
-						   in->ifindex, 0);
-
-			if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
-				sk = NULL;
-			/* NOTE: we return listeners even if bound to
-			 * 0.0.0.0, those are filtered out in
-			 * xt_socket, since xt_TPROXY needs 0 bound
-			 * listeners too
-			 */
-			break;
-		case NFT_LOOKUP_ESTABLISHED:
-			sk = __inet6_lookup_established(net, &tcp_hashinfo,
-							saddr, sport, daddr, ntohs(dport),
-							in->ifindex, 0);
-			break;
-		default:
-			BUG();
-		}
-		break;
-	case IPPROTO_UDP:
-		sk = udp6_lib_lookup(net, saddr, sport, daddr, dport,
-				     in->ifindex);
-		if (sk) {
-			int connected = (sk->sk_state == TCP_ESTABLISHED);
-			int wildcard = ipv6_addr_any(&sk->sk_v6_rcv_saddr);
-
-			/* NOTE: we return listeners even if bound to
-			 * 0.0.0.0, those are filtered out in
-			 * xt_socket, since xt_TPROXY needs 0 bound
-			 * listeners too
-			 */
-			if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) ||
-			    (lookup_type == NFT_LOOKUP_LISTENER && connected)) {
-				sock_put(sk);
-				sk = NULL;
-			}
-		}
-		break;
-	default:
-		WARN_ON(1);
-		sk = NULL;
-	}
-
-	pr_debug("tproxy socket lookup: proto %u %pI6:%u -> %pI6:%u, lookup type: %d, sock %p\n",
-		 protocol, saddr, ntohs(sport), daddr, ntohs(dport), lookup_type, sk);
-
-	return sk;
-}
-#endif
-
-/**
- * tproxy_handle_time_wait4 - handle IPv4 TCP TIME_WAIT reopen redirections
- * @skb:	The skb being processed.
- * @laddr:	IPv4 address to redirect to or zero.
- * @lport:	TCP port to redirect to or zero.
- * @sk:		The TIME_WAIT TCP socket found by the lookup.
- *
- * We have to handle SYN packets arriving to TIME_WAIT sockets
- * differently: instead of reopening the connection we should rather
- * redirect the new connection to the proxy if there's a listener
- * socket present.
- *
- * tproxy_handle_time_wait4() consumes the socket reference passed in.
- *
- * Returns the listener socket if there's one, the TIME_WAIT socket if
- * no such listener is found, or NULL if the TCP header is incomplete.
- */
-static struct sock *
-tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
-			 __be32 laddr, __be16 lport, struct sock *sk)
-{
-	const struct iphdr *iph = ip_hdr(skb);
-	struct tcphdr _hdr, *hp;
-
-	hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
-	if (hp == NULL) {
-		inet_twsk_put(inet_twsk(sk));
-		return NULL;
-	}
-
-	if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
-		/* SYN to a TIME_WAIT socket, we'd rather redirect it
-		 * to a listener socket if there's one */
-		struct sock *sk2;
-
-		sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
-					    iph->saddr, laddr ? laddr : iph->daddr,
-					    hp->source, lport ? lport : hp->dest,
-					    skb->dev, NFT_LOOKUP_LISTENER);
-		if (sk2) {
-			inet_twsk_deschedule_put(inet_twsk(sk));
-			sk = sk2;
-		}
-	}
-
-	return sk;
-}
-
 /* assign a socket to the skb -- consumes sk */
 static void
 nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
@@ -308,26 +64,26 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
 	sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
 				   iph->saddr, iph->daddr,
 				   hp->source, hp->dest,
-				   skb->dev, NFT_LOOKUP_ESTABLISHED);
+				   skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED);
 
-	laddr = tproxy_laddr4(skb, laddr, iph->daddr);
+	laddr = nf_tproxy_laddr4(skb, laddr, iph->daddr);
 	if (!lport)
 		lport = hp->dest;
 
 	/* UDP has no TCP_TIME_WAIT state, so we never enter here */
 	if (sk && sk->sk_state == TCP_TIME_WAIT)
 		/* reopening a TIME_WAIT connection needs special handling */
-		sk = tproxy_handle_time_wait4(net, skb, laddr, lport, sk);
+		sk = nf_tproxy_handle_time_wait4(net, skb, laddr, lport, sk);
 	else if (!sk)
 		/* no, there's no established connection, check if
 		 * there's a listener on the redirected addr/port */
 		sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
 					   iph->saddr, laddr,
 					   hp->source, lport,
-					   skb->dev, NFT_LOOKUP_LISTENER);
+					   skb->dev, NF_TPROXY_LOOKUP_LISTENER);
 
 	/* NOTE: assign_sock consumes our sk reference */
-	if (sk && tproxy_sk_is_transparent(sk)) {
+	if (sk && nf_tproxy_sk_is_transparent(sk)) {
 		/* This should be in a separate target, but we don't do multiple
 		   targets on the same rule yet */
 		skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
@@ -366,87 +122,6 @@ tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par)
 
 #ifdef XT_TPROXY_HAVE_IPV6
 
-static inline const struct in6_addr *
-tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr,
-	      const struct in6_addr *daddr)
-{
-	struct inet6_dev *indev;
-	struct inet6_ifaddr *ifa;
-	struct in6_addr *laddr;
-
-	if (!ipv6_addr_any(user_laddr))
-		return user_laddr;
-	laddr = NULL;
-
-	indev = __in6_dev_get(skb->dev);
-	if (indev) {
-		read_lock_bh(&indev->lock);
-		list_for_each_entry(ifa, &indev->addr_list, if_list) {
-			if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED))
-				continue;
-
-			laddr = &ifa->addr;
-			break;
-		}
-		read_unlock_bh(&indev->lock);
-	}
-
-	return laddr ? laddr : daddr;
-}
-
-/**
- * tproxy_handle_time_wait6 - handle IPv6 TCP TIME_WAIT reopen redirections
- * @skb:	The skb being processed.
- * @tproto:	Transport protocol.
- * @thoff:	Transport protocol header offset.
- * @par:	Iptables target parameters.
- * @sk:		The TIME_WAIT TCP socket found by the lookup.
- *
- * We have to handle SYN packets arriving to TIME_WAIT sockets
- * differently: instead of reopening the connection we should rather
- * redirect the new connection to the proxy if there's a listener
- * socket present.
- *
- * tproxy_handle_time_wait6() consumes the socket reference passed in.
- *
- * Returns the listener socket if there's one, the TIME_WAIT socket if
- * no such listener is found, or NULL if the TCP header is incomplete.
- */
-static struct sock *
-tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
-			 const struct xt_action_param *par,
-			 struct sock *sk)
-{
-	const struct ipv6hdr *iph = ipv6_hdr(skb);
-	struct tcphdr _hdr, *hp;
-	const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
-
-	hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
-	if (hp == NULL) {
-		inet_twsk_put(inet_twsk(sk));
-		return NULL;
-	}
-
-	if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
-		/* SYN to a TIME_WAIT socket, we'd rather redirect it
-		 * to a listener socket if there's one */
-		struct sock *sk2;
-
-		sk2 = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto,
-					    &iph->saddr,
-					    tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr),
-					    hp->source,
-					    tgi->lport ? tgi->lport : hp->dest,
-					    skb->dev, NFT_LOOKUP_LISTENER);
-		if (sk2) {
-			inet_twsk_deschedule_put(inet_twsk(sk));
-			sk = sk2;
-		}
-	}
-
-	return sk;
-}
-
 static unsigned int
 tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
 {
@@ -478,25 +153,31 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
 	sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto,
 				   &iph->saddr, &iph->daddr,
 				   hp->source, hp->dest,
-				   xt_in(par), NFT_LOOKUP_ESTABLISHED);
+				   xt_in(par), NF_TPROXY_LOOKUP_ESTABLISHED);
 
-	laddr = tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr);
+	laddr = nf_tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr);
 	lport = tgi->lport ? tgi->lport : hp->dest;
 
 	/* UDP has no TCP_TIME_WAIT state, so we never enter here */
-	if (sk && sk->sk_state == TCP_TIME_WAIT)
+	if (sk && sk->sk_state == TCP_TIME_WAIT) {
+		const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
 		/* reopening a TIME_WAIT connection needs special handling */
-		sk = tproxy_handle_time_wait6(skb, tproto, thoff, par, sk);
+		sk = nf_tproxy_handle_time_wait6(skb, tproto, thoff,
+					      xt_net(par),
+					      &tgi->laddr.in6,
+					      tgi->lport,
+					      sk);
+	}
 	else if (!sk)
 		/* no there's no established connection, check if
 		 * there's a listener on the redirected addr/port */
 		sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp,
 					   tproto, &iph->saddr, laddr,
 					   hp->source, lport,
-					   xt_in(par), NFT_LOOKUP_LISTENER);
+					   xt_in(par), NF_TPROXY_LOOKUP_LISTENER);
 
 	/* NOTE: assign_sock consumes our sk reference */
-	if (sk && tproxy_sk_is_transparent(sk)) {
+	if (sk && nf_tproxy_sk_is_transparent(sk)) {
 		/* This should be in a separate target, but we don't do multiple
 		   targets on the same rule yet */
 		skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
-- 
cgit v1.2.3-59-g8ed1b


From 00bfb3205e62970ff14ac8756e1a72b753a990ab Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sat, 2 Jun 2018 23:38:46 +0200
Subject: netfilter: nf_tables: pass context to object destroy indirection

The new connlimit object needs this to properly deal with conntrack
dependencies.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  3 ++-
 net/netfilter/nf_tables_api.c     | 12 ++++++------
 net/netfilter/nft_counter.c       |  3 ++-
 net/netfilter/nft_ct.c            |  3 ++-
 4 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 435c32d8a995..81ec070582b6 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1070,7 +1070,8 @@ struct nft_object_ops {
 	int				(*init)(const struct nft_ctx *ctx,
 						const struct nlattr *const tb[],
 						struct nft_object *obj);
-	void				(*destroy)(struct nft_object *obj);
+	void				(*destroy)(const struct nft_ctx *ctx,
+						   struct nft_object *obj);
 	int				(*dump)(struct sk_buff *skb,
 						struct nft_object *obj,
 						bool reset);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index c785bc5a66f1..177658f4007e 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4787,7 +4787,7 @@ err3:
 	kfree(obj->name);
 err2:
 	if (obj->ops->destroy)
-		obj->ops->destroy(obj);
+		obj->ops->destroy(&ctx, obj);
 	kfree(obj);
 err1:
 	module_put(type->owner);
@@ -4997,10 +4997,10 @@ err:
 	return err;
 }
 
-static void nft_obj_destroy(struct nft_object *obj)
+static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj)
 {
 	if (obj->ops->destroy)
-		obj->ops->destroy(obj);
+		obj->ops->destroy(ctx, obj);
 
 	module_put(obj->ops->type->owner);
 	kfree(obj->name);
@@ -6003,7 +6003,7 @@ static void nft_commit_release(struct nft_trans *trans)
 					   nft_trans_elem(trans).priv);
 		break;
 	case NFT_MSG_DELOBJ:
-		nft_obj_destroy(nft_trans_obj(trans));
+		nft_obj_destroy(&trans->ctx, nft_trans_obj(trans));
 		break;
 	case NFT_MSG_DELFLOWTABLE:
 		nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
@@ -6328,7 +6328,7 @@ static void nf_tables_abort_release(struct nft_trans *trans)
 				     nft_trans_elem(trans).priv, true);
 		break;
 	case NFT_MSG_NEWOBJ:
-		nft_obj_destroy(nft_trans_obj(trans));
+		nft_obj_destroy(&trans->ctx, nft_trans_obj(trans));
 		break;
 	case NFT_MSG_NEWFLOWTABLE:
 		nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
@@ -7022,7 +7022,7 @@ static void __nft_release_tables(struct net *net)
 		list_for_each_entry_safe(obj, ne, &table->objects, list) {
 			list_del(&obj->list);
 			table->use--;
-			nft_obj_destroy(obj);
+			nft_obj_destroy(&ctx, obj);
 		}
 		list_for_each_entry_safe(chain, nc, &table->chains, list) {
 			ctx.chain = chain;
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index eefe3b409925..e59a74d6b7d6 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -96,7 +96,8 @@ static void nft_counter_do_destroy(struct nft_counter_percpu_priv *priv)
 	free_percpu(priv->counter);
 }
 
-static void nft_counter_obj_destroy(struct nft_object *obj)
+static void nft_counter_obj_destroy(const struct nft_ctx *ctx,
+				    struct nft_object *obj)
 {
 	struct nft_counter_percpu_priv *priv = nft_obj_data(obj);
 
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index ea737fd789e8..f8b19eacfa0c 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -826,7 +826,8 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx,
 	return 0;
 }
 
-static void nft_ct_helper_obj_destroy(struct nft_object *obj)
+static void nft_ct_helper_obj_destroy(const struct nft_ctx *ctx,
+				      struct nft_object *obj)
 {
 	struct nft_ct_helper_obj *priv = nft_obj_data(obj);
 
-- 
cgit v1.2.3-59-g8ed1b


From 5e5cbc7b23eaf13e18652c03efbad5be6995de6a Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sat, 2 Jun 2018 23:38:47 +0200
Subject: netfilter: nf_conncount: expose connection list interface

This patch provides an interface to maintain the list of connections and
the lookup function to obtain the number of connections in the list.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_count.h | 11 +++++++++
 net/netfilter/nf_conncount.c               | 36 +++++++++++++++++++-----------
 2 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h
index e61184fbfb71..1910b6572430 100644
--- a/include/net/netfilter/nf_conntrack_count.h
+++ b/include/net/netfilter/nf_conntrack_count.h
@@ -13,4 +13,15 @@ unsigned int nf_conncount_count(struct net *net,
 				const u32 *key,
 				const struct nf_conntrack_tuple *tuple,
 				const struct nf_conntrack_zone *zone);
+
+unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head,
+				 const struct nf_conntrack_tuple *tuple,
+				 const struct nf_conntrack_zone *zone,
+				 bool *addit);
+
+bool nf_conncount_add(struct hlist_head *head,
+		      const struct nf_conntrack_tuple *tuple);
+
+void nf_conncount_cache_free(struct hlist_head *hhead);
+
 #endif
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 153e690e2893..3b5059a8dcdd 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -79,7 +79,7 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen)
 	return memcmp(a, b, klen * sizeof(u32));
 }
 
-static bool add_hlist(struct hlist_head *head,
+bool nf_conncount_add(struct hlist_head *head,
 		      const struct nf_conntrack_tuple *tuple)
 {
 	struct nf_conncount_tuple *conn;
@@ -91,12 +91,12 @@ static bool add_hlist(struct hlist_head *head,
 	hlist_add_head(&conn->node, head);
 	return true;
 }
+EXPORT_SYMBOL_GPL(nf_conncount_add);
 
-static unsigned int check_hlist(struct net *net,
-				struct hlist_head *head,
-				const struct nf_conntrack_tuple *tuple,
-				const struct nf_conntrack_zone *zone,
-				bool *addit)
+unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head,
+				 const struct nf_conntrack_tuple *tuple,
+				 const struct nf_conntrack_zone *zone,
+				 bool *addit)
 {
 	const struct nf_conntrack_tuple_hash *found;
 	struct nf_conncount_tuple *conn;
@@ -141,6 +141,7 @@ static unsigned int check_hlist(struct net *net,
 
 	return length;
 }
+EXPORT_SYMBOL_GPL(nf_conncount_lookup);
 
 static void tree_nodes_free(struct rb_root *root,
 			    struct nf_conncount_rb *gc_nodes[],
@@ -187,13 +188,15 @@ count_tree(struct net *net, struct rb_root *root,
 		} else {
 			/* same source network -> be counted! */
 			unsigned int count;
-			count = check_hlist(net, &rbconn->hhead, tuple, zone, &addit);
+
+			count = nf_conncount_lookup(net, &rbconn->hhead, tuple,
+						    zone, &addit);
 
 			tree_nodes_free(root, gc_nodes, gc_count);
 			if (!addit)
 				return count;
 
-			if (!add_hlist(&rbconn->hhead, tuple))
+			if (!nf_conncount_add(&rbconn->hhead, tuple))
 				return 0; /* hotdrop */
 
 			return count + 1;
@@ -203,7 +206,7 @@ count_tree(struct net *net, struct rb_root *root,
 			continue;
 
 		/* only used for GC on hhead, retval and 'addit' ignored */
-		check_hlist(net, &rbconn->hhead, tuple, zone, &addit);
+		nf_conncount_lookup(net, &rbconn->hhead, tuple, zone, &addit);
 		if (hlist_empty(&rbconn->hhead))
 			gc_nodes[gc_count++] = rbconn;
 	}
@@ -303,11 +306,19 @@ struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family
 }
 EXPORT_SYMBOL_GPL(nf_conncount_init);
 
-static void destroy_tree(struct rb_root *r)
+void nf_conncount_cache_free(struct hlist_head *hhead)
 {
 	struct nf_conncount_tuple *conn;
-	struct nf_conncount_rb *rbconn;
 	struct hlist_node *n;
+
+	hlist_for_each_entry_safe(conn, n, hhead, node)
+		kmem_cache_free(conncount_conn_cachep, conn);
+}
+EXPORT_SYMBOL_GPL(nf_conncount_cache_free);
+
+static void destroy_tree(struct rb_root *r)
+{
+	struct nf_conncount_rb *rbconn;
 	struct rb_node *node;
 
 	while ((node = rb_first(r)) != NULL) {
@@ -315,8 +326,7 @@ static void destroy_tree(struct rb_root *r)
 
 		rb_erase(node, r);
 
-		hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node)
-			kmem_cache_free(conncount_conn_cachep, conn);
+		nf_conncount_cache_free(&rbconn->hhead);
 
 		kmem_cache_free(conncount_rb_cachep, rbconn);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 3453c92731884bad7c4c3a0667228b964747f3d5 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sat, 2 Jun 2018 23:38:48 +0200
Subject: netfilter: nf_tables: pass ctx to nf_tables_expr_destroy()

nft_set_elem_destroy() can be called from call_rcu context. Annotate
netns and table in set object so we can populate the context object.
Moreover, pass context object to nf_tables_set_elem_destroy() from the
commit phase, since it is already available from there.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  4 ++++
 net/netfilter/nf_tables_api.c     | 16 ++++++++++++----
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 81ec070582b6..e3d1bac9b0d5 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -370,6 +370,8 @@ void nft_unregister_set(struct nft_set_type *type);
  *
  *	@list: table set list node
  *	@bindings: list of set bindings
+ *	@table: table this set belongs to
+ *	@net: netnamespace this set belongs to
  * 	@name: name of the set
  *	@handle: unique handle of the set
  * 	@ktype: key type (numeric type defined by userspace, not used in the kernel)
@@ -393,6 +395,8 @@ void nft_unregister_set(struct nft_set_type *type);
 struct nft_set {
 	struct list_head		list;
 	struct list_head		bindings;
+	struct nft_table		*table;
+	possible_net_t			net;
 	char				*name;
 	u64				handle;
 	u32				ktype;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 177658f4007e..12463984dd5c 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3359,6 +3359,8 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 	}
 
 	INIT_LIST_HEAD(&set->bindings);
+	set->table = table;
+	write_pnet(&set->net, net);
 	set->ops   = ops;
 	set->ktype = ktype;
 	set->klen  = desc.klen;
@@ -4036,12 +4038,16 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem,
 			  bool destroy_expr)
 {
 	struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
+	struct nft_ctx ctx = {
+		.net	= read_pnet(&set->net),
+		.family	= set->table->family,
+	};
 
 	nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE);
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
 		nft_data_release(nft_set_ext_data(ext), set->dtype);
 	if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
-		nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext));
+		nf_tables_expr_destroy(&ctx, nft_set_ext_expr(ext));
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
 		(*nft_set_ext_obj(ext))->use--;
 	kfree(elem);
@@ -4051,12 +4057,13 @@ EXPORT_SYMBOL_GPL(nft_set_elem_destroy);
 /* Only called from commit path, nft_set_elem_deactivate() already deals with
  * the refcounting from the preparation phase.
  */
-static void nf_tables_set_elem_destroy(const struct nft_set *set, void *elem)
+static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
+				       const struct nft_set *set, void *elem)
 {
 	struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
 
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
-		nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext));
+		nf_tables_expr_destroy(ctx, nft_set_ext_expr(ext));
 	kfree(elem);
 }
 
@@ -5999,7 +6006,8 @@ static void nft_commit_release(struct nft_trans *trans)
 		nft_set_destroy(nft_trans_set(trans));
 		break;
 	case NFT_MSG_DELSETELEM:
-		nf_tables_set_elem_destroy(nft_trans_elem_set(trans),
+		nf_tables_set_elem_destroy(&trans->ctx,
+					   nft_trans_elem_set(trans),
 					   nft_trans_elem(trans).priv);
 		break;
 	case NFT_MSG_DELOBJ:
-- 
cgit v1.2.3-59-g8ed1b


From 79b174ade16d90302aef6e14f5eefd0b723c1602 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sat, 2 Jun 2018 23:38:49 +0200
Subject: netfilter: nf_tables: garbage collection for stateful expressions

Use garbage collector to schedule removal of elements based of feedback
from expression that this element comes with. Therefore, the garbage
collector is not guided by timeout expirations in this new mode.

The new connlimit expression sets on the NFT_EXPR_GC flag to enable this
behaviour, the dynset expression needs to explicitly enable the garbage
collector via set->ops->gc_init call.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  4 ++++
 net/netfilter/nft_dynset.c        |  9 +++++++++
 net/netfilter/nft_set_hash.c      | 21 +++++++++++++++++++--
 3 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index e3d1bac9b0d5..871cb3b012e9 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -342,6 +342,7 @@ struct nft_set_ops {
 						const struct nft_set_desc *desc,
 						const struct nlattr * const nla[]);
 	void				(*destroy)(const struct nft_set *set);
+	void				(*gc_init)(const struct nft_set *set);
 
 	unsigned int			elemsize;
 };
@@ -712,6 +713,7 @@ struct nft_expr_type {
 };
 
 #define NFT_EXPR_STATEFUL		0x1
+#define NFT_EXPR_GC			0x2
 
 /**
  *	struct nft_expr_ops - nf_tables expression operations
@@ -748,6 +750,8 @@ struct nft_expr_ops {
 	int				(*validate)(const struct nft_ctx *ctx,
 						    const struct nft_expr *expr,
 						    const struct nft_data **data);
+	bool				(*gc)(struct net *net,
+					      const struct nft_expr *expr);
 	const struct nft_expr_type	*type;
 	void				*data;
 };
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index b07a3fd9eeea..4d49529cff61 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -195,6 +195,15 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
 		err = -EOPNOTSUPP;
 		if (!(priv->expr->ops->type->flags & NFT_EXPR_STATEFUL))
 			goto err1;
+
+		if (priv->expr->ops->type->flags & NFT_EXPR_GC) {
+			if (set->flags & NFT_SET_TIMEOUT)
+				goto err1;
+			if (!set->ops->gc_init)
+				goto err1;
+			set->ops->gc_init(set);
+		}
+
 	} else if (set->flags & NFT_SET_EVAL)
 		return -EINVAL;
 
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index dbf1f4ad077c..6f9a1365a09f 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -311,8 +311,16 @@ static void nft_rhash_gc(struct work_struct *work)
 			continue;
 		}
 
+		if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPR)) {
+			struct nft_expr *expr = nft_set_ext_expr(&he->ext);
+
+			if (expr->ops->gc &&
+			    expr->ops->gc(read_pnet(&set->net), expr))
+				goto gc;
+		}
 		if (!nft_set_elem_expired(&he->ext))
 			continue;
+gc:
 		if (nft_set_elem_mark_busy(&he->ext))
 			continue;
 
@@ -339,6 +347,14 @@ static unsigned int nft_rhash_privsize(const struct nlattr * const nla[],
 	return sizeof(struct nft_rhash);
 }
 
+static void nft_rhash_gc_init(const struct nft_set *set)
+{
+	struct nft_rhash *priv = nft_set_priv(set);
+
+	queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
+			   nft_set_gc_interval(set));
+}
+
 static int nft_rhash_init(const struct nft_set *set,
 			  const struct nft_set_desc *desc,
 			  const struct nlattr * const tb[])
@@ -356,8 +372,8 @@ static int nft_rhash_init(const struct nft_set *set,
 
 	INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rhash_gc);
 	if (set->flags & NFT_SET_TIMEOUT)
-		queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
-				   nft_set_gc_interval(set));
+		nft_rhash_gc_init(set);
+
 	return 0;
 }
 
@@ -647,6 +663,7 @@ static struct nft_set_type nft_rhash_type __read_mostly = {
 		.elemsize	= offsetof(struct nft_rhash_elem, ext),
 		.estimate	= nft_rhash_estimate,
 		.init		= nft_rhash_init,
+		.gc_init	= nft_rhash_gc_init,
 		.destroy	= nft_rhash_destroy,
 		.insert		= nft_rhash_insert,
 		.activate	= nft_rhash_activate,
-- 
cgit v1.2.3-59-g8ed1b


From 371ebcbb9ee62fb46a0a27f358941588f7048678 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sat, 2 Jun 2018 23:38:50 +0200
Subject: netfilter: nf_tables: add destroy_clone expression

Before this patch, cloned expressions are released via ->destroy. This
is a problem for the new connlimit expression since the ->destroy path
drop a reference on the conntrack modules and it unregisters hooks. The
new ->destroy_clone provides context that this expression is being
released from the packet path, so it is mirroring ->clone(), where
neither module reference is dropped nor hooks need to be unregistered -
because this done from the control plane path from the ->init() path.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  2 ++
 net/netfilter/nf_tables_api.c     | 12 ++++++++++--
 net/netfilter/nft_counter.c       |  1 +
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 871cb3b012e9..83e7b83ecf3e 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -745,6 +745,8 @@ struct nft_expr_ops {
 						      const struct nft_expr *expr);
 	void				(*destroy)(const struct nft_ctx *ctx,
 						   const struct nft_expr *expr);
+	void				(*destroy_clone)(const struct nft_ctx *ctx,
+							 const struct nft_expr *expr);
 	int				(*dump)(struct sk_buff *skb,
 						const struct nft_expr *expr);
 	int				(*validate)(const struct nft_ctx *ctx,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 12463984dd5c..0a6eafa49879 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4046,8 +4046,16 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem,
 	nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE);
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
 		nft_data_release(nft_set_ext_data(ext), set->dtype);
-	if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
-		nf_tables_expr_destroy(&ctx, nft_set_ext_expr(ext));
+	if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) {
+		struct nft_expr *expr = nft_set_ext_expr(ext);
+
+		if (expr->ops->destroy_clone) {
+			expr->ops->destroy_clone(&ctx, expr);
+			module_put(expr->ops->type->owner);
+		} else {
+			nf_tables_expr_destroy(&ctx, expr);
+		}
+	}
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
 		(*nft_set_ext_obj(ext))->use--;
 	kfree(elem);
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index e59a74d6b7d6..a61d7edfc290 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -258,6 +258,7 @@ static const struct nft_expr_ops nft_counter_ops = {
 	.eval		= nft_counter_eval,
 	.init		= nft_counter_init,
 	.destroy	= nft_counter_destroy,
+	.destroy_clone	= nft_counter_destroy,
 	.dump		= nft_counter_dump,
 	.clone		= nft_counter_clone,
 };
-- 
cgit v1.2.3-59-g8ed1b


From 290180e2448c02d6b391455937098882a73a9494 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sat, 2 Jun 2018 21:38:51 +0200
Subject: netfilter: nf_tables: add connlimit support

This features which allows you to limit the maximum number of
connections per arbitrary key. The connlimit expression is stateful,
therefore it can be used from meters to dynamically populate a set, this
provides a mapping to the iptables' connlimit match. This patch also
comes that allows you define static connlimit policies.

This extension depends on the nf_conncount infrastructure.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  21 ++-
 net/netfilter/Kconfig                    |   9 +
 net/netfilter/Makefile                   |   1 +
 net/netfilter/nft_connlimit.c            | 297 +++++++++++++++++++++++++++++++
 4 files changed, 327 insertions(+), 1 deletion(-)
 create mode 100644 net/netfilter/nft_connlimit.c

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index a089af092a29..ae00a3c49b8a 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1043,6 +1043,24 @@ enum nft_limit_attributes {
 };
 #define NFTA_LIMIT_MAX		(__NFTA_LIMIT_MAX - 1)
 
+enum nft_connlimit_flags {
+	NFT_CONNLIMIT_F_INV	= (1 << 0),
+};
+
+/**
+ * enum nft_connlimit_attributes - nf_tables connlimit expression netlink attributes
+ *
+ * @NFTA_CONNLIMIT_COUNT: number of connections (NLA_U32)
+ * @NFTA_CONNLIMIT_FLAGS: flags (NLA_U32: enum nft_connlimit_flags)
+ */
+enum nft_connlimit_attributes {
+	NFTA_CONNLIMIT_UNSPEC,
+	NFTA_CONNLIMIT_COUNT,
+	NFTA_CONNLIMIT_FLAGS,
+	__NFTA_CONNLIMIT_MAX
+};
+#define NFTA_CONNLIMIT_MAX	(__NFTA_CONNLIMIT_MAX - 1)
+
 /**
  * enum nft_counter_attributes - nf_tables counter expression netlink attributes
  *
@@ -1357,7 +1375,8 @@ enum nft_ct_helper_attributes {
 #define NFT_OBJECT_QUOTA	2
 #define NFT_OBJECT_CT_HELPER	3
 #define NFT_OBJECT_LIMIT	4
-#define __NFT_OBJECT_MAX	5
+#define NFT_OBJECT_CONNLIMIT	5
+#define __NFT_OBJECT_MAX	6
 #define NFT_OBJECT_MAX		(__NFT_OBJECT_MAX - 1)
 
 /**
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 41240abd755f..dbd7d1fad277 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -517,6 +517,15 @@ config NFT_COUNTER
 	  This option adds the "counter" expression that you can use to
 	  include packet and byte counters in a rule.
 
+config NFT_CONNLIMIT
+	tristate "Netfilter nf_tables connlimit module"
+	depends on NF_CONNTRACK
+	depends on NETFILTER_ADVANCED
+	select NETFILTER_CONNCOUNT
+	help
+	  This option adds the "connlimit" expression that you can use to
+	  ratelimit rule matchings per connections.
+
 config NFT_LOG
 	tristate "Netfilter nf_tables log module"
 	help
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index eec169555731..44449389e527 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -80,6 +80,7 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
 
 obj-$(CONFIG_NF_TABLES)		+= nf_tables.o
 obj-$(CONFIG_NFT_COMPAT)	+= nft_compat.o
+obj-$(CONFIG_NFT_CONNLIMIT)	+= nft_connlimit.o
 obj-$(CONFIG_NFT_NUMGEN)	+= nft_numgen.o
 obj-$(CONFIG_NFT_CT)		+= nft_ct.o
 obj-$(CONFIG_NFT_FLOW_OFFLOAD)	+= nft_flow_offload.o
diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
new file mode 100644
index 000000000000..50c068d660e5
--- /dev/null
+++ b/net/netfilter/nft_connlimit.c
@@ -0,0 +1,297 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_count.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+struct nft_connlimit {
+	spinlock_t		lock;
+	struct hlist_head	hhead;
+	u32			limit;
+	bool			invert;
+};
+
+static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
+					 struct nft_regs *regs,
+					 const struct nft_pktinfo *pkt,
+					 const struct nft_set_ext *ext)
+{
+	const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
+	const struct nf_conntrack_tuple *tuple_ptr;
+	struct nf_conntrack_tuple tuple;
+	enum ip_conntrack_info ctinfo;
+	const struct nf_conn *ct;
+	unsigned int count;
+	bool addit;
+
+	tuple_ptr = &tuple;
+
+	ct = nf_ct_get(pkt->skb, &ctinfo);
+	if (ct != NULL) {
+		tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+		zone = nf_ct_zone(ct);
+	} else if (!nf_ct_get_tuplepr(pkt->skb, skb_network_offset(pkt->skb),
+				      nft_pf(pkt), nft_net(pkt), &tuple)) {
+		regs->verdict.code = NF_DROP;
+		return;
+	}
+
+	spin_lock_bh(&priv->lock);
+	count = nf_conncount_lookup(nft_net(pkt), &priv->hhead, tuple_ptr, zone,
+				    &addit);
+
+	if (!addit)
+		goto out;
+
+	if (!nf_conncount_add(&priv->hhead, tuple_ptr)) {
+		regs->verdict.code = NF_DROP;
+		spin_unlock_bh(&priv->lock);
+		return;
+	}
+	count++;
+out:
+	spin_unlock_bh(&priv->lock);
+
+	if ((count > priv->limit) ^ priv->invert) {
+		regs->verdict.code = NFT_BREAK;
+		return;
+	}
+}
+
+static int nft_connlimit_do_init(const struct nft_ctx *ctx,
+				 const struct nlattr * const tb[],
+				 struct nft_connlimit *priv)
+{
+	bool invert = false;
+	u32 flags, limit;
+
+	if (!tb[NFTA_CONNLIMIT_COUNT])
+		return -EINVAL;
+
+	limit = ntohl(nla_get_be32(tb[NFTA_CONNLIMIT_COUNT]));
+
+	if (tb[NFTA_CONNLIMIT_FLAGS]) {
+		flags = ntohl(nla_get_be32(tb[NFTA_CONNLIMIT_FLAGS]));
+		if (flags & ~NFT_CONNLIMIT_F_INV)
+			return -EOPNOTSUPP;
+		if (flags & NFT_CONNLIMIT_F_INV)
+			invert = true;
+	}
+
+	spin_lock_init(&priv->lock);
+	INIT_HLIST_HEAD(&priv->hhead);
+	priv->limit	= limit;
+	priv->invert	= invert;
+
+	return nf_ct_netns_get(ctx->net, ctx->family);
+}
+
+static void nft_connlimit_do_destroy(const struct nft_ctx *ctx,
+				     struct nft_connlimit *priv)
+{
+	nf_ct_netns_put(ctx->net, ctx->family);
+	nf_conncount_cache_free(&priv->hhead);
+}
+
+static int nft_connlimit_do_dump(struct sk_buff *skb,
+				 struct nft_connlimit *priv)
+{
+	if (nla_put_be32(skb, NFTA_CONNLIMIT_COUNT, htonl(priv->limit)))
+		goto nla_put_failure;
+	if (priv->invert &&
+	    nla_put_be32(skb, NFTA_CONNLIMIT_FLAGS, htonl(NFT_CONNLIMIT_F_INV)))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static inline void nft_connlimit_obj_eval(struct nft_object *obj,
+					struct nft_regs *regs,
+					const struct nft_pktinfo *pkt)
+{
+	struct nft_connlimit *priv = nft_obj_data(obj);
+
+	nft_connlimit_do_eval(priv, regs, pkt, NULL);
+}
+
+static int nft_connlimit_obj_init(const struct nft_ctx *ctx,
+				const struct nlattr * const tb[],
+				struct nft_object *obj)
+{
+	struct nft_connlimit *priv = nft_obj_data(obj);
+
+	return nft_connlimit_do_init(ctx, tb, priv);
+}
+
+static void nft_connlimit_obj_destroy(const struct nft_ctx *ctx,
+				      struct nft_object *obj)
+{
+	struct nft_connlimit *priv = nft_obj_data(obj);
+
+	nft_connlimit_do_destroy(ctx, priv);
+}
+
+static int nft_connlimit_obj_dump(struct sk_buff *skb,
+				  struct nft_object *obj, bool reset)
+{
+	struct nft_connlimit *priv = nft_obj_data(obj);
+
+	return nft_connlimit_do_dump(skb, priv);
+}
+
+static const struct nla_policy nft_connlimit_policy[NFTA_CONNLIMIT_MAX + 1] = {
+	[NFTA_CONNLIMIT_COUNT]	= { .type = NLA_U32 },
+	[NFTA_CONNLIMIT_FLAGS]	= { .type = NLA_U32 },
+};
+
+static struct nft_object_type nft_connlimit_obj_type;
+static const struct nft_object_ops nft_connlimit_obj_ops = {
+	.type		= &nft_connlimit_obj_type,
+	.size		= sizeof(struct nft_connlimit),
+	.eval		= nft_connlimit_obj_eval,
+	.init		= nft_connlimit_obj_init,
+	.destroy	= nft_connlimit_obj_destroy,
+	.dump		= nft_connlimit_obj_dump,
+};
+
+static struct nft_object_type nft_connlimit_obj_type __read_mostly = {
+	.type		= NFT_OBJECT_CONNLIMIT,
+	.ops		= &nft_connlimit_obj_ops,
+	.maxattr	= NFTA_CONNLIMIT_MAX,
+	.policy		= nft_connlimit_policy,
+	.owner		= THIS_MODULE,
+};
+
+static void nft_connlimit_eval(const struct nft_expr *expr,
+			       struct nft_regs *regs,
+			       const struct nft_pktinfo *pkt)
+{
+	struct nft_connlimit *priv = nft_expr_priv(expr);
+
+	nft_connlimit_do_eval(priv, regs, pkt, NULL);
+}
+
+static int nft_connlimit_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	struct nft_connlimit *priv = nft_expr_priv(expr);
+
+	return nft_connlimit_do_dump(skb, priv);
+}
+
+static int nft_connlimit_init(const struct nft_ctx *ctx,
+			      const struct nft_expr *expr,
+			      const struct nlattr * const tb[])
+{
+	struct nft_connlimit *priv = nft_expr_priv(expr);
+
+	return nft_connlimit_do_init(ctx, tb, priv);
+}
+
+static void nft_connlimit_destroy(const struct nft_ctx *ctx,
+				const struct nft_expr *expr)
+{
+	struct nft_connlimit *priv = nft_expr_priv(expr);
+
+	nft_connlimit_do_destroy(ctx, priv);
+}
+
+static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src)
+{
+	struct nft_connlimit *priv_dst = nft_expr_priv(dst);
+	struct nft_connlimit *priv_src = nft_expr_priv(src);
+
+	spin_lock_init(&priv_dst->lock);
+	INIT_HLIST_HEAD(&priv_dst->hhead);
+	priv_dst->limit	 = priv_src->limit;
+	priv_dst->invert = priv_src->invert;
+
+	return 0;
+}
+
+static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx,
+					const struct nft_expr *expr)
+{
+	struct nft_connlimit *priv = nft_expr_priv(expr);
+
+	nf_conncount_cache_free(&priv->hhead);
+}
+
+static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr)
+{
+	struct nft_connlimit *priv = nft_expr_priv(expr);
+	bool addit, ret;
+
+	spin_lock_bh(&priv->lock);
+	nf_conncount_lookup(net, &priv->hhead, NULL, &nf_ct_zone_dflt, &addit);
+
+	ret = hlist_empty(&priv->hhead);
+	spin_unlock_bh(&priv->lock);
+
+	return ret;
+}
+
+static struct nft_expr_type nft_connlimit_type;
+static const struct nft_expr_ops nft_connlimit_ops = {
+	.type		= &nft_connlimit_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_connlimit)),
+	.eval		= nft_connlimit_eval,
+	.init		= nft_connlimit_init,
+	.destroy	= nft_connlimit_destroy,
+	.clone		= nft_connlimit_clone,
+	.destroy_clone	= nft_connlimit_destroy_clone,
+	.dump		= nft_connlimit_dump,
+	.gc		= nft_connlimit_gc,
+};
+
+static struct nft_expr_type nft_connlimit_type __read_mostly = {
+	.name		= "connlimit",
+	.ops		= &nft_connlimit_ops,
+	.policy		= nft_connlimit_policy,
+	.maxattr	= NFTA_CONNLIMIT_MAX,
+	.flags		= NFT_EXPR_STATEFUL | NFT_EXPR_GC,
+	.owner		= THIS_MODULE,
+};
+
+static int __init nft_connlimit_module_init(void)
+{
+	int err;
+
+	err = nft_register_obj(&nft_connlimit_obj_type);
+	if (err < 0)
+		return err;
+
+	err = nft_register_expr(&nft_connlimit_type);
+	if (err < 0)
+		goto err1;
+
+	return 0;
+err1:
+	nft_unregister_obj(&nft_connlimit_obj_type);
+	return err;
+}
+
+static void __exit nft_connlimit_module_exit(void)
+{
+	nft_unregister_expr(&nft_connlimit_type);
+	nft_unregister_obj(&nft_connlimit_obj_type);
+}
+
+module_init(nft_connlimit_module_init);
+module_exit(nft_connlimit_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso");
+MODULE_ALIAS_NFT_EXPR("connlimit");
+MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CONNLIMIT);
-- 
cgit v1.2.3-59-g8ed1b


From 1b2470e59fb1e983a3655feba30cdfc03e609d51 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sat, 2 Jun 2018 23:41:06 +0200
Subject: netfilter: nf_tables: handle chain name lookups via rhltable

If there is a significant amount of chains list search is too slow, so
add an rhlist table for this.

This speeds up ruleset loading: for every new rule we have to check if
the name already exists in current generation.

We need to be able to cope with duplicate chain names in case a transaction
drops the nfnl mutex (for request_module) and the abort of this old
transaction is still pending.

The list is kept -- we need a way to iterate chains even if hash resize is
in progress without missing an entry.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |   7 ++-
 net/netfilter/nf_tables_api.c     | 113 +++++++++++++++++++++++++++++++++-----
 2 files changed, 104 insertions(+), 16 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 83e7b83ecf3e..08c005ce56e9 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -9,6 +9,7 @@
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/nf_tables.h>
 #include <linux/u64_stats_sync.h>
+#include <linux/rhashtable.h>
 #include <net/netfilter/nf_flow_table.h>
 #include <net/netlink.h>
 
@@ -860,6 +861,7 @@ enum nft_chain_flags {
  *
  *	@rules: list of rules in the chain
  *	@list: used internally
+ *	@rhlhead: used internally
  *	@table: table that this chain belongs to
  *	@handle: chain handle
  *	@use: number of jump references to this chain
@@ -872,6 +874,7 @@ struct nft_chain {
 	struct nft_rule			*__rcu *rules_gen_1;
 	struct list_head		rules;
 	struct list_head		list;
+	struct rhlist_head		rhlhead;
 	struct nft_table		*table;
 	u64				handle;
 	u32				use;
@@ -965,7 +968,8 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv);
  *	struct nft_table - nf_tables table
  *
  *	@list: used internally
- *	@chains: chains in the table
+ *	@chains_ht: chains in the table
+ *	@chains: same, for stable walks
  *	@sets: sets in the table
  *	@objects: stateful objects in the table
  *	@flowtables: flow tables in the table
@@ -979,6 +983,7 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv);
  */
 struct nft_table {
 	struct list_head		list;
+	struct rhltable			chains_ht;
 	struct list_head		chains;
 	struct list_head		sets;
 	struct list_head		objects;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 0a6eafa49879..2e8fd961746d 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -34,6 +34,20 @@ enum {
 	NFT_VALIDATE_DO,
 };
 
+static u32 nft_chain_hash(const void *data, u32 len, u32 seed);
+static u32 nft_chain_hash_obj(const void *data, u32 len, u32 seed);
+static int nft_chain_hash_cmp(struct rhashtable_compare_arg *, const void *);
+
+static const struct rhashtable_params nft_chain_ht_params = {
+	.head_offset		= offsetof(struct nft_chain, rhlhead),
+	.key_offset		= offsetof(struct nft_chain, name),
+	.hashfn			= nft_chain_hash,
+	.obj_hashfn		= nft_chain_hash_obj,
+	.obj_cmpfn		= nft_chain_hash_cmp,
+	.locks_mul		= 1,
+	.automatic_shrinking	= true,
+};
+
 static void nft_validate_state_update(struct net *net, u8 new_validate_state)
 {
 	switch (net->nft.validate_state) {
@@ -720,6 +734,29 @@ err:
 	return ret;
 }
 
+static u32 nft_chain_hash(const void *data, u32 len, u32 seed)
+{
+	const char *name = data;
+
+	return jhash(name, strlen(name), seed);
+}
+
+static u32 nft_chain_hash_obj(const void *data, u32 len, u32 seed)
+{
+	const struct nft_chain *chain = data;
+
+	return nft_chain_hash(chain->name, 0, seed);
+}
+
+static int nft_chain_hash_cmp(struct rhashtable_compare_arg *arg,
+			      const void *ptr)
+{
+	const struct nft_chain *chain = ptr;
+	const char *name = arg->key;
+
+	return strcmp(chain->name, name);
+}
+
 static int nf_tables_newtable(struct net *net, struct sock *nlsk,
 			      struct sk_buff *skb, const struct nlmsghdr *nlh,
 			      const struct nlattr * const nla[],
@@ -766,6 +803,10 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
 	if (table->name == NULL)
 		goto err_strdup;
 
+	err = rhltable_init(&table->chains_ht, &nft_chain_ht_params);
+	if (err)
+		goto err_chain_ht;
+
 	INIT_LIST_HEAD(&table->chains);
 	INIT_LIST_HEAD(&table->sets);
 	INIT_LIST_HEAD(&table->objects);
@@ -782,6 +823,8 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
 	list_add_tail_rcu(&table->list, &net->nft.tables);
 	return 0;
 err_trans:
+	rhltable_destroy(&table->chains_ht);
+err_chain_ht:
 	kfree(table->name);
 err_strdup:
 	kfree(table);
@@ -922,6 +965,7 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx)
 {
 	BUG_ON(ctx->table->use > 0);
 
+	rhltable_destroy(&ctx->table->chains_ht);
 	kfree(ctx->table->name);
 	kfree(ctx->table);
 }
@@ -967,21 +1011,35 @@ nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask)
 	return ERR_PTR(-ENOENT);
 }
 
-static struct nft_chain *nft_chain_lookup(const struct nft_table *table,
+static struct nft_chain *nft_chain_lookup(struct nft_table *table,
 					  const struct nlattr *nla, u8 genmask)
 {
+	char search[NFT_CHAIN_MAXNAMELEN + 1];
+	struct rhlist_head *tmp, *list;
 	struct nft_chain *chain;
 
 	if (nla == NULL)
 		return ERR_PTR(-EINVAL);
 
-	list_for_each_entry_rcu(chain, &table->chains, list) {
-		if (!nla_strcmp(nla, chain->name) &&
-		    nft_active_genmask(chain, genmask))
-			return chain;
-	}
+	nla_strlcpy(search, nla, sizeof(search));
 
-	return ERR_PTR(-ENOENT);
+	WARN_ON(!rcu_read_lock_held() &&
+		!lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+
+	chain = ERR_PTR(-ENOENT);
+	rcu_read_lock();
+	list = rhltable_lookup(&table->chains_ht, search, nft_chain_ht_params);
+	if (!list)
+		goto out_unlock;
+
+	rhl_for_each_entry_rcu(chain, tmp, list, rhlhead) {
+		if (nft_active_genmask(chain, genmask))
+			goto out_unlock;
+	}
+	chain = ERR_PTR(-ENOENT);
+out_unlock:
+	rcu_read_unlock();
+	return chain;
 }
 
 static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
@@ -1185,8 +1243,8 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	u8 genmask = nft_genmask_cur(net);
-	const struct nft_table *table;
 	const struct nft_chain *chain;
+	struct nft_table *table;
 	struct sk_buff *skb2;
 	int family = nfmsg->nfgen_family;
 	int err;
@@ -1504,9 +1562,17 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 	if (err < 0)
 		goto err1;
 
+	err = rhltable_insert_key(&table->chains_ht, chain->name,
+				  &chain->rhlhead, nft_chain_ht_params);
+	if (err)
+		goto err2;
+
 	err = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN);
-	if (err < 0)
+	if (err < 0) {
+		rhltable_remove(&table->chains_ht, &chain->rhlhead,
+				nft_chain_ht_params);
 		goto err2;
+	}
 
 	table->use++;
 	list_add_tail_rcu(&chain->list, &table->chains);
@@ -2206,9 +2272,9 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	u8 genmask = nft_genmask_cur(net);
-	const struct nft_table *table;
 	const struct nft_chain *chain;
 	const struct nft_rule *rule;
+	struct nft_table *table;
 	struct sk_buff *skb2;
 	int family = nfmsg->nfgen_family;
 	int err;
@@ -5981,8 +6047,16 @@ static void nft_chain_commit_update(struct nft_trans *trans)
 {
 	struct nft_base_chain *basechain;
 
-	if (nft_trans_chain_name(trans))
+	if (nft_trans_chain_name(trans)) {
+		rhltable_remove(&trans->ctx.table->chains_ht,
+				&trans->ctx.chain->rhlhead,
+				nft_chain_ht_params);
 		swap(trans->ctx.chain->name, nft_trans_chain_name(trans));
+		rhltable_insert_key(&trans->ctx.table->chains_ht,
+				    trans->ctx.chain->name,
+				    &trans->ctx.chain->rhlhead,
+				    nft_chain_ht_params);
+	}
 
 	if (!nft_is_base_chain(trans->ctx.chain))
 		return;
@@ -6159,6 +6233,15 @@ static void nf_tables_commit_chain_active(struct net *net, struct nft_chain *cha
 		nf_tables_commit_chain_free_rules_old(g0);
 }
 
+static void nft_chain_del(struct nft_chain *chain)
+{
+	struct nft_table *table = chain->table;
+
+	WARN_ON_ONCE(rhltable_remove(&table->chains_ht, &chain->rhlhead,
+				     nft_chain_ht_params));
+	list_del_rcu(&chain->list);
+}
+
 static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 {
 	struct nft_trans *trans, *next;
@@ -6233,7 +6316,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 			nft_trans_destroy(trans);
 			break;
 		case NFT_MSG_DELCHAIN:
-			list_del_rcu(&trans->ctx.chain->list);
+			nft_chain_del(trans->ctx.chain);
 			nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN);
 			nf_tables_unregister_hook(trans->ctx.net,
 						  trans->ctx.table,
@@ -6384,7 +6467,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
 				nft_trans_destroy(trans);
 			} else {
 				trans->ctx.table->use--;
-				list_del_rcu(&trans->ctx.chain->list);
+				nft_chain_del(trans->ctx.chain);
 				nf_tables_unregister_hook(trans->ctx.net,
 							  trans->ctx.table,
 							  trans->ctx.chain);
@@ -6986,7 +7069,7 @@ int __nft_release_basechain(struct nft_ctx *ctx)
 		ctx->chain->use--;
 		nf_tables_rule_release(ctx, rule);
 	}
-	list_del(&ctx->chain->list);
+	nft_chain_del(ctx->chain);
 	ctx->table->use--;
 	nf_tables_chain_destroy(ctx);
 
@@ -7042,7 +7125,7 @@ static void __nft_release_tables(struct net *net)
 		}
 		list_for_each_entry_safe(chain, nc, &table->chains, list) {
 			ctx.chain = chain;
-			list_del(&chain->list);
+			nft_chain_del(chain);
 			table->use--;
 			nf_tables_chain_destroy(&ctx);
 		}
-- 
cgit v1.2.3-59-g8ed1b


From eaf47b17a77fda841a1102d76c15161ee438b347 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 30 May 2018 22:13:20 +0200
Subject: net: phy: consider PHY_IGNORE_INTERRUPT in state machine PHY_NOLINK
 handling

We can bail out immediately also in case of PHY_IGNORE_INTERRUPT because
phy_mac_interupt() informs us once the link is up.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 05c1e8ef15e6..537297d2b4b4 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -894,7 +894,7 @@ void phy_state_machine(struct work_struct *work)
 			needs_aneg = true;
 		break;
 	case PHY_NOLINK:
-		if (phy_interrupt_is_valid(phydev))
+		if (phydev->irq != PHY_POLL)
 			break;
 
 		err = phy_read_status(phydev);
-- 
cgit v1.2.3-59-g8ed1b


From 9c6ffbacdb5740a8560729f18e8e0e33ad21473b Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Thu, 31 May 2018 02:04:43 +0000
Subject: hv_netvsc: fix error return code in netvsc_probe()

Fix to return a negative error code from the failover register fail
error handling case instead of 0, as done elsewhere in this function.

Fixes: 1ff78076d8dd ("netvsc: refactor notifier/event handling code to use the failover framework")
Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/netvsc_drv.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index ebe964203eff..bef4d55a108c 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -2031,8 +2031,10 @@ static int netvsc_probe(struct hv_device *dev,
 	}
 
 	net_device_ctx->failover = failover_register(net, &netvsc_failover_ops);
-	if (IS_ERR(net_device_ctx->failover))
+	if (IS_ERR(net_device_ctx->failover)) {
+		ret = PTR_ERR(net_device_ctx->failover);
 		goto err_failover;
+	}
 
 	return ret;
 
-- 
cgit v1.2.3-59-g8ed1b


From 8cb77149e8c8c8f6fef67ac2749868c563a449c6 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Thu, 31 May 2018 02:31:12 +0000
Subject: net/mlx5: Make function mlx5_fpga_tls_send_teardown_cmd() static

Fixes the following sparse warning:

drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c:199:6: warning:
 symbol 'mlx5_fpga_tls_send_teardown_cmd' was not declared. Should it be static?

Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c
index 21048013826c..c9736238604a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c
@@ -196,8 +196,8 @@ static void mlx5_fpga_tls_flow_to_cmd(void *flow, void *cmd)
 		 MLX5_GET(tls_flow, flow, direction_sx));
 }
 
-void mlx5_fpga_tls_send_teardown_cmd(struct mlx5_core_dev *mdev, void *flow,
-				     u32 swid, gfp_t flags)
+static void mlx5_fpga_tls_send_teardown_cmd(struct mlx5_core_dev *mdev,
+					    void *flow, u32 swid, gfp_t flags)
 {
 	struct mlx5_teardown_stream_context *ctx;
 	struct mlx5_fpga_dma_buf *buf;
-- 
cgit v1.2.3-59-g8ed1b


From 3dc9f558cdd2dfba06e021a6b85e990d1a0b2641 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Thu, 31 May 2018 02:31:22 +0000
Subject: net/smc: fix error return code in smc_setsockopt()

Fix to return error code -EINVAL instead of 0 if optlen is invalid.

Fixes: 01d2f7e2cdd3 ("net/smc: sockopts TCP_NODELAY and TCP_CORK")
Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 2c369d4bb1c1..973b4471b532 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1420,7 +1420,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
 		return rc;
 
 	if (optlen < sizeof(int))
-		return rc;
+		return -EINVAL;
 	get_user(val, (int __user *)optval);
 
 	lock_sock(sk);
-- 
cgit v1.2.3-59-g8ed1b


From a6ee84be2cb9d1d16efbe833c369122e676be09f Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Thu, 31 May 2018 11:48:48 +0800
Subject: net: netcp: ethss: remove unnecessary pointer set to NULL

If statement has make sure the 'slave->phy' is NULL

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/netcp_ethss.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/ethernet/ti/netcp_ethss.c b/drivers/net/ethernet/ti/netcp_ethss.c
index 6a728d35e776..6e455a27a8de 100644
--- a/drivers/net/ethernet/ti/netcp_ethss.c
+++ b/drivers/net/ethernet/ti/netcp_ethss.c
@@ -3206,7 +3206,6 @@ static void init_secondary_ports(struct gbe_priv *gbe_dev,
 		if (!slave->phy) {
 			dev_err(dev, "phy not found for slave %d\n",
 				slave->slave_num);
-			slave->phy = NULL;
 		} else {
 			dev_dbg(dev, "phy found: id is: 0x%s\n",
 				phydev_name(slave->phy));
-- 
cgit v1.2.3-59-g8ed1b


From 06be0864c77ae6861632a678a6378511a4828d6e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 2 Jun 2018 23:06:31 +0200
Subject: bpf: test case for map pointer poison with calls/branches

Add several test cases where the same or different map pointers
originate from different paths in the program and execute a map
lookup or tail call at a common location.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h                      |  10 ++
 tools/include/linux/filter.h                |  10 ++
 tools/testing/selftests/bpf/test_verifier.c | 185 ++++++++++++++++++++++++----
 3 files changed, 178 insertions(+), 27 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index d90abdaea94b..6fd375fe7079 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -289,6 +289,16 @@ struct xdp_buff;
 		.off   = OFF,					\
 		.imm   = 0 })
 
+/* Relative call */
+
+#define BPF_CALL_REL(TGT)					\
+	((struct bpf_insn) {					\
+		.code  = BPF_JMP | BPF_CALL,			\
+		.dst_reg = 0,					\
+		.src_reg = BPF_PSEUDO_CALL,			\
+		.off   = 0,					\
+		.imm   = TGT })
+
 /* Function call */
 
 #define BPF_EMIT_CALL(FUNC)					\
diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h
index c5e512da8d8a..af55acf73e75 100644
--- a/tools/include/linux/filter.h
+++ b/tools/include/linux/filter.h
@@ -263,6 +263,16 @@
 #define BPF_LD_MAP_FD(DST, MAP_FD)				\
 	BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)
 
+/* Relative call */
+
+#define BPF_CALL_REL(TGT)					\
+	((struct bpf_insn) {					\
+		.code  = BPF_JMP | BPF_CALL,			\
+		.dst_reg = 0,					\
+		.src_reg = BPF_PSEUDO_CALL,			\
+		.off   = 0,					\
+		.imm   = TGT })
+
 /* Program exit */
 
 #define BPF_EXIT_INSN()						\
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 4b4f015be217..7cb1d74057ce 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -50,7 +50,7 @@
 
 #define MAX_INSNS	BPF_MAXINSNS
 #define MAX_FIXUPS	8
-#define MAX_NR_MAPS	4
+#define MAX_NR_MAPS	7
 #define POINTER_VALUE	0xcafe4all
 #define TEST_DATA_LEN	64
 
@@ -66,7 +66,9 @@ struct bpf_test {
 	int fixup_map1[MAX_FIXUPS];
 	int fixup_map2[MAX_FIXUPS];
 	int fixup_map3[MAX_FIXUPS];
-	int fixup_prog[MAX_FIXUPS];
+	int fixup_map4[MAX_FIXUPS];
+	int fixup_prog1[MAX_FIXUPS];
+	int fixup_prog2[MAX_FIXUPS];
 	int fixup_map_in_map[MAX_FIXUPS];
 	const char *errstr;
 	const char *errstr_unpriv;
@@ -2769,7 +2771,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 0),
 			BPF_EXIT_INSN(),
 		},
-		.fixup_prog = { 1 },
+		.fixup_prog1 = { 1 },
 		.errstr_unpriv = "R3 leaks addr into helper",
 		.result_unpriv = REJECT,
 		.result = ACCEPT,
@@ -2856,7 +2858,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 1),
 			BPF_EXIT_INSN(),
 		},
-		.fixup_prog = { 1 },
+		.fixup_prog1 = { 1 },
 		.result = ACCEPT,
 		.retval = 42,
 	},
@@ -2870,7 +2872,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 1),
 			BPF_EXIT_INSN(),
 		},
-		.fixup_prog = { 1 },
+		.fixup_prog1 = { 1 },
 		.result = ACCEPT,
 		.retval = 41,
 	},
@@ -2884,7 +2886,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 1),
 			BPF_EXIT_INSN(),
 		},
-		.fixup_prog = { 1 },
+		.fixup_prog1 = { 1 },
 		.result = ACCEPT,
 		.retval = 1,
 	},
@@ -2898,7 +2900,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 2),
 			BPF_EXIT_INSN(),
 		},
-		.fixup_prog = { 1 },
+		.fixup_prog1 = { 1 },
 		.result = ACCEPT,
 		.retval = 2,
 	},
@@ -2912,7 +2914,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 2),
 			BPF_EXIT_INSN(),
 		},
-		.fixup_prog = { 1 },
+		.fixup_prog1 = { 1 },
 		.result = ACCEPT,
 		.retval = 2,
 	},
@@ -2926,7 +2928,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 2),
 			BPF_EXIT_INSN(),
 		},
-		.fixup_prog = { 2 },
+		.fixup_prog1 = { 2 },
 		.result = ACCEPT,
 		.retval = 42,
 	},
@@ -11681,6 +11683,112 @@ static struct bpf_test tests[] = {
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_XDP,
 	},
+	{
+		"calls: two calls returning different map pointers for lookup (hash, array)",
+		.insns = {
+			/* main prog */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+			BPF_CALL_REL(11),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+			BPF_CALL_REL(12),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+			BPF_ST_MEM(BPF_DW, BPF_REG_0, 0,
+				   offsetof(struct test_val, foo)),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+			/* subprog 1 */
+			BPF_LD_MAP_FD(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+			/* subprog 2 */
+			BPF_LD_MAP_FD(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.fixup_map2 = { 13 },
+		.fixup_map4 = { 16 },
+		.result = ACCEPT,
+		.retval = 1,
+	},
+	{
+		"calls: two calls returning different map pointers for lookup (hash, map in map)",
+		.insns = {
+			/* main prog */
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+			BPF_CALL_REL(11),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+			BPF_CALL_REL(12),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+			BPF_ST_MEM(BPF_DW, BPF_REG_0, 0,
+				   offsetof(struct test_val, foo)),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+			/* subprog 1 */
+			BPF_LD_MAP_FD(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+			/* subprog 2 */
+			BPF_LD_MAP_FD(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.fixup_map_in_map = { 16 },
+		.fixup_map4 = { 13 },
+		.result = REJECT,
+		.errstr = "R0 invalid mem access 'map_ptr'",
+	},
+	{
+		"cond: two branches returning different map pointers for lookup (tail, tail)",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct __sk_buff, mark)),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 0, 3),
+			BPF_LD_MAP_FD(BPF_REG_2, 0),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+			BPF_LD_MAP_FD(BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_3, 7),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_tail_call),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_prog1 = { 5 },
+		.fixup_prog2 = { 2 },
+		.result_unpriv = REJECT,
+		.errstr_unpriv = "tail_call abusing map_ptr",
+		.result = ACCEPT,
+		.retval = 42,
+	},
+	{
+		"cond: two branches returning same map pointers for lookup (tail, tail)",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct __sk_buff, mark)),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 3),
+			BPF_LD_MAP_FD(BPF_REG_2, 0),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+			BPF_LD_MAP_FD(BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_3, 7),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_tail_call),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_prog2 = { 2, 5 },
+		.result_unpriv = ACCEPT,
+		.result = ACCEPT,
+		.retval = 42,
+	},
 	{
 		"search pruning: all branches should be verified (nop operation)",
 		.insns = {
@@ -12162,12 +12270,13 @@ static int probe_filter_length(const struct bpf_insn *fp)
 	return len + 1;
 }
 
-static int create_map(uint32_t size_value, uint32_t max_elem)
+static int create_map(uint32_t type, uint32_t size_key,
+		      uint32_t size_value, uint32_t max_elem)
 {
 	int fd;
 
-	fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(long long),
-			    size_value, max_elem, BPF_F_NO_PREALLOC);
+	fd = bpf_create_map(type, size_key, size_value, max_elem,
+			    type == BPF_MAP_TYPE_HASH ? BPF_F_NO_PREALLOC : 0);
 	if (fd < 0)
 		printf("Failed to create hash map '%s'!\n", strerror(errno));
 
@@ -12200,13 +12309,13 @@ static int create_prog_dummy2(int mfd, int idx)
 				ARRAY_SIZE(prog), "GPL", 0, NULL, 0);
 }
 
-static int create_prog_array(void)
+static int create_prog_array(uint32_t max_elem, int p1key)
 {
-	int p1key = 0, p2key = 1;
+	int p2key = 1;
 	int mfd, p1fd, p2fd;
 
 	mfd = bpf_create_map(BPF_MAP_TYPE_PROG_ARRAY, sizeof(int),
-			     sizeof(int), 4, 0);
+			     sizeof(int), max_elem, 0);
 	if (mfd < 0) {
 		printf("Failed to create prog array '%s'!\n", strerror(errno));
 		return -1;
@@ -12261,7 +12370,9 @@ static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog,
 	int *fixup_map1 = test->fixup_map1;
 	int *fixup_map2 = test->fixup_map2;
 	int *fixup_map3 = test->fixup_map3;
-	int *fixup_prog = test->fixup_prog;
+	int *fixup_map4 = test->fixup_map4;
+	int *fixup_prog1 = test->fixup_prog1;
+	int *fixup_prog2 = test->fixup_prog2;
 	int *fixup_map_in_map = test->fixup_map_in_map;
 
 	if (test->fill_helper)
@@ -12272,7 +12383,8 @@ static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog,
 	 * that really matters is value size in this case.
 	 */
 	if (*fixup_map1) {
-		map_fds[0] = create_map(sizeof(long long), 1);
+		map_fds[0] = create_map(BPF_MAP_TYPE_HASH, sizeof(long long),
+					sizeof(long long), 1);
 		do {
 			prog[*fixup_map1].imm = map_fds[0];
 			fixup_map1++;
@@ -12280,7 +12392,8 @@ static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog,
 	}
 
 	if (*fixup_map2) {
-		map_fds[1] = create_map(sizeof(struct test_val), 1);
+		map_fds[1] = create_map(BPF_MAP_TYPE_HASH, sizeof(long long),
+					sizeof(struct test_val), 1);
 		do {
 			prog[*fixup_map2].imm = map_fds[1];
 			fixup_map2++;
@@ -12288,25 +12401,43 @@ static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog,
 	}
 
 	if (*fixup_map3) {
-		map_fds[1] = create_map(sizeof(struct other_val), 1);
+		map_fds[2] = create_map(BPF_MAP_TYPE_HASH, sizeof(long long),
+					sizeof(struct other_val), 1);
 		do {
-			prog[*fixup_map3].imm = map_fds[1];
+			prog[*fixup_map3].imm = map_fds[2];
 			fixup_map3++;
 		} while (*fixup_map3);
 	}
 
-	if (*fixup_prog) {
-		map_fds[2] = create_prog_array();
+	if (*fixup_map4) {
+		map_fds[3] = create_map(BPF_MAP_TYPE_ARRAY, sizeof(int),
+					sizeof(struct test_val), 1);
+		do {
+			prog[*fixup_map4].imm = map_fds[3];
+			fixup_map4++;
+		} while (*fixup_map4);
+	}
+
+	if (*fixup_prog1) {
+		map_fds[4] = create_prog_array(4, 0);
+		do {
+			prog[*fixup_prog1].imm = map_fds[4];
+			fixup_prog1++;
+		} while (*fixup_prog1);
+	}
+
+	if (*fixup_prog2) {
+		map_fds[5] = create_prog_array(8, 7);
 		do {
-			prog[*fixup_prog].imm = map_fds[2];
-			fixup_prog++;
-		} while (*fixup_prog);
+			prog[*fixup_prog2].imm = map_fds[5];
+			fixup_prog2++;
+		} while (*fixup_prog2);
 	}
 
 	if (*fixup_map_in_map) {
-		map_fds[3] = create_map_in_map();
+		map_fds[6] = create_map_in_map();
 		do {
-			prog[*fixup_map_in_map].imm = map_fds[3];
+			prog[*fixup_map_in_map].imm = map_fds[6];
 			fixup_map_in_map++;
 		} while (*fixup_map_in_map);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From be08815c5d3b25e53cd9b53a4d768d5f3d93ba25 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 2 Jun 2018 23:06:32 +0200
Subject: bpf: add also cbpf long jump test cases with heavy expansion

We have one triggering on eBPF but lets also add a cBPF example to
make sure we keep tracking them. Also add anther cBPF test running
max number of MSH ops.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 lib/test_bpf.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index 317f231462d4..60aedc879361 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -356,6 +356,52 @@ static int bpf_fill_maxinsns11(struct bpf_test *self)
 	return __bpf_fill_ja(self, BPF_MAXINSNS, 68);
 }
 
+static int bpf_fill_maxinsns12(struct bpf_test *self)
+{
+	unsigned int len = BPF_MAXINSNS;
+	struct sock_filter *insn;
+	int i = 0;
+
+	insn = kmalloc_array(len, sizeof(*insn), GFP_KERNEL);
+	if (!insn)
+		return -ENOMEM;
+
+	insn[0] = __BPF_JUMP(BPF_JMP | BPF_JA, len - 2, 0, 0);
+
+	for (i = 1; i < len - 1; i++)
+		insn[i] = __BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0);
+
+	insn[len - 1] = __BPF_STMT(BPF_RET | BPF_K, 0xabababab);
+
+	self->u.ptr.insns = insn;
+	self->u.ptr.len = len;
+
+	return 0;
+}
+
+static int bpf_fill_maxinsns13(struct bpf_test *self)
+{
+	unsigned int len = BPF_MAXINSNS;
+	struct sock_filter *insn;
+	int i = 0;
+
+	insn = kmalloc_array(len, sizeof(*insn), GFP_KERNEL);
+	if (!insn)
+		return -ENOMEM;
+
+	for (i = 0; i < len - 3; i++)
+		insn[i] = __BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0);
+
+	insn[len - 3] = __BPF_STMT(BPF_LD | BPF_IMM, 0xabababab);
+	insn[len - 2] = __BPF_STMT(BPF_ALU | BPF_XOR | BPF_X, 0);
+	insn[len - 1] = __BPF_STMT(BPF_RET | BPF_A, 0);
+
+	self->u.ptr.insns = insn;
+	self->u.ptr.len = len;
+
+	return 0;
+}
+
 static int bpf_fill_ja(struct bpf_test *self)
 {
 	/* Hits exactly 11 passes on x86_64 JIT. */
@@ -5289,6 +5335,23 @@ static struct bpf_test tests[] = {
 		.fill_helper = bpf_fill_maxinsns11,
 		.expected_errcode = -ENOTSUPP,
 	},
+	{
+		"BPF_MAXINSNS: jump over MSH",
+		{ },
+		CLASSIC | FLAG_EXPECTED_FAIL,
+		{ 0xfa, 0xfb, 0xfc, 0xfd, },
+		{ { 4, 0xabababab } },
+		.fill_helper = bpf_fill_maxinsns12,
+		.expected_errcode = -EINVAL,
+	},
+	{
+		"BPF_MAXINSNS: exec all MSH",
+		{ },
+		CLASSIC,
+		{ 0xfa, 0xfb, 0xfc, 0xfd, },
+		{ { 4, 0xababab83 } },
+		.fill_helper = bpf_fill_maxinsns13,
+	},
 	{
 		"BPF_MAXINSNS: ld_abs+get_processor_id",
 		{ },
-- 
cgit v1.2.3-59-g8ed1b


From 3fe2867cdf088ffb2dc5aed6cdcf757b4c62476c Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 2 Jun 2018 23:06:33 +0200
Subject: bpf: fixup error message from gpl helpers on license mismatch

Stating 'proprietary program' in the error is just silly since it
can also be a different open source license than that which is just
not compatible.

Reference: https://twitter.com/majek04/status/998531268039102465
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1fd9667b29f1..4f4786ea2296 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2462,7 +2462,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 
 	/* eBPF programs must be GPL compatible to use GPL-ed functions */
 	if (!env->prog->gpl_compatible && fn->gpl_only) {
-		verbose(env, "cannot call GPL only function from proprietary program\n");
+		verbose(env, "cannot call GPL-restricted function from non-GPL compatible program\n");
 		return -EINVAL;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 4316b40914ecde3738968225af56e650e8b61938 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 2 Jun 2018 23:06:34 +0200
Subject: bpf: show prog and map id in fdinfo

Its trivial and straight forward to expose it for scripts that can
then use it along with bpftool in order to inspect an individual
application's used maps and progs. Right now we dump some basic
information in the fdinfo file but with the help of the map/prog
id full introspection becomes possible now.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/syscall.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7365d79ae00d..0fa20624707f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -327,13 +327,15 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 		   "value_size:\t%u\n"
 		   "max_entries:\t%u\n"
 		   "map_flags:\t%#x\n"
-		   "memlock:\t%llu\n",
+		   "memlock:\t%llu\n"
+		   "map_id:\t%u\n",
 		   map->map_type,
 		   map->key_size,
 		   map->value_size,
 		   map->max_entries,
 		   map->map_flags,
-		   map->pages * 1ULL << PAGE_SHIFT);
+		   map->pages * 1ULL << PAGE_SHIFT,
+		   map->id);
 
 	if (owner_prog_type) {
 		seq_printf(m, "owner_prog_type:\t%u\n",
@@ -1070,11 +1072,13 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
 		   "prog_type:\t%u\n"
 		   "prog_jited:\t%u\n"
 		   "prog_tag:\t%s\n"
-		   "memlock:\t%llu\n",
+		   "memlock:\t%llu\n"
+		   "prog_id:\t%u\n",
 		   prog->type,
 		   prog->jited,
 		   prog_tag,
-		   prog->pages * 1ULL << PAGE_SHIFT);
+		   prog->pages * 1ULL << PAGE_SHIFT,
+		   prog->aux->id);
 }
 #endif
 
-- 
cgit v1.2.3-59-g8ed1b


From b0949618826cbb64e9ba764bdd52aa14eaf5073d Mon Sep 17 00:00:00 2001
From: Samuel Mendoza-Jonas <sam@mendozajonas.com>
Date: Thu, 31 May 2018 17:02:54 +1000
Subject: net/ncsi: Avoid GFP_KERNEL in response handler

ncsi_rsp_handler_gc() allocates the filter arrays using GFP_KERNEL in
softirq context, causing the below backtrace. This allocation is only a
few dozen bytes during probing so allocate with GFP_ATOMIC instead.

[   42.813372] BUG: sleeping function called from invalid context at mm/slab.h:416
[   42.820900] in_atomic(): 1, irqs_disabled(): 0, pid: 213, name: kworker/0:1
[   42.827893] INFO: lockdep is turned off.
[   42.832023] CPU: 0 PID: 213 Comm: kworker/0:1 Tainted: G        W       4.13.16-01441-gad99b38 #65
[   42.841007] Hardware name: Generic DT based system
[   42.845966] Workqueue: events ncsi_dev_work
[   42.850251] [<8010a494>] (unwind_backtrace) from [<80107510>] (show_stack+0x20/0x24)
[   42.858046] [<80107510>] (show_stack) from [<80612770>] (dump_stack+0x20/0x28)
[   42.865309] [<80612770>] (dump_stack) from [<80148248>] (___might_sleep+0x230/0x2b0)
[   42.873241] [<80148248>] (___might_sleep) from [<80148334>] (__might_sleep+0x6c/0xac)
[   42.881129] [<80148334>] (__might_sleep) from [<80240d6c>] (__kmalloc+0x210/0x2fc)
[   42.888737] [<80240d6c>] (__kmalloc) from [<8060ad54>] (ncsi_rsp_handler_gc+0xd0/0x170)
[   42.896770] [<8060ad54>] (ncsi_rsp_handler_gc) from [<8060b454>] (ncsi_rcv_rsp+0x16c/0x1d4)
[   42.905314] [<8060b454>] (ncsi_rcv_rsp) from [<804d86c8>] (__netif_receive_skb_core+0x3c8/0xb50)
[   42.914158] [<804d86c8>] (__netif_receive_skb_core) from [<804d96cc>] (__netif_receive_skb+0x20/0x7c)
[   42.923420] [<804d96cc>] (__netif_receive_skb) from [<804de4b0>] (netif_receive_skb_internal+0x78/0x6a4)
[   42.932931] [<804de4b0>] (netif_receive_skb_internal) from [<804df980>] (netif_receive_skb+0x78/0x158)
[   42.942292] [<804df980>] (netif_receive_skb) from [<8042f204>] (ftgmac100_poll+0x43c/0x4e8)
[   42.950855] [<8042f204>] (ftgmac100_poll) from [<804e094c>] (net_rx_action+0x278/0x4c4)
[   42.958918] [<804e094c>] (net_rx_action) from [<801016a8>] (__do_softirq+0xe0/0x4c4)
[   42.966716] [<801016a8>] (__do_softirq) from [<8011cd9c>] (do_softirq.part.4+0x50/0x78)
[   42.974756] [<8011cd9c>] (do_softirq.part.4) from [<8011cebc>] (__local_bh_enable_ip+0xf8/0x11c)
[   42.983579] [<8011cebc>] (__local_bh_enable_ip) from [<804dde08>] (__dev_queue_xmit+0x260/0x890)
[   42.992392] [<804dde08>] (__dev_queue_xmit) from [<804df1f0>] (dev_queue_xmit+0x1c/0x20)
[   43.000689] [<804df1f0>] (dev_queue_xmit) from [<806099c0>] (ncsi_xmit_cmd+0x1c0/0x244)
[   43.008763] [<806099c0>] (ncsi_xmit_cmd) from [<8060dc14>] (ncsi_dev_work+0x2e0/0x4c8)
[   43.016725] [<8060dc14>] (ncsi_dev_work) from [<80133dfc>] (process_one_work+0x214/0x6f8)
[   43.024940] [<80133dfc>] (process_one_work) from [<80134328>] (worker_thread+0x48/0x558)
[   43.033070] [<80134328>] (worker_thread) from [<8013ba80>] (kthread+0x130/0x174)
[   43.040506] [<8013ba80>] (kthread) from [<80102950>] (ret_from_fork+0x14/0x24)

Fixes: 062b3e1b6d4f ("net/ncsi: Refactor MAC, VLAN filters")
Signed-off-by: Samuel Mendoza-Jonas <sam@mendozajonas.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ncsi/ncsi-rsp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
index a6b7c7d5c829..930c1d3796f0 100644
--- a/net/ncsi/ncsi-rsp.c
+++ b/net/ncsi/ncsi-rsp.c
@@ -652,7 +652,7 @@ static int ncsi_rsp_handler_gc(struct ncsi_request *nr)
 				      NCSI_CAP_VLAN_MASK;
 
 	size = (rsp->uc_cnt + rsp->mc_cnt + rsp->mixed_cnt) * ETH_ALEN;
-	nc->mac_filter.addrs = kzalloc(size, GFP_KERNEL);
+	nc->mac_filter.addrs = kzalloc(size, GFP_ATOMIC);
 	if (!nc->mac_filter.addrs)
 		return -ENOMEM;
 	nc->mac_filter.n_uc = rsp->uc_cnt;
@@ -661,7 +661,7 @@ static int ncsi_rsp_handler_gc(struct ncsi_request *nr)
 
 	nc->vlan_filter.vids = kcalloc(rsp->vlan_cnt,
 				       sizeof(*nc->vlan_filter.vids),
-				       GFP_KERNEL);
+				       GFP_ATOMIC);
 	if (!nc->vlan_filter.vids)
 		return -ENOMEM;
 	/* Set VLAN filters active so they are cleared in the first
-- 
cgit v1.2.3-59-g8ed1b


From 09772d92cd5ad998b0d5f6f46cd1658f8cb698cf Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 2 Jun 2018 23:06:35 +0200
Subject: bpf: avoid retpoline for lookup/update/delete calls on maps

While some of the BPF map lookup helpers provide a ->map_gen_lookup()
callback for inlining the map lookup altogether it is not available
for every map, so the remaining ones have to call bpf_map_lookup_elem()
helper which does a dispatch to map->ops->map_lookup_elem(). In
times of retpolines, this will control and trap speculative execution
rather than letting it do its work for the indirect call and will
therefore cause a slowdown. Likewise, bpf_map_update_elem() and
bpf_map_delete_elem() do not have an inlined version and need to call
into their map->ops->map_update_elem() resp. map->ops->map_delete_elem()
handlers.

Before:

  # bpftool prog dump xlated id 1
    0: (bf) r2 = r10
    1: (07) r2 += -8
    2: (7a) *(u64 *)(r2 +0) = 0
    3: (18) r1 = map[id:1]
    5: (85) call __htab_map_lookup_elem#232656
    6: (15) if r0 == 0x0 goto pc+4
    7: (71) r1 = *(u8 *)(r0 +35)
    8: (55) if r1 != 0x0 goto pc+1
    9: (72) *(u8 *)(r0 +35) = 1
   10: (07) r0 += 56
   11: (15) if r0 == 0x0 goto pc+4
   12: (bf) r2 = r0
   13: (18) r1 = map[id:1]
   15: (85) call bpf_map_delete_elem#215008  <-- indirect call via
   16: (95) exit                                 helper

After:

  # bpftool prog dump xlated id 1
    0: (bf) r2 = r10
    1: (07) r2 += -8
    2: (7a) *(u64 *)(r2 +0) = 0
    3: (18) r1 = map[id:1]
    5: (85) call __htab_map_lookup_elem#233328
    6: (15) if r0 == 0x0 goto pc+4
    7: (71) r1 = *(u8 *)(r0 +35)
    8: (55) if r1 != 0x0 goto pc+1
    9: (72) *(u8 *)(r0 +35) = 1
   10: (07) r0 += 56
   11: (15) if r0 == 0x0 goto pc+4
   12: (bf) r2 = r0
   13: (18) r1 = map[id:1]
   15: (85) call htab_lru_map_delete_elem#238240  <-- direct call
   16: (95) exit

In all three lookup/update/delete cases however we can use the actual
address of the map callback directly if we find that there's only a
single path with a map pointer leading to the helper call, meaning
when the map pointer has not been poisoned from verifier side.
Example code can be seen above for the delete case.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h |  3 +++
 kernel/bpf/hashtab.c   | 12 ++++++---
 kernel/bpf/verifier.c  | 68 ++++++++++++++++++++++++++++++++++++--------------
 3 files changed, 61 insertions(+), 22 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 6fd375fe7079..8e60f1e9a702 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -301,6 +301,9 @@ struct xdp_buff;
 
 /* Function call */
 
+#define BPF_CAST_CALL(x)					\
+		((u64 (*)(u64, u64, u64, u64, u64))(x))
+
 #define BPF_EMIT_CALL(FUNC)					\
 	((struct bpf_insn) {					\
 		.code  = BPF_JMP | BPF_CALL,			\
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index b76828f23b49..3ca2198a6d22 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -503,7 +503,9 @@ static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
 	struct bpf_insn *insn = insn_buf;
 	const int ret = BPF_REG_0;
 
-	*insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
+	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
+		     (void *(*)(struct bpf_map *map, void *key))NULL));
+	*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
 	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
 	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
 				offsetof(struct htab_elem, key) +
@@ -530,7 +532,9 @@ static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
 	const int ret = BPF_REG_0;
 	const int ref_reg = BPF_REG_1;
 
-	*insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
+	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
+		     (void *(*)(struct bpf_map *map, void *key))NULL));
+	*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
 	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
 	*insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
 			      offsetof(struct htab_elem, lru_node) +
@@ -1369,7 +1373,9 @@ static u32 htab_of_map_gen_lookup(struct bpf_map *map,
 	struct bpf_insn *insn = insn_buf;
 	const int ret = BPF_REG_0;
 
-	*insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
+	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
+		     (void *(*)(struct bpf_map *map, void *key))NULL));
+	*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
 	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
 	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
 				offsetof(struct htab_elem, key) +
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4f4786ea2296..1dd6d5a7aa5b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2421,8 +2421,11 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
 	struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
 
 	if (func_id != BPF_FUNC_tail_call &&
-	    func_id != BPF_FUNC_map_lookup_elem)
+	    func_id != BPF_FUNC_map_lookup_elem &&
+	    func_id != BPF_FUNC_map_update_elem &&
+	    func_id != BPF_FUNC_map_delete_elem)
 		return 0;
+
 	if (meta->map_ptr == NULL) {
 		verbose(env, "kernel subsystem misconfigured verifier\n");
 		return -EINVAL;
@@ -5586,6 +5589,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 	struct bpf_insn *insn = prog->insnsi;
 	const struct bpf_func_proto *fn;
 	const int insn_cnt = prog->len;
+	const struct bpf_map_ops *ops;
 	struct bpf_insn_aux_data *aux;
 	struct bpf_insn insn_buf[16];
 	struct bpf_prog *new_prog;
@@ -5715,35 +5719,61 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 		}
 
 		/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
-		 * handlers are currently limited to 64 bit only.
+		 * and other inlining handlers are currently limited to 64 bit
+		 * only.
 		 */
 		if (prog->jit_requested && BITS_PER_LONG == 64 &&
-		    insn->imm == BPF_FUNC_map_lookup_elem) {
+		    (insn->imm == BPF_FUNC_map_lookup_elem ||
+		     insn->imm == BPF_FUNC_map_update_elem ||
+		     insn->imm == BPF_FUNC_map_delete_elem)) {
 			aux = &env->insn_aux_data[i + delta];
 			if (bpf_map_ptr_poisoned(aux))
 				goto patch_call_imm;
 
 			map_ptr = BPF_MAP_PTR(aux->map_state);
-			if (!map_ptr->ops->map_gen_lookup)
-				goto patch_call_imm;
+			ops = map_ptr->ops;
+			if (insn->imm == BPF_FUNC_map_lookup_elem &&
+			    ops->map_gen_lookup) {
+				cnt = ops->map_gen_lookup(map_ptr, insn_buf);
+				if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+					verbose(env, "bpf verifier is misconfigured\n");
+					return -EINVAL;
+				}
 
-			cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
-			if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
-				verbose(env, "bpf verifier is misconfigured\n");
-				return -EINVAL;
-			}
+				new_prog = bpf_patch_insn_data(env, i + delta,
+							       insn_buf, cnt);
+				if (!new_prog)
+					return -ENOMEM;
 
-			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf,
-						       cnt);
-			if (!new_prog)
-				return -ENOMEM;
+				delta    += cnt - 1;
+				env->prog = prog = new_prog;
+				insn      = new_prog->insnsi + i + delta;
+				continue;
+			}
 
-			delta += cnt - 1;
+			BUILD_BUG_ON(!__same_type(ops->map_lookup_elem,
+				     (void *(*)(struct bpf_map *map, void *key))NULL));
+			BUILD_BUG_ON(!__same_type(ops->map_delete_elem,
+				     (int (*)(struct bpf_map *map, void *key))NULL));
+			BUILD_BUG_ON(!__same_type(ops->map_update_elem,
+				     (int (*)(struct bpf_map *map, void *key, void *value,
+					      u64 flags))NULL));
+			switch (insn->imm) {
+			case BPF_FUNC_map_lookup_elem:
+				insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) -
+					    __bpf_call_base;
+				continue;
+			case BPF_FUNC_map_update_elem:
+				insn->imm = BPF_CAST_CALL(ops->map_update_elem) -
+					    __bpf_call_base;
+				continue;
+			case BPF_FUNC_map_delete_elem:
+				insn->imm = BPF_CAST_CALL(ops->map_delete_elem) -
+					    __bpf_call_base;
+				continue;
+			}
 
-			/* keep walking new program and skip insns we just inserted */
-			env->prog = prog = new_prog;
-			insn      = new_prog->insnsi + i + delta;
-			continue;
+			goto patch_call_imm;
 		}
 
 		if (insn->imm == BPF_FUNC_redirect_map) {
-- 
cgit v1.2.3-59-g8ed1b


From cb20b08ead401fd17627a36f035c0bf5bfee5567 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 2 Jun 2018 23:06:36 +0200
Subject: bpf: add bpf_skb_cgroup_id helper

Add a new bpf_skb_cgroup_id() helper that allows to retrieve the
cgroup id from the skb's socket. This is useful in particular to
enable bpf_get_cgroup_classid()-like behavior for cgroup v1 in
cgroup v2 by allowing ID based matching on egress. This can in
particular be used in combination with applying policy e.g. from
map lookups, and also complements the older bpf_skb_under_cgroup()
interface. In user space the cgroup id for a given path can be
retrieved through the f_handle as demonstrated in [0] recently.

  [0] https://lkml.org/lkml/2018/5/22/1190

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 19 ++++++++++++++++++-
 net/core/filter.c        | 29 +++++++++++++++++++++++++++--
 2 files changed, 45 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 64ac0f7a689e..661318141f72 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2054,6 +2054,22 @@ union bpf_attr {
  *
  *	Return
  *		0
+ *
+ * uint64_t bpf_skb_cgroup_id(struct sk_buff *skb)
+ * 	Description
+ * 		Return the cgroup v2 id of the socket associated with the *skb*.
+ * 		This is roughly similar to the **bpf_get_cgroup_classid**\ ()
+ * 		helper for cgroup v1 by providing a tag resp. identifier that
+ * 		can be matched on or used for map lookups e.g. to implement
+ * 		policy. The cgroup v2 id of a given path in the hierarchy is
+ * 		exposed in user space through the f_handle API in order to get
+ * 		to the same 64-bit id.
+ *
+ * 		This helper can be used on TC egress path, but not on ingress,
+ * 		and is available only if the kernel was compiled with the
+ * 		**CONFIG_SOCK_CGROUP_DATA** configuration option.
+ * 	Return
+ * 		The id is returned or 0 in case the id could not be retrieved.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2134,7 +2150,8 @@ union bpf_attr {
 	FN(lwt_seg6_adjust_srh),	\
 	FN(lwt_seg6_action),		\
 	FN(rc_repeat),			\
-	FN(rc_keydown),
+	FN(rc_keydown),			\
+	FN(skb_cgroup_id),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index 885fb0e2f264..edbfaa613b6d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3661,6 +3661,27 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+#ifdef CONFIG_SOCK_CGROUP_DATA
+BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
+{
+	struct sock *sk = skb_to_full_sk(skb);
+	struct cgroup *cgrp;
+
+	if (!sk || !sk_fullsock(sk))
+		return 0;
+
+	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	return cgrp->kn->id.id;
+}
+
+static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
+	.func           = bpf_skb_cgroup_id,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_CTX,
+};
+#endif
+
 static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
 				  unsigned long off, unsigned long len)
 {
@@ -4747,12 +4768,16 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_socket_cookie_proto;
 	case BPF_FUNC_get_socket_uid:
 		return &bpf_get_socket_uid_proto;
+	case BPF_FUNC_fib_lookup:
+		return &bpf_skb_fib_lookup_proto;
 #ifdef CONFIG_XFRM
 	case BPF_FUNC_skb_get_xfrm_state:
 		return &bpf_skb_get_xfrm_state_proto;
 #endif
-	case BPF_FUNC_fib_lookup:
-		return &bpf_skb_fib_lookup_proto;
+#ifdef CONFIG_SOCK_CGROUP_DATA
+	case BPF_FUNC_skb_cgroup_id:
+		return &bpf_skb_cgroup_id_proto;
+#endif
 	default:
 		return bpf_base_func_proto(func_id);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 1fbc2e0cfcf9677683aea2b1f9ea76fbdc6fb6d1 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 2 Jun 2018 23:06:37 +0200
Subject: bpf: make sure to clear unused fields in tunnel/xfrm state fetch

Since the remaining bits are not filled in struct bpf_tunnel_key
resp. struct bpf_xfrm_state and originate from uninitialized stack
space, we should make sure to clear them before handing control
back to the program.

Also add a padding element to struct bpf_xfrm_state for future use
similar as we have in struct bpf_tunnel_key and clear it as well.

  struct bpf_xfrm_state {
      __u32                      reqid;            /*     0     4 */
      __u32                      spi;              /*     4     4 */
      __u16                      family;           /*     8     2 */

      /* XXX 2 bytes hole, try to pack */

      union {
          __u32              remote_ipv4;          /*           4 */
          __u32              remote_ipv6[4];       /*          16 */
      };                                           /*    12    16 */

      /* size: 28, cachelines: 1, members: 4 */
      /* sum members: 26, holes: 1, sum holes: 2 */
      /* last cacheline: 28 bytes */
  };

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 3 ++-
 net/core/filter.c        | 6 ++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 661318141f72..f0b6608b1f1c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2268,7 +2268,7 @@ struct bpf_tunnel_key {
 	};
 	__u8 tunnel_tos;
 	__u8 tunnel_ttl;
-	__u16 tunnel_ext;
+	__u16 tunnel_ext;	/* Padding, future use. */
 	__u32 tunnel_label;
 };
 
@@ -2279,6 +2279,7 @@ struct bpf_xfrm_state {
 	__u32 reqid;
 	__u32 spi;	/* Stored in network byte order */
 	__u16 family;
+	__u16 ext;	/* Padding, future use. */
 	union {
 		__u32 remote_ipv4;	/* Stored in network byte order */
 		__u32 remote_ipv6[4];	/* Stored in network byte order */
diff --git a/net/core/filter.c b/net/core/filter.c
index edbfaa613b6d..28e864777c0f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3445,6 +3445,7 @@ set_compat:
 	to->tunnel_id = be64_to_cpu(info->key.tun_id);
 	to->tunnel_tos = info->key.tos;
 	to->tunnel_ttl = info->key.ttl;
+	to->tunnel_ext = 0;
 
 	if (flags & BPF_F_TUNINFO_IPV6) {
 		memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
@@ -3452,6 +3453,8 @@ set_compat:
 		to->tunnel_label = be32_to_cpu(info->key.label);
 	} else {
 		to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
+		memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
+		to->tunnel_label = 0;
 	}
 
 	if (unlikely(size != sizeof(struct bpf_tunnel_key)))
@@ -4047,11 +4050,14 @@ BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index,
 	to->reqid = x->props.reqid;
 	to->spi = x->id.spi;
 	to->family = x->props.family;
+	to->ext = 0;
+
 	if (to->family == AF_INET6) {
 		memcpy(to->remote_ipv6, x->props.saddr.a6,
 		       sizeof(to->remote_ipv6));
 	} else {
 		to->remote_ipv4 = x->props.saddr.a4;
+		memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
 	}
 
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From b3bbba3570a7ba064f12c33f484eb9757a2b359e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 2 Jun 2018 23:06:38 +0200
Subject: bpf: fix cbpf parser bug for octal numbers

Range is 0-7, not 0-9, otherwise parser silently excludes it from the
strtol() rather than throwing an error.

Reported-by: Marc Boschma <marc@boschma.cx>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/bpf/bpf_exp.l | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/bpf/bpf_exp.l b/tools/bpf/bpf_exp.l
index bd83149e7be0..4da8d053d68f 100644
--- a/tools/bpf/bpf_exp.l
+++ b/tools/bpf/bpf_exp.l
@@ -175,7 +175,7 @@ extern void yyerror(const char *str);
 			yylval.number = strtol(yytext, NULL, 10);
 			return number;
 		}
-([0][0-9]+)	{
+([0][0-7]+)	{
 			yylval.number = strtol(yytext + 1, NULL, 8);
 			return number;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From bc23105ca0abdeed98366af01c700c2c3aff5cd5 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 2 Jun 2018 23:06:39 +0200
Subject: bpf: fix context access in tracing progs on 32 bit archs

Wang reported that all the testcases for BPF_PROG_TYPE_PERF_EVENT
program type in test_verifier report the following errors on x86_32:

  172/p unpriv: spill/fill of different pointers ldx FAIL
  Unexpected error message!
  0: (bf) r6 = r10
  1: (07) r6 += -8
  2: (15) if r1 == 0x0 goto pc+3
  R1=ctx(id=0,off=0,imm=0) R6=fp-8,call_-1 R10=fp0,call_-1
  3: (bf) r2 = r10
  4: (07) r2 += -76
  5: (7b) *(u64 *)(r6 +0) = r2
  6: (55) if r1 != 0x0 goto pc+1
  R1=ctx(id=0,off=0,imm=0) R2=fp-76,call_-1 R6=fp-8,call_-1 R10=fp0,call_-1 fp-8=fp
  7: (7b) *(u64 *)(r6 +0) = r1
  8: (79) r1 = *(u64 *)(r6 +0)
  9: (79) r1 = *(u64 *)(r1 +68)
  invalid bpf_context access off=68 size=8

  378/p check bpf_perf_event_data->sample_period byte load permitted FAIL
  Failed to load prog 'Permission denied'!
  0: (b7) r0 = 0
  1: (71) r0 = *(u8 *)(r1 +68)
  invalid bpf_context access off=68 size=1

  379/p check bpf_perf_event_data->sample_period half load permitted FAIL
  Failed to load prog 'Permission denied'!
  0: (b7) r0 = 0
  1: (69) r0 = *(u16 *)(r1 +68)
  invalid bpf_context access off=68 size=2

  380/p check bpf_perf_event_data->sample_period word load permitted FAIL
  Failed to load prog 'Permission denied'!
  0: (b7) r0 = 0
  1: (61) r0 = *(u32 *)(r1 +68)
  invalid bpf_context access off=68 size=4

  381/p check bpf_perf_event_data->sample_period dword load permitted FAIL
  Failed to load prog 'Permission denied'!
  0: (b7) r0 = 0
  1: (79) r0 = *(u64 *)(r1 +68)
  invalid bpf_context access off=68 size=8

Reason is that struct pt_regs on x86_32 doesn't fully align to 8 byte
boundary due to its size of 68 bytes. Therefore, bpf_ctx_narrow_access_ok()
will then bail out saying that off & (size_default - 1) which is 68 & 7
doesn't cleanly align in the case of sample_period access from struct
bpf_perf_event_data, hence verifier wrongly thinks we might be doing an
unaligned access here though underlying arch can handle it just fine.
Therefore adjust this down to machine size and check and rewrite the
offset for narrow access on that basis. We also need to fix corresponding
pe_prog_is_valid_access(), since we hit the check for off % size != 0
(e.g. 68 % 8 -> 4) in the first and last test. With that in place, progs
for tracing work on x86_32.

Reported-by: Wang YanQing <udknight@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Tested-by: Wang YanQing <udknight@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h   | 30 ++++++++++++++++++++++++------
 kernel/bpf/verifier.c    |  3 ++-
 kernel/trace/bpf_trace.c | 10 ++++++++--
 3 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 8e60f1e9a702..45fc0f5000d8 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -639,16 +639,34 @@ static inline bool bpf_prog_was_classic(const struct bpf_prog *prog)
 	return prog->type == BPF_PROG_TYPE_UNSPEC;
 }
 
-static inline bool
-bpf_ctx_narrow_access_ok(u32 off, u32 size, const u32 size_default)
+static inline u32 bpf_ctx_off_adjust_machine(u32 size)
+{
+	const u32 size_machine = sizeof(unsigned long);
+
+	if (size > size_machine && size % size_machine == 0)
+		size = size_machine;
+
+	return size;
+}
+
+static inline bool bpf_ctx_narrow_align_ok(u32 off, u32 size_access,
+					   u32 size_default)
 {
-	bool off_ok;
+	size_default = bpf_ctx_off_adjust_machine(size_default);
+	size_access  = bpf_ctx_off_adjust_machine(size_access);
+
 #ifdef __LITTLE_ENDIAN
-	off_ok = (off & (size_default - 1)) == 0;
+	return (off & (size_default - 1)) == 0;
 #else
-	off_ok = (off & (size_default - 1)) + size == size_default;
+	return (off & (size_default - 1)) + size_access == size_default;
 #endif
-	return off_ok && size <= size_default && (size & (size - 1)) == 0;
+}
+
+static inline bool
+bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
+{
+	return bpf_ctx_narrow_align_ok(off, size, size_default) &&
+	       size <= size_default && (size & (size - 1)) == 0;
 }
 
 #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0]))
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1dd6d5a7aa5b..d6403b5166f4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5349,6 +5349,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		 */
 		is_narrower_load = size < ctx_field_size;
 		if (is_narrower_load) {
+			u32 size_default = bpf_ctx_off_adjust_machine(ctx_field_size);
 			u32 off = insn->off;
 			u8 size_code;
 
@@ -5363,7 +5364,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 			else if (ctx_field_size == 8)
 				size_code = BPF_DW;
 
-			insn->off = off & ~(ctx_field_size - 1);
+			insn->off = off & ~(size_default - 1);
 			insn->code = BPF_LDX | BPF_MEM | size_code;
 		}
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index af1486d9a0ed..752992ce3513 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -880,8 +880,14 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type
 		return false;
 	if (type != BPF_READ)
 		return false;
-	if (off % size != 0)
-		return false;
+	if (off % size != 0) {
+		if (sizeof(unsigned long) != 4)
+			return false;
+		if (size != 8)
+			return false;
+		if (off % size != 4)
+			return false;
+	}
 
 	switch (off) {
 	case bpf_ctx_range(struct bpf_perf_event_data, sample_period):
-- 
cgit v1.2.3-59-g8ed1b


From 6b6a192595a9f04b2196620ee7b6027e567789ac Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 2 Jun 2018 23:06:40 +0200
Subject: bpf: sync bpf uapi header with tools

Pull in recent changes from include/uapi/linux/bpf.h.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/include/uapi/linux/bpf.h | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 64ac0f7a689e..f0b6608b1f1c 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2054,6 +2054,22 @@ union bpf_attr {
  *
  *	Return
  *		0
+ *
+ * uint64_t bpf_skb_cgroup_id(struct sk_buff *skb)
+ * 	Description
+ * 		Return the cgroup v2 id of the socket associated with the *skb*.
+ * 		This is roughly similar to the **bpf_get_cgroup_classid**\ ()
+ * 		helper for cgroup v1 by providing a tag resp. identifier that
+ * 		can be matched on or used for map lookups e.g. to implement
+ * 		policy. The cgroup v2 id of a given path in the hierarchy is
+ * 		exposed in user space through the f_handle API in order to get
+ * 		to the same 64-bit id.
+ *
+ * 		This helper can be used on TC egress path, but not on ingress,
+ * 		and is available only if the kernel was compiled with the
+ * 		**CONFIG_SOCK_CGROUP_DATA** configuration option.
+ * 	Return
+ * 		The id is returned or 0 in case the id could not be retrieved.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2134,7 +2150,8 @@ union bpf_attr {
 	FN(lwt_seg6_adjust_srh),	\
 	FN(lwt_seg6_action),		\
 	FN(rc_repeat),			\
-	FN(rc_keydown),
+	FN(rc_keydown),			\
+	FN(skb_cgroup_id),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2251,7 +2268,7 @@ struct bpf_tunnel_key {
 	};
 	__u8 tunnel_tos;
 	__u8 tunnel_ttl;
-	__u16 tunnel_ext;
+	__u16 tunnel_ext;	/* Padding, future use. */
 	__u32 tunnel_label;
 };
 
@@ -2262,6 +2279,7 @@ struct bpf_xfrm_state {
 	__u32 reqid;
 	__u32 spi;	/* Stored in network byte order */
 	__u16 family;
+	__u16 ext;	/* Padding, future use. */
 	union {
 		__u32 remote_ipv4;	/* Stored in network byte order */
 		__u32 remote_ipv6[4];	/* Stored in network byte order */
-- 
cgit v1.2.3-59-g8ed1b


From 10a76564ae865cbf30ed30e8cbdc1a047e0559ae Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 2 Jun 2018 23:06:41 +0200
Subject: bpf, doc: add missing patchwork url and libbpf to maintainers

Add missing bits under tools/lib/bpf/ and also Q: entry in order to
make it easier for people to retrieve current patch queue.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 MAINTAINERS | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index f492431b239b..2fd51db09fa4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2722,6 +2722,7 @@ L:	netdev@vger.kernel.org
 L:	linux-kernel@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git
+Q:	https://patchwork.ozlabs.org/project/netdev/list/?delegate=77147
 S:	Supported
 F:	arch/x86/net/bpf_jit*
 F:	Documentation/networking/filter.txt
@@ -2740,6 +2741,7 @@ F:	net/sched/act_bpf.c
 F:	net/sched/cls_bpf.c
 F:	samples/bpf/
 F:	tools/bpf/
+F:	tools/lib/bpf/
 F:	tools/testing/selftests/bpf/
 
 BROADCOM B44 10/100 ETHERNET DRIVER
-- 
cgit v1.2.3-59-g8ed1b


From deba328ce9b04085c6a214f7dbf5aa8f4f9041a1 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Thu, 31 May 2018 19:51:15 +0800
Subject: net: axienet: remove stale comment of axienet_open

axienet_open no longer return -ENODEV when PHY cannot be connected to
since commit d7cc3163e026 ("net: axienet: Support phy-less mode of operation")

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
index e74e1e897864..f24f48f33802 100644
--- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
@@ -900,7 +900,6 @@ static void axienet_dma_err_handler(unsigned long data);
  * @ndev:	Pointer to net_device structure
  *
  * Return: 0, on success.
- *	    -ENODEV, if PHY cannot be connected to
  *	    non-zero error value on failure
  *
  * This is the driver open routine. It calls phy_start to start the PHY device.
-- 
cgit v1.2.3-59-g8ed1b


From 8051ac7643e8d0f1dd7272042d66c0518e718347 Mon Sep 17 00:00:00 2001
From: Thadeu Lima de Souza Cascardo <cascardo@canonical.com>
Date: Thu, 31 May 2018 09:20:20 -0300
Subject: vlan: use non-archaic spelling of failes

Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 78a5a90b4267..83ea4df6ab81 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -331,7 +331,7 @@ static inline bool vlan_hw_offload_capable(netdev_features_t features,
  * @mac_len: MAC header length including outer vlan headers
  *
  * Inserts the VLAN tag into @skb as part of the payload at offset mac_len
- * Returns error if skb_cow_head failes.
+ * Returns error if skb_cow_head fails.
  *
  * Does not change skb->protocol so this function can be used during receive.
  */
@@ -379,7 +379,7 @@ static inline int __vlan_insert_inner_tag(struct sk_buff *skb,
  * @vlan_tci: VLAN TCI to insert
  *
  * Inserts the VLAN tag into @skb as part of the payload
- * Returns error if skb_cow_head failes.
+ * Returns error if skb_cow_head fails.
  *
  * Does not change skb->protocol so this function can be used during receive.
  */
-- 
cgit v1.2.3-59-g8ed1b


From 885892fb378dc096693557ba4f2b875188619b36 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 31 May 2018 05:52:24 -0700
Subject: mlx4_core: restore optimal ICM memory allocation

Commit 1383cb8103bb ("mlx4_core: allocate ICM memory in page size chunks")
brought two regressions caught in our regression suite.

The big one is an additional cost of 256 bytes of overhead per 4096 bytes,
or 6.25 % which is unacceptable since ICM can be pretty large.

This comes from having to allocate one struct mlx4_icm_chunk (256 bytes)
per MLX4_TABLE_CHUNK, which the buggy commit shrank to 4KB
(instead of prior 256KB)

Note that mlx4_alloc_icm() is already able to try high order allocations
and fallback to low-order allocations under high memory pressure.

Most of these allocations happen right after boot time, when we get
plenty of non fragmented memory, there is really no point being so
pessimistic and break huge pages into order-0 ones just for fun.

We only have to tweak gfp_mask a bit, to help falling back faster,
without risking OOM killings.

Second regression is an KASAN fault, that will need further investigations.

Fixes: 1383cb8103bb ("mlx4_core: allocate ICM memory in page size chunks")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Tariq Toukan <tariqt@mellanox.com>
Cc: John Sperbeck <jsperbeck@google.com>
Cc: Tarick Bedeir <tarick@google.com>
Cc: Qing Huang <qing.huang@oracle.com>
Cc: Daniel Jurgens <danielj@mellanox.com>
Cc: Zhu Yanjun <yanjun.zhu@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/icm.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c
index 685337d58276..5342bd8a3d0b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/icm.c
+++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
@@ -43,12 +43,13 @@
 #include "fw.h"
 
 /*
- * We allocate in page size (default 4KB on many archs) chunks to avoid high
- * order memory allocations in fragmented/high usage memory situation.
+ * We allocate in as big chunks as we can, up to a maximum of 256 KB
+ * per chunk. Note that the chunks are not necessarily in contiguous
+ * physical memory.
  */
 enum {
-	MLX4_ICM_ALLOC_SIZE	= PAGE_SIZE,
-	MLX4_TABLE_CHUNK_SIZE	= PAGE_SIZE,
+	MLX4_ICM_ALLOC_SIZE	= 1 << 18,
+	MLX4_TABLE_CHUNK_SIZE	= 1 << 18,
 };
 
 static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk)
@@ -135,6 +136,7 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages,
 	struct mlx4_icm *icm;
 	struct mlx4_icm_chunk *chunk = NULL;
 	int cur_order;
+	gfp_t mask;
 	int ret;
 
 	/* We use sg_set_buf for coherent allocs, which assumes low memory */
@@ -178,13 +180,17 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages,
 		while (1 << cur_order > npages)
 			--cur_order;
 
+		mask = gfp_mask;
+		if (cur_order)
+			mask &= ~__GFP_DIRECT_RECLAIM;
+
 		if (coherent)
 			ret = mlx4_alloc_icm_coherent(&dev->persist->pdev->dev,
 						      &chunk->mem[chunk->npages],
-						      cur_order, gfp_mask);
+						      cur_order, mask);
 		else
 			ret = mlx4_alloc_icm_pages(&chunk->mem[chunk->npages],
-						   cur_order, gfp_mask,
+						   cur_order, mask,
 						   dev->numa_node);
 
 		if (ret) {
-- 
cgit v1.2.3-59-g8ed1b


From 42b33468987bac0dd95c30f14820c7abac04a153 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 31 May 2018 10:59:47 +0200
Subject: xdp: add flags argument to ndo_xdp_xmit API

This patch only change the API and reject any use of flags. This is an
intermediate step that allows us to implement the flush flag operation
later, for each individual driver in a separate patch.

The plan is to implement flush operation via XDP_XMIT_FLUSH flag
and then remove XDP_XMIT_FLAGS_NONE when done.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   | 6 +++++-
 drivers/net/ethernet/intel/i40e/i40e_txrx.h   | 3 ++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 5 ++++-
 drivers/net/tun.c                             | 8 ++++++--
 drivers/net/virtio_net.c                      | 5 ++++-
 include/linux/netdevice.h                     | 7 ++++---
 include/net/xdp.h                             | 5 +++++
 kernel/bpf/devmap.c                           | 2 +-
 net/core/filter.c                             | 2 +-
 9 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 9b698c5acd05..c0451d6e0790 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -3670,7 +3670,8 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
  * For error cases, a negative errno code is returned and no-frames
  * are transmitted (caller must handle freeing frames).
  **/
-int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
+int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
+		  u32 flags)
 {
 	struct i40e_netdev_priv *np = netdev_priv(dev);
 	unsigned int queue_index = smp_processor_id();
@@ -3684,6 +3685,9 @@ int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
 	if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
 		return -ENXIO;
 
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+		return -EINVAL;
+
 	for (i = 0; i < n; i++) {
 		struct xdp_frame *xdpf = frames[i];
 		int err;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index eb8804b3d7b6..820f76db251b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -487,7 +487,8 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw);
 void i40e_detect_recover_hung(struct i40e_vsi *vsi);
 int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
 bool __i40e_chk_linearize(struct sk_buff *skb);
-int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames);
+int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
+		  u32 flags);
 void i40e_xdp_flush(struct net_device *dev);
 
 /**
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 031d65c4178d..87f088f4af52 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -10023,7 +10023,7 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 }
 
 static int ixgbe_xdp_xmit(struct net_device *dev, int n,
-			  struct xdp_frame **frames)
+			  struct xdp_frame **frames, u32 flags)
 {
 	struct ixgbe_adapter *adapter = netdev_priv(dev);
 	struct ixgbe_ring *ring;
@@ -10033,6 +10033,9 @@ static int ixgbe_xdp_xmit(struct net_device *dev, int n,
 	if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
 		return -ENETDOWN;
 
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+		return -EINVAL;
+
 	/* During program transitions its possible adapter->xdp_prog is assigned
 	 * but ring has not been configured yet. In this case simply abort xmit.
 	 */
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 2265d2ccea47..b182b8cdd219 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1285,7 +1285,8 @@ static const struct net_device_ops tun_netdev_ops = {
 	.ndo_get_stats64	= tun_net_get_stats64,
 };
 
-static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
+static int tun_xdp_xmit(struct net_device *dev, int n,
+			struct xdp_frame **frames, u32 flags)
 {
 	struct tun_struct *tun = netdev_priv(dev);
 	struct tun_file *tfile;
@@ -1294,6 +1295,9 @@ static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames
 	int cnt = n;
 	int i;
 
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+		return -EINVAL;
+
 	rcu_read_lock();
 
 	numqueues = READ_ONCE(tun->numqueues);
@@ -1332,7 +1336,7 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
 	if (unlikely(!frame))
 		return -EOVERFLOW;
 
-	return tun_xdp_xmit(dev, 1, &frame);
+	return tun_xdp_xmit(dev, 1, &frame, 0);
 }
 
 static void tun_xdp_flush(struct net_device *dev)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index b2647dd5d302..4ed823625953 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -468,7 +468,7 @@ static int __virtnet_xdp_tx_xmit(struct virtnet_info *vi,
 }
 
 static int virtnet_xdp_xmit(struct net_device *dev,
-			    int n, struct xdp_frame **frames)
+			    int n, struct xdp_frame **frames, u32 flags)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 	struct receive_queue *rq = vi->rq;
@@ -481,6 +481,9 @@ static int virtnet_xdp_xmit(struct net_device *dev,
 	int err;
 	int i;
 
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+		return -EINVAL;
+
 	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
 	sq = &vi->sq[qp];
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8452f72087ef..7f17785a59d7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1185,13 +1185,13 @@ struct dev_ifalias {
  *	This function is used to set or query state related to XDP on the
  *	netdevice and manage BPF offload. See definition of
  *	enum bpf_netdev_command for details.
- * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp);
+ * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp,
+ *			u32 flags);
  *	This function is used to submit @n XDP packets for transmit on a
  *	netdevice. Returns number of frames successfully transmitted, frames
  *	that got dropped are freed/returned via xdp_return_frame().
  *	Returns negative number, means general error invoking ndo, meaning
  *	no frames were xmit'ed and core-caller will free all frames.
- *	TODO: Consider add flag to allow sending flush operation.
  * void (*ndo_xdp_flush)(struct net_device *dev);
  *	This function is used to inform the driver to flush a particular
  *	xdp tx queue. Must be called on same CPU as xdp_xmit.
@@ -1380,7 +1380,8 @@ struct net_device_ops {
 	int			(*ndo_bpf)(struct net_device *dev,
 					   struct netdev_bpf *bpf);
 	int			(*ndo_xdp_xmit)(struct net_device *dev, int n,
-						struct xdp_frame **xdp);
+						struct xdp_frame **xdp,
+						u32 flags);
 	void			(*ndo_xdp_flush)(struct net_device *dev);
 };
 
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 7ad779237ae8..0c45f0f943ed 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -40,6 +40,11 @@ enum xdp_mem_type {
 	MEM_TYPE_MAX,
 };
 
+/* XDP flags for ndo_xdp_xmit */
+#define XDP_XMIT_FLAGS_NONE	0U
+#define XDP_XMIT_FLUSH		(1U << 0)	/* doorbell signal consumer */
+#define XDP_XMIT_FLAGS_MASK	XDP_XMIT_FLUSH
+
 struct xdp_mem_info {
 	u32 type; /* enum xdp_mem_type, but known size type */
 	u32 id;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 1fe3fe60508a..037e234056f7 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -232,7 +232,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 		prefetch(xdpf);
 	}
 
-	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q);
+	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, 0);
 	if (sent < 0) {
 		err = sent;
 		sent = 0;
diff --git a/net/core/filter.c b/net/core/filter.c
index 28e864777c0f..56e40dafdde7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3056,7 +3056,7 @@ static int __bpf_tx_xdp(struct net_device *dev,
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
-	sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf);
+	sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, 0);
 	if (sent <= 0)
 		return sent;
 	dev->netdev_ops->ndo_xdp_flush(dev);
-- 
cgit v1.2.3-59-g8ed1b


From cdb57ed07fafb79a250e62d714b8910f2d341ef2 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 31 May 2018 10:59:52 +0200
Subject: i40e: implement flush flag for ndo_xdp_xmit

When passed the XDP_XMIT_FLUSH flag i40e_xdp_xmit now performs the
same kind of ring tail update as in i40e_xdp_flush.  The advantage is
that all the necessary checks have been performed and xdp_ring can be
updated, instead of having to perform the exact same steps/checks in
i40e_xdp_flush

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index c0451d6e0790..5f01e4ce9c92 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -3676,6 +3676,7 @@ int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
 	struct i40e_netdev_priv *np = netdev_priv(dev);
 	unsigned int queue_index = smp_processor_id();
 	struct i40e_vsi *vsi = np->vsi;
+	struct i40e_ring *xdp_ring;
 	int drops = 0;
 	int i;
 
@@ -3685,20 +3686,25 @@ int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
 	if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
 		return -ENXIO;
 
-	if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
 		return -EINVAL;
 
+	xdp_ring = vsi->xdp_rings[queue_index];
+
 	for (i = 0; i < n; i++) {
 		struct xdp_frame *xdpf = frames[i];
 		int err;
 
-		err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
+		err = i40e_xmit_xdp_ring(xdpf, xdp_ring);
 		if (err != I40E_XDP_TX) {
 			xdp_return_frame_rx_napi(xdpf);
 			drops++;
 		}
 	}
 
+	if (unlikely(flags & XDP_XMIT_FLUSH))
+		i40e_xdp_ring_update_tail(xdp_ring);
+
 	return n - drops;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 5e2e609522d67750d4b3c0321d1917bb109ac41b Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 31 May 2018 10:59:57 +0200
Subject: ixgbe: implement flush flag for ndo_xdp_xmit

When passed the XDP_XMIT_FLUSH flag ixgbe_xdp_xmit now performs the
same kind of ring tail update as in ixgbe_xdp_flush.  The update tail
code in ixgbe_xdp_flush is generalized and shared with ixgbe_xdp_xmit.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 87f088f4af52..4fd77c9067f2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -10022,6 +10022,15 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 	}
 }
 
+static void ixgbe_xdp_ring_update_tail(struct ixgbe_ring *ring)
+{
+	/* Force memory writes to complete before letting h/w know there
+	 * are new descriptors to fetch.
+	 */
+	wmb();
+	writel(ring->next_to_use, ring->tail);
+}
+
 static int ixgbe_xdp_xmit(struct net_device *dev, int n,
 			  struct xdp_frame **frames, u32 flags)
 {
@@ -10033,7 +10042,7 @@ static int ixgbe_xdp_xmit(struct net_device *dev, int n,
 	if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
 		return -ENETDOWN;
 
-	if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
 		return -EINVAL;
 
 	/* During program transitions its possible adapter->xdp_prog is assigned
@@ -10054,6 +10063,9 @@ static int ixgbe_xdp_xmit(struct net_device *dev, int n,
 		}
 	}
 
+	if (unlikely(flags & XDP_XMIT_FLUSH))
+		ixgbe_xdp_ring_update_tail(ring);
+
 	return n - drops;
 }
 
@@ -10072,11 +10084,7 @@ static void ixgbe_xdp_flush(struct net_device *dev)
 	if (unlikely(!ring))
 		return;
 
-	/* Force memory writes to complete before letting h/w know there
-	 * are new descriptors to fetch.
-	 */
-	wmb();
-	writel(ring->next_to_use, ring->tail);
+	ixgbe_xdp_ring_update_tail(ring);
 
 	return;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 0c9d917b7d74b758af1b4e37996af48d95b2ef33 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 31 May 2018 11:00:03 +0200
Subject: tun: implement flush flag for ndo_xdp_xmit

When passed the XDP_XMIT_FLUSH flag tun_xdp_xmit now performs the same
kind of socket wake up as in tun_xdp_flush(). The wake up code from
tun_xdp_flush is generalized and shared with tun_xdp_xmit.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/tun.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index b182b8cdd219..d82a05fb0594 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1285,6 +1285,14 @@ static const struct net_device_ops tun_netdev_ops = {
 	.ndo_get_stats64	= tun_net_get_stats64,
 };
 
+static void __tun_xdp_flush_tfile(struct tun_file *tfile)
+{
+	/* Notify and wake up reader process */
+	if (tfile->flags & TUN_FASYNC)
+		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
+	tfile->socket.sk->sk_data_ready(tfile->socket.sk);
+}
+
 static int tun_xdp_xmit(struct net_device *dev, int n,
 			struct xdp_frame **frames, u32 flags)
 {
@@ -1295,7 +1303,7 @@ static int tun_xdp_xmit(struct net_device *dev, int n,
 	int cnt = n;
 	int i;
 
-	if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
 		return -EINVAL;
 
 	rcu_read_lock();
@@ -1325,6 +1333,9 @@ static int tun_xdp_xmit(struct net_device *dev, int n,
 	}
 	spin_unlock(&tfile->tx_ring.producer_lock);
 
+	if (flags & XDP_XMIT_FLUSH)
+		__tun_xdp_flush_tfile(tfile);
+
 	rcu_read_unlock();
 	return cnt - drops;
 }
@@ -1353,11 +1364,7 @@ static void tun_xdp_flush(struct net_device *dev)
 
 	tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
 					    numqueues]);
-	/* Notify and wake up reader process */
-	if (tfile->flags & TUN_FASYNC)
-		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
-	tfile->socket.sk->sk_data_ready(tfile->socket.sk);
-
+	__tun_xdp_flush_tfile(tfile);
 out:
 	rcu_read_unlock();
 }
-- 
cgit v1.2.3-59-g8ed1b


From 5d274cb4bbae481161737a1f652e276883d3d970 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 31 May 2018 11:00:08 +0200
Subject: virtio_net: implement flush flag for ndo_xdp_xmit

When passed the XDP_XMIT_FLUSH flag virtnet_xdp_xmit now performs the
same virtqueue_kick as virtnet_xdp_flush.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 drivers/net/virtio_net.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 4ed823625953..62ba8aadd8e6 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -481,7 +481,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
 	int err;
 	int i;
 
-	if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
 		return -EINVAL;
 
 	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
@@ -507,6 +507,10 @@ static int virtnet_xdp_xmit(struct net_device *dev,
 			drops++;
 		}
 	}
+
+	if (flags & XDP_XMIT_FLUSH)
+		virtqueue_kick(sq->vq);
+
 	return n - drops;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 73de5717eb30ceedef6abf9da4bc2194f25d92ca Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 31 May 2018 11:00:13 +0200
Subject: xdp: done implementing ndo_xdp_xmit flush flag for all drivers

Removing XDP_XMIT_FLAGS_NONE as all driver now implement
a flush operation in their ndo_xdp_xmit call.  The compiler
will catch if any users of XDP_XMIT_FLAGS_NONE remains.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/xdp.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 0c45f0f943ed..a3b71a4dd71d 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -41,7 +41,6 @@ enum xdp_mem_type {
 };
 
 /* XDP flags for ndo_xdp_xmit */
-#define XDP_XMIT_FLAGS_NONE	0U
 #define XDP_XMIT_FLUSH		(1U << 0)	/* doorbell signal consumer */
 #define XDP_XMIT_FLAGS_MASK	XDP_XMIT_FLUSH
 
-- 
cgit v1.2.3-59-g8ed1b


From 1e67575a5840908e33502b210a22509fe5d6ca53 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 31 May 2018 11:00:18 +0200
Subject: bpf/xdp: non-map redirect can avoid calling ndo_xdp_flush

This is the first real user of the XDP_XMIT_FLUSH flag.

As pointed out many times, XDP_REDIRECT without using BPF maps is
significant slower than the map variant.  This is primary due to the
lack of bulking, as the ndo_xdp_flush operation is required after each
frame (to avoid frames hanging on the egress device).

It is still possible to optimize this case.  Instead of invoking two
NDO indirect calls, which are very expensive with CONFIG_RETPOLINE,
instead instruct ndo_xdp_xmit to flush via XDP_XMIT_FLUSH flag.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 56e40dafdde7..a72ea9f61010 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3056,10 +3056,9 @@ static int __bpf_tx_xdp(struct net_device *dev,
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
-	sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, 0);
+	sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, XDP_XMIT_FLUSH);
 	if (sent <= 0)
 		return sent;
-	dev->netdev_ops->ndo_xdp_flush(dev);
 	return 0;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From c1ece6b245bd12a57124da78abafbf8a511394d6 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 31 May 2018 11:00:23 +0200
Subject: bpf/xdp: devmap can avoid calling ndo_xdp_flush

The XDP_REDIRECT map devmap can avoid using ndo_xdp_flush, by instead
instructing ndo_xdp_xmit to flush via XDP_XMIT_FLUSH flag in
appropriate places.

Notice after this patch it is possible to remove ndo_xdp_flush
completely, as this is the last user of ndo_xdp_flush. This is left
for later patches, to keep driver changes separate.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/devmap.c | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 037e234056f7..a7cc7b3494a9 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -217,7 +217,7 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
 }
 
 static int bq_xmit_all(struct bpf_dtab_netdev *obj,
-			 struct xdp_bulk_queue *bq)
+		       struct xdp_bulk_queue *bq, u32 flags)
 {
 	struct net_device *dev = obj->dev;
 	int sent = 0, drops = 0, err = 0;
@@ -232,7 +232,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
 		prefetch(xdpf);
 	}
 
-	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, 0);
+	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
 	if (sent < 0) {
 		err = sent;
 		sent = 0;
@@ -276,7 +276,6 @@ void __dev_map_flush(struct bpf_map *map)
 	for_each_set_bit(bit, bitmap, map->max_entries) {
 		struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
 		struct xdp_bulk_queue *bq;
-		struct net_device *netdev;
 
 		/* This is possible if the dev entry is removed by user space
 		 * between xdp redirect and flush op.
@@ -287,10 +286,7 @@ void __dev_map_flush(struct bpf_map *map)
 		__clear_bit(bit, bitmap);
 
 		bq = this_cpu_ptr(dev->bulkq);
-		bq_xmit_all(dev, bq);
-		netdev = dev->dev;
-		if (likely(netdev->netdev_ops->ndo_xdp_flush))
-			netdev->netdev_ops->ndo_xdp_flush(netdev);
+		bq_xmit_all(dev, bq, XDP_XMIT_FLUSH);
 	}
 }
 
@@ -320,7 +316,7 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
 	struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
 
 	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
-		bq_xmit_all(obj, bq);
+		bq_xmit_all(obj, bq, 0);
 
 	/* Ingress dev_rx will be the same for all xdp_frame's in
 	 * bulk_queue, because bq stored per-CPU and must be flushed
@@ -359,8 +355,7 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
 
 static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
 {
-	if (dev->dev->netdev_ops->ndo_xdp_flush) {
-		struct net_device *fl = dev->dev;
+	if (dev->dev->netdev_ops->ndo_xdp_xmit) {
 		struct xdp_bulk_queue *bq;
 		unsigned long *bitmap;
 
@@ -371,9 +366,7 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
 			__clear_bit(dev->bit, bitmap);
 
 			bq = per_cpu_ptr(dev->bulkq, cpu);
-			bq_xmit_all(dev, bq);
-
-			fl->netdev_ops->ndo_xdp_flush(dev->dev);
+			bq_xmit_all(dev, bq, XDP_XMIT_FLUSH);
 		}
 	}
 }
-- 
cgit v1.2.3-59-g8ed1b


From bf6fa2c893c5237b48569a13fa3c673041430b6c Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sun, 3 Jun 2018 15:59:41 -0700
Subject: bpf: implement bpf_get_current_cgroup_id() helper

bpf has been used extensively for tracing. For example, bcc
contains an almost full set of bpf-based tools to trace kernel
and user functions/events. Most tracing tools are currently
either filtered based on pid or system-wide.

Containers have been used quite extensively in industry and
cgroup is often used together to provide resource isolation
and protection. Several processes may run inside the same
container. It is often desirable to get container-level tracing
results as well, e.g. syscall count, function count, I/O
activity, etc.

This patch implements a new helper, bpf_get_current_cgroup_id(),
which will return cgroup id based on the cgroup within which
the current task is running.

The later patch will provide an example to show that
userspace can get the same cgroup id so it could
configure a filter or policy in the bpf program based on
task cgroup id.

The helper is currently implemented for tracing. It can
be added to other program types as well when needed.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h      |  1 +
 include/uapi/linux/bpf.h |  8 +++++++-
 kernel/bpf/core.c        |  1 +
 kernel/bpf/helpers.c     | 15 +++++++++++++++
 kernel/trace/bpf_trace.c |  2 ++
 5 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bbe297436e5d..995c3b1e59bf 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -746,6 +746,7 @@ extern const struct bpf_func_proto bpf_get_stackid_proto;
 extern const struct bpf_func_proto bpf_get_stack_proto;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;
 extern const struct bpf_func_proto bpf_sock_hash_update_proto;
+extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
 
 /* Shared helpers among cBPF and eBPF. */
 void bpf_user_rnd_init_once(void);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f0b6608b1f1c..18712b0dbfe7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2070,6 +2070,11 @@ union bpf_attr {
  * 		**CONFIG_SOCK_CGROUP_DATA** configuration option.
  * 	Return
  * 		The id is returned or 0 in case the id could not be retrieved.
+ *
+ * u64 bpf_get_current_cgroup_id(void)
+ * 	Return
+ * 		A 64-bit integer containing the current cgroup id based
+ * 		on the cgroup within which the current task is running.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2151,7 +2156,8 @@ union bpf_attr {
 	FN(lwt_seg6_action),		\
 	FN(rc_repeat),			\
 	FN(rc_keydown),			\
-	FN(skb_cgroup_id),
+	FN(skb_cgroup_id),		\
+	FN(get_current_cgroup_id),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 527587de8a67..9f1493705f40 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1765,6 +1765,7 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
 const struct bpf_func_proto bpf_get_current_comm_proto __weak;
 const struct bpf_func_proto bpf_sock_map_update_proto __weak;
 const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
+const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
 
 const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
 {
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 3d24e238221e..73065e2d23c2 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -179,3 +179,18 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
 	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
 	.arg2_type	= ARG_CONST_SIZE,
 };
+
+#ifdef CONFIG_CGROUPS
+BPF_CALL_0(bpf_get_current_cgroup_id)
+{
+	struct cgroup *cgrp = task_dfl_cgroup(current);
+
+	return cgrp->kn->id.id;
+}
+
+const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
+	.func		= bpf_get_current_cgroup_id,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+};
+#endif
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 752992ce3513..e2ab5b7f29d2 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -564,6 +564,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_prandom_u32_proto;
 	case BPF_FUNC_probe_read_str:
 		return &bpf_probe_read_str_proto;
+	case BPF_FUNC_get_current_cgroup_id:
+		return &bpf_get_current_cgroup_id_proto;
 	default:
 		return NULL;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From c7ddbbaf1e73e2f4e95cfda7b1da1be7fd199a4d Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sun, 3 Jun 2018 15:59:42 -0700
Subject: tools/bpf: sync uapi bpf.h for bpf_get_current_cgroup_id() helper

Sync kernel uapi/linux/bpf.h with tools uapi/linux/bpf.h.
Also add the necessary helper define in bpf_helpers.h.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/include/uapi/linux/bpf.h            | 8 +++++++-
 tools/testing/selftests/bpf/bpf_helpers.h | 2 ++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f0b6608b1f1c..18712b0dbfe7 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2070,6 +2070,11 @@ union bpf_attr {
  * 		**CONFIG_SOCK_CGROUP_DATA** configuration option.
  * 	Return
  * 		The id is returned or 0 in case the id could not be retrieved.
+ *
+ * u64 bpf_get_current_cgroup_id(void)
+ * 	Return
+ * 		A 64-bit integer containing the current cgroup id based
+ * 		on the cgroup within which the current task is running.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2151,7 +2156,8 @@ union bpf_attr {
 	FN(lwt_seg6_action),		\
 	FN(rc_repeat),			\
 	FN(rc_keydown),			\
-	FN(skb_cgroup_id),
+	FN(skb_cgroup_id),		\
+	FN(get_current_cgroup_id),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index a66a9d91acf4..f2f28b6c8915 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -131,6 +131,8 @@ static int (*bpf_rc_repeat)(void *ctx) =
 static int (*bpf_rc_keydown)(void *ctx, unsigned int protocol,
 			     unsigned long long scancode, unsigned int toggle) =
 	(void *) BPF_FUNC_rc_keydown;
+static unsigned long long (*bpf_get_current_cgroup_id)(void) =
+	(void *) BPF_FUNC_get_current_cgroup_id;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
-- 
cgit v1.2.3-59-g8ed1b


From f269099a7e7a0c6732c4a817d0e99e92216414d9 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sun, 3 Jun 2018 15:59:43 -0700
Subject: tools/bpf: add a selftest for bpf_get_current_cgroup_id() helper

Syscall name_to_handle_at() can be used to get cgroup id
for a particular cgroup path in user space. The selftest
got cgroup id from both user and kernel, and compare to
ensure they are equal to each other.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/.gitignore           |   1 +
 tools/testing/selftests/bpf/Makefile             |   6 +-
 tools/testing/selftests/bpf/cgroup_helpers.c     |  57 +++++++++
 tools/testing/selftests/bpf/cgroup_helpers.h     |   1 +
 tools/testing/selftests/bpf/get_cgroup_id_kern.c |  28 +++++
 tools/testing/selftests/bpf/get_cgroup_id_user.c | 141 +++++++++++++++++++++++
 6 files changed, 232 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/get_cgroup_id_kern.c
 create mode 100644 tools/testing/selftests/bpf/get_cgroup_id_user.c

diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 6ea835982464..49938d72cf63 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -18,3 +18,4 @@ urandom_read
 test_btf
 test_sockmap
 test_lirc_mode2_user
+get_cgroup_id_user
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 553d1816b77a..607ed8729c06 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -24,7 +24,7 @@ urandom_read: urandom_read.c
 # Order correspond to 'make run_tests' order
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
 	test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \
-	test_sock test_btf test_sockmap test_lirc_mode2_user
+	test_sock test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user
 
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
 	test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
@@ -34,7 +34,8 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \
 	test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \
 	test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \
-	test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o
+	test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \
+	get_cgroup_id_kern.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
@@ -63,6 +64,7 @@ $(OUTPUT)/test_sock: cgroup_helpers.c
 $(OUTPUT)/test_sock_addr: cgroup_helpers.c
 $(OUTPUT)/test_sockmap: cgroup_helpers.c
 $(OUTPUT)/test_progs: trace_helpers.c
+$(OUTPUT)/get_cgroup_id_user: cgroup_helpers.c
 
 .PHONY: force
 
diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c
index f3bca3ade0f3..c87b4e052ce9 100644
--- a/tools/testing/selftests/bpf/cgroup_helpers.c
+++ b/tools/testing/selftests/bpf/cgroup_helpers.c
@@ -6,6 +6,7 @@
 #include <sys/types.h>
 #include <linux/limits.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <linux/sched.h>
 #include <fcntl.h>
 #include <unistd.h>
@@ -176,3 +177,59 @@ int create_and_get_cgroup(char *path)
 
 	return fd;
 }
+
+/**
+ * get_cgroup_id() - Get cgroup id for a particular cgroup path
+ * @path: The cgroup path, relative to the workdir, to join
+ *
+ * On success, it returns the cgroup id. On failure it returns 0,
+ * which is an invalid cgroup id.
+ * If there is a failure, it prints the error to stderr.
+ */
+unsigned long long get_cgroup_id(char *path)
+{
+	int dirfd, err, flags, mount_id, fhsize;
+	union {
+		unsigned long long cgid;
+		unsigned char raw_bytes[8];
+	} id;
+	char cgroup_workdir[PATH_MAX + 1];
+	struct file_handle *fhp, *fhp2;
+	unsigned long long ret = 0;
+
+	format_cgroup_path(cgroup_workdir, path);
+
+	dirfd = AT_FDCWD;
+	flags = 0;
+	fhsize = sizeof(*fhp);
+	fhp = calloc(1, fhsize);
+	if (!fhp) {
+		log_err("calloc");
+		return 0;
+	}
+	err = name_to_handle_at(dirfd, cgroup_workdir, fhp, &mount_id, flags);
+	if (err >= 0 || fhp->handle_bytes != 8) {
+		log_err("name_to_handle_at");
+		goto free_mem;
+	}
+
+	fhsize = sizeof(struct file_handle) + fhp->handle_bytes;
+	fhp2 = realloc(fhp, fhsize);
+	if (!fhp2) {
+		log_err("realloc");
+		goto free_mem;
+	}
+	err = name_to_handle_at(dirfd, cgroup_workdir, fhp2, &mount_id, flags);
+	fhp = fhp2;
+	if (err < 0) {
+		log_err("name_to_handle_at");
+		goto free_mem;
+	}
+
+	memcpy(id.raw_bytes, fhp->f_handle, 8);
+	ret = id.cgid;
+
+free_mem:
+	free(fhp);
+	return ret;
+}
diff --git a/tools/testing/selftests/bpf/cgroup_helpers.h b/tools/testing/selftests/bpf/cgroup_helpers.h
index 06485e0002b3..20a4a5dcd469 100644
--- a/tools/testing/selftests/bpf/cgroup_helpers.h
+++ b/tools/testing/selftests/bpf/cgroup_helpers.h
@@ -13,5 +13,6 @@ int create_and_get_cgroup(char *path);
 int join_cgroup(char *path);
 int setup_cgroup_environment(void);
 void cleanup_cgroup_environment(void);
+unsigned long long get_cgroup_id(char *path);
 
 #endif
diff --git a/tools/testing/selftests/bpf/get_cgroup_id_kern.c b/tools/testing/selftests/bpf/get_cgroup_id_kern.c
new file mode 100644
index 000000000000..2cf8cb23f209
--- /dev/null
+++ b/tools/testing/selftests/bpf/get_cgroup_id_kern.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") cg_ids = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(__u64),
+	.max_entries = 1,
+};
+
+SEC("tracepoint/syscalls/sys_enter_nanosleep")
+int trace(void *ctx)
+{
+	__u32 key = 0;
+	__u64 *val;
+
+	val = bpf_map_lookup_elem(&cg_ids, &key);
+	if (val)
+		*val = bpf_get_current_cgroup_id();
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1; /* ignored by tracepoints, required by libbpf.a */
diff --git a/tools/testing/selftests/bpf/get_cgroup_id_user.c b/tools/testing/selftests/bpf/get_cgroup_id_user.c
new file mode 100644
index 000000000000..ea19a42e5894
--- /dev/null
+++ b/tools/testing/selftests/bpf/get_cgroup_id_user.c
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <syscall.h>
+#include <unistd.h>
+#include <linux/perf_event.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <linux/bpf.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "cgroup_helpers.h"
+#include "bpf_rlimit.h"
+
+#define CHECK(condition, tag, format...) ({		\
+	int __ret = !!(condition);			\
+	if (__ret) {					\
+		printf("%s:FAIL:%s ", __func__, tag);	\
+		printf(format);				\
+	} else {					\
+		printf("%s:PASS:%s\n", __func__, tag);	\
+	}						\
+	__ret;						\
+})
+
+static int bpf_find_map(const char *test, struct bpf_object *obj,
+			const char *name)
+{
+	struct bpf_map *map;
+
+	map = bpf_object__find_map_by_name(obj, name);
+	if (!map)
+		return -1;
+	return bpf_map__fd(map);
+}
+
+#define TEST_CGROUP "/test-bpf-get-cgroup-id/"
+
+int main(int argc, char **argv)
+{
+	const char *probe_name = "syscalls/sys_enter_nanosleep";
+	const char *file = "get_cgroup_id_kern.o";
+	int err, bytes, efd, prog_fd, pmu_fd;
+	struct perf_event_attr attr = {};
+	int cgroup_fd, cgidmap_fd;
+	struct bpf_object *obj;
+	__u64 kcgid = 0, ucgid;
+	int exit_code = 1;
+	char buf[256];
+	__u32 key = 0;
+
+	err = setup_cgroup_environment();
+	if (CHECK(err, "setup_cgroup_environment", "err %d errno %d\n", err,
+		  errno))
+		return 1;
+
+	cgroup_fd = create_and_get_cgroup(TEST_CGROUP);
+	if (CHECK(cgroup_fd < 0, "create_and_get_cgroup", "err %d errno %d\n",
+		  cgroup_fd, errno))
+		goto cleanup_cgroup_env;
+
+	err = join_cgroup(TEST_CGROUP);
+	if (CHECK(err, "join_cgroup", "err %d errno %d\n", err, errno))
+		goto cleanup_cgroup_env;
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
+	if (CHECK(err, "bpf_prog_load", "err %d errno %d\n", err, errno))
+		goto cleanup_cgroup_env;
+
+	cgidmap_fd = bpf_find_map(__func__, obj, "cg_ids");
+	if (CHECK(cgidmap_fd < 0, "bpf_find_map", "err %d errno %d\n",
+		  cgidmap_fd, errno))
+		goto close_prog;
+
+	snprintf(buf, sizeof(buf),
+		 "/sys/kernel/debug/tracing/events/%s/id", probe_name);
+	efd = open(buf, O_RDONLY, 0);
+	if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))
+		goto close_prog;
+	bytes = read(efd, buf, sizeof(buf));
+	close(efd);
+	if (CHECK(bytes <= 0 || bytes >= sizeof(buf), "read",
+		  "bytes %d errno %d\n", bytes, errno))
+		goto close_prog;
+
+	attr.config = strtol(buf, NULL, 0);
+	attr.type = PERF_TYPE_TRACEPOINT;
+	attr.sample_type = PERF_SAMPLE_RAW;
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+
+	/* attach to this pid so the all bpf invocations will be in the
+	 * cgroup associated with this pid.
+	 */
+	pmu_fd = syscall(__NR_perf_event_open, &attr, getpid(), -1, -1, 0);
+	if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n", pmu_fd,
+		  errno))
+		goto close_prog;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+	if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n", err,
+		  errno))
+		goto close_pmu;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
+	if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n", err,
+		  errno))
+		goto close_pmu;
+
+	/* trigger some syscalls */
+	sleep(1);
+
+	err = bpf_map_lookup_elem(cgidmap_fd, &key, &kcgid);
+	if (CHECK(err, "bpf_map_lookup_elem", "err %d errno %d\n", err, errno))
+		goto close_pmu;
+
+	ucgid = get_cgroup_id(TEST_CGROUP);
+	if (CHECK(kcgid != ucgid, "compare_cgroup_id",
+		  "kern cgid %llx user cgid %llx", kcgid, ucgid))
+		goto close_pmu;
+
+	exit_code = 0;
+	printf("%s:PASS\n", argv[0]);
+
+close_pmu:
+	close(pmu_fd);
+close_prog:
+	bpf_object__close(obj);
+cleanup_cgroup_env:
+	cleanup_cgroup_environment();
+	return exit_code;
+}
-- 
cgit v1.2.3-59-g8ed1b


From bd3a08aaa9a383ffbbd5b788b797ae6e64eaa7a1 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sun, 3 Jun 2018 08:15:19 -0700
Subject: bpf: flowlabel in bpf_fib_lookup should be flowinfo

As Michal noted the flow struct takes both the flow label and priority.
Update the bpf_fib_lookup API to note that it is flowinfo and not just
the flow label.

Cc: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h   | 2 +-
 net/core/filter.c          | 2 +-
 samples/bpf/xdp_fwd_kern.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 18712b0dbfe7..eeb6237be5c2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2629,7 +2629,7 @@ struct bpf_fib_lookup {
 	union {
 		/* inputs to lookup */
 		__u8	tos;		/* AF_INET  */
-		__be32	flowlabel;	/* AF_INET6 */
+		__be32	flowinfo;	/* AF_INET6, flow_label + priority */
 
 		/* output: metric of fib result (IPv4/IPv6 only) */
 		__u32	rt_metric;
diff --git a/net/core/filter.c b/net/core/filter.c
index a72ea9f61010..3d9ba7e5965a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4221,7 +4221,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 		fl6.flowi6_oif = 0;
 		strict = RT6_LOOKUP_F_HAS_SADDR;
 	}
-	fl6.flowlabel = params->flowlabel;
+	fl6.flowlabel = params->flowinfo;
 	fl6.flowi6_scope = 0;
 	fl6.flowi6_flags = 0;
 	fl6.mp_hash = 0;
diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c
index 4a6be0f87505..6673cdb9f55c 100644
--- a/samples/bpf/xdp_fwd_kern.c
+++ b/samples/bpf/xdp_fwd_kern.c
@@ -88,7 +88,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
 			return XDP_PASS;
 
 		fib_params.family	= AF_INET6;
-		fib_params.flowlabel	= *(__be32 *)ip6h & IPV6_FLOWINFO_MASK;
+		fib_params.flowinfo	= *(__be32 *)ip6h & IPV6_FLOWINFO_MASK;
 		fib_params.l4_protocol	= ip6h->nexthdr;
 		fib_params.sport	= 0;
 		fib_params.dport	= 0;
-- 
cgit v1.2.3-59-g8ed1b


From 2fa3c8a8b23041490b279d2630f1cfc9fae242fb Mon Sep 17 00:00:00 2001
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Date: Thu, 31 May 2018 07:16:32 -0700
Subject: net: virtio: simplify the virtnet_find_vqs

Use the common free functions while return successfully.

Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 2d55e2af7cb6..6d710b8b41c5 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2609,12 +2609,8 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
 		vi->sq[i].vq = vqs[txq2vq(i)];
 	}
 
-	kfree(names);
-	kfree(callbacks);
-	kfree(vqs);
-	kfree(ctx);
+	/* run here: ret == 0. */
 
-	return 0;
 
 err_find:
 	kfree(ctx);
-- 
cgit v1.2.3-59-g8ed1b


From 830669e691464c005ff7262b74797fb2222da99a Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 31 May 2018 12:14:38 -0400
Subject: selftests/net: enable msg_zerocopy test

The existing msg_zerocopy test takes additional protocol arguments.
Add a variant that takes no arguments and runs all supported variants.
Call this from kselftest.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/Makefile        | 2 +-
 tools/testing/selftests/net/msg_zerocopy.sh | 8 ++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 7cb0f49efdb7..f39100b970af 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -6,7 +6,7 @@ CFLAGS += -I../../../../usr/include/
 
 TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh
 TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh
-TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh
+TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh msg_zerocopy.sh
 TEST_PROGS_EXTENDED := in_netns.sh
 TEST_GEN_FILES =  socket
 TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy
diff --git a/tools/testing/selftests/net/msg_zerocopy.sh b/tools/testing/selftests/net/msg_zerocopy.sh
index d571d213418d..c43c6debda06 100755
--- a/tools/testing/selftests/net/msg_zerocopy.sh
+++ b/tools/testing/selftests/net/msg_zerocopy.sh
@@ -21,6 +21,14 @@ readonly DADDR6='fd::2'
 
 readonly path_sysctl_mem="net.core.optmem_max"
 
+# No arguments: automated test
+if [[ "$#" -eq "0" ]]; then
+	$0 4 tcp -t 1
+	$0 6 tcp -t 1
+	echo "OK. All tests passed"
+	exit 0
+fi
+
 # Argument parsing
 if [[ "$#" -lt "2" ]]; then
 	echo "Usage: $0 [4|6] [tcp|udp|raw|raw_hdrincl|packet|packet_dgram] <args>"
-- 
cgit v1.2.3-59-g8ed1b


From 00f333e8d64d9064e636fe4024f6281e2b035a4c Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 31 May 2018 12:14:39 -0400
Subject: selftests/net: udpgso: test small gso_size boundary conditions

Verify that udpgso can generate segments smaller than device mtu, down
to the extreme case of 1B gso_size.

Verify that irrespective of gso_size, udpgso restricts the number of
segments it will generate per call (UDP_MAX_SEGMENTS).

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/udpgso.c | 77 +++++++++++++++++++++++++++++++++++-
 1 file changed, 75 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c
index 48a0592db938..e279051bc631 100644
--- a/tools/testing/selftests/net/udpgso.c
+++ b/tools/testing/selftests/net/udpgso.c
@@ -34,6 +34,10 @@
 #define UDP_SEGMENT		103
 #endif
 
+#ifndef UDP_MAX_SEGMENTS
+#define UDP_MAX_SEGMENTS	(1 << 6UL)
+#endif
+
 #define CONST_MTU_TEST	1500
 
 #define CONST_HDRLEN_V4		(sizeof(struct iphdr) + sizeof(struct udphdr))
@@ -135,6 +139,38 @@ struct testcase testcases_v4[] = {
 		.gso_len = CONST_MSS_V4,
 		.tfail = true,
 	},
+	{
+		/* send a single 1B MSS: will fail, see single MSS above */
+		.tlen = 1,
+		.gso_len = 1,
+		.tfail = true,
+		.r_num_mss = 1,
+	},
+	{
+		/* send 2 1B segments */
+		.tlen = 2,
+		.gso_len = 1,
+		.r_num_mss = 2,
+	},
+	{
+		/* send 2B + 2B + 1B segments */
+		.tlen = 5,
+		.gso_len = 2,
+		.r_num_mss = 2,
+		.r_len_last = 1,
+	},
+	{
+		/* send max number of min sized segments */
+		.tlen = UDP_MAX_SEGMENTS - CONST_HDRLEN_V4,
+		.gso_len = 1,
+		.r_num_mss = UDP_MAX_SEGMENTS - CONST_HDRLEN_V4,
+	},
+	{
+		/* send max number + 1 of min sized segments: fail */
+		.tlen = UDP_MAX_SEGMENTS - CONST_HDRLEN_V4 + 1,
+		.gso_len = 1,
+		.tfail = true,
+	},
 	{
 		/* EOL */
 	}
@@ -210,6 +246,38 @@ struct testcase testcases_v6[] = {
 		.gso_len = CONST_MSS_V6,
 		.tfail = true,
 	},
+	{
+		/* send a single 1B MSS: will fail, see single MSS above */
+		.tlen = 1,
+		.gso_len = 1,
+		.tfail = true,
+		.r_num_mss = 1,
+	},
+	{
+		/* send 2 1B segments */
+		.tlen = 2,
+		.gso_len = 1,
+		.r_num_mss = 2,
+	},
+	{
+		/* send 2B + 2B + 1B segments */
+		.tlen = 5,
+		.gso_len = 2,
+		.r_num_mss = 2,
+		.r_len_last = 1,
+	},
+	{
+		/* send max number of min sized segments */
+		.tlen = UDP_MAX_SEGMENTS - CONST_HDRLEN_V6,
+		.gso_len = 1,
+		.r_num_mss = UDP_MAX_SEGMENTS - CONST_HDRLEN_V6,
+	},
+	{
+		/* send max number + 1 of min sized segments: fail */
+		.tlen = UDP_MAX_SEGMENTS - CONST_HDRLEN_V6 + 1,
+		.gso_len = 1,
+		.tfail = true,
+	},
 	{
 		/* EOL */
 	}
@@ -375,7 +443,8 @@ static bool __send_one(int fd, struct msghdr *msg, int flags)
 	int ret;
 
 	ret = sendmsg(fd, msg, flags);
-	if (ret == -1 && (errno == EMSGSIZE || errno == ENOMEM))
+	if (ret == -1 &&
+	    (errno == EMSGSIZE || errno == ENOMEM || errno == EINVAL))
 		return false;
 	if (ret == -1)
 		error(1, errno, "sendmsg");
@@ -466,7 +535,11 @@ static void run_one(struct testcase *test, int fdt, int fdr,
 	if (!sent)
 		return;
 
-	mss = addr->sa_family == AF_INET ? CONST_MSS_V4 : CONST_MSS_V6;
+	if (test->gso_len)
+		mss = test->gso_len;
+	else
+		mss = addr->sa_family == AF_INET ? CONST_MSS_V4 : CONST_MSS_V6;
+
 
 	/* Recv all full MSS datagrams */
 	for (i = 0; i < test->r_num_mss; i++) {
-- 
cgit v1.2.3-59-g8ed1b


From 75f0139fd6d9b1de5dc8167e4ff23cfd83bbe1d5 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Thu, 31 May 2018 12:14:40 -0400
Subject: selftests/net: add packet socket packet_snd test

Add regression tests for PF_PACKET transmission using packet_snd.

The TPACKET ring interface has tests for transmission and reception.
This is an initial stab at the same for the send call based interface.

Packets are sent over loopback, then read twice. The entire packet is
read from another packet socket and compared. The packet is also
verified to arrive at a UDP socket for protocol conformance.

The test sends a packet over loopback, testing the following options
(not the full cross-product):

- SOCK_DGRAM
- SOCK_RAW
- vlan tag
- qdisc bypass
- bind() and sendto()
- virtio_net_hdr
- csum offload (NOT actual csum feature, ignored on loopback)
- gso

Besides these basic functionality tests, the test runs from a set
of bounds checks, positive and negative. Running over loopback, which
has dev->min_header_len, it cannot generate variable length hhlen.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/.gitignore   |   1 +
 tools/testing/selftests/net/Makefile     |   4 +-
 tools/testing/selftests/net/psock_snd.c  | 397 +++++++++++++++++++++++++++++++
 tools/testing/selftests/net/psock_snd.sh |  98 ++++++++
 4 files changed, 498 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/net/psock_snd.c
 create mode 100755 tools/testing/selftests/net/psock_snd.sh

diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index f0e6c35a93ae..128e548aa377 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -1,6 +1,7 @@
 msg_zerocopy
 socket
 psock_fanout
+psock_snd
 psock_tpacket
 reuseport_bpf
 reuseport_bpf_cpu
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index f39100b970af..663e11e85727 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -6,11 +6,11 @@ CFLAGS += -I../../../../usr/include/
 
 TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh
 TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh
-TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh msg_zerocopy.sh
+TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh msg_zerocopy.sh psock_snd.sh
 TEST_PROGS_EXTENDED := in_netns.sh
 TEST_GEN_FILES =  socket
 TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy
-TEST_GEN_FILES += tcp_mmap tcp_inq
+TEST_GEN_FILES += tcp_mmap tcp_inq psock_snd
 TEST_GEN_FILES += udpgso udpgso_bench_tx udpgso_bench_rx
 TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
 TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict
diff --git a/tools/testing/selftests/net/psock_snd.c b/tools/testing/selftests/net/psock_snd.c
new file mode 100644
index 000000000000..7d15e10a9fb6
--- /dev/null
+++ b/tools/testing/selftests/net/psock_snd.c
@@ -0,0 +1,397 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <error.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/virtio_net.h>
+#include <net/if.h>
+#include <net/ethernet.h>
+#include <netinet/ip.h>
+#include <netinet/udp.h>
+#include <poll.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "psock_lib.h"
+
+static bool	cfg_use_bind;
+static bool	cfg_use_csum_off;
+static bool	cfg_use_csum_off_bad;
+static bool	cfg_use_dgram;
+static bool	cfg_use_gso;
+static bool	cfg_use_qdisc_bypass;
+static bool	cfg_use_vlan;
+static bool	cfg_use_vnet;
+
+static char	*cfg_ifname = "lo";
+static int	cfg_mtu	= 1500;
+static int	cfg_payload_len = DATA_LEN;
+static int	cfg_truncate_len = INT_MAX;
+static uint16_t	cfg_port = 8000;
+
+/* test sending up to max mtu + 1 */
+#define TEST_SZ	(sizeof(struct virtio_net_hdr) + ETH_HLEN + ETH_MAX_MTU + 1)
+
+static char tbuf[TEST_SZ], rbuf[TEST_SZ];
+
+static unsigned long add_csum_hword(const uint16_t *start, int num_u16)
+{
+	unsigned long sum = 0;
+	int i;
+
+	for (i = 0; i < num_u16; i++)
+		sum += start[i];
+
+	return sum;
+}
+
+static uint16_t build_ip_csum(const uint16_t *start, int num_u16,
+			      unsigned long sum)
+{
+	sum += add_csum_hword(start, num_u16);
+
+	while (sum >> 16)
+		sum = (sum & 0xffff) + (sum >> 16);
+
+	return ~sum;
+}
+
+static int build_vnet_header(void *header)
+{
+	struct virtio_net_hdr *vh = header;
+
+	vh->hdr_len = ETH_HLEN + sizeof(struct iphdr) + sizeof(struct udphdr);
+
+	if (cfg_use_csum_off) {
+		vh->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM;
+		vh->csum_start = ETH_HLEN + sizeof(struct iphdr);
+		vh->csum_offset = __builtin_offsetof(struct udphdr, check);
+
+		/* position check field exactly one byte beyond end of packet */
+		if (cfg_use_csum_off_bad)
+			vh->csum_start += sizeof(struct udphdr) + cfg_payload_len -
+					  vh->csum_offset - 1;
+	}
+
+	if (cfg_use_gso) {
+		vh->gso_type = VIRTIO_NET_HDR_GSO_UDP;
+		vh->gso_size = cfg_mtu - sizeof(struct iphdr);
+	}
+
+	return sizeof(*vh);
+}
+
+static int build_eth_header(void *header)
+{
+	struct ethhdr *eth = header;
+
+	if (cfg_use_vlan) {
+		uint16_t *tag = header + ETH_HLEN;
+
+		eth->h_proto = htons(ETH_P_8021Q);
+		tag[1] = htons(ETH_P_IP);
+		return ETH_HLEN + 4;
+	}
+
+	eth->h_proto = htons(ETH_P_IP);
+	return ETH_HLEN;
+}
+
+static int build_ipv4_header(void *header, int payload_len)
+{
+	struct iphdr *iph = header;
+
+	iph->ihl = 5;
+	iph->version = 4;
+	iph->ttl = 8;
+	iph->tot_len = htons(sizeof(*iph) + sizeof(struct udphdr) + payload_len);
+	iph->id = htons(1337);
+	iph->protocol = IPPROTO_UDP;
+	iph->saddr = htonl((172 << 24) | (17 << 16) | 2);
+	iph->daddr = htonl((172 << 24) | (17 << 16) | 1);
+	iph->check = build_ip_csum((void *) iph, iph->ihl << 1, 0);
+
+	return iph->ihl << 2;
+}
+
+static int build_udp_header(void *header, int payload_len)
+{
+	const int alen = sizeof(uint32_t);
+	struct udphdr *udph = header;
+	int len = sizeof(*udph) + payload_len;
+
+	udph->source = htons(9);
+	udph->dest = htons(cfg_port);
+	udph->len = htons(len);
+
+	if (cfg_use_csum_off)
+		udph->check = build_ip_csum(header - (2 * alen), alen,
+					    htons(IPPROTO_UDP) + udph->len);
+	else
+		udph->check = 0;
+
+	return sizeof(*udph);
+}
+
+static int build_packet(int payload_len)
+{
+	int off = 0;
+
+	off += build_vnet_header(tbuf);
+	off += build_eth_header(tbuf + off);
+	off += build_ipv4_header(tbuf + off, payload_len);
+	off += build_udp_header(tbuf + off, payload_len);
+
+	if (off + payload_len > sizeof(tbuf))
+		error(1, 0, "payload length exceeds max");
+
+	memset(tbuf + off, DATA_CHAR, payload_len);
+
+	return off + payload_len;
+}
+
+static void do_bind(int fd)
+{
+	struct sockaddr_ll laddr = {0};
+
+	laddr.sll_family = AF_PACKET;
+	laddr.sll_protocol = htons(ETH_P_IP);
+	laddr.sll_ifindex = if_nametoindex(cfg_ifname);
+	if (!laddr.sll_ifindex)
+		error(1, errno, "if_nametoindex");
+
+	if (bind(fd, (void *)&laddr, sizeof(laddr)))
+		error(1, errno, "bind");
+}
+
+static void do_send(int fd, char *buf, int len)
+{
+	int ret;
+
+	if (!cfg_use_vnet) {
+		buf += sizeof(struct virtio_net_hdr);
+		len -= sizeof(struct virtio_net_hdr);
+	}
+	if (cfg_use_dgram) {
+		buf += ETH_HLEN;
+		len -= ETH_HLEN;
+	}
+
+	if (cfg_use_bind) {
+		ret = write(fd, buf, len);
+	} else {
+		struct sockaddr_ll laddr = {0};
+
+		laddr.sll_protocol = htons(ETH_P_IP);
+		laddr.sll_ifindex = if_nametoindex(cfg_ifname);
+		if (!laddr.sll_ifindex)
+			error(1, errno, "if_nametoindex");
+
+		ret = sendto(fd, buf, len, 0, (void *)&laddr, sizeof(laddr));
+	}
+
+	if (ret == -1)
+		error(1, errno, "write");
+	if (ret != len)
+		error(1, 0, "write: %u %u", ret, len);
+
+	fprintf(stderr, "tx: %u\n", ret);
+}
+
+static int do_tx(void)
+{
+	const int one = 1;
+	int fd, len;
+
+	fd = socket(PF_PACKET, cfg_use_dgram ? SOCK_DGRAM : SOCK_RAW, 0);
+	if (fd == -1)
+		error(1, errno, "socket t");
+
+	if (cfg_use_bind)
+		do_bind(fd);
+
+	if (cfg_use_qdisc_bypass &&
+	    setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one)))
+		error(1, errno, "setsockopt qdisc bypass");
+
+	if (cfg_use_vnet &&
+	    setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &one, sizeof(one)))
+		error(1, errno, "setsockopt vnet");
+
+	len = build_packet(cfg_payload_len);
+
+	if (cfg_truncate_len < len)
+		len = cfg_truncate_len;
+
+	do_send(fd, tbuf, len);
+
+	if (close(fd))
+		error(1, errno, "close t");
+
+	return len;
+}
+
+static int setup_rx(void)
+{
+	struct timeval tv = { .tv_usec = 100 * 1000 };
+	struct sockaddr_in raddr = {0};
+	int fd;
+
+	fd = socket(PF_INET, SOCK_DGRAM, 0);
+	if (fd == -1)
+		error(1, errno, "socket r");
+
+	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)))
+		error(1, errno, "setsockopt rcv timeout");
+
+	raddr.sin_family = AF_INET;
+	raddr.sin_port = htons(cfg_port);
+	raddr.sin_addr.s_addr = htonl(INADDR_ANY);
+
+	if (bind(fd, (void *)&raddr, sizeof(raddr)))
+		error(1, errno, "bind r");
+
+	return fd;
+}
+
+static void do_rx(int fd, int expected_len, char *expected)
+{
+	int ret;
+
+	ret = recv(fd, rbuf, sizeof(rbuf), 0);
+	if (ret == -1)
+		error(1, errno, "recv");
+	if (ret != expected_len)
+		error(1, 0, "recv: %u != %u", ret, expected_len);
+
+	if (memcmp(rbuf, expected, ret))
+		error(1, 0, "recv: data mismatch");
+
+	fprintf(stderr, "rx: %u\n", ret);
+}
+
+static int setup_sniffer(void)
+{
+	struct timeval tv = { .tv_usec = 100 * 1000 };
+	int fd;
+
+	fd = socket(PF_PACKET, SOCK_RAW, 0);
+	if (fd == -1)
+		error(1, errno, "socket p");
+
+	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)))
+		error(1, errno, "setsockopt rcv timeout");
+
+	pair_udp_setfilter(fd);
+	do_bind(fd);
+
+	return fd;
+}
+
+static void parse_opts(int argc, char **argv)
+{
+	int c;
+
+	while ((c = getopt(argc, argv, "bcCdgl:qt:vV")) != -1) {
+		switch (c) {
+		case 'b':
+			cfg_use_bind = true;
+			break;
+		case 'c':
+			cfg_use_csum_off = true;
+			break;
+		case 'C':
+			cfg_use_csum_off_bad = true;
+			break;
+		case 'd':
+			cfg_use_dgram = true;
+			break;
+		case 'g':
+			cfg_use_gso = true;
+			break;
+		case 'l':
+			cfg_payload_len = strtoul(optarg, NULL, 0);
+			break;
+		case 'q':
+			cfg_use_qdisc_bypass = true;
+			break;
+		case 't':
+			cfg_truncate_len = strtoul(optarg, NULL, 0);
+			break;
+		case 'v':
+			cfg_use_vnet = true;
+			break;
+		case 'V':
+			cfg_use_vlan = true;
+			break;
+		default:
+			error(1, 0, "%s: parse error", argv[0]);
+		}
+	}
+
+	if (cfg_use_vlan && cfg_use_dgram)
+		error(1, 0, "option vlan (-V) conflicts with dgram (-d)");
+
+	if (cfg_use_csum_off && !cfg_use_vnet)
+		error(1, 0, "option csum offload (-c) requires vnet (-v)");
+
+	if (cfg_use_csum_off_bad && !cfg_use_csum_off)
+		error(1, 0, "option csum bad (-C) requires csum offload (-c)");
+
+	if (cfg_use_gso && !cfg_use_csum_off)
+		error(1, 0, "option gso (-g) requires csum offload (-c)");
+}
+
+static void run_test(void)
+{
+	int fdr, fds, total_len;
+
+	fdr = setup_rx();
+	fds = setup_sniffer();
+
+	total_len = do_tx();
+
+	/* BPF filter accepts only this length, vlan changes MAC */
+	if (cfg_payload_len == DATA_LEN && !cfg_use_vlan)
+		do_rx(fds, total_len - sizeof(struct virtio_net_hdr),
+		      tbuf + sizeof(struct virtio_net_hdr));
+
+	do_rx(fdr, cfg_payload_len, tbuf + total_len - cfg_payload_len);
+
+	if (close(fds))
+		error(1, errno, "close s");
+	if (close(fdr))
+		error(1, errno, "close r");
+}
+
+int main(int argc, char **argv)
+{
+	parse_opts(argc, argv);
+
+	if (system("ip link set dev lo mtu 1500"))
+		error(1, errno, "ip link set mtu");
+	if (system("ip addr add dev lo 172.17.0.1/24"))
+		error(1, errno, "ip addr add");
+
+	run_test();
+
+	fprintf(stderr, "OK\n\n");
+	return 0;
+}
diff --git a/tools/testing/selftests/net/psock_snd.sh b/tools/testing/selftests/net/psock_snd.sh
new file mode 100755
index 000000000000..6331d91b86a6
--- /dev/null
+++ b/tools/testing/selftests/net/psock_snd.sh
@@ -0,0 +1,98 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run a series of packet socket send regression tests
+
+set -e
+
+readonly mtu=1500
+readonly iphlen=20
+readonly udphlen=8
+
+readonly vnet_hlen=10
+readonly eth_hlen=14
+
+readonly mss="$((${mtu} - ${iphlen} - ${udphlen}))"
+readonly mss_exceeds="$((${mss} + 1))"
+
+readonly max_mtu=65535
+readonly max_mss="$((${max_mtu} - ${iphlen} - ${udphlen}))"
+readonly max_mss_exceeds="$((${max_mss} + 1))"
+
+# functional checks (not a full cross-product)
+
+echo "dgram"
+./in_netns.sh ./psock_snd -d
+
+echo "dgram bind"
+./in_netns.sh ./psock_snd -d -b
+
+echo "raw"
+./in_netns.sh ./psock_snd
+
+echo "raw bind"
+./in_netns.sh ./psock_snd -b
+
+echo "raw qdisc bypass"
+./in_netns.sh ./psock_snd -q
+
+echo "raw vlan"
+./in_netns.sh ./psock_snd -V
+
+echo "raw vnet hdr"
+./in_netns.sh ./psock_snd -v
+
+echo "raw csum_off"
+./in_netns.sh ./psock_snd -v -c
+
+echo "raw csum_off with bad offset (fails)"
+(! ./in_netns.sh ./psock_snd -v -c -C)
+
+
+# bounds check: send {max, max + 1, min, min - 1} lengths
+
+echo "raw min size"
+./in_netns.sh ./psock_snd -l 0
+
+echo "raw mtu size"
+./in_netns.sh ./psock_snd -l "${mss}"
+
+echo "raw mtu size + 1 (fails)"
+(! ./in_netns.sh ./psock_snd -l "${mss_exceeds}")
+
+# fails due to ARPHRD_ETHER check in packet_extra_vlan_len_allowed
+#
+# echo "raw vlan mtu size"
+# ./in_netns.sh ./psock_snd -V -l "${mss}"
+
+echo "raw vlan mtu size + 1 (fails)"
+(! ./in_netns.sh ./psock_snd -V -l "${mss_exceeds}")
+
+echo "dgram mtu size"
+./in_netns.sh ./psock_snd -d -l "${mss}"
+
+echo "dgram mtu size + 1 (fails)"
+(! ./in_netns.sh ./psock_snd -d -l "${mss_exceeds}")
+
+echo "raw truncate hlen (fails: does not arrive)"
+(! ./in_netns.sh ./psock_snd -t "$((${vnet_hlen} + ${eth_hlen}))")
+
+echo "raw truncate hlen - 1 (fails: EINVAL)"
+(! ./in_netns.sh ./psock_snd -t "$((${vnet_hlen} + ${eth_hlen} - 1))")
+
+
+# gso checks: implies -l, because with gso len must exceed gso_size
+
+echo "raw gso min size"
+./in_netns.sh ./psock_snd -v -c -g -l "${mss_exceeds}"
+
+echo "raw gso min size - 1 (fails)"
+(! ./in_netns.sh ./psock_snd -v -c -g -l "${mss}")
+
+echo "raw gso max size"
+./in_netns.sh ./psock_snd -v -c -g -l "${max_mss}"
+
+echo "raw gso max size + 1 (fails)"
+(! ./in_netns.sh ./psock_snd -v -c -g -l "${max_mss_exceeds}")
+
+echo "OK. All tests passed"
-- 
cgit v1.2.3-59-g8ed1b


From 00d56229671b23f4cc459f96165dec41ef5988b4 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Fri, 1 Jun 2018 00:37:44 +0200
Subject: selftests: forwarding: mirror_vlan: Uninstall trap

Instead of installing a trap before tests run and uninstalling it after
they run, mirror_vlan.sh installs it twice due to a typo. Fix the typo.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/forwarding/mirror_vlan.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/forwarding/mirror_vlan.sh b/tools/testing/selftests/net/forwarding/mirror_vlan.sh
index 20b37a53657e..f7124038b6c1 100755
--- a/tools/testing/selftests/net/forwarding/mirror_vlan.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_vlan.sh
@@ -108,7 +108,7 @@ test_all()
 
 	tests_run
 
-	trap_install $h3 ingress
+	trap_uninstall $h3 ingress
 	slow_path_trap_uninstall $swp1 egress
 	slow_path_trap_uninstall $swp1 ingress
 }
-- 
cgit v1.2.3-59-g8ed1b


From 6ebe5a7a66ebd78f83fdd231a1d50bfc7a93598e Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Fri, 1 Jun 2018 00:37:47 +0200
Subject: selftests: forwarding: mirror_vlan: Change test description

The test description is displayed with the PASS/FAIL resolution after
the test is ran. There however already is one other test described
exactly like this, which makes it unclear which of the tests passed or
failed. Make the description unique.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/forwarding/mirror_vlan.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/forwarding/mirror_vlan.sh b/tools/testing/selftests/net/forwarding/mirror_vlan.sh
index f7124038b6c1..9ab2ce77b332 100755
--- a/tools/testing/selftests/net/forwarding/mirror_vlan.sh
+++ b/tools/testing/selftests/net/forwarding/mirror_vlan.sh
@@ -91,7 +91,7 @@ test_tagged_vlan_dir()
 				  192.0.2.17 192.0.2.18
 	mirror_uninstall $swp1 $direction
 
-	log_test "$direction mirror to vlan ($tcflags)"
+	log_test "$direction mirror tagged to vlan ($tcflags)"
 }
 
 test_tagged_vlan()
-- 
cgit v1.2.3-59-g8ed1b


From 5e9f20359a166de934b5bd02cb54c0be0e8c2890 Mon Sep 17 00:00:00 2001
From: Sudarsana Reddy Kalluru <sudarsana.kalluru@cavium.com>
Date: Thu, 31 May 2018 18:47:36 -0700
Subject: qed: Fix shared memory inconsistency between driver and the MFW.

The structure shared between driver and management firmware (MFW)
differ in sizes. The additional field defined by the MFW is not
relevant to the current driver. Add a dummy field to the structure.

Fixes: cac6f691 ("qed: Add support for Unified Fabric Port")
Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_hsi.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index 8e1e6e1eb40e..beba9308011b 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -11996,6 +11996,7 @@ struct public_port {
 #define EEE_REMOTE_TW_RX_MASK   0xffff0000
 #define EEE_REMOTE_TW_RX_OFFSET 16
 
+	u32 reserved1;
 	u32 oem_cfg_port;
 #define OEM_CFG_CHANNEL_TYPE_MASK                       0x00000003
 #define OEM_CFG_CHANNEL_TYPE_OFFSET                     0
-- 
cgit v1.2.3-59-g8ed1b


From b5fabb080062e7685b898e9c0ec4d95f4d526ed2 Mon Sep 17 00:00:00 2001
From: Sudarsana Reddy Kalluru <sudarsana.kalluru@cavium.com>
Date: Thu, 31 May 2018 18:47:37 -0700
Subject: qed: Fix use of incorrect shmem address.

Incorrect shared memory address is used while deriving the values
for tc and pri_type. Use shmem address corresponding to 'oem_cfg_func'
where the management firmare saves tc/pri_type values.

Fixes: cac6f691 ("qed: Add support for Unified Fabric Port")
Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_mcp.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index 2612e3e458d9..6f9927d1a501 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -1514,9 +1514,10 @@ void qed_mcp_read_ufp_config(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 	}
 
 	qed_mcp_get_shmem_func(p_hwfn, p_ptt, &shmem_info, MCP_PF_ID(p_hwfn));
-	val = (port_cfg & OEM_CFG_FUNC_TC_MASK) >> OEM_CFG_FUNC_TC_OFFSET;
+	val = (shmem_info.oem_cfg_func & OEM_CFG_FUNC_TC_MASK) >>
+		OEM_CFG_FUNC_TC_OFFSET;
 	p_hwfn->ufp_info.tc = (u8)val;
-	val = (port_cfg & OEM_CFG_FUNC_HOST_PRI_CTRL_MASK) >>
+	val = (shmem_info.oem_cfg_func & OEM_CFG_FUNC_HOST_PRI_CTRL_MASK) >>
 		OEM_CFG_FUNC_HOST_PRI_CTRL_OFFSET;
 	if (val == OEM_CFG_FUNC_HOST_PRI_CTRL_VNIC) {
 		p_hwfn->ufp_info.pri_type = QED_UFP_PRI_VNIC;
-- 
cgit v1.2.3-59-g8ed1b


From 9a8a02c9d46dcd4c663dac39e6518b6bb7ac1631 Mon Sep 17 00:00:00 2001
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Thu, 31 May 2018 18:01:27 +0100
Subject: net: stmmac: Add Flexible PPS support

This adds support for Flexible PPS output (which is equivalent
to per_out output of PTP subsystem).

Tested using an oscilloscope and the following commands:

1) Start PTP4L:
	# ptp4l -A -4 -H -m -i eth0 &
2) Set Flexible PPS frequency:
	# echo <idx> <ts> <tns> <ps> <pns> > /sys/class/ptp/ptpX/period

Where, ts/tns is start time and ps/pns is period time, and ptpX is ptp
of eth0.

Signed-off-by: Jose Abreu <joabreu@synopsys.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Joao Pinto <jpinto@synopsys.com>
Cc: Vitor Soares <soares@synopsys.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre Torgue <alexandre.torgue@st.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/common.h      |  2 +
 drivers/net/ethernet/stmicro/stmmac/dwmac4.h      |  1 +
 drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c |  1 +
 drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c  |  2 +
 drivers/net/ethernet/stmicro/stmmac/dwmac5.c      | 55 +++++++++++++++++++++++
 drivers/net/ethernet/stmicro/stmmac/dwmac5.h      | 22 +++++++++
 drivers/net/ethernet/stmicro/stmmac/hwif.h        |  7 +++
 drivers/net/ethernet/stmicro/stmmac/stmmac.h      | 12 +++++
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c |  4 ++
 drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c  | 42 +++++++++++++++--
 10 files changed, 145 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index a679cb729d1d..78fd0f8b8e81 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -346,6 +346,8 @@ struct dma_features {
 	/* TX and RX number of queues */
 	unsigned int number_rx_queues;
 	unsigned int number_tx_queues;
+	/* PPS output */
+	unsigned int pps_out_num;
 	/* Alternate (enhanced) DESC mode */
 	unsigned int enh_desc;
 	/* TX and RX FIFO sizes */
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
index 6330a55953df..eb013d54025a 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
@@ -187,6 +187,7 @@ enum power_event {
 #define GMAC_HW_RXFIFOSIZE		GENMASK(4, 0)
 
 /* MAC HW features2 bitmap */
+#define GMAC_HW_FEAT_PPSOUTNUM		GENMASK(26, 24)
 #define GMAC_HW_FEAT_TXCHCNT		GENMASK(21, 18)
 #define GMAC_HW_FEAT_RXCHCNT		GENMASK(15, 12)
 #define GMAC_HW_FEAT_TXQCNT		GENMASK(9, 6)
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
index a7121a7d9391..7e5d5db0d516 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
@@ -796,6 +796,7 @@ const struct stmmac_ops dwmac510_ops = {
 	.safety_feat_irq_status = dwmac5_safety_feat_irq_status,
 	.safety_feat_dump = dwmac5_safety_feat_dump,
 	.rxp_config = dwmac5_rxp_config,
+	.flex_pps_config = dwmac5_flex_pps_config,
 };
 
 int dwmac4_setup(struct stmmac_priv *priv)
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
index bf8e5a16f11c..d37f17ca62fe 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
@@ -373,6 +373,8 @@ static void dwmac4_get_hw_feature(void __iomem *ioaddr,
 		((hw_cap & GMAC_HW_FEAT_RXQCNT) >> 0) + 1;
 	dma_cap->number_tx_queues =
 		((hw_cap & GMAC_HW_FEAT_TXQCNT) >> 6) + 1;
+	/* PPS output */
+	dma_cap->pps_out_num = (hw_cap & GMAC_HW_FEAT_PPSOUTNUM) >> 24;
 
 	/* IEEE 1588-2002 */
 	dma_cap->time_stamp = 0;
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c
index b2becb80a697..3f4f3132e16b 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c
@@ -8,6 +8,7 @@
 #include "dwmac4.h"
 #include "dwmac5.h"
 #include "stmmac.h"
+#include "stmmac_ptp.h"
 
 struct dwmac5_error_desc {
 	bool valid;
@@ -494,3 +495,57 @@ re_enable:
 	writel(old_val, ioaddr + GMAC_CONFIG);
 	return ret;
 }
+
+int dwmac5_flex_pps_config(void __iomem *ioaddr, int index,
+			   struct stmmac_pps_cfg *cfg, bool enable,
+			   u32 sub_second_inc, u32 systime_flags)
+{
+	u32 tnsec = readl(ioaddr + MAC_PPSx_TARGET_TIME_NSEC(index));
+	u32 val = readl(ioaddr + MAC_PPS_CONTROL);
+	u64 period;
+
+	if (!cfg->available)
+		return -EINVAL;
+	if (tnsec & TRGTBUSY0)
+		return -EBUSY;
+	if (!sub_second_inc || !systime_flags)
+		return -EINVAL;
+
+	val &= ~PPSx_MASK(index);
+
+	if (!enable) {
+		val |= PPSCMDx(index, 0x5);
+		writel(val, ioaddr + MAC_PPS_CONTROL);
+		return 0;
+	}
+
+	val |= PPSCMDx(index, 0x2);
+	val |= TRGTMODSELx(index, 0x2);
+	val |= PPSEN0;
+
+	writel(cfg->start.tv_sec, ioaddr + MAC_PPSx_TARGET_TIME_SEC(index));
+
+	if (!(systime_flags & PTP_TCR_TSCTRLSSR))
+		cfg->start.tv_nsec = (cfg->start.tv_nsec * 1000) / 465;
+	writel(cfg->start.tv_nsec, ioaddr + MAC_PPSx_TARGET_TIME_NSEC(index));
+
+	period = cfg->period.tv_sec * 1000000000;
+	period += cfg->period.tv_nsec;
+
+	do_div(period, sub_second_inc);
+
+	if (period <= 1)
+		return -EINVAL;
+
+	writel(period - 1, ioaddr + MAC_PPSx_INTERVAL(index));
+
+	period >>= 1;
+	if (period <= 1)
+		return -EINVAL;
+
+	writel(period - 1, ioaddr + MAC_PPSx_WIDTH(index));
+
+	/* Finally, activate it */
+	writel(val, ioaddr + MAC_PPS_CONTROL);
+	return 0;
+}
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac5.h b/drivers/net/ethernet/stmicro/stmmac/dwmac5.h
index cc810aff7100..775db776b3cc 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac5.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac5.h
@@ -11,6 +11,25 @@
 #define PRTYEN				BIT(1)
 #define TMOUTEN				BIT(0)
 
+#define MAC_PPS_CONTROL			0x00000b70
+#define PPS_MAXIDX(x)			((((x) + 1) * 8) - 1)
+#define PPS_MINIDX(x)			((x) * 8)
+#define PPSx_MASK(x)			GENMASK(PPS_MAXIDX(x), PPS_MINIDX(x))
+#define MCGRENx(x)			BIT(PPS_MAXIDX(x))
+#define TRGTMODSELx(x, val)		\
+	GENMASK(PPS_MAXIDX(x) - 1, PPS_MAXIDX(x) - 2) & \
+	((val) << (PPS_MAXIDX(x) - 2))
+#define PPSCMDx(x, val)			\
+	GENMASK(PPS_MINIDX(x) + 3, PPS_MINIDX(x)) & \
+	((val) << PPS_MINIDX(x))
+#define PPSEN0				BIT(4)
+#define MAC_PPSx_TARGET_TIME_SEC(x)	(0x00000b80 + ((x) * 0x10))
+#define MAC_PPSx_TARGET_TIME_NSEC(x)	(0x00000b84 + ((x) * 0x10))
+#define TRGTBUSY0			BIT(31)
+#define TTSL0				GENMASK(30, 0)
+#define MAC_PPSx_INTERVAL(x)		(0x00000b88 + ((x) * 0x10))
+#define MAC_PPSx_WIDTH(x)		(0x00000b8c + ((x) * 0x10))
+
 #define MTL_RXP_CONTROL_STATUS		0x00000ca0
 #define RXPI				BIT(31)
 #define NPE				GENMASK(23, 16)
@@ -61,5 +80,8 @@ int dwmac5_safety_feat_dump(struct stmmac_safety_stats *stats,
 			int index, unsigned long *count, const char **desc);
 int dwmac5_rxp_config(void __iomem *ioaddr, struct stmmac_tc_entry *entries,
 		      unsigned int count);
+int dwmac5_flex_pps_config(void __iomem *ioaddr, int index,
+			   struct stmmac_pps_cfg *cfg, bool enable,
+			   u32 sub_second_inc, u32 systime_flags);
 
 #endif /* __DWMAC5_H__ */
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index f499a7fad6f0..e44e7b26ce82 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -241,6 +241,7 @@ struct net_device;
 struct rgmii_adv;
 struct stmmac_safety_stats;
 struct stmmac_tc_entry;
+struct stmmac_pps_cfg;
 
 /* Helpers to program the MAC core */
 struct stmmac_ops {
@@ -313,6 +314,10 @@ struct stmmac_ops {
 	/* Flexible RX Parser */
 	int (*rxp_config)(void __iomem *ioaddr, struct stmmac_tc_entry *entries,
 			  unsigned int count);
+	/* Flexible PPS */
+	int (*flex_pps_config)(void __iomem *ioaddr, int index,
+			       struct stmmac_pps_cfg *cfg, bool enable,
+			       u32 sub_second_inc, u32 systime_flags);
 };
 
 #define stmmac_core_init(__priv, __args...) \
@@ -379,6 +384,8 @@ struct stmmac_ops {
 	stmmac_do_callback(__priv, mac, safety_feat_dump, __args)
 #define stmmac_rxp_config(__priv, __args...) \
 	stmmac_do_callback(__priv, mac, rxp_config, __args)
+#define stmmac_flex_pps_config(__priv, __args...) \
+	stmmac_do_callback(__priv, mac, flex_pps_config, __args)
 
 /* PTP and HW Timer helpers */
 struct stmmac_hwtimestamp {
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index fbfe5dcefa87..025efbf6145c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -100,6 +100,13 @@ struct stmmac_tc_entry {
 	} __packed val;
 };
 
+#define STMMAC_PPS_MAX		4
+struct stmmac_pps_cfg {
+	bool available;
+	struct timespec64 start;
+	struct timespec64 period;
+};
+
 struct stmmac_priv {
 	/* Frequently used values are kept adjacent for cache effect */
 	u32 tx_count_frames;
@@ -160,6 +167,8 @@ struct stmmac_priv {
 	struct ptp_clock *ptp_clock;
 	struct ptp_clock_info ptp_clock_ops;
 	unsigned int default_addend;
+	u32 sub_second_inc;
+	u32 systime_flags;
 	u32 adv_ts;
 	int use_riwt;
 	int irq_wake;
@@ -181,6 +190,9 @@ struct stmmac_priv {
 	unsigned int tc_entries_max;
 	unsigned int tc_off_max;
 	struct stmmac_tc_entry *tc_entries;
+
+	/* Pulse Per Second output */
+	struct stmmac_pps_cfg pps[STMMAC_PPS_MAX];
 };
 
 enum stmmac_state {
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 77af85c981db..11fb7c777d89 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -721,6 +721,10 @@ static int stmmac_hwtstamp_ioctl(struct net_device *dev, struct ifreq *ifr)
 				priv->plat->has_gmac4, &sec_inc);
 		temp = div_u64(1000000000ULL, sec_inc);
 
+		/* Store sub second increment and flags for later use */
+		priv->sub_second_inc = sec_inc;
+		priv->systime_flags = value;
+
 		/* calculate default added value:
 		 * formula is :
 		 * addend = (2^32)/freq_div_ratio;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
index 7d3a5c7f5db6..0cb0e39a2be9 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
@@ -140,17 +140,43 @@ static int stmmac_set_time(struct ptp_clock_info *ptp,
 static int stmmac_enable(struct ptp_clock_info *ptp,
 			 struct ptp_clock_request *rq, int on)
 {
-	return -EOPNOTSUPP;
+	struct stmmac_priv *priv =
+	    container_of(ptp, struct stmmac_priv, ptp_clock_ops);
+	struct stmmac_pps_cfg *cfg;
+	int ret = -EOPNOTSUPP;
+	unsigned long flags;
+
+	switch (rq->type) {
+	case PTP_CLK_REQ_PEROUT:
+		cfg = &priv->pps[rq->perout.index];
+
+		cfg->start.tv_sec = rq->perout.start.sec;
+		cfg->start.tv_nsec = rq->perout.start.nsec;
+		cfg->period.tv_sec = rq->perout.period.sec;
+		cfg->period.tv_nsec = rq->perout.period.nsec;
+
+		spin_lock_irqsave(&priv->ptp_lock, flags);
+		ret = stmmac_flex_pps_config(priv, priv->ioaddr,
+					     rq->perout.index, cfg, on,
+					     priv->sub_second_inc,
+					     priv->systime_flags);
+		spin_unlock_irqrestore(&priv->ptp_lock, flags);
+		break;
+	default:
+		break;
+	}
+
+	return ret;
 }
 
 /* structure describing a PTP hardware clock */
-static const struct ptp_clock_info stmmac_ptp_clock_ops = {
+static struct ptp_clock_info stmmac_ptp_clock_ops = {
 	.owner = THIS_MODULE,
 	.name = "stmmac_ptp_clock",
 	.max_adj = 62500000,
 	.n_alarm = 0,
 	.n_ext_ts = 0,
-	.n_per_out = 0,
+	.n_per_out = 0, /* will be overwritten in stmmac_ptp_register */
 	.n_pins = 0,
 	.pps = 0,
 	.adjfreq = stmmac_adjust_freq,
@@ -168,6 +194,16 @@ static const struct ptp_clock_info stmmac_ptp_clock_ops = {
  */
 void stmmac_ptp_register(struct stmmac_priv *priv)
 {
+	int i;
+
+	for (i = 0; i < priv->dma_cap.pps_out_num; i++) {
+		if (i >= STMMAC_PPS_MAX)
+			break;
+		priv->pps[i].available = true;
+	}
+
+	stmmac_ptp_clock_ops.n_per_out = priv->dma_cap.pps_out_num;
+
 	spin_lock_init(&priv->ptp_lock);
 	priv->ptp_clock_ops = stmmac_ptp_clock_ops;
 
-- 
cgit v1.2.3-59-g8ed1b


From cfed0a2c98d798bed970fd450eb4d7854705b3e1 Mon Sep 17 00:00:00 2001
From: Varsha Rao <rvarsha016@gmail.com>
Date: Fri, 1 Jun 2018 07:30:49 +0530
Subject: net: ethernet: mlx4: Remove unnecessary parentheses

This patch fixes the clang warning of extraneous parentheses, with the
following coccinelle script.

@@
identifier i;
expression e;
statement s;
@@
if (
-(i == e)
+i == e
 )
s

Suggested-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Signed-off-by: Varsha Rao <rvarsha016@gmail.com>
Acked-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/port.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c b/drivers/net/ethernet/mellanox/mlx4/port.c
index 3ef3406ff4cb..10fcc22f4590 100644
--- a/drivers/net/ethernet/mellanox/mlx4/port.c
+++ b/drivers/net/ethernet/mellanox/mlx4/port.c
@@ -614,9 +614,9 @@ int __mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan,
 		int index_at_dup_port = -1;
 
 		for (i = MLX4_VLAN_REGULAR; i < MLX4_MAX_VLAN_NUM; i++) {
-			if ((vlan == (MLX4_VLAN_MASK & be32_to_cpu(table->entries[i]))))
+			if (vlan == (MLX4_VLAN_MASK & be32_to_cpu(table->entries[i])))
 				index_at_port = i;
-			if ((vlan == (MLX4_VLAN_MASK & be32_to_cpu(dup_table->entries[i]))))
+			if (vlan == (MLX4_VLAN_MASK & be32_to_cpu(dup_table->entries[i])))
 				index_at_dup_port = i;
 		}
 		/* check that same vlan is not in the tables at different indices */
-- 
cgit v1.2.3-59-g8ed1b


From 2f17becfbea5e9a0529b51da7345783e96e69516 Mon Sep 17 00:00:00 2001
From: Stephen Suryaputra <ssuryaextr@gmail.com>
Date: Fri, 1 Jun 2018 00:05:21 -0400
Subject: vrf: check the original netdevice for generating redirect

Use the right device to determine if redirect should be sent especially
when using vrf. Same as well as when sending the redirect.

Signed-off-by: Stephen Suryaputra <ssuryaextr@gmail.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_output.c | 3 ++-
 net/ipv6/ndisc.c      | 6 ++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 60b0d1652448..021e5aef6ba3 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -482,7 +482,8 @@ int ip6_forward(struct sk_buff *skb)
 	   send redirects to source routed frames.
 	   We don't send redirects to frames decapsulated from IPsec.
 	 */
-	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
+	if (IP6CB(skb)->iif == dst->dev->ifindex &&
+	    opt->srcrt == 0 && !skb_sec_path(skb)) {
 		struct in6_addr *target = NULL;
 		struct inet_peer *peer;
 		struct rt6_info *rt;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 9ac5366064e3..e640d2f3c55c 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1578,6 +1578,12 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
 	   ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL;
 	bool ret;
 
+	if (netif_is_l3_master(skb->dev)) {
+		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
+		if (!dev)
+			return;
+	}
+
 	if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) {
 		ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n",
 			  dev->name);
-- 
cgit v1.2.3-59-g8ed1b


From d3adce9d0334fcd5de1e9aa8749c8953ffc9b5a7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 4 Jun 2018 06:50:29 -0700
Subject: MAINTAINERS: TCP gets its first maintainer

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 0ae0dbf0e15e..70d61c2b1be4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9862,6 +9862,19 @@ F:	net/ipv6/calipso.c
 F:	net/netfilter/xt_CONNSECMARK.c
 F:	net/netfilter/xt_SECMARK.c
 
+NETWORKING [TCP]
+M:	Eric Dumazet <edumazet@google.com>
+L:	netdev@vger.kernel.org
+S:	Maintained
+F:	net/ipv4/tcp*.c
+F:	net/ipv4/syncookies.c
+F:	net/ipv6/tcp*.c
+F:	net/ipv6/syncookies.c
+F:	include/uapi/linux/tcp.h
+F:	include/net/tcp.h
+F:	include/linux/tcp.h
+F:	include/trace/events/tcp.h
+
 NETWORKING [TLS]
 M:	Boris Pismenny <borisp@mellanox.com>
 M:	Aviad Yehezkel <aviadye@mellanox.com>
-- 
cgit v1.2.3-59-g8ed1b


From 4e64c835254095f55044d393e628dd3e92fca304 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Mon, 4 Jun 2018 13:57:11 +0200
Subject: xsk: proper fill queue descriptor validation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously the fill queue descriptor was not copied to kernel space
prior validating it, making it possible for userland to change the
descriptor post-kernel-validation.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/xdp/xsk.c       | 11 +++++------
 net/xdp/xsk_queue.h | 32 +++++++++-----------------------
 2 files changed, 14 insertions(+), 29 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index cce0e4f8a536..43554eb56fe6 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -41,20 +41,19 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
 
 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
-	u32 *id, len = xdp->data_end - xdp->data;
+	u32 id, len = xdp->data_end - xdp->data;
 	void *buffer;
-	int err = 0;
+	int err;
 
 	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
 		return -EINVAL;
 
-	id = xskq_peek_id(xs->umem->fq);
-	if (!id)
+	if (!xskq_peek_id(xs->umem->fq, &id))
 		return -ENOSPC;
 
-	buffer = xdp_umem_get_data_with_headroom(xs->umem, *id);
+	buffer = xdp_umem_get_data_with_headroom(xs->umem, id);
 	memcpy(buffer, xdp->data, len);
-	err = xskq_produce_batch_desc(xs->rx, *id, len,
+	err = xskq_produce_batch_desc(xs->rx, id, len,
 				      xs->umem->frame_headroom);
 	if (!err)
 		xskq_discard_id(xs->umem->fq);
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index cb8e5be35110..b5924e7aeb2b 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -85,14 +85,15 @@ static inline bool xskq_is_valid_id(struct xsk_queue *q, u32 idx)
 	return true;
 }
 
-static inline u32 *xskq_validate_id(struct xsk_queue *q)
+static inline u32 *xskq_validate_id(struct xsk_queue *q, u32 *id)
 {
 	while (q->cons_tail != q->cons_head) {
 		struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
 		unsigned int idx = q->cons_tail & q->ring_mask;
 
-		if (xskq_is_valid_id(q, ring->desc[idx]))
-			return &ring->desc[idx];
+		*id = READ_ONCE(ring->desc[idx]);
+		if (xskq_is_valid_id(q, *id))
+			return id;
 
 		q->cons_tail++;
 	}
@@ -100,28 +101,22 @@ static inline u32 *xskq_validate_id(struct xsk_queue *q)
 	return NULL;
 }
 
-static inline u32 *xskq_peek_id(struct xsk_queue *q)
+static inline u32 *xskq_peek_id(struct xsk_queue *q, u32 *id)
 {
-	struct xdp_umem_ring *ring;
-
 	if (q->cons_tail == q->cons_head) {
 		WRITE_ONCE(q->ring->consumer, q->cons_tail);
 		q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE);
 
 		/* Order consumer and data */
 		smp_rmb();
-
-		return xskq_validate_id(q);
 	}
 
-	ring = (struct xdp_umem_ring *)q->ring;
-	return &ring->desc[q->cons_tail & q->ring_mask];
+	return xskq_validate_id(q, id);
 }
 
 static inline void xskq_discard_id(struct xsk_queue *q)
 {
 	q->cons_tail++;
-	(void)xskq_validate_id(q);
 }
 
 static inline int xskq_produce_id(struct xsk_queue *q, u32 id)
@@ -174,11 +169,9 @@ static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q,
 		struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
 		unsigned int idx = q->cons_tail & q->ring_mask;
 
-		if (xskq_is_valid_desc(q, &ring->desc[idx])) {
-			if (desc)
-				*desc = ring->desc[idx];
+		*desc = READ_ONCE(ring->desc[idx]);
+		if (xskq_is_valid_desc(q, desc))
 			return desc;
-		}
 
 		q->cons_tail++;
 	}
@@ -189,27 +182,20 @@ static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q,
 static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q,
 					      struct xdp_desc *desc)
 {
-	struct xdp_rxtx_ring *ring;
-
 	if (q->cons_tail == q->cons_head) {
 		WRITE_ONCE(q->ring->consumer, q->cons_tail);
 		q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE);
 
 		/* Order consumer and data */
 		smp_rmb();
-
-		return xskq_validate_desc(q, desc);
 	}
 
-	ring = (struct xdp_rxtx_ring *)q->ring;
-	*desc = ring->desc[q->cons_tail & q->ring_mask];
-	return desc;
+	return xskq_validate_desc(q, desc);
 }
 
 static inline void xskq_discard_desc(struct xsk_queue *q)
 {
 	q->cons_tail++;
-	(void)xskq_validate_desc(q, NULL);
 }
 
 static inline int xskq_produce_batch_desc(struct xsk_queue *q,
-- 
cgit v1.2.3-59-g8ed1b


From a509a95536a86ef84deb16c656d741437791b414 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Mon, 4 Jun 2018 13:57:12 +0200
Subject: xsk: proper Rx drop statistics update
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, rx_dropped could be updated incorrectly, e.g. if the XDP
program redirected the frame to a socket bound to a different queue
than where the XDP program was executing.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/xdp/xsk.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 43554eb56fe6..966307ce4b8e 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -48,8 +48,10 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
 		return -EINVAL;
 
-	if (!xskq_peek_id(xs->umem->fq, &id))
+	if (!xskq_peek_id(xs->umem->fq, &id)) {
+		xs->rx_dropped++;
 		return -ENOSPC;
+	}
 
 	buffer = xdp_umem_get_data_with_headroom(xs->umem, id);
 	memcpy(buffer, xdp->data, len);
@@ -57,6 +59,8 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 				      xs->umem->frame_headroom);
 	if (!err)
 		xskq_discard_id(xs->umem->fq);
+	else
+		xs->rx_dropped++;
 
 	return err;
 }
@@ -68,8 +72,6 @@ int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 	err = __xsk_rcv(xs, xdp);
 	if (likely(!err))
 		xdp_return_buff(xdp);
-	else
-		xs->rx_dropped++;
 
 	return err;
 }
@@ -87,8 +89,6 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 	err = __xsk_rcv(xs, xdp);
 	if (!err)
 		xsk_flush(xs);
-	else
-		xs->rx_dropped++;
 
 	return err;
 }
-- 
cgit v1.2.3-59-g8ed1b


From bbff2f321a864ee07c9d3d1245af498023146951 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Mon, 4 Jun 2018 13:57:13 +0200
Subject: xsk: new descriptor addressing scheme
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, AF_XDP only supports a fixed frame-size memory scheme where
each frame is referenced via an index (idx). A user passes the frame
index to the kernel, and the kernel acts upon the data.  Some NICs,
however, do not have a fixed frame-size model, instead they have a
model where a memory window is passed to the hardware and multiple
frames are filled into that window (referred to as the "type-writer"
model).

By changing the descriptor format from the current frame index
addressing scheme, AF_XDP can in the future be extended to support
these kinds of NICs.

In the index-based model, an idx refers to a frame of size
frame_size. Addressing a frame in the UMEM is done by offseting the
UMEM starting address by a global offset, idx * frame_size + offset.
Communicating via the fill- and completion-rings are done by means of
idx.

In this commit, the idx is removed in favor of an address (addr),
which is a relative address ranging over the UMEM. To convert an
idx-based address to the new addr is simply: addr = idx * frame_size +
offset.

We also stop referring to the UMEM "frame" as a frame. Instead it is
simply called a chunk.

To transfer ownership of a chunk to the kernel, the addr of the chunk
is passed in the fill-ring. Note, that the kernel will mask addr to
make it chunk aligned, so there is no need for userspace to do
that. E.g., for a chunk size of 2k, passing an addr of 2048, 2050 or
3000 to the fill-ring will refer to the same chunk.

On the completion-ring, the addr will match that of the Tx descriptor,
passed to the kernel.

Changing the descriptor format to use chunks/addr will allow for
future changes to move to a type-writer based model, where multiple
frames can reside in one chunk. In this model passing one single chunk
into the fill-ring, would potentially result in multiple Rx
descriptors.

This commit changes the uapi of AF_XDP sockets, and updates the
documentation.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 Documentation/networking/af_xdp.rst | 101 +++++++++++++++++++++---------------
 include/uapi/linux/if_xdp.h         |  12 ++---
 net/xdp/xdp_umem.c                  |  33 ++++++------
 net/xdp/xdp_umem.h                  |  27 +++-------
 net/xdp/xdp_umem_props.h            |   4 +-
 net/xdp/xsk.c                       |  30 ++++++-----
 net/xdp/xsk_queue.c                 |   2 +-
 net/xdp/xsk_queue.h                 |  43 +++++++--------
 8 files changed, 123 insertions(+), 129 deletions(-)

diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst
index 91928d9ee4bf..ff929cfab4f4 100644
--- a/Documentation/networking/af_xdp.rst
+++ b/Documentation/networking/af_xdp.rst
@@ -12,7 +12,7 @@ packet processing.
 
 This document assumes that the reader is familiar with BPF and XDP. If
 not, the Cilium project has an excellent reference guide at
-http://cilium.readthedocs.io/en/doc-1.0/bpf/.
+http://cilium.readthedocs.io/en/latest/bpf/.
 
 Using the XDP_REDIRECT action from an XDP program, the program can
 redirect ingress frames to other XDP enabled netdevs, using the
@@ -33,22 +33,22 @@ for a while due to a possible retransmit, the descriptor that points
 to that packet can be changed to point to another and reused right
 away. This again avoids copying data.
 
-The UMEM consists of a number of equally size frames and each frame
-has a unique frame id. A descriptor in one of the rings references a
-frame by referencing its frame id. The user space allocates memory for
-this UMEM using whatever means it feels is most appropriate (malloc,
-mmap, huge pages, etc). This memory area is then registered with the
-kernel using the new setsockopt XDP_UMEM_REG. The UMEM also has two
-rings: the FILL ring and the COMPLETION ring. The fill ring is used by
-the application to send down frame ids for the kernel to fill in with
-RX packet data. References to these frames will then appear in the RX
-ring once each packet has been received. The completion ring, on the
-other hand, contains frame ids that the kernel has transmitted
-completely and can now be used again by user space, for either TX or
-RX. Thus, the frame ids appearing in the completion ring are ids that
-were previously transmitted using the TX ring. In summary, the RX and
-FILL rings are used for the RX path and the TX and COMPLETION rings
-are used for the TX path.
+The UMEM consists of a number of equally sized chunks. A descriptor in
+one of the rings references a frame by referencing its addr. The addr
+is simply an offset within the entire UMEM region. The user space
+allocates memory for this UMEM using whatever means it feels is most
+appropriate (malloc, mmap, huge pages, etc). This memory area is then
+registered with the kernel using the new setsockopt XDP_UMEM_REG. The
+UMEM also has two rings: the FILL ring and the COMPLETION ring. The
+fill ring is used by the application to send down addr for the kernel
+to fill in with RX packet data. References to these frames will then
+appear in the RX ring once each packet has been received. The
+completion ring, on the other hand, contains frame addr that the
+kernel has transmitted completely and can now be used again by user
+space, for either TX or RX. Thus, the frame addrs appearing in the
+completion ring are addrs that were previously transmitted using the
+TX ring. In summary, the RX and FILL rings are used for the RX path
+and the TX and COMPLETION rings are used for the TX path.
 
 The socket is then finally bound with a bind() call to a device and a
 specific queue id on that device, and it is not until bind is
@@ -59,13 +59,13 @@ wants to do this, it simply skips the registration of the UMEM and its
 corresponding two rings, sets the XDP_SHARED_UMEM flag in the bind
 call and submits the XSK of the process it would like to share UMEM
 with as well as its own newly created XSK socket. The new process will
-then receive frame id references in its own RX ring that point to this
-shared UMEM. Note that since the ring structures are single-consumer /
-single-producer (for performance reasons), the new process has to
-create its own socket with associated RX and TX rings, since it cannot
-share this with the other process. This is also the reason that there
-is only one set of FILL and COMPLETION rings per UMEM. It is the
-responsibility of a single process to handle the UMEM.
+then receive frame addr references in its own RX ring that point to
+this shared UMEM. Note that since the ring structures are
+single-consumer / single-producer (for performance reasons), the new
+process has to create its own socket with associated RX and TX rings,
+since it cannot share this with the other process. This is also the
+reason that there is only one set of FILL and COMPLETION rings per
+UMEM. It is the responsibility of a single process to handle the UMEM.
 
 How is then packets distributed from an XDP program to the XSKs? There
 is a BPF map called XSKMAP (or BPF_MAP_TYPE_XSKMAP in full). The
@@ -102,10 +102,10 @@ UMEM
 
 UMEM is a region of virtual contiguous memory, divided into
 equal-sized frames. An UMEM is associated to a netdev and a specific
-queue id of that netdev. It is created and configured (frame size,
-frame headroom, start address and size) by using the XDP_UMEM_REG
-setsockopt system call. A UMEM is bound to a netdev and queue id, via
-the bind() system call.
+queue id of that netdev. It is created and configured (chunk size,
+headroom, start address and size) by using the XDP_UMEM_REG setsockopt
+system call. A UMEM is bound to a netdev and queue id, via the bind()
+system call.
 
 An AF_XDP is socket linked to a single UMEM, but one UMEM can have
 multiple AF_XDP sockets. To share an UMEM created via one socket A,
@@ -147,13 +147,17 @@ UMEM Fill Ring
 ~~~~~~~~~~~~~~
 
 The Fill ring is used to transfer ownership of UMEM frames from
-user-space to kernel-space. The UMEM indicies are passed in the
-ring. As an example, if the UMEM is 64k and each frame is 4k, then the
-UMEM has 16 frames and can pass indicies between 0 and 15.
+user-space to kernel-space. The UMEM addrs are passed in the ring. As
+an example, if the UMEM is 64k and each chunk is 4k, then the UMEM has
+16 chunks and can pass addrs between 0 and 64k.
 
 Frames passed to the kernel are used for the ingress path (RX rings).
 
-The user application produces UMEM indicies to this ring.
+The user application produces UMEM addrs to this ring. Note that the
+kernel will mask the incoming addr. E.g. for a chunk size of 2k, the
+log2(2048) LSB of the addr will be masked off, meaning that 2048, 2050
+and 3000 refers to the same chunk.
+
 
 UMEM Completetion Ring
 ~~~~~~~~~~~~~~~~~~~~~~
@@ -165,16 +169,15 @@ used.
 Frames passed from the kernel to user-space are frames that has been
 sent (TX ring) and can be used by user-space again.
 
-The user application consumes UMEM indicies from this ring.
+The user application consumes UMEM addrs from this ring.
 
 
 RX Ring
 ~~~~~~~
 
 The RX ring is the receiving side of a socket. Each entry in the ring
-is a struct xdp_desc descriptor. The descriptor contains UMEM index
-(idx), the length of the data (len), the offset into the frame
-(offset).
+is a struct xdp_desc descriptor. The descriptor contains UMEM offset
+(addr) and the length of the data (len).
 
 If no frames have been passed to kernel via the Fill ring, no
 descriptors will (or can) appear on the RX ring.
@@ -221,38 +224,50 @@ side is xdpsock_user.c and the XDP side xdpsock_kern.c.
 
 Naive ring dequeue and enqueue could look like this::
 
+    // struct xdp_rxtx_ring {
+    // 	__u32 *producer;
+    // 	__u32 *consumer;
+    // 	struct xdp_desc *desc;
+    // };
+
+    // struct xdp_umem_ring {
+    // 	__u32 *producer;
+    // 	__u32 *consumer;
+    // 	__u64 *desc;
+    // };
+
     // typedef struct xdp_rxtx_ring RING;
     // typedef struct xdp_umem_ring RING;
 
     // typedef struct xdp_desc RING_TYPE;
-    // typedef __u32 RING_TYPE;
+    // typedef __u64 RING_TYPE;
 
     int dequeue_one(RING *ring, RING_TYPE *item)
     {
-        __u32 entries = ring->ptrs.producer - ring->ptrs.consumer;
+        __u32 entries = *ring->producer - *ring->consumer;
 
         if (entries == 0)
             return -1;
 
         // read-barrier!
 
-        *item = ring->desc[ring->ptrs.consumer & (RING_SIZE - 1)];
-        ring->ptrs.consumer++;
+        *item = ring->desc[*ring->consumer & (RING_SIZE - 1)];
+        (*ring->consumer)++;
         return 0;
     }
 
     int enqueue_one(RING *ring, const RING_TYPE *item)
     {
-        u32 free_entries = RING_SIZE - (ring->ptrs.producer - ring->ptrs.consumer);
+        u32 free_entries = RING_SIZE - (*ring->producer - *ring->consumer);
 
         if (free_entries == 0)
             return -1;
 
-        ring->desc[ring->ptrs.producer & (RING_SIZE - 1)] = *item;
+        ring->desc[*ring->producer & (RING_SIZE - 1)] = *item;
 
         // write-barrier!
 
-        ring->ptrs.producer++;
+        (*ring->producer)++;
         return 0;
     }
 
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 4737cfe222f5..e411d6f9ac65 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -48,8 +48,8 @@ struct xdp_mmap_offsets {
 struct xdp_umem_reg {
 	__u64 addr; /* Start of packet data area */
 	__u64 len; /* Length of packet data area */
-	__u32 frame_size; /* Frame size */
-	__u32 frame_headroom; /* Frame head room */
+	__u32 chunk_size;
+	__u32 headroom;
 };
 
 struct xdp_statistics {
@@ -66,13 +66,11 @@ struct xdp_statistics {
 
 /* Rx/Tx descriptor */
 struct xdp_desc {
-	__u32 idx;
+	__u64 addr;
 	__u32 len;
-	__u16 offset;
-	__u8 flags;
-	__u8 padding[5];
+	__u32 options;
 };
 
-/* UMEM descriptor is __u32 */
+/* UMEM descriptor is __u64 */
 
 #endif /* _LINUX_IF_XDP_H */
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 87998818116f..9ad791ff4739 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -14,7 +14,7 @@
 
 #include "xdp_umem.h"
 
-#define XDP_UMEM_MIN_FRAME_SIZE 2048
+#define XDP_UMEM_MIN_CHUNK_SIZE 2048
 
 static void xdp_umem_unpin_pages(struct xdp_umem *umem)
 {
@@ -151,12 +151,12 @@ static int xdp_umem_account_pages(struct xdp_umem *umem)
 
 static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 {
-	u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom;
+	u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
+	unsigned int chunks, chunks_per_page;
 	u64 addr = mr->addr, size = mr->len;
-	unsigned int nframes, nfpp;
 	int size_chk, err;
 
-	if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) {
+	if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
 		/* Strictly speaking we could support this, if:
 		 * - huge pages, or*
 		 * - using an IOMMU, or
@@ -166,7 +166,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 		return -EINVAL;
 	}
 
-	if (!is_power_of_2(frame_size))
+	if (!is_power_of_2(chunk_size))
 		return -EINVAL;
 
 	if (!PAGE_ALIGNED(addr)) {
@@ -179,33 +179,30 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 	if ((addr + size) < addr)
 		return -EINVAL;
 
-	nframes = (unsigned int)div_u64(size, frame_size);
-	if (nframes == 0 || nframes > UINT_MAX)
+	chunks = (unsigned int)div_u64(size, chunk_size);
+	if (chunks == 0)
 		return -EINVAL;
 
-	nfpp = PAGE_SIZE / frame_size;
-	if (nframes < nfpp || nframes % nfpp)
+	chunks_per_page = PAGE_SIZE / chunk_size;
+	if (chunks < chunks_per_page || chunks % chunks_per_page)
 		return -EINVAL;
 
-	frame_headroom = ALIGN(frame_headroom, 64);
+	headroom = ALIGN(headroom, 64);
 
-	size_chk = frame_size - frame_headroom - XDP_PACKET_HEADROOM;
+	size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM;
 	if (size_chk < 0)
 		return -EINVAL;
 
 	umem->pid = get_task_pid(current, PIDTYPE_PID);
-	umem->size = (size_t)size;
 	umem->address = (unsigned long)addr;
-	umem->props.frame_size = frame_size;
-	umem->props.nframes = nframes;
-	umem->frame_headroom = frame_headroom;
+	umem->props.chunk_mask = ~((u64)chunk_size - 1);
+	umem->props.size = size;
+	umem->headroom = headroom;
+	umem->chunk_size_nohr = chunk_size - headroom;
 	umem->npgs = size / PAGE_SIZE;
 	umem->pgs = NULL;
 	umem->user = NULL;
 
-	umem->frame_size_log2 = ilog2(frame_size);
-	umem->nfpp_mask = nfpp - 1;
-	umem->nfpplog2 = ilog2(nfpp);
 	refcount_set(&umem->users, 1);
 
 	err = xdp_umem_account_pages(umem);
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index 0881cf456230..aeadd1bcb72d 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -18,35 +18,20 @@ struct xdp_umem {
 	struct xsk_queue *cq;
 	struct page **pgs;
 	struct xdp_umem_props props;
-	u32 npgs;
-	u32 frame_headroom;
-	u32 nfpp_mask;
-	u32 nfpplog2;
-	u32 frame_size_log2;
+	u32 headroom;
+	u32 chunk_size_nohr;
 	struct user_struct *user;
 	struct pid *pid;
 	unsigned long address;
-	size_t size;
 	refcount_t users;
 	struct work_struct work;
+	u32 npgs;
 };
 
-static inline char *xdp_umem_get_data(struct xdp_umem *umem, u32 idx)
-{
-	u64 pg, off;
-	char *data;
-
-	pg = idx >> umem->nfpplog2;
-	off = (idx & umem->nfpp_mask) << umem->frame_size_log2;
-
-	data = page_address(umem->pgs[pg]);
-	return data + off;
-}
-
-static inline char *xdp_umem_get_data_with_headroom(struct xdp_umem *umem,
-						    u32 idx)
+static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 {
-	return xdp_umem_get_data(umem, idx) + umem->frame_headroom;
+	return page_address(umem->pgs[addr >> PAGE_SHIFT]) +
+		(addr & (PAGE_SIZE - 1));
 }
 
 bool xdp_umem_validate_queues(struct xdp_umem *umem);
diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h
index 2cf8ec485fd2..40eab10dfc49 100644
--- a/net/xdp/xdp_umem_props.h
+++ b/net/xdp/xdp_umem_props.h
@@ -7,8 +7,8 @@
 #define XDP_UMEM_PROPS_H_
 
 struct xdp_umem_props {
-	u32 frame_size;
-	u32 nframes;
+	u64 chunk_mask;
+	u64 size;
 };
 
 #endif /* XDP_UMEM_PROPS_H_ */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 966307ce4b8e..4688c750df1d 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -41,24 +41,27 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
 
 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
-	u32 id, len = xdp->data_end - xdp->data;
+	u32 len = xdp->data_end - xdp->data;
 	void *buffer;
+	u64 addr;
 	int err;
 
 	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
 		return -EINVAL;
 
-	if (!xskq_peek_id(xs->umem->fq, &id)) {
+	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
+	    len > xs->umem->chunk_size_nohr) {
 		xs->rx_dropped++;
 		return -ENOSPC;
 	}
 
-	buffer = xdp_umem_get_data_with_headroom(xs->umem, id);
+	addr += xs->umem->headroom;
+
+	buffer = xdp_umem_get_data(xs->umem, addr);
 	memcpy(buffer, xdp->data, len);
-	err = xskq_produce_batch_desc(xs->rx, id, len,
-				      xs->umem->frame_headroom);
+	err = xskq_produce_batch_desc(xs->rx, addr, len);
 	if (!err)
-		xskq_discard_id(xs->umem->fq);
+		xskq_discard_addr(xs->umem->fq);
 	else
 		xs->rx_dropped++;
 
@@ -95,10 +98,10 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 
 static void xsk_destruct_skb(struct sk_buff *skb)
 {
-	u32 id = (u32)(long)skb_shinfo(skb)->destructor_arg;
+	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
 	struct xdp_sock *xs = xdp_sk(skb->sk);
 
-	WARN_ON_ONCE(xskq_produce_id(xs->umem->cq, id));
+	WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
 
 	sock_wfree(skb);
 }
@@ -123,14 +126,15 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
 
 	while (xskq_peek_desc(xs->tx, &desc)) {
 		char *buffer;
-		u32 id, len;
+		u64 addr;
+		u32 len;
 
 		if (max_batch-- == 0) {
 			err = -EAGAIN;
 			goto out;
 		}
 
-		if (xskq_reserve_id(xs->umem->cq)) {
+		if (xskq_reserve_addr(xs->umem->cq)) {
 			err = -EAGAIN;
 			goto out;
 		}
@@ -153,8 +157,8 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
 		}
 
 		skb_put(skb, len);
-		id = desc.idx;
-		buffer = xdp_umem_get_data(xs->umem, id) + desc.offset;
+		addr = desc.addr;
+		buffer = xdp_umem_get_data(xs->umem, addr);
 		err = skb_store_bits(skb, 0, buffer, len);
 		if (unlikely(err)) {
 			kfree_skb(skb);
@@ -164,7 +168,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
 		skb->dev = xs->dev;
 		skb->priority = sk->sk_priority;
 		skb->mark = sk->sk_mark;
-		skb_shinfo(skb)->destructor_arg = (void *)(long)id;
+		skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
 		skb->destructor = xsk_destruct_skb;
 
 		err = dev_direct_xmit(skb, xs->queue_id);
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
index ebe85e59507e..6c32e92e98fc 100644
--- a/net/xdp/xsk_queue.c
+++ b/net/xdp/xsk_queue.c
@@ -17,7 +17,7 @@ void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props)
 
 static u32 xskq_umem_get_ring_size(struct xsk_queue *q)
 {
-	return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u32);
+	return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u64);
 }
 
 static u32 xskq_rxtx_get_ring_size(struct xsk_queue *q)
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index b5924e7aeb2b..337e5ad3b10e 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -27,7 +27,7 @@ struct xdp_rxtx_ring {
 /* Used for the fill and completion queues for buffers */
 struct xdp_umem_ring {
 	struct xdp_ring ptrs;
-	u32 desc[0] ____cacheline_aligned_in_smp;
+	u64 desc[0] ____cacheline_aligned_in_smp;
 };
 
 struct xsk_queue {
@@ -76,24 +76,25 @@ static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt)
 
 /* UMEM queue */
 
-static inline bool xskq_is_valid_id(struct xsk_queue *q, u32 idx)
+static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
 {
-	if (unlikely(idx >= q->umem_props.nframes)) {
+	if (addr >= q->umem_props.size) {
 		q->invalid_descs++;
 		return false;
 	}
+
 	return true;
 }
 
-static inline u32 *xskq_validate_id(struct xsk_queue *q, u32 *id)
+static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr)
 {
 	while (q->cons_tail != q->cons_head) {
 		struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
 		unsigned int idx = q->cons_tail & q->ring_mask;
 
-		*id = READ_ONCE(ring->desc[idx]);
-		if (xskq_is_valid_id(q, *id))
-			return id;
+		*addr = READ_ONCE(ring->desc[idx]) & q->umem_props.chunk_mask;
+		if (xskq_is_valid_addr(q, *addr))
+			return addr;
 
 		q->cons_tail++;
 	}
@@ -101,7 +102,7 @@ static inline u32 *xskq_validate_id(struct xsk_queue *q, u32 *id)
 	return NULL;
 }
 
-static inline u32 *xskq_peek_id(struct xsk_queue *q, u32 *id)
+static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr)
 {
 	if (q->cons_tail == q->cons_head) {
 		WRITE_ONCE(q->ring->consumer, q->cons_tail);
@@ -111,19 +112,19 @@ static inline u32 *xskq_peek_id(struct xsk_queue *q, u32 *id)
 		smp_rmb();
 	}
 
-	return xskq_validate_id(q, id);
+	return xskq_validate_addr(q, addr);
 }
 
-static inline void xskq_discard_id(struct xsk_queue *q)
+static inline void xskq_discard_addr(struct xsk_queue *q)
 {
 	q->cons_tail++;
 }
 
-static inline int xskq_produce_id(struct xsk_queue *q, u32 id)
+static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr)
 {
 	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
 
-	ring->desc[q->prod_tail++ & q->ring_mask] = id;
+	ring->desc[q->prod_tail++ & q->ring_mask] = addr;
 
 	/* Order producer and data */
 	smp_wmb();
@@ -132,7 +133,7 @@ static inline int xskq_produce_id(struct xsk_queue *q, u32 id)
 	return 0;
 }
 
-static inline int xskq_reserve_id(struct xsk_queue *q)
+static inline int xskq_reserve_addr(struct xsk_queue *q)
 {
 	if (xskq_nb_free(q, q->prod_head, 1) == 0)
 		return -ENOSPC;
@@ -145,16 +146,11 @@ static inline int xskq_reserve_id(struct xsk_queue *q)
 
 static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d)
 {
-	u32 buff_len;
-
-	if (unlikely(d->idx >= q->umem_props.nframes)) {
-		q->invalid_descs++;
+	if (!xskq_is_valid_addr(q, d->addr))
 		return false;
-	}
 
-	buff_len = q->umem_props.frame_size;
-	if (unlikely(d->len > buff_len || d->len == 0 ||
-		     d->offset > buff_len || d->offset + d->len > buff_len)) {
+	if (((d->addr + d->len) & q->umem_props.chunk_mask) !=
+	    (d->addr & q->umem_props.chunk_mask)) {
 		q->invalid_descs++;
 		return false;
 	}
@@ -199,7 +195,7 @@ static inline void xskq_discard_desc(struct xsk_queue *q)
 }
 
 static inline int xskq_produce_batch_desc(struct xsk_queue *q,
-					  u32 id, u32 len, u16 offset)
+					  u64 addr, u32 len)
 {
 	struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
 	unsigned int idx;
@@ -208,9 +204,8 @@ static inline int xskq_produce_batch_desc(struct xsk_queue *q,
 		return -ENOSPC;
 
 	idx = (q->prod_head++) & q->ring_mask;
-	ring->desc[idx].idx = id;
+	ring->desc[idx].addr = addr;
 	ring->desc[idx].len = len;
-	ring->desc[idx].offset = offset;
 
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From a412ef54fc2eb81bb55428dcdcdaa2e38ae9bba5 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Mon, 4 Jun 2018 13:57:14 +0200
Subject: samples/bpf: adapted to new uapi
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Here, the xdpsock sample application is adjusted to the new descriptor
format.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/bpf/xdpsock_user.c | 84 ++++++++++++++++++++--------------------------
 1 file changed, 36 insertions(+), 48 deletions(-)

diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
index e379eac034ac..b71a342b9082 100644
--- a/samples/bpf/xdpsock_user.c
+++ b/samples/bpf/xdpsock_user.c
@@ -46,6 +46,7 @@
 
 #define NUM_FRAMES 131072
 #define FRAME_HEADROOM 0
+#define FRAME_SHIFT 11
 #define FRAME_SIZE 2048
 #define NUM_DESCS 1024
 #define BATCH_SIZE 16
@@ -55,6 +56,7 @@
 
 #define DEBUG_HEXDUMP 0
 
+typedef __u64 u64;
 typedef __u32 u32;
 
 static unsigned long prev_time;
@@ -81,12 +83,12 @@ struct xdp_umem_uqueue {
 	u32 size;
 	u32 *producer;
 	u32 *consumer;
-	u32 *ring;
+	u64 *ring;
 	void *map;
 };
 
 struct xdp_umem {
-	char (*frames)[FRAME_SIZE];
+	char *frames;
 	struct xdp_umem_uqueue fq;
 	struct xdp_umem_uqueue cq;
 	int fd;
@@ -214,7 +216,7 @@ static inline int umem_fill_to_kernel_ex(struct xdp_umem_uqueue *fq,
 	for (i = 0; i < nb; i++) {
 		u32 idx = fq->cached_prod++ & fq->mask;
 
-		fq->ring[idx] = d[i].idx;
+		fq->ring[idx] = d[i].addr;
 	}
 
 	u_smp_wmb();
@@ -224,7 +226,7 @@ static inline int umem_fill_to_kernel_ex(struct xdp_umem_uqueue *fq,
 	return 0;
 }
 
-static inline int umem_fill_to_kernel(struct xdp_umem_uqueue *fq, u32 *d,
+static inline int umem_fill_to_kernel(struct xdp_umem_uqueue *fq, u64 *d,
 				      size_t nb)
 {
 	u32 i;
@@ -246,7 +248,7 @@ static inline int umem_fill_to_kernel(struct xdp_umem_uqueue *fq, u32 *d,
 }
 
 static inline size_t umem_complete_from_kernel(struct xdp_umem_uqueue *cq,
-					       u32 *d, size_t nb)
+					       u64 *d, size_t nb)
 {
 	u32 idx, i, entries = umem_nb_avail(cq, nb);
 
@@ -266,10 +268,9 @@ static inline size_t umem_complete_from_kernel(struct xdp_umem_uqueue *cq,
 	return entries;
 }
 
-static inline void *xq_get_data(struct xdpsock *xsk, __u32 idx, __u32 off)
+static inline void *xq_get_data(struct xdpsock *xsk, u64 addr)
 {
-	lassert(idx < NUM_FRAMES);
-	return &xsk->umem->frames[idx][off];
+	return &xsk->umem->frames[addr];
 }
 
 static inline int xq_enq(struct xdp_uqueue *uq,
@@ -285,9 +286,8 @@ static inline int xq_enq(struct xdp_uqueue *uq,
 	for (i = 0; i < ndescs; i++) {
 		u32 idx = uq->cached_prod++ & uq->mask;
 
-		r[idx].idx = descs[i].idx;
+		r[idx].addr = descs[i].addr;
 		r[idx].len = descs[i].len;
-		r[idx].offset = descs[i].offset;
 	}
 
 	u_smp_wmb();
@@ -297,7 +297,7 @@ static inline int xq_enq(struct xdp_uqueue *uq,
 }
 
 static inline int xq_enq_tx_only(struct xdp_uqueue *uq,
-				 __u32 idx, unsigned int ndescs)
+				 unsigned int id, unsigned int ndescs)
 {
 	struct xdp_desc *r = uq->ring;
 	unsigned int i;
@@ -308,9 +308,8 @@ static inline int xq_enq_tx_only(struct xdp_uqueue *uq,
 	for (i = 0; i < ndescs; i++) {
 		u32 idx = uq->cached_prod++ & uq->mask;
 
-		r[idx].idx	= idx + i;
+		r[idx].addr	= (id + i) << FRAME_SHIFT;
 		r[idx].len	= sizeof(pkt_data) - 1;
-		r[idx].offset	= 0;
 	}
 
 	u_smp_wmb();
@@ -357,17 +356,21 @@ static void swap_mac_addresses(void *data)
 	*dst_addr = tmp;
 }
 
-#if DEBUG_HEXDUMP
-static void hex_dump(void *pkt, size_t length, const char *prefix)
+static void hex_dump(void *pkt, size_t length, u64 addr)
 {
-	int i = 0;
 	const unsigned char *address = (unsigned char *)pkt;
 	const unsigned char *line = address;
 	size_t line_size = 32;
 	unsigned char c;
+	char buf[32];
+	int i = 0;
 
+	if (!DEBUG_HEXDUMP)
+		return;
+
+	sprintf(buf, "addr=%llu", addr);
 	printf("length = %zu\n", length);
-	printf("%s | ", prefix);
+	printf("%s | ", buf);
 	while (length-- > 0) {
 		printf("%02X ", *address++);
 		if (!(++i % line_size) || (length == 0 && i % line_size)) {
@@ -382,12 +385,11 @@ static void hex_dump(void *pkt, size_t length, const char *prefix)
 			}
 			printf("\n");
 			if (length > 0)
-				printf("%s | ", prefix);
+				printf("%s | ", buf);
 		}
 	}
 	printf("\n");
 }
-#endif
 
 static size_t gen_eth_frame(char *frame)
 {
@@ -412,8 +414,8 @@ static struct xdp_umem *xdp_umem_configure(int sfd)
 
 	mr.addr = (__u64)bufs;
 	mr.len = NUM_FRAMES * FRAME_SIZE;
-	mr.frame_size = FRAME_SIZE;
-	mr.frame_headroom = FRAME_HEADROOM;
+	mr.chunk_size = FRAME_SIZE;
+	mr.headroom = FRAME_HEADROOM;
 
 	lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)) == 0);
 	lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_FILL_RING, &fq_size,
@@ -426,7 +428,7 @@ static struct xdp_umem *xdp_umem_configure(int sfd)
 			   &optlen) == 0);
 
 	umem->fq.map = mmap(0, off.fr.desc +
-			    FQ_NUM_DESCS * sizeof(u32),
+			    FQ_NUM_DESCS * sizeof(u64),
 			    PROT_READ | PROT_WRITE,
 			    MAP_SHARED | MAP_POPULATE, sfd,
 			    XDP_UMEM_PGOFF_FILL_RING);
@@ -439,7 +441,7 @@ static struct xdp_umem *xdp_umem_configure(int sfd)
 	umem->fq.ring = umem->fq.map + off.fr.desc;
 
 	umem->cq.map = mmap(0, off.cr.desc +
-			     CQ_NUM_DESCS * sizeof(u32),
+			     CQ_NUM_DESCS * sizeof(u64),
 			     PROT_READ | PROT_WRITE,
 			     MAP_SHARED | MAP_POPULATE, sfd,
 			     XDP_UMEM_PGOFF_COMPLETION_RING);
@@ -451,14 +453,14 @@ static struct xdp_umem *xdp_umem_configure(int sfd)
 	umem->cq.consumer = umem->cq.map + off.cr.consumer;
 	umem->cq.ring = umem->cq.map + off.cr.desc;
 
-	umem->frames = (char (*)[FRAME_SIZE])bufs;
+	umem->frames = bufs;
 	umem->fd = sfd;
 
 	if (opt_bench == BENCH_TXONLY) {
 		int i;
 
-		for (i = 0; i < NUM_FRAMES; i++)
-			(void)gen_eth_frame(&umem->frames[i][0]);
+		for (i = 0; i < NUM_FRAMES * FRAME_SIZE; i += FRAME_SIZE)
+			(void)gen_eth_frame(&umem->frames[i]);
 	}
 
 	return umem;
@@ -472,7 +474,7 @@ static struct xdpsock *xsk_configure(struct xdp_umem *umem)
 	struct xdpsock *xsk;
 	bool shared = true;
 	socklen_t optlen;
-	u32 i;
+	u64 i;
 
 	sfd = socket(PF_XDP, SOCK_RAW, 0);
 	lassert(sfd >= 0);
@@ -508,7 +510,7 @@ static struct xdpsock *xsk_configure(struct xdp_umem *umem)
 	lassert(xsk->rx.map != MAP_FAILED);
 
 	if (!shared) {
-		for (i = 0; i < NUM_DESCS / 2; i++)
+		for (i = 0; i < NUM_DESCS * FRAME_SIZE; i += FRAME_SIZE)
 			lassert(umem_fill_to_kernel(&xsk->umem->fq, &i, 1)
 				== 0);
 	}
@@ -727,7 +729,7 @@ static void kick_tx(int fd)
 
 static inline void complete_tx_l2fwd(struct xdpsock *xsk)
 {
-	u32 descs[BATCH_SIZE];
+	u64 descs[BATCH_SIZE];
 	unsigned int rcvd;
 	size_t ndescs;
 
@@ -749,7 +751,7 @@ static inline void complete_tx_l2fwd(struct xdpsock *xsk)
 
 static inline void complete_tx_only(struct xdpsock *xsk)
 {
-	u32 descs[BATCH_SIZE];
+	u64 descs[BATCH_SIZE];
 	unsigned int rcvd;
 
 	if (!xsk->outstanding_tx)
@@ -774,17 +776,9 @@ static void rx_drop(struct xdpsock *xsk)
 		return;
 
 	for (i = 0; i < rcvd; i++) {
-		u32 idx = descs[i].idx;
-
-		lassert(idx < NUM_FRAMES);
-#if DEBUG_HEXDUMP
-		char *pkt;
-		char buf[32];
+		char *pkt = xq_get_data(xsk, descs[i].addr);
 
-		pkt = xq_get_data(xsk, idx, descs[i].offset);
-		sprintf(buf, "idx=%d", idx);
-		hex_dump(pkt, descs[i].len, buf);
-#endif
+		hex_dump(pkt, descs[i].len, descs[i].addr);
 	}
 
 	xsk->rx_npkts += rcvd;
@@ -867,17 +861,11 @@ static void l2fwd(struct xdpsock *xsk)
 		}
 
 		for (i = 0; i < rcvd; i++) {
-			char *pkt = xq_get_data(xsk, descs[i].idx,
-						descs[i].offset);
+			char *pkt = xq_get_data(xsk, descs[i].addr);
 
 			swap_mac_addresses(pkt);
-#if DEBUG_HEXDUMP
-			char buf[32];
-			u32 idx = descs[i].idx;
 
-			sprintf(buf, "idx=%d", idx);
-			hex_dump(pkt, descs[i].len, buf);
-#endif
+			hex_dump(pkt, descs[i].len, descs[i].addr);
 		}
 
 		xsk->rx_npkts += rcvd;
-- 
cgit v1.2.3-59-g8ed1b


From a65ea68b8dd92fea86d2b8ca7e43caeaa5ddcdff Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Mon, 4 Jun 2018 13:57:15 +0200
Subject: samples/bpf: minor *_nb_free performance fix

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/bpf/xdpsock_user.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
index b71a342b9082..7494f60fbff8 100644
--- a/samples/bpf/xdpsock_user.c
+++ b/samples/bpf/xdpsock_user.c
@@ -157,15 +157,15 @@ static const char pkt_data[] =
 
 static inline u32 umem_nb_free(struct xdp_umem_uqueue *q, u32 nb)
 {
-	u32 free_entries = q->size - (q->cached_prod - q->cached_cons);
+	u32 free_entries = q->cached_cons - q->cached_prod;
 
 	if (free_entries >= nb)
 		return free_entries;
 
 	/* Refresh the local tail pointer */
-	q->cached_cons = *q->consumer;
+	q->cached_cons = *q->consumer + q->size;
 
-	return q->size - (q->cached_prod - q->cached_cons);
+	return q->cached_cons - q->cached_prod;
 }
 
 static inline u32 xq_nb_free(struct xdp_uqueue *q, u32 ndescs)
@@ -439,6 +439,7 @@ static struct xdp_umem *xdp_umem_configure(int sfd)
 	umem->fq.producer = umem->fq.map + off.fr.producer;
 	umem->fq.consumer = umem->fq.map + off.fr.consumer;
 	umem->fq.ring = umem->fq.map + off.fr.desc;
+	umem->fq.cached_cons = FQ_NUM_DESCS;
 
 	umem->cq.map = mmap(0, off.cr.desc +
 			     CQ_NUM_DESCS * sizeof(u64),
@@ -535,6 +536,7 @@ static struct xdpsock *xsk_configure(struct xdp_umem *umem)
 	xsk->tx.producer = xsk->tx.map + off.tx.producer;
 	xsk->tx.consumer = xsk->tx.map + off.tx.consumer;
 	xsk->tx.ring = xsk->tx.map + off.tx.desc;
+	xsk->tx.cached_cons = NUM_DESCS;
 
 	sxdp.sxdp_family = PF_XDP;
 	sxdp.sxdp_ifindex = opt_ifindex;
-- 
cgit v1.2.3-59-g8ed1b


From 76a45194e5392ef6642785a3aa083818bd8541dd Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Fri, 1 Jun 2018 16:28:34 +0100
Subject: net: aquantia: make function aq_fw2x_get_mac_permanent static

The function aq_fw2x_get_mac_permanent is local to the source and does
not need to be in global scope, so make it static.

Cleans up sparse warning:
warning: symbol 'aq_fw2x_get_mac_permanent' was not declared. Should it
be static?

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils_fw2x.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils_fw2x.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils_fw2x.c
index 8cfce95c82fc..39cd3a27fe77 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils_fw2x.c
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils_fw2x.c
@@ -107,7 +107,7 @@ static int aq_fw2x_update_link_status(struct aq_hw_s *self)
 	return 0;
 }
 
-int aq_fw2x_get_mac_permanent(struct aq_hw_s *self, u8 *mac)
+static int aq_fw2x_get_mac_permanent(struct aq_hw_s *self, u8 *mac)
 {
 	int err = 0;
 	u32 h = 0U;
-- 
cgit v1.2.3-59-g8ed1b


From eb73190f4fbeedf762394e92d6a4ec9ace684c88 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 1 Jun 2018 09:23:02 -0700
Subject: net/packet: refine check for priv area size

syzbot was able to trick af_packet again [1]

Various commits tried to address the problem in the past,
but failed to take into account V3 header size.

[1]

tpacket_rcv: packet too big, clamped from 72 to 4294967224. macoff=96
BUG: KASAN: use-after-free in prb_run_all_ft_ops net/packet/af_packet.c:1016 [inline]
BUG: KASAN: use-after-free in prb_fill_curr_block.isra.59+0x4e5/0x5c0 net/packet/af_packet.c:1039
Write of size 2 at addr ffff8801cb62000e by task kworker/1:2/2106

CPU: 1 PID: 2106 Comm: kworker/1:2 Not tainted 4.17.0-rc7+ #77
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Workqueue: ipv6_addrconf addrconf_dad_work
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1b9/0x294 lib/dump_stack.c:113
 print_address_description+0x6c/0x20b mm/kasan/report.c:256
 kasan_report_error mm/kasan/report.c:354 [inline]
 kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412
 __asan_report_store2_noabort+0x17/0x20 mm/kasan/report.c:436
 prb_run_all_ft_ops net/packet/af_packet.c:1016 [inline]
 prb_fill_curr_block.isra.59+0x4e5/0x5c0 net/packet/af_packet.c:1039
 __packet_lookup_frame_in_block net/packet/af_packet.c:1094 [inline]
 packet_current_rx_frame net/packet/af_packet.c:1117 [inline]
 tpacket_rcv+0x1866/0x3340 net/packet/af_packet.c:2282
 dev_queue_xmit_nit+0x891/0xb90 net/core/dev.c:2018
 xmit_one net/core/dev.c:3049 [inline]
 dev_hard_start_xmit+0x16b/0xc10 net/core/dev.c:3069
 __dev_queue_xmit+0x2724/0x34c0 net/core/dev.c:3584
 dev_queue_xmit+0x17/0x20 net/core/dev.c:3617
 neigh_resolve_output+0x679/0xad0 net/core/neighbour.c:1358
 neigh_output include/net/neighbour.h:482 [inline]
 ip6_finish_output2+0xc9c/0x2810 net/ipv6/ip6_output.c:120
 ip6_finish_output+0x5fe/0xbc0 net/ipv6/ip6_output.c:154
 NF_HOOK_COND include/linux/netfilter.h:277 [inline]
 ip6_output+0x227/0x9b0 net/ipv6/ip6_output.c:171
 dst_output include/net/dst.h:444 [inline]
 NF_HOOK include/linux/netfilter.h:288 [inline]
 ndisc_send_skb+0x100d/0x1570 net/ipv6/ndisc.c:491
 ndisc_send_ns+0x3c1/0x8d0 net/ipv6/ndisc.c:633
 addrconf_dad_work+0xbef/0x1340 net/ipv6/addrconf.c:4033
 process_one_work+0xc1e/0x1b50 kernel/workqueue.c:2145
 worker_thread+0x1cc/0x1440 kernel/workqueue.c:2279
 kthread+0x345/0x410 kernel/kthread.c:240
 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:412

The buggy address belongs to the page:
page:ffffea00072d8800 count:0 mapcount:-127 mapping:0000000000000000 index:0xffff8801cb620e80
flags: 0x2fffc0000000000()
raw: 02fffc0000000000 0000000000000000 ffff8801cb620e80 00000000ffffff80
raw: ffffea00072e3820 ffffea0007132d20 0000000000000002 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff8801cb61ff00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
 ffff8801cb61ff80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>ffff8801cb620000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
                      ^
 ffff8801cb620080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
 ffff8801cb620100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff

Fixes: 2b6867c2ce76 ("net/packet: fix overflow in check for priv area size")
Fixes: dc808110bb62 ("packet: handle too big packets for PACKET_V3")
Fixes: f6fb8f100b80 ("af-packet: TPACKET_V3 flexible buffer implementation.")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index b00aa959727d..082f981795f5 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -4250,7 +4250,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 			goto out;
 		if (po->tp_version >= TPACKET_V3 &&
 		    req->tp_block_size <=
-			  BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv))
+		    BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + sizeof(struct tpacket3_hdr))
 			goto out;
 		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
 					po->tp_reserve))
-- 
cgit v1.2.3-59-g8ed1b


From fe083b3f06e91cf8508f23922867e52348205825 Mon Sep 17 00:00:00 2001
From: kbuild test robot <fengguang.wu@intel.com>
Date: Sat, 2 Jun 2018 03:46:13 +0800
Subject: net: mvpp2: mvpp2_percpu_read_relaxed() can be static

Fixes: db9d7d36eecc ("net: mvpp2: Split the PPv2 driver to a dedicated directory")
Signed-off-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index 45622bffd81a..0319ed9ef8b8 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -141,7 +141,7 @@ void mvpp2_percpu_write_relaxed(struct mvpp2 *priv, int cpu,
 	writel_relaxed(data, priv->swth_base[cpu] + offset);
 }
 
-u32 mvpp2_percpu_read_relaxed(struct mvpp2 *priv, int cpu,
+static u32 mvpp2_percpu_read_relaxed(struct mvpp2 *priv, int cpu,
 				     u32 offset)
 {
 	return readl_relaxed(priv->swth_base[cpu] + offset);
-- 
cgit v1.2.3-59-g8ed1b


From fff200caf6f9179dd9a7fc67acd659e614c3f72f Mon Sep 17 00:00:00 2001
From: Benjamin Poirier <bpoirier@suse.com>
Date: Thu, 10 May 2018 16:28:35 +0900
Subject: e1000e: Ignore TSYNCRXCTL when getting I219 clock attributes

There have been multiple reports of crashes that look like
kernel: RIP: 0010:[<ffffffff8110303f>] timecounter_read+0xf/0x50
[...]
kernel: Call Trace:
kernel:  [<ffffffffa0806b0f>] e1000e_phc_gettime+0x2f/0x60 [e1000e]
kernel:  [<ffffffffa0806c5d>] e1000e_systim_overflow_work+0x1d/0x80 [e1000e]
kernel:  [<ffffffff810992c5>] process_one_work+0x155/0x440
kernel:  [<ffffffff81099e16>] worker_thread+0x116/0x4b0
kernel:  [<ffffffff8109f422>] kthread+0xd2/0xf0
kernel:  [<ffffffff8163184f>] ret_from_fork+0x3f/0x70

These can be traced back to the fact that e1000e_systim_reset() skips the
timecounter_init() call if e1000e_get_base_timinca() returns -EINVAL, which
leads to a null deref in timecounter_read().

Commit 83129b37ef35 ("e1000e: fix systim issues", v4.2-rc1) reworked
e1000e_get_base_timinca() in such a way that it can return -EINVAL for
e1000_pch_spt if the SYSCFI bit is not set in TSYNCRXCTL.

Some experimentation has shown that on I219 (e1000_pch_spt, "MAC: 12")
adapters, the E1000_TSYNCRXCTL_SYSCFI flag is unstable; TSYNCRXCTL reads
sometimes don't have the SYSCFI bit set. Retrying the read shortly after
finds the bit to be set. This was observed at boot (probe) but also link up
and link down.

Moreover, the phc (PTP Hardware Clock) seems to operate normally even after
reads where SYSCFI=0. Therefore, remove this register read and
unconditionally set the clock parameters.

Reported-by: Achim Mildenberger <admin@fph.physik.uni-karlsruhe.de>
Message-Id: <20180425065243.g5mqewg5irkwgwgv@f2>
Bugzilla: https://bugzilla.suse.com/show_bug.cgi?id=1075876
Fixes: 83129b37ef35 ("e1000e: fix systim issues")
Signed-off-by: Benjamin Poirier <bpoirier@suse.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/e1000e/netdev.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index d3fef7fefea8..acf1e8b52b8e 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -3527,15 +3527,12 @@ s32 e1000e_get_base_timinca(struct e1000_adapter *adapter, u32 *timinca)
 		}
 		break;
 	case e1000_pch_spt:
-		if (er32(TSYNCRXCTL) & E1000_TSYNCRXCTL_SYSCFI) {
-			/* Stable 24MHz frequency */
-			incperiod = INCPERIOD_24MHZ;
-			incvalue = INCVALUE_24MHZ;
-			shift = INCVALUE_SHIFT_24MHZ;
-			adapter->cc.shift = shift;
-			break;
-		}
-		return -EINVAL;
+		/* Stable 24MHz frequency */
+		incperiod = INCPERIOD_24MHZ;
+		incvalue = INCVALUE_24MHZ;
+		shift = INCVALUE_SHIFT_24MHZ;
+		adapter->cc.shift = shift;
+		break;
 	case e1000_pch_cnp:
 		if (er32(TSYNCRXCTL) & E1000_TSYNCRXCTL_SYSCFI) {
 			/* Stable 24MHz frequency */
-- 
cgit v1.2.3-59-g8ed1b


From 85d63445f41125dafeddda74e5b13b7eefac9407 Mon Sep 17 00:00:00 2001
From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Thu, 10 May 2018 12:20:13 -0700
Subject: Documentation: e100: Update the Intel 10/100 driver doc

Over the years, several of the links have changed or are no longer valid
so update them.  In addition, the default values were incorrect for a
couple of parameters.

Converted the text file to the reStructuredText (RST) format, since the
Linux kernel documentation now uses this format for documentation.

Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
---
 Documentation/networking/e100.rst  | 177 +++++++++++++++++++++++++++++++++++
 Documentation/networking/e100.txt  | 183 -------------------------------------
 Documentation/networking/index.rst |   1 +
 MAINTAINERS                        |   2 +-
 4 files changed, 179 insertions(+), 184 deletions(-)
 create mode 100644 Documentation/networking/e100.rst
 delete mode 100644 Documentation/networking/e100.txt

diff --git a/Documentation/networking/e100.rst b/Documentation/networking/e100.rst
new file mode 100644
index 000000000000..d4d837027925
--- /dev/null
+++ b/Documentation/networking/e100.rst
@@ -0,0 +1,177 @@
+Linux* Base Driver for the Intel(R) PRO/100 Family of Adapters
+==============================================================
+
+June 1, 2018
+
+Contents
+========
+
+- In This Release
+- Identifying Your Adapter
+- Building and Installation
+- Driver Configuration Parameters
+- Additional Configurations
+- Known Issues
+- Support
+
+
+In This Release
+===============
+
+This file describes the Linux* Base Driver for the Intel(R) PRO/100 Family of
+Adapters. This driver includes support for Itanium(R)2-based systems.
+
+For questions related to hardware requirements, refer to the documentation
+supplied with your Intel PRO/100 adapter.
+
+The following features are now available in supported kernels:
+ - Native VLANs
+ - Channel Bonding (teaming)
+ - SNMP
+
+Channel Bonding documentation can be found in the Linux kernel source:
+/Documentation/networking/bonding.txt
+
+
+Identifying Your Adapter
+========================
+
+For information on how to identify your adapter, and for the latest Intel
+network drivers, refer to the Intel Support website:
+http://www.intel.com/support
+
+Driver Configuration Parameters
+===============================
+
+The default value for each parameter is generally the recommended setting,
+unless otherwise noted.
+
+Rx Descriptors: Number of receive descriptors. A receive descriptor is a data
+   structure that describes a receive buffer and its attributes to the network
+   controller. The data in the descriptor is used by the controller to write
+   data from the controller to host memory. In the 3.x.x driver the valid range
+   for this parameter is 64-256. The default value is 256. This parameter can be
+   changed using the command::
+
+   ethtool -G eth? rx n
+
+   Where n is the number of desired Rx descriptors.
+
+Tx Descriptors: Number of transmit descriptors. A transmit descriptor is a data
+   structure that describes a transmit buffer and its attributes to the network
+   controller. The data in the descriptor is used by the controller to read
+   data from the host memory to the controller. In the 3.x.x driver the valid
+   range for this parameter is 64-256. The default value is 128. This parameter
+   can be changed using the command::
+
+   ethtool -G eth? tx n
+
+   Where n is the number of desired Tx descriptors.
+
+Speed/Duplex: The driver auto-negotiates the link speed and duplex settings by
+   default. The ethtool utility can be used as follows to force speed/duplex.::
+
+   ethtool -s eth?  autoneg off speed {10|100} duplex {full|half}
+
+   NOTE: setting the speed/duplex to incorrect values will cause the link to
+   fail.
+
+Event Log Message Level:  The driver uses the message level flag to log events
+   to syslog. The message level can be set at driver load time. It can also be
+   set using the command::
+
+   ethtool -s eth? msglvl n
+
+
+Additional Configurations
+=========================
+
+  Configuring the Driver on Different Distributions
+  -------------------------------------------------
+
+  Configuring a network driver to load properly when the system is started is
+  distribution dependent. Typically, the configuration process involves adding
+  an alias line to /etc/modprobe.d/*.conf as well as editing other system
+  startup scripts and/or configuration files.  Many popular Linux
+  distributions ship with tools to make these changes for you. To learn the
+  proper way to configure a network device for your system, refer to your
+  distribution documentation.  If during this process you are asked for the
+  driver or module name, the name for the Linux Base Driver for the Intel
+  PRO/100 Family of Adapters is e100.
+
+  As an example, if you install the e100 driver for two PRO/100 adapters
+  (eth0 and eth1), add the following to a configuration file in /etc/modprobe.d/
+
+       alias eth0 e100
+       alias eth1 e100
+
+  Viewing Link Messages
+  ---------------------
+  In order to see link messages and other Intel driver information on your
+  console, you must set the dmesg level up to six. This can be done by
+  entering the following on the command line before loading the e100 driver::
+
+       dmesg -n 6
+
+  If you wish to see all messages issued by the driver, including debug
+  messages, set the dmesg level to eight.
+
+  NOTE: This setting is not saved across reboots.
+
+
+  ethtool
+  -------
+
+  The driver utilizes the ethtool interface for driver configuration and
+  diagnostics, as well as displaying statistical information.  The ethtool
+  version 1.6 or later is required for this functionality.
+
+  The latest release of ethtool can be found from
+  https://www.kernel.org/pub/software/network/ethtool/
+
+  Enabling Wake on LAN* (WoL)
+  ---------------------------
+  WoL is provided through the ethtool* utility.  For instructions on enabling
+  WoL with ethtool, refer to the ethtool man page.
+
+  WoL will be enabled on the system during the next shut down or reboot. For
+  this driver version, in order to enable WoL, the e100 driver must be
+  loaded when shutting down or rebooting the system.
+
+  NAPI
+  ----
+
+  NAPI (Rx polling mode) is supported in the e100 driver.
+
+  See https://wiki.linuxfoundation.org/networking/napi for more information
+  on NAPI.
+
+  Multiple Interfaces on Same Ethernet Broadcast Network
+  ------------------------------------------------------
+
+  Due to the default ARP behavior on Linux, it is not possible to have
+  one system on two IP networks in the same Ethernet broadcast domain
+  (non-partitioned switch) behave as expected. All Ethernet interfaces
+  will respond to IP traffic for any IP address assigned to the system.
+  This results in unbalanced receive traffic.
+
+  If you have multiple interfaces in a server, either turn on ARP
+  filtering by
+
+  (1) entering:: echo 1 > /proc/sys/net/ipv4/conf/all/arp_filter
+      (this only works if your kernel's version is higher than 2.4.5), or
+
+  (2) installing the interfaces in separate broadcast domains (either
+      in different switches or in a switch partitioned to VLANs).
+
+
+Support
+=======
+For general information, go to the Intel support website at:
+http://www.intel.com/support/
+
+or the Intel Wired Networking project hosted by Sourceforge at:
+http://sourceforge.net/projects/e1000
+If an issue is identified with the released source code on a supported kernel
+with a supported adapter, email the specific information related to the issue
+to e1000-devel@lists.sf.net.
diff --git a/Documentation/networking/e100.txt b/Documentation/networking/e100.txt
deleted file mode 100644
index 54810b82c01a..000000000000
--- a/Documentation/networking/e100.txt
+++ /dev/null
@@ -1,183 +0,0 @@
-Linux* Base Driver for the Intel(R) PRO/100 Family of Adapters
-==============================================================
-
-March 15, 2011
-
-Contents
-========
-
-- In This Release
-- Identifying Your Adapter
-- Building and Installation
-- Driver Configuration Parameters
-- Additional Configurations
-- Known Issues
-- Support
-
-
-In This Release
-===============
-
-This file describes the Linux* Base Driver for the Intel(R) PRO/100 Family of
-Adapters. This driver includes support for Itanium(R)2-based systems.
-
-For questions related to hardware requirements, refer to the documentation
-supplied with your Intel PRO/100 adapter.
-
-The following features are now available in supported kernels:
- - Native VLANs
- - Channel Bonding (teaming)
- - SNMP
-
-Channel Bonding documentation can be found in the Linux kernel source:
-/Documentation/networking/bonding.txt
-
-
-Identifying Your Adapter
-========================
-
-For more information on how to identify your adapter, go to the Adapter &
-Driver ID Guide at:
-
-  http://support.intel.com/support/network/adapter/pro100/21397.htm
-
-For the latest Intel network drivers for Linux, refer to the following
-website. In the search field, enter your adapter name or type, or use the
-networking link on the left to search for your adapter:
-
-  http://downloadfinder.intel.com/scripts-df/support_intel.asp
-
-Driver Configuration Parameters
-===============================
-
-The default value for each parameter is generally the recommended setting,
-unless otherwise noted.
-
-Rx Descriptors: Number of receive descriptors. A receive descriptor is a data
-   structure that describes a receive buffer and its attributes to the network
-   controller. The data in the descriptor is used by the controller to write
-   data from the controller to host memory. In the 3.x.x driver the valid range
-   for this parameter is 64-256. The default value is 64. This parameter can be
-   changed using the command:
-
-   ethtool -G eth? rx n, where n is the number of desired rx descriptors.
-
-Tx Descriptors: Number of transmit descriptors. A transmit descriptor is a data
-   structure that describes a transmit buffer and its attributes to the network
-   controller. The data in the descriptor is used by the controller to read
-   data from the host memory to the controller. In the 3.x.x driver the valid
-   range for this parameter is 64-256. The default value is 64. This parameter
-   can be changed using the command:
-
-   ethtool -G eth? tx n, where n is the number of desired tx descriptors.
-
-Speed/Duplex: The driver auto-negotiates the link speed and duplex settings by
-   default. The ethtool utility can be used as follows to force speed/duplex.
-
-   ethtool -s eth?  autoneg off speed {10|100} duplex {full|half}
-
-   NOTE: setting the speed/duplex to incorrect values will cause the link to
-   fail.
-
-Event Log Message Level:  The driver uses the message level flag to log events
-   to syslog. The message level can be set at driver load time. It can also be
-   set using the command:
-
-   ethtool -s eth? msglvl n
-
-
-Additional Configurations
-=========================
-
-  Configuring the Driver on Different Distributions
-  -------------------------------------------------
-
-  Configuring a network driver to load properly when the system is started is
-  distribution dependent. Typically, the configuration process involves adding
-  an alias line to /etc/modprobe.d/*.conf as well as editing other system
-  startup scripts and/or configuration files.  Many popular Linux
-  distributions ship with tools to make these changes for you. To learn the
-  proper way to configure a network device for your system, refer to your
-  distribution documentation.  If during this process you are asked for the
-  driver or module name, the name for the Linux Base Driver for the Intel
-  PRO/100 Family of Adapters is e100.
-
-  As an example, if you install the e100 driver for two PRO/100 adapters
-  (eth0 and eth1), add the following to a configuration file in /etc/modprobe.d/
-
-       alias eth0 e100
-       alias eth1 e100
-
-  Viewing Link Messages
-  ---------------------
-  In order to see link messages and other Intel driver information on your
-  console, you must set the dmesg level up to six. This can be done by
-  entering the following on the command line before loading the e100 driver:
-
-       dmesg -n 8
-
-  If you wish to see all messages issued by the driver, including debug
-  messages, set the dmesg level to eight.
-
-  NOTE: This setting is not saved across reboots.
-
-
-  ethtool
-  -------
-
-  The driver utilizes the ethtool interface for driver configuration and
-  diagnostics, as well as displaying statistical information.  The ethtool
-  version 1.6 or later is required for this functionality.
-
-  The latest release of ethtool can be found from
-  https://www.kernel.org/pub/software/network/ethtool/
-
-  Enabling Wake on LAN* (WoL)
-  ---------------------------
-  WoL is provided through the ethtool* utility.  For instructions on enabling
-  WoL with ethtool, refer to the ethtool man page.
-
-  WoL will be enabled on the system during the next shut down or reboot. For
-  this driver version, in order to enable WoL, the e100 driver must be
-  loaded when shutting down or rebooting the system.
-
-  NAPI
-  ----
-
-  NAPI (Rx polling mode) is supported in the e100 driver.
-
-  See www.cyberus.ca/~hadi/usenix-paper.tgz for more information on NAPI.
-
-  Multiple Interfaces on Same Ethernet Broadcast Network
-  ------------------------------------------------------
-
-  Due to the default ARP behavior on Linux, it is not possible to have
-  one system on two IP networks in the same Ethernet broadcast domain
-  (non-partitioned switch) behave as expected. All Ethernet interfaces
-  will respond to IP traffic for any IP address assigned to the system.
-  This results in unbalanced receive traffic.
-
-  If you have multiple interfaces in a server, either turn on ARP
-  filtering by
-
-  (1) entering: echo 1 > /proc/sys/net/ipv4/conf/all/arp_filter
-      (this only works if your kernel's version is higher than 2.4.5), or
-
-  (2) installing the interfaces in separate broadcast domains (either
-      in different switches or in a switch partitioned to VLANs).
-
-
-Support
-=======
-
-For general information, go to the Intel support website at:
-
-    http://support.intel.com
-
-    or the Intel Wired Networking project hosted by Sourceforge at:
-
-    http://sourceforge.net/projects/e1000
-
-If an issue is identified with the released source code on the supported
-kernel with a supported adapter, email the specific information related to the
-issue to e1000-devel@lists.sourceforge.net.
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst
index cbd9bdd4a79e..d11a62977edd 100644
--- a/Documentation/networking/index.rst
+++ b/Documentation/networking/index.rst
@@ -10,6 +10,7 @@ Contents:
    batman-adv
    can
    dpaa2/index
+   e100
    kapi
    z8530book
    msg_zerocopy
diff --git a/MAINTAINERS b/MAINTAINERS
index 0ae0dbf0e15e..d68981ca9896 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7089,7 +7089,7 @@ Q:	http://patchwork.ozlabs.org/project/intel-wired-lan/list/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/net-queue.git
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue.git
 S:	Supported
-F:	Documentation/networking/e100.txt
+F:	Documentation/networking/e100.rst
 F:	Documentation/networking/e1000.txt
 F:	Documentation/networking/e1000e.txt
 F:	Documentation/networking/igb.txt
-- 
cgit v1.2.3-59-g8ed1b


From 228046e76189ce542f15321b760e380551468017 Mon Sep 17 00:00:00 2001
From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Thu, 10 May 2018 12:55:38 -0700
Subject: Documentation: e1000: Update kernel documentation

Updated the e1000.txt kernel documentation with the latest information.

Also convert the text file to reStructuredText (RST) format, since the
Linux kernel documentation now uses this format for documentation.

Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
---
 Documentation/networking/e1000.rst | 422 +++++++++++++++++++++++++++++++++
 Documentation/networking/e1000.txt | 461 -------------------------------------
 Documentation/networking/index.rst |   1 +
 MAINTAINERS                        |   2 +-
 4 files changed, 424 insertions(+), 462 deletions(-)
 create mode 100644 Documentation/networking/e1000.rst
 delete mode 100644 Documentation/networking/e1000.txt

diff --git a/Documentation/networking/e1000.rst b/Documentation/networking/e1000.rst
new file mode 100644
index 000000000000..616848940e63
--- /dev/null
+++ b/Documentation/networking/e1000.rst
@@ -0,0 +1,422 @@
+Linux* Base Driver for Intel(R) Ethernet Network Connection
+===========================================================
+
+Intel Gigabit Linux driver.
+Copyright(c) 1999 - 2013 Intel Corporation.
+
+Contents
+========
+
+- Identifying Your Adapter
+- Command Line Parameters
+- Speed and Duplex Configuration
+- Additional Configurations
+- Support
+
+Identifying Your Adapter
+========================
+
+For more information on how to identify your adapter, go to the Adapter &
+Driver ID Guide at:
+
+    http://support.intel.com/support/go/network/adapter/idguide.htm
+
+For the latest Intel network drivers for Linux, refer to the following
+website.  In the search field, enter your adapter name or type, or use the
+networking link on the left to search for your adapter:
+
+    http://support.intel.com/support/go/network/adapter/home.htm
+
+Command Line Parameters
+=======================
+
+The default value for each parameter is generally the recommended setting,
+unless otherwise noted.
+
+NOTES:  For more information about the AutoNeg, Duplex, and Speed
+        parameters, see the "Speed and Duplex Configuration" section in
+        this document.
+
+        For more information about the InterruptThrottleRate,
+        RxIntDelay, TxIntDelay, RxAbsIntDelay, and TxAbsIntDelay
+        parameters, see the application note at:
+        http://www.intel.com/design/network/applnots/ap450.htm
+
+AutoNeg
+-------
+(Supported only on adapters with copper connections)
+Valid Range:   0x01-0x0F, 0x20-0x2F
+Default Value: 0x2F
+
+This parameter is a bit-mask that specifies the speed and duplex settings
+advertised by the adapter.  When this parameter is used, the Speed and
+Duplex parameters must not be specified.
+
+NOTE:  Refer to the Speed and Duplex section of this readme for more
+       information on the AutoNeg parameter.
+
+Duplex
+------
+(Supported only on adapters with copper connections)
+Valid Range:   0-2 (0=auto-negotiate, 1=half, 2=full)
+Default Value: 0
+
+This defines the direction in which data is allowed to flow.  Can be
+either one or two-directional.  If both Duplex and the link partner are
+set to auto-negotiate, the board auto-detects the correct duplex.  If the
+link partner is forced (either full or half), Duplex defaults to half-
+duplex.
+
+FlowControl
+-----------
+Valid Range:   0-3 (0=none, 1=Rx only, 2=Tx only, 3=Rx&Tx)
+Default Value: Reads flow control settings from the EEPROM
+
+This parameter controls the automatic generation(Tx) and response(Rx)
+to Ethernet PAUSE frames.
+
+InterruptThrottleRate
+---------------------
+(not supported on Intel(R) 82542, 82543 or 82544-based adapters)
+Valid Range:   0,1,3,4,100-100000 (0=off, 1=dynamic, 3=dynamic conservative,
+                                 4=simplified balancing)
+Default Value: 3
+
+The driver can limit the amount of interrupts per second that the adapter
+will generate for incoming packets. It does this by writing a value to the
+adapter that is based on the maximum amount of interrupts that the adapter
+will generate per second.
+
+Setting InterruptThrottleRate to a value greater or equal to 100
+will program the adapter to send out a maximum of that many interrupts
+per second, even if more packets have come in. This reduces interrupt
+load on the system and can lower CPU utilization under heavy load,
+but will increase latency as packets are not processed as quickly.
+
+The default behaviour of the driver previously assumed a static
+InterruptThrottleRate value of 8000, providing a good fallback value for
+all traffic types,but lacking in small packet performance and latency.
+The hardware can handle many more small packets per second however, and
+for this reason an adaptive interrupt moderation algorithm was implemented.
+
+Since 7.3.x, the driver has two adaptive modes (setting 1 or 3) in which
+it dynamically adjusts the InterruptThrottleRate value based on the traffic
+that it receives. After determining the type of incoming traffic in the last
+timeframe, it will adjust the InterruptThrottleRate to an appropriate value
+for that traffic.
+
+The algorithm classifies the incoming traffic every interval into
+classes.  Once the class is determined, the InterruptThrottleRate value is
+adjusted to suit that traffic type the best. There are three classes defined:
+"Bulk traffic", for large amounts of packets of normal size; "Low latency",
+for small amounts of traffic and/or a significant percentage of small
+packets; and "Lowest latency", for almost completely small packets or
+minimal traffic.
+
+In dynamic conservative mode, the InterruptThrottleRate value is set to 4000
+for traffic that falls in class "Bulk traffic". If traffic falls in the "Low
+latency" or "Lowest latency" class, the InterruptThrottleRate is increased
+stepwise to 20000. This default mode is suitable for most applications.
+
+For situations where low latency is vital such as cluster or
+grid computing, the algorithm can reduce latency even more when
+InterruptThrottleRate is set to mode 1. In this mode, which operates
+the same as mode 3, the InterruptThrottleRate will be increased stepwise to
+70000 for traffic in class "Lowest latency".
+
+In simplified mode the interrupt rate is based on the ratio of TX and
+RX traffic.  If the bytes per second rate is approximately equal, the
+interrupt rate will drop as low as 2000 interrupts per second.  If the
+traffic is mostly transmit or mostly receive, the interrupt rate could
+be as high as 8000.
+
+Setting InterruptThrottleRate to 0 turns off any interrupt moderation
+and may improve small packet latency, but is generally not suitable
+for bulk throughput traffic.
+
+NOTE:  InterruptThrottleRate takes precedence over the TxAbsIntDelay and
+       RxAbsIntDelay parameters.  In other words, minimizing the receive
+       and/or transmit absolute delays does not force the controller to
+       generate more interrupts than what the Interrupt Throttle Rate
+       allows.
+
+CAUTION:  If you are using the Intel(R) PRO/1000 CT Network Connection
+          (controller 82547), setting InterruptThrottleRate to a value
+          greater than 75,000, may hang (stop transmitting) adapters
+          under certain network conditions.  If this occurs a NETDEV
+          WATCHDOG message is logged in the system event log.  In
+          addition, the controller is automatically reset, restoring
+          the network connection.  To eliminate the potential for the
+          hang, ensure that InterruptThrottleRate is set no greater
+          than 75,000 and is not set to 0.
+
+NOTE:  When e1000 is loaded with default settings and multiple adapters
+       are in use simultaneously, the CPU utilization may increase non-
+       linearly.  In order to limit the CPU utilization without impacting
+       the overall throughput, we recommend that you load the driver as
+       follows::
+
+           modprobe e1000 InterruptThrottleRate=3000,3000,3000
+
+       This sets the InterruptThrottleRate to 3000 interrupts/sec for
+       the first, second, and third instances of the driver.  The range
+       of 2000 to 3000 interrupts per second works on a majority of
+       systems and is a good starting point, but the optimal value will
+       be platform-specific.  If CPU utilization is not a concern, use
+       RX_POLLING (NAPI) and default driver settings.
+
+RxDescriptors
+-------------
+Valid Range:   48-256 for 82542 and 82543-based adapters
+               48-4096 for all other supported adapters
+Default Value: 256
+
+This value specifies the number of receive buffer descriptors allocated
+by the driver.  Increasing this value allows the driver to buffer more
+incoming packets, at the expense of increased system memory utilization.
+
+Each descriptor is 16 bytes.  A receive buffer is also allocated for each
+descriptor and can be either 2048, 4096, 8192, or 16384 bytes, depending
+on the MTU setting. The maximum MTU size is 16110.
+
+NOTE:  MTU designates the frame size.  It only needs to be set for Jumbo
+       Frames.  Depending on the available system resources, the request
+       for a higher number of receive descriptors may be denied.  In this
+       case, use a lower number.
+
+RxIntDelay
+----------
+Valid Range:   0-65535 (0=off)
+Default Value: 0
+
+This value delays the generation of receive interrupts in units of 1.024
+microseconds.  Receive interrupt reduction can improve CPU efficiency if
+properly tuned for specific network traffic.  Increasing this value adds
+extra latency to frame reception and can end up decreasing the throughput
+of TCP traffic.  If the system is reporting dropped receives, this value
+may be set too high, causing the driver to run out of available receive
+descriptors.
+
+CAUTION:  When setting RxIntDelay to a value other than 0, adapters may
+          hang (stop transmitting) under certain network conditions.  If
+          this occurs a NETDEV WATCHDOG message is logged in the system
+          event log.  In addition, the controller is automatically reset,
+          restoring the network connection.  To eliminate the potential
+          for the hang ensure that RxIntDelay is set to 0.
+
+RxAbsIntDelay
+-------------
+(This parameter is supported only on 82540, 82545 and later adapters.)
+Valid Range:   0-65535 (0=off)
+Default Value: 128
+
+This value, in units of 1.024 microseconds, limits the delay in which a
+receive interrupt is generated.  Useful only if RxIntDelay is non-zero,
+this value ensures that an interrupt is generated after the initial
+packet is received within the set amount of time.  Proper tuning,
+along with RxIntDelay, may improve traffic throughput in specific network
+conditions.
+
+Speed
+-----
+(This parameter is supported only on adapters with copper connections.)
+Valid Settings: 0, 10, 100, 1000
+Default Value:  0 (auto-negotiate at all supported speeds)
+
+Speed forces the line speed to the specified value in megabits per second
+(Mbps).  If this parameter is not specified or is set to 0 and the link
+partner is set to auto-negotiate, the board will auto-detect the correct
+speed.  Duplex should also be set when Speed is set to either 10 or 100.
+
+TxDescriptors
+-------------
+Valid Range:   48-256 for 82542 and 82543-based adapters
+               48-4096 for all other supported adapters
+Default Value: 256
+
+This value is the number of transmit descriptors allocated by the driver.
+Increasing this value allows the driver to queue more transmits.  Each
+descriptor is 16 bytes.
+
+NOTE:  Depending on the available system resources, the request for a
+       higher number of transmit descriptors may be denied.  In this case,
+       use a lower number.
+
+TxIntDelay
+----------
+Valid Range:   0-65535 (0=off)
+Default Value: 8
+
+This value delays the generation of transmit interrupts in units of
+1.024 microseconds.  Transmit interrupt reduction can improve CPU
+efficiency if properly tuned for specific network traffic.  If the
+system is reporting dropped transmits, this value may be set too high
+causing the driver to run out of available transmit descriptors.
+
+TxAbsIntDelay
+-------------
+(This parameter is supported only on 82540, 82545 and later adapters.)
+Valid Range:   0-65535 (0=off)
+Default Value: 32
+
+This value, in units of 1.024 microseconds, limits the delay in which a
+transmit interrupt is generated.  Useful only if TxIntDelay is non-zero,
+this value ensures that an interrupt is generated after the initial
+packet is sent on the wire within the set amount of time.  Proper tuning,
+along with TxIntDelay, may improve traffic throughput in specific
+network conditions.
+
+XsumRX
+------
+(This parameter is NOT supported on the 82542-based adapter.)
+Valid Range:   0-1
+Default Value: 1
+
+A value of '1' indicates that the driver should enable IP checksum
+offload for received packets (both UDP and TCP) to the adapter hardware.
+
+Copybreak
+---------
+Valid Range:   0-xxxxxxx (0=off)
+Default Value: 256
+Usage: modprobe e1000.ko copybreak=128
+
+Driver copies all packets below or equaling this size to a fresh RX
+buffer before handing it up the stack.
+
+This parameter is different than other parameters, in that it is a
+single (not 1,1,1 etc.) parameter applied to all driver instances and
+it is also available during runtime at
+/sys/module/e1000/parameters/copybreak
+
+SmartPowerDownEnable
+--------------------
+Valid Range: 0-1
+Default Value:  0 (disabled)
+
+Allows PHY to turn off in lower power states. The user can turn off
+this parameter in supported chipsets.
+
+Speed and Duplex Configuration
+==============================
+
+Three keywords are used to control the speed and duplex configuration.
+These keywords are Speed, Duplex, and AutoNeg.
+
+If the board uses a fiber interface, these keywords are ignored, and the
+fiber interface board only links at 1000 Mbps full-duplex.
+
+For copper-based boards, the keywords interact as follows:
+
+  The default operation is auto-negotiate.  The board advertises all
+  supported speed and duplex combinations, and it links at the highest
+  common speed and duplex mode IF the link partner is set to auto-negotiate.
+
+  If Speed = 1000, limited auto-negotiation is enabled and only 1000 Mbps
+  is advertised (The 1000BaseT spec requires auto-negotiation.)
+
+  If Speed = 10 or 100, then both Speed and Duplex should be set.  Auto-
+  negotiation is disabled, and the AutoNeg parameter is ignored.  Partner
+  SHOULD also be forced.
+
+The AutoNeg parameter is used when more control is required over the
+auto-negotiation process.  It should be used when you wish to control which
+speed and duplex combinations are advertised during the auto-negotiation
+process.
+
+The parameter may be specified as either a decimal or hexadecimal value as
+determined by the bitmap below.
+
+Bit position   7      6      5       4       3      2      1       0
+Decimal Value  128    64     32      16      8      4      2       1
+Hex value      80     40     20      10      8      4      2       1
+Speed (Mbps)   N/A    N/A    1000    N/A     100    100    10      10
+Duplex                       Full            Full   Half   Full    Half
+
+Some examples of using AutoNeg:
+
+  modprobe e1000 AutoNeg=0x01 (Restricts autonegotiation to 10 Half)
+  modprobe e1000 AutoNeg=1 (Same as above)
+  modprobe e1000 AutoNeg=0x02 (Restricts autonegotiation to 10 Full)
+  modprobe e1000 AutoNeg=0x03 (Restricts autonegotiation to 10 Half or 10 Full)
+  modprobe e1000 AutoNeg=0x04 (Restricts autonegotiation to 100 Half)
+  modprobe e1000 AutoNeg=0x05 (Restricts autonegotiation to 10 Half or 100
+  Half)
+  modprobe e1000 AutoNeg=0x020 (Restricts autonegotiation to 1000 Full)
+  modprobe e1000 AutoNeg=32 (Same as above)
+
+Note that when this parameter is used, Speed and Duplex must not be specified.
+
+If the link partner is forced to a specific speed and duplex, then this
+parameter should not be used.  Instead, use the Speed and Duplex parameters
+previously mentioned to force the adapter to the same speed and duplex.
+
+Additional Configurations
+=========================
+
+  Jumbo Frames
+  ------------
+  Jumbo Frames support is enabled by changing the MTU to a value larger than
+  the default of 1500.  Use the ifconfig command to increase the MTU size.
+  For example::
+
+       ifconfig eth<x> mtu 9000 up
+
+  This setting is not saved across reboots.  It can be made permanent if
+  you add::
+
+       MTU=9000
+
+   to the file /etc/sysconfig/network-scripts/ifcfg-eth<x>.  This example
+   applies to the Red Hat distributions; other distributions may store this
+   setting in a different location.
+
+  Notes:
+  Degradation in throughput performance may be observed in some Jumbo frames
+  environments. If this is observed, increasing the application's socket buffer
+  size and/or increasing the /proc/sys/net/ipv4/tcp_*mem entry values may help.
+  See the specific application manual and /usr/src/linux*/Documentation/
+  networking/ip-sysctl.txt for more details.
+
+  - The maximum MTU setting for Jumbo Frames is 16110.  This value coincides
+    with the maximum Jumbo Frames size of 16128.
+
+  - Using Jumbo frames at 10 or 100 Mbps is not supported and may result in
+    poor performance or loss of link.
+
+  - Adapters based on the Intel(R) 82542 and 82573V/E controller do not
+    support Jumbo Frames. These correspond to the following product names:
+     Intel(R) PRO/1000 Gigabit Server Adapter
+     Intel(R) PRO/1000 PM Network Connection
+
+  ethtool
+  -------
+  The driver utilizes the ethtool interface for driver configuration and
+  diagnostics, as well as displaying statistical information.  The ethtool
+  version 1.6 or later is required for this functionality.
+
+  The latest release of ethtool can be found from
+  https://www.kernel.org/pub/software/network/ethtool/
+
+  Enabling Wake on LAN* (WoL)
+  ---------------------------
+  WoL is configured through the ethtool* utility.
+
+  WoL will be enabled on the system during the next shut down or reboot.
+  For this driver version, in order to enable WoL, the e1000 driver must be
+  loaded when shutting down or rebooting the system.
+
+Support
+=======
+
+For general information, go to the Intel support website at:
+
+    http://support.intel.com
+
+or the Intel Wired Networking project hosted by Sourceforge at:
+
+    http://sourceforge.net/projects/e1000
+
+If an issue is identified with the released source code on the supported
+kernel with a supported adapter, email the specific information related
+to the issue to e1000-devel@lists.sf.net
diff --git a/Documentation/networking/e1000.txt b/Documentation/networking/e1000.txt
deleted file mode 100644
index 1f6ed848363d..000000000000
--- a/Documentation/networking/e1000.txt
+++ /dev/null
@@ -1,461 +0,0 @@
-Linux* Base Driver for Intel(R) Ethernet Network Connection
-===========================================================
-
-Intel Gigabit Linux driver.
-Copyright(c) 1999 - 2013 Intel Corporation.
-
-Contents
-========
-
-- Identifying Your Adapter
-- Command Line Parameters
-- Speed and Duplex Configuration
-- Additional Configurations
-- Support
-
-Identifying Your Adapter
-========================
-
-For more information on how to identify your adapter, go to the Adapter &
-Driver ID Guide at:
-
-    http://support.intel.com/support/go/network/adapter/idguide.htm
-
-For the latest Intel network drivers for Linux, refer to the following
-website.  In the search field, enter your adapter name or type, or use the
-networking link on the left to search for your adapter:
-
-    http://support.intel.com/support/go/network/adapter/home.htm
-
-Command Line Parameters
-=======================
-
-The default value for each parameter is generally the recommended setting,
-unless otherwise noted.
-
-NOTES:  For more information about the AutoNeg, Duplex, and Speed
-        parameters, see the "Speed and Duplex Configuration" section in
-        this document.
-
-        For more information about the InterruptThrottleRate,
-        RxIntDelay, TxIntDelay, RxAbsIntDelay, and TxAbsIntDelay
-        parameters, see the application note at:
-        http://www.intel.com/design/network/applnots/ap450.htm
-
-AutoNeg
--------
-(Supported only on adapters with copper connections)
-Valid Range:   0x01-0x0F, 0x20-0x2F
-Default Value: 0x2F
-
-This parameter is a bit-mask that specifies the speed and duplex settings
-advertised by the adapter.  When this parameter is used, the Speed and
-Duplex parameters must not be specified.
-
-NOTE:  Refer to the Speed and Duplex section of this readme for more
-       information on the AutoNeg parameter.
-
-Duplex
-------
-(Supported only on adapters with copper connections)
-Valid Range:   0-2 (0=auto-negotiate, 1=half, 2=full)
-Default Value: 0
-
-This defines the direction in which data is allowed to flow.  Can be
-either one or two-directional.  If both Duplex and the link partner are
-set to auto-negotiate, the board auto-detects the correct duplex.  If the
-link partner is forced (either full or half), Duplex defaults to half-
-duplex.
-
-FlowControl
------------
-Valid Range:   0-3 (0=none, 1=Rx only, 2=Tx only, 3=Rx&Tx)
-Default Value: Reads flow control settings from the EEPROM
-
-This parameter controls the automatic generation(Tx) and response(Rx)
-to Ethernet PAUSE frames.
-
-InterruptThrottleRate
----------------------
-(not supported on Intel(R) 82542, 82543 or 82544-based adapters)
-Valid Range:   0,1,3,4,100-100000 (0=off, 1=dynamic, 3=dynamic conservative,
-                                 4=simplified balancing)
-Default Value: 3
-
-The driver can limit the amount of interrupts per second that the adapter
-will generate for incoming packets. It does this by writing a value to the
-adapter that is based on the maximum amount of interrupts that the adapter
-will generate per second.
-
-Setting InterruptThrottleRate to a value greater or equal to 100
-will program the adapter to send out a maximum of that many interrupts
-per second, even if more packets have come in. This reduces interrupt
-load on the system and can lower CPU utilization under heavy load,
-but will increase latency as packets are not processed as quickly.
-
-The default behaviour of the driver previously assumed a static
-InterruptThrottleRate value of 8000, providing a good fallback value for
-all traffic types,but lacking in small packet performance and latency.
-The hardware can handle many more small packets per second however, and
-for this reason an adaptive interrupt moderation algorithm was implemented.
-
-Since 7.3.x, the driver has two adaptive modes (setting 1 or 3) in which
-it dynamically adjusts the InterruptThrottleRate value based on the traffic
-that it receives. After determining the type of incoming traffic in the last
-timeframe, it will adjust the InterruptThrottleRate to an appropriate value
-for that traffic.
-
-The algorithm classifies the incoming traffic every interval into
-classes.  Once the class is determined, the InterruptThrottleRate value is
-adjusted to suit that traffic type the best. There are three classes defined:
-"Bulk traffic", for large amounts of packets of normal size; "Low latency",
-for small amounts of traffic and/or a significant percentage of small
-packets; and "Lowest latency", for almost completely small packets or
-minimal traffic.
-
-In dynamic conservative mode, the InterruptThrottleRate value is set to 4000
-for traffic that falls in class "Bulk traffic". If traffic falls in the "Low
-latency" or "Lowest latency" class, the InterruptThrottleRate is increased
-stepwise to 20000. This default mode is suitable for most applications.
-
-For situations where low latency is vital such as cluster or
-grid computing, the algorithm can reduce latency even more when
-InterruptThrottleRate is set to mode 1. In this mode, which operates
-the same as mode 3, the InterruptThrottleRate will be increased stepwise to
-70000 for traffic in class "Lowest latency".
-
-In simplified mode the interrupt rate is based on the ratio of TX and
-RX traffic.  If the bytes per second rate is approximately equal, the
-interrupt rate will drop as low as 2000 interrupts per second.  If the
-traffic is mostly transmit or mostly receive, the interrupt rate could
-be as high as 8000.
-
-Setting InterruptThrottleRate to 0 turns off any interrupt moderation
-and may improve small packet latency, but is generally not suitable
-for bulk throughput traffic.
-
-NOTE:  InterruptThrottleRate takes precedence over the TxAbsIntDelay and
-       RxAbsIntDelay parameters.  In other words, minimizing the receive
-       and/or transmit absolute delays does not force the controller to
-       generate more interrupts than what the Interrupt Throttle Rate
-       allows.
-
-CAUTION:  If you are using the Intel(R) PRO/1000 CT Network Connection
-          (controller 82547), setting InterruptThrottleRate to a value
-          greater than 75,000, may hang (stop transmitting) adapters
-          under certain network conditions.  If this occurs a NETDEV
-          WATCHDOG message is logged in the system event log.  In
-          addition, the controller is automatically reset, restoring
-          the network connection.  To eliminate the potential for the
-          hang, ensure that InterruptThrottleRate is set no greater
-          than 75,000 and is not set to 0.
-
-NOTE:  When e1000 is loaded with default settings and multiple adapters
-       are in use simultaneously, the CPU utilization may increase non-
-       linearly.  In order to limit the CPU utilization without impacting
-       the overall throughput, we recommend that you load the driver as
-       follows:
-
-           modprobe e1000 InterruptThrottleRate=3000,3000,3000
-
-       This sets the InterruptThrottleRate to 3000 interrupts/sec for
-       the first, second, and third instances of the driver.  The range
-       of 2000 to 3000 interrupts per second works on a majority of
-       systems and is a good starting point, but the optimal value will
-       be platform-specific.  If CPU utilization is not a concern, use
-       RX_POLLING (NAPI) and default driver settings.
-
-RxDescriptors
--------------
-Valid Range:   80-256 for 82542 and 82543-based adapters
-               80-4096 for all other supported adapters
-Default Value: 256
-
-This value specifies the number of receive buffer descriptors allocated
-by the driver.  Increasing this value allows the driver to buffer more
-incoming packets, at the expense of increased system memory utilization.
-
-Each descriptor is 16 bytes.  A receive buffer is also allocated for each
-descriptor and can be either 2048, 4096, 8192, or 16384 bytes, depending
-on the MTU setting. The maximum MTU size is 16110.
-
-NOTE:  MTU designates the frame size.  It only needs to be set for Jumbo
-       Frames.  Depending on the available system resources, the request
-       for a higher number of receive descriptors may be denied.  In this
-       case, use a lower number.
-
-RxIntDelay
-----------
-Valid Range:   0-65535 (0=off)
-Default Value: 0
-
-This value delays the generation of receive interrupts in units of 1.024
-microseconds.  Receive interrupt reduction can improve CPU efficiency if
-properly tuned for specific network traffic.  Increasing this value adds
-extra latency to frame reception and can end up decreasing the throughput
-of TCP traffic.  If the system is reporting dropped receives, this value
-may be set too high, causing the driver to run out of available receive
-descriptors.
-
-CAUTION:  When setting RxIntDelay to a value other than 0, adapters may
-          hang (stop transmitting) under certain network conditions.  If
-          this occurs a NETDEV WATCHDOG message is logged in the system
-          event log.  In addition, the controller is automatically reset,
-          restoring the network connection.  To eliminate the potential
-          for the hang ensure that RxIntDelay is set to 0.
-
-RxAbsIntDelay
--------------
-(This parameter is supported only on 82540, 82545 and later adapters.)
-Valid Range:   0-65535 (0=off)
-Default Value: 128
-
-This value, in units of 1.024 microseconds, limits the delay in which a
-receive interrupt is generated.  Useful only if RxIntDelay is non-zero,
-this value ensures that an interrupt is generated after the initial
-packet is received within the set amount of time.  Proper tuning,
-along with RxIntDelay, may improve traffic throughput in specific network
-conditions.
-
-Speed
------
-(This parameter is supported only on adapters with copper connections.)
-Valid Settings: 0, 10, 100, 1000
-Default Value:  0 (auto-negotiate at all supported speeds)
-
-Speed forces the line speed to the specified value in megabits per second
-(Mbps).  If this parameter is not specified or is set to 0 and the link
-partner is set to auto-negotiate, the board will auto-detect the correct
-speed.  Duplex should also be set when Speed is set to either 10 or 100.
-
-TxDescriptors
--------------
-Valid Range:   80-256 for 82542 and 82543-based adapters
-               80-4096 for all other supported adapters
-Default Value: 256
-
-This value is the number of transmit descriptors allocated by the driver.
-Increasing this value allows the driver to queue more transmits.  Each
-descriptor is 16 bytes.
-
-NOTE:  Depending on the available system resources, the request for a
-       higher number of transmit descriptors may be denied.  In this case,
-       use a lower number.
-
-TxDescriptorStep
-----------------
-Valid Range:    1 (use every Tx Descriptor)
-                4 (use every 4th Tx Descriptor)
-
-Default Value:  1 (use every Tx Descriptor)
-
-On certain non-Intel architectures, it has been observed that intense TX
-traffic bursts of short packets may result in an improper descriptor
-writeback. If this occurs, the driver will report a "TX Timeout" and reset
-the adapter, after which the transmit flow will restart, though data may
-have stalled for as much as 10 seconds before it resumes.
-
-The improper writeback does not occur on the first descriptor in a system
-memory cache-line, which is typically 32 bytes, or 4 descriptors long.
-
-Setting TxDescriptorStep to a value of 4 will ensure that all TX descriptors
-are aligned to the start of a system memory cache line, and so this problem
-will not occur.
-
-NOTES: Setting TxDescriptorStep to 4 effectively reduces the number of
-       TxDescriptors available for transmits to 1/4 of the normal allocation.
-       This has a possible negative performance impact, which may be
-       compensated for by allocating more descriptors using the TxDescriptors
-       module parameter.
-
-       There are other conditions which may result in "TX Timeout", which will
-       not be resolved by the use of the TxDescriptorStep parameter. As the
-       issue addressed by this parameter has never been observed on Intel
-       Architecture platforms, it should not be used on Intel platforms.
-
-TxIntDelay
-----------
-Valid Range:   0-65535 (0=off)
-Default Value: 64
-
-This value delays the generation of transmit interrupts in units of
-1.024 microseconds.  Transmit interrupt reduction can improve CPU
-efficiency if properly tuned for specific network traffic.  If the
-system is reporting dropped transmits, this value may be set too high
-causing the driver to run out of available transmit descriptors.
-
-TxAbsIntDelay
--------------
-(This parameter is supported only on 82540, 82545 and later adapters.)
-Valid Range:   0-65535 (0=off)
-Default Value: 64
-
-This value, in units of 1.024 microseconds, limits the delay in which a
-transmit interrupt is generated.  Useful only if TxIntDelay is non-zero,
-this value ensures that an interrupt is generated after the initial
-packet is sent on the wire within the set amount of time.  Proper tuning,
-along with TxIntDelay, may improve traffic throughput in specific
-network conditions.
-
-XsumRX
-------
-(This parameter is NOT supported on the 82542-based adapter.)
-Valid Range:   0-1
-Default Value: 1
-
-A value of '1' indicates that the driver should enable IP checksum
-offload for received packets (both UDP and TCP) to the adapter hardware.
-
-Copybreak
----------
-Valid Range:   0-xxxxxxx (0=off)
-Default Value: 256
-Usage: insmod e1000.ko copybreak=128
-
-Driver copies all packets below or equaling this size to a fresh RX
-buffer before handing it up the stack.
-
-This parameter is different than other parameters, in that it is a
-single (not 1,1,1 etc.) parameter applied to all driver instances and
-it is also available during runtime at
-/sys/module/e1000/parameters/copybreak
-
-SmartPowerDownEnable
---------------------
-Valid Range: 0-1
-Default Value:  0 (disabled)
-
-Allows PHY to turn off in lower power states. The user can turn off
-this parameter in supported chipsets.
-
-KumeranLockLoss
----------------
-Valid Range: 0-1
-Default Value: 1 (enabled)
-
-This workaround skips resetting the PHY at shutdown for the initial
-silicon releases of ICH8 systems.
-
-Speed and Duplex Configuration
-==============================
-
-Three keywords are used to control the speed and duplex configuration.
-These keywords are Speed, Duplex, and AutoNeg.
-
-If the board uses a fiber interface, these keywords are ignored, and the
-fiber interface board only links at 1000 Mbps full-duplex.
-
-For copper-based boards, the keywords interact as follows:
-
-  The default operation is auto-negotiate.  The board advertises all
-  supported speed and duplex combinations, and it links at the highest
-  common speed and duplex mode IF the link partner is set to auto-negotiate.
-
-  If Speed = 1000, limited auto-negotiation is enabled and only 1000 Mbps
-  is advertised (The 1000BaseT spec requires auto-negotiation.)
-
-  If Speed = 10 or 100, then both Speed and Duplex should be set.  Auto-
-  negotiation is disabled, and the AutoNeg parameter is ignored.  Partner
-  SHOULD also be forced.
-
-The AutoNeg parameter is used when more control is required over the
-auto-negotiation process.  It should be used when you wish to control which
-speed and duplex combinations are advertised during the auto-negotiation
-process.
-
-The parameter may be specified as either a decimal or hexadecimal value as
-determined by the bitmap below.
-
-Bit position   7      6      5       4       3      2      1       0
-Decimal Value  128    64     32      16      8      4      2       1
-Hex value      80     40     20      10      8      4      2       1
-Speed (Mbps)   N/A    N/A    1000    N/A     100    100    10      10
-Duplex                       Full            Full   Half   Full    Half
-
-Some examples of using AutoNeg:
-
-  modprobe e1000 AutoNeg=0x01 (Restricts autonegotiation to 10 Half)
-  modprobe e1000 AutoNeg=1 (Same as above)
-  modprobe e1000 AutoNeg=0x02 (Restricts autonegotiation to 10 Full)
-  modprobe e1000 AutoNeg=0x03 (Restricts autonegotiation to 10 Half or 10 Full)
-  modprobe e1000 AutoNeg=0x04 (Restricts autonegotiation to 100 Half)
-  modprobe e1000 AutoNeg=0x05 (Restricts autonegotiation to 10 Half or 100
-  Half)
-  modprobe e1000 AutoNeg=0x020 (Restricts autonegotiation to 1000 Full)
-  modprobe e1000 AutoNeg=32 (Same as above)
-
-Note that when this parameter is used, Speed and Duplex must not be specified.
-
-If the link partner is forced to a specific speed and duplex, then this
-parameter should not be used.  Instead, use the Speed and Duplex parameters
-previously mentioned to force the adapter to the same speed and duplex.
-
-Additional Configurations
-=========================
-
-  Jumbo Frames
-  ------------
-  Jumbo Frames support is enabled by changing the MTU to a value larger than
-  the default of 1500.  Use the ifconfig command to increase the MTU size.
-  For example:
-
-       ifconfig eth<x> mtu 9000 up
-
-  This setting is not saved across reboots.  It can be made permanent if
-  you add:
-
-       MTU=9000
-
-   to the file /etc/sysconfig/network-scripts/ifcfg-eth<x>.  This example
-   applies to the Red Hat distributions; other distributions may store this
-   setting in a different location.
-
-  Notes:
-  Degradation in throughput performance may be observed in some Jumbo frames
-  environments. If this is observed, increasing the application's socket buffer
-  size and/or increasing the /proc/sys/net/ipv4/tcp_*mem entry values may help.
-  See the specific application manual and /usr/src/linux*/Documentation/
-  networking/ip-sysctl.txt for more details.
-
-  - The maximum MTU setting for Jumbo Frames is 16110.  This value coincides
-    with the maximum Jumbo Frames size of 16128.
-
-  - Using Jumbo frames at 10 or 100 Mbps is not supported and may result in
-    poor performance or loss of link.
-
-  - Adapters based on the Intel(R) 82542 and 82573V/E controller do not
-    support Jumbo Frames. These correspond to the following product names:
-     Intel(R) PRO/1000 Gigabit Server Adapter
-     Intel(R) PRO/1000 PM Network Connection
-
-  ethtool
-  -------
-  The driver utilizes the ethtool interface for driver configuration and
-  diagnostics, as well as displaying statistical information.  The ethtool
-  version 1.6 or later is required for this functionality.
-
-  The latest release of ethtool can be found from
-  https://www.kernel.org/pub/software/network/ethtool/
-
-  Enabling Wake on LAN* (WoL)
-  ---------------------------
-  WoL is configured through the ethtool* utility.
-
-  WoL will be enabled on the system during the next shut down or reboot.
-  For this driver version, in order to enable WoL, the e1000 driver must be
-  loaded when shutting down or rebooting the system.
-
-Support
-=======
-
-For general information, go to the Intel support website at:
-
-    http://support.intel.com
-
-or the Intel Wired Networking project hosted by Sourceforge at:
-
-    http://sourceforge.net/projects/e1000
-
-If an issue is identified with the released source code on the supported
-kernel with a supported adapter, email the specific information related
-to the issue to e1000-devel@lists.sf.net
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst
index d11a62977edd..fec8588a588e 100644
--- a/Documentation/networking/index.rst
+++ b/Documentation/networking/index.rst
@@ -11,6 +11,7 @@ Contents:
    can
    dpaa2/index
    e100
+   e1000
    kapi
    z8530book
    msg_zerocopy
diff --git a/MAINTAINERS b/MAINTAINERS
index d68981ca9896..32472fbf4d6e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7090,7 +7090,7 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/net-queue.git
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue.git
 S:	Supported
 F:	Documentation/networking/e100.rst
-F:	Documentation/networking/e1000.txt
+F:	Documentation/networking/e1000.rst
 F:	Documentation/networking/e1000e.txt
 F:	Documentation/networking/igb.txt
 F:	Documentation/networking/igbvf.txt
-- 
cgit v1.2.3-59-g8ed1b


From 1ec2297c5cacefd9198a3cc4a27751b6a2203cf3 Mon Sep 17 00:00:00 2001
From: Joanna Yurdal <jyu@trackman.com>
Date: Wed, 16 May 2018 13:14:11 +0200
Subject: igb: Clear TSICR interrupts together with ICR

Issuing "ip link set up/down" can block TSICR interrupts, what results in
missing PTP Tx timestamp and no PPS pulse generation.

Problem happens when the link is set up with the TSICR interrupts pending.
ICR is cleared before enabling interrupts, while TSICR is not. When all TSICR
interrupts are pending at this moment, time_sync interrupt will never
be generated. TSICR should be cleared as well.

In order to reproduce the issue:
1. Setup linux with IEEE 1588 grandmaster and PPS output enabled
2. Continue setting link up/down with random intervals between commands
3. Wait until PPS is not generated ( only one pulse is generated and PPS
dies), and ptp4l complains constantly about Tx timeout.

Signed-off-by: Joanna Yurdal <jyu@trackman.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 78574c06635b..20b728218d20 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -2058,6 +2058,7 @@ int igb_up(struct igb_adapter *adapter)
 		igb_assign_vector(adapter->q_vector[0], 0);
 
 	/* Clear any pending interrupts. */
+	rd32(E1000_TSICR);
 	rd32(E1000_ICR);
 	igb_irq_enable(adapter);
 
@@ -3865,6 +3866,7 @@ static int __igb_open(struct net_device *netdev, bool resuming)
 		napi_enable(&(adapter->q_vector[i]->napi));
 
 	/* Clear any pending interrupts. */
+	rd32(E1000_TSICR);
 	rd32(E1000_ICR);
 
 	igb_irq_enable(adapter);
-- 
cgit v1.2.3-59-g8ed1b


From 06140c793db51476585ee9abf032845e221982cb Mon Sep 17 00:00:00 2001
From: Sergey Nemov <sergey.nemov@intel.com>
Date: Fri, 18 May 2018 11:58:40 +0200
Subject: igb: Wait 10ms just once after TX queues reset

Move 10ms sleep out of function resetting TX queue.
Reset all the TX queues in one turn and
wait for all of them just once.

Use usleep_range() instead of mdelay() in order not to
affect transmission on other interfaces.

Signed-off-by: Sergey Nemov <sergey.nemov@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 20b728218d20..c33821d2afb3 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -4055,11 +4055,6 @@ void igb_configure_tx_ring(struct igb_adapter *adapter,
 	u64 tdba = ring->dma;
 	int reg_idx = ring->reg_idx;
 
-	/* disable the queue */
-	wr32(E1000_TXDCTL(reg_idx), 0);
-	wrfl();
-	mdelay(10);
-
 	wr32(E1000_TDLEN(reg_idx),
 	     ring->count * sizeof(union e1000_adv_tx_desc));
 	wr32(E1000_TDBAL(reg_idx),
@@ -4090,8 +4085,16 @@ void igb_configure_tx_ring(struct igb_adapter *adapter,
  **/
 static void igb_configure_tx(struct igb_adapter *adapter)
 {
+	struct e1000_hw *hw = &adapter->hw;
 	int i;
 
+	/* disable the queues */
+	for (i = 0; i < adapter->num_tx_queues; i++)
+		wr32(E1000_TXDCTL(adapter->tx_ring[i]->reg_idx), 0);
+
+	wrfl();
+	usleep_range(10000, 20000);
+
 	for (i = 0; i < adapter->num_tx_queues; i++)
 		igb_configure_tx_ring(adapter, adapter->tx_ring[i]);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 4be87727d4aebe36913d9f2a6806724cb593516f Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Tue, 22 May 2018 11:44:29 -0400
Subject: ixgbevf: Fix coexistence of malicious driver detection with XDP

In the case of the VF driver it is supposed to provide a context descriptor
that allows us to provide information about the header offsets inside of
the frame. However in the case of XDP we don't really have any of that
information since the data is minimally processed. As a result we were
seeing malicious driver detection (MDD) events being triggered when the PF
had that functionality enabled.

To address this I have added a bit of new code that will "prime" the XDP
ring by providing one context descriptor that assumes the minimal setup of
an Ethernet frame which is an L2 header length of 14. With just that we can
provide enough information to make the hardware happy so that we don't
trigger MDD events.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h      |  1 +
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 36 ++++++++++++++++++-----
 2 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
index 70c75681495f..56a1031dcc07 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
@@ -76,6 +76,7 @@ enum ixgbevf_ring_state_t {
 	__IXGBEVF_TX_DETECT_HANG,
 	__IXGBEVF_HANG_CHECK_ARMED,
 	__IXGBEVF_TX_XDP_RING,
+	__IXGBEVF_TX_XDP_RING_PRIMED,
 };
 
 #define ring_is_xdp(ring) \
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 083041129539..2d5a706c3c29 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -991,24 +991,45 @@ static int ixgbevf_xmit_xdp_ring(struct ixgbevf_ring *ring,
 		return IXGBEVF_XDP_CONSUMED;
 
 	/* record the location of the first descriptor for this packet */
-	tx_buffer = &ring->tx_buffer_info[ring->next_to_use];
-	tx_buffer->bytecount = len;
-	tx_buffer->gso_segs = 1;
-	tx_buffer->protocol = 0;
-
 	i = ring->next_to_use;
-	tx_desc = IXGBEVF_TX_DESC(ring, i);
+	tx_buffer = &ring->tx_buffer_info[i];
 
 	dma_unmap_len_set(tx_buffer, len, len);
 	dma_unmap_addr_set(tx_buffer, dma, dma);
 	tx_buffer->data = xdp->data;
-	tx_desc->read.buffer_addr = cpu_to_le64(dma);
+	tx_buffer->bytecount = len;
+	tx_buffer->gso_segs = 1;
+	tx_buffer->protocol = 0;
+
+	/* Populate minimal context descriptor that will provide for the
+	 * fact that we are expected to process Ethernet frames.
+	 */
+	if (!test_bit(__IXGBEVF_TX_XDP_RING_PRIMED, &ring->state)) {
+		struct ixgbe_adv_tx_context_desc *context_desc;
+
+		set_bit(__IXGBEVF_TX_XDP_RING_PRIMED, &ring->state);
+
+		context_desc = IXGBEVF_TX_CTXTDESC(ring, 0);
+		context_desc->vlan_macip_lens	=
+			cpu_to_le32(ETH_HLEN << IXGBE_ADVTXD_MACLEN_SHIFT);
+		context_desc->seqnum_seed	= 0;
+		context_desc->type_tucmd_mlhl	=
+			cpu_to_le32(IXGBE_TXD_CMD_DEXT |
+				    IXGBE_ADVTXD_DTYP_CTXT);
+		context_desc->mss_l4len_idx	= 0;
+
+		i = 1;
+	}
 
 	/* put descriptor type bits */
 	cmd_type = IXGBE_ADVTXD_DTYP_DATA |
 		   IXGBE_ADVTXD_DCMD_DEXT |
 		   IXGBE_ADVTXD_DCMD_IFCS;
 	cmd_type |= len | IXGBE_TXD_CMD;
+
+	tx_desc = IXGBEVF_TX_DESC(ring, i);
+	tx_desc->read.buffer_addr = cpu_to_le64(dma);
+
 	tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type);
 	tx_desc->read.olinfo_status =
 			cpu_to_le32((len << IXGBE_ADVTXD_PAYLEN_SHIFT) |
@@ -1688,6 +1709,7 @@ static void ixgbevf_configure_tx_ring(struct ixgbevf_adapter *adapter,
 	       sizeof(struct ixgbevf_tx_buffer) * ring->count);
 
 	clear_bit(__IXGBEVF_HANG_CHECK_ARMED, &ring->state);
+	clear_bit(__IXGBEVF_TX_XDP_RING_PRIMED, &ring->state);
 
 	IXGBE_WRITE_REG(hw, IXGBE_VFTXDCTL(reg_idx), txdctl);
 
-- 
cgit v1.2.3-59-g8ed1b


From 87ae68c8b4944d142447b88875c9c412c714434f Mon Sep 17 00:00:00 2001
From: Michal Kubecek <mkubecek@suse.cz>
Date: Sat, 2 Jun 2018 09:40:34 +0200
Subject: ipv6: omit traffic class when calculating flow hash

Some of the code paths calculating flow hash for IPv6 use flowlabel member
of struct flowi6 which, despite its name, encodes both flow label and
traffic class. If traffic class changes within a TCP connection (as e.g.
ssh does), ECMP route can switch between path. It's also incosistent with
other code paths where ip6_flowlabel() (returning only flow label) is used
to feed the key.

Use only flow label everywhere, including one place where hash key is set
using ip6_flowinfo().

Fixes: 51ebd3181572 ("ipv6: add support of equal cost multipath (ECMP)")
Fixes: f70ea018da06 ("net: Add functions to get skb->hash based on flow structures")
Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Tested-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h        | 5 +++++
 net/core/flow_dissector.c | 2 +-
 net/ipv6/route.c          | 4 ++--
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 798558fd1681..6dcc473fbe51 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -907,6 +907,11 @@ static inline __be32 ip6_make_flowinfo(unsigned int tclass, __be32 flowlabel)
 	return htonl(tclass << IPV6_TCLASS_SHIFT) | flowlabel;
 }
 
+static inline u32 flowi6_get_flowlabel(const struct flowi6 *fl6)
+{
+	return (__force u32)(fl6->flowlabel & IPV6_FLOWLABEL_MASK);
+}
+
 /*
  *	Prototypes exported by ipv6
  */
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 4fc1e84d77ec..6497b815e7a2 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1334,7 +1334,7 @@ __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys)
 	keys->ports.src = fl6->fl6_sport;
 	keys->ports.dst = fl6->fl6_dport;
 	keys->keyid.keyid = fl6->fl6_gre_key;
-	keys->tags.flow_label = (__force u32)fl6->flowlabel;
+	keys->tags.flow_label = flowi6_get_flowlabel(fl6);
 	keys->basic.ip_proto = fl6->flowi6_proto;
 
 	return flow_hash_from_keys(keys);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 22c4de2317d0..06593781b59a 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1981,7 +1981,7 @@ out:
 	} else {
 		keys->addrs.v6addrs.src = key_iph->saddr;
 		keys->addrs.v6addrs.dst = key_iph->daddr;
-		keys->tags.flow_label = ip6_flowinfo(key_iph);
+		keys->tags.flow_label = ip6_flowlabel(key_iph);
 		keys->basic.ip_proto = key_iph->nexthdr;
 	}
 }
@@ -2002,7 +2002,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
 		} else {
 			hash_keys.addrs.v6addrs.src = fl6->saddr;
 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
-			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
+			hash_keys.tags.flow_label = flowi6_get_flowlabel(fl6);
 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
 		}
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From 7d6446db1bcb1008b84db098f4629664cd8b06a6 Mon Sep 17 00:00:00 2001
From: Emil Tantilov <emil.s.tantilov@intel.com>
Date: Tue, 22 May 2018 09:18:46 -0700
Subject: ixgbevf: fix possible race in the reset subtask

Extend the RTNL lock in ixgbevf_reset_subtask() to protect the state bits
check in addition to the call to ixgbevf_reinit_locked().

This is to make sure that we get the most up-to-date values for the bits
and avoid a possible race when going down.

Suggested-by: Zhiping du <zhipingdu@tencent.com>
Signed-off-by: Emil Tantilov <emil.s.tantilov@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 2d5a706c3c29..59416eddd840 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -3141,15 +3141,17 @@ static void ixgbevf_reset_subtask(struct ixgbevf_adapter *adapter)
 	if (!test_and_clear_bit(__IXGBEVF_RESET_REQUESTED, &adapter->state))
 		return;
 
+	rtnl_lock();
 	/* If we're already down or resetting, just bail */
 	if (test_bit(__IXGBEVF_DOWN, &adapter->state) ||
 	    test_bit(__IXGBEVF_REMOVING, &adapter->state) ||
-	    test_bit(__IXGBEVF_RESETTING, &adapter->state))
+	    test_bit(__IXGBEVF_RESETTING, &adapter->state)) {
+		rtnl_unlock();
 		return;
+	}
 
 	adapter->tx_timeout_count++;
 
-	rtnl_lock();
 	ixgbevf_reinit_locked(adapter);
 	rtnl_unlock();
 }
-- 
cgit v1.2.3-59-g8ed1b


From a925ab48dacbc9ad470a9ca4c761601ee07e476c Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 4 Jun 2018 13:20:38 -0400
Subject: Revert "ipv6: omit traffic class when calculating flow hash"

This reverts commit 87ae68c8b4944d142447b88875c9c412c714434f.

Applied the wrong version of this fix, correct version
coming up.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h        | 5 -----
 net/core/flow_dissector.c | 2 +-
 net/ipv6/route.c          | 4 ++--
 3 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 6dcc473fbe51..798558fd1681 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -907,11 +907,6 @@ static inline __be32 ip6_make_flowinfo(unsigned int tclass, __be32 flowlabel)
 	return htonl(tclass << IPV6_TCLASS_SHIFT) | flowlabel;
 }
 
-static inline u32 flowi6_get_flowlabel(const struct flowi6 *fl6)
-{
-	return (__force u32)(fl6->flowlabel & IPV6_FLOWLABEL_MASK);
-}
-
 /*
  *	Prototypes exported by ipv6
  */
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 6497b815e7a2..4fc1e84d77ec 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1334,7 +1334,7 @@ __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys)
 	keys->ports.src = fl6->fl6_sport;
 	keys->ports.dst = fl6->fl6_dport;
 	keys->keyid.keyid = fl6->fl6_gre_key;
-	keys->tags.flow_label = flowi6_get_flowlabel(fl6);
+	keys->tags.flow_label = (__force u32)fl6->flowlabel;
 	keys->basic.ip_proto = fl6->flowi6_proto;
 
 	return flow_hash_from_keys(keys);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 06593781b59a..22c4de2317d0 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1981,7 +1981,7 @@ out:
 	} else {
 		keys->addrs.v6addrs.src = key_iph->saddr;
 		keys->addrs.v6addrs.dst = key_iph->daddr;
-		keys->tags.flow_label = ip6_flowlabel(key_iph);
+		keys->tags.flow_label = ip6_flowinfo(key_iph);
 		keys->basic.ip_proto = key_iph->nexthdr;
 	}
 }
@@ -2002,7 +2002,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
 		} else {
 			hash_keys.addrs.v6addrs.src = fl6->saddr;
 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
-			hash_keys.tags.flow_label = flowi6_get_flowlabel(fl6);
+			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
 		}
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From e9c721837da2a73798243ded051ff0ffdbc6066b Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Wed, 23 May 2018 20:08:13 +0800
Subject: ixgbe: introduce a helper to simplify code

ixgbe_dbg_reg_ops_read and ixgbe_dbg_netdev_ops_read copy-pasting
the same code except for ixgbe_dbg_netdev_ops_buf/ixgbe_dbg_reg_ops_buf,
so introduce a helper ixgbe_dbg_common_ops_read to remove redundant code.

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c | 57 +++++++++---------------
 1 file changed, 21 insertions(+), 36 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c
index 55fe8114fe99..50dfb02fa34c 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c
@@ -10,15 +10,9 @@ static struct dentry *ixgbe_dbg_root;
 
 static char ixgbe_dbg_reg_ops_buf[256] = "";
 
-/**
- * ixgbe_dbg_reg_ops_read - read for reg_ops datum
- * @filp: the opened file
- * @buffer: where to write the data for the user to read
- * @count: the size of the user's buffer
- * @ppos: file position offset
- **/
-static ssize_t ixgbe_dbg_reg_ops_read(struct file *filp, char __user *buffer,
-				    size_t count, loff_t *ppos)
+static ssize_t ixgbe_dbg_common_ops_read(struct file *filp, char __user *buffer,
+					 size_t count, loff_t *ppos,
+					 char *dbg_buf)
 {
 	struct ixgbe_adapter *adapter = filp->private_data;
 	char *buf;
@@ -29,8 +23,7 @@ static ssize_t ixgbe_dbg_reg_ops_read(struct file *filp, char __user *buffer,
 		return 0;
 
 	buf = kasprintf(GFP_KERNEL, "%s: %s\n",
-			adapter->netdev->name,
-			ixgbe_dbg_reg_ops_buf);
+			adapter->netdev->name, dbg_buf);
 	if (!buf)
 		return -ENOMEM;
 
@@ -45,6 +38,20 @@ static ssize_t ixgbe_dbg_reg_ops_read(struct file *filp, char __user *buffer,
 	return len;
 }
 
+/**
+ * ixgbe_dbg_reg_ops_read - read for reg_ops datum
+ * @filp: the opened file
+ * @buffer: where to write the data for the user to read
+ * @count: the size of the user's buffer
+ * @ppos: file position offset
+ **/
+static ssize_t ixgbe_dbg_reg_ops_read(struct file *filp, char __user *buffer,
+				      size_t count, loff_t *ppos)
+{
+	return ixgbe_dbg_common_ops_read(filp, buffer, count, ppos,
+					 ixgbe_dbg_reg_ops_buf);
+}
+
 /**
  * ixgbe_dbg_reg_ops_write - write into reg_ops datum
  * @filp: the opened file
@@ -121,33 +128,11 @@ static char ixgbe_dbg_netdev_ops_buf[256] = "";
  * @count: the size of the user's buffer
  * @ppos: file position offset
  **/
-static ssize_t ixgbe_dbg_netdev_ops_read(struct file *filp,
-					 char __user *buffer,
+static ssize_t ixgbe_dbg_netdev_ops_read(struct file *filp, char __user *buffer,
 					 size_t count, loff_t *ppos)
 {
-	struct ixgbe_adapter *adapter = filp->private_data;
-	char *buf;
-	int len;
-
-	/* don't allow partial reads */
-	if (*ppos != 0)
-		return 0;
-
-	buf = kasprintf(GFP_KERNEL, "%s: %s\n",
-			adapter->netdev->name,
-			ixgbe_dbg_netdev_ops_buf);
-	if (!buf)
-		return -ENOMEM;
-
-	if (count < strlen(buf)) {
-		kfree(buf);
-		return -ENOSPC;
-	}
-
-	len = simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
-
-	kfree(buf);
-	return len;
+	return ixgbe_dbg_common_ops_read(filp, buffer, count, ppos,
+					 ixgbe_dbg_netdev_ops_buf);
 }
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From fa1be7e01ea863e911349e30456706749518eeab Mon Sep 17 00:00:00 2001
From: Michal Kubecek <mkubecek@suse.cz>
Date: Mon, 4 Jun 2018 11:36:05 +0200
Subject: ipv6: omit traffic class when calculating flow hash

Some of the code paths calculating flow hash for IPv6 use flowlabel member
of struct flowi6 which, despite its name, encodes both flow label and
traffic class. If traffic class changes within a TCP connection (as e.g.
ssh does), ECMP route can switch between path. It's also inconsistent with
other code paths where ip6_flowlabel() (returning only flow label) is used
to feed the key.

Use only flow label everywhere, including one place where hash key is set
using ip6_flowinfo().

Fixes: 51ebd3181572 ("ipv6: add support of equal cost multipath (ECMP)")
Fixes: f70ea018da06 ("net: Add functions to get skb->hash based on flow structures")
Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h        | 5 +++++
 net/core/flow_dissector.c | 2 +-
 net/ipv6/route.c          | 4 ++--
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 798558fd1681..16475c269749 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -907,6 +907,11 @@ static inline __be32 ip6_make_flowinfo(unsigned int tclass, __be32 flowlabel)
 	return htonl(tclass << IPV6_TCLASS_SHIFT) | flowlabel;
 }
 
+static inline __be32 flowi6_get_flowlabel(const struct flowi6 *fl6)
+{
+	return fl6->flowlabel & IPV6_FLOWLABEL_MASK;
+}
+
 /*
  *	Prototypes exported by ipv6
  */
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 4fc1e84d77ec..53f96e4f7bf5 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1334,7 +1334,7 @@ __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys)
 	keys->ports.src = fl6->fl6_sport;
 	keys->ports.dst = fl6->fl6_dport;
 	keys->keyid.keyid = fl6->fl6_gre_key;
-	keys->tags.flow_label = (__force u32)fl6->flowlabel;
+	keys->tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
 	keys->basic.ip_proto = fl6->flowi6_proto;
 
 	return flow_hash_from_keys(keys);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 22c4de2317d0..ce6696acba73 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1981,7 +1981,7 @@ out:
 	} else {
 		keys->addrs.v6addrs.src = key_iph->saddr;
 		keys->addrs.v6addrs.dst = key_iph->daddr;
-		keys->tags.flow_label = ip6_flowinfo(key_iph);
+		keys->tags.flow_label = ip6_flowlabel(key_iph);
 		keys->basic.ip_proto = key_iph->nexthdr;
 	}
 }
@@ -2002,7 +2002,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
 		} else {
 			hash_keys.addrs.v6addrs.src = fl6->saddr;
 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
-			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
+			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
 		}
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From cc5b114dcf986bfd8e4c37bf65d1b7b1e5290ac6 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 28 May 2018 11:07:20 +0200
Subject: bpf, i40e: add meta data support

Add support for XDP meta data when using build skb variant of
the i40e driver. Implementation is analogous to the existing
ixgbe and ixgbevf support for meta data from 366a88fe2f40 ("bpf,
ixgbe: add meta data support") and be8333322eff ("ixgbevf: Add
support for meta data"). With the build skb variant we get
192 bytes of extra headroom which can be used for encaps or
meta data.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Tested-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c | 39 +++++++++++++++++++++++------
 1 file changed, 31 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 9b698c5acd05..105a26f447c0 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2032,6 +2032,21 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring,
 #if L1_CACHE_BYTES < 128
 	prefetch(xdp->data + L1_CACHE_BYTES);
 #endif
+	/* Note, we get here by enabling legacy-rx via:
+	 *
+	 *    ethtool --set-priv-flags <dev> legacy-rx on
+	 *
+	 * In this mode, we currently get 0 extra XDP headroom as
+	 * opposed to having legacy-rx off, where we process XDP
+	 * packets going to stack via i40e_build_skb(). The latter
+	 * provides us currently with 192 bytes of headroom.
+	 *
+	 * For i40e_construct_skb() mode it means that the
+	 * xdp->data_meta will always point to xdp->data, since
+	 * the helper cannot expand the head. Should this ever
+	 * change in future for legacy-rx mode on, then lets also
+	 * add xdp->data_meta handling here.
+	 */
 
 	/* allocate a skb to store the frags */
 	skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
@@ -2083,19 +2098,25 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring,
 				      struct i40e_rx_buffer *rx_buffer,
 				      struct xdp_buff *xdp)
 {
-	unsigned int size = xdp->data_end - xdp->data;
+	unsigned int metasize = xdp->data - xdp->data_meta;
 #if (PAGE_SIZE < 8192)
 	unsigned int truesize = i40e_rx_pg_size(rx_ring) / 2;
 #else
 	unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) +
-				SKB_DATA_ALIGN(I40E_SKB_PAD + size);
+				SKB_DATA_ALIGN(I40E_SKB_PAD +
+					       (xdp->data_end -
+						xdp->data_hard_start));
 #endif
 	struct sk_buff *skb;
 
-	/* prefetch first cache line of first page */
-	prefetch(xdp->data);
+	/* Prefetch first cache line of first page. If xdp->data_meta
+	 * is unused, this points exactly as xdp->data, otherwise we
+	 * likely have a consumer accessing first few bytes of meta
+	 * data, and then actual data.
+	 */
+	prefetch(xdp->data_meta);
 #if L1_CACHE_BYTES < 128
-	prefetch(xdp->data + L1_CACHE_BYTES);
+	prefetch(xdp->data_meta + L1_CACHE_BYTES);
 #endif
 	/* build an skb around the page buffer */
 	skb = build_skb(xdp->data_hard_start, truesize);
@@ -2103,8 +2124,10 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring,
 		return NULL;
 
 	/* update pointers within the skb to store the data */
-	skb_reserve(skb, I40E_SKB_PAD);
-	__skb_put(skb, size);
+	skb_reserve(skb, I40E_SKB_PAD + (xdp->data - xdp->data_hard_start));
+	__skb_put(skb, xdp->data_end - xdp->data);
+	if (metasize)
+		skb_metadata_set(skb, metasize);
 
 	/* buffer is used by skb, update page_offset */
 #if (PAGE_SIZE < 8192)
@@ -2341,7 +2364,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
 		if (!skb) {
 			xdp.data = page_address(rx_buffer->page) +
 				   rx_buffer->page_offset;
-			xdp_set_data_meta_invalid(&xdp);
+			xdp.data_meta = xdp.data;
 			xdp.data_hard_start = xdp.data -
 					      i40e_rx_offset(rx_ring);
 			xdp.data_end = xdp.data + size;
-- 
cgit v1.2.3-59-g8ed1b


From 88adce4ea8f96b5191df2bea76905814cc3814e2 Mon Sep 17 00:00:00 2001
From: Tony Nguyen <anthony.l.nguyen@intel.com>
Date: Wed, 30 May 2018 09:05:12 -0700
Subject: ixgbe: fix possible race in reset subtask

Similar to ixgbevf, the same possibility for race exists. Extend the RTNL
lock in ixgbe_reset_subtask() to protect the state bits; this is to make
sure that we get the most up-to-date values for the bits and avoid a
possible race when going down.

Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index ba3035c08572..dd8a3a037c2f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -7621,17 +7621,19 @@ static void ixgbe_reset_subtask(struct ixgbe_adapter *adapter)
 	if (!test_and_clear_bit(__IXGBE_RESET_REQUESTED, &adapter->state))
 		return;
 
+	rtnl_lock();
 	/* If we're already down, removing or resetting, just bail */
 	if (test_bit(__IXGBE_DOWN, &adapter->state) ||
 	    test_bit(__IXGBE_REMOVING, &adapter->state) ||
-	    test_bit(__IXGBE_RESETTING, &adapter->state))
+	    test_bit(__IXGBE_RESETTING, &adapter->state)) {
+		rtnl_unlock();
 		return;
+	}
 
 	ixgbe_dump(adapter);
 	netdev_err(adapter->netdev, "Reset adapter\n");
 	adapter->tx_timeout_count++;
 
-	rtnl_lock();
 	ixgbe_reinit_locked(adapter);
 	rtnl_unlock();
 }
-- 
cgit v1.2.3-59-g8ed1b


From f07ff01406c4f67426cb6065d1a7b2c7f0434f5a Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Sat, 2 Jun 2018 21:09:34 +0300
Subject: mlxsw: spectrum_switchdev: Postpone respin on object deletion

VLAN deletion notifications are emitted before the relevant change is
projected to bridge configuration. Thus, like with VLAN addition,
schedule SPAN respin for later.

Fixes: c520bc698647 ("mlxsw: Respin SPAN on switchdev events")
Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index 8a15ac49cb5a..e97652c40d13 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -1856,7 +1856,7 @@ static int mlxsw_sp_port_obj_del(struct net_device *dev,
 		break;
 	}
 
-	mlxsw_sp_span_respin(mlxsw_sp_port->mlxsw_sp);
+	mlxsw_sp_span_respin_schedule(mlxsw_sp_port->mlxsw_sp);
 
 	return err;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 1fc68bb7c33fbef2216f361cfc23e41984a5edb1 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Sat, 2 Jun 2018 21:09:35 +0300
Subject: mlxsw: spectrum_span: Suppress VLAN on BRIDGE_VLAN_INFO_UNTAGGED

When offloading mirroring to gretap or ip6gretap netdevices, an 802.1q
bridge is one of the soft devices permissible in the underlay when
resolving the packet path. After the packet path is resolved to a
particular bridge egress device, flags on packet VLAN determine whether
the egressed packet should be tagged.

The current logic however only ever sets the VLAN tag, never suppresses
it. Thus if there's a VLAN netdevice above the bridge that determines
the packet VLAN, that VLAN is never unset, and mirroring is configured
with VLAN tagging.

Fix by setting the packet VLAN on both branches: set to zero (for unset)
when BRIDGE_VLAN_INFO_UNTAGGED, copy the resolved VLAN (e.g. from bridge
PVID) otherwise.

Fixes: 946a11e7408e ("mlxsw: spectrum_span: Allow bridge for gretap mirror")
Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
index da3f7f527360..3d187d88cc7c 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
@@ -191,7 +191,9 @@ mlxsw_sp_span_entry_bridge_8021q(const struct net_device *br_dev,
 
 	if (br_vlan_get_info(edev, vid, &vinfo))
 		return NULL;
-	if (!(vinfo.flags & BRIDGE_VLAN_INFO_UNTAGGED))
+	if (vinfo.flags & BRIDGE_VLAN_INFO_UNTAGGED)
+		*p_vid = 0;
+	else
 		*p_vid = vid;
 	return edev;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 2a8a15526d319eed948bb38ae9f5900e2bee8565 Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@oracle.com>
Date: Wed, 30 May 2018 11:20:04 -0700
Subject: ixgbe: check ipsec ip addr against mgmt filters

Make sure we don't try to offload the decryption of an incoming
packet that should get delivered to the management engine.  This
is a corner case that will likely be very seldom seen, but could
really confuse someone if they were to hit it.

Suggested-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 88 ++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 99b170f1efd1..e1c976271bbd 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -444,6 +444,89 @@ static int ixgbe_ipsec_parse_proto_keys(struct xfrm_state *xs,
 	return 0;
 }
 
+/**
+ * ixgbe_ipsec_check_mgmt_ip - make sure there is no clash with mgmt IP filters
+ * @xs: pointer to transformer state struct
+ **/
+static int ixgbe_ipsec_check_mgmt_ip(struct xfrm_state *xs)
+{
+	struct net_device *dev = xs->xso.dev;
+	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_hw *hw = &adapter->hw;
+	u32 mfval, manc, reg;
+	int num_filters = 4;
+	bool manc_ipv4;
+	u32 bmcipval;
+	int i, j;
+
+#define MANC_EN_IPV4_FILTER      BIT(24)
+#define MFVAL_IPV4_FILTER_SHIFT  16
+#define MFVAL_IPV6_FILTER_SHIFT  24
+#define MIPAF_ARR(_m, _n)        (IXGBE_MIPAF + ((_m) * 0x10) + ((_n) * 4))
+
+#define IXGBE_BMCIP(_n)          (0x5050 + ((_n) * 4))
+#define IXGBE_BMCIPVAL           0x5060
+#define BMCIP_V4                 0x2
+#define BMCIP_V6                 0x3
+#define BMCIP_MASK               0x3
+
+	manc = IXGBE_READ_REG(hw, IXGBE_MANC);
+	manc_ipv4 = !!(manc & MANC_EN_IPV4_FILTER);
+	mfval = IXGBE_READ_REG(hw, IXGBE_MFVAL);
+	bmcipval = IXGBE_READ_REG(hw, IXGBE_BMCIPVAL);
+
+	if (xs->props.family == AF_INET) {
+		/* are there any IPv4 filters to check? */
+		if (manc_ipv4) {
+			/* the 4 ipv4 filters are all in MIPAF(3, i) */
+			for (i = 0; i < num_filters; i++) {
+				if (!(mfval & BIT(MFVAL_IPV4_FILTER_SHIFT + i)))
+					continue;
+
+				reg = IXGBE_READ_REG(hw, MIPAF_ARR(3, i));
+				if (reg == xs->id.daddr.a4)
+					return 1;
+			}
+		}
+
+		if ((bmcipval & BMCIP_MASK) == BMCIP_V4) {
+			reg = IXGBE_READ_REG(hw, IXGBE_BMCIP(3));
+			if (reg == xs->id.daddr.a4)
+				return 1;
+		}
+
+	} else {
+		/* if there are ipv4 filters, they are in the last ipv6 slot */
+		if (manc_ipv4)
+			num_filters = 3;
+
+		for (i = 0; i < num_filters; i++) {
+			if (!(mfval & BIT(MFVAL_IPV6_FILTER_SHIFT + i)))
+				continue;
+
+			for (j = 0; j < 4; j++) {
+				reg = IXGBE_READ_REG(hw, MIPAF_ARR(i, j));
+				if (reg != xs->id.daddr.a6[j])
+					break;
+			}
+			if (j == 4)   /* did we match all 4 words? */
+				return 1;
+		}
+
+		if ((bmcipval & BMCIP_MASK) == BMCIP_V6) {
+			for (j = 0; j < 4; j++) {
+				reg = IXGBE_READ_REG(hw, IXGBE_BMCIP(j));
+				if (reg != xs->id.daddr.a6[j])
+					break;
+			}
+			if (j == 4)   /* did we match all 4 words? */
+				return 1;
+		}
+	}
+
+	return 0;
+}
+
 /**
  * ixgbe_ipsec_add_sa - program device with a security association
  * @xs: pointer to transformer state struct
@@ -465,6 +548,11 @@ static int ixgbe_ipsec_add_sa(struct xfrm_state *xs)
 		return -EINVAL;
 	}
 
+	if (ixgbe_ipsec_check_mgmt_ip(xs)) {
+		netdev_err(dev, "IPsec IP addr clash with mgmt filters\n");
+		return -EINVAL;
+	}
+
 	if (xs->xso.flags & XFRM_OFFLOAD_INBOUND) {
 		struct rx_sa rsa;
 
-- 
cgit v1.2.3-59-g8ed1b


From 9a75fa5c166346fa3de35fe8020e43bc98a171d0 Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@oracle.com>
Date: Thu, 31 May 2018 14:12:18 -0700
Subject: ixgbe: fix broken ipsec Rx with proper cast on spi

Fix up a cast problem introduced by a sparse cleanup patch.  This fixes
a problem where the encrypted packets were not recognized on Rx and
subsequently dropped.

Fixes: 9cfbfa701b55 ("ixgbe: cleanup sparse warnings")
Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index e1c976271bbd..344a1f213a5f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -663,7 +663,7 @@ static int ixgbe_ipsec_add_sa(struct xfrm_state *xs)
 
 		/* hash the new entry for faster search in Rx path */
 		hash_add_rcu(ipsec->rx_sa_list, &ipsec->rx_tbl[sa_idx].hlist,
-			     (__force u64)rsa.xs->id.spi);
+			     (__force u32)rsa.xs->id.spi);
 	} else {
 		struct tx_sa tsa;
 
-- 
cgit v1.2.3-59-g8ed1b


From 232b6743e4d12d8383aaf2d632d5ed4cc377419e Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Date: Sat, 2 Jun 2018 22:37:42 +0300
Subject: sh_eth: make sh_eth_soft_swap() work on ARM

Browsing  thru the driver disassembly, I noticed that ARM gcc generated
no  code  whatsoever for sh_eth_soft_swap() while building a little-endian
kernel -- apparently __LITTLE_ENDIAN__ was not being #define'd, however
it got implicitly #define'd when building with the SH gcc (I could only
find the explicit #define __LITTLE_ENDIAN that was #include'd when building
a little-endian kernel).  Luckily, the Ether controller  only doing big-
endian DMA is encountered on the early SH771x SoCs only and all ARM SoCs
implement EDMR.DE and thus set 'sh_eth_cpu_data::hw_swap'. But anyway, we
need to fix the #ifdef inside sh_eth_soft_swap() to something that would
work on all architectures...

Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/renesas/sh_eth.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/renesas/sh_eth.h b/drivers/net/ethernet/renesas/sh_eth.h
index 5dee19b61aee..d77de7651d8f 100644
--- a/drivers/net/ethernet/renesas/sh_eth.h
+++ b/drivers/net/ethernet/renesas/sh_eth.h
@@ -562,7 +562,7 @@ struct sh_eth_private {
 
 static inline void sh_eth_soft_swap(char *src, int len)
 {
-#ifdef __LITTLE_ENDIAN__
+#ifdef __LITTLE_ENDIAN
 	u32 *p = (u32 *)src;
 	u32 *maxp;
 	maxp = p + ((len + sizeof(u32) - 1) / sizeof(u32));
-- 
cgit v1.2.3-59-g8ed1b


From bb2fa4e847d779c0c3eba78c1abbe142134dcc3d Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Date: Sat, 2 Jun 2018 22:38:56 +0300
Subject: sh_eth: uninline sh_eth_soft_swap()

sh_eth_tsu_soft_swap() is called twice by the driver, remove *inline* and
move  that function  from the header to the driver itself to let gcc decide
whether to expand it inline or not...

Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/renesas/sh_eth.c | 11 +++++++++++
 drivers/net/ethernet/renesas/sh_eth.h | 12 ------------
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index d9cadfb1bc4a..82b03f13243e 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -460,6 +460,17 @@ static u32 sh_eth_tsu_read(struct sh_eth_private *mdp, int enum_index)
 	return ioread32(mdp->tsu_addr + offset);
 }
 
+static void sh_eth_soft_swap(char *src, int len)
+{
+#ifdef __LITTLE_ENDIAN
+	u32 *p = (u32 *)src;
+	u32 *maxp = p + ((len + sizeof(u32) - 1) / sizeof(u32));
+
+	for (; p < maxp; p++)
+		*p = swab32(*p);
+#endif
+}
+
 static void sh_eth_select_mii(struct net_device *ndev)
 {
 	struct sh_eth_private *mdp = netdev_priv(ndev);
diff --git a/drivers/net/ethernet/renesas/sh_eth.h b/drivers/net/ethernet/renesas/sh_eth.h
index d77de7651d8f..726c55a82dd7 100644
--- a/drivers/net/ethernet/renesas/sh_eth.h
+++ b/drivers/net/ethernet/renesas/sh_eth.h
@@ -560,18 +560,6 @@ struct sh_eth_private {
 	unsigned wol_enabled:1;
 };
 
-static inline void sh_eth_soft_swap(char *src, int len)
-{
-#ifdef __LITTLE_ENDIAN
-	u32 *p = (u32 *)src;
-	u32 *maxp;
-	maxp = p + ((len + sizeof(u32) - 1) / sizeof(u32));
-
-	for (; p < maxp; p++)
-		*p = swab32(*p);
-#endif
-}
-
 static inline void *sh_eth_tsu_get_offset(struct sh_eth_private *mdp,
 					  int enum_index)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 1100149a23a5053de4709dc4ba14ab14bd826562 Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Date: Sat, 2 Jun 2018 22:40:16 +0300
Subject: sh_eth: use DIV_ROUND_UP() in sh_eth_soft_swap()

When initializing 'maxp' in sh_eth_soft_swap(), the buffer length needs
to be rounded  up -- that's just asking for DIV_ROUND_UP()!

Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/renesas/sh_eth.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index 82b03f13243e..e9007b613f17 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -464,7 +464,7 @@ static void sh_eth_soft_swap(char *src, int len)
 {
 #ifdef __LITTLE_ENDIAN
 	u32 *p = (u32 *)src;
-	u32 *maxp = p + ((len + sizeof(u32) - 1) / sizeof(u32));
+	u32 *maxp = p + DIV_ROUND_UP(len, sizeof(u32));
 
 	for (; p < maxp; p++)
 		*p = swab32(*p);
-- 
cgit v1.2.3-59-g8ed1b


From 34ea38ca27991466a8fff849514b4181b42ae2eb Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 4 Jun 2018 08:53:41 -0700
Subject: bpf: guard bpf_get_current_cgroup_id() with CONFIG_CGROUPS

Commit bf6fa2c893c5 ("bpf: implement bpf_get_current_cgroup_id()
helper") introduced a new helper bpf_get_current_cgroup_id().
The helper has a dependency on CONFIG_CGROUPS.

When CONFIG_CGROUPS is not defined, using the helper will result
the following verifier error:
  kernel subsystem misconfigured func bpf_get_current_cgroup_id#80
which is hard for users to interpret.
Guarding the reference to bpf_get_current_cgroup_id_proto with
CONFIG_CGROUPS will result in below better message:
  unknown func bpf_get_current_cgroup_id#80

Fixes: bf6fa2c893c5 ("bpf: implement bpf_get_current_cgroup_id() helper")
Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/trace/bpf_trace.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index e2ab5b7f29d2..0ae6829804bc 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -564,8 +564,10 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_prandom_u32_proto;
 	case BPF_FUNC_probe_read_str:
 		return &bpf_probe_read_str_proto;
+#ifdef CONFIG_CGROUPS
 	case BPF_FUNC_get_current_cgroup_id:
 		return &bpf_get_current_cgroup_id_proto;
+#endif
 	default:
 		return NULL;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 4e24f2dd516ed85fc8fd568ceae589ad0a07b7d5 Mon Sep 17 00:00:00 2001
From: Chas Williams <3chas3@gmail.com>
Date: Sat, 2 Jun 2018 17:49:53 -0400
Subject: Allow ethtool to change tun link settings

Let user space set whatever it would like to advertise for the
tun interface.  Preserve the existing defaults.

Signed-off-by: Chas Williams <3chas3@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index c94fffee5ea9..067fc9e7e3ed 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -81,6 +81,9 @@
 #include <linux/uaccess.h>
 #include <linux/proc_fs.h>
 
+static void tun_default_link_ksettings(struct net_device *dev,
+				       struct ethtool_link_ksettings *cmd);
+
 /* Uncomment to enable debugging */
 /* #define TUN_DEBUG 1 */
 
@@ -242,6 +245,7 @@ struct tun_struct {
 	struct bpf_prog __rcu *xdp_prog;
 	struct tun_prog __rcu *steering_prog;
 	struct tun_prog __rcu *filter_prog;
+	struct ethtool_link_ksettings link_ksettings;
 };
 
 struct veth {
@@ -2295,6 +2299,7 @@ static void tun_setup(struct net_device *dev)
 
 	tun->owner = INVALID_UID;
 	tun->group = INVALID_GID;
+	tun_default_link_ksettings(dev, &tun->link_ksettings);
 
 	dev->ethtool_ops = &tun_ethtool_ops;
 	dev->needs_free_netdev = true;
@@ -3326,8 +3331,8 @@ static struct miscdevice tun_miscdev = {
 
 /* ethtool interface */
 
-static int tun_get_link_ksettings(struct net_device *dev,
-				  struct ethtool_link_ksettings *cmd)
+static void tun_default_link_ksettings(struct net_device *dev,
+				       struct ethtool_link_ksettings *cmd)
 {
 	ethtool_link_ksettings_zero_link_mode(cmd, supported);
 	ethtool_link_ksettings_zero_link_mode(cmd, advertising);
@@ -3336,6 +3341,23 @@ static int tun_get_link_ksettings(struct net_device *dev,
 	cmd->base.port		= PORT_TP;
 	cmd->base.phy_address	= 0;
 	cmd->base.autoneg	= AUTONEG_DISABLE;
+}
+
+static int tun_get_link_ksettings(struct net_device *dev,
+				  struct ethtool_link_ksettings *cmd)
+{
+	struct tun_struct *tun = netdev_priv(dev);
+
+	memcpy(cmd, &tun->link_ksettings, sizeof(*cmd));
+	return 0;
+}
+
+static int tun_set_link_ksettings(struct net_device *dev,
+				  const struct ethtool_link_ksettings *cmd)
+{
+	struct tun_struct *tun = netdev_priv(dev);
+
+	memcpy(&tun->link_ksettings, cmd, sizeof(*cmd));
 	return 0;
 }
 
@@ -3406,6 +3428,7 @@ static const struct ethtool_ops tun_ethtool_ops = {
 	.get_coalesce   = tun_get_coalesce,
 	.set_coalesce   = tun_set_coalesce,
 	.get_link_ksettings = tun_get_link_ksettings,
+	.set_link_ksettings = tun_set_link_ksettings,
 };
 
 static int tun_queue_resize(struct tun_struct *tun)
-- 
cgit v1.2.3-59-g8ed1b


From 1a025028d400b23477341aa7ec2ce55f8b39b554 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sun, 3 Jun 2018 02:17:39 +0100
Subject: rxrpc: Fix handling of call quietly cancelled out on server

Sometimes an in-progress call will stop responding on the fileserver when
the fileserver quietly cancels the call with an internally marked abort
(RX_CALL_DEAD), without sending an ABORT to the client.

This causes the client's call to eventually expire from lack of incoming
packets directed its way, which currently leads to it being cancelled
locally with ETIME.  Note that it's not currently clear as to why this
happens as it's really hard to reproduce.

The rotation policy implement by kAFS, however, doesn't differentiate
between ETIME meaning we didn't get any response from the server and ETIME
meaning the call got cancelled mid-flow.  The latter leads to an oops when
fetching data as the rotation partially resets the afs_read descriptor,
which can result in a cleared page pointer being dereferenced because that
page has already been filled.

Handle this by the following means:

 (1) Set a flag on a call when we receive a packet for it.

 (2) Store the highest packet serial number so far received for a call
     (bearing in mind this may wrap).

 (3) If, when the "not received anything recently" timeout expires on a
     call, we've received at least one packet for a call and the connection
     as a whole has received packets more recently than that call, then
     cancel the call locally with ECONNRESET rather than ETIME.

     This indicates that the call was definitely in progress on the server.

 (4) In kAFS, if the rotation algorithm sees ECONNRESET rather than ETIME,
     don't try the next server, but rather abort the call.

     This avoids the oops as we don't try to reuse the afs_read struct.
     Rather, as-yet ungotten pages will be reread at a later data.

Also:

 (5) Add an rxrpc tracepoint to log detection of the call being reset.

Without this, I occasionally see an oops like the following:

    general protection fault: 0000 [#1] SMP PTI
    ...
    RIP: 0010:_copy_to_iter+0x204/0x310
    RSP: 0018:ffff8800cae0f828 EFLAGS: 00010206
    RAX: 0000000000000560 RBX: 0000000000000560 RCX: 0000000000000560
    RDX: ffff8800cae0f968 RSI: ffff8800d58b3312 RDI: 0005080000000000
    RBP: ffff8800cae0f968 R08: 0000000000000560 R09: ffff8800ca00f400
    R10: ffff8800c36f28d4 R11: 00000000000008c4 R12: ffff8800cae0f958
    R13: 0000000000000560 R14: ffff8800d58b3312 R15: 0000000000000560
    FS:  00007fdaef108080(0000) GS:ffff8800ca680000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: 00007fb28a8fa000 CR3: 00000000d2a76002 CR4: 00000000001606e0
    Call Trace:
     skb_copy_datagram_iter+0x14e/0x289
     rxrpc_recvmsg_data.isra.0+0x6f3/0xf68
     ? trace_buffer_unlock_commit_regs+0x4f/0x89
     rxrpc_kernel_recv_data+0x149/0x421
     afs_extract_data+0x1e0/0x798
     ? afs_wait_for_call_to_complete+0xc9/0x52e
     afs_deliver_fs_fetch_data+0x33a/0x5ab
     afs_deliver_to_call+0x1ee/0x5e0
     ? afs_wait_for_call_to_complete+0xc9/0x52e
     afs_wait_for_call_to_complete+0x12b/0x52e
     ? wake_up_q+0x54/0x54
     afs_make_call+0x287/0x462
     ? afs_fs_fetch_data+0x3e6/0x3ed
     ? rcu_read_lock_sched_held+0x5d/0x63
     afs_fs_fetch_data+0x3e6/0x3ed
     afs_fetch_data+0xbb/0x14a
     afs_readpages+0x317/0x40d
     __do_page_cache_readahead+0x203/0x2ba
     ? ondemand_readahead+0x3a7/0x3c1
     ondemand_readahead+0x3a7/0x3c1
     generic_file_buffered_read+0x18b/0x62f
     __vfs_read+0xdb/0xfe
     vfs_read+0xb2/0x137
     ksys_read+0x50/0x8c
     do_syscall_64+0x7d/0x1a0
     entry_SYSCALL_64_after_hwframe+0x49/0xbe

Note the weird value in RDI which is a result of trying to kmap() a NULL
page pointer.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/afs/rotate.c              |  4 ++++
 include/trace/events/rxrpc.h | 32 ++++++++++++++++++++++++++++++++
 net/rxrpc/ar-internal.h      |  2 ++
 net/rxrpc/call_event.c       |  8 +++++++-
 net/rxrpc/input.c            | 10 ++++++++--
 5 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index e065bc0768e6..1faef56b12bd 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -310,6 +310,10 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
 	case -ETIME:
 		_debug("no conn");
 		goto iterate_address;
+
+	case -ECONNRESET:
+		_debug("call reset");
+		goto failed;
 	}
 
 restart_from_beginning:
diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index 077e664ac9a2..4fff00e9da8a 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -1459,6 +1459,38 @@ TRACE_EVENT(rxrpc_tx_fail,
 		      __print_symbolic(__entry->what, rxrpc_tx_fail_traces))
 	    );
 
+TRACE_EVENT(rxrpc_call_reset,
+	    TP_PROTO(struct rxrpc_call *call),
+
+	    TP_ARGS(call),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned int,		debug_id	)
+		    __field(u32,			cid		)
+		    __field(u32,			call_id		)
+		    __field(rxrpc_serial_t,		call_serial	)
+		    __field(rxrpc_serial_t,		conn_serial	)
+		    __field(rxrpc_seq_t,		tx_seq		)
+		    __field(rxrpc_seq_t,		rx_seq		)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->debug_id = call->debug_id;
+		    __entry->cid = call->cid;
+		    __entry->call_id = call->call_id;
+		    __entry->call_serial = call->rx_serial;
+		    __entry->conn_serial = call->conn->hi_serial;
+		    __entry->tx_seq = call->tx_hard_ack;
+		    __entry->rx_seq = call->ackr_seen;
+			   ),
+
+	    TP_printk("c=%08x %08x:%08x r=%08x/%08x tx=%08x rx=%08x",
+		      __entry->debug_id,
+		      __entry->cid, __entry->call_id,
+		      __entry->call_serial, __entry->conn_serial,
+		      __entry->tx_seq, __entry->rx_seq)
+	    );
+
 #endif /* _TRACE_RXRPC_H */
 
 /* This part must be outside protection */
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 19975d2ca9a2..b2393113a251 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -477,6 +477,7 @@ enum rxrpc_call_flag {
 	RXRPC_CALL_PINGING,		/* Ping in process */
 	RXRPC_CALL_RETRANS_TIMEOUT,	/* Retransmission due to timeout occurred */
 	RXRPC_CALL_BEGAN_RX_TIMER,	/* We began the expect_rx_by timer */
+	RXRPC_CALL_RX_HEARD,		/* The peer responded at least once to this call */
 };
 
 /*
@@ -624,6 +625,7 @@ struct rxrpc_call {
 						 */
 	rxrpc_seq_t		rx_top;		/* Highest Rx slot allocated. */
 	rxrpc_seq_t		rx_expect_next;	/* Expected next packet sequence number */
+	rxrpc_serial_t		rx_serial;	/* Highest serial received for this call */
 	u8			rx_winsize;	/* Size of Rx window */
 	u8			tx_winsize;	/* Maximum size of Tx window */
 	bool			tx_phase;	/* T if transmission phase, F if receive phase */
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index 6e0d788b4dc4..20210418904b 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -392,7 +392,13 @@ recheck_state:
 
 	/* Process events */
 	if (test_and_clear_bit(RXRPC_CALL_EV_EXPIRED, &call->events)) {
-		rxrpc_abort_call("EXP", call, 0, RX_USER_ABORT, -ETIME);
+		if (test_bit(RXRPC_CALL_RX_HEARD, &call->flags) &&
+		    (int)call->conn->hi_serial - (int)call->rx_serial > 0) {
+			trace_rxrpc_call_reset(call);
+			rxrpc_abort_call("EXP", call, 0, RX_USER_ABORT, -ECONNRESET);
+		} else {
+			rxrpc_abort_call("EXP", call, 0, RX_USER_ABORT, -ETIME);
+		}
 		set_bit(RXRPC_CALL_EV_ABORT, &call->events);
 		goto recheck_state;
 	}
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index b5fd6381313d..608d078a4981 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -1278,8 +1278,14 @@ void rxrpc_data_ready(struct sock *udp_sk)
 			call = NULL;
 		}
 
-		if (call && sp->hdr.serviceId != call->service_id)
-			call->service_id = sp->hdr.serviceId;
+		if (call) {
+			if (sp->hdr.serviceId != call->service_id)
+				call->service_id = sp->hdr.serviceId;
+			if ((int)sp->hdr.serial - (int)call->rx_serial > 0)
+				call->rx_serial = sp->hdr.serial;
+			if (!test_bit(RXRPC_CALL_RX_HEARD, &call->flags))
+				set_bit(RXRPC_CALL_RX_HEARD, &call->flags);
+		}
 	} else {
 		skew = 0;
 		call = NULL;
-- 
cgit v1.2.3-59-g8ed1b


From 40434a670f66f797b3a40603a286def35337dd4f Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Sun, 3 Jun 2018 10:40:15 +0800
Subject: net: chelsio: Use zeroing memory allocator instead of
 allocator/memset

Use dma_zalloc_coherent for allocating zeroed
memory and remove unnecessary memset function.

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb3/sge.c   | 3 +--
 drivers/net/ethernet/chelsio/cxgb4/sge.c   | 3 +--
 drivers/net/ethernet/chelsio/cxgb4vf/sge.c | 7 +------
 3 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb3/sge.c b/drivers/net/ethernet/chelsio/cxgb3/sge.c
index e988caa797cb..20b6e1b3f5e3 100644
--- a/drivers/net/ethernet/chelsio/cxgb3/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb3/sge.c
@@ -620,7 +620,7 @@ static void *alloc_ring(struct pci_dev *pdev, size_t nelem, size_t elem_size,
 {
 	size_t len = nelem * elem_size;
 	void *s = NULL;
-	void *p = dma_alloc_coherent(&pdev->dev, len, phys, GFP_KERNEL);
+	void *p = dma_zalloc_coherent(&pdev->dev, len, phys, GFP_KERNEL);
 
 	if (!p)
 		return NULL;
@@ -633,7 +633,6 @@ static void *alloc_ring(struct pci_dev *pdev, size_t nelem, size_t elem_size,
 		}
 		*(void **)metadata = s;
 	}
-	memset(p, 0, len);
 	return p;
 }
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c
index 276f22357f81..7a271feec5e7 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c
@@ -694,7 +694,7 @@ static void *alloc_ring(struct device *dev, size_t nelem, size_t elem_size,
 {
 	size_t len = nelem * elem_size + stat_size;
 	void *s = NULL;
-	void *p = dma_alloc_coherent(dev, len, phys, GFP_KERNEL);
+	void *p = dma_zalloc_coherent(dev, len, phys, GFP_KERNEL);
 
 	if (!p)
 		return NULL;
@@ -708,7 +708,6 @@ static void *alloc_ring(struct device *dev, size_t nelem, size_t elem_size,
 	}
 	if (metadata)
 		*(void **)metadata = s;
-	memset(p, 0, len);
 	return p;
 }
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
index dfce5df7538e..3007e1ac1e61 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
@@ -756,7 +756,7 @@ static void *alloc_ring(struct device *dev, size_t nelem, size_t hwsize,
 	 * Allocate the hardware ring and PCI DMA bus address space for said.
 	 */
 	size_t hwlen = nelem * hwsize + stat_size;
-	void *hwring = dma_alloc_coherent(dev, hwlen, busaddrp, GFP_KERNEL);
+	void *hwring = dma_zalloc_coherent(dev, hwlen, busaddrp, GFP_KERNEL);
 
 	if (!hwring)
 		return NULL;
@@ -776,11 +776,6 @@ static void *alloc_ring(struct device *dev, size_t nelem, size_t hwsize,
 		*(void **)swringp = swring;
 	}
 
-	/*
-	 * Zero out the hardware ring and return its address as our function
-	 * value.
-	 */
-	memset(hwring, 0, hwlen);
 	return hwring;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 1f4c74132140c184b8cbcf0a7298beda1d3681de Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sat, 2 Jun 2018 21:40:19 -0700
Subject: net: skbuff.h: drop unneeded <linux/slab.h>

<linux/skbuff.h> does not use nor need <linux/slab.h>, so drop this
header file from skbuff.h.

<linux/skbuff.h> is currently #included in around 1200 C source and
header files, making it the 31st most-used header file.

Build tested [allmodconfig] on 20 arch-es.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 693564a9a979..164cdedf6012 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -854,8 +854,6 @@ struct sk_buff {
 /*
  *	Handling routines are only of interest to the kernel
  */
-#include <linux/slab.h>
-
 
 #define SKB_ALLOC_FCLONE	0x01
 #define SKB_ALLOC_RX		0x02
-- 
cgit v1.2.3-59-g8ed1b


From de9dc650f05f5c427a623797342bb2870ceedd38 Mon Sep 17 00:00:00 2001
From: Paul Blakey <paulb@mellanox.com>
Date: Sun, 3 Jun 2018 10:06:13 +0300
Subject: cls_flower: Fix missing free of rhashtable

When destroying the instance, destroy the head rhashtable.

Fixes: 05cd271fd61a ("cls_flower: Support multiple masks per priority")
Reported-by: Vlad Buslov <vladbu@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_flower.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 3786feab0b83..159efd98ee9a 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -326,6 +326,8 @@ static void fl_destroy_sleepable(struct work_struct *work)
 	struct cls_fl_head *head = container_of(to_rcu_work(work),
 						struct cls_fl_head,
 						rwork);
+
+	rhashtable_destroy(&head->ht);
 	kfree(head);
 	module_put(THIS_MODULE);
 }
-- 
cgit v1.2.3-59-g8ed1b


From f6521c587a586de5fdafb0a322072e5ff67f8e15 Mon Sep 17 00:00:00 2001
From: Paul Blakey <paulb@mellanox.com>
Date: Sun, 3 Jun 2018 10:06:14 +0300
Subject: cls_flower: Fix comparing of old filter mask with new filter

We incorrectly compare the mask and the result is that we can't modify
an already existing rule.

Fix that by comparing correctly.

Fixes: 05cd271fd61a ("cls_flower: Support multiple masks per priority")
Reported-by: Vlad Buslov <vladbu@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_flower.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 159efd98ee9a..2b5be42a9f1c 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -877,7 +877,7 @@ static int fl_check_assign_mask(struct cls_fl_head *head,
 			return PTR_ERR(newmask);
 
 		fnew->mask = newmask;
-	} else if (fold && fold->mask == fnew->mask) {
+	} else if (fold && fold->mask != fnew->mask) {
 		return -EINVAL;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 13ce3bc9c1b43566a31d3edc857a5950b2e77bd8 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Sun, 3 Jun 2018 16:10:01 +0800
Subject: net: gemini: fix spelling mistake: "it" -> "is"

Trivial fix to spelling mistake in gemini dev_warn message

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/cortina/gemini.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c
index bd3f6e4d1341..ff9eb45f67f8 100644
--- a/drivers/net/ethernet/cortina/gemini.c
+++ b/drivers/net/ethernet/cortina/gemini.c
@@ -539,7 +539,7 @@ static int gmac_setup_txqs(struct net_device *netdev)
 	}
 
 	if (port->txq_dma_base & ~DMA_Q_BASE_MASK) {
-		dev_warn(geth->dev, "TX queue base it not aligned\n");
+		dev_warn(geth->dev, "TX queue base is not aligned\n");
 		kfree(skb_tab);
 		return -ENOMEM;
 	}
@@ -680,7 +680,7 @@ static int gmac_setup_rxq(struct net_device *netdev)
 	if (!port->rxq_ring)
 		return -ENOMEM;
 	if (port->rxq_dma_base & ~NONTOE_QHDR0_BASE_MASK) {
-		dev_warn(geth->dev, "RX queue base it not aligned\n");
+		dev_warn(geth->dev, "RX queue base is not aligned\n");
 		return -ENOMEM;
 	}
 
@@ -905,7 +905,7 @@ static int geth_setup_freeq(struct gemini_ethernet *geth)
 	if (!geth->freeq_ring)
 		return -ENOMEM;
 	if (geth->freeq_dma_base & ~DMA_Q_BASE_MASK) {
-		dev_warn(geth->dev, "queue ring base it not aligned\n");
+		dev_warn(geth->dev, "queue ring base is not aligned\n");
 		goto err_freeq;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 6dc5aa212321c87a79746980eb258912bcf352ba Mon Sep 17 00:00:00 2001
From: Varsha Rao <rvarsha016@gmail.com>
Date: Sun, 3 Jun 2018 17:19:04 +0530
Subject: net: ethernet: bnx2: Remove extra parentheses

The following coccinelle script removes extra parentheses to fix the
clang warning of extraneous parentheses.

@disable paren@
identifier i;
expression e;
statement s;
@@
if (
-(i == e)
+i == e
 )
s

Suggested-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Signed-off-by: Varsha Rao <rvarsha016@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnx2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c
index 9ffc4a8c5fc7..2306523778d4 100644
--- a/drivers/net/ethernet/broadcom/bnx2.c
+++ b/drivers/net/ethernet/broadcom/bnx2.c
@@ -3285,7 +3285,7 @@ next_rx:
 		sw_cons = BNX2_NEXT_RX_BD(sw_cons);
 		sw_prod = BNX2_NEXT_RX_BD(sw_prod);
 
-		if ((rx_pkt == budget))
+		if (rx_pkt == budget)
 			break;
 
 		/* Refresh hw_cons to see if there is new work */
-- 
cgit v1.2.3-59-g8ed1b


From b8aac410b76058bc00fff9398d273566fac921cf Mon Sep 17 00:00:00 2001
From: Varsha Rao <rvarsha016@gmail.com>
Date: Sun, 3 Jun 2018 17:19:52 +0530
Subject: net: ethernet: bnx2: Replace NULL comparison

This patch fixes the checkpatch issue of NULL comparison. Replace x == NULL
with !x, by using the following coccinelle script:

@disable is_null@
expression e;
@@
-e==NULL
+!e

Signed-off-by: Varsha Rao <rvarsha016@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnx2.c | 42 ++++++++++++++++++------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c
index 2306523778d4..3853296d78c1 100644
--- a/drivers/net/ethernet/broadcom/bnx2.c
+++ b/drivers/net/ethernet/broadcom/bnx2.c
@@ -384,7 +384,7 @@ static int bnx2_register_cnic(struct net_device *dev, struct cnic_ops *ops,
 	struct bnx2 *bp = netdev_priv(dev);
 	struct cnic_eth_dev *cp = &bp->cnic_eth_dev;
 
-	if (ops == NULL)
+	if (!ops)
 		return -EINVAL;
 
 	if (cp->drv_state & CNIC_DRV_STATE_REGD)
@@ -755,13 +755,13 @@ bnx2_alloc_tx_mem(struct bnx2 *bp)
 		struct bnx2_tx_ring_info *txr = &bnapi->tx_ring;
 
 		txr->tx_buf_ring = kzalloc(SW_TXBD_RING_SIZE, GFP_KERNEL);
-		if (txr->tx_buf_ring == NULL)
+		if (!txr->tx_buf_ring)
 			return -ENOMEM;
 
 		txr->tx_desc_ring =
 			dma_alloc_coherent(&bp->pdev->dev, TXBD_RING_SIZE,
 					   &txr->tx_desc_mapping, GFP_KERNEL);
-		if (txr->tx_desc_ring == NULL)
+		if (!txr->tx_desc_ring)
 			return -ENOMEM;
 	}
 	return 0;
@@ -779,7 +779,7 @@ bnx2_alloc_rx_mem(struct bnx2 *bp)
 
 		rxr->rx_buf_ring =
 			vzalloc(SW_RXBD_RING_SIZE * bp->rx_max_ring);
-		if (rxr->rx_buf_ring == NULL)
+		if (!rxr->rx_buf_ring)
 			return -ENOMEM;
 
 		for (j = 0; j < bp->rx_max_ring; j++) {
@@ -788,7 +788,7 @@ bnx2_alloc_rx_mem(struct bnx2 *bp)
 						   RXBD_RING_SIZE,
 						   &rxr->rx_desc_mapping[j],
 						   GFP_KERNEL);
-			if (rxr->rx_desc_ring[j] == NULL)
+			if (!rxr->rx_desc_ring[j])
 				return -ENOMEM;
 
 		}
@@ -796,7 +796,7 @@ bnx2_alloc_rx_mem(struct bnx2 *bp)
 		if (bp->rx_pg_ring_size) {
 			rxr->rx_pg_ring = vzalloc(SW_RXPG_RING_SIZE *
 						  bp->rx_max_pg_ring);
-			if (rxr->rx_pg_ring == NULL)
+			if (!rxr->rx_pg_ring)
 				return -ENOMEM;
 
 		}
@@ -807,7 +807,7 @@ bnx2_alloc_rx_mem(struct bnx2 *bp)
 						   RXBD_RING_SIZE,
 						   &rxr->rx_pg_desc_mapping[j],
 						   GFP_KERNEL);
-			if (rxr->rx_pg_desc_ring[j] == NULL)
+			if (!rxr->rx_pg_desc_ring[j])
 				return -ENOMEM;
 
 		}
@@ -845,7 +845,7 @@ bnx2_alloc_stats_blk(struct net_device *dev)
 				sizeof(struct statistics_block);
 	status_blk = dma_zalloc_coherent(&bp->pdev->dev, bp->status_stats_size,
 					 &bp->status_blk_mapping, GFP_KERNEL);
-	if (status_blk == NULL)
+	if (!status_blk)
 		return -ENOMEM;
 
 	bp->status_blk = status_blk;
@@ -914,7 +914,7 @@ bnx2_alloc_mem(struct bnx2 *bp)
 						BNX2_PAGE_SIZE,
 						&bp->ctx_blk_mapping[i],
 						GFP_KERNEL);
-			if (bp->ctx_blk[i] == NULL)
+			if (!bp->ctx_blk[i])
 				goto alloc_mem_err;
 		}
 	}
@@ -2667,7 +2667,7 @@ bnx2_alloc_bad_rbuf(struct bnx2 *bp)
 	u32 val;
 
 	good_mbuf = kmalloc(512 * sizeof(u16), GFP_KERNEL);
-	if (good_mbuf == NULL)
+	if (!good_mbuf)
 		return -ENOMEM;
 
 	BNX2_WR(bp, BNX2_MISC_ENABLE_SET_BITS,
@@ -3225,7 +3225,7 @@ bnx2_rx_int(struct bnx2 *bp, struct bnx2_napi *bnapi, int budget)
 
 		if (len <= bp->rx_copy_thresh) {
 			skb = netdev_alloc_skb(bp->dev, len + 6);
-			if (skb == NULL) {
+			if (!skb) {
 				bnx2_reuse_rx_data(bp, rxr, data, sw_ring_cons,
 						  sw_ring_prod);
 				goto next_rx;
@@ -4561,7 +4561,7 @@ bnx2_nvram_write(struct bnx2 *bp, u32 offset, u8 *data_buf,
 
 	if (align_start || align_end) {
 		align_buf = kmalloc(len32, GFP_KERNEL);
-		if (align_buf == NULL)
+		if (!align_buf)
 			return -ENOMEM;
 		if (align_start) {
 			memcpy(align_buf, start, 4);
@@ -4575,7 +4575,7 @@ bnx2_nvram_write(struct bnx2 *bp, u32 offset, u8 *data_buf,
 
 	if (!(bp->flash_info->flags & BNX2_NV_BUFFERED)) {
 		flash_buffer = kmalloc(264, GFP_KERNEL);
-		if (flash_buffer == NULL) {
+		if (!flash_buffer) {
 			rc = -ENOMEM;
 			goto nvram_write_end;
 		}
@@ -5440,7 +5440,7 @@ bnx2_free_tx_skbs(struct bnx2 *bp)
 		struct bnx2_tx_ring_info *txr = &bnapi->tx_ring;
 		int j;
 
-		if (txr->tx_buf_ring == NULL)
+		if (!txr->tx_buf_ring)
 			continue;
 
 		for (j = 0; j < BNX2_TX_DESC_CNT; ) {
@@ -5448,7 +5448,7 @@ bnx2_free_tx_skbs(struct bnx2 *bp)
 			struct sk_buff *skb = tx_buf->skb;
 			int k, last;
 
-			if (skb == NULL) {
+			if (!skb) {
 				j = BNX2_NEXT_TX_BD(j);
 				continue;
 			}
@@ -5485,14 +5485,14 @@ bnx2_free_rx_skbs(struct bnx2 *bp)
 		struct bnx2_rx_ring_info *rxr = &bnapi->rx_ring;
 		int j;
 
-		if (rxr->rx_buf_ring == NULL)
+		if (!rxr->rx_buf_ring)
 			return;
 
 		for (j = 0; j < bp->rx_max_ring_idx; j++) {
 			struct bnx2_sw_bd *rx_buf = &rxr->rx_buf_ring[j];
 			u8 *data = rx_buf->data;
 
-			if (data == NULL)
+			if (!data)
 				continue;
 
 			dma_unmap_single(&bp->pdev->dev,
@@ -6826,7 +6826,7 @@ bnx2_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *net_stats)
 {
 	struct bnx2 *bp = netdev_priv(dev);
 
-	if (bp->stats_blk == NULL)
+	if (!bp->stats_blk)
 		return;
 
 	net_stats->rx_packets =
@@ -7217,7 +7217,7 @@ bnx2_get_eeprom_len(struct net_device *dev)
 {
 	struct bnx2 *bp = netdev_priv(dev);
 
-	if (bp->flash_info == NULL)
+	if (!bp->flash_info)
 		return 0;
 
 	return (int) bp->flash_size;
@@ -7678,7 +7678,7 @@ bnx2_get_ethtool_stats(struct net_device *dev,
 	u32 *temp_stats = (u32 *) bp->temp_stats_blk;
 	u8 *stats_len_arr = NULL;
 
-	if (hw_stats == NULL) {
+	if (!hw_stats) {
 		memset(buf, 0, sizeof(u64) * BNX2_NUM_STATS);
 		return;
 	}
@@ -8121,7 +8121,7 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev)
 	bp->temp_stats_blk =
 		kzalloc(sizeof(struct statistics_block), GFP_KERNEL);
 
-	if (bp->temp_stats_blk == NULL) {
+	if (!bp->temp_stats_blk) {
 		rc = -ENOMEM;
 		goto err_out;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 39dbc646fd2c67ee9b71450ce172cbd714d4e7fb Mon Sep 17 00:00:00 2001
From: Yuval Bason <yuval.bason@cavium.com>
Date: Sun, 3 Jun 2018 19:13:07 +0300
Subject: qed: Add srq core support for RoCE and iWARP

This patch adds support for configuring SRQ and provides the necessary
APIs for rdma upper layer driver (qedr) to enable the SRQ feature.

Signed-off-by: Michal Kalderon <michal.kalderon@cavium.com>
Signed-off-by: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: Yuval Bason <yuval.bason@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_cxt.c   |   5 +-
 drivers/net/ethernet/qlogic/qed/qed_cxt.h   |   1 +
 drivers/net/ethernet/qlogic/qed/qed_hsi.h   |   2 +
 drivers/net/ethernet/qlogic/qed/qed_iwarp.c |  23 ++++
 drivers/net/ethernet/qlogic/qed/qed_main.c  |   2 +
 drivers/net/ethernet/qlogic/qed/qed_rdma.c  | 178 +++++++++++++++++++++++++++-
 drivers/net/ethernet/qlogic/qed/qed_rdma.h  |   2 +
 drivers/net/ethernet/qlogic/qed/qed_roce.c  |  17 ++-
 include/linux/qed/qed_rdma_if.h             |  12 +-
 9 files changed, 234 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.c b/drivers/net/ethernet/qlogic/qed/qed_cxt.c
index 820b226d6ff8..7ed6aa0521e1 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_cxt.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.c
@@ -47,6 +47,7 @@
 #include "qed_hsi.h"
 #include "qed_hw.h"
 #include "qed_init_ops.h"
+#include "qed_rdma.h"
 #include "qed_reg_addr.h"
 #include "qed_sriov.h"
 
@@ -426,7 +427,7 @@ static void qed_cxt_set_srq_count(struct qed_hwfn *p_hwfn, u32 num_srqs)
 	p_mgr->srq_count = num_srqs;
 }
 
-static u32 qed_cxt_get_srq_count(struct qed_hwfn *p_hwfn)
+u32 qed_cxt_get_srq_count(struct qed_hwfn *p_hwfn)
 {
 	struct qed_cxt_mngr *p_mgr = p_hwfn->p_cxt_mngr;
 
@@ -2071,7 +2072,7 @@ static void qed_rdma_set_pf_params(struct qed_hwfn *p_hwfn,
 	u32 num_cons, num_qps, num_srqs;
 	enum protocol_type proto;
 
-	num_srqs = min_t(u32, 32 * 1024, p_params->num_srqs);
+	num_srqs = min_t(u32, QED_RDMA_MAX_SRQS, p_params->num_srqs);
 
 	if (p_hwfn->mcp_info->func_info.protocol == QED_PCI_ETH_RDMA) {
 		DP_NOTICE(p_hwfn,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.h b/drivers/net/ethernet/qlogic/qed/qed_cxt.h
index a4e95869889f..758a8b4c0de8 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_cxt.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.h
@@ -235,6 +235,7 @@ u32 qed_cxt_get_proto_tid_count(struct qed_hwfn *p_hwfn,
 				enum protocol_type type);
 u32 qed_cxt_get_proto_cid_start(struct qed_hwfn *p_hwfn,
 				enum protocol_type type);
+u32 qed_cxt_get_srq_count(struct qed_hwfn *p_hwfn);
 int qed_cxt_free_proto_ilt(struct qed_hwfn *p_hwfn, enum protocol_type proto);
 
 #define QED_CTX_WORKING_MEM 0
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index beba9308011b..b9704be53537 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -9725,6 +9725,8 @@ enum iwarp_eqe_async_opcode {
 	IWARP_EVENT_TYPE_ASYNC_EXCEPTION_DETECTED,
 	IWARP_EVENT_TYPE_ASYNC_QP_IN_ERROR_STATE,
 	IWARP_EVENT_TYPE_ASYNC_CQ_OVERFLOW,
+	IWARP_EVENT_TYPE_ASYNC_SRQ_EMPTY,
+	IWARP_EVENT_TYPE_ASYNC_SRQ_LIMIT,
 	MAX_IWARP_EQE_ASYNC_OPCODE
 };
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
index 2a2b1018ed1d..474e6cff5b97 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
@@ -271,6 +271,8 @@ int qed_iwarp_create_qp(struct qed_hwfn *p_hwfn,
 	p_ramrod->sq_num_pages = qp->sq_num_pages;
 	p_ramrod->rq_num_pages = qp->rq_num_pages;
 
+	p_ramrod->srq_id.srq_idx = cpu_to_le16(qp->srq_id);
+	p_ramrod->srq_id.opaque_fid = cpu_to_le16(p_hwfn->hw_info.opaque_fid);
 	p_ramrod->qp_handle_for_cqe.hi = cpu_to_le32(qp->qp_handle.hi);
 	p_ramrod->qp_handle_for_cqe.lo = cpu_to_le32(qp->qp_handle.lo);
 
@@ -3004,8 +3006,11 @@ static int qed_iwarp_async_event(struct qed_hwfn *p_hwfn,
 				 union event_ring_data *data,
 				 u8 fw_return_code)
 {
+	struct qed_rdma_events events = p_hwfn->p_rdma_info->events;
 	struct regpair *fw_handle = &data->rdma_data.async_handle;
 	struct qed_iwarp_ep *ep = NULL;
+	u16 srq_offset;
+	u16 srq_id;
 	u16 cid;
 
 	ep = (struct qed_iwarp_ep *)(uintptr_t)HILO_64(fw_handle->hi,
@@ -3067,6 +3072,24 @@ static int qed_iwarp_async_event(struct qed_hwfn *p_hwfn,
 		qed_iwarp_cid_cleaned(p_hwfn, cid);
 
 		break;
+	case IWARP_EVENT_TYPE_ASYNC_SRQ_EMPTY:
+		DP_NOTICE(p_hwfn, "IWARP_EVENT_TYPE_ASYNC_SRQ_EMPTY\n");
+		srq_offset = p_hwfn->p_rdma_info->srq_id_offset;
+		/* FW assigns value that is no greater than u16 */
+		srq_id = ((u16)le32_to_cpu(fw_handle->lo)) - srq_offset;
+		events.affiliated_event(events.context,
+					QED_IWARP_EVENT_SRQ_EMPTY,
+					&srq_id);
+		break;
+	case IWARP_EVENT_TYPE_ASYNC_SRQ_LIMIT:
+		DP_NOTICE(p_hwfn, "IWARP_EVENT_TYPE_ASYNC_SRQ_LIMIT\n");
+		srq_offset = p_hwfn->p_rdma_info->srq_id_offset;
+		/* FW assigns value that is no greater than u16 */
+		srq_id = ((u16)le32_to_cpu(fw_handle->lo)) - srq_offset;
+		events.affiliated_event(events.context,
+					QED_IWARP_EVENT_SRQ_LIMIT,
+					&srq_id);
+		break;
 	case IWARP_EVENT_TYPE_ASYNC_CQ_OVERFLOW:
 		DP_NOTICE(p_hwfn, "IWARP_EVENT_TYPE_ASYNC_CQ_OVERFLOW\n");
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 68c4399ffd50..b04d57ca5176 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -64,6 +64,7 @@
 
 #define QED_ROCE_QPS			(8192)
 #define QED_ROCE_DPIS			(8)
+#define QED_RDMA_SRQS                   QED_ROCE_QPS
 
 static char version[] =
 	"QLogic FastLinQ 4xxxx Core Module qed " DRV_MODULE_VERSION "\n";
@@ -922,6 +923,7 @@ static void qed_update_pf_params(struct qed_dev *cdev,
 	if (IS_ENABLED(CONFIG_QED_RDMA)) {
 		params->rdma_pf_params.num_qps = QED_ROCE_QPS;
 		params->rdma_pf_params.min_dpis = QED_ROCE_DPIS;
+		params->rdma_pf_params.num_srqs = QED_RDMA_SRQS;
 		/* divide by 3 the MRs to avoid MF ILT overflow */
 		params->rdma_pf_params.gl_pi = QED_ROCE_PROTOCOL_INDEX;
 	}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.c b/drivers/net/ethernet/qlogic/qed/qed_rdma.c
index a411f9c702a1..b8705107a93a 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_rdma.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.c
@@ -259,15 +259,29 @@ static int qed_rdma_alloc(struct qed_hwfn *p_hwfn,
 		goto free_cid_map;
 	}
 
+	/* Allocate bitmap for srqs */
+	p_rdma_info->num_srqs = qed_cxt_get_srq_count(p_hwfn);
+	rc = qed_rdma_bmap_alloc(p_hwfn, &p_rdma_info->srq_map,
+				 p_rdma_info->num_srqs, "SRQ");
+	if (rc) {
+		DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
+			   "Failed to allocate srq bitmap, rc = %d\n", rc);
+		goto free_real_cid_map;
+	}
+
 	if (QED_IS_IWARP_PERSONALITY(p_hwfn))
 		rc = qed_iwarp_alloc(p_hwfn);
 
 	if (rc)
-		goto free_cid_map;
+		goto free_srq_map;
 
 	DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Allocation successful\n");
 	return 0;
 
+free_srq_map:
+	kfree(p_rdma_info->srq_map.bitmap);
+free_real_cid_map:
+	kfree(p_rdma_info->real_cid_map.bitmap);
 free_cid_map:
 	kfree(p_rdma_info->cid_map.bitmap);
 free_tid_map:
@@ -351,6 +365,8 @@ static void qed_rdma_resc_free(struct qed_hwfn *p_hwfn)
 	qed_rdma_bmap_free(p_hwfn, &p_hwfn->p_rdma_info->cq_map, 1);
 	qed_rdma_bmap_free(p_hwfn, &p_hwfn->p_rdma_info->toggle_bits, 0);
 	qed_rdma_bmap_free(p_hwfn, &p_hwfn->p_rdma_info->tid_map, 1);
+	qed_rdma_bmap_free(p_hwfn, &p_hwfn->p_rdma_info->srq_map, 1);
+	qed_rdma_bmap_free(p_hwfn, &p_hwfn->p_rdma_info->real_cid_map, 1);
 
 	kfree(p_rdma_info->port);
 	kfree(p_rdma_info->dev);
@@ -431,6 +447,12 @@ static void qed_rdma_init_devinfo(struct qed_hwfn *p_hwfn,
 	if (cdev->rdma_max_sge)
 		dev->max_sge = min_t(u32, cdev->rdma_max_sge, dev->max_sge);
 
+	dev->max_srq_sge = QED_RDMA_MAX_SGE_PER_SRQ_WQE;
+	if (p_hwfn->cdev->rdma_max_srq_sge) {
+		dev->max_srq_sge = min_t(u32,
+					 p_hwfn->cdev->rdma_max_srq_sge,
+					 dev->max_srq_sge);
+	}
 	dev->max_inline = ROCE_REQ_MAX_INLINE_DATA_SIZE;
 
 	dev->max_inline = (cdev->rdma_max_inline) ?
@@ -474,6 +496,8 @@ static void qed_rdma_init_devinfo(struct qed_hwfn *p_hwfn,
 	dev->max_mr_mw_fmr_size = dev->max_mr_mw_fmr_pbl * PAGE_SIZE;
 	dev->max_pkey = QED_RDMA_MAX_P_KEY;
 
+	dev->max_srq = p_hwfn->p_rdma_info->num_srqs;
+	dev->max_srq_wr = QED_RDMA_MAX_SRQ_WQE_ELEM;
 	dev->max_qp_resp_rd_atomic_resc = RDMA_RING_PAGE_SIZE /
 					  (RDMA_RESP_RD_ATOMIC_ELM_SIZE * 2);
 	dev->max_qp_req_rd_atomic_resc = RDMA_RING_PAGE_SIZE /
@@ -1628,6 +1652,155 @@ static void *qed_rdma_get_rdma_ctx(struct qed_dev *cdev)
 	return QED_LEADING_HWFN(cdev);
 }
 
+static int qed_rdma_modify_srq(void *rdma_cxt,
+			       struct qed_rdma_modify_srq_in_params *in_params)
+{
+	struct rdma_srq_modify_ramrod_data *p_ramrod;
+	struct qed_sp_init_data init_data = {};
+	struct qed_hwfn *p_hwfn = rdma_cxt;
+	struct qed_spq_entry *p_ent;
+	u16 opaque_fid;
+	int rc;
+
+	init_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
+	init_data.comp_mode = QED_SPQ_MODE_EBLOCK;
+
+	rc = qed_sp_init_request(p_hwfn, &p_ent,
+				 RDMA_RAMROD_MODIFY_SRQ,
+				 p_hwfn->p_rdma_info->proto, &init_data);
+	if (rc)
+		return rc;
+
+	p_ramrod = &p_ent->ramrod.rdma_modify_srq;
+	p_ramrod->srq_id.srq_idx = cpu_to_le16(in_params->srq_id);
+	opaque_fid = p_hwfn->hw_info.opaque_fid;
+	p_ramrod->srq_id.opaque_fid = cpu_to_le16(opaque_fid);
+	p_ramrod->wqe_limit = cpu_to_le32(in_params->wqe_limit);
+
+	rc = qed_spq_post(p_hwfn, p_ent, NULL);
+	if (rc)
+		return rc;
+
+	DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "modified SRQ id = %x",
+		   in_params->srq_id);
+
+	return rc;
+}
+
+static int
+qed_rdma_destroy_srq(void *rdma_cxt,
+		     struct qed_rdma_destroy_srq_in_params *in_params)
+{
+	struct rdma_srq_destroy_ramrod_data *p_ramrod;
+	struct qed_sp_init_data init_data = {};
+	struct qed_hwfn *p_hwfn = rdma_cxt;
+	struct qed_spq_entry *p_ent;
+	struct qed_bmap *bmap;
+	u16 opaque_fid;
+	int rc;
+
+	opaque_fid = p_hwfn->hw_info.opaque_fid;
+
+	init_data.opaque_fid = opaque_fid;
+	init_data.comp_mode = QED_SPQ_MODE_EBLOCK;
+
+	rc = qed_sp_init_request(p_hwfn, &p_ent,
+				 RDMA_RAMROD_DESTROY_SRQ,
+				 p_hwfn->p_rdma_info->proto, &init_data);
+	if (rc)
+		return rc;
+
+	p_ramrod = &p_ent->ramrod.rdma_destroy_srq;
+	p_ramrod->srq_id.srq_idx = cpu_to_le16(in_params->srq_id);
+	p_ramrod->srq_id.opaque_fid = cpu_to_le16(opaque_fid);
+
+	rc = qed_spq_post(p_hwfn, p_ent, NULL);
+	if (rc)
+		return rc;
+
+	bmap = &p_hwfn->p_rdma_info->srq_map;
+
+	spin_lock_bh(&p_hwfn->p_rdma_info->lock);
+	qed_bmap_release_id(p_hwfn, bmap, in_params->srq_id);
+	spin_unlock_bh(&p_hwfn->p_rdma_info->lock);
+
+	DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "SRQ destroyed Id = %x",
+		   in_params->srq_id);
+
+	return rc;
+}
+
+static int
+qed_rdma_create_srq(void *rdma_cxt,
+		    struct qed_rdma_create_srq_in_params *in_params,
+		    struct qed_rdma_create_srq_out_params *out_params)
+{
+	struct rdma_srq_create_ramrod_data *p_ramrod;
+	struct qed_sp_init_data init_data = {};
+	struct qed_hwfn *p_hwfn = rdma_cxt;
+	enum qed_cxt_elem_type elem_type;
+	struct qed_spq_entry *p_ent;
+	u16 opaque_fid, srq_id;
+	struct qed_bmap *bmap;
+	u32 returned_id;
+	int rc;
+
+	bmap = &p_hwfn->p_rdma_info->srq_map;
+	spin_lock_bh(&p_hwfn->p_rdma_info->lock);
+	rc = qed_rdma_bmap_alloc_id(p_hwfn, bmap, &returned_id);
+	spin_unlock_bh(&p_hwfn->p_rdma_info->lock);
+
+	if (rc) {
+		DP_NOTICE(p_hwfn, "failed to allocate srq id\n");
+		return rc;
+	}
+
+	elem_type = QED_ELEM_SRQ;
+	rc = qed_cxt_dynamic_ilt_alloc(p_hwfn, elem_type, returned_id);
+	if (rc)
+		goto err;
+	/* returned id is no greater than u16 */
+	srq_id = (u16)returned_id;
+	opaque_fid = p_hwfn->hw_info.opaque_fid;
+
+	opaque_fid = p_hwfn->hw_info.opaque_fid;
+	init_data.opaque_fid = opaque_fid;
+	init_data.comp_mode = QED_SPQ_MODE_EBLOCK;
+
+	rc = qed_sp_init_request(p_hwfn, &p_ent,
+				 RDMA_RAMROD_CREATE_SRQ,
+				 p_hwfn->p_rdma_info->proto, &init_data);
+	if (rc)
+		goto err;
+
+	p_ramrod = &p_ent->ramrod.rdma_create_srq;
+	DMA_REGPAIR_LE(p_ramrod->pbl_base_addr, in_params->pbl_base_addr);
+	p_ramrod->pages_in_srq_pbl = cpu_to_le16(in_params->num_pages);
+	p_ramrod->pd_id = cpu_to_le16(in_params->pd_id);
+	p_ramrod->srq_id.srq_idx = cpu_to_le16(srq_id);
+	p_ramrod->srq_id.opaque_fid = cpu_to_le16(opaque_fid);
+	p_ramrod->page_size = cpu_to_le16(in_params->page_size);
+	DMA_REGPAIR_LE(p_ramrod->producers_addr, in_params->prod_pair_addr);
+
+	rc = qed_spq_post(p_hwfn, p_ent, NULL);
+	if (rc)
+		goto err;
+
+	out_params->srq_id = srq_id;
+
+	DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
+		   "SRQ created Id = %x\n", out_params->srq_id);
+
+	return rc;
+
+err:
+	spin_lock_bh(&p_hwfn->p_rdma_info->lock);
+	qed_bmap_release_id(p_hwfn, bmap, returned_id);
+	spin_unlock_bh(&p_hwfn->p_rdma_info->lock);
+
+	return rc;
+}
+
 bool qed_rdma_allocated_qps(struct qed_hwfn *p_hwfn)
 {
 	bool result;
@@ -1773,6 +1946,9 @@ static const struct qed_rdma_ops qed_rdma_ops_pass = {
 	.rdma_free_tid = &qed_rdma_free_tid,
 	.rdma_register_tid = &qed_rdma_register_tid,
 	.rdma_deregister_tid = &qed_rdma_deregister_tid,
+	.rdma_create_srq = &qed_rdma_create_srq,
+	.rdma_modify_srq = &qed_rdma_modify_srq,
+	.rdma_destroy_srq = &qed_rdma_destroy_srq,
 	.ll2_acquire_connection = &qed_ll2_acquire_connection,
 	.ll2_establish_connection = &qed_ll2_establish_connection,
 	.ll2_terminate_connection = &qed_ll2_terminate_connection,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.h b/drivers/net/ethernet/qlogic/qed/qed_rdma.h
index 18ec9cbd84f5..6f722ee8ee94 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_rdma.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.h
@@ -96,6 +96,8 @@ struct qed_rdma_info {
 	u8 num_cnqs;
 	u32 num_qps;
 	u32 num_mrs;
+	u32 num_srqs;
+	u16 srq_id_offset;
 	u16 queue_zone_base;
 	u16 max_queue_zones;
 	enum protocol_type proto;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.c b/drivers/net/ethernet/qlogic/qed/qed_roce.c
index 6acfd43c1a4f..ee57fcdc698d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_roce.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_roce.c
@@ -65,6 +65,8 @@ qed_roce_async_event(struct qed_hwfn *p_hwfn,
 		     u8 fw_event_code,
 		     u16 echo, union event_ring_data *data, u8 fw_return_code)
 {
+	struct qed_rdma_events events = p_hwfn->p_rdma_info->events;
+
 	if (fw_event_code == ROCE_ASYNC_EVENT_DESTROY_QP_DONE) {
 		u16 icid =
 		    (u16)le32_to_cpu(data->rdma_data.rdma_destroy_qp_data.cid);
@@ -75,11 +77,18 @@ qed_roce_async_event(struct qed_hwfn *p_hwfn,
 		 */
 		qed_roce_free_real_icid(p_hwfn, icid);
 	} else {
-		struct qed_rdma_events *events = &p_hwfn->p_rdma_info->events;
+		if (fw_event_code == ROCE_ASYNC_EVENT_SRQ_EMPTY ||
+		    fw_event_code == ROCE_ASYNC_EVENT_SRQ_LIMIT) {
+			u16 srq_id = (u16)data->rdma_data.async_handle.lo;
+
+			events.affiliated_event(events.context, fw_event_code,
+						&srq_id);
+		} else {
+			union rdma_eqe_data rdata = data->rdma_data;
 
-		events->affiliated_event(p_hwfn->p_rdma_info->events.context,
-					 fw_event_code,
-				     (void *)&data->rdma_data.async_handle);
+			events.affiliated_event(events.context, fw_event_code,
+						(void *)&rdata.async_handle);
+		}
 	}
 
 	return 0;
diff --git a/include/linux/qed/qed_rdma_if.h b/include/linux/qed/qed_rdma_if.h
index 4dd72ba210f5..e05e320d28b7 100644
--- a/include/linux/qed/qed_rdma_if.h
+++ b/include/linux/qed/qed_rdma_if.h
@@ -485,7 +485,9 @@ enum qed_iwarp_event_type {
 	QED_IWARP_EVENT_ACTIVE_MPA_REPLY,
 	QED_IWARP_EVENT_LOCAL_ACCESS_ERROR,
 	QED_IWARP_EVENT_REMOTE_OPERATION_ERROR,
-	QED_IWARP_EVENT_TERMINATE_RECEIVED
+	QED_IWARP_EVENT_TERMINATE_RECEIVED,
+	QED_IWARP_EVENT_SRQ_LIMIT,
+	QED_IWARP_EVENT_SRQ_EMPTY,
 };
 
 enum qed_tcp_ip_version {
@@ -646,6 +648,14 @@ struct qed_rdma_ops {
 	int (*rdma_alloc_tid)(void *rdma_cxt, u32 *itid);
 	void (*rdma_free_tid)(void *rdma_cxt, u32 itid);
 
+	int (*rdma_create_srq)(void *rdma_cxt,
+			       struct qed_rdma_create_srq_in_params *iparams,
+			       struct qed_rdma_create_srq_out_params *oparams);
+	int (*rdma_destroy_srq)(void *rdma_cxt,
+				struct qed_rdma_destroy_srq_in_params *iparams);
+	int (*rdma_modify_srq)(void *rdma_cxt,
+			       struct qed_rdma_modify_srq_in_params *iparams);
+
 	int (*ll2_acquire_connection)(void *rdma_cxt,
 				      struct qed_ll2_acquire_data *data);
 
-- 
cgit v1.2.3-59-g8ed1b


From 79e9fed460385a3d8ba0b5782e9e74405cb199b1 Mon Sep 17 00:00:00 2001
From: Maciej Żenczykowski <maze@google.com>
Date: Sun, 3 Jun 2018 10:41:17 -0700
Subject: net-tcp: extend tcp_tw_reuse sysctl to enable loopback only
 optimization
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This changes the /proc/sys/net/ipv4/tcp_tw_reuse from a boolean
to an integer.

It now takes the values 0, 1 and 2, where 0 and 1 behave as before,
while 2 enables timewait socket reuse only for sockets that we can
prove are loopback connections:
  ie. bound to 'lo' interface or where one of source or destination
  IPs is 127.0.0.0/8, ::ffff:127.0.0.0/104 or ::1.

This enables quicker reuse of ephemeral ports for loopback connections
- where tcp_tw_reuse is 100% safe from a protocol perspective
(this assumes no artificially induced packet loss on 'lo').

This also makes estblishing many loopback connections *much* faster
(allocating ports out of the first half of the ephemeral port range
is significantly faster, then allocating from the second half)

Without this change in a 32K ephemeral port space my sample program
(it just establishes and closes [::1]:ephemeral -> [::1]:server_port
connections in a tight loop) fails after 32765 connections in 24 seconds.
With it enabled 50000 connections only take 4.7 seconds.

This is particularly problematic for IPv6 where we only have one local
address and cannot play tricks with varying source IP from 127.0.0.0/8
pool.

Signed-off-by: Maciej Żenczykowski <maze@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Wei Wang <weiwan@google.com>
Change-Id: I0377961749979d0301b7b62871a32a4b34b654e1
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 10 +++++++---
 net/ipv4/sysctl_net_ipv4.c             |  5 ++++-
 net/ipv4/tcp_ipv4.c                    | 35 +++++++++++++++++++++++++++++++---
 3 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 924bd51327b7..6841c74eac00 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -667,11 +667,15 @@ tcp_tso_win_divisor - INTEGER
 	building larger TSO frames.
 	Default: 3
 
-tcp_tw_reuse - BOOLEAN
-	Allow to reuse TIME-WAIT sockets for new connections when it is
-	safe from protocol viewpoint. Default value is 0.
+tcp_tw_reuse - INTEGER
+	Enable reuse of TIME-WAIT sockets for new connections when it is
+	safe from protocol viewpoint.
+	0 - disable
+	1 - global enable
+	2 - enable for loopback traffic only
 	It should not be changed without advice/request of technical
 	experts.
+	Default: 2
 
 tcp_window_scaling - BOOLEAN
 	Enable window scaling as defined in RFC1323.
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d2eed3ddcb0a..d06247ba08b2 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -30,6 +30,7 @@
 
 static int zero;
 static int one = 1;
+static int two = 2;
 static int four = 4;
 static int thousand = 1000;
 static int gso_max_segs = GSO_MAX_SEGS;
@@ -845,7 +846,9 @@ static struct ctl_table ipv4_net_table[] = {
 		.data		= &init_net.ipv4.sysctl_tcp_tw_reuse,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &two,
 	},
 	{
 		.procname	= "tcp_max_tw_buckets",
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 749b0ef9f405..633963e228bc 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -110,8 +110,38 @@ static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
 
 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 {
+	const struct inet_timewait_sock *tw = inet_twsk(sktw);
 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 	struct tcp_sock *tp = tcp_sk(sk);
+	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
+
+	if (reuse == 2) {
+		/* Still does not detect *everything* that goes through
+		 * lo, since we require a loopback src or dst address
+		 * or direct binding to 'lo' interface.
+		 */
+		bool loopback = false;
+		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
+			loopback = true;
+#if IS_ENABLED(CONFIG_IPV6)
+		if (tw->tw_family == AF_INET6) {
+			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
+			    (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
+			     (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
+			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
+			    (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
+			     (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
+				loopback = true;
+		} else
+#endif
+		{
+			if (ipv4_is_loopback(tw->tw_daddr) ||
+			    ipv4_is_loopback(tw->tw_rcv_saddr))
+				loopback = true;
+		}
+		if (!loopback)
+			reuse = 0;
+	}
 
 	/* With PAWS, it is safe from the viewpoint
 	   of data integrity. Even without PAWS it is safe provided sequence
@@ -125,8 +155,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 	   and use initial timestamp retrieved from peer table.
 	 */
 	if (tcptw->tw_ts_recent_stamp &&
-	    (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
-			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
+	    (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 		if (tp->write_seq == 0)
 			tp->write_seq = 1;
@@ -2529,7 +2558,7 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_orphan_retries = 0;
 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
-	net->ipv4.sysctl_tcp_tw_reuse = 0;
+	net->ipv4.sysctl_tcp_tw_reuse = 2;
 
 	cnt = tcp_hashinfo.ehash_mask + 1;
 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
-- 
cgit v1.2.3-59-g8ed1b


From f396922d862aa05b53ad740596652691a723ee23 Mon Sep 17 00:00:00 2001
From: Maciej Żenczykowski <maze@google.com>
Date: Sun, 3 Jun 2018 10:47:05 -0700
Subject: net: do not allow changing SO_REUSEADDR/SO_REUSEPORT on bound sockets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It is not safe to do so because such sockets are already in the
hash tables and changing these options can result in invalidating
the tb->fastreuse(port) caching.

This can have later far reaching consequences wrt. bind conflict checks
which rely on these caches (for optimization purposes).

Not to mention that you can currently end up with two identical
non-reuseport listening sockets bound to the same local ip:port
by clearing reuseport on them after they've already both been bound.

There is unfortunately no EISBOUND error or anything similar,
and EISCONN seems to be misleading for a bound-but-not-connected
socket, so use EUCLEAN 'Structure needs cleaning' which AFAICT
is the closest you can get to meaning 'socket in bad state'.
(although perhaps EINVAL wouldn't be a bad choice either?)

This does unfortunately run the risk of breaking buggy
userspace programs...

Signed-off-by: Maciej Żenczykowski <maze@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Change-Id: I77c2b3429b2fdf42671eee0fa7a8ba721c94963b
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index 435a0ba85e52..feca4c98f8a0 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -728,9 +728,22 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 			sock_valbool_flag(sk, SOCK_DBG, valbool);
 		break;
 	case SO_REUSEADDR:
-		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
+		val = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
+		if ((sk->sk_family == PF_INET || sk->sk_family == PF_INET6) &&
+		    inet_sk(sk)->inet_num &&
+		    (sk->sk_reuse != val)) {
+			ret = (sk->sk_state == TCP_ESTABLISHED) ? -EISCONN : -EUCLEAN;
+			break;
+		}
+		sk->sk_reuse = val;
 		break;
 	case SO_REUSEPORT:
+		if ((sk->sk_family == PF_INET || sk->sk_family == PF_INET6) &&
+		    inet_sk(sk)->inet_num &&
+		    (sk->sk_reuseport != valbool)) {
+			ret = (sk->sk_state == TCP_ESTABLISHED) ? -EISCONN : -EUCLEAN;
+			break;
+		}
 		sk->sk_reuseport = valbool;
 		break;
 	case SO_TYPE:
-- 
cgit v1.2.3-59-g8ed1b


From bb38ccce887c29a3ca78b8bd105c168c811766d9 Mon Sep 17 00:00:00 2001
From: Olivier Gayot <olivier.gayot@sigexec.com>
Date: Mon, 4 Jun 2018 12:07:37 +0200
Subject: docs: networking: fix minor typos in various documentation files

This patch fixes some typos/misspelling errors in the
Documentation/networking files.

Signed-off-by: Olivier Gayot <olivier.gayot@sigexec.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/6lowpan.txt             |  4 ++--
 Documentation/networking/gtp.txt                 |  4 ++--
 Documentation/networking/ila.txt                 |  2 +-
 Documentation/networking/ip-sysctl.txt           |  2 +-
 Documentation/networking/ipsec.txt               |  4 ++--
 Documentation/networking/ipvlan.txt              |  4 ++--
 Documentation/networking/kcm.txt                 | 10 +++++-----
 Documentation/networking/nf_conntrack-sysctl.txt |  2 +-
 8 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/Documentation/networking/6lowpan.txt b/Documentation/networking/6lowpan.txt
index a7dc7e939c7a..2e5a939d7e6f 100644
--- a/Documentation/networking/6lowpan.txt
+++ b/Documentation/networking/6lowpan.txt
@@ -24,10 +24,10 @@ enum lowpan_lltypes.
 
 Example to evaluate the private usually you can do:
 
-static inline sturct lowpan_priv_foobar *
+static inline struct lowpan_priv_foobar *
 lowpan_foobar_priv(struct net_device *dev)
 {
-	return (sturct lowpan_priv_foobar *)lowpan_priv(dev)->priv;
+	return (struct lowpan_priv_foobar *)lowpan_priv(dev)->priv;
 }
 
 switch (dev->type) {
diff --git a/Documentation/networking/gtp.txt b/Documentation/networking/gtp.txt
index 0d9c18f05ec6..6966bbec1ecb 100644
--- a/Documentation/networking/gtp.txt
+++ b/Documentation/networking/gtp.txt
@@ -67,7 +67,7 @@ Don't be confused by terminology:  The GTP User Plane goes through
 kernel accelerated path, while the GTP Control Plane goes to
 Userspace :)
 
-The official homepge of the module is at
+The official homepage of the module is at
 https://osmocom.org/projects/linux-kernel-gtp-u/wiki
 
 == Userspace Programs with Linux Kernel GTP-U support ==
@@ -120,7 +120,7 @@ If yo have questions regarding how to use the Kernel GTP module from
 your own software, or want to contribute to the code, please use the
 osmocom-net-grps mailing list for related discussion. The list can be
 reached at osmocom-net-gprs@lists.osmocom.org and the mailman
-interface for managign your subscription is at
+interface for managing your subscription is at
 https://lists.osmocom.org/mailman/listinfo/osmocom-net-gprs
 
 == Issue Tracker ==
diff --git a/Documentation/networking/ila.txt b/Documentation/networking/ila.txt
index 78df879abd26..a17dac9dc915 100644
--- a/Documentation/networking/ila.txt
+++ b/Documentation/networking/ila.txt
@@ -121,7 +121,7 @@ three options to deal with this:
 
 - checksum neutral mapping
 		When an address is translated the difference can be offset
-		elsewhere in a part of the packet that is covered by the
+		elsewhere in a part of the packet that is covered by
 		the checksum. The low order sixteen bits of the identifier
 		are used. This method is preferred since it doesn't require
 		parsing a packet beyond the IP header and in most cases the
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 6841c74eac00..ce8fbf5aa63c 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -26,7 +26,7 @@ ip_no_pmtu_disc - INTEGER
 	discarded. Outgoing frames are handled the same as in mode 1,
 	implicitly setting IP_PMTUDISC_DONT on every created socket.
 
-	Mode 3 is a hardend pmtu discover mode. The kernel will only
+	Mode 3 is a hardened pmtu discover mode. The kernel will only
 	accept fragmentation-needed errors if the underlying protocol
 	can verify them besides a plain socket lookup. Current
 	protocols for which pmtu events will be honored are TCP, SCTP
diff --git a/Documentation/networking/ipsec.txt b/Documentation/networking/ipsec.txt
index 8dbc08b7e431..ba794b7e51be 100644
--- a/Documentation/networking/ipsec.txt
+++ b/Documentation/networking/ipsec.txt
@@ -25,8 +25,8 @@ Quote from RFC3173:
    is implementation dependent.
 
 Current IPComp implementation is indeed by the book, while as in practice
-when sending non-compressed packet to the peer(whether or not packet len
-is smaller than the threshold or the compressed len is large than original
+when sending non-compressed packet to the peer (whether or not packet len
+is smaller than the threshold or the compressed len is larger than original
 packet len), the packet is dropped when checking the policy as this packet
 matches the selector but not coming from any XFRM layer, i.e., with no
 security path. Such naked packet will not eventually make it to upper layer.
diff --git a/Documentation/networking/ipvlan.txt b/Documentation/networking/ipvlan.txt
index 812ef003e0a8..27a38e50c287 100644
--- a/Documentation/networking/ipvlan.txt
+++ b/Documentation/networking/ipvlan.txt
@@ -73,11 +73,11 @@ mode to make conn-tracking work.
 	This is the default option. To configure the IPvlan port in this mode,
 user can choose to either add this option on the command-line or don't specify
 anything. This is the traditional mode where slaves can cross-talk among
-themseleves apart from talking through the master device.
+themselves apart from talking through the master device.
 
 5.2 private:
 	If this option is added to the command-line, the port is set in private
-mode. i.e. port wont allow cross communication between slaves.
+mode. i.e. port won't allow cross communication between slaves.
 
 5.3 vepa:
 	If this is added to the command-line, the port is set in VEPA mode.
diff --git a/Documentation/networking/kcm.txt b/Documentation/networking/kcm.txt
index 9a513295b07c..b773a5278ac4 100644
--- a/Documentation/networking/kcm.txt
+++ b/Documentation/networking/kcm.txt
@@ -1,4 +1,4 @@
-Kernel Connection Mulitplexor
+Kernel Connection Multiplexor
 -----------------------------
 
 Kernel Connection Multiplexor (KCM) is a mechanism that provides a message based
@@ -31,7 +31,7 @@ KCM implements an NxM multiplexor in the kernel as diagrammed below:
 KCM sockets
 -----------
 
-The KCM sockets provide the user interface to the muliplexor. All the KCM sockets
+The KCM sockets provide the user interface to the multiplexor. All the KCM sockets
 bound to a multiplexor are considered to have equivalent function, and I/O
 operations in different sockets may be done in parallel without the need for
 synchronization between threads in userspace.
@@ -199,7 +199,7 @@ while. Example use:
 BFP programs for message delineation
 ------------------------------------
 
-BPF programs can be compiled using the BPF LLVM backend. For exmple,
+BPF programs can be compiled using the BPF LLVM backend. For example,
 the BPF program for parsing Thrift is:
 
   #include "bpf.h" /* for __sk_buff */
@@ -222,7 +222,7 @@ messages. The kernel provides necessary assurances that messages are sent
 and received atomically. This relieves much of the burden applications have
 in mapping a message based protocol onto the TCP stream. KCM also make
 application layer messages a unit of work in the kernel for the purposes of
-steerng and scheduling, which in turn allows a simpler networking model in
+steering and scheduling, which in turn allows a simpler networking model in
 multithreaded applications.
 
 Configurations
@@ -272,7 +272,7 @@ on the socket thus waking up the application thread. When the application
 sees the error (which may just be a disconnect) it should unattach the
 socket from KCM and then close it. It is assumed that once an error is
 posted on the TCP socket the data stream is unrecoverable (i.e. an error
-may have occurred in the middle of receiving a messssge).
+may have occurred in the middle of receiving a message).
 
 TCP connection monitoring
 -------------------------
diff --git a/Documentation/networking/nf_conntrack-sysctl.txt b/Documentation/networking/nf_conntrack-sysctl.txt
index 433b6724797a..1669dc2419fd 100644
--- a/Documentation/networking/nf_conntrack-sysctl.txt
+++ b/Documentation/networking/nf_conntrack-sysctl.txt
@@ -156,7 +156,7 @@ nf_conntrack_timestamp - BOOLEAN
 nf_conntrack_udp_timeout - INTEGER (seconds)
 	default 30
 
-nf_conntrack_udp_timeout_stream2 - INTEGER (seconds)
+nf_conntrack_udp_timeout_stream - INTEGER (seconds)
 	default 180
 
 	This extended timeout will be used in case there is an UDP stream
-- 
cgit v1.2.3-59-g8ed1b


From 1f55c2865c21a49d882261cfa8d85635814ad8ab Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Mon, 4 Jun 2018 21:07:59 +0800
Subject: wan/fsl_ucc_hdlc: use dma_zalloc_coherent instead of allocator/memset

Use dma_zalloc_coherent instead of dma_alloc_coherent
followed by memset 0.

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/wan/fsl_ucc_hdlc.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c
index 33df76405b86..4205dfd19da3 100644
--- a/drivers/net/wan/fsl_ucc_hdlc.c
+++ b/drivers/net/wan/fsl_ucc_hdlc.c
@@ -270,10 +270,10 @@ static int uhdlc_init(struct ucc_hdlc_private *priv)
 	iowrite16be(DEFAULT_HDLC_ADDR, &priv->ucc_pram->haddr4);
 
 	/* Get BD buffer */
-	bd_buffer = dma_alloc_coherent(priv->dev,
-				       (RX_BD_RING_LEN + TX_BD_RING_LEN) *
-				       MAX_RX_BUF_LENGTH,
-				       &bd_dma_addr, GFP_KERNEL);
+	bd_buffer = dma_zalloc_coherent(priv->dev,
+					(RX_BD_RING_LEN + TX_BD_RING_LEN) *
+					MAX_RX_BUF_LENGTH,
+					&bd_dma_addr, GFP_KERNEL);
 
 	if (!bd_buffer) {
 		dev_err(priv->dev, "Could not allocate buffer descriptors\n");
@@ -281,9 +281,6 @@ static int uhdlc_init(struct ucc_hdlc_private *priv)
 		goto free_tiptr;
 	}
 
-	memset(bd_buffer, 0, (RX_BD_RING_LEN + TX_BD_RING_LEN)
-			* MAX_RX_BUF_LENGTH);
-
 	priv->rx_buffer = bd_buffer;
 	priv->tx_buffer = bd_buffer + RX_BD_RING_LEN * MAX_RX_BUF_LENGTH;
 
-- 
cgit v1.2.3-59-g8ed1b


From ff2e351e1928b5b81a23d78e3e4effc24db007b9 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Mon, 4 Jun 2018 21:10:31 +0800
Subject: qed: use dma_zalloc_coherent instead of allocator/memset

Use dma_zalloc_coherent instead of dma_alloc_coherent
followed by memset 0.

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Acked-by: Tomer Tayar <Tomer.Tayar@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_cxt.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.c b/drivers/net/ethernet/qlogic/qed/qed_cxt.c
index 7ed6aa0521e1..b5b5ff725426 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_cxt.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.c
@@ -937,14 +937,13 @@ static int qed_cxt_src_t2_alloc(struct qed_hwfn *p_hwfn)
 		u32 size = min_t(u32, total_size, psz);
 		void **p_virt = &p_mngr->t2[i].p_virt;
 
-		*p_virt = dma_alloc_coherent(&p_hwfn->cdev->pdev->dev,
-					     size,
-					     &p_mngr->t2[i].p_phys, GFP_KERNEL);
+		*p_virt = dma_zalloc_coherent(&p_hwfn->cdev->pdev->dev,
+					      size, &p_mngr->t2[i].p_phys,
+					      GFP_KERNEL);
 		if (!p_mngr->t2[i].p_virt) {
 			rc = -ENOMEM;
 			goto t2_fail;
 		}
-		memset(*p_virt, 0, size);
 		p_mngr->t2[i].size = size;
 		total_size -= size;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From a746407af158e0c9a5f1078a3b3de11b0c77cf03 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Mon, 4 Jun 2018 17:43:21 +0300
Subject: net_failover: Use netdev_features_t instead of u32

The features mask needs to be a netdev_features_t (u64) because a u32
is not big enough.

Fixes: cfc80d9a1163 ("net: Introduce net_failover driver")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/net_failover.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c
index 8b508e2cf29b..83f7420ddea5 100644
--- a/drivers/net/net_failover.c
+++ b/drivers/net/net_failover.c
@@ -380,7 +380,8 @@ static rx_handler_result_t net_failover_handle_frame(struct sk_buff **pskb)
 
 static void net_failover_compute_features(struct net_device *dev)
 {
-	u32 vlan_features = FAILOVER_VLAN_FEATURES & NETIF_F_ALL_FOR_ALL;
+	netdev_features_t vlan_features = FAILOVER_VLAN_FEATURES &
+					  NETIF_F_ALL_FOR_ALL;
 	netdev_features_t enc_features  = FAILOVER_ENC_FEATURES;
 	unsigned short max_hard_header_len = ETH_HLEN;
 	unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE |
-- 
cgit v1.2.3-59-g8ed1b


From 25ea66544bfd1d9df1b7e1502f8717e85fa1e6e6 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Mon, 4 Jun 2018 17:46:01 +0300
Subject: team: use netdev_features_t instead of u32

This code was introduced in 2011 around the same time that we made
netdev_features_t a u64 type.  These days a u32 is not big enough to
hold all the potential features.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/team/team.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 267dcc929f6c..8863fa023500 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1004,7 +1004,8 @@ static void team_port_disable(struct team *team,
 static void __team_compute_features(struct team *team)
 {
 	struct team_port *port;
-	u32 vlan_features = TEAM_VLAN_FEATURES & NETIF_F_ALL_FOR_ALL;
+	netdev_features_t vlan_features = TEAM_VLAN_FEATURES &
+					  NETIF_F_ALL_FOR_ALL;
 	netdev_features_t enc_features  = TEAM_ENC_FEATURES;
 	unsigned short max_hard_header_len = ETH_HLEN;
 	unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE |
-- 
cgit v1.2.3-59-g8ed1b


From 0e3990356d2518691e17c1ecbf7868833b6e6704 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@mellanox.com>
Date: Mon, 4 Jun 2018 18:32:23 +0300
Subject: net: sched: return error code when tcf proto is not found

If requested tcf proto is not found, get and del filter netlink protocol
handlers output error message to extack, but do not return actual error
code. Add check to return ENOENT when result of tp find function is NULL
pointer.

Fixes: c431f89b18a2 ("net: sched: split tc_ctl_tfilter into three handlers")
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index c06585fb2dc6..cdc3c87c53e6 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1274,7 +1274,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
 			       prio, false);
 	if (!tp || IS_ERR(tp)) {
 		NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found");
-		err = PTR_ERR(tp);
+		err = tp ? PTR_ERR(tp) : -ENOENT;
 		goto errout;
 	} else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) {
 		NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one");
@@ -1374,7 +1374,7 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
 			       prio, false);
 	if (!tp || IS_ERR(tp)) {
 		NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found");
-		err = PTR_ERR(tp);
+		err = tp ? PTR_ERR(tp) : -ENOENT;
 		goto errout;
 	} else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) {
 		NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one");
-- 
cgit v1.2.3-59-g8ed1b


From f0b964e5e4bb6b0c152f8064bb69a0f3d32a5096 Mon Sep 17 00:00:00 2001
From: Xi Wang <wangxi11@huawei.com>
Date: Mon, 4 Jun 2018 17:50:09 +0100
Subject: net: hns: Fix the process of adding broadcast addresses to tcam

If the multicast mask value in device tree is configured not all
0xff, the broadcast mac will be lost from tcam table after the
execution of command 'ifconfig up'. The address is appended by
hns_ae_start, but will be clear later by hns_nic_set_rx_mode
called in dev_open process.

This patch fixed it by not use the multicast mask when add a
broadcast address.

Fixes: b5996f11ea54 ("net: add Hisilicon Network Subsystem basic ethernet support")
Signed-off-by: Xi Wang <wangxi11@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c | 23 +++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c
index e0bc79ea3d88..85e1d14514fc 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c
@@ -1648,6 +1648,15 @@ int hns_dsaf_rm_mac_addr(
 				      mac_entry->addr);
 }
 
+static void hns_dsaf_setup_mc_mask(struct dsaf_device *dsaf_dev,
+				   u8 port_num, u8 *mask, u8 *addr)
+{
+	if (MAC_IS_BROADCAST(addr))
+		memset(mask, 0xff, ETH_ALEN);
+	else
+		memcpy(mask, dsaf_dev->mac_cb[port_num]->mc_mask, ETH_ALEN);
+}
+
 static void hns_dsaf_mc_mask_bit_clear(char *dst, const char *src)
 {
 	u16 *a = (u16 *)dst;
@@ -1676,7 +1685,6 @@ int hns_dsaf_add_mac_mc_port(struct dsaf_device *dsaf_dev,
 	struct dsaf_drv_tbl_tcam_key tmp_mac_key;
 	struct dsaf_tbl_tcam_data tcam_data;
 	u8 mc_addr[ETH_ALEN];
-	u8 *mc_mask;
 	int mskid;
 
 	/*chechk mac addr */
@@ -1687,9 +1695,12 @@ int hns_dsaf_add_mac_mc_port(struct dsaf_device *dsaf_dev,
 	}
 
 	ether_addr_copy(mc_addr, mac_entry->addr);
-	mc_mask = dsaf_dev->mac_cb[mac_entry->in_port_num]->mc_mask;
 	if (!AE_IS_VER1(dsaf_dev->dsaf_ver)) {
+		u8 mc_mask[ETH_ALEN];
+
 		/* prepare for key data setting */
+		hns_dsaf_setup_mc_mask(dsaf_dev, mac_entry->in_port_num,
+				       mc_mask, mac_entry->addr);
 		hns_dsaf_mc_mask_bit_clear(mc_addr, mc_mask);
 
 		/* config key mask */
@@ -1844,7 +1855,6 @@ int hns_dsaf_del_mac_mc_port(struct dsaf_device *dsaf_dev,
 	struct dsaf_drv_tbl_tcam_key mask_key, tmp_mac_key;
 	struct dsaf_tbl_tcam_data *pmask_key = NULL;
 	u8 mc_addr[ETH_ALEN];
-	u8 *mc_mask;
 
 	if (!(void *)mac_entry) {
 		dev_err(dsaf_dev->dev,
@@ -1861,14 +1871,17 @@ int hns_dsaf_del_mac_mc_port(struct dsaf_device *dsaf_dev,
 
 	/* always mask vlan_id field */
 	ether_addr_copy(mc_addr, mac_entry->addr);
-	mc_mask = dsaf_dev->mac_cb[mac_entry->in_port_num]->mc_mask;
 
 	if (!AE_IS_VER1(dsaf_dev->dsaf_ver)) {
+		u8 mc_mask[ETH_ALEN];
+
 		/* prepare for key data setting */
+		hns_dsaf_setup_mc_mask(dsaf_dev, mac_entry->in_port_num,
+				       mc_mask, mac_entry->addr);
 		hns_dsaf_mc_mask_bit_clear(mc_addr, mc_mask);
 
 		/* config key mask */
-		hns_dsaf_set_mac_key(dsaf_dev, &mask_key, 0x00, 0xff, mc_addr);
+		hns_dsaf_set_mac_key(dsaf_dev, &mask_key, 0x00, 0xff, mc_mask);
 
 		mask_key.high.val = le32_to_cpu(mask_key.high.val);
 		mask_key.low.val = le32_to_cpu(mask_key.low.val);
-- 
cgit v1.2.3-59-g8ed1b


From 2b589a7e2bd3eb610a4b7f5e61393481755a4de9 Mon Sep 17 00:00:00 2001
From: Wang YanQing <udknight@gmail.com>
Date: Fri, 11 May 2018 11:06:34 +0800
Subject: bpf, arm32: correct check_imm24

imm24 is signed, so the right range is:

  [-(1<<(24 - 1)), (1<<(24 - 1)) - 1]

Note: this patch also fix a typo.

Fixes: 39c13c204bb1 ("arm: eBPF JIT compiler")
Signed-off-by: Wang YanQing <udknight@gmail.com>
Cc: Shubham Bansal <illusionist.neo@gmail.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux@armlinux.org.uk
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 arch/arm/net/bpf_jit_32.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index d3ea6454e775..0d542007b49d 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -84,7 +84,7 @@
  *
  * 1. First argument is passed using the arm 32bit registers and rest of the
  * arguments are passed on stack scratch space.
- * 2. First callee-saved arugument is mapped to arm 32 bit registers and rest
+ * 2. First callee-saved argument is mapped to arm 32 bit registers and rest
  * arguments are mapped to scratch space on stack.
  * 3. We need two 64 bit temp registers to do complex operations on eBPF
  * registers.
@@ -1192,8 +1192,8 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
 	s32 jmp_offset;
 
 #define check_imm(bits, imm) do {				\
-	if ((((imm) > 0) && ((imm) >> (bits))) ||		\
-	    (((imm) < 0) && (~(imm) >> (bits)))) {		\
+	if ((imm) >= (1 << ((bits) - 1)) ||			\
+	    (imm) < -(1 << ((bits) - 1))) {			\
 		pr_info("[%2d] imm=%d(0x%x) out of range\n",	\
 			i, imm, imm);				\
 		return -EINVAL;					\
-- 
cgit v1.2.3-59-g8ed1b


From 68565a1af9f7012e6f2fe2bdd612f67d2d830c28 Mon Sep 17 00:00:00 2001
From: Wang YanQing <udknight@gmail.com>
Date: Fri, 11 May 2018 10:52:17 +0800
Subject: bpf, arm32: fix inconsistent naming about emit_a32_lsr_{r64,i64}

The names for BPF_ALU64 | BPF_ARSH are emit_a32_arsh_*,
the names for BPF_ALU64 | BPF_LSH are emit_a32_lsh_*, but
the names for BPF_ALU64 | BPF_RSH are emit_a32_lsr_*.

For consistence reason, let's rename emit_a32_lsr_* to
emit_a32_rsh_*.

This patch also corrects a wrong comment.

Fixes: 39c13c204bb1 ("arm: eBPF JIT compiler")
Signed-off-by: Wang YanQing <udknight@gmail.com>
Cc: Shubham Bansal <illusionist.neo@gmail.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux@armlinux.org.uk
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 arch/arm/net/bpf_jit_32.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index 0d542007b49d..6e8b71613039 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -701,7 +701,7 @@ static inline void emit_a32_arsh_r64(const u8 dst[], const u8 src[], bool dstk,
 }
 
 /* dst = dst >> src */
-static inline void emit_a32_lsr_r64(const u8 dst[], const u8 src[], bool dstk,
+static inline void emit_a32_rsh_r64(const u8 dst[], const u8 src[], bool dstk,
 				     bool sstk, struct jit_ctx *ctx) {
 	const u8 *tmp = bpf2a32[TMP_REG_1];
 	const u8 *tmp2 = bpf2a32[TMP_REG_2];
@@ -717,7 +717,7 @@ static inline void emit_a32_lsr_r64(const u8 dst[], const u8 src[], bool dstk,
 		emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx);
 	}
 
-	/* Do LSH operation */
+	/* Do RSH operation */
 	emit(ARM_RSB_I(ARM_IP, rt, 32), ctx);
 	emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx);
 	emit(ARM_MOV_SR(ARM_LR, rd, SRTYPE_LSR, rt), ctx);
@@ -767,7 +767,7 @@ static inline void emit_a32_lsh_i64(const u8 dst[], bool dstk,
 }
 
 /* dst = dst >> val */
-static inline void emit_a32_lsr_i64(const u8 dst[], bool dstk,
+static inline void emit_a32_rsh_i64(const u8 dst[], bool dstk,
 				    const u32 val, struct jit_ctx *ctx) {
 	const u8 *tmp = bpf2a32[TMP_REG_1];
 	const u8 *tmp2 = bpf2a32[TMP_REG_2];
@@ -1323,7 +1323,7 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
 	case BPF_ALU64 | BPF_RSH | BPF_K:
 		if (unlikely(imm > 63))
 			return -EINVAL;
-		emit_a32_lsr_i64(dst, dstk, imm, ctx);
+		emit_a32_rsh_i64(dst, dstk, imm, ctx);
 		break;
 	/* dst = dst << src */
 	case BPF_ALU64 | BPF_LSH | BPF_X:
@@ -1331,7 +1331,7 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
 		break;
 	/* dst = dst >> src */
 	case BPF_ALU64 | BPF_RSH | BPF_X:
-		emit_a32_lsr_r64(dst, src, dstk, sstk, ctx);
+		emit_a32_rsh_r64(dst, src, dstk, sstk, ctx);
 		break;
 	/* dst = dst >> src (signed) */
 	case BPF_ALU64 | BPF_ARSH | BPF_X:
-- 
cgit v1.2.3-59-g8ed1b


From 763ea096f3cf312608317fb1027d509cfd1efc16 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 5 Jun 2018 13:55:30 +0200
Subject: i40e: remove ndo_xdp_flush call i40e_xdp_flush

Remove the ndo_xdp_flush call implementation i40e_xdp_flush
as no callers of ndo_xdp_flush are left.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c |  1 -
 drivers/net/ethernet/intel/i40e/i40e_txrx.c | 19 -------------------
 drivers/net/ethernet/intel/i40e/i40e_txrx.h |  1 -
 3 files changed, 21 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index b5daa5c9c7de..c944bd10b03d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -11883,7 +11883,6 @@ static const struct net_device_ops i40e_netdev_ops = {
 	.ndo_bridge_setlink	= i40e_ndo_bridge_setlink,
 	.ndo_bpf		= i40e_xdp,
 	.ndo_xdp_xmit		= i40e_xdp_xmit,
-	.ndo_xdp_flush		= i40e_xdp_flush,
 };
 
 /**
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 5f01e4ce9c92..713995d04783 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -3707,22 +3707,3 @@ int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
 
 	return n - drops;
 }
-
-/**
- * i40e_xdp_flush - Implements ndo_xdp_flush
- * @dev: netdev
- **/
-void i40e_xdp_flush(struct net_device *dev)
-{
-	struct i40e_netdev_priv *np = netdev_priv(dev);
-	unsigned int queue_index = smp_processor_id();
-	struct i40e_vsi *vsi = np->vsi;
-
-	if (test_bit(__I40E_VSI_DOWN, vsi->state))
-		return;
-
-	if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
-		return;
-
-	i40e_xdp_ring_update_tail(vsi->xdp_rings[queue_index]);
-}
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index 820f76db251b..bb04f6a731fe 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -489,7 +489,6 @@ int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
 bool __i40e_chk_linearize(struct sk_buff *skb);
 int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
 		  u32 flags);
-void i40e_xdp_flush(struct net_device *dev);
 
 /**
  * i40e_get_head - Retrieve head from head writeback
-- 
cgit v1.2.3-59-g8ed1b


From af3746a779aa5106fb3d1771b8fb28522476b073 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 5 Jun 2018 13:55:35 +0200
Subject: ixgbe: remove ndo_xdp_flush call ixgbe_xdp_flush

Remove the ndo_xdp_flush call implementation ixgbe_xdp_flush
as no callers of ndo_xdp_flush are left.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 21 ---------------------
 1 file changed, 21 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 4fd77c9067f2..ef1afb3a8a97 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -10069,26 +10069,6 @@ static int ixgbe_xdp_xmit(struct net_device *dev, int n,
 	return n - drops;
 }
 
-static void ixgbe_xdp_flush(struct net_device *dev)
-{
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
-	struct ixgbe_ring *ring;
-
-	/* Its possible the device went down between xdp xmit and flush so
-	 * we need to ensure device is still up.
-	 */
-	if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
-		return;
-
-	ring = adapter->xdp_prog ? adapter->xdp_ring[smp_processor_id()] : NULL;
-	if (unlikely(!ring))
-		return;
-
-	ixgbe_xdp_ring_update_tail(ring);
-
-	return;
-}
-
 static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_open		= ixgbe_open,
 	.ndo_stop		= ixgbe_close,
@@ -10136,7 +10116,6 @@ static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_features_check	= ixgbe_features_check,
 	.ndo_bpf		= ixgbe_xdp,
 	.ndo_xdp_xmit		= ixgbe_xdp_xmit,
-	.ndo_xdp_flush		= ixgbe_xdp_flush,
 };
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 16ec025aa5077144747c97a1e40635f547457a95 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 5 Jun 2018 13:55:40 +0200
Subject: virtio_net: remove ndo_xdp_flush call virtnet_xdp_flush

Remove the ndo_xdp_flush call implementation virtnet_xdp_flush
as no callers of ndo_xdp_flush are left.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/virtio_net.c | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 62ba8aadd8e6..8c5b59e79439 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -407,18 +407,6 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 	return skb;
 }
 
-static void virtnet_xdp_flush(struct net_device *dev)
-{
-	struct virtnet_info *vi = netdev_priv(dev);
-	struct send_queue *sq;
-	unsigned int qp;
-
-	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
-	sq = &vi->sq[qp];
-
-	virtqueue_kick(sq->vq);
-}
-
 static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
 				   struct send_queue *sq,
 				   struct xdp_frame *xdpf)
@@ -2359,7 +2347,6 @@ static const struct net_device_ops virtnet_netdev = {
 #endif
 	.ndo_bpf		= virtnet_xdp,
 	.ndo_xdp_xmit		= virtnet_xdp_xmit,
-	.ndo_xdp_flush		= virtnet_xdp_flush,
 	.ndo_features_check	= passthru_features_check,
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 42421a56541a3b0fb6656406b657ad4686cdaaeb Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 5 Jun 2018 13:55:45 +0200
Subject: tun: remove ndo_xdp_flush call tun_xdp_flush

Remove the ndo_xdp_flush call implementation tun_xdp_flush
as no callers of ndo_xdp_flush are left.

The tun drivers XDP_TX implementation also used tun_xdp_flush (and
tun_xdp_xmit).  This is easily solved by passing the XDP_XMIT_FLUSH
flag to tun_xdp_xmit in tun_xdp_tx.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/tun.c | 23 +----------------------
 1 file changed, 1 insertion(+), 22 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index d82a05fb0594..ef09224496e8 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1347,26 +1347,7 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
 	if (unlikely(!frame))
 		return -EOVERFLOW;
 
-	return tun_xdp_xmit(dev, 1, &frame, 0);
-}
-
-static void tun_xdp_flush(struct net_device *dev)
-{
-	struct tun_struct *tun = netdev_priv(dev);
-	struct tun_file *tfile;
-	u32 numqueues;
-
-	rcu_read_lock();
-
-	numqueues = READ_ONCE(tun->numqueues);
-	if (!numqueues)
-		goto out;
-
-	tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
-					    numqueues]);
-	__tun_xdp_flush_tfile(tfile);
-out:
-	rcu_read_unlock();
+	return tun_xdp_xmit(dev, 1, &frame, XDP_XMIT_FLUSH);
 }
 
 static const struct net_device_ops tap_netdev_ops = {
@@ -1387,7 +1368,6 @@ static const struct net_device_ops tap_netdev_ops = {
 	.ndo_get_stats64	= tun_net_get_stats64,
 	.ndo_bpf		= tun_xdp,
 	.ndo_xdp_xmit		= tun_xdp_xmit,
-	.ndo_xdp_flush		= tun_xdp_flush,
 };
 
 static void tun_flow_init(struct tun_struct *tun)
@@ -1706,7 +1686,6 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 			alloc_frag->offset += buflen;
 			if (tun_xdp_tx(tun->dev, &xdp))
 				goto err_redirect;
-			tun_xdp_flush(tun->dev);
 			rcu_read_unlock();
 			preempt_enable();
 			return NULL;
-- 
cgit v1.2.3-59-g8ed1b


From 189454e868c45ce3476bfac03ace921dec34208a Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 5 Jun 2018 13:55:50 +0200
Subject: net: remove net_device operation ndo_xdp_flush

All drivers are cleaned up and no references to ndo_xdp_flush
are left in drivers, it is time to remove the net_device_ops
operation ndo_xdp_flush.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/netdevice.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7f17785a59d7..42c6ea35a6f2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1192,9 +1192,6 @@ struct dev_ifalias {
  *	that got dropped are freed/returned via xdp_return_frame().
  *	Returns negative number, means general error invoking ndo, meaning
  *	no frames were xmit'ed and core-caller will free all frames.
- * void (*ndo_xdp_flush)(struct net_device *dev);
- *	This function is used to inform the driver to flush a particular
- *	xdp tx queue. Must be called on same CPU as xdp_xmit.
  */
 struct net_device_ops {
 	int			(*ndo_init)(struct net_device *dev);
@@ -1382,7 +1379,6 @@ struct net_device_ops {
 	int			(*ndo_xdp_xmit)(struct net_device *dev, int n,
 						struct xdp_frame **xdp,
 						u32 flags);
-	void			(*ndo_xdp_flush)(struct net_device *dev);
 };
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 7f4828ff27d4c92280222f6427981052fd3319e9 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 2 Jun 2018 22:36:06 +0200
Subject: net: phy: add struct device_type representation of a PHY

A PHY is a type of MDIO device, so let's model it as struct device_type
and place PM ops, attribute groups and release callback on device type
level. For this the attribute definitions have to be moved.
This change allows us to get rid of the PM ops on a bus level in a second
step.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy_device.c | 96 +++++++++++++++++++++++---------------------
 1 file changed, 50 insertions(+), 46 deletions(-)

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 9e4ba8e80a18..bd0f339f69fd 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -346,6 +346,55 @@ static int phy_bus_match(struct device *dev, struct device_driver *drv)
 	}
 }
 
+static ssize_t
+phy_id_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct phy_device *phydev = to_phy_device(dev);
+
+	return sprintf(buf, "0x%.8lx\n", (unsigned long)phydev->phy_id);
+}
+static DEVICE_ATTR_RO(phy_id);
+
+static ssize_t
+phy_interface_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct phy_device *phydev = to_phy_device(dev);
+	const char *mode = NULL;
+
+	if (phy_is_internal(phydev))
+		mode = "internal";
+	else
+		mode = phy_modes(phydev->interface);
+
+	return sprintf(buf, "%s\n", mode);
+}
+static DEVICE_ATTR_RO(phy_interface);
+
+static ssize_t
+phy_has_fixups_show(struct device *dev, struct device_attribute *attr,
+		    char *buf)
+{
+	struct phy_device *phydev = to_phy_device(dev);
+
+	return sprintf(buf, "%d\n", phydev->has_fixups);
+}
+static DEVICE_ATTR_RO(phy_has_fixups);
+
+static struct attribute *phy_dev_attrs[] = {
+	&dev_attr_phy_id.attr,
+	&dev_attr_phy_interface.attr,
+	&dev_attr_phy_has_fixups.attr,
+	NULL,
+};
+ATTRIBUTE_GROUPS(phy_dev);
+
+static const struct device_type mdio_bus_phy_type = {
+	.name = "PHY",
+	.groups = phy_dev_groups,
+	.release = phy_device_release,
+	.pm = MDIO_BUS_PHY_PM_OPS,
+};
+
 struct phy_device *phy_device_create(struct mii_bus *bus, int addr, int phy_id,
 				     bool is_c45,
 				     struct phy_c45_device_ids *c45_ids)
@@ -359,11 +408,10 @@ struct phy_device *phy_device_create(struct mii_bus *bus, int addr, int phy_id,
 		return ERR_PTR(-ENOMEM);
 
 	mdiodev = &dev->mdio;
-	mdiodev->dev.release = phy_device_release;
 	mdiodev->dev.parent = &bus->dev;
 	mdiodev->dev.bus = &mdio_bus_type;
+	mdiodev->dev.type = &mdio_bus_phy_type;
 	mdiodev->bus = bus;
-	mdiodev->pm_ops = MDIO_BUS_PHY_PM_OPS;
 	mdiodev->bus_match = phy_bus_match;
 	mdiodev->addr = addr;
 	mdiodev->flags = MDIO_DEVICE_FLAG_PHY;
@@ -587,48 +635,6 @@ struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45)
 }
 EXPORT_SYMBOL(get_phy_device);
 
-static ssize_t
-phy_id_show(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	struct phy_device *phydev = to_phy_device(dev);
-
-	return sprintf(buf, "0x%.8lx\n", (unsigned long)phydev->phy_id);
-}
-static DEVICE_ATTR_RO(phy_id);
-
-static ssize_t
-phy_interface_show(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	struct phy_device *phydev = to_phy_device(dev);
-	const char *mode = NULL;
-
-	if (phy_is_internal(phydev))
-		mode = "internal";
-	else
-		mode = phy_modes(phydev->interface);
-
-	return sprintf(buf, "%s\n", mode);
-}
-static DEVICE_ATTR_RO(phy_interface);
-
-static ssize_t
-phy_has_fixups_show(struct device *dev, struct device_attribute *attr,
-		    char *buf)
-{
-	struct phy_device *phydev = to_phy_device(dev);
-
-	return sprintf(buf, "%d\n", phydev->has_fixups);
-}
-static DEVICE_ATTR_RO(phy_has_fixups);
-
-static struct attribute *phy_dev_attrs[] = {
-	&dev_attr_phy_id.attr,
-	&dev_attr_phy_interface.attr,
-	&dev_attr_phy_has_fixups.attr,
-	NULL,
-};
-ATTRIBUTE_GROUPS(phy_dev);
-
 /**
  * phy_device_register - Register the phy device on the MDIO bus
  * @phydev: phy_device structure to be added to the MDIO bus
@@ -651,8 +657,6 @@ int phy_device_register(struct phy_device *phydev)
 		goto out;
 	}
 
-	phydev->mdio.dev.groups = phy_dev_groups;
-
 	err = device_add(&phydev->mdio.dev);
 	if (err) {
 		pr_err("PHY %d failed to add\n", phydev->mdio.addr);
-- 
cgit v1.2.3-59-g8ed1b


From 9107c05e2e55c1c7abd02ab38f7694e0da08b643 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 2 Jun 2018 22:37:24 +0200
Subject: net: phy: remove PM ops from MDIO bus

Current implementation of MDIO bus PM ops doesn't actually implement
bus-specific PM ops but just calls PM ops defined on a device level
what doesn't seem to be fully in line with the core PM model.

When looking e.g. at __device_suspend() the PM core looks for PM ops
of a device in a specific order:
1. device PM domain
2. device type
3. device class
4. device bus

I think it has good reason that there's no PM ops on device level.

Now that a device type representation of PHY's as special type of MDIO
devices was added (only user of MDIO bus PM ops), the MDIO bus
PM ops can be removed including member pm of struct mdio_device.

If for some other type of MDIO device PM ops are needed, it should be
modeled as struct device_type as well.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/mdio_bus.c | 48 ----------------------------------------------
 include/linux/mdio.h       |  1 -
 2 files changed, 49 deletions(-)

diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c
index 24b5511222c8..98f4b1f706df 100644
--- a/drivers/net/phy/mdio_bus.c
+++ b/drivers/net/phy/mdio_bus.c
@@ -717,58 +717,10 @@ static int mdio_uevent(struct device *dev, struct kobj_uevent_env *env)
 	return 0;
 }
 
-#ifdef CONFIG_PM
-static int mdio_bus_suspend(struct device *dev)
-{
-	struct mdio_device *mdio = to_mdio_device(dev);
-
-	if (mdio->pm_ops && mdio->pm_ops->suspend)
-		return mdio->pm_ops->suspend(dev);
-
-	return 0;
-}
-
-static int mdio_bus_resume(struct device *dev)
-{
-	struct mdio_device *mdio = to_mdio_device(dev);
-
-	if (mdio->pm_ops && mdio->pm_ops->resume)
-		return mdio->pm_ops->resume(dev);
-
-	return 0;
-}
-
-static int mdio_bus_restore(struct device *dev)
-{
-	struct mdio_device *mdio = to_mdio_device(dev);
-
-	if (mdio->pm_ops && mdio->pm_ops->restore)
-		return mdio->pm_ops->restore(dev);
-
-	return 0;
-}
-
-static const struct dev_pm_ops mdio_bus_pm_ops = {
-	.suspend = mdio_bus_suspend,
-	.resume = mdio_bus_resume,
-	.freeze = mdio_bus_suspend,
-	.thaw = mdio_bus_resume,
-	.restore = mdio_bus_restore,
-};
-
-#define MDIO_BUS_PM_OPS (&mdio_bus_pm_ops)
-
-#else
-
-#define MDIO_BUS_PM_OPS NULL
-
-#endif /* CONFIG_PM */
-
 struct bus_type mdio_bus_type = {
 	.name		= "mdio_bus",
 	.match		= mdio_bus_match,
 	.uevent		= mdio_uevent,
-	.pm		= MDIO_BUS_PM_OPS,
 };
 EXPORT_SYMBOL(mdio_bus_type);
 
diff --git a/include/linux/mdio.h b/include/linux/mdio.h
index 2cfffe586885..bfa7114167d7 100644
--- a/include/linux/mdio.h
+++ b/include/linux/mdio.h
@@ -29,7 +29,6 @@ enum mdio_mutex_lock_class {
 struct mdio_device {
 	struct device dev;
 
-	const struct dev_pm_ops *pm_ops;
 	struct mii_bus *bus;
 	char modalias[MDIO_NAME_SIZE];
 
-- 
cgit v1.2.3-59-g8ed1b


From 3d609342cc04129ff7568e19316ce3d7451a27e8 Mon Sep 17 00:00:00 2001
From: Guillaume Nault <g.nault@alphalink.fr>
Date: Mon, 4 Jun 2018 18:52:19 +0200
Subject: l2tp: fix refcount leakage on PPPoL2TP sockets

Commit d02ba2a6110c ("l2tp: fix race in pppol2tp_release with session
object destroy") tried to fix a race condition where a PPPoL2TP socket
would disappear while the L2TP session was still using it. However, it
missed the root issue which is that an L2TP session may accept to be
reconnected if its associated socket has entered the release process.

The tentative fix makes the session hold the socket it is connected to.
That saves the kernel from crashing, but introduces refcount leakage,
preventing the socket from completing the release process. Once stalled,
everything the socket depends on can't be released anymore, including
the L2TP session and the l2tp_ppp module.

The root issue is that, when releasing a connected PPPoL2TP socket, the
session's ->sk pointer (RCU-protected) is reset to NULL and we have to
wait for a grace period before destroying the socket. The socket drops
the session in its ->sk_destruct callback function, so the session
will exist until the last reference on the socket is dropped.
Therefore, there is a time frame where pppol2tp_connect() may accept
reconnecting a session, as it only checks ->sk to figure out if the
session is connected. This time frame is shortened by the fact that
pppol2tp_release() calls l2tp_session_delete(), making the session
unreachable before resetting ->sk. However, pppol2tp_connect() may
grab the session before it gets unhashed by l2tp_session_delete(), but
it may test ->sk after the later got reset. The race is not so hard to
trigger and syzbot found a pretty reliable reproducer:
https://syzkaller.appspot.com/bug?id=418578d2a4389074524e04d641eacb091961b2cf

Before d02ba2a6110c, another race could let pppol2tp_release()
overwrite the ->__sk pointer of an L2TP session, thus tricking
pppol2tp_put_sk() into calling sock_put() on a socket that is different
than the one for which pppol2tp_release() was originally called. To get
there, we had to trigger the race described above, therefore having one
PPPoL2TP socket being released, while the session it is connected to is
reconnecting to a different PPPoL2TP socket. When releasing this new
socket fast enough, pppol2tp_release() overwrites the session's
->__sk pointer with the address of the new socket, before the first
pppol2tp_put_sk() call gets scheduled. Then the pppol2tp_put_sk() call
invoked by the original socket will sock_put() the new socket,
potentially dropping its last reference. When the second
pppol2tp_put_sk() finally runs, its socket has already been freed.

With d02ba2a6110c, the session takes a reference on both sockets.
Furthermore, the session's ->sk pointer is reset in the
pppol2tp_session_close() callback function rather than in
pppol2tp_release(). Therefore, ->__sk can't be overwritten and
pppol2tp_put_sk() is called only once (l2tp_session_delete() will only
run pppol2tp_session_close() once, to protect the session against
concurrent deletion requests). Now pppol2tp_put_sk() will properly
sock_put() the original socket, but the new socket will remain, as
l2tp_session_delete() prevented the release process from completing.
Here, we don't depend on the ->__sk race to trigger the bug. Getting
into the pppol2tp_connect() race is enough to leak the reference, no
matter when new socket is released.

So it all boils down to pppol2tp_connect() failing to realise that the
session has already been connected. This patch drops the unneeded extra
reference counting (mostly reverting d02ba2a6110c) and checks that
neither ->sk nor ->__sk is set before allowing a session to be
connected.

Fixes: d02ba2a6110c ("l2tp: fix race in pppol2tp_release with session object destroy")
Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_ppp.c | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index f951c768dcf2..f1c9c327f674 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -428,16 +428,6 @@ static void pppol2tp_put_sk(struct rcu_head *head)
  */
 static void pppol2tp_session_close(struct l2tp_session *session)
 {
-	struct pppol2tp_session *ps;
-
-	ps = l2tp_session_priv(session);
-	mutex_lock(&ps->sk_lock);
-	ps->__sk = rcu_dereference_protected(ps->sk,
-					     lockdep_is_held(&ps->sk_lock));
-	RCU_INIT_POINTER(ps->sk, NULL);
-	if (ps->__sk)
-		call_rcu(&ps->rcu, pppol2tp_put_sk);
-	mutex_unlock(&ps->sk_lock);
 }
 
 /* Really kill the session socket. (Called from sock_put() if
@@ -480,15 +470,24 @@ static int pppol2tp_release(struct socket *sock)
 	sock_orphan(sk);
 	sock->sk = NULL;
 
-	/* If the socket is associated with a session,
-	 * l2tp_session_delete will call pppol2tp_session_close which
-	 * will drop the session's ref on the socket.
-	 */
 	session = pppol2tp_sock_to_session(sk);
 	if (session) {
+		struct pppol2tp_session *ps;
+
 		l2tp_session_delete(session);
-		/* drop the ref obtained by pppol2tp_sock_to_session */
-		sock_put(sk);
+
+		ps = l2tp_session_priv(session);
+		mutex_lock(&ps->sk_lock);
+		ps->__sk = rcu_dereference_protected(ps->sk,
+						     lockdep_is_held(&ps->sk_lock));
+		RCU_INIT_POINTER(ps->sk, NULL);
+		mutex_unlock(&ps->sk_lock);
+		call_rcu(&ps->rcu, pppol2tp_put_sk);
+
+		/* Rely on the sock_put() call at the end of the function for
+		 * dropping the reference held by pppol2tp_sock_to_session().
+		 * The last reference will be dropped by pppol2tp_put_sk().
+		 */
 	}
 
 	release_sock(sk);
@@ -742,7 +741,8 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
 		 */
 		mutex_lock(&ps->sk_lock);
 		if (rcu_dereference_protected(ps->sk,
-					      lockdep_is_held(&ps->sk_lock))) {
+					      lockdep_is_held(&ps->sk_lock)) ||
+		    ps->__sk) {
 			mutex_unlock(&ps->sk_lock);
 			error = -EEXIST;
 			goto end;
@@ -803,7 +803,6 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
 
 out_no_ppp:
 	/* This is how we get the session context from the socket. */
-	sock_hold(sk);
 	sk->sk_user_data = session;
 	rcu_assign_pointer(ps->sk, sk);
 	mutex_unlock(&ps->sk_lock);
-- 
cgit v1.2.3-59-g8ed1b


From 69e2ecccd0bee6b186d0d42f8093109bfca7e990 Mon Sep 17 00:00:00 2001
From: Kun Yi <kunyi@google.com>
Date: Mon, 4 Jun 2018 13:17:04 -0700
Subject: net: phy: broadcom: Enable 125 MHz clock on LED4 pin for BCM54612E by
 default.

BCM54612E have 4 multi-functional LED pins that can be configured
through register setting; the LED4 pin can be configured to a 125MHz
reference clock output by setting the spare register. Since the dedicated
CLK125 reference clock pin is not brought out on the 48-Pin MLP, the LED4
pin is the only pin to provide such function in this package, and therefore
it is beneficial to just enable the reference clock by default.

Signed-off-by: Kun Yi <kunyi@google.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/broadcom.c | 16 ++++++++++++++--
 include/linux/brcmphy.h    |  4 ++++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index f9c25912eb98..e86ea105c802 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -54,6 +54,8 @@ static int bcm54210e_config_init(struct phy_device *phydev)
 
 static int bcm54612e_config_init(struct phy_device *phydev)
 {
+	int reg;
+
 	/* Clear TX internal delay unless requested. */
 	if ((phydev->interface != PHY_INTERFACE_MODE_RGMII_ID) &&
 	    (phydev->interface != PHY_INTERFACE_MODE_RGMII_TXID)) {
@@ -65,8 +67,6 @@ static int bcm54612e_config_init(struct phy_device *phydev)
 	/* Clear RX internal delay unless requested. */
 	if ((phydev->interface != PHY_INTERFACE_MODE_RGMII_ID) &&
 	    (phydev->interface != PHY_INTERFACE_MODE_RGMII_RXID)) {
-		u16 reg;
-
 		reg = bcm54xx_auxctl_read(phydev,
 					  MII_BCM54XX_AUXCTL_SHDWSEL_MISC);
 		/* Disable RXD to RXC delay (default set) */
@@ -77,6 +77,18 @@ static int bcm54612e_config_init(struct phy_device *phydev)
 				     MII_BCM54XX_AUXCTL_MISC_WREN | reg);
 	}
 
+	/* Enable CLK125 MUX on LED4 if ref clock is enabled. */
+	if (!(phydev->dev_flags & PHY_BRCM_RX_REFCLK_UNUSED)) {
+		int err;
+
+		reg = bcm_phy_read_exp(phydev, BCM54612E_EXP_SPARE0);
+		err = bcm_phy_write_exp(phydev, BCM54612E_EXP_SPARE0,
+					BCM54612E_LED4_CLK125OUT_EN | reg);
+
+		if (err < 0)
+			return err;
+	}
+
 	return 0;
 }
 
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index b324e01ccf2d..daa9234a9baf 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -85,6 +85,7 @@
 #define MII_BCM54XX_EXP_SEL	0x17	/* Expansion register select */
 #define MII_BCM54XX_EXP_SEL_SSD	0x0e00	/* Secondary SerDes select */
 #define MII_BCM54XX_EXP_SEL_ER	0x0f00	/* Expansion register select */
+#define MII_BCM54XX_EXP_SEL_ETC	0x0d00	/* Expansion register spare + 2k mem */
 
 #define MII_BCM54XX_AUX_CTL	0x18	/* Auxiliary control register */
 #define MII_BCM54XX_ISR		0x1a	/* BCM54xx interrupt status register */
@@ -219,6 +220,9 @@
 #define BCM54810_SHD_CLK_CTL			0x3
 #define BCM54810_SHD_CLK_CTL_GTXCLK_EN		(1 << 9)
 
+/* BCM54612E Registers */
+#define BCM54612E_EXP_SPARE0		(MII_BCM54XX_EXP_SEL_ETC + 0x34)
+#define BCM54612E_LED4_CLK125OUT_EN	(1 << 1)
 
 /*****************************************************************************/
 /* Fast Ethernet Transceiver definitions. */
-- 
cgit v1.2.3-59-g8ed1b


From e61e62b9e2cc14b336f330f37f517f9d373ff31e Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Mon, 4 Jun 2018 14:05:51 +0200
Subject: xsk: moved struct xdp_umem definition
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Moved struct xdp_umem to xdp_sock.h, in order to prepare for zero-copy
support.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/xdp_sock.h | 24 +++++++++++++++++++++++-
 net/xdp/xdp_umem.c     |  1 +
 net/xdp/xdp_umem.h     | 22 +---------------------
 net/xdp/xsk_queue.h    |  3 +--
 4 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 7a647c56ec15..3a6cd88f179d 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -6,12 +6,34 @@
 #ifndef _LINUX_XDP_SOCK_H
 #define _LINUX_XDP_SOCK_H
 
+#include <linux/workqueue.h>
+#include <linux/if_xdp.h>
 #include <linux/mutex.h>
+#include <linux/mm.h>
 #include <net/sock.h>
 
 struct net_device;
 struct xsk_queue;
-struct xdp_umem;
+
+struct xdp_umem_props {
+	u64 chunk_mask;
+	u64 size;
+};
+
+struct xdp_umem {
+	struct xsk_queue *fq;
+	struct xsk_queue *cq;
+	struct page **pgs;
+	struct xdp_umem_props props;
+	u32 headroom;
+	u32 chunk_size_nohr;
+	struct user_struct *user;
+	struct pid *pid;
+	unsigned long address;
+	refcount_t users;
+	struct work_struct work;
+	u32 npgs;
+};
 
 struct xdp_sock {
 	/* struct sock must be the first member of struct xdp_sock */
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 9ad791ff4739..2793a503223e 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -13,6 +13,7 @@
 #include <linux/mm.h>
 
 #include "xdp_umem.h"
+#include "xsk_queue.h"
 
 #define XDP_UMEM_MIN_CHUNK_SIZE 2048
 
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index aeadd1bcb72d..9433e8af650a 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -6,27 +6,7 @@
 #ifndef XDP_UMEM_H_
 #define XDP_UMEM_H_
 
-#include <linux/mm.h>
-#include <linux/if_xdp.h>
-#include <linux/workqueue.h>
-
-#include "xsk_queue.h"
-#include "xdp_umem_props.h"
-
-struct xdp_umem {
-	struct xsk_queue *fq;
-	struct xsk_queue *cq;
-	struct page **pgs;
-	struct xdp_umem_props props;
-	u32 headroom;
-	u32 chunk_size_nohr;
-	struct user_struct *user;
-	struct pid *pid;
-	unsigned long address;
-	refcount_t users;
-	struct work_struct work;
-	u32 npgs;
-};
+#include <net/xdp_sock.h>
 
 static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 {
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 337e5ad3b10e..5246ed420a16 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -8,8 +8,7 @@
 
 #include <linux/types.h>
 #include <linux/if_xdp.h>
-
-#include "xdp_umem_props.h"
+#include <net/xdp_sock.h>
 
 #define RX_BATCH_SIZE 16
 
-- 
cgit v1.2.3-59-g8ed1b


From 8aef7340ae9695912a411886452ae9773206e845 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Mon, 4 Jun 2018 14:05:52 +0200
Subject: xsk: introduce xdp_umem_page
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The xdp_umem_page holds the address for a page. Trade memory for
faster lookup. Later, we'll add DMA address here as well.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/xdp_sock.h |  7 ++++++-
 net/xdp/xdp_umem.c     | 15 ++++++++++++++-
 net/xdp/xdp_umem.h     |  3 +--
 3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 3a6cd88f179d..caf343a7e224 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -20,10 +20,14 @@ struct xdp_umem_props {
 	u64 size;
 };
 
+struct xdp_umem_page {
+	void *addr;
+};
+
 struct xdp_umem {
 	struct xsk_queue *fq;
 	struct xsk_queue *cq;
-	struct page **pgs;
+	struct xdp_umem_page *pages;
 	struct xdp_umem_props props;
 	u32 headroom;
 	u32 chunk_size_nohr;
@@ -32,6 +36,7 @@ struct xdp_umem {
 	unsigned long address;
 	refcount_t users;
 	struct work_struct work;
+	struct page **pgs;
 	u32 npgs;
 };
 
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 2793a503223e..aca826011f6c 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -65,6 +65,9 @@ static void xdp_umem_release(struct xdp_umem *umem)
 		goto out;
 
 	mmput(mm);
+	kfree(umem->pages);
+	umem->pages = NULL;
+
 	xdp_umem_unaccount_pages(umem);
 out:
 	kfree(umem);
@@ -155,7 +158,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 	u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
 	unsigned int chunks, chunks_per_page;
 	u64 addr = mr->addr, size = mr->len;
-	int size_chk, err;
+	int size_chk, err, i;
 
 	if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
 		/* Strictly speaking we could support this, if:
@@ -213,6 +216,16 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 	err = xdp_umem_pin_pages(umem);
 	if (err)
 		goto out_account;
+
+	umem->pages = kcalloc(umem->npgs, sizeof(*umem->pages), GFP_KERNEL);
+	if (!umem->pages) {
+		err = -ENOMEM;
+		goto out_account;
+	}
+
+	for (i = 0; i < umem->npgs; i++)
+		umem->pages[i].addr = page_address(umem->pgs[i]);
+
 	return 0;
 
 out_account:
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index 9433e8af650a..40e8fa4a92af 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -10,8 +10,7 @@
 
 static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 {
-	return page_address(umem->pgs[addr >> PAGE_SHIFT]) +
-		(addr & (PAGE_SIZE - 1));
+	return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1));
 }
 
 bool xdp_umem_validate_queues(struct xdp_umem *umem);
-- 
cgit v1.2.3-59-g8ed1b


From 74515c5750f30244a901c3c0c82a2fe534b3c9c5 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Mon, 4 Jun 2018 14:05:53 +0200
Subject: net: xdp: added bpf_netdev_command XDP_{QUERY, SETUP}_XSK_UMEM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extend ndo_bpf with two new commands used for query zero-copy support
and register an UMEM to a queue_id of a netdev.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/netdevice.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 42c6ea35a6f2..cc4ea7ab6d24 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -817,10 +817,13 @@ enum bpf_netdev_command {
 	BPF_OFFLOAD_DESTROY,
 	BPF_OFFLOAD_MAP_ALLOC,
 	BPF_OFFLOAD_MAP_FREE,
+	XDP_QUERY_XSK_UMEM,
+	XDP_SETUP_XSK_UMEM,
 };
 
 struct bpf_prog_offload_ops;
 struct netlink_ext_ack;
+struct xdp_umem;
 
 struct netdev_bpf {
 	enum bpf_netdev_command command;
@@ -851,6 +854,11 @@ struct netdev_bpf {
 		struct {
 			struct bpf_offloaded_map *offmap;
 		};
+		/* XDP_SETUP_XSK_UMEM */
+		struct {
+			struct xdp_umem *umem;
+			u16 queue_id;
+		} xsk;
 	};
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 02b55e5657c3a569fc681ba851e464cfa6b90d4f Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Mon, 4 Jun 2018 14:05:54 +0200
Subject: xdp: add MEM_TYPE_ZERO_COPY
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Here, a new type of allocator support is added to the XDP return
API. A zero-copy allocated xdp_buff cannot be converted to an
xdp_frame. Instead is the buff has to be copied. This is not supported
at all in this commit.

Also, an opaque "handle" is added to xdp_buff. This can be used as a
context for the zero-copy allocator implementation.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/xdp.h | 10 ++++++++++
 net/core/xdp.c    | 19 ++++++++++++++-----
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/include/net/xdp.h b/include/net/xdp.h
index a3b71a4dd71d..2deea7166a34 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -37,6 +37,7 @@ enum xdp_mem_type {
 	MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */
 	MEM_TYPE_PAGE_ORDER0,     /* Orig XDP full page model */
 	MEM_TYPE_PAGE_POOL,
+	MEM_TYPE_ZERO_COPY,
 	MEM_TYPE_MAX,
 };
 
@@ -51,6 +52,10 @@ struct xdp_mem_info {
 
 struct page_pool;
 
+struct zero_copy_allocator {
+	void (*free)(struct zero_copy_allocator *zca, unsigned long handle);
+};
+
 struct xdp_rxq_info {
 	struct net_device *dev;
 	u32 queue_index;
@@ -63,6 +68,7 @@ struct xdp_buff {
 	void *data_end;
 	void *data_meta;
 	void *data_hard_start;
+	unsigned long handle;
 	struct xdp_rxq_info *rxq;
 };
 
@@ -86,6 +92,10 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
 	int metasize;
 	int headroom;
 
+	/* TODO: implement clone, copy, use "native" MEM_TYPE */
+	if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY)
+		return NULL;
+
 	/* Assure headroom is available for storing info */
 	headroom = xdp->data - xdp->data_hard_start;
 	metasize = xdp->data - xdp->data_meta;
diff --git a/net/core/xdp.c b/net/core/xdp.c
index cb8c4e061a5a..9d1f22072d5d 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -31,6 +31,7 @@ struct xdp_mem_allocator {
 	union {
 		void *allocator;
 		struct page_pool *page_pool;
+		struct zero_copy_allocator *zc_alloc;
 	};
 	struct rhash_head node;
 	struct rcu_head rcu;
@@ -261,7 +262,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
 	xdp_rxq->mem.type = type;
 
 	if (!allocator) {
-		if (type == MEM_TYPE_PAGE_POOL)
+		if (type == MEM_TYPE_PAGE_POOL || type == MEM_TYPE_ZERO_COPY)
 			return -EINVAL; /* Setup time check page_pool req */
 		return 0;
 	}
@@ -314,7 +315,8 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
  * is used for those calls sites.  Thus, allowing for faster recycling
  * of xdp_frames/pages in those cases.
  */
-static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct)
+static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
+			 unsigned long handle)
 {
 	struct xdp_mem_allocator *xa;
 	struct page *page;
@@ -338,6 +340,13 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct)
 		page = virt_to_page(data); /* Assumes order0 page*/
 		put_page(page);
 		break;
+	case MEM_TYPE_ZERO_COPY:
+		/* NB! Only valid from an xdp_buff! */
+		rcu_read_lock();
+		/* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
+		xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
+		xa->zc_alloc->free(xa->zc_alloc, handle);
+		rcu_read_unlock();
 	default:
 		/* Not possible, checked in xdp_rxq_info_reg_mem_model() */
 		break;
@@ -346,18 +355,18 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct)
 
 void xdp_return_frame(struct xdp_frame *xdpf)
 {
-	__xdp_return(xdpf->data, &xdpf->mem, false);
+	__xdp_return(xdpf->data, &xdpf->mem, false, 0);
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame);
 
 void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
 {
-	__xdp_return(xdpf->data, &xdpf->mem, true);
+	__xdp_return(xdpf->data, &xdpf->mem, true, 0);
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
 
 void xdp_return_buff(struct xdp_buff *xdp)
 {
-	__xdp_return(xdp->data, &xdp->rxq->mem, true);
+	__xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle);
 }
 EXPORT_SYMBOL_GPL(xdp_return_buff);
-- 
cgit v1.2.3-59-g8ed1b


From 173d3adb6f437037f216270955886ca9878187a5 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Mon, 4 Jun 2018 14:05:55 +0200
Subject: xsk: add zero-copy support for Rx
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extend the xsk_rcv to support the new MEM_TYPE_ZERO_COPY memory, and
wireup ndo_bpf call in bind.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/xdp_sock.h      |  6 +++
 include/uapi/linux/if_xdp.h |  4 +-
 net/xdp/xdp_umem.c          | 77 ++++++++++++++++++++++++++++++++++++
 net/xdp/xdp_umem.h          |  3 ++
 net/xdp/xsk.c               | 96 +++++++++++++++++++++++++++++++++++----------
 5 files changed, 165 insertions(+), 21 deletions(-)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index caf343a7e224..d93d3aac3fc9 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -22,6 +22,7 @@ struct xdp_umem_props {
 
 struct xdp_umem_page {
 	void *addr;
+	dma_addr_t dma;
 };
 
 struct xdp_umem {
@@ -38,6 +39,9 @@ struct xdp_umem {
 	struct work_struct work;
 	struct page **pgs;
 	u32 npgs;
+	struct net_device *dev;
+	u16 queue_id;
+	bool zc;
 };
 
 struct xdp_sock {
@@ -60,6 +64,8 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 void xsk_flush(struct xdp_sock *xs);
 bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs);
+u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr);
+void xsk_umem_discard_addr(struct xdp_umem *umem);
 #else
 static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index e411d6f9ac65..1fa0e977ea8d 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -13,7 +13,9 @@
 #include <linux/types.h>
 
 /* Options for the sxdp_flags field */
-#define XDP_SHARED_UMEM 1
+#define XDP_SHARED_UMEM	(1 << 0)
+#define XDP_COPY	(1 << 1) /* Force copy-mode */
+#define XDP_ZEROCOPY	(1 << 2) /* Force zero-copy mode */
 
 struct sockaddr_xdp {
 	__u16 sxdp_family;
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index aca826011f6c..f729d79b8d91 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -17,6 +17,81 @@
 
 #define XDP_UMEM_MIN_CHUNK_SIZE 2048
 
+int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
+			u32 queue_id, u16 flags)
+{
+	bool force_zc, force_copy;
+	struct netdev_bpf bpf;
+	int err;
+
+	force_zc = flags & XDP_ZEROCOPY;
+	force_copy = flags & XDP_COPY;
+
+	if (force_zc && force_copy)
+		return -EINVAL;
+
+	if (force_copy)
+		return 0;
+
+	dev_hold(dev);
+
+	if (dev->netdev_ops->ndo_bpf) {
+		bpf.command = XDP_QUERY_XSK_UMEM;
+
+		rtnl_lock();
+		err = dev->netdev_ops->ndo_bpf(dev, &bpf);
+		rtnl_unlock();
+
+		if (err) {
+			dev_put(dev);
+			return force_zc ? -ENOTSUPP : 0;
+		}
+
+		bpf.command = XDP_SETUP_XSK_UMEM;
+		bpf.xsk.umem = umem;
+		bpf.xsk.queue_id = queue_id;
+
+		rtnl_lock();
+		err = dev->netdev_ops->ndo_bpf(dev, &bpf);
+		rtnl_unlock();
+
+		if (err) {
+			dev_put(dev);
+			return force_zc ? err : 0; /* fail or fallback */
+		}
+
+		umem->dev = dev;
+		umem->queue_id = queue_id;
+		umem->zc = true;
+		return 0;
+	}
+
+	dev_put(dev);
+	return force_zc ? -ENOTSUPP : 0; /* fail or fallback */
+}
+
+void xdp_umem_clear_dev(struct xdp_umem *umem)
+{
+	struct netdev_bpf bpf;
+	int err;
+
+	if (umem->dev) {
+		bpf.command = XDP_SETUP_XSK_UMEM;
+		bpf.xsk.umem = NULL;
+		bpf.xsk.queue_id = umem->queue_id;
+
+		rtnl_lock();
+		err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);
+		rtnl_unlock();
+
+		if (err)
+			WARN(1, "failed to disable umem!\n");
+
+		dev_put(umem->dev);
+		umem->dev = NULL;
+	}
+}
+
 static void xdp_umem_unpin_pages(struct xdp_umem *umem)
 {
 	unsigned int i;
@@ -43,6 +118,8 @@ static void xdp_umem_release(struct xdp_umem *umem)
 	struct task_struct *task;
 	struct mm_struct *mm;
 
+	xdp_umem_clear_dev(umem);
+
 	if (umem->fq) {
 		xskq_destroy(umem->fq);
 		umem->fq = NULL;
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index 40e8fa4a92af..674508a32a4d 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -13,6 +13,9 @@ static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 	return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1));
 }
 
+int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
+			u32 queue_id, u16 flags);
+void xdp_umem_clear_dev(struct xdp_umem *umem);
 bool xdp_umem_validate_queues(struct xdp_umem *umem);
 void xdp_get_umem(struct xdp_umem *umem);
 void xdp_put_umem(struct xdp_umem *umem);
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 4688c750df1d..ab64bd8260ea 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -36,19 +36,28 @@ static struct xdp_sock *xdp_sk(struct sock *sk)
 
 bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
 {
-	return !!xs->rx;
+	return READ_ONCE(xs->rx) &&  READ_ONCE(xs->umem) &&
+		READ_ONCE(xs->umem->fq);
 }
 
-static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
+{
+	return xskq_peek_addr(umem->fq, addr);
+}
+EXPORT_SYMBOL(xsk_umem_peek_addr);
+
+void xsk_umem_discard_addr(struct xdp_umem *umem)
+{
+	xskq_discard_addr(umem->fq);
+}
+EXPORT_SYMBOL(xsk_umem_discard_addr);
+
+static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 {
-	u32 len = xdp->data_end - xdp->data;
 	void *buffer;
 	u64 addr;
 	int err;
 
-	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
-		return -EINVAL;
-
 	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
 	    len > xs->umem->chunk_size_nohr) {
 		xs->rx_dropped++;
@@ -60,25 +69,41 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 	buffer = xdp_umem_get_data(xs->umem, addr);
 	memcpy(buffer, xdp->data, len);
 	err = xskq_produce_batch_desc(xs->rx, addr, len);
-	if (!err)
+	if (!err) {
 		xskq_discard_addr(xs->umem->fq);
-	else
-		xs->rx_dropped++;
+		xdp_return_buff(xdp);
+		return 0;
+	}
 
+	xs->rx_dropped++;
 	return err;
 }
 
-int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 {
-	int err;
+	int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);
 
-	err = __xsk_rcv(xs, xdp);
-	if (likely(!err))
+	if (err) {
 		xdp_return_buff(xdp);
+		xs->rx_dropped++;
+	}
 
 	return err;
 }
 
+int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+{
+	u32 len;
+
+	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
+		return -EINVAL;
+
+	len = xdp->data_end - xdp->data;
+
+	return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
+		__xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
+}
+
 void xsk_flush(struct xdp_sock *xs)
 {
 	xskq_produce_flush_desc(xs->rx);
@@ -87,12 +112,29 @@ void xsk_flush(struct xdp_sock *xs)
 
 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
+	u32 len = xdp->data_end - xdp->data;
+	void *buffer;
+	u64 addr;
 	int err;
 
-	err = __xsk_rcv(xs, xdp);
-	if (!err)
+	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
+	    len > xs->umem->chunk_size_nohr) {
+		xs->rx_dropped++;
+		return -ENOSPC;
+	}
+
+	addr += xs->umem->headroom;
+
+	buffer = xdp_umem_get_data(xs->umem, addr);
+	memcpy(buffer, xdp->data, len);
+	err = xskq_produce_batch_desc(xs->rx, addr, len);
+	if (!err) {
+		xskq_discard_addr(xs->umem->fq);
 		xsk_flush(xs);
+		return 0;
+	}
 
+	xs->rx_dropped++;
 	return err;
 }
 
@@ -291,6 +333,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 	struct sock *sk = sock->sk;
 	struct xdp_sock *xs = xdp_sk(sk);
 	struct net_device *dev;
+	u32 flags, qid;
 	int err = 0;
 
 	if (addr_len < sizeof(struct sockaddr_xdp))
@@ -315,16 +358,26 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 		goto out_unlock;
 	}
 
-	if ((xs->rx && sxdp->sxdp_queue_id >= dev->real_num_rx_queues) ||
-	    (xs->tx && sxdp->sxdp_queue_id >= dev->real_num_tx_queues)) {
+	qid = sxdp->sxdp_queue_id;
+
+	if ((xs->rx && qid >= dev->real_num_rx_queues) ||
+	    (xs->tx && qid >= dev->real_num_tx_queues)) {
 		err = -EINVAL;
 		goto out_unlock;
 	}
 
-	if (sxdp->sxdp_flags & XDP_SHARED_UMEM) {
+	flags = sxdp->sxdp_flags;
+
+	if (flags & XDP_SHARED_UMEM) {
 		struct xdp_sock *umem_xs;
 		struct socket *sock;
 
+		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) {
+			/* Cannot specify flags for shared sockets. */
+			err = -EINVAL;
+			goto out_unlock;
+		}
+
 		if (xs->umem) {
 			/* We have already our own. */
 			err = -EINVAL;
@@ -343,8 +396,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 			err = -EBADF;
 			sockfd_put(sock);
 			goto out_unlock;
-		} else if (umem_xs->dev != dev ||
-			   umem_xs->queue_id != sxdp->sxdp_queue_id) {
+		} else if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
 			err = -EINVAL;
 			sockfd_put(sock);
 			goto out_unlock;
@@ -360,6 +412,10 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 		/* This xsk has its own umem. */
 		xskq_set_umem(xs->umem->fq, &xs->umem->props);
 		xskq_set_umem(xs->umem->cq, &xs->umem->props);
+
+		err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
+		if (err)
+			goto out_unlock;
 	}
 
 	xs->dev = dev;
-- 
cgit v1.2.3-59-g8ed1b


From e3760c7e50ac6cdf1188fec44938dd7e6e6eef61 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Mon, 4 Jun 2018 14:05:56 +0200
Subject: net: added netdevice operation for Tx

Added ndo_xsk_async_xmit. This ndo "kicks" the netdev to start to pull
userland AF_XDP Tx frames from a NAPI context.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/netdevice.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cc4ea7ab6d24..03ffeadf8a41 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1387,6 +1387,8 @@ struct net_device_ops {
 	int			(*ndo_xdp_xmit)(struct net_device *dev, int n,
 						struct xdp_frame **xdp,
 						u32 flags);
+	int			(*ndo_xsk_async_xmit)(struct net_device *dev,
+						      u32 queue_id);
 };
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From ac98d8aab61baf785eb8f099b36daf34fc76a70e Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Mon, 4 Jun 2018 14:05:57 +0200
Subject: xsk: wire upp Tx zero-copy functions

Here we add the functionality required to support zero-copy Tx, and
also exposes various zero-copy related functions for the netdevs.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/xdp_sock.h |  9 +++++++
 net/xdp/xdp_umem.c     | 29 +++++++++++++++++++--
 net/xdp/xdp_umem.h     |  8 +++++-
 net/xdp/xsk.c          | 70 +++++++++++++++++++++++++++++++++++++++++++++-----
 net/xdp/xsk_queue.h    | 32 ++++++++++++++++++++++-
 5 files changed, 137 insertions(+), 11 deletions(-)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index d93d3aac3fc9..9fe472f2ac95 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -9,6 +9,7 @@
 #include <linux/workqueue.h>
 #include <linux/if_xdp.h>
 #include <linux/mutex.h>
+#include <linux/spinlock.h>
 #include <linux/mm.h>
 #include <net/sock.h>
 
@@ -42,6 +43,8 @@ struct xdp_umem {
 	struct net_device *dev;
 	u16 queue_id;
 	bool zc;
+	spinlock_t xsk_list_lock;
+	struct list_head xsk_list;
 };
 
 struct xdp_sock {
@@ -53,6 +56,8 @@ struct xdp_sock {
 	struct list_head flush_node;
 	u16 queue_id;
 	struct xsk_queue *tx ____cacheline_aligned_in_smp;
+	struct list_head list;
+	bool zc;
 	/* Protects multiple processes in the control path */
 	struct mutex mutex;
 	u64 rx_dropped;
@@ -64,8 +69,12 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 void xsk_flush(struct xdp_sock *xs);
 bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs);
+/* Used from netdev driver */
 u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr);
 void xsk_umem_discard_addr(struct xdp_umem *umem);
+void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries);
+bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len);
+void xsk_umem_consume_tx_done(struct xdp_umem *umem);
 #else
 static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index f729d79b8d91..7eb4948a38d2 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -17,6 +17,29 @@
 
 #define XDP_UMEM_MIN_CHUNK_SIZE 2048
 
+void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&umem->xsk_list_lock, flags);
+	list_add_rcu(&xs->list, &umem->xsk_list);
+	spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
+}
+
+void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
+{
+	unsigned long flags;
+
+	if (xs->dev) {
+		spin_lock_irqsave(&umem->xsk_list_lock, flags);
+		list_del_rcu(&xs->list);
+		spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
+
+		if (umem->zc)
+			synchronize_net();
+	}
+}
+
 int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
 			u32 queue_id, u16 flags)
 {
@@ -35,7 +58,7 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
 
 	dev_hold(dev);
 
-	if (dev->netdev_ops->ndo_bpf) {
+	if (dev->netdev_ops->ndo_bpf && dev->netdev_ops->ndo_xsk_async_xmit) {
 		bpf.command = XDP_QUERY_XSK_UMEM;
 
 		rtnl_lock();
@@ -70,7 +93,7 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
 	return force_zc ? -ENOTSUPP : 0; /* fail or fallback */
 }
 
-void xdp_umem_clear_dev(struct xdp_umem *umem)
+static void xdp_umem_clear_dev(struct xdp_umem *umem)
 {
 	struct netdev_bpf bpf;
 	int err;
@@ -283,6 +306,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 	umem->npgs = size / PAGE_SIZE;
 	umem->pgs = NULL;
 	umem->user = NULL;
+	INIT_LIST_HEAD(&umem->xsk_list);
+	spin_lock_init(&umem->xsk_list_lock);
 
 	refcount_set(&umem->users, 1);
 
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index 674508a32a4d..f11560334f88 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -13,12 +13,18 @@ static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 	return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1));
 }
 
+static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
+{
+	return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1));
+}
+
 int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
 			u32 queue_id, u16 flags);
-void xdp_umem_clear_dev(struct xdp_umem *umem);
 bool xdp_umem_validate_queues(struct xdp_umem *umem);
 void xdp_get_umem(struct xdp_umem *umem);
 void xdp_put_umem(struct xdp_umem *umem);
+void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs);
+void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs);
 struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr);
 
 #endif /* XDP_UMEM_H_ */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index ab64bd8260ea..ddca4bf1cfc8 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -21,6 +21,7 @@
 #include <linux/uaccess.h>
 #include <linux/net.h>
 #include <linux/netdevice.h>
+#include <linux/rculist.h>
 #include <net/xdp_sock.h>
 #include <net/xdp.h>
 
@@ -138,6 +139,59 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 	return err;
 }
 
+void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
+{
+	xskq_produce_flush_addr_n(umem->cq, nb_entries);
+}
+EXPORT_SYMBOL(xsk_umem_complete_tx);
+
+void xsk_umem_consume_tx_done(struct xdp_umem *umem)
+{
+	struct xdp_sock *xs;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
+		xs->sk.sk_write_space(&xs->sk);
+	}
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL(xsk_umem_consume_tx_done);
+
+bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
+{
+	struct xdp_desc desc;
+	struct xdp_sock *xs;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
+		if (!xskq_peek_desc(xs->tx, &desc))
+			continue;
+
+		if (xskq_produce_addr_lazy(umem->cq, desc.addr))
+			goto out;
+
+		*dma = xdp_umem_get_dma(umem, desc.addr);
+		*len = desc.len;
+
+		xskq_discard_desc(xs->tx);
+		rcu_read_unlock();
+		return true;
+	}
+
+out:
+	rcu_read_unlock();
+	return false;
+}
+EXPORT_SYMBOL(xsk_umem_consume_tx);
+
+static int xsk_zc_xmit(struct sock *sk)
+{
+	struct xdp_sock *xs = xdp_sk(sk);
+	struct net_device *dev = xs->dev;
+
+	return dev->netdev_ops->ndo_xsk_async_xmit(dev, xs->queue_id);
+}
+
 static void xsk_destruct_skb(struct sk_buff *skb)
 {
 	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
@@ -151,7 +205,6 @@ static void xsk_destruct_skb(struct sk_buff *skb)
 static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
 			    size_t total_len)
 {
-	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
 	u32 max_batch = TX_BATCH_SIZE;
 	struct xdp_sock *xs = xdp_sk(sk);
 	bool sent_frame = false;
@@ -161,8 +214,6 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
 
 	if (unlikely(!xs->tx))
 		return -ENOBUFS;
-	if (need_wait)
-		return -EOPNOTSUPP;
 
 	mutex_lock(&xs->mutex);
 
@@ -192,7 +243,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
 			goto out;
 		}
 
-		skb = sock_alloc_send_skb(sk, len, !need_wait, &err);
+		skb = sock_alloc_send_skb(sk, len, 1, &err);
 		if (unlikely(!skb)) {
 			err = -EAGAIN;
 			goto out;
@@ -235,6 +286,7 @@ out:
 
 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 {
+	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
 	struct sock *sk = sock->sk;
 	struct xdp_sock *xs = xdp_sk(sk);
 
@@ -242,8 +294,10 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 		return -ENXIO;
 	if (unlikely(!(xs->dev->flags & IFF_UP)))
 		return -ENETDOWN;
+	if (need_wait)
+		return -EOPNOTSUPP;
 
-	return xsk_generic_xmit(sk, m, total_len);
+	return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len);
 }
 
 static unsigned int xsk_poll(struct file *file, struct socket *sock,
@@ -419,10 +473,11 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 	}
 
 	xs->dev = dev;
-	xs->queue_id = sxdp->sxdp_queue_id;
-
+	xs->zc = xs->umem->zc;
+	xs->queue_id = qid;
 	xskq_set_umem(xs->rx, &xs->umem->props);
 	xskq_set_umem(xs->tx, &xs->umem->props);
+	xdp_add_sk_umem(xs->umem, xs);
 
 out_unlock:
 	if (err)
@@ -660,6 +715,7 @@ static void xsk_destruct(struct sock *sk)
 
 	xskq_destroy(xs->rx);
 	xskq_destroy(xs->tx);
+	xdp_del_sk_umem(xs->umem, xs);
 	xdp_put_umem(xs->umem);
 
 	sk_refcnt_debug_dec(sk);
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 5246ed420a16..ef6a6f0ec949 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -11,6 +11,7 @@
 #include <net/xdp_sock.h>
 
 #define RX_BATCH_SIZE 16
+#define LAZY_UPDATE_THRESHOLD 128
 
 struct xdp_ring {
 	u32 producer ____cacheline_aligned_in_smp;
@@ -61,9 +62,14 @@ static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt)
 	return (entries > dcnt) ? dcnt : entries;
 }
 
+static inline u32 xskq_nb_free_lazy(struct xsk_queue *q, u32 producer)
+{
+	return q->nentries - (producer - q->cons_tail);
+}
+
 static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt)
 {
-	u32 free_entries = q->nentries - (producer - q->cons_tail);
+	u32 free_entries = xskq_nb_free_lazy(q, producer);
 
 	if (free_entries >= dcnt)
 		return free_entries;
@@ -123,6 +129,9 @@ static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr)
 {
 	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
 
+	if (xskq_nb_free(q, q->prod_tail, LAZY_UPDATE_THRESHOLD) == 0)
+		return -ENOSPC;
+
 	ring->desc[q->prod_tail++ & q->ring_mask] = addr;
 
 	/* Order producer and data */
@@ -132,6 +141,27 @@ static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr)
 	return 0;
 }
 
+static inline int xskq_produce_addr_lazy(struct xsk_queue *q, u64 addr)
+{
+	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+
+	if (xskq_nb_free(q, q->prod_head, LAZY_UPDATE_THRESHOLD) == 0)
+		return -ENOSPC;
+
+	ring->desc[q->prod_head++ & q->ring_mask] = addr;
+	return 0;
+}
+
+static inline void xskq_produce_flush_addr_n(struct xsk_queue *q,
+					     u32 nb_entries)
+{
+	/* Order producer and data */
+	smp_wmb();
+
+	q->prod_tail += nb_entries;
+	WRITE_ONCE(q->ring->producer, q->prod_tail);
+}
+
 static inline int xskq_reserve_addr(struct xsk_queue *q)
 {
 	if (xskq_nb_free(q, q->prod_head, 1) == 0)
-- 
cgit v1.2.3-59-g8ed1b


From 9f5232cc7f040f443f81069f553d31b27ab7eb79 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Mon, 4 Jun 2018 14:06:01 +0200
Subject: samples/bpf: xdpsock: use skb Tx path for XDP_SKB
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make sure that XDP_SKB also uses the skb Tx path.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 samples/bpf/xdpsock_user.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
index 7494f60fbff8..d69c8d78d3fd 100644
--- a/samples/bpf/xdpsock_user.c
+++ b/samples/bpf/xdpsock_user.c
@@ -75,6 +75,7 @@ static int opt_queue;
 static int opt_poll;
 static int opt_shared_packet_buffer;
 static int opt_interval = 1;
+static u32 opt_xdp_bind_flags;
 
 struct xdp_umem_uqueue {
 	u32 cached_prod;
@@ -541,9 +542,12 @@ static struct xdpsock *xsk_configure(struct xdp_umem *umem)
 	sxdp.sxdp_family = PF_XDP;
 	sxdp.sxdp_ifindex = opt_ifindex;
 	sxdp.sxdp_queue_id = opt_queue;
+
 	if (shared) {
 		sxdp.sxdp_flags = XDP_SHARED_UMEM;
 		sxdp.sxdp_shared_umem_fd = umem->fd;
+	} else {
+		sxdp.sxdp_flags = opt_xdp_bind_flags;
 	}
 
 	lassert(bind(sfd, (struct sockaddr *)&sxdp, sizeof(sxdp)) == 0);
@@ -699,6 +703,7 @@ static void parse_command_line(int argc, char **argv)
 			break;
 		case 'S':
 			opt_xdp_flags |= XDP_FLAGS_SKB_MODE;
+			opt_xdp_bind_flags |= XDP_COPY;
 			break;
 		case 'N':
 			opt_xdp_flags |= XDP_FLAGS_DRV_MODE;
-- 
cgit v1.2.3-59-g8ed1b


From f7225172f25aaf0dfd9ad65f05be8da5d6108b12 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 4 Jun 2018 13:41:42 -0700
Subject: net/ipv6: prevent use after free in ip6_route_mpath_notify

syzbot reported a use-after-free:

BUG: KASAN: use-after-free in ip6_route_mpath_notify+0xe9/0x100 net/ipv6/route.c:4180
Read of size 4 at addr ffff8801bf789cf0 by task syz-executor756/4555

CPU: 1 PID: 4555 Comm: syz-executor756 Not tainted 4.17.0-rc7+ #78
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1b9/0x294 lib/dump_stack.c:113
 print_address_description+0x6c/0x20b mm/kasan/report.c:256
 kasan_report_error mm/kasan/report.c:354 [inline]
 kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412
 __asan_report_load4_noabort+0x14/0x20 mm/kasan/report.c:432
 ip6_route_mpath_notify+0xe9/0x100 net/ipv6/route.c:4180
 ip6_route_multipath_add+0x615/0x1910 net/ipv6/route.c:4303
 inet6_rtm_newroute+0xe3/0x160 net/ipv6/route.c:4391
 ...

Allocated by task 4555:
 save_stack+0x43/0xd0 mm/kasan/kasan.c:448
 set_track mm/kasan/kasan.c:460 [inline]
 kasan_kmalloc+0xc4/0xe0 mm/kasan/kasan.c:553
 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:490
 kmem_cache_alloc+0x12e/0x760 mm/slab.c:3554
 dst_alloc+0xbb/0x1d0 net/core/dst.c:104
 __ip6_dst_alloc+0x35/0xa0 net/ipv6/route.c:361
 ip6_dst_alloc+0x29/0xb0 net/ipv6/route.c:376
 ip6_route_info_create+0x4d4/0x3a30 net/ipv6/route.c:2834
 ip6_route_multipath_add+0xc7e/0x1910 net/ipv6/route.c:4240
 inet6_rtm_newroute+0xe3/0x160 net/ipv6/route.c:4391
 ...

Freed by task 4555:
 save_stack+0x43/0xd0 mm/kasan/kasan.c:448
 set_track mm/kasan/kasan.c:460 [inline]
 __kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:521
 kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528
 __cache_free mm/slab.c:3498 [inline]
 kmem_cache_free+0x86/0x2d0 mm/slab.c:3756
 dst_destroy+0x267/0x3c0 net/core/dst.c:140
 dst_release_immediate+0x71/0x9e net/core/dst.c:205
 fib6_add+0xa40/0x1650 net/ipv6/ip6_fib.c:1305
 __ip6_ins_rt+0x6c/0x90 net/ipv6/route.c:1011
 ip6_route_multipath_add+0x513/0x1910 net/ipv6/route.c:4267
 inet6_rtm_newroute+0xe3/0x160 net/ipv6/route.c:4391
 ...

The problem is that rt_last can point to a deleted route if the insert
fails.

One reproducer is to insert a route and then add a multipath route that
has a duplicate nexthop.e.g,:
    $ ip -6 ro add vrf red 2001:db8:101::/64 nexthop via 2001:db8:1::2
    $ ip -6 ro append vrf red 2001:db8:101::/64 nexthop via 2001:db8:1::4 nexthop via 2001:db8:1::2

Fix by not setting rt_last until the it is verified the insert succeeded.

Fixes: 3b1137fe7482 ("net: ipv6: Change notifications for multipath add to RTA_MULTIPATH")
Cc: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ce6696acba73..1dc98715c78b 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4412,13 +4412,17 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
 
 	err_nh = NULL;
 	list_for_each_entry(nh, &rt6_nh_list, next) {
-		rt_last = nh->fib6_info;
 		err = __ip6_ins_rt(nh->fib6_info, info, extack);
 		fib6_info_release(nh->fib6_info);
 
-		/* save reference to first route for notification */
-		if (!rt_notif && !err)
-			rt_notif = nh->fib6_info;
+		if (!err) {
+			/* save reference to last route successfully inserted */
+			rt_last = nh->fib6_info;
+
+			/* save reference to first route for notification */
+			if (!rt_notif)
+				rt_notif = nh->fib6_info;
+		}
 
 		/* nh->fib6_info is used or freed at this point, reset to NULL*/
 		nh->fib6_info = NULL;
-- 
cgit v1.2.3-59-g8ed1b


From f4c9f85f3b2cb7669830cd04d0be61192a4d2436 Mon Sep 17 00:00:00 2001
From: Yousuk Seung <ysseung@google.com>
Date: Mon, 4 Jun 2018 15:29:51 -0700
Subject: tcp: refactor tcp_ecn_check_ce to remove sk type cast

Refactor tcp_ecn_check_ce and __tcp_ecn_check_ce to accept struct sock*
instead of tcp_sock* to clean up type casts. This is a pure refactor
patch.

Signed-off-by: Yousuk Seung <ysseung@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d5ffb573ca4d..355d3dffd021 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -254,8 +254,10 @@ static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
 	tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
 }
 
-static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
+static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+
 	switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
 	case INET_ECN_NOT_ECT:
 		/* Funny extension: if ECT is not set on a segment,
@@ -263,31 +265,31 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
 		 * it is probably a retransmit.
 		 */
 		if (tp->ecn_flags & TCP_ECN_SEEN)
-			tcp_enter_quickack_mode((struct sock *)tp, 1);
+			tcp_enter_quickack_mode(sk, 1);
 		break;
 	case INET_ECN_CE:
-		if (tcp_ca_needs_ecn((struct sock *)tp))
-			tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
+		if (tcp_ca_needs_ecn(sk))
+			tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
 
 		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
 			/* Better not delay acks, sender can have a very low cwnd */
-			tcp_enter_quickack_mode((struct sock *)tp, 1);
+			tcp_enter_quickack_mode(sk, 1);
 			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
 		}
 		tp->ecn_flags |= TCP_ECN_SEEN;
 		break;
 	default:
-		if (tcp_ca_needs_ecn((struct sock *)tp))
-			tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
+		if (tcp_ca_needs_ecn(sk))
+			tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
 		tp->ecn_flags |= TCP_ECN_SEEN;
 		break;
 	}
 }
 
-static void tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
+static void tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
 {
-	if (tp->ecn_flags & TCP_ECN_OK)
-		__tcp_ecn_check_ce(tp, skb);
+	if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK)
+		__tcp_ecn_check_ce(sk, skb);
 }
 
 static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
@@ -710,7 +712,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
 	}
 	icsk->icsk_ack.lrcvtime = now;
 
-	tcp_ecn_check_ce(tp, skb);
+	tcp_ecn_check_ce(sk, skb);
 
 	if (skb->len >= 128)
 		tcp_grow_window(sk, skb);
@@ -4434,7 +4436,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	u32 seq, end_seq;
 	bool fragstolen;
 
-	tcp_ecn_check_ce(tp, skb);
+	tcp_ecn_check_ce(sk, skb);
 
 	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
-- 
cgit v1.2.3-59-g8ed1b


From 9deb441c113ae9e761e42f78d90736d762ff49a3 Mon Sep 17 00:00:00 2001
From: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Date: Mon, 4 Jun 2018 19:26:07 -0600
Subject: net: ipv6: Generate random IID for addresses on RAWIP devices

RAWIP devices such as rmnet do not have a hardware address and
instead require the kernel to generate a random IID for the
IPv6 addresses.

Signed-off-by: Sean Tranchetti <stranche@codeaurora.org>
Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c | 4 ++++
 net/ipv6/addrconf.c                             | 4 +++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c
index cb02e1a015c1..b9a7548ec6a0 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c
@@ -221,6 +221,10 @@ void rmnet_vnd_setup(struct net_device *rmnet_dev)
 
 	rmnet_dev->needs_free_netdev = true;
 	rmnet_dev->ethtool_ops = &rmnet_ethtool_ops;
+
+	/* This perm addr will be used as interface identifier by IPv6 */
+	rmnet_dev->addr_assign_type = NET_ADDR_RANDOM;
+	eth_random_addr(rmnet_dev->perm_addr);
 }
 
 /* Exposed API */
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index f09afc232da4..5596d87952d4 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2251,6 +2251,7 @@ static int ipv6_generate_eui64(u8 *eui, struct net_device *dev)
 		return addrconf_ifid_ieee1394(eui, dev);
 	case ARPHRD_TUNNEL6:
 	case ARPHRD_IP6GRE:
+	case ARPHRD_RAWIP:
 		return addrconf_ifid_ip6tnl(eui, dev);
 	}
 	return -1;
@@ -3286,7 +3287,8 @@ static void addrconf_dev_config(struct net_device *dev)
 	    (dev->type != ARPHRD_IP6GRE) &&
 	    (dev->type != ARPHRD_IPGRE) &&
 	    (dev->type != ARPHRD_TUNNEL) &&
-	    (dev->type != ARPHRD_NONE)) {
+	    (dev->type != ARPHRD_NONE) &&
+	    (dev->type != ARPHRD_RAWIP)) {
 		/* Alas, we support only Ethernet autoconfiguration. */
 		return;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 3602207ca6582dd359308b7bd2ce08348cc0854e Mon Sep 17 00:00:00 2001
From: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Date: Mon, 4 Jun 2018 19:43:38 -0600
Subject: net: qualcomm: rmnet: Fix use after free while sending command ack

When sending an ack to a command packet, the skb is still referenced
after it is sent to the real device. Since the real device could
free the skb, the device pointer would be invalid.
Also, remove an unnecessary variable.

Fixes: ceed73a2cf4a ("drivers: net: ethernet: qualcomm: rmnet: Initial implementation")
Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
index 56a93df962e6..3ee8ae9b6838 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
@@ -67,7 +67,7 @@ static void rmnet_map_send_ack(struct sk_buff *skb,
 			       struct rmnet_port *port)
 {
 	struct rmnet_map_control_command *cmd;
-	int xmit_status;
+	struct net_device *dev = skb->dev;
 
 	if (port->data_format & RMNET_FLAGS_INGRESS_MAP_CKSUMV4)
 		skb_trim(skb,
@@ -78,9 +78,9 @@ static void rmnet_map_send_ack(struct sk_buff *skb,
 	cmd = RMNET_MAP_GET_CMD_START(skb);
 	cmd->cmd_type = type & 0x03;
 
-	netif_tx_lock(skb->dev);
-	xmit_status = skb->dev->netdev_ops->ndo_start_xmit(skb, skb->dev);
-	netif_tx_unlock(skb->dev);
+	netif_tx_lock(dev);
+	dev->netdev_ops->ndo_start_xmit(skb, dev);
+	netif_tx_unlock(dev);
 }
 
 /* Process MAP command frame and send N/ACK message as appropriate. Message cmd
-- 
cgit v1.2.3-59-g8ed1b


From 6f6027a52bfbb9bbf18937d2628eaf1137a5a386 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Tue, 5 Jun 2018 02:42:45 +0000
Subject: net/mlx5e: Make function mlx5e_change_rep_mtu() static

Fixes the following sparse warning:

drivers/net/ethernet/mellanox/mlx5/core/en_rep.c:903:5: warning:
 symbol 'mlx5e_change_rep_mtu' was not declared. Should it be static?

Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 3857f22b5500..57987f6546e8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -900,7 +900,7 @@ static const struct switchdev_ops mlx5e_rep_switchdev_ops = {
 	.switchdev_port_attr_get	= mlx5e_attr_get,
 };
 
-int mlx5e_change_rep_mtu(struct net_device *netdev, int new_mtu)
+static int mlx5e_change_rep_mtu(struct net_device *netdev, int new_mtu)
 {
 	return mlx5e_change_mtu(netdev, new_mtu, NULL);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 47a6ca3f972f5baf06cd0c9f37d7b5268ade5363 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Tue, 5 Jun 2018 02:42:56 +0000
Subject: net/mlx5e: fix error return code in mlx5e_alloc_rq()

Fix to return error code -ENOMEM from the kvzalloc_node() error handling
case instead of 0, as done elsewhere in this function.

Fixes: 069d11465a80 ("net/mlx5e: RX, Enhance legacy Receive Queue memory scheme")
Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 333d4ed52b94..89c96a0f708e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -566,8 +566,10 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 			kvzalloc_node((wq_sz << rq->wqe.info.log_num_frags) *
 				      sizeof(*rq->wqe.frags),
 				      GFP_KERNEL, cpu_to_node(c->cpu));
-		if (!rq->wqe.frags)
+		if (!rq->wqe.frags) {
+			err = -ENOMEM;
 			goto err_free;
+		}
 
 		err = mlx5e_init_di_list(rq, params, wq_sz, c->cpu);
 		if (err)
-- 
cgit v1.2.3-59-g8ed1b


From 819dd92b9c0bc7bce9097d8c1f14240f471bb386 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 4 Jun 2018 19:53:41 -0700
Subject: bpfilter: switch to CC from HOSTCC

check that CC can build executables and use that compiler instead of HOSTCC

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Makefile               |  5 +++++
 net/Makefile           |  4 ++++
 net/bpfilter/Makefile  |  2 ++
 scripts/cc-can-link.sh | 11 +++++++++++
 4 files changed, 22 insertions(+)
 create mode 100755 scripts/cc-can-link.sh

diff --git a/Makefile b/Makefile
index 56ba070dfa09..62c110084fae 100644
--- a/Makefile
+++ b/Makefile
@@ -510,6 +510,11 @@ ifeq ($(call shell-cached,$(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC) $
   KBUILD_AFLAGS += -DCC_HAVE_ASM_GOTO
 endif
 
+ifeq ($(call shell-cached,$(CONFIG_SHELL) $(srctree)/scripts/cc-can-link.sh $(CC)), y)
+  CC_CAN_LINK := y
+  export CC_CAN_LINK
+endif
+
 ifeq ($(config-targets),1)
 # ===========================================================================
 # *config targets only - make sure prerequisites are updated, and descend
diff --git a/net/Makefile b/net/Makefile
index bdaf53925acd..13ec0d5415c7 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -20,7 +20,11 @@ obj-$(CONFIG_TLS)		+= tls/
 obj-$(CONFIG_XFRM)		+= xfrm/
 obj-$(CONFIG_UNIX)		+= unix/
 obj-$(CONFIG_NET)		+= ipv6/
+ifneq ($(CC_CAN_LINK),y)
+$(warning CC cannot link executables. Skipping bpfilter.)
+else
 obj-$(CONFIG_BPFILTER)		+= bpfilter/
+endif
 obj-$(CONFIG_PACKET)		+= packet/
 obj-$(CONFIG_NET_KEY)		+= key/
 obj-$(CONFIG_BRIDGE)		+= bridge/
diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile
index 3f3cb87c668f..aafa72001fcd 100644
--- a/net/bpfilter/Makefile
+++ b/net/bpfilter/Makefile
@@ -6,6 +6,8 @@
 hostprogs-y := bpfilter_umh
 bpfilter_umh-objs := main.o
 HOSTCFLAGS += -I. -Itools/include/ -Itools/include/uapi
+HOSTCC := $(CC)
+
 ifeq ($(CONFIG_BPFILTER_UMH), y)
 # builtin bpfilter_umh should be compiled with -static
 # since rootfs isn't mounted at the time of __init
diff --git a/scripts/cc-can-link.sh b/scripts/cc-can-link.sh
new file mode 100755
index 000000000000..208eb2825dab
--- /dev/null
+++ b/scripts/cc-can-link.sh
@@ -0,0 +1,11 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+cat << "END" | $@ -x c - -o /dev/null >/dev/null 2>&1 && echo "y"
+#include <stdio.h>
+int main(void)
+{
+	printf("");
+	return 0;
+}
+END
-- 
cgit v1.2.3-59-g8ed1b


From 1d88ba1ebb2763aa86172cd7ca05dedbeccc0d35 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 5 Jun 2018 12:16:58 +0800
Subject: sctp: not allow transport timeout value less than HZ/5 for hb_timer

syzbot reported a rcu_sched self-detected stall on CPU which is caused
by too small value set on rto_min with SCTP_RTOINFO sockopt. With this
value, hb_timer will get stuck there, as in its timer handler it starts
this timer again with this value, then goes to the timer handler again.

This problem is there since very beginning, and thanks to Eric for the
reproducer shared from a syzbot mail.

This patch fixes it by not allowing sctp_transport_timeout to return a
smaller value than HZ/5 for hb_timer, which is based on TCP's min rto.

Note that it doesn't fix this issue by limiting rto_min, as some users
are still using small rto and no proper value was found for it yet.

Reported-by: syzbot+3dcd59a1f907245f891f@syzkaller.appspotmail.com
Suggested-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/transport.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 4a95e260b674..445b7ef61677 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -637,7 +637,7 @@ unsigned long sctp_transport_timeout(struct sctp_transport *trans)
 	    trans->state != SCTP_PF)
 		timeout += trans->hbinterval;
 
-	return timeout;
+	return max_t(unsigned long, timeout, HZ / 5);
 }
 
 /* Reset transport variables to their initial values */
-- 
cgit v1.2.3-59-g8ed1b


From d96a43c66464cdf0b249fdf47b6dcd65b83af8c0 Mon Sep 17 00:00:00 2001
From: Paul Blakey <paulb@mellanox.com>
Date: Tue, 5 Jun 2018 11:04:03 +0300
Subject: net: sched: cls: Fix offloading when ingress dev is vxlan

When using a vxlan device as the ingress dev, we count it as a
"no offload dev", so when such a rule comes and err stop is true,
we fail early and don't try the egdev route which can offload it
through the egress device.

Fix that by not calling the block offload if one of the devices
attached to it is not offload capable, but make sure egress on such case
is capable instead.

Fixes: caa7260156eb ("net: sched: keep track of offloaded filters [..]")
Reviewed-by: Roi Dayan <roid@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Paul Blakey <paulb@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index cdc3c87c53e6..29fb4d68a144 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -807,10 +807,6 @@ static int tcf_block_cb_call(struct tcf_block *block, enum tc_setup_type type,
 	int ok_count = 0;
 	int err;
 
-	/* Make sure all netdevs sharing this block are offload-capable. */
-	if (block->nooffloaddevcnt && err_stop)
-		return -EOPNOTSUPP;
-
 	list_for_each_entry(block_cb, &block->cb_list, list) {
 		err = block_cb->cb(type, type_data, block_cb->cb_priv);
 		if (err) {
@@ -1729,21 +1725,31 @@ static int tc_exts_setup_cb_egdev_call(struct tcf_exts *exts,
 int tc_setup_cb_call(struct tcf_block *block, struct tcf_exts *exts,
 		     enum tc_setup_type type, void *type_data, bool err_stop)
 {
-	int ok_count;
+	int ok_count = 0;
 	int ret;
 
-	ret = tcf_block_cb_call(block, type, type_data, err_stop);
-	if (ret < 0)
-		return ret;
-	ok_count = ret;
+	if (!block->nooffloaddevcnt) {
+		ret = tcf_block_cb_call(block, type, type_data, err_stop);
+		if (ret < 0)
+			return ret;
+		ok_count = ret;
+	}
 
 	if (!exts || ok_count)
-		return ok_count;
+		goto skip_egress;
+
 	ret = tc_exts_setup_cb_egdev_call(exts, type, type_data, err_stop);
 	if (ret < 0)
 		return ret;
 	ok_count += ret;
 
+skip_egress:
+	/* if one of the netdevs sharing this block are not offload-capable
+	 * make sure we succeeded in egress instead.
+	 */
+	if (block->nooffloaddevcnt && !ok_count && err_stop)
+		return -EOPNOTSUPP;
+
 	return ok_count;
 }
 EXPORT_SYMBOL(tc_setup_cb_call);
-- 
cgit v1.2.3-59-g8ed1b


From 95358a9553fbec6c47ad7bd1aec20df663295088 Mon Sep 17 00:00:00 2001
From: Maciej Żenczykowski <maze@google.com>
Date: Tue, 5 Jun 2018 03:07:23 -0700
Subject: net-tcp: remove useless tw_timeout field
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tested: 'git grep tw_timeout' comes up empty and it builds :-)

Signed-off-by: Maciej Żenczykowski <maze@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_timewait_sock.h | 1 -
 net/dccp/minisocks.c             | 1 -
 net/ipv4/tcp_minisocks.c         | 1 -
 3 files changed, 3 deletions(-)

diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 659d8ed5a3bc..78775038f011 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -61,7 +61,6 @@ struct inet_timewait_sock {
 #define tw_cookie		__tw_common.skc_cookie
 #define tw_dr			__tw_common.skc_tw_dr
 
-	int			tw_timeout;
 	__u32			tw_mark;
 	volatile unsigned char	tw_substate;
 	unsigned char		tw_rcv_wscale;
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 37ccbe62eb1a..ba6fc3c1186b 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -53,7 +53,6 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
 		if (timeo < rto)
 			timeo = rto;
 
-		tw->tw_timeout = DCCP_TIMEWAIT_LEN;
 		if (state == DCCP_TIME_WAIT)
 			timeo = DCCP_TIMEWAIT_LEN;
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f867658b4b30..1dda1341a223 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -307,7 +307,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 		if (timeo < rto)
 			timeo = rto;
 
-		tw->tw_timeout = TCP_TIMEWAIT_LEN;
 		if (state == TCP_TIME_WAIT)
 			timeo = TCP_TIMEWAIT_LEN;
 
-- 
cgit v1.2.3-59-g8ed1b


From d52c89f120de849575f6b2e5948038f2be12ce6f Mon Sep 17 00:00:00 2001
From: Michal Kalderon <Michal.Kalderon@cavium.com>
Date: Tue, 5 Jun 2018 13:11:16 +0300
Subject: qed*: Utilize FW 8.37.2.0

This FW contains several fixes and features.

RDMA
- Several modifications and fixes for Memory Windows
- drop vlan and tcp timestamp from mss calculation in driver for
  this FW
- Fix SQ completion flow when local ack timeout is infinite
- Modifications in t10dif support

ETH
- Fix aRFS for tunneled traffic without inner IP.
- Fix chip configuration which may fail under heavy traffic conditions.
- Support receiving any-VNI in VXLAN and GENEVE RX classification.

iSCSI / FcoE
- Fix iSCSI recovery flow
- Drop vlan and tcp timestamp from mss calc for fw 8.37.2.0

Misc
- Several registers (split registers) won't read correctly with
  ethtool -d

Signed-off-by: Ariel Elior <Ariel.Elior@cavium.com>
Signed-off-by: Manish Rangankar <manish.rangankar@cavium.com>
Signed-off-by: Michal Kalderon <Michal.Kalderon@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/qedr/qedr_hsi_rdma.h         | 139 +++---
 drivers/infiniband/hw/qedr/verbs.c                 |   4 +-
 drivers/net/ethernet/qlogic/qed/qed_debug.c        | 492 ++++++++++++---------
 drivers/net/ethernet/qlogic/qed/qed_dev.c          |   2 +-
 drivers/net/ethernet/qlogic/qed/qed_hsi.h          | 462 ++++++++++++-------
 drivers/net/ethernet/qlogic/qed/qed_hw.c           |  20 +
 drivers/net/ethernet/qlogic/qed/qed_hw.h           |  12 +
 .../net/ethernet/qlogic/qed/qed_init_fw_funcs.c    |  50 ++-
 drivers/net/ethernet/qlogic/qed/qed_iwarp.c        |  13 +-
 drivers/net/ethernet/qlogic/qed/qed_l2.c           |   3 +
 drivers/net/ethernet/qlogic/qed/qed_l2.h           |   1 +
 drivers/net/ethernet/qlogic/qed/qed_rdma.c         |   8 +-
 drivers/net/ethernet/qlogic/qed/qed_reg_addr.h     |   3 +-
 drivers/net/ethernet/qlogic/qed/qed_roce.c         |  31 +-
 drivers/scsi/qedi/qedi_iscsi.c                     |   6 -
 include/linux/qed/common_hsi.h                     |   4 +-
 include/linux/qed/iscsi_common.h                   |   8 +-
 include/linux/qed/qed_rdma_if.h                    |   4 +-
 include/linux/qed/roce_common.h                    |   1 +
 19 files changed, 727 insertions(+), 536 deletions(-)

diff --git a/drivers/infiniband/hw/qedr/qedr_hsi_rdma.h b/drivers/infiniband/hw/qedr/qedr_hsi_rdma.h
index b816c80df50b..7e1f7021396a 100644
--- a/drivers/infiniband/hw/qedr/qedr_hsi_rdma.h
+++ b/drivers/infiniband/hw/qedr/qedr_hsi_rdma.h
@@ -116,6 +116,7 @@ enum rdma_cqe_requester_status_enum {
 	RDMA_CQE_REQ_STS_TRANSPORT_RETRY_CNT_ERR,
 	RDMA_CQE_REQ_STS_WORK_REQUEST_FLUSHED_ERR,
 	RDMA_CQE_REQ_STS_XRC_VOILATION_ERR,
+	RDMA_CQE_REQ_STS_SIG_ERR,
 	MAX_RDMA_CQE_REQUESTER_STATUS_ENUM
 };
 
@@ -152,12 +153,12 @@ struct rdma_rq_sge {
 	struct regpair addr;
 	__le32 length;
 	__le32 flags;
-#define RDMA_RQ_SGE_L_KEY_MASK      0x3FFFFFF
-#define RDMA_RQ_SGE_L_KEY_SHIFT     0
+#define RDMA_RQ_SGE_L_KEY_LO_MASK   0x3FFFFFF
+#define RDMA_RQ_SGE_L_KEY_LO_SHIFT  0
 #define RDMA_RQ_SGE_NUM_SGES_MASK   0x7
 #define RDMA_RQ_SGE_NUM_SGES_SHIFT  26
-#define RDMA_RQ_SGE_RESERVED0_MASK  0x7
-#define RDMA_RQ_SGE_RESERVED0_SHIFT 29
+#define RDMA_RQ_SGE_L_KEY_HI_MASK   0x7
+#define RDMA_RQ_SGE_L_KEY_HI_SHIFT  29
 };
 
 struct rdma_srq_sge {
@@ -241,18 +242,39 @@ enum rdma_dif_io_direction_flg {
 	MAX_RDMA_DIF_IO_DIRECTION_FLG
 };
 
-/* RDMA DIF Runt Result Structure */
-struct rdma_dif_runt_result {
-	__le16 guard_tag;
-	__le16 reserved[3];
+struct rdma_dif_params {
+	__le32 base_ref_tag;
+	__le16 app_tag;
+	__le16 app_tag_mask;
+	__le16 runt_crc_value;
+	__le16 flags;
+#define RDMA_DIF_PARAMS_IO_DIRECTION_FLG_MASK    0x1
+#define RDMA_DIF_PARAMS_IO_DIRECTION_FLG_SHIFT   0
+#define RDMA_DIF_PARAMS_BLOCK_SIZE_MASK          0x1
+#define RDMA_DIF_PARAMS_BLOCK_SIZE_SHIFT         1
+#define RDMA_DIF_PARAMS_RUNT_VALID_FLG_MASK      0x1
+#define RDMA_DIF_PARAMS_RUNT_VALID_FLG_SHIFT     2
+#define RDMA_DIF_PARAMS_VALIDATE_CRC_GUARD_MASK  0x1
+#define RDMA_DIF_PARAMS_VALIDATE_CRC_GUARD_SHIFT 3
+#define RDMA_DIF_PARAMS_VALIDATE_REF_TAG_MASK    0x1
+#define RDMA_DIF_PARAMS_VALIDATE_REF_TAG_SHIFT   4
+#define RDMA_DIF_PARAMS_VALIDATE_APP_TAG_MASK    0x1
+#define RDMA_DIF_PARAMS_VALIDATE_APP_TAG_SHIFT   5
+#define RDMA_DIF_PARAMS_CRC_SEED_MASK            0x1
+#define RDMA_DIF_PARAMS_CRC_SEED_SHIFT           6
+#define RDMA_DIF_PARAMS_RX_REF_TAG_CONST_MASK    0x1
+#define RDMA_DIF_PARAMS_RX_REF_TAG_CONST_SHIFT   7
+#define RDMA_DIF_PARAMS_BLOCK_GUARD_TYPE_MASK    0x1
+#define RDMA_DIF_PARAMS_BLOCK_GUARD_TYPE_SHIFT   8
+#define RDMA_DIF_PARAMS_APP_ESCAPE_MASK          0x1
+#define RDMA_DIF_PARAMS_APP_ESCAPE_SHIFT         9
+#define RDMA_DIF_PARAMS_REF_ESCAPE_MASK          0x1
+#define RDMA_DIF_PARAMS_REF_ESCAPE_SHIFT         10
+#define RDMA_DIF_PARAMS_RESERVED4_MASK           0x1F
+#define RDMA_DIF_PARAMS_RESERVED4_SHIFT          11
+	__le32 reserved5;
 };
 
-/* Memory window type enumeration */
-enum rdma_mw_type {
-	RDMA_MW_TYPE_1,
-	RDMA_MW_TYPE_2A,
-	MAX_RDMA_MW_TYPE
-};
 
 struct rdma_sq_atomic_wqe {
 	__le32 reserved1;
@@ -334,17 +356,17 @@ struct rdma_sq_bind_wqe {
 #define RDMA_SQ_BIND_WQE_SE_FLG_SHIFT        3
 #define RDMA_SQ_BIND_WQE_INLINE_FLG_MASK     0x1
 #define RDMA_SQ_BIND_WQE_INLINE_FLG_SHIFT    4
-#define RDMA_SQ_BIND_WQE_RESERVED0_MASK      0x7
-#define RDMA_SQ_BIND_WQE_RESERVED0_SHIFT     5
+#define RDMA_SQ_BIND_WQE_DIF_ON_HOST_FLG_MASK  0x1
+#define RDMA_SQ_BIND_WQE_DIF_ON_HOST_FLG_SHIFT 5
+#define RDMA_SQ_BIND_WQE_RESERVED0_MASK      0x3
+#define RDMA_SQ_BIND_WQE_RESERVED0_SHIFT     6
 	u8 wqe_size;
 	u8 prev_wqe_size;
 	u8 bind_ctrl;
 #define RDMA_SQ_BIND_WQE_ZERO_BASED_MASK     0x1
 #define RDMA_SQ_BIND_WQE_ZERO_BASED_SHIFT    0
-#define RDMA_SQ_BIND_WQE_MW_TYPE_MASK        0x1
-#define RDMA_SQ_BIND_WQE_MW_TYPE_SHIFT       1
-#define RDMA_SQ_BIND_WQE_RESERVED1_MASK      0x3F
-#define RDMA_SQ_BIND_WQE_RESERVED1_SHIFT     2
+#define RDMA_SQ_BIND_WQE_RESERVED1_MASK        0x7F
+#define RDMA_SQ_BIND_WQE_RESERVED1_SHIFT       1
 	u8 access_ctrl;
 #define RDMA_SQ_BIND_WQE_REMOTE_READ_MASK    0x1
 #define RDMA_SQ_BIND_WQE_REMOTE_READ_SHIFT   0
@@ -363,6 +385,7 @@ struct rdma_sq_bind_wqe {
 	__le32 length_lo;
 	__le32 parent_l_key;
 	__le32 reserved4;
+	struct rdma_dif_params dif_params;
 };
 
 /* First element (16 bytes) of bind wqe */
@@ -392,10 +415,8 @@ struct rdma_sq_bind_wqe_2nd {
 	u8 bind_ctrl;
 #define RDMA_SQ_BIND_WQE_2ND_ZERO_BASED_MASK     0x1
 #define RDMA_SQ_BIND_WQE_2ND_ZERO_BASED_SHIFT    0
-#define RDMA_SQ_BIND_WQE_2ND_MW_TYPE_MASK        0x1
-#define RDMA_SQ_BIND_WQE_2ND_MW_TYPE_SHIFT       1
-#define RDMA_SQ_BIND_WQE_2ND_RESERVED1_MASK      0x3F
-#define RDMA_SQ_BIND_WQE_2ND_RESERVED1_SHIFT     2
+#define RDMA_SQ_BIND_WQE_2ND_RESERVED1_MASK      0x7F
+#define RDMA_SQ_BIND_WQE_2ND_RESERVED1_SHIFT     1
 	u8 access_ctrl;
 #define RDMA_SQ_BIND_WQE_2ND_REMOTE_READ_MASK    0x1
 #define RDMA_SQ_BIND_WQE_2ND_REMOTE_READ_SHIFT   0
@@ -416,6 +437,11 @@ struct rdma_sq_bind_wqe_2nd {
 	__le32 reserved4;
 };
 
+/* Third element (16 bytes) of bind wqe */
+struct rdma_sq_bind_wqe_3rd {
+	struct rdma_dif_params dif_params;
+};
+
 /* Structure with only the SQ WQE common
  * fields. Size is of one SQ element (16B)
  */
@@ -486,30 +512,6 @@ struct rdma_sq_fmr_wqe {
 	u8 length_hi;
 	__le32 length_lo;
 	struct regpair pbl_addr;
-	__le32 dif_base_ref_tag;
-	__le16 dif_app_tag;
-	__le16 dif_app_tag_mask;
-	__le16 dif_runt_crc_value;
-	__le16 dif_flags;
-#define RDMA_SQ_FMR_WQE_DIF_IO_DIRECTION_FLG_MASK	0x1
-#define RDMA_SQ_FMR_WQE_DIF_IO_DIRECTION_FLG_SHIFT	0
-#define RDMA_SQ_FMR_WQE_DIF_BLOCK_SIZE_MASK		0x1
-#define RDMA_SQ_FMR_WQE_DIF_BLOCK_SIZE_SHIFT		1
-#define RDMA_SQ_FMR_WQE_DIF_RUNT_VALID_FLG_MASK		0x1
-#define RDMA_SQ_FMR_WQE_DIF_RUNT_VALID_FLG_SHIFT	2
-#define RDMA_SQ_FMR_WQE_DIF_VALIDATE_CRC_GUARD_MASK	0x1
-#define RDMA_SQ_FMR_WQE_DIF_VALIDATE_CRC_GUARD_SHIFT	3
-#define RDMA_SQ_FMR_WQE_DIF_VALIDATE_REF_TAG_MASK	0x1
-#define RDMA_SQ_FMR_WQE_DIF_VALIDATE_REF_TAG_SHIFT	4
-#define RDMA_SQ_FMR_WQE_DIF_VALIDATE_APP_TAG_MASK	0x1
-#define RDMA_SQ_FMR_WQE_DIF_VALIDATE_APP_TAG_SHIFT	5
-#define RDMA_SQ_FMR_WQE_DIF_CRC_SEED_MASK		0x1
-#define RDMA_SQ_FMR_WQE_DIF_CRC_SEED_SHIFT		6
-#define RDMA_SQ_FMR_WQE_DIF_RX_REF_TAG_CONST_MASK	0x1
-#define RDMA_SQ_FMR_WQE_DIF_RX_REF_TAG_CONST_SHIFT	7
-#define RDMA_SQ_FMR_WQE_RESERVED4_MASK			0xFF
-#define RDMA_SQ_FMR_WQE_RESERVED4_SHIFT			8
-	__le32 reserved5;
 };
 
 /* First element (16 bytes) of fmr wqe */
@@ -566,33 +568,6 @@ struct rdma_sq_fmr_wqe_2nd {
 	struct regpair pbl_addr;
 };
 
-/* Third element (16 bytes) of fmr wqe */
-struct rdma_sq_fmr_wqe_3rd {
-	__le32 dif_base_ref_tag;
-	__le16 dif_app_tag;
-	__le16 dif_app_tag_mask;
-	__le16 dif_runt_crc_value;
-	__le16 dif_flags;
-#define RDMA_SQ_FMR_WQE_3RD_DIF_IO_DIRECTION_FLG_MASK		0x1
-#define RDMA_SQ_FMR_WQE_3RD_DIF_IO_DIRECTION_FLG_SHIFT		0
-#define RDMA_SQ_FMR_WQE_3RD_DIF_BLOCK_SIZE_MASK			0x1
-#define RDMA_SQ_FMR_WQE_3RD_DIF_BLOCK_SIZE_SHIFT		1
-#define RDMA_SQ_FMR_WQE_3RD_DIF_RUNT_VALID_FLG_MASK		0x1
-#define RDMA_SQ_FMR_WQE_3RD_DIF_RUNT_VALID_FLG_SHIFT		2
-#define RDMA_SQ_FMR_WQE_3RD_DIF_VALIDATE_CRC_GUARD_MASK		0x1
-#define RDMA_SQ_FMR_WQE_3RD_DIF_VALIDATE_CRC_GUARD_SHIFT	3
-#define RDMA_SQ_FMR_WQE_3RD_DIF_VALIDATE_REF_TAG_MASK		0x1
-#define RDMA_SQ_FMR_WQE_3RD_DIF_VALIDATE_REF_TAG_SHIFT		4
-#define RDMA_SQ_FMR_WQE_3RD_DIF_VALIDATE_APP_TAG_MASK		0x1
-#define RDMA_SQ_FMR_WQE_3RD_DIF_VALIDATE_APP_TAG_SHIFT		5
-#define RDMA_SQ_FMR_WQE_3RD_DIF_CRC_SEED_MASK			0x1
-#define RDMA_SQ_FMR_WQE_3RD_DIF_CRC_SEED_SHIFT			6
-#define RDMA_SQ_FMR_WQE_3RD_DIF_RX_REF_TAG_CONST_MASK		0x1
-#define RDMA_SQ_FMR_WQE_3RD_DIF_RX_REF_TAG_CONST_SHIFT		7
-#define RDMA_SQ_FMR_WQE_3RD_RESERVED4_MASK			0xFF
-#define RDMA_SQ_FMR_WQE_RESERVED4_SHIFT				8
-	__le32 reserved5;
-};
 
 struct rdma_sq_local_inv_wqe {
 	struct regpair reserved;
@@ -637,8 +612,8 @@ struct rdma_sq_rdma_wqe {
 #define RDMA_SQ_RDMA_WQE_DIF_ON_HOST_FLG_SHIFT	5
 #define RDMA_SQ_RDMA_WQE_READ_INV_FLG_MASK	0x1
 #define RDMA_SQ_RDMA_WQE_READ_INV_FLG_SHIFT	6
-#define RDMA_SQ_RDMA_WQE_RESERVED0_MASK		0x1
-#define RDMA_SQ_RDMA_WQE_RESERVED0_SHIFT	7
+#define RDMA_SQ_RDMA_WQE_RESERVED1_MASK        0x1
+#define RDMA_SQ_RDMA_WQE_RESERVED1_SHIFT       7
 	u8 wqe_size;
 	u8 prev_wqe_size;
 	struct regpair remote_va;
@@ -646,13 +621,9 @@ struct rdma_sq_rdma_wqe {
 	u8 dif_flags;
 #define RDMA_SQ_RDMA_WQE_DIF_BLOCK_SIZE_MASK            0x1
 #define RDMA_SQ_RDMA_WQE_DIF_BLOCK_SIZE_SHIFT           0
-#define RDMA_SQ_RDMA_WQE_DIF_FIRST_RDMA_IN_IO_FLG_MASK  0x1
-#define RDMA_SQ_RDMA_WQE_DIF_FIRST_RDMA_IN_IO_FLG_SHIFT 1
-#define RDMA_SQ_RDMA_WQE_DIF_LAST_RDMA_IN_IO_FLG_MASK   0x1
-#define RDMA_SQ_RDMA_WQE_DIF_LAST_RDMA_IN_IO_FLG_SHIFT  2
-#define RDMA_SQ_RDMA_WQE_RESERVED1_MASK                 0x1F
-#define RDMA_SQ_RDMA_WQE_RESERVED1_SHIFT                3
-	u8 reserved2[3];
+#define RDMA_SQ_RDMA_WQE_RESERVED2_MASK        0x7F
+#define RDMA_SQ_RDMA_WQE_RESERVED2_SHIFT       1
+	u8 reserved3[3];
 };
 
 /* First element (16 bytes) of rdma wqe */
diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
index 3f9afc02d166..e2caabb8a926 100644
--- a/drivers/infiniband/hw/qedr/verbs.c
+++ b/drivers/infiniband/hw/qedr/verbs.c
@@ -3276,7 +3276,7 @@ int qedr_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
 				SET_FIELD(flags, RDMA_RQ_SGE_NUM_SGES,
 					  wr->num_sge);
 
-			SET_FIELD(flags, RDMA_RQ_SGE_L_KEY,
+			SET_FIELD(flags, RDMA_RQ_SGE_L_KEY_LO,
 				  wr->sg_list[i].lkey);
 
 			RQ_SGE_SET(rqe, wr->sg_list[i].addr,
@@ -3295,7 +3295,7 @@ int qedr_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
 			/* First one must include the number
 			 * of SGE in the list
 			 */
-			SET_FIELD(flags, RDMA_RQ_SGE_L_KEY, 0);
+			SET_FIELD(flags, RDMA_RQ_SGE_L_KEY_LO, 0);
 			SET_FIELD(flags, RDMA_RQ_SGE_NUM_SGES, 1);
 
 			RQ_SGE_SET(rqe, 0, 0, flags);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.c b/drivers/net/ethernet/qlogic/qed/qed_debug.c
index 39124b594a36..b9ec460dd996 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_debug.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_debug.c
@@ -183,16 +183,9 @@ enum platform_ids {
 	MAX_PLATFORM_IDS
 };
 
-struct chip_platform_defs {
-	u8 num_ports;
-	u8 num_pfs;
-	u8 num_vfs;
-};
-
 /* Chip constant definitions */
 struct chip_defs {
 	const char *name;
-	struct chip_platform_defs per_platform[MAX_PLATFORM_IDS];
 };
 
 /* Platform constant definitions */
@@ -317,6 +310,11 @@ struct phy_defs {
 	u32 tbus_data_hi_addr;
 };
 
+/* Split type definitions */
+struct split_type_defs {
+	const char *name;
+};
+
 /******************************** Constants **********************************/
 
 #define MAX_LCIDS			320
@@ -469,21 +467,9 @@ static struct dbg_array s_dbg_arrays[MAX_BIN_DBG_BUFFER_TYPE] = { {NULL} };
 
 /* Chip constant definitions array */
 static struct chip_defs s_chip_defs[MAX_CHIP_IDS] = {
-	{ "bb",
-	  {{MAX_NUM_PORTS_BB, MAX_NUM_PFS_BB, MAX_NUM_VFS_BB},
-	   {0, 0, 0},
-	   {0, 0, 0},
-	   {0, 0, 0} } },
-	{ "ah",
-	  {{MAX_NUM_PORTS_K2, MAX_NUM_PFS_K2, MAX_NUM_VFS_K2},
-	   {0, 0, 0},
-	   {0, 0, 0},
-	   {0, 0, 0} } },
-	{ "reserved",
-	   {{0, 0, 0},
-	   {0, 0, 0},
-	   {0, 0, 0},
-	   {0, 0, 0} } }
+	{"bb"},
+	{"ah"},
+	{"reserved"},
 };
 
 /* Storm constant definitions array */
@@ -1588,7 +1574,7 @@ static struct grc_param_defs s_grc_param_defs[] = {
 	{{0, 0, 0}, 0, 1, false, false, 0, 1},
 
 	/* DBG_GRC_PARAM_DUMP_BMB */
-	{{0, 0, 0}, 0, 1, false, false, 0, 1},
+	{{0, 0, 0}, 0, 1, false, false, 0, 0},
 
 	/* DBG_GRC_PARAM_DUMP_NIG */
 	{{1, 1, 1}, 0, 1, false, false, 0, 1},
@@ -1745,6 +1731,23 @@ static struct phy_defs s_phy_defs[] = {
 	 PHY_PCIE_IP_REG_AHB_CMU_CSR_0_X131_K2_E5},
 };
 
+static struct split_type_defs s_split_type_defs[] = {
+	/* SPLIT_TYPE_NONE */
+	{"eng"},
+
+	/* SPLIT_TYPE_PORT */
+	{"port"},
+
+	/* SPLIT_TYPE_PF */
+	{"pf"},
+
+	/* SPLIT_TYPE_PORT_PF */
+	{"port"},
+
+	/* SPLIT_TYPE_VF */
+	{"vf"}
+};
+
 /**************************** Private Functions ******************************/
 
 /* Reads and returns a single dword from the specified unaligned buffer */
@@ -1781,28 +1784,68 @@ static enum dbg_status qed_dbg_dev_init(struct qed_hwfn *p_hwfn,
 					struct qed_ptt *p_ptt)
 {
 	struct dbg_tools_data *dev_data = &p_hwfn->dbg_info;
+	u8 num_pfs = 0, max_pfs_per_port = 0;
 
 	if (dev_data->initialized)
 		return DBG_STATUS_OK;
 
+	/* Set chip */
 	if (QED_IS_K2(p_hwfn->cdev)) {
 		dev_data->chip_id = CHIP_K2;
 		dev_data->mode_enable[MODE_K2] = 1;
+		dev_data->num_vfs = MAX_NUM_VFS_K2;
+		num_pfs = MAX_NUM_PFS_K2;
+		max_pfs_per_port = MAX_NUM_PFS_K2 / 2;
 	} else if (QED_IS_BB_B0(p_hwfn->cdev)) {
 		dev_data->chip_id = CHIP_BB;
 		dev_data->mode_enable[MODE_BB] = 1;
+		dev_data->num_vfs = MAX_NUM_VFS_BB;
+		num_pfs = MAX_NUM_PFS_BB;
+		max_pfs_per_port = MAX_NUM_PFS_BB;
 	} else {
 		return DBG_STATUS_UNKNOWN_CHIP;
 	}
 
+	/* Set platofrm */
 	dev_data->platform_id = PLATFORM_ASIC;
 	dev_data->mode_enable[MODE_ASIC] = 1;
 
+	/* Set port mode */
+	switch (qed_rd(p_hwfn, p_ptt, MISC_REG_PORT_MODE)) {
+	case 0:
+		dev_data->mode_enable[MODE_PORTS_PER_ENG_1] = 1;
+		break;
+	case 1:
+		dev_data->mode_enable[MODE_PORTS_PER_ENG_2] = 1;
+		break;
+	case 2:
+		dev_data->mode_enable[MODE_PORTS_PER_ENG_4] = 1;
+		break;
+	}
+
+	/* Set 100G mode */
+	if (dev_data->chip_id == CHIP_BB &&
+	    qed_rd(p_hwfn, p_ptt, CNIG_REG_NW_PORT_MODE_BB) == 2)
+		dev_data->mode_enable[MODE_100G] = 1;
+
+	/* Set number of ports */
+	if (dev_data->mode_enable[MODE_PORTS_PER_ENG_1] ||
+	    dev_data->mode_enable[MODE_100G])
+		dev_data->num_ports = 1;
+	else if (dev_data->mode_enable[MODE_PORTS_PER_ENG_2])
+		dev_data->num_ports = 2;
+	else if (dev_data->mode_enable[MODE_PORTS_PER_ENG_4])
+		dev_data->num_ports = 4;
+
+	/* Set number of PFs per port */
+	dev_data->num_pfs_per_port = min_t(u32,
+					   num_pfs / dev_data->num_ports,
+					   max_pfs_per_port);
+
 	/* Initializes the GRC parameters */
 	qed_dbg_grc_init_params(p_hwfn);
 
 	dev_data->use_dmae = true;
-	dev_data->num_regs_read = 0;
 	dev_data->initialized = 1;
 
 	return DBG_STATUS_OK;
@@ -1821,9 +1864,9 @@ static struct dbg_bus_block *get_dbg_bus_block_desc(struct qed_hwfn *p_hwfn,
 /* Reads the FW info structure for the specified Storm from the chip,
  * and writes it to the specified fw_info pointer.
  */
-static void qed_read_fw_info(struct qed_hwfn *p_hwfn,
-			     struct qed_ptt *p_ptt,
-			     u8 storm_id, struct fw_info *fw_info)
+static void qed_read_storm_fw_info(struct qed_hwfn *p_hwfn,
+				   struct qed_ptt *p_ptt,
+				   u8 storm_id, struct fw_info *fw_info)
 {
 	struct storm_defs *storm = &s_storm_defs[storm_id];
 	struct fw_info_location fw_info_location;
@@ -1945,45 +1988,29 @@ static u32 qed_dump_fw_ver_param(struct qed_hwfn *p_hwfn,
 				 struct qed_ptt *p_ptt,
 				 u32 *dump_buf, bool dump)
 {
-	struct dbg_tools_data *dev_data = &p_hwfn->dbg_info;
 	char fw_ver_str[16] = EMPTY_FW_VERSION_STR;
 	char fw_img_str[16] = EMPTY_FW_IMAGE_STR;
 	struct fw_info fw_info = { {0}, {0} };
 	u32 offset = 0;
 
 	if (dump && !qed_grc_get_param(p_hwfn, DBG_GRC_PARAM_NO_FW_VER)) {
-		/* Read FW image/version from PRAM in a non-reset SEMI */
-		bool found = false;
-		u8 storm_id;
-
-		for (storm_id = 0; storm_id < MAX_DBG_STORMS && !found;
-		     storm_id++) {
-			struct storm_defs *storm = &s_storm_defs[storm_id];
-
-			/* Read FW version/image */
-			if (dev_data->block_in_reset[storm->block_id])
-				continue;
-
-			/* Read FW info for the current Storm */
-			qed_read_fw_info(p_hwfn, p_ptt, storm_id, &fw_info);
-
-			/* Create FW version/image strings */
-			if (snprintf(fw_ver_str, sizeof(fw_ver_str),
-				     "%d_%d_%d_%d", fw_info.ver.num.major,
-				     fw_info.ver.num.minor, fw_info.ver.num.rev,
-				     fw_info.ver.num.eng) < 0)
-				DP_NOTICE(p_hwfn,
-					  "Unexpected debug error: invalid FW version string\n");
-			switch (fw_info.ver.image_id) {
-			case FW_IMG_MAIN:
-				strcpy(fw_img_str, "main");
-				break;
-			default:
-				strcpy(fw_img_str, "unknown");
-				break;
-			}
-
-			found = true;
+		/* Read FW info from chip */
+		qed_read_fw_info(p_hwfn, p_ptt, &fw_info);
+
+		/* Create FW version/image strings */
+		if (snprintf(fw_ver_str, sizeof(fw_ver_str),
+			     "%d_%d_%d_%d", fw_info.ver.num.major,
+			     fw_info.ver.num.minor, fw_info.ver.num.rev,
+			     fw_info.ver.num.eng) < 0)
+			DP_NOTICE(p_hwfn,
+				  "Unexpected debug error: invalid FW version string\n");
+		switch (fw_info.ver.image_id) {
+		case FW_IMG_MAIN:
+			strcpy(fw_img_str, "main");
+			break;
+		default:
+			strcpy(fw_img_str, "unknown");
+			break;
 		}
 	}
 
@@ -2412,20 +2439,21 @@ static void qed_grc_clear_all_prty(struct qed_hwfn *p_hwfn,
 
 /* Dumps GRC registers section header. Returns the dumped size in dwords.
  * The following parameters are dumped:
- * - count:	 no. of dumped entries
- * - split:	 split type
- * - id:	 split ID (dumped only if split_id >= 0)
+ * - count: no. of dumped entries
+ * - split_type: split type
+ * - split_id: split ID (dumped only if split_id != SPLIT_TYPE_NONE)
  * - param_name: user parameter value (dumped only if param_name != NULL
  *		 and param_val != NULL).
  */
 static u32 qed_grc_dump_regs_hdr(u32 *dump_buf,
 				 bool dump,
 				 u32 num_reg_entries,
-				 const char *split_type,
-				 int split_id,
+				 enum init_split_types split_type,
+				 u8 split_id,
 				 const char *param_name, const char *param_val)
 {
-	u8 num_params = 2 + (split_id >= 0 ? 1 : 0) + (param_name ? 1 : 0);
+	u8 num_params = 2 +
+	    (split_type != SPLIT_TYPE_NONE ? 1 : 0) + (param_name ? 1 : 0);
 	u32 offset = 0;
 
 	offset += qed_dump_section_hdr(dump_buf + offset,
@@ -2433,8 +2461,9 @@ static u32 qed_grc_dump_regs_hdr(u32 *dump_buf,
 	offset += qed_dump_num_param(dump_buf + offset,
 				     dump, "count", num_reg_entries);
 	offset += qed_dump_str_param(dump_buf + offset,
-				     dump, "split", split_type);
-	if (split_id >= 0)
+				     dump, "split",
+				     s_split_type_defs[split_type].name);
+	if (split_type != SPLIT_TYPE_NONE)
 		offset += qed_dump_num_param(dump_buf + offset,
 					     dump, "id", split_id);
 	if (param_name && param_val)
@@ -2463,9 +2492,12 @@ void qed_read_regs(struct qed_hwfn *p_hwfn,
 static u32 qed_grc_dump_addr_range(struct qed_hwfn *p_hwfn,
 				   struct qed_ptt *p_ptt,
 				   u32 *dump_buf,
-				   bool dump, u32 addr, u32 len, bool wide_bus)
+				   bool dump, u32 addr, u32 len, bool wide_bus,
+				   enum init_split_types split_type,
+				   u8 split_id)
 {
 	struct dbg_tools_data *dev_data = &p_hwfn->dbg_info;
+	u8 port_id = 0, pf_id = 0, vf_id = 0, fid = 0;
 
 	if (!dump)
 		return len;
@@ -2481,8 +2513,27 @@ static u32 qed_grc_dump_addr_range(struct qed_hwfn *p_hwfn,
 		dev_data->num_regs_read = 0;
 	}
 
+	switch (split_type) {
+	case SPLIT_TYPE_PORT:
+		port_id = split_id;
+		break;
+	case SPLIT_TYPE_PF:
+		pf_id = split_id;
+		break;
+	case SPLIT_TYPE_PORT_PF:
+		port_id = split_id / dev_data->num_pfs_per_port;
+		pf_id = port_id + dev_data->num_ports *
+		    (split_id % dev_data->num_pfs_per_port);
+		break;
+	case SPLIT_TYPE_VF:
+		vf_id = split_id;
+		break;
+	default:
+		break;
+	}
+
 	/* Try reading using DMAE */
-	if (dev_data->use_dmae &&
+	if (dev_data->use_dmae && split_type == SPLIT_TYPE_NONE &&
 	    (len >= s_platform_defs[dev_data->platform_id].dmae_thresh ||
 	     wide_bus)) {
 		if (!qed_dmae_grc2host(p_hwfn, p_ptt, DWORDS_TO_BYTES(addr),
@@ -2494,7 +2545,37 @@ static u32 qed_grc_dump_addr_range(struct qed_hwfn *p_hwfn,
 			   "Failed reading from chip using DMAE, using GRC instead\n");
 	}
 
-	/* Read registers */
+	/* If not read using DMAE, read using GRC */
+
+	/* Set pretend */
+	if (split_type != dev_data->pretend.split_type || split_id !=
+	    dev_data->pretend.split_id) {
+		switch (split_type) {
+		case SPLIT_TYPE_PORT:
+			qed_port_pretend(p_hwfn, p_ptt, port_id);
+			break;
+		case SPLIT_TYPE_PF:
+			fid = pf_id << PXP_PRETEND_CONCRETE_FID_PFID_SHIFT;
+			qed_fid_pretend(p_hwfn, p_ptt, fid);
+			break;
+		case SPLIT_TYPE_PORT_PF:
+			fid = pf_id << PXP_PRETEND_CONCRETE_FID_PFID_SHIFT;
+			qed_port_fid_pretend(p_hwfn, p_ptt, port_id, fid);
+			break;
+		case SPLIT_TYPE_VF:
+			fid = BIT(PXP_PRETEND_CONCRETE_FID_VFVALID_SHIFT) |
+			      (vf_id << PXP_PRETEND_CONCRETE_FID_VFID_SHIFT);
+			qed_fid_pretend(p_hwfn, p_ptt, fid);
+			break;
+		default:
+			break;
+		}
+
+		dev_data->pretend.split_type = (u8)split_type;
+		dev_data->pretend.split_id = split_id;
+	}
+
+	/* Read registers using GRC */
 	qed_read_regs(p_hwfn, p_ptt, dump_buf, addr, len);
 
 	return len;
@@ -2518,7 +2599,8 @@ static u32 qed_grc_dump_reg_entry_hdr(u32 *dump_buf,
 static u32 qed_grc_dump_reg_entry(struct qed_hwfn *p_hwfn,
 				  struct qed_ptt *p_ptt,
 				  u32 *dump_buf,
-				  bool dump, u32 addr, u32 len, bool wide_bus)
+				  bool dump, u32 addr, u32 len, bool wide_bus,
+				  enum init_split_types split_type, u8 split_id)
 {
 	u32 offset = 0;
 
@@ -2526,7 +2608,8 @@ static u32 qed_grc_dump_reg_entry(struct qed_hwfn *p_hwfn,
 	offset += qed_grc_dump_addr_range(p_hwfn,
 					  p_ptt,
 					  dump_buf + offset,
-					  dump, addr, len, wide_bus);
+					  dump, addr, len, wide_bus,
+					  split_type, split_id);
 
 	return offset;
 }
@@ -2559,7 +2642,8 @@ static u32 qed_grc_dump_reg_entry_skip(struct qed_hwfn *p_hwfn,
 		offset += qed_grc_dump_addr_range(p_hwfn,
 						  p_ptt,
 						  dump_buf + offset,
-						  dump, addr, curr_len, false);
+						  dump,  addr, curr_len, false,
+						  SPLIT_TYPE_NONE, 0);
 		reg_offset += curr_len;
 		addr += curr_len;
 
@@ -2581,6 +2665,8 @@ static u32 qed_grc_dump_regs_entries(struct qed_hwfn *p_hwfn,
 				     struct dbg_array input_regs_arr,
 				     u32 *dump_buf,
 				     bool dump,
+				     enum init_split_types split_type,
+				     u8 split_id,
 				     bool block_enable[MAX_BLOCK_ID],
 				     u32 *num_dumped_reg_entries)
 {
@@ -2628,7 +2714,8 @@ static u32 qed_grc_dump_regs_entries(struct qed_hwfn *p_hwfn,
 							 dump,
 							 addr,
 							 len,
-							 wide_bus);
+							 wide_bus,
+							 split_type, split_id);
 			(*num_dumped_reg_entries)++;
 		}
 	}
@@ -2643,19 +2730,28 @@ static u32 qed_grc_dump_split_data(struct qed_hwfn *p_hwfn,
 				   u32 *dump_buf,
 				   bool dump,
 				   bool block_enable[MAX_BLOCK_ID],
-				   const char *split_type_name,
-				   u32 split_id,
+				   enum init_split_types split_type,
+				   u8 split_id,
 				   const char *param_name,
 				   const char *param_val)
 {
+	struct dbg_tools_data *dev_data = &p_hwfn->dbg_info;
+	enum init_split_types hdr_split_type = split_type;
 	u32 num_dumped_reg_entries, offset;
+	u8 hdr_split_id = split_id;
+
+	/* In PORT_PF split type, print a port split header */
+	if (split_type == SPLIT_TYPE_PORT_PF) {
+		hdr_split_type = SPLIT_TYPE_PORT;
+		hdr_split_id = split_id / dev_data->num_pfs_per_port;
+	}
 
 	/* Calculate register dump header size (and skip it for now) */
 	offset = qed_grc_dump_regs_hdr(dump_buf,
 				       false,
 				       0,
-				       split_type_name,
-				       split_id, param_name, param_val);
+				       hdr_split_type,
+				       hdr_split_id, param_name, param_val);
 
 	/* Dump registers */
 	offset += qed_grc_dump_regs_entries(p_hwfn,
@@ -2663,6 +2759,8 @@ static u32 qed_grc_dump_split_data(struct qed_hwfn *p_hwfn,
 					    input_regs_arr,
 					    dump_buf + offset,
 					    dump,
+					    split_type,
+					    split_id,
 					    block_enable,
 					    &num_dumped_reg_entries);
 
@@ -2671,8 +2769,8 @@ static u32 qed_grc_dump_split_data(struct qed_hwfn *p_hwfn,
 		qed_grc_dump_regs_hdr(dump_buf,
 				      dump,
 				      num_dumped_reg_entries,
-				      split_type_name,
-				      split_id, param_name, param_val);
+				      hdr_split_type,
+				      hdr_split_id, param_name, param_val);
 
 	return num_dumped_reg_entries > 0 ? offset : 0;
 }
@@ -2688,26 +2786,21 @@ static u32 qed_grc_dump_registers(struct qed_hwfn *p_hwfn,
 				  const char *param_name, const char *param_val)
 {
 	struct dbg_tools_data *dev_data = &p_hwfn->dbg_info;
-	struct chip_platform_defs *chip_platform;
 	u32 offset = 0, input_offset = 0;
-	struct chip_defs *chip;
-	u8 port_id, pf_id, vf_id;
 	u16 fid;
-
-	chip = &s_chip_defs[dev_data->chip_id];
-	chip_platform = &chip->per_platform[dev_data->platform_id];
-
 	while (input_offset <
 	       s_dbg_arrays[BIN_BUF_DBG_DUMP_REG].size_in_dwords) {
 		const struct dbg_dump_split_hdr *split_hdr;
 		struct dbg_array curr_input_regs_arr;
+		enum init_split_types split_type;
+		u16 split_count = 0;
 		u32 split_data_size;
-		u8 split_type_id;
+		u8 split_id;
 
 		split_hdr =
 			(const struct dbg_dump_split_hdr *)
 			&s_dbg_arrays[BIN_BUF_DBG_DUMP_REG].ptr[input_offset++];
-		split_type_id =
+		split_type =
 			GET_FIELD(split_hdr->hdr,
 				  DBG_DUMP_SPLIT_HDR_SPLIT_TYPE_ID);
 		split_data_size =
@@ -2717,99 +2810,44 @@ static u32 qed_grc_dump_registers(struct qed_hwfn *p_hwfn,
 			&s_dbg_arrays[BIN_BUF_DBG_DUMP_REG].ptr[input_offset];
 		curr_input_regs_arr.size_in_dwords = split_data_size;
 
-		switch (split_type_id) {
+		switch (split_type) {
 		case SPLIT_TYPE_NONE:
-			offset += qed_grc_dump_split_data(p_hwfn,
-							  p_ptt,
-							  curr_input_regs_arr,
-							  dump_buf + offset,
-							  dump,
-							  block_enable,
-							  "eng",
-							  (u32)(-1),
-							  param_name,
-							  param_val);
+			split_count = 1;
 			break;
-
 		case SPLIT_TYPE_PORT:
-			for (port_id = 0; port_id < chip_platform->num_ports;
-			     port_id++) {
-				if (dump)
-					qed_port_pretend(p_hwfn, p_ptt,
-							 port_id);
-				offset +=
-				    qed_grc_dump_split_data(p_hwfn, p_ptt,
-							    curr_input_regs_arr,
-							    dump_buf + offset,
-							    dump, block_enable,
-							    "port", port_id,
-							    param_name,
-							    param_val);
-			}
+			split_count = dev_data->num_ports;
 			break;
-
 		case SPLIT_TYPE_PF:
 		case SPLIT_TYPE_PORT_PF:
-			for (pf_id = 0; pf_id < chip_platform->num_pfs;
-			     pf_id++) {
-				u8 pfid_shift =
-					PXP_PRETEND_CONCRETE_FID_PFID_SHIFT;
-
-				if (dump) {
-					fid = pf_id << pfid_shift;
-					qed_fid_pretend(p_hwfn, p_ptt, fid);
-				}
-
-				offset +=
-				    qed_grc_dump_split_data(p_hwfn,
-							    p_ptt,
-							    curr_input_regs_arr,
-							    dump_buf + offset,
-							    dump,
-							    block_enable,
-							    "pf",
-							    pf_id,
-							    param_name,
-							    param_val);
-			}
+			split_count = dev_data->num_ports *
+			    dev_data->num_pfs_per_port;
 			break;
-
 		case SPLIT_TYPE_VF:
-			for (vf_id = 0; vf_id < chip_platform->num_vfs;
-			     vf_id++) {
-				u8 vfvalid_shift =
-					PXP_PRETEND_CONCRETE_FID_VFVALID_SHIFT;
-				u8 vfid_shift =
-					PXP_PRETEND_CONCRETE_FID_VFID_SHIFT;
-
-				if (dump) {
-					fid = BIT(vfvalid_shift) |
-					      (vf_id << vfid_shift);
-					qed_fid_pretend(p_hwfn, p_ptt, fid);
-				}
-
-				offset +=
-				    qed_grc_dump_split_data(p_hwfn, p_ptt,
-							    curr_input_regs_arr,
-							    dump_buf + offset,
-							    dump, block_enable,
-							    "vf", vf_id,
-							    param_name,
-							    param_val);
-			}
+			split_count = dev_data->num_vfs;
 			break;
-
 		default:
-			break;
+			return 0;
 		}
 
+		for (split_id = 0; split_id < split_count; split_id++)
+			offset += qed_grc_dump_split_data(p_hwfn, p_ptt,
+							  curr_input_regs_arr,
+							  dump_buf + offset,
+							  dump, block_enable,
+							  split_type,
+							  split_id,
+							  param_name,
+							  param_val);
+
 		input_offset += split_data_size;
 	}
 
-	/* Pretend to original PF */
+	/* Cancel pretends (pretend to original PF) */
 	if (dump) {
 		fid = p_hwfn->rel_pf_id << PXP_PRETEND_CONCRETE_FID_PFID_SHIFT;
 		qed_fid_pretend(p_hwfn, p_ptt, fid);
+		dev_data->pretend.split_type = SPLIT_TYPE_NONE;
+		dev_data->pretend.split_id = 0;
 	}
 
 	return offset;
@@ -2825,7 +2863,8 @@ static u32 qed_grc_dump_reset_regs(struct qed_hwfn *p_hwfn,
 
 	/* Calculate header size */
 	offset += qed_grc_dump_regs_hdr(dump_buf,
-					false, 0, "eng", -1, NULL, NULL);
+					false, 0,
+					SPLIT_TYPE_NONE, 0, NULL, NULL);
 
 	/* Write reset registers */
 	for (i = 0; i < MAX_DBG_RESET_REGS; i++) {
@@ -2838,14 +2877,15 @@ static u32 qed_grc_dump_reset_regs(struct qed_hwfn *p_hwfn,
 						 dump,
 						 BYTES_TO_DWORDS
 						 (s_reset_regs_defs[i].addr), 1,
-						 false);
+						 false, SPLIT_TYPE_NONE, 0);
 		num_regs++;
 	}
 
 	/* Write header */
 	if (dump)
 		qed_grc_dump_regs_hdr(dump_buf,
-				      true, num_regs, "eng", -1, NULL, NULL);
+				      true, num_regs, SPLIT_TYPE_NONE,
+				      0, NULL, NULL);
 
 	return offset;
 }
@@ -2864,7 +2904,8 @@ static u32 qed_grc_dump_modified_regs(struct qed_hwfn *p_hwfn,
 
 	/* Calculate header size */
 	offset += qed_grc_dump_regs_hdr(dump_buf,
-					false, 0, "eng", -1, NULL, NULL);
+					false, 0, SPLIT_TYPE_NONE,
+					0, NULL, NULL);
 
 	/* Write parity registers */
 	for (block_id = 0; block_id < MAX_BLOCK_ID; block_id++) {
@@ -2899,7 +2940,8 @@ static u32 qed_grc_dump_modified_regs(struct qed_hwfn *p_hwfn,
 							 dump_buf + offset,
 							 dump,
 							 addr,
-							 1, false);
+							 1, false,
+							 SPLIT_TYPE_NONE, 0);
 			addr = GET_FIELD(reg_data->data,
 					 DBG_ATTN_REG_STS_ADDRESS);
 			offset += qed_grc_dump_reg_entry(p_hwfn,
@@ -2907,7 +2949,8 @@ static u32 qed_grc_dump_modified_regs(struct qed_hwfn *p_hwfn,
 							 dump_buf + offset,
 							 dump,
 							 addr,
-							 1, false);
+							 1, false,
+							 SPLIT_TYPE_NONE, 0);
 			num_reg_entries += 2;
 		}
 	}
@@ -2929,7 +2972,7 @@ static u32 qed_grc_dump_modified_regs(struct qed_hwfn *p_hwfn,
 						 dump,
 						 addr,
 						 1,
-						 false);
+						 false, SPLIT_TYPE_NONE, 0);
 		num_reg_entries++;
 	}
 
@@ -2937,7 +2980,8 @@ static u32 qed_grc_dump_modified_regs(struct qed_hwfn *p_hwfn,
 	if (dump)
 		qed_grc_dump_regs_hdr(dump_buf,
 				      true,
-				      num_reg_entries, "eng", -1, NULL, NULL);
+				      num_reg_entries, SPLIT_TYPE_NONE,
+				      0, NULL, NULL);
 
 	return offset;
 }
@@ -2950,7 +2994,8 @@ static u32 qed_grc_dump_special_regs(struct qed_hwfn *p_hwfn,
 	u32 offset = 0, addr;
 
 	offset += qed_grc_dump_regs_hdr(dump_buf,
-					dump, 2, "eng", -1, NULL, NULL);
+					dump, 2, SPLIT_TYPE_NONE, 0,
+					NULL, NULL);
 
 	/* Dump R/TDIF_REG_DEBUG_ERROR_INFO_SIZE (every 8'th register should be
 	 * skipped).
@@ -3096,7 +3141,8 @@ static u32 qed_grc_dump_mem(struct qed_hwfn *p_hwfn,
 	offset += qed_grc_dump_addr_range(p_hwfn,
 					  p_ptt,
 					  dump_buf + offset,
-					  dump, addr, len, wide_bus);
+					  dump, addr, len, wide_bus,
+					  SPLIT_TYPE_NONE, 0);
 
 	return offset;
 }
@@ -3235,12 +3281,12 @@ static u32 qed_grc_dump_memories(struct qed_hwfn *p_hwfn,
 	       s_dbg_arrays[BIN_BUF_DBG_DUMP_MEM].size_in_dwords) {
 		const struct dbg_dump_split_hdr *split_hdr;
 		struct dbg_array curr_input_mems_arr;
+		enum init_split_types split_type;
 		u32 split_data_size;
-		u8 split_type_id;
 
 		split_hdr = (const struct dbg_dump_split_hdr *)
 			&s_dbg_arrays[BIN_BUF_DBG_DUMP_MEM].ptr[input_offset++];
-		split_type_id =
+		split_type =
 			GET_FIELD(split_hdr->hdr,
 				  DBG_DUMP_SPLIT_HDR_SPLIT_TYPE_ID);
 		split_data_size =
@@ -3250,20 +3296,15 @@ static u32 qed_grc_dump_memories(struct qed_hwfn *p_hwfn,
 			&s_dbg_arrays[BIN_BUF_DBG_DUMP_MEM].ptr[input_offset];
 		curr_input_mems_arr.size_in_dwords = split_data_size;
 
-		switch (split_type_id) {
-		case SPLIT_TYPE_NONE:
+		if (split_type == SPLIT_TYPE_NONE)
 			offset += qed_grc_dump_mem_entries(p_hwfn,
 							   p_ptt,
 							   curr_input_mems_arr,
 							   dump_buf + offset,
 							   dump);
-			break;
-
-		default:
+		else
 			DP_NOTICE(p_hwfn,
 				  "Dumping split memories is currently not supported\n");
-			break;
-		}
 
 		input_offset += split_data_size;
 	}
@@ -3623,7 +3664,8 @@ static u32 qed_grc_dump_rss(struct qed_hwfn *p_hwfn,
 							  dump,
 							  addr,
 							  num_dwords_to_read,
-							  false);
+							  false,
+							  SPLIT_TYPE_NONE, 0);
 			total_dwords -= num_dwords_to_read;
 			rss_addr++;
 		}
@@ -3682,7 +3724,7 @@ static u32 qed_grc_dump_big_ram(struct qed_hwfn *p_hwfn,
 						  dump,
 						  addr,
 						  len,
-						  false);
+						  false, SPLIT_TYPE_NONE, 0);
 	}
 
 	return offset;
@@ -3731,7 +3773,8 @@ static u32 qed_grc_dump_mcp(struct qed_hwfn *p_hwfn,
 
 	/* Dump required non-MCP registers */
 	offset += qed_grc_dump_regs_hdr(dump_buf + offset,
-					dump, 1, "eng", -1, "block", "MCP");
+					dump, 1, SPLIT_TYPE_NONE, 0,
+					"block", "MCP");
 	addr = BYTES_TO_DWORDS(MISC_REG_SHARED_MEM_ADDR);
 	offset += qed_grc_dump_reg_entry(p_hwfn,
 					 p_ptt,
@@ -3739,7 +3782,7 @@ static u32 qed_grc_dump_mcp(struct qed_hwfn *p_hwfn,
 					 dump,
 					 addr,
 					 1,
-					 false);
+					 false, SPLIT_TYPE_NONE, 0);
 
 	/* Release MCP */
 	if (halted && qed_mcp_resume(p_hwfn, p_ptt))
@@ -3923,7 +3966,8 @@ static u32 qed_grc_dump_static_debug(struct qed_hwfn *p_hwfn,
 							  dump,
 							  addr,
 							  len,
-							  true);
+							  true, SPLIT_TYPE_NONE,
+							  0);
 		}
 
 		/* Disable block's client and debug output */
@@ -3949,28 +3993,15 @@ static enum dbg_status qed_grc_dump(struct qed_hwfn *p_hwfn,
 {
 	struct dbg_tools_data *dev_data = &p_hwfn->dbg_info;
 	bool parities_masked = false;
-	u8 i, port_mode = 0;
 	u32 offset = 0;
+	u8 i;
 
 	*num_dumped_dwords = 0;
+	dev_data->num_regs_read = 0;
 
-	if (dump) {
-		/* Find port mode */
-		switch (qed_rd(p_hwfn, p_ptt, MISC_REG_PORT_MODE)) {
-		case 0:
-			port_mode = 1;
-			break;
-		case 1:
-			port_mode = 2;
-			break;
-		case 2:
-			port_mode = 4;
-			break;
-		}
-
-		/* Update reset state */
+	/* Update reset state */
+	if (dump)
 		qed_update_blocks_reset_state(p_hwfn, p_ptt);
-	}
 
 	/* Dump global params */
 	offset += qed_dump_common_global_params(p_hwfn,
@@ -3989,7 +4020,7 @@ static enum dbg_status qed_grc_dump(struct qed_hwfn *p_hwfn,
 				     qed_grc_get_param(p_hwfn,
 						DBG_GRC_PARAM_NUM_LTIDS));
 	offset += qed_dump_num_param(dump_buf + offset,
-				     dump, "num-ports", port_mode);
+				     dump, "num-ports", dev_data->num_ports);
 
 	/* Dump reset registers (dumped before taking blocks out of reset ) */
 	if (qed_grc_is_included(p_hwfn, DBG_GRC_PARAM_DUMP_REGS))
@@ -4093,10 +4124,10 @@ static enum dbg_status qed_grc_dump(struct qed_hwfn *p_hwfn,
 		offset += qed_grc_dump_phy(p_hwfn,
 					   p_ptt, dump_buf + offset, dump);
 
-	/* Dump static debug data  */
+	/* Dump static debug data (only if not during debug bus recording) */
 	if (qed_grc_is_included(p_hwfn,
 				DBG_GRC_PARAM_DUMP_STATIC) &&
-	    dev_data->bus.state == DBG_BUS_STATE_IDLE)
+	    (!dump || dev_data->bus.state == DBG_BUS_STATE_IDLE))
 		offset += qed_grc_dump_static_debug(p_hwfn,
 						    p_ptt,
 						    dump_buf + offset, dump);
@@ -4250,7 +4281,8 @@ static u32 qed_idle_chk_dump_failure(struct qed_hwfn *p_hwfn,
 							  dump_buf + offset,
 							  dump,
 							  addr,
-							  reg->size, wide_bus);
+							  reg->size, wide_bus,
+							  SPLIT_TYPE_NONE, 0);
 		}
 	}
 
@@ -4373,7 +4405,8 @@ qed_idle_chk_dump_rule_entries(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
 							    next_reg_offset,
 							    dump, addr,
 							    reg->entry_size,
-							    wide_bus);
+							    wide_bus,
+							    SPLIT_TYPE_NONE, 0);
 			}
 
 			/* Call rule condition function.
@@ -4723,7 +4756,8 @@ static enum dbg_status qed_mcp_trace_dump(struct qed_hwfn *p_hwfn,
 					  dump_buf + offset,
 					  dump,
 					  BYTES_TO_DWORDS(trace_data_grc_addr),
-					  trace_data_size_dwords, false);
+					  trace_data_size_dwords, false,
+					  SPLIT_TYPE_NONE, 0);
 
 	/* Resume MCP (only if halt succeeded) */
 	if (halted && qed_mcp_resume(p_hwfn, p_ptt))
@@ -4829,7 +4863,8 @@ static enum dbg_status qed_reg_fifo_dump(struct qed_hwfn *p_hwfn,
 						  true,
 						  addr,
 						  len,
-						  true);
+						  true, SPLIT_TYPE_NONE,
+						  0);
 		fifo_has_data = qed_rd(p_hwfn, p_ptt,
 				       GRC_REG_TRACE_FIFO_VALID_DATA) > 0;
 	}
@@ -4898,7 +4933,8 @@ static enum dbg_status qed_igu_fifo_dump(struct qed_hwfn *p_hwfn,
 						  true,
 						  addr,
 						  len,
-						  true);
+						  true, SPLIT_TYPE_NONE,
+						  0);
 		fifo_has_data = qed_rd(p_hwfn, p_ptt,
 				       IGU_REG_ERROR_HANDLING_DATA_VALID) > 0;
 	}
@@ -4956,7 +4992,7 @@ static enum dbg_status qed_protection_override_dump(struct qed_hwfn *p_hwfn,
 					  true,
 					  addr,
 					  override_window_dwords,
-					  true);
+					  true, SPLIT_TYPE_NONE, 0);
 	qed_dump_num_param(dump_buf + size_param_offset, dump, "size",
 			   override_window_dwords);
 out:
@@ -4998,7 +5034,7 @@ static u32 qed_fw_asserts_dump(struct qed_hwfn *p_hwfn,
 			continue;
 
 		/* Read FW info for the current Storm */
-		qed_read_fw_info(p_hwfn, p_ptt, storm_id, &fw_info);
+		qed_read_storm_fw_info(p_hwfn, p_ptt, storm_id, &fw_info);
 
 		asserts = &fw_info.fw_asserts_section;
 
@@ -5036,7 +5072,7 @@ static u32 qed_fw_asserts_dump(struct qed_hwfn *p_hwfn,
 					    dump_buf + offset,
 					    dump, addr,
 					    asserts->list_element_dword_size,
-					    false);
+						  false, SPLIT_TYPE_NONE, 0);
 	}
 
 	/* Dump last section */
@@ -5063,6 +5099,28 @@ enum dbg_status qed_dbg_set_bin_ptr(const u8 * const bin_ptr)
 	return DBG_STATUS_OK;
 }
 
+bool qed_read_fw_info(struct qed_hwfn *p_hwfn,
+		      struct qed_ptt *p_ptt, struct fw_info *fw_info)
+{
+	struct dbg_tools_data *dev_data = &p_hwfn->dbg_info;
+	u8 storm_id;
+
+	for (storm_id = 0; storm_id < MAX_DBG_STORMS; storm_id++) {
+		struct storm_defs *storm = &s_storm_defs[storm_id];
+
+		/* Skip Storm if it's in reset */
+		if (dev_data->block_in_reset[storm->block_id])
+			continue;
+
+		/* Read FW info for the current Storm */
+		qed_read_storm_fw_info(p_hwfn, p_ptt, storm_id, fw_info);
+
+		return true;
+	}
+
+	return false;
+}
+
 /* Assign default GRC param values */
 void qed_dbg_grc_set_params_default(struct qed_hwfn *p_hwfn)
 {
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index fde20fd9942c..b285edc8d6a1 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -2792,7 +2792,7 @@ static void qed_hw_info_port_num_bb(struct qed_hwfn *p_hwfn,
 {
 	u32 port_mode;
 
-	port_mode = qed_rd(p_hwfn, p_ptt, CNIG_REG_NW_PORT_MODE_BB_B0);
+	port_mode = qed_rd(p_hwfn, p_ptt, CNIG_REG_NW_PORT_MODE_BB);
 
 	if (port_mode < 3) {
 		p_hwfn->cdev->num_ports_in_engine = 1;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index b9704be53537..bee10c1781fb 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -1095,14 +1095,16 @@ enum personality_type {
 struct pf_start_tunnel_config {
 	u8 set_vxlan_udp_port_flg;
 	u8 set_geneve_udp_port_flg;
+	u8 set_no_inner_l2_vxlan_udp_port_flg;
 	u8 tunnel_clss_vxlan;
 	u8 tunnel_clss_l2geneve;
 	u8 tunnel_clss_ipgeneve;
 	u8 tunnel_clss_l2gre;
 	u8 tunnel_clss_ipgre;
-	u8 reserved;
 	__le16 vxlan_udp_port;
 	__le16 geneve_udp_port;
+	__le16 no_inner_l2_vxlan_udp_port;
+	__le16 reserved[3];
 };
 
 /* Ramrod data for PF start ramrod */
@@ -1145,14 +1147,17 @@ struct pf_update_tunnel_config {
 	u8 update_rx_def_non_ucast_clss;
 	u8 set_vxlan_udp_port_flg;
 	u8 set_geneve_udp_port_flg;
+	u8 set_no_inner_l2_vxlan_udp_port_flg;
 	u8 tunnel_clss_vxlan;
 	u8 tunnel_clss_l2geneve;
 	u8 tunnel_clss_ipgeneve;
 	u8 tunnel_clss_l2gre;
 	u8 tunnel_clss_ipgre;
+	u8 reserved;
 	__le16 vxlan_udp_port;
 	__le16 geneve_udp_port;
-	__le16 reserved;
+	__le16 no_inner_l2_vxlan_udp_port;
+	__le16 reserved1[3];
 };
 
 /* Data for port update ramrod */
@@ -2535,7 +2540,14 @@ struct idle_chk_data {
 	u16 reserved2;
 };
 
-/* Debug Tools data (per HW function) */
+struct pretend_params {
+	u8 split_type;
+	u8 reserved;
+	u16 split_id;
+};
+
+/* Debug Tools data (per HW function)
+ */
 struct dbg_tools_data {
 	struct dbg_grc_data grc;
 	struct dbg_bus_data bus;
@@ -2544,8 +2556,13 @@ struct dbg_tools_data {
 	u8 block_in_reset[88];
 	u8 chip_id;
 	u8 platform_id;
+	u8 num_ports;
+	u8 num_pfs_per_port;
+	u8 num_vfs;
 	u8 initialized;
 	u8 use_dmae;
+	u8 reserved;
+	struct pretend_params pretend;
 	u32 num_regs_read;
 };
 
@@ -2974,6 +2991,24 @@ enum dbg_status qed_dbg_set_bin_ptr(const u8 * const bin_ptr);
 void qed_read_regs(struct qed_hwfn *p_hwfn,
 		   struct qed_ptt *p_ptt, u32 *buf, u32 addr, u32 len);
 
+/**
+ * @brief qed_read_fw_info - Reads FW info from the chip.
+ *
+ * The FW info contains FW-related information, such as the FW version,
+ * FW image (main/L2B/kuku), FW timestamp, etc.
+ * The FW info is read from the internal RAM of the first Storm that is not in
+ * reset.
+ *
+ * @param p_hwfn -	    HW device data
+ * @param p_ptt -	    Ptt window used for writing the registers.
+ * @param fw_info -	Out: a pointer to write the FW info into.
+ *
+ * @return true if the FW info was read successfully from one of the Storms,
+ * or false if all Storms are in reset.
+ */
+bool qed_read_fw_info(struct qed_hwfn *p_hwfn,
+		      struct qed_ptt *p_ptt, struct fw_info *fw_info);
+
 /**
  * @brief qed_dbg_grc_set_params_default - Reverts all GRC parameters to their
  *	default value.
@@ -4110,6 +4145,21 @@ void qed_memset_session_ctx(void *p_ctx_mem, u32 ctx_size, u8 ctx_type);
  */
 void qed_memset_task_ctx(void *p_ctx_mem, u32 ctx_size, u8 ctx_type);
 
+#define NUM_STORMS 6
+
+/**
+ * @brief qed_set_rdma_error_level - Sets the RDMA assert level.
+ *                                   If the severity of the error will be
+ *                                   above the level, the FW will assert.
+ * @param p_hwfn - HW device data
+ * @param p_ptt - ptt window used for writing the registers
+ * @param assert_level - An array of assert levels for each storm.
+ *
+ */
+void qed_set_rdma_error_level(struct qed_hwfn *p_hwfn,
+			      struct qed_ptt *p_ptt,
+			      u8 assert_level[NUM_STORMS]);
+
 /* Ystorm flow control mode. Use enum fw_flow_ctrl_mode */
 #define YSTORM_FLOW_CONTROL_MODE_OFFSET			(IRO[0].base)
 #define YSTORM_FLOW_CONTROL_MODE_SIZE			(IRO[0].size)
@@ -4340,27 +4390,67 @@ void qed_memset_task_ctx(void *p_ctx_mem, u32 ctx_size, u8 ctx_type);
 	(IRO[46].base + ((rdma_stat_counter_id) * IRO[46].m1))
 #define TSTORM_RDMA_QUEUE_STAT_SIZE			(IRO[46].size)
 
+/* Xstorm error level for assert */
+#define XSTORM_RDMA_ASSERT_LEVEL_OFFSET(pf_id) \
+	(IRO[47].base +	((pf_id) * IRO[47].m1))
+#define XSTORM_RDMA_ASSERT_LEVEL_SIZE			(IRO[47].size)
+
+/* Ystorm error level for assert */
+#define YSTORM_RDMA_ASSERT_LEVEL_OFFSET(pf_id) \
+	(IRO[48].base + ((pf_id) * IRO[48].m1))
+#define YSTORM_RDMA_ASSERT_LEVEL_SIZE			(IRO[48].size)
+
+/* Pstorm error level for assert */
+#define PSTORM_RDMA_ASSERT_LEVEL_OFFSET(pf_id) \
+	(IRO[49].base +	((pf_id) * IRO[49].m1))
+#define PSTORM_RDMA_ASSERT_LEVEL_SIZE			(IRO[49].size)
+
+/* Tstorm error level for assert */
+#define TSTORM_RDMA_ASSERT_LEVEL_OFFSET(pf_id) \
+	(IRO[50].base +	((pf_id) * IRO[50].m1))
+#define TSTORM_RDMA_ASSERT_LEVEL_SIZE			(IRO[50].size)
+
+/* Mstorm error level for assert */
+#define MSTORM_RDMA_ASSERT_LEVEL_OFFSET(pf_id) \
+	(IRO[51].base + ((pf_id) * IRO[51].m1))
+#define MSTORM_RDMA_ASSERT_LEVEL_SIZE			(IRO[51].size)
+
+/* Ustorm error level for assert */
+#define USTORM_RDMA_ASSERT_LEVEL_OFFSET(pf_id) \
+	(IRO[52].base + ((pf_id) * IRO[52].m1))
+#define USTORM_RDMA_ASSERT_LEVEL_SIZE			(IRO[52].size)
+
 /* Xstorm iWARP rxmit stats */
 #define XSTORM_IWARP_RXMIT_STATS_OFFSET(pf_id) \
-	(IRO[47].base + ((pf_id) * IRO[47].m1))
-#define XSTORM_IWARP_RXMIT_STATS_SIZE			(IRO[47].size)
+	(IRO[53].base +	((pf_id) * IRO[53].m1))
+#define XSTORM_IWARP_RXMIT_STATS_SIZE			(IRO[53].size)
 
 /* Tstorm RoCE Event Statistics */
 #define TSTORM_ROCE_EVENTS_STAT_OFFSET(roce_pf_id) \
-	(IRO[48].base + ((roce_pf_id) * IRO[48].m1))
-#define TSTORM_ROCE_EVENTS_STAT_SIZE			(IRO[48].size)
+	(IRO[54].base + ((roce_pf_id) * IRO[54].m1))
+#define TSTORM_ROCE_EVENTS_STAT_SIZE			(IRO[54].size)
 
 /* DCQCN Received Statistics */
 #define YSTORM_ROCE_DCQCN_RECEIVED_STATS_OFFSET(roce_pf_id) \
-	(IRO[49].base + ((roce_pf_id) * IRO[49].m1))
-#define YSTORM_ROCE_DCQCN_RECEIVED_STATS_SIZE		(IRO[49].size)
+	(IRO[55].base + ((roce_pf_id) * IRO[55].m1))
+#define YSTORM_ROCE_DCQCN_RECEIVED_STATS_SIZE		(IRO[55].size)
+
+/* RoCE Error Statistics */
+#define YSTORM_ROCE_ERROR_STATS_OFFSET(roce_pf_id) \
+	(IRO[56].base + ((roce_pf_id) * IRO[56].m1))
+#define YSTORM_ROCE_ERROR_STATS_SIZE			(IRO[56].size)
 
 /* DCQCN Sent Statistics */
 #define PSTORM_ROCE_DCQCN_SENT_STATS_OFFSET(roce_pf_id) \
-	(IRO[50].base + ((roce_pf_id) * IRO[50].m1))
-#define PSTORM_ROCE_DCQCN_SENT_STATS_SIZE		(IRO[50].size)
+	(IRO[57].base + ((roce_pf_id) * IRO[57].m1))
+#define PSTORM_ROCE_DCQCN_SENT_STATS_SIZE		(IRO[57].size)
 
-static const struct iro iro_arr[51] = {
+/* RoCE CQEs Statistics */
+#define USTORM_ROCE_CQE_STATS_OFFSET(roce_pf_id) \
+	(IRO[58].base + ((roce_pf_id) * IRO[58].m1))
+#define USTORM_ROCE_CQE_STATS_SIZE			(IRO[58].size)
+
+static const struct iro iro_arr[59] = {
 	{0x0, 0x0, 0x0, 0x0, 0x8},
 	{0x4cb8, 0x88, 0x0, 0x0, 0x88},
 	{0x6530, 0x20, 0x0, 0x0, 0x20},
@@ -4408,10 +4498,18 @@ static const struct iro iro_arr[51] = {
 	{0x10768, 0x20, 0x0, 0x0, 0x20},
 	{0x2d48, 0x80, 0x0, 0x0, 0x10},
 	{0x5048, 0x10, 0x0, 0x0, 0x10},
+	{0xc748, 0x8, 0x0, 0x0, 0x1},
+	{0xa128, 0x8, 0x0, 0x0, 0x1},
+	{0x10f00, 0x8, 0x0, 0x0, 0x1},
+	{0xf030, 0x8, 0x0, 0x0, 0x1},
+	{0x13028, 0x8, 0x0, 0x0, 0x1},
+	{0x12c58, 0x8, 0x0, 0x0, 0x1},
 	{0xc9b8, 0x30, 0x0, 0x0, 0x10},
-	{0xed90, 0x10, 0x0, 0x0, 0x10},
-	{0xa3a0, 0x10, 0x0, 0x0, 0x10},
+	{0xed90, 0x28, 0x0, 0x0, 0x28},
+	{0xa520, 0x18, 0x0, 0x0, 0x18},
+	{0xa6a0, 0x8, 0x0, 0x0, 0x8},
 	{0x13108, 0x8, 0x0, 0x0, 0x8},
+	{0x13c50, 0x18, 0x0, 0x0, 0x18},
 };
 
 /* Runtime array offsets */
@@ -4797,147 +4895,147 @@ static const struct iro iro_arr[51] = {
 #define NIG_REG_LLH_FUNC_FILTER_HDR_SEL_RT_OFFSET		39769
 #define NIG_REG_LLH_FUNC_FILTER_HDR_SEL_RT_SIZE			16
 #define NIG_REG_TX_EDPM_CTRL_RT_OFFSET				39785
-#define NIG_REG_ROCE_DUPLICATE_TO_HOST_RT_OFFSET		39786
-#define NIG_REG_PPF_TO_ENGINE_SEL_RT_OFFSET			39787
-#define NIG_REG_PPF_TO_ENGINE_SEL_RT_SIZE			8
-#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_VALUE_RT_OFFSET		39795
-#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_VALUE_RT_SIZE		1024
-#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_EN_RT_OFFSET		40819
-#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_EN_RT_SIZE		512
-#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_MODE_RT_OFFSET		41331
-#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_MODE_RT_SIZE		512
-#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_PROTOCOL_TYPE_RT_OFFSET	41843
-#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_PROTOCOL_TYPE_RT_SIZE	512
-#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_HDR_SEL_RT_OFFSET	42355
-#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_HDR_SEL_RT_SIZE		512
-#define NIG_REG_LLH_PF_CLS_FILTERS_MAP_RT_OFFSET		42867
-#define NIG_REG_LLH_PF_CLS_FILTERS_MAP_RT_SIZE			32
-#define CDU_REG_CID_ADDR_PARAMS_RT_OFFSET			42899
-#define CDU_REG_SEGMENT0_PARAMS_RT_OFFSET			42900
-#define CDU_REG_SEGMENT1_PARAMS_RT_OFFSET			42901
-#define CDU_REG_PF_SEG0_TYPE_OFFSET_RT_OFFSET			42902
-#define CDU_REG_PF_SEG1_TYPE_OFFSET_RT_OFFSET			42903
-#define CDU_REG_PF_SEG2_TYPE_OFFSET_RT_OFFSET			42904
-#define CDU_REG_PF_SEG3_TYPE_OFFSET_RT_OFFSET			42905
-#define CDU_REG_PF_FL_SEG0_TYPE_OFFSET_RT_OFFSET		42906
-#define CDU_REG_PF_FL_SEG1_TYPE_OFFSET_RT_OFFSET		42907
-#define CDU_REG_PF_FL_SEG2_TYPE_OFFSET_RT_OFFSET		42908
-#define CDU_REG_PF_FL_SEG3_TYPE_OFFSET_RT_OFFSET		42909
-#define CDU_REG_VF_SEG_TYPE_OFFSET_RT_OFFSET			42910
-#define CDU_REG_VF_FL_SEG_TYPE_OFFSET_RT_OFFSET			42911
-#define PBF_REG_TAG_ETHERTYPE_0_RT_OFFSET			42912
-#define PBF_REG_BTB_SHARED_AREA_SIZE_RT_OFFSET			42913
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ0_RT_OFFSET		42914
-#define PBF_REG_BTB_GUARANTEED_VOQ0_RT_OFFSET			42915
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ0_RT_OFFSET		42916
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ1_RT_OFFSET		42917
-#define PBF_REG_BTB_GUARANTEED_VOQ1_RT_OFFSET			42918
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ1_RT_OFFSET		42919
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ2_RT_OFFSET		42920
-#define PBF_REG_BTB_GUARANTEED_VOQ2_RT_OFFSET			42921
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ2_RT_OFFSET		42922
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ3_RT_OFFSET		42923
-#define PBF_REG_BTB_GUARANTEED_VOQ3_RT_OFFSET			42924
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ3_RT_OFFSET		42925
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ4_RT_OFFSET		42926
-#define PBF_REG_BTB_GUARANTEED_VOQ4_RT_OFFSET			42927
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ4_RT_OFFSET		42928
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ5_RT_OFFSET		42929
-#define PBF_REG_BTB_GUARANTEED_VOQ5_RT_OFFSET			42930
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ5_RT_OFFSET		42931
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ6_RT_OFFSET		42932
-#define PBF_REG_BTB_GUARANTEED_VOQ6_RT_OFFSET			42933
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ6_RT_OFFSET		42934
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ7_RT_OFFSET		42935
-#define PBF_REG_BTB_GUARANTEED_VOQ7_RT_OFFSET			42936
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ7_RT_OFFSET		42937
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ8_RT_OFFSET		42938
-#define PBF_REG_BTB_GUARANTEED_VOQ8_RT_OFFSET			42939
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ8_RT_OFFSET		42940
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ9_RT_OFFSET		42941
-#define PBF_REG_BTB_GUARANTEED_VOQ9_RT_OFFSET			42942
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ9_RT_OFFSET		42943
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ10_RT_OFFSET		42944
-#define PBF_REG_BTB_GUARANTEED_VOQ10_RT_OFFSET			42945
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ10_RT_OFFSET		42946
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ11_RT_OFFSET		42947
-#define PBF_REG_BTB_GUARANTEED_VOQ11_RT_OFFSET			42948
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ11_RT_OFFSET		42949
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ12_RT_OFFSET		42950
-#define PBF_REG_BTB_GUARANTEED_VOQ12_RT_OFFSET			42951
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ12_RT_OFFSET		42952
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ13_RT_OFFSET		42953
-#define PBF_REG_BTB_GUARANTEED_VOQ13_RT_OFFSET			42954
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ13_RT_OFFSET		42955
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ14_RT_OFFSET		42956
-#define PBF_REG_BTB_GUARANTEED_VOQ14_RT_OFFSET			42957
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ14_RT_OFFSET		42958
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ15_RT_OFFSET		42959
-#define PBF_REG_BTB_GUARANTEED_VOQ15_RT_OFFSET			42960
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ15_RT_OFFSET		42961
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ16_RT_OFFSET		42962
-#define PBF_REG_BTB_GUARANTEED_VOQ16_RT_OFFSET			42963
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ16_RT_OFFSET		42964
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ17_RT_OFFSET		42965
-#define PBF_REG_BTB_GUARANTEED_VOQ17_RT_OFFSET			42966
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ17_RT_OFFSET		42967
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ18_RT_OFFSET		42968
-#define PBF_REG_BTB_GUARANTEED_VOQ18_RT_OFFSET			42969
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ18_RT_OFFSET		42970
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ19_RT_OFFSET		42971
-#define PBF_REG_BTB_GUARANTEED_VOQ19_RT_OFFSET			42972
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ19_RT_OFFSET		42973
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ20_RT_OFFSET		42974
-#define PBF_REG_BTB_GUARANTEED_VOQ20_RT_OFFSET			42975
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ20_RT_OFFSET		42976
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ21_RT_OFFSET		42977
-#define PBF_REG_BTB_GUARANTEED_VOQ21_RT_OFFSET			42978
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ21_RT_OFFSET		42979
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ22_RT_OFFSET		42980
-#define PBF_REG_BTB_GUARANTEED_VOQ22_RT_OFFSET			42981
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ22_RT_OFFSET		42982
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ23_RT_OFFSET		42983
-#define PBF_REG_BTB_GUARANTEED_VOQ23_RT_OFFSET			42984
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ23_RT_OFFSET		42985
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ24_RT_OFFSET		42986
-#define PBF_REG_BTB_GUARANTEED_VOQ24_RT_OFFSET			42987
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ24_RT_OFFSET		42988
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ25_RT_OFFSET		42989
-#define PBF_REG_BTB_GUARANTEED_VOQ25_RT_OFFSET			42990
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ25_RT_OFFSET		42991
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ26_RT_OFFSET		42992
-#define PBF_REG_BTB_GUARANTEED_VOQ26_RT_OFFSET			42993
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ26_RT_OFFSET		42994
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ27_RT_OFFSET		42995
-#define PBF_REG_BTB_GUARANTEED_VOQ27_RT_OFFSET			42996
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ27_RT_OFFSET		42997
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ28_RT_OFFSET		42998
-#define PBF_REG_BTB_GUARANTEED_VOQ28_RT_OFFSET			42999
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ28_RT_OFFSET		43000
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ29_RT_OFFSET		43001
-#define PBF_REG_BTB_GUARANTEED_VOQ29_RT_OFFSET			43002
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ29_RT_OFFSET		43003
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ30_RT_OFFSET		43004
-#define PBF_REG_BTB_GUARANTEED_VOQ30_RT_OFFSET			43005
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ30_RT_OFFSET		43006
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ31_RT_OFFSET		43007
-#define PBF_REG_BTB_GUARANTEED_VOQ31_RT_OFFSET			43008
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ31_RT_OFFSET		43009
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ32_RT_OFFSET		43010
-#define PBF_REG_BTB_GUARANTEED_VOQ32_RT_OFFSET			43011
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ32_RT_OFFSET		43012
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ33_RT_OFFSET		43013
-#define PBF_REG_BTB_GUARANTEED_VOQ33_RT_OFFSET			43014
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ33_RT_OFFSET		43015
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ34_RT_OFFSET		43016
-#define PBF_REG_BTB_GUARANTEED_VOQ34_RT_OFFSET			43017
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ34_RT_OFFSET		43018
-#define PBF_REG_YCMD_QS_NUM_LINES_VOQ35_RT_OFFSET		43019
-#define PBF_REG_BTB_GUARANTEED_VOQ35_RT_OFFSET			43020
-#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ35_RT_OFFSET		43021
-#define XCM_REG_CON_PHY_Q3_RT_OFFSET				43022
-
-#define RUNTIME_ARRAY_SIZE	43023
+#define NIG_REG_PPF_TO_ENGINE_SEL_RT_OFFSET                             39786
+#define NIG_REG_PPF_TO_ENGINE_SEL_RT_SIZE                               8
+#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_VALUE_RT_OFFSET                  39794
+#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_VALUE_RT_SIZE                    1024
+#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_EN_RT_OFFSET                     40818
+#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_EN_RT_SIZE                       512
+#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_MODE_RT_OFFSET                   41330
+#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_MODE_RT_SIZE                     512
+#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_PROTOCOL_TYPE_RT_OFFSET          41842
+#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_PROTOCOL_TYPE_RT_SIZE            512
+#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_HDR_SEL_RT_OFFSET                42354
+#define NIG_REG_LLH_PF_CLS_FUNC_FILTER_HDR_SEL_RT_SIZE                  512
+#define NIG_REG_LLH_PF_CLS_FILTERS_MAP_RT_OFFSET                        42866
+#define NIG_REG_LLH_PF_CLS_FILTERS_MAP_RT_SIZE                          32
+#define CDU_REG_CID_ADDR_PARAMS_RT_OFFSET                               42898
+#define CDU_REG_SEGMENT0_PARAMS_RT_OFFSET                               42899
+#define CDU_REG_SEGMENT1_PARAMS_RT_OFFSET                               42900
+#define CDU_REG_PF_SEG0_TYPE_OFFSET_RT_OFFSET                           42901
+#define CDU_REG_PF_SEG1_TYPE_OFFSET_RT_OFFSET                           42902
+#define CDU_REG_PF_SEG2_TYPE_OFFSET_RT_OFFSET                           42903
+#define CDU_REG_PF_SEG3_TYPE_OFFSET_RT_OFFSET                           42904
+#define CDU_REG_PF_FL_SEG0_TYPE_OFFSET_RT_OFFSET                        42905
+#define CDU_REG_PF_FL_SEG1_TYPE_OFFSET_RT_OFFSET                        42906
+#define CDU_REG_PF_FL_SEG2_TYPE_OFFSET_RT_OFFSET                        42907
+#define CDU_REG_PF_FL_SEG3_TYPE_OFFSET_RT_OFFSET                        42908
+#define CDU_REG_VF_SEG_TYPE_OFFSET_RT_OFFSET                            42909
+#define CDU_REG_VF_FL_SEG_TYPE_OFFSET_RT_OFFSET                         42910
+#define PBF_REG_TAG_ETHERTYPE_0_RT_OFFSET                               42911
+#define PBF_REG_BTB_SHARED_AREA_SIZE_RT_OFFSET                          42912
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ0_RT_OFFSET                        42913
+#define PBF_REG_BTB_GUARANTEED_VOQ0_RT_OFFSET                           42914
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ0_RT_OFFSET                    42915
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ1_RT_OFFSET                        42916
+#define PBF_REG_BTB_GUARANTEED_VOQ1_RT_OFFSET                           42917
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ1_RT_OFFSET                    42918
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ2_RT_OFFSET                        42919
+#define PBF_REG_BTB_GUARANTEED_VOQ2_RT_OFFSET                           42920
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ2_RT_OFFSET                    42921
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ3_RT_OFFSET                        42922
+#define PBF_REG_BTB_GUARANTEED_VOQ3_RT_OFFSET                           42923
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ3_RT_OFFSET                    42924
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ4_RT_OFFSET                        42925
+#define PBF_REG_BTB_GUARANTEED_VOQ4_RT_OFFSET                           42926
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ4_RT_OFFSET                    42927
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ5_RT_OFFSET                        42928
+#define PBF_REG_BTB_GUARANTEED_VOQ5_RT_OFFSET                           42929
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ5_RT_OFFSET                    42930
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ6_RT_OFFSET                        42931
+#define PBF_REG_BTB_GUARANTEED_VOQ6_RT_OFFSET                           42932
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ6_RT_OFFSET                    42933
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ7_RT_OFFSET                        42934
+#define PBF_REG_BTB_GUARANTEED_VOQ7_RT_OFFSET                           42935
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ7_RT_OFFSET                    42936
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ8_RT_OFFSET                        42937
+#define PBF_REG_BTB_GUARANTEED_VOQ8_RT_OFFSET                           42938
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ8_RT_OFFSET                    42939
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ9_RT_OFFSET                        42940
+#define PBF_REG_BTB_GUARANTEED_VOQ9_RT_OFFSET                           42941
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ9_RT_OFFSET                    42942
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ10_RT_OFFSET                       42943
+#define PBF_REG_BTB_GUARANTEED_VOQ10_RT_OFFSET                          42944
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ10_RT_OFFSET                   42945
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ11_RT_OFFSET                       42946
+#define PBF_REG_BTB_GUARANTEED_VOQ11_RT_OFFSET                          42947
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ11_RT_OFFSET                   42948
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ12_RT_OFFSET                       42949
+#define PBF_REG_BTB_GUARANTEED_VOQ12_RT_OFFSET                          42950
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ12_RT_OFFSET                   42951
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ13_RT_OFFSET                       42952
+#define PBF_REG_BTB_GUARANTEED_VOQ13_RT_OFFSET                          42953
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ13_RT_OFFSET                   42954
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ14_RT_OFFSET                       42955
+#define PBF_REG_BTB_GUARANTEED_VOQ14_RT_OFFSET                          42956
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ14_RT_OFFSET                   42957
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ15_RT_OFFSET                       42958
+#define PBF_REG_BTB_GUARANTEED_VOQ15_RT_OFFSET                          42959
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ15_RT_OFFSET                   42960
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ16_RT_OFFSET                       42961
+#define PBF_REG_BTB_GUARANTEED_VOQ16_RT_OFFSET                          42962
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ16_RT_OFFSET                   42963
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ17_RT_OFFSET                       42964
+#define PBF_REG_BTB_GUARANTEED_VOQ17_RT_OFFSET                          42965
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ17_RT_OFFSET                   42966
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ18_RT_OFFSET                       42967
+#define PBF_REG_BTB_GUARANTEED_VOQ18_RT_OFFSET                          42968
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ18_RT_OFFSET                   42969
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ19_RT_OFFSET                       42970
+#define PBF_REG_BTB_GUARANTEED_VOQ19_RT_OFFSET                          42971
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ19_RT_OFFSET                   42972
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ20_RT_OFFSET                       42973
+#define PBF_REG_BTB_GUARANTEED_VOQ20_RT_OFFSET                          42974
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ20_RT_OFFSET                   42975
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ21_RT_OFFSET                       42976
+#define PBF_REG_BTB_GUARANTEED_VOQ21_RT_OFFSET                          42977
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ21_RT_OFFSET                   42978
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ22_RT_OFFSET                       42979
+#define PBF_REG_BTB_GUARANTEED_VOQ22_RT_OFFSET                          42980
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ22_RT_OFFSET                   42981
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ23_RT_OFFSET                       42982
+#define PBF_REG_BTB_GUARANTEED_VOQ23_RT_OFFSET                          42983
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ23_RT_OFFSET                   42984
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ24_RT_OFFSET                       42985
+#define PBF_REG_BTB_GUARANTEED_VOQ24_RT_OFFSET                          42986
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ24_RT_OFFSET                   42987
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ25_RT_OFFSET                       42988
+#define PBF_REG_BTB_GUARANTEED_VOQ25_RT_OFFSET                          42989
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ25_RT_OFFSET                   42990
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ26_RT_OFFSET                       42991
+#define PBF_REG_BTB_GUARANTEED_VOQ26_RT_OFFSET                          42992
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ26_RT_OFFSET                   42993
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ27_RT_OFFSET                       42994
+#define PBF_REG_BTB_GUARANTEED_VOQ27_RT_OFFSET                          42995
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ27_RT_OFFSET                   42996
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ28_RT_OFFSET                       42997
+#define PBF_REG_BTB_GUARANTEED_VOQ28_RT_OFFSET                          42998
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ28_RT_OFFSET                   42999
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ29_RT_OFFSET                       43000
+#define PBF_REG_BTB_GUARANTEED_VOQ29_RT_OFFSET                          43001
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ29_RT_OFFSET                   43002
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ30_RT_OFFSET                       43003
+#define PBF_REG_BTB_GUARANTEED_VOQ30_RT_OFFSET                          43004
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ30_RT_OFFSET                   43005
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ31_RT_OFFSET                       43006
+#define PBF_REG_BTB_GUARANTEED_VOQ31_RT_OFFSET                          43007
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ31_RT_OFFSET                   43008
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ32_RT_OFFSET                       43009
+#define PBF_REG_BTB_GUARANTEED_VOQ32_RT_OFFSET                          43010
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ32_RT_OFFSET                   43011
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ33_RT_OFFSET                       43012
+#define PBF_REG_BTB_GUARANTEED_VOQ33_RT_OFFSET                          43013
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ33_RT_OFFSET                   43014
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ34_RT_OFFSET                       43015
+#define PBF_REG_BTB_GUARANTEED_VOQ34_RT_OFFSET                          43016
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ34_RT_OFFSET                   43017
+#define PBF_REG_YCMD_QS_NUM_LINES_VOQ35_RT_OFFSET                       43018
+#define PBF_REG_BTB_GUARANTEED_VOQ35_RT_OFFSET                          43019
+#define PBF_REG_BTB_SHARED_AREA_SETUP_VOQ35_RT_OFFSET                   43020
+#define XCM_REG_CON_PHY_Q3_RT_OFFSET                                    43021
+
+#define RUNTIME_ARRAY_SIZE 43022
+
 
 /* Init Callbacks */
 #define DMAE_READY_CB	0
@@ -5694,8 +5792,10 @@ struct eth_vport_rx_mode {
 #define ETH_VPORT_RX_MODE_MCAST_ACCEPT_ALL_SHIFT	4
 #define ETH_VPORT_RX_MODE_BCAST_ACCEPT_ALL_MASK		0x1
 #define ETH_VPORT_RX_MODE_BCAST_ACCEPT_ALL_SHIFT	5
-#define ETH_VPORT_RX_MODE_RESERVED1_MASK		0x3FF
-#define ETH_VPORT_RX_MODE_RESERVED1_SHIFT		6
+#define ETH_VPORT_RX_MODE_ACCEPT_ANY_VNI_MASK		0x1
+#define ETH_VPORT_RX_MODE_ACCEPT_ANY_VNI_SHIFT		6
+#define ETH_VPORT_RX_MODE_RESERVED1_MASK		0x1FF
+#define ETH_VPORT_RX_MODE_RESERVED1_SHIFT		7
 };
 
 /* Command for setting tpa parameters */
@@ -6756,7 +6856,7 @@ struct e4_ystorm_rdma_task_ag_ctx {
 #define E4_YSTORM_RDMA_TASK_AG_CTX_RULE6EN_MASK		0x1
 #define E4_YSTORM_RDMA_TASK_AG_CTX_RULE6EN_SHIFT	7
 	u8 key;
-	__le32 mw_cnt;
+	__le32 mw_cnt_or_qp_id;
 	u8 ref_cnt_seq;
 	u8 ctx_upd_seq;
 	__le16 dif_flags;
@@ -6812,7 +6912,7 @@ struct e4_mstorm_rdma_task_ag_ctx {
 #define E4_MSTORM_RDMA_TASK_AG_CTX_RULE6EN_MASK		0x1
 #define E4_MSTORM_RDMA_TASK_AG_CTX_RULE6EN_SHIFT	7
 	u8 key;
-	__le32 mw_cnt;
+	__le32 mw_cnt_or_qp_id;
 	u8 ref_cnt_seq;
 	u8 ctx_upd_seq;
 	__le16 dif_flags;
@@ -7075,8 +7175,7 @@ struct rdma_register_tid_ramrod_data {
 	struct regpair va;
 	struct regpair pbl_base;
 	struct regpair dif_error_addr;
-	struct regpair dif_runt_addr;
-	__le32 reserved4[2];
+	__le32 reserved4[4];
 };
 
 /* rdma resize cq output params */
@@ -7144,8 +7243,7 @@ struct rdma_srq_modify_ramrod_data {
 enum rdma_tid_type {
 	RDMA_TID_REGISTERED_MR,
 	RDMA_TID_FMR,
-	RDMA_TID_MW_TYPE1,
-	RDMA_TID_MW_TYPE2A,
+	RDMA_TID_MW,
 	MAX_RDMA_TID_TYPE
 };
 
@@ -7681,6 +7779,16 @@ struct e4_roce_conn_context {
 	struct ustorm_roce_conn_st_ctx ustorm_st_context;
 };
 
+/* roce cqes statistics */
+struct roce_cqe_stats {
+	__le32 req_cqe_error;
+	__le32 req_remote_access_errors;
+	__le32 req_remote_invalid_request;
+	__le32 resp_cqe_error;
+	__le32 resp_local_length_error;
+	__le32 reserved;
+};
+
 /* roce create qp requester ramrod data */
 struct roce_create_qp_req_ramrod_data {
 	__le16 flags;
@@ -7798,8 +7906,8 @@ struct roce_dcqcn_sent_stats {
 
 /* RoCE destroy qp requester output params */
 struct roce_destroy_qp_req_output_params {
-	__le32 num_bound_mw;
 	__le32 cq_prod;
+	__le32 reserved;
 };
 
 /* RoCE destroy qp requester ramrod data */
@@ -7809,8 +7917,8 @@ struct roce_destroy_qp_req_ramrod_data {
 
 /* RoCE destroy qp responder output params */
 struct roce_destroy_qp_resp_output_params {
-	__le32 num_invalidated_mw;
 	__le32 cq_prod;
+	__le32 reserved;
 };
 
 /* RoCE destroy qp responder ramrod data */
@@ -7818,16 +7926,27 @@ struct roce_destroy_qp_resp_ramrod_data {
 	struct regpair output_params_addr;
 };
 
+/* roce error statistics */
+struct roce_error_stats {
+	__le32 resp_remote_access_errors;
+	__le32 reserved;
+};
+
 /* roce special events statistics */
 struct roce_events_stats {
-	__le16 silent_drops;
-	__le16 rnr_naks_sent;
+	__le32 silent_drops;
+	__le32 rnr_naks_sent;
 	__le32 retransmit_count;
 	__le32 icrc_error_count;
-	__le32 reserved;
+	__le32 implied_nak_seq_err;
+	__le32 duplicate_request;
+	__le32 local_ack_timeout_err;
+	__le32 out_of_sequence;
+	__le32 packet_seq_err;
+	__le32 rnr_nak_retry_err;
 };
 
-/* ROCE slow path EQ cmd IDs */
+/* roce slow path EQ cmd IDs */
 enum roce_event_opcode {
 	ROCE_EVENT_CREATE_QP = 11,
 	ROCE_EVENT_MODIFY_QP,
@@ -7845,6 +7964,9 @@ struct roce_init_func_params {
 	u8 cnp_dscp;
 	u8 reserved;
 	__le32 cnp_send_timeout;
+	__le16 rl_offset;
+	u8 rl_count_log;
+	u8 reserved1[5];
 };
 
 /* roce func init ramrod data */
@@ -8532,7 +8654,7 @@ struct e4_tstorm_roce_resp_conn_ag_ctx {
 	__le16 rq_prod;
 	__le16 conn_dpi;
 	__le16 irq_cons;
-	__le32 num_invlidated_mw;
+	__le32 reg9;
 	__le32 reg10;
 };
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hw.c b/drivers/net/ethernet/qlogic/qed/qed_hw.c
index fca2dbd93ad9..70504dcf4087 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hw.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_hw.c
@@ -360,6 +360,26 @@ void qed_port_unpretend(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 	       *(u32 *)&p_ptt->pxp.pretend);
 }
 
+void qed_port_fid_pretend(struct qed_hwfn *p_hwfn,
+			  struct qed_ptt *p_ptt, u8 port_id, u16 fid)
+{
+	u16 control = 0;
+
+	SET_FIELD(control, PXP_PRETEND_CMD_PORT, port_id);
+	SET_FIELD(control, PXP_PRETEND_CMD_USE_PORT, 1);
+	SET_FIELD(control, PXP_PRETEND_CMD_PRETEND_PORT, 1);
+	SET_FIELD(control, PXP_PRETEND_CMD_IS_CONCRETE, 1);
+	SET_FIELD(control, PXP_PRETEND_CMD_PRETEND_FUNCTION, 1);
+	if (!GET_FIELD(fid, PXP_CONCRETE_FID_VFVALID))
+		fid = GET_FIELD(fid, PXP_CONCRETE_FID_PFID);
+	p_ptt->pxp.pretend.control = cpu_to_le16(control);
+	p_ptt->pxp.pretend.fid.concrete_fid.fid = cpu_to_le16(fid);
+	REG_WR(p_hwfn,
+	       qed_ptt_config_addr(p_ptt) +
+	       offsetof(struct pxp_ptt_entry, pretend),
+	       *(u32 *)&p_ptt->pxp.pretend);
+}
+
 u32 qed_vfid_to_concrete(struct qed_hwfn *p_hwfn, u8 vfid)
 {
 	u32 concrete_fid = 0;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hw.h b/drivers/net/ethernet/qlogic/qed/qed_hw.h
index 8db2839a8ec8..505e94db939d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hw.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hw.h
@@ -244,6 +244,18 @@ void qed_port_pretend(struct qed_hwfn *p_hwfn,
 void qed_port_unpretend(struct qed_hwfn *p_hwfn,
 			struct qed_ptt *p_ptt);
 
+/**
+ * @brief qed_port_fid_pretend - pretend to another port and another function
+ *        when accessing the ptt window
+ *
+ * @param p_hwfn
+ * @param p_ptt
+ * @param port_id - the port to pretend to
+ * @param fid - fid field of pxp_pretend structure. Can contain either pf / vf.
+ */
+void qed_port_fid_pretend(struct qed_hwfn *p_hwfn,
+			  struct qed_ptt *p_ptt, u8 port_id, u16 fid);
+
 /**
  * @brief qed_vfid_to_concrete - build a concrete FID for a
  *        given VF ID
diff --git a/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c b/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c
index 1365da7c8900..d845badf9b90 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c
@@ -1245,7 +1245,7 @@ void qed_gft_config(struct qed_hwfn *p_hwfn,
 		    bool udp,
 		    bool ipv4, bool ipv6, enum gft_profile_type profile_type)
 {
-	u32 reg_val, cam_line, ram_line_lo, ram_line_hi;
+	u32 reg_val, cam_line, ram_line_lo, ram_line_hi, search_non_ip_as_gft;
 
 	if (!ipv6 && !ipv4)
 		DP_NOTICE(p_hwfn,
@@ -1314,6 +1314,9 @@ void qed_gft_config(struct qed_hwfn *p_hwfn,
 	ram_line_lo = 0;
 	ram_line_hi = 0;
 
+	/* Search no IP as GFT */
+	search_non_ip_as_gft = 0;
+
 	/* Tunnel type */
 	SET_FIELD(ram_line_lo, GFT_RAM_LINE_TUNNEL_DST_PORT, 1);
 	SET_FIELD(ram_line_lo, GFT_RAM_LINE_TUNNEL_OVER_IP_PROTOCOL, 1);
@@ -1337,8 +1340,13 @@ void qed_gft_config(struct qed_hwfn *p_hwfn,
 		SET_FIELD(ram_line_lo, GFT_RAM_LINE_ETHERTYPE, 1);
 	} else if (profile_type == GFT_PROFILE_TYPE_TUNNEL_TYPE) {
 		SET_FIELD(ram_line_lo, GFT_RAM_LINE_TUNNEL_ETHERTYPE, 1);
+
+		/* Allow tunneled traffic without inner IP */
+		search_non_ip_as_gft = 1;
 	}
 
+	qed_wr(p_hwfn,
+	       p_ptt, PRS_REG_SEARCH_NON_IP_AS_GFT, search_non_ip_as_gft);
 	qed_wr(p_hwfn,
 	       p_ptt,
 	       PRS_REG_GFT_PROFILE_MASK_RAM + RAM_LINE_SIZE * pf_id,
@@ -1509,3 +1517,43 @@ void qed_enable_context_validation(struct qed_hwfn *p_hwfn,
 	ctx_validation = CDU_VALIDATION_DEFAULT_CFG << 8;
 	qed_wr(p_hwfn, p_ptt, CDU_REG_TCFC_CTX_VALID0, ctx_validation);
 }
+
+static u32 qed_get_rdma_assert_ram_addr(struct qed_hwfn *p_hwfn, u8 storm_id)
+{
+	switch (storm_id) {
+	case 0:
+		return TSEM_REG_FAST_MEMORY + SEM_FAST_REG_INT_RAM +
+		    TSTORM_RDMA_ASSERT_LEVEL_OFFSET(p_hwfn->rel_pf_id);
+	case 1:
+		return MSEM_REG_FAST_MEMORY + SEM_FAST_REG_INT_RAM +
+		    MSTORM_RDMA_ASSERT_LEVEL_OFFSET(p_hwfn->rel_pf_id);
+	case 2:
+		return USEM_REG_FAST_MEMORY + SEM_FAST_REG_INT_RAM +
+		    USTORM_RDMA_ASSERT_LEVEL_OFFSET(p_hwfn->rel_pf_id);
+	case 3:
+		return XSEM_REG_FAST_MEMORY + SEM_FAST_REG_INT_RAM +
+		    XSTORM_RDMA_ASSERT_LEVEL_OFFSET(p_hwfn->rel_pf_id);
+	case 4:
+		return YSEM_REG_FAST_MEMORY + SEM_FAST_REG_INT_RAM +
+		    YSTORM_RDMA_ASSERT_LEVEL_OFFSET(p_hwfn->rel_pf_id);
+	case 5:
+		return PSEM_REG_FAST_MEMORY + SEM_FAST_REG_INT_RAM +
+		    PSTORM_RDMA_ASSERT_LEVEL_OFFSET(p_hwfn->rel_pf_id);
+
+	default:
+		return 0;
+	}
+}
+
+void qed_set_rdma_error_level(struct qed_hwfn *p_hwfn,
+			      struct qed_ptt *p_ptt,
+			      u8 assert_level[NUM_STORMS])
+{
+	u8 storm_id;
+
+	for (storm_id = 0; storm_id < NUM_STORMS; storm_id++) {
+		u32 ram_addr = qed_get_rdma_assert_ram_addr(p_hwfn, storm_id);
+
+		qed_wr(p_hwfn, p_ptt, ram_addr, assert_level[storm_id]);
+	}
+}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
index 474e6cff5b97..90a2b53096e2 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
@@ -1159,7 +1159,6 @@ int qed_iwarp_connect(void *rdma_cxt,
 	struct qed_iwarp_info *iwarp_info;
 	struct qed_iwarp_ep *ep;
 	u8 mpa_data_size = 0;
-	u8 ts_hdr_size = 0;
 	u32 cid;
 	int rc;
 
@@ -1218,10 +1217,7 @@ int qed_iwarp_connect(void *rdma_cxt,
 	       iparams->cm_info.private_data,
 	       iparams->cm_info.private_data_len);
 
-	if (p_hwfn->p_rdma_info->iwarp.tcp_flags & QED_IWARP_TS_EN)
-		ts_hdr_size = TIMESTAMP_HEADER_SIZE;
-
-	ep->mss = iparams->mss - ts_hdr_size;
+	ep->mss = iparams->mss;
 	ep->mss = min_t(u16, QED_IWARP_MAX_FW_MSS, ep->mss);
 
 	ep->event_cb = iparams->event_cb;
@@ -2337,7 +2333,6 @@ qed_iwarp_ll2_comp_syn_pkt(void *cxt, struct qed_ll2_comp_rx_data *data)
 	u8 local_mac_addr[ETH_ALEN];
 	struct qed_iwarp_ep *ep;
 	int tcp_start_offset;
-	u8 ts_hdr_size = 0;
 	u8 ll2_syn_handle;
 	int payload_len;
 	u32 hdr_size;
@@ -2415,11 +2410,7 @@ qed_iwarp_ll2_comp_syn_pkt(void *cxt, struct qed_ll2_comp_rx_data *data)
 
 	memcpy(&ep->cm_info, &cm_info, sizeof(ep->cm_info));
 
-	if (p_hwfn->p_rdma_info->iwarp.tcp_flags & QED_IWARP_TS_EN)
-		ts_hdr_size = TIMESTAMP_HEADER_SIZE;
-
-	hdr_size = ((cm_info.ip_version == QED_TCP_IPV4) ? 40 : 60) +
-		   ts_hdr_size;
+	hdr_size = ((cm_info.ip_version == QED_TCP_IPV4) ? 40 : 60);
 	ep->mss = p_hwfn->p_rdma_info->iwarp.max_mtu - hdr_size;
 	ep->mss = min_t(u16, QED_IWARP_MAX_FW_MSS, ep->mss);
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index eed4725761f5..1f6ac848109d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -586,6 +586,9 @@ qed_sp_update_accept_mode(struct qed_hwfn *p_hwfn,
 		SET_FIELD(state, ETH_VPORT_RX_MODE_BCAST_ACCEPT_ALL,
 			  !!(accept_filter & QED_ACCEPT_BCAST));
 
+		SET_FIELD(state, ETH_VPORT_RX_MODE_ACCEPT_ANY_VNI,
+			  !!(accept_filter & QED_ACCEPT_ANY_VNI));
+
 		p_ramrod->rx_mode.state = cpu_to_le16(state);
 		DP_VERBOSE(p_hwfn, QED_MSG_SP,
 			   "p_ramrod->rx_mode.state = 0x%x\n", state);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.h b/drivers/net/ethernet/qlogic/qed/qed_l2.h
index c4030e949cce..806a8da257e9 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.h
@@ -183,6 +183,7 @@ struct qed_filter_accept_flags {
 #define QED_ACCEPT_MCAST_MATCHED        0x08
 #define QED_ACCEPT_MCAST_UNMATCHED      0x10
 #define QED_ACCEPT_BCAST                0x20
+#define QED_ACCEPT_ANY_VNI              0x40
 };
 
 struct qed_arfs_config_params {
diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.c b/drivers/net/ethernet/qlogic/qed/qed_rdma.c
index b8705107a93a..101d677114f2 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_rdma.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.c
@@ -1508,11 +1508,8 @@ qed_rdma_register_tid(void *rdma_cxt,
 	case QED_RDMA_TID_FMR:
 		tid_type = RDMA_TID_FMR;
 		break;
-	case QED_RDMA_TID_MW_TYPE1:
-		tid_type = RDMA_TID_MW_TYPE1;
-		break;
-	case QED_RDMA_TID_MW_TYPE2A:
-		tid_type = RDMA_TID_MW_TYPE2A;
+	case QED_RDMA_TID_MW:
+		tid_type = RDMA_TID_MW;
 		break;
 	default:
 		rc = -EINVAL;
@@ -1544,7 +1541,6 @@ qed_rdma_register_tid(void *rdma_cxt,
 			  RDMA_REGISTER_TID_RAMROD_DATA_DIF_ON_HOST_FLG, 1);
 		DMA_REGPAIR_LE(p_ramrod->dif_error_addr,
 			       params->dif_error_addr);
-		DMA_REGPAIR_LE(p_ramrod->dif_runt_addr, params->dif_runt_addr);
 	}
 
 	rc = qed_spq_post(p_hwfn, p_ent, &fw_return_code);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
index f7122059b6b5..d8ad2dcad8d5 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
@@ -178,7 +178,7 @@
 	0x008c80UL
 #define  MCP_REG_SCRATCH	\
 	0xe20000UL
-#define  CNIG_REG_NW_PORT_MODE_BB_B0 \
+#define  CNIG_REG_NW_PORT_MODE_BB \
 	0x218200UL
 #define  MISCS_REG_CHIP_NUM \
 	0x00976cUL
@@ -1621,6 +1621,7 @@
 #define NIG_REG_TX_EDPM_CTRL_TX_EDPM_TC_EN_SHIFT 1
 
 #define PRS_REG_SEARCH_GFT 0x1f11bcUL
+#define PRS_REG_SEARCH_NON_IP_AS_GFT 0x1f11c0UL
 #define PRS_REG_CM_HDR_GFT 0x1f11c8UL
 #define PRS_REG_GFT_CAM 0x1f1100UL
 #define PRS_REG_GFT_PROFILE_MASK_RAM 0x1f1000UL
diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.c b/drivers/net/ethernet/qlogic/qed/qed_roce.c
index ee57fcdc698d..b5ce1581645f 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_roce.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_roce.c
@@ -681,7 +681,6 @@ static int qed_roce_sp_modify_requester(struct qed_hwfn *p_hwfn,
 
 static int qed_roce_sp_destroy_qp_responder(struct qed_hwfn *p_hwfn,
 					    struct qed_rdma_qp *qp,
-					    u32 *num_invalidated_mw,
 					    u32 *cq_prod)
 {
 	struct roce_destroy_qp_resp_output_params *p_ramrod_res;
@@ -692,8 +691,6 @@ static int qed_roce_sp_destroy_qp_responder(struct qed_hwfn *p_hwfn,
 	int rc;
 
 	DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", qp->icid);
-
-	*num_invalidated_mw = 0;
 	*cq_prod = qp->cq_prod;
 
 	if (!qp->resp_offloaded) {
@@ -742,7 +739,6 @@ static int qed_roce_sp_destroy_qp_responder(struct qed_hwfn *p_hwfn,
 	if (rc)
 		goto err;
 
-	*num_invalidated_mw = le32_to_cpu(p_ramrod_res->num_invalidated_mw);
 	*cq_prod = le32_to_cpu(p_ramrod_res->cq_prod);
 	qp->cq_prod = *cq_prod;
 
@@ -764,8 +760,7 @@ err:
 }
 
 static int qed_roce_sp_destroy_qp_requester(struct qed_hwfn *p_hwfn,
-					    struct qed_rdma_qp *qp,
-					    u32 *num_bound_mw)
+					    struct qed_rdma_qp *qp)
 {
 	struct roce_destroy_qp_req_output_params *p_ramrod_res;
 	struct roce_destroy_qp_req_ramrod_data *p_ramrod;
@@ -807,7 +802,6 @@ static int qed_roce_sp_destroy_qp_requester(struct qed_hwfn *p_hwfn,
 	if (rc)
 		goto err;
 
-	*num_bound_mw = le32_to_cpu(p_ramrod_res->num_bound_mw);
 
 	/* Free ORQ - only if ramrod succeeded, in case FW is still using it */
 	dma_free_coherent(&p_hwfn->cdev->pdev->dev,
@@ -968,8 +962,6 @@ err_resp:
 
 int qed_roce_destroy_qp(struct qed_hwfn *p_hwfn, struct qed_rdma_qp *qp)
 {
-	u32 num_invalidated_mw = 0;
-	u32 num_bound_mw = 0;
 	u32 cq_prod;
 	int rc;
 
@@ -984,22 +976,14 @@ int qed_roce_destroy_qp(struct qed_hwfn *p_hwfn, struct qed_rdma_qp *qp)
 
 	if (qp->cur_state != QED_ROCE_QP_STATE_RESET) {
 		rc = qed_roce_sp_destroy_qp_responder(p_hwfn, qp,
-						      &num_invalidated_mw,
 						      &cq_prod);
 		if (rc)
 			return rc;
 
 		/* Send destroy requester ramrod */
-		rc = qed_roce_sp_destroy_qp_requester(p_hwfn, qp,
-						      &num_bound_mw);
+		rc = qed_roce_sp_destroy_qp_requester(p_hwfn, qp);
 		if (rc)
 			return rc;
-
-		if (num_invalidated_mw != num_bound_mw) {
-			DP_NOTICE(p_hwfn,
-				  "number of invalidate memory windows is different from bounded ones\n");
-			return -EINVAL;
-		}
 	}
 
 	return 0;
@@ -1010,7 +994,6 @@ int qed_roce_modify_qp(struct qed_hwfn *p_hwfn,
 		       enum qed_roce_qp_state prev_state,
 		       struct qed_rdma_modify_qp_in_params *params)
 {
-	u32 num_invalidated_mw = 0, num_bound_mw = 0;
 	int rc = 0;
 
 	/* Perform additional operations according to the current state and the
@@ -1090,7 +1073,6 @@ int qed_roce_modify_qp(struct qed_hwfn *p_hwfn,
 		/* Send destroy responder ramrod */
 		rc = qed_roce_sp_destroy_qp_responder(p_hwfn,
 						      qp,
-						      &num_invalidated_mw,
 						      &cq_prod);
 
 		if (rc)
@@ -1098,14 +1080,7 @@ int qed_roce_modify_qp(struct qed_hwfn *p_hwfn,
 
 		qp->cq_prod = cq_prod;
 
-		rc = qed_roce_sp_destroy_qp_requester(p_hwfn, qp,
-						      &num_bound_mw);
-
-		if (num_invalidated_mw != num_bound_mw) {
-			DP_NOTICE(p_hwfn,
-				  "number of invalidate memory windows is different from bounded ones\n");
-			return -EINVAL;
-		}
+		rc = qed_roce_sp_destroy_qp_requester(p_hwfn, qp);
 	} else {
 		DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "0\n");
 	}
diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c
index ff21aa1fd939..2f0a4f2c5ff8 100644
--- a/drivers/scsi/qedi/qedi_iscsi.c
+++ b/drivers/scsi/qedi/qedi_iscsi.c
@@ -471,14 +471,8 @@ static u16 qedi_calc_mss(u16 pmtu, u8 is_ipv6, u8 tcp_ts_en, u8 vlan_en)
 	else
 		hdrs += IPV4_HDR_LEN;
 
-	if (vlan_en)
-		hdrs += VLAN_LEN;
-
 	mss = pmtu - hdrs;
 
-	if (tcp_ts_en)
-		mss -= TCP_OPTION_LEN;
-
 	if (!mss)
 		mss = DEF_MSS;
 
diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h
index 13c8ab171437..0081fa6d1268 100644
--- a/include/linux/qed/common_hsi.h
+++ b/include/linux/qed/common_hsi.h
@@ -109,8 +109,8 @@
 #define MAX_NUM_LL2_TX_STATS_COUNTERS	48
 
 #define FW_MAJOR_VERSION	8
-#define FW_MINOR_VERSION	33
-#define FW_REVISION_VERSION     11
+#define FW_MINOR_VERSION        37
+#define FW_REVISION_VERSION     2
 #define FW_ENGINEERING_VERSION	0
 
 /***********************/
diff --git a/include/linux/qed/iscsi_common.h b/include/linux/qed/iscsi_common.h
index 938df614cb6a..b34c573f2b30 100644
--- a/include/linux/qed/iscsi_common.h
+++ b/include/linux/qed/iscsi_common.h
@@ -799,8 +799,8 @@ struct e4_mstorm_iscsi_task_ag_ctx {
 #define E4_MSTORM_ISCSI_TASK_AG_CTX_CONNECTION_TYPE_SHIFT	0
 #define E4_MSTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_MASK		0x1
 #define E4_MSTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_SHIFT		4
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_BIT1_MASK			0x1
-#define E4_MSTORM_ISCSI_TASK_AG_CTX_BIT1_SHIFT			5
+#define E4_MSTORM_ISCSI_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_MASK	0x1
+#define E4_MSTORM_ISCSI_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_SHIFT	5
 #define E4_MSTORM_ISCSI_TASK_AG_CTX_VALID_MASK			0x1
 #define E4_MSTORM_ISCSI_TASK_AG_CTX_VALID_SHIFT			6
 #define E4_MSTORM_ISCSI_TASK_AG_CTX_TASK_CLEANUP_FLAG_MASK	0x1
@@ -849,8 +849,8 @@ struct e4_ustorm_iscsi_task_ag_ctx {
 #define E4_USTORM_ISCSI_TASK_AG_CTX_CONNECTION_TYPE_SHIFT	0
 #define E4_USTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_MASK		0x1
 #define E4_USTORM_ISCSI_TASK_AG_CTX_EXIST_IN_QM0_SHIFT		4
-#define E4_USTORM_ISCSI_TASK_AG_CTX_BIT1_MASK			0x1
-#define E4_USTORM_ISCSI_TASK_AG_CTX_BIT1_SHIFT			5
+#define E4_USTORM_ISCSI_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_MASK     0x1
+#define E4_USTORM_ISCSI_TASK_AG_CTX_CONN_CLEAR_SQ_FLAG_SHIFT    5
 #define E4_USTORM_ISCSI_TASK_AG_CTX_HQ_SCANNED_CF_MASK		0x3
 #define E4_USTORM_ISCSI_TASK_AG_CTX_HQ_SCANNED_CF_SHIFT		6
 	u8 flags1;
diff --git a/include/linux/qed/qed_rdma_if.h b/include/linux/qed/qed_rdma_if.h
index e05e320d28b7..df4d13f7e191 100644
--- a/include/linux/qed/qed_rdma_if.h
+++ b/include/linux/qed/qed_rdma_if.h
@@ -65,8 +65,7 @@ enum qed_roce_qp_state {
 enum qed_rdma_tid_type {
 	QED_RDMA_TID_REGISTERED_MR,
 	QED_RDMA_TID_FMR,
-	QED_RDMA_TID_MW_TYPE1,
-	QED_RDMA_TID_MW_TYPE2A
+	QED_RDMA_TID_MW
 };
 
 struct qed_rdma_events {
@@ -280,7 +279,6 @@ struct qed_rdma_register_tid_in_params {
 
 	bool dif_enabled;
 	u64 dif_error_addr;
-	u64 dif_runt_addr;
 };
 
 struct qed_rdma_create_cq_in_params {
diff --git a/include/linux/qed/roce_common.h b/include/linux/qed/roce_common.h
index 193bcef302e1..473fba76aa77 100644
--- a/include/linux/qed/roce_common.h
+++ b/include/linux/qed/roce_common.h
@@ -43,6 +43,7 @@
 #define ROCE_MAX_QPS			(32 * 1024)
 #define ROCE_DCQCN_NP_MAX_QPS		(64)
 #define ROCE_DCQCN_RP_MAX_QPS		(64)
+#define ROCE_LKEY_MW_DIF_EN_BIT		(28)
 
 /* Affiliated asynchronous events / errors enumeration */
 enum roce_async_events_type {
-- 
cgit v1.2.3-59-g8ed1b


From 6e86000c2c63123e174b7e198735fbb12f0258ea Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 5 Jun 2018 13:40:34 +0200
Subject: netfilter: provide udp*_lib_lookup for nf_tproxy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It is now possible to enable the libified nf_tproxy modules without
also enabling NETFILTER_XT_TARGET_TPROXY, which throws off the
ifdef logic in the udp core code:

net/ipv6/netfilter/nf_tproxy_ipv6.o: In function `nf_tproxy_get_sock_v6':
nf_tproxy_ipv6.c:(.text+0x1a8): undefined reference to `udp6_lib_lookup'
net/ipv4/netfilter/nf_tproxy_ipv4.o: In function `nf_tproxy_get_sock_v4':
nf_tproxy_ipv4.c:(.text+0x3d0): undefined reference to `udp4_lib_lookup'

We can actually simplify the conditions now to provide the two functions
exactly when they are needed.

Fixes: 45ca4e0cf273 ("netfilter: Libify xt_TPROXY")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Máté Eckl <ecklm94@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 4 +---
 net/ipv6/udp.c | 4 +---
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index d71f1f3e1155..05af6f81e7e5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -544,9 +544,7 @@ EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb);
 /* Must be called under rcu_read_lock().
  * Does increment socket refcount.
  */
-#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \
-    IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) || \
-    IS_ENABLED(CONFIG_NF_SOCKET_IPV4)
+#if IS_ENABLED(CONFIG_NF_TPROXY_IPV4) || IS_ENABLED(CONFIG_NF_SOCKET_IPV4)
 struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
 			     __be32 daddr, __be16 dport, int dif)
 {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 426c9d2b418d..e97372d0059b 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -285,9 +285,7 @@ EXPORT_SYMBOL_GPL(udp6_lib_lookup_skb);
 /* Must be called under rcu_read_lock().
  * Does increment socket refcount.
  */
-#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \
-    IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) || \
-    IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
+#if IS_ENABLED(CONFIG_NF_TPROXY_IPV6) || IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
 struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport,
 			     const struct in6_addr *daddr, __be16 dport, int dif)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 4f416db98beaef2f047709e4693b0a4c64d9bbf8 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 5 Jun 2018 13:38:21 +0200
Subject: net: hns3: remove unused hclgevf_cfg_func_mta_filter

The last patch apparently added a complete replacement for this
function, but left the old one in place, which now causes a
harmless warning:

drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c:731:12: 'hclgevf_cfg_func_mta_filter' defined but not used

I assume it can be removed.

Fixes: 3a678b5806e6 ("net: hns3: Optimize the VF's process of updating multicast MAC")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
index dd8e8e6718dc..bc8a5760d959 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
@@ -728,17 +728,6 @@ static void hclgevf_reset_tqp_stats(struct hnae3_handle *handle)
 	}
 }
 
-static int hclgevf_cfg_func_mta_filter(struct hnae3_handle *handle, bool en)
-{
-	struct hclgevf_dev *hdev = hclgevf_ae_get_hdev(handle);
-	u8 msg[2] = {0};
-
-	msg[0] = en;
-	return hclgevf_send_mbx_msg(hdev, HCLGE_MBX_SET_MULTICAST,
-				    HCLGE_MBX_MAC_VLAN_MC_FUNC_MTA_ENABLE,
-				    msg, 1, false, NULL, 0);
-}
-
 static int hclgevf_cfg_func_mta_type(struct hclgevf_dev *hdev)
 {
 	u8 resp_msg = HCLGEVF_MTA_TYPE_SEL_MAX;
-- 
cgit v1.2.3-59-g8ed1b


From 848235edb5c93ed086700584c8ff64f6d7fc778d Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Tue, 5 Jun 2018 15:01:59 +0200
Subject: ip6mr: only set ip6mr_table from setsockopt when ip6mr_new_table
 succeeds

Currently, raw6_sk(sk)->ip6mr_table is set unconditionally during
ip6_mroute_setsockopt(MRT6_TABLE). A subsequent attempt at the same
setsockopt will fail with -ENOENT, since we haven't actually created
that table.

A similar fix for ipv4 was included in commit 5e1859fbcc3c ("ipv4: ipmr:
various fixes and cleanups").

Fixes: d1db275dd3f6 ("ipv6: ip6mr: support multiple tables")
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6mr.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 20a419ee8000..5c57374850b3 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1760,7 +1760,8 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
 		ret = 0;
 		if (!ip6mr_new_table(net, v))
 			ret = -ENOMEM;
-		raw6_sk(sk)->ip6mr_table = v;
+		else
+			raw6_sk(sk)->ip6mr_table = v;
 		rtnl_unlock();
 		return ret;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From e783bb00ad86d9d1f01d9d3a750713070036358e Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Tue, 5 Jun 2018 15:02:00 +0200
Subject: ipmr: fix error path when ipmr_new_table fails

commit 0bbbf0e7d0e7 ("ipmr, ip6mr: Unite creation of new mr_table")
refactored ipmr_new_table, so that it now returns NULL when
mr_table_alloc fails. Unfortunately, all callers of ipmr_new_table
expect an ERR_PTR.

This can result in NULL deref, for example when ipmr_rules_exit calls
ipmr_free_table with NULL net->ipv4.mrt in the
!CONFIG_IP_MROUTE_MULTIPLE_TABLES version.

This patch makes mr_table_alloc return errors, and changes
ip6mr_new_table and its callers to return/expect error pointers as
well. It also removes the version of mr_table_alloc defined under
!CONFIG_IP_MROUTE_COMMON, since it is never used.

Fixes: 0bbbf0e7d0e7 ("ipmr, ip6mr: Unite creation of new mr_table")
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mroute_base.h | 10 ----------
 net/ipv4/ipmr_base.c        |  8 +++++---
 net/ipv6/ip6mr.c            | 18 ++++++++++++------
 3 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index d617fe45543e..d633f737b3c6 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -307,16 +307,6 @@ static inline void vif_device_init(struct vif_device *v,
 {
 }
 
-static inline void *
-mr_table_alloc(struct net *net, u32 id,
-	       struct mr_table_ops *ops,
-	       void (*expire_func)(struct timer_list *t),
-	       void (*table_set)(struct mr_table *mrt,
-				 struct net *net))
-{
-	return NULL;
-}
-
 static inline void *mr_mfc_find_parent(struct mr_table *mrt,
 				       void *hasharg, int parent)
 {
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 30221701614c..cafb0506c8c9 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -35,17 +35,19 @@ mr_table_alloc(struct net *net, u32 id,
 				 struct net *net))
 {
 	struct mr_table *mrt;
+	int err;
 
 	mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
 	if (!mrt)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 	mrt->id = id;
 	write_pnet(&mrt->net, net);
 
 	mrt->ops = *ops;
-	if (rhltable_init(&mrt->mfc_hash, mrt->ops.rht_params)) {
+	err = rhltable_init(&mrt->mfc_hash, mrt->ops.rht_params);
+	if (err) {
 		kfree(mrt);
-		return NULL;
+		return ERR_PTR(err);
 	}
 	INIT_LIST_HEAD(&mrt->mfc_cache_list);
 	INIT_LIST_HEAD(&mrt->mfc_unres_queue);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 5c57374850b3..058fc05e5708 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -228,8 +228,8 @@ static int __net_init ip6mr_rules_init(struct net *net)
 	INIT_LIST_HEAD(&net->ipv6.mr6_tables);
 
 	mrt = ip6mr_new_table(net, RT6_TABLE_DFLT);
-	if (!mrt) {
-		err = -ENOMEM;
+	if (IS_ERR(mrt)) {
+		err = PTR_ERR(mrt);
 		goto err1;
 	}
 
@@ -302,8 +302,13 @@ static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
 
 static int __net_init ip6mr_rules_init(struct net *net)
 {
-	net->ipv6.mrt6 = ip6mr_new_table(net, RT6_TABLE_DFLT);
-	return net->ipv6.mrt6 ? 0 : -ENOMEM;
+	struct mr_table *mrt;
+
+	mrt = ip6mr_new_table(net, RT6_TABLE_DFLT);
+	if (IS_ERR(mrt))
+		return PTR_ERR(mrt);
+	net->ipv6.mrt6 = mrt;
+	return 0;
 }
 
 static void __net_exit ip6mr_rules_exit(struct net *net)
@@ -1758,8 +1763,9 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
 
 		rtnl_lock();
 		ret = 0;
-		if (!ip6mr_new_table(net, v))
-			ret = -ENOMEM;
+		mrt = ip6mr_new_table(net, v);
+		if (IS_ERR(mrt))
+			ret = PTR_ERR(mrt);
 		else
 			raw6_sk(sk)->ip6mr_table = v;
 		rtnl_unlock();
-- 
cgit v1.2.3-59-g8ed1b


From 5b5e7a0de2bbf2a1afcd9f49e940010e9fb80d53 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 5 Jun 2018 06:06:19 -0700
Subject: net: metrics: add proper netlink validation

Before using nla_get_u32(), better make sure the attribute
is of the proper size.

Code recently was changed, but bug has been there from beginning
of git.

BUG: KMSAN: uninit-value in rtnetlink_put_metrics+0x553/0x960 net/core/rtnetlink.c:746
CPU: 1 PID: 14139 Comm: syz-executor6 Not tainted 4.17.0-rc5+ #103
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x185/0x1d0 lib/dump_stack.c:113
 kmsan_report+0x149/0x260 mm/kmsan/kmsan.c:1084
 __msan_warning_32+0x6e/0xc0 mm/kmsan/kmsan_instr.c:686
 rtnetlink_put_metrics+0x553/0x960 net/core/rtnetlink.c:746
 fib_dump_info+0xc42/0x2190 net/ipv4/fib_semantics.c:1361
 rtmsg_fib+0x65f/0x8c0 net/ipv4/fib_semantics.c:419
 fib_table_insert+0x2314/0x2b50 net/ipv4/fib_trie.c:1287
 inet_rtm_newroute+0x210/0x340 net/ipv4/fib_frontend.c:779
 rtnetlink_rcv_msg+0xa32/0x1560 net/core/rtnetlink.c:4646
 netlink_rcv_skb+0x378/0x600 net/netlink/af_netlink.c:2448
 rtnetlink_rcv+0x50/0x60 net/core/rtnetlink.c:4664
 netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline]
 netlink_unicast+0x1678/0x1750 net/netlink/af_netlink.c:1336
 netlink_sendmsg+0x104f/0x1350 net/netlink/af_netlink.c:1901
 sock_sendmsg_nosec net/socket.c:629 [inline]
 sock_sendmsg net/socket.c:639 [inline]
 ___sys_sendmsg+0xec0/0x1310 net/socket.c:2117
 __sys_sendmsg net/socket.c:2155 [inline]
 __do_sys_sendmsg net/socket.c:2164 [inline]
 __se_sys_sendmsg net/socket.c:2162 [inline]
 __x64_sys_sendmsg+0x331/0x460 net/socket.c:2162
 do_syscall_64+0x152/0x230 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x455a09
RSP: 002b:00007faae5fd8c68 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 00007faae5fd96d4 RCX: 0000000000455a09
RDX: 0000000000000000 RSI: 0000000020000000 RDI: 0000000000000013
RBP: 000000000072bea0 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff
R13: 00000000000005d0 R14: 00000000006fdc20 R15: 0000000000000000

Uninit was stored to memory at:
 kmsan_save_stack_with_flags mm/kmsan/kmsan.c:279 [inline]
 kmsan_save_stack mm/kmsan/kmsan.c:294 [inline]
 kmsan_internal_chain_origin+0x12b/0x210 mm/kmsan/kmsan.c:685
 __msan_chain_origin+0x69/0xc0 mm/kmsan/kmsan_instr.c:529
 fib_convert_metrics net/ipv4/fib_semantics.c:1056 [inline]
 fib_create_info+0x2d46/0x9dc0 net/ipv4/fib_semantics.c:1150
 fib_table_insert+0x3e4/0x2b50 net/ipv4/fib_trie.c:1146
 inet_rtm_newroute+0x210/0x340 net/ipv4/fib_frontend.c:779
 rtnetlink_rcv_msg+0xa32/0x1560 net/core/rtnetlink.c:4646
 netlink_rcv_skb+0x378/0x600 net/netlink/af_netlink.c:2448
 rtnetlink_rcv+0x50/0x60 net/core/rtnetlink.c:4664
 netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline]
 netlink_unicast+0x1678/0x1750 net/netlink/af_netlink.c:1336
 netlink_sendmsg+0x104f/0x1350 net/netlink/af_netlink.c:1901
 sock_sendmsg_nosec net/socket.c:629 [inline]
 sock_sendmsg net/socket.c:639 [inline]
 ___sys_sendmsg+0xec0/0x1310 net/socket.c:2117
 __sys_sendmsg net/socket.c:2155 [inline]
 __do_sys_sendmsg net/socket.c:2164 [inline]
 __se_sys_sendmsg net/socket.c:2162 [inline]
 __x64_sys_sendmsg+0x331/0x460 net/socket.c:2162
 do_syscall_64+0x152/0x230 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
Uninit was created at:
 kmsan_save_stack_with_flags mm/kmsan/kmsan.c:279 [inline]
 kmsan_internal_poison_shadow+0xb8/0x1b0 mm/kmsan/kmsan.c:189
 kmsan_kmalloc+0x94/0x100 mm/kmsan/kmsan.c:315
 kmsan_slab_alloc+0x10/0x20 mm/kmsan/kmsan.c:322
 slab_post_alloc_hook mm/slab.h:446 [inline]
 slab_alloc_node mm/slub.c:2753 [inline]
 __kmalloc_node_track_caller+0xb32/0x11b0 mm/slub.c:4395
 __kmalloc_reserve net/core/skbuff.c:138 [inline]
 __alloc_skb+0x2cb/0x9e0 net/core/skbuff.c:206
 alloc_skb include/linux/skbuff.h:988 [inline]
 netlink_alloc_large_skb net/netlink/af_netlink.c:1182 [inline]
 netlink_sendmsg+0x76e/0x1350 net/netlink/af_netlink.c:1876
 sock_sendmsg_nosec net/socket.c:629 [inline]
 sock_sendmsg net/socket.c:639 [inline]
 ___sys_sendmsg+0xec0/0x1310 net/socket.c:2117
 __sys_sendmsg net/socket.c:2155 [inline]
 __do_sys_sendmsg net/socket.c:2164 [inline]
 __se_sys_sendmsg net/socket.c:2162 [inline]
 __x64_sys_sendmsg+0x331/0x460 net/socket.c:2162
 do_syscall_64+0x152/0x230 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Fixes: a919525ad832 ("net: Move fib_convert_metrics to metrics file")
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 2 ++
 net/ipv4/metrics.c       | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 6608db23f54b..f3c89ccf14c5 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -717,6 +717,8 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
 			nla_strlcpy(tmp, nla, sizeof(tmp));
 			val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca);
 		} else {
+			if (nla_len(nla) != sizeof(u32))
+				return false;
 			val = nla_get_u32(nla);
 		}
 
diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c
index 5121c6475e6b..04311f7067e2 100644
--- a/net/ipv4/metrics.c
+++ b/net/ipv4/metrics.c
@@ -32,6 +32,8 @@ int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len,
 			if (val == TCP_CA_UNSPEC)
 				return -EINVAL;
 		} else {
+			if (nla_len(nla) != sizeof(u32))
+				return -EINVAL;
 			val = nla_get_u32(nla);
 		}
 		if (type == RTAX_ADVMSS && val > 65535 - 40)
-- 
cgit v1.2.3-59-g8ed1b


From ac0fc8a1bbcbe03ee67278afded105c05eb3535e Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 5 Jun 2018 08:14:09 -0700
Subject: devlink: Add extack to reload and port_{un, }split operations

Add extack argument to reload, port_split and port_unsplit operations.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/core.c       |  9 ++++++---
 drivers/net/ethernet/netronome/nfp/nfp_devlink.c |  5 +++--
 drivers/net/netdevsim/devlink.c                  |  3 ++-
 include/net/devlink.h                            |  7 ++++---
 net/core/devlink.c                               | 18 ++++++++++--------
 5 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index 8a766fe28fa0..7ed38d80bc08 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -770,7 +770,8 @@ static void mlxsw_core_driver_put(const char *kind)
 
 static int mlxsw_devlink_port_split(struct devlink *devlink,
 				    unsigned int port_index,
-				    unsigned int count)
+				    unsigned int count,
+				    struct netlink_ext_ack *extack)
 {
 	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
 
@@ -782,7 +783,8 @@ static int mlxsw_devlink_port_split(struct devlink *devlink,
 }
 
 static int mlxsw_devlink_port_unsplit(struct devlink *devlink,
-				      unsigned int port_index)
+				      unsigned int port_index,
+				      struct netlink_ext_ack *extack)
 {
 	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
 
@@ -963,7 +965,8 @@ mlxsw_devlink_sb_occ_tc_port_bind_get(struct devlink_port *devlink_port,
 						     pool_type, p_cur, p_max);
 }
 
-static int mlxsw_devlink_core_bus_device_reload(struct devlink *devlink)
+static int mlxsw_devlink_core_bus_device_reload(struct devlink *devlink,
+						struct netlink_ext_ack *extack)
 {
 	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
 	int err;
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
index 71c2edd83031..db463e20a876 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c
@@ -92,7 +92,7 @@ nfp_devlink_set_lanes(struct nfp_pf *pf, unsigned int idx, unsigned int lanes)
 
 static int
 nfp_devlink_port_split(struct devlink *devlink, unsigned int port_index,
-		       unsigned int count)
+		       unsigned int count, struct netlink_ext_ack *extack)
 {
 	struct nfp_pf *pf = devlink_priv(devlink);
 	struct nfp_eth_table_port eth_port;
@@ -123,7 +123,8 @@ out:
 }
 
 static int
-nfp_devlink_port_unsplit(struct devlink *devlink, unsigned int port_index)
+nfp_devlink_port_unsplit(struct devlink *devlink, unsigned int port_index,
+			 struct netlink_ext_ack *extack)
 {
 	struct nfp_pf *pf = devlink_priv(devlink);
 	struct nfp_eth_table_port eth_port;
diff --git a/drivers/net/netdevsim/devlink.c b/drivers/net/netdevsim/devlink.c
index bef7db5d129a..e8366cf372ff 100644
--- a/drivers/net/netdevsim/devlink.c
+++ b/drivers/net/netdevsim/devlink.c
@@ -147,7 +147,8 @@ out:
 	return err;
 }
 
-static int nsim_devlink_reload(struct devlink *devlink)
+static int nsim_devlink_reload(struct devlink *devlink,
+			       struct netlink_ext_ack *extack)
 {
 	enum nsim_resource_id res_ids[] = {
 		NSIM_RESOURCE_IPV4_FIB, NSIM_RESOURCE_IPV4_FIB_RULES,
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 9686a1aa4ec9..e336ea9c73df 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -296,12 +296,13 @@ struct devlink_resource {
 #define DEVLINK_RESOURCE_ID_PARENT_TOP 0
 
 struct devlink_ops {
-	int (*reload)(struct devlink *devlink);
+	int (*reload)(struct devlink *devlink, struct netlink_ext_ack *extack);
 	int (*port_type_set)(struct devlink_port *devlink_port,
 			     enum devlink_port_type port_type);
 	int (*port_split)(struct devlink *devlink, unsigned int port_index,
-			  unsigned int count);
-	int (*port_unsplit)(struct devlink *devlink, unsigned int port_index);
+			  unsigned int count, struct netlink_ext_ack *extack);
+	int (*port_unsplit)(struct devlink *devlink, unsigned int port_index,
+			    struct netlink_ext_ack *extack);
 	int (*sb_pool_get)(struct devlink *devlink, unsigned int sb_index,
 			   u16 pool_index,
 			   struct devlink_sb_pool_info *pool_info);
diff --git a/net/core/devlink.c b/net/core/devlink.c
index f75ee022e6b2..22099705cc41 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -702,12 +702,13 @@ static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb,
 	return 0;
 }
 
-static int devlink_port_split(struct devlink *devlink,
-			      u32 port_index, u32 count)
+static int devlink_port_split(struct devlink *devlink, u32 port_index,
+			      u32 count, struct netlink_ext_ack *extack)
 
 {
 	if (devlink->ops && devlink->ops->port_split)
-		return devlink->ops->port_split(devlink, port_index, count);
+		return devlink->ops->port_split(devlink, port_index, count,
+						extack);
 	return -EOPNOTSUPP;
 }
 
@@ -724,14 +725,15 @@ static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb,
 
 	port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
 	count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]);
-	return devlink_port_split(devlink, port_index, count);
+	return devlink_port_split(devlink, port_index, count, info->extack);
 }
 
-static int devlink_port_unsplit(struct devlink *devlink, u32 port_index)
+static int devlink_port_unsplit(struct devlink *devlink, u32 port_index,
+				struct netlink_ext_ack *extack)
 
 {
 	if (devlink->ops && devlink->ops->port_unsplit)
-		return devlink->ops->port_unsplit(devlink, port_index);
+		return devlink->ops->port_unsplit(devlink, port_index, extack);
 	return -EOPNOTSUPP;
 }
 
@@ -745,7 +747,7 @@ static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb,
 		return -EINVAL;
 
 	port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
-	return devlink_port_unsplit(devlink, port_index);
+	return devlink_port_unsplit(devlink, port_index, info->extack);
 }
 
 static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink,
@@ -2599,7 +2601,7 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
 		NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed");
 		return err;
 	}
-	return devlink->ops->reload(devlink);
+	return devlink->ops->reload(devlink, info->extack);
 }
 
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
-- 
cgit v1.2.3-59-g8ed1b


From 7fa76d777ec53eeece1546b737a3b93b37639575 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 5 Jun 2018 08:14:10 -0700
Subject: netdevsim: Add extack error message for devlink reload

devlink reset command can fail if a FIB resource limit is set to a value
lower than the current occupancy. Return a proper message indicating the
reason for the failure.

$ devlink resource sh netdevsim/netdevsim0
netdevsim/netdevsim0:
  name IPv4 size unlimited unit entry size_min 0 size_max unlimited size_gran 1 dpipe_tables none
    resources:
      name fib size unlimited occ 43 unit entry size_min 0 size_max unlimited size_gran 1 dpipe_tables none
      name fib-rules size unlimited occ 4 unit entry size_min 0 size_max unlimited size_gran 1 dpipe_tables none
  name IPv6 size unlimited unit entry size_min 0 size_max unlimited size_gran 1 dpipe_tables none
    resources:
      name fib size unlimited occ 54 unit entry size_min 0 size_max unlimited size_gran 1 dpipe_tables none
      name fib-rules size unlimited occ 3 unit entry size_min 0 size_max unlimited size_gran 1 dpipe_tables none

$ devlink resource set netdevsim/netdevsim0 path /IPv4/fib size 40

$ devlink dev  reload netdevsim/netdevsim0
Error: netdevsim: New size is less than current occupancy.
devlink answers: Invalid argument

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/netdevsim/devlink.c   | 4 ++--
 drivers/net/netdevsim/fib.c       | 9 ++++++---
 drivers/net/netdevsim/netdevsim.h | 3 ++-
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/net/netdevsim/devlink.c b/drivers/net/netdevsim/devlink.c
index e8366cf372ff..ba663e5af168 100644
--- a/drivers/net/netdevsim/devlink.c
+++ b/drivers/net/netdevsim/devlink.c
@@ -163,7 +163,7 @@ static int nsim_devlink_reload(struct devlink *devlink,
 
 		err = devlink_resource_size_get(devlink, res_ids[i], &val);
 		if (!err) {
-			err = nsim_fib_set_max(net, res_ids[i], val);
+			err = nsim_fib_set_max(net, res_ids[i], val, extack);
 			if (err)
 				return err;
 		}
@@ -181,7 +181,7 @@ static void nsim_devlink_net_reset(struct net *net)
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(res_ids); ++i) {
-		if (nsim_fib_set_max(net, res_ids[i], (u64)-1)) {
+		if (nsim_fib_set_max(net, res_ids[i], (u64)-1, NULL)) {
 			pr_err("Failed to reset limit for resource %u\n",
 			       res_ids[i]);
 		}
diff --git a/drivers/net/netdevsim/fib.c b/drivers/net/netdevsim/fib.c
index 9bfe9e151e13..f61d094746c0 100644
--- a/drivers/net/netdevsim/fib.c
+++ b/drivers/net/netdevsim/fib.c
@@ -64,7 +64,8 @@ u64 nsim_fib_get_val(struct net *net, enum nsim_resource_id res_id, bool max)
 	return max ? entry->max : entry->num;
 }
 
-int nsim_fib_set_max(struct net *net, enum nsim_resource_id res_id, u64 val)
+int nsim_fib_set_max(struct net *net, enum nsim_resource_id res_id, u64 val,
+		     struct netlink_ext_ack *extack)
 {
 	struct nsim_fib_data *fib_data = net_generic(net, nsim_fib_net_id);
 	struct nsim_fib_entry *entry;
@@ -90,10 +91,12 @@ int nsim_fib_set_max(struct net *net, enum nsim_resource_id res_id, u64 val)
 	/* not allowing a new max to be less than curren occupancy
 	 * --> no means of evicting entries
 	 */
-	if (val < entry->num)
+	if (val < entry->num) {
+		NL_SET_ERR_MSG_MOD(extack, "New size is less than current occupancy");
 		err = -EINVAL;
-	else
+	} else {
 		entry->max = val;
+	}
 
 	return err;
 }
diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h
index 3a8581af3b85..8ca50b72c328 100644
--- a/drivers/net/netdevsim/netdevsim.h
+++ b/drivers/net/netdevsim/netdevsim.h
@@ -126,7 +126,8 @@ void nsim_devlink_exit(void);
 int nsim_fib_init(void);
 void nsim_fib_exit(void);
 u64 nsim_fib_get_val(struct net *net, enum nsim_resource_id res_id, bool max);
-int nsim_fib_set_max(struct net *net, enum nsim_resource_id res_id, u64 val);
+int nsim_fib_set_max(struct net *net, enum nsim_resource_id res_id, u64 val,
+		     struct netlink_ext_ack *extack);
 #else
 static inline int nsim_devlink_setup(struct netdevsim *ns)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 3fcc773be62a9c42dc9a5c1108da298fb9f66cfa Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 5 Jun 2018 08:14:11 -0700
Subject: mlxsw: Add extack messages for port_{un, }split failures

Return messages in extack for port split/unsplit errors. e.g.,
    $ devlink port split swp1s1 count 4
    Error: mlxsw_spectrum: Port cannot be split further.
    devlink answers: Invalid argument

    $ devlink port unsplit swp4
    Error: mlxsw_spectrum: Port was not split.
    devlink answers: Invalid argument

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/core.c     | 14 ++++++++++----
 drivers/net/ethernet/mellanox/mlxsw/core.h     |  5 +++--
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 15 ++++++++++++---
 3 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index 7ed38d80bc08..f9c724752a32 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -775,11 +775,14 @@ static int mlxsw_devlink_port_split(struct devlink *devlink,
 {
 	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
 
-	if (port_index >= mlxsw_core->max_ports)
+	if (port_index >= mlxsw_core->max_ports) {
+		NL_SET_ERR_MSG_MOD(extack, "Port index exceeds maximum number of ports");
 		return -EINVAL;
+	}
 	if (!mlxsw_core->driver->port_split)
 		return -EOPNOTSUPP;
-	return mlxsw_core->driver->port_split(mlxsw_core, port_index, count);
+	return mlxsw_core->driver->port_split(mlxsw_core, port_index, count,
+					      extack);
 }
 
 static int mlxsw_devlink_port_unsplit(struct devlink *devlink,
@@ -788,11 +791,14 @@ static int mlxsw_devlink_port_unsplit(struct devlink *devlink,
 {
 	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
 
-	if (port_index >= mlxsw_core->max_ports)
+	if (port_index >= mlxsw_core->max_ports) {
+		NL_SET_ERR_MSG_MOD(extack, "Port index exceeds maximum number of ports");
 		return -EINVAL;
+	}
 	if (!mlxsw_core->driver->port_unsplit)
 		return -EOPNOTSUPP;
-	return mlxsw_core->driver->port_unsplit(mlxsw_core, port_index);
+	return mlxsw_core->driver->port_unsplit(mlxsw_core, port_index,
+						extack);
 }
 
 static int
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
index 4a8d4c7f89d9..552cfa29c2f7 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -274,8 +274,9 @@ struct mlxsw_driver {
 	int (*port_type_set)(struct mlxsw_core *mlxsw_core, u8 local_port,
 			     enum devlink_port_type new_type);
 	int (*port_split)(struct mlxsw_core *mlxsw_core, u8 local_port,
-			  unsigned int count);
-	int (*port_unsplit)(struct mlxsw_core *mlxsw_core, u8 local_port);
+			  unsigned int count, struct netlink_ext_ack *extack);
+	int (*port_unsplit)(struct mlxsw_core *mlxsw_core, u8 local_port,
+			    struct netlink_ext_ack *extack);
 	int (*sb_pool_get)(struct mlxsw_core *mlxsw_core,
 			   unsigned int sb_index, u16 pool_index,
 			   struct devlink_sb_pool_info *pool_info);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index fc39f22e5c70..968b88af2ef5 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -3092,7 +3092,8 @@ static void mlxsw_sp_port_unsplit_create(struct mlxsw_sp *mlxsw_sp,
 }
 
 static int mlxsw_sp_port_split(struct mlxsw_core *mlxsw_core, u8 local_port,
-			       unsigned int count)
+			       unsigned int count,
+			       struct netlink_ext_ack *extack)
 {
 	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
 	struct mlxsw_sp_port *mlxsw_sp_port;
@@ -3104,6 +3105,7 @@ static int mlxsw_sp_port_split(struct mlxsw_core *mlxsw_core, u8 local_port,
 	if (!mlxsw_sp_port) {
 		dev_err(mlxsw_sp->bus_info->dev, "Port number \"%d\" does not exist\n",
 			local_port);
+		NL_SET_ERR_MSG_MOD(extack, "Port number does not exist");
 		return -EINVAL;
 	}
 
@@ -3112,11 +3114,13 @@ static int mlxsw_sp_port_split(struct mlxsw_core *mlxsw_core, u8 local_port,
 
 	if (count != 2 && count != 4) {
 		netdev_err(mlxsw_sp_port->dev, "Port can only be split into 2 or 4 ports\n");
+		NL_SET_ERR_MSG_MOD(extack, "Port can only be split into 2 or 4 ports");
 		return -EINVAL;
 	}
 
 	if (cur_width != MLXSW_PORT_MODULE_MAX_WIDTH) {
 		netdev_err(mlxsw_sp_port->dev, "Port cannot be split further\n");
+		NL_SET_ERR_MSG_MOD(extack, "Port cannot be split further");
 		return -EINVAL;
 	}
 
@@ -3125,6 +3129,7 @@ static int mlxsw_sp_port_split(struct mlxsw_core *mlxsw_core, u8 local_port,
 		base_port = local_port;
 		if (mlxsw_sp->ports[base_port + 1]) {
 			netdev_err(mlxsw_sp_port->dev, "Invalid split configuration\n");
+			NL_SET_ERR_MSG_MOD(extack, "Invalid split configuration");
 			return -EINVAL;
 		}
 	} else {
@@ -3132,6 +3137,7 @@ static int mlxsw_sp_port_split(struct mlxsw_core *mlxsw_core, u8 local_port,
 		if (mlxsw_sp->ports[base_port + 1] ||
 		    mlxsw_sp->ports[base_port + 3]) {
 			netdev_err(mlxsw_sp_port->dev, "Invalid split configuration\n");
+			NL_SET_ERR_MSG_MOD(extack, "Invalid split configuration");
 			return -EINVAL;
 		}
 	}
@@ -3153,7 +3159,8 @@ err_port_split_create:
 	return err;
 }
 
-static int mlxsw_sp_port_unsplit(struct mlxsw_core *mlxsw_core, u8 local_port)
+static int mlxsw_sp_port_unsplit(struct mlxsw_core *mlxsw_core, u8 local_port,
+				 struct netlink_ext_ack *extack)
 {
 	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
 	struct mlxsw_sp_port *mlxsw_sp_port;
@@ -3165,11 +3172,13 @@ static int mlxsw_sp_port_unsplit(struct mlxsw_core *mlxsw_core, u8 local_port)
 	if (!mlxsw_sp_port) {
 		dev_err(mlxsw_sp->bus_info->dev, "Port number \"%d\" does not exist\n",
 			local_port);
+		NL_SET_ERR_MSG_MOD(extack, "Port number does not exist");
 		return -EINVAL;
 	}
 
 	if (!mlxsw_sp_port->split) {
-		netdev_err(mlxsw_sp_port->dev, "Port wasn't split\n");
+		netdev_err(mlxsw_sp_port->dev, "Port was not split\n");
+		NL_SET_ERR_MSG_MOD(extack, "Port was not split");
 		return -EINVAL;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 644c7eebbfd59e72982d11ec6cc7d39af12450ae Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 5 Jun 2018 09:25:19 -0700
Subject: rtnetlink: validate attributes in do_setlink()

It seems that rtnl_group_changelink() can call do_setlink
while a prior call to validate_linkmsg(dev = NULL, ...) could
not validate IFLA_ADDRESS / IFLA_BROADCAST

Make sure do_setlink() calls validate_linkmsg() instead
of letting its callers having this responsibility.

With help from Dmitry Vyukov, thanks a lot !

BUG: KMSAN: uninit-value in is_valid_ether_addr include/linux/etherdevice.h:199 [inline]
BUG: KMSAN: uninit-value in eth_prepare_mac_addr_change net/ethernet/eth.c:275 [inline]
BUG: KMSAN: uninit-value in eth_mac_addr+0x203/0x2b0 net/ethernet/eth.c:308
CPU: 1 PID: 8695 Comm: syz-executor3 Not tainted 4.17.0-rc5+ #103
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x185/0x1d0 lib/dump_stack.c:113
 kmsan_report+0x149/0x260 mm/kmsan/kmsan.c:1084
 __msan_warning_32+0x6e/0xc0 mm/kmsan/kmsan_instr.c:686
 is_valid_ether_addr include/linux/etherdevice.h:199 [inline]
 eth_prepare_mac_addr_change net/ethernet/eth.c:275 [inline]
 eth_mac_addr+0x203/0x2b0 net/ethernet/eth.c:308
 dev_set_mac_address+0x261/0x530 net/core/dev.c:7157
 do_setlink+0xbc3/0x5fc0 net/core/rtnetlink.c:2317
 rtnl_group_changelink net/core/rtnetlink.c:2824 [inline]
 rtnl_newlink+0x1fe9/0x37a0 net/core/rtnetlink.c:2976
 rtnetlink_rcv_msg+0xa32/0x1560 net/core/rtnetlink.c:4646
 netlink_rcv_skb+0x378/0x600 net/netlink/af_netlink.c:2448
 rtnetlink_rcv+0x50/0x60 net/core/rtnetlink.c:4664
 netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline]
 netlink_unicast+0x1678/0x1750 net/netlink/af_netlink.c:1336
 netlink_sendmsg+0x104f/0x1350 net/netlink/af_netlink.c:1901
 sock_sendmsg_nosec net/socket.c:629 [inline]
 sock_sendmsg net/socket.c:639 [inline]
 ___sys_sendmsg+0xec0/0x1310 net/socket.c:2117
 __sys_sendmsg net/socket.c:2155 [inline]
 __do_sys_sendmsg net/socket.c:2164 [inline]
 __se_sys_sendmsg net/socket.c:2162 [inline]
 __x64_sys_sendmsg+0x331/0x460 net/socket.c:2162
 do_syscall_64+0x152/0x230 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x455a09
RSP: 002b:00007fc07480ec68 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 00007fc07480f6d4 RCX: 0000000000455a09
RDX: 0000000000000000 RSI: 00000000200003c0 RDI: 0000000000000014
RBP: 000000000072bea0 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff
R13: 00000000000005d0 R14: 00000000006fdc20 R15: 0000000000000000

Uninit was stored to memory at:
 kmsan_save_stack_with_flags mm/kmsan/kmsan.c:279 [inline]
 kmsan_save_stack mm/kmsan/kmsan.c:294 [inline]
 kmsan_internal_chain_origin+0x12b/0x210 mm/kmsan/kmsan.c:685
 kmsan_memcpy_origins+0x11d/0x170 mm/kmsan/kmsan.c:527
 __msan_memcpy+0x109/0x160 mm/kmsan/kmsan_instr.c:478
 do_setlink+0xb84/0x5fc0 net/core/rtnetlink.c:2315
 rtnl_group_changelink net/core/rtnetlink.c:2824 [inline]
 rtnl_newlink+0x1fe9/0x37a0 net/core/rtnetlink.c:2976
 rtnetlink_rcv_msg+0xa32/0x1560 net/core/rtnetlink.c:4646
 netlink_rcv_skb+0x378/0x600 net/netlink/af_netlink.c:2448
 rtnetlink_rcv+0x50/0x60 net/core/rtnetlink.c:4664
 netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline]
 netlink_unicast+0x1678/0x1750 net/netlink/af_netlink.c:1336
 netlink_sendmsg+0x104f/0x1350 net/netlink/af_netlink.c:1901
 sock_sendmsg_nosec net/socket.c:629 [inline]
 sock_sendmsg net/socket.c:639 [inline]
 ___sys_sendmsg+0xec0/0x1310 net/socket.c:2117
 __sys_sendmsg net/socket.c:2155 [inline]
 __do_sys_sendmsg net/socket.c:2164 [inline]
 __se_sys_sendmsg net/socket.c:2162 [inline]
 __x64_sys_sendmsg+0x331/0x460 net/socket.c:2162
 do_syscall_64+0x152/0x230 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
Uninit was created at:
 kmsan_save_stack_with_flags mm/kmsan/kmsan.c:279 [inline]
 kmsan_internal_poison_shadow+0xb8/0x1b0 mm/kmsan/kmsan.c:189
 kmsan_kmalloc+0x94/0x100 mm/kmsan/kmsan.c:315
 kmsan_slab_alloc+0x10/0x20 mm/kmsan/kmsan.c:322
 slab_post_alloc_hook mm/slab.h:446 [inline]
 slab_alloc_node mm/slub.c:2753 [inline]
 __kmalloc_node_track_caller+0xb32/0x11b0 mm/slub.c:4395
 __kmalloc_reserve net/core/skbuff.c:138 [inline]
 __alloc_skb+0x2cb/0x9e0 net/core/skbuff.c:206
 alloc_skb include/linux/skbuff.h:988 [inline]
 netlink_alloc_large_skb net/netlink/af_netlink.c:1182 [inline]
 netlink_sendmsg+0x76e/0x1350 net/netlink/af_netlink.c:1876
 sock_sendmsg_nosec net/socket.c:629 [inline]
 sock_sendmsg net/socket.c:639 [inline]
 ___sys_sendmsg+0xec0/0x1310 net/socket.c:2117
 __sys_sendmsg net/socket.c:2155 [inline]
 __do_sys_sendmsg net/socket.c:2164 [inline]
 __se_sys_sendmsg net/socket.c:2162 [inline]
 __x64_sys_sendmsg+0x331/0x460 net/socket.c:2162
 do_syscall_64+0x152/0x230 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Fixes: e7ed828f10bd ("netlink: support setting devgroup parameters")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 9a1ba2015ad8..5ef61222fdef 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2266,6 +2266,10 @@ static int do_setlink(const struct sk_buff *skb,
 	const struct net_device_ops *ops = dev->netdev_ops;
 	int err;
 
+	err = validate_linkmsg(dev, tb);
+	if (err < 0)
+		return err;
+
 	if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_IF_NETNSID]) {
 		struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev),
 							    tb, CAP_NET_ADMIN);
@@ -2629,10 +2633,6 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 		goto errout;
 	}
 
-	err = validate_linkmsg(dev, tb);
-	if (err < 0)
-		goto errout;
-
 	err = do_setlink(skb, dev, ifm, extack, tb, ifname, 0);
 errout:
 	return err;
-- 
cgit v1.2.3-59-g8ed1b


From 75d4e704fa8d2cf33ff295e5b441317603d7f9fd Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Tue, 5 Jun 2018 09:48:13 -0700
Subject: netdev-FAQ: clarify DaveM's position for stable backports

Per discussion with David at netconf 2018, let's clarify
DaveM's position of handling stable backports in netdev-FAQ.

This is important for people relying on upstream -stable
releases.

Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/netdev-FAQ.txt | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/Documentation/networking/netdev-FAQ.txt b/Documentation/networking/netdev-FAQ.txt
index 2a3278d5cf35..fa951b820b25 100644
--- a/Documentation/networking/netdev-FAQ.txt
+++ b/Documentation/networking/netdev-FAQ.txt
@@ -179,6 +179,15 @@ A: No.  See above answer.  In short, if you think it really belongs in
    dash marker line as described in Documentation/process/submitting-patches.rst to
    temporarily embed that information into the patch that you send.
 
+Q: Are all networking bug fixes backported to all stable releases?
+
+A: Due to capacity, Dave could only take care of the backports for the last
+   2 stable releases. For earlier stable releases, each stable branch maintainer
+   is supposed to take care of them. If you find any patch is missing from an
+   earlier stable branch, please notify stable@vger.kernel.org with either a
+   commit ID or a formal patch backported, and CC Dave and other relevant
+   networking developers.
+
 Q: Someone said that the comment style and coding convention is different
    for the networking content.  Is this true?
 
-- 
cgit v1.2.3-59-g8ed1b


From 4016a7f15efc9189f0ce18025fb3306a8b5f9195 Mon Sep 17 00:00:00 2001
From: Govindarajulu Varadarajan <gvaradar@cisco.com>
Date: Tue, 5 Jun 2018 10:14:57 -0700
Subject: enic: fix UDP rss bits

In commit 48398b6e7065 ("enic: set UDP rss flag") driver needed to set a
single bit to enable UDP rss. This is changed to two bit. One for UDP
IPv4 and other bit for UDP IPv6. The hardware which supports this is not
released yet. When released, driver should set 2 bit to enable UDP rss for
both IPv4 and IPv6.

Also add spinlock around vnic_dev_capable_rss_hash_type().

Signed-off-by: Govindarajulu Varadarajan <gvaradar@cisco.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/cisco/enic/enic_ethtool.c | 18 ++++++++++++++----
 drivers/net/ethernet/cisco/enic/enic_main.c    | 20 ++++++++++++++------
 drivers/net/ethernet/cisco/enic/enic_res.c     |  7 ++++++-
 drivers/net/ethernet/cisco/enic/vnic_dev.c     | 18 +++++++++++-------
 drivers/net/ethernet/cisco/enic/vnic_dev.h     |  2 +-
 drivers/net/ethernet/cisco/enic/vnic_devcmd.h  | 20 +++++++++++++++++++-
 drivers/net/ethernet/cisco/enic/vnic_nic.h     |  3 ++-
 7 files changed, 67 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/cisco/enic/enic_ethtool.c b/drivers/net/ethernet/cisco/enic/enic_ethtool.c
index 869006c2002d..f42f7a6e1559 100644
--- a/drivers/net/ethernet/cisco/enic/enic_ethtool.c
+++ b/drivers/net/ethernet/cisco/enic/enic_ethtool.c
@@ -476,18 +476,28 @@ static int enic_grxclsrule(struct enic *enic, struct ethtool_rxnfc *cmd)
 
 static int enic_get_rx_flow_hash(struct enic *enic, struct ethtool_rxnfc *cmd)
 {
+	u8 rss_hash_type = 0;
 	cmd->data = 0;
 
+	spin_lock_bh(&enic->devcmd_lock);
+	(void)vnic_dev_capable_rss_hash_type(enic->vdev, &rss_hash_type);
+	spin_unlock_bh(&enic->devcmd_lock);
 	switch (cmd->flow_type) {
 	case TCP_V6_FLOW:
 	case TCP_V4_FLOW:
-		cmd->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
-		/* Fall through */
+		cmd->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3 |
+			     RXH_IP_SRC | RXH_IP_DST;
+		break;
 	case UDP_V6_FLOW:
+		cmd->data |= RXH_IP_SRC | RXH_IP_DST;
+		if (rss_hash_type & NIC_CFG_RSS_HASH_TYPE_UDP_IPV6)
+			cmd->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
+		break;
 	case UDP_V4_FLOW:
-		if (vnic_dev_capable_udp_rss(enic->vdev))
+		cmd->data |= RXH_IP_SRC | RXH_IP_DST;
+		if (rss_hash_type & NIC_CFG_RSS_HASH_TYPE_UDP_IPV4)
 			cmd->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
-		/* Fall through */
+		break;
 	case SCTP_V4_FLOW:
 	case AH_ESP_V4_FLOW:
 	case AH_V4_FLOW:
diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c
index 8a8b12b720ef..30d2eaa18c04 100644
--- a/drivers/net/ethernet/cisco/enic/enic_main.c
+++ b/drivers/net/ethernet/cisco/enic/enic_main.c
@@ -2320,16 +2320,24 @@ static int enic_set_rss_nic_cfg(struct enic *enic)
 {
 	struct device *dev = enic_get_dev(enic);
 	const u8 rss_default_cpu = 0;
-	u8 rss_hash_type = NIC_CFG_RSS_HASH_TYPE_IPV4 |
-		NIC_CFG_RSS_HASH_TYPE_TCP_IPV4 |
-		NIC_CFG_RSS_HASH_TYPE_IPV6 |
-		NIC_CFG_RSS_HASH_TYPE_TCP_IPV6;
 	const u8 rss_hash_bits = 7;
 	const u8 rss_base_cpu = 0;
+	u8 rss_hash_type;
+	int res;
 	u8 rss_enable = ENIC_SETTING(enic, RSS) && (enic->rq_count > 1);
 
-	if (vnic_dev_capable_udp_rss(enic->vdev))
-		rss_hash_type |= NIC_CFG_RSS_HASH_TYPE_UDP;
+	spin_lock_bh(&enic->devcmd_lock);
+	res = vnic_dev_capable_rss_hash_type(enic->vdev, &rss_hash_type);
+	spin_unlock_bh(&enic->devcmd_lock);
+	if (res) {
+		/* defaults for old adapters
+		 */
+		rss_hash_type = NIC_CFG_RSS_HASH_TYPE_IPV4	|
+				NIC_CFG_RSS_HASH_TYPE_TCP_IPV4	|
+				NIC_CFG_RSS_HASH_TYPE_IPV6	|
+				NIC_CFG_RSS_HASH_TYPE_TCP_IPV6;
+	}
+
 	if (rss_enable) {
 		if (!enic_set_rsskey(enic)) {
 			if (enic_set_rsscpu(enic, rss_hash_bits)) {
diff --git a/drivers/net/ethernet/cisco/enic/enic_res.c b/drivers/net/ethernet/cisco/enic/enic_res.c
index 9c96911fb2c8..40b20817ddd5 100644
--- a/drivers/net/ethernet/cisco/enic/enic_res.c
+++ b/drivers/net/ethernet/cisco/enic/enic_res.c
@@ -149,6 +149,7 @@ int enic_set_nic_cfg(struct enic *enic, u8 rss_default_cpu, u8 rss_hash_type,
 	u8 rss_hash_bits, u8 rss_base_cpu, u8 rss_enable, u8 tso_ipid_split_en,
 	u8 ig_vlan_strip_en)
 {
+	enum vnic_devcmd_cmd cmd = CMD_NIC_CFG;
 	u64 a0, a1;
 	u32 nic_cfg;
 	int wait = 1000;
@@ -160,7 +161,11 @@ int enic_set_nic_cfg(struct enic *enic, u8 rss_default_cpu, u8 rss_hash_type,
 	a0 = nic_cfg;
 	a1 = 0;
 
-	return vnic_dev_cmd(enic->vdev, CMD_NIC_CFG, &a0, &a1, wait);
+	if (rss_hash_type & (NIC_CFG_RSS_HASH_TYPE_UDP_IPV4 |
+			     NIC_CFG_RSS_HASH_TYPE_UDP_IPV6))
+		cmd = CMD_NIC_CFG_CHK;
+
+	return vnic_dev_cmd(enic->vdev, cmd, &a0, &a1, wait);
 }
 
 int enic_set_rss_key(struct enic *enic, dma_addr_t key_pa, u64 len)
diff --git a/drivers/net/ethernet/cisco/enic/vnic_dev.c b/drivers/net/ethernet/cisco/enic/vnic_dev.c
index 76cdd4c9d11f..e9db811df59c 100644
--- a/drivers/net/ethernet/cisco/enic/vnic_dev.c
+++ b/drivers/net/ethernet/cisco/enic/vnic_dev.c
@@ -1282,19 +1282,23 @@ int vnic_dev_get_supported_feature_ver(struct vnic_dev *vdev, u8 feature,
 	return ret;
 }
 
-bool vnic_dev_capable_udp_rss(struct vnic_dev *vdev)
+int vnic_dev_capable_rss_hash_type(struct vnic_dev *vdev, u8 *rss_hash_type)
 {
 	u64 a0 = CMD_NIC_CFG, a1 = 0;
-	u64 rss_hash_type;
 	int wait = 1000;
 	int err;
 
 	err = vnic_dev_cmd(vdev, CMD_CAPABILITY, &a0, &a1, wait);
-	if (err || !a0)
-		return false;
+	/* rss_hash_type is valid only when a0 is 1. Adapter which does not
+	 * support CMD_CAPABILITY for rss_hash_type has a0 = 0
+	 */
+	if (err || (a0 != 1))
+		return -EOPNOTSUPP;
+
+	a1 = (a1 >> NIC_CFG_RSS_HASH_TYPE_SHIFT) &
+	     NIC_CFG_RSS_HASH_TYPE_MASK_FIELD;
 
-	rss_hash_type = (a1 >> NIC_CFG_RSS_HASH_TYPE_SHIFT) &
-			NIC_CFG_RSS_HASH_TYPE_MASK_FIELD;
+	*rss_hash_type = (u8)a1;
 
-	return (rss_hash_type & NIC_CFG_RSS_HASH_TYPE_UDP);
+	return 0;
 }
diff --git a/drivers/net/ethernet/cisco/enic/vnic_dev.h b/drivers/net/ethernet/cisco/enic/vnic_dev.h
index 59d4cc8fbb85..714fc1ed79e3 100644
--- a/drivers/net/ethernet/cisco/enic/vnic_dev.h
+++ b/drivers/net/ethernet/cisco/enic/vnic_dev.h
@@ -184,6 +184,6 @@ int vnic_dev_overlay_offload_cfg(struct vnic_dev *vdev, u8 overlay,
 				 u16 vxlan_udp_port_number);
 int vnic_dev_get_supported_feature_ver(struct vnic_dev *vdev, u8 feature,
 				       u64 *supported_versions, u64 *a1);
-bool vnic_dev_capable_udp_rss(struct vnic_dev *vdev);
+int vnic_dev_capable_rss_hash_type(struct vnic_dev *vdev, u8 *rss_hash_type);
 
 #endif /* _VNIC_DEV_H_ */
diff --git a/drivers/net/ethernet/cisco/enic/vnic_devcmd.h b/drivers/net/ethernet/cisco/enic/vnic_devcmd.h
index 41de4ba622a1..fef5a0a0663d 100644
--- a/drivers/net/ethernet/cisco/enic/vnic_devcmd.h
+++ b/drivers/net/ethernet/cisco/enic/vnic_devcmd.h
@@ -148,8 +148,26 @@ enum vnic_devcmd_cmd {
 	/* del VLAN id in (u16)a0 */
 	CMD_VLAN_DEL            = _CMDCNW(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 15),
 
-	/* nic_cfg in (u32)a0 */
+	/* nic_cfg (no wait, always succeeds)
+	 * in: (u32)a0
+	 *
+	 * Capability query:
+	 * out: (u64) a0 = 1 if a1 is valid
+	 *	(u64) a1 = (NIC_CFG bits supported) | (flags << 32)
+	 *
+	 * flags are CMD_NIC_CFG_CAPF_xxx
+	 */
 	CMD_NIC_CFG             = _CMDCNW(_CMD_DIR_WRITE, _CMD_VTYPE_ALL, 16),
+	/* nic_cfg_chk (will return error if flags are invalid)
+	 * in: (u32)a0
+	 *
+	 * Capability query:
+	 * out: (u64) a0 = 1 if a1 is valid
+	 *	(u64) a1 = (NIC_CFG bits supported) | (flags << 32)
+	 *
+	 * flags are CMD_NIC_CFG_CAPF_xxx
+	 */
+	CMD_NIC_CFG_CHK		= _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ALL, 16),
 
 	/* union vnic_rss_key in mem: (u64)a0=paddr, (u16)a1=len */
 	CMD_RSS_KEY             = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 17),
diff --git a/drivers/net/ethernet/cisco/enic/vnic_nic.h b/drivers/net/ethernet/cisco/enic/vnic_nic.h
index 5a93db0d7afc..84ff8ca17fcb 100644
--- a/drivers/net/ethernet/cisco/enic/vnic_nic.h
+++ b/drivers/net/ethernet/cisco/enic/vnic_nic.h
@@ -41,13 +41,14 @@
 #define NIC_CFG_IG_VLAN_STRIP_EN_MASK_FIELD	1UL
 #define NIC_CFG_IG_VLAN_STRIP_EN_SHIFT		24
 
+#define NIC_CFG_RSS_HASH_TYPE_UDP_IPV4		(1 << 0)
 #define NIC_CFG_RSS_HASH_TYPE_IPV4		(1 << 1)
 #define NIC_CFG_RSS_HASH_TYPE_TCP_IPV4		(1 << 2)
 #define NIC_CFG_RSS_HASH_TYPE_IPV6		(1 << 3)
 #define NIC_CFG_RSS_HASH_TYPE_TCP_IPV6		(1 << 4)
 #define NIC_CFG_RSS_HASH_TYPE_IPV6_EX		(1 << 5)
 #define NIC_CFG_RSS_HASH_TYPE_TCP_IPV6_EX	(1 << 6)
-#define NIC_CFG_RSS_HASH_TYPE_UDP		(1 << 7)
+#define NIC_CFG_RSS_HASH_TYPE_UDP_IPV6		(1 << 7)
 
 static inline void vnic_set_nic_cfg(u32 *nic_cfg,
 	u8 rss_default_cpu, u8 rss_hash_type,
-- 
cgit v1.2.3-59-g8ed1b


From 5040cc990cbac98733df4d58fdeac5bbdab15b49 Mon Sep 17 00:00:00 2001
From: Arun Parameswaran <arun.parameswaran@broadcom.com>
Date: Tue, 5 Jun 2018 13:38:12 -0700
Subject: net: dsa: b53: Fix for brcm tag issue in Cygnus SoC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the Broadcom Cygnus SoC, the brcm tag needs to be inserted
in between the mac address and the ether type (should use
'DSA_PROTO_TAG_BRCM') for the packets sent to the internal
b53 switch.

Since the Cygnus was added with the BCM58XX device id and the
BCM58XX uses 'DSA_PROTO_TAG_BRCM_PREPEND', the data path is
broken, due to the incorrect brcm tag location.

Add a new b53 device id (BCM583XX) for Cygnus family to fix the
issue. Add the new device id to the BCM58XX family as Cygnus
is similar to the BCM58XX in most other functionalities.

Fixes: 11606039604c ("net: dsa: b53: Support prepended Broadcom tags")

Signed-off-by: Arun Parameswaran <arun.parameswaran@broadcom.com>
Acked-by: Scott Branden <scott.branden@broadcom.com>
Reported-by: Clément Péron <peron.clem@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Tested-by: Clément Péron <peron.clem@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/b53/b53_common.c | 15 ++++++++++++++-
 drivers/net/dsa/b53/b53_priv.h   |  2 ++
 drivers/net/dsa/b53/b53_srab.c   |  4 ++--
 3 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index db9a80e1ee14..5e010b1592f7 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -684,7 +684,8 @@ static int b53_switch_reset(struct b53_device *dev)
 	 * still use this driver as a library and need to perform the reset
 	 * earlier.
 	 */
-	if (dev->chip_id == BCM58XX_DEVICE_ID) {
+	if (dev->chip_id == BCM58XX_DEVICE_ID ||
+	    dev->chip_id == BCM583XX_DEVICE_ID) {
 		b53_read8(dev, B53_CTRL_PAGE, B53_SOFTRESET, &reg);
 		reg |= SW_RST | EN_SW_RST | EN_CH_RST;
 		b53_write8(dev, B53_CTRL_PAGE, B53_SOFTRESET, reg);
@@ -1935,6 +1936,18 @@ static const struct b53_chip_data b53_switch_chips[] = {
 		.jumbo_pm_reg = B53_JUMBO_PORT_MASK,
 		.jumbo_size_reg = B53_JUMBO_MAX_SIZE,
 	},
+	{
+		.chip_id = BCM583XX_DEVICE_ID,
+		.dev_name = "BCM583xx/11360",
+		.vlans = 4096,
+		.enabled_ports = 0x103,
+		.arl_entries = 4,
+		.cpu_port = B53_CPU_PORT,
+		.vta_regs = B53_VTA_REGS,
+		.duplex_reg = B53_DUPLEX_STAT_GE,
+		.jumbo_pm_reg = B53_JUMBO_PORT_MASK,
+		.jumbo_size_reg = B53_JUMBO_MAX_SIZE,
+	},
 	{
 		.chip_id = BCM7445_DEVICE_ID,
 		.dev_name = "BCM7445",
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index 75d9ffb75da8..df149756c282 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -62,6 +62,7 @@ enum {
 	BCM53018_DEVICE_ID = 0x53018,
 	BCM53019_DEVICE_ID = 0x53019,
 	BCM58XX_DEVICE_ID = 0x5800,
+	BCM583XX_DEVICE_ID = 0x58300,
 	BCM7445_DEVICE_ID = 0x7445,
 	BCM7278_DEVICE_ID = 0x7278,
 };
@@ -181,6 +182,7 @@ static inline int is5301x(struct b53_device *dev)
 static inline int is58xx(struct b53_device *dev)
 {
 	return dev->chip_id == BCM58XX_DEVICE_ID ||
+		dev->chip_id == BCM583XX_DEVICE_ID ||
 		dev->chip_id == BCM7445_DEVICE_ID ||
 		dev->chip_id == BCM7278_DEVICE_ID;
 }
diff --git a/drivers/net/dsa/b53/b53_srab.c b/drivers/net/dsa/b53/b53_srab.c
index c37ffd1b6833..8247481eaa06 100644
--- a/drivers/net/dsa/b53/b53_srab.c
+++ b/drivers/net/dsa/b53/b53_srab.c
@@ -364,7 +364,7 @@ static const struct of_device_id b53_srab_of_match[] = {
 	{ .compatible = "brcm,bcm53018-srab" },
 	{ .compatible = "brcm,bcm53019-srab" },
 	{ .compatible = "brcm,bcm5301x-srab" },
-	{ .compatible = "brcm,bcm11360-srab", .data = (void *)BCM58XX_DEVICE_ID },
+	{ .compatible = "brcm,bcm11360-srab", .data = (void *)BCM583XX_DEVICE_ID },
 	{ .compatible = "brcm,bcm58522-srab", .data = (void *)BCM58XX_DEVICE_ID },
 	{ .compatible = "brcm,bcm58525-srab", .data = (void *)BCM58XX_DEVICE_ID },
 	{ .compatible = "brcm,bcm58535-srab", .data = (void *)BCM58XX_DEVICE_ID },
@@ -372,7 +372,7 @@ static const struct of_device_id b53_srab_of_match[] = {
 	{ .compatible = "brcm,bcm58623-srab", .data = (void *)BCM58XX_DEVICE_ID },
 	{ .compatible = "brcm,bcm58625-srab", .data = (void *)BCM58XX_DEVICE_ID },
 	{ .compatible = "brcm,bcm88312-srab", .data = (void *)BCM58XX_DEVICE_ID },
-	{ .compatible = "brcm,cygnus-srab", .data = (void *)BCM58XX_DEVICE_ID },
+	{ .compatible = "brcm,cygnus-srab", .data = (void *)BCM583XX_DEVICE_ID },
 	{ .compatible = "brcm,nsp-srab", .data = (void *)BCM58XX_DEVICE_ID },
 	{ /* sentinel */ },
 };
-- 
cgit v1.2.3-59-g8ed1b


From 9a99dc1c41772e0b24348f089a7a7edb91fc7723 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 6 Jun 2018 13:55:47 -0400
Subject: Revert "net: sched: cls: Fix offloading when ingress dev is vxlan"

This reverts commit d96a43c66464cdf0b249fdf47b6dcd65b83af8c0.

This potentially breaks things, so reverting as per
request by Jakub Kicinski.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 29fb4d68a144..cdc3c87c53e6 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -807,6 +807,10 @@ static int tcf_block_cb_call(struct tcf_block *block, enum tc_setup_type type,
 	int ok_count = 0;
 	int err;
 
+	/* Make sure all netdevs sharing this block are offload-capable. */
+	if (block->nooffloaddevcnt && err_stop)
+		return -EOPNOTSUPP;
+
 	list_for_each_entry(block_cb, &block->cb_list, list) {
 		err = block_cb->cb(type, type_data, block_cb->cb_priv);
 		if (err) {
@@ -1725,31 +1729,21 @@ static int tc_exts_setup_cb_egdev_call(struct tcf_exts *exts,
 int tc_setup_cb_call(struct tcf_block *block, struct tcf_exts *exts,
 		     enum tc_setup_type type, void *type_data, bool err_stop)
 {
-	int ok_count = 0;
+	int ok_count;
 	int ret;
 
-	if (!block->nooffloaddevcnt) {
-		ret = tcf_block_cb_call(block, type, type_data, err_stop);
-		if (ret < 0)
-			return ret;
-		ok_count = ret;
-	}
+	ret = tcf_block_cb_call(block, type, type_data, err_stop);
+	if (ret < 0)
+		return ret;
+	ok_count = ret;
 
 	if (!exts || ok_count)
-		goto skip_egress;
-
+		return ok_count;
 	ret = tc_exts_setup_cb_egdev_call(exts, type, type_data, err_stop);
 	if (ret < 0)
 		return ret;
 	ok_count += ret;
 
-skip_egress:
-	/* if one of the netdevs sharing this block are not offload-capable
-	 * make sure we succeeded in egress instead.
-	 */
-	if (block->nooffloaddevcnt && !ok_count && err_stop)
-		return -EOPNOTSUPP;
-
 	return ok_count;
 }
 EXPORT_SYMBOL(tc_setup_cb_call);
-- 
cgit v1.2.3-59-g8ed1b


From dd612f18a49b63af8b3a5f572d999bdb197385bc Mon Sep 17 00:00:00 2001
From: Julia Lawall <Julia.Lawall@lip6.fr>
Date: Wed, 6 Jun 2018 15:03:22 +0200
Subject: bnx2x: use the right constant

Nearby code that also tests port suggests that the P0 constant should be
used when port is zero.

The semantic match that finds this problem is as follows:
(http://coccinelle.lip6.fr/)

// <smpl>
@@
expression e,e1;
@@

* e ? e1 : e1
// </smpl>

Fixes: 6c3218c6f7e5 ("bnx2x: Adjust ETS to 578xx")
Signed-off-by: Julia Lawall <Julia.Lawall@lip6.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c
index 7dd83d0ef0a0..22243c480a05 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c
@@ -588,7 +588,7 @@ static void bnx2x_ets_e3b0_nig_disabled(const struct link_params *params,
 	 * slots for the highest priority.
 	 */
 	REG_WR(bp, (port) ? NIG_REG_P1_TX_ARB_NUM_STRICT_ARB_SLOTS :
-		   NIG_REG_P1_TX_ARB_NUM_STRICT_ARB_SLOTS, 0x100);
+		   NIG_REG_P0_TX_ARB_NUM_STRICT_ARB_SLOTS, 0x100);
 	/* Mapping between the CREDIT_WEIGHT registers and actual client
 	 * numbers
 	 */
-- 
cgit v1.2.3-59-g8ed1b


From 1819e40908ee76c7219287224c22c772556c927e Mon Sep 17 00:00:00 2001
From: Xi Wang <wangxi11@huawei.com>
Date: Wed, 6 Jun 2018 14:07:51 +0100
Subject: net: hns3: Fix for VF mailbox cannot receiving PF response

When the VF frequently switches the CMDQ interrupt, if the CMDQ_SRC is not
cleared, the VF will not receive the new PF response after the interrupt
is re-enabled, the corresponding log is as follows:

[  317.482222] hns3 0000:00:03.0: VF could not get mbx resp(=0) from PF
in 500 tries
[  317.483137] hns3 0000:00:03.0: VF request to get tqp info from PF
failed -5

This patch fixes this problem by clearing CMDQ_SRC before enabling
interrupt and syncing pending IRQ handlers after disabling interrupt.

Fixes: e2cb1dec9779 ("net: hns3: Add HNS3 VF HCL(Hardware Compatibility Layer) Support")
Signed-off-by: Xi Wang <wangxi11@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
index bc8a5760d959..a17872aab168 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
@@ -1565,6 +1565,8 @@ static int hclgevf_misc_irq_init(struct hclgevf_dev *hdev)
 		return ret;
 	}
 
+	hclgevf_clear_event_cause(hdev, 0);
+
 	/* enable misc. vector(vector 0) */
 	hclgevf_enable_vector(&hdev->misc_vector, true);
 
@@ -1575,6 +1577,7 @@ static void hclgevf_misc_irq_uninit(struct hclgevf_dev *hdev)
 {
 	/* disable misc vector(vector 0) */
 	hclgevf_enable_vector(&hdev->misc_vector, false);
+	synchronize_irq(hdev->misc_vector.vector_irq);
 	free_irq(hdev->misc_vector.vector_irq, hdev);
 	hclgevf_free_vector(hdev, 0);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 6444e2a5f1e680278b58ced3568bdff84afe14a5 Mon Sep 17 00:00:00 2001
From: Xi Wang <wangxi11@huawei.com>
Date: Wed, 6 Jun 2018 14:07:52 +0100
Subject: net: hns3: Fix for VF mailbox receiving unknown message

Before the firmware updates the crq's tail pointer, if the VF driver
reads the data in the crq, the data may be incomplete at this time,
which will lead to the driver read an unknown message.

This patch fixes it by checking if crq is empty before reading the
message.

Fixes: b11a0bb231f3 ("net: hns3: Add mailbox support to VF driver")
Signed-off-by: Xi Wang <wangxi11@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c   | 23 +++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c
index a28618428338..b598c06af8e0 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c
@@ -126,6 +126,13 @@ int hclgevf_send_mbx_msg(struct hclgevf_dev *hdev, u16 code, u16 subcode,
 	return status;
 }
 
+static bool hclgevf_cmd_crq_empty(struct hclgevf_hw *hw)
+{
+	u32 tail = hclgevf_read_dev(hw, HCLGEVF_NIC_CRQ_TAIL_REG);
+
+	return tail == hw->cmq.crq.next_to_use;
+}
+
 void hclgevf_mbx_handler(struct hclgevf_dev *hdev)
 {
 	struct hclgevf_mbx_resp_status *resp;
@@ -140,11 +147,22 @@ void hclgevf_mbx_handler(struct hclgevf_dev *hdev)
 	resp = &hdev->mbx_resp;
 	crq = &hdev->hw.cmq.crq;
 
-	flag = le16_to_cpu(crq->desc[crq->next_to_use].flag);
-	while (hnae_get_bit(flag, HCLGEVF_CMDQ_RX_OUTVLD_B)) {
+	while (!hclgevf_cmd_crq_empty(&hdev->hw)) {
 		desc = &crq->desc[crq->next_to_use];
 		req = (struct hclge_mbx_pf_to_vf_cmd *)desc->data;
 
+		flag = le16_to_cpu(crq->desc[crq->next_to_use].flag);
+		if (unlikely(!hnae_get_bit(flag, HCLGEVF_CMDQ_RX_OUTVLD_B))) {
+			dev_warn(&hdev->pdev->dev,
+				 "dropped invalid mailbox message, code = %d\n",
+				 req->msg[0]);
+
+			/* dropping/not processing this invalid message */
+			crq->desc[crq->next_to_use].flag = 0;
+			hclge_mbx_ring_ptr_move_crq(crq);
+			continue;
+		}
+
 		/* synchronous messages are time critical and need preferential
 		 * treatment. Therefore, we need to acknowledge all the sync
 		 * responses as quickly as possible so that waiting tasks do not
@@ -205,7 +223,6 @@ void hclgevf_mbx_handler(struct hclgevf_dev *hdev)
 		}
 		crq->desc[crq->next_to_use].flag = 0;
 		hclge_mbx_ring_ptr_move_crq(crq);
-		flag = le16_to_cpu(crq->desc[crq->next_to_use].flag);
 	}
 
 	/* Write back CMDQ_RQ header pointer, M7 need this pointer */
-- 
cgit v1.2.3-59-g8ed1b


From 8e52a602b5126183f7a6487c4d48f6a00af4e4fd Mon Sep 17 00:00:00 2001
From: Xi Wang <wangxi11@huawei.com>
Date: Wed, 6 Jun 2018 14:07:53 +0100
Subject: net: hns3: Optimize PF CMDQ interrupt switching process

When the PF frequently switches the CMDQ interrupt, if the CMDQ_SRC is
not cleared before the hardware interrupt is generated, the new interrupt
will not be reported.

This patch optimizes this problem by clearing CMDQ_SRC and RESET_STS
before enabling interrupt and syncing pending IRQ handlers after disabling
interrupt.

Fixes: 466b0c00391b ("net: hns3: Add support for misc interrupt")
Signed-off-by: Xi Wang <wangxi11@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 2a801344eafb..d318d35e598f 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -2557,6 +2557,15 @@ static void hclge_clear_event_cause(struct hclge_dev *hdev, u32 event_type,
 	}
 }
 
+static void hclge_clear_all_event_cause(struct hclge_dev *hdev)
+{
+	hclge_clear_event_cause(hdev, HCLGE_VECTOR0_EVENT_RST,
+				BIT(HCLGE_VECTOR0_GLOBALRESET_INT_B) |
+				BIT(HCLGE_VECTOR0_CORERESET_INT_B) |
+				BIT(HCLGE_VECTOR0_IMPRESET_INT_B));
+	hclge_clear_event_cause(hdev, HCLGE_VECTOR0_EVENT_MBX, 0);
+}
+
 static void hclge_enable_vector(struct hclge_misc_vector *vector, bool enable)
 {
 	writel(enable ? 1 : 0, vector->addr);
@@ -5688,6 +5697,8 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
 	INIT_WORK(&hdev->rst_service_task, hclge_reset_service_task);
 	INIT_WORK(&hdev->mbx_service_task, hclge_mailbox_service_task);
 
+	hclge_clear_all_event_cause(hdev);
+
 	/* Enable MISC vector(vector0) */
 	hclge_enable_vector(&hdev->misc_vector, true);
 
@@ -5817,6 +5828,8 @@ static void hclge_uninit_ae_dev(struct hnae3_ae_dev *ae_dev)
 
 	/* Disable MISC vector(vector0) */
 	hclge_enable_vector(&hdev->misc_vector, false);
+	synchronize_irq(hdev->misc_vector.vector_irq);
+
 	hclge_destroy_cmd_queue(&hdev->hw);
 	hclge_misc_irq_uninit(hdev);
 	hclge_pci_uninit(hdev);
-- 
cgit v1.2.3-59-g8ed1b


From fb1967a69f756073362b8f19347f863f227320ad Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 6 Jun 2018 14:59:14 +0100
Subject: rxrpc: Fix terminal retransmission connection ID to include the
 channel

When retransmitting the final ACK or ABORT packet for a call, the cid field
in the packet header is set to the connection's cid, but this is incorrect
as it also needs to include the channel number on that connection that the
call was made on.

Fix this by OR'ing in the channel number.

Note that this fixes the bug that:

	commit 1a025028d400b23477341aa7ec2ce55f8b39b554
	rxrpc: Fix handling of call quietly cancelled out on server

works around.  I'm not intending to revert that as it will help protect
against problems that might occur on the server.

Fixes: 3136ef49a14c ("rxrpc: Delay terminal ACK transmission on a client call")
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rxrpc/conn_event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 1350f1be8037..8229a52c2acd 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -70,7 +70,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
 	iov[2].iov_len	= sizeof(ack_info);
 
 	pkt.whdr.epoch		= htonl(conn->proto.epoch);
-	pkt.whdr.cid		= htonl(conn->proto.cid);
+	pkt.whdr.cid		= htonl(conn->proto.cid | channel);
 	pkt.whdr.callNumber	= htonl(call_id);
 	pkt.whdr.seq		= 0;
 	pkt.whdr.type		= chan->last_type;
-- 
cgit v1.2.3-59-g8ed1b


From 7170e6045a6a8b33f4fa5753589dc77b16198e2d Mon Sep 17 00:00:00 2001
From: Doron Roberts-Kedes <doronrk@fb.com>
Date: Wed, 6 Jun 2018 09:33:28 -0700
Subject: strparser: Add __strp_unpause and use it in ktls.

strp_unpause queues strp_work in order to parse any messages that
arrived while the strparser was paused. However, the process invoking
strp_unpause could eagerly parse a buffered message itself if it held
the sock lock.

__strp_unpause is an alternative to strp_pause that avoids the scheduling
overhead that results when a receiving thread unpauses the strparser
and waits for the next message to be delivered by the workqueue thread.

This patch more than doubled the IOPS achieved in a benchmark of NBD
traffic encrypted using ktls.

Signed-off-by: Doron Roberts-Kedes <doronrk@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/strparser.h   |  2 ++
 net/strparser/strparser.c | 13 +++++++++++++
 net/tls/tls_sw.c          |  2 +-
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/include/net/strparser.h b/include/net/strparser.h
index d96b59f45eba..f177c87ce38b 100644
--- a/include/net/strparser.h
+++ b/include/net/strparser.h
@@ -90,6 +90,8 @@ static inline void strp_pause(struct strparser *strp)
 
 /* May be called without holding lock for attached socket */
 void strp_unpause(struct strparser *strp);
+/* Must be called with process lock held (lock_sock) */
+void __strp_unpause(struct strparser *strp);
 
 static inline void save_strp_stats(struct strparser *strp,
 				   struct strp_aggr_stats *agg_stats)
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index 092bebc70048..1a9695183599 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -512,6 +512,19 @@ int strp_init(struct strparser *strp, struct sock *sk,
 }
 EXPORT_SYMBOL_GPL(strp_init);
 
+/* Sock process lock held (lock_sock) */
+void __strp_unpause(struct strparser *strp)
+{
+	strp->paused = 0;
+
+	if (strp->need_bytes) {
+		if (strp_peek_len(strp) < strp->need_bytes)
+			return;
+	}
+	strp_read_sock(strp);
+}
+EXPORT_SYMBOL_GPL(__strp_unpause);
+
 void strp_unpause(struct strparser *strp)
 {
 	strp->paused = 0;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 839e1e165a0c..8ca57d01b18f 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -735,7 +735,7 @@ static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb,
 	/* Finished with message */
 	ctx->recv_pkt = NULL;
 	kfree_skb(skb);
-	strp_unpause(&ctx->strp);
+	__strp_unpause(&ctx->strp);
 
 	return true;
 }
-- 
cgit v1.2.3-59-g8ed1b